// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *
 * Fixes:
 *	Alan Cox	:	Numerous verify_area() calls
 *	Alan Cox	:	Set the ACK bit on a reset
 *	Alan Cox	:	Stopped it crashing if it closed while sk->inuse=1 and was trying to connect (tcp_err()).
 *	Alan Cox	:	All icmp error handling was broken pointers passed where wrong and the socket was looked up backwards. Nobody tested any icmp error code obviously.
 *	Alan Cox	:	tcp_err() now handled properly. It wakes people on errors. poll behaves and the icmp error race has gone by moving it into sock.c
 *	Alan Cox	:	tcp_send_reset() fixed to work for everything not just packets for unknown sockets.
 *	Alan Cox	:	tcp option processing.
 *	Alan Cox	:	Reset tweaked (still not 100%) [Had syn rule wrong]
 *	Herp Rosmanith	:	More reset fixes
 *	Alan Cox	:	No longer acks invalid rst frames. Acking any kind of RST is right out.
 *	Alan Cox	:	Sets an ignore me flag on an rst receive otherwise odd bits of prattle escape still
 *	Alan Cox	:	Fixed another acking RST frame bug. Should stop LAN workplace lockups.
 *	Alan Cox	:	Some tidyups using the new skb list facilities
 *	Alan Cox	:	sk->keepopen now seems to work
 *	Alan Cox	:	Pulls options out correctly on accepts
 *	Alan Cox	:	Fixed assorted sk->rqueue->next errors
 *	Alan Cox	:	PSH doesn't end a TCP read. Switched a bit to skb ops.
 *	Alan Cox	:	Tidied tcp_data to avoid a potential nasty.
 *	Alan Cox	:	Added some better commenting, as the tcp is hard to follow
 *	Alan Cox	:	Removed incorrect check for 20 * psh
 *	Michael O'Reilly :	ack < copied bug fix.
 *	Johannes Stille	:	Misc tcp fixes (not all in yet).
 *	Alan Cox	:	FIN with no memory -> CRASH
 *	Alan Cox	:	Added socket option proto entries. Also added awareness of them to accept.
 *	Alan Cox	:	Added TCP options (SOL_TCP)
 *	Alan Cox	:	Switched wakeup calls to callbacks, so the kernel can layer network sockets.
 *	Alan Cox	:	Use ip_tos/ip_ttl settings.
 *	Alan Cox	:	Handle FIN (more) properly (we hope).
 *	Alan Cox	:	RST frames sent on unsynchronised state ack error.
 *	Alan Cox	:	Put in missing check for SYN bit.
 *	Alan Cox	:	Added tcp_select_window() aka NET2E window non shrink trick.
 *	Alan Cox	:	Added a couple of small NET2E timer fixes
 *	Charles Hedrick	:	TCP fixes
 *	Toomas Tamm	:	TCP window fixes
 *	Alan Cox	:	Small URG fix to rlogin ^C ack fight
 *	Charles Hedrick	:	Rewrote most of it to actually work
 *	Linus		:	Rewrote tcp_read() and URG handling completely
 *	Gerhard Koerting:	Fixed some missing timer handling
 *	Matthew Dillon	:	Reworked TCP machine states as per RFC
 *	Gerhard Koerting:	PC/TCP workarounds
 *	Adam Caldwell	:	Assorted timer/timing errors
 *	Matthew Dillon	:	Fixed another RST bug
 *	Alan Cox	:	Move to kernel side addressing changes.
 *	Alan Cox	:	Beginning work on TCP fastpathing (not yet usable)
 *	Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
 *	Alan Cox	:	TCP fast path debugging
 *	Alan Cox	:	Window clamping
 *	Michael Riepe	:	Bug in tcp_check()
 *	Matt Dillon	:	More TCP improvements and RST bug fixes
 *	Matt Dillon	:	Yet more small nasties remove from the TCP code (Be very nice to this man if tcp finally works 100%) 8)
 *	Alan Cox	:	BSD accept semantics.
 *	Alan Cox	:	Reset on closedown bug.
 *	Peter De Schrijver :	ENOTCONN check missing in tcp_sendto().
 *	Michael Pall	:	Handle poll() after URG properly in all cases.
 *	Michael Pall	:	Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin).
 *	Michael Pall	:	Fix the multi URG PUSH problem in tcp_readable(), poll() after URG works now.
 *	Michael Pall	:	recv(...,MSG_OOB) never blocks in the BSD api.
 *	Alan Cox	:	Changed the semantics of sk->socket to fix a race and a signal problem with accept() and async I/O.
 *	Alan Cox	:	Relaxed the rules on tcp_sendto().
 *	Yury Shevchuk	:	Really fixed accept() blocking problem.
 *	Craig I. Hagan	:	Allow for BSD compatible TIME_WAIT for clients/servers which listen in on fixed ports.
 *	Alan Cox	:	Cleaned the above up and shrank it to a sensible code size.
 *	Alan Cox	:	Self connect lockup fix.
 *	Alan Cox	:	No connect to multicast.
 *	Ross Biro	:	Close unaccepted children on master socket close.
 *	Alan Cox	:	Reset tracing code.
 *	Alan Cox	:	Spurious resets on shutdown.
 *	Alan Cox	:	Giant 15 minute/60 second timer error
 *	Alan Cox	:	Small whoops in polling before an accept.
 *	Alan Cox	:	Kept the state trace facility since it's handy for debugging.
 *	Alan Cox	:	More reset handler fixes.
 *	Alan Cox	:	Started rewriting the code based on the RFC's for other useful protocol references see: Comer, KA9Q NOS, and for a reference on the difference between specifications and how BSD works see the 4.4lite source.
 *	A.N.Kuznetsov	:	Don't time wait on completion of tidy close.
 *	Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
 *	Linus Torvalds	:	Fixed BSD port reuse to work first syn
 *	Alan Cox	:	Reimplemented timers as per the RFC and using multiple timers for sanity.
 *	Alan Cox	:	Small bug fixes, and a lot of new comments.
 *	Alan Cox	:	Fixed dual reader crash by locking the buffers (much like datagram.c)
 *	Alan Cox	:	Fixed stuck sockets in probe. A probe now gets fed up of retrying without (even a no space) answer.
 *	Alan Cox	:	Extracted closing code better
 *	Alan Cox	:	Fixed the closing state machine to resemble the RFC.
 *	Alan Cox	:	More 'per spec' fixes.
 *	Jorge Cwik	:	Even faster checksumming.
 *	Alan Cox	:	tcp_data() doesn't ack illegal PSH only frames. At least one pc tcp stack generates them.
 *	Alan Cox	:	Cache last socket.
 *	Alan Cox	:	Per route irtt.
 *	Matt Day	:	poll()->select() match BSD precisely on error
 *	Alan Cox	:	New buffers
 *	Marc Tamsky	:	Various sk->prot->retransmits and sk->retransmits misupdating fixed. Fixed tcp_write_timeout: stuck close, and TCP syn retries gets used now.
 *	Mark Yarvis	:	In tcp_read_wakeup(), don't send an ack if state is TCP_CLOSED.
 *	Alan Cox	:	Look up device on a retransmit - routes may change. Doesn't yet cope with MSS shrink right but it's a start!
 *	Marc Tamsky	:	Closing in closing fixes.
 *	Mike Shaver	:	RFC1122 verifications.
 *	Alan Cox	:	rcv_saddr errors.
 *	Alan Cox	:	Block double connect().
 *	Alan Cox	:	Small hooks for enSKIP.
 *	Alexey Kuznetsov:	Path MTU discovery.
 *	Alan Cox	:	Support soft errors.
 *	Alan Cox	:	Fix MTU discovery pathological case when the remote claims no mtu!
 *	Marc Tamsky	:	TCP_CLOSE fix.
 *	Colin (G3TNE)	:	Send a reset on syn ack replies in window but wrong (fixes NT lpd problems)
 *	Pedro Roque	:	Better TCP window handling, delayed ack.
 *	Joerg Reuter	:	No modification of locked buffers in tcp_do_retransmit()
 *	Eric Schenk	:	Changed receiver side silly window avoidance algorithm to BSD style algorithm. This doubles throughput against machines running Solaris, and seems to result in general improvement.
 *	Stefan Magdalinski :	adjusted tcp_readable() to fix FIONREAD
 *	Willy Konynenberg :	Transparent proxying support.
 *	Mike McLagan	:	Routing by source
 *	Keith Owens	:	Do proper merging with partial SKB's in tcp_do_sendmsg to avoid burstiness.
 *	Eric Schenk	:	Fix fast close down bug with shutdown() followed by close().
 *	Andi Kleen	:	Make poll agree with SIGIO
 *	Salvatore Sanfilippo :	Support SO_LINGER with linger == 1 and lingertime == 0 (RFC 793 ABORT Call)
 *	Hirokazu Takahashi :	Use copy_from_user() instead of csum_and_copy_from_user() if possible.
 *
 * Description of States:
 *
 *	TCP_SYN_SENT		sent a connection request, waiting for ack
 *
 *	TCP_SYN_RECV		received a connection request, sent ack, waiting for final ack in three-way handshake.
 *
 *	TCP_ESTABLISHED		connection established
 *
 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete transmission of remaining buffered data
 *
 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote to shutdown
 *
 *	TCP_CLOSING		both sides have shutdown but we still have data we have to finish sending
 *
 *	TCP_TIME_WAIT		timeout to catch resent junk before entering closed, can only be entered from FIN_WAIT2 or CLOSING. Required because the other end may not have gotten our last ACK causing it to retransmit the data packet (which we ignore)
 *
 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for us to finish writing our data and to shutdown (we have to close() to move on to LAST_ACK)
 *
 *	TCP_LAST_ACK		our side has shutdown after remote has shutdown.
 *				There may still be data in our buffer that we have to finish sending
 *
 *	TCP_CLOSE		socket is finished
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <crypto/md5.h>
#include <crypto/utils.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/inet_diag.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
#include <linux/splice.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/random.h>
#include <linux/memblock.h>
#include <linux/highmem.h>
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/errqueue.h>
#include <linux/static_key.h>
#include <linux/btf.h>

#include <net/icmp.h>
#include <net/inet_common.h>
#include <net/inet_ecn.h>
#include <net/tcp.h>
#include <net/tcp_ecn.h>
#include <net/mptcp.h>
#include <net/proto_memory.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/psp.h>
#include <net/sock.h>
#include <net/rstreason.h>

#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <net/busy_poll.h>
#include <net/hotdata.h>
#include <trace/events/tcp.h>
#include <net/rps.h>

#include "../core/devmem.h"

/* Track pending CMSGs. */
enum {
	TCP_CMSG_INQ = 1,
	TCP_CMSG_TS = 2
};

DEFINE_PER_CPU(unsigned int, tcp_orphan_count);
EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);

DEFINE_PER_CPU(u32, tcp_tw_isn);
EXPORT_PER_CPU_SYMBOL_GPL(tcp_tw_isn);

long sysctl_tcp_mem[3] __read_mostly;

DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);
EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc);

#if IS_ENABLED(CONFIG_SMC)
DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
EXPORT_SYMBOL(tcp_have_smc);
#endif

/*
 * Current number of TCP sockets.
 */
struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp;

/*
 * Pressure flag: try to collapse.
 * Technical note: it is used by multiple contexts non atomically.
 * All the __sk_mem_schedule() is of this nature: accounting
 * is strict, actions are advisory and have some latency.
 */
unsigned long tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL_GPL(tcp_memory_pressure);

void tcp_enter_memory_pressure(struct sock *sk)
{
	unsigned long val;

	if (READ_ONCE(tcp_memory_pressure))
		return;
	val = jiffies;

	if (!val)
		val--;
	if (!cmpxchg(&tcp_memory_pressure, 0, val))
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
}

void tcp_leave_memory_pressure(struct sock *sk)
{
	unsigned long val;

	if (!READ_ONCE(tcp_memory_pressure))
		return;
	val = xchg(&tcp_memory_pressure, 0);
	if (val)
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
			      jiffies_to_msecs(jiffies - val));
}

/* Convert seconds to retransmits based on initial and max timeout */
static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
{
	u8 res = 0;

	if (seconds > 0) {
		int period = timeout;

		res = 1;
		while (seconds > period && res < 255) {
			res++;
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return res;
}

/* Convert retransmits to seconds based on initial and max timeout */
static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
{
	int period = 0;

	if (retrans > 0) {
		period = timeout;
		while (--retrans) {
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return period;
}

static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
{
	u32 rate = READ_ONCE(tp->rate_delivered);
	u32 intv = READ_ONCE(tp->rate_interval_us);
	u64 rate64 = 0;

	if (rate && intv) {
		rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
		do_div(rate64, intv);
	}
	return rate64;
}

#ifdef CONFIG_TCP_MD5SIG
void tcp_md5_destruct_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree(rcu_replace_pointer(tp->md5sig_info, NULL, 1));
		static_branch_slow_dec_deferred(&tcp_md5_needed);
	}
}
#endif

/* Address-family independent initialization for a tcp_sock.
 *
 * NOTE: A lot of things set to zero explicitly by call to
 *	 sk_alloc() so need not be done here.
 */
void tcp_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int rto_min_us, rto_max_ms;

	tp->out_of_order_queue = RB_ROOT;
	sk->tcp_rtx_queue = RB_ROOT;
	tcp_init_xmit_timers(sk);
	INIT_LIST_HEAD(&tp->tsq_node);
	INIT_LIST_HEAD(&tp->tsorted_sent_queue);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;

	rto_max_ms = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_max_ms);
	icsk->icsk_rto_max = msecs_to_jiffies(rto_max_ms);

	rto_min_us = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_min_us);
	icsk->icsk_rto_min = usecs_to_jiffies(rto_min_us);
	icsk->icsk_delack_max = TCP_DELACK_MAX;
	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
	minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tcp_snd_cwnd_set(tp, TCP_INIT_CWND);

	/* There's a bubble in the pipe until at least the first ACK. */
	tp->app_limited = ~0U;
	tp->rate_app_limited = 1;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = TCP_MSS_DEFAULT;

	tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering);
	tcp_assign_congestion_control(sk);

	tp->tsoffset = 0;
	tp->rack.reo_wnd_steps = 1;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_sync_mss = tcp_sync_mss;

	WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]));
	WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]));
	tcp_scaling_ratio_init(sk);

	set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
	sk_sockets_allocated_inc(sk);
	xa_init_flags(&sk->sk_user_frags, XA_FLAGS_ALLOC1);
}

static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc)
{
	struct sk_buff *skb = tcp_write_queue_tail(sk);
	u32 tsflags = sockc->tsflags;

	if (unlikely(!skb))
		skb = skb_rb_last(&sk->tcp_rtx_queue);
	if (tsflags && skb) {
		struct skb_shared_info *shinfo = skb_shinfo(skb);
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

		sock_tx_timestamp(sk, sockc, &shinfo->tx_flags);
		if (tsflags & SOF_TIMESTAMPING_TX_ACK)
			tcb->txstamp_ack |= TSTAMP_ACK_SK;
		if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
			shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
	}

	if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) &&
	    SK_BPF_CB_FLAG_TEST(sk, SK_BPF_CB_TX_TIMESTAMPING) && skb)
		bpf_skops_tx_timestamping(sk, skb, BPF_SOCK_OPS_TSTAMP_SENDMSG_CB);
}

/* @wake is one when sk_stream_write_space() calls us.
 * This sends EPOLLOUT only if notsent_bytes is half the limit.
 * This mimics the strategy used in sock_def_write_space().
 */
bool tcp_stream_memory_free(const struct sock *sk, int wake)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u32 notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt);

	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
}
EXPORT_SYMBOL(tcp_stream_memory_free);

static bool tcp_stream_is_readable(struct sock *sk, int target)
{
	if (tcp_epollin_ready(sk, target))
		return true;
	return sk_is_readable(sk);
}

/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to lock the socket, as the upper poll layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 */
__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	__poll_t mask;
	struct sock *sk = sock->sk;
	const struct tcp_sock *tp = tcp_sk(sk);
	u8 shutdown;
	int state;

	sock_poll_wait(file, sock, wait);

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		return inet_csk_listen_poll(sk);

	/* Socket is not locked. We are protected from async events
	 * by poll logic and correct handling of state changes
	 * made by other threads is impossible in any case.
	 */

	mask = 0;

	/*
	 * EPOLLHUP is certainly not done right. But poll() doesn't
	 * have a notion of HUP in just one direction, and for a
	 * socket the read side is more interesting.
	 *
	 * Some poll() documentation says that EPOLLHUP is incompatible
	 * with the EPOLLOUT/POLLWR flags, so somebody should check this
	 * all. But careful, it tends to be safer to return too many
	 * bits than too few, and you can easily break real applications
	 * if you don't tell them that something has hung up!
	 *
	 * Check-me.
	 *
	 * Check number 1. EPOLLHUP is _UNMASKABLE_ event (see UNIX98 and
	 * our fs/select.c). It means that after we received EOF,
	 * poll always returns immediately, making impossible poll() on write()
	 * in state CLOSE_WAIT. One solution is evident --- to set EPOLLHUP
	 * if and only if shutdown has been made in both directions.
	 * Actually, it is interesting to look how Solaris and DUX
	 * solve this dilemma. I would prefer, if EPOLLHUP were maskable,
	 * then we could set it on SND_SHUTDOWN. BTW examples given
	 * in Stevens' books assume exactly this behaviour, it explains
	 * why EPOLLHUP is incompatible with EPOLLOUT.	--ANK
	 *
	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
	 * blocking on fresh not-connected or disconnected socket. --ANK
	 */
	shutdown = READ_ONCE(sk->sk_shutdown);
	if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
		mask |= EPOLLHUP;
	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;

	/* Connected or passive Fast Open socket? */
	if (state != TCP_SYN_SENT &&
	    (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
		int target = sock_rcvlowat(sk, 0, INT_MAX);
		u16 urg_data = READ_ONCE(tp->urg_data);

		if (unlikely(urg_data) &&
		    READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
		    !sock_flag(sk, SOCK_URGINLINE))
			target++;

		if (tcp_stream_is_readable(sk, target))
			mask |= EPOLLIN | EPOLLRDNORM;

		if (!(shutdown & SEND_SHUTDOWN)) {
			if (__sk_stream_is_writeable(sk, 1)) {
				mask |= EPOLLOUT | EPOLLWRNORM;
			} else {  /* send SIGIO later */
				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				/* Race breaker. If space is freed after
				 * wspace test but before the flags are set,
				 * IO signal will be lost. Memory barrier
				 * pairs with the input side.
				 */
				smp_mb__after_atomic();
				if (__sk_stream_is_writeable(sk, 1))
					mask |= EPOLLOUT | EPOLLWRNORM;
			}
		} else
			mask |= EPOLLOUT | EPOLLWRNORM;

		if (urg_data & TCP_URG_VALID)
			mask |= EPOLLPRI;
	} else if (state == TCP_SYN_SENT &&
		   inet_test_bit(DEFER_CONNECT, sk)) {
		/* Active TCP fastopen socket with defer_connect
		 * Return EPOLLOUT so application can call write()
		 * in order for kernel to generate SYN+data
		 */
		mask |= EPOLLOUT | EPOLLWRNORM;
	}
	/* This barrier is coupled with smp_wmb() in tcp_done_with_error() */
	smp_rmb();
	if (READ_ONCE(sk->sk_err) ||
	    !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR;

	return mask;
}
EXPORT_SYMBOL(tcp_poll);

int tcp_ioctl(struct sock *sk, int cmd, int *karg)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int answ;
	bool slow;

	switch (cmd) {
	case SIOCINQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		slow = lock_sock_fast(sk);
		answ = tcp_inq(sk);
		unlock_sock_fast(sk, slow);
		break;
	case SIOCATMARK:
		answ = READ_ONCE(tp->urg_data) &&
		       READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
		break;
	case SIOCOUTQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = READ_ONCE(tp->write_seq) - tp->snd_una;
		break;
	case SIOCOUTQNSD:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = READ_ONCE(tp->write_seq) -
			       READ_ONCE(tp->snd_nxt);
		break;
	default:
		return -ENOIOCTLCMD;
	}

	*karg = answ;
	return 0;
}

void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
	TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
	tp->pushed_seq = tp->write_seq;
}

static inline bool forced_push(const struct tcp_sock *tp)
{
	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}

void tcp_skb_entail(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

	tcb->seq = tcb->end_seq = tp->write_seq;
	tcb->tcp_flags = TCPHDR_ACK;
	__skb_header_release(skb);
	psp_enqueue_set_decrypted(sk, skb);
	tcp_add_write_queue_tail(sk, skb);
	sk_wmem_queued_add(sk, skb->truesize);
	sk_mem_charge(sk, skb->truesize);
	if (tp->nonagle &
	    TCP_NAGLE_PUSH)
		tp->nonagle &= ~TCP_NAGLE_PUSH;

	tcp_slow_start_after_idle_check(sk);
}

static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
{
	if (flags & MSG_OOB)
		tp->snd_up = tp->write_seq;
}

/* If a not yet filled skb is pushed, do not send it if
 * we have data packets in Qdisc or NIC queues :
 * Because TX completion will happen shortly, it gives a chance
 * to coalesce future sendmsg() payload into this skb, without
 * need for a timer, and with no latency trade off.
 * As packets containing data payload have a bigger truesize
 * than pure acks (dataless) packets, the last checks prevent
 * autocorking if we only have an ACK in Qdisc/NIC queues,
 * or if TX completion was delayed after we processed ACK packet.
 */
static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
				int size_goal)
{
	return skb->len < size_goal &&
	       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) &&
	       !tcp_rtx_queue_empty(sk) &&
	       refcount_read(&sk->sk_wmem_alloc) > skb->truesize &&
	       tcp_skb_can_collapse_to(skb);
}

void tcp_push(struct sock *sk, int flags, int mss_now,
	      int nonagle, int size_goal)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	skb = tcp_write_queue_tail(sk);
	if (!skb)
		return;
	if (!(flags & MSG_MORE) || forced_push(tp))
		tcp_mark_push(tp, skb);

	tcp_mark_urg(tp, flags);

	if (tcp_should_autocork(sk, skb, size_goal)) {

		/* avoid atomic op if TSQ_THROTTLED bit is already set */
		if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
			set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
			smp_mb__after_atomic();
		}
		/* It is possible TX completion already happened
		 * before we set TSQ_THROTTLED.
		 */
		if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize)
			return;
	}

	if (flags & MSG_MORE)
		nonagle = TCP_NAGLE_CORK;

	__tcp_push_pending_frames(sk, mss_now, nonagle);
}

int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
			 unsigned int offset, size_t len)
{
	struct tcp_splice_state *tss = rd_desc->arg.data;
	int ret;

	ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
			      min(rd_desc->count, len), tss->flags);
	if (ret > 0)
		rd_desc->count -= ret;
	return ret;
}

static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
{
	/* Store TCP splice context information in read_descriptor_t. */
	read_descriptor_t rd_desc = {
		.arg.data = tss,
		.count	  = tss->len,
	};

	return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
}

/**
 *  tcp_splice_read - splice data from TCP socket to a pipe
 * @sock:	socket to splice from
 * @ppos:	position (not valid)
 * @pipe:	pipe to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will read pages from given socket and fill them into a pipe.
 *
 **/
ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
			struct pipe_inode_info *pipe, size_t len,
			unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct tcp_splice_state tss = {
		.pipe = pipe,
		.len = len,
		.flags = flags,
	};
	long timeo;
	ssize_t spliced;
	int ret;

	sock_rps_record_flow(sk);
	/*
	 * We can't seek on a socket input
	 */
	if (unlikely(*ppos))
		return -ESPIPE;

	ret = spliced = 0;

	lock_sock(sk);

	timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
	while (tss.len) {
		ret = __tcp_splice_read(sk, &tss);
		if (ret < 0)
			break;
		else if (!ret) {
			if (spliced)
				break;
			if (sock_flag(sk, SOCK_DONE))
				break;
			if (sk->sk_err) {
				ret = sock_error(sk);
				break;
			}
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;
			if (sk->sk_state == TCP_CLOSE) {
				/*
				 * This occurs when user tries to read
				 * from never connected socket.
				 */
				ret = -ENOTCONN;
				break;
			}
			if (!timeo) {
				ret = -EAGAIN;
				break;
			}
			/* if __tcp_splice_read() got nothing while we have
			 * an skb in receive queue, we do not want to loop.
			 * This might happen with URG data.
			 */
			if (!skb_queue_empty(&sk->sk_receive_queue))
				break;
			ret = sk_wait_data(sk, &timeo, NULL);
			if (ret < 0)
				break;
			if (signal_pending(current)) {
				ret = sock_intr_errno(timeo);
				break;
			}
			continue;
		}
		tss.len -= ret;
		spliced += ret;

		if (!tss.len || !timeo)
			break;
		release_sock(sk);
		lock_sock(sk);

		if (tcp_recv_should_stop(sk))
			break;
	}

	release_sock(sk);

	if (spliced)
		return spliced;

	return ret;
}

/* We allow to exceed memory limits for FIN packets to expedite
 * connection tear down and (memory) recovery.
 * Otherwise tcp_send_fin() could be tempted to either delay FIN
 * or even be forced to close flow without any FIN.
 * In general, we want to allow one skb per socket to avoid hangs
 * with edge trigger epoll()
 */
void sk_forced_mem_schedule(struct sock *sk, int size)
{
	int delta, amt;

	delta = size - sk->sk_forward_alloc;
	if (delta <= 0)
		return;

	amt = sk_mem_pages(delta);
	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);

	if (mem_cgroup_sk_enabled(sk))
		mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL);

	if (sk->sk_bypass_prot_mem)
		return;

	sk_memory_allocated_add(sk, amt);
}

struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
				     bool force_schedule)
{
	struct sk_buff *skb;

	skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp);
	if (likely(skb)) {
		bool mem_scheduled;

		skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
		if (force_schedule) {
			mem_scheduled = true;
			sk_forced_mem_schedule(sk, skb->truesize);
		} else {
			mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
		}
		if (likely(mem_scheduled)) {
			skb_reserve(skb, MAX_TCP_HEADER);
			skb->ip_summed = CHECKSUM_PARTIAL;
			INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
			return skb;
		}
		__kfree_skb(skb);
	} else {
		if (!sk->sk_bypass_prot_mem)
			tcp_enter_memory_pressure(sk);
		sk_stream_moderate_sndbuf(sk);
	}
	return NULL;
}

static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
				       int large_allowed)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 new_size_goal, size_goal;

	if (!large_allowed)
		return mss_now;

	/* Note : tcp_tso_autosize() will eventually split this later */
	new_size_goal = tcp_bound_to_half_wnd(tp, sk->sk_gso_max_size);

	/* We try hard to avoid divides here */
	size_goal = tp->gso_segs * mss_now;
	if (unlikely(new_size_goal < size_goal ||
		     new_size_goal >= size_goal + mss_now)) {
		tp->gso_segs = min_t(u16, new_size_goal / mss_now,
				     sk->sk_gso_max_segs);
		size_goal = tp->gso_segs * mss_now;
	}

	return max(size_goal, mss_now);
}

int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
{
	int mss_now;

	mss_now = tcp_current_mss(sk);
	*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));

	return mss_now;
}

/* In some cases, sendmsg() could have added an skb to the write queue,
 * but failed adding payload on it. We need to remove it to consume less
 * memory, but more importantly be able to generate EPOLLOUT for Edge Trigger
 * epoll() users. Another reason is that tcp_write_xmit() does not like
 * finding an empty skb in the write queue.
 */
void tcp_remove_empty_skb(struct sock *sk)
{
	struct sk_buff *skb = tcp_write_queue_tail(sk);

	if (skb && TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
		tcp_unlink_write_queue(skb, sk);
		if (tcp_write_queue_empty(sk))
			tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
		tcp_wmem_free_skb(sk, skb);
	}
}

/* skb changing from pure zc to mixed, must charge zc */
static int tcp_downgrade_zcopy_pure(struct sock *sk, struct sk_buff *skb)
{
	if (unlikely(skb_zcopy_pure(skb))) {
		u32 extra = skb->truesize -
			    SKB_TRUESIZE(skb_end_offset(skb));

		if (!sk_wmem_schedule(sk, extra))
			return -ENOMEM;

		sk_mem_charge(sk, extra);
		skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY;
	}
	return 0;
}

int tcp_wmem_schedule(struct sock *sk, int copy)
{
	int left;

	if (likely(sk_wmem_schedule(sk, copy)))
		return copy;

	/* We could be in trouble if we have nothing queued.
	 * Use whatever is left in sk->sk_forward_alloc and tcp_wmem[0]
	 * to guarantee some progress.
	 */
	left = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[0]) -
	       sk->sk_wmem_queued;
	if (left > 0)
		sk_forced_mem_schedule(sk, min(left, copy));
	return min(copy, sk->sk_forward_alloc);
}

void tcp_free_fastopen_req(struct tcp_sock *tp)
{
	if (tp->fastopen_req) {
		kfree(tp->fastopen_req);
		tp->fastopen_req = NULL;
	}
}

int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
			 size_t size, struct ubuf_info *uarg)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_sock *inet = inet_sk(sk);
	struct sockaddr *uaddr = msg->msg_name;
	int err, flags;

	if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) &
	      TFO_CLIENT_ENABLE) ||
	    (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
	     uaddr->sa_family == AF_UNSPEC))
		return -EOPNOTSUPP;
	if (tp->fastopen_req)
		return -EALREADY; /* Another Fast Open is in progress */

	tp->fastopen_req = kzalloc_obj(struct tcp_fastopen_request,
				       sk->sk_allocation);
	if (unlikely(!tp->fastopen_req))
		return -ENOBUFS;
	tp->fastopen_req->data = msg;
	tp->fastopen_req->size = size;
	tp->fastopen_req->uarg = uarg;

	if (inet_test_bit(DEFER_CONNECT, sk)) {
		err = tcp_connect(sk);
		/* Same failure procedure as in tcp_v4/6_connect */
		if (err) {
			tcp_set_state(sk, TCP_CLOSE);
			inet->inet_dport = 0;
			sk->sk_route_caps = 0;
		}
	}
	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
	err = __inet_stream_connect(sk->sk_socket,
				    (struct sockaddr_unsized *)uaddr,
				    msg->msg_namelen, flags, 1);
	/* fastopen_req could already be freed in __inet_stream_connect
	 * if the connection times out or gets rst
	 */
	if (tp->fastopen_req) {
		*copied = tp->fastopen_req->copied;
		tcp_free_fastopen_req(tp);
		inet_clear_bit(DEFER_CONNECT, sk);
	}
	return err;
}

/* If a gap is detected between sends, mark the socket application-limited. */
void tcp_rate_check_app_limited(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (/* We have less than one packet to send. */
	    tp->write_seq - tp->snd_nxt < tp->mss_cache &&
	    /* Nothing in sending host's qdisc queues or NIC tx queue. */
	    sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) &&
	    /* We are not limited by CWND. */
	    tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp) &&
	    /* All lost packets have been retransmitted. */
	    tp->lost_out <= tp->retrans_out)
		tp->app_limited =
			(tp->delivered + tcp_packets_in_flight(tp)) ?
: 1; } EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited); int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) { struct net_devmem_dmabuf_binding *binding = NULL; struct tcp_sock *tp = tcp_sk(sk); struct ubuf_info *uarg = NULL; struct sk_buff *skb; struct sockcm_cookie sockc; int flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0; int process_backlog = 0; int sockc_err = 0; int zc = 0; long timeo; flags = msg->msg_flags; sockc = (struct sockcm_cookie){ .tsflags = READ_ONCE(sk->sk_tsflags) }; if (msg->msg_controllen) { sockc_err = sock_cmsg_send(sk, msg, &sockc); /* Don't return error until MSG_FASTOPEN has been processed; * that may succeed even if the cmsg is invalid. */ } if ((flags & MSG_ZEROCOPY) && size) { if (msg->msg_ubuf) { uarg = msg->msg_ubuf; if (sk->sk_route_caps & NETIF_F_SG) zc = MSG_ZEROCOPY; } else if (sock_flag(sk, SOCK_ZEROCOPY)) { skb = tcp_write_queue_tail(sk); uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb), !sockc_err && sockc.dmabuf_id); if (!uarg) { err = -ENOBUFS; goto out_err; } if (sk->sk_route_caps & NETIF_F_SG) zc = MSG_ZEROCOPY; else uarg_to_msgzc(uarg)->zerocopy = 0; if (!sockc_err && sockc.dmabuf_id) { binding = net_devmem_get_binding(sk, sockc.dmabuf_id); if (IS_ERR(binding)) { err = PTR_ERR(binding); binding = NULL; goto out_err; } } } } else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) { if (sk->sk_route_caps & NETIF_F_SG) zc = MSG_SPLICE_PAGES; } if (!sockc_err && sockc.dmabuf_id && (!(flags & MSG_ZEROCOPY) || !sock_flag(sk, SOCK_ZEROCOPY))) { err = -EINVAL; goto out_err; } if (unlikely(flags & MSG_FASTOPEN || inet_test_bit(DEFER_CONNECT, sk)) && !tp->repair) { err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg); if (err == -EINPROGRESS && copied_syn > 0) goto out; else if (err) goto out_err; } timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); tcp_rate_check_app_limited(sk); /* is sending application-limited? */ /* Wait for a connection to finish. One exception is TCP Fast Open * (passive side) where data is allowed to be sent before a connection * is fully established. */ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && !tcp_passive_fastopen(sk)) { err = sk_stream_wait_connect(sk, &timeo); if (err != 0) goto do_error; } if (unlikely(tp->repair)) { if (tp->repair_queue == TCP_RECV_QUEUE) { copied = tcp_send_rcvq(sk, msg, size); goto out_nopush; } err = -EINVAL; if (tp->repair_queue == TCP_NO_QUEUE) goto out_err; /* 'common' sending to sendq */ } if (sockc_err) { err = sockc_err; goto out_err; } /* This should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); /* Ok commence sending. */ copied = 0; restart: mss_now = tcp_send_mss(sk, &size_goal, flags); err = -EPIPE; if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto do_error; while (msg_data_left(msg)) { int copy = 0; skb = tcp_write_queue_tail(sk); if (skb) copy = size_goal - skb->len; trace_tcp_sendmsg_locked(sk, msg, skb, size_goal); if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { bool first_skb; new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_space; if (unlikely(process_backlog >= 16)) { process_backlog = 0; if (sk_flush_backlog(sk)) goto restart; } first_skb = tcp_rtx_and_write_queues_empty(sk); skb = tcp_stream_alloc_skb(sk, sk->sk_allocation, first_skb); if (!skb) goto wait_for_space; process_backlog++; #ifdef CONFIG_SKB_DECRYPTED skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED); #endif tcp_skb_entail(sk, skb); copy = size_goal; /* All packets are restored as if they have * already been sent. 
skb_mstamp_ns isn't set to * avoid wrong rtt estimation. */ if (tp->repair) TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED; } /* Try to append data to the end of skb. */ if (copy > msg_data_left(msg)) copy = msg_data_left(msg); if (zc == 0) { bool merge = true; int i = skb_shinfo(skb)->nr_frags; struct page_frag *pfrag = sk_page_frag(sk); if (!sk_page_frag_refill(sk, pfrag)) goto wait_for_space; if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { if (i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) { tcp_mark_push(tp, skb); goto new_segment; } merge = false; } copy = min_t(int, copy, pfrag->size - pfrag->offset); if (unlikely(skb_zcopy_pure(skb) || skb_zcopy_managed(skb))) { if (tcp_downgrade_zcopy_pure(sk, skb)) goto wait_for_space; skb_zcopy_downgrade_managed(skb); } copy = tcp_wmem_schedule(sk, copy); if (!copy) goto wait_for_space; err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, pfrag->page, pfrag->offset, copy); if (err) goto do_error; /* Update the skb. */ if (merge) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); } else { skb_fill_page_desc(skb, i, pfrag->page, pfrag->offset, copy); page_ref_inc(pfrag->page); } pfrag->offset += copy; } else if (zc == MSG_ZEROCOPY) { /* First append to a fragless skb builds initial * pure zerocopy skb */ if (!skb->len) skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY; if (!skb_zcopy_pure(skb)) { copy = tcp_wmem_schedule(sk, copy); if (!copy) goto wait_for_space; } err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg, binding); if (err == -EMSGSIZE || err == -EEXIST) { tcp_mark_push(tp, skb); goto new_segment; } if (err < 0) goto do_error; copy = err; } else if (zc == MSG_SPLICE_PAGES) { /* Splice in data if we can; copy if we can't. */ if (tcp_downgrade_zcopy_pure(sk, skb)) goto wait_for_space; copy = tcp_wmem_schedule(sk, copy); if (!copy) goto wait_for_space; err = skb_splice_from_iter(skb, &msg->msg_iter, copy); if (err < 0) { if (err == -EMSGSIZE) { tcp_mark_push(tp, skb); goto new_segment; } goto do_error; } copy = err; if (!(flags & MSG_NO_SHARED_FRAGS)) skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; sk_wmem_queued_add(sk, copy); sk_mem_charge(sk, copy); } if (!copied) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; WRITE_ONCE(tp->write_seq, tp->write_seq + copy); TCP_SKB_CB(skb)->end_seq += copy; tcp_skb_pcount_set(skb, 0); copied += copy; if (!msg_data_left(msg)) { if (unlikely(flags & MSG_EOR)) TCP_SKB_CB(skb)->eor = 1; goto out; } if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair)) continue; if (forced_push(tp)) { tcp_mark_push(tp, skb); __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); } else if (skb == tcp_send_head(sk)) tcp_push_one(sk, mss_now); continue; wait_for_space: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); tcp_remove_empty_skb(sk); if (copied) tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); err = sk_stream_wait_memory(sk, &timeo); if (err != 0) goto do_error; mss_now = tcp_send_mss(sk, &size_goal, flags); } out: if (copied) { tcp_tx_timestamp(sk, &sockc); tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } out_nopush: /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */ if (uarg && !msg->msg_ubuf) net_zcopy_put(uarg); if (binding) net_devmem_dmabuf_binding_put(binding); return copied + copied_syn; do_error: tcp_remove_empty_skb(sk); if (copied + copied_syn) goto out; out_err: /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */ if (uarg && !msg->msg_ubuf) net_zcopy_put_abort(uarg, true); err = sk_stream_error(sk, flags, 
err); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) { READ_ONCE(sk->sk_write_space)(sk); tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } if (binding) net_devmem_dmabuf_binding_put(binding); return err; } EXPORT_SYMBOL_GPL(tcp_sendmsg_locked); int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { int ret; lock_sock(sk); ret = tcp_sendmsg_locked(sk, msg, size); release_sock(sk); return ret; } EXPORT_SYMBOL(tcp_sendmsg); void tcp_splice_eof(struct socket *sock) { struct sock *sk = sock->sk; struct tcp_sock *tp = tcp_sk(sk); int mss_now, size_goal; if (!tcp_write_queue_tail(sk)) return; lock_sock(sk); mss_now = tcp_send_mss(sk, &size_goal, 0); tcp_push(sk, 0, mss_now, tp->nonagle, size_goal); release_sock(sk); } /* * Handle reading urgent data. BSD has very simple semantics for * this, no blocking and very strange errors 8) */ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags) { struct tcp_sock *tp = tcp_sk(sk); /* No URG data to read. */ if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data || tp->urg_data == TCP_URG_READ) return -EINVAL; /* Yes this is right ! */ if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE)) return -ENOTCONN; if (tp->urg_data & TCP_URG_VALID) { int err = 0; char c = tp->urg_data; if (!(flags & MSG_PEEK)) WRITE_ONCE(tp->urg_data, TCP_URG_READ); /* Read urgent data. */ msg->msg_flags |= MSG_OOB; if (len > 0) { if (!(flags & MSG_TRUNC)) err = memcpy_to_msg(msg, &c, 1); len = 1; } else msg->msg_flags |= MSG_TRUNC; return err ? -EFAULT : len; } if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) return 0; /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and * the available implementations agree in this case: * this call should never block, independent of the * blocking state of the socket. * Mike <pall@rz.uni-karlsruhe.de> */ return -EAGAIN; } static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) { struct sk_buff *skb; int copied = 0, err = 0; skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) return err; copied += skb->len; } skb_queue_walk(&sk->sk_write_queue, skb) { err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) break; copied += skb->len; } return err ?: copied; } /* Clean up the receive buffer for full frames taken by the user, * then send an ACK if necessary. COPIED is the number of bytes * tcp_recvmsg has given to the user so far, it speeds up the * calculation of whether or not we must ACK for the sake of * a window update. */ void __tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); bool time_to_ack = false; if (inet_csk_ack_scheduled(sk)) { const struct inet_connection_sock *icsk = inet_csk(sk); if (/* Once-per-two-segments ACK was not sent by tcp_input.c */ tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || /* * If this read emptied read buffer, we send ACK, if * connection is not bidirectional, user drained * receive buffer and there was a small segment * in queue. */ (copied > 0 && ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && !inet_csk_in_pingpong_mode(sk))) && !atomic_read(&sk->sk_rmem_alloc))) time_to_ack = true; } /* We send an ACK if we can now advertise a non-zero window * which has been raised "significantly". * * Even if window raised up to infinity, do not send window open ACK * in states, where we will not receive more. It is useless. 
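 *
 * Worked example for the check below (illustrative numbers): with
 * window_clamp at 64K and the currently advertised window down to 8K,
 * 2 * 8K <= 64K so the recomputation is worth its cost; if the read
 * freed enough space for __tcp_select_window() to return 32K, then
 * 32K >= 2 * 8K and the window update ACK is sent right away.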
*/ if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) { __u32 rcv_window_now = tcp_receive_window(tp); /* Optimize, __tcp_select_window() is not cheap. */ if (2*rcv_window_now <= tp->window_clamp) { __u32 new_window = __tcp_select_window(sk); /* Send ACK now, if this read freed lots of space * in our buffer. Certainly, new_window is new window. * We can advertise it now, if it is not less than current one. * "Lots" means "at least twice" here. */ if (new_window && new_window >= 2 * rcv_window_now) time_to_ack = true; } } if (time_to_ack) { tcp_mstamp_refresh(tp); tcp_send_ack(sk); } } void tcp_cleanup_rbuf(struct sock *sk, int copied) { struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); struct tcp_sock *tp = tcp_sk(sk); WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); __tcp_cleanup_rbuf(sk, copied); } static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) { __skb_unlink(skb, &sk->sk_receive_queue); if (likely(skb->destructor == sock_rfree)) { sock_rfree(skb); skb->destructor = NULL; skb->sk = NULL; return skb_attempt_defer_free(skb); } __kfree_skb(skb); } struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; u32 offset; while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { offset = seq - TCP_SKB_CB(skb)->seq; if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { pr_err_once("%s: found a SYN, please report !\n", __func__); offset--; } if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) { *off = offset; return skb; } /* This looks weird, but this can happen if TCP collapsing * splitted a fat GRO packet, while we released socket lock * in skb_splice_bits() */ tcp_eat_recv_skb(sk, skb); } return NULL; } EXPORT_SYMBOL(tcp_recv_skb); /* * This routine provides an alternative to tcp_recvmsg() for routines * that would like to handle copying from skbuffs directly in 'sendfile' * fashion. * Note: * - It is assumed that the socket was locked by the caller. * - The routine does not block. * - At present, there is no support for reading OOB data * or for 'peeking' the socket using this routine * (although both would be easy to implement). */ static int __tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor, bool noack, u32 *copied_seq) { struct sk_buff *skb; struct tcp_sock *tp = tcp_sk(sk); u32 seq = *copied_seq; u32 offset; int copied = 0; if (sk->sk_state == TCP_LISTEN) return -ENOTCONN; while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { if (offset < skb->len) { int used; size_t len; len = skb->len - offset; /* Stop reading if we hit a patch of urgent data */ if (unlikely(tp->urg_data)) { u32 urg_offset = tp->urg_seq - seq; if (urg_offset < len) len = urg_offset; if (!len) break; } used = recv_actor(desc, skb, offset, len); if (used <= 0) { if (!copied) copied = used; break; } if (WARN_ON_ONCE(used > len)) used = len; seq += used; copied += used; offset += used; /* If recv_actor drops the lock (e.g. TCP splice * receive) the skb pointer might be invalid when * getting here: tcp_collapse might have deleted it * while aggregating skbs from the socket queue. */ skb = tcp_recv_skb(sk, seq - 1, &offset); if (!skb) break; /* TCP coalescing might have appended data to the skb. 
* Try to splice more frags */ if (offset + 1 != skb->len) continue; } if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { tcp_eat_recv_skb(sk, skb); ++seq; break; } tcp_eat_recv_skb(sk, skb); if (!desc->count) break; WRITE_ONCE(*copied_seq, seq); } WRITE_ONCE(*copied_seq, seq); if (noack) goto out; tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. */ if (copied > 0) { tcp_recv_skb(sk, seq, &offset); tcp_cleanup_rbuf(sk, copied); } out: return copied; } int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor) { return __tcp_read_sock(sk, desc, recv_actor, false, &tcp_sk(sk)->copied_seq); } EXPORT_SYMBOL(tcp_read_sock); int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor, bool noack, u32 *copied_seq) { return __tcp_read_sock(sk, desc, recv_actor, noack, copied_seq); } int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { struct sk_buff *skb; int copied = 0; if (sk->sk_state == TCP_LISTEN) return -ENOTCONN; while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { u8 tcp_flags; int used; __skb_unlink(skb, &sk->sk_receive_queue); WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk)); tcp_flags = TCP_SKB_CB(skb)->tcp_flags; used = recv_actor(sk, skb); if (used < 0) { if (!copied) copied = used; break; } copied += used; if (tcp_flags & TCPHDR_FIN) break; } return copied; } void tcp_read_done(struct sock *sk, size_t len) { struct tcp_sock *tp = tcp_sk(sk); u32 seq = tp->copied_seq; struct sk_buff *skb; size_t left; u32 offset; if (sk->sk_state == TCP_LISTEN) return; left = len; while (left && (skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { int used; used = min_t(size_t, skb->len - offset, left); seq += used; left -= used; if (skb->len > offset + used) break; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { tcp_eat_recv_skb(sk, skb); ++seq; break; } tcp_eat_recv_skb(sk, skb); } WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. */ if (left != len) tcp_cleanup_rbuf(sk, len - left); } EXPORT_SYMBOL(tcp_read_done); int tcp_peek_len(struct socket *sock) { return tcp_inq(sock->sk); } /* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */ int tcp_set_rcvlowat(struct sock *sk, int val) { struct tcp_sock *tp = tcp_sk(sk); int space, cap; if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) cap = sk->sk_rcvbuf >> 1; else cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; val = min(val, cap); WRITE_ONCE(sk->sk_rcvlowat, val ? 
: 1); /* Check if we need to signal EPOLLIN right now */ tcp_data_ready(sk); if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) return 0; space = tcp_space_from_win(sk, val); if (space > sk->sk_rcvbuf) { WRITE_ONCE(sk->sk_rcvbuf, space); if (tp->window_clamp && tp->window_clamp < val) WRITE_ONCE(tp->window_clamp, val); } return 0; } void tcp_set_rcvbuf(struct sock *sk, int val) { tcp_set_window_clamp(sk, tcp_win_from_space(sk, val)); } #ifdef CONFIG_MMU static const struct vm_operations_struct tcp_vm_ops = { }; int tcp_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { if (vma->vm_flags & (VM_WRITE | VM_EXEC)) return -EPERM; vm_flags_clear(vma, VM_MAYWRITE | VM_MAYEXEC); /* Instruct vm_insert_page() to not mmap_read_lock(mm) */ vm_flags_set(vma, VM_MIXEDMAP); vma->vm_ops = &tcp_vm_ops; return 0; } static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb, u32 *offset_frag) { skb_frag_t *frag; if (unlikely(offset_skb >= skb->len)) return NULL; offset_skb -= skb_headlen(skb); if ((int)offset_skb < 0 || skb_has_frag_list(skb)) return NULL; frag = skb_shinfo(skb)->frags; while (offset_skb) { if (skb_frag_size(frag) > offset_skb) { *offset_frag = offset_skb; return frag; } offset_skb -= skb_frag_size(frag); ++frag; } *offset_frag = 0; return frag; } static bool can_map_frag(const skb_frag_t *frag) { struct page *page; if (skb_frag_size(frag) != PAGE_SIZE || skb_frag_off(frag)) return false; page = skb_frag_page(frag); if (PageCompound(page) || page->mapping) return false; return true; } static int find_next_mappable_frag(const skb_frag_t *frag, int remaining_in_skb) { int offset = 0; if (likely(can_map_frag(frag))) return 0; while (offset < remaining_in_skb && !can_map_frag(frag)) { offset += skb_frag_size(frag); ++frag; } return offset; } static void tcp_zerocopy_set_hint_for_skb(struct sock *sk, struct tcp_zerocopy_receive *zc, struct sk_buff *skb, u32 offset) { u32 frag_offset, partial_frag_remainder = 0; int mappable_offset; skb_frag_t *frag; /* worst case: skip to next skb. try to improve on this case below */ zc->recv_skip_hint = skb->len - offset; /* Find the frag containing this offset (and how far into that frag) */ frag = skb_advance_to_frag(skb, offset, &frag_offset); if (!frag) return; if (frag_offset) { struct skb_shared_info *info = skb_shinfo(skb); /* We read part of the last frag, must recvmsg() rest of skb. */ if (frag == &info->frags[info->nr_frags - 1]) return; /* Else, we must at least read the remainder in this frag. */ partial_frag_remainder = skb_frag_size(frag) - frag_offset; zc->recv_skip_hint -= partial_frag_remainder; ++frag; } /* partial_frag_remainder: If part way through a frag, must read rest. * mappable_offset: Bytes till next mappable frag, *not* counting bytes * in partial_frag_remainder. 
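 *
 * Worked example (illustrative, assuming 4K pages): the skb carries
 * frags of 4K, 2K and 4K and offset lands 1K into the first one.
 * partial_frag_remainder is then 3K; the 2K frag cannot be mapped
 * (wrong size) while the final page-sized frag can, so
 * find_next_mappable_frag() returns 2K and recv_skip_hint ends up as
 * 2K + 3K = 5K: the caller must recvmsg() 5K before zerocopy mapping
 * can resume at the last frag.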
*/ mappable_offset = find_next_mappable_frag(frag, zc->recv_skip_hint); zc->recv_skip_hint = mappable_offset + partial_frag_remainder; } static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, int flags, struct scm_timestamping_internal *tss, int *cmsg_flags); static int receive_fallback_to_copy(struct sock *sk, struct tcp_zerocopy_receive *zc, int inq, struct scm_timestamping_internal *tss) { unsigned long copy_address = (unsigned long)zc->copybuf_address; struct msghdr msg = {}; int err; zc->length = 0; zc->recv_skip_hint = 0; if (copy_address != zc->copybuf_address) return -EINVAL; err = import_ubuf(ITER_DEST, (void __user *)copy_address, inq, &msg.msg_iter); if (err) return err; err = tcp_recvmsg_locked(sk, &msg, inq, MSG_DONTWAIT, tss, &zc->msg_flags); if (err < 0) return err; zc->copybuf_len = err; if (likely(zc->copybuf_len)) { struct sk_buff *skb; u32 offset; skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &offset); if (skb) tcp_zerocopy_set_hint_for_skb(sk, zc, skb, offset); } return 0; } static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, struct sk_buff *skb, u32 copylen, u32 *offset, u32 *seq) { unsigned long copy_address = (unsigned long)zc->copybuf_address; struct msghdr msg = {}; int err; if (copy_address != zc->copybuf_address) return -EINVAL; err = import_ubuf(ITER_DEST, (void __user *)copy_address, copylen, &msg.msg_iter); if (err) return err; err = skb_copy_datagram_msg(skb, *offset, &msg, copylen); if (err) return err; zc->recv_skip_hint -= copylen; *offset += copylen; *seq += copylen; return (__s32)copylen; } static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc, struct sock *sk, struct sk_buff *skb, u32 *seq, s32 copybuf_len, struct scm_timestamping_internal *tss) { u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint); if (!copylen) return 0; /* skb is null if inq < PAGE_SIZE. */ if (skb) { offset = *seq - TCP_SKB_CB(skb)->seq; } else { skb = tcp_recv_skb(sk, *seq, &offset); if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, tss); zc->msg_flags |= TCP_CMSG_TS; } } zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset, seq); return zc->copybuf_len < 0 ? 0 : copylen; } static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, struct page **pending_pages, unsigned long pages_remaining, unsigned long *address, u32 *length, u32 *seq, struct tcp_zerocopy_receive *zc, u32 total_bytes_to_map, int err) { /* At least one page did not map. Try zapping if we skipped earlier. */ if (err == -EBUSY && zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) { u32 maybe_zap_len; maybe_zap_len = total_bytes_to_map - /* All bytes to map */ *length + /* Mapped or pending */ (pages_remaining * PAGE_SIZE); /* Failed map. */ zap_vma_range(vma, *address, maybe_zap_len); err = 0; } if (!err) { unsigned long leftover_pages = pages_remaining; int bytes_mapped; /* We called zap_vma_range, try to reinsert. */ err = vm_insert_pages(vma, *address, pending_pages, &pages_remaining); bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining); *seq += bytes_mapped; *address += bytes_mapped; } if (err) { /* Either we were unable to zap, OR we zapped, retried an * insert, and still had an issue. Either ways, pages_remaining * is the number of pages we were unable to map, and we unroll * some state we speculatively touched before. 
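 *
 * Worked example (illustrative): if 8 pages were passed in and 3 are
 * still unmapped after the retry, bytes_not_mapped below is
 * 3 * PAGE_SIZE; *length shrinks by that amount and recv_skip_hint
 * grows by it, so those bytes fall back to the normal copy path
 * instead of being reported to the caller as mapped.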
*/ const int bytes_not_mapped = PAGE_SIZE * pages_remaining; *length -= bytes_not_mapped; zc->recv_skip_hint += bytes_not_mapped; } return err; } static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, struct page **pages, unsigned int pages_to_map, unsigned long *address, u32 *length, u32 *seq, struct tcp_zerocopy_receive *zc, u32 total_bytes_to_map) { unsigned long pages_remaining = pages_to_map; unsigned int pages_mapped; unsigned int bytes_mapped; int err; err = vm_insert_pages(vma, *address, pages, &pages_remaining); pages_mapped = pages_to_map - (unsigned int)pages_remaining; bytes_mapped = PAGE_SIZE * pages_mapped; /* Even if vm_insert_pages fails, it may have partially succeeded in * mapping (some but not all of the pages). */ *seq += bytes_mapped; *address += bytes_mapped; if (likely(!err)) return 0; /* Error: maybe zap and retry + rollback state for failed inserts. */ return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped, pages_remaining, address, length, seq, zc, total_bytes_to_map, err); } #define TCP_VALID_ZC_MSG_FLAGS (TCP_CMSG_TS) static void tcp_zc_finalize_rx_tstamp(struct sock *sk, struct tcp_zerocopy_receive *zc, struct scm_timestamping_internal *tss) { unsigned long msg_control_addr; struct msghdr cmsg_dummy; msg_control_addr = (unsigned long)zc->msg_control; cmsg_dummy.msg_control_user = (void __user *)msg_control_addr; cmsg_dummy.msg_controllen = (__kernel_size_t)zc->msg_controllen; cmsg_dummy.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0; cmsg_dummy.msg_control_is_user = true; zc->msg_flags = 0; if (zc->msg_control == msg_control_addr && zc->msg_controllen == cmsg_dummy.msg_controllen) { tcp_recv_timestamp(&cmsg_dummy, sk, tss); zc->msg_control = (__u64) ((uintptr_t)cmsg_dummy.msg_control_user); zc->msg_controllen = (__u64)cmsg_dummy.msg_controllen; zc->msg_flags = (__u32)cmsg_dummy.msg_flags; } } static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm, unsigned long address, bool *mmap_locked) { struct vm_area_struct *vma = lock_vma_under_rcu(mm, address); if (vma) { if (vma->vm_ops != &tcp_vm_ops) { vma_end_read(vma); return NULL; } *mmap_locked = false; return vma; } mmap_read_lock(mm); vma = vma_lookup(mm, address); if (!vma || vma->vm_ops != &tcp_vm_ops) { mmap_read_unlock(mm); return NULL; } *mmap_locked = true; return vma; } #define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32 static int tcp_zerocopy_receive(struct sock *sk, struct tcp_zerocopy_receive *zc, struct scm_timestamping_internal *tss) { u32 length = 0, offset, vma_len, avail_len, copylen = 0; unsigned long address = (unsigned long)zc->address; struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE]; s32 copybuf_len = zc->copybuf_len; struct tcp_sock *tp = tcp_sk(sk); const skb_frag_t *frags = NULL; unsigned int pages_to_map = 0; struct vm_area_struct *vma; struct sk_buff *skb = NULL; u32 seq = tp->copied_seq; u32 total_bytes_to_map; int inq = tcp_inq(sk); bool mmap_locked; int ret; zc->copybuf_len = 0; zc->msg_flags = 0; if (address & (PAGE_SIZE - 1) || address != zc->address) return -EINVAL; if (sk->sk_state == TCP_LISTEN) return -ENOTCONN; sock_rps_record_flow(sk); if (inq && inq <= copybuf_len) return receive_fallback_to_copy(sk, zc, inq, tss); if (inq < PAGE_SIZE) { zc->length = 0; zc->recv_skip_hint = inq; if (!inq && sock_flag(sk, SOCK_DONE)) return -EIO; return 0; } vma = find_tcp_vma(current->mm, address, &mmap_locked); if (!vma) return -EINVAL; vma_len = min_t(unsigned long, zc->length, vma->vm_end - address); avail_len = min_t(u32, vma_len, inq); total_bytes_to_map = 
avail_len & ~(PAGE_SIZE - 1); if (total_bytes_to_map) { if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT)) zap_vma_range(vma, address, total_bytes_to_map); zc->length = total_bytes_to_map; zc->recv_skip_hint = 0; } else { zc->length = avail_len; zc->recv_skip_hint = avail_len; } ret = 0; while (length + PAGE_SIZE <= zc->length) { int mappable_offset; struct page *page; if (zc->recv_skip_hint < PAGE_SIZE) { u32 offset_frag; if (skb) { if (zc->recv_skip_hint > 0) break; skb = skb->next; offset = seq - TCP_SKB_CB(skb)->seq; } else { skb = tcp_recv_skb(sk, seq, &offset); } if (!skb_frags_readable(skb)) break; if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, tss); zc->msg_flags |= TCP_CMSG_TS; } zc->recv_skip_hint = skb->len - offset; frags = skb_advance_to_frag(skb, offset, &offset_frag); if (!frags || offset_frag) break; } mappable_offset = find_next_mappable_frag(frags, zc->recv_skip_hint); if (mappable_offset) { zc->recv_skip_hint = mappable_offset; break; } page = skb_frag_page(frags); if (WARN_ON_ONCE(!page)) break; prefetchw(page); pages[pages_to_map++] = page; length += PAGE_SIZE; zc->recv_skip_hint -= PAGE_SIZE; frags++; if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE || zc->recv_skip_hint < PAGE_SIZE) { /* Either full batch, or we're about to go to next skb * (and we cannot unroll failed ops across skbs). */ ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map, &address, &length, &seq, zc, total_bytes_to_map); if (ret) goto out; pages_to_map = 0; } } if (pages_to_map) { ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map, &address, &length, &seq, zc, total_bytes_to_map); } out: if (mmap_locked) mmap_read_unlock(current->mm); else vma_end_read(vma); /* Try to copy straggler data. */ if (!ret) copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss); if (length + copylen) { WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. 
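 *
 * For reference, user space drives this zerocopy receive path roughly
 * as follows (illustrative sketch, error handling omitted): the socket
 * fd is mmap()ed read-only, then each round does
 *
 *	struct tcp_zerocopy_receive zc = {
 *		.address = (__u64)(unsigned long)addr,
 *		.length  = chunk_size,
 *	};
 *	socklen_t zc_len = sizeof(zc);
 *	getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &zc_len);
 *
 * after which zc.length bytes are mapped at addr and zc.recv_skip_hint
 * bytes, if any, must be consumed with recvmsg() before the next call.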
*/ tcp_recv_skb(sk, seq, &offset); tcp_cleanup_rbuf(sk, length + copylen); ret = 0; if (length == zc->length) zc->recv_skip_hint = 0; } else { if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE)) ret = -EIO; } zc->length = length; return ret; } #endif /* Similar to __sock_recv_timestamp, but does not require an skb */ void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, struct scm_timestamping_internal *tss) { int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); u32 tsflags = READ_ONCE(sk->sk_tsflags); if (tss->ts[0]) { if (sock_flag(sk, SOCK_RCVTSTAMP)) { struct timespec64 tv = ktime_to_timespec64(tss->ts[0]); if (sock_flag(sk, SOCK_RCVTSTAMPNS)) { if (new_tstamp) { struct __kernel_timespec kts = { .tv_sec = tv.tv_sec, .tv_nsec = tv.tv_nsec, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW, sizeof(kts), &kts); } else { struct __kernel_old_timespec ts_old = { .tv_sec = tv.tv_sec, .tv_nsec = tv.tv_nsec, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD, sizeof(ts_old), &ts_old); } } else { if (new_tstamp) { struct __kernel_sock_timeval stv = { .tv_sec = tv.tv_sec, .tv_usec = tv.tv_nsec / 1000, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW, sizeof(stv), &stv); } else { struct __kernel_old_timeval otv = { .tv_sec = tv.tv_sec, .tv_usec = tv.tv_nsec / 1000, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, sizeof(otv), &otv); } } } if (!(tsflags & SOF_TIMESTAMPING_SOFTWARE && (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE || !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER)))) tss->ts[0] = 0; } if (tss->ts[2]) { if (!(tsflags & SOF_TIMESTAMPING_RAW_HARDWARE && (tsflags & SOF_TIMESTAMPING_RX_HARDWARE || !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER)))) tss->ts[2] = 0; } if (tss->ts[0] | tss->ts[2]) { tss->ts[1] = 0; if (sock_flag(sk, SOCK_TSTAMP_NEW)) put_cmsg_scm_timestamping64(msg, tss); else put_cmsg_scm_timestamping(msg, tss); } } static int tcp_inq_hint(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); u32 copied_seq = READ_ONCE(tp->copied_seq); u32 rcv_nxt = READ_ONCE(tp->rcv_nxt); int inq; inq = rcv_nxt - copied_seq; if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) { lock_sock(sk); inq = tp->rcv_nxt - tp->copied_seq; release_sock(sk); } /* After receiving a FIN, tell the user-space to continue reading * by returning a non-zero inq. */ if (inq == 0 && sock_flag(sk, SOCK_DONE)) inq = 1; return inq; } /* batch __xa_alloc() calls and reduce xa_lock()/xa_unlock() overhead. */ struct tcp_xa_pool { u8 max; /* max <= MAX_SKB_FRAGS */ u8 idx; /* idx <= max */ __u32 tokens[MAX_SKB_FRAGS]; netmem_ref netmems[MAX_SKB_FRAGS]; }; static void tcp_xa_pool_commit_locked(struct sock *sk, struct tcp_xa_pool *p) { int i; /* Commit part that has been copied to user space. */ for (i = 0; i < p->idx; i++) __xa_cmpxchg(&sk->sk_user_frags, p->tokens[i], XA_ZERO_ENTRY, (__force void *)p->netmems[i], GFP_KERNEL); /* Rollback what has been pre-allocated and is no longer needed. 
*/ for (; i < p->max; i++) __xa_erase(&sk->sk_user_frags, p->tokens[i]); p->max = 0; p->idx = 0; } static void tcp_xa_pool_commit(struct sock *sk, struct tcp_xa_pool *p) { if (!p->max) return; xa_lock_bh(&sk->sk_user_frags); tcp_xa_pool_commit_locked(sk, p); xa_unlock_bh(&sk->sk_user_frags); } static int tcp_xa_pool_refill(struct sock *sk, struct tcp_xa_pool *p, unsigned int max_frags) { int err, k; if (p->idx < p->max) return 0; xa_lock_bh(&sk->sk_user_frags); tcp_xa_pool_commit_locked(sk, p); for (k = 0; k < max_frags; k++) { err = __xa_alloc(&sk->sk_user_frags, &p->tokens[k], XA_ZERO_ENTRY, xa_limit_31b, GFP_KERNEL); if (err) break; } xa_unlock_bh(&sk->sk_user_frags); p->max = k; p->idx = 0; return k ? 0 : err; } /* On error, returns the -errno. On success, returns number of bytes sent to the * user. May not consume all of @remaining_len. */ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, unsigned int offset, struct msghdr *msg, int remaining_len) { struct dmabuf_cmsg dmabuf_cmsg = { 0 }; struct tcp_xa_pool tcp_xa_pool; unsigned int start; int i, copy, n; int sent = 0; int err = 0; tcp_xa_pool.max = 0; tcp_xa_pool.idx = 0; do { start = skb_headlen(skb); if (skb_frags_readable(skb)) { err = -ENODEV; goto out; } /* Copy header. */ copy = start - offset; if (copy > 0) { copy = min(copy, remaining_len); n = copy_to_iter(skb->data + offset, copy, &msg->msg_iter); if (n != copy) { err = -EFAULT; goto out; } offset += copy; remaining_len -= copy; /* First a dmabuf_cmsg for # bytes copied to user * buffer. */ memset(&dmabuf_cmsg, 0, sizeof(dmabuf_cmsg)); dmabuf_cmsg.frag_size = copy; err = put_cmsg_notrunc(msg, SOL_SOCKET, SO_DEVMEM_LINEAR, sizeof(dmabuf_cmsg), &dmabuf_cmsg); if (err) goto out; sent += copy; if (remaining_len == 0) goto out; } /* after that, send information of dmabuf pages through a * sequence of cmsg */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; struct net_iov *niov; u64 frag_offset; int end; /* !skb_frags_readable() should indicate that ALL the * frags in this skb are dmabuf net_iovs. We're checking * for that flag above, but also check individual frags * here. If the tcp stack is not setting * skb_frags_readable() correctly, we still don't want * to crash here. */ if (!skb_frag_net_iov(frag)) { net_err_ratelimited("Found non-dmabuf skb with net_iov"); err = -ENODEV; goto out; } niov = skb_frag_net_iov(frag); if (!net_is_devmem_iov(niov)) { err = -ENODEV; goto out; } end = start + skb_frag_size(frag); copy = end - offset; if (copy > 0) { copy = min(copy, remaining_len); frag_offset = net_iov_virtual_addr(niov) + skb_frag_off(frag) + offset - start; dmabuf_cmsg.frag_offset = frag_offset; dmabuf_cmsg.frag_size = copy; err = tcp_xa_pool_refill(sk, &tcp_xa_pool, skb_shinfo(skb)->nr_frags - i); if (err) goto out; /* Will perform the exchange later */ dmabuf_cmsg.frag_token = tcp_xa_pool.tokens[tcp_xa_pool.idx]; dmabuf_cmsg.dmabuf_id = net_devmem_iov_binding_id(niov); offset += copy; remaining_len -= copy; err = put_cmsg_notrunc(msg, SOL_SOCKET, SO_DEVMEM_DMABUF, sizeof(dmabuf_cmsg), &dmabuf_cmsg); if (err) goto out; atomic_long_inc(&niov->desc.pp_ref_count); tcp_xa_pool.netmems[tcp_xa_pool.idx++] = skb_frag_netmem(frag); sent += copy; if (remaining_len == 0) goto out; } start = end; } tcp_xa_pool_commit(sk, &tcp_xa_pool); if (!remaining_len) goto out; /* if remaining_len is not satisfied yet, we need to go to the * next frag in the frag_list to satisfy remaining_len. 
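 *
 * Note that offset is rebased (offset -= start) so it is relative to
 * the skb we move on to, whether that is the head of the frag_list or
 * the next skb in the queue.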
*/ skb = skb_shinfo(skb)->frag_list ?: skb->next; offset = offset - start; } while (skb); if (remaining_len) { err = -EFAULT; goto out; } out: tcp_xa_pool_commit(sk, &tcp_xa_pool); if (!sent) sent = err; return sent; } /* * This routine copies from a sock struct into the user buffer. * * Technical note: in 2.3 we work on _locked_ socket, so that * tricks with *seq access order and skb->users are not required. * Probably, code can be easily improved even more. */ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, int flags, struct scm_timestamping_internal *tss, int *cmsg_flags) { struct tcp_sock *tp = tcp_sk(sk); int last_copied_dmabuf = -1; /* uninitialized */ int copied = 0; u32 peek_seq; u32 *seq; unsigned long used; int err; int target; /* Read at least this many bytes */ long timeo; struct sk_buff *skb, *last; u32 peek_offset = 0; u32 urg_hole = 0; err = -ENOTCONN; if (sk->sk_state == TCP_LISTEN) goto out; if (tp->recvmsg_inq) *cmsg_flags = TCP_CMSG_INQ; timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); /* Urgent data needs to be handled specially. */ if (flags & MSG_OOB) goto recv_urg; if (unlikely(tp->repair)) { err = -EPERM; if (!(flags & MSG_PEEK)) goto out; if (tp->repair_queue == TCP_SEND_QUEUE) goto recv_sndq; err = -EINVAL; if (tp->repair_queue == TCP_NO_QUEUE) goto out; /* 'common' recv queue MSG_PEEK-ing */ } seq = &tp->copied_seq; if (flags & MSG_PEEK) { peek_offset = max(sk_peek_offset(sk, flags), 0); peek_seq = tp->copied_seq + peek_offset; seq = &peek_seq; } target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); do { u32 offset; /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */ if (unlikely(tp->urg_data) && tp->urg_seq == *seq) { if (copied) break; if (signal_pending(current)) { copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; break; } } /* Next get a buffer. */ last = skb_peek_tail(&sk->sk_receive_queue); skb_queue_walk(&sk->sk_receive_queue, skb) { last = skb; /* Now that we have two receive queues this * shouldn't happen. */ if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n", *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags)) break; offset = *seq - TCP_SKB_CB(skb)->seq; if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { pr_err_once("%s: found a SYN, please report !\n", __func__); offset--; } if (offset < skb->len) goto found_ok_skb; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; WARN(!(flags & MSG_PEEK), "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n", *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags); } /* Well, if we have backlog, try to process it now yet. */ if (copied >= target && !READ_ONCE(sk->sk_backlog.tail)) break; if (copied) { if (!timeo || tcp_recv_should_stop(sk)) break; } else { if (sock_flag(sk, SOCK_DONE)) break; if (sk->sk_err) { copied = sock_error(sk); break; } if (sk->sk_shutdown & RCV_SHUTDOWN) break; if (sk->sk_state == TCP_CLOSE) { /* This occurs when user tries to read * from never connected socket. */ copied = -ENOTCONN; break; } if (!timeo) { copied = -EAGAIN; break; } if (signal_pending(current)) { copied = sock_intr_errno(timeo); break; } } if (copied >= target) { /* Do not sleep, just process backlog. */ __sk_flush_backlog(sk); } else { tcp_cleanup_rbuf(sk, copied); err = sk_wait_data(sk, &timeo, last); if (err < 0) { err = copied ? 
: err; goto out; } } if ((flags & MSG_PEEK) && (peek_seq - peek_offset - copied - urg_hole != tp->copied_seq)) { net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n", current->comm, task_pid_nr(current)); peek_seq = tp->copied_seq + peek_offset; } continue; found_ok_skb: /* Ok so how much can we use? */ used = skb->len - offset; if (len < used) used = len; /* Do we have urgent data here? */ if (unlikely(tp->urg_data)) { u32 urg_offset = tp->urg_seq - *seq; if (urg_offset < used) { if (!urg_offset) { if (!sock_flag(sk, SOCK_URGINLINE)) { WRITE_ONCE(*seq, *seq + 1); urg_hole++; offset++; used--; if (!used) goto skip_copy; } } else used = urg_offset; } } if (!(flags & MSG_TRUNC)) { if (last_copied_dmabuf != -1 && last_copied_dmabuf != !skb_frags_readable(skb)) break; if (skb_frags_readable(skb)) { err = skb_copy_datagram_msg(skb, offset, msg, used); if (err) { /* Exception. Bailout! */ if (!copied) copied = -EFAULT; break; } } else { if (!(flags & MSG_SOCK_DEVMEM)) { /* dmabuf skbs can only be received * with the MSG_SOCK_DEVMEM flag. */ if (!copied) copied = -EFAULT; break; } err = tcp_recvmsg_dmabuf(sk, skb, offset, msg, used); if (err < 0) { if (!copied) copied = err; break; } used = err; } } last_copied_dmabuf = !skb_frags_readable(skb); WRITE_ONCE(*seq, *seq + used); copied += used; len -= used; if (flags & MSG_PEEK) sk_peek_offset_fwd(sk, used); else sk_peek_offset_bwd(sk, used); tcp_rcv_space_adjust(sk); skip_copy: if (unlikely(tp->urg_data) && after(tp->copied_seq, tp->urg_seq)) { WRITE_ONCE(tp->urg_data, 0); tcp_fast_path_check(sk); } if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, tss); *cmsg_flags |= TCP_CMSG_TS; } if (used + offset < skb->len) continue; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; if (!(flags & MSG_PEEK)) tcp_eat_recv_skb(sk, skb); continue; found_fin_ok: /* Process the FIN. */ WRITE_ONCE(*seq, *seq + 1); if (!(flags & MSG_PEEK)) tcp_eat_recv_skb(sk, skb); break; } while (len > 0); /* According to UNIX98, msg_name/msg_namelen are ignored * on connected socket. I was just happy when found this 8) --ANK */ /* Clean up data we have read: This will do ACK frames. */ tcp_cleanup_rbuf(sk, copied); return copied; out: return err; recv_urg: err = tcp_recv_urg(sk, msg, len, flags); goto out; recv_sndq: err = tcp_peek_sndq(sk, msg, len); goto out; } int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { int cmsg_flags = 0, ret; struct scm_timestamping_internal tss; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len); if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) && sk->sk_state == TCP_ESTABLISHED) sk_busy_loop(sk, flags & MSG_DONTWAIT); lock_sock(sk); ret = tcp_recvmsg_locked(sk, msg, len, flags, &tss, &cmsg_flags); release_sock(sk); if ((cmsg_flags | msg->msg_get_inq) && ret >= 0) { if (cmsg_flags & TCP_CMSG_TS) tcp_recv_timestamp(msg, sk, &tss); if ((cmsg_flags & TCP_CMSG_INQ) | msg->msg_get_inq) { msg->msg_inq = tcp_inq_hint(sk); if (cmsg_flags & TCP_CMSG_INQ) put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(msg->msg_inq), &msg->msg_inq); } } return ret; } void tcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; /* We defined a new enum for TCP states that are exported in BPF * so as not force the internal TCP states to be frozen. The * following checks will detect if an internal state value ever * differs from the BPF value. 
If this ever happens, then we will * need to remap the internal value to the BPF value before calling * tcp_call_bpf_2arg. */ BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED); BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT); BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV); BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1); BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2); BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT); BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE); BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT); BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK); BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN); BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING); BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV); BUILD_BUG_ON((int)BPF_TCP_BOUND_INACTIVE != (int)TCP_BOUND_INACTIVE); BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES); /* bpf uapi header bpf.h defines an anonymous enum with values * BPF_TCP_* used by bpf programs. Currently gcc built vmlinux * is able to emit this enum in DWARF due to the above BUILD_BUG_ON. * But clang built vmlinux does not have this enum in DWARF * since clang removes the above code before generating IR/debuginfo. * Let us explicitly emit the type debuginfo to ensure the * above-mentioned anonymous enum in the vmlinux DWARF and hence BTF * regardless of which compiler is used. */ BTF_TYPE_EMIT_ENUM(BPF_TCP_ESTABLISHED); if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG)) tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state); switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); break; case TCP_CLOSE_WAIT: if (oldstate == TCP_SYN_RECV) TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); break; case TCP_CLOSE: if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED) TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS); sk->sk_prot->unhash(sk); if (inet_csk(sk)->icsk_bind_hash && !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) inet_put_port(sk); fallthrough; default: if (oldstate == TCP_ESTABLISHED || oldstate == TCP_CLOSE_WAIT) TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); } /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. */ inet_sk_state_store(sk, state); } EXPORT_SYMBOL_GPL(tcp_set_state); /* * State processing on a close. This implements the state shift for * sending our FIN frame. Note that we only send a FIN for some * states. A shutdown() may have already sent the FIN, or we may be * closed. */ static const unsigned char new_state[16] = { /* current state: new state: action: */ [0 /* (Invalid) */] = TCP_CLOSE, [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, [TCP_SYN_SENT] = TCP_CLOSE, [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, [TCP_FIN_WAIT1] = TCP_FIN_WAIT1, [TCP_FIN_WAIT2] = TCP_FIN_WAIT2, [TCP_TIME_WAIT] = TCP_CLOSE, [TCP_CLOSE] = TCP_CLOSE, [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN, [TCP_LAST_ACK] = TCP_LAST_ACK, [TCP_LISTEN] = TCP_CLOSE, [TCP_CLOSING] = TCP_CLOSING, [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ }; static int tcp_close_state(struct sock *sk) { int next = (int)new_state[sk->sk_state]; int ns = next & TCP_STATE_MASK; tcp_set_state(sk, ns); return next & TCP_ACTION_FIN; } /* * Shutdown the sending side of a connection. Much like close except * that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD). 
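 *
 * The user-space counterpart is the classic half-close pattern
 * (illustrative sketch; handle_data() is a placeholder):
 *
 *	shutdown(fd, SHUT_WR);
 *	while ((n = read(fd, buf, sizeof(buf))) > 0)
 *		handle_data(buf, n);
 *
 * Our FIN is queued immediately, but the receive side keeps working
 * until the peer sends its own FIN and read() returns 0.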
*/ void tcp_shutdown(struct sock *sk, int how) { /* We need to grab some memory, and put together a FIN, * and then put it into the queue to be sent. * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92. */ if (!(how & SEND_SHUTDOWN)) return; /* If we've already sent a FIN, or it's a closed state, skip this. */ if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_SYN_SENT | TCPF_CLOSE_WAIT)) { /* Clear out any half completed packets. FIN if needed. */ if (tcp_close_state(sk)) tcp_send_fin(sk); } } int tcp_orphan_count_sum(void) { int i, total = 0; for_each_possible_cpu(i) total += per_cpu(tcp_orphan_count, i); return max(total, 0); } static int tcp_orphan_cache; static struct timer_list tcp_orphan_timer; #define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100) static void tcp_orphan_update(struct timer_list *unused) { WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum()); mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD); } static bool tcp_too_many_orphans(int shift) { return READ_ONCE(tcp_orphan_cache) << shift > READ_ONCE(sysctl_tcp_max_orphans); } static bool tcp_out_of_memory(const struct sock *sk) { if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2)) return true; return false; } bool tcp_check_oom(const struct sock *sk, int shift) { bool too_many_orphans, out_of_socket_memory; too_many_orphans = tcp_too_many_orphans(shift); out_of_socket_memory = tcp_out_of_memory(sk); if (too_many_orphans) net_info_ratelimited("too many orphaned sockets\n"); if (out_of_socket_memory) net_info_ratelimited("out of memory -- consider tuning tcp_mem\n"); return too_many_orphans || out_of_socket_memory; } void __tcp_close(struct sock *sk, long timeout) { bool data_was_unread = false; struct sk_buff *skb; int state; WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if (sk->sk_state == TCP_LISTEN) { tcp_set_state(sk, TCP_CLOSE); /* Special case. */ inet_csk_listen_stop(sk); goto adjudge_to_death; } /* We need to flush the recv. buffs. We do this only on the * descriptor close, not protocol-sourced closes, because the * reader process may not have drained the data yet! */ while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) end_seq--; if (after(end_seq, tcp_sk(sk)->copied_seq)) data_was_unread = true; tcp_eat_recv_skb(sk, skb); } /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */ if (sk->sk_state == TCP_CLOSE) goto adjudge_to_death; /* As outlined in RFC 2525, section 2.17, we send a RST here because * data was lost. To witness the awful effects of the old behavior of * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk * GET in an FTP client, suspend the process, wait for the client to * advertise a zero window, then kill -9 the FTP client, wheee... * Note: timeout is always zero in such a case. */ if (unlikely(tcp_sk(sk)->repair)) { sk->sk_prot->disconnect(sk, 0); } else if (data_was_unread) { /* Unread data was tossed, zap the connection. */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, sk->sk_allocation, SK_RST_REASON_TCP_ABORT_ON_CLOSE); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); } else if (tcp_close_state(sk)) { /* We FIN if the application ate all the data before * zapping the connection. 
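 *
 * For reference: the earlier data_was_unread branch is why closing a
 * socket with unread receive data aborts the connection with a reset
 * (RFC 2525 2.17, counted as LINUX_MIB_TCPABORTONCLOSE), and the
 * SOCK_LINGER branch is what an application selects with
 * setsockopt(SO_LINGER) and { .l_onoff = 1, .l_linger = 0 }: close()
 * then aborts instead of waiting for the FIN handshake.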
*/ /* RED-PEN. Formally speaking, we have broken TCP state * machine. State transitions: * * TCP_ESTABLISHED -> TCP_FIN_WAIT1 * TCP_SYN_RECV -> TCP_FIN_WAIT1 (it is difficult) * TCP_CLOSE_WAIT -> TCP_LAST_ACK * * are legal only when FIN has been sent (i.e. in window), * rather than queued out of window. Purists blame. * * F.e. "RFC state" is ESTABLISHED, * if Linux state is FIN-WAIT-1, but FIN is still not sent. * * The visible declinations are that sometimes * we enter time-wait state, when it is not required really * (harmless), do not send active resets, when they are * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when * they look as CLOSING or LAST_ACK for Linux) * Probably, I missed some more holelets. * --ANK * XXX (TFO) - To start off we don't support SYN+ACK+FIN * in a single packet! (May consider it later but will * probably need API support or TCP_CORK SYN-ACK until * data is written and socket is closed.) */ tcp_send_fin(sk); } sk_stream_wait_close(sk, timeout); adjudge_to_death: state = sk->sk_state; sock_hold(sk); sock_orphan(sk); local_bh_disable(); bh_lock_sock(sk); /* remove backlog if any, without releasing ownership. */ __release_sock(sk); tcp_orphan_count_inc(); /* Have we already been destroyed by a softirq or backlog? */ if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) goto out; /* This is a (useful) BSD violating of the RFC. There is a * problem with TCP as specified in that the other end could * keep a socket open forever with no application left this end. * We use a 1 minute timeout (about the same as BSD) then kill * our end. If they send after that then tough - BUT: long enough * that we won't make the old 4*rto = almost no time - whoops * reset mistake. * * Nope, it was not mistake. It is really desired behaviour * f.e. on http servers, when such sockets are useless, but * consume significant resources. Let's do it with special * linger2 option. --ANK */ if (sk->sk_state == TCP_FIN_WAIT2) { struct tcp_sock *tp = tcp_sk(sk); if (READ_ONCE(tp->linger2) < 0) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC, SK_RST_REASON_TCP_ABORT_ON_LINGER); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONLINGER); } else { const int tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto out; } } } if (sk->sk_state != TCP_CLOSE) { if (tcp_check_oom(sk, 0)) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC, SK_RST_REASON_TCP_ABORT_ON_MEMORY); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); } else if (!check_net(sock_net(sk))) { /* Not possible to send reset; just close */ tcp_set_state(sk, TCP_CLOSE); } } if (sk->sk_state == TCP_CLOSE) { struct request_sock *req; req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, lockdep_sock_is_held(sk)); /* We could get here with a non-NULL req if the socket is * aborted (e.g., closed with unread data) before 3WHS * finishes. */ if (req) reqsk_fastopen_remove(sk, req, false); inet_csk_destroy_sock(sk); } /* Otherwise, socket is reprieved until protocol close. 
*/ out: bh_unlock_sock(sk); local_bh_enable(); } void tcp_close(struct sock *sk, long timeout) { lock_sock(sk); __tcp_close(sk, timeout); release_sock(sk); if (!sk->sk_net_refcnt) inet_csk_clear_xmit_timers_sync(sk); sock_put(sk); } EXPORT_SYMBOL(tcp_close); /* These states need RST on ABORT according to RFC793 */ static inline bool tcp_need_reset(int state) { return (1 << state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_SYN_RECV); } static void tcp_rtx_queue_purge(struct sock *sk) { struct rb_node *p = rb_first(&sk->tcp_rtx_queue); tcp_sk(sk)->highest_sack = NULL; while (p) { struct sk_buff *skb = rb_to_skb(p); p = rb_next(p); /* Since we are deleting whole queue, no need to * list_del(&skb->tcp_tsorted_anchor) */ tcp_rtx_queue_unlink(skb, sk); tcp_wmem_free_skb(sk, skb); } } void tcp_write_queue_purge(struct sock *sk) { struct sk_buff *skb; tcp_chrono_stop(sk, TCP_CHRONO_BUSY); while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { tcp_skb_tsorted_anchor_cleanup(skb); tcp_wmem_free_skb(sk, skb); } tcp_rtx_queue_purge(sk); INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); tcp_clear_all_retrans_hints(tcp_sk(sk)); tcp_sk(sk)->packets_out = 0; inet_csk(sk)->icsk_backoff = 0; } int tcp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int old_state = sk->sk_state; struct request_sock *req; u32 seq; if (old_state != TCP_CLOSE) tcp_set_state(sk, TCP_CLOSE); /* ABORT function of RFC793 */ if (old_state == TCP_LISTEN) { inet_csk_listen_stop(sk); } else if (unlikely(tp->repair)) { WRITE_ONCE(sk->sk_err, ECONNABORTED); } else if (tcp_need_reset(old_state)) { tcp_send_active_reset(sk, gfp_any(), SK_RST_REASON_TCP_STATE); WRITE_ONCE(sk->sk_err, ECONNRESET); } else if (tp->snd_nxt != tp->write_seq && (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) { /* The last check adjusts for discrepancy of Linux wrt. 
RFC * states */ tcp_send_active_reset(sk, gfp_any(), SK_RST_REASON_TCP_DISCONNECT_WITH_DATA); WRITE_ONCE(sk->sk_err, ECONNRESET); } else if (old_state == TCP_SYN_SENT) WRITE_ONCE(sk->sk_err, ECONNRESET); tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); WRITE_ONCE(tp->urg_data, 0); sk_set_peek_off(sk, -1); tcp_write_queue_purge(sk); tcp_fastopen_active_disable_ofo_check(sk); skb_rbtree_purge(&tp->out_of_order_queue); inet->inet_dport = 0; inet_bhash2_reset_saddr(sk); WRITE_ONCE(sk->sk_shutdown, 0); sock_reset_flag(sk, SOCK_DONE); tp->srtt_us = 0; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); tp->rcv_rtt_last_tsecr = 0; seq = tp->write_seq + tp->max_window + 2; if (!seq) seq = 1; WRITE_ONCE(tp->write_seq, seq); icsk->icsk_backoff = 0; WRITE_ONCE(icsk->icsk_probes_out, 0); icsk->icsk_probes_tstamp = 0; icsk->icsk_rto = TCP_TIMEOUT_INIT; WRITE_ONCE(icsk->icsk_rto_min, TCP_RTO_MIN); WRITE_ONCE(icsk->icsk_delack_max, TCP_DELACK_MAX); tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tcp_snd_cwnd_set(tp, TCP_INIT_CWND); tp->snd_cwnd_cnt = 0; tp->is_cwnd_limited = 0; tp->max_packets_out = 0; tp->window_clamp = 0; tp->delivered = 0; tp->delivered_ce = 0; tp->accecn_fail_mode = 0; tp->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN; tcp_accecn_init_counters(tp); tp->prev_ecnfield = 0; tp->accecn_opt_tstamp = 0; tp->pkts_acked_ewma = 0; if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); icsk->icsk_ca_initialized = 0; tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; tcp_clear_retrans(tp); tp->total_retrans = 0; inet_csk_delack_init(sk); /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 * issue in __tcp_select_window() */ icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); dst_release(unrcu_pointer(xchg(&sk->sk_rx_dst, NULL))); tcp_saved_syn_free(tp); tp->compressed_ack = 0; tp->segs_in = 0; tp->segs_out = 0; tp->bytes_sent = 0; tp->bytes_acked = 0; tp->bytes_received = 0; tp->bytes_retrans = 0; tp->data_segs_in = 0; tp->data_segs_out = 0; tp->duplicate_sack[0].start_seq = 0; tp->duplicate_sack[0].end_seq = 0; tp->dsack_dups = 0; tp->reord_seen = 0; tp->retrans_out = 0; tp->sacked_out = 0; tp->tlp_high_seq = 0; tp->last_oow_ack_time = 0; tp->plb_rehash = 0; /* There's a bubble in the pipe until at least the first ACK. 
*/ tp->app_limited = ~0U; tp->rate_app_limited = 1; tp->rack.mstamp = 0; tp->rack.advanced = 0; tp->rack.reo_wnd_steps = 1; tp->rack.last_delivered = 0; tp->rack.reo_wnd_persist = 0; tp->rack.dsack_seen = 0; tp->syn_data_acked = 0; tp->syn_fastopen_child = 0; tp->rx_opt.saw_tstamp = 0; tp->rx_opt.dsack = 0; tp->rx_opt.num_sacks = 0; tp->rcv_ooopack = 0; /* Clean up fastopen related fields */ req = rcu_dereference_protected(tp->fastopen_rsk, lockdep_sock_is_held(sk)); if (req) reqsk_fastopen_remove(sk, req, false); tcp_free_fastopen_req(tp); inet_clear_bit(DEFER_CONNECT, sk); tp->fastopen_client_fail = 0; WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); if (sk->sk_frag.page) { put_page(sk->sk_frag.page); sk->sk_frag.page = NULL; sk->sk_frag.offset = 0; } sk_error_report(sk); return 0; } EXPORT_SYMBOL(tcp_disconnect); static inline bool tcp_can_repair_sock(const struct sock *sk) { return sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && (sk->sk_state != TCP_LISTEN); } static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len) { struct tcp_repair_window opt; if (!tp->repair) return -EPERM; if (len != sizeof(opt)) return -EINVAL; if (copy_from_sockptr(&opt, optbuf, sizeof(opt))) return -EFAULT; if (opt.max_window < opt.snd_wnd) return -EINVAL; if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd)) return -EINVAL; if (after(opt.rcv_wup, tp->rcv_nxt)) return -EINVAL; tp->snd_wl1 = opt.snd_wl1; tp->snd_wnd = opt.snd_wnd; tp->max_window = opt.max_window; tp->rcv_wnd = opt.rcv_wnd; tp->rcv_wup = opt.rcv_wup; tp->rcv_mwnd_seq = opt.rcv_wup + opt.rcv_wnd; return 0; } static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf, unsigned int len) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_repair_opt opt; size_t offset = 0; while (len >= sizeof(opt)) { if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt))) return -EFAULT; offset += sizeof(opt); len -= sizeof(opt); switch (opt.opt_code) { case TCPOPT_MSS: tp->rx_opt.mss_clamp = opt.opt_val; tcp_mtup_init(sk); break; case TCPOPT_WINDOW: { u16 snd_wscale = opt.opt_val & 0xFFFF; u16 rcv_wscale = opt.opt_val >> 16; if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE) return -EFBIG; tp->rx_opt.snd_wscale = snd_wscale; tp->rx_opt.rcv_wscale = rcv_wscale; tp->rx_opt.wscale_ok = 1; } break; case TCPOPT_SACK_PERM: if (opt.opt_val != 0) return -EINVAL; tp->rx_opt.sack_ok |= TCP_SACK_SEEN; break; case TCPOPT_TIMESTAMP: if (opt.opt_val != 0) return -EINVAL; tp->rx_opt.tstamp_ok = 1; break; } } return 0; } DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled); static void tcp_enable_tx_delay(struct sock *sk, int val) { struct tcp_sock *tp = tcp_sk(sk); s32 delta = (val - tp->tcp_tx_delay) << 3; if (val && !static_branch_unlikely(&tcp_tx_delay_enabled)) { static int __tcp_tx_delay_enabled = 0; if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) { static_branch_enable(&tcp_tx_delay_enabled); pr_info("TCP_TX_DELAY enabled\n"); } } /* If we change tcp_tx_delay on a live flow, adjust tp->srtt_us, * tp->rtt_min, icsk_rto and sk->sk_pacing_rate. * This is best effort. */ if (delta && sk->sk_state == TCP_ESTABLISHED) { s64 srtt = (s64)tp->srtt_us + delta; tp->srtt_us = clamp_t(s64, srtt, 1, ~0U); /* Note: does not deal with non zero icsk_backoff */ tcp_set_rto(sk); minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); tcp_update_pacing_rate(sk); } } /* When set indicates to always queue non-full frames. Later the user clears * this option and we transmit any pending partial frames in the queue. 
This is * meant to be used alongside sendfile() to get properly filled frames when the * user (for example) must write out headers with a write() call first and then * use sendfile to send out the data parts. * * TCP_CORK can be set together with TCP_NODELAY and it is stronger than * TCP_NODELAY. */ void __tcp_sock_set_cork(struct sock *sk, bool on) { struct tcp_sock *tp = tcp_sk(sk); if (on) { tp->nonagle |= TCP_NAGLE_CORK; } else { tp->nonagle &= ~TCP_NAGLE_CORK; if (tp->nonagle & TCP_NAGLE_OFF) tp->nonagle |= TCP_NAGLE_PUSH; tcp_push_pending_frames(sk); } } void tcp_sock_set_cork(struct sock *sk, bool on) { lock_sock(sk); __tcp_sock_set_cork(sk, on); release_sock(sk); } EXPORT_SYMBOL(tcp_sock_set_cork); /* TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket is * remembered, but it is not activated until cork is cleared. * * However, when TCP_NODELAY is set we make an explicit push, which overrides * even TCP_CORK for currently queued segments. */ void __tcp_sock_set_nodelay(struct sock *sk, bool on) { if (on) { tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; tcp_push_pending_frames(sk); } else { tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF; } } void tcp_sock_set_nodelay(struct sock *sk) { lock_sock(sk); __tcp_sock_set_nodelay(sk, true); release_sock(sk); } EXPORT_SYMBOL(tcp_sock_set_nodelay); static void __tcp_sock_set_quickack(struct sock *sk, int val) { if (!val) { inet_csk_enter_pingpong_mode(sk); return; } inet_csk_exit_pingpong_mode(sk); if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && inet_csk_ack_scheduled(sk)) { inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED; tcp_cleanup_rbuf(sk, 1); if (!(val & 1)) inet_csk_enter_pingpong_mode(sk); } } void tcp_sock_set_quickack(struct sock *sk, int val) { lock_sock(sk); __tcp_sock_set_quickack(sk, val); release_sock(sk); } EXPORT_SYMBOL(tcp_sock_set_quickack); int tcp_sock_set_syncnt(struct sock *sk, int val) { if (val < 1 || val > MAX_TCP_SYNCNT) return -EINVAL; WRITE_ONCE(inet_csk(sk)->icsk_syn_retries, val); return 0; } EXPORT_SYMBOL(tcp_sock_set_syncnt); int tcp_sock_set_user_timeout(struct sock *sk, int val) { /* Cap the max time in ms TCP will retry or probe the window * before giving up and aborting (ETIMEDOUT) a connection. 
*/ if (val < 0) return -EINVAL; WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val); return 0; } EXPORT_SYMBOL(tcp_sock_set_user_timeout); int tcp_sock_set_keepidle_locked(struct sock *sk, int val) { struct tcp_sock *tp = tcp_sk(sk); if (val < 1 || val > MAX_TCP_KEEPIDLE) return -EINVAL; /* Paired with WRITE_ONCE() in keepalive_time_when() */ WRITE_ONCE(tp->keepalive_time, val * HZ); if (sock_flag(sk, SOCK_KEEPOPEN) && !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { u32 elapsed = keepalive_time_elapsed(tp); if (tp->keepalive_time > elapsed) elapsed = tp->keepalive_time - elapsed; else elapsed = 0; tcp_reset_keepalive_timer(sk, elapsed); } return 0; } int tcp_sock_set_keepidle(struct sock *sk, int val) { int err; lock_sock(sk); err = tcp_sock_set_keepidle_locked(sk, val); release_sock(sk); return err; } EXPORT_SYMBOL(tcp_sock_set_keepidle); int tcp_sock_set_keepintvl(struct sock *sk, int val) { if (val < 1 || val > MAX_TCP_KEEPINTVL) return -EINVAL; WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ); return 0; } EXPORT_SYMBOL(tcp_sock_set_keepintvl); int tcp_sock_set_keepcnt(struct sock *sk, int val) { if (val < 1 || val > MAX_TCP_KEEPCNT) return -EINVAL; /* Paired with READ_ONCE() in keepalive_probes() */ WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val); return 0; } EXPORT_SYMBOL(tcp_sock_set_keepcnt); int tcp_set_window_clamp(struct sock *sk, int val) { u32 old_window_clamp, new_window_clamp, new_rcv_ssthresh; struct tcp_sock *tp = tcp_sk(sk); if (!val) { if (sk->sk_state != TCP_CLOSE) return -EINVAL; WRITE_ONCE(tp->window_clamp, 0); return 0; } old_window_clamp = tp->window_clamp; new_window_clamp = max_t(int, SOCK_MIN_RCVBUF / 2, val); if (new_window_clamp == old_window_clamp) return 0; WRITE_ONCE(tp->window_clamp, new_window_clamp); /* Need to apply the reserved mem provisioning only * when shrinking the window clamp. */ if (new_window_clamp < old_window_clamp) { __tcp_adjust_rcv_ssthresh(sk, new_window_clamp); } else { new_rcv_ssthresh = min(tp->rcv_wnd, new_window_clamp); tp->rcv_ssthresh = max(new_rcv_ssthresh, tp->rcv_ssthresh); } return 0; } int tcp_sock_set_maxseg(struct sock *sk, int val) { /* Values greater than interface MTU won't take effect. However * at the point when this call is done we typically don't yet * know which interface is going to be used */ if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) return -EINVAL; WRITE_ONCE(tcp_sk(sk)->rx_opt.user_mss, val); return 0; } /* * Socket option code for TCP. 
*/ int do_tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct net *net = sock_net(sk); int val; int err = 0; /* These are data/string values, all the others are ints */ switch (optname) { case TCP_CONGESTION: { char name[TCP_CA_NAME_MAX]; if (optlen < 1) return -EINVAL; val = strncpy_from_sockptr(name, optval, min_t(long, TCP_CA_NAME_MAX-1, optlen)); if (val < 0) return -EFAULT; name[val] = 0; sockopt_lock_sock(sk); err = tcp_set_congestion_control(sk, name, !has_current_bpf_ctx(), sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)); sockopt_release_sock(sk); return err; } case TCP_ULP: { char name[TCP_ULP_NAME_MAX]; if (optlen < 1) return -EINVAL; val = strncpy_from_sockptr(name, optval, min_t(long, TCP_ULP_NAME_MAX - 1, optlen)); if (val < 0) return -EFAULT; name[val] = 0; sockopt_lock_sock(sk); err = tcp_set_ulp(sk, name); sockopt_release_sock(sk); return err; } case TCP_FASTOPEN_KEY: { __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH]; __u8 *backup_key = NULL; /* Allow a backup key as well to facilitate key rotation * First key is the active one. */ if (optlen != TCP_FASTOPEN_KEY_LENGTH && optlen != TCP_FASTOPEN_KEY_BUF_LENGTH) return -EINVAL; if (copy_from_sockptr(key, optval, optlen)) return -EFAULT; if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH) backup_key = key + TCP_FASTOPEN_KEY_LENGTH; return tcp_fastopen_reset_cipher(net, sk, key, backup_key); } default: /* fallthru */ break; } if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; /* Handle options that can be set without locking the socket. */ switch (optname) { case TCP_SYNCNT: return tcp_sock_set_syncnt(sk, val); case TCP_USER_TIMEOUT: return tcp_sock_set_user_timeout(sk, val); case TCP_KEEPINTVL: return tcp_sock_set_keepintvl(sk, val); case TCP_KEEPCNT: return tcp_sock_set_keepcnt(sk, val); case TCP_LINGER2: if (val < 0) WRITE_ONCE(tp->linger2, -1); else if (val > TCP_FIN_TIMEOUT_MAX / HZ) WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX); else WRITE_ONCE(tp->linger2, val * HZ); return 0; case TCP_DEFER_ACCEPT: /* Translate value in seconds to number of retransmits */ WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept, secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ)); return 0; case TCP_RTO_MAX_MS: if (val < MSEC_PER_SEC || val > TCP_RTO_MAX_SEC * MSEC_PER_SEC) return -EINVAL; WRITE_ONCE(inet_csk(sk)->icsk_rto_max, msecs_to_jiffies(val)); return 0; case TCP_RTO_MIN_US: { int rto_min = usecs_to_jiffies(val); if (rto_min > TCP_RTO_MIN || rto_min < TCP_TIMEOUT_MIN) return -EINVAL; WRITE_ONCE(inet_csk(sk)->icsk_rto_min, rto_min); return 0; } case TCP_DELACK_MAX_US: { int delack_max = usecs_to_jiffies(val); if (delack_max > TCP_DELACK_MAX || delack_max < TCP_TIMEOUT_MIN) return -EINVAL; WRITE_ONCE(inet_csk(sk)->icsk_delack_max, delack_max); return 0; } case TCP_MAXSEG: return tcp_sock_set_maxseg(sk, val); } sockopt_lock_sock(sk); switch (optname) { case TCP_NODELAY: __tcp_sock_set_nodelay(sk, val); break; case TCP_THIN_LINEAR_TIMEOUTS: if (val < 0 || val > 1) err = -EINVAL; else tp->thin_lto = val; break; case TCP_THIN_DUPACK: if (val < 0 || val > 1) err = -EINVAL; break; case TCP_REPAIR: if (!tcp_can_repair_sock(sk)) err = -EPERM; else if (val == TCP_REPAIR_ON) { tp->repair = 1; sk->sk_reuse = SK_FORCE_REUSE; tp->repair_queue = TCP_NO_QUEUE; } else if (val == TCP_REPAIR_OFF) { tp->repair = 0; sk->sk_reuse = SK_NO_REUSE; 
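/* Illustrative userspace sketch (not part of this kernel file): roughly how a
 * checkpoint/restore tool is expected to drive the TCP_REPAIR machinery
 * handled in this switch. It assumes "fd" is a fresh, unconnected TCP socket,
 * CAP_NET_ADMIN, the TCP_REPAIR_* constants from a recent uapi linux/tcp.h,
 * and a placeholder saved_snd_nxt captured at checkpoint time; errors are not
 * checked.
 *
 *	int on = TCP_REPAIR_ON, off = TCP_REPAIR_OFF, q = TCP_SEND_QUEUE;
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_REPAIR, &on, sizeof(on));
 *	setsockopt(fd, IPPROTO_TCP, TCP_REPAIR_QUEUE, &q, sizeof(q));
 *	setsockopt(fd, IPPROTO_TCP, TCP_QUEUE_SEQ,
 *		   &saved_snd_nxt, sizeof(saved_snd_nxt));
 *	// ... bind(), connect() (no handshake while in repair mode),
 *	// restore queued data, TCP_REPAIR_OPTIONS ...
 *	setsockopt(fd, IPPROTO_TCP, TCP_REPAIR, &off, sizeof(off));
 */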
tcp_send_window_probe(sk); } else if (val == TCP_REPAIR_OFF_NO_WP) { tp->repair = 0; sk->sk_reuse = SK_NO_REUSE; } else err = -EINVAL; break; case TCP_REPAIR_QUEUE: if (!tp->repair) err = -EPERM; else if ((unsigned int)val < TCP_QUEUES_NR) tp->repair_queue = val; else err = -EINVAL; break; case TCP_QUEUE_SEQ: if (sk->sk_state != TCP_CLOSE) { err = -EPERM; } else if (tp->repair_queue == TCP_SEND_QUEUE) { if (!tcp_rtx_queue_empty(sk)) err = -EPERM; else WRITE_ONCE(tp->write_seq, val); } else if (tp->repair_queue == TCP_RECV_QUEUE) { if (tp->rcv_nxt != tp->copied_seq) { err = -EPERM; } else { WRITE_ONCE(tp->rcv_nxt, val); WRITE_ONCE(tp->copied_seq, val); } } else { err = -EINVAL; } break; case TCP_REPAIR_OPTIONS: if (!tp->repair) err = -EINVAL; else if (sk->sk_state == TCP_ESTABLISHED && !tp->bytes_sent) err = tcp_repair_options_est(sk, optval, optlen); else err = -EPERM; break; case TCP_CORK: __tcp_sock_set_cork(sk, val); break; case TCP_KEEPIDLE: err = tcp_sock_set_keepidle_locked(sk, val); break; case TCP_SAVE_SYN: /* 0: disable, 1: enable, 2: start from ether_header */ if (val < 0 || val > 2) err = -EINVAL; else tp->save_syn = val; break; case TCP_WINDOW_CLAMP: err = tcp_set_window_clamp(sk, val); break; case TCP_QUICKACK: __tcp_sock_set_quickack(sk, val); break; case TCP_AO_REPAIR: if (!tcp_can_repair_sock(sk)) { err = -EPERM; break; } err = tcp_ao_set_repair(sk, optval, optlen); break; #ifdef CONFIG_TCP_AO case TCP_AO_ADD_KEY: case TCP_AO_DEL_KEY: case TCP_AO_INFO: { /* If this is the first TCP-AO setsockopt() on the socket, * sk_state has to be LISTEN or CLOSE. Allow TCP_REPAIR * in any state. */ if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) goto ao_parse; if (rcu_dereference_protected(tcp_sk(sk)->ao_info, lockdep_sock_is_held(sk))) goto ao_parse; if (tp->repair) goto ao_parse; err = -EISCONN; break; ao_parse: err = tp->af_specific->ao_parse(sk, optname, optval, optlen); break; } #endif #ifdef CONFIG_TCP_MD5SIG case TCP_MD5SIG: case TCP_MD5SIG_EXT: err = tp->af_specific->md5_parse(sk, optname, optval, optlen); break; #endif case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { tcp_fastopen_init_key_once(net); fastopen_queue_tune(sk, val); } else { err = -EINVAL; } break; case TCP_FASTOPEN_CONNECT: if (val > 1 || val < 0) { err = -EINVAL; } else if (READ_ONCE(net->ipv4.sysctl_tcp_fastopen) & TFO_CLIENT_ENABLE) { if (sk->sk_state == TCP_CLOSE) tp->fastopen_connect = val; else err = -EINVAL; } else { err = -EOPNOTSUPP; } break; case TCP_FASTOPEN_NO_COOKIE: if (val > 1 || val < 0) err = -EINVAL; else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) err = -EINVAL; else tp->fastopen_no_cookie = val; break; case TCP_TIMESTAMP: if (!tp->repair) { err = -EPERM; break; } /* val is an opaque field, * and low order bit contains usec_ts enable bit. * Its a best effort, and we do not care if user makes an error. 
*/ tp->tcp_usec_ts = val & 1; WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(tp->tcp_usec_ts)); break; case TCP_REPAIR_WINDOW: err = tcp_repair_set_window(tp, optval, optlen); break; case TCP_NOTSENT_LOWAT: WRITE_ONCE(tp->notsent_lowat, val); READ_ONCE(sk->sk_write_space)(sk); break; case TCP_INQ: if (val > 1 || val < 0) err = -EINVAL; else tp->recvmsg_inq = val; break; case TCP_TX_DELAY: /* tp->srtt_us is u32, and is shifted by 3 */ if (val < 0 || val >= (1U << (31 - 3))) { err = -EINVAL; break; } tcp_enable_tx_delay(sk, val); WRITE_ONCE(tp->tcp_tx_delay, val); break; default: err = -ENOPROTOOPT; break; } sockopt_release_sock(sk); return err; } int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { const struct inet_connection_sock *icsk = inet_csk(sk); if (level != SOL_TCP) /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */ return READ_ONCE(icsk->icsk_af_ops)->setsockopt(sk, level, optname, optval, optlen); return do_tcp_setsockopt(sk, level, optname, optval, optlen); } static void tcp_get_info_chrono_stats(const struct tcp_sock *tp, struct tcp_info *info) { u64 stats[__TCP_CHRONO_MAX], total = 0; enum tcp_chrono i; for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) { stats[i] = tp->chrono_stat[i - 1]; if (i == tp->chrono_type) stats[i] += tcp_jiffies32 - tp->chrono_start; stats[i] *= USEC_PER_SEC / HZ; total += stats[i]; } info->tcpi_busy_time = total; info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED]; info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED]; } /* Return information about state of tcp endpoint in API format. */ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); const u8 ect1_idx = INET_ECN_ECT_1 - 1; const u8 ect0_idx = INET_ECN_ECT_0 - 1; const u8 ce_idx = INET_ECN_CE - 1; unsigned long rate; u32 now; u64 rate64; bool slow; memset(info, 0, sizeof(*info)); if (sk->sk_type != SOCK_STREAM) return; info->tcpi_state = inet_sk_state_load(sk); /* Report meaningful fields for all TCP states, including listeners */ rate = READ_ONCE(sk->sk_pacing_rate); rate64 = (rate != ~0UL) ? rate : ~0ULL; info->tcpi_pacing_rate = rate64; rate = READ_ONCE(sk->sk_max_pacing_rate); rate64 = (rate != ~0UL) ? 
rate : ~0ULL; info->tcpi_max_pacing_rate = rate64; info->tcpi_reordering = tp->reordering; info->tcpi_snd_cwnd = tcp_snd_cwnd(tp); if (info->tcpi_state == TCP_LISTEN) { /* listeners aliased fields : * tcpi_unacked -> Number of children ready for accept() * tcpi_sacked -> max backlog */ info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog); info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog); return; } slow = lock_sock_fast(sk); info->tcpi_ca_state = icsk->icsk_ca_state; info->tcpi_retransmits = icsk->icsk_retransmits; info->tcpi_probes = icsk->icsk_probes_out; info->tcpi_backoff = icsk->icsk_backoff; if (tp->rx_opt.tstamp_ok) info->tcpi_options |= TCPI_OPT_TIMESTAMPS; if (tcp_is_sack(tp)) info->tcpi_options |= TCPI_OPT_SACK; if (tp->rx_opt.wscale_ok) { info->tcpi_options |= TCPI_OPT_WSCALE; info->tcpi_snd_wscale = tp->rx_opt.snd_wscale; info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale; } if (tcp_ecn_mode_any(tp)) info->tcpi_options |= TCPI_OPT_ECN; if (tp->ecn_flags & TCP_ECN_SEEN) info->tcpi_options |= TCPI_OPT_ECN_SEEN; if (tp->syn_data_acked) info->tcpi_options |= TCPI_OPT_SYN_DATA; if (tp->tcp_usec_ts) info->tcpi_options |= TCPI_OPT_USEC_TS; if (tp->syn_fastopen_child) info->tcpi_options |= TCPI_OPT_TFO_CHILD; info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); info->tcpi_ato = jiffies_to_usecs(min_t(u32, icsk->icsk_ack.ato, tcp_delack_max(sk))); info->tcpi_snd_mss = tp->mss_cache; info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; info->tcpi_unacked = tp->packets_out; info->tcpi_sacked = tp->sacked_out; info->tcpi_lost = tp->lost_out; info->tcpi_retrans = tp->retrans_out; now = tcp_jiffies32; info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); info->tcpi_pmtu = icsk->icsk_pmtu_cookie; info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; info->tcpi_rtt = tp->srtt_us >> 3; info->tcpi_rttvar = tp->mdev_us >> 2; info->tcpi_snd_ssthresh = tp->snd_ssthresh; info->tcpi_advmss = tp->advmss; info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3; info->tcpi_rcv_space = tp->rcvq_space.space; info->tcpi_total_retrans = tp->total_retrans; info->tcpi_bytes_acked = tp->bytes_acked; info->tcpi_bytes_received = tp->bytes_received; info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt); tcp_get_info_chrono_stats(tp, info); info->tcpi_segs_out = tp->segs_out; /* segs_in and data_segs_in can be updated from tcp_segs_in() from BH */ info->tcpi_segs_in = READ_ONCE(tp->segs_in); info->tcpi_data_segs_in = READ_ONCE(tp->data_segs_in); info->tcpi_min_rtt = tcp_min_rtt(tp); info->tcpi_data_segs_out = tp->data_segs_out; info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 
1 : 0; rate64 = tcp_compute_delivery_rate(tp); if (rate64) info->tcpi_delivery_rate = rate64; info->tcpi_delivered = tp->delivered; info->tcpi_delivered_ce = tp->delivered_ce; info->tcpi_bytes_sent = tp->bytes_sent; info->tcpi_bytes_retrans = tp->bytes_retrans; info->tcpi_dsack_dups = tp->dsack_dups; info->tcpi_reord_seen = tp->reord_seen; info->tcpi_rcv_ooopack = tp->rcv_ooopack; info->tcpi_snd_wnd = tp->snd_wnd; info->tcpi_rcv_wnd = tp->rcv_wnd; info->tcpi_rehash = tp->plb_rehash + tp->timeout_rehash; info->tcpi_fastopen_client_fail = tp->fastopen_client_fail; info->tcpi_total_rto = tp->total_rto; info->tcpi_total_rto_recoveries = tp->total_rto_recoveries; info->tcpi_total_rto_time = tp->total_rto_time; if (tp->rto_stamp) info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp; if (tcp_ecn_disabled(tp)) info->tcpi_ecn_mode = TCPI_ECN_MODE_DISABLED; else if (tcp_ecn_mode_rfc3168(tp)) info->tcpi_ecn_mode = TCPI_ECN_MODE_RFC3168; else if (tcp_ecn_mode_accecn(tp)) info->tcpi_ecn_mode = TCPI_ECN_MODE_ACCECN; else if (tcp_ecn_mode_pending(tp)) info->tcpi_ecn_mode = TCPI_ECN_MODE_PENDING; info->tcpi_accecn_fail_mode = tp->accecn_fail_mode; info->tcpi_accecn_opt_seen = tp->saw_accecn_opt; info->tcpi_received_ce = tp->received_ce; info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx]; info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx]; info->tcpi_delivered_ce_bytes = tp->delivered_ecn_bytes[ce_idx]; info->tcpi_received_e1_bytes = tp->received_ecn_bytes[ect1_idx]; info->tcpi_received_e0_bytes = tp->received_ecn_bytes[ect0_idx]; info->tcpi_received_ce_bytes = tp->received_ecn_bytes[ce_idx]; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); static size_t tcp_opt_stats_get_size(void) { return nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */ nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */ nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */ nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */ nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */ nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */ nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */ nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */ nla_total_size(sizeof(u32)) + /* TCP_NLA_REHASH */ 0; } /* Returns TTL or hop limit of an incoming packet from skb. 
*/ static u8 tcp_skb_ttl_or_hop_limit(const struct sk_buff *skb) { if (skb->protocol == htons(ETH_P_IP)) return ip_hdr(skb)->ttl; else if (skb->protocol == htons(ETH_P_IPV6)) return ipv6_hdr(skb)->hop_limit; else return 0; } struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, const struct sk_buff *orig_skb, const struct sk_buff *ack_skb) { const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *stats; struct tcp_info info; unsigned long rate; u64 rate64; stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC); if (!stats) return NULL; tcp_get_info_chrono_stats(tp, &info); nla_put_u64_64bit(stats, TCP_NLA_BUSY, info.tcpi_busy_time, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED, info.tcpi_rwnd_limited, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED, info.tcpi_sndbuf_limited, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT, tp->data_segs_out, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS, tp->total_retrans, TCP_NLA_PAD); rate = READ_ONCE(sk->sk_pacing_rate); rate64 = (rate != ~0UL) ? rate : ~0ULL; nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD); rate64 = tcp_compute_delivery_rate(tp); nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD); nla_put_u32(stats, TCP_NLA_SND_CWND, tcp_snd_cwnd(tp)); nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering); nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp)); nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, READ_ONCE(inet_csk(sk)->icsk_retransmits)); nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh); nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered); nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce); nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans, TCP_NLA_PAD); nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups); nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen); nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3); nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash); nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT, max_t(int, 0, tp->write_seq - tp->snd_nxt)); nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns, TCP_NLA_PAD); if (ack_skb) nla_put_u8(stats, TCP_NLA_TTL, tcp_skb_ttl_or_hop_limit(ack_skb)); nla_put_u32(stats, TCP_NLA_REHASH, tp->plb_rehash + tp->timeout_rehash); return stats; } int do_tcp_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); int user_mss; int val, len; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0) return -EINVAL; len = min_t(unsigned int, len, sizeof(int)); switch (optname) { case TCP_MAXSEG: val = tp->mss_cache; user_mss = READ_ONCE(tp->rx_opt.user_mss); if (user_mss && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) val = user_mss; if (tp->repair) val = tp->rx_opt.mss_clamp; break; case TCP_NODELAY: val = !!(tp->nonagle&TCP_NAGLE_OFF); break; case TCP_CORK: val = !!(tp->nonagle&TCP_NAGLE_CORK); break; case TCP_KEEPIDLE: val = keepalive_time_when(tp) / HZ; break; case TCP_KEEPINTVL: val = keepalive_intvl_when(tp) / HZ; break; case TCP_KEEPCNT: val = keepalive_probes(tp); break; case TCP_SYNCNT: val 
= READ_ONCE(icsk->icsk_syn_retries) ? : READ_ONCE(net->ipv4.sysctl_tcp_syn_retries); break; case TCP_LINGER2: val = READ_ONCE(tp->linger2); if (val >= 0) val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ; break; case TCP_DEFER_ACCEPT: val = READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept); val = retrans_to_secs(val, TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ); break; case TCP_WINDOW_CLAMP: val = READ_ONCE(tp->window_clamp); break; case TCP_INFO: { struct tcp_info info; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; tcp_get_info(sk, &info); len = min_t(unsigned int, len, sizeof(info)); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &info, len)) return -EFAULT; return 0; } case TCP_CC_INFO: { const struct tcp_congestion_ops *ca_ops; union tcp_cc_info info; size_t sz = 0; int attr; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; ca_ops = icsk->icsk_ca_ops; if (ca_ops && ca_ops->get_info) sz = ca_ops->get_info(sk, ~0U, &attr, &info); len = min_t(unsigned int, len, sz); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &info, len)) return -EFAULT; return 0; } case TCP_QUICKACK: val = !inet_csk_in_pingpong_mode(sk); break; case TCP_CONGESTION: if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; len = min_t(unsigned int, len, TCP_CA_NAME_MAX); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, icsk->icsk_ca_ops->name, len)) return -EFAULT; return 0; case TCP_ULP: if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; len = min_t(unsigned int, len, TCP_ULP_NAME_MAX); if (!icsk->icsk_ulp_ops) { len = 0; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; return 0; } if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, icsk->icsk_ulp_ops->name, len)) return -EFAULT; return 0; case TCP_FASTOPEN_KEY: { u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)]; unsigned int key_len; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; key_len = tcp_fastopen_get_cipher(net, icsk, key) * TCP_FASTOPEN_KEY_LENGTH; len = min_t(unsigned int, len, key_len); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, key, len)) return -EFAULT; return 0; } case TCP_THIN_LINEAR_TIMEOUTS: val = tp->thin_lto; break; case TCP_THIN_DUPACK: val = 0; break; case TCP_REPAIR: val = tp->repair; break; case TCP_REPAIR_QUEUE: if (tp->repair) val = tp->repair_queue; else return -EINVAL; break; case TCP_REPAIR_WINDOW: { struct tcp_repair_window opt; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len != sizeof(opt)) return -EINVAL; if (!tp->repair) return -EPERM; opt.snd_wl1 = tp->snd_wl1; opt.snd_wnd = tp->snd_wnd; opt.max_window = tp->max_window; opt.rcv_wnd = tp->rcv_wnd; opt.rcv_wup = tp->rcv_wup; if (copy_to_sockptr(optval, &opt, len)) return -EFAULT; return 0; } case TCP_QUEUE_SEQ: if (tp->repair_queue == TCP_SEND_QUEUE) val = tp->write_seq; else if (tp->repair_queue == TCP_RECV_QUEUE) val = tp->rcv_nxt; else return -EINVAL; break; case TCP_USER_TIMEOUT: val = READ_ONCE(icsk->icsk_user_timeout); break; case TCP_FASTOPEN: val = READ_ONCE(icsk->icsk_accept_queue.fastopenq.max_qlen); break; case TCP_FASTOPEN_CONNECT: val = tp->fastopen_connect; break; case TCP_FASTOPEN_NO_COOKIE: val = tp->fastopen_no_cookie; break; case TCP_TX_DELAY: val = READ_ONCE(tp->tcp_tx_delay); break; case TCP_TIMESTAMP: val = 
tcp_clock_ts(tp->tcp_usec_ts) + READ_ONCE(tp->tsoffset); if (tp->tcp_usec_ts) val |= 1; else val &= ~1; break; case TCP_NOTSENT_LOWAT: val = READ_ONCE(tp->notsent_lowat); break; case TCP_INQ: val = tp->recvmsg_inq; break; case TCP_SAVE_SYN: val = tp->save_syn; break; case TCP_SAVED_SYN: { if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; sockopt_lock_sock(sk); if (tp->saved_syn) { if (len < tcp_saved_syn_len(tp->saved_syn)) { len = tcp_saved_syn_len(tp->saved_syn); if (copy_to_sockptr(optlen, &len, sizeof(int))) { sockopt_release_sock(sk); return -EFAULT; } sockopt_release_sock(sk); return -EINVAL; } len = tcp_saved_syn_len(tp->saved_syn); if (copy_to_sockptr(optlen, &len, sizeof(int))) { sockopt_release_sock(sk); return -EFAULT; } if (copy_to_sockptr(optval, tp->saved_syn->data, len)) { sockopt_release_sock(sk); return -EFAULT; } tcp_saved_syn_free(tp); sockopt_release_sock(sk); } else { sockopt_release_sock(sk); len = 0; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; } return 0; } #ifdef CONFIG_MMU case TCP_ZEROCOPY_RECEIVE: { struct scm_timestamping_internal tss; struct tcp_zerocopy_receive zc = {}; int err; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0 || len < offsetofend(struct tcp_zerocopy_receive, length)) return -EINVAL; if (unlikely(len > sizeof(zc))) { err = check_zeroed_sockptr(optval, sizeof(zc), len - sizeof(zc)); if (err < 1) return err == 0 ? -EINVAL : err; len = sizeof(zc); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; } if (copy_from_sockptr(&zc, optval, len)) return -EFAULT; if (zc.reserved) return -EINVAL; if (zc.msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS)) return -EINVAL; sockopt_lock_sock(sk); err = tcp_zerocopy_receive(sk, &zc, &tss); err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, &zc, &len, err); sockopt_release_sock(sk); if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags)) goto zerocopy_rcv_cmsg; switch (len) { case offsetofend(struct tcp_zerocopy_receive, msg_flags): goto zerocopy_rcv_cmsg; case offsetofend(struct tcp_zerocopy_receive, msg_controllen): case offsetofend(struct tcp_zerocopy_receive, msg_control): case offsetofend(struct tcp_zerocopy_receive, flags): case offsetofend(struct tcp_zerocopy_receive, copybuf_len): case offsetofend(struct tcp_zerocopy_receive, copybuf_address): case offsetofend(struct tcp_zerocopy_receive, err): goto zerocopy_rcv_sk_err; case offsetofend(struct tcp_zerocopy_receive, inq): goto zerocopy_rcv_inq; case offsetofend(struct tcp_zerocopy_receive, length): default: goto zerocopy_rcv_out; } zerocopy_rcv_cmsg: if (zc.msg_flags & TCP_CMSG_TS) tcp_zc_finalize_rx_tstamp(sk, &zc, &tss); else zc.msg_flags = 0; zerocopy_rcv_sk_err: if (!err) zc.err = sock_error(sk); zerocopy_rcv_inq: zc.inq = tcp_inq_hint(sk); zerocopy_rcv_out: if (!err && copy_to_sockptr(optval, &zc, len)) err = -EFAULT; return err; } #endif case TCP_AO_REPAIR: if (!tcp_can_repair_sock(sk)) return -EPERM; return tcp_ao_get_repair(sk, optval, optlen); case TCP_AO_GET_KEYS: case TCP_AO_INFO: { int err; sockopt_lock_sock(sk); if (optname == TCP_AO_GET_KEYS) err = tcp_ao_get_mkts(sk, optval, optlen); else err = tcp_ao_get_sock_info(sk, optval, optlen); sockopt_release_sock(sk); return err; } case TCP_IS_MPTCP: val = 0; break; case TCP_RTO_MAX_MS: val = jiffies_to_msecs(tcp_rto_max(sk)); break; case TCP_RTO_MIN_US: val = jiffies_to_usecs(READ_ONCE(inet_csk(sk)->icsk_rto_min)); break; case TCP_DELACK_MAX_US: val = jiffies_to_usecs(READ_ONCE(inet_csk(sk)->icsk_delack_max)); 
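/* Illustrative userspace sketch (not part of this kernel file): how the
 * read-only options handled above are typically consumed. Assumes a connected
 * TCP socket "fd", the usual <netinet/tcp.h> / <sys/socket.h> includes and,
 * for TCP_RTO_MAX_MS, a uapi linux/tcp.h recent enough to define it; errors
 * are not checked.
 *
 *	struct tcp_info ti;
 *	socklen_t ti_len = sizeof(ti);
 *	int rto_max_ms;
 *	socklen_t len = sizeof(rto_max_ms);
 *
 *	getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &ti_len);
 *	// ti.tcpi_rtt is the smoothed RTT in usec, ti.tcpi_total_retrans the
 *	// lifetime retransmit count.
 *	getsockopt(fd, IPPROTO_TCP, TCP_RTO_MAX_MS, &rto_max_ms, &len);
 */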
break; default: return -ENOPROTOOPT; } if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &val, len)) return -EFAULT; return 0; } bool tcp_bpf_bypass_getsockopt(int level, int optname) { /* TCP do_tcp_getsockopt has optimized getsockopt implementation * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE. */ if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE) return true; return false; } int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct inet_connection_sock *icsk = inet_csk(sk); if (level != SOL_TCP) /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */ return READ_ONCE(icsk->icsk_af_ops)->getsockopt(sk, level, optname, optval, optlen); return do_tcp_getsockopt(sk, level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); } #ifdef CONFIG_TCP_MD5SIG void tcp_md5_hash_skb_data(struct md5_ctx *ctx, const struct sk_buff *skb, unsigned int header_len) { const unsigned int head_data_len = skb_headlen(skb) > header_len ? skb_headlen(skb) - header_len : 0; const struct skb_shared_info *shi = skb_shinfo(skb); struct sk_buff *frag_iter; unsigned int i; md5_update(ctx, (const u8 *)tcp_hdr(skb) + header_len, head_data_len); for (i = 0; i < shi->nr_frags; ++i) { const skb_frag_t *f = &shi->frags[i]; u32 p_off, p_len, copied; const void *vaddr; struct page *p; skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f), p, p_off, p_len, copied) { vaddr = kmap_local_page(p); md5_update(ctx, vaddr + p_off, p_len); kunmap_local(vaddr); } } skb_walk_frags(skb, frag_iter) tcp_md5_hash_skb_data(ctx, frag_iter, 0); } void tcp_md5_hash_key(struct md5_ctx *ctx, const struct tcp_md5sig_key *key) { u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */ /* We use data_race() because tcp_md5_do_add() might change * key->key under us */ data_race(({ md5_update(ctx, key->key, keylen), 0; })); } /* Called with rcu_read_lock() */ static enum skb_drop_reason tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, const void *saddr, const void *daddr, int family, int l3index, const __u8 *hash_location) { /* This gets called for each TCP segment that has TCP-MD5 option. * We have 2 drop cases: * o An MD5 signature is present, but we're not expecting one. * o The MD5 signature is wrong. */ const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; u8 newhash[16]; key = tcp_md5_do_lookup(sk, l3index, saddr, family); if (!key) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); trace_tcp_hash_md5_unexpected(sk, skb); return SKB_DROP_REASON_TCP_MD5UNEXPECTED; } /* Check the signature. * To support dual stack listeners, we need to handle * IPv4-mapped case. 
*/ if (family == AF_INET) tcp_v4_md5_hash_skb(newhash, key, NULL, skb); else tp->af_specific->calc_md5_hash(newhash, key, NULL, skb); if (crypto_memneq(hash_location, newhash, 16)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); trace_tcp_hash_md5_mismatch(sk, skb); return SKB_DROP_REASON_TCP_MD5FAILURE; } return SKB_NOT_DROPPED_YET; } #else static inline enum skb_drop_reason tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, const void *saddr, const void *daddr, int family, int l3index, const __u8 *hash_location) { return SKB_NOT_DROPPED_YET; } #endif #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) /* * Parse Signature options */ int tcp_do_parse_auth_options(const struct tcphdr *th, const u8 **md5_hash, const u8 **ao_hash) { int length = (th->doff << 2) - sizeof(*th); const u8 *ptr = (const u8 *)(th + 1); unsigned int minlen = TCPOLEN_MD5SIG; if (IS_ENABLED(CONFIG_TCP_AO)) minlen = sizeof(struct tcp_ao_hdr) + 1; *md5_hash = NULL; *ao_hash = NULL; /* If not enough data remaining, we can short cut */ while (length >= minlen) { int opcode = *ptr++; int opsize; switch (opcode) { case TCPOPT_EOL: return 0; case TCPOPT_NOP: length--; continue; default: opsize = *ptr++; if (opsize < 2 || opsize > length) return -EINVAL; if (opcode == TCPOPT_MD5SIG) { if (opsize != TCPOLEN_MD5SIG) return -EINVAL; if (unlikely(*md5_hash || *ao_hash)) return -EEXIST; *md5_hash = ptr; } else if (opcode == TCPOPT_AO) { if (opsize <= sizeof(struct tcp_ao_hdr)) return -EINVAL; if (unlikely(*md5_hash || *ao_hash)) return -EEXIST; *ao_hash = ptr; } } ptr += opsize - 2; length -= opsize; } return 0; } #endif /* Called with rcu_read_lock() */ enum skb_drop_reason tcp_inbound_hash(struct sock *sk, const struct request_sock *req, const struct sk_buff *skb, const void *saddr, const void *daddr, int family, int dif, int sdif) { const struct tcphdr *th = tcp_hdr(skb); const struct tcp_ao_hdr *aoh; const __u8 *md5_location; int l3index; /* Invalid option or two times meet any of auth options */ if (tcp_parse_auth_options(th, &md5_location, &aoh)) { trace_tcp_hash_bad_header(sk, skb); return SKB_DROP_REASON_TCP_AUTH_HDR; } if (req) { if (tcp_rsk_used_ao(req) != !!aoh) { u8 keyid, rnext, maclen; if (aoh) { keyid = aoh->keyid; rnext = aoh->rnext_keyid; maclen = tcp_ao_hdr_maclen(aoh); } else { keyid = rnext = maclen = 0; } NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD); trace_tcp_ao_handshake_failure(sk, skb, keyid, rnext, maclen); return SKB_DROP_REASON_TCP_AOFAILURE; } } /* sdif set, means packet ingressed via a device * in an L3 domain and dif is set to the l3mdev */ l3index = sdif ? dif : 0; /* Fast path: unsigned segments */ if (likely(!md5_location && !aoh)) { /* Drop if there's TCP-MD5 or TCP-AO key with any rcvid/sndid * for the remote peer. On TCP-AO established connection * the last key is impossible to remove, so there's * always at least one current_key. 
*/ if (tcp_ao_required(sk, saddr, family, l3index, true)) { trace_tcp_hash_ao_required(sk, skb); return SKB_DROP_REASON_TCP_AONOTFOUND; } if (unlikely(tcp_md5_do_lookup(sk, l3index, saddr, family))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); trace_tcp_hash_md5_required(sk, skb); return SKB_DROP_REASON_TCP_MD5NOTFOUND; } return SKB_NOT_DROPPED_YET; } if (aoh) return tcp_inbound_ao_hash(sk, skb, family, req, l3index, aoh); return tcp_inbound_md5_hash(sk, skb, saddr, daddr, family, l3index, md5_location); } void tcp_done(struct sock *sk) { struct request_sock *req; /* We might be called with a new socket, after * inet_csk_prepare_forced_close() has been called * so we can not use lockdep_sock_is_held(sk) */ req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1); if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); tcp_set_state(sk, TCP_CLOSE); tcp_clear_xmit_timers(sk); if (req) reqsk_fastopen_remove(sk, req, false); WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); else inet_csk_destroy_sock(sk); } EXPORT_SYMBOL_GPL(tcp_done); int tcp_abort(struct sock *sk, int err) { int state = inet_sk_state_load(sk); if (state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); local_bh_disable(); inet_csk_reqsk_queue_drop(req->rsk_listener, req); local_bh_enable(); return 0; } if (state == TCP_TIME_WAIT) { struct inet_timewait_sock *tw = inet_twsk(sk); refcount_inc(&tw->tw_refcnt); local_bh_disable(); inet_twsk_deschedule_put(tw); local_bh_enable(); return 0; } /* BPF context ensures sock locking. */ if (!has_current_bpf_ctx()) /* Don't race with userspace socket closes such as tcp_close. */ lock_sock(sk); /* Avoid closing the same socket twice. */ if (sk->sk_state == TCP_CLOSE) { if (!has_current_bpf_ctx()) release_sock(sk); return -ENOENT; } if (sk->sk_state == TCP_LISTEN) { tcp_set_state(sk, TCP_CLOSE); inet_csk_listen_stop(sk); } /* Don't race with BH socket closes such as inet_csk_listen_stop. 
*/ local_bh_disable(); bh_lock_sock(sk); if (tcp_need_reset(sk->sk_state)) tcp_send_active_reset(sk, GFP_ATOMIC, SK_RST_REASON_TCP_STATE); tcp_done_with_error(sk, err); bh_unlock_sock(sk); local_bh_enable(); if (!has_current_bpf_ctx()) release_sock(sk); return 0; } EXPORT_SYMBOL_GPL(tcp_abort); extern struct tcp_congestion_ops tcp_reno; static __initdata unsigned long thash_entries; static int __init set_thash_entries(char *str) { ssize_t ret; if (!str) return 0; ret = kstrtoul(str, 0, &thash_entries); if (ret) return 0; return 1; } __setup("thash_entries=", set_thash_entries); static void __init tcp_init_mem(void) { unsigned long limit = nr_free_buffer_pages() / 16; limit = max(limit, 128UL); sysctl_tcp_mem[0] = limit / 4 * 3; /* 4.68 % */ sysctl_tcp_mem[1] = limit; /* 6.25 % */ sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */ } static void __init tcp_struct_check(void) { /* TX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, max_window); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, rcv_ssthresh); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, reordering); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint); #if IS_ENABLED(CONFIG_TLS_DEVICE) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, tcp_clean_acked); #endif /* TXRX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, tsoffset); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_wnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, mss_cache); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_cwnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, prr_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, lost_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, sacked_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, scaling_ratio); /* RX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, copied_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_wl1); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tlp_high_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rttvar_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, retrans_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, advmss); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, urg_data); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, lost); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rtt_min); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, out_of_order_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_ssthresh); /* TX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, segs_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, data_segs_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, bytes_sent); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, snd_sml); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_start); 
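/* Each CACHELINE_ASSERT_GROUP_MEMBER() in this function is a compile-time
 * check that the named tcp_sock field still sits inside the cache-line group
 * it was assigned to (tcp_sock_write_tx here), so that moving a field out of
 * its group in the struct definition fails the build instead of silently
 * regressing the hot-path cache footprint. */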
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_stat); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, write_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, pushed_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, accecn_opt_tstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags); /* TXRX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, tcp_clock_cache); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, tcp_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_nxt); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_nxt); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_una); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, window_clamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, srtt_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, packets_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_mwnd_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_tstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); /* RX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, segs_in); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, data_segs_in); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_wup); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, max_packets_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, cwnd_usage_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, pkts_acked_ewma); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, 
tcp_sock_write_rx, delivered_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space); } void __init tcp_init(void) { int max_rshare, max_wshare, cnt; unsigned long limit; unsigned int i; BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE); BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof_field(struct sk_buff, cb)); tcp_struct_check(); percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE); mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD); inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash", thash_entries, 21, /* one slot per 2 MB*/ 0, 64 * 1024); tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); tcp_hashinfo.bind2_bucket_cachep = kmem_cache_create("tcp_bind2_bucket", sizeof(struct inet_bind2_bucket), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); /* Size and allocate the main established and bind bucket * hash tables. * * The methodology is similar to that of the buffer cache. */ tcp_hashinfo.ehash = alloc_large_system_hash("TCP established", sizeof(struct inet_ehash_bucket), thash_entries, 17, /* one slot per 128 KB of memory */ 0, NULL, &tcp_hashinfo.ehash_mask, 0, thash_entries ? 0 : 512 * 1024); for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); if (inet_ehash_locks_alloc(&tcp_hashinfo)) panic("TCP: failed to alloc ehash_locks"); tcp_hashinfo.bhash = alloc_large_system_hash("TCP bind", 2 * sizeof(struct inet_bind_hashbucket), tcp_hashinfo.ehash_mask + 1, 17, /* one slot per 128 KB of memory */ 0, &tcp_hashinfo.bhash_size, NULL, 0, 64 * 1024); tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size; tcp_hashinfo.bhash2 = tcp_hashinfo.bhash + tcp_hashinfo.bhash_size; for (i = 0; i < tcp_hashinfo.bhash_size; i++) { spin_lock_init(&tcp_hashinfo.bhash[i].lock); INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); spin_lock_init(&tcp_hashinfo.bhash2[i].lock); INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain); } tcp_hashinfo.pernet = false; cnt = tcp_hashinfo.ehash_mask + 1; sysctl_tcp_max_orphans = cnt / 2; tcp_init_mem(); /* Set per-socket limits to no more than 1/128 the pressure threshold */ limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); max_wshare = min(4UL*1024*1024, limit); max_rshare = min(32UL*1024*1024, limit); init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE; init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare); init_net.ipv4.sysctl_tcp_rmem[0] = PAGE_SIZE; init_net.ipv4.sysctl_tcp_rmem[1] = 131072; init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare); pr_info("Hash tables configured (established %u bind %u)\n", tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); tcp_v4_init(); tcp_metrics_init(); BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); tcp_tsq_work_init(); mptcp_init(); } |
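/* Illustrative userspace sketch (not part of this kernel file): a minimal
 * program exercising two of the socket options implemented in this file,
 * TCP_CORK (__tcp_sock_set_cork) and TCP_NODELAY (__tcp_sock_set_nodelay),
 * the way an application typically would. The address 192.0.2.1:8080 is a
 * placeholder and error handling is minimal; it should build on its own if
 * extracted from this comment.
 *
 *	#include <unistd.h>
 *	#include <arpa/inet.h>
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <sys/socket.h>
 *
 *	int main(void)
 *	{
 *		struct sockaddr_in a = { .sin_family = AF_INET,
 *					 .sin_port = htons(8080) };
 *		int one = 1, zero = 0;
 *		int fd = socket(AF_INET, SOCK_STREAM, 0);
 *
 *		inet_pton(AF_INET, "192.0.2.1", &a.sin_addr);
 *		if (fd < 0 || connect(fd, (struct sockaddr *)&a, sizeof(a)) < 0)
 *			return 1;
 *
 *		// Cork the socket, write header and body as separate calls,
 *		// then uncork so the pending partial frame is pushed out.
 *		setsockopt(fd, IPPROTO_TCP, TCP_CORK, &one, sizeof(one));
 *		write(fd, "HDR", 3);
 *		write(fd, "BODY", 4);
 *		setsockopt(fd, IPPROTO_TCP, TCP_CORK, &zero, sizeof(zero));
 *
 *		// Disable Nagle for the latency-sensitive part of the exchange.
 *		setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
 *		write(fd, "PING", 4);
 *
 *		close(fd);
 *		return 0;
 *	}
 */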
// SPDX-License-Identifier: GPL-2.0+
/*
 * Base port operations for 8250/16550-type serial ports
 *
 * Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o.
 * Split from 8250_core.c, Copyright (C) 2001 Russell King.
 *
 * A note about mapbase / membase
 *
 * mapbase is the physical address of the IO port.
 * membase is an 'ioremapped' cookie.
 */

#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/ioport.h>
#include <linux/init.h>
#include <linux/irq.h>
#include <linux/console.h>
#include <linux/gpio/consumer.h>
#include <linux/lockdep.h>
#include <linux/sysrq.h>
#include <linux/delay.h>
#include <linux/platform_device.h>
#include <linux/tty.h>
#include <linux/ratelimit.h>
#include <linux/tty_flip.h>
#include <linux/serial.h>
#include <linux/serial_8250.h>
#include <linux/nmi.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/pm_runtime.h>
#include <linux/ktime.h>

#include <asm/io.h>
#include <asm/irq.h>

#include "8250.h"

/*
 * Here we define the default xmit fifo size used for each type of UART.
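 * For example, the plain 16550A below is listed with a 16-byte FIFO and a
 * 16-byte tx load size, while the 16C950/954 can hold and load 128 bytes
 * per THRE interrupt.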
*/ static const struct serial8250_config uart_config[] = { [PORT_UNKNOWN] = { .name = "unknown", .fifo_size = 1, .tx_loadsz = 1, }, [PORT_8250] = { .name = "8250", .fifo_size = 1, .tx_loadsz = 1, }, [PORT_16450] = { .name = "16450", .fifo_size = 1, .tx_loadsz = 1, }, [PORT_16550] = { .name = "16550", .fifo_size = 1, .tx_loadsz = 1, }, [PORT_16550A] = { .name = "16550A", .fifo_size = 16, .tx_loadsz = 16, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .rxtrig_bytes = {1, 4, 8, 14}, .flags = UART_CAP_FIFO, }, [PORT_CIRRUS] = { .name = "Cirrus", .fifo_size = 1, .tx_loadsz = 1, }, [PORT_16650] = { .name = "ST16650", .fifo_size = 1, .tx_loadsz = 1, .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, }, [PORT_16650V2] = { .name = "ST16650V2", .fifo_size = 32, .tx_loadsz = 16, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 | UART_FCR_T_TRIG_00, .rxtrig_bytes = {8, 16, 24, 28}, .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, }, [PORT_16750] = { .name = "TI16750", .fifo_size = 64, .tx_loadsz = 64, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 | UART_FCR7_64BYTE, .rxtrig_bytes = {1, 16, 32, 56}, .flags = UART_CAP_FIFO | UART_CAP_SLEEP | UART_CAP_AFE, }, [PORT_STARTECH] = { .name = "Startech", .fifo_size = 1, .tx_loadsz = 1, }, [PORT_16C950] = { .name = "16C950/954", .fifo_size = 128, .tx_loadsz = 128, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01, .rxtrig_bytes = {16, 32, 112, 120}, /* UART_CAP_EFR breaks billionon CF bluetooth card. */ .flags = UART_CAP_FIFO | UART_CAP_SLEEP, }, [PORT_16654] = { .name = "ST16654", .fifo_size = 64, .tx_loadsz = 32, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 | UART_FCR_T_TRIG_10, .rxtrig_bytes = {8, 16, 56, 60}, .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, }, [PORT_16850] = { .name = "XR16850", .fifo_size = 128, .tx_loadsz = 128, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, }, [PORT_RSA] = { .name = "RSA", .fifo_size = 2048, .tx_loadsz = 2048, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_11, .flags = UART_CAP_FIFO, }, [PORT_NS16550A] = { .name = "NS16550A", .fifo_size = 16, .tx_loadsz = 16, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .flags = UART_CAP_FIFO | UART_NATSEMI, }, [PORT_XSCALE] = { .name = "XScale", .fifo_size = 32, .tx_loadsz = 32, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .flags = UART_CAP_FIFO | UART_CAP_UUE | UART_CAP_RTOIE, }, [PORT_OCTEON] = { .name = "OCTEON", .fifo_size = 64, .tx_loadsz = 64, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .flags = UART_CAP_FIFO, }, [PORT_U6_16550A] = { .name = "U6_16550A", .fifo_size = 64, .tx_loadsz = 64, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .flags = UART_CAP_FIFO | UART_CAP_AFE, }, [PORT_TEGRA] = { .name = "Tegra", .fifo_size = 32, .tx_loadsz = 8, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 | UART_FCR_T_TRIG_01, .rxtrig_bytes = {1, 4, 8, 14}, .flags = UART_CAP_FIFO | UART_CAP_RTOIE, }, [PORT_XR17D15X] = { .name = "XR17D15X", .fifo_size = 64, .tx_loadsz = 64, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .flags = UART_CAP_FIFO | UART_CAP_AFE | UART_CAP_EFR | UART_CAP_SLEEP, }, [PORT_XR17V35X] = { .name = "XR17V35X", .fifo_size = 256, .tx_loadsz = 256, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_11 | UART_FCR_T_TRIG_11, .flags = UART_CAP_FIFO | UART_CAP_AFE | UART_CAP_EFR | UART_CAP_SLEEP, }, [PORT_LPC3220] = { .name = "LPC3220", .fifo_size = 64, .tx_loadsz = 32, .fcr = UART_FCR_DMA_SELECT | UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_00 | UART_FCR_T_TRIG_00, .flags = 
UART_CAP_FIFO, }, [PORT_BRCM_TRUMANAGE] = { .name = "TruManage", .fifo_size = 1, .tx_loadsz = 1024, .flags = UART_CAP_HFIFO, }, [PORT_8250_CIR] = { .name = "CIR port" }, [PORT_ALTR_16550_F32] = { .name = "Altera 16550 FIFO32", .fifo_size = 32, .tx_loadsz = 32, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .rxtrig_bytes = {1, 8, 16, 30}, .flags = UART_CAP_FIFO | UART_CAP_AFE, }, [PORT_ALTR_16550_F64] = { .name = "Altera 16550 FIFO64", .fifo_size = 64, .tx_loadsz = 64, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .rxtrig_bytes = {1, 16, 32, 62}, .flags = UART_CAP_FIFO | UART_CAP_AFE, }, [PORT_ALTR_16550_F128] = { .name = "Altera 16550 FIFO128", .fifo_size = 128, .tx_loadsz = 128, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .rxtrig_bytes = {1, 32, 64, 126}, .flags = UART_CAP_FIFO | UART_CAP_AFE, }, /* * tx_loadsz is set to 63-bytes instead of 64-bytes to implement * workaround of errata A-008006 which states that tx_loadsz should * be configured less than Maximum supported fifo bytes. */ [PORT_16550A_FSL64] = { .name = "16550A_FSL64", .fifo_size = 64, .tx_loadsz = 63, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 | UART_FCR7_64BYTE, .flags = UART_CAP_FIFO | UART_CAP_NOTEMT, }, [PORT_RT2880] = { .name = "Palmchip BK-3103", .fifo_size = 16, .tx_loadsz = 16, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .rxtrig_bytes = {1, 4, 8, 14}, .flags = UART_CAP_FIFO, }, [PORT_DA830] = { .name = "TI DA8xx/66AK2x", .fifo_size = 16, .tx_loadsz = 16, .fcr = UART_FCR_DMA_SELECT | UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .rxtrig_bytes = {1, 4, 8, 14}, .flags = UART_CAP_FIFO | UART_CAP_AFE, }, [PORT_MTK_BTIF] = { .name = "MediaTek BTIF", .fifo_size = 16, .tx_loadsz = 16, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT, .flags = UART_CAP_FIFO, }, [PORT_NPCM] = { .name = "Nuvoton 16550", .fifo_size = 16, .tx_loadsz = 16, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 | UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT, .rxtrig_bytes = {1, 4, 8, 14}, .flags = UART_CAP_FIFO, }, [PORT_SUNIX] = { .name = "Sunix", .fifo_size = 128, .tx_loadsz = 128, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .rxtrig_bytes = {1, 32, 64, 112}, .flags = UART_CAP_FIFO | UART_CAP_SLEEP, }, [PORT_ASPEED_VUART] = { .name = "ASPEED VUART", .fifo_size = 16, .tx_loadsz = 16, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_00, .rxtrig_bytes = {1, 4, 8, 14}, .flags = UART_CAP_FIFO, }, [PORT_MCHP16550A] = { .name = "MCHP16550A", .fifo_size = 256, .tx_loadsz = 256, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01, .rxtrig_bytes = {2, 66, 130, 194}, .flags = UART_CAP_FIFO, }, [PORT_BCM7271] = { .name = "Broadcom BCM7271 UART", .fifo_size = 32, .tx_loadsz = 32, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01, .rxtrig_bytes = {1, 8, 16, 30}, .flags = UART_CAP_FIFO | UART_CAP_AFE, }, }; /* Uart divisor latch read */ static u32 default_serial_dl_read(struct uart_8250_port *up) { /* Assign these in pieces to truncate any bits above 7. 
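 * The divisor latch is a 16-bit value split across DLL (low byte) and
 * DLM (high byte); storing each read into an unsigned char keeps only
 * bits 7..0 of the register value.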
*/ unsigned char dll = serial_in(up, UART_DLL); unsigned char dlm = serial_in(up, UART_DLM); return dll | dlm << 8; } /* Uart divisor latch write */ static void default_serial_dl_write(struct uart_8250_port *up, u32 value) { serial_out(up, UART_DLL, value & 0xff); serial_out(up, UART_DLM, value >> 8 & 0xff); } #ifdef CONFIG_HAS_IOPORT static u32 hub6_serial_in(struct uart_port *p, unsigned int offset) { offset = offset << p->regshift; outb(p->hub6 - 1 + offset, p->iobase); return inb(p->iobase + 1); } static void hub6_serial_out(struct uart_port *p, unsigned int offset, u32 value) { offset = offset << p->regshift; outb(p->hub6 - 1 + offset, p->iobase); outb(value, p->iobase + 1); } #endif /* CONFIG_HAS_IOPORT */ static u32 mem_serial_in(struct uart_port *p, unsigned int offset) { offset = offset << p->regshift; return readb(p->membase + offset); } static void mem_serial_out(struct uart_port *p, unsigned int offset, u32 value) { offset = offset << p->regshift; writeb(value, p->membase + offset); } static void mem16_serial_out(struct uart_port *p, unsigned int offset, u32 value) { offset = offset << p->regshift; writew(value, p->membase + offset); } static u32 mem16_serial_in(struct uart_port *p, unsigned int offset) { offset = offset << p->regshift; return readw(p->membase + offset); } static void mem32_serial_out(struct uart_port *p, unsigned int offset, u32 value) { offset = offset << p->regshift; writel(value, p->membase + offset); } static u32 mem32_serial_in(struct uart_port *p, unsigned int offset) { offset = offset << p->regshift; return readl(p->membase + offset); } static void mem32be_serial_out(struct uart_port *p, unsigned int offset, u32 value) { offset = offset << p->regshift; iowrite32be(value, p->membase + offset); } static u32 mem32be_serial_in(struct uart_port *p, unsigned int offset) { offset = offset << p->regshift; return ioread32be(p->membase + offset); } #ifdef CONFIG_HAS_IOPORT static u32 io_serial_in(struct uart_port *p, unsigned int offset) { offset = offset << p->regshift; return inb(p->iobase + offset); } static void io_serial_out(struct uart_port *p, unsigned int offset, u32 value) { offset = offset << p->regshift; outb(value, p->iobase + offset); } #endif static u32 no_serial_in(struct uart_port *p, unsigned int offset) { return ~0U; } static void no_serial_out(struct uart_port *p, unsigned int offset, u32 value) { } static int serial8250_default_handle_irq(struct uart_port *port); static void set_io_from_upio(struct uart_port *p) { struct uart_8250_port *up = up_to_u8250p(p); up->dl_read = default_serial_dl_read; up->dl_write = default_serial_dl_write; switch (p->iotype) { #ifdef CONFIG_HAS_IOPORT case UPIO_HUB6: p->serial_in = hub6_serial_in; p->serial_out = hub6_serial_out; break; #endif case UPIO_MEM: p->serial_in = mem_serial_in; p->serial_out = mem_serial_out; break; case UPIO_MEM16: p->serial_in = mem16_serial_in; p->serial_out = mem16_serial_out; break; case UPIO_MEM32: p->serial_in = mem32_serial_in; p->serial_out = mem32_serial_out; break; case UPIO_MEM32BE: p->serial_in = mem32be_serial_in; p->serial_out = mem32be_serial_out; break; #ifdef CONFIG_HAS_IOPORT case UPIO_PORT: p->serial_in = io_serial_in; p->serial_out = io_serial_out; break; #endif default: WARN(p->iotype != UPIO_PORT || p->iobase, "Unsupported UART type %x\n", p->iotype); p->serial_in = no_serial_in; p->serial_out = no_serial_out; } /* Remember loaded iotype */ up->cur_iotype = p->iotype; p->handle_irq = serial8250_default_handle_irq; } static void serial_port_out_sync(struct uart_port 
*p, int offset, int value) { switch (p->iotype) { case UPIO_MEM: case UPIO_MEM16: case UPIO_MEM32: case UPIO_MEM32BE: case UPIO_AU: p->serial_out(p, offset, value); p->serial_in(p, UART_LCR); /* safe, no side-effects */ break; default: p->serial_out(p, offset, value); } } /* * FIFO support. */ void serial8250_clear_fifos(struct uart_8250_port *p) { if (p->capabilities & UART_CAP_FIFO) { serial_out(p, UART_FCR, UART_FCR_ENABLE_FIFO); serial_out(p, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT); serial_out(p, UART_FCR, 0); } } EXPORT_SYMBOL_NS_GPL(serial8250_clear_fifos, "SERIAL_8250"); static enum hrtimer_restart serial8250_em485_handle_start_tx(struct hrtimer *t); static enum hrtimer_restart serial8250_em485_handle_stop_tx(struct hrtimer *t); void serial8250_clear_and_reinit_fifos(struct uart_8250_port *p) { serial8250_clear_fifos(p); serial_out(p, UART_FCR, p->fcr); } EXPORT_SYMBOL_GPL(serial8250_clear_and_reinit_fifos); void serial8250_rpm_get(struct uart_8250_port *p) { if (!(p->capabilities & UART_CAP_RPM)) return; pm_runtime_get_sync(p->port.dev); } EXPORT_SYMBOL_GPL(serial8250_rpm_get); void serial8250_rpm_put(struct uart_8250_port *p) { if (!(p->capabilities & UART_CAP_RPM)) return; pm_runtime_mark_last_busy(p->port.dev); pm_runtime_put_autosuspend(p->port.dev); } EXPORT_SYMBOL_GPL(serial8250_rpm_put); /** * serial8250_em485_init() - put uart_8250_port into rs485 emulating * @p: uart_8250_port port instance * * The function is used to start rs485 software emulating on the * &struct uart_8250_port* @p. Namely, RTS is switched before/after * transmission. The function is idempotent, so it is safe to call it * multiple times. * * The caller MUST enable interrupt on empty shift register before * calling serial8250_em485_init(). This interrupt is not a part of * 8250 standard, but implementation defined. * * The function is supposed to be called from .rs485_config callback * or from any other callback protected with p->port.lock spinlock. * * See also serial8250_em485_destroy() * * Return 0 - success, -errno - otherwise */ static int serial8250_em485_init(struct uart_8250_port *p) { /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&p->port.lock); if (p->em485) goto deassert_rts; p->em485 = kmalloc_obj(struct uart_8250_em485, GFP_ATOMIC); if (!p->em485) return -ENOMEM; hrtimer_setup(&p->em485->stop_tx_timer, &serial8250_em485_handle_stop_tx, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer_setup(&p->em485->start_tx_timer, &serial8250_em485_handle_start_tx, CLOCK_MONOTONIC, HRTIMER_MODE_REL); p->em485->port = p; p->em485->active_timer = NULL; p->em485->tx_stopped = true; deassert_rts: if (p->em485->tx_stopped) p->rs485_stop_tx(p, true); return 0; } /** * serial8250_em485_destroy() - put uart_8250_port into normal state * @p: uart_8250_port port instance * * The function is used to stop rs485 software emulating on the * &struct uart_8250_port* @p. The function is idempotent, so it is safe to * call it multiple times. * * The function is supposed to be called from .rs485_config callback * or from any other callback protected with p->port.lock spinlock. 
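 *
 * The start_tx and stop_tx hrtimers used for RTS toggling are cancelled
 * before the emulation state is freed.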
* * See also serial8250_em485_init() */ void serial8250_em485_destroy(struct uart_8250_port *p) { if (!p->em485) return; hrtimer_cancel(&p->em485->start_tx_timer); hrtimer_cancel(&p->em485->stop_tx_timer); kfree(p->em485); p->em485 = NULL; } EXPORT_SYMBOL_GPL(serial8250_em485_destroy); struct serial_rs485 serial8250_em485_supported = { .flags = SER_RS485_ENABLED | SER_RS485_RTS_ON_SEND | SER_RS485_RTS_AFTER_SEND | SER_RS485_TERMINATE_BUS | SER_RS485_RX_DURING_TX, .delay_rts_before_send = 1, .delay_rts_after_send = 1, }; EXPORT_SYMBOL_GPL(serial8250_em485_supported); /** * serial8250_em485_config() - generic ->rs485_config() callback * @port: uart port * @termios: termios structure * @rs485: rs485 settings * * Generic callback usable by 8250 uart drivers to activate rs485 settings * if the uart is incapable of driving RTS as a Transmit Enable signal in * hardware, relying on software emulation instead. */ int serial8250_em485_config(struct uart_port *port, struct ktermios *termios, struct serial_rs485 *rs485) { struct uart_8250_port *up = up_to_u8250p(port); /* * Both serial8250_em485_init() and serial8250_em485_destroy() * are idempotent. */ if (rs485->flags & SER_RS485_ENABLED) return serial8250_em485_init(up); serial8250_em485_destroy(up); return 0; } EXPORT_SYMBOL_GPL(serial8250_em485_config); /* * These two wrappers ensure that enable_runtime_pm_tx() can be called more than * once and disable_runtime_pm_tx() will still disable RPM because the fifo is * empty and the HW can idle again. */ static void serial8250_rpm_get_tx(struct uart_8250_port *p) { unsigned char rpm_active; if (!(p->capabilities & UART_CAP_RPM)) return; rpm_active = xchg(&p->rpm_tx_active, 1); if (rpm_active) return; pm_runtime_get_sync(p->port.dev); } static void serial8250_rpm_put_tx(struct uart_8250_port *p) { unsigned char rpm_active; if (!(p->capabilities & UART_CAP_RPM)) return; rpm_active = xchg(&p->rpm_tx_active, 0); if (!rpm_active) return; pm_runtime_mark_last_busy(p->port.dev); pm_runtime_put_autosuspend(p->port.dev); } /* * IER sleep support. UARTs which have EFRs need the "extended * capability" bit enabled. Note that on XR16C850s, we need to * reset LCR to write to IER. */ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep) { unsigned char lcr = 0, efr = 0; guard(serial8250_rpm)(p); if (!(p->capabilities & UART_CAP_SLEEP)) return; /* Synchronize UART_IER access against the console. */ guard(uart_port_lock_irq)(&p->port); if (p->capabilities & UART_CAP_EFR) { lcr = serial_in(p, UART_LCR); efr = serial_in(p, UART_EFR); serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); serial_out(p, UART_EFR, UART_EFR_ECB); serial_out(p, UART_LCR, 0); } serial_out(p, UART_IER, sleep ? UART_IERX_SLEEP : 0); if (p->capabilities & UART_CAP_EFR) { serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); serial_out(p, UART_EFR, efr); serial_out(p, UART_LCR, lcr); } } /* Clear the interrupt registers. */ static void serial8250_clear_interrupts(struct uart_port *port) { serial_port_in(port, UART_LSR); serial_port_in(port, UART_RX); serial_port_in(port, UART_IIR); serial_port_in(port, UART_MSR); } static void serial8250_clear_IER(struct uart_8250_port *up) { if (up->capabilities & UART_CAP_UUE) serial_out(up, UART_IER, UART_IER_UUE); else serial_out(up, UART_IER, 0); } /* * This is a quickie test to see how big the FIFO is. * It doesn't work at all the time, more's the pity. 
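 * The port is put into loopback mode with a divisor of 1, 256 bytes are
 * written to the TX FIFO, and the number of bytes that can be read back
 * bounds the FIFO size.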
*/ static int size_fifo(struct uart_8250_port *up) { unsigned char old_fcr, old_mcr, old_lcr; u32 old_dl; int count; old_lcr = serial_in(up, UART_LCR); serial_out(up, UART_LCR, 0); old_fcr = serial_in(up, UART_FCR); old_mcr = serial8250_in_MCR(up); serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT); serial8250_out_MCR(up, UART_MCR_LOOP); serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); old_dl = serial_dl_read(up); serial_dl_write(up, 0x0001); serial_out(up, UART_LCR, UART_LCR_WLEN8); for (count = 0; count < 256; count++) serial_out(up, UART_TX, count); mdelay(20);/* FIXME - schedule_timeout */ for (count = 0; (serial_in(up, UART_LSR) & UART_LSR_DR) && (count < 256); count++) serial_in(up, UART_RX); serial_out(up, UART_FCR, old_fcr); serial8250_out_MCR(up, old_mcr); serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); serial_dl_write(up, old_dl); serial_out(up, UART_LCR, old_lcr); return count; } /* * Read UART ID using the divisor method - set DLL and DLM to zero * and the revision will be in DLL and device type in DLM. We * preserve the device state across this. */ static unsigned int autoconfig_read_divisor_id(struct uart_8250_port *p) { unsigned char old_lcr; unsigned int id, old_dl; old_lcr = serial_in(p, UART_LCR); serial_out(p, UART_LCR, UART_LCR_CONF_MODE_A); old_dl = serial_dl_read(p); serial_dl_write(p, 0); id = serial_dl_read(p); serial_dl_write(p, old_dl); serial_out(p, UART_LCR, old_lcr); return id; } /* * This is a helper routine to autodetect StarTech/Exar/Oxsemi UART's. * When this function is called we know it is at least a StarTech * 16650 V2, but it might be one of several StarTech UARTs, or one of * its clones. (We treat the broken original StarTech 16650 V1 as a * 16550, and why not? Startech doesn't seem to even acknowledge its * existence.) * * What evil have men's minds wrought... */ static void autoconfig_has_efr(struct uart_8250_port *up) { unsigned int id1, id2, id3, rev; /* * Everything with an EFR has SLEEP */ up->capabilities |= UART_CAP_EFR | UART_CAP_SLEEP; /* * First we check to see if it's an Oxford Semiconductor UART. * * If we have to do this here because some non-National * Semiconductor clone chips lock up if you try writing to the * LSR register (which serial_icr_read does) */ /* * Check for Oxford Semiconductor 16C950. * * EFR [4] must be set else this test fails. * * This shouldn't be necessary, but Mike Hudson (Exoray@isys.ca) * claims that it's needed for 952 dual UART's (which are not * recommended for new designs). */ up->acr = 0; serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); serial_out(up, UART_EFR, UART_EFR_ECB); serial_out(up, UART_LCR, 0x00); id1 = serial_icr_read(up, UART_ID1); id2 = serial_icr_read(up, UART_ID2); id3 = serial_icr_read(up, UART_ID3); rev = serial_icr_read(up, UART_REV); if (id1 == 0x16 && id2 == 0xC9 && (id3 == 0x50 || id3 == 0x52 || id3 == 0x54)) { up->port.type = PORT_16C950; /* * Enable work around for the Oxford Semiconductor 952 rev B * chip which causes it to seriously miscalculate baud rates * when DLL is 0. */ if (id3 == 0x52 && rev == 0x01) up->bugs |= UART_BUG_QUOT; return; } /* * We check for a XR16C850 by setting DLL and DLM to 0, and then * reading back DLL and DLM. The chip type depends on the DLM * value read back: * 0x10 - XR16C850 and the DLL contains the chip revision. * 0x12 - XR16C2850. * 0x14 - XR16C854. 
*/ id1 = autoconfig_read_divisor_id(up); id2 = id1 >> 8; if (id2 == 0x10 || id2 == 0x12 || id2 == 0x14) { up->port.type = PORT_16850; return; } /* * It wasn't an XR16C850. * * We distinguish between the '654 and the '650 by counting * how many bytes are in the FIFO. I'm using this for now, * since that's the technique that was sent to me in the * serial driver update, but I'm not convinced this works. * I've had problems doing this in the past. -TYT */ if (size_fifo(up) == 64) up->port.type = PORT_16654; else up->port.type = PORT_16650V2; } /* * We detected a chip without a FIFO. Only two fall into * this category - the original 8250 and the 16450. The * 16450 has a scratch register (accessible with LCR=0) */ static void autoconfig_8250(struct uart_8250_port *up) { unsigned char scratch, status1, status2; up->port.type = PORT_8250; scratch = serial_in(up, UART_SCR); serial_out(up, UART_SCR, 0xa5); status1 = serial_in(up, UART_SCR); serial_out(up, UART_SCR, 0x5a); status2 = serial_in(up, UART_SCR); serial_out(up, UART_SCR, scratch); if (status1 == 0xa5 && status2 == 0x5a) up->port.type = PORT_16450; } static int broken_efr(struct uart_8250_port *up) { /* * Exar ST16C2550 "A2" devices incorrectly detect as * having an EFR, and report an ID of 0x0201. See * http://linux.derkeiler.com/Mailing-Lists/Kernel/2004-11/4812.html */ if (autoconfig_read_divisor_id(up) == 0x0201 && size_fifo(up) == 16) return 1; return 0; } /* * We know that the chip has FIFOs. Does it have an EFR? The * EFR is located in the same register position as the IIR and * we know the top two bits of the IIR are currently set. The * EFR should contain zero. Try to read the EFR. */ static void autoconfig_16550a(struct uart_8250_port *up) { unsigned char status1, status2; unsigned int iersave; /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&up->port.lock); up->port.type = PORT_16550A; up->capabilities |= UART_CAP_FIFO; if (!IS_ENABLED(CONFIG_SERIAL_8250_16550A_VARIANTS) && !(up->port.flags & UPF_FULL_PROBE)) return; /* * Check for presence of the EFR when DLAB is set. * Only ST16C650V1 UARTs pass this test. */ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); if (serial_in(up, UART_EFR) == 0) { serial_out(up, UART_EFR, 0xA8); if (serial_in(up, UART_EFR) != 0) { up->port.type = PORT_16650; up->capabilities |= UART_CAP_EFR | UART_CAP_SLEEP; } else { serial_out(up, UART_LCR, 0); serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR7_64BYTE); status1 = serial_in(up, UART_IIR) & UART_IIR_FIFO_ENABLED_16750; serial_out(up, UART_FCR, 0); serial_out(up, UART_LCR, 0); if (status1 == UART_IIR_FIFO_ENABLED_16750) up->port.type = PORT_16550A_FSL64; } serial_out(up, UART_EFR, 0); return; } /* * Maybe it requires 0xbf to be written to the LCR. * (other ST16C650V2 UARTs, TI16C752A, etc) */ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); if (serial_in(up, UART_EFR) == 0 && !broken_efr(up)) { autoconfig_has_efr(up); return; } /* * Check for a National Semiconductor SuperIO chip. * Attempt to switch to bank 2, read the value of the LOOP bit * from EXCR1. Switch back to bank 0, change it in MCR. Then * switch back to bank 2, read it from EXCR1 again and check * it's changed. If so, set baud_base in EXCR2 to 921600. 
-- dwmw2 */ serial_out(up, UART_LCR, 0); status1 = serial8250_in_MCR(up); serial_out(up, UART_LCR, 0xE0); status2 = serial_in(up, 0x02); /* EXCR1 */ if (!((status2 ^ status1) & UART_MCR_LOOP)) { serial_out(up, UART_LCR, 0); serial8250_out_MCR(up, status1 ^ UART_MCR_LOOP); serial_out(up, UART_LCR, 0xE0); status2 = serial_in(up, 0x02); /* EXCR1 */ serial_out(up, UART_LCR, 0); serial8250_out_MCR(up, status1); if ((status2 ^ status1) & UART_MCR_LOOP) { unsigned short quot; serial_out(up, UART_LCR, 0xE0); quot = serial_dl_read(up); quot <<= 3; if (ns16550a_goto_highspeed(up)) serial_dl_write(up, quot); serial_out(up, UART_LCR, 0); up->port.uartclk = 921600*16; up->port.type = PORT_NS16550A; up->capabilities |= UART_NATSEMI; return; } } /* * No EFR. Try to detect a TI16750, which only sets bit 5 of * the IIR when 64 byte FIFO mode is enabled when DLAB is set. * Try setting it with and without DLAB set. Cheap clones * set bit 5 without DLAB set. */ serial_out(up, UART_LCR, 0); serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR7_64BYTE); status1 = serial_in(up, UART_IIR) & UART_IIR_FIFO_ENABLED_16750; serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO); serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR7_64BYTE); status2 = serial_in(up, UART_IIR) & UART_IIR_FIFO_ENABLED_16750; serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO); serial_out(up, UART_LCR, 0); if (status1 == UART_IIR_FIFO_ENABLED_16550A && status2 == UART_IIR_FIFO_ENABLED_16750) { up->port.type = PORT_16750; up->capabilities |= UART_CAP_AFE | UART_CAP_SLEEP; return; } /* * Try writing and reading the UART_IER_UUE bit (b6). * If it works, this is probably one of the Xscale platform's * internal UARTs. * We're going to explicitly set the UUE bit to 0 before * trying to write and read a 1 just to make sure it's not * already a 1 and maybe locked there before we even start. */ iersave = serial_in(up, UART_IER); serial_out(up, UART_IER, iersave & ~UART_IER_UUE); if (!(serial_in(up, UART_IER) & UART_IER_UUE)) { /* * OK it's in a known zero state, try writing and reading * without disturbing the current state of the other bits. */ serial_out(up, UART_IER, iersave | UART_IER_UUE); if (serial_in(up, UART_IER) & UART_IER_UUE) { /* * It's an Xscale. * We'll leave the UART_IER_UUE bit set to 1 (enabled). */ up->port.type = PORT_XSCALE; up->capabilities |= UART_CAP_UUE | UART_CAP_RTOIE; return; } } serial_out(up, UART_IER, iersave); /* * We distinguish between 16550A and U6 16550A by counting * how many bytes are in the FIFO. */ if (up->port.type == PORT_16550A && size_fifo(up) == 64) { up->port.type = PORT_U6_16550A; up->capabilities |= UART_CAP_AFE; } } /* * This routine is called by rs_init() to initialize a specific serial * port. It determines what type of UART chip this serial port is * using: 8250, 16450, 16550, 16550A. The important question is * whether or not this UART is a 16550A or not, since this will * determine whether or not we can use its FIFO features or not. */ static void autoconfig(struct uart_8250_port *up) { unsigned char status1, scratch, scratch2, scratch3; unsigned char save_lcr, save_mcr; struct uart_port *port = &up->port; unsigned long flags; unsigned int old_capabilities; if (!port->iobase && !port->mapbase && !port->membase) return; /* * We really do need global IRQs disabled here - we're going to * be frobbing the chips IRQ enable register to see if it exists. * * Synchronize UART_IER access against the console. 
*/ uart_port_lock_irqsave(port, &flags); up->capabilities = 0; up->bugs = 0; if (!(port->flags & UPF_BUGGY_UART)) { /* * Do a simple existence test first; if we fail this, * there's no point trying anything else. * * 0x80 is used as a nonsense port to prevent against * false positives due to ISA bus float. The * assumption is that 0x80 is a non-existent port; * which should be safe since include/asm/io.h also * makes this assumption. * * Note: this is safe as long as MCR bit 4 is clear * and the device is in "PC" mode. */ scratch = serial_in(up, UART_IER); serial_out(up, UART_IER, 0); #if defined(__i386__) && defined(CONFIG_HAS_IOPORT) outb(0xff, 0x080); #endif /* * Mask out IER[7:4] bits for test as some UARTs (e.g. TL * 16C754B) allow only to modify them if an EFR bit is set. */ scratch2 = serial_in(up, UART_IER) & UART_IER_ALL_INTR; serial_out(up, UART_IER, UART_IER_ALL_INTR); #if defined(__i386__) && defined(CONFIG_HAS_IOPORT) outb(0, 0x080); #endif scratch3 = serial_in(up, UART_IER) & UART_IER_ALL_INTR; serial_out(up, UART_IER, scratch); if (scratch2 != 0 || scratch3 != UART_IER_ALL_INTR) { /* * We failed; there's nothing here */ uart_port_unlock_irqrestore(port, flags); return; } } save_mcr = serial8250_in_MCR(up); save_lcr = serial_in(up, UART_LCR); /* * Check to see if a UART is really there. Certain broken * internal modems based on the Rockwell chipset fail this * test, because they apparently don't implement the loopback * test mode. So this test is skipped on the COM 1 through * COM 4 ports. This *should* be safe, since no board * manufacturer would be stupid enough to design a board * that conflicts with COM 1-4 --- we hope! */ if (!(port->flags & UPF_SKIP_TEST)) { serial8250_out_MCR(up, UART_MCR_LOOP | UART_MCR_OUT2 | UART_MCR_RTS); status1 = serial_in(up, UART_MSR) & UART_MSR_STATUS_BITS; serial8250_out_MCR(up, save_mcr); if (status1 != (UART_MSR_DCD | UART_MSR_CTS)) { uart_port_unlock_irqrestore(port, flags); return; } } /* * We're pretty sure there's a port here. Lets find out what * type of port it is. The IIR top two bits allows us to find * out if it's 8250 or 16450, 16550, 16550A or later. This * determines what we test for next. * * We also initialise the EFR (if any) to zero for later. The * EFR occupies the same register location as the FCR and IIR. */ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); serial_out(up, UART_EFR, 0); serial_out(up, UART_LCR, 0); serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO); switch (serial_in(up, UART_IIR) & UART_IIR_FIFO_ENABLED) { case UART_IIR_FIFO_ENABLED_8250: autoconfig_8250(up); break; case UART_IIR_FIFO_ENABLED_16550: port->type = PORT_16550; break; case UART_IIR_FIFO_ENABLED_16550A: autoconfig_16550a(up); break; default: port->type = PORT_UNKNOWN; break; } rsa_autoconfig(up); serial_out(up, UART_LCR, save_lcr); port->fifosize = uart_config[up->port.type].fifo_size; old_capabilities = up->capabilities; up->capabilities = uart_config[port->type].flags; up->tx_loadsz = uart_config[port->type].tx_loadsz; if (port->type != PORT_UNKNOWN) { /* * Reset the UART. 
*/ rsa_reset(up); serial8250_out_MCR(up, save_mcr); serial8250_clear_fifos(up); serial_in(up, UART_RX); serial8250_clear_IER(up); } uart_port_unlock_irqrestore(port, flags); /* * Check if the device is a Fintek F81216A */ if (port->type == PORT_16550A && port->iotype == UPIO_PORT) fintek_8250_probe(up); if (up->capabilities != old_capabilities) { dev_warn(port->dev, "detected caps %08x should be %08x\n", old_capabilities, up->capabilities); } } static void autoconfig_irq(struct uart_8250_port *up) { struct uart_port *port = &up->port; unsigned char save_mcr, save_ier; unsigned char save_ICP = 0; unsigned int ICP = 0; unsigned long irqs; int irq; if (port->flags & UPF_FOURPORT) { ICP = (port->iobase & 0xfe0) | 0x1f; save_ICP = inb_p(ICP); outb_p(0x80, ICP); inb_p(ICP); } /* forget possible initially masked and pending IRQ */ probe_irq_off(probe_irq_on()); save_mcr = serial8250_in_MCR(up); /* Synchronize UART_IER access against the console. */ scoped_guard(uart_port_lock_irq, port) save_ier = serial_in(up, UART_IER); serial8250_out_MCR(up, UART_MCR_OUT1 | UART_MCR_OUT2); irqs = probe_irq_on(); serial8250_out_MCR(up, 0); udelay(10); if (port->flags & UPF_FOURPORT) { serial8250_out_MCR(up, UART_MCR_DTR | UART_MCR_RTS); } else { serial8250_out_MCR(up, UART_MCR_DTR | UART_MCR_RTS | UART_MCR_OUT2); } /* Synchronize UART_IER access against the console. */ scoped_guard(uart_port_lock_irq, port) serial_out(up, UART_IER, UART_IER_ALL_INTR); serial8250_clear_interrupts(port); serial_out(up, UART_TX, 0xFF); udelay(20); irq = probe_irq_off(irqs); serial8250_out_MCR(up, save_mcr); /* Synchronize UART_IER access against the console. */ scoped_guard(uart_port_lock_irq, port) serial_out(up, UART_IER, save_ier); if (port->flags & UPF_FOURPORT) outb_p(save_ICP, ICP); port->irq = (irq > 0) ? irq : 0; } static void serial8250_stop_rx(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&port->lock); guard(serial8250_rpm)(up); up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); serial_port_out(port, UART_IER, up->ier); } /** * serial8250_em485_stop_tx() - generic ->rs485_stop_tx() callback * @p: uart 8250 port * @toggle_ier: true to allow enabling receive interrupts * * Generic callback usable by 8250 uart drivers to stop rs485 transmission. */ void serial8250_em485_stop_tx(struct uart_8250_port *p, bool toggle_ier) { unsigned char mcr = serial8250_in_MCR(p); /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&p->port.lock); if (p->port.rs485.flags & SER_RS485_RTS_AFTER_SEND) mcr |= UART_MCR_RTS; else mcr &= ~UART_MCR_RTS; serial8250_out_MCR(p, mcr); /* * Empty the RX FIFO, we are not interested in anything * received during the half-duplex transmission. * Enable previously disabled RX interrupts. 
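 * Both steps are skipped when SER_RS485_RX_DURING_TX is set, and the RX
 * interrupts are only re-enabled when @toggle_ier is true.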
*/ if (!(p->port.rs485.flags & SER_RS485_RX_DURING_TX)) { serial8250_clear_and_reinit_fifos(p); if (toggle_ier) { p->ier |= UART_IER_RLSI | UART_IER_RDI; serial_port_out(&p->port, UART_IER, p->ier); } } } EXPORT_SYMBOL_GPL(serial8250_em485_stop_tx); static enum hrtimer_restart serial8250_em485_handle_stop_tx(struct hrtimer *t) { struct uart_8250_em485 *em485 = container_of(t, struct uart_8250_em485, stop_tx_timer); struct uart_8250_port *p = em485->port; guard(serial8250_rpm)(p); guard(uart_port_lock_irqsave)(&p->port); if (em485->active_timer == &em485->stop_tx_timer) { p->rs485_stop_tx(p, true); em485->active_timer = NULL; em485->tx_stopped = true; } return HRTIMER_NORESTART; } static void start_hrtimer_ms(struct hrtimer *hrt, unsigned long msec) { hrtimer_start(hrt, ms_to_ktime(msec), HRTIMER_MODE_REL); } static void __stop_tx_rs485(struct uart_8250_port *p, u64 stop_delay) { struct uart_8250_em485 *em485 = p->em485; /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&p->port.lock); stop_delay += (u64)p->port.rs485.delay_rts_after_send * NSEC_PER_MSEC; /* * rs485_stop_tx() is going to set RTS according to config * AND flush RX FIFO if required. */ if (stop_delay > 0) { em485->active_timer = &em485->stop_tx_timer; hrtimer_start(&em485->stop_tx_timer, ns_to_ktime(stop_delay), HRTIMER_MODE_REL); } else { p->rs485_stop_tx(p, true); em485->active_timer = NULL; em485->tx_stopped = true; } } static inline void __stop_tx(struct uart_8250_port *p) { struct uart_8250_em485 *em485 = p->em485; if (em485) { u16 lsr = serial_lsr_in(p); u64 stop_delay = 0; if (!(lsr & UART_LSR_THRE)) return; /* * To provide required timing and allow FIFO transfer, * __stop_tx_rs485() must be called only when both FIFO and * shift register are empty. The device driver should either * enable interrupt on TEMT or set UART_CAP_NOTEMT that will * enlarge stop_tx_timer by the tx time of one frame to cover * for emptying of the shift register. */ if (!(lsr & UART_LSR_TEMT)) { if (!(p->capabilities & UART_CAP_NOTEMT)) return; /* * RTS might get deasserted too early with the normal * frame timing formula. It seems to suggest THRE might * get asserted already during tx of the stop bit * rather than after it is fully sent. * Roughly estimate 1 extra bit here with / 7. */ stop_delay = p->port.frame_time + DIV_ROUND_UP(p->port.frame_time, 7); } __stop_tx_rs485(p, stop_delay); } if (serial8250_clear_THRI(p)) serial8250_rpm_put_tx(p); } static void serial8250_stop_tx(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); guard(serial8250_rpm)(up); __stop_tx(up); /* * We really want to stop the transmitter from sending. */ if (port->type == PORT_16C950) { up->acr |= UART_ACR_TXDIS; serial_icr_write(up, UART_ACR, up->acr); } } static inline void __start_tx(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); if (up->dma && !up->dma->tx_dma(up)) return; if (serial8250_set_THRI(up)) { if (up->bugs & UART_BUG_TXEN) { u16 lsr = serial_lsr_in(up); if (lsr & UART_LSR_THRE) serial8250_tx_chars(up); } } /* * Re-enable the transmitter if we disabled it. */ if (port->type == PORT_16C950 && up->acr & UART_ACR_TXDIS) { up->acr &= ~UART_ACR_TXDIS; serial_icr_write(up, UART_ACR, up->acr); } } /** * serial8250_em485_start_tx() - generic ->rs485_start_tx() callback * @up: uart 8250 port * @toggle_ier: true to allow disabling receive interrupts * * Generic callback usable by 8250 uart drivers to start rs485 transmission. 
* Assumes that setting the RTS bit in the MCR register means RTS is high. * (Some chips use inverse semantics.) Further assumes that reception is * stoppable by disabling the UART_IER_RDI interrupt. (Some chips set the * UART_LSR_DR bit even when UART_IER_RDI is disabled, foiling this approach.) */ void serial8250_em485_start_tx(struct uart_8250_port *up, bool toggle_ier) { unsigned char mcr = serial8250_in_MCR(up); if (!(up->port.rs485.flags & SER_RS485_RX_DURING_TX) && toggle_ier) serial8250_stop_rx(&up->port); if (up->port.rs485.flags & SER_RS485_RTS_ON_SEND) mcr |= UART_MCR_RTS; else mcr &= ~UART_MCR_RTS; serial8250_out_MCR(up, mcr); } EXPORT_SYMBOL_GPL(serial8250_em485_start_tx); /* Returns false, if start_tx_timer was setup to defer TX start */ static bool start_tx_rs485(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); struct uart_8250_em485 *em485 = up->em485; /* * While serial8250_em485_handle_stop_tx() is a noop if * em485->active_timer != &em485->stop_tx_timer, it might happen that * the timer is still armed and triggers only after the current bunch of * chars is send and em485->active_timer == &em485->stop_tx_timer again. * So cancel the timer. There is still a theoretical race condition if * the timer is already running and only comes around to check for * em485->active_timer when &em485->stop_tx_timer is armed again. */ if (em485->active_timer == &em485->stop_tx_timer) hrtimer_try_to_cancel(&em485->stop_tx_timer); em485->active_timer = NULL; if (em485->tx_stopped) { em485->tx_stopped = false; up->rs485_start_tx(up, true); if (up->port.rs485.delay_rts_before_send > 0) { em485->active_timer = &em485->start_tx_timer; start_hrtimer_ms(&em485->start_tx_timer, up->port.rs485.delay_rts_before_send); return false; } } return true; } static enum hrtimer_restart serial8250_em485_handle_start_tx(struct hrtimer *t) { struct uart_8250_em485 *em485 = container_of(t, struct uart_8250_em485, start_tx_timer); struct uart_8250_port *p = em485->port; guard(uart_port_lock_irqsave)(&p->port); if (em485->active_timer == &em485->start_tx_timer) { __start_tx(&p->port); em485->active_timer = NULL; } return HRTIMER_NORESTART; } static void serial8250_start_tx(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); struct uart_8250_em485 *em485 = up->em485; /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&port->lock); if (!port->x_char && kfifo_is_empty(&port->state->port.xmit_fifo)) return; serial8250_rpm_get_tx(up); if (em485) { if ((em485->active_timer == &em485->start_tx_timer) || !start_tx_rs485(port)) return; } __start_tx(port); } static void serial8250_throttle(struct uart_port *port) { port->throttle(port); } static void serial8250_unthrottle(struct uart_port *port) { port->unthrottle(port); } static void serial8250_disable_ms(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&port->lock); /* no MSR capabilities */ if (up->bugs & UART_BUG_NOMSR) return; mctrl_gpio_disable_ms_no_sync(up->gpios); up->ier &= ~UART_IER_MSI; serial_port_out(port, UART_IER, up->ier); } static void serial8250_enable_ms(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); /* Port locked to synchronize UART_IER access against the console. 
*/ lockdep_assert_held_once(&port->lock); /* no MSR capabilities */ if (up->bugs & UART_BUG_NOMSR) return; mctrl_gpio_enable_ms(up->gpios); up->ier |= UART_IER_MSI; guard(serial8250_rpm)(up); serial_port_out(port, UART_IER, up->ier); } void serial8250_read_char(struct uart_8250_port *up, u16 lsr) { struct uart_port *port = &up->port; u8 ch, flag = TTY_NORMAL; if (likely(lsr & UART_LSR_DR)) ch = serial_in(up, UART_RX); else /* * Intel 82571 has a Serial Over Lan device that will * set UART_LSR_BI without setting UART_LSR_DR when * it receives a break. To avoid reading from the * receive buffer without UART_LSR_DR bit set, we * just force the read character to be 0 */ ch = 0; port->icount.rx++; lsr |= up->lsr_saved_flags; up->lsr_saved_flags = 0; if (unlikely(lsr & UART_LSR_BRK_ERROR_BITS)) { if (lsr & UART_LSR_BI) { lsr &= ~(UART_LSR_FE | UART_LSR_PE); port->icount.brk++; /* * We do the SysRQ and SAK checking * here because otherwise the break * may get masked by ignore_status_mask * or read_status_mask. */ if (uart_handle_break(port)) return; } else if (lsr & UART_LSR_PE) port->icount.parity++; else if (lsr & UART_LSR_FE) port->icount.frame++; if (lsr & UART_LSR_OE) port->icount.overrun++; /* * Mask off conditions which should be ignored. */ lsr &= port->read_status_mask; if (lsr & UART_LSR_BI) { dev_dbg(port->dev, "handling break\n"); flag = TTY_BREAK; } else if (lsr & UART_LSR_PE) flag = TTY_PARITY; else if (lsr & UART_LSR_FE) flag = TTY_FRAME; } if (uart_prepare_sysrq_char(port, ch)) return; uart_insert_char(port, lsr, UART_LSR_OE, ch, flag); } EXPORT_SYMBOL_GPL(serial8250_read_char); /* * serial8250_rx_chars - Read characters. The first LSR value must be passed in. * * Returns LSR bits. The caller should rely only on non-Rx related LSR bits * (such as THRE) because the LSR value might come from an already consumed * character. */ u16 serial8250_rx_chars(struct uart_8250_port *up, u16 lsr) { struct uart_port *port = &up->port; int max_count = 256; do { serial8250_read_char(up, lsr); if (--max_count == 0) break; lsr = serial_in(up, UART_LSR); } while (lsr & (UART_LSR_DR | UART_LSR_BI)); tty_flip_buffer_push(&port->state->port); return lsr; } EXPORT_SYMBOL_GPL(serial8250_rx_chars); void serial8250_tx_chars(struct uart_8250_port *up) { struct uart_port *port = &up->port; struct tty_port *tport = &port->state->port; int count; if (port->x_char) { uart_xchar_out(port, UART_TX); return; } if (uart_tx_stopped(port)) { serial8250_stop_tx(port); return; } if (kfifo_is_empty(&tport->xmit_fifo)) { __stop_tx(up); return; } count = up->tx_loadsz; do { unsigned char c; if (!uart_fifo_get(port, &c)) break; serial_out(up, UART_TX, c); if (up->bugs & UART_BUG_TXRACE) { /* * The Aspeed BMC virtual UARTs have a bug where data * may get stuck in the BMC's Tx FIFO from bursts of * writes on the APB interface. * * Delay back-to-back writes by a read cycle to avoid * stalling the VUART. Read a register that won't have * side-effects and discard the result. */ serial_in(up, UART_SCR); } if ((up->capabilities & UART_CAP_HFIFO) && !uart_lsr_tx_empty(serial_in(up, UART_LSR))) break; /* The BCM2835 MINI UART THRE bit is really a not-full bit. */ if ((up->capabilities & UART_CAP_MINI) && !(serial_in(up, UART_LSR) & UART_LSR_THRE)) break; } while (--count > 0); if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS) uart_write_wakeup(port); /* * With RPM enabled, we have to wait until the FIFO is empty before the * HW can go idle. 
So we get here once again with empty FIFO and disable * the interrupt and RPM in __stop_tx() */ if (kfifo_is_empty(&tport->xmit_fifo) && !(up->capabilities & UART_CAP_RPM)) __stop_tx(up); } EXPORT_SYMBOL_GPL(serial8250_tx_chars); /* Caller holds uart port lock */ unsigned int serial8250_modem_status(struct uart_8250_port *up) { struct uart_port *port = &up->port; unsigned int status = serial_in(up, UART_MSR); status |= up->msr_saved_flags; up->msr_saved_flags = 0; if (status & UART_MSR_ANY_DELTA && up->ier & UART_IER_MSI && port->state != NULL) { if (status & UART_MSR_TERI) port->icount.rng++; if (status & UART_MSR_DDSR) port->icount.dsr++; if (status & UART_MSR_DDCD) uart_handle_dcd_change(port, status & UART_MSR_DCD); if (status & UART_MSR_DCTS) uart_handle_cts_change(port, status & UART_MSR_CTS); wake_up_interruptible(&port->state->port.delta_msr_wait); } return status; } EXPORT_SYMBOL_GPL(serial8250_modem_status); static bool handle_rx_dma(struct uart_8250_port *up, unsigned int iir) { switch (iir & 0x3f) { case UART_IIR_THRI: /* * Postpone DMA or not decision to IIR_RDI or IIR_RX_TIMEOUT * because it's impossible to do an informed decision about * that with IIR_THRI. * * This also fixes one known DMA Rx corruption issue where * DR is asserted but DMA Rx only gets a corrupted zero byte * (too early DR?). */ return false; case UART_IIR_RDI: if (!up->dma->rx_running) break; fallthrough; case UART_IIR_RLSI: case UART_IIR_RX_TIMEOUT: serial8250_rx_dma_flush(up); return true; } return up->dma->rx_dma(up); } /* * Context: port's lock must be held by the caller. */ void serial8250_handle_irq_locked(struct uart_port *port, unsigned int iir) { struct uart_8250_port *up = up_to_u8250p(port); struct tty_port *tport = &port->state->port; bool skip_rx = false; u16 status; lockdep_assert_held_once(&port->lock); status = serial_lsr_in(up); /* * If port is stopped and there are no error conditions in the * FIFO, then don't drain the FIFO, as this may lead to TTY buffer * overflow. Not servicing, RX FIFO would trigger auto HW flow * control when FIFO occupancy reaches preset threshold, thus * halting RX. This only works when auto HW flow control is * available. */ if (!(status & (UART_LSR_FIFOE | UART_LSR_BRK_ERROR_BITS)) && (port->status & (UPSTAT_AUTOCTS | UPSTAT_AUTORTS)) && !(up->ier & (UART_IER_RLSI | UART_IER_RDI))) skip_rx = true; if (status & (UART_LSR_DR | UART_LSR_BI) && !skip_rx) { struct irq_data *d; d = irq_get_irq_data(port->irq); if (d && irqd_is_wakeup_set(d)) pm_wakeup_event(tport->tty->dev, 0); if (!up->dma || handle_rx_dma(up, iir)) status = serial8250_rx_chars(up, status); } serial8250_modem_status(up); if ((status & UART_LSR_THRE) && (up->ier & UART_IER_THRI)) { if (!up->dma || up->dma->tx_err) serial8250_tx_chars(up); else if (!up->dma->tx_running) __stop_tx(up); } } EXPORT_SYMBOL_NS_GPL(serial8250_handle_irq_locked, "SERIAL_8250"); /* * This handles the interrupt from one port. 
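 * Returns 0 if the IIR indicates there is no pending interrupt, 1 after
 * the interrupt has been handled.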
*/ int serial8250_handle_irq(struct uart_port *port, unsigned int iir) { if (iir & UART_IIR_NO_INT) return 0; guard(uart_port_lock_irqsave)(port); serial8250_handle_irq_locked(port, iir); return 1; } EXPORT_SYMBOL_GPL(serial8250_handle_irq); static int serial8250_default_handle_irq(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); unsigned int iir; guard(serial8250_rpm)(up); iir = serial_port_in(port, UART_IIR); return serial8250_handle_irq(port, iir); } /* * Newer 16550 compatible parts such as the SC16C650 & Altera 16550 Soft IP * have a programmable TX threshold that triggers the THRE interrupt in * the IIR register. In this case, the THRE interrupt indicates the FIFO * has space available. Load it up with tx_loadsz bytes. */ static int serial8250_tx_threshold_handle_irq(struct uart_port *port) { unsigned int iir = serial_port_in(port, UART_IIR); /* TX Threshold IRQ triggered so load up FIFO */ if ((iir & UART_IIR_ID) == UART_IIR_THRI) { struct uart_8250_port *up = up_to_u8250p(port); guard(uart_port_lock_irqsave)(port); serial8250_tx_chars(up); } iir = serial_port_in(port, UART_IIR); return serial8250_handle_irq(port, iir); } static unsigned int serial8250_tx_empty(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); guard(serial8250_rpm)(up); guard(uart_port_lock_irqsave)(port); if (!serial8250_tx_dma_running(up) && uart_lsr_tx_empty(serial_lsr_in(up))) return TIOCSER_TEMT; return 0; } unsigned int serial8250_do_get_mctrl(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); unsigned int status; unsigned int val; scoped_guard(serial8250_rpm, up) status = serial8250_modem_status(up); val = serial8250_MSR_to_TIOCM(status); if (up->gpios) return mctrl_gpio_get(up->gpios, &val); return val; } EXPORT_SYMBOL_GPL(serial8250_do_get_mctrl); static unsigned int serial8250_get_mctrl(struct uart_port *port) { if (port->get_mctrl) return port->get_mctrl(port); return serial8250_do_get_mctrl(port); } void serial8250_do_set_mctrl(struct uart_port *port, unsigned int mctrl) { struct uart_8250_port *up = up_to_u8250p(port); unsigned char mcr; mcr = serial8250_TIOCM_to_MCR(mctrl); mcr |= up->mcr; serial8250_out_MCR(up, mcr); } EXPORT_SYMBOL_GPL(serial8250_do_set_mctrl); static void serial8250_set_mctrl(struct uart_port *port, unsigned int mctrl) { if (port->rs485.flags & SER_RS485_ENABLED) return; if (port->set_mctrl) port->set_mctrl(port, mctrl); else serial8250_do_set_mctrl(port, mctrl); } static void serial8250_break_ctl(struct uart_port *port, int break_state) { struct uart_8250_port *up = up_to_u8250p(port); guard(serial8250_rpm)(up); guard(uart_port_lock_irqsave)(port); if (break_state == -1) up->lcr |= UART_LCR_SBC; else up->lcr &= ~UART_LCR_SBC; serial_port_out(port, UART_LCR, up->lcr); } /* Returns true if @bits were set, false on timeout */ static bool wait_for_lsr(struct uart_8250_port *up, int bits) { unsigned int status, tmout; /* * Wait for a character to be sent. Fallback to a safe default * timeout value if @frame_time is not available. 
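 * Two frame times (converted to microseconds) are allowed when
 * @frame_time is known; otherwise a 10 ms budget is used.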
*/ if (up->port.frame_time) tmout = up->port.frame_time * 2 / NSEC_PER_USEC; else tmout = 10000; for (;;) { status = serial_lsr_in(up); if ((status & bits) == bits) break; if (--tmout == 0) break; udelay(1); touch_nmi_watchdog(); } return (tmout != 0); } /* Wait for transmitter and holding register to empty with timeout */ static void wait_for_xmitr(struct uart_8250_port *up, int bits) { unsigned int tmout; wait_for_lsr(up, bits); /* Wait up to 1s for flow control if necessary */ if (up->port.flags & UPF_CONS_FLOW) { for (tmout = 1000000; tmout; tmout--) { unsigned int msr = serial_in(up, UART_MSR); up->msr_saved_flags |= msr & MSR_SAVE_FLAGS; if (msr & UART_MSR_CTS) break; udelay(1); touch_nmi_watchdog(); } } } #ifdef CONFIG_CONSOLE_POLL /* * Console polling routines for writing and reading from the uart while * in an interrupt or debug context. */ static int serial8250_get_poll_char(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); u16 lsr; guard(serial8250_rpm)(up); lsr = serial_port_in(port, UART_LSR); if (!(lsr & UART_LSR_DR)) return NO_POLL_CHAR; return serial_port_in(port, UART_RX); } static void serial8250_put_poll_char(struct uart_port *port, unsigned char c) { unsigned int ier; struct uart_8250_port *up = up_to_u8250p(port); /* * Normally the port is locked to synchronize UART_IER access * against the console. However, this function is only used by * KDB/KGDB, where it may not be possible to acquire the port * lock because all other CPUs are quiesced. The quiescence * should allow safe lockless usage here. */ guard(serial8250_rpm)(up); /* * First save the IER then disable the interrupts */ ier = serial_port_in(port, UART_IER); serial8250_clear_IER(up); wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); /* * Send the character out. */ serial_port_out(port, UART_TX, c); /* * Finally, wait for transmitter to become empty * and restore the IER */ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); serial_port_out(port, UART_IER, ier); } #endif /* CONFIG_CONSOLE_POLL */ static void serial8250_startup_special(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); switch (port->type) { case PORT_16C950: { /* * Wake up and initialize UART * * Synchronize UART_IER access against the console. */ guard(uart_port_lock_irqsave)(port); up->acr = 0; serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); serial_port_out(port, UART_EFR, UART_EFR_ECB); serial_port_out(port, UART_IER, 0); serial_port_out(port, UART_LCR, 0); serial_icr_write(up, UART_CSR, 0); /* Reset the UART */ serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); serial_port_out(port, UART_EFR, UART_EFR_ECB); serial_port_out(port, UART_LCR, 0); break; } case PORT_DA830: /* * Reset the port * * Synchronize UART_IER access against the console. 
*/ scoped_guard(uart_port_lock_irqsave, port) { serial_port_out(port, UART_IER, 0); serial_port_out(port, UART_DA830_PWREMU_MGMT, 0); } mdelay(10); /* Enable Tx, Rx and free run mode */ serial_port_out(port, UART_DA830_PWREMU_MGMT, UART_DA830_PWREMU_MGMT_UTRST | UART_DA830_PWREMU_MGMT_URRST | UART_DA830_PWREMU_MGMT_FREE); break; case PORT_RSA: rsa_enable(up); break; } } static void serial8250_set_TRG_levels(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); switch (port->type) { /* For a XR16C850, we need to set the trigger levels */ case PORT_16850: { u8 fctr; serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); fctr = serial_in(up, UART_FCTR) & ~(UART_FCTR_RX|UART_FCTR_TX); fctr |= UART_FCTR_TRGD; serial_port_out(port, UART_FCTR, fctr | UART_FCTR_RX); serial_port_out(port, UART_TRG, UART_TRG_96); serial_port_out(port, UART_FCTR, fctr | UART_FCTR_TX); serial_port_out(port, UART_TRG, UART_TRG_96); serial_port_out(port, UART_LCR, 0); break; } /* For the Altera 16550 variants, set TX threshold trigger level. */ case PORT_ALTR_16550_F32: case PORT_ALTR_16550_F64: case PORT_ALTR_16550_F128: if (port->fifosize <= 1) return; /* Bounds checking of TX threshold (valid 0 to fifosize-2) */ if (up->tx_loadsz < 2 || up->tx_loadsz > port->fifosize) { dev_err(port->dev, "TX FIFO Threshold errors, skipping\n"); return; } serial_port_out(port, UART_ALTR_AFR, UART_ALTR_EN_TXFIFO_LW); serial_port_out(port, UART_ALTR_TX_LOW, port->fifosize - up->tx_loadsz); port->handle_irq = serial8250_tx_threshold_handle_irq; break; } } static void serial8250_THRE_test(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); bool iir_noint1, iir_noint2; if (!port->irq) return; if (up->port.flags & UPF_NO_THRE_TEST) return; disable_irq(port->irq); /* * Test for UARTs that do not reassert THRE when the transmitter is idle and the interrupt * has already been cleared. Real 16550s should always reassert this interrupt whenever the * transmitter is idle and the interrupt is enabled. Delays are necessary to allow register * changes to become visible. * * Synchronize UART_IER access against the console. */ scoped_guard(uart_port_lock_irqsave, port) { wait_for_xmitr(up, UART_LSR_THRE); serial_port_out_sync(port, UART_IER, UART_IER_THRI); udelay(1); /* allow THRE to set */ iir_noint1 = serial_port_in(port, UART_IIR) & UART_IIR_NO_INT; serial_port_out(port, UART_IER, 0); serial_port_out_sync(port, UART_IER, UART_IER_THRI); udelay(1); /* allow a working UART time to re-assert THRE */ iir_noint2 = serial_port_in(port, UART_IIR) & UART_IIR_NO_INT; serial_port_out(port, UART_IER, 0); } enable_irq(port->irq); /* * If the interrupt is not reasserted, or we otherwise don't trust the iir, setup a timer to * kick the UART on a regular basis. */ if ((!iir_noint1 && iir_noint2) || up->port.flags & UPF_BUG_THRE) up->bugs |= UART_BUG_THRE; } static void serial8250_init_mctrl(struct uart_port *port) { if (port->flags & UPF_FOURPORT) { if (!port->irq) port->mctrl |= TIOCM_OUT1; } else { /* Most PC uarts need OUT2 raised to enable interrupts. */ if (port->irq) port->mctrl |= TIOCM_OUT2; } serial8250_set_mctrl(port, port->mctrl); } static void serial8250_iir_txen_test(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); bool lsr_temt, iir_noint; if (port->quirks & UPQ_NO_TXEN_TEST) return; /* Do a quick test to see if we receive an interrupt when we enable the TX irq. 
*/ serial_port_out(port, UART_IER, UART_IER_THRI); lsr_temt = serial_port_in(port, UART_LSR) & UART_LSR_TEMT; iir_noint = serial_port_in(port, UART_IIR) & UART_IIR_NO_INT; serial_port_out(port, UART_IER, 0); /* * Serial over Lan (SoL) hack: * Intel 8257x Gigabit ethernet chips have a 16550 emulation, to be used for Serial Over * Lan. Those chips take a longer time than a normal serial device to signalize that a * transmission data was queued. Due to that, the above test generally fails. One solution * would be to delay the reading of iir. However, this is not reliable, since the timeout is * variable. So, in case of UPQ_NO_TXEN_TEST, let's just don't test if we receive TX irq. * This way, we'll never enable UART_BUG_TXEN. */ if (lsr_temt && iir_noint) { if (!(up->bugs & UART_BUG_TXEN)) { up->bugs |= UART_BUG_TXEN; dev_dbg(port->dev, "enabling bad tx status workarounds\n"); } return; } /* FIXME: why is this needed? */ up->bugs &= ~UART_BUG_TXEN; } static void serial8250_initialize(struct uart_port *port) { guard(uart_port_lock_irqsave)(port); serial_port_out(port, UART_LCR, UART_LCR_WLEN8); serial8250_init_mctrl(port); serial8250_iir_txen_test(port); } int serial8250_do_startup(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); int retval; if (!port->fifosize) port->fifosize = uart_config[port->type].fifo_size; if (!up->tx_loadsz) up->tx_loadsz = uart_config[port->type].tx_loadsz; if (!up->capabilities) up->capabilities = uart_config[port->type].flags; up->mcr = 0; if (port->iotype != up->cur_iotype) set_io_from_upio(port); guard(serial8250_rpm)(up); serial8250_startup_special(port); /* * Clear the FIFO buffers and disable them. * (they will be reenabled in set_termios()) */ serial8250_clear_fifos(up); serial8250_clear_interrupts(port); /* * At this point, there's no way the LSR could still be 0xff; * if it is, then bail out, because there's likely no UART * here. */ if (!(port->flags & UPF_BUGGY_UART) && (serial_port_in(port, UART_LSR) == 0xff)) { dev_info_ratelimited(port->dev, "LSR safety check engaged!\n"); return -ENODEV; } serial8250_set_TRG_levels(port); /* Check if we need to have shared IRQs */ if (port->irq && (up->port.flags & UPF_SHARE_IRQ)) up->port.irqflags |= IRQF_SHARED; retval = up->ops->setup_irq(up); if (retval) return retval; serial8250_THRE_test(port); up->ops->setup_timer(up); serial8250_initialize(port); /* * Clear the interrupt registers again for luck, and clear the * saved flags to avoid getting false values from polling * routines or the previous session. */ serial8250_clear_interrupts(port); up->lsr_saved_flags = 0; up->msr_saved_flags = 0; /* * Request DMA channels for both RX and TX. */ if (up->dma) { const char *msg = NULL; if (uart_console(port)) msg = "forbid DMA for kernel console"; else if (serial8250_request_dma(up)) msg = "failed to request DMA"; if (msg) { dev_warn_ratelimited(port->dev, "%s\n", msg); up->dma = NULL; } } /* * Set the IER shadow for rx interrupts but defer actual interrupt * enable until after the FIFOs are enabled; otherwise, an already- * active sender can swamp the interrupt handler with "too much work". 
*/ up->ier = UART_IER_RLSI | UART_IER_RDI; if (port->flags & UPF_FOURPORT) { unsigned int icp; /* * Enable interrupts on the AST Fourport board */ icp = (port->iobase & 0xfe0) | 0x01f; outb_p(0x80, icp); inb_p(icp); } return 0; } EXPORT_SYMBOL_GPL(serial8250_do_startup); static int serial8250_startup(struct uart_port *port) { if (port->startup) return port->startup(port); return serial8250_do_startup(port); } void serial8250_do_shutdown(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); u32 lcr; serial8250_rpm_get(up); /* * Disable interrupts from this port * * Synchronize UART_IER access against the console. */ scoped_guard(uart_port_lock_irqsave, port) { up->ier = 0; serial_port_out(port, UART_IER, 0); } synchronize_irq(port->irq); serial8250_release_dma(up); up->dma = NULL; scoped_guard(uart_port_lock_irqsave, port) { if (port->flags & UPF_FOURPORT) { /* reset interrupts on the AST Fourport board */ inb((port->iobase & 0xfe0) | 0x1f); port->mctrl |= TIOCM_OUT1; } else port->mctrl &= ~TIOCM_OUT2; serial8250_set_mctrl(port, port->mctrl); /* Disable break condition */ lcr = serial_port_in(port, UART_LCR); lcr &= ~UART_LCR_SBC; serial_port_out(port, UART_LCR, lcr); } serial8250_clear_fifos(up); rsa_disable(up); /* * Read data port to reset things, and then unlink from * the IRQ chain. */ serial_port_in(port, UART_RX); /* * LCR writes on DW UART can trigger late (unmaskable) IRQs. * Handle them before releasing the handler. */ synchronize_irq(port->irq); serial8250_rpm_put(up); up->ops->release_irq(up); } EXPORT_SYMBOL_GPL(serial8250_do_shutdown); static void serial8250_shutdown(struct uart_port *port) { if (port->shutdown) port->shutdown(port); else serial8250_do_shutdown(port); } static void serial8250_flush_buffer(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); if (up->dma) serial8250_tx_dma_flush(up); } static unsigned int serial8250_do_get_divisor(struct uart_port *port, unsigned int baud) { upf_t magic_multiplier = port->flags & UPF_MAGIC_MULTIPLIER; struct uart_8250_port *up = up_to_u8250p(port); unsigned int quot; /* * Handle magic divisors for baud rates above baud_base on SMSC * Super I/O chips. We clamp custom rates from clk/6 and clk/12 * up to clk/4 (0x8001) and clk/8 (0x8002) respectively. These * magic divisors actually reprogram the baud rate generator's * reference clock derived from chips's 14.318MHz clock input. * * Documentation claims that with these magic divisors the base * frequencies of 7.3728MHz and 3.6864MHz are used respectively * for the extra baud rates of 460800bps and 230400bps rather * than the usual base frequency of 1.8462MHz. However empirical * evidence contradicts that. * * Instead bit 7 of the DLM register (bit 15 of the divisor) is * effectively used as a clock prescaler selection bit for the * base frequency of 7.3728MHz, always used. If set to 0, then * the base frequency is divided by 4 for use by the Baud Rate * Generator, for the usual arrangement where the value of 1 of * the divisor produces the baud rate of 115200bps. Conversely, * if set to 1 and high-speed operation has been enabled with the * Serial Port Mode Register in the Device Configuration Space, * then the base frequency is supplied directly to the Baud Rate * Generator, so for the divisor values of 0x8001, 0x8002, 0x8003, * 0x8004, etc. the respective baud rates produced are 460800bps, * 230400bps, 153600bps, 115200bps, etc. 
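 *
 * A worked example, assuming the usual 1.8432 MHz UART input clock
 * (uartclk = 1843200, 115200 bps base rate): with the 7.3728 MHz
 * reference selected by bit 15, a low-order divisor of 1 gives
 * 7372800 / 16 / 1 = 460800 bps and a divisor of 2 gives 230400 bps,
 * matching the 0x8001/0x8002 values chosen below.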
* * In all cases only low 15 bits of the divisor are used to divide * the baud base and therefore 32767 is the maximum divisor value * possible, even though documentation says that the programmable * Baud Rate Generator is capable of dividing the internal PLL * clock by any divisor from 1 to 65535. */ if (magic_multiplier && baud >= port->uartclk / 6) quot = 0x8001; else if (magic_multiplier && baud >= port->uartclk / 12) quot = 0x8002; else quot = uart_get_divisor(port, baud); /* * Oxford Semi 952 rev B workaround */ if (up->bugs & UART_BUG_QUOT && (quot & 0xff) == 0) quot++; return quot; } static unsigned int serial8250_get_divisor(struct uart_port *port, unsigned int baud, unsigned int *frac) { if (port->get_divisor) return port->get_divisor(port, baud, frac); return serial8250_do_get_divisor(port, baud); } static unsigned char serial8250_compute_lcr(struct uart_8250_port *up, tcflag_t c_cflag) { u8 lcr = UART_LCR_WLEN(tty_get_char_size(c_cflag)); if (c_cflag & CSTOPB) lcr |= UART_LCR_STOP; if (c_cflag & PARENB) lcr |= UART_LCR_PARITY; if (!(c_cflag & PARODD)) lcr |= UART_LCR_EPAR; if (c_cflag & CMSPAR) lcr |= UART_LCR_SPAR; return lcr; } void serial8250_do_set_divisor(struct uart_port *port, unsigned int baud, unsigned int quot) { struct uart_8250_port *up = up_to_u8250p(port); /* Workaround to enable 115200 baud on OMAP1510 internal ports */ if (is_omap1510_8250(up)) { if (baud == 115200) { quot = 1; serial_port_out(port, UART_OMAP_OSC_12M_SEL, 1); } else serial_port_out(port, UART_OMAP_OSC_12M_SEL, 0); } /* * For NatSemi, switch to bank 2 not bank 1, to avoid resetting EXCR2, * otherwise just set DLAB */ if (up->capabilities & UART_NATSEMI) serial_port_out(port, UART_LCR, 0xe0); else serial_port_out(port, UART_LCR, up->lcr | UART_LCR_DLAB); serial_dl_write(up, quot); } EXPORT_SYMBOL_GPL(serial8250_do_set_divisor); static void serial8250_set_divisor(struct uart_port *port, unsigned int baud, unsigned int quot, unsigned int quot_frac) { if (port->set_divisor) port->set_divisor(port, baud, quot, quot_frac); else serial8250_do_set_divisor(port, baud, quot); } static unsigned int serial8250_get_baud_rate(struct uart_port *port, struct ktermios *termios, const struct ktermios *old) { unsigned int tolerance = port->uartclk / 100; unsigned int min; unsigned int max; /* * Handle magic divisors for baud rates above baud_base on SMSC * Super I/O chips. Enable custom rates of clk/4 and clk/8, but * disable divisor values beyond 32767, which are unavailable. */ if (port->flags & UPF_MAGIC_MULTIPLIER) { min = port->uartclk / 16 / UART_DIV_MAX >> 1; max = (port->uartclk + tolerance) / 4; } else { min = port->uartclk / 16 / UART_DIV_MAX; max = (port->uartclk + tolerance) / 16; } /* * Ask the core to calculate the divisor for us. * Allow 1% tolerance at the upper limit so uart clks marginally * slower than nominal still match standard baud rates without * causing transmission errors. */ return uart_get_baud_rate(port, termios, old, min, max); } /* * Note in order to avoid the tty port mutex deadlock don't use the next method * within the uart port callbacks. Primarily it's supposed to be utilized to * handle a sudden reference clock rate change. 
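 *
 * A typical (purely illustrative) caller is a platform driver whose
 * reference clock changed at runtime, e.g. from the work item behind a
 * clk rate-change notifier:
 *
 *	serial8250_update_uartclk(&up->port, clk_get_rate(clk));
 *
 * where "up" and "clk" stand in for that driver's own port and clock
 * handles (not defined here).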
*/ void serial8250_update_uartclk(struct uart_port *port, unsigned int uartclk) { struct tty_port *tport = &port->state->port; scoped_guard(tty_port_tty, tport) { struct tty_struct *tty = scoped_tty(); guard(rwsem_write)(&tty->termios_rwsem); guard(mutex)(&tport->mutex); if (port->uartclk == uartclk) return; port->uartclk = uartclk; if (!tty_port_initialized(tport)) return; serial8250_do_set_termios(port, &tty->termios, NULL); return; } guard(mutex)(&tport->mutex); port->uartclk = uartclk; } EXPORT_SYMBOL_GPL(serial8250_update_uartclk); static void serial8250_set_mini(struct uart_port *port, struct ktermios *termios) { struct uart_8250_port *up = up_to_u8250p(port); if (!(up->capabilities & UART_CAP_MINI)) return; termios->c_cflag &= ~(CSTOPB | PARENB | PARODD | CMSPAR); tcflag_t csize = termios->c_cflag & CSIZE; if (csize == CS5 || csize == CS6) { termios->c_cflag &= ~CSIZE; termios->c_cflag |= CS7; } } static void serial8250_set_trigger_for_slow_speed(struct uart_port *port, struct ktermios *termios, unsigned int baud) { struct uart_8250_port *up = up_to_u8250p(port); if (!(up->capabilities & UART_CAP_FIFO)) return; if (port->fifosize <= 1) return; if (baud >= 2400) return; if (up->dma) return; up->fcr &= ~UART_FCR_TRIGGER_MASK; up->fcr |= UART_FCR_TRIGGER_1; } /* * MCR-based auto flow control. When AFE is enabled, RTS will be deasserted when the receive FIFO * contains more characters than the trigger, or the MCR RTS bit is cleared. */ static void serial8250_set_afe(struct uart_port *port, struct ktermios *termios) { struct uart_8250_port *up = up_to_u8250p(port); if (!(up->capabilities & UART_CAP_AFE)) return; up->mcr &= ~UART_MCR_AFE; if (termios->c_cflag & CRTSCTS) up->mcr |= UART_MCR_AFE; } static void serial8250_set_errors_and_ignores(struct uart_port *port, struct ktermios *termios) { /* * Specify which conditions may be considered for error handling and the ignoring of * characters. The actual ignoring of characters only occurs if the bit is set in * @ignore_status_mask as well. */ port->read_status_mask = UART_LSR_OE | UART_LSR_DR; if (termios->c_iflag & INPCK) port->read_status_mask |= UART_LSR_FE | UART_LSR_PE; if (termios->c_iflag & (IGNBRK | BRKINT | PARMRK)) port->read_status_mask |= UART_LSR_BI; /* Characters to ignore */ port->ignore_status_mask = 0; if (termios->c_iflag & IGNPAR) port->ignore_status_mask |= UART_LSR_PE | UART_LSR_FE; if (termios->c_iflag & IGNBRK) { port->ignore_status_mask |= UART_LSR_BI; /* * If we're ignoring parity and break indicators, ignore overruns too (for real raw * support). */ if (termios->c_iflag & IGNPAR) port->ignore_status_mask |= UART_LSR_OE; } /* ignore all characters if CREAD is not set */ if ((termios->c_cflag & CREAD) == 0) port->ignore_status_mask |= UART_LSR_DR; } static void serial8250_set_ier(struct uart_port *port, struct ktermios *termios) { struct uart_8250_port *up = up_to_u8250p(port); /* CTS flow control flag and modem status interrupts */ up->ier &= ~UART_IER_MSI; if (!(up->bugs & UART_BUG_NOMSR) && UART_ENABLE_MS(&up->port, termios->c_cflag)) up->ier |= UART_IER_MSI; if (up->capabilities & UART_CAP_UUE) up->ier |= UART_IER_UUE; if (up->capabilities & UART_CAP_RTOIE) up->ier |= UART_IER_RTOIE; serial_port_out(port, UART_IER, up->ier); } static void serial8250_set_efr(struct uart_port *port, struct ktermios *termios) { struct uart_8250_port *up = up_to_u8250p(port); u8 efr_reg = UART_EFR; u8 efr = 0; if (!(up->capabilities & UART_CAP_EFR)) return; /* * TI16C752/Startech hardware flow control. 
FIXME: * - TI16C752 requires control thresholds to be set. * - UART_MCR_RTS is ineffective if auto-RTS mode is enabled. */ if (termios->c_cflag & CRTSCTS) efr |= UART_EFR_CTS; if (port->flags & UPF_EXAR_EFR) efr_reg = UART_XR_EFR; serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); serial_port_out(port, efr_reg, efr); } static void serial8250_set_fcr(struct uart_port *port, struct ktermios *termios) { struct uart_8250_port *up = up_to_u8250p(port); bool is_16750 = port->type == PORT_16750; if (is_16750) serial_port_out(port, UART_FCR, up->fcr); /* * LCR DLAB must be reset to enable 64-byte FIFO mode. If the FCR is written without DLAB * set, this mode will be disabled. */ serial_port_out(port, UART_LCR, up->lcr); if (is_16750) return; /* emulated UARTs (Lucent Venus 167x) need two steps */ if (up->fcr & UART_FCR_ENABLE_FIFO) serial_port_out(port, UART_FCR, UART_FCR_ENABLE_FIFO); serial_port_out(port, UART_FCR, up->fcr); } void serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios, const struct ktermios *old) { struct uart_8250_port *up = up_to_u8250p(port); unsigned int baud, quot, frac = 0; u8 lcr; serial8250_set_mini(port, termios); lcr = serial8250_compute_lcr(up, termios->c_cflag); baud = serial8250_get_baud_rate(port, termios, old); quot = serial8250_get_divisor(port, baud, &frac); /* * Ok, we're now changing the port state. Do it with interrupts disabled. * * Synchronize UART_IER access against the console. */ scoped_guard(serial8250_rpm, up) { guard(uart_port_lock_irqsave)(port); up->lcr = lcr; serial8250_set_trigger_for_slow_speed(port, termios, baud); serial8250_set_afe(port, termios); uart_update_timeout(port, termios->c_cflag, baud); serial8250_set_errors_and_ignores(port, termios); serial8250_set_ier(port, termios); serial8250_set_efr(port, termios); serial8250_set_divisor(port, baud, quot, frac); serial8250_set_fcr(port, termios); serial8250_set_mctrl(port, port->mctrl); } /* Don't rewrite B0 */ if (tty_termios_baud_rate(termios)) tty_termios_encode_baud_rate(termios, baud, baud); } EXPORT_SYMBOL(serial8250_do_set_termios); static void serial8250_set_termios(struct uart_port *port, struct ktermios *termios, const struct ktermios *old) { if (port->set_termios) port->set_termios(port, termios, old); else serial8250_do_set_termios(port, termios, old); } void serial8250_do_set_ldisc(struct uart_port *port, struct ktermios *termios) { if (termios->c_line == N_PPS) { port->flags |= UPF_HARDPPS_CD; guard(uart_port_lock_irq)(port); serial8250_enable_ms(port); } else { port->flags &= ~UPF_HARDPPS_CD; if (!UART_ENABLE_MS(port, termios->c_cflag)) { guard(uart_port_lock_irq)(port); serial8250_disable_ms(port); } } } EXPORT_SYMBOL_GPL(serial8250_do_set_ldisc); static void serial8250_set_ldisc(struct uart_port *port, struct ktermios *termios) { if (port->set_ldisc) port->set_ldisc(port, termios); else serial8250_do_set_ldisc(port, termios); } void serial8250_do_pm(struct uart_port *port, unsigned int state, unsigned int oldstate) { struct uart_8250_port *p = up_to_u8250p(port); serial8250_set_sleep(p, state != 0); } EXPORT_SYMBOL(serial8250_do_pm); static void serial8250_pm(struct uart_port *port, unsigned int state, unsigned int oldstate) { if (port->pm) port->pm(port, state, oldstate); else serial8250_do_pm(port, state, oldstate); } static unsigned int serial8250_port_size(struct uart_8250_port *pt) { if (pt->port.mapsize) return pt->port.mapsize; if (is_omap1_8250(pt)) return 0x16 << pt->port.regshift; return 8 << pt->port.regshift; } /* * Resource handling. 
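 *
 * For example, a memory-mapped port with regshift = 2 claims
 * 8 << 2 = 32 bytes of MMIO at port->mapbase via request_mem_region(),
 * and additionally ioremap()s that window when UPF_IOREMAP is set.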
*/ static int serial8250_request_std_resource(struct uart_8250_port *up) { unsigned int size = serial8250_port_size(up); struct uart_port *port = &up->port; switch (port->iotype) { case UPIO_AU: case UPIO_TSI: case UPIO_MEM32: case UPIO_MEM32BE: case UPIO_MEM16: case UPIO_MEM: if (!port->mapbase) return -EINVAL; if (!request_mem_region(port->mapbase, size, "serial")) return -EBUSY; if (port->flags & UPF_IOREMAP) { port->membase = ioremap(port->mapbase, size); if (!port->membase) { release_mem_region(port->mapbase, size); return -ENOMEM; } } return 0; case UPIO_HUB6: case UPIO_PORT: if (!request_region(port->iobase, size, "serial")) return -EBUSY; return 0; case UPIO_UNKNOWN: break; } return 0; } static void serial8250_release_std_resource(struct uart_8250_port *up) { unsigned int size = serial8250_port_size(up); struct uart_port *port = &up->port; switch (port->iotype) { case UPIO_AU: case UPIO_TSI: case UPIO_MEM32: case UPIO_MEM32BE: case UPIO_MEM16: case UPIO_MEM: if (!port->mapbase) break; if (port->flags & UPF_IOREMAP) { iounmap(port->membase); port->membase = NULL; } release_mem_region(port->mapbase, size); break; case UPIO_HUB6: case UPIO_PORT: release_region(port->iobase, size); break; case UPIO_UNKNOWN: break; } } static void serial8250_release_port(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); serial8250_release_std_resource(up); } static int serial8250_request_port(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); return serial8250_request_std_resource(up); } static int fcr_get_rxtrig_bytes(struct uart_8250_port *up) { const struct serial8250_config *conf_type = &uart_config[up->port.type]; unsigned char bytes; bytes = conf_type->rxtrig_bytes[UART_FCR_R_TRIG_BITS(up->fcr)]; return bytes ? 
bytes : -EOPNOTSUPP; } static int bytes_to_fcr_rxtrig(struct uart_8250_port *up, unsigned char bytes) { const struct serial8250_config *conf_type = &uart_config[up->port.type]; int i; if (!conf_type->rxtrig_bytes[UART_FCR_R_TRIG_BITS(UART_FCR_R_TRIG_00)]) return -EOPNOTSUPP; for (i = 1; i < UART_FCR_R_TRIG_MAX_STATE; i++) { if (bytes < conf_type->rxtrig_bytes[i]) /* Use the nearest lower value */ return (--i) << UART_FCR_R_TRIG_SHIFT; } return UART_FCR_R_TRIG_11; } static int do_get_rxtrig(struct tty_port *port) { struct uart_state *state = container_of(port, struct uart_state, port); struct uart_port *uport = state->uart_port; struct uart_8250_port *up = up_to_u8250p(uport); if (!(up->capabilities & UART_CAP_FIFO) || uport->fifosize <= 1) return -EINVAL; return fcr_get_rxtrig_bytes(up); } static int do_serial8250_get_rxtrig(struct tty_port *port) { int rxtrig_bytes; mutex_lock(&port->mutex); rxtrig_bytes = do_get_rxtrig(port); mutex_unlock(&port->mutex); return rxtrig_bytes; } static ssize_t rx_trig_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tty_port *port = dev_get_drvdata(dev); int rxtrig_bytes; rxtrig_bytes = do_serial8250_get_rxtrig(port); if (rxtrig_bytes < 0) return rxtrig_bytes; return sysfs_emit(buf, "%d\n", rxtrig_bytes); } static int do_set_rxtrig(struct tty_port *port, unsigned char bytes) { struct uart_state *state = container_of(port, struct uart_state, port); struct uart_port *uport = state->uart_port; struct uart_8250_port *up = up_to_u8250p(uport); int rxtrig; if (!(up->capabilities & UART_CAP_FIFO) || uport->fifosize <= 1) return -EINVAL; rxtrig = bytes_to_fcr_rxtrig(up, bytes); if (rxtrig < 0) return rxtrig; serial8250_clear_fifos(up); up->fcr &= ~UART_FCR_TRIGGER_MASK; up->fcr |= (unsigned char)rxtrig; serial_out(up, UART_FCR, up->fcr); return 0; } static int do_serial8250_set_rxtrig(struct tty_port *port, unsigned char bytes) { int ret; mutex_lock(&port->mutex); ret = do_set_rxtrig(port, bytes); mutex_unlock(&port->mutex); return ret; } static ssize_t rx_trig_bytes_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct tty_port *port = dev_get_drvdata(dev); unsigned char bytes; int ret; if (!count) return -EINVAL; ret = kstrtou8(buf, 10, &bytes); if (ret < 0) return ret; ret = do_serial8250_set_rxtrig(port, bytes); if (ret < 0) return ret; return count; } static DEVICE_ATTR_RW(rx_trig_bytes); static struct attribute *serial8250_dev_attrs[] = { &dev_attr_rx_trig_bytes.attr, NULL }; static struct attribute_group serial8250_dev_attr_group = { .attrs = serial8250_dev_attrs, }; static void register_dev_spec_attr_grp(struct uart_8250_port *up) { const struct serial8250_config *conf_type = &uart_config[up->port.type]; if (conf_type->rxtrig_bytes[0]) up->port.attr_group = &serial8250_dev_attr_group; } static void serial8250_config_port(struct uart_port *port, int flags) { struct uart_8250_port *up = up_to_u8250p(port); int ret; /* * Find the region that we can probe for. This in turn * tells us whether we can probe for the type of port. 
*/ ret = serial8250_request_std_resource(up); if (ret < 0) return; if (port->iotype != up->cur_iotype) set_io_from_upio(port); if (flags & UART_CONFIG_TYPE) autoconfig(up); /* HW bugs may trigger IRQ while IIR == NO_INT */ if (port->type == PORT_TEGRA) up->bugs |= UART_BUG_NOMSR; if (port->type != PORT_UNKNOWN && flags & UART_CONFIG_IRQ) autoconfig_irq(up); if (port->type == PORT_UNKNOWN) serial8250_release_std_resource(up); register_dev_spec_attr_grp(up); up->fcr = uart_config[up->port.type].fcr; } static int serial8250_verify_port(struct uart_port *port, struct serial_struct *ser) { if (ser->irq >= irq_get_nr_irqs() || ser->irq < 0 || ser->baud_base < 9600 || ser->type < PORT_UNKNOWN || ser->type >= ARRAY_SIZE(uart_config) || ser->type == PORT_CIRRUS || ser->type == PORT_STARTECH) return -EINVAL; return 0; } static const char *serial8250_type(struct uart_port *port) { int type = port->type; if (type >= ARRAY_SIZE(uart_config)) type = 0; return uart_config[type].name; } static const struct uart_ops serial8250_pops = { .tx_empty = serial8250_tx_empty, .set_mctrl = serial8250_set_mctrl, .get_mctrl = serial8250_get_mctrl, .stop_tx = serial8250_stop_tx, .start_tx = serial8250_start_tx, .throttle = serial8250_throttle, .unthrottle = serial8250_unthrottle, .stop_rx = serial8250_stop_rx, .enable_ms = serial8250_enable_ms, .break_ctl = serial8250_break_ctl, .startup = serial8250_startup, .shutdown = serial8250_shutdown, .flush_buffer = serial8250_flush_buffer, .set_termios = serial8250_set_termios, .set_ldisc = serial8250_set_ldisc, .pm = serial8250_pm, .type = serial8250_type, .release_port = serial8250_release_port, .request_port = serial8250_request_port, .config_port = serial8250_config_port, .verify_port = serial8250_verify_port, #ifdef CONFIG_CONSOLE_POLL .poll_get_char = serial8250_get_poll_char, .poll_put_char = serial8250_put_poll_char, #endif }; void serial8250_init_port(struct uart_8250_port *up) { struct uart_port *port = &up->port; spin_lock_init(&port->lock); port->ctrl_id = 0; port->pm = NULL; port->ops = &serial8250_pops; port->has_sysrq = IS_ENABLED(CONFIG_SERIAL_8250_CONSOLE); up->cur_iotype = UPIO_UNKNOWN; } EXPORT_SYMBOL_GPL(serial8250_init_port); void serial8250_set_defaults(struct uart_8250_port *up) { struct uart_port *port = &up->port; if (up->port.flags & UPF_FIXED_TYPE) { unsigned int type = up->port.type; if (!up->port.fifosize) up->port.fifosize = uart_config[type].fifo_size; if (!up->tx_loadsz) up->tx_loadsz = uart_config[type].tx_loadsz; if (!up->capabilities) up->capabilities = uart_config[type].flags; } set_io_from_upio(port); /* default dma handlers */ if (up->dma) { if (!up->dma->tx_dma) up->dma->tx_dma = serial8250_tx_dma; if (!up->dma->rx_dma) up->dma->rx_dma = serial8250_rx_dma; } } EXPORT_SYMBOL_GPL(serial8250_set_defaults); void serial8250_fifo_wait_for_lsr_thre(struct uart_8250_port *up, unsigned int count) { unsigned int i; for (i = 0; i < count; i++) { if (wait_for_lsr(up, UART_LSR_THRE)) return; } } EXPORT_SYMBOL_NS_GPL(serial8250_fifo_wait_for_lsr_thre, "SERIAL_8250"); #ifdef CONFIG_SERIAL_8250_CONSOLE static void serial8250_console_putchar(struct uart_port *port, unsigned char ch) { serial_port_out(port, UART_TX, ch); } static void serial8250_console_wait_putchar(struct uart_port *port, unsigned char ch) { struct uart_8250_port *up = up_to_u8250p(port); wait_for_xmitr(up, UART_LSR_THRE); serial8250_console_putchar(port, ch); } /* * Restore serial console when h/w power-off detected */ static void serial8250_console_restore(struct uart_8250_port *up) { 
struct uart_port *port = &up->port; struct ktermios termios; unsigned int baud, quot, frac = 0; termios.c_cflag = port->cons->cflag; termios.c_ispeed = port->cons->ispeed; termios.c_ospeed = port->cons->ospeed; if (port->state->port.tty && termios.c_cflag == 0) { termios.c_cflag = port->state->port.tty->termios.c_cflag; termios.c_ispeed = port->state->port.tty->termios.c_ispeed; termios.c_ospeed = port->state->port.tty->termios.c_ospeed; } baud = serial8250_get_baud_rate(port, &termios, NULL); quot = serial8250_get_divisor(port, baud, &frac); serial8250_set_divisor(port, baud, quot, frac); serial_port_out(port, UART_LCR, up->lcr); serial8250_out_MCR(up, up->mcr | UART_MCR_DTR | UART_MCR_RTS); } /* * Print a string to the serial port using the device FIFO * * It sends fifosize bytes and then waits for the fifo * to get empty. */ static void serial8250_console_fifo_write(struct uart_8250_port *up, const char *s, unsigned int count) { const char *end = s + count; unsigned int fifosize = up->tx_loadsz; struct uart_port *port = &up->port; unsigned int tx_count = 0; bool cr_sent = false; unsigned int i; while (s != end) { /* Allow timeout for each byte of a possibly full FIFO */ serial8250_fifo_wait_for_lsr_thre(up, fifosize); for (i = 0; i < fifosize && s != end; ++i) { if (*s == '\n' && !cr_sent) { serial8250_console_putchar(port, '\r'); cr_sent = true; } else { serial8250_console_putchar(port, *s++); cr_sent = false; } } tx_count = i; } /* * Allow timeout for each byte written since the caller will only wait * for UART_LSR_BOTH_EMPTY using the timeout of a single character */ serial8250_fifo_wait_for_lsr_thre(up, tx_count); } /* * Print a string to the serial port trying not to disturb * any possible real use of the port... * * The console_lock must be held when we get here. * * Doing runtime PM is really a bad idea for the kernel console. * Thus, we assume the function is called when device is powered up. */ void serial8250_console_write(struct uart_8250_port *up, const char *s, unsigned int count) { struct uart_8250_em485 *em485 = up->em485; struct uart_port *port = &up->port; unsigned long flags; unsigned int ier, use_fifo; int locked = 1; touch_nmi_watchdog(); if (oops_in_progress) locked = uart_port_trylock_irqsave(port, &flags); else uart_port_lock_irqsave(port, &flags); /* * First save the IER then disable the interrupts */ ier = serial_port_in(port, UART_IER); serial8250_clear_IER(up); /* check scratch reg to see if port powered off during system sleep */ if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { serial8250_console_restore(up); up->canary = 0; } if (em485) { if (em485->tx_stopped) up->rs485_start_tx(up, false); mdelay(port->rs485.delay_rts_before_send); } use_fifo = (up->capabilities & UART_CAP_FIFO) && /* * BCM283x requires to check the fifo * after each byte. */ !(up->capabilities & UART_CAP_MINI) && /* * tx_loadsz contains the transmit fifo size */ up->tx_loadsz > 1 && (up->fcr & UART_FCR_ENABLE_FIFO) && port->state && test_bit(TTY_PORT_INITIALIZED, &port->state->port.iflags) && /* * After we put a data in the fifo, the controller will send * it regardless of the CTS state. Therefore, only use fifo * if we don't use control flow. 
*/ !(up->port.flags & UPF_CONS_FLOW); if (likely(use_fifo)) serial8250_console_fifo_write(up, s, count); else uart_console_write(port, s, count, serial8250_console_wait_putchar); /* * Finally, wait for transmitter to become empty * and restore the IER */ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); if (em485) { mdelay(port->rs485.delay_rts_after_send); if (em485->tx_stopped) up->rs485_stop_tx(up, false); } serial_port_out(port, UART_IER, ier); /* * The receive handling will happen properly because the * receive ready bit will still be set; it is not cleared * on read. However, modem control will not, we must * call it if we have saved something in the saved flags * while processing with interrupts off. */ if (up->msr_saved_flags) serial8250_modem_status(up); if (locked) uart_port_unlock_irqrestore(port, flags); } static unsigned int probe_baud(struct uart_port *port) { unsigned char lcr, dll, dlm; unsigned int quot; lcr = serial_port_in(port, UART_LCR); serial_port_out(port, UART_LCR, lcr | UART_LCR_DLAB); dll = serial_port_in(port, UART_DLL); dlm = serial_port_in(port, UART_DLM); serial_port_out(port, UART_LCR, lcr); quot = (dlm << 8) | dll; return (port->uartclk / 16) / quot; } int serial8250_console_setup(struct uart_port *port, char *options, bool probe) { int baud = 9600; int bits = 8; int parity = 'n'; int flow = 'n'; int ret; if (!port->iobase && !port->membase) return -ENODEV; if (options) uart_parse_options(options, &baud, &parity, &bits, &flow); else if (probe) baud = probe_baud(port); ret = uart_set_options(port, port->cons, baud, parity, bits, flow); if (ret) return ret; if (port->dev) pm_runtime_get_sync(port->dev); return 0; } int serial8250_console_exit(struct uart_port *port) { if (port->dev) pm_runtime_put_sync(port->dev); return 0; } #endif /* CONFIG_SERIAL_8250_CONSOLE */ MODULE_DESCRIPTION("Base port operations for 8250/16550-type serial ports"); MODULE_LICENSE("GPL");
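/*
 * Purely illustrative sketch, not part of the driver above: the classic
 * 16550 divisor relationship that probe_baud() inverts and that
 * serial8250_do_get_divisor() relies on via uart_get_divisor(). The
 * helper name below is made up for illustration only.
 */
static inline unsigned int example_16550_divisor(unsigned int uartclk,
						 unsigned int baud)
{
	/* e.g. 1843200 / (16 * 9600) == 12, and 1843200 / 16 / 12 == 9600 */
	return uartclk / (16 * baud);
}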
// SPDX-License-Identifier: GPL-2.0 /* * Functions related to setting various queue properties from drivers */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/bio.h> #include <linux/blk-integrity.h> #include <linux/pagemap.h> #include <linux/backing-dev-defs.h> #include <linux/gcd.h> #include <linux/lcm.h> #include <linux/jiffies.h> #include <linux/gfp.h> #include <linux/dma-mapping.h> #include <linux/t10-pi.h> #include <linux/crc64.h> #include "blk.h" #include "blk-rq-qos.h" #include "blk-wbt.h" void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) { WRITE_ONCE(q->rq_timeout, timeout); } EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); /** * blk_set_stacking_limits - set default limits for stacking devices * @lim: the queue_limits structure to reset * * Prepare queue limits for applying limits from underlying devices using * blk_stack_limits(). */ void blk_set_stacking_limits(struct queue_limits *lim) { memset(lim, 0, sizeof(*lim)); lim->logical_block_size = SECTOR_SIZE; lim->physical_block_size = SECTOR_SIZE; lim->io_min = SECTOR_SIZE; lim->discard_granularity = SECTOR_SIZE; lim->dma_alignment = SECTOR_SIZE - 1; lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; /* Inherit limits from component devices */ lim->max_segments = USHRT_MAX; lim->max_discard_segments = USHRT_MAX; lim->max_hw_sectors = UINT_MAX; lim->max_segment_size = UINT_MAX; lim->max_sectors = UINT_MAX; lim->max_dev_sectors = UINT_MAX; lim->max_write_zeroes_sectors = UINT_MAX; lim->max_hw_wzeroes_unmap_sectors = UINT_MAX; lim->max_user_wzeroes_unmap_sectors = UINT_MAX; lim->max_hw_zone_append_sectors = UINT_MAX; lim->max_user_discard_sectors = UINT_MAX; lim->atomic_write_hw_max = UINT_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); void blk_apply_bdi_limits(struct backing_dev_info *bdi, struct queue_limits *lim) { u64 io_opt = lim->io_opt; /* * For read-ahead of large files to be effective, we need to read ahead * at least twice the optimal I/O size. For rotational devices that do * not report an optimal I/O size (e.g. ATA HDDs), use the maximum I/O * size to avoid falling back to the (rather inefficient) small default * read-ahead size. * * There is no hardware limitation for the read-ahead size and the user * might have increased the read-ahead size through sysfs, so don't ever * decrease it.
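 *
 * A worked example, assuming 4 KiB pages: a device reporting an optimal
 * I/O size of 512 KiB gets ra_pages raised to at least
 * (2 * 512 KiB) / 4 KiB = 256 pages, while the VM_READAHEAD_PAGES term
 * keeps the usual 128 KiB floor in any case.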
*/ if (!io_opt && (lim->features & BLK_FEAT_ROTATIONAL)) io_opt = (u64)lim->max_sectors << SECTOR_SHIFT; bdi->ra_pages = max3(bdi->ra_pages, io_opt * 2 >> PAGE_SHIFT, VM_READAHEAD_PAGES); bdi->io_pages = lim->max_sectors >> PAGE_SECTORS_SHIFT; } static int blk_validate_zoned_limits(struct queue_limits *lim) { if (!(lim->features & BLK_FEAT_ZONED)) { if (WARN_ON_ONCE(lim->max_open_zones) || WARN_ON_ONCE(lim->max_active_zones) || WARN_ON_ONCE(lim->zone_write_granularity) || WARN_ON_ONCE(lim->max_zone_append_sectors)) return -EINVAL; return 0; } if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED))) return -EINVAL; /* * Given that active zones include open zones, the maximum number of * open zones cannot be larger than the maximum number of active zones. */ if (lim->max_active_zones && lim->max_open_zones > lim->max_active_zones) return -EINVAL; if (lim->zone_write_granularity < lim->logical_block_size) lim->zone_write_granularity = lim->logical_block_size; /* * The Zone Append size is limited by the maximum I/O size and the zone * size given that it can't span zones. * * If no max_hw_zone_append_sectors limit is provided, the block layer * will emulated it, else we're also bound by the hardware limit. */ lim->max_zone_append_sectors = min_not_zero(lim->max_hw_zone_append_sectors, min(lim->chunk_sectors, lim->max_hw_sectors)); return 0; } static int blk_validate_integrity_limits(struct queue_limits *lim) { struct blk_integrity *bi = &lim->integrity; if (!bi->metadata_size) { if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE || bi->tag_size || ((bi->flags & BLK_INTEGRITY_REF_TAG))) { pr_warn("invalid PI settings.\n"); return -EINVAL; } bi->flags |= BLK_INTEGRITY_NOGENERATE | BLK_INTEGRITY_NOVERIFY; return 0; } if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) { pr_warn("integrity support disabled.\n"); return -EINVAL; } if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE && (bi->flags & BLK_INTEGRITY_REF_TAG)) { pr_warn("ref tag not support without checksum.\n"); return -EINVAL; } if (bi->pi_offset + bi->pi_tuple_size > bi->metadata_size) { pr_warn("pi_offset (%u) + pi_tuple_size (%u) exceeds metadata_size (%u)\n", bi->pi_offset, bi->pi_tuple_size, bi->metadata_size); return -EINVAL; } switch (bi->csum_type) { case BLK_INTEGRITY_CSUM_NONE: if (bi->pi_tuple_size) { pr_warn("pi_tuple_size must be 0 when checksum type is none\n"); return -EINVAL; } break; case BLK_INTEGRITY_CSUM_CRC: case BLK_INTEGRITY_CSUM_IP: if (bi->pi_tuple_size != sizeof(struct t10_pi_tuple)) { pr_warn("pi_tuple_size mismatch for T10 PI: expected %zu, got %u\n", sizeof(struct t10_pi_tuple), bi->pi_tuple_size); return -EINVAL; } break; case BLK_INTEGRITY_CSUM_CRC64: if (bi->pi_tuple_size != sizeof(struct crc64_pi_tuple)) { pr_warn("pi_tuple_size mismatch for CRC64 PI: expected %zu, got %u\n", sizeof(struct crc64_pi_tuple), bi->pi_tuple_size); return -EINVAL; } break; } if (!bi->interval_exp) { bi->interval_exp = ilog2(lim->logical_block_size); } else if (bi->interval_exp < SECTOR_SHIFT || bi->interval_exp > ilog2(lim->logical_block_size)) { pr_warn("invalid interval_exp %u\n", bi->interval_exp); return -EINVAL; } /* * Some IO controllers can not handle data intervals straddling * multiple bio_vecs. For those, enforce alignment so that those are * never generated, and that each buffer is aligned as expected. */ if (!(bi->flags & BLK_SPLIT_INTERVAL_CAPABLE) && bi->csum_type) { lim->dma_alignment = max(lim->dma_alignment, (1U << bi->interval_exp) - 1); } /* * The block layer automatically adds integrity data for bios that don't * already have it. 
Limit the I/O size so that a single maximum size * metadata segment can cover the integrity data for the entire I/O. */ lim->max_sectors = min(lim->max_sectors, max_integrity_io_size(lim) >> SECTOR_SHIFT); return 0; } /* * Returns max guaranteed bytes which we can fit in a bio. * * We request that an atomic_write is ITER_UBUF iov_iter (so a single vector), * so we assume that we can fit in at least PAGE_SIZE in a segment, apart from * the first and last segments. */ static unsigned int blk_queue_max_guaranteed_bio(struct queue_limits *lim) { unsigned int max_segments = min(BIO_MAX_VECS, lim->max_segments); unsigned int length; length = min(max_segments, 2) * lim->logical_block_size; if (max_segments > 2) length += (max_segments - 2) * PAGE_SIZE; return length; } static void blk_atomic_writes_update_limits(struct queue_limits *lim) { unsigned int unit_limit = min(lim->max_hw_sectors << SECTOR_SHIFT, blk_queue_max_guaranteed_bio(lim)); unit_limit = rounddown_pow_of_two(unit_limit); lim->atomic_write_max_sectors = min(lim->atomic_write_hw_max >> SECTOR_SHIFT, lim->max_hw_sectors); lim->atomic_write_unit_min = min(lim->atomic_write_hw_unit_min, unit_limit); lim->atomic_write_unit_max = min(lim->atomic_write_hw_unit_max, unit_limit); lim->atomic_write_boundary_sectors = lim->atomic_write_hw_boundary >> SECTOR_SHIFT; } /* * Test whether any boundary is aligned with any chunk size. Stacked * devices store any stripe size in t->chunk_sectors. */ static bool blk_valid_atomic_writes_boundary(unsigned int chunk_sectors, unsigned int boundary_sectors) { if (!chunk_sectors || !boundary_sectors) return true; if (boundary_sectors > chunk_sectors && boundary_sectors % chunk_sectors) return false; if (chunk_sectors > boundary_sectors && chunk_sectors % boundary_sectors) return false; return true; } static void blk_validate_atomic_write_limits(struct queue_limits *lim) { unsigned int boundary_sectors; unsigned int atomic_write_hw_max_sectors = lim->atomic_write_hw_max >> SECTOR_SHIFT; if (!(lim->features & BLK_FEAT_ATOMIC_WRITES)) goto unsupported; /* UINT_MAX indicates stacked limits in initial state */ if (lim->atomic_write_hw_max == UINT_MAX) goto unsupported; if (!lim->atomic_write_hw_max) goto unsupported; if (WARN_ON_ONCE(!is_power_of_2(lim->atomic_write_hw_unit_min))) goto unsupported; if (WARN_ON_ONCE(!is_power_of_2(lim->atomic_write_hw_unit_max))) goto unsupported; if (WARN_ON_ONCE(lim->atomic_write_hw_unit_min > lim->atomic_write_hw_unit_max)) goto unsupported; if (WARN_ON_ONCE(lim->atomic_write_hw_unit_max > lim->atomic_write_hw_max)) goto unsupported; if (WARN_ON_ONCE(lim->chunk_sectors && atomic_write_hw_max_sectors > lim->chunk_sectors)) goto unsupported; boundary_sectors = lim->atomic_write_hw_boundary >> SECTOR_SHIFT; if (boundary_sectors) { if (WARN_ON_ONCE(lim->atomic_write_hw_max > lim->atomic_write_hw_boundary)) goto unsupported; if (WARN_ON_ONCE(!blk_valid_atomic_writes_boundary( lim->chunk_sectors, boundary_sectors))) goto unsupported; /* * The boundary size just needs to be a multiple of unit_max * (and not necessarily a power-of-2), so this following check * could be relaxed in future. * Furthermore, if needed, unit_max could even be reduced so * that it is compliant with a !power-of-2 boundary. 
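 *
 * For instance, a boundary of 48K with unit_max = 16K already satisfies
 * the multiple-of-unit_max requirement (48K = 3 * 16K), yet it is still
 * rejected below today because 48K is not a power of two.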
*/ if (!is_power_of_2(boundary_sectors)) goto unsupported; } blk_atomic_writes_update_limits(lim); return; unsupported: lim->atomic_write_max_sectors = 0; lim->atomic_write_boundary_sectors = 0; lim->atomic_write_unit_min = 0; lim->atomic_write_unit_max = 0; } /* * Check that the limits in lim are valid, initialize defaults for unset * values, and cap values based on others where needed. */ int blk_validate_limits(struct queue_limits *lim) { unsigned int max_hw_sectors; unsigned int logical_block_sectors; unsigned long seg_size; int err; /* * Unless otherwise specified, default to 512 byte logical blocks and a * physical block size equal to the logical block size. */ if (!lim->logical_block_size) lim->logical_block_size = SECTOR_SIZE; else if (blk_validate_block_size(lim->logical_block_size)) { pr_warn("Invalid logical block size (%d)\n", lim->logical_block_size); return -EINVAL; } if (lim->physical_block_size < lim->logical_block_size) { lim->physical_block_size = lim->logical_block_size; } else if (!is_power_of_2(lim->physical_block_size)) { pr_warn("Invalid physical block size (%d)\n", lim->physical_block_size); return -EINVAL; } /* * The minimum I/O size defaults to the physical block size unless * explicitly overridden. */ if (lim->io_min < lim->physical_block_size) lim->io_min = lim->physical_block_size; /* * The optimal I/O size may not be aligned to physical block size * (because it may be limited by dma engines which have no clue about * block size of the disks attached to them), so we round it down here. */ lim->io_opt = round_down(lim->io_opt, lim->physical_block_size); /* * max_hw_sectors has a somewhat weird default for historical reason, * but driver really should set their own instead of relying on this * value. * * The block layer relies on the fact that every driver can * handle at lest a page worth of data per I/O, and needs the value * aligned to the logical block size. */ if (!lim->max_hw_sectors) lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; if (WARN_ON_ONCE(lim->max_hw_sectors < PAGE_SECTORS)) return -EINVAL; logical_block_sectors = lim->logical_block_size >> SECTOR_SHIFT; if (WARN_ON_ONCE(logical_block_sectors > lim->max_hw_sectors)) return -EINVAL; lim->max_hw_sectors = round_down(lim->max_hw_sectors, logical_block_sectors); /* * The actual max_sectors value is a complex beast and also takes the * max_dev_sectors value (set by SCSI ULPs) and a user configurable * value into account. The ->max_sectors value is always calculated * from these, so directly setting it won't have any effect. */ max_hw_sectors = min_not_zero(lim->max_hw_sectors, lim->max_dev_sectors); if (lim->max_user_sectors) { if (lim->max_user_sectors < BLK_MIN_SEGMENT_SIZE / SECTOR_SIZE) return -EINVAL; lim->max_sectors = min(max_hw_sectors, lim->max_user_sectors); } else if (lim->io_opt > (BLK_DEF_MAX_SECTORS_CAP << SECTOR_SHIFT)) { lim->max_sectors = min(max_hw_sectors, lim->io_opt >> SECTOR_SHIFT); } else if (lim->io_min > (BLK_DEF_MAX_SECTORS_CAP << SECTOR_SHIFT)) { lim->max_sectors = min(max_hw_sectors, lim->io_min >> SECTOR_SHIFT); } else { lim->max_sectors = min(max_hw_sectors, BLK_DEF_MAX_SECTORS_CAP); } lim->max_sectors = round_down(lim->max_sectors, logical_block_sectors); /* * Random default for the maximum number of segments. Driver should not * rely on this and set their own. 
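 *
 * A driver with real scatter/gather constraints should therefore set its
 * own value, e.g. (illustrative only) filling lim.max_segments = 17 in
 * the queue_limits it hands to blk_mq_alloc_disk(), instead of relying
 * on this catch-all default.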
*/ if (!lim->max_segments) lim->max_segments = BLK_MAX_SEGMENTS; if (lim->max_hw_wzeroes_unmap_sectors && lim->max_hw_wzeroes_unmap_sectors != lim->max_write_zeroes_sectors) return -EINVAL; lim->max_wzeroes_unmap_sectors = min(lim->max_hw_wzeroes_unmap_sectors, lim->max_user_wzeroes_unmap_sectors); lim->max_discard_sectors = min(lim->max_hw_discard_sectors, lim->max_user_discard_sectors); /* * When discard is not supported, discard_granularity should be reported * as 0 to userspace. */ if (lim->max_discard_sectors) lim->discard_granularity = max(lim->discard_granularity, lim->physical_block_size); else lim->discard_granularity = 0; if (!lim->max_discard_segments) lim->max_discard_segments = 1; /* * By default there is no limit on the segment boundary alignment, * but if there is one it can't be smaller than the page size as * that would break all the normal I/O patterns. */ if (!lim->seg_boundary_mask) lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; if (WARN_ON_ONCE(lim->seg_boundary_mask < BLK_MIN_SEGMENT_SIZE - 1)) return -EINVAL; /* * Stacking device may have both virtual boundary and max segment * size limit, so allow this setting now, and long-term the two * might need to move out of stacking limits since we have immutable * bvec and lower layer bio splitting is supposed to handle the two * correctly. */ if (lim->virt_boundary_mask) { if (!lim->max_segment_size) lim->max_segment_size = UINT_MAX; } else { /* * The maximum segment size has an odd historic 64k default that * drivers probably should override. Just like the I/O size we * require drivers to at least handle a full page per segment. */ if (!lim->max_segment_size) lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; if (WARN_ON_ONCE(lim->max_segment_size < BLK_MIN_SEGMENT_SIZE)) return -EINVAL; } /* setup max segment size for building new segment in fast path */ if (lim->seg_boundary_mask > lim->max_segment_size - 1) seg_size = lim->max_segment_size; else seg_size = lim->seg_boundary_mask + 1; lim->max_fast_segment_size = min_t(unsigned int, seg_size, PAGE_SIZE); /* * We require drivers to at least do logical block aligned I/O, but * historically could not check for that due to the separate calls * to set the limits. Once the transition is finished the check * below should be narrowed down to check the logical block size. */ if (!lim->dma_alignment) lim->dma_alignment = SECTOR_SIZE - 1; if (WARN_ON_ONCE(lim->dma_alignment > PAGE_SIZE)) return -EINVAL; if (lim->alignment_offset) { lim->alignment_offset &= (lim->physical_block_size - 1); lim->flags &= ~BLK_FLAG_MISALIGNED; } if (!(lim->features & BLK_FEAT_WRITE_CACHE)) lim->features &= ~BLK_FEAT_FUA; blk_validate_atomic_write_limits(lim); err = blk_validate_integrity_limits(lim); if (err) return err; return blk_validate_zoned_limits(lim); } EXPORT_SYMBOL_GPL(blk_validate_limits); /* * Set the default limits for a newly allocated queue. @lim contains the * initial limits set by the driver, which could be no limit in which case * all fields are cleared to zero. */ int blk_set_default_limits(struct queue_limits *lim) { /* * Most defaults are set by capping the bounds in blk_validate_limits, * but these limits are special and need an explicit initialization to * the max value here. 
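 *
 * UINT_MAX here means "no user-imposed cap": if max_user_discard_sectors
 * were left at 0, blk_validate_limits() would end up with
 * max_discard_sectors = min(hw, 0) = 0 and discard would look unsupported
 * even on capable hardware (likewise for the write-zeroes unmap limit).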
*/ lim->max_user_discard_sectors = UINT_MAX; lim->max_user_wzeroes_unmap_sectors = UINT_MAX; return blk_validate_limits(lim); } /** * queue_limits_commit_update - commit an atomic update of queue limits * @q: queue to update * @lim: limits to apply * * Apply the limits in @lim that were obtained from queue_limits_start_update() * and updated by the caller to @q. The caller must have frozen the queue or * ensure that there are no outstanding I/Os by other means. * * Returns 0 if successful, else a negative error code. */ int queue_limits_commit_update(struct request_queue *q, struct queue_limits *lim) { int error; lockdep_assert_held(&q->limits_lock); error = blk_validate_limits(lim); if (error) goto out_unlock; #ifdef CONFIG_BLK_INLINE_ENCRYPTION if (q->crypto_profile && lim->integrity.tag_size) { pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together.\n"); error = -EINVAL; goto out_unlock; } #endif q->limits = *lim; if (q->disk) blk_apply_bdi_limits(q->disk->bdi, lim); out_unlock: mutex_unlock(&q->limits_lock); return error; } EXPORT_SYMBOL_GPL(queue_limits_commit_update); /** * queue_limits_commit_update_frozen - commit an atomic update of queue limits * @q: queue to update * @lim: limits to apply * * Apply the limits in @lim that were obtained from queue_limits_start_update() * and updated with the new values by the caller to @q. Freezes the queue * before the update and unfreezes it after. * * Returns 0 if successful, else a negative error code. */ int queue_limits_commit_update_frozen(struct request_queue *q, struct queue_limits *lim) { unsigned int memflags; int ret; memflags = blk_mq_freeze_queue(q); ret = queue_limits_commit_update(q, lim); blk_mq_unfreeze_queue(q, memflags); return ret; } EXPORT_SYMBOL_GPL(queue_limits_commit_update_frozen); /** * queue_limits_set - apply queue limits to queue * @q: queue to update * @lim: limits to apply * * Apply the limits in @lim that were freshly initialized to @q. * To update existing limits use queue_limits_start_update() and * queue_limits_commit_update() instead. * * Returns 0 if successful, else a negative error code. */ int queue_limits_set(struct request_queue *q, struct queue_limits *lim) { mutex_lock(&q->limits_lock); return queue_limits_commit_update(q, lim); } EXPORT_SYMBOL_GPL(queue_limits_set); static int queue_limit_alignment_offset(const struct queue_limits *lim, sector_t sector) { unsigned int granularity = max(lim->physical_block_size, lim->io_min); unsigned int alignment = sector_div(sector, granularity >> SECTOR_SHIFT) << SECTOR_SHIFT; return (granularity + lim->alignment_offset - alignment) % granularity; } static unsigned int queue_limit_discard_alignment( const struct queue_limits *lim, sector_t sector) { unsigned int alignment, granularity, offset; if (!lim->max_discard_sectors) return 0; /* Why are these in bytes, not sectors? */ alignment = lim->discard_alignment >> SECTOR_SHIFT; granularity = lim->discard_granularity >> SECTOR_SHIFT; /* Offset of the partition start in 'granularity' sectors */ offset = sector_div(sector, granularity); /* And why do we do this modulus *again* in blkdev_issue_discard()? 
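 *
 * Worked example with illustrative numbers: discard_granularity = 4 KiB
 * (8 sectors), discard_alignment = 0, and a partition starting 5 sectors
 * into a granule: the remainder computed above is 5, so the line below
 * yields (8 + 0 - 5) % 8 = 3 sectors, i.e. 1536 bytes to the next
 * boundary.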
*/ offset = (granularity + alignment - offset) % granularity; /* Turn it back into bytes, gaah */ return offset << SECTOR_SHIFT; } static unsigned int blk_round_down_sectors(unsigned int sectors, unsigned int lbs) { sectors = round_down(sectors, lbs >> SECTOR_SHIFT); if (sectors < PAGE_SIZE >> SECTOR_SHIFT) sectors = PAGE_SIZE >> SECTOR_SHIFT; return sectors; } /* Check if second and later bottom devices are compliant */ static bool blk_stack_atomic_writes_tail(struct queue_limits *t, struct queue_limits *b) { /* We're not going to support different boundary sizes.. yet */ if (t->atomic_write_hw_boundary != b->atomic_write_hw_boundary) return false; /* Can't support this */ if (t->atomic_write_hw_unit_min > b->atomic_write_hw_unit_max) return false; /* Or this */ if (t->atomic_write_hw_unit_max < b->atomic_write_hw_unit_min) return false; t->atomic_write_hw_max = min(t->atomic_write_hw_max, b->atomic_write_hw_max); t->atomic_write_hw_unit_min = max(t->atomic_write_hw_unit_min, b->atomic_write_hw_unit_min); t->atomic_write_hw_unit_max = min(t->atomic_write_hw_unit_max, b->atomic_write_hw_unit_max); return true; } static void blk_stack_atomic_writes_chunk_sectors(struct queue_limits *t) { unsigned int chunk_bytes; if (!t->chunk_sectors) return; /* * If chunk sectors is so large that its value in bytes overflows * UINT_MAX, then just shift it down so it definitely will fit. * We don't support atomic writes of such a large size anyway. */ if (check_shl_overflow(t->chunk_sectors, SECTOR_SHIFT, &chunk_bytes)) chunk_bytes = t->chunk_sectors; /* * Find values for limits which work for chunk size. * b->atomic_write_hw_unit_{min, max} may not be aligned with chunk * size, as the chunk size is not restricted to a power-of-2. * So we need to find highest power-of-2 which works for the chunk * size. * As an example scenario, we could have t->unit_max = 16K and * t->chunk_sectors = 24KB. For this case, reduce t->unit_max to a * value aligned with both limits, i.e. 8K in this example. 
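 * Concretely: 24 KiB is 24576 = 2^13 * 3, so its largest power-of-2 factor
 * is 2^13 = 8 KiB, which is what max_pow_of_two_factor() returns for
 * chunk_bytes and hence the new cap on unit_max.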
*/ t->atomic_write_hw_unit_max = min(t->atomic_write_hw_unit_max, max_pow_of_two_factor(chunk_bytes)); t->atomic_write_hw_unit_min = min(t->atomic_write_hw_unit_min, t->atomic_write_hw_unit_max); t->atomic_write_hw_max = min(t->atomic_write_hw_max, chunk_bytes); } /* Check stacking of first bottom device */ static bool blk_stack_atomic_writes_head(struct queue_limits *t, struct queue_limits *b) { if (!blk_valid_atomic_writes_boundary(t->chunk_sectors, b->atomic_write_hw_boundary >> SECTOR_SHIFT)) return false; t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max; t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min; t->atomic_write_hw_max = b->atomic_write_hw_max; t->atomic_write_hw_boundary = b->atomic_write_hw_boundary; return true; } static void blk_stack_atomic_writes_limits(struct queue_limits *t, struct queue_limits *b, sector_t start) { if (!(b->features & BLK_FEAT_ATOMIC_WRITES)) goto unsupported; if (!b->atomic_write_hw_unit_min) goto unsupported; if (!blk_atomic_write_start_sect_aligned(start, b)) goto unsupported; /* UINT_MAX indicates no stacking of bottom devices yet */ if (t->atomic_write_hw_max == UINT_MAX) { if (!blk_stack_atomic_writes_head(t, b)) goto unsupported; } else { if (!blk_stack_atomic_writes_tail(t, b)) goto unsupported; } blk_stack_atomic_writes_chunk_sectors(t); return; unsupported: t->atomic_write_hw_max = 0; t->atomic_write_hw_unit_max = 0; t->atomic_write_hw_unit_min = 0; t->atomic_write_hw_boundary = 0; } /** * blk_stack_limits - adjust queue_limits for stacked devices * @t: the stacking driver limits (top device) * @b: the underlying queue limits (bottom, component device) * @start: first data sector within component device * * Description: * This function is used by stacking drivers like MD and DM to ensure * that all component devices have compatible block sizes and * alignments. The stacking driver must provide a queue_limits * struct (top) and then iteratively call the stacking function for * all component (bottom) devices. The stacking function will * attempt to combine the values and ensure proper alignment. * * Returns 0 if the top and bottom queue_limits are compatible. The * top device's block sizes and alignment offsets may be adjusted to * ensure alignment with the bottom device. If no compatible sizes * and alignments exist, -1 is returned and the resulting top * queue_limits will have the misaligned flag set to indicate that * the alignment_offset is undefined. */ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t start) { unsigned int top, bottom, alignment; int ret = 0; t->features |= (b->features & BLK_FEAT_INHERIT_MASK); /* * Some feaures need to be supported both by the stacking driver and all * underlying devices. The stacking driver sets these flags before * stacking the limits, and this will clear the flags if any of the * underlying devices does not support it. 
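 * (At present the flags handled this way are BLK_FEAT_NOWAIT and
 * BLK_FEAT_POLL, cleared right below when a bottom device lacks them.)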
*/ if (!(b->features & BLK_FEAT_NOWAIT)) t->features &= ~BLK_FEAT_NOWAIT; if (!(b->features & BLK_FEAT_POLL)) t->features &= ~BLK_FEAT_POLL; t->flags |= (b->flags & BLK_FLAG_MISALIGNED); t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_user_sectors = min_not_zero(t->max_user_sectors, b->max_user_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, b->max_write_zeroes_sectors); t->max_user_wzeroes_unmap_sectors = min(t->max_user_wzeroes_unmap_sectors, b->max_user_wzeroes_unmap_sectors); t->max_hw_wzeroes_unmap_sectors = min(t->max_hw_wzeroes_unmap_sectors, b->max_hw_wzeroes_unmap_sectors); t->max_hw_zone_append_sectors = min(t->max_hw_zone_append_sectors, b->max_hw_zone_append_sectors); t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask); t->virt_boundary_mask = min_not_zero(t->virt_boundary_mask, b->virt_boundary_mask); t->max_segments = min_not_zero(t->max_segments, b->max_segments); t->max_discard_segments = min_not_zero(t->max_discard_segments, b->max_discard_segments); t->max_integrity_segments = min_not_zero(t->max_integrity_segments, b->max_integrity_segments); t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); alignment = queue_limit_alignment_offset(b, start); /* Bottom device has different alignment. Check that it is * compatible with the current top alignment. */ if (t->alignment_offset != alignment) { top = max(t->physical_block_size, t->io_min) + t->alignment_offset; bottom = max(b->physical_block_size, b->io_min) + alignment; /* Verify that top and bottom intervals line up */ if (max(top, bottom) % min(top, bottom)) { t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } } t->logical_block_size = max(t->logical_block_size, b->logical_block_size); t->physical_block_size = max(t->physical_block_size, b->physical_block_size); t->io_min = max(t->io_min, b->io_min); t->io_opt = lcm_not_zero(t->io_opt, b->io_opt); t->dma_alignment = max(t->dma_alignment, b->dma_alignment); /* Set non-power-of-2 compatible chunk_sectors boundary */ if (b->chunk_sectors) t->chunk_sectors = gcd(t->chunk_sectors, b->chunk_sectors); /* Physical block size a multiple of the logical block size? */ if (t->physical_block_size & (t->logical_block_size - 1)) { t->physical_block_size = t->logical_block_size; t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } /* Minimum I/O a multiple of the physical block size? */ if (t->io_min & (t->physical_block_size - 1)) { t->io_min = t->physical_block_size; t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } /* Optimal I/O a multiple of the physical block size? */ if (t->io_opt & (t->physical_block_size - 1)) { t->io_opt = 0; t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } /* chunk_sectors a multiple of the physical block size? 
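 * (chunk_sectors is in 512-byte sectors while physical_block_size is in
 * bytes, hence the shift in the check below.)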
*/ if (t->chunk_sectors % (t->physical_block_size >> SECTOR_SHIFT)) { t->chunk_sectors = 0; t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } /* Find lowest common alignment_offset */ t->alignment_offset = lcm_not_zero(t->alignment_offset, alignment) % max(t->physical_block_size, t->io_min); /* Verify that new alignment_offset is on a logical block boundary */ if (t->alignment_offset & (t->logical_block_size - 1)) { t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } t->max_sectors = blk_round_down_sectors(t->max_sectors, t->logical_block_size); t->max_hw_sectors = blk_round_down_sectors(t->max_hw_sectors, t->logical_block_size); t->max_dev_sectors = blk_round_down_sectors(t->max_dev_sectors, t->logical_block_size); /* Discard alignment and granularity */ if (b->discard_granularity) { alignment = queue_limit_discard_alignment(b, start); t->max_discard_sectors = min_not_zero(t->max_discard_sectors, b->max_discard_sectors); t->max_hw_discard_sectors = min_not_zero(t->max_hw_discard_sectors, b->max_hw_discard_sectors); t->discard_granularity = max(t->discard_granularity, b->discard_granularity); t->discard_alignment = lcm_not_zero(t->discard_alignment, alignment) % t->discard_granularity; } t->max_secure_erase_sectors = min_not_zero(t->max_secure_erase_sectors, b->max_secure_erase_sectors); t->zone_write_granularity = max(t->zone_write_granularity, b->zone_write_granularity); if (!(t->features & BLK_FEAT_ZONED)) { t->zone_write_granularity = 0; t->max_zone_append_sectors = 0; } blk_stack_atomic_writes_limits(t, b, start); return ret; } EXPORT_SYMBOL(blk_stack_limits); /** * queue_limits_stack_bdev - adjust queue_limits for stacked devices * @t: the stacking driver limits (top device) * @bdev: the underlying block device (bottom) * @offset: offset to beginning of data within component device * @pfx: prefix to use for warnings logged * * Description: * This function is used by stacking drivers like MD and DM to ensure * that all component devices have compatible block sizes and * alignments. The stacking driver must provide a queue_limits * struct (top) and then iteratively call the stacking function for * all component (bottom) devices. The stacking function will * attempt to combine the values and ensure proper alignment. */ void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, sector_t offset, const char *pfx) { if (blk_stack_limits(t, bdev_limits(bdev), get_start_sect(bdev) + offset)) pr_notice("%s: Warning: Device %pg is misaligned\n", pfx, bdev); } EXPORT_SYMBOL_GPL(queue_limits_stack_bdev); /** * queue_limits_stack_integrity - stack integrity profile * @t: target queue limits * @b: base queue limits * * Check if the integrity profile in the @b can be stacked into the * target @t. Stacking is possible if either: * * a) does not have any integrity information stacked into it yet * b) the integrity profile in @b is identical to the one in @t * * If @b can be stacked into @t, return %true. Else return %false and clear the * integrity information in @t. 
*/ bool queue_limits_stack_integrity(struct queue_limits *t, struct queue_limits *b) { struct blk_integrity *ti = &t->integrity; struct blk_integrity *bi = &b->integrity; if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) return true; if (ti->flags & BLK_INTEGRITY_STACKED) { if (ti->metadata_size != bi->metadata_size) goto incompatible; if (ti->interval_exp != bi->interval_exp) goto incompatible; if (ti->tag_size != bi->tag_size) goto incompatible; if (ti->csum_type != bi->csum_type) goto incompatible; if (ti->pi_tuple_size != bi->pi_tuple_size) goto incompatible; if ((ti->flags & BLK_INTEGRITY_REF_TAG) != (bi->flags & BLK_INTEGRITY_REF_TAG)) goto incompatible; if ((ti->flags & BLK_SPLIT_INTERVAL_CAPABLE) && !(bi->flags & BLK_SPLIT_INTERVAL_CAPABLE)) ti->flags &= ~BLK_SPLIT_INTERVAL_CAPABLE; } else { ti->flags = BLK_INTEGRITY_STACKED; ti->flags |= (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) | (bi->flags & BLK_INTEGRITY_REF_TAG) | (bi->flags & BLK_SPLIT_INTERVAL_CAPABLE); ti->csum_type = bi->csum_type; ti->pi_tuple_size = bi->pi_tuple_size; ti->metadata_size = bi->metadata_size; ti->pi_offset = bi->pi_offset; ti->interval_exp = bi->interval_exp; ti->tag_size = bi->tag_size; } return true; incompatible: memset(ti, 0, sizeof(*ti)); return false; } EXPORT_SYMBOL_GPL(queue_limits_stack_integrity); /** * blk_set_queue_depth - tell the block layer about the device queue depth * @q: the request queue for the device * @depth: queue depth * */ void blk_set_queue_depth(struct request_queue *q, unsigned int depth) { q->queue_depth = depth; rq_qos_queue_depth_changed(q); } EXPORT_SYMBOL(blk_set_queue_depth); int bdev_alignment_offset(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); if (q->limits.flags & BLK_FLAG_MISALIGNED) return -1; if (bdev_is_partition(bdev)) return queue_limit_alignment_offset(&q->limits, bdev->bd_start_sect); return q->limits.alignment_offset; } EXPORT_SYMBOL_GPL(bdev_alignment_offset); unsigned int bdev_discard_alignment(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); if (bdev_is_partition(bdev)) return queue_limit_discard_alignment(&q->limits, bdev->bd_start_sect); return q->limits.discard_alignment; } EXPORT_SYMBOL_GPL(bdev_discard_alignment); |
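/*
 * Illustrative sketch (not part of blk-settings.c): the typical pattern a
 * driver follows to update limits atomically with the helpers above.  The
 * function name and the new max_hw_sectors value are made up for the
 * example; queue_limits_start_update() and
 * queue_limits_commit_update_frozen() are the interfaces documented above.
 */
static int example_shrink_max_sectors(struct request_queue *q)
{
	struct queue_limits lim;

	/* Take limits_lock and get a snapshot of the current limits. */
	lim = queue_limits_start_update(q);
	lim.max_hw_sectors = 1024;	/* arbitrary example value, in sectors */

	/*
	 * Freeze the queue, validate and apply the new limits, unfreeze,
	 * and drop limits_lock.  Returns a negative errno if validation
	 * fails, in which case the old limits stay in effect.
	 */
	return queue_limits_commit_update_frozen(q, &lim);
}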
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_UACCESS_64_H
#define _ASM_X86_UACCESS_64_H

/*
 * User space memory access functions
 */
#include <linux/compiler.h>
#include <linux/lockdep.h>
#include <linux/kasan-checks.h>
#include <asm/alternative.h>
#include <asm/cpufeatures.h>
#include <asm/page.h>
#include <asm/percpu.h>

#ifdef MODULE
#define runtime_const_ptr(sym) (sym)
#else
#include <asm/runtime-const.h>
#endif

extern unsigned long USER_PTR_MAX;

#ifdef CONFIG_ADDRESS_MASKING
/*
 * Mask out tag bits from the address.
 */
static inline unsigned long __untagged_addr(unsigned long addr)
{
	asm_inline (ALTERNATIVE("", "and " __percpu_arg([mask]) ", %[addr]",
				X86_FEATURE_LAM)
		    : [addr] "+r" (addr)
		    : [mask] "m" (__my_cpu_var(tlbstate_untag_mask)));

	return addr;
}

#define untagged_addr(addr)	({					\
	unsigned long __addr = (__force unsigned long)(addr);		\
	(__force __typeof__(addr))__untagged_addr(__addr);		\
})

static inline unsigned long __untagged_addr_remote(struct mm_struct *mm,
						   unsigned long addr)
{
	mmap_assert_locked(mm);
	return addr & (mm)->context.untag_mask;
}

#define untagged_addr_remote(mm, addr)	({				\
	unsigned long __addr = (__force unsigned long)(addr);		\
	(__force __typeof__(addr))__untagged_addr_remote(mm, __addr);	\
})

#endif

#define valid_user_address(x) \
	likely((__force unsigned long)(x) <= runtime_const_ptr(USER_PTR_MAX))

/*
 * Masking the user address is an alternative to a conditional
 * user_access_begin that can avoid the fencing. This only works
 * for dense accesses starting at the address.
 */
static inline void __user *mask_user_address(const void __user *ptr)
{
	void __user *ret;
	asm("cmp %1,%0\n\t"
	    "cmova %1,%0"
		:"=r" (ret)
		:"r" (runtime_const_ptr(USER_PTR_MAX)),
		 "0" (ptr));
	return ret;
}
#define masked_user_access_begin(x) ({				\
	auto __masked_ptr = (x);				\
	__masked_ptr = mask_user_address(__masked_ptr);		\
	__uaccess_begin(); __masked_ptr; })

/*
 * User pointers can have tag bits on x86-64. This scheme tolerates
 * arbitrary values in those bits rather than masking them off.
 *
 * Enforce two rules:
 * 1. 'ptr' must be in the user part of the address space
 * 2. 'ptr+size' must not overflow into kernel addresses
 *
 * Note that we always have at least one guard page between the
 * max user address and the non-canonical gap, allowing us to
 * ignore small sizes entirely.
 *
 * In fact, we could probably remove the size check entirely, since
 * any kernel accesses will be in increasing address order starting
 * at 'ptr'.
 *
 * That's a separate optimization, for now just handle the small
 * constant case.
 */
static inline bool __access_ok(const void __user *ptr, unsigned long size)
{
	if (__builtin_constant_p(size <= PAGE_SIZE) && size <= PAGE_SIZE) {
		return valid_user_address(ptr);
	} else {
		unsigned long sum = size + (__force unsigned long)ptr;

		return valid_user_address(sum) && sum >= (__force unsigned long)ptr;
	}
}
#define __access_ok __access_ok

/*
 * Copy To/From Userspace
 */

/* Handles exceptions in both to and from, but doesn't do access_ok */
__must_check unsigned long
rep_movs_alternative(void *to, const void *from, unsigned len);

static __always_inline __must_check unsigned long
copy_user_generic(void *to, const void *from, unsigned long len)
{
	stac();
	/*
	 * If CPU has FSRM feature, use 'rep movs'.
	 * Otherwise, use rep_movs_alternative.
	 */
	asm volatile(
		"1:\n\t"
		ALTERNATIVE("rep movsb",
			    "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM))
		"2:\n"
		_ASM_EXTABLE_UA(1b, 2b)
		:"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
		: : "memory", "rax");
	clac();
	return len;
}

static __always_inline __must_check unsigned long
raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
{
	return copy_user_generic(dst, (__force void *)src, size);
}

static __always_inline __must_check unsigned long
raw_copy_to_user(void __user *dst, const void *src, unsigned long size)
{
	return copy_user_generic((__force void *)dst, src, size);
}

#define copy_to_nontemporal copy_to_nontemporal
extern size_t copy_to_nontemporal(void *dst, const void *src, size_t size);
extern size_t copy_user_flushcache(void *dst, const void __user *src, size_t size);

static inline int
copy_from_user_inatomic_nontemporal(void *dst, const void __user *src,
				    unsigned size)
{
	long ret;

	kasan_check_write(dst, size);

	src = mask_user_address(src);
	stac();
	ret = copy_to_nontemporal(dst, (__force const void *)src, size);
	clac();
	return ret;
}

static inline size_t
copy_from_user_flushcache(void *dst, const void __user *src, size_t size)
{
	kasan_check_write(dst, size);
	return copy_user_flushcache(dst, src, size);
}

/*
 * Zero Userspace.
 */
__must_check unsigned long
rep_stos_alternative(void __user *addr, unsigned long len);

static __always_inline __must_check unsigned long
__clear_user(void __user *addr, unsigned long size)
{
	might_fault();
	stac();

	/*
	 * No memory constraint because it doesn't change any memory gcc
	 * knows about.
	 */
	asm volatile(
		"1:\n\t"
		ALTERNATIVE("rep stosb",
			    "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS))
		"2:\n"
		_ASM_EXTABLE_UA(1b, 2b)
		: "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
		: "a" (0));

	clac();

	return size;
}

static __always_inline unsigned long clear_user(void __user *to, unsigned long n)
{
	if (__access_ok(to, n))
		return __clear_user(to, n);
	return n;
}

#endif /* _ASM_X86_UACCESS_64_H */
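/*
 * Illustrative sketch (not part of this header): how a caller typically
 * consumes masked_user_access_begin() as defined above.  The function name
 * is hypothetical; can_do_masked_user_access(), user_read_access_begin()
 * and unsafe_get_user() are the generic uaccess helpers from
 * <linux/uaccess.h>.
 */
static inline int example_get_user_word(unsigned long __user *uptr,
					unsigned long *val)
{
	if (can_do_masked_user_access())
		uptr = masked_user_access_begin(uptr);
	else if (!user_read_access_begin(uptr, sizeof(*uptr)))
		return -EFAULT;

	unsafe_get_user(*val, uptr, Efault);
	user_read_access_end();
	return 0;
Efault:
	user_read_access_end();
	return -EFAULT;
}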
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/super.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/inode.c * * Copyright (C) 1991, 1992 Linus Torvalds * * Big-endian to little-endian byte-swapping/bitmaps by * David S.
Miller (davem@caip.rutgers.edu), 1995 */ #include <linux/module.h> #include <linux/string.h> #include <linux/fs.h> #include <linux/time.h> #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/parser.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/vfs.h> #include <linux/random.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/quotaops.h> #include <linux/seq_file.h> #include <linux/ctype.h> #include <linux/log2.h> #include <linux/crc16.h> #include <linux/dax.h> #include <linux/uaccess.h> #include <linux/iversion.h> #include <linux/unicode.h> #include <linux/part_stat.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/fsnotify.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/fserror.h> #include "ext4.h" #include "ext4_extents.h" /* Needed for trace points definition */ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "mballoc.h" #include "fsmap.h" #define CREATE_TRACE_POINTS #include <trace/events/ext4.h> static struct ext4_lazy_init *ext4_li_info; static DEFINE_MUTEX(ext4_li_mtx); static struct ratelimit_state ext4_mount_msg_ratelimit; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); static int ext4_show_options(struct seq_file *seq, struct dentry *root); static void ext4_update_super(struct super_block *sb); static int ext4_commit_super(struct super_block *sb); static int ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es); static int ext4_clear_journal_err(struct super_block *sb, struct ext4_super_block *es); static int ext4_sync_fs(struct super_block *sb, int wait); static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); static int ext4_unfreeze(struct super_block *sb); static int ext4_freeze(struct super_block *sb); static inline int ext2_feature_set_ok(struct super_block *sb); static inline int ext3_feature_set_ok(struct super_block *sb); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); static struct inode *ext4_get_journal_inode(struct super_block *sb, unsigned int journal_inum); static int ext4_validate_options(struct fs_context *fc); static int ext4_check_opt_consistency(struct fs_context *fc, struct super_block *sb); static void ext4_apply_options(struct fs_context *fc, struct super_block *sb); static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param); static int ext4_get_tree(struct fs_context *fc); static int ext4_reconfigure(struct fs_context *fc); static void ext4_fc_free(struct fs_context *fc); static int ext4_init_fs_context(struct fs_context *fc); static void ext4_kill_sb(struct super_block *sb); static const struct fs_parameter_spec ext4_param_specs[]; /* * Lock ordering * * page fault path: * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start * -> page lock -> i_data_sem (rw) * * buffered write path: * sb_start_write -> i_mutex -> mmap_lock * sb_start_write -> i_mutex -> transaction start -> page lock -> * i_data_sem (rw) * * truncate: * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) -> * page lock * sb_start_write -> i_mutex -> invalidate_lock (w) -> transaction start -> * i_data_sem (rw) * * direct IO: * sb_start_write -> i_mutex -> mmap_lock * sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw) * * writepages: * transaction 
start -> page lock(s) -> i_data_sem (rw) */ static const struct fs_context_operations ext4_context_ops = { .parse_param = ext4_parse_param, .get_tree = ext4_get_tree, .reconfigure = ext4_reconfigure, .free = ext4_fc_free, }; #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static struct file_system_type ext2_fs_type = { .owner = THIS_MODULE, .name = "ext2", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext2"); MODULE_ALIAS("ext2"); #define IS_EXT2_SB(sb) ((sb)->s_type == &ext2_fs_type) #else #define IS_EXT2_SB(sb) (0) #endif static struct file_system_type ext3_fs_type = { .owner = THIS_MODULE, .name = "ext3", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext3"); MODULE_ALIAS("ext3"); #define IS_EXT3_SB(sb) ((sb)->s_type == &ext3_fs_type) static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io, bool simu_fail) { if (simu_fail) { clear_buffer_uptodate(bh); unlock_buffer(bh); return; } /* * buffer's verified bit is no longer valid after reading from * disk again due to write out error, clear it to make sure we * recheck the buffer contents. */ clear_buffer_verified(bh); bh->b_end_io = end_io ? end_io : end_buffer_read_sync; get_bh(bh); submit_bh(REQ_OP_READ | op_flags, bh); } void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io, bool simu_fail) { BUG_ON(!buffer_locked(bh)); if (ext4_buffer_uptodate(bh)) { unlock_buffer(bh); return; } __ext4_read_bh(bh, op_flags, end_io, simu_fail); } int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io, bool simu_fail) { BUG_ON(!buffer_locked(bh)); if (ext4_buffer_uptodate(bh)) { unlock_buffer(bh); return 0; } __ext4_read_bh(bh, op_flags, end_io, simu_fail); wait_on_buffer(bh); if (buffer_uptodate(bh)) return 0; return -EIO; } int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait) { lock_buffer(bh); if (!wait) { ext4_read_bh_nowait(bh, op_flags, NULL, false); return 0; } return ext4_read_bh(bh, op_flags, NULL, false); } /* * This works like __bread_gfp() except it uses ERR_PTR for error * returns. Currently with sb_bread it's impossible to distinguish * between ENOMEM and EIO situations (since both result in a NULL * return. 
*/ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb, sector_t block, blk_opf_t op_flags, gfp_t gfp) { struct buffer_head *bh; int ret; bh = sb_getblk_gfp(sb, block, gfp); if (bh == NULL) return ERR_PTR(-ENOMEM); if (ext4_buffer_uptodate(bh)) return bh; ret = ext4_read_bh_lock(bh, REQ_META | op_flags, true); if (ret) { put_bh(bh); return ERR_PTR(ret); } return bh; } struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, blk_opf_t op_flags) { gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, ~__GFP_FS) | __GFP_MOVABLE; return __ext4_sb_bread_gfp(sb, block, op_flags, gfp); } struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block) { gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, ~__GFP_FS); return __ext4_sb_bread_gfp(sb, block, 0, gfp); } struct buffer_head *ext4_sb_bread_nofail(struct super_block *sb, sector_t block) { gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, ~__GFP_FS) | __GFP_MOVABLE | __GFP_NOFAIL; return __ext4_sb_bread_gfp(sb, block, 0, gfp); } void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) { struct buffer_head *bh = bdev_getblk(sb->s_bdev, block, sb->s_blocksize, GFP_NOWAIT); if (likely(bh)) { if (trylock_buffer(bh)) ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL, false); brelse(bh); } } static int ext4_verify_csum_type(struct super_block *sb, struct ext4_super_block *es) { if (!ext4_has_feature_metadata_csum(sb)) return 1; return es->s_checksum_type == EXT4_CRC32C_CHKSUM; } __le32 ext4_superblock_csum(struct ext4_super_block *es) { int offset = offsetof(struct ext4_super_block, s_checksum); __u32 csum; csum = ext4_chksum(~0, (char *)es, offset); return cpu_to_le32(csum); } static int ext4_superblock_csum_verify(struct super_block *sb, struct ext4_super_block *es) { if (!ext4_has_feature_metadata_csum(sb)) return 1; return es->s_checksum == ext4_superblock_csum(es); } void ext4_superblock_csum_set(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; if (!ext4_has_feature_metadata_csum(sb)) return; es->s_checksum = ext4_superblock_csum(es); } ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { return le32_to_cpu(bg->bg_block_bitmap_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0); } ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { return le32_to_cpu(bg->bg_inode_bitmap_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); } ext4_fsblk_t ext4_inode_table(struct super_block *sb, struct ext4_group_desc *bg) { return le32_to_cpu(bg->bg_inode_table_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); } __u32 ext4_free_group_clusters(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_free_blocks_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); } __u32 ext4_free_inodes_count(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_lo)) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_hi)) << 16 : 0); } __u32 ext4_used_dirs_count(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_used_dirs_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 
(__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); } __u32 ext4_itable_unused_count(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_itable_unused_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); } void ext4_block_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk) { bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32); } void ext4_inode_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk) { bg->bg_inode_bitmap_lo = cpu_to_le32((u32)blk); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32); } void ext4_inode_table_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk) { bg->bg_inode_table_lo = cpu_to_le32((u32)blk); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); } void ext4_free_group_clusters_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16); } void ext4_free_inodes_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { WRITE_ONCE(bg->bg_free_inodes_count_lo, cpu_to_le16((__u16)count)); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) WRITE_ONCE(bg->bg_free_inodes_count_hi, cpu_to_le16(count >> 16)); } void ext4_used_dirs_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16); } void ext4_itable_unused_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_itable_unused_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); } static void __ext4_update_tstamp(__le32 *lo, __u8 *hi, time64_t now) { now = clamp_val(now, 0, (1ull << 40) - 1); *lo = cpu_to_le32(lower_32_bits(now)); *hi = upper_32_bits(now); } static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) { return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo); } #define ext4_update_tstamp(es, tstamp) \ __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi, \ ktime_get_real_seconds()) #define ext4_get_tstamp(es, tstamp) \ __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) /* * The ext4_maybe_update_superblock() function checks and updates the * superblock if needed. * * This function is designed to update the on-disk superblock only under * certain conditions to prevent excessive disk writes and unnecessary * waking of the disk from sleep. The superblock will be updated if: * 1. More than sbi->s_sb_update_sec (def: 1 hour) has passed since the last * superblock update * 2. More than sbi->s_sb_update_kb (def: 16MB) kbs have been written since the * last superblock update. 
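 *
 * Note that both conditions have to hold: if an hour has passed but fewer
 * than 16MB have been written since the last recorded s_kbytes_written,
 * nothing is scheduled, and heavy writes alone never trigger an update
 * before sbi->s_sb_update_sec has elapsed.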
* * @sb: The superblock */ static void ext4_maybe_update_superblock(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; journal_t *journal = sbi->s_journal; time64_t now; __u64 last_update; __u64 lifetime_write_kbytes; __u64 diff_size; if (ext4_emergency_state(sb) || sb_rdonly(sb) || !(sb->s_flags & SB_ACTIVE) || !journal || journal->j_flags & JBD2_UNMOUNT) return; now = ktime_get_real_seconds(); last_update = ext4_get_tstamp(es, s_wtime); if (likely(now - last_update < sbi->s_sb_update_sec)) return; lifetime_write_kbytes = sbi->s_kbytes_written + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - sbi->s_sectors_written_start) >> 1); /* Get the number of kilobytes not written to disk to account * for statistics and compare with a multiple of 16 MB. This * is used to determine when the next superblock commit should * occur (i.e. not more often than once per 16MB if there was * less written in an hour). */ diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written); if (diff_size > sbi->s_sb_update_kb) schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) { struct super_block *sb = journal->j_private; BUG_ON(txn->t_state == T_FINISHED); ext4_process_freed_data(sb, txn->t_tid); ext4_maybe_update_superblock(sb); } static bool ext4_journalled_writepage_needs_redirty(struct jbd2_inode *jinode, struct folio *folio) { struct buffer_head *bh, *head; struct journal_head *jh; transaction_t *trans = READ_ONCE(jinode->i_transaction); bh = head = folio_buffers(folio); do { /* * We have to redirty a page in these cases: * 1) If buffer is dirty, it means the page was dirty because it * contains a buffer that needs checkpointing. So the dirty bit * needs to be preserved so that checkpointing writes the buffer * properly. * 2) If buffer is not part of the committing transaction * (we may have just accidentally come across this buffer because * inode range tracking is not exact) or if the currently running * transaction already contains this buffer as well, dirty bit * needs to be preserved so that the buffer gets writeprotected * properly on running transaction's commit. */ jh = bh2jh(bh); if (buffer_dirty(bh) || (jh && (jh->b_transaction != trans || jh->b_next_transaction))) return true; } while ((bh = bh->b_this_page) != head); return false; } static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) { struct address_space *mapping = jinode->i_vfs_inode->i_mapping; loff_t range_start, range_end; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, }; struct folio *folio = NULL; int error; if (!jbd2_jinode_get_dirty_range(jinode, &range_start, &range_end)) return 0; wbc.range_start = range_start; wbc.range_end = range_end; /* * writeback_iter() already checks for dirty pages and calls * folio_clear_dirty_for_io(), which we want to write protect the * folios. * * However, we may have to redirty a folio sometimes. 
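 * The cases that require redirtying are the ones checked by
 * ext4_journalled_writepage_needs_redirty() above.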
*/ while ((folio = writeback_iter(mapping, &wbc, folio, &error))) { if (ext4_journalled_writepage_needs_redirty(jinode, folio)) folio_redirty_for_writepage(&wbc, folio); folio_unlock(folio); } return error; } static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) { int ret; if (ext4_should_journal_data(jinode->i_vfs_inode)) ret = ext4_journalled_submit_inode_data_buffers(jinode); else ret = ext4_normal_submit_inode_data_buffers(jinode); return ret; } static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode) { int ret = 0; if (!ext4_should_journal_data(jinode->i_vfs_inode)) ret = jbd2_journal_finish_inode_data_buffers(jinode); return ret; } static bool system_going_down(void) { return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF || system_state == SYSTEM_RESTART; } struct ext4_err_translation { int code; int errno; }; #define EXT4_ERR_TRANSLATE(err) { .code = EXT4_ERR_##err, .errno = err } static struct ext4_err_translation err_translation[] = { EXT4_ERR_TRANSLATE(EIO), EXT4_ERR_TRANSLATE(ENOMEM), EXT4_ERR_TRANSLATE(EFSBADCRC), EXT4_ERR_TRANSLATE(EFSCORRUPTED), EXT4_ERR_TRANSLATE(ENOSPC), EXT4_ERR_TRANSLATE(ENOKEY), EXT4_ERR_TRANSLATE(EROFS), EXT4_ERR_TRANSLATE(EFBIG), EXT4_ERR_TRANSLATE(EEXIST), EXT4_ERR_TRANSLATE(ERANGE), EXT4_ERR_TRANSLATE(EOVERFLOW), EXT4_ERR_TRANSLATE(EBUSY), EXT4_ERR_TRANSLATE(ENOTDIR), EXT4_ERR_TRANSLATE(ENOTEMPTY), EXT4_ERR_TRANSLATE(ESHUTDOWN), EXT4_ERR_TRANSLATE(EFAULT), }; static int ext4_errno_to_code(int errno) { int i; for (i = 0; i < ARRAY_SIZE(err_translation); i++) if (err_translation[i].errno == errno) return err_translation[i].code; return EXT4_ERR_UNKNOWN; } static void save_error_info(struct super_block *sb, int error, __u32 ino, __u64 block, const char *func, unsigned int line) { struct ext4_sb_info *sbi = EXT4_SB(sb); /* We default to EFSCORRUPTED error... */ if (error == 0) error = EFSCORRUPTED; spin_lock(&sbi->s_error_lock); sbi->s_add_error_count++; sbi->s_last_error_code = error; sbi->s_last_error_line = line; sbi->s_last_error_ino = ino; sbi->s_last_error_block = block; sbi->s_last_error_func = func; sbi->s_last_error_time = ktime_get_real_seconds(); if (!sbi->s_first_error_time) { sbi->s_first_error_code = error; sbi->s_first_error_line = line; sbi->s_first_error_ino = ino; sbi->s_first_error_block = block; sbi->s_first_error_func = func; sbi->s_first_error_time = sbi->s_last_error_time; } spin_unlock(&sbi->s_error_lock); } /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. * * On ext2, we can store the error state of the filesystem in the * superblock. That is not possible on ext4, because we may have other * write ordering constraints on the superblock which prevent us from * writing it out straight away; and given that the journal is about to * be aborted, we can't rely on the current, or future, transactions to * write out the superblock safely. * * We'll just use the jbd2_journal_abort() error code to record an error in * the journal instead. On recovery, the journal will complain about * that error until we've noted it down and cleared it. * * If force_ro is set, we unconditionally force the filesystem into an * ABORT|READONLY state, unless the error response on the fs has been set to * panic in which case we take the easy way out and panic immediately. This is * used to deal with unrecoverable failures such as journal IO errors or ENOMEM * at a critical moment in log management. 
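 *
 * In short: with errors=continue the filesystem keeps running and only the
 * error information is recorded; with errors=remount-ro (or when force_ro
 * is set) the journal is aborted and the filesystem is switched to
 * emergency read-only; with errors=panic the machine panics unless it is
 * already going down.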
*/ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, __u32 ino, __u64 block, const char *func, unsigned int line) { journal_t *journal = EXT4_SB(sb)->s_journal; bool continue_fs = !force_ro && test_opt(sb, ERRORS_CONT); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); if (!continue_fs && !ext4_emergency_ro(sb) && journal) jbd2_journal_abort(journal, -error); if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, error, ino, block, func, line); /* * In case the fs should keep running, we need to writeout * superblock through the journal. Due to lock ordering * constraints, it may not be safe to do it right here so we * defer superblock flushing to a workqueue. We just need to be * careful when the journal is already shutting down. If we get * here in that case, just update the sb directly as the last * transaction won't commit anyway. */ if (continue_fs && journal && !ext4_test_mount_flag(sb, EXT4_MF_JOURNAL_DESTROY)) schedule_work(&EXT4_SB(sb)->s_sb_upd_work); else ext4_commit_super(sb); } /* * We force ERRORS_RO behavior when system is rebooting. Otherwise we * could panic during 'reboot -f' as the underlying device got already * disabled. */ if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) { panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); } if (ext4_emergency_ro(sb) || continue_fs) return; ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); /* * We don't set SB_RDONLY because that requires sb->s_umount * semaphore and setting it without proper remount procedure is * confusing code such as freeze_super() leading to deadlocks * and other problems. */ set_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags); } static void update_super_work(struct work_struct *work) { struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info, s_sb_upd_work); journal_t *journal = sbi->s_journal; handle_t *handle; /* * If the journal is still running, we have to write out superblock * through the journal to avoid collisions of other journalled sb * updates. * * We use directly jbd2 functions here to avoid recursing back into * ext4 error handling code during handling of previous errors. */ if (!ext4_emergency_state(sbi->s_sb) && !sb_rdonly(sbi->s_sb) && journal) { struct buffer_head *sbh = sbi->s_sbh; bool call_notify_err = false; handle = jbd2_journal_start(journal, 1); if (IS_ERR(handle)) goto write_directly; if (jbd2_journal_get_write_access(handle, sbh)) { jbd2_journal_stop(handle); goto write_directly; } if (sbi->s_add_error_count > 0) call_notify_err = true; ext4_update_super(sbi->s_sb); if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to " "superblock detected"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } if (jbd2_journal_dirty_metadata(handle, sbh)) { jbd2_journal_stop(handle); goto write_directly; } jbd2_journal_stop(handle); if (call_notify_err) ext4_notify_error_sysfs(sbi); return; } write_directly: /* * Write through journal failed. Write sb directly to get error info * out and hope for the best. */ ext4_commit_super(sbi->s_sb); ext4_notify_error_sysfs(sbi); } #define ext4_error_ratelimit(sb) \ ___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state), \ "EXT4-fs error") void __ext4_error(struct super_block *sb, const char *function, unsigned int line, bool force_ro, int error, __u64 block, const char *fmt, ...) 
{ struct va_format vaf; va_list args; if (unlikely(ext4_emergency_state(sb))) return; trace_ext4_error(sb, function, line); if (ext4_error_ratelimit(sb)) { va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", sb->s_id, function, line, current->comm, &vaf); va_end(args); } fserror_report_metadata(sb, error ? -abs(error) : -EFSCORRUPTED, GFP_ATOMIC); ext4_handle_error(sb, force_ro, error, 0, block, function, line); } void __ext4_error_inode(struct inode *inode, const char *function, unsigned int line, ext4_fsblk_t block, int error, const char *fmt, ...) { va_list args; struct va_format vaf; if (unlikely(ext4_emergency_state(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); if (ext4_error_ratelimit(inode->i_sb)) { va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (block) printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " "inode #%llu: block %llu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, block, current->comm, &vaf); else printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " "inode #%llu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, &vaf); va_end(args); } fserror_report_file_metadata(inode, error ? -abs(error) : -EFSCORRUPTED, GFP_ATOMIC); ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block, function, line); } void __ext4_error_file(struct file *file, const char *function, unsigned int line, ext4_fsblk_t block, const char *fmt, ...) { va_list args; struct va_format vaf; struct inode *inode = file_inode(file); char pathname[80], *path; if (unlikely(ext4_emergency_state(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); if (ext4_error_ratelimit(inode->i_sb)) { path = file_path(file, pathname, sizeof(pathname)); if (IS_ERR(path)) path = "(unknown)"; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (block) printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%llu: " "block %llu: comm %s: path %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, block, current->comm, path, &vaf); else printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%llu: " "comm %s: path %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, path, &vaf); va_end(args); } fserror_report_file_metadata(inode, -EFSCORRUPTED, GFP_ATOMIC); ext4_handle_error(inode->i_sb, false, EFSCORRUPTED, inode->i_ino, block, function, line); } const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]) { char *errstr = NULL; switch (errno) { case -EFSCORRUPTED: errstr = "Corrupt filesystem"; break; case -EFSBADCRC: errstr = "Filesystem failed CRC"; break; case -EIO: errstr = "IO failure"; break; case -ENOMEM: errstr = "Out of memory"; break; case -EROFS: if (!sb || (EXT4_SB(sb)->s_journal && EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)) errstr = "Journal has aborted"; else errstr = "Readonly filesystem"; break; default: /* If the caller passed in an extra buffer for unknown * errors, textualise them now. Else we just return * NULL. */ if (nbuf) { /* Check for truncated error codes... */ if (snprintf(nbuf, 16, "error %d", -errno) >= 0) errstr = nbuf; } break; } return errstr; } /* __ext4_std_error decodes expected errors from journaling functions * automatically and invokes the appropriate error response. 
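 *
 * For example, a -EIO returned by a jbd2 helper is decoded to "IO failure"
 * by ext4_decode_error(), logged, and then passed on to ext4_handle_error()
 * like any other metadata error.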
*/ void __ext4_std_error(struct super_block *sb, const char *function, unsigned int line, int errno) { char nbuf[16]; const char *errstr; if (unlikely(ext4_emergency_state(sb))) return; /* Special case: if the error is EROFS, and we're not already * inside a transaction, then there's really no point in logging * an error. */ if (errno == -EROFS && journal_current_handle() == NULL && sb_rdonly(sb)) return; if (ext4_error_ratelimit(sb)) { errstr = ext4_decode_error(sb, errno, nbuf); printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", sb->s_id, function, line, errstr); } fserror_report_metadata(sb, errno ? -abs(errno) : -EFSCORRUPTED, GFP_ATOMIC); ext4_handle_error(sb, false, -errno, 0, 0, function, line); } void __ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) { struct va_format vaf; va_list args; if (sb) { atomic_inc(&EXT4_SB(sb)->s_msg_count); if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs")) return; } va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (sb) printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf); else printk("%sEXT4-fs: %pV\n", prefix, &vaf); va_end(args); } static int ext4_warning_ratelimit(struct super_block *sb) { atomic_inc(&EXT4_SB(sb)->s_warning_count); return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), "EXT4-fs warning"); } void __ext4_warning(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) { struct va_format vaf; va_list args; if (!ext4_warning_ratelimit(sb)) return; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n", sb->s_id, function, line, &vaf); va_end(args); } void __ext4_warning_inode(const struct inode *inode, const char *function, unsigned int line, const char *fmt, ...) { struct va_format vaf; va_list args; if (!ext4_warning_ratelimit(inode->i_sb)) return; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: " "inode #%llu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, &vaf); va_end(args); } void __ext4_grp_locked_error(const char *function, unsigned int line, struct super_block *sb, ext4_group_t grp, u64 ino, ext4_fsblk_t block, const char *fmt, ...) __releases(bitlock) __acquires(bitlock) { struct va_format vaf; va_list args; if (unlikely(ext4_emergency_state(sb))) return; trace_ext4_error(sb, function, line); if (ext4_error_ratelimit(sb)) { va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", sb->s_id, function, line, grp); if (ino) printk(KERN_CONT "inode %llu: ", ino); if (block) printk(KERN_CONT "block %llu:", (unsigned long long) block); printk(KERN_CONT "%pV\n", &vaf); va_end(args); } if (test_opt(sb, ERRORS_CONT)) { if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, EFSCORRUPTED, ino, block, function, line); schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } return; } ext4_unlock_group(sb, grp); ext4_handle_error(sb, false, EFSCORRUPTED, ino, block, function, line); /* * We only get here in the ERRORS_RO case; relocking the group * may be dangerous, but nothing bad will happen since the * filesystem will have already been marked read/only and the * journal has been aborted. 
We return 1 as a hint to callers * who might what to use the return value from * ext4_grp_locked_error() to distinguish between the * ERRORS_CONT and ERRORS_RO case, and perhaps return more * aggressively from the ext4 function in question, with a * more appropriate error code. */ ext4_lock_group(sb, grp); return; } void ext4_mark_group_bitmap_corrupted(struct super_block *sb, ext4_group_t group, unsigned int flags) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); int ret; if (!grp || !gdp) return; if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) { ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); if (!ret) percpu_counter_sub(&sbi->s_freeclusters_counter, grp->bb_free); } if (flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) { ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); if (!ret && gdp) { int count; count = ext4_free_inodes_count(sb, gdp); percpu_counter_sub(&sbi->s_freeinodes_counter, count); } } } void ext4_update_dynamic_rev(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV) return; ext4_warning(sb, "updating to rev %d because of new feature flag, " "running e2fsck is recommended", EXT4_DYNAMIC_REV); es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO); es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE); es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV); /* leave es->s_feature_*compat flags alone */ /* es->s_uuid will be set by e2fsck if empty */ /* * The rest of the superblock fields should be zero, and if not it * means they are likely already in use, so leave them alone. We * can leave it up to e2fsck to clean up any inconsistencies there. */ } static inline struct inode *orphan_list_entry(struct list_head *l) { return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode; } static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) { struct list_head *l; ext4_msg(sb, KERN_ERR, "sb orphan head is %d", le32_to_cpu(sbi->s_es->s_last_orphan)); printk(KERN_ERR "sb_info orphan list:\n"); list_for_each(l, &sbi->s_orphan) { struct inode *inode = orphan_list_entry(l); printk(KERN_ERR " " "inode %s:%llu at %p: mode %o, nlink %d, next %d\n", inode->i_sb->s_id, inode->i_ino, inode, inode->i_mode, inode->i_nlink, NEXT_ORPHAN(inode)); } } #ifdef CONFIG_QUOTA static int ext4_quota_off(struct super_block *sb, int type); static inline void ext4_quotas_off(struct super_block *sb, int type) { BUG_ON(type > EXT4_MAXQUOTAS); /* Use our quota_off function to clear inode flags etc. */ for (type--; type >= 0; type--) ext4_quota_off(sb, type); } /* * This is a helper function which is used in the mount/remount * codepaths (which holds s_umount) to fetch the quota file name. 
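 * Because s_umount is held in those paths, the name can be read with
 * rcu_dereference_protected() (with a lockdep assertion on s_umount)
 * instead of a full RCU read-side critical section.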
*/ static inline char *get_qf_name(struct super_block *sb, struct ext4_sb_info *sbi, int type) { return rcu_dereference_protected(sbi->s_qf_names[type], lockdep_is_held(&sb->s_umount)); } #else static inline void ext4_quotas_off(struct super_block *sb, int type) { } #endif static int ext4_percpu_param_init(struct ext4_sb_info *sbi) { ext4_fsblk_t block; int err; block = ext4_count_free_clusters(sbi->s_sb); ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block)); err = percpu_counter_init(&sbi->s_freeclusters_counter, block, GFP_KERNEL); if (!err) { unsigned long freei = ext4_count_free_inodes(sbi->s_sb); sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, GFP_KERNEL); } if (!err) err = percpu_counter_init(&sbi->s_dirs_counter, ext4_count_dirs(sbi->s_sb), GFP_KERNEL); if (!err) err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, GFP_KERNEL); if (!err) err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0, GFP_KERNEL); if (!err) err = percpu_init_rwsem(&sbi->s_writepages_rwsem); if (err) ext4_msg(sbi->s_sb, KERN_ERR, "insufficient memory"); return err; } static void ext4_percpu_param_destroy(struct ext4_sb_info *sbi) { percpu_counter_destroy(&sbi->s_freeclusters_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); percpu_free_rwsem(&sbi->s_writepages_rwsem); } static void ext4_group_desc_free(struct ext4_sb_info *sbi) { struct buffer_head **group_desc; int i; group_desc = rcu_access_pointer(sbi->s_group_desc); for (i = 0; i < sbi->s_gdb_count; i++) brelse(group_desc[i]); kvfree(group_desc); } static void ext4_flex_groups_free(struct ext4_sb_info *sbi) { struct flex_groups **flex_groups; int i; flex_groups = rcu_access_pointer(sbi->s_flex_groups); if (flex_groups) { for (i = 0; i < sbi->s_flex_groups_allocated; i++) kvfree(flex_groups[i]); kvfree(flex_groups); } } static void ext4_put_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int aborted = 0; int err; /* * Unregister sysfs before destroying jbd2 journal. * Since we could still access attr_journal_task attribute via sysfs * path which could have sbi->s_journal->j_task as NULL * Unregister sysfs before flush sbi->s_sb_upd_work. * Since user may read /proc/fs/ext4/xx/mb_groups during umount, If * read metadata verify failed then will queue error work. * update_super_work will call start_this_handle may trigger * BUG_ON. 
*/ ext4_unregister_sysfs(sb); if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount")) ext4_msg(sb, KERN_INFO, "unmounting filesystem %pU.", &sb->s_uuid); ext4_unregister_li_request(sb); ext4_quotas_off(sb, EXT4_MAXQUOTAS); destroy_workqueue(sbi->rsv_conversion_wq); ext4_release_orphan_info(sb); if (sbi->s_journal) { aborted = is_journal_aborted(sbi->s_journal); err = ext4_journal_destroy(sbi, sbi->s_journal); if ((err < 0) && !aborted) { ext4_abort(sb, -err, "Couldn't clean up the journal"); } } else flush_work(&sbi->s_sb_upd_work); ext4_es_unregister_shrinker(sbi); timer_shutdown_sync(&sbi->s_err_report); ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); if (!ext4_emergency_state(sb) && !sb_rdonly(sb)) { if (!aborted) { ext4_clear_feature_journal_needs_recovery(sb); ext4_clear_feature_orphan_present(sb); es->s_state = cpu_to_le16(sbi->s_mount_state); } ext4_commit_super(sb); } ext4_group_desc_free(sbi); ext4_flex_groups_free(sbi); WARN_ON_ONCE(!(sbi->s_mount_state & EXT4_ERROR_FS) && percpu_counter_sum(&sbi->s_dirtyclusters_counter)); ext4_percpu_param_destroy(sbi); #ifdef CONFIG_QUOTA for (int i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif /* Debugging code just in case the in-memory inode orphan list * isn't empty. The on-disk one can be non-empty if we've * detected an error and taken the fs readonly, but the * in-memory list had better be clean by this point. */ if (!list_empty(&sbi->s_orphan)) dump_orphan_list(sb, sbi); ASSERT(list_empty(&sbi->s_orphan)); sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); if (sbi->s_journal_bdev_file) { /* * Invalidate the journal device's buffers. We don't want them * floating about in memory - the physical journal device may * hotswapped, and it breaks the `ro-after' testing code. */ sync_blockdev(file_bdev(sbi->s_journal_bdev_file)); invalidate_bdev(file_bdev(sbi->s_journal_bdev_file)); } ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); sbi->s_ea_inode_cache = NULL; ext4_xattr_destroy_cache(sbi->s_ea_block_cache); sbi->s_ea_block_cache = NULL; ext4_stop_mmpd(sbi); brelse(sbi->s_sbh); sb->s_fs_info = NULL; /* * Now that we are completely done shutting down the * superblock, we need to actually destroy the kobject. 
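 * Dropping the last reference makes the kobject's release callback run,
 * which completes s_kobj_unregister, so the wait_for_completion() below
 * does not return until sysfs is really gone and sbi can be freed safely.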
*/ kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev, NULL); fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif kfree(sbi); } static struct kmem_cache *ext4_inode_cachep; /* * Called inside transaction, so use GFP_NOFS */ static struct inode *ext4_alloc_inode(struct super_block *sb) { struct ext4_inode_info *ei; ei = alloc_inode_sb(sb, ext4_inode_cachep, GFP_NOFS); if (!ei) return NULL; inode_set_iversion(&ei->vfs_inode, 1); ei->i_flags = 0; ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ spin_lock_init(&ei->i_raw_lock); ei->i_prealloc_node = RB_ROOT; atomic_set(&ei->i_prealloc_active, 0); rwlock_init(&ei->i_prealloc_lock); ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); INIT_LIST_HEAD(&ei->i_es_list); ei->i_es_all_nr = 0; ei->i_es_shk_nr = 0; ei->i_es_shrink_lblk = 0; ei->i_es_seq = 0; ei->i_reserved_data_blocks = 0; spin_lock_init(&(ei->i_block_reservation_lock)); ext4_init_pending_tree(&ei->i_pending_tree); #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); #endif ei->jinode = NULL; INIT_LIST_HEAD(&ei->i_rsv_conversion_list); spin_lock_init(&ei->i_completed_io_lock); ei->i_sync_tid = 0; ei->i_datasync_tid = 0; INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); ext4_fc_init_inode(&ei->vfs_inode); spin_lock_init(&ei->i_fc_lock); mmb_init(&ei->i_metadata_bhs, &ei->vfs_inode.i_data); return &ei->vfs_inode; } static int ext4_drop_inode(struct inode *inode) { int drop = inode_generic_drop(inode); if (!drop) drop = fscrypt_drop_inode(inode); trace_ext4_drop_inode(inode, drop); return drop; } static void ext4_free_in_core_inode(struct inode *inode) { fscrypt_free_inode(inode); if (!list_empty(&(EXT4_I(inode)->i_fc_list))) { pr_warn("%s: inode %llu still in fc list", __func__, inode->i_ino); } kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); } static void ext4_destroy_inode(struct inode *inode) { if (ext4_inode_orphan_tracked(inode)) { ext4_msg(inode->i_sb, KERN_ERR, "Inode %llu (%p): inode tracked as orphan!", inode->i_ino, EXT4_I(inode)); print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, EXT4_I(inode), sizeof(struct ext4_inode_info), true); dump_stack(); } if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ERROR_FS) && WARN_ON_ONCE(EXT4_I(inode)->i_reserved_data_blocks)) ext4_msg(inode->i_sb, KERN_ERR, "Inode %llu (%p): i_reserved_data_blocks (%u) not cleared!", inode->i_ino, EXT4_I(inode), EXT4_I(inode)->i_reserved_data_blocks); } static void ext4_shutdown(struct super_block *sb) { ext4_force_shutdown(sb, EXT4_GOING_FLAGS_NOLOGFLUSH); } static void init_once(void *foo) { struct ext4_inode_info *ei = foo; INIT_LIST_HEAD(&ei->i_orphan); init_rwsem(&ei->xattr_sem); init_rwsem(&ei->i_data_sem); inode_init_once(&ei->vfs_inode); ext4_fc_init_inode(&ei->vfs_inode); #ifdef CONFIG_FS_ENCRYPTION ei->i_crypt_info = NULL; #endif } static int __init init_inodecache(void) { struct kmem_cache_args args = { .useroffset = offsetof(struct ext4_inode_info, i_data), .usersize = sizeof_field(struct ext4_inode_info, i_data), .use_freeptr_offset = true, .freeptr_offset = offsetof(struct ext4_inode_info, i_flags), .ctor = init_once, }; ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", sizeof(struct ext4_inode_info), &args, SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT); if (ext4_inode_cachep == NULL) return -ENOMEM; return 0; } static void destroy_inodecache(void) { /* * Make sure 
all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(ext4_inode_cachep); } void ext4_clear_inode(struct inode *inode) { ext4_fc_del(inode); if (!EXT4_SB(inode->i_sb)->s_journal) mmb_invalidate(&EXT4_I(inode)->i_metadata_bhs); clear_inode(inode); ext4_discard_preallocations(inode); /* * We must remove the inode from the hash before ext4_free_inode() * clears the bit in inode bitmap as otherwise another process reusing * the inode will block in insert_inode_hash() waiting for inode * eviction to complete while holding transaction handle open, but * ext4_evict_inode() still running for that inode could block waiting * for transaction commit if the inode is marked as IS_SYNC => deadlock. * * Removing the inode from the hash here is safe. There are two cases * to consider: * 1) The inode still has references to it (i_nlink > 0). In that case * we are keeping the inode and once we remove the inode from the hash, * iget() can create the new inode structure for the same inode number * and we are fine with that as all IO on behalf of the inode is * finished. * 2) We are deleting the inode (i_nlink == 0). In that case inode * number cannot be reused until ext4_free_inode() clears the bit in * the inode bitmap, at which point all IO is done and reuse is fine * again. */ remove_inode_hash(inode); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); dquot_drop(inode); if (EXT4_I(inode)->jinode) { jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode); jbd2_free_inode(EXT4_I(inode)->jinode); EXT4_I(inode)->jinode = NULL; } fscrypt_put_encryption_info(inode); } static struct inode *ext4_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { struct inode *inode; /* * Currently we don't know the generation for parent directory, so * a generation of 0 means "accept any" */ inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE); if (IS_ERR(inode)) return ERR_CAST(inode); if (generation && inode->i_generation != generation) { iput(inode); return ERR_PTR(-ESTALE); } return inode; } static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ext4_nfs_get_inode); } static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_parent(sb, fid, fh_len, fh_type, ext4_nfs_get_inode); } static int ext4_nfs_commit_metadata(struct inode *inode) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL }; trace_ext4_nfs_commit_metadata(inode); return ext4_write_inode(inode, &wbc); } #ifdef CONFIG_QUOTA static const char * const quotatypes[] = INITQFNAMES; #define QTYPE2NAME(t) (quotatypes[t]) static int ext4_write_dquot(struct dquot *dquot); static int ext4_acquire_dquot(struct dquot *dquot); static int ext4_release_dquot(struct dquot *dquot); static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, const struct path *path); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags); static struct dquot __rcu **ext4_get_dquots(struct inode *inode) { return EXT4_I(inode)->i_dquot; } static const struct 
dquot_operations ext4_quota_operations = { .get_reserved_space = ext4_get_reserved_space, .write_dquot = ext4_write_dquot, .acquire_dquot = ext4_acquire_dquot, .release_dquot = ext4_release_dquot, .mark_dirty = ext4_mark_dquot_dirty, .write_info = ext4_write_info, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, .get_projid = ext4_get_projid, .get_inode_usage = ext4_get_inode_usage, .get_next_id = dquot_get_next_id, }; static const struct quotactl_ops ext4_qctl_operations = { .quota_on = ext4_quota_on, .quota_off = ext4_quota_off, .quota_sync = dquot_quota_sync, .get_state = dquot_get_state, .set_info = dquot_set_dqinfo, .get_dqblk = dquot_get_dqblk, .set_dqblk = dquot_set_dqblk, .get_nextdqblk = dquot_get_next_dqblk, }; #endif static const struct super_operations ext4_sops = { .alloc_inode = ext4_alloc_inode, .free_inode = ext4_free_in_core_inode, .destroy_inode = ext4_destroy_inode, .write_inode = ext4_write_inode, .dirty_inode = ext4_dirty_inode, .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, .put_super = ext4_put_super, .sync_fs = ext4_sync_fs, .freeze_fs = ext4_freeze, .unfreeze_fs = ext4_unfreeze, .statfs = ext4_statfs, .show_options = ext4_show_options, .shutdown = ext4_shutdown, #ifdef CONFIG_QUOTA .quota_read = ext4_quota_read, .quota_write = ext4_quota_write, .get_dquots = ext4_get_dquots, #endif }; static const struct export_operations ext4_export_ops = { .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = ext4_fh_to_dentry, .fh_to_parent = ext4_fh_to_parent, .get_parent = ext4_get_parent, .commit_metadata = ext4_nfs_commit_metadata, }; enum { Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid, Opt_sb, Opt_nouid32, Opt_debug, Opt_removed, Opt_user_xattr, Opt_acl, Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev, Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption, Opt_inlinecrypt, Opt_usrjquota, Opt_grpjquota, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error, Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_debug_want_extra_isize, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan, Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type, #ifdef CONFIG_EXT4_DEBUG Opt_fc_debug_max_replay, Opt_fc_debug_force #endif }; static const struct constant_table ext4_param_errors[] = { {"continue", EXT4_MOUNT_ERRORS_CONT}, {"panic", EXT4_MOUNT_ERRORS_PANIC}, {"remount-ro", EXT4_MOUNT_ERRORS_RO}, {} }; static const struct constant_table ext4_param_data[] = { {"journal", EXT4_MOUNT_JOURNAL_DATA}, {"ordered", EXT4_MOUNT_ORDERED_DATA}, {"writeback", EXT4_MOUNT_WRITEBACK_DATA}, {} }; static const struct constant_table ext4_param_data_err[] = { {"abort", Opt_data_err_abort}, {"ignore", Opt_data_err_ignore}, {} }; static const struct constant_table ext4_param_jqfmt[] = { {"vfsold", QFMT_VFS_OLD}, {"vfsv0", QFMT_VFS_V0}, {"vfsv1", QFMT_VFS_V1}, {} }; static const struct constant_table 
ext4_param_dax[] = { {"always", Opt_dax_always}, {"inode", Opt_dax_inode}, {"never", Opt_dax_never}, {} }; /* * Mount option specification * We don't use fsparam_flag_no because of the way we set the * options and the way we show them in _ext4_show_options(). To * keep the changes to a minimum, let's keep the negative options * separate for now. */ static const struct fs_parameter_spec ext4_param_specs[] = { fsparam_flag ("bsddf", Opt_bsd_df), fsparam_flag ("minixdf", Opt_minix_df), fsparam_flag ("grpid", Opt_grpid), fsparam_flag ("bsdgroups", Opt_grpid), fsparam_flag ("nogrpid", Opt_nogrpid), fsparam_flag ("sysvgroups", Opt_nogrpid), fsparam_gid ("resgid", Opt_resgid), fsparam_uid ("resuid", Opt_resuid), fsparam_u32 ("sb", Opt_sb), fsparam_enum ("errors", Opt_errors, ext4_param_errors), fsparam_flag ("nouid32", Opt_nouid32), fsparam_flag ("debug", Opt_debug), fsparam_flag ("oldalloc", Opt_removed), fsparam_flag ("orlov", Opt_removed), fsparam_flag ("user_xattr", Opt_user_xattr), fsparam_flag ("acl", Opt_acl), fsparam_flag ("norecovery", Opt_noload), fsparam_flag ("noload", Opt_noload), fsparam_flag ("bh", Opt_removed), fsparam_flag ("nobh", Opt_removed), fsparam_u32 ("commit", Opt_commit), fsparam_u32 ("min_batch_time", Opt_min_batch_time), fsparam_u32 ("max_batch_time", Opt_max_batch_time), fsparam_u32 ("journal_dev", Opt_journal_dev), fsparam_bdev ("journal_path", Opt_journal_path), fsparam_flag ("journal_checksum", Opt_journal_checksum), fsparam_flag ("nojournal_checksum", Opt_nojournal_checksum), fsparam_flag ("journal_async_commit",Opt_journal_async_commit), fsparam_flag ("abort", Opt_abort), fsparam_enum ("data", Opt_data, ext4_param_data), fsparam_enum ("data_err", Opt_data_err, ext4_param_data_err), fsparam_string_empty ("usrjquota", Opt_usrjquota), fsparam_string_empty ("grpjquota", Opt_grpjquota), fsparam_enum ("jqfmt", Opt_jqfmt, ext4_param_jqfmt), fsparam_flag ("grpquota", Opt_grpquota), fsparam_flag ("quota", Opt_quota), fsparam_flag ("noquota", Opt_noquota), fsparam_flag ("usrquota", Opt_usrquota), fsparam_flag ("prjquota", Opt_prjquota), fsparam_flag ("barrier", Opt_barrier), fsparam_u32 ("barrier", Opt_barrier), fsparam_flag ("nobarrier", Opt_nobarrier), fsparam_flag ("i_version", Opt_removed), fsparam_flag ("dax", Opt_dax), fsparam_enum ("dax", Opt_dax_type, ext4_param_dax), fsparam_u32 ("stripe", Opt_stripe), fsparam_flag ("delalloc", Opt_delalloc), fsparam_flag ("nodelalloc", Opt_nodelalloc), fsparam_flag ("warn_on_error", Opt_warn_on_error), fsparam_flag ("nowarn_on_error", Opt_nowarn_on_error), fsparam_u32 ("debug_want_extra_isize", Opt_debug_want_extra_isize), fsparam_flag ("mblk_io_submit", Opt_removed), fsparam_flag ("nomblk_io_submit", Opt_removed), fsparam_flag ("block_validity", Opt_block_validity), fsparam_flag ("noblock_validity", Opt_noblock_validity), fsparam_u32 ("inode_readahead_blks", Opt_inode_readahead_blks), fsparam_u32 ("journal_ioprio", Opt_journal_ioprio), fsparam_u32 ("auto_da_alloc", Opt_auto_da_alloc), fsparam_flag ("auto_da_alloc", Opt_auto_da_alloc), fsparam_flag ("noauto_da_alloc", Opt_noauto_da_alloc), fsparam_flag ("dioread_nolock", Opt_dioread_nolock), fsparam_flag ("nodioread_nolock", Opt_dioread_lock), fsparam_flag ("dioread_lock", Opt_dioread_lock), fsparam_flag ("discard", Opt_discard), fsparam_flag ("nodiscard", Opt_nodiscard), fsparam_u32 ("init_itable", Opt_init_itable), fsparam_flag ("init_itable", Opt_init_itable), fsparam_flag ("noinit_itable", Opt_noinit_itable), #ifdef CONFIG_EXT4_DEBUG fsparam_flag ("fc_debug_force", 
Opt_fc_debug_force), fsparam_u32 ("fc_debug_max_replay", Opt_fc_debug_max_replay), #endif fsparam_u32 ("max_dir_size_kb", Opt_max_dir_size_kb), fsparam_flag ("test_dummy_encryption", Opt_test_dummy_encryption), fsparam_string ("test_dummy_encryption", Opt_test_dummy_encryption), fsparam_flag ("inlinecrypt", Opt_inlinecrypt), fsparam_flag ("nombcache", Opt_nombcache), fsparam_flag ("no_mbcache", Opt_nombcache), /* for backward compatibility */ fsparam_flag ("prefetch_block_bitmaps", Opt_removed), fsparam_flag ("no_prefetch_block_bitmaps", Opt_no_prefetch_block_bitmaps), fsparam_s32 ("mb_optimize_scan", Opt_mb_optimize_scan), fsparam_string ("check", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("nocheck", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("reservation", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("noreservation", Opt_removed), /* mount option from ext2/3 */ fsparam_u32 ("journal", Opt_removed), /* mount option from ext2/3 */ {} }; #define MOPT_SET 0x0001 #define MOPT_CLEAR 0x0002 #define MOPT_NOSUPPORT 0x0004 #define MOPT_EXPLICIT 0x0008 #ifdef CONFIG_QUOTA #define MOPT_Q 0 #define MOPT_QFMT 0x0010 #else #define MOPT_Q MOPT_NOSUPPORT #define MOPT_QFMT MOPT_NOSUPPORT #endif #define MOPT_NO_EXT2 0x0020 #define MOPT_NO_EXT3 0x0040 #define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) #define MOPT_SKIP 0x0080 #define MOPT_2 0x0100 static const struct mount_opts { int token; int mount_opt; int flags; } ext4_mount_opts[] = { {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET}, {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR}, {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET}, {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR}, {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET}, {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR}, {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_EXT4_ONLY | MOPT_SET}, {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET}, {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR}, {Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET}, {Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR}, {Opt_commit, 0, MOPT_NO_EXT2}, {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET}, {Opt_data_err, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_NO_EXT2}, {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR}, {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR}, {Opt_dax_type, 0, MOPT_EXT4_ONLY}, {Opt_journal_dev, 0, MOPT_NO_EXT2}, {Opt_journal_path, 0, MOPT_NO_EXT2}, {Opt_journal_ioprio, 0, MOPT_NO_EXT2}, {Opt_data, 0, MOPT_NO_EXT2}, {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, #ifdef CONFIG_EXT4_FS_POSIX_ACL {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, #else {Opt_acl, 0, MOPT_NOSUPPORT}, #endif {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET}, {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET}, {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, 
{Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA, MOPT_SET | MOPT_Q}, {Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA, MOPT_SET | MOPT_Q}, {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA), MOPT_CLEAR | MOPT_Q}, {Opt_usrjquota, 0, MOPT_Q}, {Opt_grpjquota, 0, MOPT_Q}, {Opt_jqfmt, 0, MOPT_QFMT}, {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, {Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS, MOPT_SET}, #ifdef CONFIG_EXT4_DEBUG {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, #endif {Opt_abort, EXT4_MOUNT2_ABORT, MOPT_SET | MOPT_2}, {Opt_err, 0, 0} }; #if IS_ENABLED(CONFIG_UNICODE) static const struct ext4_sb_encodings { __u16 magic; char *name; unsigned int version; } ext4_sb_encoding_map[] = { {EXT4_ENC_UTF8_12_1, "utf8", UNICODE_AGE(12, 1, 0)}, }; static const struct ext4_sb_encodings * ext4_sb_read_encoding(const struct ext4_super_block *es) { __u16 magic = le16_to_cpu(es->s_encoding); int i; for (i = 0; i < ARRAY_SIZE(ext4_sb_encoding_map); i++) if (magic == ext4_sb_encoding_map[i].magic) return &ext4_sb_encoding_map[i]; return NULL; } #endif #define EXT4_SPEC_JQUOTA (1 << 0) #define EXT4_SPEC_JQFMT (1 << 1) #define EXT4_SPEC_DATAJ (1 << 2) #define EXT4_SPEC_SB_BLOCK (1 << 3) #define EXT4_SPEC_JOURNAL_DEV (1 << 4) #define EXT4_SPEC_JOURNAL_IOPRIO (1 << 5) #define EXT4_SPEC_s_want_extra_isize (1 << 7) #define EXT4_SPEC_s_max_batch_time (1 << 8) #define EXT4_SPEC_s_min_batch_time (1 << 9) #define EXT4_SPEC_s_inode_readahead_blks (1 << 10) #define EXT4_SPEC_s_li_wait_mult (1 << 11) #define EXT4_SPEC_s_max_dir_size_kb (1 << 12) #define EXT4_SPEC_s_stripe (1 << 13) #define EXT4_SPEC_s_resuid (1 << 14) #define EXT4_SPEC_s_resgid (1 << 15) #define EXT4_SPEC_s_commit_interval (1 << 16) #define EXT4_SPEC_s_fc_debug_max_replay (1 << 17) #define EXT4_SPEC_s_sb_block (1 << 18) #define EXT4_SPEC_mb_optimize_scan (1 << 19) struct ext4_fs_context { char *s_qf_names[EXT4_MAXQUOTAS]; struct fscrypt_dummy_policy dummy_enc_policy; int s_jquota_fmt; /* Format of quota to use */ #ifdef CONFIG_EXT4_DEBUG int s_fc_debug_max_replay; #endif unsigned short qname_spec; unsigned long vals_s_flags; /* Bits to set in s_flags */ unsigned long mask_s_flags; /* Bits changed in s_flags */ unsigned long journal_devnum; unsigned long s_commit_interval; unsigned long s_stripe; unsigned int s_inode_readahead_blks; unsigned int s_want_extra_isize; unsigned int s_li_wait_mult; unsigned int s_max_dir_size_kb; unsigned int journal_ioprio; unsigned int vals_s_mount_opt; unsigned int mask_s_mount_opt; unsigned int vals_s_mount_opt2; unsigned int mask_s_mount_opt2; unsigned int opt_flags; /* MOPT flags */ unsigned int spec; u32 s_max_batch_time; u32 s_min_batch_time; kuid_t s_resuid; kgid_t s_resgid; ext4_fsblk_t s_sb_block; }; static void ext4_fc_free(struct fs_context *fc) { struct ext4_fs_context *ctx = fc->fs_private; int i; if (!ctx) return; for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(ctx->s_qf_names[i]); fscrypt_free_dummy_policy(&ctx->dummy_enc_policy); kfree(ctx); } int ext4_init_fs_context(struct fs_context *fc) { struct ext4_fs_context *ctx; ctx = kzalloc_obj(struct ext4_fs_context); if (!ctx) return -ENOMEM; fc->fs_private = ctx; fc->ops = &ext4_context_ops; /* i_version is always enabled now */ fc->sb_flags |= SB_I_VERSION; return 0; } #ifdef CONFIG_QUOTA /* * Note the name of the specified quota file. 
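 * The name is only recorded in the fs_context at this point: a name that
 * contains '/' is rejected, and naming two different files for the same
 * quota type is treated as an error.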
*/ static int note_qf_name(struct fs_context *fc, int qtype, struct fs_parameter *param) { struct ext4_fs_context *ctx = fc->fs_private; char *qname; if (param->size < 1) { ext4_msg(NULL, KERN_ERR, "Missing quota name"); return -EINVAL; } if (strchr(param->string, '/')) { ext4_msg(NULL, KERN_ERR, "quotafile must be on filesystem root"); return -EINVAL; } if (ctx->s_qf_names[qtype]) { if (strcmp(ctx->s_qf_names[qtype], param->string) != 0) { ext4_msg(NULL, KERN_ERR, "%s quota file already specified", QTYPE2NAME(qtype)); return -EINVAL; } return 0; } qname = kmemdup_nul(param->string, param->size, GFP_KERNEL); if (!qname) { ext4_msg(NULL, KERN_ERR, "Not enough memory for storing quotafile name"); return -ENOMEM; } ctx->s_qf_names[qtype] = qname; ctx->qname_spec |= 1 << qtype; ctx->spec |= EXT4_SPEC_JQUOTA; return 0; } /* * Clear the name of the specified quota file. */ static int unnote_qf_name(struct fs_context *fc, int qtype) { struct ext4_fs_context *ctx = fc->fs_private; kfree(ctx->s_qf_names[qtype]); ctx->s_qf_names[qtype] = NULL; ctx->qname_spec |= 1 << qtype; ctx->spec |= EXT4_SPEC_JQUOTA; return 0; } #endif static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param, struct ext4_fs_context *ctx) { int err; if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) { ext4_msg(NULL, KERN_WARNING, "test_dummy_encryption option not supported"); return -EINVAL; } err = fscrypt_parse_test_dummy_encryption(param, &ctx->dummy_enc_policy); if (err == -EINVAL) { ext4_msg(NULL, KERN_WARNING, "Value of option \"%s\" is unrecognized", param->key); } else if (err == -EEXIST) { ext4_msg(NULL, KERN_WARNING, "Conflicting test_dummy_encryption options"); return -EINVAL; } return err; } #define EXT4_SET_CTX(name) \ static inline __maybe_unused \ void ctx_set_##name(struct ext4_fs_context *ctx, unsigned long flag) \ { \ ctx->mask_s_##name |= flag; \ ctx->vals_s_##name |= flag; \ } #define EXT4_CLEAR_CTX(name) \ static inline __maybe_unused \ void ctx_clear_##name(struct ext4_fs_context *ctx, unsigned long flag) \ { \ ctx->mask_s_##name |= flag; \ ctx->vals_s_##name &= ~flag; \ } #define EXT4_TEST_CTX(name) \ static inline unsigned long \ ctx_test_##name(struct ext4_fs_context *ctx, unsigned long flag) \ { \ return (ctx->vals_s_##name & flag); \ } EXT4_SET_CTX(flags); /* set only */ EXT4_SET_CTX(mount_opt); EXT4_CLEAR_CTX(mount_opt); EXT4_TEST_CTX(mount_opt); EXT4_SET_CTX(mount_opt2); EXT4_CLEAR_CTX(mount_opt2); EXT4_TEST_CTX(mount_opt2); static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct ext4_fs_context *ctx = fc->fs_private; struct fs_parse_result result; const struct mount_opts *m; int is_remount; int token; token = fs_parse(fc, ext4_param_specs, param, &result); if (token < 0) return token; is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE; for (m = ext4_mount_opts; m->token != Opt_err; m++) if (token == m->token) break; ctx->opt_flags |= m->flags; if (m->flags & MOPT_EXPLICIT) { if (m->mount_opt & EXT4_MOUNT_DELALLOC) { ctx_set_mount_opt2(ctx, EXT4_MOUNT2_EXPLICIT_DELALLOC); } else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) { ctx_set_mount_opt2(ctx, EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM); } else return -EINVAL; } if (m->flags & MOPT_NOSUPPORT) { ext4_msg(NULL, KERN_ERR, "%s option not supported", param->key); return 0; } switch (token) { #ifdef CONFIG_QUOTA case Opt_usrjquota: if (!*param->string) return unnote_qf_name(fc, USRQUOTA); else return note_qf_name(fc, USRQUOTA, param); case Opt_grpjquota: if (!*param->string) return unnote_qf_name(fc, GRPQUOTA); 
else return note_qf_name(fc, GRPQUOTA, param); #endif case Opt_sb: if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { ext4_msg(NULL, KERN_WARNING, "Ignoring %s option on remount", param->key); } else { ctx->s_sb_block = result.uint_32; ctx->spec |= EXT4_SPEC_s_sb_block; } return 0; case Opt_removed: ext4_msg(NULL, KERN_WARNING, "Ignoring removed %s option", param->key); return 0; case Opt_inlinecrypt: #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT ctx_set_flags(ctx, SB_INLINECRYPT); #else ext4_msg(NULL, KERN_ERR, "inline encryption not supported"); #endif return 0; case Opt_errors: ctx_clear_mount_opt(ctx, EXT4_MOUNT_ERRORS_MASK); ctx_set_mount_opt(ctx, result.uint_32); return 0; #ifdef CONFIG_QUOTA case Opt_jqfmt: ctx->s_jquota_fmt = result.uint_32; ctx->spec |= EXT4_SPEC_JQFMT; return 0; #endif case Opt_data: ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS); ctx_set_mount_opt(ctx, result.uint_32); ctx->spec |= EXT4_SPEC_DATAJ; return 0; case Opt_commit: if (result.uint_32 == 0) result.uint_32 = JBD2_DEFAULT_MAX_COMMIT_AGE; else if (result.uint_32 > INT_MAX / HZ) { ext4_msg(NULL, KERN_ERR, "Invalid commit interval %d, " "must be smaller than %d", result.uint_32, INT_MAX / HZ); return -EINVAL; } ctx->s_commit_interval = HZ * result.uint_32; ctx->spec |= EXT4_SPEC_s_commit_interval; return 0; case Opt_debug_want_extra_isize: if ((result.uint_32 & 1) || (result.uint_32 < 4)) { ext4_msg(NULL, KERN_ERR, "Invalid want_extra_isize %d", result.uint_32); return -EINVAL; } ctx->s_want_extra_isize = result.uint_32; ctx->spec |= EXT4_SPEC_s_want_extra_isize; return 0; case Opt_max_batch_time: ctx->s_max_batch_time = result.uint_32; ctx->spec |= EXT4_SPEC_s_max_batch_time; return 0; case Opt_min_batch_time: ctx->s_min_batch_time = result.uint_32; ctx->spec |= EXT4_SPEC_s_min_batch_time; return 0; case Opt_inode_readahead_blks: if (result.uint_32 && (result.uint_32 > (1 << 30) || !is_power_of_2(result.uint_32))) { ext4_msg(NULL, KERN_ERR, "EXT4-fs: inode_readahead_blks must be " "0 or a power of 2 smaller than 2^31"); return -EINVAL; } ctx->s_inode_readahead_blks = result.uint_32; ctx->spec |= EXT4_SPEC_s_inode_readahead_blks; return 0; case Opt_init_itable: ctx_set_mount_opt(ctx, EXT4_MOUNT_INIT_INODE_TABLE); ctx->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; if (param->type == fs_value_is_string) ctx->s_li_wait_mult = result.uint_32; ctx->spec |= EXT4_SPEC_s_li_wait_mult; return 0; case Opt_max_dir_size_kb: ctx->s_max_dir_size_kb = result.uint_32; ctx->spec |= EXT4_SPEC_s_max_dir_size_kb; return 0; #ifdef CONFIG_EXT4_DEBUG case Opt_fc_debug_max_replay: ctx->s_fc_debug_max_replay = result.uint_32; ctx->spec |= EXT4_SPEC_s_fc_debug_max_replay; return 0; #endif case Opt_stripe: ctx->s_stripe = result.uint_32; ctx->spec |= EXT4_SPEC_s_stripe; return 0; case Opt_resuid: ctx->s_resuid = result.uid; ctx->spec |= EXT4_SPEC_s_resuid; return 0; case Opt_resgid: ctx->s_resgid = result.gid; ctx->spec |= EXT4_SPEC_s_resgid; return 0; case Opt_journal_dev: if (is_remount) { ext4_msg(NULL, KERN_ERR, "Cannot specify journal on remount"); return -EINVAL; } ctx->journal_devnum = result.uint_32; ctx->spec |= EXT4_SPEC_JOURNAL_DEV; return 0; case Opt_journal_path: { struct inode *journal_inode; struct path path; int error; if (is_remount) { ext4_msg(NULL, KERN_ERR, "Cannot specify journal on remount"); return -EINVAL; } error = fs_lookup_param(fc, param, 1, LOOKUP_FOLLOW, &path); if (error) { ext4_msg(NULL, KERN_ERR, "error: could not find " "journal device path"); return -EINVAL; } journal_inode = d_inode(path.dentry); 
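		/*
		 * journal_path names a block special file; take the device
		 * number from its inode so the rest of the code can treat it
		 * exactly like the numeric journal_dev=<devnum> form.
		 */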
ctx->journal_devnum = new_encode_dev(journal_inode->i_rdev); ctx->spec |= EXT4_SPEC_JOURNAL_DEV; path_put(&path); return 0; } case Opt_journal_ioprio: if (result.uint_32 > 7) { ext4_msg(NULL, KERN_ERR, "Invalid journal IO priority" " (must be 0-7)"); return -EINVAL; } ctx->journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, result.uint_32); ctx->spec |= EXT4_SPEC_JOURNAL_IOPRIO; return 0; case Opt_test_dummy_encryption: return ext4_parse_test_dummy_encryption(param, ctx); case Opt_dax: case Opt_dax_type: #ifdef CONFIG_FS_DAX { int type = (token == Opt_dax) ? Opt_dax : result.uint_32; switch (type) { case Opt_dax: case Opt_dax_always: ctx_set_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS); ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER); break; case Opt_dax_never: ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER); ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS); break; case Opt_dax_inode: ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS); ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER); /* Strictly for printing options */ ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE); break; } return 0; } #else ext4_msg(NULL, KERN_INFO, "dax option not supported"); return -EINVAL; #endif case Opt_data_err: if (result.uint_32 == Opt_data_err_abort) ctx_set_mount_opt(ctx, m->mount_opt); else if (result.uint_32 == Opt_data_err_ignore) ctx_clear_mount_opt(ctx, m->mount_opt); return 0; case Opt_mb_optimize_scan: if (result.int_32 == 1) { ctx_set_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN); ctx->spec |= EXT4_SPEC_mb_optimize_scan; } else if (result.int_32 == 0) { ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN); ctx->spec |= EXT4_SPEC_mb_optimize_scan; } else { ext4_msg(NULL, KERN_WARNING, "mb_optimize_scan should be set to 0 or 1."); return -EINVAL; } return 0; } /* * At this point we should only be getting options requiring MOPT_SET, * or MOPT_CLEAR. 
Anything else is a bug */ if (m->token == Opt_err) { ext4_msg(NULL, KERN_WARNING, "buggy handling of option %s", param->key); WARN_ON(1); return -EINVAL; } else { unsigned int set = 0; if ((param->type == fs_value_is_flag) || result.uint_32 > 0) set = 1; if (m->flags & MOPT_CLEAR) set = !set; else if (unlikely(!(m->flags & MOPT_SET))) { ext4_msg(NULL, KERN_WARNING, "buggy handling of option %s", param->key); WARN_ON(1); return -EINVAL; } if (m->flags & MOPT_2) { if (set != 0) ctx_set_mount_opt2(ctx, m->mount_opt); else ctx_clear_mount_opt2(ctx, m->mount_opt); } else { if (set != 0) ctx_set_mount_opt(ctx, m->mount_opt); else ctx_clear_mount_opt(ctx, m->mount_opt); } } return 0; } static int parse_options(struct fs_context *fc, char *options) { struct fs_parameter param; int ret; char *key; if (!options) return 0; while ((key = strsep(&options, ",")) != NULL) { if (*key) { size_t v_len = 0; char *value = strchr(key, '='); param.type = fs_value_is_flag; param.string = NULL; if (value) { if (value == key) continue; *value++ = 0; v_len = strlen(value); param.string = kmemdup_nul(value, v_len, GFP_KERNEL); if (!param.string) return -ENOMEM; param.type = fs_value_is_string; } param.key = key; param.size = v_len; ret = ext4_parse_param(fc, ¶m); kfree(param.string); if (ret < 0) return ret; } } ret = ext4_validate_options(fc); if (ret < 0) return ret; return 0; } static int parse_apply_sb_mount_options(struct super_block *sb, struct ext4_fs_context *m_ctx) { struct ext4_sb_info *sbi = EXT4_SB(sb); char s_mount_opts[64]; struct ext4_fs_context *s_ctx = NULL; struct fs_context *fc = NULL; int ret = -ENOMEM; if (!sbi->s_es->s_mount_opts[0]) return 0; if (strscpy_pad(s_mount_opts, sbi->s_es->s_mount_opts) < 0) return -E2BIG; fc = kzalloc_obj(struct fs_context); if (!fc) return -ENOMEM; s_ctx = kzalloc_obj(struct ext4_fs_context); if (!s_ctx) goto out_free; fc->fs_private = s_ctx; fc->s_fs_info = sbi; ret = parse_options(fc, s_mount_opts); if (ret < 0) goto parse_failed; ret = ext4_check_opt_consistency(fc, sb); if (ret < 0) { parse_failed: ext4_msg(sb, KERN_WARNING, "failed to parse options in superblock: %s", s_mount_opts); ret = 0; goto out_free; } if (s_ctx->spec & EXT4_SPEC_JOURNAL_DEV) m_ctx->journal_devnum = s_ctx->journal_devnum; if (s_ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO) m_ctx->journal_ioprio = s_ctx->journal_ioprio; ext4_apply_options(fc, sb); ret = 0; out_free: ext4_fc_free(fc); kfree(fc); return ret; } static void ext4_apply_quota_options(struct fs_context *fc, struct super_block *sb) { #ifdef CONFIG_QUOTA bool quota_feature = ext4_has_feature_quota(sb); struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = EXT4_SB(sb); char *qname; int i; if (quota_feature) return; if (ctx->spec & EXT4_SPEC_JQUOTA) { for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (!(ctx->qname_spec & (1 << i))) continue; qname = ctx->s_qf_names[i]; /* May be NULL */ if (qname) set_opt(sb, QUOTA); ctx->s_qf_names[i] = NULL; qname = rcu_replace_pointer(sbi->s_qf_names[i], qname, lockdep_is_held(&sb->s_umount)); if (qname) kfree_rcu_mightsleep(qname); } } if (ctx->spec & EXT4_SPEC_JQFMT) sbi->s_jquota_fmt = ctx->s_jquota_fmt; #endif } /* * Check quota settings consistency. 
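 *
 * For instance (illustrative): clearing quota mount options while quota is
 * already loaded, changing a journaled quota file name while quota is on,
 * or mixing the old "usrquota"/"grpquota" style with journaled
 * "usrjquota="/"grpjquota=" names are all rejected below.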
*/ static int ext4_check_quota_consistency(struct fs_context *fc, struct super_block *sb) { #ifdef CONFIG_QUOTA struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = EXT4_SB(sb); bool quota_feature = ext4_has_feature_quota(sb); bool quota_loaded = sb_any_quota_loaded(sb); bool usr_qf_name, grp_qf_name, usrquota, grpquota; int quota_flags, i; /* * We do the test below only for project quotas. 'usrquota' and * 'grpquota' mount options are allowed even without quota feature * to support legacy quotas in quota files. */ if (ctx_test_mount_opt(ctx, EXT4_MOUNT_PRJQUOTA) && !ext4_has_feature_project(sb)) { ext4_msg(NULL, KERN_ERR, "Project quota feature not enabled. " "Cannot enable project quota enforcement."); return -EINVAL; } quota_flags = EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA; if (quota_loaded && ctx->mask_s_mount_opt & quota_flags && !ctx_test_mount_opt(ctx, quota_flags)) goto err_quota_change; if (ctx->spec & EXT4_SPEC_JQUOTA) { for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (!(ctx->qname_spec & (1 << i))) continue; if (quota_loaded && !!sbi->s_qf_names[i] != !!ctx->s_qf_names[i]) goto err_jquota_change; if (sbi->s_qf_names[i] && ctx->s_qf_names[i] && strcmp(get_qf_name(sb, sbi, i), ctx->s_qf_names[i]) != 0) goto err_jquota_specified; } if (quota_feature) { ext4_msg(NULL, KERN_INFO, "Journaled quota options ignored when " "QUOTA feature is enabled"); return 0; } } if (ctx->spec & EXT4_SPEC_JQFMT) { if (sbi->s_jquota_fmt != ctx->s_jquota_fmt && quota_loaded) goto err_jquota_change; if (quota_feature) { ext4_msg(NULL, KERN_INFO, "Quota format mount options " "ignored when QUOTA feature is enabled"); return 0; } } /* Make sure we don't mix old and new quota format */ usr_qf_name = (get_qf_name(sb, sbi, USRQUOTA) || ctx->s_qf_names[USRQUOTA]); grp_qf_name = (get_qf_name(sb, sbi, GRPQUOTA) || ctx->s_qf_names[GRPQUOTA]); usrquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) || test_opt(sb, USRQUOTA)); grpquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) || test_opt(sb, GRPQUOTA)); if (usr_qf_name) { ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA); usrquota = false; } if (grp_qf_name) { ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA); grpquota = false; } if (usr_qf_name || grp_qf_name) { if (usrquota || grpquota) { ext4_msg(NULL, KERN_ERR, "old and new quota " "format mixing"); return -EINVAL; } if (!(ctx->spec & EXT4_SPEC_JQFMT || sbi->s_jquota_fmt)) { ext4_msg(NULL, KERN_ERR, "journaled quota format " "not specified"); return -EINVAL; } } return 0; err_quota_change: ext4_msg(NULL, KERN_ERR, "Cannot change quota options when quota turned on"); return -EINVAL; err_jquota_change: ext4_msg(NULL, KERN_ERR, "Cannot change journaled quota " "options when quota turned on"); return -EINVAL; err_jquota_specified: ext4_msg(NULL, KERN_ERR, "%s quota file already specified", QTYPE2NAME(i)); return -EINVAL; #else return 0; #endif } static int ext4_check_test_dummy_encryption(const struct fs_context *fc, struct super_block *sb) { const struct ext4_fs_context *ctx = fc->fs_private; const struct ext4_sb_info *sbi = EXT4_SB(sb); if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy)) return 0; if (!ext4_has_feature_encrypt(sb)) { ext4_msg(NULL, KERN_WARNING, "test_dummy_encryption requires encrypt feature"); return -EINVAL; } /* * This mount option is just for testing, and it's not worthwhile to * implement the extra complexity (e.g. RCU protection) that would be * needed to allow it to be set or changed during remount. 
We do allow * it to be specified during remount, but only if there is no change. */ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy, &ctx->dummy_enc_policy)) return 0; ext4_msg(NULL, KERN_WARNING, "Can't set or change test_dummy_encryption on remount"); return -EINVAL; } /* Also make sure s_mount_opts didn't contain a conflicting value. */ if (fscrypt_is_dummy_policy_set(&sbi->s_dummy_enc_policy)) { if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy, &ctx->dummy_enc_policy)) return 0; ext4_msg(NULL, KERN_WARNING, "Conflicting test_dummy_encryption options"); return -EINVAL; } return 0; } static void ext4_apply_test_dummy_encryption(struct ext4_fs_context *ctx, struct super_block *sb) { if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy) || /* if already set, it was already verified to be the same */ fscrypt_is_dummy_policy_set(&EXT4_SB(sb)->s_dummy_enc_policy)) return; EXT4_SB(sb)->s_dummy_enc_policy = ctx->dummy_enc_policy; memset(&ctx->dummy_enc_policy, 0, sizeof(ctx->dummy_enc_policy)); ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled"); } static int ext4_check_opt_consistency(struct fs_context *fc, struct super_block *sb) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = fc->s_fs_info; int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE; int err; if ((ctx->opt_flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) { ext4_msg(NULL, KERN_ERR, "Mount option(s) incompatible with ext2"); return -EINVAL; } if ((ctx->opt_flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) { ext4_msg(NULL, KERN_ERR, "Mount option(s) incompatible with ext3"); return -EINVAL; } if (ctx->s_want_extra_isize > (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE)) { ext4_msg(NULL, KERN_ERR, "Invalid want_extra_isize %d", ctx->s_want_extra_isize); return -EINVAL; } err = ext4_check_test_dummy_encryption(fc, sb); if (err) return err; if ((ctx->spec & EXT4_SPEC_DATAJ) && is_remount) { if (!sbi->s_journal) { ext4_msg(NULL, KERN_WARNING, "Remounting file system with no journal " "so ignoring journalled data option"); ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS); } else if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS) != test_opt(sb, DATA_FLAGS)) { ext4_msg(NULL, KERN_ERR, "Cannot change data mode " "on remount"); return -EINVAL; } } if (is_remount) { if (!sbi->s_journal && ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT)) { ext4_msg(NULL, KERN_WARNING, "Remounting fs w/o journal so ignoring data_err option"); ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT); } if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) && (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { ext4_msg(NULL, KERN_ERR, "can't mount with " "both data=journal and dax"); return -EINVAL; } if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) && (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) { fail_dax_change_remount: ext4_msg(NULL, KERN_ERR, "can't change " "dax mount option while remounting"); return -EINVAL; } else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER) && (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS))) { goto fail_dax_change_remount; } else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE) && ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) { goto fail_dax_change_remount; } } return ext4_check_quota_consistency(fc, sb); } static void ext4_apply_options(struct 
fs_context *fc, struct super_block *sb) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = fc->s_fs_info; sbi->s_mount_opt &= ~ctx->mask_s_mount_opt; sbi->s_mount_opt |= ctx->vals_s_mount_opt; sbi->s_mount_opt2 &= ~ctx->mask_s_mount_opt2; sbi->s_mount_opt2 |= ctx->vals_s_mount_opt2; sb->s_flags &= ~ctx->mask_s_flags; sb->s_flags |= ctx->vals_s_flags; #define APPLY(X) ({ if (ctx->spec & EXT4_SPEC_##X) sbi->X = ctx->X; }) APPLY(s_commit_interval); APPLY(s_stripe); APPLY(s_max_batch_time); APPLY(s_min_batch_time); APPLY(s_want_extra_isize); APPLY(s_inode_readahead_blks); APPLY(s_max_dir_size_kb); APPLY(s_li_wait_mult); APPLY(s_resgid); APPLY(s_resuid); #ifdef CONFIG_EXT4_DEBUG APPLY(s_fc_debug_max_replay); #endif ext4_apply_quota_options(fc, sb); ext4_apply_test_dummy_encryption(ctx, sb); } static int ext4_validate_options(struct fs_context *fc) { #ifdef CONFIG_QUOTA struct ext4_fs_context *ctx = fc->fs_private; char *usr_qf_name, *grp_qf_name; usr_qf_name = ctx->s_qf_names[USRQUOTA]; grp_qf_name = ctx->s_qf_names[GRPQUOTA]; if (usr_qf_name || grp_qf_name) { if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) && usr_qf_name) ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA); if (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) && grp_qf_name) ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA); if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) || ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA)) { ext4_msg(NULL, KERN_ERR, "old and new quota " "format mixing"); return -EINVAL; } } #endif return 1; } static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb) { #if defined(CONFIG_QUOTA) struct ext4_sb_info *sbi = EXT4_SB(sb); char *usr_qf_name, *grp_qf_name; if (sbi->s_jquota_fmt) { char *fmtname = ""; switch (sbi->s_jquota_fmt) { case QFMT_VFS_OLD: fmtname = "vfsold"; break; case QFMT_VFS_V0: fmtname = "vfsv0"; break; case QFMT_VFS_V1: fmtname = "vfsv1"; break; } seq_printf(seq, ",jqfmt=%s", fmtname); } rcu_read_lock(); usr_qf_name = rcu_dereference(sbi->s_qf_names[USRQUOTA]); grp_qf_name = rcu_dereference(sbi->s_qf_names[GRPQUOTA]); if (usr_qf_name) seq_show_option(seq, "usrjquota", usr_qf_name); if (grp_qf_name) seq_show_option(seq, "grpjquota", grp_qf_name); rcu_read_unlock(); #endif } static const char *token2str(int token) { const struct fs_parameter_spec *spec; for (spec = ext4_param_specs; spec->name != NULL; spec++) if (spec->opt == token && !spec->type) break; return spec->name; } /* * Show an option if * - it's set to a non-default value OR * - if the per-sb default is different from the global default */ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, int nodefs) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int def_errors; const struct mount_opts *m; char sep = nodefs ? 
'\n' : ','; #define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep) #define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg) if (sbi->s_sb_block != 1) SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block); for (m = ext4_mount_opts; m->token != Opt_err; m++) { int want_set = m->flags & MOPT_SET; int opt_2 = m->flags & MOPT_2; unsigned int mount_opt, def_mount_opt; if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || m->flags & MOPT_SKIP) continue; if (opt_2) { mount_opt = sbi->s_mount_opt2; def_mount_opt = sbi->s_def_mount_opt2; } else { mount_opt = sbi->s_mount_opt; def_mount_opt = sbi->s_def_mount_opt; } /* skip if same as the default */ if (!nodefs && !(m->mount_opt & (mount_opt ^ def_mount_opt))) continue; /* select Opt_noFoo vs Opt_Foo */ if ((want_set && (mount_opt & m->mount_opt) != m->mount_opt) || (!want_set && (mount_opt & m->mount_opt))) continue; SEQ_OPTS_PRINT("%s", token2str(m->token)); } if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) || ext4_get_resuid(es) != EXT4_DEF_RESUID) SEQ_OPTS_PRINT("resuid=%u", from_kuid_munged(&init_user_ns, sbi->s_resuid)); if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) || ext4_get_resgid(es) != EXT4_DEF_RESGID) SEQ_OPTS_PRINT("resgid=%u", from_kgid_munged(&init_user_ns, sbi->s_resgid)); def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors); if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO) SEQ_OPTS_PUTS("errors=remount-ro"); if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) SEQ_OPTS_PUTS("errors=continue"); if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) SEQ_OPTS_PUTS("errors=panic"); if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ); if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); if (nodefs && sb->s_flags & SB_I_VERSION) SEQ_OPTS_PUTS("i_version"); if (nodefs || sbi->s_stripe) SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); if (nodefs || EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) SEQ_OPTS_PUTS("data=journal"); else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) SEQ_OPTS_PUTS("data=ordered"); else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) SEQ_OPTS_PUTS("data=writeback"); } if (nodefs || sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) SEQ_OPTS_PRINT("inode_readahead_blks=%u", sbi->s_inode_readahead_blks); if (test_opt(sb, INIT_INODE_TABLE) && (nodefs || (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); if (nodefs || sbi->s_max_dir_size_kb) SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); if (test_opt(sb, DATA_ERR_ABORT)) SEQ_OPTS_PUTS("data_err=abort"); fscrypt_show_test_dummy_encryption(seq, sep, sb); if (sb->s_flags & SB_INLINECRYPT) SEQ_OPTS_PUTS("inlinecrypt"); if (test_opt(sb, DAX_ALWAYS)) { if (IS_EXT2_SB(sb)) SEQ_OPTS_PUTS("dax"); else SEQ_OPTS_PUTS("dax=always"); } else if (test_opt2(sb, DAX_NEVER)) { SEQ_OPTS_PUTS("dax=never"); } else if (test_opt2(sb, DAX_INODE)) { SEQ_OPTS_PUTS("dax=inode"); } if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD && !test_opt2(sb, MB_OPTIMIZE_SCAN)) { SEQ_OPTS_PUTS("mb_optimize_scan=0"); } else if (sbi->s_groups_count < 
MB_DEFAULT_LINEAR_SCAN_THRESHOLD && test_opt2(sb, MB_OPTIMIZE_SCAN)) { SEQ_OPTS_PUTS("mb_optimize_scan=1"); } if (nodefs && !test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) SEQ_OPTS_PUTS("prefetch_block_bitmaps"); if (ext4_emergency_ro(sb)) SEQ_OPTS_PUTS("emergency_ro"); if (ext4_forced_shutdown(sb)) SEQ_OPTS_PUTS("shutdown"); ext4_show_quota_options(seq, sb); return 0; } static int ext4_show_options(struct seq_file *seq, struct dentry *root) { return _ext4_show_options(seq, root->d_sb, 0); } int ext4_seq_options_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; int rc; seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw"); rc = _ext4_show_options(seq, sb, 1); seq_putc(seq, '\n'); return rc; } static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, int read_only) { struct ext4_sb_info *sbi = EXT4_SB(sb); int err = 0; if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { ext4_msg(sb, KERN_ERR, "revision level too high, " "forcing read-only mode"); err = -EROFS; goto done; } if (read_only) goto done; if (!(sbi->s_mount_state & EXT4_VALID_FS)) ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " "running e2fsck is recommended"); else if (sbi->s_mount_state & EXT4_ERROR_FS) ext4_msg(sb, KERN_WARNING, "warning: mounting fs with errors, " "running e2fsck is recommended"); else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && le16_to_cpu(es->s_mnt_count) >= (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) ext4_msg(sb, KERN_WARNING, "warning: maximal mount count reached, " "running e2fsck is recommended"); else if (le32_to_cpu(es->s_checkinterval) && (ext4_get_tstamp(es, s_lastcheck) + le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds())) ext4_msg(sb, KERN_WARNING, "warning: checktime reached, " "running e2fsck is recommended"); if (!sbi->s_journal) es->s_state &= cpu_to_le16(~EXT4_VALID_FS); if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); ext4_update_tstamp(es, s_mtime); if (sbi->s_journal) { ext4_set_feature_journal_needs_recovery(sb); if (ext4_has_feature_orphan_file(sb)) ext4_set_feature_orphan_present(sb); } err = ext4_commit_super(sb); done: if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", sb->s_blocksize, sbi->s_groups_count, EXT4_BLOCKS_PER_GROUP(sb), EXT4_INODES_PER_GROUP(sb), sbi->s_mount_opt, sbi->s_mount_opt2); return err; } int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct flex_groups **old_groups, **new_groups; int size, i, j; if (!sbi->s_log_groups_per_flex) return 0; size = ext4_flex_group(sbi, ngroup - 1) + 1; if (size <= sbi->s_flex_groups_allocated) return 0; new_groups = kvzalloc(roundup_pow_of_two(size * sizeof(*sbi->s_flex_groups)), GFP_KERNEL); if (!new_groups) { ext4_msg(sb, KERN_ERR, "not enough memory for %d flex group pointers", size); return -ENOMEM; } for (i = sbi->s_flex_groups_allocated; i < size; i++) { new_groups[i] = kvzalloc(roundup_pow_of_two( sizeof(struct flex_groups)), GFP_KERNEL); if (!new_groups[i]) { for (j = sbi->s_flex_groups_allocated; j < i; j++) kvfree(new_groups[j]); kvfree(new_groups); ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups", size); return -ENOMEM; } } rcu_read_lock(); old_groups = rcu_dereference(sbi->s_flex_groups); if (old_groups) memcpy(new_groups, old_groups, (sbi->s_flex_groups_allocated * sizeof(struct flex_groups *))); 
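	/*
	 * The old pointer array is copied under rcu_read_lock(); the
	 * enlarged array is then published with rcu_assign_pointer() and
	 * the old one is freed via ext4_kvfree_array_rcu(), so concurrent
	 * readers never observe a freed array.
	 */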
rcu_read_unlock(); rcu_assign_pointer(sbi->s_flex_groups, new_groups); sbi->s_flex_groups_allocated = size; if (old_groups) ext4_kvfree_array_rcu(old_groups); return 0; } static int ext4_fill_flex_info(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp = NULL; struct flex_groups *fg; ext4_group_t flex_group; int i, err; sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { sbi->s_log_groups_per_flex = 0; return 1; } err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); if (err) goto failed; for (i = 0; i < sbi->s_groups_count; i++) { gdp = ext4_get_group_desc(sb, i, NULL); flex_group = ext4_flex_group(sbi, i); fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group); atomic_add(ext4_free_inodes_count(sb, gdp), &fg->free_inodes); atomic64_add(ext4_free_group_clusters(sb, gdp), &fg->free_clusters); atomic_add(ext4_used_dirs_count(sb, gdp), &fg->used_dirs); } return 1; failed: return 0; } static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { int offset = offsetof(struct ext4_group_desc, bg_checksum); __u16 crc = 0; __le32 le_group = cpu_to_le32(block_group); struct ext4_sb_info *sbi = EXT4_SB(sb); if (ext4_has_feature_metadata_csum(sbi->s_sb)) { /* Use new metadata_csum algorithm */ __u32 csum32; __u16 dummy_csum = 0; csum32 = ext4_chksum(sbi->s_csum_seed, (__u8 *)&le_group, sizeof(le_group)); csum32 = ext4_chksum(csum32, (__u8 *)gdp, offset); csum32 = ext4_chksum(csum32, (__u8 *)&dummy_csum, sizeof(dummy_csum)); offset += sizeof(dummy_csum); if (offset < sbi->s_desc_size) csum32 = ext4_chksum(csum32, (__u8 *)gdp + offset, sbi->s_desc_size - offset); crc = csum32 & 0xFFFF; goto out; } /* old crc16 code */ if (!ext4_has_feature_gdt_csum(sb)) return 0; crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); crc = crc16(crc, (__u8 *)gdp, offset); offset += sizeof(gdp->bg_checksum); /* skip checksum */ /* for checksum of struct ext4_group_desc do the rest...*/ if (ext4_has_feature_64bit(sb) && offset < sbi->s_desc_size) crc = crc16(crc, (__u8 *)gdp + offset, sbi->s_desc_size - offset); out: return cpu_to_le16(crc); } int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { if (ext4_has_group_desc_csum(sb) && (gdp->bg_checksum != ext4_group_desc_csum(sb, block_group, gdp))) return 0; return 1; } void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { if (!ext4_has_group_desc_csum(sb)) return; gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp); } /* Called at mount-time, super-block is locked */ static int ext4_check_descriptors(struct super_block *sb, ext4_fsblk_t sb_block, ext4_group_t *first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); ext4_fsblk_t last_block; ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0); ext4_fsblk_t block_bitmap; ext4_fsblk_t inode_bitmap; ext4_fsblk_t inode_table; int flexbg_flag = 0; ext4_group_t i, grp = sbi->s_groups_count; if (ext4_has_feature_flex_bg(sb)) flexbg_flag = 1; ext4_debug("Checking group descriptors"); for (i = 0; i < sbi->s_groups_count; i++) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); if (i == sbi->s_groups_count - 1 || flexbg_flag) last_block = ext4_blocks_count(sbi->s_es) - 1; else last_block 
= first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1); if ((grp == sbi->s_groups_count) && !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) grp = i; block_bitmap = ext4_block_bitmap(sb, gdp); if (block_bitmap == sb_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u overlaps " "superblock", i); if (!sb_rdonly(sb)) return 0; } if (block_bitmap >= sb_block + 1 && block_bitmap <= last_bg_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u overlaps " "block group descriptors", i); if (!sb_rdonly(sb)) return 0; } if (block_bitmap < first_block || block_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u not in group " "(block %llu)!", i, block_bitmap); return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); if (inode_bitmap == sb_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u overlaps " "superblock", i); if (!sb_rdonly(sb)) return 0; } if (inode_bitmap >= sb_block + 1 && inode_bitmap <= last_bg_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u overlaps " "block group descriptors", i); if (!sb_rdonly(sb)) return 0; } if (inode_bitmap < first_block || inode_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u not in group " "(block %llu)!", i, inode_bitmap); return 0; } inode_table = ext4_inode_table(sb, gdp); if (inode_table == sb_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u overlaps " "superblock", i); if (!sb_rdonly(sb)) return 0; } if (inode_table >= sb_block + 1 && inode_table <= last_bg_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u overlaps " "block group descriptors", i); if (!sb_rdonly(sb)) return 0; } if (inode_table < first_block || inode_table + sbi->s_itb_per_group - 1 > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u not in group " "(block %llu)!", i, inode_table); return 0; } ext4_lock_group(sb, i); if (!ext4_group_desc_csum_verify(sb, i, gdp)) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Checksum for group %u failed (%u!=%u)", i, le16_to_cpu(ext4_group_desc_csum(sb, i, gdp)), le16_to_cpu(gdp->bg_checksum)); if (!sb_rdonly(sb)) { ext4_unlock_group(sb, i); return 0; } } ext4_unlock_group(sb, i); if (!flexbg_flag) first_block += EXT4_BLOCKS_PER_GROUP(sb); } if (NULL != first_not_zeroed) *first_not_zeroed = grp; return 1; } /* * Maximal extent format file size. * Resulting logical blkno at s_maxbytes must fit in our on-disk * extent format containers, within a sector_t, and within i_blocks * in the vfs. ext4 inode has 48 bits of i_block in fsblock units, * so that won't be a limiting factor. * * However there is other limiting factor. We do store extents in the form * of starting block and length, hence the resulting length of the extent * covering maximum file size must fit into on-disk format containers as * well. Given that length is always by 1 unit bigger than max unit (because * we count 0 as well) we have to lower the s_maxbytes by one fs block. * * Note, this does *not* consider any metadata overhead for vfs i_blocks. 
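 *
 * Worked example (illustrative, 4KiB blocks, huge_file enabled): the extent
 * tree can address at most 2^32 - 1 logical blocks, so
 * res = (2^32 - 1) << 12, i.e. just under 16 TiB, which is then clamped to
 * MAX_LFS_FILESIZE if that happens to be smaller.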
*/ static loff_t ext4_max_size(int blkbits, int has_huge_files) { loff_t res; loff_t upper_limit = MAX_LFS_FILESIZE; BUILD_BUG_ON(sizeof(blkcnt_t) < sizeof(u64)); if (!has_huge_files) { upper_limit = (1LL << 32) - 1; /* total blocks in file system block size */ upper_limit >>= (blkbits - 9); upper_limit <<= blkbits; } /* * 32-bit extent-start container, ee_block. We lower the maxbytes * by one fs block, so ee_len can cover the extent of maximum file * size */ res = (1LL << 32) - 1; res <<= blkbits; /* Sanity check against vm- & vfs- imposed limits */ if (res > upper_limit) res = upper_limit; return res; } /* * Maximal bitmap file size. There is a direct, and {,double-,triple-}indirect * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks. * We need to be 1 filesystem block less than the 2^48 sector limit. */ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) { loff_t upper_limit, res = EXT4_NDIR_BLOCKS; int meta_blocks; unsigned int ppb = 1 << (bits - 2); /* * This is calculated to be the largest file size for a dense, block * mapped file such that the file's total number of 512-byte sectors, * including data and all indirect blocks, does not exceed (2^48 - 1). * * __u32 i_blocks_lo and _u16 i_blocks_high represent the total * number of 512-byte sectors of the file. */ if (!has_huge_files) { /* * !has_huge_files or implies that the inode i_block field * represents total file blocks in 2^32 512-byte sectors == * size of vfs inode i_blocks * 8 */ upper_limit = (1LL << 32) - 1; /* total blocks in file system block size */ upper_limit >>= (bits - 9); } else { /* * We use 48 bit ext4_inode i_blocks * With EXT4_HUGE_FILE_FL set the i_blocks * represent total number of blocks in * file system block size */ upper_limit = (1LL << 48) - 1; } /* Compute how many blocks we can address by block tree */ res += ppb; res += ppb * ppb; res += ((loff_t)ppb) * ppb * ppb; /* Compute how many metadata blocks are needed */ meta_blocks = 1; meta_blocks += 1 + ppb; meta_blocks += 1 + ppb + ppb * ppb; /* Does block tree limit file size? */ if (res + meta_blocks <= upper_limit) goto check_lfs; res = upper_limit; /* How many metadata blocks are needed for addressing upper_limit? */ upper_limit -= EXT4_NDIR_BLOCKS; /* indirect blocks */ meta_blocks = 1; upper_limit -= ppb; /* double indirect blocks */ if (upper_limit < ppb * ppb) { meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb); res -= meta_blocks; goto check_lfs; } meta_blocks += 1 + ppb; upper_limit -= ppb * ppb; /* tripple indirect blocks for the rest */ meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb) + DIV_ROUND_UP_ULL(upper_limit, ppb*ppb); res -= meta_blocks; check_lfs: res <<= bits; if (res > MAX_LFS_FILESIZE) res = MAX_LFS_FILESIZE; return res; } static ext4_fsblk_t descriptor_loc(struct super_block *sb, ext4_fsblk_t logical_sb_block, int nr) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t bg, first_meta_bg; int has_super = 0; first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg) return logical_sb_block + nr + 1; bg = sbi->s_desc_per_block * nr; if (ext4_bg_has_super(sb, bg)) has_super = 1; /* * If we have a meta_bg fs with 1k blocks, group 0's GDT is at * block 2, not 1. If s_first_data_block == 0 (bigalloc is enabled * on modern mke2fs or blksize > 1k on older mke2fs) then we must * compensate. 
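 *
 * Example (illustrative): with meta_bg, descriptor block nr lives in block
 * group bg = s_desc_per_block * nr, at the first block of that group, plus
 * one block if the group carries a superblock backup (and, for nr == 0, one
 * more block in the 1k-blocksize case handled below).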
*/ if (sb->s_blocksize == 1024 && nr == 0 && le32_to_cpu(sbi->s_es->s_first_data_block) == 0) has_super++; return (has_super + ext4_group_first_block_no(sb, bg)); } /** * ext4_get_stripe_size: Get the stripe size. * @sbi: In memory super block info * * If we have specified it via mount option, then * use the mount option value. If the value specified at mount time is * greater than the blocks per group use the super block value. * If the super block value is greater than blocks per group return 0. * Allocator needs it be less than blocks per group. * */ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) { unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); unsigned long stripe_width = le32_to_cpu(sbi->s_es->s_raid_stripe_width); int ret; if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) ret = sbi->s_stripe; else if (stripe_width && stripe_width <= sbi->s_blocks_per_group) ret = stripe_width; else if (stride && stride <= sbi->s_blocks_per_group) ret = stride; else ret = 0; /* * If the stripe width is 1, this makes no sense and * we set it to 0 to turn off stripe handling code. */ if (ret <= 1) ret = 0; return ret; } /* * Check whether this filesystem can be mounted based on * the features present and the RDONLY/RDWR mount requested. * Returns 1 if this filesystem can be mounted as requested, * 0 if it cannot be. */ int ext4_feature_set_ok(struct super_block *sb, int readonly) { if (ext4_has_unknown_ext4_incompat_features(sb)) { ext4_msg(sb, KERN_ERR, "Couldn't mount because of " "unsupported optional features (%x)", (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & ~EXT4_FEATURE_INCOMPAT_SUPP)); return 0; } if (!IS_ENABLED(CONFIG_UNICODE) && ext4_has_feature_casefold(sb)) { ext4_msg(sb, KERN_ERR, "Filesystem with casefold feature cannot be " "mounted without CONFIG_UNICODE"); return 0; } if (readonly) return 1; if (ext4_has_feature_readonly(sb)) { ext4_msg(sb, KERN_INFO, "filesystem is read-only"); sb->s_flags |= SB_RDONLY; return 1; } /* Check that feature set is OK for a read-write mount */ if (ext4_has_unknown_ext4_ro_compat_features(sb)) { ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " "unsupported optional features (%x)", (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & ~EXT4_FEATURE_RO_COMPAT_SUPP)); return 0; } if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) { ext4_msg(sb, KERN_ERR, "Can't support bigalloc feature without " "extents feature\n"); return 0; } if (ext4_has_feature_bigalloc(sb) && le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { ext4_msg(sb, KERN_WARNING, "bad geometry: bigalloc file system with non-zero " "first_data_block\n"); return 0; } #if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2) if (!readonly && (ext4_has_feature_quota(sb) || ext4_has_feature_project(sb))) { ext4_msg(sb, KERN_ERR, "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2"); return 0; } #endif /* CONFIG_QUOTA */ return 1; } /* * This function is called once a day by default if we have errors logged * on the file system. * Use the err_report_sec sysfs attribute to disable or adjust its call * freequency. */ void print_daily_error_info(struct timer_list *t) { struct ext4_sb_info *sbi = timer_container_of(sbi, t, s_err_report); struct super_block *sb = sbi->s_sb; struct ext4_super_block *es = sbi->s_es; if (es->s_error_count) /* fsck newer than v1.41.13 is needed to clean this condition. 
*/ ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u", le32_to_cpu(es->s_error_count)); if (es->s_first_error_time) { printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d", sb->s_id, ext4_get_tstamp(es, s_first_error_time), (int) sizeof(es->s_first_error_func), es->s_first_error_func, le32_to_cpu(es->s_first_error_line)); if (es->s_first_error_ino) printk(KERN_CONT ": inode %u", le32_to_cpu(es->s_first_error_ino)); if (es->s_first_error_block) printk(KERN_CONT ": block %llu", (unsigned long long) le64_to_cpu(es->s_first_error_block)); printk(KERN_CONT "\n"); } if (es->s_last_error_time) { printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d", sb->s_id, ext4_get_tstamp(es, s_last_error_time), (int) sizeof(es->s_last_error_func), es->s_last_error_func, le32_to_cpu(es->s_last_error_line)); if (es->s_last_error_ino) printk(KERN_CONT ": inode %u", le32_to_cpu(es->s_last_error_ino)); if (es->s_last_error_block) printk(KERN_CONT ": block %llu", (unsigned long long) le64_to_cpu(es->s_last_error_block)); printk(KERN_CONT "\n"); } if (sbi->s_err_report_sec) mod_timer(&sbi->s_err_report, jiffies + secs_to_jiffies(sbi->s_err_report_sec)); } /* Find next suitable group and run ext4_init_inode_table */ static int ext4_run_li_request(struct ext4_li_request *elr) { struct ext4_group_desc *gdp = NULL; struct super_block *sb = elr->lr_super; ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; ext4_group_t group = elr->lr_next_group; unsigned int prefetch_ios = 0; int ret = 0; int nr = EXT4_SB(sb)->s_mb_prefetch; u64 start_time; if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) { elr->lr_next_group = ext4_mb_prefetch(sb, group, nr, &prefetch_ios); ext4_mb_prefetch_fini(sb, elr->lr_next_group, nr); trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, nr); if (group >= elr->lr_next_group) { ret = 1; if (elr->lr_first_not_zeroed != ngroups && !ext4_emergency_state(sb) && !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) { elr->lr_next_group = elr->lr_first_not_zeroed; elr->lr_mode = EXT4_LI_MODE_ITABLE; ret = 0; } } return ret; } for (; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) { ret = 1; break; } if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) break; } if (group >= ngroups) ret = 1; if (!ret) { start_time = ktime_get_ns(); ret = ext4_init_inode_table(sb, group, elr->lr_timeout ? 0 : 1); trace_ext4_lazy_itable_init(sb, group); if (elr->lr_timeout == 0) { elr->lr_timeout = nsecs_to_jiffies((ktime_get_ns() - start_time) * EXT4_SB(elr->lr_super)->s_li_wait_mult); } elr->lr_next_sched = jiffies + elr->lr_timeout; elr->lr_next_group = group + 1; } return ret; } /* * Remove lr_request from the list_request and free the * request structure. Should be called with li_list_mtx held */ static void ext4_remove_li_request(struct ext4_li_request *elr) { if (!elr) return; list_del(&elr->lr_request); EXT4_SB(elr->lr_super)->s_li_request = NULL; kfree(elr); } static void ext4_unregister_li_request(struct super_block *sb) { mutex_lock(&ext4_li_mtx); if (!ext4_li_info) { mutex_unlock(&ext4_li_mtx); return; } mutex_lock(&ext4_li_info->li_list_mtx); ext4_remove_li_request(EXT4_SB(sb)->s_li_request); mutex_unlock(&ext4_li_info->li_list_mtx); mutex_unlock(&ext4_li_mtx); } static struct task_struct *ext4_lazyinit_task; /* * This is the function where ext4lazyinit thread lives. It walks * through the request list searching for next scheduled filesystem. 
* When such a fs is found, run the lazy initialization request * (ext4_run_li_request) and keep track of the time spent in this * function. Based on that time we compute next schedule time of * the request. When walking through the list is complete, compute * next waking time and puts itself to sleep. */ static int ext4_lazyinit_thread(void *arg) { struct ext4_lazy_init *eli = arg; struct list_head *pos, *n; struct ext4_li_request *elr; unsigned long next_wakeup, cur; BUG_ON(NULL == eli); set_freezable(); cont_thread: while (true) { bool next_wakeup_initialized = false; next_wakeup = 0; mutex_lock(&eli->li_list_mtx); if (list_empty(&eli->li_request_list)) { mutex_unlock(&eli->li_list_mtx); goto exit_thread; } list_for_each_safe(pos, n, &eli->li_request_list) { int err = 0; int progress = 0; elr = list_entry(pos, struct ext4_li_request, lr_request); if (time_before(jiffies, elr->lr_next_sched)) { if (!next_wakeup_initialized || time_before(elr->lr_next_sched, next_wakeup)) { next_wakeup = elr->lr_next_sched; next_wakeup_initialized = true; } continue; } if (down_read_trylock(&elr->lr_super->s_umount)) { if (sb_start_write_trylock(elr->lr_super)) { progress = 1; /* * We hold sb->s_umount, sb can not * be removed from the list, it is * now safe to drop li_list_mtx */ mutex_unlock(&eli->li_list_mtx); err = ext4_run_li_request(elr); sb_end_write(elr->lr_super); mutex_lock(&eli->li_list_mtx); n = pos->next; } up_read((&elr->lr_super->s_umount)); } /* error, remove the lazy_init job */ if (err) { ext4_remove_li_request(elr); continue; } if (!progress) { elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); } if (!next_wakeup_initialized || time_before(elr->lr_next_sched, next_wakeup)) { next_wakeup = elr->lr_next_sched; next_wakeup_initialized = true; } } mutex_unlock(&eli->li_list_mtx); try_to_freeze(); cur = jiffies; if (!next_wakeup_initialized || time_after_eq(cur, next_wakeup)) { cond_resched(); continue; } schedule_timeout_interruptible(next_wakeup - cur); if (kthread_should_stop()) { ext4_clear_request_list(); goto exit_thread; } } exit_thread: /* * It looks like the request list is empty, but we need * to check it under the li_list_mtx lock, to prevent any * additions into it, and of course we should lock ext4_li_mtx * to atomically free the list and ext4_li_info, because at * this point another ext4 filesystem could be registering * a new one. */ mutex_lock(&ext4_li_mtx); mutex_lock(&eli->li_list_mtx); if (!list_empty(&eli->li_request_list)) { mutex_unlock(&eli->li_list_mtx); mutex_unlock(&ext4_li_mtx); goto cont_thread; } mutex_unlock(&eli->li_list_mtx); kfree(ext4_li_info); ext4_li_info = NULL; mutex_unlock(&ext4_li_mtx); return 0; } static void ext4_clear_request_list(void) { struct list_head *pos, *n; struct ext4_li_request *elr; mutex_lock(&ext4_li_info->li_list_mtx); list_for_each_safe(pos, n, &ext4_li_info->li_request_list) { elr = list_entry(pos, struct ext4_li_request, lr_request); ext4_remove_li_request(elr); } mutex_unlock(&ext4_li_info->li_list_mtx); } static int ext4_run_lazyinit_thread(void) { ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit"); if (IS_ERR(ext4_lazyinit_task)) { int err = PTR_ERR(ext4_lazyinit_task); ext4_clear_request_list(); kfree(ext4_li_info); ext4_li_info = NULL; printk(KERN_CRIT "EXT4-fs: error %d creating inode table " "initialization thread\n", err); return err; } ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; return 0; } /* * Check whether it makes sense to run itable init.
thread or not. * If there is at least one uninitialized inode table, return * corresponding group number, else the loop goes through all * groups and return total number of groups. */ static ext4_group_t ext4_has_uninit_itable(struct super_block *sb) { ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count; struct ext4_group_desc *gdp = NULL; if (!ext4_has_group_desc_csum(sb)) return ngroups; for (group = 0; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) continue; if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) break; } return group; } static int ext4_li_info_new(void) { struct ext4_lazy_init *eli = NULL; eli = kzalloc_obj(*eli); if (!eli) return -ENOMEM; INIT_LIST_HEAD(&eli->li_request_list); mutex_init(&eli->li_list_mtx); eli->li_state |= EXT4_LAZYINIT_QUIT; ext4_li_info = eli; return 0; } static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, ext4_group_t start) { struct ext4_li_request *elr; elr = kzalloc_obj(*elr); if (!elr) return NULL; elr->lr_super = sb; elr->lr_first_not_zeroed = start; if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) { elr->lr_mode = EXT4_LI_MODE_ITABLE; elr->lr_next_group = start; } else { elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; } /* * Randomize first schedule time of the request to * spread the inode table initialization requests * better. */ elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); return elr; } int ext4_register_li_request(struct super_block *sb, ext4_group_t first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_li_request *elr = NULL; ext4_group_t ngroups = sbi->s_groups_count; int ret = 0; mutex_lock(&ext4_li_mtx); if (sbi->s_li_request != NULL) { /* * Reset timeout so it can be computed again, because * s_li_wait_mult might have changed. */ sbi->s_li_request->lr_timeout = 0; goto out; } if (ext4_emergency_state(sb) || sb_rdonly(sb) || (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) && (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE)))) goto out; elr = ext4_li_request_new(sb, first_not_zeroed); if (!elr) { ret = -ENOMEM; goto out; } if (NULL == ext4_li_info) { ret = ext4_li_info_new(); if (ret) goto out; } mutex_lock(&ext4_li_info->li_list_mtx); list_add(&elr->lr_request, &ext4_li_info->li_request_list); mutex_unlock(&ext4_li_info->li_list_mtx); sbi->s_li_request = elr; /* * set elr to NULL here since it has been inserted to * the request_list and the removal and free of it is * handled by ext4_clear_request_list from now on. */ elr = NULL; if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { ret = ext4_run_lazyinit_thread(); if (ret) goto out; } out: mutex_unlock(&ext4_li_mtx); if (ret) kfree(elr); return ret; } /* * We do not need to lock anything since this is called on * module unload. */ static void ext4_destroy_lazyinit_thread(void) { /* * If thread exited earlier * there's nothing to be done. 
*/ if (!ext4_li_info || !ext4_lazyinit_task) return; kthread_stop(ext4_lazyinit_task); } static int set_journal_csum_feature_set(struct super_block *sb) { int ret = 1; int compat, incompat; struct ext4_sb_info *sbi = EXT4_SB(sb); if (ext4_has_feature_metadata_csum(sb)) { /* journal checksum v3 */ compat = 0; incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; } else { /* journal checksum v1 */ compat = JBD2_FEATURE_COMPAT_CHECKSUM; incompat = 0; } jbd2_journal_clear_features(sbi->s_journal, JBD2_FEATURE_COMPAT_CHECKSUM, 0, JBD2_FEATURE_INCOMPAT_CSUM_V3 | JBD2_FEATURE_INCOMPAT_CSUM_V2); if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ret = jbd2_journal_set_features(sbi->s_journal, compat, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | incompat); } else if (test_opt(sb, JOURNAL_CHECKSUM)) { ret = jbd2_journal_set_features(sbi->s_journal, compat, 0, incompat); jbd2_journal_clear_features(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); } else { jbd2_journal_clear_features(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); } return ret; } /* * Note: calculating the overhead so we can be compatible with * historical BSD practice is quite difficult in the face of * clusters/bigalloc. This is because multiple metadata blocks from * different block group can end up in the same allocation cluster. * Calculating the exact overhead in the face of clustered allocation * requires either O(all block bitmaps) in memory or O(number of block * groups**2) in time. We will still calculate the superblock for * older file systems --- and if we come across with a bigalloc file * system with zero in s_overhead_clusters the estimate will be close to * correct especially for very large cluster sizes --- but for newer * file systems, it's better to calculate this figure once at mkfs * time, and store it in the superblock. If the superblock value is * present (even for non-bigalloc file systems), we will use it. */ static int count_overhead(struct super_block *sb, ext4_group_t grp, char *buf) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp; ext4_fsblk_t first_block, last_block, b; ext4_group_t i, ngroups = ext4_get_groups_count(sb); int s, j, count = 0; int has_super = ext4_bg_has_super(sb, grp); if (!ext4_has_feature_bigalloc(sb)) return (has_super + ext4_bg_num_gdb(sb, grp) + (has_super ? 
le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) + sbi->s_itb_per_group + 2); first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + (grp * EXT4_BLOCKS_PER_GROUP(sb)); last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); b = ext4_block_bitmap(sb, gdp); if (b >= first_block && b <= last_block) { ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); count++; } b = ext4_inode_bitmap(sb, gdp); if (b >= first_block && b <= last_block) { ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); count++; } b = ext4_inode_table(sb, gdp); if (b >= first_block && b + sbi->s_itb_per_group <= last_block) for (j = 0; j < sbi->s_itb_per_group; j++, b++) { int c = EXT4_B2C(sbi, b - first_block); ext4_set_bit(c, buf); count++; } if (i != grp) continue; s = 0; if (ext4_bg_has_super(sb, grp)) { ext4_set_bit(s++, buf); count++; } j = ext4_bg_num_gdb(sb, grp); if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) { ext4_error(sb, "Invalid number of block group " "descriptor blocks: %d", j); j = EXT4_BLOCKS_PER_GROUP(sb) - s; } count += j; for (; j > 0; j--) ext4_set_bit(EXT4_B2C(sbi, s++), buf); } if (!count) return 0; return EXT4_CLUSTERS_PER_GROUP(sb) - ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8); } /* * Compute the overhead and stash it in sbi->s_overhead */ int ext4_calculate_overhead(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct inode *j_inode; unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum); ext4_group_t i, ngroups = ext4_get_groups_count(sb); ext4_fsblk_t overhead = 0; char *buf = kvmalloc(sb->s_blocksize, GFP_NOFS | __GFP_ZERO); if (!buf) return -ENOMEM; /* * Compute the overhead (FS structures). This is constant * for a given filesystem unless the number of block groups * changes so we cache the previous value until it does. */ /* * All of the blocks before first_data_block are overhead */ overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block)); /* * Add the overhead found in each block group */ for (i = 0; i < ngroups; i++) { int blks; blks = count_overhead(sb, i, buf); overhead += blks; if (blks) memset(buf, 0, sb->s_blocksize); cond_resched(); } /* * Add the internal journal blocks whether the journal has been * loaded or not */ if (sbi->s_journal && !sbi->s_journal_bdev_file) overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len); else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { /* j_inum for internal journal is non-zero */ j_inode = ext4_get_journal_inode(sb, j_inum); if (!IS_ERR(j_inode)) { j_blocks = j_inode->i_size >> sb->s_blocksize_bits; overhead += EXT4_NUM_B2C(sbi, j_blocks); iput(j_inode); } else { ext4_msg(sb, KERN_ERR, "can't get journal size"); } } sbi->s_overhead = overhead; smp_wmb(); kvfree(buf); return 0; } static void ext4_set_resv_clusters(struct super_block *sb) { ext4_fsblk_t resv_clusters; struct ext4_sb_info *sbi = EXT4_SB(sb); /* * There's no need to reserve anything when we aren't using extents. * The space estimates are exact, there are no unwritten extents, * hole punching doesn't need new metadata... This is needed especially * to keep ext2/3 backward compatibility. */ if (!ext4_has_feature_extents(sb)) return; /* * By default we reserve 2% or 4096 clusters, whichever is smaller. * This should cover the situations where we can not afford to run * out of space like for example punch hole, or converting * unwritten extents in delalloc path. 
In most cases such * allocation would require 1, or 2 blocks, higher numbers are * very rare. */ resv_clusters = (ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits); do_div(resv_clusters, 50); resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); atomic64_set(&sbi->s_resv_clusters, resv_clusters); } static const char *ext4_quota_mode(struct super_block *sb) { #ifdef CONFIG_QUOTA if (!ext4_quota_capable(sb)) return "none"; if (EXT4_SB(sb)->s_journal && ext4_is_quota_journalled(sb)) return "journalled"; else return "writeback"; #else return "disabled"; #endif } static void ext4_setup_csum_trigger(struct super_block *sb, enum ext4_journal_trigger_type type, void (*trigger)( struct jbd2_buffer_trigger_type *type, struct buffer_head *bh, void *mapped_data, size_t size)) { struct ext4_sb_info *sbi = EXT4_SB(sb); sbi->s_journal_triggers[type].sb = sb; sbi->s_journal_triggers[type].tr_triggers.t_frozen = trigger; } static void ext4_free_sbi(struct ext4_sb_info *sbi) { if (!sbi) return; kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); } static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb) { struct ext4_sb_info *sbi; sbi = kzalloc_obj(*sbi); if (!sbi) return NULL; sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off, NULL, NULL); sbi->s_blockgroup_lock = kzalloc_obj(struct blockgroup_lock); if (!sbi->s_blockgroup_lock) goto err_out; sb->s_fs_info = sbi; sbi->s_sb = sb; return sbi; err_out: fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); return NULL; } static void ext4_set_def_opts(struct super_block *sb, struct ext4_super_block *es) { unsigned long def_mount_opts; /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); set_opt(sb, INIT_INODE_TABLE); if (def_mount_opts & EXT4_DEFM_DEBUG) set_opt(sb, DEBUG); if (def_mount_opts & EXT4_DEFM_BSDGROUPS) set_opt(sb, GRPID); if (def_mount_opts & EXT4_DEFM_UID16) set_opt(sb, NO_UID32); /* xattr user namespace & acls are now defaulted on */ set_opt(sb, XATTR_USER); #ifdef CONFIG_EXT4_FS_POSIX_ACL set_opt(sb, POSIX_ACL); #endif if (ext4_has_feature_fast_commit(sb)) set_opt2(sb, JOURNAL_FAST_COMMIT); /* don't forget to enable journal_csum when metadata_csum is enabled. 
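 * (As far as can be read from set_journal_csum_feature_set() above, the
 * journal checksum features are only negotiated when JOURNAL_CHECKSUM or
 * JOURNAL_ASYNC_COMMIT is set, so defaulting JOURNAL_CHECKSUM on here is
 * what extends metadata_csum coverage to the journal blocks as well.)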
*/ if (ext4_has_feature_metadata_csum(sb)) set_opt(sb, JOURNAL_CHECKSUM); if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) set_opt(sb, JOURNAL_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) set_opt(sb, ORDERED_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) set_opt(sb, WRITEBACK_DATA); if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_PANIC) set_opt(sb, ERRORS_PANIC); else if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_CONTINUE) set_opt(sb, ERRORS_CONT); else set_opt(sb, ERRORS_RO); /* block_validity enabled by default; disable with noblock_validity */ set_opt(sb, BLOCK_VALIDITY); if (def_mount_opts & EXT4_DEFM_DISCARD) set_opt(sb, DISCARD); if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) set_opt(sb, BARRIER); /* * enable delayed allocation by default * Use -o nodelalloc to turn it off */ if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) && ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) set_opt(sb, DELALLOC); set_opt(sb, DIOREAD_NOLOCK); } static int ext4_handle_clustersize(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int clustersize; /* Handle clustersize */ clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); if (ext4_has_feature_bigalloc(sb)) { if (clustersize < sb->s_blocksize) { ext4_msg(sb, KERN_ERR, "cluster size (%d) smaller than " "block size (%lu)", clustersize, sb->s_blocksize); return -EINVAL; } sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - le32_to_cpu(es->s_log_block_size); } else { if (clustersize != sb->s_blocksize) { ext4_msg(sb, KERN_ERR, "fragment/cluster size (%d) != " "block size (%lu)", clustersize, sb->s_blocksize); return -EINVAL; } if (sbi->s_blocks_per_group > sb->s_blocksize * 8) { ext4_msg(sb, KERN_ERR, "#blocks per group too big: %lu", sbi->s_blocks_per_group); return -EINVAL; } sbi->s_cluster_bits = 0; } sbi->s_clusters_per_group = le32_to_cpu(es->s_clusters_per_group); if (sbi->s_clusters_per_group > sb->s_blocksize * 8) { ext4_msg(sb, KERN_ERR, "#clusters per group too big: %lu", sbi->s_clusters_per_group); return -EINVAL; } if (sbi->s_blocks_per_group != (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) { ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and clusters per group (%lu) inconsistent", sbi->s_blocks_per_group, sbi->s_clusters_per_group); return -EINVAL; } sbi->s_cluster_ratio = clustersize / sb->s_blocksize; /* Do we have standard group size of clustersize * 8 blocks ? */ if (sbi->s_blocks_per_group == clustersize << 3) set_opt2(sb, STD_GROUP_SIZE); return 0; } /* * ext4_atomic_write_init: Initializes filesystem min & max atomic write units. * With non-bigalloc filesystem awu will be based upon filesystem blocksize * & bdev awu units. * With bigalloc it will be based upon bigalloc cluster size & bdev awu units. 
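 * For example (hypothetical device numbers): with a 4 KiB block size, a
 * 16 KiB bigalloc cluster and a device advertising awu_min = 4 KiB and
 * awu_max = 64 KiB, the code below ends up with s_awu_min = 4 KiB and
 * s_awu_max = 16 KiB.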
* @sb: super block */ static void ext4_atomic_write_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct block_device *bdev = sb->s_bdev; unsigned int clustersize = EXT4_CLUSTER_SIZE(sb); if (!bdev_can_atomic_write(bdev)) return; if (!ext4_has_feature_extents(sb)) return; sbi->s_awu_min = max(sb->s_blocksize, bdev_atomic_write_unit_min_bytes(bdev)); sbi->s_awu_max = min(clustersize, bdev_atomic_write_unit_max_bytes(bdev)); if (sbi->s_awu_min && sbi->s_awu_max && sbi->s_awu_min <= sbi->s_awu_max) { ext4_msg(sb, KERN_NOTICE, "Supports (experimental) DIO atomic writes awu_min: %u, awu_max: %u", sbi->s_awu_min, sbi->s_awu_max); } else { sbi->s_awu_min = 0; sbi->s_awu_max = 0; } } static void ext4_fast_commit_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); /* Initialize fast commit stuff */ atomic_set(&sbi->s_fc_subtid, 0); INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]); INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]); INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]); INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]); sbi->s_fc_bytes = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); sbi->s_fc_ineligible_tid = 0; mutex_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); sbi->s_fc_replay_state.fc_regions = NULL; sbi->s_fc_replay_state.fc_regions_size = 0; sbi->s_fc_replay_state.fc_regions_used = 0; sbi->s_fc_replay_state.fc_regions_valid = 0; sbi->s_fc_replay_state.fc_modified_inodes = NULL; sbi->s_fc_replay_state.fc_modified_inodes_size = 0; sbi->s_fc_replay_state.fc_modified_inodes_used = 0; } static int ext4_inode_info_init(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; } else { sbi->s_inode_size = le16_to_cpu(es->s_inode_size); sbi->s_first_ino = le32_to_cpu(es->s_first_ino); if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) { ext4_msg(sb, KERN_ERR, "invalid first ino: %u", sbi->s_first_ino); return -EINVAL; } if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || (!is_power_of_2(sbi->s_inode_size)) || (sbi->s_inode_size > sb->s_blocksize)) { ext4_msg(sb, KERN_ERR, "unsupported inode size: %d", sbi->s_inode_size); ext4_msg(sb, KERN_ERR, "blocksize: %lu", sb->s_blocksize); return -EINVAL; } /* * i_atime_extra is the last extra field available for * [acm]times in struct ext4_inode. Checking for that * field should suffice to ensure we have extra space * for all three. 
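 * (Informal summary: inodes big enough to hold the extra fields get
 * nanosecond granularity and the extended timestamp range, while the
 * old 128-byte layout falls back to 1-second granularity and the
 * classic signed-32-bit limit in January 2038.)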
*/ if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) + sizeof(((struct ext4_inode *)0)->i_atime_extra)) { sb->s_time_gran = 1; sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX; } else { sb->s_time_gran = NSEC_PER_SEC; sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX; } sb->s_time_min = EXT4_TIMESTAMP_MIN; } if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { sbi->s_want_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; if (ext4_has_feature_extra_isize(sb)) { unsigned v, max = (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE); v = le16_to_cpu(es->s_want_extra_isize); if (v > max) { ext4_msg(sb, KERN_ERR, "bad s_want_extra_isize: %d", v); return -EINVAL; } if (sbi->s_want_extra_isize < v) sbi->s_want_extra_isize = v; v = le16_to_cpu(es->s_min_extra_isize); if (v > max) { ext4_msg(sb, KERN_ERR, "bad s_min_extra_isize: %d", v); return -EINVAL; } if (sbi->s_want_extra_isize < v) sbi->s_want_extra_isize = v; } } return 0; } #if IS_ENABLED(CONFIG_UNICODE) static int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es) { const struct ext4_sb_encodings *encoding_info; struct unicode_map *encoding; __u16 encoding_flags = le16_to_cpu(es->s_encoding_flags); if (!ext4_has_feature_casefold(sb) || sb->s_encoding) return 0; encoding_info = ext4_sb_read_encoding(es); if (!encoding_info) { ext4_msg(sb, KERN_ERR, "Encoding requested by superblock is unknown"); return -EINVAL; } encoding = utf8_load(encoding_info->version); if (IS_ERR(encoding)) { ext4_msg(sb, KERN_ERR, "can't mount with superblock charset: %s-%u.%u.%u " "not supported by the kernel. flags: 0x%x.", encoding_info->name, unicode_major(encoding_info->version), unicode_minor(encoding_info->version), unicode_rev(encoding_info->version), encoding_flags); return -EINVAL; } ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: " "%s-%u.%u.%u with flags 0x%hx", encoding_info->name, unicode_major(encoding_info->version), unicode_minor(encoding_info->version), unicode_rev(encoding_info->version), encoding_flags); sb->s_encoding = encoding; sb->s_encoding_flags = encoding_flags; return 0; } #else static inline int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es) { return 0; } #endif static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); /* Warn if metadata_csum and gdt_csum are both set. */ if (ext4_has_feature_metadata_csum(sb) && ext4_has_feature_gdt_csum(sb)) ext4_warning(sb, "metadata_csum and uninit_bg are " "redundant flags; please run fsck."); /* Check for a known checksum algorithm */ if (!ext4_verify_csum_type(sb, es)) { ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " "unknown checksum algorithm."); return -EINVAL; } ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE, ext4_orphan_file_block_trigger); /* Check superblock checksum */ if (!ext4_superblock_csum_verify(sb, es)) { ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " "invalid superblock checksum. 
Run e2fsck?"); return -EFSBADCRC; } /* Precompute checksum seed for all metadata */ if (ext4_has_feature_csum_seed(sb)) sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); else if (ext4_has_feature_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) sbi->s_csum_seed = ext4_chksum(~0, es->s_uuid, sizeof(es->s_uuid)); return 0; } static int ext4_check_feature_compatibility(struct super_block *sb, struct ext4_super_block *es, int silent) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && (ext4_has_compat_features(sb) || ext4_has_ro_compat_features(sb) || ext4_has_incompat_features(sb))) ext4_msg(sb, KERN_WARNING, "feature flags set on rev 0 fs, " "running e2fsck is recommended"); if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) { set_opt2(sb, HURD_COMPAT); if (ext4_has_feature_64bit(sb)) { ext4_msg(sb, KERN_ERR, "The Hurd can't support 64-bit file systems"); return -EINVAL; } /* * ea_inode feature uses l_i_version field which is not * available in HURD_COMPAT mode. */ if (ext4_has_feature_ea_inode(sb)) { ext4_msg(sb, KERN_ERR, "ea_inode feature is not supported for Hurd"); return -EINVAL; } } if (IS_EXT2_SB(sb)) { if (ext2_feature_set_ok(sb)) ext4_msg(sb, KERN_INFO, "mounting ext2 file system " "using the ext4 subsystem"); else { /* * If we're probing be silent, if this looks like * it's actually an ext[34] filesystem. */ if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) return -EINVAL; ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due " "to feature incompatibilities"); return -EINVAL; } } if (IS_EXT3_SB(sb)) { if (ext3_feature_set_ok(sb)) ext4_msg(sb, KERN_INFO, "mounting ext3 file system " "using the ext4 subsystem"); else { /* * If we're probing be silent, if this looks like * it's actually an ext4 filesystem. */ if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) return -EINVAL; ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due " "to feature incompatibilities"); return -EINVAL; } } /* * Check feature flags regardless of the revision level, since we * previously didn't change the revision level when setting the flags, * so there is a chance incompat flags are set on a rev 0 filesystem. */ if (!ext4_feature_set_ok(sb, (sb_rdonly(sb)))) return -EINVAL; if (sbi->s_daxdev) { if (sb->s_blocksize == PAGE_SIZE) set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); else ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n"); } if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { if (ext4_has_feature_inline_data(sb)) { ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" " that may contain inline data"); return -EINVAL; } if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) { ext4_msg(sb, KERN_ERR, "DAX unsupported by block device."); return -EINVAL; } } if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d", es->s_encryption_level); return -EINVAL; } return 0; } static int ext4_check_geometry(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); __u64 blocks_count; int err; if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) { ext4_msg(sb, KERN_ERR, "Number of reserved GDT blocks insanely large: %d", le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks)); return -EINVAL; } /* * Test whether we have more sectors than will fit in sector_t, * and whether the max offset is addressable by the page cache. 
*/ err = generic_check_addressable(sb->s_blocksize_bits, ext4_blocks_count(es)); if (err) { ext4_msg(sb, KERN_ERR, "filesystem" " too large to mount safely on this system"); return err; } /* check blocks count against device size */ blocks_count = sb_bdev_nr_blocks(sb); if (blocks_count && ext4_blocks_count(es) > blocks_count) { ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " "exceeds size of device (%llu blocks)", ext4_blocks_count(es), blocks_count); return -EINVAL; } /* * It makes no sense for the first data block to be beyond the end * of the filesystem. */ if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { ext4_msg(sb, KERN_WARNING, "bad geometry: first data " "block %u is beyond end of filesystem (%llu)", le32_to_cpu(es->s_first_data_block), ext4_blocks_count(es)); return -EINVAL; } if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) && (sbi->s_cluster_ratio == 1)) { ext4_msg(sb, KERN_WARNING, "bad geometry: first data " "block is 0 with a 1k block and cluster size"); return -EINVAL; } blocks_count = (ext4_blocks_count(es) - le32_to_cpu(es->s_first_data_block) + EXT4_BLOCKS_PER_GROUP(sb) - 1); do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { ext4_msg(sb, KERN_WARNING, "groups count too large: %llu " "(block count %llu, first data block %u, " "blocks per group %lu)", blocks_count, ext4_blocks_count(es), le32_to_cpu(es->s_first_data_block), EXT4_BLOCKS_PER_GROUP(sb)); return -EINVAL; } sbi->s_groups_count = blocks_count; sbi->s_blockfile_groups = min(sbi->s_groups_count, (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) != le32_to_cpu(es->s_inodes_count)) { ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu", le32_to_cpu(es->s_inodes_count), ((u64)sbi->s_groups_count * sbi->s_inodes_per_group)); return -EINVAL; } return 0; } static int ext4_group_desc_init(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t logical_sb_block, ext4_group_t *first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned int db_count; ext4_fsblk_t block; int i; db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); if (ext4_has_feature_meta_bg(sb)) { if (le32_to_cpu(es->s_first_meta_bg) > db_count) { ext4_msg(sb, KERN_WARNING, "first meta block group too large: %u " "(group descriptor block count %u)", le32_to_cpu(es->s_first_meta_bg), db_count); return -EINVAL; } } rcu_assign_pointer(sbi->s_group_desc, kvmalloc_objs(struct buffer_head *, db_count)); if (sbi->s_group_desc == NULL) { ext4_msg(sb, KERN_ERR, "not enough memory"); return -ENOMEM; } bgl_lock_init(sbi->s_blockgroup_lock); /* Pre-read the descriptors into the buffer cache */ for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logical_sb_block, i); ext4_sb_breadahead_unmovable(sb, block); } for (i = 0; i < db_count; i++) { struct buffer_head *bh; block = descriptor_loc(sb, logical_sb_block, i); bh = ext4_sb_bread_unmovable(sb, block); if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "can't read group descriptor %d", i); sbi->s_gdb_count = i; return PTR_ERR(bh); } rcu_read_lock(); rcu_dereference(sbi->s_group_desc)[i] = bh; rcu_read_unlock(); } sbi->s_gdb_count = db_count; if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); return -EFSCORRUPTED; } return 0; } static int ext4_load_and_init_journal(struct super_block *sb, struct ext4_super_block 
*es, struct ext4_fs_context *ctx) { struct ext4_sb_info *sbi = EXT4_SB(sb); int err; err = ext4_load_journal(sb, es, ctx->journal_devnum); if (err) return err; if (ext4_has_feature_64bit(sb) && !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_64BIT)) { ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); goto out; } if (!set_journal_csum_feature_set(sb)) { ext4_msg(sb, KERN_ERR, "Failed to set journal checksum " "feature set"); goto out; } if (test_opt2(sb, JOURNAL_FAST_COMMIT) && !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) { ext4_msg(sb, KERN_ERR, "Failed to set fast commit journal feature"); goto out; } /* We have now updated the journal if required, so we can * validate the data journaling mode. */ switch (test_opt(sb, DATA_FLAGS)) { case 0: /* No mode set, assume a default based on the journal * capabilities: ORDERED_DATA if the journal can * cope, else JOURNAL_DATA */ if (jbd2_journal_check_available_features (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { set_opt(sb, ORDERED_DATA); sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA; } else { set_opt(sb, JOURNAL_DATA); sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; } break; case EXT4_MOUNT_ORDERED_DATA: case EXT4_MOUNT_WRITEBACK_DATA: if (!jbd2_journal_check_available_features (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { ext4_msg(sb, KERN_ERR, "Journal does not support " "requested data journaling mode"); goto out; } break; default: break; } if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_async_commit in data=ordered mode"); goto out; } set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio); sbi->s_journal->j_submit_inode_data_buffers = ext4_journal_submit_inode_data_buffers; sbi->s_journal->j_finish_inode_data_buffers = ext4_journal_finish_inode_data_buffers; return 0; out: ext4_journal_destroy(sbi, sbi->s_journal); return -EINVAL; } static int ext4_check_journal_data_mode(struct super_block *sb) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with " "data=journal disables delayed allocation, " "dioread_nolock, O_DIRECT and fast_commit support!\n"); /* can't mount with both data=journal and dioread_nolock. 
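 * (Reasoning, summarised: in data=journal mode file data itself goes
 * through the journal, so the lockless dioread path and fast commits
 * cannot be used safely; both are cleared below, and explicit delalloc
 * or DAX requests are rejected outright.)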
*/ clear_opt(sb, DIOREAD_NOLOCK); clear_opt2(sb, JOURNAL_FAST_COMMIT); if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and delalloc"); return -EINVAL; } if (test_opt(sb, DAX_ALWAYS)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and dax"); return -EINVAL; } if (ext4_has_feature_encrypt(sb)) { ext4_msg(sb, KERN_WARNING, "encrypted files will use data=ordered " "instead of data journaling mode"); } if (test_opt(sb, DELALLOC)) clear_opt(sb, DELALLOC); } else { sb->s_iflags |= SB_I_CGROUPWB; } return 0; } static const char *ext4_has_journal_option(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) return "journal_async_commit"; if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) return "journal_checksum"; if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) return "commit="; if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) return "data="; if (test_opt(sb, DATA_ERR_ABORT)) return "data_err=abort"; return NULL; } /* * Limit the maximum folio order to 2048 blocks to prevent overestimation * of reserve handle credits during the folio writeback in environments * where the PAGE_SIZE exceeds 4KB. */ #define EXT4_MAX_PAGECACHE_ORDER(sb) \ umin(MAX_PAGECACHE_ORDER, (11 + (sb)->s_blocksize_bits - PAGE_SHIFT)) static void ext4_set_max_mapping_order(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) sbi->s_max_folio_order = sbi->s_min_folio_order; else sbi->s_max_folio_order = EXT4_MAX_PAGECACHE_ORDER(sb); } static int ext4_check_large_folio(struct super_block *sb) { const char *err_str = NULL; if (ext4_has_feature_encrypt(sb)) err_str = "encrypt"; if (!err_str) { ext4_set_max_mapping_order(sb); } else if (sb->s_blocksize > PAGE_SIZE) { ext4_msg(sb, KERN_ERR, "bs(%lu) > ps(%lu) unsupported for %s", sb->s_blocksize, PAGE_SIZE, err_str); return -EINVAL; } return 0; } static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb, int silent) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es; ext4_fsblk_t logical_sb_block; unsigned long offset = 0; struct buffer_head *bh; int ret = -EINVAL; int blocksize; blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); if (!blocksize) { ext4_msg(sb, KERN_ERR, "unable to set blocksize"); return -EINVAL; } /* * The ext4 superblock will not be buffer aligned for other than 1kB * block sizes. We need to calculate the offset from buffer start. 
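 * A worked example for the default sb_block = 1 (byte offset 1024) on a
 * 4 KiB block size:
 *
 *	logical_sb_block = 1 * EXT4_MIN_BLOCK_SIZE = 1024
 *	offset = do_div(logical_sb_block, 4096)	-> block 0, offset 1024
 *
 * so the superblock is read from block 0 and found 1 KiB into the buffer.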
*/ if (blocksize != EXT4_MIN_BLOCK_SIZE) { logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); } else { logical_sb_block = sbi->s_sb_block; } bh = ext4_sb_bread_unmovable(sb, logical_sb_block); if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "unable to read superblock"); return PTR_ERR(bh); } /* * Note: s_es must be initialized as soon as possible because * some ext4 macro-instructions depend on its value */ es = (struct ext4_super_block *) (bh->b_data + offset); sbi->s_es = es; sb->s_magic = le16_to_cpu(es->s_magic); if (sb->s_magic != EXT4_SUPER_MAGIC) { if (!silent) ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); goto out; } if (le32_to_cpu(es->s_log_block_size) > (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { ext4_msg(sb, KERN_ERR, "Invalid log block size: %u", le32_to_cpu(es->s_log_block_size)); goto out; } if (le32_to_cpu(es->s_log_cluster_size) > (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { ext4_msg(sb, KERN_ERR, "Invalid log cluster size: %u", le32_to_cpu(es->s_log_cluster_size)); goto out; } blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); /* * If the default block size is not the same as the real block size, * we need to reload it. */ if (sb->s_blocksize == blocksize) goto success; /* * bh must be released before kill_bdev(), otherwise * it won't be freed and its page also. kill_bdev() * is called by sb_set_blocksize(). */ brelse(bh); /* Validate the filesystem blocksize */ if (!sb_set_blocksize(sb, blocksize)) { ext4_msg(sb, KERN_ERR, "bad block size %d", blocksize); bh = NULL; goto out; } logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); bh = ext4_sb_bread_unmovable(sb, logical_sb_block); if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "Can't read superblock on 2nd try"); ret = PTR_ERR(bh); bh = NULL; goto out; } es = (struct ext4_super_block *)(bh->b_data + offset); sbi->s_es = es; if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!"); goto out; } success: sbi->s_min_folio_order = get_order(blocksize); *lsb = logical_sb_block; sbi->s_sbh = bh; return 0; out: brelse(bh); return ret; } static int ext4_hash_info_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; unsigned int i; sbi->s_def_hash_version = es->s_def_hash_version; if (sbi->s_def_hash_version > DX_HASH_LAST) { ext4_msg(sb, KERN_ERR, "Invalid default hash set in the superblock"); return -EINVAL; } else if (sbi->s_def_hash_version == DX_HASH_SIPHASH) { ext4_msg(sb, KERN_ERR, "SIPHASH is not a valid default hash value"); return -EINVAL; } for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); if (ext4_has_feature_dir_index(sb)) { i = le32_to_cpu(es->s_flags); if (i & EXT2_FLAGS_UNSIGNED_HASH) sbi->s_hash_unsigned = 3; else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { #ifdef __CHAR_UNSIGNED__ if (!sb_rdonly(sb)) es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); sbi->s_hash_unsigned = 3; #else if (!sb_rdonly(sb)) es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); #endif } } return 0; } static int ext4_block_group_meta_init(struct super_block *sb, int silent) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int has_huge_files; has_huge_files = ext4_has_feature_huge_file(sb); sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, has_huge_files); sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, 
has_huge_files); sbi->s_desc_size = le16_to_cpu(es->s_desc_size); if (ext4_has_feature_64bit(sb)) { if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || sbi->s_desc_size > EXT4_MAX_DESC_SIZE || !is_power_of_2(sbi->s_desc_size)) { ext4_msg(sb, KERN_ERR, "unsupported descriptor size %lu", sbi->s_desc_size); return -EINVAL; } } else sbi->s_desc_size = EXT4_MIN_DESC_SIZE; sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb); if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) { if (!silent) ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); return -EINVAL; } if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || sbi->s_inodes_per_group > sb->s_blocksize * 8) { ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", sbi->s_inodes_per_group); return -EINVAL; } sbi->s_itb_per_group = sbi->s_inodes_per_group / sbi->s_inodes_per_block; sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb); sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY; sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); return 0; } /* * It's hard to get stripe aligned blocks if stripe is not aligned with * cluster, just disable stripe and alert user to simplify code and avoid * stripe aligned allocation which will rarely succeed. */ static bool ext4_is_stripe_incompatible(struct super_block *sb, unsigned long stripe) { struct ext4_sb_info *sbi = EXT4_SB(sb); return (stripe > 0 && sbi->s_cluster_ratio > 1 && stripe % sbi->s_cluster_ratio != 0); } static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) { struct ext4_super_block *es = NULL; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t logical_sb_block; struct inode *root; int needs_recovery; int err; ext4_group_t first_not_zeroed; struct ext4_fs_context *ctx = fc->fs_private; int silent = fc->sb_flags & SB_SILENT; /* Set defaults for the variables that will be set during parsing */ if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sectors_written_start = part_stat_read(sb->s_bdev, sectors[STAT_WRITE]); err = ext4_load_super(sb, &logical_sb_block, silent); if (err) goto out_fail; es = sbi->s_es; sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); err = ext4_init_metadata_csum(sb, es); if (err) goto failed_mount; ext4_set_def_opts(sb, es); sbi->s_resuid = make_kuid(&init_user_ns, ext4_get_resuid(es)); sbi->s_resgid = make_kgid(&init_user_ns, ext4_get_resgid(es)); sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; sbi->s_sb_update_kb = EXT4_DEF_SB_UPDATE_INTERVAL_KB; sbi->s_sb_update_sec = EXT4_DEF_SB_UPDATE_INTERVAL_SEC; /* * set default s_li_wait_mult for lazyinit, for the case there is * no mount option specified.
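 * (s_li_wait_mult is the multiplier applied to the time the lazyinit
 * thread spent zeroing the previous inode table; it then sleeps that many
 * times longer before touching the next group, so background zeroing only
 * uses a small share of the disk. Illustrative: if one group took 20 ms
 * and the multiplier is 10, the thread waits about 200 ms.)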
*/ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; err = ext4_inode_info_init(sb, es); if (err) goto failed_mount; err = parse_apply_sb_mount_options(sb, ctx); if (err < 0) goto failed_mount; sbi->s_def_mount_opt = sbi->s_mount_opt; sbi->s_def_mount_opt2 = sbi->s_mount_opt2; err = ext4_check_opt_consistency(fc, sb); if (err < 0) goto failed_mount; ext4_apply_options(fc, sb); err = ext4_check_large_folio(sb); if (err < 0) goto failed_mount; err = ext4_encoding_init(sb, es); if (err) goto failed_mount; err = ext4_check_journal_data_mode(sb); if (err) goto failed_mount; sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); /* HSM events are allowed by default. */ sb->s_iflags |= SB_I_ALLOW_HSM; err = ext4_check_feature_compatibility(sb, es, silent); if (err) goto failed_mount; err = ext4_block_group_meta_init(sb, silent); if (err) goto failed_mount; err = ext4_hash_info_init(sb); if (err) goto failed_mount; err = ext4_handle_clustersize(sb); if (err) goto failed_mount; err = ext4_check_geometry(sb, es); if (err) goto failed_mount; timer_setup(&sbi->s_err_report, print_daily_error_info, 0); spin_lock_init(&sbi->s_error_lock); mutex_init(&sbi->s_error_notify_mutex); INIT_WORK(&sbi->s_sb_upd_work, update_super_work); err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); if (err) goto failed_mount3; err = ext4_es_register_shrinker(sbi); if (err) goto failed_mount3; sbi->s_stripe = ext4_get_stripe_size(sbi); if (ext4_is_stripe_incompatible(sb, sbi->s_stripe)) { ext4_msg(sb, KERN_WARNING, "stripe (%lu) is not aligned with cluster size (%u), " "stripe is disabled", sbi->s_stripe, sbi->s_cluster_ratio); sbi->s_stripe = 0; } sbi->s_extent_max_zeroout_kb = 32; /* * set up enough so that it can read an inode */ sb->s_op = &ext4_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_FS_ENCRYPTION sb->s_cop = &ext4_cryptops; #endif #ifdef CONFIG_FS_VERITY sb->s_vop = &ext4_verityops; #endif #ifdef CONFIG_QUOTA sb->dq_op = &ext4_quota_operations; if (ext4_has_feature_quota(sb)) sb->s_qcop = &dquot_quotactl_sysfile_ops; else sb->s_qcop = &ext4_qctl_operations; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif super_set_uuid(sb, es->s_uuid, sizeof(es->s_uuid)); super_set_sysfs_name_bdev(sb); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); spin_lock_init(&sbi->s_bdev_wb_lock); ext4_atomic_write_init(sb); ext4_fast_commit_init(sb); sb->s_root = NULL; needs_recovery = (es->s_last_orphan != 0 || ext4_has_feature_orphan_present(sb) || ext4_has_feature_journal_needs_recovery(sb)); if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) { err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)); if (err) goto failed_mount3a; } err = -EINVAL; /* * The first inode we look at is the journal inode. Don't try * root first: it may be modified in the journal! 
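 * (Until the journal has been replayed, other on-disk metadata, the root
 * inode included, may still be stale, which is why the journal inode is
 * the only thing read at this point.)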
*/ if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { err = ext4_load_and_init_journal(sb, es, ctx); if (err) goto failed_mount3a; if (bdev_read_only(sb->s_bdev)) needs_recovery = 0; } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) && ext4_has_feature_journal_needs_recovery(sb)) { ext4_msg(sb, KERN_ERR, "required journal recovery " "suppressed and not mounted read-only"); goto failed_mount3a; } else { const char *journal_option; /* Nojournal mode, all journal mount options are illegal */ journal_option = ext4_has_journal_option(sb); if (journal_option != NULL) { ext4_msg(sb, KERN_ERR, "can't mount with %s, fs mounted w/o journal", journal_option); goto failed_mount3a; } sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM; clear_opt(sb, JOURNAL_CHECKSUM); clear_opt(sb, DATA_FLAGS); clear_opt2(sb, JOURNAL_FAST_COMMIT); sbi->s_journal = NULL; needs_recovery = 0; } if (!test_opt(sb, NO_MBCACHE)) { sbi->s_ea_block_cache = ext4_xattr_create_cache(); if (!sbi->s_ea_block_cache) { ext4_msg(sb, KERN_ERR, "Failed to create ea_block_cache"); err = -EINVAL; goto failed_mount_wq; } if (ext4_has_feature_ea_inode(sb)) { sbi->s_ea_inode_cache = ext4_xattr_create_cache(); if (!sbi->s_ea_inode_cache) { ext4_msg(sb, KERN_ERR, "Failed to create ea_inode_cache"); err = -EINVAL; goto failed_mount_wq; } } } /* * Get the # of file system overhead blocks from the * superblock if present. */ sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); /* ignore the precalculated value if it is ridiculous */ if (sbi->s_overhead > ext4_blocks_count(es)) sbi->s_overhead = 0; /* * If the bigalloc feature is not enabled recalculating the * overhead doesn't take long, so we might as well just redo * it to make sure we are using the correct value. */ if (!ext4_has_feature_bigalloc(sb)) sbi->s_overhead = 0; if (sbi->s_overhead == 0) { err = ext4_calculate_overhead(sb); if (err) goto failed_mount_wq; } /* * The maximum number of concurrent works can be high and * concurrency isn't really necessary. Limit it to 1. */ EXT4_SB(sb)->rsv_conversion_wq = alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); if (!EXT4_SB(sb)->rsv_conversion_wq) { printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); err = -ENOMEM; goto failed_mount4; } /* * The jbd2_journal_load will have done any necessary log recovery, * so we can safely mount the rest of the filesystem now. */ root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL); if (IS_ERR(root)) { ext4_msg(sb, KERN_ERR, "get root inode failed"); err = PTR_ERR(root); root = NULL; goto failed_mount4; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); iput(root); err = -EFSCORRUPTED; goto failed_mount4; } generic_set_sb_d_ops(sb); sb->s_root = d_make_root(root); if (!sb->s_root) { ext4_msg(sb, KERN_ERR, "get root dentry failed"); err = -ENOMEM; goto failed_mount4; } err = ext4_setup_super(sb, es, sb_rdonly(sb)); if (err == -EROFS) { sb->s_flags |= SB_RDONLY; } else if (err) goto failed_mount4a; ext4_set_resv_clusters(sb); if (test_opt(sb, BLOCK_VALIDITY)) { err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " "zone (%d)", err); goto failed_mount4a; } } ext4_fc_replay_cleanup(sb); ext4_ext_init(sb); /* * Enable optimize_scan if number of groups is > threshold. This can be * turned off by passing "mb_optimize_scan=0". This can also be * turned on forcefully by passing "mb_optimize_scan=1". 
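 * (Illustrative, assuming MB_DEFAULT_LINEAR_SCAN_THRESHOLD is 16: with
 * 4 KiB blocks a block group covers 128 MiB, so filesystems of roughly
 * 2 GiB and up default to mb_optimize_scan while smaller ones keep the
 * linear group scan.)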
*/ if (!(ctx->spec & EXT4_SPEC_mb_optimize_scan)) { if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD) set_opt2(sb, MB_OPTIMIZE_SCAN); else clear_opt2(sb, MB_OPTIMIZE_SCAN); } err = ext4_percpu_param_init(sbi); if (err) goto failed_mount5; err = ext4_mb_init(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", err); goto failed_mount5; } /* * We can only set up the journal commit callback once * mballoc is initialized */ if (sbi->s_journal) sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; if (ext4_has_feature_flex_bg(sb)) if (!ext4_fill_flex_info(sb)) { ext4_msg(sb, KERN_ERR, "unable to initialize " "flex_bg meta info!"); err = -ENOMEM; goto failed_mount6; } err = ext4_register_li_request(sb, first_not_zeroed); if (err) goto failed_mount6; err = ext4_init_orphan_info(sb); if (err) goto failed_mount7; #ifdef CONFIG_QUOTA /* Enable quota usage during mount. */ if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) { err = ext4_enable_quotas(sb); if (err) goto failed_mount8; } #endif /* CONFIG_QUOTA */ /* * Save the original bdev mapping's wb_err value which could be * used to detect the metadata async write error. */ errseq_check_and_advance(&sb->s_bdev->bd_mapping->wb_err, &sbi->s_bdev_wb_err); EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; /* * Update the checksum after updating free space/inode counters and * ext4_orphan_cleanup. Otherwise the superblock can have an incorrect * checksum in the buffer cache until it is written out and * e2fsprogs programs trying to open a file system immediately * after it is mounted can fail. */ ext4_superblock_csum_set(sb); if (needs_recovery) { ext4_msg(sb, KERN_INFO, "recovery complete"); err = ext4_mark_recovery_complete(sb, es); if (err) goto failed_mount9; } if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) { ext4_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but the device does not support discard"); clear_opt(sb, DISCARD); } if (es->s_error_count) { sbi->s_err_report_sec = 5*60; /* first time 5 minutes */ mod_timer(&sbi->s_err_report, jiffies + secs_to_jiffies(sbi->s_err_report_sec)); } sbi->s_err_report_sec = 24*60*60; /* Once a day */ /* Enable message ratelimiting. Default is 10 messages per 5 secs. */ ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10); ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10); ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10); atomic_set(&sbi->s_warning_count, 0); atomic_set(&sbi->s_msg_count, 0); /* Register sysfs after all initializations are complete. 
*/ err = ext4_register_sysfs(sb); if (err) goto failed_mount9; return 0; failed_mount9: ext4_quotas_off(sb, EXT4_MAXQUOTAS); failed_mount8: __maybe_unused ext4_release_orphan_info(sb); failed_mount7: ext4_unregister_li_request(sb); failed_mount6: ext4_mb_release(sb); ext4_flex_groups_free(sbi); failed_mount5: ext4_percpu_param_destroy(sbi); ext4_ext_release(sb); ext4_release_system_zone(sb); failed_mount4a: dput(sb->s_root); sb->s_root = NULL; failed_mount4: ext4_msg(sb, KERN_ERR, "mount failed"); if (EXT4_SB(sb)->rsv_conversion_wq) destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); failed_mount_wq: ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); sbi->s_ea_inode_cache = NULL; ext4_xattr_destroy_cache(sbi->s_ea_block_cache); sbi->s_ea_block_cache = NULL; if (sbi->s_journal) { ext4_journal_destroy(sbi, sbi->s_journal); } failed_mount3a: ext4_es_unregister_shrinker(sbi); failed_mount3: /* flush s_sb_upd_work before sbi destroy */ flush_work(&sbi->s_sb_upd_work); ext4_stop_mmpd(sbi); timer_delete_sync(&sbi->s_err_report); ext4_group_desc_free(sbi); failed_mount: #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif #ifdef CONFIG_QUOTA for (unsigned int i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); brelse(sbi->s_sbh); if (sbi->s_journal_bdev_file) { invalidate_bdev(file_bdev(sbi->s_journal_bdev_file)); bdev_fput(sbi->s_journal_bdev_file); } out_fail: invalidate_bdev(sb->s_bdev); sb->s_fs_info = NULL; return err; } static int ext4_fill_super(struct super_block *sb, struct fs_context *fc) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi; const char *descr; int ret; sbi = ext4_alloc_sbi(sb); if (!sbi) return -ENOMEM; fc->s_fs_info = sbi; /* Cleanup superblock name */ strreplace(sb->s_id, '/', '!'); sbi->s_sb_block = 1; /* Default super block location */ if (ctx->spec & EXT4_SPEC_s_sb_block) sbi->s_sb_block = ctx->s_sb_block; ret = __ext4_fill_super(fc, sb); if (ret < 0) goto free_sbi; if (sbi->s_journal) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) descr = " journalled data mode"; else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) descr = " ordered data mode"; else descr = " writeback data mode"; } else descr = "out journal"; if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) ext4_msg(sb, KERN_INFO, "mounted filesystem %pU %s with%s. " "Quota mode: %s.", &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w", descr, ext4_quota_mode(sb)); /* Update the s_overhead_clusters if necessary */ ext4_update_overhead(sb, false); return 0; free_sbi: ext4_free_sbi(sbi); fc->s_fs_info = NULL; return ret; } static int ext4_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, ext4_fill_super); } /* * Setup any per-fs journal parameters now. We'll do this both on * initial mount, once the journal has been initialised but before we've * done any recovery; and again on any subsequent remount. */ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) { struct ext4_sb_info *sbi = EXT4_SB(sb); journal->j_commit_interval = sbi->s_commit_interval; journal->j_min_batch_time = sbi->s_min_batch_time; journal->j_max_batch_time = sbi->s_max_batch_time; ext4_fc_init(sb, journal); write_lock(&journal->j_state_lock); if (test_opt(sb, BARRIER)) journal->j_flags |= JBD2_BARRIER; else journal->j_flags &= ~JBD2_BARRIER; /* * Always enable journal cycle record option, letting the journal * records log transactions continuously between each mount. 
*/ journal->j_flags |= JBD2_CYCLE_RECORD; write_unlock(&journal->j_state_lock); } static struct inode *ext4_get_journal_inode(struct super_block *sb, unsigned int journal_inum) { struct inode *journal_inode; /* * Test for the existence of a valid inode on disk. Bad things * happen if we iget() an unused inode, as the subsequent iput() * will try to delete it. */ journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL); if (IS_ERR(journal_inode)) { ext4_msg(sb, KERN_ERR, "no journal found"); return ERR_CAST(journal_inode); } if (!journal_inode->i_nlink) { make_bad_inode(journal_inode); iput(journal_inode); ext4_msg(sb, KERN_ERR, "journal inode is deleted"); return ERR_PTR(-EFSCORRUPTED); } if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) { ext4_msg(sb, KERN_ERR, "invalid journal inode"); iput(journal_inode); return ERR_PTR(-EFSCORRUPTED); } ext4_debug("Journal inode found at %p: %lld bytes\n", journal_inode, journal_inode->i_size); return journal_inode; } static int ext4_journal_bmap(journal_t *journal, sector_t *block) { struct ext4_map_blocks map; int ret; if (journal->j_inode == NULL) return 0; map.m_lblk = *block; map.m_len = 1; ret = ext4_map_blocks(NULL, journal->j_inode, &map, 0); if (ret <= 0) { ext4_msg(journal->j_inode->i_sb, KERN_CRIT, "journal bmap failed: block %llu ret %d\n", *block, ret); jbd2_journal_abort(journal, ret ? ret : -EFSCORRUPTED); return ret; } *block = map.m_pblk; return 0; } static journal_t *ext4_open_inode_journal(struct super_block *sb, unsigned int journal_inum) { struct inode *journal_inode; journal_t *journal; journal_inode = ext4_get_journal_inode(sb, journal_inum); if (IS_ERR(journal_inode)) return ERR_CAST(journal_inode); journal = jbd2_journal_init_inode(journal_inode); if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "Could not load journal inode"); iput(journal_inode); return ERR_CAST(journal); } journal->j_private = sb; journal->j_bmap = ext4_journal_bmap; ext4_init_journal_params(sb, journal); return journal; } static struct file *ext4_get_journal_blkdev(struct super_block *sb, dev_t j_dev, ext4_fsblk_t *j_start, ext4_fsblk_t *j_len) { struct buffer_head *bh; struct block_device *bdev; struct file *bdev_file; int hblock, blocksize; ext4_fsblk_t sb_block; unsigned long offset; struct ext4_super_block *es; int errno; bdev_file = bdev_file_open_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES, sb, &fs_holder_ops); if (IS_ERR(bdev_file)) { ext4_msg(sb, KERN_ERR, "failed to open journal device unknown-block(%u,%u) %ld", MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_file)); return bdev_file; } bdev = file_bdev(bdev_file); blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { ext4_msg(sb, KERN_ERR, "blocksize too small for journal device"); errno = -EINVAL; goto out_bdev; } sb_block = EXT4_MIN_BLOCK_SIZE / blocksize; offset = EXT4_MIN_BLOCK_SIZE % blocksize; set_blocksize(bdev_file, blocksize); bh = __bread(bdev, sb_block, blocksize); if (!bh) { ext4_msg(sb, KERN_ERR, "couldn't read superblock of " "external journal"); errno = -EINVAL; goto out_bdev; } es = (struct ext4_super_block *) (bh->b_data + offset); if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || !(le32_to_cpu(es->s_feature_incompat) & EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { ext4_msg(sb, KERN_ERR, "external journal has bad superblock"); errno = -EFSCORRUPTED; goto out_bh; } if ((le32_to_cpu(es->s_feature_ro_compat) & EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && es->s_checksum != ext4_superblock_csum(es)) { 
ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock"); errno = -EFSCORRUPTED; goto out_bh; } if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { ext4_msg(sb, KERN_ERR, "journal UUID does not match"); errno = -EFSCORRUPTED; goto out_bh; } *j_start = sb_block + 1; *j_len = ext4_blocks_count(es); brelse(bh); return bdev_file; out_bh: brelse(bh); out_bdev: bdev_fput(bdev_file); return ERR_PTR(errno); } static journal_t *ext4_open_dev_journal(struct super_block *sb, dev_t j_dev) { journal_t *journal; ext4_fsblk_t j_start; ext4_fsblk_t j_len; struct file *bdev_file; int errno = 0; bdev_file = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len); if (IS_ERR(bdev_file)) return ERR_CAST(bdev_file); journal = jbd2_journal_init_dev(file_bdev(bdev_file), sb->s_bdev, j_start, j_len, sb->s_blocksize); if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "failed to create device journal"); errno = PTR_ERR(journal); goto out_bdev; } if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { ext4_msg(sb, KERN_ERR, "External journal has more than one " "user (unsupported) - %d", be32_to_cpu(journal->j_superblock->s_nr_users)); errno = -EINVAL; goto out_journal; } journal->j_private = sb; EXT4_SB(sb)->s_journal_bdev_file = bdev_file; ext4_init_journal_params(sb, journal); return journal; out_journal: ext4_journal_destroy(EXT4_SB(sb), journal); out_bdev: bdev_fput(bdev_file); return ERR_PTR(errno); } static int ext4_load_journal(struct super_block *sb, struct ext4_super_block *es, unsigned long journal_devnum) { journal_t *journal; unsigned int journal_inum = le32_to_cpu(es->s_journal_inum); dev_t journal_dev; int err = 0; int really_read_only; int journal_dev_ro; if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) return -EFSCORRUPTED; if (journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { ext4_msg(sb, KERN_INFO, "external journal device major/minor " "numbers have changed"); journal_dev = new_decode_dev(journal_devnum); } else journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); if (journal_inum && journal_dev) { ext4_msg(sb, KERN_ERR, "filesystem has both journal inode and journal device!"); return -EINVAL; } if (journal_inum) { journal = ext4_open_inode_journal(sb, journal_inum); if (IS_ERR(journal)) return PTR_ERR(journal); } else { journal = ext4_open_dev_journal(sb, journal_dev); if (IS_ERR(journal)) return PTR_ERR(journal); } journal_dev_ro = bdev_read_only(journal->j_dev); really_read_only = bdev_read_only(sb->s_bdev) | journal_dev_ro; if (journal_dev_ro && !sb_rdonly(sb)) { ext4_msg(sb, KERN_ERR, "journal device read-only, try mounting with '-o ro'"); err = -EROFS; goto err_out; } /* * Are we loading a blank journal or performing recovery after a * crash? For recovery, we need to check in advance whether we * can get read-write access to the device. 
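 * (Concretely: "really_read_only" above is set only if the fs device or
 * the journal device is read-only at the block layer; a plain "-o ro"
 * mount still permits the replay writes performed below, which is why
 * the two cases are distinguished here.)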
*/ if (ext4_has_feature_journal_needs_recovery(sb)) { if (sb_rdonly(sb)) { ext4_msg(sb, KERN_INFO, "INFO: recovery " "required on readonly filesystem"); if (really_read_only) { ext4_msg(sb, KERN_ERR, "write access " "unavailable, cannot proceed " "(try mounting with noload)"); err = -EROFS; goto err_out; } ext4_msg(sb, KERN_INFO, "write access will " "be enabled during recovery"); } } if (!(journal->j_flags & JBD2_BARRIER)) ext4_msg(sb, KERN_INFO, "barriers disabled"); if (!ext4_has_feature_journal_needs_recovery(sb)) err = jbd2_journal_wipe(journal, !really_read_only); if (!err) { char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL); __le16 orig_state; bool changed = false; if (save) memcpy(save, ((char *) es) + EXT4_S_ERR_START, EXT4_S_ERR_LEN); err = jbd2_journal_load(journal); if (save && memcmp(((char *) es) + EXT4_S_ERR_START, save, EXT4_S_ERR_LEN)) { memcpy(((char *) es) + EXT4_S_ERR_START, save, EXT4_S_ERR_LEN); changed = true; } kfree(save); orig_state = es->s_state; es->s_state |= cpu_to_le16(EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS); if (orig_state != es->s_state) changed = true; /* Write out restored error information to the superblock */ if (changed && !really_read_only) { int err2; err2 = ext4_commit_super(sb); err = err ? : err2; } } if (err) { ext4_msg(sb, KERN_ERR, "error loading journal"); goto err_out; } EXT4_SB(sb)->s_journal = journal; err = ext4_clear_journal_err(sb, es); if (err) { ext4_journal_destroy(EXT4_SB(sb), journal); return err; } if (!really_read_only && journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { es->s_journal_dev = cpu_to_le32(journal_devnum); ext4_commit_super(sb); } if (!really_read_only && journal_inum && journal_inum != le32_to_cpu(es->s_journal_inum)) { es->s_journal_inum = cpu_to_le32(journal_inum); ext4_commit_super(sb); } return 0; err_out: ext4_journal_destroy(EXT4_SB(sb), journal); return err; } /* Copy state of EXT4_SB(sb) into buffer for on-disk superblock */ static void ext4_update_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct buffer_head *sbh = sbi->s_sbh; lock_buffer(sbh); /* * If the file system is mounted read-only, don't update the * superblock write time. This avoids updating the superblock * write time when we are mounting the root file system * read/only but we need to replay the journal; at that point, * for people who are east of GMT and who make their clock * tick in localtime for Windows bug-for-bug compatibility, * the clock is set in the future, and this will cause e2fsck * to complain and force a full file system check. 
*/ if (!sb_rdonly(sb)) ext4_update_tstamp(es, s_wtime); es->s_kbytes_written = cpu_to_le64(sbi->s_kbytes_written + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - sbi->s_sectors_written_start) >> 1)); if (percpu_counter_initialized(&sbi->s_freeclusters_counter)) ext4_free_blocks_count_set(es, EXT4_C2B(sbi, percpu_counter_sum_positive( &sbi->s_freeclusters_counter))); if (percpu_counter_initialized(&sbi->s_freeinodes_counter)) es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( &sbi->s_freeinodes_counter)); /* Copy error information to the on-disk superblock */ spin_lock(&sbi->s_error_lock); if (sbi->s_add_error_count > 0) { es->s_state |= cpu_to_le16(EXT4_ERROR_FS); if (!es->s_first_error_time && !es->s_first_error_time_hi) { __ext4_update_tstamp(&es->s_first_error_time, &es->s_first_error_time_hi, sbi->s_first_error_time); strtomem_pad(es->s_first_error_func, sbi->s_first_error_func, 0); es->s_first_error_line = cpu_to_le32(sbi->s_first_error_line); es->s_first_error_ino = cpu_to_le32(sbi->s_first_error_ino); es->s_first_error_block = cpu_to_le64(sbi->s_first_error_block); es->s_first_error_errcode = ext4_errno_to_code(sbi->s_first_error_code); } __ext4_update_tstamp(&es->s_last_error_time, &es->s_last_error_time_hi, sbi->s_last_error_time); strtomem_pad(es->s_last_error_func, sbi->s_last_error_func, 0); es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line); es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino); es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block); es->s_last_error_errcode = ext4_errno_to_code(sbi->s_last_error_code); /* * Start the daily error reporting function if it hasn't been * started already and sbi->s_err_report_sec is not zero */ if (!es->s_error_count && !sbi->s_err_report_sec) mod_timer(&sbi->s_err_report, jiffies + secs_to_jiffies(sbi->s_err_report_sec)); le32_add_cpu(&es->s_error_count, sbi->s_add_error_count); sbi->s_add_error_count = 0; } spin_unlock(&sbi->s_error_lock); ext4_superblock_csum_set(sb); unlock_buffer(sbh); } static int ext4_commit_super(struct super_block *sb) { struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; if (!sbh) return -EINVAL; ext4_update_super(sb); lock_buffer(sbh); /* Buffer got discarded which means block device got invalidated */ if (!buffer_mapped(sbh)) { unlock_buffer(sbh); return -EIO; } if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { /* * Oh, dear. A previous attempt to write the * superblock failed. This could happen because the * USB device was yanked out. Or it could happen to * be a transient write error and maybe the block will * be remapped. Nothing we can do but to retry the * write and hope for the best. */ ext4_msg(sb, KERN_ERR, "previous I/O error to " "superblock detected"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } get_bh(sbh); /* Clear potential dirty bit if it was journalled update */ clear_buffer_dirty(sbh); sbh->b_end_io = end_buffer_write_sync; submit_bh(REQ_OP_WRITE | REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh); wait_on_buffer(sbh); if (buffer_write_io_error(sbh)) { ext4_msg(sb, KERN_ERR, "I/O error while writing " "superblock"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); return -EIO; } return 0; } /* * Have we just finished recovery? If so, and if we are mounting (or * remounting) the filesystem readonly, then we will end up with a * consistent fs on disk. Record that fact. 
*/ static int ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es) { int err; journal_t *journal = EXT4_SB(sb)->s_journal; if (!ext4_has_feature_journal(sb)) { if (journal != NULL) { ext4_error(sb, "Journal got removed while the fs was " "mounted!"); return -EFSCORRUPTED; } return 0; } jbd2_journal_lock_updates(journal); err = jbd2_journal_flush(journal, 0); if (err < 0) goto out; if (sb_rdonly(sb) && (ext4_has_feature_journal_needs_recovery(sb) || ext4_has_feature_orphan_present(sb))) { if (!ext4_orphan_file_empty(sb)) { ext4_error(sb, "Orphan file not empty on read-only fs."); err = -EFSCORRUPTED; goto out; } ext4_clear_feature_journal_needs_recovery(sb); ext4_clear_feature_orphan_present(sb); ext4_commit_super(sb); } out: jbd2_journal_unlock_updates(journal); return err; } /* * If we are mounting (or read-write remounting) a filesystem whose journal * has recorded an error from a previous lifetime, move that error to the * main filesystem now. */ static int ext4_clear_journal_err(struct super_block *sb, struct ext4_super_block *es) { journal_t *journal; int j_errno; const char *errstr; if (!ext4_has_feature_journal(sb)) { ext4_error(sb, "Journal got removed while the fs was mounted!"); return -EFSCORRUPTED; } journal = EXT4_SB(sb)->s_journal; /* * Now check for any error status which may have been recorded in the * journal by a prior ext4_error() or ext4_abort() */ j_errno = jbd2_journal_errno(journal); if (j_errno) { char nbuf[16]; errstr = ext4_decode_error(sb, j_errno, nbuf); ext4_warning(sb, "Filesystem error recorded " "from previous mount: %s", errstr); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); j_errno = ext4_commit_super(sb); if (j_errno) return j_errno; ext4_warning(sb, "Marked fs in need of filesystem check."); jbd2_journal_clear_err(journal); jbd2_journal_update_sb_errno(journal); } return 0; } /* * Force the running and committing transactions to commit, * and wait on the commit. */ int ext4_force_commit(struct super_block *sb) { return ext4_journal_force_commit(EXT4_SB(sb)->s_journal); } static int ext4_sync_fs(struct super_block *sb, int wait) { int ret = 0; tid_t target; bool needs_barrier = false; struct ext4_sb_info *sbi = EXT4_SB(sb); ret = ext4_emergency_state(sb); if (unlikely(ret)) return ret; trace_ext4_sync_fs(sb, wait); flush_workqueue(sbi->rsv_conversion_wq); /* * Writeback quota in non-journalled quota case - journalled quota has * no dirty dquots */ dquot_writeback_dquots(sb, -1); /* * Data writeback is possible w/o journal transaction, so barrier must * being sent at the end of the function. But we can skip it if * transaction_commit will do it for us. */ if (sbi->s_journal) { target = jbd2_get_latest_transaction(sbi->s_journal); if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) needs_barrier = true; if (jbd2_journal_start_commit(sbi->s_journal, &target)) { if (wait) ret = jbd2_log_wait_commit(sbi->s_journal, target); } } else if (wait && test_opt(sb, BARRIER)) needs_barrier = true; if (needs_barrier) { int err; err = blkdev_issue_flush(sb->s_bdev); if (!ret) ret = err; } return ret; } /* * LVM calls this function before a (read-only) snapshot is created. This * gives us a chance to flush the journal completely and mark the fs clean. * * Note that only this function cannot bring a filesystem to be in a clean * state independently. It relies on upper layer to stop all data & metadata * modifications. 
*/ static int ext4_freeze(struct super_block *sb) { int error = 0; journal_t *journal = EXT4_SB(sb)->s_journal; if (journal) { /* Now we set up the journal barrier. */ jbd2_journal_lock_updates(journal); /* * Don't clear the needs_recovery flag if we failed to * flush the journal. */ error = jbd2_journal_flush(journal, 0); if (error < 0) goto out; /* Journal blocked and flushed, clear needs_recovery flag. */ ext4_clear_feature_journal_needs_recovery(sb); if (ext4_orphan_file_empty(sb)) ext4_clear_feature_orphan_present(sb); } error = ext4_commit_super(sb); out: if (journal) /* we rely on upper layer to stop further updates */ jbd2_journal_unlock_updates(journal); return error; } /* * Called by LVM after the snapshot is done. We need to reset the RECOVER * flag here, even though the filesystem is not technically dirty yet. */ static int ext4_unfreeze(struct super_block *sb) { if (ext4_emergency_state(sb)) return 0; if (EXT4_SB(sb)->s_journal) { /* Reset the needs_recovery flag before the fs is unlocked. */ ext4_set_feature_journal_needs_recovery(sb); if (ext4_has_feature_orphan_file(sb)) ext4_set_feature_orphan_present(sb); } ext4_commit_super(sb); return 0; } /* * Structure to save mount options for ext4_remount's benefit */ struct ext4_mount_options { unsigned long s_mount_opt; unsigned long s_mount_opt2; kuid_t s_resuid; kgid_t s_resgid; unsigned long s_commit_interval; u32 s_min_batch_time, s_max_batch_time; #ifdef CONFIG_QUOTA int s_jquota_fmt; char *s_qf_names[EXT4_MAXQUOTAS]; #endif }; static int __ext4_remount(struct fs_context *fc, struct super_block *sb) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_super_block *es; struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned long old_sb_flags; struct ext4_mount_options old_opts; ext4_group_t g; int err = 0; int alloc_ctx; #ifdef CONFIG_QUOTA int enable_quota = 0; int i, j; char *to_free[EXT4_MAXQUOTAS]; #endif /* Store the original options */ old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; old_opts.s_mount_opt2 = sbi->s_mount_opt2; old_opts.s_resuid = sbi->s_resuid; old_opts.s_resgid = sbi->s_resgid; old_opts.s_commit_interval = sbi->s_commit_interval; old_opts.s_min_batch_time = sbi->s_min_batch_time; old_opts.s_max_batch_time = sbi->s_max_batch_time; #ifdef CONFIG_QUOTA old_opts.s_jquota_fmt = sbi->s_jquota_fmt; for (i = 0; i < EXT4_MAXQUOTAS; i++) if (sbi->s_qf_names[i]) { char *qf_name = get_qf_name(sb, sbi, i); old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL); if (!old_opts.s_qf_names[i]) { for (j = 0; j < i; j++) kfree(old_opts.s_qf_names[j]); return -ENOMEM; } } else old_opts.s_qf_names[i] = NULL; #endif if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) { if (sbi->s_journal && sbi->s_journal->j_task->io_context) ctx->journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; else ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; } if ((ctx->spec & EXT4_SPEC_s_stripe) && ext4_is_stripe_incompatible(sb, ctx->s_stripe)) { ext4_msg(sb, KERN_WARNING, "stripe (%lu) is not aligned with cluster size (%u), " "stripe is disabled", ctx->s_stripe, sbi->s_cluster_ratio); ctx->s_stripe = 0; } /* * Changing the DIOREAD_NOLOCK or DELALLOC mount options may cause * two calls to ext4_should_dioread_nolock() to return inconsistent * values, triggering WARN_ON in ext4_add_complete_io(). we grab * here s_writepages_rwsem to avoid race between writepages ops and * remount. 
*/ alloc_ctx = ext4_writepages_down_write(sb); ext4_apply_options(fc, sb); ext4_writepages_up_write(sb, alloc_ctx); if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ test_opt(sb, JOURNAL_CHECKSUM)) { ext4_msg(sb, KERN_ERR, "changing journal_checksum " "during remount not supported; ignoring"); sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM; } if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and delalloc"); err = -EINVAL; goto restore_opts; } if (test_opt(sb, DIOREAD_NOLOCK)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and dioread_nolock"); err = -EINVAL; goto restore_opts; } } else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) { if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_async_commit in data=ordered mode"); err = -EINVAL; goto restore_opts; } } if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_NO_MBCACHE) { ext4_msg(sb, KERN_ERR, "can't enable nombcache during remount"); err = -EINVAL; goto restore_opts; } if ((old_opts.s_mount_opt & EXT4_MOUNT_DELALLOC) && !test_opt(sb, DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't disable delalloc during remount"); err = -EINVAL; goto restore_opts; } sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); es = sbi->s_es; if (sbi->s_journal) { ext4_init_journal_params(sb, sbi->s_journal); set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio); } /* Flush outstanding errors before changing fs state */ flush_work(&sbi->s_sb_upd_work); if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) { if (ext4_emergency_state(sb)) { err = -EROFS; goto restore_opts; } if (fc->sb_flags & SB_RDONLY) { err = sync_filesystem(sb); if (err < 0) goto restore_opts; err = dquot_suspend(sb, -1); if (err < 0) goto restore_opts; /* * First of all, the unconditional stuff we have to do * to disable replay of the journal when we next remount */ sb->s_flags |= SB_RDONLY; /* * OK, test if we are remounting a valid rw partition * readonly, and if so set the rdonly flag and then * mark the partition as valid again. */ if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) && (sbi->s_mount_state & EXT4_VALID_FS)) es->s_state = cpu_to_le16(sbi->s_mount_state); if (sbi->s_journal) { /* * We let remount-ro finish even if marking fs * as clean failed... */ ext4_mark_recovery_complete(sb, es); } } else { /* Make sure we can mount this feature set readwrite */ if (ext4_has_feature_readonly(sb) || !ext4_feature_set_ok(sb, 0)) { err = -EROFS; goto restore_opts; } /* * Make sure the group descriptor checksums * are sane. If they aren't, refuse to remount r/w. */ for (g = 0; g < sbi->s_groups_count; g++) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, g, NULL); if (!ext4_group_desc_csum_verify(sb, g, gdp)) { ext4_msg(sb, KERN_ERR, "ext4_remount: Checksum for group %u failed (%u!=%u)", g, le16_to_cpu(ext4_group_desc_csum(sb, g, gdp)), le16_to_cpu(gdp->bg_checksum)); err = -EFSBADCRC; goto restore_opts; } } /* * If we have an unprocessed orphan list hanging * around from a previously readonly bdev mount, * require a full umount/remount for now. */ if (es->s_last_orphan || !ext4_orphan_file_empty(sb)) { ext4_msg(sb, KERN_WARNING, "Couldn't " "remount RDWR because of unprocessed " "orphan inode list. 
Please " "umount/remount instead"); err = -EINVAL; goto restore_opts; } /* * Mounting a RDONLY partition read-write, so reread * and store the current valid flag. (It may have * been changed by e2fsck since we originally mounted * the partition.) */ if (sbi->s_journal) { err = ext4_clear_journal_err(sb, es); if (err) goto restore_opts; } sbi->s_mount_state = (le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY); err = ext4_setup_super(sb, es, 0); if (err) goto restore_opts; sb->s_flags &= ~SB_RDONLY; if (ext4_has_feature_mmp(sb)) { err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)); if (err) goto restore_opts; } #ifdef CONFIG_QUOTA enable_quota = 1; #endif } } /* * Handle creation of system zone data early because it can fail. * Releasing of existing data is done when we are sure remount will * succeed. */ if (test_opt(sb, BLOCK_VALIDITY) && !sbi->s_system_blks) { err = ext4_setup_system_zone(sb); if (err) goto restore_opts; } if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) { err = ext4_commit_super(sb); if (err) goto restore_opts; } #ifdef CONFIG_QUOTA if (enable_quota) { if (sb_any_quota_suspended(sb)) dquot_resume(sb, -1); else if (ext4_has_feature_quota(sb)) { err = ext4_enable_quotas(sb); if (err) goto restore_opts; } } /* Release old quota file names */ for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(old_opts.s_qf_names[i]); #endif if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) ext4_release_system_zone(sb); /* * Reinitialize lazy itable initialization thread based on * current settings */ if (sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE)) ext4_unregister_li_request(sb); else { ext4_group_t first_not_zeroed; first_not_zeroed = ext4_has_uninit_itable(sb); ext4_register_li_request(sb, first_not_zeroed); } if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) ext4_stop_mmpd(sbi); /* * Handle aborting the filesystem as the last thing during remount to * avoid obsure errors during remount when some option changes fail to * apply due to shutdown filesystem. */ if (test_opt2(sb, ABORT)) ext4_abort(sb, ESHUTDOWN, "Abort forced by user"); return 0; restore_opts: /* * If there was a failing r/w to ro transition, we may need to * re-enable quota */ if (sb_rdonly(sb) && !(old_sb_flags & SB_RDONLY) && sb_any_quota_suspended(sb)) dquot_resume(sb, -1); alloc_ctx = ext4_writepages_down_write(sb); sb->s_flags = old_sb_flags; sbi->s_mount_opt = old_opts.s_mount_opt; sbi->s_mount_opt2 = old_opts.s_mount_opt2; sbi->s_resuid = old_opts.s_resuid; sbi->s_resgid = old_opts.s_resgid; sbi->s_commit_interval = old_opts.s_commit_interval; sbi->s_min_batch_time = old_opts.s_min_batch_time; sbi->s_max_batch_time = old_opts.s_max_batch_time; ext4_writepages_up_write(sb, alloc_ctx); if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) ext4_release_system_zone(sb); #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; for (i = 0; i < EXT4_MAXQUOTAS; i++) { to_free[i] = get_qf_name(sb, sbi, i); rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]); } synchronize_rcu(); for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(to_free[i]); #endif if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) ext4_stop_mmpd(sbi); return err; } static int ext4_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; int ret; bool old_ro = sb_rdonly(sb); fc->s_fs_info = EXT4_SB(sb); ret = ext4_check_opt_consistency(fc, sb); if (ret < 0) return ret; ret = __ext4_remount(fc, sb); if (ret < 0) return ret; ext4_msg(sb, KERN_INFO, "re-mounted %pU%s.", &sb->s_uuid, (old_ro != sb_rdonly(sb)) ? 
(sb_rdonly(sb) ? " ro" : " r/w") : ""); return 0; } #ifdef CONFIG_QUOTA static int ext4_statfs_project(struct super_block *sb, kprojid_t projid, struct kstatfs *buf) { struct kqid qid; struct dquot *dquot; u64 limit; u64 curblock; qid = make_kqid_projid(projid); dquot = dqget(sb, qid); if (IS_ERR(dquot)) return PTR_ERR(dquot); spin_lock(&dquot->dq_dqb_lock); limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit, dquot->dq_dqb.dqb_bhardlimit); limit >>= sb->s_blocksize_bits; if (limit) { uint64_t remaining = 0; curblock = (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; if (limit > curblock) remaining = limit - curblock; buf->f_blocks = min(buf->f_blocks, limit); buf->f_bfree = min(buf->f_bfree, remaining); buf->f_bavail = min(buf->f_bavail, remaining); } limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit, dquot->dq_dqb.dqb_ihardlimit); if (limit) { uint64_t remaining = 0; if (limit > dquot->dq_dqb.dqb_curinodes) remaining = limit - dquot->dq_dqb.dqb_curinodes; buf->f_files = min(buf->f_files, limit); buf->f_ffree = min(buf->f_ffree, remaining); } spin_unlock(&dquot->dq_dqb_lock); dqput(dquot); return 0; } #endif static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; ext4_fsblk_t overhead = 0, resv_blocks; s64 bfree; resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters)); if (!test_opt(sb, MINIX_DF)) overhead = sbi->s_overhead; buf->f_type = EXT4_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead); bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); /* prevent underflow in case that few free space is available */ buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); buf->f_bavail = buf->f_bfree - (ext4_r_blocks_count(es) + resv_blocks); if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks)) buf->f_bavail = 0; buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); buf->f_namelen = EXT4_NAME_LEN; buf->f_fsid = uuid_to_fsid(es->s_uuid); #ifdef CONFIG_QUOTA if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) && sb_has_quota_limits_enabled(sb, PRJQUOTA)) ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf); #endif return 0; } #ifdef CONFIG_QUOTA /* * Helper functions so that transaction is started before we acquire dqio_sem * to keep correct lock ordering of transaction > dqio_sem */ static inline struct inode *dquot_to_inode(struct dquot *dquot) { return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type]; } static int ext4_write_dquot(struct dquot *dquot) { int ret, err; handle_t *handle; struct inode *inode; inode = dquot_to_inode(dquot); handle = ext4_journal_start(inode, EXT4_HT_QUOTA, EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit(dquot); if (ret < 0) ext4_error_err(dquot->dq_sb, -ret, "Failed to commit dquot type %d", dquot->dq_id.type); err = ext4_journal_stop(handle); if (!ret) ret = err; return ret; } static int ext4_acquire_dquot(struct dquot *dquot) { int ret, err; handle_t *handle; handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_acquire(dquot); if (ret < 0) ext4_error_err(dquot->dq_sb, -ret, "Failed to acquire dquot type %d", 
dquot->dq_id.type); err = ext4_journal_stop(handle); if (!ret) ret = err; return ret; } static int ext4_release_dquot(struct dquot *dquot) { int ret, err; handle_t *handle; bool freeze_protected = false; /* * Trying to sb_start_intwrite() in a running transaction * can result in a deadlock. Further, running transactions * are already protected from freezing. */ if (!ext4_journal_current_handle()) { sb_start_intwrite(dquot->dq_sb); freeze_protected = true; } handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) { /* Release dquot anyway to avoid endless cycle in dqput() */ dquot_release(dquot); if (freeze_protected) sb_end_intwrite(dquot->dq_sb); return PTR_ERR(handle); } ret = dquot_release(dquot); if (ret < 0) ext4_error_err(dquot->dq_sb, -ret, "Failed to release dquot type %d", dquot->dq_id.type); err = ext4_journal_stop(handle); if (!ret) ret = err; if (freeze_protected) sb_end_intwrite(dquot->dq_sb); return ret; } static int ext4_mark_dquot_dirty(struct dquot *dquot) { struct super_block *sb = dquot->dq_sb; if (ext4_is_quota_journalled(sb)) { dquot_mark_dquot_dirty(dquot); return ext4_write_dquot(dquot); } else { return dquot_mark_dquot_dirty(dquot); } } static int ext4_write_info(struct super_block *sb, int type) { int ret, err; handle_t *handle; /* Data block + inode block */ handle = ext4_journal_start_sb(sb, EXT4_HT_QUOTA, 2); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit_info(sb, type); err = ext4_journal_stop(handle); if (!ret) ret = err; return ret; } static void lockdep_set_quota_inode(struct inode *inode, int subclass) { struct ext4_inode_info *ei = EXT4_I(inode); /* The first argument of lockdep_set_subclass has to be * *exactly* the same as the argument to init_rwsem() --- in * this case, in init_once() --- or lockdep gets unhappy * because the name of the lock is set using the * stringification of the argument to init_rwsem(). */ (void) ei; /* shut up clang warning if !CONFIG_LOCKDEP */ lockdep_set_subclass(&ei->i_data_sem, subclass); } /* * Standard function to be called on quota_on */ static int ext4_quota_on(struct super_block *sb, int type, int format_id, const struct path *path) { int err; if (!test_opt(sb, QUOTA)) return -EINVAL; /* Quotafile not on the same filesystem? */ if (path->dentry->d_sb != sb) return -EXDEV; /* Quota already enabled for this file? */ if (IS_NOQUOTA(d_inode(path->dentry))) return -EBUSY; /* Journaling quota? */ if (EXT4_SB(sb)->s_qf_names[type]) { /* Quotafile not in fs root? */ if (path->dentry->d_parent != sb->s_root) ext4_msg(sb, KERN_WARNING, "Quota file not on filesystem root. " "Journaled quota will not work"); sb_dqopt(sb)->flags |= DQUOT_NOLIST_DIRTY; } else { /* * Clear the flag just in case mount options changed since * last time. */ sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY; } lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA); err = dquot_quota_on(sb, type, format_id, path); if (!err) { struct inode *inode = d_inode(path->dentry); handle_t *handle; /* * Set inode flags to prevent userspace from messing with quota * files. If this fails, we return success anyway since quotas * are already enabled and this is not a hard failure. 
*/ inode_lock(inode); handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); if (IS_ERR(handle)) goto unlock_inode; EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL; inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, S_NOATIME | S_IMMUTABLE); err = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); unlock_inode: inode_unlock(inode); if (err) dquot_quota_off(sb, type); } if (err) lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_NORMAL); return err; } static inline bool ext4_check_quota_inum(int type, unsigned long qf_inum) { switch (type) { case USRQUOTA: return qf_inum == EXT4_USR_QUOTA_INO; case GRPQUOTA: return qf_inum == EXT4_GRP_QUOTA_INO; case PRJQUOTA: return qf_inum >= EXT4_GOOD_OLD_FIRST_INO; default: BUG(); } } static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags) { int err; struct inode *qf_inode; unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) }; BUG_ON(!ext4_has_feature_quota(sb)); if (!qf_inums[type]) return -EPERM; if (!ext4_check_quota_inum(type, qf_inums[type])) { ext4_error(sb, "Bad quota inum: %lu, type: %d", qf_inums[type], type); return -EUCLEAN; } qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL); if (IS_ERR(qf_inode)) { ext4_error(sb, "Bad quota inode: %lu, type: %d", qf_inums[type], type); return PTR_ERR(qf_inode); } /* Don't account quota for quota files to avoid recursion */ qf_inode->i_flags |= S_NOQUOTA; lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA); err = dquot_load_quota_inode(qf_inode, type, format_id, flags); if (err) lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL); iput(qf_inode); return err; } /* Enable usage tracking for all quota types. */ int ext4_enable_quotas(struct super_block *sb) { int type, err = 0; unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) }; bool quota_mopt[EXT4_MAXQUOTAS] = { test_opt(sb, USRQUOTA), test_opt(sb, GRPQUOTA), test_opt(sb, PRJQUOTA), }; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY; for (type = 0; type < EXT4_MAXQUOTAS; type++) { if (qf_inums[type]) { err = ext4_quota_enable(sb, type, QFMT_VFS_V1, DQUOT_USAGE_ENABLED | (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); if (err) { ext4_warning(sb, "Failed to enable quota tracking " "(type=%d, err=%d, ino=%lu). " "Please run e2fsck to fix.", type, err, qf_inums[type]); ext4_quotas_off(sb, type); return err; } } } return 0; } static int ext4_quota_off(struct super_block *sb, int type) { struct inode *inode = sb_dqopt(sb)->files[type]; handle_t *handle; int err; /* Force all delayed allocation blocks to be allocated. * Caller already holds s_umount sem */ if (test_opt(sb, DELALLOC)) sync_filesystem(sb); if (!inode || !igrab(inode)) goto out; err = dquot_quota_off(sb, type); if (err || ext4_has_feature_quota(sb)) goto out_put; /* * When the filesystem was remounted read-only first, we cannot cleanup * inode flags here. Bad luck but people should be using QUOTA feature * these days anyway. */ if (sb_rdonly(sb)) goto out_put; inode_lock(inode); /* * Update modification times of quota files when userspace can * start looking at them. If we fail, we return success anyway since * this is not a hard failure and quotas are already disabled. 
*/ handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto out_unlock; } EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL); inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); out_unlock: inode_unlock(inode); out_put: lockdep_set_quota_inode(inode, I_DATA_SEM_NORMAL); iput(inode); return err; out: return dquot_quota_off(sb, type); } /* Read data from quotafile - avoid pagecache and such because we cannot afford * acquiring the locks... As quota files are never truncated and quota code * itself serializes the operations (and no one else should touch the files) * we don't have to be afraid of races */ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off) { struct inode *inode = sb_dqopt(sb)->files[type]; ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); int offset = off & (sb->s_blocksize - 1); int tocopy; size_t toread; struct buffer_head *bh; loff_t i_size = i_size_read(inode); if (off > i_size) return 0; if (off+len > i_size) len = i_size-off; toread = len; while (toread > 0) { tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); bh = ext4_bread(NULL, inode, blk, 0); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) /* A hole? */ memset(data, 0, tocopy); else memcpy(data, bh->b_data+offset, tocopy); brelse(bh); offset = 0; toread -= tocopy; data += tocopy; blk++; } return len; } /* Write to quotafile (we know the transaction is already started and has * enough credits) */ static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off) { struct inode *inode = sb_dqopt(sb)->files[type]; ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); int err = 0, err2 = 0, offset = off & (sb->s_blocksize - 1); int retries = 0; struct buffer_head *bh; handle_t *handle = journal_current_handle(); if (!handle) { ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" " cancelled because transaction is not started", (unsigned long long)off, (unsigned long long)len); return -EIO; } /* * Since we account only one data block in transaction credits, * then it is impossible to cross a block boundary. */ if (sb->s_blocksize - offset < len) { ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" " cancelled because not block aligned", (unsigned long long)off, (unsigned long long)len); return -EIO; } do { bh = ext4_bread(handle, inode, blk, EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL); } while (PTR_ERR(bh) == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) goto out; BUFFER_TRACE(bh, "get write access"); err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); if (err) { brelse(bh); return err; } lock_buffer(bh); memcpy(bh->b_data+offset, data, len); flush_dcache_folio(bh->b_folio); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); out: if (inode->i_size < off + len) { i_size_write(inode, off + len); EXT4_I(inode)->i_disksize = inode->i_size; err2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(err2 && !err)) err = err2; } return err ? 
err : len; } #endif #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static inline void register_as_ext2(void) { int err = register_filesystem(&ext2_fs_type); if (err) printk(KERN_WARNING "EXT4-fs: Unable to register as ext2 (%d)\n", err); } static inline void unregister_as_ext2(void) { unregister_filesystem(&ext2_fs_type); } static inline int ext2_feature_set_ok(struct super_block *sb) { if (ext4_has_unknown_ext2_incompat_features(sb)) return 0; if (sb_rdonly(sb)) return 1; if (ext4_has_unknown_ext2_ro_compat_features(sb)) return 0; return 1; } #else static inline void register_as_ext2(void) { } static inline void unregister_as_ext2(void) { } static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; } #endif static inline void register_as_ext3(void) { int err = register_filesystem(&ext3_fs_type); if (err) printk(KERN_WARNING "EXT4-fs: Unable to register as ext3 (%d)\n", err); } static inline void unregister_as_ext3(void) { unregister_filesystem(&ext3_fs_type); } static inline int ext3_feature_set_ok(struct super_block *sb) { if (ext4_has_unknown_ext3_incompat_features(sb)) return 0; if (!ext4_has_feature_journal(sb)) return 0; if (sb_rdonly(sb)) return 1; if (ext4_has_unknown_ext3_ro_compat_features(sb)) return 0; return 1; } static void ext4_kill_sb(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct file *bdev_file = sbi ? sbi->s_journal_bdev_file : NULL; kill_block_super(sb); if (bdev_file) bdev_fput(bdev_file); } static struct file_system_type ext4_fs_type = { .owner = THIS_MODULE, .name = "ext4", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME | FS_LBS, }; MODULE_ALIAS_FS("ext4"); static int __init ext4_init_fs(void) { int err; ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64); ext4_li_info = NULL; /* Build-time check for flags consistency */ ext4_check_flag_values(); err = ext4_init_es(); if (err) return err; err = ext4_init_pending(); if (err) goto out7; err = ext4_init_post_read_processing(); if (err) goto out6; err = ext4_init_pageio(); if (err) goto out5; err = ext4_init_system_zone(); if (err) goto out4; err = ext4_init_sysfs(); if (err) goto out3; err = ext4_init_mballoc(); if (err) goto out2; err = init_inodecache(); if (err) goto out1; err = ext4_fc_init_dentry_cache(); if (err) goto out05; register_as_ext3(); register_as_ext2(); err = register_filesystem(&ext4_fs_type); if (err) goto out; return 0; out: unregister_as_ext2(); unregister_as_ext3(); ext4_fc_destroy_dentry_cache(); out05: destroy_inodecache(); out1: ext4_exit_mballoc(); out2: ext4_exit_sysfs(); out3: ext4_exit_system_zone(); out4: ext4_exit_pageio(); out5: ext4_exit_post_read_processing(); out6: ext4_exit_pending(); out7: ext4_exit_es(); return err; } static void __exit ext4_exit_fs(void) { ext4_destroy_lazyinit_thread(); unregister_as_ext2(); unregister_as_ext3(); unregister_filesystem(&ext4_fs_type); ext4_fc_destroy_dentry_cache(); destroy_inodecache(); ext4_exit_mballoc(); ext4_exit_sysfs(); ext4_exit_system_zone(); ext4_exit_pageio(); ext4_exit_post_read_processing(); ext4_exit_es(); ext4_exit_pending(); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); MODULE_DESCRIPTION("Fourth Extended Filesystem"); MODULE_LICENSE("GPL"); module_init(ext4_init_fs) module_exit(ext4_exit_fs) |
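/*
 * Illustrative sketch (not part of the ext4 sources above): ext4_init_fs()
 * registers each subsystem in turn and, on failure, unwinds the ones already
 * brought up in reverse order through its ladder of labels (out05..out7).
 * The standalone program below demonstrates that same staged-init/reverse-
 * unwind pattern with made-up init_a()/init_b()/init_c() steps; it uses no
 * kernel APIs and compiles with a plain C compiler.
 */
#include <stdio.h>
#include <stdbool.h>

static bool a_up, b_up;

static int init_a(void) { a_up = true; return 0; }
static int init_b(void) { b_up = true; return 0; }
static int init_c(void) { return -1; /* pretend this step fails */ }

static void exit_a(void) { a_up = false; }
static void exit_b(void) { b_up = false; }

/* Same shape as ext4_init_fs(): try each step, unwind in reverse on error. */
static int demo_init(void)
{
	int err;

	err = init_a();
	if (err)
		return err;
	err = init_b();
	if (err)
		goto out_a;
	err = init_c();
	if (err)
		goto out_b;
	return 0;

out_b:
	exit_b();
out_a:
	exit_a();
	return err;
}

int main(void)
{
	int err = demo_init();

	/* With init_c() failing, both earlier steps are rolled back. */
	printf("demo_init() = %d, a_up=%d, b_up=%d\n", err, a_up, b_up);
	return 0;
}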
// SPDX-License-Identifier: GPL-2.0+
/*
 * NILFS B-tree node cache
 *
 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
 *
 * Originally written by Seiji Kihara.
 * Fully revised by Ryusuke Konishi for stabilization and simplification.
 *
 */

#include <linux/types.h>
#include <linux/buffer_head.h>
#include <linux/mm.h>
#include <linux/backing-dev.h>
#include <linux/gfp.h>
#include "nilfs.h"
#include "mdt.h"
#include "dat.h"
#include "page.h"
#include "btnode.h"

/**
 * nilfs_init_btnc_inode - initialize B-tree node cache inode
 * @btnc_inode: inode to be initialized
 *
 * nilfs_init_btnc_inode() sets up an inode for B-tree node cache.
 */
void nilfs_init_btnc_inode(struct inode *btnc_inode)
{
	struct nilfs_inode_info *ii = NILFS_I(btnc_inode);

	btnc_inode->i_mode = S_IFREG;
	ii->i_flags = 0;
	memset(&ii->i_bmap_data, 0, sizeof(struct nilfs_bmap));
	mapping_set_gfp_mask(btnc_inode->i_mapping, GFP_NOFS);
	btnc_inode->i_mapping->a_ops = &nilfs_buffer_cache_aops;
}

void nilfs_btnode_cache_clear(struct address_space *btnc)
{
	invalidate_mapping_pages(btnc, 0, -1);
	truncate_inode_pages(btnc, 0);
}

struct buffer_head *
nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
{
	struct inode *inode = btnc->host;
	struct buffer_head *bh;

	bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node));
	if (unlikely(!bh))
		return ERR_PTR(-ENOMEM);

	if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) ||
		     buffer_dirty(bh))) {
		/*
		 * The block buffer at the specified new address was already
		 * in use. This can happen if it is a virtual block number
		 * and has been reallocated due to corruption of the bitmap
		 * used to manage its allocation state (if not, the buffer
		 * clearing of an abandoned b-tree node is missing somewhere).
*/ nilfs_error(inode->i_sb, "state inconsistency probably due to duplicate use of b-tree node block address %llu (ino=%llu)", (unsigned long long)blocknr, inode->i_ino); goto failed; } memset(bh->b_data, 0, i_blocksize(inode)); bh->b_blocknr = blocknr; set_buffer_mapped(bh); set_buffer_uptodate(bh); folio_unlock(bh->b_folio); folio_put(bh->b_folio); return bh; failed: folio_unlock(bh->b_folio); folio_put(bh->b_folio); brelse(bh); return ERR_PTR(-EIO); } int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, sector_t pblocknr, blk_opf_t opf, struct buffer_head **pbh, sector_t *submit_ptr) { struct buffer_head *bh; struct inode *inode = btnc->host; struct folio *folio; int err; bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node)); if (unlikely(!bh)) return -ENOMEM; err = -EEXIST; /* internal code */ folio = bh->b_folio; if (buffer_uptodate(bh) || buffer_dirty(bh)) goto found; if (pblocknr == 0) { pblocknr = blocknr; if (inode->i_ino != NILFS_DAT_INO) { struct the_nilfs *nilfs = inode->i_sb->s_fs_info; /* blocknr is a virtual block number */ err = nilfs_dat_translate(nilfs->ns_dat, blocknr, &pblocknr); if (unlikely(err)) { brelse(bh); goto out_locked; } } } if (opf & REQ_RAHEAD) { if (pblocknr != *submit_ptr + 1 || !trylock_buffer(bh)) { err = -EBUSY; /* internal code */ brelse(bh); goto out_locked; } } else { /* opf == REQ_OP_READ */ lock_buffer(bh); } if (buffer_uptodate(bh)) { unlock_buffer(bh); err = -EEXIST; /* internal code */ goto found; } set_buffer_mapped(bh); bh->b_blocknr = pblocknr; /* set block address for read */ bh->b_end_io = end_buffer_read_sync; get_bh(bh); submit_bh(opf, bh); bh->b_blocknr = blocknr; /* set back to the given block address */ *submit_ptr = pblocknr; err = 0; found: *pbh = bh; out_locked: folio_unlock(folio); folio_put(folio); return err; } /** * nilfs_btnode_delete - delete B-tree node buffer * @bh: buffer to be deleted * * nilfs_btnode_delete() invalidates the specified buffer and delete the page * including the buffer if the page gets unbusy. */ void nilfs_btnode_delete(struct buffer_head *bh) { struct address_space *mapping; struct folio *folio = bh->b_folio; pgoff_t index = folio->index; int still_dirty; folio_get(folio); folio_lock(folio); folio_wait_writeback(folio); nilfs_forget_buffer(bh); still_dirty = folio_test_dirty(folio); mapping = folio->mapping; folio_unlock(folio); folio_put(folio); if (!still_dirty && mapping) invalidate_inode_pages2_range(mapping, index, index); } /** * nilfs_btnode_prepare_change_key - prepare to change the search key of a * b-tree node block * @btnc: page cache in which the b-tree node block is buffered * @ctxt: structure for exchanging context information for key change * * nilfs_btnode_prepare_change_key() prepares to move the contents of the * b-tree node block of the old key given in the "oldkey" member of @ctxt to * the position of the new key given in the "newkey" member of @ctxt in the * page cache @btnc. Here, the key of the block is an index in units of * blocks, and if the page and block sizes match, it matches the page index * in the page cache. * * If the page size and block size match, this function attempts to move the * entire folio, and in preparation for this, inserts the original folio into * the new index of the cache. 
If this insertion fails or if the page size * and block size are different, it falls back to a copy preparation using * nilfs_btnode_create_block(), inserts a new block at the position * corresponding to "newkey", and stores the buffer head pointer in the * "newbh" member of @ctxt. * * Note that the current implementation does not support folio sizes larger * than the page size. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (metadata corruption). * * %-ENOMEM - Insufficient memory available. */ int nilfs_btnode_prepare_change_key(struct address_space *btnc, struct nilfs_btnode_chkey_ctxt *ctxt) { struct buffer_head *obh, *nbh; struct inode *inode = btnc->host; __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; int err; if (oldkey == newkey) return 0; obh = ctxt->bh; ctxt->newbh = NULL; if (inode->i_blkbits == PAGE_SHIFT) { struct folio *ofolio = obh->b_folio; folio_lock(ofolio); retry: /* BUG_ON(oldkey != obh->b_folio->index); */ if (unlikely(oldkey != ofolio->index)) NILFS_FOLIO_BUG(ofolio, "invalid oldkey %lld (newkey=%lld)", (unsigned long long)oldkey, (unsigned long long)newkey); xa_lock_irq(&btnc->i_pages); err = __xa_insert(&btnc->i_pages, newkey, ofolio, GFP_NOFS); xa_unlock_irq(&btnc->i_pages); /* * Note: folio->index will not change to newkey until * nilfs_btnode_commit_change_key() will be called. * To protect the folio in intermediate state, the folio lock * is held. */ if (!err) return 0; else if (err != -EBUSY) goto failed_unlock; err = invalidate_inode_pages2_range(btnc, newkey, newkey); if (!err) goto retry; /* fallback to copy mode */ folio_unlock(ofolio); } nbh = nilfs_btnode_create_block(btnc, newkey); if (IS_ERR(nbh)) return PTR_ERR(nbh); BUG_ON(nbh == obh); ctxt->newbh = nbh; return 0; failed_unlock: folio_unlock(obh->b_folio); return err; } /** * nilfs_btnode_commit_change_key - commit the change of the search key of * a b-tree node block * @btnc: page cache in which the b-tree node block is buffered * @ctxt: structure for exchanging context information for key change * * nilfs_btnode_commit_change_key() executes the key change based on the * context @ctxt prepared by nilfs_btnode_prepare_change_key(). If no valid * block buffer is prepared in "newbh" of @ctxt (i.e., a full folio move), * this function removes the folio from the old index and completes the move. * Otherwise, it copies the block data and inherited flag states of "oldbh" * to "newbh" and clears the "oldbh" from the cache. In either case, the * relocated buffer is marked as dirty. * * As with nilfs_btnode_prepare_change_key(), the current implementation does * not support folio sizes larger than the page size. 
*/ void nilfs_btnode_commit_change_key(struct address_space *btnc, struct nilfs_btnode_chkey_ctxt *ctxt) { struct buffer_head *obh = ctxt->bh, *nbh = ctxt->newbh; __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; struct folio *ofolio; if (oldkey == newkey) return; if (nbh == NULL) { /* blocksize == pagesize */ ofolio = obh->b_folio; if (unlikely(oldkey != ofolio->index)) NILFS_FOLIO_BUG(ofolio, "invalid oldkey %lld (newkey=%lld)", (unsigned long long)oldkey, (unsigned long long)newkey); mark_buffer_dirty(obh); xa_lock_irq(&btnc->i_pages); __xa_erase(&btnc->i_pages, oldkey); __xa_set_mark(&btnc->i_pages, newkey, PAGECACHE_TAG_DIRTY); xa_unlock_irq(&btnc->i_pages); ofolio->index = obh->b_blocknr = newkey; folio_unlock(ofolio); } else { nilfs_copy_buffer(nbh, obh); mark_buffer_dirty(nbh); nbh->b_blocknr = newkey; ctxt->bh = nbh; nilfs_btnode_delete(obh); /* will decrement bh->b_count */ } } /** * nilfs_btnode_abort_change_key - abort the change of the search key of a * b-tree node block * @btnc: page cache in which the b-tree node block is buffered * @ctxt: structure for exchanging context information for key change * * nilfs_btnode_abort_change_key() cancels the key change associated with the * context @ctxt prepared via nilfs_btnode_prepare_change_key() and performs * any necessary cleanup. If no valid block buffer is prepared in "newbh" of * @ctxt, this function removes the folio from the destination index and aborts * the move. Otherwise, it clears "newbh" from the cache. * * As with nilfs_btnode_prepare_change_key(), the current implementation does * not support folio sizes larger than the page size. */ void nilfs_btnode_abort_change_key(struct address_space *btnc, struct nilfs_btnode_chkey_ctxt *ctxt) { struct buffer_head *nbh = ctxt->newbh; __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; if (oldkey == newkey) return; if (nbh == NULL) { /* blocksize == pagesize */ xa_erase_irq(&btnc->i_pages, newkey); folio_unlock(ctxt->bh->b_folio); } else { /* * When canceling a buffer that a prepare operation has * allocated to copy a node block to another location, use * nilfs_btnode_delete() to initialize and release the buffer * so that the buffer flags will not be in an inconsistent * state when it is reallocated. */ nilfs_btnode_delete(nbh); } } |
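/*
 * Illustrative sketch (not part of the NILFS sources above): callers of the
 * key-change helpers above follow a prepare / commit-or-abort protocol --
 * nilfs_btnode_prepare_change_key() reserves the new key, after which exactly
 * one of the commit or abort functions must run. The standalone example below
 * mimics that protocol over a tiny array-backed map; all names here
 * (tiny_map, chkey_ctxt, and the three helpers) are invented for illustration
 * and do not exist in the kernel.
 */
#include <stdio.h>
#include <string.h>

#define MAP_SLOTS 8

struct tiny_map {
	int used[MAP_SLOTS];		/* 1 if the slot (key) holds a block */
	char data[MAP_SLOTS][16];
};

struct chkey_ctxt {
	int oldkey;
	int newkey;
	int prepared;
};

/* Reserve the new key; fail if it is already occupied (akin to -EBUSY). */
static int prepare_change_key(struct tiny_map *m, struct chkey_ctxt *c)
{
	if (c->oldkey == c->newkey)
		return 0;
	if (m->used[c->newkey])
		return -1;
	m->used[c->newkey] = 1;		/* hold the slot while in flight */
	c->prepared = 1;
	return 0;
}

/* Move the payload to the new key and release the old one. */
static void commit_change_key(struct tiny_map *m, struct chkey_ctxt *c)
{
	if (!c->prepared)
		return;
	memcpy(m->data[c->newkey], m->data[c->oldkey], sizeof(m->data[0]));
	m->used[c->oldkey] = 0;
	c->prepared = 0;
}

/* Drop the reservation without touching the old entry. */
static void abort_change_key(struct tiny_map *m, struct chkey_ctxt *c)
{
	if (!c->prepared)
		return;
	m->used[c->newkey] = 0;
	c->prepared = 0;
}

int main(void)
{
	struct tiny_map m = { { 0 } };
	struct chkey_ctxt c = { .oldkey = 2, .newkey = 5 };

	m.used[2] = 1;
	strcpy(m.data[2], "btnode");

	if (prepare_change_key(&m, &c) == 0)
		commit_change_key(&m, &c);
	else
		abort_change_key(&m, &c);	/* no-op: nothing was prepared */

	printf("slot2=%d slot5=%d data5=%s\n", m.used[2], m.used[5], m.data[5]);
	return 0;
}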
// SPDX-License-Identifier: GPL-2.0-only
/*
 * kexec.c - kexec system call core code.
* Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/btf.h> #include <linux/capability.h> #include <linux/mm.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/kexec.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/liveupdate.h> #include <linux/highmem.h> #include <linux/syscalls.h> #include <linux/reboot.h> #include <linux/ioport.h> #include <linux/hardirq.h> #include <linux/elf.h> #include <linux/elfcore.h> #include <linux/utsname.h> #include <linux/numa.h> #include <linux/suspend.h> #include <linux/device.h> #include <linux/freezer.h> #include <linux/panic_notifier.h> #include <linux/pm.h> #include <linux/cpu.h> #include <linux/uaccess.h> #include <linux/io.h> #include <linux/console.h> #include <linux/vmalloc.h> #include <linux/swap.h> #include <linux/syscore_ops.h> #include <linux/compiler.h> #include <linux/hugetlb.h> #include <linux/objtool.h> #include <linux/kmsg_dump.h> #include <linux/dma-map-ops.h> #include <linux/sysfs.h> #include <asm/page.h> #include <asm/sections.h> #include "kexec_internal.h" atomic_t __kexec_lock = ATOMIC_INIT(0); /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; bool kexec_file_dbg_print; /* * When kexec transitions to the new kernel there is a one-to-one * mapping between physical and virtual addresses. On processors * where you can disable the MMU this is trivial, and easy. For * others it is still a simple predictable page table to setup. * * In that environment kexec copies the new kernel to its final * resting place. This means I can only support memory whose * physical address can fit in an unsigned long. In particular * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. * If the assembly stub has more restrictive requirements * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be * defined more restrictively in <asm/kexec.h>. * * The code for the transition from the current kernel to the * new kernel is placed in the control_code_buffer, whose size * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single * page of memory is necessary, but some architectures require more. * Because this memory must be identity mapped in the transition from * virtual to physical addresses it must live in the range * 0 - TASK_SIZE, as only the user space mappings are arbitrarily * modifiable. * * The assembly stub in the control code buffer is passed a linked list * of descriptor pages detailing the source pages of the new kernel, * and the destination addresses of those source pages. As this data * structure is not used in the context of the current OS, it must * be self-contained. * * The code has been made to work with highmem pages and will use a * destination page in its final resting place (if it happens * to allocate it). The end product of this is that most of the * physical address space, and most of RAM can be used. * * Future directions include: * - allocating a page table with the control code buffer identity * mapped, to simplify machine_kexec and make kexec_on_panic more * reliable. */ /* * KIMAGE_NO_DEST is an impossible destination address..., for * allocating pages whose destination address we do not care about. 
*/ #define KIMAGE_NO_DEST (-1UL) #define PAGE_COUNT(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT) static struct page *kimage_alloc_page(struct kimage *image, gfp_t gfp_mask, unsigned long dest); int sanity_check_segment_list(struct kimage *image) { int i; unsigned long nr_segments = image->nr_segments; unsigned long total_pages = 0; unsigned long nr_pages = totalram_pages(); /* * Verify we have good destination addresses. The caller is * responsible for making certain we don't attempt to load * the new image into invalid or reserved areas of RAM. This * just verifies it is an address we can use. * * Since the kernel does everything in page size chunks ensure * the destination addresses are page aligned. Too many * special cases crop of when we don't do this. The most * insidious is getting overlapping destination addresses * simply because addresses are changed to page size * granularity. */ for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; if (mstart > mend) return -EADDRNOTAVAIL; if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) return -EADDRNOTAVAIL; if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) return -EADDRNOTAVAIL; } /* Verify our destination addresses do not overlap. * If we alloed overlapping destination addresses * through very weird things can happen with no * easy explanation as one segment stops on another. */ for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; unsigned long j; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; for (j = 0; j < i; j++) { unsigned long pstart, pend; pstart = image->segment[j].mem; pend = pstart + image->segment[j].memsz; /* Do the segments overlap ? */ if ((mend > pstart) && (mstart < pend)) return -EINVAL; } } /* Ensure our buffer sizes are strictly less than * our memory sizes. This should always be the case, * and it is easier to check up front than to be surprised * later on. */ for (i = 0; i < nr_segments; i++) { if (image->segment[i].bufsz > image->segment[i].memsz) return -EINVAL; } /* * Verify that no more than half of memory will be consumed. If the * request from userspace is too large, a large amount of time will be * wasted allocating pages, which can cause a soft lockup. */ for (i = 0; i < nr_segments; i++) { if (PAGE_COUNT(image->segment[i].memsz) > nr_pages / 2) return -EINVAL; total_pages += PAGE_COUNT(image->segment[i].memsz); } if (total_pages > nr_pages / 2) return -EINVAL; #ifdef CONFIG_CRASH_DUMP /* * Verify we have good destination addresses. Normally * the caller is responsible for making certain we don't * attempt to load the new image into invalid or reserved * areas of RAM. But crash kernels are preloaded into a * reserved area of ram. We must ensure the addresses * are in the reserved area otherwise preloading the * kernel could corrupt things. */ if (image->type == KEXEC_TYPE_CRASH) { for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz - 1; /* Ensure we are within the crash kernel limits */ if ((mstart < phys_to_boot_phys(crashk_res.start)) || (mend > phys_to_boot_phys(crashk_res.end))) return -EADDRNOTAVAIL; } } #endif /* * The destination addresses are searched from system RAM rather than * being allocated from the buddy allocator, so they are not guaranteed * to be accepted by the current kernel. 
Accept the destination * addresses before kexec swaps their content with the segments' source * pages to avoid accessing memory before it is accepted. */ for (i = 0; i < nr_segments; i++) accept_memory(image->segment[i].mem, image->segment[i].memsz); return 0; } struct kimage *do_kimage_alloc_init(void) { struct kimage *image; /* Allocate a controlling structure */ image = kzalloc_obj(*image); if (!image) return NULL; image->entry = &image->head; image->last_entry = &image->head; image->control_page = ~0; /* By default this does not apply */ image->type = KEXEC_TYPE_DEFAULT; /* Initialize the list of control pages */ INIT_LIST_HEAD(&image->control_pages); /* Initialize the list of destination pages */ INIT_LIST_HEAD(&image->dest_pages); /* Initialize the list of unusable pages */ INIT_LIST_HEAD(&image->unusable_pages); #ifdef CONFIG_CRASH_HOTPLUG image->hp_action = KEXEC_CRASH_HP_NONE; image->elfcorehdr_index = -1; image->elfcorehdr_updated = false; #endif return image; } int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end) { unsigned long i; for (i = 0; i < image->nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz - 1; if ((end >= mstart) && (start <= mend)) return 1; } return 0; } static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) { struct page *pages; if (fatal_signal_pending(current)) return NULL; pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order); if (pages) { unsigned int count, i; pages->mapping = NULL; set_page_private(pages, order); count = 1 << order; for (i = 0; i < count; i++) SetPageReserved(pages + i); arch_kexec_post_alloc_pages(page_address(pages), count, gfp_mask); if (gfp_mask & __GFP_ZERO) for (i = 0; i < count; i++) clear_highpage(pages + i); } return pages; } static void kimage_free_pages(struct page *page) { unsigned int order, count, i; order = page_private(page); count = 1 << order; arch_kexec_pre_free_pages(page_address(page), count); for (i = 0; i < count; i++) ClearPageReserved(page + i); __free_pages(page, order); } void kimage_free_page_list(struct list_head *list) { struct page *page, *next; list_for_each_entry_safe(page, next, list, lru) { list_del(&page->lru); kimage_free_pages(page); } } static struct page *kimage_alloc_normal_control_pages(struct kimage *image, unsigned int order) { /* Control pages are special, they are the intermediaries * that are needed while we copy the rest of the pages * to their final resting place. As such they must * not conflict with either the destination addresses * or memory the kernel is already using. * * The only case where we really need more than one of * these are for architectures where we cannot disable * the MMU and must instead generate an identity mapped * page table for all of the memory. * * At worst this runs in O(N) of the image size. */ struct list_head extra_pages; struct page *pages; unsigned int count; count = 1 << order; INIT_LIST_HEAD(&extra_pages); /* Loop while I can allocate a page and the page allocated * is a destination page. 
*/ do { unsigned long pfn, epfn, addr, eaddr; pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order); if (!pages) break; pfn = page_to_boot_pfn(pages); epfn = pfn + count; addr = pfn << PAGE_SHIFT; eaddr = (epfn << PAGE_SHIFT) - 1; if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || kimage_is_destination_range(image, addr, eaddr)) { list_add(&pages->lru, &extra_pages); pages = NULL; } } while (!pages); if (pages) { /* Remember the allocated page... */ list_add(&pages->lru, &image->control_pages); /* Because the page is already in it's destination * location we will never allocate another page at * that address. Therefore kimage_alloc_pages * will not return it (again) and we don't need * to give it an entry in image->segment[]. */ } /* Deal with the destination pages I have inadvertently allocated. * * Ideally I would convert multi-page allocations into single * page allocations, and add everything to image->dest_pages. * * For now it is simpler to just free the pages. */ kimage_free_page_list(&extra_pages); return pages; } #ifdef CONFIG_CRASH_DUMP static struct page *kimage_alloc_crash_control_pages(struct kimage *image, unsigned int order) { /* Control pages are special, they are the intermediaries * that are needed while we copy the rest of the pages * to their final resting place. As such they must * not conflict with either the destination addresses * or memory the kernel is already using. * * Control pages are also the only pags we must allocate * when loading a crash kernel. All of the other pages * are specified by the segments and we just memcpy * into them directly. * * The only case where we really need more than one of * these are for architectures where we cannot disable * the MMU and must instead generate an identity mapped * page table for all of the memory. * * Given the low demand this implements a very simple * allocator that finds the first hole of the appropriate * size in the reserved memory region, and allocates all * of the memory up to and including the hole. */ unsigned long hole_start, hole_end, size; struct page *pages; pages = NULL; size = (1 << order) << PAGE_SHIFT; hole_start = ALIGN(image->control_page, size); hole_end = hole_start + size - 1; while (hole_end <= crashk_res.end) { unsigned long i; cond_resched(); if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) break; /* See if I overlap any of the segments */ for (i = 0; i < image->nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz - 1; if ((hole_end >= mstart) && (hole_start <= mend)) { /* Advance the hole to the end of the segment */ hole_start = ALIGN(mend, size); hole_end = hole_start + size - 1; break; } } /* If I don't overlap any segments I have found my hole! */ if (i == image->nr_segments) { pages = pfn_to_page(hole_start >> PAGE_SHIFT); image->control_page = hole_end + 1; break; } } /* Ensure that these pages are decrypted if SME is enabled. 
*/ if (pages) arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0); return pages; } #endif struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order) { struct page *pages = NULL; switch (image->type) { case KEXEC_TYPE_DEFAULT: pages = kimage_alloc_normal_control_pages(image, order); break; #ifdef CONFIG_CRASH_DUMP case KEXEC_TYPE_CRASH: pages = kimage_alloc_crash_control_pages(image, order); break; #endif } return pages; } static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) { if (*image->entry != 0) image->entry++; if (image->entry == image->last_entry) { kimage_entry_t *ind_page; struct page *page; page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); if (!page) return -ENOMEM; ind_page = page_address(page); *image->entry = virt_to_boot_phys(ind_page) | IND_INDIRECTION; image->entry = ind_page; image->last_entry = ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); } *image->entry = entry; image->entry++; *image->entry = 0; return 0; } static int kimage_set_destination(struct kimage *image, unsigned long destination) { destination &= PAGE_MASK; return kimage_add_entry(image, destination | IND_DESTINATION); } static int kimage_add_page(struct kimage *image, unsigned long page) { page &= PAGE_MASK; return kimage_add_entry(image, page | IND_SOURCE); } static void kimage_free_extra_pages(struct kimage *image) { /* Walk through and free any extra destination pages I may have */ kimage_free_page_list(&image->dest_pages); /* Walk through and free any unusable pages I have cached */ kimage_free_page_list(&image->unusable_pages); } void kimage_terminate(struct kimage *image) { if (*image->entry != 0) image->entry++; *image->entry = IND_DONE; } #define for_each_kimage_entry(image, ptr, entry) \ for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ ptr = (entry & IND_INDIRECTION) ? \ boot_phys_to_virt((entry & PAGE_MASK)) : ptr + 1) static void kimage_free_entry(kimage_entry_t entry) { struct page *page; page = boot_pfn_to_page(entry >> PAGE_SHIFT); kimage_free_pages(page); } static void kimage_free_cma(struct kimage *image) { unsigned long i; for (i = 0; i < image->nr_segments; i++) { struct page *cma = image->segment_cma[i]; u32 nr_pages = image->segment[i].memsz >> PAGE_SHIFT; if (!cma) continue; arch_kexec_pre_free_pages(page_address(cma), nr_pages); dma_release_from_contiguous(NULL, cma, nr_pages); image->segment_cma[i] = NULL; } } void kimage_free(struct kimage *image) { kimage_entry_t *ptr, entry; kimage_entry_t ind = 0; if (!image) return; #ifdef CONFIG_CRASH_DUMP if (image->vmcoreinfo_data_copy) { crash_update_vmcoreinfo_safecopy(NULL); vunmap(image->vmcoreinfo_data_copy); } #endif kimage_free_extra_pages(image); for_each_kimage_entry(image, ptr, entry) { if (entry & IND_INDIRECTION) { /* Free the previous indirection page */ if (ind & IND_INDIRECTION) kimage_free_entry(ind); /* Save this indirection page until we are * done with it. */ ind = entry; } else if (entry & IND_SOURCE) kimage_free_entry(entry); } /* Free the final indirection page */ if (ind & IND_INDIRECTION) kimage_free_entry(ind); /* Handle any machine specific cleanup */ machine_kexec_cleanup(image); /* Free the kexec control pages... */ kimage_free_page_list(&image->control_pages); /* Free CMA allocations */ kimage_free_cma(image); /* * Free up any temporary buffers allocated. This might hit if * error occurred much later after buffer allocation. 
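 * For file based kexec the segment data lives in kernel buffers rather than
 * user pages, so kimage_file_post_load_cleanup() has to release those
 * buffers here as well.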
*/ if (image->file_mode) kimage_file_post_load_cleanup(image); kfree(image); } static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page) { kimage_entry_t *ptr, entry; unsigned long destination = 0; for_each_kimage_entry(image, ptr, entry) { if (entry & IND_DESTINATION) destination = entry & PAGE_MASK; else if (entry & IND_SOURCE) { if (page == destination) return ptr; destination += PAGE_SIZE; } } return NULL; } static struct page *kimage_alloc_page(struct kimage *image, gfp_t gfp_mask, unsigned long destination) { /* * Here we implement safeguards to ensure that a source page * is not copied to its destination page before the data on * the destination page is no longer useful. * * To do this we maintain the invariant that a source page is * either its own destination page, or it is not a * destination page at all. * * That is slightly stronger than required, but the proof * that no problems will not occur is trivial, and the * implementation is simply to verify. * * When allocating all pages normally this algorithm will run * in O(N) time, but in the worst case it will run in O(N^2) * time. If the runtime is a problem the data structures can * be fixed. */ struct page *page; unsigned long addr; /* * Walk through the list of destination pages, and see if I * have a match. */ list_for_each_entry(page, &image->dest_pages, lru) { addr = page_to_boot_pfn(page) << PAGE_SHIFT; if (addr == destination) { list_del(&page->lru); return page; } } page = NULL; while (1) { kimage_entry_t *old; /* Allocate a page, if we run out of memory give up */ page = kimage_alloc_pages(gfp_mask, 0); if (!page) return NULL; /* If the page cannot be used file it away */ if (page_to_boot_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { list_add(&page->lru, &image->unusable_pages); continue; } addr = page_to_boot_pfn(page) << PAGE_SHIFT; /* If it is the destination page we want use it */ if (addr == destination) break; /* If the page is not a destination page use it */ if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE - 1)) break; /* * I know that the page is someones destination page. * See if there is already a source page for this * destination page. And if so swap the source pages. */ old = kimage_dst_used(image, addr); if (old) { /* If so move it */ unsigned long old_addr; struct page *old_page; old_addr = *old & PAGE_MASK; old_page = boot_pfn_to_page(old_addr >> PAGE_SHIFT); copy_highpage(page, old_page); *old = addr | (*old & ~PAGE_MASK); /* The old page I have found cannot be a * destination page, so return it if it's * gfp_flags honor the ones passed in. 
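 * Otherwise (for instance a highmem page handed to a caller that did not
 * ask for __GFP_HIGHMEM) free the old page and retry the allocation loop.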
*/ if (!(gfp_mask & __GFP_HIGHMEM) && PageHighMem(old_page)) { kimage_free_pages(old_page); continue; } page = old_page; break; } /* Place the page on the destination list, to be used later */ list_add(&page->lru, &image->dest_pages); } return page; } static int kimage_load_cma_segment(struct kimage *image, int idx) { struct kexec_segment *segment = &image->segment[idx]; struct page *cma = image->segment_cma[idx]; char *ptr = page_address(cma); size_t ubytes, mbytes; int result = 0; unsigned char __user *buf = NULL; unsigned char *kbuf = NULL; if (image->file_mode) kbuf = segment->kbuf; else buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; /* Then copy from source buffer to the CMA one */ while (mbytes) { size_t uchunk, mchunk; mchunk = min_t(size_t, mbytes, PAGE_SIZE); uchunk = min(ubytes, mchunk); if (uchunk) { /* For file based kexec, source pages are in kernel memory */ if (image->file_mode) memcpy(ptr, kbuf, uchunk); else result = copy_from_user(ptr, buf, uchunk); ubytes -= uchunk; if (image->file_mode) kbuf += uchunk; else buf += uchunk; } if (result) { result = -EFAULT; goto out; } ptr += mchunk; mbytes -= mchunk; cond_resched(); } /* Clear any remainder */ memset(ptr, 0, mbytes); out: return result; } static int kimage_load_normal_segment(struct kimage *image, int idx) { struct kexec_segment *segment = &image->segment[idx]; unsigned long maddr; size_t ubytes, mbytes; int result; unsigned char __user *buf = NULL; unsigned char *kbuf = NULL; if (image->file_mode) kbuf = segment->kbuf; else buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; if (image->segment_cma[idx]) return kimage_load_cma_segment(image, idx); result = kimage_set_destination(image, maddr); if (result < 0) goto out; while (mbytes) { struct page *page; char *ptr; size_t uchunk, mchunk; page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); if (!page) { result = -ENOMEM; goto out; } result = kimage_add_page(image, page_to_boot_pfn(page) << PAGE_SHIFT); if (result < 0) goto out; ptr = kmap_local_page(page); /* Start with a clear page */ clear_page(ptr); mchunk = min_t(size_t, mbytes, PAGE_SIZE); uchunk = min(ubytes, mchunk); if (uchunk) { /* For file based kexec, source pages are in kernel memory */ if (image->file_mode) memcpy(ptr, kbuf, uchunk); else result = copy_from_user(ptr, buf, uchunk); ubytes -= uchunk; if (image->file_mode) kbuf += uchunk; else buf += uchunk; } kunmap_local(ptr); if (result) { result = -EFAULT; goto out; } maddr += mchunk; mbytes -= mchunk; cond_resched(); } out: return result; } #ifdef CONFIG_CRASH_DUMP static int kimage_load_crash_segment(struct kimage *image, int idx) { /* For crash dumps kernels we simply copy the data from * user space to it's destination. * We do things a page at a time for the sake of kmap. 
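 * The destination pages already sit in the reserved crash kernel region, so
 * no indirection entries need to be recorded; the data is simply in place
 * when the crash kernel is entered.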
*/ struct kexec_segment *segment = &image->segment[idx]; unsigned long maddr; size_t ubytes, mbytes; int result; unsigned char __user *buf = NULL; unsigned char *kbuf = NULL; result = 0; if (image->file_mode) kbuf = segment->kbuf; else buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; while (mbytes) { struct page *page; char *ptr; size_t uchunk, mchunk; page = boot_pfn_to_page(maddr >> PAGE_SHIFT); if (!page) { result = -ENOMEM; goto out; } arch_kexec_post_alloc_pages(page_address(page), 1, 0); ptr = kmap_local_page(page); mchunk = min_t(size_t, mbytes, PAGE_SIZE); uchunk = min(ubytes, mchunk); if (mchunk > uchunk) { /* Zero the trailing part of the page */ memset(ptr + uchunk, 0, mchunk - uchunk); } if (uchunk) { /* For file based kexec, source pages are in kernel memory */ if (image->file_mode) memcpy(ptr, kbuf, uchunk); else result = copy_from_user(ptr, buf, uchunk); ubytes -= uchunk; if (image->file_mode) kbuf += uchunk; else buf += uchunk; } kexec_flush_icache_page(page); kunmap_local(ptr); arch_kexec_pre_free_pages(page_address(page), 1); if (result) { result = -EFAULT; goto out; } maddr += mchunk; mbytes -= mchunk; cond_resched(); } out: return result; } #endif int kimage_load_segment(struct kimage *image, int idx) { int result = -ENOMEM; switch (image->type) { case KEXEC_TYPE_DEFAULT: result = kimage_load_normal_segment(image, idx); break; #ifdef CONFIG_CRASH_DUMP case KEXEC_TYPE_CRASH: result = kimage_load_crash_segment(image, idx); break; #endif } return result; } void *kimage_map_segment(struct kimage *image, int idx) { unsigned long addr, size, eaddr; unsigned long src_page_addr, dest_page_addr = 0; kimage_entry_t *ptr, entry; struct page **src_pages; unsigned int npages; struct page *cma; void *vaddr = NULL; int i; cma = image->segment_cma[idx]; if (cma) return page_address(cma); addr = image->segment[idx].mem; size = image->segment[idx].memsz; eaddr = addr + size; /* * Collect the source pages and map them in a contiguous VA range. */ npages = PFN_UP(eaddr) - PFN_DOWN(addr); src_pages = kmalloc_objs(*src_pages, npages); if (!src_pages) { pr_err("Could not allocate ima pages array.\n"); return NULL; } i = 0; for_each_kimage_entry(image, ptr, entry) { if (entry & IND_DESTINATION) { dest_page_addr = entry & PAGE_MASK; } else if (entry & IND_SOURCE) { if (dest_page_addr >= addr && dest_page_addr < eaddr) { src_page_addr = entry & PAGE_MASK; src_pages[i++] = virt_to_page(__va(src_page_addr)); if (i == npages) break; dest_page_addr += PAGE_SIZE; } } } /* Sanity check. */ WARN_ON(i < npages); vaddr = vmap(src_pages, npages, VM_MAP, PAGE_KERNEL); kfree(src_pages); if (!vaddr) pr_err("Could not map ima buffer.\n"); return vaddr; } void kimage_unmap_segment(void *segment_buffer) { if (is_vmalloc_addr(segment_buffer)) vunmap(segment_buffer); } struct kexec_load_limit { /* Mutex protects the limit count. 
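 * A limit of -1 means "unlimited"; any other value is decremented by
 * kexec_load_permitted() on each permitted load until it reaches zero,
 * after which further loads of that type are refused.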
*/ struct mutex mutex; int limit; }; static struct kexec_load_limit load_limit_reboot = { .mutex = __MUTEX_INITIALIZER(load_limit_reboot.mutex), .limit = -1, }; static struct kexec_load_limit load_limit_panic = { .mutex = __MUTEX_INITIALIZER(load_limit_panic.mutex), .limit = -1, }; struct kimage *kexec_image; struct kimage *kexec_crash_image; static int kexec_load_disabled; #ifdef CONFIG_SYSCTL static int kexec_limit_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct kexec_load_limit *limit = table->data; int val; struct ctl_table tmp = { .data = &val, .maxlen = sizeof(val), .mode = table->mode, }; int ret; if (write) { ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (ret) return ret; if (val < 0) return -EINVAL; mutex_lock(&limit->mutex); if (limit->limit != -1 && val >= limit->limit) ret = -EINVAL; else limit->limit = val; mutex_unlock(&limit->mutex); return ret; } mutex_lock(&limit->mutex); val = limit->limit; mutex_unlock(&limit->mutex); return proc_dointvec(&tmp, write, buffer, lenp, ppos); } static const struct ctl_table kexec_core_sysctls[] = { { .procname = "kexec_load_disabled", .data = &kexec_load_disabled, .maxlen = sizeof(int), .mode = 0644, /* only handle a transition from default "0" to "1" */ .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_ONE, }, { .procname = "kexec_load_limit_panic", .data = &load_limit_panic, .mode = 0644, .proc_handler = kexec_limit_handler, }, { .procname = "kexec_load_limit_reboot", .data = &load_limit_reboot, .mode = 0644, .proc_handler = kexec_limit_handler, }, }; static int __init kexec_core_sysctl_init(void) { register_sysctl_init("kernel", kexec_core_sysctls); return 0; } late_initcall(kexec_core_sysctl_init); #endif bool kexec_load_permitted(int kexec_image_type) { struct kexec_load_limit *limit; /* * Only the superuser can use the kexec syscall and if it has not * been disabled. */ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) return false; /* Check limit counter and decrease it.*/ limit = (kexec_image_type == KEXEC_TYPE_CRASH) ? &load_limit_panic : &load_limit_reboot; mutex_lock(&limit->mutex); if (!limit->limit) { mutex_unlock(&limit->mutex); return false; } if (limit->limit != -1) limit->limit--; mutex_unlock(&limit->mutex); return true; } /* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. */ int kernel_kexec(void) { int error = 0; if (!kexec_trylock()) return -EBUSY; if (!kexec_image) { error = -EINVAL; goto Unlock; } error = liveupdate_reboot(); if (error) goto Unlock; #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { /* * This flow is analogous to hibernation flows that occur * before creating an image and before jumping from the * restore kernel to the image one, so it uses the same * device callbacks as those two flows. */ pm_prepare_console(); error = freeze_processes(); if (error) { error = -EBUSY; goto Restore_console; } console_suspend_all(); error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Resume_devices; /* * dpm_suspend_end() must be called after dpm_suspend_start() * to complete the transition, like in the hibernation flows * mentioned above. 
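 * Interrupts are still enabled at this point; they are only disabled once
 * the secondary CPUs have been taken offline further below.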
*/ error = dpm_suspend_end(PMSG_FREEZE); if (error) goto Resume_devices; error = suspend_disable_secondary_cpus(); if (error) goto Enable_cpus; local_irq_disable(); error = syscore_suspend(); if (error) goto Enable_irqs; } else #endif { kexec_in_progress = true; kernel_restart_prepare("kexec reboot"); migrate_to_reboot_cpu(); syscore_shutdown(); /* * migrate_to_reboot_cpu() disables CPU hotplug assuming that * no further code needs to use CPU hotplug (which is true in * the reboot case). However, the kexec path depends on using * CPU hotplug again; so re-enable it here. */ cpu_hotplug_enable(); pr_notice("Starting new kernel\n"); machine_shutdown(); } kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_kexec(kexec_image); #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { /* * This flow is analogous to hibernation flows that occur after * creating an image and after the image kernel has got control * back, and in case the devices have been reset or otherwise * manipulated in the meantime, it uses the device callbacks * used by the latter. */ syscore_resume(); Enable_irqs: local_irq_enable(); Enable_cpus: suspend_enable_secondary_cpus(); dpm_resume_start(PMSG_RESTORE); Resume_devices: dpm_resume_end(PMSG_RESTORE); console_resume_all(); thaw_processes(); Restore_console: pm_restore_console(); } #endif Unlock: kexec_unlock(); return error; } static ssize_t loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", !!kexec_image); } static struct kobj_attribute loaded_attr = __ATTR_RO(loaded); #ifdef CONFIG_CRASH_DUMP static ssize_t crash_loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", kexec_crash_loaded()); } static struct kobj_attribute crash_loaded_attr = __ATTR_RO(crash_loaded); #ifdef CONFIG_CRASH_RESERVE static ssize_t crash_cma_ranges_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t len = 0; int i; for (i = 0; i < crashk_cma_cnt; ++i) { len += sysfs_emit_at(buf, len, "%08llx-%08llx\n", crashk_cma_ranges[i].start, crashk_cma_ranges[i].end); } return len; } static struct kobj_attribute crash_cma_ranges_attr = __ATTR_RO(crash_cma_ranges); #endif static ssize_t crash_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t size = crash_get_memory_size(); if (size < 0) return size; return sysfs_emit(buf, "%zd\n", size); } static ssize_t crash_size_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned long cnt; int ret; if (kstrtoul(buf, 0, &cnt)) return -EINVAL; ret = crash_shrink_memory(cnt); return ret < 0 ? 
ret : count; } static struct kobj_attribute crash_size_attr = __ATTR_RW(crash_size); #ifdef CONFIG_CRASH_HOTPLUG static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { unsigned int sz = crash_get_elfcorehdr_size(); return sysfs_emit(buf, "%u\n", sz); } static struct kobj_attribute crash_elfcorehdr_size_attr = __ATTR_RO(crash_elfcorehdr_size); #endif /* CONFIG_CRASH_HOTPLUG */ #endif /* CONFIG_CRASH_DUMP */ static struct attribute *kexec_attrs[] = { &loaded_attr.attr, #ifdef CONFIG_CRASH_DUMP &crash_loaded_attr.attr, &crash_size_attr.attr, #ifdef CONFIG_CRASH_RESERVE &crash_cma_ranges_attr.attr, #endif #ifdef CONFIG_CRASH_HOTPLUG &crash_elfcorehdr_size_attr.attr, #endif #endif NULL }; struct kexec_link_entry { const char *target; const char *name; }; static struct kexec_link_entry kexec_links[] = { { "loaded", "kexec_loaded" }, #ifdef CONFIG_CRASH_DUMP { "crash_loaded", "kexec_crash_loaded" }, { "crash_size", "kexec_crash_size" }, #ifdef CONFIG_CRASH_RESERVE {"crash_cma_ranges", "kexec_crash_cma_ranges"}, #endif #ifdef CONFIG_CRASH_HOTPLUG { "crash_elfcorehdr_size", "crash_elfcorehdr_size" }, #endif #endif }; static struct kobject *kexec_kobj; ATTRIBUTE_GROUPS(kexec); static int __init init_kexec_sysctl(void) { int error; int i; kexec_kobj = kobject_create_and_add("kexec", kernel_kobj); if (!kexec_kobj) { pr_err("failed to create kexec kobject\n"); return -ENOMEM; } error = sysfs_create_groups(kexec_kobj, kexec_groups); if (error) goto kset_exit; for (i = 0; i < ARRAY_SIZE(kexec_links); i++) { error = compat_only_sysfs_link_entry_to_kobj(kernel_kobj, kexec_kobj, kexec_links[i].target, kexec_links[i].name); if (error) pr_err("Unable to create %s symlink (%d)", kexec_links[i].name, error); } return 0; kset_exit: kobject_put(kexec_kobj); return error; } subsys_initcall(init_kexec_sysctl); |
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _INET_ECN_H_
#define _INET_ECN_H_

#include <linux/ip.h>
#include <linux/skbuff.h>
#include <linux/if_vlan.h>

#include <net/inet_sock.h>
#include <net/dsfield.h>
#include <net/checksum.h>

enum {
	INET_ECN_NOT_ECT = 0,
	INET_ECN_ECT_1 = 1,
	INET_ECN_ECT_0 = 2,
	INET_ECN_CE = 3,
	INET_ECN_MASK = 3,
};

extern int sysctl_tunnel_ecn_log;

static inline int INET_ECN_is_ce(__u8 dsfield)
{
	return (dsfield & INET_ECN_MASK) == INET_ECN_CE;
}

static inline int INET_ECN_is_not_ect(__u8 dsfield)
{
	return (dsfield & INET_ECN_MASK) == INET_ECN_NOT_ECT;
}

static inline int INET_ECN_is_capable(__u8 dsfield)
{
	return dsfield & INET_ECN_ECT_0;
}

/*
 * RFC 3168 9.1.1
 *  The full-functionality option for ECN encapsulation is to copy the
 *  ECN codepoint of the inside header to the outside header on
 *  encapsulation if the inside header is not-ECT or ECT, and to set the
 *  ECN codepoint of the outside header to ECT(0) if the ECN codepoint of
 *  the inside header is CE.
 */
static inline __u8 INET_ECN_encapsulate(__u8 outer, __u8 inner)
{
	outer &= ~INET_ECN_MASK;
	outer |= !INET_ECN_is_ce(inner) ? (inner & INET_ECN_MASK) :
					  INET_ECN_ECT_0;
	return outer;
}

/* Apply either ECT(0) or ECT(1) */
static inline void __INET_ECN_xmit(struct sock *sk, bool use_ect_1)
{
	__u8 ect = use_ect_1 ? INET_ECN_ECT_1 : INET_ECN_ECT_0;

	/* Mask the complete byte in case the connection alternates between
	 * ECT(0) and ECT(1).
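	 * For IPv6 sockets the traffic class is masked and updated in the
	 * same way below, so tos and tclass carry the same ECN codepoint.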
*/ inet_sk(sk)->tos &= ~INET_ECN_MASK; inet_sk(sk)->tos |= ect; if (inet6_sk(sk)) { inet6_sk(sk)->tclass &= ~INET_ECN_MASK; inet6_sk(sk)->tclass |= ect; } } static inline void INET_ECN_xmit(struct sock *sk) { __INET_ECN_xmit(sk, false); } static inline void INET_ECN_dontxmit(struct sock *sk) { inet_sk(sk)->tos &= ~INET_ECN_MASK; if (inet6_sk(sk) != NULL) inet6_sk(sk)->tclass &= ~INET_ECN_MASK; } #define IP6_ECN_flow_init(label) do { \ (label) &= ~htonl(INET_ECN_MASK << 20); \ } while (0) #define IP6_ECN_flow_xmit(sk, label) do { \ if (INET_ECN_is_capable(inet6_sk(sk)->tclass)) \ (label) |= htonl(INET_ECN_ECT_0 << 20); \ } while (0) static inline int IP_ECN_set_ce(struct iphdr *iph) { u32 ecn = (iph->tos + 1) & INET_ECN_MASK; __be16 check_add; /* * After the last operation we have (in binary): * INET_ECN_NOT_ECT => 01 * INET_ECN_ECT_1 => 10 * INET_ECN_ECT_0 => 11 * INET_ECN_CE => 00 */ if (!(ecn & 2)) return !ecn; /* * The following gives us: * INET_ECN_ECT_1 => check += htons(0xFFFD) * INET_ECN_ECT_0 => check += htons(0xFFFE) */ check_add = (__force __be16)((__force u16)htons(0xFFFB) + (__force u16)htons(ecn)); iph->check = csum16_add(iph->check, check_add); iph->tos |= INET_ECN_CE; return 1; } static inline int IP_ECN_set_ect1(struct iphdr *iph) { if ((iph->tos & INET_ECN_MASK) != INET_ECN_ECT_0) return 0; iph->check = csum16_add(iph->check, htons(0x1)); iph->tos ^= INET_ECN_MASK; return 1; } static inline void IP_ECN_clear(struct iphdr *iph) { iph->tos &= ~INET_ECN_MASK; } static inline void ipv4_copy_dscp(unsigned int dscp, struct iphdr *inner) { dscp &= ~INET_ECN_MASK; ipv4_change_dsfield(inner, INET_ECN_MASK, dscp); } struct ipv6hdr; /* Note: * IP_ECN_set_ce() has to tweak IPV4 checksum when setting CE, * meaning both changes have no effect on skb->csum if/when CHECKSUM_COMPLETE * In IPv6 case, no checksum compensates the change in IPv6 header, * so we have to update skb->csum. 
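 * The helpers below therefore fold the old and new flow words into
 * skb->csum with csum_sub()/csum_add() when CHECKSUM_COMPLETE is in use.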
*/ static inline int IP6_ECN_set_ce(struct sk_buff *skb, struct ipv6hdr *iph) { __be32 from, to; if (INET_ECN_is_not_ect(ipv6_get_dsfield(iph))) return 0; from = *(__be32 *)iph; to = from | htonl(INET_ECN_CE << 20); *(__be32 *)iph = to; if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)from), (__force __wsum)to); return 1; } static inline int IP6_ECN_set_ect1(struct sk_buff *skb, struct ipv6hdr *iph) { __be32 from, to; if ((ipv6_get_dsfield(iph) & INET_ECN_MASK) != INET_ECN_ECT_0) return 0; from = *(__be32 *)iph; to = from ^ htonl(INET_ECN_MASK << 20); *(__be32 *)iph = to; if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)from), (__force __wsum)to); return 1; } static inline void ipv6_copy_dscp(unsigned int dscp, struct ipv6hdr *inner) { dscp &= ~INET_ECN_MASK; ipv6_change_dsfield(inner, INET_ECN_MASK, dscp); } static inline int INET_ECN_set_ce(struct sk_buff *skb) { switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): if (skb_network_header(skb) + sizeof(struct iphdr) <= skb_tail_pointer(skb)) return IP_ECN_set_ce(ip_hdr(skb)); break; case cpu_to_be16(ETH_P_IPV6): if (skb_network_header(skb) + sizeof(struct ipv6hdr) <= skb_tail_pointer(skb)) return IP6_ECN_set_ce(skb, ipv6_hdr(skb)); break; } return 0; } static inline int skb_get_dsfield(struct sk_buff *skb) { switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) break; return ipv4_get_dsfield(ip_hdr(skb)); case cpu_to_be16(ETH_P_IPV6): if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) break; return ipv6_get_dsfield(ipv6_hdr(skb)); } return -1; } static inline int INET_ECN_set_ect1(struct sk_buff *skb) { switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): if (skb_network_header(skb) + sizeof(struct iphdr) <= skb_tail_pointer(skb)) return IP_ECN_set_ect1(ip_hdr(skb)); break; case cpu_to_be16(ETH_P_IPV6): if (skb_network_header(skb) + sizeof(struct ipv6hdr) <= skb_tail_pointer(skb)) return IP6_ECN_set_ect1(skb, ipv6_hdr(skb)); break; } return 0; } /* * RFC 6040 4.2 * To decapsulate the inner header at the tunnel egress, a compliant * tunnel egress MUST set the outgoing ECN field to the codepoint at the * intersection of the appropriate arriving inner header (row) and outer * header (column) in Figure 4 * * +---------+------------------------------------------------+ * |Arriving | Arriving Outer Header | * | Inner +---------+------------+------------+------------+ * | Header | Not-ECT | ECT(0) | ECT(1) | CE | * +---------+---------+------------+------------+------------+ * | Not-ECT | Not-ECT |Not-ECT(!!!)|Not-ECT(!!!)| <drop>(!!!)| * | ECT(0) | ECT(0) | ECT(0) | ECT(1) | CE | * | ECT(1) | ECT(1) | ECT(1) (!) | ECT(1) | CE | * | CE | CE | CE | CE(!!!)| CE | * +---------+---------+------------+------------+------------+ * * Figure 4: New IP in IP Decapsulation Behaviour * * returns 0 on success * 1 if something is broken and should be logged (!!! 
above) * 2 if packet should be dropped */ static inline int __INET_ECN_decapsulate(__u8 outer, __u8 inner, bool *set_ce) { if (INET_ECN_is_not_ect(inner)) { switch (outer & INET_ECN_MASK) { case INET_ECN_NOT_ECT: return 0; case INET_ECN_ECT_0: case INET_ECN_ECT_1: return 1; case INET_ECN_CE: return 2; } } *set_ce = INET_ECN_is_ce(outer); return 0; } static inline int INET_ECN_decapsulate(struct sk_buff *skb, __u8 outer, __u8 inner) { bool set_ce = false; int rc; rc = __INET_ECN_decapsulate(outer, inner, &set_ce); if (!rc) { if (set_ce) INET_ECN_set_ce(skb); else if ((outer & INET_ECN_MASK) == INET_ECN_ECT_1) INET_ECN_set_ect1(skb); } return rc; } static inline int IP_ECN_decapsulate(const struct iphdr *oiph, struct sk_buff *skb) { __u8 inner; switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): inner = ip_hdr(skb)->tos; break; case htons(ETH_P_IPV6): inner = ipv6_get_dsfield(ipv6_hdr(skb)); break; default: return 0; } return INET_ECN_decapsulate(skb, oiph->tos, inner); } static inline int IP6_ECN_decapsulate(const struct ipv6hdr *oipv6h, struct sk_buff *skb) { __u8 inner; switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): inner = ip_hdr(skb)->tos; break; case htons(ETH_P_IPV6): inner = ipv6_get_dsfield(ipv6_hdr(skb)); break; default: return 0; } return INET_ECN_decapsulate(skb, ipv6_get_dsfield(oipv6h), inner); } #endif |
1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 | // SPDX-License-Identifier: GPL-2.0 /* * drivers/base/power/runtime.c - Helper functions for device runtime PM * * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. 
* Copyright (C) 2010 Alan Stern <stern@rowland.harvard.edu> */ #include <linux/sched/mm.h> #include <linux/ktime.h> #include <linux/hrtimer.h> #include <linux/export.h> #include <linux/pm_runtime.h> #include <linux/pm_wakeirq.h> #include <linux/rculist.h> #include <trace/events/rpm.h> #include "../base.h" #include "power.h" typedef int (*pm_callback_t)(struct device *); static inline pm_callback_t get_callback_ptr(const void *start, size_t offset) { return *(pm_callback_t *)(start + offset); } static pm_callback_t __rpm_get_driver_callback(struct device *dev, size_t cb_offset) { if (dev->driver && dev->driver->pm) return get_callback_ptr(dev->driver->pm, cb_offset); return NULL; } static pm_callback_t __rpm_get_callback(struct device *dev, size_t cb_offset) { const struct dev_pm_ops *ops; pm_callback_t cb = NULL; if (dev->pm_domain) ops = &dev->pm_domain->ops; else if (dev->type && dev->type->pm) ops = dev->type->pm; else if (dev->class && dev->class->pm) ops = dev->class->pm; else if (dev->bus && dev->bus->pm) ops = dev->bus->pm; else ops = NULL; if (ops) cb = get_callback_ptr(ops, cb_offset); if (!cb) cb = __rpm_get_driver_callback(dev, cb_offset); return cb; } #define RPM_GET_CALLBACK(dev, callback) \ __rpm_get_callback(dev, offsetof(struct dev_pm_ops, callback)) static int rpm_resume(struct device *dev, int rpmflags); static int rpm_suspend(struct device *dev, int rpmflags); /** * update_pm_runtime_accounting - Update the time accounting of power states * @dev: Device to update the accounting for * * In order to be able to have time accounting of the various power states * (as used by programs such as PowerTOP to show the effectiveness of runtime * PM), we need to track the time spent in each state. * update_pm_runtime_accounting must be called each time before the * runtime_status field is updated, to account the time in the old state * correctly. */ static void update_pm_runtime_accounting(struct device *dev) { u64 now, last, delta; if (dev->power.disable_depth > 0) return; last = dev->power.accounting_timestamp; now = ktime_get_mono_fast_ns(); dev->power.accounting_timestamp = now; /* * Because ktime_get_mono_fast_ns() is not monotonic during * timekeeping updates, ensure that 'now' is after the last saved * timestamp. */ if (now < last) return; delta = now - last; if (dev->power.runtime_status == RPM_SUSPENDED) dev->power.suspended_time += delta; else dev->power.active_time += delta; } static void __update_runtime_status(struct device *dev, enum rpm_status status) { update_pm_runtime_accounting(dev); trace_rpm_status(dev, status); dev->power.runtime_status = status; } static u64 rpm_get_accounted_time(struct device *dev, bool suspended) { u64 time; unsigned long flags; spin_lock_irqsave(&dev->power.lock, flags); update_pm_runtime_accounting(dev); time = suspended ? dev->power.suspended_time : dev->power.active_time; spin_unlock_irqrestore(&dev->power.lock, flags); return time; } u64 pm_runtime_active_time(struct device *dev) { return rpm_get_accounted_time(dev, false); } u64 pm_runtime_suspended_time(struct device *dev) { return rpm_get_accounted_time(dev, true); } EXPORT_SYMBOL_GPL(pm_runtime_suspended_time); /** * pm_runtime_deactivate_timer - Deactivate given device's suspend timer. * @dev: Device to handle. */ static void pm_runtime_deactivate_timer(struct device *dev) { if (dev->power.timer_expires > 0) { hrtimer_try_to_cancel(&dev->power.suspend_timer); dev->power.timer_expires = 0; } } /** * pm_runtime_cancel_pending - Deactivate suspend timer and cancel requests. 
* @dev: Device to handle. */ static void pm_runtime_cancel_pending(struct device *dev) { pm_runtime_deactivate_timer(dev); /* * In case there's a request pending, make sure its work function will * return without doing anything. */ dev->power.request = RPM_REQ_NONE; } /* * pm_runtime_autosuspend_expiration - Get a device's autosuspend-delay expiration time. * @dev: Device to handle. * * Compute the autosuspend-delay expiration time based on the device's * power.last_busy time. If the delay has already expired or is disabled * (negative) or the power.use_autosuspend flag isn't set, return 0. * Otherwise return the expiration time in nanoseconds (adjusted to be nonzero). * * This function may be called either with or without dev->power.lock held. * Either way it can be racy, since power.last_busy may be updated at any time. */ u64 pm_runtime_autosuspend_expiration(struct device *dev) { int autosuspend_delay; u64 expires; if (!dev->power.use_autosuspend) return 0; autosuspend_delay = READ_ONCE(dev->power.autosuspend_delay); if (autosuspend_delay < 0) return 0; expires = READ_ONCE(dev->power.last_busy); expires += (u64)autosuspend_delay * NSEC_PER_MSEC; if (expires > ktime_get_mono_fast_ns()) return expires; /* Expires in the future */ return 0; } EXPORT_SYMBOL_GPL(pm_runtime_autosuspend_expiration); static int dev_memalloc_noio(struct device *dev, void *data) { return dev->power.memalloc_noio; } /* * pm_runtime_set_memalloc_noio - Set a device's memalloc_noio flag. * @dev: Device to handle. * @enable: True for setting the flag and False for clearing the flag. * * Set the flag for all devices in the path from the device to the * root device in the device tree if @enable is true, otherwise clear * the flag for devices in the path whose siblings don't set the flag. * * The function should only be called by block device, or network * device driver for solving the deadlock problem during runtime * resume/suspend: * * If memory allocation with GFP_KERNEL is called inside runtime * resume/suspend callback of any one of its ancestors(or the * block device itself), the deadlock may be triggered inside the * memory allocation since it might not complete until the block * device becomes active and the involved page I/O finishes. The * situation is pointed out first by Alan Stern. Network device * are involved in iSCSI kind of situation. * * The lock of dev_hotplug_mutex is held in the function for handling * hotplug race because pm_runtime_set_memalloc_noio() may be called * in async probe(). * * The function should be called between device_add() and device_del() * on the affected device(block/network device). */ void pm_runtime_set_memalloc_noio(struct device *dev, bool enable) { static DEFINE_MUTEX(dev_hotplug_mutex); mutex_lock(&dev_hotplug_mutex); for (;;) { bool enabled; /* hold power lock since bitfield is not SMP-safe. */ spin_lock_irq(&dev->power.lock); enabled = dev->power.memalloc_noio; dev->power.memalloc_noio = enable; spin_unlock_irq(&dev->power.lock); /* * not need to enable ancestors any more if the device * has been enabled. */ if (enabled && enable) break; dev = dev->parent; /* * clear flag of the parent device only if all the * children don't set the flag because ancestor's * flag was set by any one of the descendants. */ if (!dev || (!enable && device_for_each_child(dev, NULL, dev_memalloc_noio))) break; } mutex_unlock(&dev_hotplug_mutex); } EXPORT_SYMBOL_GPL(pm_runtime_set_memalloc_noio); /** * rpm_check_suspend_allowed - Test whether a device may be suspended. 
* @dev: Device to test. */ static int rpm_check_suspend_allowed(struct device *dev) { int retval = 0; if (dev->power.runtime_error) retval = -EINVAL; else if (dev->power.disable_depth > 0) retval = -EACCES; else if (atomic_read(&dev->power.usage_count)) retval = -EAGAIN; else if (!dev->power.ignore_children && atomic_read(&dev->power.child_count)) retval = -EBUSY; /* Pending resume requests take precedence over suspends. */ else if ((dev->power.deferred_resume && dev->power.runtime_status == RPM_SUSPENDING) || (dev->power.request_pending && dev->power.request == RPM_REQ_RESUME)) retval = -EAGAIN; else if (__dev_pm_qos_resume_latency(dev) == 0) retval = -EPERM; else if (dev->power.runtime_status == RPM_SUSPENDED) retval = 1; return retval; } static int rpm_get_suppliers(struct device *dev) { struct device_link *link; list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) { int retval; if (!device_link_test(link, DL_FLAG_PM_RUNTIME)) continue; retval = pm_runtime_get_sync(link->supplier); /* Ignore suppliers with disabled runtime PM. */ if (retval < 0 && retval != -EACCES) { pm_runtime_put_noidle(link->supplier); return retval; } refcount_inc(&link->rpm_active); } return 0; } /** * pm_runtime_release_supplier - Drop references to device link's supplier. * @link: Target device link. * * Drop all runtime PM references associated with @link to its supplier device. */ void pm_runtime_release_supplier(struct device_link *link) { struct device *supplier = link->supplier; /* * The additional power.usage_count check is a safety net in case * the rpm_active refcount becomes saturated, in which case * refcount_dec_not_one() would return true forever, but it is not * strictly necessary. */ while (refcount_dec_not_one(&link->rpm_active) && atomic_read(&supplier->power.usage_count) > 0) pm_runtime_put_noidle(supplier); } static void __rpm_put_suppliers(struct device *dev, bool try_to_suspend) { struct device_link *link; list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) { pm_runtime_release_supplier(link); if (try_to_suspend) pm_request_idle(link->supplier); } } static void rpm_put_suppliers(struct device *dev) { __rpm_put_suppliers(dev, true); } static void rpm_suspend_suppliers(struct device *dev) { struct device_link *link; int idx = device_links_read_lock(); list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) pm_request_idle(link->supplier); device_links_read_unlock(idx); } /** * __rpm_callback - Run a given runtime PM callback for a given device. * @cb: Runtime PM callback to run. * @dev: Device to run the callback for. */ static int __rpm_callback(int (*cb)(struct device *), struct device *dev) __releases(&dev->power.lock) __acquires(&dev->power.lock) { int retval = 0, idx; bool use_links = dev->power.links_count > 0; if (dev->power.irq_safe) { spin_unlock(&dev->power.lock); } else { spin_unlock_irq(&dev->power.lock); /* * Resume suppliers if necessary. * * The device's runtime PM status cannot change until this * routine returns, so it is safe to read the status outside of * the lock. 
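 * rpm_get_suppliers() below takes a runtime PM reference on every
 * DL_FLAG_PM_RUNTIME supplier, keeping them active while the consumer's
 * resume callback runs.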
*/ if (use_links && dev->power.runtime_status == RPM_RESUMING) { idx = device_links_read_lock(); retval = rpm_get_suppliers(dev); if (retval) { rpm_put_suppliers(dev); goto fail; } device_links_read_unlock(idx); } } if (cb) retval = cb(dev); if (dev->power.irq_safe) { spin_lock(&dev->power.lock); } else { /* * If the device is suspending and the callback has returned * success, drop the usage counters of the suppliers that have * been reference counted on its resume. * * Do that if resume fails too. */ if (use_links && ((dev->power.runtime_status == RPM_SUSPENDING && !retval) || (dev->power.runtime_status == RPM_RESUMING && retval))) { idx = device_links_read_lock(); __rpm_put_suppliers(dev, false); fail: device_links_read_unlock(idx); } spin_lock_irq(&dev->power.lock); } return retval; } /** * rpm_callback - Run a given runtime PM callback for a given device. * @cb: Runtime PM callback to run. * @dev: Device to run the callback for. */ static int rpm_callback(int (*cb)(struct device *), struct device *dev) { int retval; if (dev->power.memalloc_noio) { unsigned int noio_flag; /* * Deadlock might be caused if memory allocation with * GFP_KERNEL happens inside runtime_suspend and * runtime_resume callbacks of one block device's * ancestor or the block device itself. Network * device might be thought as part of iSCSI block * device, so network device and its ancestor should * be marked as memalloc_noio too. */ noio_flag = memalloc_noio_save(); retval = __rpm_callback(cb, dev); memalloc_noio_restore(noio_flag); } else { retval = __rpm_callback(cb, dev); } /* * Since -EACCES means that runtime PM is disabled for the given device, * it should not be returned by runtime PM callbacks. If it is returned * nevertheless, assume it to be a transient error and convert it to * -EAGAIN. */ if (retval == -EACCES) retval = -EAGAIN; if (retval != -EAGAIN && retval != -EBUSY) dev->power.runtime_error = retval; return retval; } /** * rpm_idle - Notify device bus type if the device can be suspended. * @dev: Device to notify the bus type about. * @rpmflags: Flag bits. * * Check if the device's runtime PM status allows it to be suspended. If * another idle notification has been started earlier, return immediately. If * the RPM_ASYNC flag is set then queue an idle-notification request; otherwise * run the ->runtime_idle() callback directly. If the ->runtime_idle callback * doesn't exist or if it returns 0, call rpm_suspend with the RPM_AUTO flag. * * This function must be called under dev->power.lock with interrupts disabled. */ static int rpm_idle(struct device *dev, int rpmflags) { int (*callback)(struct device *); int retval; trace_rpm_idle(dev, rpmflags); retval = rpm_check_suspend_allowed(dev); if (retval < 0) ; /* Conditions are wrong. */ else if ((rpmflags & RPM_GET_PUT) && retval == 1) ; /* put() is allowed in RPM_SUSPENDED */ /* Idle notifications are allowed only in the RPM_ACTIVE state. */ else if (dev->power.runtime_status != RPM_ACTIVE) retval = -EAGAIN; /* * Any pending request other than an idle notification takes * precedence over us, except that the timer may be running. */ else if (dev->power.request_pending && dev->power.request > RPM_REQ_IDLE) retval = -EAGAIN; /* Act as though RPM_NOWAIT is always set. */ else if (dev->power.idle_notification) retval = -EINPROGRESS; if (retval) goto out; /* Pending requests need to be canceled. */ dev->power.request = RPM_REQ_NONE; callback = RPM_GET_CALLBACK(dev, runtime_idle); /* If no callback assume success. 
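 * and fall through to the rpm_suspend() call at "out", exactly as if the
 * callback had returned 0.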
*/ if (!callback || dev->power.no_callbacks) goto out; /* Carry out an asynchronous or a synchronous idle notification. */ if (rpmflags & RPM_ASYNC) { dev->power.request = RPM_REQ_IDLE; if (!dev->power.request_pending) { dev->power.request_pending = true; queue_work(pm_wq, &dev->power.work); } trace_rpm_return_int(dev, _THIS_IP_, 0); return 0; } dev->power.idle_notification = true; if (dev->power.irq_safe) spin_unlock(&dev->power.lock); else spin_unlock_irq(&dev->power.lock); retval = callback(dev); if (dev->power.irq_safe) spin_lock(&dev->power.lock); else spin_lock_irq(&dev->power.lock); dev->power.idle_notification = false; wake_up_all(&dev->power.wait_queue); out: trace_rpm_return_int(dev, _THIS_IP_, retval); return retval ? retval : rpm_suspend(dev, rpmflags | RPM_AUTO); } /** * rpm_suspend - Carry out runtime suspend of given device. * @dev: Device to suspend. * @rpmflags: Flag bits. * * Check if the device's runtime PM status allows it to be suspended. * Cancel a pending idle notification, autosuspend or suspend. If * another suspend has been started earlier, either return immediately * or wait for it to finish, depending on the RPM_NOWAIT and RPM_ASYNC * flags. If the RPM_ASYNC flag is set then queue a suspend request; * otherwise run the ->runtime_suspend() callback directly. When * ->runtime_suspend succeeded, if a deferred resume was requested while * the callback was running then carry it out, otherwise send an idle * notification for its parent (if the suspend succeeded and both * ignore_children of parent->power and irq_safe of dev->power are not set). * If ->runtime_suspend failed with -EAGAIN or -EBUSY, and if the RPM_AUTO * flag is set and the next autosuspend-delay expiration time is in the * future, schedule another autosuspend attempt. * * This function must be called under dev->power.lock with interrupts disabled. */ static int rpm_suspend(struct device *dev, int rpmflags) __releases(&dev->power.lock) __acquires(&dev->power.lock) { int (*callback)(struct device *); struct device *parent = NULL; int retval; trace_rpm_suspend(dev, rpmflags); repeat: retval = rpm_check_suspend_allowed(dev); if (retval < 0) goto out; /* Conditions are wrong. */ /* Synchronous suspends are not allowed in the RPM_RESUMING state. */ if (dev->power.runtime_status == RPM_RESUMING && !(rpmflags & RPM_ASYNC)) retval = -EAGAIN; if (retval) goto out; /* If the autosuspend_delay time hasn't expired yet, reschedule. */ if ((rpmflags & RPM_AUTO) && dev->power.runtime_status != RPM_SUSPENDING) { u64 expires = pm_runtime_autosuspend_expiration(dev); if (expires != 0) { /* Pending requests need to be canceled. */ dev->power.request = RPM_REQ_NONE; /* * Optimization: If the timer is already running and is * set to expire at or before the autosuspend delay, * avoid the overhead of resetting it. Just let it * expire; pm_suspend_timer_fn() will take care of the * rest. */ if (!(dev->power.timer_expires && dev->power.timer_expires <= expires)) { /* * We add a slack of 25% to gather wakeups * without sacrificing the granularity. */ u64 slack = (u64)READ_ONCE(dev->power.autosuspend_delay) * (NSEC_PER_MSEC >> 2); dev->power.timer_expires = expires; hrtimer_start_range_ns(&dev->power.suspend_timer, ns_to_ktime(expires), slack, HRTIMER_MODE_ABS); } dev->power.timer_autosuspends = 1; goto out; } } /* Other scheduled or pending requests need to be canceled. 
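 * pm_runtime_cancel_pending() also resets power.request, so a work item
 * that is already queued will find nothing to do.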
*/ pm_runtime_cancel_pending(dev); if (dev->power.runtime_status == RPM_SUSPENDING) { DEFINE_WAIT(wait); if (rpmflags & (RPM_ASYNC | RPM_NOWAIT)) { retval = -EINPROGRESS; goto out; } if (dev->power.irq_safe) { spin_unlock(&dev->power.lock); cpu_relax(); spin_lock(&dev->power.lock); goto repeat; } /* Wait for the other suspend running in parallel with us. */ for (;;) { prepare_to_wait(&dev->power.wait_queue, &wait, TASK_UNINTERRUPTIBLE); if (dev->power.runtime_status != RPM_SUSPENDING) break; spin_unlock_irq(&dev->power.lock); schedule(); spin_lock_irq(&dev->power.lock); } finish_wait(&dev->power.wait_queue, &wait); goto repeat; } if (dev->power.no_callbacks) goto no_callback; /* Assume success. */ /* Carry out an asynchronous or a synchronous suspend. */ if (rpmflags & RPM_ASYNC) { dev->power.request = (rpmflags & RPM_AUTO) ? RPM_REQ_AUTOSUSPEND : RPM_REQ_SUSPEND; if (!dev->power.request_pending) { dev->power.request_pending = true; queue_work(pm_wq, &dev->power.work); } goto out; } __update_runtime_status(dev, RPM_SUSPENDING); callback = RPM_GET_CALLBACK(dev, runtime_suspend); dev_pm_enable_wake_irq_check(dev, true); retval = rpm_callback(callback, dev); if (retval) goto fail; dev_pm_enable_wake_irq_complete(dev); no_callback: __update_runtime_status(dev, RPM_SUSPENDED); pm_runtime_deactivate_timer(dev); if (dev->parent) { parent = dev->parent; atomic_add_unless(&parent->power.child_count, -1, 0); } wake_up_all(&dev->power.wait_queue); if (dev->power.deferred_resume) { dev->power.deferred_resume = false; rpm_resume(dev, 0); retval = -EAGAIN; goto out; } if (dev->power.irq_safe) goto out; /* Maybe the parent is now able to suspend. */ if (parent && !parent->power.ignore_children) { spin_unlock(&dev->power.lock); spin_lock(&parent->power.lock); rpm_idle(parent, RPM_ASYNC); spin_unlock(&parent->power.lock); spin_lock(&dev->power.lock); } /* Maybe the suppliers are now able to suspend. */ if (dev->power.links_count > 0) { spin_unlock_irq(&dev->power.lock); rpm_suspend_suppliers(dev); spin_lock_irq(&dev->power.lock); } out: trace_rpm_return_int(dev, _THIS_IP_, retval); return retval; fail: dev_pm_disable_wake_irq_check(dev, true); __update_runtime_status(dev, RPM_ACTIVE); dev->power.deferred_resume = false; wake_up_all(&dev->power.wait_queue); /* * On transient errors, if the callback routine failed an autosuspend, * and if the last_busy time has been updated so that there is a new * autosuspend expiration time, automatically reschedule another * autosuspend. */ if (!dev->power.runtime_error && (rpmflags & RPM_AUTO) && pm_runtime_autosuspend_expiration(dev) != 0) goto repeat; pm_runtime_cancel_pending(dev); goto out; } /** * rpm_resume - Carry out runtime resume of given device. * @dev: Device to resume. * @rpmflags: Flag bits. * * Check if the device's runtime PM status allows it to be resumed. Cancel * any scheduled or pending requests. If another resume has been started * earlier, either return immediately or wait for it to finish, depending on the * RPM_NOWAIT and RPM_ASYNC flags. Similarly, if there's a suspend running in * parallel with this function, either tell the other process to resume after * suspending (deferred_resume) or wait for it to finish. If the RPM_ASYNC * flag is set then queue a resume request; otherwise run the * ->runtime_resume() callback directly. Queue an idle notification for the * device if the resume succeeded. * * This function must be called under dev->power.lock with interrupts disabled. 
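 * Return: 1 if the device was already active (or could be marked active
 * without running any callbacks), 0 on success, or a negative error code
 * on failure.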
*/ static int rpm_resume(struct device *dev, int rpmflags) __releases(&dev->power.lock) __acquires(&dev->power.lock) { int (*callback)(struct device *); struct device *parent = NULL; int retval = 0; trace_rpm_resume(dev, rpmflags); repeat: if (dev->power.runtime_error) { retval = -EINVAL; } else if (dev->power.disable_depth > 0) { if (dev->power.runtime_status == RPM_ACTIVE && dev->power.last_status == RPM_ACTIVE) retval = 1; else if (rpmflags & RPM_TRANSPARENT) goto out; else retval = -EACCES; } if (retval) goto out; /* * Other scheduled or pending requests need to be canceled. Small * optimization: If an autosuspend timer is running, leave it running * rather than cancelling it now only to restart it again in the near * future. */ dev->power.request = RPM_REQ_NONE; if (!dev->power.timer_autosuspends) pm_runtime_deactivate_timer(dev); if (dev->power.runtime_status == RPM_ACTIVE) { retval = 1; goto out; } if (dev->power.runtime_status == RPM_RESUMING || dev->power.runtime_status == RPM_SUSPENDING) { DEFINE_WAIT(wait); if (rpmflags & (RPM_ASYNC | RPM_NOWAIT)) { if (dev->power.runtime_status == RPM_SUSPENDING) { dev->power.deferred_resume = true; if (rpmflags & RPM_NOWAIT) retval = -EINPROGRESS; } else { retval = -EINPROGRESS; } goto out; } if (dev->power.irq_safe) { spin_unlock(&dev->power.lock); cpu_relax(); spin_lock(&dev->power.lock); goto repeat; } /* Wait for the operation carried out in parallel with us. */ for (;;) { prepare_to_wait(&dev->power.wait_queue, &wait, TASK_UNINTERRUPTIBLE); if (dev->power.runtime_status != RPM_RESUMING && dev->power.runtime_status != RPM_SUSPENDING) break; spin_unlock_irq(&dev->power.lock); schedule(); spin_lock_irq(&dev->power.lock); } finish_wait(&dev->power.wait_queue, &wait); goto repeat; } /* * See if we can skip waking up the parent. This is safe only if * power.no_callbacks is set, because otherwise we don't know whether * the resume will actually succeed. */ if (dev->power.no_callbacks && !parent && dev->parent) { spin_lock_nested(&dev->parent->power.lock, SINGLE_DEPTH_NESTING); if (dev->parent->power.disable_depth > 0 || dev->parent->power.ignore_children || dev->parent->power.runtime_status == RPM_ACTIVE) { atomic_inc(&dev->parent->power.child_count); spin_unlock(&dev->parent->power.lock); retval = 1; goto no_callback; /* Assume success. */ } spin_unlock(&dev->parent->power.lock); } /* Carry out an asynchronous or a synchronous resume. */ if (rpmflags & RPM_ASYNC) { dev->power.request = RPM_REQ_RESUME; if (!dev->power.request_pending) { dev->power.request_pending = true; queue_work(pm_wq, &dev->power.work); } retval = 0; goto out; } if (!parent && dev->parent) { /* * Increment the parent's usage counter and resume it if * necessary. Not needed if dev is irq-safe; then the * parent is permanently resumed. */ parent = dev->parent; if (dev->power.irq_safe) goto skip_parent; spin_unlock(&dev->power.lock); pm_runtime_get_noresume(parent); spin_lock(&parent->power.lock); /* * Resume the parent if it has runtime PM enabled and not been * set to ignore its children. */ if (!parent->power.disable_depth && !parent->power.ignore_children) { rpm_resume(parent, 0); if (parent->power.runtime_status != RPM_ACTIVE) retval = -EBUSY; } spin_unlock(&parent->power.lock); spin_lock(&dev->power.lock); if (retval) goto out; goto repeat; } skip_parent: if (dev->power.no_callbacks) goto no_callback; /* Assume success. 
*/ __update_runtime_status(dev, RPM_RESUMING); callback = RPM_GET_CALLBACK(dev, runtime_resume); dev_pm_disable_wake_irq_check(dev, false); retval = rpm_callback(callback, dev); if (retval) { __update_runtime_status(dev, RPM_SUSPENDED); pm_runtime_cancel_pending(dev); dev_pm_enable_wake_irq_check(dev, false); } else { no_callback: __update_runtime_status(dev, RPM_ACTIVE); pm_runtime_mark_last_busy(dev); if (parent) atomic_inc(&parent->power.child_count); } wake_up_all(&dev->power.wait_queue); if (retval >= 0) rpm_idle(dev, RPM_ASYNC); out: if (parent && !dev->power.irq_safe) { spin_unlock_irq(&dev->power.lock); pm_runtime_put(parent); spin_lock_irq(&dev->power.lock); } trace_rpm_return_int(dev, _THIS_IP_, retval); return retval; } /** * pm_runtime_work - Universal runtime PM work function. * @work: Work structure used for scheduling the execution of this function. * * Use @work to get the device object the work is to be done for, determine what * is to be done and execute the appropriate runtime PM function. */ static void pm_runtime_work(struct work_struct *work) { struct device *dev = container_of(work, struct device, power.work); enum rpm_request req; spin_lock_irq(&dev->power.lock); if (!dev->power.request_pending) goto out; req = dev->power.request; dev->power.request = RPM_REQ_NONE; dev->power.request_pending = false; switch (req) { case RPM_REQ_NONE: break; case RPM_REQ_IDLE: rpm_idle(dev, RPM_NOWAIT); break; case RPM_REQ_SUSPEND: rpm_suspend(dev, RPM_NOWAIT); break; case RPM_REQ_AUTOSUSPEND: rpm_suspend(dev, RPM_NOWAIT | RPM_AUTO); break; case RPM_REQ_RESUME: rpm_resume(dev, RPM_NOWAIT); break; } out: spin_unlock_irq(&dev->power.lock); } /** * pm_suspend_timer_fn - Timer function for pm_schedule_suspend(). * @timer: hrtimer used by pm_schedule_suspend(). * * Check if the time is right and queue a suspend request. */ static enum hrtimer_restart pm_suspend_timer_fn(struct hrtimer *timer) { struct device *dev = container_of(timer, struct device, power.suspend_timer); unsigned long flags; u64 expires; spin_lock_irqsave(&dev->power.lock, flags); expires = dev->power.timer_expires; /* * If 'expires' is after the current time, we've been called * too early. */ if (expires > 0 && expires <= ktime_get_mono_fast_ns()) { dev->power.timer_expires = 0; rpm_suspend(dev, dev->power.timer_autosuspends ? (RPM_ASYNC | RPM_AUTO) : RPM_ASYNC); } spin_unlock_irqrestore(&dev->power.lock, flags); return HRTIMER_NORESTART; } /** * pm_schedule_suspend - Set up a timer to submit a suspend request in future. * @dev: Device to suspend. * @delay: Time to wait before submitting a suspend request, in milliseconds. */ int pm_schedule_suspend(struct device *dev, unsigned int delay) { unsigned long flags; u64 expires; int retval; spin_lock_irqsave(&dev->power.lock, flags); if (!delay) { retval = rpm_suspend(dev, RPM_ASYNC); goto out; } retval = rpm_check_suspend_allowed(dev); if (retval) goto out; /* Other scheduled or pending requests need to be canceled. 
*/ pm_runtime_cancel_pending(dev); expires = ktime_get_mono_fast_ns() + (u64)delay * NSEC_PER_MSEC; dev->power.timer_expires = expires; dev->power.timer_autosuspends = 0; hrtimer_start(&dev->power.suspend_timer, expires, HRTIMER_MODE_ABS); out: spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } EXPORT_SYMBOL_GPL(pm_schedule_suspend); static int rpm_drop_usage_count(struct device *dev) { int ret; ret = atomic_sub_return(1, &dev->power.usage_count); if (ret >= 0) return ret; /* * Because rpm_resume() does not check the usage counter, it will resume * the device even if the usage counter is 0 or negative, so it is * sufficient to increment the usage counter here to reverse the change * made above. */ atomic_inc(&dev->power.usage_count); dev_warn(dev, "Runtime PM usage count underflow!\n"); return -EINVAL; } /** * __pm_runtime_idle - Entry point for runtime idle operations. * @dev: Device to send idle notification for. * @rpmflags: Flag bits. * * If the RPM_GET_PUT flag is set, decrement the device's usage count and * return immediately if it is larger than zero (if it becomes negative, log a * warning, increment it, and return an error). Then carry out an idle * notification, either synchronous or asynchronous. * * This routine may be called in atomic context if the RPM_ASYNC flag is set, * or if pm_runtime_irq_safe() has been called. */ int __pm_runtime_idle(struct device *dev, int rpmflags) { unsigned long flags; int retval; if (rpmflags & RPM_GET_PUT) { retval = rpm_drop_usage_count(dev); if (retval < 0) { return retval; } else if (retval > 0) { trace_rpm_usage(dev, rpmflags); return 0; } } might_sleep_if(!(rpmflags & RPM_ASYNC) && !dev->power.irq_safe); spin_lock_irqsave(&dev->power.lock, flags); retval = rpm_idle(dev, rpmflags); spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } EXPORT_SYMBOL_GPL(__pm_runtime_idle); /** * __pm_runtime_suspend - Entry point for runtime put/suspend operations. * @dev: Device to suspend. * @rpmflags: Flag bits. * * If the RPM_GET_PUT flag is set, decrement the device's usage count and * return immediately if it is larger than zero (if it becomes negative, log a * warning, increment it, and return an error). Then carry out a suspend, * either synchronous or asynchronous. * * This routine may be called in atomic context if the RPM_ASYNC flag is set, * or if pm_runtime_irq_safe() has been called. */ int __pm_runtime_suspend(struct device *dev, int rpmflags) { unsigned long flags; int retval; if (rpmflags & RPM_GET_PUT) { retval = rpm_drop_usage_count(dev); if (retval < 0) { return retval; } else if (retval > 0) { trace_rpm_usage(dev, rpmflags); return 0; } } might_sleep_if(!(rpmflags & RPM_ASYNC) && !dev->power.irq_safe); spin_lock_irqsave(&dev->power.lock, flags); retval = rpm_suspend(dev, rpmflags); spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } EXPORT_SYMBOL_GPL(__pm_runtime_suspend); /** * __pm_runtime_resume - Entry point for runtime resume operations. * @dev: Device to resume. * @rpmflags: Flag bits. * * If the RPM_GET_PUT flag is set, increment the device's usage count. Then * carry out a resume, either synchronous or asynchronous. * * This routine may be called in atomic context if the RPM_ASYNC flag is set, * or if pm_runtime_irq_safe() has been called. 
*/ int __pm_runtime_resume(struct device *dev, int rpmflags) { unsigned long flags; int retval; might_sleep_if(!(rpmflags & RPM_ASYNC) && !dev->power.irq_safe && dev->power.runtime_status != RPM_ACTIVE); if (rpmflags & RPM_GET_PUT) atomic_inc(&dev->power.usage_count); spin_lock_irqsave(&dev->power.lock, flags); retval = rpm_resume(dev, rpmflags); spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } EXPORT_SYMBOL_GPL(__pm_runtime_resume); /** * pm_runtime_get_conditional - Conditionally bump up device usage counter. * @dev: Device to handle. * @ign_usage_count: Whether or not to look at the current usage counter value. * * Return -EINVAL if runtime PM is disabled for @dev. * * Otherwise, if its runtime PM status is %RPM_ACTIVE and (1) @ign_usage_count * is set, or (2) @dev is not ignoring children and its active child count is * nonzero, or (3) the runtime PM usage counter of @dev is not zero, increment * the usage counter of @dev and return 1. * * Otherwise, return 0 without changing the usage counter. * * If @ign_usage_count is %true, this function can be used to prevent suspending * the device when its runtime PM status is %RPM_ACTIVE. * * If @ign_usage_count is %false, this function can be used to prevent * suspending the device when both its runtime PM status is %RPM_ACTIVE and its * runtime PM usage counter is not zero. * * The caller is responsible for decrementing the runtime PM usage counter of * @dev after this function has returned a positive value for it. */ static int pm_runtime_get_conditional(struct device *dev, bool ign_usage_count) { unsigned long flags; int retval; spin_lock_irqsave(&dev->power.lock, flags); if (dev->power.disable_depth > 0) { retval = -EINVAL; } else if (dev->power.runtime_status != RPM_ACTIVE) { retval = 0; } else if (ign_usage_count || (!dev->power.ignore_children && atomic_read(&dev->power.child_count) > 0)) { retval = 1; atomic_inc(&dev->power.usage_count); } else { retval = atomic_inc_not_zero(&dev->power.usage_count); } trace_rpm_usage(dev, 0); spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } /** * pm_runtime_get_if_active - Bump up runtime PM usage counter if the device is * in active state * @dev: Target device. * * Increment the runtime PM usage counter of @dev if its runtime PM status is * %RPM_ACTIVE, in which case it returns 1. If the device is in a different * state, 0 is returned. -EINVAL is returned if runtime PM is disabled for the * device, in which case also the usage_count will remain unmodified. */ int pm_runtime_get_if_active(struct device *dev) { return pm_runtime_get_conditional(dev, true); } EXPORT_SYMBOL_GPL(pm_runtime_get_if_active); /** * pm_runtime_get_if_in_use - Conditionally bump up runtime PM usage counter. * @dev: Target device. * * Increment the runtime PM usage counter of @dev if its runtime PM status is * %RPM_ACTIVE and its runtime PM usage counter is greater than 0 or it is not * ignoring children and its active child count is nonzero. 1 is returned in * this case. * * If @dev is in a different state or it is not in use (that is, its usage * counter is 0, or it is ignoring children, or its active child count is 0), * 0 is returned. * * -EINVAL is returned if runtime PM is disabled for the device, in which case * also the usage counter of @dev is not updated. */ int pm_runtime_get_if_in_use(struct device *dev) { return pm_runtime_get_conditional(dev, false); } EXPORT_SYMBOL_GPL(pm_runtime_get_if_in_use); /** * __pm_runtime_set_status - Set runtime PM status of a device. 
* @dev: Device to handle. * @status: New runtime PM status of the device. * * If runtime PM of the device is disabled or its power.runtime_error field is * different from zero, the status may be changed either to RPM_ACTIVE, or to * RPM_SUSPENDED, as long as that reflects the actual state of the device. * However, if the device has a parent and the parent is not active, and the * parent's power.ignore_children flag is unset, the device's status cannot be * set to RPM_ACTIVE, so -EBUSY is returned in that case. * * If successful, __pm_runtime_set_status() clears the power.runtime_error field * and the device parent's counter of unsuspended children is modified to * reflect the new status. If the new status is RPM_SUSPENDED, an idle * notification request for the parent is submitted. * * If @dev has any suppliers (as reflected by device links to them), and @status * is RPM_ACTIVE, they will be activated upfront and if the activation of one * of them fails, the status of @dev will be changed to RPM_SUSPENDED (instead * of the @status value) and the suppliers will be deactivated on exit. The * error returned by the failing supplier activation will be returned in that * case. */ int __pm_runtime_set_status(struct device *dev, unsigned int status) { struct device *parent = dev->parent; bool notify_parent = false; unsigned long flags; int error = 0; if (status != RPM_ACTIVE && status != RPM_SUSPENDED) return -EINVAL; spin_lock_irqsave(&dev->power.lock, flags); /* * Prevent PM-runtime from being enabled for the device or return an * error if it is enabled already and working. */ if (dev->power.runtime_error || dev->power.disable_depth) dev->power.disable_depth++; else error = -EAGAIN; spin_unlock_irqrestore(&dev->power.lock, flags); if (error) return error; /* * If the new status is RPM_ACTIVE, the suppliers can be activated * upfront regardless of the current status, because next time * rpm_put_suppliers() runs, the rpm_active refcounts of the links * involved will be dropped down to one anyway. */ if (status == RPM_ACTIVE) { int idx = device_links_read_lock(); error = rpm_get_suppliers(dev); if (error) status = RPM_SUSPENDED; device_links_read_unlock(idx); } spin_lock_irqsave(&dev->power.lock, flags); if (dev->power.runtime_status == status || !parent) goto out_set; if (status == RPM_SUSPENDED) { atomic_add_unless(&parent->power.child_count, -1, 0); notify_parent = !parent->power.ignore_children; } else { spin_lock_nested(&parent->power.lock, SINGLE_DEPTH_NESTING); /* * It is invalid to put an active child under a parent that is * not active, has runtime PM enabled and the * 'power.ignore_children' flag unset.
*/ if (!parent->power.disable_depth && !parent->power.ignore_children && parent->power.runtime_status != RPM_ACTIVE) { dev_err(dev, "runtime PM trying to activate child device %s but parent (%s) is not active\n", dev_name(dev), dev_name(parent)); error = -EBUSY; } else if (dev->power.runtime_status == RPM_SUSPENDED) { atomic_inc(&parent->power.child_count); } spin_unlock(&parent->power.lock); if (error) { status = RPM_SUSPENDED; goto out; } } out_set: __update_runtime_status(dev, status); if (!error) dev->power.runtime_error = 0; out: spin_unlock_irqrestore(&dev->power.lock, flags); if (notify_parent) pm_request_idle(parent); if (status == RPM_SUSPENDED) { int idx = device_links_read_lock(); rpm_put_suppliers(dev); device_links_read_unlock(idx); } pm_runtime_enable(dev); return error; } EXPORT_SYMBOL_GPL(__pm_runtime_set_status); /** * __pm_runtime_barrier - Cancel pending requests and wait for completions. * @dev: Device to handle. * * Flush all pending requests for the device from pm_wq and wait for all * runtime PM operations involving the device in progress to complete. * * Should be called under dev->power.lock with interrupts disabled. */ static void __pm_runtime_barrier(struct device *dev) { pm_runtime_deactivate_timer(dev); if (dev->power.request_pending) { dev->power.request = RPM_REQ_NONE; spin_unlock_irq(&dev->power.lock); cancel_work_sync(&dev->power.work); spin_lock_irq(&dev->power.lock); dev->power.request_pending = false; } if (dev->power.runtime_status == RPM_SUSPENDING || dev->power.runtime_status == RPM_RESUMING || dev->power.idle_notification) { DEFINE_WAIT(wait); /* Suspend, wake-up or idle notification in progress. */ for (;;) { prepare_to_wait(&dev->power.wait_queue, &wait, TASK_UNINTERRUPTIBLE); if (dev->power.runtime_status != RPM_SUSPENDING && dev->power.runtime_status != RPM_RESUMING && !dev->power.idle_notification) break; spin_unlock_irq(&dev->power.lock); schedule(); spin_lock_irq(&dev->power.lock); } finish_wait(&dev->power.wait_queue, &wait); } } /** * pm_runtime_barrier - Flush pending requests and wait for completions. * @dev: Device to handle. * * Prevent the device from being suspended by incrementing its usage counter and * if there's a pending resume request for the device, wake the device up. * Next, make sure that all pending requests for the device have been flushed * from pm_wq and wait for all runtime PM operations involving the device in * progress to complete. 
*/ void pm_runtime_barrier(struct device *dev) { pm_runtime_get_noresume(dev); spin_lock_irq(&dev->power.lock); if (dev->power.request_pending && dev->power.request == RPM_REQ_RESUME) rpm_resume(dev, 0); __pm_runtime_barrier(dev); spin_unlock_irq(&dev->power.lock); pm_runtime_put_noidle(dev); } EXPORT_SYMBOL_GPL(pm_runtime_barrier); bool pm_runtime_block_if_disabled(struct device *dev) { bool ret; spin_lock_irq(&dev->power.lock); ret = !pm_runtime_enabled(dev); if (ret && dev->power.last_status == RPM_INVALID) dev->power.last_status = RPM_BLOCKED; spin_unlock_irq(&dev->power.lock); return ret; } void pm_runtime_unblock(struct device *dev) { spin_lock_irq(&dev->power.lock); if (dev->power.last_status == RPM_BLOCKED) dev->power.last_status = RPM_INVALID; spin_unlock_irq(&dev->power.lock); } void __pm_runtime_disable(struct device *dev, bool check_resume) { spin_lock_irq(&dev->power.lock); if (dev->power.disable_depth > 0) { dev->power.disable_depth++; goto out; } /* * Wake up the device if there's a resume request pending, because that * means there probably is some I/O to process and disabling runtime PM * shouldn't prevent the device from processing the I/O. */ if (check_resume && dev->power.request_pending && dev->power.request == RPM_REQ_RESUME) { /* * Prevent suspends and idle notifications from being carried * out after we have woken up the device. */ pm_runtime_get_noresume(dev); rpm_resume(dev, 0); pm_runtime_put_noidle(dev); } /* Update time accounting before disabling PM-runtime. */ update_pm_runtime_accounting(dev); if (!dev->power.disable_depth++) { __pm_runtime_barrier(dev); dev->power.last_status = dev->power.runtime_status; } out: spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(__pm_runtime_disable); /** * pm_runtime_enable - Enable runtime PM of a device. * @dev: Device to handle. */ void pm_runtime_enable(struct device *dev) { unsigned long flags; spin_lock_irqsave(&dev->power.lock, flags); if (!dev->power.disable_depth) { dev_warn(dev, "Unbalanced %s!\n", __func__); goto out; } if (--dev->power.disable_depth > 0) goto out; if (dev->power.last_status == RPM_BLOCKED) { dev_warn(dev, "Attempt to enable runtime PM when it is blocked\n"); dump_stack(); } dev->power.last_status = RPM_INVALID; dev->power.accounting_timestamp = ktime_get_mono_fast_ns(); if (dev->power.runtime_status == RPM_SUSPENDED && !dev->power.ignore_children && atomic_read(&dev->power.child_count) > 0) dev_warn(dev, "Enabling runtime PM for inactive device with active children\n"); out: spin_unlock_irqrestore(&dev->power.lock, flags); } EXPORT_SYMBOL_GPL(pm_runtime_enable); static void pm_runtime_set_suspended_action(void *data) { pm_runtime_set_suspended(data); } /** * devm_pm_runtime_set_active_enabled - set_active version of devm_pm_runtime_enable. * * @dev: Device to handle. */ int devm_pm_runtime_set_active_enabled(struct device *dev) { int err; err = pm_runtime_set_active(dev); if (err) return err; err = devm_add_action_or_reset(dev, pm_runtime_set_suspended_action, dev); if (err) return err; return devm_pm_runtime_enable(dev); } EXPORT_SYMBOL_GPL(devm_pm_runtime_set_active_enabled); static void pm_runtime_disable_action(void *data) { pm_runtime_dont_use_autosuspend(data); pm_runtime_disable(data); } /** * devm_pm_runtime_enable - devres-enabled version of pm_runtime_enable. * * NOTE: this will also handle calling pm_runtime_dont_use_autosuspend() for * you at driver exit time if needed. * * @dev: Device to handle. 
*/ int devm_pm_runtime_enable(struct device *dev) { pm_runtime_enable(dev); return devm_add_action_or_reset(dev, pm_runtime_disable_action, dev); } EXPORT_SYMBOL_GPL(devm_pm_runtime_enable); static void pm_runtime_put_noidle_action(void *data) { pm_runtime_put_noidle(data); } /** * devm_pm_runtime_get_noresume - devres-enabled version of pm_runtime_get_noresume. * * @dev: Device to handle. */ int devm_pm_runtime_get_noresume(struct device *dev) { pm_runtime_get_noresume(dev); return devm_add_action_or_reset(dev, pm_runtime_put_noidle_action, dev); } EXPORT_SYMBOL_GPL(devm_pm_runtime_get_noresume); /** * pm_runtime_forbid - Block runtime PM of a device. * @dev: Device to handle. * * Resume @dev if already suspended and block runtime suspend of @dev in such * a way that it can be unblocked via the /sys/devices/.../power/control * interface, or otherwise by calling pm_runtime_allow(). * * Calling this function many times in a row has the same effect as calling it * once. */ void pm_runtime_forbid(struct device *dev) { spin_lock_irq(&dev->power.lock); if (!dev->power.runtime_auto) goto out; dev->power.runtime_auto = false; atomic_inc(&dev->power.usage_count); rpm_resume(dev, 0); out: spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(pm_runtime_forbid); /** * pm_runtime_allow - Unblock runtime PM of a device. * @dev: Device to handle. * * Unblock runtime suspend of @dev after it has been blocked by * pm_runtime_forbid() (for instance, if it has been blocked via the * /sys/devices/.../power/control interface), check if @dev can be * suspended and suspend it in that case. * * Calling this function many times in a row has the same effect as calling it * once. */ void pm_runtime_allow(struct device *dev) { int ret; spin_lock_irq(&dev->power.lock); if (dev->power.runtime_auto) goto out; dev->power.runtime_auto = true; ret = rpm_drop_usage_count(dev); if (ret == 0) rpm_idle(dev, RPM_AUTO | RPM_ASYNC); else if (ret > 0) trace_rpm_usage(dev, RPM_AUTO | RPM_ASYNC); out: spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(pm_runtime_allow); /** * pm_runtime_no_callbacks - Ignore runtime PM callbacks for a device. * @dev: Device to handle. * * Set the power.no_callbacks flag, which tells the PM core that this * device is power-managed through its parent and has no runtime PM * callbacks of its own. The runtime sysfs attributes will be removed. */ void pm_runtime_no_callbacks(struct device *dev) { spin_lock_irq(&dev->power.lock); dev->power.no_callbacks = 1; spin_unlock_irq(&dev->power.lock); if (device_is_registered(dev)) rpm_sysfs_remove(dev); } EXPORT_SYMBOL_GPL(pm_runtime_no_callbacks); /** * pm_runtime_irq_safe - Leave interrupts disabled during callbacks. * @dev: Device to handle * * Set the power.irq_safe flag, which tells the PM core that the * ->runtime_suspend() and ->runtime_resume() callbacks for this device should * always be invoked with the spinlock held and interrupts disabled. It also * causes the parent's usage counter to be permanently incremented, preventing * the parent from runtime suspending -- otherwise an irq-safe child might have * to wait for a non-irq-safe parent. */ void pm_runtime_irq_safe(struct device *dev) { if (dev->parent) pm_runtime_get_sync(dev->parent); spin_lock_irq(&dev->power.lock); dev->power.irq_safe = 1; spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(pm_runtime_irq_safe); /** * update_autosuspend - Handle a change to a device's autosuspend settings. * @dev: Device to handle. * @old_delay: The former autosuspend_delay value. 
* @old_use: The former use_autosuspend value. * * Prevent runtime suspend if the new delay is negative and use_autosuspend is * set; otherwise allow it. Send an idle notification if suspends are allowed. * * This function must be called under dev->power.lock with interrupts disabled. */ static void update_autosuspend(struct device *dev, int old_delay, int old_use) { int delay = dev->power.autosuspend_delay; /* Should runtime suspend be prevented now? */ if (dev->power.use_autosuspend && delay < 0) { /* If it used to be allowed then prevent it. */ if (!old_use || old_delay >= 0) { atomic_inc(&dev->power.usage_count); rpm_resume(dev, 0); } else { trace_rpm_usage(dev, 0); } } /* Runtime suspend should be allowed now. */ else { /* If it used to be prevented then allow it. */ if (old_use && old_delay < 0) atomic_dec(&dev->power.usage_count); /* Maybe we can autosuspend now. */ rpm_idle(dev, RPM_AUTO); } } /** * pm_runtime_set_autosuspend_delay - Set a device's autosuspend_delay value. * @dev: Device to handle. * @delay: Value of the new delay in milliseconds. * * Set the device's power.autosuspend_delay value. If it changes to negative * and the power.use_autosuspend flag is set, prevent runtime suspends. If it * changes the other way, allow runtime suspends. */ void pm_runtime_set_autosuspend_delay(struct device *dev, int delay) { int old_delay, old_use; spin_lock_irq(&dev->power.lock); old_delay = dev->power.autosuspend_delay; old_use = dev->power.use_autosuspend; dev->power.autosuspend_delay = delay; update_autosuspend(dev, old_delay, old_use); spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(pm_runtime_set_autosuspend_delay); /** * __pm_runtime_use_autosuspend - Set a device's use_autosuspend flag. * @dev: Device to handle. * @use: New value for use_autosuspend. * * Set the device's power.use_autosuspend flag, and allow or prevent runtime * suspends as needed. */ void __pm_runtime_use_autosuspend(struct device *dev, bool use) { int old_delay, old_use; spin_lock_irq(&dev->power.lock); old_delay = dev->power.autosuspend_delay; old_use = dev->power.use_autosuspend; dev->power.use_autosuspend = use; update_autosuspend(dev, old_delay, old_use); spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(__pm_runtime_use_autosuspend); /** * pm_runtime_init - Initialize runtime PM fields in given device object. * @dev: Device object to initialize. */ void pm_runtime_init(struct device *dev) { dev->power.runtime_status = RPM_SUSPENDED; dev->power.last_status = RPM_INVALID; dev->power.idle_notification = false; dev->power.disable_depth = 1; atomic_set(&dev->power.usage_count, 0); dev->power.runtime_error = 0; atomic_set(&dev->power.child_count, 0); pm_suspend_ignore_children(dev, false); dev->power.runtime_auto = true; dev->power.request_pending = false; dev->power.request = RPM_REQ_NONE; dev->power.deferred_resume = false; dev->power.needs_force_resume = false; INIT_WORK(&dev->power.work, pm_runtime_work); dev->power.timer_expires = 0; hrtimer_setup(&dev->power.suspend_timer, pm_suspend_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); init_waitqueue_head(&dev->power.wait_queue); } /** * pm_runtime_reinit - Re-initialize runtime PM fields in given device object. * @dev: Device object to re-initialize. 
*/ void pm_runtime_reinit(struct device *dev) { if (pm_runtime_enabled(dev)) return; if (dev->power.runtime_status == RPM_ACTIVE) pm_runtime_set_suspended(dev); if (dev->power.irq_safe) { spin_lock_irq(&dev->power.lock); dev->power.irq_safe = 0; spin_unlock_irq(&dev->power.lock); if (dev->parent) pm_runtime_put(dev->parent); } /* * Clear power.needs_force_resume in case it has been set by * pm_runtime_force_suspend() invoked from a driver remove callback. */ dev->power.needs_force_resume = false; } /** * pm_runtime_remove - Prepare for removing a device from device hierarchy. * @dev: Device object being removed from device hierarchy. */ void pm_runtime_remove(struct device *dev) { __pm_runtime_disable(dev, false); flush_work(&dev->power.work); pm_runtime_reinit(dev); } /** * pm_runtime_get_suppliers - Resume and reference-count supplier devices. * @dev: Consumer device. */ void pm_runtime_get_suppliers(struct device *dev) { struct device_link *link; int idx; idx = device_links_read_lock(); dev_for_each_link_to_supplier(link, dev) if (device_link_test(link, DL_FLAG_PM_RUNTIME)) { link->supplier_preactivated = true; pm_runtime_get_sync(link->supplier); } device_links_read_unlock(idx); } /** * pm_runtime_put_suppliers - Drop references to supplier devices. * @dev: Consumer device. */ void pm_runtime_put_suppliers(struct device *dev) { struct device_link *link; int idx; idx = device_links_read_lock(); list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) if (link->supplier_preactivated) { link->supplier_preactivated = false; pm_runtime_put(link->supplier); } device_links_read_unlock(idx); } void pm_runtime_new_link(struct device *dev) { spin_lock_irq(&dev->power.lock); dev->power.links_count++; spin_unlock_irq(&dev->power.lock); } static void pm_runtime_drop_link_count(struct device *dev) { spin_lock_irq(&dev->power.lock); WARN_ON(dev->power.links_count == 0); dev->power.links_count--; spin_unlock_irq(&dev->power.lock); } /** * pm_runtime_drop_link - Prepare for device link removal. * @link: Device link going away. * * Drop the link count of the consumer end of @link and decrement the supplier * device's runtime PM usage counter as many times as needed to drop all of the * PM runtime reference to it from the consumer. */ void pm_runtime_drop_link(struct device_link *link) { if (!device_link_test(link, DL_FLAG_PM_RUNTIME)) return; pm_runtime_drop_link_count(link->consumer); pm_runtime_release_supplier(link); pm_request_idle(link->supplier); } static pm_callback_t get_callback(struct device *dev, size_t cb_offset) { /* * Setting power.strict_midlayer means that the middle layer * code does not want its runtime PM callbacks to be invoked via * pm_runtime_force_suspend() and pm_runtime_force_resume(), so * return a direct pointer to the driver callback in that case. */ if (dev_pm_strict_midlayer_is_set(dev)) return __rpm_get_driver_callback(dev, cb_offset); return __rpm_get_callback(dev, cb_offset); } #define GET_CALLBACK(dev, callback) \ get_callback(dev, offsetof(struct dev_pm_ops, callback)) /** * pm_runtime_force_suspend - Force a device into suspend state if needed. * @dev: Device to suspend. * * Disable runtime PM so we safely can check the device's runtime PM status and * if it is active, invoke its ->runtime_suspend callback to suspend it and * change its runtime PM status field to RPM_SUSPENDED. 
Also, if the device's * usage and children counters don't indicate that the device was in use before * the system-wide transition under way, decrement its parent's children counter * (if there is a parent). Keep runtime PM disabled to preserve the state * unless we encounter errors. * * Typically this function may be invoked from a system suspend callback to make * sure the device is put into low power state and it should only be used during * system-wide PM transitions to sleep states. It assumes that the analogous * pm_runtime_force_resume() will be used to resume the device. */ int pm_runtime_force_suspend(struct device *dev) { int (*callback)(struct device *); int ret; pm_runtime_disable(dev); if (pm_runtime_status_suspended(dev) || dev->power.needs_force_resume) return 0; callback = GET_CALLBACK(dev, runtime_suspend); dev_pm_enable_wake_irq_check(dev, true); ret = callback ? callback(dev) : 0; if (ret) goto err; dev_pm_enable_wake_irq_complete(dev); /* * If the device can stay in suspend after the system-wide transition * to the working state that will follow, drop the children counter of * its parent and the usage counters of its suppliers. Otherwise, set * power.needs_force_resume to let pm_runtime_force_resume() know that * the device needs to be taken care of and to prevent this function * from handling the device again in case the device is passed to it * once more subsequently. */ if (pm_runtime_need_not_resume(dev)) pm_runtime_set_suspended(dev); else dev->power.needs_force_resume = true; return 0; err: dev_pm_disable_wake_irq_check(dev, true); pm_runtime_enable(dev); return ret; } EXPORT_SYMBOL_GPL(pm_runtime_force_suspend); #ifdef CONFIG_PM_SLEEP /** * pm_runtime_force_resume - Force a device into resume state if needed. * @dev: Device to resume. * * This function expects that either pm_runtime_force_suspend() has put the * device into a low-power state prior to calling it, or the device had been * runtime-suspended before the preceding system-wide suspend transition and it * was left in suspend during that transition. * * The actions carried out by pm_runtime_force_suspend(), or by a runtime * suspend in general, are reversed and the device is brought back into full * power if it is expected to be used on system resume, which is the case when * its needs_force_resume flag is set or when its smart_suspend flag is set and * its runtime PM status is "active". * * In other cases, the resume is deferred to be managed via runtime PM. * * Typically, this function may be invoked from a system resume callback. */ int pm_runtime_force_resume(struct device *dev) { int (*callback)(struct device *); int ret = 0; if (!dev->power.needs_force_resume && (!dev_pm_smart_suspend(dev) || pm_runtime_status_suspended(dev))) goto out; callback = GET_CALLBACK(dev, runtime_resume); dev_pm_disable_wake_irq_check(dev, false); ret = callback ? callback(dev) : 0; if (ret) { pm_runtime_set_suspended(dev); dev_pm_enable_wake_irq_check(dev, false); goto out; } pm_runtime_mark_last_busy(dev); out: /* * The smart_suspend flag can be cleared here because it is not going * to be necessary until the next system-wide suspend transition that * will update it again. */ dev->power.smart_suspend = false; /* * Also clear needs_force_resume to make this function skip devices that * have been seen by it once. 
*/ dev->power.needs_force_resume = false; pm_runtime_enable(dev); return ret; } EXPORT_SYMBOL_GPL(pm_runtime_force_resume); bool pm_runtime_need_not_resume(struct device *dev) { return atomic_read(&dev->power.usage_count) <= 1 && (atomic_read(&dev->power.child_count) == 0 || dev->power.ignore_children); } #endif /* CONFIG_PM_SLEEP */
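/*
 * A minimal, hypothetical driver-side sketch of how the runtime PM API
 * implemented above is typically consumed: autosuspend with a delay, a
 * get/put pair around hardware access, and reuse of
 * pm_runtime_force_suspend()/pm_runtime_force_resume() as the system
 * sleep callbacks, as their kernel-doc above suggests. The foo_* names
 * and the 50 ms delay are made up for illustration; this block is not
 * part of runtime.c.
 */
#include <linux/platform_device.h>
#include <linux/pm_runtime.h>

static int foo_runtime_suspend(struct device *dev)
{
	/* Put the hardware into its low-power state here. */
	return 0;
}

static int foo_runtime_resume(struct device *dev)
{
	/* Bring the hardware back to full power here. */
	return 0;
}

static int foo_probe(struct platform_device *pdev)
{
	struct device *dev = &pdev->dev;

	/* Suspend automatically 50 ms after the last pm_runtime_put_autosuspend(). */
	pm_runtime_set_autosuspend_delay(dev, 50);
	pm_runtime_use_autosuspend(dev);

	/*
	 * The hardware is powered up at probe time in this example, so mark
	 * the device active and enable runtime PM; the devres actions undo
	 * both when the driver is unbound.
	 */
	return devm_pm_runtime_set_active_enabled(dev);
}

static int foo_do_io(struct device *dev)
{
	int ret;

	/* Resume synchronously before touching the hardware. */
	ret = pm_runtime_resume_and_get(dev);
	if (ret < 0)
		return ret;

	/* ... hardware access ... */

	/* Drop the reference and (re)arm the autosuspend timer. */
	pm_runtime_mark_last_busy(dev);
	pm_runtime_put_autosuspend(dev);

	return 0;
}

static const struct dev_pm_ops foo_pm_ops = {
	SET_RUNTIME_PM_OPS(foo_runtime_suspend, foo_runtime_resume, NULL)
	/* Reuse the runtime PM path for system-wide suspend/resume. */
	SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend,
				pm_runtime_force_resume)
};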
// SPDX-License-Identifier: GPL-2.0-or-later /* * Multicast support for IPv6 * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on linux/ipv4/igmp.c and linux/ipv4/ip_sockglue.c */ /* Changes: * * yoshfuji : fix format of router-alert option * YOSHIFUJI Hideaki @USAGI: * Fixed source address for MLD message based on * <draft-ietf-magma-mld-source-05.txt>. * YOSHIFUJI Hideaki @USAGI: * - Ignore Queries for invalid addresses. * - MLD for link-local addresses. * David L Stevens <dlstevens@us.ibm.com>: * - MLDv2 support */ #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/jiffies.h> #include <linux/net.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_addr.h> #include <linux/if_arp.h> #include <linux/route.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/pkt_sched.h> #include <net/mld.h> #include <linux/workqueue.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/if_inet6.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/ip6_route.h> #include <net/inet_common.h> #include <net/ip6_checksum.h> /* Ensure that we have struct in6_addr aligned on 32bit word.
*/ static int __mld2_query_bugs[] __attribute__((__unused__)) = { BUILD_BUG_ON_ZERO(offsetof(struct mld2_query, mld2q_srcs) % 4), BUILD_BUG_ON_ZERO(offsetof(struct mld2_report, mld2r_grec) % 4), BUILD_BUG_ON_ZERO(offsetof(struct mld2_grec, grec_mca) % 4) }; static struct workqueue_struct *mld_wq; static struct in6_addr mld2_all_mcr = MLD2_ALL_MCR_INIT; static void igmp6_join_group(struct ifmcaddr6 *ma); static void igmp6_leave_group(struct ifmcaddr6 *ma); static void mld_mca_work(struct work_struct *work); static void mld_ifc_event(struct inet6_dev *idev); static bool mld_in_v1_mode(const struct inet6_dev *idev); static int sf_setstate(struct ifmcaddr6 *pmc); static void sf_markstate(struct ifmcaddr6 *pmc); static void ip6_mc_clear_src(struct ifmcaddr6 *pmc); static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta); static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta); static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, struct inet6_dev *idev); static int __ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr, unsigned int mode); #define MLD_QRV_DEFAULT 2 /* RFC3810, 9.2. Query Interval */ #define MLD_QI_DEFAULT (125 * HZ) /* RFC3810, 9.3. Query Response Interval */ #define MLD_QRI_DEFAULT (10 * HZ) /* RFC3810, 8.1 Query Version Distinctions */ #define MLD_V1_QUERY_LEN 24 #define MLD_V2_QUERY_LEN_MIN 28 #define IPV6_MLD_MAX_MSF 64 int sysctl_mld_max_msf __read_mostly = IPV6_MLD_MAX_MSF; int sysctl_mld_qrv __read_mostly = MLD_QRV_DEFAULT; #define mc_assert_locked(idev) \ lockdep_assert_held(&(idev)->mc_lock) #define mc_dereference(e, idev) \ rcu_dereference_protected(e, lockdep_is_held(&(idev)->mc_lock)) #define sock_dereference(e, sk) \ rcu_dereference_protected(e, lockdep_sock_is_held(sk)) #define for_each_pmc_socklock(np, sk, pmc) \ for (pmc = sock_dereference((np)->ipv6_mc_list, sk); \ pmc; \ pmc = sock_dereference(pmc->next, sk)) #define for_each_pmc_rcu(np, pmc) \ for (pmc = rcu_dereference((np)->ipv6_mc_list); \ pmc; \ pmc = rcu_dereference(pmc->next)) #define for_each_psf_mclock(mc, psf) \ for (psf = mc_dereference((mc)->mca_sources, mc->idev); \ psf; \ psf = mc_dereference(psf->sf_next, mc->idev)) #define for_each_psf_rcu(mc, psf) \ for (psf = rcu_dereference((mc)->mca_sources); \ psf; \ psf = rcu_dereference(psf->sf_next)) #define for_each_psf_tomb(mc, psf) \ for (psf = mc_dereference((mc)->mca_tomb, mc->idev); \ psf; \ psf = mc_dereference(psf->sf_next, mc->idev)) #define for_each_mc_mclock(idev, mc) \ for (mc = mc_dereference((idev)->mc_list, idev); \ mc; \ mc = mc_dereference(mc->next, idev)) #define for_each_mc_rcu(idev, mc) \ for (mc = rcu_dereference((idev)->mc_list); \ mc; \ mc = rcu_dereference(mc->next)) #define for_each_mc_tomb(idev, mc) \ for (mc = mc_dereference((idev)->mc_tomb, idev); \ mc; \ mc = mc_dereference(mc->next, idev)) static int unsolicited_report_interval(struct inet6_dev *idev) { int iv; if (mld_in_v1_mode(idev)) iv = READ_ONCE(idev->cnf.mldv1_unsolicited_report_interval); else iv = READ_ONCE(idev->cnf.mldv2_unsolicited_report_interval); return iv > 0 ? 
iv : 1; } static struct net_device *ip6_mc_find_dev(struct net *net, const struct in6_addr *group, int ifindex) { struct net_device *dev = NULL; struct rt6_info *rt; if (ifindex == 0) { rcu_read_lock(); rt = rt6_lookup(net, group, NULL, 0, NULL, 0); if (rt) { dev = dst_dev_rcu(&rt->dst); dev_hold(dev); ip6_rt_put(rt); } rcu_read_unlock(); } else { dev = dev_get_by_index(net, ifindex); } return dev; } /* * socket join on multicast group */ static int __ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr, unsigned int mode) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_mc_socklist *mc_lst; struct net *net = sock_net(sk); struct net_device *dev = NULL; int err; if (!ipv6_addr_is_multicast(addr)) return -EINVAL; for_each_pmc_socklock(np, sk, mc_lst) { if ((ifindex == 0 || mc_lst->ifindex == ifindex) && ipv6_addr_equal(&mc_lst->addr, addr)) return -EADDRINUSE; } mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL); if (!mc_lst) return -ENOMEM; mc_lst->next = NULL; mc_lst->addr = *addr; dev = ip6_mc_find_dev(net, addr, ifindex); if (!dev) { sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return -ENODEV; } mc_lst->ifindex = dev->ifindex; mc_lst->sfmode = mode; RCU_INIT_POINTER(mc_lst->sflist, NULL); /* now add/increase the group membership on the device */ err = __ipv6_dev_mc_inc(dev, addr, mode); dev_put(dev); if (err) { sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return err; } mc_lst->next = np->ipv6_mc_list; rcu_assign_pointer(np->ipv6_mc_list, mc_lst); return 0; } int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) { return __ipv6_sock_mc_join(sk, ifindex, addr, MCAST_EXCLUDE); } EXPORT_SYMBOL(ipv6_sock_mc_join); int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex, const struct in6_addr *addr, unsigned int mode) { return __ipv6_sock_mc_join(sk, ifindex, addr, mode); } /* * socket leave on multicast group */ static void __ipv6_sock_mc_drop(struct sock *sk, struct ipv6_mc_socklist *mc_lst) { struct net *net = sock_net(sk); struct net_device *dev; dev = dev_get_by_index(net, mc_lst->ifindex); if (dev) { struct inet6_dev *idev = in6_dev_get(dev); ip6_mc_leave_src(sk, mc_lst, idev); if (idev) { __ipv6_dev_mc_dec(idev, &mc_lst->addr); in6_dev_put(idev); } dev_put(dev); } else { ip6_mc_leave_src(sk, mc_lst, NULL); } atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); kfree_rcu(mc_lst, rcu); } int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_mc_socklist __rcu **lnk; struct ipv6_mc_socklist *mc_lst; if (!ipv6_addr_is_multicast(addr)) return -EINVAL; for (lnk = &np->ipv6_mc_list; (mc_lst = sock_dereference(*lnk, sk)) != NULL; lnk = &mc_lst->next) { if ((ifindex == 0 || mc_lst->ifindex == ifindex) && ipv6_addr_equal(&mc_lst->addr, addr)) { *lnk = mc_lst->next; __ipv6_sock_mc_drop(sk, mc_lst); return 0; } } return -EADDRNOTAVAIL; } EXPORT_SYMBOL(ipv6_sock_mc_drop); static struct inet6_dev *ip6_mc_find_idev(struct net *net, const struct in6_addr *group, int ifindex) { struct net_device *dev; struct inet6_dev *idev; dev = ip6_mc_find_dev(net, group, ifindex); if (!dev) return NULL; idev = in6_dev_get(dev); dev_put(dev); return idev; } void __ipv6_sock_mc_close(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_mc_socklist *mc_lst; while ((mc_lst = sock_dereference(np->ipv6_mc_list, sk)) != NULL) { np->ipv6_mc_list = mc_lst->next; __ipv6_sock_mc_drop(sk, mc_lst); } } void ipv6_sock_mc_close(struct sock *sk) { struct ipv6_pinfo *np = 
inet6_sk(sk); if (!rcu_access_pointer(np->ipv6_mc_list)) return; lock_sock(sk); __ipv6_sock_mc_close(sk); release_sock(sk); } int ip6_mc_source(int add, int omode, struct sock *sk, struct group_source_req *pgsr) { struct ipv6_pinfo *inet6 = inet6_sk(sk); struct in6_addr *source, *group; struct net *net = sock_net(sk); struct ipv6_mc_socklist *pmc; struct ip6_sf_socklist *psl; struct inet6_dev *idev; int leavegroup = 0; int i, j, rv; int err; source = &((struct sockaddr_in6 *)&pgsr->gsr_source)->sin6_addr; group = &((struct sockaddr_in6 *)&pgsr->gsr_group)->sin6_addr; if (!ipv6_addr_is_multicast(group)) return -EINVAL; idev = ip6_mc_find_idev(net, group, pgsr->gsr_interface); if (!idev) return -ENODEV; mutex_lock(&idev->mc_lock); if (idev->dead) { err = -ENODEV; goto done; } err = -EADDRNOTAVAIL; for_each_pmc_socklock(inet6, sk, pmc) { if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface) continue; if (ipv6_addr_equal(&pmc->addr, group)) break; } if (!pmc) { /* must have a prior join */ err = -EINVAL; goto done; } /* if a source filter was set, must be the same mode as before */ if (rcu_access_pointer(pmc->sflist)) { if (pmc->sfmode != omode) { err = -EINVAL; goto done; } } else if (pmc->sfmode != omode) { /* allow mode switches for empty-set filters */ ip6_mc_add_src(idev, group, omode, 0, NULL, 0); ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); pmc->sfmode = omode; } psl = sock_dereference(pmc->sflist, sk); if (!add) { if (!psl) goto done; /* err = -EADDRNOTAVAIL */ rv = !0; for (i = 0; i < psl->sl_count; i++) { rv = !ipv6_addr_equal(&psl->sl_addr[i], source); if (rv == 0) break; } if (rv) /* source not found */ goto done; /* err = -EADDRNOTAVAIL */ /* special case - (INCLUDE, empty) == LEAVE_GROUP */ if (psl->sl_count == 1 && omode == MCAST_INCLUDE) { leavegroup = 1; goto done; } /* update the interface filter */ ip6_mc_del_src(idev, group, omode, 1, source, 1); for (j = i+1; j < psl->sl_count; j++) psl->sl_addr[j-1] = psl->sl_addr[j]; psl->sl_count--; err = 0; goto done; } /* else, add a new source to the filter */ if (psl && psl->sl_count >= sysctl_mld_max_msf) { err = -ENOBUFS; goto done; } if (!psl || psl->sl_count == psl->sl_max) { struct ip6_sf_socklist *newpsl; int count = IP6_SFBLOCK; if (psl) count += psl->sl_max; newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, count), GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; } newpsl->sl_max = count; newpsl->sl_count = count - IP6_SFBLOCK; if (psl) { for (i = 0; i < psl->sl_count; i++) newpsl->sl_addr[i] = psl->sl_addr[i]; atomic_sub(struct_size(psl, sl_addr, psl->sl_max), &sk->sk_omem_alloc); } rcu_assign_pointer(pmc->sflist, newpsl); kfree_rcu(psl, rcu); psl = newpsl; } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ for (i = 0; i < psl->sl_count; i++) { rv = !ipv6_addr_equal(&psl->sl_addr[i], source); if (rv == 0) /* There is an error in the address. 
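 * The source is already present in the filter, so adding it again
 * fails with the err (-EADDRNOTAVAIL) set earlier.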
*/ goto done; } for (j = psl->sl_count-1; j >= i; j--) psl->sl_addr[j+1] = psl->sl_addr[j]; psl->sl_addr[i] = *source; psl->sl_count++; err = 0; /* update the interface list */ ip6_mc_add_src(idev, group, omode, 1, source, 1); done: mutex_unlock(&idev->mc_lock); in6_dev_put(idev); if (leavegroup) err = ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group); return err; } int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, struct sockaddr_storage *list) { struct ipv6_pinfo *inet6 = inet6_sk(sk); struct ip6_sf_socklist *newpsl, *psl; struct net *net = sock_net(sk); const struct in6_addr *group; struct ipv6_mc_socklist *pmc; struct inet6_dev *idev; int leavegroup = 0; int i, err; group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr; if (!ipv6_addr_is_multicast(group)) return -EINVAL; if (gsf->gf_fmode != MCAST_INCLUDE && gsf->gf_fmode != MCAST_EXCLUDE) return -EINVAL; idev = ip6_mc_find_idev(net, group, gsf->gf_interface); if (!idev) return -ENODEV; mutex_lock(&idev->mc_lock); if (idev->dead) { err = -ENODEV; goto done; } err = 0; if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) { leavegroup = 1; goto done; } for_each_pmc_socklock(inet6, sk, pmc) { if (pmc->ifindex != gsf->gf_interface) continue; if (ipv6_addr_equal(&pmc->addr, group)) break; } if (!pmc) { /* must have a prior join */ err = -EINVAL; goto done; } if (gsf->gf_numsrc) { newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, gsf->gf_numsrc), GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; } newpsl->sl_max = newpsl->sl_count = gsf->gf_numsrc; for (i = 0; i < newpsl->sl_count; ++i, ++list) { struct sockaddr_in6 *psin6; psin6 = (struct sockaddr_in6 *)list; newpsl->sl_addr[i] = psin6->sin6_addr; } err = ip6_mc_add_src(idev, group, gsf->gf_fmode, newpsl->sl_count, newpsl->sl_addr, 0); if (err) { sock_kfree_s(sk, newpsl, struct_size(newpsl, sl_addr, newpsl->sl_max)); goto done; } } else { newpsl = NULL; ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0); } psl = sock_dereference(pmc->sflist, sk); if (psl) { ip6_mc_del_src(idev, group, pmc->sfmode, psl->sl_count, psl->sl_addr, 0); atomic_sub(struct_size(psl, sl_addr, psl->sl_max), &sk->sk_omem_alloc); } else { ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); } rcu_assign_pointer(pmc->sflist, newpsl); kfree_rcu(psl, rcu); pmc->sfmode = gsf->gf_fmode; err = 0; done: mutex_unlock(&idev->mc_lock); in6_dev_put(idev); if (leavegroup) err = ipv6_sock_mc_drop(sk, gsf->gf_interface, group); return err; } int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, sockptr_t optval, size_t ss_offset) { struct ipv6_pinfo *inet6 = inet6_sk(sk); const struct in6_addr *group; struct ipv6_mc_socklist *pmc; struct ip6_sf_socklist *psl; unsigned int count; int i, copycount; group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr; if (!ipv6_addr_is_multicast(group)) return -EINVAL; for_each_pmc_socklock(inet6, sk, pmc) { if (pmc->ifindex != gsf->gf_interface) continue; if (ipv6_addr_equal(group, &pmc->addr)) break; } if (!pmc) /* must have a prior join */ return -EADDRNOTAVAIL; gsf->gf_fmode = pmc->sfmode; psl = sock_dereference(pmc->sflist, sk); count = psl ? 
psl->sl_count : 0; copycount = min(count, gsf->gf_numsrc); gsf->gf_numsrc = count; for (i = 0; i < copycount; i++) { struct sockaddr_in6 *psin6; struct sockaddr_storage ss; psin6 = (struct sockaddr_in6 *)&ss; memset(&ss, 0, sizeof(ss)); psin6->sin6_family = AF_INET6; psin6->sin6_addr = psl->sl_addr[i]; if (copy_to_sockptr_offset(optval, ss_offset, &ss, sizeof(ss))) return -EFAULT; ss_offset += sizeof(ss); } return 0; } bool inet6_mc_check(const struct sock *sk, const struct in6_addr *mc_addr, const struct in6_addr *src_addr) { const struct ipv6_pinfo *np = inet6_sk(sk); const struct ipv6_mc_socklist *mc; const struct ip6_sf_socklist *psl; bool rv = true; rcu_read_lock(); for_each_pmc_rcu(np, mc) { if (ipv6_addr_equal(&mc->addr, mc_addr)) break; } if (!mc) { rcu_read_unlock(); return inet6_test_bit(MC6_ALL, sk); } psl = rcu_dereference(mc->sflist); if (!psl) { rv = mc->sfmode == MCAST_EXCLUDE; } else { int i; for (i = 0; i < psl->sl_count; i++) { if (ipv6_addr_equal(&psl->sl_addr[i], src_addr)) break; } if (mc->sfmode == MCAST_INCLUDE && i >= psl->sl_count) rv = false; if (mc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) rv = false; } rcu_read_unlock(); return rv; } static void igmp6_group_added(struct ifmcaddr6 *mc) { struct net_device *dev = mc->idev->dev; char buf[MAX_ADDR_LEN]; mc_assert_locked(mc->idev); if (IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) return; if (!(mc->mca_flags&MAF_LOADED)) { mc->mca_flags |= MAF_LOADED; if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0) dev_mc_add(dev, buf); } if (!(dev->flags & IFF_UP) || (mc->mca_flags & MAF_NOREPORT)) return; if (mld_in_v1_mode(mc->idev)) { igmp6_join_group(mc); return; } /* else v2 */ /* Based on RFC3810 6.1, for newly added INCLUDE SSM, we * should not send filter-mode change record as the mode * should be from IN() to IN(A). */ if (mc->mca_sfmode == MCAST_EXCLUDE) mc->mca_crcount = mc->idev->mc_qrv; mld_ifc_event(mc->idev); } static void igmp6_group_dropped(struct ifmcaddr6 *mc) { struct net_device *dev = mc->idev->dev; char buf[MAX_ADDR_LEN]; mc_assert_locked(mc->idev); if (IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) return; if (mc->mca_flags&MAF_LOADED) { mc->mca_flags &= ~MAF_LOADED; if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0) dev_mc_del(dev, buf); } if (mc->mca_flags & MAF_NOREPORT) return; if (!mc->idev->dead) igmp6_leave_group(mc); if (cancel_delayed_work(&mc->mca_work)) refcount_dec(&mc->mca_refcnt); } /* deleted ifmcaddr6 manipulation */ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) { struct ifmcaddr6 *pmc; mc_assert_locked(idev); /* this is an "ifmcaddr6" for convenience; only the fields below * are actually used. In particular, the refcnt and users are not * used for management of the delete list. Using the same structure * for deleted items allows change reports to use common code with * non-deleted or query-response MCA's. 
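 * The copy is linked onto idev->mc_tomb with mca_crcount set to the
 * robustness variable, so mld_send_cr() can still transmit the
 * remaining change records for the departed group.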
*/ pmc = kzalloc_obj(*pmc); if (!pmc) return; pmc->idev = im->idev; in6_dev_hold(idev); pmc->mca_addr = im->mca_addr; pmc->mca_crcount = idev->mc_qrv; pmc->mca_sfmode = im->mca_sfmode; if (pmc->mca_sfmode == MCAST_INCLUDE) { struct ip6_sf_list *psf; rcu_assign_pointer(pmc->mca_tomb, mc_dereference(im->mca_tomb, idev)); rcu_assign_pointer(pmc->mca_sources, mc_dereference(im->mca_sources, idev)); RCU_INIT_POINTER(im->mca_tomb, NULL); RCU_INIT_POINTER(im->mca_sources, NULL); for_each_psf_mclock(pmc, psf) psf->sf_crcount = pmc->mca_crcount; } rcu_assign_pointer(pmc->next, idev->mc_tomb); rcu_assign_pointer(idev->mc_tomb, pmc); } static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) { struct ip6_sf_list *psf, *sources, *tomb; struct in6_addr *pmca = &im->mca_addr; struct ifmcaddr6 *pmc, *pmc_prev; mc_assert_locked(idev); pmc_prev = NULL; for_each_mc_tomb(idev, pmc) { if (ipv6_addr_equal(&pmc->mca_addr, pmca)) break; pmc_prev = pmc; } if (!pmc) return; if (pmc_prev) rcu_assign_pointer(pmc_prev->next, pmc->next); else rcu_assign_pointer(idev->mc_tomb, pmc->next); im->idev = pmc->idev; if (im->mca_sfmode == MCAST_INCLUDE) { tomb = rcu_replace_pointer(im->mca_tomb, mc_dereference(pmc->mca_tomb, pmc->idev), lockdep_is_held(&im->idev->mc_lock)); rcu_assign_pointer(pmc->mca_tomb, tomb); sources = rcu_replace_pointer(im->mca_sources, mc_dereference(pmc->mca_sources, pmc->idev), lockdep_is_held(&im->idev->mc_lock)); rcu_assign_pointer(pmc->mca_sources, sources); for_each_psf_mclock(im, psf) psf->sf_crcount = idev->mc_qrv; } else { im->mca_crcount = idev->mc_qrv; } ip6_mc_clear_src(pmc); in6_dev_put(pmc->idev); kfree_rcu(pmc, rcu); } static void mld_clear_delrec(struct inet6_dev *idev) { struct ifmcaddr6 *pmc, *nextpmc; mc_assert_locked(idev); pmc = mc_dereference(idev->mc_tomb, idev); RCU_INIT_POINTER(idev->mc_tomb, NULL); for (; pmc; pmc = nextpmc) { nextpmc = mc_dereference(pmc->next, idev); ip6_mc_clear_src(pmc); in6_dev_put(pmc->idev); kfree_rcu(pmc, rcu); } /* clear dead sources, too */ for_each_mc_mclock(idev, pmc) { struct ip6_sf_list *psf, *psf_next; psf = mc_dereference(pmc->mca_tomb, idev); RCU_INIT_POINTER(pmc->mca_tomb, NULL); for (; psf; psf = psf_next) { psf_next = mc_dereference(psf->sf_next, idev); kfree_rcu(psf, rcu); } } } static void mld_clear_query(struct inet6_dev *idev) { spin_lock_bh(&idev->mc_query_lock); __skb_queue_purge(&idev->mc_query_queue); spin_unlock_bh(&idev->mc_query_lock); } static void mld_clear_report(struct inet6_dev *idev) { spin_lock_bh(&idev->mc_report_lock); __skb_queue_purge(&idev->mc_report_queue); spin_unlock_bh(&idev->mc_report_lock); } static void ma_put(struct ifmcaddr6 *mc) { if (refcount_dec_and_test(&mc->mca_refcnt)) { in6_dev_put(mc->idev); kfree_rcu(mc, rcu); } } static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, const struct in6_addr *addr, unsigned int mode) { struct ifmcaddr6 *mc; mc_assert_locked(idev); mc = kzalloc_obj(*mc); if (!mc) return NULL; INIT_DELAYED_WORK(&mc->mca_work, mld_mca_work); mc->mca_addr = *addr; mc->idev = idev; /* reference taken by caller */ mc->mca_users = 1; /* mca_stamp should be updated upon changes */ mc->mca_cstamp = mc->mca_tstamp = jiffies; refcount_set(&mc->mca_refcnt, 1); mc->mca_sfmode = mode; mc->mca_sfcount[mode] = 1; if (ipv6_addr_is_ll_all_nodes(&mc->mca_addr) || IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) mc->mca_flags |= MAF_NOREPORT; return mc; } static void inet6_ifmcaddr_notify(struct net_device *dev, const struct ifmcaddr6 *ifmca, int event) { struct 
inet6_fill_args fillargs = { .portid = 0, .seq = 0, .event = event, .flags = 0, .netnsid = -1, .force_rt_scope_universe = true, }; struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOMEM; skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + nla_total_size(sizeof(struct in6_addr)) + nla_total_size(sizeof(struct ifa_cacheinfo)), GFP_KERNEL); if (!skb) goto error; err = inet6_fill_ifmcaddr(skb, ifmca, &fillargs); if (err < 0) { WARN_ON_ONCE(err == -EMSGSIZE); nlmsg_free(skb); goto error; } rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MCADDR, NULL, GFP_KERNEL); return; error: rtnl_set_sk_err(net, RTNLGRP_IPV6_MCADDR, err); } /* * device multicast group inc (add if not found) */ static int __ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr, unsigned int mode) { struct inet6_dev *idev; struct ifmcaddr6 *mc; /* we need to take a reference on idev */ idev = in6_dev_get(dev); if (!idev) return -EINVAL; mutex_lock(&idev->mc_lock); if (READ_ONCE(idev->dead)) { mutex_unlock(&idev->mc_lock); in6_dev_put(idev); return -ENODEV; } for_each_mc_mclock(idev, mc) { if (ipv6_addr_equal(&mc->mca_addr, addr)) { mc->mca_users++; ip6_mc_add_src(idev, &mc->mca_addr, mode, 0, NULL, 0); mutex_unlock(&idev->mc_lock); in6_dev_put(idev); return 0; } } mc = mca_alloc(idev, addr, mode); if (!mc) { mutex_unlock(&idev->mc_lock); in6_dev_put(idev); return -ENOMEM; } rcu_assign_pointer(mc->next, idev->mc_list); rcu_assign_pointer(idev->mc_list, mc); mld_del_delrec(idev, mc); igmp6_group_added(mc); inet6_ifmcaddr_notify(dev, mc, RTM_NEWMULTICAST); mutex_unlock(&idev->mc_lock); return 0; } int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) { return __ipv6_dev_mc_inc(dev, addr, MCAST_EXCLUDE); } EXPORT_SYMBOL(ipv6_dev_mc_inc); /* * device multicast group del */ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifmcaddr6 *ma, __rcu **map; mutex_lock(&idev->mc_lock); for (map = &idev->mc_list; (ma = mc_dereference(*map, idev)); map = &ma->next) { if (ipv6_addr_equal(&ma->mca_addr, addr)) { if (--ma->mca_users == 0) { *map = ma->next; igmp6_group_dropped(ma); inet6_ifmcaddr_notify(idev->dev, ma, RTM_DELMULTICAST); ip6_mc_clear_src(ma); mutex_unlock(&idev->mc_lock); ma_put(ma); return 0; } mutex_unlock(&idev->mc_lock); return 0; } } mutex_unlock(&idev->mc_lock); return -ENOENT; } int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr) { struct inet6_dev *idev; int err; idev = in6_dev_get(dev); if (!idev) return -ENODEV; err = __ipv6_dev_mc_dec(idev, addr); in6_dev_put(idev); return err; } EXPORT_SYMBOL(ipv6_dev_mc_dec); /* * check if the interface/address pair is valid */ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, const struct in6_addr *src_addr) { struct inet6_dev *idev; struct ifmcaddr6 *mc; bool rv = false; rcu_read_lock(); idev = __in6_dev_get(dev); if (!idev) goto unlock; for_each_mc_rcu(idev, mc) { if (ipv6_addr_equal(&mc->mca_addr, group)) break; } if (!mc) goto unlock; if (src_addr && !ipv6_addr_any(src_addr)) { struct ip6_sf_list *psf; for_each_psf_rcu(mc, psf) { if (ipv6_addr_equal(&psf->sf_addr, src_addr)) break; } if (psf) rv = READ_ONCE(psf->sf_count[MCAST_INCLUDE]) || READ_ONCE(psf->sf_count[MCAST_EXCLUDE]) != READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]); else rv = READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]) != 0; } else { rv = true; /* don't filter unspecified source */ } unlock: rcu_read_unlock(); return rv; } static void mld_gq_start_work(struct inet6_dev *idev) { unsigned long tv = 
get_random_u32_below(idev->mc_maxdelay); mc_assert_locked(idev); idev->mc_gq_running = 1; if (!mod_delayed_work(mld_wq, &idev->mc_gq_work, tv + 2)) in6_dev_hold(idev); } static void mld_gq_stop_work(struct inet6_dev *idev) { mc_assert_locked(idev); idev->mc_gq_running = 0; if (cancel_delayed_work(&idev->mc_gq_work)) __in6_dev_put(idev); } static void mld_ifc_start_work(struct inet6_dev *idev, unsigned long delay) { unsigned long tv = get_random_u32_below(delay); mc_assert_locked(idev); if (!mod_delayed_work(mld_wq, &idev->mc_ifc_work, tv + 2)) in6_dev_hold(idev); } static void mld_ifc_stop_work(struct inet6_dev *idev) { mc_assert_locked(idev); idev->mc_ifc_count = 0; if (cancel_delayed_work(&idev->mc_ifc_work)) __in6_dev_put(idev); } static void mld_dad_start_work(struct inet6_dev *idev, unsigned long delay) { unsigned long tv = get_random_u32_below(delay); mc_assert_locked(idev); if (!mod_delayed_work(mld_wq, &idev->mc_dad_work, tv + 2)) in6_dev_hold(idev); } static void mld_dad_stop_work(struct inet6_dev *idev) { if (cancel_delayed_work(&idev->mc_dad_work)) __in6_dev_put(idev); } static void mld_query_stop_work(struct inet6_dev *idev) { spin_lock_bh(&idev->mc_query_lock); if (cancel_delayed_work(&idev->mc_query_work)) __in6_dev_put(idev); spin_unlock_bh(&idev->mc_query_lock); } static void mld_report_stop_work(struct inet6_dev *idev) { if (cancel_delayed_work_sync(&idev->mc_report_work)) __in6_dev_put(idev); } /* IGMP handling (alias multicast ICMPv6 messages) */ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) { unsigned long delay = resptime; mc_assert_locked(ma->idev); /* Do not start work for these addresses */ if (ipv6_addr_is_ll_all_nodes(&ma->mca_addr) || IPV6_ADDR_MC_SCOPE(&ma->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) return; if (cancel_delayed_work(&ma->mca_work)) { refcount_dec(&ma->mca_refcnt); delay = ma->mca_work.timer.expires - jiffies; } if (delay >= resptime) delay = get_random_u32_below(resptime); if (!mod_delayed_work(mld_wq, &ma->mca_work, delay)) refcount_inc(&ma->mca_refcnt); ma->mca_flags |= MAF_TIMER_RUNNING; } /* mark EXCLUDE-mode sources */ static bool mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, const struct in6_addr *srcs) { struct ip6_sf_list *psf; int i, scount; mc_assert_locked(pmc->idev); scount = 0; for_each_psf_mclock(pmc, psf) { if (scount == nsrcs) break; for (i = 0; i < nsrcs; i++) { /* skip inactive filters */ if (psf->sf_count[MCAST_INCLUDE] || pmc->mca_sfcount[MCAST_EXCLUDE] != psf->sf_count[MCAST_EXCLUDE]) break; if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) { scount++; break; } } } pmc->mca_flags &= ~MAF_GSQUERY; if (scount == nsrcs) /* all sources excluded */ return false; return true; } static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, const struct in6_addr *srcs) { struct ip6_sf_list *psf; int i, scount; mc_assert_locked(pmc->idev); if (pmc->mca_sfmode == MCAST_EXCLUDE) return mld_xmarksources(pmc, nsrcs, srcs); /* mark INCLUDE-mode sources */ scount = 0; for_each_psf_mclock(pmc, psf) { if (scount == nsrcs) break; for (i = 0; i < nsrcs; i++) { if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) { psf->sf_gsresp = 1; scount++; break; } } } if (!scount) { pmc->mca_flags &= ~MAF_GSQUERY; return false; } pmc->mca_flags |= MAF_GSQUERY; return true; } static int mld_force_mld_version(const struct inet6_dev *idev) { const struct net *net = dev_net(idev->dev); int all_force; all_force = READ_ONCE(net->ipv6.devconf_all->force_mld_version); /* Normally, both are 0 here. 
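 * ("both" meaning .../conf/all/force_mld_version and the per-device
 * cnf.force_mld_version consulted below.)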
If enforcement to a particular value is * being used, individual device enforcement will have a lower * precedence over 'all' device (.../conf/all/force_mld_version). */ return all_force ?: READ_ONCE(idev->cnf.force_mld_version); } static bool mld_in_v2_mode_only(const struct inet6_dev *idev) { return mld_force_mld_version(idev) == 2; } static bool mld_in_v1_mode_only(const struct inet6_dev *idev) { return mld_force_mld_version(idev) == 1; } static bool mld_in_v1_mode(const struct inet6_dev *idev) { if (mld_in_v2_mode_only(idev)) return false; if (mld_in_v1_mode_only(idev)) return true; if (idev->mc_v1_seen && time_before(jiffies, idev->mc_v1_seen)) return true; return false; } static void mld_set_v1_mode(struct inet6_dev *idev) { /* RFC3810, relevant sections: * - 9.1. Robustness Variable * - 9.2. Query Interval * - 9.3. Query Response Interval * - 9.12. Older Version Querier Present Timeout */ unsigned long switchback; switchback = (idev->mc_qrv * idev->mc_qi) + idev->mc_qri; idev->mc_v1_seen = jiffies + switchback; } static void mld_update_qrv(struct inet6_dev *idev, const struct mld2_query *mlh2) { /* RFC3810, relevant sections: * - 5.1.8. QRV (Querier's Robustness Variable) * - 9.1. Robustness Variable */ /* The value of the Robustness Variable MUST NOT be zero, * and SHOULD NOT be one. Catch this here if we ever run * into such a case in future. */ const int min_qrv = min(MLD_QRV_DEFAULT, sysctl_mld_qrv); WARN_ON(idev->mc_qrv == 0); if (mlh2->mld2q_qrv > 0) idev->mc_qrv = mlh2->mld2q_qrv; if (unlikely(idev->mc_qrv < min_qrv)) { net_warn_ratelimited("IPv6: MLD: clamping QRV from %u to %u!\n", idev->mc_qrv, min_qrv); idev->mc_qrv = min_qrv; } } static void mld_update_qi(struct inet6_dev *idev, const struct mld2_query *mlh2) { /* RFC3810, relevant sections: * - 5.1.9. QQIC (Querier's Query Interval Code) * - 9.2. Query Interval * - 9.12. Older Version Querier Present Timeout * (the [Query Interval] in the last Query received) */ unsigned long mc_qqi; if (mlh2->mld2q_qqic < 128) { mc_qqi = mlh2->mld2q_qqic; } else { unsigned long mc_man, mc_exp; mc_exp = MLDV2_QQIC_EXP(mlh2->mld2q_qqic); mc_man = MLDV2_QQIC_MAN(mlh2->mld2q_qqic); mc_qqi = (mc_man | 0x10) << (mc_exp + 3); } idev->mc_qi = mc_qqi * HZ; } static void mld_update_qri(struct inet6_dev *idev, const struct mld2_query *mlh2) { /* RFC3810, relevant sections: * - 5.1.3. Maximum Response Code * - 9.3. Query Response Interval */ idev->mc_qri = msecs_to_jiffies(mldv2_mrc(mlh2)); } static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld, unsigned long *max_delay, bool v1_query) { unsigned long mldv1_md; /* Ignore v1 queries */ if (mld_in_v2_mode_only(idev)) return -EINVAL; mldv1_md = ntohs(mld->mld_maxdelay); /* When in MLDv1 fallback and a MLDv2 router start-up being * unaware of current MLDv1 operation, the MRC == MRD mapping * only works when the exponential algorithm is not being * used (as MLDv1 is unaware of such things). * * According to the RFC author, the MLDv2 implementations * he's aware of all use a MRC < 32768 on start up queries. * * Thus, should we *ever* encounter something else larger * than that, just assume the maximum possible within our * reach. */ if (!v1_query) mldv1_md = min(mldv1_md, MLDV1_MRD_MAX_COMPAT); *max_delay = max(msecs_to_jiffies(mldv1_md), 1UL); /* MLDv1 router present: we need to go into v1 mode *only* * when an MLDv1 query is received as per section 9.12. of * RFC3810! And we know from RFC2710 section 3.7 that MLDv1 * queries MUST be of exactly 24 octets.
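 * The caller derives v1_query from exactly that length check
 * (len == MLD_V1_QUERY_LEN), and only then is the Older Version
 * Querier Present timeout refreshed below.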
*/ if (v1_query) mld_set_v1_mode(idev); /* cancel MLDv2 report work */ mld_gq_stop_work(idev); /* cancel the interface change work */ mld_ifc_stop_work(idev); /* clear deleted report items */ mld_clear_delrec(idev); return 0; } static void mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld, unsigned long *max_delay) { *max_delay = max(msecs_to_jiffies(mldv2_mrc(mld)), 1UL); mld_update_qrv(idev, mld); mld_update_qi(idev, mld); mld_update_qri(idev, mld); idev->mc_maxdelay = *max_delay; return; } /* called with rcu_read_lock() */ void igmp6_event_query(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); if (!idev || idev->dead) goto out; spin_lock_bh(&idev->mc_query_lock); if (skb_queue_len(&idev->mc_query_queue) < MLD_MAX_SKBS) { __skb_queue_tail(&idev->mc_query_queue, skb); if (!mod_delayed_work(mld_wq, &idev->mc_query_work, 0)) in6_dev_hold(idev); skb = NULL; } spin_unlock_bh(&idev->mc_query_lock); out: kfree_skb(skb); } static void __mld_query_work(struct sk_buff *skb) { struct mld2_query *mlh2 = NULL; const struct in6_addr *group; unsigned long max_delay; struct inet6_dev *idev; struct ifmcaddr6 *ma; struct mld_msg *mld; int group_type; int mark = 0; int len, err; if (!pskb_may_pull(skb, sizeof(struct in6_addr))) goto kfree_skb; /* compute payload length excluding extension headers */ len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr); len -= skb_network_header_len(skb); /* RFC3810 6.2 * Upon reception of an MLD message that contains a Query, the node * checks if the source address of the message is a valid link-local * address, if the Hop Limit is set to 1, and if the Router Alert * option is present in the Hop-By-Hop Options header of the IPv6 * packet. If any of these checks fails, the packet is dropped. 
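 * The Router Alert option is recorded by the Hop-by-Hop option parser
 * in IP6CB(skb), so the checks below only consult the IPv6 header and
 * the skb control block.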
*/ if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL) || ipv6_hdr(skb)->hop_limit != 1 || !(IP6CB(skb)->flags & IP6SKB_ROUTERALERT) || IP6CB(skb)->ra != htons(IPV6_OPT_ROUTERALERT_MLD)) goto kfree_skb; idev = in6_dev_get(skb->dev); if (!idev) goto kfree_skb; mld = (struct mld_msg *)icmp6_hdr(skb); group = &mld->mld_mca; group_type = ipv6_addr_type(group); if (group_type != IPV6_ADDR_ANY && !(group_type&IPV6_ADDR_MULTICAST)) goto out; if (len < MLD_V1_QUERY_LEN) { goto out; } else if (len == MLD_V1_QUERY_LEN || mld_in_v1_mode(idev)) { err = mld_process_v1(idev, mld, &max_delay, len == MLD_V1_QUERY_LEN); if (err < 0) goto out; } else if (len >= MLD_V2_QUERY_LEN_MIN) { int srcs_offset = sizeof(struct mld2_query) - sizeof(struct icmp6hdr); if (!pskb_may_pull(skb, srcs_offset)) goto out; mlh2 = (struct mld2_query *)skb_transport_header(skb); mld_process_v2(idev, mlh2, &max_delay); if (group_type == IPV6_ADDR_ANY) { /* general query */ if (mlh2->mld2q_nsrcs) goto out; /* no sources allowed */ mld_gq_start_work(idev); goto out; } /* mark sources to include, if group & source-specific */ if (mlh2->mld2q_nsrcs != 0) { if (!pskb_may_pull(skb, srcs_offset + ntohs(mlh2->mld2q_nsrcs) * sizeof(struct in6_addr))) goto out; mlh2 = (struct mld2_query *)skb_transport_header(skb); mark = 1; } } else { goto out; } if (group_type == IPV6_ADDR_ANY) { for_each_mc_mclock(idev, ma) { igmp6_group_queried(ma, max_delay); } } else { for_each_mc_mclock(idev, ma) { if (!ipv6_addr_equal(group, &ma->mca_addr)) continue; if (ma->mca_flags & MAF_TIMER_RUNNING) { /* gsquery <- gsquery && mark */ if (!mark) ma->mca_flags &= ~MAF_GSQUERY; } else { /* gsquery <- mark */ if (mark) ma->mca_flags |= MAF_GSQUERY; else ma->mca_flags &= ~MAF_GSQUERY; } if (!(ma->mca_flags & MAF_GSQUERY) || mld_marksources(ma, ntohs(mlh2->mld2q_nsrcs), mlh2->mld2q_srcs)) igmp6_group_queried(ma, max_delay); break; } } out: in6_dev_put(idev); kfree_skb: consume_skb(skb); } static void mld_query_work(struct work_struct *work) { struct inet6_dev *idev = container_of(to_delayed_work(work), struct inet6_dev, mc_query_work); struct sk_buff_head q; struct sk_buff *skb; bool rework = false; int cnt = 0; skb_queue_head_init(&q); spin_lock_bh(&idev->mc_query_lock); while ((skb = __skb_dequeue(&idev->mc_query_queue))) { __skb_queue_tail(&q, skb); if (++cnt >= MLD_MAX_QUEUE) { rework = true; break; } } spin_unlock_bh(&idev->mc_query_lock); mutex_lock(&idev->mc_lock); while ((skb = __skb_dequeue(&q))) __mld_query_work(skb); mutex_unlock(&idev->mc_lock); if (rework && queue_delayed_work(mld_wq, &idev->mc_query_work, 0)) return; in6_dev_put(idev); } /* called with rcu_read_lock() */ void igmp6_event_report(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); if (!idev || idev->dead) goto out; spin_lock_bh(&idev->mc_report_lock); if (skb_queue_len(&idev->mc_report_queue) < MLD_MAX_SKBS) { __skb_queue_tail(&idev->mc_report_queue, skb); if (!mod_delayed_work(mld_wq, &idev->mc_report_work, 0)) in6_dev_hold(idev); skb = NULL; } spin_unlock_bh(&idev->mc_report_lock); out: kfree_skb(skb); } static void __mld_report_work(struct sk_buff *skb) { struct inet6_dev *idev; struct ifmcaddr6 *ma; struct mld_msg *mld; int addr_type; /* Our own report looped back. Ignore it. 
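 * (locally transmitted multicast is delivered back to us with
 * pkt_type set to PACKET_LOOPBACK)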
*/ if (skb->pkt_type == PACKET_LOOPBACK) goto kfree_skb; /* send our report if the MC router may not have heard this report */ if (skb->pkt_type != PACKET_MULTICAST && skb->pkt_type != PACKET_BROADCAST) goto kfree_skb; if (!pskb_may_pull(skb, sizeof(*mld) - sizeof(struct icmp6hdr))) goto kfree_skb; mld = (struct mld_msg *)icmp6_hdr(skb); /* Drop reports with not link local source */ addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr); if (addr_type != IPV6_ADDR_ANY && !(addr_type&IPV6_ADDR_LINKLOCAL)) goto kfree_skb; idev = in6_dev_get(skb->dev); if (!idev) goto kfree_skb; /* * Cancel the work for this group */ for_each_mc_mclock(idev, ma) { if (ipv6_addr_equal(&ma->mca_addr, &mld->mld_mca)) { if (cancel_delayed_work(&ma->mca_work)) refcount_dec(&ma->mca_refcnt); ma->mca_flags &= ~(MAF_LAST_REPORTER | MAF_TIMER_RUNNING); break; } } in6_dev_put(idev); kfree_skb: consume_skb(skb); } static void mld_report_work(struct work_struct *work) { struct inet6_dev *idev = container_of(to_delayed_work(work), struct inet6_dev, mc_report_work); struct sk_buff_head q; struct sk_buff *skb; bool rework = false; int cnt = 0; skb_queue_head_init(&q); spin_lock_bh(&idev->mc_report_lock); while ((skb = __skb_dequeue(&idev->mc_report_queue))) { __skb_queue_tail(&q, skb); if (++cnt >= MLD_MAX_QUEUE) { rework = true; break; } } spin_unlock_bh(&idev->mc_report_lock); mutex_lock(&idev->mc_lock); while ((skb = __skb_dequeue(&q))) __mld_report_work(skb); mutex_unlock(&idev->mc_lock); if (rework && queue_delayed_work(mld_wq, &idev->mc_report_work, 0)) return; in6_dev_put(idev); } static bool is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type, int gdeleted, int sdeleted) { switch (type) { case MLD2_MODE_IS_INCLUDE: case MLD2_MODE_IS_EXCLUDE: if (gdeleted || sdeleted) return false; if (!((pmc->mca_flags & MAF_GSQUERY) && !psf->sf_gsresp)) { if (pmc->mca_sfmode == MCAST_INCLUDE) return true; /* don't include if this source is excluded * in all filters */ if (psf->sf_count[MCAST_INCLUDE]) return type == MLD2_MODE_IS_INCLUDE; return pmc->mca_sfcount[MCAST_EXCLUDE] == psf->sf_count[MCAST_EXCLUDE]; } return false; case MLD2_CHANGE_TO_INCLUDE: if (gdeleted || sdeleted) return false; return psf->sf_count[MCAST_INCLUDE] != 0; case MLD2_CHANGE_TO_EXCLUDE: if (gdeleted || sdeleted) return false; if (pmc->mca_sfcount[MCAST_EXCLUDE] == 0 || psf->sf_count[MCAST_INCLUDE]) return false; return pmc->mca_sfcount[MCAST_EXCLUDE] == psf->sf_count[MCAST_EXCLUDE]; case MLD2_ALLOW_NEW_SOURCES: if (gdeleted || !psf->sf_crcount) return false; return (pmc->mca_sfmode == MCAST_INCLUDE) ^ sdeleted; case MLD2_BLOCK_OLD_SOURCES: if (pmc->mca_sfmode == MCAST_INCLUDE) return gdeleted || (psf->sf_crcount && sdeleted); return psf->sf_crcount && !gdeleted && !sdeleted; } return false; } static int mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted) { struct ip6_sf_list *psf; int scount = 0; for_each_psf_mclock(pmc, psf) { if (!is_in(pmc, psf, type, gdeleted, sdeleted)) continue; scount++; } return scount; } static void ip6_mc_hdr(const struct sock *sk, struct sk_buff *skb, struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr, int proto, int len) { struct ipv6hdr *hdr; skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; skb_reset_network_header(skb); skb_put(skb, sizeof(struct ipv6hdr)); hdr = ipv6_hdr(skb); ip6_flow_hdr(hdr, 0, 0); hdr->payload_len = htons(len); hdr->nexthdr = proto; hdr->hop_limit = READ_ONCE(inet6_sk(sk)->hop_limit); hdr->saddr = *saddr; hdr->daddr = *daddr; } static struct 
sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) { u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, 2, 0, 0, IPV6_TLV_PADN, 0 }; struct net_device *dev = idev->dev; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; const struct in6_addr *saddr; struct in6_addr addr_buf; struct mld2_report *pmr; struct sk_buff *skb; unsigned int size; struct sock *sk; struct net *net; /* we assume size > sizeof(ra) here * Also try to not allocate high-order pages for big MTU */ size = min_t(int, mtu, PAGE_SIZE / 2) + hlen + tlen; skb = alloc_skb(size, GFP_KERNEL); if (!skb) return NULL; skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, hlen); skb_tailroom_reserve(skb, mtu, tlen); rcu_read_lock(); net = dev_net_rcu(dev); sk = net->ipv6.igmp_sk; skb_set_owner_w(skb, sk); if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { /* <draft-ietf-magma-mld-source-05.txt>: * use unspecified address as the source address * when a valid link-local address is not available. */ saddr = &in6addr_any; } else saddr = &addr_buf; ip6_mc_hdr(sk, skb, dev, saddr, &mld2_all_mcr, NEXTHDR_HOP, 0); rcu_read_unlock(); skb_put_data(skb, ra, sizeof(ra)); skb_set_transport_header(skb, skb_tail_pointer(skb) - skb->data); skb_put(skb, sizeof(*pmr)); pmr = (struct mld2_report *)skb_transport_header(skb); pmr->mld2r_type = ICMPV6_MLD2_REPORT; pmr->mld2r_resv1 = 0; pmr->mld2r_cksum = 0; pmr->mld2r_resv2 = 0; pmr->mld2r_ngrec = 0; return skb; } static void mld_sendpack(struct sk_buff *skb) { struct ipv6hdr *pip6 = ipv6_hdr(skb); struct mld2_report *pmr = (struct mld2_report *)skb_transport_header(skb); int payload_len, mldlen; struct inet6_dev *idev; struct net *net = dev_net(skb->dev); int err; struct flowi6 fl6; struct dst_entry *dst; rcu_read_lock(); idev = __in6_dev_get(skb->dev); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); payload_len = (skb_tail_pointer(skb) - skb_network_header(skb)) - sizeof(*pip6); mldlen = skb_tail_pointer(skb) - skb_transport_header(skb); pip6->payload_len = htons(payload_len); pmr->mld2r_cksum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen, IPPROTO_ICMPV6, csum_partial(skb_transport_header(skb), mldlen, 0)); icmpv6_flow_init(net->ipv6.igmp_sk, &fl6, ICMPV6_MLD2_REPORT, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->dev->ifindex); dst = icmp6_dst_alloc(skb->dev, &fl6); err = 0; if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; } skb_dst_set(skb, dst); if (err) goto err_out; err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, net->ipv6.igmp_sk, skb, NULL, skb->dev, dst_output); out: if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, ICMPV6_MLD2_REPORT); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); } else { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); } rcu_read_unlock(); return; err_out: kfree_skb(skb); goto out; } static int grec_size(struct ifmcaddr6 *pmc, int type, int gdel, int sdel) { return sizeof(struct mld2_grec) + 16 * mld_scount(pmc,type,gdel,sdel); } static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, int type, struct mld2_grec **ppgr, unsigned int mtu) { struct mld2_report *pmr; struct mld2_grec *pgr; if (!skb) { skb = mld_newpack(pmc->idev, mtu); if (!skb) return NULL; } pgr = skb_put(skb, sizeof(struct mld2_grec)); pgr->grec_type = type; pgr->grec_auxwords = 0; pgr->grec_nsrcs = 0; pgr->grec_mca = pmc->mca_addr; /* structure copy */ pmr = (struct mld2_report *)skb_transport_header(skb); pmr->mld2r_ngrec = htons(ntohs(pmr->mld2r_ngrec)+1); *ppgr = pgr; return skb; } #define AVAILABLE(skb) ((skb) ? 
skb_availroom(skb) : 0) static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted, int crsend) { struct ip6_sf_list *psf, *psf_prev, *psf_next; int scount, stotal, first, isquery, truncate; struct ip6_sf_list __rcu **psf_list; struct inet6_dev *idev = pmc->idev; struct net_device *dev = idev->dev; struct mld2_grec *pgr = NULL; struct mld2_report *pmr; unsigned int mtu; mc_assert_locked(idev); if (pmc->mca_flags & MAF_NOREPORT) return skb; mtu = READ_ONCE(dev->mtu); if (mtu < IPV6_MIN_MTU) return skb; isquery = type == MLD2_MODE_IS_INCLUDE || type == MLD2_MODE_IS_EXCLUDE; truncate = type == MLD2_MODE_IS_EXCLUDE || type == MLD2_CHANGE_TO_EXCLUDE; stotal = scount = 0; psf_list = sdeleted ? &pmc->mca_tomb : &pmc->mca_sources; if (!rcu_access_pointer(*psf_list)) goto empty_source; pmr = skb ? (struct mld2_report *)skb_transport_header(skb) : NULL; /* EX and TO_EX get a fresh packet, if needed */ if (truncate) { if (pmr && pmr->mld2r_ngrec && AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { if (skb) mld_sendpack(skb); skb = mld_newpack(idev, mtu); } } first = 1; psf_prev = NULL; for (psf = mc_dereference(*psf_list, idev); psf; psf = psf_next) { struct in6_addr *psrc; psf_next = mc_dereference(psf->sf_next, idev); if (!is_in(pmc, psf, type, gdeleted, sdeleted) && !crsend) { psf_prev = psf; continue; } /* Based on RFC3810 6.1. Should not send source-list change * records when there is a filter mode change. */ if (((gdeleted && pmc->mca_sfmode == MCAST_EXCLUDE) || (!gdeleted && pmc->mca_crcount)) && (type == MLD2_ALLOW_NEW_SOURCES || type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount) goto decrease_sf_crcount; /* clear marks on query responses */ if (isquery) psf->sf_gsresp = 0; if (AVAILABLE(skb) < sizeof(*psrc) + first*sizeof(struct mld2_grec)) { if (truncate && !first) break; /* truncate these */ if (pgr) pgr->grec_nsrcs = htons(scount); if (skb) mld_sendpack(skb); skb = mld_newpack(idev, mtu); first = 1; scount = 0; } if (first) { skb = add_grhead(skb, pmc, type, &pgr, mtu); first = 0; } if (!skb) return NULL; psrc = skb_put(skb, sizeof(*psrc)); *psrc = psf->sf_addr; scount++; stotal++; if ((type == MLD2_ALLOW_NEW_SOURCES || type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount) { decrease_sf_crcount: psf->sf_crcount--; if ((sdeleted || gdeleted) && psf->sf_crcount == 0) { if (psf_prev) rcu_assign_pointer(psf_prev->sf_next, mc_dereference(psf->sf_next, idev)); else rcu_assign_pointer(*psf_list, mc_dereference(psf->sf_next, idev)); kfree_rcu(psf, rcu); continue; } } psf_prev = psf; } empty_source: if (!stotal) { if (type == MLD2_ALLOW_NEW_SOURCES || type == MLD2_BLOCK_OLD_SOURCES) return skb; if (pmc->mca_crcount || isquery || crsend) { /* make sure we have room for group header */ if (skb && AVAILABLE(skb) < sizeof(struct mld2_grec)) { mld_sendpack(skb); skb = NULL; /* add_grhead will get a new one */ } skb = add_grhead(skb, pmc, type, &pgr, mtu); } } if (pgr) pgr->grec_nsrcs = htons(scount); if (isquery) pmc->mca_flags &= ~MAF_GSQUERY; /* clear query state */ return skb; } static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc) { struct sk_buff *skb = NULL; int type; mc_assert_locked(idev); if (!pmc) { for_each_mc_mclock(idev, pmc) { if (pmc->mca_flags & MAF_NOREPORT) continue; if (pmc->mca_sfcount[MCAST_EXCLUDE]) type = MLD2_MODE_IS_EXCLUDE; else type = MLD2_MODE_IS_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0, 0); } } else { if (pmc->mca_sfcount[MCAST_EXCLUDE]) type = MLD2_MODE_IS_EXCLUDE; else type = 
MLD2_MODE_IS_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0, 0); } if (skb) mld_sendpack(skb); } /* remove zero-count source records from a source filter list */ static void mld_clear_zeros(struct ip6_sf_list __rcu **ppsf, struct inet6_dev *idev) { struct ip6_sf_list *psf_prev, *psf_next, *psf; psf_prev = NULL; for (psf = mc_dereference(*ppsf, idev); psf; psf = psf_next) { psf_next = mc_dereference(psf->sf_next, idev); if (psf->sf_crcount == 0) { if (psf_prev) rcu_assign_pointer(psf_prev->sf_next, mc_dereference(psf->sf_next, idev)); else rcu_assign_pointer(*ppsf, mc_dereference(psf->sf_next, idev)); kfree_rcu(psf, rcu); } else { psf_prev = psf; } } } static void mld_send_cr(struct inet6_dev *idev) { struct ifmcaddr6 *pmc, *pmc_prev, *pmc_next; struct sk_buff *skb = NULL; int type, dtype; /* deleted MCA's */ pmc_prev = NULL; for (pmc = mc_dereference(idev->mc_tomb, idev); pmc; pmc = pmc_next) { pmc_next = mc_dereference(pmc->next, idev); if (pmc->mca_sfmode == MCAST_INCLUDE) { type = MLD2_BLOCK_OLD_SOURCES; dtype = MLD2_BLOCK_OLD_SOURCES; skb = add_grec(skb, pmc, type, 1, 0, 0); skb = add_grec(skb, pmc, dtype, 1, 1, 0); } if (pmc->mca_crcount) { if (pmc->mca_sfmode == MCAST_EXCLUDE) { type = MLD2_CHANGE_TO_INCLUDE; skb = add_grec(skb, pmc, type, 1, 0, 0); } pmc->mca_crcount--; if (pmc->mca_crcount == 0) { mld_clear_zeros(&pmc->mca_tomb, idev); mld_clear_zeros(&pmc->mca_sources, idev); } } if (pmc->mca_crcount == 0 && !rcu_access_pointer(pmc->mca_tomb) && !rcu_access_pointer(pmc->mca_sources)) { if (pmc_prev) rcu_assign_pointer(pmc_prev->next, pmc_next); else rcu_assign_pointer(idev->mc_tomb, pmc_next); in6_dev_put(pmc->idev); kfree_rcu(pmc, rcu); } else pmc_prev = pmc; } /* change recs */ for_each_mc_mclock(idev, pmc) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) { type = MLD2_BLOCK_OLD_SOURCES; dtype = MLD2_ALLOW_NEW_SOURCES; } else { type = MLD2_ALLOW_NEW_SOURCES; dtype = MLD2_BLOCK_OLD_SOURCES; } skb = add_grec(skb, pmc, type, 0, 0, 0); skb = add_grec(skb, pmc, dtype, 0, 1, 0); /* deleted sources */ /* filter mode changes */ if (pmc->mca_crcount) { if (pmc->mca_sfmode == MCAST_EXCLUDE) type = MLD2_CHANGE_TO_EXCLUDE; else type = MLD2_CHANGE_TO_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0, 0); pmc->mca_crcount--; } } if (!skb) return; (void) mld_sendpack(skb); } static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) { const struct in6_addr *snd_addr, *saddr; int err, len, payload_len, full_len; struct in6_addr addr_buf; struct inet6_dev *idev; struct sk_buff *skb; struct mld_msg *hdr; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, 2, 0, 0, IPV6_TLV_PADN, 0 }; struct dst_entry *dst; struct flowi6 fl6; struct net *net; struct sock *sk; if (type == ICMPV6_MGM_REDUCTION) snd_addr = &in6addr_linklocal_allrouters; else snd_addr = addr; len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr); payload_len = len + sizeof(ra); full_len = sizeof(struct ipv6hdr) + payload_len; skb = alloc_skb(hlen + tlen + full_len, GFP_KERNEL); rcu_read_lock(); net = dev_net_rcu(dev); idev = __in6_dev_get(dev); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); if (!skb) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); rcu_read_unlock(); return; } sk = net->ipv6.igmp_sk; skb_set_owner_w(skb, sk); skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, hlen); if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { /* <draft-ietf-magma-mld-source-05.txt>: * use unspecified address as the source address * when a valid link-local 
address is not available. */ saddr = &in6addr_any; } else saddr = &addr_buf; ip6_mc_hdr(sk, skb, dev, saddr, snd_addr, NEXTHDR_HOP, payload_len); skb_put_data(skb, ra, sizeof(ra)); hdr = skb_put_zero(skb, sizeof(struct mld_msg)); hdr->mld_type = type; hdr->mld_mca = *addr; hdr->mld_cksum = csum_ipv6_magic(saddr, snd_addr, len, IPPROTO_ICMPV6, csum_partial(hdr, len, 0)); icmpv6_flow_init(sk, &fl6, type, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->dev->ifindex); dst = icmp6_dst_alloc(skb->dev, &fl6); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto err_out; } skb_dst_set(skb, dst); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb->dev, dst_output); out: if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, type); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); } else IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); rcu_read_unlock(); return; err_out: kfree_skb(skb); goto out; } static void mld_send_initial_cr(struct inet6_dev *idev) { struct ifmcaddr6 *pmc; struct sk_buff *skb; int type; mc_assert_locked(idev); if (mld_in_v1_mode(idev)) return; skb = NULL; for_each_mc_mclock(idev, pmc) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) type = MLD2_CHANGE_TO_EXCLUDE; else type = MLD2_ALLOW_NEW_SOURCES; skb = add_grec(skb, pmc, type, 0, 0, 1); } if (skb) mld_sendpack(skb); } void ipv6_mc_dad_complete(struct inet6_dev *idev) { mutex_lock(&idev->mc_lock); idev->mc_dad_count = idev->mc_qrv; if (idev->mc_dad_count) { mld_send_initial_cr(idev); idev->mc_dad_count--; if (idev->mc_dad_count) mld_dad_start_work(idev, unsolicited_report_interval(idev)); } mutex_unlock(&idev->mc_lock); } static void mld_dad_work(struct work_struct *work) { struct inet6_dev *idev = container_of(to_delayed_work(work), struct inet6_dev, mc_dad_work); mutex_lock(&idev->mc_lock); mld_send_initial_cr(idev); if (idev->mc_dad_count) { idev->mc_dad_count--; if (idev->mc_dad_count) mld_dad_start_work(idev, unsolicited_report_interval(idev)); } mutex_unlock(&idev->mc_lock); in6_dev_put(idev); } static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, const struct in6_addr *psfsrc) { struct ip6_sf_list *psf, *psf_prev; int rv = 0; mc_assert_locked(pmc->idev); psf_prev = NULL; for_each_psf_mclock(pmc, psf) { if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) break; psf_prev = psf; } if (!psf || psf->sf_count[sfmode] == 0) { /* source filter not found, or count wrong => bug */ return -ESRCH; } WRITE_ONCE(psf->sf_count[sfmode], psf->sf_count[sfmode] - 1); if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) { struct inet6_dev *idev = pmc->idev; /* no more filters for this source */ if (psf_prev) rcu_assign_pointer(psf_prev->sf_next, mc_dereference(psf->sf_next, idev)); else rcu_assign_pointer(pmc->mca_sources, mc_dereference(psf->sf_next, idev)); if (psf->sf_oldin && !(pmc->mca_flags & MAF_NOREPORT) && !mld_in_v1_mode(idev)) { psf->sf_crcount = idev->mc_qrv; rcu_assign_pointer(psf->sf_next, mc_dereference(pmc->mca_tomb, idev)); rcu_assign_pointer(pmc->mca_tomb, psf); rv = 1; } else { kfree_rcu(psf, rcu); } } return rv; } static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta) { struct ifmcaddr6 *pmc; int changerec = 0; int i, err; if (!idev) return -ENODEV; mc_assert_locked(idev); for_each_mc_mclock(idev, pmc) { if (ipv6_addr_equal(pmca, &pmc->mca_addr)) break; } if (!pmc) return -ESRCH; sf_markstate(pmc); if (!delta) { if (!pmc->mca_sfcount[sfmode]) return -EINVAL; pmc->mca_sfcount[sfmode]--; } err = 0; for (i = 0; i < sfcount; 
i++) { int rv = ip6_mc_del1_src(pmc, sfmode, &psfsrc[i]); changerec |= rv > 0; if (!err && rv < 0) err = rv; } if (pmc->mca_sfmode == MCAST_EXCLUDE && pmc->mca_sfcount[MCAST_EXCLUDE] == 0 && pmc->mca_sfcount[MCAST_INCLUDE]) { struct ip6_sf_list *psf; /* filter mode change */ pmc->mca_sfmode = MCAST_INCLUDE; pmc->mca_crcount = idev->mc_qrv; idev->mc_ifc_count = pmc->mca_crcount; for_each_psf_mclock(pmc, psf) psf->sf_crcount = 0; mld_ifc_event(pmc->idev); } else if (sf_setstate(pmc) || changerec) { mld_ifc_event(pmc->idev); } return err; } /* Add multicast single-source filter to the interface list */ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, const struct in6_addr *psfsrc) { struct ip6_sf_list *psf, *psf_prev; mc_assert_locked(pmc->idev); psf_prev = NULL; for_each_psf_mclock(pmc, psf) { if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) break; psf_prev = psf; } if (!psf) { psf = kzalloc_obj(*psf); if (!psf) return -ENOBUFS; psf->sf_addr = *psfsrc; if (psf_prev) { rcu_assign_pointer(psf_prev->sf_next, psf); } else { rcu_assign_pointer(pmc->mca_sources, psf); } } WRITE_ONCE(psf->sf_count[sfmode], psf->sf_count[sfmode] + 1); return 0; } static void sf_markstate(struct ifmcaddr6 *pmc) { int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE]; struct ip6_sf_list *psf; mc_assert_locked(pmc->idev); for_each_psf_mclock(pmc, psf) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) { psf->sf_oldin = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; } else { psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0; } } } static int sf_setstate(struct ifmcaddr6 *pmc) { int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE]; struct ip6_sf_list *psf, *dpsf; int qrv = pmc->idev->mc_qrv; int new_in, rv; mc_assert_locked(pmc->idev); rv = 0; for_each_psf_mclock(pmc, psf) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) { new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; } else new_in = psf->sf_count[MCAST_INCLUDE] != 0; if (new_in) { if (!psf->sf_oldin) { struct ip6_sf_list *prev = NULL; for_each_psf_tomb(pmc, dpsf) { if (ipv6_addr_equal(&dpsf->sf_addr, &psf->sf_addr)) break; prev = dpsf; } if (dpsf) { if (prev) rcu_assign_pointer(prev->sf_next, mc_dereference(dpsf->sf_next, pmc->idev)); else rcu_assign_pointer(pmc->mca_tomb, mc_dereference(dpsf->sf_next, pmc->idev)); kfree_rcu(dpsf, rcu); } psf->sf_crcount = qrv; rv++; } } else if (psf->sf_oldin) { psf->sf_crcount = 0; /* * add or update "delete" records if an active filter * is now inactive */ for_each_psf_tomb(pmc, dpsf) if (ipv6_addr_equal(&dpsf->sf_addr, &psf->sf_addr)) break; if (!dpsf) { dpsf = kmalloc_obj(*dpsf); if (!dpsf) continue; *dpsf = *psf; rcu_assign_pointer(dpsf->sf_next, mc_dereference(pmc->mca_tomb, pmc->idev)); rcu_assign_pointer(pmc->mca_tomb, dpsf); } dpsf->sf_crcount = qrv; rv++; } } return rv; } /* Add multicast source filter list to the interface list */ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta) { struct ifmcaddr6 *pmc; int isexclude; int i, err; if (!idev) return -ENODEV; mc_assert_locked(idev); for_each_mc_mclock(idev, pmc) { if (ipv6_addr_equal(pmca, &pmc->mca_addr)) break; } if (!pmc) return -ESRCH; sf_markstate(pmc); isexclude = pmc->mca_sfmode == MCAST_EXCLUDE; if (!delta) WRITE_ONCE(pmc->mca_sfcount[sfmode], pmc->mca_sfcount[sfmode] + 1); err = 0; for (i = 0; i < sfcount; i++) { err = ip6_mc_add1_src(pmc, sfmode, &psfsrc[i]); if (err) break; } if (err) { int j; if (!delta) 
WRITE_ONCE(pmc->mca_sfcount[sfmode], pmc->mca_sfcount[sfmode] - 1); for (j = 0; j < i; j++) ip6_mc_del1_src(pmc, sfmode, &psfsrc[j]); } else if (isexclude != (pmc->mca_sfcount[MCAST_EXCLUDE] != 0)) { struct ip6_sf_list *psf; /* filter mode change */ if (pmc->mca_sfcount[MCAST_EXCLUDE]) pmc->mca_sfmode = MCAST_EXCLUDE; else if (pmc->mca_sfcount[MCAST_INCLUDE]) pmc->mca_sfmode = MCAST_INCLUDE; /* else no filters; keep old mode for reports */ pmc->mca_crcount = idev->mc_qrv; idev->mc_ifc_count = pmc->mca_crcount; for_each_psf_mclock(pmc, psf) psf->sf_crcount = 0; mld_ifc_event(idev); } else if (sf_setstate(pmc)) { mld_ifc_event(idev); } return err; } static void ip6_mc_clear_src(struct ifmcaddr6 *pmc) { struct ip6_sf_list *psf, *nextpsf; mc_assert_locked(pmc->idev); for (psf = mc_dereference(pmc->mca_tomb, pmc->idev); psf; psf = nextpsf) { nextpsf = mc_dereference(psf->sf_next, pmc->idev); kfree_rcu(psf, rcu); } RCU_INIT_POINTER(pmc->mca_tomb, NULL); for (psf = mc_dereference(pmc->mca_sources, pmc->idev); psf; psf = nextpsf) { nextpsf = mc_dereference(psf->sf_next, pmc->idev); kfree_rcu(psf, rcu); } RCU_INIT_POINTER(pmc->mca_sources, NULL); pmc->mca_sfmode = MCAST_EXCLUDE; pmc->mca_sfcount[MCAST_INCLUDE] = 0; /* Paired with the READ_ONCE() from ipv6_chk_mcast_addr() */ WRITE_ONCE(pmc->mca_sfcount[MCAST_EXCLUDE], 1); } static void igmp6_join_group(struct ifmcaddr6 *ma) { unsigned long delay; mc_assert_locked(ma->idev); if (ma->mca_flags & MAF_NOREPORT) return; igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); delay = get_random_u32_below(unsolicited_report_interval(ma->idev)); if (cancel_delayed_work(&ma->mca_work)) { refcount_dec(&ma->mca_refcnt); delay = ma->mca_work.timer.expires - jiffies; } if (!mod_delayed_work(mld_wq, &ma->mca_work, delay)) refcount_inc(&ma->mca_refcnt); ma->mca_flags |= MAF_TIMER_RUNNING | MAF_LAST_REPORTER; } static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, struct inet6_dev *idev) { struct ip6_sf_socklist *psl; int err; psl = sock_dereference(iml->sflist, sk); if (idev) mutex_lock(&idev->mc_lock); if (!psl) { /* any-source empty exclude case */ err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0); } else { err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, psl->sl_count, psl->sl_addr, 0); RCU_INIT_POINTER(iml->sflist, NULL); atomic_sub(struct_size(psl, sl_addr, psl->sl_max), &sk->sk_omem_alloc); kfree_rcu(psl, rcu); } if (idev) mutex_unlock(&idev->mc_lock); return err; } static void igmp6_leave_group(struct ifmcaddr6 *ma) { mc_assert_locked(ma->idev); if (mld_in_v1_mode(ma->idev)) { if (ma->mca_flags & MAF_LAST_REPORTER) { igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REDUCTION); } } else { mld_add_delrec(ma->idev, ma); mld_ifc_event(ma->idev); } } static void mld_gq_work(struct work_struct *work) { struct inet6_dev *idev = container_of(to_delayed_work(work), struct inet6_dev, mc_gq_work); mutex_lock(&idev->mc_lock); mld_send_report(idev, NULL); idev->mc_gq_running = 0; mutex_unlock(&idev->mc_lock); in6_dev_put(idev); } static void mld_ifc_work(struct work_struct *work) { struct inet6_dev *idev = container_of(to_delayed_work(work), struct inet6_dev, mc_ifc_work); mutex_lock(&idev->mc_lock); mld_send_cr(idev); if (idev->mc_ifc_count) { idev->mc_ifc_count--; if (idev->mc_ifc_count) mld_ifc_start_work(idev, unsolicited_report_interval(idev)); } mutex_unlock(&idev->mc_lock); in6_dev_put(idev); } static void mld_ifc_event(struct inet6_dev *idev) { mc_assert_locked(idev); if (mld_in_v1_mode(idev)) return; 
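/* MLDv2 only: schedule the interface-change work so that the pending
 * filter-mode and source-list change records are retransmitted
 * mc_qrv times.
 */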
idev->mc_ifc_count = idev->mc_qrv; mld_ifc_start_work(idev, 1); } static void mld_mca_work(struct work_struct *work) { struct ifmcaddr6 *ma = container_of(to_delayed_work(work), struct ifmcaddr6, mca_work); mutex_lock(&ma->idev->mc_lock); if (mld_in_v1_mode(ma->idev)) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); else mld_send_report(ma->idev, ma); ma->mca_flags |= MAF_LAST_REPORTER; ma->mca_flags &= ~MAF_TIMER_RUNNING; mutex_unlock(&ma->idev->mc_lock); ma_put(ma); } /* Device changing type */ void ipv6_mc_unmap(struct inet6_dev *idev) { struct ifmcaddr6 *i; /* Install multicast list, except for all-nodes (already installed) */ mutex_lock(&idev->mc_lock); for_each_mc_mclock(idev, i) igmp6_group_dropped(i); mutex_unlock(&idev->mc_lock); } void ipv6_mc_remap(struct inet6_dev *idev) { ipv6_mc_up(idev); } /* Device going down */ void ipv6_mc_down(struct inet6_dev *idev) { struct ifmcaddr6 *i; mutex_lock(&idev->mc_lock); /* Withdraw multicast list */ for_each_mc_mclock(idev, i) igmp6_group_dropped(i); mutex_unlock(&idev->mc_lock); /* Should stop work after group drop. or we will * start work again in mld_ifc_event() */ mld_query_stop_work(idev); mld_report_stop_work(idev); mutex_lock(&idev->mc_lock); mld_ifc_stop_work(idev); mld_gq_stop_work(idev); mutex_unlock(&idev->mc_lock); mld_dad_stop_work(idev); } static void ipv6_mc_reset(struct inet6_dev *idev) { idev->mc_qrv = sysctl_mld_qrv; idev->mc_qi = MLD_QI_DEFAULT; idev->mc_qri = MLD_QRI_DEFAULT; idev->mc_v1_seen = 0; idev->mc_maxdelay = unsolicited_report_interval(idev); } /* Device going up */ void ipv6_mc_up(struct inet6_dev *idev) { struct ifmcaddr6 *i; /* Install multicast list, except for all-nodes (already installed) */ ipv6_mc_reset(idev); mutex_lock(&idev->mc_lock); for_each_mc_mclock(idev, i) { mld_del_delrec(idev, i); igmp6_group_added(i); } mutex_unlock(&idev->mc_lock); } /* IPv6 device initialization. */ void ipv6_mc_init_dev(struct inet6_dev *idev) { idev->mc_gq_running = 0; INIT_DELAYED_WORK(&idev->mc_gq_work, mld_gq_work); RCU_INIT_POINTER(idev->mc_tomb, NULL); idev->mc_ifc_count = 0; INIT_DELAYED_WORK(&idev->mc_ifc_work, mld_ifc_work); INIT_DELAYED_WORK(&idev->mc_dad_work, mld_dad_work); INIT_DELAYED_WORK(&idev->mc_query_work, mld_query_work); INIT_DELAYED_WORK(&idev->mc_report_work, mld_report_work); skb_queue_head_init(&idev->mc_query_queue); skb_queue_head_init(&idev->mc_report_queue); spin_lock_init(&idev->mc_query_lock); spin_lock_init(&idev->mc_report_lock); mutex_init(&idev->mc_lock); ipv6_mc_reset(idev); } /* * Device is about to be destroyed: clean up. */ void ipv6_mc_destroy_dev(struct inet6_dev *idev) { struct ifmcaddr6 *i; /* Deactivate works */ ipv6_mc_down(idev); mutex_lock(&idev->mc_lock); mld_clear_delrec(idev); mutex_unlock(&idev->mc_lock); mld_clear_query(idev); mld_clear_report(idev); /* Delete all-nodes address. */ /* We cannot call ipv6_dev_mc_dec() directly, our caller in * addrconf.c has NULL'd out dev->ip6_ptr so in6_dev_get() will * fail. 
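 * Use __ipv6_dev_mc_dec() with the idev reference we already hold
 * instead.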
*/ __ipv6_dev_mc_dec(idev, &in6addr_linklocal_allnodes); if (idev->cnf.forwarding) __ipv6_dev_mc_dec(idev, &in6addr_linklocal_allrouters); mutex_lock(&idev->mc_lock); while ((i = mc_dereference(idev->mc_list, idev))) { rcu_assign_pointer(idev->mc_list, mc_dereference(i->next, idev)); ip6_mc_clear_src(i); ma_put(i); } mutex_unlock(&idev->mc_lock); } static void ipv6_mc_rejoin_groups(struct inet6_dev *idev) { struct ifmcaddr6 *pmc; mutex_lock(&idev->mc_lock); if (mld_in_v1_mode(idev)) { for_each_mc_mclock(idev, pmc) igmp6_join_group(pmc); } else { mld_send_report(idev, NULL); } mutex_unlock(&idev->mc_lock); } static int ipv6_mc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct inet6_dev *idev = __in6_dev_get(dev); switch (event) { case NETDEV_RESEND_IGMP: if (idev) ipv6_mc_rejoin_groups(idev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block igmp6_netdev_notifier = { .notifier_call = ipv6_mc_netdev_event, }; #ifdef CONFIG_PROC_FS struct igmp6_mc_iter_state { struct seq_net_private p; struct net_device *dev; struct inet6_dev *idev; }; #define igmp6_mc_seq_private(seq) ((struct igmp6_mc_iter_state *)(seq)->private) static inline struct ifmcaddr6 *igmp6_mc_get_first(struct seq_file *seq) { struct ifmcaddr6 *im = NULL; struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); struct net *net = seq_file_net(seq); state->idev = NULL; for_each_netdev_rcu(net, state->dev) { struct inet6_dev *idev; idev = __in6_dev_get(state->dev); if (!idev) continue; im = rcu_dereference(idev->mc_list); if (im) { state->idev = idev; break; } } return im; } static struct ifmcaddr6 *igmp6_mc_get_next(struct seq_file *seq, struct ifmcaddr6 *im) { struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); im = rcu_dereference(im->next); while (!im) { state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->idev = NULL; break; } state->idev = __in6_dev_get(state->dev); if (!state->idev) continue; im = rcu_dereference(state->idev->mc_list); } return im; } static struct ifmcaddr6 *igmp6_mc_get_idx(struct seq_file *seq, loff_t pos) { struct ifmcaddr6 *im = igmp6_mc_get_first(seq); if (im) while (pos && (im = igmp6_mc_get_next(seq, im)) != NULL) --pos; return pos ? NULL : im; } static void *igmp6_mc_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return igmp6_mc_get_idx(seq, *pos); } static void *igmp6_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ifmcaddr6 *im = igmp6_mc_get_next(seq, v); ++*pos; return im; } static void igmp6_mc_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); if (likely(state->idev)) state->idev = NULL; state->dev = NULL; rcu_read_unlock(); } static int igmp6_mc_seq_show(struct seq_file *seq, void *v) { struct ifmcaddr6 *im = (struct ifmcaddr6 *)v; struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); seq_printf(seq, "%-4d %-15s %pi6 %5d %08X %ld\n", state->dev->ifindex, state->dev->name, &im->mca_addr, im->mca_users, im->mca_flags, (im->mca_flags & MAF_TIMER_RUNNING) ? 
jiffies_to_clock_t(im->mca_work.timer.expires - jiffies) : 0); return 0; } static const struct seq_operations igmp6_mc_seq_ops = { .start = igmp6_mc_seq_start, .next = igmp6_mc_seq_next, .stop = igmp6_mc_seq_stop, .show = igmp6_mc_seq_show, }; struct igmp6_mcf_iter_state { struct seq_net_private p; struct net_device *dev; struct inet6_dev *idev; struct ifmcaddr6 *im; }; #define igmp6_mcf_seq_private(seq) ((struct igmp6_mcf_iter_state *)(seq)->private) static inline struct ip6_sf_list *igmp6_mcf_get_first(struct seq_file *seq) { struct ip6_sf_list *psf = NULL; struct ifmcaddr6 *im = NULL; struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); struct net *net = seq_file_net(seq); state->idev = NULL; state->im = NULL; for_each_netdev_rcu(net, state->dev) { struct inet6_dev *idev; idev = __in6_dev_get(state->dev); if (unlikely(idev == NULL)) continue; im = rcu_dereference(idev->mc_list); if (likely(im)) { psf = rcu_dereference(im->mca_sources); if (likely(psf)) { state->im = im; state->idev = idev; break; } } } return psf; } static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_sf_list *psf) { struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); psf = rcu_dereference(psf->sf_next); while (!psf) { state->im = rcu_dereference(state->im->next); while (!state->im) { state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->idev = NULL; goto out; } state->idev = __in6_dev_get(state->dev); if (!state->idev) continue; state->im = rcu_dereference(state->idev->mc_list); } psf = rcu_dereference(state->im->mca_sources); } out: return psf; } static struct ip6_sf_list *igmp6_mcf_get_idx(struct seq_file *seq, loff_t pos) { struct ip6_sf_list *psf = igmp6_mcf_get_first(seq); if (psf) while (pos && (psf = igmp6_mcf_get_next(seq, psf)) != NULL) --pos; return pos ? NULL : psf; } static void *igmp6_mcf_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return *pos ? 
igmp6_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *igmp6_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip6_sf_list *psf; if (v == SEQ_START_TOKEN) psf = igmp6_mcf_get_first(seq); else psf = igmp6_mcf_get_next(seq, v); ++*pos; return psf; } static void igmp6_mcf_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); if (likely(state->im)) state->im = NULL; if (likely(state->idev)) state->idev = NULL; state->dev = NULL; rcu_read_unlock(); } static int igmp6_mcf_seq_show(struct seq_file *seq, void *v) { struct ip6_sf_list *psf = (struct ip6_sf_list *)v; struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); if (v == SEQ_START_TOKEN) { seq_puts(seq, "Idx Device Multicast Address Source Address INC EXC\n"); } else { seq_printf(seq, "%3d %6.6s %pi6 %pi6 %6lu %6lu\n", state->dev->ifindex, state->dev->name, &state->im->mca_addr, &psf->sf_addr, READ_ONCE(psf->sf_count[MCAST_INCLUDE]), READ_ONCE(psf->sf_count[MCAST_EXCLUDE])); } return 0; } static const struct seq_operations igmp6_mcf_seq_ops = { .start = igmp6_mcf_seq_start, .next = igmp6_mcf_seq_next, .stop = igmp6_mcf_seq_stop, .show = igmp6_mcf_seq_show, }; static int __net_init igmp6_proc_init(struct net *net) { int err; err = -ENOMEM; if (!proc_create_net("igmp6", 0444, net->proc_net, &igmp6_mc_seq_ops, sizeof(struct igmp6_mc_iter_state))) goto out; if (!proc_create_net("mcfilter6", 0444, net->proc_net, &igmp6_mcf_seq_ops, sizeof(struct igmp6_mcf_iter_state))) goto out_proc_net_igmp6; err = 0; out: return err; out_proc_net_igmp6: remove_proc_entry("igmp6", net->proc_net); goto out; } static void __net_exit igmp6_proc_exit(struct net *net) { remove_proc_entry("mcfilter6", net->proc_net); remove_proc_entry("igmp6", net->proc_net); } #else static inline int igmp6_proc_init(struct net *net) { return 0; } static inline void igmp6_proc_exit(struct net *net) { } #endif static int __net_init igmp6_net_init(struct net *net) { int err; err = inet_ctl_sock_create(&net->ipv6.igmp_sk, PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, net); if (err < 0) { pr_err("Failed to initialize the IGMP6 control socket (err %d)\n", err); goto out; } inet6_sk(net->ipv6.igmp_sk)->hop_limit = 1; net->ipv6.igmp_sk->sk_allocation = GFP_KERNEL; err = inet_ctl_sock_create(&net->ipv6.mc_autojoin_sk, PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, net); if (err < 0) { pr_err("Failed to initialize the IGMP6 autojoin socket (err %d)\n", err); goto out_sock_create; } err = igmp6_proc_init(net); if (err) goto out_sock_create_autojoin; return 0; out_sock_create_autojoin: inet_ctl_sock_destroy(net->ipv6.mc_autojoin_sk); out_sock_create: inet_ctl_sock_destroy(net->ipv6.igmp_sk); out: return err; } static void __net_exit igmp6_net_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv6.igmp_sk); inet_ctl_sock_destroy(net->ipv6.mc_autojoin_sk); igmp6_proc_exit(net); } static struct pernet_operations igmp6_net_ops = { .init = igmp6_net_init, .exit = igmp6_net_exit, }; int __init igmp6_init(void) { int err; err = register_pernet_subsys(&igmp6_net_ops); if (err) return err; mld_wq = create_workqueue("mld"); if (!mld_wq) { unregister_pernet_subsys(&igmp6_net_ops); return -ENOMEM; } return err; } int __init igmp6_late_init(void) { return register_netdevice_notifier(&igmp6_netdev_notifier); } void igmp6_cleanup(void) { unregister_pernet_subsys(&igmp6_net_ops); destroy_workqueue(mld_wq); } void igmp6_late_cleanup(void) { unregister_netdevice_notifier(&igmp6_netdev_notifier); } |
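/*
 * Illustrative addition (not part of the mcast.c code above): the
 * igmp6_mc_seq_show() format string "%-4d %-15s %pi6 %5d %08X %ld" means each
 * /proc/net/igmp6 line carries the interface index, device name, the group
 * address as 32 uncompressed hex digits, the user count, the MAF_* flags in
 * hex, and the remaining timer value in clock ticks (0 when no timer runs).
 * A minimal user-space reader of that format could look like this sketch.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/net/igmp6", "r");
	char dev[16], addr[33];
	int ifindex, users;
	unsigned int flags;
	long timer;

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fscanf(f, "%d %15s %32s %d %x %ld",
		      &ifindex, dev, addr, &users, &flags, &timer) == 6)
		printf("%s: group %s, %d user(s), flags 0x%x, timer %ld\n",
		       dev, addr, users, flags, timer);
	fclose(f);
	return 0;
}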
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 * Minchan Kim <minchan@kernel.org> */ #include <linux/types.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/bio.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/cpumask.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "decompressor.h" #include "squashfs.h" /* * This file implements multi-threaded decompression in the * decompressor framework. */ /* * The reason for multiplying by two is that a CPU can request new I/O * while it is waiting for a previous request. */ #define MAX_DECOMPRESSOR (num_online_cpus() * 2) static int squashfs_max_decompressors(void) { return MAX_DECOMPRESSOR; } struct squashfs_stream { void *comp_opts; struct list_head strm_list; struct mutex mutex; int avail_decomp; wait_queue_head_t wait; }; struct decomp_stream { void *stream; struct list_head list; }; static void put_decomp_stream(struct decomp_stream *decomp_strm, struct squashfs_stream *stream) { mutex_lock(&stream->mutex); list_add(&decomp_strm->list, &stream->strm_list); mutex_unlock(&stream->mutex); wake_up(&stream->wait); } static void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, void *comp_opts) { struct squashfs_stream *stream; struct decomp_stream *decomp_strm = NULL; int err = -ENOMEM; stream = kzalloc_obj(*stream); if (!stream) goto out; stream->comp_opts = comp_opts; mutex_init(&stream->mutex); INIT_LIST_HEAD(&stream->strm_list); init_waitqueue_head(&stream->wait); /* * We should have at least one decompressor as the default, * so that if we fail to allocate a new decompressor dynamically, * we can always fall back to the default one and the * file system still works.
*/ decomp_strm = kmalloc_obj(*decomp_strm); if (!decomp_strm) goto out; decomp_strm->stream = msblk->decompressor->init(msblk, stream->comp_opts); if (IS_ERR(decomp_strm->stream)) { err = PTR_ERR(decomp_strm->stream); goto out; } list_add(&decomp_strm->list, &stream->strm_list); stream->avail_decomp = 1; return stream; out: kfree(decomp_strm); kfree(stream); return ERR_PTR(err); } static void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) { struct squashfs_stream *stream = msblk->stream; if (stream) { struct decomp_stream *decomp_strm; while (!list_empty(&stream->strm_list)) { decomp_strm = list_entry(stream->strm_list.prev, struct decomp_stream, list); list_del(&decomp_strm->list); msblk->decompressor->free(decomp_strm->stream); kfree(decomp_strm); stream->avail_decomp--; } WARN_ON(stream->avail_decomp); kfree(stream->comp_opts); kfree(stream); } } static struct decomp_stream *get_decomp_stream(struct squashfs_sb_info *msblk, struct squashfs_stream *stream) { struct decomp_stream *decomp_strm; while (1) { mutex_lock(&stream->mutex); /* There is available decomp_stream */ if (!list_empty(&stream->strm_list)) { decomp_strm = list_entry(stream->strm_list.prev, struct decomp_stream, list); list_del(&decomp_strm->list); mutex_unlock(&stream->mutex); break; } /* * If there is no available decomp and already full, * let's wait for releasing decomp from other users. */ if (stream->avail_decomp >= msblk->max_thread_num) goto wait; /* Let's allocate new decomp */ decomp_strm = kmalloc_obj(*decomp_strm); if (!decomp_strm) goto wait; decomp_strm->stream = msblk->decompressor->init(msblk, stream->comp_opts); if (IS_ERR(decomp_strm->stream)) { kfree(decomp_strm); goto wait; } stream->avail_decomp++; WARN_ON(stream->avail_decomp > msblk->max_thread_num); mutex_unlock(&stream->mutex); break; wait: /* * If system memory is tough, let's for other's * releasing instead of hurting VM because it could * make page cache thrashing. */ mutex_unlock(&stream->mutex); wait_event(stream->wait, !list_empty(&stream->strm_list)); } return decomp_strm; } static int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { int res; struct squashfs_stream *stream = msblk->stream; struct decomp_stream *decomp_stream = get_decomp_stream(msblk, stream); res = msblk->decompressor->decompress(msblk, decomp_stream->stream, bio, offset, length, output); put_decomp_stream(decomp_stream, stream); if (res < 0) ERROR("%s decompression failed, data probably corrupt\n", msblk->decompressor->name); return res; } const struct squashfs_decompressor_thread_ops squashfs_decompressor_multi = { .create = squashfs_decompressor_create, .destroy = squashfs_decompressor_destroy, .decompress = squashfs_decompress, .max_decompressors = squashfs_max_decompressors, }; |
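/*
 * Illustrative sketch (not squashfs code): get_decomp_stream() and
 * put_decomp_stream() above implement a bounded, lazily grown object pool:
 * reuse an idle decompressor if one exists, allocate a new one while under
 * msblk->max_thread_num, otherwise sleep until another thread returns one.
 * The same pattern in plain pthreads, with hypothetical names, looks like:
 */
#include <pthread.h>
#include <stdlib.h>

struct pool {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	void **idle;		/* array with room for max_total idle slots */
	int nr_idle, nr_total, max_total;
};

static void *pool_get(struct pool *p)
{
	void *item = NULL;

	pthread_mutex_lock(&p->lock);
	for (;;) {
		if (p->nr_idle) {			/* reuse an idle instance */
			item = p->idle[--p->nr_idle];
			break;
		}
		if (p->nr_total < p->max_total) {	/* grow lazily up to the cap */
			item = malloc(64);		/* stand-in for real decompressor state */
			if (item) {
				p->nr_total++;
				break;
			}
		}
		/* pool full or allocation failed: wait for pool_put() */
		pthread_cond_wait(&p->cond, &p->lock);
	}
	pthread_mutex_unlock(&p->lock);
	return item;
}

static void pool_put(struct pool *p, void *item)
{
	pthread_mutex_lock(&p->lock);
	p->idle[p->nr_idle++] = item;
	pthread_mutex_unlock(&p->lock);
	pthread_cond_signal(&p->cond);
}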
// SPDX-License-Identifier: GPL-2.0-only /* * 9P entry point * * Copyright (C) 2007 by Latchesar Ionkov <lucho@ionkov.net> * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/kmod.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/moduleparam.h> #include <net/9p/9p.h> #include <linux/fs.h> #include <net/9p/client.h> #include <net/9p/transport.h> #include <linux/list.h> #include <linux/spinlock.h> #ifdef CONFIG_NET_9P_DEBUG unsigned int p9_debug_level; /* feature-rific global debug level */ EXPORT_SYMBOL(p9_debug_level); module_param_named(debug, p9_debug_level, uint, 0); MODULE_PARM_DESC(debug, "9P debugging level"); void _p9_debug(enum p9_debug_flags level, const char *func, const char *fmt, ...) { struct va_format vaf; va_list args; if ((p9_debug_level & level) != level) return; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (level == P9_DEBUG_9P) pr_notice("(%8.8d) %pV", task_pid_nr(current), &vaf); else pr_notice("-- %s (%d): %pV", func, task_pid_nr(current), &vaf); va_end(args); } EXPORT_SYMBOL(_p9_debug); #endif /* Dynamic Transport Registration Routines */ static DEFINE_SPINLOCK(v9fs_trans_lock); static LIST_HEAD(v9fs_trans_list); /** * v9fs_register_trans - register a new transport with 9p * @m: structure describing the transport module and entry points * */ void v9fs_register_trans(struct p9_trans_module *m) { spin_lock(&v9fs_trans_lock); list_add_tail(&m->list, &v9fs_trans_list); spin_unlock(&v9fs_trans_lock); } EXPORT_SYMBOL(v9fs_register_trans); /** * v9fs_unregister_trans - unregister a 9p transport * @m: the transport to remove * */ void v9fs_unregister_trans(struct p9_trans_module *m) { spin_lock(&v9fs_trans_lock); list_del_init(&m->list); spin_unlock(&v9fs_trans_lock); } EXPORT_SYMBOL(v9fs_unregister_trans); static struct p9_trans_module *_p9_get_trans_by_name(const char *s) { struct p9_trans_module *t, *found = NULL; spin_lock(&v9fs_trans_lock); list_for_each_entry(t, &v9fs_trans_list, list) if (strcmp(t->name, s) == 0 && try_module_get(t->owner)) { found = t; break; } spin_unlock(&v9fs_trans_lock); return found; } /** * v9fs_get_trans_by_name - get transport with the matching name * @s: string identifying transport * */ struct p9_trans_module *v9fs_get_trans_by_name(const char *s) { struct p9_trans_module *found = NULL; found = _p9_get_trans_by_name(s); #ifdef CONFIG_MODULES if (!found) { request_module("9p-%s", s); found = _p9_get_trans_by_name(s); } #endif return found; } EXPORT_SYMBOL(v9fs_get_trans_by_name); static const char * const v9fs_default_transports[] = { "virtio", "tcp", "fd", "unix", "xen", "rdma", }; /** * v9fs_get_default_trans
- get the default transport * */ struct p9_trans_module *v9fs_get_default_trans(void) { struct p9_trans_module *t, *found = NULL; int i; spin_lock(&v9fs_trans_lock); list_for_each_entry(t, &v9fs_trans_list, list) if (t->def && try_module_get(t->owner)) { found = t; break; } if (!found) list_for_each_entry(t, &v9fs_trans_list, list) if (try_module_get(t->owner)) { found = t; break; } spin_unlock(&v9fs_trans_lock); for (i = 0; !found && i < ARRAY_SIZE(v9fs_default_transports); i++) found = v9fs_get_trans_by_name(v9fs_default_transports[i]); return found; } EXPORT_SYMBOL(v9fs_get_default_trans); /** * v9fs_put_trans - put trans * @m: transport to put * */ void v9fs_put_trans(struct p9_trans_module *m) { if (m) module_put(m->owner); } EXPORT_SYMBOL(v9fs_put_trans); /** * init_p9 - Initialize module * */ static int __init init_p9(void) { int ret; ret = p9_client_init(); if (ret) return ret; p9_error_init(); pr_info("Installing 9P2000 support\n"); return ret; } /** * exit_p9 - shutdown module * */ static void __exit exit_p9(void) { pr_info("Unloading 9P2000 support\n"); p9_client_exit(); } module_init(init_p9) module_exit(exit_p9) MODULE_AUTHOR("Latchesar Ionkov <lucho@ionkov.net>"); MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>"); MODULE_AUTHOR("Ron Minnich <rminnich@lanl.gov>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Plan 9 Resource Sharing Support (9P2000)"); |
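/*
 * Illustrative sketch (hypothetical transport, not in the 9p sources above):
 * a transport backend describes itself in a struct p9_trans_module and hands
 * it to the registry at module load time, so v9fs_get_trans_by_name("foo")
 * or the default-transport fallback above can find it later.
 */
#include <linux/module.h>
#include <net/9p/transport.h>

static struct p9_trans_module p9_foo_trans = {
	.name	= "foo",	/* matched against the trans= mount option */
	.owner	= THIS_MODULE,
	.def	= 0,		/* never chosen as the default transport */
	/* .create, .close, .request and .cancel callbacks omitted in this sketch */
};

static int __init p9_trans_foo_init(void)
{
	v9fs_register_trans(&p9_foo_trans);
	return 0;
}

static void __exit p9_trans_foo_exit(void)
{
	v9fs_unregister_trans(&p9_foo_trans);
}

module_init(p9_trans_foo_init);
module_exit(p9_trans_foo_exit);
MODULE_LICENSE("GPL");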
// SPDX-License-Identifier: GPL-2.0-or-later /* * fs/inotify_user.c - inotify support for userspace * * Authors: * John McCutchan <ttb@tentacle.dhs.org> * Robert Love <rml@novell.com> * * Copyright (C) 2005 John McCutchan * Copyright 2006 Hewlett-Packard Development Company, L.P. * * Copyright (C) 2009 Eric Paris <Red Hat Inc> * inotify was largely rewritten to make use of the fsnotify infrastructure */ #include <linux/dcache.h> /* d_unlinked */ #include <linux/fs.h> /* struct inode */ #include <linux/fsnotify_backend.h> #include <linux/inotify.h> #include <linux/path.h> /* struct path */ #include <linux/slab.h> /* kmem_* */ #include <linux/types.h> #include <linux/sched.h> #include <linux/sched/user.h> #include <linux/sched/mm.h> #include "inotify.h" /* * Check if 2 events contain the same information. */ static bool event_compare(struct fsnotify_event *old_fsn, struct fsnotify_event *new_fsn) { struct inotify_event_info *old, *new; old = INOTIFY_E(old_fsn); new = INOTIFY_E(new_fsn); if (old->mask & FS_IN_IGNORED) return false; if ((old->mask == new->mask) && (old->wd == new->wd) && (old->name_len == new->name_len) && (!old->name_len || !strcmp(old->name, new->name))) return true; return false; } static int inotify_merge(struct fsnotify_group *group, struct fsnotify_event *event) { struct list_head *list = &group->notification_list; struct fsnotify_event *last_event; last_event = list_entry(list->prev, struct fsnotify_event, list); return event_compare(last_event, event); } int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask, struct inode *inode, struct inode *dir, const struct qstr *name, u32 cookie) { struct inotify_inode_mark *i_mark; struct inotify_event_info *event; struct fsnotify_event *fsn_event; struct fsnotify_group *group = inode_mark->group; int ret; int len = 0, wd; int alloc_len = sizeof(struct inotify_event_info); struct mem_cgroup *old_memcg; if (name) { len = name->len; alloc_len += len + 1; } pr_debug("%s: group=%p mark=%p mask=%x\n", __func__, group, inode_mark, mask); i_mark = container_of(inode_mark, struct inotify_inode_mark, fsn_mark); /* * We can be racing with mark being detached. Don't report event with * invalid wd. */ wd = READ_ONCE(i_mark->wd); if (wd == -1) return 0; /* * Whoever is interested in the event, pays for the allocation. Do not * trigger OOM killer in the target monitoring memcg as it may have * security repercussion. */ old_memcg = set_active_memcg(group->memcg); event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); set_active_memcg(old_memcg); if (unlikely(!event)) { /* * Treat lost event due to ENOMEM the same way as queue * overflow to let userspace know event was lost.
*/ fsnotify_queue_overflow(group); return -ENOMEM; } /* * We now report FS_ISDIR flag with MOVE_SELF and DELETE_SELF events * for fanotify. inotify never reported IN_ISDIR with those events. * It looks like an oversight, but to avoid the risk of breaking * existing inotify programs, mask the flag out from those events. */ if (mask & (IN_MOVE_SELF | IN_DELETE_SELF)) mask &= ~IN_ISDIR; fsn_event = &event->fse; fsnotify_init_event(fsn_event); event->mask = mask; event->wd = wd; event->sync_cookie = cookie; event->name_len = len; if (len) strscpy(event->name, name->name, event->name_len + 1); ret = fsnotify_add_event(group, fsn_event, inotify_merge); if (ret) { /* Our event wasn't used in the end. Free it. */ fsnotify_destroy_event(group, fsn_event); } if (inode_mark->flags & FSNOTIFY_MARK_FLAG_IN_ONESHOT) fsnotify_destroy_mark(inode_mark, group); return 0; } static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group) { inotify_ignored_and_remove_idr(fsn_mark, group); } /* * This is NEVER supposed to be called. Inotify marks should either have been * removed from the idr when the watch was removed or in the * fsnotify_destroy_mark_by_group() call when the inotify instance was being * torn down. This is only called if the idr is about to be freed but there * are still marks in it. */ static int idr_callback(int id, void *p, void *data) { struct fsnotify_mark *fsn_mark; struct inotify_inode_mark *i_mark; static bool warned = false; if (warned) return 0; warned = true; fsn_mark = p; i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); WARN(1, "inotify closing but id=%d for fsn_mark=%p in group=%p still in " "idr. Probably leaking memory\n", id, p, data); /* * I'm taking the liberty of assuming that the mark in question is a * valid address and I'm dereferencing it. This might help to figure * out why we got here and the panic is no worse than the original * BUG() that was here. */ if (fsn_mark) printk(KERN_WARNING "fsn_mark->group=%p wd=%d\n", fsn_mark->group, i_mark->wd); return 0; } static void inotify_free_group_priv(struct fsnotify_group *group) { /* ideally the idr is empty and we won't hit the BUG in the callback */ idr_for_each(&group->inotify_data.idr, idr_callback, group); idr_destroy(&group->inotify_data.idr); if (group->inotify_data.ucounts) dec_inotify_instances(group->inotify_data.ucounts); } static void inotify_free_event(struct fsnotify_group *group, struct fsnotify_event *fsn_event) { kfree(INOTIFY_E(fsn_event)); } /* ding dong the mark is dead */ static void inotify_free_mark(struct fsnotify_mark *fsn_mark) { struct inotify_inode_mark *i_mark; i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); kmem_cache_free(inotify_inode_mark_cachep, i_mark); } const struct fsnotify_ops inotify_fsnotify_ops = { .handle_inode_event = inotify_handle_inode_event, .free_group_priv = inotify_free_group_priv, .free_event = inotify_free_event, .freeing_mark = inotify_freeing_mark, .free_mark = inotify_free_mark, }; |
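/*
 * Illustrative user-space counterpart (not part of inotify_fsnotify.c):
 * because inotify_merge() above drops an event identical to the newest
 * queued one, a burst of identical IN_MODIFY events on the same watch may
 * reach the reader as a single struct inotify_event.
 */
#include <sys/inotify.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t len;
	int fd = inotify_init1(0);

	if (fd < 0 || inotify_add_watch(fd, "/tmp", IN_MODIFY | IN_CREATE) < 0) {
		perror("inotify");
		return 1;
	}
	len = read(fd, buf, sizeof(buf));	/* blocks until events arrive */
	if (len <= 0) {
		perror("read");
		return 1;
	}
	for (char *p = buf; p < buf + len; ) {
		struct inotify_event *ev = (struct inotify_event *)p;

		printf("wd=%d mask=0x%x name=%s\n",
		       ev->wd, ev->mask, ev->len ? ev->name : "");
		p += sizeof(*ev) + ev->len;
	}
	close(fd);
	return 0;
}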
// SPDX-License-Identifier: GPL-2.0 /* * Floating proportions with flexible aging period * * Copyright (C) 2011, SUSE, Jan Kara <jack@suse.cz> * * The goal of this code is: Given different types of event, measure proportion * of each type of event over time. The proportions are measured with * exponentially decaying history to give smooth transitions. A formula * expressing proportion of event of type 'j' is: * * p_{j} = (\Sum_{i>=0} x_{i,j}/2^{i+1})/(\Sum_{i>=0} x_i/2^{i+1}) * * Where x_{i,j} is j's number of events in i-th last time period and x_i is * total number of events in i-th last time period. * * Note that p_{j}'s are normalised, i.e. * * \Sum_{j} p_{j} = 1, * * This formula can be straightforwardly computed by maintaining denominator * (let's call it 'd') and for each event type its numerator (let's call it * 'n_j'). When an event of type 'j' happens, we simply need to do: * n_j++; d++; * * When a new period is declared, we could do: * d /= 2 * for each j * n_j /= 2 * * To avoid iteration over all event types, we instead shift numerator of event * j lazily when someone asks for a proportion of event j or when event j * occurs. This can be trivially implemented by remembering the last period in * which something happened with proportion of type j. */ #include <linux/flex_proportions.h> int fprop_global_init(struct fprop_global *p, gfp_t gfp) { int err; p->period = 0; /* Use 1 to avoid dealing with periods with 0 events... */ err = percpu_counter_init(&p->events, 1, gfp); if (err) return err; seqcount_init(&p->sequence); return 0; } void fprop_global_destroy(struct fprop_global *p) { percpu_counter_destroy(&p->events); } /* * Declare @periods new periods. It is up to the caller to make sure period * transitions cannot happen in parallel. * * The function returns true if the proportions are still defined and false * if aging zeroed out all events. This can be used to detect whether declaring * further periods has any effect. */ bool fprop_new_period(struct fprop_global *p, int periods) { s64 events = percpu_counter_sum(&p->events); unsigned long flags; /* * Don't do anything if there are no events.
*/ if (events <= 1) return false; local_irq_save(flags); write_seqcount_begin(&p->sequence); if (periods < 64) events -= events >> periods; /* Use addition to avoid losing events happening between sum and set */ percpu_counter_add(&p->events, -events); p->period += periods; write_seqcount_end(&p->sequence); local_irq_restore(flags); return true; } /* * ---- PERCPU ---- */ #define PROP_BATCH (8*(1+ilog2(nr_cpu_ids))) int fprop_local_init_percpu(struct fprop_local_percpu *pl, gfp_t gfp) { int err; err = percpu_counter_init(&pl->events, 0, gfp); if (err) return err; pl->period = 0; raw_spin_lock_init(&pl->lock); return 0; } void fprop_local_destroy_percpu(struct fprop_local_percpu *pl) { percpu_counter_destroy(&pl->events); } static void fprop_reflect_period_percpu(struct fprop_global *p, struct fprop_local_percpu *pl) { unsigned int period = p->period; unsigned long flags; /* Fast path - period didn't change */ if (pl->period == period) return; raw_spin_lock_irqsave(&pl->lock, flags); /* Someone updated pl->period while we were spinning? */ if (pl->period >= period) { raw_spin_unlock_irqrestore(&pl->lock, flags); return; } /* Aging zeroed our fraction? */ if (period - pl->period < BITS_PER_LONG) { s64 val = percpu_counter_read(&pl->events); if (val < (nr_cpu_ids * PROP_BATCH)) val = percpu_counter_sum(&pl->events); percpu_counter_add_batch(&pl->events, -val + (val >> (period-pl->period)), PROP_BATCH); } else percpu_counter_set(&pl->events, 0); pl->period = period; raw_spin_unlock_irqrestore(&pl->lock, flags); } /* Event of type pl happened */ void __fprop_add_percpu(struct fprop_global *p, struct fprop_local_percpu *pl, long nr) { fprop_reflect_period_percpu(p, pl); percpu_counter_add_batch(&pl->events, nr, PROP_BATCH); percpu_counter_add(&p->events, nr); } void fprop_fraction_percpu(struct fprop_global *p, struct fprop_local_percpu *pl, unsigned long *numerator, unsigned long *denominator) { unsigned int seq; s64 num, den; do { seq = read_seqcount_begin(&p->sequence); fprop_reflect_period_percpu(p, pl); num = percpu_counter_read_positive(&pl->events); den = percpu_counter_read_positive(&p->events); } while (read_seqcount_retry(&p->sequence, seq)); /* * Make fraction <= 1 and denominator > 0 even in presence of percpu * counter errors */ if (den <= num) { if (num) den = num; else den = 1; } *denominator = den; *numerator = num; } /* * Like __fprop_add_percpu() except that event is counted only if the given * type has fraction smaller than @max_frac/FPROP_FRAC_BASE */ void __fprop_add_percpu_max(struct fprop_global *p, struct fprop_local_percpu *pl, int max_frac, long nr) { if (unlikely(max_frac < FPROP_FRAC_BASE)) { unsigned long numerator, denominator; s64 tmp; fprop_fraction_percpu(p, pl, &numerator, &denominator); /* Adding 'nr' to fraction exceeds max_frac/FPROP_FRAC_BASE? */ tmp = (u64)denominator * max_frac - ((u64)numerator << FPROP_FRAC_SHIFT); if (tmp < 0) { /* Maximum fraction already exceeded? */ return; } else if (tmp < nr * (FPROP_FRAC_BASE - max_frac)) { /* Add just enough for the fraction to saturate */ nr = div_u64(tmp + FPROP_FRAC_BASE - max_frac - 1, FPROP_FRAC_BASE - max_frac); } } __fprop_add_percpu(p, pl, nr); } |
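/*
 * Worked example of the aging arithmetic described in the header comment
 * above (illustrative only, not kernel code). Two event types; each new
 * period halves all counters, so older events decay exponentially while the
 * proportions p_j keep summing to 1.
 */
#include <stdio.h>

int main(void)
{
	long n[2] = { 0, 0 }, d = 0;

	/* period 0: 6 events of type 0 and 2 of type 1 */
	n[0] += 6; n[1] += 2; d += 8;

	/* declare a new period: age numerators and denominator by one halving */
	n[0] >>= 1; n[1] >>= 1; d >>= 1;

	/* period 1: 4 more events of type 1 */
	n[1] += 4; d += 4;

	/* prints p0 = 3/8, p1 = 5/8 */
	printf("p0 = %ld/%ld, p1 = %ld/%ld\n", n[0], d, n[1], d);
	return 0;
}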
/* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/backing-dev.h * * low-level device information and state which is propagated up through * to high-level code. */ #ifndef _LINUX_BACKING_DEV_H #define _LINUX_BACKING_DEV_H #include <linux/kernel.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/device.h> #include <linux/writeback.h> #include <linux/backing-dev-defs.h> #include <linux/slab.h> static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi) { kref_get(&bdi->refcnt); return bdi; } struct backing_dev_info *bdi_get_by_id(u64 id); void bdi_put(struct backing_dev_info *bdi); __printf(2, 3) int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...); __printf(2, 0) int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args); void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner); void bdi_unregister(struct backing_dev_info *bdi); struct backing_dev_info *bdi_alloc(int node_id); void wb_start_background_writeback(struct bdi_writeback *wb); void wb_workfn(struct work_struct *work); void wb_wait_for_completion(struct wb_completion *done); extern spinlock_t bdi_lock; extern struct list_head bdi_list; extern struct workqueue_struct *bdi_wq; static inline bool wb_has_dirty_io(struct bdi_writeback *wb) { return test_bit(WB_has_dirty_io, &wb->state); } static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi) { /* * @bdi->tot_write_bandwidth is guaranteed to be > 0 if there are * any dirty wbs. See wb_update_write_bandwidth(). */ return atomic_long_read(&bdi->tot_write_bandwidth); } static inline void wb_stat_mod(struct bdi_writeback *wb, enum wb_stat_item item, s64 amount) { percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH); } static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { return percpu_counter_read_positive(&wb->stat[item]); } static inline s64 wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item) { return percpu_counter_sum_positive(&wb->stat[item]); } extern void wb_writeout_inc(struct bdi_writeback *wb); /* * maximal error of a stat counter.
*/ static inline unsigned long wb_stat_error(void) { #ifdef CONFIG_SMP return nr_cpu_ids * WB_STAT_BATCH; #else return 1; #endif } /* BDI ratio is expressed as part per 1000000 for finer granularity. */ #define BDI_RATIO_SCALE 10000 u64 bdi_get_min_bytes(struct backing_dev_info *bdi); u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio); int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes); int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes); int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); /* * Flags in backing_dev_info::capability * * BDI_CAP_WRITEBACK: Supports dirty page writeback, and dirty pages * should contribute to accounting * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold */ #define BDI_CAP_WRITEBACK (1 << 0) #define BDI_CAP_STRICTLIMIT (1 << 1) extern struct backing_dev_info noop_backing_dev_info; int bdi_init(struct backing_dev_info *bdi); /** * writeback_in_progress - determine whether there is writeback in progress * @wb: bdi_writeback of interest * * Determine whether there is writeback waiting to be handled against a * bdi_writeback. */ static inline bool writeback_in_progress(struct bdi_writeback *wb) { return test_bit(WB_writeback_running, &wb->state); } struct backing_dev_info *inode_to_bdi(struct inode *inode); static inline bool mapping_can_writeback(struct address_space *mapping) { return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK; } /* Must not be used by file systems that support cgroup writeback */ static inline int bdi_wb_dirty_exceeded(struct backing_dev_info *bdi) { return bdi->wb.dirty_exceeded; } /* Must not be used by file systems that support cgroup writeback */ static inline void bdi_wb_stat_mod(struct inode *inode, enum wb_stat_item item, s64 amount) { wb_stat_mod(&inode_to_bdi(inode)->wb, item, amount); } #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css); struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp); void wb_memcg_offline(struct mem_cgroup *memcg); void wb_blkcg_offline(struct cgroup_subsys_state *css); /** * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode * @inode: inode of interest * * Cgroup writeback requires support from the filesystem. Also, both memcg and * iocg have to be on the default hierarchy. Test whether all conditions are * met. * * Note that the test result may change dynamically on the same inode * depending on how memcg and iocg are configured. */ static inline bool inode_cgwb_enabled(struct inode *inode) { struct backing_dev_info *bdi = inode_to_bdi(inode); return cgroup_subsys_on_dfl(memory_cgrp_subsys) && cgroup_subsys_on_dfl(io_cgrp_subsys) && (bdi->capabilities & BDI_CAP_WRITEBACK) && (inode->i_sb->s_iflags & SB_I_CGROUPWB); } /** * wb_find_current - find wb for %current on a bdi * @bdi: bdi of interest * * Find the wb of @bdi which matches both the memcg and blkcg of %current. * Must be called under rcu_read_lock() which protects the returend wb. * NULL if not found. 
*/ static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) { struct cgroup_subsys_state *memcg_css; struct bdi_writeback *wb; memcg_css = task_css(current, memory_cgrp_id); if (!memcg_css->parent) return &bdi->wb; wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); /* * %current's blkcg equals the effective blkcg of its memcg. No * need to use the relatively expensive cgroup_get_e_css(). */ if (likely(wb && wb->blkcg_css == task_css(current, io_cgrp_id))) return wb; return NULL; } /** * wb_get_create_current - get or create wb for %current on a bdi * @bdi: bdi of interest * @gfp: allocation mask * * Equivalent to wb_get_create() on %current's memcg. This function is * called from a relatively hot path and optimizes the common cases using * wb_find_current(). */ static inline struct bdi_writeback * wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) { struct bdi_writeback *wb; rcu_read_lock(); wb = wb_find_current(bdi); if (wb && unlikely(!wb_tryget(wb))) wb = NULL; rcu_read_unlock(); if (unlikely(!wb)) { struct cgroup_subsys_state *memcg_css; memcg_css = task_get_css(current, memory_cgrp_id); wb = wb_get_create(bdi, memcg_css, gfp); css_put(memcg_css); } return wb; } /** * inode_to_wb - determine the wb of an inode * @inode: inode of interest * * Returns the wb @inode is currently associated with. The caller must be * holding either @inode->i_lock, the i_pages lock, or the * associated wb's list_lock. */ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) { #ifdef CONFIG_LOCKDEP WARN_ON_ONCE(debug_locks && (inode->i_sb->s_iflags & SB_I_CGROUPWB) && (!lockdep_is_held(&inode->i_lock) && !lockdep_is_held(&inode->i_mapping->i_pages.xa_lock) && !lockdep_is_held(&inode->i_wb->list_lock))); #endif return inode->i_wb; } static inline struct bdi_writeback *inode_to_wb_wbc( struct inode *inode, struct writeback_control *wbc) { /* * If wbc does not have inode attached, it means cgroup writeback was * disabled when wbc started. Just use the default wb in that case. */ return wbc->wb ? wbc->wb : &inode_to_bdi(inode)->wb; } /** * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction * @inode: target inode * @cookie: output param, to be passed to the end function * * The caller wants to access the wb associated with @inode but isn't * holding inode->i_lock, the i_pages lock or wb->list_lock. This * function determines the wb associated with @inode and ensures that the * association doesn't change until the transaction is finished with * unlocked_inode_to_wb_end(). * * The caller must call unlocked_inode_to_wb_end() with *@cookie afterwards and * can't sleep during the transaction. IRQs may or may not be disabled on * return. */ static inline struct bdi_writeback * unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { rcu_read_lock(); /* * Paired with a release fence in inode_do_switch_wbs() and * ensures that we see the new wb if we see cleared I_WB_SWITCH. */ cookie->locked = inode_state_read_once(inode) & I_WB_SWITCH; smp_rmb(); if (unlikely(cookie->locked)) xa_lock_irqsave(&inode->i_mapping->i_pages, cookie->flags); /* * Protected by either !I_WB_SWITCH + rcu_read_lock() or the i_pages * lock. inode_to_wb() will bark. Deref directly. 
*/ return inode->i_wb; } /** * unlocked_inode_to_wb_end - end inode wb access transaction * @inode: target inode * @cookie: @cookie from unlocked_inode_to_wb_begin() */ static inline void unlocked_inode_to_wb_end(struct inode *inode, struct wb_lock_cookie *cookie) { if (unlikely(cookie->locked)) xa_unlock_irqrestore(&inode->i_mapping->i_pages, cookie->flags); rcu_read_unlock(); } #else /* CONFIG_CGROUP_WRITEBACK */ static inline bool inode_cgwb_enabled(struct inode *inode) { return false; } static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) { return &bdi->wb; } static inline struct bdi_writeback * wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) { return &bdi->wb; } static inline struct bdi_writeback *inode_to_wb(struct inode *inode) { return &inode_to_bdi(inode)->wb; } static inline struct bdi_writeback *inode_to_wb_wbc( struct inode *inode, struct writeback_control *wbc) { return inode_to_wb(inode); } static inline struct bdi_writeback * unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { return inode_to_wb(inode); } static inline void unlocked_inode_to_wb_end(struct inode *inode, struct wb_lock_cookie *cookie) { } static inline void wb_memcg_offline(struct mem_cgroup *memcg) { } static inline void wb_blkcg_offline(struct cgroup_subsys_state *css) { } #endif /* CONFIG_CGROUP_WRITEBACK */ const char *bdi_dev_name(struct backing_dev_info *bdi); #endif /* _LINUX_BACKING_DEV_H */ |
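/*
 * Illustrative sketch (hypothetical helper, not part of backing-dev.h): the
 * usual calling pattern for the transaction documented above when updating a
 * per-wb statistic without holding i_lock, the i_pages lock or wb->list_lock,
 * mirroring callers in mm/page-writeback.c.
 */
#include <linux/backing-dev.h>

static void example_dec_reclaimable(struct inode *inode)
{
	struct bdi_writeback *wb;
	struct wb_lock_cookie cookie = {};

	wb = unlocked_inode_to_wb_begin(inode, &cookie);
	wb_stat_mod(wb, WB_RECLAIMABLE, -1);	/* wb cannot be switched away here */
	unlocked_inode_to_wb_end(inode, &cookie);
}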
// SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/errno.h> #include <linux/err.h> #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/memfd.h> #include <linux/memremap.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/secretmem.h> #include <linux/sched/signal.h> #include <linux/rwsem.h> #include <linux/hugetlb.h> #include <linux/migrate.h> #include <linux/mm_inline.h> #include <linux/folio_batch.h> #include <linux/sched/mm.h> #include <linux/shmem_fs.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> #include "internal.h" #include "swap.h" static inline void sanity_check_pinned_pages(struct page **pages, unsigned long npages) { if (!IS_ENABLED(CONFIG_DEBUG_VM)) return; /* * We only pin anonymous pages if they are exclusive.
	 * Once pinned, we can no longer turn them possibly shared and
	 * PageAnonExclusive() will stick around until the page is freed.
	 *
	 * We'd like to verify that our pinned anonymous pages are still mapped
	 * exclusively. The issue with anon THP is that we don't know how
	 * they are/were mapped when pinning them. However, for anon
	 * THP we can assume that either the given page (PTE-mapped THP) or
	 * the head page (PMD-mapped THP) should be PageAnonExclusive(). If
	 * neither is the case, there is certainly something wrong.
	 */
	for (; npages; npages--, pages++) {
		struct page *page = *pages;
		struct folio *folio;

		if (!page)
			continue;

		folio = page_folio(page);

		if (is_zero_page(page) || !folio_test_anon(folio))
			continue;
		if (!folio_test_large(folio) || folio_test_hugetlb(folio))
			VM_WARN_ON_ONCE_FOLIO(!PageAnonExclusive(&folio->page), folio);
		else
			/* Either a PTE-mapped or a PMD-mapped THP. */
			VM_WARN_ON_ONCE_PAGE(!PageAnonExclusive(&folio->page) &&
					     !PageAnonExclusive(page), page);
	}
}

/*
 * Return the folio with ref appropriately incremented,
 * or NULL if that failed.
 */
static inline struct folio *try_get_folio(struct page *page, int refs)
{
	struct folio *folio;

retry:
	folio = page_folio(page);
	if (WARN_ON_ONCE(folio_ref_count(folio) < 0))
		return NULL;
	if (unlikely(!folio_ref_try_add(folio, refs)))
		return NULL;

	/*
	 * At this point we have a stable reference to the folio; but it
	 * could be that between calling page_folio() and the refcount
	 * increment, the folio was split, in which case we'd end up
	 * holding a reference on a folio that has nothing to do with the page
	 * we were given anymore.
	 * So now that the folio is stable, recheck that the page still
	 * belongs to this folio.
	 */
	if (unlikely(page_folio(page) != folio)) {
		folio_put_refs(folio, refs);
		goto retry;
	}

	return folio;
}

static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
{
	if (flags & FOLL_PIN) {
		if (is_zero_folio(folio))
			return;
		node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs);
		if (folio_has_pincount(folio))
			atomic_sub(refs, &folio->_pincount);
		else
			refs *= GUP_PIN_COUNTING_BIAS;
	}

	folio_put_refs(folio, refs);
}

/**
 * try_grab_folio() - add a folio's refcount by a flag-dependent amount
 * @folio: pointer to folio to be grabbed
 * @refs: the value to (effectively) add to the folio's refcount
 * @flags: gup flags: these are the FOLL_* flag values
 *
 * This might not do anything at all, depending on the flags argument.
 *
 * "grab" names in this file mean, "look at flags to decide whether to use
 * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
 *
 * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the
 * same time.
 *
 * Return: 0 for success, or if no action was required (if neither FOLL_PIN
 * nor FOLL_GET was set, nothing is done). A negative error code for failure:
 *
 *   -ENOMEM	FOLL_GET or FOLL_PIN was set, but the folio could not
 *		be grabbed.
 *
 * It is called when we have a stable reference for the folio, typically in
 * GUP slow path.
 */
int __must_check try_grab_folio(struct folio *folio, int refs,
				unsigned int flags)
{
	if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
		return -ENOMEM;

	if (unlikely(!(flags & FOLL_PCI_P2PDMA) && folio_is_pci_p2pdma(folio)))
		return -EREMOTEIO;

	if (flags & FOLL_GET)
		folio_ref_add(folio, refs);
	else if (flags & FOLL_PIN) {
		/*
		 * Don't take a pin on the zero page - it's not going anywhere
		 * and it is used in a *lot* of places.
		 */
		if (is_zero_folio(folio))
			return 0;

		/*
		 * Increment the normal page refcount field at least once,
		 * so that the page really is pinned.
		 */
		if (folio_has_pincount(folio)) {
			folio_ref_add(folio, refs);
			atomic_add(refs, &folio->_pincount);
		} else {
			folio_ref_add(folio, refs * GUP_PIN_COUNTING_BIAS);
		}

		node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
	}

	return 0;
}

/**
 * unpin_user_page() - release a dma-pinned page
 * @page: pointer to page to be released
 *
 * Pages that were pinned via pin_user_pages*() must be released via either
 * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
 * that such pages can be separately tracked and uniquely handled. In
 * particular, interactions with RDMA and filesystems need special handling.
 */
void unpin_user_page(struct page *page)
{
	sanity_check_pinned_pages(&page, 1);
	gup_put_folio(page_folio(page), 1, FOLL_PIN);
}
EXPORT_SYMBOL(unpin_user_page);

/**
 * unpin_folio() - release a dma-pinned folio
 * @folio: pointer to folio to be released
 *
 * Folios that were pinned via memfd_pin_folios() or other similar routines
 * must be released either using unpin_folio() or unpin_folios().
 */
void unpin_folio(struct folio *folio)
{
	gup_put_folio(folio, 1, FOLL_PIN);
}
EXPORT_SYMBOL_GPL(unpin_folio);

/**
 * folio_add_pin - Try to get an additional pin on a pinned folio
 * @folio: The folio to be pinned
 *
 * Get an additional pin on a folio we already have a pin on. Makes no change
 * if the folio is a zero_page.
 */
void folio_add_pin(struct folio *folio)
{
	if (is_zero_folio(folio))
		return;

	/*
	 * Similar to try_grab_folio(): be sure to *also* increment the normal
	 * page refcount field at least once, so that the page really is
	 * pinned.
	 */
	if (folio_has_pincount(folio)) {
		WARN_ON_ONCE(atomic_read(&folio->_pincount) < 1);
		folio_ref_inc(folio);
		atomic_inc(&folio->_pincount);
	} else {
		WARN_ON_ONCE(folio_ref_count(folio) < GUP_PIN_COUNTING_BIAS);
		folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
	}
}

static inline struct folio *gup_folio_range_next(struct page *start,
		unsigned long npages, unsigned long i, unsigned int *ntails)
{
	struct page *next = start + i;
	struct folio *folio = page_folio(next);
	unsigned int nr = 1;

	if (folio_test_large(folio))
		nr = min_t(unsigned int, npages - i,
			   folio_nr_pages(folio) - folio_page_idx(folio, next));

	*ntails = nr;
	return folio;
}

static inline struct folio *gup_folio_next(struct page **list,
		unsigned long npages, unsigned long i, unsigned int *ntails)
{
	struct folio *folio = page_folio(list[i]);
	unsigned int nr;

	for (nr = i + 1; nr < npages; nr++) {
		if (page_folio(list[nr]) != folio)
			break;
	}

	*ntails = nr - i;
	return folio;
}

/**
 * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
 * @pages: array of pages to be maybe marked dirty, and definitely released.
 * @npages: number of pages in the @pages array.
 * @make_dirty: whether to mark the pages dirty
 *
 * "gup-pinned page" refers to a page that has had one of the get_user_pages()
 * variants called on that page.
 *
 * For each page in the @pages array, make that page (or its head page, if a
 * compound page) dirty, if @make_dirty is true, and if the page was previously
 * listed as clean. In any case, releases all pages using unpin_user_page(),
 * possibly via unpin_user_pages(), for the non-dirty case.
 *
 * Please see the unpin_user_page() documentation for details.
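 *
 * As an illustrative sketch only (not a caller taken from this file; the
 * names start, nr_pages and pages are placeholders), a driver that pins
 * user memory for a device write might pair the APIs like this:
 *
 *	pin_user_pages_fast(start, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages);
 *	... let the device DMA into the pinned pages ...
 *	unpin_user_pages_dirty_lock(pages, nr_pages, true);
 *
 * Passing true for @make_dirty covers the common case where the device may
 * have modified the page contents.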
 *
 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
 * required, then the caller should a) verify that this is really correct,
 * because _lock() is usually required, and b) hand code it:
 * set_page_dirty_lock(), unpin_user_page().
 *
 */
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
				 bool make_dirty)
{
	unsigned long i;
	struct folio *folio;
	unsigned int nr;

	if (!make_dirty) {
		unpin_user_pages(pages, npages);
		return;
	}

	sanity_check_pinned_pages(pages, npages);
	for (i = 0; i < npages; i += nr) {
		folio = gup_folio_next(pages, npages, i, &nr);
		/*
		 * Checking PageDirty at this point may race with
		 * clear_page_dirty_for_io(), but that's OK. Two key
		 * cases:
		 *
		 * 1) This code sees the page as already dirty, so it
		 * skips the call to set_page_dirty(). That could happen
		 * because clear_page_dirty_for_io() called
		 * folio_mkclean(), followed by set_page_dirty().
		 * However, now the page is going to get written back,
		 * which meets the original intention of setting it
		 * dirty, so all is well: clear_page_dirty_for_io() goes
		 * on to call TestClearPageDirty(), and write the page
		 * back.
		 *
		 * 2) This code sees the page as clean, so it calls
		 * set_page_dirty(). The page stays dirty, despite being
		 * written back, so it gets written back again in the
		 * next writeback cycle. This is harmless.
		 */
		if (!folio_test_dirty(folio)) {
			folio_lock(folio);
			folio_mark_dirty(folio);
			folio_unlock(folio);
		}
		gup_put_folio(folio, nr, FOLL_PIN);
	}
}
EXPORT_SYMBOL(unpin_user_pages_dirty_lock);

/**
 * unpin_user_page_range_dirty_lock() - release and optionally dirty
 * gup-pinned page range
 *
 * @page: the starting page of a range maybe marked dirty, and definitely released.
 * @npages: number of consecutive pages to release.
 * @make_dirty: whether to mark the pages dirty
 *
 * "gup-pinned page range" refers to a range of pages that has had one of the
 * pin_user_pages() variants called on that page.
 *
 * The page range must be truly physically contiguous: the page range
 * corresponds to a contiguous PFN range and all pages can be iterated
 * naturally.
 *
 * For the page ranges defined by [page .. page+npages], make that range (or
 * its head pages, if a compound page) dirty, if @make_dirty is true, and if
 * the page range was previously listed as clean.
 *
 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
 * required, then the caller should a) verify that this is really correct,
 * because _lock() is usually required, and b) hand code it:
 * set_page_dirty_lock(), unpin_user_page().
 *
 */
void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
				      bool make_dirty)
{
	unsigned long i;
	struct folio *folio;
	unsigned int nr;

	VM_WARN_ON_ONCE(!page_range_contiguous(page, npages));

	for (i = 0; i < npages; i += nr) {
		folio = gup_folio_range_next(page, npages, i, &nr);
		if (make_dirty && !folio_test_dirty(folio)) {
			folio_lock(folio);
			folio_mark_dirty(folio);
			folio_unlock(folio);
		}
		gup_put_folio(folio, nr, FOLL_PIN);
	}
}
EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);

static void gup_fast_unpin_user_pages(struct page **pages, unsigned long npages)
{
	unsigned long i;
	struct folio *folio;
	unsigned int nr;

	/*
	 * Don't perform any sanity checks because we might have raced with
	 * fork() and some anonymous pages might now actually be shared --
	 * which is why we're unpinning after all.
	 */
	for (i = 0; i < npages; i += nr) {
		folio = gup_folio_next(pages, npages, i, &nr);
		gup_put_folio(folio, nr, FOLL_PIN);
	}
}

/**
 * unpin_user_pages() - release an array of gup-pinned pages.
* @pages: array of pages to be marked dirty and released. * @npages: number of pages in the @pages array. * * For each page in the @pages array, release the page using unpin_user_page(). * * Please see the unpin_user_page() documentation for details. */ void unpin_user_pages(struct page **pages, unsigned long npages) { unsigned long i; struct folio *folio; unsigned int nr; /* * If this WARN_ON() fires, then the system *might* be leaking pages (by * leaving them pinned), but probably not. More likely, gup/pup returned * a hard -ERRNO error to the caller, who erroneously passed it here. */ if (WARN_ON(IS_ERR_VALUE(npages))) return; sanity_check_pinned_pages(pages, npages); for (i = 0; i < npages; i += nr) { if (!pages[i]) { nr = 1; continue; } folio = gup_folio_next(pages, npages, i, &nr); gup_put_folio(folio, nr, FOLL_PIN); } } EXPORT_SYMBOL(unpin_user_pages); /** * unpin_user_folio() - release pages of a folio * @folio: pointer to folio to be released * @npages: number of pages of same folio * * Release npages of the folio */ void unpin_user_folio(struct folio *folio, unsigned long npages) { gup_put_folio(folio, npages, FOLL_PIN); } EXPORT_SYMBOL(unpin_user_folio); /** * unpin_folios() - release an array of gup-pinned folios. * @folios: array of folios to be marked dirty and released. * @nfolios: number of folios in the @folios array. * * For each folio in the @folios array, release the folio using gup_put_folio. * * Please see the unpin_folio() documentation for details. */ void unpin_folios(struct folio **folios, unsigned long nfolios) { unsigned long i = 0, j; /* * If this WARN_ON() fires, then the system *might* be leaking folios * (by leaving them pinned), but probably not. More likely, gup/pup * returned a hard -ERRNO error to the caller, who erroneously passed * it here. */ if (WARN_ON(IS_ERR_VALUE(nfolios))) return; while (i < nfolios) { for (j = i + 1; j < nfolios; j++) if (folios[i] != folios[j]) break; if (folios[i]) gup_put_folio(folios[i], j - i, FOLL_PIN); i = j; } } EXPORT_SYMBOL_GPL(unpin_folios); /* * Set the MMF_HAS_PINNED if not set yet; after set it'll be there for the mm's * lifecycle. Avoid setting the bit unless necessary, or it might cause write * cache bouncing on large SMP machines for concurrent pinned gups. */ static inline void mm_set_has_pinned_flag(struct mm_struct *mm) { if (!mm_flags_test(MMF_HAS_PINNED, mm)) mm_flags_set(MMF_HAS_PINNED, mm); } #ifdef CONFIG_MMU #ifdef CONFIG_HAVE_GUP_FAST /** * try_grab_folio_fast() - Attempt to get or pin a folio in fast path. * @page: pointer to page to be grabbed * @refs: the value to (effectively) add to the folio's refcount * @flags: gup flags: these are the FOLL_* flag values. * * "grab" names in this file mean, "look at flags to decide whether to use * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount. * * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the * same time. (That's true throughout the get_user_pages*() and * pin_user_pages*() APIs.) Cases: * * FOLL_GET: folio's refcount will be incremented by @refs. * * FOLL_PIN on large folios: folio's refcount will be incremented by * @refs, and its pincount will be incremented by @refs. * * FOLL_PIN on single-page folios: folio's refcount will be incremented by * @refs * GUP_PIN_COUNTING_BIAS. * * Return: The folio containing @page (with refcount appropriately * incremented) for success, or NULL upon failure. 
If neither FOLL_GET * nor FOLL_PIN was set, that's considered failure, and furthermore, * a likely bug in the caller, so a warning is also emitted. * * It uses add ref unless zero to elevate the folio refcount and must be called * in fast path only. */ static struct folio *try_grab_folio_fast(struct page *page, int refs, unsigned int flags) { struct folio *folio; /* Raise warn if it is not called in fast GUP */ VM_WARN_ON_ONCE(!irqs_disabled()); if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0)) return NULL; if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) return NULL; if (flags & FOLL_GET) return try_get_folio(page, refs); /* FOLL_PIN is set */ /* * Don't take a pin on the zero page - it's not going anywhere * and it is used in a *lot* of places. */ if (is_zero_page(page)) return page_folio(page); folio = try_get_folio(page, refs); if (!folio) return NULL; /* * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a * right zone, so fail and let the caller fall back to the slow * path. */ if (unlikely((flags & FOLL_LONGTERM) && !folio_is_longterm_pinnable(folio))) { folio_put_refs(folio, refs); return NULL; } /* * When pinning a large folio, use an exact count to track it. * * However, be sure to *also* increment the normal folio * refcount field at least once, so that the folio really * is pinned. That's why the refcount from the earlier * try_get_folio() is left intact. */ if (folio_has_pincount(folio)) atomic_add(refs, &folio->_pincount); else folio_ref_add(folio, refs * (GUP_PIN_COUNTING_BIAS - 1)); /* * Adjust the pincount before re-checking the PTE for changes. * This is essentially a smp_mb() and is paired with a memory * barrier in folio_try_share_anon_rmap_*(). */ smp_mb__after_atomic(); node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs); return folio; } #endif /* CONFIG_HAVE_GUP_FAST */ /* Common code for can_follow_write_* */ static inline bool can_follow_write_common(struct page *page, struct vm_area_struct *vma, unsigned int flags) { /* Maybe FOLL_FORCE is set to override it? */ if (!(flags & FOLL_FORCE)) return false; /* But FOLL_FORCE has no effect on shared mappings */ if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) return false; /* ... or read-only private ones */ if (!(vma->vm_flags & VM_MAYWRITE)) return false; /* ... or already writable ones that just need to take a write fault */ if (vma->vm_flags & VM_WRITE) return false; /* * See can_change_pte_writable(): we broke COW and could map the page * writable if we have an exclusive anonymous page ... */ return page && PageAnon(page) && PageAnonExclusive(page); } static struct page *no_page_table(struct vm_area_struct *vma, unsigned int flags, unsigned long address) { if (!(flags & FOLL_DUMP)) return NULL; /* * When core dumping, we don't want to allocate unnecessary pages or * page tables. Return error instead of NULL to skip handle_mm_fault, * then get_dump_page() will return NULL to leave a hole in the dump. * But we can only make this optimization where a hole would surely * be zero-filled if handle_mm_fault() actually did handle it. */ if (is_vm_hugetlb_page(vma)) { struct hstate *h = hstate_vma(vma); if (!hugetlbfs_pagecache_present(h, vma, address)) return ERR_PTR(-EFAULT); } else if ((vma_is_anonymous(vma) || !vma->vm_ops->fault)) { return ERR_PTR(-EFAULT); } return NULL; } #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES /* FOLL_FORCE can write to even unwritable PUDs in COW mappings. 
*/ static inline bool can_follow_write_pud(pud_t pud, struct page *page, struct vm_area_struct *vma, unsigned int flags) { /* If the pud is writable, we can write to the page. */ if (pud_write(pud)) return true; return can_follow_write_common(page, vma, flags); } static struct page *follow_huge_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp, int flags, unsigned long *page_mask) { struct mm_struct *mm = vma->vm_mm; struct page *page; pud_t pud = *pudp; unsigned long pfn = pud_pfn(pud); int ret; assert_spin_locked(pud_lockptr(mm, pudp)); if (!pud_present(pud)) return NULL; if ((flags & FOLL_WRITE) && !can_follow_write_pud(pud, pfn_to_page(pfn), vma, flags)) return NULL; pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; page = pfn_to_page(pfn); if (!pud_write(pud) && gup_must_unshare(vma, flags, page)) return ERR_PTR(-EMLINK); ret = try_grab_folio(page_folio(page), 1, flags); if (ret) page = ERR_PTR(ret); else *page_mask = HPAGE_PUD_NR - 1; return page; } /* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */ static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, struct vm_area_struct *vma, unsigned int flags) { /* If the pmd is writable, we can write to the page. */ if (pmd_write(pmd)) return true; if (!can_follow_write_common(page, vma, flags)) return false; /* ... and a write-fault isn't required for other reasons. */ if (pmd_needs_soft_dirty_wp(vma, pmd)) return false; return !userfaultfd_huge_pmd_wp(vma, pmd); } static struct page *follow_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags, unsigned long *page_mask) { struct mm_struct *mm = vma->vm_mm; pmd_t pmdval = *pmd; struct page *page; int ret; assert_spin_locked(pmd_lockptr(mm, pmd)); page = pmd_page(pmdval); if ((flags & FOLL_WRITE) && !can_follow_write_pmd(pmdval, page, vma, flags)) return NULL; /* Avoid dumping huge zero page */ if ((flags & FOLL_DUMP) && is_huge_zero_pmd(pmdval)) return ERR_PTR(-EFAULT); if (pmd_protnone(*pmd) && !gup_can_follow_protnone(vma, flags)) return NULL; if (!pmd_write(pmdval) && gup_must_unshare(vma, flags, page)) return ERR_PTR(-EMLINK); VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(page) && !PageAnonExclusive(page), page); ret = try_grab_folio(page_folio(page), 1, flags); if (ret) return ERR_PTR(ret); #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (pmd_trans_huge(pmdval) && (flags & FOLL_TOUCH)) touch_pmd(vma, addr, pmd, flags & FOLL_WRITE); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; *page_mask = HPAGE_PMD_NR - 1; return page; } #else /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */ static struct page *follow_huge_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp, int flags, unsigned long *page_mask) { return NULL; } static struct page *follow_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags, unsigned long *page_mask) { return NULL; } #endif /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, pte_t *pte, unsigned int flags) { if (flags & FOLL_TOUCH) { pte_t orig_entry = ptep_get(pte); pte_t entry = orig_entry; if (flags & FOLL_WRITE) entry = pte_mkdirty(entry); entry = pte_mkyoung(entry); if (!pte_same(orig_entry, entry)) { set_pte_at(vma->vm_mm, address, pte, entry); update_mmu_cache(vma, address, pte); } } /* Proper page table entry exists, but no corresponding struct page */ return -EEXIST; } /* FOLL_FORCE can write to even unwritable PTEs in COW mappings. 
*/ static inline bool can_follow_write_pte(pte_t pte, struct page *page, struct vm_area_struct *vma, unsigned int flags) { /* If the pte is writable, we can write to the page. */ if (pte_write(pte)) return true; if (!can_follow_write_common(page, vma, flags)) return false; /* ... and a write-fault isn't required for other reasons. */ if (pte_needs_soft_dirty_wp(vma, pte)) return false; return !userfaultfd_pte_wp(vma, pte); } static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) { struct mm_struct *mm = vma->vm_mm; struct folio *folio; struct page *page; spinlock_t *ptl; pte_t *ptep, pte; int ret; ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) return no_page_table(vma, flags, address); pte = ptep_get(ptep); if (!pte_present(pte)) goto no_page; if (pte_protnone(pte) && !gup_can_follow_protnone(vma, flags)) goto no_page; page = vm_normal_page(vma, address, pte); /* * We only care about anon pages in can_follow_write_pte(). */ if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, page, vma, flags)) { page = NULL; goto out; } if (unlikely(!page)) { if (flags & FOLL_DUMP) { /* Avoid special (like zero) pages in core dumps */ page = ERR_PTR(-EFAULT); goto out; } if (is_zero_pfn(pte_pfn(pte))) { page = pte_page(pte); } else { ret = follow_pfn_pte(vma, address, ptep, flags); page = ERR_PTR(ret); goto out; } } folio = page_folio(page); if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) { page = ERR_PTR(-EMLINK); goto out; } VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(page) && !PageAnonExclusive(page), page); /* try_grab_folio() does nothing unless FOLL_GET or FOLL_PIN is set. */ ret = try_grab_folio(folio, 1, flags); if (unlikely(ret)) { page = ERR_PTR(ret); goto out; } /* * We need to make the page accessible if and only if we are going * to access its content (the FOLL_PIN case). Please see * Documentation/core-api/pin_user_pages.rst for details. */ if (flags & FOLL_PIN) { ret = arch_make_folio_accessible(folio); if (ret) { unpin_user_page(page); page = ERR_PTR(ret); goto out; } } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !folio_test_dirty(folio)) folio_mark_dirty(folio); /* * pte_mkyoung() would be more correct here, but atomic care * is needed to avoid losing the dirty bit: it is easier to use * folio_mark_accessed(). 
*/ folio_mark_accessed(folio); } out: pte_unmap_unlock(ptep, ptl); return page; no_page: pte_unmap_unlock(ptep, ptl); if (!pte_none(pte)) return NULL; return no_page_table(vma, flags, address); } static struct page *follow_pmd_mask(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, unsigned int flags, unsigned long *page_mask) { pmd_t *pmd, pmdval; spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; pmd = pmd_offset(pudp, address); pmdval = pmdp_get_lockless(pmd); if (pmd_none(pmdval)) return no_page_table(vma, flags, address); if (!pmd_present(pmdval)) return no_page_table(vma, flags, address); if (likely(!pmd_leaf(pmdval))) return follow_page_pte(vma, address, pmd, flags); if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags)) return no_page_table(vma, flags, address); ptl = pmd_lock(mm, pmd); pmdval = *pmd; if (unlikely(!pmd_present(pmdval))) { spin_unlock(ptl); return no_page_table(vma, flags, address); } if (unlikely(!pmd_leaf(pmdval))) { spin_unlock(ptl); return follow_page_pte(vma, address, pmd, flags); } if (pmd_trans_huge(pmdval) && (flags & FOLL_SPLIT_PMD)) { spin_unlock(ptl); split_huge_pmd(vma, pmd, address); /* If pmd was left empty, stuff a page table in there quickly */ return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) : follow_page_pte(vma, address, pmd, flags); } page = follow_huge_pmd(vma, address, pmd, flags, page_mask); spin_unlock(ptl); return page; } static struct page *follow_pud_mask(struct vm_area_struct *vma, unsigned long address, p4d_t *p4dp, unsigned int flags, unsigned long *page_mask) { pud_t *pudp, pud; spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; pudp = pud_offset(p4dp, address); pud = pudp_get(pudp); if (!pud_present(pud)) return no_page_table(vma, flags, address); if (pud_leaf(pud)) { ptl = pud_lock(mm, pudp); page = follow_huge_pud(vma, address, pudp, flags, page_mask); spin_unlock(ptl); if (page) return page; return no_page_table(vma, flags, address); } if (unlikely(pud_bad(pud))) return no_page_table(vma, flags, address); return follow_pmd_mask(vma, address, pudp, flags, page_mask); } static struct page *follow_p4d_mask(struct vm_area_struct *vma, unsigned long address, pgd_t *pgdp, unsigned int flags, unsigned long *page_mask) { p4d_t *p4dp, p4d; p4dp = p4d_offset(pgdp, address); p4d = p4dp_get(p4dp); BUILD_BUG_ON(p4d_leaf(p4d)); if (!p4d_present(p4d) || p4d_bad(p4d)) return no_page_table(vma, flags, address); return follow_pud_mask(vma, address, p4dp, flags, page_mask); } /** * follow_page_mask - look up a page descriptor from a user-virtual address * @vma: vm_area_struct mapping @address * @address: virtual address to look up * @flags: flags modifying lookup behaviour * @page_mask: a pointer to output page_mask * * @flags can have FOLL_ flags set, defined in <linux/mm.h> * * When getting an anonymous page and the caller has to trigger unsharing * of a shared anonymous page first, -EMLINK is returned. The caller should * trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only * relevant with FOLL_PIN and !FOLL_WRITE. * * On output, @page_mask is set according to the size of the page. * * Return: the mapped (struct page *), %NULL if no mapping exists, or * an error pointer if there is a mapping to something not represented * by a page descriptor (see also vm_normal_page()). 
*/ static struct page *follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, unsigned long *page_mask) { pgd_t *pgd; struct mm_struct *mm = vma->vm_mm; struct page *page; vma_pgtable_walk_begin(vma); *page_mask = 0; pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) page = no_page_table(vma, flags, address); else page = follow_p4d_mask(vma, address, pgd, flags, page_mask); vma_pgtable_walk_end(vma); return page; } static int get_gate_page(struct mm_struct *mm, unsigned long address, unsigned int gup_flags, struct vm_area_struct **vma, struct page **page) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; pte_t entry; int ret = -EFAULT; /* user gate pages are read-only */ if (gup_flags & FOLL_WRITE) return -EFAULT; pgd = pgd_offset(mm, address); if (pgd_none(*pgd)) return -EFAULT; p4d = p4d_offset(pgd, address); if (p4d_none(*p4d)) return -EFAULT; pud = pud_offset(p4d, address); if (pud_none(*pud)) return -EFAULT; pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return -EFAULT; pte = pte_offset_map(pmd, address); if (!pte) return -EFAULT; entry = ptep_get(pte); if (pte_none(entry)) goto unmap; *vma = get_gate_vma(mm); if (!page) goto out; *page = vm_normal_page(*vma, address, entry); if (!*page) { if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(entry))) goto unmap; *page = pte_page(entry); } ret = try_grab_folio(page_folio(*page), 1, gup_flags); if (unlikely(ret)) goto unmap; out: ret = 0; unmap: pte_unmap(pte); return ret; } /* * mmap_lock must be held on entry. If @flags has FOLL_UNLOCKABLE but not * FOLL_NOWAIT, the mmap_lock may be released. If it is, *@locked will be set * to 0 and -EBUSY returned. */ static int faultin_page(struct vm_area_struct *vma, unsigned long address, unsigned int flags, bool unshare, int *locked) { unsigned int fault_flags = 0; vm_fault_t ret; if (flags & FOLL_NOFAULT) return -EFAULT; if (flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; if (flags & FOLL_REMOTE) fault_flags |= FAULT_FLAG_REMOTE; if (flags & FOLL_UNLOCKABLE) { fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; /* * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set * FOLL_INTERRUPTIBLE to enable FAULT_FLAG_INTERRUPTIBLE. * That's because some callers may not be prepared to * handle early exits caused by non-fatal signals. */ if (flags & FOLL_INTERRUPTIBLE) fault_flags |= FAULT_FLAG_INTERRUPTIBLE; } if (flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; if (flags & FOLL_TRIED) { /* * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED * can co-exist */ fault_flags |= FAULT_FLAG_TRIED; } if (unshare) { fault_flags |= FAULT_FLAG_UNSHARE; /* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */ VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_WRITE); } ret = handle_mm_fault(vma, address, fault_flags, NULL); if (ret & VM_FAULT_COMPLETED) { /* * With FAULT_FLAG_RETRY_NOWAIT we'll never release the * mmap lock in the page fault handler. Sanity check this. */ WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT); *locked = 0; /* * We should do the same as VM_FAULT_RETRY, but let's not * return -EBUSY since that's not reflecting the reality of * what has happened - we've just fully completed a page * fault, with the mmap lock released. Use -EAGAIN to show * that we want to take the mmap lock _again_. 
*/ return -EAGAIN; } if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, flags); if (err) return err; BUG(); } if (ret & VM_FAULT_RETRY) { if (!(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) *locked = 0; return -EBUSY; } return 0; } /* * Writing to file-backed mappings which require folio dirty tracking using GUP * is a fundamentally broken operation, as kernel write access to GUP mappings * do not adhere to the semantics expected by a file system. * * Consider the following scenario:- * * 1. A folio is written to via GUP which write-faults the memory, notifying * the file system and dirtying the folio. * 2. Later, writeback is triggered, resulting in the folio being cleaned and * the PTE being marked read-only. * 3. The GUP caller writes to the folio, as it is mapped read/write via the * direct mapping. * 4. The GUP caller, now done with the page, unpins it and sets it dirty * (though it does not have to). * * This results in both data being written to a folio without writenotify, and * the folio being dirtied unexpectedly (if the caller decides to do so). */ static bool writable_file_mapping_allowed(struct vm_area_struct *vma, unsigned long gup_flags) { /* * If we aren't pinning then no problematic write can occur. A long term * pin is the most egregious case so this is the case we disallow. */ if ((gup_flags & (FOLL_PIN | FOLL_LONGTERM)) != (FOLL_PIN | FOLL_LONGTERM)) return true; /* * If the VMA does not require dirty tracking then no problematic write * can occur either. */ return !vma_needs_dirty_tracking(vma); } static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) { vm_flags_t vm_flags = vma->vm_flags; int write = (gup_flags & FOLL_WRITE); int foreign = (gup_flags & FOLL_REMOTE); bool vma_anon = vma_is_anonymous(vma); if (vm_flags & (VM_IO | VM_PFNMAP)) return -EFAULT; if ((gup_flags & FOLL_ANON) && !vma_anon) return -EFAULT; if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma)) return -EOPNOTSUPP; if ((gup_flags & FOLL_SPLIT_PMD) && is_vm_hugetlb_page(vma)) return -EOPNOTSUPP; if (vma_is_secretmem(vma)) return -EFAULT; if (write) { if (!vma_anon && !writable_file_mapping_allowed(vma, gup_flags)) return -EFAULT; if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; /* * We used to let the write,force case do COW in a * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could * set a breakpoint in a read-only mapping of an * executable, without corrupting the file (yet only * when that file had been opened for writing!). * Anon pages in shared mappings are surprising: now * just reject it. */ if (!is_cow_mapping(vm_flags)) return -EFAULT; } } else if (!(vm_flags & VM_READ)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; /* * Is there actually any vma we can reach here which does not * have VM_MAYREAD set? */ if (!(vm_flags & VM_MAYREAD)) return -EFAULT; } /* * gups are always data accesses, not instruction * fetches, so execute=false here */ if (!arch_vma_access_permitted(vma, write, false, foreign)) return -EFAULT; return 0; } /* * This is "vma_lookup()", but with a warning if we would have * historically expanded the stack in the GUP code. 
*/ static struct vm_area_struct *gup_vma_lookup(struct mm_struct *mm, unsigned long addr) { #ifdef CONFIG_STACK_GROWSUP return vma_lookup(mm, addr); #else static volatile unsigned long next_warn; struct vm_area_struct *vma; unsigned long now, next; vma = find_vma(mm, addr); if (!vma || (addr >= vma->vm_start)) return vma; /* Only warn for half-way relevant accesses */ if (!(vma->vm_flags & VM_GROWSDOWN)) return NULL; if (vma->vm_start - addr > 65536) return NULL; /* Let's not warn more than once an hour.. */ now = jiffies; next = next_warn; if (next && time_before(now, next)) return NULL; next_warn = now + 60*60*HZ; /* Let people know things may have changed. */ pr_warn("GUP no longer grows the stack in %s (%d): %lx-%lx (%lx)\n", current->comm, task_pid_nr(current), vma->vm_start, vma->vm_end, addr); dump_stack(); return NULL; #endif } /** * __get_user_pages() - pin user pages in memory * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. * @locked: whether we're still with the mmap_lock held * * Returns either number of pages pinned (which may be less than the * number requested), or an error. Details about the return value: * * -- If nr_pages is 0, returns 0. * -- If nr_pages is >0, but no pages were pinned, returns -errno. * -- If nr_pages is >0, and some pages were pinned, returns the number of * pages pinned. Again, this may be less than nr_pages. * -- 0 return value is possible when the fault would need to be retried. * * The caller is responsible for releasing returned @pages, via put_page(). * * Must be called with mmap_lock held. It may be released. See below. * * __get_user_pages walks a process's page tables and takes a reference to * each struct page that each user address corresponds to at a given * instant. That is, it takes the page that would be accessed if a user * thread accesses the given user virtual address at that instant. * * This does not guarantee that the page exists in the user mappings when * __get_user_pages returns, and there may even be a completely different * page there in some cases (eg. if mmapped pagecache has been invalidated * and subsequently re-faulted). However it does guarantee that the page * won't be freed completely. And mostly callers simply care that the page * contains data that was valid *at some point in time*. Typically, an IO * or similar operation cannot guarantee anything stronger anyway because * locks can't be held over the syscall boundary. * * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If * the page is written to, set_page_dirty (or set_page_dirty_lock, as * appropriate) must be called after the page is finished with, and * before put_page is called. * * If FOLL_UNLOCKABLE is set without FOLL_NOWAIT then the mmap_lock may * be released. If this happens *@locked will be set to 0 on return. * * A caller using such a combination of @gup_flags must therefore hold the * mmap_lock for reading only, and recognize when it's been released. Otherwise, * it must be held for either reading or writing and will not be released. * * In most cases, get_user_pages or get_user_pages_fast should be used * instead of __get_user_pages. __get_user_pages should be used only if * you need some special @gup_flags. 
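 *
 * As an illustrative sketch only (no such caller exists in this file; the
 * buffer size and variable names are placeholders), the higher-level FOLL_GET
 * path recommended above might be used like this:
 *
 *	struct page *pages[16];
 *	int got = get_user_pages_fast(uaddr, 16, FOLL_WRITE, pages);
 *
 *	if (got > 0) {
 *		... access the page contents ...
 *		while (got--)
 *			put_page(pages[got]);
 *	}
 *
 * Each page obtained this way is released with put_page(), as described
 * above, rather than with the unpin_user_page*() family used for FOLL_PIN.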
*/ static long __get_user_pages(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { long ret = 0, i = 0; struct vm_area_struct *vma = NULL; unsigned long page_mask = 0; if (!nr_pages) return 0; start = untagged_addr_remote(mm, start); VM_WARN_ON_ONCE(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN))); /* FOLL_GET and FOLL_PIN are mutually exclusive. */ VM_WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET)); do { struct page *page; unsigned int page_increm; /* first iteration or cross vma bound */ if (!vma || start >= vma->vm_end) { /* * MADV_POPULATE_(READ|WRITE) wants to handle VMA * lookups+error reporting differently. */ if (gup_flags & FOLL_MADV_POPULATE) { vma = vma_lookup(mm, start); if (!vma) { ret = -ENOMEM; goto out; } if (check_vma_flags(vma, gup_flags)) { ret = -EINVAL; goto out; } goto retry; } vma = gup_vma_lookup(mm, start); if (!vma && in_gate_area(mm, start)) { ret = get_gate_page(mm, start & PAGE_MASK, gup_flags, &vma, pages ? &page : NULL); if (ret) goto out; page_mask = 0; goto next_page; } if (!vma) { ret = -EFAULT; goto out; } ret = check_vma_flags(vma, gup_flags); if (ret) goto out; } retry: /* * If we have a pending SIGKILL, don't keep faulting pages and * potentially allocating memory. */ if (fatal_signal_pending(current)) { ret = -EINTR; goto out; } cond_resched(); page = follow_page_mask(vma, start, gup_flags, &page_mask); if (!page || PTR_ERR(page) == -EMLINK) { ret = faultin_page(vma, start, gup_flags, PTR_ERR(page) == -EMLINK, locked); switch (ret) { case 0: goto retry; case -EBUSY: case -EAGAIN: ret = 0; fallthrough; case -EFAULT: case -ENOMEM: case -EHWPOISON: goto out; } BUG(); } else if (PTR_ERR(page) == -EEXIST) { /* * Proper page table entry exists, but no corresponding * struct page. If the caller expects **pages to be * filled in, bail out now, because that can't be done * for this page. */ if (pages) { ret = PTR_ERR(page); goto out; } } else if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; } next_page: page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); if (page_increm > nr_pages) page_increm = nr_pages; if (pages) { struct page *subpage; unsigned int j; /* * This must be a large folio (and doesn't need to * be the whole folio; it can be part of it), do * the refcount work for all the subpages too. * * NOTE: here the page may not be the head page * e.g. when start addr is not thp-size aligned. * try_grab_folio() should have taken care of tail * pages. */ if (page_increm > 1) { struct folio *folio = page_folio(page); /* * Since we already hold refcount on the * large folio, this should never fail. */ if (try_grab_folio(folio, page_increm - 1, gup_flags)) { /* * Release the 1st page ref if the * folio is problematic, fail hard. */ gup_put_folio(folio, 1, gup_flags); ret = -EFAULT; goto out; } } for (j = 0; j < page_increm; j++) { subpage = page + j; pages[i + j] = subpage; flush_anon_page(vma, subpage, start + j * PAGE_SIZE); flush_dcache_page(subpage); } } i += page_increm; start += page_increm * PAGE_SIZE; nr_pages -= page_increm; } while (nr_pages); out: return i ? i : ret; } static bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) { bool write = !!(fault_flags & FAULT_FLAG_WRITE); bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE); vm_flags_t vm_flags = write ? 
VM_WRITE : VM_READ; if (!(vm_flags & vma->vm_flags)) return false; /* * The architecture might have a hardware protection * mechanism other than read/write that can deny access. * * gup always represents data access, not instruction * fetches, so execute=false here: */ if (!arch_vma_access_permitted(vma, write, false, foreign)) return false; return true; } /** * fixup_user_fault() - manually resolve a user page fault * @mm: mm_struct of target mm * @address: user address * @fault_flags:flags to pass down to handle_mm_fault() * @unlocked: did we unlock the mmap_lock while retrying, maybe NULL if caller * does not allow retry. If NULL, the caller must guarantee * that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY. * * This is meant to be called in the specific scenario where for locking reasons * we try to access user memory in atomic context (within a pagefault_disable() * section), this returns -EFAULT, and we want to resolve the user fault before * trying again. * * Typically this is meant to be used by the futex code. * * The main difference with get_user_pages() is that this function will * unconditionally call handle_mm_fault() which will in turn perform all the * necessary SW fixup of the dirty and young bits in the PTE, while * get_user_pages() only guarantees to update these in the struct page. * * This is important for some architectures where those bits also gate the * access permission to the page because they are maintained in software. On * such architectures, gup() will not be enough to make a subsequent access * succeed. * * This function will not return with an unlocked mmap_lock. So it has not the * same semantics wrt the @mm->mmap_lock as does filemap_fault(). */ int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked) { struct vm_area_struct *vma; vm_fault_t ret; address = untagged_addr_remote(mm, address); if (unlocked) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; retry: vma = gup_vma_lookup(mm, address); if (!vma) return -EFAULT; if (!vma_permits_fault(vma, fault_flags)) return -EFAULT; if ((fault_flags & FAULT_FLAG_KILLABLE) && fatal_signal_pending(current)) return -EINTR; ret = handle_mm_fault(vma, address, fault_flags, NULL); if (ret & VM_FAULT_COMPLETED) { /* * NOTE: it's a pity that we need to retake the lock here * to pair with the unlock() in the callers. Ideally we * could tell the callers so they do not need to unlock. */ mmap_read_lock(mm); *unlocked = true; return 0; } if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, 0); if (err) return err; BUG(); } if (ret & VM_FAULT_RETRY) { mmap_read_lock(mm); *unlocked = true; fault_flags |= FAULT_FLAG_TRIED; goto retry; } return 0; } EXPORT_SYMBOL_GPL(fixup_user_fault); /* * GUP always responds to fatal signals. When FOLL_INTERRUPTIBLE is * specified, it'll also respond to generic signals. The caller of GUP * that has FOLL_INTERRUPTIBLE should take care of the GUP interruption. */ static bool gup_signal_pending(unsigned int flags) { if (fatal_signal_pending(current)) return true; if (!(flags & FOLL_INTERRUPTIBLE)) return false; return signal_pending(current); } /* * Locking: (*locked == 1) means that the mmap_lock has already been acquired by * the caller. This function may drop the mmap_lock. If it does so, then it will * set (*locked = 0). * * (*locked == 0) means that the caller expects this function to acquire and * drop the mmap_lock. 
Therefore, the value of *locked will still be zero when * the function returns, even though it may have changed temporarily during * function execution. * * Please note that this function, unlike __get_user_pages(), will not return 0 * for nr_pages > 0, unless FOLL_NOWAIT is used. */ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, int *locked, unsigned int flags) { long ret, pages_done; bool must_unlock = false; if (!nr_pages) return 0; /* * The internal caller expects GUP to manage the lock internally and the * lock must be released when this returns. */ if (!*locked) { if (mmap_read_lock_killable(mm)) return -EAGAIN; must_unlock = true; *locked = 1; } else mmap_assert_locked(mm); if (flags & FOLL_PIN) mm_set_has_pinned_flag(mm); /* * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior * is to set FOLL_GET if the caller wants pages[] filled in (but has * carelessly failed to specify FOLL_GET), so keep doing that, but only * for FOLL_GET, not for the newer FOLL_PIN. * * FOLL_PIN always expects pages to be non-null, but no need to assert * that here, as any failures will be obvious enough. */ if (pages && !(flags & FOLL_PIN)) flags |= FOLL_GET; pages_done = 0; for (;;) { ret = __get_user_pages(mm, start, nr_pages, flags, pages, locked); if (!(flags & FOLL_UNLOCKABLE)) { /* VM_FAULT_RETRY couldn't trigger, bypass */ pages_done = ret; break; } /* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */ VM_WARN_ON_ONCE(!*locked && (ret < 0 || ret >= nr_pages)); if (ret > 0) { nr_pages -= ret; pages_done += ret; if (!nr_pages) break; } if (*locked) { /* * VM_FAULT_RETRY didn't trigger or it was a * FOLL_NOWAIT. */ if (!pages_done) pages_done = ret; break; } /* * VM_FAULT_RETRY triggered, so seek to the faulting offset. * For the prefault case (!pages) we only update counts. */ if (likely(pages)) pages += ret; start += ret << PAGE_SHIFT; /* The lock was temporarily dropped, so we must unlock later */ must_unlock = true; retry: /* * Repeat on the address that fired VM_FAULT_RETRY * with both FAULT_FLAG_ALLOW_RETRY and * FAULT_FLAG_TRIED. Note that GUP can be interrupted * by fatal signals of even common signals, depending on * the caller's request. So we need to check it before we * start trying again otherwise it can loop forever. */ if (gup_signal_pending(flags)) { if (!pages_done) pages_done = -EINTR; break; } ret = mmap_read_lock_killable(mm); if (ret) { if (!pages_done) pages_done = ret; break; } *locked = 1; ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED, pages, locked); if (!*locked) { /* Continue to retry until we succeeded */ VM_WARN_ON_ONCE(ret != 0); goto retry; } if (ret != 1) { VM_WARN_ON_ONCE(ret > 1); if (!pages_done) pages_done = ret; break; } nr_pages--; pages_done++; if (!nr_pages) break; if (likely(pages)) pages++; start += PAGE_SIZE; } if (must_unlock && *locked) { /* * We either temporarily dropped the lock, or the caller * requested that we both acquire and drop the lock. Either way, * we must now unlock, and notify the caller of that state. */ mmap_read_unlock(mm); *locked = 0; } /* * Failing to pin anything implies something has gone wrong (except when * FOLL_NOWAIT is specified). */ if (WARN_ON_ONCE(pages_done == 0 && !(flags & FOLL_NOWAIT))) return -EFAULT; return pages_done; } /** * populate_vma_page_range() - populate a range of pages in the vma. 
* @vma: target vma * @start: start address * @end: end address * @locked: whether the mmap_lock is still held * * This takes care of mlocking the pages too if VM_LOCKED is set. * * Return either number of pages pinned in the vma, or a negative error * code on error. * * vma->vm_mm->mmap_lock must be held. * * If @locked is NULL, it may be held for read or write and will * be unperturbed. * * If @locked is non-NULL, it must held for read only and may be * released. If it's released, *@locked will be set to 0. */ long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked) { struct mm_struct *mm = vma->vm_mm; unsigned long nr_pages = (end - start) / PAGE_SIZE; int local_locked = 1; int gup_flags; long ret; VM_WARN_ON_ONCE(!PAGE_ALIGNED(start)); VM_WARN_ON_ONCE(!PAGE_ALIGNED(end)); VM_WARN_ON_ONCE_VMA(start < vma->vm_start, vma); VM_WARN_ON_ONCE_VMA(end > vma->vm_end, vma); mmap_assert_locked(mm); /* * Rightly or wrongly, the VM_LOCKONFAULT case has never used * faultin_page() to break COW, so it has no work to do here. */ if (vma->vm_flags & VM_LOCKONFAULT) return nr_pages; /* ... similarly, we've never faulted in PROT_NONE pages */ if (!vma_is_accessible(vma)) return -EFAULT; gup_flags = FOLL_TOUCH; /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW * and we would not want to dirty them for nothing. * * Otherwise, do a read fault, and use FOLL_FORCE in case it's not * readable (ie write-only or executable). */ if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) gup_flags |= FOLL_WRITE; else gup_flags |= FOLL_FORCE; if (locked) gup_flags |= FOLL_UNLOCKABLE; /* * We made sure addr is within a VMA, so the following will * not result in a stack expansion that recurses back here. */ ret = __get_user_pages(mm, start, nr_pages, gup_flags, NULL, locked ? locked : &local_locked); lru_add_drain(); return ret; } /* * faultin_page_range() - populate (prefault) page tables inside the * given range readable/writable * * This takes care of mlocking the pages, too, if VM_LOCKED is set. * * @mm: the mm to populate page tables in * @start: start address * @end: end address * @write: whether to prefault readable or writable * @locked: whether the mmap_lock is still held * * Returns either number of processed pages in the MM, or a negative error * code on error (see __get_user_pages()). Note that this function reports * errors related to VMAs, such as incompatible mappings, as expected by * MADV_POPULATE_(READ|WRITE). * * The range must be page-aligned. * * mm->mmap_lock must be held. If it's released, *@locked will be set to 0. */ long faultin_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, bool write, int *locked) { unsigned long nr_pages = (end - start) / PAGE_SIZE; int gup_flags; long ret; VM_WARN_ON_ONCE(!PAGE_ALIGNED(start)); VM_WARN_ON_ONCE(!PAGE_ALIGNED(end)); mmap_assert_locked(mm); /* * FOLL_TOUCH: Mark page accessed and thereby young; will also mark * the page dirty with FOLL_WRITE -- which doesn't make a * difference with !FOLL_FORCE, because the page is writable * in the page table. * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit * a poisoned page. * !FOLL_FORCE: Require proper access permissions. 
*/ gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE | FOLL_MADV_POPULATE; if (write) gup_flags |= FOLL_WRITE; ret = __get_user_pages_locked(mm, start, nr_pages, NULL, locked, gup_flags); lru_add_drain(); return ret; } /* * __mm_populate - populate and/or mlock pages within a range of address space. * * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap * flags. VMAs must be already marked with the desired vm_flags, and * mmap_lock must not be held. */ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) { struct mm_struct *mm = current->mm; unsigned long end, nstart, nend; struct vm_area_struct *vma = NULL; int locked = 0; long ret = 0; end = start + len; for (nstart = start; nstart < end; nstart = nend) { /* * We want to fault in pages for [nstart; end) address range. * Find first corresponding VMA. */ if (!locked) { locked = 1; mmap_read_lock(mm); vma = find_vma_intersection(mm, nstart, end); } else if (nstart >= vma->vm_end) vma = find_vma_intersection(mm, vma->vm_end, end); if (!vma) break; /* * Set [nstart; nend) to intersection of desired address * range with the first VMA. Also, skip undesirable VMA types. */ nend = min(end, vma->vm_end); if (vma->vm_flags & (VM_IO | VM_PFNMAP)) continue; if (nstart < vma->vm_start) nstart = vma->vm_start; /* * Now fault in a range of pages. populate_vma_page_range() * double checks the vma flags, so that it won't mlock pages * if the vma was already munlocked. */ ret = populate_vma_page_range(vma, nstart, nend, &locked); if (ret < 0) { if (ignore_errors) { ret = 0; continue; /* continue at next VMA */ } break; } nend = nstart + ret * PAGE_SIZE; ret = 0; } if (locked) mmap_read_unlock(mm); return ret; /* 0 or negative error code */ } #else /* CONFIG_MMU */ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, int *locked, unsigned int foll_flags) { struct vm_area_struct *vma; bool must_unlock = false; vm_flags_t vm_flags; long i; if (!nr_pages) return 0; /* * The internal caller expects GUP to manage the lock internally and the * lock must be released when this returns. */ if (!*locked) { if (mmap_read_lock_killable(mm)) return -EAGAIN; must_unlock = true; *locked = 1; } /* calculate required read or write permissions. * If FOLL_FORCE is set, we only require the "MAY" flags. */ vm_flags = (foll_flags & FOLL_WRITE) ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); vm_flags &= (foll_flags & FOLL_FORCE) ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); for (i = 0; i < nr_pages; i++) { vma = find_vma(mm, start); if (!vma) break; /* protect what we can, including chardevs */ if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) || !(vm_flags & vma->vm_flags)) break; if (pages) { pages[i] = virt_to_page((void *)start); if (pages[i]) get_page(pages[i]); } start = (start + PAGE_SIZE) & PAGE_MASK; } if (must_unlock && *locked) { mmap_read_unlock(mm); *locked = 0; } return i ? : -EFAULT; } #endif /* !CONFIG_MMU */ /** * fault_in_writeable - fault in userspace address range for writing * @uaddr: start of address range * @size: size of address range * * Returns the number of bytes not faulted in (like copy_to_user() and * copy_from_user()). */ size_t fault_in_writeable(char __user *uaddr, size_t size) { const unsigned long start = (unsigned long)uaddr; const unsigned long end = start + size; unsigned long cur; if (unlikely(size == 0)) return 0; if (!user_write_access_begin(uaddr, size)) return size; /* Stop once we overflow to 0. 
*/ for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE)) unsafe_put_user(0, (char __user *)cur, out); out: user_write_access_end(); if (size > cur - start) return size - (cur - start); return 0; } EXPORT_SYMBOL(fault_in_writeable); /** * fault_in_subpage_writeable - fault in an address range for writing * @uaddr: start of address range * @size: size of address range * * Fault in a user address range for writing while checking for permissions at * sub-page granularity (e.g. arm64 MTE). This function should be used when * the caller cannot guarantee forward progress of a copy_to_user() loop. * * Returns the number of bytes not faulted in (like copy_to_user() and * copy_from_user()). */ size_t fault_in_subpage_writeable(char __user *uaddr, size_t size) { size_t faulted_in; /* * Attempt faulting in at page granularity first for page table * permission checking. The arch-specific probe_subpage_writeable() * functions may not check for this. */ faulted_in = size - fault_in_writeable(uaddr, size); if (faulted_in) faulted_in -= probe_subpage_writeable(uaddr, faulted_in); return size - faulted_in; } EXPORT_SYMBOL(fault_in_subpage_writeable); /* * fault_in_safe_writeable - fault in an address range for writing * @uaddr: start of address range * @size: length of address range * * Faults in an address range for writing. This is primarily useful when we * already know that some or all of the pages in the address range aren't in * memory. * * Unlike fault_in_writeable(), this function is non-destructive. * * Note that we don't pin or otherwise hold the pages referenced that we fault * in. There's no guarantee that they'll stay in memory for any duration of * time. * * Returns the number of bytes not faulted in, like copy_to_user() and * copy_from_user(). */ size_t fault_in_safe_writeable(const char __user *uaddr, size_t size) { const unsigned long start = (unsigned long)uaddr; const unsigned long end = start + size; unsigned long cur; struct mm_struct *mm = current->mm; bool unlocked = false; if (unlikely(size == 0)) return 0; mmap_read_lock(mm); /* Stop once we overflow to 0. */ for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE)) if (fixup_user_fault(mm, cur, FAULT_FLAG_WRITE, &unlocked)) break; mmap_read_unlock(mm); if (size > cur - start) return size - (cur - start); return 0; } EXPORT_SYMBOL(fault_in_safe_writeable); /** * fault_in_readable - fault in userspace address range for reading * @uaddr: start of user address range * @size: size of user address range * * Returns the number of bytes not faulted in (like copy_to_user() and * copy_from_user()). */ size_t fault_in_readable(const char __user *uaddr, size_t size) { const unsigned long start = (unsigned long)uaddr; const unsigned long end = start + size; unsigned long cur; volatile char c; if (unlikely(size == 0)) return 0; if (!user_read_access_begin(uaddr, size)) return size; /* Stop once we overflow to 0. */ for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE)) unsafe_get_user(c, (const char __user *)cur, out); out: user_read_access_end(); (void)c; if (size > cur - start) return size - (cur - start); return 0; } EXPORT_SYMBOL(fault_in_readable); /** * get_dump_page() - pin user page in memory while writing it to core dump * @addr: user address * @locked: a pointer to an int denoting whether the mmap sem is held * * Returns struct page pointer of user page pinned for dump, * to be freed afterwards by put_page(). 
* * Returns NULL on any kind of failure - a hole must then be inserted into * the corefile, to preserve alignment with its headers; and also returns * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - * allowing a hole to be left in the corefile to save disk space. * * Called without mmap_lock (takes and releases the mmap_lock by itself). */ #ifdef CONFIG_ELF_CORE struct page *get_dump_page(unsigned long addr, int *locked) { struct page *page; int ret; ret = __get_user_pages_locked(current->mm, addr, 1, &page, locked, FOLL_FORCE | FOLL_DUMP | FOLL_GET); return (ret == 1) ? page : NULL; } #endif /* CONFIG_ELF_CORE */ #ifdef CONFIG_MIGRATION /* * An array of either pages or folios ("pofs"). Although it may seem tempting to * avoid this complication, by simply interpreting a list of folios as a list of * pages, that approach won't work in the longer term, because eventually the * layouts of struct page and struct folio will become completely different. * Furthermore, this pof approach avoids excessive page_folio() calls. */ struct pages_or_folios { union { struct page **pages; struct folio **folios; void **entries; }; bool has_folios; long nr_entries; }; static struct folio *pofs_get_folio(struct pages_or_folios *pofs, long i) { if (pofs->has_folios) return pofs->folios[i]; return page_folio(pofs->pages[i]); } static void pofs_clear_entry(struct pages_or_folios *pofs, long i) { pofs->entries[i] = NULL; } static void pofs_unpin(struct pages_or_folios *pofs) { if (pofs->has_folios) unpin_folios(pofs->folios, pofs->nr_entries); else unpin_user_pages(pofs->pages, pofs->nr_entries); } static struct folio *pofs_next_folio(struct folio *folio, struct pages_or_folios *pofs, long *index_ptr) { long i = *index_ptr + 1; if (!pofs->has_folios && folio_test_large(folio)) { const unsigned long start_pfn = folio_pfn(folio); const unsigned long end_pfn = start_pfn + folio_nr_pages(folio); for (; i < pofs->nr_entries; i++) { unsigned long pfn = page_to_pfn(pofs->pages[i]); /* Is this page part of this folio? */ if (pfn < start_pfn || pfn >= end_pfn) break; } } if (unlikely(i == pofs->nr_entries)) return NULL; *index_ptr = i; return pofs_get_folio(pofs, i); } /* * Returns the number of collected folios. Return value is always >= 0. */ static unsigned long collect_longterm_unpinnable_folios( struct list_head *movable_folio_list, struct pages_or_folios *pofs) { unsigned long collected = 0; struct folio *folio; int drained = 0; long i = 0; for (folio = pofs_get_folio(pofs, i); folio; folio = pofs_next_folio(folio, pofs, &i)) { if (folio_is_longterm_pinnable(folio)) continue; collected++; if (folio_is_device_coherent(folio)) continue; if (folio_test_hugetlb(folio)) { folio_isolate_hugetlb(folio, movable_folio_list); continue; } if (drained == 0 && folio_may_be_lru_cached(folio) && folio_ref_count(folio) != folio_expected_ref_count(folio) + 1) { lru_add_drain(); drained = 1; } if (drained == 1 && folio_may_be_lru_cached(folio) && folio_ref_count(folio) != folio_expected_ref_count(folio) + 1) { lru_add_drain_all(); drained = 2; } if (!folio_isolate_lru(folio)) continue; list_add_tail(&folio->lru, movable_folio_list); node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); } return collected; } /* * Unpins all folios and migrates device coherent folios and movable_folio_list. * Returns -EAGAIN if all folios were successfully migrated or -errno for * failure (or partial success). 
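 *
 * The -EAGAIN-on-success convention lets callers (__gup_longterm_locked()
 * and memfd_pin_folios()) simply loop and re-pin the whole range once the
 * unpinnable folios have been migrated out of the way.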
*/ static int migrate_longterm_unpinnable_folios(struct list_head *movable_folio_list, struct pages_or_folios *pofs) { int ret; unsigned long i; for (i = 0; i < pofs->nr_entries; i++) { struct folio *folio = pofs_get_folio(pofs, i); if (folio_is_device_coherent(folio)) { /* * Migration will fail if the folio is pinned, so * convert the pin on the source folio to a normal * reference. */ pofs_clear_entry(pofs, i); folio_get(folio); gup_put_folio(folio, 1, FOLL_PIN); if (migrate_device_coherent_folio(folio)) { ret = -EBUSY; goto err; } continue; } /* * We can't migrate folios with unexpected references, so drop * the reference obtained by __get_user_pages_locked(). * Migrating folios have been added to movable_folio_list after * calling folio_isolate_lru() which takes a reference so the * folio won't be freed if it's migrating. */ unpin_folio(folio); pofs_clear_entry(pofs, i); } if (!list_empty(movable_folio_list)) { struct migration_target_control mtc = { .nid = NUMA_NO_NODE, .gfp_mask = GFP_USER | __GFP_NOWARN, .reason = MR_LONGTERM_PIN, }; if (migrate_pages(movable_folio_list, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_LONGTERM_PIN, NULL)) { ret = -ENOMEM; goto err; } } putback_movable_pages(movable_folio_list); return -EAGAIN; err: pofs_unpin(pofs); putback_movable_pages(movable_folio_list); return ret; } static long check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs) { LIST_HEAD(movable_folio_list); unsigned long collected; collected = collect_longterm_unpinnable_folios(&movable_folio_list, pofs); if (!collected) return 0; return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs); } /* * Check whether all folios are *allowed* to be pinned indefinitely (long term). * Rather confusingly, all folios in the range are required to be pinned via * FOLL_PIN, before calling this routine. * * Return values: * * 0: if everything is OK and all folios in the range are allowed to be pinned, * then this routine leaves all folios pinned and returns zero for success. * * -EAGAIN: if any folios in the range are not allowed to be pinned, then this * routine will migrate those folios away, unpin all the folios in the range. If * migration of the entire set of folios succeeds, then -EAGAIN is returned. The * caller should re-pin the entire range with FOLL_PIN and then call this * routine again. * * -ENOMEM, or any other -errno: if an error *other* than -EAGAIN occurs, this * indicates a migration failure. The caller should give up, and propagate the * error back up the call stack. The caller does not need to unpin any folios in * that case, because this routine will do the unpinning. */ static long check_and_migrate_movable_folios(unsigned long nr_folios, struct folio **folios) { struct pages_or_folios pofs = { .folios = folios, .has_folios = true, .nr_entries = nr_folios, }; return check_and_migrate_movable_pages_or_folios(&pofs); } /* * Return values and behavior are the same as those for * check_and_migrate_movable_folios(). 
*/ static long check_and_migrate_movable_pages(unsigned long nr_pages, struct page **pages) { struct pages_or_folios pofs = { .pages = pages, .has_folios = false, .nr_entries = nr_pages, }; return check_and_migrate_movable_pages_or_folios(&pofs); } #else static long check_and_migrate_movable_pages(unsigned long nr_pages, struct page **pages) { return 0; } static long check_and_migrate_movable_folios(unsigned long nr_folios, struct folio **folios) { return 0; } #endif /* CONFIG_MIGRATION */ /* * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which * allows us to process the FOLL_LONGTERM flag. */ static long __gup_longterm_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, int *locked, unsigned int gup_flags) { unsigned int flags; long rc, nr_pinned_pages; if (!(gup_flags & FOLL_LONGTERM)) return __get_user_pages_locked(mm, start, nr_pages, pages, locked, gup_flags); flags = memalloc_pin_save(); do { nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages, pages, locked, gup_flags); if (nr_pinned_pages <= 0) { rc = nr_pinned_pages; break; } /* FOLL_LONGTERM implies FOLL_PIN */ rc = check_and_migrate_movable_pages(nr_pinned_pages, pages); } while (rc == -EAGAIN); memalloc_pin_restore(flags); return rc ? rc : nr_pinned_pages; } /* * Check that the given flags are valid for the exported gup/pup interface, and * update them with the required flags that the caller must have set. */ static bool is_valid_gup_args(struct page **pages, int *locked, unsigned int *gup_flags_p, unsigned int to_set) { unsigned int gup_flags = *gup_flags_p; /* * These flags not allowed to be specified externally to the gup * interfaces: * - FOLL_TOUCH/FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only * - FOLL_REMOTE is internal only, set in (get|pin)_user_pages_remote() * - FOLL_UNLOCKABLE is internal only and used if locked is !NULL */ if (WARN_ON_ONCE(gup_flags & INTERNAL_GUP_FLAGS)) return false; gup_flags |= to_set; if (locked) { /* At the external interface locked must be set */ if (WARN_ON_ONCE(*locked != 1)) return false; gup_flags |= FOLL_UNLOCKABLE; } /* FOLL_GET and FOLL_PIN are mutually exclusive. */ if (WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) return false; /* LONGTERM can only be specified when pinning */ if (WARN_ON_ONCE(!(gup_flags & FOLL_PIN) && (gup_flags & FOLL_LONGTERM))) return false; /* Pages input must be given if using GET/PIN */ if (WARN_ON_ONCE((gup_flags & (FOLL_GET | FOLL_PIN)) && !pages)) return false; /* We want to allow the pgmap to be hot-unplugged at all times */ if (WARN_ON_ONCE((gup_flags & FOLL_LONGTERM) && (gup_flags & FOLL_PCI_P2PDMA))) return false; *gup_flags_p = gup_flags; return true; } #ifdef CONFIG_MMU /** * get_user_pages_remote() - pin user pages in memory * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. * @locked: pointer to lock flag indicating whether lock is held and * subsequently whether VM_FAULT_RETRY functionality can be * utilised. Lock must initially be held. * * Returns either number of pages pinned (which may be less than the * number requested), or an error. Details about the return value: * * -- If nr_pages is 0, returns 0. * -- If nr_pages is >0, but no pages were pinned, returns -errno. 
* -- If nr_pages is >0, and some pages were pinned, returns the number of * pages pinned. Again, this may be less than nr_pages. * * The caller is responsible for releasing returned @pages, via put_page(). * * Must be called with mmap_lock held for read or write. * * get_user_pages_remote walks a process's page tables and takes a reference * to each struct page that each user address corresponds to at a given * instant. That is, it takes the page that would be accessed if a user * thread accesses the given user virtual address at that instant. * * This does not guarantee that the page exists in the user mappings when * get_user_pages_remote returns, and there may even be a completely different * page there in some cases (eg. if mmapped pagecache has been invalidated * and subsequently re-faulted). However it does guarantee that the page * won't be freed completely. And mostly callers simply care that the page * contains data that was valid *at some point in time*. Typically, an IO * or similar operation cannot guarantee anything stronger anyway because * locks can't be held over the syscall boundary. * * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must * be called after the page is finished with, and before put_page is called. * * get_user_pages_remote is typically used for fewer-copy IO operations, * to get a handle on the memory by some means other than accesses * via the user virtual addresses. The pages may be submitted for * DMA to devices or accessed via their kernel linear mapping (via the * kmap APIs). Care should be taken to use the correct cache flushing APIs. * * See also get_user_pages_fast, for performance critical applications. * * get_user_pages_remote should be phased out in favor of * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing * should use get_user_pages_remote because it cannot pass * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. */ long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { int local_locked = 1; if (!is_valid_gup_args(pages, locked, &gup_flags, FOLL_TOUCH | FOLL_REMOTE)) return -EINVAL; return __get_user_pages_locked(mm, start, nr_pages, pages, locked ? locked : &local_locked, gup_flags); } EXPORT_SYMBOL(get_user_pages_remote); #else /* CONFIG_MMU */ long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { return 0; } #endif /* !CONFIG_MMU */ /** * get_user_pages() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. * * This is the same as get_user_pages_remote(), just with a less-flexible * calling convention where we assume that the mm being operated on belongs to * the current task, and doesn't allow passing of a locked parameter. We also * obviously don't pass FOLL_REMOTE in here. 
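 *
 * A minimal usage sketch (illustrative only; "uaddr" is a caller-supplied
 * user address, and the mmap_lock is held as described for
 * get_user_pages_remote()):
 *
 *	struct page *page;
 *	long got;
 *
 *	mmap_read_lock(current->mm);
 *	got = get_user_pages(uaddr, 1, FOLL_WRITE, &page);
 *	mmap_read_unlock(current->mm);
 *	if (got == 1) {
 *		void *kaddr = kmap_local_page(page);
 *
 *		memset(kaddr, 0, PAGE_SIZE);
 *		kunmap_local(kaddr);
 *		set_page_dirty_lock(page);
 *		put_page(page);
 *	}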
*/ long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages) { int locked = 1; if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH)) return -EINVAL; return __get_user_pages_locked(current->mm, start, nr_pages, pages, &locked, gup_flags); } EXPORT_SYMBOL(get_user_pages); /* * get_user_pages_unlocked() is suitable to replace the form: * * mmap_read_lock(mm); * get_user_pages(mm, ..., pages, NULL); * mmap_read_unlock(mm); * * with: * * get_user_pages_unlocked(mm, ..., pages); * * It is functionally equivalent to get_user_pages_fast so * get_user_pages_fast should be used instead if specific gup_flags * (e.g. FOLL_FORCE) are not required. */ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) { int locked = 0; if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH | FOLL_UNLOCKABLE)) return -EINVAL; return __get_user_pages_locked(current->mm, start, nr_pages, pages, &locked, gup_flags); } EXPORT_SYMBOL(get_user_pages_unlocked); /* * GUP-fast * * get_user_pages_fast attempts to pin user pages by walking the page * tables directly and avoids taking locks. Thus the walker needs to be * protected from page table pages being freed from under it, and should * block any THP splits. * * One way to achieve this is to have the walker disable interrupts, and * rely on IPIs from the TLB flushing code blocking before the page table * pages are freed. This is unsuitable for architectures that do not need * to broadcast an IPI when invalidating TLBs. * * Another way to achieve this is to batch up page table containing pages * belonging to more than one mm_user, then rcu_sched a callback to free those * pages. Disabling interrupts will allow the gup_fast() walker to both block * the rcu_sched callback, and an IPI that we broadcast for splitting THPs * (which is a relatively rare event). The code below adopts this strategy. * * Before activating this code, please be aware that the following assumptions * are currently made: * * *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to * free pages containing page tables or TLB flushing requires IPI broadcast. * * *) ptes can be read atomically by the architecture. * * *) valid user addresses are below TASK_MAX_SIZE * * The last two assumptions can be relaxed by the addition of helper functions. * * This code is based heavily on the PowerPC implementation by Nick Piggin. */ #ifdef CONFIG_HAVE_GUP_FAST /* * Used in the GUP-fast path to determine whether GUP is permitted to work on * a specific folio. * * This call assumes the caller has pinned the folio, that the lowest page table * level still points to this folio, and that interrupts have been disabled. * * GUP-fast must reject all secretmem folios. * * Writing to pinned file-backed dirty tracked folios is inherently problematic * (see comment describing the writable_file_mapping_allowed() function). We * therefore try to avoid the most egregious case of a long-term mapping doing * so. * * This function cannot be as thorough as that one as the VMA is not available * in the fast path, so instead we whitelist known good cases and if in doubt, * fall back to the slow path. */ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags) { bool reject_file_backed = false; struct address_space *mapping; bool check_secretmem = false; unsigned long mapping_flags; /* * If we aren't pinning then no problematic write can occur. 
A long term * pin is the most egregious case so this is the one we disallow. */ if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) == (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) reject_file_backed = true; /* We hold a folio reference, so we can safely access folio fields. */ /* secretmem folios are always order-0 folios. */ if (IS_ENABLED(CONFIG_SECRETMEM) && !folio_test_large(folio)) check_secretmem = true; if (!reject_file_backed && !check_secretmem) return true; if (WARN_ON_ONCE(folio_test_slab(folio))) return false; /* hugetlb neither requires dirty-tracking nor can be secretmem. */ if (folio_test_hugetlb(folio)) return true; /* * GUP-fast disables IRQs. When IRQS are disabled, RCU grace periods * cannot proceed, which means no actions performed under RCU can * proceed either. * * inodes and thus their mappings are freed under RCU, which means the * mapping cannot be freed beneath us and thus we can safely dereference * it. */ lockdep_assert_irqs_disabled(); /* * However, there may be operations which _alter_ the mapping, so ensure * we read it once and only once. */ mapping = READ_ONCE(folio->mapping); /* * The mapping may have been truncated, in any case we cannot determine * if this mapping is safe - fall back to slow path to determine how to * proceed. */ if (!mapping) return false; /* Anonymous folios pose no problem. */ mapping_flags = (unsigned long)mapping & FOLIO_MAPPING_FLAGS; if (mapping_flags) return mapping_flags & FOLIO_MAPPING_ANON; /* * At this point, we know the mapping is non-null and points to an * address_space object. */ if (check_secretmem && secretmem_mapping(mapping)) return false; /* The only remaining allowed file system is shmem. */ return !reject_file_backed || shmem_mapping(mapping); } #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL /* * GUP-fast relies on pte change detection to avoid concurrent pgtable * operations. * * To pin the page, GUP-fast needs to do below in order: * (1) pin the page (by prefetching pte), then (2) check pte not changed. * * For the rest of pgtable operations where pgtable updates can be racy * with GUP-fast, we need to do (1) clear pte, then (2) check whether page * is pinned. * * Above will work for all pte-level operations, including THP split. * * For THP collapse, it's a bit more complicated because GUP-fast may be * walking a pgtable page that is being freed (pte is still valid but pmd * can be cleared already). To avoid race in such condition, we need to * also check pmd here to make sure pmd doesn't change (corresponds to * pmdp_collapse_flush() in the THP collapse code path). */ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { int ret = 0; pte_t *ptep, *ptem; ptem = ptep = pte_offset_map(&pmd, addr); if (!ptep) return 0; do { pte_t pte = ptep_get_lockless(ptep); struct page *page; struct folio *folio; /* * Always fallback to ordinary GUP on PROT_NONE-mapped pages: * pte_access_permitted() better should reject these pages * either way: otherwise, GUP-fast might succeed in * cases where ordinary GUP would fail due to VMA access * permissions. */ if (pte_protnone(pte)) goto pte_unmap; if (!pte_access_permitted(pte, flags & FOLL_WRITE)) goto pte_unmap; if (pte_special(pte)) goto pte_unmap; /* If it's not marked as special it must have a valid memmap. 
*/ VM_WARN_ON_ONCE(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); folio = try_grab_folio_fast(page, 1, flags); if (!folio) goto pte_unmap; if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) || unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { gup_put_folio(folio, 1, flags); goto pte_unmap; } if (!gup_fast_folio_allowed(folio, flags)) { gup_put_folio(folio, 1, flags); goto pte_unmap; } if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) { gup_put_folio(folio, 1, flags); goto pte_unmap; } /* * We need to make the page accessible if and only if we are * going to access its content (the FOLL_PIN case). Please * see Documentation/core-api/pin_user_pages.rst for * details. */ if ((flags & FOLL_PIN) && arch_make_folio_accessible(folio)) { gup_put_folio(folio, 1, flags); goto pte_unmap; } folio_set_referenced(folio); pages[*nr] = page; (*nr)++; } while (ptep++, addr += PAGE_SIZE, addr != end); ret = 1; pte_unmap: pte_unmap(ptem); return ret; } #else /* * If we can't determine whether or not a pte is special, then fail immediately * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not * to be special. * * For a futex to be placed on a THP tail page, get_futex_key requires a * get_user_pages_fast_only implementation that can pin pages. Thus it's still * useful to have gup_fast_pmd_leaf even if we can't operate on ptes. */ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { return 0; } #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct page *page; struct folio *folio; int refs; if (!pmd_access_permitted(orig, flags & FOLL_WRITE)) return 0; if (pmd_special(orig)) return 0; refs = (end - addr) >> PAGE_SHIFT; page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); folio = try_grab_folio_fast(page, refs, flags); if (!folio) return 0; if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { gup_put_folio(folio, refs, flags); return 0; } if (!gup_fast_folio_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } pages += *nr; *nr += refs; for (; refs; refs--) *(pages++) = page++; folio_set_referenced(folio); return 1; } static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct page *page; struct folio *folio; int refs; if (!pud_access_permitted(orig, flags & FOLL_WRITE)) return 0; if (pud_special(orig)) return 0; refs = (end - addr) >> PAGE_SHIFT; page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); folio = try_grab_folio_fast(page, refs, flags); if (!folio) return 0; if (unlikely(pud_val(orig) != pud_val(*pudp))) { gup_put_folio(folio, refs, flags); return 0; } if (!gup_fast_folio_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } pages += *nr; *nr += refs; for (; refs; refs--) *(pages++) = page++; folio_set_referenced(folio); return 1; } static int gup_fast_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pmd_t *pmdp; pmdp = pmd_offset_lockless(pudp, pud, addr); do { pmd_t pmd = 
pmdp_get_lockless(pmdp); next = pmd_addr_end(addr, end); if (!pmd_present(pmd)) return 0; if (unlikely(pmd_leaf(pmd))) { /* See gup_fast_pte_range() */ if (pmd_protnone(pmd)) return 0; if (!gup_fast_pmd_leaf(pmd, pmdp, addr, next, flags, pages, nr)) return 0; } else if (!gup_fast_pte_range(pmd, pmdp, addr, next, flags, pages, nr)) return 0; } while (pmdp++, addr = next, addr != end); return 1; } static int gup_fast_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pud_t *pudp; pudp = pud_offset_lockless(p4dp, p4d, addr); do { pud_t pud = pudp_get(pudp); next = pud_addr_end(addr, end); if (unlikely(!pud_present(pud))) return 0; if (unlikely(pud_leaf(pud))) { if (!gup_fast_pud_leaf(pud, pudp, addr, next, flags, pages, nr)) return 0; } else if (!gup_fast_pmd_range(pudp, pud, addr, next, flags, pages, nr)) return 0; } while (pudp++, addr = next, addr != end); return 1; } static int gup_fast_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; p4d_t *p4dp; p4dp = p4d_offset_lockless(pgdp, pgd, addr); do { p4d_t p4d = p4dp_get(p4dp); next = p4d_addr_end(addr, end); if (!p4d_present(p4d)) return 0; BUILD_BUG_ON(p4d_leaf(p4d)); if (!gup_fast_pud_range(p4dp, p4d, addr, next, flags, pages, nr)) return 0; } while (p4dp++, addr = next, addr != end); return 1; } static void gup_fast_pgd_range(unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pgd_t *pgdp; pgdp = pgd_offset(current->mm, addr); do { pgd_t pgd = pgdp_get(pgdp); next = pgd_addr_end(addr, end); if (pgd_none(pgd)) return; BUILD_BUG_ON(pgd_leaf(pgd)); if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags, pages, nr)) return; } while (pgdp++, addr = next, addr != end); } #else static inline void gup_fast_pgd_range(unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { } #endif /* CONFIG_HAVE_GUP_FAST */ #ifndef gup_fast_permitted /* * Check if it's allowed to use get_user_pages_fast_only() for the range, or * we need to fall back to the slow version: */ static bool gup_fast_permitted(unsigned long start, unsigned long end) { return true; } #endif static unsigned long gup_fast(unsigned long start, unsigned long end, unsigned int gup_flags, struct page **pages) { unsigned long flags; int nr_pinned = 0; unsigned seq; if (!IS_ENABLED(CONFIG_HAVE_GUP_FAST) || !gup_fast_permitted(start, end)) return 0; if (gup_flags & FOLL_PIN) { if (!raw_seqcount_try_begin(&current->mm->write_protect_seq, seq)) return 0; } /* * Disable interrupts. The nested form is used, in order to allow full, * general purpose use of this routine. * * With interrupts disabled, we block page table pages from being freed * from under us. See struct mmu_table_batch comments in * include/asm-generic/tlb.h for more details. * * We do not adopt an rcu_read_lock() here as we also want to block IPIs * that come from callers of tlb_remove_table_sync_one(). */ local_irq_save(flags); gup_fast_pgd_range(start, end, gup_flags, pages, &nr_pinned); local_irq_restore(flags); /* * When pinning pages for DMA there could be a concurrent write protect * from fork() via copy_page_range(), in this case always fail GUP-fast.
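 *
 * The sequence count was sampled with raw_seqcount_try_begin() above; a
 * retry here means fork() bumped mm->write_protect_seq around
 * copy_page_range() while we were walking, so the pages just pinned may
 * have been write-protected for COW behind our back and must be released
 * before falling back to the slow path.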
*/ if (gup_flags & FOLL_PIN) { if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) { gup_fast_unpin_user_pages(pages, nr_pinned); return 0; } else { sanity_check_pinned_pages(pages, nr_pinned); } } return nr_pinned; } static int gup_fast_fallback(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages) { unsigned long len, end; unsigned long nr_pinned; int locked = 0; int ret; if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | FOLL_FORCE | FOLL_PIN | FOLL_GET | FOLL_FAST_ONLY | FOLL_NOFAULT | FOLL_PCI_P2PDMA | FOLL_HONOR_NUMA_FAULT))) return -EINVAL; if (gup_flags & FOLL_PIN) mm_set_has_pinned_flag(current->mm); if (!(gup_flags & FOLL_FAST_ONLY)) might_lock_read(&current->mm->mmap_lock); start = untagged_addr(start) & PAGE_MASK; len = nr_pages << PAGE_SHIFT; if (check_add_overflow(start, len, &end)) return -EOVERFLOW; if (end > TASK_SIZE_MAX) return -EFAULT; nr_pinned = gup_fast(start, end, gup_flags, pages); if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY) return nr_pinned; /* Slow path: try to get the remaining pages with get_user_pages */ start += nr_pinned << PAGE_SHIFT; pages += nr_pinned; ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned, pages, &locked, gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE); if (ret < 0) { /* * The caller has to unpin the pages we already pinned so * returning -errno is not an option */ if (nr_pinned) return nr_pinned; return ret; } return ret + nr_pinned; } /** * get_user_pages_fast_only() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to * the regular GUP. * * If the architecture does not support this function, simply return with no * pages pinned. * * Careful, careful! COW breaking can go either way, so a non-write * access can get ambiguous page results. If you call this function without * 'write' set, you'd better be sure that you're ok with that ambiguity. */ int get_user_pages_fast_only(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { /* * Internally (within mm/gup.c), gup fast variants must set FOLL_GET, * because gup fast is always a "pin with a +1 page refcount" request. * * FOLL_FAST_ONLY is required in order to match the API description of * this routine: no fall back to regular ("slow") GUP. */ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET | FOLL_FAST_ONLY)) return -EINVAL; return gup_fast_fallback(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast_only); /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Attempt to pin user pages in memory without taking mm->mmap_lock. * If not successful, it will fall back to taking the lock and * calling get_user_pages(). * * Returns number of pages pinned. This may be fewer than the number requested. * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns * -errno. */ int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { /* * The caller may or may not have explicitly set FOLL_GET; either way is * OK.
However, internally (within mm/gup.c), gup fast variants must set * FOLL_GET, because gup fast is always a "pin with a +1 page refcount" * request. */ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET)) return -EINVAL; return gup_fast_fallback(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast); /** * pin_user_pages_fast() - pin user pages in memory without taking locks * * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See * get_user_pages_fast() for documentation on the function arguments, because * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for further details. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page() will not remove pins from it. */ int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN)) return -EINVAL; return gup_fast_fallback(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(pin_user_pages_fast); /** * pin_user_pages_remote() - pin pages of a remote process * * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @locked: pointer to lock flag indicating whether lock is held and * subsequently whether VM_FAULT_RETRY functionality can be * utilised. Lock must initially be held. * * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See * get_user_pages_remote() for documentation on the function arguments, because * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { int local_locked = 1; if (!is_valid_gup_args(pages, locked, &gup_flags, FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE)) return 0; return __gup_longterm_locked(mm, start, nr_pages, pages, locked ? locked : &local_locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages_remote); /** * pin_user_pages() - pin user pages in memory for use by other devices * * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and * FOLL_PIN is set. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page*() will not remove pins from it. 
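 *
 * A minimal usage sketch (illustrative only; "uaddr" is a caller-supplied
 * user address and the mmap_lock is held for read, as with
 * get_user_pages()):
 *
 *	struct page *pages[16];
 *	long pinned;
 *
 *	mmap_read_lock(current->mm);
 *	pinned = pin_user_pages(uaddr, 16, FOLL_WRITE | FOLL_LONGTERM, pages);
 *	mmap_read_unlock(current->mm);
 *	if (pinned > 0) {
 *		... set up and run DMA to/from the pinned pages ...
 *		unpin_user_pages_dirty_lock(pages, pinned, true);
 *	}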
*/ long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages) { int locked = 1; if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN)) return 0; return __gup_longterm_locked(current->mm, start, nr_pages, pages, &locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages); /* * pin_user_pages_unlocked() is the FOLL_PIN variant of * get_user_pages_unlocked(). Behavior is the same, except that this one sets * FOLL_PIN and rejects FOLL_GET. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) { int locked = 0; if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE)) return 0; return __gup_longterm_locked(current->mm, start, nr_pages, pages, &locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages_unlocked); /** * memfd_pin_folios() - pin folios associated with a memfd * @memfd: the memfd whose folios are to be pinned * @start: the first memfd offset * @end: the last memfd offset (inclusive) * @folios: array that receives pointers to the folios pinned * @max_folios: maximum number of entries in @folios * @offset: the offset into the first folio * * Attempt to pin folios associated with a memfd in the contiguous range * [start, end]. Given that a memfd is either backed by shmem or hugetlb, * the folios can either be found in the page cache or need to be allocated * if necessary. Once the folios are located, they are all pinned via * FOLL_PIN and @offset is populated with the offset into the first folio. * And, eventually, these pinned folios must be released either using * unpin_folios() or unpin_folio(). * * It must be noted that the folios may be pinned for an indefinite amount * of time. And, in most cases, the duration of time they may stay pinned * would be controlled by the userspace. This behavior is effectively the * same as using FOLL_LONGTERM with other GUP APIs. * * Returns number of folios pinned, which could be less than @max_folios * as it depends on the folio sizes that cover the range [start, end]. * If no folios were pinned, it returns -errno. */ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, struct folio **folios, unsigned int max_folios, pgoff_t *offset) { unsigned int flags, nr_folios, nr_found; unsigned int i, pgshift = PAGE_SHIFT; pgoff_t start_idx, end_idx; struct folio *folio = NULL; struct folio_batch fbatch; struct hstate *h; long ret = -EINVAL; if (start < 0 || start > end || !max_folios) return -EINVAL; if (!memfd) return -EINVAL; if (!shmem_file(memfd) && !is_file_hugepages(memfd)) return -EINVAL; if (end >= i_size_read(file_inode(memfd))) return -EINVAL; if (is_file_hugepages(memfd)) { h = hstate_file(memfd); pgshift = huge_page_shift(h); } flags = memalloc_pin_save(); do { nr_folios = 0; start_idx = start >> pgshift; end_idx = end >> pgshift; if (is_file_hugepages(memfd)) { start_idx <<= huge_page_order(h); end_idx <<= huge_page_order(h); } folio_batch_init(&fbatch); while (start_idx <= end_idx && nr_folios < max_folios) { /* * In most cases, we should be able to find the folios * in the page cache. If we cannot find them for some * reason, we try to allocate them and add them to the * page cache.
*/ nr_found = filemap_get_folios_contig(memfd->f_mapping, &start_idx, end_idx, &fbatch); if (folio) { folio_put(folio); folio = NULL; } for (i = 0; i < nr_found; i++) { folio = fbatch.folios[i]; if (try_grab_folio(folio, 1, FOLL_PIN)) { folio_batch_release(&fbatch); ret = -EINVAL; goto err; } if (nr_folios == 0) *offset = offset_in_folio(folio, start); folios[nr_folios] = folio; if (++nr_folios == max_folios) break; } folio = NULL; folio_batch_release(&fbatch); if (!nr_found) { folio = memfd_alloc_folio(memfd, start_idx); if (IS_ERR(folio)) { ret = PTR_ERR(folio); if (ret != -EEXIST) goto err; folio = NULL; } } } ret = check_and_migrate_movable_folios(nr_folios, folios); } while (ret == -EAGAIN); memalloc_pin_restore(flags); return ret ? ret : nr_folios; err: memalloc_pin_restore(flags); unpin_folios(folios, nr_folios); return ret; } EXPORT_SYMBOL_GPL(memfd_pin_folios); /** * folio_add_pins() - add pins to an already-pinned folio * @folio: the folio to add more pins to * @pins: number of pins to add * * Try to add more pins to an already-pinned folio. The semantics * of the pin (e.g., FOLL_WRITE) follow any existing pin and cannot * be changed. * * This function is helpful when having obtained a pin on a large folio * using memfd_pin_folios(), but wanting to logically unpin parts * (e.g., individual pages) of the folio later, for example, using * unpin_user_page_range_dirty_lock(). * * This is not the right interface to initially pin a folio. */ int folio_add_pins(struct folio *folio, unsigned int pins) { VM_WARN_ON_ONCE(!folio_maybe_dma_pinned(folio)); return try_grab_folio(folio, pins, FOLL_PIN); } EXPORT_SYMBOL_GPL(folio_add_pins); |
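/*
 * Example (an illustrative sketch, not part of the code above): pinning the
 * folios backing the first 2MB of a memfd and releasing them again. "memfd"
 * is assumed to be a caller-supplied struct file for a shmem or hugetlbfs
 * backed memfd that is at least 2MB long.
 *
 *	struct folio *folios[32];
 *	pgoff_t offset;
 *	long nr;
 *
 *	nr = memfd_pin_folios(memfd, 0, SZ_2M - 1, folios,
 *			      ARRAY_SIZE(folios), &offset);
 *	if (nr > 0) {
 *		... program the device using the folios, starting at
 *		    @offset within the first one ...
 *		unpin_folios(folios, nr);
 *	}
 */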
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_SEQLOCK_H #define __LINUX_SEQLOCK_H /* * seqcount_t / seqlock_t - a reader-writer consistency mechanism with * lockless readers (read-only retry loops), and no writer starvation. * * See Documentation/locking/seqlock.rst * * Copyrights: * - Based on x86_64 vsyscall gettimeofday: Keith Owens, Andrea Arcangeli * - Sequence counters with associated locks, (C) 2020 Linutronix GmbH */ #include <linux/compiler.h> #include <linux/cleanup.h> #include <linux/kcsan-checks.h> #include <linux/lockdep.h> #include <linux/mutex.h> #include <linux/preempt.h> #include <linux/seqlock_types.h> #include <linux/spinlock.h> #include <asm/processor.h> /* * The seqlock seqcount_t interface does not prescribe a precise sequence of * read begin/retry/end. For readers, typically there is a call to * read_seqcount_begin() and read_seqcount_retry(), however, there are more * esoteric cases which do not follow this pattern. * * As a consequence, we take the following best-effort approach for raw usage * via seqcount_t under KCSAN: upon beginning a seq-reader critical section, * pessimistically mark the next KCSAN_SEQLOCK_REGION_MAX memory accesses as * atomics; if there is a matching read_seqcount_retry() call, no following * memory operations are considered atomic. Usage of the seqlock_t interface * is not affected.
*/ #define KCSAN_SEQLOCK_REGION_MAX 1000 static inline void __seqcount_init(seqcount_t *s, const char *name, struct lock_class_key *key) { /* * Make sure we are not reinitializing a held lock: */ lockdep_init_map(&s->dep_map, name, key, 0); s->sequence = 0; } #ifdef CONFIG_DEBUG_LOCK_ALLOC # define SEQCOUNT_DEP_MAP_INIT(lockname) \ .dep_map = { .name = #lockname } /** * seqcount_init() - runtime initializer for seqcount_t * @s: Pointer to the seqcount_t instance */ # define seqcount_init(s) \ do { \ static struct lock_class_key __key; \ __seqcount_init((s), #s, &__key); \ } while (0) static inline void seqcount_lockdep_reader_access(const seqcount_t *s) { seqcount_t *l = (seqcount_t *)s; unsigned long flags; local_irq_save(flags); seqcount_acquire_read(&l->dep_map, 0, 0, _RET_IP_); seqcount_release(&l->dep_map, _RET_IP_); local_irq_restore(flags); } #else # define SEQCOUNT_DEP_MAP_INIT(lockname) # define seqcount_init(s) __seqcount_init(s, NULL, NULL) # define seqcount_lockdep_reader_access(x) #endif /** * SEQCNT_ZERO() - static initializer for seqcount_t * @name: Name of the seqcount_t instance */ #define SEQCNT_ZERO(name) { .sequence = 0, SEQCOUNT_DEP_MAP_INIT(name) } /* * Sequence counters with associated locks (seqcount_LOCKNAME_t) * * A sequence counter which associates the lock used for writer * serialization at initialization time. This enables lockdep to validate * that the write side critical section is properly serialized. * * For associated locks which do not implicitly disable preemption, * preemption protection is enforced in the write side function. * * Lockdep is never used in any for the raw write variants. * * See Documentation/locking/seqlock.rst */ /* * typedef seqcount_LOCKNAME_t - sequence counter with LOCKNAME associated * @seqcount: The real sequence counter * @lock: Pointer to the associated lock * * A plain sequence counter with external writer synchronization by * LOCKNAME @lock. The lock is associated to the sequence counter in the * static initializer or init function. This enables lockdep to validate * that the write side critical section is properly serialized. 
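 *
 * For example (a minimal sketch, not part of this header), a seqcount whose
 * writers are serialized by a spinlock embedded in the same structure:
 *
 *	struct foo {
 *		spinlock_t lock;
 *		seqcount_spinlock_t seq;
 *	} *f;
 *
 *	spin_lock_init(&f->lock);
 *	seqcount_spinlock_init(&f->seq, &f->lock);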
* * LOCKNAME: raw_spinlock, spinlock, rwlock or mutex */ /* * seqcount_LOCKNAME_init() - runtime initializer for seqcount_LOCKNAME_t * @s: Pointer to the seqcount_LOCKNAME_t instance * @lock: Pointer to the associated lock */ #define seqcount_LOCKNAME_init(s, _lock, lockname) \ do { \ seqcount_##lockname##_t *____s = (s); \ seqcount_init(&____s->seqcount); \ __SEQ_LOCK(____s->lock = (_lock)); \ } while (0) #define seqcount_raw_spinlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, raw_spinlock) #define seqcount_spinlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, spinlock) #define seqcount_rwlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, rwlock) #define seqcount_mutex_init(s, lock) seqcount_LOCKNAME_init(s, lock, mutex) /* * SEQCOUNT_LOCKNAME() - Instantiate seqcount_LOCKNAME_t and helpers * seqprop_LOCKNAME_*() - Property accessors for seqcount_LOCKNAME_t * * @lockname: "LOCKNAME" part of seqcount_LOCKNAME_t * @locktype: LOCKNAME canonical C data type * @preemptible: preemptibility of above locktype * @lockbase: prefix for associated lock/unlock */ #define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockbase) \ static __always_inline seqcount_t * \ __seqprop_##lockname##_ptr(seqcount_##lockname##_t *s) \ { \ return &s->seqcount; \ } \ \ static __always_inline const seqcount_t * \ __seqprop_##lockname##_const_ptr(const seqcount_##lockname##_t *s) \ { \ return &s->seqcount; \ } \ \ static __always_inline unsigned \ __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \ { \ unsigned seq = smp_load_acquire(&s->seqcount.sequence); \ \ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \ return seq; \ \ if (preemptible && unlikely(seq & 1)) { \ __SEQ_LOCK(lockbase##_lock(s->lock)); \ __SEQ_LOCK(lockbase##_unlock(s->lock)); \ \ /* \ * Re-read the sequence counter since the (possibly \ * preempted) writer made progress. 
\ */ \ seq = smp_load_acquire(&s->seqcount.sequence); \ } \ \ return seq; \ } \ \ static __always_inline bool \ __seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s) \ { \ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \ return preemptible; \ \ /* PREEMPT_RT relies on the above LOCK+UNLOCK */ \ return false; \ } \ \ static __always_inline void \ __seqprop_##lockname##_assert(const seqcount_##lockname##_t *s) \ { \ __SEQ_LOCK(lockdep_assert_held(s->lock)); \ } /* * __seqprop() for seqcount_t */ static inline seqcount_t *__seqprop_ptr(seqcount_t *s) { return s; } static inline const seqcount_t *__seqprop_const_ptr(const seqcount_t *s) { return s; } static inline unsigned __seqprop_sequence(const seqcount_t *s) { return smp_load_acquire(&s->sequence); } static inline bool __seqprop_preemptible(const seqcount_t *s) { return false; } static inline void __seqprop_assert(const seqcount_t *s) { lockdep_assert_preemption_disabled(); } #define __SEQ_RT IS_ENABLED(CONFIG_PREEMPT_RT) SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t, false, raw_spin) SEQCOUNT_LOCKNAME(spinlock, spinlock_t, __SEQ_RT, spin) SEQCOUNT_LOCKNAME(rwlock, rwlock_t, __SEQ_RT, read) SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) #undef SEQCOUNT_LOCKNAME /* * SEQCNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t * @name: Name of the seqcount_LOCKNAME_t instance * @lock: Pointer to the associated LOCKNAME */ #define SEQCOUNT_LOCKNAME_ZERO(seq_name, assoc_lock) { \ .seqcount = SEQCNT_ZERO(seq_name.seqcount), \ __SEQ_LOCK(.lock = (assoc_lock)) \ } #define SEQCNT_RAW_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define SEQCNT_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define SEQCNT_RWLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define SEQCNT_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define SEQCNT_WW_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define __seqprop_case(s, lockname, prop) \ seqcount_##lockname##_t: __seqprop_##lockname##_##prop #define __seqprop(s, prop) _Generic(*(s), \ seqcount_t: __seqprop_##prop, \ __seqprop_case((s), raw_spinlock, prop), \ __seqprop_case((s), spinlock, prop), \ __seqprop_case((s), rwlock, prop), \ __seqprop_case((s), mutex, prop)) #define seqprop_ptr(s) __seqprop(s, ptr)(s) #define seqprop_const_ptr(s) __seqprop(s, const_ptr)(s) #define seqprop_sequence(s) __seqprop(s, sequence)(s) #define seqprop_preemptible(s) __seqprop(s, preemptible)(s) #define seqprop_assert(s) __seqprop(s, assert)(s) /** * __read_seqcount_begin() - begin a seqcount_t read section * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Return: count to be passed to read_seqcount_retry() */ #define __read_seqcount_begin(s) \ ({ \ unsigned __seq; \ \ while (unlikely((__seq = seqprop_sequence(s)) & 1)) \ cpu_relax(); \ \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ __seq; \ }) /** * raw_read_seqcount_begin() - begin a seqcount_t read section w/o lockdep * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Return: count to be passed to read_seqcount_retry() */ #define raw_read_seqcount_begin(s) __read_seqcount_begin(s) /** * read_seqcount_begin() - begin a seqcount_t read critical section * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Return: count to be passed to read_seqcount_retry() */ #define read_seqcount_begin(s) \ ({ \ seqcount_lockdep_reader_access(seqprop_const_ptr(s)); \ raw_read_seqcount_begin(s); \ }) /** * raw_read_seqcount() - read the raw seqcount_t 
counter value * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * raw_read_seqcount opens a read critical section of the given * seqcount_t, without any lockdep checking, and without checking or * masking the sequence counter LSB. Calling code is responsible for * handling that. * * Return: count to be passed to read_seqcount_retry() */ #define raw_read_seqcount(s) \ ({ \ unsigned __seq = seqprop_sequence(s); \ \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ __seq; \ }) /** * raw_seqcount_try_begin() - begin a seqcount_t read critical section * w/o lockdep and w/o counter stabilization * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * @start: count to be passed to read_seqcount_retry() * * Similar to raw_seqcount_begin(), except it enables eliding the critical * section entirely if odd, instead of doing the speculation knowing it will * fail. * * Useful when counter stabilization is more or less equivalent to taking * the lock and there is a slowpath that does that. * * If true, start will be set to the (even) sequence count read. * * Return: true when a read critical section is started. */ #define raw_seqcount_try_begin(s, start) \ ({ \ start = raw_read_seqcount(s); \ !(start & 1); \ }) /** * raw_seqcount_begin() - begin a seqcount_t read critical section w/o * lockdep and w/o counter stabilization * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * raw_seqcount_begin opens a read critical section of the given * seqcount_t. Unlike read_seqcount_begin(), this function will not wait * for the count to stabilize. If a writer is active when it begins, it * will fail the read_seqcount_retry() at the end of the read critical * section instead of stabilizing at the beginning of it. * * Use this only in special kernel hot paths where the read section is * small and has a high probability of success through other external * means. It will save a single branching instruction. * * Return: count to be passed to read_seqcount_retry() */ #define raw_seqcount_begin(s) \ ({ \ /* \ * If the counter is odd, let read_seqcount_retry() fail \ * by decrementing the counter. \ */ \ raw_read_seqcount(s) & ~1; \ }) /** * __read_seqcount_retry() - end a seqcount_t read section w/o barrier * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * @start: count, from read_seqcount_begin() * * __read_seqcount_retry is like read_seqcount_retry, but has no smp_rmb() * barrier. Callers should ensure that smp_rmb() or equivalent ordering is * provided before actually loading any of the variables that are to be * protected in this critical section. * * Use carefully, only in critical code, and comment how the barrier is * provided. * * Return: true if a read section retry is required, else false */ #define __read_seqcount_retry(s, start) \ do___read_seqcount_retry(seqprop_const_ptr(s), start) static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start) { kcsan_atomic_next(0); return unlikely(READ_ONCE(s->sequence) != start); } /** * read_seqcount_retry() - end a seqcount_t read critical section * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * @start: count, from read_seqcount_begin() * * read_seqcount_retry closes the read critical section of given * seqcount_t. If the critical section was invalid, it must be ignored * (and typically retried). 
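 *
 * A typical lockless read side (an illustrative sketch; "data_seq" and
 * "shared_val" stand in for the caller's seqcount and protected data)::
 *
 *	unsigned seq;
 *	int val;
 *
 *	do {
 *		seq = read_seqcount_begin(&data_seq);
 *		val = shared_val;
 *	} while (read_seqcount_retry(&data_seq, seq));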
* * Return: true if a read section retry is required, else false */ #define read_seqcount_retry(s, start) \ do_read_seqcount_retry(seqprop_const_ptr(s), start) static inline int do_read_seqcount_retry(const seqcount_t *s, unsigned start) { smp_rmb(); return do___read_seqcount_retry(s, start); } /** * raw_write_seqcount_begin() - start a seqcount_t write section w/o lockdep * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Context: check write_seqcount_begin() */ #define raw_write_seqcount_begin(s) \ do { \ if (seqprop_preemptible(s)) \ preempt_disable(); \ \ do_raw_write_seqcount_begin(seqprop_ptr(s)); \ } while (0) static inline void do_raw_write_seqcount_begin(seqcount_t *s) { kcsan_nestable_atomic_begin(); s->sequence++; smp_wmb(); } /** * raw_write_seqcount_end() - end a seqcount_t write section w/o lockdep * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Context: check write_seqcount_end() */ #define raw_write_seqcount_end(s) \ do { \ do_raw_write_seqcount_end(seqprop_ptr(s)); \ \ if (seqprop_preemptible(s)) \ preempt_enable(); \ } while (0) static inline void do_raw_write_seqcount_end(seqcount_t *s) { smp_wmb(); s->sequence++; kcsan_nestable_atomic_end(); } /** * write_seqcount_begin_nested() - start a seqcount_t write section with * custom lockdep nesting level * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * @subclass: lockdep nesting level * * See Documentation/locking/lockdep-design.rst * Context: check write_seqcount_begin() */ #define write_seqcount_begin_nested(s, subclass) \ do { \ seqprop_assert(s); \ \ if (seqprop_preemptible(s)) \ preempt_disable(); \ \ do_write_seqcount_begin_nested(seqprop_ptr(s), subclass); \ } while (0) static inline void do_write_seqcount_begin_nested(seqcount_t *s, int subclass) { seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_); do_raw_write_seqcount_begin(s); } /** * write_seqcount_begin() - start a seqcount_t write side critical section * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Context: sequence counter write side sections must be serialized and * non-preemptible. Preemption will be automatically disabled if and * only if the seqcount write serialization lock is associated, and * preemptible. If readers can be invoked from hardirq or softirq * context, interrupts or bottom halves must be respectively disabled. */ #define write_seqcount_begin(s) \ do { \ seqprop_assert(s); \ \ if (seqprop_preemptible(s)) \ preempt_disable(); \ \ do_write_seqcount_begin(seqprop_ptr(s)); \ } while (0) static inline void do_write_seqcount_begin(seqcount_t *s) { do_write_seqcount_begin_nested(s, 0); } /** * write_seqcount_end() - end a seqcount_t write side critical section * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Context: Preemption will be automatically re-enabled if and only if * the seqcount write serialization lock is associated, and preemptible. */ #define write_seqcount_end(s) \ do { \ do_write_seqcount_end(seqprop_ptr(s)); \ \ if (seqprop_preemptible(s)) \ preempt_enable(); \ } while (0) static inline void do_write_seqcount_end(seqcount_t *s) { seqcount_release(&s->dep_map, _RET_IP_); do_raw_write_seqcount_end(s); } /** * raw_write_seqcount_barrier() - do a seqcount_t write barrier * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * This can be used to provide an ordering guarantee instead of the usual * consistency guarantee. 
It is one wmb cheaper, because it can collapse * the two back-to-back wmb()s. * * Note that writes surrounding the barrier should be declared atomic (e.g. * via WRITE_ONCE): a) to ensure the writes become visible to other threads * atomically, avoiding compiler optimizations; b) to document which writes are * meant to propagate to the reader critical section. This is necessary because * neither writes before nor after the barrier are enclosed in a seq-writer * critical section that would ensure readers are aware of ongoing writes:: * * seqcount_t seq; * bool X = true, Y = false; * * void read(void) * { * bool x, y; * * do { * int s = read_seqcount_begin(&seq); * * x = X; y = Y; * * } while (read_seqcount_retry(&seq, s)); * * BUG_ON(!x && !y); * } * * void write(void) * { * WRITE_ONCE(Y, true); * * raw_write_seqcount_barrier(seq); * * WRITE_ONCE(X, false); * } */ #define raw_write_seqcount_barrier(s) \ do_raw_write_seqcount_barrier(seqprop_ptr(s)) static inline void do_raw_write_seqcount_barrier(seqcount_t *s) { kcsan_nestable_atomic_begin(); s->sequence++; smp_wmb(); s->sequence++; kcsan_nestable_atomic_end(); } /** * write_seqcount_invalidate() - invalidate in-progress seqcount_t read * side operations * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * After write_seqcount_invalidate, no seqcount_t read side operations * will complete successfully and see data older than this. */ #define write_seqcount_invalidate(s) \ do_write_seqcount_invalidate(seqprop_ptr(s)) static inline void do_write_seqcount_invalidate(seqcount_t *s) { smp_wmb(); kcsan_nestable_atomic_begin(); s->sequence+=2; kcsan_nestable_atomic_end(); } /* * Latch sequence counters (seqcount_latch_t) * * A sequence counter variant where the counter even/odd value is used to * switch between two copies of protected data. This allows the read path, * typically NMIs, to safely interrupt the write side critical section. * * As the write sections are fully preemptible, no special handling for * PREEMPT_RT is needed. */ typedef struct { seqcount_t seqcount; } seqcount_latch_t; /** * SEQCNT_LATCH_ZERO() - static initializer for seqcount_latch_t * @seq_name: Name of the seqcount_latch_t instance */ #define SEQCNT_LATCH_ZERO(seq_name) { \ .seqcount = SEQCNT_ZERO(seq_name.seqcount), \ } /** * seqcount_latch_init() - runtime initializer for seqcount_latch_t * @s: Pointer to the seqcount_latch_t instance */ #define seqcount_latch_init(s) seqcount_init(&(s)->seqcount) /** * raw_read_seqcount_latch() - pick even/odd latch data copy * @s: Pointer to seqcount_latch_t * * See raw_write_seqcount_latch() for details and a full reader/writer * usage example. * * Return: sequence counter raw value. Use the lowest bit as an index for * picking which data copy to read. The full counter must then be checked * with raw_read_seqcount_latch_retry(). */ static __always_inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s) { /* * Pairs with the first smp_wmb() in raw_write_seqcount_latch(). * Due to the dependent load, a full smp_rmb() is not needed. */ return READ_ONCE(s->seqcount.sequence); } /** * read_seqcount_latch() - pick even/odd latch data copy * @s: Pointer to seqcount_latch_t * * See write_seqcount_latch() for details and a full reader/writer usage * example. * * Return: sequence counter raw value. Use the lowest bit as an index for * picking which data copy to read. The full counter must then be checked * with read_seqcount_latch_retry(). 
*/ static __always_inline unsigned read_seqcount_latch(const seqcount_latch_t *s) { kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); return raw_read_seqcount_latch(s); } /** * raw_read_seqcount_latch_retry() - end a seqcount_latch_t read section * @s: Pointer to seqcount_latch_t * @start: count, from raw_read_seqcount_latch() * * Return: true if a read section retry is required, else false */ static __always_inline int raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start) { smp_rmb(); return unlikely(READ_ONCE(s->seqcount.sequence) != start); } /** * read_seqcount_latch_retry() - end a seqcount_latch_t read section * @s: Pointer to seqcount_latch_t * @start: count, from read_seqcount_latch() * * Return: true if a read section retry is required, else false */ static __always_inline int read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start) { kcsan_atomic_next(0); return raw_read_seqcount_latch_retry(s, start); } /** * raw_write_seqcount_latch() - redirect latch readers to even/odd copy * @s: Pointer to seqcount_latch_t */ static __always_inline void raw_write_seqcount_latch(seqcount_latch_t *s) { smp_wmb(); /* prior stores before incrementing "sequence" */ s->seqcount.sequence++; smp_wmb(); /* increment "sequence" before following stores */ } /** * write_seqcount_latch_begin() - redirect latch readers to odd copy * @s: Pointer to seqcount_latch_t * * The latch technique is a multiversion concurrency control method that allows * queries during non-atomic modifications. If you can guarantee queries never * interrupt the modification -- e.g. the concurrency is strictly between CPUs * -- you most likely do not need this. * * Where the traditional RCU/lockless data structures rely on atomic * modifications to ensure queries observe either the old or the new state the * latch allows the same for non-atomic updates. The trade-off is doubling the * cost of storage; we have to maintain two copies of the entire data * structure. * * Very simply put: we first modify one copy and then the other. This ensures * there is always one copy in a stable state, ready to give us an answer. * * The basic form is a data structure like:: * * struct latch_struct { * seqcount_latch_t seq; * struct data_struct data[2]; * }; * * Where a modification, which is assumed to be externally serialized, does the * following:: * * void latch_modify(struct latch_struct *latch, ...) * { * write_seqcount_latch_begin(&latch->seq); * modify(latch->data[0], ...); * write_seqcount_latch(&latch->seq); * modify(latch->data[1], ...); * write_seqcount_latch_end(&latch->seq); * } * * The query will have a form like:: * * struct entry *latch_query(struct latch_struct *latch, ...) * { * struct entry *entry; * unsigned seq, idx; * * do { * seq = read_seqcount_latch(&latch->seq); * * idx = seq & 0x01; * entry = data_query(latch->data[idx], ...); * * // This includes needed smp_rmb() * } while (read_seqcount_latch_retry(&latch->seq, seq)); * * return entry; * } * * So during the modification, queries are first redirected to data[1]. Then we * modify data[0]. When that is complete, we redirect queries back to data[0] * and we can modify data[1]. * * NOTE: * * The non-requirement for atomic modifications does _NOT_ include * the publishing of new entries in the case where data is a dynamic * data structure. * * An iteration might start in data[0] and get suspended long enough * to miss an entire modification sequence, once it resumes it might * observe the new entry. 
* * NOTE2: * * When data is a dynamic data structure; one should use regular RCU * patterns to manage the lifetimes of the objects within. */ static __always_inline void write_seqcount_latch_begin(seqcount_latch_t *s) { kcsan_nestable_atomic_begin(); raw_write_seqcount_latch(s); } /** * write_seqcount_latch() - redirect latch readers to even copy * @s: Pointer to seqcount_latch_t */ static __always_inline void write_seqcount_latch(seqcount_latch_t *s) { raw_write_seqcount_latch(s); } /** * write_seqcount_latch_end() - end a seqcount_latch_t write section * @s: Pointer to seqcount_latch_t * * Marks the end of a seqcount_latch_t writer section, after all copies of the * latch-protected data have been updated. */ static __always_inline void write_seqcount_latch_end(seqcount_latch_t *s) { kcsan_nestable_atomic_end(); } #define __SEQLOCK_UNLOCKED(lockname) \ { \ .seqcount = SEQCNT_SPINLOCK_ZERO(lockname, &(lockname).lock), \ .lock = __SPIN_LOCK_UNLOCKED(lockname) \ } /** * seqlock_init() - dynamic initializer for seqlock_t * @sl: Pointer to the seqlock_t instance */ #define seqlock_init(sl) \ do { \ spin_lock_init(&(sl)->lock); \ seqcount_spinlock_init(&(sl)->seqcount, &(sl)->lock); \ } while (0) /** * DEFINE_SEQLOCK(sl) - Define a statically allocated seqlock_t * @sl: Name of the seqlock_t instance */ #define DEFINE_SEQLOCK(sl) \ seqlock_t sl = __SEQLOCK_UNLOCKED(sl) /** * read_seqbegin() - start a seqlock_t read side critical section * @sl: Pointer to seqlock_t * * Return: count, to be passed to read_seqretry() */ static inline unsigned read_seqbegin(const seqlock_t *sl) __acquires_shared(sl) __no_context_analysis { return read_seqcount_begin(&sl->seqcount); } /** * read_seqretry() - end a seqlock_t read side section * @sl: Pointer to seqlock_t * @start: count, from read_seqbegin() * * read_seqretry closes the read side critical section of given seqlock_t. * If the critical section was invalid, it must be ignored (and typically * retried). * * Return: true if a read section retry is required, else false */ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) __releases_shared(sl) __no_context_analysis { return read_seqcount_retry(&sl->seqcount, start); } /* * For all seqlock_t write side functions, use the internal * do_write_seqcount_begin() instead of generic write_seqcount_begin(). * This way, no redundant lockdep_assert_held() checks are added. */ /** * write_seqlock() - start a seqlock_t write side critical section * @sl: Pointer to seqlock_t * * write_seqlock opens a write side critical section for the given * seqlock_t. It also implicitly acquires the spinlock_t embedded inside * that sequential lock. All seqlock_t write side sections are thus * automatically serialized and non-preemptible. * * Context: if the seqlock_t read section, or other write side critical * sections, can be invoked from hardirq or softirq contexts, use the * _irqsave or _bh variants of this function instead. */ static inline void write_seqlock(seqlock_t *sl) __acquires(sl) __no_context_analysis { spin_lock(&sl->lock); do_write_seqcount_begin(&sl->seqcount.seqcount); } /** * write_sequnlock() - end a seqlock_t write side critical section * @sl: Pointer to seqlock_t * * write_sequnlock closes the (serialized and non-preemptible) write side * critical section of given seqlock_t. 
*/ static inline void write_sequnlock(seqlock_t *sl) __releases(sl) __no_context_analysis { do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock(&sl->lock); } /** * write_seqlock_bh() - start a softirqs-disabled seqlock_t write section * @sl: Pointer to seqlock_t * * _bh variant of write_seqlock(). Use only if the read side section, or * other write side sections, can be invoked from softirq contexts. */ static inline void write_seqlock_bh(seqlock_t *sl) __acquires(sl) __no_context_analysis { spin_lock_bh(&sl->lock); do_write_seqcount_begin(&sl->seqcount.seqcount); } /** * write_sequnlock_bh() - end a softirqs-disabled seqlock_t write section * @sl: Pointer to seqlock_t * * write_sequnlock_bh closes the serialized, non-preemptible, and * softirqs-disabled, seqlock_t write side critical section opened with * write_seqlock_bh(). */ static inline void write_sequnlock_bh(seqlock_t *sl) __releases(sl) __no_context_analysis { do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock_bh(&sl->lock); } /** * write_seqlock_irq() - start a non-interruptible seqlock_t write section * @sl: Pointer to seqlock_t * * _irq variant of write_seqlock(). Use only if the read side section, or * other write sections, can be invoked from hardirq contexts. */ static inline void write_seqlock_irq(seqlock_t *sl) __acquires(sl) __no_context_analysis { spin_lock_irq(&sl->lock); do_write_seqcount_begin(&sl->seqcount.seqcount); } /** * write_sequnlock_irq() - end a non-interruptible seqlock_t write section * @sl: Pointer to seqlock_t * * write_sequnlock_irq closes the serialized and non-interruptible * seqlock_t write side section opened with write_seqlock_irq(). */ static inline void write_sequnlock_irq(seqlock_t *sl) __releases(sl) __no_context_analysis { do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock_irq(&sl->lock); } static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) __acquires(sl) __no_context_analysis { unsigned long flags; spin_lock_irqsave(&sl->lock, flags); do_write_seqcount_begin(&sl->seqcount.seqcount); return flags; } /** * write_seqlock_irqsave() - start a non-interruptible seqlock_t write * section * @lock: Pointer to seqlock_t * @flags: Stack-allocated storage for saving caller's local interrupt * state, to be passed to write_sequnlock_irqrestore(). * * _irqsave variant of write_seqlock(). Use it only if the read side * section, or other write sections, can be invoked from hardirq context. */ #define write_seqlock_irqsave(lock, flags) \ do { flags = __write_seqlock_irqsave(lock); } while (0) /** * write_sequnlock_irqrestore() - end non-interruptible seqlock_t write * section * @sl: Pointer to seqlock_t * @flags: Caller's saved interrupt state, from write_seqlock_irqsave() * * write_sequnlock_irqrestore closes the serialized and non-interruptible * seqlock_t write section previously opened with write_seqlock_irqsave(). */ static inline void write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags) __releases(sl) __no_context_analysis { do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock_irqrestore(&sl->lock, flags); } /** * read_seqlock_excl() - begin a seqlock_t locking reader section * @sl: Pointer to seqlock_t * * read_seqlock_excl opens a seqlock_t locking reader critical section. A * locking reader exclusively locks out *both* other writers *and* other * locking readers, but it does not update the embedded sequence number. * * Locking readers act like a normal spin_lock()/spin_unlock(). 
* * Context: if the seqlock_t write section, *or other read sections*, can * be invoked from hardirq or softirq contexts, use the _irqsave or _bh * variant of this function instead. * * The opened read section must be closed with read_sequnlock_excl(). */ static inline void read_seqlock_excl(seqlock_t *sl) __acquires_shared(sl) __no_context_analysis { spin_lock(&sl->lock); } /** * read_sequnlock_excl() - end a seqlock_t locking reader critical section * @sl: Pointer to seqlock_t */ static inline void read_sequnlock_excl(seqlock_t *sl) __releases_shared(sl) __no_context_analysis { spin_unlock(&sl->lock); } /** * read_seqlock_excl_bh() - start a seqlock_t locking reader section with * softirqs disabled * @sl: Pointer to seqlock_t * * _bh variant of read_seqlock_excl(). Use this variant only if the * seqlock_t write side section, *or other read sections*, can be invoked * from softirq contexts. */ static inline void read_seqlock_excl_bh(seqlock_t *sl) __acquires_shared(sl) __no_context_analysis { spin_lock_bh(&sl->lock); } /** * read_sequnlock_excl_bh() - stop a seqlock_t softirq-disabled locking * reader section * @sl: Pointer to seqlock_t */ static inline void read_sequnlock_excl_bh(seqlock_t *sl) __releases_shared(sl) __no_context_analysis { spin_unlock_bh(&sl->lock); } /** * read_seqlock_excl_irq() - start a non-interruptible seqlock_t locking * reader section * @sl: Pointer to seqlock_t * * _irq variant of read_seqlock_excl(). Use this only if the seqlock_t * write side section, *or other read sections*, can be invoked from a * hardirq context. */ static inline void read_seqlock_excl_irq(seqlock_t *sl) __acquires_shared(sl) __no_context_analysis { spin_lock_irq(&sl->lock); } /** * read_sequnlock_excl_irq() - end an interrupts-disabled seqlock_t * locking reader section * @sl: Pointer to seqlock_t */ static inline void read_sequnlock_excl_irq(seqlock_t *sl) __releases_shared(sl) __no_context_analysis { spin_unlock_irq(&sl->lock); } static inline unsigned long __read_seqlock_excl_irqsave(seqlock_t *sl) __acquires_shared(sl) __no_context_analysis { unsigned long flags; spin_lock_irqsave(&sl->lock, flags); return flags; } /** * read_seqlock_excl_irqsave() - start a non-interruptible seqlock_t * locking reader section * @lock: Pointer to seqlock_t * @flags: Stack-allocated storage for saving caller's local interrupt * state, to be passed to read_sequnlock_excl_irqrestore(). * * _irqsave variant of read_seqlock_excl(). Use this only if the seqlock_t * write side section, *or other read sections*, can be invoked from a * hardirq context. */ #define read_seqlock_excl_irqsave(lock, flags) \ do { flags = __read_seqlock_excl_irqsave(lock); } while (0) /** * read_sequnlock_excl_irqrestore() - end non-interruptible seqlock_t * locking reader section * @sl: Pointer to seqlock_t * @flags: Caller saved interrupt state, from read_seqlock_excl_irqsave() */ static inline void read_sequnlock_excl_irqrestore(seqlock_t *sl, unsigned long flags) __releases_shared(sl) __no_context_analysis { spin_unlock_irqrestore(&sl->lock, flags); } /** * read_seqbegin_or_lock() - begin a seqlock_t lockless or locking reader * @lock: Pointer to seqlock_t * @seq : Marker and return parameter. If the passed value is even, the * reader will become a *lockless* seqlock_t reader as in read_seqbegin(). * If the passed value is odd, the reader will become a *locking* reader * as in read_seqlock_excl(). 
In the first call to this function, the * caller *must* initialize and pass an even value to @seq; this way, a * lockless read can be optimistically tried first. * * read_seqbegin_or_lock is an API designed to optimistically try a normal * lockless seqlock_t read section first. If an odd counter is found, the * lockless read trial has failed, and the next read iteration transforms * itself into a full seqlock_t locking reader. * * This is typically used to avoid seqlock_t lockless readers starvation * (too much retry loops) in the case of a sharp spike in write side * activity. * * Context: if the seqlock_t write section, *or other read sections*, can * be invoked from hardirq or softirq contexts, use the _irqsave or _bh * variant of this function instead. * * Check Documentation/locking/seqlock.rst for template example code. * * Return: the encountered sequence counter value, through the @seq * parameter, which is overloaded as a return parameter. This returned * value must be checked with need_seqretry(). If the read section need to * be retried, this returned value must also be passed as the @seq * parameter of the next read_seqbegin_or_lock() iteration. */ static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq) __acquires_shared(lock) __no_context_analysis { if (!(*seq & 1)) /* Even */ *seq = read_seqbegin(lock); else /* Odd */ read_seqlock_excl(lock); } /** * need_seqretry() - validate seqlock_t "locking or lockless" read section * @lock: Pointer to seqlock_t * @seq: sequence count, from read_seqbegin_or_lock() * * Return: true if a read section retry is required, false otherwise */ static inline int need_seqretry(seqlock_t *lock, int seq) __releases_shared(lock) __no_context_analysis { return !(seq & 1) && read_seqretry(lock, seq); } /** * done_seqretry() - end seqlock_t "locking or lockless" reader section * @lock: Pointer to seqlock_t * @seq: count, from read_seqbegin_or_lock() * * done_seqretry finishes the seqlock_t read side critical section started * with read_seqbegin_or_lock() and validated by need_seqretry(). */ static inline void done_seqretry(seqlock_t *lock, int seq) __no_context_analysis { if (seq & 1) read_sequnlock_excl(lock); } /** * read_seqbegin_or_lock_irqsave() - begin a seqlock_t lockless reader, or * a non-interruptible locking reader * @lock: Pointer to seqlock_t * @seq: Marker and return parameter. Check read_seqbegin_or_lock(). * * This is the _irqsave variant of read_seqbegin_or_lock(). Use it only if * the seqlock_t write section, *or other read sections*, can be invoked * from hardirq context. * * Note: Interrupts will be disabled only for "locking reader" mode. * * Return: * * 1. The saved local interrupts state in case of a locking reader, to * be passed to done_seqretry_irqrestore(). * * 2. The encountered sequence counter value, returned through @seq * overloaded as a return parameter. Check read_seqbegin_or_lock(). 
*/ static inline unsigned long read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq) __acquires_shared(lock) __no_context_analysis { unsigned long flags = 0; if (!(*seq & 1)) /* Even */ *seq = read_seqbegin(lock); else /* Odd */ read_seqlock_excl_irqsave(lock, flags); return flags; } /** * done_seqretry_irqrestore() - end a seqlock_t lockless reader, or a * non-interruptible locking reader section * @lock: Pointer to seqlock_t * @seq: Count, from read_seqbegin_or_lock_irqsave() * @flags: Caller's saved local interrupt state in case of a locking * reader, also from read_seqbegin_or_lock_irqsave() * * This is the _irqrestore variant of done_seqretry(). The read section * must've been opened with read_seqbegin_or_lock_irqsave(), and validated * by need_seqretry(). */ static inline void done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags) __no_context_analysis { if (seq & 1) read_sequnlock_excl_irqrestore(lock, flags); } enum ss_state { ss_done = 0, ss_lock, ss_lock_irqsave, ss_lockless, }; struct ss_tmp { enum ss_state state; unsigned long data; spinlock_t *lock; spinlock_t *lock_irqsave; }; static __always_inline void __scoped_seqlock_cleanup(struct ss_tmp *sst) __no_context_analysis { if (sst->lock) spin_unlock(sst->lock); if (sst->lock_irqsave) spin_unlock_irqrestore(sst->lock_irqsave, sst->data); } extern void __scoped_seqlock_invalid_target(void); #if (defined(CONFIG_CC_IS_GCC) && CONFIG_GCC_VERSION < 90000) || defined(CONFIG_KASAN) /* * For some reason some GCC-8 architectures (nios2, alpha) have trouble * determining that the ss_done state is impossible in __scoped_seqlock_next() * below. * * Similarly KASAN is known to confuse compilers enough to break this. But we * don't care about code quality for KASAN builds anyway. */ static inline void __scoped_seqlock_bug(void) { } #else /* * Canary for compiler optimization -- if the compiler doesn't realize this is * an impossible state, it very likely generates sub-optimal code here. */ extern void __scoped_seqlock_bug(void); #endif static __always_inline void __scoped_seqlock_next(struct ss_tmp *sst, seqlock_t *lock, enum ss_state target) __no_context_analysis { switch (sst->state) { case ss_done: __scoped_seqlock_bug(); return; case ss_lock: case ss_lock_irqsave: sst->state = ss_done; return; case ss_lockless: if (!read_seqretry(lock, sst->data)) { sst->state = ss_done; return; } break; } switch (target) { case ss_done: __scoped_seqlock_invalid_target(); return; case ss_lock: sst->lock = &lock->lock; spin_lock(sst->lock); sst->state = ss_lock; return; case ss_lock_irqsave: sst->lock_irqsave = &lock->lock; spin_lock_irqsave(sst->lock_irqsave, sst->data); sst->state = ss_lock_irqsave; return; case ss_lockless: sst->data = read_seqbegin(lock); return; } } /* * Context analysis no-op helper to release seqlock at the end of the for-scope; * the alias analysis of the compiler will recognize that the pointer @s is an * alias to @_seqlock passed to read_seqbegin(_seqlock) below. 
*/ static __always_inline void __scoped_seqlock_cleanup_ctx(struct ss_tmp **s) __releases_shared(*((seqlock_t **)s)) __no_context_analysis {} #define __scoped_seqlock_read(_seqlock, _target, _s) \ for (struct ss_tmp _s __cleanup(__scoped_seqlock_cleanup) = \ { .state = ss_lockless, .data = read_seqbegin(_seqlock) }, \ *__UNIQUE_ID(ctx) __cleanup(__scoped_seqlock_cleanup_ctx) =\ (struct ss_tmp *)_seqlock; \ _s.state != ss_done; \ __scoped_seqlock_next(&_s, _seqlock, _target)) /** * scoped_seqlock_read() - execute the read-side critical section * without manual sequence counter handling * or calls to other helpers * @_seqlock: pointer to seqlock_t protecting the data * @_target: an enum ss_state: one of {ss_lock, ss_lock_irqsave, ss_lockless} * indicating the type of critical read section * * Example:: * * scoped_seqlock_read (&lock, ss_lock) { * // read-side critical section * } * * Starts with a lockless pass first. If it fails, restarts the critical * section with the lock held. */ #define scoped_seqlock_read(_seqlock, _target) \ __scoped_seqlock_read(_seqlock, _target, __UNIQUE_ID(seqlock)) DEFINE_LOCK_GUARD_1(seqlock_init, seqlock_t, seqlock_init(_T->lock), /* */) DECLARE_LOCK_GUARD_1_ATTRS(seqlock_init, __acquires(_T), __releases(*(seqlock_t **)_T)) #define class_seqlock_init_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(seqlock_init, _T) #endif /* __LINUX_SEQLOCK_H */
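The header above is easiest to follow with a short caller-side sketch of the plain seqlock_t reader/writer pattern. The xy_state structure and the xy_setup()/xy_update()/xy_read() helpers below are hypothetical illustration; only the seqlock_t calls (seqlock_init(), write_seqlock()/write_sequnlock(), read_seqbegin()/read_seqretry()) are the API defined above.

#include <linux/seqlock.h>

/* Hypothetical data protected by a seqlock_t. */
struct xy_state {
	seqlock_t lock;
	int x, y;
};

static struct xy_state st;

static void xy_setup(void)
{
	seqlock_init(&st.lock);
}

/* Writer side: serialized and versioned by the embedded spinlock. */
static void xy_update(int new_x, int new_y)
{
	write_seqlock(&st.lock);
	st.x = new_x;
	st.y = new_y;
	write_sequnlock(&st.lock);
}

/* Reader side: lockless; the loop retries if a writer overlapped it. */
static void xy_read(int *x, int *y)
{
	unsigned int seq;

	do {
		seq = read_seqbegin(&st.lock);
		*x = st.x;
		*y = st.y;
	} while (read_seqretry(&st.lock, seq));
}

The same consistent-snapshot loop works with a bare seqcount_t as long as the caller provides its own writer serialization, which is exactly what the seqcount_LOCKNAME_t variants above automate.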
// SPDX-License-Identifier: GPL-2.0+ /* * Universal/legacy driver for
8250/16550-type serial ports * * Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o. * * Copyright (C) 2001 Russell King. * * Supports: * early_serial_setup() ports * userspace-configurable "phantom" ports * serial8250_register_8250_port() ports */ #include <linux/acpi.h> #include <linux/hashtable.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/ioport.h> #include <linux/init.h> #include <linux/console.h> #include <linux/sysrq.h> #include <linux/delay.h> #include <linux/platform_device.h> #include <linux/pm_runtime.h> #include <linux/tty.h> #include <linux/ratelimit.h> #include <linux/tty_flip.h> #include <linux/serial.h> #include <linux/serial_8250.h> #include <linux/nmi.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/string_helpers.h> #include <linux/uaccess.h> #include <linux/io.h> #include <asm/irq.h> #include "8250.h" #define PASS_LIMIT 512 struct irq_info { struct hlist_node node; int irq; spinlock_t lock; /* Protects list not the hash */ struct list_head *head; }; #define IRQ_HASH_BITS 5 /* Can be adjusted later */ static DEFINE_HASHTABLE(irq_lists, IRQ_HASH_BITS); static DEFINE_MUTEX(hash_mutex); /* Used to walk the hash */ static bool skip_txen_test; module_param(skip_txen_test, bool, 0644); MODULE_PARM_DESC(skip_txen_test, "Skip checking for the TXEN bug at init time"); /* * This is the serial driver's interrupt routine. * * Arjan thinks the old way was overly complex, so it got simplified. * Alan disagrees, saying that need the complexity to handle the weird * nature of ISA shared interrupts. (This is a special exception.) * * In order to handle ISA shared interrupts properly, we need to check * that all ports have been serviced, and therefore the ISA interrupt * line has been de-asserted. * * This means we need to loop through all ports. checking that they * don't have an interrupt pending. */ static irqreturn_t serial8250_interrupt(int irq, void *dev_id) { struct irq_info *i = dev_id; struct list_head *l, *end = NULL; int pass_counter = 0, handled = 0; guard(spinlock)(&i->lock); l = i->head; do { struct uart_8250_port *up = list_entry(l, struct uart_8250_port, list); struct uart_port *port = &up->port; if (port->handle_irq(port)) { handled = 1; end = NULL; } else if (end == NULL) end = l; l = l->next; if (l == i->head && pass_counter++ > PASS_LIMIT) break; } while (l != end); return IRQ_RETVAL(handled); } /* * To support ISA shared interrupts, we need to have one interrupt * handler that ensures that the IRQ line has been deasserted * before returning. Failing to do this will result in the IRQ * line being stuck active, and, since ISA irqs are edge triggered, * no more IRQs will be seen. */ static void serial_do_unlink(struct irq_info *i, struct uart_8250_port *up) { spin_lock_irq(&i->lock); if (!list_empty(i->head)) { if (i->head == &up->list) i->head = i->head->next; list_del(&up->list); } else { BUG_ON(i->head != &up->list); i->head = NULL; } spin_unlock_irq(&i->lock); /* List empty so throw away the hash node */ if (i->head == NULL) { hlist_del(&i->node); kfree(i); } } /* * Either: * - find the corresponding info in the hashtable and return it, or * - allocate a new one, add it to the hashtable and return it. 
*/ static struct irq_info *serial_get_or_create_irq_info(const struct uart_8250_port *up) { struct irq_info *i; guard(mutex)(&hash_mutex); hash_for_each_possible(irq_lists, i, node, up->port.irq) if (i->irq == up->port.irq) return i; i = kzalloc_obj(*i); if (i == NULL) return ERR_PTR(-ENOMEM); spin_lock_init(&i->lock); i->irq = up->port.irq; hash_add(irq_lists, &i->node, i->irq); return i; } static int serial_link_irq_chain(struct uart_8250_port *up) { struct irq_info *i; int ret; i = serial_get_or_create_irq_info(up); if (IS_ERR(i)) return PTR_ERR(i); scoped_guard(spinlock_irq, &i->lock) { if (i->head) { list_add(&up->list, i->head); return 0; } INIT_LIST_HEAD(&up->list); i->head = &up->list; } ret = request_irq(up->port.irq, serial8250_interrupt, up->port.irqflags, up->port.name, i); if (ret < 0) serial_do_unlink(i, up); return ret; } static void serial_unlink_irq_chain(struct uart_8250_port *up) { struct irq_info *i; guard(mutex)(&hash_mutex); hash_for_each_possible(irq_lists, i, node, up->port.irq) if (i->irq == up->port.irq) { if (WARN_ON(i->head == NULL)) return; if (list_empty(i->head)) free_irq(up->port.irq, i); serial_do_unlink(i, up); return; } WARN_ON(1); } /* * This function is used to handle ports that do not have an * interrupt. This doesn't work very well for 16450's, but gives * barely passable results for a 16550A. (Although at the expense * of much CPU overhead). */ static void serial8250_timeout(struct timer_list *t) { struct uart_8250_port *up = timer_container_of(up, t, timer); up->port.handle_irq(&up->port); mod_timer(&up->timer, jiffies + uart_poll_timeout(&up->port)); } static void serial8250_backup_timeout(struct timer_list *t) { struct uart_8250_port *up = timer_container_of(up, t, timer); unsigned int iir, ier = 0, lsr; unsigned long flags; uart_port_lock_irqsave(&up->port, &flags); /* * Must disable interrupts or else we risk racing with the interrupt * based handler. */ if (up->port.irq) { ier = serial_in(up, UART_IER); serial_out(up, UART_IER, 0); } iir = serial_in(up, UART_IIR); /* * This should be a safe test for anyone who doesn't trust the * IIR bits on their UART, but it's specifically designed for * the "Diva" UART used on the management processor on many HP * ia64 and parisc boxes. */ lsr = serial_lsr_in(up); if ((iir & UART_IIR_NO_INT) && (up->ier & UART_IER_THRI) && (!kfifo_is_empty(&up->port.state->port.xmit_fifo) || up->port.x_char) && (lsr & UART_LSR_THRE)) { iir &= ~(UART_IIR_ID | UART_IIR_NO_INT); iir |= UART_IIR_THRI; } if (!(iir & UART_IIR_NO_INT)) serial8250_tx_chars(up); if (up->port.irq) serial_out(up, UART_IER, ier); uart_port_unlock_irqrestore(&up->port, flags); /* Standard timer interval plus 0.2s to keep the port running */ mod_timer(&up->timer, jiffies + uart_poll_timeout(&up->port) + HZ / 5); } static void univ8250_setup_timer(struct uart_8250_port *up) { struct uart_port *port = &up->port; /* * The above check will only give an accurate result the first time * the port is opened so this value needs to be preserved. */ if (up->bugs & UART_BUG_THRE) { pr_debug("%s - using backup timer\n", port->name); up->timer.function = serial8250_backup_timeout; mod_timer(&up->timer, jiffies + uart_poll_timeout(port) + HZ / 5); } /* * If the "interrupt" for this port doesn't correspond with any * hardware interrupt, we use a timer-based system. The original * driver used to do this with IRQ0. 
*/ if (!port->irq) mod_timer(&up->timer, jiffies + uart_poll_timeout(port)); } static int univ8250_setup_irq(struct uart_8250_port *up) { struct uart_port *port = &up->port; if (port->irq) return serial_link_irq_chain(up); return 0; } static void univ8250_release_irq(struct uart_8250_port *up) { struct uart_port *port = &up->port; timer_delete_sync(&up->timer); up->timer.function = serial8250_timeout; if (port->irq) serial_unlink_irq_chain(up); } const struct uart_ops *univ8250_port_base_ops; struct uart_ops univ8250_port_ops; static const struct uart_8250_ops univ8250_driver_ops = { .setup_irq = univ8250_setup_irq, .release_irq = univ8250_release_irq, .setup_timer = univ8250_setup_timer, }; static struct uart_8250_port serial8250_ports[UART_NR]; /** * serial8250_get_port - retrieve struct uart_8250_port * @line: serial line number * * This function retrieves struct uart_8250_port for the specific line. * This struct *must* *not* be used to perform a 8250 or serial core operation * which is not accessible otherwise. Its only purpose is to make the struct * accessible to the runtime-pm callbacks for context suspend/restore. * The lock assumption made here is none because runtime-pm suspend/resume * callbacks should not be invoked if there is any operation performed on the * port. */ struct uart_8250_port *serial8250_get_port(int line) { return &serial8250_ports[line]; } EXPORT_SYMBOL_GPL(serial8250_get_port); static inline void serial8250_apply_quirks(struct uart_8250_port *up) { up->port.quirks |= skip_txen_test ? UPQ_NO_TXEN_TEST : 0; } struct uart_8250_port *serial8250_setup_port(int index) { struct uart_8250_port *up; if (index >= UART_NR) return NULL; up = &serial8250_ports[index]; up->port.line = index; up->port.port_id = index; serial8250_init_port(up); if (!univ8250_port_base_ops) univ8250_port_base_ops = up->port.ops; up->port.ops = &univ8250_port_ops; timer_setup(&up->timer, serial8250_timeout, 0); up->ops = &univ8250_driver_ops; serial8250_set_defaults(up); return up; } void __init serial8250_register_ports(struct uart_driver *drv, struct device *dev) { int i; for (i = 0; i < nr_uarts; i++) { struct uart_8250_port *up = &serial8250_ports[i]; if (up->port.type == PORT_8250_CIR) continue; if (up->port.dev) continue; up->port.dev = dev; if (uart_console_registered(&up->port)) pm_runtime_get_sync(up->port.dev); serial8250_apply_quirks(up); uart_add_one_port(drv, &up->port); } } #ifdef CONFIG_SERIAL_8250_CONSOLE static void univ8250_console_write(struct console *co, const char *s, unsigned int count) { struct uart_8250_port *up = &serial8250_ports[co->index]; serial8250_console_write(up, s, count); } static int univ8250_console_setup(struct console *co, char *options) { struct uart_8250_port *up; struct uart_port *port; int retval, i; /* * Check whether an invalid uart number has been specified, and * if so, search for the first available port that does have * console support. */ if (co->index < 0 || co->index >= UART_NR) co->index = 0; /* * If the console is past the initial isa ports, init more ports up to * co->index as needed and increment nr_uarts accordingly. 
*/ for (i = nr_uarts; i <= co->index; i++) { up = serial8250_setup_port(i); if (!up) return -ENODEV; nr_uarts++; } port = &serial8250_ports[co->index].port; /* link port to console */ uart_port_set_cons(port, co); retval = serial8250_console_setup(port, options, false); if (retval != 0) uart_port_set_cons(port, NULL); return retval; } static int univ8250_console_exit(struct console *co) { struct uart_port *port; port = &serial8250_ports[co->index].port; return serial8250_console_exit(port); } /** * univ8250_console_match - non-standard console matching * @co: registering console * @name: name from console command line * @idx: index from console command line * @options: ptr to option string from console command line * * Only attempts to match console command lines of the form: * console=uart[8250],io|mmio|mmio16|mmio32,<addr>[,<options>] * console=uart[8250],0x<addr>[,<options>] * This form is used to register an initial earlycon boot console and * replace it with the serial8250_console at 8250 driver init. * * Performs console setup for a match (as required by interface) * If no <options> are specified, then assume the h/w is already setup. * * Returns 0 if console matches; otherwise non-zero to use default matching */ static int univ8250_console_match(struct console *co, char *name, int idx, char *options) { char match[] = "uart"; /* 8250-specific earlycon name */ enum uart_iotype iotype; resource_size_t addr; int i; if (strncmp(name, match, 4) != 0) return -ENODEV; if (uart_parse_earlycon(options, &iotype, &addr, &options)) return -ENODEV; /* try to match the port specified on the command line */ for (i = 0; i < nr_uarts; i++) { struct uart_port *port = &serial8250_ports[i].port; if (port->iotype != iotype) continue; if ((iotype == UPIO_MEM || iotype == UPIO_MEM16 || iotype == UPIO_MEM32 || iotype == UPIO_MEM32BE) && (port->mapbase != addr)) continue; if (iotype == UPIO_PORT && port->iobase != addr) continue; co->index = i; uart_port_set_cons(port, co); return serial8250_console_setup(port, options, true); } return -ENODEV; } static struct console univ8250_console = { .name = "ttyS", .write = univ8250_console_write, .device = uart_console_device, .setup = univ8250_console_setup, .exit = univ8250_console_exit, .match = univ8250_console_match, .flags = CON_PRINTBUFFER | CON_ANYTIME, .index = -1, .data = &serial8250_reg, }; static int __init univ8250_console_init(void) { if (nr_uarts == 0) return -ENODEV; serial8250_isa_init_ports(); register_console(&univ8250_console); return 0; } console_initcall(univ8250_console_init); #define SERIAL8250_CONSOLE (&univ8250_console) #else #define SERIAL8250_CONSOLE NULL #endif struct uart_driver serial8250_reg = { .owner = THIS_MODULE, .driver_name = "serial_8250", .dev_name = "ttyS", .major = TTY_MAJOR, .minor = 64, .cons = SERIAL8250_CONSOLE, }; /* * early_serial_setup - early registration for 8250 ports * * Setup an 8250 port structure prior to console initialisation. Use * after console initialisation will cause undefined behaviour. 
*/ int __init early_serial_setup(struct uart_port *port) { struct uart_port *p; if (port->line >= ARRAY_SIZE(serial8250_ports) || nr_uarts == 0) return -ENODEV; serial8250_isa_init_ports(); p = &serial8250_ports[port->line].port; p->iobase = port->iobase; p->membase = port->membase; p->irq = port->irq; p->irqflags = port->irqflags; p->uartclk = port->uartclk; p->fifosize = port->fifosize; p->regshift = port->regshift; p->iotype = port->iotype; p->flags = port->flags; p->mapbase = port->mapbase; p->mapsize = port->mapsize; p->private_data = port->private_data; p->type = port->type; p->line = port->line; serial8250_set_defaults(up_to_u8250p(p)); if (port->serial_in) p->serial_in = port->serial_in; if (port->serial_out) p->serial_out = port->serial_out; if (port->handle_irq) p->handle_irq = port->handle_irq; return 0; } /** * serial8250_suspend_port - suspend one serial port * @line: serial line number * * Suspend one serial port. */ void serial8250_suspend_port(int line) { struct uart_8250_port *up = &serial8250_ports[line]; struct uart_port *port = &up->port; if (!console_suspend_enabled && uart_console(port) && port->type != PORT_8250) { unsigned char canary = 0xa5; serial_out(up, UART_SCR, canary); if (serial_in(up, UART_SCR) == canary) up->canary = canary; } uart_suspend_port(&serial8250_reg, port); } EXPORT_SYMBOL(serial8250_suspend_port); /** * serial8250_resume_port - resume one serial port * @line: serial line number * * Resume one serial port. */ void serial8250_resume_port(int line) { struct uart_8250_port *up = &serial8250_ports[line]; struct uart_port *port = &up->port; up->canary = 0; if (up->capabilities & UART_NATSEMI) { /* Ensure it's still in high speed mode */ serial_port_out(port, UART_LCR, 0xE0); ns16550a_goto_highspeed(up); serial_port_out(port, UART_LCR, 0); port->uartclk = 921600*16; } uart_resume_port(&serial8250_reg, port); } EXPORT_SYMBOL(serial8250_resume_port); /* * serial8250_register_8250_port and serial8250_unregister_port allows for * 16x50 serial ports to be configured at run-time, to support PCMCIA * modems and PCI multiport cards. */ static DEFINE_MUTEX(serial_mutex); static struct uart_8250_port *serial8250_find_match_or_unused(const struct uart_port *port) { int i; /* * First, find a port entry which matches. */ for (i = 0; i < nr_uarts; i++) if (uart_match_port(&serial8250_ports[i].port, port)) return &serial8250_ports[i]; /* try line number first if still available */ i = port->line; if (i < nr_uarts && serial8250_ports[i].port.type == PORT_UNKNOWN && serial8250_ports[i].port.iobase == 0) return &serial8250_ports[i]; /* * We didn't find a matching entry, so look for the first * free entry. We look for one which hasn't been previously * used (indicated by zero iobase). */ for (i = 0; i < nr_uarts; i++) if (serial8250_ports[i].port.type == PORT_UNKNOWN && serial8250_ports[i].port.iobase == 0) return &serial8250_ports[i]; /* * That also failed. Last resort is to find any entry which * doesn't have a real port associated with it. 
*/ for (i = 0; i < nr_uarts; i++) if (serial8250_ports[i].port.type == PORT_UNKNOWN) return &serial8250_ports[i]; return NULL; } static void serial_8250_overrun_backoff_work(struct work_struct *work) { struct uart_8250_port *up = container_of(to_delayed_work(work), struct uart_8250_port, overrun_backoff); guard(uart_port_lock_irqsave)(&up->port); up->ier |= UART_IER_RLSI | UART_IER_RDI; serial_out(up, UART_IER, up->ier); } /** * serial8250_register_8250_port - register a serial port * @up: serial port template * * Configure the serial port specified by the request. If the * port exists and is in use, it is hung up and unregistered * first. * * The port is then probed and if necessary the IRQ is autodetected * If this fails an error is returned. * * On success the port is ready to use and the line number is returned. */ int serial8250_register_8250_port(const struct uart_8250_port *up) { struct uart_8250_port *uart; int ret; if (up->port.uartclk == 0) return -EINVAL; guard(mutex)(&serial_mutex); uart = serial8250_find_match_or_unused(&up->port); if (!uart) { /* * If the port is past the initial isa ports, initialize a new * port and increment nr_uarts accordingly. */ uart = serial8250_setup_port(nr_uarts); if (!uart) return -ENOSPC; nr_uarts++; } /* Check if it is CIR already. We check this below again, see there why. */ if (uart->port.type == PORT_8250_CIR) return -ENODEV; if (uart->port.dev) uart_remove_one_port(&serial8250_reg, &uart->port); uart->port.ctrl_id = up->port.ctrl_id; uart->port.port_id = up->port.port_id; uart->port.iobase = up->port.iobase; uart->port.membase = up->port.membase; uart->port.irq = up->port.irq; uart->port.irqflags = up->port.irqflags; uart->port.uartclk = up->port.uartclk; uart->port.fifosize = up->port.fifosize; uart->port.regshift = up->port.regshift; uart->port.iotype = up->port.iotype; uart->port.flags = up->port.flags | UPF_BOOT_AUTOCONF; uart->bugs = up->bugs; uart->port.mapbase = up->port.mapbase; uart->port.mapsize = up->port.mapsize; uart->port.private_data = up->port.private_data; uart->tx_loadsz = up->tx_loadsz; uart->capabilities = up->capabilities; uart->port.throttle = up->port.throttle; uart->port.unthrottle = up->port.unthrottle; uart->port.rs485_config = up->port.rs485_config; uart->port.rs485_supported = up->port.rs485_supported; uart->port.rs485 = up->port.rs485; uart->rs485_start_tx = up->rs485_start_tx; uart->rs485_stop_tx = up->rs485_stop_tx; uart->lsr_save_mask = up->lsr_save_mask; uart->dma = up->dma; /* Take tx_loadsz from fifosize if it wasn't set separately */ if (uart->port.fifosize && !uart->tx_loadsz) uart->tx_loadsz = uart->port.fifosize; if (up->port.dev) { uart->port.dev = up->port.dev; ret = uart_get_rs485_mode(&uart->port); if (ret) goto err; } if (up->port.flags & UPF_FIXED_TYPE) uart->port.type = up->port.type; /* * Only call mctrl_gpio_init(), if the device has no ACPI * companion device */ if (!has_acpi_companion(uart->port.dev)) { struct mctrl_gpios *gpios = mctrl_gpio_init(&uart->port, 0); if (IS_ERR(gpios)) { ret = PTR_ERR(gpios); goto err; } else { uart->gpios = gpios; } } serial8250_set_defaults(uart); /* Possibly override default I/O functions. 
*/ if (up->port.serial_in) uart->port.serial_in = up->port.serial_in; if (up->port.serial_out) uart->port.serial_out = up->port.serial_out; if (up->port.handle_irq) uart->port.handle_irq = up->port.handle_irq; /* Possibly override set_termios call */ if (up->port.set_termios) uart->port.set_termios = up->port.set_termios; if (up->port.set_ldisc) uart->port.set_ldisc = up->port.set_ldisc; if (up->port.get_mctrl) uart->port.get_mctrl = up->port.get_mctrl; if (up->port.set_mctrl) uart->port.set_mctrl = up->port.set_mctrl; if (up->port.get_divisor) uart->port.get_divisor = up->port.get_divisor; if (up->port.set_divisor) uart->port.set_divisor = up->port.set_divisor; if (up->port.startup) uart->port.startup = up->port.startup; if (up->port.shutdown) uart->port.shutdown = up->port.shutdown; if (up->port.pm) uart->port.pm = up->port.pm; if (up->port.handle_break) uart->port.handle_break = up->port.handle_break; if (up->dl_read) uart->dl_read = up->dl_read; if (up->dl_write) uart->dl_write = up->dl_write; /* Check the type (again)! It might have changed by the port.type assignment above. */ if (uart->port.type != PORT_8250_CIR) { if (uart_console_registered(&uart->port)) pm_runtime_get_sync(uart->port.dev); if (serial8250_isa_config != NULL) serial8250_isa_config(0, &uart->port, &uart->capabilities); serial8250_apply_quirks(uart); ret = uart_add_one_port(&serial8250_reg, &uart->port); if (ret) goto err; ret = uart->port.line; } else { dev_info(uart->port.dev, "skipping CIR port at 0x%lx / 0x%llx, IRQ %d\n", uart->port.iobase, (unsigned long long)uart->port.mapbase, uart->port.irq); ret = 0; } if (!uart->lsr_save_mask) uart->lsr_save_mask = LSR_SAVE_FLAGS; /* Use default LSR mask */ /* Initialise interrupt backoff work if required */ if (up->overrun_backoff_time_ms > 0) { uart->overrun_backoff_time_ms = up->overrun_backoff_time_ms; INIT_DELAYED_WORK(&uart->overrun_backoff, serial_8250_overrun_backoff_work); } else { uart->overrun_backoff_time_ms = 0; } return ret; err: uart->port.dev = NULL; return ret; } EXPORT_SYMBOL(serial8250_register_8250_port); /** * serial8250_unregister_port - remove a 16x50 serial port at runtime * @line: serial line number * * Remove one serial port. This may not be called from interrupt * context. We hand the port back to the our control. */ void serial8250_unregister_port(int line) { struct uart_8250_port *uart = &serial8250_ports[line]; guard(mutex)(&serial_mutex); if (uart->em485) { guard(uart_port_lock_irqsave)(&uart->port); serial8250_em485_destroy(uart); } uart_remove_one_port(&serial8250_reg, &uart->port); if (serial8250_isa_devs) { uart->port.flags &= ~UPF_BOOT_AUTOCONF; uart->port.type = PORT_UNKNOWN; uart->port.dev = &serial8250_isa_devs->dev; uart->port.port_id = line; uart->capabilities = 0; serial8250_init_port(uart); serial8250_apply_quirks(uart); uart_add_one_port(&serial8250_reg, &uart->port); } else { uart->port.dev = NULL; } } EXPORT_SYMBOL(serial8250_unregister_port); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Generic 8250/16x50 serial driver"); |
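The runtime registration interface exported by this file is clearer with a brief caller-side sketch. The example_* names and the resource values (I/O base, IRQ, clock) are placeholders; a real driver would take them from its bus probe (PCI BAR, platform resource, firmware description). serial8250_register_8250_port() and serial8250_unregister_port() are the functions defined above, and uartclk must be non-zero or registration is rejected with -EINVAL.

#include <linux/serial_8250.h>
#include <linux/serial_core.h>

static int example_line = -1;

/* Register a 16x50-style port discovered at probe time. */
static int example_attach(struct device *dev)
{
	struct uart_8250_port uart = { };

	uart.port.iobase  = 0x3f8;	/* hypothetical I/O resource */
	uart.port.irq     = 4;		/* hypothetical IRQ */
	uart.port.uartclk = 1843200;	/* base clock, must be non-zero */
	uart.port.iotype  = UPIO_PORT;
	uart.port.flags   = UPF_SHARE_IRQ | UPF_BOOT_AUTOCONF;
	uart.port.dev     = dev;

	example_line = serial8250_register_8250_port(&uart);
	return example_line < 0 ? example_line : 0;
}

/* Hand the line back so the core can reuse the slot. */
static void example_detach(void)
{
	if (example_line >= 0)
		serial8250_unregister_port(example_line);
}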
// SPDX-License-Identifier: GPL-2.0-or-later /* * A generic kernel FIFO implementation * * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net> */ #include <linux/dma-mapping.h> #include <linux/err.h> #include <linux/export.h> #include <linux/kfifo.h> #include <linux/log2.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <linux/uaccess.h> /* * internal helper to calculate the unused elements in a fifo */ static inline unsigned int kfifo_unused(struct __kfifo *fifo) { return (fifo->mask + 1) - (fifo->in - fifo->out); } int __kfifo_alloc_node(struct __kfifo *fifo, unsigned int size, size_t esize, gfp_t gfp_mask, int node) { /* * round up to the next power of 2, since our 'let the indices * wrap' technique works only in this case.
*/ size = roundup_pow_of_two(size); fifo->in = 0; fifo->out = 0; fifo->esize = esize; if (size < 2) { fifo->data = NULL; fifo->mask = 0; return -EINVAL; } fifo->data = kmalloc_array_node(size, esize, gfp_mask, node); if (!fifo->data) { fifo->mask = 0; return -ENOMEM; } fifo->mask = size - 1; return 0; } EXPORT_SYMBOL(__kfifo_alloc_node); void __kfifo_free(struct __kfifo *fifo) { kfree(fifo->data); fifo->in = 0; fifo->out = 0; fifo->esize = 0; fifo->data = NULL; fifo->mask = 0; } EXPORT_SYMBOL(__kfifo_free); int __kfifo_init(struct __kfifo *fifo, void *buffer, unsigned int size, size_t esize) { size /= esize; if (!is_power_of_2(size)) size = rounddown_pow_of_two(size); fifo->in = 0; fifo->out = 0; fifo->esize = esize; fifo->data = buffer; if (size < 2) { fifo->mask = 0; return -EINVAL; } fifo->mask = size - 1; return 0; } EXPORT_SYMBOL(__kfifo_init); static void kfifo_copy_in(struct __kfifo *fifo, const void *src, unsigned int len, unsigned int off) { unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; unsigned int l; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } l = min(len, size - off); memcpy(fifo->data + off, src, l); memcpy(fifo->data, src + l, len - l); /* * make sure that the data in the fifo is up to date before * incrementing the fifo->in index counter */ smp_wmb(); } unsigned int __kfifo_in(struct __kfifo *fifo, const void *buf, unsigned int len) { unsigned int l; l = kfifo_unused(fifo); if (len > l) len = l; kfifo_copy_in(fifo, buf, len, fifo->in); fifo->in += len; return len; } EXPORT_SYMBOL(__kfifo_in); static void kfifo_copy_out(struct __kfifo *fifo, void *dst, unsigned int len, unsigned int off) { unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; unsigned int l; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } l = min(len, size - off); memcpy(dst, fifo->data + off, l); memcpy(dst + l, fifo->data, len - l); /* * make sure that the data is copied before * incrementing the fifo->out index counter */ smp_wmb(); } unsigned int __kfifo_out_peek(struct __kfifo *fifo, void *buf, unsigned int len) { unsigned int l; l = fifo->in - fifo->out; if (len > l) len = l; kfifo_copy_out(fifo, buf, len, fifo->out); return len; } EXPORT_SYMBOL(__kfifo_out_peek); unsigned int __kfifo_out_linear(struct __kfifo *fifo, unsigned int *tail, unsigned int n) { unsigned int size = fifo->mask + 1; unsigned int off = fifo->out & fifo->mask; if (tail) *tail = off; return min3(n, fifo->in - fifo->out, size - off); } EXPORT_SYMBOL(__kfifo_out_linear); unsigned int __kfifo_out(struct __kfifo *fifo, void *buf, unsigned int len) { len = __kfifo_out_peek(fifo, buf, len); fifo->out += len; return len; } EXPORT_SYMBOL(__kfifo_out); static unsigned long kfifo_copy_from_user(struct __kfifo *fifo, const void __user *from, unsigned int len, unsigned int off, unsigned int *copied) { unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; unsigned int l; unsigned long ret; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } l = min(len, size - off); ret = copy_from_user(fifo->data + off, from, l); if (unlikely(ret)) ret = DIV_ROUND_UP(ret + len - l, esize); else { ret = copy_from_user(fifo->data, from + l, len - l); if (unlikely(ret)) ret = DIV_ROUND_UP(ret, esize); } /* * make sure that the data in the fifo is up to date before * incrementing the fifo->in index counter */ smp_wmb(); *copied = len - ret * esize; /* return the number of elements which are not copied */ 
return ret; } int __kfifo_from_user(struct __kfifo *fifo, const void __user *from, unsigned long len, unsigned int *copied) { unsigned int l; unsigned long ret; unsigned int esize = fifo->esize; int err; if (esize != 1) len /= esize; l = kfifo_unused(fifo); if (len > l) len = l; ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied); if (unlikely(ret)) { len -= ret; err = -EFAULT; } else err = 0; fifo->in += len; return err; } EXPORT_SYMBOL(__kfifo_from_user); static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to, unsigned int len, unsigned int off, unsigned int *copied) { unsigned int l; unsigned long ret; unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } l = min(len, size - off); ret = copy_to_user(to, fifo->data + off, l); if (unlikely(ret)) ret = DIV_ROUND_UP(ret + len - l, esize); else { ret = copy_to_user(to + l, fifo->data, len - l); if (unlikely(ret)) ret = DIV_ROUND_UP(ret, esize); } /* * make sure that the data is copied before * incrementing the fifo->out index counter */ smp_wmb(); *copied = len - ret * esize; /* return the number of elements which are not copied */ return ret; } int __kfifo_to_user(struct __kfifo *fifo, void __user *to, unsigned long len, unsigned int *copied) { unsigned int l; unsigned long ret; unsigned int esize = fifo->esize; int err; if (esize != 1) len /= esize; l = fifo->in - fifo->out; if (len > l) len = l; ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied); if (unlikely(ret)) { len -= ret; err = -EFAULT; } else err = 0; fifo->out += len; return err; } EXPORT_SYMBOL(__kfifo_to_user); static unsigned int setup_sgl_buf(struct __kfifo *fifo, struct scatterlist *sgl, unsigned int data_offset, int nents, unsigned int len, dma_addr_t dma) { const void *buf = fifo->data + data_offset; if (!nents || !len) return 0; sg_set_buf(sgl, buf, len); if (dma != DMA_MAPPING_ERROR) { sg_dma_address(sgl) = dma + data_offset; sg_dma_len(sgl) = len; } return 1; } static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, unsigned int off, dma_addr_t dma) { unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; unsigned int len_to_end; unsigned int n; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } len_to_end = min(len, size - off); n = setup_sgl_buf(fifo, sgl, off, nents, len_to_end, dma); n += setup_sgl_buf(fifo, sgl + n, 0, nents - n, len - len_to_end, dma); return n; } unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, dma_addr_t dma) { unsigned int l; l = kfifo_unused(fifo); if (len > l) len = l; return setup_sgl(fifo, sgl, nents, len, fifo->in, dma); } EXPORT_SYMBOL(__kfifo_dma_in_prepare); unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, dma_addr_t dma) { unsigned int l; l = fifo->in - fifo->out; if (len > l) len = l; return setup_sgl(fifo, sgl, nents, len, fifo->out, dma); } EXPORT_SYMBOL(__kfifo_dma_out_prepare); unsigned int __kfifo_max_r(unsigned int len, size_t recsize) { unsigned int max = (1 << (recsize << 3)) - 1; if (len > max) return max; return len; } EXPORT_SYMBOL(__kfifo_max_r); #define __KFIFO_PEEK(data, out, mask) \ ((data)[(out) & (mask)]) /* * __kfifo_peek_n internal helper function for determinate the length of * the next record in the fifo */ static unsigned int __kfifo_peek_n(struct __kfifo 
*fifo, size_t recsize) { unsigned int l; unsigned int mask = fifo->mask; unsigned char *data = fifo->data; l = __KFIFO_PEEK(data, fifo->out, mask); if (--recsize) l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8; return l; } #define __KFIFO_POKE(data, in, mask, val) \ ( \ (data)[(in) & (mask)] = (unsigned char)(val) \ ) /* * __kfifo_poke_n internal helper function for storing the length of * the record into the fifo */ static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize) { unsigned int mask = fifo->mask; unsigned char *data = fifo->data; __KFIFO_POKE(data, fifo->in, mask, n); if (recsize > 1) __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8); } unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize) { return __kfifo_peek_n(fifo, recsize); } EXPORT_SYMBOL(__kfifo_len_r); unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf, unsigned int len, size_t recsize) { if (len + recsize > kfifo_unused(fifo)) return 0; __kfifo_poke_n(fifo, len, recsize); kfifo_copy_in(fifo, buf, len, fifo->in + recsize); fifo->in += len + recsize; return len; } EXPORT_SYMBOL(__kfifo_in_r); static unsigned int kfifo_out_copy_r(struct __kfifo *fifo, void *buf, unsigned int len, size_t recsize, unsigned int *n) { *n = __kfifo_peek_n(fifo, recsize); if (len > *n) len = *n; kfifo_copy_out(fifo, buf, len, fifo->out + recsize); return len; } unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf, unsigned int len, size_t recsize) { unsigned int n; if (fifo->in == fifo->out) return 0; return kfifo_out_copy_r(fifo, buf, len, recsize, &n); } EXPORT_SYMBOL(__kfifo_out_peek_r); unsigned int __kfifo_out_linear_r(struct __kfifo *fifo, unsigned int *tail, unsigned int n, size_t recsize) { if (fifo->in == fifo->out) return 0; if (tail) *tail = fifo->out + recsize; return min(n, __kfifo_peek_n(fifo, recsize)); } EXPORT_SYMBOL(__kfifo_out_linear_r); unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf, unsigned int len, size_t recsize) { unsigned int n; if (fifo->in == fifo->out) return 0; len = kfifo_out_copy_r(fifo, buf, len, recsize, &n); fifo->out += n + recsize; return len; } EXPORT_SYMBOL(__kfifo_out_r); void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize) { unsigned int n; n = __kfifo_peek_n(fifo, recsize); fifo->out += n + recsize; } EXPORT_SYMBOL(__kfifo_skip_r); int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from, unsigned long len, unsigned int *copied, size_t recsize) { unsigned long ret; len = __kfifo_max_r(len, recsize); if (len + recsize > kfifo_unused(fifo)) { *copied = 0; return 0; } __kfifo_poke_n(fifo, len, recsize); ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied); if (unlikely(ret)) { *copied = 0; return -EFAULT; } fifo->in += len + recsize; return 0; } EXPORT_SYMBOL(__kfifo_from_user_r); int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to, unsigned long len, unsigned int *copied, size_t recsize) { unsigned long ret; unsigned int n; if (fifo->in == fifo->out) { *copied = 0; return 0; } n = __kfifo_peek_n(fifo, recsize); if (len > n) len = n; ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied); if (unlikely(ret)) { *copied = 0; return -EFAULT; } fifo->out += n + recsize; return 0; } EXPORT_SYMBOL(__kfifo_to_user_r); unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, size_t recsize, dma_addr_t dma) { BUG_ON(!nents); len = __kfifo_max_r(len, recsize); if (len + recsize > kfifo_unused(fifo)) return 0; return setup_sgl(fifo, 
sgl, nents, len, fifo->in + recsize, dma); } EXPORT_SYMBOL(__kfifo_dma_in_prepare_r); void __kfifo_dma_in_finish_r(struct __kfifo *fifo, unsigned int len, size_t recsize) { len = __kfifo_max_r(len, recsize); __kfifo_poke_n(fifo, len, recsize); fifo->in += len + recsize; } EXPORT_SYMBOL(__kfifo_dma_in_finish_r); unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, size_t recsize, dma_addr_t dma) { BUG_ON(!nents); len = __kfifo_max_r(len, recsize); if (len + recsize > fifo->in - fifo->out) return 0; return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize, dma); } EXPORT_SYMBOL(__kfifo_dma_out_prepare_r);
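For context, a minimal sketch of how a driver typically consumes this implementation through the wrapper macros in <linux/kfifo.h> (compare samples/kfifo/). The module, fifo name, and sizes here are illustrative and not part of kfifo.c; this is a sketch, not the reference usage.

#include <linux/kfifo.h>
#include <linux/module.h>

/* A 128-byte fifo; the capacity must be a power of two, as enforced above. */
static DEFINE_KFIFO(demo_fifo, unsigned char, 128);

static int __init kfifo_demo_init(void)
{
	unsigned char msg[] = "hello";
	unsigned char out[8];
	unsigned int n;

	/*
	 * kfifo_in() ends up in __kfifo_in(): the request is capped at the
	 * free space and the number of elements actually stored is returned.
	 */
	n = kfifo_in(&demo_fifo, msg, sizeof(msg));
	pr_info("queued %u, used %u, free %u\n",
		n, kfifo_len(&demo_fifo), kfifo_avail(&demo_fifo));

	/* kfifo_out() is __kfifo_out_peek() plus advancing fifo->out. */
	n = kfifo_out(&demo_fifo, out, sizeof(out));
	pr_info("dequeued %u\n", n);

	return 0;
}
module_init(kfifo_demo_init);

static void __exit kfifo_demo_exit(void) { }
module_exit(kfifo_demo_exit);

MODULE_DESCRIPTION("kfifo usage sketch");
MODULE_LICENSE("GPL");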
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef IOPRIO_H
#define IOPRIO_H

#include <linux/sched.h>
#include <linux/sched/rt.h>
#include <linux/iocontext.h>

#include <uapi/linux/ioprio.h>

/*
 * Default IO priority.
 */
#define IOPRIO_DEFAULT	IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0)

/*
 * Check that a priority value has a valid class.
 */
static inline bool ioprio_valid(unsigned short ioprio)
{
	unsigned short class = IOPRIO_PRIO_CLASS(ioprio);

	return class > IOPRIO_CLASS_NONE && class <= IOPRIO_CLASS_IDLE;
}

/*
 * if process has set io priority explicitly, use that. if not, convert
 * the cpu scheduler nice value to an io priority
 */
static inline int task_nice_ioprio(struct task_struct *task)
{
	return (task_nice(task) + 20) / 5;
}

/*
 * This is for the case where the task hasn't asked for a specific IO class.
 * Check for idle and rt task process, and return appropriate IO class.
 */
static inline int task_nice_ioclass(struct task_struct *task)
{
	if (task->policy == SCHED_IDLE)
		return IOPRIO_CLASS_IDLE;
	else if (rt_or_dl_task_policy(task))
		return IOPRIO_CLASS_RT;
	else
		return IOPRIO_CLASS_BE;
}

#ifdef CONFIG_BLOCK
/*
 * If the task has set an I/O priority, use that. Otherwise, return
 * the default I/O priority.
 *
 * Expected to be called for current task or with task_lock() held to keep
 * io_context stable.
 */
static inline int __get_task_ioprio(struct task_struct *p)
{
	struct io_context *ioc = p->io_context;
	int prio;

	if (!ioc)
		return IOPRIO_PRIO_VALUE(task_nice_ioclass(p),
					 task_nice_ioprio(p));

	if (p != current)
		lockdep_assert_held(&p->alloc_lock);

	prio = ioc->ioprio;
	if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE)
		prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p),
					 task_nice_ioprio(p));
	return prio;
}
#else
static inline int __get_task_ioprio(struct task_struct *p)
{
	return IOPRIO_DEFAULT;
}
#endif /* CONFIG_BLOCK */

static inline int get_current_ioprio(void)
{
	return __get_task_ioprio(current);
}

extern int set_task_ioprio(struct task_struct *task, int ioprio);

#ifdef CONFIG_BLOCK
extern int ioprio_check_cap(int ioprio);
#else
static inline int ioprio_check_cap(int ioprio)
{
	return -ENOTBLK;
}
#endif /* CONFIG_BLOCK */

#endif
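To illustrate the helpers above, here is a hedged sketch that computes the same fallback value __get_task_ioprio() uses when a task has no io_context. The function name demo_effective_ioprio() is hypothetical and not part of this header.

#include <linux/ioprio.h>
#include <linux/printk.h>
#include <linux/sched.h>

/*
 * Hypothetical helper: task_nice_ioprio() maps nice -20..19 onto
 * best-effort levels 0..7, so a SCHED_NORMAL task at nice 0 ends up
 * as IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4).
 */
static int demo_effective_ioprio(struct task_struct *task)
{
	int ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(task),
				       task_nice_ioprio(task));

	pr_debug("class=%d level=%d valid=%d\n",
		 IOPRIO_PRIO_CLASS(ioprio), IOPRIO_PRIO_DATA(ioprio),
		 ioprio_valid(ioprio));

	return ioprio;
}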
| 1 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 | // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/plist.h> #include <linux/sched/task.h> #include <linux/sched/signal.h> #include <linux/freezer.h> #include "futex.h" /* * READ this before attempting to hack on futexes! * * Basic futex operation and ordering guarantees * ============================================= * * The waiter reads the futex value in user space and calls * futex_wait(). This function computes the hash bucket and acquires * the hash bucket lock. After that it reads the futex user space value * again and verifies that the data has not changed. 
If it has not changed * it enqueues itself into the hash bucket, releases the hash bucket lock * and schedules. * * The waker side modifies the user space value of the futex and calls * futex_wake(). This function computes the hash bucket and acquires the * hash bucket lock. Then it looks for waiters on that futex in the hash * bucket and wakes them. * * In futex wake up scenarios where no tasks are blocked on a futex, taking * the hb spinlock can be avoided and simply return. In order for this * optimization to work, ordering guarantees must exist so that the waiter * being added to the list is acknowledged when the list is concurrently being * checked by the waker, avoiding scenarios like the following: * * CPU 0 CPU 1 * val = *futex; * sys_futex(WAIT, futex, val); * futex_wait(futex, val); * uval = *futex; * *futex = newval; * sys_futex(WAKE, futex); * futex_wake(futex); * if (queue_empty()) * return; * if (uval == val) * lock(hash_bucket(futex)); * queue(); * unlock(hash_bucket(futex)); * schedule(); * * This would cause the waiter on CPU 0 to wait forever because it * missed the transition of the user space value from val to newval * and the waker did not find the waiter in the hash bucket queue. * * The correct serialization ensures that a waiter either observes * the changed user space value before blocking or is woken by a * concurrent waker: * * CPU 0 CPU 1 * val = *futex; * sys_futex(WAIT, futex, val); * futex_wait(futex, val); * * waiters++; (a) * smp_mb(); (A) <-- paired with -. * | * lock(hash_bucket(futex)); | * | * uval = *futex; | * | *futex = newval; * | sys_futex(WAKE, futex); * | futex_wake(futex); * | * `--------> smp_mb(); (B) * if (uval == val) * queue(); * unlock(hash_bucket(futex)); * schedule(); if (waiters) * lock(hash_bucket(futex)); * else wake_waiters(futex); * waiters--; (b) unlock(hash_bucket(futex)); * * Where (A) orders the waiters increment and the futex value read through * atomic operations (see futex_hb_waiters_inc) and where (B) orders the write * to futex and the waiters read (see futex_hb_waiters_pending()). * * This yields the following case (where X:=waiters, Y:=futex): * * X = Y = 0 * * w[X]=1 w[Y]=1 * MB MB * r[Y]=y r[X]=x * * Which guarantees that x==0 && y==0 is impossible; which translates back into * the guarantee that we cannot both miss the futex variable change and the * enqueue. * * Note that a new waiter is accounted for in (a) even when it is possible that * the wait call can return error, in which case we backtrack from it in (b). * Refer to the comment in futex_q_lock(). * * Similarly, in order to account for waiters being requeued on another * address we always increment the waiters for the destination bucket before * acquiring the lock. It then decrements them again after releasing it - * the code that actually moves the futex(es) between hash buckets (requeue_futex) * will do the additional required waiter count housekeeping. This is done for * double_lock_hb() and double_unlock_hb(), respectively. */ bool __futex_wake_mark(struct futex_q *q) { if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) return false; __futex_unqueue(q); /* * The waiting task can free the futex_q as soon as q->lock_ptr = NULL * is written, without taking any locks. This is possible in the event * of a spurious wakeup, for example. A memory barrier is required here * to prevent the following store to lock_ptr from getting ahead of the * plist_del in __futex_unqueue(). 
*/ smp_store_release(&q->lock_ptr, NULL); return true; } /* * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. Callers * must ensure to later call wake_up_q() for the actual * wakeups to occur. */ void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q) { struct task_struct *p = q->task; get_task_struct(p); if (!__futex_wake_mark(q)) { put_task_struct(p); return; } /* * Queue the task for later wakeup for after we've released * the hb->lock. */ wake_q_add_safe(wake_q, p); } /* * Wake up waiters matching bitset queued on this futex (uaddr). */ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) { struct futex_q *this, *next; union futex_key key = FUTEX_KEY_INIT; DEFINE_WAKE_Q(wake_q); int ret; if (!bitset) return -EINVAL; ret = get_futex_key(uaddr, flags, &key, FUTEX_READ); if (unlikely(ret != 0)) return ret; if ((flags & FLAGS_STRICT) && !nr_wake) return 0; CLASS(hb, hb)(&key); /* Make sure we really have tasks to wakeup */ if (!futex_hb_waiters_pending(hb)) return ret; spin_lock(&hb->lock); plist_for_each_entry_safe(this, next, &hb->chain, list) { if (futex_match (&this->key, &key)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; break; } /* Check if one of the bits is set in both bitsets */ if (!(this->bitset & bitset)) continue; this->wake(&wake_q, this); if (++ret >= nr_wake) break; } } spin_unlock(&hb->lock); wake_up_q(&wake_q); return ret; } static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) { unsigned int op = (encoded_op & 0x70000000) >> 28; unsigned int cmp = (encoded_op & 0x0f000000) >> 24; int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11); int cmparg = sign_extend32(encoded_op & 0x00000fff, 11); int oldval, ret; if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { if (oparg < 0 || oparg > 31) { /* * kill this print and return -EINVAL when userspace * is sane again */ pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n", current->comm, oparg); oparg &= 31; } oparg = 1 << oparg; } pagefault_disable(); ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); pagefault_enable(); if (ret) return ret; switch (cmp) { case FUTEX_OP_CMP_EQ: return oldval == cmparg; case FUTEX_OP_CMP_NE: return oldval != cmparg; case FUTEX_OP_CMP_LT: return oldval < cmparg; case FUTEX_OP_CMP_GE: return oldval >= cmparg; case FUTEX_OP_CMP_LE: return oldval <= cmparg; case FUTEX_OP_CMP_GT: return oldval > cmparg; default: return -ENOSYS; } } /* * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: */ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; struct futex_q *this, *next; int ret, op_ret; DEFINE_WAKE_Q(wake_q); retry: ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ); if (unlikely(ret != 0)) return ret; ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) return ret; retry_private: if (1) { CLASS(hb, hb1)(&key1); CLASS(hb, hb2)(&key2); double_lock_hb(hb1, hb2); op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { double_unlock_hb(hb1, hb2); if (!IS_ENABLED(CONFIG_MMU) || unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { /* * we don't get EFAULT from MMU faults if we don't have * an MMU, but we might get them from range checking */ ret = op_ret; return ret; } if (op_ret == -EFAULT) { ret = 
fault_in_user_writeable(uaddr2); if (ret) return ret; } cond_resched(); if (!(flags & FLAGS_SHARED)) goto retry_private; goto retry; } plist_for_each_entry_safe(this, next, &hb1->chain, list) { if (futex_match(&this->key, &key1)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; goto out_unlock; } this->wake(&wake_q, this); if (++ret >= nr_wake) break; } } if (op_ret > 0) { op_ret = 0; plist_for_each_entry_safe(this, next, &hb2->chain, list) { if (futex_match(&this->key, &key2)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; goto out_unlock; } this->wake(&wake_q, this); if (++op_ret >= nr_wake2) break; } } ret += op_ret; } out_unlock: double_unlock_hb(hb1, hb2); } wake_up_q(&wake_q); return ret; } static long futex_wait_restart(struct restart_block *restart); /** * futex_do_wait() - wait for wakeup, timeout, or signal * @q: the futex_q to queue up on * @timeout: the prepared hrtimer_sleeper, or null for no timeout */ void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout) { /* Arm the timer */ if (timeout) hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); /* * If we have been removed from the hash list, then another task * has tried to wake us, and we can skip the call to schedule(). */ if (likely(!plist_node_empty(&q->list))) { /* * If the timer has already expired, current will already be * flagged for rescheduling. Only call schedule if there * is no timeout, or if it has yet to expire. */ if (!timeout || timeout->task) schedule(); } __set_current_state(TASK_RUNNING); } /** * futex_unqueue_multiple - Remove various futexes from their hash bucket * @v: The list of futexes to unqueue * @count: Number of futexes in the list * * Helper to unqueue a list of futexes. This can't fail. * * Return: * - >=0 - Index of the last futex that was awoken; * - -1 - No futex was awoken */ int futex_unqueue_multiple(struct futex_vector *v, int count) { int ret = -1, i; for (i = 0; i < count; i++) { if (!futex_unqueue(&v[i].q)) ret = i; } return ret; } /** * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes * @vs: The futex list to wait on * @count: The size of the list * @woken: Index of the last woken futex, if any. Used to notify the * caller that it can return this index to userspace (return parameter) * * Prepare multiple futexes in a single step and enqueue them. This may fail if * the futex list is invalid or if any futex was already awoken. On success the * task is ready to interruptible sleep. * * Return: * - 1 - One of the futexes was woken by another thread * - 0 - Success * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL */ int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken) { bool retry = false; int ret, i; u32 uval; /* * Make sure to have a reference on the private_hash such that we * don't block on rehash after changing the task state below. */ guard(private_hash)(); /* * Enqueuing multiple futexes is tricky, because we need to enqueue * each futex on the list before dealing with the next one to avoid * deadlocking on the hash bucket. But, before enqueuing, we need to * make sure that current->state is TASK_INTERRUPTIBLE, so we don't * lose any wake events, which cannot be done before the get_futex_key * of the next key, because it calls get_user_pages, which can sleep. * Thus, we fetch the list of futexes keys in two steps, by first * pinning all the memory keys in the futex key, and only then we read * each key and queue the corresponding futex. 
* * Private futexes doesn't need to recalculate hash in retry, so skip * get_futex_key() when retrying. */ retry: for (i = 0; i < count; i++) { if (!(vs[i].w.flags & FLAGS_SHARED) && retry) continue; ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr), vs[i].w.flags, &vs[i].q.key, FUTEX_READ); if (unlikely(ret)) return ret; } set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); for (i = 0; i < count; i++) { u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr; struct futex_q *q = &vs[i].q; u32 val = vs[i].w.val; if (1) { CLASS(hb, hb)(&q->key); futex_q_lock(q, hb); ret = futex_get_value_locked(&uval, uaddr); if (!ret && uval == val) { /* * The bucket lock can't be held while dealing with the * next futex. Queue each futex at this moment so hb can * be unlocked. */ futex_queue(q, hb, current); continue; } futex_q_unlock(hb); __release(q->lock_ptr); } __set_current_state(TASK_RUNNING); /* * Even if something went wrong, if we find out that a futex * was woken, we don't return error and return this index to * userspace */ *woken = futex_unqueue_multiple(vs, i); if (*woken >= 0) return 1; if (ret) { /* * If we need to handle a page fault, we need to do so * without any lock and any enqueued futex (otherwise * we could lose some wakeup). So we do it here, after * undoing all the work done so far. In success, we * retry all the work. */ if (get_user(uval, uaddr)) return -EFAULT; retry = true; goto retry; } if (uval != val) return -EWOULDBLOCK; } return 0; } /** * futex_sleep_multiple - Check sleeping conditions and sleep * @vs: List of futexes to wait for * @count: Length of vs * @to: Timeout * * Sleep if and only if the timeout hasn't expired and no futex on the list has * been woken up. */ static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count, struct hrtimer_sleeper *to) { if (to && !to->task) return; for (; count; count--, vs++) { if (!READ_ONCE(vs->q.lock_ptr)) return; } schedule(); } /** * futex_wait_multiple - Prepare to wait on and enqueue several futexes * @vs: The list of futexes to wait on * @count: The number of objects * @to: Timeout before giving up and returning to userspace * * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function * sleeps on a group of futexes and returns on the first futex that is * wake, or after the timeout has elapsed. * * Return: * - >=0 - Hint to the futex that was awoken * - <0 - On error */ int futex_wait_multiple(struct futex_vector *vs, unsigned int count, struct hrtimer_sleeper *to) { int ret, hint = 0; if (to) hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); while (1) { ret = futex_wait_multiple_setup(vs, count, &hint); if (ret) { if (ret > 0) { /* A futex was woken during setup */ ret = hint; } return ret; } futex_sleep_multiple(vs, count, to); __set_current_state(TASK_RUNNING); ret = futex_unqueue_multiple(vs, count); if (ret >= 0) return ret; if (to && !to->task) return -ETIMEDOUT; else if (signal_pending(current)) return -ERESTARTSYS; /* * The final case is a spurious wakeup, for * which just retry. */ } } /** * futex_wait_setup() - Prepare to wait on a futex * @uaddr: the futex userspace address * @val: the expected value * @flags: futex flags (FLAGS_SHARED, etc.) * @q: the associated futex_q * @key2: the second futex_key if used for requeue PI * @task: Task queueing this futex * * Setup the futex_q and locate the hash_bucket. Get the futex value and * compare it with the expected value. Handle atomic faults internally. * Return with the hb lock held on success, and unlocked on failure. 
* * Return: * - 0 - uaddr contains val and hb has been locked; * - <0 - On error and the hb is unlocked. A possible reason: the uaddr can not * be read, does not contain the expected value or is not properly aligned. */ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, struct futex_q *q, union futex_key *key2, struct task_struct *task) { u32 uval; int ret; /* * Access the page AFTER the hash-bucket is locked. * Order is important: * * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } * * The basic logical guarantee of a futex is that it blocks ONLY * if cond(var) is known to be true at the time of blocking, for * any cond. If we locked the hash-bucket after testing *uaddr, that * would open a race condition where we could block indefinitely with * cond(var) false, which would violate the guarantee. * * On the other hand, we insert q and release the hash-bucket only * after testing *uaddr. This guarantees that futex_wait() will NOT * absorb a wakeup if *uaddr does not match the desired values * while the syscall executes. */ retry: ret = get_futex_key(uaddr, flags, &q->key, FUTEX_READ); if (unlikely(ret != 0)) return ret; retry_private: if (1) { CLASS(hb, hb)(&q->key); futex_q_lock(q, hb); ret = futex_get_value_locked(&uval, uaddr); if (ret) { futex_q_unlock(hb); __release(q->lock_ptr); ret = get_user(uval, uaddr); if (ret) return ret; if (!(flags & FLAGS_SHARED)) goto retry_private; goto retry; } if (uval != val) { futex_q_unlock(hb); __release(q->lock_ptr); return -EWOULDBLOCK; } if (key2 && futex_match(&q->key, key2)) { futex_q_unlock(hb); __release(q->lock_ptr); return -EINVAL; } /* * The task state is guaranteed to be set before another task can * wake it. set_current_state() is implemented using smp_store_mb() and * futex_queue() calls spin_unlock() upon completion, both serializing * access to the hash list and forcing another memory barrier. */ if (task == current) set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); futex_queue(q, hb, task); } return ret; } int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, struct hrtimer_sleeper *to, u32 bitset) { struct futex_q q = futex_q_init; int ret; if (!bitset) return -EINVAL; q.bitset = bitset; retry: /* * Prepare to wait on uaddr. On success, it holds hb->lock and q * is initialized. */ ret = futex_wait_setup(uaddr, val, flags, &q, NULL, current); if (ret) return ret; /* futex_queue and wait for wakeup, timeout, or a signal. */ futex_do_wait(&q, to); /* If we were woken (and unqueued), we succeeded, whatever. */ if (!futex_unqueue(&q)) return 0; if (to && !to->task) return -ETIMEDOUT; /* * We expect signal_pending(current), but we might be the * victim of a spurious wakeup as well. */ if (!signal_pending(current)) goto retry; return -ERESTARTSYS; } int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset) { struct hrtimer_sleeper timeout, *to; struct restart_block *restart; int ret; to = futex_setup_timer(abs_time, &timeout, flags, current->timer_slack_ns); ret = __futex_wait(uaddr, flags, val, to, bitset); /* No timeout, nothing to clean up. 
*/ if (!to) return ret; hrtimer_cancel(&to->timer); destroy_hrtimer_on_stack(&to->timer); if (ret == -ERESTARTSYS) { restart = &current->restart_block; restart->futex.uaddr = uaddr; restart->futex.val = val; restart->futex.time = *abs_time; restart->futex.bitset = bitset; restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; return set_restart_fn(restart, futex_wait_restart); } return ret; } static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = restart->futex.uaddr; ktime_t *tp = NULL; if (restart->futex.flags & FLAGS_HAS_TIMEOUT) tp = &restart->futex.time; restart->fn = do_no_restart_syscall; return (long)futex_wait(uaddr, restart->futex.flags, restart->futex.val, tp, restart->futex.bitset); }
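The ordering rules described in the comments above are easiest to see from the user side. Below is a minimal userspace sketch of the wait/wake protocol, issuing the raw futex syscall because glibc provides no wrapper; the helper and variable names are illustrative only.

#include <linux/futex.h>
#include <stdatomic.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static _Atomic uint32_t futex_word;

/* glibc has no futex() wrapper, so issue the raw syscall. */
static long sys_futex(uint32_t *uaddr, int op, uint32_t val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

/* Waiter: the "val = *futex; sys_futex(WAIT, futex, val)" side above. */
static void wait_for_flag(void)
{
	uint32_t val;

	while ((val = atomic_load(&futex_word)) == 0) {
		/*
		 * FUTEX_WAIT only sleeps if *uaddr still equals val; the
		 * kernel re-reads it under the hash bucket lock (see
		 * futex_wait_setup()), so a concurrent wake is not lost.
		 */
		sys_futex((uint32_t *)&futex_word, FUTEX_WAIT, val);
	}
}

/* Waker: the "*futex = newval; sys_futex(WAKE, futex)" side above. */
static void set_flag_and_wake(void)
{
	atomic_store(&futex_word, 1);
	sys_futex((uint32_t *)&futex_word, FUTEX_WAKE, 1);
}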
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef DECOMPRESSOR_H
#define DECOMPRESSOR_H
/*
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
 * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * decompressor.h
 */

#include <linux/bio.h>

struct squashfs_decompressor {
	void	*(*init)(struct squashfs_sb_info *, void *);
	void	*(*comp_opts)(struct squashfs_sb_info *, void *, int);
	void	(*free)(void *);
	int	(*decompress)(struct squashfs_sb_info *, void *,
		struct bio *, int, int, struct squashfs_page_actor *);
	int	id;
	char	*name;
	int	alloc_buffer;
	int	supported;
};

static inline void *squashfs_comp_opts(struct squashfs_sb_info *msblk,
					void *buff, int length)
{
	return msblk->decompressor->comp_opts ?
		msblk->decompressor->comp_opts(msblk, buff, length) : NULL;
}

#ifdef CONFIG_SQUASHFS_XZ
extern const struct squashfs_decompressor squashfs_xz_comp_ops;
#endif

#ifdef CONFIG_SQUASHFS_LZ4
extern const struct squashfs_decompressor squashfs_lz4_comp_ops;
#endif

#ifdef CONFIG_SQUASHFS_LZO
extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
#endif

#ifdef CONFIG_SQUASHFS_ZLIB
extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
#endif

#ifdef CONFIG_SQUASHFS_ZSTD
extern const struct squashfs_decompressor squashfs_zstd_comp_ops;
#endif

#endif
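As a hedged sketch of how one of the per-algorithm ops tables declared above is expected to be filled in: every identifier in the body below (demo_init, demo_free, demo_decompress, squashfs_demo_comp_ops, the zero id) is a placeholder, not a real squashfs symbol; the real tables live in files such as zlib_wrapper.c.

/*
 * Hypothetical decompressor wired to the interface above.
 */
#include <linux/bio.h>
#include "squashfs_fs_sb.h"
#include "decompressor.h"

static void *demo_init(struct squashfs_sb_info *msblk, void *comp_opts)
{
	/* Allocate per-mount decompression state, or return ERR_PTR(). */
	return NULL;
}

static void demo_free(void *strm)
{
	/* Release whatever demo_init() allocated. */
}

static int demo_decompress(struct squashfs_sb_info *msblk, void *strm,
	struct bio *bio, int offset, int length,
	struct squashfs_page_actor *output)
{
	/* Decompress 'length' bytes from 'bio' into the page actor. */
	return -EIO;
}

const struct squashfs_decompressor squashfs_demo_comp_ops = {
	.init = demo_init,
	.comp_opts = NULL,	/* squashfs_comp_opts() then returns NULL */
	.free = demo_free,
	.decompress = demo_decompress,
	.id = 0,		/* would be a compression id from squashfs_fs.h */
	.name = "demo",
	.alloc_buffer = 0,
	.supported = 1,
};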
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin
 * cleaned up code to current version of sparse and added the slicing-by-8
 * algorithm to the closely similar existing slicing-by-4 algorithm.
 *
 * Oct 15, 2000 Matt Domsch <Matt_Domsch@dell.com>
 * Nicer crc32 functions/docs submitted by linux@horizon.com. Thanks!
 * Code was from the public domain, copyright abandoned. Code was
 * subsequently included in the kernel, thus was re-licensed under the
 * GNU GPL v2.
 *
 * Oct 12, 2000 Matt Domsch <Matt_Domsch@dell.com>
 * Same crc32 function was used in 5 other places in the kernel.
 * I made one version, and deleted the others.
 * There are various incantations of crc32(). Some use a seed of 0 or ~0.
 * Some xor at the end with ~0. The generic crc32() function takes
 * seed as an argument, and doesn't xor at the end. Then individual
 * users can do whatever they need.
 *   drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0.
 *   fs/jffs2 uses seed 0, doesn't xor with ~0.
 *   fs/partitions/efi.c uses seed ~0, xor's with ~0.
 */

/* see: Documentation/staging/crc32.rst for a description of algorithms */

#include <linux/crc32.h>
#include <linux/export.h>
#include <linux/module.h>
#include <linux/types.h>

#include "crc32table.h"

static inline u32 __maybe_unused crc32_le_base(u32 crc, const u8 *p, size_t len)
{
	while (len--)
		crc = (crc >> 8) ^ crc32table_le[(crc & 255) ^ *p++];
	return crc;
}

static inline u32 __maybe_unused crc32_be_base(u32 crc, const u8 *p, size_t len)
{
	while (len--)
		crc = (crc << 8) ^ crc32table_be[(crc >> 24) ^ *p++];
	return crc;
}

static inline u32 __maybe_unused crc32c_base(u32 crc, const u8 *p, size_t len)
{
	while (len--)
		crc = (crc >> 8) ^ crc32ctable_le[(crc & 255) ^ *p++];
	return crc;
}

#ifdef CONFIG_CRC32_ARCH
#include "crc32.h" /* $(SRCARCH)/crc32.h */

u32 crc32_optimizations(void)
{
	return crc32_optimizations_arch();
}
EXPORT_SYMBOL(crc32_optimizations);
#else
#define crc32_le_arch crc32_le_base
#define crc32_be_arch crc32_be_base
#define crc32c_arch crc32c_base
#endif

u32 crc32_le(u32 crc, const void *p, size_t len)
{
	return crc32_le_arch(crc, p, len);
}
EXPORT_SYMBOL(crc32_le);

u32 crc32_be(u32 crc, const void *p, size_t len)
{
	return crc32_be_arch(crc, p, len);
}
EXPORT_SYMBOL(crc32_be);

u32 crc32c(u32 crc, const void *p, size_t len)
{
	return crc32c_arch(crc, p, len);
}
EXPORT_SYMBOL(crc32c);

#ifdef crc32_mod_init_arch
static int __init crc32_mod_init(void)
{
	crc32_mod_init_arch();
	return 0;
}
subsys_initcall(crc32_mod_init);

static void __exit crc32_mod_exit(void)
{
}
module_exit(crc32_mod_exit);
#endif

MODULE_DESCRIPTION("CRC32 library functions");
MODULE_LICENSE("GPL");
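A short sketch of the seed and final-xor conventions the header comment above describes; the demo_* wrapper names are illustrative and not part of the library.

#include <linux/crc32.h>
#include <linux/types.h>

/* The "standard" CRC-32 most userspace tools print: seed ~0, final xor ~0. */
static u32 demo_crc32(const void *buf, size_t len)
{
	return crc32_le(~0U, buf, len) ^ ~0U;
}

/* Same call pattern with the Castagnoli polynomial (crc32c). */
static u32 demo_crc32c(const void *buf, size_t len)
{
	return crc32c(~0U, buf, len) ^ ~0U;
}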
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 | // SPDX-License-Identifier: GPL-2.0 /* USB Driver layer for GSM modems Copyright (C) 2005 Matthias Urlichs <smurf@smurf.noris.de> Portions copied from the Keyspan driver by Hugh Blemings <hugh@blemings.org> History: see the git log. Work sponsored by: Sigos GmbH, Germany <info@sigos.de> This driver exists because the "normal" serial driver doesn't work too well with GSM modems. 
Issues: - data loss -- one single Receive URB is not nearly enough - controlling the baud rate doesn't make sense */ #define DRIVER_AUTHOR "Matthias Urlichs <smurf@smurf.noris.de>" #define DRIVER_DESC "USB Driver for GSM modems" #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/tty_flip.h> #include <linux/module.h> #include <linux/bitops.h> #include <linux/uaccess.h> #include <linux/usb.h> #include <linux/usb/cdc.h> #include <linux/usb/serial.h> #include <linux/serial.h> #include "usb-wwan.h" /* * Generate DTR/RTS signals on the port using the SET_CONTROL_LINE_STATE request * in CDC ACM. */ static int usb_wwan_send_setup(struct usb_serial_port *port) { struct usb_serial *serial = port->serial; struct usb_wwan_port_private *portdata; int val = 0; int ifnum; int res; portdata = usb_get_serial_port_data(port); if (portdata->dtr_state) val |= USB_CDC_CTRL_DTR; if (portdata->rts_state) val |= USB_CDC_CTRL_RTS; ifnum = serial->interface->cur_altsetting->desc.bInterfaceNumber; res = usb_autopm_get_interface(serial->interface); if (res) return res; res = usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0), USB_CDC_REQ_SET_CONTROL_LINE_STATE, USB_DIR_OUT | USB_TYPE_CLASS | USB_RECIP_INTERFACE, val, ifnum, NULL, 0, USB_CTRL_SET_TIMEOUT); usb_autopm_put_interface(port->serial->interface); return res; } void usb_wwan_dtr_rts(struct usb_serial_port *port, int on) { struct usb_wwan_port_private *portdata; struct usb_wwan_intf_private *intfdata; intfdata = usb_get_serial_data(port->serial); if (!intfdata->use_send_setup) return; portdata = usb_get_serial_port_data(port); /* FIXME: locking */ portdata->rts_state = on; portdata->dtr_state = on; usb_wwan_send_setup(port); } EXPORT_SYMBOL(usb_wwan_dtr_rts); int usb_wwan_tiocmget(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; unsigned int value; struct usb_wwan_port_private *portdata; portdata = usb_get_serial_port_data(port); value = ((portdata->rts_state) ? TIOCM_RTS : 0) | ((portdata->dtr_state) ? TIOCM_DTR : 0) | ((portdata->cts_state) ? TIOCM_CTS : 0) | ((portdata->dsr_state) ? TIOCM_DSR : 0) | ((portdata->dcd_state) ? TIOCM_CAR : 0) | ((portdata->ri_state) ? TIOCM_RNG : 0); return value; } EXPORT_SYMBOL(usb_wwan_tiocmget); int usb_wwan_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) { struct usb_serial_port *port = tty->driver_data; struct usb_wwan_port_private *portdata; struct usb_wwan_intf_private *intfdata; portdata = usb_get_serial_port_data(port); intfdata = usb_get_serial_data(port->serial); if (!intfdata->use_send_setup) return -EINVAL; /* FIXME: what locks portdata fields ? 
*/ if (set & TIOCM_RTS) portdata->rts_state = 1; if (set & TIOCM_DTR) portdata->dtr_state = 1; if (clear & TIOCM_RTS) portdata->rts_state = 0; if (clear & TIOCM_DTR) portdata->dtr_state = 0; return usb_wwan_send_setup(port); } EXPORT_SYMBOL(usb_wwan_tiocmset); int usb_wwan_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count) { struct usb_wwan_port_private *portdata; struct usb_wwan_intf_private *intfdata; int i; int left, todo; struct urb *this_urb = NULL; /* spurious */ int err; unsigned long flags; portdata = usb_get_serial_port_data(port); intfdata = usb_get_serial_data(port->serial); dev_dbg(&port->dev, "%s: write (%d chars)\n", __func__, count); left = count; for (i = 0; left > 0 && i < N_OUT_URB; i++) { todo = left; if (todo > OUT_BUFLEN) todo = OUT_BUFLEN; this_urb = portdata->out_urbs[i]; if (test_and_set_bit(i, &portdata->out_busy)) { if (time_before(jiffies, portdata->tx_start_time[i] + 10 * HZ)) continue; usb_unlink_urb(this_urb); continue; } dev_dbg(&port->dev, "%s: endpoint %d buf %d\n", __func__, usb_pipeendpoint(this_urb->pipe), i); err = usb_autopm_get_interface_async(port->serial->interface); if (err < 0) { clear_bit(i, &portdata->out_busy); break; } /* send the data */ memcpy(this_urb->transfer_buffer, buf, todo); this_urb->transfer_buffer_length = todo; spin_lock_irqsave(&intfdata->susp_lock, flags); if (intfdata->suspended) { usb_anchor_urb(this_urb, &portdata->delayed); spin_unlock_irqrestore(&intfdata->susp_lock, flags); } else { intfdata->in_flight++; spin_unlock_irqrestore(&intfdata->susp_lock, flags); err = usb_submit_urb(this_urb, GFP_ATOMIC); if (err) { dev_err(&port->dev, "%s: submit urb %d failed: %d\n", __func__, i, err); clear_bit(i, &portdata->out_busy); spin_lock_irqsave(&intfdata->susp_lock, flags); intfdata->in_flight--; spin_unlock_irqrestore(&intfdata->susp_lock, flags); usb_autopm_put_interface_async(port->serial->interface); break; } } portdata->tx_start_time[i] = jiffies; buf += todo; left -= todo; } count -= left; dev_dbg(&port->dev, "%s: wrote (did %d)\n", __func__, count); return count; } EXPORT_SYMBOL(usb_wwan_write); static void usb_wwan_indat_callback(struct urb *urb) { int err; int endpoint; struct usb_serial_port *port; struct device *dev; unsigned char *data = urb->transfer_buffer; int status = urb->status; endpoint = usb_pipeendpoint(urb->pipe); port = urb->context; dev = &port->dev; if (status) { dev_dbg(dev, "%s: nonzero status: %d on endpoint %02x.\n", __func__, status, endpoint); /* don't resubmit on fatal errors */ if (status == -ESHUTDOWN || status == -ENOENT) return; } else { if (urb->actual_length) { tty_insert_flip_string(&port->port, data, urb->actual_length); tty_flip_buffer_push(&port->port); } else dev_dbg(dev, "%s: empty read urb received\n", __func__); } /* Resubmit urb so we continue receiving */ err = usb_submit_urb(urb, GFP_ATOMIC); if (err) { if (err != -EPERM && err != -ENODEV) { dev_err(dev, "%s: resubmit read urb failed. 
(%d)\n", __func__, err); /* busy also in error unless we are killed */ usb_mark_last_busy(port->serial->dev); } } else { usb_mark_last_busy(port->serial->dev); } } static void usb_wwan_outdat_callback(struct urb *urb) { struct usb_serial_port *port; struct usb_wwan_port_private *portdata; struct usb_wwan_intf_private *intfdata; unsigned long flags; int i; port = urb->context; intfdata = usb_get_serial_data(port->serial); usb_serial_port_softint(port); usb_autopm_put_interface_async(port->serial->interface); portdata = usb_get_serial_port_data(port); spin_lock_irqsave(&intfdata->susp_lock, flags); intfdata->in_flight--; spin_unlock_irqrestore(&intfdata->susp_lock, flags); for (i = 0; i < N_OUT_URB; ++i) { if (portdata->out_urbs[i] == urb) { smp_mb__before_atomic(); clear_bit(i, &portdata->out_busy); break; } } } unsigned int usb_wwan_write_room(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct usb_wwan_port_private *portdata; int i; unsigned int data_len = 0; struct urb *this_urb; portdata = usb_get_serial_port_data(port); for (i = 0; i < N_OUT_URB; i++) { this_urb = portdata->out_urbs[i]; if (this_urb && !test_bit(i, &portdata->out_busy)) data_len += OUT_BUFLEN; } dev_dbg(&port->dev, "%s: %u\n", __func__, data_len); return data_len; } EXPORT_SYMBOL(usb_wwan_write_room); unsigned int usb_wwan_chars_in_buffer(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct usb_wwan_port_private *portdata; int i; unsigned int data_len = 0; struct urb *this_urb; portdata = usb_get_serial_port_data(port); for (i = 0; i < N_OUT_URB; i++) { this_urb = portdata->out_urbs[i]; /* FIXME: This locking is insufficient as this_urb may go unused during the test */ if (this_urb && test_bit(i, &portdata->out_busy)) data_len += this_urb->transfer_buffer_length; } dev_dbg(&port->dev, "%s: %u\n", __func__, data_len); return data_len; } EXPORT_SYMBOL(usb_wwan_chars_in_buffer); int usb_wwan_open(struct tty_struct *tty, struct usb_serial_port *port) { struct usb_wwan_port_private *portdata; struct usb_wwan_intf_private *intfdata; struct usb_serial *serial = port->serial; int i, err; struct urb *urb; portdata = usb_get_serial_port_data(port); intfdata = usb_get_serial_data(serial); if (port->interrupt_in_urb) { err = usb_submit_urb(port->interrupt_in_urb, GFP_KERNEL); if (err) { dev_err(&port->dev, "%s: submit int urb failed: %d\n", __func__, err); } } /* Start reading from the IN endpoint */ for (i = 0; i < N_IN_URB; i++) { urb = portdata->in_urbs[i]; if (!urb) continue; err = usb_submit_urb(urb, GFP_KERNEL); if (err) { dev_err(&port->dev, "%s: submit read urb %d failed: %d\n", __func__, i, err); } } spin_lock_irq(&intfdata->susp_lock); if (++intfdata->open_ports == 1) serial->interface->needs_remote_wakeup = 1; spin_unlock_irq(&intfdata->susp_lock); /* this balances a get in the generic USB serial code */ usb_autopm_put_interface(serial->interface); return 0; } EXPORT_SYMBOL(usb_wwan_open); static void unbusy_queued_urb(struct urb *urb, struct usb_wwan_port_private *portdata) { int i; for (i = 0; i < N_OUT_URB; i++) { if (urb == portdata->out_urbs[i]) { clear_bit(i, &portdata->out_busy); break; } } } void usb_wwan_close(struct usb_serial_port *port) { int i; struct usb_serial *serial = port->serial; struct usb_wwan_port_private *portdata; struct usb_wwan_intf_private *intfdata = usb_get_serial_data(serial); struct urb *urb; portdata = usb_get_serial_port_data(port); /* * Need to take susp_lock to make sure port is not already being * resumed, but no need to hold 
it due to the tty-port initialized * flag. */ spin_lock_irq(&intfdata->susp_lock); if (--intfdata->open_ports == 0) serial->interface->needs_remote_wakeup = 0; spin_unlock_irq(&intfdata->susp_lock); for (;;) { urb = usb_get_from_anchor(&portdata->delayed); if (!urb) break; unbusy_queued_urb(urb, portdata); usb_autopm_put_interface_async(serial->interface); } for (i = 0; i < N_IN_URB; i++) usb_kill_urb(portdata->in_urbs[i]); for (i = 0; i < N_OUT_URB; i++) usb_kill_urb(portdata->out_urbs[i]); usb_kill_urb(port->interrupt_in_urb); usb_autopm_get_interface_no_resume(serial->interface); } EXPORT_SYMBOL(usb_wwan_close); static struct urb *usb_wwan_setup_urb(struct usb_serial_port *port, int endpoint, int dir, void *ctx, char *buf, int len, void (*callback) (struct urb *)) { struct usb_serial *serial = port->serial; struct usb_wwan_intf_private *intfdata = usb_get_serial_data(serial); struct urb *urb; urb = usb_alloc_urb(0, GFP_KERNEL); /* No ISO */ if (!urb) return NULL; usb_fill_bulk_urb(urb, serial->dev, usb_sndbulkpipe(serial->dev, endpoint) | dir, buf, len, callback, ctx); if (intfdata->use_zlp && dir == USB_DIR_OUT) urb->transfer_flags |= URB_ZERO_PACKET; return urb; } int usb_wwan_port_probe(struct usb_serial_port *port) { struct usb_wwan_port_private *portdata; struct urb *urb; u8 *buffer; int i; if (!port->bulk_in_size || !port->bulk_out_size) return -ENODEV; portdata = kzalloc_obj(*portdata); if (!portdata) return -ENOMEM; init_usb_anchor(&portdata->delayed); for (i = 0; i < N_IN_URB; i++) { buffer = (u8 *)__get_free_page(GFP_KERNEL); if (!buffer) goto bail_out_error; portdata->in_buffer[i] = buffer; urb = usb_wwan_setup_urb(port, port->bulk_in_endpointAddress, USB_DIR_IN, port, buffer, IN_BUFLEN, usb_wwan_indat_callback); portdata->in_urbs[i] = urb; } for (i = 0; i < N_OUT_URB; i++) { buffer = kmalloc(OUT_BUFLEN, GFP_KERNEL); if (!buffer) goto bail_out_error2; portdata->out_buffer[i] = buffer; urb = usb_wwan_setup_urb(port, port->bulk_out_endpointAddress, USB_DIR_OUT, port, buffer, OUT_BUFLEN, usb_wwan_outdat_callback); portdata->out_urbs[i] = urb; } usb_set_serial_port_data(port, portdata); return 0; bail_out_error2: for (i = 0; i < N_OUT_URB; i++) { usb_free_urb(portdata->out_urbs[i]); kfree(portdata->out_buffer[i]); } bail_out_error: for (i = 0; i < N_IN_URB; i++) { usb_free_urb(portdata->in_urbs[i]); free_page((unsigned long)portdata->in_buffer[i]); } kfree(portdata); return -ENOMEM; } EXPORT_SYMBOL_GPL(usb_wwan_port_probe); void usb_wwan_port_remove(struct usb_serial_port *port) { int i; struct usb_wwan_port_private *portdata; portdata = usb_get_serial_port_data(port); usb_set_serial_port_data(port, NULL); for (i = 0; i < N_IN_URB; i++) { usb_free_urb(portdata->in_urbs[i]); free_page((unsigned long)portdata->in_buffer[i]); } for (i = 0; i < N_OUT_URB; i++) { usb_free_urb(portdata->out_urbs[i]); kfree(portdata->out_buffer[i]); } kfree(portdata); } EXPORT_SYMBOL(usb_wwan_port_remove); #ifdef CONFIG_PM static void stop_urbs(struct usb_serial *serial) { int i, j; struct usb_serial_port *port; struct usb_wwan_port_private *portdata; for (i = 0; i < serial->num_ports; ++i) { port = serial->port[i]; portdata = usb_get_serial_port_data(port); if (!portdata) continue; for (j = 0; j < N_IN_URB; j++) usb_kill_urb(portdata->in_urbs[j]); for (j = 0; j < N_OUT_URB; j++) usb_kill_urb(portdata->out_urbs[j]); usb_kill_urb(port->interrupt_in_urb); } } int usb_wwan_suspend(struct usb_serial *serial, pm_message_t message) { struct usb_wwan_intf_private *intfdata = usb_get_serial_data(serial); 
spin_lock_irq(&intfdata->susp_lock); if (PMSG_IS_AUTO(message)) { if (intfdata->in_flight) { spin_unlock_irq(&intfdata->susp_lock); return -EBUSY; } } intfdata->suspended = 1; spin_unlock_irq(&intfdata->susp_lock); stop_urbs(serial); return 0; } EXPORT_SYMBOL(usb_wwan_suspend); /* Caller must hold susp_lock. */ static int usb_wwan_submit_delayed_urbs(struct usb_serial_port *port) { struct usb_serial *serial = port->serial; struct usb_wwan_intf_private *data = usb_get_serial_data(serial); struct usb_wwan_port_private *portdata; struct urb *urb; int err_count = 0; int err; portdata = usb_get_serial_port_data(port); for (;;) { urb = usb_get_from_anchor(&portdata->delayed); if (!urb) break; err = usb_submit_urb(urb, GFP_ATOMIC); if (err) { dev_err(&port->dev, "%s: submit urb failed: %d\n", __func__, err); err_count++; unbusy_queued_urb(urb, portdata); usb_autopm_put_interface_async(serial->interface); continue; } data->in_flight++; } if (err_count) return -EIO; return 0; } int usb_wwan_resume(struct usb_serial *serial) { int i, j; struct usb_serial_port *port; struct usb_wwan_intf_private *intfdata = usb_get_serial_data(serial); struct usb_wwan_port_private *portdata; struct urb *urb; int err; int err_count = 0; spin_lock_irq(&intfdata->susp_lock); for (i = 0; i < serial->num_ports; i++) { port = serial->port[i]; if (!tty_port_initialized(&port->port)) continue; portdata = usb_get_serial_port_data(port); if (port->interrupt_in_urb) { err = usb_submit_urb(port->interrupt_in_urb, GFP_ATOMIC); if (err) { dev_err(&port->dev, "%s: submit int urb failed: %d\n", __func__, err); err_count++; } } err = usb_wwan_submit_delayed_urbs(port); if (err) err_count++; for (j = 0; j < N_IN_URB; j++) { urb = portdata->in_urbs[j]; err = usb_submit_urb(urb, GFP_ATOMIC); if (err < 0) { dev_err(&port->dev, "%s: submit read urb %d failed: %d\n", __func__, i, err); err_count++; } } } intfdata->suspended = 0; spin_unlock_irq(&intfdata->susp_lock); if (err_count) return -EIO; return 0; } EXPORT_SYMBOL(usb_wwan_resume); #endif MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL v2"); |
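A sketch, in the spirit of drivers/usb/serial/option.c, of how a modem driver plugs the exported usb_wwan helpers above into its usb_serial_driver. The vendor/product ID and the names are placeholders, and the attach() callback that would allocate a struct usb_wwan_intf_private is deliberately omitted; this is an illustration, not a real driver.

#include <linux/module.h>
#include <linux/usb.h>
#include <linux/usb/serial.h>
#include "usb-wwan.h"

static const struct usb_device_id demo_ids[] = {
	{ USB_DEVICE(0x1234, 0x5678) },	/* placeholder VID/PID */
	{ }
};
MODULE_DEVICE_TABLE(usb, demo_ids);

/*
 * A complete driver also provides an attach() that allocates a
 * struct usb_wwan_intf_private (configuring use_send_setup/use_zlp)
 * and stores it with usb_set_serial_data(); omitted here.
 */
static struct usb_serial_driver demo_wwan_device = {
	.driver = {
		.owner =	THIS_MODULE,
		.name =		"demo_wwan",
	},
	.description =		"demo GSM modem (1 port)",
	.id_table =		demo_ids,
	.num_ports =		1,
	.open =			usb_wwan_open,
	.close =		usb_wwan_close,
	.dtr_rts =		usb_wwan_dtr_rts,
	.write =		usb_wwan_write,
	.write_room =		usb_wwan_write_room,
	.chars_in_buffer =	usb_wwan_chars_in_buffer,
	.tiocmget =		usb_wwan_tiocmget,
	.tiocmset =		usb_wwan_tiocmset,
	.port_probe =		usb_wwan_port_probe,
	.port_remove =		usb_wwan_port_remove,
#ifdef CONFIG_PM
	.suspend =		usb_wwan_suspend,
	.resume =		usb_wwan_resume,
#endif
};

static struct usb_serial_driver * const serial_drivers[] = {
	&demo_wwan_device, NULL
};

module_usb_serial_driver(serial_drivers, demo_ids);

MODULE_DESCRIPTION("usb_wwan wiring sketch");
MODULE_LICENSE("GPL v2");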
903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 | // SPDX-License-Identifier: GPL-2.0 /* * Released under the GPLv2 only. */ #include <linux/module.h> #include <linux/string.h> #include <linux/bitops.h> #include <linux/slab.h> #include <linux/log2.h> #include <linux/kmsan.h> #include <linux/usb.h> #include <linux/wait.h> #include <linux/usb/hcd.h> #include <linux/scatterlist.h> #define to_urb(d) container_of(d, struct urb, kref) static void urb_destroy(struct kref *kref) { struct urb *urb = to_urb(kref); if (urb->transfer_flags & URB_FREE_BUFFER) kfree(urb->transfer_buffer); kfree(urb); } /** * usb_init_urb - initializes a urb so that it can be used by a USB driver * @urb: pointer to the urb to initialize * * Initializes a urb so that the USB subsystem can use it properly. * * If a urb is created with a call to usb_alloc_urb() it is not * necessary to call this function. Only use this if you allocate the * space for a struct urb on your own. If you call this function, be * careful when freeing the memory for your urb that it is no longer in * use by the USB core. * * Only use this function if you _really_ understand what you are doing. */ void usb_init_urb(struct urb *urb) { if (urb) { memset(urb, 0, sizeof(*urb)); kref_init(&urb->kref); INIT_LIST_HEAD(&urb->urb_list); INIT_LIST_HEAD(&urb->anchor_list); } } EXPORT_SYMBOL_GPL(usb_init_urb); /** * usb_alloc_urb - creates a new urb for a USB driver to use * @iso_packets: number of iso packets for this urb * @mem_flags: the type of memory to allocate, see kmalloc() for a list of * valid options for this. * * Creates an urb for the USB driver to use, initializes a few internal * structures, increments the usage counter, and returns a pointer to it. * * If the driver want to use this urb for interrupt, control, or bulk * endpoints, pass '0' as the number of iso packets. * * The driver must call usb_free_urb() when it is finished with the urb. * * Return: A pointer to the new urb, or %NULL if no memory is available. */ struct urb *usb_alloc_urb(int iso_packets, gfp_t mem_flags) { struct urb *urb; urb = kmalloc_flex(*urb, iso_frame_desc, iso_packets, mem_flags); if (!urb) return NULL; usb_init_urb(urb); return urb; } EXPORT_SYMBOL_GPL(usb_alloc_urb); /** * usb_free_urb - frees the memory used by a urb when all users of it are finished * @urb: pointer to the urb to free, may be NULL * * Must be called when a user of a urb is finished with it. When the last user * of the urb calls this function, the memory of the urb is freed. * * Note: The transfer buffer associated with the urb is not freed unless the * URB_FREE_BUFFER transfer flag is set. */ void usb_free_urb(struct urb *urb) { if (urb) kref_put(&urb->kref, urb_destroy); } EXPORT_SYMBOL_GPL(usb_free_urb); /** * usb_get_urb - increments the reference count of the urb * @urb: pointer to the urb to modify, may be NULL * * This must be called whenever a urb is transferred from a device driver to a * host controller driver. This allows proper reference counting to happen * for urbs. * * Return: A pointer to the urb with the incremented reference counter. 
*/ struct urb *usb_get_urb(struct urb *urb) { if (urb) kref_get(&urb->kref); return urb; } EXPORT_SYMBOL_GPL(usb_get_urb); /** * usb_anchor_urb - anchors an URB while it is processed * @urb: pointer to the urb to anchor * @anchor: pointer to the anchor * * This can be called to have access to URBs which are to be executed * without bothering to track them */ void usb_anchor_urb(struct urb *urb, struct usb_anchor *anchor) { unsigned long flags; spin_lock_irqsave(&anchor->lock, flags); usb_get_urb(urb); list_add_tail(&urb->anchor_list, &anchor->urb_list); urb->anchor = anchor; if (unlikely(anchor->poisoned)) atomic_inc(&urb->reject); spin_unlock_irqrestore(&anchor->lock, flags); } EXPORT_SYMBOL_GPL(usb_anchor_urb); static int usb_anchor_check_wakeup(struct usb_anchor *anchor) { return atomic_read(&anchor->suspend_wakeups) == 0 && list_empty(&anchor->urb_list); } /* Callers must hold anchor->lock */ static void __usb_unanchor_urb(struct urb *urb, struct usb_anchor *anchor) { urb->anchor = NULL; list_del(&urb->anchor_list); usb_put_urb(urb); if (usb_anchor_check_wakeup(anchor)) wake_up(&anchor->wait); } /** * usb_unanchor_urb - unanchors an URB * @urb: pointer to the urb to anchor * * Call this to stop the system keeping track of this URB */ void usb_unanchor_urb(struct urb *urb) { unsigned long flags; struct usb_anchor *anchor; if (!urb) return; anchor = urb->anchor; if (!anchor) return; spin_lock_irqsave(&anchor->lock, flags); /* * At this point, we could be competing with another thread which * has the same intention. To protect the urb from being unanchored * twice, only the winner of the race gets the job. */ if (likely(anchor == urb->anchor)) __usb_unanchor_urb(urb, anchor); spin_unlock_irqrestore(&anchor->lock, flags); } EXPORT_SYMBOL_GPL(usb_unanchor_urb); /*-------------------------------------------------------------------*/ static const int pipetypes[4] = { PIPE_CONTROL, PIPE_ISOCHRONOUS, PIPE_BULK, PIPE_INTERRUPT }; /** * usb_pipe_type_check - sanity check of a specific pipe for a usb device * @dev: struct usb_device to be checked * @pipe: pipe to check * * This performs a light-weight sanity check for the endpoint in the * given usb device. It returns 0 if the pipe is valid for the specific usb * device, otherwise a negative error code. */ int usb_pipe_type_check(struct usb_device *dev, unsigned int pipe) { const struct usb_host_endpoint *ep; ep = usb_pipe_endpoint(dev, pipe); if (!ep) return -EINVAL; if (usb_pipetype(pipe) != pipetypes[usb_endpoint_type(&ep->desc)]) return -EINVAL; return 0; } EXPORT_SYMBOL_GPL(usb_pipe_type_check); /** * usb_urb_ep_type_check - sanity check of endpoint in the given urb * @urb: urb to be checked * * This performs a light-weight sanity check for the endpoint in the * given urb. It returns 0 if the urb contains a valid endpoint, otherwise * a negative error code. */ int usb_urb_ep_type_check(const struct urb *urb) { return usb_pipe_type_check(urb->dev, urb->pipe); } EXPORT_SYMBOL_GPL(usb_urb_ep_type_check); /** * usb_submit_urb - issue an asynchronous transfer request for an endpoint * @urb: pointer to the urb describing the request * @mem_flags: the type of memory to allocate, see kmalloc() for a list * of valid options for this. * * This submits a transfer request, and transfers control of the URB * describing that request to the USB subsystem. Request completion will * be indicated later, asynchronously, by calling the completion handler. 
* The three types of completion are success, error, and unlink * (a software-induced fault, also called "request cancellation"). * * URBs may be submitted in interrupt context. * * The caller must have correctly initialized the URB before submitting * it. Functions such as usb_fill_bulk_urb() and usb_fill_control_urb() are * available to ensure that most fields are correctly initialized, for * the particular kind of transfer, although they will not initialize * any transfer flags. * * If the submission is successful, the complete() callback from the URB * will be called exactly once, when the USB core and Host Controller Driver * (HCD) are finished with the URB. When the completion function is called, * control of the URB is returned to the device driver which issued the * request. The completion handler may then immediately free or reuse that * URB. * * With few exceptions, USB device drivers should never access URB fields * provided by usbcore or the HCD until its complete() is called. * The exceptions relate to periodic transfer scheduling. For both * interrupt and isochronous urbs, as part of successful URB submission * urb->interval is modified to reflect the actual transfer period used * (normally some power of two units). And for isochronous urbs, * urb->start_frame is modified to reflect when the URB's transfers were * scheduled to start. * * Not all isochronous transfer scheduling policies will work, but most * host controller drivers should easily handle ISO queues going from now * until 10-200 msec into the future. Drivers should try to keep at * least one or two msec of data in the queue; many controllers require * that new transfers start at least 1 msec in the future when they are * added. If the driver is unable to keep up and the queue empties out, * the behavior for new submissions is governed by the URB_ISO_ASAP flag. * If the flag is set, or if the queue is idle, then the URB is always * assigned to the first available (and not yet expired) slot in the * endpoint's schedule. If the flag is not set and the queue is active * then the URB is always assigned to the next slot in the schedule * following the end of the endpoint's previous URB, even if that slot is * in the past. When a packet is assigned in this way to a slot that has * already expired, the packet is not transmitted and the corresponding * usb_iso_packet_descriptor's status field will return -EXDEV. If this * would happen to all the packets in the URB, submission fails with a * -EXDEV error code. * * For control endpoints, the synchronous usb_control_msg() call is * often used (in non-interrupt context) instead of this call. * That is often used through convenience wrappers, for the requests * that are standardized in the USB 2.0 specification. For bulk * endpoints, a synchronous usb_bulk_msg() call is available. * * Return: * 0 on successful submissions. A negative error number otherwise. * * Request Queuing: * * URBs may be submitted to endpoints before previous ones complete, to * minimize the impact of interrupt latencies and system overhead on data * throughput. With that queuing policy, an endpoint's queue would never * be empty. This is required for continuous isochronous data streams, * and may also be required for some kinds of interrupt transfers. Such * queuing also maximizes bandwidth utilization by letting USB controllers * start work on later requests before driver software has finished the * completion processing for earlier (successful) requests. 
* * As of Linux 2.6, all USB endpoint transfer queues support depths greater * than one. This was previously a HCD-specific behavior, except for ISO * transfers. Non-isochronous endpoint queues are inactive during cleanup * after faults (transfer errors or cancellation). * * Reserved Bandwidth Transfers: * * Periodic transfers (interrupt or isochronous) are performed repeatedly, * using the interval specified in the urb. Submitting the first urb to * the endpoint reserves the bandwidth necessary to make those transfers. * If the USB subsystem can't allocate sufficient bandwidth to perform * the periodic request, submitting such a periodic request should fail. * * For devices under xHCI, the bandwidth is reserved at configuration time, or * when the alt setting is selected. If there is not enough bus bandwidth, the * configuration/alt setting request will fail. Therefore, submissions to * periodic endpoints on devices under xHCI should never fail due to bandwidth * constraints. * * Device drivers must explicitly request that repetition, by ensuring that * some URB is always on the endpoint's queue (except possibly for short * periods during completion callbacks). When there is no longer an urb * queued, the endpoint's bandwidth reservation is canceled. This means * drivers can use their completion handlers to ensure they keep bandwidth * they need, by reinitializing and resubmitting the just-completed urb * until the driver longer needs that periodic bandwidth. * * Memory Flags: * * The general rules for how to decide which mem_flags to use * are the same as for kmalloc. There are four * different possible values; GFP_KERNEL, GFP_NOFS, GFP_NOIO and * GFP_ATOMIC. * * GFP_NOFS is not ever used, as it has not been implemented yet. * * GFP_ATOMIC is used when * (a) you are inside a completion handler, an interrupt, bottom half, * tasklet or timer, or * (b) you are holding a spinlock or rwlock (does not apply to * semaphores), or * (c) current->state != TASK_RUNNING, this is the case only after * you've changed it. * * GFP_NOIO is used in the block io path and error handling of storage * devices. * * All other situations use GFP_KERNEL. * * Some more specific rules for mem_flags can be inferred, such as * (1) start_xmit, timeout, and receive methods of network drivers must * use GFP_ATOMIC (they are called with a spinlock held); * (2) queuecommand methods of scsi drivers must use GFP_ATOMIC (also * called with a spinlock held); * (3) If you use a kernel thread with a network driver you must use * GFP_NOIO, unless (b) or (c) apply; * (4) after you have done a down() you can use GFP_KERNEL, unless (b) or (c) * apply or your are in a storage driver's block io path; * (5) USB probe and disconnect can use GFP_KERNEL unless (b) or (c) apply; and * (6) changing firmware on a running storage or net device uses * GFP_NOIO, unless b) or c) apply * */ int usb_submit_urb(struct urb *urb, gfp_t mem_flags) { int xfertype, max; struct usb_device *dev; struct usb_host_endpoint *ep; int is_out; unsigned int allowed; bool is_eusb2_isoch_double; if (!urb || !urb->complete) return -EINVAL; if (urb->hcpriv) { WARN_ONCE(1, "URB %p submitted while active\n", urb); return -EBUSY; } dev = urb->dev; if ((!dev) || (dev->state < USB_STATE_UNAUTHENTICATED)) return -ENODEV; /* For now, get the endpoint from the pipe. Eventually drivers * will be required to set urb->ep directly and we will eliminate * urb->pipe. 
*/ ep = usb_pipe_endpoint(dev, urb->pipe); if (!ep) return -ENOENT; urb->ep = ep; urb->status = -EINPROGRESS; urb->actual_length = 0; /* Lots of sanity checks, so HCDs can rely on clean data * and don't need to duplicate tests */ xfertype = usb_endpoint_type(&ep->desc); if (xfertype == USB_ENDPOINT_XFER_CONTROL) { struct usb_ctrlrequest *setup = (struct usb_ctrlrequest *) urb->setup_packet; if (!setup) return -ENOEXEC; is_out = !(setup->bRequestType & USB_DIR_IN) || !setup->wLength; dev_WARN_ONCE(&dev->dev, (usb_pipeout(urb->pipe) != is_out), "BOGUS control dir, pipe %x doesn't match bRequestType %x\n", urb->pipe, setup->bRequestType); if (le16_to_cpu(setup->wLength) != urb->transfer_buffer_length) { dev_dbg(&dev->dev, "BOGUS control len %d doesn't match transfer length %d\n", le16_to_cpu(setup->wLength), urb->transfer_buffer_length); return -EBADR; } } else { is_out = usb_endpoint_dir_out(&ep->desc); } /* Clear the internal flags and cache the direction for later use */ urb->transfer_flags &= ~(URB_DIR_MASK | URB_DMA_MAP_SINGLE | URB_DMA_MAP_PAGE | URB_DMA_MAP_SG | URB_MAP_LOCAL | URB_SETUP_MAP_SINGLE | URB_SETUP_MAP_LOCAL | URB_DMA_SG_COMBINED); urb->transfer_flags |= (is_out ? URB_DIR_OUT : URB_DIR_IN); kmsan_handle_urb(urb, is_out); if (xfertype != USB_ENDPOINT_XFER_CONTROL && dev->state < USB_STATE_CONFIGURED) return -ENODEV; max = usb_endpoint_maxp(&ep->desc); is_eusb2_isoch_double = usb_endpoint_is_hs_isoc_double(dev, ep); if (!max && !is_eusb2_isoch_double) { dev_dbg(&dev->dev, "bogus endpoint ep%d%s in %s (bad maxpacket %d)\n", usb_endpoint_num(&ep->desc), is_out ? "out" : "in", __func__, max); return -EMSGSIZE; } /* periodic transfers limit size per frame/uframe, * but drivers only control those sizes for ISO. * while we're checking, initialize return status. */ if (xfertype == USB_ENDPOINT_XFER_ISOC) { int n, len; /* SuperSpeed isoc endpoints have up to 16 bursts of up to * 3 packets each */ if (dev->speed >= USB_SPEED_SUPER) { int burst = 1 + ep->ss_ep_comp.bMaxBurst; int mult = USB_SS_MULT(ep->ss_ep_comp.bmAttributes); max *= burst; max *= mult; } if (dev->speed == USB_SPEED_SUPER_PLUS && USB_SS_SSP_ISOC_COMP(ep->ss_ep_comp.bmAttributes)) { struct usb_ssp_isoc_ep_comp_descriptor *isoc_ep_comp; isoc_ep_comp = &ep->ssp_isoc_ep_comp; max = le32_to_cpu(isoc_ep_comp->dwBytesPerInterval); } /* High speed, 1-3 packets/uframe, max 6 for eUSB2 double bw */ if (dev->speed == USB_SPEED_HIGH) { if (is_eusb2_isoch_double) max = le32_to_cpu(ep->eusb2_isoc_ep_comp.dwBytesPerInterval); else max *= usb_endpoint_maxp_mult(&ep->desc); } if (urb->number_of_packets <= 0) return -EINVAL; for (n = 0; n < urb->number_of_packets; n++) { len = urb->iso_frame_desc[n].length; if (len < 0 || len > max) return -EMSGSIZE; urb->iso_frame_desc[n].status = -EXDEV; urb->iso_frame_desc[n].actual_length = 0; } } else if (urb->num_sgs && !urb->dev->bus->no_sg_constraint) { struct scatterlist *sg; int i; for_each_sg(urb->sg, sg, urb->num_sgs - 1, i) if (sg->length % max) return -EINVAL; } /* the I/O buffer must be mapped/unmapped, except when length=0 */ if (urb->transfer_buffer_length > INT_MAX) return -EMSGSIZE; /* * stuff that drivers shouldn't do, but which shouldn't * cause problems in HCDs if they get it wrong. 
*/ /* Check that the pipe's type matches the endpoint's type */ if (usb_pipe_type_check(urb->dev, urb->pipe)) dev_warn_once(&dev->dev, "BOGUS urb xfer, pipe %x != type %x\n", usb_pipetype(urb->pipe), pipetypes[xfertype]); /* Check against a simple/standard policy */ allowed = (URB_NO_TRANSFER_DMA_MAP | URB_NO_INTERRUPT | URB_DIR_MASK | URB_FREE_BUFFER); switch (xfertype) { case USB_ENDPOINT_XFER_BULK: case USB_ENDPOINT_XFER_INT: if (is_out) allowed |= URB_ZERO_PACKET; fallthrough; default: /* all non-iso endpoints */ if (!is_out) allowed |= URB_SHORT_NOT_OK; break; case USB_ENDPOINT_XFER_ISOC: allowed |= URB_ISO_ASAP; break; } allowed &= urb->transfer_flags; /* warn if submitter gave bogus flags */ if (allowed != urb->transfer_flags) dev_WARN(&dev->dev, "BOGUS urb flags, %x --> %x\n", urb->transfer_flags, allowed); /* * Force periodic transfer intervals to be legal values that are * a power of two (so HCDs don't need to). * * FIXME want bus->{intr,iso}_sched_horizon values here. Each HC * supports different values... this uses EHCI/UHCI defaults (and * EHCI can use smaller non-default values). */ switch (xfertype) { case USB_ENDPOINT_XFER_ISOC: case USB_ENDPOINT_XFER_INT: /* too small? */ if (urb->interval <= 0) return -EINVAL; /* too big? */ switch (dev->speed) { case USB_SPEED_SUPER_PLUS: case USB_SPEED_SUPER: /* units are 125us */ /* Handle up to 2^(16-1) microframes */ if (urb->interval > (1 << 15)) return -EINVAL; max = 1 << 15; break; case USB_SPEED_HIGH: /* units are microframes */ /* NOTE usb handles 2^15 */ if (urb->interval > (1024 * 8)) urb->interval = 1024 * 8; max = 1024 * 8; break; case USB_SPEED_FULL: /* units are frames/msec */ case USB_SPEED_LOW: if (xfertype == USB_ENDPOINT_XFER_INT) { if (urb->interval > 255) return -EINVAL; /* NOTE ohci only handles up to 32 */ max = 128; } else { if (urb->interval > 1024) urb->interval = 1024; /* NOTE usb and ohci handle up to 2^15 */ max = 1024; } break; default: return -EINVAL; } /* Round down to a power of 2, no more than max */ urb->interval = min(max, 1 << ilog2(urb->interval)); } return usb_hcd_submit_urb(urb, mem_flags); } EXPORT_SYMBOL_GPL(usb_submit_urb); /*-------------------------------------------------------------------*/ /** * usb_unlink_urb - abort/cancel a transfer request for an endpoint * @urb: pointer to urb describing a previously submitted request, * may be NULL * * This routine cancels an in-progress request. URBs complete only once * per submission, and may be canceled only once per submission. * Successful cancellation means termination of @urb will be expedited * and the completion handler will be called with a status code * indicating that the request has been canceled (rather than any other * code). * * Drivers should not call this routine or related routines, such as * usb_kill_urb(), after their disconnect method has returned. The * disconnect function should synchronize with a driver's I/O routines * to insure that all URB-related activity has completed before it returns. * * This request is asynchronous, however the HCD might call the ->complete() * callback during unlink. Therefore when drivers call usb_unlink_urb(), they * must not hold any locks that may be taken by the completion function. * Success is indicated by returning -EINPROGRESS, at which time the URB will * probably not yet have been given back to the device driver. When it is * eventually called, the completion function will see @urb->status == * -ECONNRESET. * Failure is indicated by usb_unlink_urb() returning any other value. 
* Unlinking will fail when @urb is not currently "linked" (i.e., it was * never submitted, or it was unlinked before, or the hardware is already * finished with it), even if the completion handler has not yet run. * * The URB must not be deallocated while this routine is running. In * particular, when a driver calls this routine, it must insure that the * completion handler cannot deallocate the URB. * * Return: -EINPROGRESS on success. See description for other values on * failure. * * Unlinking and Endpoint Queues: * * [The behaviors and guarantees described below do not apply to virtual * root hubs but only to endpoint queues for physical USB devices.] * * Host Controller Drivers (HCDs) place all the URBs for a particular * endpoint in a queue. Normally the queue advances as the controller * hardware processes each request. But when an URB terminates with an * error its queue generally stops (see below), at least until that URB's * completion routine returns. It is guaranteed that a stopped queue * will not restart until all its unlinked URBs have been fully retired, * with their completion routines run, even if that's not until some time * after the original completion handler returns. The same behavior and * guarantee apply when an URB terminates because it was unlinked. * * Bulk and interrupt endpoint queues are guaranteed to stop whenever an * URB terminates with any sort of error, including -ECONNRESET, -ENOENT, * and -EREMOTEIO. Control endpoint queues behave the same way except * that they are not guaranteed to stop for -EREMOTEIO errors. Queues * for isochronous endpoints are treated differently, because they must * advance at fixed rates. Such queues do not stop when an URB * encounters an error or is unlinked. An unlinked isochronous URB may * leave a gap in the stream of packets; it is undefined whether such * gaps can be filled in. * * Note that early termination of an URB because a short packet was * received will generate a -EREMOTEIO error if and only if the * URB_SHORT_NOT_OK flag is set. By setting this flag, USB device * drivers can build deep queues for large or complex bulk transfers * and clean them up reliably after any sort of aborted transfer by * unlinking all pending URBs at the first fault. * * When a control URB terminates with an error other than -EREMOTEIO, it * is quite likely that the status stage of the transfer will not take * place. */ int usb_unlink_urb(struct urb *urb) { if (!urb) return -EINVAL; if (!urb->dev) return -ENODEV; if (!urb->ep) return -EIDRM; return usb_hcd_unlink_urb(urb, -ECONNRESET); } EXPORT_SYMBOL_GPL(usb_unlink_urb); /** * usb_kill_urb - cancel a transfer request and wait for it to finish * @urb: pointer to URB describing a previously submitted request, * may be NULL * * This routine cancels an in-progress request. It is guaranteed that * upon return all completion handlers will have finished and the URB * will be totally idle and available for reuse. These features make * this an ideal way to stop I/O in a disconnect() callback or close() * function. If the request has not already finished or been unlinked * the completion handler will see urb->status == -ENOENT. * * While the routine is running, attempts to resubmit the URB will fail * with error -EPERM. Thus even if the URB's completion handler always * tries to resubmit, it will not succeed and the URB will become idle. * * The URB must not be deallocated while this routine is running. 
In * particular, when a driver calls this routine, it must insure that the * completion handler cannot deallocate the URB. * * This routine may not be used in an interrupt context (such as a bottom * half or a completion handler), or when holding a spinlock, or in other * situations where the caller can't schedule(). * * This routine should not be called by a driver after its disconnect * method has returned. */ void usb_kill_urb(struct urb *urb) { might_sleep(); if (!(urb && urb->dev && urb->ep)) return; atomic_inc(&urb->reject); /* * Order the write of urb->reject above before the read * of urb->use_count below. Pairs with the barriers in * __usb_hcd_giveback_urb() and usb_hcd_submit_urb(). */ smp_mb__after_atomic(); usb_hcd_unlink_urb(urb, -ENOENT); wait_event(usb_kill_urb_queue, atomic_read(&urb->use_count) == 0); atomic_dec(&urb->reject); } EXPORT_SYMBOL_GPL(usb_kill_urb); /** * usb_poison_urb - reliably kill a transfer and prevent further use of an URB * @urb: pointer to URB describing a previously submitted request, * may be NULL * * This routine cancels an in-progress request. It is guaranteed that * upon return all completion handlers will have finished and the URB * will be totally idle and cannot be reused. These features make * this an ideal way to stop I/O in a disconnect() callback. * If the request has not already finished or been unlinked * the completion handler will see urb->status == -ENOENT. * * After and while the routine runs, attempts to resubmit the URB will fail * with error -EPERM. Thus even if the URB's completion handler always * tries to resubmit, it will not succeed and the URB will become idle. * * The URB must not be deallocated while this routine is running. In * particular, when a driver calls this routine, it must insure that the * completion handler cannot deallocate the URB. * * This routine may not be used in an interrupt context (such as a bottom * half or a completion handler), or when holding a spinlock, or in other * situations where the caller can't schedule(). * * This routine should not be called by a driver after its disconnect * method has returned. */ void usb_poison_urb(struct urb *urb) { might_sleep(); if (!urb) return; atomic_inc(&urb->reject); /* * Order the write of urb->reject above before the read * of urb->use_count below. Pairs with the barriers in * __usb_hcd_giveback_urb() and usb_hcd_submit_urb(). */ smp_mb__after_atomic(); if (!urb->dev || !urb->ep) return; usb_hcd_unlink_urb(urb, -ENOENT); wait_event(usb_kill_urb_queue, atomic_read(&urb->use_count) == 0); } EXPORT_SYMBOL_GPL(usb_poison_urb); void usb_unpoison_urb(struct urb *urb) { if (!urb) return; atomic_dec(&urb->reject); } EXPORT_SYMBOL_GPL(usb_unpoison_urb); /** * usb_block_urb - reliably prevent further use of an URB * @urb: pointer to URB to be blocked, may be NULL * * After the routine has run, attempts to resubmit the URB will fail * with error -EPERM. Thus even if the URB's completion handler always * tries to resubmit, it will not succeed and the URB will become idle. * * The URB must not be deallocated while this routine is running. In * particular, when a driver calls this routine, it must insure that the * completion handler cannot deallocate the URB. 
*/ void usb_block_urb(struct urb *urb) { if (!urb) return; atomic_inc(&urb->reject); } EXPORT_SYMBOL_GPL(usb_block_urb); /** * usb_kill_anchored_urbs - kill all URBs associated with an anchor * @anchor: anchor the requests are bound to * * This kills all outstanding URBs starting from the back of the queue, * with guarantee that no completer callbacks will take place from the * anchor after this function returns. * * This routine should not be called by a driver after its disconnect * method has returned. */ void usb_kill_anchored_urbs(struct usb_anchor *anchor) { struct urb *victim; int surely_empty; do { spin_lock_irq(&anchor->lock); while (!list_empty(&anchor->urb_list)) { victim = list_entry(anchor->urb_list.prev, struct urb, anchor_list); /* make sure the URB isn't freed before we kill it */ usb_get_urb(victim); spin_unlock_irq(&anchor->lock); /* this will unanchor the URB */ usb_kill_urb(victim); usb_put_urb(victim); spin_lock_irq(&anchor->lock); } surely_empty = usb_anchor_check_wakeup(anchor); spin_unlock_irq(&anchor->lock); cpu_relax(); } while (!surely_empty); } EXPORT_SYMBOL_GPL(usb_kill_anchored_urbs); /** * usb_poison_anchored_urbs - cease all traffic from an anchor * @anchor: anchor the requests are bound to * * this allows all outstanding URBs to be poisoned starting * from the back of the queue. Newly added URBs will also be * poisoned * * This routine should not be called by a driver after its disconnect * method has returned. */ void usb_poison_anchored_urbs(struct usb_anchor *anchor) { struct urb *victim; int surely_empty; do { spin_lock_irq(&anchor->lock); anchor->poisoned = 1; while (!list_empty(&anchor->urb_list)) { victim = list_entry(anchor->urb_list.prev, struct urb, anchor_list); /* make sure the URB isn't freed before we kill it */ usb_get_urb(victim); spin_unlock_irq(&anchor->lock); /* this will unanchor the URB */ usb_poison_urb(victim); usb_put_urb(victim); spin_lock_irq(&anchor->lock); } surely_empty = usb_anchor_check_wakeup(anchor); spin_unlock_irq(&anchor->lock); cpu_relax(); } while (!surely_empty); } EXPORT_SYMBOL_GPL(usb_poison_anchored_urbs); /** * usb_unpoison_anchored_urbs - let an anchor be used successfully again * @anchor: anchor the requests are bound to * * Reverses the effect of usb_poison_anchored_urbs * the anchor can be used normally after it returns */ void usb_unpoison_anchored_urbs(struct usb_anchor *anchor) { unsigned long flags; struct urb *lazarus; spin_lock_irqsave(&anchor->lock, flags); list_for_each_entry(lazarus, &anchor->urb_list, anchor_list) { usb_unpoison_urb(lazarus); } anchor->poisoned = 0; spin_unlock_irqrestore(&anchor->lock, flags); } EXPORT_SYMBOL_GPL(usb_unpoison_anchored_urbs); /** * usb_anchor_suspend_wakeups * @anchor: the anchor you want to suspend wakeups on * * Call this to stop the last urb being unanchored from waking up any * usb_wait_anchor_empty_timeout waiters. This is used in the hcd urb give- * back path to delay waking up until after the completion handler has run. */ void usb_anchor_suspend_wakeups(struct usb_anchor *anchor) { if (anchor) atomic_inc(&anchor->suspend_wakeups); } EXPORT_SYMBOL_GPL(usb_anchor_suspend_wakeups); /** * usb_anchor_resume_wakeups * @anchor: the anchor you want to resume wakeups on * * Allow usb_wait_anchor_empty_timeout waiters to be woken up again, and * wake up any current waiters if the anchor is empty. 
*/ void usb_anchor_resume_wakeups(struct usb_anchor *anchor) { if (!anchor) return; atomic_dec(&anchor->suspend_wakeups); if (usb_anchor_check_wakeup(anchor)) wake_up(&anchor->wait); } EXPORT_SYMBOL_GPL(usb_anchor_resume_wakeups); /** * usb_wait_anchor_empty_timeout - wait for an anchor to be unused * @anchor: the anchor you want to become unused * @timeout: how long you are willing to wait in milliseconds * * Call this is you want to be sure all an anchor's * URBs have finished * * Return: Non-zero if the anchor became unused. Zero on timeout. */ int usb_wait_anchor_empty_timeout(struct usb_anchor *anchor, unsigned int timeout) { return wait_event_timeout(anchor->wait, usb_anchor_check_wakeup(anchor), msecs_to_jiffies(timeout)); } EXPORT_SYMBOL_GPL(usb_wait_anchor_empty_timeout); /** * usb_get_from_anchor - get an anchor's oldest urb * @anchor: the anchor whose urb you want * * This will take the oldest urb from an anchor, * unanchor and return it * * Return: The oldest urb from @anchor, or %NULL if @anchor has no * urbs associated with it. */ struct urb *usb_get_from_anchor(struct usb_anchor *anchor) { struct urb *victim; unsigned long flags; spin_lock_irqsave(&anchor->lock, flags); if (!list_empty(&anchor->urb_list)) { victim = list_entry(anchor->urb_list.next, struct urb, anchor_list); usb_get_urb(victim); __usb_unanchor_urb(victim, anchor); } else { victim = NULL; } spin_unlock_irqrestore(&anchor->lock, flags); return victim; } EXPORT_SYMBOL_GPL(usb_get_from_anchor); /** * usb_scuttle_anchored_urbs - unanchor all an anchor's urbs * @anchor: the anchor whose urbs you want to unanchor * * use this to get rid of all an anchor's urbs */ void usb_scuttle_anchored_urbs(struct usb_anchor *anchor) { struct urb *victim; unsigned long flags; int surely_empty; do { spin_lock_irqsave(&anchor->lock, flags); while (!list_empty(&anchor->urb_list)) { victim = list_entry(anchor->urb_list.prev, struct urb, anchor_list); __usb_unanchor_urb(victim, anchor); } surely_empty = usb_anchor_check_wakeup(anchor); spin_unlock_irqrestore(&anchor->lock, flags); cpu_relax(); } while (!surely_empty); } EXPORT_SYMBOL_GPL(usb_scuttle_anchored_urbs); /** * usb_anchor_empty - is an anchor empty * @anchor: the anchor you want to query * * Return: 1 if the anchor has no urbs associated with it. */ int usb_anchor_empty(struct usb_anchor *anchor) { return list_empty(&anchor->urb_list); } EXPORT_SYMBOL_GPL(usb_anchor_empty); |
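Taken together, the API above describes a life cycle: allocate, fill, optionally anchor, submit, and drop the local reference once the core owns the URB. The sketch below walks through that cycle for a bulk transfer; the endpoint, anchor, and completion handler are illustrative assumptions, not part of this file.

#include <linux/slab.h>
#include <linux/usb.h>

/* Illustrative sketch of the URB life cycle documented above. */
static void example_complete(struct urb *urb)
{
	if (urb->status)
		dev_dbg(&urb->dev->dev, "bulk urb failed: %d\n", urb->status);
	/* transfer_buffer is freed by the core because URB_FREE_BUFFER is set */
}

static int example_submit_bulk(struct usb_device *udev, struct usb_anchor *anchor,
			       unsigned int pipe, int len)
{
	struct urb *urb;
	void *buf;
	int ret;

	urb = usb_alloc_urb(0, GFP_KERNEL);	/* 0 iso packets: bulk transfer */
	if (!urb)
		return -ENOMEM;

	buf = kzalloc(len, GFP_KERNEL);
	if (!buf) {
		usb_free_urb(urb);
		return -ENOMEM;
	}

	usb_fill_bulk_urb(urb, udev, pipe, buf, len, example_complete, NULL);
	urb->transfer_flags |= URB_FREE_BUFFER;	/* urb_destroy() frees buf */

	usb_anchor_urb(urb, anchor);	/* reachable by usb_kill_anchored_urbs() */
	ret = usb_submit_urb(urb, GFP_KERNEL);
	if (ret)
		usb_unanchor_urb(urb);

	/* Drop the local reference; anchor and HCD hold theirs until giveback. */
	usb_free_urb(urb);
	return ret;
}

On disconnect, usb_kill_anchored_urbs() (or usb_poison_anchored_urbs() when later resubmission must also be blocked) cancels whatever is still on the anchor, as documented above.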
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/net/l3mdev.h - L3 master device API * Copyright (c) 2015 Cumulus Networks * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com> */ #ifndef _NET_L3MDEV_H_ #define _NET_L3MDEV_H_ #include <net/dst.h> #include <net/fib_rules.h> enum l3mdev_type { L3MDEV_TYPE_UNSPEC, L3MDEV_TYPE_VRF, __L3MDEV_TYPE_MAX }; #define L3MDEV_TYPE_MAX (__L3MDEV_TYPE_MAX - 1) typedef int (*lookup_by_table_id_t)(struct net *net, u32 table_id); /** * struct l3mdev_ops - l3mdev operations * * @l3mdev_fib_table: Get FIB table id to use for lookups * * @l3mdev_l3_rcv: Hook in L3 receive path * * @l3mdev_l3_out: Hook in L3 output path * * @l3mdev_link_scope_lookup: IPv6 lookup for linklocal and mcast destinations */ struct l3mdev_ops { u32 (*l3mdev_fib_table)(const struct net_device *dev); struct sk_buff * (*l3mdev_l3_rcv)(struct net_device *dev, struct sk_buff *skb, u16 proto); struct sk_buff * (*l3mdev_l3_out)(struct net_device *dev, struct sock *sk, struct sk_buff *skb, u16 proto); /* IPv6 ops */ struct dst_entry * (*l3mdev_link_scope_lookup)(const struct net_device *dev, struct flowi6 *fl6); }; #ifdef CONFIG_NET_L3_MASTER_DEV int l3mdev_table_lookup_register(enum l3mdev_type l3type, lookup_by_table_id_t fn); void l3mdev_table_lookup_unregister(enum l3mdev_type l3type, lookup_by_table_id_t fn); int l3mdev_ifindex_lookup_by_table_id(enum l3mdev_type l3type, struct net *net, u32 table_id); int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, struct fib_lookup_arg *arg); static inline bool l3mdev_fib_rule_iif_match(const struct flowi *fl, int iifindex) { return !(fl->flowi_flags & FLOWI_FLAG_L3MDEV_OIF) && fl->flowi_l3mdev == iifindex; } static inline bool l3mdev_fib_rule_oif_match(const struct flowi *fl, int oifindex) { return fl->flowi_flags & FLOWI_FLAG_L3MDEV_OIF && fl->flowi_l3mdev == oifindex; } void l3mdev_update_flow(struct net *net, struct flowi *fl); int l3mdev_master_ifindex_rcu(const struct net_device *dev); static inline int l3mdev_master_ifindex(struct net_device *dev) { int ifindex; rcu_read_lock(); ifindex = l3mdev_master_ifindex_rcu(dev); rcu_read_unlock(); return ifindex; } static inline int 
l3mdev_master_ifindex_by_index(struct net *net, int ifindex) { struct net_device *dev; int rc = 0; if (ifindex) { rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (dev) rc = l3mdev_master_ifindex_rcu(dev); rcu_read_unlock(); } return rc; } static inline struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev) { /* netdev_master_upper_dev_get_rcu calls * list_first_or_null_rcu to walk the upper dev list. * list_first_or_null_rcu does not handle a const arg. We aren't * making changes, just want the master device from that list so * typecast to remove the const */ struct net_device *dev = (struct net_device *)_dev; struct net_device *master; if (!dev) return NULL; if (netif_is_l3_master(dev)) master = dev; else if (netif_is_l3_slave(dev)) master = netdev_master_upper_dev_get_rcu(dev); else master = NULL; return master; } int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex); static inline int l3mdev_master_upper_ifindex_by_index(struct net *net, int ifindex) { rcu_read_lock(); ifindex = l3mdev_master_upper_ifindex_by_index_rcu(net, ifindex); rcu_read_unlock(); return ifindex; } u32 l3mdev_fib_table_rcu(const struct net_device *dev); u32 l3mdev_fib_table_by_index(struct net *net, int ifindex); static inline u32 l3mdev_fib_table(const struct net_device *dev) { u32 tb_id; rcu_read_lock(); tb_id = l3mdev_fib_table_rcu(dev); rcu_read_unlock(); return tb_id; } static inline bool netif_index_is_l3_master(struct net *net, int ifindex) { struct net_device *dev; bool rc = false; if (ifindex == 0) return false; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (dev) rc = netif_is_l3_master(dev); rcu_read_unlock(); return rc; } struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6); static inline struct sk_buff *l3mdev_l3_rcv(struct sk_buff *skb, u16 proto) { struct net_device *master = NULL; if (netif_is_l3_slave(skb->dev)) master = netdev_master_upper_dev_get_rcu(skb->dev); else if (netif_is_l3_master(skb->dev) || netif_has_l3_rx_handler(skb->dev)) master = skb->dev; if (master && master->l3mdev_ops->l3mdev_l3_rcv) skb = master->l3mdev_ops->l3mdev_l3_rcv(master, skb, proto); return skb; } static inline struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb) { return l3mdev_l3_rcv(skb, AF_INET); } static inline struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb) { return l3mdev_l3_rcv(skb, AF_INET6); } static inline struct sk_buff *l3mdev_l3_out(struct sock *sk, struct sk_buff *skb, u16 proto) { struct net_device *dev; rcu_read_lock(); dev = skb_dst_dev_rcu(skb); if (netif_is_l3_slave(dev)) { struct net_device *master; master = netdev_master_upper_dev_get_rcu(dev); if (master && master->l3mdev_ops->l3mdev_l3_out) skb = master->l3mdev_ops->l3mdev_l3_out(master, sk, skb, proto); } rcu_read_unlock(); return skb; } static inline struct sk_buff *l3mdev_ip_out(struct sock *sk, struct sk_buff *skb) { return l3mdev_l3_out(sk, skb, AF_INET); } static inline struct sk_buff *l3mdev_ip6_out(struct sock *sk, struct sk_buff *skb) { return l3mdev_l3_out(sk, skb, AF_INET6); } #else static inline int l3mdev_master_ifindex_rcu(const struct net_device *dev) { return 0; } static inline int l3mdev_master_ifindex(struct net_device *dev) { return 0; } static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex) { return 0; } static inline int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex) { return 0; } static inline int l3mdev_master_upper_ifindex_by_index(struct net *net, int ifindex) { return 0; } 
static inline struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev) { return NULL; } static inline u32 l3mdev_fib_table_rcu(const struct net_device *dev) { return 0; } static inline u32 l3mdev_fib_table(const struct net_device *dev) { return 0; } static inline u32 l3mdev_fib_table_by_index(struct net *net, int ifindex) { return 0; } static inline bool netif_index_is_l3_master(struct net *net, int ifindex) { return false; } static inline struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6) { return NULL; } static inline struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb) { return skb; } static inline struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb) { return skb; } static inline struct sk_buff *l3mdev_ip_out(struct sock *sk, struct sk_buff *skb) { return skb; } static inline struct sk_buff *l3mdev_ip6_out(struct sock *sk, struct sk_buff *skb) { return skb; } static inline int l3mdev_table_lookup_register(enum l3mdev_type l3type, lookup_by_table_id_t fn) { return -EOPNOTSUPP; } static inline void l3mdev_table_lookup_unregister(enum l3mdev_type l3type, lookup_by_table_id_t fn) { } static inline int l3mdev_ifindex_lookup_by_table_id(enum l3mdev_type l3type, struct net *net, u32 table_id) { return -ENODEV; } static inline int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, struct fib_lookup_arg *arg) { return 1; } static inline bool l3mdev_fib_rule_iif_match(const struct flowi *fl, int iifindex) { return false; } static inline bool l3mdev_fib_rule_oif_match(const struct flowi *fl, int oifindex) { return false; } static inline void l3mdev_update_flow(struct net *net, struct flowi *fl) { } #endif #endif /* _NET_L3MDEV_H_ */ |
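A typical consumer of this header only needs the FIB table resolution: given a flow's output interface, use the table of its L3 master (VRF) device when there is one, and fall back to the main table otherwise. A hedged sketch follows; the helper name is hypothetical.

#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <net/l3mdev.h>

/* Illustrative sketch: pick the FIB table for a lookup keyed by output
 * interface index, honouring a VRF (L3 master) device when configured. */
static u32 example_flow_table(struct net *net, int oif)
{
	struct net_device *dev;
	u32 tb_id = RT_TABLE_MAIN;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, oif);
	if (dev) {
		u32 l3mdev_tb = l3mdev_fib_table_rcu(dev);

		if (l3mdev_tb)	/* zero means no L3 master device involved */
			tb_id = l3mdev_tb;
	}
	rcu_read_unlock();

	return tb_id;
}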
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TIMEKEEPING_H #define _LINUX_TIMEKEEPING_H #include <linux/errno.h> #include <linux/clocksource_ids.h> #include <linux/ktime.h> /* Included from linux/ktime.h */ void timekeeping_init(void); extern int timekeeping_suspended; /* Architecture timer tick functions: */ extern void legacy_timer_tick(unsigned long ticks); /* * Get and set timeofday */ extern int do_settimeofday64(const struct timespec64 *ts); extern int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz); /* * ktime_get() family - read the current time in a multitude of ways. * * The default time reference is CLOCK_MONOTONIC, starting at * boot time but not counting the time spent in suspend. * For other references, use the functions with "real", "clocktai", * "boottime" and "raw" suffixes. * * To get the time in a different format, use the ones with * "ns", "ts64" and "seconds" suffix. * * See Documentation/core-api/timekeeping.rst for more details.
*/ /* * timespec64 based interfaces */ extern void ktime_get_raw_ts64(struct timespec64 *ts); extern void ktime_get_ts64(struct timespec64 *ts); extern void ktime_get_real_ts64(struct timespec64 *tv); extern void ktime_get_coarse_ts64(struct timespec64 *ts); extern void ktime_get_coarse_real_ts64(struct timespec64 *ts); extern void ktime_get_clock_ts64(clockid_t id, struct timespec64 *ts); /* Multigrain timestamp interfaces */ extern void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts); extern void ktime_get_real_ts64_mg(struct timespec64 *ts); extern unsigned long timekeeping_get_mg_floor_swaps(void); void getboottime64(struct timespec64 *ts); /* * time64_t base interfaces */ extern time64_t ktime_get_seconds(void); extern time64_t __ktime_get_real_seconds(void); extern time64_t ktime_get_real_seconds(void); /* * ktime_t based interfaces */ enum tk_offsets { TK_OFFS_REAL, TK_OFFS_BOOT, TK_OFFS_TAI, TK_OFFS_MAX, }; extern ktime_t ktime_get(void); extern ktime_t ktime_get_with_offset(enum tk_offsets offs); extern ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs); extern ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs); extern ktime_t ktime_get_raw(void); extern u32 ktime_get_resolution_ns(void); /** * ktime_get_real - get the real (wall-) time in ktime_t format * * Returns: real (wall) time in ktime_t format */ static inline ktime_t ktime_get_real(void) { return ktime_get_with_offset(TK_OFFS_REAL); } static inline ktime_t ktime_get_coarse_real(void) { return ktime_get_coarse_with_offset(TK_OFFS_REAL); } /** * ktime_get_boottime - Get monotonic time since boot in ktime_t format * * This is similar to CLOCK_MONTONIC/ktime_get, but also includes the * time spent in suspend. * * Returns: monotonic time since boot in ktime_t format */ static inline ktime_t ktime_get_boottime(void) { return ktime_get_with_offset(TK_OFFS_BOOT); } static inline ktime_t ktime_get_coarse_boottime(void) { return ktime_get_coarse_with_offset(TK_OFFS_BOOT); } /** * ktime_get_clocktai - Get the TAI time of day in ktime_t format * * Returns: the TAI time of day in ktime_t format */ static inline ktime_t ktime_get_clocktai(void) { return ktime_get_with_offset(TK_OFFS_TAI); } static inline ktime_t ktime_get_coarse_clocktai(void) { return ktime_get_coarse_with_offset(TK_OFFS_TAI); } static inline ktime_t ktime_get_coarse(void) { struct timespec64 ts; ktime_get_coarse_ts64(&ts); return timespec64_to_ktime(ts); } static inline u64 ktime_get_coarse_ns(void) { return ktime_to_ns(ktime_get_coarse()); } static inline u64 ktime_get_coarse_real_ns(void) { return ktime_to_ns(ktime_get_coarse_real()); } static inline u64 ktime_get_coarse_boottime_ns(void) { return ktime_to_ns(ktime_get_coarse_boottime()); } static inline u64 ktime_get_coarse_clocktai_ns(void) { return ktime_to_ns(ktime_get_coarse_clocktai()); } /** * ktime_mono_to_real - Convert monotonic time to clock realtime * @mono: monotonic time to convert * * Returns: time converted to realtime clock */ static inline ktime_t ktime_mono_to_real(ktime_t mono) { return ktime_mono_to_any(mono, TK_OFFS_REAL); } /** * ktime_get_ns - Get the current time in nanoseconds * * Returns: current time converted to nanoseconds */ static inline u64 ktime_get_ns(void) { return ktime_to_ns(ktime_get()); } /** * ktime_get_real_ns - Get the current real/wall time in nanoseconds * * Returns: current real time converted to nanoseconds */ static inline u64 ktime_get_real_ns(void) { return ktime_to_ns(ktime_get_real()); } /** * ktime_get_boottime_ns - Get the monotonic time 
since boot in nanoseconds * * Returns: current boottime converted to nanoseconds */ static inline u64 ktime_get_boottime_ns(void) { return ktime_to_ns(ktime_get_boottime()); } /** * ktime_get_clocktai_ns - Get the current TAI time of day in nanoseconds * * Returns: current TAI time converted to nanoseconds */ static inline u64 ktime_get_clocktai_ns(void) { return ktime_to_ns(ktime_get_clocktai()); } /** * ktime_get_raw_ns - Get the raw monotonic time in nanoseconds * * Returns: current raw monotonic time converted to nanoseconds */ static inline u64 ktime_get_raw_ns(void) { return ktime_to_ns(ktime_get_raw()); } extern u64 ktime_get_mono_fast_ns(void); extern u64 ktime_get_raw_fast_ns(void); extern u64 ktime_get_boot_fast_ns(void); extern u64 ktime_get_tai_fast_ns(void); extern u64 ktime_get_real_fast_ns(void); /* * timespec64/time64_t interfaces utilizing the ktime based ones * for API completeness, these could be implemented more efficiently * if needed. */ static inline void ktime_get_boottime_ts64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_boottime()); } static inline void ktime_get_coarse_boottime_ts64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_coarse_boottime()); } static inline time64_t ktime_get_boottime_seconds(void) { return ktime_divns(ktime_get_coarse_boottime(), NSEC_PER_SEC); } static inline void ktime_get_clocktai_ts64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_clocktai()); } static inline void ktime_get_coarse_clocktai_ts64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_coarse_clocktai()); } static inline time64_t ktime_get_clocktai_seconds(void) { return ktime_divns(ktime_get_coarse_clocktai(), NSEC_PER_SEC); } /* * RTC specific */ extern bool timekeeping_rtc_skipsuspend(void); extern bool timekeeping_rtc_skipresume(void); extern void timekeeping_inject_sleeptime64(const struct timespec64 *delta); /* * Auxiliary clock interfaces */ #ifdef CONFIG_POSIX_AUX_CLOCKS extern bool ktime_get_aux(clockid_t id, ktime_t *kt); extern bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *kt); #else static inline bool ktime_get_aux(clockid_t id, ktime_t *kt) { return false; } static inline bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *kt) { return false; } #endif /** * struct system_time_snapshot - simultaneous raw/real time capture with * counter value * @cycles: Clocksource counter value to produce the system times * @real: Realtime system time * @boot: Boot time * @raw: Monotonic raw system time * @cs_id: Clocksource ID * @clock_was_set_seq: The sequence number of clock-was-set events * @cs_was_changed_seq: The sequence number of clocksource change events */ struct system_time_snapshot { u64 cycles; ktime_t real; ktime_t boot; ktime_t raw; enum clocksource_ids cs_id; unsigned int clock_was_set_seq; u8 cs_was_changed_seq; }; /** * struct system_device_crosststamp - system/device cross-timestamp * (synchronized capture) * @device: Device time * @sys_realtime: Realtime simultaneous with device time * @sys_monoraw: Monotonic raw simultaneous with device time */ struct system_device_crosststamp { ktime_t device; ktime_t sys_realtime; ktime_t sys_monoraw; }; /** * struct system_counterval_t - system counter value with the ID of the * corresponding clocksource * @cycles: System counter value * @cs_id: Clocksource ID corresponding to system counter value. Used by * timekeeping code to verify comparability of two cycle values. * The default ID, CSID_GENERIC, does not identify a specific * clocksource. 
* @use_nsecs: @cycles is in nanoseconds. */ struct system_counterval_t { u64 cycles; enum clocksource_ids cs_id; bool use_nsecs; }; extern bool ktime_real_to_base_clock(ktime_t treal, enum clocksource_ids base_id, u64 *cycles); extern bool timekeeping_clocksource_has_base(enum clocksource_ids id); /* * Get cross timestamp between system clock and device clock */ extern int get_device_system_crosststamp( int (*get_time_fn)(ktime_t *device_time, struct system_counterval_t *system_counterval, void *ctx), void *ctx, struct system_time_snapshot *history, struct system_device_crosststamp *xtstamp); /* * Simultaneously snapshot realtime and monotonic raw clocks */ extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot); /* * Persistent clock related interfaces */ extern int persistent_clock_is_local; extern void read_persistent_clock64(struct timespec64 *ts); void read_persistent_wall_and_boot_offset(struct timespec64 *wall_clock, struct timespec64 *boot_offset); #ifdef CONFIG_GENERIC_CMOS_UPDATE extern int update_persistent_clock64(struct timespec64 now); #endif #endif |
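As a usage sketch of the interfaces above: ktime_get() for measuring a duration, ktime_get_real_ts64() for a wall-clock timestamp, and the boottime accessor when time spent in suspend must count. The helper below is illustrative only.

#include <linux/ktime.h>
#include <linux/timekeeping.h>
#include <linux/printk.h>

/* Illustrative sketch of the ktime_get() family in use. */
static void example_timestamps(void)
{
	ktime_t start, delta;
	struct timespec64 wall;

	start = ktime_get();			/* CLOCK_MONOTONIC, excludes suspend */
	/* ... the work being timed ... */
	delta = ktime_sub(ktime_get(), start);
	pr_debug("work took %lld ns\n", ktime_to_ns(delta));

	ktime_get_real_ts64(&wall);		/* CLOCK_REALTIME as a timespec64 */
	pr_debug("wall clock: %lld.%09ld\n",
		 (long long)wall.tv_sec, wall.tv_nsec);

	pr_debug("%llu ns since boot (incl. suspend)\n",
		 ktime_get_boottime_ns());
}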
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Authors: Lotsa people, from code originally in tcp */ #ifndef _INET6_HASHTABLES_H #define _INET6_HASHTABLES_H #if IS_ENABLED(CONFIG_IPV6) #include <linux/in6.h> #include <linux/ipv6.h> #include <linux/types.h> #include <linux/jhash.h> #include <net/inet_sock.h> #include <net/ipv6.h> #include <net/netns/hash.h> struct inet_hashinfo; void inet6_init_ehash_secret(void); static inline unsigned int __inet6_ehashfn(const u32 lhash, const u16 lport, const u32 fhash, const __be16 fport, const u32 initval) { const u32 ports = (((u32)lport) << 16) | (__force u32)fport; return jhash_3words(lhash, fhash, ports, initval); } /* * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM * * The sockhash lock must be held as a reader here. */ struct sock *__inet6_lookup_established(const struct net *net, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const u16 hnum, const int dif, const int sdif); typedef u32 (inet6_ehashfn_t)(const struct net *net, const struct in6_addr *laddr, const u16 lport, const struct in6_addr *faddr, const __be16 fport); inet6_ehashfn_t inet6_ehashfn; INDIRECT_CALLABLE_DECLARE(inet6_ehashfn_t udp6_ehashfn); struct sock *inet6_lookup_reuseport(const struct net *net, struct sock *sk, struct sk_buff *skb, int doff, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned short hnum, inet6_ehashfn_t *ehashfn); struct sock *inet6_lookup_listener(const struct net *net, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif, const int sdif); struct sock *inet6_lookup_run_sk_lookup(const struct net *net, int protocol, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const u16 hnum, const int dif, inet6_ehashfn_t *ehashfn); static inline struct sock *__inet6_lookup(const struct net *net, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const u16 hnum, const int dif, const int sdif, bool *refcounted) { struct sock *sk = __inet6_lookup_established(net, saddr, sport, daddr, hnum, dif, sdif); *refcounted = true; if (sk) return sk; *refcounted = false; return inet6_lookup_listener(net, skb, doff, saddr, sport, daddr, hnum, dif, sdif); } static inline struct sock *inet6_steal_sock(struct net *net, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, bool *refcounted, 
inet6_ehashfn_t *ehashfn) { struct sock *sk, *reuse_sk; bool prefetched; sk = skb_steal_sock(skb, refcounted, &prefetched); if (!sk) return NULL; if (!prefetched || !sk_fullsock(sk)) return sk; if (sk->sk_protocol == IPPROTO_TCP) { if (sk->sk_state != TCP_LISTEN) return sk; } else if (sk->sk_protocol == IPPROTO_UDP) { if (sk->sk_state != TCP_CLOSE) return sk; } else { return sk; } reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, ntohs(dport), ehashfn); if (!reuse_sk) return sk; /* We've chosen a new reuseport sock which is never refcounted. This * implies that sk also isn't refcounted. */ WARN_ON_ONCE(*refcounted); return reuse_sk; } static inline struct sock *__inet6_lookup_skb(struct sk_buff *skb, int doff, const __be16 sport, const __be16 dport, int iif, int sdif, bool *refcounted) { struct net *net = skb_dst_dev_net_rcu(skb); const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct sock *sk; sk = inet6_steal_sock(net, skb, doff, &ip6h->saddr, sport, &ip6h->daddr, dport, refcounted, inet6_ehashfn); if (IS_ERR(sk)) return NULL; if (sk) return sk; return __inet6_lookup(net, skb, doff, &ip6h->saddr, sport, &ip6h->daddr, ntohs(dport), iif, sdif, refcounted); } struct sock *inet6_lookup(const struct net *net, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, const int dif); static inline bool inet6_match(const struct net *net, const struct sock *sk, const struct in6_addr *saddr, const struct in6_addr *daddr, const __portpair ports, const int dif, const int sdif) { if (!net_eq(sock_net(sk), net) || sk->sk_family != AF_INET6 || READ_ONCE(sk->sk_portpair) != ports || !ipv6_addr_equal(&sk->sk_v6_daddr, saddr) || !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) return false; /* READ_ONCE() paired with WRITE_ONCE() in sock_bindtoindex_locked() */ return inet_sk_bound_dev_eq(net, READ_ONCE(sk->sk_bound_dev_if), dif, sdif); } #endif /* IS_ENABLED(CONFIG_IPV6) */ #endif /* _INET6_HASHTABLES_H */ |
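For callers of this header, the usual flow is the one __inet6_lookup_skb() wraps: try the established table first, fall back to the listener lookup, and remember whether the returned socket is refcounted. A simplified sketch for a TCP/IPv6 receive path follows; it uses skb->skb_iif and an sdif of 0 where the real input path would use inet6_iif()/inet6_sdif(), and the function name is hypothetical.

#include <linux/tcp.h>
#include <net/inet6_hashtables.h>

/* Illustrative sketch: resolve the socket owning an incoming IPv6 TCP
 * segment with __inet6_lookup_skb(), then drop the reference if one was
 * taken (listener hits are not refcounted, as noted above). */
static int example_rcv_tcp6(struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	bool refcounted;
	struct sock *sk;

	sk = __inet6_lookup_skb(skb, __tcp_hdrlen(th), th->source, th->dest,
				skb->skb_iif, 0, &refcounted);
	if (!sk)
		return -ENOENT;	/* neither an established nor a listening socket */

	/* ... hand the segment to the socket ... */

	if (refcounted)
		sock_put(sk);
	return 0;
}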
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_DAX_H #define _LINUX_DAX_H #include <linux/fs.h> #include <linux/mm.h> #include <linux/radix-tree.h> typedef unsigned long dax_entry_t; struct dax_device; struct gendisk; struct iomap_ops; struct iomap_iter; struct iomap; enum dax_access_mode { DAX_ACCESS, DAX_RECOVERY_WRITE, }; struct dax_operations { /* * direct_access: translate a device-relative * logical-page-offset into an absolute physical pfn. Return the * number of pages available for DAX at that pfn. */ long (*direct_access)(struct dax_device *, pgoff_t, long, enum dax_access_mode, void **, unsigned long *); /* zero_page_range: required operation. Zero page range */ int (*zero_page_range)(struct dax_device *, pgoff_t, size_t); /* * recovery_write: recover a poisoned range by DAX device driver * capable of clearing poison. */ size_t (*recovery_write)(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *iter); }; struct dax_holder_operations { /* * notify_failure - notify memory failure into inner holder device * @dax_dev: the dax device which contains the holder * @offset: offset on this dax device where memory failure occurs * @len: length of this memory failure event * @flags: action flags for memory failure handler */ int (*notify_failure)(struct dax_device *dax_dev, u64 offset, u64 len, int mf_flags); }; #if IS_ENABLED(CONFIG_DAX) struct dax_device *alloc_dax(void *private, const struct dax_operations *ops); void *dax_holder(struct dax_device *dax_dev); void put_dax(struct dax_device *dax_dev); void kill_dax(struct dax_device *dax_dev); void dax_write_cache(struct dax_device *dax_dev, bool wc); bool dax_write_cache_enabled(struct dax_device *dax_dev); bool dax_synchronous(struct dax_device *dax_dev); void set_dax_nocache(struct dax_device *dax_dev); void set_dax_nomc(struct dax_device *dax_dev); void set_dax_synchronous(struct dax_device *dax_dev); size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); /* * Check if given mapping is supported by the file / underlying device. 
*/ static inline bool daxdev_mapping_supported(const struct vm_area_desc *desc, const struct inode *inode, struct dax_device *dax_dev) { if (!vma_desc_test(desc, VMA_SYNC_BIT)) return true; if (!IS_DAX(inode)) return false; return dax_synchronous(dax_dev); } #else static inline void *dax_holder(struct dax_device *dax_dev) { return NULL; } static inline struct dax_device *alloc_dax(void *private, const struct dax_operations *ops) { return ERR_PTR(-EOPNOTSUPP); } static inline void put_dax(struct dax_device *dax_dev) { } static inline void kill_dax(struct dax_device *dax_dev) { } static inline void dax_write_cache(struct dax_device *dax_dev, bool wc) { } static inline bool dax_write_cache_enabled(struct dax_device *dax_dev) { return false; } static inline bool dax_synchronous(struct dax_device *dax_dev) { return true; } static inline void set_dax_nocache(struct dax_device *dax_dev) { } static inline void set_dax_nomc(struct dax_device *dax_dev) { } static inline void set_dax_synchronous(struct dax_device *dax_dev) { } static inline bool daxdev_mapping_supported(const struct vm_area_desc *desc, const struct inode *inode, struct dax_device *dax_dev) { return !vma_desc_test(desc, VMA_SYNC_BIT); } static inline size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) { return 0; } #endif struct writeback_control; #if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX) int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk); void dax_remove_host(struct gendisk *disk); struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off, void *holder, const struct dax_holder_operations *ops); void fs_put_dax(struct dax_device *dax_dev, void *holder); #else static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk) { return 0; } static inline void dax_remove_host(struct gendisk *disk) { } static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off, void *holder, const struct dax_holder_operations *ops) { return NULL; } static inline void fs_put_dax(struct dax_device *dax_dev, void *holder) { } #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */ #if IS_ENABLED(CONFIG_FS_DAX) int dax_writeback_mapping_range(struct address_space *mapping, struct dax_device *dax_dev, struct writeback_control *wbc); struct page *dax_layout_busy_page(struct address_space *mapping); struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end); dax_entry_t dax_lock_folio(struct folio *folio); void dax_unlock_folio(struct folio *folio, dax_entry_t cookie); dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, unsigned long index, struct page **page); void dax_unlock_mapping_entry(struct address_space *mapping, unsigned long index, dax_entry_t cookie); #else static inline struct page *dax_layout_busy_page(struct address_space *mapping) { return NULL; } static inline struct page *dax_layout_busy_page_range(struct address_space *mapping, pgoff_t start, pgoff_t nr_pages) { return NULL; } static inline int dax_writeback_mapping_range(struct address_space *mapping, struct dax_device *dax_dev, struct writeback_control *wbc) { return -EOPNOTSUPP; } static inline dax_entry_t dax_lock_folio(struct folio *folio) { if (IS_DAX(folio->mapping->host)) return ~0UL; return 0; } static inline void dax_unlock_folio(struct folio *folio, dax_entry_t cookie) { } static inline dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, unsigned long index, struct page 
**page) { return 0; } static inline void dax_unlock_mapping_entry(struct address_space *mapping, unsigned long index, dax_entry_t cookie) { } #endif int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops); int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); static inline bool dax_page_is_idle(struct page *page) { return page && page_ref_count(page) == 0; } #if IS_ENABLED(CONFIG_DAX) int dax_read_lock(void); void dax_read_unlock(int id); #else static inline int dax_read_lock(void) { return 0; } static inline void dax_read_unlock(int id) { } #endif /* CONFIG_DAX */ #if !IS_ENABLED(CONFIG_FS_DAX) static inline int __must_check dax_break_layout(struct inode *inode, loff_t start, loff_t end, void (cb)(struct inode *)) { return 0; } static inline void dax_break_layout_final(struct inode *inode) { } #endif bool dax_alive(struct dax_device *dax_dev); void *dax_get_private(struct dax_device *dax_dev); long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, void **kaddr, unsigned long *pfn); size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, size_t nr_pages); int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, u64 len, int mf_flags); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size); ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops); vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order, unsigned long *pfnp, int *errp, const struct iomap_ops *ops); vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order, unsigned long pfn); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); void dax_delete_mapping_range(struct address_space *mapping, loff_t start, loff_t end); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); int __must_check dax_break_layout(struct inode *inode, loff_t start, loff_t end, void (cb)(struct inode *)); static inline int __must_check dax_break_layout_inode(struct inode *inode, void (cb)(struct inode *)) { return dax_break_layout(inode, 0, LLONG_MAX, cb); } void dax_break_layout_final(struct inode *inode); int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, struct inode *dest, loff_t destoff, loff_t len, bool *is_same, const struct iomap_ops *ops); int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags, const struct iomap_ops *ops); static inline bool dax_mapping(struct address_space *mapping) { return mapping->host && IS_DAX(mapping->host); } /* * Due to dax's memory and block duo personalities, hwpoison reporting * takes into consideration which personality is presently visible. * When dax acts like a block device, such as in block IO, an encounter of * dax hwpoison is reported as -EIO. * When dax acts like memory, such as in page fault, a detection of hwpoison * is reported as -EHWPOISON which leads to VM_FAULT_HWPOISON. */ static inline int dax_mem2blk_err(int err) { return (err == -EHWPOISON) ? 
-EIO : err; } #ifdef CONFIG_DEV_DAX_HMEM_DEVICES void hmem_register_resource(int target_nid, struct resource *r); #else static inline void hmem_register_resource(int target_nid, struct resource *r) { } #endif typedef int (*walk_hmem_fn)(struct device *dev, int target_nid, const struct resource *res); int walk_hmem_resources(struct device *dev, walk_hmem_fn fn); #endif |
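As a usage note, dax_direct_access() is meant to run with the dax read lock held so the device cannot be torn down underneath the caller. Below is a minimal sketch, not part of this header, of that pattern: translate a device page offset into a kernel address, check how many pages actually came back, and flush CPU caches after writing. The helper name example_dax_copy and its copy-length handling are illustrative assumptions, not kernel API.

#include <linux/dax.h>
#include <linux/minmax.h>
#include <linux/string.h>

static int example_dax_copy(struct dax_device *dax_dev, pgoff_t pgoff,
			    const void *src, size_t len)
{
	unsigned long pfn;
	void *kaddr;
	long avail;
	int id;

	id = dax_read_lock();			/* keeps the dax_device alive */
	avail = dax_direct_access(dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, &pfn);
	if (avail < 1) {
		dax_read_unlock(id);
		return avail < 0 ? avail : -ERANGE;
	}

	memcpy(kaddr, src, min_t(size_t, len, PAGE_SIZE));
	dax_flush(dax_dev, kaddr, len);		/* write back CPU caches if required */

	dax_read_unlock(id);
	return 0;
}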
// SPDX-License-Identifier: GPL-2.0
/*
 *  gendisk handling
 *
 * Portions Copyright (C) 2020 Christoph Hellwig
 */

#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/fs.h>
#include <linux/kdev_t.h>
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h> #include <linux/init.h> #include <linux/spinlock.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/kmod.h> #include <linux/major.h> #include <linux/mutex.h> #include <linux/idr.h> #include <linux/log2.h> #include <linux/pm_runtime.h> #include <linux/badblocks.h> #include <linux/part_stat.h> #include <linux/blktrace_api.h> #include "blk-throttle.h" #include "blk.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" #include "blk-cgroup.h" static struct kobject *block_depr; /* * Unique, monotonically increasing sequential number associated with block * devices instances (i.e. incremented each time a device is attached). * Associating uevents with block devices in userspace is difficult and racy: * the uevent netlink socket is lossy, and on slow and overloaded systems has * a very high latency. * Block devices do not have exclusive owners in userspace, any process can set * one up (e.g. loop devices). Moreover, device names can be reused (e.g. loop0 * can be reused again and again). * A userspace process setting up a block device and watching for its events * cannot thus reliably tell whether an event relates to the device it just set * up or another earlier instance with the same name. * This sequential number allows userspace processes to solve this problem, and * uniquely associate an uevent to the lifetime to a device. */ static atomic64_t diskseq; /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) static DEFINE_IDA(ext_devt_ida); void set_capacity(struct gendisk *disk, sector_t sectors) { if (sectors > BLK_DEV_MAX_SECTORS) { pr_warn_once("%s: truncate capacity from %lld to %lld\n", disk->disk_name, sectors, BLK_DEV_MAX_SECTORS); sectors = BLK_DEV_MAX_SECTORS; } bdev_set_nr_sectors(disk->part0, sectors); } EXPORT_SYMBOL(set_capacity); /* * Set disk capacity and notify if the size is not currently zero and will not * be set to zero. Returns true if a uevent was sent, otherwise false. */ bool set_capacity_and_notify(struct gendisk *disk, sector_t size) { sector_t capacity = get_capacity(disk); char *envp[] = { "RESIZE=1", NULL }; set_capacity(disk, size); /* * Only print a message and send a uevent if the gendisk is user visible * and alive. This avoids spamming the log and udev when setting the * initial capacity during probing. */ if (size == capacity || !disk_live(disk) || (disk->flags & GENHD_FL_HIDDEN)) return false; pr_info_ratelimited("%s: detected capacity change from %lld to %lld\n", disk->disk_name, capacity, size); /* * Historically we did not send a uevent for changes to/from an empty * device. 
*/ if (!capacity || !size) return false; kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); return true; } EXPORT_SYMBOL_GPL(set_capacity_and_notify); static void part_stat_read_all(struct block_device *part, struct disk_stats *stat) { int cpu; memset(stat, 0, sizeof(struct disk_stats)); for_each_possible_cpu(cpu) { struct disk_stats *ptr = per_cpu_ptr(part->bd_stats, cpu); int group; for (group = 0; group < NR_STAT_GROUPS; group++) { stat->nsecs[group] += ptr->nsecs[group]; stat->sectors[group] += ptr->sectors[group]; stat->ios[group] += ptr->ios[group]; stat->merges[group] += ptr->merges[group]; } stat->io_ticks += ptr->io_ticks; } } static void bdev_count_inflight_rw(struct block_device *part, unsigned int inflight[2], bool mq_driver) { int write = 0; int read = 0; int cpu; if (mq_driver) { blk_mq_in_driver_rw(part, inflight); return; } for_each_possible_cpu(cpu) { read += part_stat_local_read_cpu(part, in_flight[READ], cpu); write += part_stat_local_read_cpu(part, in_flight[WRITE], cpu); } /* * While iterating all CPUs, some IOs may be issued from a CPU already * traversed and complete on a CPU that has not yet been traversed, * causing the inflight number to be negative. */ inflight[READ] = read > 0 ? read : 0; inflight[WRITE] = write > 0 ? write : 0; } /** * bdev_count_inflight - get the number of inflight IOs for a block device. * * @part: the block device. * * Inflight here means started IO accounting, from bdev_start_io_acct() for * bio-based block device, and from blk_account_io_start() for rq-based block * device. */ unsigned int bdev_count_inflight(struct block_device *part) { unsigned int inflight[2] = {0}; bdev_count_inflight_rw(part, inflight, false); return inflight[READ] + inflight[WRITE]; } EXPORT_SYMBOL_GPL(bdev_count_inflight); /* * Can be deleted altogether. Later. * */ #define BLKDEV_MAJOR_HASH_SIZE 255 static struct blk_major_name { struct blk_major_name *next; int major; char name[16]; #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD void (*probe)(dev_t devt); #endif } *major_names[BLKDEV_MAJOR_HASH_SIZE]; static DEFINE_MUTEX(major_names_lock); static DEFINE_SPINLOCK(major_names_spinlock); /* index in the above - for now: assume no multimajor ranges */ static inline int major_to_index(unsigned major) { return major % BLKDEV_MAJOR_HASH_SIZE; } #ifdef CONFIG_PROC_FS void blkdev_show(struct seq_file *seqf, off_t offset) { struct blk_major_name *dp; spin_lock(&major_names_spinlock); for (dp = major_names[major_to_index(offset)]; dp; dp = dp->next) if (dp->major == offset) seq_printf(seqf, "%3d %s\n", dp->major, dp->name); spin_unlock(&major_names_spinlock); } #endif /* CONFIG_PROC_FS */ /** * __register_blkdev - register a new block device * * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If * @major = 0, try to allocate any unused major number. * @name: the name of the new block device as a zero terminated string * @probe: pre-devtmpfs / pre-udev callback used to create disks when their * pre-created device node is accessed. When a probe call uses * add_disk() and it fails the driver must cleanup resources. This * interface may soon be removed. * * The @name must be unique within the system. 
* * The return value depends on the @major input parameter: * * - if a major device number was requested in range [1..BLKDEV_MAJOR_MAX-1] * then the function returns zero on success, or a negative error code * - if any unused major number was requested with @major = 0 parameter * then the return value is the allocated major number in range * [1..BLKDEV_MAJOR_MAX-1] or a negative error code otherwise * * See Documentation/admin-guide/devices.txt for the list of allocated * major numbers. * * Use register_blkdev instead for any new code. */ int __register_blkdev(unsigned int major, const char *name, void (*probe)(dev_t devt)) { struct blk_major_name **n, *p; int index, ret = 0; mutex_lock(&major_names_lock); /* temporary */ if (major == 0) { for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) { if (major_names[index] == NULL) break; } if (index == 0) { printk("%s: failed to get major for %s\n", __func__, name); ret = -EBUSY; goto out; } major = index; ret = major; } if (major >= BLKDEV_MAJOR_MAX) { pr_err("%s: major requested (%u) is greater than the maximum (%u) for %s\n", __func__, major, BLKDEV_MAJOR_MAX-1, name); ret = -EINVAL; goto out; } p = kmalloc_obj(struct blk_major_name); if (p == NULL) { ret = -ENOMEM; goto out; } p->major = major; #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD p->probe = probe; #endif strscpy(p->name, name, sizeof(p->name)); p->next = NULL; index = major_to_index(major); spin_lock(&major_names_spinlock); for (n = &major_names[index]; *n; n = &(*n)->next) { if ((*n)->major == major) break; } if (!*n) *n = p; else ret = -EBUSY; spin_unlock(&major_names_spinlock); if (ret < 0) { printk("register_blkdev: cannot get major %u for %s\n", major, name); kfree(p); } out: mutex_unlock(&major_names_lock); return ret; } EXPORT_SYMBOL(__register_blkdev); void unregister_blkdev(unsigned int major, const char *name) { struct blk_major_name **n; struct blk_major_name *p = NULL; int index = major_to_index(major); mutex_lock(&major_names_lock); spin_lock(&major_names_spinlock); for (n = &major_names[index]; *n; n = &(*n)->next) if ((*n)->major == major) break; if (!*n || strcmp((*n)->name, name)) { WARN_ON(1); } else { p = *n; *n = p->next; } spin_unlock(&major_names_spinlock); mutex_unlock(&major_names_lock); kfree(p); } EXPORT_SYMBOL(unregister_blkdev); int blk_alloc_ext_minor(void) { int idx; idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT - 1, GFP_KERNEL); if (idx == -ENOSPC) return -EBUSY; return idx; } void blk_free_ext_minor(unsigned int minor) { ida_free(&ext_devt_ida, minor); } void disk_uevent(struct gendisk *disk, enum kobject_action action) { struct block_device *part; unsigned long idx; rcu_read_lock(); xa_for_each(&disk->part_tbl, idx, part) { if (bdev_is_partition(part) && !bdev_nr_sectors(part)) continue; if (!kobject_get_unless_zero(&part->bd_device.kobj)) continue; rcu_read_unlock(); kobject_uevent(bdev_kobj(part), action); put_device(&part->bd_device); rcu_read_lock(); } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(disk_uevent); int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode) { struct file *file; int ret = 0; if (!disk_has_partscan(disk)) return -EINVAL; if (disk->open_partitions) return -EBUSY; /* * If the device is opened exclusively by current thread already, it's * safe to scan partitons, otherwise, use bd_prepare_to_claim() to * synchronize with other exclusive openers and other partition * scanners. 
*/ if (!(mode & BLK_OPEN_EXCL)) { ret = bd_prepare_to_claim(disk->part0, disk_scan_partitions, NULL); if (ret) return ret; } set_bit(GD_NEED_PART_SCAN, &disk->state); file = bdev_file_open_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL, NULL, NULL); if (IS_ERR(file)) ret = PTR_ERR(file); else fput(file); /* * If blkdev_get_by_dev() failed early, GD_NEED_PART_SCAN is still set, * and this will cause that re-assemble partitioned raid device will * creat partition for underlying disk. */ clear_bit(GD_NEED_PART_SCAN, &disk->state); if (!(mode & BLK_OPEN_EXCL)) bd_abort_claiming(disk->part0, disk_scan_partitions); return ret; } static void add_disk_final(struct gendisk *disk) { struct device *ddev = disk_to_dev(disk); if (!(disk->flags & GENHD_FL_HIDDEN)) { /* Make sure the first partition scan will be proceed */ if (get_capacity(disk) && disk_has_partscan(disk)) set_bit(GD_NEED_PART_SCAN, &disk->state); bdev_add(disk->part0, ddev->devt); if (get_capacity(disk)) disk_scan_partitions(disk, BLK_OPEN_READ); /* * Announce the disk and partitions after all partitions are * created. (for hidden disks uevents remain suppressed forever) */ dev_set_uevent_suppress(ddev, 0); disk_uevent(disk, KOBJ_ADD); } blk_apply_bdi_limits(disk->bdi, &disk->queue->limits); disk_add_events(disk); set_bit(GD_ADDED, &disk->state); } static int __add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups, struct fwnode_handle *fwnode) { struct device *ddev = disk_to_dev(disk); int ret; if (WARN_ON_ONCE(bdev_nr_sectors(disk->part0) > BLK_DEV_MAX_SECTORS)) return -EINVAL; if (queue_is_mq(disk->queue)) { /* * ->submit_bio and ->poll_bio are bypassed for blk-mq drivers. */ if (disk->fops->submit_bio || disk->fops->poll_bio) return -EINVAL; } else { if (!disk->fops->submit_bio) return -EINVAL; bdev_set_flag(disk->part0, BD_HAS_SUBMIT_BIO); } /* * If the driver provides an explicit major number it also must provide * the number of minors numbers supported, and those will be used to * setup the gendisk. * Otherwise just allocate the device numbers for both the whole device * and all partitions from the extended dev_t space. 
*/ ret = -EINVAL; if (disk->major) { if (WARN_ON(!disk->minors)) goto out; if (disk->minors > DISK_MAX_PARTS) { pr_err("block: can't allocate more than %d partitions\n", DISK_MAX_PARTS); disk->minors = DISK_MAX_PARTS; } if (disk->first_minor > MINORMASK || disk->minors > MINORMASK + 1 || disk->first_minor + disk->minors > MINORMASK + 1) goto out; } else { if (WARN_ON(disk->minors)) goto out; ret = blk_alloc_ext_minor(); if (ret < 0) goto out; disk->major = BLOCK_EXT_MAJOR; disk->first_minor = ret; } /* delay uevents, until we scanned partition table */ dev_set_uevent_suppress(ddev, 1); ddev->parent = parent; ddev->groups = groups; dev_set_name(ddev, "%s", disk->disk_name); if (fwnode) device_set_node(ddev, fwnode); if (!(disk->flags & GENHD_FL_HIDDEN)) ddev->devt = MKDEV(disk->major, disk->first_minor); ret = device_add(ddev); if (ret) goto out_free_ext_minor; ret = disk_alloc_events(disk); if (ret) goto out_device_del; ret = sysfs_create_link(block_depr, &ddev->kobj, kobject_name(&ddev->kobj)); if (ret) goto out_device_del; /* * avoid probable deadlock caused by allocating memory with * GFP_KERNEL in runtime_resume callback of its all ancestor * devices */ pm_runtime_set_memalloc_noio(ddev, true); disk->part0->bd_holder_dir = kobject_create_and_add("holders", &ddev->kobj); if (!disk->part0->bd_holder_dir) { ret = -ENOMEM; goto out_del_block_link; } disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); if (!disk->slave_dir) { ret = -ENOMEM; goto out_put_holder_dir; } ret = blk_register_queue(disk); if (ret) goto out_put_slave_dir; if (!(disk->flags & GENHD_FL_HIDDEN)) { ret = bdi_register(disk->bdi, "%u:%u", disk->major, disk->first_minor); if (ret) goto out_unregister_queue; bdi_set_owner(disk->bdi, ddev); ret = sysfs_create_link(&ddev->kobj, &disk->bdi->dev->kobj, "bdi"); if (ret) goto out_unregister_bdi; } else { /* * Even if the block_device for a hidden gendisk is not * registered, it needs to have a valid bd_dev so that the * freeing of the dynamic major works. */ disk->part0->bd_dev = MKDEV(disk->major, disk->first_minor); } return 0; out_unregister_bdi: if (!(disk->flags & GENHD_FL_HIDDEN)) bdi_unregister(disk->bdi); out_unregister_queue: blk_unregister_queue(disk); rq_qos_exit(disk->queue); out_put_slave_dir: kobject_put(disk->slave_dir); disk->slave_dir = NULL; out_put_holder_dir: kobject_put(disk->part0->bd_holder_dir); out_del_block_link: sysfs_remove_link(block_depr, dev_name(ddev)); pm_runtime_set_memalloc_noio(ddev, false); out_device_del: device_del(ddev); out_free_ext_minor: if (disk->major == BLOCK_EXT_MAJOR) blk_free_ext_minor(disk->first_minor); out: return ret; } /** * add_disk_fwnode - add disk information to kernel list with fwnode * @parent: parent device for the disk * @disk: per-device partitioning information * @groups: Additional per-device sysfs groups * @fwnode: attached disk fwnode * * This function registers the partitioning information in @disk * with the kernel. Also attach a fwnode to the disk device. 
*/ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, const struct attribute_group **groups, struct fwnode_handle *fwnode) { struct blk_mq_tag_set *set; unsigned int memflags; int ret; if (queue_is_mq(disk->queue)) { set = disk->queue->tag_set; memflags = memalloc_noio_save(); down_read(&set->update_nr_hwq_lock); ret = __add_disk(parent, disk, groups, fwnode); up_read(&set->update_nr_hwq_lock); memalloc_noio_restore(memflags); } else { ret = __add_disk(parent, disk, groups, fwnode); } /* * add_disk_final() needn't to read `nr_hw_queues`, so move it out * of read lock `set->update_nr_hwq_lock` for avoiding unnecessary * lock dependency on `disk->open_mutex` from scanning partition. */ if (!ret) add_disk_final(disk); return ret; } EXPORT_SYMBOL_GPL(add_disk_fwnode); /** * device_add_disk - add disk information to kernel list * @parent: parent device for the disk * @disk: per-device partitioning information * @groups: Additional per-device sysfs groups * * This function registers the partitioning information in @disk * with the kernel. */ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups) { return add_disk_fwnode(parent, disk, groups, NULL); } EXPORT_SYMBOL(device_add_disk); static void blk_report_disk_dead(struct gendisk *disk, bool surprise) { struct block_device *bdev; unsigned long idx; /* * On surprise disk removal, bdev_mark_dead() may call into file * systems below. Make it clear that we're expecting to not hold * disk->open_mutex. */ lockdep_assert_not_held(&disk->open_mutex); rcu_read_lock(); xa_for_each(&disk->part_tbl, idx, bdev) { if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) continue; rcu_read_unlock(); bdev_mark_dead(bdev, surprise); put_device(&bdev->bd_device); rcu_read_lock(); } rcu_read_unlock(); } static bool __blk_mark_disk_dead(struct gendisk *disk) { /* * Fail any new I/O. */ if (test_and_set_bit(GD_DEAD, &disk->state)) return false; if (test_bit(GD_OWNS_QUEUE, &disk->state)) blk_queue_flag_set(QUEUE_FLAG_DYING, disk->queue); /* * Stop buffered writers from dirtying pages that can't be written out. */ set_capacity(disk, 0); /* * Prevent new I/O from crossing bio_queue_enter(). */ return blk_queue_start_drain(disk->queue); } /** * blk_mark_disk_dead - mark a disk as dead * @disk: disk to mark as dead * * Mark as disk as dead (e.g. surprise removed) and don't accept any new I/O * to this disk. */ void blk_mark_disk_dead(struct gendisk *disk) { __blk_mark_disk_dead(disk); blk_report_disk_dead(disk, true); } EXPORT_SYMBOL_GPL(blk_mark_disk_dead); static void __del_gendisk(struct gendisk *disk) { struct request_queue *q = disk->queue; struct block_device *part; unsigned long idx; bool start_drain; might_sleep(); if (WARN_ON_ONCE(!disk_live(disk) && !(disk->flags & GENHD_FL_HIDDEN))) return; disk_del_events(disk); /* * Prevent new openers by unlinked the bdev inode. */ mutex_lock(&disk->open_mutex); xa_for_each(&disk->part_tbl, idx, part) bdev_unhash(part); mutex_unlock(&disk->open_mutex); /* * Tell the file system to write back all dirty data and shut down if * it hasn't been notified earlier. */ if (!test_bit(GD_DEAD, &disk->state)) blk_report_disk_dead(disk, false); /* * Drop all partitions now that the disk is marked dead. 
*/ mutex_lock(&disk->open_mutex); start_drain = __blk_mark_disk_dead(disk); if (start_drain) blk_freeze_acquire_lock(q); xa_for_each_start(&disk->part_tbl, idx, part, 1) drop_partition(part); mutex_unlock(&disk->open_mutex); if (!(disk->flags & GENHD_FL_HIDDEN)) { sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); /* * Unregister bdi before releasing device numbers (as they can * get reused and we'd get clashes in sysfs). */ bdi_unregister(disk->bdi); } blk_unregister_queue(disk); kobject_put(disk->part0->bd_holder_dir); kobject_put(disk->slave_dir); disk->slave_dir = NULL; part_stat_set_all(disk->part0, 0); disk->part0->bd_stamp = 0; sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); device_del(disk_to_dev(disk)); blk_mq_freeze_queue_wait(q); blk_throtl_cancel_bios(disk); blk_sync_queue(q); blk_flush_integrity(); if (queue_is_mq(q)) blk_mq_cancel_work_sync(q); rq_qos_exit(q); /* * If the disk does not own the queue, allow using passthrough requests * again. Else leave the queue frozen to fail all I/O. */ if (!test_bit(GD_OWNS_QUEUE, &disk->state)) __blk_mq_unfreeze_queue(q, true); else if (queue_is_mq(q)) blk_mq_exit_queue(q); if (start_drain) blk_unfreeze_release_lock(q); } static void disable_elv_switch(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; WARN_ON_ONCE(!queue_is_mq(q)); down_write(&set->update_nr_hwq_lock); blk_queue_flag_set(QUEUE_FLAG_NO_ELV_SWITCH, q); up_write(&set->update_nr_hwq_lock); } /** * del_gendisk - remove the gendisk * @disk: the struct gendisk to remove * * Removes the gendisk and all its associated resources. This deletes the * partitions associated with the gendisk, and unregisters the associated * request_queue. * * This is the counter to the respective device_add_disk() call. * * The final removal of the struct gendisk happens when its refcount reaches 0 * with put_disk(), which should be called after del_gendisk(), if * device_add_disk() was used. * * Drivers exist which depend on the release of the gendisk to be synchronous, * it should not be deferred. * * Context: can sleep */ void del_gendisk(struct gendisk *disk) { struct blk_mq_tag_set *set; unsigned int memflags; if (!queue_is_mq(disk->queue)) { __del_gendisk(disk); } else { set = disk->queue->tag_set; disable_elv_switch(disk->queue); memflags = memalloc_noio_save(); down_read(&set->update_nr_hwq_lock); __del_gendisk(disk); up_read(&set->update_nr_hwq_lock); memalloc_noio_restore(memflags); } } EXPORT_SYMBOL(del_gendisk); /** * invalidate_disk - invalidate the disk * @disk: the struct gendisk to invalidate * * A helper to invalidates the disk. It will clean the disk's associated * buffer/page caches and reset its internal states so that the disk * can be reused by the drivers. * * Context: can sleep */ void invalidate_disk(struct gendisk *disk) { struct block_device *bdev = disk->part0; invalidate_bdev(bdev); bdev->bd_mapping->wb_err = 0; set_capacity(disk, 0); } EXPORT_SYMBOL(invalidate_disk); /* sysfs access to bad-blocks list. 
*/ static ssize_t disk_badblocks_show(struct device *dev, struct device_attribute *attr, char *page) { struct gendisk *disk = dev_to_disk(dev); if (!disk->bb) return sysfs_emit(page, "\n"); return badblocks_show(disk->bb, page, 0); } static ssize_t disk_badblocks_store(struct device *dev, struct device_attribute *attr, const char *page, size_t len) { struct gendisk *disk = dev_to_disk(dev); if (!disk->bb) return -ENXIO; return badblocks_store(disk->bb, page, len, 0); } #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD static bool blk_probe_dev(dev_t devt) { unsigned int major = MAJOR(devt); struct blk_major_name **n; mutex_lock(&major_names_lock); for (n = &major_names[major_to_index(major)]; *n; n = &(*n)->next) { if ((*n)->major == major && (*n)->probe) { (*n)->probe(devt); mutex_unlock(&major_names_lock); return true; } } mutex_unlock(&major_names_lock); return false; } void blk_request_module(dev_t devt) { int error; if (blk_probe_dev(devt)) return; error = request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)); /* Make old-style 2.4 aliases work */ if (error > 0) error = request_module("block-major-%d", MAJOR(devt)); if (!error) blk_probe_dev(devt); } #endif /* CONFIG_BLOCK_LEGACY_AUTOLOAD */ #ifdef CONFIG_PROC_FS /* iterator */ static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos) { loff_t skip = *pos; struct class_dev_iter *iter; struct device *dev; iter = kmalloc_obj(*iter); if (!iter) return ERR_PTR(-ENOMEM); seqf->private = iter; class_dev_iter_init(iter, &block_class, NULL, &disk_type); do { dev = class_dev_iter_next(iter); if (!dev) return NULL; } while (skip--); return dev_to_disk(dev); } static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos) { struct device *dev; (*pos)++; dev = class_dev_iter_next(seqf->private); if (dev) return dev_to_disk(dev); return NULL; } static void disk_seqf_stop(struct seq_file *seqf, void *v) { struct class_dev_iter *iter = seqf->private; /* stop is called even after start failed :-( */ if (iter) { class_dev_iter_exit(iter); kfree(iter); seqf->private = NULL; } } static void *show_partition_start(struct seq_file *seqf, loff_t *pos) { void *p; p = disk_seqf_start(seqf, pos); if (!IS_ERR_OR_NULL(p) && !*pos) seq_puts(seqf, "major minor #blocks name\n\n"); return p; } static int show_partition(struct seq_file *seqf, void *v) { struct gendisk *sgp = v; struct block_device *part; unsigned long idx; if (!get_capacity(sgp) || (sgp->flags & GENHD_FL_HIDDEN)) return 0; rcu_read_lock(); xa_for_each(&sgp->part_tbl, idx, part) { if (!bdev_nr_sectors(part)) continue; seq_printf(seqf, "%4d %7d %10llu %pg\n", MAJOR(part->bd_dev), MINOR(part->bd_dev), bdev_nr_sectors(part) >> 1, part); } rcu_read_unlock(); return 0; } static const struct seq_operations partitions_op = { .start = show_partition_start, .next = disk_seqf_next, .stop = disk_seqf_stop, .show = show_partition }; #endif static int __init genhd_device_init(void) { int error; error = class_register(&block_class); if (unlikely(error)) return error; blk_dev_init(); register_blkdev(BLOCK_EXT_MAJOR, "blkext"); /* create top-level block dir */ block_depr = kobject_create_and_add("block", NULL); return 0; } subsys_initcall(genhd_device_init); static ssize_t disk_range_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%d\n", disk->minors); } static ssize_t disk_ext_range_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%d\n", 
(disk->flags & GENHD_FL_NO_PART) ? 1 : DISK_MAX_PARTS); } static ssize_t disk_removable_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%d\n", (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0)); } static ssize_t disk_hidden_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%d\n", (disk->flags & GENHD_FL_HIDDEN ? 1 : 0)); } static ssize_t disk_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%d\n", get_disk_ro(disk) ? 1 : 0); } ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev))); } ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { struct block_device *bdev = dev_to_bdev(dev); struct disk_stats stat; unsigned int inflight; inflight = bdev_count_inflight(bdev); if (inflight) { part_stat_lock(); update_io_ticks(bdev, jiffies, true); part_stat_unlock(); } part_stat_read_all(bdev, &stat); return sysfs_emit(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " "%8u %8u %8u " "%8lu %8lu %8llu %8u " "%8lu %8u" "\n", stat.ios[STAT_READ], stat.merges[STAT_READ], (unsigned long long)stat.sectors[STAT_READ], (unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC), stat.ios[STAT_WRITE], stat.merges[STAT_WRITE], (unsigned long long)stat.sectors[STAT_WRITE], (unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC), inflight, jiffies_to_msecs(stat.io_ticks), (unsigned int)div_u64(stat.nsecs[STAT_READ] + stat.nsecs[STAT_WRITE] + stat.nsecs[STAT_DISCARD] + stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC), stat.ios[STAT_DISCARD], stat.merges[STAT_DISCARD], (unsigned long long)stat.sectors[STAT_DISCARD], (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC), stat.ios[STAT_FLUSH], (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC)); } /* * Show the number of IOs issued to driver. 
* For bio-based device, started from bdev_start_io_acct(); * For rq-based device, started from blk_mq_start_request(); */ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, char *buf) { struct block_device *bdev = dev_to_bdev(dev); struct request_queue *q = bdev_get_queue(bdev); unsigned int inflight[2] = {0}; bdev_count_inflight_rw(bdev, inflight, queue_is_mq(q)); return sysfs_emit(buf, "%8u %8u\n", inflight[READ], inflight[WRITE]); } static ssize_t disk_capability_show(struct device *dev, struct device_attribute *attr, char *buf) { dev_warn_once(dev, "the capability attribute has been deprecated.\n"); return sysfs_emit(buf, "0\n"); } static ssize_t disk_alignment_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%d\n", bdev_alignment_offset(disk->part0)); } static ssize_t disk_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%d\n", bdev_alignment_offset(disk->part0)); } static ssize_t diskseq_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%llu\n", disk->diskseq); } static ssize_t partscan_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", disk_has_partscan(dev_to_disk(dev))); } static DEVICE_ATTR(range, 0444, disk_range_show, NULL); static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL); static DEVICE_ATTR(hidden, 0444, disk_hidden_show, NULL); static DEVICE_ATTR(ro, 0444, disk_ro_show, NULL); static DEVICE_ATTR(size, 0444, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, 0444, disk_alignment_offset_show, NULL); static DEVICE_ATTR(discard_alignment, 0444, disk_discard_alignment_show, NULL); static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL); static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL); static DEVICE_ATTR(partscan, 0444, partscan_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", bdev_test_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL)); } ssize_t part_fail_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int i; if (count > 0 && sscanf(buf, "%d", &i) > 0) { if (i) bdev_set_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL); else bdev_clear_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL); } return count; } static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store); #endif /* CONFIG_FAIL_MAKE_REQUEST */ #ifdef CONFIG_FAIL_IO_TIMEOUT static struct device_attribute dev_attr_fail_timeout = __ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store); #endif static struct attribute *disk_attrs[] = { &dev_attr_range.attr, &dev_attr_ext_range.attr, &dev_attr_removable.attr, &dev_attr_hidden.attr, &dev_attr_ro.attr, &dev_attr_size.attr, &dev_attr_alignment_offset.attr, &dev_attr_discard_alignment.attr, &dev_attr_capability.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, &dev_attr_badblocks.attr, &dev_attr_events.attr, &dev_attr_events_async.attr, 
&dev_attr_events_poll_msecs.attr, &dev_attr_diskseq.attr, &dev_attr_partscan.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif #ifdef CONFIG_FAIL_IO_TIMEOUT &dev_attr_fail_timeout.attr, #endif NULL }; static umode_t disk_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = container_of(kobj, typeof(*dev), kobj); struct gendisk *disk = dev_to_disk(dev); if (a == &dev_attr_badblocks.attr && !disk->bb) return 0; return a->mode; } static struct attribute_group disk_attr_group = { .attrs = disk_attrs, .is_visible = disk_visible, }; static const struct attribute_group *disk_attr_groups[] = { &disk_attr_group, #ifdef CONFIG_BLK_DEV_IO_TRACE &blk_trace_attr_group, #endif #ifdef CONFIG_BLK_DEV_INTEGRITY &blk_integrity_attr_group, #endif NULL }; /** * disk_release - releases all allocated resources of the gendisk * @dev: the device representing this disk * * This function releases all allocated resources of the gendisk. * * Drivers which used device_add_disk() have a gendisk with a request_queue * assigned. Since the request_queue sits on top of the gendisk for these * drivers we also call blk_put_queue() for them, and we expect the * request_queue refcount to reach 0 at this point, and so the request_queue * will also be freed prior to the disk. * * Context: can sleep */ static void disk_release(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); might_sleep(); WARN_ON_ONCE(disk_live(disk)); blk_trace_remove(disk->queue); /* * To undo the all initialization from blk_mq_init_allocated_queue in * case of a probe failure where add_disk is never called we have to * call blk_mq_exit_queue here. We can't do this for the more common * teardown case (yet) as the tagset can be gone by the time the disk * is released once it was added. */ if (queue_is_mq(disk->queue) && test_bit(GD_OWNS_QUEUE, &disk->state) && !test_bit(GD_ADDED, &disk->state)) blk_mq_exit_queue(disk->queue); blkcg_exit_disk(disk); bioset_exit(&disk->bio_split); disk_release_events(disk); kfree(disk->random); disk_free_zone_resources(disk); xa_destroy(&disk->part_tbl); kobject_put(&disk->queue_kobj); disk->queue->disk = NULL; blk_put_queue(disk->queue); if (test_bit(GD_ADDED, &disk->state) && disk->fops->free_disk) disk->fops->free_disk(disk); bdev_drop(disk->part0); /* frees the disk */ } static int block_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct gendisk *disk = dev_to_disk(dev); return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq); } const struct class block_class = { .name = "block", .dev_uevent = block_uevent, }; static char *block_devnode(const struct device *dev, umode_t *mode, kuid_t *uid, kgid_t *gid) { struct gendisk *disk = dev_to_disk(dev); if (disk->fops->devnode) return disk->fops->devnode(disk, mode); return NULL; } const struct device_type disk_type = { .name = "disk", .groups = disk_attr_groups, .release = disk_release, .devnode = block_devnode, }; #ifdef CONFIG_PROC_FS /* * aggregate disk stat collector. Uses the same stats that the sysfs * entries do, above, but makes them available through one seq_file. * * The output looks suspiciously like /proc/partitions with a bunch of * extra fields. 
*/ static int diskstats_show(struct seq_file *seqf, void *v) { struct gendisk *gp = v; struct block_device *hd; unsigned int inflight; struct disk_stats stat; unsigned long idx; /* if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) seq_puts(seqf, "major minor name" " rio rmerge rsect ruse wio wmerge " "wsect wuse running use aveq" "\n\n"); */ rcu_read_lock(); xa_for_each(&gp->part_tbl, idx, hd) { if (bdev_is_partition(hd) && !bdev_nr_sectors(hd)) continue; inflight = bdev_count_inflight(hd); if (inflight) { part_stat_lock(); update_io_ticks(hd, jiffies, true); part_stat_unlock(); } part_stat_read_all(hd, &stat); seq_put_decimal_ull_width(seqf, "", MAJOR(hd->bd_dev), 4); seq_put_decimal_ull_width(seqf, " ", MINOR(hd->bd_dev), 7); seq_printf(seqf, " %pg", hd); seq_put_decimal_ull(seqf, " ", stat.ios[STAT_READ]); seq_put_decimal_ull(seqf, " ", stat.merges[STAT_READ]); seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_READ]); seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC)); seq_put_decimal_ull(seqf, " ", stat.ios[STAT_WRITE]); seq_put_decimal_ull(seqf, " ", stat.merges[STAT_WRITE]); seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_WRITE]); seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC)); seq_put_decimal_ull(seqf, " ", inflight); seq_put_decimal_ull(seqf, " ", jiffies_to_msecs(stat.io_ticks)); seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_READ] + stat.nsecs[STAT_WRITE] + stat.nsecs[STAT_DISCARD] + stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC)); seq_put_decimal_ull(seqf, " ", stat.ios[STAT_DISCARD]); seq_put_decimal_ull(seqf, " ", stat.merges[STAT_DISCARD]); seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_DISCARD]); seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC)); seq_put_decimal_ull(seqf, " ", stat.ios[STAT_FLUSH]); seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC)); seq_putc(seqf, '\n'); } rcu_read_unlock(); return 0; } static const struct seq_operations diskstats_op = { .start = disk_seqf_start, .next = disk_seqf_next, .stop = disk_seqf_stop, .show = diskstats_show }; static int __init proc_genhd_init(void) { proc_create_seq("diskstats", 0, NULL, &diskstats_op); proc_create_seq("partitions", 0, NULL, &partitions_op); return 0; } module_init(proc_genhd_init); #endif /* CONFIG_PROC_FS */ dev_t part_devt(struct gendisk *disk, u8 partno) { struct block_device *part; dev_t devt = 0; rcu_read_lock(); part = xa_load(&disk->part_tbl, partno); if (part) devt = part->bd_dev; rcu_read_unlock(); return devt; } struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, struct lock_class_key *lkclass) { struct gendisk *disk; disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); if (!disk) return NULL; if (bioset_init(&disk->bio_split, BIO_POOL_SIZE, 0, 0)) goto out_free_disk; disk->bdi = bdi_alloc(node_id); if (!disk->bdi) goto out_free_bioset; /* bdev_alloc() might need the queue, set before the first call */ disk->queue = q; disk->part0 = bdev_alloc(disk, 0); if (!disk->part0) goto out_free_bdi; disk->node_id = node_id; mutex_init(&disk->open_mutex); xa_init(&disk->part_tbl); if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL)) goto out_destroy_part_tbl; if (blkcg_init_disk(disk)) goto out_erase_part0; disk_init_zone_resources(disk); rand_initialize_disk(disk); disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; 
device_initialize(disk_to_dev(disk)); inc_diskseq(disk); q->disk = disk; lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0); #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED INIT_LIST_HEAD(&disk->slave_bdevs); #endif mutex_init(&disk->rqos_state_mutex); kobject_init(&disk->queue_kobj, &blk_queue_ktype); return disk; out_erase_part0: xa_erase(&disk->part_tbl, 0); out_destroy_part_tbl: xa_destroy(&disk->part_tbl); disk->part0->bd_disk = NULL; bdev_drop(disk->part0); out_free_bdi: bdi_put(disk->bdi); out_free_bioset: bioset_exit(&disk->bio_split); out_free_disk: kfree(disk); return NULL; } struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node, struct lock_class_key *lkclass) { struct queue_limits default_lim = { }; struct request_queue *q; struct gendisk *disk; q = blk_alloc_queue(lim ? lim : &default_lim, node); if (IS_ERR(q)) return ERR_CAST(q); disk = __alloc_disk_node(q, node, lkclass); if (!disk) { blk_put_queue(q); return ERR_PTR(-ENOMEM); } set_bit(GD_OWNS_QUEUE, &disk->state); return disk; } EXPORT_SYMBOL(__blk_alloc_disk); /** * put_disk - decrements the gendisk refcount * @disk: the struct gendisk to decrement the refcount for * * This decrements the refcount for the struct gendisk. When this reaches 0 * we'll have disk_release() called. * * Note: for blk-mq disk put_disk must be called before freeing the tag_set * when handling probe errors (that is before add_disk() is called). * * Context: Any context, but the last reference must not be dropped from * atomic context. */ void put_disk(struct gendisk *disk) { if (disk) put_device(disk_to_dev(disk)); } EXPORT_SYMBOL(put_disk); static void set_disk_ro_uevent(struct gendisk *gd, int ro) { char event[] = "DISK_RO=1"; char *envp[] = { event, NULL }; if (!ro) event[8] = '0'; kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); } /** * set_disk_ro - set a gendisk read-only * @disk: gendisk to operate on * @read_only: %true to set the disk read-only, %false set the disk read/write * * This function is used to indicate whether a given disk device should have its * read-only flag set. set_disk_ro() is typically used by device drivers to * indicate whether the underlying physical device is write-protected. */ void set_disk_ro(struct gendisk *disk, bool read_only) { if (read_only) { if (test_and_set_bit(GD_READ_ONLY, &disk->state)) return; } else { if (!test_and_clear_bit(GD_READ_ONLY, &disk->state)) return; } set_disk_ro_uevent(disk, read_only); } EXPORT_SYMBOL(set_disk_ro); void inc_diskseq(struct gendisk *disk) { disk->diskseq = atomic64_inc_return(&diskseq); } |
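Taken together, the allocation, registration and teardown entry points above define the gendisk lifecycle a driver follows. Below is a minimal, hypothetical bio-based driver skeleton showing that sequence; the example_* names, the capacity constant and the error-only ->submit_bio are placeholders used only to illustrate the calling order of blk_alloc_disk(), set_capacity(), device_add_disk(), del_gendisk() and put_disk().

#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/numa.h>

#define EXAMPLE_NR_SECTORS	(1 << 11)	/* 1 MiB in 512-byte sectors, arbitrary */

/* Toy ->submit_bio that fails every request; a real driver performs I/O here. */
static void example_submit_bio(struct bio *bio)
{
	bio_io_error(bio);
}

static const struct block_device_operations example_fops = {
	.owner		= THIS_MODULE,
	.submit_bio	= example_submit_bio,
};

static int example_probe(struct device *parent)
{
	struct gendisk *disk;
	int ret;

	disk = blk_alloc_disk(NULL, NUMA_NO_NODE);	/* NULL: default queue limits */
	if (IS_ERR(disk))
		return PTR_ERR(disk);

	disk->fops = &example_fops;
	snprintf(disk->disk_name, sizeof(disk->disk_name), "example0");
	set_capacity(disk, EXAMPLE_NR_SECTORS);

	ret = device_add_disk(parent, disk, NULL);
	if (ret)
		put_disk(disk);		/* registration failed: drop the only reference */
	return ret;
}

static void example_remove(struct gendisk *disk)
{
	del_gendisk(disk);	/* tears down partitions, queue registration, sysfs */
	put_disk(disk);		/* final reference; disk_release() frees the gendisk */
}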
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2009, Christoph Hellwig * All Rights Reserved. * * NOTE: none of these tracepoints shall be considered a stable kernel ABI * as they can change at any time. * * Current conventions for printing numbers measuring specific units: * * agno: allocation group number * * agino: per-AG inode number * ino: filesystem inode number * * agbno: per-AG block number in fs blocks * rgbno: per-rtgroup block number in fs blocks * startblock: physical block number for file mappings. This is either a * segmented fsblock for data device mappings, or a rfsblock * for realtime device mappings * fsbcount: number of blocks in an extent, in fs blocks * * gbno: generic allocation group block number. This is an agbno for * space in a per-AG or a rgbno for space in a realtime group.
* * daddr: physical block number in 512b blocks * bbcount: number of blocks in a physical extent, in 512b blocks * * rtx: physical rt extent number for extent mappings * rtxcount: number of rt extents in an extent mapping * * owner: reverse-mapping owner, usually inodes * * fileoff: file offset, in fs blocks * pos: file offset, in bytes * bytecount: number of bytes * * dablk: directory or xattr block offset, in filesystem blocks * * disize: ondisk file size, in bytes * isize: incore file size, in bytes * * forkoff: inode fork offset, in bytes * * ireccount: number of inode records * * Numbers describing space allocations (blocks, extents, inodes) should be * formatted in hexadecimal. */ #undef TRACE_SYSTEM #define TRACE_SYSTEM xfs #if !defined(_TRACE_XFS_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_XFS_H #include <linux/tracepoint.h> struct xfs_agf; struct xfs_ail; struct xfs_alloc_arg; struct xfs_attr_list_context; struct xfs_buf_log_item; struct xfs_da_args; struct xfs_da_node_entry; struct xfs_dquot; struct xfs_log_item; struct xlog; struct xlog_ticket; struct xlog_recover; struct xlog_recover_item; struct xlog_rec_header; struct xlog_in_core; struct xfs_buf_log_format; struct xfs_inode_log_format; struct xfs_bmbt_irec; struct xfs_btree_cur; struct xfs_defer_op_type; struct xfs_refcount_irec; struct xfs_fsmap; struct xfs_fsmap_irec; struct xfs_group; struct xfs_rmap_irec; struct xfs_icreate_log; struct xfs_iunlink_item; struct xfs_owner_info; struct xfs_trans_res; struct xfs_inobt_rec_incore; union xfs_btree_ptr; struct xfs_dqtrx; struct xfs_icwalk; struct xfs_perag; struct xfbtree; struct xfs_btree_ops; struct xfs_bmap_intent; struct xfs_exchmaps_intent; struct xfs_exchmaps_req; struct xfs_exchrange; struct xfs_getparents; struct xfs_parent_irec; struct xfs_attrlist_cursor_kern; struct xfs_extent_free_item; struct xfs_rmap_intent; struct xfs_refcount_intent; struct xfs_metadir_update; struct xfs_rtgroup; struct xfs_open_zone; struct xfs_healthmon_event; struct xfs_healthmon; struct fserror_event; #define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ { XFS_ATTR_SECURE, "SECURE" }, \ { XFS_ATTR_INCOMPLETE, "INCOMPLETE" }, \ { XFS_ATTR_PARENT, "PARENT" } DECLARE_EVENT_CLASS(xfs_attr_list_class, TP_PROTO(struct xfs_attr_list_context *ctx), TP_ARGS(ctx), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(u32, hashval) __field(u32, blkno) __field(u32, offset) __field(void *, buffer) __field(int, bufsize) __field(int, count) __field(int, firstu) __field(int, dupcnt) __field(unsigned int, attr_filter) ), TP_fast_assign( __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; __entry->ino = ctx->dp->i_ino; __entry->hashval = ctx->cursor.hashval; __entry->blkno = ctx->cursor.blkno; __entry->offset = ctx->cursor.offset; __entry->buffer = ctx->buffer; __entry->bufsize = ctx->bufsize; __entry->count = ctx->count; __entry->firstu = ctx->firstu; __entry->attr_filter = ctx->attr_filter; ), TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " "buffer %p size %u count %u firstu %u filter %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->hashval, __entry->blkno, __entry->offset, __entry->dupcnt, __entry->buffer, __entry->bufsize, __entry->count, __entry->firstu, __print_flags(__entry->attr_filter, "|", XFS_ATTR_FILTER_FLAGS) ) ) #define DEFINE_ATTR_LIST_EVENT(name) \ DEFINE_EVENT(xfs_attr_list_class, name, \ TP_PROTO(struct xfs_attr_list_context *ctx), \ TP_ARGS(ctx)) DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf); 
DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf_all); DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf); DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf_end); DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full); DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add); DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk); DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list); DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list); TRACE_EVENT(xfs_calc_atomic_write_unit_max, TP_PROTO(struct xfs_mount *mp, enum xfs_group_type type, unsigned int max_write, unsigned int max_ioend, unsigned int max_gsize, unsigned int awu_max), TP_ARGS(mp, type, max_write, max_ioend, max_gsize, awu_max), TP_STRUCT__entry( __field(dev_t, dev) __field(enum xfs_group_type, type) __field(unsigned int, max_write) __field(unsigned int, max_ioend) __field(unsigned int, max_gsize) __field(unsigned int, awu_max) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->type = type; __entry->max_write = max_write; __entry->max_ioend = max_ioend; __entry->max_gsize = max_gsize; __entry->awu_max = awu_max; ), TP_printk("dev %d:%d %s max_write %u max_ioend %u max_gsize %u awu_max %u", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->max_write, __entry->max_ioend, __entry->max_gsize, __entry->awu_max) ); TRACE_EVENT(xfs_calc_max_atomic_write_fsblocks, TP_PROTO(struct xfs_mount *mp, unsigned int per_intent, unsigned int step_size, unsigned int logres, unsigned int blockcount), TP_ARGS(mp, per_intent, step_size, logres, blockcount), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, per_intent) __field(unsigned int, step_size) __field(unsigned int, logres) __field(unsigned int, blockcount) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->per_intent = per_intent; __entry->step_size = step_size; __entry->logres = logres; __entry->blockcount = blockcount; ), TP_printk("dev %d:%d per_intent %u step_size %u logres %u blockcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->per_intent, __entry->step_size, __entry->logres, __entry->blockcount) ); TRACE_EVENT(xfs_calc_max_atomic_write_log_geometry, TP_PROTO(struct xfs_mount *mp, unsigned int per_intent, unsigned int step_size, unsigned int blockcount, unsigned int min_logblocks, unsigned int logres), TP_ARGS(mp, per_intent, step_size, blockcount, min_logblocks, logres), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, per_intent) __field(unsigned int, step_size) __field(unsigned int, blockcount) __field(unsigned int, min_logblocks) __field(unsigned int, cur_logblocks) __field(unsigned int, logres) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->per_intent = per_intent; __entry->step_size = step_size; __entry->blockcount = blockcount; __entry->min_logblocks = min_logblocks; __entry->cur_logblocks = mp->m_sb.sb_logblocks; __entry->logres = logres; ), TP_printk("dev %d:%d per_intent %u step_size %u blockcount %u min_logblocks %u logblocks %u logres %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->per_intent, __entry->step_size, __entry->blockcount, __entry->min_logblocks, __entry->cur_logblocks, __entry->logres) ); TRACE_EVENT(xlog_intent_recovery_failed, TP_PROTO(struct xfs_mount *mp, const struct xfs_defer_op_type *ops, int error), TP_ARGS(mp, ops, error), TP_STRUCT__entry( __field(dev_t, dev) __string(name, ops->name) __field(int, error) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __assign_str(name); __entry->error = error; ), TP_printk("dev %d:%d optype %s error %d", 
MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(name), __entry->error) ); DECLARE_EVENT_CLASS(xfs_perag_class, TP_PROTO(const struct xfs_perag *pag, unsigned long caller_ip), TP_ARGS(pag, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(int, refcount) __field(int, active_refcount) __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = pag_mount(pag)->m_super->s_dev; __entry->agno = pag_agno(pag); __entry->refcount = atomic_read(&pag->pag_group.xg_ref); __entry->active_refcount = atomic_read(&pag->pag_group.xg_active_ref); __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d agno 0x%x passive refs %d active refs %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->refcount, __entry->active_refcount, (char *)__entry->caller_ip) ); #define DEFINE_PERAG_REF_EVENT(name) \ DEFINE_EVENT(xfs_perag_class, name, \ TP_PROTO(const struct xfs_perag *pag, unsigned long caller_ip), \ TP_ARGS(pag, caller_ip)) DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag); DEFINE_PERAG_REF_EVENT(xfs_reclaim_inodes_count); TRACE_DEFINE_ENUM(XG_TYPE_AG); TRACE_DEFINE_ENUM(XG_TYPE_RTG); DECLARE_EVENT_CLASS(xfs_group_class, TP_PROTO(struct xfs_group *xg, unsigned long caller_ip), TP_ARGS(xg, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(int, refcount) __field(int, active_refcount) __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = xg->xg_mount->m_super->s_dev; __entry->type = xg->xg_type; __entry->agno = xg->xg_gno; __entry->refcount = atomic_read(&xg->xg_ref); __entry->active_refcount = atomic_read(&xg->xg_active_ref); __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d %sno 0x%x passive refs %d active refs %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __entry->refcount, __entry->active_refcount, (char *)__entry->caller_ip) ); #define DEFINE_GROUP_REF_EVENT(name) \ DEFINE_EVENT(xfs_group_class, name, \ TP_PROTO(struct xfs_group *xg, unsigned long caller_ip), \ TP_ARGS(xg, caller_ip)) DEFINE_GROUP_REF_EVENT(xfs_group_get); DEFINE_GROUP_REF_EVENT(xfs_group_hold); DEFINE_GROUP_REF_EVENT(xfs_group_put); DEFINE_GROUP_REF_EVENT(xfs_group_grab); DEFINE_GROUP_REF_EVENT(xfs_group_grab_next_tag); DEFINE_GROUP_REF_EVENT(xfs_group_rele); #ifdef CONFIG_XFS_RT DECLARE_EVENT_CLASS(xfs_zone_class, TP_PROTO(struct xfs_rtgroup *rtg), TP_ARGS(rtg), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_rgnumber_t, rgno) __field(xfs_rgblock_t, used) __field(unsigned int, nr_open) ), TP_fast_assign( struct xfs_mount *mp = rtg_mount(rtg); __entry->dev = mp->m_super->s_dev; __entry->rgno = rtg_rgno(rtg); __entry->used = rtg_rmap(rtg)->i_used_blocks; __entry->nr_open = mp->m_zone_info->zi_nr_open_zones; ), TP_printk("dev %d:%d rgno 0x%x used 0x%x nr_open %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rgno, __entry->used, __entry->nr_open) ); #define DEFINE_ZONE_EVENT(name) \ DEFINE_EVENT(xfs_zone_class, name, \ TP_PROTO(struct xfs_rtgroup *rtg), \ TP_ARGS(rtg)) DEFINE_ZONE_EVENT(xfs_zone_emptied); DEFINE_ZONE_EVENT(xfs_zone_full); DEFINE_ZONE_EVENT(xfs_zone_opened); DEFINE_ZONE_EVENT(xfs_zone_reset); DEFINE_ZONE_EVENT(xfs_zone_gc_target_opened); DEFINE_ZONE_EVENT(xfs_zone_gc_target_stolen); TRACE_EVENT(xfs_zone_free_blocks, TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno, xfs_extlen_t len), TP_ARGS(rtg, rgbno, len), 
TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_rgnumber_t, rgno) __field(xfs_rgblock_t, used) __field(xfs_rgblock_t, rgbno) __field(xfs_extlen_t, len) ), TP_fast_assign( __entry->dev = rtg_mount(rtg)->m_super->s_dev; __entry->rgno = rtg_rgno(rtg); __entry->used = rtg_rmap(rtg)->i_used_blocks; __entry->rgbno = rgbno; __entry->len = len; ), TP_printk("dev %d:%d rgno 0x%x used 0x%x rgbno 0x%x len 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rgno, __entry->used, __entry->rgbno, __entry->len) ); DECLARE_EVENT_CLASS(xfs_zone_alloc_class, TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno, xfs_extlen_t len), TP_ARGS(oz, rgbno, len), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_rgnumber_t, rgno) __field(xfs_rgblock_t, used) __field(xfs_rgblock_t, allocated) __field(xfs_rgblock_t, written) __field(xfs_rgblock_t, rgbno) __field(xfs_extlen_t, len) ), TP_fast_assign( __entry->dev = rtg_mount(oz->oz_rtg)->m_super->s_dev; __entry->rgno = rtg_rgno(oz->oz_rtg); __entry->used = rtg_rmap(oz->oz_rtg)->i_used_blocks; __entry->allocated = oz->oz_allocated; __entry->written = oz->oz_written; __entry->rgbno = rgbno; __entry->len = len; ), TP_printk("dev %d:%d rgno 0x%x used 0x%x alloced 0x%x written 0x%x rgbno 0x%x len 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rgno, __entry->used, __entry->allocated, __entry->written, __entry->rgbno, __entry->len) ); #define DEFINE_ZONE_ALLOC_EVENT(name) \ DEFINE_EVENT(xfs_zone_alloc_class, name, \ TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno, \ xfs_extlen_t len), \ TP_ARGS(oz, rgbno, len)) DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks); DEFINE_ZONE_ALLOC_EVENT(xfs_zone_skip_blocks); DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks); DEFINE_ZONE_ALLOC_EVENT(xfs_zone_spurious_open); TRACE_EVENT(xfs_zone_gc_select_victim, TP_PROTO(struct xfs_rtgroup *rtg, unsigned int bucket), TP_ARGS(rtg, bucket), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_rgnumber_t, rgno) __field(xfs_rgblock_t, used) __field(unsigned int, bucket) ), TP_fast_assign( __entry->dev = rtg_mount(rtg)->m_super->s_dev; __entry->rgno = rtg_rgno(rtg); __entry->used = rtg_rmap(rtg)->i_used_blocks; __entry->bucket = bucket; ), TP_printk("dev %d:%d rgno 0x%x used 0x%x bucket %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rgno, __entry->used, __entry->bucket) ); TRACE_EVENT(xfs_zones_mount, TP_PROTO(struct xfs_mount *mp), TP_ARGS(mp), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_rgnumber_t, rgcount) __field(uint32_t, blocks) __field(unsigned int, max_open_zones) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->rgcount = mp->m_sb.sb_rgcount; __entry->blocks = mp->m_groups[XG_TYPE_RTG].blocks; __entry->max_open_zones = mp->m_max_open_zones; ), TP_printk("dev %d:%d zoned %u blocks_per_zone %u, max_open %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rgcount, __entry->blocks, __entry->max_open_zones) ); #endif /* CONFIG_XFS_RT */ TRACE_EVENT(xfs_inodegc_worker, TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits), TP_ARGS(mp, shrinker_hits), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, shrinker_hits) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->shrinker_hits = shrinker_hits; ), TP_printk("dev %d:%d shrinker_hits %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->shrinker_hits) ); DECLARE_EVENT_CLASS(xfs_fs_class, TP_PROTO(struct xfs_mount *mp, void *caller_ip), TP_ARGS(mp, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned long long, mflags) __field(unsigned long, opstate) 
__field(unsigned long, sbflags) __field(void *, caller_ip) ), TP_fast_assign( if (mp) { __entry->dev = mp->m_super->s_dev; __entry->mflags = mp->m_features; __entry->opstate = mp->m_opstate; __entry->sbflags = mp->m_super->s_flags; } __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d m_features 0x%llx opstate (%s) s_flags 0x%lx caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->mflags, __print_flags(__entry->opstate, "|", XFS_OPSTATE_STRINGS), __entry->sbflags, __entry->caller_ip) ); #define DEFINE_FS_EVENT(name) \ DEFINE_EVENT(xfs_fs_class, name, \ TP_PROTO(struct xfs_mount *mp, void *caller_ip), \ TP_ARGS(mp, caller_ip)) DEFINE_FS_EVENT(xfs_inodegc_flush); DEFINE_FS_EVENT(xfs_inodegc_push); DEFINE_FS_EVENT(xfs_inodegc_start); DEFINE_FS_EVENT(xfs_inodegc_stop); DEFINE_FS_EVENT(xfs_inodegc_queue); DEFINE_FS_EVENT(xfs_inodegc_throttle); DEFINE_FS_EVENT(xfs_fs_sync_fs); DEFINE_FS_EVENT(xfs_blockgc_start); DEFINE_FS_EVENT(xfs_blockgc_stop); DEFINE_FS_EVENT(xfs_blockgc_worker); DEFINE_FS_EVENT(xfs_blockgc_flush_all); TRACE_EVENT(xfs_inodegc_shrinker_scan, TP_PROTO(struct xfs_mount *mp, struct shrink_control *sc, void *caller_ip), TP_ARGS(mp, sc, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned long, nr_to_scan) __field(void *, caller_ip) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->nr_to_scan = sc->nr_to_scan; __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d nr_to_scan %lu caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_to_scan, __entry->caller_ip) ); DECLARE_EVENT_CLASS(xfs_ag_class, TP_PROTO(const struct xfs_perag *pag), TP_ARGS(pag), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) ), TP_fast_assign( __entry->dev = pag_mount(pag)->m_super->s_dev; __entry->agno = pag_agno(pag); ), TP_printk("dev %d:%d agno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno) ); #define DEFINE_AG_EVENT(name) \ DEFINE_EVENT(xfs_ag_class, name, \ TP_PROTO(const struct xfs_perag *pag), \ TP_ARGS(pag)) DEFINE_AG_EVENT(xfs_read_agf); DEFINE_AG_EVENT(xfs_alloc_read_agf); DEFINE_AG_EVENT(xfs_read_agi); DEFINE_AG_EVENT(xfs_ialloc_read_agi); TRACE_EVENT(xfs_attr_list_node_descend, TP_PROTO(struct xfs_attr_list_context *ctx, struct xfs_da_node_entry *btree), TP_ARGS(ctx, btree), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(u32, hashval) __field(u32, blkno) __field(u32, offset) __field(void *, buffer) __field(int, bufsize) __field(int, count) __field(int, firstu) __field(int, dupcnt) __field(unsigned int, attr_filter) __field(u32, bt_hashval) __field(u32, bt_before) ), TP_fast_assign( __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; __entry->ino = ctx->dp->i_ino; __entry->hashval = ctx->cursor.hashval; __entry->blkno = ctx->cursor.blkno; __entry->offset = ctx->cursor.offset; __entry->buffer = ctx->buffer; __entry->bufsize = ctx->bufsize; __entry->count = ctx->count; __entry->firstu = ctx->firstu; __entry->attr_filter = ctx->attr_filter; __entry->bt_hashval = be32_to_cpu(btree->hashval); __entry->bt_before = be32_to_cpu(btree->before); ), TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " "buffer %p size %u count %u firstu %u filter %s " "node hashval %u, node before %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->hashval, __entry->blkno, __entry->offset, __entry->dupcnt, __entry->buffer, __entry->bufsize, __entry->count, __entry->firstu, __print_flags(__entry->attr_filter, "|", XFS_ATTR_FILTER_FLAGS), __entry->bt_hashval, __entry->bt_before) ); 
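/*
 * Illustrative sketch, not part of the original header: because of the
 * class/instance split used throughout this file, a new event that reports
 * the same attr-list cursor state only needs one more DEFINE_EVENT() line,
 * and callers then invoke the generated trace_<name>() helper.  The event
 * name and call site below are hypothetical, for demonstration only.
 */
DEFINE_ATTR_LIST_EVENT(xfs_attr_list_example);

/* hypothetical call site elsewhere in fs/xfs/: */
static inline void example_attr_list_trace(struct xfs_attr_list_context *context)
{
	trace_xfs_attr_list_example(context);	/* emitted via xfs_attr_list_class */
}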
DECLARE_EVENT_CLASS(xfs_bmap_class, TP_PROTO(struct xfs_inode *ip, struct xfs_iext_cursor *cur, int state, unsigned long caller_ip), TP_ARGS(ip, cur, state, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(void *, leaf) __field(int, pos) __field(xfs_fileoff_t, startoff) __field(xfs_fsblock_t, startblock) __field(xfs_filblks_t, blockcount) __field(xfs_exntst_t, state) __field(int, bmap_state) __field(unsigned long, caller_ip) ), TP_fast_assign( struct xfs_ifork *ifp; struct xfs_bmbt_irec r; ifp = xfs_iext_state_to_fork(ip, state); xfs_iext_get_extent(ifp, cur, &r); __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->leaf = cur->leaf; __entry->pos = cur->pos; __entry->startoff = r.br_startoff; __entry->startblock = r.br_startblock; __entry->blockcount = r.br_blockcount; __entry->state = r.br_state; __entry->bmap_state = state; __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d ino 0x%llx state %s cur %p/%d " "fileoff 0x%llx startblock 0x%llx fsbcount 0x%llx flag %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), __entry->leaf, __entry->pos, __entry->startoff, (int64_t)__entry->startblock, __entry->blockcount, __entry->state, (char *)__entry->caller_ip) ) #define DEFINE_BMAP_EVENT(name) \ DEFINE_EVENT(xfs_bmap_class, name, \ TP_PROTO(struct xfs_inode *ip, struct xfs_iext_cursor *cur, int state, \ unsigned long caller_ip), \ TP_ARGS(ip, cur, state, caller_ip)) DEFINE_BMAP_EVENT(xfs_iext_insert); DEFINE_BMAP_EVENT(xfs_iext_remove); DEFINE_BMAP_EVENT(xfs_bmap_pre_update); DEFINE_BMAP_EVENT(xfs_bmap_post_update); DEFINE_BMAP_EVENT(xfs_read_extent); DEFINE_BMAP_EVENT(xfs_write_extent); DECLARE_EVENT_CLASS(xfs_buf_class, TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), TP_ARGS(bp, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_daddr_t, bno) __field(int, nblks) __field(int, hold) __field(int, pincount) __field(unsigned, lockval) __field(unsigned, flags) __field(unsigned long, caller_ip) __field(const void *, buf_ops) ), TP_fast_assign( __entry->dev = bp->b_target->bt_dev; __entry->bno = xfs_buf_daddr(bp); __entry->nblks = bp->b_length; __entry->hold = bp->b_lockref.count; __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; __entry->flags = bp->b_flags; __entry->caller_ip = caller_ip; __entry->buf_ops = bp->b_ops; ), TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d " "lock %d flags %s bufops %pS caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, __entry->nblks, __entry->hold, __entry->pincount, __entry->lockval, __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), __entry->buf_ops, (void *)__entry->caller_ip) ) #define DEFINE_BUF_EVENT(name) \ DEFINE_EVENT(xfs_buf_class, name, \ TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), \ TP_ARGS(bp, caller_ip)) DEFINE_BUF_EVENT(xfs_buf_init); DEFINE_BUF_EVENT(xfs_buf_free); DEFINE_BUF_EVENT(xfs_buf_hold); DEFINE_BUF_EVENT(xfs_buf_rele); DEFINE_BUF_EVENT(xfs_buf_iodone); DEFINE_BUF_EVENT(xfs_buf_submit); DEFINE_BUF_EVENT(xfs_buf_lock); DEFINE_BUF_EVENT(xfs_buf_lock_done); DEFINE_BUF_EVENT(xfs_buf_trylock_fail); DEFINE_BUF_EVENT(xfs_buf_trylock); DEFINE_BUF_EVENT(xfs_buf_unlock); DEFINE_BUF_EVENT(xfs_buf_iowait); DEFINE_BUF_EVENT(xfs_buf_iowait_done); DEFINE_BUF_EVENT(xfs_buf_delwri_queue); DEFINE_BUF_EVENT(xfs_buf_delwri_queued); DEFINE_BUF_EVENT(xfs_buf_delwri_split); 
DEFINE_BUF_EVENT(xfs_buf_get_uncached); DEFINE_BUF_EVENT(xfs_buf_item_relse); DEFINE_BUF_EVENT(xfs_buf_iodone_async); DEFINE_BUF_EVENT(xfs_buf_error_relse); DEFINE_BUF_EVENT(xfs_buf_drain_buftarg); DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); DEFINE_BUF_EVENT(xfs_buf_backing_folio); DEFINE_BUF_EVENT(xfs_buf_backing_kmem); DEFINE_BUF_EVENT(xfs_buf_backing_vmalloc); DEFINE_BUF_EVENT(xfs_buf_backing_fallback); /* not really buffer traces, but the buf provides useful information */ DEFINE_BUF_EVENT(xfs_btree_corrupt); DEFINE_BUF_EVENT(xfs_reset_dqcounts); /* pass flags explicitly */ DECLARE_EVENT_CLASS(xfs_buf_flags_class, TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), TP_ARGS(bp, flags, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_daddr_t, bno) __field(unsigned int, length) __field(int, hold) __field(int, pincount) __field(unsigned, lockval) __field(unsigned, flags) __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = bp->b_target->bt_dev; __entry->bno = xfs_buf_daddr(bp); __entry->length = bp->b_length; __entry->flags = flags; __entry->hold = bp->b_lockref.count; __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d " "lock %d flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, __entry->length, __entry->hold, __entry->pincount, __entry->lockval, __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), (void *)__entry->caller_ip) ) #define DEFINE_BUF_FLAGS_EVENT(name) \ DEFINE_EVENT(xfs_buf_flags_class, name, \ TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), \ TP_ARGS(bp, flags, caller_ip)) DEFINE_BUF_FLAGS_EVENT(xfs_buf_find); DEFINE_BUF_FLAGS_EVENT(xfs_buf_get); DEFINE_BUF_FLAGS_EVENT(xfs_buf_read); DEFINE_BUF_FLAGS_EVENT(xfs_buf_readahead); TRACE_EVENT(xfs_buf_ioerror, TP_PROTO(struct xfs_buf *bp, int error, xfs_failaddr_t caller_ip), TP_ARGS(bp, error, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_daddr_t, bno) __field(unsigned int, length) __field(unsigned, flags) __field(int, hold) __field(int, pincount) __field(unsigned, lockval) __field(int, error) __field(xfs_failaddr_t, caller_ip) ), TP_fast_assign( __entry->dev = bp->b_target->bt_dev; __entry->bno = xfs_buf_daddr(bp); __entry->length = bp->b_length; __entry->hold = bp->b_lockref.count; __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; __entry->error = error; __entry->flags = bp->b_flags; __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d " "lock %d error %d flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, __entry->length, __entry->hold, __entry->pincount, __entry->lockval, __entry->error, __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), (void *)__entry->caller_ip) ); DECLARE_EVENT_CLASS(xfs_buf_item_class, TP_PROTO(struct xfs_buf_log_item *bip), TP_ARGS(bip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_daddr_t, buf_bno) __field(unsigned int, buf_len) __field(int, buf_hold) __field(int, buf_pincount) __field(int, buf_lockval) __field(unsigned, buf_flags) __field(unsigned, bli_recur) __field(int, bli_refcount) __field(unsigned, bli_flags) __field(unsigned long, li_flags) ), TP_fast_assign( __entry->dev = bip->bli_buf->b_target->bt_dev; __entry->bli_flags = bip->bli_flags; __entry->bli_recur = bip->bli_recur; 
__entry->bli_refcount = atomic_read(&bip->bli_refcount); __entry->buf_bno = xfs_buf_daddr(bip->bli_buf); __entry->buf_len = bip->bli_buf->b_length; __entry->buf_flags = bip->bli_buf->b_flags; __entry->buf_hold = bip->bli_buf->b_lockref.count; __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count); __entry->buf_lockval = bip->bli_buf->b_sema.count; __entry->li_flags = bip->bli_item.li_flags; ), TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d " "lock %d flags %s recur %d refcount %d bliflags %s " "liflags %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->buf_bno, __entry->buf_len, __entry->buf_hold, __entry->buf_pincount, __entry->buf_lockval, __print_flags(__entry->buf_flags, "|", XFS_BUF_FLAGS), __entry->bli_recur, __entry->bli_refcount, __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS), __print_flags(__entry->li_flags, "|", XFS_LI_FLAGS)) ) #define DEFINE_BUF_ITEM_EVENT(name) \ DEFINE_EVENT(xfs_buf_item_class, name, \ TP_PROTO(struct xfs_buf_log_item *bip), \ TP_ARGS(bip)) DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_release); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push); DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf); DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur); DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb); DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb_recur); DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf); DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf_recur); DEFINE_BUF_ITEM_EVENT(xfs_trans_log_buf); DEFINE_BUF_ITEM_EVENT(xfs_trans_brelse); DEFINE_BUF_ITEM_EVENT(xfs_trans_bdetach); DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin); DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold); DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); DECLARE_EVENT_CLASS(xfs_filestream_class, TP_PROTO(const struct xfs_perag *pag, xfs_ino_t ino), TP_ARGS(pag, ino), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(xfs_agnumber_t, agno) __field(int, streams) ), TP_fast_assign( __entry->dev = pag_mount(pag)->m_super->s_dev; __entry->ino = ino; __entry->agno = pag_agno(pag); __entry->streams = atomic_read(&pag->pagf_fstrms); ), TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->agno, __entry->streams) ) #define DEFINE_FILESTREAM_EVENT(name) \ DEFINE_EVENT(xfs_filestream_class, name, \ TP_PROTO(const struct xfs_perag *pag, xfs_ino_t ino), \ TP_ARGS(pag, ino)) DEFINE_FILESTREAM_EVENT(xfs_filestream_free); DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup); DEFINE_FILESTREAM_EVENT(xfs_filestream_scan); TRACE_EVENT(xfs_filestream_pick, TP_PROTO(const struct xfs_perag *pag, xfs_ino_t ino), TP_ARGS(pag, ino), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(xfs_agnumber_t, agno) __field(int, streams) __field(xfs_extlen_t, free) ), TP_fast_assign( __entry->dev = pag_mount(pag)->m_super->s_dev; __entry->ino = ino; __entry->agno = pag_agno(pag); __entry->streams = atomic_read(&pag->pagf_fstrms); __entry->free = pag->pagf_freeblks; ), TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d free %d", MAJOR(__entry->dev), 
MINOR(__entry->dev), __entry->ino, __entry->agno, __entry->streams, __entry->free) ); DECLARE_EVENT_CLASS(xfs_lock_class, TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, unsigned long caller_ip), TP_ARGS(ip, lock_flags, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(int, lock_flags) __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->lock_flags = lock_flags; __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d ino 0x%llx flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS), (void *)__entry->caller_ip) ) #define DEFINE_LOCK_EVENT(name) \ DEFINE_EVENT(xfs_lock_class, name, \ TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, \ unsigned long caller_ip), \ TP_ARGS(ip, lock_flags, caller_ip)) DEFINE_LOCK_EVENT(xfs_ilock); DEFINE_LOCK_EVENT(xfs_ilock_nowait); DEFINE_LOCK_EVENT(xfs_ilock_demote); DEFINE_LOCK_EVENT(xfs_iunlock); DECLARE_EVENT_CLASS(xfs_inode_class, TP_PROTO(struct xfs_inode *ip), TP_ARGS(ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(unsigned long, iflags) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->iflags = ip->i_flags; ), TP_printk("dev %d:%d ino 0x%llx iflags 0x%lx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->iflags) ) #define DEFINE_INODE_EVENT(name) \ DEFINE_EVENT(xfs_inode_class, name, \ TP_PROTO(struct xfs_inode *ip), \ TP_ARGS(ip)) DEFINE_INODE_EVENT(xfs_iget_skip); DEFINE_INODE_EVENT(xfs_iget_recycle); DEFINE_INODE_EVENT(xfs_iget_recycle_fail); DEFINE_INODE_EVENT(xfs_iget_hit); DEFINE_INODE_EVENT(xfs_iget_miss); DEFINE_INODE_EVENT(xfs_getattr); DEFINE_INODE_EVENT(xfs_setattr); DEFINE_INODE_EVENT(xfs_readlink); DEFINE_INODE_EVENT(xfs_inactive_symlink); DEFINE_INODE_EVENT(xfs_alloc_file_space); DEFINE_INODE_EVENT(xfs_free_file_space); DEFINE_INODE_EVENT(xfs_zero_file_space); DEFINE_INODE_EVENT(xfs_collapse_file_space); DEFINE_INODE_EVENT(xfs_insert_file_space); DEFINE_INODE_EVENT(xfs_readdir); #ifdef CONFIG_XFS_POSIX_ACL DEFINE_INODE_EVENT(xfs_get_acl); #endif DEFINE_INODE_EVENT(xfs_vm_bmap); DEFINE_INODE_EVENT(xfs_file_ioctl); #ifdef CONFIG_COMPAT DEFINE_INODE_EVENT(xfs_file_compat_ioctl); #endif DEFINE_INODE_EVENT(xfs_ioctl_setattr); DEFINE_INODE_EVENT(xfs_dir_fsync); DEFINE_INODE_EVENT(xfs_file_fsync); DEFINE_INODE_EVENT(xfs_destroy_inode); DEFINE_INODE_EVENT(xfs_update_time); DEFINE_INODE_EVENT(xfs_dquot_dqalloc); DEFINE_INODE_EVENT(xfs_dquot_dqdetach); DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid); DEFINE_INODE_EVENT(xfs_inode_set_reclaimable); DEFINE_INODE_EVENT(xfs_inode_reclaiming); DEFINE_INODE_EVENT(xfs_inode_set_need_inactive); DEFINE_INODE_EVENT(xfs_inode_inactivating); /* * ftrace's __print_symbolic requires that all enum values be wrapped in the * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace * ring buffer. Somehow this was only worth mentioning in the ftrace sample * code. 
*/ TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED); TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW); DECLARE_EVENT_CLASS(xfs_fault_class, TP_PROTO(struct xfs_inode *ip, unsigned int order), TP_ARGS(ip, order), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(unsigned int, order) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->order = order; ), TP_printk("dev %d:%d ino 0x%llx order %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->order) ) #define DEFINE_FAULT_EVENT(name) \ DEFINE_EVENT(xfs_fault_class, name, \ TP_PROTO(struct xfs_inode *ip, unsigned int order), \ TP_ARGS(ip, order)) DEFINE_FAULT_EVENT(xfs_read_fault); DEFINE_FAULT_EVENT(xfs_write_fault); DECLARE_EVENT_CLASS(xfs_iref_class, TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), TP_ARGS(ip, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(int, count) __field(int, pincount) __field(unsigned long, iflags) __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->count = icount_read(VFS_I(ip)); __entry->pincount = atomic_read(&ip->i_pincount); __entry->iflags = ip->i_flags; __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d ino 0x%llx count %d pincount %d iflags 0x%lx caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->count, __entry->pincount, __entry->iflags, (char *)__entry->caller_ip) ) TRACE_EVENT(xfs_iomap_prealloc_size, TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t blocks, int shift, unsigned int writeio_blocks), TP_ARGS(ip, blocks, shift, writeio_blocks), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(xfs_fsblock_t, blocks) __field(int, shift) __field(unsigned int, writeio_blocks) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->blocks = blocks; __entry->shift = shift; __entry->writeio_blocks = writeio_blocks; ), TP_printk("dev %d:%d ino 0x%llx prealloc blocks %llu shift %d " "m_allocsize_blocks %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->blocks, __entry->shift, __entry->writeio_blocks) ) TRACE_EVENT(xfs_irec_merge_pre, TP_PROTO(const struct xfs_perag *pag, const struct xfs_inobt_rec_incore *rec, const struct xfs_inobt_rec_incore *nrec), TP_ARGS(pag, rec, nrec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agino_t, agino) __field(uint16_t, holemask) __field(xfs_agino_t, nagino) __field(uint16_t, nholemask) ), TP_fast_assign( __entry->dev = pag_mount(pag)->m_super->s_dev; __entry->agno = pag_agno(pag); __entry->agino = rec->ir_startino; __entry->holemask = rec->ir_holemask; __entry->nagino = nrec->ir_startino; __entry->nholemask = nrec->ir_holemask; ), TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x new_agino 0x%x new_holemask 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agino, __entry->holemask, __entry->nagino, __entry->nholemask) ) TRACE_EVENT(xfs_irec_merge_post, TP_PROTO(const struct xfs_perag *pag, const struct xfs_inobt_rec_incore *nrec), TP_ARGS(pag, nrec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agino_t, agino) __field(uint16_t, holemask) ), TP_fast_assign( __entry->dev = pag_mount(pag)->m_super->s_dev; __entry->agno = pag_agno(pag); __entry->agino = nrec->ir_startino; __entry->holemask = nrec->ir_holemask; ), TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), 
__entry->agno, __entry->agino, __entry->holemask) ) #define DEFINE_IREF_EVENT(name) \ DEFINE_EVENT(xfs_iref_class, name, \ TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ TP_ARGS(ip, caller_ip)) DEFINE_IREF_EVENT(xfs_irele); DEFINE_IREF_EVENT(xfs_inode_pin); DEFINE_IREF_EVENT(xfs_inode_unpin); DEFINE_IREF_EVENT(xfs_inode_unpin_nowait); DEFINE_IREF_EVENT(xfs_inode_push_pinned); DEFINE_IREF_EVENT(xfs_inode_push_stale); DECLARE_EVENT_CLASS(xfs_namespace_class, TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name), TP_ARGS(dp, name), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, dp_ino) __field(int, namelen) __dynamic_array(char, name, name->len) ), TP_fast_assign( __entry->dev = VFS_I(dp)->i_sb->s_dev; __entry->dp_ino = dp->i_ino; __entry->namelen = name->len; memcpy(__get_str(name), name->name, name->len); ), TP_printk("dev %d:%d dp ino 0x%llx name %.*s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->dp_ino, __entry->namelen, __get_str(name)) ) #define DEFINE_NAMESPACE_EVENT(name) \ DEFINE_EVENT(xfs_namespace_class, name, \ TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name), \ TP_ARGS(dp, name)) DEFINE_NAMESPACE_EVENT(xfs_remove); DEFINE_NAMESPACE_EVENT(xfs_link); DEFINE_NAMESPACE_EVENT(xfs_lookup); DEFINE_NAMESPACE_EVENT(xfs_create); DEFINE_NAMESPACE_EVENT(xfs_symlink); TRACE_EVENT(xfs_rename, TP_PROTO(struct xfs_inode *src_dp, struct xfs_inode *target_dp, struct xfs_name *src_name, struct xfs_name *target_name), TP_ARGS(src_dp, target_dp, src_name, target_name), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, src_dp_ino) __field(xfs_ino_t, target_dp_ino) __field(int, src_namelen) __field(int, target_namelen) __dynamic_array(char, src_name, src_name->len) __dynamic_array(char, target_name, target_name->len) ), TP_fast_assign( __entry->dev = VFS_I(src_dp)->i_sb->s_dev; __entry->src_dp_ino = src_dp->i_ino; __entry->target_dp_ino = target_dp->i_ino; __entry->src_namelen = src_name->len; __entry->target_namelen = target_name->len; memcpy(__get_str(src_name), src_name->name, src_name->len); memcpy(__get_str(target_name), target_name->name, target_name->len); ), TP_printk("dev %d:%d src dp ino 0x%llx target dp ino 0x%llx" " src name %.*s target name %.*s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->src_dp_ino, __entry->target_dp_ino, __entry->src_namelen, __get_str(src_name), __entry->target_namelen, __get_str(target_name)) ) DECLARE_EVENT_CLASS(xfs_dquot_class, TP_PROTO(struct xfs_dquot *dqp), TP_ARGS(dqp), TP_STRUCT__entry( __field(dev_t, dev) __field(u32, id) __field(xfs_dqtype_t, type) __field(unsigned, flags) __field(unsigned, nrefs) __field(unsigned long long, res_bcount) __field(unsigned long long, res_rtbcount) __field(unsigned long long, res_icount) __field(unsigned long long, bcount) __field(unsigned long long, rtbcount) __field(unsigned long long, icount) __field(unsigned long long, blk_hardlimit) __field(unsigned long long, blk_softlimit) __field(unsigned long long, rtb_hardlimit) __field(unsigned long long, rtb_softlimit) __field(unsigned long long, ino_hardlimit) __field(unsigned long long, ino_softlimit) ), TP_fast_assign( __entry->dev = dqp->q_mount->m_super->s_dev; __entry->id = dqp->q_id; __entry->type = dqp->q_type; __entry->flags = dqp->q_flags; __entry->nrefs = data_race(dqp->q_lockref.count); __entry->res_bcount = dqp->q_blk.reserved; __entry->res_rtbcount = dqp->q_rtb.reserved; __entry->res_icount = dqp->q_ino.reserved; __entry->bcount = dqp->q_blk.count; __entry->rtbcount = dqp->q_rtb.count; __entry->icount = 
dqp->q_ino.count; __entry->blk_hardlimit = dqp->q_blk.hardlimit; __entry->blk_softlimit = dqp->q_blk.softlimit; __entry->rtb_hardlimit = dqp->q_rtb.hardlimit; __entry->rtb_softlimit = dqp->q_rtb.softlimit; __entry->ino_hardlimit = dqp->q_ino.hardlimit; __entry->ino_softlimit = dqp->q_ino.softlimit; ), TP_printk("dev %d:%d id 0x%x type %s flags %s nrefs %u " "res_bc 0x%llx res_rtbc 0x%llx res_ic 0x%llx " "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx " "rtbcnt 0x%llx rtbhardlimit 0x%llx rtbsoftlimit 0x%llx " "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->id, __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS), __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS), __entry->nrefs, __entry->res_bcount, __entry->res_rtbcount, __entry->res_icount, __entry->bcount, __entry->blk_hardlimit, __entry->blk_softlimit, __entry->rtbcount, __entry->rtb_hardlimit, __entry->rtb_softlimit, __entry->icount, __entry->ino_hardlimit, __entry->ino_softlimit) ) #define DEFINE_DQUOT_EVENT(name) \ DEFINE_EVENT(xfs_dquot_class, name, \ TP_PROTO(struct xfs_dquot *dqp), \ TP_ARGS(dqp)) DEFINE_DQUOT_EVENT(xfs_dqadjust); DEFINE_DQUOT_EVENT(xfs_dqreclaim_want); DEFINE_DQUOT_EVENT(xfs_dqreclaim_busy); DEFINE_DQUOT_EVENT(xfs_dqreclaim_done); DEFINE_DQUOT_EVENT(xfs_dqattach_get); DEFINE_DQUOT_EVENT(xfs_dqalloc); DEFINE_DQUOT_EVENT(xfs_dqtobp_read); DEFINE_DQUOT_EVENT(xfs_dqread); DEFINE_DQUOT_EVENT(xfs_dqread_fail); DEFINE_DQUOT_EVENT(xfs_dqget_hit); DEFINE_DQUOT_EVENT(xfs_dqget_miss); DEFINE_DQUOT_EVENT(xfs_dqget_freeing); DEFINE_DQUOT_EVENT(xfs_dqget_dup); DEFINE_DQUOT_EVENT(xfs_dqrele); DEFINE_DQUOT_EVENT(xfs_dqrele_free); DEFINE_DQUOT_EVENT(xfs_dqflush); DEFINE_DQUOT_EVENT(xfs_dqflush_force); DEFINE_DQUOT_EVENT(xfs_dqflush_done); DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_before); DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_after); TRACE_EVENT(xfs_trans_mod_dquot, TP_PROTO(struct xfs_trans *tp, struct xfs_dquot *dqp, unsigned int field, int64_t delta), TP_ARGS(tp, dqp, field, delta), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_dqtype_t, type) __field(unsigned int, flags) __field(unsigned int, dqid) __field(unsigned int, field) __field(int64_t, delta) ), TP_fast_assign( __entry->dev = tp->t_mountp->m_super->s_dev; __entry->type = dqp->q_type; __entry->flags = dqp->q_flags; __entry->dqid = dqp->q_id; __entry->field = field; __entry->delta = delta; ), TP_printk("dev %d:%d dquot id 0x%x type %s flags %s field %s delta %lld", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->dqid, __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS), __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS), __print_flags(__entry->field, "|", XFS_QMOPT_FLAGS), __entry->delta) ); DECLARE_EVENT_CLASS(xfs_dqtrx_class, TP_PROTO(struct xfs_dqtrx *qtrx), TP_ARGS(qtrx), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_dqtype_t, type) __field(unsigned int, flags) __field(u32, dqid) __field(uint64_t, blk_res) __field(int64_t, bcount_delta) __field(int64_t, delbcnt_delta) __field(uint64_t, rtblk_res) __field(uint64_t, rtblk_res_used) __field(int64_t, rtbcount_delta) __field(int64_t, delrtb_delta) __field(uint64_t, ino_res) __field(uint64_t, ino_res_used) __field(int64_t, icount_delta) ), TP_fast_assign( __entry->dev = qtrx->qt_dquot->q_mount->m_super->s_dev; __entry->type = qtrx->qt_dquot->q_type; __entry->flags = qtrx->qt_dquot->q_flags; __entry->dqid = qtrx->qt_dquot->q_id; __entry->blk_res = qtrx->qt_blk_res; __entry->bcount_delta = qtrx->qt_bcount_delta;
__entry->delbcnt_delta = qtrx->qt_delbcnt_delta; __entry->rtblk_res = qtrx->qt_rtblk_res; __entry->rtblk_res_used = qtrx->qt_rtblk_res_used; __entry->rtbcount_delta = qtrx->qt_rtbcount_delta; __entry->delrtb_delta = qtrx->qt_delrtb_delta; __entry->ino_res = qtrx->qt_ino_res; __entry->ino_res_used = qtrx->qt_ino_res_used; __entry->icount_delta = qtrx->qt_icount_delta; ), TP_printk("dev %d:%d dquot id 0x%x type %s flags %s " "blk_res %llu bcount_delta %lld delbcnt_delta %lld " "rtblk_res %llu rtblk_res_used %llu rtbcount_delta %lld delrtb_delta %lld " "ino_res %llu ino_res_used %llu icount_delta %lld", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->dqid, __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS), __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS), __entry->blk_res, __entry->bcount_delta, __entry->delbcnt_delta, __entry->rtblk_res, __entry->rtblk_res_used, __entry->rtbcount_delta, __entry->delrtb_delta, __entry->ino_res, __entry->ino_res_used, __entry->icount_delta) ) #define DEFINE_DQTRX_EVENT(name) \ DEFINE_EVENT(xfs_dqtrx_class, name, \ TP_PROTO(struct xfs_dqtrx *qtrx), \ TP_ARGS(qtrx)) DEFINE_DQTRX_EVENT(xfs_trans_apply_dquot_deltas); DEFINE_DQTRX_EVENT(xfs_trans_mod_dquot_before); DEFINE_DQTRX_EVENT(xfs_trans_mod_dquot_after); DECLARE_EVENT_CLASS(xfs_loggrant_class, TP_PROTO(struct xlog *log, struct xlog_ticket *tic), TP_ARGS(log, tic), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned long, tic) __field(char, ocnt) __field(char, cnt) __field(int, curr_res) __field(int, unit_res) __field(unsigned int, flags) __field(int, reserveq) __field(int, writeq) __field(uint64_t, grant_reserve_bytes) __field(uint64_t, grant_write_bytes) __field(uint64_t, tail_space) __field(int, curr_cycle) __field(int, curr_block) __field(xfs_lsn_t, tail_lsn) ), TP_fast_assign( __entry->dev = log->l_mp->m_super->s_dev; __entry->tic = (unsigned long)tic; __entry->ocnt = tic->t_ocnt; __entry->cnt = tic->t_cnt; __entry->curr_res = tic->t_curr_res; __entry->unit_res = tic->t_unit_res; __entry->flags = tic->t_flags; __entry->reserveq = list_empty(&log->l_reserve_head.waiters); __entry->writeq = list_empty(&log->l_write_head.waiters); __entry->tail_space = READ_ONCE(log->l_tail_space); __entry->grant_reserve_bytes = __entry->tail_space + atomic64_read(&log->l_reserve_head.grant); __entry->grant_write_bytes = __entry->tail_space + atomic64_read(&log->l_write_head.grant); __entry->curr_cycle = log->l_curr_cycle; __entry->curr_block = log->l_curr_block; __entry->tail_lsn = atomic64_read(&log->l_tail_lsn); ), TP_printk("dev %d:%d tic 0x%lx t_ocnt %u t_cnt %u t_curr_res %u " "t_unit_res %u t_flags %s reserveq %s writeq %s " "tail space %llu grant_reserve_bytes %llu " "grant_write_bytes %llu curr_cycle %d curr_block %d " "tail_cycle %d tail_block %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tic, __entry->ocnt, __entry->cnt, __entry->curr_res, __entry->unit_res, __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), __entry->reserveq ? "empty" : "active", __entry->writeq ? 
"empty" : "active", __entry->tail_space, __entry->grant_reserve_bytes, __entry->grant_write_bytes, __entry->curr_cycle, __entry->curr_block, CYCLE_LSN(__entry->tail_lsn), BLOCK_LSN(__entry->tail_lsn) ) ) #define DEFINE_LOGGRANT_EVENT(name) \ DEFINE_EVENT(xfs_loggrant_class, name, \ TP_PROTO(struct xlog *log, struct xlog_ticket *tic), \ TP_ARGS(log, tic)) DEFINE_LOGGRANT_EVENT(xfs_log_umount_write); DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep); DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake); DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up); DEFINE_LOGGRANT_EVENT(xfs_log_reserve); DEFINE_LOGGRANT_EVENT(xfs_log_reserve_exit); DEFINE_LOGGRANT_EVENT(xfs_log_regrant); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_exit); DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant); DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_exit); DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_sub); DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant); DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_sub); DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_exit); DEFINE_LOGGRANT_EVENT(xfs_log_cil_wait); DECLARE_EVENT_CLASS(xfs_log_item_class, TP_PROTO(struct xfs_log_item *lip), TP_ARGS(lip), TP_STRUCT__entry( __field(dev_t, dev) __field(void *, lip) __field(uint, type) __field(unsigned long, flags) __field(xfs_lsn_t, lsn) ), TP_fast_assign( __entry->dev = lip->li_log->l_mp->m_super->s_dev; __entry->lip = lip; __entry->type = lip->li_type; __entry->flags = lip->li_flags; __entry->lsn = lip->li_lsn; ), TP_printk("dev %d:%d lip %p lsn %d/%d type %s flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->lip, CYCLE_LSN(__entry->lsn), BLOCK_LSN(__entry->lsn), __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), __print_flags(__entry->flags, "|", XFS_LI_FLAGS)) ) TRACE_EVENT(xfs_log_force, TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn, unsigned long caller_ip), TP_ARGS(mp, lsn, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_lsn_t, lsn) __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->lsn = lsn; __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d lsn 0x%llx caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->lsn, (void *)__entry->caller_ip) ) #define DEFINE_LOG_ITEM_EVENT(name) \ DEFINE_EVENT(xfs_log_item_class, name, \ TP_PROTO(struct xfs_log_item *lip), \ TP_ARGS(lip)) DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_mark); DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_skip); DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_unpin); DEFINE_LOG_ITEM_EVENT(xlog_ail_insert_abort); DEFINE_LOG_ITEM_EVENT(xfs_trans_free_abort); DECLARE_EVENT_CLASS(xfs_ail_push_class, TP_PROTO(struct xfs_ail *ailp, uint type, unsigned long flags, xfs_lsn_t lsn), TP_ARGS(ailp, type, flags, lsn), TP_STRUCT__entry( __field(dev_t, dev) __field(uint, type) __field(unsigned long, flags) __field(xfs_lsn_t, lsn) ), TP_fast_assign( __entry->dev = ailp->ail_log->l_mp->m_super->s_dev; __entry->type = type; __entry->flags = flags; __entry->lsn = lsn; ), TP_printk("dev %d:%d lsn %d/%d type %s flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), CYCLE_LSN(__entry->lsn), BLOCK_LSN(__entry->lsn), __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), __print_flags(__entry->flags, "|", XFS_LI_FLAGS)) ) #define DEFINE_AIL_PUSH_EVENT(name) \ DEFINE_EVENT(xfs_ail_push_class, name, \ TP_PROTO(struct xfs_ail *ailp, uint type, unsigned long flags, xfs_lsn_t lsn), \ TP_ARGS(ailp, type, flags, lsn)) DEFINE_AIL_PUSH_EVENT(xfs_ail_push); DEFINE_AIL_PUSH_EVENT(xfs_ail_pinned); DEFINE_AIL_PUSH_EVENT(xfs_ail_locked); DEFINE_AIL_PUSH_EVENT(xfs_ail_flushing); 
DECLARE_EVENT_CLASS(xfs_ail_class, TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn), TP_ARGS(lip, old_lsn, new_lsn), TP_STRUCT__entry( __field(dev_t, dev) __field(void *, lip) __field(uint, type) __field(unsigned long, flags) __field(xfs_lsn_t, old_lsn) __field(xfs_lsn_t, new_lsn) ), TP_fast_assign( __entry->dev = lip->li_log->l_mp->m_super->s_dev; __entry->lip = lip; __entry->type = lip->li_type; __entry->flags = lip->li_flags; __entry->old_lsn = old_lsn; __entry->new_lsn = new_lsn; ), TP_printk("dev %d:%d lip %p old lsn %d/%d new lsn %d/%d type %s flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->lip, CYCLE_LSN(__entry->old_lsn), BLOCK_LSN(__entry->old_lsn), CYCLE_LSN(__entry->new_lsn), BLOCK_LSN(__entry->new_lsn), __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), __print_flags(__entry->flags, "|", XFS_LI_FLAGS)) ) #define DEFINE_AIL_EVENT(name) \ DEFINE_EVENT(xfs_ail_class, name, \ TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn), \ TP_ARGS(lip, old_lsn, new_lsn)) DEFINE_AIL_EVENT(xfs_ail_insert); DEFINE_AIL_EVENT(xfs_ail_move); DEFINE_AIL_EVENT(xfs_ail_delete); TRACE_EVENT(xfs_log_assign_tail_lsn, TP_PROTO(struct xlog *log, xfs_lsn_t new_lsn), TP_ARGS(log, new_lsn), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_lsn_t, new_lsn) __field(xfs_lsn_t, old_lsn) __field(xfs_lsn_t, head_lsn) ), TP_fast_assign( __entry->dev = log->l_mp->m_super->s_dev; __entry->new_lsn = new_lsn; __entry->old_lsn = atomic64_read(&log->l_tail_lsn); __entry->head_lsn = log->l_ailp->ail_head_lsn; ), TP_printk("dev %d:%d new tail lsn %d/%d, old lsn %d/%d, head lsn %d/%d", MAJOR(__entry->dev), MINOR(__entry->dev), CYCLE_LSN(__entry->new_lsn), BLOCK_LSN(__entry->new_lsn), CYCLE_LSN(__entry->old_lsn), BLOCK_LSN(__entry->old_lsn), CYCLE_LSN(__entry->head_lsn), BLOCK_LSN(__entry->head_lsn)) ) DECLARE_EVENT_CLASS(xfs_file_class, TP_PROTO(struct kiocb *iocb, struct iov_iter *iter), TP_ARGS(iocb, iter), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(xfs_fsize_t, size) __field(loff_t, offset) __field(size_t, count) ), TP_fast_assign( __entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev; __entry->ino = XFS_I(file_inode(iocb->ki_filp))->i_ino; __entry->size = XFS_I(file_inode(iocb->ki_filp))->i_disk_size; __entry->offset = iocb->ki_pos; __entry->count = iov_iter_count(iter); ), TP_printk("dev %d:%d ino 0x%llx disize 0x%llx pos 0x%llx bytecount 0x%zx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, __entry->offset, __entry->count) ) #define DEFINE_RW_EVENT(name) \ DEFINE_EVENT(xfs_file_class, name, \ TP_PROTO(struct kiocb *iocb, struct iov_iter *iter), \ TP_ARGS(iocb, iter)) DEFINE_RW_EVENT(xfs_file_buffered_read); DEFINE_RW_EVENT(xfs_file_direct_read); DEFINE_RW_EVENT(xfs_file_dax_read); DEFINE_RW_EVENT(xfs_file_buffered_write); DEFINE_RW_EVENT(xfs_file_direct_write); DEFINE_RW_EVENT(xfs_file_dax_write); DEFINE_RW_EVENT(xfs_reflink_bounce_dio_write); TRACE_EVENT(xfs_iomap_atomic_write_cow, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), TP_ARGS(ip, offset, count), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(xfs_off_t, offset) __field(ssize_t, count) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->offset = offset; __entry->count = count; ), TP_printk("dev %d:%d ino 0x%llx pos 0x%llx bytecount 0x%zx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->offset, __entry->count) ) DECLARE_EVENT_CLASS(xfs_imap_class, 
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, int whichfork, struct xfs_bmbt_irec *irec), TP_ARGS(ip, offset, count, whichfork, irec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(loff_t, size) __field(loff_t, offset) __field(size_t, count) __field(int, whichfork) __field(xfs_fileoff_t, startoff) __field(xfs_fsblock_t, startblock) __field(xfs_filblks_t, blockcount) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->size = ip->i_disk_size; __entry->offset = offset; __entry->count = count; __entry->whichfork = whichfork; __entry->startoff = irec ? irec->br_startoff : 0; __entry->startblock = irec ? irec->br_startblock : 0; __entry->blockcount = irec ? irec->br_blockcount : 0; ), TP_printk("dev %d:%d ino 0x%llx disize 0x%llx pos 0x%llx bytecount 0x%zx " "fork %s startoff 0x%llx startblock 0x%llx fsbcount 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, __entry->offset, __entry->count, __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __entry->startoff, (int64_t)__entry->startblock, __entry->blockcount) ) #define DEFINE_IMAP_EVENT(name) \ DEFINE_EVENT(xfs_imap_class, name, \ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ int whichfork, struct xfs_bmbt_irec *irec), \ TP_ARGS(ip, offset, count, whichfork, irec)) DEFINE_IMAP_EVENT(xfs_map_blocks_found); DEFINE_IMAP_EVENT(xfs_map_blocks_alloc); DEFINE_IMAP_EVENT(xfs_iomap_alloc); DEFINE_IMAP_EVENT(xfs_iomap_found); DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), TP_ARGS(ip, offset, count), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(loff_t, isize) __field(loff_t, disize) __field(loff_t, offset) __field(size_t, count) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->isize = VFS_I(ip)->i_size; __entry->disize = ip->i_disk_size; __entry->offset = offset; __entry->count = count; ), TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx " "pos 0x%llx bytecount 0x%zx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->isize, __entry->disize, __entry->offset, __entry->count) ); #define DEFINE_SIMPLE_IO_EVENT(name) \ DEFINE_EVENT(xfs_simple_io_class, name, \ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \ TP_ARGS(ip, offset, count)) DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize); DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof); DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write); DEFINE_SIMPLE_IO_EVENT(xfs_file_splice_read); DEFINE_SIMPLE_IO_EVENT(xfs_zoned_map_blocks); DECLARE_EVENT_CLASS(xfs_itrunc_class, TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), TP_ARGS(ip, new_size), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(xfs_fsize_t, size) __field(xfs_fsize_t, new_size) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->size = ip->i_disk_size; __entry->new_size = new_size; ), TP_printk("dev %d:%d ino 0x%llx disize 0x%llx new_size 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, __entry->new_size) ) #define DEFINE_ITRUNC_EVENT(name) \ DEFINE_EVENT(xfs_itrunc_class, name, \ TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \ TP_ARGS(ip, new_size)) DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start); DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end); TRACE_EVENT(xfs_bunmap, 
TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t fileoff, xfs_filblks_t len, int flags, unsigned long caller_ip), TP_ARGS(ip, fileoff, len, flags, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(xfs_fsize_t, size) __field(xfs_fileoff_t, fileoff) __field(xfs_filblks_t, len) __field(unsigned long, caller_ip) __field(int, flags) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->size = ip->i_disk_size; __entry->fileoff = fileoff; __entry->len = len; __entry->caller_ip = caller_ip; __entry->flags = flags; ), TP_printk("dev %d:%d ino 0x%llx disize 0x%llx fileoff 0x%llx fsbcount 0x%llx " "flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, __entry->fileoff, __entry->len, __print_flags(__entry->flags, "|", XFS_BMAPI_FLAGS), (void *)__entry->caller_ip) ); DECLARE_EVENT_CLASS(xfs_extent_busy_class, TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, xfs_extlen_t len), TP_ARGS(xg, agbno, len), TP_STRUCT__entry( __field(dev_t, dev) __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) ), TP_fast_assign( __entry->dev = xg->xg_mount->m_super->s_dev; __entry->type = xg->xg_type; __entry->agno = xg->xg_gno; __entry->agbno = agbno; __entry->len = len; ), TP_printk("dev %d:%d %sno 0x%x %sbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agbno, __entry->len) ); #define DEFINE_BUSY_EVENT(name) \ DEFINE_EVENT(xfs_extent_busy_class, name, \ TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, \ xfs_extlen_t len), \ TP_ARGS(xg, agbno, len)) DEFINE_BUSY_EVENT(xfs_extent_busy); DEFINE_BUSY_EVENT(xfs_extent_busy_force); DEFINE_BUSY_EVENT(xfs_extent_busy_reuse); DEFINE_BUSY_EVENT(xfs_extent_busy_clear); TRACE_EVENT(xfs_extent_busy_trim, TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, xfs_extlen_t len, xfs_agblock_t tbno, xfs_extlen_t tlen), TP_ARGS(xg, agbno, len, tbno, tlen), TP_STRUCT__entry( __field(dev_t, dev) __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) __field(xfs_agblock_t, tbno) __field(xfs_extlen_t, tlen) ), TP_fast_assign( __entry->dev = xg->xg_mount->m_super->s_dev; __entry->type = xg->xg_type; __entry->agno = xg->xg_gno; __entry->agbno = agbno; __entry->len = len; __entry->tbno = tbno; __entry->tlen = tlen; ), TP_printk("dev %d:%d %sno 0x%x %sbno 0x%x fsbcount 0x%x found_agbno 0x%x found_fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agbno, __entry->len, __entry->tbno, __entry->tlen) ); #ifdef CONFIG_XFS_RT TRACE_EVENT(xfs_rtalloc_extent_busy, TP_PROTO(struct xfs_rtgroup *rtg, xfs_rtxnum_t start, xfs_rtxlen_t minlen, xfs_rtxlen_t maxlen, xfs_rtxlen_t len, xfs_rtxlen_t prod, xfs_rtxnum_t rtx, unsigned busy_gen), TP_ARGS(rtg, start, minlen, maxlen, len, prod, rtx, busy_gen), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_rgnumber_t, rgno) __field(xfs_rtxnum_t, start) __field(xfs_rtxlen_t, minlen) __field(xfs_rtxlen_t, maxlen) __field(xfs_rtxlen_t, mod) __field(xfs_rtxlen_t, prod) __field(xfs_rtxlen_t, len) __field(xfs_rtxnum_t, rtx) __field(unsigned, busy_gen) ), TP_fast_assign( __entry->dev = rtg_mount(rtg)->m_super->s_dev; __entry->rgno = rtg_rgno(rtg); 
__entry->start = start; __entry->minlen = minlen; __entry->maxlen = maxlen; __entry->prod = prod; __entry->len = len; __entry->rtx = rtx; __entry->busy_gen = busy_gen; ), TP_printk("dev %d:%d rgno 0x%x startrtx 0x%llx minlen %u maxlen %u " "prod %u len %u rtx 0x%llx busy_gen 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rgno, __entry->start, __entry->minlen, __entry->maxlen, __entry->prod, __entry->len, __entry->rtx, __entry->busy_gen) ) TRACE_EVENT(xfs_rtalloc_extent_busy_trim, TP_PROTO(struct xfs_rtgroup *rtg, xfs_rtxnum_t old_rtx, xfs_rtxlen_t old_len, xfs_rtxnum_t new_rtx, xfs_rtxlen_t new_len), TP_ARGS(rtg, old_rtx, old_len, new_rtx, new_len), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_rgnumber_t, rgno) __field(xfs_rtxnum_t, old_rtx) __field(xfs_rtxnum_t, new_rtx) __field(xfs_rtxlen_t, old_len) __field(xfs_rtxlen_t, new_len) ), TP_fast_assign( __entry->dev = rtg_mount(rtg)->m_super->s_dev; __entry->rgno = rtg_rgno(rtg); __entry->old_rtx = old_rtx; __entry->old_len = old_len; __entry->new_rtx = new_rtx; __entry->new_len = new_len; ), TP_printk("dev %d:%d rgno 0x%x rtx 0x%llx rtxcount 0x%x -> rtx 0x%llx rtxcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rgno, __entry->old_rtx, __entry->old_len, __entry->new_rtx, __entry->new_len) ); #endif /* CONFIG_XFS_RT */ DECLARE_EVENT_CLASS(xfs_agf_class, TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags, unsigned long caller_ip), TP_ARGS(mp, agf, flags, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(int, flags) __field(__u32, length) __field(__u32, bno_root) __field(__u32, cnt_root) __field(__u32, bno_level) __field(__u32, cnt_level) __field(__u32, flfirst) __field(__u32, fllast) __field(__u32, flcount) __field(__u32, freeblks) __field(__u32, longest) __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = be32_to_cpu(agf->agf_seqno), __entry->flags = flags; __entry->length = be32_to_cpu(agf->agf_length), __entry->bno_root = be32_to_cpu(agf->agf_bno_root), __entry->cnt_root = be32_to_cpu(agf->agf_cnt_root), __entry->bno_level = be32_to_cpu(agf->agf_bno_level), __entry->cnt_level = be32_to_cpu(agf->agf_cnt_level), __entry->flfirst = be32_to_cpu(agf->agf_flfirst), __entry->fllast = be32_to_cpu(agf->agf_fllast), __entry->flcount = be32_to_cpu(agf->agf_flcount), __entry->freeblks = be32_to_cpu(agf->agf_freeblks), __entry->longest = be32_to_cpu(agf->agf_longest); __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d agno 0x%x flags %s length %u roots b %u c %u " "levels b %u c %u flfirst %u fllast %u flcount %u " "freeblks %u longest %u caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __print_flags(__entry->flags, "|", XFS_AGF_FLAGS), __entry->length, __entry->bno_root, __entry->cnt_root, __entry->bno_level, __entry->cnt_level, __entry->flfirst, __entry->fllast, __entry->flcount, __entry->freeblks, __entry->longest, (void *)__entry->caller_ip) ); #define DEFINE_AGF_EVENT(name) \ DEFINE_EVENT(xfs_agf_class, name, \ TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags, \ unsigned long caller_ip), \ TP_ARGS(mp, agf, flags, caller_ip)) DEFINE_AGF_EVENT(xfs_agf); DEFINE_AGF_EVENT(xfs_agfl_reset); TRACE_EVENT(xfs_free_extent, TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len, enum xfs_ag_resv_type resv, int haveleft, int haveright), TP_ARGS(pag, agbno, len, resv, haveleft, haveright), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) __field(int, resv) __field(int, haveleft) __field(int, haveright) ), TP_fast_assign( __entry->dev = pag_mount(pag)->m_super->s_dev; __entry->agno = pag_agno(pag); __entry->agbno = agbno; __entry->len = len; __entry->resv = resv; __entry->haveleft = haveleft; __entry->haveright = haveright; ), TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x resv %d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, __entry->len, __entry->resv, __entry->haveleft ? (__entry->haveright ? "both" : "left") : (__entry->haveright ? "right" : "none")) ); DECLARE_EVENT_CLASS(xfs_alloc_class, TP_PROTO(struct xfs_alloc_arg *args), TP_ARGS(args), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, minlen) __field(xfs_extlen_t, maxlen) __field(xfs_extlen_t, mod) __field(xfs_extlen_t, prod) __field(xfs_extlen_t, minleft) __field(xfs_extlen_t, total) __field(xfs_extlen_t, alignment) __field(xfs_extlen_t, minalignslop) __field(xfs_extlen_t, len) __field(char, wasdel) __field(char, wasfromfl) __field(int, resv) __field(int, datatype) __field(xfs_agnumber_t, highest_agno) ), TP_fast_assign( __entry->dev = args->mp->m_super->s_dev; __entry->agno = args->agno; __entry->agbno = args->agbno; __entry->minlen = args->minlen; __entry->maxlen = args->maxlen; __entry->mod = args->mod; __entry->prod = args->prod; __entry->minleft = args->minleft; __entry->total = args->total; __entry->alignment = args->alignment; __entry->minalignslop = args->minalignslop; __entry->len = args->len; __entry->wasdel = args->wasdel; __entry->wasfromfl = args->wasfromfl; __entry->resv = args->resv; __entry->datatype = args->datatype; __entry->highest_agno = args->tp->t_highest_agno; ), TP_printk("dev %d:%d agno 0x%x agbno 0x%x minlen %u maxlen %u mod %u " "prod %u minleft %u total %u alignment %u minalignslop %u " "len %u wasdel %d wasfromfl %d resv %d " "datatype 0x%x highest_agno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, __entry->minlen, __entry->maxlen, __entry->mod, __entry->prod, __entry->minleft, __entry->total, __entry->alignment, __entry->minalignslop, __entry->len, __entry->wasdel, __entry->wasfromfl, __entry->resv, __entry->datatype, __entry->highest_agno) ) #define DEFINE_ALLOC_EVENT(name) \ DEFINE_EVENT(xfs_alloc_class, name, \ TP_PROTO(struct xfs_alloc_arg *args), \ TP_ARGS(args)) DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound); DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); DEFINE_ALLOC_EVENT(xfs_alloc_near_first); DEFINE_ALLOC_EVENT(xfs_alloc_cur); DEFINE_ALLOC_EVENT(xfs_alloc_cur_right); DEFINE_ALLOC_EVENT(xfs_alloc_cur_left); DEFINE_ALLOC_EVENT(xfs_alloc_cur_lookup); DEFINE_ALLOC_EVENT(xfs_alloc_cur_lookup_done); DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry); DEFINE_ALLOC_EVENT(xfs_alloc_near_busy); DEFINE_ALLOC_EVENT(xfs_alloc_size_neither); DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry); DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft); DEFINE_ALLOC_EVENT(xfs_alloc_size_done); DEFINE_ALLOC_EVENT(xfs_alloc_size_error); DEFINE_ALLOC_EVENT(xfs_alloc_size_busy); DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist); DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough); DEFINE_ALLOC_EVENT(xfs_alloc_small_done); DEFINE_ALLOC_EVENT(xfs_alloc_small_error); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_badargs); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_skip_deadlock); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_nofix); 
DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_this_ag); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_start_ag); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_first_ag); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_exact_bno); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_near_bno); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_finish); TRACE_EVENT(xfs_alloc_cur_check, TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, xfs_extlen_t diff, bool new), TP_ARGS(cur, bno, len, diff, new), TP_STRUCT__entry( __field(dev_t, dev) __string(name, cur->bc_ops->name) __field(xfs_agblock_t, bno) __field(xfs_extlen_t, len) __field(xfs_extlen_t, diff) __field(bool, new) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; __assign_str(name); __entry->bno = bno; __entry->len = len; __entry->diff = diff; __entry->new = new; ), TP_printk("dev %d:%d %sbt agbno 0x%x fsbcount 0x%x diff 0x%x new %d", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(name), __entry->bno, __entry->len, __entry->diff, __entry->new) ) DECLARE_EVENT_CLASS(xfs_da_class, TP_PROTO(struct xfs_da_args *args), TP_ARGS(args), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __dynamic_array(char, name, args->namelen) __field(int, namelen) __field(xfs_dahash_t, hashval) __field(xfs_ino_t, inumber) __field(uint32_t, op_flags) __field(xfs_ino_t, owner) ), TP_fast_assign( __entry->dev = VFS_I(args->dp)->i_sb->s_dev; __entry->ino = args->dp->i_ino; if (args->namelen) memcpy(__get_str(name), args->name, args->namelen); __entry->namelen = args->namelen; __entry->hashval = args->hashval; __entry->inumber = args->inumber; __entry->op_flags = args->op_flags; __entry->owner = args->owner; ), TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x " "inumber 0x%llx op_flags %s owner 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->namelen, __entry->namelen ? 
__get_str(name) : NULL, __entry->namelen, __entry->hashval, __entry->inumber, __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS), __entry->owner) ) #define DEFINE_DIR2_EVENT(name) \ DEFINE_EVENT(xfs_da_class, name, \ TP_PROTO(struct xfs_da_args *args), \ TP_ARGS(args)) DEFINE_DIR2_EVENT(xfs_dir2_sf_addname); DEFINE_DIR2_EVENT(xfs_dir2_sf_create); DEFINE_DIR2_EVENT(xfs_dir2_sf_lookup); DEFINE_DIR2_EVENT(xfs_dir2_sf_replace); DEFINE_DIR2_EVENT(xfs_dir2_sf_removename); DEFINE_DIR2_EVENT(xfs_dir2_sf_toino4); DEFINE_DIR2_EVENT(xfs_dir2_sf_toino8); DEFINE_DIR2_EVENT(xfs_dir2_sf_to_block); DEFINE_DIR2_EVENT(xfs_dir2_block_addname); DEFINE_DIR2_EVENT(xfs_dir2_block_lookup); DEFINE_DIR2_EVENT(xfs_dir2_block_replace); DEFINE_DIR2_EVENT(xfs_dir2_block_removename); DEFINE_DIR2_EVENT(xfs_dir2_block_to_sf); DEFINE_DIR2_EVENT(xfs_dir2_block_to_leaf); DEFINE_DIR2_EVENT(xfs_dir2_leaf_addname); DEFINE_DIR2_EVENT(xfs_dir2_leaf_lookup); DEFINE_DIR2_EVENT(xfs_dir2_leaf_replace); DEFINE_DIR2_EVENT(xfs_dir2_leaf_removename); DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_block); DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_node); DEFINE_DIR2_EVENT(xfs_dir2_node_addname); DEFINE_DIR2_EVENT(xfs_dir2_node_lookup); DEFINE_DIR2_EVENT(xfs_dir2_node_replace); DEFINE_DIR2_EVENT(xfs_dir2_node_removename); DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf); DECLARE_EVENT_CLASS(xfs_attr_class, TP_PROTO(struct xfs_da_args *args), TP_ARGS(args), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __dynamic_array(char, name, args->namelen) __field(int, namelen) __field(int, valuelen) __field(xfs_dahash_t, hashval) __field(unsigned int, attr_filter) __field(uint32_t, op_flags) ), TP_fast_assign( __entry->dev = VFS_I(args->dp)->i_sb->s_dev; __entry->ino = args->dp->i_ino; if (args->namelen) memcpy(__get_str(name), args->name, args->namelen); __entry->namelen = args->namelen; __entry->valuelen = args->valuelen; __entry->hashval = args->hashval; __entry->attr_filter = args->attr_filter; __entry->op_flags = args->op_flags; ), TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d " "hashval 0x%x filter %s op_flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->namelen, __entry->namelen ? 
__get_str(name) : NULL, __entry->namelen, __entry->valuelen, __entry->hashval, __print_flags(__entry->attr_filter, "|", XFS_ATTR_FILTER_FLAGS), __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) ) #define DEFINE_ATTR_EVENT(name) \ DEFINE_EVENT(xfs_attr_class, name, \ TP_PROTO(struct xfs_da_args *args), \ TP_ARGS(args)) DEFINE_ATTR_EVENT(xfs_attr_sf_add); DEFINE_ATTR_EVENT(xfs_attr_sf_addname); DEFINE_ATTR_EVENT(xfs_attr_sf_create); DEFINE_ATTR_EVENT(xfs_attr_sf_lookup); DEFINE_ATTR_EVENT(xfs_attr_sf_remove); DEFINE_ATTR_EVENT(xfs_attr_sf_replace); DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf); DEFINE_ATTR_EVENT(xfs_attr_leaf_add); DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old); DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new); DEFINE_ATTR_EVENT(xfs_attr_leaf_add_work); DEFINE_ATTR_EVENT(xfs_attr_leaf_create); DEFINE_ATTR_EVENT(xfs_attr_leaf_compact); DEFINE_ATTR_EVENT(xfs_attr_leaf_get); DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup); DEFINE_ATTR_EVENT(xfs_attr_leaf_replace); DEFINE_ATTR_EVENT(xfs_attr_leaf_remove); DEFINE_ATTR_EVENT(xfs_attr_leaf_removename); DEFINE_ATTR_EVENT(xfs_attr_leaf_split); DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before); DEFINE_ATTR_EVENT(xfs_attr_leaf_split_after); DEFINE_ATTR_EVENT(xfs_attr_leaf_clearflag); DEFINE_ATTR_EVENT(xfs_attr_leaf_setflag); DEFINE_ATTR_EVENT(xfs_attr_leaf_flipflags); DEFINE_ATTR_EVENT(xfs_attr_leaf_to_sf); DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node); DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance); DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance); DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall); DEFINE_ATTR_EVENT(xfs_attr_node_addname); DEFINE_ATTR_EVENT(xfs_attr_node_get); DEFINE_ATTR_EVENT(xfs_attr_node_replace); DEFINE_ATTR_EVENT(xfs_attr_rmtval_get); #define DEFINE_DA_EVENT(name) \ DEFINE_EVENT(xfs_da_class, name, \ TP_PROTO(struct xfs_da_args *args), \ TP_ARGS(args)) DEFINE_DA_EVENT(xfs_da_split); DEFINE_DA_EVENT(xfs_da_join); DEFINE_DA_EVENT(xfs_da_link_before); DEFINE_DA_EVENT(xfs_da_link_after); DEFINE_DA_EVENT(xfs_da_unlink_back); DEFINE_DA_EVENT(xfs_da_unlink_forward); DEFINE_DA_EVENT(xfs_da_root_split); DEFINE_DA_EVENT(xfs_da_root_join); DEFINE_DA_EVENT(xfs_da_node_add); DEFINE_DA_EVENT(xfs_da_node_create); DEFINE_DA_EVENT(xfs_da_node_split); DEFINE_DA_EVENT(xfs_da_node_remove); DEFINE_DA_EVENT(xfs_da_node_rebalance); DEFINE_DA_EVENT(xfs_da_node_unbalance); DEFINE_DA_EVENT(xfs_da_node_toosmall); DEFINE_DA_EVENT(xfs_da_swap_lastblock); DEFINE_DA_EVENT(xfs_da_grow_inode); DEFINE_DA_EVENT(xfs_da_shrink_inode); DEFINE_DA_EVENT(xfs_da_fixhashpath); DEFINE_DA_EVENT(xfs_da_path_shift); DECLARE_EVENT_CLASS(xfs_dir2_space_class, TP_PROTO(struct xfs_da_args *args, int idx), TP_ARGS(args, idx), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(uint32_t, op_flags) __field(int, idx) ), TP_fast_assign( __entry->dev = VFS_I(args->dp)->i_sb->s_dev; __entry->ino = args->dp->i_ino; __entry->op_flags = args->op_flags; __entry->idx = idx; ), TP_printk("dev %d:%d ino 0x%llx op_flags %s index %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS), __entry->idx) ) #define DEFINE_DIR2_SPACE_EVENT(name) \ DEFINE_EVENT(xfs_dir2_space_class, name, \ TP_PROTO(struct xfs_da_args *args, int idx), \ TP_ARGS(args, idx)) DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_add); DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_remove); DEFINE_DIR2_SPACE_EVENT(xfs_dir2_grow_inode); DEFINE_DIR2_SPACE_EVENT(xfs_dir2_shrink_inode); TRACE_EVENT(xfs_dir2_leafn_moveents, TP_PROTO(struct xfs_da_args *args, int src_idx, int dst_idx, int count), 
TP_ARGS(args, src_idx, dst_idx, count), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(uint32_t, op_flags) __field(int, src_idx) __field(int, dst_idx) __field(int, count) ), TP_fast_assign( __entry->dev = VFS_I(args->dp)->i_sb->s_dev; __entry->ino = args->dp->i_ino; __entry->op_flags = args->op_flags; __entry->src_idx = src_idx; __entry->dst_idx = dst_idx; __entry->count = count; ), TP_printk("dev %d:%d ino 0x%llx op_flags %s " "src_idx %d dst_idx %d count %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS), __entry->src_idx, __entry->dst_idx, __entry->count) ); #define XFS_SWAPEXT_INODES \ { 0, "target" }, \ { 1, "temp" } TRACE_DEFINE_ENUM(XFS_DINODE_FMT_DEV); TRACE_DEFINE_ENUM(XFS_DINODE_FMT_LOCAL); TRACE_DEFINE_ENUM(XFS_DINODE_FMT_EXTENTS); TRACE_DEFINE_ENUM(XFS_DINODE_FMT_BTREE); TRACE_DEFINE_ENUM(XFS_DINODE_FMT_UUID); TRACE_DEFINE_ENUM(XFS_DINODE_FMT_META_BTREE); DECLARE_EVENT_CLASS(xfs_swap_extent_class, TP_PROTO(struct xfs_inode *ip, int which), TP_ARGS(ip, which), TP_STRUCT__entry( __field(dev_t, dev) __field(int, which) __field(xfs_ino_t, ino) __field(int, format) __field(xfs_extnum_t, nex) __field(int, broot_size) __field(int, fork_off) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->which = which; __entry->ino = ip->i_ino; __entry->format = ip->i_df.if_format; __entry->nex = ip->i_df.if_nextents; __entry->broot_size = ip->i_df.if_broot_bytes; __entry->fork_off = xfs_inode_fork_boff(ip); ), TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %llu, " "broot size %d, forkoff 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_symbolic(__entry->which, XFS_SWAPEXT_INODES), __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR), __entry->nex, __entry->broot_size, __entry->fork_off) ) #define DEFINE_SWAPEXT_EVENT(name) \ DEFINE_EVENT(xfs_swap_extent_class, name, \ TP_PROTO(struct xfs_inode *ip, int which), \ TP_ARGS(ip, which)) DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); TRACE_EVENT(xfs_log_recover, TP_PROTO(struct xlog *log, xfs_daddr_t headblk, xfs_daddr_t tailblk), TP_ARGS(log, headblk, tailblk), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_daddr_t, headblk) __field(xfs_daddr_t, tailblk) ), TP_fast_assign( __entry->dev = log->l_mp->m_super->s_dev; __entry->headblk = headblk; __entry->tailblk = tailblk; ), TP_printk("dev %d:%d headblk 0x%llx tailblk 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->headblk, __entry->tailblk) ) TRACE_EVENT(xfs_log_recover_record, TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass), TP_ARGS(log, rhead, pass), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_lsn_t, lsn) __field(int, len) __field(int, num_logops) __field(int, pass) ), TP_fast_assign( __entry->dev = log->l_mp->m_super->s_dev; __entry->lsn = be64_to_cpu(rhead->h_lsn); __entry->len = be32_to_cpu(rhead->h_len); __entry->num_logops = be32_to_cpu(rhead->h_num_logops); __entry->pass = pass; ), TP_printk("dev %d:%d lsn 0x%llx len 0x%x num_logops 0x%x pass %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->lsn, __entry->len, __entry->num_logops, __entry->pass) ) DECLARE_EVENT_CLASS(xfs_log_recover_item_class, TP_PROTO(struct xlog *log, struct xlog_recover *trans, struct xlog_recover_item *item, int pass), TP_ARGS(log, trans, item, pass), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned long, item) __field(xlog_tid_t, tid) __field(xfs_lsn_t, lsn) __field(int, type) 
__field(int, pass) __field(int, count) __field(int, total) ), TP_fast_assign( __entry->dev = log->l_mp->m_super->s_dev; __entry->item = (unsigned long)item; __entry->tid = trans->r_log_tid; __entry->lsn = trans->r_lsn; __entry->type = ITEM_TYPE(item); __entry->pass = pass; __entry->count = item->ri_cnt; __entry->total = item->ri_total; ), TP_printk("dev %d:%d tid 0x%x lsn 0x%llx, pass %d, item %p, " "item type %s item region count/total %d/%d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, __entry->lsn, __entry->pass, (void *)__entry->item, __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), __entry->count, __entry->total) ) #define DEFINE_LOG_RECOVER_ITEM(name) \ DEFINE_EVENT(xfs_log_recover_item_class, name, \ TP_PROTO(struct xlog *log, struct xlog_recover *trans, \ struct xlog_recover_item *item, int pass), \ TP_ARGS(log, trans, item, pass)) DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add); DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add_cont); DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_head); DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail); DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover); DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class, TP_PROTO(struct xlog *log, struct xfs_buf_log_format *buf_f), TP_ARGS(log, buf_f), TP_STRUCT__entry( __field(dev_t, dev) __field(int64_t, blkno) __field(unsigned short, len) __field(unsigned short, flags) __field(unsigned short, size) __field(unsigned int, map_size) ), TP_fast_assign( __entry->dev = log->l_mp->m_super->s_dev; __entry->blkno = buf_f->blf_blkno; __entry->len = buf_f->blf_len; __entry->flags = buf_f->blf_flags; __entry->size = buf_f->blf_size; __entry->map_size = buf_f->blf_map_size; ), TP_printk("dev %d:%d daddr 0x%llx, bbcount 0x%x, flags 0x%x, size %d, " "map_size %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blkno, __entry->len, __entry->flags, __entry->size, __entry->map_size) ) #define DEFINE_LOG_RECOVER_BUF_ITEM(name) \ DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \ TP_PROTO(struct xlog *log, struct xfs_buf_log_format *buf_f), \ TP_ARGS(log, buf_f)) DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_skip); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf); DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class, TP_PROTO(struct xlog *log, struct xfs_inode_log_format *in_f), TP_ARGS(log, in_f), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(unsigned short, size) __field(int, fields) __field(unsigned short, asize) __field(unsigned short, dsize) __field(int64_t, blkno) __field(int, len) __field(int, boffset) ), TP_fast_assign( __entry->dev = log->l_mp->m_super->s_dev; __entry->ino = in_f->ilf_ino; __entry->size = in_f->ilf_size; __entry->fields = in_f->ilf_fields; __entry->asize = in_f->ilf_asize; __entry->dsize = in_f->ilf_dsize; __entry->blkno = in_f->ilf_blkno; __entry->len = in_f->ilf_len; __entry->boffset = in_f->ilf_boffset; ), TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, " "dsize %d, daddr 0x%llx, bbcount 0x%x, boffset %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, __entry->fields, 
__entry->asize, __entry->dsize, __entry->blkno, __entry->len, __entry->boffset) ) #define DEFINE_LOG_RECOVER_INO_ITEM(name) \ DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \ TP_PROTO(struct xlog *log, struct xfs_inode_log_format *in_f), \ TP_ARGS(log, in_f)) DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover); DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); DECLARE_EVENT_CLASS(xfs_log_recover_icreate_item_class, TP_PROTO(struct xlog *log, struct xfs_icreate_log *in_f), TP_ARGS(log, in_f), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(unsigned int, count) __field(unsigned int, isize) __field(xfs_agblock_t, length) __field(unsigned int, gen) ), TP_fast_assign( __entry->dev = log->l_mp->m_super->s_dev; __entry->agno = be32_to_cpu(in_f->icl_ag); __entry->agbno = be32_to_cpu(in_f->icl_agbno); __entry->count = be32_to_cpu(in_f->icl_count); __entry->isize = be32_to_cpu(in_f->icl_isize); __entry->length = be32_to_cpu(in_f->icl_length); __entry->gen = be32_to_cpu(in_f->icl_gen); ), TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x ireccount %u isize %u gen 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, __entry->length, __entry->count, __entry->isize, __entry->gen) ) #define DEFINE_LOG_RECOVER_ICREATE_ITEM(name) \ DEFINE_EVENT(xfs_log_recover_icreate_item_class, name, \ TP_PROTO(struct xlog *log, struct xfs_icreate_log *in_f), \ TP_ARGS(log, in_f)) DEFINE_LOG_RECOVER_ICREATE_ITEM(xfs_log_recover_icreate_cancel); DEFINE_LOG_RECOVER_ICREATE_ITEM(xfs_log_recover_icreate_recover); DECLARE_EVENT_CLASS(xfs_discard_class, TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, xfs_extlen_t len), TP_ARGS(xg, agbno, len), TP_STRUCT__entry( __field(dev_t, dev) __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) ), TP_fast_assign( __entry->dev = xg->xg_mount->m_super->s_dev; __entry->type = xg->xg_type; __entry->agno = xg->xg_gno; __entry->agbno = agbno; __entry->len = len; ), TP_printk("dev %d:%d %sno 0x%x gbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __entry->agbno, __entry->len) ) #define DEFINE_DISCARD_EVENT(name) \ DEFINE_EVENT(xfs_discard_class, name, \ TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, \ xfs_extlen_t len), \ TP_ARGS(xg, agbno, len)) DEFINE_DISCARD_EVENT(xfs_discard_extent); DEFINE_DISCARD_EVENT(xfs_discard_toosmall); DEFINE_DISCARD_EVENT(xfs_discard_exclude); DEFINE_DISCARD_EVENT(xfs_discard_busy); DECLARE_EVENT_CLASS(xfs_rtdiscard_class, TP_PROTO(struct xfs_mount *mp, xfs_rtblock_t rtbno, xfs_rtblock_t len), TP_ARGS(mp, rtbno, len), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_rtblock_t, rtbno) __field(xfs_rtblock_t, len) ), TP_fast_assign( __entry->dev = mp->m_rtdev_targp->bt_dev; __entry->rtbno = rtbno; __entry->len = len; ), TP_printk("dev %d:%d rtbno 0x%llx rtbcount 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rtbno, __entry->len) ) #define DEFINE_RTDISCARD_EVENT(name) \ DEFINE_EVENT(xfs_rtdiscard_class, name, \ TP_PROTO(struct xfs_mount *mp, \ xfs_rtblock_t rtbno, xfs_rtblock_t len), \ TP_ARGS(mp, rtbno, len)) DEFINE_RTDISCARD_EVENT(xfs_discard_rtextent); DEFINE_RTDISCARD_EVENT(xfs_discard_rttoosmall); DECLARE_EVENT_CLASS(xfs_btree_cur_class, TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp), TP_ARGS(cur, 
level, bp), TP_STRUCT__entry( __field(dev_t, dev) __string(name, cur->bc_ops->name) __field(int, level) __field(int, nlevels) __field(int, ptr) __field(xfs_daddr_t, daddr) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; __assign_str(name); __entry->level = level; __entry->nlevels = cur->bc_nlevels; __entry->ptr = cur->bc_levels[level].ptr; __entry->daddr = bp ? xfs_buf_daddr(bp) : -1; ), TP_printk("dev %d:%d %sbt level %d/%d ptr %d daddr 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(name), __entry->level, __entry->nlevels, __entry->ptr, (unsigned long long)__entry->daddr) ) #define DEFINE_BTREE_CUR_EVENT(name) \ DEFINE_EVENT(xfs_btree_cur_class, name, \ TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp), \ TP_ARGS(cur, level, bp)) DEFINE_BTREE_CUR_EVENT(xfs_btree_updkeys); DEFINE_BTREE_CUR_EVENT(xfs_btree_overlapped_query_range); TRACE_EVENT(xfs_btree_alloc_block, TP_PROTO(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, int stat, int error), TP_ARGS(cur, ptr, stat, error), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_ino_t, ino) __string(name, cur->bc_ops->name) __field(int, error) __field(xfs_agblock_t, agbno) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; switch (cur->bc_ops->type) { case XFS_BTREE_TYPE_INODE: __entry->agno = 0; __entry->ino = cur->bc_ino.ip->i_ino; break; case XFS_BTREE_TYPE_AG: __entry->agno = cur->bc_group->xg_gno; __entry->ino = 0; break; case XFS_BTREE_TYPE_MEM: __entry->agno = 0; __entry->ino = 0; break; } __assign_str(name); __entry->error = error; if (!error && stat) { if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { xfs_fsblock_t fsb = be64_to_cpu(ptr->l); __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsb); __entry->agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsb); } else { __entry->agbno = be32_to_cpu(ptr->s); } } else { __entry->agbno = NULLAGBLOCK; } ), TP_printk("dev %d:%d %sbt agno 0x%x ino 0x%llx agbno 0x%x error %d", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(name), __entry->agno, __entry->ino, __entry->agbno, __entry->error) ); TRACE_EVENT(xfs_btree_free_block, TP_PROTO(struct xfs_btree_cur *cur, struct xfs_buf *bp), TP_ARGS(cur, bp), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_ino_t, ino) __string(name, cur->bc_ops->name) __field(xfs_agblock_t, agbno) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; __entry->agno = xfs_daddr_to_agno(cur->bc_mp, xfs_buf_daddr(bp)); if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) __entry->ino = cur->bc_ino.ip->i_ino; else __entry->ino = 0; __assign_str(name); __entry->agbno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp)); ), TP_printk("dev %d:%d %sbt agno 0x%x ino 0x%llx agbno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(name), __entry->agno, __entry->ino, __entry->agbno) ); /* deferred ops */ struct xfs_defer_pending; DECLARE_EVENT_CLASS(xfs_defer_class, TP_PROTO(struct xfs_trans *tp, unsigned long caller_ip), TP_ARGS(tp, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(struct xfs_trans *, tp) __field(char, committed) __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = tp->t_mountp->m_super->s_dev; __entry->tp = tp; __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d tp %p caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tp, (char *)__entry->caller_ip) ) #define DEFINE_DEFER_EVENT(name) \ DEFINE_EVENT(xfs_defer_class, name, \ TP_PROTO(struct xfs_trans *tp, unsigned long caller_ip), \ TP_ARGS(tp, caller_ip)) 
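/*
 * Usage sketch for the deferred-op events (a minimal illustration, assuming
 * the usual tracepoint calling convention): DEFINE_DEFER_EVENT() only binds
 * a name to xfs_defer_class; the instances (xfs_defer_cancel,
 * xfs_defer_finish, ...) follow below.  A call site passes the transaction
 * and the caller's return address, e.g.:
 *
 *	trace_xfs_defer_finish(tp, _RET_IP_);
 *
 * _RET_IP_ supplies the caller_ip that TP_printk() renders with %pS.  At
 * runtime the events are read through tracefs like any other xfs
 * tracepoint, for example (paths assume the default tracefs mount):
 *
 *	echo 1 > /sys/kernel/tracing/events/xfs/xfs_defer_finish/enable
 *	cat /sys/kernel/tracing/trace_pipe
 */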
DECLARE_EVENT_CLASS(xfs_defer_error_class, TP_PROTO(struct xfs_trans *tp, int error), TP_ARGS(tp, error), TP_STRUCT__entry( __field(dev_t, dev) __field(struct xfs_trans *, tp) __field(char, committed) __field(int, error) ), TP_fast_assign( __entry->dev = tp->t_mountp->m_super->s_dev; __entry->tp = tp; __entry->error = error; ), TP_printk("dev %d:%d tp %p err %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tp, __entry->error) ) #define DEFINE_DEFER_ERROR_EVENT(name) \ DEFINE_EVENT(xfs_defer_error_class, name, \ TP_PROTO(struct xfs_trans *tp, int error), \ TP_ARGS(tp, error)) DECLARE_EVENT_CLASS(xfs_defer_pending_class, TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp), TP_ARGS(mp, dfp), TP_STRUCT__entry( __field(dev_t, dev) __string(name, dfp->dfp_ops->name) __field(void *, intent) __field(unsigned int, flags) __field(char, committed) __field(int, nr) ), TP_fast_assign( __entry->dev = mp ? mp->m_super->s_dev : 0; __assign_str(name); __entry->intent = dfp->dfp_intent; __entry->flags = dfp->dfp_flags; __entry->committed = dfp->dfp_done != NULL; __entry->nr = dfp->dfp_count; ), TP_printk("dev %d:%d optype %s intent %p flags %s committed %d nr %d", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(name), __entry->intent, __print_flags(__entry->flags, "|", XFS_DEFER_PENDING_STRINGS), __entry->committed, __entry->nr) ) #define DEFINE_DEFER_PENDING_EVENT(name) \ DEFINE_EVENT(xfs_defer_pending_class, name, \ TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp), \ TP_ARGS(mp, dfp)) DEFINE_DEFER_EVENT(xfs_defer_cancel); DEFINE_DEFER_EVENT(xfs_defer_trans_roll); DEFINE_DEFER_EVENT(xfs_defer_trans_abort); DEFINE_DEFER_EVENT(xfs_defer_finish); DEFINE_DEFER_EVENT(xfs_defer_finish_done); DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error); DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error); DEFINE_DEFER_PENDING_EVENT(xfs_defer_create_intent); DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort); DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent); DEFINE_DEFER_PENDING_EVENT(xfs_defer_isolate_paused); DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_pause); DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_unpause); DECLARE_EVENT_CLASS(xfs_free_extent_deferred_class, TP_PROTO(struct xfs_mount *mp, struct xfs_extent_free_item *free), TP_ARGS(mp, free), TP_STRUCT__entry( __field(dev_t, dev) __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) __field(unsigned int, flags) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->type = free->xefi_group->xg_type; __entry->agno = free->xefi_group->xg_gno; __entry->agbno = xfs_fsb_to_gbno(mp, free->xefi_startblock, free->xefi_group->xg_type); __entry->len = free->xefi_blockcount; __entry->flags = free->xefi_flags; ), TP_printk("dev %d:%d %sno 0x%x gbno 0x%x fsbcount 0x%x flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __entry->agbno, __entry->len, __entry->flags) ); #define DEFINE_FREE_EXTENT_DEFERRED_EVENT(name) \ DEFINE_EVENT(xfs_free_extent_deferred_class, name, \ TP_PROTO(struct xfs_mount *mp, struct xfs_extent_free_item *free), \ TP_ARGS(mp, free)) DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_agfl_free_deferred); DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_extent_free_defer); DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_extent_free_deferred); DECLARE_EVENT_CLASS(xfs_defer_pending_item_class, TP_PROTO(struct xfs_mount 
*mp, struct xfs_defer_pending *dfp, void *item), TP_ARGS(mp, dfp, item), TP_STRUCT__entry( __field(dev_t, dev) __string(name, dfp->dfp_ops->name) __field(void *, intent) __field(void *, item) __field(char, committed) __field(unsigned int, flags) __field(int, nr) ), TP_fast_assign( __entry->dev = mp ? mp->m_super->s_dev : 0; __assign_str(name); __entry->intent = dfp->dfp_intent; __entry->item = item; __entry->committed = dfp->dfp_done != NULL; __entry->flags = dfp->dfp_flags; __entry->nr = dfp->dfp_count; ), TP_printk("dev %d:%d optype %s intent %p item %p flags %s committed %d nr %d", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(name), __entry->intent, __entry->item, __print_flags(__entry->flags, "|", XFS_DEFER_PENDING_STRINGS), __entry->committed, __entry->nr) ) #define DEFINE_DEFER_PENDING_ITEM_EVENT(name) \ DEFINE_EVENT(xfs_defer_pending_item_class, name, \ TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp, \ void *item), \ TP_ARGS(mp, dfp, item)) DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_add_item); DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_cancel_item); DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_finish_item); /* rmap tracepoints */ DECLARE_EVENT_CLASS(xfs_rmap_class, TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t gbno, xfs_extlen_t len, bool unwritten, const struct xfs_owner_info *oinfo), TP_ARGS(cur, gbno, len, unwritten, oinfo), TP_STRUCT__entry( __field(dev_t, dev) __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, gbno) __field(xfs_extlen_t, len) __field(uint64_t, owner) __field(uint64_t, offset) __field(unsigned long, flags) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; __entry->type = cur->bc_group->xg_type; __entry->agno = cur->bc_group->xg_gno; __entry->gbno = gbno; __entry->len = len; __entry->owner = oinfo->oi_owner; __entry->offset = oinfo->oi_offset; __entry->flags = oinfo->oi_flags; if (unwritten) __entry->flags |= XFS_RMAP_UNWRITTEN; ), TP_printk("dev %d:%d %sno 0x%x gbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%lx", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __entry->gbno, __entry->len, __entry->owner, __entry->offset, __entry->flags) ); #define DEFINE_RMAP_EVENT(name) \ DEFINE_EVENT(xfs_rmap_class, name, \ TP_PROTO(struct xfs_btree_cur *cur, \ xfs_agblock_t gbno, xfs_extlen_t len, bool unwritten, \ const struct xfs_owner_info *oinfo), \ TP_ARGS(cur, gbno, len, unwritten, oinfo)) /* btree cursor error/%ip tracepoint class */ DECLARE_EVENT_CLASS(xfs_btree_error_class, TP_PROTO(struct xfs_btree_cur *cur, int error, unsigned long caller_ip), TP_ARGS(cur, error, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_ino_t, ino) __field(int, error) __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; switch (cur->bc_ops->type) { case XFS_BTREE_TYPE_INODE: __entry->agno = 0; __entry->ino = cur->bc_ino.ip->i_ino; break; case XFS_BTREE_TYPE_AG: __entry->agno = cur->bc_group->xg_gno; __entry->ino = 0; break; case XFS_BTREE_TYPE_MEM: __entry->agno = 0; __entry->ino = 0; break; } __entry->error = error; __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d agno 0x%x ino 0x%llx error %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->ino, __entry->error, (char *)__entry->caller_ip) ); #define DEFINE_BTREE_ERROR_EVENT(name) \ DEFINE_EVENT(xfs_btree_error_class, name, \ TP_PROTO(struct xfs_btree_cur *cur, int error, \ 
unsigned long caller_ip), \ TP_ARGS(cur, error, caller_ip)) DEFINE_RMAP_EVENT(xfs_rmap_unmap); DEFINE_RMAP_EVENT(xfs_rmap_unmap_done); DEFINE_BTREE_ERROR_EVENT(xfs_rmap_unmap_error); DEFINE_RMAP_EVENT(xfs_rmap_map); DEFINE_RMAP_EVENT(xfs_rmap_map_done); DEFINE_BTREE_ERROR_EVENT(xfs_rmap_map_error); DEFINE_RMAP_EVENT(xfs_rmap_convert); DEFINE_RMAP_EVENT(xfs_rmap_convert_done); DEFINE_BTREE_ERROR_EVENT(xfs_rmap_convert_error); TRACE_EVENT(xfs_rmap_convert_state, TP_PROTO(struct xfs_btree_cur *cur, int state, unsigned long caller_ip), TP_ARGS(cur, state, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(int, state) __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; __entry->type = cur->bc_group->xg_type; __entry->agno = cur->bc_group->xg_gno; __entry->state = state; __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d %sno 0x%x state %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __entry->state, (char *)__entry->caller_ip) ); DECLARE_EVENT_CLASS(xfs_rmapbt_class, TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t gbno, xfs_extlen_t len, uint64_t owner, uint64_t offset, unsigned int flags), TP_ARGS(cur, gbno, len, owner, offset, flags), TP_STRUCT__entry( __field(dev_t, dev) __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, gbno) __field(xfs_extlen_t, len) __field(uint64_t, owner) __field(uint64_t, offset) __field(unsigned int, flags) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; __entry->type = cur->bc_group->xg_type; __entry->agno = cur->bc_group->xg_gno; __entry->gbno = gbno; __entry->len = len; __entry->owner = owner; __entry->offset = offset; __entry->flags = flags; ), TP_printk("dev %d:%d %sno 0x%x gbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __entry->gbno, __entry->len, __entry->owner, __entry->offset, __entry->flags) ); #define DEFINE_RMAPBT_EVENT(name) \ DEFINE_EVENT(xfs_rmapbt_class, name, \ TP_PROTO(struct xfs_btree_cur *cur, \ xfs_agblock_t gbno, xfs_extlen_t len, \ uint64_t owner, uint64_t offset, unsigned int flags), \ TP_ARGS(cur, gbno, len, owner, offset, flags)) TRACE_DEFINE_ENUM(XFS_RMAP_MAP); TRACE_DEFINE_ENUM(XFS_RMAP_MAP_SHARED); TRACE_DEFINE_ENUM(XFS_RMAP_UNMAP); TRACE_DEFINE_ENUM(XFS_RMAP_UNMAP_SHARED); TRACE_DEFINE_ENUM(XFS_RMAP_CONVERT); TRACE_DEFINE_ENUM(XFS_RMAP_CONVERT_SHARED); TRACE_DEFINE_ENUM(XFS_RMAP_ALLOC); TRACE_DEFINE_ENUM(XFS_RMAP_FREE); DECLARE_EVENT_CLASS(xfs_rmap_deferred_class, TP_PROTO(struct xfs_mount *mp, struct xfs_rmap_intent *ri), TP_ARGS(mp, ri), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned long long, owner) __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, gbno) __field(int, whichfork) __field(xfs_fileoff_t, l_loff) __field(xfs_filblks_t, l_len) __field(xfs_exntst_t, l_state) __field(int, op) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->type = ri->ri_group->xg_type; __entry->agno = ri->ri_group->xg_gno; __entry->gbno = xfs_fsb_to_gbno(mp, ri->ri_bmap.br_startblock, ri->ri_group->xg_type); __entry->owner = ri->ri_owner; __entry->whichfork = ri->ri_whichfork; __entry->l_loff = ri->ri_bmap.br_startoff; __entry->l_len = ri->ri_bmap.br_blockcount; __entry->l_state = ri->ri_bmap.br_state; __entry->op = ri->ri_type; ), 
TP_printk("dev %d:%d op %s %sno 0x%x gbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->op, XFS_RMAP_INTENT_STRINGS), __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __entry->gbno, __entry->owner, __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __entry->l_loff, __entry->l_len, __entry->l_state) ); #define DEFINE_RMAP_DEFERRED_EVENT(name) \ DEFINE_EVENT(xfs_rmap_deferred_class, name, \ TP_PROTO(struct xfs_mount *mp, struct xfs_rmap_intent *ri), \ TP_ARGS(mp, ri)) DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_defer); DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_deferred); DEFINE_RMAPBT_EVENT(xfs_rmap_update); DEFINE_RMAPBT_EVENT(xfs_rmap_insert); DEFINE_RMAPBT_EVENT(xfs_rmap_delete); DEFINE_BTREE_ERROR_EVENT(xfs_rmap_insert_error); DEFINE_BTREE_ERROR_EVENT(xfs_rmap_delete_error); DEFINE_BTREE_ERROR_EVENT(xfs_rmap_update_error); DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_candidate); DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_query); DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_candidate); DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range); DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_result); DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result); DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result); /* deferred bmbt updates */ TRACE_DEFINE_ENUM(XFS_BMAP_MAP); TRACE_DEFINE_ENUM(XFS_BMAP_UNMAP); DECLARE_EVENT_CLASS(xfs_bmap_deferred_class, TP_PROTO(struct xfs_bmap_intent *bi), TP_ARGS(bi), TP_STRUCT__entry( __field(dev_t, dev) __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_ino_t, ino) __field(unsigned long long, gbno) __field(int, whichfork) __field(xfs_fileoff_t, l_loff) __field(xfs_filblks_t, l_len) __field(xfs_exntst_t, l_state) __field(int, op) ), TP_fast_assign( struct xfs_inode *ip = bi->bi_owner; struct xfs_mount *mp = ip->i_mount; __entry->dev = mp->m_super->s_dev; __entry->type = bi->bi_group->xg_type; __entry->agno = bi->bi_group->xg_gno; if (bi->bi_group->xg_type == XG_TYPE_RTG && !xfs_has_rtgroups(mp)) { /* * Legacy rt filesystems do not have allocation groups * ondisk. We emulate this incore with one gigantic * rtgroup whose size can exceed a 32-bit block number. * For this tracepoint, we report group 0 and a 64-bit * group block number. 
*/ __entry->gbno = bi->bi_bmap.br_startblock; } else { __entry->gbno = xfs_fsb_to_gbno(mp, bi->bi_bmap.br_startblock, bi->bi_group->xg_type); } __entry->ino = ip->i_ino; __entry->whichfork = bi->bi_whichfork; __entry->l_loff = bi->bi_bmap.br_startoff; __entry->l_len = bi->bi_bmap.br_blockcount; __entry->l_state = bi->bi_bmap.br_state; __entry->op = bi->bi_type; ), TP_printk("dev %d:%d op %s ino 0x%llx %sno 0x%x gbno 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->op, XFS_BMAP_INTENT_STRINGS), __entry->ino, __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __entry->gbno, __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __entry->l_loff, __entry->l_len, __entry->l_state) ); #define DEFINE_BMAP_DEFERRED_EVENT(name) \ DEFINE_EVENT(xfs_bmap_deferred_class, name, \ TP_PROTO(struct xfs_bmap_intent *bi), \ TP_ARGS(bi)) DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_defer); DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_deferred); /* per-AG reservation */ DECLARE_EVENT_CLASS(xfs_ag_resv_class, TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type resv, xfs_extlen_t len), TP_ARGS(pag, resv, len), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(int, resv) __field(xfs_extlen_t, freeblks) __field(xfs_extlen_t, flcount) __field(xfs_extlen_t, reserved) __field(xfs_extlen_t, asked) __field(xfs_extlen_t, len) ), TP_fast_assign( struct xfs_ag_resv *r = xfs_perag_resv(pag, resv); __entry->dev = pag_mount(pag)->m_super->s_dev; __entry->agno = pag_agno(pag); __entry->resv = resv; __entry->freeblks = pag->pagf_freeblks; __entry->flcount = pag->pagf_flcount; __entry->reserved = r ? r->ar_reserved : 0; __entry->asked = r ? r->ar_asked : 0; __entry->len = len; ), TP_printk("dev %d:%d agno 0x%x resv %d freeblks %u flcount %u " "resv %u ask %u len %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->resv, __entry->freeblks, __entry->flcount, __entry->reserved, __entry->asked, __entry->len) ) #define DEFINE_AG_RESV_EVENT(name) \ DEFINE_EVENT(xfs_ag_resv_class, name, \ TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type type, \ xfs_extlen_t len), \ TP_ARGS(pag, type, len)) /* per-AG reservation tracepoints */ DEFINE_AG_RESV_EVENT(xfs_ag_resv_init); DEFINE_AG_RESV_EVENT(xfs_ag_resv_free); DEFINE_AG_RESV_EVENT(xfs_ag_resv_alloc_extent); DEFINE_AG_RESV_EVENT(xfs_ag_resv_free_extent); DEFINE_AG_RESV_EVENT(xfs_ag_resv_critical); DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed); TRACE_EVENT(xfs_ag_resv_init_error, TP_PROTO(const struct xfs_perag *pag, int error, unsigned long caller_ip), TP_ARGS(pag, error, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(int, error) __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = pag_mount(pag)->m_super->s_dev; __entry->agno = pag_agno(pag); __entry->error = error; __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d agno 0x%x error %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->error, (char *)__entry->caller_ip) ); /* refcount tracepoint classes */ DECLARE_EVENT_CLASS(xfs_refcount_class, TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t gbno, xfs_extlen_t len), TP_ARGS(cur, gbno, len), TP_STRUCT__entry( __field(dev_t, dev) __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, gbno) __field(xfs_extlen_t, len) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; __entry->type = cur->bc_group->xg_type; __entry->agno = cur->bc_group->xg_gno; 
__entry->gbno = gbno; __entry->len = len; ), TP_printk("dev %d:%d %sno 0x%x gbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __entry->gbno, __entry->len) ); #define DEFINE_REFCOUNT_EVENT(name) \ DEFINE_EVENT(xfs_refcount_class, name, \ TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t gbno, \ xfs_extlen_t len), \ TP_ARGS(cur, gbno, len)) TRACE_DEFINE_ENUM(XFS_LOOKUP_EQi); TRACE_DEFINE_ENUM(XFS_LOOKUP_LEi); TRACE_DEFINE_ENUM(XFS_LOOKUP_GEi); TRACE_EVENT(xfs_refcount_lookup, TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t gbno, xfs_lookup_t dir), TP_ARGS(cur, gbno, dir), TP_STRUCT__entry( __field(dev_t, dev) __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, gbno) __field(xfs_lookup_t, dir) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; __entry->type = cur->bc_group->xg_type; __entry->agno = cur->bc_group->xg_gno; __entry->gbno = gbno; __entry->dir = dir; ), TP_printk("dev %d:%d %sno 0x%x gbno 0x%x cmp %s(%d)", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __entry->gbno, __print_symbolic(__entry->dir, XFS_AG_BTREE_CMP_FORMAT_STR), __entry->dir) ) /* single-rcext tracepoint class */ DECLARE_EVENT_CLASS(xfs_refcount_extent_class, TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec), TP_ARGS(cur, irec), TP_STRUCT__entry( __field(dev_t, dev) __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(enum xfs_refc_domain, domain) __field(xfs_agblock_t, startblock) __field(xfs_extlen_t, blockcount) __field(xfs_nlink_t, refcount) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; __entry->type = cur->bc_group->xg_type; __entry->agno = cur->bc_group->xg_gno; __entry->domain = irec->rc_domain; __entry->startblock = irec->rc_startblock; __entry->blockcount = irec->rc_blockcount; __entry->refcount = irec->rc_refcount; ), TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), __entry->startblock, __entry->blockcount, __entry->refcount) ) #define DEFINE_REFCOUNT_EXTENT_EVENT(name) \ DEFINE_EVENT(xfs_refcount_extent_class, name, \ TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec), \ TP_ARGS(cur, irec)) /* single-rcext and an agbno tracepoint class */ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class, TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, xfs_agblock_t gbno), TP_ARGS(cur, irec, gbno), TP_STRUCT__entry( __field(dev_t, dev) __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(enum xfs_refc_domain, domain) __field(xfs_agblock_t, startblock) __field(xfs_extlen_t, blockcount) __field(xfs_nlink_t, refcount) __field(xfs_agblock_t, gbno) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; __entry->type = cur->bc_group->xg_type; __entry->agno = cur->bc_group->xg_gno; __entry->domain = irec->rc_domain; __entry->startblock = irec->rc_startblock; __entry->blockcount = irec->rc_blockcount; __entry->refcount = irec->rc_refcount; __entry->gbno = gbno; ), TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u @ gbno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), __entry->startblock, 
__entry->blockcount, __entry->refcount, __entry->gbno) ) #define DEFINE_REFCOUNT_EXTENT_AT_EVENT(name) \ DEFINE_EVENT(xfs_refcount_extent_at_class, name, \ TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, \ xfs_agblock_t gbno), \ TP_ARGS(cur, irec, gbno)) /* double-rcext tracepoint class */ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2), TP_ARGS(cur, i1, i2), TP_STRUCT__entry( __field(dev_t, dev) __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(enum xfs_refc_domain, i1_domain) __field(xfs_agblock_t, i1_startblock) __field(xfs_extlen_t, i1_blockcount) __field(xfs_nlink_t, i1_refcount) __field(enum xfs_refc_domain, i2_domain) __field(xfs_agblock_t, i2_startblock) __field(xfs_extlen_t, i2_blockcount) __field(xfs_nlink_t, i2_refcount) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; __entry->type = cur->bc_group->xg_type; __entry->agno = cur->bc_group->xg_gno; __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; __entry->i1_blockcount = i1->rc_blockcount; __entry->i1_refcount = i1->rc_refcount; __entry->i2_domain = i2->rc_domain; __entry->i2_startblock = i2->rc_startblock; __entry->i2_blockcount = i2->rc_blockcount; __entry->i2_refcount = i2->rc_refcount; ), TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u -- " "dom %s gbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i1_startblock, __entry->i1_blockcount, __entry->i1_refcount, __print_symbolic(__entry->i2_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i2_startblock, __entry->i2_blockcount, __entry->i2_refcount) ) #define DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(name) \ DEFINE_EVENT(xfs_refcount_double_extent_class, name, \ TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, \ struct xfs_refcount_irec *i2), \ TP_ARGS(cur, i1, i2)) /* double-rcext and an agbno tracepoint class */ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, xfs_agblock_t gbno), TP_ARGS(cur, i1, i2, gbno), TP_STRUCT__entry( __field(dev_t, dev) __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(enum xfs_refc_domain, i1_domain) __field(xfs_agblock_t, i1_startblock) __field(xfs_extlen_t, i1_blockcount) __field(xfs_nlink_t, i1_refcount) __field(enum xfs_refc_domain, i2_domain) __field(xfs_agblock_t, i2_startblock) __field(xfs_extlen_t, i2_blockcount) __field(xfs_nlink_t, i2_refcount) __field(xfs_agblock_t, gbno) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; __entry->type = cur->bc_group->xg_type; __entry->agno = cur->bc_group->xg_gno; __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; __entry->i1_blockcount = i1->rc_blockcount; __entry->i1_refcount = i1->rc_refcount; __entry->i2_domain = i2->rc_domain; __entry->i2_startblock = i2->rc_startblock; __entry->i2_blockcount = i2->rc_blockcount; __entry->i2_refcount = i2->rc_refcount; __entry->gbno = gbno; ), TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u -- " "dom %s gbno 0x%x fsbcount 0x%x refcount %u @ gbno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __print_symbolic(__entry->i1_domain, 
XFS_REFC_DOMAIN_STRINGS), __entry->i1_startblock, __entry->i1_blockcount, __entry->i1_refcount, __print_symbolic(__entry->i2_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i2_startblock, __entry->i2_blockcount, __entry->i2_refcount, __entry->gbno) ) #define DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(name) \ DEFINE_EVENT(xfs_refcount_double_extent_at_class, name, \ TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, \ struct xfs_refcount_irec *i2, xfs_agblock_t gbno), \ TP_ARGS(cur, i1, i2, gbno)) /* triple-rcext tracepoint class */ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, struct xfs_refcount_irec *i3), TP_ARGS(cur, i1, i2, i3), TP_STRUCT__entry( __field(dev_t, dev) __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(enum xfs_refc_domain, i1_domain) __field(xfs_agblock_t, i1_startblock) __field(xfs_extlen_t, i1_blockcount) __field(xfs_nlink_t, i1_refcount) __field(enum xfs_refc_domain, i2_domain) __field(xfs_agblock_t, i2_startblock) __field(xfs_extlen_t, i2_blockcount) __field(xfs_nlink_t, i2_refcount) __field(enum xfs_refc_domain, i3_domain) __field(xfs_agblock_t, i3_startblock) __field(xfs_extlen_t, i3_blockcount) __field(xfs_nlink_t, i3_refcount) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; __entry->type = cur->bc_group->xg_type; __entry->agno = cur->bc_group->xg_gno; __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; __entry->i1_blockcount = i1->rc_blockcount; __entry->i1_refcount = i1->rc_refcount; __entry->i2_domain = i2->rc_domain; __entry->i2_startblock = i2->rc_startblock; __entry->i2_blockcount = i2->rc_blockcount; __entry->i2_refcount = i2->rc_refcount; __entry->i3_domain = i3->rc_domain; __entry->i3_startblock = i3->rc_startblock; __entry->i3_blockcount = i3->rc_blockcount; __entry->i3_refcount = i3->rc_refcount; ), TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u -- " "dom %s gbno 0x%x fsbcount 0x%x refcount %u -- " "dom %s gbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i1_startblock, __entry->i1_blockcount, __entry->i1_refcount, __print_symbolic(__entry->i2_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i2_startblock, __entry->i2_blockcount, __entry->i2_refcount, __print_symbolic(__entry->i3_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i3_startblock, __entry->i3_blockcount, __entry->i3_refcount) ); #define DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(name) \ DEFINE_EVENT(xfs_refcount_triple_extent_class, name, \ TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, \ struct xfs_refcount_irec *i2, struct xfs_refcount_irec *i3), \ TP_ARGS(cur, i1, i2, i3)) /* refcount btree tracepoints */ DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_get); DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_update); DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_insert); DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_delete); DEFINE_BTREE_ERROR_EVENT(xfs_refcount_insert_error); DEFINE_BTREE_ERROR_EVENT(xfs_refcount_delete_error); DEFINE_BTREE_ERROR_EVENT(xfs_refcount_update_error); /* refcount adjustment tracepoints */ DEFINE_REFCOUNT_EVENT(xfs_refcount_increase); DEFINE_REFCOUNT_EVENT(xfs_refcount_decrease); DEFINE_REFCOUNT_EVENT(xfs_refcount_cow_increase); DEFINE_REFCOUNT_EVENT(xfs_refcount_cow_decrease); 
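/*
 * Illustrative sketch, not code from this file: the four adjustment
 * events above take a btree cursor plus a group-relative block number
 * and length, so a hypothetical caller in the refcount adjustment code
 * would emit one roughly like this (the gbno/aglen names here are
 * assumptions, not identifiers from this header):
 *
 *	trace_xfs_refcount_increase(cur, gbno, aglen);
 */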
DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(xfs_refcount_merge_center_extents); DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_modify_extent); DEFINE_REFCOUNT_EXTENT_AT_EVENT(xfs_refcount_split_extent); DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_left_extent); DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_right_extent); DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_left_extent); DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_right_extent); DEFINE_BTREE_ERROR_EVENT(xfs_refcount_adjust_error); DEFINE_BTREE_ERROR_EVENT(xfs_refcount_adjust_cow_error); DEFINE_BTREE_ERROR_EVENT(xfs_refcount_merge_center_extents_error); DEFINE_BTREE_ERROR_EVENT(xfs_refcount_modify_extent_error); DEFINE_BTREE_ERROR_EVENT(xfs_refcount_split_extent_error); DEFINE_BTREE_ERROR_EVENT(xfs_refcount_merge_left_extent_error); DEFINE_BTREE_ERROR_EVENT(xfs_refcount_merge_right_extent_error); DEFINE_BTREE_ERROR_EVENT(xfs_refcount_find_left_extent_error); DEFINE_BTREE_ERROR_EVENT(xfs_refcount_find_right_extent_error); /* reflink helpers */ DEFINE_REFCOUNT_EVENT(xfs_refcount_find_shared); DEFINE_REFCOUNT_EVENT(xfs_refcount_find_shared_result); DEFINE_BTREE_ERROR_EVENT(xfs_refcount_find_shared_error); TRACE_DEFINE_ENUM(XFS_REFCOUNT_INCREASE); TRACE_DEFINE_ENUM(XFS_REFCOUNT_DECREASE); TRACE_DEFINE_ENUM(XFS_REFCOUNT_ALLOC_COW); TRACE_DEFINE_ENUM(XFS_REFCOUNT_FREE_COW); DECLARE_EVENT_CLASS(xfs_refcount_deferred_class, TP_PROTO(struct xfs_mount *mp, struct xfs_refcount_intent *refc), TP_ARGS(mp, refc), TP_STRUCT__entry( __field(dev_t, dev) __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(int, op) __field(xfs_agblock_t, gbno) __field(xfs_extlen_t, len) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->type = refc->ri_group->xg_type; __entry->agno = refc->ri_group->xg_gno; __entry->op = refc->ri_type; __entry->gbno = xfs_fsb_to_gbno(mp, refc->ri_startblock, refc->ri_group->xg_type); __entry->len = refc->ri_blockcount; ), TP_printk("dev %d:%d op %s %sno 0x%x gbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->op, XFS_REFCOUNT_INTENT_STRINGS), __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __entry->gbno, __entry->len) ); #define DEFINE_REFCOUNT_DEFERRED_EVENT(name) \ DEFINE_EVENT(xfs_refcount_deferred_class, name, \ TP_PROTO(struct xfs_mount *mp, struct xfs_refcount_intent *refc), \ TP_ARGS(mp, refc)) DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_defer); DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred); DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_finish_one_leftover); /* simple inode-based error/%ip tracepoint class */ DECLARE_EVENT_CLASS(xfs_inode_error_class, TP_PROTO(struct xfs_inode *ip, int error, unsigned long caller_ip), TP_ARGS(ip, error, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(int, error) __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->error = error; __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d ino 0x%llx error %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->error, (char *)__entry->caller_ip) ); #define DEFINE_INODE_ERROR_EVENT(name) \ DEFINE_EVENT(xfs_inode_error_class, name, \ TP_PROTO(struct xfs_inode *ip, int error, \ unsigned long caller_ip), \ TP_ARGS(ip, error, caller_ip)) /* reflink tracepoint classes */ /* two-file io tracepoint class */ DECLARE_EVENT_CLASS(xfs_double_io_class, TP_PROTO(struct xfs_inode *src, xfs_off_t soffset, 
xfs_off_t len, struct xfs_inode *dest, xfs_off_t doffset), TP_ARGS(src, soffset, len, dest, doffset), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, src_ino) __field(loff_t, src_isize) __field(loff_t, src_disize) __field(loff_t, src_offset) __field(long long, len) __field(xfs_ino_t, dest_ino) __field(loff_t, dest_isize) __field(loff_t, dest_disize) __field(loff_t, dest_offset) ), TP_fast_assign( __entry->dev = VFS_I(src)->i_sb->s_dev; __entry->src_ino = src->i_ino; __entry->src_isize = VFS_I(src)->i_size; __entry->src_disize = src->i_disk_size; __entry->src_offset = soffset; __entry->len = len; __entry->dest_ino = dest->i_ino; __entry->dest_isize = VFS_I(dest)->i_size; __entry->dest_disize = dest->i_disk_size; __entry->dest_offset = doffset; ), TP_printk("dev %d:%d bytecount 0x%llx " "ino 0x%llx isize 0x%llx disize 0x%llx pos 0x%llx -> " "ino 0x%llx isize 0x%llx disize 0x%llx pos 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->len, __entry->src_ino, __entry->src_isize, __entry->src_disize, __entry->src_offset, __entry->dest_ino, __entry->dest_isize, __entry->dest_disize, __entry->dest_offset) ) #define DEFINE_DOUBLE_IO_EVENT(name) \ DEFINE_EVENT(xfs_double_io_class, name, \ TP_PROTO(struct xfs_inode *src, xfs_off_t soffset, xfs_off_t len, \ struct xfs_inode *dest, xfs_off_t doffset), \ TP_ARGS(src, soffset, len, dest, doffset)) /* inode/irec events */ DECLARE_EVENT_CLASS(xfs_inode_irec_class, TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), TP_ARGS(ip, irec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(xfs_fileoff_t, lblk) __field(xfs_extlen_t, len) __field(xfs_fsblock_t, pblk) __field(int, state) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->lblk = irec->br_startoff; __entry->len = irec->br_blockcount; __entry->pblk = irec->br_startblock; __entry->state = irec->br_state; ), TP_printk("dev %d:%d ino 0x%llx fileoff 0x%llx fsbcount 0x%x startblock 0x%llx st %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->lblk, __entry->len, __entry->pblk, __entry->state) ); #define DEFINE_INODE_IREC_EVENT(name) \ DEFINE_EVENT(xfs_inode_irec_class, name, \ TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), \ TP_ARGS(ip, irec)) /* inode iomap invalidation events */ DECLARE_EVENT_CLASS(xfs_wb_invalid_class, TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap, unsigned int wpcseq, int whichfork), TP_ARGS(ip, iomap, wpcseq, whichfork), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(u64, addr) __field(loff_t, pos) __field(u64, len) __field(u16, type) __field(u16, flags) __field(u32, wpcseq) __field(u32, forkseq) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->addr = iomap->addr; __entry->pos = iomap->offset; __entry->len = iomap->length; __entry->type = iomap->type; __entry->flags = iomap->flags; __entry->wpcseq = wpcseq; __entry->forkseq = READ_ONCE(xfs_ifork_ptr(ip, whichfork)->if_seq); ), TP_printk("dev %d:%d ino 0x%llx pos 0x%llx addr 0x%llx bytecount 0x%llx type 0x%x flags 0x%x wpcseq 0x%x forkseq 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pos, __entry->addr, __entry->len, __entry->type, __entry->flags, __entry->wpcseq, __entry->forkseq) ); #define DEFINE_WB_INVALID_EVENT(name) \ DEFINE_EVENT(xfs_wb_invalid_class, name, \ TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap, unsigned int wpcseq, int whichfork), \ TP_ARGS(ip, iomap, wpcseq, whichfork)) 
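/*
 * Illustrative sketch, not code from this file: the wb_invalid events
 * defined below are meant to fire when a cached writeback mapping has
 * gone stale, i.e. when the sequence number captured in the writeback
 * context no longer matches the fork's current if_seq.  A hypothetical
 * validity check could look roughly like this before revalidating:
 *
 *	if (READ_ONCE(xfs_ifork_ptr(ip, whichfork)->if_seq) != wpcseq)
 *		trace_xfs_wb_data_iomap_invalid(ip, iomap, wpcseq,
 *						whichfork);
 */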
DEFINE_WB_INVALID_EVENT(xfs_wb_cow_iomap_invalid); DEFINE_WB_INVALID_EVENT(xfs_wb_data_iomap_invalid); DECLARE_EVENT_CLASS(xfs_iomap_invalid_class, TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap), TP_ARGS(ip, iomap), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(u64, addr) __field(loff_t, pos) __field(u64, len) __field(u64, validity_cookie) __field(u64, inodeseq) __field(u16, type) __field(u16, flags) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->addr = iomap->addr; __entry->pos = iomap->offset; __entry->len = iomap->length; __entry->validity_cookie = iomap->validity_cookie; __entry->type = iomap->type; __entry->flags = iomap->flags; __entry->inodeseq = xfs_iomap_inode_sequence(ip, iomap->flags); ), TP_printk("dev %d:%d ino 0x%llx pos 0x%llx addr 0x%llx bytecount 0x%llx type 0x%x flags 0x%x validity_cookie 0x%llx inodeseq 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pos, __entry->addr, __entry->len, __entry->type, __entry->flags, __entry->validity_cookie, __entry->inodeseq) ); #define DEFINE_IOMAP_INVALID_EVENT(name) \ DEFINE_EVENT(xfs_iomap_invalid_class, name, \ TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap), \ TP_ARGS(ip, iomap)) DEFINE_IOMAP_INVALID_EVENT(xfs_iomap_invalid); /* refcount/reflink tracepoint definitions */ /* reflink tracepoints */ DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag); DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag); DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size); TRACE_EVENT(xfs_reflink_remap_blocks, TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset, xfs_filblks_t len, struct xfs_inode *dest, xfs_fileoff_t doffset), TP_ARGS(src, soffset, len, dest, doffset), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, src_ino) __field(xfs_fileoff_t, src_lblk) __field(xfs_filblks_t, len) __field(xfs_ino_t, dest_ino) __field(xfs_fileoff_t, dest_lblk) ), TP_fast_assign( __entry->dev = VFS_I(src)->i_sb->s_dev; __entry->src_ino = src->i_ino; __entry->src_lblk = soffset; __entry->len = len; __entry->dest_ino = dest->i_ino; __entry->dest_lblk = doffset; ), TP_printk("dev %d:%d fsbcount 0x%llx " "ino 0x%llx fileoff 0x%llx -> ino 0x%llx fileoff 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->len, __entry->src_ino, __entry->src_lblk, __entry->dest_ino, __entry->dest_lblk) ); DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range); DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error); DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_src); DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_dest); /* unshare tracepoints */ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare); DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error); /* copy on write */ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found); DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_skip); DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error); DEFINE_INODE_IREC_EVENT(xfs_reflink_cancel_cow); /* rmap 
swapext tracepoints */ DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap); DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap_piece); DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error); /* fsmap traces */ TRACE_EVENT(xfs_fsmap_mapping, TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, const struct xfs_fsmap_irec *frec), TP_ARGS(mp, keydev, agno, frec), TP_STRUCT__entry( __field(dev_t, dev) __field(dev_t, keydev) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_daddr_t, start_daddr) __field(xfs_daddr_t, len_daddr) __field(uint64_t, owner) __field(uint64_t, offset) __field(unsigned int, flags) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->keydev = new_decode_dev(keydev); __entry->agno = agno; __entry->agbno = frec->rec_key; __entry->start_daddr = frec->start_daddr; __entry->len_daddr = frec->len_daddr; __entry->owner = frec->owner; __entry->offset = frec->offset; __entry->flags = frec->rm_flags; ), TP_printk("dev %d:%d keydev %d:%d agno 0x%x gbno 0x%x start_daddr 0x%llx len_daddr 0x%llx owner 0x%llx fileoff 0x%llx flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->keydev), MINOR(__entry->keydev), __entry->agno, __entry->agbno, __entry->start_daddr, __entry->len_daddr, __entry->owner, __entry->offset, __entry->flags) ); DECLARE_EVENT_CLASS(xfs_fsmap_group_key_class, TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, const struct xfs_rmap_irec *rmap), TP_ARGS(mp, keydev, agno, rmap), TP_STRUCT__entry( __field(dev_t, dev) __field(dev_t, keydev) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(uint64_t, owner) __field(uint64_t, offset) __field(unsigned int, flags) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->keydev = new_decode_dev(keydev); __entry->agno = agno; __entry->agbno = rmap->rm_startblock; __entry->owner = rmap->rm_owner; __entry->offset = rmap->rm_offset; __entry->flags = rmap->rm_flags; ), TP_printk("dev %d:%d keydev %d:%d agno 0x%x startblock 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->keydev), MINOR(__entry->keydev), __entry->agno, __entry->agbno, __entry->owner, __entry->offset, __entry->flags) ) #define DEFINE_FSMAP_GROUP_KEY_EVENT(name) \ DEFINE_EVENT(xfs_fsmap_group_key_class, name, \ TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, \ const struct xfs_rmap_irec *rmap), \ TP_ARGS(mp, keydev, agno, rmap)) DEFINE_FSMAP_GROUP_KEY_EVENT(xfs_fsmap_low_group_key); DEFINE_FSMAP_GROUP_KEY_EVENT(xfs_fsmap_high_group_key); DECLARE_EVENT_CLASS(xfs_fsmap_linear_key_class, TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_fsblock_t bno), TP_ARGS(mp, keydev, bno), TP_STRUCT__entry( __field(dev_t, dev) __field(dev_t, keydev) __field(xfs_fsblock_t, bno) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->keydev = new_decode_dev(keydev); __entry->bno = bno; ), TP_printk("dev %d:%d keydev %d:%d bno 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->keydev), MINOR(__entry->keydev), __entry->bno) ) #define DEFINE_FSMAP_LINEAR_KEY_EVENT(name) \ DEFINE_EVENT(xfs_fsmap_linear_key_class, name, \ TP_PROTO(struct xfs_mount *mp, u32 keydev, uint64_t bno), \ TP_ARGS(mp, keydev, bno)) DEFINE_FSMAP_LINEAR_KEY_EVENT(xfs_fsmap_low_linear_key); DEFINE_FSMAP_LINEAR_KEY_EVENT(xfs_fsmap_high_linear_key); DECLARE_EVENT_CLASS(xfs_getfsmap_class, TP_PROTO(struct xfs_mount *mp, struct xfs_fsmap *fsmap), TP_ARGS(mp, fsmap), TP_STRUCT__entry( __field(dev_t, dev) __field(dev_t, keydev) 
__field(xfs_daddr_t, block) __field(xfs_daddr_t, len) __field(uint64_t, owner) __field(uint64_t, offset) __field(uint64_t, flags) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->keydev = new_decode_dev(fsmap->fmr_device); __entry->block = fsmap->fmr_physical; __entry->len = fsmap->fmr_length; __entry->owner = fsmap->fmr_owner; __entry->offset = fsmap->fmr_offset; __entry->flags = fsmap->fmr_flags; ), TP_printk("dev %d:%d keydev %d:%d daddr 0x%llx bbcount 0x%llx owner 0x%llx fileoff_daddr 0x%llx flags 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->keydev), MINOR(__entry->keydev), __entry->block, __entry->len, __entry->owner, __entry->offset, __entry->flags) ) #define DEFINE_GETFSMAP_EVENT(name) \ DEFINE_EVENT(xfs_getfsmap_class, name, \ TP_PROTO(struct xfs_mount *mp, struct xfs_fsmap *fsmap), \ TP_ARGS(mp, fsmap)) DEFINE_GETFSMAP_EVENT(xfs_getfsmap_low_key); DEFINE_GETFSMAP_EVENT(xfs_getfsmap_high_key); DEFINE_GETFSMAP_EVENT(xfs_getfsmap_mapping); DECLARE_EVENT_CLASS(xfs_trans_resv_class, TP_PROTO(struct xfs_mount *mp, unsigned int type, struct xfs_trans_res *res), TP_ARGS(mp, type, res), TP_STRUCT__entry( __field(dev_t, dev) __field(int, type) __field(uint, logres) __field(int, logcount) __field(int, logflags) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->type = type; __entry->logres = res->tr_logres; __entry->logcount = res->tr_logcount; __entry->logflags = res->tr_logflags; ), TP_printk("dev %d:%d type %d logres %u logcount %d flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->type, __entry->logres, __entry->logcount, __entry->logflags) ) #define DEFINE_TRANS_RESV_EVENT(name) \ DEFINE_EVENT(xfs_trans_resv_class, name, \ TP_PROTO(struct xfs_mount *mp, unsigned int type, \ struct xfs_trans_res *res), \ TP_ARGS(mp, type, res)) DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc); DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc_minlogsize); TRACE_EVENT(xfs_log_get_max_trans_res, TP_PROTO(struct xfs_mount *mp, const struct xfs_trans_res *res), TP_ARGS(mp, res), TP_STRUCT__entry( __field(dev_t, dev) __field(uint, logres) __field(int, logcount) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->logres = res->tr_logres; __entry->logcount = res->tr_logcount; ), TP_printk("dev %d:%d logres %u logcount %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->logres, __entry->logcount) ); DECLARE_EVENT_CLASS(xfs_trans_class, TP_PROTO(struct xfs_trans *tp, unsigned long caller_ip), TP_ARGS(tp, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(uint32_t, tid) __field(uint32_t, flags) __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = tp->t_mountp->m_super->s_dev; __entry->tid = 0; if (tp->t_ticket) __entry->tid = tp->t_ticket->t_tid; __entry->flags = tp->t_flags; __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d trans %x flags 0x%x caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, __entry->flags, (char *)__entry->caller_ip) ) #define DEFINE_TRANS_EVENT(name) \ DEFINE_EVENT(xfs_trans_class, name, \ TP_PROTO(struct xfs_trans *tp, unsigned long caller_ip), \ TP_ARGS(tp, caller_ip)) DEFINE_TRANS_EVENT(xfs_trans_alloc); DEFINE_TRANS_EVENT(xfs_trans_cancel); DEFINE_TRANS_EVENT(xfs_trans_commit); DEFINE_TRANS_EVENT(xfs_trans_dup); DEFINE_TRANS_EVENT(xfs_trans_free); DEFINE_TRANS_EVENT(xfs_trans_roll); DEFINE_TRANS_EVENT(xfs_trans_add_item); DEFINE_TRANS_EVENT(xfs_trans_commit_items); DEFINE_TRANS_EVENT(xfs_trans_free_items); TRACE_EVENT(xfs_iunlink_update_bucket, TP_PROTO(const struct xfs_perag 
*pag, unsigned int bucket, xfs_agino_t old_ptr, xfs_agino_t new_ptr), TP_ARGS(pag, bucket, old_ptr, new_ptr), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(unsigned int, bucket) __field(xfs_agino_t, old_ptr) __field(xfs_agino_t, new_ptr) ), TP_fast_assign( __entry->dev = pag_mount(pag)->m_super->s_dev; __entry->agno = pag_agno(pag); __entry->bucket = bucket; __entry->old_ptr = old_ptr; __entry->new_ptr = new_ptr; ), TP_printk("dev %d:%d agno 0x%x bucket %u old 0x%x new 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->bucket, __entry->old_ptr, __entry->new_ptr) ); TRACE_EVENT(xfs_iunlink_update_dinode, TP_PROTO(const struct xfs_iunlink_item *iup, xfs_agino_t old_ptr), TP_ARGS(iup, old_ptr), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agino_t, agino) __field(xfs_agino_t, old_ptr) __field(xfs_agino_t, new_ptr) ), TP_fast_assign( __entry->dev = pag_mount(iup->pag)->m_super->s_dev; __entry->agno = pag_agno(iup->pag); __entry->agino = XFS_INO_TO_AGINO(iup->ip->i_mount, iup->ip->i_ino); __entry->old_ptr = old_ptr; __entry->new_ptr = iup->next_agino; ), TP_printk("dev %d:%d agno 0x%x agino 0x%x old 0x%x new 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agino, __entry->old_ptr, __entry->new_ptr) ); TRACE_EVENT(xfs_iunlink_reload_next, TP_PROTO(struct xfs_inode *ip), TP_ARGS(ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agino_t, agino) __field(xfs_agino_t, prev_agino) __field(xfs_agino_t, next_agino) ), TP_fast_assign( __entry->dev = ip->i_mount->m_super->s_dev; __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); __entry->prev_agino = ip->i_prev_unlinked; __entry->next_agino = ip->i_next_unlinked; ), TP_printk("dev %d:%d agno 0x%x agino 0x%x prev_unlinked 0x%x next_unlinked 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agino, __entry->prev_agino, __entry->next_agino) ); TRACE_EVENT(xfs_inode_reload_unlinked_bucket, TP_PROTO(struct xfs_inode *ip), TP_ARGS(ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agino_t, agino) ), TP_fast_assign( __entry->dev = ip->i_mount->m_super->s_dev; __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); ), TP_printk("dev %d:%d agno 0x%x agino 0x%x bucket %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agino, __entry->agino % XFS_AGI_UNLINKED_BUCKETS) ); DECLARE_EVENT_CLASS(xfs_ag_inode_class, TP_PROTO(struct xfs_inode *ip), TP_ARGS(ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agino_t, agino) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); ), TP_printk("dev %d:%d agno 0x%x agino 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agino) ) #define DEFINE_AGINODE_EVENT(name) \ DEFINE_EVENT(xfs_ag_inode_class, name, \ TP_PROTO(struct xfs_inode *ip), \ TP_ARGS(ip)) DEFINE_AGINODE_EVENT(xfs_iunlink); DEFINE_AGINODE_EVENT(xfs_iunlink_remove); DECLARE_EVENT_CLASS(xfs_fs_corrupt_class, TP_PROTO(struct xfs_mount *mp, unsigned int flags), TP_ARGS(mp, flags), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, flags) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->flags = flags; ), TP_printk("dev %d:%d flags 0x%x", 
MAJOR(__entry->dev), MINOR(__entry->dev), __entry->flags) ); #define DEFINE_FS_CORRUPT_EVENT(name) \ DEFINE_EVENT(xfs_fs_corrupt_class, name, \ TP_PROTO(struct xfs_mount *mp, unsigned int flags), \ TP_ARGS(mp, flags)) DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_sick); DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_corrupt); DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_healthy); DEFINE_FS_CORRUPT_EVENT(xfs_fs_unfixed_corruption); DECLARE_EVENT_CLASS(xfs_group_corrupt_class, TP_PROTO(const struct xfs_group *xg, unsigned int flags), TP_ARGS(xg, flags), TP_STRUCT__entry( __field(dev_t, dev) __field(enum xfs_group_type, type) __field(uint32_t, index) __field(unsigned int, flags) ), TP_fast_assign( __entry->dev = xg->xg_mount->m_super->s_dev; __entry->type = xg->xg_type; __entry->index = xg->xg_gno; __entry->flags = flags; ), TP_printk("dev %d:%d %sno 0x%x flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->index, __entry->flags) ); #define DEFINE_GROUP_CORRUPT_EVENT(name) \ DEFINE_EVENT(xfs_group_corrupt_class, name, \ TP_PROTO(const struct xfs_group *xg, unsigned int flags), \ TP_ARGS(xg, flags)) DEFINE_GROUP_CORRUPT_EVENT(xfs_group_mark_sick); DEFINE_GROUP_CORRUPT_EVENT(xfs_group_mark_corrupt); DEFINE_GROUP_CORRUPT_EVENT(xfs_group_mark_healthy); DEFINE_GROUP_CORRUPT_EVENT(xfs_group_unfixed_corruption); DECLARE_EVENT_CLASS(xfs_inode_corrupt_class, TP_PROTO(struct xfs_inode *ip, unsigned int flags), TP_ARGS(ip, flags), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(unsigned int, flags) ), TP_fast_assign( __entry->dev = ip->i_mount->m_super->s_dev; __entry->ino = ip->i_ino; __entry->flags = flags; ), TP_printk("dev %d:%d ino 0x%llx flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->flags) ); #define DEFINE_INODE_CORRUPT_EVENT(name) \ DEFINE_EVENT(xfs_inode_corrupt_class, name, \ TP_PROTO(struct xfs_inode *ip, unsigned int flags), \ TP_ARGS(ip, flags)) DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_sick); DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_corrupt); DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy); DEFINE_INODE_CORRUPT_EVENT(xfs_inode_unfixed_corruption); TRACE_EVENT(xfs_iwalk_ag_rec, TP_PROTO(const struct xfs_perag *pag, \ struct xfs_inobt_rec_incore *irec), TP_ARGS(pag, irec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agino_t, startino) __field(uint64_t, freemask) ), TP_fast_assign( __entry->dev = pag_mount(pag)->m_super->s_dev; __entry->agno = pag_agno(pag); __entry->startino = irec->ir_startino; __entry->freemask = irec->ir_free; ), TP_printk("dev %d:%d agno 0x%x startino 0x%x freemask 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->startino, __entry->freemask) ) TRACE_EVENT(xfs_pwork_init, TP_PROTO(struct xfs_mount *mp, unsigned int nr_threads, pid_t pid), TP_ARGS(mp, nr_threads, pid), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, nr_threads) __field(pid_t, pid) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->nr_threads = nr_threads; __entry->pid = pid; ), TP_printk("dev %d:%d nr_threads %u pid %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_threads, __entry->pid) ) TRACE_EVENT(xfs_check_new_dalign, TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino), TP_ARGS(mp, new_dalign, calc_rootino), TP_STRUCT__entry( __field(dev_t, dev) __field(int, new_dalign) __field(xfs_ino_t, sb_rootino) __field(xfs_ino_t, calc_rootino) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; 
__entry->new_dalign = new_dalign; __entry->sb_rootino = mp->m_sb.sb_rootino; __entry->calc_rootino = calc_rootino; ), TP_printk("dev %d:%d new_dalign %d sb_rootino 0x%llx calc_rootino 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->new_dalign, __entry->sb_rootino, __entry->calc_rootino) ) TRACE_EVENT(xfs_btree_commit_afakeroot, TP_PROTO(struct xfs_btree_cur *cur), TP_ARGS(cur), TP_STRUCT__entry( __field(dev_t, dev) __string(name, cur->bc_ops->name) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(unsigned int, levels) __field(unsigned int, blocks) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; __assign_str(name); __entry->agno = cur->bc_group->xg_gno; __entry->agbno = cur->bc_ag.afake->af_root; __entry->levels = cur->bc_ag.afake->af_levels; __entry->blocks = cur->bc_ag.afake->af_blocks; ), TP_printk("dev %d:%d %sbt agno 0x%x levels %u blocks %u root %u", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(name), __entry->agno, __entry->levels, __entry->blocks, __entry->agbno) ) TRACE_EVENT(xfs_btree_commit_ifakeroot, TP_PROTO(struct xfs_btree_cur *cur), TP_ARGS(cur), TP_STRUCT__entry( __field(dev_t, dev) __string(name, cur->bc_ops->name) __field(xfs_agnumber_t, agno) __field(xfs_agino_t, agino) __field(unsigned int, levels) __field(unsigned int, blocks) __field(int, whichfork) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; __assign_str(name); __entry->agno = XFS_INO_TO_AGNO(cur->bc_mp, cur->bc_ino.ip->i_ino); __entry->agino = XFS_INO_TO_AGINO(cur->bc_mp, cur->bc_ino.ip->i_ino); __entry->levels = cur->bc_ino.ifake->if_levels; __entry->blocks = cur->bc_ino.ifake->if_blocks; __entry->whichfork = cur->bc_ino.whichfork; ), TP_printk("dev %d:%d %sbt agno 0x%x agino 0x%x whichfork %s levels %u blocks %u", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(name), __entry->agno, __entry->agino, __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __entry->levels, __entry->blocks) ) TRACE_EVENT(xfs_btree_bload_level_geometry, TP_PROTO(struct xfs_btree_cur *cur, unsigned int level, uint64_t nr_this_level, unsigned int nr_per_block, unsigned int desired_npb, uint64_t blocks, uint64_t blocks_with_extra), TP_ARGS(cur, level, nr_this_level, nr_per_block, desired_npb, blocks, blocks_with_extra), TP_STRUCT__entry( __field(dev_t, dev) __string(name, cur->bc_ops->name) __field(unsigned int, level) __field(unsigned int, nlevels) __field(uint64_t, nr_this_level) __field(unsigned int, nr_per_block) __field(unsigned int, desired_npb) __field(unsigned long long, blocks) __field(unsigned long long, blocks_with_extra) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; __assign_str(name); __entry->level = level; __entry->nlevels = cur->bc_nlevels; __entry->nr_this_level = nr_this_level; __entry->nr_per_block = nr_per_block; __entry->desired_npb = desired_npb; __entry->blocks = blocks; __entry->blocks_with_extra = blocks_with_extra; ), TP_printk("dev %d:%d %sbt level %u/%u nr_this_level %llu nr_per_block %u desired_npb %u blocks %llu blocks_with_extra %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(name), __entry->level, __entry->nlevels, __entry->nr_this_level, __entry->nr_per_block, __entry->desired_npb, __entry->blocks, __entry->blocks_with_extra) ) TRACE_EVENT(xfs_btree_bload_block, TP_PROTO(struct xfs_btree_cur *cur, unsigned int level, uint64_t block_idx, uint64_t nr_blocks, union xfs_btree_ptr *ptr, unsigned int nr_records), TP_ARGS(cur, level, block_idx, nr_blocks, ptr, nr_records), TP_STRUCT__entry( __field(dev_t, 
dev) __string(name, cur->bc_ops->name) __field(unsigned int, level) __field(unsigned long long, block_idx) __field(unsigned long long, nr_blocks) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(unsigned int, nr_records) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; __assign_str(name); __entry->level = level; __entry->block_idx = block_idx; __entry->nr_blocks = nr_blocks; if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { xfs_fsblock_t fsb = be64_to_cpu(ptr->l); __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsb); __entry->agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsb); } else { __entry->agno = cur->bc_group->xg_gno; __entry->agbno = be32_to_cpu(ptr->s); } __entry->nr_records = nr_records; ), TP_printk("dev %d:%d %sbt level %u block %llu/%llu agno 0x%x agbno 0x%x recs %u", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(name), __entry->level, __entry->block_idx, __entry->nr_blocks, __entry->agno, __entry->agbno, __entry->nr_records) ) DECLARE_EVENT_CLASS(xfs_timestamp_range_class, TP_PROTO(struct xfs_mount *mp, time64_t min, time64_t max), TP_ARGS(mp, min, max), TP_STRUCT__entry( __field(dev_t, dev) __field(long long, min) __field(long long, max) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->min = min; __entry->max = max; ), TP_printk("dev %d:%d min %lld max %lld", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->min, __entry->max) ) #define DEFINE_TIMESTAMP_RANGE_EVENT(name) \ DEFINE_EVENT(xfs_timestamp_range_class, name, \ TP_PROTO(struct xfs_mount *mp, long long min, long long max), \ TP_ARGS(mp, min, max)) DEFINE_TIMESTAMP_RANGE_EVENT(xfs_inode_timestamp_range); DEFINE_TIMESTAMP_RANGE_EVENT(xfs_quota_expiry_range); DECLARE_EVENT_CLASS(xfs_icwalk_class, TP_PROTO(struct xfs_mount *mp, struct xfs_icwalk *icw, unsigned long caller_ip), TP_ARGS(mp, icw, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(__u32, flags) __field(uint32_t, uid) __field(uint32_t, gid) __field(prid_t, prid) __field(__u64, min_file_size) __field(long, scan_limit) __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->flags = icw ? icw->icw_flags : 0; __entry->uid = icw ? from_kuid(mp->m_super->s_user_ns, icw->icw_uid) : 0; __entry->gid = icw ? from_kgid(mp->m_super->s_user_ns, icw->icw_gid) : 0; __entry->prid = icw ? icw->icw_prid : 0; __entry->min_file_size = icw ? icw->icw_min_file_size : 0; __entry->scan_limit = icw ? 
icw->icw_scan_limit : 0; __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d flags 0x%x uid %u gid %u prid %u minsize %llu scan_limit %ld caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->flags, __entry->uid, __entry->gid, __entry->prid, __entry->min_file_size, __entry->scan_limit, (char *)__entry->caller_ip) ); #define DEFINE_ICWALK_EVENT(name) \ DEFINE_EVENT(xfs_icwalk_class, name, \ TP_PROTO(struct xfs_mount *mp, struct xfs_icwalk *icw, \ unsigned long caller_ip), \ TP_ARGS(mp, icw, caller_ip)) DEFINE_ICWALK_EVENT(xfs_ioc_free_eofblocks); DEFINE_ICWALK_EVENT(xfs_blockgc_free_space); TRACE_DEFINE_ENUM(XLOG_STATE_ACTIVE); TRACE_DEFINE_ENUM(XLOG_STATE_WANT_SYNC); TRACE_DEFINE_ENUM(XLOG_STATE_SYNCING); TRACE_DEFINE_ENUM(XLOG_STATE_DONE_SYNC); TRACE_DEFINE_ENUM(XLOG_STATE_CALLBACK); TRACE_DEFINE_ENUM(XLOG_STATE_DIRTY); DECLARE_EVENT_CLASS(xlog_iclog_class, TP_PROTO(struct xlog_in_core *iclog, unsigned long caller_ip), TP_ARGS(iclog, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(uint32_t, state) __field(int32_t, refcount) __field(uint32_t, offset) __field(uint32_t, flags) __field(unsigned long long, lsn) __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = iclog->ic_log->l_mp->m_super->s_dev; __entry->state = iclog->ic_state; __entry->refcount = atomic_read(&iclog->ic_refcnt); __entry->offset = iclog->ic_offset; __entry->flags = iclog->ic_flags; __entry->lsn = be64_to_cpu(iclog->ic_header->h_lsn); __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d state %s refcnt %d offset %u lsn 0x%llx flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->state, XLOG_STATE_STRINGS), __entry->refcount, __entry->offset, __entry->lsn, __print_flags(__entry->flags, "|", XLOG_ICL_STRINGS), (char *)__entry->caller_ip) ); #define DEFINE_ICLOG_EVENT(name) \ DEFINE_EVENT(xlog_iclog_class, name, \ TP_PROTO(struct xlog_in_core *iclog, unsigned long caller_ip), \ TP_ARGS(iclog, caller_ip)) DEFINE_ICLOG_EVENT(xlog_iclog_activate); DEFINE_ICLOG_EVENT(xlog_iclog_clean); DEFINE_ICLOG_EVENT(xlog_iclog_callback); DEFINE_ICLOG_EVENT(xlog_iclog_callbacks_start); DEFINE_ICLOG_EVENT(xlog_iclog_callbacks_done); DEFINE_ICLOG_EVENT(xlog_iclog_force); DEFINE_ICLOG_EVENT(xlog_iclog_force_lsn); DEFINE_ICLOG_EVENT(xlog_iclog_get_space); DEFINE_ICLOG_EVENT(xlog_iclog_release); DEFINE_ICLOG_EVENT(xlog_iclog_switch); DEFINE_ICLOG_EVENT(xlog_iclog_sync); DEFINE_ICLOG_EVENT(xlog_iclog_syncing); DEFINE_ICLOG_EVENT(xlog_iclog_sync_done); DEFINE_ICLOG_EVENT(xlog_iclog_wait_on); DEFINE_ICLOG_EVENT(xlog_iclog_write); TRACE_DEFINE_ENUM(XFS_DAS_UNINIT); TRACE_DEFINE_ENUM(XFS_DAS_SF_ADD); TRACE_DEFINE_ENUM(XFS_DAS_SF_REMOVE); TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ADD); TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE); TRACE_DEFINE_ENUM(XFS_DAS_NODE_ADD); TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE); TRACE_DEFINE_ENUM(XFS_DAS_LEAF_SET_RMT); TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ALLOC_RMT); TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REPLACE); TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_OLD); TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_RMT); TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_ATTR); TRACE_DEFINE_ENUM(XFS_DAS_NODE_SET_RMT); TRACE_DEFINE_ENUM(XFS_DAS_NODE_ALLOC_RMT); TRACE_DEFINE_ENUM(XFS_DAS_NODE_REPLACE); TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_OLD); TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_RMT); TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_ATTR); TRACE_DEFINE_ENUM(XFS_DAS_DONE); DECLARE_EVENT_CLASS(xfs_das_state_class, TP_PROTO(int das, struct xfs_inode *ip), TP_ARGS(das, ip), TP_STRUCT__entry( __field(int, das) 
__field(xfs_ino_t, ino) ), TP_fast_assign( __entry->das = das; __entry->ino = ip->i_ino; ), TP_printk("state change %s ino 0x%llx", __print_symbolic(__entry->das, XFS_DAS_STRINGS), __entry->ino) ) #define DEFINE_DAS_STATE_EVENT(name) \ DEFINE_EVENT(xfs_das_state_class, name, \ TP_PROTO(int das, struct xfs_inode *ip), \ TP_ARGS(das, ip)) DEFINE_DAS_STATE_EVENT(xfs_attr_sf_addname_return); DEFINE_DAS_STATE_EVENT(xfs_attr_set_iter_return); DEFINE_DAS_STATE_EVENT(xfs_attr_leaf_addname_return); DEFINE_DAS_STATE_EVENT(xfs_attr_node_addname_return); DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_alloc); DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return); DEFINE_DAS_STATE_EVENT(xfs_attr_defer_add); TRACE_EVENT(xfs_force_shutdown, TP_PROTO(struct xfs_mount *mp, int ptag, int flags, const char *fname, int line_num), TP_ARGS(mp, ptag, flags, fname, line_num), TP_STRUCT__entry( __field(dev_t, dev) __field(int, ptag) __field(int, flags) __string(fname, fname) __field(int, line_num) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->ptag = ptag; __entry->flags = flags; __assign_str(fname); __entry->line_num = line_num; ), TP_printk("dev %d:%d tag %s flags %s file %s line_num %d", MAJOR(__entry->dev), MINOR(__entry->dev), __print_flags(__entry->ptag, "|", XFS_PTAG_STRINGS), __print_flags(__entry->flags, "|", XFS_SHUTDOWN_STRINGS), __get_str(fname), __entry->line_num) ); #ifdef CONFIG_XFS_DRAIN_INTENTS DECLARE_EVENT_CLASS(xfs_group_intents_class, TP_PROTO(const struct xfs_group *xg, void *caller_ip), TP_ARGS(xg, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(enum xfs_group_type, type) __field(uint32_t, index) __field(long, nr_intents) __field(void *, caller_ip) ), TP_fast_assign( __entry->dev = xg->xg_mount->m_super->s_dev; __entry->type = xg->xg_type; __entry->index = xg->xg_gno; __entry->nr_intents = atomic_read(&xg->xg_intents_drain.dr_count); __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d %sno 0x%x intents %ld caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->index, __entry->nr_intents, __entry->caller_ip) ); #define DEFINE_GROUP_INTENTS_EVENT(name) \ DEFINE_EVENT(xfs_group_intents_class, name, \ TP_PROTO(const struct xfs_group *xg, void *caller_ip), \ TP_ARGS(xg, caller_ip)) DEFINE_GROUP_INTENTS_EVENT(xfs_group_intent_hold); DEFINE_GROUP_INTENTS_EVENT(xfs_group_intent_rele); DEFINE_GROUP_INTENTS_EVENT(xfs_group_wait_intents); #endif /* CONFIG_XFS_DRAIN_INTENTS */ #ifdef CONFIG_XFS_MEMORY_BUFS TRACE_EVENT(xmbuf_create, TP_PROTO(struct xfs_buftarg *btp), TP_ARGS(btp), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned long, ino) ), TP_fast_assign( struct file *file = btp->bt_file; __entry->dev = btp->bt_mount->m_super->s_dev; __entry->ino = file_inode(file)->i_ino; ), TP_printk("dev %d:%d xmino 0x%lx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino) ); TRACE_EVENT(xmbuf_free, TP_PROTO(struct xfs_buftarg *btp), TP_ARGS(btp), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned long, ino) __field(unsigned long long, bytes) __field(loff_t, size) ), TP_fast_assign( struct file *file = btp->bt_file; struct inode *inode = file_inode(file); __entry->dev = btp->bt_mount->m_super->s_dev; __entry->size = i_size_read(inode); __entry->bytes = (inode->i_blocks << SECTOR_SHIFT) + inode->i_bytes; __entry->ino = inode->i_ino; ), TP_printk("dev %d:%d xmino 0x%lx mem_bytes 0x%llx isize 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->bytes, __entry->size) ); #endif /* CONFIG_XFS_MEMORY_BUFS 
*/ #ifdef CONFIG_XFS_BTREE_IN_MEM TRACE_EVENT(xfbtree_init, TP_PROTO(struct xfs_mount *mp, struct xfbtree *xfbt, const struct xfs_btree_ops *ops), TP_ARGS(mp, xfbt, ops), TP_STRUCT__entry( __field(const void *, btree_ops) __field(unsigned long, xfino) __field(unsigned int, leaf_mxr) __field(unsigned int, leaf_mnr) __field(unsigned int, node_mxr) __field(unsigned int, node_mnr) __field(unsigned long long, owner) ), TP_fast_assign( __entry->btree_ops = ops; __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino; __entry->leaf_mxr = xfbt->maxrecs[0]; __entry->node_mxr = xfbt->maxrecs[1]; __entry->leaf_mnr = xfbt->minrecs[0]; __entry->node_mnr = xfbt->minrecs[1]; __entry->owner = xfbt->owner; ), TP_printk("xfino 0x%lx btree_ops %pS owner 0x%llx leaf_mxr %u leaf_mnr %u node_mxr %u node_mnr %u", __entry->xfino, __entry->btree_ops, __entry->owner, __entry->leaf_mxr, __entry->leaf_mnr, __entry->node_mxr, __entry->node_mnr) ); DECLARE_EVENT_CLASS(xfbtree_buf_class, TP_PROTO(struct xfbtree *xfbt, struct xfs_buf *bp), TP_ARGS(xfbt, bp), TP_STRUCT__entry( __field(unsigned long, xfino) __field(xfs_daddr_t, bno) __field(int, nblks) __field(int, hold) __field(int, pincount) __field(unsigned int, lockval) __field(unsigned int, flags) ), TP_fast_assign( __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino; __entry->bno = xfs_buf_daddr(bp); __entry->nblks = bp->b_length; __entry->hold = bp->b_lockref.count; __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; __entry->flags = bp->b_flags; ), TP_printk("xfino 0x%lx daddr 0x%llx bbcount 0x%x hold %d pincount %d lock %d flags %s", __entry->xfino, (unsigned long long)__entry->bno, __entry->nblks, __entry->hold, __entry->pincount, __entry->lockval, __print_flags(__entry->flags, "|", XFS_BUF_FLAGS)) ) #define DEFINE_XFBTREE_BUF_EVENT(name) \ DEFINE_EVENT(xfbtree_buf_class, name, \ TP_PROTO(struct xfbtree *xfbt, struct xfs_buf *bp), \ TP_ARGS(xfbt, bp)) DEFINE_XFBTREE_BUF_EVENT(xfbtree_create_root_buf); DEFINE_XFBTREE_BUF_EVENT(xfbtree_trans_commit_buf); DEFINE_XFBTREE_BUF_EVENT(xfbtree_trans_cancel_buf); DECLARE_EVENT_CLASS(xfbtree_freesp_class, TP_PROTO(struct xfbtree *xfbt, struct xfs_btree_cur *cur, xfs_fileoff_t fileoff), TP_ARGS(xfbt, cur, fileoff), TP_STRUCT__entry( __field(unsigned long, xfino) __string(btname, cur->bc_ops->name) __field(int, nlevels) __field(xfs_fileoff_t, fileoff) ), TP_fast_assign( __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino; __assign_str(btname); __entry->nlevels = cur->bc_nlevels; __entry->fileoff = fileoff; ), TP_printk("xfino 0x%lx %sbt nlevels %d fileoff 0x%llx", __entry->xfino, __get_str(btname), __entry->nlevels, (unsigned long long)__entry->fileoff) ) #define DEFINE_XFBTREE_FREESP_EVENT(name) \ DEFINE_EVENT(xfbtree_freesp_class, name, \ TP_PROTO(struct xfbtree *xfbt, struct xfs_btree_cur *cur, \ xfs_fileoff_t fileoff), \ TP_ARGS(xfbt, cur, fileoff)) DEFINE_XFBTREE_FREESP_EVENT(xfbtree_alloc_block); DEFINE_XFBTREE_FREESP_EVENT(xfbtree_free_block); #endif /* CONFIG_XFS_BTREE_IN_MEM */ /* exchmaps tracepoints */ #define XFS_EXCHMAPS_STRINGS \ { XFS_EXCHMAPS_ATTR_FORK, "ATTRFORK" }, \ { XFS_EXCHMAPS_SET_SIZES, "SETSIZES" }, \ { XFS_EXCHMAPS_INO1_WRITTEN, "INO1_WRITTEN" }, \ { XFS_EXCHMAPS_CLEAR_INO1_REFLINK, "CLEAR_INO1_REFLINK" }, \ { XFS_EXCHMAPS_CLEAR_INO2_REFLINK, "CLEAR_INO2_REFLINK" }, \ { __XFS_EXCHMAPS_INO2_SHORTFORM, "INO2_SF" } DEFINE_INODE_IREC_EVENT(xfs_exchmaps_mapping1_skip); DEFINE_INODE_IREC_EVENT(xfs_exchmaps_mapping1); 
DEFINE_INODE_IREC_EVENT(xfs_exchmaps_mapping2); DEFINE_ITRUNC_EVENT(xfs_exchmaps_update_inode_size); #define XFS_EXCHRANGE_INODES \ { 1, "file1" }, \ { 2, "file2" } DECLARE_EVENT_CLASS(xfs_exchrange_inode_class, TP_PROTO(struct xfs_inode *ip, int whichfile), TP_ARGS(ip, whichfile), TP_STRUCT__entry( __field(dev_t, dev) __field(int, whichfile) __field(xfs_ino_t, ino) __field(int, format) __field(xfs_extnum_t, nex) __field(int, broot_size) __field(int, fork_off) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->whichfile = whichfile; __entry->ino = ip->i_ino; __entry->format = ip->i_df.if_format; __entry->nex = ip->i_df.if_nextents; __entry->fork_off = xfs_inode_fork_boff(ip); ), TP_printk("dev %d:%d ino 0x%llx whichfile %s format %s num_extents %llu forkoff 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_symbolic(__entry->whichfile, XFS_EXCHRANGE_INODES), __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR), __entry->nex, __entry->fork_off) ) #define DEFINE_EXCHRANGE_INODE_EVENT(name) \ DEFINE_EVENT(xfs_exchrange_inode_class, name, \ TP_PROTO(struct xfs_inode *ip, int whichfile), \ TP_ARGS(ip, whichfile)) DEFINE_EXCHRANGE_INODE_EVENT(xfs_exchrange_before); DEFINE_EXCHRANGE_INODE_EVENT(xfs_exchrange_after); DEFINE_INODE_ERROR_EVENT(xfs_exchrange_error); #define XFS_EXCHANGE_RANGE_FLAGS_STRS \ { XFS_EXCHANGE_RANGE_TO_EOF, "TO_EOF" }, \ { XFS_EXCHANGE_RANGE_DSYNC , "DSYNC" }, \ { XFS_EXCHANGE_RANGE_DRY_RUN, "DRY_RUN" }, \ { XFS_EXCHANGE_RANGE_FILE1_WRITTEN, "F1_WRITTEN" }, \ { __XFS_EXCHANGE_RANGE_UPD_CMTIME1, "CMTIME1" }, \ { __XFS_EXCHANGE_RANGE_UPD_CMTIME2, "CMTIME2" }, \ { __XFS_EXCHANGE_RANGE_CHECK_FRESH2, "FRESH2" } /* file exchange-range tracepoint class */ DECLARE_EVENT_CLASS(xfs_exchrange_class, TP_PROTO(const struct xfs_exchrange *fxr, struct xfs_inode *ip1, struct xfs_inode *ip2), TP_ARGS(fxr, ip1, ip2), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ip1_ino) __field(loff_t, ip1_isize) __field(loff_t, ip1_disize) __field(xfs_ino_t, ip2_ino) __field(loff_t, ip2_isize) __field(loff_t, ip2_disize) __field(loff_t, file1_offset) __field(loff_t, file2_offset) __field(unsigned long long, length) __field(unsigned long long, flags) ), TP_fast_assign( __entry->dev = VFS_I(ip1)->i_sb->s_dev; __entry->ip1_ino = ip1->i_ino; __entry->ip1_isize = VFS_I(ip1)->i_size; __entry->ip1_disize = ip1->i_disk_size; __entry->ip2_ino = ip2->i_ino; __entry->ip2_isize = VFS_I(ip2)->i_size; __entry->ip2_disize = ip2->i_disk_size; __entry->file1_offset = fxr->file1_offset; __entry->file2_offset = fxr->file2_offset; __entry->length = fxr->length; __entry->flags = fxr->flags; ), TP_printk("dev %d:%d flags %s bytecount 0x%llx " "ino1 0x%llx isize 0x%llx disize 0x%llx pos 0x%llx -> " "ino2 0x%llx isize 0x%llx disize 0x%llx pos 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __print_flags_u64(__entry->flags, "|", XFS_EXCHANGE_RANGE_FLAGS_STRS), __entry->length, __entry->ip1_ino, __entry->ip1_isize, __entry->ip1_disize, __entry->file1_offset, __entry->ip2_ino, __entry->ip2_isize, __entry->ip2_disize, __entry->file2_offset) ) #define DEFINE_EXCHRANGE_EVENT(name) \ DEFINE_EVENT(xfs_exchrange_class, name, \ TP_PROTO(const struct xfs_exchrange *fxr, struct xfs_inode *ip1, \ struct xfs_inode *ip2), \ TP_ARGS(fxr, ip1, ip2)) DEFINE_EXCHRANGE_EVENT(xfs_exchrange_prep); DEFINE_EXCHRANGE_EVENT(xfs_exchrange_flush); DEFINE_EXCHRANGE_EVENT(xfs_exchrange_mappings); TRACE_EVENT(xfs_exchrange_freshness, TP_PROTO(const struct xfs_exchrange *fxr, struct xfs_inode *ip2), 
TP_ARGS(fxr, ip2), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ip2_ino) __field(long long, ip2_mtime) __field(long long, ip2_ctime) __field(int, ip2_mtime_nsec) __field(int, ip2_ctime_nsec) __field(xfs_ino_t, file2_ino) __field(long long, file2_mtime) __field(long long, file2_ctime) __field(int, file2_mtime_nsec) __field(int, file2_ctime_nsec) ), TP_fast_assign( struct timespec64 ts64; struct inode *inode2 = VFS_I(ip2); __entry->dev = inode2->i_sb->s_dev; __entry->ip2_ino = ip2->i_ino; ts64 = inode_get_ctime(inode2); __entry->ip2_ctime = ts64.tv_sec; __entry->ip2_ctime_nsec = ts64.tv_nsec; ts64 = inode_get_mtime(inode2); __entry->ip2_mtime = ts64.tv_sec; __entry->ip2_mtime_nsec = ts64.tv_nsec; __entry->file2_ino = fxr->file2_ino; __entry->file2_mtime = fxr->file2_mtime.tv_sec; __entry->file2_ctime = fxr->file2_ctime.tv_sec; __entry->file2_mtime_nsec = fxr->file2_mtime.tv_nsec; __entry->file2_ctime_nsec = fxr->file2_ctime.tv_nsec; ), TP_printk("dev %d:%d " "ino 0x%llx mtime %lld:%d ctime %lld:%d -> " "file 0x%llx mtime %lld:%d ctime %lld:%d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ip2_ino, __entry->ip2_mtime, __entry->ip2_mtime_nsec, __entry->ip2_ctime, __entry->ip2_ctime_nsec, __entry->file2_ino, __entry->file2_mtime, __entry->file2_mtime_nsec, __entry->file2_ctime, __entry->file2_ctime_nsec) ); TRACE_EVENT(xfs_exchmaps_overhead, TP_PROTO(struct xfs_mount *mp, unsigned long long bmbt_blocks, unsigned long long rmapbt_blocks), TP_ARGS(mp, bmbt_blocks, rmapbt_blocks), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned long long, bmbt_blocks) __field(unsigned long long, rmapbt_blocks) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->bmbt_blocks = bmbt_blocks; __entry->rmapbt_blocks = rmapbt_blocks; ), TP_printk("dev %d:%d bmbt_blocks 0x%llx rmapbt_blocks 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->bmbt_blocks, __entry->rmapbt_blocks) ); DECLARE_EVENT_CLASS(xfs_exchmaps_estimate_class, TP_PROTO(const struct xfs_exchmaps_req *req), TP_ARGS(req), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino1) __field(xfs_ino_t, ino2) __field(xfs_fileoff_t, startoff1) __field(xfs_fileoff_t, startoff2) __field(xfs_filblks_t, blockcount) __field(uint64_t, flags) __field(xfs_filblks_t, ip1_bcount) __field(xfs_filblks_t, ip2_bcount) __field(xfs_filblks_t, ip1_rtbcount) __field(xfs_filblks_t, ip2_rtbcount) __field(unsigned long long, resblks) __field(unsigned long long, nr_exchanges) ), TP_fast_assign( __entry->dev = req->ip1->i_mount->m_super->s_dev; __entry->ino1 = req->ip1->i_ino; __entry->ino2 = req->ip2->i_ino; __entry->startoff1 = req->startoff1; __entry->startoff2 = req->startoff2; __entry->blockcount = req->blockcount; __entry->flags = req->flags; __entry->ip1_bcount = req->ip1_bcount; __entry->ip2_bcount = req->ip2_bcount; __entry->ip1_rtbcount = req->ip1_rtbcount; __entry->ip2_rtbcount = req->ip2_rtbcount; __entry->resblks = req->resblks; __entry->nr_exchanges = req->nr_exchanges; ), TP_printk("dev %d:%d ino1 0x%llx fileoff1 0x%llx ino2 0x%llx fileoff2 0x%llx fsbcount 0x%llx flags (%s) bcount1 0x%llx rtbcount1 0x%llx bcount2 0x%llx rtbcount2 0x%llx resblks 0x%llx nr_exchanges %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino1, __entry->startoff1, __entry->ino2, __entry->startoff2, __entry->blockcount, __print_flags_u64(__entry->flags, "|", XFS_EXCHMAPS_STRINGS), __entry->ip1_bcount, __entry->ip1_rtbcount, __entry->ip2_bcount, __entry->ip2_rtbcount, __entry->resblks, __entry->nr_exchanges) ); #define 
DEFINE_EXCHMAPS_ESTIMATE_EVENT(name) \ DEFINE_EVENT(xfs_exchmaps_estimate_class, name, \ TP_PROTO(const struct xfs_exchmaps_req *req), \ TP_ARGS(req)) DEFINE_EXCHMAPS_ESTIMATE_EVENT(xfs_exchmaps_initial_estimate); DEFINE_EXCHMAPS_ESTIMATE_EVENT(xfs_exchmaps_final_estimate); DECLARE_EVENT_CLASS(xfs_exchmaps_intent_class, TP_PROTO(struct xfs_mount *mp, const struct xfs_exchmaps_intent *xmi), TP_ARGS(mp, xmi), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino1) __field(xfs_ino_t, ino2) __field(uint64_t, flags) __field(xfs_fileoff_t, startoff1) __field(xfs_fileoff_t, startoff2) __field(xfs_filblks_t, blockcount) __field(xfs_fsize_t, isize1) __field(xfs_fsize_t, isize2) __field(xfs_fsize_t, new_isize1) __field(xfs_fsize_t, new_isize2) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->ino1 = xmi->xmi_ip1->i_ino; __entry->ino2 = xmi->xmi_ip2->i_ino; __entry->flags = xmi->xmi_flags; __entry->startoff1 = xmi->xmi_startoff1; __entry->startoff2 = xmi->xmi_startoff2; __entry->blockcount = xmi->xmi_blockcount; __entry->isize1 = xmi->xmi_ip1->i_disk_size; __entry->isize2 = xmi->xmi_ip2->i_disk_size; __entry->new_isize1 = xmi->xmi_isize1; __entry->new_isize2 = xmi->xmi_isize2; ), TP_printk("dev %d:%d ino1 0x%llx fileoff1 0x%llx ino2 0x%llx fileoff2 0x%llx fsbcount 0x%llx flags (%s) isize1 0x%llx newisize1 0x%llx isize2 0x%llx newisize2 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino1, __entry->startoff1, __entry->ino2, __entry->startoff2, __entry->blockcount, __print_flags_u64(__entry->flags, "|", XFS_EXCHMAPS_STRINGS), __entry->isize1, __entry->new_isize1, __entry->isize2, __entry->new_isize2) ); #define DEFINE_EXCHMAPS_INTENT_EVENT(name) \ DEFINE_EVENT(xfs_exchmaps_intent_class, name, \ TP_PROTO(struct xfs_mount *mp, const struct xfs_exchmaps_intent *xmi), \ TP_ARGS(mp, xmi)) DEFINE_EXCHMAPS_INTENT_EVENT(xfs_exchmaps_defer); DEFINE_EXCHMAPS_INTENT_EVENT(xfs_exchmaps_recover); TRACE_EVENT(xfs_exchmaps_delta_nextents_step, TP_PROTO(struct xfs_mount *mp, const struct xfs_bmbt_irec *left, const struct xfs_bmbt_irec *curr, const struct xfs_bmbt_irec *new, const struct xfs_bmbt_irec *right, int delta, unsigned int state), TP_ARGS(mp, left, curr, new, right, delta, state), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_fileoff_t, loff) __field(xfs_fsblock_t, lstart) __field(xfs_filblks_t, lcount) __field(xfs_fileoff_t, coff) __field(xfs_fsblock_t, cstart) __field(xfs_filblks_t, ccount) __field(xfs_fileoff_t, noff) __field(xfs_fsblock_t, nstart) __field(xfs_filblks_t, ncount) __field(xfs_fileoff_t, roff) __field(xfs_fsblock_t, rstart) __field(xfs_filblks_t, rcount) __field(int, delta) __field(unsigned int, state) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->loff = left->br_startoff; __entry->lstart = left->br_startblock; __entry->lcount = left->br_blockcount; __entry->coff = curr->br_startoff; __entry->cstart = curr->br_startblock; __entry->ccount = curr->br_blockcount; __entry->noff = new->br_startoff; __entry->nstart = new->br_startblock; __entry->ncount = new->br_blockcount; __entry->roff = right->br_startoff; __entry->rstart = right->br_startblock; __entry->rcount = right->br_blockcount; __entry->delta = delta; __entry->state = state; ), TP_printk("dev %d:%d left 0x%llx:0x%llx:0x%llx; curr 0x%llx:0x%llx:0x%llx <- new 0x%llx:0x%llx:0x%llx; right 0x%llx:0x%llx:0x%llx delta %d state 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->loff, __entry->lstart, __entry->lcount, __entry->coff, __entry->cstart, __entry->ccount, __entry->noff, 
__entry->nstart, __entry->ncount, __entry->roff, __entry->rstart, __entry->rcount, __entry->delta, __entry->state) ); TRACE_EVENT(xfs_exchmaps_delta_nextents, TP_PROTO(const struct xfs_exchmaps_req *req, int64_t d_nexts1, int64_t d_nexts2), TP_ARGS(req, d_nexts1, d_nexts2), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino1) __field(xfs_ino_t, ino2) __field(xfs_extnum_t, nexts1) __field(xfs_extnum_t, nexts2) __field(int64_t, d_nexts1) __field(int64_t, d_nexts2) ), TP_fast_assign( int whichfork = xfs_exchmaps_reqfork(req); __entry->dev = req->ip1->i_mount->m_super->s_dev; __entry->ino1 = req->ip1->i_ino; __entry->ino2 = req->ip2->i_ino; __entry->nexts1 = xfs_ifork_ptr(req->ip1, whichfork)->if_nextents; __entry->nexts2 = xfs_ifork_ptr(req->ip2, whichfork)->if_nextents; __entry->d_nexts1 = d_nexts1; __entry->d_nexts2 = d_nexts2; ), TP_printk("dev %d:%d ino1 0x%llx nexts %llu ino2 0x%llx nexts %llu delta1 %lld delta2 %lld", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino1, __entry->nexts1, __entry->ino2, __entry->nexts2, __entry->d_nexts1, __entry->d_nexts2) ); DECLARE_EVENT_CLASS(xfs_getparents_rec_class, TP_PROTO(struct xfs_inode *ip, const struct xfs_getparents *ppi, const struct xfs_attr_list_context *context, const struct xfs_getparents_rec *pptr), TP_ARGS(ip, ppi, context, pptr), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(unsigned int, firstu) __field(unsigned short, reclen) __field(unsigned int, bufsize) __field(xfs_ino_t, parent_ino) __field(unsigned int, parent_gen) __string(name, pptr->gpr_name) ), TP_fast_assign( __entry->dev = ip->i_mount->m_super->s_dev; __entry->ino = ip->i_ino; __entry->firstu = context->firstu; __entry->reclen = pptr->gpr_reclen; __entry->bufsize = ppi->gp_bufsize; __entry->parent_ino = pptr->gpr_parent.ha_fid.fid_ino; __entry->parent_gen = pptr->gpr_parent.ha_fid.fid_gen; __assign_str(name); ), TP_printk("dev %d:%d ino 0x%llx firstu %u reclen %u bufsize %u parent_ino 0x%llx parent_gen 0x%x name '%s'", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->firstu, __entry->reclen, __entry->bufsize, __entry->parent_ino, __entry->parent_gen, __get_str(name)) ) #define DEFINE_XFS_GETPARENTS_REC_EVENT(name) \ DEFINE_EVENT(xfs_getparents_rec_class, name, \ TP_PROTO(struct xfs_inode *ip, const struct xfs_getparents *ppi, \ const struct xfs_attr_list_context *context, \ const struct xfs_getparents_rec *pptr), \ TP_ARGS(ip, ppi, context, pptr)) DEFINE_XFS_GETPARENTS_REC_EVENT(xfs_getparents_put_listent); DEFINE_XFS_GETPARENTS_REC_EVENT(xfs_getparents_expand_lastrec); DECLARE_EVENT_CLASS(xfs_getparents_class, TP_PROTO(struct xfs_inode *ip, const struct xfs_getparents *ppi, const struct xfs_attrlist_cursor_kern *cur), TP_ARGS(ip, ppi, cur), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(unsigned short, iflags) __field(unsigned short, oflags) __field(unsigned int, bufsize) __field(unsigned int, hashval) __field(unsigned int, blkno) __field(unsigned int, offset) __field(int, initted) ), TP_fast_assign( __entry->dev = ip->i_mount->m_super->s_dev; __entry->ino = ip->i_ino; __entry->iflags = ppi->gp_iflags; __entry->oflags = ppi->gp_oflags; __entry->bufsize = ppi->gp_bufsize; __entry->hashval = cur->hashval; __entry->blkno = cur->blkno; __entry->offset = cur->offset; __entry->initted = cur->initted; ), TP_printk("dev %d:%d ino 0x%llx iflags 0x%x oflags 0x%x bufsize %u cur_init? 
%d hashval 0x%x blkno %u offset %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->iflags, __entry->oflags, __entry->bufsize, __entry->initted, __entry->hashval, __entry->blkno, __entry->offset) ) #define DEFINE_XFS_GETPARENTS_EVENT(name) \ DEFINE_EVENT(xfs_getparents_class, name, \ TP_PROTO(struct xfs_inode *ip, const struct xfs_getparents *ppi, \ const struct xfs_attrlist_cursor_kern *cur), \ TP_ARGS(ip, ppi, cur)) DEFINE_XFS_GETPARENTS_EVENT(xfs_getparents_begin); DEFINE_XFS_GETPARENTS_EVENT(xfs_getparents_end); DECLARE_EVENT_CLASS(xfs_metadir_update_class, TP_PROTO(const struct xfs_metadir_update *upd), TP_ARGS(upd), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, dp_ino) __field(xfs_ino_t, ino) __string(fname, upd->path) ), TP_fast_assign( __entry->dev = upd->dp->i_mount->m_super->s_dev; __entry->dp_ino = upd->dp->i_ino; __entry->ino = upd->ip ? upd->ip->i_ino : NULLFSINO; __assign_str(fname); ), TP_printk("dev %d:%d dp 0x%llx fname '%s' ino 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->dp_ino, __get_str(fname), __entry->ino) ) #define DEFINE_METADIR_UPDATE_EVENT(name) \ DEFINE_EVENT(xfs_metadir_update_class, name, \ TP_PROTO(const struct xfs_metadir_update *upd), \ TP_ARGS(upd)) DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_start_create); DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_start_link); DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_commit); DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_cancel); DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_try_create); DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_create); DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_link); DECLARE_EVENT_CLASS(xfs_metadir_update_error_class, TP_PROTO(const struct xfs_metadir_update *upd, int error), TP_ARGS(upd, error), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, dp_ino) __field(xfs_ino_t, ino) __field(int, error) __string(fname, upd->path) ), TP_fast_assign( __entry->dev = upd->dp->i_mount->m_super->s_dev; __entry->dp_ino = upd->dp->i_ino; __entry->ino = upd->ip ? 
upd->ip->i_ino : NULLFSINO; __entry->error = error; __assign_str(fname); ), TP_printk("dev %d:%d dp 0x%llx fname '%s' ino 0x%llx error %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->dp_ino, __get_str(fname), __entry->ino, __entry->error) ) #define DEFINE_METADIR_UPDATE_ERROR_EVENT(name) \ DEFINE_EVENT(xfs_metadir_update_error_class, name, \ TP_PROTO(const struct xfs_metadir_update *upd, int error), \ TP_ARGS(upd, error)) DEFINE_METADIR_UPDATE_ERROR_EVENT(xfs_metadir_teardown); DECLARE_EVENT_CLASS(xfs_metadir_class, TP_PROTO(struct xfs_inode *dp, struct xfs_name *name, xfs_ino_t ino), TP_ARGS(dp, name, ino), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, dp_ino) __field(xfs_ino_t, ino) __field(int, ftype) __field(int, namelen) __dynamic_array(char, name, name->len) ), TP_fast_assign( __entry->dev = VFS_I(dp)->i_sb->s_dev; __entry->dp_ino = dp->i_ino; __entry->ino = ino, __entry->ftype = name->type; __entry->namelen = name->len; memcpy(__get_str(name), name->name, name->len); ), TP_printk("dev %d:%d dir 0x%llx type %s name '%.*s' ino 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->dp_ino, __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR), __entry->namelen, __get_str(name), __entry->ino) ) #define DEFINE_METADIR_EVENT(name) \ DEFINE_EVENT(xfs_metadir_class, name, \ TP_PROTO(struct xfs_inode *dp, struct xfs_name *name, \ xfs_ino_t ino), \ TP_ARGS(dp, name, ino)) DEFINE_METADIR_EVENT(xfs_metadir_lookup); /* metadata inode space reservations */ DECLARE_EVENT_CLASS(xfs_metafile_resv_class, TP_PROTO(struct xfs_mount *mp, xfs_filblks_t len), TP_ARGS(mp, len), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned long long, freeblks) __field(unsigned long long, reserved) __field(unsigned long long, asked) __field(unsigned long long, used) __field(unsigned long long, len) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->freeblks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS); __entry->reserved = mp->m_metafile_resv_avail; __entry->asked = mp->m_metafile_resv_target; __entry->used = mp->m_metafile_resv_used; __entry->len = len; ), TP_printk("dev %d:%d freeblks %llu resv %llu ask %llu used %llu len %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->freeblks, __entry->reserved, __entry->asked, __entry->used, __entry->len) ) #define DEFINE_METAFILE_RESV_EVENT(name) \ DEFINE_EVENT(xfs_metafile_resv_class, name, \ TP_PROTO(struct xfs_mount *mp, xfs_filblks_t len), \ TP_ARGS(mp, len)) DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init); DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free); DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_alloc_space); DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free_space); DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_critical); DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init_error); #ifdef CONFIG_XFS_RT TRACE_EVENT(xfs_growfs_check_rtgeom, TP_PROTO(const struct xfs_mount *mp, unsigned int min_logfsbs), TP_ARGS(mp, min_logfsbs), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, logblocks) __field(unsigned int, min_logfsbs) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->logblocks = mp->m_sb.sb_logblocks; __entry->min_logfsbs = min_logfsbs; ), TP_printk("dev %d:%d logblocks %u min_logfsbs %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->logblocks, __entry->min_logfsbs) ); #endif /* CONFIG_XFS_RT */ TRACE_DEFINE_ENUM(XC_FREE_BLOCKS); TRACE_DEFINE_ENUM(XC_FREE_RTEXTENTS); TRACE_DEFINE_ENUM(XC_FREE_RTAVAILABLE); DECLARE_EVENT_CLASS(xfs_freeblocks_resv_class, TP_PROTO(struct xfs_mount *mp, enum 
xfs_free_counter ctr, uint64_t delta, unsigned long caller_ip), TP_ARGS(mp, ctr, delta, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(enum xfs_free_counter, ctr) __field(uint64_t, delta) __field(uint64_t, avail) __field(uint64_t, total) __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->ctr = ctr; __entry->delta = delta; __entry->avail = mp->m_free[ctr].res_avail; __entry->total = mp->m_free[ctr].res_total; __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d ctr %s delta %llu avail %llu total %llu caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->ctr, XFS_FREECOUNTER_STR), __entry->delta, __entry->avail, __entry->total, (char *)__entry->caller_ip) ) #define DEFINE_FREEBLOCKS_RESV_EVENT(name) \ DEFINE_EVENT(xfs_freeblocks_resv_class, name, \ TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr, \ uint64_t delta, unsigned long caller_ip), \ TP_ARGS(mp, ctr, delta, caller_ip)) DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_reserved); DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_enospc); TRACE_EVENT(xfs_healthmon_lost_event, TP_PROTO(const struct xfs_healthmon *hm), TP_ARGS(hm), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned long long, lost_prev) ), TP_fast_assign( __entry->dev = hm->dev; __entry->lost_prev = hm->lost_prev_event; ), TP_printk("dev %d:%d lost_prev %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->lost_prev) ); #define XFS_HEALTHMON_FLAGS_STRINGS \ { XFS_HEALTH_MONITOR_VERBOSE, "verbose" } #define XFS_HEALTHMON_FMT_STRINGS \ { XFS_HEALTH_MONITOR_FMT_V0, "v0" } TRACE_EVENT(xfs_healthmon_create, TP_PROTO(dev_t dev, u64 flags, u8 format), TP_ARGS(dev, flags, format), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, flags) __field(u8, format) ), TP_fast_assign( __entry->dev = dev; __entry->flags = flags; __entry->format = format; ), TP_printk("dev %d:%d flags %s format %s", MAJOR(__entry->dev), MINOR(__entry->dev), __print_flags(__entry->flags, "|", XFS_HEALTHMON_FLAGS_STRINGS), __print_symbolic(__entry->format, XFS_HEALTHMON_FMT_STRINGS)) ); TRACE_EVENT(xfs_healthmon_copybuf, TP_PROTO(const struct xfs_healthmon *hm, const struct iov_iter *iov), TP_ARGS(hm, iov), TP_STRUCT__entry( __field(dev_t, dev) __field(size_t, bufsize) __field(size_t, inpos) __field(size_t, outpos) __field(size_t, to_copy) __field(size_t, iter_count) ), TP_fast_assign( __entry->dev = hm->dev; __entry->bufsize = hm->bufsize; __entry->inpos = hm->bufhead; __entry->outpos = hm->buftail; if (hm->bufhead > hm->buftail) __entry->to_copy = hm->bufhead - hm->buftail; else __entry->to_copy = 0; __entry->iter_count = iov_iter_count(iov); ), TP_printk("dev %d:%d bufsize %zu in_pos %zu out_pos %zu to_copy %zu iter_count %zu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->bufsize, __entry->inpos, __entry->outpos, __entry->to_copy, __entry->iter_count) ); DECLARE_EVENT_CLASS(xfs_healthmon_class, TP_PROTO(const struct xfs_healthmon *hm), TP_ARGS(hm), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, events) __field(unsigned long long, lost_prev) ), TP_fast_assign( __entry->dev = hm->dev; __entry->events = hm->events; __entry->lost_prev = hm->lost_prev_event; ), TP_printk("dev %d:%d events %u lost_prev? 
%llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->events, __entry->lost_prev) ); #define DEFINE_HEALTHMON_EVENT(name) \ DEFINE_EVENT(xfs_healthmon_class, name, \ TP_PROTO(const struct xfs_healthmon *hm), \ TP_ARGS(hm)) DEFINE_HEALTHMON_EVENT(xfs_healthmon_read_start); DEFINE_HEALTHMON_EVENT(xfs_healthmon_read_finish); DEFINE_HEALTHMON_EVENT(xfs_healthmon_release); DEFINE_HEALTHMON_EVENT(xfs_healthmon_detach); DEFINE_HEALTHMON_EVENT(xfs_healthmon_report_unmount); #define XFS_HEALTHMON_TYPE_STRINGS \ { XFS_HEALTHMON_LOST, "lost" }, \ { XFS_HEALTHMON_UNMOUNT, "unmount" }, \ { XFS_HEALTHMON_SICK, "sick" }, \ { XFS_HEALTHMON_CORRUPT, "corrupt" }, \ { XFS_HEALTHMON_HEALTHY, "healthy" }, \ { XFS_HEALTHMON_SHUTDOWN, "shutdown" } #define XFS_HEALTHMON_DOMAIN_STRINGS \ { XFS_HEALTHMON_MOUNT, "mount" }, \ { XFS_HEALTHMON_FS, "fs" }, \ { XFS_HEALTHMON_AG, "ag" }, \ { XFS_HEALTHMON_INODE, "inode" }, \ { XFS_HEALTHMON_RTGROUP, "rtgroup" } TRACE_DEFINE_ENUM(XFS_HEALTHMON_LOST); TRACE_DEFINE_ENUM(XFS_HEALTHMON_SHUTDOWN); TRACE_DEFINE_ENUM(XFS_HEALTHMON_UNMOUNT); TRACE_DEFINE_ENUM(XFS_HEALTHMON_SICK); TRACE_DEFINE_ENUM(XFS_HEALTHMON_CORRUPT); TRACE_DEFINE_ENUM(XFS_HEALTHMON_HEALTHY); TRACE_DEFINE_ENUM(XFS_HEALTHMON_MOUNT); TRACE_DEFINE_ENUM(XFS_HEALTHMON_FS); TRACE_DEFINE_ENUM(XFS_HEALTHMON_AG); TRACE_DEFINE_ENUM(XFS_HEALTHMON_INODE); TRACE_DEFINE_ENUM(XFS_HEALTHMON_RTGROUP); DECLARE_EVENT_CLASS(xfs_healthmon_event_class, TP_PROTO(const struct xfs_healthmon *hm, const struct xfs_healthmon_event *event), TP_ARGS(hm, event), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, type) __field(unsigned int, domain) __field(unsigned int, mask) __field(unsigned long long, ino) __field(unsigned int, gen) __field(unsigned int, group) __field(unsigned long long, offset) __field(unsigned long long, length) __field(unsigned long long, lostcount) ), TP_fast_assign( __entry->dev = hm->dev; __entry->type = event->type; __entry->domain = event->domain; __entry->mask = 0; __entry->group = 0; __entry->ino = 0; __entry->gen = 0; __entry->offset = 0; __entry->length = 0; __entry->lostcount = 0; switch (__entry->domain) { case XFS_HEALTHMON_MOUNT: switch (__entry->type) { case XFS_HEALTHMON_SHUTDOWN: __entry->mask = event->flags; break; case XFS_HEALTHMON_LOST: __entry->lostcount = event->lostcount; break; } break; case XFS_HEALTHMON_FS: __entry->mask = event->fsmask; break; case XFS_HEALTHMON_AG: case XFS_HEALTHMON_RTGROUP: __entry->mask = event->grpmask; __entry->group = event->group; break; case XFS_HEALTHMON_INODE: __entry->mask = event->imask; __entry->ino = event->ino; __entry->gen = event->gen; break; case XFS_HEALTHMON_DATADEV: case XFS_HEALTHMON_LOGDEV: case XFS_HEALTHMON_RTDEV: __entry->offset = event->daddr; __entry->length = event->bbcount; break; case XFS_HEALTHMON_FILERANGE: __entry->ino = event->fino; __entry->gen = event->fgen; __entry->offset = event->fpos; __entry->length = event->flen; break; } ), TP_printk("dev %d:%d type %s domain %s mask 0x%x ino 0x%llx gen 0x%x offset 0x%llx len 0x%llx group 0x%x lost %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XFS_HEALTHMON_TYPE_STRINGS), __print_symbolic(__entry->domain, XFS_HEALTHMON_DOMAIN_STRINGS), __entry->mask, __entry->ino, __entry->gen, __entry->offset, __entry->length, __entry->group, __entry->lostcount) ); #define DEFINE_HEALTHMONEVENT_EVENT(name) \ DEFINE_EVENT(xfs_healthmon_event_class, name, \ TP_PROTO(const struct xfs_healthmon *hm, \ const struct xfs_healthmon_event *event), \ TP_ARGS(hm, event)) 
DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_insert); DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_push); DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_pop); DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_format); DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_format_overflow); DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_drop); DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_merge); TRACE_EVENT(xfs_healthmon_report_fs, TP_PROTO(const struct xfs_healthmon *hm, unsigned int old_mask, unsigned int new_mask, const struct xfs_healthmon_event *event), TP_ARGS(hm, old_mask, new_mask, event), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, type) __field(unsigned int, domain) __field(unsigned int, old_mask) __field(unsigned int, new_mask) __field(unsigned int, fsmask) ), TP_fast_assign( __entry->dev = hm->dev; __entry->type = event->type; __entry->domain = event->domain; __entry->old_mask = old_mask; __entry->new_mask = new_mask; __entry->fsmask = event->fsmask; ), TP_printk("dev %d:%d type %s domain %s oldmask 0x%x newmask 0x%x fsmask 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XFS_HEALTHMON_TYPE_STRINGS), __print_symbolic(__entry->domain, XFS_HEALTHMON_DOMAIN_STRINGS), __entry->old_mask, __entry->new_mask, __entry->fsmask) ); TRACE_EVENT(xfs_healthmon_report_group, TP_PROTO(const struct xfs_healthmon *hm, unsigned int old_mask, unsigned int new_mask, const struct xfs_healthmon_event *event), TP_ARGS(hm, old_mask, new_mask, event), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, type) __field(unsigned int, domain) __field(unsigned int, old_mask) __field(unsigned int, new_mask) __field(unsigned int, grpmask) __field(unsigned int, group) ), TP_fast_assign( __entry->dev = hm->dev; __entry->type = event->type; __entry->domain = event->domain; __entry->old_mask = old_mask; __entry->new_mask = new_mask; __entry->grpmask = event->grpmask; __entry->group = event->group; ), TP_printk("dev %d:%d type %s domain %s oldmask 0x%x newmask 0x%x grpmask 0x%x group 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XFS_HEALTHMON_TYPE_STRINGS), __print_symbolic(__entry->domain, XFS_HEALTHMON_DOMAIN_STRINGS), __entry->old_mask, __entry->new_mask, __entry->grpmask, __entry->group) ); TRACE_EVENT(xfs_healthmon_report_inode, TP_PROTO(const struct xfs_healthmon *hm, unsigned int old_mask, unsigned int new_mask, const struct xfs_healthmon_event *event), TP_ARGS(hm, old_mask, new_mask, event), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, type) __field(unsigned int, domain) __field(unsigned int, old_mask) __field(unsigned int, new_mask) __field(unsigned int, imask) __field(unsigned long long, ino) __field(unsigned int, gen) ), TP_fast_assign( __entry->dev = hm->dev; __entry->type = event->type; __entry->domain = event->domain; __entry->old_mask = old_mask; __entry->new_mask = new_mask; __entry->imask = event->imask; __entry->ino = event->ino; __entry->gen = event->gen; ), TP_printk("dev %d:%d type %s domain %s oldmask 0x%x newmask 0x%x imask 0x%x ino 0x%llx gen 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XFS_HEALTHMON_TYPE_STRINGS), __print_symbolic(__entry->domain, XFS_HEALTHMON_DOMAIN_STRINGS), __entry->old_mask, __entry->new_mask, __entry->imask, __entry->ino, __entry->gen) ); TRACE_EVENT(xfs_healthmon_report_shutdown, TP_PROTO(const struct xfs_healthmon *hm, uint32_t shutdown_flags), TP_ARGS(hm, shutdown_flags), TP_STRUCT__entry( __field(dev_t, dev) __field(uint32_t, shutdown_flags) ), TP_fast_assign( 
__entry->dev = hm->dev; __entry->shutdown_flags = shutdown_flags; ), TP_printk("dev %d:%d shutdown_flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __print_flags(__entry->shutdown_flags, "|", XFS_SHUTDOWN_STRINGS)) ); #define XFS_DEVICE_STRINGS \ { XFS_DEV_DATA, "datadev" }, \ { XFS_DEV_RT, "rtdev" }, \ { XFS_DEV_LOG, "logdev" } TRACE_DEFINE_ENUM(XFS_DEV_DATA); TRACE_DEFINE_ENUM(XFS_DEV_RT); TRACE_DEFINE_ENUM(XFS_DEV_LOG); TRACE_EVENT(xfs_healthmon_report_media, TP_PROTO(const struct xfs_healthmon *hm, enum xfs_device fdev, const struct xfs_healthmon_event *event), TP_ARGS(hm, fdev, event), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, error_dev) __field(uint64_t, daddr) __field(uint64_t, bbcount) ), TP_fast_assign( __entry->dev = hm->dev; __entry->error_dev = fdev; __entry->daddr = event->daddr; __entry->bbcount = event->bbcount; ), TP_printk("dev %d:%d %s daddr 0x%llx bbcount 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->error_dev, XFS_DEVICE_STRINGS), __entry->daddr, __entry->bbcount) ); #define FS_ERROR_STRINGS \ { FSERR_BUFFERED_READ, "buffered_read" }, \ { FSERR_BUFFERED_WRITE, "buffered_write" }, \ { FSERR_DIRECTIO_READ, "directio_read" }, \ { FSERR_DIRECTIO_WRITE, "directio_write" }, \ { FSERR_DATA_LOST, "data_lost" }, \ { FSERR_METADATA, "metadata" } TRACE_DEFINE_ENUM(FSERR_BUFFERED_READ); TRACE_DEFINE_ENUM(FSERR_BUFFERED_WRITE); TRACE_DEFINE_ENUM(FSERR_DIRECTIO_READ); TRACE_DEFINE_ENUM(FSERR_DIRECTIO_WRITE); TRACE_DEFINE_ENUM(FSERR_DATA_LOST); TRACE_DEFINE_ENUM(FSERR_METADATA); TRACE_EVENT(xfs_healthmon_report_file_ioerror, TP_PROTO(const struct xfs_healthmon *hm, const struct fserror_event *p), TP_ARGS(hm, p), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, type) __field(unsigned long long, ino) __field(unsigned int, gen) __field(long long, pos) __field(unsigned long long, len) __field(int, error) ), TP_fast_assign( __entry->dev = hm->dev; __entry->type = p->type; __entry->ino = XFS_I(p->inode)->i_ino; __entry->gen = p->inode->i_generation; __entry->pos = p->pos; __entry->len = p->len; __entry->error = p->error; ), TP_printk("dev %d:%d ino 0x%llx gen 0x%x op %s pos 0x%llx bytecount 0x%llx error %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen, __print_symbolic(__entry->type, FS_ERROR_STRINGS), __entry->pos, __entry->len, __entry->error) ); TRACE_EVENT(xfs_verify_media, TP_PROTO(const struct xfs_mount *mp, const struct xfs_verify_media *me, dev_t fdev, xfs_daddr_t daddr, uint64_t bbcount, const struct folio *folio), TP_ARGS(mp, me, fdev, daddr, bbcount, folio), TP_STRUCT__entry( __field(dev_t, dev) __field(dev_t, fdev) __field(xfs_daddr_t, start_daddr) __field(xfs_daddr_t, end_daddr) __field(unsigned int, flags) __field(xfs_daddr_t, daddr) __field(uint64_t, bbcount) __field(unsigned int, bufsize) ), TP_fast_assign( __entry->dev = mp->m_ddev_targp->bt_dev; __entry->fdev = fdev; __entry->start_daddr = me->me_start_daddr; __entry->end_daddr = me->me_end_daddr; __entry->flags = me->me_flags; __entry->daddr = daddr; __entry->bbcount = bbcount; __entry->bufsize = folio_size(folio); ), TP_printk("dev %d:%d fdev %d:%d start_daddr 0x%llx end_daddr 0x%llx flags 0x%x daddr 0x%llx bbcount 0x%llx bufsize 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->fdev), MINOR(__entry->fdev), __entry->start_daddr, __entry->end_daddr, __entry->flags, __entry->daddr, __entry->bbcount, __entry->bufsize) ); TRACE_EVENT(xfs_verify_media_end, TP_PROTO(const struct xfs_mount *mp, const struct xfs_verify_media 
*me, dev_t fdev), TP_ARGS(mp, me, fdev), TP_STRUCT__entry( __field(dev_t, dev) __field(dev_t, fdev) __field(xfs_daddr_t, start_daddr) __field(xfs_daddr_t, end_daddr) __field(int, ioerror) ), TP_fast_assign( __entry->dev = mp->m_ddev_targp->bt_dev; __entry->fdev = fdev; __entry->start_daddr = me->me_start_daddr; __entry->end_daddr = me->me_end_daddr; __entry->ioerror = me->me_ioerror; ), TP_printk("dev %d:%d fdev %d:%d start_daddr 0x%llx end_daddr 0x%llx ioerror %d", MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->fdev), MINOR(__entry->fdev), __entry->start_daddr, __entry->end_daddr, __entry->ioerror) ); TRACE_EVENT(xfs_verify_media_error, TP_PROTO(const struct xfs_mount *mp, const struct xfs_verify_media *me, dev_t fdev, xfs_daddr_t daddr, uint64_t bbcount, blk_status_t status), TP_ARGS(mp, me, fdev, daddr, bbcount, status), TP_STRUCT__entry( __field(dev_t, dev) __field(dev_t, fdev) __field(xfs_daddr_t, start_daddr) __field(xfs_daddr_t, end_daddr) __field(unsigned int, flags) __field(xfs_daddr_t, daddr) __field(uint64_t, bbcount) __field(int, error) ), TP_fast_assign( __entry->dev = mp->m_ddev_targp->bt_dev; __entry->fdev = fdev; __entry->start_daddr = me->me_start_daddr; __entry->end_daddr = me->me_end_daddr; __entry->flags = me->me_flags; __entry->daddr = daddr; __entry->bbcount = bbcount; __entry->error = blk_status_to_errno(status); ), TP_printk("dev %d:%d fdev %d:%d start_daddr 0x%llx end_daddr 0x%llx flags 0x%x daddr 0x%llx bbcount 0x%llx error %d", MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->fdev), MINOR(__entry->fdev), __entry->start_daddr, __entry->end_daddr, __entry->flags, __entry->daddr, __entry->bbcount, __entry->error) ); #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #define TRACE_INCLUDE_FILE xfs_trace #include <trace/define_trace.h> |
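/*
 * Editor's illustrative sketch (not part of the original header): the
 * TRACE_EVENT()/DEFINE_EVENT() definitions above expand, via
 * <trace/define_trace.h>, into trace_<name>() helpers that XFS calls from
 * its C code.  For instance, the xfs_metadir_lookup event defined earlier
 * would typically be emitted as:
 *
 *	trace_xfs_metadir_lookup(dp, &xname, ino);
 *
 * where dp, xname and ino are the directory inode, the xfs_name being looked
 * up and the resulting inode number at the (hypothetical) call site.  The
 * TP_fast_assign() body, and the evaluation of the arguments, is skipped
 * while the tracepoint is disabled, so these calls stay cheap in the common
 * case.
 */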
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_MQ_H
#define BLK_MQ_H

#include <linux/blkdev.h>
#include <linux/sbitmap.h>
#include <linux/lockdep.h>
#include <linux/scatterlist.h>
#include <linux/prefetch.h>
#include <linux/srcu.h>
#include <linux/rw_hint.h>
#include <linux/rwsem.h>

struct blk_mq_tags;
struct blk_flush_queue;
struct io_comp_batch;

#define BLKDEV_MIN_RQ		4
#define BLKDEV_DEFAULT_RQ	128

enum rq_end_io_ret {
        RQ_END_IO_NONE,
        RQ_END_IO_FREE,
};

typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t,
                                          const struct io_comp_batch *);

/*
 * request flags
 */
typedef __u32 __bitwise req_flags_t;

/* Keep rqf_name[] in sync with the definitions below */
enum rqf_flags {
        /* drive already may have started this one */
        __RQF_STARTED,
        /* request for flush sequence */
        __RQF_FLUSH_SEQ,
        /* merge of different types, fail separately */
        __RQF_MIXED_MERGE,
        /* don't call prep for this one */
        __RQF_DONTPREP,
        /* use hctx->sched_tags */
        __RQF_SCHED_TAGS,
        /* use an I/O scheduler for this request */
        __RQF_USE_SCHED,
        /* vaguely specified driver internal error. Ignored by block layer */
        __RQF_FAILED,
        /* don't warn about errors */
        __RQF_QUIET,
        /* account into disk and partition IO statistics */
        __RQF_IO_STAT,
        /* runtime pm request */
        __RQF_PM,
        /* on IO scheduler merge hash */
        __RQF_HASHED,
        /* track IO completion time */
        __RQF_STATS,
        /*
         * Look at ->special_vec for the actual data payload instead of the
         * bio chain.
         */
        __RQF_SPECIAL_PAYLOAD,
        /*
         * request completion needs to be signaled to zone write plugging.
*/ __RQF_ZONE_WRITE_PLUGGING, /* ->timeout has been called, don't expire again */ __RQF_TIMED_OUT, __RQF_RESV, __RQF_BITS }; #define RQF_STARTED ((__force req_flags_t)(1 << __RQF_STARTED)) #define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << __RQF_FLUSH_SEQ)) #define RQF_MIXED_MERGE ((__force req_flags_t)(1 << __RQF_MIXED_MERGE)) #define RQF_DONTPREP ((__force req_flags_t)(1 << __RQF_DONTPREP)) #define RQF_SCHED_TAGS ((__force req_flags_t)(1 << __RQF_SCHED_TAGS)) #define RQF_USE_SCHED ((__force req_flags_t)(1 << __RQF_USE_SCHED)) #define RQF_FAILED ((__force req_flags_t)(1 << __RQF_FAILED)) #define RQF_QUIET ((__force req_flags_t)(1 << __RQF_QUIET)) #define RQF_IO_STAT ((__force req_flags_t)(1 << __RQF_IO_STAT)) #define RQF_PM ((__force req_flags_t)(1 << __RQF_PM)) #define RQF_HASHED ((__force req_flags_t)(1 << __RQF_HASHED)) #define RQF_STATS ((__force req_flags_t)(1 << __RQF_STATS)) #define RQF_SPECIAL_PAYLOAD \ ((__force req_flags_t)(1 << __RQF_SPECIAL_PAYLOAD)) #define RQF_ZONE_WRITE_PLUGGING \ ((__force req_flags_t)(1 << __RQF_ZONE_WRITE_PLUGGING)) #define RQF_TIMED_OUT ((__force req_flags_t)(1 << __RQF_TIMED_OUT)) #define RQF_RESV ((__force req_flags_t)(1 << __RQF_RESV)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ (RQF_STARTED | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD) enum mq_rq_state { MQ_RQ_IDLE = 0, MQ_RQ_IN_FLIGHT = 1, MQ_RQ_COMPLETE = 2, }; /* * Try to put the fields that are referenced together in the same cacheline. * * If you modify this structure, make sure to update blk_rq_init() and * especially blk_mq_rq_ctx_init() to take care of the added fields. */ struct request { struct request_queue *q; struct blk_mq_ctx *mq_ctx; struct blk_mq_hw_ctx *mq_hctx; blk_opf_t cmd_flags; /* op and common flags */ req_flags_t rq_flags; int tag; int internal_tag; unsigned int timeout; /* the following two fields are internal, NEVER access directly */ unsigned int __data_len; /* total data len */ sector_t __sector; /* sector cursor */ struct bio *bio; struct bio *biotail; union { struct list_head queuelist; struct request *rq_next; }; struct block_device *part; #ifdef CONFIG_BLK_RQ_ALLOC_TIME /* Time that the first bio started allocating this request. */ u64 alloc_time_ns; #endif /* Time that this request was allocated for this IO. */ u64 start_time_ns; /* Time that I/O was submitted to the device. */ u64 io_start_time_ns; #ifdef CONFIG_BLK_WBT unsigned short wbt_flags; #endif /* * rq sectors used for blk stats. It has the same value * with blk_rq_sectors(rq), except that it never be zeroed * by completion. */ unsigned short stats_sectors; /* * Number of scatter-gather DMA addr+len pairs after * physical address coalescing is performed. */ unsigned short nr_phys_segments; unsigned short nr_integrity_segments; /* * The lowest set bit for address gaps between physical segments. This * provides information necessary for dma optimization opprotunities, * like for testing if the segments can be coalesced against the * device's iommu granule. */ unsigned char phys_gap_bit; #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct bio_crypt_ctx *crypt_ctx; struct blk_crypto_keyslot *crypt_keyslot; #endif enum mq_rq_state state; atomic_t ref; unsigned long deadline; /* * The hash is used inside the scheduler, and killed once the * request reaches the dispatch list. The ipi_list is only used * to queue the request for softirq completion, which is long * after the request has been unhashed (and even removed from * the dispatch list). 
*/ union { struct hlist_node hash; /* merge hash */ struct llist_node ipi_list; }; /* * The rb_node is only used inside the io scheduler, requests * are pruned when moved to the dispatch queue. special_vec must * only be used if RQF_SPECIAL_PAYLOAD is set, and those cannot be * insert into an IO scheduler. */ union { struct rb_node rb_node; /* sort/lookup */ struct bio_vec special_vec; }; /* * Three pointers are available for the IO schedulers, if they need * more they have to dynamically allocate it. */ struct { struct io_cq *icq; void *priv[2]; } elv; struct { unsigned int seq; rq_end_io_fn *saved_end_io; } flush; u64 fifo_time; /* * completion callback. */ rq_end_io_fn *end_io; void *end_io_data; }; /* * Returns a mask with all bits starting at req->phys_gap_bit set to 1. */ static inline unsigned long req_phys_gap_mask(const struct request *req) { return ~(((1 << req->phys_gap_bit) >> 1) - 1); } static inline enum req_op req_op(const struct request *req) { return req->cmd_flags & REQ_OP_MASK; } static inline bool blk_rq_is_passthrough(struct request *rq) { return blk_op_is_passthrough(rq->cmd_flags); } static inline unsigned short req_get_ioprio(struct request *req) { if (req->bio) return req->bio->bi_ioprio; return 0; } #define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ) #define rq_dma_dir(rq) \ (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE) static inline int rq_list_empty(const struct rq_list *rl) { return rl->head == NULL; } static inline void rq_list_init(struct rq_list *rl) { rl->head = NULL; rl->tail = NULL; } static inline void rq_list_add_tail(struct rq_list *rl, struct request *rq) { rq->rq_next = NULL; if (rl->tail) rl->tail->rq_next = rq; else rl->head = rq; rl->tail = rq; } static inline void rq_list_add_head(struct rq_list *rl, struct request *rq) { rq->rq_next = rl->head; rl->head = rq; if (!rl->tail) rl->tail = rq; } static inline struct request *rq_list_pop(struct rq_list *rl) { struct request *rq = rl->head; if (rq) { rl->head = rl->head->rq_next; if (!rl->head) rl->tail = NULL; rq->rq_next = NULL; } return rq; } static inline struct request *rq_list_peek(struct rq_list *rl) { return rl->head; } #define rq_list_for_each(rl, pos) \ for (pos = rq_list_peek((rl)); (pos); pos = pos->rq_next) #define rq_list_for_each_safe(rl, pos, nxt) \ for (pos = rq_list_peek((rl)), nxt = pos->rq_next; \ pos; pos = nxt, nxt = pos ? pos->rq_next : NULL) /** * enum blk_eh_timer_return - How the timeout handler should proceed * @BLK_EH_DONE: The block driver completed the command or will complete it at * a later time. * @BLK_EH_RESET_TIMER: Reset the request timer and continue waiting for the * request to complete. */ enum blk_eh_timer_return { BLK_EH_DONE, BLK_EH_RESET_TIMER, }; /** * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware * block device */ struct blk_mq_hw_ctx { struct { /** @lock: Protects the dispatch list. */ spinlock_t lock; /** * @dispatch: Used for requests that are ready to be * dispatched to the hardware but for some reason (e.g. lack of * resources) could not be sent to the hardware. As soon as the * driver can send new requests, requests at this list will * be sent first for a fairer dispatch. */ struct list_head dispatch; /** * @state: BLK_MQ_S_* flags. Defines the state of the hw * queue (active, scheduled to restart, stopped). */ unsigned long state; } ____cacheline_aligned_in_smp; /** * @run_work: Used for scheduling a hardware queue run at a later time. 
*/ struct delayed_work run_work; /** @cpumask: Map of available CPUs where this hctx can run. */ cpumask_var_t cpumask; /** * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU * selection from @cpumask. */ int next_cpu; /** * @next_cpu_batch: Counter of how many works left in the batch before * changing to the next CPU. */ int next_cpu_batch; /** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */ unsigned long flags; /** * @sched_data: Pointer owned by the IO scheduler attached to a request * queue. It's up to the IO scheduler how to use this pointer. */ void *sched_data; /** * @queue: Pointer to the request queue that owns this hardware context. */ struct request_queue *queue; /** @fq: Queue of requests that need to perform a flush operation. */ struct blk_flush_queue *fq; /** * @driver_data: Pointer to data owned by the block driver that created * this hctx */ void *driver_data; /** * @ctx_map: Bitmap for each software queue. If bit is on, there is a * pending request in that software queue. */ struct sbitmap ctx_map; /** * @dispatch_from: Software queue to be used when no scheduler was * selected. */ struct blk_mq_ctx *dispatch_from; /** * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to * decide if the hw_queue is busy using Exponential Weighted Moving * Average algorithm. */ unsigned int dispatch_busy; /** @type: HCTX_TYPE_* flags. Type of hardware queue. */ unsigned short type; /** @nr_ctx: Number of software queues. */ unsigned short nr_ctx; /** @ctxs: Array of software queues. */ struct blk_mq_ctx **ctxs; /** @dispatch_wait_lock: Lock for dispatch_wait queue. */ spinlock_t dispatch_wait_lock; /** * @dispatch_wait: Waitqueue to put requests when there is no tag * available at the moment, to wait for another try in the future. */ wait_queue_entry_t dispatch_wait; /** * @wait_index: Index of next available dispatch_wait queue to insert * requests. */ atomic_t wait_index; /** * @tags: Tags owned by the block driver. A tag at this set is only * assigned when a request is dispatched from a hardware queue. */ struct blk_mq_tags *tags; /** * @sched_tags: Tags owned by I/O scheduler. If there is an I/O * scheduler associated with a request queue, a tag is assigned when * that request is allocated. Else, this member is not used. */ struct blk_mq_tags *sched_tags; /** @numa_node: NUMA node the storage adapter has been connected to. */ unsigned int numa_node; /** @queue_num: Index of this hardware queue. */ unsigned int queue_num; /** * @nr_active: Number of active requests. Only used when a tag set is * shared across request queues. */ atomic_t nr_active; /** @cpuhp_online: List to store request if CPU is going to die */ struct hlist_node cpuhp_online; /** @cpuhp_dead: List to store request if some CPU die. */ struct hlist_node cpuhp_dead; /** @kobj: Kernel object for sysfs. */ struct kobject kobj; #ifdef CONFIG_BLK_DEBUG_FS /** * @debugfs_dir: debugfs directory for this hardware queue. Named * as cpu<cpu_number>. */ struct dentry *debugfs_dir; /** @sched_debugfs_dir: debugfs directory for the scheduler. */ struct dentry *sched_debugfs_dir; #endif /** * @hctx_list: if this hctx is not in use, this is an entry in * q->unused_hctx_list. */ struct list_head hctx_list; }; /** * struct blk_mq_queue_map - Map software queues to hardware queues * @mq_map: CPU ID to hardware queue index map. This is an array * with nr_cpu_ids elements. Each element has a value in the range * [@queue_offset, @queue_offset + @nr_queues). 
* @nr_queues: Number of hardware queues to map CPU IDs onto. * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe * driver to map each hardware queue type (enum hctx_type) onto a distinct * set of hardware queues. */ struct blk_mq_queue_map { unsigned int *mq_map; unsigned int nr_queues; unsigned int queue_offset; }; /** * enum hctx_type - Type of hardware queue * @HCTX_TYPE_DEFAULT: All I/O not otherwise accounted for. * @HCTX_TYPE_READ: Just for READ I/O. * @HCTX_TYPE_POLL: Polled I/O of any kind. * @HCTX_MAX_TYPES: Number of types of hctx. */ enum hctx_type { HCTX_TYPE_DEFAULT, HCTX_TYPE_READ, HCTX_TYPE_POLL, HCTX_MAX_TYPES, }; /** * struct blk_mq_tag_set - tag set that can be shared between request queues * @ops: Pointers to functions that implement block driver behavior. * @map: One or more ctx -> hctx mappings. One map exists for each * hardware queue type (enum hctx_type) that the driver wishes * to support. There are no restrictions on maps being of the * same size, and it's perfectly legal to share maps between * types. * @nr_maps: Number of elements in the @map array. A number in the range * [1, HCTX_MAX_TYPES]. * @nr_hw_queues: Number of hardware queues supported by the block driver that * owns this data structure. * @queue_depth: Number of tags per hardware queue, reserved tags included. * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag * allocations. * @cmd_size: Number of additional bytes to allocate per request. The block * driver owns these additional bytes. * @numa_node: NUMA node the storage adapter has been connected to. * @timeout: Request processing timeout in jiffies. * @flags: Zero or more BLK_MQ_F_* flags. * @driver_data: Pointer to data owned by the block driver that created this * tag set. * @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues * elements. * @shared_tags: * Shared set of tags. Has @nr_hw_queues elements. If set, * shared by all @tags. * @tag_list_lock: Serializes tag_list accesses. * @tag_list: List of the request queues that use this tag set. See also * request_queue.tag_set_list. * @srcu: Use as lock when type of the request queue is blocking * (BLK_MQ_F_BLOCKING). * @tags_srcu: SRCU used to defer freeing of tags page_list to prevent * use-after-free when iterating tags. * @update_nr_hwq_lock: * Synchronize updating nr_hw_queues with add/del disk & * switching elevator. */ struct blk_mq_tag_set { const struct blk_mq_ops *ops; struct blk_mq_queue_map map[HCTX_MAX_TYPES]; unsigned int nr_maps; unsigned int nr_hw_queues; unsigned int queue_depth; unsigned int reserved_tags; unsigned int cmd_size; int numa_node; unsigned int timeout; unsigned int flags; void *driver_data; struct blk_mq_tags **tags; struct blk_mq_tags *shared_tags; struct mutex tag_list_lock; struct list_head tag_list; struct srcu_struct *srcu; struct srcu_struct tags_srcu; struct rw_semaphore update_nr_hwq_lock; }; /** * struct blk_mq_queue_data - Data about a request inserted in a queue * * @rq: Request pointer. * @last: If it is the last request in the queue. */ struct blk_mq_queue_data { struct request *rq; bool last; }; typedef bool (busy_tag_iter_fn)(struct request *, void *); /** * struct blk_mq_ops - Callback functions that implements block driver * behaviour. */ struct blk_mq_ops { /** * @queue_rq: Queue a new request from block IO. 
*/ blk_status_t (*queue_rq)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); /** * @commit_rqs: If a driver uses bd->last to judge when to submit * requests to hardware, it must define this function. In case of errors * that make us stop issuing further requests, this hook serves the * purpose of kicking the hardware (which the last request otherwise * would have done). */ void (*commit_rqs)(struct blk_mq_hw_ctx *); /** * @queue_rqs: Queue a list of new requests. Driver is guaranteed * that each request belongs to the same queue. If the driver doesn't * empty the @rqlist completely, then the rest will be queued * individually by the block layer upon return. */ void (*queue_rqs)(struct rq_list *rqlist); /** * @get_budget: Reserve budget before queue request, once .queue_rq is * run, it is driver's responsibility to release the * reserved budget. Also we have to handle failure case * of .get_budget for avoiding I/O deadlock. */ int (*get_budget)(struct request_queue *); /** * @put_budget: Release the reserved budget. */ void (*put_budget)(struct request_queue *, int); /** * @set_rq_budget_token: store rq's budget token */ void (*set_rq_budget_token)(struct request *, int); /** * @get_rq_budget_token: retrieve rq's budget token */ int (*get_rq_budget_token)(struct request *); /** * @timeout: Called on request timeout. */ enum blk_eh_timer_return (*timeout)(struct request *); /** * @poll: Called to poll for completion of a specific tag. */ int (*poll)(struct blk_mq_hw_ctx *, struct io_comp_batch *); /** * @complete: Mark the request as complete. */ void (*complete)(struct request *); /** * @init_hctx: Called when the block layer side of a hardware queue has * been set up, allowing the driver to allocate/init matching * structures. */ int (*init_hctx)(struct blk_mq_hw_ctx *, void *, unsigned int); /** * @exit_hctx: Ditto for exit/teardown. */ void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); /** * @init_request: Called for every command allocated by the block layer * to allow the driver to set up driver specific data. * * Tag greater than or equal to queue_depth is for setting up * flush request. */ int (*init_request)(struct blk_mq_tag_set *set, struct request *, unsigned int, unsigned int); /** * @exit_request: Ditto for exit/teardown. */ void (*exit_request)(struct blk_mq_tag_set *set, struct request *, unsigned int); /** * @cleanup_rq: Called before freeing one request which isn't completed * yet, and usually for freeing the driver private data. */ void (*cleanup_rq)(struct request *); /** * @busy: If set, returns whether or not this queue currently is busy. */ bool (*busy)(struct request_queue *); /** * @map_queues: This allows drivers specify their own queue mapping by * overriding the setup-time function that builds the mq_map. */ void (*map_queues)(struct blk_mq_tag_set *set); #ifdef CONFIG_BLK_DEBUG_FS /** * @show_rq: Used by the debugfs implementation to show driver-specific * information about a request. */ void (*show_rq)(struct seq_file *m, struct request *rq); #endif }; /* Keep hctx_flag_name[] in sync with the definitions below */ enum { BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1, /* * Set when this device requires underlying blk-mq device for * completing IO: */ BLK_MQ_F_STACKING = 1 << 2, BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3, BLK_MQ_F_BLOCKING = 1 << 4, /* * Alloc tags on a round-robin base instead of the first available one. */ BLK_MQ_F_TAG_RR = 1 << 5, /* * Select 'none' during queue registration in case of a single hwq * or shared hwqs instead of 'mq-deadline'. 
*/ BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 6, BLK_MQ_F_MAX = 1 << 7, }; #define BLK_MQ_MAX_DEPTH (10240) #define BLK_MQ_NO_HCTX_IDX (-1U) enum { /* Keep hctx_state_name[] in sync with the definitions below */ BLK_MQ_S_STOPPED, BLK_MQ_S_TAG_ACTIVE, BLK_MQ_S_SCHED_RESTART, /* hw queue is inactive after all its CPUs become offline */ BLK_MQ_S_INACTIVE, BLK_MQ_S_MAX }; struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, struct queue_limits *lim, void *queuedata, struct lock_class_key *lkclass); #define blk_mq_alloc_disk(set, lim, queuedata) \ ({ \ static struct lock_class_key __key; \ \ __blk_mq_alloc_disk(set, lim, queuedata, &__key); \ }) struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, struct lock_class_key *lkclass); struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, struct queue_limits *lim, void *queuedata); int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q); void blk_mq_destroy_queue(struct request_queue *); int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set); int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, const struct blk_mq_ops *ops, unsigned int queue_depth, unsigned int set_flags); void blk_mq_free_tag_set(struct blk_mq_tag_set *set); void blk_mq_free_request(struct request *rq); int blk_rq_poll(struct request *rq, struct io_comp_batch *iob, unsigned int poll_flags); bool blk_mq_queue_inflight(struct request_queue *q); enum { /* return when out of requests */ BLK_MQ_REQ_NOWAIT = (__force blk_mq_req_flags_t)(1 << 0), /* allocate from reserved pool */ BLK_MQ_REQ_RESERVED = (__force blk_mq_req_flags_t)(1 << 1), /* set RQF_PM */ BLK_MQ_REQ_PM = (__force blk_mq_req_flags_t)(1 << 2), }; struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags, unsigned int hctx_idx); /* * Tag address space map. */ struct blk_mq_tags { unsigned int nr_tags; unsigned int nr_reserved_tags; unsigned int active_queues; struct sbitmap_queue bitmap_tags; struct sbitmap_queue breserved_tags; struct request **rqs; struct request **static_rqs; struct list_head page_list; /* * used to clear request reference in rqs[] before freeing one * request pool */ spinlock_t lock; struct rcu_head rcu_head; }; static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) { if (tag < tags->nr_tags) { prefetch(tags->rqs[tag]); return tags->rqs[tag]; } return NULL; } enum { BLK_MQ_UNIQUE_TAG_BITS = 16, BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1, }; u32 blk_mq_unique_tag(struct request *rq); static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag) { return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS; } static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag) { return unique_tag & BLK_MQ_UNIQUE_TAG_MASK; } /** * blk_mq_rq_state() - read the current MQ_RQ_* state of a request * @rq: target request. */ static inline enum mq_rq_state blk_mq_rq_state(struct request *rq) { return READ_ONCE(rq->state); } static inline int blk_mq_request_started(struct request *rq) { return blk_mq_rq_state(rq) != MQ_RQ_IDLE; } static inline int blk_mq_request_completed(struct request *rq) { return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE; } /* * * Set the state to complete when completing a request from inside ->queue_rq. * This is used by drivers that want to ensure special complete actions that * need access to the request are called on failure, e.g. 
by nvme for * multipathing. */ static inline void blk_mq_set_request_complete(struct request *rq) { WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); } /* * Complete the request directly instead of deferring it to softirq or * completing it another CPU. Useful in preemptible instead of an interrupt. */ static inline void blk_mq_complete_request_direct(struct request *rq, void (*complete)(struct request *rq)) { WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); complete(rq); } void blk_mq_start_request(struct request *rq); void blk_mq_end_request(struct request *rq, blk_status_t error); void __blk_mq_end_request(struct request *rq, blk_status_t error); void blk_mq_end_request_batch(struct io_comp_batch *ib); /* * Only need start/end time stamping if we have iostat or * blk stats enabled, or using an IO scheduler. */ static inline bool blk_mq_need_time_stamp(struct request *rq) { return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_USE_SCHED)); } static inline bool blk_mq_is_reserved_rq(struct request *rq) { return rq->rq_flags & RQF_RESV; } /** * blk_mq_add_to_batch() - add a request to the completion batch * @req: The request to add to batch * @iob: The batch to add the request * @is_error: Specify true if the request failed with an error * @complete: The completaion handler for the request * * Batched completions only work when there is no I/O error and no special * ->end_io handler. * * Return: true when the request was added to the batch, otherwise false */ static inline bool blk_mq_add_to_batch(struct request *req, struct io_comp_batch *iob, bool is_error, void (*complete)(struct io_comp_batch *)) { /* * Check various conditions that exclude batch processing: * 1) No batch container * 2) Has scheduler data attached * 3) Not a passthrough request and end_io set * 4) Not a passthrough request and failed with an error */ if (!iob) return false; if (req->rq_flags & RQF_SCHED_TAGS) return false; if (!blk_rq_is_passthrough(req)) { if (req->end_io) return false; if (is_error) return false; } if (!iob->complete) iob->complete = complete; else if (iob->complete != complete) return false; iob->need_ts |= blk_mq_need_time_stamp(req); rq_list_add_tail(&iob->req_list, req); return true; } void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); void blk_mq_complete_request(struct request *rq); bool blk_mq_complete_request_remote(struct request *rq); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_stop_hw_queues(struct request_queue *q); void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set); void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set); void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set); void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv); void 
blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset); void blk_mq_freeze_queue_nomemsave(struct request_queue *q); void blk_mq_unfreeze_queue_nomemrestore(struct request_queue *q); static inline unsigned int __must_check blk_mq_freeze_queue(struct request_queue *q) { unsigned int memflags = memalloc_noio_save(); blk_mq_freeze_queue_nomemsave(q); return memflags; } static inline void blk_mq_unfreeze_queue(struct request_queue *q, unsigned int memflags) { blk_mq_unfreeze_queue_nomemrestore(q); memalloc_noio_restore(memflags); } void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); void blk_mq_unfreeze_queue_non_owner(struct request_queue *q); void blk_freeze_queue_start_non_owner(struct request_queue *q); unsigned int blk_mq_num_possible_queues(unsigned int max_queues); unsigned int blk_mq_num_online_queues(unsigned int max_queues); void blk_mq_map_queues(struct blk_mq_queue_map *qmap); void blk_mq_map_hw_queues(struct blk_mq_queue_map *qmap, struct device *dev, unsigned int offset); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); void blk_mq_quiesce_queue_nowait(struct request_queue *q); unsigned int blk_mq_rq_cpu(struct request *rq); bool __blk_should_fake_timeout(struct request_queue *q); static inline bool blk_should_fake_timeout(struct request_queue *q) { if (IS_ENABLED(CONFIG_FAIL_IO_TIMEOUT) && test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags)) return __blk_should_fake_timeout(q); return false; } /** * blk_mq_rq_from_pdu - cast a PDU to a request * @pdu: the PDU (Protocol Data Unit) to be casted * * Return: request * * Driver command data is immediately after the request. So subtract request * size to get back to the original request. */ static inline struct request *blk_mq_rq_from_pdu(void *pdu) { return pdu - sizeof(struct request); } /** * blk_mq_rq_to_pdu - cast a request to a PDU * @rq: the request to be casted * * Return: pointer to the PDU * * Driver command data is immediately after the request. So add request to get * the PDU. 
*/ static inline void *blk_mq_rq_to_pdu(struct request *rq) { return rq + 1; } static inline struct blk_mq_hw_ctx *queue_hctx(struct request_queue *q, int id) { struct blk_mq_hw_ctx *hctx; rcu_read_lock(); hctx = rcu_dereference(q->queue_hw_ctx)[id]; rcu_read_unlock(); return hctx; } #define queue_for_each_hw_ctx(q, hctx, i) \ for ((i) = 0; (i) < (q)->nr_hw_queues && \ ({ hctx = queue_hctx((q), i); 1; }); (i)++) #define hctx_for_each_ctx(hctx, ctx, i) \ for ((i) = 0; (i) < (hctx)->nr_ctx && \ ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++) static inline void blk_mq_cleanup_rq(struct request *rq) { if (rq->q->mq_ops->cleanup_rq) rq->q->mq_ops->cleanup_rq(rq); } void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, struct lock_class_key *key); static inline bool rq_is_sync(struct request *rq) { return op_is_sync(rq->cmd_flags); } void blk_rq_init(struct request_queue *q, struct request *rq); int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, int (*bio_ctr)(struct bio *, struct bio *, void *), void *data); void blk_rq_unprep_clone(struct request *rq); blk_status_t blk_insert_cloned_request(struct request *rq); struct rq_map_data { struct page **pages; unsigned long offset; unsigned short page_order; unsigned short nr_entries; bool null_mapped; bool from_user; }; int blk_rq_map_user(struct request_queue *, struct request *, struct rq_map_data *, void __user *, unsigned long, gfp_t); int blk_rq_map_user_io(struct request *, struct rq_map_data *, void __user *, unsigned long, gfp_t, bool, int, bool, int); int blk_rq_map_user_iov(struct request_queue *, struct request *, struct rq_map_data *, const struct iov_iter *, gfp_t); int blk_rq_unmap_user(struct bio *); int blk_rq_map_kern(struct request *rq, void *kbuf, unsigned int len, gfp_t gfp); int blk_rq_append_bio(struct request *rq, struct bio *bio); void blk_execute_rq_nowait(struct request *rq, bool at_head); blk_status_t blk_execute_rq(struct request *rq, bool at_head); bool blk_rq_is_poll(struct request *rq); struct req_iterator { struct bvec_iter iter; struct bio *bio; }; #define __rq_for_each_bio(_bio, rq) \ if ((rq->bio)) \ for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next) #define rq_for_each_segment(bvl, _rq, _iter) \ __rq_for_each_bio(_iter.bio, _rq) \ bio_for_each_segment(bvl, _iter.bio, _iter.iter) #define rq_for_each_bvec(bvl, _rq, _iter) \ __rq_for_each_bio(_iter.bio, _rq) \ bio_for_each_bvec(bvl, _iter.bio, _iter.iter) #define rq_iter_last(bvec, _iter) \ (_iter.bio->bi_next == NULL && \ bio_iter_last(bvec, _iter.iter)) /* * blk_rq_pos() : the current sector * blk_rq_bytes() : bytes left in the entire request * blk_rq_cur_bytes() : bytes left in the current segment * blk_rq_sectors() : sectors left in the entire request * blk_rq_cur_sectors() : sectors left in the current segment * blk_rq_stats_sectors() : sectors of the entire request used for stats */ static inline sector_t blk_rq_pos(const struct request *rq) { return rq->__sector; } static inline unsigned int blk_rq_bytes(const struct request *rq) { return rq->__data_len; } static inline int blk_rq_cur_bytes(const struct request *rq) { if (!rq->bio) return 0; if (!bio_has_data(rq->bio)) /* dataless requests such as discard */ return rq->bio->bi_iter.bi_size; return bio_iovec(rq->bio).bv_len; } static inline unsigned int blk_rq_sectors(const struct request *rq) { return blk_rq_bytes(rq) >> SECTOR_SHIFT; } static inline unsigned int blk_rq_cur_sectors(const struct request *rq) { return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT; } 
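/*
 * Usage sketch (illustrative only; example_count_segment_bytes() is a
 * hypothetical helper, not a kernel API): walking every segment of a
 * request with the req_iterator helpers defined above.
 */
static inline unsigned int example_count_segment_bytes(struct request *rq)
{
	struct req_iterator iter;
	struct bio_vec bvec;
	unsigned int bytes = 0;

	/* Visit each bio_vec of each bio in the request, in submission order. */
	rq_for_each_segment(bvec, rq, iter)
		bytes += bvec.bv_len;

	return bytes;
}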
static inline unsigned int blk_rq_stats_sectors(const struct request *rq) { return rq->stats_sectors; } /* * Some commands like WRITE SAME have a payload or data transfer size which * is different from the size of the request. Any driver that supports such * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to * calculate the data transfer size. */ static inline unsigned int blk_rq_payload_bytes(struct request *rq) { if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) return rq->special_vec.bv_len; return blk_rq_bytes(rq); } /* * Return the first full biovec in the request. The caller needs to check that * there are any bvecs before calling this helper. */ static inline struct bio_vec req_bvec(struct request *rq) { if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) return rq->special_vec; return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter); } static inline unsigned int blk_rq_count_bios(struct request *rq) { unsigned int nr_bios = 0; struct bio *bio; __rq_for_each_bio(bio, rq) nr_bios++; return nr_bios; } void blk_steal_bios(struct bio_list *list, struct request *rq); /* * Request completion related functions. * * blk_update_request() completes given number of bytes and updates * the request without completing it. */ bool blk_update_request(struct request *rq, blk_status_t error, unsigned int nr_bytes); void blk_abort_request(struct request *); /* * Number of physical segments as sent to the device. * * Normally this is the number of discontiguous data segments sent by the * submitter. But for data-less command like discard we might have no * actual data segments submitted, but the driver might have to add it's * own special payload. In that case we still return 1 here so that this * special payload will be mapped. */ static inline unsigned short blk_rq_nr_phys_segments(struct request *rq) { if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) return 1; return rq->nr_phys_segments; } /* * Number of discard segments (or ranges) the driver needs to fill in. * Each discard bio merged into a request is counted as one segment. */ static inline unsigned short blk_rq_nr_discard_segments(struct request *rq) { return max_t(unsigned short, rq->nr_phys_segments, 1); } /** * blk_rq_nr_bvec - return number of bvecs in a request * @rq: request to calculate bvecs for * * Returns the number of bvecs. */ static inline unsigned int blk_rq_nr_bvec(struct request *rq) { struct req_iterator rq_iter; struct bio_vec bv; unsigned int nr_bvec = 0; rq_for_each_bvec(bv, rq, rq_iter) nr_bvec++; return nr_bvec; } int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, struct scatterlist **last_sg); static inline int blk_rq_map_sg(struct request *rq, struct scatterlist *sglist) { struct scatterlist *last_sg = NULL; return __blk_rq_map_sg(rq, sglist, &last_sg); } void blk_dump_rq_flags(struct request *, char *); #endif /* BLK_MQ_H */ |
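/*
 * Usage sketch (illustrative only; struct example_cmd and example_queue_rq()
 * are hypothetical, not kernel APIs): the per-request PDU pattern described
 * by blk_mq_rq_to_pdu()/blk_mq_rq_from_pdu() above. A driver sets
 * tag_set->cmd_size = sizeof(struct example_cmd) so that the block layer
 * allocates the private data directly behind each request.
 */
struct example_cmd {
	u32	status;
};

static blk_status_t example_queue_rq(struct blk_mq_hw_ctx *hctx,
				     const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	struct example_cmd *cmd = blk_mq_rq_to_pdu(rq);

	cmd->status = 0;
	blk_mq_start_request(rq);
	/*
	 * A real driver would hand 'cmd' to the hardware here and complete
	 * the request from its completion handler, mapping the PDU back with
	 * blk_mq_rq_from_pdu(cmd). This sketch completes it immediately.
	 */
	blk_mq_end_request(rq, BLK_STS_OK);
	return BLK_STS_OK;
}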
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/uaccess.h>
#include <linux/mm.h>
#include <linux/bitops.h>

#include <asm/word-at-a-time.h>

/*
 * Do a strnlen, return length of string *with* final '\0'.
 * 'count' is the user-supplied count, while 'max' is the
 * address space maximum.
 *
 * Return 0 for exceptions (which includes hitting the address
 * space maximum), or 'count+1' if hitting the user-supplied
 * maximum count.
 *
 * NOTE! We can sometimes overshoot the user-supplied maximum
 * if it fits in an aligned 'long'. The caller needs to check
 * the return value against "> max".
 */
static __always_inline long do_strnlen_user(const char __user *src,
					    unsigned long count,
					    unsigned long max)
{
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
	unsigned long align, res = 0;
	unsigned long c;

	/*
	 * Do everything aligned. But that means that we
	 * need to also expand the maximum..
	 */
	align = (sizeof(unsigned long) - 1) & (unsigned long)src;
	src -= align;
	max += align;

	unsafe_get_user(c, (unsigned long __user *)src, efault);
	c |= aligned_byte_mask(align);

	for (;;) {
		unsigned long data;

		if (has_zero(c, &data, &constants)) {
			data = prep_zero_mask(c, data, &constants);
			data = create_zero_mask(data);
			return res + find_zero(data) + 1 - align;
		}
		res += sizeof(unsigned long);
		/* We already handled 'unsigned long' bytes. Did we do it all ? */
		if (unlikely(max <= sizeof(unsigned long)))
			break;
		max -= sizeof(unsigned long);
		unsafe_get_user(c, (unsigned long __user *)(src + res), efault);
	}
	res -= align;

	/*
	 * Uhhuh. We hit 'max'. But was that the user-specified maximum
	 * too? If so, return the marker for "too long".
	 */
	if (res >= count)
		return count + 1;

	/*
	 * Nope: we hit the address space limit, and we still had more
	 * characters the caller would have wanted. That's 0.
	 */
efault:
	return 0;
}

/**
 * strnlen_user: - Get the size of a user string INCLUDING final NUL.
 * @str: The string to measure.
 * @count: Maximum count (including NUL character)
 *
 * Context: User context only. This function may sleep if pagefaults are
 *          enabled.
 *
 * Get the size of a NUL-terminated string in user space.
 *
 * Returns the size of the string INCLUDING the terminating NUL.
 * If the string is too long, returns a number larger than @count. User
 * has to check the return value against "> count".
 * On exception (or invalid count), returns 0.
 *
 * NOTE! You should basically never use this function. There is
 * almost never any valid case for using the length of a user space
 * string, since the string can be changed at any time by other
 * threads. Use "strncpy_from_user()" instead to get a stable copy
 * of the string.
 */
long strnlen_user(const char __user *str, long count)
{
	unsigned long max_addr, src_addr;

	if (unlikely(count <= 0))
		return 0;

	if (can_do_masked_user_access()) {
		long retval;

		str = masked_user_read_access_begin(str);
		retval = do_strnlen_user(str, count, count);
		user_read_access_end();
		return retval;
	}

	max_addr = TASK_SIZE_MAX;
	src_addr = (unsigned long)untagged_addr(str);
	if (likely(src_addr < max_addr)) {
		unsigned long max = max_addr - src_addr;
		long retval;

		/*
		 * Truncate 'max' to the user-specified limit, so that
		 * we only have one limit we need to check in the loop
		 */
		if (max > count)
			max = count;

		if (user_read_access_begin(str, max)) {
			retval = do_strnlen_user(str, count, max);
			user_read_access_end();
			return retval;
		}
	}
	return 0;
}
EXPORT_SYMBOL(strnlen_user);
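/*
 * Usage sketch (illustrative only; example_get_len() is a hypothetical
 * caller, not a kernel API): checking strnlen_user()'s result the way the
 * comment above requires - 0 means a fault, a value larger than the passed
 * count means the string was not NUL-terminated within the limit.
 */
static long example_get_len(const char __user *ustr, long maxlen)
{
	long len = strnlen_user(ustr, maxlen);

	if (len == 0)
		return -EFAULT;
	if (len > maxlen)
		return -ENAMETOOLONG;
	return len;	/* includes the terminating NUL */
}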
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM block

#if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_BLOCK_H

#include <linux/blktrace_api.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/tracepoint.h>
#include <uapi/linux/ioprio.h>

#define RWBS_LEN	10

#define IOPRIO_CLASS_STRINGS \
	{ IOPRIO_CLASS_NONE,	"none" }, \
	{ IOPRIO_CLASS_RT,	"rt" }, \
	{ IOPRIO_CLASS_BE,	"be" }, \
	{ IOPRIO_CLASS_IDLE,	"idle" }, \
	{ IOPRIO_CLASS_INVALID,	"invalid"}

#ifdef CONFIG_BUFFER_HEAD
DECLARE_EVENT_CLASS(block_buffer,

	TP_PROTO(struct buffer_head *bh),

	TP_ARGS(bh),

	TP_STRUCT__entry (
		__field(  dev_t,	dev			)
		__field(  sector_t,	sector			)
		__field(  size_t,	size			)
	),

	TP_fast_assign(
		__entry->dev		= bh->b_bdev->bd_dev;
		__entry->sector		= bh->b_blocknr;
		__entry->size		= bh->b_size;
	),

	TP_printk("%d,%d sector=%llu size=%zu",
MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->sector, __entry->size ) ); /** * block_touch_buffer - mark a buffer accessed * @bh: buffer_head being touched * * Called from touch_buffer(). */ DEFINE_EVENT(block_buffer, block_touch_buffer, TP_PROTO(struct buffer_head *bh), TP_ARGS(bh) ); /** * block_dirty_buffer - mark a buffer dirty * @bh: buffer_head being dirtied * * Called from mark_buffer_dirty(). */ DEFINE_EVENT(block_buffer, block_dirty_buffer, TP_PROTO(struct buffer_head *bh), TP_ARGS(bh) ); #endif /* CONFIG_BUFFER_HEAD */ /** * block_rq_requeue - place block IO request back on a queue * @rq: block IO operation request * * The block operation request @rq is being placed back into queue * @q. For some reason the request was not completed and needs to be * put back in the queue. */ TRACE_EVENT(block_rq_requeue, TP_PROTO(struct request *rq), TP_ARGS(rq), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( unsigned short, ioprio ) __array( char, rwbs, RWBS_LEN ) __dynamic_array( char, cmd, 1 ) ), TP_fast_assign( __entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0; __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); __entry->ioprio = req_get_ioprio(rq); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; ), TP_printk("%d,%d %s (%s) %llu + %u %s,%u,%u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), (unsigned long long)__entry->sector, __entry->nr_sector, __print_symbolic(IOPRIO_PRIO_CLASS(__entry->ioprio), IOPRIO_CLASS_STRINGS), IOPRIO_PRIO_HINT(__entry->ioprio), IOPRIO_PRIO_LEVEL(__entry->ioprio), 0) ); DECLARE_EVENT_CLASS(block_rq_completion, TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes), TP_ARGS(rq, error, nr_bytes), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( int , error ) __field( unsigned short, ioprio ) __array( char, rwbs, RWBS_LEN ) __dynamic_array( char, cmd, 1 ) ), TP_fast_assign( __entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0; __entry->sector = blk_rq_pos(rq); __entry->nr_sector = nr_bytes >> 9; __entry->error = blk_status_to_errno(error); __entry->ioprio = req_get_ioprio(rq); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; ), TP_printk("%d,%d %s (%s) %llu + %u %s,%u,%u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), (unsigned long long)__entry->sector, __entry->nr_sector, __print_symbolic(IOPRIO_PRIO_CLASS(__entry->ioprio), IOPRIO_CLASS_STRINGS), IOPRIO_PRIO_HINT(__entry->ioprio), IOPRIO_PRIO_LEVEL(__entry->ioprio), __entry->error) ); /** * block_rq_complete - block IO operation completed by device driver * @rq: block operations request * @error: status code * @nr_bytes: number of completed bytes * * The block_rq_complete tracepoint event indicates that some portion * of operation request has been completed by the device driver. If * the @rq->bio is %NULL, then there is absolutely no additional work to * do for the request. If @rq->bio is non-NULL then there is * additional work required to complete the request. 
*/ DEFINE_EVENT(block_rq_completion, block_rq_complete, TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes), TP_ARGS(rq, error, nr_bytes) ); /** * block_rq_error - block IO operation error reported by device driver * @rq: block operations request * @error: status code * @nr_bytes: number of completed bytes * * The block_rq_error tracepoint event indicates that some portion * of operation request has failed as reported by the device driver. */ DEFINE_EVENT(block_rq_completion, block_rq_error, TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes), TP_ARGS(rq, error, nr_bytes) ); DECLARE_EVENT_CLASS(block_rq, TP_PROTO(struct request *rq), TP_ARGS(rq), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( unsigned int, bytes ) __field( unsigned short, ioprio ) __array( char, rwbs, RWBS_LEN ) __array( char, comm, TASK_COMM_LEN ) __dynamic_array( char, cmd, 1 ) ), TP_fast_assign( __entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0; __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); __entry->bytes = blk_rq_bytes(rq); __entry->ioprio = req_get_ioprio(rq); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("%d,%d %s %u (%s) %llu + %u %s,%u,%u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __entry->bytes, __get_str(cmd), (unsigned long long)__entry->sector, __entry->nr_sector, __print_symbolic(IOPRIO_PRIO_CLASS(__entry->ioprio), IOPRIO_CLASS_STRINGS), IOPRIO_PRIO_HINT(__entry->ioprio), IOPRIO_PRIO_LEVEL(__entry->ioprio), __entry->comm) ); /** * block_rq_insert - insert block operation request into queue * @rq: block IO operation request * * Called immediately before block operation request @rq is inserted * into queue @q. The fields in the operation request @rq struct can * be examined to determine which device and sectors the pending * operation would access. */ DEFINE_EVENT(block_rq, block_rq_insert, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_rq_issue - issue pending block IO request operation to device driver * @rq: block IO operation request * * Called when block operation request @rq from queue @q is sent to a * device driver for processing. */ DEFINE_EVENT(block_rq, block_rq_issue, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_rq_merge - merge request with another one in the elevator * @rq: block IO operation request * * Called when block operation request @rq from queue @q is merged to another * request queued in the elevator. */ DEFINE_EVENT(block_rq, block_rq_merge, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_io_start - insert a request for execution * @rq: block IO operation request * * Called when block operation request @rq is queued for execution */ DEFINE_EVENT(block_rq, block_io_start, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_io_done - block IO operation request completed * @rq: block IO operation request * * Called when block operation request @rq is completed */ DEFINE_EVENT(block_rq, block_io_done, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_bio_complete - completed all work on the block operation * @q: queue holding the block operation * @bio: block operation completed * * This tracepoint indicates there is no further work to do on this * block IO operation @bio. 
*/ TRACE_EVENT(block_bio_complete, TP_PROTO(struct request_queue *q, struct bio *bio), TP_ARGS(q, bio), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned, nr_sector ) __field( int, error ) __array( char, rwbs, RWBS_LEN) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->error = blk_status_to_errno(bio->bi_status); blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, __entry->error) ); DECLARE_EVENT_CLASS(block_bio, TP_PROTO(struct bio *bio), TP_ARGS(bio), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __array( char, rwbs, RWBS_LEN ) __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); blk_fill_rwbs(__entry->rwbs, bio->bi_opf); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("%d,%d %s %llu + %u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, __entry->comm) ); /** * block_bio_backmerge - merging block operation to the end of an existing operation * @bio: new block operation to merge * * Merging block request @bio to the end of an existing block request. */ DEFINE_EVENT(block_bio, block_bio_backmerge, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_bio_frontmerge - merging block operation to the beginning of an existing operation * @bio: new block operation to merge * * Merging block IO operation @bio to the beginning of an existing block request. */ DEFINE_EVENT(block_bio, block_bio_frontmerge, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_bio_queue - putting new block IO operation in queue * @bio: new block operation * * About to place the block IO operation @bio into queue @q. */ DEFINE_EVENT(block_bio, block_bio_queue, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_getrq - get a free request entry in queue for block IO operations * @bio: pending block IO operation (can be %NULL) * * A request struct has been allocated to handle the block IO operation @bio. */ DEFINE_EVENT(block_bio, block_getrq, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * blk_zone_append_update_request_bio - update bio sector after zone append * @rq: the completed request that sets the bio sector * * Update the bio's bi_sector after a zone append command has been completed. */ DEFINE_EVENT(block_rq, blk_zone_append_update_request_bio, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_plug - keep operations requests in request queue * @q: request queue to plug * * Plug the request queue @q. Do not allow block operation requests * to be sent to the device driver. Instead, accumulate requests in * the queue to improve throughput performance of the block device. 
*/ TRACE_EVENT(block_plug, TP_PROTO(struct request_queue *q), TP_ARGS(q), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("[%s]", __entry->comm) ); DECLARE_EVENT_CLASS(block_unplug, TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit), TP_ARGS(q, depth, explicit), TP_STRUCT__entry( __field( int, nr_rq ) __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( __entry->nr_rq = depth; memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("[%s] %d", __entry->comm, __entry->nr_rq) ); /** * block_unplug - release of operations requests in request queue * @q: request queue to unplug * @depth: number of requests just added to the queue * @explicit: whether this was an explicit unplug, or one from schedule() * * Unplug request queue @q because device driver is scheduled to work * on elements in the request queue. */ DEFINE_EVENT(block_unplug, block_unplug, TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit), TP_ARGS(q, depth, explicit) ); /** * block_split - split a single bio struct into two bio structs * @bio: block operation being split * @new_sector: The starting sector for the new bio * * The bio request @bio needs to be split into two bio requests. The newly * created @bio request starts at @new_sector. This split may be required due to * hardware limitations such as operation crossing device boundaries in a RAID * system. */ TRACE_EVENT(block_split, TP_PROTO(struct bio *bio, unsigned int new_sector), TP_ARGS(bio, new_sector), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( sector_t, new_sector ) __array( char, rwbs, RWBS_LEN ) __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->new_sector = new_sector; blk_fill_rwbs(__entry->rwbs, bio->bi_opf); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("%d,%d %s %llu / %llu [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, (unsigned long long)__entry->new_sector, __entry->comm) ); /** * block_bio_remap - map request for a logical device to the raw device * @bio: revised operation * @dev: original device for the operation * @from: original sector for the operation * * An operation for a logical device has been mapped to the * raw block device. */ TRACE_EVENT(block_bio_remap, TP_PROTO(struct bio *bio, dev_t dev, sector_t from), TP_ARGS(bio, dev, from), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( dev_t, old_dev ) __field( sector_t, old_sector ) __array( char, rwbs, RWBS_LEN) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->old_dev = dev; __entry->old_sector = from; blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, MAJOR(__entry->old_dev), MINOR(__entry->old_dev), (unsigned long long)__entry->old_sector) ); /** * block_rq_remap - map request for a block operation request * @rq: block IO operation request * @dev: device for the operation * @from: original sector for the operation * * The block operation request @rq in @q has been remapped. 
The block * operation request @rq holds the current information and @from hold * the original sector. */ TRACE_EVENT(block_rq_remap, TP_PROTO(struct request *rq, dev_t dev, sector_t from), TP_ARGS(rq, dev, from), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( dev_t, old_dev ) __field( sector_t, old_sector ) __field( unsigned int, nr_bios ) __array( char, rwbs, RWBS_LEN) ), TP_fast_assign( __entry->dev = disk_devt(rq->q->disk); __entry->sector = blk_rq_pos(rq); __entry->nr_sector = blk_rq_sectors(rq); __entry->old_dev = dev; __entry->old_sector = from; __entry->nr_bios = blk_rq_count_bios(rq); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, MAJOR(__entry->old_dev), MINOR(__entry->old_dev), (unsigned long long)__entry->old_sector, __entry->nr_bios) ); /** * blkdev_zone_mgmt - Execute a zone management operation on a range of zones * @bio: The block IO operation sent down to the device * @nr_sectors: The number of sectors affected by this operation * * Execute a zone management operation on a specified range of zones. This * range is encoded in %nr_sectors, which has to be a multiple of the zone * size. */ TRACE_EVENT(blkdev_zone_mgmt, TP_PROTO(struct bio *bio, sector_t nr_sectors), TP_ARGS(bio, nr_sectors), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( sector_t, nr_sectors ) __array( char, rwbs, RWBS_LEN) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sectors = bio_sectors(bio); blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sectors) ); DECLARE_EVENT_CLASS(block_zwplug, TP_PROTO(struct request_queue *q, unsigned int zno, sector_t sector, unsigned int nr_sectors), TP_ARGS(q, zno, sector, nr_sectors), TP_STRUCT__entry( __field( dev_t, dev ) __field( unsigned int, zno ) __field( sector_t, sector ) __field( unsigned int, nr_sectors ) ), TP_fast_assign( __entry->dev = disk_devt(q->disk); __entry->zno = zno; __entry->sector = sector; __entry->nr_sectors = nr_sectors; ), TP_printk("%d,%d zone %u, BIO %llu + %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->zno, (unsigned long long)__entry->sector, __entry->nr_sectors) ); DEFINE_EVENT(block_zwplug, disk_zone_wplug_add_bio, TP_PROTO(struct request_queue *q, unsigned int zno, sector_t sector, unsigned int nr_sectors), TP_ARGS(q, zno, sector, nr_sectors) ); DEFINE_EVENT(block_zwplug, blk_zone_wplug_bio, TP_PROTO(struct request_queue *q, unsigned int zno, sector_t sector, unsigned int nr_sectors), TP_ARGS(q, zno, sector, nr_sectors) ); #endif /* _TRACE_BLOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
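/*
 * Usage sketch (illustrative only; example_trace_rq() is a hypothetical
 * helper): every TRACE_EVENT/DEFINE_EVENT above expands to a trace_<name>()
 * call that the block layer invokes at the matching point, for example when
 * inserting and issuing a request.
 */
static inline void example_trace_rq(struct request *rq)
{
	trace_block_rq_insert(rq);	/* request added to the queue/elevator */
	trace_block_rq_issue(rq);	/* request handed to the device driver */
}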
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MBCACHE_H
#define _LINUX_MBCACHE_H

#include <linux/hash.h>
#include <linux/list_bl.h>
#include <linux/list.h>
#include <linux/atomic.h>
#include <linux/fs.h>

struct mb_cache;

/* Cache entry flags */
enum {
	MBE_REFERENCED_B = 0,
	MBE_REUSABLE_B
};

struct mb_cache_entry {
	/* List of entries in cache - protected by cache->c_list_lock */
	struct list_head	e_list;
	/*
	 * Hash table list - protected by hash chain bitlock. The entry is
	 * guaranteed to be hashed while e_refcnt > 0.
	 */
	struct hlist_bl_node	e_hash_list;
	/*
	 * Entry refcount. Once it reaches zero, entry is unhashed and freed.
	 * While refcount > 0, the entry is guaranteed to stay in the hash and
	 * e.g. mb_cache_entry_try_delete() will fail.
	 */
	atomic_t		e_refcnt;
	/* Key in hash - stable during lifetime of the entry */
	u32			e_key;
	unsigned long		e_flags;
	/* User provided value - stable during lifetime of the entry */
	u64			e_value;
};

struct mb_cache *mb_cache_create(int bucket_bits);
void mb_cache_destroy(struct mb_cache *cache);

int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
			  u64 value, bool reusable);
void __mb_cache_entry_free(struct mb_cache *cache,
			   struct mb_cache_entry *entry);
void mb_cache_entry_wait_unused(struct mb_cache_entry *entry);

static inline void mb_cache_entry_put(struct mb_cache *cache,
				      struct mb_cache_entry *entry)
{
	unsigned int cnt = atomic_dec_return(&entry->e_refcnt);

	if (cnt > 0) {
		if (cnt <= 2)
			wake_up_var(&entry->e_refcnt);
		return;
	}
	__mb_cache_entry_free(cache, entry);
}

struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache,
						    u32 key, u64 value);
struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
					  u64 value);
struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
						 u32 key);
struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
						struct mb_cache_entry *entry);
void mb_cache_entry_touch(struct mb_cache *cache,
			  struct mb_cache_entry *entry);

#endif	/* _LINUX_MBCACHE_H */
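/*
 * Usage sketch (illustrative only; example_mb_cache_use() and its key/value
 * pair are hypothetical): typical lifetime of an mbcache entry using the
 * declarations above.
 */
static inline int example_mb_cache_use(struct mb_cache *cache, u32 key,
					u64 value)
{
	struct mb_cache_entry *entry;
	int err;

	/* Insert a (key, value) pair into the cache. */
	err = mb_cache_entry_create(cache, GFP_NOFS, key, value, true);
	if (err)
		return err;

	/* A successful lookup returns the entry with a reference held. */
	entry = mb_cache_entry_find_first(cache, key);
	if (entry) {
		mb_cache_entry_touch(cache, entry);	/* mark recently used */
		mb_cache_entry_put(cache, entry);	/* drop the reference */
	}
	return 0;
}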
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
 */

/*
 * Basic idea behind the notification queue: An fsnotify group (like inotify)
 * sends the userspace notification about events asynchronously some time after
 * the event happened. When inotify gets an event it will need to add that
 * event to the group notify queue. Since a single event might need to be on
 * multiple groups' notification queues we can't add the event directly to each
 * queue and instead add a small "event_holder" to each queue. This event_holder
 * has a pointer back to the original event. Since the majority of events are
 * going to end up on one, and only one, notification queue we embed one
 * event_holder into each event. This means we have a single allocation instead
 * of always needing two. If the embedded event_holder is already in use by
 * another group a new event_holder (from fsnotify_event_holder_cachep) will be
 * allocated and used.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/mutex.h>
#include <linux/namei.h>
#include <linux/path.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

#include <linux/atomic.h>

#include <linux/fsnotify_backend.h>
#include "fsnotify.h"

static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0);

/**
 * fsnotify_get_cookie - return a unique cookie for use in synchronizing events.
 * Called from fsnotify_move, which is inlined into filesystem modules.
 */
u32 fsnotify_get_cookie(void)
{
	return atomic_inc_return(&fsnotify_sync_cookie);
}
EXPORT_SYMBOL_GPL(fsnotify_get_cookie);

void fsnotify_destroy_event(struct fsnotify_group *group,
			    struct fsnotify_event *event)
{
	/* Overflow events are per-group and we don't want to free them */
	if (!event || event == group->overflow_event)
		return;
	/*
	 * If the event is still queued, we have a problem... Do an unreliable
	 * lockless check first to avoid locking in the common case. The
	 * locking may be necessary for permission events which got removed
	 * from the list by a different CPU than the one freeing the event.
	 */
	if (!list_empty(&event->list)) {
		spin_lock(&group->notification_lock);
		WARN_ON(!list_empty(&event->list));
		spin_unlock(&group->notification_lock);
	}
	group->ops->free_event(group, event);
}

/*
 * Try to add an event to the notification queue.
 * The group can later pull this event off the queue to deal with.
 * The group can use the @merge hook to merge the event with a queued event.
 * The group can use the @insert hook to insert the event into hash table.
 * The function returns:
 * 0 if the event was added to a queue
 * 1 if the event was merged with some other queued event
 * 2 if the event was not queued - either the queue of events has overflown
 *   or the group is shutting down.
*/ int fsnotify_insert_event(struct fsnotify_group *group, struct fsnotify_event *event, int (*merge)(struct fsnotify_group *, struct fsnotify_event *), void (*insert)(struct fsnotify_group *, struct fsnotify_event *)) { int ret = 0; struct list_head *list = &group->notification_list; pr_debug("%s: group=%p event=%p\n", __func__, group, event); spin_lock(&group->notification_lock); if (group->shutdown) { spin_unlock(&group->notification_lock); return 2; } if (event == group->overflow_event || group->q_len >= group->max_events) { ret = 2; /* Queue overflow event only if it isn't already queued */ if (!list_empty(&group->overflow_event->list)) { spin_unlock(&group->notification_lock); return ret; } event = group->overflow_event; goto queue; } if (!list_empty(list) && merge) { ret = merge(group, event); if (ret) { spin_unlock(&group->notification_lock); return ret; } } queue: group->q_len++; list_add_tail(&event->list, list); if (insert) insert(group, event); spin_unlock(&group->notification_lock); wake_up(&group->notification_waitq); kill_fasync(&group->fsn_fa, SIGIO, POLL_IN); return ret; } void fsnotify_remove_queued_event(struct fsnotify_group *group, struct fsnotify_event *event) { assert_spin_locked(&group->notification_lock); /* * We need to init list head for the case of overflow event so that * check in fsnotify_add_event() works */ list_del_init(&event->list); group->q_len--; } /* * Return the first event on the notification list without removing it. * Returns NULL if the list is empty. */ struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group) { assert_spin_locked(&group->notification_lock); if (fsnotify_notify_queue_is_empty(group)) return NULL; return list_first_entry(&group->notification_list, struct fsnotify_event, list); } /* * Remove and return the first event from the notification list. It is the * responsibility of the caller to destroy the obtained event */ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group) { struct fsnotify_event *event = fsnotify_peek_first_event(group); if (!event) return NULL; pr_debug("%s: group=%p event=%p\n", __func__, group, event); fsnotify_remove_queued_event(group, event); return event; } /* * Called when a group is being torn down to clean up any outstanding * event notifications. */ void fsnotify_flush_notify(struct fsnotify_group *group) { struct fsnotify_event *event; spin_lock(&group->notification_lock); while (!fsnotify_notify_queue_is_empty(group)) { event = fsnotify_remove_first_event(group); spin_unlock(&group->notification_lock); fsnotify_destroy_event(group, event); spin_lock(&group->notification_lock); } spin_unlock(&group->notification_lock); } |
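/*
 * Usage sketch (illustrative only; example_drain_queue() is hypothetical):
 * how a backend might drain its notification queue with the helpers above,
 * mirroring the locking used by fsnotify_flush_notify().
 */
static void example_drain_queue(struct fsnotify_group *group)
{
	struct fsnotify_event *event;

	spin_lock(&group->notification_lock);
	while ((event = fsnotify_remove_first_event(group))) {
		spin_unlock(&group->notification_lock);
		/* ... deliver the event to userspace here ... */
		fsnotify_destroy_event(group, event);
		spin_lock(&group->notification_lock);
	}
	spin_unlock(&group->notification_lock);
}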
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/ext4/ioctl.c
 *
 * Copyright (C) 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 */

#include <linux/fs.h>
#include <linux/capability.h>
#include <linux/time.h>
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/file.h>
#include <linux/quotaops.h>
#include <linux/random.h>
#include <linux/uaccess.h>
#include <linux/delay.h>
#include <linux/iversion.h>
#include <linux/fileattr.h>
#include <linux/uuid.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include <linux/fsmap.h>
#include "fsmap.h"
#include <trace/events/ext4.h>
#include <linux/fserror.h>

typedef void ext4_update_sb_callback(struct ext4_sb_info *sbi,
				     struct ext4_super_block *es,
				     const void *arg);

/*
 * Superblock modification callback function for changing file system
 * label
 */
static void ext4_sb_setlabel(struct ext4_sb_info *sbi,
			     struct ext4_super_block *es, const void *arg)
{
	/* Sanity check, this should never happen */
	BUILD_BUG_ON(sizeof(es->s_volume_name) < EXT4_LABEL_MAX);

	memcpy(es->s_volume_name, (char *)arg, EXT4_LABEL_MAX);
}

/*
 * Superblock modification callback function for changing file system
 * UUID.
*/ static void ext4_sb_setuuid(struct ext4_sb_info *sbi, struct ext4_super_block *es, const void *arg) { memcpy(es->s_uuid, (__u8 *)arg, UUID_SIZE); } static int ext4_update_primary_sb(struct super_block *sb, handle_t *handle, ext4_update_sb_callback func, const void *arg) { int err = 0; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh = sbi->s_sbh; struct ext4_super_block *es = sbi->s_es; trace_ext4_update_sb(sb, bh->b_blocknr, 1); BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); if (err) goto out_err; lock_buffer(bh); func(sbi, es, arg); ext4_superblock_csum_set(sb); unlock_buffer(bh); if (buffer_write_io_error(bh) || !buffer_uptodate(bh)) { ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to " "superblock detected"); clear_buffer_write_io_error(bh); set_buffer_uptodate(bh); } err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) goto out_err; err = sync_dirty_buffer(bh); out_err: ext4_std_error(sb, err); return err; } /* * Update one backup superblock in the group 'grp' using the callback * function 'func' and argument 'arg'. If the handle is NULL the * modification is not journalled. * * Returns: 0 when no modification was done (no superblock in the group) * 1 when the modification was successful * <0 on error */ static int ext4_update_backup_sb(struct super_block *sb, handle_t *handle, ext4_group_t grp, ext4_update_sb_callback func, const void *arg) { int err = 0; ext4_fsblk_t sb_block; struct buffer_head *bh; unsigned long offset = 0; struct ext4_super_block *es; if (!ext4_bg_has_super(sb, grp)) return 0; /* * For the group 0 there is always 1k padding, so we have * either adjust offset, or sb_block depending on blocksize */ if (grp == 0) { sb_block = 1 * EXT4_MIN_BLOCK_SIZE; offset = do_div(sb_block, sb->s_blocksize); } else { sb_block = ext4_group_first_block_no(sb, grp); offset = 0; } trace_ext4_update_sb(sb, sb_block, handle ? 1 : 0); bh = ext4_sb_bread(sb, sb_block, 0); if (IS_ERR(bh)) return PTR_ERR(bh); if (handle) { BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); if (err) goto out_bh; } es = (struct ext4_super_block *) (bh->b_data + offset); lock_buffer(bh); if (ext4_has_feature_metadata_csum(sb) && es->s_checksum != ext4_superblock_csum(es)) { ext4_msg(sb, KERN_ERR, "Invalid checksum for backup " "superblock %llu", sb_block); unlock_buffer(bh); goto out_bh; } func(EXT4_SB(sb), es, arg); if (ext4_has_feature_metadata_csum(sb)) es->s_checksum = ext4_superblock_csum(es); set_buffer_uptodate(bh); unlock_buffer(bh); if (handle) { err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) goto out_bh; } else { BUFFER_TRACE(bh, "marking dirty"); mark_buffer_dirty(bh); } err = sync_dirty_buffer(bh); out_bh: brelse(bh); ext4_std_error(sb, err); return (err) ? err : 1; } /* * Update primary and backup superblocks using the provided function * func and argument arg. * * Only the primary superblock and at most two backup superblock * modifications are journalled; the rest is modified without journal. * This is safe because e2fsck will re-write them if there is a problem, * and we're very unlikely to ever need more than two backups. 
*/ static int ext4_update_superblocks_fn(struct super_block *sb, ext4_update_sb_callback func, const void *arg) { handle_t *handle; ext4_group_t ngroups; unsigned int three = 1; unsigned int five = 5; unsigned int seven = 7; int err = 0, ret, i; ext4_group_t grp, primary_grp; struct ext4_sb_info *sbi = EXT4_SB(sb); /* * We can't update superblocks while the online resize is running */ if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING, &sbi->s_ext4_flags)) { ext4_msg(sb, KERN_ERR, "Can't modify superblock while" "performing online resize"); return -EBUSY; } /* * We're only going to update primary superblock and two * backup superblocks in this transaction. */ handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 3); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto out; } /* Update primary superblock */ err = ext4_update_primary_sb(sb, handle, func, arg); if (err) { ext4_msg(sb, KERN_ERR, "Failed to update primary " "superblock"); goto out_journal; } primary_grp = ext4_get_group_number(sb, sbi->s_sbh->b_blocknr); ngroups = ext4_get_groups_count(sb); /* * Update backup superblocks. We have to start from group 0 * because it might not be where the primary superblock is * if the fs is mounted with -o sb=<backup_sb_block> */ i = 0; grp = 0; while (grp < ngroups) { /* Skip primary superblock */ if (grp == primary_grp) goto next_grp; ret = ext4_update_backup_sb(sb, handle, grp, func, arg); if (ret < 0) { /* Ignore bad checksum; try to update next sb */ if (ret == -EFSBADCRC) goto next_grp; err = ret; goto out_journal; } i += ret; if (handle && i > 1) { /* * We're only journalling primary superblock and * two backup superblocks; the rest is not * journalled. */ err = ext4_journal_stop(handle); if (err) goto out; handle = NULL; } next_grp: grp = ext4_list_backups(sb, &three, &five, &seven); } out_journal: if (handle) { ret = ext4_journal_stop(handle); if (ret && !err) err = ret; } out: clear_bit_unlock(EXT4_FLAGS_RESIZING, &sbi->s_ext4_flags); smp_mb__after_atomic(); return err ? err : 0; } /* * Swap memory between @a and @b for @len bytes. * * @a: pointer to first memory area * @b: pointer to second memory area * @len: number of bytes to swap * */ static void memswap(void *a, void *b, size_t len) { unsigned char *ap, *bp; ap = (unsigned char *)a; bp = (unsigned char *)b; while (len-- > 0) { swap(*ap, *bp); ap++; bp++; } } /* * Swap i_data and associated attributes between @inode1 and @inode2. * This function is used for the primary swap between inode1 and inode2 * and also to revert this primary swap in case of errors. * * Therefore you have to make sure, that calling this method twice * will revert all changes. 
* * @inode1: pointer to first inode * @inode2: pointer to second inode */ static void swap_inode_data(struct inode *inode1, struct inode *inode2) { loff_t isize; struct ext4_inode_info *ei1; struct ext4_inode_info *ei2; unsigned long tmp; struct timespec64 ts1, ts2; ei1 = EXT4_I(inode1); ei2 = EXT4_I(inode2); swap(inode1->i_version, inode2->i_version); ts1 = inode_get_atime(inode1); ts2 = inode_get_atime(inode2); inode_set_atime_to_ts(inode1, ts2); inode_set_atime_to_ts(inode2, ts1); ts1 = inode_get_mtime(inode1); ts2 = inode_get_mtime(inode2); inode_set_mtime_to_ts(inode1, ts2); inode_set_mtime_to_ts(inode2, ts1); memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP; ei1->i_flags = (ei2->i_flags & EXT4_FL_SHOULD_SWAP) | (ei1->i_flags & ~EXT4_FL_SHOULD_SWAP); ei2->i_flags = tmp | (ei2->i_flags & ~EXT4_FL_SHOULD_SWAP); swap(ei1->i_disksize, ei2->i_disksize); ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); isize = i_size_read(inode1); i_size_write(inode1, i_size_read(inode2)); i_size_write(inode2, isize); } void ext4_reset_inode_seed(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = cpu_to_le32(inode->i_generation); __u32 csum; if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } /* * Swap the information from the given @inode and the inode * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other * important fields of the inodes. * * @sb: the super block of the filesystem * @idmap: idmap of the mount the inode was found from * @inode: the inode to swap with EXT4_BOOT_LOADER_INO * */ static long swap_inode_boot_loader(struct super_block *sb, struct mnt_idmap *idmap, struct inode *inode) { handle_t *handle; int err; struct inode *inode_bl; struct ext4_inode_info *ei_bl; qsize_t size, size_bl, diff; blkcnt_t blocks; unsigned short bytes; inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL | EXT4_IGET_BAD); if (IS_ERR(inode_bl)) return PTR_ERR(inode_bl); ei_bl = EXT4_I(inode_bl); /* Protect orig inodes against a truncate and make sure, * that only 1 swap_inode_boot_loader is running. 
*/ lock_two_nondirectories(inode, inode_bl); if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) || IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) || (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) || ext4_has_inline_data(inode)) { err = -EINVAL; goto journal_err_out; } if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || !inode_owner_or_capable(idmap, inode) || !capable(CAP_SYS_ADMIN)) { err = -EPERM; goto journal_err_out; } filemap_invalidate_lock(inode->i_mapping); err = filemap_write_and_wait(inode->i_mapping); if (err) goto err_out; err = filemap_write_and_wait(inode_bl->i_mapping); if (err) goto err_out; /* Wait for all existing dio workers */ inode_dio_wait(inode); inode_dio_wait(inode_bl); truncate_inode_pages(&inode->i_data, 0); truncate_inode_pages(&inode_bl->i_data, 0); handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); if (IS_ERR(handle)) { err = -EINVAL; goto err_out; } ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT, handle); /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(inode, inode_bl); if (is_bad_inode(inode_bl) || !S_ISREG(inode_bl->i_mode)) { /* this inode has never been used as a BOOT_LOADER */ set_nlink(inode_bl, 1); i_uid_write(inode_bl, 0); i_gid_write(inode_bl, 0); inode_bl->i_flags = 0; ei_bl->i_flags = 0; inode_set_iversion(inode_bl, 1); i_size_write(inode_bl, 0); EXT4_I(inode_bl)->i_disksize = inode_bl->i_size; inode_bl->i_mode = S_IFREG; if (ext4_has_feature_extents(sb)) { ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS); ext4_ext_tree_init(handle, inode_bl); } else memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data)); } err = dquot_initialize(inode); if (err) goto err_out1; size = (qsize_t)(inode->i_blocks) * (1 << 9) + inode->i_bytes; size_bl = (qsize_t)(inode_bl->i_blocks) * (1 << 9) + inode_bl->i_bytes; diff = size - size_bl; swap_inode_data(inode, inode_bl); inode_set_ctime_current(inode); inode_set_ctime_current(inode_bl); inode_inc_iversion(inode); inode->i_generation = get_random_u32(); inode_bl->i_generation = get_random_u32(); ext4_reset_inode_seed(inode); ext4_reset_inode_seed(inode_bl); ext4_discard_preallocations(inode); err = ext4_mark_inode_dirty(handle, inode); if (err < 0) { /* No need to update quota information. */ ext4_warning(inode->i_sb, "couldn't mark inode #%llu dirty (err %d)", inode->i_ino, err); /* Revert all changes: */ swap_inode_data(inode, inode_bl); ext4_mark_inode_dirty(handle, inode); goto err_out1; } blocks = inode_bl->i_blocks; bytes = inode_bl->i_bytes; inode_bl->i_blocks = inode->i_blocks; inode_bl->i_bytes = inode->i_bytes; err = ext4_mark_inode_dirty(handle, inode_bl); if (err < 0) { /* No need to update quota information. */ ext4_warning(inode_bl->i_sb, "couldn't mark inode #%llu dirty (err %d)", inode_bl->i_ino, err); goto revert; } /* Bootloader inode should not be counted into quota information. */ if (diff > 0) dquot_free_space(inode, diff); else err = dquot_alloc_space(inode, -1 * diff); if (err < 0) { revert: /* Revert all changes: */ inode_bl->i_blocks = blocks; inode_bl->i_bytes = bytes; swap_inode_data(inode, inode_bl); ext4_mark_inode_dirty(handle, inode); ext4_mark_inode_dirty(handle, inode_bl); } err_out1: ext4_journal_stop(handle); ext4_double_up_write_data_sem(inode, inode_bl); err_out: filemap_invalidate_unlock(inode->i_mapping); journal_err_out: unlock_two_nondirectories(inode, inode_bl); iput(inode_bl); return err; } /* * If immutable is set and we are not clearing it, we're not allowed to change * anything else in the inode. 
Don't error out if we're only trying to set * immutable on an immutable file. */ static int ext4_ioctl_check_immutable(struct inode *inode, __u32 new_projid, unsigned int flags) { struct ext4_inode_info *ei = EXT4_I(inode); unsigned int oldflags = ei->i_flags; if (!(oldflags & EXT4_IMMUTABLE_FL) || !(flags & EXT4_IMMUTABLE_FL)) return 0; if ((oldflags & ~EXT4_IMMUTABLE_FL) != (flags & ~EXT4_IMMUTABLE_FL)) return -EPERM; if (ext4_has_feature_project(inode->i_sb) && __kprojid_val(ei->i_projid) != new_projid) return -EPERM; return 0; } static void ext4_dax_dontcache(struct inode *inode, unsigned int flags) { struct ext4_inode_info *ei = EXT4_I(inode); if (S_ISDIR(inode->i_mode)) return; if (test_opt2(inode->i_sb, DAX_NEVER) || test_opt(inode->i_sb, DAX_ALWAYS)) return; if ((ei->i_flags ^ flags) & EXT4_DAX_FL) d_mark_dontcache(inode); } static bool dax_compatible(struct inode *inode, unsigned int oldflags, unsigned int flags) { /* Allow the DAX flag to be changed on inline directories */ if (S_ISDIR(inode->i_mode)) { flags &= ~EXT4_INLINE_DATA_FL; oldflags &= ~EXT4_INLINE_DATA_FL; } if (flags & EXT4_DAX_FL) { if ((oldflags & EXT4_DAX_MUT_EXCL) || ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS)) { return false; } } if ((flags & EXT4_DAX_MUT_EXCL) && (oldflags & EXT4_DAX_FL)) return false; return true; } static int ext4_ioctl_setflags(struct inode *inode, unsigned int flags) { struct ext4_inode_info *ei = EXT4_I(inode); handle_t *handle = NULL; int err = -EPERM, migrate = 0; struct ext4_iloc iloc; unsigned int oldflags, mask, i; struct super_block *sb = inode->i_sb; /* Is it quota file? Do not allow user to mess with it */ if (ext4_is_quota_file(inode)) goto flags_out; oldflags = ei->i_flags; /* * The JOURNAL_DATA flag can only be changed by * the relevant capability. */ if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { if (!capable(CAP_SYS_RESOURCE)) goto flags_out; } if (!dax_compatible(inode, oldflags, flags)) { err = -EOPNOTSUPP; goto flags_out; } if ((flags ^ oldflags) & EXT4_EXTENTS_FL) migrate = 1; if ((flags ^ oldflags) & EXT4_CASEFOLD_FL) { if (!ext4_has_feature_casefold(sb)) { err = -EOPNOTSUPP; goto flags_out; } if (!S_ISDIR(inode->i_mode)) { err = -ENOTDIR; goto flags_out; } if (!ext4_empty_dir(inode)) { err = -ENOTEMPTY; goto flags_out; } } /* * Wait for all pending directio and then flush all the dirty pages * for this file. The flush marks all the pages readonly, so any * subsequent attempt to write to the file (particularly mmap pages) * will come through the filesystem and fail. 
*/ if (S_ISREG(inode->i_mode) && !IS_IMMUTABLE(inode) && (flags & EXT4_IMMUTABLE_FL)) { inode_dio_wait(inode); err = filemap_write_and_wait(inode->i_mapping); if (err) goto flags_out; } handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto flags_out; } if (IS_SYNC(inode)) ext4_handle_sync(handle); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto flags_err; ext4_dax_dontcache(inode, flags); for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { if (!(mask & EXT4_FL_USER_MODIFIABLE)) continue; /* These flags get special treatment later */ if (mask == EXT4_JOURNAL_DATA_FL || mask == EXT4_EXTENTS_FL) continue; if (mask & flags) ext4_set_inode_flag(inode, i); else ext4_clear_inode_flag(inode, i); } ext4_set_inode_flags(inode, false); inode_set_ctime_current(inode); inode_inc_iversion(inode); err = ext4_mark_iloc_dirty(handle, inode, &iloc); flags_err: ext4_journal_stop(handle); if (err) goto flags_out; if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { /* * Changes to the journaling mode can cause unsafe changes to * S_DAX if the inode is DAX */ if (IS_DAX(inode)) { err = -EBUSY; goto flags_out; } err = ext4_change_inode_journal_flag(inode, flags & EXT4_JOURNAL_DATA_FL); if (err) goto flags_out; } if (migrate) { if (flags & EXT4_EXTENTS_FL) err = ext4_ext_migrate(inode); else err = ext4_ind_migrate(inode); } flags_out: return err; } #ifdef CONFIG_QUOTA static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) { struct super_block *sb = inode->i_sb; struct ext4_inode_info *ei = EXT4_I(inode); int err, rc; handle_t *handle; kprojid_t kprojid; struct ext4_iloc iloc; struct ext4_inode *raw_inode; struct dquot *transfer_to[MAXQUOTAS] = { }; if (!ext4_has_feature_project(sb)) { if (projid != EXT4_DEF_PROJID) return -EOPNOTSUPP; else return 0; } if (EXT4_INODE_SIZE(sb) <= EXT4_GOOD_OLD_INODE_SIZE) return -EOPNOTSUPP; kprojid = make_kprojid(&init_user_ns, (projid_t)projid); if (projid_eq(kprojid, EXT4_I(inode)->i_projid)) return 0; err = -EPERM; /* Is it quota file? Do not allow user to mess with it */ if (ext4_is_quota_file(inode)) return err; err = dquot_initialize(inode); if (err) return err; err = ext4_get_inode_loc(inode, &iloc); if (err) return err; raw_inode = ext4_raw_inode(&iloc); if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) { err = ext4_expand_extra_isize(inode, EXT4_SB(sb)->s_want_extra_isize, &iloc); if (err) return err; } else { brelse(iloc.bh); } handle = ext4_journal_start(inode, EXT4_HT_QUOTA, EXT4_QUOTA_INIT_BLOCKS(sb) + EXT4_QUOTA_DEL_BLOCKS(sb) + 3); if (IS_ERR(handle)) return PTR_ERR(handle); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto out_stop; transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); if (!IS_ERR(transfer_to[PRJQUOTA])) { /* __dquot_transfer() calls back ext4_get_inode_usage() which * counts xattr inode references. 
*/ down_read(&EXT4_I(inode)->xattr_sem); err = __dquot_transfer(inode, transfer_to); up_read(&EXT4_I(inode)->xattr_sem); dqput(transfer_to[PRJQUOTA]); if (err) goto out_dirty; } EXT4_I(inode)->i_projid = kprojid; inode_set_ctime_current(inode); inode_inc_iversion(inode); out_dirty: rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) err = rc; out_stop: ext4_journal_stop(handle); return err; } #else static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) { if (projid != EXT4_DEF_PROJID) return -EOPNOTSUPP; return 0; } #endif int ext4_force_shutdown(struct super_block *sb, u32 flags) { struct ext4_sb_info *sbi = EXT4_SB(sb); int ret; if (flags > EXT4_GOING_FLAGS_NOLOGFLUSH) return -EINVAL; if (ext4_forced_shutdown(sb)) return 0; ext4_msg(sb, KERN_ALERT, "shut down requested (%d)", flags); trace_ext4_shutdown(sb, flags); switch (flags) { case EXT4_GOING_FLAGS_DEFAULT: ret = bdev_freeze(sb->s_bdev); if (ret) return ret; set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); bdev_thaw(sb->s_bdev); break; case EXT4_GOING_FLAGS_LOGFLUSH: set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) { (void) ext4_force_commit(sb); jbd2_journal_abort(sbi->s_journal, -ESHUTDOWN); } break; case EXT4_GOING_FLAGS_NOLOGFLUSH: set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) jbd2_journal_abort(sbi->s_journal, -ESHUTDOWN); break; default: return -EINVAL; } clear_opt(sb, DISCARD); fserror_report_shutdown(sb, GFP_KERNEL); return 0; } static int ext4_ioctl_shutdown(struct super_block *sb, unsigned long arg) { u32 flags; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (get_user(flags, (__u32 __user *)arg)) return -EFAULT; return ext4_force_shutdown(sb, flags); } struct getfsmap_info { struct super_block *gi_sb; struct fsmap_head __user *gi_data; unsigned int gi_idx; __u32 gi_last_flags; }; static int ext4_getfsmap_format(struct ext4_fsmap *xfm, void *priv) { struct getfsmap_info *info = priv; struct fsmap fm; trace_ext4_getfsmap_mapping(info->gi_sb, xfm); info->gi_last_flags = xfm->fmr_flags; ext4_fsmap_from_internal(info->gi_sb, &fm, xfm); if (copy_to_user(&info->gi_data->fmh_recs[info->gi_idx++], &fm, sizeof(struct fsmap))) return -EFAULT; return 0; } static int ext4_ioc_getfsmap(struct super_block *sb, struct fsmap_head __user *arg) { struct getfsmap_info info = { NULL }; struct ext4_fsmap_head xhead = {0}; struct fsmap_head head; bool aborted = false; int error; if (copy_from_user(&head, arg, sizeof(struct fsmap_head))) return -EFAULT; if (memchr_inv(head.fmh_reserved, 0, sizeof(head.fmh_reserved)) || memchr_inv(head.fmh_keys[0].fmr_reserved, 0, sizeof(head.fmh_keys[0].fmr_reserved)) || memchr_inv(head.fmh_keys[1].fmr_reserved, 0, sizeof(head.fmh_keys[1].fmr_reserved))) return -EINVAL; /* * ext4 doesn't report file extents at all, so the only valid * file offsets are the magic ones (all zeroes or all ones). 
*/ if (head.fmh_keys[0].fmr_offset || (head.fmh_keys[1].fmr_offset != 0 && head.fmh_keys[1].fmr_offset != -1ULL)) return -EINVAL; xhead.fmh_iflags = head.fmh_iflags; xhead.fmh_count = head.fmh_count; ext4_fsmap_to_internal(sb, &xhead.fmh_keys[0], &head.fmh_keys[0]); ext4_fsmap_to_internal(sb, &xhead.fmh_keys[1], &head.fmh_keys[1]); trace_ext4_getfsmap_low_key(sb, &xhead.fmh_keys[0]); trace_ext4_getfsmap_high_key(sb, &xhead.fmh_keys[1]); info.gi_sb = sb; info.gi_data = arg; error = ext4_getfsmap(sb, &xhead, ext4_getfsmap_format, &info); if (error == EXT4_QUERY_RANGE_ABORT) aborted = true; else if (error) return error; /* If we didn't abort, set the "last" flag in the last fmx */ if (!aborted && info.gi_idx) { info.gi_last_flags |= FMR_OF_LAST; if (copy_to_user(&info.gi_data->fmh_recs[info.gi_idx - 1].fmr_flags, &info.gi_last_flags, sizeof(info.gi_last_flags))) return -EFAULT; } /* copy back header */ head.fmh_entries = xhead.fmh_entries; head.fmh_oflags = xhead.fmh_oflags; if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) return -EFAULT; return 0; } static long ext4_ioctl_group_add(struct file *file, struct ext4_new_group_data *input) { struct super_block *sb = file_inode(file)->i_sb; int err, err2=0; err = ext4_resize_begin(sb); if (err) return err; if (ext4_has_feature_bigalloc(sb)) { ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc"); err = -EOPNOTSUPP; goto group_add_out; } err = mnt_want_write_file(file); if (err) goto group_add_out; err = ext4_group_add(sb, input); if (EXT4_SB(sb)->s_journal) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE, NULL); jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); } if (err == 0) err = err2; mnt_drop_write_file(file); if (!err && ext4_has_group_desc_csum(sb) && test_opt(sb, INIT_INODE_TABLE)) err = ext4_register_li_request(sb, input->group); group_add_out: err2 = ext4_resize_end(sb, false); if (err == 0) err = err2; return err; } int ext4_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct ext4_inode_info *ei = EXT4_I(inode); u32 flags = ei->i_flags & EXT4_FL_USER_VISIBLE; if (S_ISREG(inode->i_mode)) flags &= ~FS_PROJINHERIT_FL; fileattr_fill_flags(fa, flags); if (ext4_has_feature_project(inode->i_sb)) fa->fsx_projid = from_kprojid(&init_user_ns, ei->i_projid); return 0; } int ext4_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); u32 flags = fa->flags; int err = -EOPNOTSUPP; if (flags & ~EXT4_FL_USER_VISIBLE) goto out; /* * chattr(1) grabs flags via GETFLAGS, modifies the result and * passes that to SETFLAGS. So we cannot easily make SETFLAGS * more restrictive than just silently masking off visible but * not settable flags as we always did. */ flags &= EXT4_FL_USER_MODIFIABLE; if (ext4_mask_flags(inode->i_mode, flags) != flags) goto out; err = ext4_ioctl_check_immutable(inode, fa->fsx_projid, flags); if (err) goto out; err = ext4_ioctl_setflags(inode, flags); if (err) goto out; err = ext4_ioctl_setproject(inode, fa->fsx_projid); out: return err; } /* So that the fiemap access checks can't overflow on 32 bit machines. 
*/ #define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) { struct fiemap fiemap; struct fiemap __user *ufiemap = (struct fiemap __user *) arg; struct fiemap_extent_info fieinfo = { 0, }; struct inode *inode = file_inode(filp); int error; if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap))) return -EFAULT; if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) return -EINVAL; fieinfo.fi_flags = fiemap.fm_flags; fieinfo.fi_extents_max = fiemap.fm_extent_count; fieinfo.fi_extents_start = ufiemap->fm_extents; error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, fiemap.fm_length); fiemap.fm_flags = fieinfo.fi_flags; fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) error = -EFAULT; return error; } static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg) { int err = 0; __u32 flags = 0; unsigned int flush_flags = 0; struct super_block *sb = file_inode(filp)->i_sb; if (copy_from_user(&flags, (__u32 __user *)arg, sizeof(__u32))) return -EFAULT; if (!capable(CAP_SYS_ADMIN)) return -EPERM; /* check for invalid bits set */ if ((flags & ~EXT4_IOC_CHECKPOINT_FLAG_VALID) || ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && (flags & JBD2_JOURNAL_FLUSH_ZEROOUT))) return -EINVAL; if (!EXT4_SB(sb)->s_journal) return -ENODEV; if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !bdev_max_discard_sectors(EXT4_SB(sb)->s_journal->j_dev)) return -EOPNOTSUPP; if (flags & EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN) return 0; if (flags & EXT4_IOC_CHECKPOINT_FLAG_DISCARD) flush_flags |= JBD2_JOURNAL_FLUSH_DISCARD; if (flags & EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT) { flush_flags |= JBD2_JOURNAL_FLUSH_ZEROOUT; pr_info_ratelimited("warning: checkpointing journal with EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT can be slow"); } jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, flush_flags); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); return err; } static int ext4_ioctl_setlabel(struct file *filp, const char __user *user_label) { size_t len; int ret = 0; char new_label[EXT4_LABEL_MAX + 1]; struct super_block *sb = file_inode(filp)->i_sb; if (!capable(CAP_SYS_ADMIN)) return -EPERM; /* * Copy the maximum length allowed for ext4 label with one more to * find the required terminating null byte in order to test the * label length. The on disk label doesn't need to be null terminated. */ if (copy_from_user(new_label, user_label, EXT4_LABEL_MAX + 1)) return -EFAULT; len = strnlen(new_label, EXT4_LABEL_MAX + 1); if (len > EXT4_LABEL_MAX) return -EINVAL; /* * Clear the buffer after the new label */ memset(new_label + len, 0, EXT4_LABEL_MAX - len); ret = mnt_want_write_file(filp); if (ret) return ret; ret = ext4_update_superblocks_fn(sb, ext4_sb_setlabel, new_label); mnt_drop_write_file(filp); return ret; } static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label) { char label[EXT4_LABEL_MAX + 1]; /* * EXT4_LABEL_MAX must always be smaller than FSLABEL_MAX because * FSLABEL_MAX must include terminating null byte, while s_volume_name * does not have to. 
*/ BUILD_BUG_ON(EXT4_LABEL_MAX >= FSLABEL_MAX); lock_buffer(sbi->s_sbh); memtostr_pad(label, sbi->s_es->s_volume_name); unlock_buffer(sbi->s_sbh); if (copy_to_user(user_label, label, sizeof(label))) return -EFAULT; return 0; } static int ext4_ioctl_getuuid(struct ext4_sb_info *sbi, struct fsuuid __user *ufsuuid) { struct fsuuid fsuuid; __u8 uuid[UUID_SIZE]; if (copy_from_user(&fsuuid, ufsuuid, sizeof(fsuuid))) return -EFAULT; if (fsuuid.fsu_len == 0) { fsuuid.fsu_len = UUID_SIZE; if (copy_to_user(&ufsuuid->fsu_len, &fsuuid.fsu_len, sizeof(fsuuid.fsu_len))) return -EFAULT; return 0; } if (fsuuid.fsu_len < UUID_SIZE || fsuuid.fsu_flags != 0) return -EINVAL; lock_buffer(sbi->s_sbh); memcpy(uuid, sbi->s_es->s_uuid, UUID_SIZE); unlock_buffer(sbi->s_sbh); fsuuid.fsu_len = UUID_SIZE; if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid)) || copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE)) return -EFAULT; return 0; } static int ext4_ioctl_setuuid(struct file *filp, const struct fsuuid __user *ufsuuid) { int ret = 0; struct super_block *sb = file_inode(filp)->i_sb; struct fsuuid fsuuid; __u8 uuid[UUID_SIZE]; if (!capable(CAP_SYS_ADMIN)) return -EPERM; /* * If any checksums (group descriptors or metadata) are being used * then the checksum seed feature is required to change the UUID. */ if (((ext4_has_feature_gdt_csum(sb) || ext4_has_feature_metadata_csum(sb)) && !ext4_has_feature_csum_seed(sb)) || ext4_has_feature_stable_inodes(sb)) return -EOPNOTSUPP; if (copy_from_user(&fsuuid, ufsuuid, sizeof(fsuuid))) return -EFAULT; if (fsuuid.fsu_len != UUID_SIZE || fsuuid.fsu_flags != 0) return -EINVAL; if (copy_from_user(uuid, &ufsuuid->fsu_uuid[0], UUID_SIZE)) return -EFAULT; ret = mnt_want_write_file(filp); if (ret) return ret; ret = ext4_update_superblocks_fn(sb, ext4_sb_setuuid, &uuid); mnt_drop_write_file(filp); return ret; } #define TUNE_OPS_SUPPORTED (EXT4_TUNE_FL_ERRORS_BEHAVIOR | \ EXT4_TUNE_FL_MNT_COUNT | EXT4_TUNE_FL_MAX_MNT_COUNT | \ EXT4_TUNE_FL_CHECKINTRVAL | EXT4_TUNE_FL_LAST_CHECK_TIME | \ EXT4_TUNE_FL_RESERVED_BLOCKS | EXT4_TUNE_FL_RESERVED_UID | \ EXT4_TUNE_FL_RESERVED_GID | EXT4_TUNE_FL_DEFAULT_MNT_OPTS | \ EXT4_TUNE_FL_DEF_HASH_ALG | EXT4_TUNE_FL_RAID_STRIDE | \ EXT4_TUNE_FL_RAID_STRIPE_WIDTH | EXT4_TUNE_FL_MOUNT_OPTS | \ EXT4_TUNE_FL_FEATURES | EXT4_TUNE_FL_EDIT_FEATURES | \ EXT4_TUNE_FL_FORCE_FSCK | EXT4_TUNE_FL_ENCODING | \ EXT4_TUNE_FL_ENCODING_FLAGS) #define EXT4_TUNE_SET_COMPAT_SUPP \ (EXT4_FEATURE_COMPAT_DIR_INDEX | \ EXT4_FEATURE_COMPAT_STABLE_INODES) #define EXT4_TUNE_SET_INCOMPAT_SUPP \ (EXT4_FEATURE_INCOMPAT_EXTENTS | \ EXT4_FEATURE_INCOMPAT_EA_INODE | \ EXT4_FEATURE_INCOMPAT_ENCRYPT | \ EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ EXT4_FEATURE_INCOMPAT_LARGEDIR | \ EXT4_FEATURE_INCOMPAT_CASEFOLD) #define EXT4_TUNE_SET_RO_COMPAT_SUPP \ (EXT4_FEATURE_RO_COMPAT_LARGE_FILE | \ EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ EXT4_FEATURE_RO_COMPAT_PROJECT | \ EXT4_FEATURE_RO_COMPAT_VERITY) #define EXT4_TUNE_CLEAR_COMPAT_SUPP (0) #define EXT4_TUNE_CLEAR_INCOMPAT_SUPP (0) #define EXT4_TUNE_CLEAR_RO_COMPAT_SUPP (0) #define SB_ENC_SUPP_MASK (SB_ENC_STRICT_MODE_FL | \ SB_ENC_NO_COMPAT_FALLBACK_FL) static int ext4_ioctl_get_tune_sb(struct ext4_sb_info *sbi, struct ext4_tune_sb_params __user *params) { struct ext4_tune_sb_params ret; struct ext4_super_block *es = sbi->s_es; memset(&ret, 0, sizeof(ret)); ret.set_flags = TUNE_OPS_SUPPORTED; ret.errors_behavior = le16_to_cpu(es->s_errors); ret.mnt_count = le16_to_cpu(es->s_mnt_count); ret.max_mnt_count = 
le16_to_cpu(es->s_max_mnt_count); ret.checkinterval = le32_to_cpu(es->s_checkinterval); ret.last_check_time = le32_to_cpu(es->s_lastcheck); ret.reserved_blocks = ext4_r_blocks_count(es); ret.blocks_count = ext4_blocks_count(es); ret.reserved_uid = ext4_get_resuid(es); ret.reserved_gid = ext4_get_resgid(es); ret.default_mnt_opts = le32_to_cpu(es->s_default_mount_opts); ret.def_hash_alg = es->s_def_hash_version; ret.raid_stride = le16_to_cpu(es->s_raid_stride); ret.raid_stripe_width = le32_to_cpu(es->s_raid_stripe_width); ret.encoding = le16_to_cpu(es->s_encoding); ret.encoding_flags = le16_to_cpu(es->s_encoding_flags); strscpy_pad(ret.mount_opts, es->s_mount_opts); ret.feature_compat = le32_to_cpu(es->s_feature_compat); ret.feature_incompat = le32_to_cpu(es->s_feature_incompat); ret.feature_ro_compat = le32_to_cpu(es->s_feature_ro_compat); ret.set_feature_compat_mask = EXT4_TUNE_SET_COMPAT_SUPP; ret.set_feature_incompat_mask = EXT4_TUNE_SET_INCOMPAT_SUPP; ret.set_feature_ro_compat_mask = EXT4_TUNE_SET_RO_COMPAT_SUPP; ret.clear_feature_compat_mask = EXT4_TUNE_CLEAR_COMPAT_SUPP; ret.clear_feature_incompat_mask = EXT4_TUNE_CLEAR_INCOMPAT_SUPP; ret.clear_feature_ro_compat_mask = EXT4_TUNE_CLEAR_RO_COMPAT_SUPP; if (copy_to_user(params, &ret, sizeof(ret))) return -EFAULT; return 0; } static void ext4_sb_setparams(struct ext4_sb_info *sbi, struct ext4_super_block *es, const void *arg) { const struct ext4_tune_sb_params *params = arg; if (params->set_flags & EXT4_TUNE_FL_ERRORS_BEHAVIOR) es->s_errors = cpu_to_le16(params->errors_behavior); if (params->set_flags & EXT4_TUNE_FL_MNT_COUNT) es->s_mnt_count = cpu_to_le16(params->mnt_count); if (params->set_flags & EXT4_TUNE_FL_MAX_MNT_COUNT) es->s_max_mnt_count = cpu_to_le16(params->max_mnt_count); if (params->set_flags & EXT4_TUNE_FL_CHECKINTRVAL) es->s_checkinterval = cpu_to_le32(params->checkinterval); if (params->set_flags & EXT4_TUNE_FL_LAST_CHECK_TIME) es->s_lastcheck = cpu_to_le32(params->last_check_time); if (params->set_flags & EXT4_TUNE_FL_RESERVED_BLOCKS) { ext4_fsblk_t blk = params->reserved_blocks; es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); } if (params->set_flags & EXT4_TUNE_FL_RESERVED_UID) { int uid = params->reserved_uid; es->s_def_resuid = cpu_to_le16(uid & 0xFFFF); es->s_def_resuid_hi = cpu_to_le16(uid >> 16); } if (params->set_flags & EXT4_TUNE_FL_RESERVED_GID) { int gid = params->reserved_gid; es->s_def_resgid = cpu_to_le16(gid & 0xFFFF); es->s_def_resgid_hi = cpu_to_le16(gid >> 16); } if (params->set_flags & EXT4_TUNE_FL_DEFAULT_MNT_OPTS) es->s_default_mount_opts = cpu_to_le32(params->default_mnt_opts); if (params->set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) es->s_def_hash_version = params->def_hash_alg; if (params->set_flags & EXT4_TUNE_FL_RAID_STRIDE) es->s_raid_stride = cpu_to_le16(params->raid_stride); if (params->set_flags & EXT4_TUNE_FL_RAID_STRIPE_WIDTH) es->s_raid_stripe_width = cpu_to_le32(params->raid_stripe_width); if (params->set_flags & EXT4_TUNE_FL_ENCODING) es->s_encoding = cpu_to_le16(params->encoding); if (params->set_flags & EXT4_TUNE_FL_ENCODING_FLAGS) es->s_encoding_flags = cpu_to_le16(params->encoding_flags); strscpy_pad(es->s_mount_opts, params->mount_opts); if (params->set_flags & EXT4_TUNE_FL_EDIT_FEATURES) { es->s_feature_compat |= cpu_to_le32(params->set_feature_compat_mask); es->s_feature_incompat |= cpu_to_le32(params->set_feature_incompat_mask); es->s_feature_ro_compat |= cpu_to_le32(params->set_feature_ro_compat_mask); es->s_feature_compat &= 
~cpu_to_le32(params->clear_feature_compat_mask); es->s_feature_incompat &= ~cpu_to_le32(params->clear_feature_incompat_mask); es->s_feature_ro_compat &= ~cpu_to_le32(params->clear_feature_ro_compat_mask); if (params->set_feature_compat_mask & EXT4_FEATURE_COMPAT_DIR_INDEX) es->s_def_hash_version = sbi->s_def_hash_version; if (params->set_feature_incompat_mask & EXT4_FEATURE_INCOMPAT_CSUM_SEED) es->s_checksum_seed = cpu_to_le32(sbi->s_csum_seed); } if (params->set_flags & EXT4_TUNE_FL_FORCE_FSCK) es->s_state |= cpu_to_le16(EXT4_ERROR_FS); } static int ext4_ioctl_set_tune_sb(struct file *filp, struct ext4_tune_sb_params __user *in) { struct ext4_tune_sb_params params; struct super_block *sb = file_inode(filp)->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int enabling_casefold = 0; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (copy_from_user(¶ms, in, sizeof(params))) return -EFAULT; if (strnlen(params.mount_opts, sizeof(params.mount_opts)) == sizeof(params.mount_opts)) return -E2BIG; if ((params.set_flags & ~TUNE_OPS_SUPPORTED) != 0) return -EOPNOTSUPP; if ((params.set_flags & EXT4_TUNE_FL_ERRORS_BEHAVIOR) && (params.errors_behavior > EXT4_ERRORS_PANIC)) return -EINVAL; if ((params.set_flags & EXT4_TUNE_FL_RESERVED_BLOCKS) && (params.reserved_blocks > ext4_blocks_count(sbi->s_es) / 2)) return -EINVAL; if ((params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) && ((params.def_hash_alg > DX_HASH_LAST) || (params.def_hash_alg == DX_HASH_SIPHASH))) return -EINVAL; if ((params.set_flags & EXT4_TUNE_FL_FEATURES) && (params.set_flags & EXT4_TUNE_FL_EDIT_FEATURES)) return -EINVAL; if (params.set_flags & EXT4_TUNE_FL_FEATURES) { params.set_feature_compat_mask = params.feature_compat & ~le32_to_cpu(es->s_feature_compat); params.set_feature_incompat_mask = params.feature_incompat & ~le32_to_cpu(es->s_feature_incompat); params.set_feature_ro_compat_mask = params.feature_ro_compat & ~le32_to_cpu(es->s_feature_ro_compat); params.clear_feature_compat_mask = ~params.feature_compat & le32_to_cpu(es->s_feature_compat); params.clear_feature_incompat_mask = ~params.feature_incompat & le32_to_cpu(es->s_feature_incompat); params.clear_feature_ro_compat_mask = ~params.feature_ro_compat & le32_to_cpu(es->s_feature_ro_compat); params.set_flags |= EXT4_TUNE_FL_EDIT_FEATURES; } if (params.set_flags & EXT4_TUNE_FL_EDIT_FEATURES) { if ((params.set_feature_compat_mask & ~EXT4_TUNE_SET_COMPAT_SUPP) || (params.set_feature_incompat_mask & ~EXT4_TUNE_SET_INCOMPAT_SUPP) || (params.set_feature_ro_compat_mask & ~EXT4_TUNE_SET_RO_COMPAT_SUPP) || (params.clear_feature_compat_mask & ~EXT4_TUNE_CLEAR_COMPAT_SUPP) || (params.clear_feature_incompat_mask & ~EXT4_TUNE_CLEAR_INCOMPAT_SUPP) || (params.clear_feature_ro_compat_mask & ~EXT4_TUNE_CLEAR_RO_COMPAT_SUPP)) return -EOPNOTSUPP; /* * Filter out the features that are already set from * the set_mask. 
*/ params.set_feature_compat_mask &= ~le32_to_cpu(es->s_feature_compat); params.set_feature_incompat_mask &= ~le32_to_cpu(es->s_feature_incompat); params.set_feature_ro_compat_mask &= ~le32_to_cpu(es->s_feature_ro_compat); if ((params.set_feature_incompat_mask & EXT4_FEATURE_INCOMPAT_CASEFOLD)) { enabling_casefold = 1; if (!(params.set_flags & EXT4_TUNE_FL_ENCODING)) { params.encoding = EXT4_ENC_UTF8_12_1; params.set_flags |= EXT4_TUNE_FL_ENCODING; } if (!(params.set_flags & EXT4_TUNE_FL_ENCODING_FLAGS)) { params.encoding_flags = 0; params.set_flags |= EXT4_TUNE_FL_ENCODING_FLAGS; } } if ((params.set_feature_compat_mask & EXT4_FEATURE_COMPAT_DIR_INDEX)) { uuid_t uu; memcpy(&uu, sbi->s_hash_seed, UUID_SIZE); if (uuid_is_null(&uu)) generate_random_uuid((char *) &sbi->s_hash_seed); if (params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) sbi->s_def_hash_version = params.def_hash_alg; else if (sbi->s_def_hash_version == 0) sbi->s_def_hash_version = DX_HASH_HALF_MD4; if (!(es->s_flags & cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH)) && !(es->s_flags & cpu_to_le32(EXT2_FLAGS_SIGNED_HASH))) { #ifdef __CHAR_UNSIGNED__ sbi->s_hash_unsigned = 3; #else sbi->s_hash_unsigned = 0; #endif } } } if (params.set_flags & EXT4_TUNE_FL_ENCODING) { if (!enabling_casefold) return -EINVAL; if (params.encoding == 0) params.encoding = EXT4_ENC_UTF8_12_1; else if (params.encoding != EXT4_ENC_UTF8_12_1) return -EINVAL; } if (params.set_flags & EXT4_TUNE_FL_ENCODING_FLAGS) { if (!enabling_casefold) return -EINVAL; if (params.encoding_flags & ~SB_ENC_SUPP_MASK) return -EINVAL; } ret = mnt_want_write_file(filp); if (ret) return ret; ret = ext4_update_superblocks_fn(sb, ext4_sb_setparams, ¶ms); mnt_drop_write_file(filp); if (params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) sbi->s_def_hash_version = params.def_hash_alg; return ret; } static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; struct mnt_idmap *idmap = file_mnt_idmap(filp); ext4_debug("cmd = %u, arg = %lu\n", cmd, arg); switch (cmd) { case FS_IOC_GETFSMAP: return ext4_ioc_getfsmap(sb, (void __user *)arg); case EXT4_IOC_GETVERSION: case EXT4_IOC_GETVERSION_OLD: return put_user(inode->i_generation, (int __user *) arg); case EXT4_IOC_SETVERSION: case EXT4_IOC_SETVERSION_OLD: { handle_t *handle; struct ext4_iloc iloc; __u32 generation; int err; if (!inode_owner_or_capable(idmap, inode)) return -EPERM; if (ext4_has_feature_metadata_csum(inode->i_sb)) { ext4_warning(sb, "Setting inode version is not " "supported with metadata_csum enabled."); return -ENOTTY; } err = mnt_want_write_file(filp); if (err) return err; if (get_user(generation, (int __user *) arg)) { err = -EFAULT; goto setversion_out; } inode_lock(inode); handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto unlock_out; } err = ext4_reserve_inode_write(handle, inode, &iloc); if (err == 0) { inode_set_ctime_current(inode); inode_inc_iversion(inode); inode->i_generation = generation; err = ext4_mark_iloc_dirty(handle, inode, &iloc); } ext4_journal_stop(handle); unlock_out: inode_unlock(inode); setversion_out: mnt_drop_write_file(filp); return err; } case EXT4_IOC_GROUP_EXTEND: { ext4_fsblk_t n_blocks_count; int err, err2=0; err = ext4_resize_begin(sb); if (err) return err; if (get_user(n_blocks_count, (__u32 __user *)arg)) { err = -EFAULT; goto group_extend_out; } if (ext4_has_feature_bigalloc(sb)) { ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc"); 
err = -EOPNOTSUPP; goto group_extend_out; } err = mnt_want_write_file(filp); if (err) goto group_extend_out; err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); if (EXT4_SB(sb)->s_journal) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE, NULL); jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); } if (err == 0) err = err2; mnt_drop_write_file(filp); group_extend_out: err2 = ext4_resize_end(sb, false); if (err == 0) err = err2; return err; } case EXT4_IOC_MOVE_EXT: { struct move_extent me; int err; if (!(filp->f_mode & FMODE_READ) || !(filp->f_mode & FMODE_WRITE)) return -EBADF; if (copy_from_user(&me, (struct move_extent __user *)arg, sizeof(me))) return -EFAULT; me.moved_len = 0; CLASS(fd, donor)(me.donor_fd); if (fd_empty(donor)) return -EBADF; if (!(fd_file(donor)->f_mode & FMODE_WRITE)) return -EBADF; err = mnt_want_write_file(filp); if (err) return err; err = ext4_move_extents(filp, fd_file(donor), me.orig_start, me.donor_start, me.len, &me.moved_len); mnt_drop_write_file(filp); if (copy_to_user((struct move_extent __user *)arg, &me, sizeof(me))) err = -EFAULT; return err; } case EXT4_IOC_GROUP_ADD: { struct ext4_new_group_data input; if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, sizeof(input))) return -EFAULT; return ext4_ioctl_group_add(filp, &input); } case EXT4_IOC_MIGRATE: { int err; if (!inode_owner_or_capable(idmap, inode)) return -EACCES; err = mnt_want_write_file(filp); if (err) return err; /* * inode_mutex prevent write and truncate on the file. * Read still goes through. We take i_data_sem in * ext4_ext_swap_inode_data before we switch the * inode format to prevent read. */ inode_lock((inode)); err = ext4_ext_migrate(inode); inode_unlock((inode)); mnt_drop_write_file(filp); return err; } case EXT4_IOC_ALLOC_DA_BLKS: { int err; if (!inode_owner_or_capable(idmap, inode)) return -EACCES; err = mnt_want_write_file(filp); if (err) return err; err = ext4_alloc_da_blocks(inode); mnt_drop_write_file(filp); return err; } case EXT4_IOC_SWAP_BOOT: { int err; if (!(filp->f_mode & FMODE_WRITE)) return -EBADF; err = mnt_want_write_file(filp); if (err) return err; err = swap_inode_boot_loader(sb, idmap, inode); mnt_drop_write_file(filp); return err; } case EXT4_IOC_RESIZE_FS: { ext4_fsblk_t n_blocks_count; int err = 0, err2 = 0; ext4_group_t o_group = EXT4_SB(sb)->s_groups_count; if (copy_from_user(&n_blocks_count, (__u64 __user *)arg, sizeof(__u64))) { return -EFAULT; } err = ext4_resize_begin(sb); if (err) return err; err = mnt_want_write_file(filp); if (err) goto resizefs_out; err = ext4_resize_fs(sb, n_blocks_count); if (EXT4_SB(sb)->s_journal) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE, NULL); jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); } if (err == 0) err = err2; mnt_drop_write_file(filp); if (!err && (o_group < EXT4_SB(sb)->s_groups_count) && ext4_has_group_desc_csum(sb) && test_opt(sb, INIT_INODE_TABLE)) err = ext4_register_li_request(sb, o_group); resizefs_out: err2 = ext4_resize_end(sb, true); if (err == 0) err = err2; return err; } case FITRIM: { struct fstrim_range range; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!bdev_max_discard_sectors(sb->s_bdev)) return -EOPNOTSUPP; /* * We haven't replayed the journal, so we cannot use our * block-bitmap-guided storage zapping commands. 
*/ if (test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) return -EROFS; if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; ret = ext4_trim_fs(sb, &range); if (ret < 0) return ret; if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) return -EFAULT; return 0; } case EXT4_IOC_PRECACHE_EXTENTS: { int ret; inode_lock_shared(inode); ret = ext4_ext_precache(inode); inode_unlock_shared(inode); return ret; } case FS_IOC_SET_ENCRYPTION_POLICY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); case FS_IOC_GET_ENCRYPTION_PWSALT: return ext4_ioctl_get_encryption_pwsalt(filp, (void __user *)arg); case FS_IOC_GET_ENCRYPTION_POLICY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_get_policy(filp, (void __user *)arg); case FS_IOC_GET_ENCRYPTION_POLICY_EX: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_get_policy_ex(filp, (void __user *)arg); case FS_IOC_ADD_ENCRYPTION_KEY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_add_key(filp, (void __user *)arg); case FS_IOC_REMOVE_ENCRYPTION_KEY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_remove_key(filp, (void __user *)arg); case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_remove_key_all_users(filp, (void __user *)arg); case FS_IOC_GET_ENCRYPTION_KEY_STATUS: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_get_key_status(filp, (void __user *)arg); case FS_IOC_GET_ENCRYPTION_NONCE: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_get_nonce(filp, (void __user *)arg); case EXT4_IOC_CLEAR_ES_CACHE: { if (!inode_owner_or_capable(idmap, inode)) return -EACCES; ext4_clear_inode_es(inode); return 0; } case EXT4_IOC_GETSTATE: { __u32 state = 0; if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED)) state |= EXT4_STATE_FLAG_EXT_PRECACHED; if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) state |= EXT4_STATE_FLAG_NEW; if (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) state |= EXT4_STATE_FLAG_NEWENTRY; if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) state |= EXT4_STATE_FLAG_DA_ALLOC_CLOSE; return put_user(state, (__u32 __user *) arg); } case EXT4_IOC_GET_ES_CACHE: return ext4_ioctl_get_es_cache(filp, arg); case EXT4_IOC_SHUTDOWN: return ext4_ioctl_shutdown(sb, arg); case FS_IOC_ENABLE_VERITY: if (!ext4_has_feature_verity(sb)) return -EOPNOTSUPP; return fsverity_ioctl_enable(filp, (const void __user *)arg); case FS_IOC_MEASURE_VERITY: if (!ext4_has_feature_verity(sb)) return -EOPNOTSUPP; return fsverity_ioctl_measure(filp, (void __user *)arg); case FS_IOC_READ_VERITY_METADATA: if (!ext4_has_feature_verity(sb)) return -EOPNOTSUPP; return fsverity_ioctl_read_metadata(filp, (const void __user *)arg); case EXT4_IOC_CHECKPOINT: return ext4_ioctl_checkpoint(filp, arg); case FS_IOC_GETFSLABEL: return ext4_ioctl_getlabel(EXT4_SB(sb), (void __user *)arg); case FS_IOC_SETFSLABEL: return ext4_ioctl_setlabel(filp, (const void __user *)arg); case EXT4_IOC_GETFSUUID: return ext4_ioctl_getuuid(EXT4_SB(sb), (void __user *)arg); case EXT4_IOC_SETFSUUID: return ext4_ioctl_setuuid(filp, (const void __user *)arg); case EXT4_IOC_GET_TUNE_SB_PARAM: return ext4_ioctl_get_tune_sb(EXT4_SB(sb), (void __user *)arg); case EXT4_IOC_SET_TUNE_SB_PARAM: return ext4_ioctl_set_tune_sb(filp, (void __user 
*)arg); default: return -ENOTTY; } } long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { return __ext4_ioctl(filp, cmd, arg); } #ifdef CONFIG_COMPAT long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { /* These are just misnamed, they actually get/put from/to user an int */ switch (cmd) { case EXT4_IOC32_GETVERSION: cmd = EXT4_IOC_GETVERSION; break; case EXT4_IOC32_SETVERSION: cmd = EXT4_IOC_SETVERSION; break; case EXT4_IOC32_GROUP_EXTEND: cmd = EXT4_IOC_GROUP_EXTEND; break; case EXT4_IOC32_GETVERSION_OLD: cmd = EXT4_IOC_GETVERSION_OLD; break; case EXT4_IOC32_SETVERSION_OLD: cmd = EXT4_IOC_SETVERSION_OLD; break; case EXT4_IOC32_GETRSVSZ: cmd = EXT4_IOC_GETRSVSZ; break; case EXT4_IOC32_SETRSVSZ: cmd = EXT4_IOC_SETRSVSZ; break; case EXT4_IOC32_GROUP_ADD: { struct compat_ext4_new_group_input __user *uinput; struct ext4_new_group_data input; int err; uinput = compat_ptr(arg); err = get_user(input.group, &uinput->group); err |= get_user(input.block_bitmap, &uinput->block_bitmap); err |= get_user(input.inode_bitmap, &uinput->inode_bitmap); err |= get_user(input.inode_table, &uinput->inode_table); err |= get_user(input.blocks_count, &uinput->blocks_count); err |= get_user(input.reserved_blocks, &uinput->reserved_blocks); if (err) return -EFAULT; return ext4_ioctl_group_add(file, &input); } case EXT4_IOC_MOVE_EXT: case EXT4_IOC_RESIZE_FS: case FITRIM: case EXT4_IOC_PRECACHE_EXTENTS: case FS_IOC_SET_ENCRYPTION_POLICY: case FS_IOC_GET_ENCRYPTION_PWSALT: case FS_IOC_GET_ENCRYPTION_POLICY: case FS_IOC_GET_ENCRYPTION_POLICY_EX: case FS_IOC_ADD_ENCRYPTION_KEY: case FS_IOC_REMOVE_ENCRYPTION_KEY: case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: case FS_IOC_GET_ENCRYPTION_KEY_STATUS: case FS_IOC_GET_ENCRYPTION_NONCE: case EXT4_IOC_SHUTDOWN: case FS_IOC_GETFSMAP: case FS_IOC_ENABLE_VERITY: case FS_IOC_MEASURE_VERITY: case FS_IOC_READ_VERITY_METADATA: case EXT4_IOC_CLEAR_ES_CACHE: case EXT4_IOC_GETSTATE: case EXT4_IOC_GET_ES_CACHE: case EXT4_IOC_CHECKPOINT: case FS_IOC_GETFSLABEL: case FS_IOC_SETFSLABEL: case EXT4_IOC_GETFSUUID: case EXT4_IOC_SETFSUUID: break; default: return -ENOIOCTLCMD; } return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); } #endif static void set_overhead(struct ext4_sb_info *sbi, struct ext4_super_block *es, const void *arg) { es->s_overhead_clusters = cpu_to_le32(*((unsigned long *) arg)); } int ext4_update_overhead(struct super_block *sb, bool force) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (ext4_emergency_state(sb) || sb_rdonly(sb)) return 0; if (!force && (sbi->s_overhead == 0 || sbi->s_overhead == le32_to_cpu(sbi->s_es->s_overhead_clusters))) return 0; return ext4_update_superblocks_fn(sb, set_overhead, &sbi->s_overhead); } |
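/*
 * Illustrative userspace counterpart to the FITRIM branch handled above:
 * the kernel copies a struct fstrim_range in, runs ext4_trim_fs(), and
 * copies the clamped range back out. This is only a hedged sketch (the
 * mountpoint argument and the "trim everything" range are assumptions);
 * it needs CAP_SYS_ADMIN and a block device that supports discard.
 */
#include <fcntl.h>
#include <limits.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	struct fstrim_range range;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <ext4-mountpoint>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&range, 0, sizeof(range));
	range.len = ULLONG_MAX;		/* ask to trim the whole filesystem */

	if (ioctl(fd, FITRIM, &range) < 0) {
		perror("ioctl(FITRIM)");
		close(fd);
		return 1;
	}

	/* On return, range.len holds the number of bytes actually trimmed. */
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	close(fd);
	return 0;
}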
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Landlock - Credential hooks
 *
 * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net>
 * Copyright © 2018-2020 ANSSI
 * Copyright © 2024-2025 Microsoft Corporation
 */

#include <linux/binfmts.h>
#include <linux/cred.h>
#include <linux/lsm_hooks.h>

#include "common.h"
#include "cred.h"
#include "ruleset.h"
#include "setup.h"

static void hook_cred_transfer(struct cred *const new,
			       const struct cred *const old)
{
	const struct landlock_cred_security *const old_llcred =
		landlock_cred(old);

	landlock_get_ruleset(old_llcred->domain);
	*landlock_cred(new) = *old_llcred;
}

static int hook_cred_prepare(struct cred *const new,
			     const struct cred *const old, const gfp_t gfp)
{
	hook_cred_transfer(new, old);
	return 0;
}

static void hook_cred_free(struct cred *const cred)
{
	struct landlock_ruleset *const dom = landlock_cred(cred)->domain;

	if (dom)
		landlock_put_ruleset_deferred(dom);
}

#ifdef CONFIG_AUDIT

static int hook_bprm_creds_for_exec(struct linux_binprm *const bprm)
{
	/* Resets for each execution. */
	landlock_cred(bprm->cred)->domain_exec = 0;
	return 0;
}

#endif /* CONFIG_AUDIT */

static struct security_hook_list landlock_hooks[] __ro_after_init = {
	LSM_HOOK_INIT(cred_prepare, hook_cred_prepare),
	LSM_HOOK_INIT(cred_transfer, hook_cred_transfer),
	LSM_HOOK_INIT(cred_free, hook_cred_free),

#ifdef CONFIG_AUDIT
	LSM_HOOK_INIT(bprm_creds_for_exec, hook_bprm_creds_for_exec),
#endif /* CONFIG_AUDIT */
};

__init void landlock_add_cred_hooks(void)
{
	security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks),
			   &landlock_lsmid);
}
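/*
 * The cred hooks above are what carry a Landlock domain across fork() and
 * execve(): hook_cred_prepare()/hook_cred_transfer() copy the domain
 * reference into each new struct cred. A domain is attached in the first
 * place from userspace via the Landlock syscalls. The following is only a
 * hedged sketch (the command-line interface and access-right selection are
 * assumptions; it relies on <linux/landlock.h> and the landlock syscall
 * numbers being available, i.e. kernel/headers >= 5.13): it confines the
 * calling process to read-only access beneath one directory, then execs a
 * command, which inherits the domain through these credential hooks.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/landlock.h>
#include <stdio.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	struct landlock_ruleset_attr ruleset_attr = {
		/* Access rights the ruleset will restrict. */
		.handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE |
				     LANDLOCK_ACCESS_FS_READ_DIR |
				     LANDLOCK_ACCESS_FS_WRITE_FILE,
	};
	struct landlock_path_beneath_attr path_beneath = {
		.allowed_access = LANDLOCK_ACCESS_FS_READ_FILE |
				  LANDLOCK_ACCESS_FS_READ_DIR,
	};
	int ruleset_fd;

	if (argc < 3) {
		fprintf(stderr, "usage: %s <read-only-dir> <cmd> [args...]\n",
			argv[0]);
		return 1;
	}

	ruleset_fd = syscall(SYS_landlock_create_ruleset, &ruleset_attr,
			     sizeof(ruleset_attr), 0);
	if (ruleset_fd < 0) {
		perror("landlock_create_ruleset");
		return 1;
	}

	path_beneath.parent_fd = open(argv[1], O_PATH | O_CLOEXEC);
	if (path_beneath.parent_fd < 0) {
		perror("open");
		return 1;
	}
	if (syscall(SYS_landlock_add_rule, ruleset_fd,
		    LANDLOCK_RULE_PATH_BENEATH, &path_beneath, 0)) {
		perror("landlock_add_rule");
		return 1;
	}

	/* Required before landlock_restrict_self() without CAP_SYS_ADMIN. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
		perror("prctl");
		return 1;
	}
	if (syscall(SYS_landlock_restrict_self, ruleset_fd, 0)) {
		perror("landlock_restrict_self");
		return 1;
	}
	close(ruleset_fd);

	/* The restricted domain is inherited by children via the cred hooks. */
	execvp(argv[2], &argv[2]);
	perror("execvp");
	return 1;
}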
// SPDX-License-Identifier: GPL-2.0-or-later /* linux/net/ipv4/arp.c * * Copyright (C) 1994 by Florian La Roche * * This module implements the Address Resolution Protocol ARP (RFC 826), * which is used to convert IP addresses (or in the future maybe other * high-level addresses) into a low-level hardware address (like an Ethernet * address). 
* * Fixes: * Alan Cox : Removed the Ethernet assumptions in * Florian's code * Alan Cox : Fixed some small errors in the ARP * logic * Alan Cox : Allow >4K in /proc * Alan Cox : Make ARP add its own protocol entry * Ross Martin : Rewrote arp_rcv() and arp_get_info() * Stephen Henson : Add AX25 support to arp_get_info() * Alan Cox : Drop data when a device is downed. * Alan Cox : Use init_timer(). * Alan Cox : Double lock fixes. * Martin Seine : Move the arphdr structure * to if_arp.h for compatibility. * with BSD based programs. * Andrew Tridgell : Added ARP netmask code and * re-arranged proxy handling. * Alan Cox : Changed to use notifiers. * Niibe Yutaka : Reply for this device or proxies only. * Alan Cox : Don't proxy across hardware types! * Jonathan Naylor : Added support for NET/ROM. * Mike Shaver : RFC1122 checks. * Jonathan Naylor : Only lookup the hardware address for * the correct hardware type. * Germano Caronni : Assorted subtle races. * Craig Schlenter : Don't modify permanent entry * during arp_rcv. * Russ Nelson : Tidied up a few bits. * Alexey Kuznetsov: Major changes to caching and behaviour, * eg intelligent arp probing and * generation * of host down events. * Alan Cox : Missing unlock in device events. * Eckes : ARP ioctl control errors. * Alexey Kuznetsov: Arp free fix. * Manuel Rodriguez: Gratuitous ARP. * Jonathan Layes : Added arpd support through kerneld * message queue (960314) * Mike Shaver : /proc/sys/net/ipv4/arp_* support * Mike McLagan : Routing by source * Stuart Cheshire : Metricom and grat arp fixes * *** FOR 2.1 clean this up *** * Lawrence V. Stefani: (08/12/96) Added FDDI support. * Alan Cox : Took the AP1000 nasty FDDI hack and * folded into the mainstream FDDI code. * Ack spit, Linus how did you allow that * one in... * Jes Sorensen : Make FDDI work again in 2.1.x and * clean up the APFDDI & gen. FDDI bits. * Alexey Kuznetsov: new arp state machine; * now it is in net/core/neighbour.c. * Krzysztof Halasa: Added Frame Relay ARP support. * Arnaldo C. Melo : convert /proc/net/arp to seq_file * Shmulik Hen: Split arp_send to arp_create and * arp_xmit so intermediate drivers like * bonding can change the skb before * sending (e.g. insert 8021q tag). * Harald Welte : convert to make use of jenkins hash * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/capability.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/hex.h> #include <linux/in.h> #include <linux/mm.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/fddidevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/stat.h> #include <linux/init.h> #include <linux/net.h> #include <linux/rcupdate.h> #include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <net/net_namespace.h> #include <net/ip.h> #include <net/icmp.h> #include <net/route.h> #include <net/protocol.h> #include <net/tcp.h> #include <net/sock.h> #include <net/arp.h> #include <net/ax25.h> #include <net/netrom.h> #include <net/dst_metadata.h> #include <net/ip_tunnels.h> #include <linux/uaccess.h> #include <linux/netfilter_arp.h> /* * Interface to generic neighbour cache. 
*/ static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); static bool arp_key_eq(const struct neighbour *n, const void *pkey); static int arp_constructor(struct neighbour *neigh); static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); static void parp_redo(struct sk_buff *skb); static int arp_is_multicast(const void *pkey); static const struct neigh_ops arp_generic_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, .output = neigh_resolve_output, .connected_output = neigh_connected_output, }; static const struct neigh_ops arp_hh_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, .output = neigh_resolve_output, .connected_output = neigh_resolve_output, }; static const struct neigh_ops arp_direct_ops = { .family = AF_INET, .output = neigh_direct_output, .connected_output = neigh_direct_output, }; struct neigh_table arp_tbl = { .family = AF_INET, .key_len = 4, .protocol = cpu_to_be16(ETH_P_IP), .hash = arp_hash, .key_eq = arp_key_eq, .constructor = arp_constructor, .proxy_redo = parp_redo, .is_multicast = arp_is_multicast, .id = "arp_cache", .parms = { .tbl = &arp_tbl, .reachable_time = 30 * HZ, .data = { [NEIGH_VAR_MCAST_PROBES] = 3, [NEIGH_VAR_UCAST_PROBES] = 3, [NEIGH_VAR_RETRANS_TIME] = 1 * HZ, [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_DEFAULT, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, [NEIGH_VAR_LOCKTIME] = 1 * HZ, }, }, .gc_interval = 30 * HZ, .gc_thresh1 = 128, .gc_thresh2 = 512, .gc_thresh3 = 1024, }; EXPORT_SYMBOL(arp_tbl); int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir) { switch (dev->type) { case ARPHRD_ETHER: case ARPHRD_FDDI: case ARPHRD_IEEE802: ip_eth_mc_map(addr, haddr); return 0; case ARPHRD_INFINIBAND: ip_ib_mc_map(addr, dev->broadcast, haddr); return 0; case ARPHRD_IPGRE: ip_ipgre_mc_map(addr, dev->broadcast, haddr); return 0; default: if (dir) { memcpy(haddr, dev->broadcast, dev->addr_len); return 0; } } return -EINVAL; } static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd) { return arp_hashfn(pkey, dev, hash_rnd); } static bool arp_key_eq(const struct neighbour *neigh, const void *pkey) { return neigh_key_eq32(neigh, pkey); } static int arp_constructor(struct neighbour *neigh) { __be32 addr; struct net_device *dev = neigh->dev; struct in_device *in_dev; struct neigh_parms *parms; u32 inaddr_any = INADDR_ANY; if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) memcpy(neigh->primary_key, &inaddr_any, arp_tbl.key_len); addr = *(__be32 *)neigh->primary_key; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) { rcu_read_unlock(); return -EINVAL; } neigh->type = inet_addr_type_dev_table(dev_net(dev), dev, addr); parms = in_dev->arp_parms; __neigh_parms_put(neigh->parms); neigh->parms = neigh_parms_clone(parms); rcu_read_unlock(); if (!dev->header_ops) { neigh->nud_state = NUD_NOARP; neigh->ops = &arp_direct_ops; neigh->output = neigh_direct_output; } else { /* Good devices (checked by reading texts, but only Ethernet is tested) ARPHRD_ETHER: (ethernet, apfddi) ARPHRD_FDDI: (fddi) ARPHRD_IEEE802: (tr) ARPHRD_METRICOM: (strip) ARPHRD_ARCNET: etc. etc. etc. ARPHRD_IPDDP will also work, if author repairs it. 
I did not it, because this driver does not work even in old paradigm. */ if (neigh->type == RTN_MULTICAST) { neigh->nud_state = NUD_NOARP; arp_mc_map(addr, neigh->ha, dev, 1); } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->dev_addr, dev->addr_len); } else if (neigh->type == RTN_BROADCAST || (dev->flags & IFF_POINTOPOINT)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->broadcast, dev->addr_len); } if (dev->header_ops->cache) neigh->ops = &arp_hh_ops; else neigh->ops = &arp_generic_ops; if (neigh->nud_state & NUD_VALID) neigh->output = neigh->ops->connected_output; else neigh->output = neigh->ops->output; } return 0; } static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb) { dst_link_failure(skb); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED); } /* Create and send an arp packet. */ static void arp_send_dst(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw, struct dst_entry *dst) { struct sk_buff *skb; /* arp on this interface. */ if (dev->flags & IFF_NOARP) return; skb = arp_create(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw, target_hw); if (!skb) return; skb_dst_set(skb, dst_clone(dst)); arp_xmit(skb); } void arp_send(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw) { arp_send_dst(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw, target_hw, NULL); } EXPORT_SYMBOL(arp_send); static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) { __be32 saddr = 0; u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL; struct net_device *dev = neigh->dev; __be32 target = *(__be32 *)neigh->primary_key; int probes = atomic_read(&neigh->probes); struct in_device *in_dev; struct dst_entry *dst = NULL; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) { rcu_read_unlock(); return; } switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { default: case 0: /* By default announce any local IP */ if (skb && inet_addr_type_dev_table(dev_net(dev), dev, ip_hdr(skb)->saddr) == RTN_LOCAL) saddr = ip_hdr(skb)->saddr; break; case 1: /* Restrict announcements of saddr in same subnet */ if (!skb) break; saddr = ip_hdr(skb)->saddr; if (inet_addr_type_dev_table(dev_net(dev), dev, saddr) == RTN_LOCAL) { /* saddr should be known to target */ if (inet_addr_onlink(in_dev, target, saddr)) break; } saddr = 0; break; case 2: /* Avoid secondary IPs, get a primary/preferred one */ break; } rcu_read_unlock(); if (!saddr) saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES); if (probes < 0) { if (!(READ_ONCE(neigh->nud_state) & NUD_VALID)) pr_debug("trying to ucast probe in NUD_INVALID\n"); neigh_ha_snapshot(dst_ha, neigh, dev); dst_hw = dst_ha; } else { probes -= NEIGH_VAR(neigh->parms, APP_PROBES); if (probes < 0) { neigh_app_ns(neigh); return; } } if (skb && !(dev->priv_flags & IFF_XMIT_DST_RELEASE)) dst = skb_dst(skb); arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, dst_hw, dev->dev_addr, NULL, dst); } static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) { struct net *net = dev_net(in_dev->dev); int scope; switch (IN_DEV_ARP_IGNORE(in_dev)) { case 0: /* Reply, the tip is already validated */ return 0; case 1: /* Reply only if tip is configured on the incoming interface */ sip = 0; scope = RT_SCOPE_HOST; break; case 2: /* * Reply 
only if tip is configured on the incoming interface * and is in same subnet as sip */ scope = RT_SCOPE_HOST; break; case 3: /* Do not reply for scope host addresses */ sip = 0; scope = RT_SCOPE_LINK; in_dev = NULL; break; case 4: /* Reserved */ case 5: case 6: case 7: return 0; case 8: /* Do not reply */ return 1; default: return 0; } return !inet_confirm_addr(net, in_dev, sip, tip, scope); } static int arp_accept(struct in_device *in_dev, __be32 sip) { struct net *net = dev_net(in_dev->dev); int scope = RT_SCOPE_LINK; switch (IN_DEV_ARP_ACCEPT(in_dev)) { case 0: /* Don't create new entries from garp */ return 0; case 1: /* Create new entries from garp */ return 1; case 2: /* Create a neighbor in the arp table only if sip * is in the same subnet as an address configured * on the interface that received the garp message */ return !!inet_confirm_addr(net, in_dev, sip, 0, scope); default: return 0; } } static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) { struct rtable *rt; int flag = 0; /*unsigned long now; */ struct net *net = dev_net(dev); rt = ip_route_output(net, sip, tip, 0, l3mdev_master_ifindex_rcu(dev), RT_SCOPE_UNIVERSE); if (IS_ERR(rt)) return 1; if (rt->dst.dev != dev) { __NET_INC_STATS(net, LINUX_MIB_ARPFILTER); flag = 1; } ip_rt_put(rt); return flag; } /* * Check if we can use proxy ARP for this path */ static inline int arp_fwd_proxy(struct in_device *in_dev, struct net_device *dev, struct rtable *rt) { struct in_device *out_dev; int imi, omi = -1; if (rt->dst.dev == dev) return 0; if (!IN_DEV_PROXY_ARP(in_dev)) return 0; imi = IN_DEV_MEDIUM_ID(in_dev); if (imi == 0) return 1; if (imi == -1) return 0; /* place to check for proxy_arp for routes */ out_dev = __in_dev_get_rcu(rt->dst.dev); if (out_dev) omi = IN_DEV_MEDIUM_ID(out_dev); return omi != imi && omi != -1; } /* * Check for RFC3069 proxy arp private VLAN (allow to send back to same dev) * * RFC3069 supports proxy arp replies back to the same interface. This * is done to support (ethernet) switch features, like RFC 3069, where * the individual ports are not allowed to communicate with each * other, BUT they are allowed to talk to the upstream router. As * described in RFC 3069, it is possible to allow these hosts to * communicate through the upstream router, by proxy_arp'ing. * * RFC 3069: "VLAN Aggregation for Efficient IP Address Allocation" * * This technology is known by different names: * In RFC 3069 it is called VLAN Aggregation. * Cisco and Allied Telesyn call it Private VLAN. * Hewlett-Packard call it Source-Port filtering or port-isolation. * Ericsson call it MAC-Forced Forwarding (RFC Draft). * */ static inline int arp_fwd_pvlan(struct in_device *in_dev, struct net_device *dev, struct rtable *rt, __be32 sip, __be32 tip) { /* Private VLAN is only concerned about the same ethernet segment */ if (rt->dst.dev != dev) return 0; /* Don't reply on self probes (often done by windowz boxes)*/ if (sip == tip) return 0; if (IN_DEV_PROXY_ARP_PVLAN(in_dev)) return 1; else return 0; } /* * Interface to link layer: send routine and receive handler. */ /* * Create an arp packet. If dest_hw is not set, we create a broadcast * message. 
*/ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw) { struct sk_buff *skb; struct arphdr *arp; unsigned char *arp_ptr; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; /* * Allocate a buffer */ skb = alloc_skb(arp_hdr_len(dev) + hlen + tlen, GFP_ATOMIC); if (!skb) return NULL; skb_reserve(skb, hlen); skb_reset_network_header(skb); skb_put(skb, arp_hdr_len(dev)); skb->dev = dev; skb->protocol = htons(ETH_P_ARP); if (!src_hw) src_hw = dev->dev_addr; if (!dest_hw) dest_hw = dev->broadcast; /* Fill the device header for the ARP frame. * Note: skb->head can be changed. */ if (dev_hard_header(skb, dev, ptype, dest_hw, src_hw, skb->len) < 0) goto out; arp = arp_hdr(skb); /* * Fill out the arp protocol part. * * The arp hardware type should match the device type, except for FDDI, * which (according to RFC 1390) should always equal 1 (Ethernet). */ /* * Exceptions everywhere. AX.25 uses the AX.25 PID value not the * DIX code for the protocol. Make these device structure fields. */ switch (dev->type) { default: arp->ar_hrd = htons(dev->type); arp->ar_pro = htons(ETH_P_IP); break; #if IS_ENABLED(CONFIG_AX25) case ARPHRD_AX25: arp->ar_hrd = htons(ARPHRD_AX25); arp->ar_pro = htons(AX25_P_IP); break; #if IS_ENABLED(CONFIG_NETROM) case ARPHRD_NETROM: arp->ar_hrd = htons(ARPHRD_NETROM); arp->ar_pro = htons(AX25_P_IP); break; #endif #endif #if IS_ENABLED(CONFIG_FDDI) case ARPHRD_FDDI: arp->ar_hrd = htons(ARPHRD_ETHER); arp->ar_pro = htons(ETH_P_IP); break; #endif } arp->ar_hln = dev->addr_len; arp->ar_pln = 4; arp->ar_op = htons(type); arp_ptr = (unsigned char *)(arp + 1); memcpy(arp_ptr, src_hw, dev->addr_len); arp_ptr += dev->addr_len; memcpy(arp_ptr, &src_ip, 4); arp_ptr += 4; switch (dev->type) { #if IS_ENABLED(CONFIG_FIREWIRE_NET) case ARPHRD_IEEE1394: break; #endif default: if (target_hw) memcpy(arp_ptr, target_hw, dev->addr_len); else memset(arp_ptr, 0, dev->addr_len); arp_ptr += dev->addr_len; } memcpy(arp_ptr, &dest_ip, 4); return skb; out: kfree_skb(skb); return NULL; } EXPORT_SYMBOL(arp_create); static int arp_xmit_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { return dev_queue_xmit(skb); } /* * Send an arp packet. */ void arp_xmit(struct sk_buff *skb) { rcu_read_lock(); /* Send it off, maybe filter it using firewalling first. */ NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, dev_net_rcu(skb->dev), NULL, skb, NULL, skb->dev, arp_xmit_finish); rcu_read_unlock(); } EXPORT_SYMBOL(arp_xmit); static bool arp_is_garp(struct net *net, struct net_device *dev, int *addr_type, __be16 ar_op, __be32 sip, __be32 tip, unsigned char *sha, unsigned char *tha) { bool is_garp = tip == sip; /* Gratuitous ARP _replies_ also require target hwaddr to be * the same as source. */ if (is_garp && ar_op == htons(ARPOP_REPLY)) is_garp = /* IPv4 over IEEE 1394 doesn't provide target * hardware address field in its ARP payload. */ tha && !memcmp(tha, sha, dev->addr_len); if (is_garp) { *addr_type = inet_addr_type_dev_table(net, dev, sip); if (*addr_type != RTN_UNICAST) is_garp = false; } return is_garp; } /* * Process an arp request. 
*/ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb->dev; struct in_device *in_dev = __in_dev_get_rcu(dev); struct arphdr *arp; unsigned char *arp_ptr; struct rtable *rt; unsigned char *sha; unsigned char *tha = NULL; __be32 sip, tip; u16 dev_type = dev->type; int addr_type; struct neighbour *n; struct dst_entry *reply_dst = NULL; bool is_garp = false; /* arp_rcv below verifies the ARP header and verifies the device * is ARP'able. */ if (!in_dev) goto out_free_skb; arp = arp_hdr(skb); switch (dev_type) { default: if (arp->ar_pro != htons(ETH_P_IP) || htons(dev_type) != arp->ar_hrd) goto out_free_skb; break; case ARPHRD_ETHER: case ARPHRD_FDDI: case ARPHRD_IEEE802: /* * ETHERNET, and Fibre Channel (which are IEEE 802 * devices, according to RFC 2625) devices will accept ARP * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2). * This is the case also of FDDI, where the RFC 1390 says that * FDDI devices should accept ARP hardware of (1) Ethernet, * however, to be more robust, we'll accept both 1 (Ethernet) * or 6 (IEEE 802.2) */ if ((arp->ar_hrd != htons(ARPHRD_ETHER) && arp->ar_hrd != htons(ARPHRD_IEEE802)) || arp->ar_pro != htons(ETH_P_IP)) goto out_free_skb; break; case ARPHRD_AX25: if (arp->ar_pro != htons(AX25_P_IP) || arp->ar_hrd != htons(ARPHRD_AX25)) goto out_free_skb; break; case ARPHRD_NETROM: if (arp->ar_pro != htons(AX25_P_IP) || arp->ar_hrd != htons(ARPHRD_NETROM)) goto out_free_skb; break; } /* Understand only these message types */ if (arp->ar_op != htons(ARPOP_REPLY) && arp->ar_op != htons(ARPOP_REQUEST)) goto out_free_skb; /* * Extract fields */ arp_ptr = (unsigned char *)(arp + 1); sha = arp_ptr; arp_ptr += dev->addr_len; memcpy(&sip, arp_ptr, 4); arp_ptr += 4; switch (dev_type) { #if IS_ENABLED(CONFIG_FIREWIRE_NET) case ARPHRD_IEEE1394: break; #endif default: tha = arp_ptr; arp_ptr += dev->addr_len; } memcpy(&tip, arp_ptr, 4); /* * Check for bad requests for 127.x.x.x and requests for multicast * addresses. If this is one such, delete it. */ if (ipv4_is_multicast(tip) || (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip))) goto out_free_skb; /* * For some 802.11 wireless deployments (and possibly other networks), * there will be an ARP proxy and gratuitous ARP frames are attacks * and thus should not be accepted. */ if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP)) goto out_free_skb; /* * Special case: We must set Frame Relay source Q.922 address */ if (dev_type == ARPHRD_DLCI) sha = dev->broadcast; /* * Process entry. The idea here is we want to send a reply if it is a * request for us or if it is a request for someone else that we hold * a proxy for. We want to add an entry to our cache if it is a reply * to us or if it is a request for our address. * (The assumption for this last is that if someone is requesting our * address, they are probably intending to talk to us, so it saves time * if we cache their address. Their address is also probably not in * our cache, since ours is not in their cache.) * * Putting this another way, we only care about replies if they are to * us, in which case we add them to the cache. For requests, we care * about those for us and those for our proxies. We reply to both, * and in the case of requests for us we add the requester to the arp * cache. 
*/ if (arp->ar_op == htons(ARPOP_REQUEST) && skb_metadata_dst(skb)) reply_dst = (struct dst_entry *) iptunnel_metadata_reply(skb_metadata_dst(skb), GFP_ATOMIC); /* Special case: IPv4 duplicate address detection packet (RFC2131) */ if (sip == 0) { if (arp->ar_op == htons(ARPOP_REQUEST) && inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL && !arp_ignore(in_dev, sip, tip)) arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); goto out_consume_skb; } if (arp->ar_op == htons(ARPOP_REQUEST) && ip_route_input_noref(skb, tip, sip, 0, dev) == 0) { rt = skb_rtable(skb); addr_type = rt->rt_type; if (addr_type == RTN_LOCAL) { int dont_send; dont_send = arp_ignore(in_dev, sip, tip); if (!dont_send && IN_DEV_ARPFILTER(in_dev)) dont_send = arp_filter(sip, tip, dev); if (!dont_send) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) { arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); neigh_release(n); } } goto out_consume_skb; } else if (IN_DEV_FORWARD(in_dev)) { if (addr_type == RTN_UNICAST && (arp_fwd_proxy(in_dev, dev, rt) || arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || (rt->dst.dev != dev && pneigh_lookup(&arp_tbl, net, &tip, dev)))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || skb->pkt_type == PACKET_HOST || NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) { arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); } else { pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); goto out_free_dst; } goto out_consume_skb; } } } /* Update our ARP tables */ n = __neigh_lookup(&arp_tbl, &sip, dev, 0); addr_type = -1; if (n || arp_accept(in_dev, sip)) { is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op, sip, tip, sha, tha); } if (arp_accept(in_dev, sip)) { /* Unsolicited ARP is not accepted by default. It is possible, that this option should be enabled for some devices (strip is candidate) */ if (!n && (is_garp || (arp->ar_op == htons(ARPOP_REPLY) && (addr_type == RTN_UNICAST || (addr_type < 0 && /* postpone calculation to as late as possible */ inet_addr_type_dev_table(net, dev, sip) == RTN_UNICAST))))) n = __neigh_lookup(&arp_tbl, &sip, dev, 1); } if (n) { int state = NUD_REACHABLE; int override; /* If several different ARP replies follows back-to-back, use the FIRST one. It is possible, if several proxy agents are active. Taking the first reply prevents arp trashing and chooses the fastest router. */ override = time_after(jiffies, n->updated + NEIGH_VAR(n->parms, LOCKTIME)) || is_garp; /* Broadcast replies and request packets do not assert neighbour reachability. */ if (arp->ar_op != htons(ARPOP_REPLY) || skb->pkt_type != PACKET_HOST) state = NUD_STALE; neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0, 0); neigh_release(n); } out_consume_skb: consume_skb(skb); out_free_dst: dst_release(reply_dst); return NET_RX_SUCCESS; out_free_skb: kfree_skb(skb); return NET_RX_DROP; } static void parp_redo(struct sk_buff *skb) { arp_process(dev_net(skb->dev), NULL, skb); } static int arp_is_multicast(const void *pkey) { return ipv4_is_multicast(*((__be32 *)pkey)); } /* * Receive an arp request from the device layer. 
*/ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { enum skb_drop_reason drop_reason; const struct arphdr *arp; /* do not tweak dropwatch on an ARP we will ignore */ if (dev->flags & IFF_NOARP || skb->pkt_type == PACKET_OTHERHOST || skb->pkt_type == PACKET_LOOPBACK) goto consumeskb; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) goto out_of_mem; /* ARP header, plus 2 device addresses, plus 2 IP addresses. */ drop_reason = pskb_may_pull_reason(skb, arp_hdr_len(dev)); if (drop_reason != SKB_NOT_DROPPED_YET) goto freeskb; arp = arp_hdr(skb); if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4) { drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; goto freeskb; } memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, dev_net(dev), NULL, skb, dev, NULL, arp_process); consumeskb: consume_skb(skb); return NET_RX_SUCCESS; freeskb: kfree_skb_reason(skb, drop_reason); out_of_mem: return NET_RX_DROP; } /* * User level interface (ioctl) */ static struct net_device *arp_req_dev_by_name(struct net *net, struct arpreq *r, bool getarp) { struct net_device *dev; if (getarp) dev = dev_get_by_name_rcu(net, r->arp_dev); else dev = __dev_get_by_name(net, r->arp_dev); if (!dev) return ERR_PTR(-ENODEV); /* Mmmm... It is wrong... ARPHRD_NETROM == 0 */ if (!r->arp_ha.sa_family) r->arp_ha.sa_family = dev->type; if ((r->arp_flags & ATF_COM) && r->arp_ha.sa_family != dev->type) return ERR_PTR(-EINVAL); return dev; } static struct net_device *arp_req_dev(struct net *net, struct arpreq *r) { struct net_device *dev; struct rtable *rt; __be32 ip; if (r->arp_dev[0]) return arp_req_dev_by_name(net, r, false); if (r->arp_flags & ATF_PUBL) return NULL; ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; rt = ip_route_output(net, ip, 0, 0, 0, RT_SCOPE_LINK); if (IS_ERR(rt)) return ERR_CAST(rt); dev = rt->dst.dev; ip_rt_put(rt); if (!dev) return ERR_PTR(-EINVAL); return dev; } /* * Set (create) an ARP cache entry. */ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on) { if (!dev) { IPV4_DEVCONF_ALL(net, PROXY_ARP) = on; return 0; } if (__in_dev_get_rtnl_net(dev)) { IN_DEV_CONF_SET(__in_dev_get_rtnl_net(dev), PROXY_ARP, on); return 0; } return -ENXIO; } static int arp_req_set_public(struct net *net, struct arpreq *r, struct net_device *dev) { __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; if (!dev && (r->arp_flags & ATF_COM)) { dev = dev_getbyhwaddr(net, r->arp_ha.sa_family, r->arp_ha.sa_data); if (!dev) return -ENODEV; } if (mask) { __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; return pneigh_create(&arp_tbl, net, &ip, dev, 0, 0, false); } return arp_req_set_proxy(net, dev, 1); } static int arp_req_set(struct net *net, struct arpreq *r) { struct neighbour *neigh; struct net_device *dev; __be32 ip; int err; dev = arp_req_dev(net, r); if (IS_ERR(dev)) return PTR_ERR(dev); if (r->arp_flags & ATF_PUBL) return arp_req_set_public(net, r, dev); switch (dev->type) { #if IS_ENABLED(CONFIG_FDDI) case ARPHRD_FDDI: /* * According to RFC 1390, FDDI devices should accept ARP * hardware types of 1 (Ethernet). However, to be more * robust, we'll accept hardware types of either 1 (Ethernet) * or 6 (IEEE 802.2). 
*/ if (r->arp_ha.sa_family != ARPHRD_FDDI && r->arp_ha.sa_family != ARPHRD_ETHER && r->arp_ha.sa_family != ARPHRD_IEEE802) return -EINVAL; break; #endif default: if (r->arp_ha.sa_family != dev->type) return -EINVAL; break; } ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev); err = PTR_ERR(neigh); if (!IS_ERR(neigh)) { unsigned int state = NUD_STALE; if (r->arp_flags & ATF_PERM) { r->arp_flags |= ATF_COM; state = NUD_PERMANENT; } err = neigh_update(neigh, (r->arp_flags & ATF_COM) ? r->arp_ha.sa_data : NULL, state, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, 0); neigh_release(neigh); } return err; } static unsigned int arp_state_to_flags(struct neighbour *neigh) { if (neigh->nud_state&NUD_PERMANENT) return ATF_PERM | ATF_COM; else if (neigh->nud_state&NUD_VALID) return ATF_COM; else return 0; } /* * Get an ARP cache entry. */ static int arp_req_get(struct net *net, struct arpreq *r) { __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; struct neighbour *neigh; struct net_device *dev; if (!r->arp_dev[0]) return -ENODEV; dev = arp_req_dev_by_name(net, r, true); if (IS_ERR(dev)) return PTR_ERR(dev); neigh = neigh_lookup(&arp_tbl, &ip, dev); if (!neigh) return -ENXIO; if (READ_ONCE(neigh->nud_state) & NUD_NOARP) { neigh_release(neigh); return -ENXIO; } read_lock_bh(&neigh->lock); memcpy(r->arp_ha.sa_data, neigh->ha, min(dev->addr_len, sizeof(r->arp_ha.sa_data))); r->arp_flags = arp_state_to_flags(neigh); read_unlock_bh(&neigh->lock); neigh_release(neigh); r->arp_ha.sa_family = dev->type; netdev_copy_name(dev, r->arp_dev); return 0; } int arp_invalidate(struct net_device *dev, __be32 ip, bool force) { struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev); int err = -ENXIO; struct neigh_table *tbl = &arp_tbl; if (neigh) { if ((READ_ONCE(neigh->nud_state) & NUD_VALID) && !force) { neigh_release(neigh); return 0; } if (READ_ONCE(neigh->nud_state) & ~NUD_NOARP) err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_ADMIN, 0); spin_lock_bh(&tbl->lock); neigh_release(neigh); neigh_remove_one(neigh); spin_unlock_bh(&tbl->lock); } return err; } static int arp_req_delete_public(struct net *net, struct arpreq *r, struct net_device *dev) { __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; if (mask) { __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; return pneigh_delete(&arp_tbl, net, &ip, dev); } return arp_req_set_proxy(net, dev, 0); } static int arp_req_delete(struct net *net, struct arpreq *r) { struct net_device *dev; __be32 ip; dev = arp_req_dev(net, r); if (IS_ERR(dev)) return PTR_ERR(dev); if (r->arp_flags & ATF_PUBL) return arp_req_delete_public(net, r, dev); ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; return arp_invalidate(dev, ip, true); } /* * Handle an ARP layer I/O control request. 
*/ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) { struct arpreq r; __be32 *netmask; int err; switch (cmd) { case SIOCDARP: case SIOCSARP: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; fallthrough; case SIOCGARP: err = copy_from_user(&r, arg, sizeof(struct arpreq)); if (err) return -EFAULT; break; default: return -EINVAL; } if (r.arp_pa.sa_family != AF_INET) return -EPFNOSUPPORT; if (!(r.arp_flags & ATF_PUBL) && (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB))) return -EINVAL; netmask = &((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr; if (!(r.arp_flags & ATF_NETMASK)) *netmask = htonl(0xFFFFFFFFUL); else if (*netmask && *netmask != htonl(0xFFFFFFFFUL)) return -EINVAL; switch (cmd) { case SIOCDARP: rtnl_net_lock(net); err = arp_req_delete(net, &r); rtnl_net_unlock(net); break; case SIOCSARP: rtnl_net_lock(net); err = arp_req_set(net, &r); rtnl_net_unlock(net); break; case SIOCGARP: rcu_read_lock(); err = arp_req_get(net, &r); rcu_read_unlock(); if (!err && copy_to_user(arg, &r, sizeof(r))) err = -EFAULT; break; } return err; } static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_change_info *change_info; struct in_device *in_dev; bool evict_nocarrier; switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&arp_tbl, dev); rt_cache_flush(dev_net(dev)); break; case NETDEV_CHANGE: change_info = ptr; if (change_info->flags_changed & IFF_NOARP) neigh_changeaddr(&arp_tbl, dev); in_dev = __in_dev_get_rtnl(dev); if (!in_dev) evict_nocarrier = true; else evict_nocarrier = IN_DEV_ARP_EVICT_NOCARRIER(in_dev); if (evict_nocarrier && !netif_carrier_ok(dev)) neigh_carrier_down(&arp_tbl, dev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block arp_netdev_notifier = { .notifier_call = arp_netdev_event, }; /* Note, that it is not on notifier chain. It is necessary, that this routine was called after route cache will be flushed. */ void arp_ifdown(struct net_device *dev) { neigh_ifdown(&arp_tbl, dev); } /* * Called once on startup. */ static struct packet_type arp_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_ARP), .func = arp_rcv, }; #ifdef CONFIG_PROC_FS #if IS_ENABLED(CONFIG_AX25) /* * ax25 -> ASCII conversion */ static void ax2asc2(ax25_address *a, char *buf) { char c, *s; int n; for (n = 0, s = buf; n < 6; n++) { c = (a->ax25_call[n] >> 1) & 0x7F; if (c != ' ') *s++ = c; } *s++ = '-'; n = (a->ax25_call[6] >> 1) & 0x0F; if (n > 9) { *s++ = '1'; n -= 10; } *s++ = n + '0'; *s++ = '\0'; if (*buf == '\0' || *buf == '-') { buf[0] = '*'; buf[1] = '\0'; } } #endif /* CONFIG_AX25 */ #define HBUFFERLEN 30 static void arp_format_neigh_entry(struct seq_file *seq, struct neighbour *n) { char hbuffer[HBUFFERLEN]; int k, j; char tbuf[16]; struct net_device *dev = n->dev; int hatype = dev->type; read_lock(&n->lock); /* Convert hardware address to XX:XX:XX:XX ... form. 
*/ #if IS_ENABLED(CONFIG_AX25) if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM) ax2asc2((ax25_address *)n->ha, hbuffer); else { #endif for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) { hbuffer[k++] = hex_asc_hi(n->ha[j]); hbuffer[k++] = hex_asc_lo(n->ha[j]); hbuffer[k++] = ':'; } if (k != 0) --k; hbuffer[k] = 0; #if IS_ENABLED(CONFIG_AX25) } #endif sprintf(tbuf, "%pI4", n->primary_key); seq_printf(seq, "%-16s 0x%-10x0x%-10x%-17s * %s\n", tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name); read_unlock(&n->lock); } static void arp_format_pneigh_entry(struct seq_file *seq, struct pneigh_entry *n) { struct net_device *dev = n->dev; int hatype = dev ? dev->type : 0; char tbuf[16]; sprintf(tbuf, "%pI4", n->key); seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n", tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00", dev ? dev->name : "*"); } static int arp_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, "IP address HW type Flags " "HW address Mask Device\n"); } else { struct neigh_seq_state *state = seq->private; if (state->flags & NEIGH_SEQ_IS_PNEIGH) arp_format_pneigh_entry(seq, v); else arp_format_neigh_entry(seq, v); } return 0; } static void *arp_seq_start(struct seq_file *seq, loff_t *pos) { /* Don't want to confuse "arp -a" w/ magic entries, * so we tell the generic iterator to skip NUD_NOARP. */ return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_SKIP_NOARP); } static const struct seq_operations arp_seq_ops = { .start = arp_seq_start, .next = neigh_seq_next, .stop = neigh_seq_stop, .show = arp_seq_show, }; #endif /* CONFIG_PROC_FS */ static int __net_init arp_net_init(struct net *net) { if (!proc_create_net("arp", 0444, net->proc_net, &arp_seq_ops, sizeof(struct neigh_seq_state))) return -ENOMEM; return 0; } static void __net_exit arp_net_exit(struct net *net) { remove_proc_entry("arp", net->proc_net); } static struct pernet_operations arp_net_ops = { .init = arp_net_init, .exit = arp_net_exit, }; void __init arp_init(void) { neigh_table_init(NEIGH_ARP_TABLE, &arp_tbl); dev_add_pack(&arp_packet_type); register_pernet_subsys(&arp_net_ops); #ifdef CONFIG_SYSCTL neigh_sysctl_register(NULL, &arp_tbl.parms, NULL); #endif register_netdevice_notifier(&arp_netdev_notifier); } |
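/*
 * Illustrative userspace counterpart to the SIOCGARP path served by
 * arp_ioctl()/arp_req_get() above. This is a hedged sketch, not part of the
 * file itself: it assumes an IPv4 address and an interface name on the
 * command line and a 6-byte (Ethernet-style) hardware address in the reply.
 */
#include <arpa/inet.h>
#include <net/if_arp.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	struct arpreq req;
	struct sockaddr_in *sin;
	const unsigned char *ha;
	int fd;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <ipv4-addr> <ifname>\n", argv[0]);
		return 1;
	}

	memset(&req, 0, sizeof(req));
	sin = (struct sockaddr_in *)&req.arp_pa;
	sin->sin_family = AF_INET;
	if (inet_pton(AF_INET, argv[1], &sin->sin_addr) != 1) {
		fprintf(stderr, "bad address: %s\n", argv[1]);
		return 1;
	}
	strncpy(req.arp_dev, argv[2], sizeof(req.arp_dev) - 1);

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0) {
		perror("socket");
		return 1;
	}
	if (ioctl(fd, SIOCGARP, &req) < 0) {
		perror("ioctl(SIOCGARP)");
		close(fd);
		return 1;
	}

	ha = (const unsigned char *)req.arp_ha.sa_data;
	printf("%s is at %02x:%02x:%02x:%02x:%02x:%02x (%scomplete)\n",
	       argv[1], ha[0], ha[1], ha[2], ha[3], ha[4], ha[5],
	       (req.arp_flags & ATF_COM) ? "" : "in");
	close(fd);
	return 0;
}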
| 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 | /* zutil.h -- internal interface and configuration of the compression library * Copyright (C) 1995-1998 Jean-loup Gailly. * For conditions of distribution and use, see copyright notice in zlib.h */ /* WARNING: this file should *not* be used by applications. It is part of the implementation of the compression library and is subject to change. Applications should only use zlib.h. */ /* @(#) $Id: zutil.h,v 1.1 2000/01/01 03:32:23 davem Exp $ */ #ifndef _Z_UTIL_H #define _Z_UTIL_H #include <linux/zlib.h> #include <linux/string.h> #include <linux/kernel.h> typedef unsigned char uch; typedef unsigned short ush; typedef unsigned long ulg; /* common constants */ #define STORED_BLOCK 0 #define STATIC_TREES 1 #define DYN_TREES 2 /* The three kinds of block type */ #define MIN_MATCH 3 #define MAX_MATCH 258 /* The minimum and maximum match lengths */ #define PRESET_DICT 0x20 /* preset dictionary flag in zlib header */ /* target dependencies */ /* Common defaults */ #ifndef OS_CODE # define OS_CODE 0x03 /* assume Unix */ #endif /* functions */ typedef uLong (*check_func) (uLong check, const Byte *buf, uInt len); /* checksum functions */ #define BASE 65521L /* largest prime smaller than 65536 */ #define NMAX 5552 /* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */ #define DO1(buf,i) {s1 += buf[i]; s2 += s1;} #define DO2(buf,i) DO1(buf,i); DO1(buf,i+1); #define DO4(buf,i) DO2(buf,i); DO2(buf,i+2); #define DO8(buf,i) DO4(buf,i); DO4(buf,i+4); #define DO16(buf) DO8(buf,0); DO8(buf,8); /* ========================================================================= */ /* Update a running Adler-32 checksum with the bytes buf[0..len-1] and return the updated checksum. If buf is NULL, this function returns the required initial value for the checksum. An Adler-32 checksum is almost as reliable as a CRC32 but can be computed much faster. Usage example: uLong adler = zlib_adler32(0L, NULL, 0); while (read_buffer(buffer, length) != EOF) { adler = zlib_adler32(adler, buffer, length); } if (adler != original_adler) error(); */ static inline uLong zlib_adler32(uLong adler, const Byte *buf, uInt len) { unsigned long s1 = adler & 0xffff; unsigned long s2 = (adler >> 16) & 0xffff; int k; if (buf == NULL) return 1L; while (len > 0) { k = len < NMAX ? len : NMAX; len -= k; while (k >= 16) { DO16(buf); buf += 16; k -= 16; } if (k != 0) do { s1 += *buf++; s2 += s1; } while (--k); s1 %= BASE; s2 %= BASE; } return (s2 << 16) | s1; } #endif /* _Z_UTIL_H */ |
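/*
 * Standalone userspace sketch of the same Adler-32 update loop shown in
 * zlib_adler32() above, without the DO16 unrolling. It is illustrative
 * only; the test string and its expected checksum are a commonly cited
 * reference value, not taken from this file.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ADLER_BASE 65521UL	/* largest prime smaller than 65536 */
#define ADLER_NMAX 5552		/* max bytes before s1/s2 could overflow 32 bits */

static uint32_t adler32_update(uint32_t adler, const unsigned char *buf,
			       size_t len)
{
	uint32_t s1 = adler & 0xffff;
	uint32_t s2 = (adler >> 16) & 0xffff;

	if (!buf)
		return 1;	/* initial value, as in the kernel helper */

	while (len > 0) {
		size_t k = len < ADLER_NMAX ? len : ADLER_NMAX;

		len -= k;
		while (k--) {
			s1 += *buf++;	/* running byte sum */
			s2 += s1;	/* running sum of sums */
		}
		s1 %= ADLER_BASE;
		s2 %= ADLER_BASE;
	}
	return (s2 << 16) | s1;
}

int main(void)
{
	const char *msg = "Wikipedia";
	uint32_t adler = adler32_update(0, NULL, 0);

	adler = adler32_update(adler, (const unsigned char *)msg, strlen(msg));
	printf("adler32(\"%s\") = 0x%08x\n", msg, (unsigned)adler); /* 0x11e60398 */
	return 0;
}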
| 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 | // SPDX-License-Identifier: GPL-2.0-or-later /* Request key authorisation token key definition. * * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * See Documentation/security/keys/request-key.rst */ #include <linux/sched.h> #include <linux/err.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/uaccess.h> #include "internal.h" #include <keys/request_key_auth-type.h> static int request_key_auth_preparse(struct key_preparsed_payload *); static void request_key_auth_free_preparse(struct key_preparsed_payload *); static int request_key_auth_instantiate(struct key *, struct key_preparsed_payload *); static void request_key_auth_describe(const struct key *, struct seq_file *); static void request_key_auth_revoke(struct key *); static void request_key_auth_destroy(struct key *); static long request_key_auth_read(const struct key *, char *, size_t); /* * The request-key authorisation key type definition. */ struct key_type key_type_request_key_auth = { .name = ".request_key_auth", .def_datalen = sizeof(struct request_key_auth), .preparse = request_key_auth_preparse, .free_preparse = request_key_auth_free_preparse, .instantiate = request_key_auth_instantiate, .describe = request_key_auth_describe, .revoke = request_key_auth_revoke, .destroy = request_key_auth_destroy, .read = request_key_auth_read, }; static int request_key_auth_preparse(struct key_preparsed_payload *prep) { return 0; } static void request_key_auth_free_preparse(struct key_preparsed_payload *prep) { } /* * Instantiate a request-key authorisation key. */ static int request_key_auth_instantiate(struct key *key, struct key_preparsed_payload *prep) { rcu_assign_keypointer(key, (struct request_key_auth *)prep->data); return 0; } /* * Describe an authorisation token. */ static void request_key_auth_describe(const struct key *key, struct seq_file *m) { struct request_key_auth *rka = dereference_key_rcu(key); if (!rka) return; seq_puts(m, "key:"); seq_puts(m, key->description); if (key_is_positive(key)) seq_printf(m, " pid:%d ci:%zu", rka->pid, rka->callout_len); } /* * Read the callout_info data (retrieves the callout information). 
* - the key's semaphore is read-locked */ static long request_key_auth_read(const struct key *key, char *buffer, size_t buflen) { struct request_key_auth *rka = dereference_key_locked(key); size_t datalen; long ret; if (!rka) return -EKEYREVOKED; datalen = rka->callout_len; ret = datalen; /* we can return the data as is */ if (buffer && buflen > 0) { if (buflen > datalen) buflen = datalen; memcpy(buffer, rka->callout_info, buflen); } return ret; } static void free_request_key_auth(struct request_key_auth *rka) { if (!rka) return; key_put(rka->target_key); key_put(rka->dest_keyring); if (rka->cred) put_cred(rka->cred); kfree(rka->callout_info); kfree(rka); } /* * Dispose of the request_key_auth record under RCU conditions */ static void request_key_auth_rcu_disposal(struct rcu_head *rcu) { struct request_key_auth *rka = container_of(rcu, struct request_key_auth, rcu); free_request_key_auth(rka); } /* * Handle revocation of an authorisation token key. * * Called with the key sem write-locked. */ static void request_key_auth_revoke(struct key *key) { struct request_key_auth *rka = dereference_key_locked(key); kenter("{%d}", key->serial); rcu_assign_keypointer(key, NULL); call_rcu(&rka->rcu, request_key_auth_rcu_disposal); } /* * Destroy an instantiation authorisation token key. */ static void request_key_auth_destroy(struct key *key) { struct request_key_auth *rka = rcu_access_pointer(key->payload.rcu_data0); kenter("{%d}", key->serial); if (rka) { rcu_assign_keypointer(key, NULL); call_rcu(&rka->rcu, request_key_auth_rcu_disposal); } } /* * Create an authorisation token for /sbin/request-key or whoever to gain * access to the caller's security data. */ struct key *request_key_auth_new(struct key *target, const char *op, const void *callout_info, size_t callout_len, struct key *dest_keyring) { struct request_key_auth *rka, *irka; const struct cred *cred = current_cred(); struct key *authkey = NULL; char desc[20]; int ret = -ENOMEM; kenter("%d,", target->serial); /* allocate a auth record */ rka = kzalloc_obj(*rka); if (!rka) goto error; rka->callout_info = kmemdup(callout_info, callout_len, GFP_KERNEL); if (!rka->callout_info) goto error_free_rka; rka->callout_len = callout_len; strscpy(rka->op, op, sizeof(rka->op)); /* see if the calling process is already servicing the key request of * another process */ if (cred->request_key_auth) { /* it is - use that instantiation context here too */ down_read(&cred->request_key_auth->sem); /* if the auth key has been revoked, then the key we're * servicing is already instantiated */ if (test_bit(KEY_FLAG_REVOKED, &cred->request_key_auth->flags)) { up_read(&cred->request_key_auth->sem); ret = -EKEYREVOKED; goto error_free_rka; } irka = cred->request_key_auth->payload.data[0]; rka->cred = get_cred(irka->cred); rka->pid = irka->pid; up_read(&cred->request_key_auth->sem); } else { /* it isn't - use this process as the context */ rka->cred = get_cred(cred); rka->pid = current->pid; } rka->target_key = key_get(target); rka->dest_keyring = key_get(dest_keyring); /* allocate the auth key */ sprintf(desc, "%x", target->serial); authkey = key_alloc(&key_type_request_key_auth, desc, cred->fsuid, cred->fsgid, cred, KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | KEY_POS_LINK | KEY_USR_VIEW, KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(authkey)) { ret = PTR_ERR(authkey); goto error_free_rka; } /* construct the auth key */ ret = key_instantiate_and_link(authkey, rka, 0, NULL, NULL); if (ret < 0) goto error_put_authkey; kleave(" = {%d,%d}", authkey->serial, 
refcount_read(&authkey->usage)); return authkey; error_put_authkey: key_put(authkey); error_free_rka: free_request_key_auth(rka); error: kleave("= %d", ret); return ERR_PTR(ret); } /* * Search the current process's keyrings for the authorisation key for * instantiation of a key. */ struct key *key_get_instantiation_authkey(key_serial_t target_id) { char description[16]; struct keyring_search_context ctx = { .index_key.type = &key_type_request_key_auth, .index_key.description = description, .cred = current_cred(), .match_data.cmp = key_default_cmp, .match_data.raw_data = description, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = (KEYRING_SEARCH_DO_STATE_CHECK | KEYRING_SEARCH_RECURSE), }; struct key *authkey; key_ref_t authkey_ref; ctx.index_key.desc_len = sprintf(description, "%x", target_id); rcu_read_lock(); authkey_ref = search_process_keyrings_rcu(&ctx); rcu_read_unlock(); if (IS_ERR(authkey_ref)) { authkey = ERR_CAST(authkey_ref); if (authkey == ERR_PTR(-EAGAIN)) authkey = ERR_PTR(-ENOKEY); goto error; } authkey = key_ref_to_ptr(authkey_ref); if (test_bit(KEY_FLAG_REVOKED, &authkey->flags)) { key_put(authkey); authkey = ERR_PTR(-EKEYREVOKED); } error: return authkey; } |
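/*
 * request_key_auth_new() above mints the authorisation key that lets the
 * /sbin/request-key helper (or another userspace agent) instantiate a key
 * on the requester's behalf. A hedged sketch of the triggering side is
 * below; it assumes libkeyutils is installed (link with -lkeyutils) and
 * that a handler for the "user" key type is configured in
 * /etc/request-key.conf. The key description and callout string are
 * placeholders.
 */
#include <errno.h>
#include <keyutils.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	key_serial_t key;
	void *payload = NULL;
	long len;

	/*
	 * If no matching key exists, this upcalls to /sbin/request-key,
	 * which is handed the authorisation key built by
	 * request_key_auth_new() so it can instantiate the key for us.
	 */
	key = request_key("user", "example:token", "callout-data",
			  KEY_SPEC_USER_KEYRING);
	if (key < 0) {
		perror("request_key");
		return 1;
	}

	len = keyctl_read_alloc(key, &payload);
	if (len < 0) {
		perror("keyctl_read_alloc");
		return 1;
	}
	printf("key %d: %ld byte payload\n", key, len);
	free(payload);
	return 0;
}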
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * NET Generic infrastructure for Network protocols. * * Definitions for request_sock * * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * From code originally in include/net/tcp.h */ #ifndef _REQUEST_SOCK_H #define _REQUEST_SOCK_H #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/types.h> #include <linux/bug.h> #include <linux/refcount.h> #include <net/sock.h> #include <net/rstreason.h> struct request_sock; struct sk_buff; struct dst_entry; struct proto; struct request_sock_ops { int family; unsigned int obj_size; struct kmem_cache *slab; char *slab_name; void (*send_ack)(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); void (*send_reset)(const struct sock *sk, struct sk_buff *skb, enum sk_rst_reason reason); void (*destructor)(struct request_sock *req); }; struct saved_syn { u32 mac_hdrlen; u32 network_hdrlen; u32 tcp_hdrlen; u8 data[]; }; /* struct request_sock - mini sock to represent a connection request */ struct request_sock { struct sock_common __req_common; #define rsk_refcnt __req_common.skc_refcnt #define rsk_hash __req_common.skc_hash #define rsk_listener __req_common.skc_listener #define rsk_window_clamp __req_common.skc_window_clamp #define rsk_rcv_wnd __req_common.skc_rcv_wnd struct request_sock *dl_next; u16 mss; u8 num_retrans; /* number of retransmits */ u8 syncookie:1; /* True if * 1) tcpopts needs to be encoded in * TS of SYN+ACK * 2) ACK is validated by BPF kfunc. 
*/ u8 num_timeout:7; /* number of timeouts */ u32 ts_recent; struct timer_list rsk_timer; const struct request_sock_ops *rsk_ops; struct sock *sk; struct saved_syn *saved_syn; u32 secid; u32 peer_secid; u32 timeout; }; static inline struct request_sock *inet_reqsk(const struct sock *sk) { return (struct request_sock *)sk; } static inline struct sock *req_to_sk(struct request_sock *req) { return (struct sock *)req; } /** * skb_steal_sock - steal a socket from an sk_buff * @skb: sk_buff to steal the socket from * @refcounted: is set to true if the socket is reference-counted * @prefetched: is set to true if the socket was assigned from bpf */ static inline struct sock *skb_steal_sock(struct sk_buff *skb, bool *refcounted, bool *prefetched) { struct sock *sk = skb->sk; if (!sk) { *prefetched = false; *refcounted = false; return NULL; } *prefetched = skb_sk_is_prefetched(skb); if (*prefetched) { #if IS_ENABLED(CONFIG_SYN_COOKIES) if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) { struct request_sock *req = inet_reqsk(sk); *refcounted = false; sk = req->rsk_listener; req->rsk_listener = NULL; return sk; } #endif *refcounted = sk_is_refcounted(sk); } else { *refcounted = true; } skb->destructor = NULL; skb->sk = NULL; return sk; } void __reqsk_free(struct request_sock *req); static inline void reqsk_free(struct request_sock *req) { DEBUG_NET_WARN_ON_ONCE(refcount_read(&req->rsk_refcnt) != 0); __reqsk_free(req); } static inline void reqsk_put(struct request_sock *req) { if (refcount_dec_and_test(&req->rsk_refcnt)) __reqsk_free(req); } /* * For a TCP Fast Open listener - * lock - protects the access to all the reqsk, which is co-owned by * the listener and the child socket. * qlen - pending TFO requests (still in TCP_SYN_RECV). * max_qlen - max TFO reqs allowed before TFO is disabled. * * XXX (TFO) - ideally these fields can be made as part of "listen_sock" * structure above. But there is some implementation difficulty due to * listen_sock being part of request_sock_queue hence will be freed when * a listener is stopped. But TFO related fields may continue to be * accessed even after a listener is closed, until its sk_refcnt drops * to 0 implying no more outstanding TFO reqs. One solution is to keep * listen_opt around until sk_refcnt drops to 0. But there is some other * complexity that needs to be resolved. E.g., a listener can be disabled * temporarily through shutdown()->tcp_disconnect(), and re-enabled later. */ struct fastopen_queue { struct request_sock *rskq_rst_head; /* Keep track of past TFO */ struct request_sock *rskq_rst_tail; /* requests that caused RST. * This is part of the defense * against spoofing attack. */ spinlock_t lock; int qlen; /* # of pending (TCP_SYN_RECV) reqs */ int max_qlen; /* != 0 iff TFO is currently enabled */ struct tcp_fastopen_context __rcu *ctx; /* cipher context for cookie */ }; /** struct request_sock_queue - queue of request_socks * * @rskq_accept_head - FIFO head of established children * @rskq_accept_tail - FIFO tail of established children * @rskq_defer_accept - User waits for some data after accept() * */ struct request_sock_queue { spinlock_t rskq_lock; u8 rskq_defer_accept; u8 synflood_warned; atomic_t qlen; atomic_t young; struct request_sock *rskq_accept_head; struct request_sock *rskq_accept_tail; struct fastopen_queue fastopenq; /* Check max_qlen != 0 to determine * if TFO is enabled. 
*/ }; void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, bool reset); static inline bool reqsk_queue_empty(const struct request_sock_queue *queue) { return READ_ONCE(queue->rskq_accept_head) == NULL; } static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue *queue, struct sock *parent) { struct request_sock *req; spin_lock_bh(&queue->rskq_lock); req = queue->rskq_accept_head; if (req) { sk_acceptq_removed(parent); WRITE_ONCE(queue->rskq_accept_head, req->dl_next); if (queue->rskq_accept_head == NULL) queue->rskq_accept_tail = NULL; } spin_unlock_bh(&queue->rskq_lock); return req; } static inline void reqsk_queue_removed(struct request_sock_queue *queue, const struct request_sock *req) { if (req->num_timeout == 0) atomic_dec(&queue->young); atomic_dec(&queue->qlen); } static inline void reqsk_queue_added(struct request_sock_queue *queue) { atomic_inc(&queue->young); atomic_inc(&queue->qlen); } static inline int reqsk_queue_len(const struct request_sock_queue *queue) { return atomic_read(&queue->qlen); } static inline int reqsk_queue_len_young(const struct request_sock_queue *queue) { return atomic_read(&queue->young); } /* RFC 7323 2.3 Using the Window Scale Option * The window field (SEG.WND) of every outgoing segment, with the * exception of <SYN> segments, MUST be right-shifted by * Rcv.Wind.Shift bits. * * This means the SEG.WND carried in SYNACK can not exceed 65535. * We use this property to harden TCP stack while in NEW_SYN_RECV state. */ static inline u32 tcp_synack_window(const struct request_sock *req) { return min(req->rsk_rcv_wnd, 65535U); } #endif /* _REQUEST_SOCK_H */ |
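To make the RFC 7323 point behind tcp_synack_window() concrete, here is a small standalone sketch (plain C, not kernel code; the function and values are illustrative) of the arithmetic: a SYN or SYNACK carries its window field unscaled, so the 16-bit field caps it at 65535, while every later segment carries the receive window right-shifted by the negotiated scale. A SYNACK claiming more than 65535 therefore cannot be legitimate, which is why the NEW_SYN_RECV path clamps it.

#include <stdint.h>
#include <stdio.h>

/* Window value placed in the 16-bit SEG.WND field. */
static uint16_t advertised_window(uint32_t rcv_wnd, int rcv_wscale, int is_syn)
{
	if (is_syn)				/* SYN/SYNACK: never scaled */
		return rcv_wnd > 65535 ? 65535 : rcv_wnd;
	return rcv_wnd >> rcv_wscale;		/* established: shifted by scale */
}

int main(void)
{
	/* 256 KiB receive window with a window scale of 3. */
	printf("SYNACK window:   %u\n", (unsigned)advertised_window(262144, 3, 1));
	printf("post-SYN window: %u\n", (unsigned)advertised_window(262144, 3, 0));
	return 0;
}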
// SPDX-License-Identifier: 
GPL-2.0-or-later /* * TUN - Universal TUN/TAP device driver. * Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com> * * $Id: tun.c,v 1.15 2002/03/01 02:44:24 maxk Exp $ */ /* * Changes: * * Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14 * Add TUNSETLINK ioctl to set the link encapsulation * * Mark Smith <markzzzsmith@yahoo.com.au> * Use eth_random_addr() for tap MAC address. * * Harald Roelle <harald.roelle@ifi.lmu.de> 2004/04/20 * Fixes in packet dropping, queue length setting and queue wakeup. * Increased default tx queue length. * Added ethtool API. * Minor cleanups * * Daniel Podlejski <underley@underley.eu.org> * Modifications for 2.3.99-pre5 kernel. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define DRV_NAME "tun" #define DRV_VERSION "1.6" #define DRV_DESCRIPTION "Universal TUN/TAP device driver" #define DRV_COPYRIGHT "(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>" #include <linux/module.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/major.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/fcntl.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/miscdevice.h> #include <linux/ethtool.h> #include <linux/rtnetlink.h> #include <linux/compat.h> #include <linux/if.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/if_tun.h> #include <linux/if_vlan.h> #include <linux/crc32.h> #include <linux/math.h> #include <linux/nsproxy.h> #include <linux/virtio_net.h> #include <linux/rcupdate.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/sock.h> #include <net/xdp.h> #include <net/ip_tunnels.h> #include <linux/seq_file.h> #include <linux/uio.h> #include <linux/skb_array.h> #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <linux/mutex.h> #include <linux/ieee802154.h> #include <uapi/linux/if_ltalk.h> #include <uapi/linux/if_fddi.h> #include <uapi/linux/if_hippi.h> #include <uapi/linux/if_fc.h> #include <net/ax25.h> #include <net/rose.h> #include <net/6lowpan.h> #include <net/rps.h> #include <linux/uaccess.h> #include <linux/proc_fs.h> #include "tun_vnet.h" static void tun_default_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd); #define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) /* TUN device flags */ /* IFF_ATTACH_QUEUE is never stored in device flags, * overload it to mean fasync when stored there. */ #define TUN_FASYNC IFF_ATTACH_QUEUE #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \ IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS) #define GOODCOPY_LEN 128 #define FLT_EXACT_COUNT 8 struct tap_filter { unsigned int count; /* Number of addrs. Zero means disabled */ u32 mask[2]; /* Mask of the hashed addrs */ unsigned char addr[FLT_EXACT_COUNT][ETH_ALEN]; }; /* MAX_TAP_QUEUES 256 is chosen to allow rx/tx queues to be equal * to max number of VCPUs in guest. */ #define MAX_TAP_QUEUES 256 #define MAX_TAP_FLOWS 4096 #define TUN_FLOW_EXPIRE (3 * HZ) /* A tun_file connects an open character device to a tuntap netdevice. It * also contains all socket related structures (except sock_fprog and tap_filter) * to serve as one transmit queue for tuntap device. The sock_fprog and * tap_filter were kept in tun_struct since they were used for filtering for the * netdevice not for a specific queue (at least I didn't see the requirement for * this). 
* * RCU usage: * The tun_file and tun_struct are loosely coupled, the pointer from one to the * other can only be read while rcu_read_lock or rtnl_lock is held. */ struct tun_file { struct sock sk; struct socket socket; struct tun_struct __rcu *tun; struct fasync_struct *fasync; /* only used for fasnyc */ unsigned int flags; union { u16 queue_index; unsigned int ifindex; }; struct napi_struct napi; bool napi_enabled; bool napi_frags_enabled; struct mutex napi_mutex; /* Protects access to the above napi */ struct list_head next; struct tun_struct *detached; struct ptr_ring tx_ring; struct xdp_rxq_info xdp_rxq; }; struct tun_page { struct page *page; int count; }; struct tun_flow_entry { struct hlist_node hash_link; struct rcu_head rcu; struct tun_struct *tun; u32 rxhash; u32 rps_rxhash; int queue_index; unsigned long updated ____cacheline_aligned_in_smp; }; #define TUN_NUM_FLOW_ENTRIES 1024 #define TUN_MASK_FLOW_ENTRIES (TUN_NUM_FLOW_ENTRIES - 1) struct tun_prog { struct rcu_head rcu; struct bpf_prog *prog; }; /* Since the socket were moved to tun_file, to preserve the behavior of persist * device, socket filter, sndbuf and vnet header size were restore when the * file were attached to a persist device. */ struct tun_struct { struct tun_file __rcu *tfiles[MAX_TAP_QUEUES]; unsigned int numqueues; unsigned int flags; kuid_t owner; kgid_t group; struct net_device *dev; netdev_features_t set_features; #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \ NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4 | \ NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_UDP_TUNNEL_CSUM) int align; int vnet_hdr_sz; int sndbuf; struct tap_filter txflt; struct sock_fprog fprog; /* protected by rtnl lock */ bool filter_attached; u32 msg_enable; spinlock_t lock; struct hlist_head flows[TUN_NUM_FLOW_ENTRIES]; struct timer_list flow_gc_timer; unsigned long ageing_time; unsigned int numdisabled; struct list_head disabled; void *security; u32 flow_count; u32 rx_batched; atomic_long_t rx_frame_errors; struct bpf_prog __rcu *xdp_prog; struct tun_prog __rcu *steering_prog; struct tun_prog __rcu *filter_prog; struct ethtool_link_ksettings link_ksettings; /* init args */ struct file *file; struct ifreq *ifr; }; struct veth { __be16 h_vlan_proto; __be16 h_vlan_TCI; }; static void tun_flow_init(struct tun_struct *tun); static void tun_flow_uninit(struct tun_struct *tun); static int tun_napi_receive(struct napi_struct *napi, int budget) { struct tun_file *tfile = container_of(napi, struct tun_file, napi); struct sk_buff_head *queue = &tfile->sk.sk_write_queue; struct sk_buff_head process_queue; struct sk_buff *skb; int received = 0; __skb_queue_head_init(&process_queue); spin_lock(&queue->lock); skb_queue_splice_tail_init(queue, &process_queue); spin_unlock(&queue->lock); while (received < budget && (skb = __skb_dequeue(&process_queue))) { napi_gro_receive(napi, skb); ++received; } if (!skb_queue_empty(&process_queue)) { spin_lock(&queue->lock); skb_queue_splice(&process_queue, queue); spin_unlock(&queue->lock); } return received; } static int tun_napi_poll(struct napi_struct *napi, int budget) { unsigned int received; received = tun_napi_receive(napi, budget); if (received < budget) napi_complete_done(napi, received); return received; } static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile, bool napi_en, bool napi_frags) { tfile->napi_enabled = napi_en; tfile->napi_frags_enabled = napi_en && napi_frags; if (napi_en) { netif_napi_add_tx(tun->dev, &tfile->napi, tun_napi_poll); napi_enable(&tfile->napi); } } static 
void tun_napi_enable(struct tun_file *tfile) { if (tfile->napi_enabled) napi_enable(&tfile->napi); } static void tun_napi_disable(struct tun_file *tfile) { if (tfile->napi_enabled) napi_disable(&tfile->napi); } static void tun_napi_del(struct tun_file *tfile) { if (tfile->napi_enabled) netif_napi_del(&tfile->napi); } static bool tun_napi_frags_enabled(const struct tun_file *tfile) { return tfile->napi_frags_enabled; } static inline u32 tun_hashfn(u32 rxhash) { return rxhash & TUN_MASK_FLOW_ENTRIES; } static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash) { struct tun_flow_entry *e; hlist_for_each_entry_rcu(e, head, hash_link) { if (e->rxhash == rxhash) return e; } return NULL; } static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun, struct hlist_head *head, u32 rxhash, u16 queue_index) { struct tun_flow_entry *e = kmalloc_obj(*e, GFP_ATOMIC); if (e) { netif_info(tun, tx_queued, tun->dev, "create flow: hash %u index %u\n", rxhash, queue_index); e->updated = jiffies; e->rxhash = rxhash; e->rps_rxhash = 0; e->queue_index = queue_index; e->tun = tun; hlist_add_head_rcu(&e->hash_link, head); ++tun->flow_count; } return e; } static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e) { netif_info(tun, tx_queued, tun->dev, "delete flow: hash %u index %u\n", e->rxhash, e->queue_index); hlist_del_rcu(&e->hash_link); kfree_rcu(e, rcu); --tun->flow_count; } static void tun_flow_flush(struct tun_struct *tun) { int i; spin_lock_bh(&tun->lock); for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { struct tun_flow_entry *e; struct hlist_node *n; hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) tun_flow_delete(tun, e); } spin_unlock_bh(&tun->lock); } static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index) { int i; spin_lock_bh(&tun->lock); for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { struct tun_flow_entry *e; struct hlist_node *n; hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { if (e->queue_index == queue_index) tun_flow_delete(tun, e); } } spin_unlock_bh(&tun->lock); } static void tun_flow_cleanup(struct timer_list *t) { struct tun_struct *tun = timer_container_of(tun, t, flow_gc_timer); unsigned long delay = tun->ageing_time; unsigned long next_timer = jiffies + delay; unsigned long count = 0; int i; spin_lock(&tun->lock); for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { struct tun_flow_entry *e; struct hlist_node *n; hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { unsigned long this_timer; this_timer = e->updated + delay; if (time_before_eq(this_timer, jiffies)) { tun_flow_delete(tun, e); continue; } count++; if (time_before(this_timer, next_timer)) next_timer = this_timer; } } if (count) mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer)); spin_unlock(&tun->lock); } static void tun_flow_update(struct tun_struct *tun, u32 rxhash, struct tun_file *tfile) { struct hlist_head *head; struct tun_flow_entry *e; unsigned long delay = tun->ageing_time; u16 queue_index = tfile->queue_index; head = &tun->flows[tun_hashfn(rxhash)]; rcu_read_lock(); e = tun_flow_find(head, rxhash); if (likely(e)) { /* TODO: keep queueing to old queue until it's empty? 
*/ if (READ_ONCE(e->queue_index) != queue_index) WRITE_ONCE(e->queue_index, queue_index); if (e->updated != jiffies) e->updated = jiffies; sock_rps_record_flow_hash(e->rps_rxhash); } else { spin_lock_bh(&tun->lock); if (!tun_flow_find(head, rxhash) && tun->flow_count < MAX_TAP_FLOWS) tun_flow_create(tun, head, rxhash, queue_index); if (!timer_pending(&tun->flow_gc_timer)) mod_timer(&tun->flow_gc_timer, round_jiffies_up(jiffies + delay)); spin_unlock_bh(&tun->lock); } rcu_read_unlock(); } /* Save the hash received in the stack receive path and update the * flow_hash table accordingly. */ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) { if (unlikely(e->rps_rxhash != hash)) e->rps_rxhash = hash; } /* We try to identify a flow through its rxhash. The reason that * we do not check rxq no. is because some cards(e.g 82599), chooses * the rxq based on the txq where the last packet of the flow comes. As * the userspace application move between processors, we may get a * different rxq no. here. */ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) { struct tun_flow_entry *e; u32 txq, numqueues; numqueues = READ_ONCE(tun->numqueues); txq = __skb_get_hash_symmetric(skb); e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); if (e) { tun_flow_save_rps_rxhash(e, txq); txq = e->queue_index; } else { txq = reciprocal_scale(txq, numqueues); } return txq; } static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb) { struct tun_prog *prog; u32 numqueues; u16 ret = 0; numqueues = READ_ONCE(tun->numqueues); if (!numqueues) return 0; prog = rcu_dereference(tun->steering_prog); if (prog) ret = bpf_prog_run_clear_cb(prog->prog, skb); return ret % numqueues; } static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { struct tun_struct *tun = netdev_priv(dev); u16 ret; rcu_read_lock(); if (rcu_dereference(tun->steering_prog)) ret = tun_ebpf_select_queue(tun, skb); else ret = tun_automq_select_queue(tun, skb); rcu_read_unlock(); return ret; } static inline bool tun_not_capable(struct tun_struct *tun) { const struct cred *cred = current_cred(); struct net *net = dev_net(tun->dev); return ((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) || (gid_valid(tun->group) && !in_egroup_p(tun->group))) && !ns_capable(net->user_ns, CAP_NET_ADMIN); } static void tun_set_real_num_queues(struct tun_struct *tun) { netif_set_real_num_tx_queues(tun->dev, tun->numqueues); netif_set_real_num_rx_queues(tun->dev, tun->numqueues); } static void tun_disable_queue(struct tun_struct *tun, struct tun_file *tfile) { tfile->detached = tun; list_add_tail(&tfile->next, &tun->disabled); ++tun->numdisabled; } static struct tun_struct *tun_enable_queue(struct tun_file *tfile) { struct tun_struct *tun = tfile->detached; tfile->detached = NULL; list_del_init(&tfile->next); --tun->numdisabled; return tun; } void tun_ptr_free(void *ptr) { if (!ptr) return; if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); xdp_return_frame(xdpf); } else { __skb_array_destroy_skb(ptr); } } EXPORT_SYMBOL_GPL(tun_ptr_free); static void tun_queue_purge(struct tun_file *tfile) { void *ptr; while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL) tun_ptr_free(ptr); skb_queue_purge(&tfile->sk.sk_write_queue); skb_queue_purge(&tfile->sk.sk_error_queue); } static void __tun_detach(struct tun_file *tfile, bool clean) { struct tun_file *ntfile; struct tun_struct *tun; tun = rtnl_dereference(tfile->tun); if (tun && clean) { 
if (!tfile->detached) tun_napi_disable(tfile); tun_napi_del(tfile); } if (tun && !tfile->detached) { u16 index = tfile->queue_index; BUG_ON(index >= tun->numqueues); rcu_assign_pointer(tun->tfiles[index], tun->tfiles[tun->numqueues - 1]); ntfile = rtnl_dereference(tun->tfiles[index]); ntfile->queue_index = index; ntfile->xdp_rxq.queue_index = index; rcu_assign_pointer(tun->tfiles[tun->numqueues - 1], NULL); --tun->numqueues; if (clean) { RCU_INIT_POINTER(tfile->tun, NULL); sock_put(&tfile->sk); } else { tun_disable_queue(tun, tfile); tun_napi_disable(tfile); } synchronize_net(); tun_flow_delete_by_queue(tun, tun->numqueues + 1); /* Drop read queue */ tun_queue_purge(tfile); tun_set_real_num_queues(tun); } else if (tfile->detached && clean) { tun = tun_enable_queue(tfile); sock_put(&tfile->sk); } if (clean) { if (tun && tun->numqueues == 0 && tun->numdisabled == 0) { netif_carrier_off(tun->dev); if (!(tun->flags & IFF_PERSIST) && tun->dev->reg_state == NETREG_REGISTERED) unregister_netdevice(tun->dev); } if (tun) xdp_rxq_info_unreg(&tfile->xdp_rxq); ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free); } } static void tun_detach(struct tun_file *tfile, bool clean) { struct tun_struct *tun; struct net_device *dev; rtnl_lock(); tun = rtnl_dereference(tfile->tun); dev = tun ? tun->dev : NULL; __tun_detach(tfile, clean); if (dev) netdev_state_change(dev); rtnl_unlock(); if (clean) sock_put(&tfile->sk); } static void tun_detach_all(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); struct tun_file *tfile, *tmp; int i, n = tun->numqueues; for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); BUG_ON(!tfile); tun_napi_disable(tfile); tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN; tfile->socket.sk->sk_data_ready(tfile->socket.sk); RCU_INIT_POINTER(tfile->tun, NULL); --tun->numqueues; } list_for_each_entry(tfile, &tun->disabled, next) { tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN; tfile->socket.sk->sk_data_ready(tfile->socket.sk); RCU_INIT_POINTER(tfile->tun, NULL); } BUG_ON(tun->numqueues != 0); synchronize_net(); for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); tun_napi_del(tfile); /* Drop read queue */ tun_queue_purge(tfile); xdp_rxq_info_unreg(&tfile->xdp_rxq); sock_put(&tfile->sk); } list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) { tun_napi_del(tfile); tun_enable_queue(tfile); tun_queue_purge(tfile); xdp_rxq_info_unreg(&tfile->xdp_rxq); sock_put(&tfile->sk); } BUG_ON(tun->numdisabled != 0); if (tun->flags & IFF_PERSIST) module_put(THIS_MODULE); } static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter, bool napi, bool napi_frags, bool publish_tun) { struct tun_file *tfile = file->private_data; struct net_device *dev = tun->dev; int err; err = security_tun_dev_attach(tfile->socket.sk, tun->security); if (err < 0) goto out; err = -EINVAL; if (rtnl_dereference(tfile->tun) && !tfile->detached) goto out; err = -EBUSY; if (!(tun->flags & IFF_MULTI_QUEUE) && tun->numqueues == 1) goto out; err = -E2BIG; if (!tfile->detached && tun->numqueues + tun->numdisabled == MAX_TAP_QUEUES) goto out; err = 0; /* Re-attach the filter to persist device */ if (!skip_filter && (tun->filter_attached == true)) { lock_sock(tfile->socket.sk); err = sk_attach_filter(&tun->fprog, tfile->socket.sk); release_sock(tfile->socket.sk); if (!err) goto out; } if (!tfile->detached && ptr_ring_resize(&tfile->tx_ring, dev->tx_queue_len, GFP_KERNEL, tun_ptr_free)) { err = -ENOMEM; goto out; } tfile->queue_index = tun->numqueues; 
tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN; if (tfile->detached) { /* Re-attach detached tfile, updating XDP queue_index */ WARN_ON(!xdp_rxq_info_is_reg(&tfile->xdp_rxq)); if (tfile->xdp_rxq.queue_index != tfile->queue_index) tfile->xdp_rxq.queue_index = tfile->queue_index; } else { /* Setup XDP RX-queue info, for new tfile getting attached */ err = xdp_rxq_info_reg(&tfile->xdp_rxq, tun->dev, tfile->queue_index, 0); if (err < 0) goto out; err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq, MEM_TYPE_PAGE_SHARED, NULL); if (err < 0) { xdp_rxq_info_unreg(&tfile->xdp_rxq); goto out; } err = 0; } if (tfile->detached) { tun_enable_queue(tfile); tun_napi_enable(tfile); } else { sock_hold(&tfile->sk); tun_napi_init(tun, tfile, napi, napi_frags); } if (rtnl_dereference(tun->xdp_prog)) sock_set_flag(&tfile->sk, SOCK_XDP); /* device is allowed to go away first, so no need to hold extra * refcnt. */ /* Publish tfile->tun and tun->tfiles only after we've fully * initialized tfile; otherwise we risk using half-initialized * object. */ if (publish_tun) rcu_assign_pointer(tfile->tun, tun); rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile); tun->numqueues++; tun_set_real_num_queues(tun); out: return err; } static struct tun_struct *tun_get(struct tun_file *tfile) { struct tun_struct *tun; rcu_read_lock(); tun = rcu_dereference(tfile->tun); if (tun) dev_hold(tun->dev); rcu_read_unlock(); return tun; } static void tun_put(struct tun_struct *tun) { dev_put(tun->dev); } /* TAP filtering */ static void addr_hash_set(u32 *mask, const u8 *addr) { int n = ether_crc(ETH_ALEN, addr) >> 26; mask[n >> 5] |= (1 << (n & 31)); } static unsigned int addr_hash_test(const u32 *mask, const u8 *addr) { int n = ether_crc(ETH_ALEN, addr) >> 26; return mask[n >> 5] & (1 << (n & 31)); } static int update_filter(struct tap_filter *filter, void __user *arg) { struct { u8 u[ETH_ALEN]; } *addr; struct tun_filter uf; int err, alen, n, nexact; if (copy_from_user(&uf, arg, sizeof(uf))) return -EFAULT; if (!uf.count) { /* Disabled */ filter->count = 0; return 0; } alen = ETH_ALEN * uf.count; addr = memdup_user(arg + sizeof(uf), alen); if (IS_ERR(addr)) return PTR_ERR(addr); /* The filter is updated without holding any locks. Which is * perfectly safe. We disable it first and in the worst * case we'll accept a few undesired packets. */ filter->count = 0; wmb(); /* Use first set of addresses as an exact filter */ for (n = 0; n < uf.count && n < FLT_EXACT_COUNT; n++) memcpy(filter->addr[n], addr[n].u, ETH_ALEN); nexact = n; /* Remaining multicast addresses are hashed, * unicast will leave the filter disabled. */ memset(filter->mask, 0, sizeof(filter->mask)); for (; n < uf.count; n++) { if (!is_multicast_ether_addr(addr[n].u)) { err = 0; /* no filter */ goto free_addr; } addr_hash_set(filter->mask, addr[n].u); } /* For ALLMULTI just set the mask to all ones. * This overrides the mask populated above. */ if ((uf.flags & TUN_FLT_ALLMULTI)) memset(filter->mask, ~0, sizeof(filter->mask)); /* Now enable the filter */ wmb(); filter->count = nexact; /* Return the number of exact filters */ err = nexact; free_addr: kfree(addr); return err; } /* Returns: 0 - drop, !=0 - accept */ static int run_filter(struct tap_filter *filter, const struct sk_buff *skb) { /* Cannot use eth_hdr(skb) here because skb_mac_hdr() is incorrect * at this point. 
*/ struct ethhdr *eh = (struct ethhdr *) skb->data; int i; /* Exact match */ for (i = 0; i < filter->count; i++) if (ether_addr_equal(eh->h_dest, filter->addr[i])) return 1; /* Inexact match (multicast only) */ if (is_multicast_ether_addr(eh->h_dest)) return addr_hash_test(filter->mask, eh->h_dest); return 0; } /* * Checks whether the packet is accepted or not. * Returns: 0 - drop, !=0 - accept */ static int check_filter(struct tap_filter *filter, const struct sk_buff *skb) { if (!filter->count) return 1; return run_filter(filter, skb); } /* Network device part of the driver */ static const struct ethtool_ops tun_ethtool_ops; static int tun_net_init(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); struct ifreq *ifr = tun->ifr; int err; spin_lock_init(&tun->lock); err = security_tun_dev_alloc_security(&tun->security); if (err < 0) return err; tun_flow_init(tun); dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; dev->hw_enc_features = dev->hw_features; dev->features = dev->hw_features; dev->vlan_features = dev->features & ~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); dev->lltx = true; tun->flags = (tun->flags & ~TUN_FEATURES) | (ifr->ifr_flags & TUN_FEATURES); INIT_LIST_HEAD(&tun->disabled); err = tun_attach(tun, tun->file, false, ifr->ifr_flags & IFF_NAPI, ifr->ifr_flags & IFF_NAPI_FRAGS, false); if (err < 0) { tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); return err; } return 0; } /* Net device detach from fd. */ static void tun_net_uninit(struct net_device *dev) { tun_detach_all(dev); } /* Net device open. */ static int tun_net_open(struct net_device *dev) { netif_tx_start_all_queues(dev); return 0; } /* Net device close. */ static int tun_net_close(struct net_device *dev) { netif_tx_stop_all_queues(dev); return 0; } /* Net device start xmit */ static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb) { #ifdef CONFIG_RPS if (tun->numqueues == 1 && static_branch_unlikely(&rps_needed)) { /* Select queue was not called for the skbuff, so we extract the * RPS hash and save it into the flow_table here. */ struct tun_flow_entry *e; __u32 rxhash; rxhash = __skb_get_hash_symmetric(skb); e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)], rxhash); if (e) tun_flow_save_rps_rxhash(e, rxhash); } #endif } static unsigned int run_ebpf_filter(struct tun_struct *tun, struct sk_buff *skb, int len) { struct tun_prog *prog = rcu_dereference(tun->filter_prog); if (prog) len = bpf_prog_run_clear_cb(prog->prog, skb); return len; } /* Net device start xmit */ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) { enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; struct tun_struct *tun = netdev_priv(dev); int txq = skb->queue_mapping; struct netdev_queue *queue; struct tun_file *tfile; int len = skb->len; rcu_read_lock(); tfile = rcu_dereference(tun->tfiles[txq]); /* Drop packet if interface is not attached */ if (!tfile) { drop_reason = SKB_DROP_REASON_DEV_READY; goto drop; } if (!rcu_dereference(tun->steering_prog)) tun_automq_xmit(tun, skb); netif_info(tun, tx_queued, tun->dev, "%s %d\n", __func__, skb->len); /* Drop if the filter does not like it. * This is a noop if the filter is disabled. * Filter can be enabled only for the TAP devices. 
*/ if (!check_filter(&tun->txflt, skb)) { drop_reason = SKB_DROP_REASON_TAP_TXFILTER; goto drop; } if (tfile->socket.sk->sk_filter) { drop_reason = sk_filter_reason(tfile->socket.sk, skb); if (drop_reason) goto drop; } len = run_ebpf_filter(tun, skb, len); if (len == 0) { drop_reason = SKB_DROP_REASON_TAP_FILTER; goto drop; } if (pskb_trim(skb, len)) { drop_reason = SKB_DROP_REASON_NOMEM; goto drop; } if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) { drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT; goto drop; } skb_tx_timestamp(skb); /* Orphan the skb - required as we might hang on to it * for indefinite time. */ skb_orphan(skb); nf_reset_ct(skb); if (ptr_ring_produce(&tfile->tx_ring, skb)) { drop_reason = SKB_DROP_REASON_FULL_RING; goto drop; } /* dev->lltx requires to do our own update of trans_start */ queue = netdev_get_tx_queue(dev, txq); txq_trans_cond_update(queue); /* Notify and wake up reader process */ if (tfile->flags & TUN_FASYNC) kill_fasync(&tfile->fasync, SIGIO, POLL_IN); tfile->socket.sk->sk_data_ready(tfile->socket.sk); rcu_read_unlock(); return NETDEV_TX_OK; drop: dev_core_stats_tx_dropped_inc(dev); skb_tx_error(skb); kfree_skb_reason(skb, drop_reason); rcu_read_unlock(); return NET_XMIT_DROP; } static void tun_net_mclist(struct net_device *dev) { /* * This callback is supposed to deal with mc filter in * _rx_ path and has nothing to do with the _tx_ path. * In rx path we always accept everything userspace gives us. */ } static netdev_features_t tun_net_fix_features(struct net_device *dev, netdev_features_t features) { struct tun_struct *tun = netdev_priv(dev); return (features & tun->set_features) | (features & ~TUN_USER_FEATURES); } static void tun_set_headroom(struct net_device *dev, int new_hr) { struct tun_struct *tun = netdev_priv(dev); if (new_hr < NET_SKB_PAD) new_hr = NET_SKB_PAD; tun->align = new_hr; } static void tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct tun_struct *tun = netdev_priv(dev); dev_get_tstats64(dev, stats); stats->rx_frame_errors += (unsigned long)atomic_long_read(&tun->rx_frame_errors); } static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog, struct netlink_ext_ack *extack) { struct tun_struct *tun = netdev_priv(dev); struct tun_file *tfile; struct bpf_prog *old_prog; int i; old_prog = rtnl_dereference(tun->xdp_prog); rcu_assign_pointer(tun->xdp_prog, prog); if (old_prog) bpf_prog_put(old_prog); for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); if (prog) sock_set_flag(&tfile->sk, SOCK_XDP); else sock_reset_flag(&tfile->sk, SOCK_XDP); } list_for_each_entry(tfile, &tun->disabled, next) { if (prog) sock_set_flag(&tfile->sk, SOCK_XDP); else sock_reset_flag(&tfile->sk, SOCK_XDP); } return 0; } static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return tun_xdp_set(dev, xdp->prog, xdp->extack); default: return -EINVAL; } } static int tun_net_change_carrier(struct net_device *dev, bool new_carrier) { if (new_carrier) { struct tun_struct *tun = netdev_priv(dev); if (!tun->numqueues) return -EPERM; netif_carrier_on(dev); } else { netif_carrier_off(dev); } return 0; } static const struct net_device_ops tun_netdev_ops = { .ndo_init = tun_net_init, .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, .ndo_stop = tun_net_close, .ndo_start_xmit = tun_net_xmit, .ndo_fix_features = tun_net_fix_features, .ndo_select_queue = tun_select_queue, .ndo_set_rx_headroom = tun_set_headroom, .ndo_get_stats64 = 
tun_net_get_stats64, .ndo_change_carrier = tun_net_change_carrier, }; static void __tun_xdp_flush_tfile(struct tun_file *tfile) { /* Notify and wake up reader process */ if (tfile->flags & TUN_FASYNC) kill_fasync(&tfile->fasync, SIGIO, POLL_IN); tfile->socket.sk->sk_data_ready(tfile->socket.sk); } static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags) { struct tun_struct *tun = netdev_priv(dev); struct tun_file *tfile; u32 numqueues; int nxmit = 0; int i; if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) return -EINVAL; rcu_read_lock(); resample: numqueues = READ_ONCE(tun->numqueues); if (!numqueues) { rcu_read_unlock(); return -ENXIO; /* Caller will free/return all frames */ } tfile = rcu_dereference(tun->tfiles[smp_processor_id() % numqueues]); if (unlikely(!tfile)) goto resample; spin_lock(&tfile->tx_ring.producer_lock); for (i = 0; i < n; i++) { struct xdp_frame *xdp = frames[i]; /* Encode the XDP flag into lowest bit for consumer to differ * XDP buffer from sk_buff. */ void *frame = tun_xdp_to_ptr(xdp); if (__ptr_ring_produce(&tfile->tx_ring, frame)) { dev_core_stats_tx_dropped_inc(dev); break; } nxmit++; } spin_unlock(&tfile->tx_ring.producer_lock); if (flags & XDP_XMIT_FLUSH) __tun_xdp_flush_tfile(tfile); rcu_read_unlock(); return nxmit; } static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) { struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp); int nxmit; if (unlikely(!frame)) return -EOVERFLOW; nxmit = tun_xdp_xmit(dev, 1, &frame, XDP_XMIT_FLUSH); if (!nxmit) xdp_return_frame_rx_napi(frame); return nxmit; } static const struct net_device_ops tap_netdev_ops = { .ndo_init = tun_net_init, .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, .ndo_stop = tun_net_close, .ndo_start_xmit = tun_net_xmit, .ndo_fix_features = tun_net_fix_features, .ndo_set_rx_mode = tun_net_mclist, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_select_queue = tun_select_queue, .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom = tun_set_headroom, .ndo_bpf = tun_xdp, .ndo_xdp_xmit = tun_xdp_xmit, .ndo_change_carrier = tun_net_change_carrier, }; static void tun_flow_init(struct tun_struct *tun) { int i; for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) INIT_HLIST_HEAD(&tun->flows[i]); tun->ageing_time = TUN_FLOW_EXPIRE; timer_setup(&tun->flow_gc_timer, tun_flow_cleanup, 0); mod_timer(&tun->flow_gc_timer, round_jiffies_up(jiffies + tun->ageing_time)); } static void tun_flow_uninit(struct tun_struct *tun) { timer_delete_sync(&tun->flow_gc_timer); tun_flow_flush(tun); } #define MIN_MTU 68 #define MAX_MTU 65535 /* Initialize net device. */ static void tun_net_initialize(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: dev->netdev_ops = &tun_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; /* Point-to-Point TUN Device */ dev->hard_header_len = 0; dev->addr_len = 0; dev->mtu = 1500; /* Zero header length */ dev->type = ARPHRD_NONE; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; break; case IFF_TAP: dev->netdev_ops = &tap_netdev_ops; /* Ethernet TAP Device */ ether_setup(dev); dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; eth_hw_addr_random(dev); /* Currently tun does not support XDP, only tap does. 
*/ dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_NDO_XMIT; break; } dev->min_mtu = MIN_MTU; dev->max_mtu = MAX_MTU - dev->hard_header_len; } static bool tun_sock_writeable(struct tun_struct *tun, struct tun_file *tfile) { struct sock *sk = tfile->socket.sk; return (tun->dev->flags & IFF_UP) && sock_writeable(sk); } /* Character device part */ /* Poll */ static __poll_t tun_chr_poll(struct file *file, poll_table *wait) { struct tun_file *tfile = file->private_data; struct tun_struct *tun = tun_get(tfile); struct sock *sk; __poll_t mask = 0; if (!tun) return EPOLLERR; sk = tfile->socket.sk; poll_wait(file, sk_sleep(sk), wait); if (!ptr_ring_empty(&tfile->tx_ring)) mask |= EPOLLIN | EPOLLRDNORM; /* Make sure SOCKWQ_ASYNC_NOSPACE is set if not writable to * guarantee EPOLLOUT to be raised by either here or * tun_sock_write_space(). Then process could get notification * after it writes to a down device and meets -EIO. */ if (tun_sock_writeable(tun, tfile) || (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) && tun_sock_writeable(tun, tfile))) mask |= EPOLLOUT | EPOLLWRNORM; if (tun->dev->reg_state != NETREG_REGISTERED) mask = EPOLLERR; tun_put(tun); return mask; } static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile, size_t len, const struct iov_iter *it) { struct sk_buff *skb; size_t linear; int err; int i; if (it->nr_segs > MAX_SKB_FRAGS + 1 || len > (ETH_MAX_MTU - NET_SKB_PAD - NET_IP_ALIGN)) return ERR_PTR(-EMSGSIZE); local_bh_disable(); skb = napi_get_frags(&tfile->napi); local_bh_enable(); if (!skb) return ERR_PTR(-ENOMEM); linear = iov_iter_single_seg_count(it); err = __skb_grow(skb, linear); if (err) goto free; skb->len = len; skb->data_len = len - linear; skb->truesize += skb->data_len; for (i = 1; i < it->nr_segs; i++) { const struct iovec *iov = iter_iov(it) + i; size_t fragsz = iov->iov_len; struct page *page; void *frag; if (fragsz == 0 || fragsz > PAGE_SIZE) { err = -EINVAL; goto free; } frag = netdev_alloc_frag(fragsz); if (!frag) { err = -ENOMEM; goto free; } page = virt_to_head_page(frag); skb_fill_page_desc(skb, i - 1, page, frag - page_address(page), fragsz); } return skb; free: /* frees skb and all frags allocated with napi_alloc_frag() */ napi_free_frags(&tfile->napi); return ERR_PTR(err); } /* prepad is the amount to reserve at front. len is length after that. * linear is a hint as to how much to copy (usually headers). */ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile, size_t prepad, size_t len, size_t linear, int noblock) { struct sock *sk = tfile->socket.sk; struct sk_buff *skb; int err; /* Under a page? Don't bother with paged skb. 
*/ if (prepad + len < PAGE_SIZE) linear = len; if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER); skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, &err, PAGE_ALLOC_COSTLY_ORDER); if (!skb) return ERR_PTR(err); skb_reserve(skb, prepad); skb_put(skb, linear); skb->data_len = len - linear; skb->len += len - linear; return skb; } static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile, struct sk_buff *skb, int more) { struct sk_buff_head *queue = &tfile->sk.sk_write_queue; struct sk_buff_head process_queue; u32 rx_batched = tun->rx_batched; bool rcv = false; if (!rx_batched || (!more && skb_queue_empty(queue))) { local_bh_disable(); skb_record_rx_queue(skb, tfile->queue_index); netif_receive_skb(skb); local_bh_enable(); return; } spin_lock(&queue->lock); if (!more || skb_queue_len(queue) == rx_batched) { __skb_queue_head_init(&process_queue); skb_queue_splice_tail_init(queue, &process_queue); rcv = true; } else { __skb_queue_tail(queue, skb); } spin_unlock(&queue->lock); if (rcv) { struct sk_buff *nskb; local_bh_disable(); while ((nskb = __skb_dequeue(&process_queue))) { skb_record_rx_queue(nskb, tfile->queue_index); netif_receive_skb(nskb); } skb_record_rx_queue(skb, tfile->queue_index); netif_receive_skb(skb); local_bh_enable(); } } static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile, int len, int noblock, bool zerocopy) { if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) return false; if (tfile->socket.sk->sk_sndbuf != INT_MAX) return false; if (!noblock) return false; if (zerocopy) return false; if (SKB_DATA_ALIGN(len + TUN_RX_PAD + XDP_PACKET_HEADROOM) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE) return false; return true; } static struct sk_buff *__tun_build_skb(struct tun_file *tfile, struct page_frag *alloc_frag, char *buf, int buflen, int len, int pad, int metasize) { struct sk_buff *skb = build_skb(buf, buflen); if (!skb) return ERR_PTR(-ENOMEM); skb_reserve(skb, pad); skb_put(skb, len); if (metasize) skb_metadata_set(skb, metasize); skb_set_owner_w(skb, tfile->socket.sk); get_page(alloc_frag->page); alloc_frag->offset += buflen; return skb; } static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog, struct xdp_buff *xdp, u32 act) { int err; switch (act) { case XDP_REDIRECT: err = xdp_do_redirect(tun->dev, xdp, xdp_prog); if (err) { dev_core_stats_rx_dropped_inc(tun->dev); return err; } dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data); break; case XDP_TX: err = tun_xdp_tx(tun->dev, xdp); if (err < 0) { dev_core_stats_rx_dropped_inc(tun->dev); return err; } dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data); break; case XDP_PASS: break; default: bpf_warn_invalid_xdp_action(tun->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(tun->dev, xdp_prog, act); fallthrough; case XDP_DROP: dev_core_stats_rx_dropped_inc(tun->dev); break; } return act; } static struct sk_buff *tun_build_skb(struct tun_struct *tun, struct tun_file *tfile, struct iov_iter *from, struct virtio_net_hdr *hdr, int len, int *skb_xdp) { struct page_frag *alloc_frag = &current->task_frag; struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct bpf_prog *xdp_prog; int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); char *buf; size_t copied; int pad = TUN_RX_PAD; int metasize = 0; int err = 0; rcu_read_lock(); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) pad += 
XDP_PACKET_HEADROOM; buflen += SKB_DATA_ALIGN(len + pad); rcu_read_unlock(); alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES); if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL))) return ERR_PTR(-ENOMEM); buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; copied = copy_page_from_iter(alloc_frag->page, alloc_frag->offset + pad, len, from); if (copied != len) return ERR_PTR(-EFAULT); /* There's a small window that XDP may be set after the check * of xdp_prog above, this should be rare and for simplicity * we do XDP on skb in case the headroom is not enough. */ if (hdr->gso_type || !xdp_prog) { *skb_xdp = 1; return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad, metasize); } *skb_xdp = 0; local_bh_disable(); rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) { struct xdp_buff xdp; u32 act; xdp_init_buff(&xdp, buflen, &tfile->xdp_rxq); xdp_prepare_buff(&xdp, buf, pad, len, true); act = bpf_prog_run_xdp(xdp_prog, &xdp); if (act == XDP_REDIRECT || act == XDP_TX) { get_page(alloc_frag->page); alloc_frag->offset += buflen; } err = tun_xdp_act(tun, xdp_prog, &xdp, act); if (err < 0) { if (act == XDP_REDIRECT || act == XDP_TX) put_page(alloc_frag->page); goto out; } if (err == XDP_REDIRECT) xdp_do_flush(); if (err != XDP_PASS) goto out; pad = xdp.data - xdp.data_hard_start; len = xdp.data_end - xdp.data; /* It is known that the xdp_buff was prepared with metadata * support, so the metasize will never be negative. */ metasize = xdp.data - xdp.data_meta; } bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); local_bh_enable(); return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad, metasize); out: bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); local_bh_enable(); return NULL; } /* Get packet from user space buffer */ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, void *msg_control, struct iov_iter *from, int noblock, bool more) { struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; struct sk_buff *skb; size_t total_len = iov_iter_count(from); size_t len = total_len, align = tun->align, linear; struct virtio_net_hdr_v1_hash_tunnel hdr; struct virtio_net_hdr *gso; int good_linear; int copylen; int hdr_len = 0; bool zerocopy = false; int err; u32 rxhash = 0; int skb_xdp = 1; bool frags = tun_napi_frags_enabled(tfile); enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; netdev_features_t features = 0; /* * Keep it easy and always zero the whole buffer, even if the * tunnel-related field will be touched only when the feature * is enabled and the hdr size id compatible. */ memset(&hdr, 0, sizeof(hdr)); gso = (struct virtio_net_hdr *)&hdr; if (!(tun->flags & IFF_NO_PI)) { if (len < sizeof(pi)) return -EINVAL; len -= sizeof(pi); if (!copy_from_iter_full(&pi, sizeof(pi), from)) return -EFAULT; } if (tun->flags & IFF_VNET_HDR) { int vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); features = tun_vnet_hdr_guest_features(vnet_hdr_sz); hdr_len = __tun_vnet_hdr_get(vnet_hdr_sz, tun->flags, features, from, gso); if (hdr_len < 0) return hdr_len; len -= vnet_hdr_sz; } if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) { align += NET_IP_ALIGN; if (unlikely(len < ETH_HLEN || (hdr_len && hdr_len < ETH_HLEN))) return -EINVAL; } good_linear = SKB_MAX_HEAD(align); if (msg_control) { struct iov_iter i = *from; /* There are 256 bytes to be copied in skb, so there is * enough room for skb expand head in case it is used. 
* The rest of the buffer is mapped from userspace. */ copylen = min(hdr_len ? hdr_len : GOODCOPY_LEN, good_linear); linear = copylen; iov_iter_advance(&i, copylen); if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS) zerocopy = true; } if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) { /* For the packet that is not easy to be processed * (e.g gso or jumbo packet), we will do it at after * skb was created with generic XDP routine. */ skb = tun_build_skb(tun, tfile, from, gso, len, &skb_xdp); err = PTR_ERR_OR_ZERO(skb); if (err) goto drop; if (!skb) return total_len; } else { if (!zerocopy) { copylen = len; linear = min(hdr_len, good_linear); } if (frags) { mutex_lock(&tfile->napi_mutex); skb = tun_napi_alloc_frags(tfile, copylen, from); /* tun_napi_alloc_frags() enforces a layout for the skb. * If zerocopy is enabled, then this layout will be * overwritten by zerocopy_sg_from_iter(). */ zerocopy = false; } else { if (!linear) linear = min_t(size_t, good_linear, copylen); skb = tun_alloc_skb(tfile, align, copylen, linear, noblock); } err = PTR_ERR_OR_ZERO(skb); if (err) goto drop; if (zerocopy) err = zerocopy_sg_from_iter(skb, from); else err = skb_copy_datagram_from_iter(skb, 0, from, len); if (err) { err = -EFAULT; drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT; goto drop; } } if (tun_vnet_hdr_tnl_to_skb(tun->flags, features, skb, &hdr)) { atomic_long_inc(&tun->rx_frame_errors); err = -EINVAL; goto free_skb; } switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: if (tun->flags & IFF_NO_PI) { u8 ip_version = skb->len ? (skb->data[0] >> 4) : 0; switch (ip_version) { case 4: pi.proto = htons(ETH_P_IP); break; case 6: pi.proto = htons(ETH_P_IPV6); break; default: err = -EINVAL; goto drop; } } skb_reset_mac_header(skb); skb->protocol = pi.proto; skb->dev = tun->dev; break; case IFF_TAP: if (frags && !pskb_may_pull(skb, ETH_HLEN)) { err = -ENOMEM; drop_reason = SKB_DROP_REASON_HDR_TRUNC; goto drop; } skb->protocol = eth_type_trans(skb, tun->dev); break; } /* copy skb_ubuf_info for callback when skb has no error */ if (zerocopy) { skb_zcopy_init(skb, msg_control); } else if (msg_control) { struct ubuf_info *uarg = msg_control; uarg->ops->complete(NULL, uarg, false); } skb_reset_network_header(skb); skb_probe_transport_header(skb); skb_record_rx_queue(skb, tfile->queue_index); if (skb_xdp) { struct bpf_prog *xdp_prog; int ret; local_bh_disable(); rcu_read_lock(); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) { ret = do_xdp_generic(xdp_prog, &skb); if (ret != XDP_PASS) { rcu_read_unlock(); local_bh_enable(); goto unlock_frags; } if (frags && skb != tfile->napi.skb) tfile->napi.skb = skb; } rcu_read_unlock(); local_bh_enable(); } /* Compute the costly rx hash only if needed for flow updates. * We may get a very small possibility of OOO during switching, not * worth to optimize. */ if (!rcu_access_pointer(tun->steering_prog) && tun->numqueues > 1 && !tfile->detached) rxhash = __skb_get_hash_symmetric(skb); rcu_read_lock(); if (unlikely(!(tun->dev->flags & IFF_UP))) { err = -EIO; rcu_read_unlock(); drop_reason = SKB_DROP_REASON_DEV_READY; goto drop; } if (frags) { u32 headlen; /* Exercise flow dissector code path. 
*/ skb_push(skb, ETH_HLEN); headlen = eth_get_headlen(tun->dev, skb->data, skb_headlen(skb)); if (unlikely(headlen > skb_headlen(skb))) { WARN_ON_ONCE(1); err = -ENOMEM; dev_core_stats_rx_dropped_inc(tun->dev); napi_busy: napi_free_frags(&tfile->napi); rcu_read_unlock(); mutex_unlock(&tfile->napi_mutex); return err; } if (likely(napi_schedule_prep(&tfile->napi))) { local_bh_disable(); napi_gro_frags(&tfile->napi); napi_complete(&tfile->napi); local_bh_enable(); } else { err = -EBUSY; goto napi_busy; } mutex_unlock(&tfile->napi_mutex); } else if (tfile->napi_enabled) { struct sk_buff_head *queue = &tfile->sk.sk_write_queue; int queue_len; spin_lock_bh(&queue->lock); if (unlikely(tfile->detached)) { spin_unlock_bh(&queue->lock); rcu_read_unlock(); err = -EBUSY; goto free_skb; } __skb_queue_tail(queue, skb); queue_len = skb_queue_len(queue); spin_unlock(&queue->lock); if (!more || queue_len > NAPI_POLL_WEIGHT) napi_schedule(&tfile->napi); local_bh_enable(); } else if (!IS_ENABLED(CONFIG_4KSTACKS)) { tun_rx_batched(tun, tfile, skb, more); } else { netif_rx(skb); } rcu_read_unlock(); preempt_disable(); dev_sw_netstats_rx_add(tun->dev, len); preempt_enable(); if (rxhash) tun_flow_update(tun, rxhash, tfile); return total_len; drop: if (err != -EAGAIN) dev_core_stats_rx_dropped_inc(tun->dev); free_skb: if (!IS_ERR_OR_NULL(skb)) kfree_skb_reason(skb, drop_reason); unlock_frags: if (frags) { tfile->napi.skb = NULL; mutex_unlock(&tfile->napi_mutex); } return err ?: total_len; } static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct tun_file *tfile = file->private_data; struct tun_struct *tun = tun_get(tfile); ssize_t result; int noblock = 0; if (!tun) return -EBADFD; if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) noblock = 1; result = tun_get_user(tun, tfile, NULL, from, noblock, false); tun_put(tun); return result; } static ssize_t tun_put_user_xdp(struct tun_struct *tun, struct tun_file *tfile, struct xdp_frame *xdp_frame, struct iov_iter *iter) { int vnet_hdr_sz = 0; size_t size = xdp_frame->len; ssize_t ret; if (tun->flags & IFF_VNET_HDR) { struct virtio_net_hdr gso = { 0 }; vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso); if (ret) return ret; } ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz; preempt_disable(); dev_sw_netstats_tx_add(tun->dev, 1, ret); preempt_enable(); return ret; } /* Put packet to the user space buffer */ static ssize_t tun_put_user(struct tun_struct *tun, struct tun_file *tfile, struct sk_buff *skb, struct iov_iter *iter) { struct tun_pi pi = { 0, skb->protocol }; ssize_t total; int vlan_offset = 0; int vlan_hlen = 0; int vnet_hdr_sz = 0; int ret; if (skb_vlan_tag_present(skb)) vlan_hlen = VLAN_HLEN; if (tun->flags & IFF_VNET_HDR) vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); total = skb->len + vlan_hlen + vnet_hdr_sz; if (!(tun->flags & IFF_NO_PI)) { if (iov_iter_count(iter) < sizeof(pi)) return -EINVAL; total += sizeof(pi); if (iov_iter_count(iter) < total) { /* Packet will be striped */ pi.flags |= TUN_PKT_STRIP; } if (copy_to_iter(&pi, sizeof(pi), iter) != sizeof(pi)) return -EFAULT; } if (vnet_hdr_sz) { struct virtio_net_hdr_v1_hash_tunnel hdr; struct virtio_net_hdr *gso; ret = tun_vnet_hdr_tnl_from_skb(tun->flags, tun->dev, skb, &hdr); if (ret) return ret; /* * Drop the packet if the configured header size is too small * WRT the enabled offloads. 
*/ gso = (struct virtio_net_hdr *)&hdr; ret = __tun_vnet_hdr_put(vnet_hdr_sz, tun->dev->features, iter, gso); if (ret) return ret; } if (vlan_hlen) { int ret; struct veth veth; veth.h_vlan_proto = skb->vlan_proto; veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset); if (ret || !iov_iter_count(iter)) goto done; ret = copy_to_iter(&veth, sizeof(veth), iter); if (ret != sizeof(veth) || !iov_iter_count(iter)) goto done; } skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset); done: /* caller is in process context, */ preempt_disable(); dev_sw_netstats_tx_add(tun->dev, 1, skb->len + vlan_hlen); preempt_enable(); return total; } static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err) { DECLARE_WAITQUEUE(wait, current); void *ptr = NULL; int error = 0; ptr = ptr_ring_consume(&tfile->tx_ring); if (ptr) goto out; if (noblock) { error = -EAGAIN; goto out; } add_wait_queue(&tfile->socket.wq.wait, &wait); while (1) { set_current_state(TASK_INTERRUPTIBLE); ptr = ptr_ring_consume(&tfile->tx_ring); if (ptr) break; if (signal_pending(current)) { error = -ERESTARTSYS; break; } if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) { error = -EFAULT; break; } schedule(); } __set_current_state(TASK_RUNNING); remove_wait_queue(&tfile->socket.wq.wait, &wait); out: *err = error; return ptr; } static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, struct iov_iter *to, int noblock, void *ptr) { ssize_t ret; int err; if (!iov_iter_count(to)) { tun_ptr_free(ptr); return 0; } if (!ptr) { /* Read frames from ring */ ptr = tun_ring_recv(tfile, noblock, &err); if (!ptr) return err; } if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); ret = tun_put_user_xdp(tun, tfile, xdpf, to); xdp_return_frame(xdpf); } else { struct sk_buff *skb = ptr; ret = tun_put_user(tun, tfile, skb, to); if (unlikely(ret < 0)) kfree_skb(skb); else consume_skb(skb); } return ret; } static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct tun_file *tfile = file->private_data; struct tun_struct *tun = tun_get(tfile); ssize_t len = iov_iter_count(to), ret; int noblock = 0; if (!tun) return -EBADFD; if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) noblock = 1; ret = tun_do_read(tun, tfile, to, noblock, NULL); ret = min_t(ssize_t, ret, len); if (ret > 0) iocb->ki_pos = ret; tun_put(tun); return ret; } static void tun_prog_free(struct rcu_head *rcu) { struct tun_prog *prog = container_of(rcu, struct tun_prog, rcu); bpf_prog_destroy(prog->prog); kfree(prog); } static int __tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, struct bpf_prog *prog) { struct tun_prog *old, *new = NULL; if (prog) { new = kmalloc_obj(*new); if (!new) return -ENOMEM; new->prog = prog; } spin_lock_bh(&tun->lock); old = rcu_dereference_protected(*prog_p, lockdep_is_held(&tun->lock)); rcu_assign_pointer(*prog_p, new); spin_unlock_bh(&tun->lock); if (old) call_rcu(&old->rcu, tun_prog_free); return 0; } static void tun_free_netdev(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); BUG_ON(!(list_empty(&tun->disabled))); tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); __tun_set_ebpf(tun, &tun->steering_prog, NULL); __tun_set_ebpf(tun, &tun->filter_prog, NULL); } static void tun_setup(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); tun->owner = 
INVALID_UID; tun->group = INVALID_GID; tun_default_link_ksettings(dev, &tun->link_ksettings); dev->ethtool_ops = &tun_ethtool_ops; dev->needs_free_netdev = true; dev->priv_destructor = tun_free_netdev; /* We prefer our own queue length */ dev->tx_queue_len = TUN_READQ_SIZE; } /* Trivial set of netlink ops to allow deleting tun or tap * device with netlink. */ static int tun_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "tun/tap creation via rtnetlink is not supported."); return -EOPNOTSUPP; } static size_t tun_get_size(const struct net_device *dev) { BUILD_BUG_ON(sizeof(u32) != sizeof(uid_t)); BUILD_BUG_ON(sizeof(u32) != sizeof(gid_t)); return nla_total_size(sizeof(uid_t)) + /* OWNER */ nla_total_size(sizeof(gid_t)) + /* GROUP */ nla_total_size(sizeof(u8)) + /* TYPE */ nla_total_size(sizeof(u8)) + /* PI */ nla_total_size(sizeof(u8)) + /* VNET_HDR */ nla_total_size(sizeof(u8)) + /* PERSIST */ nla_total_size(sizeof(u8)) + /* MULTI_QUEUE */ nla_total_size(sizeof(u32)) + /* NUM_QUEUES */ nla_total_size(sizeof(u32)) + /* NUM_DISABLED_QUEUES */ 0; } static int tun_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); if (nla_put_u8(skb, IFLA_TUN_TYPE, tun->flags & TUN_TYPE_MASK)) goto nla_put_failure; if (uid_valid(tun->owner) && nla_put_u32(skb, IFLA_TUN_OWNER, from_kuid_munged(current_user_ns(), tun->owner))) goto nla_put_failure; if (gid_valid(tun->group) && nla_put_u32(skb, IFLA_TUN_GROUP, from_kgid_munged(current_user_ns(), tun->group))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_PI, !(tun->flags & IFF_NO_PI))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_VNET_HDR, !!(tun->flags & IFF_VNET_HDR))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_PERSIST, !!(tun->flags & IFF_PERSIST))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_MULTI_QUEUE, !!(tun->flags & IFF_MULTI_QUEUE))) goto nla_put_failure; if (tun->flags & IFF_MULTI_QUEUE) { if (nla_put_u32(skb, IFLA_TUN_NUM_QUEUES, tun->numqueues)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_TUN_NUM_DISABLED_QUEUES, tun->numdisabled)) goto nla_put_failure; } return 0; nla_put_failure: return -EMSGSIZE; } static struct rtnl_link_ops tun_link_ops __read_mostly = { .kind = DRV_NAME, .priv_size = sizeof(struct tun_struct), .setup = tun_setup, .validate = tun_validate, .get_size = tun_get_size, .fill_info = tun_fill_info, }; static void tun_sock_write_space(struct sock *sk) { struct tun_file *tfile; wait_queue_head_t *wqueue; if (!sock_writeable(sk)) return; if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags)) return; wqueue = sk_sleep(sk); if (wqueue && waitqueue_active(wqueue)) wake_up_interruptible_sync_poll(wqueue, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); tfile = container_of(sk, struct tun_file, sk); kill_fasync(&tfile->fasync, SIGIO, POLL_OUT); } static void tun_put_page(struct tun_page *tpage) { if (tpage->page) __page_frag_cache_drain(tpage->page, tpage->count); } static int tun_xdp_one(struct tun_struct *tun, struct tun_file *tfile, struct xdp_buff *xdp, int *flush, struct tun_page *tpage) { unsigned int datasize = xdp->data_end - xdp->data; struct virtio_net_hdr *gso = xdp->data_hard_start; struct virtio_net_hdr_v1_hash_tunnel *tnl_hdr; struct bpf_prog *xdp_prog; struct sk_buff *skb = NULL; struct sk_buff_head *queue; netdev_features_t features; u32 rxhash = 0, act; int buflen = xdp->frame_sz; int metasize = 0; int ret = 0; bool skb_xdp = false; struct page *page; if (unlikely(datasize < 
ETH_HLEN)) return -EINVAL; xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) { if (gso->gso_type) { skb_xdp = true; goto build; } xdp_init_buff(xdp, buflen, &tfile->xdp_rxq); act = bpf_prog_run_xdp(xdp_prog, xdp); ret = tun_xdp_act(tun, xdp_prog, xdp, act); if (ret < 0) { put_page(virt_to_head_page(xdp->data)); return ret; } switch (ret) { case XDP_REDIRECT: *flush = true; fallthrough; case XDP_TX: return 0; case XDP_PASS: break; default: page = virt_to_head_page(xdp->data); if (tpage->page == page) { ++tpage->count; } else { tun_put_page(tpage); tpage->page = page; tpage->count = 1; } return 0; } } build: skb = build_skb(xdp->data_hard_start, buflen); if (!skb) { ret = -ENOMEM; goto out; } skb_reserve(skb, xdp->data - xdp->data_hard_start); skb_put(skb, xdp->data_end - xdp->data); /* The externally provided xdp_buff may have no metadata support, which * is marked by xdp->data_meta being xdp->data + 1. This will lead to a * metasize of -1 and is the reason why the condition checks for > 0. */ metasize = xdp->data - xdp->data_meta; if (metasize > 0) skb_metadata_set(skb, metasize); features = tun_vnet_hdr_guest_features(READ_ONCE(tun->vnet_hdr_sz)); tnl_hdr = (struct virtio_net_hdr_v1_hash_tunnel *)gso; if (tun_vnet_hdr_tnl_to_skb(tun->flags, features, skb, tnl_hdr)) { atomic_long_inc(&tun->rx_frame_errors); kfree_skb(skb); ret = -EINVAL; goto out; } skb->protocol = eth_type_trans(skb, tun->dev); skb_reset_network_header(skb); skb_probe_transport_header(skb); skb_record_rx_queue(skb, tfile->queue_index); if (skb_xdp) { ret = do_xdp_generic(xdp_prog, &skb); if (ret != XDP_PASS) { ret = 0; goto out; } } if (!rcu_dereference(tun->steering_prog) && tun->numqueues > 1 && !tfile->detached) rxhash = __skb_get_hash_symmetric(skb); if (tfile->napi_enabled) { queue = &tfile->sk.sk_write_queue; spin_lock(&queue->lock); if (unlikely(tfile->detached)) { spin_unlock(&queue->lock); kfree_skb(skb); return -EBUSY; } __skb_queue_tail(queue, skb); spin_unlock(&queue->lock); ret = 1; } else { netif_receive_skb(skb); ret = 0; } /* No need to disable preemption here since this function is * always called with bh disabled */ dev_sw_netstats_rx_add(tun->dev, datasize); if (rxhash) tun_flow_update(tun, rxhash, tfile); out: return ret; } static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { int ret, i; struct tun_file *tfile = container_of(sock, struct tun_file, socket); struct tun_struct *tun = tun_get(tfile); struct tun_msg_ctl *ctl = m->msg_control; struct xdp_buff *xdp; if (!tun) return -EBADFD; if (m->msg_controllen == sizeof(struct tun_msg_ctl) && ctl && ctl->type == TUN_MSG_PTR) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct tun_page tpage; int n = ctl->num; int flush = 0, queued = 0; memset(&tpage, 0, sizeof(tpage)); local_bh_disable(); rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); for (i = 0; i < n; i++) { xdp = &((struct xdp_buff *)ctl->ptr)[i]; ret = tun_xdp_one(tun, tfile, xdp, &flush, &tpage); if (ret > 0) queued += ret; } if (flush) xdp_do_flush(); if (tfile->napi_enabled && queued > 0) napi_schedule(&tfile->napi); bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); local_bh_enable(); tun_put_page(&tpage); ret = total_len; goto out; } ret = tun_get_user(tun, tfile, ctl ? 
ctl->ptr : NULL, &m->msg_iter, m->msg_flags & MSG_DONTWAIT, m->msg_flags & MSG_MORE); out: tun_put(tun); return ret; } static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len, int flags) { struct tun_file *tfile = container_of(sock, struct tun_file, socket); struct tun_struct *tun = tun_get(tfile); void *ptr = m->msg_control; int ret; if (!tun) { ret = -EBADFD; goto out_free; } if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) { ret = -EINVAL; goto out_put_tun; } if (flags & MSG_ERRQUEUE) { ret = sock_recv_errqueue(sock->sk, m, total_len, SOL_PACKET, TUN_TX_TIMESTAMP); goto out; } ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, ptr); if (ret > (ssize_t)total_len) { m->msg_flags |= MSG_TRUNC; ret = flags & MSG_TRUNC ? ret : total_len; } out: tun_put(tun); return ret; out_put_tun: tun_put(tun); out_free: tun_ptr_free(ptr); return ret; } static int tun_ptr_peek_len(void *ptr) { if (likely(ptr)) { if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); return xdpf->len; } return __skb_array_len_with_tag(ptr); } else { return 0; } } static int tun_peek_len(struct socket *sock) { struct tun_file *tfile = container_of(sock, struct tun_file, socket); struct tun_struct *tun; int ret = 0; tun = tun_get(tfile); if (!tun) return 0; ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len); tun_put(tun); return ret; } /* Ops structure to mimic raw sockets with tun */ static const struct proto_ops tun_socket_ops = { .peek_len = tun_peek_len, .sendmsg = tun_sendmsg, .recvmsg = tun_recvmsg, }; static struct proto tun_proto = { .name = "tun", .owner = THIS_MODULE, .obj_size = sizeof(struct tun_file), }; static int tun_flags(struct tun_struct *tun) { return tun->flags & (TUN_FEATURES | IFF_PERSIST | IFF_TUN | IFF_TAP); } static ssize_t tun_flags_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tun_struct *tun = netdev_priv(to_net_dev(dev)); return sysfs_emit(buf, "0x%x\n", tun_flags(tun)); } static ssize_t owner_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tun_struct *tun = netdev_priv(to_net_dev(dev)); return uid_valid(tun->owner)? sysfs_emit(buf, "%u\n", from_kuid_munged(current_user_ns(), tun->owner)) : sysfs_emit(buf, "-1\n"); } static ssize_t group_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tun_struct *tun = netdev_priv(to_net_dev(dev)); return gid_valid(tun->group) ? 
sysfs_emit(buf, "%u\n", from_kgid_munged(current_user_ns(), tun->group)) : sysfs_emit(buf, "-1\n"); } static DEVICE_ATTR_RO(tun_flags); static DEVICE_ATTR_RO(owner); static DEVICE_ATTR_RO(group); static struct attribute *tun_dev_attrs[] = { &dev_attr_tun_flags.attr, &dev_attr_owner.attr, &dev_attr_group.attr, NULL }; static const struct attribute_group tun_attr_group = { .attrs = tun_dev_attrs }; static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) { struct tun_struct *tun; struct tun_file *tfile = file->private_data; struct net_device *dev; int err; if (tfile->detached) return -EINVAL; if ((ifr->ifr_flags & IFF_NAPI_FRAGS)) { if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!(ifr->ifr_flags & IFF_NAPI) || (ifr->ifr_flags & TUN_TYPE_MASK) != IFF_TAP) return -EINVAL; } dev = __dev_get_by_name(net, ifr->ifr_name); if (dev) { if (ifr->ifr_flags & IFF_TUN_EXCL) return -EBUSY; if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops) tun = netdev_priv(dev); else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops) tun = netdev_priv(dev); else return -EINVAL; if (!!(ifr->ifr_flags & IFF_MULTI_QUEUE) != !!(tun->flags & IFF_MULTI_QUEUE)) return -EINVAL; if (tun_not_capable(tun)) return -EPERM; err = security_tun_dev_open(tun->security); if (err < 0) return err; err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER, ifr->ifr_flags & IFF_NAPI, ifr->ifr_flags & IFF_NAPI_FRAGS, true); if (err < 0) return err; if (tun->flags & IFF_MULTI_QUEUE && (tun->numqueues + tun->numdisabled > 1)) { /* One or more queue has already been attached, no need * to initialize the device again. */ netdev_state_change(dev); return 0; } tun->flags = (tun->flags & ~TUN_FEATURES) | (ifr->ifr_flags & TUN_FEATURES); netdev_state_change(dev); } else { char *name; unsigned long flags = 0; int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ? MAX_TAP_QUEUES : 1; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; err = security_tun_dev_create(); if (err < 0) return err; /* Set dev type */ if (ifr->ifr_flags & IFF_TUN) { /* TUN device */ flags |= IFF_TUN; name = "tun%d"; } else if (ifr->ifr_flags & IFF_TAP) { /* TAP device */ flags |= IFF_TAP; name = "tap%d"; } else return -EINVAL; if (*ifr->ifr_name) name = ifr->ifr_name; dev = alloc_netdev_mqs(sizeof(struct tun_struct), name, NET_NAME_UNKNOWN, tun_setup, queues, queues); if (!dev) return -ENOMEM; dev_net_set(dev, net); dev->rtnl_link_ops = &tun_link_ops; dev->ifindex = tfile->ifindex; dev->sysfs_groups[0] = &tun_attr_group; tun = netdev_priv(dev); tun->dev = dev; tun->flags = flags; tun->txflt.count = 0; tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr); tun->align = NET_SKB_PAD; tun->filter_attached = false; tun->sndbuf = tfile->socket.sk->sk_sndbuf; tun->rx_batched = 0; RCU_INIT_POINTER(tun->steering_prog, NULL); tun->ifr = ifr; tun->file = file; tun_net_initialize(dev); err = register_netdevice(tun->dev); if (err < 0) { free_netdev(dev); return err; } /* free_netdev() won't check refcnt, to avoid race * with dev_put() we need publish tun after registration. */ rcu_assign_pointer(tfile->tun, tun); } if (ifr->ifr_flags & IFF_NO_CARRIER) netif_carrier_off(tun->dev); else netif_carrier_on(tun->dev); /* Make sure persistent devices do not get stuck in * xoff state. 
*/ if (netif_running(tun->dev)) netif_tx_wake_all_queues(tun->dev); strscpy(ifr->ifr_name, tun->dev->name); return 0; } static void tun_get_iff(struct tun_struct *tun, struct ifreq *ifr) { strscpy(ifr->ifr_name, tun->dev->name); ifr->ifr_flags = tun_flags(tun); } #define PLAIN_GSO (NETIF_F_GSO_UDP_L4 | NETIF_F_TSO | NETIF_F_TSO6) /* This is like a cut-down ethtool ops, except done via tun fd so no * privs required. */ static int set_offload(struct tun_struct *tun, unsigned long arg) { netdev_features_t features = 0; if (arg & TUN_F_CSUM) { features |= NETIF_F_HW_CSUM; arg &= ~TUN_F_CSUM; if (arg & (TUN_F_TSO4|TUN_F_TSO6)) { if (arg & TUN_F_TSO_ECN) { features |= NETIF_F_TSO_ECN; arg &= ~TUN_F_TSO_ECN; } if (arg & TUN_F_TSO4) features |= NETIF_F_TSO; if (arg & TUN_F_TSO6) features |= NETIF_F_TSO6; arg &= ~(TUN_F_TSO4|TUN_F_TSO6); } arg &= ~TUN_F_UFO; /* TODO: for now USO4 and USO6 should work simultaneously */ if (arg & TUN_F_USO4 && arg & TUN_F_USO6) { features |= NETIF_F_GSO_UDP_L4; arg &= ~(TUN_F_USO4 | TUN_F_USO6); } /* * Tunnel offload is allowed only if some plain offload is * available, too. */ if (features & PLAIN_GSO && arg & TUN_F_UDP_TUNNEL_GSO) { features |= NETIF_F_GSO_UDP_TUNNEL; if (arg & TUN_F_UDP_TUNNEL_GSO_CSUM) features |= NETIF_F_GSO_UDP_TUNNEL_CSUM; arg &= ~(TUN_F_UDP_TUNNEL_GSO | TUN_F_UDP_TUNNEL_GSO_CSUM); } } /* This gives the user a way to test for new features in future by * trying to set them. */ if (arg) return -EINVAL; tun->set_features = features; tun->dev->wanted_features &= ~TUN_USER_FEATURES; tun->dev->wanted_features |= features; netdev_update_features(tun->dev); return 0; } static void tun_detach_filter(struct tun_struct *tun, int n) { int i; struct tun_file *tfile; for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); lock_sock(tfile->socket.sk); sk_detach_filter(tfile->socket.sk); release_sock(tfile->socket.sk); } tun->filter_attached = false; } static int tun_attach_filter(struct tun_struct *tun) { int i, ret = 0; struct tun_file *tfile; for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); lock_sock(tfile->socket.sk); ret = sk_attach_filter(&tun->fprog, tfile->socket.sk); release_sock(tfile->socket.sk); if (ret) { tun_detach_filter(tun, i); return ret; } } tun->filter_attached = true; return ret; } static void tun_set_sndbuf(struct tun_struct *tun) { struct tun_file *tfile; int i; for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); tfile->socket.sk->sk_sndbuf = tun->sndbuf; } } static int tun_set_queue(struct file *file, struct ifreq *ifr) { struct tun_file *tfile = file->private_data; struct tun_struct *tun; int ret = 0; rtnl_lock(); if (ifr->ifr_flags & IFF_ATTACH_QUEUE) { tun = tfile->detached; if (!tun) { ret = -EINVAL; goto unlock; } ret = security_tun_dev_attach_queue(tun->security); if (ret < 0) goto unlock; ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI, tun->flags & IFF_NAPI_FRAGS, true); } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) { tun = rtnl_dereference(tfile->tun); if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached) ret = -EINVAL; else __tun_detach(tfile, false); } else ret = -EINVAL; if (ret >= 0) netdev_state_change(tun->dev); unlock: rtnl_unlock(); return ret; } static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, void __user *data) { struct bpf_prog *prog; int fd; if (copy_from_user(&fd, data, sizeof(fd))) return -EFAULT; if (fd == -1) { prog = NULL; } else { prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); 
if (IS_ERR(prog)) return PTR_ERR(prog); } return __tun_set_ebpf(tun, prog_p, prog); } /* Return correct value for tun->dev->addr_len based on tun->dev->type. */ static unsigned char tun_get_addr_len(unsigned short type) { switch (type) { case ARPHRD_IP6GRE: case ARPHRD_TUNNEL6: return sizeof(struct in6_addr); case ARPHRD_IPGRE: case ARPHRD_TUNNEL: case ARPHRD_SIT: return 4; case ARPHRD_ETHER: return ETH_ALEN; case ARPHRD_IEEE802154: case ARPHRD_IEEE802154_MONITOR: return IEEE802154_EXTENDED_ADDR_LEN; case ARPHRD_PHONET_PIPE: case ARPHRD_PPP: case ARPHRD_NONE: return 0; case ARPHRD_6LOWPAN: return EUI64_ADDR_LEN; case ARPHRD_FDDI: return FDDI_K_ALEN; case ARPHRD_HIPPI: return HIPPI_ALEN; case ARPHRD_IEEE802: return FC_ALEN; case ARPHRD_ROSE: return ROSE_ADDR_LEN; case ARPHRD_NETROM: return AX25_ADDR_LEN; case ARPHRD_LOCALTLK: return LTALK_ALEN; default: return 0; } } static long __tun_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg, int ifreq_len) { struct tun_file *tfile = file->private_data; struct net *net = sock_net(&tfile->sk); struct tun_struct *tun; void __user* argp = (void __user*)arg; unsigned int carrier; struct ifreq ifr; kuid_t owner; kgid_t group; int ifindex; int sndbuf; int ret; bool do_notify = false; if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) { if (copy_from_user(&ifr, argp, ifreq_len)) return -EFAULT; } else { memset(&ifr, 0, sizeof(ifr)); } if (cmd == TUNGETFEATURES) { /* Currently this just means: "what IFF flags are valid?". * This is needed because we never checked for invalid flags on * TUNSETIFF. */ return put_user(IFF_TUN | IFF_TAP | IFF_NO_CARRIER | TUN_FEATURES, (unsigned int __user*)argp); } else if (cmd == TUNSETQUEUE) { return tun_set_queue(file, &ifr); } else if (cmd == SIOCGSKNS) { if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; return open_related_ns(&net->ns, get_net_ns); } rtnl_lock(); tun = tun_get(tfile); if (cmd == TUNSETIFF) { ret = -EEXIST; if (tun) goto unlock; ifr.ifr_name[IFNAMSIZ-1] = '\0'; ret = tun_set_iff(net, file, &ifr); if (ret) goto unlock; if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; goto unlock; } if (cmd == TUNSETIFINDEX) { ret = -EPERM; if (tun) goto unlock; ret = -EFAULT; if (copy_from_user(&ifindex, argp, sizeof(ifindex))) goto unlock; ret = -EINVAL; if (ifindex < 0) goto unlock; ret = 0; tfile->ifindex = ifindex; goto unlock; } ret = -EBADFD; if (!tun) goto unlock; netif_info(tun, drv, tun->dev, "tun_chr_ioctl cmd %u\n", cmd); net = dev_net(tun->dev); ret = 0; switch (cmd) { case TUNGETIFF: tun_get_iff(tun, &ifr); if (tfile->detached) ifr.ifr_flags |= IFF_DETACH_QUEUE; if (!tfile->socket.sk->sk_filter) ifr.ifr_flags |= IFF_NOFILTER; if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; break; case TUNSETNOCSUM: /* Disable/Enable checksum */ /* [unimplemented] */ netif_info(tun, drv, tun->dev, "ignored: set checksum %s\n", arg ? "disabled" : "enabled"); break; case TUNSETPERSIST: /* Disable/Enable persist mode. Keep an extra reference to the * module to prevent the module being unprobed. */ if (arg && !(tun->flags & IFF_PERSIST)) { tun->flags |= IFF_PERSIST; __module_get(THIS_MODULE); do_notify = true; } if (!arg && (tun->flags & IFF_PERSIST)) { tun->flags &= ~IFF_PERSIST; module_put(THIS_MODULE); do_notify = true; } netif_info(tun, drv, tun->dev, "persist %s\n", arg ? 
"enabled" : "disabled"); break; case TUNSETOWNER: /* Set owner of the device */ owner = make_kuid(current_user_ns(), arg); if (!uid_valid(owner)) { ret = -EINVAL; break; } tun->owner = owner; do_notify = true; netif_info(tun, drv, tun->dev, "owner set to %u\n", from_kuid(&init_user_ns, tun->owner)); break; case TUNSETGROUP: /* Set group of the device */ group = make_kgid(current_user_ns(), arg); if (!gid_valid(group)) { ret = -EINVAL; break; } tun->group = group; do_notify = true; netif_info(tun, drv, tun->dev, "group set to %u\n", from_kgid(&init_user_ns, tun->group)); break; case TUNSETLINK: /* Only allow setting the type when the interface is down */ if (tun->dev->flags & IFF_UP) { netif_info(tun, drv, tun->dev, "Linktype set failed because interface is up\n"); ret = -EBUSY; } else { ret = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE, tun->dev); ret = notifier_to_errno(ret); if (ret) { netif_info(tun, drv, tun->dev, "Refused to change device type\n"); break; } tun->dev->type = (int) arg; tun->dev->addr_len = tun_get_addr_len(tun->dev->type); netif_info(tun, drv, tun->dev, "linktype set to %d\n", tun->dev->type); call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, tun->dev); } break; case TUNSETDEBUG: tun->msg_enable = (u32)arg; break; case TUNSETOFFLOAD: ret = set_offload(tun, arg); break; case TUNSETTXFILTER: /* Can be set only for TAPs */ ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = update_filter(&tun->txflt, (void __user *)arg); break; case SIOCGIFHWADDR: /* Get hw address */ netif_get_mac_address(&ifr.ifr_hwaddr, net, tun->dev->name); if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; break; case SIOCSIFHWADDR: /* Set hw address */ if (tun->dev->addr_len > sizeof(ifr.ifr_hwaddr)) { ret = -EINVAL; break; } ret = dev_set_mac_address_user(tun->dev, (struct sockaddr_storage *)&ifr.ifr_hwaddr, NULL); break; case TUNGETSNDBUF: sndbuf = tfile->socket.sk->sk_sndbuf; if (copy_to_user(argp, &sndbuf, sizeof(sndbuf))) ret = -EFAULT; break; case TUNSETSNDBUF: if (copy_from_user(&sndbuf, argp, sizeof(sndbuf))) { ret = -EFAULT; break; } if (sndbuf <= 0) { ret = -EINVAL; break; } tun->sndbuf = sndbuf; tun_set_sndbuf(tun); break; case TUNATTACHFILTER: /* Can be set only for TAPs */ ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = -EFAULT; if (copy_from_user(&tun->fprog, argp, sizeof(tun->fprog))) break; ret = tun_attach_filter(tun); break; case TUNDETACHFILTER: /* Can be set only for TAPs */ ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = 0; tun_detach_filter(tun, tun->numqueues); break; case TUNGETFILTER: ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = -EFAULT; if (copy_to_user(argp, &tun->fprog, sizeof(tun->fprog))) break; ret = 0; break; case TUNSETSTEERINGEBPF: ret = tun_set_ebpf(tun, &tun->steering_prog, argp); break; case TUNSETFILTEREBPF: ret = tun_set_ebpf(tun, &tun->filter_prog, argp); break; case TUNSETCARRIER: ret = -EFAULT; if (copy_from_user(&carrier, argp, sizeof(carrier))) goto unlock; ret = tun_net_change_carrier(tun->dev, (bool)carrier); break; case TUNGETDEVNETNS: ret = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto unlock; ret = open_related_ns(&net->ns, get_net_ns); break; default: ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, cmd, argp); break; } if (do_notify) netdev_state_change(tun->dev); unlock: rtnl_unlock(); if (tun) tun_put(tun); return ret; } static long tun_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return 
__tun_chr_ioctl(file, cmd, arg, sizeof (struct ifreq)); } #ifdef CONFIG_COMPAT static long tun_chr_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { switch (cmd) { case TUNSETIFF: case TUNGETIFF: case TUNSETTXFILTER: case TUNGETSNDBUF: case TUNSETSNDBUF: case SIOCGIFHWADDR: case SIOCSIFHWADDR: arg = (unsigned long)compat_ptr(arg); break; default: arg = (compat_ulong_t)arg; break; } /* * compat_ifreq is shorter than ifreq, so we must not access beyond * the end of that structure. All fields that are used in this * driver are compatible though, we don't need to convert the * contents. */ return __tun_chr_ioctl(file, cmd, arg, sizeof(struct compat_ifreq)); } #endif /* CONFIG_COMPAT */ static int tun_chr_fasync(int fd, struct file *file, int on) { struct tun_file *tfile = file->private_data; int ret; if (on) { ret = file_f_owner_allocate(file); if (ret) goto out; } if ((ret = fasync_helper(fd, file, on, &tfile->fasync)) < 0) goto out; if (on) { __f_setown(file, task_pid(current), PIDTYPE_TGID, 0); tfile->flags |= TUN_FASYNC; } else tfile->flags &= ~TUN_FASYNC; ret = 0; out: return ret; } static int tun_chr_open(struct inode *inode, struct file * file) { struct net *net = current->nsproxy->net_ns; struct tun_file *tfile; tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, &tun_proto, 0); if (!tfile) return -ENOMEM; if (ptr_ring_init(&tfile->tx_ring, 0, GFP_KERNEL)) { sk_free(&tfile->sk); return -ENOMEM; } mutex_init(&tfile->napi_mutex); RCU_INIT_POINTER(tfile->tun, NULL); tfile->flags = 0; tfile->ifindex = 0; init_waitqueue_head(&tfile->socket.wq.wait); tfile->socket.file = file; tfile->socket.ops = &tun_socket_ops; sock_init_data_uid(&tfile->socket, &tfile->sk, current_fsuid()); tfile->sk.sk_write_space = tun_sock_write_space; tfile->sk.sk_sndbuf = INT_MAX; file->private_data = tfile; INIT_LIST_HEAD(&tfile->next); sock_set_flag(&tfile->sk, SOCK_ZEROCOPY); /* tun groks IOCB_NOWAIT just fine, mark it as such */ file->f_mode |= FMODE_NOWAIT; return 0; } static int tun_chr_close(struct inode *inode, struct file *file) { struct tun_file *tfile = file->private_data; tun_detach(tfile, true); return 0; } #ifdef CONFIG_PROC_FS static void tun_chr_show_fdinfo(struct seq_file *m, struct file *file) { struct tun_file *tfile = file->private_data; struct tun_struct *tun; struct ifreq ifr; memset(&ifr, 0, sizeof(ifr)); rtnl_lock(); tun = tun_get(tfile); if (tun) tun_get_iff(tun, &ifr); rtnl_unlock(); if (tun) tun_put(tun); seq_printf(m, "iff:\t%s\n", ifr.ifr_name); } #endif static const struct file_operations tun_fops = { .owner = THIS_MODULE, .read_iter = tun_chr_read_iter, .write_iter = tun_chr_write_iter, .poll = tun_chr_poll, .unlocked_ioctl = tun_chr_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = tun_chr_compat_ioctl, #endif .open = tun_chr_open, .release = tun_chr_close, .fasync = tun_chr_fasync, #ifdef CONFIG_PROC_FS .show_fdinfo = tun_chr_show_fdinfo, #endif }; static struct miscdevice tun_miscdev = { .minor = TUN_MINOR, .name = "tun", .nodename = "net/tun", .fops = &tun_fops, }; /* ethtool interface */ static void tun_default_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { ethtool_link_ksettings_zero_link_mode(cmd, supported); ethtool_link_ksettings_zero_link_mode(cmd, advertising); cmd->base.speed = SPEED_10000; cmd->base.duplex = DUPLEX_FULL; cmd->base.port = PORT_TP; cmd->base.phy_address = 0; cmd->base.autoneg = AUTONEG_DISABLE; } static int tun_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { struct 
tun_struct *tun = netdev_priv(dev); memcpy(cmd, &tun->link_ksettings, sizeof(*cmd)); return 0; } static int tun_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd) { struct tun_struct *tun = netdev_priv(dev); memcpy(&tun->link_ksettings, cmd, sizeof(*cmd)); return 0; } static void tun_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { struct tun_struct *tun = netdev_priv(dev); strscpy(info->driver, DRV_NAME, sizeof(info->driver)); strscpy(info->version, DRV_VERSION, sizeof(info->version)); switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: strscpy(info->bus_info, "tun", sizeof(info->bus_info)); break; case IFF_TAP: strscpy(info->bus_info, "tap", sizeof(info->bus_info)); break; } } static u32 tun_get_msglevel(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); return tun->msg_enable; } static void tun_set_msglevel(struct net_device *dev, u32 value) { struct tun_struct *tun = netdev_priv(dev); tun->msg_enable = value; } static int tun_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec, struct kernel_ethtool_coalesce *kernel_coal, struct netlink_ext_ack *extack) { struct tun_struct *tun = netdev_priv(dev); ec->rx_max_coalesced_frames = tun->rx_batched; return 0; } static int tun_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec, struct kernel_ethtool_coalesce *kernel_coal, struct netlink_ext_ack *extack) { struct tun_struct *tun = netdev_priv(dev); if (ec->rx_max_coalesced_frames > NAPI_POLL_WEIGHT) tun->rx_batched = NAPI_POLL_WEIGHT; else tun->rx_batched = ec->rx_max_coalesced_frames; return 0; } static void tun_get_channels(struct net_device *dev, struct ethtool_channels *channels) { struct tun_struct *tun = netdev_priv(dev); channels->combined_count = tun->numqueues; channels->max_combined = tun->flags & IFF_MULTI_QUEUE ? 
MAX_TAP_QUEUES : 1; } static const struct ethtool_ops tun_ethtool_ops = { .supported_coalesce_params = ETHTOOL_COALESCE_RX_MAX_FRAMES, .get_drvinfo = tun_get_drvinfo, .get_msglevel = tun_get_msglevel, .set_msglevel = tun_set_msglevel, .get_link = ethtool_op_get_link, .get_channels = tun_get_channels, .get_ts_info = ethtool_op_get_ts_info, .get_coalesce = tun_get_coalesce, .set_coalesce = tun_set_coalesce, .get_link_ksettings = tun_get_link_ksettings, .set_link_ksettings = tun_set_link_ksettings, }; static int tun_queue_resize(struct tun_struct *tun) { struct net_device *dev = tun->dev; struct tun_file *tfile; struct ptr_ring **rings; int n = tun->numqueues + tun->numdisabled; int ret, i; rings = kmalloc_objs(*rings, n); if (!rings) return -ENOMEM; for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); rings[i] = &tfile->tx_ring; } list_for_each_entry(tfile, &tun->disabled, next) rings[i++] = &tfile->tx_ring; ret = ptr_ring_resize_multiple_bh(rings, n, dev->tx_queue_len, GFP_KERNEL, tun_ptr_free); kfree(rings); return ret; } static int tun_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct tun_struct *tun = netdev_priv(dev); int i; if (dev->rtnl_link_ops != &tun_link_ops) return NOTIFY_DONE; switch (event) { case NETDEV_CHANGE_TX_QUEUE_LEN: if (tun_queue_resize(tun)) return NOTIFY_BAD; break; case NETDEV_UP: for (i = 0; i < tun->numqueues; i++) { struct tun_file *tfile; tfile = rtnl_dereference(tun->tfiles[i]); tfile->socket.sk->sk_write_space(tfile->socket.sk); } break; default: break; } return NOTIFY_DONE; } static struct notifier_block tun_notifier_block __read_mostly = { .notifier_call = tun_device_event, }; static int __init tun_init(void) { int ret = 0; pr_info("%s, %s\n", DRV_DESCRIPTION, DRV_VERSION); ret = rtnl_link_register(&tun_link_ops); if (ret) { pr_err("Can't register link_ops\n"); goto err_linkops; } ret = misc_register(&tun_miscdev); if (ret) { pr_err("Can't register misc device %d\n", TUN_MINOR); goto err_misc; } ret = register_netdevice_notifier(&tun_notifier_block); if (ret) { pr_err("Can't register netdevice notifier\n"); goto err_notifier; } return 0; err_notifier: misc_deregister(&tun_miscdev); err_misc: rtnl_link_unregister(&tun_link_ops); err_linkops: return ret; } static void __exit tun_cleanup(void) { misc_deregister(&tun_miscdev); rtnl_link_unregister(&tun_link_ops); unregister_netdevice_notifier(&tun_notifier_block); } /* Get an underlying socket object from tun file. Returns error unless file is * attached to a device. The returned object works like a packet socket, it * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for * holding a reference to the file for as long as the socket is in use. 
*/ struct socket *tun_get_socket(struct file *file) { struct tun_file *tfile; if (file->f_op != &tun_fops) return ERR_PTR(-EINVAL); tfile = file->private_data; if (!tfile) return ERR_PTR(-EBADFD); return &tfile->socket; } EXPORT_SYMBOL_GPL(tun_get_socket); struct ptr_ring *tun_get_tx_ring(struct file *file) { struct tun_file *tfile; if (file->f_op != &tun_fops) return ERR_PTR(-EINVAL); tfile = file->private_data; if (!tfile) return ERR_PTR(-EBADFD); return &tfile->tx_ring; } EXPORT_SYMBOL_GPL(tun_get_tx_ring); module_init(tun_init); module_exit(tun_cleanup); MODULE_DESCRIPTION(DRV_DESCRIPTION); MODULE_AUTHOR(DRV_COPYRIGHT); MODULE_LICENSE("GPL"); MODULE_ALIAS_MISCDEV(TUN_MINOR); MODULE_ALIAS("devname:net/tun"); MODULE_IMPORT_NS("NETDEV_INTERNAL");
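/*
 * Editor's illustration (not part of the driver source): a minimal userspace
 * sketch of the character-device interface implemented above. It opens
 * /dev/net/tun and attaches to a TUN interface with TUNSETIFF -- the path
 * handled by __tun_chr_ioctl() and tun_set_iff() -- after which read() and
 * write() on the file descriptor carry raw IP packets via tun_put_user() and
 * tun_get_user(). Attaching normally needs CAP_NET_ADMIN. The helper name
 * tun_alloc_example() and the interface name "tun0" are illustrative, and
 * error handling is deliberately minimal.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/if.h>
#include <linux/if_tun.h>

static int tun_alloc_example(char *name)
{
	struct ifreq ifr;
	int fd = open("/dev/net/tun", O_RDWR);

	if (fd < 0)
		return -1;

	memset(&ifr, 0, sizeof(ifr));
	/* IFF_TUN: layer-3 device, no Ethernet header; IFF_NO_PI: no struct tun_pi prefix */
	ifr.ifr_flags = IFF_TUN | IFF_NO_PI;
	strncpy(ifr.ifr_name, name, IFNAMSIZ - 1);

	if (ioctl(fd, TUNSETIFF, &ifr) < 0) {
		close(fd);
		return -1;
	}
	/* The kernel may have chosen the final name (e.g. from "tun%d"); copy it back. */
	strncpy(name, ifr.ifr_name, IFNAMSIZ);
	return fd;
}

int main(void)
{
	char name[IFNAMSIZ] = "tun0";
	unsigned char buf[2048];
	int fd = tun_alloc_example(name);

	if (fd < 0) {
		perror("tun_alloc_example");
		return 1;
	}
	printf("attached to %s\n", name);

	/* Each read() returns exactly one packet queued by the network stack. */
	for (;;) {
		ssize_t n = read(fd, buf, sizeof(buf));

		if (n <= 0)
			break;
		printf("packet of %zd bytes\n", n);
	}

	close(fd);
	return 0;
}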
// SPDX-License-Identifier: GPL-2.0-or-later /* Basic authentication token and access key management * * Copyright (C) 2004-2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/init.h> #include <linux/poison.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/workqueue.h> #include <linux/random.h> #include <linux/err.h> #include "internal.h" struct kmem_cache *key_jar; struct rb_root key_serial_tree; /* tree of keys indexed by serial */ DEFINE_SPINLOCK(key_serial_lock); struct rb_root key_user_tree; /* tree of quota records indexed by UID */ DEFINE_SPINLOCK(key_user_lock); unsigned int key_quota_root_maxkeys = 1000000; /* root's key count quota */ unsigned int key_quota_root_maxbytes = 25000000; /* root's key space quota */ unsigned int key_quota_maxkeys = 200; /* general key count quota */ unsigned int key_quota_maxbytes = 20000; /* general key space quota */ static LIST_HEAD(key_types_list); static DECLARE_RWSEM(key_types_sem); /* We serialise key instantiation and link */ DEFINE_MUTEX(key_construction_mutex); #ifdef KEY_DEBUGGING void __key_check(const struct key *key) { printk("__key_check: key %p {%08x} should be {%08x}\n", key, key->magic, KEY_DEBUG_MAGIC); BUG(); } #endif /* * Get the key quota record for a user, allocating a new record if one doesn't * already exist. 
*/ struct key_user *key_user_lookup(kuid_t uid) { struct key_user *candidate = NULL, *user; struct rb_node *parent, **p; try_again: parent = NULL; p = &key_user_tree.rb_node; spin_lock(&key_user_lock); /* search the tree for a user record with a matching UID */ while (*p) { parent = *p; user = rb_entry(parent, struct key_user, node); if (uid_lt(uid, user->uid)) p = &(*p)->rb_left; else if (uid_gt(uid, user->uid)) p = &(*p)->rb_right; else goto found; } /* if we get here, we failed to find a match in the tree */ if (!candidate) { /* allocate a candidate user record if we don't already have * one */ spin_unlock(&key_user_lock); user = NULL; candidate = kmalloc_obj(struct key_user); if (unlikely(!candidate)) goto out; /* the allocation may have scheduled, so we need to repeat the * search lest someone else added the record whilst we were * asleep */ goto try_again; } /* if we get here, then the user record still hadn't appeared on the * second pass - so we use the candidate record */ refcount_set(&candidate->usage, 1); atomic_set(&candidate->nkeys, 0); atomic_set(&candidate->nikeys, 0); candidate->uid = uid; candidate->qnkeys = 0; candidate->qnbytes = 0; spin_lock_init(&candidate->lock); mutex_init(&candidate->cons_lock); rb_link_node(&candidate->node, parent, p); rb_insert_color(&candidate->node, &key_user_tree); spin_unlock(&key_user_lock); user = candidate; goto out; /* okay - we found a user record for this UID */ found: refcount_inc(&user->usage); spin_unlock(&key_user_lock); kfree(candidate); out: return user; } /* * Dispose of a user structure */ void key_user_put(struct key_user *user) { if (refcount_dec_and_lock(&user->usage, &key_user_lock)) { rb_erase(&user->node, &key_user_tree); spin_unlock(&key_user_lock); kfree(user); } } /* * Allocate a serial number for a key. These are assigned randomly to avoid * security issues through covert channel problems. */ static inline void key_alloc_serial(struct key *key) { struct rb_node *parent, **p; struct key *xkey; /* propose a random serial number and look for a hole for it in the * serial number tree */ do { get_random_bytes(&key->serial, sizeof(key->serial)); key->serial >>= 1; /* negative numbers are not permitted */ } while (key->serial < 3); spin_lock(&key_serial_lock); attempt_insertion: parent = NULL; p = &key_serial_tree.rb_node; while (*p) { parent = *p; xkey = rb_entry(parent, struct key, serial_node); if (key->serial < xkey->serial) p = &(*p)->rb_left; else if (key->serial > xkey->serial) p = &(*p)->rb_right; else goto serial_exists; } /* we've found a suitable hole - arrange for this key to occupy it */ rb_link_node(&key->serial_node, parent, p); rb_insert_color(&key->serial_node, &key_serial_tree); spin_unlock(&key_serial_lock); return; /* we found a key with the proposed serial number - walk the tree from * that point looking for the next unused serial number */ serial_exists: for (;;) { key->serial++; if (key->serial < 3) { key->serial = 3; goto attempt_insertion; } parent = rb_next(parent); if (!parent) goto attempt_insertion; xkey = rb_entry(parent, struct key, serial_node); if (key->serial < xkey->serial) goto attempt_insertion; } } /** * key_alloc - Allocate a key of the specified type. * @type: The type of key to allocate. * @desc: The key description to allow the key to be searched out. * @uid: The owner of the new key. * @gid: The group ID for the new key's group permissions. * @cred: The credentials specifying UID namespace. * @perm: The permissions mask of the new key. * @flags: Flags specifying quota properties. 
* @restrict_link: Optional link restriction for new keyrings. * * Allocate a key of the specified type with the attributes given. The key is * returned in an uninstantiated state and the caller needs to instantiate the * key before returning. * * The restrict_link structure (if not NULL) will be freed when the * keyring is destroyed, so it must be dynamically allocated. * * The user's key count quota is updated to reflect the creation of the key and * the user's key data quota has the default for the key type reserved. The * instantiation function should amend this as necessary. If insufficient * quota is available, -EDQUOT will be returned. * * The LSM security modules can prevent a key being created, in which case * -EACCES will be returned. * * Returns a pointer to the new key if successful and an error code otherwise. * * Note that the caller needs to ensure the key type isn't uninstantiated. * Internally this can be done by locking key_types_sem. Externally, this can * be done by either never unregistering the key type, or making sure * key_alloc() calls don't race with module unloading. */ struct key *key_alloc(struct key_type *type, const char *desc, kuid_t uid, kgid_t gid, const struct cred *cred, key_perm_t perm, unsigned long flags, struct key_restriction *restrict_link) { struct key_user *user = NULL; struct key *key; size_t desclen, quotalen; int ret; unsigned long irqflags; key = ERR_PTR(-EINVAL); if (!desc || !*desc) goto error; if (type->vet_description) { ret = type->vet_description(desc); if (ret < 0) { key = ERR_PTR(ret); goto error; } } desclen = strlen(desc); quotalen = desclen + 1 + type->def_datalen; /* get hold of the key tracking for this user */ user = key_user_lookup(uid); if (!user) goto no_memory_1; /* check that the user's quota permits allocation of another key and * its description */ if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) { unsigned maxkeys = uid_eq(uid, GLOBAL_ROOT_UID) ? key_quota_root_maxkeys : key_quota_maxkeys; unsigned maxbytes = uid_eq(uid, GLOBAL_ROOT_UID) ? 
key_quota_root_maxbytes : key_quota_maxbytes; spin_lock_irqsave(&user->lock, irqflags); if (!(flags & KEY_ALLOC_QUOTA_OVERRUN)) { if (user->qnkeys + 1 > maxkeys || user->qnbytes + quotalen > maxbytes || user->qnbytes + quotalen < user->qnbytes) goto no_quota; } user->qnkeys++; user->qnbytes += quotalen; spin_unlock_irqrestore(&user->lock, irqflags); } /* allocate and initialise the key and its description */ key = kmem_cache_zalloc(key_jar, GFP_KERNEL); if (!key) goto no_memory_2; key->index_key.desc_len = desclen; key->index_key.description = kmemdup(desc, desclen + 1, GFP_KERNEL); if (!key->index_key.description) goto no_memory_3; key->index_key.type = type; key_set_index_key(&key->index_key); refcount_set(&key->usage, 1); init_rwsem(&key->sem); lockdep_set_class(&key->sem, &type->lock_class); key->user = user; key->quotalen = quotalen; key->datalen = type->def_datalen; key->uid = uid; key->gid = gid; key->perm = perm; key->expiry = TIME64_MAX; key->restrict_link = restrict_link; key->last_used_at = ktime_get_real_seconds(); key->flags |= 1 << KEY_FLAG_USER_ALIVE; if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) key->flags |= 1 << KEY_FLAG_IN_QUOTA; if (flags & KEY_ALLOC_BUILT_IN) key->flags |= 1 << KEY_FLAG_BUILTIN; if (flags & KEY_ALLOC_UID_KEYRING) key->flags |= 1 << KEY_FLAG_UID_KEYRING; if (flags & KEY_ALLOC_SET_KEEP) key->flags |= 1 << KEY_FLAG_KEEP; #ifdef KEY_DEBUGGING key->magic = KEY_DEBUG_MAGIC; #endif /* let the security module know about the key */ ret = security_key_alloc(key, cred, flags); if (ret < 0) goto security_error; /* publish the key by giving it a serial number */ refcount_inc(&key->domain_tag->usage); atomic_inc(&user->nkeys); key_alloc_serial(key); error: return key; security_error: kfree(key->description); kmem_cache_free(key_jar, key); if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) { spin_lock_irqsave(&user->lock, irqflags); user->qnkeys--; user->qnbytes -= quotalen; spin_unlock_irqrestore(&user->lock, irqflags); } key_user_put(user); key = ERR_PTR(ret); goto error; no_memory_3: kmem_cache_free(key_jar, key); no_memory_2: if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) { spin_lock_irqsave(&user->lock, irqflags); user->qnkeys--; user->qnbytes -= quotalen; spin_unlock_irqrestore(&user->lock, irqflags); } key_user_put(user); no_memory_1: key = ERR_PTR(-ENOMEM); goto error; no_quota: spin_unlock_irqrestore(&user->lock, irqflags); key_user_put(user); key = ERR_PTR(-EDQUOT); goto error; } EXPORT_SYMBOL(key_alloc); /** * key_payload_reserve - Adjust data quota reservation for the key's payload * @key: The key to make the reservation for. * @datalen: The amount of data payload the caller now wants. * * Adjust the amount of the owning user's key data quota that a key reserves. * If the amount is increased, then -EDQUOT may be returned if there isn't * enough free quota available. * * If successful, 0 is returned. */ int key_payload_reserve(struct key *key, size_t datalen) { int delta = (int)datalen - key->datalen; int ret = 0; key_check(key); /* contemplate the quota adjustment */ if (delta != 0 && test_bit(KEY_FLAG_IN_QUOTA, &key->flags)) { unsigned maxbytes = uid_eq(key->user->uid, GLOBAL_ROOT_UID) ? 
key_quota_root_maxbytes : key_quota_maxbytes; unsigned long flags; spin_lock_irqsave(&key->user->lock, flags); if (delta > 0 && (key->user->qnbytes + delta > maxbytes || key->user->qnbytes + delta < key->user->qnbytes)) { ret = -EDQUOT; } else { key->user->qnbytes += delta; key->quotalen += delta; } spin_unlock_irqrestore(&key->user->lock, flags); } /* change the recorded data length if that didn't generate an error */ if (ret == 0) key->datalen = datalen; return ret; } EXPORT_SYMBOL(key_payload_reserve); /* * Change the key state to being instantiated. */ static void mark_key_instantiated(struct key *key, int reject_error) { /* Commit the payload before setting the state; barrier versus * key_read_state(). */ smp_store_release(&key->state, (reject_error < 0) ? reject_error : KEY_IS_POSITIVE); } /* * Instantiate a key and link it into the target keyring atomically. Must be * called with the target keyring's semaphore writelocked. The target key's * semaphore need not be locked as instantiation is serialised by * key_construction_mutex. */ static int __key_instantiate_and_link(struct key *key, struct key_preparsed_payload *prep, struct key *keyring, struct key *authkey, struct assoc_array_edit **_edit) { int ret, awaken; key_check(key); key_check(keyring); awaken = 0; ret = -EBUSY; mutex_lock(&key_construction_mutex); /* can't instantiate twice */ if (key->state == KEY_IS_UNINSTANTIATED) { /* instantiate the key */ ret = key->type->instantiate(key, prep); if (ret == 0) { /* mark the key as being instantiated */ atomic_inc(&key->user->nikeys); mark_key_instantiated(key, 0); notify_key(key, NOTIFY_KEY_INSTANTIATED, 0); if (test_and_clear_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags)) awaken = 1; /* and link it into the destination keyring */ if (keyring) { if (test_bit(KEY_FLAG_KEEP, &keyring->flags)) set_bit(KEY_FLAG_KEEP, &key->flags); __key_link(keyring, key, _edit); } /* disable the authorisation key */ if (authkey) key_invalidate(authkey); if (prep->expiry != TIME64_MAX) key_set_expiry(key, prep->expiry); } } mutex_unlock(&key_construction_mutex); /* wake up anyone waiting for a key to be constructed */ if (awaken) wake_up_bit(&key->flags, KEY_FLAG_USER_CONSTRUCT); return ret; } /** * key_instantiate_and_link - Instantiate a key and link it into the keyring. * @key: The key to instantiate. * @data: The data to use to instantiate the keyring. * @datalen: The length of @data. * @keyring: Keyring to create a link in on success (or NULL). * @authkey: The authorisation token permitting instantiation. * * Instantiate a key that's in the uninstantiated state using the provided data * and, if successful, link it in to the destination keyring if one is * supplied. * * If successful, 0 is returned, the authorisation token is revoked and anyone * waiting for the key is woken up. If the key was already instantiated, * -EBUSY will be returned. 
*/ int key_instantiate_and_link(struct key *key, const void *data, size_t datalen, struct key *keyring, struct key *authkey) { struct key_preparsed_payload prep; struct assoc_array_edit *edit = NULL; int ret; memset(&prep, 0, sizeof(prep)); prep.orig_description = key->description; prep.data = data; prep.datalen = datalen; prep.quotalen = key->type->def_datalen; prep.expiry = TIME64_MAX; if (key->type->preparse) { ret = key->type->preparse(&prep); if (ret < 0) goto error; } if (keyring) { ret = __key_link_lock(keyring, &key->index_key); if (ret < 0) goto error; ret = __key_link_begin(keyring, &key->index_key, &edit); if (ret < 0) goto error_link_end; if (keyring->restrict_link && keyring->restrict_link->check) { struct key_restriction *keyres = keyring->restrict_link; ret = keyres->check(keyring, key->type, &prep.payload, keyres->key); if (ret < 0) goto error_link_end; } } ret = __key_instantiate_and_link(key, &prep, keyring, authkey, &edit); error_link_end: if (keyring) __key_link_end(keyring, &key->index_key, edit); error: if (key->type->preparse) key->type->free_preparse(&prep); return ret; } EXPORT_SYMBOL(key_instantiate_and_link); /** * key_reject_and_link - Negatively instantiate a key and link it into the keyring. * @key: The key to instantiate. * @timeout: The timeout on the negative key. * @error: The error to return when the key is hit. * @keyring: Keyring to create a link in on success (or NULL). * @authkey: The authorisation token permitting instantiation. * * Negatively instantiate a key that's in the uninstantiated state and, if * successful, set its timeout and stored error and link it in to the * destination keyring if one is supplied. The key and any links to the key * will be automatically garbage collected after the timeout expires. * * Negative keys are used to rate limit repeated request_key() calls by causing * them to return the stored error code (typically ENOKEY) until the negative * key expires. * * If successful, 0 is returned, the authorisation token is revoked and anyone * waiting for the key is woken up. If the key was already instantiated, * -EBUSY will be returned. */ int key_reject_and_link(struct key *key, unsigned timeout, unsigned error, struct key *keyring, struct key *authkey) { struct assoc_array_edit *edit = NULL; int ret, awaken, link_ret = 0; key_check(key); key_check(keyring); awaken = 0; ret = -EBUSY; if (keyring) { if (keyring->restrict_link) return -EPERM; link_ret = __key_link_lock(keyring, &key->index_key); if (link_ret == 0) { link_ret = __key_link_begin(keyring, &key->index_key, &edit); if (link_ret < 0) __key_link_end(keyring, &key->index_key, edit); } } mutex_lock(&key_construction_mutex); /* can't instantiate twice */ if (key->state == KEY_IS_UNINSTANTIATED) { /* mark the key as being negatively instantiated */ atomic_inc(&key->user->nikeys); mark_key_instantiated(key, -error); notify_key(key, NOTIFY_KEY_INSTANTIATED, -error); key_set_expiry(key, ktime_get_real_seconds() + timeout); if (test_and_clear_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags)) awaken = 1; ret = 0; /* and link it into the destination keyring */ if (keyring && link_ret == 0) __key_link(keyring, key, &edit); /* disable the authorisation key */ if (authkey) key_invalidate(authkey); } mutex_unlock(&key_construction_mutex); if (keyring && link_ret == 0) __key_link_end(keyring, &key->index_key, edit); /* wake up anyone waiting for a key to be constructed */ if (awaken) wake_up_bit(&key->flags, KEY_FLAG_USER_CONSTRUCT); return ret == 0 ? 
link_ret : ret; } EXPORT_SYMBOL(key_reject_and_link); /** * key_put - Discard a reference to a key. * @key: The key to discard a reference from. * * Discard a reference to a key, and when all the references are gone, we * schedule the cleanup task to come and pull it out of the tree in process * context at some later time. */ void key_put(struct key *key) { if (key) { key_check(key); if (refcount_dec_and_test(&key->usage)) { unsigned long flags; /* deal with the user's key tracking and quota */ if (test_bit(KEY_FLAG_IN_QUOTA, &key->flags)) { spin_lock_irqsave(&key->user->lock, flags); key->user->qnkeys--; key->user->qnbytes -= key->quotalen; spin_unlock_irqrestore(&key->user->lock, flags); } /* Mark key as safe for GC after key->user done. */ clear_bit_unlock(KEY_FLAG_USER_ALIVE, &key->flags); schedule_work(&key_gc_work); } } } EXPORT_SYMBOL(key_put); /* * Find a key by its serial number. */ struct key *key_lookup(key_serial_t id) { struct rb_node *n; struct key *key; spin_lock(&key_serial_lock); /* search the tree for the specified key */ n = key_serial_tree.rb_node; while (n) { key = rb_entry(n, struct key, serial_node); if (id < key->serial) n = n->rb_left; else if (id > key->serial) n = n->rb_right; else goto found; } not_found: key = ERR_PTR(-ENOKEY); goto error; found: /* A key is allowed to be looked up only if someone still owns a * reference to it - otherwise it's awaiting the gc. */ if (!refcount_inc_not_zero(&key->usage)) goto not_found; error: spin_unlock(&key_serial_lock); return key; } EXPORT_SYMBOL(key_lookup); /* * Find and lock the specified key type against removal. * * We return with the sem read-locked if successful. If the type wasn't * available -ENOKEY is returned instead. */ struct key_type *key_type_lookup(const char *type) { struct key_type *ktype; down_read(&key_types_sem); /* look up the key type to see if it's one of the registered kernel * types */ list_for_each_entry(ktype, &key_types_list, link) { if (strcmp(ktype->name, type) == 0) goto found_kernel_type; } up_read(&key_types_sem); ktype = ERR_PTR(-ENOKEY); found_kernel_type: return ktype; } void key_set_timeout(struct key *key, unsigned timeout) { time64_t expiry = TIME64_MAX; /* make the changes with the locks held to prevent races */ down_write(&key->sem); if (timeout > 0) expiry = ktime_get_real_seconds() + timeout; key_set_expiry(key, expiry); up_write(&key->sem); } EXPORT_SYMBOL_GPL(key_set_timeout); /* * Unlock a key type locked by key_type_lookup(). */ void key_type_put(struct key_type *ktype) { up_read(&key_types_sem); } /* * Attempt to update an existing key. * * The key is given to us with an incremented refcount that we need to discard * if we get an error. */ static inline key_ref_t __key_update(key_ref_t key_ref, struct key_preparsed_payload *prep) { struct key *key = key_ref_to_ptr(key_ref); int ret; /* need write permission on the key to update it */ ret = key_permission(key_ref, KEY_NEED_WRITE); if (ret < 0) goto error; ret = -EEXIST; if (!key->type->update) goto error; down_write(&key->sem); ret = key->type->update(key, prep); if (ret == 0) { /* Updating a negative key positively instantiates it */ mark_key_instantiated(key, 0); notify_key(key, NOTIFY_KEY_UPDATED, 0); } up_write(&key->sem); if (ret < 0) goto error; out: return key_ref; error: key_put(key); key_ref = ERR_PTR(ret); goto out; } /* * Create or potentially update a key. 
The combined logic behind * key_create_or_update() and key_create() */ static key_ref_t __key_create_or_update(key_ref_t keyring_ref, const char *type, const char *description, const void *payload, size_t plen, key_perm_t perm, unsigned long flags, bool allow_update) { struct keyring_index_key index_key = { .description = description, }; struct key_preparsed_payload prep; struct assoc_array_edit *edit = NULL; const struct cred *cred = current_cred(); struct key *keyring, *key = NULL; key_ref_t key_ref; int ret; struct key_restriction *restrict_link = NULL; /* look up the key type to see if it's one of the registered kernel * types */ index_key.type = key_type_lookup(type); if (IS_ERR(index_key.type)) { key_ref = ERR_PTR(-ENODEV); goto error; } key_ref = ERR_PTR(-EINVAL); if (!index_key.type->instantiate || (!index_key.description && !index_key.type->preparse)) goto error_put_type; keyring = key_ref_to_ptr(keyring_ref); key_check(keyring); if (!(flags & KEY_ALLOC_BYPASS_RESTRICTION)) restrict_link = keyring->restrict_link; key_ref = ERR_PTR(-ENOTDIR); if (keyring->type != &key_type_keyring) goto error_put_type; memset(&prep, 0, sizeof(prep)); prep.orig_description = description; prep.data = payload; prep.datalen = plen; prep.quotalen = index_key.type->def_datalen; prep.expiry = TIME64_MAX; if (index_key.type->preparse) { ret = index_key.type->preparse(&prep); if (ret < 0) { key_ref = ERR_PTR(ret); goto error_free_prep; } if (!index_key.description) index_key.description = prep.description; key_ref = ERR_PTR(-EINVAL); if (!index_key.description) goto error_free_prep; } index_key.desc_len = strlen(index_key.description); key_set_index_key(&index_key); ret = __key_link_lock(keyring, &index_key); if (ret < 0) { key_ref = ERR_PTR(ret); goto error_free_prep; } ret = __key_link_begin(keyring, &index_key, &edit); if (ret < 0) { key_ref = ERR_PTR(ret); goto error_link_end; } if (restrict_link && restrict_link->check) { ret = restrict_link->check(keyring, index_key.type, &prep.payload, restrict_link->key); if (ret < 0) { key_ref = ERR_PTR(ret); goto error_link_end; } } /* if we're going to allocate a new key, we're going to have * to modify the keyring */ ret = key_permission(keyring_ref, KEY_NEED_WRITE); if (ret < 0) { key_ref = ERR_PTR(ret); goto error_link_end; } /* if it's requested and possible to update this type of key, search * for an existing key of the same type and description in the * destination keyring and update that instead if possible */ if (allow_update) { if (index_key.type->update) { key_ref = find_key_to_update(keyring_ref, &index_key); if (key_ref) goto found_matching_key; } } else { key_ref = find_key_to_update(keyring_ref, &index_key); if (key_ref) { key_ref_put(key_ref); key_ref = ERR_PTR(-EEXIST); goto error_link_end; } } /* if the client doesn't provide, decide on the permissions we want */ if (perm == KEY_PERM_UNDEF) { perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR; perm |= KEY_USR_VIEW; if (index_key.type->read) perm |= KEY_POS_READ; if (index_key.type == &key_type_keyring || index_key.type->update) perm |= KEY_POS_WRITE; } /* allocate a new key */ key = key_alloc(index_key.type, index_key.description, cred->fsuid, cred->fsgid, cred, perm, flags, NULL); if (IS_ERR(key)) { key_ref = ERR_CAST(key); goto error_link_end; } /* instantiate it and link it into the target keyring */ ret = __key_instantiate_and_link(key, &prep, keyring, NULL, &edit); if (ret < 0) { key_put(key); key_ref = ERR_PTR(ret); goto error_link_end; } 
security_key_post_create_or_update(keyring, key, payload, plen, flags, true); key_ref = make_key_ref(key, is_key_possessed(keyring_ref)); error_link_end: __key_link_end(keyring, &index_key, edit); error_free_prep: if (index_key.type->preparse) index_key.type->free_preparse(&prep); error_put_type: key_type_put(index_key.type); error: return key_ref; found_matching_key: /* we found a matching key, so we're going to try to update it * - we can drop the locks first as we have the key pinned */ __key_link_end(keyring, &index_key, edit); key = key_ref_to_ptr(key_ref); if (test_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags)) { ret = wait_for_key_construction(key, true); if (ret < 0) { key_ref_put(key_ref); key_ref = ERR_PTR(ret); goto error_free_prep; } } key_ref = __key_update(key_ref, &prep); if (!IS_ERR(key_ref)) security_key_post_create_or_update(keyring, key, payload, plen, flags, false); goto error_free_prep; } /** * key_create_or_update - Update or create and instantiate a key. * @keyring_ref: A pointer to the destination keyring with possession flag. * @type: The type of key. * @description: The searchable description for the key. * @payload: The data to use to instantiate or update the key. * @plen: The length of @payload. * @perm: The permissions mask for a new key. * @flags: The quota flags for a new key. * * Search the destination keyring for a key of the same description and if one * is found, update it, otherwise create and instantiate a new one and create a * link to it from that keyring. * * If perm is KEY_PERM_UNDEF then an appropriate key permissions mask will be * concocted. * * Returns a pointer to the new key if successful, -ENODEV if the key type * wasn't available, -ENOTDIR if the keyring wasn't a keyring, -EACCES if the * caller isn't permitted to modify the keyring or the LSM did not permit * creation of the key. * * On success, the possession flag from the keyring ref will be tacked on to * the key ref before it is returned. */ key_ref_t key_create_or_update(key_ref_t keyring_ref, const char *type, const char *description, const void *payload, size_t plen, key_perm_t perm, unsigned long flags) { return __key_create_or_update(keyring_ref, type, description, payload, plen, perm, flags, true); } EXPORT_SYMBOL(key_create_or_update); /** * key_create - Create and instantiate a key. * @keyring_ref: A pointer to the destination keyring with possession flag. * @type: The type of key. * @description: The searchable description for the key. * @payload: The data to use to instantiate or update the key. * @plen: The length of @payload. * @perm: The permissions mask for a new key. * @flags: The quota flags for a new key. * * Create and instantiate a new key and link to it from the destination keyring. * * If perm is KEY_PERM_UNDEF then an appropriate key permissions mask will be * concocted. * * Returns a pointer to the new key if successful, -EEXIST if a key with the * same description already exists, -ENODEV if the key type wasn't available, * -ENOTDIR if the keyring wasn't a keyring, -EACCES if the caller isn't * permitted to modify the keyring or the LSM did not permit creation of the * key. * * On success, the possession flag from the keyring ref will be tacked on to * the key ref before it is returned. 
*/ key_ref_t key_create(key_ref_t keyring_ref, const char *type, const char *description, const void *payload, size_t plen, key_perm_t perm, unsigned long flags) { return __key_create_or_update(keyring_ref, type, description, payload, plen, perm, flags, false); } EXPORT_SYMBOL(key_create); /** * key_update - Update a key's contents. * @key_ref: The pointer (plus possession flag) to the key. * @payload: The data to be used to update the key. * @plen: The length of @payload. * * Attempt to update the contents of a key with the given payload data. The * caller must be granted Write permission on the key. Negative keys can be * instantiated by this method. * * Returns 0 on success, -EACCES if not permitted and -EOPNOTSUPP if the key * type does not support updating. The key type may return other errors. */ int key_update(key_ref_t key_ref, const void *payload, size_t plen) { struct key_preparsed_payload prep; struct key *key = key_ref_to_ptr(key_ref); int ret; key_check(key); /* the key must be writable */ ret = key_permission(key_ref, KEY_NEED_WRITE); if (ret < 0) return ret; /* attempt to update it if supported */ if (!key->type->update) return -EOPNOTSUPP; memset(&prep, 0, sizeof(prep)); prep.data = payload; prep.datalen = plen; prep.quotalen = key->type->def_datalen; prep.expiry = TIME64_MAX; if (key->type->preparse) { ret = key->type->preparse(&prep); if (ret < 0) goto error; } down_write(&key->sem); ret = key->type->update(key, &prep); if (ret == 0) { /* Updating a negative key positively instantiates it */ mark_key_instantiated(key, 0); notify_key(key, NOTIFY_KEY_UPDATED, 0); } up_write(&key->sem); error: if (key->type->preparse) key->type->free_preparse(&prep); return ret; } EXPORT_SYMBOL(key_update); /** * key_revoke - Revoke a key. * @key: The key to be revoked. * * Mark a key as being revoked and ask the type to free up its resources. The * revocation timeout is set and the key and all its links will be * automatically garbage collected after key_gc_delay amount of time if they * are not manually dealt with first. */ void key_revoke(struct key *key) { time64_t time; key_check(key); /* make sure no one's trying to change or use the key when we mark it * - we tell lockdep that we might nest because we might be revoking an * authorisation key whilst holding the sem on a key we've just * instantiated */ down_write_nested(&key->sem, 1); if (!test_and_set_bit(KEY_FLAG_REVOKED, &key->flags)) { notify_key(key, NOTIFY_KEY_REVOKED, 0); if (key->type->revoke) key->type->revoke(key); /* set the death time to no more than the expiry time */ time = ktime_get_real_seconds(); if (key->revoked_at == 0 || key->revoked_at > time) { key->revoked_at = time; key_schedule_gc(key->revoked_at + key_gc_delay); } } up_write(&key->sem); } EXPORT_SYMBOL(key_revoke); /** * key_invalidate - Invalidate a key. * @key: The key to be invalidated. * * Mark a key as being invalidated and have it cleaned up immediately. The key * is ignored by all searches and other operations from this point. */ void key_invalidate(struct key *key) { kenter("%d", key_serial(key)); key_check(key); if (!test_bit(KEY_FLAG_INVALIDATED, &key->flags)) { down_write_nested(&key->sem, 1); if (!test_and_set_bit(KEY_FLAG_INVALIDATED, &key->flags)) { notify_key(key, NOTIFY_KEY_INVALIDATED, 0); key_schedule_gc_links(); } up_write(&key->sem); } } EXPORT_SYMBOL(key_invalidate); /** * generic_key_instantiate - Simple instantiation of a key from preparsed data * @key: The key to be instantiated * @prep: The preparsed data to load. 
* * Instantiate a key from preparsed data. We assume we can just copy the data * in directly and clear the old pointers. * * This can be pointed to directly by the key type instantiate op pointer. */ int generic_key_instantiate(struct key *key, struct key_preparsed_payload *prep) { int ret; pr_devel("==>%s()\n", __func__); ret = key_payload_reserve(key, prep->quotalen); if (ret == 0) { rcu_assign_keypointer(key, prep->payload.data[0]); key->payload.data[1] = prep->payload.data[1]; key->payload.data[2] = prep->payload.data[2]; key->payload.data[3] = prep->payload.data[3]; prep->payload.data[0] = NULL; prep->payload.data[1] = NULL; prep->payload.data[2] = NULL; prep->payload.data[3] = NULL; } pr_devel("<==%s() = %d\n", __func__, ret); return ret; } EXPORT_SYMBOL(generic_key_instantiate); /** * register_key_type - Register a type of key. * @ktype: The new key type. * * Register a new key type. * * Returns 0 on success or -EEXIST if a type of this name already exists. */ int register_key_type(struct key_type *ktype) { struct key_type *p; int ret; memset(&ktype->lock_class, 0, sizeof(ktype->lock_class)); ret = -EEXIST; down_write(&key_types_sem); /* disallow key types with the same name */ list_for_each_entry(p, &key_types_list, link) { if (strcmp(p->name, ktype->name) == 0) goto out; } /* store the type */ list_add(&ktype->link, &key_types_list); pr_notice("Key type %s registered\n", ktype->name); ret = 0; out: up_write(&key_types_sem); return ret; } EXPORT_SYMBOL(register_key_type); /** * unregister_key_type - Unregister a type of key. * @ktype: The key type. * * Unregister a key type and mark all the extant keys of this type as dead. * Those keys of this type are then destroyed to get rid of their payloads and * they and their links will be garbage collected as soon as possible. */ void unregister_key_type(struct key_type *ktype) { down_write(&key_types_sem); list_del_init(&ktype->link); downgrade_write(&key_types_sem); key_gc_keytype(ktype); pr_notice("Key type %s unregistered\n", ktype->name); up_read(&key_types_sem); } EXPORT_SYMBOL(unregister_key_type); /* * Initialise the key management state. */ void __init key_init(void) { /* allocate a slab in which we can store keys */ key_jar = kmem_cache_create("key_jar", sizeof(struct key), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); /* add the special key types */ list_add_tail(&key_type_keyring.link, &key_types_list); list_add_tail(&key_type_dead.link, &key_types_list); list_add_tail(&key_type_user.link, &key_types_list); list_add_tail(&key_type_logon.link, &key_types_list); /* record the root user tracking */ rb_link_node(&root_key_user.node, NULL, &key_user_tree.rb_node); rb_insert_color(&root_key_user.node, &key_user_tree); } |
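/*
 * Illustrative sketch (not part of key.c): how a minimal key type might use
 * the APIs above. The type name "example", the module wrapper and the
 * preparse/destroy callbacks are hypothetical; the point is the
 * register_key_type() / generic_key_instantiate() / unregister_key_type()
 * flow documented in the functions above. Keys of this type could then be
 * created via key_create_or_update() or the add_key(2) system call.
 */
#include <linux/key-type.h>
#include <linux/module.h>
#include <linux/slab.h>

static int example_preparse(struct key_preparsed_payload *prep)
{
	/* Copy the caller-supplied payload so instantiate can just steal it. */
	if (!prep->data || prep->datalen == 0 || prep->datalen > PAGE_SIZE)
		return -EINVAL;
	prep->payload.data[0] = kmemdup(prep->data, prep->datalen, GFP_KERNEL);
	if (!prep->payload.data[0])
		return -ENOMEM;
	prep->quotalen = prep->datalen;
	return 0;
}

static void example_free_preparse(struct key_preparsed_payload *prep)
{
	/* NULL if generic_key_instantiate() already took ownership. */
	kfree(prep->payload.data[0]);
}

static void example_destroy(struct key *key)
{
	kfree(key->payload.data[0]);
}

static struct key_type key_type_example = {
	.name		= "example",
	.preparse	= example_preparse,
	.free_preparse	= example_free_preparse,
	.instantiate	= generic_key_instantiate,
	.destroy	= example_destroy,
};

static int __init example_key_init(void)
{
	/* Fails with -EEXIST if a type named "example" already exists. */
	return register_key_type(&key_type_example);
}

static void __exit example_key_exit(void)
{
	unregister_key_type(&key_type_example);
}

module_init(example_key_init);
module_exit(example_key_exit);
MODULE_LICENSE("GPL");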
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kmem

#if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_KMEM_H

#include <linux/types.h>
#include <linux/tracepoint.h>
#include <trace/events/mmflags.h>

TRACE_EVENT(kmem_cache_alloc,

	TP_PROTO(unsigned long call_site,
		 const void *ptr,
		 struct kmem_cache *s,
		 gfp_t gfp_flags,
		 int node),

	TP_ARGS(call_site, ptr, s, gfp_flags, node),

	TP_STRUCT__entry(
		__field(	unsigned long,	call_site	)
		__field(	const void *,	ptr		)
		__string(	name,		s->name		)
		__field(	size_t,		bytes_req	)
		__field(	size_t,		bytes_alloc	)
		__field(	unsigned long,	gfp_flags	)
		__field(	int,		node		)
		__field(	bool,		accounted	)
	),

	TP_fast_assign(
		__entry->call_site	= call_site;
		__entry->ptr		= ptr;
		__assign_str(name);
		__entry->bytes_req	= s->object_size;
		__entry->bytes_alloc	= s->size;
		__entry->gfp_flags	= (__force unsigned long)gfp_flags;
		__entry->node		= node;
		__entry->accounted	= IS_ENABLED(CONFIG_MEMCG) ?
					  ((gfp_flags & __GFP_ACCOUNT) ||
					   (s->flags & SLAB_ACCOUNT)) : false;
	),

	TP_printk("call_site=%pS ptr=%p name=%s bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s",
		(void *)__entry->call_site,
		__entry->ptr,
		__get_str(name),
		__entry->bytes_req,
		__entry->bytes_alloc,
		show_gfp_flags(__entry->gfp_flags),
		__entry->node,
		__entry->accounted ?
"true" : "false") ); TRACE_EVENT(kmalloc, TP_PROTO(unsigned long call_site, const void *ptr, size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags, int node), TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), TP_STRUCT__entry( __field( unsigned long, call_site ) __field( const void *, ptr ) __field( size_t, bytes_req ) __field( size_t, bytes_alloc ) __field( unsigned long, gfp_flags ) __field( int, node ) ), TP_fast_assign( __entry->call_site = call_site; __entry->ptr = ptr; __entry->bytes_req = bytes_req; __entry->bytes_alloc = bytes_alloc; __entry->gfp_flags = (__force unsigned long)gfp_flags; __entry->node = node; ), TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s", (void *)__entry->call_site, __entry->ptr, __entry->bytes_req, __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags), __entry->node, (IS_ENABLED(CONFIG_MEMCG) && (__entry->gfp_flags & (__force unsigned long)__GFP_ACCOUNT)) ? "true" : "false") ); TRACE_EVENT(kfree, TP_PROTO(unsigned long call_site, const void *ptr), TP_ARGS(call_site, ptr), TP_STRUCT__entry( __field( unsigned long, call_site ) __field( const void *, ptr ) ), TP_fast_assign( __entry->call_site = call_site; __entry->ptr = ptr; ), TP_printk("call_site=%pS ptr=%p", (void *)__entry->call_site, __entry->ptr) ); TRACE_EVENT(kmem_cache_free, TP_PROTO(unsigned long call_site, const void *ptr, const struct kmem_cache *s), TP_ARGS(call_site, ptr, s), TP_STRUCT__entry( __field( unsigned long, call_site ) __field( const void *, ptr ) __string( name, s->name ) ), TP_fast_assign( __entry->call_site = call_site; __entry->ptr = ptr; __assign_str(name); ), TP_printk("call_site=%pS ptr=%p name=%s", (void *)__entry->call_site, __entry->ptr, __get_str(name)) ); TRACE_EVENT(mm_page_free, TP_PROTO(struct page *page, unsigned int order), TP_ARGS(page, order), TP_STRUCT__entry( __field( unsigned long, pfn ) __field( unsigned int, order ) ), TP_fast_assign( __entry->pfn = page_to_pfn(page); __entry->order = order; ), TP_printk("page=%p pfn=0x%lx order=%d", pfn_to_page(__entry->pfn), __entry->pfn, __entry->order) ); TRACE_EVENT(mm_page_free_batched, TP_PROTO(struct page *page), TP_ARGS(page), TP_STRUCT__entry( __field( unsigned long, pfn ) ), TP_fast_assign( __entry->pfn = page_to_pfn(page); ), TP_printk("page=%p pfn=0x%lx order=0", pfn_to_page(__entry->pfn), __entry->pfn) ); TRACE_EVENT(mm_page_alloc, TP_PROTO(struct page *page, unsigned int order, gfp_t gfp_flags, int migratetype), TP_ARGS(page, order, gfp_flags, migratetype), TP_STRUCT__entry( __field( unsigned long, pfn ) __field( unsigned int, order ) __field( unsigned long, gfp_flags ) __field( int, migratetype ) ), TP_fast_assign( __entry->pfn = page ? page_to_pfn(page) : -1UL; __entry->order = order; __entry->gfp_flags = (__force unsigned long)gfp_flags; __entry->migratetype = migratetype; ), TP_printk("page=%p pfn=0x%lx order=%d migratetype=%d gfp_flags=%s", __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL, __entry->pfn != -1UL ? __entry->pfn : 0, __entry->order, __entry->migratetype, show_gfp_flags(__entry->gfp_flags)) ); DECLARE_EVENT_CLASS(mm_page, TP_PROTO(struct page *page, unsigned int order, int migratetype, int percpu_refill), TP_ARGS(page, order, migratetype, percpu_refill), TP_STRUCT__entry( __field( unsigned long, pfn ) __field( unsigned int, order ) __field( int, migratetype ) __field( int, percpu_refill ) ), TP_fast_assign( __entry->pfn = page ? 
page_to_pfn(page) : -1UL; __entry->order = order; __entry->migratetype = migratetype; __entry->percpu_refill = percpu_refill; ), TP_printk("page=%p pfn=0x%lx order=%u migratetype=%d percpu_refill=%d", __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL, __entry->pfn != -1UL ? __entry->pfn : 0, __entry->order, __entry->migratetype, __entry->percpu_refill) ); DEFINE_EVENT(mm_page, mm_page_alloc_zone_locked, TP_PROTO(struct page *page, unsigned int order, int migratetype, int percpu_refill), TP_ARGS(page, order, migratetype, percpu_refill) ); TRACE_EVENT(mm_page_pcpu_drain, TP_PROTO(struct page *page, unsigned int order, int migratetype), TP_ARGS(page, order, migratetype), TP_STRUCT__entry( __field( unsigned long, pfn ) __field( unsigned int, order ) __field( int, migratetype ) ), TP_fast_assign( __entry->pfn = page ? page_to_pfn(page) : -1UL; __entry->order = order; __entry->migratetype = migratetype; ), TP_printk("page=%p pfn=0x%lx order=%d migratetype=%d", pfn_to_page(__entry->pfn), __entry->pfn, __entry->order, __entry->migratetype) ); TRACE_EVENT(mm_page_alloc_extfrag, TP_PROTO(struct page *page, int alloc_order, int fallback_order, int alloc_migratetype, int fallback_migratetype), TP_ARGS(page, alloc_order, fallback_order, alloc_migratetype, fallback_migratetype), TP_STRUCT__entry( __field( unsigned long, pfn ) __field( int, alloc_order ) __field( int, fallback_order ) __field( int, alloc_migratetype ) __field( int, fallback_migratetype ) __field( int, change_ownership ) ), TP_fast_assign( __entry->pfn = page_to_pfn(page); __entry->alloc_order = alloc_order; __entry->fallback_order = fallback_order; __entry->alloc_migratetype = alloc_migratetype; __entry->fallback_migratetype = fallback_migratetype; __entry->change_ownership = (alloc_migratetype == get_pageblock_migratetype(page)); ), TP_printk("page=%p pfn=0x%lx alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d", pfn_to_page(__entry->pfn), __entry->pfn, __entry->alloc_order, __entry->fallback_order, pageblock_order, __entry->alloc_migratetype, __entry->fallback_migratetype, __entry->fallback_order < pageblock_order, __entry->change_ownership) ); TRACE_EVENT(mm_setup_per_zone_wmarks, TP_PROTO(struct zone *zone), TP_ARGS(zone), TP_STRUCT__entry( __field(int, node_id) __string(name, zone->name) __field(unsigned long, watermark_min) __field(unsigned long, watermark_low) __field(unsigned long, watermark_high) __field(unsigned long, watermark_promo) ), TP_fast_assign( __entry->node_id = zone->zone_pgdat->node_id; __assign_str(name); __entry->watermark_min = zone->_watermark[WMARK_MIN]; __entry->watermark_low = zone->_watermark[WMARK_LOW]; __entry->watermark_high = zone->_watermark[WMARK_HIGH]; __entry->watermark_promo = zone->_watermark[WMARK_PROMO]; ), TP_printk("node_id=%d zone name=%s watermark min=%lu low=%lu high=%lu promo=%lu", __entry->node_id, __get_str(name), __entry->watermark_min, __entry->watermark_low, __entry->watermark_high, __entry->watermark_promo) ); TRACE_EVENT(mm_setup_per_zone_lowmem_reserve, TP_PROTO(struct zone *zone, struct zone *upper_zone, long lowmem_reserve), TP_ARGS(zone, upper_zone, lowmem_reserve), TP_STRUCT__entry( __field(int, node_id) __string(name, zone->name) __string(upper_name, upper_zone->name) __field(long, lowmem_reserve) ), TP_fast_assign( __entry->node_id = zone->zone_pgdat->node_id; __assign_str(name); __assign_str(upper_name); __entry->lowmem_reserve = lowmem_reserve; ), TP_printk("node_id=%d zone name=%s 
upper_zone name=%s lowmem_reserve_pages=%ld", __entry->node_id, __get_str(name), __get_str(upper_name), __entry->lowmem_reserve) ); TRACE_EVENT(mm_calculate_totalreserve_pages, TP_PROTO(unsigned long totalreserve_pages), TP_ARGS(totalreserve_pages), TP_STRUCT__entry( __field(unsigned long, totalreserve_pages) ), TP_fast_assign( __entry->totalreserve_pages = totalreserve_pages; ), TP_printk("totalreserve_pages=%lu", __entry->totalreserve_pages) ); /* * Required for uniquely and securely identifying mm in rss_stat tracepoint. */ #ifndef __PTR_TO_HASHVAL static unsigned int __maybe_unused mm_ptr_to_hash(const void *ptr) { int ret; unsigned long hashval; ret = ptr_to_hashval(ptr, &hashval); if (ret) return 0; /* The hashed value is only 32-bit */ return (unsigned int)hashval; } #define __PTR_TO_HASHVAL #endif #define TRACE_MM_PAGES \ EM(MM_FILEPAGES) \ EM(MM_ANONPAGES) \ EM(MM_SWAPENTS) \ EMe(MM_SHMEMPAGES) #undef EM #undef EMe #define EM(a) TRACE_DEFINE_ENUM(a); #define EMe(a) TRACE_DEFINE_ENUM(a); TRACE_MM_PAGES #undef EM #undef EMe #define EM(a) { a, #a }, #define EMe(a) { a, #a } TRACE_EVENT(rss_stat, TP_PROTO(struct mm_struct *mm, int member), TP_ARGS(mm, member), TP_STRUCT__entry( __field(unsigned int, mm_id) __field(unsigned int, curr) __field(int, member) __field(long, size) ), TP_fast_assign( __entry->mm_id = mm_ptr_to_hash(mm); /* * curr is true if the mm matches the current task's mm_struct. * Since kthreads (PF_KTHREAD) have no mm_struct of their own * but can borrow one via kthread_use_mm(), we must filter them * out to avoid incorrectly attributing the RSS update to them. */ __entry->curr = current->mm == mm && !(current->flags & PF_KTHREAD); __entry->member = member; __entry->size = (percpu_counter_sum_positive(&mm->rss_stat[member]) << PAGE_SHIFT); ), TP_printk("mm_id=%u curr=%d type=%s size=%ldB", __entry->mm_id, __entry->curr, __print_symbolic(__entry->member, TRACE_MM_PAGES), __entry->size) ); #endif /* _TRACE_KMEM_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
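/*
 * Illustrative sketch (not part of the original header): the general shape of
 * a tracepoint header built with TRACE_EVENT(), mirroring the events above.
 * The subsystem "examplesys", the event name "example_alloc" and its fields
 * are hypothetical; what matters is the TP_PROTO / TP_ARGS / TP_STRUCT__entry
 * / TP_fast_assign / TP_printk structure. A call site would then invoke
 * trace_example_alloc(ptr, bytes), and the event would appear under
 * /sys/kernel/tracing/events/examplesys/.
 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM examplesys

#if !defined(_TRACE_EXAMPLESYS_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_EXAMPLESYS_H

#include <linux/tracepoint.h>

TRACE_EVENT(example_alloc,

	/* Arguments the tracepoint call site passes in. */
	TP_PROTO(const void *ptr, size_t bytes),

	TP_ARGS(ptr, bytes),

	/* What gets recorded in the ring buffer for each hit. */
	TP_STRUCT__entry(
		__field(	const void *,	ptr	)
		__field(	size_t,		bytes	)
	),

	/* How the arguments are copied into the ring-buffer entry. */
	TP_fast_assign(
		__entry->ptr	= ptr;
		__entry->bytes	= bytes;
	),

	/* How the entry is rendered in the trace output. */
	TP_printk("ptr=%p bytes=%zu", __entry->ptr, __entry->bytes)
);

#endif /* _TRACE_EXAMPLESYS_H */

/* This part must be outside protection */
#include <trace/define_trace.h>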
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corporation, 2021
 *
 * Author: Mike Rapoport <rppt@linux.ibm.com>
 */

#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/swap.h>
#include <linux/mount.h>
#include <linux/memfd.h>
#include <linux/bitops.h>
#include <linux/printk.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/pseudo_fs.h>
#include <linux/secretmem.h>
#include <linux/set_memory.h>
#include <linux/sched/signal.h>

#include <uapi/linux/magic.h>

#include <asm/tlbflush.h>

#include "internal.h"

#undef pr_fmt
#define pr_fmt(fmt) "secretmem: " fmt

/*
 * Define mode and flag masks to allow validation of the system call
 * parameters.
 */
#define SECRETMEM_MODE_MASK	(0x0)
#define SECRETMEM_FLAGS_MASK	SECRETMEM_MODE_MASK

static bool secretmem_enable __ro_after_init = 1;
module_param_named(enable, secretmem_enable, bool, 0400);
MODULE_PARM_DESC(secretmem_enable,
		 "Enable secretmem and memfd_secret(2) system call");

static atomic_t secretmem_users;

bool secretmem_active(void)
{
	return !!atomic_read(&secretmem_users);
}

static vm_fault_t secretmem_fault(struct vm_fault *vmf)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	struct inode *inode = file_inode(vmf->vma->vm_file);
	pgoff_t offset = vmf->pgoff;
	gfp_t gfp = vmf->gfp_mask;
	unsigned long addr;
	struct folio *folio;
	vm_fault_t ret;
	int err;

	if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
		return vmf_error(-EINVAL);

	filemap_invalidate_lock_shared(mapping);

retry:
	folio = filemap_lock_folio(mapping, offset);
	if (IS_ERR(folio)) {
		folio = folio_alloc(gfp | __GFP_ZERO, 0);
		if (!folio) {
			ret = VM_FAULT_OOM;
			goto out;
		}

		err = set_direct_map_invalid_noflush(folio_page(folio, 0));
		if (err) {
			folio_put(folio);
			ret = vmf_error(err);
			goto out;
		}

		__folio_mark_uptodate(folio);
		err = filemap_add_folio(mapping, folio, offset, gfp);
		if (unlikely(err)) {
			/*
			 * If a split of large page was required, it
			 * already happened when we marked the page invalid
			 * which guarantees that this call won't fail
			 */
			set_direct_map_default_noflush(folio_page(folio, 0));
			folio_put(folio);
			if (err == -EEXIST)
				goto retry;

			ret = vmf_error(err);
			goto out;
		}

		addr = (unsigned long)folio_address(folio);
		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
	}

	vmf->page = folio_file_page(folio, vmf->pgoff);
	ret = VM_FAULT_LOCKED;

out:
	filemap_invalidate_unlock_shared(mapping);
	return ret;
}

static const struct vm_operations_struct secretmem_vm_ops = {
	.fault = secretmem_fault,
};

static int secretmem_release(struct inode *inode, struct file *file)
{
atomic_dec(&secretmem_users); return 0; } static int secretmem_mmap_prepare(struct vm_area_desc *desc) { const unsigned long len = vma_desc_size(desc); if (!vma_desc_test_any(desc, VMA_SHARED_BIT, VMA_MAYSHARE_BIT)) return -EINVAL; vma_desc_set_flags(desc, VMA_LOCKED_BIT, VMA_DONTDUMP_BIT); if (!mlock_future_ok(desc->mm, /*is_vma_locked=*/ true, len)) return -EAGAIN; desc->vm_ops = &secretmem_vm_ops; return 0; } bool vma_is_secretmem(struct vm_area_struct *vma) { return vma->vm_ops == &secretmem_vm_ops; } static const struct file_operations secretmem_fops = { .release = secretmem_release, .mmap_prepare = secretmem_mmap_prepare, }; static int secretmem_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { return -EBUSY; } static void secretmem_free_folio(struct folio *folio) { set_direct_map_default_noflush(folio_page(folio, 0)); folio_zero_segment(folio, 0, folio_size(folio)); } const struct address_space_operations secretmem_aops = { .dirty_folio = noop_dirty_folio, .free_folio = secretmem_free_folio, .migrate_folio = secretmem_migrate_folio, }; static int secretmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); struct address_space *mapping = inode->i_mapping; unsigned int ia_valid = iattr->ia_valid; int ret; filemap_invalidate_lock(mapping); if ((ia_valid & ATTR_SIZE) && inode->i_size) ret = -EINVAL; else ret = simple_setattr(idmap, dentry, iattr); filemap_invalidate_unlock(mapping); return ret; } static const struct inode_operations secretmem_iops = { .setattr = secretmem_setattr, }; static struct vfsmount *secretmem_mnt; static struct file *secretmem_file_create(unsigned long flags) { struct file *file; struct inode *inode; const char *anon_name = "[secretmem]"; inode = anon_inode_make_secure_inode(secretmem_mnt->mnt_sb, anon_name, NULL); if (IS_ERR(inode)) return ERR_CAST(inode); file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem", O_RDWR | O_LARGEFILE, &secretmem_fops); if (IS_ERR(file)) goto err_free_inode; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); inode->i_op = &secretmem_iops; inode->i_mapping->a_ops = &secretmem_aops; /* pretend we are a normal file with zero size */ inode->i_mode |= S_IFREG; inode->i_size = 0; atomic_inc(&secretmem_users); return file; err_free_inode: iput(inode); return file; } SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) { /* make sure local flags do not conflict with global fcntl.h */ BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC); if (!secretmem_enable || !can_set_direct_map()) return -ENOSYS; if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC)) return -EINVAL; if (atomic_read(&secretmem_users) < 0) return -ENFILE; return FD_ADD(flags & O_CLOEXEC, secretmem_file_create(flags)); } static int secretmem_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx; ctx = init_pseudo(fc, SECRETMEM_MAGIC); if (!ctx) return -ENOMEM; fc->s_iflags |= SB_I_NOEXEC; fc->s_iflags |= SB_I_NODEV; return 0; } static struct file_system_type secretmem_fs = { .name = "secretmem", .init_fs_context = secretmem_init_fs_context, .kill_sb = kill_anon_super, }; static int __init secretmem_init(void) { if (!secretmem_enable || !can_set_direct_map()) return 0; secretmem_mnt = kern_mount(&secretmem_fs); if (IS_ERR(secretmem_mnt)) return PTR_ERR(secretmem_mnt); return 0; } fs_initcall(secretmem_init); |
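/*
 * Illustrative sketch (not part of the original file): how userspace might
 * exercise the memfd_secret(2) system call implemented above. Error handling
 * is minimal, and no glibc wrapper is assumed, so the raw syscall number
 * (SYS_memfd_secret, available in recent kernel/libc headers) is used.
 * The flow mirrors secretmem_file_create() and secretmem_fault(): create the
 * fd, size the inode with ftruncate(), then map it MAP_SHARED. Note that the
 * mapping counts against RLIMIT_MEMLOCK, so mmap() may fail with EAGAIN if
 * the locked-memory limit is too small.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>

int main(void)
{
	/* No flags are defined apart from O_CLOEXEC (SECRETMEM_FLAGS_MASK is 0). */
	int fd = syscall(SYS_memfd_secret, 0);
	if (fd < 0) {
		perror("memfd_secret");	/* ENOSYS if secretmem is disabled */
		return 1;
	}

	/* The inode starts at size 0; it must be grown before it can be mapped. */
	if (ftruncate(fd, 4096) < 0) {
		perror("ftruncate");
		return 1;
	}

	/* Pages faulted in here are removed from the kernel direct map. */
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	strcpy(p, "secret");
	printf("stored %zu bytes in secret memory\n", strlen(p));

	munmap(p, 4096);
	close(fd);
	return 0;
}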
// SPDX-License-Identifier: GPL-2.0
/*
 * drivers/base/dd.c - The core device/driver interactions.
 *
 * This file contains the (sometimes tricky) code that controls the
 * interactions between devices and drivers, which primarily includes
 * driver binding and unbinding.
 *
 * All of this code used to exist in drivers/base/bus.c, but was
 * relocated here in the name of compartmentalization (since it wasn't
 * strictly code just for the 'struct bus_type').
 *
 * Copyright (c) 2002-5 Patrick Mochel
 * Copyright (c) 2002-3 Open Source Development Labs
 * Copyright (c) 2007-2009 Greg Kroah-Hartman <gregkh@suse.de>
 * Copyright (c) 2007-2009 Novell Inc.
*/ #include <linux/debugfs.h> #include <linux/device.h> #include <linux/delay.h> #include <linux/dma-map-ops.h> #include <linux/init.h> #include <linux/module.h> #include <linux/kthread.h> #include <linux/wait.h> #include <linux/async.h> #include <linux/pm_domain.h> #include <linux/pm_runtime.h> #include <linux/pinctrl/devinfo.h> #include <linux/slab.h> #include "base.h" #include "power/power.h" /* * Deferred Probe infrastructure. * * Sometimes driver probe order matters, but the kernel doesn't always have * dependency information which means some drivers will get probed before a * resource it depends on is available. For example, an SDHCI driver may * first need a GPIO line from an i2c GPIO controller before it can be * initialized. If a required resource is not available yet, a driver can * request probing to be deferred by returning -EPROBE_DEFER from its probe hook * * Deferred probe maintains two lists of devices, a pending list and an active * list. A driver returning -EPROBE_DEFER causes the device to be added to the * pending list. A successful driver probe will trigger moving all devices * from the pending to the active list so that the workqueue will eventually * retry them. * * The deferred_probe_mutex must be held any time the deferred_probe_*_list * of the (struct device*)->p->deferred_probe pointers are manipulated */ static DEFINE_MUTEX(deferred_probe_mutex); static LIST_HEAD(deferred_probe_pending_list); static LIST_HEAD(deferred_probe_active_list); static atomic_t deferred_trigger_count = ATOMIC_INIT(0); static bool initcalls_done; /* Save the async probe drivers' name from kernel cmdline */ #define ASYNC_DRV_NAMES_MAX_LEN 256 static char async_probe_drv_names[ASYNC_DRV_NAMES_MAX_LEN]; static bool async_probe_default; /* * In some cases, like suspend to RAM or hibernation, It might be reasonable * to prohibit probing of devices as it could be unsafe. * Once defer_all_probes is true all drivers probes will be forcibly deferred. */ static bool defer_all_probes; static void __device_set_deferred_probe_reason(const struct device *dev, char *reason) { kfree(dev->p->deferred_probe_reason); dev->p->deferred_probe_reason = reason; } /* * deferred_probe_work_func() - Retry probing devices in the active list. */ static void deferred_probe_work_func(struct work_struct *work) { struct device *dev; struct device_private *private; /* * This block processes every device in the deferred 'active' list. * Each device is removed from the active list and passed to * bus_probe_device() to re-attempt the probe. The loop continues * until every device in the active list is removed and retried. * * Note: Once the device is removed from the list and the mutex is * released, it is possible for the device get freed by another thread * and cause a illegal pointer dereference. This code uses * get/put_device() to ensure the device structure cannot disappear * from under our feet. 
*/ mutex_lock(&deferred_probe_mutex); while (!list_empty(&deferred_probe_active_list)) { private = list_first_entry(&deferred_probe_active_list, typeof(*dev->p), deferred_probe); dev = private->device; list_del_init(&private->deferred_probe); get_device(dev); __device_set_deferred_probe_reason(dev, NULL); /* * Drop the mutex while probing each device; the probe path may * manipulate the deferred list */ mutex_unlock(&deferred_probe_mutex); /* * Force the device to the end of the dpm_list since * the PM code assumes that the order we add things to * the list is a good order for suspend but deferred * probe makes that very unsafe. */ device_pm_move_to_tail(dev); dev_dbg(dev, "Retrying from deferred list\n"); bus_probe_device(dev); mutex_lock(&deferred_probe_mutex); put_device(dev); } mutex_unlock(&deferred_probe_mutex); } static DECLARE_WORK(deferred_probe_work, deferred_probe_work_func); void driver_deferred_probe_add(struct device *dev) { if (!dev->can_match) return; mutex_lock(&deferred_probe_mutex); if (list_empty(&dev->p->deferred_probe)) { dev_dbg(dev, "Added to deferred list\n"); list_add_tail(&dev->p->deferred_probe, &deferred_probe_pending_list); } mutex_unlock(&deferred_probe_mutex); } void driver_deferred_probe_del(struct device *dev) { mutex_lock(&deferred_probe_mutex); if (!list_empty(&dev->p->deferred_probe)) { dev_dbg(dev, "Removed from deferred list\n"); list_del_init(&dev->p->deferred_probe); __device_set_deferred_probe_reason(dev, NULL); } mutex_unlock(&deferred_probe_mutex); } static bool driver_deferred_probe_enable; /** * driver_deferred_probe_trigger() - Kick off re-probing deferred devices * * This functions moves all devices from the pending list to the active * list and schedules the deferred probe workqueue to process them. It * should be called anytime a driver is successfully bound to a device. * * Note, there is a race condition in multi-threaded probe. In the case where * more than one device is probing at the same time, it is possible for one * probe to complete successfully while another is about to defer. If the second * depends on the first, then it will get put on the pending list after the * trigger event has already occurred and will be stuck there. * * The atomic 'deferred_trigger_count' is used to determine if a successful * trigger has occurred in the midst of probing a driver. If the trigger count * changes in the midst of a probe, then deferred processing should be triggered * again. */ void driver_deferred_probe_trigger(void) { if (!driver_deferred_probe_enable) return; /* * A successful probe means that all the devices in the pending list * should be triggered to be reprobed. Move all the deferred devices * into the active list so they can be retried by the workqueue */ mutex_lock(&deferred_probe_mutex); atomic_inc(&deferred_trigger_count); list_splice_tail_init(&deferred_probe_pending_list, &deferred_probe_active_list); mutex_unlock(&deferred_probe_mutex); /* * Kick the re-probe thread. It may already be scheduled, but it is * safe to kick it again. */ queue_work(system_dfl_wq, &deferred_probe_work); } /** * device_block_probing() - Block/defer device's probes * * It will disable probing of devices and defer their probes instead. */ void device_block_probing(void) { defer_all_probes = true; /* sync with probes to avoid races. */ wait_for_device_probe(); } /** * device_unblock_probing() - Unblock/enable device's probes * * It will restore normal behavior and trigger re-probing of deferred * devices. 
*/ void device_unblock_probing(void) { defer_all_probes = false; driver_deferred_probe_trigger(); } /** * device_set_deferred_probe_reason() - Set defer probe reason message for device * @dev: the pointer to the struct device * @vaf: the pointer to va_format structure with message */ void device_set_deferred_probe_reason(const struct device *dev, struct va_format *vaf) { const char *drv = dev_driver_string(dev); char *reason; mutex_lock(&deferred_probe_mutex); reason = kasprintf(GFP_KERNEL, "%s: %pV", drv, vaf); __device_set_deferred_probe_reason(dev, reason); mutex_unlock(&deferred_probe_mutex); } /* * deferred_devs_show() - Show the devices in the deferred probe pending list. */ static int deferred_devs_show(struct seq_file *s, void *data) { struct device_private *curr; mutex_lock(&deferred_probe_mutex); list_for_each_entry(curr, &deferred_probe_pending_list, deferred_probe) seq_printf(s, "%s\t%s", dev_name(curr->device), curr->deferred_probe_reason ?: "\n"); mutex_unlock(&deferred_probe_mutex); return 0; } DEFINE_SHOW_ATTRIBUTE(deferred_devs); static int driver_deferred_probe_timeout = CONFIG_DRIVER_DEFERRED_PROBE_TIMEOUT; static int __init deferred_probe_timeout_setup(char *str) { int timeout; if (!kstrtoint(str, 10, &timeout)) driver_deferred_probe_timeout = timeout; return 1; } __setup("deferred_probe_timeout=", deferred_probe_timeout_setup); /** * driver_deferred_probe_check_state() - Check deferred probe state * @dev: device to check * * Return: * * -ENODEV if initcalls have completed and modules are disabled. * * -ETIMEDOUT if the deferred probe timeout was set and has expired * and modules are enabled. * * -EPROBE_DEFER in other cases. * * Drivers or subsystems can opt-in to calling this function instead of directly * returning -EPROBE_DEFER. */ int driver_deferred_probe_check_state(struct device *dev) { if (!IS_ENABLED(CONFIG_MODULES) && initcalls_done) { dev_warn(dev, "ignoring dependency for device, assuming no driver\n"); return -ENODEV; } if (!driver_deferred_probe_timeout && initcalls_done) { dev_warn(dev, "deferred probe timeout, ignoring dependency\n"); return -ETIMEDOUT; } return -EPROBE_DEFER; } EXPORT_SYMBOL_GPL(driver_deferred_probe_check_state); static void deferred_probe_timeout_work_func(struct work_struct *work) { struct device_private *p; fw_devlink_drivers_done(); driver_deferred_probe_timeout = 0; driver_deferred_probe_trigger(); flush_work(&deferred_probe_work); mutex_lock(&deferred_probe_mutex); list_for_each_entry(p, &deferred_probe_pending_list, deferred_probe) dev_warn(p->device, "deferred probe pending: %s", p->deferred_probe_reason ?: "(reason unknown)\n"); mutex_unlock(&deferred_probe_mutex); fw_devlink_probing_done(); } static DECLARE_DELAYED_WORK(deferred_probe_timeout_work, deferred_probe_timeout_work_func); void deferred_probe_extend_timeout(void) { /* * If the work hasn't been queued yet or if the work expired, don't * start a new one. */ if (cancel_delayed_work(&deferred_probe_timeout_work)) { schedule_delayed_work(&deferred_probe_timeout_work, driver_deferred_probe_timeout * HZ); pr_debug("Extended deferred probe timeout by %d secs\n", driver_deferred_probe_timeout); } } /** * deferred_probe_initcall() - Enable probing of deferred devices * * We don't want to get in the way when the bulk of drivers are getting probed. * Instead, this initcall makes sure that deferred probing is delayed until * late_initcall time. 
*/ static int deferred_probe_initcall(void) { debugfs_create_file("devices_deferred", 0444, NULL, NULL, &deferred_devs_fops); driver_deferred_probe_enable = true; driver_deferred_probe_trigger(); /* Sort as many dependencies as possible before exiting initcalls */ flush_work(&deferred_probe_work); initcalls_done = true; if (!IS_ENABLED(CONFIG_MODULES)) fw_devlink_drivers_done(); /* * Trigger deferred probe again, this time we won't defer anything * that is optional */ driver_deferred_probe_trigger(); flush_work(&deferred_probe_work); if (driver_deferred_probe_timeout > 0) { schedule_delayed_work(&deferred_probe_timeout_work, driver_deferred_probe_timeout * HZ); } if (!IS_ENABLED(CONFIG_MODULES)) fw_devlink_probing_done(); return 0; } late_initcall(deferred_probe_initcall); static void __exit deferred_probe_exit(void) { debugfs_lookup_and_remove("devices_deferred", NULL); } __exitcall(deferred_probe_exit); int __device_set_driver_override(struct device *dev, const char *s, size_t len) { const char *new = NULL, *old; if (!s) return -EINVAL; /* * The stored value will be used in sysfs show callback (sysfs_emit()), * which has a length limit of PAGE_SIZE and adds a trailing newline. * Thus we can store one character less to avoid truncation during sysfs * show. */ if (len >= (PAGE_SIZE - 1)) return -EINVAL; /* * Compute the real length of the string in case userspace sends us a * bunch of \0 characters like python likes to do. */ len = strlen(s); /* Handle trailing newline */ if (len) { char *cp; cp = strnchr(s, len, '\n'); if (cp) len = cp - s; } /* * If empty string or "\n" passed, new remains NULL, clearing * the driver_override.name. */ if (len) { new = kstrndup(s, len, GFP_KERNEL); if (!new) return -ENOMEM; } scoped_guard(spinlock, &dev->driver_override.lock) { old = dev->driver_override.name; dev->driver_override.name = new; } kfree(old); return 0; } EXPORT_SYMBOL_GPL(__device_set_driver_override); /** * device_is_bound() - Check if device is bound to a driver * @dev: device to check * * Returns true if passed device has already finished probing successfully * against a driver. * * This function must be called with the device lock held. 
*/ bool device_is_bound(struct device *dev) { return dev->p && klist_node_attached(&dev->p->knode_driver); } EXPORT_SYMBOL_GPL(device_is_bound); static void driver_bound(struct device *dev) { if (device_is_bound(dev)) { dev_warn(dev, "%s: device already bound\n", __func__); return; } dev_dbg(dev, "driver: '%s': %s: bound to device\n", dev->driver->name, __func__); klist_add_tail(&dev->p->knode_driver, &dev->driver->p->klist_devices); device_links_driver_bound(dev); device_pm_check_callbacks(dev); /* * Make sure the device is no longer in one of the deferred lists and * kick off retrying all pending devices */ driver_deferred_probe_del(dev); driver_deferred_probe_trigger(); bus_notify(dev, BUS_NOTIFY_BOUND_DRIVER); kobject_uevent(&dev->kobj, KOBJ_BIND); } static ssize_t coredump_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { device_lock(dev); dev->driver->coredump(dev); device_unlock(dev); return count; } static DEVICE_ATTR_WO(coredump); static int driver_sysfs_add(struct device *dev) { int ret; bus_notify(dev, BUS_NOTIFY_BIND_DRIVER); ret = sysfs_create_link(&dev->driver->p->kobj, &dev->kobj, kobject_name(&dev->kobj)); if (ret) goto fail; ret = sysfs_create_link(&dev->kobj, &dev->driver->p->kobj, "driver"); if (ret) goto rm_dev; if (!IS_ENABLED(CONFIG_DEV_COREDUMP) || !dev->driver->coredump) return 0; ret = device_create_file(dev, &dev_attr_coredump); if (!ret) return 0; sysfs_remove_link(&dev->kobj, "driver"); rm_dev: sysfs_remove_link(&dev->driver->p->kobj, kobject_name(&dev->kobj)); fail: return ret; } static void driver_sysfs_remove(struct device *dev) { struct device_driver *drv = dev->driver; if (drv) { if (drv->coredump) device_remove_file(dev, &dev_attr_coredump); sysfs_remove_link(&drv->p->kobj, kobject_name(&dev->kobj)); sysfs_remove_link(&dev->kobj, "driver"); } } /** * device_bind_driver - bind a driver to one device. * @dev: device. * * Allow manual attachment of a driver to a device. * Caller must have already set @dev->driver. * * Note that this does not modify the bus reference count. * Please verify that is accounted for before calling this. * (It is ok to call with no other effort from a driver's probe() method.) * * This function must be called with the device lock held. * * Callers should prefer to use device_driver_attach() instead. */ int device_bind_driver(struct device *dev) { int ret; ret = driver_sysfs_add(dev); if (!ret) { device_links_force_bind(dev); driver_bound(dev); } else bus_notify(dev, BUS_NOTIFY_DRIVER_NOT_BOUND); return ret; } EXPORT_SYMBOL_GPL(device_bind_driver); static atomic_t probe_count = ATOMIC_INIT(0); static DECLARE_WAIT_QUEUE_HEAD(probe_waitqueue); static ssize_t state_synced_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int ret = 0; if (strcmp("1", buf)) return -EINVAL; device_lock(dev); if (!dev->state_synced) { dev->state_synced = true; dev_sync_state(dev); } else { ret = -EINVAL; } device_unlock(dev); return ret ? 
ret : count; } static ssize_t state_synced_show(struct device *dev, struct device_attribute *attr, char *buf) { bool val; device_lock(dev); val = dev->state_synced; device_unlock(dev); return sysfs_emit(buf, "%u\n", val); } static DEVICE_ATTR_RW(state_synced); static void device_unbind_cleanup(struct device *dev) { devres_release_all(dev); if (dev->driver->p_cb.post_unbind_rust) dev->driver->p_cb.post_unbind_rust(dev); arch_teardown_dma_ops(dev); kfree(dev->dma_range_map); dev->dma_range_map = NULL; device_set_driver(dev, NULL); dev_set_drvdata(dev, NULL); dev_pm_domain_detach(dev, dev->power.detach_power_off); if (dev->pm_domain && dev->pm_domain->dismiss) dev->pm_domain->dismiss(dev); pm_runtime_reinit(dev); dev_pm_set_driver_flags(dev, 0); } static void device_remove(struct device *dev) { device_remove_file(dev, &dev_attr_state_synced); device_remove_groups(dev, dev->driver->dev_groups); if (dev->bus && dev->bus->remove) dev->bus->remove(dev); else if (dev->driver->remove) dev->driver->remove(dev); } static int call_driver_probe(struct device *dev, const struct device_driver *drv) { int ret = 0; if (dev->bus->probe) ret = dev->bus->probe(dev); else if (drv->probe) ret = drv->probe(dev); switch (ret) { case 0: break; case -EPROBE_DEFER: /* Driver requested deferred probing */ dev_dbg(dev, "Driver %s requests probe deferral\n", drv->name); break; case -ENODEV: case -ENXIO: dev_dbg(dev, "probe with driver %s rejects match %d\n", drv->name, ret); break; default: /* driver matched but the probe failed */ dev_err(dev, "probe with driver %s failed with error %d\n", drv->name, ret); break; } return ret; } static int really_probe(struct device *dev, const struct device_driver *drv) { bool test_remove = IS_ENABLED(CONFIG_DEBUG_TEST_DRIVER_REMOVE) && !drv->suppress_bind_attrs; int ret, link_ret; if (defer_all_probes) { /* * Value of defer_all_probes can be set only by * device_block_probing() which, in turn, will call * wait_for_device_probe() right after that to avoid any races. */ dev_dbg(dev, "Driver %s force probe deferral\n", drv->name); return -EPROBE_DEFER; } link_ret = device_links_check_suppliers(dev); if (link_ret == -EPROBE_DEFER) return link_ret; dev_dbg(dev, "bus: '%s': %s: probing driver %s with device\n", drv->bus->name, __func__, drv->name); if (!list_empty(&dev->devres_head)) { dev_crit(dev, "Resources present before probing\n"); ret = -EBUSY; goto done; } re_probe: device_set_driver(dev, drv); /* If using pinctrl, bind pins now before probing */ ret = pinctrl_bind_pins(dev); if (ret) goto pinctrl_bind_failed; if (dev->bus->dma_configure) { ret = dev->bus->dma_configure(dev); if (ret) goto pinctrl_bind_failed; } ret = driver_sysfs_add(dev); if (ret) { dev_err(dev, "%s: driver_sysfs_add failed\n", __func__); goto sysfs_failed; } if (dev->pm_domain && dev->pm_domain->activate) { ret = dev->pm_domain->activate(dev); if (ret) goto probe_failed; } ret = call_driver_probe(dev, drv); if (ret) { /* * If fw_devlink_best_effort is active (denoted by -EAGAIN), the * device might actually probe properly once some of its missing * suppliers have probed. So, treat this as if the driver * returned -EPROBE_DEFER. */ if (link_ret == -EAGAIN) ret = -EPROBE_DEFER; /* * Return probe errors as positive values so that the callers * can distinguish them from other errors. 
*/ ret = -ret; goto probe_failed; } ret = device_add_groups(dev, drv->dev_groups); if (ret) { dev_err(dev, "device_add_groups() failed\n"); goto dev_groups_failed; } if (dev_has_sync_state(dev)) { ret = device_create_file(dev, &dev_attr_state_synced); if (ret) { dev_err(dev, "state_synced sysfs add failed\n"); goto dev_sysfs_state_synced_failed; } } if (test_remove) { test_remove = false; device_remove(dev); driver_sysfs_remove(dev); if (dev->bus && dev->bus->dma_cleanup) dev->bus->dma_cleanup(dev); device_unbind_cleanup(dev); goto re_probe; } pinctrl_init_done(dev); if (dev->pm_domain && dev->pm_domain->sync) dev->pm_domain->sync(dev); driver_bound(dev); dev_dbg(dev, "bus: '%s': %s: bound device to driver %s\n", drv->bus->name, __func__, drv->name); goto done; dev_sysfs_state_synced_failed: dev_groups_failed: device_remove(dev); probe_failed: driver_sysfs_remove(dev); sysfs_failed: bus_notify(dev, BUS_NOTIFY_DRIVER_NOT_BOUND); if (dev->bus && dev->bus->dma_cleanup) dev->bus->dma_cleanup(dev); pinctrl_bind_failed: device_links_no_driver(dev); device_unbind_cleanup(dev); done: return ret; } /* * For initcall_debug, show the driver probe time. */ static int really_probe_debug(struct device *dev, const struct device_driver *drv) { ktime_t calltime, rettime; int ret; calltime = ktime_get(); ret = really_probe(dev, drv); rettime = ktime_get(); /* * Don't change this to pr_debug() because that requires * CONFIG_DYNAMIC_DEBUG and we want a simple 'initcall_debug' on the * kernel commandline to print this all the time at the debug level. */ printk(KERN_DEBUG "probe of %s returned %d after %lld usecs\n", dev_name(dev), ret, ktime_us_delta(rettime, calltime)); return ret; } /** * driver_probe_done * Determine if the probe sequence is finished or not. * * Should somehow figure out how to use a semaphore, not an atomic variable... */ bool __init driver_probe_done(void) { int local_probe_count = atomic_read(&probe_count); pr_debug("%s: probe_count = %d\n", __func__, local_probe_count); return !local_probe_count; } /** * wait_for_device_probe * Wait for device probing to be completed. */ void wait_for_device_probe(void) { /* wait for the deferred probe workqueue to finish */ flush_work(&deferred_probe_work); /* wait for the known devices to complete their probing */ wait_event(probe_waitqueue, atomic_read(&probe_count) == 0); async_synchronize_full(); } EXPORT_SYMBOL_GPL(wait_for_device_probe); static int __driver_probe_device(const struct device_driver *drv, struct device *dev) { int ret = 0; if (dev->p->dead || !device_is_registered(dev)) return -ENODEV; if (dev->driver) return -EBUSY; /* * In device_add(), the "struct device" gets linked into the subsystem's * list of devices and broadcast to userspace (via uevent) before we're * quite ready to probe. Those open pathways to driver probe before * we've finished enough of device_add() to reliably support probe. * Detect this and tell other pathways to try again later. device_add() * itself will also try to probe immediately after setting * "ready_to_probe". */ if (!dev_ready_to_probe(dev)) return dev_err_probe(dev, -EPROBE_DEFER, "Device not ready to probe\n"); /* * Set can_match = true after calling dev_ready_to_probe(), so * driver_deferred_probe_add() won't actually add the device to the * deferred probe list when dev_ready_to_probe() returns false. * * When dev_ready_to_probe() returns false, it means that device_add() * will do another probe() attempt for us. 
*/ dev->can_match = true; dev_dbg(dev, "bus: '%s': %s: matched device with driver %s\n", drv->bus->name, __func__, drv->name); pm_runtime_get_suppliers(dev); if (dev->parent) pm_runtime_get_sync(dev->parent); pm_runtime_barrier(dev); if (initcall_debug) ret = really_probe_debug(dev, drv); else ret = really_probe(dev, drv); pm_request_idle(dev); if (dev->parent) pm_runtime_put(dev->parent); pm_runtime_put_suppliers(dev); return ret; } /** * driver_probe_device - attempt to bind device & driver together * @drv: driver to bind a device to * @dev: device to try to bind to the driver * * This function returns -ENODEV if the device is not registered, -EBUSY if it * already has a driver, 0 if the device is bound successfully and a positive * (inverted) error code for failures from the ->probe method. * * This function must be called with @dev lock held. When called for a * USB interface, @dev->parent lock must be held as well. * * If the device has a parent, runtime-resume the parent before driver probing. */ static int driver_probe_device(const struct device_driver *drv, struct device *dev) { int trigger_count = atomic_read(&deferred_trigger_count); int ret; atomic_inc(&probe_count); ret = __driver_probe_device(drv, dev); if (ret == -EPROBE_DEFER || ret == EPROBE_DEFER) { driver_deferred_probe_add(dev); /* * Did a trigger occur while probing? Need to re-trigger if yes */ if (trigger_count != atomic_read(&deferred_trigger_count) && !defer_all_probes) driver_deferred_probe_trigger(); } atomic_dec(&probe_count); wake_up_all(&probe_waitqueue); return ret; } static inline bool cmdline_requested_async_probing(const char *drv_name) { bool async_drv; async_drv = parse_option_str(async_probe_drv_names, drv_name); return (async_probe_default != async_drv); } /* The option format is "driver_async_probe=drv_name1,drv_name2,..." */ static int __init save_async_options(char *buf) { if (strlen(buf) >= ASYNC_DRV_NAMES_MAX_LEN) pr_warn("Too long list of driver names for 'driver_async_probe'!\n"); strscpy(async_probe_drv_names, buf, ASYNC_DRV_NAMES_MAX_LEN); async_probe_default = parse_option_str(async_probe_drv_names, "*"); return 1; } __setup("driver_async_probe=", save_async_options); static bool driver_allows_async_probing(const struct device_driver *drv) { switch (drv->probe_type) { case PROBE_PREFER_ASYNCHRONOUS: return true; case PROBE_FORCE_SYNCHRONOUS: return false; default: if (cmdline_requested_async_probing(drv->name)) return true; if (module_requested_async_probing(drv->owner)) return true; return false; } } struct device_attach_data { struct device *dev; /* * Indicates whether we are considering asynchronous probing or * not. Only initial binding after device or driver registration * (including deferral processing) may be done asynchronously, the * rest is always synchronous, as we expect it is being done by * request from userspace. */ bool check_async; /* * Indicates if we are binding synchronous or asynchronous drivers. * When asynchronous probing is enabled we'll execute 2 passes * over drivers: first pass doing synchronous probing and second * doing asynchronous probing (if synchronous did not succeed - * most likely because there was no driver requiring synchronous * probing - and we found asynchronous driver during first pass). * The 2 passes are done because we can't shoot asynchronous * probe for given device and driver from bus_for_each_drv() since * driver pointer is not guaranteed to stay valid once * bus_for_each_drv() iterates to the next driver on the bus. 
*/ bool want_async; /* * We'll set have_async to 'true' if, while scanning for matching * driver, we'll encounter one that requests asynchronous probing. */ bool have_async; }; static int __device_attach_driver(struct device_driver *drv, void *_data) { struct device_attach_data *data = _data; struct device *dev = data->dev; bool async_allowed; int ret; ret = driver_match_device(drv, dev); if (ret == 0) { /* no match */ return 0; } else if (ret == -EPROBE_DEFER) { dev_dbg(dev, "Device match requests probe deferral\n"); dev->can_match = true; driver_deferred_probe_add(dev); /* * Device can't match with a driver right now, so don't attempt * to match or bind with other drivers on the bus. */ return ret; } else if (ret < 0) { dev_dbg(dev, "Bus failed to match device: %d\n", ret); return ret; } /* ret > 0 means positive match */ async_allowed = driver_allows_async_probing(drv); if (async_allowed) data->have_async = true; if (data->check_async && async_allowed != data->want_async) return 0; /* * Ignore errors returned by ->probe so that the next driver can try * its luck. */ ret = driver_probe_device(drv, dev); if (ret < 0) return ret; return ret == 0; } static void __device_attach_async_helper(void *_dev, async_cookie_t cookie) { struct device *dev = _dev; struct device_attach_data data = { .dev = dev, .check_async = true, .want_async = true, }; device_lock(dev); /* * Check if device has already been removed or claimed. This may * happen with driver loading, device discovery/registration, * and deferred probe processing happens all at once with * multiple threads. */ if (dev->p->dead || dev->driver) goto out_unlock; if (dev->parent) pm_runtime_get_sync(dev->parent); bus_for_each_drv(dev->bus, NULL, &data, __device_attach_driver); dev_dbg(dev, "async probe completed\n"); pm_request_idle(dev); if (dev->parent) pm_runtime_put(dev->parent); out_unlock: device_unlock(dev); put_device(dev); } static int __device_attach(struct device *dev, bool allow_async) { int ret = 0; bool async = false; device_lock(dev); if (dev->p->dead) { goto out_unlock; } else if (dev->driver) { if (device_is_bound(dev)) { ret = 1; goto out_unlock; } ret = device_bind_driver(dev); if (ret == 0) ret = 1; else { device_set_driver(dev, NULL); ret = 0; } } else { struct device_attach_data data = { .dev = dev, .check_async = allow_async, .want_async = false, }; if (dev->parent) pm_runtime_get_sync(dev->parent); ret = bus_for_each_drv(dev->bus, NULL, &data, __device_attach_driver); if (!ret && allow_async && data.have_async) { /* * If we could not find appropriate driver * synchronously and we are allowed to do * async probes and there are drivers that * want to probe asynchronously, we'll * try them. */ dev_dbg(dev, "scheduling asynchronous probe\n"); get_device(dev); async = true; } else { pm_request_idle(dev); } if (dev->parent) pm_runtime_put(dev->parent); } out_unlock: device_unlock(dev); if (async) async_schedule_dev(__device_attach_async_helper, dev); return ret; } /** * device_attach - try to attach device to a driver. * @dev: device. * * Walk the list of drivers that the bus has and call * driver_probe_device() for each pair. If a compatible * pair is found, break out and return. * * Returns 1 if the device was bound to a driver; * 0 if no matching driver was found; * -ENODEV if the device is not registered. * * When called for a USB interface, @dev->parent lock must be held. 
*/ int device_attach(struct device *dev) { return __device_attach(dev, false); } EXPORT_SYMBOL_GPL(device_attach); void device_initial_probe(struct device *dev) { struct subsys_private *sp = bus_to_subsys(dev->bus); if (!sp) return; if (sp->drivers_autoprobe) __device_attach(dev, true); subsys_put(sp); } /* * __device_driver_lock - acquire locks needed to manipulate dev->drv * @dev: Device we will update driver info for * @parent: Parent device. Needed if the bus requires parent lock * * This function will take the required locks for manipulating dev->drv. * Normally this will just be the @dev lock, but when called for a USB * interface, @parent lock will be held as well. */ static void __device_driver_lock(struct device *dev, struct device *parent) { if (parent && dev->bus->need_parent_lock) device_lock(parent); device_lock(dev); } /* * __device_driver_unlock - release locks needed to manipulate dev->drv * @dev: Device we will update driver info for * @parent: Parent device. Needed if the bus requires parent lock * * This function will release the required locks for manipulating dev->drv. * Normally this will just be the @dev lock, but when called for a * USB interface, @parent lock will be released as well. */ static void __device_driver_unlock(struct device *dev, struct device *parent) { device_unlock(dev); if (parent && dev->bus->need_parent_lock) device_unlock(parent); } /** * device_driver_attach - attach a specific driver to a specific device * @drv: Driver to attach * @dev: Device to attach it to * * Manually attach driver to a device. Will acquire both @dev lock and * @dev->parent lock if needed. Returns 0 on success, -ERR on failure. */ int device_driver_attach(const struct device_driver *drv, struct device *dev) { int ret; __device_driver_lock(dev, dev->parent); ret = __driver_probe_device(drv, dev); __device_driver_unlock(dev, dev->parent); /* also return probe errors as normal negative errnos */ if (ret > 0) ret = -ret; if (ret == -EPROBE_DEFER) return -EAGAIN; return ret; } EXPORT_SYMBOL_GPL(device_driver_attach); static void __driver_attach_async_helper(void *_dev, async_cookie_t cookie) { struct device *dev = _dev; const struct device_driver *drv; int ret; __device_driver_lock(dev, dev->parent); drv = dev->p->async_driver; dev->p->async_driver = NULL; ret = driver_probe_device(drv, dev); __device_driver_unlock(dev, dev->parent); dev_dbg(dev, "driver %s async attach completed: %d\n", drv->name, ret); put_device(dev); } static int __driver_attach(struct device *dev, void *data) { const struct device_driver *drv = data; bool async = false; int ret; /* * Lock device and try to bind to it. We drop the error * here and always return 0, because we need to keep trying * to bind to devices and some drivers will return an error * simply if it didn't support the device. * * driver_probe_device() will spit a warning if there * is an error. */ ret = driver_match_device(drv, dev); if (ret == 0) { /* no match */ return 0; } else if (ret == -EPROBE_DEFER) { dev_dbg(dev, "Device match requests probe deferral\n"); dev->can_match = true; driver_deferred_probe_add(dev); /* * Driver could not match with device, but may match with * another device on the bus. */ return 0; } else if (ret < 0) { dev_dbg(dev, "Bus failed to match device: %d\n", ret); /* * Driver could not match with device, but may match with * another device on the bus. 
*/ return 0; } /* ret > 0 means positive match */ if (driver_allows_async_probing(drv)) { /* * Instead of probing the device synchronously we will * probe it asynchronously to allow for more parallelism. * * We only take the device lock here in order to guarantee * that the dev->driver and async_driver fields are protected */ dev_dbg(dev, "probing driver %s asynchronously\n", drv->name); device_lock(dev); if (!dev->driver && !dev->p->async_driver) { get_device(dev); dev->p->async_driver = drv; async = true; } device_unlock(dev); if (async) async_schedule_dev(__driver_attach_async_helper, dev); return 0; } __device_driver_lock(dev, dev->parent); driver_probe_device(drv, dev); __device_driver_unlock(dev, dev->parent); return 0; } /** * driver_attach - try to bind driver to devices. * @drv: driver. * * Walk the list of devices that the bus has on it and try to * match the driver with each one. If driver_probe_device() * returns 0 and the @dev->driver is set, we've found a * compatible pair. */ int driver_attach(const struct device_driver *drv) { /* The (void *) will be put back to const * in __driver_attach() */ return bus_for_each_dev(drv->bus, NULL, (void *)drv, __driver_attach); } EXPORT_SYMBOL_GPL(driver_attach); /* * __device_release_driver() must be called with @dev lock held. * When called for a USB interface, @dev->parent lock must be held as well. */ static void __device_release_driver(struct device *dev, struct device *parent) { struct device_driver *drv; drv = dev->driver; if (drv) { pm_runtime_get_sync(dev); while (device_links_busy(dev)) { __device_driver_unlock(dev, parent); device_links_unbind_consumers(dev); __device_driver_lock(dev, parent); /* * A concurrent invocation of the same function might * have released the driver successfully while this one * was waiting, so check for that. */ if (dev->driver != drv) { pm_runtime_put(dev); return; } } driver_sysfs_remove(dev); bus_notify(dev, BUS_NOTIFY_UNBIND_DRIVER); pm_runtime_put_sync(dev); device_remove(dev); if (dev->bus && dev->bus->dma_cleanup) dev->bus->dma_cleanup(dev); device_unbind_cleanup(dev); device_links_driver_cleanup(dev); klist_remove(&dev->p->knode_driver); device_pm_check_callbacks(dev); bus_notify(dev, BUS_NOTIFY_UNBOUND_DRIVER); kobject_uevent(&dev->kobj, KOBJ_UNBIND); } } void device_release_driver_internal(struct device *dev, const struct device_driver *drv, struct device *parent) { __device_driver_lock(dev, parent); if (!drv || drv == dev->driver) __device_release_driver(dev, parent); __device_driver_unlock(dev, parent); } /** * device_release_driver - manually detach device from driver. * @dev: device. * * Manually detach device from driver. * When called for a USB interface, @dev->parent lock must be held. * * If this function is to be called with @dev->parent lock held, ensure that * the device's consumers are unbound in advance or that their locks can be * acquired under the @dev->parent lock. */ void device_release_driver(struct device *dev) { /* * If anyone calls device_release_driver() recursively from * within their ->remove callback for the same device, they * will deadlock right here. */ device_release_driver_internal(dev, NULL, NULL); } EXPORT_SYMBOL_GPL(device_release_driver); /** * device_driver_detach - detach driver from a specific device * @dev: device to detach driver from * * Detach driver from device. Will acquire both @dev lock and @dev->parent * lock if needed. 
 */
void device_driver_detach(struct device *dev)
{
	device_release_driver_internal(dev, NULL, dev->parent);
}

/**
 * driver_detach - detach driver from all devices it controls.
 * @drv: driver.
 */
void driver_detach(const struct device_driver *drv)
{
	struct device_private *dev_prv;
	struct device *dev;

	if (driver_allows_async_probing(drv))
		async_synchronize_full();

	for (;;) {
		spin_lock(&drv->p->klist_devices.k_lock);
		if (list_empty(&drv->p->klist_devices.k_list)) {
			spin_unlock(&drv->p->klist_devices.k_lock);
			break;
		}
		dev_prv = list_last_entry(&drv->p->klist_devices.k_list,
					  struct device_private,
					  knode_driver.n_node);
		dev = dev_prv->device;
		get_device(dev);
		spin_unlock(&drv->p->klist_devices.k_lock);
		device_release_driver_internal(dev, drv, dev->parent);
		put_device(dev);
	}
}
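
/*
 * Illustrative sketch, not taken from the file above: a minimal consumer
 * driver showing how the deferred-probe machinery in dd.c is typically used
 * from the driver side.  The driver name ("acme-foo"), the reset GPIO and the
 * platform bus are assumptions made for the example only.  The pattern is to
 * return -EPROBE_DEFER (here via dev_err_probe(), which also records the
 * defer reason shown in /sys/kernel/debug/devices_deferred) whenever a
 * required resource is not available yet, and optionally to opt in to
 * asynchronous probing with PROBE_PREFER_ASYNCHRONOUS.
 */
#include <linux/err.h>
#include <linux/gpio/consumer.h>
#include <linux/module.h>
#include <linux/platform_device.h>

static int foo_probe(struct platform_device *pdev)
{
	struct gpio_desc *reset;

	/*
	 * If the GPIO provider has not probed yet, devm_gpiod_get() returns
	 * ERR_PTR(-EPROBE_DEFER); dev_err_probe() passes that back and the
	 * device lands on the deferred list handled above.
	 */
	reset = devm_gpiod_get(&pdev->dev, "reset", GPIOD_OUT_LOW);
	if (IS_ERR(reset))
		return dev_err_probe(&pdev->dev, PTR_ERR(reset),
				     "reset GPIO not available\n");

	return 0;
}

static struct platform_driver foo_driver = {
	.probe = foo_probe,
	.driver = {
		.name = "acme-foo",
		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
	},
};
module_platform_driver(foo_driver);
MODULE_LICENSE("GPL");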
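
/*
 * Illustrative sketch, placed ahead of the flow dissector code that follows:
 * how a caller would describe its target structure to
 * skb_flow_dissector_init() and then dissect a packet with the generic
 * skb_flow_dissect() helper.  "struct my_flow", "my_keys" and "my_dissector"
 * are hypothetical names for the example; the CONTROL and BASIC keys are
 * included because skb_flow_dissector_init() requires them (see the BUG_ON()s
 * in that function below).
 */
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <net/flow_dissector.h>

struct my_flow {
	struct flow_dissector_key_control control;
	struct flow_dissector_key_basic basic;
	struct flow_dissector_key_ports ports;
};

static const struct flow_dissector_key my_keys[] = {
	{
		.key_id = FLOW_DISSECTOR_KEY_CONTROL,
		.offset = offsetof(struct my_flow, control),
	},
	{
		.key_id = FLOW_DISSECTOR_KEY_BASIC,
		.offset = offsetof(struct my_flow, basic),
	},
	{
		.key_id = FLOW_DISSECTOR_KEY_PORTS,
		.offset = offsetof(struct my_flow, ports),
	},
};

static struct flow_dissector my_dissector;

/* Would normally run once, e.g. from the caller's init path. */
static void my_dissector_setup(void)
{
	skb_flow_dissector_init(&my_dissector, my_keys, ARRAY_SIZE(my_keys));
}

/* Fill *flow with the control/basic/ports keys of one skb. */
static void my_dissect_one(const struct sk_buff *skb, struct my_flow *flow)
{
	memset(flow, 0, sizeof(*flow));
	skb_flow_dissect(skb, &my_dissector, flow, 0);
}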
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/export.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/if_vlan.h>
#include <linux/filter.h>
#include <net/dsa.h>
#include <net/dst_metadata.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/gre.h>
#include <net/pptp.h>
#include <net/tipc.h>
#include <linux/igmp.h>
#include <linux/icmp.h>
#include <linux/sctp.h>
#include <linux/dccp.h>
#include <linux/if_tunnel.h>
#include <linux/if_pppox.h>
#include <linux/ppp_defs.h>
#include <linux/stddef.h>
#include <linux/if_ether.h>
#include <linux/if_hsr.h>
#include <linux/mpls.h>
#include <linux/tcp.h>
#include <linux/ptp_classify.h>
#include <net/flow_dissector.h>
#include <net/pkt_cls.h>
#include <scsi/fc/fc_fcoe.h>
#include <uapi/linux/batadv_packet.h>
#include <linux/bpf.h>
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_labels.h>
#endif
#include <linux/bpf-netns.h>

static void dissector_set_key(struct flow_dissector
*flow_dissector, enum flow_dissector_key_id key_id) { flow_dissector->used_keys |= (1ULL << key_id); } void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count) { unsigned int i; memset(flow_dissector, 0, sizeof(*flow_dissector)); for (i = 0; i < key_count; i++, key++) { /* User should make sure that every key target offset is within * boundaries of unsigned short. */ BUG_ON(key->offset > USHRT_MAX); BUG_ON(dissector_uses_key(flow_dissector, key->key_id)); dissector_set_key(flow_dissector, key->key_id); flow_dissector->offset[key->key_id] = key->offset; } /* Ensure that the dissector always includes control and basic key. * That way we are able to avoid handling lack of these in fast path. */ BUG_ON(!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CONTROL)); BUG_ON(!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_BASIC)); } EXPORT_SYMBOL(skb_flow_dissector_init); #ifdef CONFIG_BPF_SYSCALL int flow_dissector_bpf_prog_attach_check(struct net *net, struct bpf_prog *prog) { enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; if (net == &init_net) { /* BPF flow dissector in the root namespace overrides * any per-net-namespace one. When attaching to root, * make sure we don't have any BPF program attached * to the non-root namespaces. */ struct net *ns; for_each_net(ns) { if (ns == &init_net) continue; if (rcu_access_pointer(ns->bpf.run_array[type])) return -EEXIST; } } else { /* Make sure root flow dissector is not attached * when attaching to the non-root namespace. */ if (rcu_access_pointer(init_net.bpf.run_array[type])) return -EEXIST; } return 0; } #endif /* CONFIG_BPF_SYSCALL */ /** * skb_flow_get_ports - extract the upper layer ports and return them * @skb: sk_buff to extract the ports from * @thoff: transport header offset * @ip_proto: protocol for which to get port offset * @data: raw buffer pointer to the packet, if NULL use skb->data * @hlen: packet header length, if @data is NULL use skb_headlen(skb) * * The function will try to retrieve the ports at offset thoff + poff where poff * is the protocol port offset returned from proto_ports_offset */ __be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, const void *data, int hlen) { int poff = proto_ports_offset(ip_proto); if (!data) { data = skb->data; hlen = skb_headlen(skb); } if (poff >= 0) { __be32 *ports, _ports; ports = __skb_header_pointer(skb, thoff + poff, sizeof(_ports), data, hlen, &_ports); if (ports) return *ports; } return 0; } EXPORT_SYMBOL(skb_flow_get_ports); static bool icmp_has_id(u8 type) { switch (type) { case ICMP_ECHO: case ICMP_ECHOREPLY: case ICMP_TIMESTAMP: case ICMP_TIMESTAMPREPLY: case ICMPV6_ECHO_REQUEST: case ICMPV6_ECHO_REPLY: return true; } return false; } /** * skb_flow_get_icmp_tci - extract ICMP(6) Type, Code and Identifier fields * @skb: sk_buff to extract from * @key_icmp: struct flow_dissector_key_icmp to fill * @data: raw buffer pointer to the packet * @thoff: offset to extract at * @hlen: packet header length */ void skb_flow_get_icmp_tci(const struct sk_buff *skb, struct flow_dissector_key_icmp *key_icmp, const void *data, int thoff, int hlen) { struct icmphdr *ih, _ih; ih = __skb_header_pointer(skb, thoff, sizeof(_ih), data, hlen, &_ih); if (!ih) return; key_icmp->type = ih->type; key_icmp->code = ih->code; /* As we use 0 to signal that the Id field is not present, * avoid confusion with packets without such field */ if (icmp_has_id(ih->type)) key_icmp->id = ih->un.echo.id ? 
ntohs(ih->un.echo.id) : 1; else key_icmp->id = 0; } EXPORT_SYMBOL(skb_flow_get_icmp_tci); /* If FLOW_DISSECTOR_KEY_ICMP is set, dissect an ICMP packet * using skb_flow_get_icmp_tci(). */ static void __skb_flow_dissect_icmp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int thoff, int hlen) { struct flow_dissector_key_icmp *key_icmp; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ICMP)) return; key_icmp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ICMP, target_container); skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen); } static void __skb_flow_dissect_ah(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen) { struct flow_dissector_key_ipsec *key_ah; struct ip_auth_hdr _hdr, *hdr; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC)) return; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return; key_ah = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC, target_container); key_ah->spi = hdr->spi; } static void __skb_flow_dissect_esp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen) { struct flow_dissector_key_ipsec *key_esp; struct ip_esp_hdr _hdr, *hdr; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC)) return; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return; key_esp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC, target_container); key_esp->spi = hdr->spi; } static void __skb_flow_dissect_l2tpv3(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen) { struct flow_dissector_key_l2tpv3 *key_l2tpv3; struct { __be32 session_id; } *hdr, _hdr; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_L2TPV3)) return; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return; key_l2tpv3 = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_L2TPV3, target_container); key_l2tpv3->session_id = hdr->session_id; } void skb_flow_dissect_meta(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) { struct flow_dissector_key_meta *meta; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_META)) return; meta = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_META, target_container); meta->ingress_ifindex = skb->skb_iif; #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) if (tc_skb_ext_tc_enabled()) { struct tc_skb_ext *ext; ext = skb_ext_find(skb, TC_SKB_EXT); if (ext) meta->l2_miss = ext->l2_miss; } #endif } EXPORT_SYMBOL(skb_flow_dissect_meta); static void skb_flow_dissect_set_enc_control(enum flow_dissector_key_id type, u32 ctrl_flags, struct flow_dissector *flow_dissector, void *target_container) { struct flow_dissector_key_control *ctrl; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL)) return; ctrl = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL, target_container); ctrl->addr_type = type; ctrl->flags = ctrl_flags; } void skb_flow_dissect_ct(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, u16 *ctinfo_map, size_t mapsize, bool post_ct, u16 zone) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) struct flow_dissector_key_ct *key; enum ip_conntrack_info ctinfo; struct 
nf_conn_labels *cl; struct nf_conn *ct; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CT)) return; ct = nf_ct_get(skb, &ctinfo); if (!ct && !post_ct) return; key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CT, target_container); if (!ct) { key->ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | TCA_FLOWER_KEY_CT_FLAGS_INVALID; key->ct_zone = zone; return; } if (ctinfo < mapsize) key->ct_state = ctinfo_map[ctinfo]; #if IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) key->ct_zone = ct->zone.id; #endif #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) key->ct_mark = READ_ONCE(ct->mark); #endif cl = nf_ct_labels_find(ct); if (cl) memcpy(key->ct_labels, cl->bits, sizeof(key->ct_labels)); #endif /* CONFIG_NF_CONNTRACK */ } EXPORT_SYMBOL(skb_flow_dissect_ct); void skb_flow_dissect_tunnel_info(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) { struct ip_tunnel_info *info; struct ip_tunnel_key *key; u32 ctrl_flags = 0; /* A quick check to see if there might be something to do. */ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) return; info = skb_tunnel_info(skb); if (!info) return; key = &info->key; if (test_bit(IP_TUNNEL_CSUM_BIT, key->tun_flags)) ctrl_flags |= FLOW_DIS_F_TUNNEL_CSUM; if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags)) ctrl_flags |= FLOW_DIS_F_TUNNEL_DONT_FRAGMENT; if (test_bit(IP_TUNNEL_OAM_BIT, key->tun_flags)) ctrl_flags |= FLOW_DIS_F_TUNNEL_OAM; if (test_bit(IP_TUNNEL_CRIT_OPT_BIT, key->tun_flags)) ctrl_flags |= FLOW_DIS_F_TUNNEL_CRIT_OPT; switch (ip_tunnel_info_af(info)) { case AF_INET: skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ctrl_flags, flow_dissector, target_container); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) { struct flow_dissector_key_ipv4_addrs *ipv4; ipv4 = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, target_container); ipv4->src = key->u.ipv4.src; ipv4->dst = key->u.ipv4.dst; } break; case AF_INET6: skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ctrl_flags, flow_dissector, target_container); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) { struct flow_dissector_key_ipv6_addrs *ipv6; ipv6 = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, target_container); ipv6->src = key->u.ipv6.src; ipv6->dst = key->u.ipv6.dst; } break; default: skb_flow_dissect_set_enc_control(0, ctrl_flags, flow_dissector, target_container); break; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) { struct flow_dissector_key_keyid *keyid; keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID, target_container); keyid->keyid = tunnel_id_to_key32(key->tun_id); } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS)) { struct flow_dissector_key_ports *tp; tp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS, target_container); tp->src = key->tp_src; tp->dst = key->tp_dst; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP)) { struct 
flow_dissector_key_ip *ip; ip = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP, target_container); ip->tos = key->tos; ip->ttl = key->ttl; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) { struct flow_dissector_key_enc_opts *enc_opt; IP_TUNNEL_DECLARE_FLAGS(flags) = { }; u32 val; enc_opt = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS, target_container); if (!info->options_len) return; enc_opt->len = info->options_len; ip_tunnel_info_opts_get(enc_opt->data, info); ip_tunnel_set_options_present(flags); ip_tunnel_flags_and(flags, info->key.tun_flags, flags); val = find_next_bit(flags, __IP_TUNNEL_FLAG_NUM, IP_TUNNEL_GENEVE_OPT_BIT); enc_opt->dst_opt_type = val < __IP_TUNNEL_FLAG_NUM ? val : 0; } } EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); void skb_flow_dissect_hash(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) { struct flow_dissector_key_hash *key; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_HASH)) return; key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_HASH, target_container); key->hash = skb_get_hash_raw(skb); } EXPORT_SYMBOL(skb_flow_dissect_hash); static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen, int lse_index, bool *entropy_label) { struct mpls_label *hdr, _hdr; u32 entry, label, bos; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) return FLOW_DISSECT_RET_OUT_GOOD; if (lse_index >= FLOW_DIS_MPLS_MAX) return FLOW_DISSECT_RET_OUT_GOOD; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; entry = ntohl(hdr->entry); label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT; bos = (entry & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) { struct flow_dissector_key_mpls *key_mpls; struct flow_dissector_mpls_lse *lse; key_mpls = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_MPLS, target_container); lse = &key_mpls->ls[lse_index]; lse->mpls_ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; lse->mpls_bos = bos; lse->mpls_tc = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT; lse->mpls_label = label; dissector_set_mpls_lse(key_mpls, lse_index); } if (*entropy_label && dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { struct flow_dissector_key_keyid *key_keyid; key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY, target_container); key_keyid->keyid = cpu_to_be32(label); } *entropy_label = label == MPLS_LABEL_ENTROPY; return bos ? 
FLOW_DISSECT_RET_OUT_GOOD : FLOW_DISSECT_RET_PROTO_AGAIN; } static enum flow_dissect_ret __skb_flow_dissect_arp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen) { struct flow_dissector_key_arp *key_arp; struct { unsigned char ar_sha[ETH_ALEN]; unsigned char ar_sip[4]; unsigned char ar_tha[ETH_ALEN]; unsigned char ar_tip[4]; } *arp_eth, _arp_eth; const struct arphdr *arp; struct arphdr _arp; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ARP)) return FLOW_DISSECT_RET_OUT_GOOD; arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data, hlen, &_arp); if (!arp) return FLOW_DISSECT_RET_OUT_BAD; if (arp->ar_hrd != htons(ARPHRD_ETHER) || arp->ar_pro != htons(ETH_P_IP) || arp->ar_hln != ETH_ALEN || arp->ar_pln != 4 || (arp->ar_op != htons(ARPOP_REPLY) && arp->ar_op != htons(ARPOP_REQUEST))) return FLOW_DISSECT_RET_OUT_BAD; arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp), sizeof(_arp_eth), data, hlen, &_arp_eth); if (!arp_eth) return FLOW_DISSECT_RET_OUT_BAD; key_arp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ARP, target_container); memcpy(&key_arp->sip, arp_eth->ar_sip, sizeof(key_arp->sip)); memcpy(&key_arp->tip, arp_eth->ar_tip, sizeof(key_arp->tip)); /* Only store the lower byte of the opcode; * this covers ARPOP_REPLY and ARPOP_REQUEST. */ key_arp->op = ntohs(arp->ar_op) & 0xff; ether_addr_copy(key_arp->sha, arp_eth->ar_sha); ether_addr_copy(key_arp->tha, arp_eth->ar_tha); return FLOW_DISSECT_RET_OUT_GOOD; } static enum flow_dissect_ret __skb_flow_dissect_cfm(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen) { struct flow_dissector_key_cfm *key, *hdr, _hdr; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CFM)) return FLOW_DISSECT_RET_OUT_GOOD; hdr = __skb_header_pointer(skb, nhoff, sizeof(*key), data, hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CFM, target_container); key->mdl_ver = hdr->mdl_ver; key->opcode = hdr->opcode; return FLOW_DISSECT_RET_OUT_GOOD; } static enum flow_dissect_ret __skb_flow_dissect_gre(const struct sk_buff *skb, struct flow_dissector_key_control *key_control, struct flow_dissector *flow_dissector, void *target_container, const void *data, __be16 *p_proto, int *p_nhoff, int *p_hlen, unsigned int flags) { struct flow_dissector_key_keyid *key_keyid; struct gre_base_hdr *hdr, _hdr; int offset = 0; u16 gre_ver; hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr), data, *p_hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; /* Only look inside GRE without routing */ if (hdr->flags & GRE_ROUTING) return FLOW_DISSECT_RET_OUT_GOOD; /* Only look inside GRE for version 0 and 1 */ gre_ver = ntohs(hdr->flags & GRE_VERSION); if (gre_ver > 1) return FLOW_DISSECT_RET_OUT_GOOD; *p_proto = hdr->protocol; if (gre_ver) { /* Version1 must be PPTP, and check the flags */ if (!(*p_proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY))) return FLOW_DISSECT_RET_OUT_GOOD; } offset += sizeof(struct gre_base_hdr); if (hdr->flags & GRE_CSUM) offset += sizeof_field(struct gre_full_hdr, csum) + sizeof_field(struct gre_full_hdr, reserved1); if (hdr->flags & GRE_KEY) { const __be32 *keyid; __be32 _keyid; keyid = __skb_header_pointer(skb, *p_nhoff + offset, sizeof(_keyid), data, *p_hlen, &_keyid); if (!keyid) return FLOW_DISSECT_RET_OUT_BAD; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_GRE_KEYID)) { 
key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_GRE_KEYID, target_container); if (gre_ver == 0) key_keyid->keyid = *keyid; else key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK; } offset += sizeof_field(struct gre_full_hdr, key); } if (hdr->flags & GRE_SEQ) offset += sizeof_field(struct pptp_gre_header, seq); if (gre_ver == 0) { if (*p_proto == htons(ETH_P_TEB)) { const struct ethhdr *eth; struct ethhdr _eth; eth = __skb_header_pointer(skb, *p_nhoff + offset, sizeof(_eth), data, *p_hlen, &_eth); if (!eth) return FLOW_DISSECT_RET_OUT_BAD; *p_proto = eth->h_proto; offset += sizeof(*eth); /* Cap headers that we access via pointers at the * end of the Ethernet header as our maximum alignment * at that point is only 2 bytes. */ if (NET_IP_ALIGN) *p_hlen = *p_nhoff + offset; } } else { /* version 1, must be PPTP */ u8 _ppp_hdr[PPP_HDRLEN]; u8 *ppp_hdr; if (hdr->flags & GRE_ACK) offset += sizeof_field(struct pptp_gre_header, ack); ppp_hdr = __skb_header_pointer(skb, *p_nhoff + offset, sizeof(_ppp_hdr), data, *p_hlen, _ppp_hdr); if (!ppp_hdr) return FLOW_DISSECT_RET_OUT_BAD; switch (PPP_PROTOCOL(ppp_hdr)) { case PPP_IP: *p_proto = htons(ETH_P_IP); break; case PPP_IPV6: *p_proto = htons(ETH_P_IPV6); break; default: /* Could probably catch some more like MPLS */ break; } offset += PPP_HDRLEN; } *p_nhoff += offset; key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) return FLOW_DISSECT_RET_OUT_GOOD; return FLOW_DISSECT_RET_PROTO_AGAIN; } /** * __skb_flow_dissect_batadv() - dissect batman-adv header * @skb: sk_buff to with the batman-adv header * @key_control: flow dissectors control key * @data: raw buffer pointer to the packet, if NULL use skb->data * @p_proto: pointer used to update the protocol to process next * @p_nhoff: pointer used to update inner network header offset * @hlen: packet header length * @flags: any combination of FLOW_DISSECTOR_F_* * * ETH_P_BATMAN packets are tried to be dissected. Only * &struct batadv_unicast packets are actually processed because they contain an * inner ethernet header and are usually followed by actual network header. This * allows the flow dissector to continue processing the packet. 
* * Return: FLOW_DISSECT_RET_PROTO_AGAIN when &struct batadv_unicast was found, * FLOW_DISSECT_RET_OUT_GOOD when dissector should stop after encapsulation, * otherwise FLOW_DISSECT_RET_OUT_BAD */ static enum flow_dissect_ret __skb_flow_dissect_batadv(const struct sk_buff *skb, struct flow_dissector_key_control *key_control, const void *data, __be16 *p_proto, int *p_nhoff, int hlen, unsigned int flags) { struct { struct batadv_unicast_packet batadv_unicast; struct ethhdr eth; } *hdr, _hdr; hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; if (hdr->batadv_unicast.version != BATADV_COMPAT_VERSION) return FLOW_DISSECT_RET_OUT_BAD; if (hdr->batadv_unicast.packet_type != BATADV_UNICAST) return FLOW_DISSECT_RET_OUT_BAD; *p_proto = hdr->eth.h_proto; *p_nhoff += sizeof(*hdr); key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) return FLOW_DISSECT_RET_OUT_GOOD; return FLOW_DISSECT_RET_PROTO_AGAIN; } static void __skb_flow_dissect_tcp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int thoff, int hlen) { struct flow_dissector_key_tcp *key_tcp; struct tcphdr *th, _th; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_TCP)) return; th = __skb_header_pointer(skb, thoff, sizeof(_th), data, hlen, &_th); if (!th) return; if (unlikely(__tcp_hdrlen(th) < sizeof(_th))) return; key_tcp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_TCP, target_container); key_tcp->flags = (*(__be16 *) &tcp_flag_word(th) & htons(0x0FFF)); } static void __skb_flow_dissect_ports(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, u8 ip_proto, int hlen) { struct flow_dissector_key_ports_range *key_ports_range = NULL; struct flow_dissector_key_ports *key_ports = NULL; __be32 ports; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE)) key_ports_range = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE, target_container); if (!key_ports && !key_ports_range) return; ports = skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen); if (key_ports) key_ports->ports = ports; if (key_ports_range) key_ports_range->tp.ports = ports; } static void __skb_flow_dissect_ipv4(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, const struct iphdr *iph) { struct flow_dissector_key_ip *key_ip; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) return; key_ip = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IP, target_container); key_ip->tos = iph->tos; key_ip->ttl = iph->ttl; } static void __skb_flow_dissect_ipv6(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, const struct ipv6hdr *iph) { struct flow_dissector_key_ip *key_ip; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) return; key_ip = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IP, target_container); key_ip->tos = ipv6_get_dsfield(iph); key_ip->ttl = iph->hop_limit; } /* Maximum number of protocol headers that can be parsed in * __skb_flow_dissect */ #define MAX_FLOW_DISSECT_HDRS 15 static bool skb_flow_dissect_allowed(int *num_hdrs) { ++*num_hdrs; return (*num_hdrs <= 
MAX_FLOW_DISSECT_HDRS); } static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, struct flow_dissector *flow_dissector, void *target_container) { struct flow_dissector_key_ports_range *key_ports_range = NULL; struct flow_dissector_key_ports *key_ports = NULL; struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_tags *key_tags; key_control = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CONTROL, target_container); key_control->thoff = flow_keys->thoff; if (flow_keys->is_frag) key_control->flags |= FLOW_DIS_IS_FRAGMENT; if (flow_keys->is_first_frag) key_control->flags |= FLOW_DIS_FIRST_FRAG; if (flow_keys->is_encap) key_control->flags |= FLOW_DIS_ENCAPSULATION; key_basic = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_BASIC, target_container); key_basic->n_proto = flow_keys->n_proto; key_basic->ip_proto = flow_keys->ip_proto; if (flow_keys->addr_proto == ETH_P_IP && dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container); key_addrs->v4addrs.src = flow_keys->ipv4_src; key_addrs->v4addrs.dst = flow_keys->ipv4_dst; key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } else if (flow_keys->addr_proto == ETH_P_IPV6 && dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS, target_container); memcpy(&key_addrs->v6addrs.src, &flow_keys->ipv6_src, sizeof(key_addrs->v6addrs.src)); memcpy(&key_addrs->v6addrs.dst, &flow_keys->ipv6_dst, sizeof(key_addrs->v6addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) { key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); key_ports->src = flow_keys->sport; key_ports->dst = flow_keys->dport; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE)) { key_ports_range = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE, target_container); key_ports_range->tp.src = flow_keys->sport; key_ports_range->tp.dst = flow_keys->dport; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL)) { key_tags = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL, target_container); key_tags->flow_label = ntohl(flow_keys->flow_label); } } u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, __be16 proto, int nhoff, int hlen, unsigned int flags) { struct bpf_flow_keys *flow_keys = ctx->flow_keys; u32 result; /* Pass parameters to the BPF program */ memset(flow_keys, 0, sizeof(*flow_keys)); flow_keys->n_proto = proto; flow_keys->nhoff = nhoff; flow_keys->thoff = flow_keys->nhoff; BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG != (int)FLOW_DISSECTOR_F_PARSE_1ST_FRAG); BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL != (int)FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP != (int)FLOW_DISSECTOR_F_STOP_AT_ENCAP); flow_keys->flags = flags; result = bpf_prog_run_pin_on_cpu(prog, ctx); flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, nhoff, hlen); flow_keys->thoff = clamp_t(u16, flow_keys->thoff, flow_keys->nhoff, hlen); return result; } static bool is_pppoe_ses_hdr_valid(const struct pppoe_hdr *hdr) { return hdr->ver == 1 && hdr->type == 1 
&& hdr->code == 0; } /** * __skb_flow_dissect - extract the flow_keys struct and return it * @net: associated network namespace, derived from @skb if NULL * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified * @flow_dissector: list of keys to dissect * @target_container: target structure to put dissected values into * @data: raw buffer pointer to the packet, if NULL use skb->data * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb) * @hlen: packet header length, if @data is NULL use skb_headlen(skb) * @flags: flags that control the dissection process, e.g. * FLOW_DISSECTOR_F_STOP_AT_ENCAP. * * The function will try to retrieve individual keys into target specified * by flow_dissector from either the skbuff or a raw buffer specified by the * rest parameters. * * Caller must take care of zeroing target container memory. */ bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, __be16 proto, int nhoff, int hlen, unsigned int flags) { struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; enum flow_dissect_ret fdret; enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; bool mpls_el = false; int mpls_lse = 0; int num_hdrs = 0; u8 ip_proto = 0; bool ret; if (!data) { data = skb->data; proto = skb_vlan_tag_present(skb) ? skb->vlan_proto : skb->protocol; nhoff = skb_network_offset(skb); hlen = skb_headlen(skb); #if IS_ENABLED(CONFIG_NET_DSA) if (unlikely(skb->dev && netdev_uses_dsa(skb->dev) && proto == htons(ETH_P_XDSA))) { struct metadata_dst *md_dst = skb_metadata_dst(skb); const struct dsa_device_ops *ops; int offset = 0; ops = skb->dev->dsa_ptr->tag_ops; /* Only DSA header taggers break flow dissection */ if (ops->needed_headroom && (!md_dst || md_dst->type != METADATA_HW_PORT_MUX)) { if (ops->flow_dissect) ops->flow_dissect(skb, &proto, &offset); else dsa_tag_generic_flow_dissect(skb, &proto, &offset); hlen -= offset; nhoff += offset; } } #endif } /* It is ensured by skb_flow_dissector_init() that control key will * be always present. */ key_control = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CONTROL, target_container); /* It is ensured by skb_flow_dissector_init() that basic key will * be always present. 
*/ key_basic = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_BASIC, target_container); rcu_read_lock(); if (skb) { if (!net) { if (skb->dev) net = dev_net_rcu(skb->dev); else if (skb->sk) net = sock_net(skb->sk); } } DEBUG_NET_WARN_ON_ONCE(!net); if (net) { enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; struct bpf_prog_array *run_array; run_array = rcu_dereference(init_net.bpf.run_array[type]); if (!run_array) run_array = rcu_dereference(net->bpf.run_array[type]); if (run_array) { struct bpf_flow_keys flow_keys; struct bpf_flow_dissector ctx = { .flow_keys = &flow_keys, .data = data, .data_end = data + hlen, }; __be16 n_proto = proto; struct bpf_prog *prog; u32 result; if (skb) { ctx.skb = skb; /* we can't use 'proto' in the skb case * because it might be set to skb->vlan_proto * which has been pulled from the data */ n_proto = skb->protocol; } prog = READ_ONCE(run_array->items[0].prog); result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, hlen, flags); if (result != BPF_FLOW_DISSECTOR_CONTINUE) { __skb_flow_bpf_to_target(&flow_keys, flow_dissector, target_container); rcu_read_unlock(); return result == BPF_OK; } } } rcu_read_unlock(); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); struct flow_dissector_key_eth_addrs *key_eth_addrs; key_eth_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS, target_container); memcpy(key_eth_addrs, eth, sizeof(*key_eth_addrs)); } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS)) { struct flow_dissector_key_num_of_vlans *key_num_of_vlans; key_num_of_vlans = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS, target_container); key_num_of_vlans->num_of_vlans = 0; } proto_again: fdret = FLOW_DISSECT_RET_CONTINUE; switch (proto) { case htons(ETH_P_IP): { const struct iphdr *iph; struct iphdr _iph; iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph || iph->ihl < 5) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } nhoff += iph->ihl * 4; ip_proto = iph->protocol; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container); memcpy(&key_addrs->v4addrs.src, &iph->saddr, sizeof(key_addrs->v4addrs.src)); memcpy(&key_addrs->v4addrs.dst, &iph->daddr, sizeof(key_addrs->v4addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } __skb_flow_dissect_ipv4(skb, flow_dissector, target_container, data, iph); if (ip_is_fragment(iph)) { key_control->flags |= FLOW_DIS_IS_FRAGMENT; if (iph->frag_off & htons(IP_OFFSET)) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } else { key_control->flags |= FLOW_DIS_FIRST_FRAG; if (!(flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } } } break; } case htons(ETH_P_IPV6): { const struct ipv6hdr *iph; struct ipv6hdr _iph; iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } ip_proto = iph->nexthdr; nhoff += sizeof(struct ipv6hdr); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS, target_container); memcpy(&key_addrs->v6addrs.src, &iph->saddr, sizeof(key_addrs->v6addrs.src)); memcpy(&key_addrs->v6addrs.dst, &iph->daddr, sizeof(key_addrs->v6addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } if 
((dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL) || (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)) && ip6_flowlabel(iph)) { __be32 flow_label = ip6_flowlabel(iph); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL)) { key_tags = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL, target_container); key_tags->flow_label = ntohl(flow_label); } if (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } } __skb_flow_dissect_ipv6(skb, flow_dissector, target_container, data, iph); break; } case htons(ETH_P_8021AD): case htons(ETH_P_8021Q): { const struct vlan_hdr *vlan = NULL; struct vlan_hdr _vlan; __be16 saved_vlan_tpid = proto; if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX && skb && skb_vlan_tag_present(skb)) { proto = skb->protocol; } else { vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan); if (!vlan) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } proto = vlan->h_vlan_encapsulated_proto; nhoff += sizeof(*vlan); } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS) && !(key_control->flags & FLOW_DIS_ENCAPSULATION)) { struct flow_dissector_key_num_of_vlans *key_nvs; key_nvs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS, target_container); key_nvs->num_of_vlans++; } if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX) { dissector_vlan = FLOW_DISSECTOR_KEY_VLAN; } else if (dissector_vlan == FLOW_DISSECTOR_KEY_VLAN) { dissector_vlan = FLOW_DISSECTOR_KEY_CVLAN; } else { fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; } if (dissector_uses_key(flow_dissector, dissector_vlan)) { key_vlan = skb_flow_dissector_target(flow_dissector, dissector_vlan, target_container); if (!vlan) { key_vlan->vlan_id = skb_vlan_tag_get_id(skb); key_vlan->vlan_priority = skb_vlan_tag_get_prio(skb); } else { key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) & VLAN_VID_MASK; key_vlan->vlan_priority = (ntohs(vlan->h_vlan_TCI) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } key_vlan->vlan_tpid = saved_vlan_tpid; key_vlan->vlan_eth_type = proto; } fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; } case htons(ETH_P_PPP_SES): { struct { struct pppoe_hdr hdr; __be16 proto; } *hdr, _hdr; u16 ppp_proto; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } if (!is_pppoe_ses_hdr_valid(&hdr->hdr)) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } /* least significant bit of the most significant octet * indicates if protocol field was compressed */ ppp_proto = ntohs(hdr->proto); if (ppp_proto & 0x0100) { ppp_proto = ppp_proto >> 8; nhoff += PPPOE_SES_HLEN - 1; } else { nhoff += PPPOE_SES_HLEN; } if (ppp_proto == PPP_IP) { proto = htons(ETH_P_IP); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; } else if (ppp_proto == PPP_IPV6) { proto = htons(ETH_P_IPV6); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; } else if (ppp_proto == PPP_MPLS_UC) { proto = htons(ETH_P_MPLS_UC); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; } else if (ppp_proto == PPP_MPLS_MC) { proto = htons(ETH_P_MPLS_MC); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; } else if (ppp_proto_is_valid(ppp_proto)) { fdret = FLOW_DISSECT_RET_OUT_GOOD; } else { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PPPOE)) { struct flow_dissector_key_pppoe *key_pppoe; key_pppoe = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PPPOE, target_container); key_pppoe->session_id = hdr->hdr.sid; key_pppoe->ppp_proto = htons(ppp_proto); key_pppoe->type = 
htons(ETH_P_PPP_SES); } break; } case htons(ETH_P_TIPC): { struct tipc_basic_hdr *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_TIPC)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_TIPC, target_container); key_addrs->tipckey.key = tipc_hdr_rps_key(hdr); key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC; } fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } case htons(ETH_P_MPLS_UC): case htons(ETH_P_MPLS_MC): fdret = __skb_flow_dissect_mpls(skb, flow_dissector, target_container, data, nhoff, hlen, mpls_lse, &mpls_el); nhoff += sizeof(struct mpls_label); mpls_lse++; break; case htons(ETH_P_FCOE): if ((hlen - nhoff) < FCOE_HEADER_LEN) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } nhoff += FCOE_HEADER_LEN; fdret = FLOW_DISSECT_RET_OUT_GOOD; break; case htons(ETH_P_ARP): case htons(ETH_P_RARP): fdret = __skb_flow_dissect_arp(skb, flow_dissector, target_container, data, nhoff, hlen); break; case htons(ETH_P_BATMAN): fdret = __skb_flow_dissect_batadv(skb, key_control, data, &proto, &nhoff, hlen, flags); break; case htons(ETH_P_1588): { struct ptp_header *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } nhoff += sizeof(struct ptp_header); fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } case htons(ETH_P_PRP): case htons(ETH_P_HSR): { struct hsr_tag *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } proto = hdr->encap_proto; nhoff += HSR_HLEN; fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; } case htons(ETH_P_CFM): fdret = __skb_flow_dissect_cfm(skb, flow_dissector, target_container, data, nhoff, hlen); break; default: fdret = FLOW_DISSECT_RET_OUT_BAD; break; } /* Process result of proto processing */ switch (fdret) { case FLOW_DISSECT_RET_OUT_GOOD: goto out_good; case FLOW_DISSECT_RET_PROTO_AGAIN: if (skb_flow_dissect_allowed(&num_hdrs)) goto proto_again; goto out_good; case FLOW_DISSECT_RET_CONTINUE: case FLOW_DISSECT_RET_IPPROTO_AGAIN: break; case FLOW_DISSECT_RET_OUT_BAD: default: goto out_bad; } ip_proto_again: fdret = FLOW_DISSECT_RET_CONTINUE; switch (ip_proto) { case IPPROTO_GRE: if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } fdret = __skb_flow_dissect_gre(skb, key_control, flow_dissector, target_container, data, &proto, &nhoff, &hlen, flags); break; case NEXTHDR_HOP: case NEXTHDR_ROUTING: case NEXTHDR_DEST: { u8 _opthdr[2], *opthdr; if (proto != htons(ETH_P_IPV6)) break; opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr), data, hlen, &_opthdr); if (!opthdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } ip_proto = opthdr[0]; nhoff += (opthdr[1] + 1) << 3; fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN; break; } case NEXTHDR_FRAGMENT: { struct frag_hdr _fh, *fh; if (proto != htons(ETH_P_IPV6)) break; fh = __skb_header_pointer(skb, nhoff, sizeof(_fh), data, hlen, &_fh); if (!fh) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } key_control->flags |= FLOW_DIS_IS_FRAGMENT; nhoff += sizeof(_fh); ip_proto = fh->nexthdr; if (!(fh->frag_off & htons(IP6_OFFSET))) { key_control->flags |= FLOW_DIS_FIRST_FRAG; if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) { fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN; break; } } fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } case IPPROTO_IPIP: if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { fdret = 
FLOW_DISSECT_RET_OUT_GOOD; break; } proto = htons(ETH_P_IP); key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; case IPPROTO_IPV6: if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } proto = htons(ETH_P_IPV6); key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; case IPPROTO_MPLS: proto = htons(ETH_P_MPLS_UC); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; case IPPROTO_TCP: __skb_flow_dissect_tcp(skb, flow_dissector, target_container, data, nhoff, hlen); break; case IPPROTO_ICMP: case IPPROTO_ICMPV6: __skb_flow_dissect_icmp(skb, flow_dissector, target_container, data, nhoff, hlen); break; case IPPROTO_L2TP: __skb_flow_dissect_l2tpv3(skb, flow_dissector, target_container, data, nhoff, hlen); break; case IPPROTO_ESP: __skb_flow_dissect_esp(skb, flow_dissector, target_container, data, nhoff, hlen); break; case IPPROTO_AH: __skb_flow_dissect_ah(skb, flow_dissector, target_container, data, nhoff, hlen); break; default: break; } if (!(key_control->flags & FLOW_DIS_IS_FRAGMENT)) __skb_flow_dissect_ports(skb, flow_dissector, target_container, data, nhoff, ip_proto, hlen); /* Process result of IP proto processing */ switch (fdret) { case FLOW_DISSECT_RET_PROTO_AGAIN: if (skb_flow_dissect_allowed(&num_hdrs)) goto proto_again; break; case FLOW_DISSECT_RET_IPPROTO_AGAIN: if (skb_flow_dissect_allowed(&num_hdrs)) goto ip_proto_again; break; case FLOW_DISSECT_RET_OUT_GOOD: case FLOW_DISSECT_RET_CONTINUE: break; case FLOW_DISSECT_RET_OUT_BAD: default: goto out_bad; } out_good: ret = true; out: key_control->thoff = min_t(u16, nhoff, skb ? 
skb->len : hlen); key_basic->n_proto = proto; key_basic->ip_proto = ip_proto; return ret; out_bad: ret = false; goto out; } EXPORT_SYMBOL(__skb_flow_dissect); static siphash_aligned_key_t hashrnd; static __always_inline void __flow_hash_secret_init(void) { net_get_random_once(&hashrnd, sizeof(hashrnd)); } static const void *flow_keys_hash_start(const struct flow_keys *flow) { BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % SIPHASH_ALIGNMENT); return &flow->FLOW_KEYS_HASH_START_FIELD; } static inline size_t flow_keys_hash_length(const struct flow_keys *flow) { size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs); BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32)); switch (flow->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: diff -= sizeof(flow->addrs.v4addrs); break; case FLOW_DISSECTOR_KEY_IPV6_ADDRS: diff -= sizeof(flow->addrs.v6addrs); break; case FLOW_DISSECTOR_KEY_TIPC: diff -= sizeof(flow->addrs.tipckey); break; } return sizeof(*flow) - diff; } __be32 flow_get_u32_src(const struct flow_keys *flow) { switch (flow->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: return flow->addrs.v4addrs.src; case FLOW_DISSECTOR_KEY_IPV6_ADDRS: return (__force __be32)ipv6_addr_hash( &flow->addrs.v6addrs.src); case FLOW_DISSECTOR_KEY_TIPC: return flow->addrs.tipckey.key; default: return 0; } } EXPORT_SYMBOL(flow_get_u32_src); __be32 flow_get_u32_dst(const struct flow_keys *flow) { switch (flow->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: return flow->addrs.v4addrs.dst; case FLOW_DISSECTOR_KEY_IPV6_ADDRS: return (__force __be32)ipv6_addr_hash( &flow->addrs.v6addrs.dst); default: return 0; } } EXPORT_SYMBOL(flow_get_u32_dst); /* Sort the source and destination IP and the ports, * to have consistent hash within the two directions */ static inline void __flow_hash_consistentify(struct flow_keys *keys) { int addr_diff, i; switch (keys->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: if ((__force u32)keys->addrs.v4addrs.dst < (__force u32)keys->addrs.v4addrs.src) swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst); if ((__force u16)keys->ports.dst < (__force u16)keys->ports.src) { swap(keys->ports.src, keys->ports.dst); } break; case FLOW_DISSECTOR_KEY_IPV6_ADDRS: addr_diff = memcmp(&keys->addrs.v6addrs.dst, &keys->addrs.v6addrs.src, sizeof(keys->addrs.v6addrs.dst)); if (addr_diff < 0) { for (i = 0; i < 4; i++) swap(keys->addrs.v6addrs.src.s6_addr32[i], keys->addrs.v6addrs.dst.s6_addr32[i]); } if ((__force u16)keys->ports.dst < (__force u16)keys->ports.src) { swap(keys->ports.src, keys->ports.dst); } break; } } static inline u32 __flow_hash_from_keys(struct flow_keys *keys, const siphash_key_t *keyval) { u32 hash; __flow_hash_consistentify(keys); hash = siphash(flow_keys_hash_start(keys), flow_keys_hash_length(keys), keyval); if (!hash) hash = 1; return hash; } u32 flow_hash_from_keys(struct flow_keys *keys) { __flow_hash_secret_init(); return __flow_hash_from_keys(keys, &hashrnd); } EXPORT_SYMBOL(flow_hash_from_keys); u32 flow_hash_from_keys_seed(struct flow_keys *keys, const siphash_key_t *keyval) { return __flow_hash_from_keys(keys, keyval); } EXPORT_SYMBOL(flow_hash_from_keys_seed); static inline u32 ___skb_get_hash(const struct sk_buff *skb, struct flow_keys *keys, const siphash_key_t *keyval) { skb_flow_dissect_flow_keys(skb, keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); return __flow_hash_from_keys(keys, keyval); } struct _flow_keys_digest_data { __be16 n_proto; u8 ip_proto; u8 padding; __be32 ports; __be32 src; __be32 dst; }; void 
make_flow_keys_digest(struct flow_keys_digest *digest, const struct flow_keys *flow) { struct _flow_keys_digest_data *data = (struct _flow_keys_digest_data *)digest; BUILD_BUG_ON(sizeof(*data) > sizeof(*digest)); memset(digest, 0, sizeof(*digest)); data->n_proto = flow->basic.n_proto; data->ip_proto = flow->basic.ip_proto; data->ports = flow->ports.ports; data->src = flow->addrs.v4addrs.src; data->dst = flow->addrs.v4addrs.dst; } EXPORT_SYMBOL(make_flow_keys_digest); static struct flow_dissector flow_keys_dissector_symmetric __read_mostly; u32 __skb_get_hash_symmetric_net(const struct net *net, const struct sk_buff *skb) { struct flow_keys keys; __flow_hash_secret_init(); memset(&keys, 0, sizeof(keys)); __skb_flow_dissect(net, skb, &flow_keys_dissector_symmetric, &keys, NULL, 0, 0, 0, 0); return __flow_hash_from_keys(&keys, &hashrnd); } EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric_net); /** * __skb_get_hash_net: calculate a flow hash * @net: associated network namespace, derived from @skb if NULL * @skb: sk_buff to calculate flow hash from * * This function calculates a flow hash based on src/dst addresses * and src/dst port numbers. Sets hash in skb to non-zero hash value * on success, zero indicates no valid hash. Also, sets l4_hash in skb * if hash is a canonical 4-tuple hash over transport ports. */ void __skb_get_hash_net(const struct net *net, struct sk_buff *skb) { struct flow_keys keys; u32 hash; memset(&keys, 0, sizeof(keys)); __skb_flow_dissect(net, skb, &flow_keys_dissector, &keys, NULL, 0, 0, 0, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); __flow_hash_secret_init(); hash = __flow_hash_from_keys(&keys, &hashrnd); __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); } EXPORT_SYMBOL(__skb_get_hash_net); __u32 skb_get_hash_perturb(const struct sk_buff *skb, const siphash_key_t *perturb) { struct flow_keys keys; return ___skb_get_hash(skb, &keys, perturb); } EXPORT_SYMBOL(skb_get_hash_perturb); u32 __skb_get_poff(const struct sk_buff *skb, const void *data, const struct flow_keys_basic *keys, int hlen) { u32 poff = keys->control.thoff; /* skip L4 headers for fragments after the first */ if ((keys->control.flags & FLOW_DIS_IS_FRAGMENT) && !(keys->control.flags & FLOW_DIS_FIRST_FRAG)) return poff; switch (keys->basic.ip_proto) { case IPPROTO_TCP: { /* access doff as u8 to avoid unaligned access */ const u8 *doff; u8 _doff; doff = __skb_header_pointer(skb, poff + 12, sizeof(_doff), data, hlen, &_doff); if (!doff) return poff; poff += max_t(u32, sizeof(struct tcphdr), (*doff & 0xF0) >> 2); break; } case IPPROTO_UDP: case IPPROTO_UDPLITE: poff += sizeof(struct udphdr); break; /* For the rest, we do not really care about header * extensions at this point for now. */ case IPPROTO_ICMP: poff += sizeof(struct icmphdr); break; case IPPROTO_ICMPV6: poff += sizeof(struct icmp6hdr); break; case IPPROTO_IGMP: poff += sizeof(struct igmphdr); break; case IPPROTO_DCCP: poff += sizeof(struct dccp_hdr); break; case IPPROTO_SCTP: poff += sizeof(struct sctphdr); break; } return poff; } /** * skb_get_poff - get the offset to the payload * @skb: sk_buff to get the payload offset from * * The function will get the offset to the payload as far as it could * be dissected. The main user is currently BPF, so that we can dynamically * truncate packets without needing to push actual payload to the user * space and can analyze headers only, instead. 
 */
u32 skb_get_poff(const struct sk_buff *skb)
{
	struct flow_keys_basic keys;

	if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, NULL, 0, 0, 0, 0))
		return 0;

	return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
}

__u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys)
{
	memset(keys, 0, sizeof(*keys));

	memcpy(&keys->addrs.v6addrs.src, &fl6->saddr,
	       sizeof(keys->addrs.v6addrs.src));
	memcpy(&keys->addrs.v6addrs.dst, &fl6->daddr,
	       sizeof(keys->addrs.v6addrs.dst));
	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
	keys->ports.src = fl6->fl6_sport;
	keys->ports.dst = fl6->fl6_dport;
	keys->keyid.keyid = fl6->fl6_gre_key;
	keys->tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
	keys->basic.ip_proto = fl6->flowi6_proto;

	return flow_hash_from_keys(keys);
}
EXPORT_SYMBOL(__get_hash_from_flowi6);

static const struct flow_dissector_key flow_keys_dissector_keys[] = {
	{
		.key_id = FLOW_DISSECTOR_KEY_CONTROL,
		.offset = offsetof(struct flow_keys, control),
	},
	{
		.key_id = FLOW_DISSECTOR_KEY_BASIC,
		.offset = offsetof(struct flow_keys, basic),
	},
	{
		.key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
		.offset = offsetof(struct flow_keys, addrs.v4addrs),
	},
	{
		.key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS,
		.offset = offsetof(struct flow_keys, addrs.v6addrs),
	},
	{
		.key_id = FLOW_DISSECTOR_KEY_TIPC,
		.offset = offsetof(struct flow_keys, addrs.tipckey),
	},
	{
		.key_id = FLOW_DISSECTOR_KEY_PORTS,
		.offset = offsetof(struct flow_keys, ports),
	},
	{
		.key_id = FLOW_DISSECTOR_KEY_VLAN,
		.offset = offsetof(struct flow_keys, vlan),
	},
	{
		.key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL,
		.offset = offsetof(struct flow_keys, tags),
	},
	{
		.key_id = FLOW_DISSECTOR_KEY_GRE_KEYID,
		.offset = offsetof(struct flow_keys, keyid),
	},
};

static const struct flow_dissector_key flow_keys_dissector_symmetric_keys[] = {
	{
		.key_id = FLOW_DISSECTOR_KEY_CONTROL,
		.offset = offsetof(struct flow_keys, control),
	},
	{
		.key_id = FLOW_DISSECTOR_KEY_BASIC,
		.offset = offsetof(struct flow_keys, basic),
	},
	{
		.key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
		.offset = offsetof(struct flow_keys, addrs.v4addrs),
	},
	{
		.key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS,
		.offset = offsetof(struct flow_keys, addrs.v6addrs),
	},
	{
		.key_id = FLOW_DISSECTOR_KEY_PORTS,
		.offset = offsetof(struct flow_keys, ports),
	},
};

static const struct flow_dissector_key flow_keys_basic_dissector_keys[] = {
	{
		.key_id = FLOW_DISSECTOR_KEY_CONTROL,
		.offset = offsetof(struct flow_keys, control),
	},
	{
		.key_id = FLOW_DISSECTOR_KEY_BASIC,
		.offset = offsetof(struct flow_keys, basic),
	},
};

struct flow_dissector flow_keys_dissector __read_mostly;
EXPORT_SYMBOL(flow_keys_dissector);

struct flow_dissector flow_keys_basic_dissector __read_mostly;
EXPORT_SYMBOL(flow_keys_basic_dissector);

static int __init init_default_flow_dissectors(void)
{
	skb_flow_dissector_init(&flow_keys_dissector,
				flow_keys_dissector_keys,
				ARRAY_SIZE(flow_keys_dissector_keys));
	skb_flow_dissector_init(&flow_keys_dissector_symmetric,
				flow_keys_dissector_symmetric_keys,
				ARRAY_SIZE(flow_keys_dissector_symmetric_keys));
	skb_flow_dissector_init(&flow_keys_basic_dissector,
				flow_keys_basic_dissector_keys,
				ARRAY_SIZE(flow_keys_basic_dissector_keys));
	return 0;
}
core_initcall(init_default_flow_dissectors);
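/*
 * Illustrative usage sketch, not part of the kernel source above: the
 * dissection helpers in this file are normally driven through the
 * skb_flow_dissect_flow_keys() wrapper and hashed with
 * flow_hash_from_keys(), much like __skb_get_hash_net() does.  The
 * function name example_flow_summary() and the pr_debug() output are
 * hypothetical and only demonstrate the calling convention.
 */
static u32 __maybe_unused example_flow_summary(const struct sk_buff *skb)
{
	struct flow_keys keys;

	/* The wrapper zeroes @keys and runs the default flow_keys_dissector
	 * set up by init_default_flow_dissectors() above.
	 */
	if (!skb_flow_dissect_flow_keys(skb, &keys, 0))
		return 0;

	/* Address fields are only valid for the reported addr_type. */
	if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS)
		pr_debug("ip_proto=%u %pI4:%u -> %pI4:%u\n",
			 keys.basic.ip_proto,
			 &keys.addrs.v4addrs.src, ntohs(keys.ports.src),
			 &keys.addrs.v4addrs.dst, ntohs(keys.ports.dst));

	/* Direction-independent hash over the sorted address/port tuple. */
	return flow_hash_from_keys(&keys);
}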
// SPDX-License-Identifier: GPL-2.0+
/*
 * Sysfs support implementation.
 *
 * Copyright (C) 2005-2014 Nippon Telegraph and Telephone Corporation.
 * Copyright (C) 2014 HGST, Inc., a Western Digital Company.
 *
 * Written by Vyacheslav Dubeyko <Vyacheslav.Dubeyko@hgst.com>
 */

#include <linux/kobject.h>

#include "nilfs.h"
#include "mdt.h"
#include "sufile.h"
#include "cpfile.h"
#include "sysfs.h"

/* /sys/fs/<nilfs>/ */
static struct kset *nilfs_kset;

#define NILFS_DEV_INT_GROUP_OPS(name, parent_name) \
static ssize_t nilfs_##name##_attr_show(struct kobject *kobj, \
					struct attribute *attr, char *buf) \
{ \
	struct the_nilfs *nilfs = container_of(kobj->parent, \
						struct the_nilfs, \
						ns_##parent_name##_kobj); \
	struct nilfs_##name##_attr *a = container_of(attr, \
						     struct nilfs_##name##_attr, \
						     attr); \
	return a->show ? a->show(a, nilfs, buf) : 0; \
} \
static ssize_t nilfs_##name##_attr_store(struct kobject *kobj, \
					 struct attribute *attr, \
					 const char *buf, size_t len) \
{ \
	struct the_nilfs *nilfs = container_of(kobj->parent, \
						struct the_nilfs, \
						ns_##parent_name##_kobj); \
	struct nilfs_##name##_attr *a = container_of(attr, \
						     struct nilfs_##name##_attr, \
						     attr); \
	return a->store ?
a->store(a, nilfs, buf, len) : 0; \ } \ static const struct sysfs_ops nilfs_##name##_attr_ops = { \ .show = nilfs_##name##_attr_show, \ .store = nilfs_##name##_attr_store, \ } #define NILFS_DEV_INT_GROUP_TYPE(name, parent_name) \ static void nilfs_##name##_attr_release(struct kobject *kobj) \ { \ struct nilfs_sysfs_##parent_name##_subgroups *subgroups = container_of(kobj, \ struct nilfs_sysfs_##parent_name##_subgroups, \ sg_##name##_kobj); \ complete(&subgroups->sg_##name##_kobj_unregister); \ } \ static const struct kobj_type nilfs_##name##_ktype = { \ .default_groups = nilfs_##name##_groups, \ .sysfs_ops = &nilfs_##name##_attr_ops, \ .release = nilfs_##name##_attr_release, \ } #define NILFS_DEV_INT_GROUP_FNS(name, parent_name) \ static int nilfs_sysfs_create_##name##_group(struct the_nilfs *nilfs) \ { \ struct kobject *parent; \ struct kobject *kobj; \ struct completion *kobj_unregister; \ struct nilfs_sysfs_##parent_name##_subgroups *subgroups; \ int err; \ subgroups = nilfs->ns_##parent_name##_subgroups; \ kobj = &subgroups->sg_##name##_kobj; \ kobj_unregister = &subgroups->sg_##name##_kobj_unregister; \ parent = &nilfs->ns_##parent_name##_kobj; \ kobj->kset = nilfs_kset; \ init_completion(kobj_unregister); \ err = kobject_init_and_add(kobj, &nilfs_##name##_ktype, parent, \ #name); \ if (err) \ kobject_put(kobj); \ return err; \ } \ static void nilfs_sysfs_delete_##name##_group(struct the_nilfs *nilfs) \ { \ kobject_put(&nilfs->ns_##parent_name##_subgroups->sg_##name##_kobj); \ } /************************************************************************ * NILFS snapshot attrs * ************************************************************************/ static ssize_t nilfs_snapshot_inodes_count_show(struct nilfs_snapshot_attr *attr, struct nilfs_root *root, char *buf) { return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic64_read(&root->inodes_count)); } static ssize_t nilfs_snapshot_blocks_count_show(struct nilfs_snapshot_attr *attr, struct nilfs_root *root, char *buf) { return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic64_read(&root->blocks_count)); } static const char snapshot_readme_str[] = "The group contains details about mounted snapshot.\n\n" "(1) inodes_count\n\tshow number of inodes for snapshot.\n\n" "(2) blocks_count\n\tshow number of blocks for snapshot.\n\n"; static ssize_t nilfs_snapshot_README_show(struct nilfs_snapshot_attr *attr, struct nilfs_root *root, char *buf) { return sysfs_emit(buf, snapshot_readme_str); } NILFS_SNAPSHOT_RO_ATTR(inodes_count); NILFS_SNAPSHOT_RO_ATTR(blocks_count); NILFS_SNAPSHOT_RO_ATTR(README); static struct attribute *nilfs_snapshot_attrs[] = { NILFS_SNAPSHOT_ATTR_LIST(inodes_count), NILFS_SNAPSHOT_ATTR_LIST(blocks_count), NILFS_SNAPSHOT_ATTR_LIST(README), NULL, }; ATTRIBUTE_GROUPS(nilfs_snapshot); static ssize_t nilfs_snapshot_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct nilfs_root *root = container_of(kobj, struct nilfs_root, snapshot_kobj); struct nilfs_snapshot_attr *a = container_of(attr, struct nilfs_snapshot_attr, attr); return a->show ? a->show(a, root, buf) : 0; } static ssize_t nilfs_snapshot_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t len) { struct nilfs_root *root = container_of(kobj, struct nilfs_root, snapshot_kobj); struct nilfs_snapshot_attr *a = container_of(attr, struct nilfs_snapshot_attr, attr); return a->store ? 
a->store(a, root, buf, len) : 0; } static void nilfs_snapshot_attr_release(struct kobject *kobj) { struct nilfs_root *root = container_of(kobj, struct nilfs_root, snapshot_kobj); complete(&root->snapshot_kobj_unregister); } static const struct sysfs_ops nilfs_snapshot_attr_ops = { .show = nilfs_snapshot_attr_show, .store = nilfs_snapshot_attr_store, }; static const struct kobj_type nilfs_snapshot_ktype = { .default_groups = nilfs_snapshot_groups, .sysfs_ops = &nilfs_snapshot_attr_ops, .release = nilfs_snapshot_attr_release, }; int nilfs_sysfs_create_snapshot_group(struct nilfs_root *root) { struct the_nilfs *nilfs; struct kobject *parent; int err; nilfs = root->nilfs; parent = &nilfs->ns_dev_subgroups->sg_mounted_snapshots_kobj; root->snapshot_kobj.kset = nilfs_kset; init_completion(&root->snapshot_kobj_unregister); if (root->cno == NILFS_CPTREE_CURRENT_CNO) { err = kobject_init_and_add(&root->snapshot_kobj, &nilfs_snapshot_ktype, &nilfs->ns_dev_kobj, "current_checkpoint"); } else { err = kobject_init_and_add(&root->snapshot_kobj, &nilfs_snapshot_ktype, parent, "%llu", root->cno); } if (err) kobject_put(&root->snapshot_kobj); return err; } void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *root) { kobject_put(&root->snapshot_kobj); } /************************************************************************ * NILFS mounted snapshots attrs * ************************************************************************/ static const char mounted_snapshots_readme_str[] = "The mounted_snapshots group contains group for\n" "every mounted snapshot.\n"; static ssize_t nilfs_mounted_snapshots_README_show(struct nilfs_mounted_snapshots_attr *attr, struct the_nilfs *nilfs, char *buf) { return sysfs_emit(buf, mounted_snapshots_readme_str); } NILFS_MOUNTED_SNAPSHOTS_RO_ATTR(README); static struct attribute *nilfs_mounted_snapshots_attrs[] = { NILFS_MOUNTED_SNAPSHOTS_ATTR_LIST(README), NULL, }; ATTRIBUTE_GROUPS(nilfs_mounted_snapshots); NILFS_DEV_INT_GROUP_OPS(mounted_snapshots, dev); NILFS_DEV_INT_GROUP_TYPE(mounted_snapshots, dev); NILFS_DEV_INT_GROUP_FNS(mounted_snapshots, dev); /************************************************************************ * NILFS checkpoints attrs * ************************************************************************/ static ssize_t nilfs_checkpoints_checkpoints_number_show(struct nilfs_checkpoints_attr *attr, struct the_nilfs *nilfs, char *buf) { __u64 ncheckpoints; struct nilfs_cpstat cpstat; int err; down_read(&nilfs->ns_segctor_sem); err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat); up_read(&nilfs->ns_segctor_sem); if (err < 0) { nilfs_err(nilfs->ns_sb, "unable to get checkpoint stat: err=%d", err); return err; } ncheckpoints = cpstat.cs_ncps; return sysfs_emit(buf, "%llu\n", ncheckpoints); } static ssize_t nilfs_checkpoints_snapshots_number_show(struct nilfs_checkpoints_attr *attr, struct the_nilfs *nilfs, char *buf) { __u64 nsnapshots; struct nilfs_cpstat cpstat; int err; down_read(&nilfs->ns_segctor_sem); err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat); up_read(&nilfs->ns_segctor_sem); if (err < 0) { nilfs_err(nilfs->ns_sb, "unable to get checkpoint stat: err=%d", err); return err; } nsnapshots = cpstat.cs_nsss; return sysfs_emit(buf, "%llu\n", nsnapshots); } static ssize_t nilfs_checkpoints_last_seg_checkpoint_show(struct nilfs_checkpoints_attr *attr, struct the_nilfs *nilfs, char *buf) { __u64 last_cno; spin_lock(&nilfs->ns_last_segment_lock); last_cno = nilfs->ns_last_cno; spin_unlock(&nilfs->ns_last_segment_lock); return sysfs_emit(buf, 
"%llu\n", last_cno); } static ssize_t nilfs_checkpoints_next_checkpoint_show(struct nilfs_checkpoints_attr *attr, struct the_nilfs *nilfs, char *buf) { __u64 cno; down_read(&nilfs->ns_segctor_sem); cno = nilfs->ns_cno; up_read(&nilfs->ns_segctor_sem); return sysfs_emit(buf, "%llu\n", cno); } static const char checkpoints_readme_str[] = "The checkpoints group contains attributes that describe\n" "details about volume's checkpoints.\n\n" "(1) checkpoints_number\n\tshow number of checkpoints on volume.\n\n" "(2) snapshots_number\n\tshow number of snapshots on volume.\n\n" "(3) last_seg_checkpoint\n" "\tshow checkpoint number of the latest segment.\n\n" "(4) next_checkpoint\n\tshow next checkpoint number.\n\n"; static ssize_t nilfs_checkpoints_README_show(struct nilfs_checkpoints_attr *attr, struct the_nilfs *nilfs, char *buf) { return sysfs_emit(buf, checkpoints_readme_str); } NILFS_CHECKPOINTS_RO_ATTR(checkpoints_number); NILFS_CHECKPOINTS_RO_ATTR(snapshots_number); NILFS_CHECKPOINTS_RO_ATTR(last_seg_checkpoint); NILFS_CHECKPOINTS_RO_ATTR(next_checkpoint); NILFS_CHECKPOINTS_RO_ATTR(README); static struct attribute *nilfs_checkpoints_attrs[] = { NILFS_CHECKPOINTS_ATTR_LIST(checkpoints_number), NILFS_CHECKPOINTS_ATTR_LIST(snapshots_number), NILFS_CHECKPOINTS_ATTR_LIST(last_seg_checkpoint), NILFS_CHECKPOINTS_ATTR_LIST(next_checkpoint), NILFS_CHECKPOINTS_ATTR_LIST(README), NULL, }; ATTRIBUTE_GROUPS(nilfs_checkpoints); NILFS_DEV_INT_GROUP_OPS(checkpoints, dev); NILFS_DEV_INT_GROUP_TYPE(checkpoints, dev); NILFS_DEV_INT_GROUP_FNS(checkpoints, dev); /************************************************************************ * NILFS segments attrs * ************************************************************************/ static ssize_t nilfs_segments_segments_number_show(struct nilfs_segments_attr *attr, struct the_nilfs *nilfs, char *buf) { return sysfs_emit(buf, "%lu\n", nilfs->ns_nsegments); } static ssize_t nilfs_segments_blocks_per_segment_show(struct nilfs_segments_attr *attr, struct the_nilfs *nilfs, char *buf) { return sysfs_emit(buf, "%lu\n", nilfs->ns_blocks_per_segment); } static ssize_t nilfs_segments_clean_segments_show(struct nilfs_segments_attr *attr, struct the_nilfs *nilfs, char *buf) { unsigned long ncleansegs; down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); return sysfs_emit(buf, "%lu\n", ncleansegs); } static ssize_t nilfs_segments_dirty_segments_show(struct nilfs_segments_attr *attr, struct the_nilfs *nilfs, char *buf) { struct nilfs_sustat sustat; int err; down_read(&nilfs->ns_segctor_sem); err = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat); up_read(&nilfs->ns_segctor_sem); if (err < 0) { nilfs_err(nilfs->ns_sb, "unable to get segment stat: err=%d", err); return err; } return sysfs_emit(buf, "%llu\n", sustat.ss_ndirtysegs); } static const char segments_readme_str[] = "The segments group contains attributes that describe\n" "details about volume's segments.\n\n" "(1) segments_number\n\tshow number of segments on volume.\n\n" "(2) blocks_per_segment\n\tshow number of blocks in segment.\n\n" "(3) clean_segments\n\tshow count of clean segments.\n\n" "(4) dirty_segments\n\tshow count of dirty segments.\n\n"; static ssize_t nilfs_segments_README_show(struct nilfs_segments_attr *attr, struct the_nilfs *nilfs, char *buf) { return sysfs_emit(buf, segments_readme_str); } NILFS_SEGMENTS_RO_ATTR(segments_number); NILFS_SEGMENTS_RO_ATTR(blocks_per_segment); 
NILFS_SEGMENTS_RO_ATTR(clean_segments); NILFS_SEGMENTS_RO_ATTR(dirty_segments); NILFS_SEGMENTS_RO_ATTR(README); static struct attribute *nilfs_segments_attrs[] = { NILFS_SEGMENTS_ATTR_LIST(segments_number), NILFS_SEGMENTS_ATTR_LIST(blocks_per_segment), NILFS_SEGMENTS_ATTR_LIST(clean_segments), NILFS_SEGMENTS_ATTR_LIST(dirty_segments), NILFS_SEGMENTS_ATTR_LIST(README), NULL, }; ATTRIBUTE_GROUPS(nilfs_segments); NILFS_DEV_INT_GROUP_OPS(segments, dev); NILFS_DEV_INT_GROUP_TYPE(segments, dev); NILFS_DEV_INT_GROUP_FNS(segments, dev); /************************************************************************ * NILFS segctor attrs * ************************************************************************/ static ssize_t nilfs_segctor_last_pseg_block_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { sector_t last_pseg; spin_lock(&nilfs->ns_last_segment_lock); last_pseg = nilfs->ns_last_pseg; spin_unlock(&nilfs->ns_last_segment_lock); return sysfs_emit(buf, "%llu\n", (unsigned long long)last_pseg); } static ssize_t nilfs_segctor_last_seg_sequence_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { u64 last_seq; spin_lock(&nilfs->ns_last_segment_lock); last_seq = nilfs->ns_last_seq; spin_unlock(&nilfs->ns_last_segment_lock); return sysfs_emit(buf, "%llu\n", last_seq); } static ssize_t nilfs_segctor_last_seg_checkpoint_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { __u64 last_cno; spin_lock(&nilfs->ns_last_segment_lock); last_cno = nilfs->ns_last_cno; spin_unlock(&nilfs->ns_last_segment_lock); return sysfs_emit(buf, "%llu\n", last_cno); } static ssize_t nilfs_segctor_current_seg_sequence_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { u64 seg_seq; down_read(&nilfs->ns_segctor_sem); seg_seq = nilfs->ns_seg_seq; up_read(&nilfs->ns_segctor_sem); return sysfs_emit(buf, "%llu\n", seg_seq); } static ssize_t nilfs_segctor_current_last_full_seg_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { __u64 segnum; down_read(&nilfs->ns_segctor_sem); segnum = nilfs->ns_segnum; up_read(&nilfs->ns_segctor_sem); return sysfs_emit(buf, "%llu\n", segnum); } static ssize_t nilfs_segctor_next_full_seg_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { __u64 nextnum; down_read(&nilfs->ns_segctor_sem); nextnum = nilfs->ns_nextnum; up_read(&nilfs->ns_segctor_sem); return sysfs_emit(buf, "%llu\n", nextnum); } static ssize_t nilfs_segctor_next_pseg_offset_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { unsigned long pseg_offset; down_read(&nilfs->ns_segctor_sem); pseg_offset = nilfs->ns_pseg_offset; up_read(&nilfs->ns_segctor_sem); return sysfs_emit(buf, "%lu\n", pseg_offset); } static ssize_t nilfs_segctor_next_checkpoint_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { __u64 cno; down_read(&nilfs->ns_segctor_sem); cno = nilfs->ns_cno; up_read(&nilfs->ns_segctor_sem); return sysfs_emit(buf, "%llu\n", cno); } static ssize_t nilfs_segctor_last_seg_write_time_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { time64_t ctime; down_read(&nilfs->ns_segctor_sem); ctime = nilfs->ns_ctime; up_read(&nilfs->ns_segctor_sem); return sysfs_emit(buf, "%ptTs\n", &ctime); } static ssize_t nilfs_segctor_last_seg_write_time_secs_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { time64_t ctime; down_read(&nilfs->ns_segctor_sem); ctime = nilfs->ns_ctime; up_read(&nilfs->ns_segctor_sem); 
return sysfs_emit(buf, "%llu\n", ctime); } static ssize_t nilfs_segctor_last_nongc_write_time_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { time64_t nongc_ctime; down_read(&nilfs->ns_segctor_sem); nongc_ctime = nilfs->ns_nongc_ctime; up_read(&nilfs->ns_segctor_sem); return sysfs_emit(buf, "%ptTs\n", &nongc_ctime); } static ssize_t nilfs_segctor_last_nongc_write_time_secs_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { time64_t nongc_ctime; down_read(&nilfs->ns_segctor_sem); nongc_ctime = nilfs->ns_nongc_ctime; up_read(&nilfs->ns_segctor_sem); return sysfs_emit(buf, "%llu\n", nongc_ctime); } static ssize_t nilfs_segctor_dirty_data_blocks_count_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { u32 ndirtyblks; down_read(&nilfs->ns_segctor_sem); ndirtyblks = atomic_read(&nilfs->ns_ndirtyblks); up_read(&nilfs->ns_segctor_sem); return sysfs_emit(buf, "%u\n", ndirtyblks); } static const char segctor_readme_str[] = "The segctor group contains attributes that describe\n" "segctor thread activity details.\n\n" "(1) last_pseg_block\n" "\tshow start block number of the latest segment.\n\n" "(2) last_seg_sequence\n" "\tshow sequence value of the latest segment.\n\n" "(3) last_seg_checkpoint\n" "\tshow checkpoint number of the latest segment.\n\n" "(4) current_seg_sequence\n\tshow segment sequence counter.\n\n" "(5) current_last_full_seg\n" "\tshow index number of the latest full segment.\n\n" "(6) next_full_seg\n" "\tshow index number of the full segment index to be used next.\n\n" "(7) next_pseg_offset\n" "\tshow offset of next partial segment in the current full segment.\n\n" "(8) next_checkpoint\n\tshow next checkpoint number.\n\n" "(9) last_seg_write_time\n" "\tshow write time of the last segment in human-readable format.\n\n" "(10) last_seg_write_time_secs\n" "\tshow write time of the last segment in seconds.\n\n" "(11) last_nongc_write_time\n" "\tshow write time of the last segment not for cleaner operation " "in human-readable format.\n\n" "(12) last_nongc_write_time_secs\n" "\tshow write time of the last segment not for cleaner operation " "in seconds.\n\n" "(13) dirty_data_blocks_count\n" "\tshow number of dirty data blocks.\n\n"; static ssize_t nilfs_segctor_README_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { return sysfs_emit(buf, segctor_readme_str); } NILFS_SEGCTOR_RO_ATTR(last_pseg_block); NILFS_SEGCTOR_RO_ATTR(last_seg_sequence); NILFS_SEGCTOR_RO_ATTR(last_seg_checkpoint); NILFS_SEGCTOR_RO_ATTR(current_seg_sequence); NILFS_SEGCTOR_RO_ATTR(current_last_full_seg); NILFS_SEGCTOR_RO_ATTR(next_full_seg); NILFS_SEGCTOR_RO_ATTR(next_pseg_offset); NILFS_SEGCTOR_RO_ATTR(next_checkpoint); NILFS_SEGCTOR_RO_ATTR(last_seg_write_time); NILFS_SEGCTOR_RO_ATTR(last_seg_write_time_secs); NILFS_SEGCTOR_RO_ATTR(last_nongc_write_time); NILFS_SEGCTOR_RO_ATTR(last_nongc_write_time_secs); NILFS_SEGCTOR_RO_ATTR(dirty_data_blocks_count); NILFS_SEGCTOR_RO_ATTR(README); static struct attribute *nilfs_segctor_attrs[] = { NILFS_SEGCTOR_ATTR_LIST(last_pseg_block), NILFS_SEGCTOR_ATTR_LIST(last_seg_sequence), NILFS_SEGCTOR_ATTR_LIST(last_seg_checkpoint), NILFS_SEGCTOR_ATTR_LIST(current_seg_sequence), NILFS_SEGCTOR_ATTR_LIST(current_last_full_seg), NILFS_SEGCTOR_ATTR_LIST(next_full_seg), NILFS_SEGCTOR_ATTR_LIST(next_pseg_offset), NILFS_SEGCTOR_ATTR_LIST(next_checkpoint), NILFS_SEGCTOR_ATTR_LIST(last_seg_write_time), NILFS_SEGCTOR_ATTR_LIST(last_seg_write_time_secs), 
NILFS_SEGCTOR_ATTR_LIST(last_nongc_write_time), NILFS_SEGCTOR_ATTR_LIST(last_nongc_write_time_secs), NILFS_SEGCTOR_ATTR_LIST(dirty_data_blocks_count), NILFS_SEGCTOR_ATTR_LIST(README), NULL, }; ATTRIBUTE_GROUPS(nilfs_segctor); NILFS_DEV_INT_GROUP_OPS(segctor, dev); NILFS_DEV_INT_GROUP_TYPE(segctor, dev); NILFS_DEV_INT_GROUP_FNS(segctor, dev); /************************************************************************ * NILFS superblock attrs * ************************************************************************/ static ssize_t nilfs_superblock_sb_write_time_show(struct nilfs_superblock_attr *attr, struct the_nilfs *nilfs, char *buf) { time64_t sbwtime; down_read(&nilfs->ns_sem); sbwtime = nilfs->ns_sbwtime; up_read(&nilfs->ns_sem); return sysfs_emit(buf, "%ptTs\n", &sbwtime); } static ssize_t nilfs_superblock_sb_write_time_secs_show(struct nilfs_superblock_attr *attr, struct the_nilfs *nilfs, char *buf) { time64_t sbwtime; down_read(&nilfs->ns_sem); sbwtime = nilfs->ns_sbwtime; up_read(&nilfs->ns_sem); return sysfs_emit(buf, "%llu\n", sbwtime); } static ssize_t nilfs_superblock_sb_write_count_show(struct nilfs_superblock_attr *attr, struct the_nilfs *nilfs, char *buf) { unsigned int sbwcount; down_read(&nilfs->ns_sem); sbwcount = nilfs->ns_sbwcount; up_read(&nilfs->ns_sem); return sysfs_emit(buf, "%u\n", sbwcount); } static ssize_t nilfs_superblock_sb_update_frequency_show(struct nilfs_superblock_attr *attr, struct the_nilfs *nilfs, char *buf) { unsigned int sb_update_freq; down_read(&nilfs->ns_sem); sb_update_freq = nilfs->ns_sb_update_freq; up_read(&nilfs->ns_sem); return sysfs_emit(buf, "%u\n", sb_update_freq); } static ssize_t nilfs_superblock_sb_update_frequency_store(struct nilfs_superblock_attr *attr, struct the_nilfs *nilfs, const char *buf, size_t count) { unsigned int val; int err; err = kstrtouint(skip_spaces(buf), 0, &val); if (err) { nilfs_err(nilfs->ns_sb, "unable to convert string: err=%d", err); return err; } if (val < NILFS_SB_FREQ) { val = NILFS_SB_FREQ; nilfs_warn(nilfs->ns_sb, "superblock update frequency cannot be lesser than 10 seconds"); } down_write(&nilfs->ns_sem); nilfs->ns_sb_update_freq = val; up_write(&nilfs->ns_sem); return count; } static const char sb_readme_str[] = "The superblock group contains attributes that describe\n" "superblock's details.\n\n" "(1) sb_write_time\n\tshow previous write time of super block " "in human-readable format.\n\n" "(2) sb_write_time_secs\n\tshow previous write time of super block " "in seconds.\n\n" "(3) sb_write_count\n\tshow write count of super block.\n\n" "(4) sb_update_frequency\n" "\tshow/set interval of periodical update of superblock (in seconds).\n\n" "\tYou can set preferable frequency of superblock update by command:\n\n" "\t'echo <val> > /sys/fs/<nilfs>/<dev>/superblock/sb_update_frequency'\n"; static ssize_t nilfs_superblock_README_show(struct nilfs_superblock_attr *attr, struct the_nilfs *nilfs, char *buf) { return sysfs_emit(buf, sb_readme_str); } NILFS_SUPERBLOCK_RO_ATTR(sb_write_time); NILFS_SUPERBLOCK_RO_ATTR(sb_write_time_secs); NILFS_SUPERBLOCK_RO_ATTR(sb_write_count); NILFS_SUPERBLOCK_RW_ATTR(sb_update_frequency); NILFS_SUPERBLOCK_RO_ATTR(README); static struct attribute *nilfs_superblock_attrs[] = { NILFS_SUPERBLOCK_ATTR_LIST(sb_write_time), NILFS_SUPERBLOCK_ATTR_LIST(sb_write_time_secs), NILFS_SUPERBLOCK_ATTR_LIST(sb_write_count), NILFS_SUPERBLOCK_ATTR_LIST(sb_update_frequency), NILFS_SUPERBLOCK_ATTR_LIST(README), NULL, }; ATTRIBUTE_GROUPS(nilfs_superblock); NILFS_DEV_INT_GROUP_OPS(superblock, dev); 
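/*
 * Illustration only (user space, not part of this file): exercising the one
 * writable attribute in the superblock group, sb_update_frequency.  The path
 * follows the README text above and is an assumption; values below
 * NILFS_SB_FREQ (10 seconds) are clamped by the store handler (a warning is
 * logged), which the read-back below makes visible.
 */
#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/fs/nilfs2/sda1/superblock/sb_update_frequency";
	unsigned int freq = 0;
	FILE *f;

	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f, "5\n");		/* below the minimum, will be clamped */
	fclose(f);

	f = fopen(path, "r");
	if (f) {
		if (fscanf(f, "%u", &freq) == 1)
			printf("effective update frequency: %u seconds\n", freq);
		fclose(f);
	}
	return 0;
}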
NILFS_DEV_INT_GROUP_TYPE(superblock, dev); NILFS_DEV_INT_GROUP_FNS(superblock, dev); /************************************************************************ * NILFS device attrs * ************************************************************************/ static ssize_t nilfs_dev_revision_show(struct nilfs_dev_attr *attr, struct the_nilfs *nilfs, char *buf) { struct nilfs_super_block *raw_sb; u32 major; u16 minor; down_read(&nilfs->ns_sem); raw_sb = nilfs->ns_sbp[0]; major = le32_to_cpu(raw_sb->s_rev_level); minor = le16_to_cpu(raw_sb->s_minor_rev_level); up_read(&nilfs->ns_sem); return sysfs_emit(buf, "%d.%d\n", major, minor); } static ssize_t nilfs_dev_blocksize_show(struct nilfs_dev_attr *attr, struct the_nilfs *nilfs, char *buf) { return sysfs_emit(buf, "%u\n", nilfs->ns_blocksize); } static ssize_t nilfs_dev_device_size_show(struct nilfs_dev_attr *attr, struct the_nilfs *nilfs, char *buf) { struct nilfs_super_block *raw_sb; u64 dev_size; down_read(&nilfs->ns_sem); raw_sb = nilfs->ns_sbp[0]; dev_size = le64_to_cpu(raw_sb->s_dev_size); up_read(&nilfs->ns_sem); return sysfs_emit(buf, "%llu\n", dev_size); } static ssize_t nilfs_dev_free_blocks_show(struct nilfs_dev_attr *attr, struct the_nilfs *nilfs, char *buf) { sector_t free_blocks = 0; nilfs_count_free_blocks(nilfs, &free_blocks); return sysfs_emit(buf, "%llu\n", (unsigned long long)free_blocks); } static ssize_t nilfs_dev_uuid_show(struct nilfs_dev_attr *attr, struct the_nilfs *nilfs, char *buf) { struct nilfs_super_block *raw_sb; ssize_t len; down_read(&nilfs->ns_sem); raw_sb = nilfs->ns_sbp[0]; len = sysfs_emit(buf, "%pUb\n", raw_sb->s_uuid); up_read(&nilfs->ns_sem); return len; } static ssize_t nilfs_dev_volume_name_show(struct nilfs_dev_attr *attr, struct the_nilfs *nilfs, char *buf) { struct nilfs_super_block *raw_sb; ssize_t len; down_read(&nilfs->ns_sem); raw_sb = nilfs->ns_sbp[0]; len = scnprintf(buf, sizeof(raw_sb->s_volume_name), "%s\n", raw_sb->s_volume_name); up_read(&nilfs->ns_sem); return len; } static const char dev_readme_str[] = "The <device> group contains attributes that describe file system\n" "partition's details.\n\n" "(1) revision\n\tshow NILFS file system revision.\n\n" "(2) blocksize\n\tshow volume block size in bytes.\n\n" "(3) device_size\n\tshow volume size in bytes.\n\n" "(4) free_blocks\n\tshow count of free blocks on volume.\n\n" "(5) uuid\n\tshow volume's UUID.\n\n" "(6) volume_name\n\tshow volume's name.\n\n"; static ssize_t nilfs_dev_README_show(struct nilfs_dev_attr *attr, struct the_nilfs *nilfs, char *buf) { return sysfs_emit(buf, dev_readme_str); } NILFS_DEV_RO_ATTR(revision); NILFS_DEV_RO_ATTR(blocksize); NILFS_DEV_RO_ATTR(device_size); NILFS_DEV_RO_ATTR(free_blocks); NILFS_DEV_RO_ATTR(uuid); NILFS_DEV_RO_ATTR(volume_name); NILFS_DEV_RO_ATTR(README); static struct attribute *nilfs_dev_attrs[] = { NILFS_DEV_ATTR_LIST(revision), NILFS_DEV_ATTR_LIST(blocksize), NILFS_DEV_ATTR_LIST(device_size), NILFS_DEV_ATTR_LIST(free_blocks), NILFS_DEV_ATTR_LIST(uuid), NILFS_DEV_ATTR_LIST(volume_name), NILFS_DEV_ATTR_LIST(README), NULL, }; ATTRIBUTE_GROUPS(nilfs_dev); static ssize_t nilfs_dev_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct the_nilfs *nilfs = container_of(kobj, struct the_nilfs, ns_dev_kobj); struct nilfs_dev_attr *a = container_of(attr, struct nilfs_dev_attr, attr); return a->show ? 
a->show(a, nilfs, buf) : 0; } static ssize_t nilfs_dev_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t len) { struct the_nilfs *nilfs = container_of(kobj, struct the_nilfs, ns_dev_kobj); struct nilfs_dev_attr *a = container_of(attr, struct nilfs_dev_attr, attr); return a->store ? a->store(a, nilfs, buf, len) : 0; } static void nilfs_dev_attr_release(struct kobject *kobj) { struct the_nilfs *nilfs = container_of(kobj, struct the_nilfs, ns_dev_kobj); complete(&nilfs->ns_dev_kobj_unregister); } static const struct sysfs_ops nilfs_dev_attr_ops = { .show = nilfs_dev_attr_show, .store = nilfs_dev_attr_store, }; static const struct kobj_type nilfs_dev_ktype = { .default_groups = nilfs_dev_groups, .sysfs_ops = &nilfs_dev_attr_ops, .release = nilfs_dev_attr_release, }; int nilfs_sysfs_create_device_group(struct super_block *sb) { struct the_nilfs *nilfs = sb->s_fs_info; size_t devgrp_size = sizeof(struct nilfs_sysfs_dev_subgroups); int err; nilfs->ns_dev_subgroups = kzalloc(devgrp_size, GFP_KERNEL); if (unlikely(!nilfs->ns_dev_subgroups)) { err = -ENOMEM; nilfs_err(sb, "unable to allocate memory for device group"); goto failed_create_device_group; } nilfs->ns_dev_kobj.kset = nilfs_kset; init_completion(&nilfs->ns_dev_kobj_unregister); err = kobject_init_and_add(&nilfs->ns_dev_kobj, &nilfs_dev_ktype, NULL, "%s", sb->s_id); if (err) goto cleanup_dev_kobject; err = nilfs_sysfs_create_mounted_snapshots_group(nilfs); if (err) goto cleanup_dev_kobject; err = nilfs_sysfs_create_checkpoints_group(nilfs); if (err) goto delete_mounted_snapshots_group; err = nilfs_sysfs_create_segments_group(nilfs); if (err) goto delete_checkpoints_group; err = nilfs_sysfs_create_superblock_group(nilfs); if (err) goto delete_segments_group; err = nilfs_sysfs_create_segctor_group(nilfs); if (err) goto delete_superblock_group; return 0; delete_superblock_group: nilfs_sysfs_delete_superblock_group(nilfs); delete_segments_group: nilfs_sysfs_delete_segments_group(nilfs); delete_checkpoints_group: nilfs_sysfs_delete_checkpoints_group(nilfs); delete_mounted_snapshots_group: nilfs_sysfs_delete_mounted_snapshots_group(nilfs); cleanup_dev_kobject: kobject_put(&nilfs->ns_dev_kobj); kfree(nilfs->ns_dev_subgroups); failed_create_device_group: return err; } void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs) { nilfs_sysfs_delete_mounted_snapshots_group(nilfs); nilfs_sysfs_delete_checkpoints_group(nilfs); nilfs_sysfs_delete_segments_group(nilfs); nilfs_sysfs_delete_superblock_group(nilfs); nilfs_sysfs_delete_segctor_group(nilfs); kobject_del(&nilfs->ns_dev_kobj); kobject_put(&nilfs->ns_dev_kobj); kfree(nilfs->ns_dev_subgroups); } /************************************************************************ * NILFS feature attrs * ************************************************************************/ static ssize_t nilfs_feature_revision_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d.%d\n", NILFS_CURRENT_REV, NILFS_MINOR_REV); } static const char features_readme_str[] = "The features group contains attributes that describe NILFS file\n" "system driver features.\n\n" "(1) revision\n\tshow current revision of NILFS file system driver.\n"; static ssize_t nilfs_feature_README_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, features_readme_str); } NILFS_FEATURE_RO_ATTR(revision); NILFS_FEATURE_RO_ATTR(README); static struct attribute *nilfs_feature_attrs[] = { NILFS_FEATURE_ATTR_LIST(revision), 
NILFS_FEATURE_ATTR_LIST(README),
	NULL,
};

static const struct attribute_group nilfs_feature_attr_group = {
	.name = "features",
	.attrs = nilfs_feature_attrs,
};

int __init nilfs_sysfs_init(void)
{
	int err;

	nilfs_kset = kset_create_and_add(NILFS_ROOT_GROUP_NAME, NULL, fs_kobj);
	if (!nilfs_kset) {
		err = -ENOMEM;
		nilfs_err(NULL, "unable to create sysfs entry: err=%d", err);
		goto failed_sysfs_init;
	}

	err = sysfs_create_group(&nilfs_kset->kobj, &nilfs_feature_attr_group);
	if (unlikely(err)) {
		nilfs_err(NULL, "unable to create feature group: err=%d", err);
		goto cleanup_sysfs_init;
	}

	return 0;

cleanup_sysfs_init:
	kset_unregister(nilfs_kset);
failed_sysfs_init:
	return err;
}

void nilfs_sysfs_exit(void)
{
	sysfs_remove_group(&nilfs_kset->kobj, &nilfs_feature_attr_group);
	kset_unregister(nilfs_kset);
}
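/*
 * Hedged sketch (a separate, hypothetical module; not part of this file):
 * the same kset + attribute-group registration pattern that
 * nilfs_sysfs_init() above uses, reduced to a minimal example.  All names
 * here ("demo", demo_kset, version) are made up for illustration.
 */
#include <linux/fs.h>
#include <linux/kobject.h>
#include <linux/module.h>
#include <linux/sysfs.h>

static struct kset *demo_kset;

static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr,
			    char *buf)
{
	return sysfs_emit(buf, "1.0\n");
}

static struct kobj_attribute version_attr = __ATTR_RO(version);

static struct attribute *demo_attrs[] = {
	&version_attr.attr,
	NULL,
};

static const struct attribute_group demo_group = {
	.name	= "features",
	.attrs	= demo_attrs,
};

static int __init demo_sysfs_init(void)
{
	int err;

	/* creates /sys/fs/demo */
	demo_kset = kset_create_and_add("demo", NULL, fs_kobj);
	if (!demo_kset)
		return -ENOMEM;

	/* creates /sys/fs/demo/features/version */
	err = sysfs_create_group(&demo_kset->kobj, &demo_group);
	if (err)
		kset_unregister(demo_kset);
	return err;
}

static void __exit demo_sysfs_exit(void)
{
	/* tear down in reverse order of creation, as nilfs_sysfs_exit() does */
	sysfs_remove_group(&demo_kset->kobj, &demo_group);
	kset_unregister(demo_kset);
}

module_init(demo_sysfs_init);
module_exit(demo_sysfs_exit);
MODULE_LICENSE("GPL");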
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __X86_KERNEL_FPU_INTERNAL_H
#define __X86_KERNEL_FPU_INTERNAL_H

extern struct fpstate init_fpstate;

/* CPU feature check wrappers */
static __always_inline __pure bool use_xsave(void)
{
	return cpu_feature_enabled(X86_FEATURE_XSAVE);
}

static __always_inline __pure bool use_fxsr(void)
{
	return cpu_feature_enabled(X86_FEATURE_FXSR);
}

#ifdef CONFIG_X86_DEBUG_FPU
# define WARN_ON_FPU(x) WARN_ON_ONCE(x)
#else
# define WARN_ON_FPU(x) ({ BUILD_BUG_ON_INVALID(x); 0; })
#endif

/* Used in init.c */
extern void fpstate_init_user(struct fpstate *fpstate);
extern void fpstate_reset(struct fpu *fpu);

#endif
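/*
 * Hedged sketch (not part of this header): how the feature-check wrappers
 * above are typically consumed by FPU save/restore code.  save_fpregs_demo()
 * and the path comments are illustrative stand-ins; the real implementations
 * live elsewhere under arch/x86/kernel/fpu/.
 */
static inline void save_fpregs_demo(struct fpstate *fps)
{
	/* Type-checks 'fps' but emits no code unless CONFIG_X86_DEBUG_FPU. */
	WARN_ON_FPU(!fps);

	if (use_xsave()) {
		/* XSAVE-family instruction path would go here. */
	} else if (use_fxsr()) {
		/* FXSAVE path would go here. */
	} else {
		/* Legacy FNSAVE fallback would go here. */
	}
}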
3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/lib/vsprintf.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* vsprintf.c -- Lars Wirzenius & Linus Torvalds. */ /* * Wirzenius wrote this portably, Torvalds fucked it up :-) */ /* * Fri Jul 13 2001 Crutcher Dunnavant <crutcher+kernel@datastacks.com> * - changed to provide snprintf and vsnprintf functions * So Feb 1 16:51:32 CET 2004 Juergen Quade <quade@hsnr.de> * - scnprintf and vscnprintf */ #include <linux/stdarg.h> #include <linux/build_bug.h> #include <linux/clk.h> #include <linux/clk-provider.h> #include <linux/errname.h> #include <linux/module.h> /* for KSYM_SYMBOL_LEN */ #include <linux/types.h> #include <linux/string.h> #include <linux/ctype.h> #include <linux/hex.h> #include <linux/kernel.h> #include <linux/kallsyms.h> #include <linux/math64.h> #include <linux/uaccess.h> #include <linux/ioport.h> #include <linux/dcache.h> #include <linux/cred.h> #include <linux/rtc.h> #include <linux/sprintf.h> #include <linux/time.h> #include <linux/uuid.h> #include <linux/of.h> #include <net/addrconf.h> #include <linux/siphash.h> #include <linux/compiler.h> #include <linux/property.h> #include <linux/notifier.h> #ifdef CONFIG_BLOCK #include <linux/blkdev.h> #endif #include "../mm/internal.h" /* For the trace_print_flags arrays */ #include <asm/page.h> /* for PAGE_SIZE */ #include <asm/byteorder.h> /* cpu_to_le16 */ #include <linux/unaligned.h> #include <linux/string_helpers.h> #include "kstrtox.h" /* Disable pointer hashing if requested */ bool no_hash_pointers __ro_after_init; EXPORT_SYMBOL_GPL(no_hash_pointers); /* * Hashed pointers policy selected by "hash_pointers=..." boot param * * `auto` - Hashed pointers enabled unless disabled by slub_debug_enabled=true * `always` - Hashed pointers enabled unconditionally * `never` - Hashed pointers disabled unconditionally */ enum hash_pointers_policy { HASH_PTR_AUTO = 0, HASH_PTR_ALWAYS, HASH_PTR_NEVER }; static enum hash_pointers_policy hash_pointers_mode __initdata; noinline static unsigned long long simple_strntoull(const char *startp, char **endp, unsigned int base, size_t max_chars) { const char *cp; unsigned long long result = 0ULL; size_t prefix_chars; unsigned int rv; cp = _parse_integer_fixup_radix(startp, &base); prefix_chars = cp - startp; if (prefix_chars < max_chars) { rv = _parse_integer_limit(cp, base, &result, max_chars - prefix_chars); /* FIXME */ cp += (rv & ~KSTRTOX_OVERFLOW); } else { /* Field too short for prefix + digit, skip over without converting */ cp = startp + max_chars; } if (endp) *endp = (char *)cp; return result; } /** * simple_strtoull - convert a string to an unsigned long long * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use * * This function has caveats. Please use kstrtoull instead. */ noinline unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base) { return simple_strntoull(cp, endp, base, INT_MAX); } EXPORT_SYMBOL(simple_strtoull); /** * simple_strtoul - convert a string to an unsigned long * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use * * This function has caveats. Please use kstrtoul instead. 
*/ unsigned long simple_strtoul(const char *cp, char **endp, unsigned int base) { return simple_strtoull(cp, endp, base); } EXPORT_SYMBOL(simple_strtoul); unsigned long simple_strntoul(const char *cp, char **endp, unsigned int base, size_t max_chars) { return simple_strntoull(cp, endp, base, max_chars); } EXPORT_SYMBOL(simple_strntoul); /** * simple_strtol - convert a string to a signed long * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use * * This function has caveats. Please use kstrtol instead. */ long simple_strtol(const char *cp, char **endp, unsigned int base) { if (*cp == '-') return -simple_strtoul(cp + 1, endp, base); return simple_strtoul(cp, endp, base); } EXPORT_SYMBOL(simple_strtol); noinline static long long simple_strntoll(const char *cp, char **endp, unsigned int base, size_t max_chars) { /* * simple_strntoull() safely handles receiving max_chars==0 in the * case cp[0] == '-' && max_chars == 1. * If max_chars == 0 we can drop through and pass it to simple_strntoull() * and the content of *cp is irrelevant. */ if (*cp == '-' && max_chars > 0) return -simple_strntoull(cp + 1, endp, base, max_chars - 1); return simple_strntoull(cp, endp, base, max_chars); } /** * simple_strtoll - convert a string to a signed long long * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use * * This function has caveats. Please use kstrtoll instead. */ long long simple_strtoll(const char *cp, char **endp, unsigned int base) { return simple_strntoll(cp, endp, base, INT_MAX); } EXPORT_SYMBOL(simple_strtoll); static inline int skip_atoi(const char **s) { int i = 0; do { i = i*10 + *((*s)++) - '0'; } while (isdigit(**s)); return i; } /* * Decimal conversion is by far the most typical, and is used for * /proc and /sys data. This directly impacts e.g. top performance * with many processes running. We optimize it for speed by emitting * two characters at a time, using a 200 byte lookup table. This * roughly halves the number of multiplications compared to computing * the digits one at a time. Implementation strongly inspired by the * previous version, which in turn used ideas described at * <http://www.cs.uiowa.edu/~jones/bcd/divide.html> (with permission * from the author, Douglas W. Jones). * * It turns out there is precisely one 26 bit fixed-point * approximation a of 64/100 for which x/100 == (x * (u64)a) >> 32 * holds for all x in [0, 10^8-1], namely a = 0x28f5c29. The actual * range happens to be somewhat larger (x <= 1073741898), but that's * irrelevant for our purpose. * * For dividing a number in the range [10^4, 10^6-1] by 100, we still * need a 32x32->64 bit multiply, so we simply use the same constant. * * For dividing a number in the range [100, 10^4-1] by 100, there are * several options. The simplest is (x * 0x147b) >> 19, which is valid * for all x <= 43698. 
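 * For example, x = 9876: (9876 * 0x147b) >> 19 = 98, and
 * decpair[9876 - 100*98] = decpair[76] supplies the two trailing ASCII
 * digits in a single 16-bit store (least-significant digit first, since
 * put_dec() builds the string in reverse and number() re-reverses it).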
*/ static const u16 decpair[100] = { #define _(x) (__force u16) cpu_to_le16(((x % 10) | ((x / 10) << 8)) + 0x3030) _( 0), _( 1), _( 2), _( 3), _( 4), _( 5), _( 6), _( 7), _( 8), _( 9), _(10), _(11), _(12), _(13), _(14), _(15), _(16), _(17), _(18), _(19), _(20), _(21), _(22), _(23), _(24), _(25), _(26), _(27), _(28), _(29), _(30), _(31), _(32), _(33), _(34), _(35), _(36), _(37), _(38), _(39), _(40), _(41), _(42), _(43), _(44), _(45), _(46), _(47), _(48), _(49), _(50), _(51), _(52), _(53), _(54), _(55), _(56), _(57), _(58), _(59), _(60), _(61), _(62), _(63), _(64), _(65), _(66), _(67), _(68), _(69), _(70), _(71), _(72), _(73), _(74), _(75), _(76), _(77), _(78), _(79), _(80), _(81), _(82), _(83), _(84), _(85), _(86), _(87), _(88), _(89), _(90), _(91), _(92), _(93), _(94), _(95), _(96), _(97), _(98), _(99), #undef _ }; /* * This will print a single '0' even if r == 0, since we would * immediately jump to out_r where two 0s would be written but only * one of them accounted for in buf. This is needed by ip4_string * below. All other callers pass a non-zero value of r. */ static noinline_for_stack char *put_dec_trunc8(char *buf, unsigned r) { unsigned q; /* 1 <= r < 10^8 */ if (r < 100) goto out_r; /* 100 <= r < 10^8 */ q = (r * (u64)0x28f5c29) >> 32; *((u16 *)buf) = decpair[r - 100*q]; buf += 2; /* 1 <= q < 10^6 */ if (q < 100) goto out_q; /* 100 <= q < 10^6 */ r = (q * (u64)0x28f5c29) >> 32; *((u16 *)buf) = decpair[q - 100*r]; buf += 2; /* 1 <= r < 10^4 */ if (r < 100) goto out_r; /* 100 <= r < 10^4 */ q = (r * 0x147b) >> 19; *((u16 *)buf) = decpair[r - 100*q]; buf += 2; out_q: /* 1 <= q < 100 */ r = q; out_r: /* 1 <= r < 100 */ *((u16 *)buf) = decpair[r]; buf += r < 10 ? 1 : 2; return buf; } #if BITS_PER_LONG == 64 && BITS_PER_LONG_LONG == 64 static noinline_for_stack char *put_dec_full8(char *buf, unsigned r) { unsigned q; /* 0 <= r < 10^8 */ q = (r * (u64)0x28f5c29) >> 32; *((u16 *)buf) = decpair[r - 100*q]; buf += 2; /* 0 <= q < 10^6 */ r = (q * (u64)0x28f5c29) >> 32; *((u16 *)buf) = decpair[q - 100*r]; buf += 2; /* 0 <= r < 10^4 */ q = (r * 0x147b) >> 19; *((u16 *)buf) = decpair[r - 100*q]; buf += 2; /* 0 <= q < 100 */ *((u16 *)buf) = decpair[q]; buf += 2; return buf; } static noinline_for_stack char *put_dec(char *buf, unsigned long long n) { if (n >= 100*1000*1000) buf = put_dec_full8(buf, do_div(n, 100*1000*1000)); /* 1 <= n <= 1.6e11 */ if (n >= 100*1000*1000) buf = put_dec_full8(buf, do_div(n, 100*1000*1000)); /* 1 <= n < 1e8 */ return put_dec_trunc8(buf, n); } #elif BITS_PER_LONG == 32 && BITS_PER_LONG_LONG == 64 static void put_dec_full4(char *buf, unsigned r) { unsigned q; /* 0 <= r < 10^4 */ q = (r * 0x147b) >> 19; *((u16 *)buf) = decpair[r - 100*q]; buf += 2; /* 0 <= q < 100 */ *((u16 *)buf) = decpair[q]; } /* * Call put_dec_full4 on x % 10000, return x / 10000. * The approximation x/10000 == (x * 0x346DC5D7) >> 43 * holds for all x < 1,128,869,999. The largest value this * helper will ever be asked to convert is 1,125,520,955. * (second call in the put_dec code, assuming n is all-ones). */ static noinline_for_stack unsigned put_dec_helper4(char *buf, unsigned x) { uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43; put_dec_full4(buf, x - q * 10000); return q; } /* Based on code by Douglas W. Jones found at * <http://www.cs.uiowa.edu/~jones/bcd/decimal.html#sixtyfour> * (with permission from the author). * Performs no 64-bit division and hence should be fast on 32-bit machines. 
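 * In other words, the constants 656/7671/4749/281, 7296/9496/42 and 5536/6
 * below are the base-10^4 digits of 2^48, 2^32 and 2^16 respectively; each
 * put_dec_helper4() call emits one base-10^4 digit of the result and
 * returns the carry into the next group.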
*/ static char *put_dec(char *buf, unsigned long long n) { uint32_t d3, d2, d1, q, h; if (n < 100*1000*1000) return put_dec_trunc8(buf, n); d1 = ((uint32_t)n >> 16); /* implicit "& 0xffff" */ h = (n >> 32); d2 = (h ) & 0xffff; d3 = (h >> 16); /* implicit "& 0xffff" */ /* n = 2^48 d3 + 2^32 d2 + 2^16 d1 + d0 = 281_4749_7671_0656 d3 + 42_9496_7296 d2 + 6_5536 d1 + d0 */ q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff); q = put_dec_helper4(buf, q); q += 7671 * d3 + 9496 * d2 + 6 * d1; q = put_dec_helper4(buf+4, q); q += 4749 * d3 + 42 * d2; q = put_dec_helper4(buf+8, q); q += 281 * d3; buf += 12; if (q) buf = put_dec_trunc8(buf, q); else while (buf[-1] == '0') --buf; return buf; } #endif /* * Convert passed number to decimal string. * Returns the length of string. On buffer overflow, returns 0. * * If speed is not important, use snprintf(). It's easy to read the code. */ int num_to_str(char *buf, int size, unsigned long long num, unsigned int width) { /* put_dec requires 2-byte alignment of the buffer. */ char tmp[sizeof(num) * 3] __aligned(2); int idx, len; /* put_dec() may work incorrectly for num = 0 (generate "", not "0") */ if (num <= 9) { tmp[0] = '0' + num; len = 1; } else { len = put_dec(tmp, num) - tmp; } if (len > size || width > size) return 0; if (width > len) { width = width - len; for (idx = 0; idx < width; idx++) buf[idx] = ' '; } else { width = 0; } for (idx = 0; idx < len; ++idx) buf[idx + width] = tmp[len - idx - 1]; return len + width; } #define SIGN 1 /* unsigned/signed */ #define LEFT 2 /* left justified */ #define PLUS 4 /* show plus */ #define SPACE 8 /* space if plus */ #define ZEROPAD 16 /* pad with zero, must be 16 == '0' - ' ' */ #define SMALL 32 /* use lowercase in hex (must be 32 == 0x20) */ #define SPECIAL 64 /* prefix hex with "0x", octal with "0" */ static_assert(ZEROPAD == ('0' - ' ')); static_assert(SMALL == ('a' ^ 'A')); enum format_state { FORMAT_STATE_NONE, /* Just a string part */ FORMAT_STATE_NUM, FORMAT_STATE_WIDTH, FORMAT_STATE_PRECISION, FORMAT_STATE_CHAR, FORMAT_STATE_STR, FORMAT_STATE_PTR, FORMAT_STATE_PERCENT_CHAR, FORMAT_STATE_INVALID, }; struct printf_spec { unsigned char flags; /* flags to number() */ unsigned char base; /* number base, 8, 10 or 16 only */ short precision; /* # of digits/chars */ int field_width; /* width of output field */ } __packed; static_assert(sizeof(struct printf_spec) == 8); #define FIELD_WIDTH_MAX ((1 << 23) - 1) #define PRECISION_MAX ((1 << 15) - 1) static noinline_for_stack char *number(char *buf, char *end, unsigned long long num, struct printf_spec spec) { /* put_dec requires 2-byte alignment of the buffer. */ char tmp[3 * sizeof(num)] __aligned(2); char sign; char locase; int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10); int i; bool is_zero = num == 0LL; int field_width = spec.field_width; int precision = spec.precision; /* locase = 0 or 0x20. 
ORing digits or letters with 'locase' * produces same digits or (maybe lowercased) letters */ locase = (spec.flags & SMALL); if (spec.flags & LEFT) spec.flags &= ~ZEROPAD; sign = 0; if (spec.flags & SIGN) { if ((signed long long)num < 0) { sign = '-'; num = -(signed long long)num; field_width--; } else if (spec.flags & PLUS) { sign = '+'; field_width--; } else if (spec.flags & SPACE) { sign = ' '; field_width--; } } if (need_pfx) { if (spec.base == 16) field_width -= 2; else if (!is_zero) field_width--; } /* generate full string in tmp[], in reverse order */ i = 0; if (num < spec.base) tmp[i++] = hex_asc_upper[num] | locase; else if (spec.base != 10) { /* 8 or 16 */ int mask = spec.base - 1; int shift = 3; if (spec.base == 16) shift = 4; do { tmp[i++] = (hex_asc_upper[((unsigned char)num) & mask] | locase); num >>= shift; } while (num); } else { /* base 10 */ i = put_dec(tmp, num) - tmp; } /* printing 100 using %2d gives "100", not "00" */ if (i > precision) precision = i; /* leading space padding */ field_width -= precision; if (!(spec.flags & (ZEROPAD | LEFT))) { while (--field_width >= 0) { if (buf < end) *buf = ' '; ++buf; } } /* sign */ if (sign) { if (buf < end) *buf = sign; ++buf; } /* "0x" / "0" prefix */ if (need_pfx) { if (spec.base == 16 || !is_zero) { if (buf < end) *buf = '0'; ++buf; } if (spec.base == 16) { if (buf < end) *buf = ('X' | locase); ++buf; } } /* zero or space padding */ if (!(spec.flags & LEFT)) { char c = ' ' + (spec.flags & ZEROPAD); while (--field_width >= 0) { if (buf < end) *buf = c; ++buf; } } /* hmm even more zero padding? */ while (i <= --precision) { if (buf < end) *buf = '0'; ++buf; } /* actual digits of result */ while (--i >= 0) { if (buf < end) *buf = tmp[i]; ++buf; } /* trailing space padding */ while (--field_width >= 0) { if (buf < end) *buf = ' '; ++buf; } return buf; } #define special_hex_spec(size) \ (struct printf_spec) { \ .field_width = 2 + 2 * (size), /* 0x + hex */ \ .flags = SPECIAL | SMALL | ZEROPAD, \ .base = 16, \ .precision = -1, \ } static noinline_for_stack char *special_hex_number(char *buf, char *end, unsigned long long num, int size) { return number(buf, end, num, special_hex_spec(size)); } static void move_right(char *buf, char *end, unsigned len, unsigned spaces) { size_t size; if (buf >= end) /* nowhere to put anything */ return; size = end - buf; if (size <= spaces) { memset(buf, ' ', size); return; } if (len) { if (len > size - spaces) len = size - spaces; memmove(buf + spaces, buf, len); } memset(buf, ' ', spaces); } /* * Handle field width padding for a string. * @buf: current buffer position * @n: length of string * @end: end of output buffer * @spec: for field width and flags * Returns: new buffer position after padding. */ static noinline_for_stack char *widen_string(char *buf, int n, char *end, struct printf_spec spec) { unsigned spaces; if (likely(n >= spec.field_width)) return buf; /* we want to pad the sucker */ spaces = spec.field_width - n; if (!(spec.flags & LEFT)) { move_right(buf - n, end, n, spaces); return buf + spaces; } while (spaces--) { if (buf < end) *buf = ' '; ++buf; } return buf; } /* Handle string from a well known address. 
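The caller passes a string it already knows to be valid (typically a literal), so no check_pointer() is done here; spec.precision still limits how many characters are copied before field-width padding.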
*/ static char *string_nocheck(char *buf, char *end, const char *s, struct printf_spec spec) { int len = 0; int lim = spec.precision; while (lim--) { char c = *s++; if (!c) break; if (buf < end) *buf = c; ++buf; ++len; } return widen_string(buf, len, end, spec); } static char *err_ptr(char *buf, char *end, void *ptr, struct printf_spec spec) { int err = PTR_ERR(ptr); const char *sym = errname(err); if (sym) return string_nocheck(buf, end, sym, spec); /* * Somebody passed ERR_PTR(-1234) or some other non-existing * Efoo - or perhaps CONFIG_SYMBOLIC_ERRNAME=n. Fall back to * printing it as its decimal representation. */ spec.flags |= SIGN; spec.base = 10; return number(buf, end, err, spec); } /* Be careful: error messages must fit into the given buffer. */ static char *error_string(char *buf, char *end, const char *s, struct printf_spec spec) { /* * Hard limit to avoid a completely insane messages. It actually * works pretty well because most error messages are in * the many pointer format modifiers. */ if (spec.precision == -1) spec.precision = 2 * sizeof(void *); return string_nocheck(buf, end, s, spec); } /* * Do not call any complex external code here. Nested printk()/vsprintf() * might cause infinite loops. Failures might break printk() and would * be hard to debug. */ static const char *check_pointer_msg(const void *ptr) { if (!ptr) return "(null)"; if ((unsigned long)ptr < PAGE_SIZE || IS_ERR_VALUE(ptr)) return "(efault)"; return NULL; } static int check_pointer(char **buf, char *end, const void *ptr, struct printf_spec spec) { const char *err_msg; err_msg = check_pointer_msg(ptr); if (err_msg) { *buf = error_string(*buf, end, err_msg, spec); return -EFAULT; } return 0; } static noinline_for_stack char *string(char *buf, char *end, const char *s, struct printf_spec spec) { if (check_pointer(&buf, end, s, spec)) return buf; return string_nocheck(buf, end, s, spec); } static char *pointer_string(char *buf, char *end, const void *ptr, struct printf_spec spec) { spec.base = 16; spec.flags |= SMALL; if (spec.field_width == -1) { spec.field_width = 2 * sizeof(ptr); spec.flags |= ZEROPAD; } return number(buf, end, (unsigned long int)ptr, spec); } /* Make pointers available for printing early in the boot sequence. */ static int debug_boot_weak_hash __ro_after_init; static int __init debug_boot_weak_hash_enable(char *str) { debug_boot_weak_hash = 1; pr_info("debug_boot_weak_hash enabled\n"); return 0; } early_param("debug_boot_weak_hash", debug_boot_weak_hash_enable); static bool filled_random_ptr_key __read_mostly; static siphash_key_t ptr_key __read_mostly; static int fill_ptr_key(struct notifier_block *nb, unsigned long action, void *data) { get_random_bytes(&ptr_key, sizeof(ptr_key)); /* Pairs with smp_rmb() before reading ptr_key. */ smp_wmb(); WRITE_ONCE(filled_random_ptr_key, true); return NOTIFY_DONE; } static int __init vsprintf_init_hashval(void) { static struct notifier_block fill_ptr_key_nb = { .notifier_call = fill_ptr_key }; execute_with_initialized_rng(&fill_ptr_key_nb); return 0; } subsys_initcall(vsprintf_init_hashval) /* Maps a pointer to a 32 bit unique identifier. */ static inline int __ptr_to_hashval(const void *ptr, unsigned long *hashval_out) { unsigned long hashval; if (!READ_ONCE(filled_random_ptr_key)) return -EBUSY; /* Pairs with smp_wmb() after writing ptr_key. 
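The read barrier keeps the ptr_key loads below from being reordered before the filled_random_ptr_key check, so a reader that sees the flag set also sees the fully initialized key.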
*/ smp_rmb(); #ifdef CONFIG_64BIT hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key); /* * Mask off the first 32 bits, this makes explicit that we have * modified the address (and 32 bits is plenty for a unique ID). */ hashval = hashval & 0xffffffff; #else hashval = (unsigned long)siphash_1u32((u32)ptr, &ptr_key); #endif *hashval_out = hashval; return 0; } int ptr_to_hashval(const void *ptr, unsigned long *hashval_out) { return __ptr_to_hashval(ptr, hashval_out); } static char *ptr_to_id(char *buf, char *end, const void *ptr, struct printf_spec spec) { const char *str = sizeof(ptr) == 8 ? "(____ptrval____)" : "(ptrval)"; unsigned long hashval; int ret; /* * Print the real pointer value for NULL and error pointers, * as they are not actual addresses. */ if (IS_ERR_OR_NULL(ptr)) return pointer_string(buf, end, ptr, spec); /* When debugging early boot use non-cryptographically secure hash. */ if (unlikely(debug_boot_weak_hash)) { hashval = hash_long((unsigned long)ptr, 32); return pointer_string(buf, end, (const void *)hashval, spec); } ret = __ptr_to_hashval(ptr, &hashval); if (ret) { spec.field_width = 2 * sizeof(ptr); /* string length must be less than default_width */ return error_string(buf, end, str, spec); } return pointer_string(buf, end, (const void *)hashval, spec); } static char *default_pointer(char *buf, char *end, const void *ptr, struct printf_spec spec) { /* * default is to _not_ leak addresses, so hash before printing, * unless no_hash_pointers is specified on the command line. */ if (unlikely(no_hash_pointers)) return pointer_string(buf, end, ptr, spec); return ptr_to_id(buf, end, ptr, spec); } int kptr_restrict __read_mostly; static noinline_for_stack char *restricted_pointer(char *buf, char *end, const void *ptr, struct printf_spec spec) { switch (kptr_restrict) { case 0: /* Handle as %p, hash and do _not_ leak addresses. */ return default_pointer(buf, end, ptr, spec); case 1: { const struct cred *cred; /* * kptr_restrict==1 cannot be used in IRQ context * because its test for CAP_SYSLOG would be meaningless. */ if (in_hardirq() || in_serving_softirq() || in_nmi()) { if (spec.field_width == -1) spec.field_width = 2 * sizeof(ptr); return error_string(buf, end, "pK-error", spec); } /* * Only print the real pointer value if the current * process has CAP_SYSLOG and is running with the * same credentials it started with. This is because * access to files is checked at open() time, but %pK * checks permission at read() time. We don't want to * leak pointer values if a binary opens a file using * %pK and then elevates privileges before reading it. 
*/ cred = current_cred(); if (!has_capability_noaudit(current, CAP_SYSLOG) || !uid_eq(cred->euid, cred->uid) || !gid_eq(cred->egid, cred->gid)) ptr = NULL; break; } case 2: default: /* Always print 0's for %pK */ ptr = NULL; break; } return pointer_string(buf, end, ptr, spec); } static noinline_for_stack char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_spec spec, const char *fmt) { const char *array[4], *s; const struct dentry *p; int depth; int i, n; switch (fmt[1]) { case '2': case '3': case '4': depth = fmt[1] - '0'; break; default: depth = 1; } rcu_read_lock(); for (i = 0; i < depth; i++, d = p) { if (check_pointer(&buf, end, d, spec)) { rcu_read_unlock(); return buf; } p = READ_ONCE(d->d_parent); array[i] = READ_ONCE(d->d_name.name); if (p == d) { if (i) array[i] = ""; i++; break; } } s = array[--i]; for (n = 0; n != spec.precision; n++, buf++) { char c = *s++; if (!c) { if (!i) break; c = '/'; s = array[--i]; } if (buf < end) *buf = c; } rcu_read_unlock(); return widen_string(buf, n, end, spec); } static noinline_for_stack char *file_dentry_name(char *buf, char *end, const struct file *f, struct printf_spec spec, const char *fmt) { if (check_pointer(&buf, end, f, spec)) return buf; return dentry_name(buf, end, f->f_path.dentry, spec, fmt); } #ifdef CONFIG_BLOCK static noinline_for_stack char *bdev_name(char *buf, char *end, struct block_device *bdev, struct printf_spec spec, const char *fmt) { struct gendisk *hd; if (check_pointer(&buf, end, bdev, spec)) return buf; hd = bdev->bd_disk; buf = string(buf, end, hd->disk_name, spec); if (bdev_is_partition(bdev)) { if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) { if (buf < end) *buf = 'p'; buf++; } buf = number(buf, end, bdev_partno(bdev), spec); } return buf; } #endif static noinline_for_stack char *symbol_string(char *buf, char *end, void *ptr, struct printf_spec spec, const char *fmt) { unsigned long value; #ifdef CONFIG_KALLSYMS char sym[KSYM_SYMBOL_LEN]; #endif if (fmt[1] == 'R') ptr = __builtin_extract_return_addr(ptr); value = (unsigned long)ptr; #ifdef CONFIG_KALLSYMS if (*fmt == 'B' && fmt[1] == 'b') sprint_backtrace_build_id(sym, value); else if (*fmt == 'B') sprint_backtrace(sym, value); else if (*fmt == 'S' && (fmt[1] == 'b' || (fmt[1] == 'R' && fmt[2] == 'b'))) sprint_symbol_build_id(sym, value); else if (*fmt != 's') sprint_symbol(sym, value); else sprint_symbol_no_offset(sym, value); return string_nocheck(buf, end, sym, spec); #else return special_hex_number(buf, end, value, sizeof(void *)); #endif } static const struct printf_spec default_str_spec = { .field_width = -1, .precision = -1, }; static const struct printf_spec default_flag_spec = { .base = 16, .precision = -1, .flags = SPECIAL | SMALL, }; static const struct printf_spec default_dec_spec = { .base = 10, .precision = -1, }; static const struct printf_spec default_dec02_spec = { .base = 10, .field_width = 2, .precision = -1, .flags = ZEROPAD, }; static const struct printf_spec default_dec04_spec = { .base = 10, .field_width = 4, .precision = -1, .flags = ZEROPAD, }; static noinline_for_stack char *hex_range(char *buf, char *end, u64 start_val, u64 end_val, struct printf_spec spec) { buf = number(buf, end, start_val, spec); if (start_val == end_val) return buf; if (buf < end) *buf = '-'; ++buf; return number(buf, end, end_val, spec); } static noinline_for_stack char *resource_string(char *buf, char *end, struct resource *res, struct printf_spec spec, const char *fmt) { #ifndef IO_RSRC_PRINTK_SIZE #define IO_RSRC_PRINTK_SIZE 6 
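/* Field width, counting the "0x" prefix, used below for I/O port ranges (typically 0x + 4 hex digits); MEM_RSRC_PRINTK_SIZE plays the same role for memory ranges. */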
#endif #ifndef MEM_RSRC_PRINTK_SIZE #define MEM_RSRC_PRINTK_SIZE 10 #endif static const struct printf_spec io_spec = { .base = 16, .field_width = IO_RSRC_PRINTK_SIZE, .precision = -1, .flags = SPECIAL | SMALL | ZEROPAD, }; static const struct printf_spec mem_spec = { .base = 16, .field_width = MEM_RSRC_PRINTK_SIZE, .precision = -1, .flags = SPECIAL | SMALL | ZEROPAD, }; static const struct printf_spec bus_spec = { .base = 16, .field_width = 2, .precision = -1, .flags = SMALL | ZEROPAD, }; static const struct printf_spec str_spec = { .field_width = -1, .precision = 10, .flags = LEFT, }; /* 32-bit res (sizeof==4): 10 chars in dec, 10 in hex ("0x" + 8) * 64-bit res (sizeof==8): 20 chars in dec, 18 in hex ("0x" + 16) */ #define RSRC_BUF_SIZE ((2 * sizeof(resource_size_t)) + 4) #define FLAG_BUF_SIZE (2 * sizeof(res->flags)) #define DECODED_BUF_SIZE sizeof("[mem - 64bit pref window disabled]") #define RAW_BUF_SIZE sizeof("[mem - flags 0x]") char sym[MAX(2*RSRC_BUF_SIZE + DECODED_BUF_SIZE, 2*RSRC_BUF_SIZE + FLAG_BUF_SIZE + RAW_BUF_SIZE)]; char *p = sym, *pend = sym + sizeof(sym); int decode = (fmt[0] == 'R') ? 1 : 0; const struct printf_spec *specp; if (check_pointer(&buf, end, res, spec)) return buf; *p++ = '['; if (res->flags & IORESOURCE_IO) { p = string_nocheck(p, pend, "io ", str_spec); specp = &io_spec; } else if (res->flags & IORESOURCE_MEM) { p = string_nocheck(p, pend, "mem ", str_spec); specp = &mem_spec; } else if (res->flags & IORESOURCE_IRQ) { p = string_nocheck(p, pend, "irq ", str_spec); specp = &default_dec_spec; } else if (res->flags & IORESOURCE_DMA) { p = string_nocheck(p, pend, "dma ", str_spec); specp = &default_dec_spec; } else if (res->flags & IORESOURCE_BUS) { p = string_nocheck(p, pend, "bus ", str_spec); specp = &bus_spec; } else { p = string_nocheck(p, pend, "??? 
", str_spec); specp = &mem_spec; decode = 0; } if (decode && res->flags & IORESOURCE_UNSET) { p = string_nocheck(p, pend, "size ", str_spec); p = number(p, pend, resource_size(res), *specp); } else { p = hex_range(p, pend, res->start, res->end, *specp); } if (decode) { if (res->flags & IORESOURCE_MEM_64) p = string_nocheck(p, pend, " 64bit", str_spec); if (res->flags & IORESOURCE_PREFETCH) p = string_nocheck(p, pend, " pref", str_spec); if (res->flags & IORESOURCE_WINDOW) p = string_nocheck(p, pend, " window", str_spec); if (res->flags & IORESOURCE_DISABLED) p = string_nocheck(p, pend, " disabled", str_spec); } else { p = string_nocheck(p, pend, " flags ", str_spec); p = number(p, pend, res->flags, default_flag_spec); } *p++ = ']'; *p = '\0'; return string_nocheck(buf, end, sym, spec); } static noinline_for_stack char *range_string(char *buf, char *end, const struct range *range, struct printf_spec spec, const char *fmt) { char sym[sizeof("[range 0x0123456789abcdef-0x0123456789abcdef]")]; char *p = sym, *pend = sym + sizeof(sym); if (check_pointer(&buf, end, range, spec)) return buf; p = string_nocheck(p, pend, "[range ", default_str_spec); p = hex_range(p, pend, range->start, range->end, special_hex_spec(sizeof(range->start))); *p++ = ']'; *p = '\0'; return string_nocheck(buf, end, sym, spec); } static noinline_for_stack char *hex_string(char *buf, char *end, u8 *addr, struct printf_spec spec, const char *fmt) { int i, len = 1; /* if we pass '%ph[CDN]', field width remains negative value, fallback to the default */ char separator; if (spec.field_width == 0) /* nothing to print */ return buf; if (check_pointer(&buf, end, addr, spec)) return buf; switch (fmt[1]) { case 'C': separator = ':'; break; case 'D': separator = '-'; break; case 'N': separator = 0; break; default: separator = ' '; break; } if (spec.field_width > 0) len = min_t(int, spec.field_width, 64); for (i = 0; i < len; ++i) { if (buf < end) *buf = hex_asc_hi(addr[i]); ++buf; if (buf < end) *buf = hex_asc_lo(addr[i]); ++buf; if (separator && i != len - 1) { if (buf < end) *buf = separator; ++buf; } } return buf; } static noinline_for_stack char *bitmap_string(char *buf, char *end, const unsigned long *bitmap, struct printf_spec spec, const char *fmt) { const int CHUNKSZ = 32; int nr_bits = max_t(int, spec.field_width, 0); int i, chunksz; bool first = true; if (check_pointer(&buf, end, bitmap, spec)) return buf; /* reused to print numbers */ spec = (struct printf_spec){ .flags = SMALL | ZEROPAD, .base = 16 }; chunksz = nr_bits & (CHUNKSZ - 1); if (chunksz == 0) chunksz = CHUNKSZ; i = ALIGN(nr_bits, CHUNKSZ) - CHUNKSZ; for (; i >= 0; i -= CHUNKSZ) { u32 chunkmask, val; int word, bit; chunkmask = ((1ULL << chunksz) - 1); word = i / BITS_PER_LONG; bit = i % BITS_PER_LONG; val = (bitmap[word] >> bit) & chunkmask; if (!first) { if (buf < end) *buf = ','; buf++; } first = false; spec.field_width = DIV_ROUND_UP(chunksz, 4); buf = number(buf, end, val, spec); chunksz = CHUNKSZ; } return buf; } static noinline_for_stack char *bitmap_list_string(char *buf, char *end, const unsigned long *bitmap, struct printf_spec spec, const char *fmt) { int nr_bits = max_t(int, spec.field_width, 0); bool first = true; int rbot, rtop; if (check_pointer(&buf, end, bitmap, spec)) return buf; for_each_set_bitrange(rbot, rtop, bitmap, nr_bits) { if (!first) { if (buf < end) *buf = ','; buf++; } first = false; buf = number(buf, end, rbot, default_dec_spec); if (rtop == rbot + 1) continue; if (buf < end) *buf = '-'; buf = number(++buf, end, rtop - 1, 
default_dec_spec); } return buf; } static noinline_for_stack char *mac_address_string(char *buf, char *end, u8 *addr, struct printf_spec spec, const char *fmt) { char mac_addr[sizeof("xx:xx:xx:xx:xx:xx")]; char *p = mac_addr; int i; char separator; bool reversed = false; if (check_pointer(&buf, end, addr, spec)) return buf; switch (fmt[1]) { case 'F': separator = '-'; break; case 'R': reversed = true; fallthrough; default: separator = ':'; break; } for (i = 0; i < 6; i++) { if (reversed) p = hex_byte_pack(p, addr[5 - i]); else p = hex_byte_pack(p, addr[i]); if (fmt[0] == 'M' && i != 5) *p++ = separator; } *p = '\0'; return string_nocheck(buf, end, mac_addr, spec); } static noinline_for_stack char *ip4_string(char *p, const u8 *addr, const char *fmt) { int i; bool leading_zeros = (fmt[0] == 'i'); int index; int step; switch (fmt[2]) { case 'h': #ifdef __BIG_ENDIAN index = 0; step = 1; #else index = 3; step = -1; #endif break; case 'l': index = 3; step = -1; break; case 'n': case 'b': default: index = 0; step = 1; break; } for (i = 0; i < 4; i++) { char temp[4] __aligned(2); /* hold each IP quad in reverse order */ int digits = put_dec_trunc8(temp, addr[index]) - temp; if (leading_zeros) { if (digits < 3) *p++ = '0'; if (digits < 2) *p++ = '0'; } /* reverse the digits in the quad */ while (digits--) *p++ = temp[digits]; if (i < 3) *p++ = '.'; index += step; } *p = '\0'; return p; } static noinline_for_stack char *ip6_compressed_string(char *p, const char *addr) { int i, j, range; unsigned char zerolength[8]; int longest = 1; int colonpos = -1; u16 word; u8 hi, lo; bool needcolon = false; bool useIPv4; struct in6_addr in6; memcpy(&in6, addr, sizeof(struct in6_addr)); useIPv4 = ipv6_addr_v4mapped(&in6) || ipv6_addr_is_isatap(&in6); memset(zerolength, 0, sizeof(zerolength)); if (useIPv4) range = 6; else range = 8; /* find position of longest 0 run */ for (i = 0; i < range; i++) { for (j = i; j < range; j++) { if (in6.s6_addr16[j] != 0) break; zerolength[i]++; } } for (i = 0; i < range; i++) { if (zerolength[i] > longest) { longest = zerolength[i]; colonpos = i; } } if (longest == 1) /* don't compress a single 0 */ colonpos = -1; /* emit address */ for (i = 0; i < range; i++) { if (i == colonpos) { if (needcolon || i == 0) *p++ = ':'; *p++ = ':'; needcolon = false; i += longest - 1; continue; } if (needcolon) { *p++ = ':'; needcolon = false; } /* hex u16 without leading 0s */ word = ntohs(in6.s6_addr16[i]); hi = word >> 8; lo = word & 0xff; if (hi) { if (hi > 0x0f) p = hex_byte_pack(p, hi); else *p++ = hex_asc_lo(hi); p = hex_byte_pack(p, lo); } else if (lo > 0x0f) p = hex_byte_pack(p, lo); else *p++ = hex_asc_lo(lo); needcolon = true; } if (useIPv4) { if (needcolon) *p++ = ':'; p = ip4_string(p, &in6.s6_addr[12], "I4"); } *p = '\0'; return p; } static noinline_for_stack char *ip6_string(char *p, const char *addr, const char *fmt) { int i; for (i = 0; i < 8; i++) { p = hex_byte_pack(p, *addr++); p = hex_byte_pack(p, *addr++); if (fmt[0] == 'I' && i != 7) *p++ = ':'; } *p = '\0'; return p; } static noinline_for_stack char *ip6_addr_string(char *buf, char *end, const u8 *addr, struct printf_spec spec, const char *fmt) { char ip6_addr[sizeof("xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255")]; if (fmt[0] == 'I' && fmt[2] == 'c') ip6_compressed_string(ip6_addr, addr); else ip6_string(ip6_addr, addr, fmt); return string_nocheck(buf, end, ip6_addr, spec); } static noinline_for_stack char *ip4_addr_string(char *buf, char *end, const u8 *addr, struct printf_spec spec, const char *fmt) { char 
ip4_addr[sizeof("255.255.255.255")]; ip4_string(ip4_addr, addr, fmt); return string_nocheck(buf, end, ip4_addr, spec); } static noinline_for_stack char *ip6_addr_string_sa(char *buf, char *end, const struct sockaddr_in6 *sa, struct printf_spec spec, const char *fmt) { bool have_p = false, have_s = false, have_f = false, have_c = false; char ip6_addr[sizeof("[xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255]") + sizeof(":12345") + sizeof("/123456789") + sizeof("%1234567890")]; char *p = ip6_addr, *pend = ip6_addr + sizeof(ip6_addr); const u8 *addr = (const u8 *) &sa->sin6_addr; char fmt6[2] = { fmt[0], '6' }; u8 off = 0; fmt++; while (isalpha(*++fmt)) { switch (*fmt) { case 'p': have_p = true; break; case 'f': have_f = true; break; case 's': have_s = true; break; case 'c': have_c = true; break; } } if (have_p || have_s || have_f) { *p = '['; off = 1; } if (fmt6[0] == 'I' && have_c) p = ip6_compressed_string(ip6_addr + off, addr); else p = ip6_string(ip6_addr + off, addr, fmt6); if (have_p || have_s || have_f) *p++ = ']'; if (have_p) { *p++ = ':'; p = number(p, pend, ntohs(sa->sin6_port), spec); } if (have_f) { *p++ = '/'; p = number(p, pend, ntohl(sa->sin6_flowinfo & IPV6_FLOWINFO_MASK), spec); } if (have_s) { *p++ = '%'; p = number(p, pend, sa->sin6_scope_id, spec); } *p = '\0'; return string_nocheck(buf, end, ip6_addr, spec); } static noinline_for_stack char *ip4_addr_string_sa(char *buf, char *end, const struct sockaddr_in *sa, struct printf_spec spec, const char *fmt) { bool have_p = false; char *p, ip4_addr[sizeof("255.255.255.255") + sizeof(":12345")]; char *pend = ip4_addr + sizeof(ip4_addr); const u8 *addr = (const u8 *) &sa->sin_addr.s_addr; char fmt4[3] = { fmt[0], '4', 0 }; fmt++; while (isalpha(*++fmt)) { switch (*fmt) { case 'p': have_p = true; break; case 'h': case 'l': case 'n': case 'b': fmt4[2] = *fmt; break; } } p = ip4_string(ip4_addr, addr, fmt4); if (have_p) { *p++ = ':'; p = number(p, pend, ntohs(sa->sin_port), spec); } *p = '\0'; return string_nocheck(buf, end, ip4_addr, spec); } static noinline_for_stack char *ip_addr_string(char *buf, char *end, const void *ptr, struct printf_spec spec, const char *fmt) { char *err_fmt_msg; if (check_pointer(&buf, end, ptr, spec)) return buf; switch (fmt[1]) { case '6': return ip6_addr_string(buf, end, ptr, spec, fmt); case '4': return ip4_addr_string(buf, end, ptr, spec, fmt); case 'S': { const union { struct sockaddr raw; struct sockaddr_in v4; struct sockaddr_in6 v6; } *sa = ptr; switch (sa->raw.sa_family) { case AF_INET: return ip4_addr_string_sa(buf, end, &sa->v4, spec, fmt); case AF_INET6: return ip6_addr_string_sa(buf, end, &sa->v6, spec, fmt); default: return error_string(buf, end, "(einval)", spec); }} } err_fmt_msg = fmt[0] == 'i' ? "(%pi?)" : "(%pI?)"; return error_string(buf, end, err_fmt_msg, spec); } static noinline_for_stack char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec, const char *fmt) { bool found = true; int count = 1; unsigned int flags = 0; int len; if (spec.field_width == 0) return buf; /* nothing to print */ if (check_pointer(&buf, end, addr, spec)) return buf; do { switch (fmt[count++]) { case 'a': flags |= ESCAPE_ANY; break; case 'c': flags |= ESCAPE_SPECIAL; break; case 'h': flags |= ESCAPE_HEX; break; case 'n': flags |= ESCAPE_NULL; break; case 'o': flags |= ESCAPE_OCTAL; break; case 'p': flags |= ESCAPE_NP; break; case 's': flags |= ESCAPE_SPACE; break; default: found = false; break; } } while (found); if (!flags) flags = ESCAPE_ANY_NP; len = spec.field_width < 0 ? 
1 : spec.field_width; /* * string_escape_mem() writes as many characters as it can to * the given buffer, and returns the total size of the output * had the buffer been big enough. */ buf += string_escape_mem(addr, len, buf, buf < end ? end - buf : 0, flags, NULL); return buf; } __diag_push(); __diag_ignore(GCC, all, "-Wsuggest-attribute=format", "Not a valid __printf() conversion candidate."); static char *va_format(char *buf, char *end, struct va_format *va_fmt, struct printf_spec spec) { va_list va; if (check_pointer(&buf, end, va_fmt, spec)) return buf; va_copy(va, *va_fmt->va); buf += vsnprintf(buf, end > buf ? end - buf : 0, va_fmt->fmt, va); va_end(va); return buf; } __diag_pop(); static noinline_for_stack char *uuid_string(char *buf, char *end, const u8 *addr, struct printf_spec spec, const char *fmt) { char uuid[UUID_STRING_LEN + 1]; char *p = uuid; int i; const u8 *index = uuid_index; bool uc = false; if (check_pointer(&buf, end, addr, spec)) return buf; switch (*(++fmt)) { case 'L': uc = true; fallthrough; case 'l': index = guid_index; break; case 'B': uc = true; break; } for (i = 0; i < 16; i++) { if (uc) p = hex_byte_pack_upper(p, addr[index[i]]); else p = hex_byte_pack(p, addr[index[i]]); switch (i) { case 3: case 5: case 7: case 9: *p++ = '-'; break; } } *p = 0; return string_nocheck(buf, end, uuid, spec); } static noinline_for_stack char *netdev_bits(char *buf, char *end, const void *addr, struct printf_spec spec, const char *fmt) { unsigned long long num; int size; if (check_pointer(&buf, end, addr, spec)) return buf; switch (fmt[1]) { case 'F': num = *(const netdev_features_t *)addr; size = sizeof(netdev_features_t); break; default: return error_string(buf, end, "(%pN?)", spec); } return special_hex_number(buf, end, num, size); } static noinline_for_stack char *fourcc_string(char *buf, char *end, const u32 *fourcc, struct printf_spec spec, const char *fmt) { char output[sizeof("0123 little-endian (0x01234567)")]; char *p = output; unsigned int i; bool pixel_fmt = false; u32 orig, val; if (fmt[1] != 'c') return error_string(buf, end, "(%p4?)", spec); if (check_pointer(&buf, end, fourcc, spec)) return buf; orig = get_unaligned(fourcc); switch (fmt[2]) { case 'h': if (fmt[3] == 'R') orig = swab32(orig); break; case 'l': orig = (__force u32)cpu_to_le32(orig); break; case 'b': orig = (__force u32)cpu_to_be32(orig); break; case 'c': /* Pixel formats are printed LSB-first */ pixel_fmt = true; break; default: return error_string(buf, end, "(%p4?)", spec); } val = pixel_fmt ? swab32(orig & ~BIT(31)) : orig; for (i = 0; i < sizeof(u32); i++) { unsigned char c = val >> ((3 - i) * 8); /* Print non-control ASCII characters as-is, dot otherwise */ *p++ = isascii(c) && isprint(c) ? c : '.'; } if (pixel_fmt) { *p++ = ' '; strcpy(p, orig & BIT(31) ? 
"big-endian" : "little-endian"); p += strlen(p); } *p++ = ' '; *p++ = '('; p = special_hex_number(p, output + sizeof(output) - 2, orig, sizeof(u32)); *p++ = ')'; *p = '\0'; return string(buf, end, output, spec); } static noinline_for_stack char *address_val(char *buf, char *end, const void *addr, struct printf_spec spec, const char *fmt) { unsigned long long num; int size; if (check_pointer(&buf, end, addr, spec)) return buf; switch (fmt[1]) { case 'd': num = *(const dma_addr_t *)addr; size = sizeof(dma_addr_t); break; case 'p': default: num = *(const phys_addr_t *)addr; size = sizeof(phys_addr_t); break; } return special_hex_number(buf, end, num, size); } static noinline_for_stack char *date_str(char *buf, char *end, const struct rtc_time *tm, bool r) { int year = tm->tm_year + (r ? 0 : 1900); int mon = tm->tm_mon + (r ? 0 : 1); buf = number(buf, end, year, default_dec04_spec); if (buf < end) *buf = '-'; buf++; buf = number(buf, end, mon, default_dec02_spec); if (buf < end) *buf = '-'; buf++; return number(buf, end, tm->tm_mday, default_dec02_spec); } static noinline_for_stack char *time_str(char *buf, char *end, const struct rtc_time *tm, bool r) { buf = number(buf, end, tm->tm_hour, default_dec02_spec); if (buf < end) *buf = ':'; buf++; buf = number(buf, end, tm->tm_min, default_dec02_spec); if (buf < end) *buf = ':'; buf++; return number(buf, end, tm->tm_sec, default_dec02_spec); } static noinline_for_stack char *rtc_str(char *buf, char *end, const struct rtc_time *tm, struct printf_spec spec, const char *fmt) { bool have_t = true, have_d = true; bool raw = false, iso8601_separator = true; bool found = true; int count = 2; switch (fmt[count]) { case 'd': have_t = false; count++; break; case 't': have_d = false; count++; break; } do { switch (fmt[count++]) { case 'r': raw = true; break; case 's': iso8601_separator = false; break; default: found = false; break; } } while (found); if (have_d) buf = date_str(buf, end, tm, raw); if (have_d && have_t) { if (buf < end) *buf = iso8601_separator ? 
'T' : ' '; buf++; } if (have_t) buf = time_str(buf, end, tm, raw); return buf; } static noinline_for_stack char *time64_str(char *buf, char *end, const time64_t time, struct printf_spec spec, const char *fmt) { struct rtc_time rtc_time; struct tm tm; time64_to_tm(time, 0, &tm); rtc_time.tm_sec = tm.tm_sec; rtc_time.tm_min = tm.tm_min; rtc_time.tm_hour = tm.tm_hour; rtc_time.tm_mday = tm.tm_mday; rtc_time.tm_mon = tm.tm_mon; rtc_time.tm_year = tm.tm_year; rtc_time.tm_wday = tm.tm_wday; rtc_time.tm_yday = tm.tm_yday; rtc_time.tm_isdst = 0; return rtc_str(buf, end, &rtc_time, spec, fmt); } static noinline_for_stack char *timespec64_str(char *buf, char *end, const struct timespec64 *ts, struct printf_spec spec, const char *fmt) { static const struct printf_spec default_dec09_spec = { .base = 10, .field_width = 9, .precision = -1, .flags = ZEROPAD, }; if (fmt[2] == 'p') buf = number(buf, end, ts->tv_sec, default_dec_spec); else buf = time64_str(buf, end, ts->tv_sec, spec, fmt); if (buf < end) *buf = '.'; buf++; return number(buf, end, ts->tv_nsec, default_dec09_spec); } static noinline_for_stack char *time_and_date(char *buf, char *end, void *ptr, struct printf_spec spec, const char *fmt) { if (check_pointer(&buf, end, ptr, spec)) return buf; switch (fmt[1]) { case 'R': return rtc_str(buf, end, (const struct rtc_time *)ptr, spec, fmt); case 'S': return timespec64_str(buf, end, (const struct timespec64 *)ptr, spec, fmt); case 'T': return time64_str(buf, end, *(const time64_t *)ptr, spec, fmt); default: return error_string(buf, end, "(%pt?)", spec); } } static noinline_for_stack char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec, const char *fmt) { if (!IS_ENABLED(CONFIG_HAVE_CLK)) return error_string(buf, end, "(%pC?)", spec); if (check_pointer(&buf, end, clk, spec)) return buf; #ifdef CONFIG_COMMON_CLK return string(buf, end, __clk_get_name(clk), spec); #else return ptr_to_id(buf, end, clk, spec); #endif } static char *format_flags(char *buf, char *end, unsigned long flags, const struct trace_print_flags *names) { unsigned long mask; for ( ; flags && names->name; names++) { mask = names->mask; if ((flags & mask) != mask) continue; buf = string(buf, end, names->name, default_str_spec); flags &= ~mask; if (flags) { if (buf < end) *buf = '|'; buf++; } } if (flags) buf = number(buf, end, flags, default_flag_spec); return buf; } struct page_flags_fields { int width; int shift; int mask; const struct printf_spec *spec; const char *name; }; static const struct page_flags_fields pff[] = { {SECTIONS_WIDTH, SECTIONS_PGSHIFT, SECTIONS_MASK, &default_dec_spec, "section"}, {NODES_WIDTH, NODES_PGSHIFT, NODES_MASK, &default_dec_spec, "node"}, {ZONES_WIDTH, ZONES_PGSHIFT, ZONES_MASK, &default_dec_spec, "zone"}, {LAST_CPUPID_WIDTH, LAST_CPUPID_PGSHIFT, LAST_CPUPID_MASK, &default_flag_spec, "lastcpupid"}, {KASAN_TAG_WIDTH, KASAN_TAG_PGSHIFT, KASAN_TAG_MASK, &default_flag_spec, "kasantag"}, }; static char *format_page_flags(char *buf, char *end, unsigned long flags) { unsigned long main_flags = flags & PAGEFLAGS_MASK; bool append = false; int i; buf = number(buf, end, flags, default_flag_spec); if (buf < end) *buf = '('; buf++; /* Page flags from the main area. */ if (main_flags) { buf = format_flags(buf, end, main_flags, pageflag_names); append = true; } /* Page flags from the fields area */ for (i = 0; i < ARRAY_SIZE(pff); i++) { /* Skip undefined fields. 
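 * (A field whose configured width is zero does not exist in page->flags for
 * this kernel configuration, so nothing is printed for it.)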
*/ if (!pff[i].width) continue; /* Format: Flag Name + '=' (equals sign) + Number + '|' (separator) */ if (append) { if (buf < end) *buf = '|'; buf++; } buf = string(buf, end, pff[i].name, default_str_spec); if (buf < end) *buf = '='; buf++; buf = number(buf, end, (flags >> pff[i].shift) & pff[i].mask, *pff[i].spec); append = true; } if (buf < end) *buf = ')'; buf++; return buf; } static noinline_for_stack char *flags_string(char *buf, char *end, void *flags_ptr, struct printf_spec spec, const char *fmt) { unsigned long flags; const struct trace_print_flags *names; if (check_pointer(&buf, end, flags_ptr, spec)) return buf; switch (fmt[1]) { case 'p': return format_page_flags(buf, end, *(unsigned long *)flags_ptr); case 'v': flags = *(unsigned long *)flags_ptr; names = vmaflag_names; break; case 'g': flags = (__force unsigned long)(*(gfp_t *)flags_ptr); names = gfpflag_names; break; default: return error_string(buf, end, "(%pG?)", spec); } return format_flags(buf, end, flags, names); } static noinline_for_stack char *fwnode_full_name_string(struct fwnode_handle *fwnode, char *buf, char *end) { int depth; /* Loop starting from the root node to the current node. */ for (depth = fwnode_count_parents(fwnode); depth >= 0; depth--) { /* * Only get a reference for other nodes (i.e. parent nodes). * fwnode refcount may be 0 here. */ struct fwnode_handle *__fwnode = depth ? fwnode_get_nth_parent(fwnode, depth) : fwnode; buf = string(buf, end, fwnode_get_name_prefix(__fwnode), default_str_spec); buf = string(buf, end, fwnode_get_name(__fwnode), default_str_spec); if (depth) fwnode_handle_put(__fwnode); } return buf; } static noinline_for_stack char *device_node_string(char *buf, char *end, struct device_node *dn, struct printf_spec spec, const char *fmt) { char tbuf[sizeof("xxxx") + 1]; const char *p; int ret; char *buf_start = buf; struct property *prop; bool has_mult, pass; struct printf_spec str_spec = spec; str_spec.field_width = -1; if (fmt[0] != 'F') return error_string(buf, end, "(%pO?)", spec); if (!IS_ENABLED(CONFIG_OF)) return error_string(buf, end, "(%pOF?)", spec); if (check_pointer(&buf, end, dn, spec)) return buf; /* simple case without anything any more format specifiers */ fmt++; if (fmt[0] == '\0' || strcspn(fmt,"fnpPFcC") > 0) fmt = "f"; for (pass = false; strspn(fmt,"fnpPFcC"); fmt++, pass = true) { int precision; if (pass) { if (buf < end) *buf = ':'; buf++; } switch (*fmt) { case 'f': /* full_name */ buf = fwnode_full_name_string(of_fwnode_handle(dn), buf, end); break; case 'n': /* name */ p = fwnode_get_name(of_fwnode_handle(dn)); precision = str_spec.precision; str_spec.precision = strchrnul(p, '@') - p; buf = string(buf, end, p, str_spec); str_spec.precision = precision; break; case 'p': /* phandle */ buf = number(buf, end, (unsigned int)dn->phandle, default_dec_spec); break; case 'P': /* path-spec */ p = fwnode_get_name(of_fwnode_handle(dn)); if (!p[1]) p = "/"; buf = string(buf, end, p, str_spec); break; case 'F': /* flags */ tbuf[0] = of_node_check_flag(dn, OF_DYNAMIC) ? 'D' : '-'; tbuf[1] = of_node_check_flag(dn, OF_DETACHED) ? 'd' : '-'; tbuf[2] = of_node_check_flag(dn, OF_POPULATED) ? 'P' : '-'; tbuf[3] = of_node_check_flag(dn, OF_POPULATED_BUS) ? 
'B' : '-'; tbuf[4] = 0; buf = string_nocheck(buf, end, tbuf, str_spec); break; case 'c': /* major compatible string */ ret = of_property_read_string(dn, "compatible", &p); if (!ret) buf = string(buf, end, p, str_spec); break; case 'C': /* full compatible string */ has_mult = false; of_property_for_each_string(dn, "compatible", prop, p) { if (has_mult) buf = string_nocheck(buf, end, ",", str_spec); buf = string_nocheck(buf, end, "\"", str_spec); buf = string(buf, end, p, str_spec); buf = string_nocheck(buf, end, "\"", str_spec); has_mult = true; } break; default: break; } } return widen_string(buf, buf - buf_start, end, spec); } static noinline_for_stack char *fwnode_string(char *buf, char *end, struct fwnode_handle *fwnode, struct printf_spec spec, const char *fmt) { struct printf_spec str_spec = spec; char *buf_start = buf; str_spec.field_width = -1; if (*fmt != 'w') return error_string(buf, end, "(%pf?)", spec); if (check_pointer(&buf, end, fwnode, spec)) return buf; fmt++; switch (*fmt) { case 'P': /* name */ buf = string(buf, end, fwnode_get_name(fwnode), str_spec); break; case 'f': /* full_name */ default: buf = fwnode_full_name_string(fwnode, buf, end); break; } return widen_string(buf, buf - buf_start, end, spec); } static noinline_for_stack char *resource_or_range(const char *fmt, char *buf, char *end, void *ptr, struct printf_spec spec) { if (*fmt == 'r' && fmt[1] == 'a') return range_string(buf, end, ptr, spec, fmt); return resource_string(buf, end, ptr, spec, fmt); } void __init hash_pointers_finalize(bool slub_debug) { switch (hash_pointers_mode) { case HASH_PTR_ALWAYS: no_hash_pointers = false; break; case HASH_PTR_NEVER: no_hash_pointers = true; break; case HASH_PTR_AUTO: default: no_hash_pointers = slub_debug; break; } if (!no_hash_pointers) return; pr_warn("**********************************************************\n"); pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); pr_warn("** **\n"); pr_warn("** This system shows unhashed kernel memory addresses **\n"); pr_warn("** via the console, logs, and other interfaces. This **\n"); pr_warn("** might reduce the security of your system. **\n"); pr_warn("** **\n"); pr_warn("** If you see this message and you are not debugging **\n"); pr_warn("** the kernel, report this immediately to your system **\n"); pr_warn("** administrator! **\n"); pr_warn("** **\n"); pr_warn("** Use hash_pointers=always to force this mode off **\n"); pr_warn("** **\n"); pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); pr_warn("**********************************************************\n"); } static int __init hash_pointers_mode_parse(char *str) { if (!str) { pr_warn("Hash pointers mode empty; falling back to auto.\n"); hash_pointers_mode = HASH_PTR_AUTO; } else if (strncmp(str, "auto", 4) == 0) { pr_info("Hash pointers mode set to auto.\n"); hash_pointers_mode = HASH_PTR_AUTO; } else if (strncmp(str, "never", 5) == 0) { pr_info("Hash pointers mode set to never.\n"); hash_pointers_mode = HASH_PTR_NEVER; } else if (strncmp(str, "always", 6) == 0) { pr_info("Hash pointers mode set to always.\n"); hash_pointers_mode = HASH_PTR_ALWAYS; } else { pr_warn("Unknown hash_pointers mode '%s' specified; assuming auto.\n", str); hash_pointers_mode = HASH_PTR_AUTO; } return 0; } early_param("hash_pointers", hash_pointers_mode_parse); static int __init no_hash_pointers_enable(char *str) { return hash_pointers_mode_parse("never"); } early_param("no_hash_pointers", no_hash_pointers_enable); /* * Show a '%p' thing. 
A kernel extension is that the '%p' is followed * by an extra set of alphanumeric characters that are extended format * specifiers. * * Please update scripts/checkpatch.pl when adding/removing conversion * characters. (Search for "check for vsprintf extension"). * * Right now we handle: * * - 'S' For symbolic direct pointers (or function descriptors) with offset * - 's' For symbolic direct pointers (or function descriptors) without offset * - '[Ss]R' as above with __builtin_extract_return_addr() translation * - 'S[R]b' as above with module build ID (for use in backtraces) * - '[Ff]' %pf and %pF were obsoleted and later removed in favor of * %ps and %pS. Be careful when re-using these specifiers. * - 'B' For backtraced symbolic direct pointers with offset * - 'Bb' as above with module build ID (for use in backtraces) * - 'R' For decoded struct resource, e.g., [mem 0x0-0x1f 64bit pref] * - 'r' For raw struct resource, e.g., [mem 0x0-0x1f flags 0x201] * - 'ra' For struct ranges, e.g., [range 0x0000000000000000 - 0x00000000000000ff] * - 'b[l]' For a bitmap, the number of bits is determined by the field * width which must be explicitly specified either as part of the * format string '%32b[l]' or through '%*b[l]', [l] selects * range-list format instead of hex format * - 'M' For a 6-byte MAC address, it prints the address in the * usual colon-separated hex notation * - 'm' For a 6-byte MAC address, it prints the hex address without colons * - 'MF' For a 6-byte MAC FDDI address, it prints the address * with a dash-separated hex notation * - '[mM]R' For a 6-byte MAC address, Reverse order (Bluetooth) * - 'I' [46] for IPv4/IPv6 addresses printed in the usual way * IPv4 uses dot-separated decimal without leading 0's (1.2.3.4) * IPv6 uses colon separated network-order 16 bit hex with leading 0's * [S][pfs] * Generic IPv4/IPv6 address (struct sockaddr *) that falls back to * [4] or [6] and is able to print port [p], flowinfo [f], scope [s] * - 'i' [46] for 'raw' IPv4/IPv6 addresses * IPv6 omits the colons (01020304...0f) * IPv4 uses dot-separated decimal with leading 0's (010.123.045.006) * [S][pfs] * Generic IPv4/IPv6 address (struct sockaddr *) that falls back to * [4] or [6] and is able to print port [p], flowinfo [f], scope [s] * - '[Ii][4S][hnbl]' IPv4 addresses in host, network, big or little endian order * - 'I[6S]c' for IPv6 addresses printed as specified by * https://tools.ietf.org/html/rfc5952 * - 'E[achnops]' For an escaped buffer, where rules are defined by combination * of the following flags (see string_escape_mem() for the * details): * a - ESCAPE_ANY * c - ESCAPE_SPECIAL * h - ESCAPE_HEX * n - ESCAPE_NULL * o - ESCAPE_OCTAL * p - ESCAPE_NP * s - ESCAPE_SPACE * By default ESCAPE_ANY_NP is used. * - 'U' For a 16 byte UUID/GUID, it prints the UUID/GUID in the form * "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" * Options for %pU are: * b big endian lower case hex (default) * B big endian UPPER case hex * l little endian lower case hex * L little endian UPPER case hex * big endian output byte order is: * [0][1][2][3]-[4][5]-[6][7]-[8][9]-[10][11][12][13][14][15] * little endian output byte order is: * [3][2][1][0]-[5][4]-[7][6]-[8][9]-[10][11][12][13][14][15] * - 'V' For a struct va_format which contains a format string * and va_list *, * call vsnprintf(->format, *->va_list). * Implements a "recursive vsnprintf". * Do not use this feature without some mechanism to verify the * correctness of the format string and va_list arguments. 
* - 'K' For a kernel pointer that should be hidden from unprivileged users. * Use only for procfs, sysfs and similar files, not printk(); please * read the documentation (path below) first. * - 'NF' For a netdev_features_t * - '4cc' V4L2 or DRM FourCC code, with endianness and raw numerical value. * - '4c[h[R]lb]' For generic FourCC code with raw numerical value. Both are * displayed in the big-endian format. This is the opposite of V4L2 or * DRM FourCCs. * The additional specifiers define what endianness is used to load * the stored bytes. The data might be interpreted using the host, * reversed host byte order, little-endian, or big-endian. * - 'h[CDN]' For a variable-length buffer, it prints it as a hex string with * a certain separator (' ' by default): * C colon * D dash * N no separator * The maximum supported length is 64 bytes of the input. Consider * to use print_hex_dump() for the larger input. * - 'a[pd]' For address types [p] phys_addr_t, [d] dma_addr_t and derivatives * (default assumed to be phys_addr_t, passed by reference) * - 'd[234]' For a dentry name (optionally 2-4 last components) * - 'D[234]' Same as 'd' but for a struct file * - 'g' For block_device name (gendisk + partition number) * - 't[RST][dt][r][s]' For time and date as represented by: * R struct rtc_time * S struct timespec64 * T time64_t * - 'tSp' For time represented by struct timespec64 printed as <seconds>.<nanoseconds> * - 'C' For a clock, it prints the name (Common Clock Framework) or address * (legacy clock framework) of the clock * - 'G' For flags to be printed as a collection of symbolic strings that would * construct the specific value. Supported flags given by option: * p page flags (see struct page) given as pointer to unsigned long * g gfp flags (GFP_* and __GFP_*) given as pointer to gfp_t * v vma flags (VM_*) given as pointer to unsigned long * - 'OF[fnpPcCF]' For a device tree object * Without any optional arguments prints the full_name * f device node full_name * n device node name * p device node phandle * P device node path spec (name + @unit) * F device node flags * c major compatible string * C full compatible string * - 'fw[fP]' For a firmware node (struct fwnode_handle) pointer * Without an option prints the full name of the node * f full name * P node name, including a possible unit address * - 'x' For printing the address unmodified. Equivalent to "%lx". * Please read the documentation (path below) before using! * - '[ku]s' For a BPF/tracing related format specifier, e.g. used out of * bpf_trace_printk() where [ku] prefix specifies either kernel (k) * or user (u) memory to probe, and: * s a string, equivalent to "%s" on direct vsnprintf() use * * ** When making changes please also update: * Documentation/core-api/printk-formats.rst * * Note: The default behaviour (unadorned %p) is to hash the address, * rendering it useful as a unique identifier. * * There is also a '%pA' format specifier, but it is only intended to be used * from Rust code to format core::fmt::Arguments. Do *not* use it from C. * See rust/kernel/print.rs for details. 
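 *
 * A hedged illustration (not part of the original comment), using the
 * hypothetical variables res, mac, uuid and tm:
 *
 *	printk("%pR\n", res);	e.g. [mem 0x00000000-0x0000001f 64bit pref]
 *	printk("%pM\n", mac);	e.g. 00:11:22:33:44:55
 *	printk("%pUb\n", uuid);	e.g. 0a1b2c3d-4e5f-6071-8293-a4b5c6d7e8f9
 *	printk("%ptR\n", &tm);	ISO 8601 date/time from a struct rtc_time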
*/ static noinline_for_stack char *pointer(const char *fmt, char *buf, char *end, void *ptr, struct printf_spec spec) { switch (*fmt) { case 'S': case 's': ptr = dereference_symbol_descriptor(ptr); fallthrough; case 'B': return symbol_string(buf, end, ptr, spec, fmt); case 'R': case 'r': return resource_or_range(fmt, buf, end, ptr, spec); case 'h': return hex_string(buf, end, ptr, spec, fmt); case 'b': switch (fmt[1]) { case 'l': return bitmap_list_string(buf, end, ptr, spec, fmt); default: return bitmap_string(buf, end, ptr, spec, fmt); } case 'M': /* Colon separated: 00:01:02:03:04:05 */ case 'm': /* Contiguous: 000102030405 */ /* [mM]F (FDDI) */ /* [mM]R (Reverse order; Bluetooth) */ return mac_address_string(buf, end, ptr, spec, fmt); case 'I': /* Formatted IP supported * 4: 1.2.3.4 * 6: 0001:0203:...:0708 * 6c: 1::708 or 1::1.2.3.4 */ case 'i': /* Contiguous: * 4: 001.002.003.004 * 6: 000102...0f */ return ip_addr_string(buf, end, ptr, spec, fmt); case 'E': return escaped_string(buf, end, ptr, spec, fmt); case 'U': return uuid_string(buf, end, ptr, spec, fmt); case 'V': return va_format(buf, end, ptr, spec); case 'K': return restricted_pointer(buf, end, ptr, spec); case 'N': return netdev_bits(buf, end, ptr, spec, fmt); case '4': return fourcc_string(buf, end, ptr, spec, fmt); case 'a': return address_val(buf, end, ptr, spec, fmt); case 'd': return dentry_name(buf, end, ptr, spec, fmt); case 't': return time_and_date(buf, end, ptr, spec, fmt); case 'C': return clock(buf, end, ptr, spec, fmt); case 'D': return file_dentry_name(buf, end, ptr, spec, fmt); #ifdef CONFIG_BLOCK case 'g': return bdev_name(buf, end, ptr, spec, fmt); #endif case 'G': return flags_string(buf, end, ptr, spec, fmt); case 'O': return device_node_string(buf, end, ptr, spec, fmt + 1); case 'f': return fwnode_string(buf, end, ptr, spec, fmt + 1); case 'A': if (!IS_ENABLED(CONFIG_RUST)) { WARN_ONCE(1, "Please remove %%pA from non-Rust code\n"); return error_string(buf, end, "(%pA?)", spec); } return rust_fmt_argument(buf, end, ptr); case 'x': return pointer_string(buf, end, ptr, spec); case 'e': /* %pe with a non-ERR_PTR gets treated as plain %p */ if (!IS_ERR(ptr)) return default_pointer(buf, end, ptr, spec); return err_ptr(buf, end, ptr, spec); case 'u': case 'k': switch (fmt[1]) { case 's': return string(buf, end, ptr, spec); default: return error_string(buf, end, "(einval)", spec); } default: return default_pointer(buf, end, ptr, spec); } } struct fmt { const char *str; unsigned char state; // enum format_state unsigned char size; // size of numbers }; #define SPEC_CHAR(x, flag) [(x)-32] = flag static unsigned char spec_flag(unsigned char c) { static const unsigned char spec_flag_array[] = { SPEC_CHAR(' ', SPACE), SPEC_CHAR('#', SPECIAL), SPEC_CHAR('+', PLUS), SPEC_CHAR('-', LEFT), SPEC_CHAR('0', ZEROPAD), }; c -= 32; return (c < sizeof(spec_flag_array)) ? spec_flag_array[c] : 0; } /* * Helper function to decode printf style format. * Each call decode a token from the format and return the * number of characters read (or likely the delta where it wants * to go on the next call). * The decoded token is returned through the parameters * * 'h', 'l', or 'L' for integer fields * 'z' support added 23/7/1999 S.H. * 'z' changed to 'Z' --davidm 1/25/99 * 'Z' changed to 'z' --adobriyan 2017-01-25 * 't' added for ptrdiff_t * * @fmt: the format string * @type of the token returned * @flags: various flags such as +, -, # tokens.. * @field_width: overwritten width * @base: base of the number (octal, hex, ...) 
* @precision: precision of a number * @qualifier: qualifier of a number (long, size_t, ...) */ static noinline_for_stack struct fmt format_decode(struct fmt fmt, struct printf_spec *spec) { const char *start = fmt.str; char flag; /* we finished early by reading the field width */ if (unlikely(fmt.state == FORMAT_STATE_WIDTH)) { if (spec->field_width < 0) { spec->field_width = -spec->field_width; spec->flags |= LEFT; } fmt.state = FORMAT_STATE_NONE; goto precision; } /* we finished early by reading the precision */ if (unlikely(fmt.state == FORMAT_STATE_PRECISION)) { if (spec->precision < 0) spec->precision = 0; fmt.state = FORMAT_STATE_NONE; goto qualifier; } /* By default */ fmt.state = FORMAT_STATE_NONE; for (; *fmt.str ; fmt.str++) { if (*fmt.str == '%') break; } /* Return the current non-format string */ if (fmt.str != start || !*fmt.str) return fmt; /* Process flags. This also skips the first '%' */ spec->flags = 0; do { /* this also skips first '%' */ flag = spec_flag(*++fmt.str); spec->flags |= flag; } while (flag); /* get field width */ spec->field_width = -1; if (isdigit(*fmt.str)) spec->field_width = skip_atoi(&fmt.str); else if (unlikely(*fmt.str == '*')) { /* it's the next argument */ fmt.state = FORMAT_STATE_WIDTH; fmt.str++; return fmt; } precision: /* get the precision */ spec->precision = -1; if (unlikely(*fmt.str == '.')) { fmt.str++; if (isdigit(*fmt.str)) { spec->precision = skip_atoi(&fmt.str); if (spec->precision < 0) spec->precision = 0; } else if (*fmt.str == '*') { /* it's the next argument */ fmt.state = FORMAT_STATE_PRECISION; fmt.str++; return fmt; } } qualifier: /* Set up default numeric format */ spec->base = 10; fmt.state = FORMAT_STATE_NUM; fmt.size = sizeof(int); static const struct format_state { unsigned char state; unsigned char size; unsigned char flags_or_double_size; unsigned char base; } lookup_state[256] = { // Length ['l'] = { 0, sizeof(long), sizeof(long long) }, ['L'] = { 0, sizeof(long long) }, ['h'] = { 0, sizeof(short), sizeof(char) }, ['H'] = { 0, sizeof(char) }, // Questionable historical ['z'] = { 0, sizeof(size_t) }, ['t'] = { 0, sizeof(ptrdiff_t) }, // Non-numeric formats ['c'] = { FORMAT_STATE_CHAR }, ['s'] = { FORMAT_STATE_STR }, ['p'] = { FORMAT_STATE_PTR }, ['%'] = { FORMAT_STATE_PERCENT_CHAR }, // Numerics ['o'] = { FORMAT_STATE_NUM, 0, 0, 8 }, ['x'] = { FORMAT_STATE_NUM, 0, SMALL, 16 }, ['X'] = { FORMAT_STATE_NUM, 0, 0, 16 }, ['d'] = { FORMAT_STATE_NUM, 0, SIGN, 10 }, ['i'] = { FORMAT_STATE_NUM, 0, SIGN, 10 }, ['u'] = { FORMAT_STATE_NUM, 0, 0, 10, }, /* * Since %n poses a greater security risk than * utility, treat it as any other invalid or * unsupported format specifier. 
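 *
 * There is deliberately no ['n'] entry in this table, so %n falls through
 * to the "unsupported specifier" WARN_ONCE() below instead of writing back
 * through an argument.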
*/ }; const struct format_state *p = lookup_state + (u8)*fmt.str; if (p->size) { fmt.size = p->size; if (p->flags_or_double_size && fmt.str[0] == fmt.str[1]) { fmt.size = p->flags_or_double_size; fmt.str++; } fmt.str++; p = lookup_state + *fmt.str; } if (p->state) { if (p->base) spec->base = p->base; spec->flags |= p->flags_or_double_size; fmt.state = p->state; fmt.str++; return fmt; } WARN_ONCE(1, "Please remove unsupported %%%c in format string\n", *fmt.str); fmt.state = FORMAT_STATE_INVALID; return fmt; } static void set_field_width(struct printf_spec *spec, int width) { spec->field_width = width; if (WARN_ONCE(spec->field_width != width, "field width %d too large", width)) { spec->field_width = clamp(width, -FIELD_WIDTH_MAX, FIELD_WIDTH_MAX); } } static void set_precision(struct printf_spec *spec, int prec) { spec->precision = prec; if (WARN_ONCE(spec->precision != prec, "precision %d too large", prec)) { spec->precision = clamp(prec, 0, PRECISION_MAX); } } /* * Turn a 1/2/4-byte value into a 64-bit one for printing: truncate * as necessary and deal with signedness. * * 'size' is the size of the value in bytes. */ static unsigned long long convert_num_spec(unsigned int val, int size, struct printf_spec spec) { unsigned int shift = 32 - size*8; val <<= shift; if (!(spec.flags & SIGN)) return val >> shift; return (int)val >> shift; } /** * vsnprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @size: The size of the buffer, including the trailing null space * @fmt_str: The format string to use * @args: Arguments for the format string * * This function generally follows C99 vsnprintf, but has some * extensions and a few limitations: * * - ``%n`` is unsupported * - ``%p*`` is handled by pointer() * * See pointer() or Documentation/core-api/printk-formats.rst for more * extensive description. * * **Please update the documentation in both places when making changes** * * The return value is the number of characters which would * be generated for the given input, excluding the trailing * '\0', as per ISO C99. If you want to have the exact * number of characters written into @buf as return value * (not including the trailing '\0'), use vscnprintf(). If the * return is greater than or equal to @size, the resulting * string is truncated. * * If you're not already dealing with a va_list consider using snprintf(). */ int vsnprintf(char *buf, size_t size, const char *fmt_str, va_list args) { char *str, *end; struct printf_spec spec = {0}; struct fmt fmt = { .str = fmt_str, .state = FORMAT_STATE_NONE, }; /* Reject out-of-range values early. Large positive sizes are used for unknown buffer sizes. 
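 * Callers that do not know the real buffer size, such as vsprintf() and
 * sprintf() further down, pass INT_MAX here.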
*/ if (WARN_ON_ONCE(size > INT_MAX)) return 0; str = buf; end = buf + size; /* Make sure end is always >= buf */ if (end < buf) { end = ((void *)-1); size = end - buf; } while (*fmt.str) { const char *old_fmt = fmt.str; fmt = format_decode(fmt, &spec); switch (fmt.state) { case FORMAT_STATE_NONE: { int read = fmt.str - old_fmt; if (str < end) { int copy = read; if (copy > end - str) copy = end - str; memcpy(str, old_fmt, copy); } str += read; continue; } case FORMAT_STATE_NUM: { unsigned long long num; if (fmt.size > sizeof(int)) num = va_arg(args, long long); else num = convert_num_spec(va_arg(args, int), fmt.size, spec); str = number(str, end, num, spec); continue; } case FORMAT_STATE_WIDTH: set_field_width(&spec, va_arg(args, int)); continue; case FORMAT_STATE_PRECISION: set_precision(&spec, va_arg(args, int)); continue; case FORMAT_STATE_CHAR: { char c; if (!(spec.flags & LEFT)) { while (--spec.field_width > 0) { if (str < end) *str = ' '; ++str; } } c = (unsigned char) va_arg(args, int); if (str < end) *str = c; ++str; while (--spec.field_width > 0) { if (str < end) *str = ' '; ++str; } continue; } case FORMAT_STATE_STR: str = string(str, end, va_arg(args, char *), spec); continue; case FORMAT_STATE_PTR: str = pointer(fmt.str, str, end, va_arg(args, void *), spec); while (isalnum(*fmt.str)) fmt.str++; continue; case FORMAT_STATE_PERCENT_CHAR: if (str < end) *str = '%'; ++str; continue; default: /* * Presumably the arguments passed gcc's type * checking, but there is no safe or sane way * for us to continue parsing the format and * fetching from the va_list; the remaining * specifiers and arguments would be out of * sync. */ goto out; } } out: if (size > 0) { if (str < end) *str = '\0'; else end[-1] = '\0'; } /* the trailing null byte doesn't count towards the total */ return str-buf; } EXPORT_SYMBOL(vsnprintf); /** * vscnprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @size: The size of the buffer, including the trailing null space * @fmt: The format string to use * @args: Arguments for the format string * * The return value is the number of characters which have been written into * the @buf not including the trailing '\0'. If @size is == 0 the function * returns 0. * * If you're not already dealing with a va_list consider using scnprintf(). * * See the vsnprintf() documentation for format string extensions over C99. */ int vscnprintf(char *buf, size_t size, const char *fmt, va_list args) { int i; if (unlikely(!size)) return 0; i = vsnprintf(buf, size, fmt, args); if (likely(i < size)) return i; return size - 1; } EXPORT_SYMBOL(vscnprintf); /** * snprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @size: The size of the buffer, including the trailing null space * @fmt: The format string to use * @...: Arguments for the format string * * The return value is the number of characters which would be * generated for the given input, excluding the trailing null, * as per ISO C99. If the return is greater than or equal to * @size, the resulting string is truncated. * * See the vsnprintf() documentation for format string extensions over C99. */ int snprintf(char *buf, size_t size, const char *fmt, ...) 
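/*
 * Illustrative sketch (not part of the original source): with the truncating
 * return convention documented above,
 *
 *	char buf[8];
 *	int n = snprintf(buf, sizeof(buf), "%s", "truncated text");
 *
 * leaves buf holding "truncat" and n == 14 (the untruncated length), whereas
 * scnprintf() would have returned 7.
 */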
{ va_list args; int i; va_start(args, fmt); i = vsnprintf(buf, size, fmt, args); va_end(args); return i; } EXPORT_SYMBOL(snprintf); /** * scnprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @size: The size of the buffer, including the trailing null space * @fmt: The format string to use * @...: Arguments for the format string * * The return value is the number of characters written into @buf not including * the trailing '\0'. If @size is == 0 the function returns 0. */ int scnprintf(char *buf, size_t size, const char *fmt, ...) { va_list args; int i; va_start(args, fmt); i = vscnprintf(buf, size, fmt, args); va_end(args); return i; } EXPORT_SYMBOL(scnprintf); /** * vsprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @fmt: The format string to use * @args: Arguments for the format string * * The return value is the number of characters written into @buf not including * the trailing '\0'. Use vsnprintf() or vscnprintf() in order to avoid * buffer overflows. * * If you're not already dealing with a va_list consider using sprintf(). * * See the vsnprintf() documentation for format string extensions over C99. */ int vsprintf(char *buf, const char *fmt, va_list args) { return vsnprintf(buf, INT_MAX, fmt, args); } EXPORT_SYMBOL(vsprintf); /** * sprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @fmt: The format string to use * @...: Arguments for the format string * * The return value is the number of characters written into @buf not including * the trailing '\0'. Use snprintf() or scnprintf() in order to avoid * buffer overflows. * * See the vsnprintf() documentation for format string extensions over C99. */ int sprintf(char *buf, const char *fmt, ...) { va_list args; int i; va_start(args, fmt); i = vsnprintf(buf, INT_MAX, fmt, args); va_end(args); return i; } EXPORT_SYMBOL(sprintf); #ifdef CONFIG_BINARY_PRINTF /* * bprintf service: * vbin_printf() - VA arguments to binary data * bstr_printf() - Binary data to text string */ /** * vbin_printf - Parse a format string and place args' binary value in a buffer * @bin_buf: The buffer to place args' binary value * @size: The size of the buffer(by words(32bits), not characters) * @fmt_str: The format string to use * @args: Arguments for the format string * * The format follows C99 vsnprintf, except %n is ignored, and its argument * is skipped. * * The return value is the number of words(32bits) which would be generated for * the given input. * * NOTE: * If the return value is greater than @size, the resulting bin_buf is NOT * valid for bstr_printf(). 
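 *
 * A hedged usage sketch (not part of the original kernel-doc), assuming
 * local bin[] and out[] buffers and the same format string for both calls:
 *
 *	u32 bin[64];
 *	int words = vbin_printf(bin, ARRAY_SIZE(bin), fmt, args);
 *	...
 *	bstr_printf(out, sizeof(out), fmt, bin);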
*/ int vbin_printf(u32 *bin_buf, size_t size, const char *fmt_str, va_list args) { struct fmt fmt = { .str = fmt_str, .state = FORMAT_STATE_NONE, }; struct printf_spec spec = {0}; char *str, *end; int width; str = (char *)bin_buf; end = (char *)(bin_buf + size); #define save_arg(type) \ ({ \ unsigned long long value; \ if (sizeof(type) == 8) { \ unsigned long long val8; \ str = PTR_ALIGN(str, sizeof(u32)); \ val8 = va_arg(args, unsigned long long); \ if (str + sizeof(type) <= end) { \ *(u32 *)str = *(u32 *)&val8; \ *(u32 *)(str + 4) = *((u32 *)&val8 + 1); \ } \ value = val8; \ } else { \ unsigned int val4; \ str = PTR_ALIGN(str, sizeof(type)); \ val4 = va_arg(args, int); \ if (str + sizeof(type) <= end) \ *(typeof(type) *)str = (type)(long)val4; \ value = (unsigned long long)val4; \ } \ str += sizeof(type); \ value; \ }) while (*fmt.str) { fmt = format_decode(fmt, &spec); switch (fmt.state) { case FORMAT_STATE_NONE: case FORMAT_STATE_PERCENT_CHAR: break; case FORMAT_STATE_INVALID: goto out; case FORMAT_STATE_WIDTH: case FORMAT_STATE_PRECISION: width = (int)save_arg(int); /* Pointers may require the width */ if (*fmt.str == 'p') set_field_width(&spec, width); break; case FORMAT_STATE_CHAR: save_arg(char); break; case FORMAT_STATE_STR: { const char *save_str = va_arg(args, char *); const char *err_msg; size_t len; err_msg = check_pointer_msg(save_str); if (err_msg) save_str = err_msg; len = strlen(save_str) + 1; if (str + len < end) memcpy(str, save_str, len); str += len; break; } case FORMAT_STATE_PTR: /* Dereferenced pointers must be done now */ switch (*fmt.str) { /* Dereference of functions is still OK */ case 'S': case 's': case 'x': case 'K': case 'e': save_arg(void *); break; default: if (!isalnum(*fmt.str)) { save_arg(void *); break; } str = pointer(fmt.str, str, end, va_arg(args, void *), spec); if (str + 1 < end) *str++ = '\0'; else end[-1] = '\0'; /* Must be nul terminated */ } /* skip all alphanumeric pointer suffixes */ while (isalnum(*fmt.str)) fmt.str++; break; case FORMAT_STATE_NUM: if (fmt.size > sizeof(int)) { save_arg(long long); } else { save_arg(int); } } } out: return (u32 *)(PTR_ALIGN(str, sizeof(u32))) - bin_buf; #undef save_arg } EXPORT_SYMBOL_GPL(vbin_printf); /** * bstr_printf - Format a string from binary arguments and place it in a buffer * @buf: The buffer to place the result into * @size: The size of the buffer, including the trailing null space * @fmt_str: The format string to use * @bin_buf: Binary arguments for the format string * * This function like C99 vsnprintf, but the difference is that vsnprintf gets * arguments from stack, and bstr_printf gets arguments from @bin_buf which is * a binary buffer that generated by vbin_printf. * * The format follows C99 vsnprintf, but has some extensions: * see vsnprintf comment for details. * * The return value is the number of characters which would * be generated for the given input, excluding the trailing * '\0', as per ISO C99. If you want to have the exact * number of characters written into @buf as return value * (not including the trailing '\0'), use vscnprintf(). If the * return is greater than or equal to @size, the resulting * string is truncated. 
*/ int bstr_printf(char *buf, size_t size, const char *fmt_str, const u32 *bin_buf) { struct fmt fmt = { .str = fmt_str, .state = FORMAT_STATE_NONE, }; struct printf_spec spec = {0}; char *str, *end; const char *args = (const char *)bin_buf; if (WARN_ON_ONCE(size > INT_MAX)) return 0; str = buf; end = buf + size; #define get_arg(type) \ ({ \ typeof(type) value; \ if (sizeof(type) == 8) { \ args = PTR_ALIGN(args, sizeof(u32)); \ *(u32 *)&value = *(u32 *)args; \ *((u32 *)&value + 1) = *(u32 *)(args + 4); \ } else { \ args = PTR_ALIGN(args, sizeof(type)); \ value = *(typeof(type) *)args; \ } \ args += sizeof(type); \ value; \ }) /* Make sure end is always >= buf */ if (end < buf) { end = ((void *)-1); size = end - buf; } while (*fmt.str) { const char *old_fmt = fmt.str; unsigned long long num; fmt = format_decode(fmt, &spec); switch (fmt.state) { case FORMAT_STATE_NONE: { int read = fmt.str - old_fmt; if (str < end) { int copy = read; if (copy > end - str) copy = end - str; memcpy(str, old_fmt, copy); } str += read; continue; } case FORMAT_STATE_WIDTH: set_field_width(&spec, get_arg(int)); continue; case FORMAT_STATE_PRECISION: set_precision(&spec, get_arg(int)); continue; case FORMAT_STATE_CHAR: { char c; if (!(spec.flags & LEFT)) { while (--spec.field_width > 0) { if (str < end) *str = ' '; ++str; } } c = (unsigned char) get_arg(char); if (str < end) *str = c; ++str; while (--spec.field_width > 0) { if (str < end) *str = ' '; ++str; } continue; } case FORMAT_STATE_STR: { const char *str_arg = args; args += strlen(str_arg) + 1; str = string(str, end, (char *)str_arg, spec); continue; } case FORMAT_STATE_PTR: { bool process = false; int copy, len; /* Non function dereferences were already done */ switch (*fmt.str) { case 'S': case 's': case 'x': case 'K': case 'e': process = true; break; default: if (!isalnum(*fmt.str)) { process = true; break; } /* Pointer dereference was already processed */ if (str < end) { len = copy = strlen(args); if (copy > end - str) copy = end - str; memcpy(str, args, copy); str += len; args += len + 1; } } if (process) str = pointer(fmt.str, str, end, get_arg(void *), spec); while (isalnum(*fmt.str)) fmt.str++; continue; } case FORMAT_STATE_PERCENT_CHAR: if (str < end) *str = '%'; ++str; continue; case FORMAT_STATE_INVALID: goto out; case FORMAT_STATE_NUM: if (fmt.size > sizeof(int)) num = get_arg(long long); else num = convert_num_spec(get_arg(int), fmt.size, spec); str = number(str, end, num, spec); continue; } } /* while(*fmt.str) */ out: if (size > 0) { if (str < end) *str = '\0'; else end[-1] = '\0'; } #undef get_arg /* the trailing null byte doesn't count towards the total */ return str - buf; } EXPORT_SYMBOL_GPL(bstr_printf); #endif /* CONFIG_BINARY_PRINTF */ /** * vsscanf - Unformat a buffer into a list of arguments * @buf: input buffer * @fmt: format of buffer * @args: arguments */ int vsscanf(const char *buf, const char *fmt, va_list args) { const char *str = buf; char *next; char digit; int num = 0; u8 qualifier; unsigned int base; union { long long s; unsigned long long u; } val; s16 field_width; bool is_sign; while (*fmt) { /* skip any white space in format */ /* white space in format matches any amount of * white space, including none, in the input. */ if (isspace(*fmt)) { fmt = skip_spaces(++fmt); str = skip_spaces(str); } /* anything that is not a conversion must match exactly */ if (*fmt != '%' && *fmt) { if (*fmt++ != *str++) break; continue; } if (!*fmt) break; ++fmt; /* skip this conversion. 
* advance both strings to next white space */ if (*fmt == '*') { if (!*str) break; while (!isspace(*fmt) && *fmt != '%' && *fmt) { /* '%*[' not yet supported, invalid format */ if (*fmt == '[') return num; fmt++; } while (!isspace(*str) && *str) str++; continue; } /* get field width */ field_width = -1; if (isdigit(*fmt)) { field_width = skip_atoi(&fmt); if (field_width <= 0) break; } /* get conversion qualifier */ qualifier = -1; if (*fmt == 'h' || _tolower(*fmt) == 'l' || *fmt == 'z') { qualifier = *fmt++; if (unlikely(qualifier == *fmt)) { if (qualifier == 'h') { qualifier = 'H'; fmt++; } else if (qualifier == 'l') { qualifier = 'L'; fmt++; } } } if (!*fmt) break; if (*fmt == 'n') { /* return number of characters read so far */ *va_arg(args, int *) = str - buf; ++fmt; continue; } if (!*str) break; base = 10; is_sign = false; switch (*fmt++) { case 'c': { char *s = (char *)va_arg(args, char*); if (field_width == -1) field_width = 1; do { *s++ = *str++; } while (--field_width > 0 && *str); num++; } continue; case 's': { char *s = (char *)va_arg(args, char *); if (field_width == -1) field_width = SHRT_MAX; /* first, skip leading white space in buffer */ str = skip_spaces(str); /* now copy until next white space */ while (*str && !isspace(*str) && field_width--) *s++ = *str++; *s = '\0'; num++; } continue; /* * Warning: This implementation of the '[' conversion specifier * deviates from its glibc counterpart in the following ways: * (1) It does NOT support ranges i.e. '-' is NOT a special * character * (2) It cannot match the closing bracket ']' itself * (3) A field width is required * (4) '%*[' (discard matching input) is currently not supported * * Example usage: * ret = sscanf("00:0a:95","%2[^:]:%2[^:]:%2[^:]", * buf1, buf2, buf3); * if (ret < 3) * // etc.. */ case '[': { char *s = (char *)va_arg(args, char *); DECLARE_BITMAP(set, 256) = {0}; unsigned int len = 0; bool negate = (*fmt == '^'); /* field width is required */ if (field_width == -1) return num; if (negate) ++fmt; for ( ; *fmt && *fmt != ']'; ++fmt, ++len) __set_bit((u8)*fmt, set); /* no ']' or no character set found */ if (!*fmt || !len) return num; ++fmt; if (negate) { bitmap_complement(set, set, 256); /* exclude null '\0' byte */ __clear_bit(0, set); } /* match must be non-empty */ if (!test_bit((u8)*str, set)) return num; while (test_bit((u8)*str, set) && field_width--) *s++ = *str++; *s = '\0'; ++num; } continue; case 'o': base = 8; break; case 'x': case 'X': base = 16; break; case 'i': base = 0; fallthrough; case 'd': is_sign = true; fallthrough; case 'u': break; case '%': /* looking for '%' in str */ if (*str++ != '%') return num; continue; default: /* invalid format; stop here */ return num; } /* have some sort of integer conversion. * first, skip white space in buffer. */ str = skip_spaces(str); digit = *str; if (is_sign && digit == '-') { if (field_width == 1) break; digit = *(str + 1); } if (!digit || (base == 16 && !isxdigit(digit)) || (base == 10 && !isdigit(digit)) || (base == 8 && !isodigit(digit)) || (base == 0 && !isdigit(digit))) break; if (is_sign) val.s = simple_strntoll(str, &next, base, field_width >= 0 ? field_width : INT_MAX); else val.u = simple_strntoull(str, &next, base, field_width >= 0 ? 
field_width : INT_MAX); switch (qualifier) { case 'H': /* that's 'hh' in format */ if (is_sign) *va_arg(args, signed char *) = val.s; else *va_arg(args, unsigned char *) = val.u; break; case 'h': if (is_sign) *va_arg(args, short *) = val.s; else *va_arg(args, unsigned short *) = val.u; break; case 'l': if (is_sign) *va_arg(args, long *) = val.s; else *va_arg(args, unsigned long *) = val.u; break; case 'L': if (is_sign) *va_arg(args, long long *) = val.s; else *va_arg(args, unsigned long long *) = val.u; break; case 'z': *va_arg(args, size_t *) = val.u; break; default: if (is_sign) *va_arg(args, int *) = val.s; else *va_arg(args, unsigned int *) = val.u; break; } num++; if (!next) break; str = next; } return num; } EXPORT_SYMBOL(vsscanf); /** * sscanf - Unformat a buffer into a list of arguments * @buf: input buffer * @fmt: formatting of buffer * @...: resulting arguments */ int sscanf(const char *buf, const char *fmt, ...) { va_list args; int i; va_start(args, fmt); i = vsscanf(buf, fmt, args); va_end(args); return i; } EXPORT_SYMBOL(sscanf); |
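The '[' conversion comment above gives a concrete format string; the minimal standalone sketch below runs that exact example with the standard C library's sscanf(), which behaves the same way for this pattern (the in-kernel variant additionally requires an explicit field width, as the comment notes).

/*
 * Userspace illustration of the '%2[^:]' example from the comment above.
 * Each buffer needs room for two characters plus the terminating NUL.
 */
#include <stdio.h>

int main(void)
{
	char buf1[3], buf2[3], buf3[3];
	int ret;

	ret = sscanf("00:0a:95", "%2[^:]:%2[^:]:%2[^:]", buf1, buf2, buf3);
	if (ret < 3)
		return 1;	/* fewer than three fields matched */

	printf("%s %s %s\n", buf1, buf2, buf3);	/* prints: 00 0a 95 */
	return 0;
}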
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_FS_STRUCT_H
#define _LINUX_FS_STRUCT_H

#include <linux/sched.h>
#include <linux/path.h>
#include <linux/spinlock.h>
#include <linux/seqlock.h>

struct fs_struct {
	int users;
	seqlock_t seq;
	int umask;
	int in_exec;
	struct path root, pwd;
} __randomize_layout;

extern struct kmem_cache *fs_cachep;

extern void exit_fs(struct task_struct *);
extern void set_fs_root(struct fs_struct *, const struct path *);
extern void set_fs_pwd(struct fs_struct *, const struct path *);
extern struct fs_struct *copy_fs_struct(struct fs_struct *);
extern void free_fs_struct(struct fs_struct *);
extern int unshare_fs_struct(void);

static inline void get_fs_root(struct fs_struct *fs, struct path *root)
{
	read_seqlock_excl(&fs->seq);
	*root = fs->root;
	path_get(root);
	read_sequnlock_excl(&fs->seq);
}

static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd)
{
	read_seqlock_excl(&fs->seq);
	*pwd = fs->pwd;
	path_get(pwd);
	read_sequnlock_excl(&fs->seq);
}

extern bool current_chrooted(void);

static inline int current_umask(void)
{
	return current->fs->umask;
}

#endif /* _LINUX_FS_STRUCT_H */
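A hedged sketch of a hypothetical caller of the accessors above: get_fs_pwd() copies the path under the seqlock and takes a reference, so the snapshot stays valid after the lock is dropped and must later be released with path_put(). The function name example_report_pwd is illustrative, not part of the header.

/*
 * Minimal sketch (assumed caller, not from the header): take a stable,
 * referenced copy of the current task's working directory.
 */
#include <linux/fs_struct.h>
#include <linux/path.h>
#include <linux/sched.h>

static void example_report_pwd(void)
{
	struct path pwd;

	get_fs_pwd(current->fs, &pwd);	/* stable, referenced copy */
	/* ... use pwd.dentry / pwd.mnt here ... */
	path_put(&pwd);			/* drop the reference taken above */
}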
1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 | /* * hugetlbpage-backed filesystem. Based on ramfs. * * Nadia Yvette Chambers, 2002 * * Copyright (C) 2002 Linus Torvalds. * License: GPL */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/thread_info.h> #include <asm/current.h> #include <linux/falloc.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/file.h> #include <linux/kernel.h> #include <linux/writeback.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/init.h> #include <linux/string.h> #include <linux/capability.h> #include <linux/ctype.h> #include <linux/backing-dev.h> #include <linux/hugetlb.h> #include <linux/folio_batch.h> #include <linux/fs_parser.h> #include <linux/mman.h> #include <linux/slab.h> #include <linux/dnotify.h> #include <linux/statfs.h> #include <linux/security.h> #include <linux/magic.h> #include <linux/migrate.h> #include <linux/uio.h> #include <linux/uaccess.h> #include <linux/sched/mm.h> #define CREATE_TRACE_POINTS #include <trace/events/hugetlbfs.h> static const struct address_space_operations hugetlbfs_aops; static const struct file_operations hugetlbfs_file_operations; static const struct inode_operations hugetlbfs_dir_inode_operations; static const struct inode_operations hugetlbfs_inode_operations; enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT }; struct hugetlbfs_fs_context { struct hstate *hstate; unsigned long long max_size_opt; unsigned long long min_size_opt; long max_hpages; long nr_inodes; long min_hpages; enum hugetlbfs_size_type max_val_type; enum hugetlbfs_size_type min_val_type; kuid_t uid; kgid_t gid; umode_t mode; }; int sysctl_hugetlb_shm_group; enum hugetlb_param { Opt_gid, Opt_min_size, Opt_mode, Opt_nr_inodes, Opt_pagesize, Opt_size, Opt_uid, }; static const struct fs_parameter_spec hugetlb_fs_parameters[] = { fsparam_gid ("gid", Opt_gid), fsparam_string("min_size", Opt_min_size), fsparam_u32oct("mode", Opt_mode), fsparam_string("nr_inodes", Opt_nr_inodes), fsparam_string("pagesize", Opt_pagesize), fsparam_string("size", Opt_size), fsparam_uid ("uid", Opt_uid), {} }; /* * Mask used when checking the page offset value passed in via system * calls. This value will be converted to a loff_t which is signed. * Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the * value. The extra bit (- 1 in the shift value) is to take the sign * bit into account. */ #define PGOFF_LOFFT_MAX \ (((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1))) static int hugetlb_file_mmap_prepare_success(const struct vm_area_struct *vma) { /* Unfortunate we have to reassign vma->vm_private_data. */ return hugetlb_vma_lock_alloc((struct vm_area_struct *)vma); } static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) { struct file *file = desc->file; struct inode *inode = file_inode(file); loff_t len, vma_len; int ret; struct hstate *h = hstate_file(file); vma_flags_t vma_flags; /* * vma address alignment (but not the pgoff alignment) has * already been checked by prepare_hugepage_range. If you add * any error returns here, do so after setting VM_HUGETLB, so * is_vm_hugetlb_page tests below unmap_region go the right * way when do_mmap unwinds (may be important on powerpc * and ia64). */ vma_desc_set_flags(desc, VMA_HUGETLB_BIT, VMA_DONTEXPAND_BIT); desc->vm_ops = &hugetlb_vm_ops; /* * page based offset in vm_pgoff could be sufficiently large to * overflow a loff_t when converted to byte offset. 
This can * only happen on architectures where sizeof(loff_t) == * sizeof(unsigned long). So, only check in those instances. */ if (sizeof(unsigned long) == sizeof(loff_t)) { if (desc->pgoff & PGOFF_LOFFT_MAX) return -EINVAL; } /* must be huge page aligned */ if (desc->pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL; vma_len = (loff_t)vma_desc_size(desc); len = vma_len + ((loff_t)desc->pgoff << PAGE_SHIFT); /* check for overflow */ if (len < vma_len) return -EINVAL; inode_lock(inode); file_accessed(file); ret = -ENOMEM; vma_flags = desc->vma_flags; /* * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip * reserving here. Note: only for SHM hugetlbfs file, the inode * flag S_PRIVATE is set. */ if (inode->i_flags & S_PRIVATE) vma_flags_set(&vma_flags, VMA_NORESERVE_BIT); if (hugetlb_reserve_pages(inode, desc->pgoff >> huge_page_order(h), len >> huge_page_shift(h), desc, vma_flags) < 0) goto out; ret = 0; if (vma_desc_test(desc, VMA_WRITE_BIT) && inode->i_size < len) i_size_write(inode, len); out: inode_unlock(inode); if (!ret) { /* Allocate the VMA lock after we set it up. */ desc->action.success_hook = hugetlb_file_mmap_prepare_success; /* * We cannot permit the rmap finding this VMA in the time * between the VMA being inserted into the VMA tree and the * completion/success hook being invoked. * * This is because we establish a per-VMA hugetlb lock which can * be raced by rmap. */ desc->action.hide_from_rmap_until_complete = true; } return ret; } /* * Called under mmap_write_lock(mm). */ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { unsigned long addr0 = 0; struct hstate *h = hstate_file(file); if (len & ~huge_page_mask(h)) return -EINVAL; if ((flags & MAP_FIXED) && (addr & ~huge_page_mask(h))) return -EINVAL; if (addr) addr0 = ALIGN(addr, huge_page_size(h)); return mm_get_unmapped_area_vmflags(file, addr0, len, pgoff, flags, 0); } /* * Someone wants to read @bytes from a HWPOISON hugetlb @folio from @offset. * Returns the maximum number of bytes one can read without touching the 1st raw * HWPOISON page. */ static size_t adjust_range_hwpoison(struct folio *folio, size_t offset, size_t bytes) { struct page *page = folio_page(folio, offset / PAGE_SIZE); size_t safe_bytes; if (is_raw_hwpoison_page_in_hugepage(page)) return 0; /* Safe to read the remaining bytes in this page. */ safe_bytes = PAGE_SIZE - (offset % PAGE_SIZE); page++; /* Check each remaining page as long as we are not done yet. */ for (; safe_bytes < bytes; safe_bytes += PAGE_SIZE, page++) if (is_raw_hwpoison_page_in_hugepage(page)) break; return min(safe_bytes, bytes); } /* * Support for read() - Find the page attached to f_mapping and copy out the * data. This provides functionality similar to filemap_read(). 
*/ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct hstate *h = hstate_file(file); struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; unsigned long index = iocb->ki_pos >> huge_page_shift(h); unsigned long offset = iocb->ki_pos & ~huge_page_mask(h); unsigned long end_index; loff_t isize; ssize_t retval = 0; while (iov_iter_count(to)) { struct folio *folio; size_t nr, copied, want; /* nr is the maximum number of bytes to copy from this page */ nr = huge_page_size(h); isize = i_size_read(inode); if (!isize) break; end_index = (isize - 1) >> huge_page_shift(h); if (index > end_index) break; if (index == end_index) { nr = ((isize - 1) & ~huge_page_mask(h)) + 1; if (nr <= offset) break; } nr = nr - offset; /* Find the folio */ folio = filemap_lock_hugetlb_folio(h, mapping, index); if (IS_ERR(folio)) { /* * We have a HOLE, zero out the user-buffer for the * length of the hole or request. */ copied = iov_iter_zero(nr, to); } else { folio_unlock(folio); if (!folio_test_hwpoison(folio)) want = nr; else { /* * Adjust how many bytes safe to read without * touching the 1st raw HWPOISON page after * offset. */ want = adjust_range_hwpoison(folio, offset, nr); if (want == 0) { folio_put(folio); retval = -EIO; break; } } /* * We have the folio, copy it to user space buffer. */ copied = copy_folio_to_iter(folio, offset, want, to); folio_put(folio); } offset += copied; retval += copied; if (copied != nr && iov_iter_count(to)) { if (!retval) retval = -EFAULT; break; } index += offset >> huge_page_shift(h); offset &= ~huge_page_mask(h); } iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset; return retval; } static int hugetlbfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { return -EINVAL; } static int hugetlbfs_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { BUG(); return -EINVAL; } static void hugetlb_delete_from_page_cache(struct folio *folio) { folio_clear_dirty(folio); folio_clear_uptodate(folio); filemap_remove_folio(folio); } /* * Called with i_mmap_rwsem held for inode based vma maps. This makes * sure vma (and vm_mm) will not go away. We also hold the hugetlb fault * mutex for the page in the mapping. So, we can not race with page being * faulted into the vma. */ static bool hugetlb_vma_maps_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) { pte_t *ptep, pte; ptep = hugetlb_walk(vma, addr, huge_page_size(hstate_vma(vma))); if (!ptep) return false; pte = huge_ptep_get(vma->vm_mm, addr, ptep); if (huge_pte_none(pte) || !pte_present(pte)) return false; if (pte_pfn(pte) == pfn) return true; return false; } /* * Can vma_offset_start/vma_offset_end overflow on 32-bit arches? * No, because the interval tree returns us only those vmas * which overlap the truncated area starting at pgoff, * and no vma on a 32-bit arch can span beyond the 4GB. 
*/ static unsigned long vma_offset_start(struct vm_area_struct *vma, pgoff_t start) { unsigned long offset = 0; if (vma->vm_pgoff < start) offset = (start - vma->vm_pgoff) << PAGE_SHIFT; return vma->vm_start + offset; } static unsigned long vma_offset_end(struct vm_area_struct *vma, pgoff_t end) { unsigned long t_end; if (!end) return vma->vm_end; t_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) + vma->vm_start; if (t_end > vma->vm_end) t_end = vma->vm_end; return t_end; } /* * Called with hugetlb fault mutex held. Therefore, no more mappings to * this folio can be created while executing the routine. */ static void hugetlb_unmap_file_folio(struct hstate *h, struct address_space *mapping, struct folio *folio, pgoff_t index) { struct rb_root_cached *root = &mapping->i_mmap; struct hugetlb_vma_lock *vma_lock; unsigned long pfn = folio_pfn(folio); struct vm_area_struct *vma; unsigned long v_start; unsigned long v_end; pgoff_t start, end; start = index * pages_per_huge_page(h); end = (index + 1) * pages_per_huge_page(h); i_mmap_lock_write(mapping); retry: vma_lock = NULL; vma_interval_tree_foreach(vma, root, start, end - 1) { v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); if (!hugetlb_vma_maps_pfn(vma, v_start, pfn)) continue; if (!hugetlb_vma_trylock_write(vma)) { vma_lock = vma->vm_private_data; /* * If we can not get vma lock, we need to drop * immap_sema and take locks in order. First, * take a ref on the vma_lock structure so that * we can be guaranteed it will not go away when * dropping immap_sema. */ kref_get(&vma_lock->refs); break; } unmap_hugepage_range(vma, v_start, v_end, NULL, ZAP_FLAG_DROP_MARKER); hugetlb_vma_unlock_write(vma); } i_mmap_unlock_write(mapping); if (vma_lock) { /* * Wait on vma_lock. We know it is still valid as we have * a reference. We must 'open code' vma locking as we do * not know if vma_lock is still attached to vma. */ down_write(&vma_lock->rw_sema); i_mmap_lock_write(mapping); vma = vma_lock->vma; if (!vma) { /* * If lock is no longer attached to vma, then just * unlock, drop our reference and retry looking for * other vmas. */ up_write(&vma_lock->rw_sema); kref_put(&vma_lock->refs, hugetlb_vma_lock_release); goto retry; } /* * vma_lock is still attached to vma. Check to see if vma * still maps page and if so, unmap. */ v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); if (hugetlb_vma_maps_pfn(vma, v_start, pfn)) unmap_hugepage_range(vma, v_start, v_end, NULL, ZAP_FLAG_DROP_MARKER); kref_put(&vma_lock->refs, hugetlb_vma_lock_release); hugetlb_vma_unlock_write(vma); goto retry; } } static void hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, zap_flags_t zap_flags) { struct vm_area_struct *vma; /* * end == 0 indicates that the entire range after start should be * unmapped. Note, end is exclusive, whereas the interval tree takes * an inclusive "last". */ vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) { unsigned long v_start; unsigned long v_end; if (!hugetlb_vma_trylock_write(vma)) continue; v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); unmap_hugepage_range(vma, v_start, v_end, NULL, zap_flags); /* * Note that vma lock only exists for shared/non-private * vmas. Therefore, lock is not held when calling * unmap_hugepage_range for private vmas. */ hugetlb_vma_unlock_write(vma); } } /* * Called with hugetlb fault mutex held. 
*/ static void remove_inode_single_folio(struct hstate *h, struct inode *inode, struct address_space *mapping, struct folio *folio, pgoff_t index, bool truncate_op) { /* * If folio is mapped, it was faulted in after being * unmapped in caller or hugetlb_vmdelete_list() skips * unmapping it due to fail to grab lock. Unmap (again) * while holding the fault mutex. The mutex will prevent * faults until we finish removing the folio. Hold folio * lock to guarantee no concurrent migration. */ folio_lock(folio); if (unlikely(folio_mapped(folio))) hugetlb_unmap_file_folio(h, mapping, folio, index); /* * We must remove the folio from page cache before removing * the region/ reserve map (hugetlb_unreserve_pages). In * rare out of memory conditions, removal of the region/reserve * map could fail. Correspondingly, the subpool and global * reserve usage count can need to be adjusted. */ VM_BUG_ON_FOLIO(folio_test_hugetlb_restore_reserve(folio), folio); hugetlb_delete_from_page_cache(folio); if (!truncate_op) { if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1))) hugetlb_fix_reserve_counts(inode); } folio_unlock(folio); } /* * remove_inode_hugepages handles two distinct cases: truncation and hole * punch. There are subtle differences in operation for each case. * * truncation is indicated by end of range being LLONG_MAX * In this case, we first scan the range and release found pages. * After releasing pages, hugetlb_unreserve_pages cleans up region/reserve * maps and global counts. Page faults can race with truncation. * During faults, hugetlb_no_page() checks i_size before page allocation, * and again after obtaining page table lock. It will 'back out' * allocations in the truncated range. * hole punch is indicated if end is not LLONG_MAX * In the hole punch case we scan the range and release found pages. * Only when releasing a page is the associated region/reserve map * deleted. The region/reserve map for ranges without associated * pages are not modified. Page faults can race with hole punch. * This is indicated if we find a mapped page. * Note: If the passed end of range value is beyond the end of file, but * not LLONG_MAX this routine still performs a hole punch operation. */ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, loff_t lend) { struct hstate *h = hstate_inode(inode); struct address_space *mapping = &inode->i_data; const pgoff_t end = lend >> PAGE_SHIFT; struct folio_batch fbatch; pgoff_t next, index; int i, freed = 0; bool truncate_op = (lend == LLONG_MAX); folio_batch_init(&fbatch); next = lstart >> PAGE_SHIFT; while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) { for (i = 0; i < folio_batch_count(&fbatch); ++i) { struct folio *folio = fbatch.folios[i]; u32 hash = 0; index = folio->index >> huge_page_order(h); hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* * Remove folio that was part of folio_batch. 
*/ remove_inode_single_folio(h, inode, mapping, folio, index, truncate_op); freed++; mutex_unlock(&hugetlb_fault_mutex_table[hash]); } folio_batch_release(&fbatch); cond_resched(); } if (truncate_op) (void)hugetlb_unreserve_pages(inode, lstart >> huge_page_shift(h), LONG_MAX, freed); } static void hugetlbfs_evict_inode(struct inode *inode) { struct resv_map *resv_map; trace_hugetlbfs_evict_inode(inode); remove_inode_hugepages(inode, 0, LLONG_MAX); resv_map = HUGETLBFS_I(inode)->resv_map; /* Only regular and link inodes have associated reserve maps */ if (resv_map) resv_map_release(&resv_map->refs); clear_inode(inode); } static void hugetlb_vmtruncate(struct inode *inode, loff_t offset) { pgoff_t pgoff; struct address_space *mapping = inode->i_mapping; struct hstate *h = hstate_inode(inode); BUG_ON(offset & ~huge_page_mask(h)); pgoff = offset >> PAGE_SHIFT; i_size_write(inode, offset); i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0, ZAP_FLAG_DROP_MARKER); i_mmap_unlock_write(mapping); remove_inode_hugepages(inode, offset, LLONG_MAX); } static void hugetlbfs_zero_partial_page(struct hstate *h, struct address_space *mapping, loff_t start, loff_t end) { pgoff_t idx = start >> huge_page_shift(h); struct folio *folio; folio = filemap_lock_hugetlb_folio(h, mapping, idx); if (IS_ERR(folio)) return; start = start & ~huge_page_mask(h); end = end & ~huge_page_mask(h); if (!end) end = huge_page_size(h); folio_zero_segment(folio, (size_t)start, (size_t)end); folio_unlock(folio); folio_put(folio); } static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) { struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); struct address_space *mapping = inode->i_mapping; struct hstate *h = hstate_inode(inode); loff_t hpage_size = huge_page_size(h); loff_t hole_start, hole_end; /* * hole_start and hole_end indicate the full pages within the hole. */ hole_start = round_up(offset, hpage_size); hole_end = round_down(offset + len, hpage_size); inode_lock(inode); /* protected by i_rwsem */ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { inode_unlock(inode); return -EPERM; } i_mmap_lock_write(mapping); /* If range starts before first full page, zero partial page. */ if (offset < hole_start) hugetlbfs_zero_partial_page(h, mapping, offset, min(offset + len, hole_start)); /* Unmap users of full pages in the hole. */ if (hole_end > hole_start) { if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, hole_start >> PAGE_SHIFT, hole_end >> PAGE_SHIFT, 0); } /* If range extends beyond last full page, zero partial page. */ if ((offset + len) > hole_end && (offset + len) > hole_start) hugetlbfs_zero_partial_page(h, mapping, hole_end, offset + len); i_mmap_unlock_write(mapping); /* Remove full pages from the file. 
*/ if (hole_end > hole_start) remove_inode_hugepages(inode, hole_start, hole_end); inode_unlock(inode); return 0; } static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); struct address_space *mapping = inode->i_mapping; struct hstate *h = hstate_inode(inode); struct vm_area_struct pseudo_vma; struct mm_struct *mm = current->mm; loff_t hpage_size = huge_page_size(h); unsigned long hpage_shift = huge_page_shift(h); pgoff_t start, index, end; int error; u32 hash; if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) { error = hugetlbfs_punch_hole(inode, offset, len); goto out_nolock; } /* * Default preallocate case. * For this range, start is rounded down and end is rounded up * as well as being converted to page offsets. */ start = offset >> hpage_shift; end = (offset + len + hpage_size - 1) >> hpage_shift; inode_lock(inode); /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ error = inode_newsize_ok(inode, offset + len); if (error) goto out; if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { error = -EPERM; goto out; } /* * Initialize a pseudo vma as this is required by the huge page * allocation routines. */ vma_init(&pseudo_vma, mm); vm_flags_init(&pseudo_vma, VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pseudo_vma.vm_file = file; for (index = start; index < end; index++) { /* * This is supposed to be the vaddr where the page is being * faulted in, but we have no vaddr here. */ struct folio *folio; unsigned long addr; cond_resched(); /* * fallocate(2) manpage permits EINTR; we may have been * interrupted because we are using up too much memory. */ if (signal_pending(current)) { error = -EINTR; break; } /* addr is the offset within the file (zero based) */ addr = index * hpage_size; /* mutex taken here, fault path and hole punch */ hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* See if already present in mapping to avoid alloc/free */ folio = filemap_get_folio(mapping, index << huge_page_order(h)); if (!IS_ERR(folio)) { folio_put(folio); mutex_unlock(&hugetlb_fault_mutex_table[hash]); continue; } /* * Allocate folio without setting the avoid_reserve argument. * There certainly are no reserves associated with the * pseudo_vma. However, there could be shared mappings with * reserves for the file at the inode level. If we fallocate * folios in these areas, we need to consume the reserves * to keep reservation accounting consistent. 
*/ folio = alloc_hugetlb_folio(&pseudo_vma, addr, false); if (IS_ERR(folio)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); error = PTR_ERR(folio); goto out; } folio_zero_user(folio, addr); __folio_mark_uptodate(folio); error = hugetlb_add_to_page_cache(folio, mapping, index); if (unlikely(error)) { restore_reserve_on_error(h, &pseudo_vma, addr, folio); folio_put(folio); mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out; } mutex_unlock(&hugetlb_fault_mutex_table[hash]); folio_set_hugetlb_migratable(folio); /* * folio_unlock because locked by hugetlb_add_to_page_cache() * folio_put() due to reference from alloc_hugetlb_folio() */ folio_unlock(folio); folio_put(folio); } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) i_size_write(inode, offset + len); inode_set_ctime_current(inode); out: inode_unlock(inode); out_nolock: trace_hugetlbfs_fallocate(inode, mode, offset, len, error); return error; } static int hugetlbfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct hstate *h = hstate_inode(inode); int error; unsigned int ia_valid = attr->ia_valid; struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); error = setattr_prepare(idmap, dentry, attr); if (error) return error; trace_hugetlbfs_setattr(inode, dentry, attr); if (ia_valid & ATTR_SIZE) { loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; if (newsize & ~huge_page_mask(h)) return -EINVAL; /* protected by i_rwsem */ if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || (newsize > oldsize && (info->seals & F_SEAL_GROW))) return -EPERM; hugetlb_vmtruncate(inode, newsize); } setattr_copy(idmap, inode, attr); mark_inode_dirty(inode); return 0; } static struct inode *hugetlbfs_get_root(struct super_block *sb, struct hugetlbfs_fs_context *ctx) { struct inode *inode; inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); inode->i_mode = S_IFDIR | ctx->mode; inode->i_uid = ctx->uid; inode->i_gid = ctx->gid; simple_inode_init_ts(inode); inode->i_op = &hugetlbfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; HUGETLBFS_I(inode)->resv_map = NULL; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); lockdep_annotate_inode_mutex_key(inode); } return inode; } /* * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never * be taken from reclaim -- unlike regular filesystems. This needs an * annotation because huge_pmd_share() does an allocation under hugetlb's * i_mmap_rwsem. */ static struct lock_class_key hugetlbfs_i_mmap_rwsem_key; static struct inode *hugetlbfs_get_inode(struct super_block *sb, struct mnt_idmap *idmap, struct inode *dir, umode_t mode, dev_t dev) { struct inode *inode; struct resv_map *resv_map = NULL; /* * Reserve maps are only needed for inodes that can have associated * page allocations. 
*/ if (S_ISREG(mode) || S_ISLNK(mode)) { resv_map = resv_map_alloc(); if (!resv_map) return NULL; } inode = new_inode(sb); if (inode) { struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); inode->i_ino = get_next_ino(); inode_init_owner(idmap, inode, dir, mode); lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; simple_inode_init_ts(inode); info->resv_map = resv_map; info->seals = F_SEAL_SEAL; switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); break; case S_IFREG: inode->i_op = &hugetlbfs_inode_operations; inode->i_fop = &hugetlbfs_file_operations; break; case S_IFDIR: inode->i_op = &hugetlbfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); break; case S_IFLNK: inode->i_op = &page_symlink_inode_operations; inode_nohighmem(inode); break; } lockdep_annotate_inode_mutex_key(inode); trace_hugetlbfs_alloc_inode(inode, dir, mode); } else { if (resv_map) kref_put(&resv_map->refs, resv_map_release); } return inode; } /* * File creation. Allocate an inode, and we're done.. */ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, dev); if (!inode) return -ENOSPC; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); d_make_persistent(dentry, inode); return 0; } static struct dentry *hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int retval = hugetlbfs_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); if (!retval) inc_nlink(dir); return ERR_PTR(retval); } static int hugetlbfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return hugetlbfs_mknod(idmap, dir, dentry, mode | S_IFREG, 0); } static int hugetlbfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode; inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode | S_IFREG, 0); if (!inode) return -ENOSPC; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); d_tmpfile(file, inode); return finish_open_simple(file, 0); } static int hugetlbfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { const umode_t mode = S_IFLNK|S_IRWXUGO; struct inode *inode; int error = -ENOSPC; inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, 0); if (inode) { int l = strlen(symname)+1; error = page_symlink(inode, symname, l); if (!error) d_make_persistent(dentry, inode); else iput(inode); } inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); return error; } #ifdef CONFIG_MIGRATION static int hugetlbfs_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { int rc; rc = migrate_huge_page_move_mapping(mapping, dst, src); if (rc) return rc; if (hugetlb_folio_subpool(src)) { hugetlb_set_folio_subpool(dst, hugetlb_folio_subpool(src)); hugetlb_set_folio_subpool(src, NULL); } folio_migrate_flags(dst, src); return 0; } #else #define hugetlbfs_migrate_folio NULL #endif static int hugetlbfs_error_remove_folio(struct address_space *mapping, struct folio *folio) { return 0; } /* * Display the mount options in /proc/mounts. 
*/ static int hugetlbfs_show_options(struct seq_file *m, struct dentry *root) { struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(root->d_sb); struct hugepage_subpool *spool = sbinfo->spool; unsigned long hpage_size = huge_page_size(sbinfo->hstate); unsigned hpage_shift = huge_page_shift(sbinfo->hstate); char mod; if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) seq_printf(m, ",uid=%u", from_kuid_munged(&init_user_ns, sbinfo->uid)); if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, sbinfo->gid)); if (sbinfo->mode != 0755) seq_printf(m, ",mode=%o", sbinfo->mode); if (sbinfo->max_inodes != -1) seq_printf(m, ",nr_inodes=%lu", sbinfo->max_inodes); hpage_size /= 1024; mod = 'K'; if (hpage_size >= 1024) { hpage_size /= 1024; mod = 'M'; } seq_printf(m, ",pagesize=%lu%c", hpage_size, mod); if (spool) { if (spool->max_hpages != -1) seq_printf(m, ",size=%llu", (unsigned long long)spool->max_hpages << hpage_shift); if (spool->min_hpages != -1) seq_printf(m, ",min_size=%llu", (unsigned long long)spool->min_hpages << hpage_shift); } return 0; } static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); struct hstate *h = hstate_inode(d_inode(dentry)); u64 id = huge_encode_dev(dentry->d_sb->s_dev); buf->f_fsid = u64_to_fsid(id); buf->f_type = HUGETLBFS_MAGIC; buf->f_bsize = huge_page_size(h); if (sbinfo) { spin_lock(&sbinfo->stat_lock); /* If no limits set, just report 0 or -1 for max/free/used * blocks, like simple_statfs() */ if (sbinfo->spool) { long free_pages; spin_lock_irq(&sbinfo->spool->lock); buf->f_blocks = sbinfo->spool->max_hpages; free_pages = sbinfo->spool->max_hpages - sbinfo->spool->used_hpages; buf->f_bavail = buf->f_bfree = free_pages; spin_unlock_irq(&sbinfo->spool->lock); buf->f_files = sbinfo->max_inodes; buf->f_ffree = sbinfo->free_inodes; } spin_unlock(&sbinfo->stat_lock); } buf->f_namelen = NAME_MAX; return 0; } static void hugetlbfs_put_super(struct super_block *sb) { struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb); if (sbi) { sb->s_fs_info = NULL; if (sbi->spool) hugepage_put_subpool(sbi->spool); kfree(sbi); } } static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo) { if (sbinfo->free_inodes >= 0) { spin_lock(&sbinfo->stat_lock); if (unlikely(!sbinfo->free_inodes)) { spin_unlock(&sbinfo->stat_lock); return 0; } sbinfo->free_inodes--; spin_unlock(&sbinfo->stat_lock); } return 1; } static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo) { if (sbinfo->free_inodes >= 0) { spin_lock(&sbinfo->stat_lock); sbinfo->free_inodes++; spin_unlock(&sbinfo->stat_lock); } } static struct kmem_cache *hugetlbfs_inode_cachep; static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) { struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb); struct hugetlbfs_inode_info *p; if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo))) return NULL; p = alloc_inode_sb(sb, hugetlbfs_inode_cachep, GFP_KERNEL); if (unlikely(!p)) { hugetlbfs_inc_free_inodes(sbinfo); return NULL; } return &p->vfs_inode; } static void hugetlbfs_free_inode(struct inode *inode) { trace_hugetlbfs_free_inode(inode); kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); } static void hugetlbfs_destroy_inode(struct inode *inode) { hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); } static const struct address_space_operations hugetlbfs_aops = { .write_begin = hugetlbfs_write_begin, .write_end = hugetlbfs_write_end, .dirty_folio = noop_dirty_folio, .migrate_folio = 
hugetlbfs_migrate_folio, .error_remove_folio = hugetlbfs_error_remove_folio, }; static void init_once(void *foo) { struct hugetlbfs_inode_info *ei = foo; inode_init_once(&ei->vfs_inode); } static const struct file_operations hugetlbfs_file_operations = { .read_iter = hugetlbfs_read_iter, .mmap_prepare = hugetlbfs_file_mmap_prepare, .fsync = noop_fsync, .get_unmapped_area = hugetlb_get_unmapped_area, .llseek = default_llseek, .fallocate = hugetlbfs_fallocate, .fop_flags = FOP_HUGE_PAGES, }; static const struct inode_operations hugetlbfs_dir_inode_operations = { .create = hugetlbfs_create, .lookup = simple_lookup, .link = simple_link, .unlink = simple_unlink, .symlink = hugetlbfs_symlink, .mkdir = hugetlbfs_mkdir, .rmdir = simple_rmdir, .mknod = hugetlbfs_mknod, .rename = simple_rename, .setattr = hugetlbfs_setattr, .tmpfile = hugetlbfs_tmpfile, }; static const struct inode_operations hugetlbfs_inode_operations = { .setattr = hugetlbfs_setattr, }; static const struct super_operations hugetlbfs_ops = { .alloc_inode = hugetlbfs_alloc_inode, .free_inode = hugetlbfs_free_inode, .destroy_inode = hugetlbfs_destroy_inode, .evict_inode = hugetlbfs_evict_inode, .statfs = hugetlbfs_statfs, .put_super = hugetlbfs_put_super, .show_options = hugetlbfs_show_options, }; /* * Convert size option passed from command line to number of huge pages * in the pool specified by hstate. Size option could be in bytes * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT). */ static long hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt, enum hugetlbfs_size_type val_type) { if (val_type == NO_SIZE) return -1; if (val_type == SIZE_PERCENT) { size_opt <<= huge_page_shift(h); size_opt *= h->max_huge_pages; do_div(size_opt, 100); } size_opt >>= huge_page_shift(h); return size_opt; } /* * Parse one mount parameter. */ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct hugetlbfs_fs_context *ctx = fc->fs_private; struct fs_parse_result result; struct hstate *h; char *rest; unsigned long ps; int opt; opt = fs_parse(fc, hugetlb_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_uid: ctx->uid = result.uid; return 0; case Opt_gid: ctx->gid = result.gid; return 0; case Opt_mode: ctx->mode = result.uint_32 & 01777U; return 0; case Opt_size: /* memparse() will accept a K/M/G without a digit */ if (!param->string || !isdigit(param->string[0])) goto bad_val; ctx->max_size_opt = memparse(param->string, &rest); ctx->max_val_type = SIZE_STD; if (*rest == '%') ctx->max_val_type = SIZE_PERCENT; return 0; case Opt_nr_inodes: /* memparse() will accept a K/M/G without a digit */ if (!param->string || !isdigit(param->string[0])) goto bad_val; ctx->nr_inodes = memparse(param->string, &rest); return 0; case Opt_pagesize: ps = memparse(param->string, &rest); h = size_to_hstate(ps); if (!h) { pr_err("Unsupported page size %lu MB\n", ps / SZ_1M); return -EINVAL; } ctx->hstate = h; return 0; case Opt_min_size: /* memparse() will accept a K/M/G without a digit */ if (!param->string || !isdigit(param->string[0])) goto bad_val; ctx->min_size_opt = memparse(param->string, &rest); ctx->min_val_type = SIZE_STD; if (*rest == '%') ctx->min_val_type = SIZE_PERCENT; return 0; default: return -EINVAL; } bad_val: return invalfc(fc, "Bad value '%s' for mount option '%s'\n", param->string, param->key); } /* * Validate the parsed options. 
*/ static int hugetlbfs_validate(struct fs_context *fc) { struct hugetlbfs_fs_context *ctx = fc->fs_private; /* * Use huge page pool size (in hstate) to convert the size * options to number of huge pages. If NO_SIZE, -1 is returned. */ ctx->max_hpages = hugetlbfs_size_to_hpages(ctx->hstate, ctx->max_size_opt, ctx->max_val_type); ctx->min_hpages = hugetlbfs_size_to_hpages(ctx->hstate, ctx->min_size_opt, ctx->min_val_type); /* * If max_size was specified, then min_size must be smaller */ if (ctx->max_val_type > NO_SIZE && ctx->min_hpages > ctx->max_hpages) { pr_err("Minimum size can not be greater than maximum size\n"); return -EINVAL; } return 0; } static int hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct hugetlbfs_fs_context *ctx = fc->fs_private; struct hugetlbfs_sb_info *sbinfo; sbinfo = kmalloc_obj(struct hugetlbfs_sb_info); if (!sbinfo) return -ENOMEM; sb->s_fs_info = sbinfo; spin_lock_init(&sbinfo->stat_lock); sbinfo->hstate = ctx->hstate; sbinfo->max_inodes = ctx->nr_inodes; sbinfo->free_inodes = ctx->nr_inodes; sbinfo->spool = NULL; sbinfo->uid = ctx->uid; sbinfo->gid = ctx->gid; sbinfo->mode = ctx->mode; /* * Allocate and initialize subpool if maximum or minimum size is * specified. Any needed reservations (for minimum size) are taken * when the subpool is created. */ if (ctx->max_hpages != -1 || ctx->min_hpages != -1) { sbinfo->spool = hugepage_new_subpool(ctx->hstate, ctx->max_hpages, ctx->min_hpages); if (!sbinfo->spool) goto out_free; } sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = huge_page_size(ctx->hstate); sb->s_blocksize_bits = huge_page_shift(ctx->hstate); sb->s_magic = HUGETLBFS_MAGIC; sb->s_op = &hugetlbfs_ops; sb->s_d_flags = DCACHE_DONTCACHE; sb->s_time_gran = 1; /* * Due to the special and limited functionality of hugetlbfs, it does * not work well as a stacking filesystem. 
*/ sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx)); if (!sb->s_root) goto out_free; return 0; out_free: kfree(sbinfo->spool); kfree(sbinfo); return -ENOMEM; } static int hugetlbfs_get_tree(struct fs_context *fc) { int err = hugetlbfs_validate(fc); if (err) return err; return get_tree_nodev(fc, hugetlbfs_fill_super); } static void hugetlbfs_fs_context_free(struct fs_context *fc) { kfree(fc->fs_private); } static const struct fs_context_operations hugetlbfs_fs_context_ops = { .free = hugetlbfs_fs_context_free, .parse_param = hugetlbfs_parse_param, .get_tree = hugetlbfs_get_tree, }; static int hugetlbfs_init_fs_context(struct fs_context *fc) { struct hugetlbfs_fs_context *ctx; ctx = kzalloc_obj(struct hugetlbfs_fs_context); if (!ctx) return -ENOMEM; ctx->max_hpages = -1; /* No limit on size by default */ ctx->nr_inodes = -1; /* No limit on number of inodes by default */ ctx->uid = current_fsuid(); ctx->gid = current_fsgid(); ctx->mode = 0755; ctx->hstate = &default_hstate; ctx->min_hpages = -1; /* No default minimum size */ ctx->max_val_type = NO_SIZE; ctx->min_val_type = NO_SIZE; fc->fs_private = ctx; fc->ops = &hugetlbfs_fs_context_ops; return 0; } static struct file_system_type hugetlbfs_fs_type = { .name = "hugetlbfs", .init_fs_context = hugetlbfs_init_fs_context, .parameters = hugetlb_fs_parameters, .kill_sb = kill_anon_super, .fs_flags = FS_ALLOW_IDMAP, }; static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; static int can_do_hugetlb_shm(void) { kgid_t shm_group; shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group); return capable(CAP_IPC_LOCK) || in_group_p(shm_group); } static int get_hstate_idx(int page_size_log) { struct hstate *h = hstate_sizelog(page_size_log); if (!h) return -1; return hstate_index(h); } /* * Note that size should be aligned to proper hugepage size in caller side, * otherwise hugetlb_reserve_pages reserves one less hugepages than intended. */ struct file *hugetlb_file_setup(const char *name, size_t size, vma_flags_t acctflag, int creat_flags, int page_size_log) { struct inode *inode; struct vfsmount *mnt; int hstate_idx; struct file *file; hstate_idx = get_hstate_idx(page_size_log); if (hstate_idx < 0) return ERR_PTR(-ENODEV); mnt = hugetlbfs_vfsmount[hstate_idx]; if (!mnt) return ERR_PTR(-ENOENT); if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { struct ucounts *ucounts = current_ucounts(); if (user_shm_lock(size, ucounts)) { pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is obsolete\n", current->comm, current->pid); user_shm_unlock(size, ucounts); } return ERR_PTR(-EPERM); } file = ERR_PTR(-ENOSPC); /* hugetlbfs_vfsmount[] mounts do not use idmapped mounts. 
*/ inode = hugetlbfs_get_inode(mnt->mnt_sb, &nop_mnt_idmap, NULL, S_IFREG | S_IRWXUGO, 0); if (!inode) goto out; if (creat_flags == HUGETLB_SHMFS_INODE) inode->i_flags |= S_PRIVATE; inode->i_size = size; clear_nlink(inode); if (hugetlb_reserve_pages(inode, 0, size >> huge_page_shift(hstate_inode(inode)), NULL, acctflag) < 0) file = ERR_PTR(-ENOMEM); else file = alloc_file_pseudo(inode, mnt, name, O_RDWR, &hugetlbfs_file_operations); if (!IS_ERR(file)) return file; iput(inode); out: return file; } static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h) { struct fs_context *fc; struct vfsmount *mnt; fc = fs_context_for_mount(&hugetlbfs_fs_type, SB_KERNMOUNT); if (IS_ERR(fc)) { mnt = ERR_CAST(fc); } else { struct hugetlbfs_fs_context *ctx = fc->fs_private; ctx->hstate = h; mnt = fc_mount_longterm(fc); put_fs_context(fc); } if (IS_ERR(mnt)) pr_err("Cannot mount internal hugetlbfs for page size %luK", huge_page_size(h) / SZ_1K); return mnt; } static int __init init_hugetlbfs_fs(void) { struct vfsmount *mnt; struct hstate *h; int error; int i; if (!hugepages_supported()) { pr_info("disabling because there are no supported hugepage sizes\n"); return -ENOTSUPP; } error = -ENOMEM; hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", sizeof(struct hugetlbfs_inode_info), 0, SLAB_ACCOUNT, init_once); if (hugetlbfs_inode_cachep == NULL) goto out; error = register_filesystem(&hugetlbfs_fs_type); if (error) goto out_free; /* default hstate mount is required */ mnt = mount_one_hugetlbfs(&default_hstate); if (IS_ERR(mnt)) { error = PTR_ERR(mnt); goto out_unreg; } hugetlbfs_vfsmount[default_hstate_idx] = mnt; /* other hstates are optional */ i = 0; for_each_hstate(h) { if (i == default_hstate_idx) { i++; continue; } mnt = mount_one_hugetlbfs(h); if (IS_ERR(mnt)) hugetlbfs_vfsmount[i] = NULL; else hugetlbfs_vfsmount[i] = mnt; i++; } return 0; out_unreg: (void)unregister_filesystem(&hugetlbfs_fs_type); out_free: kmem_cache_destroy(hugetlbfs_inode_cachep); out: return error; } fs_initcall(init_hugetlbfs_fs) |
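As a worked example of the size-option conversion performed by hugetlbfs_size_to_hpages() above, the standalone sketch below reproduces the SIZE_PERCENT arithmetic for an assumed "size=50%" mount on 2 MiB pages with a 512-page pool; the concrete numbers are illustrative assumptions, not values from the source.

/*
 * Standalone sketch of the SIZE_PERCENT path: the percentage is first
 * expressed in bytes of the whole pool, then converted back to a number
 * of huge pages.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long size_opt = 50;	/* "size=50%" */
	unsigned int huge_page_shift = 21;	/* assumed 2 MiB huge pages */
	unsigned long long max_huge_pages = 512; /* assumed pool size */

	size_opt <<= huge_page_shift;	/* percent -> "bytes per 100 pools" */
	size_opt *= max_huge_pages;
	size_opt /= 100;		/* bytes requested */

	size_opt >>= huge_page_shift;	/* bytes -> huge pages */

	printf("%llu huge pages\n", size_opt);	/* prints: 256 */
	return 0;
}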
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * INETPEER - A storage for permanent information about peers
 *
 * Authors:	Andrey V. Savochkin <saw@msu.ru>
 */

#ifndef _NET_INETPEER_H
#define _NET_INETPEER_H

#include <linux/types.h>
#include <linux/init.h>
#include <linux/jiffies.h>
#include <linux/spinlock.h>
#include <linux/rtnetlink.h>
#include <net/ipv6.h>
#include <linux/atomic.h>

/* IPv4 address key for cache lookups */
struct ipv4_addr_key {
	__be32	addr;
	int	vif;
};

#define INETPEER_MAXKEYSZ	(sizeof(struct in6_addr) / sizeof(u32))

struct inetpeer_addr {
	union {
		struct ipv4_addr_key	a4;
		struct in6_addr		a6;
		u32			key[INETPEER_MAXKEYSZ];
	};
	__u16	family;
};

struct inet_peer {
	struct rb_node		rb_node;
	struct inetpeer_addr	daddr;

	u32			metrics[RTAX_MAX];
	u32			rate_tokens;	/* rate limiting for ICMP */
	u32			n_redirects;
	unsigned long		rate_last;
	/*
	 * Once inet_peer is queued for deletion (refcnt == 0), following field
	 * is not available: rid
	 * We can share memory with rcu_head to help keep inet_peer small.
	 */
	union {
		struct {
			atomic_t	rid;	/* Frag reception counter */
		};
		struct rcu_head		rcu;
	};

	/* following fields might be frequently dirtied */
	__u32			dtime;	/* the time of last use of not referenced entries */
	refcount_t		refcnt;
};

struct inet_peer_base {
	struct rb_root	rb_root;
	seqlock_t	lock;
	int		total;
};

void inet_peer_base_init(struct inet_peer_base *);

void inet_initpeers(void) __init;

#define INETPEER_METRICS_NEW	(~(u32) 0)

static inline void inetpeer_set_addr_v4(struct inetpeer_addr *iaddr, __be32 ip)
{
	iaddr->a4.addr = ip;
	iaddr->a4.vif = 0;
	iaddr->family = AF_INET;
}

static inline __be32 inetpeer_get_addr_v4(struct inetpeer_addr *iaddr)
{
	return iaddr->a4.addr;
}

static inline void inetpeer_set_addr_v6(struct inetpeer_addr *iaddr,
					struct in6_addr *in6)
{
	iaddr->a6 = *in6;
	iaddr->family = AF_INET6;
}

static inline struct in6_addr *inetpeer_get_addr_v6(struct inetpeer_addr *iaddr)
{
	return &iaddr->a6;
}

/* can be called with or without local BH being disabled */
struct inet_peer *inet_getpeer(struct inet_peer_base *base,
			       const struct inetpeer_addr *daddr);

static inline struct inet_peer *inet_getpeer_v4(struct inet_peer_base *base,
						__be32 v4daddr, int vif)
{
	struct inetpeer_addr daddr;

	daddr.a4.addr = v4daddr;
	daddr.a4.vif = vif;
	daddr.family = AF_INET;
	return inet_getpeer(base, &daddr);
}

static inline struct inet_peer *inet_getpeer_v6(struct inet_peer_base *base,
						const struct in6_addr *v6daddr)
{
	struct inetpeer_addr daddr;

	daddr.a6 = *v6daddr;
	daddr.family = AF_INET6;
	return inet_getpeer(base, &daddr);
}

static inline int inetpeer_addr_cmp(const struct inetpeer_addr *a,
				    const struct inetpeer_addr *b)
{
	int i, n;

	if (a->family == AF_INET)
		n = sizeof(a->a4) / sizeof(u32);
	else
		n = sizeof(a->a6) / sizeof(u32);

	for (i = 0; i < n; i++) {
		if (a->key[i] == b->key[i])
			continue;
		if (a->key[i] < b->key[i])
			return -1;
		return 1;
	}

	return 0;
}

/* can be called from BH context or outside */
void inet_putpeer(struct inet_peer *p);
bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout);

void inetpeer_invalidate_tree(struct inet_peer_base *);

#endif /* _NET_INETPEER_H */
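A hedged sketch of how a hypothetical caller might combine the lookup and rate-limit helpers declared above; the function name example_icmp_allowed and the permissive default when no peer entry exists are assumptions for illustration, not part of the header.

/*
 * Assumed caller: ICMP-style rate limiting via the inetpeer cache.  The peer
 * is looked up by IPv4 address, inet_peer_xrlim_allow() consumes a token if
 * one is available, and inet_putpeer() drops the lookup reference.
 */
#include <net/inetpeer.h>

static bool example_icmp_allowed(struct inet_peer_base *base,
				 __be32 daddr, int vif, int timeout)
{
	struct inet_peer *peer;
	bool allow = true;	/* assumed: be permissive without a peer entry */

	peer = inet_getpeer_v4(base, daddr, vif);
	if (peer) {
		allow = inet_peer_xrlim_allow(peer, timeout);
		inet_putpeer(peer);
	}
	return allow;
}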
1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */

#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/bpf_verifier.h>
#include <linux/filter.h>
#include <linux/vmalloc.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
#include <linux/perf_event.h>
#include <net/xdp.h>

#include "disasm.h"

#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)

static bool is_cmpxchg_insn(const struct bpf_insn *insn)
{
	return BPF_CLASS(insn->code) == BPF_STX &&
	       BPF_MODE(insn->code) == BPF_ATOMIC &&
	       insn->imm == BPF_CMPXCHG;
}

/* Return the regno defined by the insn, or -1. */
static int insn_def_regno(const struct bpf_insn *insn)
{
	switch (BPF_CLASS(insn->code)) {
	case BPF_JMP:
	case BPF_JMP32:
	case BPF_ST:
		return -1;
	case BPF_STX:
		if (BPF_MODE(insn->code) == BPF_ATOMIC ||
		    BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) {
			if (insn->imm == BPF_CMPXCHG)
				return BPF_REG_0;
			else if (insn->imm == BPF_LOAD_ACQ)
				return insn->dst_reg;
			else if (insn->imm & BPF_FETCH)
				return insn->src_reg;
		}
		return -1;
	default:
		return insn->dst_reg;
	}
}

/* Return TRUE if INSN has defined any 32-bit value explicitly. */
static bool insn_has_def32(struct bpf_insn *insn)
{
	int dst_reg = insn_def_regno(insn);

	if (dst_reg == -1)
		return false;

	return !bpf_is_reg64(insn, dst_reg, NULL, DST_OP);
}

static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b)
{
	const struct bpf_kfunc_desc *d0 = a;
	const struct bpf_kfunc_desc *d1 = b;

	if (d0->imm != d1->imm)
		return d0->imm < d1->imm ? -1 : 1;
	if (d0->offset != d1->offset)
		return d0->offset < d1->offset ? -1 : 1;
	return 0;
}

const struct btf_func_model *
bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
			 const struct bpf_insn *insn)
{
	const struct bpf_kfunc_desc desc = {
		.imm = insn->imm,
		.offset = insn->off,
	};
	const struct bpf_kfunc_desc *res;
	struct bpf_kfunc_desc_tab *tab;

	tab = prog->aux->kfunc_tab;
	res = bsearch(&desc, tab->descs, tab->nr_descs,
		      sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm_off);
	return res ?
&res->func_model : NULL; } static int set_kfunc_desc_imm(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc) { unsigned long call_imm; if (bpf_jit_supports_far_kfunc_call()) { call_imm = desc->func_id; } else { call_imm = BPF_CALL_IMM(desc->addr); /* Check whether the relative offset overflows desc->imm */ if ((unsigned long)(s32)call_imm != call_imm) { verbose(env, "address of kernel func_id %u is out of range\n", desc->func_id); return -EINVAL; } } desc->imm = call_imm; return 0; } static int sort_kfunc_descs_by_imm_off(struct bpf_verifier_env *env) { struct bpf_kfunc_desc_tab *tab; int i, err; tab = env->prog->aux->kfunc_tab; if (!tab) return 0; for (i = 0; i < tab->nr_descs; i++) { err = set_kfunc_desc_imm(env, &tab->descs[i]); if (err) return err; } sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm_off, NULL); return 0; } static int add_kfunc_in_insns(struct bpf_verifier_env *env, struct bpf_insn *insn, int cnt) { int i, ret; for (i = 0; i < cnt; i++, insn++) { if (bpf_pseudo_kfunc_call(insn)) { ret = bpf_add_kfunc_call(env, insn->imm, insn->off); if (ret < 0) return ret; } } return 0; } #ifndef CONFIG_BPF_JIT_ALWAYS_ON static int get_callee_stack_depth(struct bpf_verifier_env *env, const struct bpf_insn *insn, int idx) { int start = idx + insn->imm + 1, subprog; subprog = bpf_find_subprog(env, start); if (verifier_bug_if(subprog < 0, env, "get stack depth: no program at insn %d", start)) return -EFAULT; return env->subprog_info[subprog].stack_depth; } #endif /* single env->prog->insni[off] instruction was replaced with the range * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying * [0, off) and [off, end) to new locations, so the patched range stays zero */ static void adjust_insn_aux_data(struct bpf_verifier_env *env, struct bpf_prog *new_prog, u32 off, u32 cnt) { struct bpf_insn_aux_data *data = env->insn_aux_data; struct bpf_insn *insn = new_prog->insnsi; u32 old_seen = data[off].seen; u32 prog_len; int i; /* aux info at OFF always needs adjustment, no matter fast path * (cnt == 1) is taken or not. There is no guarantee INSN at OFF is the * original insn at old prog. */ data[off].zext_dst = insn_has_def32(insn + off + cnt - 1); if (cnt == 1) return; prog_len = new_prog->len; memmove(data + off + cnt - 1, data + off, sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); memset(data + off, 0, sizeof(struct bpf_insn_aux_data) * (cnt - 1)); for (i = off; i < off + cnt - 1; i++) { /* Expand insni[off]'s seen count to the patched range. */ data[i].seen = old_seen; data[i].zext_dst = insn_has_def32(insn + i); } /* * The indirect_target flag of the original instruction was moved to the last of the * new instructions by the above memmove and memset, but the indirect jump target is * actually the first instruction, so move it back. This also matches with the behavior * of bpf_insn_array_adjust(), which preserves xlated_off to point to the first new * instruction. */ if (data[off + cnt - 1].indirect_target) { data[off].indirect_target = 1; data[off + cnt - 1].indirect_target = 0; } } static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len) { int i; if (len == 1) return; /* NOTE: fake 'exit' subprog should be updated as well. 
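 * For example, if the single insn at off == 10 was patched with a
 * 3-insn sequence (len == 3), every subprog whose start lies past
 * insn 10 -- including the fake 'exit' subprog at index subprog_cnt --
 * is shifted forward by len - 1 == 2 instructions below.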
*/ for (i = 0; i <= env->subprog_cnt; i++) { if (env->subprog_info[i].start <= off) continue; env->subprog_info[i].start += len - 1; } } static void adjust_insn_arrays(struct bpf_verifier_env *env, u32 off, u32 len) { int i; if (len == 1) return; for (i = 0; i < env->insn_array_map_cnt; i++) bpf_insn_array_adjust(env->insn_array_maps[i], off, len); } static void adjust_insn_arrays_after_remove(struct bpf_verifier_env *env, u32 off, u32 len) { int i; for (i = 0; i < env->insn_array_map_cnt; i++) bpf_insn_array_adjust_after_remove(env->insn_array_maps[i], off, len); } static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len) { struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab; int i, sz = prog->aux->size_poke_tab; struct bpf_jit_poke_descriptor *desc; for (i = 0; i < sz; i++) { desc = &tab[i]; if (desc->insn_idx <= off) continue; desc->insn_idx += len - 1; } } struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, const struct bpf_insn *patch, u32 len) { struct bpf_prog *new_prog; struct bpf_insn_aux_data *new_data = NULL; if (len > 1) { new_data = vrealloc(env->insn_aux_data, array_size(env->prog->len + len - 1, sizeof(struct bpf_insn_aux_data)), GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!new_data) return NULL; env->insn_aux_data = new_data; } new_prog = bpf_patch_insn_single(env->prog, off, patch, len); if (IS_ERR(new_prog)) { if (PTR_ERR(new_prog) == -ERANGE) verbose(env, "insn %d cannot be patched due to 16-bit range\n", env->insn_aux_data[off].orig_idx); return NULL; } adjust_insn_aux_data(env, new_prog, off, len); adjust_subprog_starts(env, off, len); adjust_insn_arrays(env, off, len); adjust_poke_descs(new_prog, off, len); return new_prog; } /* * For all jmp insns in a given 'prog' that point to 'tgt_idx' insn adjust the * jump offset by 'delta'. 
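 * E.g. a jump at insn 5 with off == +4 targets insn 5 + 1 + 4 == 10;
 * if tgt_idx == 10 and delta == 2 (the target was pushed two insns
 * further down, e.g. by an inserted prologue), the offset becomes +6
 * so the jump still lands on the same logical instruction. gotol
 * (BPF_JMP32 | BPF_JA) encodes its displacement in imm rather than
 * off, but is adjusted the same way.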
*/ static int adjust_jmp_off(struct bpf_prog *prog, u32 tgt_idx, u32 delta) { struct bpf_insn *insn = prog->insnsi; u32 insn_cnt = prog->len, i; s32 imm; s16 off; for (i = 0; i < insn_cnt; i++, insn++) { u8 code = insn->code; if (tgt_idx <= i && i < tgt_idx + delta) continue; if ((BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) || BPF_OP(code) == BPF_CALL || BPF_OP(code) == BPF_EXIT) continue; if (insn->code == (BPF_JMP32 | BPF_JA)) { if (i + 1 + insn->imm != tgt_idx) continue; if (check_add_overflow(insn->imm, delta, &imm)) return -ERANGE; insn->imm = imm; } else { if (i + 1 + insn->off != tgt_idx) continue; if (check_add_overflow(insn->off, delta, &off)) return -ERANGE; insn->off = off; } } return 0; } static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env, u32 off, u32 cnt) { int i, j; /* find first prog starting at or after off (first to remove) */ for (i = 0; i < env->subprog_cnt; i++) if (env->subprog_info[i].start >= off) break; /* find first prog starting at or after off + cnt (first to stay) */ for (j = i; j < env->subprog_cnt; j++) if (env->subprog_info[j].start >= off + cnt) break; /* if j doesn't start exactly at off + cnt, we are just removing * the front of previous prog */ if (env->subprog_info[j].start != off + cnt) j--; if (j > i) { struct bpf_prog_aux *aux = env->prog->aux; int move; /* move fake 'exit' subprog as well */ move = env->subprog_cnt + 1 - j; memmove(env->subprog_info + i, env->subprog_info + j, sizeof(*env->subprog_info) * move); env->subprog_cnt -= j - i; /* remove func_info */ if (aux->func_info) { move = aux->func_info_cnt - j; memmove(aux->func_info + i, aux->func_info + j, sizeof(*aux->func_info) * move); aux->func_info_cnt -= j - i; /* func_info->insn_off is set after all code rewrites, * in adjust_btf_func() - no need to adjust */ } } else { /* convert i from "first prog to remove" to "first to adjust" */ if (env->subprog_info[i].start == off) i++; } /* update fake 'exit' subprog as well */ for (; i <= env->subprog_cnt; i++) env->subprog_info[i].start -= cnt; return 0; } static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off, u32 cnt) { struct bpf_prog *prog = env->prog; u32 i, l_off, l_cnt, nr_linfo; struct bpf_line_info *linfo; nr_linfo = prog->aux->nr_linfo; if (!nr_linfo) return 0; linfo = prog->aux->linfo; /* find first line info to remove, count lines to be removed */ for (i = 0; i < nr_linfo; i++) if (linfo[i].insn_off >= off) break; l_off = i; l_cnt = 0; for (; i < nr_linfo; i++) if (linfo[i].insn_off < off + cnt) l_cnt++; else break; /* First live insn doesn't match first live linfo, it needs to "inherit" * last removed linfo. prog is already modified, so prog->len == off * means no live instructions after (tail of the program was removed). */ if (prog->len != off && l_cnt && (i == nr_linfo || linfo[i].insn_off != off + cnt)) { l_cnt--; linfo[--i].insn_off = off + cnt; } /* remove the line info which refer to the removed instructions */ if (l_cnt) { memmove(linfo + l_off, linfo + i, sizeof(*linfo) * (nr_linfo - i)); prog->aux->nr_linfo -= l_cnt; nr_linfo = prog->aux->nr_linfo; } /* pull all linfo[i].insn_off >= off + cnt in by cnt */ for (i = l_off; i < nr_linfo; i++) linfo[i].insn_off -= cnt; /* fix up all subprogs (incl. 
'exit') which start >= off */ for (i = 0; i <= env->subprog_cnt; i++) if (env->subprog_info[i].linfo_idx > l_off) { /* program may have started in the removed region but * may not be fully removed */ if (env->subprog_info[i].linfo_idx >= l_off + l_cnt) env->subprog_info[i].linfo_idx -= l_cnt; else env->subprog_info[i].linfo_idx = l_off; } return 0; } /* * Clean up dynamically allocated fields of aux data for instructions [start, ...] */ void bpf_clear_insn_aux_data(struct bpf_verifier_env *env, int start, int len) { struct bpf_insn_aux_data *aux_data = env->insn_aux_data; struct bpf_insn *insns = env->prog->insnsi; int end = start + len; int i; for (i = start; i < end; i++) { if (aux_data[i].jt) { kvfree(aux_data[i].jt); aux_data[i].jt = NULL; } if (bpf_is_ldimm64(&insns[i])) i++; } } static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) { struct bpf_insn_aux_data *aux_data = env->insn_aux_data; unsigned int orig_prog_len = env->prog->len; int err; if (bpf_prog_is_offloaded(env->prog->aux)) bpf_prog_offload_remove_insns(env, off, cnt); /* Should be called before bpf_remove_insns, as it uses prog->insnsi */ bpf_clear_insn_aux_data(env, off, cnt); err = bpf_remove_insns(env->prog, off, cnt); if (err) return err; err = adjust_subprog_starts_after_remove(env, off, cnt); if (err) return err; err = bpf_adj_linfo_after_remove(env, off, cnt); if (err) return err; adjust_insn_arrays_after_remove(env, off, cnt); memmove(aux_data + off, aux_data + off + cnt, sizeof(*aux_data) * (orig_prog_len - off - cnt)); return 0; } static const struct bpf_insn NOP = BPF_JMP_IMM(BPF_JA, 0, 0, 0); static const struct bpf_insn MAY_GOTO_0 = BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, 0, 0); bool bpf_insn_is_cond_jump(u8 code) { u8 op; op = BPF_OP(code); if (BPF_CLASS(code) == BPF_JMP32) return op != BPF_JA; if (BPF_CLASS(code) != BPF_JMP) return false; return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL; } void bpf_opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env) { struct bpf_insn_aux_data *aux_data = env->insn_aux_data; struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0); struct bpf_insn *insn = env->prog->insnsi; const int insn_cnt = env->prog->len; int i; for (i = 0; i < insn_cnt; i++, insn++) { if (!bpf_insn_is_cond_jump(insn->code)) continue; if (!aux_data[i + 1].seen) ja.off = insn->off; else if (!aux_data[i + 1 + insn->off].seen) ja.off = 0; else continue; if (bpf_prog_is_offloaded(env->prog->aux)) bpf_prog_offload_replace_insn(env, i, &ja); memcpy(insn, &ja, sizeof(ja)); } } int bpf_opt_remove_dead_code(struct bpf_verifier_env *env) { struct bpf_insn_aux_data *aux_data = env->insn_aux_data; int insn_cnt = env->prog->len; int i, err; for (i = 0; i < insn_cnt; i++) { int j; j = 0; while (i + j < insn_cnt && !aux_data[i + j].seen) j++; if (!j) continue; err = verifier_remove_insns(env, i, j); if (err) return err; insn_cnt = env->prog->len; } return 0; } int bpf_opt_remove_nops(struct bpf_verifier_env *env) { struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; bool is_may_goto_0, is_ja; int i, err; for (i = 0; i < insn_cnt; i++) { is_may_goto_0 = !memcmp(&insn[i], &MAY_GOTO_0, sizeof(MAY_GOTO_0)); is_ja = !memcmp(&insn[i], &NOP, sizeof(NOP)); if (!is_may_goto_0 && !is_ja) continue; err = verifier_remove_insns(env, i, 1); if (err) return err; insn_cnt--; /* Go back one insn to catch may_goto +1; may_goto +0 sequence */ i -= (is_may_goto_0 && i > 0) ? 
2 : 1; } return 0; } int bpf_opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, const union bpf_attr *attr) { struct bpf_insn *patch; /* use env->insn_buf as two independent buffers */ struct bpf_insn *zext_patch = env->insn_buf; struct bpf_insn *rnd_hi32_patch = &env->insn_buf[2]; struct bpf_insn_aux_data *aux = env->insn_aux_data; int i, patch_len, delta = 0, len = env->prog->len; struct bpf_insn *insns = env->prog->insnsi; struct bpf_prog *new_prog; bool rnd_hi32; rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32; zext_patch[1] = BPF_ZEXT_REG(0); rnd_hi32_patch[1] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0); rnd_hi32_patch[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32); rnd_hi32_patch[3] = BPF_ALU64_REG(BPF_OR, 0, BPF_REG_AX); for (i = 0; i < len; i++) { int adj_idx = i + delta; struct bpf_insn insn; int load_reg; insn = insns[adj_idx]; load_reg = insn_def_regno(&insn); if (!aux[adj_idx].zext_dst) { u8 code, class; u32 imm_rnd; if (!rnd_hi32) continue; code = insn.code; class = BPF_CLASS(code); if (load_reg == -1) continue; /* NOTE: arg "reg" (the fourth one) is only used for * BPF_STX + SRC_OP, so it is safe to pass NULL * here. */ if (bpf_is_reg64(&insn, load_reg, NULL, DST_OP)) { if (class == BPF_LD && BPF_MODE(code) == BPF_IMM) i++; continue; } /* ctx load could be transformed into wider load. */ if (class == BPF_LDX && aux[adj_idx].ptr_type == PTR_TO_CTX) continue; imm_rnd = get_random_u32(); rnd_hi32_patch[0] = insn; rnd_hi32_patch[1].imm = imm_rnd; rnd_hi32_patch[3].dst_reg = load_reg; patch = rnd_hi32_patch; patch_len = 4; goto apply_patch_buffer; } /* Add in an zero-extend instruction if a) the JIT has requested * it or b) it's a CMPXCHG. * * The latter is because: BPF_CMPXCHG always loads a value into * R0, therefore always zero-extends. However some archs' * equivalent instruction only does this load when the * comparison is successful. This detail of CMPXCHG is * orthogonal to the general zero-extension behaviour of the * CPU, so it's treated independently of bpf_jit_needs_zext. */ if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn)) continue; /* Zero-extension is done by the caller. 
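 * (A kfunc call returns via the native calling convention, which is
 * assumed to already zero-extend a 32-bit return value, so no extra
 * zext patch is emitted for it here.)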
*/ if (bpf_pseudo_kfunc_call(&insn)) continue; if (verifier_bug_if(load_reg == -1, env, "zext_dst is set, but no reg is defined")) return -EFAULT; zext_patch[0] = insn; zext_patch[1].dst_reg = load_reg; zext_patch[1].src_reg = load_reg; patch = zext_patch; patch_len = 2; apply_patch_buffer: new_prog = bpf_patch_insn_data(env, adj_idx, patch, patch_len); if (!new_prog) return -ENOMEM; env->prog = new_prog; insns = new_prog->insnsi; aux = env->insn_aux_data; delta += patch_len - 1; } return 0; } /* convert load instructions that access fields of a context type into a * sequence of instructions that access fields of the underlying structure: * struct __sk_buff -> struct sk_buff * struct bpf_sock_ops -> struct sock */ int bpf_convert_ctx_accesses(struct bpf_verifier_env *env) { struct bpf_subprog_info *subprogs = env->subprog_info; const struct bpf_verifier_ops *ops = env->ops; int i, cnt, size, ctx_field_size, ret, delta = 0, epilogue_cnt = 0; const int insn_cnt = env->prog->len; struct bpf_insn *epilogue_buf = env->epilogue_buf; struct bpf_insn *insn_buf = env->insn_buf; struct bpf_insn *insn; u32 target_size, size_default, off; struct bpf_prog *new_prog; enum bpf_access_type type; bool is_narrower_load; int epilogue_idx = 0; if (ops->gen_epilogue) { epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog, -(subprogs[0].stack_depth + 8)); if (epilogue_cnt >= INSN_BUF_SIZE) { verifier_bug(env, "epilogue is too long"); return -EFAULT; } else if (epilogue_cnt) { /* Save the ARG_PTR_TO_CTX for the epilogue to use */ cnt = 0; subprogs[0].stack_depth += 8; insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1, -subprogs[0].stack_depth); insn_buf[cnt++] = env->prog->insnsi[0]; new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); if (!new_prog) return -ENOMEM; env->prog = new_prog; delta += cnt - 1; ret = add_kfunc_in_insns(env, epilogue_buf, epilogue_cnt - 1); if (ret < 0) return ret; } } if (ops->gen_prologue || env->seen_direct_write) { if (!ops->gen_prologue) { verifier_bug(env, "gen_prologue is null"); return -EFAULT; } cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, env->prog); if (cnt >= INSN_BUF_SIZE) { verifier_bug(env, "prologue is too long"); return -EFAULT; } else if (cnt) { new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); if (!new_prog) return -ENOMEM; env->prog = new_prog; delta += cnt - 1; ret = add_kfunc_in_insns(env, insn_buf, cnt - 1); if (ret < 0) return ret; } } if (delta) WARN_ON(adjust_jmp_off(env->prog, 0, delta)); if (bpf_prog_is_offloaded(env->prog->aux)) return 0; insn = env->prog->insnsi + delta; for (i = 0; i < insn_cnt; i++, insn++) { bpf_convert_ctx_access_t convert_ctx_access; u8 mode; if (env->insn_aux_data[i + delta].nospec) { WARN_ON_ONCE(env->insn_aux_data[i + delta].alu_state); struct bpf_insn *patch = insn_buf; *patch++ = BPF_ST_NOSPEC(); *patch++ = *insn; cnt = patch - insn_buf; new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; delta += cnt - 1; env->prog = new_prog; insn = new_prog->insnsi + i + delta; /* This can not be easily merged with the * nospec_result-case, because an insn may require a * nospec before and after itself. Therefore also do not * 'continue' here but potentially apply further * patching to insn. *insn should equal patch[1] now. 
*/ } if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || insn->code == (BPF_LDX | BPF_MEM | BPF_H) || insn->code == (BPF_LDX | BPF_MEM | BPF_W) || insn->code == (BPF_LDX | BPF_MEM | BPF_DW) || insn->code == (BPF_LDX | BPF_MEMSX | BPF_B) || insn->code == (BPF_LDX | BPF_MEMSX | BPF_H) || insn->code == (BPF_LDX | BPF_MEMSX | BPF_W)) { type = BPF_READ; } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || insn->code == (BPF_STX | BPF_MEM | BPF_H) || insn->code == (BPF_STX | BPF_MEM | BPF_W) || insn->code == (BPF_STX | BPF_MEM | BPF_DW) || insn->code == (BPF_ST | BPF_MEM | BPF_B) || insn->code == (BPF_ST | BPF_MEM | BPF_H) || insn->code == (BPF_ST | BPF_MEM | BPF_W) || insn->code == (BPF_ST | BPF_MEM | BPF_DW)) { type = BPF_WRITE; } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_B) || insn->code == (BPF_STX | BPF_ATOMIC | BPF_H) || insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) || insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) && env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) { insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code); env->prog->aux->num_exentries++; continue; } else if (insn->code == (BPF_JMP | BPF_EXIT) && epilogue_cnt && i + delta < subprogs[1].start) { /* Generate epilogue for the main prog */ if (epilogue_idx) { /* jump back to the earlier generated epilogue */ insn_buf[0] = BPF_JMP32_A(epilogue_idx - i - delta - 1); cnt = 1; } else { memcpy(insn_buf, epilogue_buf, epilogue_cnt * sizeof(*epilogue_buf)); cnt = epilogue_cnt; /* epilogue_idx cannot be 0. It must have at * least one ctx ptr saving insn before the * epilogue. */ epilogue_idx = i + delta; } goto patch_insn_buf; } else { continue; } if (type == BPF_WRITE && env->insn_aux_data[i + delta].nospec_result) { /* nospec_result is only used to mitigate Spectre v4 and * to limit verification-time for Spectre v1. */ struct bpf_insn *patch = insn_buf; *patch++ = *insn; *patch++ = BPF_ST_NOSPEC(); cnt = patch - insn_buf; new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; delta += cnt - 1; env->prog = new_prog; insn = new_prog->insnsi + i + delta; continue; } switch ((int)env->insn_aux_data[i + delta].ptr_type) { case PTR_TO_CTX: if (!ops->convert_ctx_access) continue; convert_ctx_access = ops->convert_ctx_access; break; case PTR_TO_SOCKET: case PTR_TO_SOCK_COMMON: convert_ctx_access = bpf_sock_convert_ctx_access; break; case PTR_TO_TCP_SOCK: convert_ctx_access = bpf_tcp_sock_convert_ctx_access; break; case PTR_TO_XDP_SOCK: convert_ctx_access = bpf_xdp_sock_convert_ctx_access; break; case PTR_TO_BTF_ID: case PTR_TO_BTF_ID | PTR_UNTRUSTED: /* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot * be said once it is marked PTR_UNTRUSTED, hence we must handle * any faults for loads into such types. BPF_WRITE is disallowed * for this case. 
*/ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED: case PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED: if (type == BPF_READ) { if (BPF_MODE(insn->code) == BPF_MEM) insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code); else insn->code = BPF_LDX | BPF_PROBE_MEMSX | BPF_SIZE((insn)->code); env->prog->aux->num_exentries++; } continue; case PTR_TO_ARENA: if (BPF_MODE(insn->code) == BPF_MEMSX) { if (!bpf_jit_supports_insn(insn, true)) { verbose(env, "sign extending loads from arena are not supported yet\n"); return -EOPNOTSUPP; } insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32SX | BPF_SIZE(insn->code); } else { insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code); } env->prog->aux->num_exentries++; continue; default: continue; } ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size; size = BPF_LDST_BYTES(insn); mode = BPF_MODE(insn->code); /* If the read access is a narrower load of the field, * convert to a 4/8-byte load, to minimum program type specific * convert_ctx_access changes. If conversion is successful, * we will apply proper mask to the result. */ is_narrower_load = size < ctx_field_size; size_default = bpf_ctx_off_adjust_machine(ctx_field_size); off = insn->off; if (is_narrower_load) { u8 size_code; if (type == BPF_WRITE) { verifier_bug(env, "narrow ctx access misconfigured"); return -EFAULT; } size_code = BPF_H; if (ctx_field_size == 4) size_code = BPF_W; else if (ctx_field_size == 8) size_code = BPF_DW; insn->off = off & ~(size_default - 1); insn->code = BPF_LDX | BPF_MEM | size_code; } target_size = 0; cnt = convert_ctx_access(type, insn, insn_buf, env->prog, &target_size); if (cnt == 0 || cnt >= INSN_BUF_SIZE || (ctx_field_size && !target_size)) { verifier_bug(env, "error during ctx access conversion (%d)", cnt); return -EFAULT; } if (is_narrower_load && size < target_size) { u8 shift = bpf_ctx_narrow_access_offset( off, size, size_default) * 8; if (shift && cnt + 1 >= INSN_BUF_SIZE) { verifier_bug(env, "narrow ctx load misconfigured"); return -EFAULT; } if (ctx_field_size <= 4) { if (shift) insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, insn->dst_reg, shift); insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, (1 << size * 8) - 1); } else { if (shift) insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH, insn->dst_reg, shift); insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, (1ULL << size * 8) - 1); } } if (mode == BPF_MEMSX) insn_buf[cnt++] = BPF_RAW_INSN(BPF_ALU64 | BPF_MOV | BPF_X, insn->dst_reg, insn->dst_reg, size * 8, 0); patch_insn_buf: new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; delta += cnt - 1; /* keep walking new program and skip insns we just inserted */ env->prog = new_prog; insn = new_prog->insnsi + i + delta; } return 0; } static u32 *bpf_dup_subprog_starts(struct bpf_verifier_env *env) { u32 *starts = NULL; starts = kvmalloc_objs(u32, env->subprog_cnt, GFP_KERNEL_ACCOUNT); if (starts) { for (int i = 0; i < env->subprog_cnt; i++) starts[i] = env->subprog_info[i].start; } return starts; } static void bpf_restore_subprog_starts(struct bpf_verifier_env *env, u32 *orig_starts) { for (int i = 0; i < env->subprog_cnt; i++) env->subprog_info[i].start = orig_starts[i]; /* restore the start of fake 'exit' subprog as well */ env->subprog_info[env->subprog_cnt].start = env->prog->len; } struct bpf_insn_aux_data *bpf_dup_insn_aux_data(struct bpf_verifier_env *env) { size_t size; void *new_aux; size = array_size(sizeof(struct bpf_insn_aux_data), env->prog->len); new_aux = __vmalloc(size, 
GFP_KERNEL_ACCOUNT); if (new_aux) memcpy(new_aux, env->insn_aux_data, size); return new_aux; } void bpf_restore_insn_aux_data(struct bpf_verifier_env *env, struct bpf_insn_aux_data *orig_insn_aux) { /* the expanded elements are zero-filled, so no special handling is required */ vfree(env->insn_aux_data); env->insn_aux_data = orig_insn_aux; } static int jit_subprogs(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog, **func, *tmp; int i, j, subprog_start, subprog_end = 0, len, subprog; struct bpf_map *map_ptr; struct bpf_insn *insn; void *old_bpf_func; int err, num_exentries; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn)) continue; /* Upon error here we cannot fall back to interpreter but * need a hard reject of the program. Thus -EFAULT is * propagated in any case. */ subprog = bpf_find_subprog(env, i + insn->imm + 1); if (verifier_bug_if(subprog < 0, env, "No program to jit at insn %d", i + insn->imm + 1)) return -EFAULT; /* temporarily remember subprog id inside insn instead of * aux_data, since next loop will split up all insns into funcs */ insn->off = subprog; /* remember original imm in case JIT fails and fallback * to interpreter will be needed */ env->insn_aux_data[i].call_imm = insn->imm; /* point imm to __bpf_call_base+1 from JITs point of view */ insn->imm = 1; if (bpf_pseudo_func(insn)) { #if defined(MODULES_VADDR) u64 addr = MODULES_VADDR; #else u64 addr = VMALLOC_START; #endif /* jit (e.g. x86_64) may emit fewer instructions * if it learns a u32 imm is the same as a u64 imm. * Set close enough to possible prog address. */ insn[0].imm = (u32)addr; insn[1].imm = addr >> 32; } } err = bpf_prog_alloc_jited_linfo(prog); if (err) goto out_undo_insn; err = -ENOMEM; func = kzalloc_objs(prog, env->subprog_cnt); if (!func) goto out_undo_insn; for (i = 0; i < env->subprog_cnt; i++) { subprog_start = subprog_end; subprog_end = env->subprog_info[i + 1].start; len = subprog_end - subprog_start; /* bpf_prog_run() doesn't call subprogs directly, * hence main prog stats include the runtime of subprogs. 
* subprogs don't have IDs and not reachable via prog_get_next_id * func[i]->stats will never be accessed and stays NULL */ func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER); if (!func[i]) goto out_free; memcpy(func[i]->insnsi, &prog->insnsi[subprog_start], len * sizeof(struct bpf_insn)); func[i]->type = prog->type; func[i]->len = len; if (bpf_prog_calc_tag(func[i])) goto out_free; func[i]->is_func = 1; func[i]->sleepable = prog->sleepable; func[i]->blinded = prog->blinded; func[i]->aux->func_idx = i; /* Below members will be freed only at prog->aux */ func[i]->aux->btf = prog->aux->btf; func[i]->aux->subprog_start = subprog_start; func[i]->aux->func_info = prog->aux->func_info; func[i]->aux->func_info_cnt = prog->aux->func_info_cnt; func[i]->aux->poke_tab = prog->aux->poke_tab; func[i]->aux->size_poke_tab = prog->aux->size_poke_tab; func[i]->aux->main_prog_aux = prog->aux; for (j = 0; j < prog->aux->size_poke_tab; j++) { struct bpf_jit_poke_descriptor *poke; poke = &prog->aux->poke_tab[j]; if (poke->insn_idx < subprog_end && poke->insn_idx >= subprog_start) poke->aux = func[i]->aux; } func[i]->aux->name[0] = 'F'; func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; if (env->subprog_info[i].priv_stack_mode == PRIV_STACK_ADAPTIVE) func[i]->aux->jits_use_priv_stack = true; func[i]->jit_requested = 1; func[i]->blinding_requested = prog->blinding_requested; func[i]->aux->kfunc_tab = prog->aux->kfunc_tab; func[i]->aux->kfunc_btf_tab = prog->aux->kfunc_btf_tab; func[i]->aux->linfo = prog->aux->linfo; func[i]->aux->nr_linfo = prog->aux->nr_linfo; func[i]->aux->jited_linfo = prog->aux->jited_linfo; func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx; func[i]->aux->arena = prog->aux->arena; func[i]->aux->used_maps = env->used_maps; func[i]->aux->used_map_cnt = env->used_map_cnt; num_exentries = 0; insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && (BPF_MODE(insn->code) == BPF_PROBE_MEM || BPF_MODE(insn->code) == BPF_PROBE_MEM32 || BPF_MODE(insn->code) == BPF_PROBE_MEM32SX || BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) num_exentries++; if ((BPF_CLASS(insn->code) == BPF_STX || BPF_CLASS(insn->code) == BPF_ST) && BPF_MODE(insn->code) == BPF_PROBE_MEM32) num_exentries++; if (BPF_CLASS(insn->code) == BPF_STX && BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) num_exentries++; } func[i]->aux->num_exentries = num_exentries; func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb; func[i]->aux->changes_pkt_data = env->subprog_info[i].changes_pkt_data; func[i]->aux->might_sleep = env->subprog_info[i].might_sleep; func[i]->aux->token = prog->aux->token; if (!i) func[i]->aux->exception_boundary = env->seen_exception; func[i] = bpf_int_jit_compile(env, func[i]); if (!func[i]->jited) { err = -ENOTSUPP; goto out_free; } cond_resched(); } /* at this point all bpf functions were successfully JITed * now populate all bpf_calls with correct addresses and * run last pass of JIT */ for (i = 0; i < env->subprog_cnt; i++) { insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { if (bpf_pseudo_func(insn)) { subprog = insn->off; insn[0].imm = (u32)(long)func[subprog]->bpf_func; insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32; continue; } if (!bpf_pseudo_call(insn)) continue; subprog = insn->off; insn->imm = BPF_CALL_IMM(func[subprog]->bpf_func); } /* we use the aux data to keep a list of the start addresses * of the JITed images for each 
function in the program * * for some architectures, such as powerpc64, the imm field * might not be large enough to hold the offset of the start * address of the callee's JITed image from __bpf_call_base * * in such cases, we can lookup the start address of a callee * by using its subprog id, available from the off field of * the call instruction, as an index for this list */ func[i]->aux->func = func; func[i]->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt; func[i]->aux->real_func_cnt = env->subprog_cnt; } for (i = 0; i < env->subprog_cnt; i++) { old_bpf_func = func[i]->bpf_func; tmp = bpf_int_jit_compile(env, func[i]); if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { verbose(env, "JIT doesn't support bpf-to-bpf calls\n"); err = -ENOTSUPP; goto out_free; } cond_resched(); } /* * Cleanup func[i]->aux fields which aren't required * or can become invalid in future */ for (i = 0; i < env->subprog_cnt; i++) { func[i]->aux->used_maps = NULL; func[i]->aux->used_map_cnt = 0; } /* finally lock prog and jit images for all functions and * populate kallsysm. Begin at the first subprogram, since * bpf_prog_load will add the kallsyms for the main program. */ for (i = 1; i < env->subprog_cnt; i++) { err = bpf_prog_lock_ro(func[i]); if (err) goto out_free; } for (i = 1; i < env->subprog_cnt; i++) bpf_prog_kallsyms_add(func[i]); /* Last step: make now unused interpreter insns from main * prog consistent for later dump requests, so they can * later look the same as if they were interpreted only. */ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { if (bpf_pseudo_func(insn)) { insn[0].imm = env->insn_aux_data[i].call_imm; insn[1].imm = insn->off; insn->off = 0; continue; } if (!bpf_pseudo_call(insn)) continue; insn->off = env->insn_aux_data[i].call_imm; subprog = bpf_find_subprog(env, i + insn->off + 1); insn->imm = subprog; } prog->jited = 1; prog->bpf_func = func[0]->bpf_func; prog->jited_len = func[0]->jited_len; prog->aux->extable = func[0]->aux->extable; prog->aux->num_exentries = func[0]->aux->num_exentries; prog->aux->func = func; prog->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt; prog->aux->real_func_cnt = env->subprog_cnt; prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func; prog->aux->exception_boundary = func[0]->aux->exception_boundary; bpf_prog_jit_attempt_done(prog); return 0; out_free: /* We failed JIT'ing, so at this point we need to unregister poke * descriptors from subprogs, so that kernel is not attempting to * patch it anymore as we're freeing the subprog JIT memory. */ for (i = 0; i < prog->aux->size_poke_tab; i++) { map_ptr = prog->aux->poke_tab[i].tail_call.map; map_ptr->ops->map_poke_untrack(map_ptr, prog->aux); } /* At this point we're guaranteed that poke descriptors are not * live anymore. We can just unlink its descriptor table as it's * released with the main prog. 
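 * (func[i]->aux->poke_tab only aliased the main prog's poke_tab
 * earlier in this function, so clearing the pointer before
 * bpf_jit_free() is sufficient; the table itself is freed together
 * with the main prog.)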
*/ for (i = 0; i < env->subprog_cnt; i++) { if (!func[i]) continue; func[i]->aux->poke_tab = NULL; bpf_jit_free(func[i]); } kfree(func); out_undo_insn: bpf_prog_jit_attempt_done(prog); return err; } int bpf_jit_subprogs(struct bpf_verifier_env *env) { int err, i; bool blinded = false; struct bpf_insn *insn; struct bpf_prog *prog, *orig_prog; struct bpf_insn_aux_data *orig_insn_aux; u32 *orig_subprog_starts; if (env->subprog_cnt <= 1) return 0; prog = orig_prog = env->prog; if (bpf_prog_need_blind(prog)) { orig_insn_aux = bpf_dup_insn_aux_data(env); if (!orig_insn_aux) { err = -ENOMEM; goto out_cleanup; } orig_subprog_starts = bpf_dup_subprog_starts(env); if (!orig_subprog_starts) { vfree(orig_insn_aux); err = -ENOMEM; goto out_cleanup; } prog = bpf_jit_blind_constants(env, prog); if (IS_ERR(prog)) { err = -ENOMEM; prog = orig_prog; goto out_restore; } blinded = true; } err = jit_subprogs(env); if (err) goto out_jit_err; if (blinded) { bpf_jit_prog_release_other(prog, orig_prog); kvfree(orig_subprog_starts); vfree(orig_insn_aux); } return 0; out_jit_err: if (blinded) { bpf_jit_prog_release_other(orig_prog, prog); /* roll back to the clean original prog */ prog = env->prog = orig_prog; goto out_restore; } else { if (err != -EFAULT) { /* * We will fall back to interpreter mode when err is not -EFAULT, before * that, insn->off and insn->imm should be restored to their original * values since they were modified by jit_subprogs. */ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { if (!bpf_pseudo_call(insn)) continue; insn->off = 0; insn->imm = env->insn_aux_data[i].call_imm; } } goto out_cleanup; } out_restore: bpf_restore_subprog_starts(env, orig_subprog_starts); bpf_restore_insn_aux_data(env, orig_insn_aux); kvfree(orig_subprog_starts); out_cleanup: /* cleanup main prog to be interpreted */ prog->jit_requested = 0; prog->blinding_requested = 0; return err; } int bpf_fixup_call_args(struct bpf_verifier_env *env) { #ifndef CONFIG_BPF_JIT_ALWAYS_ON struct bpf_prog *prog = env->prog; struct bpf_insn *insn = prog->insnsi; bool has_kfunc_call = bpf_prog_has_kfunc_call(prog); int i, depth; #endif int err = 0; if (env->prog->jit_requested && !bpf_prog_is_offloaded(env->prog->aux)) { err = bpf_jit_subprogs(env); if (err == 0) return 0; if (err == -EFAULT) return err; } #ifndef CONFIG_BPF_JIT_ALWAYS_ON if (has_kfunc_call) { verbose(env, "calling kernel functions are not allowed in non-JITed programs\n"); return -EINVAL; } if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) { /* When JIT fails the progs with bpf2bpf calls and tail_calls * have to be rejected, since interpreter doesn't support them yet. */ verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n"); return -EINVAL; } for (i = 0; i < prog->len; i++, insn++) { if (bpf_pseudo_func(insn)) { /* When JIT fails the progs with callback calls * have to be rejected, since interpreter doesn't support them yet. 
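 * (bpf_pseudo_func insns are ld_imm64 insns that should materialize
 * the address of a callback subprog; without a JITed image there is
 * no such address, so these programs cannot fall back to the
 * interpreter.)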
*/ verbose(env, "callbacks are not allowed in non-JITed programs\n"); return -EINVAL; } if (!bpf_pseudo_call(insn)) continue; depth = get_callee_stack_depth(env, insn, i); if (depth < 0) return depth; bpf_patch_call_args(insn, depth); } err = 0; #endif return err; } /* The function requires that first instruction in 'patch' is insnsi[prog->len - 1] */ static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len) { struct bpf_subprog_info *info = env->subprog_info; int cnt = env->subprog_cnt; struct bpf_prog *prog; /* We only reserve one slot for hidden subprogs in subprog_info. */ if (env->hidden_subprog_cnt) { verifier_bug(env, "only one hidden subprog supported"); return -EFAULT; } /* We're not patching any existing instruction, just appending the new * ones for the hidden subprog. Hence all of the adjustment operations * in bpf_patch_insn_data are no-ops. */ prog = bpf_patch_insn_data(env, env->prog->len - 1, patch, len); if (!prog) return -ENOMEM; env->prog = prog; info[cnt + 1].start = info[cnt].start; info[cnt].start = prog->len - len + 1; env->subprog_cnt++; env->hidden_subprog_cnt++; return 0; } /* Do various post-verification rewrites in a single program pass. * These rewrites simplify JIT and interpreter implementations. */ int bpf_do_misc_fixups(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; enum bpf_attach_type eatype = prog->expected_attach_type; enum bpf_prog_type prog_type = resolve_prog_type(prog); struct bpf_insn *insn = prog->insnsi; const struct bpf_func_proto *fn; const int insn_cnt = prog->len; const struct bpf_map_ops *ops; struct bpf_insn_aux_data *aux; struct bpf_insn *insn_buf = env->insn_buf; struct bpf_prog *new_prog; struct bpf_map *map_ptr; int i, ret, cnt, delta = 0, cur_subprog = 0; struct bpf_subprog_info *subprogs = env->subprog_info; u16 stack_depth = subprogs[cur_subprog].stack_depth; u16 stack_depth_extra = 0; if (env->seen_exception && !env->exception_callback_subprog) { struct bpf_insn *patch = insn_buf; *patch++ = env->prog->insnsi[insn_cnt - 1]; *patch++ = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); *patch++ = BPF_EXIT_INSN(); ret = add_hidden_subprog(env, insn_buf, patch - insn_buf); if (ret < 0) return ret; prog = env->prog; insn = prog->insnsi; env->exception_callback_subprog = env->subprog_cnt - 1; /* Don't update insn_cnt, as add_hidden_subprog always appends insns */ bpf_mark_subprog_exc_cb(env, env->exception_callback_subprog); } for (i = 0; i < insn_cnt;) { if (insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->imm) { if ((insn->off == BPF_ADDR_SPACE_CAST && insn->imm == 1) || (((struct bpf_map *)env->prog->aux->arena)->map_flags & BPF_F_NO_USER_CONV)) { /* convert to 32-bit mov that clears upper 32-bit */ insn->code = BPF_ALU | BPF_MOV | BPF_X; /* clear off and imm, so it's a normal 'wX = wY' from JIT pov */ insn->off = 0; insn->imm = 0; } /* cast from as(0) to as(1) should be handled by JIT */ goto next_insn; } if (env->insn_aux_data[i + delta].needs_zext) /* Convert BPF_CLASS(insn->code) == BPF_ALU64 to 32-bit ALU */ insn->code = BPF_ALU | BPF_OP(insn->code) | BPF_SRC(insn->code); /* Make sdiv/smod divide-by-minus-one exceptions impossible. 
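 * E.g. a signed divide by constant -1, 'r1 s/= -1', is rewritten
 * below into 'r1 = -r1', and a signed modulo by -1, 'r1 s%= -1',
 * into 'r1 = 0'. That matches BPF's defined semantics (LLONG_MIN
 * s/ -1 == LLONG_MIN, x s% -1 == 0) while avoiding the INT_MIN / -1
 * trap raised by the native divide instruction on some architectures.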
*/ if ((insn->code == (BPF_ALU64 | BPF_MOD | BPF_K) || insn->code == (BPF_ALU64 | BPF_DIV | BPF_K) || insn->code == (BPF_ALU | BPF_MOD | BPF_K) || insn->code == (BPF_ALU | BPF_DIV | BPF_K)) && insn->off == 1 && insn->imm == -1) { bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; bool isdiv = BPF_OP(insn->code) == BPF_DIV; struct bpf_insn *patch = insn_buf; if (isdiv) *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | BPF_NEG | BPF_K, insn->dst_reg, 0, 0, 0); else *patch++ = BPF_MOV32_IMM(insn->dst_reg, 0); cnt = patch - insn_buf; new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; goto next_insn; } /* Make divide-by-zero and divide-by-minus-one exceptions impossible. */ if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || insn->code == (BPF_ALU | BPF_MOD | BPF_X) || insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; bool isdiv = BPF_OP(insn->code) == BPF_DIV; bool is_sdiv = isdiv && insn->off == 1; bool is_smod = !isdiv && insn->off == 1; struct bpf_insn *patch = insn_buf; if (is_sdiv) { /* [R,W]x sdiv 0 -> 0 * LLONG_MIN sdiv -1 -> LLONG_MIN * INT_MIN sdiv -1 -> INT_MIN */ *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg); *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | BPF_ADD | BPF_K, BPF_REG_AX, 0, 0, 1); *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | BPF_JGT | BPF_K, BPF_REG_AX, 0, 4, 1); *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | BPF_JEQ | BPF_K, BPF_REG_AX, 0, 1, 0); *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | BPF_MOV | BPF_K, insn->dst_reg, 0, 0, 0); /* BPF_NEG(LLONG_MIN) == -LLONG_MIN == LLONG_MIN */ *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | BPF_NEG | BPF_K, insn->dst_reg, 0, 0, 0); *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); *patch++ = *insn; cnt = patch - insn_buf; } else if (is_smod) { /* [R,W]x mod 0 -> [R,W]x */ /* [R,W]x mod -1 -> 0 */ *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg); *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | BPF_ADD | BPF_K, BPF_REG_AX, 0, 0, 1); *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | BPF_JGT | BPF_K, BPF_REG_AX, 0, 3, 1); *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | BPF_JEQ | BPF_K, BPF_REG_AX, 0, 3 + (is64 ? 0 : 1), 1); *patch++ = BPF_MOV32_IMM(insn->dst_reg, 0); *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); *patch++ = *insn; if (!is64) { *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); *patch++ = BPF_MOV32_REG(insn->dst_reg, insn->dst_reg); } cnt = patch - insn_buf; } else if (isdiv) { /* [R,W]x div 0 -> 0 */ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | BPF_JNE | BPF_K, insn->src_reg, 0, 2, 0); *patch++ = BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg); *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); *patch++ = *insn; cnt = patch - insn_buf; } else { /* [R,W]x mod 0 -> [R,W]x */ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | BPF_JEQ | BPF_K, insn->src_reg, 0, 1 + (is64 ? 
0 : 1), 0); *patch++ = *insn; if (!is64) { *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); *patch++ = BPF_MOV32_REG(insn->dst_reg, insn->dst_reg); } cnt = patch - insn_buf; } new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; goto next_insn; } /* Make it impossible to de-reference a userspace address */ if (BPF_CLASS(insn->code) == BPF_LDX && (BPF_MODE(insn->code) == BPF_PROBE_MEM || BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) { struct bpf_insn *patch = insn_buf; u64 uaddress_limit = bpf_arch_uaddress_limit(); if (!uaddress_limit) goto next_insn; *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg); if (insn->off) *patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off); *patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32); *patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2); *patch++ = *insn; *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); *patch++ = BPF_MOV64_IMM(insn->dst_reg, 0); cnt = patch - insn_buf; new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; goto next_insn; } /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */ if (BPF_CLASS(insn->code) == BPF_LD && (BPF_MODE(insn->code) == BPF_ABS || BPF_MODE(insn->code) == BPF_IND)) { cnt = env->ops->gen_ld_abs(insn, insn_buf); if (cnt == 0 || cnt >= INSN_BUF_SIZE) { verifier_bug(env, "%d insns generated for ld_abs", cnt); return -EFAULT; } new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; goto next_insn; } /* Rewrite pointer arithmetic to mitigate speculation attacks. */ if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) || insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) { const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X; const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X; struct bpf_insn *patch = insn_buf; bool issrc, isneg, isimm; u32 off_reg; aux = &env->insn_aux_data[i + delta]; if (!aux->alu_state || aux->alu_state == BPF_ALU_NON_POINTER) goto next_insn; isneg = aux->alu_state & BPF_ALU_NEG_VALUE; issrc = (aux->alu_state & BPF_ALU_SANITIZE) == BPF_ALU_SANITIZE_SRC; isimm = aux->alu_state & BPF_ALU_IMMEDIATE; off_reg = issrc ? insn->src_reg : insn->dst_reg; if (isimm) { *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit); } else { if (isneg) *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit); *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg); *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg); *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0); *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63); *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg); } if (!issrc) *patch++ = BPF_MOV64_REG(insn->dst_reg, insn->src_reg); insn->src_reg = BPF_REG_AX; if (isneg) insn->code = insn->code == code_add ? 
code_sub : code_add; *patch++ = *insn; if (issrc && isneg && !isimm) *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); cnt = patch - insn_buf; new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; goto next_insn; } if (bpf_is_may_goto_insn(insn) && bpf_jit_supports_timed_may_goto()) { int stack_off_cnt = -stack_depth - 16; /* * Two 8 byte slots, depth-16 stores the count, and * depth-8 stores the start timestamp of the loop. * * The starting value of count is BPF_MAX_TIMED_LOOPS * (0xffff). Every iteration loads it and subs it by 1, * until the value becomes 0 in AX (thus, 1 in stack), * after which we call arch_bpf_timed_may_goto, which * either sets AX to 0xffff to keep looping, or to 0 * upon timeout. AX is then stored into the stack. In * the next iteration, we either see 0 and break out, or * continue iterating until the next time value is 0 * after subtraction, rinse and repeat. */ stack_depth_extra = 16; insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off_cnt); if (insn->off >= 0) insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 5); else insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1); insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1); insn_buf[3] = BPF_JMP_IMM(BPF_JNE, BPF_REG_AX, 0, 2); /* * AX is used as an argument to pass in stack_off_cnt * (to add to r10/fp), and also as the return value of * the call to arch_bpf_timed_may_goto. */ insn_buf[4] = BPF_MOV64_IMM(BPF_REG_AX, stack_off_cnt); insn_buf[5] = BPF_EMIT_CALL(arch_bpf_timed_may_goto); insn_buf[6] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off_cnt); cnt = 7; new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; goto next_insn; } else if (bpf_is_may_goto_insn(insn)) { int stack_off = -stack_depth - 8; stack_depth_extra = 8; insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off); if (insn->off >= 0) insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2); else insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1); insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1); insn_buf[3] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off); cnt = 4; new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; goto next_insn; } if (insn->code != (BPF_JMP | BPF_CALL)) goto next_insn; if (insn->src_reg == BPF_PSEUDO_CALL) goto next_insn; if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { ret = bpf_fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt); if (ret) return ret; if (cnt == 0) goto next_insn; new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; goto next_insn; } /* Skip inlining the helper call if the JIT does it. 
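 * (bpf_jit_inlines_helper_call() reports helpers the JIT expands
 * itself, e.g. some JITs emit bpf_get_smp_processor_id() as a direct
 * per-CPU load, so the generic call handling below is skipped for
 * them.)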
*/ if (bpf_jit_inlines_helper_call(insn->imm)) goto next_insn; if (insn->imm == BPF_FUNC_get_route_realm) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) bpf_user_rnd_init_once(); if (insn->imm == BPF_FUNC_override_return) prog->kprobe_override = 1; if (insn->imm == BPF_FUNC_tail_call) { /* If we tail call into other programs, we * cannot make any assumptions since they can * be replaced dynamically during runtime in * the program array. */ prog->cb_access = 1; if (!bpf_allow_tail_call_in_subprogs(env)) prog->aux->stack_depth = MAX_BPF_STACK; prog->aux->max_pkt_offset = MAX_PACKET_OFF; /* mark bpf_tail_call as different opcode to avoid * conditional branch in the interpreter for every normal * call and to prevent accidental JITing by JIT compiler * that doesn't support bpf_tail_call yet */ insn->imm = 0; insn->code = BPF_JMP | BPF_TAIL_CALL; aux = &env->insn_aux_data[i + delta]; if (env->bpf_capable && !prog->blinding_requested && prog->jit_requested && !bpf_map_key_poisoned(aux) && !bpf_map_ptr_poisoned(aux) && !bpf_map_ptr_unpriv(aux)) { struct bpf_jit_poke_descriptor desc = { .reason = BPF_POKE_REASON_TAIL_CALL, .tail_call.map = aux->map_ptr_state.map_ptr, .tail_call.key = bpf_map_key_immediate(aux), .insn_idx = i + delta, }; ret = bpf_jit_add_poke_descriptor(prog, &desc); if (ret < 0) { verbose(env, "adding tail call poke descriptor failed\n"); return ret; } insn->imm = ret + 1; goto next_insn; } if (!bpf_map_ptr_unpriv(aux)) goto next_insn; /* instead of changing every JIT dealing with tail_call * emit two extra insns: * if (index >= max_entries) goto out; * index &= array->index_mask; * to avoid out-of-bounds cpu speculation */ if (bpf_map_ptr_poisoned(aux)) { verbose(env, "tail_call abusing map_ptr\n"); return -EINVAL; } map_ptr = aux->map_ptr_state.map_ptr; insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, map_ptr->max_entries, 2); insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, container_of(map_ptr, struct bpf_array, map)->index_mask); insn_buf[2] = *insn; cnt = 3; new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; goto next_insn; } if (insn->imm == BPF_FUNC_timer_set_callback) { /* The verifier will process callback_fn as many times as necessary * with different maps and the register states prepared by * set_timer_callback_state will be accurate. * * The following use case is valid: * map1 is shared by prog1, prog2, prog3. * prog1 calls bpf_timer_init for some map1 elements * prog2 calls bpf_timer_set_callback for some map1 elements. * Those that were not bpf_timer_init-ed will return -EINVAL. * prog3 calls bpf_timer_start for some map1 elements. * Those that were not both bpf_timer_init-ed and * bpf_timer_set_callback-ed will return -EINVAL. */ struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(BPF_REG_3, (long)prog->aux), }; insn_buf[0] = ld_addrs[0]; insn_buf[1] = ld_addrs[1]; insn_buf[2] = *insn; cnt = 3; new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; goto patch_call_imm; } /* bpf_per_cpu_ptr() and bpf_this_cpu_ptr() */ if (env->insn_aux_data[i + delta].call_with_percpu_alloc_ptr) { /* patch with 'r1 = *(u64 *)(r1 + 0)' since for percpu data, * bpf_mem_alloc() returns a ptr to the percpu data ptr. 
*/ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0); insn_buf[1] = *insn; cnt = 2; new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; goto patch_call_imm; } /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup * and other inlining handlers are currently limited to 64 bit * only. */ if (prog->jit_requested && BITS_PER_LONG == 64 && (insn->imm == BPF_FUNC_map_lookup_elem || insn->imm == BPF_FUNC_map_update_elem || insn->imm == BPF_FUNC_map_delete_elem || insn->imm == BPF_FUNC_map_push_elem || insn->imm == BPF_FUNC_map_pop_elem || insn->imm == BPF_FUNC_map_peek_elem || insn->imm == BPF_FUNC_redirect_map || insn->imm == BPF_FUNC_for_each_map_elem || insn->imm == BPF_FUNC_map_lookup_percpu_elem)) { aux = &env->insn_aux_data[i + delta]; if (bpf_map_ptr_poisoned(aux)) goto patch_call_imm; map_ptr = aux->map_ptr_state.map_ptr; ops = map_ptr->ops; if (insn->imm == BPF_FUNC_map_lookup_elem && ops->map_gen_lookup) { cnt = ops->map_gen_lookup(map_ptr, insn_buf); if (cnt == -EOPNOTSUPP) goto patch_map_ops_generic; if (cnt <= 0 || cnt >= INSN_BUF_SIZE) { verifier_bug(env, "%d insns generated for map lookup", cnt); return -EFAULT; } new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; goto next_insn; } BUILD_BUG_ON(!__same_type(ops->map_lookup_elem, (void *(*)(struct bpf_map *map, void *key))NULL)); BUILD_BUG_ON(!__same_type(ops->map_delete_elem, (long (*)(struct bpf_map *map, void *key))NULL)); BUILD_BUG_ON(!__same_type(ops->map_update_elem, (long (*)(struct bpf_map *map, void *key, void *value, u64 flags))NULL)); BUILD_BUG_ON(!__same_type(ops->map_push_elem, (long (*)(struct bpf_map *map, void *value, u64 flags))NULL)); BUILD_BUG_ON(!__same_type(ops->map_pop_elem, (long (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_peek_elem, (long (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_redirect, (long (*)(struct bpf_map *map, u64 index, u64 flags))NULL)); BUILD_BUG_ON(!__same_type(ops->map_for_each_callback, (long (*)(struct bpf_map *map, bpf_callback_t callback_fn, void *callback_ctx, u64 flags))NULL)); BUILD_BUG_ON(!__same_type(ops->map_lookup_percpu_elem, (void *(*)(struct bpf_map *map, void *key, u32 cpu))NULL)); patch_map_ops_generic: switch (insn->imm) { case BPF_FUNC_map_lookup_elem: insn->imm = BPF_CALL_IMM(ops->map_lookup_elem); goto next_insn; case BPF_FUNC_map_update_elem: insn->imm = BPF_CALL_IMM(ops->map_update_elem); goto next_insn; case BPF_FUNC_map_delete_elem: insn->imm = BPF_CALL_IMM(ops->map_delete_elem); goto next_insn; case BPF_FUNC_map_push_elem: insn->imm = BPF_CALL_IMM(ops->map_push_elem); goto next_insn; case BPF_FUNC_map_pop_elem: insn->imm = BPF_CALL_IMM(ops->map_pop_elem); goto next_insn; case BPF_FUNC_map_peek_elem: insn->imm = BPF_CALL_IMM(ops->map_peek_elem); goto next_insn; case BPF_FUNC_redirect_map: insn->imm = BPF_CALL_IMM(ops->map_redirect); goto next_insn; case BPF_FUNC_for_each_map_elem: insn->imm = BPF_CALL_IMM(ops->map_for_each_callback); goto next_insn; case BPF_FUNC_map_lookup_percpu_elem: insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem); goto next_insn; } goto patch_call_imm; } /* Implement bpf_jiffies64 inline. 
*/ if (prog->jit_requested && BITS_PER_LONG == 64 && insn->imm == BPF_FUNC_jiffies64) { struct bpf_insn ld_jiffies_addr[2] = { BPF_LD_IMM64(BPF_REG_0, (unsigned long)&jiffies), }; insn_buf[0] = ld_jiffies_addr[0]; insn_buf[1] = ld_jiffies_addr[1]; insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); cnt = 3; new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; goto next_insn; } #if defined(CONFIG_X86_64) && !defined(CONFIG_UML) /* Implement bpf_get_smp_processor_id() inline. */ if (insn->imm == BPF_FUNC_get_smp_processor_id && bpf_verifier_inlines_helper_call(env, insn->imm)) { /* BPF_FUNC_get_smp_processor_id inlining is an * optimization, so if cpu_number is ever * changed in some incompatible and hard to support * way, it's fine to back out this inlining logic */ #ifdef CONFIG_SMP insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)&cpu_number); insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0); cnt = 3; #else insn_buf[0] = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0); cnt = 1; #endif new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; goto next_insn; } /* Implement bpf_get_current_task() and bpf_get_current_task_btf() inline. */ if ((insn->imm == BPF_FUNC_get_current_task || insn->imm == BPF_FUNC_get_current_task_btf) && bpf_verifier_inlines_helper_call(env, insn->imm)) { insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)&current_task); insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); cnt = 3; new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; goto next_insn; } #endif /* Implement bpf_get_func_arg inline. */ if (prog_type == BPF_PROG_TYPE_TRACING && insn->imm == BPF_FUNC_get_func_arg) { if (eatype == BPF_TRACE_RAW_TP) { int nr_args = btf_type_vlen(prog->aux->attach_func_proto); /* skip 'void *__data' in btf_trace_##name() and save to reg0 */ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1); cnt = 1; } else { /* Load nr_args from ctx - 8 */ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF); cnt = 2; } insn_buf[cnt++] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6); insn_buf[cnt++] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3); insn_buf[cnt++] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1); insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0); insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0); insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_0, 0); insn_buf[cnt++] = BPF_JMP_A(1); insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL); new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; goto next_insn; } /* Implement bpf_get_func_ret inline.
*/ if (prog_type == BPF_PROG_TYPE_TRACING && insn->imm == BPF_FUNC_get_func_ret) { if (eatype == BPF_TRACE_FEXIT || eatype == BPF_TRACE_FSESSION || eatype == BPF_MODIFY_RETURN) { /* Load nr_args from ctx - 8 */ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF); insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1); insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0); insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0); insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0); cnt = 7; } else { insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, -EOPNOTSUPP); cnt = 1; } new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; goto next_insn; } /* Implement get_func_arg_cnt inline. */ if (prog_type == BPF_PROG_TYPE_TRACING && insn->imm == BPF_FUNC_get_func_arg_cnt) { if (eatype == BPF_TRACE_RAW_TP) { int nr_args = btf_type_vlen(prog->aux->attach_func_proto); /* skip 'void *__data' in btf_trace_##name() and save to reg0 */ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1); cnt = 1; } else { /* Load nr_args from ctx - 8 */ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF); cnt = 2; } new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; goto next_insn; } /* Implement bpf_get_func_ip inline. */ if (prog_type == BPF_PROG_TYPE_TRACING && insn->imm == BPF_FUNC_get_func_ip) { /* Load IP address from ctx - 16 */ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -16); new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1); if (!new_prog) return -ENOMEM; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; goto next_insn; } /* Implement bpf_get_branch_snapshot inline. */ if (IS_ENABLED(CONFIG_PERF_EVENTS) && prog->jit_requested && BITS_PER_LONG == 64 && insn->imm == BPF_FUNC_get_branch_snapshot) { /* We are dealing with the following func protos: * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags); * int perf_snapshot_branch_stack(struct perf_branch_entry *entries, u32 cnt); */ const u32 br_entry_size = sizeof(struct perf_branch_entry); /* struct perf_branch_entry is part of UAPI and is * used as an array element, so extremely unlikely to * ever grow or shrink */ BUILD_BUG_ON(br_entry_size != 24); /* if (unlikely(flags)) return -EINVAL */ insn_buf[0] = BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 0, 7); /* Transform size (bytes) into number of entries (cnt = size / 24). * But to avoid expensive division instruction, we implement * divide-by-3 through multiplication, followed by further * division by 8 through 3-bit right shift. * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr., * p. 227, chapter "Unsigned Division by 3" for details and proofs. * * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab. 
*/ insn_buf[1] = BPF_MOV32_IMM(BPF_REG_0, 0xaaaaaaab); insn_buf[2] = BPF_ALU64_REG(BPF_MUL, BPF_REG_2, BPF_REG_0); insn_buf[3] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36); /* call perf_snapshot_branch_stack implementation */ insn_buf[4] = BPF_EMIT_CALL(static_call_query(perf_snapshot_branch_stack)); /* if (entry_cnt == 0) return -ENOENT */ insn_buf[5] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4); /* return entry_cnt * sizeof(struct perf_branch_entry) */ insn_buf[6] = BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, br_entry_size); insn_buf[7] = BPF_JMP_A(3); /* return -EINVAL; */ insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL); insn_buf[9] = BPF_JMP_A(1); /* return -ENOENT; */ insn_buf[10] = BPF_MOV64_IMM(BPF_REG_0, -ENOENT); cnt = 11; new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; goto next_insn; } /* Implement bpf_kptr_xchg inline */ if (prog->jit_requested && BITS_PER_LONG == 64 && insn->imm == BPF_FUNC_kptr_xchg && bpf_jit_supports_ptr_xchg()) { insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_2); insn_buf[1] = BPF_ATOMIC_OP(BPF_DW, BPF_XCHG, BPF_REG_1, BPF_REG_0, 0); cnt = 2; new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; goto next_insn; } patch_call_imm: fn = env->ops->get_func_proto(insn->imm, env->prog); /* all functions that have prototype and verifier allowed * programs to call them, must be real in-kernel functions */ if (!fn->func) { verifier_bug(env, "not inlined functions %s#%d is missing func", func_id_name(insn->imm), insn->imm); return -EFAULT; } insn->imm = fn->func - __bpf_call_base; next_insn: if (subprogs[cur_subprog + 1].start == i + delta + 1) { subprogs[cur_subprog].stack_depth += stack_depth_extra; subprogs[cur_subprog].stack_extra = stack_depth_extra; stack_depth = subprogs[cur_subprog].stack_depth; if (stack_depth > MAX_BPF_STACK && !prog->jit_requested) { verbose(env, "stack size %d(extra %d) is too large\n", stack_depth, stack_depth_extra); return -EINVAL; } cur_subprog++; stack_depth = subprogs[cur_subprog].stack_depth; stack_depth_extra = 0; } i++; insn++; } env->prog->aux->stack_depth = subprogs[0].stack_depth; for (i = 0; i < env->subprog_cnt; i++) { int delta = bpf_jit_supports_timed_may_goto() ? 2 : 1; int subprog_start = subprogs[i].start; int stack_slots = subprogs[i].stack_extra / 8; int slots = delta, cnt = 0; if (!stack_slots) continue; /* We need two slots in case timed may_goto is supported. */ if (stack_slots > slots) { verifier_bug(env, "stack_slots supports may_goto only"); return -EFAULT; } stack_depth = subprogs[i].stack_depth; if (bpf_jit_supports_timed_may_goto()) { insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth, BPF_MAX_TIMED_LOOPS); insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth + 8, 0); } else { /* Add ST insn to subprog prologue to init extra stack */ insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth, BPF_MAX_LOOPS); } /* Copy first actual insn to preserve it */ insn_buf[cnt++] = env->prog->insnsi[subprog_start]; new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, cnt); if (!new_prog) return -ENOMEM; env->prog = prog = new_prog; /* * If may_goto is a first insn of a prog there could be a jmp * insn that points to it, hence adjust all such jmps to point * to insn after BPF_ST that inits may_goto count. 
* Adjustment will succeed because bpf_patch_insn_data() didn't fail. */ WARN_ON(adjust_jmp_off(env->prog, subprog_start, delta)); } /* Since poke tab is now finalized, publish aux to tracker. */ for (i = 0; i < prog->aux->size_poke_tab; i++) { map_ptr = prog->aux->poke_tab[i].tail_call.map; if (!map_ptr->ops->map_poke_track || !map_ptr->ops->map_poke_untrack || !map_ptr->ops->map_poke_run) { verifier_bug(env, "poke tab is misconfigured"); return -EFAULT; } ret = map_ptr->ops->map_poke_track(map_ptr, prog->aux); if (ret < 0) { verbose(env, "tracking tail call prog failed\n"); return ret; } } ret = sort_kfunc_descs_by_imm_off(env); if (ret) return ret; return 0; } static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env, int position, s32 stack_base, u32 callback_subprogno, u32 *total_cnt) { s32 r6_offset = stack_base + 0 * BPF_REG_SIZE; s32 r7_offset = stack_base + 1 * BPF_REG_SIZE; s32 r8_offset = stack_base + 2 * BPF_REG_SIZE; int reg_loop_max = BPF_REG_6; int reg_loop_cnt = BPF_REG_7; int reg_loop_ctx = BPF_REG_8; struct bpf_insn *insn_buf = env->insn_buf; struct bpf_prog *new_prog; u32 callback_start; u32 call_insn_offset; s32 callback_offset; u32 cnt = 0; /* This represents an inlined version of bpf_iter.c:bpf_loop, * be careful to modify this code in sync. */ /* Return error and jump to the end of the patch if * expected number of iterations is too big. */ insn_buf[cnt++] = BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2); insn_buf[cnt++] = BPF_MOV32_IMM(BPF_REG_0, -E2BIG); insn_buf[cnt++] = BPF_JMP_IMM(BPF_JA, 0, 0, 16); /* spill R6, R7, R8 to use these as loop vars */ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset); insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset); insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset); /* initialize loop vars */ insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_max, BPF_REG_1); insn_buf[cnt++] = BPF_MOV32_IMM(reg_loop_cnt, 0); insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3); /* loop header, * if reg_loop_cnt >= reg_loop_max skip the loop body */ insn_buf[cnt++] = BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5); /* callback call, * correct callback offset would be set after patching */ insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt); insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx); insn_buf[cnt++] = BPF_CALL_REL(0); /* increment loop counter */ insn_buf[cnt++] = BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1); /* jump to loop header if callback returned 0 */ insn_buf[cnt++] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6); /* return value of bpf_loop, * set R0 to the number of iterations */ insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt); /* restore original values of R6, R7, R8 */ insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset); insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset); insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset); *total_cnt = cnt; new_prog = bpf_patch_insn_data(env, position, insn_buf, cnt); if (!new_prog) return new_prog; /* callback start is known only after patching */ callback_start = env->subprog_info[callback_subprogno].start; /* Note: insn_buf[12] is an offset of BPF_CALL_REL instruction */ call_insn_offset = position + 12; callback_offset = callback_start - call_insn_offset - 1; new_prog->insnsi[call_insn_offset].imm = callback_offset; return new_prog; } static bool is_bpf_loop_call(struct bpf_insn *insn) { return insn->code == (BPF_JMP | BPF_CALL) && insn->src_reg == 
0 && insn->imm == BPF_FUNC_loop; } /* For all sub-programs in the program (including main) check * insn_aux_data to see if there are bpf_loop calls that require * inlining. If such calls are found the calls are replaced with a * sequence of instructions produced by `inline_bpf_loop` function and * subprog stack_depth is increased by the size of 3 registers. * This stack space is used to spill values of the R6, R7, R8. These * registers are used to store the loop bound, counter and context * variables. */ int bpf_optimize_bpf_loop(struct bpf_verifier_env *env) { struct bpf_subprog_info *subprogs = env->subprog_info; int i, cur_subprog = 0, cnt, delta = 0; struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; u16 stack_depth = subprogs[cur_subprog].stack_depth; u16 stack_depth_roundup = round_up(stack_depth, 8) - stack_depth; u16 stack_depth_extra = 0; for (i = 0; i < insn_cnt; i++, insn++) { struct bpf_loop_inline_state *inline_state = &env->insn_aux_data[i + delta].loop_inline_state; if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) { struct bpf_prog *new_prog; stack_depth_extra = BPF_REG_SIZE * 3 + stack_depth_roundup; new_prog = inline_bpf_loop(env, i + delta, -(stack_depth + stack_depth_extra), inline_state->callback_subprogno, &cnt); if (!new_prog) return -ENOMEM; delta += cnt - 1; env->prog = new_prog; insn = new_prog->insnsi + i + delta; } if (subprogs[cur_subprog + 1].start == i + delta + 1) { subprogs[cur_subprog].stack_depth += stack_depth_extra; cur_subprog++; stack_depth = subprogs[cur_subprog].stack_depth; stack_depth_roundup = round_up(stack_depth, 8) - stack_depth; stack_depth_extra = 0; } } env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; return 0; } /* Remove unnecessary spill/fill pairs, members of fastcall pattern, * adjust subprograms stack depth when possible. */ int bpf_remove_fastcall_spills_fills(struct bpf_verifier_env *env) { struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn_aux_data *aux = env->insn_aux_data; struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; u32 spills_num; bool modified = false; int i, j; for (i = 0; i < insn_cnt; i++, insn++) { if (aux[i].fastcall_spills_num > 0) { spills_num = aux[i].fastcall_spills_num; /* NOPs would be removed by opt_remove_nops() */ for (j = 1; j <= spills_num; ++j) { *(insn - j) = NOP; *(insn + j) = NOP; } modified = true; } if ((subprog + 1)->start == i + 1) { if (modified && !subprog->keep_fastcall_stack) subprog->stack_depth = -subprog->fastcall_stack_off; subprog++; modified = false; } } return 0; } |
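The size-to-entry-count conversion used in the bpf_get_branch_snapshot rewrite above can be checked in isolation. The following stand-alone sketch (ordinary user-space C, not kernel code) reproduces the multiply-and-shift sequence emitted by the patch and asserts it matches a plain division by sizeof(struct perf_branch_entry) == 24:

#include <assert.h>
#include <stdint.h>

/* Mirrors insn_buf[1..3] of the patched sequence: multiply by
 * 0xaaaaaaab (= (2^33 + 1) / 3), then shift right by 36, i.e.
 * divide by 3 and then by 8, giving size / 24 without a DIV insn. */
static uint64_t branch_entries_from_size(uint32_t size)
{
	return ((uint64_t)size * 0xaaaaaaabULL) >> 36;
}

int main(void)
{
	for (uint32_t size = 0; size < (1u << 20); size++)
		assert(branch_entries_from_size(size) == size / 24);
	return 0;
}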
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/balloc.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 */ #include <linux/time.h> #include <linux/capability.h> #include <linux/fs.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include "ext4.h" #include "ext4_jbd2.h" #include "mballoc.h" #include <trace/events/ext4.h> #include <kunit/static_stub.h> static unsigned ext4_num_base_meta_clusters(struct super_block *sb, ext4_group_t block_group); /* * balloc.c contains the blocks allocation and deallocation routines */ /* * Calculate block group number for a given block number */ ext4_group_t ext4_get_group_number(struct super_block *sb, ext4_fsblk_t block) { ext4_group_t group; if (test_opt2(sb, STD_GROUP_SIZE)) group = (block - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >> (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3); else ext4_get_group_no_and_offset(sb, block, &group, NULL); return group; } /* * Calculate the block group number and offset into the block/cluster * allocation bitmap, given a block number */ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; ext4_grpblk_t offset; blocknr = blocknr - le32_to_cpu(es->s_first_data_block); offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >> EXT4_SB(sb)->s_cluster_bits; if (offsetp) *offsetp = offset; if (blockgrpp) *blockgrpp = blocknr; } /* * Check whether the 'block' lives within the 'block_group'. Returns 1 if so * and 0 otherwise. */ static inline int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, ext4_group_t block_group) { ext4_group_t actual_group; actual_group = ext4_get_group_number(sb, block); return (actual_group == block_group) ? 1 : 0; } /* * Return the number of clusters used for file system metadata; this * represents the overhead needed by the file system.
*/ static unsigned ext4_num_overhead_clusters(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { unsigned base_clusters, num_clusters; int block_cluster = -1, inode_cluster; int itbl_cluster_start = -1, itbl_cluster_end = -1; ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group); ext4_fsblk_t end = start + EXT4_BLOCKS_PER_GROUP(sb) - 1; ext4_fsblk_t itbl_blk_start, itbl_blk_end; struct ext4_sb_info *sbi = EXT4_SB(sb); /* This is the number of clusters used by the superblock, * block group descriptors, and reserved block group * descriptor blocks */ base_clusters = ext4_num_base_meta_clusters(sb, block_group); num_clusters = base_clusters; /* * Account and record inode table clusters if any cluster * is in the block group, or inode table cluster range is * [-1, -1] and won't overlap with block/inode bitmap cluster * accounted below. */ itbl_blk_start = ext4_inode_table(sb, gdp); itbl_blk_end = itbl_blk_start + sbi->s_itb_per_group - 1; if (itbl_blk_start <= end && itbl_blk_end >= start) { itbl_blk_start = max(itbl_blk_start, start); itbl_blk_end = min(itbl_blk_end, end); itbl_cluster_start = EXT4_B2C(sbi, itbl_blk_start - start); itbl_cluster_end = EXT4_B2C(sbi, itbl_blk_end - start); num_clusters += itbl_cluster_end - itbl_cluster_start + 1; /* check if border cluster is overlapped */ if (itbl_cluster_start == base_clusters - 1) num_clusters--; } /* * For the allocation bitmaps, we first need to check to see * if the block is in the block group. If it is, then check * to see if the cluster is already accounted for in the clusters * used for the base metadata cluster and inode tables cluster. * Normally all of these blocks are contiguous, so the special * case handling shouldn't be necessary except for *very* * unusual file system layouts. */ if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) { block_cluster = EXT4_B2C(sbi, ext4_block_bitmap(sb, gdp) - start); if (block_cluster >= base_clusters && (block_cluster < itbl_cluster_start || block_cluster > itbl_cluster_end)) num_clusters++; } if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) { inode_cluster = EXT4_B2C(sbi, ext4_inode_bitmap(sb, gdp) - start); /* * Additional check if inode bitmap is in just accounted * block_cluster */ if (inode_cluster != block_cluster && inode_cluster >= base_clusters && (inode_cluster < itbl_cluster_start || inode_cluster > itbl_cluster_end)) num_clusters++; } return num_clusters; } static unsigned int num_clusters_in_group(struct super_block *sb, ext4_group_t block_group) { unsigned int blocks; if (block_group == ext4_get_groups_count(sb) - 1) { /* * Even though mke2fs always initializes the first and * last group, just in case some other tool was used, * we need to make sure we calculate the right free * blocks. 
*/ blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) - ext4_group_first_block_no(sb, block_group); } else blocks = EXT4_BLOCKS_PER_GROUP(sb); return EXT4_NUM_B2C(EXT4_SB(sb), blocks); } /* Initializes an uninitialized block bitmap */ static int ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_group_t block_group, struct ext4_group_desc *gdp) { unsigned int bit, bit_max; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t start, tmp; ASSERT(buffer_locked(bh)); if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT | EXT4_GROUP_INFO_IBITMAP_CORRUPT); return -EFSBADCRC; } memset(bh->b_data, 0, sb->s_blocksize); bit_max = ext4_num_base_meta_clusters(sb, block_group); if ((bit_max >> 3) >= bh->b_size) return -EFSCORRUPTED; for (bit = 0; bit < bit_max; bit++) ext4_set_bit(bit, bh->b_data); start = ext4_group_first_block_no(sb, block_group); /* Set bits for block and inode bitmaps, and inode table */ tmp = ext4_block_bitmap(sb, gdp); if (ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); tmp = ext4_inode_bitmap(sb, gdp); if (ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); tmp = ext4_inode_table(sb, gdp); for (; tmp < ext4_inode_table(sb, gdp) + sbi->s_itb_per_group; tmp++) { if (ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); } /* * Also if the number of blocks within the group is less than * the blocksize * 8 ( which is the size of bitmap ), set rest * of the block bitmap to 1 */ ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group), sb->s_blocksize * 8, bh->b_data); return 0; } /* Return the number of free blocks in a block group. It is used when * the block bitmap is uninitialized, so we can't just count the bits * in the bitmap. */ unsigned ext4_free_clusters_after_init(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { return num_clusters_in_group(sb, block_group) - ext4_num_overhead_clusters(sb, block_group, gdp); } /* * The free blocks are managed by bitmaps. A file system contains several * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap * block for inodes, N blocks for the inode table and data blocks. * * The file system contains group descriptors which are located after the * super block. Each descriptor contains the number of the bitmap block and * the free blocks count in the block. The descriptors are loaded in memory * when a file system is mounted (see ext4_fill_super). 
*/ /** * ext4_get_group_desc() -- load group descriptor from disk * @sb: super block * @block_group: given block group * @bh: pointer to the buffer head to store the block * group descriptor */ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, ext4_group_t block_group, struct buffer_head **bh) { unsigned int group_desc; unsigned int offset; ext4_group_t ngroups = ext4_get_groups_count(sb); struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh_p; KUNIT_STATIC_STUB_REDIRECT(ext4_get_group_desc, sb, block_group, bh); if (block_group >= ngroups) { ext4_error(sb, "block_group >= groups_count - block_group = %u," " groups_count = %u", block_group, ngroups); return NULL; } group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); bh_p = sbi_array_rcu_deref(sbi, s_group_desc, group_desc); /* * sbi_array_rcu_deref returns with rcu unlocked, this is ok since * the pointer being dereferenced won't be dereferenced again. By * looking at the usage in add_new_gdb() the value isn't modified, * just the pointer, and so it remains valid. */ if (!bh_p) { ext4_error(sb, "Group descriptor not loaded - " "block_group = %u, group_desc = %u, desc = %u", block_group, group_desc, offset); return NULL; } desc = (struct ext4_group_desc *)( (__u8 *)bh_p->b_data + offset * EXT4_DESC_SIZE(sb)); if (bh) *bh = bh_p; return desc; } static ext4_fsblk_t ext4_valid_block_bitmap_padding(struct super_block *sb, ext4_group_t block_group, struct buffer_head *bh) { ext4_grpblk_t next_zero_bit; unsigned long bitmap_size = sb->s_blocksize * 8; unsigned int offset = num_clusters_in_group(sb, block_group); if (bitmap_size <= offset) return 0; next_zero_bit = ext4_find_next_zero_bit(bh->b_data, bitmap_size, offset); return (next_zero_bit < bitmap_size ? next_zero_bit : 0); } struct ext4_group_info *ext4_get_group_info(struct super_block *sb, ext4_group_t group) { struct ext4_group_info **grp_info; long indexv, indexh; if (unlikely(group >= EXT4_SB(sb)->s_groups_count)) return NULL; indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv); return grp_info[indexh]; } /* * Return the block number which was discovered to be invalid, or 0 if * the block bitmap is valid. */ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc, ext4_group_t block_group, struct buffer_head *bh) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t offset; ext4_grpblk_t next_zero_bit; ext4_grpblk_t max_bit = EXT4_CLUSTERS_PER_GROUP(sb); ext4_fsblk_t blk; ext4_fsblk_t group_first_block; if (ext4_has_feature_flex_bg(sb)) { /* with FLEX_BG, the inode/block bitmaps and itable * blocks may not be in the group at all * so the bitmap validation will be skipped for those groups * or it has to also read the block group where the bitmaps * are located to verify they are set. 
*/ return 0; } group_first_block = ext4_group_first_block_no(sb, block_group); /* check whether block bitmap block number is set */ blk = ext4_block_bitmap(sb, desc); offset = blk - group_first_block; if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; /* check whether the inode bitmap block number is set */ blk = ext4_inode_bitmap(sb, desc); offset = blk - group_first_block; if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; /* check whether the inode table block number is set */ blk = ext4_inode_table(sb, desc); offset = blk - group_first_block; if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) >= max_bit) return blk; next_zero_bit = ext4_find_next_zero_bit(bh->b_data, EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1, EXT4_B2C(sbi, offset)); if (next_zero_bit < EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1) /* bad bitmap for inode tables */ return blk; return 0; } static int ext4_validate_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc, ext4_group_t block_group, struct buffer_head *bh) { ext4_fsblk_t blk; struct ext4_group_info *grp; if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) return 0; grp = ext4_get_group_info(sb, block_group); if (buffer_verified(bh)) return 0; if (!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) return -EFSCORRUPTED; ext4_lock_group(sb, block_group); if (buffer_verified(bh)) goto verified; if (unlikely(!ext4_block_bitmap_csum_verify(sb, desc, bh) || ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSBADCRC; } blk = ext4_valid_block_bitmap(sb, desc, block_group, bh); if (unlikely(blk != 0)) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: block %llu: invalid block bitmap", block_group, blk); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSCORRUPTED; } blk = ext4_valid_block_bitmap_padding(sb, block_group, bh); if (unlikely(blk != 0)) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: block %llu: padding at end of block bitmap is not set", block_group, blk); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSCORRUPTED; } set_buffer_verified(bh); verified: ext4_unlock_group(sb, block_group); return 0; } /** * ext4_read_block_bitmap_nowait() * @sb: super block * @block_group: given block group * @ignore_locked: ignore locked buffers * * Read the bitmap for a given block_group,and validate the * bits for block/inode/inode tables are set in the bitmaps * * Return buffer_head on success or an ERR_PTR in case of failure. 
*/ struct buffer_head * ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, bool ignore_locked) { struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh; ext4_fsblk_t bitmap_blk; int err; KUNIT_STATIC_STUB_REDIRECT(ext4_read_block_bitmap_nowait, sb, block_group, ignore_locked); desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) return ERR_PTR(-EFSCORRUPTED); bitmap_blk = ext4_block_bitmap(sb, desc); if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || (bitmap_blk >= ext4_blocks_count(sbi->s_es))) { ext4_error(sb, "Invalid block bitmap block %llu in " "block_group %u", bitmap_blk, block_group); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return ERR_PTR(-EFSCORRUPTED); } bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { ext4_warning(sb, "Cannot get buffer for block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, bitmap_blk); return ERR_PTR(-ENOMEM); } if (ignore_locked && buffer_locked(bh)) { /* buffer under IO already, return if called for prefetching */ put_bh(bh); return NULL; } if (bitmap_uptodate(bh)) goto verify; lock_buffer(bh); if (bitmap_uptodate(bh)) { unlock_buffer(bh); goto verify; } ext4_lock_group(sb, block_group); if (ext4_has_group_desc_csum(sb) && (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { if (block_group == 0) { ext4_unlock_group(sb, block_group); unlock_buffer(bh); ext4_error(sb, "Block bitmap for bg 0 marked " "uninitialized"); err = -EFSCORRUPTED; goto out; } err = ext4_init_block_bitmap(sb, bh, block_group, desc); if (err) { ext4_unlock_group(sb, block_group); unlock_buffer(bh); ext4_error(sb, "Failed to init block bitmap for group " "%u: %d", block_group, err); goto out; } set_bitmap_uptodate(bh); set_buffer_uptodate(bh); set_buffer_verified(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); return bh; } ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { /* * if not uninit if bh is uptodate, * bitmap is also uptodate */ set_bitmap_uptodate(bh); unlock_buffer(bh); goto verify; } /* * submit the buffer_head for reading */ set_buffer_new(bh); trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked); ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO | (ignore_locked ? 
REQ_RAHEAD : 0), ext4_end_bitmap_read, ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_EIO)); return bh; verify: err = ext4_validate_block_bitmap(sb, desc, block_group, bh); if (err) goto out; return bh; out: put_bh(bh); return ERR_PTR(err); } /* Returns 0 on success, -errno on error */ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, struct buffer_head *bh) { struct ext4_group_desc *desc; KUNIT_STATIC_STUB_REDIRECT(ext4_wait_block_bitmap, sb, block_group, bh); if (!buffer_new(bh)) return 0; desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) return -EFSCORRUPTED; wait_on_buffer(bh); if (!buffer_uptodate(bh)) { ext4_error_err(sb, EIO, "Cannot read block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, (unsigned long long) bh->b_blocknr); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EIO; } clear_buffer_new(bh); /* Panic or remount fs read-only if block bitmap is invalid */ return ext4_validate_block_bitmap(sb, desc, block_group, bh); } struct buffer_head * ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) { struct buffer_head *bh; int err; bh = ext4_read_block_bitmap_nowait(sb, block_group, false); if (IS_ERR(bh)) return bh; err = ext4_wait_block_bitmap(sb, block_group, bh); if (err) { put_bh(bh); return ERR_PTR(err); } return bh; } /** * ext4_has_free_clusters() * @sbi: in-core super block structure. * @nclusters: number of needed blocks * @flags: flags from ext4_mb_new_blocks() * * Check if filesystem has nclusters free & available for allocation. * On success return 1, return 0 on failure. */ static int ext4_has_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags) { s64 free_clusters, dirty_clusters, rsv, resv_clusters; struct percpu_counter *fcc = &sbi->s_freeclusters_counter; struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter; free_clusters = percpu_counter_read_positive(fcc); dirty_clusters = percpu_counter_read_positive(dcc); resv_clusters = atomic64_read(&sbi->s_resv_clusters); /* * r_blocks_count should always be multiple of the cluster ratio so * we are safe to do a plane bit shift only. */ rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) + resv_clusters; if (free_clusters - (nclusters + rsv + dirty_clusters) < EXT4_FREECLUSTERS_WATERMARK) { free_clusters = percpu_counter_sum_positive(fcc); dirty_clusters = percpu_counter_sum_positive(dcc); } /* Check whether we have space after accounting for current * dirty clusters & root reserved clusters. */ if (free_clusters >= (rsv + nclusters + dirty_clusters)) return 1; /* Hm, nope. Are (enough) root reserved clusters available? */ if (uid_eq(sbi->s_resuid, current_fsuid()) || (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || (flags & EXT4_MB_USE_ROOT_BLOCKS) || capable(CAP_SYS_RESOURCE)) { if (free_clusters >= (nclusters + dirty_clusters + resv_clusters)) return 1; } /* No free blocks. 
Let's see if we can dip into reserved pool */ if (flags & EXT4_MB_USE_RESERVED) { if (free_clusters >= (nclusters + dirty_clusters)) return 1; } return 0; } int ext4_claim_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags) { if (ext4_has_free_clusters(sbi, nclusters, flags)) { percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters); return 0; } else return -ENOSPC; } /** * ext4_should_retry_alloc() - check if a block allocation should be retried * @sb: superblock * @retries: number of retry attempts made so far * * ext4_should_retry_alloc() is called when ENOSPC is returned while * attempting to allocate blocks. If there's an indication that a pending * journal transaction might free some space and allow another attempt to * succeed, this function will wait for the current or committing transaction * to complete and then return TRUE. */ int ext4_should_retry_alloc(struct super_block *sb, int *retries) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (!sbi->s_journal) return 0; if (++(*retries) > 3) { percpu_counter_inc(&sbi->s_sra_exceeded_retry_limit); return 0; } /* * if there's no indication that blocks are about to be freed it's * possible we just missed a transaction commit that did so */ smp_mb(); if (atomic_read(&sbi->s_mb_free_pending) == 0) { if (test_opt(sb, DISCARD)) { atomic_inc(&sbi->s_retry_alloc_pending); flush_work(&sbi->s_discard_work); atomic_dec(&sbi->s_retry_alloc_pending); } return ext4_has_free_clusters(sbi, 1, 0); } /* * it's possible we've just missed a transaction commit here, * so ignore the returned status */ ext4_debug("%s: retrying operation after ENOSPC\n", sb->s_id); (void) jbd2_journal_force_commit_nested(sbi->s_journal); return 1; } /* * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks * * @handle: handle to this transaction * @inode: file inode * @goal: given target block(filesystem wide) * @count: pointer to total number of clusters needed * @errp: error code * * Return 1st allocated block number on success, *count stores total account * error stores in errp pointer */ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned int flags, unsigned long *count, int *errp) { struct ext4_allocation_request ar; ext4_fsblk_t ret; memset(&ar, 0, sizeof(ar)); /* Fill with neighbour allocated blocks */ ar.inode = inode; ar.goal = goal; ar.len = count ? *count : 1; ar.flags = flags; ret = ext4_mb_new_blocks(handle, &ar, errp); if (count) *count = ar.len; /* * Account for the allocated meta blocks. We will never * fail EDQUOT for metadata, but we do account for it. */ if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) { dquot_alloc_block_nofail(inode, EXT4_C2B(EXT4_SB(inode->i_sb), ar.len)); } return ret; } /** * ext4_count_free_clusters() -- count filesystem free clusters * @sb: superblock * * Adds up the number of free clusters from each block group. 
*/ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) { ext4_fsblk_t desc_count; struct ext4_group_desc *gdp; ext4_group_t i; ext4_group_t ngroups = ext4_get_groups_count(sb); struct ext4_group_info *grp; #ifdef EXT4FS_DEBUG struct ext4_super_block *es; ext4_fsblk_t bitmap_count; unsigned int x; struct buffer_head *bitmap_bh = NULL; es = EXT4_SB(sb)->s_es; desc_count = 0; bitmap_count = 0; gdp = NULL; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; grp = NULL; if (EXT4_SB(sb)->s_group_info) grp = ext4_get_group_info(sb, i); if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) desc_count += ext4_free_group_clusters(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_block_bitmap(sb, i); if (IS_ERR(bitmap_bh)) { bitmap_bh = NULL; continue; } x = ext4_count_free(bitmap_bh->b_data, EXT4_CLUSTERS_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", i, ext4_free_group_clusters(sb, gdp), x); bitmap_count += x; } brelse(bitmap_bh); printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" ", computed = %llu, %llu\n", EXT4_NUM_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)), desc_count, bitmap_count); return bitmap_count; #else desc_count = 0; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; grp = NULL; if (EXT4_SB(sb)->s_group_info) grp = ext4_get_group_info(sb, i); if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) desc_count += ext4_free_group_clusters(sb, gdp); } return desc_count; #endif } static inline int test_root(ext4_group_t a, int b) { while (1) { if (a < b) return 0; if (a == b) return 1; if ((a % b) != 0) return 0; a = a / b; } } /** * ext4_bg_has_super - number of blocks used by the superblock in group * @sb: superblock for filesystem * @group: group number to check * * Return the number of blocks used by the superblock (primary or backup) * in this group. Currently this will be only 0 or 1. */ int ext4_bg_has_super(struct super_block *sb, ext4_group_t group) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; if (group == 0) return 1; if (ext4_has_feature_sparse_super2(sb)) { if (group == le32_to_cpu(es->s_backup_bgs[0]) || group == le32_to_cpu(es->s_backup_bgs[1])) return 1; return 0; } if ((group <= 1) || !ext4_has_feature_sparse_super(sb)) return 1; if (!(group & 1)) return 0; if (test_root(group, 3) || (test_root(group, 5)) || test_root(group, 7)) return 1; return 0; } static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, ext4_group_t group) { unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); ext4_group_t first = metagroup * EXT4_DESC_PER_BLOCK(sb); ext4_group_t last = first + EXT4_DESC_PER_BLOCK(sb) - 1; if (group == first || group == first + 1 || group == last) return 1; return 0; } static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, ext4_group_t group) { if (!ext4_bg_has_super(sb, group)) return 0; if (ext4_has_feature_meta_bg(sb)) return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); else return EXT4_SB(sb)->s_gdb_count; } /** * ext4_bg_num_gdb - number of blocks used by the group table in group * @sb: superblock for filesystem * @group: group number to check * * Return the number of blocks used by the group descriptor table * (primary or backup) in this group. In the future there may be a * different number of descriptor blocks in each group. 
*/ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) { unsigned long first_meta_bg = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) return ext4_bg_num_gdb_nometa(sb, group); return ext4_bg_num_gdb_meta(sb,group); } /* * This function returns the number of file system metadata blocks at * the beginning of a block group, including the reserved gdt blocks. */ unsigned int ext4_num_base_meta_blocks(struct super_block *sb, ext4_group_t block_group) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned num; /* Check for superblock and gdt backups in this group */ num = ext4_bg_has_super(sb, block_group); if (!ext4_has_feature_meta_bg(sb) || block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * sbi->s_desc_per_block) { if (num) { num += ext4_bg_num_gdb_nometa(sb, block_group); num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); } } else { /* For META_BG_BLOCK_GROUPS */ num += ext4_bg_num_gdb_meta(sb, block_group); } return num; } static unsigned int ext4_num_base_meta_clusters(struct super_block *sb, ext4_group_t block_group) { return EXT4_NUM_B2C(EXT4_SB(sb), ext4_num_base_meta_blocks(sb, block_group)); } /** * ext4_inode_to_goal_block - return a hint for block allocation * @inode: inode for block allocation * * Return the ideal location to start allocating blocks for a * newly created inode. */ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); ext4_group_t block_group; ext4_grpblk_t colour; int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); ext4_fsblk_t bg_start; ext4_fsblk_t last_block; block_group = ei->i_block_group; if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { /* * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME * block groups per flexgroup, reserve the first block * group for directories and special files. Regular * files will start at the second block group. This * tends to speed up directory access and improves * fsck times. */ block_group &= ~(flex_size-1); if (S_ISREG(inode->i_mode)) block_group++; } bg_start = ext4_group_first_block_no(inode->i_sb, block_group); last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; /* * If we are doing delayed allocation, we don't need take * colour into account. */ if (test_opt(inode->i_sb, DELALLOC)) return bg_start; if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) colour = (task_pid_nr(current) % 16) * (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); else colour = (task_pid_nr(current) % 16) * ((last_block - bg_start) / 16); return bg_start + colour; } |
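As a quick illustration of the sparse_super placement rule implemented by test_root() and ext4_bg_has_super() above, the following stand-alone sketch (user-space C, not kernel code; it ignores the sparse_super2 and meta_bg cases) prints the groups that would carry a superblock backup:

#include <stdio.h>

/* Same loop as test_root() above: is 'a' a positive power of 'b'? */
static int test_root(unsigned long a, unsigned long b)
{
	while (1) {
		if (a < b)
			return 0;
		if (a == b)
			return 1;
		if (a % b)
			return 0;
		a /= b;
	}
}

/* Groups 0 and 1 always hold a backup; beyond that, only odd groups
 * that are powers of 3, 5 or 7 do. */
static int bg_has_super_sparse(unsigned long group)
{
	if (group <= 1)
		return 1;
	if (!(group & 1))
		return 0;
	return test_root(group, 3) || test_root(group, 5) ||
	       test_root(group, 7);
}

int main(void)
{
	for (unsigned long g = 0; g < 1024; g++)
		if (bg_has_super_sparse(g))
			printf("%lu ", g);	/* 0 1 3 5 7 9 25 27 49 81 ... */
	printf("\n");
	return 0;
}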
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BH_H #define _LINUX_BH_H #include <linux/instruction_pointer.h> #include <linux/preempt.h> #if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_TRACE_IRQFLAGS) extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt); #else static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) { preempt_count_add(cnt); barrier(); } #endif static inline void local_bh_disable(void) { __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET); } extern void _local_bh_enable(void); extern void __local_bh_enable_ip(unsigned long ip, unsigned int cnt); static inline void local_bh_enable_ip(unsigned long ip) { __local_bh_enable_ip(ip, SOFTIRQ_DISABLE_OFFSET); } static inline void local_bh_enable(void) { __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET); } #ifdef CONFIG_PREEMPT_RT extern bool local_bh_blocked(void); #else static inline bool local_bh_blocked(void) { return false; } #endif #endif /* _LINUX_BH_H */
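For context, a minimal usage sketch of the interface declared above (a hypothetical kernel-side caller, not part of this header): softirq processing on the local CPU is held off across the critical section and re-enabled, possibly running pending softirqs, on the way out.

/* Hypothetical example only: protect per-CPU state that is also
 * touched from softirq context on the local CPU. */
static void example_update_percpu_stats(void)
{
	local_bh_disable();	/* block local softirq handlers */
	/* ... modify data shared with a softirq handler here ... */
	local_bh_enable();	/* re-enable; may run pending softirqs */
}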
// SPDX-License-Identifier: GPL-2.0+ /* * Buffer/page management specific to NILFS * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * * Written by Ryusuke Konishi and Seiji Kihara.
*/ #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/swap.h> #include <linux/bitops.h> #include <linux/page-flags.h> #include <linux/list.h> #include <linux/highmem.h> #include <linux/folio_batch.h> #include <linux/gfp.h> #include "nilfs.h" #include "page.h" #include "mdt.h" #define NILFS_BUFFER_INHERENT_BITS \ (BIT(BH_Uptodate) | BIT(BH_Mapped) | BIT(BH_NILFS_Node) | \ BIT(BH_NILFS_Volatile) | BIT(BH_NILFS_Checked)) static struct buffer_head *__nilfs_get_folio_block(struct folio *folio, unsigned long block, pgoff_t index, int blkbits, unsigned long b_state) { unsigned long first_block; struct buffer_head *bh = folio_buffers(folio); if (!bh) bh = create_empty_buffers(folio, 1 << blkbits, b_state); first_block = (unsigned long)index << (PAGE_SHIFT - blkbits); bh = get_nth_bh(bh, block - first_block); wait_on_buffer(bh); return bh; } struct buffer_head *nilfs_grab_buffer(struct inode *inode, struct address_space *mapping, unsigned long blkoff, unsigned long b_state) { int blkbits = inode->i_blkbits; pgoff_t index = blkoff >> (PAGE_SHIFT - blkbits); struct folio *folio; struct buffer_head *bh; folio = filemap_grab_folio(mapping, index); if (IS_ERR(folio)) return NULL; bh = __nilfs_get_folio_block(folio, blkoff, index, blkbits, b_state); if (unlikely(!bh)) { folio_unlock(folio); folio_put(folio); return NULL; } bh->b_bdev = inode->i_sb->s_bdev; return bh; } /** * nilfs_forget_buffer - discard dirty state * @bh: buffer head of the buffer to be discarded */ void nilfs_forget_buffer(struct buffer_head *bh) { struct folio *folio = bh->b_folio; const unsigned long clear_bits = (BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) | BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) | BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected) | BIT(BH_Delay)); lock_buffer(bh); set_mask_bits(&bh->b_state, clear_bits, 0); if (nilfs_folio_buffers_clean(folio)) __nilfs_clear_folio_dirty(folio); bh->b_blocknr = -1; folio_clear_uptodate(folio); folio_clear_mappedtodisk(folio); unlock_buffer(bh); brelse(bh); } /** * nilfs_copy_buffer -- copy buffer data and flags * @dbh: destination buffer * @sbh: source buffer */ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh) { void *saddr, *daddr; unsigned long bits; struct folio *sfolio = sbh->b_folio, *dfolio = dbh->b_folio; struct buffer_head *bh; saddr = kmap_local_folio(sfolio, bh_offset(sbh)); daddr = kmap_local_folio(dfolio, bh_offset(dbh)); memcpy(daddr, saddr, sbh->b_size); kunmap_local(daddr); kunmap_local(saddr); dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS; dbh->b_blocknr = sbh->b_blocknr; dbh->b_bdev = sbh->b_bdev; bh = dbh; bits = sbh->b_state & (BIT(BH_Uptodate) | BIT(BH_Mapped)); while ((bh = bh->b_this_page) != dbh) { lock_buffer(bh); bits &= bh->b_state; unlock_buffer(bh); } if (bits & BIT(BH_Uptodate)) folio_mark_uptodate(dfolio); else folio_clear_uptodate(dfolio); if (bits & BIT(BH_Mapped)) folio_set_mappedtodisk(dfolio); else folio_clear_mappedtodisk(dfolio); } /** * nilfs_folio_buffers_clean - Check if a folio has dirty buffers or not. * @folio: Folio to be checked. * * Return: false if the folio has dirty buffers, true otherwise. 
*/ bool nilfs_folio_buffers_clean(struct folio *folio) { struct buffer_head *bh, *head; bh = head = folio_buffers(folio); do { if (buffer_dirty(bh)) return false; bh = bh->b_this_page; } while (bh != head); return true; } void nilfs_folio_bug(struct folio *folio) { struct buffer_head *bh, *head; struct address_space *m; unsigned long ino; if (unlikely(!folio)) { printk(KERN_CRIT "NILFS_FOLIO_BUG(NULL)\n"); return; } m = folio->mapping; ino = m ? m->host->i_ino : 0; printk(KERN_CRIT "NILFS_FOLIO_BUG(%p): cnt=%d index#=%llu flags=0x%lx " "mapping=%p ino=%lu\n", folio, folio_ref_count(folio), (unsigned long long)folio->index, folio->flags.f, m, ino); head = folio_buffers(folio); if (head) { int i = 0; bh = head; do { printk(KERN_CRIT " BH[%d] %p: cnt=%d block#=%llu state=0x%lx\n", i++, bh, atomic_read(&bh->b_count), (unsigned long long)bh->b_blocknr, bh->b_state); bh = bh->b_this_page; } while (bh != head); } } /** * nilfs_copy_folio -- copy the folio with buffers * @dst: destination folio * @src: source folio * @copy_dirty: flag whether to copy dirty states on the folio's buffer heads. * * This function is for both data folios and btnode folios. The dirty flag * should be treated by caller. The folio must not be under i/o. * Both src and dst folio must be locked */ static void nilfs_copy_folio(struct folio *dst, struct folio *src, bool copy_dirty) { struct buffer_head *dbh, *dbufs, *sbh; unsigned long mask = NILFS_BUFFER_INHERENT_BITS; BUG_ON(folio_test_writeback(dst)); sbh = folio_buffers(src); dbh = folio_buffers(dst); if (!dbh) dbh = create_empty_buffers(dst, sbh->b_size, 0); if (copy_dirty) mask |= BIT(BH_Dirty); dbufs = dbh; do { lock_buffer(sbh); lock_buffer(dbh); dbh->b_state = sbh->b_state & mask; dbh->b_blocknr = sbh->b_blocknr; dbh->b_bdev = sbh->b_bdev; sbh = sbh->b_this_page; dbh = dbh->b_this_page; } while (dbh != dbufs); folio_copy(dst, src); if (folio_test_uptodate(src) && !folio_test_uptodate(dst)) folio_mark_uptodate(dst); else if (!folio_test_uptodate(src) && folio_test_uptodate(dst)) folio_clear_uptodate(dst); if (folio_test_mappedtodisk(src) && !folio_test_mappedtodisk(dst)) folio_set_mappedtodisk(dst); else if (!folio_test_mappedtodisk(src) && folio_test_mappedtodisk(dst)) folio_clear_mappedtodisk(dst); do { unlock_buffer(sbh); unlock_buffer(dbh); sbh = sbh->b_this_page; dbh = dbh->b_this_page; } while (dbh != dbufs); } int nilfs_copy_dirty_pages(struct address_space *dmap, struct address_space *smap) { struct folio_batch fbatch; unsigned int i; pgoff_t index = 0; int err = 0; folio_batch_init(&fbatch); repeat: if (!filemap_get_folios_tag(smap, &index, (pgoff_t)-1, PAGECACHE_TAG_DIRTY, &fbatch)) return 0; for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i], *dfolio; folio_lock(folio); if (unlikely(!folio_test_dirty(folio))) NILFS_FOLIO_BUG(folio, "inconsistent dirty state"); dfolio = filemap_grab_folio(dmap, folio->index); if (IS_ERR(dfolio)) { /* No empty page is added to the page cache */ folio_unlock(folio); err = PTR_ERR(dfolio); break; } if (unlikely(!folio_buffers(folio))) NILFS_FOLIO_BUG(folio, "found empty page in dat page cache"); nilfs_copy_folio(dfolio, folio, true); filemap_dirty_folio(folio_mapping(dfolio), dfolio); folio_unlock(dfolio); folio_put(dfolio); folio_unlock(folio); } folio_batch_release(&fbatch); cond_resched(); if (likely(!err)) goto repeat; return err; } /** * nilfs_copy_back_pages -- copy back pages to original cache from shadow cache * @dmap: destination page cache * @smap: source page cache * * No pages 
must be added to the cache during this process. * This must be ensured by the caller. */ void nilfs_copy_back_pages(struct address_space *dmap, struct address_space *smap) { struct folio_batch fbatch; unsigned int i, n; pgoff_t start = 0; folio_batch_init(&fbatch); repeat: n = filemap_get_folios(smap, &start, ~0UL, &fbatch); if (!n) return; for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i], *dfolio; pgoff_t index = folio->index; folio_lock(folio); dfolio = filemap_lock_folio(dmap, index); if (!IS_ERR(dfolio)) { /* overwrite existing folio in the destination cache */ WARN_ON(folio_test_dirty(dfolio)); nilfs_copy_folio(dfolio, folio, false); folio_unlock(dfolio); folio_put(dfolio); /* Do we not need to remove folio from smap here? */ } else { struct folio *f; /* move the folio to the destination cache */ xa_lock_irq(&smap->i_pages); f = __xa_erase(&smap->i_pages, index); WARN_ON(folio != f); smap->nrpages--; xa_unlock_irq(&smap->i_pages); xa_lock_irq(&dmap->i_pages); f = __xa_store(&dmap->i_pages, index, folio, GFP_NOFS); if (unlikely(f)) { /* Probably -ENOMEM */ folio->mapping = NULL; folio_put(folio); } else { folio->mapping = dmap; dmap->nrpages++; if (folio_test_dirty(folio)) __xa_set_mark(&dmap->i_pages, index, PAGECACHE_TAG_DIRTY); } xa_unlock_irq(&dmap->i_pages); } folio_unlock(folio); } folio_batch_release(&fbatch); cond_resched(); goto repeat; } /** * nilfs_clear_dirty_pages - discard dirty pages in address space * @mapping: address space with dirty pages for discarding */ void nilfs_clear_dirty_pages(struct address_space *mapping) { struct folio_batch fbatch; unsigned int i; pgoff_t index = 0; folio_batch_init(&fbatch); while (filemap_get_folios_tag(mapping, &index, (pgoff_t)-1, PAGECACHE_TAG_DIRTY, &fbatch)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; folio_lock(folio); /* * This folio may have been removed from the address * space by truncation or invalidation when the lock * was acquired. Skip processing in that case. */ if (likely(folio->mapping == mapping)) nilfs_clear_folio_dirty(folio); folio_unlock(folio); } folio_batch_release(&fbatch); cond_resched(); } } /** * nilfs_clear_folio_dirty - discard dirty folio * @folio: dirty folio that will be discarded * * nilfs_clear_folio_dirty() clears working states including dirty state for * the folio and its buffers. If the folio has buffers, clear only if it is * confirmed that none of the buffer heads are busy (none have valid * references and none are locked). 
*/ void nilfs_clear_folio_dirty(struct folio *folio) { struct buffer_head *bh, *head; BUG_ON(!folio_test_locked(folio)); head = folio_buffers(folio); if (head) { const unsigned long clear_bits = (BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) | BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) | BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected) | BIT(BH_Delay)); bool busy, invalidated = false; recheck_buffers: busy = false; bh = head; do { if (atomic_read(&bh->b_count) | buffer_locked(bh)) { busy = true; break; } } while (bh = bh->b_this_page, bh != head); if (busy) { if (invalidated) return; invalidate_bh_lrus(); invalidated = true; goto recheck_buffers; } bh = head; do { lock_buffer(bh); set_mask_bits(&bh->b_state, clear_bits, 0); unlock_buffer(bh); } while (bh = bh->b_this_page, bh != head); } folio_clear_uptodate(folio); folio_clear_mappedtodisk(folio); folio_clear_checked(folio); __nilfs_clear_folio_dirty(folio); } unsigned int nilfs_page_count_clean_buffers(struct folio *folio, unsigned int from, unsigned int to) { unsigned int block_start, block_end; struct buffer_head *bh, *head; unsigned int nc = 0; for (bh = head = folio_buffers(folio), block_start = 0; bh != head || !block_start; block_start = block_end, bh = bh->b_this_page) { block_end = block_start + bh->b_size; if (block_end > from && block_start < to && !buffer_dirty(bh)) nc++; } return nc; } /* * NILFS2 needs clear_page_dirty() in the following two cases: * * 1) For B-tree node pages and data pages of DAT file, NILFS2 clears dirty * flag of pages when it copies back pages from shadow cache to the * original cache. * * 2) Some B-tree operations like insertion or deletion may dispose buffers * in dirty state, and this needs to cancel the dirty state of their pages. */ void __nilfs_clear_folio_dirty(struct folio *folio) { struct address_space *mapping = folio->mapping; if (mapping) { xa_lock_irq(&mapping->i_pages); if (folio_test_dirty(folio)) { __xa_clear_mark(&mapping->i_pages, folio->index, PAGECACHE_TAG_DIRTY); xa_unlock_irq(&mapping->i_pages); folio_clear_dirty_for_io(folio); return; } xa_unlock_irq(&mapping->i_pages); return; } folio_clear_dirty(folio); } /** * nilfs_find_uncommitted_extent - find extent of uncommitted data * @inode: inode * @start_blk: start block offset (in) * @blkoff: start offset of the found extent (out) * * This function searches an extent of buffers marked "delayed" which * starts from a block offset equal to or larger than @start_blk. If * such an extent was found, this will store the start offset in * @blkoff and return its length in blocks. * * Return: Length in blocks of found extent, 0 otherwise. 
*/ unsigned long nilfs_find_uncommitted_extent(struct inode *inode, sector_t start_blk, sector_t *blkoff) { unsigned int i, nr_folios; pgoff_t index; unsigned long length = 0; struct folio_batch fbatch; struct folio *folio; if (inode->i_mapping->nrpages == 0) return 0; index = start_blk >> (PAGE_SHIFT - inode->i_blkbits); folio_batch_init(&fbatch); repeat: nr_folios = filemap_get_folios_contig(inode->i_mapping, &index, ULONG_MAX, &fbatch); if (nr_folios == 0) return length; i = 0; do { folio = fbatch.folios[i]; folio_lock(folio); if (folio_buffers(folio)) { struct buffer_head *bh, *head; sector_t b; b = folio->index << (PAGE_SHIFT - inode->i_blkbits); bh = head = folio_buffers(folio); do { if (b < start_blk) continue; if (buffer_delay(bh)) { if (length == 0) *blkoff = b; length++; } else if (length > 0) { goto out_locked; } } while (++b, bh = bh->b_this_page, bh != head); } else { if (length > 0) goto out_locked; } folio_unlock(folio); } while (++i < nr_folios); folio_batch_release(&fbatch); cond_resched(); goto repeat; out_locked: folio_unlock(folio); folio_batch_release(&fbatch); return length; } |
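/*
 * Editor's illustrative sketch, not part of NILFS: the scan performed by
 * nilfs_find_uncommitted_extent() above boils down to "find the first run of
 * blocks flagged as delayed at or after start_blk, report where it starts and
 * how long it is".  A user-space model of that loop, with a plain bool array
 * standing in for the per-buffer BH_Delay bits (all names are made up):
 */
#include <stdbool.h>
#include <stdio.h>

static size_t example_find_delayed_run(const bool *delayed, size_t nblocks,
				       size_t start_blk, size_t *blkoff)
{
	size_t b, length = 0;

	for (b = start_blk; b < nblocks; b++) {
		if (delayed[b]) {
			if (length == 0)
				*blkoff = b;	/* the run starts here */
			length++;
		} else if (length > 0) {
			break;			/* first run has ended */
		}
	}
	return length;
}

int main(void)
{
	const bool delayed[8] = { false, false, true, true, true, false, true, false };
	size_t blkoff = 0;
	size_t len = example_find_delayed_run(delayed, 8, 1, &blkoff);

	printf("extent at %zu, %zu blocks\n", blkoff, len);	/* extent at 2, 3 blocks */
	return 0;
}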
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PATH_H
#define _LINUX_PATH_H

struct dentry;
struct vfsmount;

struct path {
	struct vfsmount *mnt;
	struct dentry *dentry;
} __randomize_layout;

extern void path_get(const struct path *);
extern void path_put(const struct path *);

static inline int path_equal(const struct path *path1, const struct path *path2)
{
	return path1->mnt == path2->mnt && path1->dentry == path2->dentry;
}

/*
 * Cleanup macro for use with __free(path_put). Avoids dereference and
 * copying @path unlike DEFINE_FREE(). path_put() will handle the empty
 * path correctly just ensure @path is initialized:
 *
 * struct path path __free(path_put) = {};
 */
#define __free_path_put path_put

#endif  /* _LINUX_PATH_H */
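/*
 * Editor's usage sketch, not part of this header: path_equal() only makes
 * sense while both struct path instances hold references, e.g. obtained via
 * kern_path() and released with path_put().  The target name below is
 * illustrative only.
 *
 *	struct path a, b;
 *
 *	if (kern_path("/tmp", LOOKUP_FOLLOW, &a))
 *		return;
 *	if (!kern_path("/tmp", LOOKUP_FOLLOW, &b)) {
 *		if (path_equal(&a, &b))
 *			pr_info("same mount and dentry\n");
 *		path_put(&b);
 *	}
 *	path_put(&a);
 */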
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/ext4/truncate.h
 *
 * Common inline functions needed for truncate support
 */

/*
 * Truncate blocks that were not used by write. We have to truncate the
 * pagecache as well so that corresponding buffers get properly unmapped.
 */
static inline void ext4_truncate_failed_write(struct inode *inode)
{
	struct address_space *mapping = inode->i_mapping;

	/*
	 * We don't need to call ext4_break_layouts() because the blocks we
	 * are truncating were never visible to userspace.
	 */
	filemap_invalidate_lock(mapping);
	truncate_inode_pages(mapping, inode->i_size);
	ext4_truncate(inode);
	filemap_invalidate_unlock(mapping);
}

/*
 * Work out how many blocks we need to proceed with the next chunk of a
 * truncate transaction.
 */
static inline unsigned long ext4_blocks_for_truncate(struct inode *inode)
{
	ext4_lblk_t needed;

	needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);

	/* Give ourselves just enough room to cope with inodes in which
	 * i_blocks is corrupt: we've seen disk corruptions in the past
	 * which resulted in random data in an inode which looked enough
	 * like a regular file for ext4 to try to delete it. Things
	 * will go a bit crazy if that happens, but at least we should
	 * try not to panic the whole kernel. */
	if (needed < 2)
		needed = 2;
	/* But we need to bound the transaction so we don't overflow the
	 * journal. */
	if (needed > EXT4_MAX_TRANS_DATA)
		needed = EXT4_MAX_TRANS_DATA;

	return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
}
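/*
 * Editor's illustrative sketch, not part of ext4: the sizing logic of
 * ext4_blocks_for_truncate() above, modelled in user space.  i_blocks is kept
 * in 512-byte sectors, so shifting by (blocksize_bits - 9) converts it to
 * filesystem blocks; the result is clamped to [2, max] and a fixed
 * per-transaction overhead is added.  The two constants below are made-up
 * stand-ins for EXT4_MAX_TRANS_DATA and EXT4_DATA_TRANS_BLOCKS().
 */
#include <stdio.h>

#define EXAMPLE_MAX_TRANS_DATA		64
#define EXAMPLE_DATA_TRANS_BLOCKS	44

static unsigned long example_blocks_for_truncate(unsigned long i_blocks,
						 unsigned int blocksize_bits)
{
	unsigned long needed = i_blocks >> (blocksize_bits - 9);

	if (needed < 2)
		needed = 2;
	if (needed > EXAMPLE_MAX_TRANS_DATA)
		needed = EXAMPLE_MAX_TRANS_DATA;
	return EXAMPLE_DATA_TRANS_BLOCKS + needed;
}

int main(void)
{
	/* a 40 KiB file on a 4 KiB-block fs: 80 sectors -> 10 blocks -> 54 */
	printf("%lu\n", example_blocks_for_truncate(80, 12));
	return 0;
}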
// SPDX-License-Identifier: GPL-2.0
/* sysfs entries for device PM */
#include <linux/device.h>
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/pm_qos.h>
#include <linux/pm_runtime.h>
#include <linux/atomic.h>
#include <linux/jiffies.h>
#include
"power.h" /* * control - Report/change current runtime PM setting of the device * * Runtime power management of a device can be blocked with the help of * this attribute. All devices have one of the following two values for * the power/control file: * * + "auto\n" to allow the device to be power managed at run time; * + "on\n" to prevent the device from being power managed at run time; * * The default for all devices is "auto", which means that devices may be * subject to automatic power management, depending on their drivers. * Changing this attribute to "on" prevents the driver from power managing * the device at run time. Doing that while the device is suspended causes * it to be woken up. * * wakeup - Report/change current wakeup option for device * * Some devices support "wakeup" events, which are hardware signals * used to activate devices from suspended or low power states. Such * devices have one of three values for the sysfs power/wakeup file: * * + "enabled\n" to issue the events; * + "disabled\n" not to do so; or * + "\n" for temporary or permanent inability to issue wakeup. * * (For example, unconfigured USB devices can't issue wakeups.) * * Familiar examples of devices that can issue wakeup events include * keyboards and mice (both PS2 and USB styles), power buttons, modems, * "Wake-On-LAN" Ethernet links, GPIO lines, and more. Some events * will wake the entire system from a suspend state; others may just * wake up the device (if the system as a whole is already active). * Some wakeup events use normal IRQ lines; other use special out * of band signaling. * * It is the responsibility of device drivers to enable (or disable) * wakeup signaling as part of changing device power states, respecting * the policy choices provided through the driver model. * * Devices may not be able to generate wakeup events from all power * states. Also, the events may be ignored in some configurations; * for example, they might need help from other devices that aren't * active, or which may have wakeup disabled. Some drivers rely on * wakeup events internally (unless they are disabled), keeping * their hardware in low power modes whenever they're unused. This * saves runtime power, without requiring system-wide sleep states. * * async - Report/change current async suspend setting for the device * * Asynchronous suspend and resume of the device during system-wide power * state transitions can be enabled by writing "enabled" to this file. * Analogously, if "disabled" is written to this file, the device will be * suspended and resumed synchronously. * * All devices have one of the following two values for power/async: * * + "enabled\n" to permit the asynchronous suspend/resume of the device; * + "disabled\n" to forbid it; * * NOTE: It generally is unsafe to permit the asynchronous suspend/resume * of a device unless it is certain that all of the PM dependencies of the * device are known to the PM core. However, for some devices this * attribute is set to "enabled" by bus type code or device drivers and in * that cases it should be safe to leave the default value. * * autosuspend_delay_ms - Report/change a device's autosuspend_delay value * * Some drivers don't want to carry out a runtime suspend as soon as a * device becomes idle; they want it always to remain idle for some period * of time before suspending it. This period is the autosuspend_delay * value (expressed in milliseconds) and it can be controlled by the user. * If the value is negative then the device will never be runtime * suspended. 
* * NOTE: The autosuspend_delay_ms attribute and the autosuspend_delay * value are used only if the driver calls pm_runtime_use_autosuspend(). * * wakeup_count - Report the number of wakeup events related to the device */ const char power_group_name[] = "power"; EXPORT_SYMBOL_GPL(power_group_name); static const char ctrl_auto[] = "auto"; static const char ctrl_on[] = "on"; static ssize_t control_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", dev->power.runtime_auto ? ctrl_auto : ctrl_on); } static ssize_t control_store(struct device * dev, struct device_attribute *attr, const char * buf, size_t n) { device_lock(dev); if (sysfs_streq(buf, ctrl_auto)) pm_runtime_allow(dev); else if (sysfs_streq(buf, ctrl_on)) pm_runtime_forbid(dev); else n = -EINVAL; device_unlock(dev); return n; } static DEVICE_ATTR_RW(control); static ssize_t runtime_active_time_show(struct device *dev, struct device_attribute *attr, char *buf) { u64 tmp = pm_runtime_active_time(dev); do_div(tmp, NSEC_PER_MSEC); return sysfs_emit(buf, "%llu\n", tmp); } static DEVICE_ATTR_RO(runtime_active_time); static ssize_t runtime_suspended_time_show(struct device *dev, struct device_attribute *attr, char *buf) { u64 tmp = pm_runtime_suspended_time(dev); do_div(tmp, NSEC_PER_MSEC); return sysfs_emit(buf, "%llu\n", tmp); } static DEVICE_ATTR_RO(runtime_suspended_time); static ssize_t runtime_status_show(struct device *dev, struct device_attribute *attr, char *buf) { const char *output; if (dev->power.runtime_error) { output = "error"; } else if (dev->power.disable_depth) { output = "unsupported"; } else { switch (dev->power.runtime_status) { case RPM_SUSPENDED: output = "suspended"; break; case RPM_SUSPENDING: output = "suspending"; break; case RPM_RESUMING: output = "resuming"; break; case RPM_ACTIVE: output = "active"; break; default: return -EIO; } } return sysfs_emit(buf, "%s\n", output); } static DEVICE_ATTR_RO(runtime_status); static ssize_t autosuspend_delay_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { if (!dev->power.use_autosuspend) return -EIO; return sysfs_emit(buf, "%d\n", dev->power.autosuspend_delay); } static ssize_t autosuspend_delay_ms_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t n) { long delay; if (!dev->power.use_autosuspend) return -EIO; if (kstrtol(buf, 10, &delay) != 0 || delay != (int) delay) return -EINVAL; device_lock(dev); pm_runtime_set_autosuspend_delay(dev, delay); device_unlock(dev); return n; } static DEVICE_ATTR_RW(autosuspend_delay_ms); static ssize_t pm_qos_resume_latency_us_show(struct device *dev, struct device_attribute *attr, char *buf) { s32 value = dev_pm_qos_requested_resume_latency(dev); if (value == 0) return sysfs_emit(buf, "n/a\n"); if (value == PM_QOS_RESUME_LATENCY_NO_CONSTRAINT) value = 0; return sysfs_emit(buf, "%d\n", value); } static ssize_t pm_qos_resume_latency_us_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t n) { s32 value; int ret; if (!kstrtos32(buf, 0, &value)) { /* * Prevent users from writing negative or "no constraint" values * directly. */ if (value < 0 || value == PM_QOS_RESUME_LATENCY_NO_CONSTRAINT) return -EINVAL; if (value == 0) value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; } else if (sysfs_streq(buf, "n/a")) { value = 0; } else { return -EINVAL; } ret = dev_pm_qos_update_request(dev->power.qos->resume_latency_req, value); return ret < 0 ? 
ret : n; } static DEVICE_ATTR_RW(pm_qos_resume_latency_us); static ssize_t pm_qos_latency_tolerance_us_show(struct device *dev, struct device_attribute *attr, char *buf) { s32 value = dev_pm_qos_get_user_latency_tolerance(dev); if (value < 0) return sysfs_emit(buf, "%s\n", "auto"); if (value == PM_QOS_LATENCY_ANY) return sysfs_emit(buf, "%s\n", "any"); return sysfs_emit(buf, "%d\n", value); } static ssize_t pm_qos_latency_tolerance_us_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t n) { s32 value; int ret; if (kstrtos32(buf, 0, &value) == 0) { /* Users can't write negative values directly */ if (value < 0) return -EINVAL; } else { if (sysfs_streq(buf, "auto")) value = PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT; else if (sysfs_streq(buf, "any")) value = PM_QOS_LATENCY_ANY; else return -EINVAL; } ret = dev_pm_qos_update_user_latency_tolerance(dev, value); return ret < 0 ? ret : n; } static DEVICE_ATTR_RW(pm_qos_latency_tolerance_us); static ssize_t pm_qos_no_power_off_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", !!(dev_pm_qos_requested_flags(dev) & PM_QOS_FLAG_NO_POWER_OFF)); } static ssize_t pm_qos_no_power_off_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t n) { int ret; if (kstrtoint(buf, 0, &ret)) return -EINVAL; if (ret != 0 && ret != 1) return -EINVAL; ret = dev_pm_qos_update_flags(dev, PM_QOS_FLAG_NO_POWER_OFF, ret); return ret < 0 ? ret : n; } static DEVICE_ATTR_RW(pm_qos_no_power_off); #ifdef CONFIG_PM_SLEEP static const char _enabled[] = "enabled"; static const char _disabled[] = "disabled"; static ssize_t wakeup_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", device_can_wakeup(dev) ? (device_may_wakeup(dev) ? 
_enabled : _disabled) : ""); } static ssize_t wakeup_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t n) { if (!device_can_wakeup(dev)) return -EINVAL; if (sysfs_streq(buf, _enabled)) device_set_wakeup_enable(dev, 1); else if (sysfs_streq(buf, _disabled)) device_set_wakeup_enable(dev, 0); else return -EINVAL; return n; } static DEVICE_ATTR_RW(wakeup); static ssize_t wakeup_count_show(struct device *dev, struct device_attribute *attr, char *buf) { unsigned long count; bool enabled = false; spin_lock_irq(&dev->power.lock); if (dev->power.wakeup) { count = dev->power.wakeup->wakeup_count; enabled = true; } spin_unlock_irq(&dev->power.lock); if (!enabled) return sysfs_emit(buf, "\n"); return sysfs_emit(buf, "%lu\n", count); } static DEVICE_ATTR_RO(wakeup_count); static ssize_t wakeup_active_count_show(struct device *dev, struct device_attribute *attr, char *buf) { unsigned long count; bool enabled = false; spin_lock_irq(&dev->power.lock); if (dev->power.wakeup) { count = dev->power.wakeup->active_count; enabled = true; } spin_unlock_irq(&dev->power.lock); if (!enabled) return sysfs_emit(buf, "\n"); return sysfs_emit(buf, "%lu\n", count); } static DEVICE_ATTR_RO(wakeup_active_count); static ssize_t wakeup_abort_count_show(struct device *dev, struct device_attribute *attr, char *buf) { unsigned long count; bool enabled = false; spin_lock_irq(&dev->power.lock); if (dev->power.wakeup) { count = dev->power.wakeup->wakeup_count; enabled = true; } spin_unlock_irq(&dev->power.lock); if (!enabled) return sysfs_emit(buf, "\n"); return sysfs_emit(buf, "%lu\n", count); } static DEVICE_ATTR_RO(wakeup_abort_count); static ssize_t wakeup_expire_count_show(struct device *dev, struct device_attribute *attr, char *buf) { unsigned long count; bool enabled = false; spin_lock_irq(&dev->power.lock); if (dev->power.wakeup) { count = dev->power.wakeup->expire_count; enabled = true; } spin_unlock_irq(&dev->power.lock); if (!enabled) return sysfs_emit(buf, "\n"); return sysfs_emit(buf, "%lu\n", count); } static DEVICE_ATTR_RO(wakeup_expire_count); static ssize_t wakeup_active_show(struct device *dev, struct device_attribute *attr, char *buf) { unsigned int active; bool enabled = false; spin_lock_irq(&dev->power.lock); if (dev->power.wakeup) { active = dev->power.wakeup->active; enabled = true; } spin_unlock_irq(&dev->power.lock); if (!enabled) return sysfs_emit(buf, "\n"); return sysfs_emit(buf, "%u\n", active); } static DEVICE_ATTR_RO(wakeup_active); static ssize_t wakeup_total_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { s64 msec; bool enabled = false; spin_lock_irq(&dev->power.lock); if (dev->power.wakeup) { msec = ktime_to_ms(dev->power.wakeup->total_time); enabled = true; } spin_unlock_irq(&dev->power.lock); if (!enabled) return sysfs_emit(buf, "\n"); return sysfs_emit(buf, "%lld\n", msec); } static DEVICE_ATTR_RO(wakeup_total_time_ms); static ssize_t wakeup_max_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { s64 msec; bool enabled = false; spin_lock_irq(&dev->power.lock); if (dev->power.wakeup) { msec = ktime_to_ms(dev->power.wakeup->max_time); enabled = true; } spin_unlock_irq(&dev->power.lock); if (!enabled) return sysfs_emit(buf, "\n"); return sysfs_emit(buf, "%lld\n", msec); } static DEVICE_ATTR_RO(wakeup_max_time_ms); static ssize_t wakeup_last_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { s64 msec; bool enabled = false; spin_lock_irq(&dev->power.lock); if (dev->power.wakeup) { msec = 
ktime_to_ms(dev->power.wakeup->last_time); enabled = true; } spin_unlock_irq(&dev->power.lock); if (!enabled) return sysfs_emit(buf, "\n"); return sysfs_emit(buf, "%lld\n", msec); } static DEVICE_ATTR_RO(wakeup_last_time_ms); #ifdef CONFIG_PM_AUTOSLEEP static ssize_t wakeup_prevent_sleep_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { s64 msec; bool enabled = false; spin_lock_irq(&dev->power.lock); if (dev->power.wakeup) { msec = ktime_to_ms(dev->power.wakeup->prevent_sleep_time); enabled = true; } spin_unlock_irq(&dev->power.lock); if (!enabled) return sysfs_emit(buf, "\n"); return sysfs_emit(buf, "%lld\n", msec); } static DEVICE_ATTR_RO(wakeup_prevent_sleep_time_ms); #endif /* CONFIG_PM_AUTOSLEEP */ static inline int dpm_sysfs_wakeup_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid) { if (dev->power.wakeup && dev->power.wakeup->dev) return device_change_owner(dev->power.wakeup->dev, kuid, kgid); return 0; } #else /* CONFIG_PM_SLEEP */ static inline int dpm_sysfs_wakeup_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid) { return 0; } #endif #ifdef CONFIG_PM_ADVANCED_DEBUG static ssize_t runtime_usage_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", atomic_read(&dev->power.usage_count)); } static DEVICE_ATTR_RO(runtime_usage); static ssize_t runtime_active_kids_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", dev->power.ignore_children ? 0 : atomic_read(&dev->power.child_count)); } static DEVICE_ATTR_RO(runtime_active_kids); static ssize_t runtime_enabled_show(struct device *dev, struct device_attribute *attr, char *buf) { const char *output; if (dev->power.disable_depth && !dev->power.runtime_auto) output = "disabled & forbidden"; else if (dev->power.disable_depth) output = "disabled"; else if (!dev->power.runtime_auto) output = "forbidden"; else output = "enabled"; return sysfs_emit(buf, "%s\n", output); } static DEVICE_ATTR_RO(runtime_enabled); #ifdef CONFIG_PM_SLEEP static ssize_t async_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", device_async_suspend_enabled(dev) ? 
_enabled : _disabled); } static ssize_t async_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t n) { if (sysfs_streq(buf, _enabled)) device_enable_async_suspend(dev); else if (sysfs_streq(buf, _disabled)) device_disable_async_suspend(dev); else return -EINVAL; return n; } static DEVICE_ATTR_RW(async); #endif /* CONFIG_PM_SLEEP */ #endif /* CONFIG_PM_ADVANCED_DEBUG */ static struct attribute *power_attrs[] = { #if defined(CONFIG_PM_ADVANCED_DEBUG) && defined(CONFIG_PM_SLEEP) &dev_attr_async.attr, #endif NULL, }; static const struct attribute_group pm_attr_group = { .name = power_group_name, .attrs = power_attrs, }; static struct attribute *wakeup_attrs[] = { #ifdef CONFIG_PM_SLEEP &dev_attr_wakeup.attr, &dev_attr_wakeup_count.attr, &dev_attr_wakeup_active_count.attr, &dev_attr_wakeup_abort_count.attr, &dev_attr_wakeup_expire_count.attr, &dev_attr_wakeup_active.attr, &dev_attr_wakeup_total_time_ms.attr, &dev_attr_wakeup_max_time_ms.attr, &dev_attr_wakeup_last_time_ms.attr, #ifdef CONFIG_PM_AUTOSLEEP &dev_attr_wakeup_prevent_sleep_time_ms.attr, #endif #endif NULL, }; static const struct attribute_group pm_wakeup_attr_group = { .name = power_group_name, .attrs = wakeup_attrs, }; static struct attribute *runtime_attrs[] = { &dev_attr_runtime_status.attr, &dev_attr_control.attr, &dev_attr_runtime_suspended_time.attr, &dev_attr_runtime_active_time.attr, &dev_attr_autosuspend_delay_ms.attr, #ifdef CONFIG_PM_ADVANCED_DEBUG &dev_attr_runtime_usage.attr, &dev_attr_runtime_active_kids.attr, &dev_attr_runtime_enabled.attr, #endif NULL, }; static const struct attribute_group pm_runtime_attr_group = { .name = power_group_name, .attrs = runtime_attrs, }; static struct attribute *pm_qos_resume_latency_attrs[] = { &dev_attr_pm_qos_resume_latency_us.attr, NULL, }; static const struct attribute_group pm_qos_resume_latency_attr_group = { .name = power_group_name, .attrs = pm_qos_resume_latency_attrs, }; static struct attribute *pm_qos_latency_tolerance_attrs[] = { &dev_attr_pm_qos_latency_tolerance_us.attr, NULL, }; static const struct attribute_group pm_qos_latency_tolerance_attr_group = { .name = power_group_name, .attrs = pm_qos_latency_tolerance_attrs, }; static struct attribute *pm_qos_flags_attrs[] = { &dev_attr_pm_qos_no_power_off.attr, NULL, }; static const struct attribute_group pm_qos_flags_attr_group = { .name = power_group_name, .attrs = pm_qos_flags_attrs, }; int dpm_sysfs_add(struct device *dev) { int rc; /* No need to create PM sysfs if explicitly disabled. 
*/ if (device_pm_not_required(dev)) return 0; rc = sysfs_create_group(&dev->kobj, &pm_attr_group); if (rc) return rc; if (!pm_runtime_has_no_callbacks(dev)) { rc = sysfs_merge_group(&dev->kobj, &pm_runtime_attr_group); if (rc) goto err_out; } if (device_can_wakeup(dev)) { rc = sysfs_merge_group(&dev->kobj, &pm_wakeup_attr_group); if (rc) goto err_runtime; } if (dev->power.set_latency_tolerance) { rc = sysfs_merge_group(&dev->kobj, &pm_qos_latency_tolerance_attr_group); if (rc) goto err_wakeup; } rc = pm_wakeup_source_sysfs_add(dev); if (rc) goto err_latency; return 0; err_latency: sysfs_unmerge_group(&dev->kobj, &pm_qos_latency_tolerance_attr_group); err_wakeup: sysfs_unmerge_group(&dev->kobj, &pm_wakeup_attr_group); err_runtime: sysfs_unmerge_group(&dev->kobj, &pm_runtime_attr_group); err_out: sysfs_remove_group(&dev->kobj, &pm_attr_group); return rc; } int dpm_sysfs_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid) { int rc; if (device_pm_not_required(dev)) return 0; rc = sysfs_group_change_owner(&dev->kobj, &pm_attr_group, kuid, kgid); if (rc) return rc; if (!pm_runtime_has_no_callbacks(dev)) { rc = sysfs_group_change_owner( &dev->kobj, &pm_runtime_attr_group, kuid, kgid); if (rc) return rc; } if (device_can_wakeup(dev)) { rc = sysfs_group_change_owner(&dev->kobj, &pm_wakeup_attr_group, kuid, kgid); if (rc) return rc; rc = dpm_sysfs_wakeup_change_owner(dev, kuid, kgid); if (rc) return rc; } if (dev->power.set_latency_tolerance) { rc = sysfs_group_change_owner( &dev->kobj, &pm_qos_latency_tolerance_attr_group, kuid, kgid); if (rc) return rc; } return 0; } int wakeup_sysfs_add(struct device *dev) { int ret = sysfs_merge_group(&dev->kobj, &pm_wakeup_attr_group); if (!ret) kobject_uevent(&dev->kobj, KOBJ_CHANGE); return ret; } void wakeup_sysfs_remove(struct device *dev) { sysfs_unmerge_group(&dev->kobj, &pm_wakeup_attr_group); kobject_uevent(&dev->kobj, KOBJ_CHANGE); } int pm_qos_sysfs_add_resume_latency(struct device *dev) { return sysfs_merge_group(&dev->kobj, &pm_qos_resume_latency_attr_group); } void pm_qos_sysfs_remove_resume_latency(struct device *dev) { sysfs_unmerge_group(&dev->kobj, &pm_qos_resume_latency_attr_group); } int pm_qos_sysfs_add_flags(struct device *dev) { return sysfs_merge_group(&dev->kobj, &pm_qos_flags_attr_group); } void pm_qos_sysfs_remove_flags(struct device *dev) { sysfs_unmerge_group(&dev->kobj, &pm_qos_flags_attr_group); } int pm_qos_sysfs_add_latency_tolerance(struct device *dev) { return sysfs_merge_group(&dev->kobj, &pm_qos_latency_tolerance_attr_group); } void pm_qos_sysfs_remove_latency_tolerance(struct device *dev) { sysfs_unmerge_group(&dev->kobj, &pm_qos_latency_tolerance_attr_group); } void rpm_sysfs_remove(struct device *dev) { sysfs_unmerge_group(&dev->kobj, &pm_runtime_attr_group); } void dpm_sysfs_remove(struct device *dev) { if (device_pm_not_required(dev)) return; sysfs_unmerge_group(&dev->kobj, &pm_qos_latency_tolerance_attr_group); dev_pm_qos_constraints_destroy(dev); rpm_sysfs_remove(dev); sysfs_unmerge_group(&dev->kobj, &pm_wakeup_attr_group); sysfs_remove_group(&dev->kobj, &pm_attr_group); } |
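/*
 * Editor's illustrative sketch, not part of this file: the show/store plus
 * DEVICE_ATTR_RW() pattern used by every attribute above, reduced to a
 * minimal made-up attribute.  It follows the same conventions: sysfs_emit()
 * for output, -EINVAL on unparseable input, and the byte count on success.
 *
 *	static ssize_t example_show(struct device *dev,
 *				    struct device_attribute *attr, char *buf)
 *	{
 *		return sysfs_emit(buf, "%d\n", dev->power.runtime_auto);
 *	}
 *
 *	static ssize_t example_store(struct device *dev,
 *				     struct device_attribute *attr,
 *				     const char *buf, size_t n)
 *	{
 *		bool val;
 *
 *		if (kstrtobool(buf, &val))
 *			return -EINVAL;
 *		// ...apply val under the appropriate lock...
 *		return n;
 *	}
 *	static DEVICE_ATTR_RW(example);
 */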
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/block_validity.c * * Copyright (C) 2009 * Theodore Ts'o (tytso@mit.edu) * * Track which blocks in the filesystem are metadata blocks that * should never be used as data blocks by files or directories. */ #include <linux/time.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/swap.h> #include <linux/pagemap.h> #include <linux/blkdev.h> #include <linux/slab.h> #include "ext4.h" struct ext4_system_zone { struct rb_node node; ext4_fsblk_t start_blk; unsigned int count; u32 ino; }; static struct kmem_cache *ext4_system_zone_cachep; int __init ext4_init_system_zone(void) { ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0); if (ext4_system_zone_cachep == NULL) return -ENOMEM; return 0; } void ext4_exit_system_zone(void) { rcu_barrier(); kmem_cache_destroy(ext4_system_zone_cachep); } static inline int can_merge(struct ext4_system_zone *entry1, struct ext4_system_zone *entry2) { if ((entry1->start_blk + entry1->count) == entry2->start_blk && entry1->ino == entry2->ino) return 1; return 0; } static void release_system_zone(struct ext4_system_blocks *system_blks) { struct ext4_system_zone *entry, *n; rbtree_postorder_for_each_entry_safe(entry, n, &system_blks->root, node) kmem_cache_free(ext4_system_zone_cachep, entry); } /* * Mark a range of blocks as belonging to the "system zone" --- that * is, filesystem metadata blocks which should never be used by * inodes. */ static int add_system_zone(struct ext4_system_blocks *system_blks, ext4_fsblk_t start_blk, unsigned int count, u32 ino) { struct ext4_system_zone *new_entry, *entry; struct rb_node **n = &system_blks->root.rb_node, *node; struct rb_node *parent = NULL, *new_node; while (*n) { parent = *n; entry = rb_entry(parent, struct ext4_system_zone, node); if (start_blk < entry->start_blk) n = &(*n)->rb_left; else if (start_blk >= (entry->start_blk + entry->count)) n = &(*n)->rb_right; else /* Unexpected overlap of system zones.
*/ return -EFSCORRUPTED; } new_entry = kmem_cache_alloc(ext4_system_zone_cachep, GFP_KERNEL); if (!new_entry) return -ENOMEM; new_entry->start_blk = start_blk; new_entry->count = count; new_entry->ino = ino; new_node = &new_entry->node; rb_link_node(new_node, parent, n); rb_insert_color(new_node, &system_blks->root); /* Can we merge to the left? */ node = rb_prev(new_node); if (node) { entry = rb_entry(node, struct ext4_system_zone, node); if (can_merge(entry, new_entry)) { new_entry->start_blk = entry->start_blk; new_entry->count += entry->count; rb_erase(node, &system_blks->root); kmem_cache_free(ext4_system_zone_cachep, entry); } } /* Can we merge to the right? */ node = rb_next(new_node); if (node) { entry = rb_entry(node, struct ext4_system_zone, node); if (can_merge(new_entry, entry)) { new_entry->count += entry->count; rb_erase(node, &system_blks->root); kmem_cache_free(ext4_system_zone_cachep, entry); } } return 0; } static void debug_print_tree(struct ext4_sb_info *sbi) { struct rb_node *node; struct ext4_system_zone *entry; struct ext4_system_blocks *system_blks; int first = 1; printk(KERN_INFO "System zones: "); rcu_read_lock(); system_blks = rcu_dereference(sbi->s_system_blks); node = rb_first(&system_blks->root); while (node) { entry = rb_entry(node, struct ext4_system_zone, node); printk(KERN_CONT "%s%llu-%llu", first ? "" : ", ", entry->start_blk, entry->start_blk + entry->count - 1); first = 0; node = rb_next(node); } rcu_read_unlock(); printk(KERN_CONT "\n"); } static int ext4_protect_reserved_inode(struct super_block *sb, struct ext4_system_blocks *system_blks, u32 ino) { struct inode *inode; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_map_blocks map; u32 i = 0, num; int err = 0, n; if ((ino < EXT4_ROOT_INO) || (ino > le32_to_cpu(sbi->s_es->s_inodes_count))) return -EINVAL; inode = ext4_iget(sb, ino, EXT4_IGET_SPECIAL); if (IS_ERR(inode)) return PTR_ERR(inode); num = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; while (i < num) { cond_resched(); map.m_lblk = i; map.m_len = num - i; n = ext4_map_blocks(NULL, inode, &map, 0); if (n < 0) { err = n; break; } if (n == 0) { i++; } else { err = add_system_zone(system_blks, map.m_pblk, n, ino); if (err < 0) { if (err == -EFSCORRUPTED) { EXT4_ERROR_INODE_ERR(inode, -err, "blocks %llu-%llu from inode overlap system zone", map.m_pblk, map.m_pblk + map.m_len - 1); } break; } i += n; } } iput(inode); return err; } static void ext4_destroy_system_zone(struct rcu_head *rcu) { struct ext4_system_blocks *system_blks; system_blks = container_of(rcu, struct ext4_system_blocks, rcu); release_system_zone(system_blks); kfree(system_blks); } /* * Build system zone rbtree which is used for block validity checking. * * The update of system_blks pointer in this function is protected by * sb->s_umount semaphore. However we have to be careful as we can be * racing with ext4_inode_block_valid() calls reading system_blks rbtree * protected only by RCU. That's why we first build the rbtree and then * swap it in place. 
*/ int ext4_setup_system_zone(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_system_blocks *system_blks; struct ext4_group_desc *gdp; ext4_group_t i; int ret; system_blks = kzalloc_obj(*system_blks); if (!system_blks) return -ENOMEM; for (i=0; i < ngroups; i++) { unsigned int meta_blks = ext4_num_base_meta_blocks(sb, i); cond_resched(); if (meta_blks != 0) { ret = add_system_zone(system_blks, ext4_group_first_block_no(sb, i), meta_blks, 0); if (ret) goto err; } gdp = ext4_get_group_desc(sb, i, NULL); ret = add_system_zone(system_blks, ext4_block_bitmap(sb, gdp), 1, 0); if (ret) goto err; ret = add_system_zone(system_blks, ext4_inode_bitmap(sb, gdp), 1, 0); if (ret) goto err; ret = add_system_zone(system_blks, ext4_inode_table(sb, gdp), sbi->s_itb_per_group, 0); if (ret) goto err; } if (ext4_has_feature_journal(sb) && sbi->s_es->s_journal_inum) { ret = ext4_protect_reserved_inode(sb, system_blks, le32_to_cpu(sbi->s_es->s_journal_inum)); if (ret) goto err; } /* * System blks rbtree complete, announce it once to prevent racing * with ext4_inode_block_valid() accessing the rbtree at the same * time. */ rcu_assign_pointer(sbi->s_system_blks, system_blks); if (test_opt(sb, DEBUG)) debug_print_tree(sbi); return 0; err: release_system_zone(system_blks); kfree(system_blks); return ret; } /* * Called when the filesystem is unmounted or when remounting it with * noblock_validity specified. * * The update of system_blks pointer in this function is protected by * sb->s_umount semaphore. However we have to be careful as we can be * racing with ext4_inode_block_valid() calls reading system_blks rbtree * protected only by RCU. So we first clear the system_blks pointer and * then free the rbtree only after RCU grace period expires. */ void ext4_release_system_zone(struct super_block *sb) { struct ext4_system_blocks *system_blks; system_blks = rcu_dereference_protected(EXT4_SB(sb)->s_system_blks, lockdep_is_held(&sb->s_umount)); rcu_assign_pointer(EXT4_SB(sb)->s_system_blks, NULL); if (system_blks) call_rcu(&system_blks->rcu, ext4_destroy_system_zone); } int ext4_sb_block_valid(struct super_block *sb, struct inode *inode, ext4_fsblk_t start_blk, unsigned int count) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_system_blocks *system_blks; struct ext4_system_zone *entry; struct rb_node *n; int ret = 1; if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || (start_blk + count < start_blk) || (start_blk + count > ext4_blocks_count(sbi->s_es))) return 0; /* * Lock the system zone to prevent it being released concurrently * when doing a remount which inverse current "[no]block_validity" * mount option. */ rcu_read_lock(); system_blks = rcu_dereference(sbi->s_system_blks); if (system_blks == NULL) goto out_rcu; n = system_blks->root.rb_node; while (n) { entry = rb_entry(n, struct ext4_system_zone, node); if (start_blk + count - 1 < entry->start_blk) n = n->rb_left; else if (start_blk >= (entry->start_blk + entry->count)) n = n->rb_right; else { ret = 0; if (inode) ret = (entry->ino == inode->i_ino); break; } } out_rcu: rcu_read_unlock(); return ret; } /* * Returns 1 if the passed-in block region (start_blk, * start_blk+count) is valid; 0 if some part of the block region * overlaps with some other filesystem metadata blocks. 
*/ int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk, unsigned int count) { return ext4_sb_block_valid(inode->i_sb, inode, start_blk, count); } int ext4_check_blockref(const char *function, unsigned int line, struct inode *inode, __le32 *p, unsigned int max) { __le32 *bref = p; unsigned int blk; journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; if (journal && inode == journal->j_inode) return 0; while (bref < p+max) { blk = le32_to_cpu(*bref++); if (blk && unlikely(!ext4_inode_block_valid(inode, blk, 1))) { ext4_error_inode(inode, function, line, blk, "invalid block"); return -EFSCORRUPTED; } } return 0; } |
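/*
 * Editor's illustrative sketch, not part of ext4: the overlap test performed
 * by ext4_sb_block_valid() above, modelled in user space with a plain array
 * instead of an rbtree.  A region [start, start+count) is rejected as soon as
 * it intersects any reserved zone.  All names and numbers are made up.
 */
#include <stdbool.h>
#include <stdio.h>

struct example_zone {
	unsigned long long start;
	unsigned int count;
};

static bool example_region_valid(const struct example_zone *zones, size_t nzones,
				 unsigned long long start, unsigned int count)
{
	size_t i;

	for (i = 0; i < nzones; i++) {
		/* region ends before this zone starts */
		if (start + count - 1 < zones[i].start)
			continue;
		/* region starts after this zone ends */
		if (start >= zones[i].start + zones[i].count)
			continue;
		return false;	/* overlap: region touches metadata */
	}
	return true;
}

int main(void)
{
	const struct example_zone zones[] = { { 0, 64 }, { 1024, 32 } };

	/* [100,108) touches no zone; [1030,1038) overlaps the second zone */
	printf("%d %d\n",
	       example_region_valid(zones, 2, 100, 8),
	       example_region_valid(zones, 2, 1030, 8));
	return 0;
}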
// SPDX-License-Identifier: GPL-2.0-or-later /* * * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IP/TCP/UDP checksumming routines * * Authors: Jorge Cwik, <jorge@laser.satlink.net> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Tom May, <ftom@netcom.com> * Andreas Schwab, <schwab@issan.informatik.uni-dortmund.de> * Lots of code moved from tcp.c and ip.c; see those files * for more names. * * 03/02/96 Jes Sorensen, Andreas Schwab, Roman Hodek: * Fixed some nasty bugs, causing some horrible crashes. * A: At some points, the sum (%0) was used as * length-counter instead of the length counter * (%1). Thanks to Roman Hodek for pointing this out. * B: GCC seems to mess up if one uses too many * data-registers to hold input values and one tries to * specify d0 and d1 as scratch registers. Letting gcc * choose these registers itself solves the problem. */ /* Revised by Kenneth Albanowski for m68knommu. Basic problem: unaligned access kills, so most of the assembly has to go. */ #include <linux/export.h> #include <net/checksum.h> #include <asm/byteorder.h> #ifndef do_csum static unsigned int do_csum(const unsigned char *buff, int len) { int odd; unsigned int result = 0; if (len <= 0) goto out; odd = 1 & (unsigned long) buff; if (odd) { #ifdef __LITTLE_ENDIAN result += (*buff << 8); #else result = *buff; #endif len--; buff++; } if (len >= 2) { if (2 & (unsigned long) buff) { result += *(unsigned short *) buff; len -= 2; buff += 2; } if (len >= 4) { const unsigned char *end = buff + ((unsigned)len & ~3); unsigned int carry = 0; do { unsigned int w = *(unsigned int *) buff; buff += 4; result += carry; result += w; carry = (w > result); } while (buff < end); result += carry; result = (result & 0xffff) + (result >> 16); } if (len & 2) { result += *(unsigned short *) buff; buff += 2; } } if (len & 1) #ifdef __LITTLE_ENDIAN result += *buff; #else result += (*buff << 8); #endif result = csum_from32to16(result); if (odd) result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); out: return result; } #endif #ifndef ip_fast_csum /* * This is a version of ip_compute_csum() optimized for IP headers, * which always checksum on 4 octet boundaries.
*/ __sum16 ip_fast_csum(const void *iph, unsigned int ihl) { return (__force __sum16)~do_csum(iph, ihl*4); } EXPORT_SYMBOL(ip_fast_csum); #endif /* * computes the checksum of a memory block at buff, length len, * and adds in "sum" (32-bit) * * returns a 32-bit number suitable for feeding into itself * or csum_tcpudp_magic * * this function must be called with even lengths, except * for the last fragment, which may be odd * * it's best to have buff aligned on a 32-bit boundary */ __wsum csum_partial(const void *buff, int len, __wsum wsum) { unsigned int sum = (__force unsigned int)wsum; unsigned int result = do_csum(buff, len); /* add in old sum, and carry.. */ result += sum; if (sum > result) result += 1; return (__force __wsum)result; } EXPORT_SYMBOL(csum_partial); /* * this routine is used for miscellaneous IP-like checksums, mainly * in icmp.c */ __sum16 ip_compute_csum(const void *buff, int len) { return (__force __sum16)~do_csum(buff, len); } EXPORT_SYMBOL(ip_compute_csum); #ifndef csum_tcpudp_nofold static inline u32 from64to32(u64 x) { /* add up 32-bit and 32-bit for 32+c bit */ x = (x & 0xffffffff) + (x >> 32); /* add up carry.. */ x = (x & 0xffffffff) + (x >> 32); return (u32)x; } __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __wsum sum) { unsigned long long s = (__force u32)sum; s += (__force u32)saddr; s += (__force u32)daddr; #ifdef __BIG_ENDIAN s += proto + len; #else s += (proto + len) << 8; #endif return (__force __wsum)from64to32(s); } EXPORT_SYMBOL(csum_tcpudp_nofold); #endif |
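/*
 * Editor's illustrative sketch, not part of this file: a plain user-space
 * reference for the Internet checksum that do_csum()/csum_partial() above
 * compute, without the alignment and carry tricks.  It works in network byte
 * order, so it is handy for cross-checking the optimized code; the sample
 * header bytes are the classic IPv4 example with the checksum field zeroed.
 */
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

static uint16_t example_inet_csum(const uint8_t *buf, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)(buf[i] << 8 | buf[i + 1]);
	if (len & 1)
		sum += (uint32_t)(buf[len - 1] << 8);	/* pad the odd byte */

	while (sum >> 16)				/* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	static const uint8_t hdr[] = { 0x45, 0x00, 0x00, 0x73, 0x00, 0x00,
				       0x40, 0x00, 0x40, 0x11, 0x00, 0x00,
				       0xc0, 0xa8, 0x00, 0x01, 0xc0, 0xa8,
				       0x00, 0xc7 };

	/* expected checksum for this header is 0xb861 */
	printf("0x%04x\n", example_inet_csum(hdr, sizeof(hdr)));
	return 0;
}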
908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 
// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/*
 * Copyright (C) 2017-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright Matt Mackall <mpm@selenic.com>, 2003, 2004, 2005
 * Copyright Theodore Ts'o, 1994, 1995, 1996, 1997, 1998, 1999. All rights reserved.
 *
 * This driver produces cryptographically secure pseudorandom data. It is divided
 * into roughly six sections, each with a section header:
 *
 * - Initialization and readiness waiting.
 * - Fast key erasure RNG, the "crng".
 * - Entropy accumulation and extraction routines.
 * - Entropy collection routines.
 * - Userspace reader/writer interfaces.
 * - Sysctl interface.
 *
 * The high level overview is that there is one input pool, into which
 * various pieces of data are hashed. Prior to initialization, some of that
 * data is then "credited" as having a certain number of bits of entropy.
 * When enough bits of entropy are available, the hash is finalized and
 * handed as a key to a stream cipher that expands it indefinitely for
 * various consumers. This key is periodically refreshed as the various
 * entropy collectors, described below, add data to the input pool.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/utsname.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/string.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/interrupt.h>
#include <linux/mm.h>
#include <linux/nodemask.h>
#include <linux/spinlock.h>
#include <linux/kthread.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/workqueue.h>
#include <linux/irq.h>
#include <linux/ratelimit.h>
#include <linux/syscalls.h>
#include <linux/completion.h>
#include <linux/uuid.h>
#include <linux/uaccess.h>
#include <linux/suspend.h>
#include <linux/siphash.h>
#include <linux/sched/isolation.h>
#include <crypto/chacha.h>
#include <crypto/blake2s.h>
#include <vdso/datapage.h>
#include <asm/archrandom.h>
#include <asm/processor.h>
#include <asm/irq.h>
#include <asm/irq_regs.h>
#include <asm/io.h>

/*********************************************************************
 *
 * Initialization and readiness waiting.
 *
 * Much of the RNG infrastructure is devoted to various dependencies
 * being able to wait until the RNG has collected enough entropy and
 * is ready for safe consumption.
 *
 *********************************************************************/

/*
 * crng_init is protected by base_crng->lock, and only increases
 * its value (from empty->early->ready).
 */
static enum {
	CRNG_EMPTY = 0, /* Little to no entropy collected */
	CRNG_EARLY = 1, /* At least POOL_EARLY_BITS collected */
	CRNG_READY = 2  /* Fully initialized with POOL_READY_BITS collected */
} crng_init __read_mostly = CRNG_EMPTY;
static DEFINE_STATIC_KEY_FALSE(crng_is_ready);
#define crng_ready() (static_branch_likely(&crng_is_ready) || crng_init >= CRNG_READY)
/* Various types of waiters for crng_init->CRNG_READY transition.
*/ static DECLARE_WAIT_QUEUE_HEAD(crng_init_wait); static struct fasync_struct *fasync; static ATOMIC_NOTIFIER_HEAD(random_ready_notifier); /* Control how we warn userspace. */ static struct ratelimit_state urandom_warning = RATELIMIT_STATE_INIT_FLAGS("urandom_warning", HZ, 3, RATELIMIT_MSG_ON_RELEASE); static int ratelimit_disable __read_mostly = 0; module_param_named(ratelimit_disable, ratelimit_disable, int, 0644); MODULE_PARM_DESC(ratelimit_disable, "Disable random ratelimit suppression"); /* * Returns whether or not the input pool has been seeded and thus guaranteed * to supply cryptographically secure random numbers. This applies to: the * /dev/urandom device, the get_random_bytes function, and the get_random_{u8, * u16,u32,u64,long} family of functions. * * Returns: true if the input pool has been seeded. * false if the input pool has not been seeded. */ bool rng_is_initialized(void) { return crng_ready(); } EXPORT_SYMBOL(rng_is_initialized); static void __cold crng_set_ready(struct work_struct *work) { static_branch_enable(&crng_is_ready); } /* Used by wait_for_random_bytes(), and considered an entropy collector, below. */ static void try_to_generate_entropy(void); /* * Wait for the input pool to be seeded and thus guaranteed to supply * cryptographically secure random numbers. This applies to: the /dev/urandom * device, the get_random_bytes function, and the get_random_{u8,u16,u32,u64, * long} family of functions. Using any of these functions without first * calling this function forfeits the guarantee of security. * * Returns: 0 if the input pool has been seeded. * -ERESTARTSYS if the function was interrupted by a signal. */ int wait_for_random_bytes(void) { while (!crng_ready()) { int ret; try_to_generate_entropy(); ret = wait_event_interruptible_timeout(crng_init_wait, crng_ready(), HZ); if (ret) return ret > 0 ? 0 : ret; } return 0; } EXPORT_SYMBOL(wait_for_random_bytes); /* * Add a callback function that will be invoked when the crng is initialised, * or immediately if it already has been. Only use this is you are absolutely * sure it is required. Most users should instead be able to test * `rng_is_initialized()` on demand, or make use of `get_random_bytes_wait()`. */ int __cold execute_with_initialized_rng(struct notifier_block *nb) { unsigned long flags; int ret = 0; spin_lock_irqsave(&random_ready_notifier.lock, flags); if (crng_ready()) nb->notifier_call(nb, 0, NULL); else ret = raw_notifier_chain_register((struct raw_notifier_head *)&random_ready_notifier.head, nb); spin_unlock_irqrestore(&random_ready_notifier.lock, flags); return ret; } /********************************************************************* * * Fast key erasure RNG, the "crng". * * These functions expand entropy from the entropy extractor into * long streams for external consumption using the "fast key erasure" * RNG described at <https://blog.cr.yp.to/20170723-random.html>. * * There are a few exported interfaces for use by other drivers: * * void get_random_bytes(void *buf, size_t len) * u8 get_random_u8() * u16 get_random_u16() * u32 get_random_u32() * u32 get_random_u32_below(u32 ceil) * u32 get_random_u32_above(u32 floor) * u32 get_random_u32_inclusive(u32 floor, u32 ceil) * u64 get_random_u64() * unsigned long get_random_long() * * These interfaces will return the requested number of random bytes * into the given buffer or as a return value. This is equivalent to * a read from /dev/urandom. 
The u8, u16, u32, u64, long family of * functions may be higher performance for one-off random integers, * because they do a bit of buffering and do not invoke reseeding * until the buffer is emptied. * *********************************************************************/ enum { CRNG_RESEED_START_INTERVAL = HZ, CRNG_RESEED_INTERVAL = 60 * HZ }; static struct { u8 key[CHACHA_KEY_SIZE] __aligned(__alignof__(long)); unsigned long generation; spinlock_t lock; } base_crng = { .lock = __SPIN_LOCK_UNLOCKED(base_crng.lock) }; struct crng { u8 key[CHACHA_KEY_SIZE]; unsigned long generation; local_lock_t lock; }; static DEFINE_PER_CPU(struct crng, crngs) = { .generation = ULONG_MAX, .lock = INIT_LOCAL_LOCK(crngs.lock), }; /* * Return the interval until the next reseeding, which is normally * CRNG_RESEED_INTERVAL, but during early boot, it is at an interval * proportional to the uptime. */ static unsigned int crng_reseed_interval(void) { static bool early_boot = true; if (unlikely(READ_ONCE(early_boot))) { time64_t uptime = ktime_get_seconds(); if (uptime >= CRNG_RESEED_INTERVAL / HZ * 2) WRITE_ONCE(early_boot, false); else return max_t(unsigned int, CRNG_RESEED_START_INTERVAL, (unsigned int)uptime / 2 * HZ); } return CRNG_RESEED_INTERVAL; } /* Used by crng_reseed() and crng_make_state() to extract a new seed from the input pool. */ static void extract_entropy(void *buf, size_t len); /* This extracts a new crng key from the input pool. */ static void crng_reseed(struct work_struct *work) { static DECLARE_DELAYED_WORK(next_reseed, crng_reseed); unsigned long flags; unsigned long next_gen; u8 key[CHACHA_KEY_SIZE]; /* Immediately schedule the next reseeding, so that it fires sooner rather than later. */ if (likely(system_dfl_wq)) queue_delayed_work(system_dfl_wq, &next_reseed, crng_reseed_interval()); extract_entropy(key, sizeof(key)); /* * We copy the new key into the base_crng, overwriting the old one, * and update the generation counter. We avoid hitting ULONG_MAX, * because the per-cpu crngs are initialized to ULONG_MAX, so this * forces new CPUs that come online to always initialize. */ spin_lock_irqsave(&base_crng.lock, flags); memcpy(base_crng.key, key, sizeof(base_crng.key)); next_gen = base_crng.generation + 1; if (next_gen == ULONG_MAX) ++next_gen; WRITE_ONCE(base_crng.generation, next_gen); /* base_crng.generation's invalid value is ULONG_MAX, while * vdso_k_rng_data->generation's invalid value is 0, so add one to the * former to arrive at the latter. Use smp_store_release so that this * is ordered with the write above to base_crng.generation. Pairs with * the smp_rmb() before the syscall in the vDSO code. * * Cast to unsigned long for 32-bit architectures, since atomic 64-bit * operations are not supported on those architectures. This is safe * because base_crng.generation is a 32-bit value. On big-endian * architectures it will be stored in the upper 32 bits, but that's okay * because the vDSO side only checks whether the value changed, without * actually using or interpreting the value. */ if (IS_ENABLED(CONFIG_VDSO_GETRANDOM)) smp_store_release((unsigned long *)&vdso_k_rng_data->generation, next_gen + 1); if (!static_branch_likely(&crng_is_ready)) crng_init = CRNG_READY; spin_unlock_irqrestore(&base_crng.lock, flags); memzero_explicit(key, sizeof(key)); } /* * This generates a ChaCha block using the provided key, and then * immediately overwrites that key with half the block. 
It returns * the resultant ChaCha state to the user, along with the second * half of the block containing 32 bytes of random data that may * be used; random_data_len may not be greater than 32. * * The returned ChaCha state contains within it a copy of the old * key value, at index 4, so the state should always be zeroed out * immediately after using in order to maintain forward secrecy. * If the state cannot be erased in a timely manner, then it is * safer to set the random_data parameter to &chacha_state->x[4] * so that this function overwrites it before returning. */ static void crng_fast_key_erasure(u8 key[CHACHA_KEY_SIZE], struct chacha_state *chacha_state, u8 *random_data, size_t random_data_len) { u8 first_block[CHACHA_BLOCK_SIZE]; BUG_ON(random_data_len > 32); chacha_init_consts(chacha_state); memcpy(&chacha_state->x[4], key, CHACHA_KEY_SIZE); memset(&chacha_state->x[12], 0, sizeof(u32) * 4); chacha20_block(chacha_state, first_block); memcpy(key, first_block, CHACHA_KEY_SIZE); memcpy(random_data, first_block + CHACHA_KEY_SIZE, random_data_len); memzero_explicit(first_block, sizeof(first_block)); } /* * This function returns a ChaCha state that you may use for generating * random data. It also returns up to 32 bytes on its own of random data * that may be used; random_data_len may not be greater than 32. */ static void crng_make_state(struct chacha_state *chacha_state, u8 *random_data, size_t random_data_len) { unsigned long flags; struct crng *crng; BUG_ON(random_data_len > 32); /* * For the fast path, we check whether we're ready, unlocked first, and * then re-check once locked later. In the case where we're really not * ready, we do fast key erasure with the base_crng directly, extracting * when crng_init is CRNG_EMPTY. */ if (!crng_ready()) { bool ready; spin_lock_irqsave(&base_crng.lock, flags); ready = crng_ready(); if (!ready) { if (crng_init == CRNG_EMPTY) extract_entropy(base_crng.key, sizeof(base_crng.key)); crng_fast_key_erasure(base_crng.key, chacha_state, random_data, random_data_len); } spin_unlock_irqrestore(&base_crng.lock, flags); if (!ready) return; } local_lock_irqsave(&crngs.lock, flags); crng = raw_cpu_ptr(&crngs); /* * If our per-cpu crng is older than the base_crng, then it means * somebody reseeded the base_crng. In that case, we do fast key * erasure on the base_crng, and use its output as the new key * for our per-cpu crng. This brings us up to date with base_crng. */ if (unlikely(crng->generation != READ_ONCE(base_crng.generation))) { spin_lock(&base_crng.lock); crng_fast_key_erasure(base_crng.key, chacha_state, crng->key, sizeof(crng->key)); crng->generation = base_crng.generation; spin_unlock(&base_crng.lock); } /* * Finally, when we've made it this far, our per-cpu crng has an up * to date key, and we can do fast key erasure with it to produce * some random data and a ChaCha state for the caller. All other * branches of this function are "unlikely", so most of the time we * should wind up here immediately. 
*/ crng_fast_key_erasure(crng->key, chacha_state, random_data, random_data_len); local_unlock_irqrestore(&crngs.lock, flags); } static void _get_random_bytes(void *buf, size_t len) { struct chacha_state chacha_state; u8 tmp[CHACHA_BLOCK_SIZE]; size_t first_block_len; if (!len) return; first_block_len = min_t(size_t, 32, len); crng_make_state(&chacha_state, buf, first_block_len); len -= first_block_len; buf += first_block_len; while (len) { if (len < CHACHA_BLOCK_SIZE) { chacha20_block(&chacha_state, tmp); memcpy(buf, tmp, len); memzero_explicit(tmp, sizeof(tmp)); break; } chacha20_block(&chacha_state, buf); if (unlikely(chacha_state.x[12] == 0)) ++chacha_state.x[13]; len -= CHACHA_BLOCK_SIZE; buf += CHACHA_BLOCK_SIZE; } chacha_zeroize_state(&chacha_state); } /* * This returns random bytes in arbitrary quantities. The quality of the * random bytes is as good as /dev/urandom. In order to ensure that the * randomness provided by this function is okay, the function * wait_for_random_bytes() should be called and return 0 at least once * at any point prior. */ void get_random_bytes(void *buf, size_t len) { _get_random_bytes(buf, len); } EXPORT_SYMBOL(get_random_bytes); static ssize_t get_random_bytes_user(struct iov_iter *iter) { struct chacha_state chacha_state; u8 block[CHACHA_BLOCK_SIZE]; size_t ret = 0, copied; if (unlikely(!iov_iter_count(iter))) return 0; /* * Immediately overwrite the ChaCha key at index 4 with random * bytes, in case userspace causes copy_to_iter() below to sleep * forever, so that we still retain forward secrecy in that case. */ crng_make_state(&chacha_state, (u8 *)&chacha_state.x[4], CHACHA_KEY_SIZE); /* * However, if we're doing a read of len <= 32, we don't need to * use chacha_state after, so we can simply return those bytes to * the user directly. */ if (iov_iter_count(iter) <= CHACHA_KEY_SIZE) { ret = copy_to_iter(&chacha_state.x[4], CHACHA_KEY_SIZE, iter); goto out_zero_chacha; } for (;;) { chacha20_block(&chacha_state, block); if (unlikely(chacha_state.x[12] == 0)) ++chacha_state.x[13]; copied = copy_to_iter(block, sizeof(block), iter); ret += copied; if (!iov_iter_count(iter) || copied != sizeof(block)) break; BUILD_BUG_ON(PAGE_SIZE % sizeof(block) != 0); if (ret % PAGE_SIZE == 0) { if (signal_pending(current)) break; cond_resched(); } } memzero_explicit(block, sizeof(block)); out_zero_chacha: chacha_zeroize_state(&chacha_state); return ret ? ret : -EFAULT; } /* * Batched entropy returns random integers. The quality of the random * number is as good as /dev/urandom. In order to ensure that the randomness * provided by this function is okay, the function wait_for_random_bytes() * should be called and return 0 at least once at any point prior. */ #define DEFINE_BATCHED_ENTROPY(type) \ struct batch_ ##type { \ /* \ * We make this 1.5x a ChaCha block, so that we get the \ * remaining 32 bytes from fast key erasure, plus one full \ * block from the detached ChaCha state. We can increase \ * the size of this later if needed so long as we keep the \ * formula of (integer_blocks + 0.5) * CHACHA_BLOCK_SIZE. 
\ */ \ type entropy[CHACHA_BLOCK_SIZE * 3 / (2 * sizeof(type))]; \ local_lock_t lock; \ unsigned long generation; \ unsigned int position; \ }; \ \ static DEFINE_PER_CPU(struct batch_ ##type, batched_entropy_ ##type) = { \ .lock = INIT_LOCAL_LOCK(batched_entropy_ ##type.lock), \ .position = UINT_MAX \ }; \ \ type get_random_ ##type(void) \ { \ type ret; \ unsigned long flags; \ struct batch_ ##type *batch; \ unsigned long next_gen; \ \ if (!crng_ready()) { \ _get_random_bytes(&ret, sizeof(ret)); \ return ret; \ } \ \ local_lock_irqsave(&batched_entropy_ ##type.lock, flags); \ batch = raw_cpu_ptr(&batched_entropy_##type); \ \ next_gen = READ_ONCE(base_crng.generation); \ if (batch->position >= ARRAY_SIZE(batch->entropy) || \ next_gen != batch->generation) { \ _get_random_bytes(batch->entropy, sizeof(batch->entropy)); \ batch->position = 0; \ batch->generation = next_gen; \ } \ \ ret = batch->entropy[batch->position]; \ batch->entropy[batch->position] = 0; \ ++batch->position; \ local_unlock_irqrestore(&batched_entropy_ ##type.lock, flags); \ return ret; \ } \ EXPORT_SYMBOL(get_random_ ##type); DEFINE_BATCHED_ENTROPY(u8) DEFINE_BATCHED_ENTROPY(u16) DEFINE_BATCHED_ENTROPY(u32) DEFINE_BATCHED_ENTROPY(u64) u32 __get_random_u32_below(u32 ceil) { /* * This is the slow path for variable ceil. It is still fast, most of * the time, by doing traditional reciprocal multiplication and * opportunistically comparing the lower half to ceil itself, before * falling back to computing a larger bound, and then rejecting samples * whose lower half would indicate a range indivisible by ceil. The use * of `-ceil % ceil` is analogous to `2^32 % ceil`, but is computable * in 32-bits. */ u32 rand = get_random_u32(); u64 mult; /* * This function is technically undefined for ceil == 0, and in fact * for the non-underscored constant version in the header, we build bug * on that. But for the non-constant case, it's convenient to have that * evaluate to being a straight call to get_random_u32(), so that * get_random_u32_inclusive() can work over its whole range without * undefined behavior. */ if (unlikely(!ceil)) return rand; mult = (u64)ceil * rand; if (unlikely((u32)mult < ceil)) { u32 bound = -ceil % ceil; while (unlikely((u32)mult < bound)) mult = (u64)ceil * get_random_u32(); } return mult >> 32; } EXPORT_SYMBOL(__get_random_u32_below); #ifdef CONFIG_SMP /* * This function is called when the CPU is coming up, with entry * CPUHP_RANDOM_PREPARE, which comes before CPUHP_WORKQUEUE_PREP. */ int __cold random_prepare_cpu(unsigned int cpu) { /* * When the cpu comes back online, immediately invalidate both * the per-cpu crng and all batches, so that we serve fresh * randomness. */ per_cpu_ptr(&crngs, cpu)->generation = ULONG_MAX; per_cpu_ptr(&batched_entropy_u8, cpu)->position = UINT_MAX; per_cpu_ptr(&batched_entropy_u16, cpu)->position = UINT_MAX; per_cpu_ptr(&batched_entropy_u32, cpu)->position = UINT_MAX; per_cpu_ptr(&batched_entropy_u64, cpu)->position = UINT_MAX; return 0; } #endif /********************************************************************** * * Entropy accumulation and extraction routines. 
* * Callers may add entropy via: * * static void mix_pool_bytes(const void *buf, size_t len) * * After which, if added entropy should be credited: * * static void credit_init_bits(size_t bits) * * Finally, extract entropy via: * * static void extract_entropy(void *buf, size_t len) * **********************************************************************/ enum { POOL_BITS = BLAKE2S_HASH_SIZE * 8, POOL_READY_BITS = POOL_BITS, /* When crng_init->CRNG_READY */ POOL_EARLY_BITS = POOL_READY_BITS / 2 /* When crng_init->CRNG_EARLY */ }; static struct { struct blake2s_ctx hash; spinlock_t lock; unsigned int init_bits; } input_pool = { .hash.h = { BLAKE2S_IV0 ^ (0x01010000 | BLAKE2S_HASH_SIZE), BLAKE2S_IV1, BLAKE2S_IV2, BLAKE2S_IV3, BLAKE2S_IV4, BLAKE2S_IV5, BLAKE2S_IV6, BLAKE2S_IV7 }, .hash.outlen = BLAKE2S_HASH_SIZE, .lock = __SPIN_LOCK_UNLOCKED(input_pool.lock), }; static void _mix_pool_bytes(const void *buf, size_t len) { blake2s_update(&input_pool.hash, buf, len); } /* * This function adds bytes into the input pool. It does not * update the initialization bit counter; the caller should call * credit_init_bits if this is appropriate. */ static void mix_pool_bytes(const void *buf, size_t len) { unsigned long flags; spin_lock_irqsave(&input_pool.lock, flags); _mix_pool_bytes(buf, len); spin_unlock_irqrestore(&input_pool.lock, flags); } /* * This is an HKDF-like construction for using the hashed collected entropy * as a PRF key, that's then expanded block-by-block. */ static void extract_entropy(void *buf, size_t len) { unsigned long flags; u8 seed[BLAKE2S_HASH_SIZE], next_key[BLAKE2S_HASH_SIZE]; struct { unsigned long rdseed[32 / sizeof(long)]; size_t counter; } block; size_t i, longs; for (i = 0; i < ARRAY_SIZE(block.rdseed);) { longs = arch_get_random_seed_longs(&block.rdseed[i], ARRAY_SIZE(block.rdseed) - i); if (longs) { i += longs; continue; } longs = arch_get_random_longs(&block.rdseed[i], ARRAY_SIZE(block.rdseed) - i); if (longs) { i += longs; continue; } block.rdseed[i++] = random_get_entropy(); } spin_lock_irqsave(&input_pool.lock, flags); /* seed = HASHPRF(last_key, entropy_input) */ blake2s_final(&input_pool.hash, seed); /* next_key = HASHPRF(seed, RDSEED || 0) */ block.counter = 0; blake2s(seed, sizeof(seed), (const u8 *)&block, sizeof(block), next_key, sizeof(next_key)); blake2s_init_key(&input_pool.hash, BLAKE2S_HASH_SIZE, next_key, sizeof(next_key)); spin_unlock_irqrestore(&input_pool.lock, flags); memzero_explicit(next_key, sizeof(next_key)); while (len) { i = min_t(size_t, len, BLAKE2S_HASH_SIZE); /* output = HASHPRF(seed, RDSEED || ++counter) */ ++block.counter; blake2s(seed, sizeof(seed), (const u8 *)&block, sizeof(block), buf, i); len -= i; buf += i; } memzero_explicit(seed, sizeof(seed)); memzero_explicit(&block, sizeof(block)); } #define credit_init_bits(bits) if (!crng_ready()) _credit_init_bits(bits) static void __cold _credit_init_bits(size_t bits) { static DECLARE_WORK(set_ready, crng_set_ready); unsigned int new, orig, add; unsigned long flags; int m; if (!bits) return; add = min_t(size_t, bits, POOL_BITS); orig = READ_ONCE(input_pool.init_bits); do { new = min_t(unsigned int, POOL_BITS, orig + add); } while (!try_cmpxchg(&input_pool.init_bits, &orig, new)); if (orig < POOL_READY_BITS && new >= POOL_READY_BITS) { crng_reseed(NULL); /* Sets crng_init to CRNG_READY under base_crng.lock. 
*/ if (system_dfl_wq) queue_work(system_dfl_wq, &set_ready); atomic_notifier_call_chain(&random_ready_notifier, 0, NULL); if (IS_ENABLED(CONFIG_VDSO_GETRANDOM)) WRITE_ONCE(vdso_k_rng_data->is_ready, true); wake_up_interruptible(&crng_init_wait); kill_fasync(&fasync, SIGIO, POLL_IN); pr_notice("crng init done\n"); m = ratelimit_state_get_miss(&urandom_warning); if (m) pr_notice("%d urandom warning(s) missed due to ratelimiting\n", m); } else if (orig < POOL_EARLY_BITS && new >= POOL_EARLY_BITS) { spin_lock_irqsave(&base_crng.lock, flags); /* Check if crng_init is CRNG_EMPTY, to avoid race with crng_reseed(). */ if (crng_init == CRNG_EMPTY) { extract_entropy(base_crng.key, sizeof(base_crng.key)); crng_init = CRNG_EARLY; } spin_unlock_irqrestore(&base_crng.lock, flags); } } /********************************************************************** * * Entropy collection routines. * * The following exported functions are used for pushing entropy into * the above entropy accumulation routines: * * void add_device_randomness(const void *buf, size_t len); * void add_hwgenerator_randomness(const void *buf, size_t len, size_t entropy, bool sleep_after); * void add_bootloader_randomness(const void *buf, size_t len); * void add_vmfork_randomness(const void *unique_vm_id, size_t len); * void add_interrupt_randomness(int irq); * void add_input_randomness(unsigned int type, unsigned int code, unsigned int value); * void add_disk_randomness(struct gendisk *disk); * * add_device_randomness() adds data to the input pool that * is likely to differ between two devices (or possibly even per boot). * This would be things like MAC addresses or serial numbers, or the * read-out of the RTC. This does *not* credit any actual entropy to * the pool, but it initializes the pool to different values for devices * that might otherwise be identical and have very little entropy * available to them (particularly common in the embedded world). * * add_hwgenerator_randomness() is for true hardware RNGs, and will credit * entropy as specified by the caller. If the entropy pool is full it will * block until more entropy is needed. * * add_bootloader_randomness() is called by bootloader drivers, such as EFI * and device tree, and credits its input depending on whether or not the * command line option 'random.trust_bootloader' is set. * * add_vmfork_randomness() adds a unique (but not necessarily secret) ID * representing the current instance of a VM to the pool, without crediting, * and then force-reseeds the crng so that it takes effect immediately. * * add_interrupt_randomness() uses the interrupt timing as random * inputs to the entropy pool. Using the cycle counters and the irq source * as inputs, it feeds the input pool roughly once a second or after 64 * interrupts, crediting 1 bit of entropy for whichever comes first. * * add_input_randomness() uses the input layer interrupt timing, as well * as the event type information from the hardware. * * add_disk_randomness() uses what amounts to the seek time of block * layer request events, on a per-disk_devt basis, as input to the * entropy pool. Note that high-speed solid state drives with very low * seek times do not make for good sources of entropy, as their seek * times are usually fairly consistent. * * The last two routines try to estimate how many bits of entropy * to credit. They do this by keeping track of the first and second * order deltas of the event timings. 
* **********************************************************************/ static bool trust_cpu __initdata = true; static bool trust_bootloader __initdata = true; static int __init parse_trust_cpu(char *arg) { return kstrtobool(arg, &trust_cpu); } static int __init parse_trust_bootloader(char *arg) { return kstrtobool(arg, &trust_bootloader); } early_param("random.trust_cpu", parse_trust_cpu); early_param("random.trust_bootloader", parse_trust_bootloader); static int random_pm_notification(struct notifier_block *nb, unsigned long action, void *data) { unsigned long flags, entropy = random_get_entropy(); /* * Encode a representation of how long the system has been suspended, * in a way that is distinct from prior system suspends. */ ktime_t stamps[] = { ktime_get(), ktime_get_boottime(), ktime_get_real() }; spin_lock_irqsave(&input_pool.lock, flags); _mix_pool_bytes(&action, sizeof(action)); _mix_pool_bytes(stamps, sizeof(stamps)); _mix_pool_bytes(&entropy, sizeof(entropy)); spin_unlock_irqrestore(&input_pool.lock, flags); if (crng_ready() && (action == PM_RESTORE_PREPARE || (action == PM_POST_SUSPEND && !IS_ENABLED(CONFIG_PM_AUTOSLEEP) && !IS_ENABLED(CONFIG_PM_USERSPACE_AUTOSLEEP)))) { crng_reseed(NULL); pr_notice("crng reseeded on system resumption\n"); } return 0; } static struct notifier_block pm_notifier = { .notifier_call = random_pm_notification }; /* * This is called extremely early, before time keeping functionality is * available, but arch randomness is. Interrupts are not yet enabled. */ void __init random_init_early(const char *command_line) { unsigned long entropy[BLAKE2S_BLOCK_SIZE / sizeof(long)]; size_t i, longs, arch_bits; #if defined(LATENT_ENTROPY_PLUGIN) static const u8 compiletime_seed[BLAKE2S_BLOCK_SIZE] __initconst __latent_entropy; _mix_pool_bytes(compiletime_seed, sizeof(compiletime_seed)); #endif for (i = 0, arch_bits = sizeof(entropy) * 8; i < ARRAY_SIZE(entropy);) { longs = arch_get_random_seed_longs(entropy, ARRAY_SIZE(entropy) - i); if (longs) { _mix_pool_bytes(entropy, sizeof(*entropy) * longs); i += longs; continue; } longs = arch_get_random_longs(entropy, ARRAY_SIZE(entropy) - i); if (longs) { _mix_pool_bytes(entropy, sizeof(*entropy) * longs); i += longs; continue; } arch_bits -= sizeof(*entropy) * 8; ++i; } _mix_pool_bytes(init_utsname(), sizeof(*(init_utsname()))); _mix_pool_bytes(command_line, strlen(command_line)); /* Reseed if already seeded by earlier phases. */ if (crng_ready()) crng_reseed(NULL); else if (trust_cpu) _credit_init_bits(arch_bits); } /* * This is called a little bit after the prior function, and now there is * access to timestamps counters. Interrupts are not yet enabled. */ void __init random_init(void) { unsigned long entropy = random_get_entropy(); ktime_t now = ktime_get_real(); _mix_pool_bytes(&now, sizeof(now)); _mix_pool_bytes(&entropy, sizeof(entropy)); add_latent_entropy(); /* * If we were initialized by the cpu or bootloader before workqueues * are initialized, then we should enable the static branch here. */ if (!static_branch_likely(&crng_is_ready) && crng_init >= CRNG_READY) crng_set_ready(NULL); /* Reseed if already seeded by earlier phases. */ if (crng_ready()) crng_reseed(NULL); WARN_ON(register_pm_notifier(&pm_notifier)); WARN(!entropy, "Missing cycle counter and fallback timer; RNG " "entropy collection will consequently suffer."); } /* * Add device- or boot-specific data to the input pool to help * initialize it. 
* * None of this adds any entropy; it is meant to avoid the problem of * the entropy pool having similar initial state across largely * identical devices. */ void add_device_randomness(const void *buf, size_t len) { unsigned long entropy = random_get_entropy(); unsigned long flags; spin_lock_irqsave(&input_pool.lock, flags); _mix_pool_bytes(&entropy, sizeof(entropy)); _mix_pool_bytes(buf, len); spin_unlock_irqrestore(&input_pool.lock, flags); } EXPORT_SYMBOL(add_device_randomness); /* * Interface for in-kernel drivers of true hardware RNGs. Those devices * may produce endless random bits, so this function will sleep for * some amount of time after, if the sleep_after parameter is true. */ void add_hwgenerator_randomness(const void *buf, size_t len, size_t entropy, bool sleep_after) { mix_pool_bytes(buf, len); credit_init_bits(entropy); /* * Throttle writing to once every reseed interval, unless we're not yet * initialized or no entropy is credited. */ if (sleep_after && !kthread_should_stop() && (crng_ready() || !entropy)) schedule_timeout_interruptible(crng_reseed_interval()); } EXPORT_SYMBOL_GPL(add_hwgenerator_randomness); /* * Handle random seed passed by bootloader, and credit it depending * on the command line option 'random.trust_bootloader'. */ void __init add_bootloader_randomness(const void *buf, size_t len) { mix_pool_bytes(buf, len); if (trust_bootloader) credit_init_bits(len * 8); } #if IS_ENABLED(CONFIG_VMGENID) static BLOCKING_NOTIFIER_HEAD(vmfork_chain); /* * Handle a new unique VM ID, which is unique, not secret, so we * don't credit it, but we do immediately force a reseed after so * that it's used by the crng posthaste. */ void __cold add_vmfork_randomness(const void *unique_vm_id, size_t len) { add_device_randomness(unique_vm_id, len); if (crng_ready()) { crng_reseed(NULL); pr_notice("crng reseeded due to virtual machine fork\n"); } blocking_notifier_call_chain(&vmfork_chain, 0, NULL); } #if IS_MODULE(CONFIG_VMGENID) EXPORT_SYMBOL_GPL(add_vmfork_randomness); #endif int __cold register_random_vmfork_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&vmfork_chain, nb); } EXPORT_SYMBOL_GPL(register_random_vmfork_notifier); int __cold unregister_random_vmfork_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&vmfork_chain, nb); } EXPORT_SYMBOL_GPL(unregister_random_vmfork_notifier); #endif struct fast_pool { unsigned long pool[4]; unsigned long last; unsigned int count; struct timer_list mix; }; static void mix_interrupt_randomness(struct timer_list *work); static DEFINE_PER_CPU(struct fast_pool, irq_randomness) = { #ifdef CONFIG_64BIT #define FASTMIX_PERM SIPHASH_PERMUTATION .pool = { SIPHASH_CONST_0, SIPHASH_CONST_1, SIPHASH_CONST_2, SIPHASH_CONST_3 }, #else #define FASTMIX_PERM HSIPHASH_PERMUTATION .pool = { HSIPHASH_CONST_0, HSIPHASH_CONST_1, HSIPHASH_CONST_2, HSIPHASH_CONST_3 }, #endif .mix = __TIMER_INITIALIZER(mix_interrupt_randomness, 0) }; /* * This is [Half]SipHash-1-x, starting from an empty key. Because * the key is fixed, it assumes that its inputs are non-malicious, * and therefore this has no security on its own. s represents the * four-word SipHash state, while v represents a two-word input. 
*/ static void fast_mix(unsigned long s[4], unsigned long v1, unsigned long v2) { s[3] ^= v1; FASTMIX_PERM(s[0], s[1], s[2], s[3]); s[0] ^= v1; s[3] ^= v2; FASTMIX_PERM(s[0], s[1], s[2], s[3]); s[0] ^= v2; } #ifdef CONFIG_SMP /* * This function is called when the CPU has just come online, with * entry CPUHP_AP_RANDOM_ONLINE, just after CPUHP_AP_WORKQUEUE_ONLINE. */ int __cold random_online_cpu(unsigned int cpu) { /* * During CPU shutdown and before CPU onlining, add_interrupt_ * randomness() may schedule mix_interrupt_randomness(), and * set the MIX_INFLIGHT flag. However, because the worker can * be scheduled on a different CPU during this period, that * flag will never be cleared. For that reason, we zero out * the flag here, which runs just after workqueues are onlined * for the CPU again. This also has the effect of setting the * irq randomness count to zero so that new accumulated irqs * are fresh. */ per_cpu_ptr(&irq_randomness, cpu)->count = 0; return 0; } #endif static void mix_interrupt_randomness(struct timer_list *work) { struct fast_pool *fast_pool = container_of(work, struct fast_pool, mix); /* * The size of the copied stack pool is explicitly 2 longs so that we * only ever ingest half of the siphash output each time, retaining * the other half as the next "key" that carries over. The entropy is * supposed to be sufficiently dispersed between bits so on average * we don't wind up "losing" some. */ unsigned long pool[2]; unsigned int count; /* Check to see if we're running on the wrong CPU due to hotplug. */ local_irq_disable(); if (fast_pool != this_cpu_ptr(&irq_randomness)) { local_irq_enable(); return; } /* * Copy the pool to the stack so that the mixer always has a * consistent view, before we reenable irqs again. */ memcpy(pool, fast_pool->pool, sizeof(pool)); count = fast_pool->count; fast_pool->count = 0; fast_pool->last = jiffies; local_irq_enable(); mix_pool_bytes(pool, sizeof(pool)); credit_init_bits(clamp_t(unsigned int, (count & U16_MAX) / 64, 1, sizeof(pool) * 8)); memzero_explicit(pool, sizeof(pool)); } void add_interrupt_randomness(int irq) { enum { MIX_INFLIGHT = 1U << 31 }; unsigned long entropy = random_get_entropy(); struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness); struct pt_regs *regs = get_irq_regs(); unsigned int new_count; fast_mix(fast_pool->pool, entropy, (regs ? instruction_pointer(regs) : _RET_IP_) ^ swab(irq)); new_count = ++fast_pool->count; if (new_count & MIX_INFLIGHT) return; if (new_count < 1024 && !time_is_before_jiffies(fast_pool->last + HZ)) return; fast_pool->count |= MIX_INFLIGHT; if (!timer_pending(&fast_pool->mix)) { fast_pool->mix.expires = jiffies; add_timer_on(&fast_pool->mix, raw_smp_processor_id()); } } EXPORT_SYMBOL_GPL(add_interrupt_randomness); /* There is one of these per entropy source */ struct timer_rand_state { unsigned long last_time; long last_delta, last_delta2; }; /* * This function adds entropy to the entropy "pool" by using timing * delays. It uses the timer_rand_state structure to make an estimate * of how many bits of entropy this call has added to the pool. The * value "num" is also added to the pool; it should somehow describe * the type of event that just happened. */ static void add_timer_randomness(struct timer_rand_state *state, unsigned int num) { unsigned long entropy = random_get_entropy(), now = jiffies, flags; long delta, delta2, delta3; unsigned int bits; /* * If we're in a hard IRQ, add_interrupt_randomness() will be called * sometime after, so mix into the fast pool. 
*/ if (in_hardirq()) { fast_mix(this_cpu_ptr(&irq_randomness)->pool, entropy, num); } else { spin_lock_irqsave(&input_pool.lock, flags); _mix_pool_bytes(&entropy, sizeof(entropy)); _mix_pool_bytes(&num, sizeof(num)); spin_unlock_irqrestore(&input_pool.lock, flags); } if (crng_ready()) return; /* * Calculate number of bits of randomness we probably added. * We take into account the first, second and third-order deltas * in order to make our estimate. */ delta = now - READ_ONCE(state->last_time); WRITE_ONCE(state->last_time, now); delta2 = delta - READ_ONCE(state->last_delta); WRITE_ONCE(state->last_delta, delta); delta3 = delta2 - READ_ONCE(state->last_delta2); WRITE_ONCE(state->last_delta2, delta2); if (delta < 0) delta = -delta; if (delta2 < 0) delta2 = -delta2; if (delta3 < 0) delta3 = -delta3; if (delta > delta2) delta = delta2; if (delta > delta3) delta = delta3; /* * delta is now minimum absolute delta. Round down by 1 bit * on general principles, and limit entropy estimate to 11 bits. */ bits = min(fls(delta >> 1), 11); /* * As mentioned above, if we're in a hard IRQ, add_interrupt_randomness() * will run after this, which uses a different crediting scheme of 1 bit * per every 64 interrupts. In order to let that function do accounting * close to the one in this function, we credit a full 64/64 bit per bit, * and then subtract one to account for the extra one added. */ if (in_hardirq()) this_cpu_ptr(&irq_randomness)->count += max(1u, bits * 64) - 1; else _credit_init_bits(bits); } void add_input_randomness(unsigned int type, unsigned int code, unsigned int value) { static unsigned char last_value; static struct timer_rand_state input_timer_state = { INITIAL_JIFFIES }; /* Ignore autorepeat and the like. */ if (value == last_value) return; last_value = value; add_timer_randomness(&input_timer_state, (type << 4) ^ code ^ (code >> 4) ^ value); } EXPORT_SYMBOL_GPL(add_input_randomness); #ifdef CONFIG_BLOCK void add_disk_randomness(struct gendisk *disk) { if (!disk || !disk->random) return; /* First major is 1, so we get >= 0x200 here. */ add_timer_randomness(disk->random, 0x100 + disk_devt(disk)); } EXPORT_SYMBOL_GPL(add_disk_randomness); void __cold rand_initialize_disk(struct gendisk *disk) { struct timer_rand_state *state; /* * If kzalloc returns null, we just won't use that entropy * source. */ state = kzalloc_obj(struct timer_rand_state); if (state) { state->last_time = INITIAL_JIFFIES; disk->random = state; } } #endif struct entropy_timer_state { unsigned long entropy; struct timer_list timer; atomic_t samples; unsigned int samples_per_bit; }; /* * Each time the timer fires, we expect that we got an unpredictable jump in * the cycle counter. Even if the timer is running on another CPU, the timer * activity will be touching the stack of the CPU that is generating entropy. * * Note that we don't re-arm the timer in the timer itself - we are happy to be * scheduled away, since that just makes the load more complex, but we do not * want the timer to keep ticking unless the entropy loop is running. * * So the re-arming always happens in the entropy loop itself. 
*/ static void __cold entropy_timer(struct timer_list *timer) { struct entropy_timer_state *state = container_of(timer, struct entropy_timer_state, timer); unsigned long entropy = random_get_entropy(); mix_pool_bytes(&entropy, sizeof(entropy)); if (atomic_inc_return(&state->samples) % state->samples_per_bit == 0) credit_init_bits(1); } /* * If we have an actual cycle counter, see if we can generate enough entropy * with timing noise. */ static void __cold try_to_generate_entropy(void) { enum { NUM_TRIAL_SAMPLES = 8192, MAX_SAMPLES_PER_BIT = HZ / 15 }; u8 stack_bytes[sizeof(struct entropy_timer_state) + SMP_CACHE_BYTES - 1]; struct entropy_timer_state *stack = PTR_ALIGN((void *)stack_bytes, SMP_CACHE_BYTES); unsigned int i, num_different = 0; unsigned long last = random_get_entropy(); cpumask_var_t timer_cpus; int cpu = -1; for (i = 0; i < NUM_TRIAL_SAMPLES - 1; ++i) { stack->entropy = random_get_entropy(); if (stack->entropy != last) ++num_different; last = stack->entropy; } stack->samples_per_bit = DIV_ROUND_UP(NUM_TRIAL_SAMPLES, num_different + 1); if (stack->samples_per_bit > MAX_SAMPLES_PER_BIT) return; atomic_set(&stack->samples, 0); timer_setup_on_stack(&stack->timer, entropy_timer, 0); if (!alloc_cpumask_var(&timer_cpus, GFP_KERNEL)) goto out; while (!crng_ready() && !signal_pending(current)) { /* * Check !timer_pending() and then ensure that any previous callback has finished * executing by checking timer_delete_sync_try(), before queueing the next one. */ if (!timer_pending(&stack->timer) && timer_delete_sync_try(&stack->timer) >= 0) { unsigned int num_cpus; /* * Preemption must be disabled here, both to read the current CPU number * and to avoid scheduling a timer on a dead CPU. */ preempt_disable(); /* Only schedule callbacks on timer CPUs that are online. */ cpumask_and(timer_cpus, housekeeping_cpumask(HK_TYPE_TIMER), cpu_online_mask); num_cpus = cpumask_weight(timer_cpus); /* In very bizarre case of misconfiguration, fallback to all online. */ if (unlikely(num_cpus == 0)) { *timer_cpus = *cpu_online_mask; num_cpus = cpumask_weight(timer_cpus); } /* Basic CPU round-robin, which avoids the current CPU. */ do { cpu = cpumask_next(cpu, timer_cpus); if (cpu >= nr_cpu_ids) cpu = cpumask_first(timer_cpus); } while (cpu == smp_processor_id() && num_cpus > 1); /* Expiring the timer at `jiffies` means it's the next tick. */ stack->timer.expires = jiffies; add_timer_on(&stack->timer, cpu); preempt_enable(); } mix_pool_bytes(&stack->entropy, sizeof(stack->entropy)); schedule(); stack->entropy = random_get_entropy(); } mix_pool_bytes(&stack->entropy, sizeof(stack->entropy)); free_cpumask_var(timer_cpus); out: timer_delete_sync(&stack->timer); timer_destroy_on_stack(&stack->timer); } /********************************************************************** * * Userspace reader/writer interfaces. * * getrandom(2) is the primary modern interface into the RNG and should * be used in preference to anything else. * * Reading from /dev/random has the same functionality as calling * getrandom(2) with flags=0. In earlier versions, however, it had * vastly different semantics and should therefore be avoided, to * prevent backwards compatibility issues. * * Reading from /dev/urandom has the same functionality as calling * getrandom(2) with flags=GRND_INSECURE. Because it does not block * waiting for the RNG to be ready, it should not be used. * * Writing to either /dev/random or /dev/urandom adds entropy to * the input pool but does not credit it. 
* * Polling on /dev/random indicates when the RNG is initialized, on * the read side, and when it wants new entropy, on the write side. * * Both /dev/random and /dev/urandom have the same set of ioctls for * adding entropy, getting the entropy count, zeroing the count, and * reseeding the crng. * **********************************************************************/ SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags) { struct iov_iter iter; int ret; if (flags & ~(GRND_NONBLOCK | GRND_RANDOM | GRND_INSECURE)) return -EINVAL; /* * Requesting insecure and blocking randomness at the same time makes * no sense. */ if ((flags & (GRND_INSECURE | GRND_RANDOM)) == (GRND_INSECURE | GRND_RANDOM)) return -EINVAL; if (!crng_ready() && !(flags & GRND_INSECURE)) { if (flags & GRND_NONBLOCK) return -EAGAIN; ret = wait_for_random_bytes(); if (unlikely(ret)) return ret; } ret = import_ubuf(ITER_DEST, ubuf, len, &iter); if (unlikely(ret)) return ret; return get_random_bytes_user(&iter); } static __poll_t random_poll(struct file *file, poll_table *wait) { poll_wait(file, &crng_init_wait, wait); return crng_ready() ? EPOLLIN | EPOLLRDNORM : EPOLLOUT | EPOLLWRNORM; } static ssize_t write_pool_user(struct iov_iter *iter) { u8 block[BLAKE2S_BLOCK_SIZE]; ssize_t ret = 0; size_t copied; if (unlikely(!iov_iter_count(iter))) return 0; for (;;) { copied = copy_from_iter(block, sizeof(block), iter); ret += copied; mix_pool_bytes(block, copied); if (!iov_iter_count(iter) || copied != sizeof(block)) break; BUILD_BUG_ON(PAGE_SIZE % sizeof(block) != 0); if (ret % PAGE_SIZE == 0) { if (signal_pending(current)) break; cond_resched(); } } memzero_explicit(block, sizeof(block)); return ret ? ret : -EFAULT; } static ssize_t random_write_iter(struct kiocb *kiocb, struct iov_iter *iter) { return write_pool_user(iter); } static ssize_t urandom_read_iter(struct kiocb *kiocb, struct iov_iter *iter) { static int maxwarn = 10; /* * Opportunistically attempt to initialize the RNG on platforms that * have fast cycle counters, but don't (for now) require it to succeed. */ if (!crng_ready()) try_to_generate_entropy(); if (!crng_ready()) { if (!ratelimit_disable && maxwarn <= 0) ratelimit_state_inc_miss(&urandom_warning); else if (ratelimit_disable || __ratelimit(&urandom_warning)) { --maxwarn; pr_notice("%s: uninitialized urandom read (%zu bytes read)\n", current->comm, iov_iter_count(iter)); } } return get_random_bytes_user(iter); } static ssize_t random_read_iter(struct kiocb *kiocb, struct iov_iter *iter) { int ret; if (!crng_ready() && ((kiocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) || (kiocb->ki_filp->f_flags & O_NONBLOCK))) return -EAGAIN; ret = wait_for_random_bytes(); if (ret != 0) return ret; return get_random_bytes_user(iter); } static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg) { int __user *p = (int __user *)arg; int ent_count; switch (cmd) { case RNDGETENTCNT: /* Inherently racy, no point locking. 
*/ if (put_user(input_pool.init_bits, p)) return -EFAULT; return 0; case RNDADDTOENTCNT: if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (get_user(ent_count, p)) return -EFAULT; if (ent_count < 0) return -EINVAL; credit_init_bits(ent_count); return 0; case RNDADDENTROPY: { struct iov_iter iter; ssize_t ret; int len; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (get_user(ent_count, p++)) return -EFAULT; if (ent_count < 0) return -EINVAL; if (get_user(len, p++)) return -EFAULT; ret = import_ubuf(ITER_SOURCE, p, len, &iter); if (unlikely(ret)) return ret; ret = write_pool_user(&iter); if (unlikely(ret < 0)) return ret; /* Since we're crediting, enforce that it was all written into the pool. */ if (unlikely(ret != len)) return -EFAULT; credit_init_bits(ent_count); return 0; } case RNDZAPENTCNT: case RNDCLEARPOOL: /* No longer has any effect. */ if (!capable(CAP_SYS_ADMIN)) return -EPERM; return 0; case RNDRESEEDCRNG: if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!crng_ready()) return -ENODATA; crng_reseed(NULL); return 0; default: return -EINVAL; } } static int random_fasync(int fd, struct file *filp, int on) { return fasync_helper(fd, filp, on, &fasync); } const struct file_operations random_fops = { .read_iter = random_read_iter, .write_iter = random_write_iter, .poll = random_poll, .unlocked_ioctl = random_ioctl, .compat_ioctl = compat_ptr_ioctl, .fasync = random_fasync, .llseek = noop_llseek, .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, }; const struct file_operations urandom_fops = { .read_iter = urandom_read_iter, .write_iter = random_write_iter, .unlocked_ioctl = random_ioctl, .compat_ioctl = compat_ptr_ioctl, .fasync = random_fasync, .llseek = noop_llseek, .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, }; /******************************************************************** * * Sysctl interface. * * These are partly unused legacy knobs with dummy values to not break * userspace and partly still useful things. They are usually accessible * in /proc/sys/kernel/random/ and are as follows: * * - boot_id - a UUID representing the current boot. * * - uuid - a random UUID, different each time the file is read. * * - poolsize - the number of bits of entropy that the input pool can * hold, tied to the POOL_BITS constant. * * - entropy_avail - the number of bits of entropy currently in the * input pool. Always <= poolsize. * * - write_wakeup_threshold - the amount of entropy in the input pool * below which write polls to /dev/random will unblock, requesting * more entropy, tied to the POOL_READY_BITS constant. It is writable * to avoid breaking old userspaces, but writing to it does not * change any behavior of the RNG. * * - urandom_min_reseed_secs - fixed to the value CRNG_RESEED_INTERVAL. * It is writable to avoid breaking old userspaces, but writing * to it does not change any behavior of the RNG. * ********************************************************************/ #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> static int sysctl_random_min_urandom_seed = CRNG_RESEED_INTERVAL / HZ; static int sysctl_random_write_wakeup_bits = POOL_READY_BITS; static int sysctl_poolsize = POOL_BITS; static u8 sysctl_bootid[UUID_SIZE]; /* * This function is used to return both the bootid UUID, and random * UUID. The difference is in whether table->data is NULL; if it is, * then a new UUID is generated and returned to the user. 
*/ static int proc_do_uuid(const struct ctl_table *table, int write, void *buf, size_t *lenp, loff_t *ppos) { u8 tmp_uuid[UUID_SIZE], *uuid; char uuid_string[UUID_STRING_LEN + 1]; struct ctl_table fake_table = { .data = uuid_string, .maxlen = UUID_STRING_LEN }; if (write) return -EPERM; uuid = table->data; if (!uuid) { uuid = tmp_uuid; generate_random_uuid(uuid); } else { static DEFINE_SPINLOCK(bootid_spinlock); spin_lock(&bootid_spinlock); if (!uuid[8]) generate_random_uuid(uuid); spin_unlock(&bootid_spinlock); } snprintf(uuid_string, sizeof(uuid_string), "%pU", uuid); return proc_dostring(&fake_table, 0, buf, lenp, ppos); } /* The same as proc_dointvec, but writes don't change anything. */ static int proc_do_rointvec(const struct ctl_table *table, int write, void *buf, size_t *lenp, loff_t *ppos) { return write ? 0 : proc_dointvec(table, 0, buf, lenp, ppos); } static const struct ctl_table random_table[] = { { .procname = "poolsize", .data = &sysctl_poolsize, .maxlen = sizeof(int), .mode = 0444, .proc_handler = proc_dointvec, }, { .procname = "entropy_avail", .data = &input_pool.init_bits, .maxlen = sizeof(int), .mode = 0444, .proc_handler = proc_dointvec, }, { .procname = "write_wakeup_threshold", .data = &sysctl_random_write_wakeup_bits, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_rointvec, }, { .procname = "urandom_min_reseed_secs", .data = &sysctl_random_min_urandom_seed, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_rointvec, }, { .procname = "boot_id", .data = &sysctl_bootid, .mode = 0444, .proc_handler = proc_do_uuid, }, { .procname = "uuid", .mode = 0444, .proc_handler = proc_do_uuid, }, }; /* * random_init() is called before sysctl_init(), * so we cannot call register_sysctl_init() in random_init() */ static int __init random_sysctls_init(void) { register_sysctl_init("kernel/random", random_table); return 0; } device_initcall(random_sysctls_init); #endif |
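The bounded-integer path above (__get_random_u32_below) is the subtlest arithmetic in the file: it keeps the high 32 bits of ceil * rand and rejects only the few draws whose low half would make the mapping uneven. The following self-contained sketch restates that multiply-and-reject scheme outside the kernel so it can be compiled and experimented with in isolation. It is an illustration only: rand32() is a hypothetical, non-cryptographic stand-in for get_random_u32(), not a kernel interface.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Hypothetical stand-in for get_random_u32(). It is NOT cryptographically
 * secure and exists only so this sketch compiles on its own.
 */
static uint32_t rand32(void)
{
	return ((uint32_t)rand() << 16) ^ (uint32_t)rand();
}

/* Same multiply/reject scheme as __get_random_u32_below(). */
static uint32_t random_below(uint32_t ceil)
{
	uint64_t mult;

	/* Mirror the kernel's convention: ceil == 0 degenerates to a full-range draw. */
	if (ceil == 0)
		return rand32();

	/*
	 * Keep the high 32 bits of ceil * rand. Only when the low 32 bits fall
	 * into the "uneven" region (below 2^32 % ceil, computed as -ceil % ceil
	 * in 32-bit arithmetic) do we resample.
	 */
	mult = (uint64_t)ceil * rand32();
	if ((uint32_t)mult < ceil) {
		uint32_t bound = -ceil % ceil;

		while ((uint32_t)mult < bound)
			mult = (uint64_t)ceil * rand32();
	}
	return (uint32_t)(mult >> 32);
}

int main(void)
{
	for (int i = 0; i < 8; i++)
		printf("%u\n", random_below(6));
	return 0;
}

The initial `(uint32_t)mult < ceil` test is just the cheap pre-check used by the kernel code: since 2^32 % ceil is always smaller than ceil, a low half that is at least ceil can never need rejection, so the modulo is computed only on the rare slow path.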
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_THROTTLE_H
#define BLK_THROTTLE_H

#include "blk-cgroup-rwstat.h"

/*
 * To implement hierarchical throttling, throtl_grps form a tree and bios
 * are dispatched upwards level by level until they reach the top and get
 * issued. When dispatching bios from the children and local group at each
 * level, if the bios are dispatched into a single bio_list, there's a risk
 * of a local or child group which can queue many bios at once filling up
 * the list starving others.
 *
 * To avoid such starvation, dispatched bios are queued separately
 * according to where they came from. When they are again dispatched to
 * the parent, they're popped in round-robin order so that no single source
 * hogs the dispatch window.
 *
 * throtl_qnode is used to keep the queued bios separated by their sources.
 * Bios are queued to throtl_qnode which in turn is queued to
 * throtl_service_queue and then dispatched in round-robin order.
 *
 * It's also used to track the reference counts on blkg's. A qnode always
 * belongs to a throtl_grp and gets queued on itself or the parent, so
 * incrementing the reference of the associated throtl_grp when a qnode is
 * queued and decrementing when dequeued is enough to keep the whole blkg
 * tree pinned while bios are in flight.
 */
struct throtl_qnode {
	struct list_head node;		/* service_queue->queued[] */
	struct bio_list bios_bps;	/* queued bios for bps limit */
	struct bio_list bios_iops;	/* queued bios for iops limit */
	struct throtl_grp *tg;		/* tg this qnode belongs to */
};

struct throtl_service_queue {
	struct throtl_service_queue *parent_sq;	/* the parent service_queue */

	/*
	 * Bios queued directly to this service_queue or dispatched from
	 * children throtl_grp's.
	 */
	struct list_head queued[2];	/* throtl_qnode [READ/WRITE] */
	unsigned int nr_queued_bps[2];	/* number of queued bps bios */
	unsigned int nr_queued_iops[2];	/* number of queued iops bios */

	/*
	 * RB tree of active children throtl_grp's, which are sorted by
	 * their ->disptime.
	 */
	struct rb_root_cached pending_tree;	/* RB tree of active tgs */
	unsigned int nr_pending;		/* # queued in the tree */
	unsigned long first_pending_disptime;	/* disptime of the first tg */
	struct timer_list pending_timer;	/* fires on first_pending_disptime */
};

enum tg_state_flags {
	THROTL_TG_PENDING = 1 << 0,	/* on parent's pending tree */
	THROTL_TG_WAS_EMPTY = 1 << 1,	/* bio_lists[] became non-empty */
	/*
	 * The sq's iops queue is empty, and a bio is about to be enqueued
	 * to the first qnode's bios_iops list.
*/ THROTL_TG_IOPS_WAS_EMPTY = 1 << 2, THROTL_TG_CANCELING = 1 << 3, /* starts to cancel bio */ }; struct throtl_grp { /* must be the first member */ struct blkg_policy_data pd; /* active throtl group service_queue member */ struct rb_node rb_node; /* throtl_data this group belongs to */ struct throtl_data *td; /* this group's service queue */ struct throtl_service_queue service_queue; /* * qnode_on_self is used when bios are directly queued to this * throtl_grp so that local bios compete fairly with bios * dispatched from children. qnode_on_parent is used when bios are * dispatched from this throtl_grp into its parent and will compete * with the sibling qnode_on_parents and the parent's * qnode_on_self. */ struct throtl_qnode qnode_on_self[2]; struct throtl_qnode qnode_on_parent[2]; /* * Dispatch time in jiffies. This is the estimated time when group * will unthrottle and is ready to dispatch more bio. It is used as * key to sort active groups in service tree. */ unsigned long disptime; unsigned int flags; /* are there any throtl rules between this group and td? */ bool has_rules_bps[2]; bool has_rules_iops[2]; /* bytes per second rate limits */ uint64_t bps[2]; /* IOPS limits */ unsigned int iops[2]; /* * Number of bytes/bio's dispatched in current slice. * When new configuration is submitted while some bios are still throttled, * first calculate the carryover: the amount of bytes/IOs already waited * under the previous configuration. Then, [bytes/io]_disp are represented * as the negative of the carryover, and they will be used to calculate the * wait time under the new configuration. */ int64_t bytes_disp[2]; int io_disp[2]; unsigned long last_check_time; /* When did we start a new slice */ unsigned long slice_start[2]; unsigned long slice_end[2]; struct blkg_rwstat stat_bytes; struct blkg_rwstat stat_ios; }; extern struct blkcg_policy blkcg_policy_throtl; static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) { return pd ? container_of(pd, struct throtl_grp, pd) : NULL; } static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) { return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl)); } /* * Internal throttling interface */ #ifndef CONFIG_BLK_DEV_THROTTLING static inline void blk_throtl_exit(struct gendisk *disk) { } static inline bool blk_throtl_bio(struct bio *bio) { return false; } static inline void blk_throtl_cancel_bios(struct gendisk *disk) { } #else /* CONFIG_BLK_DEV_THROTTLING */ void blk_throtl_exit(struct gendisk *disk); bool __blk_throtl_bio(struct bio *bio); void blk_throtl_cancel_bios(struct gendisk *disk); static inline bool blk_throtl_activated(struct request_queue *q) { /* * q->td guarantees that the blk-throttle module is already loaded, * and the plid of blk-throttle is assigned. * blkcg_policy_enabled() guarantees that the policy is activated * in the request_queue. 
*/ return q->td != NULL && blkcg_policy_enabled(q, &blkcg_policy_throtl); } static inline bool blk_should_throtl(struct bio *bio) { struct throtl_grp *tg; int rw = bio_data_dir(bio); if (!blk_throtl_activated(bio->bi_bdev->bd_queue)) return false; tg = blkg_to_tg(bio->bi_blkg); if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { bio_set_flag(bio, BIO_CGROUP_ACCT); blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf, bio->bi_iter.bi_size); } blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1); } /* iops limit is always counted */ if (tg->has_rules_iops[rw]) return true; if (tg->has_rules_bps[rw] && !bio_flagged(bio, BIO_BPS_THROTTLED)) return true; return false; } static inline bool blk_throtl_bio(struct bio *bio) { /* * block throttling takes effect if the policy is activated * in the bio's request_queue. */ if (!blk_should_throtl(bio)) return false; return __blk_throtl_bio(bio); } #endif /* CONFIG_BLK_DEV_THROTTLING */ #endif |
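The throtl_qnode comment at the top of this header explains why dispatched bios are queued per source and popped in round-robin order instead of being funnelled into a single bio_list. The standalone sketch below is not kernel code and every name in it is made up; it only demonstrates the fairness property: one queue per source, at most one pop per source per round, so a greedy source cannot starve its siblings.

#include <stdio.h>

#define NSRC 3
#define QLEN 8

/* Hypothetical per-source FIFO standing in for a throtl_qnode. */
struct src_queue {
        int items[QLEN];
        int head, tail;                 /* pop at head, push at tail */
};

static void sq_push(struct src_queue *q, int v)
{
        if (q->tail < QLEN)
                q->items[q->tail++] = v;
}

static int sq_pop(struct src_queue *q, int *out)
{
        if (q->head == q->tail)
                return 0;
        *out = q->items[q->head++];
        return 1;
}

int main(void)
{
        struct src_queue src[NSRC] = { { { 0 } } };
        int i, v, served = 1;

        /* Source 0 is greedy; sources 1 and 2 queue one item each. */
        for (i = 0; i < 6; i++)
                sq_push(&src[0], 100 + i);
        sq_push(&src[1], 200);
        sq_push(&src[2], 300);

        /* Round-robin dispatch: 200 and 300 are served in the first round,
         * before the greedy source has drained. */
        while (served) {
                served = 0;
                for (i = 0; i < NSRC; i++) {
                        if (sq_pop(&src[i], &v)) {
                                printf("dispatched %d from source %d\n", v, i);
                                served = 1;
                        }
                }
        }
        return 0;
}

blk-throttle applies the same idea per throtl_grp via qnode_on_self/qnode_on_parent, and additionally uses the qnodes to keep the blkg tree pinned while bios are queued.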
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the IP router. * * Version: @(#)route.h 1.0.4 05/27/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Fixes: * Alan Cox : Reformatted. Added ip_rt_local() * Alan Cox : Support for TCP parameters. * Alexey Kuznetsov: Major changes for new routing code.
* Mike McLagan : Routing by source * Robert Olsson : Added rt_cache statistics */ #ifndef _ROUTE_H #define _ROUTE_H #include <net/dst.h> #include <net/inetpeer.h> #include <net/flow.h> #include <net/inet_sock.h> #include <net/ip_fib.h> #include <net/arp.h> #include <net/ndisc.h> #include <net/inet_dscp.h> #include <net/sock.h> #include <linux/in_route.h> #include <linux/rtnetlink.h> #include <linux/rcupdate.h> #include <linux/route.h> #include <linux/ip.h> #include <linux/cache.h> #include <linux/security.h> static inline __u8 ip_sock_rt_scope(const struct sock *sk) { if (sock_flag(sk, SOCK_LOCALROUTE)) return RT_SCOPE_LINK; return RT_SCOPE_UNIVERSE; } static inline __u8 ip_sock_rt_tos(const struct sock *sk) { return READ_ONCE(inet_sk(sk)->tos) & INET_DSCP_MASK; } struct ip_tunnel_info; struct fib_nh; struct fib_info; struct uncached_list; struct rtable { struct dst_entry dst; int rt_genid; unsigned int rt_flags; __u16 rt_type; __u8 rt_is_input; __u8 rt_uses_gateway; int rt_iif; u8 rt_gw_family; /* Info on neighbour */ union { __be32 rt_gw4; struct in6_addr rt_gw6; }; /* Miscellaneous cached information */ u32 rt_mtu_locked:1, rt_pmtu:31; }; #define dst_rtable(_ptr) container_of_const(_ptr, struct rtable, dst) /** * skb_rtable - Returns the skb &rtable * @skb: buffer */ static inline struct rtable *skb_rtable(const struct sk_buff *skb) { return dst_rtable(skb_dst(skb)); } static inline bool rt_is_input_route(const struct rtable *rt) { return rt->rt_is_input != 0; } static inline bool rt_is_output_route(const struct rtable *rt) { return rt->rt_is_input == 0; } static inline __be32 rt_nexthop(const struct rtable *rt, __be32 daddr) { if (rt->rt_gw_family == AF_INET) return rt->rt_gw4; return daddr; } struct ip_rt_acct { __u32 o_bytes; __u32 o_packets; __u32 i_bytes; __u32 i_packets; }; struct rt_cache_stat { unsigned int in_slow_tot; unsigned int in_slow_mc; unsigned int in_no_route; unsigned int in_brd; unsigned int in_martian_dst; unsigned int in_martian_src; unsigned int out_slow_tot; unsigned int out_slow_mc; }; extern struct ip_rt_acct __percpu *ip_rt_acct; struct in_device; int ip_rt_init(void); void rt_cache_flush(struct net *net); void rt_flush_dev(struct net_device *dev); static inline void inet_sk_init_flowi4(const struct inet_sock *inet, struct flowi4 *fl4) { const struct ip_options_rcu *ip4_opt; const struct sock *sk; __be32 daddr; rcu_read_lock(); ip4_opt = rcu_dereference(inet->inet_opt); /* Source routing option overrides the socket destination address */ if (ip4_opt && ip4_opt->opt.srr) daddr = ip4_opt->opt.faddr; else daddr = inet->inet_daddr; rcu_read_unlock(); sk = &inet->sk; flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), sk->sk_protocol, inet_sk_flowi_flags(sk), daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, sk_uid(sk)); security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); } struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *flp, const struct sk_buff *skb); struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *flp, struct fib_result *res, const struct sk_buff *skb); static inline struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp) { return ip_route_output_key_hash(net, flp, NULL); } struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp, const struct sock *sk); struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig); static inline struct rtable *ip_route_output_key(struct net *net, 
struct flowi4 *flp) { return ip_route_output_flow(net, flp, NULL); } /* Simplistic IPv4 route lookup function. * This is only suitable for some particular use cases: since the flowi4 * structure is only partially set, it may bypass some fib-rules. */ static inline struct rtable *ip_route_output(struct net *net, __be32 daddr, __be32 saddr, dscp_t dscp, int oif, __u8 scope) { struct flowi4 fl4 = { .flowi4_oif = oif, .flowi4_dscp = dscp, .flowi4_scope = scope, .daddr = daddr, .saddr = saddr, }; return ip_route_output_key(net, &fl4); } static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi4 *fl4, const struct sock *sk, __be32 daddr, __be32 saddr, __be16 dport, __be16 sport, __u8 proto, __u8 tos, int oif) { flowi4_init_output(fl4, oif, sk ? READ_ONCE(sk->sk_mark) : 0, tos, sk ? ip_sock_rt_scope(sk) : RT_SCOPE_UNIVERSE, proto, sk ? inet_sk_flowi_flags(sk) : 0, daddr, saddr, dport, sport, sock_net_uid(net, sk)); if (sk) security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); return ip_route_output_flow(net, fl4, sk); } enum skb_drop_reason ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, struct in_device *in_dev, u32 *itag); enum skb_drop_reason ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev); enum skb_drop_reason ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, const struct sk_buff *hint); static inline enum skb_drop_reason ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, dscp_t dscp, struct net_device *devin) { enum skb_drop_reason reason; rcu_read_lock(); reason = ip_route_input_noref(skb, dst, src, dscp, devin); if (!reason) { skb_dst_force(skb); if (!skb_dst(skb)) reason = SKB_DROP_REASON_NOT_SPECIFIED; } rcu_read_unlock(); return reason; } void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, u8 protocol); void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu); void ipv4_redirect(struct sk_buff *skb, struct net *net, int oif, u8 protocol); void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk); void ip_rt_send_redirect(struct sk_buff *skb); unsigned int inet_addr_type(struct net *net, __be32 addr); unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr); unsigned int inet_addr_type_dev_table(struct net *net, const struct net_device *dev, __be32 addr); void ip_rt_multicast_event(struct in_device *); int ip_rt_ioctl(struct net *, unsigned int cmd, struct rtentry *rt); void ip_rt_get_source(u8 *src, struct sk_buff *skb, struct rtable *rt); struct rtable *rt_dst_alloc(struct net_device *dev, unsigned int flags, u16 type, bool noxfrm); struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt); struct in_ifaddr; void fib_add_ifaddr(struct in_ifaddr *); void fib_del_ifaddr(struct in_ifaddr *, struct in_ifaddr *); void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric); void rt_add_uncached_list(struct rtable *rt); void rt_del_uncached_list(struct rtable *rt); int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb, u32 table_id, struct fib_info *fi, int *fa_index, int fa_start, unsigned int flags); static inline void ip_rt_put(struct rtable *rt) { /* dst_release() accepts a NULL parameter. 
* We rely on dst being first structure in struct rtable */ BUILD_BUG_ON(offsetof(struct rtable, dst) != 0); dst_release(&rt->dst); } extern const __u8 ip_tos2prio[16]; static inline char rt_tos2priority(u8 tos) { return ip_tos2prio[IPTOS_TOS(tos)>>1]; } /* ip_route_connect() and ip_route_newports() work in tandem whilst * binding a socket for a new outgoing connection. * * In order to use IPSEC properly, we must, in the end, have a * route that was looked up using all available keys including source * and destination ports. * * However, if a source port needs to be allocated (the user specified * a wildcard source port) we need to obtain addressing information * in order to perform that allocation. * * So ip_route_connect() looks up a route using wildcarded source and * destination ports in the key, simply so that we can get a pair of * addresses to use for port allocation. * * Later, once the ports are allocated, ip_route_newports() will make * another route lookup if needed to make sure we catch any IPSEC * rules keyed on the port information. * * The callers allocate the flow key on their stack, and must pass in * the same flowi4 object to both the ip_route_connect() and the * ip_route_newports() calls. */ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32 src, int oif, u8 protocol, __be16 sport, __be16 dport, const struct sock *sk) { __u8 flow_flags = 0; if (inet_test_bit(TRANSPARENT, sk)) flow_flags |= FLOWI_FLAG_ANYSRC; if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !sport) flow_flags |= FLOWI_FLAG_ANY_SPORT; flowi4_init_output(fl4, oif, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), protocol, flow_flags, dst, src, dport, sport, sk_uid(sk)); } static inline struct rtable *ip_route_connect(struct flowi4 *fl4, __be32 dst, __be32 src, int oif, u8 protocol, __be16 sport, __be16 dport, const struct sock *sk) { struct net *net = sock_net(sk); struct rtable *rt; ip_route_connect_init(fl4, dst, src, oif, protocol, sport, dport, sk); if (!dst || !src) { rt = __ip_route_output_key(net, fl4); if (IS_ERR(rt)) return rt; ip_rt_put(rt); flowi4_update_output(fl4, oif, fl4->daddr, fl4->saddr); } security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); return ip_route_output_flow(net, fl4, sk); } static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable *rt, __be16 orig_sport, __be16 orig_dport, __be16 sport, __be16 dport, const struct sock *sk) { if (sport != orig_sport || dport != orig_dport) { fl4->fl4_dport = dport; fl4->fl4_sport = sport; ip_rt_put(rt); flowi4_update_output(fl4, sk->sk_bound_dev_if, fl4->daddr, fl4->saddr); security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); return ip_route_output_flow(sock_net(sk), fl4, sk); } return rt; } static inline int inet_iif(const struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); if (rt && rt->rt_iif) return rt->rt_iif; return skb->skb_iif; } static inline int ip4_dst_hoplimit(const struct dst_entry *dst) { int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); if (hoplimit == 0) { const struct net *net; rcu_read_lock(); net = dst_dev_net_rcu(dst); hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); rcu_read_unlock(); } return hoplimit; } static inline struct neighbour *ip_neigh_gw4(struct net_device *dev, __be32 daddr) { struct neighbour *neigh; neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)daddr); if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &daddr, dev, false); return neigh; } static inline struct neighbour *ip_neigh_for_gw(struct rtable *rt, 
struct sk_buff *skb, bool *is_v6gw) { struct net_device *dev = rt->dst.dev; struct neighbour *neigh; if (likely(rt->rt_gw_family == AF_INET)) { neigh = ip_neigh_gw4(dev, rt->rt_gw4); } else if (rt->rt_gw_family == AF_INET6) { neigh = ip_neigh_gw6(dev, &rt->rt_gw6); *is_v6gw = true; } else { neigh = ip_neigh_gw4(dev, ip_hdr(skb)->daddr); } return neigh; } #endif /* _ROUTE_H */ |
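The long comment before ip_route_connect_init() describes the two-step lookup used when binding an outgoing connection: first a lookup with possibly wildcarded ports just to obtain an address pair for port allocation, then ip_route_newports() with the final ports so IPsec rules keyed on ports are caught. The sketch below only illustrates that calling pattern; my_proto_connect() and the elided port-allocation step are hypothetical, the route.h helpers are the real ones declared above, and the snippet is not buildable on its own.

/* Hypothetical connect path showing the ip_route_connect()/ip_route_newports()
 * pairing; real users (e.g. the TCP and DCCP IPv4 connect paths) follow the
 * same shape, with the flowi4 kept on the caller's stack across both calls. */
static int my_proto_connect(struct sock *sk, __be32 daddr, __be16 dport)
{
        struct inet_sock *inet = inet_sk(sk);
        __be16 orig_sport = inet->inet_sport;   /* may still be 0 (wildcard) */
        struct flowi4 fl4;
        struct rtable *rt;

        /* Step 1: wildcarded ports are fine here; we only need a usable
         * source/destination address pair for port allocation. */
        rt = ip_route_connect(&fl4, daddr, inet->inet_saddr,
                              sk->sk_bound_dev_if, sk->sk_protocol,
                              orig_sport, dport, sk);
        if (IS_ERR(rt))
                return PTR_ERR(rt);

        /* ... allocate a local port here if orig_sport was a wildcard ... */

        /* Step 2: redo the lookup with the final ports, reusing the same
         * on-stack flowi4, so port-keyed IPsec policies are honoured. */
        rt = ip_route_newports(&fl4, rt, orig_sport, dport,
                               inet->inet_sport, dport, sk);
        if (IS_ERR(rt))
                return PTR_ERR(rt);

        ip_rt_put(rt);          /* a real protocol would attach rt to the sock */
        return 0;
}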
| 19 1 18 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_ATOMIC64_64_H #define _ASM_X86_ATOMIC64_64_H #include <linux/types.h> #include <asm/alternative.h> #include <asm/cmpxchg.h> /* The 64-bit atomic type */ #define ATOMIC64_INIT(i) { (i) } static __always_inline s64 arch_atomic64_read(const atomic64_t *v) { return __READ_ONCE((v)->counter); } static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i) { __WRITE_ONCE(v->counter, i); } static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v) { asm_inline volatile(LOCK_PREFIX "addq %1, %0" : "=m" (v->counter) : "er" (i), "m" (v->counter) : "memory"); } static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v) { asm_inline volatile(LOCK_PREFIX "subq %1, %0" : "=m" (v->counter) : "er" (i), "m" (v->counter) : "memory"); } static __always_inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, e, "er", i); } #define arch_atomic64_sub_and_test arch_atomic64_sub_and_test static __always_inline void arch_atomic64_inc(atomic64_t *v) { asm_inline volatile(LOCK_PREFIX "incq %0" : "=m" (v->counter) : "m" (v->counter) : "memory"); } #define arch_atomic64_inc arch_atomic64_inc static __always_inline void arch_atomic64_dec(atomic64_t *v) { asm_inline volatile(LOCK_PREFIX "decq %0" : "=m" (v->counter) : "m" (v->counter) : "memory"); } #define arch_atomic64_dec arch_atomic64_dec static __always_inline bool arch_atomic64_dec_and_test(atomic64_t *v) { return GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, e); } #define arch_atomic64_dec_and_test arch_atomic64_dec_and_test static __always_inline bool arch_atomic64_inc_and_test(atomic64_t *v) { return GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, e); } #define arch_atomic64_inc_and_test arch_atomic64_inc_and_test static __always_inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, s, "er", i); } #define arch_atomic64_add_negative arch_atomic64_add_negative static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) { return i + xadd(&v->counter, i); } #define arch_atomic64_add_return arch_atomic64_add_return #define arch_atomic64_sub_return(i, v) arch_atomic64_add_return(-(i), v) static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { return xadd(&v->counter, i); } #define arch_atomic64_fetch_add arch_atomic64_fetch_add #define arch_atomic64_fetch_sub(i, v) arch_atomic64_fetch_add(-(i), v) static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { return arch_cmpxchg(&v->counter, old, new); } #define arch_atomic64_cmpxchg arch_atomic64_cmpxchg static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { return arch_try_cmpxchg(&v->counter, old, new); } #define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new) { return arch_xchg(&v->counter, 
new); } #define arch_atomic64_xchg arch_atomic64_xchg static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v) { asm_inline volatile(LOCK_PREFIX "andq %1, %0" : "+m" (v->counter) : "er" (i) : "memory"); } static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) { s64 val = arch_atomic64_read(v); do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i)); return val; } #define arch_atomic64_fetch_and arch_atomic64_fetch_and static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v) { asm_inline volatile(LOCK_PREFIX "orq %1, %0" : "+m" (v->counter) : "er" (i) : "memory"); } static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) { s64 val = arch_atomic64_read(v); do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i)); return val; } #define arch_atomic64_fetch_or arch_atomic64_fetch_or static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v) { asm_inline volatile(LOCK_PREFIX "xorq %1, %0" : "+m" (v->counter) : "er" (i) : "memory"); } static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v) { s64 val = arch_atomic64_read(v); do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i)); return val; } #define arch_atomic64_fetch_xor arch_atomic64_fetch_xor #endif /* _ASM_X86_ATOMIC64_64_H */ |
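arch_atomic64_fetch_and/or/xor above follow the standard fetch-op pattern: read the counter once, then loop on try_cmpxchg() until the update from the observed old value to (old OP i) wins, with each failed compare refreshing the observed value. Below is a standalone userspace sketch of the same loop using the GCC/Clang __atomic builtins instead of the kernel's LOCK-prefixed instructions; the builtins are an assumption about the toolchain, not something this header provides.

#include <stdint.h>
#include <stdio.h>

/* fetch_or: returns the value the variable held before the OR was applied. */
static int64_t fetch_or64(int64_t *v, int64_t i)
{
        int64_t old = __atomic_load_n(v, __ATOMIC_RELAXED);

        /* On failure the builtin stores the freshly observed value back into
         * 'old', so the next iteration retries against the current contents,
         * which is the same contract as arch_atomic64_try_cmpxchg(). */
        while (!__atomic_compare_exchange_n(v, &old, old | i, 0,
                                            __ATOMIC_SEQ_CST, __ATOMIC_RELAXED))
                ;
        return old;
}

int main(void)
{
        int64_t v = 0x0f;
        int64_t old = fetch_or64(&v, 0xf0);

        printf("old=%#llx new=%#llx\n",
               (unsigned long long)old, (unsigned long long)v);
        return 0;
}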
// SPDX-License-Identifier: GPL-2.0-only /* * The input core * * Copyright (c) 1999-2002 Vojtech Pavlik */ #define pr_fmt(fmt) KBUILD_BASENAME ": " fmt #include <linux/export.h> #include <linux/init.h> #include <linux/types.h> #include <linux/idr.h> #include <linux/input/mt.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/random.h> #include <linux/major.h> #include <linux/proc_fs.h> #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/pm.h> #include <linux/poll.h> #include <linux/device.h> #include <linux/kstrtox.h> #include <linux/mutex.h> #include <linux/rcupdate.h> #include "input-compat.h" #include "input-core-private.h" #include "input-poller.h" MODULE_AUTHOR("Vojtech Pavlik <vojtech@suse.cz>"); MODULE_DESCRIPTION("Input core"); MODULE_LICENSE("GPL"); #define INPUT_MAX_CHAR_DEVICES 1024 #define INPUT_FIRST_DYNAMIC_DEV 256 static DEFINE_IDA(input_ida); static LIST_HEAD(input_dev_list); static LIST_HEAD(input_handler_list); /* * input_mutex protects access to both input_dev_list and input_handler_list. * This also causes input_[un]register_device and input_[un]register_handler * be mutually exclusive which simplifies locking in drivers implementing * input handlers.
*/ static DEFINE_MUTEX(input_mutex); static const struct input_value input_value_sync = { EV_SYN, SYN_REPORT, 1 }; static const unsigned int input_max_code[EV_CNT] = { [EV_KEY] = KEY_MAX, [EV_REL] = REL_MAX, [EV_ABS] = ABS_MAX, [EV_MSC] = MSC_MAX, [EV_SW] = SW_MAX, [EV_LED] = LED_MAX, [EV_SND] = SND_MAX, [EV_FF] = FF_MAX, }; static inline int is_event_supported(unsigned int code, unsigned long *bm, unsigned int max) { return code <= max && test_bit(code, bm); } static int input_defuzz_abs_event(int value, int old_val, int fuzz) { if (fuzz) { if (value > old_val - fuzz / 2 && value < old_val + fuzz / 2) return old_val; if (value > old_val - fuzz && value < old_val + fuzz) return (old_val * 3 + value) / 4; if (value > old_val - fuzz * 2 && value < old_val + fuzz * 2) return (old_val + value) / 2; } return value; } static void input_start_autorepeat(struct input_dev *dev, int code) { if (test_bit(EV_REP, dev->evbit) && dev->rep[REP_PERIOD] && dev->rep[REP_DELAY] && dev->timer.function) { dev->repeat_key = code; mod_timer(&dev->timer, jiffies + msecs_to_jiffies(dev->rep[REP_DELAY])); } } static void input_stop_autorepeat(struct input_dev *dev) { timer_delete(&dev->timer); } /* * Pass values first through all filters and then, if event has not been * filtered out, through all open handles. This order is achieved by placing * filters at the head of the list of handles attached to the device, and * placing regular handles at the tail of the list. * * This function is called with dev->event_lock held and interrupts disabled. */ static void input_pass_values(struct input_dev *dev, struct input_value *vals, unsigned int count) { struct input_handle *handle; struct input_value *v; lockdep_assert_held(&dev->event_lock); scoped_guard(rcu) { handle = rcu_dereference(dev->grab); if (handle) { count = handle->handle_events(handle, vals, count); break; } list_for_each_entry_rcu(handle, &dev->h_list, d_node) { if (handle->open) { count = handle->handle_events(handle, vals, count); if (!count) break; } } } /* trigger auto repeat for key events */ if (test_bit(EV_REP, dev->evbit) && test_bit(EV_KEY, dev->evbit)) { for (v = vals; v != vals + count; v++) { if (v->type == EV_KEY && v->value != 2) { if (v->value) input_start_autorepeat(dev, v->code); else input_stop_autorepeat(dev); } } } } #define INPUT_IGNORE_EVENT 0 #define INPUT_PASS_TO_HANDLERS 1 #define INPUT_PASS_TO_DEVICE 2 #define INPUT_SLOT 4 #define INPUT_FLUSH 8 #define INPUT_PASS_TO_ALL (INPUT_PASS_TO_HANDLERS | INPUT_PASS_TO_DEVICE) static int input_handle_abs_event(struct input_dev *dev, unsigned int code, int *pval) { struct input_mt *mt = dev->mt; bool is_new_slot = false; bool is_mt_event; int *pold; if (code == ABS_MT_SLOT) { /* * "Stage" the event; we'll flush it later, when we * get actual touch data. */ if (mt && *pval >= 0 && *pval < mt->num_slots) mt->slot = *pval; return INPUT_IGNORE_EVENT; } is_mt_event = input_is_mt_value(code); if (!is_mt_event) { pold = &dev->absinfo[code].value; } else if (mt) { pold = &mt->slots[mt->slot].abs[code - ABS_MT_FIRST]; is_new_slot = mt->slot != dev->absinfo[ABS_MT_SLOT].value; } else { /* * Bypass filtering for multi-touch events when * not employing slots. 
*/ pold = NULL; } if (pold) { *pval = input_defuzz_abs_event(*pval, *pold, dev->absinfo[code].fuzz); if (*pold == *pval) return INPUT_IGNORE_EVENT; *pold = *pval; } /* Flush pending "slot" event */ if (is_new_slot) { dev->absinfo[ABS_MT_SLOT].value = mt->slot; return INPUT_PASS_TO_HANDLERS | INPUT_SLOT; } return INPUT_PASS_TO_HANDLERS; } static int input_get_disposition(struct input_dev *dev, unsigned int type, unsigned int code, int *pval) { int disposition = INPUT_IGNORE_EVENT; int value = *pval; /* filter-out events from inhibited devices */ if (dev->inhibited) return INPUT_IGNORE_EVENT; switch (type) { case EV_SYN: switch (code) { case SYN_CONFIG: disposition = INPUT_PASS_TO_ALL; break; case SYN_REPORT: disposition = INPUT_PASS_TO_HANDLERS | INPUT_FLUSH; break; case SYN_MT_REPORT: disposition = INPUT_PASS_TO_HANDLERS; break; } break; case EV_KEY: if (is_event_supported(code, dev->keybit, KEY_MAX)) { /* auto-repeat bypasses state updates */ if (value == 2) { disposition = INPUT_PASS_TO_HANDLERS; break; } if (!!test_bit(code, dev->key) != !!value) { __change_bit(code, dev->key); disposition = INPUT_PASS_TO_HANDLERS; } } break; case EV_SW: if (is_event_supported(code, dev->swbit, SW_MAX) && !!test_bit(code, dev->sw) != !!value) { __change_bit(code, dev->sw); disposition = INPUT_PASS_TO_HANDLERS; } break; case EV_ABS: if (is_event_supported(code, dev->absbit, ABS_MAX)) disposition = input_handle_abs_event(dev, code, &value); break; case EV_REL: if (is_event_supported(code, dev->relbit, REL_MAX) && value) disposition = INPUT_PASS_TO_HANDLERS; break; case EV_MSC: if (is_event_supported(code, dev->mscbit, MSC_MAX)) disposition = INPUT_PASS_TO_ALL; break; case EV_LED: if (is_event_supported(code, dev->ledbit, LED_MAX) && !!test_bit(code, dev->led) != !!value) { __change_bit(code, dev->led); disposition = INPUT_PASS_TO_ALL; } break; case EV_SND: if (is_event_supported(code, dev->sndbit, SND_MAX)) { if (!!test_bit(code, dev->snd) != !!value) __change_bit(code, dev->snd); disposition = INPUT_PASS_TO_ALL; } break; case EV_REP: if (code <= REP_MAX && value >= 0 && dev->rep[code] != value) { dev->rep[code] = value; disposition = INPUT_PASS_TO_ALL; } break; case EV_FF: if (value >= 0) disposition = INPUT_PASS_TO_ALL; break; case EV_PWR: disposition = INPUT_PASS_TO_ALL; break; } *pval = value; return disposition; } static void input_event_dispose(struct input_dev *dev, int disposition, unsigned int type, unsigned int code, int value) { if ((disposition & INPUT_PASS_TO_DEVICE) && dev->event) dev->event(dev, type, code, value); if (disposition & INPUT_PASS_TO_HANDLERS) { struct input_value *v; if (disposition & INPUT_SLOT) { v = &dev->vals[dev->num_vals++]; v->type = EV_ABS; v->code = ABS_MT_SLOT; v->value = dev->mt->slot; } v = &dev->vals[dev->num_vals++]; v->type = type; v->code = code; v->value = value; } if (disposition & INPUT_FLUSH) { if (dev->num_vals >= 2) input_pass_values(dev, dev->vals, dev->num_vals); dev->num_vals = 0; /* * Reset the timestamp on flush so we won't end up * with a stale one. Note we only need to reset the * monolithic one as we use its presence when deciding * whether to generate a synthetic timestamp. 
*/ dev->timestamp[INPUT_CLK_MONO] = ktime_set(0, 0); } else if (dev->num_vals >= dev->max_vals - 2) { dev->vals[dev->num_vals++] = input_value_sync; input_pass_values(dev, dev->vals, dev->num_vals); dev->num_vals = 0; } } void input_handle_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) { int disposition; lockdep_assert_held(&dev->event_lock); disposition = input_get_disposition(dev, type, code, &value); if (disposition != INPUT_IGNORE_EVENT) { if (type != EV_SYN) add_input_randomness(type, code, value); input_event_dispose(dev, disposition, type, code, value); } } /** * input_event() - report new input event * @dev: device that generated the event * @type: type of the event * @code: event code * @value: value of the event * * This function should be used by drivers implementing various input * devices to report input events. See also input_inject_event(). * * NOTE: input_event() may be safely used right after input device was * allocated with input_allocate_device(), even before it is registered * with input_register_device(), but the event will not reach any of the * input handlers. Such early invocation of input_event() may be used * to 'seed' initial state of a switch or initial position of absolute * axis, etc. */ void input_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) { if (is_event_supported(type, dev->evbit, EV_MAX)) { guard(spinlock_irqsave)(&dev->event_lock); input_handle_event(dev, type, code, value); } } EXPORT_SYMBOL(input_event); /** * input_inject_event() - send input event from input handler * @handle: input handle to send event through * @type: type of the event * @code: event code * @value: value of the event * * Similar to input_event() but will ignore event if device is * "grabbed" and handle injecting event is not the one that owns * the device. */ void input_inject_event(struct input_handle *handle, unsigned int type, unsigned int code, int value) { struct input_dev *dev = handle->dev; struct input_handle *grab; if (is_event_supported(type, dev->evbit, EV_MAX)) { guard(spinlock_irqsave)(&dev->event_lock); guard(rcu)(); grab = rcu_dereference(dev->grab); if (!grab || grab == handle) input_handle_event(dev, type, code, value); } } EXPORT_SYMBOL(input_inject_event); /** * input_alloc_absinfo - allocates array of input_absinfo structs * @dev: the input device emitting absolute events * * If the absinfo struct the caller asked for is already allocated, this * functions will not do anything. */ void input_alloc_absinfo(struct input_dev *dev) { if (dev->absinfo) return; dev->absinfo = kzalloc_objs(*dev->absinfo, ABS_CNT); if (!dev->absinfo) { dev_err(dev->dev.parent ?: &dev->dev, "%s: unable to allocate memory\n", __func__); /* * We will handle this allocation failure in * input_register_device() when we refuse to register input * device with ABS bits but without absinfo. 
*/ } } EXPORT_SYMBOL(input_alloc_absinfo); void input_set_abs_params(struct input_dev *dev, unsigned int axis, int min, int max, int fuzz, int flat) { struct input_absinfo *absinfo; __set_bit(EV_ABS, dev->evbit); __set_bit(axis, dev->absbit); input_alloc_absinfo(dev); if (!dev->absinfo) return; absinfo = &dev->absinfo[axis]; absinfo->minimum = min; absinfo->maximum = max; absinfo->fuzz = fuzz; absinfo->flat = flat; } EXPORT_SYMBOL(input_set_abs_params); /** * input_copy_abs - Copy absinfo from one input_dev to another * @dst: Destination input device to copy the abs settings to * @dst_axis: ABS_* value selecting the destination axis * @src: Source input device to copy the abs settings from * @src_axis: ABS_* value selecting the source axis * * Set absinfo for the selected destination axis by copying it from * the specified source input device's source axis. * This is useful to e.g. setup a pen/stylus input-device for combined * touchscreen/pen hardware where the pen uses the same coordinates as * the touchscreen. */ void input_copy_abs(struct input_dev *dst, unsigned int dst_axis, const struct input_dev *src, unsigned int src_axis) { /* src must have EV_ABS and src_axis set */ if (WARN_ON(!(test_bit(EV_ABS, src->evbit) && test_bit(src_axis, src->absbit)))) return; /* * input_alloc_absinfo() may have failed for the source. Our caller is * expected to catch this when registering the input devices, which may * happen after the input_copy_abs() call. */ if (!src->absinfo) return; input_set_capability(dst, EV_ABS, dst_axis); if (!dst->absinfo) return; dst->absinfo[dst_axis] = src->absinfo[src_axis]; } EXPORT_SYMBOL(input_copy_abs); /** * input_grab_device - grabs device for exclusive use * @handle: input handle that wants to own the device * * When a device is grabbed by an input handle all events generated by * the device are delivered only to this handle. Also events injected * by other input handles are ignored while device is grabbed. */ int input_grab_device(struct input_handle *handle) { struct input_dev *dev = handle->dev; scoped_cond_guard(mutex_intr, return -EINTR, &dev->mutex) { if (dev->grab) return -EBUSY; rcu_assign_pointer(dev->grab, handle); } return 0; } EXPORT_SYMBOL(input_grab_device); static void __input_release_device(struct input_handle *handle) { struct input_dev *dev = handle->dev; struct input_handle *grabber; grabber = rcu_dereference_protected(dev->grab, lockdep_is_held(&dev->mutex)); if (grabber == handle) { rcu_assign_pointer(dev->grab, NULL); /* Make sure input_pass_values() notices that grab is gone */ synchronize_rcu(); list_for_each_entry(handle, &dev->h_list, d_node) if (handle->open && handle->handler->start) handle->handler->start(handle); } } /** * input_release_device - release previously grabbed device * @handle: input handle that owns the device * * Releases previously grabbed device so that other input handles can * start receiving input events. Upon release all handlers attached * to the device have their start() method called so they have a change * to synchronize device state with the rest of the system. */ void input_release_device(struct input_handle *handle) { struct input_dev *dev = handle->dev; guard(mutex)(&dev->mutex); __input_release_device(handle); } EXPORT_SYMBOL(input_release_device); /** * input_open_device - open input device * @handle: handle through which device is being accessed * * This function should be called by input handlers when they * want to start receive events from given input device. 
*/ int input_open_device(struct input_handle *handle) { struct input_dev *dev = handle->dev; int error; scoped_cond_guard(mutex_intr, return -EINTR, &dev->mutex) { if (dev->going_away) return -ENODEV; handle->open++; if (handle->handler->passive_observer) return 0; if (dev->users++ || dev->inhibited) { /* * Device is already opened and/or inhibited, * so we can exit immediately and report success. */ return 0; } if (dev->open) { error = dev->open(dev); if (error) { dev->users--; handle->open--; /* * Make sure we are not delivering any more * events through this handle. */ synchronize_rcu(); return error; } } if (dev->poller) input_dev_poller_start(dev->poller); } return 0; } EXPORT_SYMBOL(input_open_device); int input_flush_device(struct input_handle *handle, struct file *file) { struct input_dev *dev = handle->dev; scoped_cond_guard(mutex_intr, return -EINTR, &dev->mutex) { if (dev->flush) return dev->flush(dev, file); } return 0; } EXPORT_SYMBOL(input_flush_device); /** * input_close_device - close input device * @handle: handle through which device is being accessed * * This function should be called by input handlers when they * want to stop receive events from given input device. */ void input_close_device(struct input_handle *handle) { struct input_dev *dev = handle->dev; guard(mutex)(&dev->mutex); __input_release_device(handle); if (!handle->handler->passive_observer) { if (!--dev->users && !dev->inhibited) { if (dev->poller) input_dev_poller_stop(dev->poller); if (dev->close) dev->close(dev); } } if (!--handle->open) { /* * synchronize_rcu() makes sure that input_pass_values() * completed and that no more input events are delivered * through this handle */ synchronize_rcu(); } } EXPORT_SYMBOL(input_close_device); /* * Simulate keyup events for all keys that are marked as pressed. * The function must be called with dev->event_lock held. */ static bool input_dev_release_keys(struct input_dev *dev) { bool need_sync = false; int code; lockdep_assert_held(&dev->event_lock); if (is_event_supported(EV_KEY, dev->evbit, EV_MAX)) { for_each_set_bit(code, dev->key, KEY_CNT) { input_handle_event(dev, EV_KEY, code, 0); need_sync = true; } } return need_sync; } /* * Prepare device for unregistering */ static void input_disconnect_device(struct input_dev *dev) { struct input_handle *handle; /* * Mark device as going away. Note that we take dev->mutex here * not to protect access to dev->going_away but rather to ensure * that there are no threads in the middle of input_open_device() */ scoped_guard(mutex, &dev->mutex) dev->going_away = true; guard(spinlock_irq)(&dev->event_lock); /* * Simulate keyup events for all pressed keys so that handlers * are not left with "stuck" keys. The driver may continue * generate events even after we done here but they will not * reach any handlers. */ if (input_dev_release_keys(dev)) input_handle_event(dev, EV_SYN, SYN_REPORT, 1); list_for_each_entry(handle, &dev->h_list, d_node) handle->open = 0; } /** * input_scancode_to_scalar() - converts scancode in &struct input_keymap_entry * @ke: keymap entry containing scancode to be converted. * @scancode: pointer to the location where converted scancode should * be stored. * * This function is used to convert scancode stored in &struct keymap_entry * into scalar form understood by legacy keymap handling methods. These * methods expect scancodes to be represented as 'unsigned int'. 
*/ int input_scancode_to_scalar(const struct input_keymap_entry *ke, unsigned int *scancode) { switch (ke->len) { case 1: *scancode = *((u8 *)ke->scancode); break; case 2: *scancode = *((u16 *)ke->scancode); break; case 4: *scancode = *((u32 *)ke->scancode); break; default: return -EINVAL; } return 0; } EXPORT_SYMBOL(input_scancode_to_scalar); /* * Those routines handle the default case where no [gs]etkeycode() is * defined. In this case, an array indexed by the scancode is used. */ static unsigned int input_fetch_keycode(struct input_dev *dev, unsigned int index) { switch (dev->keycodesize) { case 1: return ((u8 *)dev->keycode)[index]; case 2: return ((u16 *)dev->keycode)[index]; default: return ((u32 *)dev->keycode)[index]; } } static int input_default_getkeycode(struct input_dev *dev, struct input_keymap_entry *ke) { unsigned int index; int error; if (!dev->keycodesize) return -EINVAL; if (ke->flags & INPUT_KEYMAP_BY_INDEX) index = ke->index; else { error = input_scancode_to_scalar(ke, &index); if (error) return error; } if (index >= dev->keycodemax) return -EINVAL; ke->keycode = input_fetch_keycode(dev, index); ke->index = index; ke->len = sizeof(index); memcpy(ke->scancode, &index, sizeof(index)); return 0; } static int input_default_setkeycode(struct input_dev *dev, const struct input_keymap_entry *ke, unsigned int *old_keycode) { unsigned int index; int error; int i; if (!dev->keycodesize) return -EINVAL; if (ke->flags & INPUT_KEYMAP_BY_INDEX) { index = ke->index; } else { error = input_scancode_to_scalar(ke, &index); if (error) return error; } if (index >= dev->keycodemax) return -EINVAL; if (dev->keycodesize < sizeof(ke->keycode) && (ke->keycode >> (dev->keycodesize * 8))) return -EINVAL; switch (dev->keycodesize) { case 1: { u8 *k = (u8 *)dev->keycode; *old_keycode = k[index]; k[index] = ke->keycode; break; } case 2: { u16 *k = (u16 *)dev->keycode; *old_keycode = k[index]; k[index] = ke->keycode; break; } default: { u32 *k = (u32 *)dev->keycode; *old_keycode = k[index]; k[index] = ke->keycode; break; } } if (*old_keycode <= KEY_MAX) { __clear_bit(*old_keycode, dev->keybit); for (i = 0; i < dev->keycodemax; i++) { if (input_fetch_keycode(dev, i) == *old_keycode) { __set_bit(*old_keycode, dev->keybit); /* Setting the bit twice is useless, so break */ break; } } } __set_bit(ke->keycode, dev->keybit); return 0; } /** * input_get_keycode - retrieve keycode currently mapped to a given scancode * @dev: input device which keymap is being queried * @ke: keymap entry * * This function should be called by anyone interested in retrieving current * keymap. Presently evdev handlers use it. */ int input_get_keycode(struct input_dev *dev, struct input_keymap_entry *ke) { guard(spinlock_irqsave)(&dev->event_lock); return dev->getkeycode(dev, ke); } EXPORT_SYMBOL(input_get_keycode); /** * input_set_keycode - attribute a keycode to a given scancode * @dev: input device which keymap is being updated * @ke: new keymap entry * * This function should be called by anyone needing to update current * keymap. Presently keyboard and evdev handlers use it. */ int input_set_keycode(struct input_dev *dev, const struct input_keymap_entry *ke) { unsigned int old_keycode; int error; if (ke->keycode > KEY_MAX) return -EINVAL; guard(spinlock_irqsave)(&dev->event_lock); error = dev->setkeycode(dev, ke, &old_keycode); if (error) return error; /* Make sure KEY_RESERVED did not get enabled. 
*/ __clear_bit(KEY_RESERVED, dev->keybit); /* * Simulate keyup event if keycode is not present * in the keymap anymore */ if (old_keycode > KEY_MAX) { dev_warn(dev->dev.parent ?: &dev->dev, "%s: got too big old keycode %#x\n", __func__, old_keycode); } else if (test_bit(EV_KEY, dev->evbit) && !is_event_supported(old_keycode, dev->keybit, KEY_MAX) && __test_and_clear_bit(old_keycode, dev->key)) { /* * We have to use input_event_dispose() here directly instead * of input_handle_event() because the key we want to release * here is considered no longer supported by the device and * input_handle_event() will ignore it. */ input_event_dispose(dev, INPUT_PASS_TO_HANDLERS, EV_KEY, old_keycode, 0); input_event_dispose(dev, INPUT_PASS_TO_HANDLERS | INPUT_FLUSH, EV_SYN, SYN_REPORT, 1); } return 0; } EXPORT_SYMBOL(input_set_keycode); bool input_match_device_id(const struct input_dev *dev, const struct input_device_id *id) { if (id->flags & INPUT_DEVICE_ID_MATCH_BUS) if (id->bustype != dev->id.bustype) return false; if (id->flags & INPUT_DEVICE_ID_MATCH_VENDOR) if (id->vendor != dev->id.vendor) return false; if (id->flags & INPUT_DEVICE_ID_MATCH_PRODUCT) if (id->product != dev->id.product) return false; if (id->flags & INPUT_DEVICE_ID_MATCH_VERSION) if (id->version != dev->id.version) return false; if (!bitmap_subset(id->evbit, dev->evbit, EV_MAX) || !bitmap_subset(id->keybit, dev->keybit, KEY_MAX) || !bitmap_subset(id->relbit, dev->relbit, REL_MAX) || !bitmap_subset(id->absbit, dev->absbit, ABS_MAX) || !bitmap_subset(id->mscbit, dev->mscbit, MSC_MAX) || !bitmap_subset(id->ledbit, dev->ledbit, LED_MAX) || !bitmap_subset(id->sndbit, dev->sndbit, SND_MAX) || !bitmap_subset(id->ffbit, dev->ffbit, FF_MAX) || !bitmap_subset(id->swbit, dev->swbit, SW_MAX) || !bitmap_subset(id->propbit, dev->propbit, INPUT_PROP_MAX)) { return false; } return true; } EXPORT_SYMBOL(input_match_device_id); static const struct input_device_id *input_match_device(struct input_handler *handler, struct input_dev *dev) { const struct input_device_id *id; for (id = handler->id_table; id->flags; id++) { if (input_match_device_id(dev, id) && (!handler->match || handler->match(handler, dev))) { return id; } } return NULL; } static int input_attach_handler(struct input_dev *dev, struct input_handler *handler) { const struct input_device_id *id; int error; id = input_match_device(handler, dev); if (!id) return -ENODEV; error = handler->connect(handler, dev, id); if (error && error != -ENODEV) pr_err("failed to attach handler %s to device %s, error: %d\n", handler->name, kobject_name(&dev->dev.kobj), error); return error; } #ifdef CONFIG_PROC_FS static struct proc_dir_entry *proc_bus_input_dir; static DECLARE_WAIT_QUEUE_HEAD(input_devices_poll_wait); static int input_devices_state; static inline void input_wakeup_procfs_readers(void) { input_devices_state++; wake_up(&input_devices_poll_wait); } struct input_seq_state { unsigned short pos; bool mutex_acquired; int input_devices_state; }; static __poll_t input_proc_devices_poll(struct file *file, poll_table *wait) { struct seq_file *seq = file->private_data; struct input_seq_state *state = seq->private; poll_wait(file, &input_devices_poll_wait, wait); if (state->input_devices_state != input_devices_state) { state->input_devices_state = input_devices_state; return EPOLLIN | EPOLLRDNORM; } return 0; } static void *input_devices_seq_start(struct seq_file *seq, loff_t *pos) { struct input_seq_state *state = seq->private; int error; error = mutex_lock_interruptible(&input_mutex); if (error) { 
state->mutex_acquired = false; return ERR_PTR(error); } state->mutex_acquired = true; return seq_list_start(&input_dev_list, *pos); } static void *input_devices_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_list_next(v, &input_dev_list, pos); } static void input_seq_stop(struct seq_file *seq, void *v) { struct input_seq_state *state = seq->private; if (state->mutex_acquired) mutex_unlock(&input_mutex); } static void input_seq_print_bitmap(struct seq_file *seq, const char *name, unsigned long *bitmap, int max) { int i; bool skip_empty = true; char buf[18]; seq_printf(seq, "B: %s=", name); for (i = BITS_TO_LONGS(max) - 1; i >= 0; i--) { if (input_bits_to_string(buf, sizeof(buf), bitmap[i], skip_empty)) { skip_empty = false; seq_printf(seq, "%s%s", buf, i > 0 ? " " : ""); } } /* * If no output was produced print a single 0. */ if (skip_empty) seq_putc(seq, '0'); seq_putc(seq, '\n'); } static int input_devices_seq_show(struct seq_file *seq, void *v) { struct input_dev *dev = container_of(v, struct input_dev, node); const char *path = kobject_get_path(&dev->dev.kobj, GFP_KERNEL); struct input_handle *handle; seq_printf(seq, "I: Bus=%04x Vendor=%04x Product=%04x Version=%04x\n", dev->id.bustype, dev->id.vendor, dev->id.product, dev->id.version); seq_printf(seq, "N: Name=\"%s\"\n", dev->name ? dev->name : ""); seq_printf(seq, "P: Phys=%s\n", dev->phys ? dev->phys : ""); seq_printf(seq, "S: Sysfs=%s\n", path ? path : ""); seq_printf(seq, "U: Uniq=%s\n", dev->uniq ? dev->uniq : ""); seq_puts(seq, "H: Handlers="); list_for_each_entry(handle, &dev->h_list, d_node) seq_printf(seq, "%s ", handle->name); seq_putc(seq, '\n'); input_seq_print_bitmap(seq, "PROP", dev->propbit, INPUT_PROP_MAX); input_seq_print_bitmap(seq, "EV", dev->evbit, EV_MAX); if (test_bit(EV_KEY, dev->evbit)) input_seq_print_bitmap(seq, "KEY", dev->keybit, KEY_MAX); if (test_bit(EV_REL, dev->evbit)) input_seq_print_bitmap(seq, "REL", dev->relbit, REL_MAX); if (test_bit(EV_ABS, dev->evbit)) input_seq_print_bitmap(seq, "ABS", dev->absbit, ABS_MAX); if (test_bit(EV_MSC, dev->evbit)) input_seq_print_bitmap(seq, "MSC", dev->mscbit, MSC_MAX); if (test_bit(EV_LED, dev->evbit)) input_seq_print_bitmap(seq, "LED", dev->ledbit, LED_MAX); if (test_bit(EV_SND, dev->evbit)) input_seq_print_bitmap(seq, "SND", dev->sndbit, SND_MAX); if (test_bit(EV_FF, dev->evbit)) input_seq_print_bitmap(seq, "FF", dev->ffbit, FF_MAX); if (test_bit(EV_SW, dev->evbit)) input_seq_print_bitmap(seq, "SW", dev->swbit, SW_MAX); seq_putc(seq, '\n'); kfree(path); return 0; } static const struct seq_operations input_devices_seq_ops = { .start = input_devices_seq_start, .next = input_devices_seq_next, .stop = input_seq_stop, .show = input_devices_seq_show, }; static int input_proc_devices_open(struct inode *inode, struct file *file) { return seq_open_private(file, &input_devices_seq_ops, sizeof(struct input_seq_state)); } static const struct proc_ops input_devices_proc_ops = { .proc_open = input_proc_devices_open, .proc_poll = input_proc_devices_poll, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = seq_release_private, }; static void *input_handlers_seq_start(struct seq_file *seq, loff_t *pos) { struct input_seq_state *state = seq->private; int error; error = mutex_lock_interruptible(&input_mutex); if (error) { state->mutex_acquired = false; return ERR_PTR(error); } state->mutex_acquired = true; state->pos = *pos; return seq_list_start(&input_handler_list, *pos); } static void *input_handlers_seq_next(struct seq_file *seq, void *v, loff_t 
*pos) { struct input_seq_state *state = seq->private; state->pos = *pos + 1; return seq_list_next(v, &input_handler_list, pos); } static int input_handlers_seq_show(struct seq_file *seq, void *v) { struct input_handler *handler = container_of(v, struct input_handler, node); struct input_seq_state *state = seq->private; seq_printf(seq, "N: Number=%u Name=%s", state->pos, handler->name); if (handler->filter) seq_puts(seq, " (filter)"); if (handler->legacy_minors) seq_printf(seq, " Minor=%d", handler->minor); seq_putc(seq, '\n'); return 0; } static const struct seq_operations input_handlers_seq_ops = { .start = input_handlers_seq_start, .next = input_handlers_seq_next, .stop = input_seq_stop, .show = input_handlers_seq_show, }; static int input_proc_handlers_open(struct inode *inode, struct file *file) { return seq_open_private(file, &input_handlers_seq_ops, sizeof(struct input_seq_state)); } static const struct proc_ops input_handlers_proc_ops = { .proc_open = input_proc_handlers_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = seq_release_private, }; static int __init input_proc_init(void) { struct proc_dir_entry *entry; proc_bus_input_dir = proc_mkdir("bus/input", NULL); if (!proc_bus_input_dir) return -ENOMEM; entry = proc_create("devices", 0, proc_bus_input_dir, &input_devices_proc_ops); if (!entry) goto fail1; entry = proc_create("handlers", 0, proc_bus_input_dir, &input_handlers_proc_ops); if (!entry) goto fail2; return 0; fail2: remove_proc_entry("devices", proc_bus_input_dir); fail1: remove_proc_entry("bus/input", NULL); return -ENOMEM; } static void input_proc_exit(void) { remove_proc_entry("devices", proc_bus_input_dir); remove_proc_entry("handlers", proc_bus_input_dir); remove_proc_entry("bus/input", NULL); } #else /* !CONFIG_PROC_FS */ static inline void input_wakeup_procfs_readers(void) { } static inline int input_proc_init(void) { return 0; } static inline void input_proc_exit(void) { } #endif #define INPUT_DEV_STRING_ATTR_SHOW(name) \ static ssize_t input_dev_show_##name(struct device *dev, \ struct device_attribute *attr, \ char *buf) \ { \ struct input_dev *input_dev = to_input_dev(dev); \ \ return sysfs_emit(buf, "%s\n", \ input_dev->name ? input_dev->name : ""); \ } \ static DEVICE_ATTR(name, S_IRUGO, input_dev_show_##name, NULL) INPUT_DEV_STRING_ATTR_SHOW(name); INPUT_DEV_STRING_ATTR_SHOW(phys); INPUT_DEV_STRING_ATTR_SHOW(uniq); static int input_print_modalias_bits(char *buf, int size, char name, const unsigned long *bm, unsigned int min_bit, unsigned int max_bit) { int bit = min_bit; int len = 0; len += snprintf(buf, max(size, 0), "%c", name); for_each_set_bit_from(bit, bm, max_bit) len += snprintf(buf + len, max(size - len, 0), "%X,", bit); return len; } static int input_print_modalias_parts(char *buf, int size, int full_len, const struct input_dev *id) { int len, klen, remainder, space; len = snprintf(buf, max(size, 0), "input:b%04Xv%04Xp%04Xe%04X-", id->id.bustype, id->id.vendor, id->id.product, id->id.version); len += input_print_modalias_bits(buf + len, size - len, 'e', id->evbit, 0, EV_MAX); /* * Calculate the remaining space in the buffer making sure we * have place for the terminating 0. */ space = max(size - (len + 1), 0); klen = input_print_modalias_bits(buf + len, size - len, 'k', id->keybit, KEY_MIN_INTERESTING, KEY_MAX); len += klen; /* * If we have more data than we can fit in the buffer, check * if we can trim key data to fit in the rest. 
We will indicate * that key data is incomplete by adding "+" sign at the end, like * this: * "k1,2,3,45,+,". * * Note that we shortest key info (if present) is "k+," so we * can only try to trim if key data is longer than that. */ if (full_len && size < full_len + 1 && klen > 3) { remainder = full_len - len; /* * We can only trim if we have space for the remainder * and also for at least "k+," which is 3 more characters. */ if (remainder <= space - 3) { /* * We are guaranteed to have 'k' in the buffer, so * we need at least 3 additional bytes for storing * "+," in addition to the remainder. */ for (int i = size - 1 - remainder - 3; i >= 0; i--) { if (buf[i] == 'k' || buf[i] == ',') { strcpy(buf + i + 1, "+,"); len = i + 3; /* Not counting '\0' */ break; } } } } len += input_print_modalias_bits(buf + len, size - len, 'r', id->relbit, 0, REL_MAX); len += input_print_modalias_bits(buf + len, size - len, 'a', id->absbit, 0, ABS_MAX); len += input_print_modalias_bits(buf + len, size - len, 'm', id->mscbit, 0, MSC_MAX); len += input_print_modalias_bits(buf + len, size - len, 'l', id->ledbit, 0, LED_MAX); len += input_print_modalias_bits(buf + len, size - len, 's', id->sndbit, 0, SND_MAX); len += input_print_modalias_bits(buf + len, size - len, 'f', id->ffbit, 0, FF_MAX); len += input_print_modalias_bits(buf + len, size - len, 'w', id->swbit, 0, SW_MAX); return len; } static int input_print_modalias(char *buf, int size, const struct input_dev *id) { int full_len; /* * Printing is done in 2 passes: first one figures out total length * needed for the modalias string, second one will try to trim key * data in case when buffer is too small for the entire modalias. * If the buffer is too small regardless, it will fill as much as it * can (without trimming key data) into the buffer and leave it to * the caller to figure out what to do with the result. 
*/ full_len = input_print_modalias_parts(NULL, 0, 0, id); return input_print_modalias_parts(buf, size, full_len, id); } static ssize_t input_dev_show_modalias(struct device *dev, struct device_attribute *attr, char *buf) { struct input_dev *id = to_input_dev(dev); ssize_t len; len = input_print_modalias(buf, PAGE_SIZE, id); if (len < PAGE_SIZE - 2) len += snprintf(buf + len, PAGE_SIZE - len, "\n"); return min_t(int, len, PAGE_SIZE); } static DEVICE_ATTR(modalias, S_IRUGO, input_dev_show_modalias, NULL); static int input_print_bitmap(char *buf, int buf_size, const unsigned long *bitmap, int max, int add_cr); static ssize_t input_dev_show_properties(struct device *dev, struct device_attribute *attr, char *buf) { struct input_dev *input_dev = to_input_dev(dev); int len = input_print_bitmap(buf, PAGE_SIZE, input_dev->propbit, INPUT_PROP_MAX, true); return min_t(int, len, PAGE_SIZE); } static DEVICE_ATTR(properties, S_IRUGO, input_dev_show_properties, NULL); static int input_inhibit_device(struct input_dev *dev); static int input_uninhibit_device(struct input_dev *dev); static ssize_t inhibited_show(struct device *dev, struct device_attribute *attr, char *buf) { struct input_dev *input_dev = to_input_dev(dev); return sysfs_emit(buf, "%d\n", input_dev->inhibited); } static ssize_t inhibited_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct input_dev *input_dev = to_input_dev(dev); ssize_t rv; bool inhibited; if (kstrtobool(buf, &inhibited)) return -EINVAL; if (inhibited) rv = input_inhibit_device(input_dev); else rv = input_uninhibit_device(input_dev); if (rv != 0) return rv; return len; } static DEVICE_ATTR_RW(inhibited); static struct attribute *input_dev_attrs[] = { &dev_attr_name.attr, &dev_attr_phys.attr, &dev_attr_uniq.attr, &dev_attr_modalias.attr, &dev_attr_properties.attr, &dev_attr_inhibited.attr, NULL }; static const struct attribute_group input_dev_attr_group = { .attrs = input_dev_attrs, }; #define INPUT_DEV_ID_ATTR(name) \ static ssize_t input_dev_show_id_##name(struct device *dev, \ struct device_attribute *attr, \ char *buf) \ { \ struct input_dev *input_dev = to_input_dev(dev); \ return sysfs_emit(buf, "%04x\n", input_dev->id.name); \ } \ static DEVICE_ATTR(name, S_IRUGO, input_dev_show_id_##name, NULL) INPUT_DEV_ID_ATTR(bustype); INPUT_DEV_ID_ATTR(vendor); INPUT_DEV_ID_ATTR(product); INPUT_DEV_ID_ATTR(version); static struct attribute *input_dev_id_attrs[] = { &dev_attr_bustype.attr, &dev_attr_vendor.attr, &dev_attr_product.attr, &dev_attr_version.attr, NULL }; static const struct attribute_group input_dev_id_attr_group = { .name = "id", .attrs = input_dev_id_attrs, }; static int input_print_bitmap(char *buf, int buf_size, const unsigned long *bitmap, int max, int add_cr) { int i; int len = 0; bool skip_empty = true; for (i = BITS_TO_LONGS(max) - 1; i >= 0; i--) { len += input_bits_to_string(buf + len, max(buf_size - len, 0), bitmap[i], skip_empty); if (len) { skip_empty = false; if (i > 0) len += snprintf(buf + len, max(buf_size - len, 0), " "); } } /* * If no output was produced print a single 0. 
*/ if (len == 0) len = snprintf(buf, buf_size, "%d", 0); if (add_cr) len += snprintf(buf + len, max(buf_size - len, 0), "\n"); return len; } #define INPUT_DEV_CAP_ATTR(ev, bm) \ static ssize_t input_dev_show_cap_##bm(struct device *dev, \ struct device_attribute *attr, \ char *buf) \ { \ struct input_dev *input_dev = to_input_dev(dev); \ int len = input_print_bitmap(buf, PAGE_SIZE, \ input_dev->bm##bit, ev##_MAX, \ true); \ return min_t(int, len, PAGE_SIZE); \ } \ static DEVICE_ATTR(bm, S_IRUGO, input_dev_show_cap_##bm, NULL) INPUT_DEV_CAP_ATTR(EV, ev); INPUT_DEV_CAP_ATTR(KEY, key); INPUT_DEV_CAP_ATTR(REL, rel); INPUT_DEV_CAP_ATTR(ABS, abs); INPUT_DEV_CAP_ATTR(MSC, msc); INPUT_DEV_CAP_ATTR(LED, led); INPUT_DEV_CAP_ATTR(SND, snd); INPUT_DEV_CAP_ATTR(FF, ff); INPUT_DEV_CAP_ATTR(SW, sw); static struct attribute *input_dev_caps_attrs[] = { &dev_attr_ev.attr, &dev_attr_key.attr, &dev_attr_rel.attr, &dev_attr_abs.attr, &dev_attr_msc.attr, &dev_attr_led.attr, &dev_attr_snd.attr, &dev_attr_ff.attr, &dev_attr_sw.attr, NULL }; static const struct attribute_group input_dev_caps_attr_group = { .name = "capabilities", .attrs = input_dev_caps_attrs, }; static const struct attribute_group *input_dev_attr_groups[] = { &input_dev_attr_group, &input_dev_id_attr_group, &input_dev_caps_attr_group, &input_poller_attribute_group, NULL }; static void input_dev_release(struct device *device) { struct input_dev *dev = to_input_dev(device); input_ff_destroy(dev); input_mt_destroy_slots(dev); kfree(dev->poller); kfree(dev->absinfo); kfree(dev->vals); kfree(dev); module_put(THIS_MODULE); } /* * Input uevent interface - loading event handlers based on * device bitfields. */ static int input_add_uevent_bm_var(struct kobj_uevent_env *env, const char *name, const unsigned long *bitmap, int max) { int len; if (add_uevent_var(env, "%s", name)) return -ENOMEM; len = input_print_bitmap(&env->buf[env->buflen - 1], sizeof(env->buf) - env->buflen, bitmap, max, false); if (len >= (sizeof(env->buf) - env->buflen)) return -ENOMEM; env->buflen += len; return 0; } /* * This is a pretty gross hack. When building uevent data the driver core * may try adding more environment variables to kobj_uevent_env without * telling us, so we have no idea how much of the buffer we can use to * avoid overflows/-ENOMEM elsewhere. To work around this let's artificially * reduce amount of memory we will use for the modalias environment variable. * * The potential additions are: * * SEQNUM=18446744073709551615 - (%llu - 28 bytes) * HOME=/ (6 bytes) * PATH=/sbin:/bin:/usr/sbin:/usr/bin (34 bytes) * * 68 bytes total. Allow extra buffer - 96 bytes */ #define UEVENT_ENV_EXTRA_LEN 96 static int input_add_uevent_modalias_var(struct kobj_uevent_env *env, const struct input_dev *dev) { int len; if (add_uevent_var(env, "MODALIAS=")) return -ENOMEM; len = input_print_modalias(&env->buf[env->buflen - 1], (int)sizeof(env->buf) - env->buflen - UEVENT_ENV_EXTRA_LEN, dev); if (len >= ((int)sizeof(env->buf) - env->buflen - UEVENT_ENV_EXTRA_LEN)) return -ENOMEM; env->buflen += len; return 0; } #define INPUT_ADD_HOTPLUG_VAR(fmt, val...) 
\ do { \ int err = add_uevent_var(env, fmt, val); \ if (err) \ return err; \ } while (0) #define INPUT_ADD_HOTPLUG_BM_VAR(name, bm, max) \ do { \ int err = input_add_uevent_bm_var(env, name, bm, max); \ if (err) \ return err; \ } while (0) #define INPUT_ADD_HOTPLUG_MODALIAS_VAR(dev) \ do { \ int err = input_add_uevent_modalias_var(env, dev); \ if (err) \ return err; \ } while (0) static int input_dev_uevent(const struct device *device, struct kobj_uevent_env *env) { const struct input_dev *dev = to_input_dev(device); INPUT_ADD_HOTPLUG_VAR("PRODUCT=%x/%x/%x/%x", dev->id.bustype, dev->id.vendor, dev->id.product, dev->id.version); if (dev->name) INPUT_ADD_HOTPLUG_VAR("NAME=\"%s\"", dev->name); if (dev->phys) INPUT_ADD_HOTPLUG_VAR("PHYS=\"%s\"", dev->phys); if (dev->uniq) INPUT_ADD_HOTPLUG_VAR("UNIQ=\"%s\"", dev->uniq); INPUT_ADD_HOTPLUG_BM_VAR("PROP=", dev->propbit, INPUT_PROP_MAX); INPUT_ADD_HOTPLUG_BM_VAR("EV=", dev->evbit, EV_MAX); if (test_bit(EV_KEY, dev->evbit)) INPUT_ADD_HOTPLUG_BM_VAR("KEY=", dev->keybit, KEY_MAX); if (test_bit(EV_REL, dev->evbit)) INPUT_ADD_HOTPLUG_BM_VAR("REL=", dev->relbit, REL_MAX); if (test_bit(EV_ABS, dev->evbit)) INPUT_ADD_HOTPLUG_BM_VAR("ABS=", dev->absbit, ABS_MAX); if (test_bit(EV_MSC, dev->evbit)) INPUT_ADD_HOTPLUG_BM_VAR("MSC=", dev->mscbit, MSC_MAX); if (test_bit(EV_LED, dev->evbit)) INPUT_ADD_HOTPLUG_BM_VAR("LED=", dev->ledbit, LED_MAX); if (test_bit(EV_SND, dev->evbit)) INPUT_ADD_HOTPLUG_BM_VAR("SND=", dev->sndbit, SND_MAX); if (test_bit(EV_FF, dev->evbit)) INPUT_ADD_HOTPLUG_BM_VAR("FF=", dev->ffbit, FF_MAX); if (test_bit(EV_SW, dev->evbit)) INPUT_ADD_HOTPLUG_BM_VAR("SW=", dev->swbit, SW_MAX); INPUT_ADD_HOTPLUG_MODALIAS_VAR(dev); return 0; } #define INPUT_DO_TOGGLE(dev, type, bits, on) \ do { \ int i; \ bool active; \ \ if (!test_bit(EV_##type, dev->evbit)) \ break; \ \ for_each_set_bit(i, dev->bits##bit, type##_CNT) { \ active = test_bit(i, dev->bits); \ if (!active && !on) \ continue; \ \ dev->event(dev, EV_##type, i, on ? active : 0); \ } \ } while (0) static void input_dev_toggle(struct input_dev *dev, bool activate) { if (!dev->event) return; INPUT_DO_TOGGLE(dev, LED, led, activate); INPUT_DO_TOGGLE(dev, SND, snd, activate); if (activate && test_bit(EV_REP, dev->evbit)) { dev->event(dev, EV_REP, REP_PERIOD, dev->rep[REP_PERIOD]); dev->event(dev, EV_REP, REP_DELAY, dev->rep[REP_DELAY]); } } /** * input_reset_device() - reset/restore the state of input device * @dev: input device whose state needs to be reset * * This function tries to reset the state of an opened input device and * bring internal state and state if the hardware in sync with each other. * We mark all keys as released, restore LED state, repeat rate, etc. 
*/ void input_reset_device(struct input_dev *dev) { guard(mutex)(&dev->mutex); guard(spinlock_irqsave)(&dev->event_lock); input_dev_toggle(dev, true); if (input_dev_release_keys(dev)) input_handle_event(dev, EV_SYN, SYN_REPORT, 1); } EXPORT_SYMBOL(input_reset_device); static int input_inhibit_device(struct input_dev *dev) { guard(mutex)(&dev->mutex); if (dev->inhibited) return 0; if (dev->users) { if (dev->close) dev->close(dev); if (dev->poller) input_dev_poller_stop(dev->poller); } scoped_guard(spinlock_irq, &dev->event_lock) { input_mt_release_slots(dev); input_dev_release_keys(dev); input_handle_event(dev, EV_SYN, SYN_REPORT, 1); input_dev_toggle(dev, false); } dev->inhibited = true; return 0; } static int input_uninhibit_device(struct input_dev *dev) { int error; guard(mutex)(&dev->mutex); if (!dev->inhibited) return 0; if (dev->users) { if (dev->open) { error = dev->open(dev); if (error) return error; } if (dev->poller) input_dev_poller_start(dev->poller); } dev->inhibited = false; scoped_guard(spinlock_irq, &dev->event_lock) input_dev_toggle(dev, true); return 0; } static int input_dev_suspend(struct device *dev) { struct input_dev *input_dev = to_input_dev(dev); guard(spinlock_irq)(&input_dev->event_lock); /* * Keys that are pressed now are unlikely to be * still pressed when we resume. */ if (input_dev_release_keys(input_dev)) input_handle_event(input_dev, EV_SYN, SYN_REPORT, 1); /* Turn off LEDs and sounds, if any are active. */ input_dev_toggle(input_dev, false); return 0; } static int input_dev_resume(struct device *dev) { struct input_dev *input_dev = to_input_dev(dev); guard(spinlock_irq)(&input_dev->event_lock); /* Restore state of LEDs and sounds, if any were active. */ input_dev_toggle(input_dev, true); return 0; } static int input_dev_freeze(struct device *dev) { struct input_dev *input_dev = to_input_dev(dev); guard(spinlock_irq)(&input_dev->event_lock); /* * Keys that are pressed now are unlikely to be * still pressed when we resume. */ if (input_dev_release_keys(input_dev)) input_handle_event(input_dev, EV_SYN, SYN_REPORT, 1); return 0; } static int input_dev_poweroff(struct device *dev) { struct input_dev *input_dev = to_input_dev(dev); guard(spinlock_irq)(&input_dev->event_lock); /* Turn off LEDs and sounds, if any are active. */ input_dev_toggle(input_dev, false); return 0; } static const struct dev_pm_ops input_dev_pm_ops = { .suspend = input_dev_suspend, .resume = input_dev_resume, .freeze = input_dev_freeze, .poweroff = input_dev_poweroff, .restore = input_dev_resume, }; static const struct device_type input_dev_type = { .groups = input_dev_attr_groups, .release = input_dev_release, .uevent = input_dev_uevent, .pm = pm_sleep_ptr(&input_dev_pm_ops), }; static char *input_devnode(const struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "input/%s", dev_name(dev)); } const struct class input_class = { .name = "input", .devnode = input_devnode, }; EXPORT_SYMBOL_GPL(input_class); /** * input_allocate_device - allocate memory for new input device * * Returns prepared struct input_dev or %NULL. * * NOTE: Use input_free_device() to free devices that have not been * registered; input_unregister_device() should be used for already * registered devices. */ struct input_dev *input_allocate_device(void) { static atomic_t input_no = ATOMIC_INIT(-1); struct input_dev *dev; dev = kzalloc_obj(*dev); if (!dev) return NULL; /* * Start with space for SYN_REPORT + 7 EV_KEY/EV_MSC events + 2 spare, * see input_estimate_events_per_packet(). 
We will tune the number * when we register the device. */ dev->max_vals = 10; dev->vals = kzalloc_objs(*dev->vals, dev->max_vals); if (!dev->vals) { kfree(dev); return NULL; } mutex_init(&dev->mutex); spin_lock_init(&dev->event_lock); timer_setup(&dev->timer, NULL, 0); INIT_LIST_HEAD(&dev->h_list); INIT_LIST_HEAD(&dev->node); dev->dev.type = &input_dev_type; dev->dev.class = &input_class; device_initialize(&dev->dev); /* * From this point on we can no longer simply "kfree(dev)", we need * to use input_free_device() so that device core properly frees its * resources associated with the input device. */ dev_set_name(&dev->dev, "input%lu", (unsigned long)atomic_inc_return(&input_no)); __module_get(THIS_MODULE); return dev; } EXPORT_SYMBOL(input_allocate_device); struct input_devres { struct input_dev *input; }; static int devm_input_device_match(struct device *dev, void *res, void *data) { struct input_devres *devres = res; return devres->input == data; } static void devm_input_device_release(struct device *dev, void *res) { struct input_devres *devres = res; struct input_dev *input = devres->input; dev_dbg(dev, "%s: dropping reference to %s\n", __func__, dev_name(&input->dev)); input_put_device(input); } /** * devm_input_allocate_device - allocate managed input device * @dev: device owning the input device being created * * Returns prepared struct input_dev or %NULL. * * Managed input devices do not need to be explicitly unregistered or * freed as it will be done automatically when owner device unbinds from * its driver (or binding fails). Once managed input device is allocated, * it is ready to be set up and registered in the same fashion as regular * input device. There are no special devm_input_device_[un]register() * variants, regular ones work with both managed and unmanaged devices, * should you need them. In most cases however, managed input device need * not be explicitly unregistered or freed. * * NOTE: the owner device is set up as parent of input device and users * should not override it. */ struct input_dev *devm_input_allocate_device(struct device *dev) { struct input_dev *input; struct input_devres *devres; devres = devres_alloc(devm_input_device_release, sizeof(*devres), GFP_KERNEL); if (!devres) return NULL; input = input_allocate_device(); if (!input) { devres_free(devres); return NULL; } input->dev.parent = dev; input->devres_managed = true; devres->input = input; devres_add(dev, devres); return input; } EXPORT_SYMBOL(devm_input_allocate_device); /** * input_free_device - free memory occupied by input_dev structure * @dev: input device to free * * This function should only be used if input_register_device() * was not called yet or if it failed. Once device was registered * use input_unregister_device() and memory will be freed once last * reference to the device is dropped. * * Device should be allocated by input_allocate_device(). * * NOTE: If there are references to the input device then memory * will not be freed until last reference is dropped. 
*/ void input_free_device(struct input_dev *dev) { if (dev) { if (dev->devres_managed) WARN_ON(devres_destroy(dev->dev.parent, devm_input_device_release, devm_input_device_match, dev)); input_put_device(dev); } } EXPORT_SYMBOL(input_free_device); /** * input_set_timestamp - set timestamp for input events * @dev: input device to set timestamp for * @timestamp: the time at which the event has occurred * in CLOCK_MONOTONIC * * This function is intended to provide to the input system a more * accurate time of when an event actually occurred. The driver should * call this function as soon as a timestamp is acquired ensuring * clock conversions in input_set_timestamp are done correctly. * * The system entering suspend state between timestamp acquisition and * calling input_set_timestamp can result in inaccurate conversions. */ void input_set_timestamp(struct input_dev *dev, ktime_t timestamp) { dev->timestamp[INPUT_CLK_MONO] = timestamp; dev->timestamp[INPUT_CLK_REAL] = ktime_mono_to_real(timestamp); dev->timestamp[INPUT_CLK_BOOT] = ktime_mono_to_any(timestamp, TK_OFFS_BOOT); } EXPORT_SYMBOL(input_set_timestamp); /** * input_get_timestamp - get timestamp for input events * @dev: input device to get timestamp from * * A valid timestamp is a timestamp of non-zero value. */ ktime_t *input_get_timestamp(struct input_dev *dev) { const ktime_t invalid_timestamp = ktime_set(0, 0); if (!ktime_compare(dev->timestamp[INPUT_CLK_MONO], invalid_timestamp)) input_set_timestamp(dev, ktime_get()); return dev->timestamp; } EXPORT_SYMBOL(input_get_timestamp); /** * input_set_capability - mark device as capable of a certain event * @dev: device that is capable of emitting or accepting event * @type: type of the event (EV_KEY, EV_REL, etc...) * @code: event code * * In addition to setting up corresponding bit in appropriate capability * bitmap the function also adjusts dev->evbit. */ void input_set_capability(struct input_dev *dev, unsigned int type, unsigned int code) { if (type < EV_CNT && input_max_code[type] && code > input_max_code[type]) { pr_err("%s: invalid code %u for type %u\n", __func__, code, type); dump_stack(); return; } switch (type) { case EV_KEY: __set_bit(code, dev->keybit); break; case EV_REL: __set_bit(code, dev->relbit); break; case EV_ABS: input_alloc_absinfo(dev); __set_bit(code, dev->absbit); break; case EV_MSC: __set_bit(code, dev->mscbit); break; case EV_SW: __set_bit(code, dev->swbit); break; case EV_LED: __set_bit(code, dev->ledbit); break; case EV_SND: __set_bit(code, dev->sndbit); break; case EV_FF: __set_bit(code, dev->ffbit); break; case EV_PWR: /* do nothing */ break; default: pr_err("%s: unknown type %u (code %u)\n", __func__, type, code); dump_stack(); return; } __set_bit(type, dev->evbit); } EXPORT_SYMBOL(input_set_capability); static unsigned int input_estimate_events_per_packet(struct input_dev *dev) { int mt_slots; int i; unsigned int events; if (dev->mt) { mt_slots = dev->mt->num_slots; } else if (test_bit(ABS_MT_TRACKING_ID, dev->absbit)) { mt_slots = dev->absinfo[ABS_MT_TRACKING_ID].maximum - dev->absinfo[ABS_MT_TRACKING_ID].minimum + 1; mt_slots = clamp(mt_slots, 2, 32); } else if (test_bit(ABS_MT_POSITION_X, dev->absbit)) { mt_slots = 2; } else { mt_slots = 0; } events = mt_slots + 1; /* count SYN_MT_REPORT and SYN_REPORT */ if (test_bit(EV_ABS, dev->evbit)) for_each_set_bit(i, dev->absbit, ABS_CNT) events += input_is_mt_axis(i) ? 
mt_slots : 1; if (test_bit(EV_REL, dev->evbit)) events += bitmap_weight(dev->relbit, REL_CNT); /* Make room for KEY and MSC events */ events += 7; return events; } #define INPUT_CLEANSE_BITMASK(dev, type, bits) \ do { \ if (!test_bit(EV_##type, dev->evbit)) \ memset(dev->bits##bit, 0, \ sizeof(dev->bits##bit)); \ } while (0) static void input_cleanse_bitmasks(struct input_dev *dev) { INPUT_CLEANSE_BITMASK(dev, KEY, key); INPUT_CLEANSE_BITMASK(dev, REL, rel); INPUT_CLEANSE_BITMASK(dev, ABS, abs); INPUT_CLEANSE_BITMASK(dev, MSC, msc); INPUT_CLEANSE_BITMASK(dev, LED, led); INPUT_CLEANSE_BITMASK(dev, SND, snd); INPUT_CLEANSE_BITMASK(dev, FF, ff); INPUT_CLEANSE_BITMASK(dev, SW, sw); } static void __input_unregister_device(struct input_dev *dev) { struct input_handle *handle, *next; input_disconnect_device(dev); scoped_guard(mutex, &input_mutex) { list_for_each_entry_safe(handle, next, &dev->h_list, d_node) handle->handler->disconnect(handle); WARN_ON(!list_empty(&dev->h_list)); timer_delete_sync(&dev->timer); list_del_init(&dev->node); input_wakeup_procfs_readers(); } device_del(&dev->dev); } static void devm_input_device_unregister(struct device *dev, void *res) { struct input_devres *devres = res; struct input_dev *input = devres->input; dev_dbg(dev, "%s: unregistering device %s\n", __func__, dev_name(&input->dev)); __input_unregister_device(input); } /* * Generate software autorepeat event. Note that we take * dev->event_lock here to avoid racing with input_event * which may cause keys get "stuck". */ static void input_repeat_key(struct timer_list *t) { struct input_dev *dev = timer_container_of(dev, t, timer); guard(spinlock_irqsave)(&dev->event_lock); if (!dev->inhibited && test_bit(dev->repeat_key, dev->key) && is_event_supported(dev->repeat_key, dev->keybit, KEY_MAX)) { input_set_timestamp(dev, ktime_get()); input_handle_event(dev, EV_KEY, dev->repeat_key, 2); input_handle_event(dev, EV_SYN, SYN_REPORT, 1); if (dev->rep[REP_PERIOD]) mod_timer(&dev->timer, jiffies + msecs_to_jiffies(dev->rep[REP_PERIOD])); } } /** * input_enable_softrepeat - enable software autorepeat * @dev: input device * @delay: repeat delay * @period: repeat period * * Enable software autorepeat on the input device. */ void input_enable_softrepeat(struct input_dev *dev, int delay, int period) { dev->timer.function = input_repeat_key; dev->rep[REP_DELAY] = delay; dev->rep[REP_PERIOD] = period; } EXPORT_SYMBOL(input_enable_softrepeat); bool input_device_enabled(struct input_dev *dev) { lockdep_assert_held(&dev->mutex); return !dev->inhibited && dev->users > 0; } EXPORT_SYMBOL_GPL(input_device_enabled); static int input_device_tune_vals(struct input_dev *dev) { struct input_value *vals; unsigned int packet_size; unsigned int max_vals; packet_size = input_estimate_events_per_packet(dev); if (dev->hint_events_per_packet < packet_size) dev->hint_events_per_packet = packet_size; max_vals = dev->hint_events_per_packet + 2; if (dev->max_vals >= max_vals) return 0; vals = kcalloc(max_vals, sizeof(*vals), GFP_KERNEL); if (!vals) return -ENOMEM; scoped_guard(spinlock_irq, &dev->event_lock) { dev->max_vals = max_vals; swap(dev->vals, vals); } /* Because of swap() above, this frees the old vals memory */ kfree(vals); return 0; } /** * input_register_device - register device with input core * @dev: device to be registered * * This function registers device with input core. The device must be * allocated with input_allocate_device() and all it's capabilities * set up before registering. 
* If function fails the device must be freed with input_free_device(). * Once device has been successfully registered it can be unregistered * with input_unregister_device(); input_free_device() should not be * called in this case. * * Note that this function is also used to register managed input devices * (ones allocated with devm_input_allocate_device()). Such managed input * devices need not be explicitly unregistered or freed, their tear down * is controlled by the devres infrastructure. It is also worth noting * that tear down of managed input devices is internally a 2-step process: * registered managed input device is first unregistered, but stays in * memory and can still handle input_event() calls (although events will * not be delivered anywhere). The freeing of managed input device will * happen later, when devres stack is unwound to the point where device * allocation was made. */ int input_register_device(struct input_dev *dev) { struct input_devres *devres = NULL; struct input_handler *handler; const char *path; int error; if (test_bit(EV_ABS, dev->evbit) && !dev->absinfo) { dev_err(&dev->dev, "Absolute device without dev->absinfo, refusing to register\n"); return -EINVAL; } if (dev->devres_managed) { devres = devres_alloc(devm_input_device_unregister, sizeof(*devres), GFP_KERNEL); if (!devres) return -ENOMEM; devres->input = dev; } /* Every input device generates EV_SYN/SYN_REPORT events. */ __set_bit(EV_SYN, dev->evbit); /* KEY_RESERVED is not supposed to be transmitted to userspace. */ __clear_bit(KEY_RESERVED, dev->keybit); /* Make sure that bitmasks not mentioned in dev->evbit are clean. */ input_cleanse_bitmasks(dev); error = input_device_tune_vals(dev); if (error) goto err_devres_free; /* * If delay and period are pre-set by the driver, then autorepeating * is handled by the driver itself and we don't do it in input.c. */ if (!dev->rep[REP_DELAY] && !dev->rep[REP_PERIOD]) input_enable_softrepeat(dev, 250, 33); if (!dev->getkeycode) dev->getkeycode = input_default_getkeycode; if (!dev->setkeycode) dev->setkeycode = input_default_setkeycode; if (dev->poller) input_dev_poller_finalize(dev->poller); error = device_add(&dev->dev); if (error) goto err_devres_free; path = kobject_get_path(&dev->dev.kobj, GFP_KERNEL); pr_info("%s as %s\n", dev->name ? dev->name : "Unspecified device", path ? path : "N/A"); kfree(path); error = -EINTR; scoped_cond_guard(mutex_intr, goto err_device_del, &input_mutex) { list_add_tail(&dev->node, &input_dev_list); list_for_each_entry(handler, &input_handler_list, node) input_attach_handler(dev, handler); input_wakeup_procfs_readers(); } if (dev->devres_managed) { dev_dbg(dev->dev.parent, "%s: registering %s with devres.\n", __func__, dev_name(&dev->dev)); devres_add(dev->dev.parent, devres); } return 0; err_device_del: device_del(&dev->dev); err_devres_free: devres_free(devres); return error; } EXPORT_SYMBOL(input_register_device); /** * input_unregister_device - unregister previously registered device * @dev: device to be unregistered * * This function unregisters an input device. Once device is unregistered * the caller should not try to access it as it may get freed at any moment. */ void input_unregister_device(struct input_dev *dev) { if (dev->devres_managed) { WARN_ON(devres_destroy(dev->dev.parent, devm_input_device_unregister, devm_input_device_match, dev)); __input_unregister_device(dev); /* * We do not do input_put_device() here because it will be done * when 2nd devres fires up. 
*/ } else { __input_unregister_device(dev); input_put_device(dev); } } EXPORT_SYMBOL(input_unregister_device); static int input_handler_check_methods(const struct input_handler *handler) { int count = 0; if (handler->filter) count++; if (handler->events) count++; if (handler->event) count++; if (count > 1) { pr_err("%s: only one event processing method can be defined (%s)\n", __func__, handler->name); return -EINVAL; } return 0; } /** * input_register_handler - register a new input handler * @handler: handler to be registered * * This function registers a new input handler (interface) for input * devices in the system and attaches it to all input devices that * are compatible with the handler. */ int input_register_handler(struct input_handler *handler) { struct input_dev *dev; int error; error = input_handler_check_methods(handler); if (error) return error; scoped_cond_guard(mutex_intr, return -EINTR, &input_mutex) { INIT_LIST_HEAD(&handler->h_list); list_add_tail(&handler->node, &input_handler_list); list_for_each_entry(dev, &input_dev_list, node) input_attach_handler(dev, handler); input_wakeup_procfs_readers(); } return 0; } EXPORT_SYMBOL(input_register_handler); /** * input_unregister_handler - unregisters an input handler * @handler: handler to be unregistered * * This function disconnects a handler from its input devices and * removes it from lists of known handlers. */ void input_unregister_handler(struct input_handler *handler) { struct input_handle *handle, *next; guard(mutex)(&input_mutex); list_for_each_entry_safe(handle, next, &handler->h_list, h_node) handler->disconnect(handle); WARN_ON(!list_empty(&handler->h_list)); list_del_init(&handler->node); input_wakeup_procfs_readers(); } EXPORT_SYMBOL(input_unregister_handler); /** * input_handler_for_each_handle - handle iterator * @handler: input handler to iterate * @data: data for the callback * @fn: function to be called for each handle * * Iterate over @bus's list of devices, and call @fn for each, passing * it @data and stop when @fn returns a non-zero value. The function is * using RCU to traverse the list and therefore may be using in atomic * contexts. The @fn callback is invoked from RCU critical section and * thus must not sleep. */ int input_handler_for_each_handle(struct input_handler *handler, void *data, int (*fn)(struct input_handle *, void *)) { struct input_handle *handle; int retval; guard(rcu)(); list_for_each_entry_rcu(handle, &handler->h_list, h_node) { retval = fn(handle, data); if (retval) return retval; } return 0; } EXPORT_SYMBOL(input_handler_for_each_handle); /* * An implementation of input_handle's handle_events() method that simply * invokes handler->event() method for each event one by one. */ static unsigned int input_handle_events_default(struct input_handle *handle, struct input_value *vals, unsigned int count) { struct input_handler *handler = handle->handler; struct input_value *v; for (v = vals; v != vals + count; v++) handler->event(handle, v->type, v->code, v->value); return count; } /* * An implementation of input_handle's handle_events() method that invokes * handler->filter() method for each event one by one and removes events * that were filtered out from the "vals" array. 
*/ static unsigned int input_handle_events_filter(struct input_handle *handle, struct input_value *vals, unsigned int count) { struct input_handler *handler = handle->handler; struct input_value *end = vals; struct input_value *v; for (v = vals; v != vals + count; v++) { if (handler->filter(handle, v->type, v->code, v->value)) continue; if (end != v) *end = *v; end++; } return end - vals; } /* * An implementation of input_handle's handle_events() method that does nothing. */ static unsigned int input_handle_events_null(struct input_handle *handle, struct input_value *vals, unsigned int count) { return count; } /* * Sets up appropriate handle->event_handler based on the input_handler * associated with the handle. */ static void input_handle_setup_event_handler(struct input_handle *handle) { struct input_handler *handler = handle->handler; if (handler->filter) handle->handle_events = input_handle_events_filter; else if (handler->event) handle->handle_events = input_handle_events_default; else if (handler->events) handle->handle_events = handler->events; else handle->handle_events = input_handle_events_null; } /** * input_register_handle - register a new input handle * @handle: handle to register * * This function puts a new input handle onto device's * and handler's lists so that events can flow through * it once it is opened using input_open_device(). * * This function is supposed to be called from handler's * connect() method. */ int input_register_handle(struct input_handle *handle) { struct input_handler *handler = handle->handler; struct input_dev *dev = handle->dev; input_handle_setup_event_handler(handle); /* * We take dev->mutex here to prevent race with * input_release_device(). */ scoped_cond_guard(mutex_intr, return -EINTR, &dev->mutex) { /* * Filters go to the head of the list, normal handlers * to the tail. */ if (handler->filter) list_add_rcu(&handle->d_node, &dev->h_list); else list_add_tail_rcu(&handle->d_node, &dev->h_list); } /* * Since we are supposed to be called from ->connect() * which is mutually exclusive with ->disconnect() * we can't be racing with input_unregister_handle() * and so separate lock is not needed here. */ list_add_tail_rcu(&handle->h_node, &handler->h_list); if (handler->start) handler->start(handle); return 0; } EXPORT_SYMBOL(input_register_handle); /** * input_unregister_handle - unregister an input handle * @handle: handle to unregister * * This function removes input handle from device's * and handler's lists. * * This function is supposed to be called from handler's * disconnect() method. */ void input_unregister_handle(struct input_handle *handle) { struct input_dev *dev = handle->dev; list_del_rcu(&handle->h_node); /* * Take dev->mutex to prevent race with input_release_device(). */ scoped_guard(mutex, &dev->mutex) list_del_rcu(&handle->d_node); synchronize_rcu(); } EXPORT_SYMBOL(input_unregister_handle); /** * input_get_new_minor - allocates a new input minor number * @legacy_base: beginning or the legacy range to be searched * @legacy_num: size of legacy range * @allow_dynamic: whether we can also take ID from the dynamic range * * This function allocates a new device minor for from input major namespace. * Caller can request legacy minor by specifying @legacy_base and @legacy_num * parameters and whether ID can be allocated from dynamic range if there are * no free IDs in legacy range. 
int input_get_new_minor(int legacy_base, unsigned int legacy_num, bool allow_dynamic) { /* * This function should be called from input handler's ->connect() * methods, which are serialized with input_mutex, so no additional * locking is needed here. */ if (legacy_base >= 0) { int minor = ida_alloc_range(&input_ida, legacy_base, legacy_base + legacy_num - 1, GFP_KERNEL); if (minor >= 0 || !allow_dynamic) return minor; } return ida_alloc_range(&input_ida, INPUT_FIRST_DYNAMIC_DEV, INPUT_MAX_CHAR_DEVICES - 1, GFP_KERNEL); } EXPORT_SYMBOL(input_get_new_minor); /** * input_free_minor - release previously allocated minor * @minor: minor to be released * * This function releases previously allocated input minor so that it can be * reused later. */ void input_free_minor(unsigned int minor) { ida_free(&input_ida, minor); } EXPORT_SYMBOL(input_free_minor); static int __init input_init(void) { int err; err = class_register(&input_class); if (err) { pr_err("unable to register input_dev class\n"); return err; } err = input_proc_init(); if (err) goto fail1; err = register_chrdev_region(MKDEV(INPUT_MAJOR, 0), INPUT_MAX_CHAR_DEVICES, "input"); if (err) { pr_err("unable to register char major %d", INPUT_MAJOR); goto fail2; } return 0; fail2: input_proc_exit(); fail1: class_unregister(&input_class); return err; } static void __exit input_exit(void) { input_proc_exit(); unregister_chrdev_region(MKDEV(INPUT_MAJOR, 0), INPUT_MAX_CHAR_DEVICES); class_unregister(&input_class); } subsys_initcall(input_init); module_exit(input_exit);
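/*
 * Illustrative sketch (not part of input.c): how a minimal driver might use
 * the allocation/registration API above. The example_* identifiers and the
 * device strings are hypothetical; input_report_key() and input_sync() are
 * the usual event helpers declared in <linux/input.h>.
 */
#include <linux/input.h>
#include <linux/module.h>

static struct input_dev *example_btn;

static int __init example_btn_init(void)
{
	int error;

	example_btn = input_allocate_device();
	if (!example_btn)
		return -ENOMEM;

	example_btn->name = "Example Button";	/* hypothetical device */
	example_btn->phys = "example/input0";
	example_btn->id.bustype = BUS_HOST;

	/* Declares KEY_ENTER and sets EV_KEY in evbit, see input_set_capability(). */
	input_set_capability(example_btn, EV_KEY, KEY_ENTER);

	error = input_register_device(example_btn);
	if (error) {
		/* Not registered yet, so free directly. */
		input_free_device(example_btn);
		return error;
	}

	return 0;
}

/* Would be called from the (omitted) interrupt or polling handler. */
static void __maybe_unused example_btn_report(bool pressed)
{
	input_report_key(example_btn, KEY_ENTER, pressed);
	input_sync(example_btn);	/* EV_SYN/SYN_REPORT closes the packet */
}

static void __exit example_btn_exit(void)
{
	/* Registered devices are torn down (and eventually freed) this way. */
	input_unregister_device(example_btn);
}

module_init(example_btn_init);
module_exit(example_btn_exit);
MODULE_LICENSE("GPL");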
/* SPDX-License-Identifier: GPL-2.0 */ /* * Percpu refcounts: * (C) 2012 Google, Inc. * Author: Kent Overstreet <koverstreet@google.com> * * This implements a refcount with similar semantics to atomic_t - atomic_inc(), * atomic_dec_and_test() - but percpu. * * There's one important difference between percpu refs and normal atomic_t * refcounts; you have to keep track of your initial refcount, and then when you * start shutting down you call percpu_ref_kill() _before_ dropping the initial * refcount. * * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less * than an atomic_t - this is because of the way shutdown works, see * percpu_ref_kill()/PERCPU_COUNT_BIAS. * * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill() * puts the ref back in single atomic_t mode, collecting the per cpu refs and * issuing the appropriate barriers, and then marks the ref as shutting down so * that percpu_ref_put() will check for the ref hitting 0. After it returns, * it's safe to drop the initial ref. * * USAGE: * * See fs/aio.c for some example usage; it's used there for struct kioctx, which * is created when userspace calls io_setup(), and destroyed when userspace * calls io_destroy() or the process exits. * * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it * removes the kioctx from the process's table of kioctxs and kills percpu_ref. * After that, there can't be any new users of the kioctx (from lookup_ioctx()) * and it's then safe to drop the initial ref with percpu_ref_put(). * * Note that the free path, free_ioctx(), needs to go through explicit call_rcu() * to synchronize with RCU protected lookup_ioctx(). percpu_ref operations don't * imply RCU grace periods of any kind and if a user wants to combine percpu_ref * with RCU protection, it must be done explicitly.
* * Code that does a two stage shutdown like this often needs some kind of * explicit synchronization to ensure the initial refcount can only be dropped * once - percpu_ref_kill() does this for you, it returns true once and false if * someone else already called it. The aio code uses it this way, but it's not * necessary if the code has some other mechanism to synchronize teardown. * around. */ #ifndef _LINUX_PERCPU_REFCOUNT_H #define _LINUX_PERCPU_REFCOUNT_H #include <linux/atomic.h> #include <linux/percpu.h> #include <linux/rcupdate.h> #include <linux/types.h> #include <linux/gfp.h> struct percpu_ref; typedef void (percpu_ref_func_t)(struct percpu_ref *); /* flags set in the lower bits of percpu_ref->percpu_count_ptr */ enum { __PERCPU_REF_ATOMIC = 1LU << 0, /* operating in atomic mode */ __PERCPU_REF_DEAD = 1LU << 1, /* (being) killed */ __PERCPU_REF_ATOMIC_DEAD = __PERCPU_REF_ATOMIC | __PERCPU_REF_DEAD, __PERCPU_REF_FLAG_BITS = 2, }; /* @flags for percpu_ref_init() */ enum { /* * Start w/ ref == 1 in atomic mode. Can be switched to percpu * operation using percpu_ref_switch_to_percpu(). If initialized * with this flag, the ref will stay in atomic mode until * percpu_ref_switch_to_percpu() is invoked on it. * Implies ALLOW_REINIT. */ PERCPU_REF_INIT_ATOMIC = 1 << 0, /* * Start dead w/ ref == 0 in atomic mode. Must be revived with * percpu_ref_reinit() before used. Implies INIT_ATOMIC and * ALLOW_REINIT. */ PERCPU_REF_INIT_DEAD = 1 << 1, /* * Allow switching from atomic mode to percpu mode. */ PERCPU_REF_ALLOW_REINIT = 1 << 2, }; struct percpu_ref_data { atomic_long_t count; percpu_ref_func_t *release; percpu_ref_func_t *confirm_switch; bool force_atomic:1; bool allow_reinit:1; struct rcu_head rcu; struct percpu_ref *ref; }; struct percpu_ref { /* * The low bit of the pointer indicates whether the ref is in percpu * mode; if set, then get/put will manipulate the atomic_t. */ unsigned long percpu_count_ptr; /* * 'percpu_ref' is often embedded into user structure, and only * 'percpu_count_ptr' is required in fast path, move other fields * into 'percpu_ref_data', so we can reduce memory footprint in * fast path. */ struct percpu_ref_data *data; }; int __must_check percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, unsigned int flags, gfp_t gfp); void percpu_ref_exit(struct percpu_ref *ref); void percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch); void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref); void percpu_ref_switch_to_percpu(struct percpu_ref *ref); void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill); void percpu_ref_resurrect(struct percpu_ref *ref); void percpu_ref_reinit(struct percpu_ref *ref); bool percpu_ref_is_zero(struct percpu_ref *ref); /** * percpu_ref_kill - drop the initial ref * @ref: percpu_ref to kill * * Must be used to drop the initial ref on a percpu refcount; must be called * precisely once before shutdown. * * Switches @ref into atomic mode before gathering up the percpu counters * and dropping the initial ref. * * There are no implied RCU grace periods between kill and release. */ static inline void percpu_ref_kill(struct percpu_ref *ref) { percpu_ref_kill_and_confirm(ref, NULL); } /* * Internal helper. Don't use outside percpu-refcount proper. The * function doesn't return the pointer and let the caller test it for NULL * because doing so forces the compiler to generate two conditional * branches as it can't assume that @ref->percpu_count is not NULL. 
*/ static inline bool __ref_is_percpu(struct percpu_ref *ref, unsigned long __percpu **percpu_countp) { unsigned long percpu_ptr; /* * The value of @ref->percpu_count_ptr is tested for * !__PERCPU_REF_ATOMIC, which may be set asynchronously, and then * used as a pointer. If the compiler generates a separate fetch * when using it as a pointer, __PERCPU_REF_ATOMIC may be set in * between contaminating the pointer value, meaning that * READ_ONCE() is required when fetching it. * * The dependency ordering from the READ_ONCE() pairs * with smp_store_release() in __percpu_ref_switch_to_percpu(). */ percpu_ptr = READ_ONCE(ref->percpu_count_ptr); /* * Theoretically, the following could test just ATOMIC; however, * then we'd have to mask off DEAD separately as DEAD may be * visible without ATOMIC if we race with percpu_ref_kill(). DEAD * implies ATOMIC anyway. Test them together. */ if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC_DEAD)) return false; *percpu_countp = (unsigned long __percpu *)percpu_ptr; return true; } /** * percpu_ref_get_many - increment a percpu refcount * @ref: percpu_ref to get * @nr: number of references to get * * Analogous to atomic_long_add(). * * This function is safe to call as long as @ref is between init and exit. */ static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr) { unsigned long __percpu *percpu_count; rcu_read_lock(); if (__ref_is_percpu(ref, &percpu_count)) this_cpu_add(*percpu_count, nr); else atomic_long_add(nr, &ref->data->count); rcu_read_unlock(); } /** * percpu_ref_get - increment a percpu refcount * @ref: percpu_ref to get * * Analogous to atomic_long_inc(). * * This function is safe to call as long as @ref is between init and exit. */ static inline void percpu_ref_get(struct percpu_ref *ref) { percpu_ref_get_many(ref, 1); } /** * percpu_ref_tryget_many - try to increment a percpu refcount * @ref: percpu_ref to try-get * @nr: number of references to get * * Increment a percpu refcount by @nr unless its count already reached zero. * Returns %true on success; %false on failure. * * This function is safe to call as long as @ref is between init and exit. */ static inline bool percpu_ref_tryget_many(struct percpu_ref *ref, unsigned long nr) { unsigned long __percpu *percpu_count; bool ret; rcu_read_lock(); if (__ref_is_percpu(ref, &percpu_count)) { this_cpu_add(*percpu_count, nr); ret = true; } else { ret = atomic_long_add_unless(&ref->data->count, nr, 0); } rcu_read_unlock(); return ret; } /** * percpu_ref_tryget - try to increment a percpu refcount * @ref: percpu_ref to try-get * * Increment a percpu refcount unless its count already reached zero. * Returns %true on success; %false on failure. * * This function is safe to call as long as @ref is between init and exit. */ static inline bool percpu_ref_tryget(struct percpu_ref *ref) { return percpu_ref_tryget_many(ref, 1); } /** * percpu_ref_tryget_live_rcu - same as percpu_ref_tryget_live() but the * caller is responsible for taking RCU. * * This function is safe to call as long as @ref is between init and exit. 
*/ static inline bool percpu_ref_tryget_live_rcu(struct percpu_ref *ref) { unsigned long __percpu *percpu_count; bool ret = false; WARN_ON_ONCE(!rcu_read_lock_held()); if (likely(__ref_is_percpu(ref, &percpu_count))) { this_cpu_inc(*percpu_count); ret = true; } else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) { ret = atomic_long_inc_not_zero(&ref->data->count); } return ret; } /** * percpu_ref_tryget_live - try to increment a live percpu refcount * @ref: percpu_ref to try-get * * Increment a percpu refcount unless it has already been killed. Returns * %true on success; %false on failure. * * Completion of percpu_ref_kill() in itself doesn't guarantee that this * function will fail. For such guarantee, percpu_ref_kill_and_confirm() * should be used. After the confirm_kill callback is invoked, it's * guaranteed that no new reference will be given out by * percpu_ref_tryget_live(). * * This function is safe to call as long as @ref is between init and exit. */ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) { bool ret = false; rcu_read_lock(); ret = percpu_ref_tryget_live_rcu(ref); rcu_read_unlock(); return ret; } /** * percpu_ref_put_many - decrement a percpu refcount * @ref: percpu_ref to put * @nr: number of references to put * * Decrement the refcount, and if 0, call the release function (which was passed * to percpu_ref_init()) * * This function is safe to call as long as @ref is between init and exit. */ static inline void percpu_ref_put_many(struct percpu_ref *ref, unsigned long nr) { unsigned long __percpu *percpu_count; rcu_read_lock(); if (__ref_is_percpu(ref, &percpu_count)) this_cpu_sub(*percpu_count, nr); else if (unlikely(atomic_long_sub_and_test(nr, &ref->data->count))) ref->data->release(ref); rcu_read_unlock(); } /** * percpu_ref_put - decrement a percpu refcount * @ref: percpu_ref to put * * Decrement the refcount, and if 0, call the release function (which was passed * to percpu_ref_init()) * * This function is safe to call as long as @ref is between init and exit. */ static inline void percpu_ref_put(struct percpu_ref *ref) { percpu_ref_put_many(ref, 1); } /** * percpu_ref_is_dying - test whether a percpu refcount is dying or dead * @ref: percpu_ref to test * * Returns %true if @ref is dying or dead. * * This function is safe to call as long as @ref is between init and exit * and the caller is responsible for synchronizing against state changes. */ static inline bool percpu_ref_is_dying(struct percpu_ref *ref) { return ref->percpu_count_ptr & __PERCPU_REF_DEAD; } #endif
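/*
 * Illustrative sketch (not part of this header): one common lifecycle for a
 * percpu_ref embedded in a user structure. The example_ctx type and the
 * example_* functions are hypothetical; only the percpu_ref_*() calls come
 * from the API declared above.
 */
#include <linux/container_of.h>
#include <linux/percpu-refcount.h>
#include <linux/slab.h>

struct example_ctx {
	struct percpu_ref ref;
	/* ... user data ... */
};

/* Runs once the last reference is dropped; may be called in atomic context. */
static void example_ctx_release(struct percpu_ref *ref)
{
	struct example_ctx *ctx = container_of(ref, struct example_ctx, ref);

	percpu_ref_exit(&ctx->ref);	/* frees the percpu counters and data */
	kfree(ctx);
}

struct example_ctx *example_ctx_create(void)
{
	struct example_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

	if (!ctx)
		return NULL;

	/* Starts in percpu mode with the initial ref held by the creator. */
	if (percpu_ref_init(&ctx->ref, example_ctx_release, 0, GFP_KERNEL)) {
		kfree(ctx);
		return NULL;
	}
	return ctx;
}

/* Fast-path users take and drop short-lived references. */
void example_ctx_use(struct example_ctx *ctx)
{
	if (percpu_ref_tryget_live(&ctx->ref)) {
		/* ... operate on ctx ... */
		percpu_ref_put(&ctx->ref);
	}
}

/*
 * Teardown: percpu_ref_kill() switches to atomic mode and drops the initial
 * ref; example_ctx_release() runs after every outstanding get has been put.
 */
void example_ctx_destroy(struct example_ctx *ctx)
{
	percpu_ref_kill(&ctx->ref);
}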
// SPDX-License-Identifier: GPL-2.0 /* * Tty buffer allocation management */ #include <linux/types.h> #include <linux/errno.h> #include <linux/minmax.h> #include <linux/tty.h> #include <linux/tty_buffer.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/delay.h> #include <linux/module.h> #include <linux/ratelimit.h> #include "tty.h" #define MIN_TTYB_SIZE 256 #define TTYB_ALIGN_MASK 0xff /* * Byte threshold to limit memory consumption for flip buffers. * The actual memory limit is > 2x this amount. */ #define TTYB_DEFAULT_MEM_LIMIT (640 * 1024UL) /* * We default to dicing tty buffer allocations to this many characters * in order to avoid multiple page allocations. We know the size of * tty_buffer itself but it must also be taken into account that the * buffer is 256 byte aligned. See tty_buffer_alloc() for the allocation * logic this must match.
*/ #define TTY_BUFFER_PAGE (((PAGE_SIZE - sizeof(struct tty_buffer)) / 2) & ~TTYB_ALIGN_MASK) /** * tty_buffer_lock_exclusive - gain exclusive access to buffer * @port: tty port owning the flip buffer * * Guarantees safe use of the &tty_ldisc_ops.receive_buf() method by excluding * the buffer work and any pending flush from using the flip buffer. Data can * continue to be added concurrently to the flip buffer from the driver side. * * See also tty_buffer_unlock_exclusive(). */ void tty_buffer_lock_exclusive(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; atomic_inc(&buf->priority); mutex_lock(&buf->lock); } EXPORT_SYMBOL_GPL(tty_buffer_lock_exclusive); static bool tty_buffer_queue_work(struct tty_bufhead *buf) { struct workqueue_struct *flip_wq = READ_ONCE(buf->flip_wq); return queue_work(flip_wq ?: system_dfl_wq, &buf->work); } /** * tty_buffer_unlock_exclusive - release exclusive access * @port: tty port owning the flip buffer * * The buffer work is restarted if there is data in the flip buffer. * * See also tty_buffer_lock_exclusive(). */ void tty_buffer_unlock_exclusive(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; bool restart = buf->head->commit != buf->head->read; atomic_dec(&buf->priority); mutex_unlock(&buf->lock); if (restart) tty_buffer_queue_work(buf); } EXPORT_SYMBOL_GPL(tty_buffer_unlock_exclusive); /** * tty_buffer_space_avail - return unused buffer space * @port: tty port owning the flip buffer * * Returns: the # of bytes which can be written by the driver without reaching * the buffer limit. * * Note: this does not guarantee that memory is available to write the returned * # of bytes (use tty_prepare_flip_string() to pre-allocate if memory * guarantee is required). */ unsigned int tty_buffer_space_avail(struct tty_port *port) { int space = port->buf.mem_limit - atomic_read(&port->buf.mem_used); return max(space, 0); } EXPORT_SYMBOL_GPL(tty_buffer_space_avail); static void tty_buffer_reset(struct tty_buffer *p, size_t size) { p->used = 0; p->size = size; p->next = NULL; p->commit = 0; p->lookahead = 0; p->read = 0; p->flags = true; } /** * tty_buffer_free_all - free buffers used by a tty * @port: tty port to free from * * Remove all the buffers pending on a tty whether queued with data or in the * free ring. Must be called when the tty is no longer in use. */ void tty_buffer_free_all(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; struct tty_buffer *p, *next; struct llist_node *llist; unsigned int freed = 0; int still_used; while ((p = buf->head) != NULL) { buf->head = p->next; freed += p->size; if (p->size > 0) kfree(p); } llist = llist_del_all(&buf->free); llist_for_each_entry_safe(p, next, llist, free) kfree(p); tty_buffer_reset(&buf->sentinel, 0); buf->head = &buf->sentinel; buf->tail = &buf->sentinel; still_used = atomic_xchg(&buf->mem_used, 0); WARN(still_used != freed, "we still have not freed %d bytes!", still_used - freed); } /** * tty_buffer_alloc - allocate a tty buffer * @port: tty port * @size: desired size (characters) * * Allocate a new tty buffer to hold the desired number of characters. We * round our buffers off in 256 character chunks to get better allocation * behaviour. * * Returns: %NULL if out of memory or the allocation would exceed the per * device queue. 
*/ static struct tty_buffer *tty_buffer_alloc(struct tty_port *port, size_t size) { struct llist_node *free; struct tty_buffer *p; /* Round the buffer size out */ size = __ALIGN_MASK(size, TTYB_ALIGN_MASK); if (size <= MIN_TTYB_SIZE) { free = llist_del_first(&port->buf.free); if (free) { p = llist_entry(free, struct tty_buffer, free); goto found; } } /* Should possibly check if this fails for the largest buffer we * have queued and recycle that ? */ if (atomic_read(&port->buf.mem_used) > port->buf.mem_limit) return NULL; p = kmalloc_flex(*p, data, 2 * size, GFP_ATOMIC | __GFP_NOWARN); if (p == NULL) return NULL; found: tty_buffer_reset(p, size); atomic_add(size, &port->buf.mem_used); return p; } /** * tty_buffer_free - free a tty buffer * @port: tty port owning the buffer * @b: the buffer to free * * Free a tty buffer, or add it to the free list according to our internal * strategy. */ static void tty_buffer_free(struct tty_port *port, struct tty_buffer *b) { struct tty_bufhead *buf = &port->buf; /* Dumb strategy for now - should keep some stats */ WARN_ON(atomic_sub_return(b->size, &buf->mem_used) < 0); if (b->size > MIN_TTYB_SIZE) kfree(b); else if (b->size > 0) llist_add(&b->free, &buf->free); } /** * tty_buffer_flush - flush full tty buffers * @tty: tty to flush * @ld: optional ldisc ptr (must be referenced) * * Flush all the buffers containing receive data. If @ld != %NULL, flush the * ldisc input buffer. * * Locking: takes buffer lock to ensure single-threaded flip buffer 'consumer'. */ void tty_buffer_flush(struct tty_struct *tty, struct tty_ldisc *ld) { struct tty_port *port = tty->port; struct tty_bufhead *buf = &port->buf; struct tty_buffer *next; atomic_inc(&buf->priority); mutex_lock(&buf->lock); /* paired w/ release in __tty_buffer_request_room; ensures there are * no pending memory accesses to the freed buffer */ while ((next = smp_load_acquire(&buf->head->next)) != NULL) { tty_buffer_free(port, buf->head); buf->head = next; } buf->head->read = buf->head->commit; buf->head->lookahead = buf->head->read; if (ld && ld->ops->flush_buffer) ld->ops->flush_buffer(tty); atomic_dec(&buf->priority); mutex_unlock(&buf->lock); } /** * __tty_buffer_request_room - grow tty buffer if needed * @port: tty port * @size: size desired * @flags: buffer has to store flags along character data * * Make at least @size bytes of linear space available for the tty buffer. * * Will change over to a new buffer if the current buffer is encoded as * %TTY_NORMAL (so has no flags buffer) and the new buffer requires a flags * buffer. * * Returns: the size we managed to find. */ static int __tty_buffer_request_room(struct tty_port *port, size_t size, bool flags) { struct tty_bufhead *buf = &port->buf; struct tty_buffer *n, *b = buf->tail; size_t left = (b->flags ? 1 : 2) * b->size - b->used; bool change = !b->flags && flags; if (!change && left >= size) return size; /* This is the slow path - looking for new buffers to use */ n = tty_buffer_alloc(port, size); if (n == NULL) return change ? 0 : left; n->flags = flags; buf->tail = n; /* * Paired w/ acquire in flush_to_ldisc() and lookahead_bufs() * ensures they see all buffer data. */ smp_store_release(&b->commit, b->used); /* * Paired w/ acquire in flush_to_ldisc() and lookahead_bufs() * ensures the latest commit value can be read before the head * is advanced to the next buffer. 
*/ smp_store_release(&b->next, n); return size; } int tty_buffer_request_room(struct tty_port *port, size_t size) { return __tty_buffer_request_room(port, size, true); } EXPORT_SYMBOL_GPL(tty_buffer_request_room); size_t __tty_insert_flip_string_flags(struct tty_port *port, const u8 *chars, const u8 *flags, bool mutable_flags, size_t size) { bool need_flags = mutable_flags || flags[0] != TTY_NORMAL; size_t copied = 0; do { size_t goal = min_t(size_t, size - copied, TTY_BUFFER_PAGE); size_t space = __tty_buffer_request_room(port, goal, need_flags); struct tty_buffer *tb = port->buf.tail; if (unlikely(space == 0)) break; memcpy(char_buf_ptr(tb, tb->used), chars, space); if (mutable_flags) { memcpy(flag_buf_ptr(tb, tb->used), flags, space); flags += space; } else if (tb->flags) { memset(flag_buf_ptr(tb, tb->used), flags[0], space); } else { /* tb->flags should be available once requested */ WARN_ON_ONCE(need_flags); } tb->used += space; copied += space; chars += space; /* There is a small chance that we need to split the data over * several buffers. If this is the case we must loop. */ } while (unlikely(size > copied)); return copied; } EXPORT_SYMBOL(__tty_insert_flip_string_flags); /** * tty_prepare_flip_string - make room for characters * @port: tty port * @chars: return pointer for character write area * @size: desired size * * Prepare a block of space in the buffer for data. * * This is used for drivers that need their own block copy routines into the * buffer. There is no guarantee the buffer is a DMA target! * * Returns: the length available and buffer pointer (@chars) to the space which * is now allocated and accounted for as ready for normal characters. */ size_t tty_prepare_flip_string(struct tty_port *port, u8 **chars, size_t size) { size_t space = __tty_buffer_request_room(port, size, false); if (likely(space)) { struct tty_buffer *tb = port->buf.tail; *chars = char_buf_ptr(tb, tb->used); if (tb->flags) memset(flag_buf_ptr(tb, tb->used), TTY_NORMAL, space); tb->used += space; } return space; } EXPORT_SYMBOL_GPL(tty_prepare_flip_string); /** * tty_ldisc_receive_buf - forward data to line discipline * @ld: line discipline to process input * @p: char buffer * @f: %TTY_NORMAL, %TTY_BREAK, etc. flags buffer * @count: number of bytes to process * * Callers other than flush_to_ldisc() need to exclude the kworker from * concurrent use of the line discipline, see paste_selection(). * * Returns: the number of bytes processed. */ size_t tty_ldisc_receive_buf(struct tty_ldisc *ld, const u8 *p, const u8 *f, size_t count) { if (ld->ops->receive_buf2) count = ld->ops->receive_buf2(ld->tty, p, f, count); else { count = min_t(size_t, count, ld->tty->receive_room); if (count && ld->ops->receive_buf) ld->ops->receive_buf(ld->tty, p, f, count); } return count; } EXPORT_SYMBOL_GPL(tty_ldisc_receive_buf); static void lookahead_bufs(struct tty_port *port, struct tty_buffer *head) { head->lookahead = max(head->lookahead, head->read); while (head) { struct tty_buffer *next; unsigned int count; /* * Paired w/ release in __tty_buffer_request_room(); * ensures commit value read is not stale if the head * is advancing to the next buffer. */ next = smp_load_acquire(&head->next); /* * Paired w/ release in __tty_buffer_request_room() or in * tty_buffer_flush(); ensures we see the committed buffer data. 
*/ count = smp_load_acquire(&head->commit) - head->lookahead; if (!count) { head = next; continue; } if (port->client_ops->lookahead_buf) { u8 *p, *f = NULL; p = char_buf_ptr(head, head->lookahead); if (head->flags) f = flag_buf_ptr(head, head->lookahead); port->client_ops->lookahead_buf(port, p, f, count); } head->lookahead += count; } } static size_t receive_buf(struct tty_port *port, struct tty_buffer *head, size_t count) { u8 *p = char_buf_ptr(head, head->read); const u8 *f = NULL; size_t n; if (head->flags) f = flag_buf_ptr(head, head->read); n = port->client_ops->receive_buf(port, p, f, count); if (n > 0) memset(p, 0, n); return n; } /** * flush_to_ldisc - flush data from buffer to ldisc * @work: tty structure passed from work queue. * * This routine is called out of the software interrupt to flush data from the * buffer chain to the line discipline. * * The receive_buf() method is single threaded for each tty instance. * * Locking: takes buffer lock to ensure single-threaded flip buffer 'consumer'. */ static void flush_to_ldisc(struct work_struct *work) { struct tty_port *port = container_of(work, struct tty_port, buf.work); struct tty_bufhead *buf = &port->buf; mutex_lock(&buf->lock); while (1) { struct tty_buffer *head = buf->head; struct tty_buffer *next; size_t count, rcvd; /* Ldisc or user is trying to gain exclusive access */ if (atomic_read(&buf->priority)) break; /* paired w/ release in __tty_buffer_request_room(); * ensures commit value read is not stale if the head * is advancing to the next buffer */ next = smp_load_acquire(&head->next); /* paired w/ release in __tty_buffer_request_room() or in * tty_buffer_flush(); ensures we see the committed buffer data */ count = smp_load_acquire(&head->commit) - head->read; if (!count) { if (next == NULL) break; buf->head = next; tty_buffer_free(port, head); continue; } rcvd = receive_buf(port, head, count); head->read += rcvd; if (rcvd < count) lookahead_bufs(port, head); if (!rcvd) break; cond_resched(); } mutex_unlock(&buf->lock); } static inline void tty_flip_buffer_commit(struct tty_buffer *tail) { /* * Paired w/ acquire in flush_to_ldisc(); ensures flush_to_ldisc() sees * buffer data. */ smp_store_release(&tail->commit, tail->used); } /** * tty_flip_buffer_push - push terminal buffers * @port: tty port to push * * Queue a push of the terminal flip buffers to the line discipline. Can be * called from IRQ/atomic context. * * In the event of the queue being busy for flipping the work will be held off * and retried later. */ void tty_flip_buffer_push(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; tty_flip_buffer_commit(buf->tail); tty_buffer_queue_work(buf); } EXPORT_SYMBOL(tty_flip_buffer_push); /** * tty_insert_flip_string_and_push_buffer - add characters to the tty buffer and * push * @port: tty port * @chars: characters * @size: size * * The function combines tty_insert_flip_string() and tty_flip_buffer_push() * with the exception of properly holding the @port->lock. * * To be used only internally (by pty currently). * * Returns: the number added. 
*/ int tty_insert_flip_string_and_push_buffer(struct tty_port *port, const u8 *chars, size_t size) { struct tty_bufhead *buf = &port->buf; unsigned long flags; spin_lock_irqsave(&port->lock, flags); size = tty_insert_flip_string(port, chars, size); if (size) tty_flip_buffer_commit(buf->tail); spin_unlock_irqrestore(&port->lock, flags); tty_buffer_queue_work(buf); return size; } /** * tty_buffer_init - prepare a tty buffer structure * @port: tty port to initialise * * Set up the initial state of the buffer management for a tty device. Must be * called before the other tty buffer functions are used. */ void tty_buffer_init(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; mutex_init(&buf->lock); tty_buffer_reset(&buf->sentinel, 0); buf->head = &buf->sentinel; buf->tail = &buf->sentinel; init_llist_head(&buf->free); atomic_set(&buf->mem_used, 0); atomic_set(&buf->priority, 0); INIT_WORK(&buf->work, flush_to_ldisc); buf->mem_limit = TTYB_DEFAULT_MEM_LIMIT; } /** * tty_buffer_set_limit - change the tty buffer memory limit * @port: tty port to change * @limit: memory limit to set * * Change the tty buffer memory limit. * * Must be called before the other tty buffer functions are used. */ int tty_buffer_set_limit(struct tty_port *port, int limit) { if (limit < MIN_TTYB_SIZE) return -EINVAL; port->buf.mem_limit = limit; return 0; } EXPORT_SYMBOL_GPL(tty_buffer_set_limit); /* slave ptys can claim nested buffer lock when handling BRK and INTR */ void tty_buffer_set_lock_subclass(struct tty_port *port) { lockdep_set_subclass(&port->buf.lock, TTY_LOCK_SLAVE); } bool tty_buffer_restart_work(struct tty_port *port) { return tty_buffer_queue_work(&port->buf); } bool tty_buffer_cancel_work(struct tty_port *port) { return cancel_work_sync(&port->buf.work); } void tty_buffer_flush_work(struct tty_port *port) { flush_work(&port->buf.work); } |
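/*
 * Illustrative sketch (not part of tty_buffer.c): how a serial driver's
 * receive path typically feeds the flip buffer machinery above. The
 * "struct demo_uart" and demo_uart_*() helpers are hypothetical stand-ins
 * for real hardware accessors; tty_insert_flip_char() and
 * tty_flip_buffer_push() are the real helpers built on
 * __tty_buffer_request_room() and tty_buffer_alloc().
 */
#include <linux/tty.h>
#include <linux/tty_flip.h>

struct demo_uart {
	struct tty_port port;
	/* ... hypothetical device state ... */
};

/* hypothetical hardware accessors, assumed to exist for the example */
bool demo_uart_rx_ready(struct demo_uart *uart);
u8 demo_uart_read_char(struct demo_uart *uart);
bool demo_uart_frame_error(struct demo_uart *uart);

static void demo_uart_rx(struct demo_uart *uart)
{
	struct tty_port *port = &uart->port;

	while (demo_uart_rx_ready(uart)) {
		u8 flag = demo_uart_frame_error(uart) ? TTY_FRAME : TTY_NORMAL;
		u8 ch = demo_uart_read_char(uart);

		/* copies into the tail tty_buffer, growing it as needed */
		if (!tty_insert_flip_char(port, ch, flag))
			break;		/* hit the per-port memory limit */
	}

	/* commit what was queued and schedule flush_to_ldisc() */
	tty_flip_buffer_push(port);
}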
/* SPDX-License-Identifier: GPL-2.0+ */ /* * the_nilfs shared structure. * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * * Written by Ryusuke Konishi. * */ #ifndef _THE_NILFS_H #define _THE_NILFS_H #include <linux/types.h> #include <linux/buffer_head.h> #include <linux/rbtree.h> #include <linux/fs.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/slab.h> #include <linux/refcount.h> struct nilfs_sc_info; struct nilfs_sysfs_dev_subgroups; /* the_nilfs struct */ enum { THE_NILFS_INIT = 0, /* Information from super_block is set */ THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ THE_NILFS_GC_RUNNING, /* gc process is running */ THE_NILFS_SB_DIRTY, /* super block is dirty */ THE_NILFS_PURGING, /* disposing dirty files for cleanup */ }; /** * struct the_nilfs - struct to supervise multiple nilfs mount points * @ns_flags: flags * @ns_flushed_device: flag indicating if all volatile data was flushed * @ns_sb: back pointer to super block instance * @ns_bdev: block device * @ns_sem: semaphore for shared states * @ns_snapshot_mount_mutex: mutex to protect snapshot mounts * @ns_sbh: buffer heads of on-disk super blocks * @ns_sbp: pointers to super block data * @ns_sbwtime: previous write time of super block * @ns_sbwcount: write count of super block * @ns_sbsize: size of valid data in super block * @ns_mount_state: file system state * @ns_sb_update_freq: interval of periodical update of superblocks (in seconds) * @ns_seg_seq: segment sequence counter * @ns_segnum: index number of the latest full segment.
* @ns_nextnum: index number of the full segment index to be used next * @ns_pseg_offset: offset of next partial segment in the current full segment * @ns_cno: next checkpoint number * @ns_ctime: write time of the last segment * @ns_nongc_ctime: write time of the last segment not for cleaner operation * @ns_ndirtyblks: Number of dirty data blocks * @ns_last_segment_lock: lock protecting fields for the latest segment * @ns_last_pseg: start block number of the latest segment * @ns_last_seq: sequence value of the latest segment * @ns_last_cno: checkpoint number of the latest segment * @ns_prot_seq: least sequence number of segments which must not be reclaimed * @ns_prev_seq: base sequence number used to decide if advance log cursor * @ns_writer: log writer * @ns_segctor_sem: semaphore protecting log write * @ns_dat: DAT file inode * @ns_cpfile: checkpoint file inode * @ns_sufile: segusage file inode * @ns_cptree: rb-tree of all mounted checkpoints (nilfs_root) * @ns_cptree_lock: lock protecting @ns_cptree * @ns_dirty_files: list of dirty files * @ns_inode_lock: lock protecting @ns_dirty_files * @ns_gc_inodes: dummy inodes to keep live blocks * @ns_mount_opt: mount options * @ns_resuid: uid for reserved blocks * @ns_resgid: gid for reserved blocks * @ns_interval: checkpoint creation interval * @ns_watermark: watermark for the number of dirty buffers * @ns_blocksize_bits: bit length of block size * @ns_blocksize: block size * @ns_nsegments: number of segments in filesystem * @ns_blocks_per_segment: number of blocks per segment * @ns_r_segments_percentage: reserved segments percentage * @ns_nrsvsegs: number of reserved segments * @ns_first_data_block: block number of first data block * @ns_inode_size: size of on-disk inode * @ns_first_ino: first not-special inode number * @ns_crc_seed: seed value of CRC32 calculation * @ns_dev_kobj: /sys/fs/<nilfs>/<device> * @ns_dev_kobj_unregister: completion state * @ns_dev_subgroups: <device> subgroups pointer */ struct the_nilfs { unsigned long ns_flags; int ns_flushed_device; struct super_block *ns_sb; struct block_device *ns_bdev; struct rw_semaphore ns_sem; struct mutex ns_snapshot_mount_mutex; /* * used for * - loading the latest checkpoint exclusively. * - allocating a new full segment. */ struct buffer_head *ns_sbh[2]; struct nilfs_super_block *ns_sbp[2]; time64_t ns_sbwtime; unsigned int ns_sbwcount; unsigned int ns_sbsize; unsigned int ns_mount_state; unsigned int ns_sb_update_freq; /* * The following fields are updated by a writable FS-instance. * These fields are protected by ns_segctor_sem outside load_nilfs(). */ u64 ns_seg_seq; __u64 ns_segnum; __u64 ns_nextnum; unsigned long ns_pseg_offset; __u64 ns_cno; time64_t ns_ctime; time64_t ns_nongc_ctime; atomic_t ns_ndirtyblks; /* * The following fields hold information on the latest partial segment * written to disk with a super root. These fields are protected by * ns_last_segment_lock. */ spinlock_t ns_last_segment_lock; sector_t ns_last_pseg; u64 ns_last_seq; __u64 ns_last_cno; u64 ns_prot_seq; u64 ns_prev_seq; struct nilfs_sc_info *ns_writer; struct rw_semaphore ns_segctor_sem; /* * Following fields are lock free except for the period before * the_nilfs is initialized. 
*/ struct inode *ns_dat; struct inode *ns_cpfile; struct inode *ns_sufile; /* Checkpoint tree */ struct rb_root ns_cptree; spinlock_t ns_cptree_lock; /* Dirty inode list */ struct list_head ns_dirty_files; spinlock_t ns_inode_lock; /* GC inode list */ struct list_head ns_gc_inodes; /* Mount options */ unsigned long ns_mount_opt; uid_t ns_resuid; gid_t ns_resgid; unsigned long ns_interval; unsigned long ns_watermark; /* Disk layout information (static) */ unsigned int ns_blocksize_bits; unsigned int ns_blocksize; unsigned long ns_nsegments; unsigned long ns_blocks_per_segment; unsigned long ns_r_segments_percentage; unsigned long ns_nrsvsegs; unsigned long ns_first_data_block; int ns_inode_size; unsigned int ns_first_ino; u32 ns_crc_seed; /* /sys/fs/<nilfs>/<device> */ struct kobject ns_dev_kobj; struct completion ns_dev_kobj_unregister; struct nilfs_sysfs_dev_subgroups *ns_dev_subgroups; }; #define THE_NILFS_FNS(bit, name) \ static inline void set_nilfs_##name(struct the_nilfs *nilfs) \ { \ set_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \ } \ static inline void clear_nilfs_##name(struct the_nilfs *nilfs) \ { \ clear_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \ } \ static inline int nilfs_##name(struct the_nilfs *nilfs) \ { \ return test_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \ } THE_NILFS_FNS(INIT, init) THE_NILFS_FNS(DISCONTINUED, discontinued) THE_NILFS_FNS(GC_RUNNING, gc_running) THE_NILFS_FNS(SB_DIRTY, sb_dirty) THE_NILFS_FNS(PURGING, purging) /* * Mount option operations */ #define nilfs_clear_opt(nilfs, opt) \ ((nilfs)->ns_mount_opt &= ~NILFS_MOUNT_##opt) #define nilfs_set_opt(nilfs, opt) \ ((nilfs)->ns_mount_opt |= NILFS_MOUNT_##opt) #define nilfs_test_opt(nilfs, opt) ((nilfs)->ns_mount_opt & NILFS_MOUNT_##opt) /** * struct nilfs_root - nilfs root object * @cno: checkpoint number * @rb_node: red-black tree node * @count: refcount of this structure * @nilfs: nilfs object * @ifile: inode file * @inodes_count: number of inodes * @blocks_count: number of blocks * @snapshot_kobj: /sys/fs/<nilfs>/<device>/mounted_snapshots/<snapshot> * @snapshot_kobj_unregister: completion state for kernel object */ struct nilfs_root { __u64 cno; struct rb_node rb_node; refcount_t count; struct the_nilfs *nilfs; struct inode *ifile; atomic64_t inodes_count; atomic64_t blocks_count; /* /sys/fs/<nilfs>/<device>/mounted_snapshots/<snapshot> */ struct kobject snapshot_kobj; struct completion snapshot_kobj_unregister; }; /* Special checkpoint number */ #define NILFS_CPTREE_CURRENT_CNO 0 /* Minimum interval of periodical update of superblocks (in seconds) */ #define NILFS_SB_FREQ 10 static inline int nilfs_sb_need_update(struct the_nilfs *nilfs) { u64 t = ktime_get_real_seconds(); return t < nilfs->ns_sbwtime || t > nilfs->ns_sbwtime + nilfs->ns_sb_update_freq; } static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs) { int flip_bits = nilfs->ns_sbwcount & 0x0FL; return (flip_bits != 0x08 && flip_bits != 0x0F); } void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); struct the_nilfs *alloc_nilfs(struct super_block *sb); void destroy_nilfs(struct the_nilfs *nilfs); int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb); int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb); unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs); void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs); int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t); int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); struct nilfs_root 
*nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno); struct nilfs_root *nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno); void nilfs_put_root(struct nilfs_root *root); int nilfs_near_disk_full(struct the_nilfs *); void nilfs_fall_back_super_block(struct the_nilfs *); void nilfs_swap_super_block(struct the_nilfs *); static inline void nilfs_get_root(struct nilfs_root *root) { refcount_inc(&root->count); } static inline int nilfs_valid_fs(struct the_nilfs *nilfs) { unsigned int valid_fs; down_read(&nilfs->ns_sem); valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS); up_read(&nilfs->ns_sem); return valid_fs; } static inline void nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum, sector_t *seg_start, sector_t *seg_end) { *seg_start = (sector_t)nilfs->ns_blocks_per_segment * segnum; *seg_end = *seg_start + nilfs->ns_blocks_per_segment - 1; if (segnum == 0) *seg_start = nilfs->ns_first_data_block; } static inline sector_t nilfs_get_segment_start_blocknr(struct the_nilfs *nilfs, __u64 segnum) { return (segnum == 0) ? nilfs->ns_first_data_block : (sector_t)nilfs->ns_blocks_per_segment * segnum; } static inline __u64 nilfs_get_segnum_of_block(struct the_nilfs *nilfs, sector_t blocknr) { sector_t segnum = blocknr; sector_div(segnum, nilfs->ns_blocks_per_segment); return segnum; } static inline void nilfs_terminate_segment(struct the_nilfs *nilfs, sector_t seg_start, sector_t seg_end) { /* terminate the current full segment (used in case of I/O-error) */ nilfs->ns_pseg_offset = seg_end - seg_start + 1; } static inline void nilfs_shift_to_next_segment(struct the_nilfs *nilfs) { /* move forward with a full segment */ nilfs->ns_segnum = nilfs->ns_nextnum; nilfs->ns_pseg_offset = 0; nilfs->ns_seg_seq++; } static inline __u64 nilfs_last_cno(struct the_nilfs *nilfs) { __u64 cno; spin_lock(&nilfs->ns_last_segment_lock); cno = nilfs->ns_last_cno; spin_unlock(&nilfs->ns_last_segment_lock); return cno; } static inline int nilfs_segment_is_active(struct the_nilfs *nilfs, __u64 n) { return n == nilfs->ns_segnum || n == nilfs->ns_nextnum; } static inline int nilfs_flush_device(struct the_nilfs *nilfs) { int err; if (!nilfs_test_opt(nilfs, BARRIER) || nilfs->ns_flushed_device) return 0; nilfs->ns_flushed_device = 1; /* * the store to ns_flushed_device must not be reordered after * blkdev_issue_flush(). */ smp_wmb(); err = blkdev_issue_flush(nilfs->ns_bdev); if (err != -EIO) err = 0; return err; } #endif /* _THE_NILFS_H */ |
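/*
 * Illustrative sketch (not part of the_nilfs.h): combining the helpers
 * generated by THE_NILFS_FNS() (e.g. nilfs_sb_dirty() comes from
 * THE_NILFS_FNS(SB_DIRTY, sb_dirty)) with the segment geometry helpers
 * above. demo_describe_segment() is a made-up function for the example;
 * everything it calls is declared in this header.
 */
static inline void demo_describe_segment(struct the_nilfs *nilfs, __u64 segnum)
{
	sector_t seg_start, seg_end;

	/* segment 0 starts at ns_first_data_block, later segments are linear */
	nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);

	pr_info("segment %llu: blocks %llu-%llu%s%s\n",
		(unsigned long long)segnum,
		(unsigned long long)seg_start,
		(unsigned long long)seg_end,
		nilfs_segment_is_active(nilfs, segnum) ? " [active]" : "",
		nilfs_sb_dirty(nilfs) ? " [sb dirty]" : "");
}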
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/seq_file.c * * helper functions for making synthetic files from sequences of records. * initial implementation -- AV, Oct 2001. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/cache.h> #include <linux/fs.h> #include <linux/export.h> #include <linux/hex.h> #include <linux/seq_file.h> #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/mm.h> #include <linux/printk.h> #include <linux/string_helpers.h> #include <linux/uio.h> #include <linux/uaccess.h> #include <asm/page.h> static struct kmem_cache *seq_file_cache __ro_after_init; static void seq_set_overflow(struct seq_file *m) { m->count = m->size; } static void *seq_buf_alloc(unsigned long size) { if (unlikely(size > MAX_RW_COUNT)) return NULL; return kvmalloc(size, GFP_KERNEL_ACCOUNT); } /** * seq_open - initialize sequential file * @file: file we initialize * @op: method table describing the sequence * * seq_open() sets @file, associating it with a sequence described * by @op. @op->start() sets the iterator up and returns the first * element of sequence. @op->stop() shuts it down. @op->next() * returns the next element of sequence. @op->show() prints element * into the buffer. In case of error ->start() and ->next() return * ERR_PTR(error). In the end of sequence they return %NULL. ->show() * returns 0 in case of success and negative number in case of error. * Returning SEQ_SKIP means "discard this element and move on". * Note: seq_open() will allocate a struct seq_file and store its * pointer in @file->private_data. This pointer should not be modified. */ int seq_open(struct file *file, const struct seq_operations *op) { struct seq_file *p; WARN_ON(file->private_data); p = kmem_cache_zalloc(seq_file_cache, GFP_KERNEL); if (!p) return -ENOMEM; file->private_data = p; mutex_init(&p->lock); p->op = op; // No refcounting: the lifetime of 'p' is constrained // to the lifetime of the file. p->file = file; /* * seq_files support lseek() and pread(). They do not implement * write() at all, but we clear FMODE_PWRITE here for historical * reasons. * * If a client of seq_files a) implements file.write() and b) wishes to * support pwrite() then that client will need to implement its own * file.open() which calls seq_open() and then sets FMODE_PWRITE.
*/ file->f_mode &= ~FMODE_PWRITE; return 0; } EXPORT_SYMBOL(seq_open); static int traverse(struct seq_file *m, loff_t offset) { loff_t pos = 0; int error = 0; void *p; m->index = 0; m->count = m->from = 0; if (!offset) return 0; if (!m->buf) { m->buf = seq_buf_alloc(m->size = PAGE_SIZE); if (!m->buf) return -ENOMEM; } p = m->op->start(m, &m->index); while (p) { error = PTR_ERR(p); if (IS_ERR(p)) break; error = m->op->show(m, p); if (error < 0) break; if (unlikely(error)) { error = 0; m->count = 0; } if (seq_has_overflowed(m)) goto Eoverflow; p = m->op->next(m, p, &m->index); if (pos + m->count > offset) { m->from = offset - pos; m->count -= m->from; break; } pos += m->count; m->count = 0; if (pos == offset) break; } m->op->stop(m, p); return error; Eoverflow: m->op->stop(m, p); kvfree(m->buf); m->count = 0; m->buf = seq_buf_alloc(m->size <<= 1); return !m->buf ? -ENOMEM : -EAGAIN; } /** * seq_read - ->read() method for sequential files. * @file: the file to read from * @buf: the buffer to read to * @size: the maximum number of bytes to read * @ppos: the current position in the file * * Ready-made ->f_op->read() */ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct iovec iov = { .iov_base = buf, .iov_len = size}; struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, file); iov_iter_init(&iter, ITER_DEST, &iov, 1, size); kiocb.ki_pos = *ppos; ret = seq_read_iter(&kiocb, &iter); *ppos = kiocb.ki_pos; return ret; } EXPORT_SYMBOL(seq_read); /* * Ready-made ->f_op->read_iter() */ ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct seq_file *m = iocb->ki_filp->private_data; size_t copied = 0; size_t n; void *p; int err = 0; if (!iov_iter_count(iter)) return 0; mutex_lock(&m->lock); /* * if request is to read from zero offset, reset iterator to first * record as it might have been already advanced by previous requests */ if (iocb->ki_pos == 0) { m->index = 0; m->count = 0; } /* Don't assume ki_pos is where we left it */ if (unlikely(iocb->ki_pos != m->read_pos)) { while ((err = traverse(m, iocb->ki_pos)) == -EAGAIN) ; if (err) { /* With prejudice... */ m->read_pos = 0; m->index = 0; m->count = 0; goto Done; } else { m->read_pos = iocb->ki_pos; } } /* grab buffer if we didn't have one */ if (!m->buf) { m->buf = seq_buf_alloc(m->size = PAGE_SIZE); if (!m->buf) goto Enomem; } // something left in the buffer - copy it out first if (m->count) { n = copy_to_iter(m->buf + m->from, m->count, iter); m->count -= n; m->from += n; copied += n; if (m->count) // hadn't managed to copy everything goto Done; } // get a non-empty record in the buffer m->from = 0; p = m->op->start(m, &m->index); while (1) { err = PTR_ERR(p); if (!p || IS_ERR(p)) // EOF or an error break; err = m->op->show(m, p); if (err < 0) // hard error break; if (unlikely(err)) // ->show() says "skip it" m->count = 0; if (unlikely(!m->count)) { // empty record p = m->op->next(m, p, &m->index); continue; } if (!seq_has_overflowed(m)) // got it goto Fill; // need a bigger buffer m->op->stop(m, p); kvfree(m->buf); m->count = 0; m->buf = seq_buf_alloc(m->size <<= 1); if (!m->buf) goto Enomem; p = m->op->start(m, &m->index); } // EOF or an error m->op->stop(m, p); m->count = 0; goto Done; Fill: // one non-empty record is in the buffer; if they want more, // try to fit more in, but in any case we need to advance // the iterator once for every record shown. 
while (1) { size_t offs = m->count; loff_t pos = m->index; p = m->op->next(m, p, &m->index); if (pos == m->index) { pr_info_ratelimited("buggy .next function %ps did not update position index\n", m->op->next); m->index++; } if (!p || IS_ERR(p)) // no next record for us break; if (m->count >= iov_iter_count(iter)) break; err = m->op->show(m, p); if (err > 0) { // ->show() says "skip it" m->count = offs; } else if (err || seq_has_overflowed(m)) { m->count = offs; break; } } m->op->stop(m, p); n = copy_to_iter(m->buf, m->count, iter); copied += n; m->count -= n; m->from = n; Done: if (unlikely(!copied)) { copied = m->count ? -EFAULT : err; } else { iocb->ki_pos += copied; m->read_pos += copied; } mutex_unlock(&m->lock); return copied; Enomem: err = -ENOMEM; goto Done; } EXPORT_SYMBOL(seq_read_iter); /** * seq_lseek - ->llseek() method for sequential files. * @file: the file in question * @offset: new position * @whence: 0 for absolute, 1 for relative position * * Ready-made ->f_op->llseek() */ loff_t seq_lseek(struct file *file, loff_t offset, int whence) { struct seq_file *m = file->private_data; loff_t retval = -EINVAL; mutex_lock(&m->lock); switch (whence) { case SEEK_CUR: offset += file->f_pos; fallthrough; case SEEK_SET: if (offset < 0) break; retval = offset; if (offset != m->read_pos) { while ((retval = traverse(m, offset)) == -EAGAIN) ; if (retval) { /* with extreme prejudice... */ file->f_pos = 0; m->read_pos = 0; m->index = 0; m->count = 0; } else { m->read_pos = offset; retval = file->f_pos = offset; } } else { file->f_pos = offset; } } mutex_unlock(&m->lock); return retval; } EXPORT_SYMBOL(seq_lseek); /** * seq_release - free the structures associated with sequential file. * @inode: its inode * @file: file in question * * Frees the structures associated with sequential file; can be used * as ->f_op->release() if you don't have private data to destroy. */ int seq_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; kvfree(m->buf); kmem_cache_free(seq_file_cache, m); return 0; } EXPORT_SYMBOL(seq_release); /** * seq_escape_mem - print data into buffer, escaping some characters * @m: target buffer * @src: source buffer * @len: size of source buffer * @flags: flags to pass to string_escape_mem() * @esc: set of characters that need escaping * * Puts data into buffer, replacing each occurrence of character from * given class (defined by @flags and @esc) with printable escaped sequence. * * Use seq_has_overflowed() to check for errors. */ void seq_escape_mem(struct seq_file *m, const char *src, size_t len, unsigned int flags, const char *esc) { char *buf; size_t size = seq_get_buf(m, &buf); int ret; ret = string_escape_mem(src, len, buf, size, flags, esc); seq_commit(m, ret < size ? ret : -1); } EXPORT_SYMBOL(seq_escape_mem); void seq_vprintf(struct seq_file *m, const char *f, va_list args) { int len; if (m->count < m->size) { len = vsnprintf(m->buf + m->count, m->size - m->count, f, args); if (m->count + len < m->size) { m->count += len; return; } } seq_set_overflow(m); } EXPORT_SYMBOL(seq_vprintf); void seq_printf(struct seq_file *m, const char *f, ...) 
{ va_list args; va_start(args, f); seq_vprintf(m, f, args); va_end(args); } EXPORT_SYMBOL(seq_printf); #ifdef CONFIG_BINARY_PRINTF void seq_bprintf(struct seq_file *m, const char *f, const u32 *binary) { int len; if (m->count < m->size) { len = bstr_printf(m->buf + m->count, m->size - m->count, f, binary); if (m->count + len < m->size) { m->count += len; return; } } seq_set_overflow(m); } EXPORT_SYMBOL(seq_bprintf); #endif /* CONFIG_BINARY_PRINTF */ /** * mangle_path - mangle and copy path to buffer beginning * @s: buffer start * @p: beginning of path in above buffer * @esc: set of characters that need escaping * * Copy the path from @p to @s, replacing each occurrence of character from * @esc with usual octal escape. * Returns pointer past last written character in @s, or NULL in case of * failure. */ char *mangle_path(char *s, const char *p, const char *esc) { while (s <= p) { char c = *p++; if (!c) { return s; } else if (!strchr(esc, c)) { *s++ = c; } else if (s + 4 > p) { break; } else { *s++ = '\\'; *s++ = '0' + ((c & 0300) >> 6); *s++ = '0' + ((c & 070) >> 3); *s++ = '0' + (c & 07); } } return NULL; } EXPORT_SYMBOL(mangle_path); /** * seq_path - seq_file interface to print a pathname * @m: the seq_file handle * @path: the struct path to print * @esc: set of characters to escape in the output * * return the absolute path of 'path', as represented by the * dentry / mnt pair in the path parameter. */ int seq_path(struct seq_file *m, const struct path *path, const char *esc) { char *buf; size_t size = seq_get_buf(m, &buf); int res = -1; if (size) { char *p = d_path(path, buf, size); if (!IS_ERR(p)) { char *end = mangle_path(buf, p, esc); if (end) res = end - buf; } } seq_commit(m, res); return res; } EXPORT_SYMBOL(seq_path); /** * seq_file_path - seq_file interface to print a pathname of a file * @m: the seq_file handle * @file: the struct file to print * @esc: set of characters to escape in the output * * return the absolute path to the file. */ int seq_file_path(struct seq_file *m, struct file *file, const char *esc) { return seq_path(m, &file->f_path, esc); } EXPORT_SYMBOL(seq_file_path); /* * Same as seq_path, but relative to supplied root. */ int seq_path_root(struct seq_file *m, const struct path *path, const struct path *root, const char *esc) { char *buf; size_t size = seq_get_buf(m, &buf); int res = -ENAMETOOLONG; if (size) { char *p; p = __d_path(path, root, buf, size); if (!p) return SEQ_SKIP; res = PTR_ERR(p); if (!IS_ERR(p)) { char *end = mangle_path(buf, p, esc); if (end) res = end - buf; else res = -ENAMETOOLONG; } } seq_commit(m, res); return res < 0 && res != -ENAMETOOLONG ? res : 0; } /* * returns the path of the 'dentry' from the root of its filesystem. */ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc) { char *buf; size_t size = seq_get_buf(m, &buf); int res = -1; if (size) { char *p = dentry_path(dentry, buf, size); if (!IS_ERR(p)) { char *end = mangle_path(buf, p, esc); if (end) res = end - buf; } } seq_commit(m, res); return res; } EXPORT_SYMBOL(seq_dentry); void *single_start(struct seq_file *p, loff_t *pos) { return *pos ? 
NULL : SEQ_START_TOKEN; } static void *single_next(struct seq_file *p, void *v, loff_t *pos) { ++*pos; return NULL; } static void single_stop(struct seq_file *p, void *v) { } int single_open(struct file *file, int (*show)(struct seq_file *, void *), void *data) { struct seq_operations *op = kmalloc_obj(*op, GFP_KERNEL_ACCOUNT); int res = -ENOMEM; if (op) { op->start = single_start; op->next = single_next; op->stop = single_stop; op->show = show; res = seq_open(file, op); if (!res) ((struct seq_file *)file->private_data)->private = data; else kfree(op); } return res; } EXPORT_SYMBOL(single_open); int single_open_size(struct file *file, int (*show)(struct seq_file *, void *), void *data, size_t size) { char *buf = seq_buf_alloc(size); int ret; if (!buf) return -ENOMEM; ret = single_open(file, show, data); if (ret) { kvfree(buf); return ret; } ((struct seq_file *)file->private_data)->buf = buf; ((struct seq_file *)file->private_data)->size = size; return 0; } EXPORT_SYMBOL(single_open_size); int single_release(struct inode *inode, struct file *file) { const struct seq_operations *op = ((struct seq_file *)file->private_data)->op; int res = seq_release(inode, file); kfree(op); return res; } EXPORT_SYMBOL(single_release); int seq_release_private(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; kfree(seq->private); seq->private = NULL; return seq_release(inode, file); } EXPORT_SYMBOL(seq_release_private); void *__seq_open_private(struct file *f, const struct seq_operations *ops, int psize) { int rc; void *private; struct seq_file *seq; private = kzalloc(psize, GFP_KERNEL_ACCOUNT); if (private == NULL) goto out; rc = seq_open(f, ops); if (rc < 0) goto out_free; seq = f->private_data; seq->private = private; return private; out_free: kfree(private); out: return NULL; } EXPORT_SYMBOL(__seq_open_private); int seq_open_private(struct file *filp, const struct seq_operations *ops, int psize) { return __seq_open_private(filp, ops, psize) ? 0 : -ENOMEM; } EXPORT_SYMBOL(seq_open_private); void seq_putc(struct seq_file *m, char c) { if (m->count >= m->size) return; m->buf[m->count++] = c; } EXPORT_SYMBOL(seq_putc); void __seq_puts(struct seq_file *m, const char *s) { seq_write(m, s, strlen(s)); } EXPORT_SYMBOL(__seq_puts); /** * seq_put_decimal_ull_width - A helper routine for putting decimal numbers * without rich format of printf(). * only 'unsigned long long' is supported. * @m: seq_file identifying the buffer to which data should be written * @delimiter: a string which is printed before the number * @num: the number * @width: a minimum field width * * This routine will put strlen(delimiter) + number into seq_filed. * This routine is very quick when you show lots of numbers. * In usual cases, it will be better to use seq_printf(). It's easier to read. 
*/ void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter, unsigned long long num, unsigned int width) { int len; if (m->count + 2 >= m->size) /* we'll write 2 bytes at least */ goto overflow; if (delimiter && delimiter[0]) { if (delimiter[1] == 0) seq_putc(m, delimiter[0]); else seq_puts(m, delimiter); } if (!width) width = 1; if (m->count + width >= m->size) goto overflow; len = num_to_str(m->buf + m->count, m->size - m->count, num, width); if (!len) goto overflow; m->count += len; return; overflow: seq_set_overflow(m); } void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, unsigned long long num) { return seq_put_decimal_ull_width(m, delimiter, num, 0); } EXPORT_SYMBOL(seq_put_decimal_ull); /** * seq_put_hex_ll - put a number in hexadecimal notation * @m: seq_file identifying the buffer to which data should be written * @delimiter: a string which is printed before the number * @v: the number * @width: a minimum field width * * seq_put_hex_ll(m, "", v, 8) is equal to seq_printf(m, "%08llx", v) * * This routine is very quick when you show lots of numbers. * In usual cases, it will be better to use seq_printf(). It's easier to read. */ void seq_put_hex_ll(struct seq_file *m, const char *delimiter, unsigned long long v, unsigned int width) { unsigned int len; int i; if (delimiter && delimiter[0]) { if (delimiter[1] == 0) seq_putc(m, delimiter[0]); else seq_puts(m, delimiter); } /* If x is 0, the result of __builtin_clzll is undefined */ if (v == 0) len = 1; else len = (sizeof(v) * 8 - __builtin_clzll(v) + 3) / 4; if (len < width) len = width; if (m->count + len > m->size) { seq_set_overflow(m); return; } for (i = len - 1; i >= 0; i--) { m->buf[m->count + i] = hex_asc[0xf & v]; v = v >> 4; } m->count += len; } void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num) { int len; if (m->count + 3 >= m->size) /* we'll write 2 bytes at least */ goto overflow; if (delimiter && delimiter[0]) { if (delimiter[1] == 0) seq_putc(m, delimiter[0]); else seq_puts(m, delimiter); } if (m->count + 2 >= m->size) goto overflow; if (num < 0) { m->buf[m->count++] = '-'; num = -num; } if (num < 10) { m->buf[m->count++] = num + '0'; return; } len = num_to_str(m->buf + m->count, m->size - m->count, num, 0); if (!len) goto overflow; m->count += len; return; overflow: seq_set_overflow(m); } EXPORT_SYMBOL(seq_put_decimal_ll); /** * seq_write - write arbitrary data to buffer * @seq: seq_file identifying the buffer to which data should be written * @data: data address * @len: number of bytes * * Return 0 on success, non-zero otherwise. 
*/ int seq_write(struct seq_file *seq, const void *data, size_t len) { if (seq->count + len < seq->size) { memcpy(seq->buf + seq->count, data, len); seq->count += len; return 0; } seq_set_overflow(seq); return -1; } EXPORT_SYMBOL(seq_write); /** * seq_pad - write padding spaces to buffer * @m: seq_file identifying the buffer to which data should be written * @c: the byte to append after padding if non-zero */ void seq_pad(struct seq_file *m, char c) { int size = m->pad_until - m->count; if (size > 0) { if (size + m->count > m->size) { seq_set_overflow(m); return; } memset(m->buf + m->count, ' ', size); m->count += size; } if (c) seq_putc(m, c); } EXPORT_SYMBOL(seq_pad); /* A complete analogue of print_hex_dump() */ void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii) { const u8 *ptr = buf; int i, linelen, remaining = len; char *buffer; size_t size; int ret; if (rowsize != 16 && rowsize != 32) rowsize = 16; for (i = 0; i < len && !seq_has_overflowed(m); i += rowsize) { linelen = min(remaining, rowsize); remaining -= rowsize; switch (prefix_type) { case DUMP_PREFIX_ADDRESS: seq_printf(m, "%s%p: ", prefix_str, ptr + i); break; case DUMP_PREFIX_OFFSET: seq_printf(m, "%s%.8x: ", prefix_str, i); break; default: seq_printf(m, "%s", prefix_str); break; } size = seq_get_buf(m, &buffer); ret = hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize, buffer, size, ascii); seq_commit(m, ret < size ? ret : -1); seq_putc(m, '\n'); } } EXPORT_SYMBOL(seq_hex_dump); struct list_head *seq_list_start(struct list_head *head, loff_t pos) { struct list_head *lh; list_for_each(lh, head) if (pos-- == 0) return lh; return NULL; } EXPORT_SYMBOL(seq_list_start); struct list_head *seq_list_start_head(struct list_head *head, loff_t pos) { if (!pos) return head; return seq_list_start(head, pos - 1); } EXPORT_SYMBOL(seq_list_start_head); struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos) { struct list_head *lh; lh = ((struct list_head *)v)->next; ++*ppos; return lh == head ? NULL : lh; } EXPORT_SYMBOL(seq_list_next); struct list_head *seq_list_start_rcu(struct list_head *head, loff_t pos) { struct list_head *lh; list_for_each_rcu(lh, head) if (pos-- == 0) return lh; return NULL; } EXPORT_SYMBOL(seq_list_start_rcu); struct list_head *seq_list_start_head_rcu(struct list_head *head, loff_t pos) { if (!pos) return head; return seq_list_start_rcu(head, pos - 1); } EXPORT_SYMBOL(seq_list_start_head_rcu); struct list_head *seq_list_next_rcu(void *v, struct list_head *head, loff_t *ppos) { struct list_head *lh; lh = list_next_rcu((struct list_head *)v); ++*ppos; return lh == head ? NULL : lh; } EXPORT_SYMBOL(seq_list_next_rcu); /** * seq_hlist_start - start an iteration of a hlist * @head: the head of the hlist * @pos: the start position of the sequence * * Called at seq_file->op->start(). */ struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos) { struct hlist_node *node; hlist_for_each(node, head) if (pos-- == 0) return node; return NULL; } EXPORT_SYMBOL(seq_hlist_start); /** * seq_hlist_start_head - start an iteration of a hlist * @head: the head of the hlist * @pos: the start position of the sequence * * Called at seq_file->op->start(). Call this function if you want to * print a header at the top of the output. 
*/ struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos) { if (!pos) return SEQ_START_TOKEN; return seq_hlist_start(head, pos - 1); } EXPORT_SYMBOL(seq_hlist_start_head); /** * seq_hlist_next - move to the next position of the hlist * @v: the current iterator * @head: the head of the hlist * @ppos: the current position * * Called at seq_file->op->next(). */ struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head, loff_t *ppos) { struct hlist_node *node = v; ++*ppos; if (v == SEQ_START_TOKEN) return head->first; else return node->next; } EXPORT_SYMBOL(seq_hlist_next); /** * seq_hlist_start_rcu - start an iteration of a hlist protected by RCU * @head: the head of the hlist * @pos: the start position of the sequence * * Called at seq_file->op->start(). * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head, loff_t pos) { struct hlist_node *node; __hlist_for_each_rcu(node, head) if (pos-- == 0) return node; return NULL; } EXPORT_SYMBOL(seq_hlist_start_rcu); /** * seq_hlist_start_head_rcu - start an iteration of a hlist protected by RCU * @head: the head of the hlist * @pos: the start position of the sequence * * Called at seq_file->op->start(). Call this function if you want to * print a header at the top of the output. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head, loff_t pos) { if (!pos) return SEQ_START_TOKEN; return seq_hlist_start_rcu(head, pos - 1); } EXPORT_SYMBOL(seq_hlist_start_head_rcu); /** * seq_hlist_next_rcu - move to the next position of the hlist protected by RCU * @v: the current iterator * @head: the head of the hlist * @ppos: the current position * * Called at seq_file->op->next(). * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ struct hlist_node *seq_hlist_next_rcu(void *v, struct hlist_head *head, loff_t *ppos) { struct hlist_node *node = v; ++*ppos; if (v == SEQ_START_TOKEN) return rcu_dereference(head->first); else return rcu_dereference(node->next); } EXPORT_SYMBOL(seq_hlist_next_rcu); /** * seq_hlist_start_percpu - start an iteration of a percpu hlist array * @head: pointer to percpu array of struct hlist_heads * @cpu: pointer to cpu "cursor" * @pos: start position of sequence * * Called at seq_file->op->start(). */ struct hlist_node * seq_hlist_start_percpu(struct hlist_head __percpu *head, int *cpu, loff_t pos) { struct hlist_node *node; for_each_possible_cpu(*cpu) { hlist_for_each(node, per_cpu_ptr(head, *cpu)) { if (pos-- == 0) return node; } } return NULL; } EXPORT_SYMBOL(seq_hlist_start_percpu); /** * seq_hlist_next_percpu - move to the next position of the percpu hlist array * @v: pointer to current hlist_node * @head: pointer to percpu array of struct hlist_heads * @cpu: pointer to cpu "cursor" * @pos: start position of sequence * * Called at seq_file->op->next(). 
*/ struct hlist_node * seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head, int *cpu, loff_t *pos) { struct hlist_node *node = v; ++*pos; if (node->next) return node->next; for (*cpu = cpumask_next(*cpu, cpu_possible_mask); *cpu < nr_cpu_ids; *cpu = cpumask_next(*cpu, cpu_possible_mask)) { struct hlist_head *bucket = per_cpu_ptr(head, *cpu); if (!hlist_empty(bucket)) return bucket->first; } return NULL; } EXPORT_SYMBOL(seq_hlist_next_percpu); void __init seq_file_init(void) { seq_file_cache = KMEM_CACHE(seq_file, SLAB_ACCOUNT|SLAB_PANIC); } |
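/*
 * Illustrative sketch (not part of seq_file.c): a minimal consumer of the
 * seq_list_*() iterators and seq_printf() above. "struct demo_item",
 * demo_list, demo_lock and the demo_seq_* callbacks are made-up names for
 * the example; the seq_file calls and struct seq_operations wiring are the
 * real API.
 */
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>

struct demo_item {
	struct list_head node;
	int value;
};

static LIST_HEAD(demo_list);
static DEFINE_MUTEX(demo_lock);

static void *demo_seq_start(struct seq_file *m, loff_t *pos)
{
	mutex_lock(&demo_lock);
	return seq_list_start(&demo_list, *pos);
}

static void *demo_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
	return seq_list_next(v, &demo_list, pos);
}

static void demo_seq_stop(struct seq_file *m, void *v)
{
	mutex_unlock(&demo_lock);
}

static int demo_seq_show(struct seq_file *m, void *v)
{
	struct demo_item *item = list_entry(v, struct demo_item, node);

	seq_printf(m, "%d\n", item->value);
	return 0;
}

static const struct seq_operations demo_seq_ops = {
	.start	= demo_seq_start,
	.next	= demo_seq_next,
	.stop	= demo_seq_stop,
	.show	= demo_seq_show,
};

/*
 * A read of the resulting file (e.g. registered with
 * proc_create_seq("demo", 0444, NULL, &demo_seq_ops)) walks the list one
 * record at a time through seq_read_iter() above.
 */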
/* SPDX-License-Identifier: GPL-2.0 */ /* * Filesystem access notification for Linux * * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> */ #ifndef __LINUX_FSNOTIFY_BACKEND_H #define __LINUX_FSNOTIFY_BACKEND_H #ifdef __KERNEL__ #include <linux/idr.h> /* inotify uses this */ #include <linux/fs.h> /* struct inode */ #include <linux/list.h> #include <linux/path.h> /* struct path */ #include <linux/spinlock.h> #include <linux/types.h> #include <linux/atomic.h> #include <linux/user_namespace.h> #include <linux/refcount.h> #include <linux/mempool.h> #include <linux/sched/mm.h> /* * IN_* from inotify.h lines up EXACTLY with FS_*; this is so we can easily * convert between them. dnotify only needs conversion at watch creation * so no perf loss there. fanotify isn't defined yet, so it can use the * holes if it needs more events. */ #define FS_ACCESS 0x00000001 /* File was accessed */ #define FS_MODIFY 0x00000002 /* File was modified */ #define FS_ATTRIB 0x00000004 /* Metadata changed */ #define FS_CLOSE_WRITE 0x00000008 /* Writable file was closed */ #define FS_CLOSE_NOWRITE 0x00000010 /* Unwritable file closed */ #define FS_OPEN 0x00000020 /* File was opened */ #define FS_MOVED_FROM 0x00000040 /* File was moved from X */ #define FS_MOVED_TO 0x00000080 /* File was moved to Y */ #define FS_CREATE 0x00000100 /* Subfile was created */ #define FS_DELETE 0x00000200 /* Subfile was deleted */ #define FS_DELETE_SELF 0x00000400 /* Self was deleted */ #define FS_MOVE_SELF 0x00000800 /* Self was moved */ #define FS_OPEN_EXEC 0x00001000 /* File was opened for exec */ #define FS_UNMOUNT 0x00002000 /* inode on umount fs */ #define FS_Q_OVERFLOW 0x00004000 /* Event queued overflowed */ #define FS_ERROR 0x00008000 /* Filesystem Error (fanotify) */ /* * FS_IN_IGNORED overloads FS_ERROR. It is only used internally by inotify * which does not support FS_ERROR. */ #define FS_IN_IGNORED 0x00008000 /* last inotify event here */ #define FS_OPEN_PERM 0x00010000 /* open event in a permission hook */ #define FS_ACCESS_PERM 0x00020000 /* access event in a permissions hook */ #define FS_OPEN_EXEC_PERM 0x00040000 /* open/exec event in a permission hook */ /* #define FS_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */ #define FS_PRE_ACCESS 0x00100000 /* Pre-content access hook */ #define FS_MNT_ATTACH 0x01000000 /* Mount was attached */ #define FS_MNT_DETACH 0x02000000 /* Mount was detached */ #define FS_MNT_MOVE (FS_MNT_ATTACH | FS_MNT_DETACH) /* * Set on inode mark that cares about things that happen to its children. * Always set for dnotify and inotify. * Set on inode/sb/mount marks that care about parent/name info. */ #define FS_EVENT_ON_CHILD 0x08000000 #define FS_RENAME 0x10000000 /* File was renamed */ #define FS_DN_MULTISHOT 0x20000000 /* dnotify multishot */ #define FS_ISDIR 0x40000000 /* event occurred against dir */ #define FS_MOVE (FS_MOVED_FROM | FS_MOVED_TO) /* * Directory entry modification events - reported only to directory * where entry is modified and not to a watching parent. * The watching parent may get an FS_ATTRIB|FS_EVENT_ON_CHILD event * when a directory entry inside a child subdir changes.
*/ #define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME) /* Mount namespace events */ #define FSNOTIFY_MNT_EVENTS (FS_MNT_ATTACH | FS_MNT_DETACH) /* Content events can be used to inspect file content */ #define FSNOTIFY_CONTENT_PERM_EVENTS (FS_OPEN_PERM | FS_OPEN_EXEC_PERM | \ FS_ACCESS_PERM) /* Pre-content events can be used to fill file content */ #define FSNOTIFY_PRE_CONTENT_EVENTS (FS_PRE_ACCESS) #define ALL_FSNOTIFY_PERM_EVENTS (FSNOTIFY_CONTENT_PERM_EVENTS | \ FSNOTIFY_PRE_CONTENT_EVENTS) /* * This is a list of all events that may get sent to a parent that is watching * with flag FS_EVENT_ON_CHILD based on fs event on a child of that directory. */ #define FS_EVENTS_POSS_ON_CHILD (ALL_FSNOTIFY_PERM_EVENTS | \ FS_ACCESS | FS_MODIFY | FS_ATTRIB | \ FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | \ FS_OPEN | FS_OPEN_EXEC) /* * This is a list of all events that may get sent with the parent inode as the * @to_tell argument of fsnotify(). * It may include events that can be sent to an inode/sb/mount mark, but cannot * be sent to a parent watching children. */ #define FS_EVENTS_POSS_TO_PARENT (FS_EVENTS_POSS_ON_CHILD) /* Events that can be reported to backends */ #define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \ FSNOTIFY_MNT_EVENTS | \ FS_EVENTS_POSS_ON_CHILD | \ FS_DELETE_SELF | FS_MOVE_SELF | \ FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \ FS_ERROR) /* Extra flags that may be reported with event or control handling of events */ #define ALL_FSNOTIFY_FLAGS (FS_ISDIR | FS_EVENT_ON_CHILD | FS_DN_MULTISHOT) #define ALL_FSNOTIFY_BITS (ALL_FSNOTIFY_EVENTS | ALL_FSNOTIFY_FLAGS) struct fsnotify_group; struct fsnotify_event; struct fsnotify_mark; struct fsnotify_event_private_data; struct fsnotify_fname; struct fsnotify_iter_info; struct mem_cgroup; /* * Each group much define these ops. The fsnotify infrastructure will call * these operations for each relevant group. * * handle_event - main call for a group to handle an fs event * @group: group to notify * @mask: event type and flags * @data: object that event happened on * @data_type: type of object for fanotify_data_XXX() accessors * @dir: optional directory associated with event - * if @file_name is not NULL, this is the directory that * @file_name is relative to * @file_name: optional file name associated with event * @cookie: inotify rename cookie * @iter_info: array of marks from this group that are interested in the event * * handle_inode_event - simple variant of handle_event() for groups that only * have inode marks and don't have ignore mask * @mark: mark to notify * @mask: event type and flags * @inode: inode that event happened on * @dir: optional directory associated with event - * if @file_name is not NULL, this is the directory that * @file_name is relative to. * Either @inode or @dir must be non-NULL. * @file_name: optional file name associated with event * @cookie: inotify rename cookie * * free_group_priv - called when a group refcnt hits 0 to clean up the private union * freeing_mark - called when a mark is being destroyed for some reason. The group * MUST be holding a reference on each mark and that reference must be * dropped in this function. inotify uses this function to send * userspace messages that marks have been removed. 
*/ struct fsnotify_ops { int (*handle_event)(struct fsnotify_group *group, u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info); int (*handle_inode_event)(struct fsnotify_mark *mark, u32 mask, struct inode *inode, struct inode *dir, const struct qstr *file_name, u32 cookie); void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); void (*free_event)(struct fsnotify_group *group, struct fsnotify_event *event); /* called on final put+free to free memory */ void (*free_mark)(struct fsnotify_mark *mark); }; /* * all of the information about the original object we want to now send to * a group. If you want to carry more info from the accessing task to the * listener this structure is where you need to be adding fields. */ struct fsnotify_event { struct list_head list; }; /* * fsnotify group priorities. * Events are sent in order from highest priority to lowest priority. */ enum fsnotify_group_prio { FSNOTIFY_PRIO_NORMAL = 0, /* normal notifiers, no permissions */ FSNOTIFY_PRIO_CONTENT, /* fanotify permission events */ FSNOTIFY_PRIO_PRE_CONTENT, /* fanotify pre-content events */ __FSNOTIFY_PRIO_NUM }; /* * A group is a "thing" that wants to receive notification about filesystem * events. The mask holds the subset of event types this group cares about. * refcnt on a group is up to the implementor and at any moment if it goes 0 * everything will be cleaned up. */ struct fsnotify_group { const struct fsnotify_ops *ops; /* how this group handles things */ /* * How the refcnt is used is up to each group. When the refcnt hits 0 * fsnotify will clean up all of the resources associated with this group. * As an example, the dnotify group will always have a refcnt=1 and that * will never change. Inotify, on the other hand, has a group per * inotify_init() and the refcnt will hit 0 only when that fd has been * closed. 
*/ refcount_t refcnt; /* things with interest in this group */ /* needed to send notification to userspace */ spinlock_t notification_lock; /* protect the notification_list */ struct list_head notification_list; /* list of event_holder this group needs to send to userspace */ wait_queue_head_t notification_waitq; /* read() on the notification file blocks on this waitq */ unsigned int q_len; /* events on the queue */ unsigned int max_events; /* maximum events allowed on the list */ enum fsnotify_group_prio priority; /* priority for sending events */ bool shutdown; /* group is being shut down, don't queue more events */ #define FSNOTIFY_GROUP_USER 0x01 /* user allocated group */ #define FSNOTIFY_GROUP_DUPS 0x02 /* allow multiple marks per object */ int flags; unsigned int owner_flags; /* stored flags of mark_mutex owner */ /* stores all fastpath marks assoc with this group so they can be cleaned on unregister */ struct mutex mark_mutex; /* protect marks_list */ atomic_t user_waits; /* Number of tasks waiting for user * response */ struct list_head marks_list; /* all inode marks for this group */ struct fasync_struct *fsn_fa; /* async notification */ struct fsnotify_event *overflow_event; /* Event we queue when the * notification list is too * full */ struct mem_cgroup *memcg; /* memcg to charge allocations */ struct user_namespace *user_ns; /* user ns where group was created */ /* groups can define private fields here or use the void *private */ union { void *private; #ifdef CONFIG_INOTIFY_USER struct inotify_group_private_data { spinlock_t idr_lock; struct idr idr; struct ucounts *ucounts; } inotify_data; #endif #ifdef CONFIG_FANOTIFY struct fanotify_group_private_data { /* Hash table of events for merge */ struct hlist_head *merge_hash; /* allows a group to block waiting for a userspace response */ struct list_head access_list; wait_queue_head_t access_waitq; int flags; /* flags from fanotify_init() */ int f_flags; /* event_f_flags from fanotify_init() */ struct ucounts *ucounts; mempool_t error_events_pool; /* chained on perm_group_list */ struct list_head perm_grp_list; } fanotify_data; #endif /* CONFIG_FANOTIFY */ }; }; /* * These helpers are used to prevent deadlock when reclaiming inodes with * evictable marks of the same group that is allocating a new mark. 
*/ static inline void fsnotify_group_lock(struct fsnotify_group *group) { mutex_lock(&group->mark_mutex); group->owner_flags = memalloc_nofs_save(); } static inline void fsnotify_group_unlock(struct fsnotify_group *group) { memalloc_nofs_restore(group->owner_flags); mutex_unlock(&group->mark_mutex); } static inline void fsnotify_group_assert_locked(struct fsnotify_group *group) { WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex)); WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS)); } /* When calling fsnotify tell it if the data is a path or inode */ enum fsnotify_data_type { FSNOTIFY_EVENT_NONE, FSNOTIFY_EVENT_FILE_RANGE, FSNOTIFY_EVENT_PATH, FSNOTIFY_EVENT_INODE, FSNOTIFY_EVENT_DENTRY, FSNOTIFY_EVENT_MNT, FSNOTIFY_EVENT_ERROR, }; struct fs_error_report { int error; struct inode *inode; struct super_block *sb; }; struct file_range { const struct path *path; loff_t pos; size_t count; }; static inline const struct path *file_range_path(const struct file_range *range) { return range->path; } struct fsnotify_mnt { const struct mnt_namespace *ns; u64 mnt_id; }; static inline struct inode *fsnotify_data_inode(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_INODE: return (struct inode *)data; case FSNOTIFY_EVENT_DENTRY: return d_inode(data); case FSNOTIFY_EVENT_PATH: return d_inode(((const struct path *)data)->dentry); case FSNOTIFY_EVENT_FILE_RANGE: return d_inode(file_range_path(data)->dentry); case FSNOTIFY_EVENT_ERROR: return ((struct fs_error_report *)data)->inode; default: return NULL; } } static inline struct dentry *fsnotify_data_dentry(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_DENTRY: /* Non const is needed for dget() */ return (struct dentry *)data; case FSNOTIFY_EVENT_PATH: return ((const struct path *)data)->dentry; case FSNOTIFY_EVENT_FILE_RANGE: return file_range_path(data)->dentry; default: return NULL; } } static inline const struct path *fsnotify_data_path(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_PATH: return data; case FSNOTIFY_EVENT_FILE_RANGE: return file_range_path(data); default: return NULL; } } static inline struct super_block *fsnotify_data_sb(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_INODE: return ((struct inode *)data)->i_sb; case FSNOTIFY_EVENT_DENTRY: return ((struct dentry *)data)->d_sb; case FSNOTIFY_EVENT_PATH: return ((const struct path *)data)->dentry->d_sb; case FSNOTIFY_EVENT_FILE_RANGE: return file_range_path(data)->dentry->d_sb; case FSNOTIFY_EVENT_ERROR: return ((struct fs_error_report *) data)->sb; default: return NULL; } } static inline const struct fsnotify_mnt *fsnotify_data_mnt(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_MNT: return data; default: return NULL; } } static inline u64 fsnotify_data_mnt_id(const void *data, int data_type) { const struct fsnotify_mnt *mnt_data = fsnotify_data_mnt(data, data_type); return mnt_data ? mnt_data->mnt_id : 0; } static inline struct fs_error_report *fsnotify_data_error_report( const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_ERROR: return (struct fs_error_report *) data; default: return NULL; } } static inline const struct file_range *fsnotify_data_file_range( const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_FILE_RANGE: return (struct file_range *)data; default: return NULL; } } /* * Index to merged marks iterator array that correlates to a type of watch. 
* The type of watched object can be deduced from the iterator type, but not * the other way around, because an event can match different watched objects * of the same object type. * For example, both parent and child are watching an object of type inode. */ enum fsnotify_iter_type { FSNOTIFY_ITER_TYPE_INODE, FSNOTIFY_ITER_TYPE_VFSMOUNT, FSNOTIFY_ITER_TYPE_SB, FSNOTIFY_ITER_TYPE_PARENT, FSNOTIFY_ITER_TYPE_INODE2, FSNOTIFY_ITER_TYPE_MNTNS, FSNOTIFY_ITER_TYPE_COUNT }; /* The type of object that a mark is attached to */ enum fsnotify_obj_type { FSNOTIFY_OBJ_TYPE_ANY = -1, FSNOTIFY_OBJ_TYPE_INODE, FSNOTIFY_OBJ_TYPE_VFSMOUNT, FSNOTIFY_OBJ_TYPE_SB, FSNOTIFY_OBJ_TYPE_MNTNS, FSNOTIFY_OBJ_TYPE_COUNT, FSNOTIFY_OBJ_TYPE_DETACHED = FSNOTIFY_OBJ_TYPE_COUNT }; static inline bool fsnotify_valid_obj_type(unsigned int obj_type) { return (obj_type < FSNOTIFY_OBJ_TYPE_COUNT); } struct fsnotify_iter_info { struct fsnotify_mark *marks[FSNOTIFY_ITER_TYPE_COUNT]; struct fsnotify_group *current_group; unsigned int report_mask; int srcu_idx; }; static inline bool fsnotify_iter_should_report_type( struct fsnotify_iter_info *iter_info, int iter_type) { return (iter_info->report_mask & (1U << iter_type)); } static inline void fsnotify_iter_set_report_type( struct fsnotify_iter_info *iter_info, int iter_type) { iter_info->report_mask |= (1U << iter_type); } static inline struct fsnotify_mark *fsnotify_iter_mark( struct fsnotify_iter_info *iter_info, int iter_type) { if (fsnotify_iter_should_report_type(iter_info, iter_type)) return iter_info->marks[iter_type]; return NULL; } static inline int fsnotify_iter_step(struct fsnotify_iter_info *iter, int type, struct fsnotify_mark **markp) { while (type < FSNOTIFY_ITER_TYPE_COUNT) { *markp = fsnotify_iter_mark(iter, type); if (*markp) break; type++; } return type; } #define FSNOTIFY_ITER_FUNCS(name, NAME) \ static inline struct fsnotify_mark *fsnotify_iter_##name##_mark( \ struct fsnotify_iter_info *iter_info) \ { \ return fsnotify_iter_mark(iter_info, FSNOTIFY_ITER_TYPE_##NAME); \ } FSNOTIFY_ITER_FUNCS(inode, INODE) FSNOTIFY_ITER_FUNCS(parent, PARENT) FSNOTIFY_ITER_FUNCS(vfsmount, VFSMOUNT) FSNOTIFY_ITER_FUNCS(sb, SB) #define fsnotify_foreach_iter_type(type) \ for (type = 0; type < FSNOTIFY_ITER_TYPE_COUNT; type++) #define fsnotify_foreach_iter_mark_type(iter, mark, type) \ for (type = 0; \ type = fsnotify_iter_step(iter, type, &mark), \ type < FSNOTIFY_ITER_TYPE_COUNT; \ type++) /* * Inode/vfsmount/sb point to this structure which tracks all marks attached to * the inode/vfsmount/sb. The reference to inode/vfsmount/sb is held by this * structure. We destroy this structure when there are no more marks attached * to it. The structure is protected by fsnotify_mark_srcu. */ struct fsnotify_mark_connector { spinlock_t lock; unsigned char type; /* Type of object [lock] */ unsigned char prio; /* Highest priority group */ #define FSNOTIFY_CONN_FLAG_IS_WATCHED 0x01 #define FSNOTIFY_CONN_FLAG_HAS_IREF 0x02 unsigned short flags; /* flags [lock] */ union { /* Object pointer [lock] */ void *obj; /* Used listing heads to free after srcu period expires */ struct fsnotify_mark_connector *destroy_next; }; struct hlist_head list; /* List of marks */ }; /* * Container for per-sb fsnotify state (sb marks and more). * Attached lazily on first marked object on the sb and freed when killing sb. 
*/ struct fsnotify_sb_info { struct fsnotify_mark_connector __rcu *sb_marks; /* List of connectors for inode marks */ struct list_head inode_conn_list; spinlock_t list_lock; /* Lock protecting inode_conn_list */ /* * Number of inode/mount/sb objects that are being watched in this sb. * Note that inodes objects are currently double-accounted. * * The value in watched_objects[prio] is the number of objects that are * watched by groups of priority >= prio, so watched_objects[0] is the * total number of watched objects in this sb. */ atomic_long_t watched_objects[__FSNOTIFY_PRIO_NUM]; }; static inline struct fsnotify_sb_info *fsnotify_sb_info(struct super_block *sb) { #ifdef CONFIG_FSNOTIFY return READ_ONCE(sb->s_fsnotify_info); #else return NULL; #endif } static inline atomic_long_t *fsnotify_sb_watched_objects(struct super_block *sb) { return &fsnotify_sb_info(sb)->watched_objects[0]; } /* * A mark is simply an object attached to an in core inode which allows an * fsnotify listener to indicate they are either no longer interested in events * of a type matching mask or only interested in those events. * * These are flushed when an inode is evicted from core and may be flushed * when the inode is modified (as seen by fsnotify_access). Some fsnotify * users (such as dnotify) will flush these when the open fd is closed and not * at inode eviction or modification. * * Text in brackets is showing the lock(s) protecting modifications of a * particular entry. obj_lock means either inode->i_lock or * mnt->mnt_root->d_lock depending on the mark type. */ struct fsnotify_mark { /* Mask this mark is for [mark->lock, group->mark_mutex] */ __u32 mask; /* We hold one for presence in g_list. Also one ref for each 'thing' * in kernel that found and may be using this mark. */ refcount_t refcnt; /* Group this mark is for. Set on mark creation, stable until last ref * is dropped */ struct fsnotify_group *group; /* List of marks by group->marks_list. Also reused for queueing * mark into destroy_list when it's waiting for the end of SRCU period * before it can be freed. 
[group->mark_mutex] */ struct list_head g_list; /* Protects inode / mnt pointers, flags, masks */ spinlock_t lock; /* List of marks for inode / vfsmount [connector->lock, mark ref] */ struct hlist_node obj_list; /* Head of list of marks for an object [mark ref] */ struct fsnotify_mark_connector *connector; /* Events types and flags to ignore [mark->lock, group->mark_mutex] */ __u32 ignore_mask; /* General fsnotify mark flags */ #define FSNOTIFY_MARK_FLAG_ALIVE 0x0001 #define FSNOTIFY_MARK_FLAG_ATTACHED 0x0002 /* inotify mark flags */ #define FSNOTIFY_MARK_FLAG_EXCL_UNLINK 0x0010 #define FSNOTIFY_MARK_FLAG_IN_ONESHOT 0x0020 /* fanotify mark flags */ #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x0100 #define FSNOTIFY_MARK_FLAG_NO_IREF 0x0200 #define FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS 0x0400 #define FSNOTIFY_MARK_FLAG_HAS_FSID 0x0800 #define FSNOTIFY_MARK_FLAG_WEAK_FSID 0x1000 unsigned int flags; /* flags [mark->lock] */ }; #ifdef CONFIG_FSNOTIFY /* called from the vfs helpers */ /* main fsnotify call to send events */ extern int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, struct inode *inode, u32 cookie); extern int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type); extern void __fsnotify_inode_delete(struct inode *inode); extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt); extern void fsnotify_sb_delete(struct super_block *sb); extern void __fsnotify_mntns_delete(struct mnt_namespace *mntns); extern void fsnotify_sb_free(struct super_block *sb); extern u32 fsnotify_get_cookie(void); extern void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt); static inline __u32 fsnotify_parent_needed_mask(__u32 mask) { /* FS_EVENT_ON_CHILD is set on marks that want parent/name info */ if (!(mask & FS_EVENT_ON_CHILD)) return 0; /* * This object might be watched by a mark that cares about parent/name * info, does it care about the specific set of events that can be * reported with parent/name info? */ return mask & FS_EVENTS_POSS_TO_PARENT; } static inline int fsnotify_inode_watches_children(struct inode *inode) { __u32 parent_mask = READ_ONCE(inode->i_fsnotify_mask); /* FS_EVENT_ON_CHILD is set if the inode may care */ if (!(parent_mask & FS_EVENT_ON_CHILD)) return 0; /* this inode might care about child events, does it care about the * specific set of events that can happen on a child? */ return parent_mask & FS_EVENTS_POSS_ON_CHILD; } /* * Update the dentry with a flag indicating the interest of its parent to receive * filesystem events when those events happens to this dentry->d_inode. */ static inline void fsnotify_update_flags(struct dentry *dentry) { assert_spin_locked(&dentry->d_lock); /* * Serialisation of setting PARENT_WATCHED on the dentries is provided * by d_lock. If inotify_inode_watched changes after we have taken * d_lock, the following fsnotify_set_children_dentry_flags call will * find our entry, so it will spin until we complete here, and update * us with the new state. 
*/ if (fsnotify_inode_watches_children(dentry->d_parent->d_inode)) dentry->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; else dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; } /* called from fsnotify listeners, such as fanotify or dnotify */ /* create a new group */ extern struct fsnotify_group *fsnotify_alloc_group( const struct fsnotify_ops *ops, int flags); /* get reference to a group */ extern void fsnotify_get_group(struct fsnotify_group *group); /* drop reference on a group from fsnotify_alloc_group */ extern void fsnotify_put_group(struct fsnotify_group *group); /* group destruction begins, stop queuing new events */ extern void fsnotify_group_stop_queueing(struct fsnotify_group *group); /* destroy group */ extern void fsnotify_destroy_group(struct fsnotify_group *group); /* fasync handler function */ extern int fsnotify_fasync(int fd, struct file *file, int on); /* Free event from memory */ extern void fsnotify_destroy_event(struct fsnotify_group *group, struct fsnotify_event *event); /* attach the event to the group notification queue */ extern int fsnotify_insert_event(struct fsnotify_group *group, struct fsnotify_event *event, int (*merge)(struct fsnotify_group *, struct fsnotify_event *), void (*insert)(struct fsnotify_group *, struct fsnotify_event *)); static inline int fsnotify_add_event(struct fsnotify_group *group, struct fsnotify_event *event, int (*merge)(struct fsnotify_group *, struct fsnotify_event *)) { return fsnotify_insert_event(group, event, merge, NULL); } /* Queue overflow event to a notification group */ static inline void fsnotify_queue_overflow(struct fsnotify_group *group) { fsnotify_add_event(group, group->overflow_event, NULL); } static inline bool fsnotify_is_overflow_event(u32 mask) { return mask & FS_Q_OVERFLOW; } static inline bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) { assert_spin_locked(&group->notification_lock); return list_empty(&group->notification_list); } extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group); /* return, but do not dequeue the first event on the notification queue */ extern struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group); /* return AND dequeue the first event on the notification queue */ extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group); /* Remove event queued in the notification list */ extern void fsnotify_remove_queued_event(struct fsnotify_group *group, struct fsnotify_event *event); /* functions used to manipulate the marks attached to inodes */ /* * Canonical "ignore mask" including event flags. * * Note the subtle semantic difference from the legacy ->ignored_mask. * ->ignored_mask traditionally only meant which events should be ignored, * while ->ignore_mask also includes flags regarding the type of objects on * which events should be ignored. 
*/ static inline __u32 fsnotify_ignore_mask(struct fsnotify_mark *mark) { __u32 ignore_mask = mark->ignore_mask; /* The event flags in ignore mask take effect */ if (mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS) return ignore_mask; /* * Legacy behavior: * - Always ignore events on dir * - Ignore events on child if parent is watching children */ ignore_mask |= FS_ISDIR; ignore_mask &= ~FS_EVENT_ON_CHILD; ignore_mask |= mark->mask & FS_EVENT_ON_CHILD; return ignore_mask; } /* Legacy ignored_mask - only event types to ignore */ static inline __u32 fsnotify_ignored_events(struct fsnotify_mark *mark) { return mark->ignore_mask & ALL_FSNOTIFY_EVENTS; } /* * Check if mask (or ignore mask) should be applied depending if victim is a * directory and whether it is reported to a watching parent. */ static inline bool fsnotify_mask_applicable(__u32 mask, bool is_dir, int iter_type) { /* Should mask be applied to a directory? */ if (is_dir && !(mask & FS_ISDIR)) return false; /* Should mask be applied to a child? */ if (iter_type == FSNOTIFY_ITER_TYPE_PARENT && !(mask & FS_EVENT_ON_CHILD)) return false; return true; } /* * Effective ignore mask taking into account if event victim is a * directory and whether it is reported to a watching parent. */ static inline __u32 fsnotify_effective_ignore_mask(struct fsnotify_mark *mark, bool is_dir, int iter_type) { __u32 ignore_mask = fsnotify_ignored_events(mark); if (!ignore_mask) return 0; /* For non-dir and non-child, no need to consult the event flags */ if (!is_dir && iter_type != FSNOTIFY_ITER_TYPE_PARENT) return ignore_mask; ignore_mask = fsnotify_ignore_mask(mark); if (!fsnotify_mask_applicable(ignore_mask, is_dir, iter_type)) return 0; return ignore_mask & ALL_FSNOTIFY_EVENTS; } /* Get mask for calculating object interest taking ignore mask into account */ static inline __u32 fsnotify_calc_mask(struct fsnotify_mark *mark) { __u32 mask = mark->mask; if (!fsnotify_ignored_events(mark)) return mask; /* Interest in FS_MODIFY may be needed for clearing ignore mask */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) mask |= FS_MODIFY; /* * If mark is interested in ignoring events on children, the object must * show interest in those events for fsnotify_parent() to notice it. 
*/ return mask | mark->ignore_mask; } /* Get mask of events for a list of marks */ extern __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn); /* Calculate mask of events for a list of marks */ extern void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn); extern void fsnotify_init_mark(struct fsnotify_mark *mark, struct fsnotify_group *group); /* Find mark belonging to given group in the list of marks */ struct fsnotify_mark *fsnotify_find_mark(void *obj, unsigned int obj_type, struct fsnotify_group *group); /* attach the mark to the object */ int fsnotify_add_mark(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags); int fsnotify_add_mark_locked(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags); /* attach the mark to the inode */ static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark, struct inode *inode, int add_flags) { return fsnotify_add_mark(mark, inode, FSNOTIFY_OBJ_TYPE_INODE, add_flags); } static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark, struct inode *inode, int add_flags) { return fsnotify_add_mark_locked(mark, inode, FSNOTIFY_OBJ_TYPE_INODE, add_flags); } static inline struct fsnotify_mark *fsnotify_find_inode_mark( struct inode *inode, struct fsnotify_group *group) { return fsnotify_find_mark(inode, FSNOTIFY_OBJ_TYPE_INODE, group); } /* given a group and a mark, flag mark to be freed when all references are dropped */ extern void fsnotify_destroy_mark(struct fsnotify_mark *mark, struct fsnotify_group *group); /* detach mark from inode / mount list, group list, drop inode reference */ extern void fsnotify_detach_mark(struct fsnotify_mark *mark); /* free mark */ extern void fsnotify_free_mark(struct fsnotify_mark *mark); /* Wait until all marks queued for destruction are destroyed */ extern void fsnotify_wait_marks_destroyed(void); /* Clear all of the marks of a group attached to a given object type */ extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int obj_type); extern void fsnotify_get_mark(struct fsnotify_mark *mark); extern void fsnotify_put_mark(struct fsnotify_mark *mark); extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info); extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info); static inline void fsnotify_init_event(struct fsnotify_event *event) { INIT_LIST_HEAD(&event->list); } int fsnotify_pre_content(const struct path *path, const loff_t *ppos, size_t count); #else static inline int fsnotify_pre_content(const struct path *path, const loff_t *ppos, size_t count) { return 0; } static inline int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, struct inode *inode, u32 cookie) { return 0; } static inline int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type) { return 0; } static inline void __fsnotify_inode_delete(struct inode *inode) {} static inline void __fsnotify_vfsmount_delete(struct vfsmount *mnt) {} static inline void fsnotify_sb_delete(struct super_block *sb) {} static inline void __fsnotify_mntns_delete(struct mnt_namespace *mntns) {} static inline void fsnotify_sb_free(struct super_block *sb) {} static inline void fsnotify_update_flags(struct dentry *dentry) {} static inline u32 fsnotify_get_cookie(void) { return 0; } static inline void fsnotify_unmount_inodes(struct super_block *sb) {} static inline void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt) {} #endif /* 
CONFIG_FSNOTIFY */

#endif /* __KERNEL__ */

#endif /* __LINUX_FSNOTIFY_BACKEND_H */
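The fsnotify_ops documentation above lists what a backend must supply; the following is a minimal, hedged sketch of a listener built only from the declarations in this header. The example_* names are hypothetical, error handling is trimmed, and the reference handling follows the pattern the mark comments describe rather than any one in-tree backend.

/*
 * Hypothetical backend: watch one inode for create/delete events.
 * All API calls below are declared in fsnotify_backend.h.
 */
#include <linux/fsnotify_backend.h>
#include <linux/slab.h>

static int example_handle_inode_event(struct fsnotify_mark *mark, u32 mask,
				      struct inode *inode, struct inode *dir,
				      const struct qstr *file_name, u32 cookie)
{
	pr_info("fsnotify: mask 0x%x, name %.*s\n", mask,
		file_name ? file_name->len : 0,
		file_name ? (const char *)file_name->name : "");
	return 0;
}

static void example_free_mark(struct fsnotify_mark *mark)
{
	kfree(mark);
}

static const struct fsnotify_ops example_fsnotify_ops = {
	.handle_inode_event	= example_handle_inode_event,
	.free_mark		= example_free_mark,
};

static struct fsnotify_group *example_group;

static int example_watch_inode(struct inode *inode)
{
	struct fsnotify_mark *mark;
	int ret;

	example_group = fsnotify_alloc_group(&example_fsnotify_ops, 0);
	if (IS_ERR(example_group))
		return PTR_ERR(example_group);

	mark = kzalloc(sizeof(*mark), GFP_KERNEL);
	if (!mark) {
		fsnotify_destroy_group(example_group);
		return -ENOMEM;
	}

	fsnotify_init_mark(mark, example_group);
	mark->mask = FS_CREATE | FS_DELETE | FS_EVENT_ON_CHILD;

	ret = fsnotify_add_inode_mark(mark, inode, 0);
	if (ret)
		fsnotify_destroy_group(example_group);

	/* Drop the creation reference; the object list keeps the mark alive. */
	fsnotify_put_mark(mark);
	return ret;
}

Teardown would call fsnotify_destroy_group(example_group), which detaches any remaining marks and drops the allocation reference.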
| 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SHRINKER_H #define _LINUX_SHRINKER_H #include <linux/atomic.h> #include <linux/types.h> #include <linux/refcount.h> #include <linux/completion.h> #define SHRINKER_UNIT_BITS BITS_PER_LONG /* * Bitmap and deferred work of shrinker::id corresponding to memcg-aware * shrinkers, which have elements charged to the memcg. */ struct shrinker_info_unit { atomic_long_t nr_deferred[SHRINKER_UNIT_BITS]; DECLARE_BITMAP(map, SHRINKER_UNIT_BITS); }; struct shrinker_info { struct rcu_head rcu; int map_nr_max; struct shrinker_info_unit *unit[]; }; /* * This struct is used to pass information from page reclaim to the shrinkers. * We consolidate the values for easier extension later. * * The 'gfpmask' refers to the allocation we are currently trying to * fulfil. */ struct shrink_control { gfp_t gfp_mask; /* current node being shrunk (for NUMA aware shrinkers) */ int nid; /* * How many objects scan_objects should scan and try to reclaim. * This is reset before every call, so it is safe for callees * to modify. */ unsigned long nr_to_scan; /* * How many objects did scan_objects process? * This defaults to nr_to_scan before every call, but the callee * should track its actual progress. */ unsigned long nr_scanned; /* current memcg being shrunk (for memcg aware shrinkers) */ struct mem_cgroup *memcg; }; #define SHRINK_STOP (~0UL) #define SHRINK_EMPTY (~0UL - 1) /* * A callback you can register to apply pressure to ageable caches. * * @count_objects should return the number of freeable items in the cache. If * there are no objects to free, it should return SHRINK_EMPTY, while 0 is * returned in cases of the number of freeable items cannot be determined * or shrinker should skip this cache for this time (e.g., their number * is below shrinkable limit). No deadlock checks should be done during the * count callback - the shrinker relies on aggregating scan counts that couldn't * be executed due to potential deadlocks to be run at a later call when the * deadlock condition is no longer pending. * * @scan_objects will only be called if @count_objects returned a non-zero * value for the number of freeable objects. The callout should scan the cache * and attempt to free items from the cache. It should then return the number * of objects freed during the scan, or SHRINK_STOP if progress cannot be made * due to potential deadlocks. If SHRINK_STOP is returned, then no further * attempts to call the @scan_objects will be made from the current reclaim * context. * * @flags determine the shrinker abilities, like numa awareness */ struct shrinker { unsigned long (*count_objects)(struct shrinker *, struct shrink_control *sc); unsigned long (*scan_objects)(struct shrinker *, struct shrink_control *sc); long batch; /* reclaim batch size, 0 = default */ int seeks; /* seeks to recreate an obj */ unsigned flags; /* * The reference count of this shrinker. 
Registered shrinker have an * initial refcount of 1, then the lookup operations are now allowed * to use it via shrinker_try_get(). Later in the unregistration step, * the initial refcount will be discarded, and will free the shrinker * asynchronously via RCU after its refcount reaches 0. */ refcount_t refcount; struct completion done; /* use to wait for refcount to reach 0 */ struct rcu_head rcu; void *private_data; /* These are for internal use */ struct list_head list; #ifdef CONFIG_MEMCG /* ID in shrinker_idr */ int id; #endif #ifdef CONFIG_SHRINKER_DEBUG int debugfs_id; const char *name; struct dentry *debugfs_entry; #endif /* objs pending delete, per node */ atomic_long_t *nr_deferred; }; #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ /* Internal flags */ #define SHRINKER_REGISTERED BIT(0) #define SHRINKER_ALLOCATED BIT(1) /* Flags for users to use */ #define SHRINKER_NUMA_AWARE BIT(2) #define SHRINKER_MEMCG_AWARE BIT(3) /* * It just makes sense when the shrinker is also MEMCG_AWARE for now, * non-MEMCG_AWARE shrinker should not have this flag set. */ #define SHRINKER_NONSLAB BIT(4) __printf(2, 3) struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...); void shrinker_register(struct shrinker *shrinker); void shrinker_free(struct shrinker *shrinker); static inline bool shrinker_try_get(struct shrinker *shrinker) { return refcount_inc_not_zero(&shrinker->refcount); } static inline void shrinker_put(struct shrinker *shrinker) { if (refcount_dec_and_test(&shrinker->refcount)) complete(&shrinker->done); } #ifdef CONFIG_SHRINKER_DEBUG extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...); #else /* CONFIG_SHRINKER_DEBUG */ static inline __printf(2, 3) int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) { return 0; } #endif /* CONFIG_SHRINKER_DEBUG */ #endif /* _LINUX_SHRINKER_H */ |
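The count_objects/scan_objects contract described above is easiest to see in a small registration sketch. Everything used here (shrinker_alloc(), shrinker_register(), SHRINK_EMPTY, SHRINK_STOP, DEFAULT_SEEKS, the shrink_control fields) is declared in this header; the example_* cache is hypothetical and the "eviction" is a stand-in counter decrement.

#include <linux/shrinker.h>
#include <linux/atomic.h>

struct example_cache {
	atomic_long_t nr_objects;
};

static unsigned long example_count(struct shrinker *shrinker,
				   struct shrink_control *sc)
{
	struct example_cache *cache = shrinker->private_data;
	unsigned long nr = atomic_long_read(&cache->nr_objects);

	/* Report "nothing to do" explicitly instead of returning 0. */
	return nr ? nr : SHRINK_EMPTY;
}

static unsigned long example_scan(struct shrinker *shrinker,
				  struct shrink_control *sc)
{
	struct example_cache *cache = shrinker->private_data;
	unsigned long freed = 0;

	while (freed < sc->nr_to_scan && atomic_long_read(&cache->nr_objects)) {
		atomic_long_dec(&cache->nr_objects);	/* stand-in for real eviction */
		freed++;
	}
	sc->nr_scanned = freed;

	/* SHRINK_STOP tells reclaim not to call us again in this context. */
	return freed ? freed : SHRINK_STOP;
}

static int example_cache_shrinker_init(struct example_cache *cache)
{
	struct shrinker *shrinker = shrinker_alloc(0, "example-cache");

	if (!shrinker)
		return -ENOMEM;

	shrinker->count_objects = example_count;
	shrinker->scan_objects = example_scan;
	shrinker->seeks = DEFAULT_SEEKS;
	shrinker->private_data = cache;

	/* Visible to reclaim from here on; paired with shrinker_free(). */
	shrinker_register(shrinker);
	return 0;
}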
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/proc/root.c * * Copyright (C) 1991, 1992 Linus Torvalds * * proc root directory handling functions */ #include <linux/errno.h> #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/sched/stat.h> #include <linux/module.h> #include <linux/bitops.h> #include <linux/user_namespace.h> #include <linux/fs_context.h> #include <linux/mount.h> #include <linux/pid_namespace.h> #include <linux/fs_parser.h> #include <linux/cred.h> #include <linux/magic.h> #include <linux/slab.h> #include "internal.h" struct proc_fs_context { struct pid_namespace *pid_ns; unsigned int mask; enum proc_hidepid hidepid; int gid; enum proc_pidonly pidonly; }; enum proc_param { Opt_gid, Opt_hidepid, Opt_subset, Opt_pidns, }; static const struct fs_parameter_spec proc_fs_parameters[] = { fsparam_u32("gid", Opt_gid), fsparam_string("hidepid", Opt_hidepid), fsparam_string("subset", Opt_subset), fsparam_file_or_string("pidns", Opt_pidns), {} }; static inline int valid_hidepid(unsigned int value) { return (value == HIDEPID_OFF || value == HIDEPID_NO_ACCESS || value == HIDEPID_INVISIBLE || value == HIDEPID_NOT_PTRACEABLE); } static int proc_parse_hidepid_param(struct fs_context *fc, struct fs_parameter *param) { struct proc_fs_context *ctx = fc->fs_private; struct fs_parameter_spec hidepid_u32_spec = fsparam_u32("hidepid", Opt_hidepid); struct fs_parse_result result; int base = (unsigned long)hidepid_u32_spec.data; if (param->type != fs_value_is_string) return invalf(fc, "proc: unexpected type of hidepid value\n"); if (!kstrtouint(param->string, base, &result.uint_32)) { if (!valid_hidepid(result.uint_32)) return invalf(fc, "proc: unknown value of hidepid - %s\n", param->string); ctx->hidepid = 
result.uint_32; return 0; } if (!strcmp(param->string, "off")) ctx->hidepid = HIDEPID_OFF; else if (!strcmp(param->string, "noaccess")) ctx->hidepid = HIDEPID_NO_ACCESS; else if (!strcmp(param->string, "invisible")) ctx->hidepid = HIDEPID_INVISIBLE; else if (!strcmp(param->string, "ptraceable")) ctx->hidepid = HIDEPID_NOT_PTRACEABLE; else return invalf(fc, "proc: unknown value of hidepid - %s\n", param->string); return 0; } static int proc_parse_subset_param(struct fs_context *fc, char *value) { struct proc_fs_context *ctx = fc->fs_private; while (value) { char *ptr = strchr(value, ','); if (ptr != NULL) *ptr++ = '\0'; if (*value != '\0') { if (!strcmp(value, "pid")) { ctx->pidonly = PROC_PIDONLY_ON; } else { return invalf(fc, "proc: unsupported subset option - %s\n", value); } } value = ptr; } return 0; } #ifdef CONFIG_PID_NS static int proc_parse_pidns_param(struct fs_context *fc, struct fs_parameter *param, struct fs_parse_result *result) { struct proc_fs_context *ctx = fc->fs_private; struct pid_namespace *target, *active = task_active_pid_ns(current); struct ns_common *ns; struct file *ns_filp __free(fput) = NULL; switch (param->type) { case fs_value_is_file: /* came through fsconfig, steal the file reference */ ns_filp = no_free_ptr(param->file); break; case fs_value_is_string: ns_filp = filp_open(param->string, O_RDONLY, 0); break; default: WARN_ON_ONCE(true); break; } if (!ns_filp) ns_filp = ERR_PTR(-EBADF); if (IS_ERR(ns_filp)) { errorfc(fc, "could not get file from pidns argument"); return PTR_ERR(ns_filp); } if (!proc_ns_file(ns_filp)) return invalfc(fc, "pidns argument is not an nsfs file"); ns = get_proc_ns(file_inode(ns_filp)); if (ns->ns_type != CLONE_NEWPID) return invalfc(fc, "pidns argument is not a pidns file"); target = container_of(ns, struct pid_namespace, ns); /* * pidns= is shorthand for joining the pidns to get a fsopen fd, so the * permission model should be the same as pidns_install(). */ if (!ns_capable(target->user_ns, CAP_SYS_ADMIN)) { errorfc(fc, "insufficient permissions to set pidns"); return -EPERM; } if (!pidns_is_ancestor(target, active)) return invalfc(fc, "cannot set pidns to non-descendant pidns"); put_pid_ns(ctx->pid_ns); ctx->pid_ns = get_pid_ns(target); put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(ctx->pid_ns->user_ns); return 0; } #endif /* CONFIG_PID_NS */ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct proc_fs_context *ctx = fc->fs_private; struct fs_parse_result result; int opt, err; opt = fs_parse(fc, proc_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_gid: ctx->gid = result.uint_32; break; case Opt_hidepid: err = proc_parse_hidepid_param(fc, param); if (err) return err; break; case Opt_subset: err = proc_parse_subset_param(fc, param->string); if (err) return err; break; case Opt_pidns: #ifdef CONFIG_PID_NS /* * We would have to RCU-protect every proc_pid_ns() or * proc_sb_info() access if we allowed this to be reconfigured * for an existing procfs instance. Luckily, procfs instances * are cheap to create, and mount-beneath would let you * atomically replace an instance even with overmounts. 
*/ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { errorfc(fc, "cannot reconfigure pidns for existing procfs"); return -EBUSY; } err = proc_parse_pidns_param(fc, param, &result); if (err) return err; break; #else errorfc(fc, "pidns mount flag not supported on this system"); return -EOPNOTSUPP; #endif default: return -EINVAL; } ctx->mask |= 1 << opt; return 0; } static void proc_apply_options(struct proc_fs_info *fs_info, struct fs_context *fc, struct user_namespace *user_ns) { struct proc_fs_context *ctx = fc->fs_private; if (ctx->mask & (1 << Opt_gid)) fs_info->pid_gid = make_kgid(user_ns, ctx->gid); if (ctx->mask & (1 << Opt_hidepid)) fs_info->hide_pid = ctx->hidepid; if (ctx->mask & (1 << Opt_subset)) fs_info->pidonly = ctx->pidonly; if (ctx->mask & (1 << Opt_pidns) && !WARN_ON_ONCE(fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)) { put_pid_ns(fs_info->pid_ns); fs_info->pid_ns = get_pid_ns(ctx->pid_ns); } } static int proc_fill_super(struct super_block *s, struct fs_context *fc) { struct proc_fs_context *ctx = fc->fs_private; struct inode *root_inode; struct proc_fs_info *fs_info; int ret; fs_info = kzalloc_obj(*fs_info); if (!fs_info) return -ENOMEM; fs_info->pid_ns = get_pid_ns(ctx->pid_ns); proc_apply_options(fs_info, fc, current_user_ns()); /* User space would break if executables or devices appear on proc */ s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = PROC_SUPER_MAGIC; s->s_op = &proc_sops; s->s_time_gran = 1; s->s_fs_info = fs_info; /* * procfs isn't actually a stacking filesystem; however, there is * too much magic going on inside it to permit stacking things on * top of it */ s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; /* procfs dentries and inodes don't require IO to create */ s->s_shrink->seeks = 0; pde_get(&proc_root); root_inode = proc_get_inode(s, &proc_root); if (!root_inode) { pr_err("proc_fill_super: get root inode failed\n"); return -ENOMEM; } s->s_root = d_make_root(root_inode); if (!s->s_root) { pr_err("proc_fill_super: allocate dentry failed\n"); return -ENOMEM; } ret = proc_setup_self(s); if (ret) { return ret; } return proc_setup_thread_self(s); } static int proc_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct proc_fs_info *fs_info = proc_sb_info(sb); sync_filesystem(sb); proc_apply_options(fs_info, fc, current_user_ns()); return 0; } static int proc_get_tree(struct fs_context *fc) { return get_tree_nodev(fc, proc_fill_super); } static void proc_fs_context_free(struct fs_context *fc) { struct proc_fs_context *ctx = fc->fs_private; put_pid_ns(ctx->pid_ns); kfree(ctx); } static const struct fs_context_operations proc_fs_context_ops = { .free = proc_fs_context_free, .parse_param = proc_parse_param, .get_tree = proc_get_tree, .reconfigure = proc_reconfigure, }; static int proc_init_fs_context(struct fs_context *fc) { struct proc_fs_context *ctx; ctx = kzalloc_obj(struct proc_fs_context); if (!ctx) return -ENOMEM; ctx->pid_ns = get_pid_ns(task_active_pid_ns(current)); put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(ctx->pid_ns->user_ns); fc->fs_private = ctx; fc->ops = &proc_fs_context_ops; return 0; } static void proc_kill_sb(struct super_block *sb) { struct proc_fs_info *fs_info = proc_sb_info(sb); kill_anon_super(sb); if (fs_info) { put_pid_ns(fs_info->pid_ns); kfree_rcu(fs_info, rcu); } } static struct file_system_type proc_fs_type = { .name = "proc", .init_fs_context = proc_init_fs_context, 
.parameters = proc_fs_parameters, .kill_sb = proc_kill_sb, .fs_flags = FS_USERNS_MOUNT | FS_DISALLOW_NOTIFY_PERM, }; void __init proc_root_init(void) { proc_init_kmemcache(); set_proc_pid_nlink(); proc_self_init(); proc_thread_self_init(); proc_symlink("mounts", NULL, "self/mounts"); proc_net_init(); proc_mkdir("fs", NULL); proc_mkdir("driver", NULL); proc_create_mount_point("fs/nfsd"); /* somewhere for the nfsd filesystem to be mounted */ #if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE) /* just give it a mountpoint */ proc_create_mount_point("openprom"); #endif proc_tty_init(); proc_mkdir("bus", NULL); proc_sys_init(); /* * Last things last. It is not like userspace processes eager * to open /proc files exist at this point but register last * anyway. */ register_filesystem(&proc_fs_type); } static int proc_root_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry), stat); stat->nlink = proc_root.nlink + nr_processes(); return 0; } static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) { if (!proc_pid_lookup(dentry, flags)) return NULL; return proc_lookup(dir, dentry, flags); } static int proc_root_readdir(struct file *file, struct dir_context *ctx) { if (ctx->pos < FIRST_PROCESS_ENTRY) { int error = proc_readdir(file, ctx); if (unlikely(error <= 0)) return error; ctx->pos = FIRST_PROCESS_ENTRY; } return proc_pid_readdir(file, ctx); } /* * The root /proc directory is special, as it has the * <pid> directories. Thus we don't use the generic * directory handling functions for that.. */ static const struct file_operations proc_root_operations = { .read = generic_read_dir, .iterate_shared = proc_root_readdir, .llseek = generic_file_llseek, }; /* * proc root can do almost nothing.. */ static const struct inode_operations proc_root_inode_operations = { .lookup = proc_root_lookup, .getattr = proc_root_getattr, }; /* * This is the root "inode" in the /proc tree.. */ struct proc_dir_entry proc_root = { .low_ino = PROCFS_ROOT_INO, .namelen = 5, .mode = S_IFDIR | S_IRUGO | S_IXUGO, .nlink = 2, .refcnt = REFCOUNT_INIT(1), .proc_iops = &proc_root_inode_operations, .proc_dir_ops = &proc_root_operations, .parent = &proc_root, .subdir = RB_ROOT, .name = "/proc", }; |
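The gid/hidepid/subset/pidns strings parsed by proc_parse_param() arrive through the new mount API from userspace. A hedged userspace sketch of mounting a restricted procfs instance follows; it assumes a libc that wraps fsopen()/fsconfig()/fsmount()/move_mount() (glibc 2.36 or later), and mount_restricted_proc() is a hypothetical helper with error handling trimmed.

#include <sys/mount.h>
#include <fcntl.h>
#include <unistd.h>

int mount_restricted_proc(const char *target)
{
	int fsfd, mntfd;

	fsfd = fsopen("proc", FSOPEN_CLOEXEC);
	if (fsfd < 0)
		return -1;

	/* Each string ends up in proc_parse_param() via the fs_context. */
	fsconfig(fsfd, FSCONFIG_SET_STRING, "hidepid", "invisible", 0);
	fsconfig(fsfd, FSCONFIG_SET_STRING, "subset", "pid", 0);
	fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);

	mntfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0);
	if (mntfd < 0) {
		close(fsfd);
		return -1;
	}

	move_mount(mntfd, "", AT_FDCWD, target, MOVE_MOUNT_F_EMPTY_PATH);
	close(mntfd);
	close(fsfd);
	return 0;
}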
// SPDX-License-Identifier: GPL-2.0
#include <linux/fault-inject.h>
#include <linux/debugfs.h>
#include <linux/error-injection.h>
#include <linux/mm.h>

static struct {
	struct fault_attr attr;
	bool ignore_gfp_highmem;
	bool ignore_gfp_reclaim;
	u32 min_order;
} fail_page_alloc = {
	.attr = FAULT_ATTR_INITIALIZER,
	.ignore_gfp_reclaim = true,
	.ignore_gfp_highmem = true,
	.min_order = 1,
};

static int __init setup_fail_page_alloc(char *str)
{
	return setup_fault_attr(&fail_page_alloc.attr, str);
}
__setup("fail_page_alloc=", setup_fail_page_alloc);

bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
	int flags = 0;

	if (order < fail_page_alloc.min_order)
		return false;
	if (gfp_mask & __GFP_NOFAIL)
		return false;
	if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
		return false;
	if (fail_page_alloc.ignore_gfp_reclaim &&
	    (gfp_mask & __GFP_DIRECT_RECLAIM))
		return false;

	/* See comment in __should_failslab() */
	if (gfp_mask & __GFP_NOWARN)
		flags |= FAULT_NOWARN;

	return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags);
}
ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

static int __init fail_page_alloc_debugfs(void)
{
	umode_t mode = S_IFREG | 0600;
	struct dentry *dir;

	dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
					&fail_page_alloc.attr);

	debugfs_create_bool("ignore-gfp-wait", mode, dir,
			    &fail_page_alloc.ignore_gfp_reclaim);
	debugfs_create_bool("ignore-gfp-highmem", mode, dir,
			    &fail_page_alloc.ignore_gfp_highmem);
	debugfs_create_u32("min-order", mode, dir,
			   &fail_page_alloc.min_order);

	return 0;
}

late_initcall(fail_page_alloc_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
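The debugfs entries created above are driven from userspace. A hedged sketch follows; it assumes debugfs is mounted at /sys/kernel/debug, and that the shared fault_attr knobs set up by fault_create_debugfs_attr() include "probability" and "times" (the "min-order" and "ignore-gfp-*" files are the ones created directly in this file).

#include <stdio.h>

/* Hypothetical helper: write one value into a fail_page_alloc knob. */
static void set_knob(const char *name, const char *val)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/kernel/debug/fail_page_alloc/%s", name);
	f = fopen(path, "w");
	if (!f)
		return;
	fputs(val, f);
	fclose(f);
}

int main(void)
{
	set_knob("probability", "10");    /* fail ~10% of eligible allocations */
	set_knob("times", "100");         /* stop after 100 injected failures */
	set_knob("min-order", "0");       /* also fail order-0 allocations */
	set_knob("ignore-gfp-wait", "0"); /* include reclaim-capable allocations */
	return 0;
}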
| 25 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PAGE_64_H #define _ASM_X86_PAGE_64_H #include <asm/page_64_types.h> #ifndef __ASSEMBLER__ #include <asm/cpufeatures.h> #include <asm/alternative.h> #include <linux/kmsan-checks.h> #include <linux/mmdebug.h> /* duplicated to the one in bootmem.h */ extern unsigned long max_pfn; extern unsigned long phys_base; extern unsigned long page_offset_base; extern unsigned long vmalloc_base; extern unsigned long vmemmap_base; extern unsigned long direct_map_physmem_end; static __always_inline unsigned long __phys_addr_nodebug(unsigned long x) { unsigned long y = x - __START_KERNEL_map; /* use the carry flag to determine if x was < __START_KERNEL_map */ x = y + ((x > y) ? phys_base : (__START_KERNEL_map - PAGE_OFFSET)); return x; } #ifdef CONFIG_DEBUG_VIRTUAL extern unsigned long __phys_addr(unsigned long); #else #define __phys_addr(x) __phys_addr_nodebug(x) #endif static inline unsigned long __phys_addr_symbol(unsigned long x) { unsigned long y = x - __START_KERNEL_map; /* only check upper bounds since lower bounds will trigger carry */ VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE); return y + phys_base; } #define __phys_reloc_hide(x) (x) void __clear_pages_unrolled(void *page); KCFI_REFERENCE(__clear_pages_unrolled); /** * clear_pages() - clear a page range using a kernel virtual address. * @addr: start address of kernel page range * @npages: number of pages * * Switch between three implementations of page clearing based on CPU * capabilities: * * - __clear_pages_unrolled(): the oldest, slowest and universally * supported method. Zeroes via 8-byte MOV instructions unrolled 8x * to write a 64-byte cacheline in each loop iteration. * * - "REP; STOSQ": really old CPUs had crummy REP implementations. * Vendor CPU setup code sets 'REP_GOOD' on CPUs where REP can be * trusted. The instruction writes 8-byte per REP iteration but * CPUs can internally batch these together and do larger writes. * * - "REP; STOSB": used on CPUs with "enhanced REP MOVSB/STOSB", * which enumerate 'ERMS' and provide an implementation which * unlike "REP; STOSQ" above wasn't overly picky about alignment. * The instruction writes 1-byte per REP iteration with CPUs * internally batching these together into larger writes and is * generally fastest of the three. * * Note that when running as a guest, features exposed by the CPU * might be mediated by the hypervisor. So, the STOSQ variant might * be in active use on some systems even when the hardware enumerates * ERMS. * * Does absolutely no exception handling. */ static inline void clear_pages(void *addr, unsigned int npages) { u64 len = npages * PAGE_SIZE; /* * Clean up KMSAN metadata for the pages being cleared. The assembly call * below clobbers @addr, so perform unpoisoning before it. */ kmsan_unpoison_memory(addr, len); /* * The inline asm embeds a CALL instruction and usually that is a no-no * due to the compiler not knowing that and thus being unable to track * callee-clobbered registers. 
* * In this case that is fine because the registers clobbered by * __clear_pages_unrolled() are part of the inline asm register * specification. */ asm volatile(ALTERNATIVE_2("call __clear_pages_unrolled", "shrq $3, %%rcx; rep stosq", X86_FEATURE_REP_GOOD, "rep stosb", X86_FEATURE_ERMS) : "+c" (len), "+D" (addr), ASM_CALL_CONSTRAINT : "a" (0) : "cc", "memory"); } #define clear_pages clear_pages static inline void clear_page(void *addr) { clear_pages(addr, 1); } void copy_page(void *to, void *from); KCFI_REFERENCE(copy_page); /* * User space process size. This is the first address outside the user range. * There are a few constraints that determine this: * * On Intel CPUs, if a SYSCALL instruction is at the highest canonical * address, then that syscall will enter the kernel with a * non-canonical return address, and SYSRET will explode dangerously. * We avoid this particular problem by preventing anything * from being mapped at the maximum canonical address. * * On AMD CPUs in the Ryzen family, there's a nasty bug in which the * CPUs malfunction if they execute code from the highest canonical page. * They'll speculate right off the end of the canonical space, and * bad things happen. This is worked around in the same way as the * Intel problem. * * With page table isolation enabled, we map the LDT in ... [stay tuned] */ static __always_inline unsigned long task_size_max(void) { unsigned long ret; alternative_io("movq %[small],%0","movq %[large],%0", X86_FEATURE_LA57, "=r" (ret), [small] "i" ((1ul << 47)-PAGE_SIZE), [large] "i" ((1ul << 56)-PAGE_SIZE)); return ret; } #endif /* !__ASSEMBLER__ */ #ifdef CONFIG_X86_VSYSCALL_EMULATION # define __HAVE_ARCH_GATE_AREA 1 #endif #endif /* _ASM_X86_PAGE_64_H */ |
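The clear_pages() comment above explains why the length is handed to the REP alternatives in one go rather than page by page. A minimal caller sketch follows; example_alloc_zeroed_block() is a hypothetical helper (a real caller might simply pass __GFP_ZERO to the allocator instead).

#include <linux/mm.h>
#include <linux/gfp.h>

static struct page *example_alloc_zeroed_block(unsigned int order)
{
	struct page *page = alloc_pages(GFP_KERNEL, order);

	if (!page)
		return NULL;

	/*
	 * One call covers all 2^order pages, so the STOSB/STOSQ variants see
	 * the full length instead of a loop of single-page clears.
	 */
	clear_pages(page_address(page), 1U << order);
	return page;
}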
| 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_VMALLOC_H #define _LINUX_VMALLOC_H #include <linux/alloc_tag.h> #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/init.h> #include <linux/list.h> #include <linux/llist.h> #include <asm/page.h> /* pgprot_t */ #include <linux/rbtree.h> #include <linux/overflow.h> #include <asm/vmalloc.h> struct vm_area_struct; /* vma defining user mapping in mm_types.h */ struct notifier_block; /* in notifier.h */ struct iov_iter; /* in uio.h */ /* bits in flags of vmalloc's vm_struct below */ #define VM_IOREMAP 0x00000001 /* ioremap() and friends */ #define VM_ALLOC 0x00000002 /* vmalloc() */ #define VM_MAP 0x00000004 /* vmap()ed pages */ #define VM_USERMAP 0x00000008 /* suitable for remap_vmalloc_range */ #define VM_DMA_COHERENT 0x00000010 /* dma_alloc_coherent */ #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ #define VM_NO_GUARD 0x00000040 /* ***DANGEROUS*** don't add guard page */ #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ #define VM_FLUSH_RESET_PERMS 0x00000100 /* reset direct map and flush TLB on unmap, can't be freed in atomic context */ #define VM_MAP_PUT_PAGES 0x00000200 /* put pages and free array in vfree */ #define VM_ALLOW_HUGE_VMAP 0x00000400 /* Allow for huge pages on archs with HAVE_ARCH_HUGE_VMALLOC */ #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ !defined(CONFIG_KASAN_VMALLOC) #define VM_DEFER_KMEMLEAK 0x00000800 /* defer kmemleak object creation */ #else #define VM_DEFER_KMEMLEAK 0 #endif #define VM_SPARSE 0x00001000 /* sparse vm_area. not all pages are present. */ /* bits [20..32] reserved for arch specific ioremap internals */ /* * Maximum alignment for ioremap() regions. * Can be overridden by arch-specific value. */ #ifndef IOREMAP_MAX_ORDER #define IOREMAP_MAX_ORDER (7 + PAGE_SHIFT) /* 128 pages */ #endif struct vm_struct { union { struct vm_struct *next; /* Early registration of vm_areas. */ struct llist_node llnode; /* Asynchronous freeing on error paths. 
*/ }; void *addr; unsigned long size; unsigned long flags; struct page **pages; #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC unsigned int page_order; #endif unsigned int nr_pages; phys_addr_t phys_addr; const void *caller; unsigned long requested_size; }; struct vmap_area { unsigned long va_start; unsigned long va_end; struct rb_node rb_node; /* address sorted rbtree */ struct list_head list; /* address sorted list */ /* * The following two variables can be packed, because * a vmap_area object can be either: * 1) in "free" tree (root is free_vmap_area_root) * 2) or "busy" tree (root is vmap_area_root) */ union { unsigned long subtree_max_size; /* in "free" tree */ struct vm_struct *vm; /* in "busy" tree */ }; unsigned long flags; /* mark type of vm_map_ram area */ }; /* archs that select HAVE_ARCH_HUGE_VMAP should override one or more of these */ #ifndef arch_vmap_p4d_supported static inline bool arch_vmap_p4d_supported(pgprot_t prot) { return false; } #endif #ifndef arch_vmap_pud_supported static inline bool arch_vmap_pud_supported(pgprot_t prot) { return false; } #endif #ifndef arch_vmap_pmd_supported static inline bool arch_vmap_pmd_supported(pgprot_t prot) { return false; } #endif #ifndef arch_vmap_pte_range_map_size static inline unsigned long arch_vmap_pte_range_map_size(unsigned long addr, unsigned long end, u64 pfn, unsigned int max_page_shift) { return PAGE_SIZE; } #endif #ifndef arch_vmap_pte_range_unmap_size static inline unsigned long arch_vmap_pte_range_unmap_size(unsigned long addr, pte_t *ptep) { return PAGE_SIZE; } #endif #ifndef arch_vmap_pte_supported_shift static inline int arch_vmap_pte_supported_shift(unsigned long size) { return PAGE_SHIFT; } #endif #ifndef arch_vmap_pgprot_tagged static inline pgprot_t arch_vmap_pgprot_tagged(pgprot_t prot) { return prot; } #endif /* * Highlevel APIs for driver use */ extern void vm_unmap_ram(const void *mem, unsigned int count); extern void *vm_map_ram(struct page **pages, unsigned int count, int node); extern void vm_unmap_aliases(void); extern void *vmalloc_noprof(unsigned long size) __alloc_size(1); #define vmalloc(...) alloc_hooks(vmalloc_noprof(__VA_ARGS__)) extern void *vzalloc_noprof(unsigned long size) __alloc_size(1); #define vzalloc(...) alloc_hooks(vzalloc_noprof(__VA_ARGS__)) extern void *vmalloc_user_noprof(unsigned long size) __alloc_size(1); #define vmalloc_user(...) alloc_hooks(vmalloc_user_noprof(__VA_ARGS__)) extern void *vmalloc_node_noprof(unsigned long size, int node) __alloc_size(1); #define vmalloc_node(...) alloc_hooks(vmalloc_node_noprof(__VA_ARGS__)) extern void *vzalloc_node_noprof(unsigned long size, int node) __alloc_size(1); #define vzalloc_node(...) alloc_hooks(vzalloc_node_noprof(__VA_ARGS__)) extern void *vmalloc_32_noprof(unsigned long size) __alloc_size(1); #define vmalloc_32(...) alloc_hooks(vmalloc_32_noprof(__VA_ARGS__)) extern void *vmalloc_32_user_noprof(unsigned long size) __alloc_size(1); #define vmalloc_32_user(...) alloc_hooks(vmalloc_32_user_noprof(__VA_ARGS__)) extern void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) __alloc_size(1); #define __vmalloc(...) alloc_hooks(__vmalloc_noprof(__VA_ARGS__)) extern void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller) __alloc_size(1); #define __vmalloc_node_range(...) 
alloc_hooks(__vmalloc_node_range_noprof(__VA_ARGS__)) void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) __alloc_size(1); #define __vmalloc_node(...) alloc_hooks(__vmalloc_node_noprof(__VA_ARGS__)) void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node) __alloc_size(1); #define vmalloc_huge_node(...) alloc_hooks(vmalloc_huge_node_noprof(__VA_ARGS__)) static inline void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) { return vmalloc_huge_node(size, gfp_mask, NUMA_NO_NODE); } extern void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); #define __vmalloc_array(...) alloc_hooks(__vmalloc_array_noprof(__VA_ARGS__)) extern void *vmalloc_array_noprof(size_t n, size_t size) __alloc_size(1, 2); #define vmalloc_array(...) alloc_hooks(vmalloc_array_noprof(__VA_ARGS__)) extern void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); #define __vcalloc(...) alloc_hooks(__vcalloc_noprof(__VA_ARGS__)) extern void *vcalloc_noprof(size_t n, size_t size) __alloc_size(1, 2); #define vcalloc(...) alloc_hooks(vcalloc_noprof(__VA_ARGS__)) void *__must_check vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, gfp_t flags, int nid) __realloc_size(2); #define vrealloc_node_noprof(_p, _s, _f, _nid) \ vrealloc_node_align_noprof(_p, _s, 1, _f, _nid) #define vrealloc_noprof(_p, _s, _f) \ vrealloc_node_align_noprof(_p, _s, 1, _f, NUMA_NO_NODE) #define vrealloc_node_align(...) alloc_hooks(vrealloc_node_align_noprof(__VA_ARGS__)) #define vrealloc_node(...) alloc_hooks(vrealloc_node_noprof(__VA_ARGS__)) #define vrealloc(...) alloc_hooks(vrealloc_noprof(__VA_ARGS__)) extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); extern void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot); void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot); extern void vunmap(const void *addr); extern int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, void *kaddr, unsigned long pgoff, unsigned long size); extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff); int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift); /* * Lowlevel-APIs (not for driver use!) */ static inline size_t get_vm_area_size(const struct vm_struct *area) { if (!(area->flags & VM_NO_GUARD)) /* return actual size without guard page */ return area->size - PAGE_SIZE; else return area->size; } extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags); extern struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, const void *caller); extern struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, const void *caller); void free_vm_area(struct vm_struct *area); extern struct vm_struct *remove_vm_area(const void *addr); extern struct vm_struct *find_vm_area(const void *addr); struct vmap_area *find_vmap_area(unsigned long addr); static inline bool is_vm_area_hugepages(const void *addr) { /* * This may not 100% tell if the area is mapped with > PAGE_SIZE * page table entries, if for some reason the architecture indicates * larger sizes are available but decides not to use them, nothing * prevents that. This only indicates the size of the physical page * allocated in the vmalloc layer. 
*/ #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC return find_vm_area(addr)->page_order > 0; #else return false; #endif } /* for /proc/kcore */ long vread_iter(struct iov_iter *iter, const char *addr, size_t count); /* * Internals. Don't use.. */ __init void vm_area_add_early(struct vm_struct *vm); __init void vm_area_register_early(struct vm_struct *vm, size_t align); int register_vmap_purge_notifier(struct notifier_block *nb); int unregister_vmap_purge_notifier(struct notifier_block *nb); #ifdef CONFIG_MMU #define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) int vm_area_map_pages(struct vm_struct *area, unsigned long start, unsigned long end, struct page **pages); void vm_area_unmap_pages(struct vm_struct *area, unsigned long start, unsigned long end); void vunmap_range(unsigned long addr, unsigned long end); static inline void set_vm_flush_reset_perms(void *addr) { struct vm_struct *vm = find_vm_area(addr); if (vm) vm->flags |= VM_FLUSH_RESET_PERMS; } #else /* !CONFIG_MMU */ #define VMALLOC_TOTAL 0UL static inline void set_vm_flush_reset_perms(void *addr) {} #endif /* CONFIG_MMU */ #if defined(CONFIG_MMU) && defined(CONFIG_SMP) struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, size_t align); void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms); # else static inline struct vm_struct ** pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, size_t align) { return NULL; } static inline void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) {} #endif #if defined(CONFIG_MMU) && defined(CONFIG_PRINTK) bool vmalloc_dump_obj(void *object); #else static inline bool vmalloc_dump_obj(void *object) { return false; } #endif unsigned int memalloc_apply_gfp_scope(gfp_t gfp_mask); void memalloc_restore_scope(unsigned int flags); #endif /* _LINUX_VMALLOC_H */ |
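/*
 * Illustrative sketch (not part of vmalloc.h above): a hypothetical user of
 * the high-level API. The struct and helper names are made up for the
 * example; the point is that vcalloc() returns virtually contiguous,
 * zeroed, overflow-checked memory that need not be physically contiguous
 * and is released with vfree() (vfree_atomic() exists for contexts where
 * vfree() is not allowed).
 */
struct example_entry {
	u64 key;
	u64 value;
};

static struct example_entry *example_table_alloc(size_t nr_entries)
{
	/* n * size allocation with overflow checking, pages pre-zeroed */
	return vcalloc(nr_entries, sizeof(struct example_entry));
}

static void example_table_free(struct example_entry *table)
{
	vfree(table);	/* a NULL pointer is silently ignored */
}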
914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 | // SPDX-License-Identifier: GPL-2.0 /* * SUCS NET3: * * Generic datagram handling routines. These are generic for all * protocols. Possibly a generic IP version on top of these would * make sense. Not tonight however 8-). * This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and * NetROM layer all have identical poll code and mostly * identical recvmsg() code. So we share it here. The poll was * shared before but buried in udp.c so I moved it. * * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>. (datagram_poll() from old * udp.c code) * * Fixes: * Alan Cox : NULL return from skb_peek_copy() * understood * Alan Cox : Rewrote skb_read_datagram to avoid the * skb_peek_copy stuff. * Alan Cox : Added support for SOCK_SEQPACKET. * IPX can no longer use the SO_TYPE hack * but AX.25 now works right, and SPX is * feasible. * Alan Cox : Fixed write poll of non IP protocol * crash. * Florian La Roche: Changed for my new skbuff handling. * Darryl Miles : Fixed non-blocking SOCK_SEQPACKET. * Linus Torvalds : BSD semantic fixes. * Alan Cox : Datagram iovec handling * Darryl Miles : Fixed non-blocking SOCK_STREAM. * Alan Cox : POSIXisms * Pete Wyckoff : Unconnected accept() fix. * */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/uaccess.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/poll.h> #include <linux/highmem.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/iov_iter.h> #include <linux/indirect_call_wrapper.h> #include <linux/crc32.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <net/checksum.h> #include <net/sock.h> #include <net/tcp_states.h> #include <trace/events/skb.h> #include <net/busy_poll.h> #include "devmem.h" /* * Is a socket 'connection oriented' ? */ static inline int connection_based(struct sock *sk) { return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM; } static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { /* * Avoid a wakeup if event not interesting for us */ if (key && !(key_to_poll(key) & (EPOLLIN | EPOLLERR))) return 0; return autoremove_wake_function(wait, mode, sync, key); } /* * Wait for the last received packet to be different from skb */ int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue, int *err, long *timeo_p, const struct sk_buff *skb) { int error; DEFINE_WAIT_FUNC(wait, receiver_wake_function); prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); /* Socket errors? */ error = sock_error(sk); if (error) goto out_err; if (READ_ONCE(queue->prev) != skb) goto out; /* Socket shut down? */ if (sk->sk_shutdown & RCV_SHUTDOWN) goto out_noerr; /* Sequenced packets can come disconnected. 
* If so we report the problem */ error = -ENOTCONN; if (connection_based(sk) && !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN)) goto out_err; /* handle signals */ if (signal_pending(current)) goto interrupted; error = 0; *timeo_p = schedule_timeout(*timeo_p); out: finish_wait(sk_sleep(sk), &wait); return error; interrupted: error = sock_intr_errno(*timeo_p); out_err: *err = error; goto out; out_noerr: *err = 0; error = 1; goto out; } EXPORT_SYMBOL(__skb_wait_for_more_packets); static struct sk_buff *skb_set_peeked(struct sk_buff *skb) { struct sk_buff *nskb; if (skb->peeked) return skb; /* We have to unshare an skb before modifying it. */ if (!skb_shared(skb)) goto done; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return ERR_PTR(-ENOMEM); skb->prev->next = nskb; skb->next->prev = nskb; nskb->prev = skb->prev; nskb->next = skb->next; consume_skb(skb); skb = nskb; done: skb->peeked = 1; return skb; } struct sk_buff *__skb_try_recv_from_queue(struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last) { bool peek_at_off = false; struct sk_buff *skb; int _off = 0; if (unlikely(flags & MSG_PEEK && *off >= 0)) { peek_at_off = true; _off = *off; } *last = queue->prev; skb_queue_walk(queue, skb) { if (flags & MSG_PEEK) { if (peek_at_off && _off >= skb->len && (_off || skb->peeked)) { _off -= skb->len; continue; } if (!skb->len) { skb = skb_set_peeked(skb); if (IS_ERR(skb)) { *err = PTR_ERR(skb); return NULL; } } refcount_inc(&skb->users); } else { __skb_unlink(skb, queue); } *off = _off; return skb; } return NULL; } /** * __skb_try_recv_datagram - Receive a datagram skbuff * @sk: socket * @queue: socket queue from which to receive * @flags: MSG\_ flags * @off: an offset in bytes to peek skb from. Returns an offset * within an skb where data actually starts * @err: error code returned * @last: set to last peeked message to inform the wait function * what to look for when peeking * * Get a datagram skbuff, understands the peeking, nonblocking wakeups * and possible races. This replaces identical code in packet, raw and * udp, as well as the IPX AX.25 and Appletalk. It also finally fixes * the long standing peek and read race for datagram sockets. If you * alter this routine remember it must be re-entrant. * * This function will lock the socket if a skb is returned, so * the caller needs to unlock the socket in that case (usually by * calling skb_free_datagram). Returns NULL with @err set to * -EAGAIN if no data was available or to some other value if an * error was detected. * * * It does not lock socket since today. This function is * * free of race conditions. This measure should/can improve * * significantly datagram socket latencies at high loads, * * when data copying to user space takes lots of time. * * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet * * 8) Great win.) * * --ANK (980729) * * The order of the tests when we find no data waiting are specified * quite explicitly by POSIX 1003.1g, don't change them without having * the standard around please. */ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last) { struct sk_buff *skb; unsigned long cpu_flags; /* * Caller is allowed not to check sk->sk_err before skb_recv_datagram() */ int error = sock_error(sk); if (error) goto no_packet; do { /* Again only user level code calls this function, so nothing * interrupt level will suddenly eat the receive_queue. 
* * Look at current nfs client by the way... * However, this function was correct in any case. 8) */ spin_lock_irqsave(&queue->lock, cpu_flags); skb = __skb_try_recv_from_queue(queue, flags, off, &error, last); spin_unlock_irqrestore(&queue->lock, cpu_flags); if (error) goto no_packet; if (skb) return skb; if (!sk_can_busy_loop(sk)) break; sk_busy_loop(sk, flags & MSG_DONTWAIT); } while (READ_ONCE(queue->prev) != *last); error = -EAGAIN; no_packet: *err = error; return NULL; } EXPORT_SYMBOL(__skb_try_recv_datagram); struct sk_buff *__skb_recv_datagram(struct sock *sk, struct sk_buff_head *sk_queue, unsigned int flags, int *off, int *err) { struct sk_buff *skb, *last; long timeo; timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { skb = __skb_try_recv_datagram(sk, sk_queue, flags, off, err, &last); if (skb) return skb; if (*err != -EAGAIN) break; } while (timeo && !__skb_wait_for_more_packets(sk, sk_queue, err, &timeo, last)); return NULL; } EXPORT_SYMBOL(__skb_recv_datagram); struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, int *err) { int off = 0; return __skb_recv_datagram(sk, &sk->sk_receive_queue, flags, &off, err); } EXPORT_SYMBOL(skb_recv_datagram); void skb_free_datagram(struct sock *sk, struct sk_buff *skb) { consume_skb(skb); } EXPORT_SYMBOL(skb_free_datagram); int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, struct sk_buff *skb, unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb)) { int err = 0; if (flags & MSG_PEEK) { err = -ENOENT; spin_lock_bh(&sk_queue->lock); if (skb->next) { __skb_unlink(skb, sk_queue); refcount_dec(&skb->users); if (destructor) destructor(sk, skb); err = 0; } spin_unlock_bh(&sk_queue->lock); } sk_drops_inc(sk); return err; } EXPORT_SYMBOL(__sk_queue_drop_skb); /** * skb_kill_datagram - Free a datagram skbuff forcibly * @sk: socket * @skb: datagram skbuff * @flags: MSG\_ flags * * This function frees a datagram skbuff that was received by * skb_recv_datagram. The flags argument must match the one * used for skb_recv_datagram. * * If the MSG_PEEK flag is set, and the packet is still on the * receive queue of the socket, it will be taken off the queue * before it is freed. * * This function currently only disables BH when acquiring the * sk_receive_queue lock. Therefore it must not be used in a * context where that lock is acquired in an IRQ context. * * It returns 0 if the packet was removed by us. */ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) { int err = __sk_queue_drop_skb(sk, &sk->sk_receive_queue, skb, flags, NULL); kfree_skb(skb); return err; } EXPORT_SYMBOL(skb_kill_datagram); INDIRECT_CALLABLE_DECLARE(static size_t simple_copy_to_iter(const void *addr, size_t bytes, void *data __always_unused, struct iov_iter *i)); static int __skb_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, bool fault_short, size_t (*cb)(const void *, size_t, void *, struct iov_iter *), void *data) { int start = skb_headlen(skb); int i, copy = start - offset, start_off = offset, n; struct sk_buff *frag_iter; /* Copy header. */ if (copy > 0) { if (copy > len) copy = len; n = INDIRECT_CALL_1(cb, simple_copy_to_iter, skb->data + offset, copy, data, to); offset += n; if (n != copy) goto short_copy; if ((len -= copy) == 0) return 0; } if (!skb_frags_readable(skb)) goto short_copy; /* Copy paged appendix. Hmm... why does this look so complicated? 
*/ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; WARN_ON(start > offset + len); end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { u32 p_off, p_len, copied; struct page *p; u8 *vaddr; if (copy > len) copy = len; n = 0; skb_frag_foreach_page(frag, skb_frag_off(frag) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_local_page(p); n += INDIRECT_CALL_1(cb, simple_copy_to_iter, vaddr + p_off, p_len, data, to); kunmap_local(vaddr); } offset += n; if (n != copy) goto short_copy; if (!(len -= copy)) return 0; } start = end; } skb_walk_frags(skb, frag_iter) { int end; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { if (copy > len) copy = len; if (__skb_datagram_iter(frag_iter, offset - start, to, copy, fault_short, cb, data)) goto fault; if ((len -= copy) == 0) return 0; offset += copy; } start = end; } if (!len) return 0; /* This is not really a user copy fault, but rather someone * gave us a bogus length on the skb. We should probably * print a warning here as it may indicate a kernel bug. */ fault: iov_iter_revert(to, offset - start_off); return -EFAULT; short_copy: if (fault_short || iov_iter_count(to)) goto fault; return 0; } #ifdef CONFIG_NET_CRC32C static size_t crc32c_and_copy_to_iter(const void *addr, size_t bytes, void *_crcp, struct iov_iter *i) { u32 *crcp = _crcp; size_t copied; copied = copy_to_iter(addr, bytes, i); *crcp = crc32c(*crcp, addr, copied); return copied; } /** * skb_copy_and_crc32c_datagram_iter - Copy datagram to an iovec iterator * and update a CRC32C value. * @skb: buffer to copy * @offset: offset in the buffer to start copying from * @to: iovec iterator to copy to * @len: amount of data to copy from buffer to iovec * @crcp: pointer to CRC32C value to update * * Return: 0 on success, -EFAULT if there was a fault during copy. */ int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, u32 *crcp) { return __skb_datagram_iter(skb, offset, to, len, true, crc32c_and_copy_to_iter, crcp); } EXPORT_SYMBOL(skb_copy_and_crc32c_datagram_iter); #endif /* CONFIG_NET_CRC32C */ static size_t simple_copy_to_iter(const void *addr, size_t bytes, void *data __always_unused, struct iov_iter *i) { return copy_to_iter(addr, bytes, i); } /** * skb_copy_datagram_iter - Copy a datagram to an iovec iterator. * @skb: buffer to copy * @offset: offset in the buffer to start copying from * @to: iovec iterator to copy to * @len: amount of data to copy from buffer to iovec */ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len) { trace_skb_copy_datagram_iovec(skb, len); return __skb_datagram_iter(skb, offset, to, len, false, simple_copy_to_iter, NULL); } EXPORT_SYMBOL(skb_copy_datagram_iter); /** * skb_copy_datagram_from_iter - Copy a datagram from an iov_iter. * @skb: buffer to copy * @offset: offset in the buffer to start copying to * @from: the copy source * @len: amount of data to copy to buffer from iovec * * Returns 0 or -EFAULT. */ int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, struct iov_iter *from, int len) { int start = skb_headlen(skb); int i, copy = start - offset; struct sk_buff *frag_iter; /* Copy header. */ if (copy > 0) { if (copy > len) copy = len; if (copy_from_iter(skb->data + offset, copy, from) != copy) goto fault; if ((len -= copy) == 0) return 0; offset += copy; } /* Copy paged appendix. Hmm... 
why does this look so complicated? */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; WARN_ON(start > offset + len); end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { size_t copied; if (copy > len) copy = len; copied = copy_page_from_iter(skb_frag_page(frag), skb_frag_off(frag) + offset - start, copy, from); if (copied != copy) goto fault; if (!(len -= copy)) return 0; offset += copy; } start = end; } skb_walk_frags(skb, frag_iter) { int end; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { if (copy > len) copy = len; if (skb_copy_datagram_from_iter(frag_iter, offset - start, from, copy)) goto fault; if ((len -= copy) == 0) return 0; offset += copy; } start = end; } if (!len) return 0; fault: return -EFAULT; } EXPORT_SYMBOL(skb_copy_datagram_from_iter); int skb_copy_datagram_from_iter_full(struct sk_buff *skb, int offset, struct iov_iter *from, int len) { struct iov_iter_state state; int ret; iov_iter_save_state(from, &state); ret = skb_copy_datagram_from_iter(skb, offset, from, len); if (ret) iov_iter_restore(from, &state); return ret; } EXPORT_SYMBOL(skb_copy_datagram_from_iter_full); int zerocopy_fill_skb_from_iter(struct sk_buff *skb, struct iov_iter *from, size_t length) { int frag = skb_shinfo(skb)->nr_frags; if (!skb_frags_readable(skb)) return -EFAULT; while (length && iov_iter_count(from)) { struct page *head, *last_head = NULL; struct page *pages[MAX_SKB_FRAGS]; int refs, order, n = 0; size_t start; ssize_t copied; if (frag == MAX_SKB_FRAGS) return -EMSGSIZE; copied = iov_iter_get_pages2(from, pages, length, MAX_SKB_FRAGS - frag, &start); if (copied < 0) return -EFAULT; length -= copied; skb->data_len += copied; skb->len += copied; skb->truesize += PAGE_ALIGN(copied + start); head = compound_head(pages[n]); order = compound_order(head); for (refs = 0; copied != 0; start = 0) { int size = min_t(int, copied, PAGE_SIZE - start); if (pages[n] - head > (1UL << order) - 1) { head = compound_head(pages[n]); order = compound_order(head); } start += (pages[n] - head) << PAGE_SHIFT; copied -= size; n++; if (frag) { skb_frag_t *last = &skb_shinfo(skb)->frags[frag - 1]; if (head == skb_frag_page(last) && start == skb_frag_off(last) + skb_frag_size(last)) { skb_frag_size_add(last, size); /* We combined this page, we need to release * a reference. Since compound pages refcount * is shared among many pages, batch the refcount * adjustments to limit false sharing. */ last_head = head; refs++; continue; } } if (refs) { page_ref_sub(last_head, refs); refs = 0; } skb_fill_page_desc_noacc(skb, frag++, head, start, size); } if (refs) page_ref_sub(last_head, refs); } return 0; } static int zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from, int length, struct net_devmem_dmabuf_binding *binding) { int i = skb_shinfo(skb)->nr_frags; size_t virt_addr, size, off; struct net_iov *niov; /* Devmem filling works by taking an IOVEC from the user where the * iov_addrs are interpreted as an offset in bytes into the dma-buf to * send from. We do not support other iter types. 
*/ if (iov_iter_type(from) != ITER_IOVEC && iov_iter_type(from) != ITER_UBUF) return -EFAULT; while (length && iov_iter_count(from)) { if (i == MAX_SKB_FRAGS) return -EMSGSIZE; virt_addr = (size_t)iter_iov_addr(from); niov = net_devmem_get_niov_at(binding, virt_addr, &off, &size); if (!niov) return -EFAULT; size = min_t(size_t, size, length); size = min_t(size_t, size, iter_iov_len(from)); get_netmem(net_iov_to_netmem(niov)); skb_add_rx_frag_netmem(skb, i, net_iov_to_netmem(niov), off, size, PAGE_SIZE); iov_iter_advance(from, size); length -= size; i++; } return 0; } int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, size_t length, struct net_devmem_dmabuf_binding *binding) { unsigned long orig_size = skb->truesize; unsigned long truesize; int ret; if (msg && msg->msg_ubuf && msg->sg_from_iter) ret = msg->sg_from_iter(skb, from, length); else if (binding) ret = zerocopy_fill_skb_from_devmem(skb, from, length, binding); else ret = zerocopy_fill_skb_from_iter(skb, from, length); truesize = skb->truesize - orig_size; if (sk && sk->sk_type == SOCK_STREAM) { sk_wmem_queued_add(sk, truesize); if (!skb_zcopy_pure(skb)) sk_mem_charge(sk, truesize); } else { refcount_add(truesize, &skb->sk->sk_wmem_alloc); } return ret; } EXPORT_SYMBOL(__zerocopy_sg_from_iter); /** * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter * @skb: buffer to copy * @from: the source to copy from * * The function will first copy up to headlen, and then pin the userspace * pages and build frags through them. * * Returns 0, -EFAULT or -EMSGSIZE. */ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) { int copy = min_t(int, skb_headlen(skb), iov_iter_count(from)); /* copy up to skb headlen */ if (skb_copy_datagram_from_iter(skb, 0, from, copy)) return -EFAULT; return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U, NULL); } EXPORT_SYMBOL(zerocopy_sg_from_iter); static __always_inline size_t copy_to_user_iter_csum(void __user *iter_to, size_t progress, size_t len, void *from, void *priv2) { __wsum next, *csum = priv2; next = csum_and_copy_to_user(from + progress, iter_to, len); *csum = csum_block_add(*csum, next, progress); return next ? 0 : len; } static __always_inline size_t memcpy_to_iter_csum(void *iter_to, size_t progress, size_t len, void *from, void *priv2) { __wsum *csum = priv2; __wsum next = csum_partial_copy_nocheck(from + progress, iter_to, len); *csum = csum_block_add(*csum, next, progress); return 0; } struct csum_state { __wsum csum; size_t off; }; static size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate, struct iov_iter *i) { struct csum_state *csstate = _csstate; __wsum sum; if (WARN_ON_ONCE(i->data_source)) return 0; if (unlikely(iov_iter_is_discard(i))) { // can't use csum_memcpy() for that one - data is not copied csstate->csum = csum_block_add(csstate->csum, csum_partial(addr, bytes, 0), csstate->off); csstate->off += bytes; return bytes; } sum = csum_shift(csstate->csum, csstate->off); bytes = iterate_and_advance2(i, bytes, (void *)addr, &sum, copy_to_user_iter_csum, memcpy_to_iter_csum); csstate->csum = csum_shift(sum, csstate->off); csstate->off += bytes; return bytes; } /** * skb_copy_and_csum_datagram - Copy datagram to an iovec iterator * and update a checksum. 
* @skb: buffer to copy * @offset: offset in the buffer to start copying from * @to: iovec iterator to copy to * @len: amount of data to copy from buffer to iovec * @csump: checksum pointer */ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, __wsum *csump) { struct csum_state csdata = { .csum = *csump }; int ret; ret = __skb_datagram_iter(skb, offset, to, len, true, csum_and_copy_to_iter, &csdata); if (ret) return ret; *csump = csdata.csum; return 0; } /** * skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec. * @skb: skbuff * @hlen: hardware length * @msg: destination * * Caller _must_ check that skb will fit to this iovec. * * Returns: 0 - success. * -EINVAL - checksum failure. * -EFAULT - fault during copy. */ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen, struct msghdr *msg) { __wsum csum; int chunk = skb->len - hlen; if (!chunk) return 0; if (msg_data_left(msg) < chunk) { if (__skb_checksum_complete(skb)) return -EINVAL; if (skb_copy_datagram_msg(skb, hlen, msg, chunk)) goto fault; } else { csum = csum_partial(skb->data, hlen, skb->csum); if (skb_copy_and_csum_datagram(skb, hlen, &msg->msg_iter, chunk, &csum)) goto fault; if (csum_fold(csum)) { iov_iter_revert(&msg->msg_iter, chunk); return -EINVAL; } if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) netdev_rx_csum_fault(NULL, skb); } return 0; fault: return -EFAULT; } EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg); /** * datagram_poll_queue - same as datagram_poll, but on a specific receive * queue * @file: file struct * @sock: socket * @wait: poll table * @rcv_queue: receive queue to poll * * Performs polling on the given receive queue, handling shutdown, error, * and connection state. This is useful for protocols that deliver * userspace-bound packets through a custom queue instead of * sk->sk_receive_queue. * * Return: poll bitmask indicating the socket's current state */ __poll_t datagram_poll_queue(struct file *file, struct socket *sock, poll_table *wait, struct sk_buff_head *rcv_queue) { struct sock *sk = sock->sk; __poll_t mask; u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; /* exceptional events? */ if (READ_ONCE(sk->sk_err) || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); shutdown = READ_ONCE(sk->sk_shutdown); if (shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; if (shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; /* readable? */ if (!skb_queue_empty_lockless(rcv_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ if (connection_based(sk)) { int state = READ_ONCE(sk->sk_state); if (state == TCP_CLOSE) mask |= EPOLLHUP; /* connection hasn't started yet? */ if (state == TCP_SYN_SENT) return mask; } /* writable? */ if (sock_writeable(sk)) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; else sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } EXPORT_SYMBOL(datagram_poll_queue); /** * datagram_poll - generic datagram poll * @file: file struct * @sock: socket * @wait: poll table * * Datagram poll: Again totally generic. This also handles * sequenced packet sockets providing the socket receive queue * is only ever holding data ready to receive. * * Note: when you *don't* use this routine for this protocol, * and you use a different write policy from sock_writeable() * then please supply your own write_space callback. 
* * Return: poll bitmask indicating the socket's current state */ __poll_t datagram_poll(struct file *file, struct socket *sock, poll_table *wait) { return datagram_poll_queue(file, sock, wait, &sock->sk->sk_receive_queue); } EXPORT_SYMBOL(datagram_poll); |
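/*
 * Illustrative sketch (not part of datagram.c above): roughly how a datagram
 * protocol's ->recvmsg() is typically built on the helpers in this file.
 * This is a simplified, hypothetical example; it omits the peek-offset
 * handling, source-address copying and cmsg work that a real protocol such
 * as UDP has to do.
 */
static int example_dgram_recvmsg(struct sock *sk, struct msghdr *msg,
				 size_t len, int flags, int *addr_len)
{
	struct sk_buff *skb;
	int copied, err;

	/* blocks according to the socket timeout unless MSG_DONTWAIT is set */
	skb = skb_recv_datagram(sk, flags, &err);
	if (!skb)
		return err;

	copied = skb->len;
	if (copied > len) {
		copied = len;
		msg->msg_flags |= MSG_TRUNC;
	}

	/* copy the datagram payload into the user iov */
	err = skb_copy_datagram_msg(skb, 0, msg, copied);

	skb_free_datagram(sk, skb);
	return err ? err : copied;
}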
// SPDX-License-Identifier: GPL-2.0-only
/*
 * A generic implementation of binary search for the Linux kernel
 *
 * Copyright (C) 2008-2009 Ksplice, Inc.
 * Author: Tim Abbott <tabbott@ksplice.com>
 */

#include <linux/export.h>
#include <linux/bsearch.h>
#include <linux/kprobes.h>

/*
 * bsearch - binary search an array of elements
 * @key: pointer to item being searched for
 * @base: pointer to first element to search
 * @num: number of elements
 * @size: size of each element
 * @cmp: pointer to comparison function
 *
 * This function does a binary search on the given array. The
 * contents of the array should already be in ascending sorted order
 * under the provided comparison function.
 *
 * Note that the key need not have the same type as the elements in
 * the array, e.g. key could be a string and the comparison function
 * could compare the string with the struct's name field. However, if
 * the key and elements in the array are of the same type, you can use
 * the same comparison function for both sort() and bsearch().
 */
void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp)
{
	return __inline_bsearch(key, base, num, size, cmp);
}
EXPORT_SYMBOL(bsearch);
NOKPROBE_SYMBOL(bsearch);
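/*
 * Illustrative sketch (not part of bsearch.c above): looking up an entry in
 * a sorted table. The table and comparison function are invented for this
 * example; the only requirements from bsearch() are that @base is sorted
 * ascending under @cmp and that @cmp receives the key as its first argument.
 */
struct example_sym {
	unsigned long addr;
	const char *name;
};

static int example_sym_cmp(const void *key, const void *elt)
{
	unsigned long addr = *(const unsigned long *)key;
	const struct example_sym *sym = elt;

	if (addr < sym->addr)
		return -1;
	if (addr > sym->addr)
		return 1;
	return 0;
}

/* @syms must already be sorted by ->addr, e.g. with sort() from lib/sort.c */
static const char *example_sym_lookup(const struct example_sym *syms,
				      size_t nr_syms, unsigned long addr)
{
	const struct example_sym *sym;

	sym = bsearch(&addr, syms, nr_syms, sizeof(*syms), example_sym_cmp);
	return sym ? sym->name : NULL;
}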
/* SPDX-License-Identifier:
GPL-2.0 */ #ifndef _LINUX_CGROUP_H #define _LINUX_CGROUP_H /* * cgroup interface * * Copyright (C) 2003 BULL SA * Copyright (C) 2004-2006 Silicon Graphics, Inc. * */ #include <linux/sched.h> #include <linux/nodemask.h> #include <linux/list.h> #include <linux/rculist.h> #include <linux/cgroupstats.h> #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/kernfs.h> #include <linux/jump_label.h> #include <linux/types.h> #include <linux/notifier.h> #include <linux/ns_common.h> #include <linux/nsproxy.h> #include <linux/user_namespace.h> #include <linux/refcount.h> #include <linux/kernel_stat.h> #include <linux/cgroup-defs.h> #include <linux/cgroup_namespace.h> struct kernel_clone_args; /* * All weight knobs on the default hierarchy should use the following min, * default and max values. The default value is the logarithmic center of * MIN and MAX and allows 100x to be expressed in both directions. */ #define CGROUP_WEIGHT_MIN 1 #define CGROUP_WEIGHT_DFL 100 #define CGROUP_WEIGHT_MAX 10000 #ifdef CONFIG_CGROUPS /* * To avoid confusing the compiler (and generating warnings) with code * that attempts to access what would be a 0-element array (i.e. sized * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this * constant expression can be added. */ #define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0) enum css_task_iter_flags { CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ CSS_TASK_ITER_SKIPPED = (1U << 16), /* internal flags */ }; /* a css_task_iter should be treated as an opaque object */ struct css_task_iter { struct cgroup_subsys *ss; unsigned int flags; struct list_head *cset_pos; struct list_head *cset_head; struct list_head *tcset_pos; struct list_head *tcset_head; struct list_head *task_pos; struct list_head *cur_tasks_head; struct css_set *cur_cset; struct css_set *cur_dcset; struct task_struct *cur_task; struct list_head iters_node; /* css_set->task_iters */ }; enum cgroup_lifetime_events { CGROUP_LIFETIME_ONLINE, CGROUP_LIFETIME_OFFLINE, }; extern struct file_system_type cgroup_fs_type; extern struct cgroup_root cgrp_dfl_root; extern struct css_set init_css_set; extern struct mutex cgroup_mutex; extern spinlock_t css_set_lock; extern struct blocking_notifier_head cgroup_lifetime_notifier; #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; #include <linux/cgroup_subsys.h> #undef SUBSYS #define SUBSYS(_x) \ extern struct static_key_true _x ## _cgrp_subsys_enabled_key; \ extern struct static_key_true _x ## _cgrp_subsys_on_dfl_key; #include <linux/cgroup_subsys.h> #undef SUBSYS /** * cgroup_subsys_enabled - fast test on whether a subsys is enabled * @ss: subsystem in question */ #define cgroup_subsys_enabled(ss) \ static_branch_likely(&ss ## _enabled_key) /** * cgroup_subsys_on_dfl - fast test on whether a subsys is on default hierarchy * @ss: subsystem in question */ #define cgroup_subsys_on_dfl(ss) \ static_branch_likely(&ss ## _on_dfl_key) bool cgroup_on_dfl(const struct cgroup *cgrp); bool css_has_online_children(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup, struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, struct cgroup_subsys *ss); struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss); struct cgroup 
*cgroup_get_from_path(const char *path); struct cgroup *cgroup_get_from_fd(int fd); struct cgroup *cgroup_v1v2_get_from_fd(int fd); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); void cgroup_file_notify(struct cgroup_file *cfile); void cgroup_file_show(struct cgroup_file *cfile, bool show); int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk); void cgroup_fork(struct task_struct *p); extern int cgroup_can_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs); void cgroup_task_exit(struct task_struct *p); void cgroup_task_dead(struct task_struct *p); void cgroup_task_release(struct task_struct *p); void cgroup_task_free(struct task_struct *p); int cgroup_init_early(void); int cgroup_init(void); int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v); /* * Iteration helpers and macros. */ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *parent); struct cgroup_subsys_state *css_next_descendant_pre(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state *pos); struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *css); struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp); struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp); void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it); struct task_struct *css_task_iter_next(struct css_task_iter *it); void css_task_iter_end(struct css_task_iter *it); /** * css_for_each_child - iterate through children of a css * @pos: the css * to use as the loop cursor * @parent: css whose children to walk * * Walk @parent's children. Must be called under rcu_read_lock(). * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. * * It is allowed to temporarily drop RCU read lock during iteration. The * caller is responsible for ensuring that @pos remains accessible until * the start of the next iteration by, for example, bumping the css refcnt. */ #define css_for_each_child(pos, parent) \ for ((pos) = css_next_child(NULL, (parent)); (pos); \ (pos) = css_next_child((pos), (parent))) /** * css_for_each_descendant_pre - pre-order walk of a css's descendants * @pos: the css * to use as the loop cursor * @root: css whose descendants to walk * * Walk @root's descendants. 
@root is included in the iteration and the * first node to be visited. Must be called under rcu_read_lock(). * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. * * For example, the following guarantees that a descendant can't escape * state updates of its ancestors. * * my_online(@css) * { * Lock @css's parent and @css; * Inherit state from the parent; * Unlock both. * } * * my_update_state(@css) * { * css_for_each_descendant_pre(@pos, @css) { * Lock @pos; * if (@pos == @css) * Update @css's state; * else * Verify @pos is alive and inherit state from its parent; * Unlock @pos; * } * } * * As long as the inheriting step, including checking the parent state, is * enclosed inside @pos locking, double-locking the parent isn't necessary * while inheriting. The state update to the parent is guaranteed to be * visible by walking order and, as long as inheriting operations to the * same @pos are atomic to each other, multiple updates racing each other * still result in the correct state. It's guaranateed that at least one * inheritance happens for any css after the latest update to its parent. * * If checking parent's state requires locking the parent, each inheriting * iteration should lock and unlock both @pos->parent and @pos. * * Alternatively, a subsystem may choose to use a single global lock to * synchronize ->css_online() and ->css_offline() against tree-walking * operations. * * It is allowed to temporarily drop RCU read lock during iteration. The * caller is responsible for ensuring that @pos remains accessible until * the start of the next iteration by, for example, bumping the css refcnt. */ #define css_for_each_descendant_pre(pos, css) \ for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ (pos) = css_next_descendant_pre((pos), (css))) /** * css_for_each_descendant_post - post-order walk of a css's descendants * @pos: the css * to use as the loop cursor * @css: css whose descendants to walk * * Similar to css_for_each_descendant_pre() but performs post-order * traversal instead. @root is included in the iteration and the last * node to be visited. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. * * Note that the walk visibility guarantee example described in pre-order * walk doesn't apply the same to post-order walks. 
*/ #define css_for_each_descendant_post(pos, css) \ for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ (pos) = css_next_descendant_post((pos), (css))) /* iterate over child cgrps, lock should be held throughout iteration */ #define cgroup_for_each_live_child(child, cgrp) \ list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ if (({ lockdep_assert_held(&cgroup_mutex); \ cgroup_is_dead(child); })) \ ; \ else /* walk live descendants in pre order */ #define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ if (({ lockdep_assert_held(&cgroup_mutex); \ (dsct) = (d_css)->cgroup; \ cgroup_is_dead(dsct); })) \ ; \ else /* walk live descendants in postorder */ #define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \ css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \ if (({ lockdep_assert_held(&cgroup_mutex); \ (dsct) = (d_css)->cgroup; \ cgroup_is_dead(dsct); })) \ ; \ else /** * cgroup_taskset_for_each - iterate cgroup_taskset * @task: the loop cursor * @dst_css: the destination css * @tset: taskset to iterate * * @tset may contain multiple tasks and they may belong to multiple * processes. * * On the v2 hierarchy, there may be tasks from multiple processes and they * may not share the source or destination csses. * * On traditional hierarchies, when there are multiple tasks in @tset, if a * task of a process is in @tset, all tasks of the process are in @tset. * Also, all are guaranteed to share the same source and destination csses. * * Iteration is not in any specific order. */ #define cgroup_taskset_for_each(task, dst_css, tset) \ for ((task) = cgroup_taskset_first((tset), &(dst_css)); \ (task); \ (task) = cgroup_taskset_next((tset), &(dst_css))) /** * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset * @leader: the loop cursor * @dst_css: the destination css * @tset: taskset to iterate * * Iterate threadgroup leaders of @tset. For single-task migrations, @tset * may not contain any. */ #define cgroup_taskset_for_each_leader(leader, dst_css, tset) \ for ((leader) = cgroup_taskset_first((tset), &(dst_css)); \ (leader); \ (leader) = cgroup_taskset_next((tset), &(dst_css))) \ if ((leader) != (leader)->group_leader) \ ; \ else /* * Inline functions. */ #ifdef CONFIG_DEBUG_CGROUP_REF void css_get(struct cgroup_subsys_state *css); void css_get_many(struct cgroup_subsys_state *css, unsigned int n); bool css_tryget(struct cgroup_subsys_state *css); bool css_tryget_online(struct cgroup_subsys_state *css); void css_put(struct cgroup_subsys_state *css); void css_put_many(struct cgroup_subsys_state *css, unsigned int n); #else #define CGROUP_REF_FN_ATTRS static inline #define CGROUP_REF_EXPORT(fn) #include <linux/cgroup_refcnt.h> #endif static inline u64 cgroup_id(const struct cgroup *cgrp) { return cgrp->kn->id; } /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This * function must be called either under cgroup_mutex or rcu_read_lock() and * the caller is responsible for pinning the returned css if it wants to * keep accessing it outside the said locks. This function may return * %NULL if @cgrp doesn't have @subsys_id enabled. 
*/ static inline struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { if (CGROUP_HAS_SUBSYS_CONFIG && ss) return rcu_dereference_check(cgrp->subsys[ss->id], lockdep_is_held(&cgroup_mutex)); else return &cgrp->self; } /** * css_is_dying - test whether the specified css is dying * @css: target css * * Test whether @css is in the process of offlining or already offline. In * most cases, ->css_online() and ->css_offline() callbacks should be * enough; however, the actual offline operations are RCU delayed and this * test returns %true also when @css is scheduled to be offlined. * * This is useful, for example, when the use case requires synchronous * behavior with respect to cgroup removal. cgroup removal schedules css * offlining but the css can seem alive while the operation is being * delayed. If the delay affects user visible semantics, this test can be * used to resolve the situation. */ static inline bool css_is_dying(struct cgroup_subsys_state *css) { return css->flags & CSS_DYING; } static inline bool css_is_online(struct cgroup_subsys_state *css) { return css->flags & CSS_ONLINE; } static inline bool css_is_self(struct cgroup_subsys_state *css) { if (css == &css->cgroup->self) { /* cgroup::self should not have subsystem association */ WARN_ON(css->ss != NULL); return true; } return false; } static inline bool cgroup_is_dead(const struct cgroup *cgrp) { return !(cgrp->self.flags & CSS_ONLINE); } static inline void cgroup_get(struct cgroup *cgrp) { css_get(&cgrp->self); } static inline bool cgroup_tryget(struct cgroup *cgrp) { return css_tryget(&cgrp->self); } static inline void cgroup_put(struct cgroup *cgrp) { css_put(&cgrp->self); } static inline void cgroup_lock(void) { mutex_lock(&cgroup_mutex); } static inline void cgroup_unlock(void) { mutex_unlock(&cgroup_mutex); } /** * task_css_set_check - obtain a task's css_set with extra access conditions * @task: the task to obtain css_set for * @__c: extra condition expression to be passed to rcu_dereference_check() * * A task's css_set is RCU protected, initialized and exited while holding * task_lock(), and can only be modified while holding both cgroup_mutex * and task_lock() while the task is alive. This macro verifies that the * caller is inside proper critical section and returns @task's css_set. * * The caller can also specify additional allowed conditions via @__c, such * as locks used during the cgroup_subsys::attach() methods. */ #ifdef CONFIG_PROVE_RCU #define task_css_set_check(task, __c) \ rcu_dereference_check((task)->cgroups, \ rcu_read_lock_sched_held() || \ lockdep_is_held(&cgroup_mutex) || \ lockdep_is_held(&css_set_lock) || \ ((task)->flags & PF_EXITING) || (__c)) #else #define task_css_set_check(task, __c) \ rcu_dereference((task)->cgroups) #endif /** * task_css_check - obtain css for (task, subsys) w/ extra access conds * @task: the target task * @subsys_id: the target subsystem ID * @__c: extra condition expression to be passed to rcu_dereference_check() * * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The * synchronization rules are the same as task_css_set_check(). */ #define task_css_check(task, subsys_id, __c) \ task_css_set_check((task), (__c))->subsys[(subsys_id)] /** * task_css_set - obtain a task's css_set * @task: the task to obtain css_set for * * See task_css_set_check(). 
*/ static inline struct css_set *task_css_set(struct task_struct *task) { return task_css_set_check(task, false); } /** * task_css - obtain css for (task, subsys) * @task: the target task * @subsys_id: the target subsystem ID * * See task_css_check(). */ static inline struct cgroup_subsys_state *task_css(struct task_struct *task, int subsys_id) { return task_css_check(task, subsys_id, false); } /** * task_get_css - find and get the css for (task, subsys) * @task: the target task * @subsys_id: the target subsystem ID * * Find the css for the (@task, @subsys_id) combination, increment a * reference on and return it. This function is guaranteed to return a * valid css. The returned css may already have been offlined. */ static inline struct cgroup_subsys_state * task_get_css(struct task_struct *task, int subsys_id) { struct cgroup_subsys_state *css; rcu_read_lock(); while (true) { css = task_css(task, subsys_id); /* * Can't use css_tryget_online() here. A task which has * PF_EXITING set may stay associated with an offline css. * If such task calls this function, css_tryget_online() * will keep failing. */ if (likely(css_tryget(css))) break; cpu_relax(); } rcu_read_unlock(); return css; } /** * task_css_is_root - test whether a task belongs to the root css * @task: the target task * @subsys_id: the target subsystem ID * * Test whether @task belongs to the root css on the specified subsystem. * May be invoked in any context. */ static inline bool task_css_is_root(struct task_struct *task, int subsys_id) { return task_css_check(task, subsys_id, true) == init_css_set.subsys[subsys_id]; } static inline struct cgroup *task_cgroup(struct task_struct *task, int subsys_id) { return task_css(task, subsys_id)->cgroup; } static inline struct cgroup *task_dfl_cgroup(struct task_struct *task) { return task_css_set(task)->dfl_cgrp; } static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) { struct cgroup_subsys_state *parent_css = cgrp->self.parent; if (parent_css) return container_of(parent_css, struct cgroup, self); return NULL; } /** * cgroup_is_descendant - test ancestry * @cgrp: the cgroup to be tested * @ancestor: possible ancestor of @cgrp * * Test whether @cgrp is a descendant of @ancestor. It also returns %true * if @cgrp == @ancestor. This function is safe to call as long as @cgrp * and @ancestor are accessible. */ static inline bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) { if (cgrp->root != ancestor->root || cgrp->level < ancestor->level) return false; return cgrp->ancestors[ancestor->level] == ancestor; } /** * cgroup_ancestor - find ancestor of cgroup * @cgrp: cgroup to find ancestor of * @ancestor_level: level of ancestor to find starting from root * * Find ancestor of cgroup at specified level starting from root if it exists * and return pointer to it. Return NULL if @cgrp doesn't have ancestor at * @ancestor_level. * * This function is safe to call as long as @cgrp is accessible. */ static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp, int ancestor_level) { if (ancestor_level < 0 || ancestor_level > cgrp->level) return NULL; return cgrp->ancestors[ancestor_level]; } /** * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry * @task: the task to be tested * @ancestor: possible ancestor of @task's cgroup * * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor. * It follows all the same rules as cgroup_is_descendant, and only applies * to the default hierarchy. 
*/ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) { struct css_set *cset = task_css_set(task); return cgroup_is_descendant(cset->dfl_cgrp, ancestor); } /* no synchronization, the result can only be used as a hint */ static inline bool cgroup_is_populated(struct cgroup *cgrp) { return cgrp->nr_populated_csets + cgrp->nr_populated_domain_children + cgrp->nr_populated_threaded_children; } /* returns ino associated with a cgroup */ static inline ino_t cgroup_ino(struct cgroup *cgrp) { return kernfs_ino(cgrp->kn); } /* cft/css accessors for cftype->write() operation */ static inline struct cftype *of_cft(struct kernfs_open_file *of) { return of->kn->priv; } struct cgroup_subsys_state *of_css(struct kernfs_open_file *of); /* cft/css accessors for cftype->seq_*() operations */ static inline struct cftype *seq_cft(struct seq_file *seq) { return of_cft(seq->private); } static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) { return of_css(seq->private); } /* * Name / path handling functions. All are thin wrappers around the kernfs * counterparts and can be called under any context. */ static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) { return kernfs_name(cgrp->kn, buf, buflen); } static inline int cgroup_path(struct cgroup *cgrp, char *buf, size_t buflen) { return kernfs_path(cgrp->kn, buf, buflen); } static inline void pr_cont_cgroup_name(struct cgroup *cgrp) { pr_cont_kernfs_name(cgrp->kn); } static inline void pr_cont_cgroup_path(struct cgroup *cgrp) { pr_cont_kernfs_path(cgrp->kn); } bool cgroup_psi_enabled(void); static inline void cgroup_init_kthreadd(void) { /* * kthreadd is inherited by all kthreads, keep it in the root so * that the new kthreads are guaranteed to stay in the root until * initialization is finished. */ current->no_cgroup_migration = 1; } static inline void cgroup_kthread_ready(void) { /* * This kthread finished initialization. The creator should have * set PF_NO_SETAFFINITY if this kthread should stay in the root. 
*/ current->no_cgroup_migration = 0; } void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen); struct cgroup *__cgroup_get_from_id(u64 id); struct cgroup *cgroup_get_from_id(u64 id); #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; struct cgroup; static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } static inline void css_get(struct cgroup_subsys_state *css) {} static inline void css_put(struct cgroup_subsys_state *css) {} static inline void cgroup_lock(void) {} static inline void cgroup_unlock(void) {} static inline int cgroup_attach_task_all(struct task_struct *from, struct task_struct *t) { return 0; } static inline int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { return -EINVAL; } static inline void cgroup_fork(struct task_struct *p) {} static inline int cgroup_can_fork(struct task_struct *p, struct kernel_clone_args *kargs) { return 0; } static inline void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} static inline void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} static inline void cgroup_task_exit(struct task_struct *p) {} static inline void cgroup_task_dead(struct task_struct *p) {} static inline void cgroup_task_release(struct task_struct *p) {} static inline void cgroup_task_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } static inline void cgroup_init_kthreadd(void) {} static inline void cgroup_kthread_ready(void) {} static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) { return NULL; } static inline bool cgroup_psi_enabled(void) { return false; } static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) { return true; } static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) {} #endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUPS /* * cgroup scalable recursive statistics. */ void css_rstat_updated(struct cgroup_subsys_state *css, int cpu); void css_rstat_flush(struct cgroup_subsys_state *css); /* * Basic resource stats. */ #ifdef CONFIG_CGROUP_CPUACCT void cpuacct_charge(struct task_struct *tsk, u64 cputime); void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); #else static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} static inline void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) {} #endif void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec); void __cgroup_account_cputime_field(struct cgroup *cgrp, enum cpu_usage_stat index, u64 delta_exec); static inline void cgroup_account_cputime(struct task_struct *task, u64 delta_exec) { struct cgroup *cgrp; cpuacct_charge(task, delta_exec); cgrp = task_dfl_cgroup(task); if (cgroup_parent(cgrp)) __cgroup_account_cputime(cgrp, delta_exec); } static inline void cgroup_account_cputime_field(struct task_struct *task, enum cpu_usage_stat index, u64 delta_exec) { struct cgroup *cgrp; cpuacct_account_field(task, index, delta_exec); cgrp = task_dfl_cgroup(task); if (cgroup_parent(cgrp)) __cgroup_account_cputime_field(cgrp, index, delta_exec); } #else /* CONFIG_CGROUPS */ static inline void cgroup_account_cputime(struct task_struct *task, u64 delta_exec) {} static inline void cgroup_account_cputime_field(struct task_struct *task, enum cpu_usage_stat index, u64 delta_exec) {} #endif /* CONFIG_CGROUPS */ /* * sock->sk_cgrp_data handling. 
For more info, see sock_cgroup_data * definition in cgroup-defs.h. */ #ifdef CONFIG_SOCK_CGROUP_DATA void cgroup_sk_alloc(struct sock_cgroup_data *skcd); void cgroup_sk_clone(struct sock_cgroup_data *skcd); void cgroup_sk_free(struct sock_cgroup_data *skcd); static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) { return skcd->cgroup; } #else /* CONFIG_SOCK_CGROUP_DATA */ static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {} static inline void cgroup_sk_clone(struct sock_cgroup_data *skcd) {} static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_SOCK_CGROUP_DATA */ #ifdef CONFIG_CGROUPS void cgroup_enter_frozen(void); void cgroup_leave_frozen(bool always_leave); void cgroup_update_frozen(struct cgroup *cgrp); void cgroup_freeze(struct cgroup *cgrp, bool freeze); void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src, struct cgroup *dst); static inline bool cgroup_task_frozen(struct task_struct *task) { return task->frozen; } #else /* !CONFIG_CGROUPS */ static inline void cgroup_enter_frozen(void) { } static inline void cgroup_leave_frozen(bool always_leave) { } static inline bool cgroup_task_frozen(struct task_struct *task) { return false; } #endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUP_BPF static inline void cgroup_bpf_get(struct cgroup *cgrp) { percpu_ref_get(&cgrp->bpf.refcnt); } static inline void cgroup_bpf_put(struct cgroup *cgrp) { percpu_ref_put(&cgrp->bpf.refcnt); } #else /* CONFIG_CGROUP_BPF */ static inline void cgroup_bpf_get(struct cgroup *cgrp) {} static inline void cgroup_bpf_put(struct cgroup *cgrp) {} #endif /* CONFIG_CGROUP_BPF */ struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id); struct cgroup_of_peak *of_peak(struct kernfs_open_file *of); #endif /* _LINUX_CGROUP_H */
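/*
 * Editorial example -- not part of cgroup.h. A minimal sketch of the pre-order
 * descendant-walk pattern documented above for css_for_each_descendant_pre().
 * The controller type 'struct my_css', the helper to_my_css(), the per-css
 * spinlock and the 'limit' field are hypothetical names invented for this
 * illustration; only the iteration macro, css_is_online() and the RCU locking
 * requirement come from the header itself.
 */
struct my_css {
	struct cgroup_subsys_state css;
	spinlock_t lock;		/* protects @limit */
	unsigned long limit;
};

static inline struct my_css *to_my_css(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct my_css, css) : NULL;
}

/*
 * Propagate a new limit from @mcs to all of its descendants. Because each
 * inheritance step runs under @pos's lock and reads parent state that an
 * earlier visit in the same pre-order walk has already updated, racing
 * updates still converge on a consistent result, as described above.
 */
static void my_update_limit(struct my_css *mcs, unsigned long new_limit)
{
	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_pre(pos, &mcs->css) {
		struct my_css *pos_mcs = to_my_css(pos);
		struct my_css *parent = to_my_css(pos->parent);

		spin_lock(&pos_mcs->lock);
		if (pos == &mcs->css)
			pos_mcs->limit = new_limit;
		else if (css_is_online(pos) && parent)
			pos_mcs->limit = min(pos_mcs->limit, parent->limit);
		spin_unlock(&pos_mcs->lock);
	}
	rcu_read_unlock();
}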
/* SPDX-License-Identifier: GPL-2.0 */ /* * Describes operations that can be performed on software-defined page table * leaf entries. These are abstracted from the hardware page table entries * themselves by the softleaf_t type, see mm_types.h. */ #ifndef _LINUX_LEAFOPS_H #define _LINUX_LEAFOPS_H #include <linux/mm_types.h> #include <linux/swapops.h> #include <linux/swap.h> #ifdef CONFIG_MMU /* Temporary until swp_entry_t eliminated. */ #define LEAF_TYPE_SHIFT SWP_TYPE_SHIFT enum softleaf_type { /* Fundamental types. */ SOFTLEAF_NONE, SOFTLEAF_SWAP, /* Migration types. */ SOFTLEAF_MIGRATION_READ, SOFTLEAF_MIGRATION_READ_EXCLUSIVE, SOFTLEAF_MIGRATION_WRITE, /* Device types. */ SOFTLEAF_DEVICE_PRIVATE_READ, SOFTLEAF_DEVICE_PRIVATE_WRITE, SOFTLEAF_DEVICE_EXCLUSIVE, /* H/W poison types. */ SOFTLEAF_HWPOISON, /* Marker types. */ SOFTLEAF_MARKER, }; /** * softleaf_mk_none() - Create an empty ('none') leaf entry. * Returns: empty leaf entry. 
*/ static inline softleaf_t softleaf_mk_none(void) { return ((softleaf_t) { 0 }); } /** * softleaf_from_pte() - Obtain a leaf entry from a PTE entry. * @pte: PTE entry. * * If @pte is present (therefore not a leaf entry) the function returns an empty * leaf entry. Otherwise, it returns a leaf entry. * * Returns: Leaf entry. */ static inline softleaf_t softleaf_from_pte(pte_t pte) { softleaf_t arch_entry; if (pte_present(pte) || pte_none(pte)) return softleaf_mk_none(); pte = pte_swp_clear_flags(pte); arch_entry = __pte_to_swp_entry(pte); /* Temporary until swp_entry_t eliminated. */ return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); } /** * softleaf_to_pte() - Obtain a PTE entry from a leaf entry. * @entry: Leaf entry. * * This generates an architecture-specific PTE entry that can be utilised to * encode the metadata the leaf entry encodes. * * Returns: Architecture-specific PTE entry encoding leaf entry. */ static inline pte_t softleaf_to_pte(softleaf_t entry) { /* Temporary until swp_entry_t eliminated. */ return swp_entry_to_pte(entry); } #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION /** * softleaf_from_pmd() - Obtain a leaf entry from a PMD entry. * @pmd: PMD entry. * * If @pmd is present (therefore not a leaf entry) the function returns an empty * leaf entry. Otherwise, it returns a leaf entry. * * Returns: Leaf entry. */ static inline softleaf_t softleaf_from_pmd(pmd_t pmd) { softleaf_t arch_entry; if (pmd_present(pmd) || pmd_none(pmd)) return softleaf_mk_none(); if (pmd_swp_soft_dirty(pmd)) pmd = pmd_swp_clear_soft_dirty(pmd); if (pmd_swp_uffd_wp(pmd)) pmd = pmd_swp_clear_uffd_wp(pmd); arch_entry = __pmd_to_swp_entry(pmd); /* Temporary until swp_entry_t eliminated. */ return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); } #else static inline softleaf_t softleaf_from_pmd(pmd_t pmd) { return softleaf_mk_none(); } #endif /** * softleaf_is_none() - Is the leaf entry empty? * @entry: Leaf entry. * * Empty entries are typically the result of a 'none' page table leaf entry * being converted to a leaf entry. * * Returns: true if the entry is empty, false otherwise. */ static inline bool softleaf_is_none(softleaf_t entry) { return entry.val == 0; } /** * softleaf_type() - Identify the type of leaf entry. * @entry: Leaf entry. * * Returns: the leaf entry type associated with @entry. */ static inline enum softleaf_type softleaf_type(softleaf_t entry) { unsigned int type_num; if (softleaf_is_none(entry)) return SOFTLEAF_NONE; type_num = entry.val >> LEAF_TYPE_SHIFT; if (type_num < MAX_SWAPFILES) return SOFTLEAF_SWAP; switch (type_num) { #ifdef CONFIG_MIGRATION case SWP_MIGRATION_READ: return SOFTLEAF_MIGRATION_READ; case SWP_MIGRATION_READ_EXCLUSIVE: return SOFTLEAF_MIGRATION_READ_EXCLUSIVE; case SWP_MIGRATION_WRITE: return SOFTLEAF_MIGRATION_WRITE; #endif #ifdef CONFIG_DEVICE_PRIVATE case SWP_DEVICE_WRITE: return SOFTLEAF_DEVICE_PRIVATE_WRITE; case SWP_DEVICE_READ: return SOFTLEAF_DEVICE_PRIVATE_READ; case SWP_DEVICE_EXCLUSIVE: return SOFTLEAF_DEVICE_EXCLUSIVE; #endif #ifdef CONFIG_MEMORY_FAILURE case SWP_HWPOISON: return SOFTLEAF_HWPOISON; #endif case SWP_PTE_MARKER: return SOFTLEAF_MARKER; } /* Unknown entry type. */ VM_WARN_ON_ONCE(1); return SOFTLEAF_NONE; } /** * softleaf_is_swap() - Is this leaf entry a swap entry? * @entry: Leaf entry. * * Returns: true if the leaf entry is a swap entry, otherwise false. 
*/ static inline bool softleaf_is_swap(softleaf_t entry) { return softleaf_type(entry) == SOFTLEAF_SWAP; } /** * softleaf_is_migration_write() - Is this leaf entry a writable migration entry? * @entry: Leaf entry. * * Returns: true if the leaf entry is a writable migration entry, otherwise * false. */ static inline bool softleaf_is_migration_write(softleaf_t entry) { return softleaf_type(entry) == SOFTLEAF_MIGRATION_WRITE; } /** * softleaf_is_migration_read() - Is this leaf entry a readable migration entry? * @entry: Leaf entry. * * Returns: true if the leaf entry is a readable migration entry, otherwise * false. */ static inline bool softleaf_is_migration_read(softleaf_t entry) { return softleaf_type(entry) == SOFTLEAF_MIGRATION_READ; } /** * softleaf_is_migration_read_exclusive() - Is this leaf entry an exclusive * readable migration entry? * @entry: Leaf entry. * * Returns: true if the leaf entry is an exclusive readable migration entry, * otherwise false. */ static inline bool softleaf_is_migration_read_exclusive(softleaf_t entry) { return softleaf_type(entry) == SOFTLEAF_MIGRATION_READ_EXCLUSIVE; } /** * softleaf_is_migration() - Is this leaf entry a migration entry? * @entry: Leaf entry. * * Returns: true if the leaf entry is a migration entry, otherwise false. */ static inline bool softleaf_is_migration(softleaf_t entry) { switch (softleaf_type(entry)) { case SOFTLEAF_MIGRATION_READ: case SOFTLEAF_MIGRATION_READ_EXCLUSIVE: case SOFTLEAF_MIGRATION_WRITE: return true; default: return false; } } /** * softleaf_is_device_private_write() - Is this leaf entry a device private * writable entry? * @entry: Leaf entry. * * Returns: true if the leaf entry is a device private writable entry, otherwise * false. */ static inline bool softleaf_is_device_private_write(softleaf_t entry) { return softleaf_type(entry) == SOFTLEAF_DEVICE_PRIVATE_WRITE; } /** * softleaf_is_device_private() - Is this leaf entry a device private entry? * @entry: Leaf entry. * * Returns: true if the leaf entry is a device private entry, otherwise false. */ static inline bool softleaf_is_device_private(softleaf_t entry) { switch (softleaf_type(entry)) { case SOFTLEAF_DEVICE_PRIVATE_WRITE: case SOFTLEAF_DEVICE_PRIVATE_READ: return true; default: return false; } } /** * softleaf_is_device_exclusive() - Is this leaf entry a device-exclusive entry? * @entry: Leaf entry. * * Returns: true if the leaf entry is a device-exclusive entry, otherwise false. */ static inline bool softleaf_is_device_exclusive(softleaf_t entry) { return softleaf_type(entry) == SOFTLEAF_DEVICE_EXCLUSIVE; } /** * softleaf_is_hwpoison() - Is this leaf entry a hardware poison entry? * @entry: Leaf entry. * * Returns: true if the leaf entry is a hardware poison entry, otherwise false. */ static inline bool softleaf_is_hwpoison(softleaf_t entry) { return softleaf_type(entry) == SOFTLEAF_HWPOISON; } /** * softleaf_is_marker() - Is this leaf entry a marker? * @entry: Leaf entry. * * Returns: true if the leaf entry is a marker entry, otherwise false. */ static inline bool softleaf_is_marker(softleaf_t entry) { return softleaf_type(entry) == SOFTLEAF_MARKER; } /** * softleaf_to_marker() - Obtain marker associated with leaf entry. * @entry: Leaf entry, softleaf_is_marker(@entry) must return true. * * Returns: Marker associated with the leaf entry. 
*/ static inline pte_marker softleaf_to_marker(softleaf_t entry) { VM_WARN_ON_ONCE(!softleaf_is_marker(entry)); return swp_offset(entry) & PTE_MARKER_MASK; } /** * softleaf_has_pfn() - Does this leaf entry encode a valid PFN number? * @entry: Leaf entry. * * A pfn swap entry is a special type of swap entry that always has a pfn stored * in the swap offset. They can either be used to represent unaddressable device * memory, to restrict access to a page undergoing migration or to represent a * pfn which has been hwpoisoned and unmapped. * * Returns: true if the leaf entry encodes a PFN, otherwise false. */ static inline bool softleaf_has_pfn(softleaf_t entry) { /* Make sure the swp offset can always store the needed fields. */ BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS); if (softleaf_is_migration(entry)) return true; if (softleaf_is_device_private(entry)) return true; if (softleaf_is_device_exclusive(entry)) return true; if (softleaf_is_hwpoison(entry)) return true; return false; } /** * softleaf_to_pfn() - Obtain PFN encoded within leaf entry. * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true. * * Returns: The PFN associated with the leaf entry. */ static inline unsigned long softleaf_to_pfn(softleaf_t entry) { VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); /* Temporary until swp_entry_t eliminated. */ return swp_offset(entry) & SWP_PFN_MASK; } static inline void softleaf_migration_sync(softleaf_t entry, struct folio *folio) { /* * Ensure we do not race with split, which might alter tail pages into new * folios and thus result in observing an unlocked folio. * This matches the write barrier in __split_folio_to_order(). */ smp_rmb(); /* * Any use of migration entries may only occur while the * corresponding page is locked */ VM_WARN_ON_ONCE(!folio_test_locked(folio)); } /** * softleaf_to_page() - Obtains struct page for PFN encoded within leaf entry. * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true. * * Returns: Pointer to the struct page associated with the leaf entry's PFN. */ static inline struct page *softleaf_to_page(softleaf_t entry) { struct page *page = pfn_to_page(softleaf_to_pfn(entry)); VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); if (softleaf_is_migration(entry)) softleaf_migration_sync(entry, page_folio(page)); return page; } /** * softleaf_to_folio() - Obtains struct folio for PFN encoded within leaf entry. * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true. * * Returns: Pointer to the struct folio associated with the leaf entry's PFN. */ static inline struct folio *softleaf_to_folio(softleaf_t entry) { struct folio *folio = pfn_folio(softleaf_to_pfn(entry)); VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); if (softleaf_is_migration(entry)) softleaf_migration_sync(entry, folio); return folio; } /** * softleaf_is_poison_marker() - Is this leaf entry a poison marker? * @entry: Leaf entry. * * The poison marker is set via UFFDIO_POISON. Userfaultfd-specific. * * Returns: true if the leaf entry is a poison marker, otherwise false. */ static inline bool softleaf_is_poison_marker(softleaf_t entry) { if (!softleaf_is_marker(entry)) return false; return softleaf_to_marker(entry) & PTE_MARKER_POISONED; } /** * softleaf_is_guard_marker() - Is this leaf entry a guard region marker? * @entry: Leaf entry. * * Returns: true if the leaf entry is a guard marker, otherwise false. 
*/ static inline bool softleaf_is_guard_marker(softleaf_t entry) { if (!softleaf_is_marker(entry)) return false; return softleaf_to_marker(entry) & PTE_MARKER_GUARD; } /** * softleaf_is_uffd_wp_marker() - Is this leaf entry a userfaultfd write protect * marker? * @entry: Leaf entry. * * Userfaultfd-specific. * * Returns: true if the leaf entry is a UFFD WP marker, otherwise false. */ static inline bool softleaf_is_uffd_wp_marker(softleaf_t entry) { if (!softleaf_is_marker(entry)) return false; return softleaf_to_marker(entry) & PTE_MARKER_UFFD_WP; } #ifdef CONFIG_MIGRATION /** * softleaf_is_migration_young() - Does this migration entry contain an accessed * bit? * @entry: Leaf entry. * * If the architecture can support storing A/D bits in migration entries, this * determines whether the accessed (or 'young') bit was set on the migrated page * table entry. * * Returns: true if the entry contains an accessed bit, otherwise false. */ static inline bool softleaf_is_migration_young(softleaf_t entry) { VM_WARN_ON_ONCE(!softleaf_is_migration(entry)); if (migration_entry_supports_ad()) return swp_offset(entry) & SWP_MIG_YOUNG; /* Keep the old behavior of aging page after migration */ return false; } /** * softleaf_is_migration_dirty() - Does this migration entry contain a dirty bit? * @entry: Leaf entry. * * If the architecture can support storing A/D bits in migration entries, this * determines whether the dirty bit was set on the migrated page table entry. * * Returns: true if the entry contains a dirty bit, otherwise false. */ static inline bool softleaf_is_migration_dirty(softleaf_t entry) { VM_WARN_ON_ONCE(!softleaf_is_migration(entry)); if (migration_entry_supports_ad()) return swp_offset(entry) & SWP_MIG_DIRTY; /* Keep the old behavior of clean page after migration */ return false; } #else /* CONFIG_MIGRATION */ static inline bool softleaf_is_migration_young(softleaf_t entry) { return false; } static inline bool softleaf_is_migration_dirty(softleaf_t entry) { return false; } #endif /* CONFIG_MIGRATION */ /** * pte_is_marker() - Does the PTE entry encode a marker leaf entry? * @pte: PTE entry. * * Returns: true if this PTE is a marker leaf entry, otherwise false. */ static inline bool pte_is_marker(pte_t pte) { return softleaf_is_marker(softleaf_from_pte(pte)); } /** * pte_is_uffd_wp_marker() - Does this PTE entry encode a userfaultfd write * protect marker leaf entry? * @pte: PTE entry. * * Returns: true if this PTE is a UFFD WP marker leaf entry, otherwise false. */ static inline bool pte_is_uffd_wp_marker(pte_t pte) { const softleaf_t entry = softleaf_from_pte(pte); return softleaf_is_uffd_wp_marker(entry); } /** * pte_is_uffd_marker() - Does this PTE entry encode a userfault-specific marker * leaf entry? * @pte: PTE entry. * * It's useful to be able to determine which leaf entries encode UFFD-specific * markers so we can handle these correctly. * * Returns: true if this PTE entry is a UFFD-specific marker, otherwise false. */ static inline bool pte_is_uffd_marker(pte_t pte) { const softleaf_t entry = softleaf_from_pte(pte); if (!softleaf_is_marker(entry)) return false; /* UFFD WP, poisoned swap entries are UFFD-handled. */ if (softleaf_is_uffd_wp_marker(entry)) return true; if (softleaf_is_poison_marker(entry)) return true; return false; } #if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION) /** * pmd_is_device_private_entry() - Check if PMD contains a device private swap * entry. * @pmd: The PMD to check. 
* * Returns true if the PMD contains a swap entry that represents a device private * page mapping. This is used for zone device private pages that have been * swapped out but still need special handling during various memory management * operations. * * Return: true if PMD contains device private entry, false otherwise */ static inline bool pmd_is_device_private_entry(pmd_t pmd) { return softleaf_is_device_private(softleaf_from_pmd(pmd)); } #else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ static inline bool pmd_is_device_private_entry(pmd_t pmd) { return false; } #endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ /** * pmd_is_migration_entry() - Does this PMD entry encode a migration entry? * @pmd: PMD entry. * * Returns: true if the PMD encodes a migration entry, otherwise false. */ static inline bool pmd_is_migration_entry(pmd_t pmd) { return softleaf_is_migration(softleaf_from_pmd(pmd)); } /** * softleaf_is_valid_pmd_entry() - Is the specified softleaf entry obtained from * a PMD one that we support at PMD level? * @entry: Entry to check. * Returns: true if the softleaf entry is valid at PMD, otherwise false. */ static inline bool softleaf_is_valid_pmd_entry(softleaf_t entry) { /* Only device private, migration entries valid for PMD. */ return softleaf_is_device_private(entry) || softleaf_is_migration(entry); } /** * pmd_is_valid_softleaf() - Is this PMD entry a valid softleaf entry? * @pmd: PMD entry. * * PMD leaf entries are valid only if they are device private or migration * entries. This function asserts that a PMD leaf entry is valid in this * respect. * * Returns: true if the PMD entry is a valid leaf entry, otherwise false. */ static inline bool pmd_is_valid_softleaf(pmd_t pmd) { const softleaf_t entry = softleaf_from_pmd(pmd); return softleaf_is_valid_pmd_entry(entry); } /** * pmd_to_softleaf_folio() - Convert the PMD entry to a folio. * @pmd: PMD entry. * * The PMD entry is expected to be a valid PMD softleaf entry. * * Returns: the folio the softleaf entry references if this is a valid softleaf * entry, otherwise NULL. */ static inline struct folio *pmd_to_softleaf_folio(pmd_t pmd) { const softleaf_t entry = softleaf_from_pmd(pmd); if (!softleaf_is_valid_pmd_entry(entry)) { VM_WARN_ON_ONCE(true); return NULL; } return softleaf_to_folio(entry); } #endif /* CONFIG_MMU */ #endif /* _LINUX_LEAFOPS_H */ |
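/*
 * Editorial example -- not part of leafops.h. A sketch of how a page table
 * walker might decode a non-present PTE using the helpers above, assuming the
 * usual printk headers are available. The function name and the log messages
 * are invented for illustration only.
 */
static void report_nonpresent_pte(pte_t pte)
{
	const softleaf_t entry = softleaf_from_pte(pte);

	switch (softleaf_type(entry)) {
	case SOFTLEAF_NONE:
		/* Present or empty PTE: no software-defined metadata. */
		break;
	case SOFTLEAF_SWAP:
		pr_info("swapped out, raw entry %lx\n", entry.val);
		break;
	case SOFTLEAF_MIGRATION_READ:
	case SOFTLEAF_MIGRATION_READ_EXCLUSIVE:
	case SOFTLEAF_MIGRATION_WRITE:
		/* Migration entries always encode a PFN. */
		pr_info("under migration, pfn %lx\n", softleaf_to_pfn(entry));
		break;
	case SOFTLEAF_MARKER:
		if (softleaf_is_uffd_wp_marker(entry))
			pr_info("uffd write-protect marker\n");
		else if (softleaf_is_guard_marker(entry))
			pr_info("guard region marker\n");
		break;
	default:
		pr_info("other leaf entry type %d\n", softleaf_type(entry));
		break;
	}
}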
// SPDX-License-Identifier: GPL-2.0 /* * of.c The helpers for hcd device tree support * * Copyright (C) 2016 Freescale Semiconductor, Inc. * Author: Peter Chen <peter.chen@freescale.com> * Copyright (C) 2017 Johan Hovold <johan@kernel.org> */ #include <linux/of.h> #include <linux/of_graph.h> #include <linux/usb/of.h> /** * usb_of_get_device_node() - get a USB device node * @hub: hub to which device is connected * @port1: one-based index of port * * Look up the node of a USB device given its parent hub device and one-based * port number. * * Return: A pointer to the node with incremented refcount if found, or * %NULL otherwise. */ struct device_node *usb_of_get_device_node(struct usb_device *hub, int port1) { struct device_node *node; u32 reg; for_each_child_of_node(hub->dev.of_node, node) { if (of_property_read_u32(node, "reg", &reg)) continue; if (reg == port1) return node; } return NULL; } EXPORT_SYMBOL_GPL(usb_of_get_device_node); /** * usb_of_has_combined_node() - determine whether a device has a combined node * @udev: USB device * * Determine whether a USB device has a so called combined node which is * shared with its sole interface. This is the case if and only if the device * has a node and its descriptors report the following: * * 1) bDeviceClass is 0 or 9, and * 2) bNumConfigurations is 1, and * 3) bNumInterfaces is 1. * * Return: True iff the device has a device node and its descriptors match the * criteria for a combined node. */ bool usb_of_has_combined_node(struct usb_device *udev) { struct usb_device_descriptor *ddesc = &udev->descriptor; struct usb_config_descriptor *cdesc; if (!udev->dev.of_node) return false; switch (ddesc->bDeviceClass) { case USB_CLASS_PER_INTERFACE: case USB_CLASS_HUB: if (ddesc->bNumConfigurations == 1) { cdesc = &udev->config->desc; if (cdesc->bNumInterfaces == 1) return true; } } return false; } EXPORT_SYMBOL_GPL(usb_of_has_combined_node); static bool usb_of_has_devices_or_graph(const struct usb_device *hub) { const struct device_node *np = hub->dev.of_node; if (of_graph_is_present(np)) return true; for_each_child_of_node_scoped(np, child) if (of_property_present(child, "reg")) return true; return false; } /** * usb_of_get_connect_type() - get a USB hub's port connect_type * @hub: hub to which @port1 belongs * @port1: one-based index of port * * Get the connect_type of @port1 based on the device node for @hub. If the * port is described in the OF graph, the connect_type is "hotplug". If the * @hub has a child node with a 'reg' property equal to @port1 the * connect_type is "hard-wired". If there isn't an OF graph or child node at * all then the connect_type is "unknown". Otherwise, the port is considered * "unused" because it isn't described at all. * * Return: A connect_type for @port1 based on the device node for @hub. 
*/ enum usb_port_connect_type usb_of_get_connect_type(struct usb_device *hub, int port1) { struct device_node *np, *child, *ep, *remote_np; enum usb_port_connect_type connect_type; /* Only set connect_type if binding has ports/hardwired devices. */ if (!usb_of_has_devices_or_graph(hub)) return USB_PORT_CONNECT_TYPE_UNKNOWN; /* Assume port is unused if there's a graph or a child node. */ connect_type = USB_PORT_NOT_USED; np = hub->dev.of_node; /* * Hotplug ports are connected to an available remote node, e.g. * usb-a-connector compatible node, in the OF graph. */ if (of_graph_is_present(np)) { ep = of_graph_get_endpoint_by_regs(np, port1, -1); if (ep) { remote_np = of_graph_get_remote_port_parent(ep); of_node_put(ep); if (of_device_is_available(remote_np)) connect_type = USB_PORT_CONNECT_TYPE_HOT_PLUG; of_node_put(remote_np); } } /* * Hard-wired ports are child nodes with a reg property corresponding * to the port number, i.e. a usb device. */ child = usb_of_get_device_node(hub, port1); if (of_device_is_available(child)) connect_type = USB_PORT_CONNECT_TYPE_HARD_WIRED; of_node_put(child); return connect_type; } EXPORT_SYMBOL_GPL(usb_of_get_connect_type); /** * usb_of_get_interface_node() - get a USB interface node * @udev: USB device of interface * @config: configuration value * @ifnum: interface number * * Look up the node of a USB interface given its USB device, configuration * value and interface number. * * Return: A pointer to the node with incremented refcount if found, or * %NULL otherwise. */ struct device_node * usb_of_get_interface_node(struct usb_device *udev, u8 config, u8 ifnum) { struct device_node *node; u32 reg[2]; for_each_child_of_node(udev->dev.of_node, node) { if (of_property_read_u32_array(node, "reg", reg, 2)) continue; if (reg[0] == ifnum && reg[1] == config) return node; } return NULL; } EXPORT_SYMBOL_GPL(usb_of_get_interface_node); |
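/*
 * Editorial example -- not part of of.c. A sketch of how a hub driver might
 * combine the helpers above to log how each port is described in the device
 * tree. The function name and the 'nports' argument are invented for this
 * illustration; dev_dbg() and of_node_put() are standard kernel APIs.
 */
static void usb_of_log_ports(struct usb_device *hub, int nports)
{
	int port1;

	for (port1 = 1; port1 <= nports; port1++) {
		struct device_node *node = usb_of_get_device_node(hub, port1);
		enum usb_port_connect_type type =
			usb_of_get_connect_type(hub, port1);

		dev_dbg(&hub->dev, "port%d: of_node %pOF, connect_type %d\n",
			port1, node, type);
		/* usb_of_get_device_node() returned the node with a reference held. */
		of_node_put(node);
	}
}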
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2009 Red Hat, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/mm_types.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/page_owner.h>
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>
#include <linux/compat.h>
#include <linux/pgalloc.h>
#include <linux/pgalloc_tag.h>
#include <linux/pagewalk.h>

#include <asm/tlb.h>

#include "internal.h"
#include "swap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>

/*
 * By default, transparent hugepage support is disabled in order to avoid
 * risking an increased memory footprint for applications that are not
 * guaranteed to benefit from it. When transparent hugepage support is
 * enabled, it is for all mappings, and khugepaged scans all mappings.
 * Defrag is invoked by khugepaged hugepage allocations and by page faults
 * for all hugepage allocations.
 */
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);

static struct shrinker *deferred_split_shrinker;
static unsigned long deferred_split_count(struct shrinker *shrink,
					  struct shrink_control *sc);
static unsigned long deferred_split_scan(struct shrinker *shrink,
					 struct shrink_control *sc);
static bool split_underused_thp = true;

static atomic_t huge_zero_refcount;
struct folio *huge_zero_folio __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
unsigned long huge_anon_orders_always __read_mostly;
unsigned long huge_anon_orders_madvise __read_mostly;
unsigned long huge_anon_orders_inherit __read_mostly;
static bool anon_orders_configured __initdata;

static inline bool file_thp_enabled(struct vm_area_struct *vma)
{
	struct inode *inode;

	if (!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
		return false;

	if (!vma->vm_file)
		return false;

	inode = file_inode(vma->vm_file);

	if (IS_ANON_FILE(inode))
		return false;

	return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
}

/* Returns true if we are unable to access the VMA's folios. */
static bool vma_is_special_huge(const struct vm_area_struct *vma)
{
	if (vma_is_dax(vma))
		return false;

	return vma_test_any(vma, VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT);
}

unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
					 vm_flags_t vm_flags,
					 enum tva_type type,
					 unsigned long orders)
{
	const bool smaps = type == TVA_SMAPS;
	const bool in_pf = type == TVA_PAGEFAULT;
	const bool forced_collapse = type == TVA_FORCED_COLLAPSE;
	unsigned long supported_orders;

	/* Check the intersection of requested and supported orders.
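	 * Orders outside what this VMA type supports are masked off before
	 * any further checks.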
*/ if (vma_is_anonymous(vma)) supported_orders = THP_ORDERS_ALL_ANON; else if (vma_is_dax(vma) || vma_is_special_huge(vma)) supported_orders = THP_ORDERS_ALL_SPECIAL_DAX; else supported_orders = THP_ORDERS_ALL_FILE_DEFAULT; orders &= supported_orders; if (!orders) return 0; if (!vma->vm_mm) /* vdso */ return 0; if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags, forced_collapse)) return 0; /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ if (vma_is_dax(vma)) return in_pf ? orders : 0; /* * khugepaged special VMA and hugetlb VMA. * Must be checked after dax since some dax mappings may have * VM_MIXEDMAP set. */ if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED)) return 0; /* * Check alignment for file vma and size for both file and anon vma by * filtering out the unsuitable orders. * * Skip the check for page fault. Huge fault does the check in fault * handlers. */ if (!in_pf) { int order = highest_order(orders); unsigned long addr; while (orders) { addr = vma->vm_end - (PAGE_SIZE << order); if (thp_vma_suitable_order(vma, addr, order)) break; order = next_order(&orders, order); } if (!orders) return 0; } /* * Enabled via shmem mount options or sysfs settings. * Must be done before hugepage flags check since shmem has its * own flags. */ if (!in_pf && shmem_file(vma->vm_file)) return orders & shmem_allowable_huge_orders(file_inode(vma->vm_file), vma, vma->vm_pgoff, 0, forced_collapse); if (!vma_is_anonymous(vma)) { /* * Enforce THP collapse requirements as necessary. Anonymous vmas * were already handled in thp_vma_allowable_orders(). */ if (!forced_collapse && (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) && !hugepage_global_always()))) return 0; /* * Trust that ->huge_fault() handlers know what they are doing * in fault path. */ if (((in_pf || smaps)) && vma->vm_ops->huge_fault) return orders; /* Only regular file is valid in collapse path */ if (((!in_pf || smaps)) && file_thp_enabled(vma)) return orders; return 0; } if (vma_is_temporary_stack(vma)) return 0; /* * THPeligible bit of smaps should show 1 for proper VMAs even * though anon_vma is not initialized yet. * * Allow page fault since anon_vma may be not initialized until * the first page fault. */ if (!vma->anon_vma) return (smaps || in_pf) ? orders : 0; return orders; } static bool get_huge_zero_folio(void) { struct folio *zero_folio; retry: if (likely(atomic_inc_not_zero(&huge_zero_refcount))) return true; zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO | __GFP_ZEROTAGS) & ~__GFP_MOVABLE, HPAGE_PMD_ORDER); if (!zero_folio) { count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); return false; } /* Ensure zero folio won't have large_rmappable flag set. */ folio_clear_large_rmappable(zero_folio); preempt_disable(); if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) { preempt_enable(); folio_put(zero_folio); goto retry; } WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio)); /* We take additional reference here. It will be put back by shrinker */ atomic_set(&huge_zero_refcount, 2); preempt_enable(); count_vm_event(THP_ZERO_PAGE_ALLOC); return true; } static void put_huge_zero_folio(void) { /* * Counter should never go to zero here. Only shrinker can put * last reference. 
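	 * get_huge_zero_folio() set the refcount to 2 on allocation, and the
	 * extra reference is only dropped by the shrinker, so a put here can
	 * never reach zero.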
*/ BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); } struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) { if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) return huge_zero_folio; if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm)) return READ_ONCE(huge_zero_folio); if (!get_huge_zero_folio()) return NULL; if (mm_flags_test_and_set(MMF_HUGE_ZERO_FOLIO, mm)) put_huge_zero_folio(); return READ_ONCE(huge_zero_folio); } void mm_put_huge_zero_folio(struct mm_struct *mm) { if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) return; if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm)) put_huge_zero_folio(); } static unsigned long shrink_huge_zero_folio_count(struct shrinker *shrink, struct shrink_control *sc) { /* we can free zero page only if last reference remains */ return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; } static unsigned long shrink_huge_zero_folio_scan(struct shrinker *shrink, struct shrink_control *sc) { if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { struct folio *zero_folio = xchg(&huge_zero_folio, NULL); BUG_ON(zero_folio == NULL); WRITE_ONCE(huge_zero_pfn, ~0UL); folio_put(zero_folio); return HPAGE_PMD_NR; } return 0; } static struct shrinker *huge_zero_folio_shrinker; #ifdef CONFIG_SYSFS static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { const char *output; if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags)) output = "[always] madvise never"; else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags)) output = "always [madvise] never"; else output = "always madvise [never]"; return sysfs_emit(buf, "%s\n", output); } enum anon_enabled_mode { ANON_ENABLED_ALWAYS = 0, ANON_ENABLED_INHERIT = 1, ANON_ENABLED_MADVISE = 2, ANON_ENABLED_NEVER = 3, }; static const char * const anon_enabled_mode_strings[] = { [ANON_ENABLED_ALWAYS] = "always", [ANON_ENABLED_INHERIT] = "inherit", [ANON_ENABLED_MADVISE] = "madvise", [ANON_ENABLED_NEVER] = "never", }; enum global_enabled_mode { GLOBAL_ENABLED_ALWAYS = 0, GLOBAL_ENABLED_MADVISE = 1, GLOBAL_ENABLED_NEVER = 2, }; static const char * const global_enabled_mode_strings[] = { [GLOBAL_ENABLED_ALWAYS] = "always", [GLOBAL_ENABLED_MADVISE] = "madvise", [GLOBAL_ENABLED_NEVER] = "never", }; static bool set_global_enabled_mode(enum global_enabled_mode mode) { static const unsigned long thp_flags[] = { TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, }; enum global_enabled_mode m; bool changed = false; for (m = 0; m < ARRAY_SIZE(thp_flags); m++) { if (m == mode) changed |= !test_and_set_bit(thp_flags[m], &transparent_hugepage_flags); else changed |= test_and_clear_bit(thp_flags[m], &transparent_hugepage_flags); } return changed; } static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int mode; mode = sysfs_match_string(global_enabled_mode_strings, buf); if (mode < 0) return -EINVAL; if (set_global_enabled_mode(mode)) { int err = start_stop_khugepaged(); if (err) return err; } else { /* * Recalculate watermarks even when the mode didn't * change, as the previous code always called * start_stop_khugepaged() which does this internally. 
*/ set_recommended_min_free_kbytes(); } return count; } static struct kobj_attribute enabled_attr = __ATTR_RW(enabled); ssize_t single_hugepage_flag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf, enum transparent_hugepage_flag flag) { return sysfs_emit(buf, "%d\n", !!test_bit(flag, &transparent_hugepage_flags)); } ssize_t single_hugepage_flag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count, enum transparent_hugepage_flag flag) { unsigned long value; int ret; ret = kstrtoul(buf, 10, &value); if (ret < 0) return ret; if (value > 1) return -EINVAL; if (value) set_bit(flag, &transparent_hugepage_flags); else clear_bit(flag, &transparent_hugepage_flags); return count; } static ssize_t defrag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { const char *output; if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) output = "[always] defer defer+madvise madvise never"; else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) output = "always [defer] defer+madvise madvise never"; else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) output = "always defer [defer+madvise] madvise never"; else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) output = "always defer defer+madvise [madvise] never"; else output = "always defer defer+madvise madvise [never]"; return sysfs_emit(buf, "%s\n", output); } static ssize_t defrag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { if (sysfs_streq(buf, "always")) { clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); } else if (sysfs_streq(buf, "defer+madvise")) { clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); } else if (sysfs_streq(buf, "defer")) { clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); } else if (sysfs_streq(buf, "madvise")) { clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); } else if (sysfs_streq(buf, "never")) { clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); } else return -EINVAL; return count; } static 
struct kobj_attribute defrag_attr = __ATTR_RW(defrag); static ssize_t use_zero_page_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return single_hugepage_flag_show(kobj, attr, buf, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); } static ssize_t use_zero_page_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { return single_hugepage_flag_store(kobj, attr, buf, count, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); } static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page); static ssize_t hpage_pmd_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE); } static struct kobj_attribute hpage_pmd_size_attr = __ATTR_RO(hpage_pmd_size); static ssize_t split_underused_thp_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", split_underused_thp); } static ssize_t split_underused_thp_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err = kstrtobool(buf, &split_underused_thp); if (err < 0) return err; return count; } static struct kobj_attribute split_underused_thp_attr = __ATTR( shrink_underused, 0644, split_underused_thp_show, split_underused_thp_store); static struct attribute *hugepage_attr[] = { &enabled_attr.attr, &defrag_attr.attr, &use_zero_page_attr.attr, &hpage_pmd_size_attr.attr, #ifdef CONFIG_SHMEM &shmem_enabled_attr.attr, #endif &split_underused_thp_attr.attr, NULL, }; static const struct attribute_group hugepage_attr_group = { .attrs = hugepage_attr, }; static void hugepage_exit_sysfs(struct kobject *hugepage_kobj); static void thpsize_release(struct kobject *kobj); static DEFINE_SPINLOCK(huge_anon_orders_lock); static LIST_HEAD(thpsize_list); static ssize_t anon_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { int order = to_thpsize(kobj)->order; const char *output; if (test_bit(order, &huge_anon_orders_always)) output = "[always] inherit madvise never"; else if (test_bit(order, &huge_anon_orders_inherit)) output = "always [inherit] madvise never"; else if (test_bit(order, &huge_anon_orders_madvise)) output = "always inherit [madvise] never"; else output = "always inherit madvise [never]"; return sysfs_emit(buf, "%s\n", output); } static bool set_anon_enabled_mode(int order, enum anon_enabled_mode mode) { static unsigned long *enabled_orders[] = { &huge_anon_orders_always, &huge_anon_orders_inherit, &huge_anon_orders_madvise, }; enum anon_enabled_mode m; bool changed = false; spin_lock(&huge_anon_orders_lock); for (m = 0; m < ARRAY_SIZE(enabled_orders); m++) { if (m == mode) changed |= !__test_and_set_bit(order, enabled_orders[m]); else changed |= __test_and_clear_bit(order, enabled_orders[m]); } spin_unlock(&huge_anon_orders_lock); return changed; } static ssize_t anon_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int order = to_thpsize(kobj)->order; int mode; mode = sysfs_match_string(anon_enabled_mode_strings, buf); if (mode < 0) return -EINVAL; if (set_anon_enabled_mode(order, mode)) { int err = start_stop_khugepaged(); if (err) return err; } else { /* * Recalculate watermarks even when the mode didn't * change, as the previous code always called * start_stop_khugepaged() which does this internally. 
*/ set_recommended_min_free_kbytes(); } return count; } static struct kobj_attribute anon_enabled_attr = __ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store); static struct attribute *anon_ctrl_attrs[] = { &anon_enabled_attr.attr, NULL, }; static const struct attribute_group anon_ctrl_attr_grp = { .attrs = anon_ctrl_attrs, }; static struct attribute *file_ctrl_attrs[] = { #ifdef CONFIG_SHMEM &thpsize_shmem_enabled_attr.attr, #endif NULL, }; static const struct attribute_group file_ctrl_attr_grp = { .attrs = file_ctrl_attrs, }; static struct attribute *any_ctrl_attrs[] = { NULL, }; static const struct attribute_group any_ctrl_attr_grp = { .attrs = any_ctrl_attrs, }; static const struct kobj_type thpsize_ktype = { .release = &thpsize_release, .sysfs_ops = &kobj_sysfs_ops, }; DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}}; static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item) { unsigned long sum = 0; int cpu; for_each_possible_cpu(cpu) { struct mthp_stat *this = &per_cpu(mthp_stats, cpu); sum += this->stats[order][item]; } return sum; } #define DEFINE_MTHP_STAT_ATTR(_name, _index) \ static ssize_t _name##_show(struct kobject *kobj, \ struct kobj_attribute *attr, char *buf) \ { \ int order = to_thpsize(kobj)->order; \ \ return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \ } \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT); DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN); DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK); DEFINE_MTHP_STAT_ATTR(swpin_fallback_charge, MTHP_STAT_SWPIN_FALLBACK_CHARGE); DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); #ifdef CONFIG_SHMEM DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_SHMEM_ALLOC); DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STAT_SHMEM_FALLBACK); DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE); #endif DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT); DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED); DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED); DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON); DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED); static struct attribute *anon_stats_attrs[] = { &anon_fault_alloc_attr.attr, &anon_fault_fallback_attr.attr, &anon_fault_fallback_charge_attr.attr, #ifndef CONFIG_SHMEM &zswpout_attr.attr, &swpin_attr.attr, &swpin_fallback_attr.attr, &swpin_fallback_charge_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, #endif &split_deferred_attr.attr, &nr_anon_attr.attr, &nr_anon_partially_mapped_attr.attr, NULL, }; static struct attribute_group anon_stats_attr_grp = { .name = "stats", .attrs = anon_stats_attrs, }; static struct attribute *file_stats_attrs[] = { #ifdef CONFIG_SHMEM &shmem_alloc_attr.attr, &shmem_fallback_attr.attr, &shmem_fallback_charge_attr.attr, #endif NULL, }; static struct attribute_group file_stats_attr_grp = { .name = "stats", .attrs = file_stats_attrs, }; static struct attribute *any_stats_attrs[] = { #ifdef CONFIG_SHMEM &zswpout_attr.attr, &swpin_attr.attr, &swpin_fallback_attr.attr, &swpin_fallback_charge_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, #endif &split_attr.attr, 
&split_failed_attr.attr, NULL, }; static struct attribute_group any_stats_attr_grp = { .name = "stats", .attrs = any_stats_attrs, }; static int sysfs_add_group(struct kobject *kobj, const struct attribute_group *grp) { int ret = -ENOENT; /* * If the group is named, try to merge first, assuming the subdirectory * was already created. This avoids the warning emitted by * sysfs_create_group() if the directory already exists. */ if (grp->name) ret = sysfs_merge_group(kobj, grp); if (ret) ret = sysfs_create_group(kobj, grp); return ret; } static struct thpsize *thpsize_create(int order, struct kobject *parent) { unsigned long size = (PAGE_SIZE << order) / SZ_1K; struct thpsize *thpsize; int ret = -ENOMEM; thpsize = kzalloc_obj(*thpsize); if (!thpsize) goto err; thpsize->order = order; ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent, "hugepages-%lukB", size); if (ret) { kfree(thpsize); goto err; } ret = sysfs_add_group(&thpsize->kobj, &any_ctrl_attr_grp); if (ret) goto err_put; ret = sysfs_add_group(&thpsize->kobj, &any_stats_attr_grp); if (ret) goto err_put; if (BIT(order) & THP_ORDERS_ALL_ANON) { ret = sysfs_add_group(&thpsize->kobj, &anon_ctrl_attr_grp); if (ret) goto err_put; ret = sysfs_add_group(&thpsize->kobj, &anon_stats_attr_grp); if (ret) goto err_put; } if (BIT(order) & THP_ORDERS_ALL_FILE_DEFAULT) { ret = sysfs_add_group(&thpsize->kobj, &file_ctrl_attr_grp); if (ret) goto err_put; ret = sysfs_add_group(&thpsize->kobj, &file_stats_attr_grp); if (ret) goto err_put; } return thpsize; err_put: kobject_put(&thpsize->kobj); err: return ERR_PTR(ret); } static void thpsize_release(struct kobject *kobj) { kfree(to_thpsize(kobj)); } static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) { int err; struct thpsize *thpsize; unsigned long orders; int order; /* * Default to setting PMD-sized THP to inherit the global setting and * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time * constant so we have to do this here. 
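	 * Skip this if the administrator already chose the orders via the
	 * "thp_anon=" boot parameter.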
*/ if (!anon_orders_configured) huge_anon_orders_inherit = BIT(PMD_ORDER); *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); if (unlikely(!*hugepage_kobj)) { pr_err("failed to create transparent hugepage kobject\n"); return -ENOMEM; } err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); if (err) { pr_err("failed to register transparent hugepage group\n"); goto delete_obj; } err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); if (err) { pr_err("failed to register transparent hugepage group\n"); goto remove_hp_group; } orders = THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DEFAULT; order = highest_order(orders); while (orders) { thpsize = thpsize_create(order, *hugepage_kobj); if (IS_ERR(thpsize)) { pr_err("failed to create thpsize for order %d\n", order); err = PTR_ERR(thpsize); goto remove_all; } list_add(&thpsize->node, &thpsize_list); order = next_order(&orders, order); } return 0; remove_all: hugepage_exit_sysfs(*hugepage_kobj); return err; remove_hp_group: sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group); delete_obj: kobject_put(*hugepage_kobj); return err; } static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj) { struct thpsize *thpsize, *tmp; list_for_each_entry_safe(thpsize, tmp, &thpsize_list, node) { list_del(&thpsize->node); kobject_put(&thpsize->kobj); } sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group); sysfs_remove_group(hugepage_kobj, &hugepage_attr_group); kobject_put(hugepage_kobj); } #else static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj) { return 0; } static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) { } #endif /* CONFIG_SYSFS */ static int __init thp_shrinker_init(void) { deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE | SHRINKER_NONSLAB, "thp-deferred_split"); if (!deferred_split_shrinker) return -ENOMEM; deferred_split_shrinker->count_objects = deferred_split_count; deferred_split_shrinker->scan_objects = deferred_split_scan; shrinker_register(deferred_split_shrinker); if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) { /* * Bump the reference of the huge_zero_folio and do not * initialize the shrinker. * * huge_zero_folio will always be NULL on failure. We assume * that get_huge_zero_folio() will most likely not fail as * thp_shrinker_init() is invoked early on during boot. 
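		 * Failure here is not fatal: callers of
		 * mm_get_huge_zero_folio() already cope with a NULL
		 * huge_zero_folio and fall back.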
*/ if (!get_huge_zero_folio()) pr_warn("Allocating persistent huge zero folio failed\n"); return 0; } huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero"); if (!huge_zero_folio_shrinker) { shrinker_free(deferred_split_shrinker); return -ENOMEM; } huge_zero_folio_shrinker->count_objects = shrink_huge_zero_folio_count; huge_zero_folio_shrinker->scan_objects = shrink_huge_zero_folio_scan; shrinker_register(huge_zero_folio_shrinker); return 0; } static void __init thp_shrinker_exit(void) { shrinker_free(huge_zero_folio_shrinker); shrinker_free(deferred_split_shrinker); } static int __init hugepage_init(void) { int err; struct kobject *hugepage_kobj; if (!has_transparent_hugepage()) { transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED; return -EINVAL; } /* * hugepages can't be allocated by the buddy allocator */ MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER); err = hugepage_init_sysfs(&hugepage_kobj); if (err) goto err_sysfs; err = khugepaged_init(); if (err) goto err_slab; err = thp_shrinker_init(); if (err) goto err_shrinker; /* * By default disable transparent hugepages on smaller systems, * where the extra memory used could hurt more than TLB overhead * is likely to save. The admin can still enable it through /sys. */ if (totalram_pages() < MB_TO_PAGES(512)) { transparent_hugepage_flags = 0; return 0; } err = start_stop_khugepaged(); if (err) goto err_khugepaged; return 0; err_khugepaged: thp_shrinker_exit(); err_shrinker: khugepaged_destroy(); err_slab: hugepage_exit_sysfs(hugepage_kobj); err_sysfs: return err; } subsys_initcall(hugepage_init); static int __init setup_transparent_hugepage(char *str) { int ret = 0; if (!str) goto out; if (!strcmp(str, "always")) { set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); ret = 1; } else if (!strcmp(str, "madvise")) { clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); ret = 1; } else if (!strcmp(str, "never")) { clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); ret = 1; } out: if (!ret) pr_warn("transparent_hugepage= cannot parse, ignored\n"); return ret; } __setup("transparent_hugepage=", setup_transparent_hugepage); static char str_dup[PAGE_SIZE] __initdata; static int __init setup_thp_anon(char *str) { char *token, *range, *policy, *subtoken; unsigned long always, inherit, madvise; char *start_size, *end_size; int start, end, nr; char *p; if (!str || strlen(str) + 1 > PAGE_SIZE) goto err; strscpy(str_dup, str); always = huge_anon_orders_always; madvise = huge_anon_orders_madvise; inherit = huge_anon_orders_inherit; p = str_dup; while ((token = strsep(&p, ";")) != NULL) { range = strsep(&token, ":"); policy = token; if (!policy) goto err; while ((subtoken = strsep(&range, ",")) != NULL) { if (strchr(subtoken, '-')) { start_size = strsep(&subtoken, "-"); end_size = subtoken; start = get_order_from_str(start_size, THP_ORDERS_ALL_ANON); end = get_order_from_str(end_size, THP_ORDERS_ALL_ANON); } else { start_size = end_size = subtoken; start = end = get_order_from_str(subtoken, THP_ORDERS_ALL_ANON); } if (start == -EINVAL) { pr_err("invalid size %s in thp_anon boot parameter\n", start_size); goto err; } if (end == -EINVAL) { pr_err("invalid size %s in thp_anon boot parameter\n", end_size); goto err; } if (start < 0 || end < 0 || start > end) goto err; nr = 
end - start + 1; if (!strcmp(policy, "always")) { bitmap_set(&always, start, nr); bitmap_clear(&inherit, start, nr); bitmap_clear(&madvise, start, nr); } else if (!strcmp(policy, "madvise")) { bitmap_set(&madvise, start, nr); bitmap_clear(&inherit, start, nr); bitmap_clear(&always, start, nr); } else if (!strcmp(policy, "inherit")) { bitmap_set(&inherit, start, nr); bitmap_clear(&madvise, start, nr); bitmap_clear(&always, start, nr); } else if (!strcmp(policy, "never")) { bitmap_clear(&inherit, start, nr); bitmap_clear(&madvise, start, nr); bitmap_clear(&always, start, nr); } else { pr_err("invalid policy %s in thp_anon boot parameter\n", policy); goto err; } } } huge_anon_orders_always = always; huge_anon_orders_madvise = madvise; huge_anon_orders_inherit = inherit; anon_orders_configured = true; return 1; err: pr_warn("thp_anon=%s: error parsing string, ignoring setting\n", str); return 0; } __setup("thp_anon=", setup_thp_anon); pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) pmd = pmd_mkwrite(pmd, vma); return pmd; } static struct deferred_split *split_queue_node(int nid) { struct pglist_data *pgdata = NODE_DATA(nid); return &pgdata->deferred_split_queue; } #ifdef CONFIG_MEMCG static inline struct mem_cgroup *folio_split_queue_memcg(struct folio *folio, struct deferred_split *queue) { if (mem_cgroup_disabled()) return NULL; if (split_queue_node(folio_nid(folio)) == queue) return NULL; return container_of(queue, struct mem_cgroup, deferred_split_queue); } static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg) { return memcg ? &memcg->deferred_split_queue : split_queue_node(nid); } #else static inline struct mem_cgroup *folio_split_queue_memcg(struct folio *folio, struct deferred_split *queue) { return NULL; } static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg) { return split_queue_node(nid); } #endif static struct deferred_split *split_queue_lock(int nid, struct mem_cgroup *memcg) { struct deferred_split *queue; retry: queue = memcg_split_queue(nid, memcg); spin_lock(&queue->split_queue_lock); /* * There is a period between setting memcg to dying and reparenting * deferred split queue, and during this period the THPs in the deferred * split queue will be hidden from the shrinker side. */ if (unlikely(memcg_is_dying(memcg))) { spin_unlock(&queue->split_queue_lock); memcg = parent_mem_cgroup(memcg); goto retry; } return queue; } static struct deferred_split * split_queue_lock_irqsave(int nid, struct mem_cgroup *memcg, unsigned long *flags) { struct deferred_split *queue; retry: queue = memcg_split_queue(nid, memcg); spin_lock_irqsave(&queue->split_queue_lock, *flags); if (unlikely(memcg_is_dying(memcg))) { spin_unlock_irqrestore(&queue->split_queue_lock, *flags); memcg = parent_mem_cgroup(memcg); goto retry; } return queue; } static struct deferred_split *folio_split_queue_lock(struct folio *folio) { struct deferred_split *queue; rcu_read_lock(); queue = split_queue_lock(folio_nid(folio), folio_memcg(folio)); /* * The memcg destruction path is acquiring the split queue lock for * reparenting. Once you have it locked, it's safe to drop the rcu lock. 
*/ rcu_read_unlock(); return queue; } static struct deferred_split * folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags) { struct deferred_split *queue; rcu_read_lock(); queue = split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags); rcu_read_unlock(); return queue; } static inline void split_queue_unlock(struct deferred_split *queue) { spin_unlock(&queue->split_queue_lock); } static inline void split_queue_unlock_irqrestore(struct deferred_split *queue, unsigned long flags) { spin_unlock_irqrestore(&queue->split_queue_lock, flags); } static inline bool is_transparent_hugepage(const struct folio *folio) { if (!folio_test_large(folio)) return false; return is_huge_zero_folio(folio) || folio_test_large_rmappable(folio); } static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, loff_t off, unsigned long flags, unsigned long size, vm_flags_t vm_flags) { loff_t off_end = off + len; loff_t off_align = round_up(off, size); unsigned long len_pad, ret, off_sub; if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) return 0; if (off_end <= off_align || (off_end - off_align) < size) return 0; len_pad = len + size; if (len_pad < len || (off + len_pad) < off) return 0; ret = mm_get_unmapped_area_vmflags(filp, addr, len_pad, off >> PAGE_SHIFT, flags, vm_flags); /* * The failure might be due to length padding. The caller will retry * without the padding. */ if (IS_ERR_VALUE(ret)) return 0; /* * Do not try to align to THP boundary if allocation at the address * hint succeeds. */ if (ret == addr) return addr; off_sub = (off - ret) & (size - 1); if (mm_flags_test(MMF_TOPDOWN, current->mm) && !off_sub) return ret + size; ret += off_sub; return ret; } unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { unsigned long ret; loff_t off = (loff_t)pgoff << PAGE_SHIFT; ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE, vm_flags); if (ret) return ret; return mm_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, vm_flags); } unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0); } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, unsigned long addr) { gfp_t gfp = vma_thp_gfp_mask(vma); const int order = HPAGE_PMD_ORDER; struct folio *folio; folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK); if (unlikely(!folio)) { count_vm_event(THP_FAULT_FALLBACK); count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); return NULL; } VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { folio_put(folio); count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK_CHARGE); count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); return NULL; } folio_throttle_swaprate(folio, gfp); /* * When a folio is not zeroed during allocation (__GFP_ZERO not used) * or user folios require special handling, folio_zero_user() is used to * make sure that the page corresponding to the faulting address will be * hot in the cache after zeroing. 
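	 * That is why callers pass the faulting address, not the PMD-aligned
	 * address, as @addr.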
*/ if (user_alloc_needs_zeroing()) folio_zero_user(folio, addr); /* * The memory barrier inside __folio_mark_uptodate makes sure that * folio_zero_user writes become visible before the set_pmd_at() * write. */ __folio_mark_uptodate(folio); return folio; } void map_anon_folio_pmd_nopf(struct folio *folio, pmd_t *pmd, struct vm_area_struct *vma, unsigned long haddr) { pmd_t entry; entry = folio_mk_pmd(folio, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); set_pmd_at(vma->vm_mm, haddr, pmd, entry); update_mmu_cache_pmd(vma, haddr, pmd); deferred_split_folio(folio, false); } static void map_anon_folio_pmd_pf(struct folio *folio, pmd_t *pmd, struct vm_area_struct *vma, unsigned long haddr) { map_anon_folio_pmd_nopf(folio, pmd, vma, haddr); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); count_vm_event(THP_FAULT_ALLOC); count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); } static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) { unsigned long haddr = vmf->address & HPAGE_PMD_MASK; struct vm_area_struct *vma = vmf->vma; struct folio *folio; pgtable_t pgtable; vm_fault_t ret = 0; folio = vma_alloc_anon_folio_pmd(vma, vmf->address); if (unlikely(!folio)) return VM_FAULT_FALLBACK; pgtable = pte_alloc_one(vma->vm_mm); if (unlikely(!pgtable)) { ret = VM_FAULT_OOM; goto release; } vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_none(*vmf->pmd))) { goto unlock_release; } else { ret = check_stable_address_space(vma->vm_mm); if (ret) goto unlock_release; /* Deliver the page fault to userland */ if (userfaultfd_missing(vma)) { spin_unlock(vmf->ptl); folio_put(folio); pte_free(vma->vm_mm, pgtable); ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); return ret; } pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); map_anon_folio_pmd_pf(folio, vmf->pmd, vma, haddr); mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); } return 0; unlock_release: spin_unlock(vmf->ptl); release: if (pgtable) pte_free(vma->vm_mm, pgtable); folio_put(folio); return ret; } vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = 0; spinlock_t *ptl; softleaf_t entry; struct page *page; struct folio *folio; if (vmf->flags & FAULT_FLAG_VMA_LOCK) { vma_end_read(vma); return VM_FAULT_RETRY; } ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd))) { spin_unlock(ptl); return 0; } entry = softleaf_from_pmd(vmf->orig_pmd); page = softleaf_to_page(entry); folio = page_folio(page); vmf->page = page; vmf->pte = NULL; if (folio_trylock(folio)) { folio_get(folio); spin_unlock(ptl); ret = page_pgmap(page)->ops->migrate_to_ram(vmf); folio_unlock(folio); folio_put(folio); } else { spin_unlock(ptl); } return ret; } /* * always: directly stall for all thp allocations * defer: wake kswapd and fail if not immediately available * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise * fail if not immediately available * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately * available * never: never stall for any thp allocation */ gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma) { const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE); /* Always do synchronous compaction */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) return 
GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); /* Kick kcompactd and fail quickly */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; /* Synchronous compaction if madvised, otherwise kick kcompactd */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : __GFP_KSWAPD_RECLAIM); /* Only do synchronous compaction if madvised */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); return GFP_TRANSHUGE_LIGHT; } /* Caller must hold page table lock. */ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, struct folio *zero_folio) { pmd_t entry; entry = folio_mk_pmd(zero_folio, vma->vm_page_prot); entry = pmd_mkspecial(entry); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); mm_inc_nr_ptes(mm); } vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; vm_fault_t ret; if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return VM_FAULT_FALLBACK; ret = vmf_anon_prepare(vmf); if (ret) return ret; khugepaged_enter_vma(vma, vma->vm_flags); if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && transparent_hugepage_use_zero_page()) { pgtable_t pgtable; struct folio *zero_folio; vm_fault_t ret; pgtable = pte_alloc_one(vma->vm_mm); if (unlikely(!pgtable)) return VM_FAULT_OOM; zero_folio = mm_get_huge_zero_folio(vma->vm_mm); if (unlikely(!zero_folio)) { pte_free(vma->vm_mm, pgtable); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); ret = 0; if (pmd_none(*vmf->pmd)) { ret = check_stable_address_space(vma->vm_mm); if (ret) { spin_unlock(vmf->ptl); pte_free(vma->vm_mm, pgtable); } else if (userfaultfd_missing(vma)) { spin_unlock(vmf->ptl); pte_free(vma->vm_mm, pgtable); ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); } else { set_huge_zero_folio(pgtable, vma->vm_mm, vma, haddr, vmf->pmd, zero_folio); update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); spin_unlock(vmf->ptl); } } else { spin_unlock(vmf->ptl); pte_free(vma->vm_mm, pgtable); } return ret; } return __do_huge_pmd_anonymous_page(vmf); } struct folio_or_pfn { union { struct folio *folio; unsigned long pfn; }; bool is_folio; }; static vm_fault_t insert_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, struct folio_or_pfn fop, pgprot_t prot, bool write) { struct mm_struct *mm = vma->vm_mm; pgtable_t pgtable = NULL; spinlock_t *ptl; pmd_t entry; if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; if (arch_needs_pgtable_deposit()) { pgtable = pte_alloc_one(vma->vm_mm); if (!pgtable) return VM_FAULT_OOM; } ptl = pmd_lock(mm, pmd); if (!pmd_none(*pmd)) { const unsigned long pfn = fop.is_folio ? 
folio_pfn(fop.folio) : fop.pfn; if (write) { if (pmd_pfn(*pmd) != pfn) { WARN_ON_ONCE(!is_huge_zero_pmd(*pmd)); goto out_unlock; } entry = pmd_mkyoung(*pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); if (pmdp_set_access_flags(vma, addr, pmd, entry, 1)) update_mmu_cache_pmd(vma, addr, pmd); } goto out_unlock; } if (fop.is_folio) { entry = folio_mk_pmd(fop.folio, vma->vm_page_prot); if (is_huge_zero_folio(fop.folio)) { entry = pmd_mkspecial(entry); } else { folio_get(fop.folio); folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma); add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR); } } else { entry = pmd_mkhuge(pfn_pmd(fop.pfn, prot)); entry = pmd_mkspecial(entry); } if (write) { entry = pmd_mkyoung(pmd_mkdirty(entry)); entry = maybe_pmd_mkwrite(entry, vma); } if (pgtable) { pgtable_trans_huge_deposit(mm, pmd, pgtable); mm_inc_nr_ptes(mm); pgtable = NULL; } set_pmd_at(mm, addr, pmd, entry); update_mmu_cache_pmd(vma, addr, pmd); out_unlock: spin_unlock(ptl); if (pgtable) pte_free(mm, pgtable); return VM_FAULT_NOPAGE; } /** * vmf_insert_pfn_pmd - insert a pmd size pfn * @vmf: Structure describing the fault * @pfn: pfn to insert * @write: whether it's a write fault * * Insert a pmd size pfn. See vmf_insert_pfn() for additional info. * * Return: vm_fault_t value. */ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn, bool write) { unsigned long addr = vmf->address & PMD_MASK; struct vm_area_struct *vma = vmf->vma; pgprot_t pgprot = vma->vm_page_prot; struct folio_or_pfn fop = { .pfn = pfn, }; /* * If we had pmd_special, we could avoid all these restrictions, * but we need to be consistent with PTEs and architectures that * can't support a 'special' bit. */ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); pfnmap_setup_cachemode_pfn(pfn, &pgprot); return insert_pmd(vma, addr, vmf->pmd, fop, pgprot, write); } EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio, bool write) { struct vm_area_struct *vma = vmf->vma; unsigned long addr = vmf->address & PMD_MASK; struct folio_or_pfn fop = { .folio = folio, .is_folio = true, }; if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER)) return VM_FAULT_SIGBUS; return insert_pmd(vma, addr, vmf->pmd, fop, vma->vm_page_prot, write); } EXPORT_SYMBOL_GPL(vmf_insert_folio_pmd); #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) pud = pud_mkwrite(pud); return pud; } static vm_fault_t insert_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, struct folio_or_pfn fop, pgprot_t prot, bool write) { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; pud_t entry; if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; ptl = pud_lock(mm, pud); if (!pud_none(*pud)) { const unsigned long pfn = fop.is_folio ? 
folio_pfn(fop.folio) : fop.pfn; if (write) { if (WARN_ON_ONCE(pud_pfn(*pud) != pfn)) goto out_unlock; entry = pud_mkyoung(*pud); entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma); if (pudp_set_access_flags(vma, addr, pud, entry, 1)) update_mmu_cache_pud(vma, addr, pud); } goto out_unlock; } if (fop.is_folio) { entry = folio_mk_pud(fop.folio, vma->vm_page_prot); folio_get(fop.folio); folio_add_file_rmap_pud(fop.folio, &fop.folio->page, vma); add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PUD_NR); } else { entry = pud_mkhuge(pfn_pud(fop.pfn, prot)); entry = pud_mkspecial(entry); } if (write) { entry = pud_mkyoung(pud_mkdirty(entry)); entry = maybe_pud_mkwrite(entry, vma); } set_pud_at(mm, addr, pud, entry); update_mmu_cache_pud(vma, addr, pud); out_unlock: spin_unlock(ptl); return VM_FAULT_NOPAGE; } /** * vmf_insert_pfn_pud - insert a pud size pfn * @vmf: Structure describing the fault * @pfn: pfn to insert * @write: whether it's a write fault * * Insert a pud size pfn. See vmf_insert_pfn() for additional info. * * Return: vm_fault_t value. */ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn, bool write) { unsigned long addr = vmf->address & PUD_MASK; struct vm_area_struct *vma = vmf->vma; pgprot_t pgprot = vma->vm_page_prot; struct folio_or_pfn fop = { .pfn = pfn, }; /* * If we had pud_special, we could avoid all these restrictions, * but we need to be consistent with PTEs and architectures that * can't support a 'special' bit. */ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); pfnmap_setup_cachemode_pfn(pfn, &pgprot); return insert_pud(vma, addr, vmf->pud, fop, pgprot, write); } EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud); /** * vmf_insert_folio_pud - insert a pud size folio mapped by a pud entry * @vmf: Structure describing the fault * @folio: folio to insert * @write: whether it's a write fault * * Return: vm_fault_t value. 
*/ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, bool write) { struct vm_area_struct *vma = vmf->vma; unsigned long addr = vmf->address & PUD_MASK; struct folio_or_pfn fop = { .folio = folio, .is_folio = true, }; if (WARN_ON_ONCE(folio_order(folio) != PUD_ORDER)) return VM_FAULT_SIGBUS; return insert_pud(vma, addr, vmf->pud, fop, vma->vm_page_prot, write); } EXPORT_SYMBOL_GPL(vmf_insert_folio_pud); #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ /** * touch_pmd - Mark page table pmd entry as accessed and dirty (for write) * @vma: The VMA covering @addr * @addr: The virtual address * @pmd: pmd pointer into the page table mapping @addr * @write: Whether it's a write access * * Return: whether the pmd entry is changed */ bool touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, bool write) { pmd_t entry; entry = pmd_mkyoung(*pmd); if (write) entry = pmd_mkdirty(entry); if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, pmd, entry, write)) { update_mmu_cache_pmd(vma, addr, pmd); return true; } return false; } static void copy_huge_non_present_pmd( struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pmd_t pmd, pgtable_t pgtable) { softleaf_t entry = softleaf_from_pmd(pmd); struct folio *src_folio; VM_WARN_ON_ONCE(!pmd_is_valid_softleaf(pmd)); if (softleaf_is_migration_write(entry) || softleaf_is_migration_read_exclusive(entry)) { entry = make_readable_migration_entry(swp_offset(entry)); pmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*src_pmd)) pmd = pmd_swp_mksoft_dirty(pmd); if (pmd_swp_uffd_wp(*src_pmd)) pmd = pmd_swp_mkuffd_wp(pmd); set_pmd_at(src_mm, addr, src_pmd, pmd); } else if (softleaf_is_device_private(entry)) { /* * For device private entries, since there are no * read exclusive entries, writable = !readable */ if (softleaf_is_device_private_write(entry)) { entry = make_readable_device_private_entry(swp_offset(entry)); pmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*src_pmd)) pmd = pmd_swp_mksoft_dirty(pmd); if (pmd_swp_uffd_wp(*src_pmd)) pmd = pmd_swp_mkuffd_wp(pmd); set_pmd_at(src_mm, addr, src_pmd, pmd); } src_folio = softleaf_to_folio(entry); VM_WARN_ON(!folio_test_large(src_folio)); folio_get(src_folio); /* * folio_try_dup_anon_rmap_pmd does not fail for * device private entries. */ folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page, dst_vma, src_vma); } add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); if (!userfaultfd_wp(dst_vma)) pmd = pmd_swp_clear_uffd_wp(pmd); set_pmd_at(dst_mm, addr, dst_pmd, pmd); } int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { spinlock_t *dst_ptl, *src_ptl; struct page *src_page; struct folio *src_folio; pmd_t pmd; pgtable_t pgtable = NULL; int ret = -ENOMEM; pmd = pmdp_get_lockless(src_pmd); if (unlikely(pmd_present(pmd) && pmd_special(pmd) && !is_huge_zero_pmd(pmd))) { dst_ptl = pmd_lock(dst_mm, dst_pmd); src_ptl = pmd_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); /* * No need to recheck the pmd, it can't change with write * mmap lock held here. 
* * Meanwhile, making sure it's not a CoW VMA with writable * mapping, otherwise it means either the anon page wrongly * applied special bit, or we made the PRIVATE mapping be * able to wrongly write to the backend MMIO. */ VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd)); goto set_pmd; } /* Skip if can be re-fill on fault */ if (!vma_is_anonymous(dst_vma)) return 0; pgtable = pte_alloc_one(dst_mm); if (unlikely(!pgtable)) goto out; dst_ptl = pmd_lock(dst_mm, dst_pmd); src_ptl = pmd_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); ret = -EAGAIN; pmd = *src_pmd; if (unlikely(thp_migration_supported() && pmd_is_valid_softleaf(pmd))) { copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, dst_vma, src_vma, pmd, pgtable); ret = 0; goto out_unlock; } if (unlikely(!pmd_trans_huge(pmd))) { pte_free(dst_mm, pgtable); goto out_unlock; } /* * When page table lock is held, the huge zero pmd should not be * under splitting since we don't split the page itself, only pmd to * a page table. */ if (is_huge_zero_pmd(pmd)) { /* * mm_get_huge_zero_folio() will never allocate a new * folio here, since we already have a zero page to * copy. It just takes a reference. */ mm_get_huge_zero_folio(dst_mm); goto out_zero_page; } src_page = pmd_page(pmd); VM_BUG_ON_PAGE(!PageHead(src_page), src_page); src_folio = page_folio(src_page); folio_get(src_folio); if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, dst_vma, src_vma))) { /* Page maybe pinned: split and retry the fault on PTEs. */ folio_put(src_folio); pte_free(dst_mm, pgtable); spin_unlock(src_ptl); spin_unlock(dst_ptl); __split_huge_pmd(src_vma, src_pmd, addr, false); return -EAGAIN; } add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); out_zero_page: mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); pmdp_set_wrprotect(src_mm, addr, src_pmd); if (!userfaultfd_wp(dst_vma)) pmd = pmd_clear_uffd_wp(pmd); pmd = pmd_wrprotect(pmd); set_pmd: pmd = pmd_mkold(pmd); set_pmd_at(dst_mm, addr, dst_pmd, pmd); ret = 0; out_unlock: spin_unlock(src_ptl); spin_unlock(dst_ptl); out: return ret; } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD void touch_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, bool write) { pud_t _pud; _pud = pud_mkyoung(*pud); if (write) _pud = pud_mkdirty(_pud); if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK, pud, _pud, write)) update_mmu_cache_pud(vma, addr, pud); } int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, unsigned long addr, struct vm_area_struct *vma) { spinlock_t *dst_ptl, *src_ptl; pud_t pud; int ret; dst_ptl = pud_lock(dst_mm, dst_pud); src_ptl = pud_lockptr(src_mm, src_pud); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); ret = -EAGAIN; pud = *src_pud; if (unlikely(!pud_trans_huge(pud))) goto out_unlock; /* * TODO: once we support anonymous pages, use * folio_try_dup_anon_rmap_*() and split if duplicating fails. 
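	 * For now a CoW mapping is handled by simply write-protecting the PUD
	 * in both the parent and the child.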
*/ if (is_cow_mapping(vma->vm_flags) && pud_write(pud)) { pudp_set_wrprotect(src_mm, addr, src_pud); pud = pud_wrprotect(pud); } pud = pud_mkold(pud); set_pud_at(dst_mm, addr, dst_pud, pud); ret = 0; out_unlock: spin_unlock(src_ptl); spin_unlock(dst_ptl); return ret; } void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) { bool write = vmf->flags & FAULT_FLAG_WRITE; vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud); if (unlikely(!pud_same(*vmf->pud, orig_pud))) goto unlock; touch_pud(vmf->vma, vmf->address, vmf->pud, write); unlock: spin_unlock(vmf->ptl); } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ bool huge_pmd_set_accessed(struct vm_fault *vmf) { bool write = vmf->flags & FAULT_FLAG_WRITE; if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd))) return false; return touch_pmd(vmf->vma, vmf->address, vmf->pmd, write); } static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf) { unsigned long haddr = vmf->address & HPAGE_PMD_MASK; struct vm_area_struct *vma = vmf->vma; struct mmu_notifier_range range; struct folio *folio; vm_fault_t ret = 0; folio = vma_alloc_anon_folio_pmd(vma, vmf->address); if (unlikely(!folio)) return VM_FAULT_FALLBACK; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) goto release; ret = check_stable_address_space(vma->vm_mm); if (ret) goto release; (void)pmdp_huge_clear_flush(vma, haddr, vmf->pmd); map_anon_folio_pmd_pf(folio, vmf->pmd, vma, haddr); goto unlock; release: folio_put(folio); unlock: spin_unlock(vmf->ptl); mmu_notifier_invalidate_range_end(&range); return ret; } vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) { const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; struct folio *folio; struct page *page; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t orig_pmd = vmf->orig_pmd; vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); if (is_huge_zero_pmd(orig_pmd)) { vm_fault_t ret = do_huge_zero_wp_pmd(vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; /* Fallback to splitting PMD if THP cannot be allocated */ goto fallback; } spin_lock(vmf->ptl); if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { spin_unlock(vmf->ptl); return 0; } page = pmd_page(orig_pmd); folio = page_folio(page); VM_BUG_ON_PAGE(!PageHead(page), page); /* Early check when only holding the PT lock. */ if (PageAnonExclusive(page)) goto reuse; if (!folio_trylock(folio)) { folio_get(folio); spin_unlock(vmf->ptl); folio_lock(folio); spin_lock(vmf->ptl); if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { spin_unlock(vmf->ptl); folio_unlock(folio); folio_put(folio); return 0; } folio_put(folio); } /* Recheck after temporarily dropping the PT lock. */ if (PageAnonExclusive(page)) { folio_unlock(folio); goto reuse; } /* * See do_wp_page(): we can only reuse the folio exclusively if * there are no additional references. Note that we always drain * the LRU cache immediately after adding a THP. 
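	 * So, apart from at most one swapcache reference per subpage, a
	 * reference count of one means the folio is exclusively ours and can
	 * be reused.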
*/ if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio) * folio_nr_pages(folio)) goto unlock_fallback; if (folio_test_swapcache(folio)) folio_free_swap(folio); if (folio_ref_count(folio) == 1) { pmd_t entry; folio_move_anon_rmap(folio, vma); SetPageAnonExclusive(page); folio_unlock(folio); reuse: if (unlikely(unshare)) { spin_unlock(vmf->ptl); return 0; } entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); spin_unlock(vmf->ptl); return 0; } unlock_fallback: folio_unlock(folio); spin_unlock(vmf->ptl); fallback: __split_huge_pmd(vma, vmf->pmd, vmf->address, false); return VM_FAULT_FALLBACK; } static inline bool can_change_pmd_writable(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { struct page *page; if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE))) return false; /* Don't touch entries that are not even readable (NUMA hinting). */ if (pmd_protnone(pmd)) return false; /* Do we need write faults for softdirty tracking? */ if (pmd_needs_soft_dirty_wp(vma, pmd)) return false; /* Do we need write faults for uffd-wp tracking? */ if (userfaultfd_huge_pmd_wp(vma, pmd)) return false; if (!(vma->vm_flags & VM_SHARED)) { /* See can_change_pte_writable(). */ page = vm_normal_page_pmd(vma, addr, pmd); return page && PageAnon(page) && PageAnonExclusive(page); } /* See can_change_pte_writable(). */ return pmd_dirty(pmd); } /* NUMA hinting page fault entry point for trans huge pmds */ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; int nid = NUMA_NO_NODE; int target_nid, last_cpupid; pmd_t pmd, old_pmd; bool writable = false; int flags = 0; vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); old_pmd = pmdp_get(vmf->pmd); if (unlikely(!pmd_same(old_pmd, vmf->orig_pmd))) { spin_unlock(vmf->ptl); return 0; } pmd = pmd_modify(old_pmd, vma->vm_page_prot); /* * Detect now whether the PMD could be writable; this information * is only valid while holding the PT lock. */ writable = pmd_write(pmd); if (!writable && vma_wants_manual_pte_write_upgrade(vma) && can_change_pmd_writable(vma, vmf->address, pmd)) writable = true; folio = vm_normal_folio_pmd(vma, haddr, pmd); if (!folio) goto out_map; nid = folio_nid(folio); target_nid = numa_migrate_check(folio, vmf, haddr, &flags, writable, &last_cpupid); if (target_nid == NUMA_NO_NODE) goto out_map; if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) { flags |= TNF_MIGRATE_FAIL; goto out_map; } /* The folio is isolated and isolation code holds a folio reference. */ spin_unlock(vmf->ptl); writable = false; if (!migrate_misplaced_folio(folio, target_nid)) { flags |= TNF_MIGRATED; nid = target_nid; task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags); return 0; } flags |= TNF_MIGRATE_FAIL; vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) { spin_unlock(vmf->ptl); return 0; } out_map: /* Restore the PMD */ pmd = pmd_modify(pmdp_get(vmf->pmd), vma->vm_page_prot); pmd = pmd_mkyoung(pmd); if (writable) pmd = pmd_mkwrite(pmd, vma); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); spin_unlock(vmf->ptl); if (nid != NUMA_NO_NODE) task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags); return 0; } /* * Return true if we do MADV_FREE successfully on entire pmd page. * Otherwise, return false. 
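 *
 * For illustration only (the caller lives in mm/madvise.c and may change):
 * this is reached from the MADV_FREE path roughly as
 *
 *	madvise(start, len, MADV_FREE)
 *	  -> madvise_free_pte_range()
 *	       -> madvise_free_huge_pmd()	when a huge pmd maps the range
 *
 * and a false return tells the caller to fall back to per-PTE processing.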
*/ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long next) { spinlock_t *ptl; pmd_t orig_pmd; struct folio *folio; struct mm_struct *mm = tlb->mm; bool ret = false; tlb_change_page_size(tlb, HPAGE_PMD_SIZE); ptl = pmd_trans_huge_lock(pmd, vma); if (!ptl) goto out_unlocked; orig_pmd = *pmd; if (is_huge_zero_pmd(orig_pmd)) goto out; if (unlikely(!pmd_present(orig_pmd))) { VM_BUG_ON(thp_migration_supported() && !pmd_is_migration_entry(orig_pmd)); goto out; } folio = pmd_folio(orig_pmd); /* * If other processes are mapping this folio, we couldn't discard * the folio unless they all do MADV_FREE so let's skip the folio. */ if (folio_maybe_mapped_shared(folio)) goto out; if (!folio_trylock(folio)) goto out; /* * If user want to discard part-pages of THP, split it so MADV_FREE * will deactivate only them. */ if (next - addr != HPAGE_PMD_SIZE) { folio_get(folio); spin_unlock(ptl); split_folio(folio); folio_unlock(folio); folio_put(folio); goto out_unlocked; } if (folio_test_dirty(folio)) folio_clear_dirty(folio); folio_unlock(folio); if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { pmdp_invalidate(vma, addr, pmd); orig_pmd = pmd_mkold(orig_pmd); orig_pmd = pmd_mkclean(orig_pmd); set_pmd_at(mm, addr, pmd, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } folio_mark_lazyfree(folio); ret = true; out: spin_unlock(ptl); out_unlocked: return ret; } static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) { pgtable_t pgtable; pgtable = pgtable_trans_huge_withdraw(mm, pmd); pte_free(mm, pgtable); mm_dec_nr_ptes(mm); } static void zap_huge_pmd_folio(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t pmdval, struct folio *folio, bool is_present) { const bool is_device_private = folio_is_device_private(folio); /* Present and device private folios are rmappable. */ if (is_present || is_device_private) folio_remove_rmap_pmd(folio, &folio->page, vma); if (folio_test_anon(folio)) { add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR); } else { add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR); if (is_present && pmd_young(pmdval) && likely(vma_has_recency(vma))) folio_mark_accessed(folio); } /* Device private folios are pinned. */ if (is_device_private) folio_put(folio); } static struct folio *normal_or_softleaf_folio_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmdval, bool is_present) { if (is_present) return vm_normal_folio_pmd(vma, addr, pmdval); if (!thp_migration_supported()) WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); return pmd_to_softleaf_folio(pmdval); } static bool has_deposited_pgtable(struct vm_area_struct *vma, pmd_t pmdval, struct folio *folio) { /* Some architectures require unconditional depositing. */ if (arch_needs_pgtable_deposit()) return true; /* * Huge zero always deposited except for DAX which handles itself, see * set_huge_zero_folio(). */ if (is_huge_zero_pmd(pmdval)) return !vma_is_dax(vma); /* * Otherwise, only anonymous folios are deposited, see * __do_huge_pmd_anonymous_page(). */ return folio && folio_test_anon(folio); } /** * zap_huge_pmd - Zap a huge THP which is of PMD size. * @tlb: The MMU gather TLB state associated with the operation. * @vma: The VMA containing the range to zap. * @pmd: A pointer to the leaf PMD entry. * @addr: The virtual address for the range to zap. * * Returns: %true on success, %false otherwise. 
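 *
 * For illustration (the caller lives outside this file and may change): this
 * is typically reached from zap_pmd_range() when tearing down a mapping,
 * e.g. on munmap() or at exit_mmap(); a %false return means the entry was no
 * longer huge by the time the lock was taken, and the regular PTE zap path
 * handles the range instead.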
*/ bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { struct mm_struct *mm = tlb->mm; struct folio *folio = NULL; bool is_present = false; bool has_deposit; spinlock_t *ptl; pmd_t orig_pmd; tlb_change_page_size(tlb, HPAGE_PMD_SIZE); ptl = __pmd_trans_huge_lock(pmd, vma); if (!ptl) return false; /* * For architectures like ppc64 we look at deposited pgtable * when calling pmdp_huge_get_and_clear. So do the * pgtable_trans_huge_withdraw after finishing pmdp related * operations. */ orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd, tlb->fullmm); arch_check_zapped_pmd(vma, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); is_present = pmd_present(orig_pmd); folio = normal_or_softleaf_folio_pmd(vma, addr, orig_pmd, is_present); has_deposit = has_deposited_pgtable(vma, orig_pmd, folio); if (folio) zap_huge_pmd_folio(mm, vma, orig_pmd, folio, is_present); if (has_deposit) zap_deposited_table(mm, pmd); spin_unlock(ptl); if (is_present && folio) tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); return true; } #ifndef pmd_move_must_withdraw static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, spinlock_t *old_pmd_ptl, struct vm_area_struct *vma) { /* * With split pmd lock we also need to move preallocated * PTE page table if new_pmd is on different PMD page table. * * We also don't deposit and withdraw tables for file pages. */ return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma); } #endif static pmd_t move_soft_dirty_pmd(pmd_t pmd) { if (pgtable_supports_soft_dirty()) { if (unlikely(pmd_is_migration_entry(pmd))) pmd = pmd_swp_mksoft_dirty(pmd); else if (pmd_present(pmd)) pmd = pmd_mksoft_dirty(pmd); } return pmd; } static pmd_t clear_uffd_wp_pmd(pmd_t pmd) { if (pmd_none(pmd)) return pmd; if (pmd_present(pmd)) pmd = pmd_clear_uffd_wp(pmd); else pmd = pmd_swp_clear_uffd_wp(pmd); return pmd; } bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd) { spinlock_t *old_ptl, *new_ptl; pmd_t pmd; struct mm_struct *mm = vma->vm_mm; bool force_flush = false; /* * The destination pmd shouldn't be established, free_pgtables() * should have released it; but move_page_tables() might have already * inserted a page table, if racing against shmem/file collapse. */ if (!pmd_none(*new_pmd)) { VM_BUG_ON(pmd_trans_huge(*new_pmd)); return false; } /* * We don't have to worry about the ordering of src and dst * ptlocks because exclusive mmap_lock prevents deadlock. 
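 *
 * The locking sequence below is therefore simply:
 *
 *	old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
 *	if (new_ptl != old_ptl)
 *		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
 *	... clear the old entry, move it (and any deposited page table) ...
 *
 * with both entries held stable while the pmd is transferred.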
*/ old_ptl = __pmd_trans_huge_lock(old_pmd, vma); if (old_ptl) { new_ptl = pmd_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); if (pmd_present(pmd)) force_flush = true; VM_BUG_ON(!pmd_none(*new_pmd)); if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) { pgtable_t pgtable; pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); pgtable_trans_huge_deposit(mm, new_pmd, pgtable); } pmd = move_soft_dirty_pmd(pmd); if (vma_has_uffd_without_event_remap(vma)) pmd = clear_uffd_wp_pmd(pmd); set_pmd_at(mm, new_addr, new_pmd, pmd); if (force_flush) flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE); if (new_ptl != old_ptl) spin_unlock(new_ptl); spin_unlock(old_ptl); return true; } return false; } static void change_non_present_huge_pmd(struct mm_struct *mm, unsigned long addr, pmd_t *pmd, bool uffd_wp, bool uffd_wp_resolve) { softleaf_t entry = softleaf_from_pmd(*pmd); const struct folio *folio = softleaf_to_folio(entry); pmd_t newpmd; VM_WARN_ON(!pmd_is_valid_softleaf(*pmd)); if (softleaf_is_migration_write(entry)) { /* * A protection check is difficult so * just be safe and disable write */ if (folio_test_anon(folio)) entry = make_readable_exclusive_migration_entry(swp_offset(entry)); else entry = make_readable_migration_entry(swp_offset(entry)); newpmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*pmd)) newpmd = pmd_swp_mksoft_dirty(newpmd); } else if (softleaf_is_device_private_write(entry)) { entry = make_readable_device_private_entry(swp_offset(entry)); newpmd = swp_entry_to_pmd(entry); } else { newpmd = *pmd; } if (uffd_wp) newpmd = pmd_swp_mkuffd_wp(newpmd); else if (uffd_wp_resolve) newpmd = pmd_swp_clear_uffd_wp(newpmd); if (!pmd_same(*pmd, newpmd)) set_pmd_at(mm, addr, pmd, newpmd); } /* * Returns * - 0 if PMD could not be locked * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary * or if prot_numa but THP migration is not supported * - HPAGE_PMD_NR if protections changed and TLB flush necessary */ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, unsigned long cp_flags) { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; pmd_t oldpmd, entry; bool prot_numa = cp_flags & MM_CP_PROT_NUMA; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; int ret = 1; tlb_change_page_size(tlb, HPAGE_PMD_SIZE); if (prot_numa && !thp_migration_supported()) return 1; ptl = __pmd_trans_huge_lock(pmd, vma); if (!ptl) return 0; if (thp_migration_supported() && pmd_is_valid_softleaf(*pmd)) { change_non_present_huge_pmd(mm, addr, pmd, uffd_wp, uffd_wp_resolve); goto unlock; } if (prot_numa) { /* * Avoid trapping faults against the zero page. The read-only * data is likely to be read-cached on the local CPU and * local/remote hits to the zero page are not interesting. */ if (is_huge_zero_pmd(*pmd)) goto unlock; if (pmd_protnone(*pmd)) goto unlock; if (!folio_can_map_prot_numa(pmd_folio(*pmd), vma, vma_is_single_threaded_private(vma))) goto unlock; } /* * In case prot_numa, we are under mmap_read_lock(mm). 
It's critical * to not clear pmd intermittently to avoid race with MADV_DONTNEED * which is also under mmap_read_lock(mm): * * CPU0: CPU1: * change_huge_pmd(prot_numa=1) * pmdp_huge_get_and_clear_notify() * madvise_dontneed() * zap_pmd_range() * pmd_trans_huge(*pmd) == 0 (without ptl) * // skip the pmd * set_pmd_at(); * // pmd is re-established * * The race makes MADV_DONTNEED miss the huge pmd and don't clear it * which may break userspace. * * pmdp_invalidate_ad() is required to make sure we don't miss * dirty/young flags set by hardware. */ oldpmd = pmdp_invalidate_ad(vma, addr, pmd); entry = pmd_modify(oldpmd, newprot); if (uffd_wp) entry = pmd_mkuffd_wp(entry); else if (uffd_wp_resolve) /* * Leave the write bit to be handled by PF interrupt * handler, then things like COW could be properly * handled. */ entry = pmd_clear_uffd_wp(entry); /* See change_pte_range(). */ if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) && can_change_pmd_writable(vma, addr, entry)) entry = pmd_mkwrite(entry, vma); ret = HPAGE_PMD_NR; set_pmd_at(mm, addr, pmd, entry); if (huge_pmd_needs_flush(oldpmd, entry)) tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE); unlock: spin_unlock(ptl); return ret; } /* * Returns: * * - 0: if pud leaf changed from under us * - 1: if pud can be skipped * - HPAGE_PUD_NR: if pud was successfully processed */ #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD int change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pudp, unsigned long addr, pgprot_t newprot, unsigned long cp_flags) { struct mm_struct *mm = vma->vm_mm; pud_t oldpud, entry; spinlock_t *ptl; tlb_change_page_size(tlb, HPAGE_PUD_SIZE); /* NUMA balancing doesn't apply to dax */ if (cp_flags & MM_CP_PROT_NUMA) return 1; /* * Huge entries on userfault-wp only works with anonymous, while we * don't have anonymous PUDs yet. */ if (WARN_ON_ONCE(cp_flags & MM_CP_UFFD_WP_ALL)) return 1; ptl = __pud_trans_huge_lock(pudp, vma); if (!ptl) return 0; /* * Can't clear PUD or it can race with concurrent zapping. See * change_huge_pmd(). */ oldpud = pudp_invalidate(vma, addr, pudp); entry = pud_modify(oldpud, newprot); set_pud_at(mm, addr, pudp, entry); tlb_flush_pud_range(tlb, addr, HPAGE_PUD_SIZE); spin_unlock(ptl); return HPAGE_PUD_NR; } #endif #ifdef CONFIG_USERFAULTFD /* * The PT lock for src_pmd and dst_vma/src_vma (for reading) are locked by * the caller, but it must return after releasing the page_table_lock. * Just move the page from src_pmd to dst_pmd if possible. * Return zero if succeeded in moving the page, -EAGAIN if it needs to be * repeated by the caller, or other errors in case of failure. 
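 *
 * As a rough sketch of the expected calling context (the caller lives in
 * mm/userfaultfd.c and may change): the UFFDIO_MOVE path, move_pages(),
 * samples dst_pmdval, takes the VMA locks and the src PT lock, and then
 * calls this function; on -EAGAIN it re-validates the PMDs and retries the
 * move.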
*/ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long dst_addr, unsigned long src_addr) { pmd_t _dst_pmd, src_pmdval; struct page *src_page; struct folio *src_folio; spinlock_t *src_ptl, *dst_ptl; pgtable_t src_pgtable; struct mmu_notifier_range range; int err = 0; src_pmdval = *src_pmd; src_ptl = pmd_lockptr(mm, src_pmd); lockdep_assert_held(src_ptl); vma_assert_locked(src_vma); vma_assert_locked(dst_vma); /* Sanity checks before the operation */ if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) || WARN_ON_ONCE(dst_addr & ~HPAGE_PMD_MASK)) { spin_unlock(src_ptl); return -EINVAL; } if (!pmd_trans_huge(src_pmdval)) { spin_unlock(src_ptl); if (pmd_is_migration_entry(src_pmdval)) { pmd_migration_entry_wait(mm, &src_pmdval); return -EAGAIN; } return -ENOENT; } src_page = pmd_page(src_pmdval); if (!is_huge_zero_pmd(src_pmdval)) { if (unlikely(!PageAnonExclusive(src_page))) { spin_unlock(src_ptl); return -EBUSY; } src_folio = page_folio(src_page); folio_get(src_folio); } else src_folio = NULL; spin_unlock(src_ptl); flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr, src_addr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); if (src_folio) folio_lock(src_folio); dst_ptl = pmd_lockptr(mm, dst_pmd); double_pt_lock(src_ptl, dst_ptl); if (unlikely(!pmd_same(*src_pmd, src_pmdval) || !pmd_same(*dst_pmd, dst_pmdval))) { err = -EAGAIN; goto unlock_ptls; } if (src_folio) { if (folio_maybe_dma_pinned(src_folio) || !PageAnonExclusive(&src_folio->page)) { err = -EBUSY; goto unlock_ptls; } if (WARN_ON_ONCE(!folio_test_head(src_folio)) || WARN_ON_ONCE(!folio_test_anon(src_folio))) { err = -EBUSY; goto unlock_ptls; } src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); /* Folio got pinned from under us. Put it back and fail the move. */ if (folio_maybe_dma_pinned(src_folio)) { set_pmd_at(mm, src_addr, src_pmd, src_pmdval); err = -EBUSY; goto unlock_ptls; } folio_move_anon_rmap(src_folio, dst_vma); src_folio->index = linear_page_index(dst_vma, dst_addr); _dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot); /* Follow mremap() behavior and treat the entry dirty after the move */ _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma); } else { src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); _dst_pmd = move_soft_dirty_pmd(src_pmdval); _dst_pmd = clear_uffd_wp_pmd(_dst_pmd); } set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd); src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd); pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable); unlock_ptls: double_pt_unlock(src_ptl, dst_ptl); /* unblock rmap walks */ if (src_folio) folio_unlock(src_folio); mmu_notifier_invalidate_range_end(&range); if (src_folio) folio_put(src_folio); return err; } #endif /* CONFIG_USERFAULTFD */ /* * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise. * * Note that if it returns page table lock pointer, this routine returns without * unlocking page table lock. So callers must unlock it. */ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { spinlock_t *ptl; ptl = pmd_lock(vma->vm_mm, pmd); if (likely(pmd_is_huge(*pmd))) return ptl; spin_unlock(ptl); return NULL; } /* * Returns page table lock pointer if a given pud maps a thp, NULL otherwise. 
* * Note that if it returns page table lock pointer, this routine returns without * unlocking page table lock. So callers must unlock it. */ spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) { spinlock_t *ptl; ptl = pud_lock(vma->vm_mm, pud); if (likely(pud_trans_huge(*pud))) return ptl; spin_unlock(ptl); return NULL; } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr) { spinlock_t *ptl; pud_t orig_pud; ptl = __pud_trans_huge_lock(pud, vma); if (!ptl) return 0; orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm); arch_check_zapped_pud(vma, orig_pud); tlb_remove_pud_tlb_entry(tlb, pud, addr); if (vma_is_special_huge(vma)) { spin_unlock(ptl); /* No zero page support yet */ } else { struct page *page = NULL; struct folio *folio; /* No support for anonymous PUD pages or migration yet */ VM_WARN_ON_ONCE(vma_is_anonymous(vma) || !pud_present(orig_pud)); page = pud_page(orig_pud); folio = page_folio(page); folio_remove_rmap_pud(folio, page, vma); add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PUD_NR); spin_unlock(ptl); tlb_remove_page_size(tlb, page, HPAGE_PUD_SIZE); } return 1; } static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud, unsigned long haddr) { struct folio *folio; struct page *page; pud_t old_pud; VM_BUG_ON(haddr & ~HPAGE_PUD_MASK); VM_BUG_ON_VMA(vma->vm_start > haddr, vma); VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma); VM_BUG_ON(!pud_trans_huge(*pud)); count_vm_event(THP_SPLIT_PUD); old_pud = pudp_huge_clear_flush(vma, haddr, pud); if (!vma_is_dax(vma)) return; page = pud_page(old_pud); folio = page_folio(page); if (!folio_test_dirty(folio) && pud_dirty(old_pud)) folio_mark_dirty(folio); if (!folio_test_referenced(folio) && pud_young(old_pud)) folio_set_referenced(folio); folio_remove_rmap_pud(folio, page, vma); folio_put(folio); add_mm_counter(vma->vm_mm, mm_counter_file(folio), -HPAGE_PUD_NR); } void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, unsigned long address) { spinlock_t *ptl; struct mmu_notifier_range range; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address & HPAGE_PUD_MASK, (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); mmu_notifier_invalidate_range_start(&range); ptl = pud_lock(vma->vm_mm, pud); if (unlikely(!pud_trans_huge(*pud))) goto out; __split_huge_pud_locked(vma, pud, range.start); out: spin_unlock(ptl); mmu_notifier_invalidate_range_end(&range); } #else void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, unsigned long address) { } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd) { struct mm_struct *mm = vma->vm_mm; pgtable_t pgtable; pmd_t _pmd, old_pmd; unsigned long addr; pte_t *pte; int i; /* * Leave pmd empty until pte is filled note that it is fine to delay * notification until mmu_notifier_invalidate_range_end() as we are * replacing a zero pmd write protected page with a zero pte write * protected page. 
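 *
 * In other words, the transformation performed below is
 *
 *	one read-only huge zero PMD  ->  HPAGE_PMD_NR read-only special
 *					 zero PTEs
 *
 * with the uffd-wp bit, if it was set on the PMD, carried over to every PTE.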
* * See Documentation/mm/mmu_notifier.rst */ old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); pte = pte_offset_map(&_pmd, haddr); VM_BUG_ON(!pte); for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { pte_t entry; entry = pfn_pte(zero_pfn(addr), vma->vm_page_prot); entry = pte_mkspecial(entry); if (pmd_uffd_wp(old_pmd)) entry = pte_mkuffd_wp(entry); VM_BUG_ON(!pte_none(ptep_get(pte))); set_pte_at(mm, addr, pte, entry); pte++; } pte_unmap(pte - 1); smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); } static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, unsigned long haddr, bool freeze) { struct mm_struct *mm = vma->vm_mm; struct folio *folio; struct page *page; pgtable_t pgtable; pmd_t old_pmd, _pmd; bool soft_dirty, uffd_wp = false, young = false, write = false; bool anon_exclusive = false, dirty = false; unsigned long addr; pte_t *pte; int i; VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); VM_BUG_ON_VMA(vma->vm_start > haddr, vma); VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); VM_WARN_ON_ONCE(!pmd_is_valid_softleaf(*pmd) && !pmd_trans_huge(*pmd)); count_vm_event(THP_SPLIT_PMD); if (!vma_is_anonymous(vma)) { old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); /* * We are going to unmap this huge page. So * just go ahead and zap it */ if (arch_needs_pgtable_deposit()) zap_deposited_table(mm, pmd); if (vma_is_special_huge(vma)) return; if (unlikely(pmd_is_migration_entry(old_pmd))) { const softleaf_t old_entry = softleaf_from_pmd(old_pmd); folio = softleaf_to_folio(old_entry); } else if (is_huge_zero_pmd(old_pmd)) { return; } else { page = pmd_page(old_pmd); folio = page_folio(page); if (!folio_test_dirty(folio) && pmd_dirty(old_pmd)) folio_mark_dirty(folio); if (!folio_test_referenced(folio) && pmd_young(old_pmd)) folio_set_referenced(folio); folio_remove_rmap_pmd(folio, page, vma); folio_put(folio); } add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR); return; } if (is_huge_zero_pmd(*pmd)) { /* * FIXME: Do we want to invalidate secondary mmu by calling * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below * inside __split_huge_pmd() ? * * We are going from a zero huge page write protected to zero * small page also write protected so it does not seems useful * to invalidate secondary mmu at this time. */ return __split_huge_zero_page_pmd(vma, haddr, pmd); } if (pmd_is_migration_entry(*pmd)) { softleaf_t entry; old_pmd = *pmd; entry = softleaf_from_pmd(old_pmd); page = softleaf_to_page(entry); folio = page_folio(page); soft_dirty = pmd_swp_soft_dirty(old_pmd); uffd_wp = pmd_swp_uffd_wp(old_pmd); write = softleaf_is_migration_write(entry); if (PageAnon(page)) anon_exclusive = softleaf_is_migration_read_exclusive(entry); young = softleaf_is_migration_young(entry); dirty = softleaf_is_migration_dirty(entry); } else if (pmd_is_device_private_entry(*pmd)) { softleaf_t entry; old_pmd = *pmd; entry = softleaf_from_pmd(old_pmd); page = softleaf_to_page(entry); folio = page_folio(page); soft_dirty = pmd_swp_soft_dirty(old_pmd); uffd_wp = pmd_swp_uffd_wp(old_pmd); write = softleaf_is_device_private_write(entry); anon_exclusive = PageAnonExclusive(page); /* * Device private THP should be treated the same as regular * folios w.r.t anon exclusive handling. See the comments for * folio handling and anon_exclusive below. 
*/ if (freeze && anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) freeze = false; if (!freeze) { rmap_t rmap_flags = RMAP_NONE; folio_ref_add(folio, HPAGE_PMD_NR - 1); if (anon_exclusive) rmap_flags |= RMAP_EXCLUSIVE; folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR, vma, haddr, rmap_flags); } } else { /* * Up to this point the pmd is present and huge and userland has * the whole access to the hugepage during the split (which * happens in place). If we overwrite the pmd with the not-huge * version pointing to the pte here (which of course we could if * all CPUs were bug free), userland could trigger a small page * size TLB miss on the small sized TLB while the hugepage TLB * entry is still established in the huge TLB. Some CPU doesn't * like that. See * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum * 383 on page 105. Intel should be safe but is also warns that * it's only safe if the permission and cache attributes of the * two entries loaded in the two TLB is identical (which should * be the case here). But it is generally safer to never allow * small and huge TLB entries for the same virtual address to be * loaded simultaneously. So instead of doing "pmd_populate(); * flush_pmd_tlb_range();" we first mark the current pmd * notpresent (atomically because here the pmd_trans_huge must * remain set at all times on the pmd until the split is * complete for this pmd), then we flush the SMP TLB and finally * we write the non-huge version of the pmd entry with * pmd_populate. */ old_pmd = pmdp_invalidate(vma, haddr, pmd); page = pmd_page(old_pmd); folio = page_folio(page); if (pmd_dirty(old_pmd)) { dirty = true; folio_set_dirty(folio); } write = pmd_write(old_pmd); young = pmd_young(old_pmd); soft_dirty = pmd_soft_dirty(old_pmd); uffd_wp = pmd_uffd_wp(old_pmd); VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio); VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); /* * Without "freeze", we'll simply split the PMD, propagating the * PageAnonExclusive() flag for each PTE by setting it for * each subpage -- no need to (temporarily) clear. * * With "freeze" we want to replace mapped pages by * migration entries right away. This is only possible if we * managed to clear PageAnonExclusive() -- see * set_pmd_migration_entry(). * * In case we cannot clear PageAnonExclusive(), split the PMD * only and let try_to_migrate_one() fail later. * * See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */ anon_exclusive = PageAnonExclusive(page); if (freeze && anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) freeze = false; if (!freeze) { rmap_t rmap_flags = RMAP_NONE; folio_ref_add(folio, HPAGE_PMD_NR - 1); if (anon_exclusive) rmap_flags |= RMAP_EXCLUSIVE; folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR, vma, haddr, rmap_flags); } } /* * Withdraw the table only after we mark the pmd entry invalid. * This's critical for some architectures (Power). */ pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); pte = pte_offset_map(&_pmd, haddr); VM_BUG_ON(!pte); /* * Note that NUMA hinting access restrictions are not transferred to * avoid any possibility of altering permissions across VMAs. 
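 *
 * In rough terms, the loops below re-encode, for each of the HPAGE_PMD_NR
 * PTEs, the state captured from the old PMD: write, young, dirty, soft_dirty
 * and uffd_wp become the corresponding (swap) PTE bits, and in the frozen
 * case anon_exclusive is expressed as a readable-exclusive migration entry
 * (otherwise it was already propagated per subpage by the rmap calls above).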
*/ if (freeze || pmd_is_migration_entry(old_pmd)) { pte_t entry; swp_entry_t swp_entry; for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { if (write) swp_entry = make_writable_migration_entry( page_to_pfn(page + i)); else if (anon_exclusive) swp_entry = make_readable_exclusive_migration_entry( page_to_pfn(page + i)); else swp_entry = make_readable_migration_entry( page_to_pfn(page + i)); if (young) swp_entry = make_migration_entry_young(swp_entry); if (dirty) swp_entry = make_migration_entry_dirty(swp_entry); entry = swp_entry_to_pte(swp_entry); if (soft_dirty) entry = pte_swp_mksoft_dirty(entry); if (uffd_wp) entry = pte_swp_mkuffd_wp(entry); VM_WARN_ON(!pte_none(ptep_get(pte + i))); set_pte_at(mm, addr, pte + i, entry); } } else if (pmd_is_device_private_entry(old_pmd)) { pte_t entry; swp_entry_t swp_entry; for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { /* * anon_exclusive was already propagated to the relevant * pages corresponding to the pte entries when freeze * is false. */ if (write) swp_entry = make_writable_device_private_entry( page_to_pfn(page + i)); else swp_entry = make_readable_device_private_entry( page_to_pfn(page + i)); /* * Young and dirty bits are not progated via swp_entry */ entry = swp_entry_to_pte(swp_entry); if (soft_dirty) entry = pte_swp_mksoft_dirty(entry); if (uffd_wp) entry = pte_swp_mkuffd_wp(entry); VM_WARN_ON(!pte_none(ptep_get(pte + i))); set_pte_at(mm, addr, pte + i, entry); } } else { pte_t entry; entry = mk_pte(page, READ_ONCE(vma->vm_page_prot)); if (write) entry = pte_mkwrite(entry, vma); if (!young) entry = pte_mkold(entry); /* NOTE: this may set soft-dirty too on some archs */ if (dirty) entry = pte_mkdirty(entry); if (soft_dirty) entry = pte_mksoft_dirty(entry); if (uffd_wp) entry = pte_mkuffd_wp(entry); for (i = 0; i < HPAGE_PMD_NR; i++) VM_WARN_ON(!pte_none(ptep_get(pte + i))); set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR); } pte_unmap(pte); if (!pmd_is_migration_entry(*pmd)) folio_remove_rmap_pmd(folio, page, vma); if (freeze) put_page(page); smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); } void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, bool freeze) { VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE)); if (pmd_trans_huge(*pmd) || pmd_is_valid_softleaf(*pmd)) __split_huge_pmd_locked(vma, pmd, address, freeze); } void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze) { spinlock_t *ptl; struct mmu_notifier_range range; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address & HPAGE_PMD_MASK, (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); ptl = pmd_lock(vma->vm_mm, pmd); split_huge_pmd_locked(vma, range.start, pmd, freeze); spin_unlock(ptl); mmu_notifier_invalidate_range_end(&range); } void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze) { pmd_t *pmd = mm_find_pmd(vma->vm_mm, address); if (!pmd) return; __split_huge_pmd(vma, pmd, address, freeze); } static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address) { /* * If the new address isn't hpage aligned and it could previously * contain an hugepage: check if we need to split an huge pmd. 
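 *
 * As a concrete example (assuming 2MiB PMDs as on x86-64): if a VMA boundary
 * moves to 0x2100000, that address is not HPAGE_PMD_SIZE aligned, so the PMD
 * covering [0x2000000, 0x2200000) is split, provided that whole range lies
 * within the VMA; the straddling part can then be mapped with ordinary PTEs.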
*/ if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) && range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE), ALIGN(address, HPAGE_PMD_SIZE))) split_huge_pmd_address(vma, address, false); } void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct vm_area_struct *next) { /* Check if we need to split start first. */ split_huge_pmd_if_needed(vma, start); /* Check if we need to split end next. */ split_huge_pmd_if_needed(vma, end); /* If we're incrementing next->vm_start, we might need to split it. */ if (next) split_huge_pmd_if_needed(next, end); } static void unmap_folio(struct folio *folio) { enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC | TTU_BATCH_FLUSH; VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); if (folio_test_pmd_mappable(folio)) ttu_flags |= TTU_SPLIT_HUGE_PMD; /* * Anon pages need migration entries to preserve them, but file * pages can simply be left unmapped, then faulted back on demand. * If that is ever changed (perhaps for mlock), update remap_page(). */ if (folio_test_anon(folio)) try_to_migrate(folio, ttu_flags); else try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK); try_to_unmap_flush(); } static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, struct folio *folio) { struct mm_struct *mm = vma->vm_mm; int ref_count, map_count; pmd_t orig_pmd = *pmdp; if (pmd_dirty(orig_pmd)) folio_set_dirty(folio); if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) { folio_set_swapbacked(folio); return false; } orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp); /* * Syncing against concurrent GUP-fast: * - clear PMD; barrier; read refcount * - inc refcount; barrier; read PMD */ smp_mb(); ref_count = folio_ref_count(folio); map_count = folio_mapcount(folio); /* * Order reads for folio refcount and dirty flag * (see comments in __remove_mapping()). */ smp_rmb(); /* * If the folio or its PMD is redirtied at this point, or if there * are unexpected references, we will give up to discard this folio * and remap it. * * The only folio refs must be one from isolation plus the rmap(s). 
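 *
 * Concretely, the recheck below expects
 *
 *	folio_ref_count(folio) == folio_mapcount(folio) + 1
 *
 * i.e. nothing but the rmap references and the isolation reference; anything
 * else (a GUP pin, a speculative lookup, ...) forces us to restore the PMD
 * and give up on discarding the folio.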
*/ if (pmd_dirty(orig_pmd)) folio_set_dirty(folio); if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) { folio_set_swapbacked(folio); set_pmd_at(mm, addr, pmdp, orig_pmd); return false; } if (ref_count != map_count + 1) { set_pmd_at(mm, addr, pmdp, orig_pmd); return false; } folio_remove_rmap_pmd(folio, pmd_page(orig_pmd), vma); zap_deposited_table(mm, pmdp); add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR); if (vma->vm_flags & VM_LOCKED) mlock_drain_local(); folio_put(folio); return true; } bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, struct folio *folio) { VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio); VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio); VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE)); return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio); } static void remap_page(struct folio *folio, unsigned long nr, int flags) { int i = 0; /* If unmap_folio() uses try_to_migrate() on file, remove this check */ if (!folio_test_anon(folio)) return; for (;;) { remove_migration_ptes(folio, folio, TTU_RMAP_LOCKED | flags); i += folio_nr_pages(folio); if (i >= nr) break; folio = folio_next(folio); } } static void lru_add_split_folio(struct folio *folio, struct folio *new_folio, struct lruvec *lruvec, struct list_head *list) { VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio); lockdep_assert_held(&lruvec->lru_lock); if (folio_is_device_private(folio)) return; if (list) { /* page reclaim is reclaiming a huge page */ VM_WARN_ON(folio_test_lru(folio)); folio_get(new_folio); list_add_tail(&new_folio->lru, list); } else { /* head is still on lru (and we have it frozen) */ VM_WARN_ON(!folio_test_lru(folio)); if (folio_test_unevictable(folio)) new_folio->mlock_count = 0; else list_add_tail(&new_folio->lru, &folio->lru); folio_set_lru(new_folio); } } static bool page_range_has_hwpoisoned(struct page *page, long nr_pages) { for (; nr_pages; page++, nr_pages--) if (PageHWPoison(page)) return true; return false; } /* * It splits @folio into @new_order folios and copies the @folio metadata to * all the resulting folios. */ static void __split_folio_to_order(struct folio *folio, int old_order, int new_order) { /* Scan poisoned pages when split a poisoned folio to large folios */ const bool handle_hwpoison = folio_test_has_hwpoisoned(folio) && new_order; long new_nr_pages = 1 << new_order; long nr_pages = 1 << old_order; long i; folio_clear_has_hwpoisoned(folio); /* Check first new_nr_pages since the loop below skips them */ if (handle_hwpoison && page_range_has_hwpoisoned(folio_page(folio, 0), new_nr_pages)) folio_set_has_hwpoisoned(folio); /* * Skip the first new_nr_pages, since the new folio from them have all * the flags from the original folio. */ for (i = new_nr_pages; i < nr_pages; i += new_nr_pages) { struct page *new_head = &folio->page + i; /* * Careful: new_folio is not a "real" folio before we cleared PageTail. * Don't pass it around before clear_compound_head(). */ struct folio *new_folio = (struct folio *)new_head; VM_BUG_ON_PAGE(atomic_read(&new_folio->_mapcount) != -1, new_head); /* * Clone page flags before unfreezing refcount. * * After successful get_page_unless_zero() might follow flags change, * for example lock_page() which set PG_waiters. 
* * Note that for mapped sub-pages of an anonymous THP, * PG_anon_exclusive has been cleared in unmap_folio() and is stored in * the migration entry instead from where remap_page() will restore it. * We can still have PG_anon_exclusive set on effectively unmapped and * unreferenced sub-pages of an anonymous THP: we can simply drop * PG_anon_exclusive (-> PG_mappedtodisk) for these here. */ new_folio->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP; new_folio->flags.f |= (folio->flags.f & ((1L << PG_referenced) | (1L << PG_swapbacked) | (1L << PG_swapcache) | (1L << PG_mlocked) | (1L << PG_uptodate) | (1L << PG_active) | (1L << PG_workingset) | (1L << PG_locked) | (1L << PG_unevictable) | #ifdef CONFIG_ARCH_USES_PG_ARCH_2 (1L << PG_arch_2) | #endif #ifdef CONFIG_ARCH_USES_PG_ARCH_3 (1L << PG_arch_3) | #endif (1L << PG_dirty) | LRU_GEN_MASK | LRU_REFS_MASK)); if (handle_hwpoison && page_range_has_hwpoisoned(new_head, new_nr_pages)) folio_set_has_hwpoisoned(new_folio); new_folio->mapping = folio->mapping; new_folio->index = folio->index + i; if (folio_test_swapcache(folio)) new_folio->swap.val = folio->swap.val + i; /* Page flags must be visible before we make the page non-compound. */ smp_wmb(); /* * Clear PageTail before unfreezing page refcount. * * After successful get_page_unless_zero() might follow put_page() * which needs correct compound_head(). */ clear_compound_head(new_head); if (new_order) { prep_compound_page(new_head, new_order); folio_set_large_rmappable(new_folio); } if (folio_test_young(folio)) folio_set_young(new_folio); if (folio_test_idle(folio)) folio_set_idle(new_folio); #ifdef CONFIG_MEMCG new_folio->memcg_data = folio->memcg_data; #endif folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio)); } if (new_order) folio_set_order(folio, new_order); else ClearPageCompound(&folio->page); } /** * __split_unmapped_folio() - splits an unmapped @folio to lower order folios in * two ways: uniform split or non-uniform split. * @folio: the to-be-split folio * @new_order: the smallest order of the after split folios (since buddy * allocator like split generates folios with orders from @folio's * order - 1 to new_order). * @split_at: in buddy allocator like split, the folio containing @split_at * will be split until its order becomes @new_order. * @xas: xa_state pointing to folio->mapping->i_pages and locked by caller * @mapping: @folio->mapping * @split_type: if the split is uniform or not (buddy allocator like split) * * * 1. uniform split: the given @folio into multiple @new_order small folios, * where all small folios have the same order. This is done when * split_type is SPLIT_TYPE_UNIFORM. * 2. buddy allocator like (non-uniform) split: the given @folio is split into * half and one of the half (containing the given page) is split into half * until the given @folio's order becomes @new_order. This is done when * split_type is SPLIT_TYPE_NON_UNIFORM. * * The high level flow for these two methods are: * * 1. uniform split: @xas is split with no expectation of failure and a single * __split_folio_to_order() is called to split the @folio into @new_order * along with stats update. * 2. non-uniform split: folio_order - @new_order calls to * __split_folio_to_order() are expected to be made in a for loop to split * the @folio to one lower order at a time. The folio containing @split_at * is split in each iteration. @xas is split into half in each iteration and * can fail. A failed @xas split leaves split folios as is without merging * them back. 
* * After splitting, the caller's folio reference will be transferred to the * folio containing @split_at. The caller needs to unlock and/or free * after-split folios if necessary. * * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be * split but not to @new_order, the caller needs to check) */ static int __split_unmapped_folio(struct folio *folio, int new_order, struct page *split_at, struct xa_state *xas, struct address_space *mapping, enum split_type split_type) { const bool is_anon = folio_test_anon(folio); int old_order = folio_order(folio); int start_order = split_type == SPLIT_TYPE_UNIFORM ? new_order : old_order - 1; struct folio *old_folio = folio; int split_order; /* * split to new_order one order at a time. For uniform split, * folio is split to new_order directly. */ for (split_order = start_order; split_order >= new_order; split_order--) { int nr_new_folios = 1UL << (old_order - split_order); /* order-1 anonymous folio is not supported */ if (is_anon && split_order == 1) continue; if (mapping) { /* * uniform split has xas_split_alloc() called before * irq is disabled to allocate enough memory, whereas * non-uniform split can handle ENOMEM. * Use the to-be-split folio, so that a parallel * folio_try_get() waits on it until xarray is updated * with after-split folios and the original one is * unfrozen. */ if (split_type == SPLIT_TYPE_UNIFORM) { xas_split(xas, old_folio, old_order); } else { xas_set_order(xas, folio->index, split_order); xas_try_split(xas, old_folio, old_order); if (xas_error(xas)) return xas_error(xas); } } folio_split_memcg_refs(folio, old_order, split_order); split_page_owner(&folio->page, old_order, split_order); pgalloc_tag_split(folio, old_order, split_order); __split_folio_to_order(folio, old_order, split_order); if (is_anon) { mod_mthp_stat(old_order, MTHP_STAT_NR_ANON, -1); mod_mthp_stat(split_order, MTHP_STAT_NR_ANON, nr_new_folios); } /* * If uniform split, the process is complete. * If non-uniform, continue splitting the folio at @split_at * as long as the next @split_order is >= @new_order. */ folio = page_folio(split_at); old_order = split_order; } return 0; } /** * folio_check_splittable() - check if a folio can be split to a given order * @folio: folio to be split * @new_order: the smallest order of the after split folios (since buddy * allocator like split generates folios with orders from @folio's * order - 1 to new_order). * @split_type: uniform or non-uniform split * * folio_check_splittable() checks if @folio can be split to @new_order using * @split_type method. The truncated folio check must come first. * * Context: folio must be locked. * * Return: 0 - @folio can be split to @new_order, otherwise an error number is * returned. */ int folio_check_splittable(struct folio *folio, unsigned int new_order, enum split_type split_type) { VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); /* * Folios that just got truncated cannot get split. Signal to the * caller that there was a race. * * TODO: this will also currently refuse folios without a mapping in the * swapcache (shmem or to-be-anon folios). */ if (!folio->mapping && !folio_test_anon(folio)) return -EBUSY; if (folio_test_anon(folio)) { /* order-1 is not supported for anonymous THP. */ if (new_order == 1) return -EINVAL; } else if (split_type == SPLIT_TYPE_NON_UNIFORM || new_order) { if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !mapping_large_folio_support(folio->mapping)) { /* * We can always split a folio down to a single page * (new_order == 0) uniformly. 
* * For any other scenario * a) uniform split targeting a large folio * (new_order > 0) * b) any non-uniform split * we must confirm that the file system supports large * folios. * * Note that we might still have THPs in such * mappings, which is created from khugepaged when * CONFIG_READ_ONLY_THP_FOR_FS is enabled. But in that * case, the mapping does not actually support large * folios properly. */ return -EINVAL; } } /* * swapcache folio could only be split to order 0 * * non-uniform split creates after-split folios with orders from * folio_order(folio) - 1 to new_order, making it not suitable for any * swapcache folio split. Only uniform split to order-0 can be used * here. */ if ((split_type == SPLIT_TYPE_NON_UNIFORM || new_order) && folio_test_swapcache(folio)) { return -EINVAL; } if (is_huge_zero_folio(folio)) return -EINVAL; if (folio_test_writeback(folio)) return -EBUSY; return 0; } /* Number of folio references from the pagecache or the swapcache. */ static unsigned int folio_cache_ref_count(const struct folio *folio) { if (folio_test_anon(folio) && !folio_test_swapcache(folio)) return 0; return folio_nr_pages(folio); } static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int new_order, struct page *split_at, struct xa_state *xas, struct address_space *mapping, bool do_lru, struct list_head *list, enum split_type split_type, pgoff_t end, int *nr_shmem_dropped) { struct folio *end_folio = folio_next(folio); struct folio *new_folio, *next; int old_order = folio_order(folio); int ret = 0; struct deferred_split *ds_queue; VM_WARN_ON_ONCE(!mapping && end); /* Prevent deferred_split_scan() touching ->_refcount */ ds_queue = folio_split_queue_lock(folio); if (folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1)) { struct swap_cluster_info *ci = NULL; struct lruvec *lruvec; if (old_order > 1) { if (!list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; /* * Reinitialize page_deferred_list after removing the * page from the split_queue, otherwise a subsequent * split will see list corruption when checking the * page_deferred_list. */ list_del_init(&folio->_deferred_list); } if (folio_test_partially_mapped(folio)) { folio_clear_partially_mapped(folio); mod_mthp_stat(old_order, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } } split_queue_unlock(ds_queue); if (mapping) { int nr = folio_nr_pages(folio); if (folio_test_pmd_mappable(folio) && new_order < HPAGE_PMD_ORDER) { if (folio_test_swapbacked(folio)) { lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); } else { lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } } } if (folio_test_swapcache(folio)) { if (mapping) { VM_WARN_ON_ONCE_FOLIO(mapping, folio); return -EINVAL; } ci = swap_cluster_get_and_lock(folio); } /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ if (do_lru) lruvec = folio_lruvec_lock(folio); ret = __split_unmapped_folio(folio, new_order, split_at, xas, mapping, split_type); /* * Unfreeze after-split folios and put them back to the right * list. @folio should be kept frozon until page cache * entries are updated with all the other after-split folios * to prevent others seeing stale page cache entries. * As a result, new_folio starts from the next folio of * @folio. 
*/ for (new_folio = folio_next(folio); new_folio != end_folio; new_folio = next) { unsigned long nr_pages = folio_nr_pages(new_folio); next = folio_next(new_folio); zone_device_private_split_cb(folio, new_folio); folio_ref_unfreeze(new_folio, folio_cache_ref_count(new_folio) + 1); if (do_lru) lru_add_split_folio(folio, new_folio, lruvec, list); /* * Anonymous folio with swap cache. * NOTE: shmem in swap cache is not supported yet. */ if (ci) { __swap_cache_replace_folio(ci, folio, new_folio); continue; } /* Anonymous folio without swap cache */ if (!mapping) continue; /* Add the new folio to the page cache. */ if (new_folio->index < end) { __xa_store(&mapping->i_pages, new_folio->index, new_folio, 0); continue; } VM_WARN_ON_ONCE(!nr_shmem_dropped); /* Drop folio beyond EOF: ->index >= end */ if (shmem_mapping(mapping) && nr_shmem_dropped) *nr_shmem_dropped += nr_pages; else if (folio_test_clear_dirty(new_folio)) folio_account_cleaned( new_folio, inode_to_wb(mapping->host)); __filemap_remove_folio(new_folio, NULL); folio_put_refs(new_folio, nr_pages); } zone_device_private_split_cb(folio, NULL); /* * Unfreeze @folio only after all page cache entries, which * used to point to it, have been updated with new folios. * Otherwise, a parallel folio_try_get() can grab @folio * and its caller can see stale page cache entries. */ folio_ref_unfreeze(folio, folio_cache_ref_count(folio) + 1); if (do_lru) lruvec_unlock(lruvec); if (ci) swap_cluster_unlock(ci); } else { split_queue_unlock(ds_queue); return -EAGAIN; } return ret; } /** * __folio_split() - split a folio at @split_at to a @new_order folio * @folio: folio to split * @new_order: the order of the new folio * @split_at: a page within the new folio * @lock_at: a page within @folio to be left locked to caller * @list: after-split folios will be put on it if non NULL * @split_type: perform uniform split or not (non-uniform split) * * It calls __split_unmapped_folio() to perform uniform and non-uniform split. * It is in charge of checking whether the split is supported or not and * preparing @folio for __split_unmapped_folio(). * * After splitting, the after-split folio containing @lock_at remains locked * and others are unlocked: * 1. for uniform split, @lock_at points to one of @folio's subpages; * 2. for buddy allocator like (non-uniform) split, @lock_at points to @folio. 
* * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be * split but not to @new_order, the caller needs to check) */ static int __folio_split(struct folio *folio, unsigned int new_order, struct page *split_at, struct page *lock_at, struct list_head *list, enum split_type split_type) { XA_STATE(xas, &folio->mapping->i_pages, folio->index); struct folio *end_folio = folio_next(folio); bool is_anon = folio_test_anon(folio); struct address_space *mapping = NULL; struct anon_vma *anon_vma = NULL; int old_order = folio_order(folio); struct folio *new_folio, *next; int nr_shmem_dropped = 0; enum ttu_flags ttu_flags = 0; int ret; pgoff_t end = 0; VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio); if (folio != page_folio(split_at) || folio != page_folio(lock_at)) { ret = -EINVAL; goto out; } if (new_order >= old_order) { ret = -EINVAL; goto out; } ret = folio_check_splittable(folio, new_order, split_type); if (ret) { VM_WARN_ONCE(ret == -EINVAL, "Tried to split an unsplittable folio"); goto out; } if (is_anon) { /* * The caller does not necessarily hold an mmap_lock that would * prevent the anon_vma disappearing so we first we take a * reference to it and then lock the anon_vma for write. This * is similar to folio_lock_anon_vma_read except the write lock * is taken to serialise against parallel split or collapse * operations. */ anon_vma = folio_get_anon_vma(folio); if (!anon_vma) { ret = -EBUSY; goto out; } anon_vma_lock_write(anon_vma); mapping = NULL; } else { unsigned int min_order; gfp_t gfp; mapping = folio->mapping; min_order = mapping_min_folio_order(folio->mapping); if (new_order < min_order) { ret = -EINVAL; goto out; } gfp = current_gfp_context(mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK); if (!filemap_release_folio(folio, gfp)) { ret = -EBUSY; goto out; } if (split_type == SPLIT_TYPE_UNIFORM) { xas_set_order(&xas, folio->index, new_order); xas_split_alloc(&xas, folio, old_order, gfp); if (xas_error(&xas)) { ret = xas_error(&xas); goto out; } } anon_vma = NULL; i_mmap_lock_read(mapping); /* *__split_unmapped_folio() may need to trim off pages beyond * EOF: but on 32-bit, i_size_read() takes an irq-unsafe * seqlock, which cannot be nested inside the page tree lock. * So note end now: i_size itself may be changed at any moment, * but folio lock is good enough to serialize the trimming. */ end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); if (shmem_mapping(mapping)) end = shmem_fallocend(mapping->host, end); } /* * Racy check if we can split the page, before unmap_folio() will * split PMDs */ if (folio_expected_ref_count(folio) != folio_ref_count(folio) - 1) { ret = -EAGAIN; goto out_unlock; } unmap_folio(folio); /* block interrupt reentry in xa_lock and spinlock */ local_irq_disable(); if (mapping) { /* * Check if the folio is present in page cache. * We assume all tail are present too, if folio is there. */ xas_lock(&xas); xas_reset(&xas); if (xas_load(&xas) != folio) { ret = -EAGAIN; goto fail; } } ret = __folio_freeze_and_split_unmapped(folio, new_order, split_at, &xas, mapping, true, list, split_type, end, &nr_shmem_dropped); fail: if (mapping) xas_unlock(&xas); local_irq_enable(); if (nr_shmem_dropped) shmem_uncharge(mapping->host, nr_shmem_dropped); if (!ret && is_anon && !folio_is_device_private(folio)) ttu_flags = TTU_USE_SHARED_ZEROPAGE; remap_page(folio, 1 << old_order, ttu_flags); /* * Unlock all after-split folios except the one containing * @lock_at page. 
If @folio is not split, it will be kept locked. */ for (new_folio = folio; new_folio != end_folio; new_folio = next) { next = folio_next(new_folio); if (new_folio == page_folio(lock_at)) continue; folio_unlock(new_folio); /* * Subpages may be freed if there wasn't any mapping * like if add_to_swap() is running on a lru page that * had its mapping zapped. And freeing these pages * requires taking the lru_lock so we do the put_page * of the tail pages after the split is complete. */ free_folio_and_swap_cache(new_folio); } out_unlock: if (anon_vma) { anon_vma_unlock_write(anon_vma); put_anon_vma(anon_vma); } if (mapping) i_mmap_unlock_read(mapping); out: xas_destroy(&xas); if (is_pmd_order(old_order)) count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); count_mthp_stat(old_order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED); return ret; } /** * folio_split_unmapped() - split a large anon folio that is already unmapped * @folio: folio to split * @new_order: the order of folios after split * * This function is a helper for splitting folios that have already been * unmapped. The use case is that the device or the CPU can refuse to migrate * THP pages in the middle of migration, due to allocation issues on either * side. * * anon_vma_lock is not required to be held, mmap_read_lock() or * mmap_write_lock() should be held. @folio is expected to be locked by the * caller. device-private and non device-private folios are supported along * with folios that are in the swapcache. @folio should also be unmapped and * isolated from LRU (if applicable) * * Upon return, the folio is not remapped, split folios are not added to LRU, * free_folio_and_swap_cache() is not called, and new folios remain locked. * * Return: 0 on success, -EAGAIN if the folio cannot be split (e.g., due to * insufficient reference count or extra pins). */ int folio_split_unmapped(struct folio *folio, unsigned int new_order) { int ret = 0; VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_anon(folio), folio); if (folio_expected_ref_count(folio) != folio_ref_count(folio) - 1) return -EAGAIN; local_irq_disable(); ret = __folio_freeze_and_split_unmapped(folio, new_order, &folio->page, NULL, NULL, false, NULL, SPLIT_TYPE_UNIFORM, 0, NULL); local_irq_enable(); return ret; } /* * This function splits a large folio into smaller folios of order @new_order. * @page can point to any page of the large folio to split. The split operation * does not change the position of @page. * * Prerequisites: * * 1) The caller must hold a reference on the @page's owning folio, also known * as the large folio. * * 2) The large folio must be locked. * * 3) The folio must not be pinned. Any unexpected folio references, including * GUP pins, will result in the folio not getting split; instead, the caller * will receive an -EAGAIN. * * 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not * supported for non-file-backed folios, because folio->_deferred_list, which * is used by partially mapped folios, is stored in subpage 2, but an order-1 * folio only has subpages 0 and 1. File-backed order-1 folios are supported, * since they do not use _deferred_list. * * After splitting, the caller's folio reference will be transferred to @page, * resulting in a raised refcount of @page after this call. The other pages may * be freed if they are not mapped. 
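 *
 * For illustration (the wrapper names live outside this file and may
 * differ): most callers do not use this function directly but go through
 * helpers such as split_huge_page() or split_huge_page_to_list_to_order(),
 * which funnel into it with the appropriate list and order arguments.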
* * If @list is null, tail pages will be added to LRU list, otherwise, to @list. * * Pages in @new_order will inherit the mapping, flags, and so on from the * huge page. * * Returns 0 if the huge page was split successfully. * * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if * the folio was concurrently removed from the page cache. * * Returns -EBUSY when trying to split the huge zeropage, if the folio is * under writeback, if fs-specific folio metadata cannot currently be * released, or if some unexpected race happened (e.g., anon VMA disappeared, * truncation). * * Callers should ensure that the order respects the address space mapping * min-order if one is set for non-anonymous folios. * * Returns -EINVAL when trying to split to an order that is incompatible * with the folio. Splitting to order 0 is compatible with all folios. */ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order) { struct folio *folio = page_folio(page); return __folio_split(folio, new_order, &folio->page, page, list, SPLIT_TYPE_UNIFORM); } /** * folio_split() - split a folio at @split_at to a @new_order folio * @folio: folio to split * @new_order: the order of the new folio * @split_at: a page within the new folio * @list: after-split folios are added to @list if not null, otherwise to LRU * list * * It has the same prerequisites and returns as * split_huge_page_to_list_to_order(). * * Split a folio at @split_at to a new_order folio, leave the * remaining subpages of the original folio as large as possible. For example, * in the case of splitting an order-9 folio at its third order-3 subpages to * an order-3 folio, there are 2^(9-3)=64 order-3 subpages in the order-9 folio. * After the split, there will be a group of folios with different orders and * the new folio containing @split_at is marked in bracket: * [order-4, {order-3}, order-3, order-5, order-6, order-7, order-8]. * * After split, folio is left locked for caller. * * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be * split but not to @new_order, the caller needs to check) */ int folio_split(struct folio *folio, unsigned int new_order, struct page *split_at, struct list_head *list) { return __folio_split(folio, new_order, split_at, &folio->page, list, SPLIT_TYPE_NON_UNIFORM); } /** * min_order_for_split() - get the minimum order @folio can be split to * @folio: folio to split * * min_order_for_split() tells the minimum order @folio can be split to. * If a file-backed folio is truncated, 0 will be returned. Any subsequent * split attempt should get -EBUSY from split checking code. * * Return: @folio's minimum order for split */ unsigned int min_order_for_split(struct folio *folio) { if (folio_test_anon(folio)) return 0; /* * If the folio got truncated, we don't know the previous mapping and * consequently the old min order. But it doesn't matter, as any split * attempt will immediately fail with -EBUSY as the folio cannot get * split until freed. */ if (!folio->mapping) return 0; return mapping_min_folio_order(folio->mapping); } int split_folio_to_list(struct folio *folio, struct list_head *list) { return split_huge_page_to_list_to_order(&folio->page, list, 0); } /* * __folio_unqueue_deferred_split() is not to be called directly: * the folio_unqueue_deferred_split() inline wrapper in mm/internal.h * limits its calls to those folios which may have a _deferred_list for * queueing THP splits, and that list is (racily observed to be) non-empty. 
* * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is * zero: because even when split_queue_lock is held, a non-empty _deferred_list * might be in use on deferred_split_scan()'s unlocked on-stack list. * * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is * therefore important to unqueue deferred split before changing folio memcg. */ bool __folio_unqueue_deferred_split(struct folio *folio) { struct deferred_split *ds_queue; unsigned long flags; bool unqueued = false; WARN_ON_ONCE(folio_ref_count(folio)); WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio)); ds_queue = folio_split_queue_lock_irqsave(folio, &flags); if (!list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; if (folio_test_partially_mapped(folio)) { folio_clear_partially_mapped(folio); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } list_del_init(&folio->_deferred_list); unqueued = true; } split_queue_unlock_irqrestore(ds_queue, flags); return unqueued; /* useful for debug warnings */ } /* partially_mapped=false won't clear PG_partially_mapped folio flag */ void deferred_split_folio(struct folio *folio, bool partially_mapped) { struct deferred_split *ds_queue; unsigned long flags; /* * Order 1 folios have no space for a deferred list, but we also * won't waste much memory by not adding them to the deferred list. */ if (folio_order(folio) <= 1) return; if (!partially_mapped && !split_underused_thp) return; /* * Exclude swapcache: originally to avoid a corrupt deferred split * queue. Nowadays that is fully prevented by memcg1_swapout(); * but if page reclaim is already handling the same folio, it is * unnecessary to handle it again in the shrinker, so excluding * swapcache here may still be a useful optimization. 
*/ if (folio_test_swapcache(folio)) return; ds_queue = folio_split_queue_lock_irqsave(folio, &flags); if (partially_mapped) { if (!folio_test_partially_mapped(folio)) { folio_set_partially_mapped(folio); if (folio_test_pmd_mappable(folio)) count_vm_event(THP_DEFERRED_SPLIT_PAGE); count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1); } } else { /* partially mapped folios cannot become non-partially mapped */ VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio); } if (list_empty(&folio->_deferred_list)) { struct mem_cgroup *memcg; memcg = folio_split_queue_memcg(folio, ds_queue); list_add_tail(&folio->_deferred_list, &ds_queue->split_queue); ds_queue->split_queue_len++; if (memcg) set_shrinker_bit(memcg, folio_nid(folio), shrinker_id(deferred_split_shrinker)); } split_queue_unlock_irqrestore(ds_queue, flags); } static unsigned long deferred_split_count(struct shrinker *shrink, struct shrink_control *sc) { struct pglist_data *pgdata = NODE_DATA(sc->nid); struct deferred_split *ds_queue = &pgdata->deferred_split_queue; #ifdef CONFIG_MEMCG if (sc->memcg) ds_queue = &sc->memcg->deferred_split_queue; #endif return READ_ONCE(ds_queue->split_queue_len); } static bool thp_underused(struct folio *folio) { int num_zero_pages = 0, num_filled_pages = 0; int i; if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1) return false; if (folio_contain_hwpoisoned_page(folio)) return false; for (i = 0; i < folio_nr_pages(folio); i++) { if (pages_identical(folio_page(folio, i), ZERO_PAGE(0))) { if (++num_zero_pages > khugepaged_max_ptes_none) return true; } else { /* * Another path for early exit once the number * of non-zero filled pages exceeds threshold. */ if (++num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none) return false; } } return false; } static unsigned long deferred_split_scan(struct shrinker *shrink, struct shrink_control *sc) { struct deferred_split *ds_queue; unsigned long flags; struct folio *folio, *next; int split = 0, i; struct folio_batch fbatch; folio_batch_init(&fbatch); retry: ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags); /* Take pin on all head pages to avoid freeing them under us */ list_for_each_entry_safe(folio, next, &ds_queue->split_queue, _deferred_list) { if (folio_try_get(folio)) { folio_batch_add(&fbatch, folio); } else if (folio_test_partially_mapped(folio)) { /* We lost race with folio_put() */ folio_clear_partially_mapped(folio); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } list_del_init(&folio->_deferred_list); ds_queue->split_queue_len--; if (!--sc->nr_to_scan) break; if (!folio_batch_space(&fbatch)) break; } split_queue_unlock_irqrestore(ds_queue, flags); for (i = 0; i < folio_batch_count(&fbatch); i++) { bool did_split = false; bool underused = false; struct deferred_split *fqueue; folio = fbatch.folios[i]; if (!folio_test_partially_mapped(folio)) { /* * See try_to_map_unused_to_zeropage(): we cannot * optimize zero-filled pages after splitting an * mlocked folio. 
*/ if (folio_test_mlocked(folio)) goto next; underused = thp_underused(folio); if (!underused) goto next; } if (!folio_trylock(folio)) goto requeue; if (!split_folio(folio)) { did_split = true; if (underused) count_vm_event(THP_UNDERUSED_SPLIT_PAGE); split++; } folio_unlock(folio); next: /* * If thp_underused() returns false, or if split_folio() * succeeds, or if split_folio() fails in the case it was * underused, then consider it used and don't add it back to * split_queue. */ if (did_split || !folio_test_partially_mapped(folio)) continue; requeue: /* * Add back partially mapped folios, or underused folios that * we could not lock this round. */ fqueue = folio_split_queue_lock_irqsave(folio, &flags); if (list_empty(&folio->_deferred_list)) { list_add_tail(&folio->_deferred_list, &fqueue->split_queue); fqueue->split_queue_len++; } split_queue_unlock_irqrestore(fqueue, flags); } folios_put(&fbatch); if (sc->nr_to_scan && !list_empty(&ds_queue->split_queue)) { cond_resched(); goto retry; } /* * Stop shrinker if we didn't split any page, but the queue is empty. * This can happen if pages were freed under us. */ if (!split && list_empty(&ds_queue->split_queue)) return SHRINK_STOP; return split; } #ifdef CONFIG_MEMCG void reparent_deferred_split_queue(struct mem_cgroup *memcg) { struct mem_cgroup *parent = parent_mem_cgroup(memcg); struct deferred_split *ds_queue = &memcg->deferred_split_queue; struct deferred_split *parent_ds_queue = &parent->deferred_split_queue; int nid; spin_lock_irq(&ds_queue->split_queue_lock); spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING); if (!ds_queue->split_queue_len) goto unlock; list_splice_tail_init(&ds_queue->split_queue, &parent_ds_queue->split_queue); parent_ds_queue->split_queue_len += ds_queue->split_queue_len; ds_queue->split_queue_len = 0; for_each_node(nid) set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker)); unlock: spin_unlock(&parent_ds_queue->split_queue_lock); spin_unlock_irq(&ds_queue->split_queue_lock); } #endif #ifdef CONFIG_DEBUG_FS static void split_huge_pages_all(void) { struct zone *zone; struct page *page; struct folio *folio; unsigned long pfn, max_zone_pfn; unsigned long total = 0, split = 0; pr_debug("Split all THPs\n"); for_each_zone(zone) { if (!managed_zone(zone)) continue; max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { int nr_pages; page = pfn_to_online_page(pfn); if (!page || PageTail(page)) continue; folio = page_folio(page); if (!folio_try_get(folio)) continue; if (unlikely(page_folio(page) != folio)) goto next; if (zone != folio_zone(folio)) goto next; if (!folio_test_large(folio) || folio_test_hugetlb(folio) || !folio_test_lru(folio)) goto next; total++; folio_lock(folio); nr_pages = folio_nr_pages(folio); if (!split_folio(folio)) split++; pfn += nr_pages - 1; folio_unlock(folio); next: folio_put(folio); cond_resched(); } } pr_debug("%lu of %lu THP split\n", split, total); } static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma) { if (vma_is_dax(vma)) return true; if (vma_is_special_huge(vma)) return true; if (vma_test(vma, VMA_IO_BIT)) return true; if (is_vm_hugetlb_page(vma)) return true; return false; } static int split_huge_pages_pid(int pid, unsigned long vaddr_start, unsigned long vaddr_end, unsigned int new_order, long in_folio_offset) { int ret = 0; struct task_struct *task; struct mm_struct *mm; unsigned long total = 0, split = 0; unsigned long addr; vaddr_start &= PAGE_MASK; vaddr_end &= PAGE_MASK; task = 
find_get_task_by_vpid(pid); if (!task) { ret = -ESRCH; goto out; } /* Find the mm_struct */ mm = get_task_mm(task); put_task_struct(task); if (!mm) { ret = -EINVAL; goto out; } pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx], new_order: %u, in_folio_offset: %ld\n", pid, vaddr_start, vaddr_end, new_order, in_folio_offset); mmap_read_lock(mm); /* * always increase addr by PAGE_SIZE, since we could have a PTE page * table filled with PTE-mapped THPs, each of which is distinct. */ for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) { struct vm_area_struct *vma = vma_lookup(mm, addr); struct folio_walk fw; struct folio *folio; struct address_space *mapping; unsigned int target_order = new_order; if (!vma) break; /* skip special VMA and hugetlb VMA */ if (vma_not_suitable_for_thp_split(vma)) { addr = vma->vm_end; continue; } folio = folio_walk_start(&fw, vma, addr, 0); if (!folio) continue; if (!is_transparent_hugepage(folio)) goto next; if (!folio_test_anon(folio)) { mapping = folio->mapping; target_order = max(new_order, mapping_min_folio_order(mapping)); } if (target_order >= folio_order(folio)) goto next; total++; /* * For folios with private, split_huge_page_to_list_to_order() * will try to drop it before split and then check if the folio * can be split or not. So skip the check here. */ if (!folio_test_private(folio) && folio_expected_ref_count(folio) != folio_ref_count(folio)) goto next; if (!folio_trylock(folio)) goto next; folio_get(folio); folio_walk_end(&fw, vma); if (!folio_test_anon(folio) && folio->mapping != mapping) goto unlock; if (in_folio_offset < 0 || in_folio_offset >= folio_nr_pages(folio)) { if (!split_folio_to_order(folio, target_order)) split++; } else { struct page *split_at = folio_page(folio, in_folio_offset); if (!folio_split(folio, target_order, split_at, NULL)) split++; } unlock: folio_unlock(folio); folio_put(folio); cond_resched(); continue; next: folio_walk_end(&fw, vma); cond_resched(); } mmap_read_unlock(mm); mmput(mm); pr_debug("%lu of %lu THP split\n", split, total); out: return ret; } static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, pgoff_t off_end, unsigned int new_order, long in_folio_offset) { struct file *candidate; struct address_space *mapping; pgoff_t index; int nr_pages = 1; unsigned long total = 0, split = 0; unsigned int min_order; unsigned int target_order; CLASS(filename_kernel, file)(file_path); candidate = file_open_name(file, O_RDONLY, 0); if (IS_ERR(candidate)) return -EINVAL; pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx], new_order: %u, in_folio_offset: %ld\n", file_path, off_start, off_end, new_order, in_folio_offset); mapping = candidate->f_mapping; min_order = mapping_min_folio_order(mapping); target_order = max(new_order, min_order); for (index = off_start; index < off_end; index += nr_pages) { struct folio *folio = filemap_get_folio(mapping, index); nr_pages = 1; if (IS_ERR(folio)) continue; if (!folio_test_large(folio)) goto next; total++; nr_pages = folio_nr_pages(folio); if (target_order >= folio_order(folio)) goto next; if (!folio_trylock(folio)) goto next; if (folio->mapping != mapping) goto unlock; if (in_folio_offset < 0 || in_folio_offset >= nr_pages) { if (!split_folio_to_order(folio, target_order)) split++; } else { struct page *split_at = folio_page(folio, in_folio_offset); if (!folio_split(folio, target_order, split_at, NULL)) split++; } unlock: folio_unlock(folio); next: folio_put(folio); cond_resched(); } filp_close(candidate, NULL); 
pr_debug("%lu of %lu file-backed THP split\n", split, total); return 0; } #define MAX_INPUT_BUF_SZ 255 static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, size_t count, loff_t *ppops) { static DEFINE_MUTEX(split_debug_mutex); ssize_t ret; /* * hold pid, start_vaddr, end_vaddr, new_order or * file_path, off_start, off_end, new_order */ char input_buf[MAX_INPUT_BUF_SZ]; int pid; unsigned long vaddr_start, vaddr_end; unsigned int new_order = 0; long in_folio_offset = -1; ret = mutex_lock_interruptible(&split_debug_mutex); if (ret) return ret; ret = -EFAULT; memset(input_buf, 0, MAX_INPUT_BUF_SZ); if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ))) goto out; input_buf[MAX_INPUT_BUF_SZ - 1] = '\0'; if (input_buf[0] == '/') { char *tok; char *tok_buf = input_buf; char file_path[MAX_INPUT_BUF_SZ]; pgoff_t off_start = 0, off_end = 0; size_t input_len = strlen(input_buf); tok = strsep(&tok_buf, ","); if (tok && tok_buf) { strscpy(file_path, tok); } else { ret = -EINVAL; goto out; } ret = sscanf(tok_buf, "0x%lx,0x%lx,%d,%ld", &off_start, &off_end, &new_order, &in_folio_offset); if (ret != 2 && ret != 3 && ret != 4) { ret = -EINVAL; goto out; } ret = split_huge_pages_in_file(file_path, off_start, off_end, new_order, in_folio_offset); if (!ret) ret = input_len; goto out; } ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d,%ld", &pid, &vaddr_start, &vaddr_end, &new_order, &in_folio_offset); if (ret == 1 && pid == 1) { split_huge_pages_all(); ret = strlen(input_buf); goto out; } else if (ret != 3 && ret != 4 && ret != 5) { ret = -EINVAL; goto out; } ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order, in_folio_offset); if (!ret) ret = strlen(input_buf); out: mutex_unlock(&split_debug_mutex); return ret; } static const struct file_operations split_huge_pages_fops = { .owner = THIS_MODULE, .write = split_huge_pages_write, }; static int __init split_huge_pages_debugfs(void) { debugfs_create_file("split_huge_pages", 0200, NULL, NULL, &split_huge_pages_fops); return 0; } late_initcall(split_huge_pages_debugfs); #endif #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, struct page *page) { struct folio *folio = page_folio(page); struct vm_area_struct *vma = pvmw->vma; struct mm_struct *mm = vma->vm_mm; unsigned long address = pvmw->address; bool anon_exclusive; pmd_t pmdval; swp_entry_t entry; pmd_t pmdswp; if (!(pvmw->pmd && !pvmw->pte)) return 0; flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); if (unlikely(!pmd_present(*pvmw->pmd))) pmdval = pmdp_huge_get_and_clear(vma->vm_mm, address, pvmw->pmd); else pmdval = pmdp_invalidate(vma, address, pvmw->pmd); /* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. 
*/ anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page); if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) { set_pmd_at(mm, address, pvmw->pmd, pmdval); return -EBUSY; } if (pmd_dirty(pmdval)) folio_mark_dirty(folio); if (pmd_write(pmdval)) entry = make_writable_migration_entry(page_to_pfn(page)); else if (anon_exclusive) entry = make_readable_exclusive_migration_entry(page_to_pfn(page)); else entry = make_readable_migration_entry(page_to_pfn(page)); if (pmd_young(pmdval)) entry = make_migration_entry_young(entry); if (pmd_dirty(pmdval)) entry = make_migration_entry_dirty(entry); pmdswp = swp_entry_to_pmd(entry); if (pmd_soft_dirty(pmdval)) pmdswp = pmd_swp_mksoft_dirty(pmdswp); if (pmd_uffd_wp(pmdval)) pmdswp = pmd_swp_mkuffd_wp(pmdswp); set_pmd_at(mm, address, pvmw->pmd, pmdswp); folio_remove_rmap_pmd(folio, page, vma); folio_put(folio); trace_set_migration_pmd(address, pmd_val(pmdswp)); return 0; } void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) { struct folio *folio = page_folio(new); struct vm_area_struct *vma = pvmw->vma; struct mm_struct *mm = vma->vm_mm; unsigned long address = pvmw->address; unsigned long haddr = address & HPAGE_PMD_MASK; pmd_t pmde; softleaf_t entry; if (!(pvmw->pmd && !pvmw->pte)) return; entry = softleaf_from_pmd(*pvmw->pmd); folio_get(folio); pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot)); if (pmd_swp_soft_dirty(*pvmw->pmd)) pmde = pmd_mksoft_dirty(pmde); if (softleaf_is_migration_write(entry)) pmde = pmd_mkwrite(pmde, vma); if (pmd_swp_uffd_wp(*pvmw->pmd)) pmde = pmd_mkuffd_wp(pmde); if (!softleaf_is_migration_young(entry)) pmde = pmd_mkold(pmde); /* NOTE: this may contain setting soft-dirty on some archs */ if (folio_test_dirty(folio) && softleaf_is_migration_dirty(entry)) pmde = pmd_mkdirty(pmde); if (folio_is_device_private(folio)) { swp_entry_t entry; if (pmd_write(pmde)) entry = make_writable_device_private_entry( page_to_pfn(new)); else entry = make_readable_device_private_entry( page_to_pfn(new)); pmde = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*pvmw->pmd)) pmde = pmd_swp_mksoft_dirty(pmde); if (pmd_swp_uffd_wp(*pvmw->pmd)) pmde = pmd_swp_mkuffd_wp(pmde); } if (folio_test_anon(folio)) { rmap_t rmap_flags = RMAP_NONE; if (!softleaf_is_migration_read(entry)) rmap_flags |= RMAP_EXCLUSIVE; folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags); } else { folio_add_file_rmap_pmd(folio, new, vma); } VM_BUG_ON(pmd_write(pmde) && folio_test_anon(folio) && !PageAnonExclusive(new)); set_pmd_at(mm, haddr, pvmw->pmd, pmde); /* No need to invalidate - it was non-present before */ update_mmu_cache_pmd(vma, address, pvmw->pmd); trace_remove_migration_pmd(address, pmd_val(pmde)); } #endif |
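/*
 * Illustrative sketch only, not part of huge_memory.c above: how a
 * kernel-internal caller might honour the documented prerequisites of the
 * split API (caller-held reference, folio lock, mapping min-order, tolerance
 * of -EAGAIN). The helper name try_split_large_folio() is hypothetical.
 */
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/huge_mm.h>

static int try_split_large_folio(struct folio *folio)
{
	unsigned int min_order;
	int ret;

	/* Prerequisite 1 (assumed): the caller already holds a reference. */
	if (!folio_test_large(folio))
		return 0;			/* nothing to split */

	/* Respect the mapping's minimum folio order for file-backed folios. */
	min_order = min_order_for_split(folio);

	/* Prerequisite 2: the folio must be locked across the split. */
	if (!folio_trylock(folio))
		return -EAGAIN;

	/* @list == NULL: after-split tail folios are put back on the LRU. */
	ret = split_huge_page_to_list_to_order(&folio->page, NULL, min_order);

	/* On success the head folio stays locked and keeps our reference. */
	folio_unlock(folio);

	return ret;	/* 0, or -EAGAIN/-EBUSY/-EINVAL as documented above */
}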
| 22 9 6 5 1 21 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. NET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the Ethernet handlers. * * Version: @(#)eth.h 1.0.4 05/13/93 * * Authors: Ross Biro * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * * Relocated to include/linux where it belongs by Alan Cox * <gw4pts@gw4pts.ampr.org> */ #ifndef _LINUX_ETHERDEVICE_H #define _LINUX_ETHERDEVICE_H #include <linux/if_ether.h> #include <linux/netdevice.h> #include <linux/random.h> #include <linux/crc32.h> #include <linux/unaligned.h> #include <asm/bitsperlong.h> #ifdef __KERNEL__ struct device; struct fwnode_handle; int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr); int platform_get_ethdev_address(struct device *dev, struct net_device *netdev); unsigned char *arch_get_platform_mac_address(void); int nvmem_get_mac_address(struct device *dev, void *addrbuf); int device_get_mac_address(struct device *dev, char *addr); int device_get_ethdev_address(struct device *dev, struct net_device *netdev); int fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr); u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len); __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); extern const struct header_ops eth_header_ops; int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned len); int eth_header_parse(const struct sk_buff *skb, const struct net_device *dev, unsigned char *haddr); int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 type); void eth_header_cache_update(struct hh_cache *hh, const struct net_device *dev, const unsigned char *haddr); __be16 eth_header_parse_protocol(const struct sk_buff *skb); int eth_prepare_mac_addr_change(struct net_device *dev, void *p); void eth_commit_mac_addr_change(struct net_device *dev, void *p); int eth_mac_addr(struct net_device *dev, void *p); int eth_validate_addr(struct net_device *dev); struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, unsigned int rxqs); #define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1) #define alloc_etherdev_mq(sizeof_priv, count) alloc_etherdev_mqs(sizeof_priv, count, count) struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv, unsigned int txqs, unsigned int rxqs); #define devm_alloc_etherdev(dev, sizeof_priv) devm_alloc_etherdev_mqs(dev, sizeof_priv, 1, 1) struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb); int eth_gro_complete(struct sk_buff *skb, int nhoff); /* Reserved Ethernet Addresses per IEEE 802.1Q */ static const u8 eth_reserved_addr_base[ETH_ALEN] __aligned(2) = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; #define eth_stp_addr eth_reserved_addr_base static const u8 eth_ipv4_mcast_addr_base[ETH_ALEN] __aligned(2) = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x00 }; static const u8 eth_ipv6_mcast_addr_base[ETH_ALEN] __aligned(2) = { 0x33, 0x33, 0x00, 0x00, 0x00, 0x00 }; /** * is_link_local_ether_addr - Determine if given Ethernet address is link-local * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if address is link local reserved addr (01:80:c2:00:00:0X) per * IEEE 802.1Q 8.6.3 Frame filtering. * * Please note: addr must be aligned to u16. 
*/ static inline bool is_link_local_ether_addr(const u8 *addr) { __be16 *a = (__be16 *)addr; static const __be16 *b = (const __be16 *)eth_reserved_addr_base; static const __be16 m = cpu_to_be16(0xfff0); #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return (((*(const u32 *)addr) ^ (*(const u32 *)b)) | (__force int)((a[2] ^ b[2]) & m)) == 0; #else return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | ((a[2] ^ b[2]) & m)) == 0; #endif } /** * is_zero_ether_addr - Determine if give Ethernet address is all zeros. * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is all zeroes. * * Please note: addr must be aligned to u16. */ static inline bool is_zero_ether_addr(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return ((*(const u32 *)addr) | (*(const u16 *)(addr + 4))) == 0; #else return (*(const u16 *)(addr + 0) | *(const u16 *)(addr + 2) | *(const u16 *)(addr + 4)) == 0; #endif } /** * is_multicast_ether_addr - Determine if the Ethernet address is a multicast. * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is a multicast address. * By definition the broadcast address is also a multicast address. */ static inline bool is_multicast_ether_addr(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) u32 a = *(const u32 *)addr; #else u16 a = *(const u16 *)addr; #endif #ifdef __BIG_ENDIAN return 0x01 & (a >> ((sizeof(a) * 8) - 8)); #else return 0x01 & a; #endif } static inline bool is_multicast_ether_addr_64bits(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 #ifdef __BIG_ENDIAN return 0x01 & ((*(const u64 *)addr) >> 56); #else return 0x01 & (*(const u64 *)addr); #endif #else return is_multicast_ether_addr(addr); #endif } /** * is_local_ether_addr - Determine if the Ethernet address is locally-assigned one (IEEE 802). * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is a local address. */ static inline bool is_local_ether_addr(const u8 *addr) { return 0x02 & addr[0]; } /** * is_broadcast_ether_addr - Determine if the Ethernet address is broadcast * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is the broadcast address. * * Please note: addr must be aligned to u16. */ static inline bool is_broadcast_ether_addr(const u8 *addr) { return (*(const u16 *)(addr + 0) & *(const u16 *)(addr + 2) & *(const u16 *)(addr + 4)) == 0xffff; } /** * is_unicast_ether_addr - Determine if the Ethernet address is unicast * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is a unicast address. */ static inline bool is_unicast_ether_addr(const u8 *addr) { return !is_multicast_ether_addr(addr); } /** * is_valid_ether_addr - Determine if the given Ethernet address is valid * @addr: Pointer to a six-byte array containing the Ethernet address * * Check that the Ethernet address (MAC) is not 00:00:00:00:00:00, is not * a multicast address, and is not FF:FF:FF:FF:FF:FF. * * Return: true if the address is valid. * * Please note: addr must be aligned to u16. */ static inline bool is_valid_ether_addr(const u8 *addr) { /* FF:FF:FF:FF:FF:FF is a multicast address so we don't need to * explicitly check for it here. 
*/ return !is_multicast_ether_addr(addr) && !is_zero_ether_addr(addr); } /** * eth_proto_is_802_3 - Determine if a given Ethertype/length is a protocol * @proto: Ethertype/length value to be tested * * Check that the value from the Ethertype/length field is a valid Ethertype. * * Return: true if the valid is an 802.3 supported Ethertype. */ static inline bool eth_proto_is_802_3(__be16 proto) { #ifndef __BIG_ENDIAN /* if CPU is little endian mask off bits representing LSB */ proto &= htons(0xFF00); #endif /* cast both to u16 and compare since LSB can be ignored */ return (__force u16)proto >= (__force u16)htons(ETH_P_802_3_MIN); } /** * eth_random_addr - Generate software assigned random Ethernet address * @addr: Pointer to a six-byte array containing the Ethernet address * * Generate a random Ethernet address (MAC) that is not multicast * and has the local assigned bit set. */ static inline void eth_random_addr(u8 *addr) { get_random_bytes(addr, ETH_ALEN); addr[0] &= 0xfe; /* clear multicast bit */ addr[0] |= 0x02; /* set local assignment bit (IEEE802) */ } /** * eth_broadcast_addr - Assign broadcast address * @addr: Pointer to a six-byte array containing the Ethernet address * * Assign the broadcast address to the given address array. */ static inline void eth_broadcast_addr(u8 *addr) { memset(addr, 0xff, ETH_ALEN); } /** * eth_zero_addr - Assign zero address * @addr: Pointer to a six-byte array containing the Ethernet address * * Assign the zero address to the given address array. */ static inline void eth_zero_addr(u8 *addr) { memset(addr, 0x00, ETH_ALEN); } /** * eth_hw_addr_random - Generate software assigned random Ethernet and * set device flag * @dev: pointer to net_device structure * * Generate a random Ethernet address (MAC) to be used by a net device * and set addr_assign_type so the state can be read by sysfs and be * used by userspace. */ static inline void eth_hw_addr_random(struct net_device *dev) { u8 addr[ETH_ALEN]; eth_random_addr(addr); __dev_addr_set(dev, addr, ETH_ALEN); dev->addr_assign_type = NET_ADDR_RANDOM; } /** * eth_hw_addr_crc - Calculate CRC from netdev_hw_addr * @ha: pointer to hardware address * * Calculate CRC from a hardware address as basis for filter hashes. */ static inline u32 eth_hw_addr_crc(struct netdev_hw_addr *ha) { return ether_crc(ETH_ALEN, ha->addr); } /** * ether_addr_copy - Copy an Ethernet address * @dst: Pointer to a six-byte array Ethernet address destination * @src: Pointer to a six-byte array Ethernet address source * * Please note: dst & src must both be aligned to u16. */ static inline void ether_addr_copy(u8 *dst, const u8 *src) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) *(u32 *)dst = *(const u32 *)src; *(u16 *)(dst + 4) = *(const u16 *)(src + 4); #else u16 *a = (u16 *)dst; const u16 *b = (const u16 *)src; a[0] = b[0]; a[1] = b[1]; a[2] = b[2]; #endif } /** * eth_hw_addr_set - Assign Ethernet address to a net_device * @dev: pointer to net_device structure * @addr: address to assign * * Assign given address to the net_device, addr_assign_type is not changed. */ static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr) { __dev_addr_set(dev, addr, ETH_ALEN); } /** * eth_hw_addr_inherit - Copy dev_addr from another net_device * @dst: pointer to net_device to copy dev_addr to * @src: pointer to net_device to copy dev_addr from * * Copy the Ethernet address from one net_device to another along with * the address attributes (addr_assign_type). 
*/ static inline void eth_hw_addr_inherit(struct net_device *dst, struct net_device *src) { dst->addr_assign_type = src->addr_assign_type; eth_hw_addr_set(dst, src->dev_addr); } /** * ether_addr_equal - Compare two Ethernet addresses * @addr1: Pointer to a six-byte array containing the Ethernet address * @addr2: Pointer other six-byte array containing the Ethernet address * * Compare two Ethernet addresses, returns true if equal * * Please note: addr1 & addr2 must both be aligned to u16. */ static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) u32 fold = ((*(const u32 *)addr1) ^ (*(const u32 *)addr2)) | ((*(const u16 *)(addr1 + 4)) ^ (*(const u16 *)(addr2 + 4))); return fold == 0; #else const u16 *a = (const u16 *)addr1; const u16 *b = (const u16 *)addr2; return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2])) == 0; #endif } /** * ether_addr_equal_64bits - Compare two Ethernet addresses * @addr1: Pointer to an array of 8 bytes * @addr2: Pointer to an other array of 8 bytes * * Compare two Ethernet addresses, returns true if equal, false otherwise. * * The function doesn't need any conditional branches and possibly uses * word memory accesses on CPU allowing cheap unaligned memory reads. * arrays = { byte1, byte2, byte3, byte4, byte5, byte6, pad1, pad2 } * * Please note that alignment of addr1 & addr2 are only guaranteed to be 16 bits. */ static inline bool ether_addr_equal_64bits(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 u64 fold = (*(const u64 *)addr1) ^ (*(const u64 *)addr2); #ifdef __BIG_ENDIAN return (fold >> 16) == 0; #else return (fold << 16) == 0; #endif #else return ether_addr_equal(addr1, addr2); #endif } /** * ether_addr_equal_unaligned - Compare two not u16 aligned Ethernet addresses * @addr1: Pointer to a six-byte array containing the Ethernet address * @addr2: Pointer other six-byte array containing the Ethernet address * * Compare two Ethernet addresses, returns true if equal * * Please note: Use only when any Ethernet address may not be u16 aligned. */ static inline bool ether_addr_equal_unaligned(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return ether_addr_equal(addr1, addr2); #else return memcmp(addr1, addr2, ETH_ALEN) == 0; #endif } /** * ether_addr_equal_masked - Compare two Ethernet addresses with a mask * @addr1: Pointer to a six-byte array containing the 1st Ethernet address * @addr2: Pointer to a six-byte array containing the 2nd Ethernet address * @mask: Pointer to a six-byte array containing the Ethernet address bitmask * * Compare two Ethernet addresses with a mask, returns true if for every bit * set in the bitmask the equivalent bits in the ethernet addresses are equal. * Using a mask with all bits set is a slower ether_addr_equal. 
*/ static inline bool ether_addr_equal_masked(const u8 *addr1, const u8 *addr2, const u8 *mask) { int i; for (i = 0; i < ETH_ALEN; i++) { if ((addr1[i] ^ addr2[i]) & mask[i]) return false; } return true; } static inline bool ether_addr_is_ipv4_mcast(const u8 *addr) { u8 mask[ETH_ALEN] = { 0xff, 0xff, 0xff, 0x80, 0x00, 0x00 }; return ether_addr_equal_masked(addr, eth_ipv4_mcast_addr_base, mask); } static inline bool ether_addr_is_ipv6_mcast(const u8 *addr) { u8 mask[ETH_ALEN] = { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }; return ether_addr_equal_masked(addr, eth_ipv6_mcast_addr_base, mask); } static inline bool ether_addr_is_ip_mcast(const u8 *addr) { return ether_addr_is_ipv4_mcast(addr) || ether_addr_is_ipv6_mcast(addr); } /** * ether_addr_to_u64 - Convert an Ethernet address into a u64 value. * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: a u64 value of the address */ static inline u64 ether_addr_to_u64(const u8 *addr) { u64 u = 0; int i; for (i = 0; i < ETH_ALEN; i++) u = u << 8 | addr[i]; return u; } /** * u64_to_ether_addr - Convert a u64 to an Ethernet address. * @u: u64 to convert to an Ethernet MAC address * @addr: Pointer to a six-byte array to contain the Ethernet address */ static inline void u64_to_ether_addr(u64 u, u8 *addr) { int i; for (i = ETH_ALEN - 1; i >= 0; i--) { addr[i] = u & 0xff; u = u >> 8; } } /** * eth_addr_dec - Decrement the given MAC address * * @addr: Pointer to a six-byte array containing Ethernet address to decrement */ static inline void eth_addr_dec(u8 *addr) { u64 u = ether_addr_to_u64(addr); u--; u64_to_ether_addr(u, addr); } /** * eth_addr_inc() - Increment the given MAC address. * @addr: Pointer to a six-byte array containing Ethernet address to increment. */ static inline void eth_addr_inc(u8 *addr) { u64 u = ether_addr_to_u64(addr); u++; u64_to_ether_addr(u, addr); } /** * eth_addr_add() - Add (or subtract) an offset to/from the given MAC address. * * @offset: Offset to add. * @addr: Pointer to a six-byte array containing Ethernet address to increment. */ static inline void eth_addr_add(u8 *addr, long offset) { u64 u = ether_addr_to_u64(addr); u += offset; u64_to_ether_addr(u, addr); } /** * is_etherdev_addr - Tell if given Ethernet address belongs to the device. * @dev: Pointer to a device structure * @addr: Pointer to a six-byte array containing the Ethernet address * * Compare passed address with all addresses of the device. Return true if the * address if one of the device addresses. * * Note that this function calls ether_addr_equal_64bits() so take care of * the right padding. */ static inline bool is_etherdev_addr(const struct net_device *dev, const u8 addr[6 + 2]) { struct netdev_hw_addr *ha; bool res = false; rcu_read_lock(); for_each_dev_addr(dev, ha) { res = ether_addr_equal_64bits(addr, ha->addr); if (res) break; } rcu_read_unlock(); return res; } #endif /* __KERNEL__ */ /** * compare_ether_header - Compare two Ethernet headers * @a: Pointer to Ethernet header * @b: Pointer to Ethernet header * * Compare two Ethernet headers, returns 0 if equal. * This assumes that the network header (i.e., IP header) is 4-byte * aligned OR the platform can handle unaligned access. This is the * case for all packets coming into netif_receive_skb or similar * entry points. */ static inline unsigned long compare_ether_header(const void *a, const void *b) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 unsigned long fold; /* * We want to compare 14 bytes: * [a0 ... a13] ^ [b0 ... 
b13] * Use two long XOR, ORed together, with an overlap of two bytes. * [a0 a1 a2 a3 a4 a5 a6 a7 ] ^ [b0 b1 b2 b3 b4 b5 b6 b7 ] | * [a6 a7 a8 a9 a10 a11 a12 a13] ^ [b6 b7 b8 b9 b10 b11 b12 b13] * This means the [a6 a7] ^ [b6 b7] part is done two times. */ fold = *(unsigned long *)a ^ *(unsigned long *)b; fold |= *(unsigned long *)(a + 6) ^ *(unsigned long *)(b + 6); return fold; #else u32 *a32 = (u32 *)((u8 *)a + 2); u32 *b32 = (u32 *)((u8 *)b + 2); return (*(u16 *)a ^ *(u16 *)b) | (a32[0] ^ b32[0]) | (a32[1] ^ b32[1]) | (a32[2] ^ b32[2]); #endif } /** * eth_hw_addr_gen - Generate and assign Ethernet address to a port * @dev: pointer to port's net_device structure * @base_addr: base Ethernet address * @id: offset to add to the base address * * Generate a MAC address using a base address and an offset and assign it * to a net_device. Commonly used by switch drivers which need to compute * addresses for all their ports. addr_assign_type is not changed. */ static inline void eth_hw_addr_gen(struct net_device *dev, const u8 *base_addr, unsigned int id) { u64 u = ether_addr_to_u64(base_addr); u8 addr[ETH_ALEN]; u += id; u64_to_ether_addr(u, addr); eth_hw_addr_set(dev, addr); } /** * eth_skb_pkt_type - Assign packet type if destination address does not match * @skb: Assigned a packet type if address does not match @dev address * @dev: Network device used to compare packet address against * * If the destination MAC address of the packet does not match the network * device address, assign an appropriate packet type. */ static inline void eth_skb_pkt_type(struct sk_buff *skb, const struct net_device *dev) { const struct ethhdr *eth = eth_hdr(skb); if (unlikely(!ether_addr_equal_64bits(eth->h_dest, dev->dev_addr))) { if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) { if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) skb->pkt_type = PACKET_BROADCAST; else skb->pkt_type = PACKET_MULTICAST; } else { skb->pkt_type = PACKET_OTHERHOST; } } } static inline struct ethhdr *eth_skb_pull_mac(struct sk_buff *skb) { struct ethhdr *eth = (struct ethhdr *)skb->data; skb_pull_inline(skb, ETH_HLEN); return eth; } /** * eth_skb_pad - Pad buffer to minimum number of octets for Ethernet frame * @skb: Buffer to pad * * An Ethernet frame should have a minimum size of 60 bytes. This function * takes short frames and pads them with zeros up to the 60 byte limit. */ static inline int eth_skb_pad(struct sk_buff *skb) { return skb_put_padto(skb, ETH_ZLEN); } #endif /* _LINUX_ETHERDEVICE_H */ |
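/*
 * Illustrative usage sketch, not part of the header above: a hypothetical
 * switch-style driver deriving per-port MAC addresses with the helpers
 * documented above. The function name example_assign_port_addr() is an
 * assumption for illustration only.
 */
#include <linux/etherdevice.h>
#include <linux/netdevice.h>

static int example_assign_port_addr(struct net_device *ndev,
				    const u8 *base_addr, unsigned int port)
{
	/* Reject all-zero, multicast and broadcast base addresses up front. */
	if (!is_valid_ether_addr(base_addr))
		return -EINVAL;

	/* dev_addr = base_addr + port; addr_assign_type is left unchanged. */
	eth_hw_addr_gen(ndev, base_addr, port);

	/* Fall back to a random locally administered address if needed. */
	if (!is_valid_ether_addr(ndev->dev_addr))
		eth_hw_addr_random(ndev);

	return 0;
}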
| 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 | // SPDX-License-Identifier: GPL-2.0 /* * kmod - the kernel module loader * * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org> */ #include <linux/module.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/binfmts.h> #include <linux/syscalls.h> #include <linux/unistd.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/completion.h> #include <linux/cred.h> #include <linux/file.h> #include <linux/workqueue.h> #include <linux/security.h> #include <linux/mount.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/resource.h> #include <linux/notifier.h> #include <linux/suspend.h> #include <linux/rwsem.h> #include <linux/ptrace.h> #include <linux/async.h> #include <linux/uaccess.h> #include <trace/events/module.h> #include "internal.h" /* * Assuming: * * threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE, * (u64) THREAD_SIZE * 8UL); * * If you need less than 50 threads would mean we're dealing with systems * smaller than 3200 pages. This assumes you are capable of having ~13M memory, * and this would only be an upper limit, after which the OOM killer would take * effect. Systems like these are very unlikely if modules are enabled. */ #define MAX_KMOD_CONCURRENT 50 static DEFINE_SEMAPHORE(kmod_concurrent_max, MAX_KMOD_CONCURRENT); /* * This is a restriction on having *all* MAX_KMOD_CONCURRENT threads * running at the same time without returning. When this happens we * believe you've somehow ended up with a recursive module dependency * creating a loop. * * We have no option but to fail. * * Userspace should proactively try to detect and prevent these. */ #define MAX_KMOD_ALL_BUSY_TIMEOUT 5 /* modprobe_path is set via /proc/sys. 
*/ char modprobe_path[KMOD_PATH_LEN] = CONFIG_MODPROBE_PATH; static void free_modprobe_argv(struct subprocess_info *info) { kfree(info->argv[3]); /* check call_modprobe() */ kfree(info->argv); } static int call_modprobe(char *orig_module_name, int wait) { struct subprocess_info *info; static char *envp[] = { "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL }; char *module_name; int ret; char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL); if (!argv) goto out; module_name = kstrdup(orig_module_name, GFP_KERNEL); if (!module_name) goto free_argv; argv[0] = modprobe_path; argv[1] = "-q"; argv[2] = "--"; argv[3] = module_name; /* check free_modprobe_argv() */ argv[4] = NULL; info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL, NULL, free_modprobe_argv, NULL); if (!info) goto free_module_name; ret = call_usermodehelper_exec(info, wait | UMH_KILLABLE); kmod_dup_request_announce(orig_module_name, ret); return ret; free_module_name: kfree(module_name); free_argv: kfree(argv); out: kmod_dup_request_announce(orig_module_name, -ENOMEM); return -ENOMEM; } /** * __request_module - try to load a kernel module * @wait: wait (or not) for the operation to complete * @fmt: printf style format string for the name of the module * @...: arguments as specified in the format string * * Load a module using the user mode module loader. The function returns * zero on success or a negative errno code or positive exit code from * "modprobe" on failure. Note that a successful module load does not mean * the module did not then unload and exit on an error of its own. Callers * must check that the service they requested is now available not blindly * invoke it. * * If module auto-loading support is disabled then this function * simply returns -ENOENT. */ int __request_module(bool wait, const char *fmt, ...) { va_list args; char module_name[MODULE_NAME_LEN]; int ret, dup_ret; /* * We don't allow synchronous module loading from async. Module * init may invoke async_synchronize_full() which will end up * waiting for this task which already is waiting for the module * loading to complete, leading to a deadlock. */ WARN_ON_ONCE(wait && current_is_async()); if (!modprobe_path[0]) return -ENOENT; va_start(args, fmt); ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); va_end(args); if (ret >= MODULE_NAME_LEN) return -ENAMETOOLONG; ret = security_kernel_module_request(module_name); if (ret) return ret; ret = down_timeout(&kmod_concurrent_max, MAX_KMOD_ALL_BUSY_TIMEOUT * HZ); if (ret) { pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now", module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT); return ret; } trace_module_request(module_name, wait, _RET_IP_); if (kmod_dup_request_exists_wait(module_name, wait, &dup_ret)) { ret = dup_ret; goto out; } ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); out: up(&kmod_concurrent_max); return ret; } EXPORT_SYMBOL(__request_module); |
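/*
 * Illustrative caller sketch, not part of kmod.c above: treat the
 * request_module() return value as advisory and re-check the requested
 * capability afterwards, as the __request_module() comment requires.
 * example_capability_present() is a hypothetical stand-in for whatever
 * registry or feature probe the real caller would consult.
 */
#include <linux/kmod.h>
#include <linux/types.h>

bool example_capability_present(const char *name);	/* assumed, defined elsewhere */

static bool example_ensure_capability(const char *name)
{
	if (example_capability_present(name))
		return true;

	/* A zero return does not guarantee the module is (still) loaded. */
	if (request_module("example-%s", name) != 0)
		return false;

	/* Re-check instead of blindly invoking the service. */
	return example_capability_present(name);
}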
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * kernel/workqueue_internal.h
 *
 * Workqueue internal header file. Only to be included by workqueue and
 * core kernel subsystems.
 */
#ifndef _KERNEL_WORKQUEUE_INTERNAL_H
#define _KERNEL_WORKQUEUE_INTERNAL_H

#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/preempt.h>

struct worker_pool;

/*
 * The poor guys doing the actual heavy lifting. All on-duty workers are
 * either serving the manager role, on idle list or on busy hash. For
 * details on the locking annotation (L, I, X...), refer to workqueue.c.
 *
 * Only to be used in workqueue and async.
 */
struct worker {
	/* on idle list while idle, on busy hash table while busy */
	union {
		struct list_head	entry;	/* L: while idle */
		struct hlist_node	hentry;	/* L: while busy */
	};

	struct work_struct	*current_work;	/* K: work being processed and its */
	work_func_t		current_func;	/* K: function */
	struct pool_workqueue	*current_pwq;	/* K: pwq */
	u64			current_at;	/* K: runtime at start or last wakeup */
	unsigned long		current_start;	/* K: start time of current work item */
	unsigned int		current_color;	/* K: color */

	int			sleeping;	/* S: is worker sleeping? */

	/* used by the scheduler to determine a worker's last known identity */
	work_func_t		last_func;	/* K: last work's fn */

	struct list_head	scheduled;	/* L: scheduled works */

	struct task_struct	*task;		/* I: worker task */
	struct worker_pool	*pool;		/* A: the associated pool */
						/* L: for rescuers */
	struct list_head	node;		/* A: anchored at pool->workers */
						/* A: runs through worker->node */

	unsigned long		last_active;	/* K: last active timestamp */
	unsigned int		flags;		/* L: flags */
	int			id;		/* I: worker id */

	/*
	 * Opaque string set with work_set_desc(). Printed out with task
	 * dump for debugging - WARN, BUG, panic or sysrq.
	 */
	char			desc[WORKER_DESC_LEN];

	/* used only by rescuers to point to the target workqueue */
	struct workqueue_struct	*rescue_wq;	/* I: the workqueue to rescue */
};

/**
 * current_wq_worker - return struct worker if %current is a workqueue worker
 */
static inline struct worker *current_wq_worker(void)
{
	if (in_task() && (current->flags & PF_WQ_WORKER))
		return kthread_data(current);
	return NULL;
}

/*
 * Scheduler hooks for concurrency managed workqueue. Only to be used from
 * sched/ and workqueue.c.
 */
void wq_worker_running(struct task_struct *task);
void wq_worker_sleeping(struct task_struct *task);
void wq_worker_tick(struct task_struct *task);
work_func_t wq_worker_last_func(struct task_struct *task);

#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
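/*
 * Illustrative sketch only, not part of workqueue_internal.h above: how core
 * code that is permitted to include this header might report what %current
 * is running. The helper name example_report_current_worker() is
 * hypothetical.
 */
#include <linux/printk.h>

static inline bool example_report_current_worker(void)
{
	struct worker *worker = current_wq_worker();

	if (!worker)
		return false;	/* %current is not a workqueue worker */

	pr_info("worker %d running %ps (desc: %s)\n",
		worker->id, worker->current_func, worker->desc);
	return true;
}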
898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 | // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/file.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/slab.h> /* * Mapping table from "enum tomoyo_path_acl_index" to "enum tomoyo_mac_index". */ static const u8 tomoyo_p2mac[TOMOYO_MAX_PATH_OPERATION] = { [TOMOYO_TYPE_EXECUTE] = TOMOYO_MAC_FILE_EXECUTE, [TOMOYO_TYPE_READ] = TOMOYO_MAC_FILE_OPEN, [TOMOYO_TYPE_WRITE] = TOMOYO_MAC_FILE_OPEN, [TOMOYO_TYPE_APPEND] = TOMOYO_MAC_FILE_OPEN, [TOMOYO_TYPE_UNLINK] = TOMOYO_MAC_FILE_UNLINK, [TOMOYO_TYPE_GETATTR] = TOMOYO_MAC_FILE_GETATTR, [TOMOYO_TYPE_RMDIR] = TOMOYO_MAC_FILE_RMDIR, [TOMOYO_TYPE_TRUNCATE] = TOMOYO_MAC_FILE_TRUNCATE, [TOMOYO_TYPE_SYMLINK] = TOMOYO_MAC_FILE_SYMLINK, [TOMOYO_TYPE_CHROOT] = TOMOYO_MAC_FILE_CHROOT, [TOMOYO_TYPE_UMOUNT] = TOMOYO_MAC_FILE_UMOUNT, }; /* * Mapping table from "enum tomoyo_mkdev_acl_index" to "enum tomoyo_mac_index". */ const u8 tomoyo_pnnn2mac[TOMOYO_MAX_MKDEV_OPERATION] = { [TOMOYO_TYPE_MKBLOCK] = TOMOYO_MAC_FILE_MKBLOCK, [TOMOYO_TYPE_MKCHAR] = TOMOYO_MAC_FILE_MKCHAR, }; /* * Mapping table from "enum tomoyo_path2_acl_index" to "enum tomoyo_mac_index". */ const u8 tomoyo_pp2mac[TOMOYO_MAX_PATH2_OPERATION] = { [TOMOYO_TYPE_LINK] = TOMOYO_MAC_FILE_LINK, [TOMOYO_TYPE_RENAME] = TOMOYO_MAC_FILE_RENAME, [TOMOYO_TYPE_PIVOT_ROOT] = TOMOYO_MAC_FILE_PIVOT_ROOT, }; /* * Mapping table from "enum tomoyo_path_number_acl_index" to * "enum tomoyo_mac_index". */ const u8 tomoyo_pn2mac[TOMOYO_MAX_PATH_NUMBER_OPERATION] = { [TOMOYO_TYPE_CREATE] = TOMOYO_MAC_FILE_CREATE, [TOMOYO_TYPE_MKDIR] = TOMOYO_MAC_FILE_MKDIR, [TOMOYO_TYPE_MKFIFO] = TOMOYO_MAC_FILE_MKFIFO, [TOMOYO_TYPE_MKSOCK] = TOMOYO_MAC_FILE_MKSOCK, [TOMOYO_TYPE_IOCTL] = TOMOYO_MAC_FILE_IOCTL, [TOMOYO_TYPE_CHMOD] = TOMOYO_MAC_FILE_CHMOD, [TOMOYO_TYPE_CHOWN] = TOMOYO_MAC_FILE_CHOWN, [TOMOYO_TYPE_CHGRP] = TOMOYO_MAC_FILE_CHGRP, }; /** * tomoyo_put_name_union - Drop reference on "struct tomoyo_name_union". * * @ptr: Pointer to "struct tomoyo_name_union". * * Returns nothing. */ void tomoyo_put_name_union(struct tomoyo_name_union *ptr) { tomoyo_put_group(ptr->group); tomoyo_put_name(ptr->filename); } /** * tomoyo_compare_name_union - Check whether a name matches "struct tomoyo_name_union" or not. * * @name: Pointer to "struct tomoyo_path_info". * @ptr: Pointer to "struct tomoyo_name_union". * * Returns "struct tomoyo_path_info" if @name matches @ptr, NULL otherwise. */ const struct tomoyo_path_info * tomoyo_compare_name_union(const struct tomoyo_path_info *name, const struct tomoyo_name_union *ptr) { if (ptr->group) return tomoyo_path_matches_group(name, ptr->group); if (tomoyo_path_matches_pattern(name, ptr->filename)) return ptr->filename; return NULL; } /** * tomoyo_put_number_union - Drop reference on "struct tomoyo_number_union". * * @ptr: Pointer to "struct tomoyo_number_union". * * Returns nothing. 
*/ void tomoyo_put_number_union(struct tomoyo_number_union *ptr) { tomoyo_put_group(ptr->group); } /** * tomoyo_compare_number_union - Check whether a value matches "struct tomoyo_number_union" or not. * * @value: Number to check. * @ptr: Pointer to "struct tomoyo_number_union". * * Returns true if @value matches @ptr, false otherwise. */ bool tomoyo_compare_number_union(const unsigned long value, const struct tomoyo_number_union *ptr) { if (ptr->group) return tomoyo_number_matches_group(value, value, ptr->group); return value >= ptr->values[0] && value <= ptr->values[1]; } /** * tomoyo_add_slash - Add trailing '/' if needed. * * @buf: Pointer to "struct tomoyo_path_info". * * Returns nothing. * * @buf must be generated by tomoyo_encode() because this function does not * allocate memory for adding '/'. */ static void tomoyo_add_slash(struct tomoyo_path_info *buf) { if (buf->is_dir) return; /* * This is OK because tomoyo_encode() reserves space for appending "/". */ strcat((char *) buf->name, "/"); tomoyo_fill_path_info(buf); } /** * tomoyo_get_realpath - Get realpath. * * @buf: Pointer to "struct tomoyo_path_info". * @path: Pointer to "struct path". * * Returns true on success, false otherwise. */ static bool tomoyo_get_realpath(struct tomoyo_path_info *buf, const struct path *path) { buf->name = tomoyo_realpath_from_path(path); if (buf->name) { tomoyo_fill_path_info(buf); return true; } return false; } /** * tomoyo_audit_path_log - Audit path request log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_path_log(struct tomoyo_request_info *r) __must_hold_shared(&tomoyo_ss) { return tomoyo_supervisor(r, "file %s %s\n", tomoyo_path_keyword [r->param.path.operation], r->param.path.filename->name); } /** * tomoyo_audit_path2_log - Audit path/path request log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_path2_log(struct tomoyo_request_info *r) __must_hold_shared(&tomoyo_ss) { return tomoyo_supervisor(r, "file %s %s %s\n", tomoyo_mac_keywords [tomoyo_pp2mac[r->param.path2.operation]], r->param.path2.filename1->name, r->param.path2.filename2->name); } /** * tomoyo_audit_mkdev_log - Audit path/number/number/number request log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_mkdev_log(struct tomoyo_request_info *r) __must_hold_shared(&tomoyo_ss) { return tomoyo_supervisor(r, "file %s %s 0%o %u %u\n", tomoyo_mac_keywords [tomoyo_pnnn2mac[r->param.mkdev.operation]], r->param.mkdev.filename->name, r->param.mkdev.mode, r->param.mkdev.major, r->param.mkdev.minor); } /** * tomoyo_audit_path_number_log - Audit path/number request log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. 
*/ static int tomoyo_audit_path_number_log(struct tomoyo_request_info *r) __must_hold_shared(&tomoyo_ss) { const u8 type = r->param.path_number.operation; u8 radix; char buffer[64]; switch (type) { case TOMOYO_TYPE_CREATE: case TOMOYO_TYPE_MKDIR: case TOMOYO_TYPE_MKFIFO: case TOMOYO_TYPE_MKSOCK: case TOMOYO_TYPE_CHMOD: radix = TOMOYO_VALUE_TYPE_OCTAL; break; case TOMOYO_TYPE_IOCTL: radix = TOMOYO_VALUE_TYPE_HEXADECIMAL; break; default: radix = TOMOYO_VALUE_TYPE_DECIMAL; break; } tomoyo_print_ulong(buffer, sizeof(buffer), r->param.path_number.number, radix); return tomoyo_supervisor(r, "file %s %s %s\n", tomoyo_mac_keywords [tomoyo_pn2mac[type]], r->param.path_number.filename->name, buffer); } /** * tomoyo_check_path_acl - Check permission for path operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. * * To be able to use wildcard for domain transition, this function sets * matching entry on success. Since the caller holds tomoyo_read_lock(), * it is safe to set matching entry. */ static bool tomoyo_check_path_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_path_acl *acl = container_of(ptr, typeof(*acl), head); if (acl->perm & (1 << r->param.path.operation)) { r->param.path.matched_path = tomoyo_compare_name_union(r->param.path.filename, &acl->name); return r->param.path.matched_path != NULL; } return false; } /** * tomoyo_check_path_number_acl - Check permission for path number operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_path_number_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_path_number_acl *acl = container_of(ptr, typeof(*acl), head); return (acl->perm & (1 << r->param.path_number.operation)) && tomoyo_compare_number_union(r->param.path_number.number, &acl->number) && tomoyo_compare_name_union(r->param.path_number.filename, &acl->name); } /** * tomoyo_check_path2_acl - Check permission for path path operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_path2_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_path2_acl *acl = container_of(ptr, typeof(*acl), head); return (acl->perm & (1 << r->param.path2.operation)) && tomoyo_compare_name_union(r->param.path2.filename1, &acl->name1) && tomoyo_compare_name_union(r->param.path2.filename2, &acl->name2); } /** * tomoyo_check_mkdev_acl - Check permission for path number number number operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_mkdev_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_mkdev_acl *acl = container_of(ptr, typeof(*acl), head); return (acl->perm & (1 << r->param.mkdev.operation)) && tomoyo_compare_number_union(r->param.mkdev.mode, &acl->mode) && tomoyo_compare_number_union(r->param.mkdev.major, &acl->major) && tomoyo_compare_number_union(r->param.mkdev.minor, &acl->minor) && tomoyo_compare_name_union(r->param.mkdev.filename, &acl->name); } /** * tomoyo_same_path_acl - Check for duplicated "struct tomoyo_path_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". 
* @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b except permission bits, false otherwise. */ static bool tomoyo_same_path_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_path_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_path_acl *p2 = container_of(b, typeof(*p2), head); return tomoyo_same_name_union(&p1->name, &p2->name); } /** * tomoyo_merge_path_acl - Merge duplicated "struct tomoyo_path_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * @is_delete: True for @a &= ~@b, false for @a |= @b. * * Returns true if @a is empty, false otherwise. */ static bool tomoyo_merge_path_acl(struct tomoyo_acl_info *a, struct tomoyo_acl_info *b, const bool is_delete) { u16 * const a_perm = &container_of(a, struct tomoyo_path_acl, head) ->perm; u16 perm = READ_ONCE(*a_perm); const u16 b_perm = container_of(b, struct tomoyo_path_acl, head)->perm; if (is_delete) perm &= ~b_perm; else perm |= b_perm; WRITE_ONCE(*a_perm, perm); return !perm; } /** * tomoyo_update_path_acl - Update "struct tomoyo_path_acl" list. * * @perm: Permission. * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_update_path_acl(const u16 perm, struct tomoyo_acl_param *param) { struct tomoyo_path_acl e = { .head.type = TOMOYO_TYPE_PATH_ACL, .perm = perm }; int error; if (!tomoyo_parse_name_union(param, &e.name)) error = -EINVAL; else error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_path_acl, tomoyo_merge_path_acl); tomoyo_put_name_union(&e.name); return error; } /** * tomoyo_same_mkdev_acl - Check for duplicated "struct tomoyo_mkdev_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b except permission bits, false otherwise. */ static bool tomoyo_same_mkdev_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_mkdev_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_mkdev_acl *p2 = container_of(b, typeof(*p2), head); return tomoyo_same_name_union(&p1->name, &p2->name) && tomoyo_same_number_union(&p1->mode, &p2->mode) && tomoyo_same_number_union(&p1->major, &p2->major) && tomoyo_same_number_union(&p1->minor, &p2->minor); } /** * tomoyo_merge_mkdev_acl - Merge duplicated "struct tomoyo_mkdev_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * @is_delete: True for @a &= ~@b, false for @a |= @b. * * Returns true if @a is empty, false otherwise. */ static bool tomoyo_merge_mkdev_acl(struct tomoyo_acl_info *a, struct tomoyo_acl_info *b, const bool is_delete) { u8 *const a_perm = &container_of(a, struct tomoyo_mkdev_acl, head)->perm; u8 perm = READ_ONCE(*a_perm); const u8 b_perm = container_of(b, struct tomoyo_mkdev_acl, head) ->perm; if (is_delete) perm &= ~b_perm; else perm |= b_perm; WRITE_ONCE(*a_perm, perm); return !perm; } /** * tomoyo_update_mkdev_acl - Update "struct tomoyo_mkdev_acl" list. * * @perm: Permission. * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). 
*/ static int tomoyo_update_mkdev_acl(const u8 perm, struct tomoyo_acl_param *param) { struct tomoyo_mkdev_acl e = { .head.type = TOMOYO_TYPE_MKDEV_ACL, .perm = perm }; int error; if (!tomoyo_parse_name_union(param, &e.name) || !tomoyo_parse_number_union(param, &e.mode) || !tomoyo_parse_number_union(param, &e.major) || !tomoyo_parse_number_union(param, &e.minor)) error = -EINVAL; else error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_mkdev_acl, tomoyo_merge_mkdev_acl); tomoyo_put_name_union(&e.name); tomoyo_put_number_union(&e.mode); tomoyo_put_number_union(&e.major); tomoyo_put_number_union(&e.minor); return error; } /** * tomoyo_same_path2_acl - Check for duplicated "struct tomoyo_path2_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b except permission bits, false otherwise. */ static bool tomoyo_same_path2_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_path2_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_path2_acl *p2 = container_of(b, typeof(*p2), head); return tomoyo_same_name_union(&p1->name1, &p2->name1) && tomoyo_same_name_union(&p1->name2, &p2->name2); } /** * tomoyo_merge_path2_acl - Merge duplicated "struct tomoyo_path2_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * @is_delete: True for @a &= ~@b, false for @a |= @b. * * Returns true if @a is empty, false otherwise. */ static bool tomoyo_merge_path2_acl(struct tomoyo_acl_info *a, struct tomoyo_acl_info *b, const bool is_delete) { u8 * const a_perm = &container_of(a, struct tomoyo_path2_acl, head) ->perm; u8 perm = READ_ONCE(*a_perm); const u8 b_perm = container_of(b, struct tomoyo_path2_acl, head)->perm; if (is_delete) perm &= ~b_perm; else perm |= b_perm; WRITE_ONCE(*a_perm, perm); return !perm; } /** * tomoyo_update_path2_acl - Update "struct tomoyo_path2_acl" list. * * @perm: Permission. * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_update_path2_acl(const u8 perm, struct tomoyo_acl_param *param) { struct tomoyo_path2_acl e = { .head.type = TOMOYO_TYPE_PATH2_ACL, .perm = perm }; int error; if (!tomoyo_parse_name_union(param, &e.name1) || !tomoyo_parse_name_union(param, &e.name2)) error = -EINVAL; else error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_path2_acl, tomoyo_merge_path2_acl); tomoyo_put_name_union(&e.name1); tomoyo_put_name_union(&e.name2); return error; } /** * tomoyo_path_permission - Check permission for single path operation. * * @r: Pointer to "struct tomoyo_request_info". * @operation: Type of operation. * @filename: Filename to check. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_path_permission(struct tomoyo_request_info *r, u8 operation, const struct tomoyo_path_info *filename) __must_hold_shared(&tomoyo_ss) { int error; r->type = tomoyo_p2mac[operation]; r->mode = tomoyo_get_mode(r->domain->ns, r->profile, r->type); if (r->mode == TOMOYO_CONFIG_DISABLED) return 0; r->param_type = TOMOYO_TYPE_PATH_ACL; r->param.path.filename = filename; r->param.path.operation = operation; do { tomoyo_check_acl(r, tomoyo_check_path_acl); error = tomoyo_audit_path_log(r); } while (error == TOMOYO_RETRY_REQUEST); return error; } /** * tomoyo_execute_permission - Check permission for execute operation. 
* * @r: Pointer to "struct tomoyo_request_info". * @filename: Filename to check. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ int tomoyo_execute_permission(struct tomoyo_request_info *r, const struct tomoyo_path_info *filename) { /* * Unlike other permission checks, this check is done regardless of * profile mode settings in order to check for domain transition * preference. */ r->type = TOMOYO_MAC_FILE_EXECUTE; r->mode = tomoyo_get_mode(r->domain->ns, r->profile, r->type); r->param_type = TOMOYO_TYPE_PATH_ACL; r->param.path.filename = filename; r->param.path.operation = TOMOYO_TYPE_EXECUTE; tomoyo_check_acl(r, tomoyo_check_path_acl); r->ee->transition = r->matched_acl && r->matched_acl->cond ? r->matched_acl->cond->transit : NULL; if (r->mode != TOMOYO_CONFIG_DISABLED) return tomoyo_audit_path_log(r); return 0; } /** * tomoyo_same_path_number_acl - Check for duplicated "struct tomoyo_path_number_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b except permission bits, false otherwise. */ static bool tomoyo_same_path_number_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_path_number_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_path_number_acl *p2 = container_of(b, typeof(*p2), head); return tomoyo_same_name_union(&p1->name, &p2->name) && tomoyo_same_number_union(&p1->number, &p2->number); } /** * tomoyo_merge_path_number_acl - Merge duplicated "struct tomoyo_path_number_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * @is_delete: True for @a &= ~@b, false for @a |= @b. * * Returns true if @a is empty, false otherwise. */ static bool tomoyo_merge_path_number_acl(struct tomoyo_acl_info *a, struct tomoyo_acl_info *b, const bool is_delete) { u8 * const a_perm = &container_of(a, struct tomoyo_path_number_acl, head)->perm; u8 perm = READ_ONCE(*a_perm); const u8 b_perm = container_of(b, struct tomoyo_path_number_acl, head) ->perm; if (is_delete) perm &= ~b_perm; else perm |= b_perm; WRITE_ONCE(*a_perm, perm); return !perm; } /** * tomoyo_update_path_number_acl - Update ioctl/chmod/chown/chgrp ACL. * * @perm: Permission. * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_update_path_number_acl(const u8 perm, struct tomoyo_acl_param *param) { struct tomoyo_path_number_acl e = { .head.type = TOMOYO_TYPE_PATH_NUMBER_ACL, .perm = perm }; int error; if (!tomoyo_parse_name_union(param, &e.name) || !tomoyo_parse_number_union(param, &e.number)) error = -EINVAL; else error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_path_number_acl, tomoyo_merge_path_number_acl); tomoyo_put_name_union(&e.name); tomoyo_put_number_union(&e.number); return error; } /** * tomoyo_path_number_perm - Check permission for "create", "mkdir", "mkfifo", "mksock", "ioctl", "chmod", "chown", "chgrp". * * @type: Type of operation. * @path: Pointer to "struct path". * @number: Number. * * Returns 0 on success, negative value otherwise. 
*/ int tomoyo_path_number_perm(const u8 type, const struct path *path, unsigned long number) { struct tomoyo_request_info r; struct tomoyo_obj_info obj = { .path1 = { .mnt = path->mnt, .dentry = path->dentry }, }; int error = -ENOMEM; struct tomoyo_path_info buf; int idx; if (tomoyo_init_request_info(&r, NULL, tomoyo_pn2mac[type]) == TOMOYO_CONFIG_DISABLED) return 0; idx = tomoyo_read_lock(); if (!tomoyo_get_realpath(&buf, path)) goto out; r.obj = &obj; if (type == TOMOYO_TYPE_MKDIR) tomoyo_add_slash(&buf); r.param_type = TOMOYO_TYPE_PATH_NUMBER_ACL; r.param.path_number.operation = type; r.param.path_number.filename = &buf; r.param.path_number.number = number; do { tomoyo_check_acl(&r, tomoyo_check_path_number_acl); error = tomoyo_audit_path_number_log(&r); } while (error == TOMOYO_RETRY_REQUEST); kfree(buf.name); out: tomoyo_read_unlock(idx); if (r.mode != TOMOYO_CONFIG_ENFORCING) error = 0; return error; } /** * tomoyo_check_open_permission - Check permission for "read" and "write". * * @domain: Pointer to "struct tomoyo_domain_info". * @path: Pointer to "struct path". * @flag: Flags for open(). * * Returns 0 on success, negative value otherwise. */ int tomoyo_check_open_permission(struct tomoyo_domain_info *domain, const struct path *path, const int flag) { const u8 acc_mode = ACC_MODE(flag); int error = 0; struct tomoyo_path_info buf; struct tomoyo_request_info r; struct tomoyo_obj_info obj = { .path1 = { .mnt = path->mnt, .dentry = path->dentry }, }; int idx; buf.name = NULL; r.mode = TOMOYO_CONFIG_DISABLED; idx = tomoyo_read_lock(); if (acc_mode && tomoyo_init_request_info(&r, domain, TOMOYO_MAC_FILE_OPEN) != TOMOYO_CONFIG_DISABLED) { if (!tomoyo_get_realpath(&buf, path)) { error = -ENOMEM; goto out; } r.obj = &obj; if (acc_mode & MAY_READ) error = tomoyo_path_permission(&r, TOMOYO_TYPE_READ, &buf); if (!error && (acc_mode & MAY_WRITE)) error = tomoyo_path_permission(&r, (flag & O_APPEND) ? TOMOYO_TYPE_APPEND : TOMOYO_TYPE_WRITE, &buf); } out: kfree(buf.name); tomoyo_read_unlock(idx); if (r.mode != TOMOYO_CONFIG_ENFORCING) error = 0; return error; } /** * tomoyo_path_perm - Check permission for "unlink", "rmdir", "truncate", "symlink", "append", "chroot" and "unmount". * * @operation: Type of operation. * @path: Pointer to "struct path". * @target: Symlink's target if @operation is TOMOYO_TYPE_SYMLINK, * NULL otherwise. * * Returns 0 on success, negative value otherwise. 
*/ int tomoyo_path_perm(const u8 operation, const struct path *path, const char *target) { struct tomoyo_request_info r; struct tomoyo_obj_info obj = { .path1 = { .mnt = path->mnt, .dentry = path->dentry }, }; int error; struct tomoyo_path_info buf; bool is_enforce; struct tomoyo_path_info symlink_target; int idx; if (tomoyo_init_request_info(&r, NULL, tomoyo_p2mac[operation]) == TOMOYO_CONFIG_DISABLED) return 0; is_enforce = (r.mode == TOMOYO_CONFIG_ENFORCING); error = -ENOMEM; buf.name = NULL; idx = tomoyo_read_lock(); if (!tomoyo_get_realpath(&buf, path)) goto out; r.obj = &obj; switch (operation) { case TOMOYO_TYPE_RMDIR: case TOMOYO_TYPE_CHROOT: tomoyo_add_slash(&buf); break; case TOMOYO_TYPE_SYMLINK: symlink_target.name = tomoyo_encode(target); if (!symlink_target.name) goto out; tomoyo_fill_path_info(&symlink_target); obj.symlink_target = &symlink_target; break; } error = tomoyo_path_permission(&r, operation, &buf); if (operation == TOMOYO_TYPE_SYMLINK) kfree(symlink_target.name); out: kfree(buf.name); tomoyo_read_unlock(idx); if (!is_enforce) error = 0; return error; } /** * tomoyo_mkdev_perm - Check permission for "mkblock" and "mkchar". * * @operation: Type of operation. (TOMOYO_TYPE_MKCHAR or TOMOYO_TYPE_MKBLOCK) * @path: Pointer to "struct path". * @mode: Create mode. * @dev: Device number. * * Returns 0 on success, negative value otherwise. */ int tomoyo_mkdev_perm(const u8 operation, const struct path *path, const unsigned int mode, unsigned int dev) { struct tomoyo_request_info r; struct tomoyo_obj_info obj = { .path1 = { .mnt = path->mnt, .dentry = path->dentry }, }; int error = -ENOMEM; struct tomoyo_path_info buf; int idx; if (tomoyo_init_request_info(&r, NULL, tomoyo_pnnn2mac[operation]) == TOMOYO_CONFIG_DISABLED) return 0; idx = tomoyo_read_lock(); error = -ENOMEM; if (tomoyo_get_realpath(&buf, path)) { r.obj = &obj; dev = new_decode_dev(dev); r.param_type = TOMOYO_TYPE_MKDEV_ACL; r.param.mkdev.filename = &buf; r.param.mkdev.operation = operation; r.param.mkdev.mode = mode; r.param.mkdev.major = MAJOR(dev); r.param.mkdev.minor = MINOR(dev); tomoyo_check_acl(&r, tomoyo_check_mkdev_acl); error = tomoyo_audit_mkdev_log(&r); kfree(buf.name); } tomoyo_read_unlock(idx); if (r.mode != TOMOYO_CONFIG_ENFORCING) error = 0; return error; } /** * tomoyo_path2_perm - Check permission for "rename", "link" and "pivot_root". * * @operation: Type of operation. * @path1: Pointer to "struct path". * @path2: Pointer to "struct path". * * Returns 0 on success, negative value otherwise. 
*/ int tomoyo_path2_perm(const u8 operation, const struct path *path1, const struct path *path2) { int error = -ENOMEM; struct tomoyo_path_info buf1; struct tomoyo_path_info buf2; struct tomoyo_request_info r; struct tomoyo_obj_info obj = { .path1 = { .mnt = path1->mnt, .dentry = path1->dentry }, .path2 = { .mnt = path2->mnt, .dentry = path2->dentry } }; int idx; if (tomoyo_init_request_info(&r, NULL, tomoyo_pp2mac[operation]) == TOMOYO_CONFIG_DISABLED) return 0; buf1.name = NULL; buf2.name = NULL; idx = tomoyo_read_lock(); if (!tomoyo_get_realpath(&buf1, path1) || !tomoyo_get_realpath(&buf2, path2)) goto out; switch (operation) { case TOMOYO_TYPE_RENAME: case TOMOYO_TYPE_LINK: if (!d_is_dir(path1->dentry)) break; fallthrough; case TOMOYO_TYPE_PIVOT_ROOT: tomoyo_add_slash(&buf1); tomoyo_add_slash(&buf2); break; } r.obj = &obj; r.param_type = TOMOYO_TYPE_PATH2_ACL; r.param.path2.operation = operation; r.param.path2.filename1 = &buf1; r.param.path2.filename2 = &buf2; do { tomoyo_check_acl(&r, tomoyo_check_path2_acl); error = tomoyo_audit_path2_log(&r); } while (error == TOMOYO_RETRY_REQUEST); out: kfree(buf1.name); kfree(buf2.name); tomoyo_read_unlock(idx); if (r.mode != TOMOYO_CONFIG_ENFORCING) error = 0; return error; } /** * tomoyo_same_mount_acl - Check for duplicated "struct tomoyo_mount_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b, false otherwise. */ static bool tomoyo_same_mount_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_mount_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_mount_acl *p2 = container_of(b, typeof(*p2), head); return tomoyo_same_name_union(&p1->dev_name, &p2->dev_name) && tomoyo_same_name_union(&p1->dir_name, &p2->dir_name) && tomoyo_same_name_union(&p1->fs_type, &p2->fs_type) && tomoyo_same_number_union(&p1->flags, &p2->flags); } /** * tomoyo_update_mount_acl - Write "struct tomoyo_mount_acl" list. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_update_mount_acl(struct tomoyo_acl_param *param) { struct tomoyo_mount_acl e = { .head.type = TOMOYO_TYPE_MOUNT_ACL }; int error; if (!tomoyo_parse_name_union(param, &e.dev_name) || !tomoyo_parse_name_union(param, &e.dir_name) || !tomoyo_parse_name_union(param, &e.fs_type) || !tomoyo_parse_number_union(param, &e.flags)) error = -EINVAL; else error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_mount_acl, NULL); tomoyo_put_name_union(&e.dev_name); tomoyo_put_name_union(&e.dir_name); tomoyo_put_name_union(&e.fs_type); tomoyo_put_number_union(&e.flags); return error; } /** * tomoyo_write_file - Update file related list. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). 
 */
int tomoyo_write_file(struct tomoyo_acl_param *param)
{
	u16 perm = 0;
	u8 type;
	const char *operation = tomoyo_read_token(param);

	for (type = 0; type < TOMOYO_MAX_PATH_OPERATION; type++)
		if (tomoyo_permstr(operation, tomoyo_path_keyword[type]))
			perm |= 1 << type;
	if (perm)
		return tomoyo_update_path_acl(perm, param);
	for (type = 0; type < TOMOYO_MAX_PATH2_OPERATION; type++)
		if (tomoyo_permstr(operation,
				   tomoyo_mac_keywords[tomoyo_pp2mac[type]]))
			perm |= 1 << type;
	if (perm)
		return tomoyo_update_path2_acl(perm, param);
	for (type = 0; type < TOMOYO_MAX_PATH_NUMBER_OPERATION; type++)
		if (tomoyo_permstr(operation,
				   tomoyo_mac_keywords[tomoyo_pn2mac[type]]))
			perm |= 1 << type;
	if (perm)
		return tomoyo_update_path_number_acl(perm, param);
	for (type = 0; type < TOMOYO_MAX_MKDEV_OPERATION; type++)
		if (tomoyo_permstr(operation,
				   tomoyo_mac_keywords[tomoyo_pnnn2mac[type]]))
			perm |= 1 << type;
	if (perm)
		return tomoyo_update_mkdev_acl(perm, param);
	if (tomoyo_permstr(operation,
			   tomoyo_mac_keywords[TOMOYO_MAC_FILE_MOUNT]))
		return tomoyo_update_mount_acl(param);
	return -EINVAL;
}
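/*
 * Illustrative sketch (not from the kernel sources above): a minimal,
 * userspace-only model of how tomoyo_write_file() turns a '/'-separated
 * operation token such as "read/write/unlink" into a permission bitmask
 * before dispatching to the matching ACL updater.  The keyword table and the
 * permstr() helper below are simplified stand-ins for tomoyo_path_keyword[]
 * and tomoyo_permstr(); bit positions follow the order of this stand-in
 * table, not necessarily the kernel's enum values.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static const char *const path_keyword[] = {
	"execute", "read", "write", "append", "unlink", "getattr",
	"rmdir", "truncate", "symlink", "chroot", "unmount",
};

/* Return true if @find occurs as one '/'-separated component of @string. */
static bool permstr(const char *string, const char *find)
{
	const size_t len = strlen(find);
	const char *pos = strstr(string, find);

	while (pos) {
		if ((pos == string || pos[-1] == '/') &&
		    (pos[len] == '\0' || pos[len] == '/'))
			return true;
		pos = strstr(pos + 1, find);
	}
	return false;
}

int main(void)
{
	const char *operation = "read/write/unlink";
	unsigned int perm = 0;
	unsigned int type;

	for (type = 0;
	     type < sizeof(path_keyword) / sizeof(path_keyword[0]); type++)
		if (permstr(operation, path_keyword[type]))
			perm |= 1u << type;

	/* Bits 1 (read), 2 (write) and 4 (unlink) are set: prints 0x16. */
	printf("perm = 0x%x\n", perm);
	return 0;
}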
// SPDX-License-Identifier: GPL-2.0
/*
 * ACPI support
 *
 * Copyright (C) 2020, Intel Corporation
 * Author: Mika Westerberg <mika.westerberg@linux.intel.com>
 */

#include <linux/acpi.h>
#include <linux/pm_runtime.h>

#include "tb.h"

static acpi_status tb_acpi_add_link(acpi_handle handle, u32 level, void *data,
				    void **ret)
{
	struct acpi_device *adev = acpi_fetch_acpi_dev(handle);
	struct fwnode_handle *fwnode;
	struct tb_nhi *nhi = data;
	struct pci_dev *pdev;
	struct device *dev;

	if (!adev)
		return AE_OK;

	fwnode = fwnode_find_reference(acpi_fwnode_handle(adev),
				       "usb4-host-interface", 0);
	if (IS_ERR(fwnode))
		return AE_OK;

	/* It needs to reference this NHI */
	if (dev_fwnode(&nhi->pdev->dev) != fwnode)
		goto out_put;

	/*
	 * Ignore USB3 ports here as USB core will set up device links between
	 * tunneled USB3 devices and NHI host during USB device creation.
	 * USB3 ports might not even have a physical device yet if xHCI driver
	 * isn't bound yet.
	 */
	dev = acpi_get_first_physical_node(adev);
	if (!dev || !dev_is_pci(dev))
		goto out_put;

	/* Check that this matches a PCIe root/downstream port. */
	pdev = to_pci_dev(dev);
	if (pci_is_pcie(pdev) &&
	    (pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT ||
	     pci_pcie_type(pdev) == PCI_EXP_TYPE_DOWNSTREAM)) {
		const struct device_link *link;

		/*
		 * Make them both active first to make sure the NHI does
		 * not runtime suspend before the consumer. The
		 * pm_runtime_put() below then allows the consumer to
		 * runtime suspend again (which then allows NHI runtime
		 * suspend too now that the device link is established).
		 */
		pm_runtime_get_sync(&pdev->dev);

		link = device_link_add(&pdev->dev, &nhi->pdev->dev,
				       DL_FLAG_AUTOREMOVE_SUPPLIER |
				       DL_FLAG_RPM_ACTIVE |
				       DL_FLAG_PM_RUNTIME);
		if (link) {
			dev_dbg(&nhi->pdev->dev, "created link from %s\n",
				dev_name(&pdev->dev));
			*(bool *)ret = true;
		} else {
			dev_warn(&nhi->pdev->dev,
				 "device link creation from %s failed\n",
				 dev_name(&pdev->dev));
		}

		pm_runtime_put(&pdev->dev);
	}

out_put:
	fwnode_handle_put(fwnode);
	return AE_OK;
}

/**
 * tb_acpi_add_links() - Add device links based on ACPI description
 * @nhi: Pointer to NHI
 *
 * Goes over ACPI namespace finding tunneled ports that reference to
 * @nhi ACPI node.
For each reference a device link is added. The link * is automatically removed by the driver core. * * Returns %true if at least one link was created, %false otherwise. */ bool tb_acpi_add_links(struct tb_nhi *nhi) { acpi_status status; bool ret = false; if (!has_acpi_companion(&nhi->pdev->dev)) return false; /* * Find all devices that have usb4-host-controller interface * property that references to this NHI. */ status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, 32, tb_acpi_add_link, NULL, nhi, (void **)&ret); if (ACPI_FAILURE(status)) { dev_warn(&nhi->pdev->dev, "failed to enumerate tunneled ports\n"); return false; } return ret; } /** * tb_acpi_is_native() - Did the platform grant native TBT/USB4 control * * Return: %true if the platform granted OS native control over * TBT/USB4, %false otherwise. * * When returned %true, software based connection manager can be used, * otherwise there is firmware based connection manager running. */ bool tb_acpi_is_native(void) { return osc_sb_native_usb4_support_confirmed && osc_sb_native_usb4_control; } /** * tb_acpi_may_tunnel_usb3() - Is USB3 tunneling allowed by the platform * * Return: %true if software based connection manager is used and * platform allows native USB 3.x tunneling, %false otherwise. */ bool tb_acpi_may_tunnel_usb3(void) { if (tb_acpi_is_native()) return osc_sb_native_usb4_control & OSC_USB_USB3_TUNNELING; return true; } /** * tb_acpi_may_tunnel_dp() - Is DisplayPort tunneling allowed by the platform * * Return: %true if software based connection manager is used and * platform allows native DP tunneling, %false otherwise. */ bool tb_acpi_may_tunnel_dp(void) { if (tb_acpi_is_native()) return osc_sb_native_usb4_control & OSC_USB_DP_TUNNELING; return true; } /** * tb_acpi_may_tunnel_pcie() - Is PCIe tunneling allowed by the platform * * Return: %true if software based connection manager is used and * platform allows native PCIe tunneling, %false otherwise. */ bool tb_acpi_may_tunnel_pcie(void) { if (tb_acpi_is_native()) return osc_sb_native_usb4_control & OSC_USB_PCIE_TUNNELING; return true; } /** * tb_acpi_is_xdomain_allowed() - Are XDomain connections allowed * * Return: %true if software based connection manager is used and * platform allows XDomain tunneling, %false otherwise. */ bool tb_acpi_is_xdomain_allowed(void) { if (tb_acpi_is_native()) return osc_sb_native_usb4_control & OSC_USB_XDOMAIN; return true; } /* UUID for retimer _DSM: e0053122-795b-4122-8a5e-57be1d26acb3 */ static const guid_t retimer_dsm_guid = GUID_INIT(0xe0053122, 0x795b, 0x4122, 0x8a, 0x5e, 0x57, 0xbe, 0x1d, 0x26, 0xac, 0xb3); #define RETIMER_DSM_QUERY_ONLINE_STATE 1 #define RETIMER_DSM_SET_ONLINE_STATE 2 static int tb_acpi_retimer_set_power(struct tb_port *port, bool power) { struct usb4_port *usb4 = port->usb4; union acpi_object argv4[2]; struct acpi_device *adev; union acpi_object *obj; int ret; if (!usb4->can_offline) return 0; adev = ACPI_COMPANION(&usb4->dev); if (WARN_ON(!adev)) return 0; /* Check if we are already powered on (and in correct mode) */ obj = acpi_evaluate_dsm_typed(adev->handle, &retimer_dsm_guid, 1, RETIMER_DSM_QUERY_ONLINE_STATE, NULL, ACPI_TYPE_INTEGER); if (!obj) { tb_port_warn(port, "ACPI: query online _DSM failed\n"); return -EIO; } ret = obj->integer.value; ACPI_FREE(obj); if (power == ret) return 0; tb_port_dbg(port, "ACPI: calling _DSM to power %s retimers\n", power ? 
"on" : "off"); argv4[0].type = ACPI_TYPE_PACKAGE; argv4[0].package.count = 1; argv4[0].package.elements = &argv4[1]; argv4[1].integer.type = ACPI_TYPE_INTEGER; argv4[1].integer.value = power; obj = acpi_evaluate_dsm_typed(adev->handle, &retimer_dsm_guid, 1, RETIMER_DSM_SET_ONLINE_STATE, argv4, ACPI_TYPE_INTEGER); if (!obj) { tb_port_warn(port, "ACPI: set online state _DSM evaluation failed\n"); return -EIO; } ret = obj->integer.value; ACPI_FREE(obj); if (ret >= 0) { if (power) return ret == 1 ? 0 : -EBUSY; return 0; } tb_port_warn(port, "ACPI: set online state _DSM failed with error %d\n", ret); return -EIO; } /** * tb_acpi_power_on_retimers() - Call platform to power on retimers * @port: USB4 port * * Calls platform to turn on power to all retimers behind this USB4 * port. After this function returns successfully the caller can * continue with the normal retimer flows (as specified in the USB4 * spec). Note if this returns %-EBUSY it means the type-C port is in * non-USB4/TBT mode (there is non-USB4/TBT device connected). * * This should only be called if the USB4/TBT link is not up. * * Return: %0 on success, negative errno otherwise. */ int tb_acpi_power_on_retimers(struct tb_port *port) { return tb_acpi_retimer_set_power(port, true); } /** * tb_acpi_power_off_retimers() - Call platform to power off retimers * @port: USB4 port * * This is the opposite of tb_acpi_power_on_retimers(). After returning * successfully the normal operations with the @port can continue. * * Return: %0 on success, negative errno otherwise. */ int tb_acpi_power_off_retimers(struct tb_port *port) { return tb_acpi_retimer_set_power(port, false); } static bool tb_acpi_bus_match(struct device *dev) { return tb_is_switch(dev) || tb_is_usb4_port_device(dev); } static struct acpi_device *tb_acpi_switch_find_companion(struct tb_switch *sw) { struct tb_switch *parent_sw = tb_switch_parent(sw); struct acpi_device *adev = NULL; /* * Device routers exists under the downstream facing USB4 port * of the parent router. Their _ADR is always 0. */ if (parent_sw) { struct tb_port *port = tb_switch_downstream_port(sw); struct acpi_device *port_adev; port_adev = acpi_find_child_by_adr(ACPI_COMPANION(&parent_sw->dev), port->port); if (port_adev) adev = acpi_find_child_device(port_adev, 0, false); } else { struct tb_nhi *nhi = sw->tb->nhi; struct acpi_device *parent_adev; parent_adev = ACPI_COMPANION(&nhi->pdev->dev); if (parent_adev) adev = acpi_find_child_device(parent_adev, 0, false); } return adev; } static struct acpi_device *tb_acpi_find_companion(struct device *dev) { /* * The Thunderbolt/USB4 hierarchy looks like following: * * Device (NHI) * Device (HR) // Host router _ADR == 0 * Device (DFP0) // Downstream port _ADR == lane 0 adapter * Device (DR) // Device router _ADR == 0 * Device (UFP) // Upstream port _ADR == lane 0 adapter * Device (DFP1) // Downstream port _ADR == lane 0 adapter number * * At the moment we bind the host router to the corresponding * Linux device. 
 */
	if (tb_is_switch(dev))
		return tb_acpi_switch_find_companion(tb_to_switch(dev));
	if (tb_is_usb4_port_device(dev))
		return acpi_find_child_by_adr(ACPI_COMPANION(dev->parent),
					      tb_to_usb4_port_device(dev)->port->port);
	return NULL;
}

static void tb_acpi_setup(struct device *dev)
{
	struct acpi_device *adev = ACPI_COMPANION(dev);
	struct usb4_port *usb4 = tb_to_usb4_port_device(dev);

	if (!adev || !usb4)
		return;

	if (acpi_check_dsm(adev->handle, &retimer_dsm_guid, 1,
			   BIT(RETIMER_DSM_QUERY_ONLINE_STATE) |
			   BIT(RETIMER_DSM_SET_ONLINE_STATE)))
		usb4->can_offline = true;
}

static struct acpi_bus_type tb_acpi_bus = {
	.name = "thunderbolt",
	.match = tb_acpi_bus_match,
	.find_companion = tb_acpi_find_companion,
	.setup = tb_acpi_setup,
};

int tb_acpi_init(void)
{
	return register_acpi_bus_type(&tb_acpi_bus);
}

void tb_acpi_exit(void)
{
	unregister_acpi_bus_type(&tb_acpi_bus);
}
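/*
 * Illustrative sketch (not from the kernel sources above): a minimal,
 * userspace-only model of the policy pattern behind
 * tb_acpi_may_tunnel_usb3/dp/pcie().  With native (OS-controlled) USB4, a
 * tunnel type is permitted only if the corresponding _OSC control bit was
 * granted; with a firmware connection manager the checks simply allow
 * everything.  The DEMO_* bit values are stand-ins, not the kernel's
 * OSC_USB_* definitions.
 */
#include <stdbool.h>
#include <stdio.h>

#define DEMO_USB3_TUNNELING	(1u << 0)	/* stand-in for OSC_USB_USB3_TUNNELING */
#define DEMO_DP_TUNNELING	(1u << 1)	/* stand-in for OSC_USB_DP_TUNNELING */
#define DEMO_PCIE_TUNNELING	(1u << 2)	/* stand-in for OSC_USB_PCIE_TUNNELING */

static bool native_usb4;		/* models tb_acpi_is_native() */
static unsigned int usb4_control;	/* models osc_sb_native_usb4_control */

static bool may_tunnel(unsigned int bit)
{
	if (native_usb4)
		return usb4_control & bit;
	/* Firmware connection manager: the OS does not police tunneling. */
	return true;
}

int main(void)
{
	/* Pretend the platform granted native control for USB3 and DP only. */
	native_usb4 = true;
	usb4_control = DEMO_USB3_TUNNELING | DEMO_DP_TUNNELING;

	printf("usb3: %d dp: %d pcie: %d\n",
	       may_tunnel(DEMO_USB3_TUNNELING),
	       may_tunnel(DEMO_DP_TUNNELING),
	       may_tunnel(DEMO_PCIE_TUNNELING));
	return 0;
}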
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * pm_runtime.h - Device run-time power management helper functions.
 *
 * Copyright (C) 2009 Rafael J.
Wysocki <rjw@sisk.pl> */ #ifndef _LINUX_PM_RUNTIME_H #define _LINUX_PM_RUNTIME_H #include <linux/device.h> #include <linux/notifier.h> #include <linux/pm.h> #include <linux/jiffies.h> /* Runtime PM flag argument bits */ #define RPM_ASYNC 0x01 /* Request is asynchronous */ #define RPM_NOWAIT 0x02 /* Don't wait for concurrent state change */ #define RPM_GET_PUT 0x04 /* Increment/decrement the usage_count */ #define RPM_AUTO 0x08 /* Use autosuspend_delay */ #define RPM_TRANSPARENT 0x10 /* Succeed if runtime PM is disabled */ /* * Use this for defining a set of PM operations to be used in all situations * (system suspend, hibernation or runtime PM). * * Note that the behaviour differs from the deprecated UNIVERSAL_DEV_PM_OPS() * macro, which uses the provided callbacks for both runtime PM and system * sleep, while DEFINE_RUNTIME_DEV_PM_OPS() uses pm_runtime_force_suspend() * and pm_runtime_force_resume() for its system sleep callbacks. * * If the underlying dev_pm_ops struct symbol has to be exported, use * EXPORT_RUNTIME_DEV_PM_OPS() or EXPORT_GPL_RUNTIME_DEV_PM_OPS() instead. */ #define DEFINE_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ _DEFINE_DEV_PM_OPS(name, pm_runtime_force_suspend, \ pm_runtime_force_resume, suspend_fn, \ resume_fn, idle_fn) #define EXPORT_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ EXPORT_DEV_PM_OPS(name) = { \ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ } #define EXPORT_GPL_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ EXPORT_GPL_DEV_PM_OPS(name) = { \ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ } #define EXPORT_NS_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn, ns) \ EXPORT_NS_DEV_PM_OPS(name, ns) = { \ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ } #define EXPORT_NS_GPL_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn, ns) \ EXPORT_NS_GPL_DEV_PM_OPS(name, ns) = { \ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ } #ifdef CONFIG_PM extern struct workqueue_struct *pm_wq; static inline bool queue_pm_work(struct work_struct *work) { return queue_work(pm_wq, work); } extern int pm_generic_runtime_suspend(struct device *dev); extern int pm_generic_runtime_resume(struct device *dev); extern int pm_runtime_force_suspend(struct device *dev); extern int __pm_runtime_idle(struct device *dev, int rpmflags); extern int __pm_runtime_suspend(struct device *dev, int rpmflags); extern int __pm_runtime_resume(struct device *dev, int rpmflags); extern int pm_runtime_get_if_active(struct device *dev); extern int pm_runtime_get_if_in_use(struct device *dev); extern int pm_schedule_suspend(struct device *dev, unsigned int delay); extern int __pm_runtime_set_status(struct device *dev, unsigned int status); extern void pm_runtime_barrier(struct device *dev); extern bool pm_runtime_block_if_disabled(struct device *dev); extern void pm_runtime_unblock(struct device *dev); extern void pm_runtime_enable(struct device *dev); extern void __pm_runtime_disable(struct device *dev, bool check_resume); extern void pm_runtime_allow(struct device *dev); extern void pm_runtime_forbid(struct device *dev); extern void pm_runtime_no_callbacks(struct device *dev); extern void pm_runtime_irq_safe(struct device *dev); extern void __pm_runtime_use_autosuspend(struct device *dev, bool use); extern void pm_runtime_set_autosuspend_delay(struct device *dev, int delay); extern u64 pm_runtime_autosuspend_expiration(struct device *dev); extern void pm_runtime_set_memalloc_noio(struct device *dev, bool enable); extern void 
pm_runtime_get_suppliers(struct device *dev); extern void pm_runtime_put_suppliers(struct device *dev); extern void pm_runtime_new_link(struct device *dev); extern void pm_runtime_drop_link(struct device_link *link); extern void pm_runtime_release_supplier(struct device_link *link); int devm_pm_runtime_set_active_enabled(struct device *dev); extern int devm_pm_runtime_enable(struct device *dev); int devm_pm_runtime_get_noresume(struct device *dev); /** * pm_suspend_ignore_children - Set runtime PM behavior regarding children. * @dev: Target device. * @enable: Whether or not to ignore possible dependencies on children. * * The dependencies of @dev on its children will not be taken into account by * the runtime PM framework going forward if @enable is %true, or they will * be taken into account otherwise. */ static inline void pm_suspend_ignore_children(struct device *dev, bool enable) { dev->power.ignore_children = enable; } /** * pm_runtime_get_noresume - Bump up runtime PM usage counter of a device. * @dev: Target device. */ static inline void pm_runtime_get_noresume(struct device *dev) { atomic_inc(&dev->power.usage_count); } /** * pm_runtime_put_noidle - Drop runtime PM usage counter of a device. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev unless it is 0 already. */ static inline void pm_runtime_put_noidle(struct device *dev) { atomic_add_unless(&dev->power.usage_count, -1, 0); } /** * pm_runtime_suspended - Check whether or not a device is runtime-suspended. * @dev: Target device. * * Return %true if runtime PM is enabled for @dev and its runtime PM status is * %RPM_SUSPENDED, or %false otherwise. * * Note that the return value of this function can only be trusted if it is * called under the runtime PM lock of @dev or under conditions in which * runtime PM cannot be either disabled or enabled for @dev and its runtime PM * status cannot change. */ static inline bool pm_runtime_suspended(struct device *dev) { return dev->power.runtime_status == RPM_SUSPENDED && !dev->power.disable_depth; } /** * pm_runtime_active - Check whether or not a device is runtime-active. * @dev: Target device. * * Return %true if runtime PM is disabled for @dev or its runtime PM status is * %RPM_ACTIVE, or %false otherwise. * * Note that the return value of this function can only be trusted if it is * called under the runtime PM lock of @dev or under conditions in which * runtime PM cannot be either disabled or enabled for @dev and its runtime PM * status cannot change. */ static inline bool pm_runtime_active(struct device *dev) { return dev->power.runtime_status == RPM_ACTIVE || dev->power.disable_depth; } /** * pm_runtime_status_suspended - Check if runtime PM status is "suspended". * @dev: Target device. * * Return %true if the runtime PM status of @dev is %RPM_SUSPENDED, or %false * otherwise, regardless of whether or not runtime PM has been enabled for @dev. * * Note that the return value of this function can only be trusted if it is * called under the runtime PM lock of @dev or under conditions in which the * runtime PM status of @dev cannot change. */ static inline bool pm_runtime_status_suspended(struct device *dev) { return dev->power.runtime_status == RPM_SUSPENDED; } /** * pm_runtime_enabled - Check if runtime PM is enabled. * @dev: Target device. * * Return %true if runtime PM is enabled for @dev or %false otherwise. 
* * Note that the return value of this function can only be trusted if it is * called under the runtime PM lock of @dev or under conditions in which * runtime PM cannot be either disabled or enabled for @dev. */ static inline bool pm_runtime_enabled(struct device *dev) { return !dev->power.disable_depth; } /** * pm_runtime_blocked - Check if runtime PM enabling is blocked. * @dev: Target device. * * Do not call this function outside system suspend/resume code paths. */ static inline bool pm_runtime_blocked(struct device *dev) { return dev->power.last_status == RPM_BLOCKED; } /** * pm_runtime_has_no_callbacks - Check if runtime PM callbacks may be present. * @dev: Target device. * * Return %true if @dev is a special device without runtime PM callbacks or * %false otherwise. */ static inline bool pm_runtime_has_no_callbacks(struct device *dev) { return dev->power.no_callbacks; } /** * pm_runtime_mark_last_busy - Update the last access time of a device. * @dev: Target device. * * Update the last access time of @dev used by the runtime PM autosuspend * mechanism to the current time as returned by ktime_get_mono_fast_ns(). */ static inline void pm_runtime_mark_last_busy(struct device *dev) { WRITE_ONCE(dev->power.last_busy, ktime_get_mono_fast_ns()); } /** * pm_runtime_is_irq_safe - Check if runtime PM can work in interrupt context. * @dev: Target device. * * Return %true if @dev has been marked as an "IRQ-safe" device (with respect * to runtime PM), in which case its runtime PM callabcks can be expected to * work correctly when invoked from interrupt handlers. */ static inline bool pm_runtime_is_irq_safe(struct device *dev) { return dev->power.irq_safe; } extern u64 pm_runtime_suspended_time(struct device *dev); #else /* !CONFIG_PM */ static inline bool queue_pm_work(struct work_struct *work) { return false; } static inline int pm_generic_runtime_suspend(struct device *dev) { return 0; } static inline int pm_generic_runtime_resume(struct device *dev) { return 0; } static inline int pm_runtime_force_suspend(struct device *dev) { return 0; } static inline int __pm_runtime_idle(struct device *dev, int rpmflags) { return -ENOSYS; } static inline int __pm_runtime_suspend(struct device *dev, int rpmflags) { return -ENOSYS; } static inline int __pm_runtime_resume(struct device *dev, int rpmflags) { return 1; } static inline int pm_schedule_suspend(struct device *dev, unsigned int delay) { return -ENOSYS; } static inline int pm_runtime_get_if_in_use(struct device *dev) { return -EINVAL; } static inline int pm_runtime_get_if_active(struct device *dev) { return -EINVAL; } static inline int __pm_runtime_set_status(struct device *dev, unsigned int status) { return 0; } static inline void pm_runtime_barrier(struct device *dev) {} static inline bool pm_runtime_block_if_disabled(struct device *dev) { return true; } static inline void pm_runtime_unblock(struct device *dev) {} static inline void pm_runtime_enable(struct device *dev) {} static inline void __pm_runtime_disable(struct device *dev, bool c) {} static inline bool pm_runtime_blocked(struct device *dev) { return true; } static inline void pm_runtime_allow(struct device *dev) {} static inline void pm_runtime_forbid(struct device *dev) {} static inline int devm_pm_runtime_set_active_enabled(struct device *dev) { return 0; } static inline int devm_pm_runtime_enable(struct device *dev) { return 0; } static inline int devm_pm_runtime_get_noresume(struct device *dev) { return 0; } static inline void pm_suspend_ignore_children(struct device *dev, bool 
enable) {} static inline void pm_runtime_get_noresume(struct device *dev) {} static inline void pm_runtime_put_noidle(struct device *dev) {} static inline bool pm_runtime_suspended(struct device *dev) { return false; } static inline bool pm_runtime_active(struct device *dev) { return true; } static inline bool pm_runtime_status_suspended(struct device *dev) { return false; } static inline bool pm_runtime_enabled(struct device *dev) { return false; } static inline void pm_runtime_no_callbacks(struct device *dev) {} static inline void pm_runtime_irq_safe(struct device *dev) {} static inline bool pm_runtime_is_irq_safe(struct device *dev) { return false; } static inline bool pm_runtime_has_no_callbacks(struct device *dev) { return false; } static inline void pm_runtime_mark_last_busy(struct device *dev) {} static inline void __pm_runtime_use_autosuspend(struct device *dev, bool use) {} static inline void pm_runtime_set_autosuspend_delay(struct device *dev, int delay) {} static inline u64 pm_runtime_autosuspend_expiration( struct device *dev) { return 0; } static inline void pm_runtime_set_memalloc_noio(struct device *dev, bool enable){} static inline void pm_runtime_get_suppliers(struct device *dev) {} static inline void pm_runtime_put_suppliers(struct device *dev) {} static inline void pm_runtime_new_link(struct device *dev) {} static inline void pm_runtime_drop_link(struct device_link *link) {} static inline void pm_runtime_release_supplier(struct device_link *link) {} #endif /* !CONFIG_PM */ #ifdef CONFIG_PM_SLEEP bool pm_runtime_need_not_resume(struct device *dev); int pm_runtime_force_resume(struct device *dev); #else /* !CONFIG_PM_SLEEP */ static inline bool pm_runtime_need_not_resume(struct device *dev) {return true; } static inline int pm_runtime_force_resume(struct device *dev) { return -ENXIO; } #endif /* CONFIG_PM_SLEEP */ /** * pm_runtime_idle - Conditionally set up autosuspend of a device or suspend it. * @dev: Target device. * * Invoke the "idle check" callback of @dev and, depending on its return value, * set up autosuspend of @dev or suspend it (depending on whether or not * autosuspend has been enabled for it). * * Return: * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. * * -EAGAIN: Runtime PM usage counter non-zero, Runtime PM status change * ongoing or device not in %RPM_ACTIVE state. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. * Other values and conditions for the above values are possible as returned by * Runtime PM idle and suspend callbacks. */ static inline int pm_runtime_idle(struct device *dev) { return __pm_runtime_idle(dev, 0); } /** * pm_runtime_suspend - Suspend a device synchronously. * @dev: Target device. * * Return: * * 1: Success; device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. * * -EAGAIN: Runtime PM usage counter non-zero or Runtime PM status change * ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -ENOSYS: CONFIG_PM not enabled. * Other values and conditions for the above values are possible as returned by * Runtime PM suspend callbacks. */ static inline int pm_runtime_suspend(struct device *dev) { return __pm_runtime_suspend(dev, 0); } /** * pm_runtime_autosuspend - Update the last access time and set up autosuspend * of a device. * @dev: Target device. 
* * First update the last access time, then set up autosuspend of @dev or suspend * it (depending on whether or not autosuspend is enabled for it) without * engaging its "idle check" callback. * * Return: * * 1: Success; device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. * * -EAGAIN: Runtime PM usage counter non-zero or Runtime PM status change * ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -ENOSYS: CONFIG_PM not enabled. * Other values and conditions for the above values are possible as returned by * Runtime PM suspend callbacks. */ static inline int pm_runtime_autosuspend(struct device *dev) { pm_runtime_mark_last_busy(dev); return __pm_runtime_suspend(dev, RPM_AUTO); } /** * pm_runtime_resume - Resume a device synchronously. * @dev: Target device. */ static inline int pm_runtime_resume(struct device *dev) { return __pm_runtime_resume(dev, 0); } /** * pm_request_idle - Queue up "idle check" execution for a device. * @dev: Target device. * * Queue up a work item to run an equivalent of pm_runtime_idle() for @dev * asynchronously. * * Return: * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. * * -EAGAIN: Runtime PM usage counter non-zero, Runtime PM status change * ongoing or device not in %RPM_ACTIVE state. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. */ static inline int pm_request_idle(struct device *dev) { return __pm_runtime_idle(dev, RPM_ASYNC); } /** * pm_request_resume - Queue up runtime-resume of a device. * @dev: Target device. */ static inline int pm_request_resume(struct device *dev) { return __pm_runtime_resume(dev, RPM_ASYNC); } /** * pm_request_autosuspend - Update the last access time and queue up autosuspend * of a device. * @dev: Target device. * * Update the last access time of a device and queue up a work item to run an * equivalent pm_runtime_autosuspend() for @dev asynchronously. * * Return: * * 1: Success; device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. * * -EAGAIN: Runtime PM usage counter non-zero or Runtime PM status change * ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. */ static inline int pm_request_autosuspend(struct device *dev) { pm_runtime_mark_last_busy(dev); return __pm_runtime_suspend(dev, RPM_ASYNC | RPM_AUTO); } /** * pm_runtime_get - Bump up usage counter and queue up resume of a device. * @dev: Target device. * * Bump up the runtime PM usage counter of @dev and queue up a work item to * carry out runtime-resume of it. */ static inline int pm_runtime_get(struct device *dev) { return __pm_runtime_resume(dev, RPM_GET_PUT | RPM_ASYNC); } /** * pm_runtime_get_sync - Bump up usage counter of a device and resume it. * @dev: Target device. * * Bump up the runtime PM usage counter of @dev and carry out runtime-resume of * it synchronously. * * The possible return values of this function are the same as for * pm_runtime_resume() and the runtime PM usage counter of @dev remains * incremented in all cases, even if it returns an error code. 
* Consider using pm_runtime_resume_and_get() instead of it, especially * if its return value is checked by the caller, as this is likely to result * in cleaner code. */ static inline int pm_runtime_get_sync(struct device *dev) { return __pm_runtime_resume(dev, RPM_GET_PUT); } static inline int pm_runtime_get_active(struct device *dev, int rpmflags) { int ret; ret = __pm_runtime_resume(dev, RPM_GET_PUT | rpmflags); if (ret < 0) { pm_runtime_put_noidle(dev); return ret; } return 0; } /** * pm_runtime_resume_and_get - Bump up usage counter of a device and resume it. * @dev: Target device. * * Resume @dev synchronously and if that is successful, increment its runtime * PM usage counter. Return 0 if the runtime PM usage counter of @dev has been * incremented or a negative error code otherwise. */ static inline int pm_runtime_resume_and_get(struct device *dev) { return pm_runtime_get_active(dev, 0); } /** * pm_runtime_put - Drop device usage counter and queue up "idle check" if 0. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, queue up a work item for @dev like in pm_request_idle(). */ static inline void pm_runtime_put(struct device *dev) { __pm_runtime_idle(dev, RPM_GET_PUT | RPM_ASYNC); } /** * __pm_runtime_put_autosuspend - Drop device usage counter and queue autosuspend if 0. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, queue up a work item for @dev like in pm_request_autosuspend(). * * Return: * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. */ static inline int __pm_runtime_put_autosuspend(struct device *dev) { return __pm_runtime_suspend(dev, RPM_GET_PUT | RPM_ASYNC | RPM_AUTO); } /** * pm_runtime_put_autosuspend - Update the last access time of a device, drop * its usage counter and queue autosuspend if the usage counter becomes 0. * @dev: Target device. * * Update the last access time of @dev, decrement runtime PM usage counter of * @dev and if it turns out to be equal to 0, queue up a work item for @dev like * in pm_request_autosuspend(). * * Return: * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. */ static inline int pm_runtime_put_autosuspend(struct device *dev) { pm_runtime_mark_last_busy(dev); return __pm_runtime_put_autosuspend(dev); } DEFINE_GUARD(pm_runtime_noresume, struct device *, pm_runtime_get_noresume(_T), pm_runtime_put_noidle(_T)); DEFINE_GUARD(pm_runtime_active, struct device *, pm_runtime_get_sync(_T), pm_runtime_put(_T)); DEFINE_GUARD(pm_runtime_active_auto, struct device *, pm_runtime_get_sync(_T), pm_runtime_put_autosuspend(_T)); /* * Use the following guards with ACQUIRE()/ACQUIRE_ERR(). 
* * The difference between the "_try" and "_try_enabled" variants is that the * former do not produce an error when runtime PM is disabled for the given * device. */ DEFINE_GUARD_COND(pm_runtime_active, _try, pm_runtime_get_active(_T, RPM_TRANSPARENT), _RET == 0) DEFINE_GUARD_COND(pm_runtime_active, _try_enabled, pm_runtime_resume_and_get(_T), _RET == 0) DEFINE_GUARD_COND(pm_runtime_active_auto, _try, pm_runtime_get_active(_T, RPM_TRANSPARENT), _RET == 0) DEFINE_GUARD_COND(pm_runtime_active_auto, _try_enabled, pm_runtime_resume_and_get(_T), _RET == 0) /* ACQUIRE() wrapper macros for the guards defined above. */ #define PM_RUNTIME_ACQUIRE(_dev, _var) \ ACQUIRE(pm_runtime_active_try, _var)(_dev) #define PM_RUNTIME_ACQUIRE_AUTOSUSPEND(_dev, _var) \ ACQUIRE(pm_runtime_active_auto_try, _var)(_dev) #define PM_RUNTIME_ACQUIRE_IF_ENABLED(_dev, _var) \ ACQUIRE(pm_runtime_active_try_enabled, _var)(_dev) #define PM_RUNTIME_ACQUIRE_IF_ENABLED_AUTOSUSPEND(_dev, _var) \ ACQUIRE(pm_runtime_active_auto_try_enabled, _var)(_dev) /* * ACQUIRE_ERR() wrapper macro for guard pm_runtime_active. * * Always check PM_RUNTIME_ACQUIRE_ERR() after using one of the * PM_RUNTIME_ACQUIRE*() macros defined above (yes, it can be used with * any of them) and if it is nonzero, avoid accessing the given device. */ #define PM_RUNTIME_ACQUIRE_ERR(_var_ptr) \ ACQUIRE_ERR(pm_runtime_active, _var_ptr) /** * pm_runtime_put_sync - Drop device usage counter and run "idle check" if 0. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, invoke the "idle check" callback of @dev and, depending on its * return value, set up autosuspend of @dev or suspend it (depending on whether * or not autosuspend has been enabled for it). * * The runtime PM usage counter of @dev remains decremented in all cases, even * if it returns an error code. * * Return: * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -ENOSYS: CONFIG_PM not enabled. * Other values and conditions for the above values are possible as returned by * Runtime PM suspend callbacks. */ static inline int pm_runtime_put_sync(struct device *dev) { return __pm_runtime_idle(dev, RPM_GET_PUT); } /** * pm_runtime_put_sync_suspend - Drop device usage counter and suspend if 0. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, carry out runtime-suspend of @dev synchronously. * * The runtime PM usage counter of @dev remains decremented in all cases, even * if it returns an error code. * * Return: * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -ENOSYS: CONFIG_PM not enabled. * Other values and conditions for the above values are possible as returned by * Runtime PM suspend callbacks. 
*/ static inline int pm_runtime_put_sync_suspend(struct device *dev) { return __pm_runtime_suspend(dev, RPM_GET_PUT); } /** * pm_runtime_put_sync_autosuspend - Update the last access time of a device, * drop device usage counter and autosuspend if 0. * @dev: Target device. * * Update the last access time of @dev, decrement the runtime PM usage counter * of @dev and if it turns out to be equal to 0, set up autosuspend of @dev or * suspend it synchronously (depending on whether or not autosuspend has been * enabled for it). * * The runtime PM usage counter of @dev remains decremented in all cases, even * if it returns an error code. * * Return: * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. * Other values and conditions for the above values are possible as returned by * Runtime PM suspend callbacks. */ static inline int pm_runtime_put_sync_autosuspend(struct device *dev) { pm_runtime_mark_last_busy(dev); return __pm_runtime_suspend(dev, RPM_GET_PUT | RPM_AUTO); } /** * pm_runtime_set_active - Set runtime PM status to "active". * @dev: Target device. * * Set the runtime PM status of @dev to %RPM_ACTIVE and ensure that dependencies * of it will be taken into account. * * It is not valid to call this function for devices with runtime PM enabled. */ static inline int pm_runtime_set_active(struct device *dev) { return __pm_runtime_set_status(dev, RPM_ACTIVE); } /** * pm_runtime_set_suspended - Set runtime PM status to "suspended". * @dev: Target device. * * Set the runtime PM status of @dev to %RPM_SUSPENDED and ensure that * dependencies of it will be taken into account. * * It is not valid to call this function for devices with runtime PM enabled. */ static inline int pm_runtime_set_suspended(struct device *dev) { return __pm_runtime_set_status(dev, RPM_SUSPENDED); } /** * pm_runtime_disable - Disable runtime PM for a device. * @dev: Target device. * * Prevent the runtime PM framework from working with @dev by incrementing its * "disable" counter. * * If the counter is zero when this function runs and there is a pending runtime * resume request for @dev, it will be resumed. If the counter is still zero at * that point, all of the pending runtime PM requests for @dev will be canceled * and all runtime PM operations in progress involving it will be waited for to * complete. * * For each invocation of this function for @dev, there must be a matching * pm_runtime_enable() call, so that runtime PM is eventually enabled for it * again. */ static inline void pm_runtime_disable(struct device *dev) { __pm_runtime_disable(dev, true); } /** * pm_runtime_use_autosuspend - Allow autosuspend to be used for a device. * @dev: Target device. * * Allow the runtime PM autosuspend mechanism to be used for @dev whenever * requested (or "autosuspend" will be handled as direct runtime-suspend for * it). * * NOTE: It's important to undo this with pm_runtime_dont_use_autosuspend() * at driver exit time unless your driver initially enabled pm_runtime * with devm_pm_runtime_enable() (which handles it for you). 
*/ static inline void pm_runtime_use_autosuspend(struct device *dev) { __pm_runtime_use_autosuspend(dev, true); } /** * pm_runtime_dont_use_autosuspend - Prevent autosuspend from being used. * @dev: Target device. * * Prevent the runtime PM autosuspend mechanism from being used for @dev which * means that "autosuspend" will be handled as direct runtime-suspend for it * going forward. */ static inline void pm_runtime_dont_use_autosuspend(struct device *dev) { __pm_runtime_use_autosuspend(dev, false); } #endif |
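The helpers declared above usually appear together in a driver's probe and I/O paths. Below is a minimal sketch of that pattern, assuming a hypothetical "foo" driver; the 2000 ms autosuspend delay and the stubbed hardware access are illustrative placeholders, not anything mandated by this header.

#include <linux/device.h>
#include <linux/pm_runtime.h>

/* Illustrative sketch only; "foo" and its hardware access are hypothetical. */
struct foo_dev {
	struct device *dev;
};

static int foo_hw_xfer(struct foo_dev *foo)
{
	return 0;	/* stand-in for real register access */
}

static int foo_pm_setup(struct foo_dev *foo)
{
	/* Set the initial runtime PM status before runtime PM is enabled. */
	pm_runtime_set_active(foo->dev);
	pm_runtime_set_autosuspend_delay(foo->dev, 2000);	/* hypothetical delay */
	pm_runtime_use_autosuspend(foo->dev);

	/* The devm variant also undoes use_autosuspend at driver removal. */
	return devm_pm_runtime_enable(foo->dev);
}

static int foo_do_io(struct foo_dev *foo)
{
	int ret;

	/* Resume synchronously; the usage counter is dropped again on failure. */
	ret = pm_runtime_resume_and_get(foo->dev);
	if (ret < 0)
		return ret;

	ret = foo_hw_xfer(foo);

	/* Update the last access time, drop the counter, queue autosuspend. */
	pm_runtime_put_autosuspend(foo->dev);
	return ret;
}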
| // SPDX-License-Identifier: GPL-2.0 #include <linux/compiler.h> #include <linux/export.h> #include <linux/fault-inject-usercopy.h> #include <linux/kasan-checks.h> #include <linux/thread_info.h> #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/mm.h> #include <asm/byteorder.h> #include <asm/word-at-a-time.h> #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS #define IS_UNALIGNED(src, dst) 0 #else #define IS_UNALIGNED(src, dst) \ (((long) dst | (long) src) & (sizeof(long) - 1)) #endif /* * Do a strncpy, return length of string without final '\0'. * 'count' is the user-supplied count (return 'count' if we * hit it), 'max' is the address space maximum (and we return * -EFAULT if we hit it). */ static __always_inline long do_strncpy_from_user(char *dst, const char __user *src, unsigned long count, unsigned long max) { const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; unsigned long res = 0; if (IS_UNALIGNED(src, dst)) goto byte_at_a_time; while (max >= sizeof(unsigned long)) { unsigned long c, data, mask; /* Fall back to byte-at-a-time if we get a page fault */ unsafe_get_user(c, (unsigned long __user *)(src+res), byte_at_a_time); /* * Note that we mask out the bytes following the NUL. This is * important to do because string oblivious code may read past * the NUL. For those routines, we don't want to give them * potentially random bytes after the NUL in `src`. * * One example of such code is BPF map keys. BPF treats map keys * as an opaque set of bytes. Without the post-NUL mask, any BPF * maps keyed by strings returned from strncpy_from_user() may * have multiple entries for semantically identical strings. */ if (has_zero(c, &data, &constants)) { data = prep_zero_mask(c, data, &constants); data = create_zero_mask(data); mask = zero_bytemask(data); *(unsigned long *)(dst+res) = c & mask; return res + find_zero(data); } *(unsigned long *)(dst+res) = c; res += sizeof(unsigned long); max -= sizeof(unsigned long); } byte_at_a_time: while (max) { char c; unsafe_get_user(c,src+res, efault); dst[res] = c; if (!c) return res; res++; max--; } /* * Uhhuh. We hit 'max'. But was that the user-specified maximum * too? If so, that's ok - we got as much as the user asked for. */ if (res >= count) return res; /* * Nope: we hit the address space limit, and we still had more * characters the caller would have wanted. That's an EFAULT. */ efault: return -EFAULT; } /** * strncpy_from_user: - Copy a NUL terminated string from userspace. * @dst: Destination address, in kernel space. This buffer must be at * least @count bytes long. * @src: Source address, in user space. * @count: Maximum number of bytes to copy, including the trailing NUL. * * Copies a NUL-terminated string from userspace to kernel space. * * On success, returns the length of the string (not including the trailing * NUL). * * If access to userspace fails, returns -EFAULT (some data may have been * copied). 
* * If @count is smaller than the length of the string, copies @count bytes * and returns @count. */ long strncpy_from_user(char *dst, const char __user *src, long count) { unsigned long max_addr, src_addr; might_fault(); if (should_fail_usercopy()) return -EFAULT; if (unlikely(count <= 0)) return 0; kasan_check_write(dst, count); check_object_size(dst, count, false); if (can_do_masked_user_access()) { long retval; src = masked_user_read_access_begin(src); retval = do_strncpy_from_user(dst, src, count, count); user_read_access_end(); return retval; } max_addr = TASK_SIZE_MAX; src_addr = (unsigned long)untagged_addr(src); if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; long retval; /* * Truncate 'max' to the user-specified limit, so that * we only have one limit we need to check in the loop */ if (max > count) max = count; if (user_read_access_begin(src, max)) { retval = do_strncpy_from_user(dst, src, count, max); user_read_access_end(); return retval; } } return -EFAULT; } EXPORT_SYMBOL(strncpy_from_user); |
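Callers of strncpy_from_user() have to distinguish three outcomes: a negative errno, a length smaller than the buffer, and a result equal to the buffer size, in which case the destination was truncated and is not NUL-terminated. A minimal sketch of that convention follows; the function name foo_set_label, the 32-byte buffer and the -ENAMETOOLONG policy are hypothetical choices, not part of the API above.

#include <linux/errno.h>
#include <linux/uaccess.h>

#define FOO_LABEL_LEN 32	/* hypothetical buffer size */

/* Illustrative sketch only; copies a user-supplied label into @label. */
static int foo_set_label(char label[FOO_LABEL_LEN], const char __user *uname)
{
	long len;

	len = strncpy_from_user(label, uname, FOO_LABEL_LEN);
	if (len < 0)
		return len;		/* -EFAULT: faulted while reading userspace */
	if (len == FOO_LABEL_LEN)
		return -ENAMETOOLONG;	/* hit the limit: label is not NUL-terminated */

	/* len is the string length, excluding the trailing NUL. */
	return 0;
}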
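The next file, drivers/base/property.c, implements the firmware-agnostic property accessors used by drivers. As a rough orientation before the implementation, a consumer might use them as in the sketch below; the property names "clock-frequency" and "label", the child node name "sensor" and the default frequency are hypothetical examples, not something the API requires.

#include <linux/device.h>
#include <linux/property.h>

/* Illustrative sketch only; property and node names are hypothetical. */
static int foo_parse_properties(struct device *dev)
{
	struct fwnode_handle *child;
	const char *label;
	u32 freq = 100000;	/* hypothetical default, kept if the property is absent */
	int ret;

	/* Optional u32 property. */
	device_property_read_u32(dev, "clock-frequency", &freq);

	/* Mandatory string property: fail the probe if it is missing. */
	ret = device_property_read_string(dev, "label", &label);
	if (ret)
		return ret;

	/* Optional named child node; drop the reference when done with it. */
	child = device_get_named_child_node(dev, "sensor");
	if (child) {
		u32 reg;

		if (!fwnode_property_read_u32(child, "reg", &reg))
			dev_info(dev, "sensor child at reg %u\n", reg);
		fwnode_handle_put(child);
	}

	dev_info(dev, "%s running at %u Hz\n", label, freq);
	return 0;
}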
| // SPDX-License-Identifier: GPL-2.0 /* * property.c - Unified device property interface. * * Copyright (C) 2014, Intel Corporation * Authors: Rafael J. Wysocki <rafael.j.wysocki@intel.com> * Mika Westerberg <mika.westerberg@linux.intel.com> */ #include <linux/device.h> #include <linux/err.h> #include <linux/export.h> #include <linux/kconfig.h> #include <linux/of.h> #include <linux/property.h> #include <linux/phy.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/types.h> struct fwnode_handle *__dev_fwnode(struct device *dev) { return IS_ENABLED(CONFIG_OF) && dev->of_node ? 
of_fwnode_handle(dev->of_node) : dev->fwnode; } EXPORT_SYMBOL_GPL(__dev_fwnode); const struct fwnode_handle *__dev_fwnode_const(const struct device *dev) { return IS_ENABLED(CONFIG_OF) && dev->of_node ? of_fwnode_handle(dev->of_node) : dev->fwnode; } EXPORT_SYMBOL_GPL(__dev_fwnode_const); /** * device_property_present - check if a property of a device is present * @dev: Device whose property is being checked * @propname: Name of the property * * Check if property @propname is present in the device firmware description. * This function is the unambiguous way to check that given property is present * in the device firmware description. * * Return: true if property @propname is present. Otherwise, returns false. */ bool device_property_present(const struct device *dev, const char *propname) { return fwnode_property_present(dev_fwnode(dev), propname); } EXPORT_SYMBOL_GPL(device_property_present); /** * fwnode_property_present - check if a property of a firmware node is present * @fwnode: Firmware node whose property to check * @propname: Name of the property * * Check if property @propname is present in the firmware node description. * This function is the unambiguous way to check that given property is present * in the firmware node description. * * Return: true if property @propname is present. Otherwise, returns false. */ bool fwnode_property_present(const struct fwnode_handle *fwnode, const char *propname) { bool ret; if (IS_ERR_OR_NULL(fwnode)) return false; ret = fwnode_call_bool_op(fwnode, property_present, propname); if (ret) return ret; return fwnode_call_bool_op(fwnode->secondary, property_present, propname); } EXPORT_SYMBOL_GPL(fwnode_property_present); /** * device_property_read_bool - Return the value for a boolean property of a device * @dev: Device whose property is being checked * @propname: Name of the property * * Use device_property_present() to check for the property presence. * * Return: if property @propname is true or false in the device firmware description. */ bool device_property_read_bool(const struct device *dev, const char *propname) { return fwnode_property_read_bool(dev_fwnode(dev), propname); } EXPORT_SYMBOL_GPL(device_property_read_bool); /** * fwnode_property_read_bool - Return the value for a boolean property of a firmware node * @fwnode: Firmware node whose property to check * @propname: Name of the property * * Use fwnode_property_present() to check for the property presence. * * Return: if property @propname is true or false in the firmware node description. */ bool fwnode_property_read_bool(const struct fwnode_handle *fwnode, const char *propname) { bool ret; if (IS_ERR_OR_NULL(fwnode)) return false; ret = fwnode_call_bool_op(fwnode, property_read_bool, propname); if (ret) return ret; return fwnode_call_bool_op(fwnode->secondary, property_read_bool, propname); } EXPORT_SYMBOL_GPL(fwnode_property_read_bool); /** * device_property_read_u8_array - return a u8 array property of a device * @dev: Device to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Function reads an array of u8 properties with @propname from the device * firmware description and stores them to @val if found. * * It's recommended to call device_property_count_u8() instead of calling * this function with @val equals %NULL and @nval equals 0. 
* * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_u8_array(const struct device *dev, const char *propname, u8 *val, size_t nval) { return fwnode_property_read_u8_array(dev_fwnode(dev), propname, val, nval); } EXPORT_SYMBOL_GPL(device_property_read_u8_array); /** * device_property_read_u16_array - return a u16 array property of a device * @dev: Device to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Function reads an array of u16 properties with @propname from the device * firmware description and stores them to @val if found. * * It's recommended to call device_property_count_u16() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_u16_array(const struct device *dev, const char *propname, u16 *val, size_t nval) { return fwnode_property_read_u16_array(dev_fwnode(dev), propname, val, nval); } EXPORT_SYMBOL_GPL(device_property_read_u16_array); /** * device_property_read_u32_array - return a u32 array property of a device * @dev: Device to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Function reads an array of u32 properties with @propname from the device * firmware description and stores them to @val if found. * * It's recommended to call device_property_count_u32() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_u32_array(const struct device *dev, const char *propname, u32 *val, size_t nval) { return fwnode_property_read_u32_array(dev_fwnode(dev), propname, val, nval); } EXPORT_SYMBOL_GPL(device_property_read_u32_array); /** * device_property_read_u64_array - return a u64 array property of a device * @dev: Device to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Function reads an array of u64 properties with @propname from the device * firmware description and stores them to @val if found. * * It's recommended to call device_property_count_u64() instead of calling * this function with @val equals %NULL and @nval equals 0. 
* * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_u64_array(const struct device *dev, const char *propname, u64 *val, size_t nval) { return fwnode_property_read_u64_array(dev_fwnode(dev), propname, val, nval); } EXPORT_SYMBOL_GPL(device_property_read_u64_array); /** * device_property_read_string_array - return a string array property of device * @dev: Device to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Function reads an array of string properties with @propname from the device * firmware description and stores them to @val if found. * * It's recommended to call device_property_string_array_count() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values read on success if @val is non-NULL, * number of values available on success if @val is NULL, * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO or %-EILSEQ if the property is not an array of strings, * %-EOVERFLOW if the size of the property is not as expected. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_string_array(const struct device *dev, const char *propname, const char **val, size_t nval) { return fwnode_property_read_string_array(dev_fwnode(dev), propname, val, nval); } EXPORT_SYMBOL_GPL(device_property_read_string_array); /** * device_property_read_string - return a string property of a device * @dev: Device to get the property of * @propname: Name of the property * @val: The value is stored here * * Function reads property @propname from the device firmware description and * stores the value into @val if found. The value is checked to be a string. * * Return: %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO or %-EILSEQ if the property type is not a string. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_string(const struct device *dev, const char *propname, const char **val) { return fwnode_property_read_string(dev_fwnode(dev), propname, val); } EXPORT_SYMBOL_GPL(device_property_read_string); /** * device_property_match_string - find a string in an array and return index * @dev: Device to get the property of * @propname: Name of the property holding the array * @string: String to look for * * Find a given string in a string array and if it is found return the * index back. * * Return: index, starting from %0, if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of strings, * %-ENXIO if no suitable firmware interface is present. 
*/ int device_property_match_string(const struct device *dev, const char *propname, const char *string) { return fwnode_property_match_string(dev_fwnode(dev), propname, string); } EXPORT_SYMBOL_GPL(device_property_match_string); static int fwnode_property_read_int_array(const struct fwnode_handle *fwnode, const char *propname, unsigned int elem_size, void *val, size_t nval) { int ret; if (IS_ERR_OR_NULL(fwnode)) return -EINVAL; ret = fwnode_call_int_op(fwnode, property_read_int_array, propname, elem_size, val, nval); if (ret != -EINVAL) return ret; return fwnode_call_int_op(fwnode->secondary, property_read_int_array, propname, elem_size, val, nval); } /** * fwnode_property_read_u8_array - return a u8 array property of firmware node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Read an array of u8 properties with @propname from @fwnode and stores them to * @val if found. * * It's recommended to call fwnode_property_count_u8() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_u8_array(const struct fwnode_handle *fwnode, const char *propname, u8 *val, size_t nval) { return fwnode_property_read_int_array(fwnode, propname, sizeof(u8), val, nval); } EXPORT_SYMBOL_GPL(fwnode_property_read_u8_array); /** * fwnode_property_read_u16_array - return a u16 array property of firmware node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Read an array of u16 properties with @propname from @fwnode and store them to * @val if found. * * It's recommended to call fwnode_property_count_u16() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_u16_array(const struct fwnode_handle *fwnode, const char *propname, u16 *val, size_t nval) { return fwnode_property_read_int_array(fwnode, propname, sizeof(u16), val, nval); } EXPORT_SYMBOL_GPL(fwnode_property_read_u16_array); /** * fwnode_property_read_u32_array - return a u32 array property of firmware node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Read an array of u32 properties with @propname from @fwnode store them to * @val if found. * * It's recommended to call fwnode_property_count_u32() instead of calling * this function with @val equals %NULL and @nval equals 0. 
* * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_u32_array(const struct fwnode_handle *fwnode, const char *propname, u32 *val, size_t nval) { return fwnode_property_read_int_array(fwnode, propname, sizeof(u32), val, nval); } EXPORT_SYMBOL_GPL(fwnode_property_read_u32_array); /** * fwnode_property_read_u64_array - return a u64 array property firmware node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Read an array of u64 properties with @propname from @fwnode and store them to * @val if found. * * It's recommended to call fwnode_property_count_u64() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_u64_array(const struct fwnode_handle *fwnode, const char *propname, u64 *val, size_t nval) { return fwnode_property_read_int_array(fwnode, propname, sizeof(u64), val, nval); } EXPORT_SYMBOL_GPL(fwnode_property_read_u64_array); /** * fwnode_property_read_string_array - return string array property of a node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Read an string list property @propname from the given firmware node and store * them to @val if found. * * It's recommended to call fwnode_property_string_array_count() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values read on success if @val is non-NULL, * number of values available on success if @val is NULL, * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO or %-EILSEQ if the property is not an array of strings, * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_string_array(const struct fwnode_handle *fwnode, const char *propname, const char **val, size_t nval) { int ret; if (IS_ERR_OR_NULL(fwnode)) return -EINVAL; ret = fwnode_call_int_op(fwnode, property_read_string_array, propname, val, nval); if (ret != -EINVAL) return ret; return fwnode_call_int_op(fwnode->secondary, property_read_string_array, propname, val, nval); } EXPORT_SYMBOL_GPL(fwnode_property_read_string_array); /** * fwnode_property_read_string - return a string property of a firmware node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The value is stored here * * Read property @propname from the given firmware node and store the value into * @val if found. The value is checked to be a string. 
* * Return: %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO or %-EILSEQ if the property is not a string, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_string(const struct fwnode_handle *fwnode, const char *propname, const char **val) { int ret = fwnode_property_read_string_array(fwnode, propname, val, 1); return ret < 0 ? ret : 0; } EXPORT_SYMBOL_GPL(fwnode_property_read_string); /** * fwnode_property_match_string - find a string in an array and return index * @fwnode: Firmware node to get the property of * @propname: Name of the property holding the array * @string: String to look for * * Find a given string in a string array and if it is found return the * index back. * * Return: index, starting from %0, if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of strings, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_match_string(const struct fwnode_handle *fwnode, const char *propname, const char *string) { const char **values; int nval, ret; nval = fwnode_property_string_array_count(fwnode, propname); if (nval < 0) return nval; if (nval == 0) return -ENODATA; values = kcalloc(nval, sizeof(*values), GFP_KERNEL); if (!values) return -ENOMEM; ret = fwnode_property_read_string_array(fwnode, propname, values, nval); if (ret < 0) goto out_free; ret = match_string(values, nval, string); if (ret < 0) ret = -ENODATA; out_free: kfree(values); return ret; } EXPORT_SYMBOL_GPL(fwnode_property_match_string); /** * fwnode_property_match_property_string - find a property string value in an array and return index * @fwnode: Firmware node to get the property of * @propname: Name of the property holding the string value * @array: String array to search in * @n: Size of the @array * * Find a property string value in a given @array and if it is found return * the index back. * * Return: index, starting from %0, if the string value was found in the @array (success), * %-ENOENT when the string value was not found in the @array, * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO or %-EILSEQ if the property is not a string, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_match_property_string(const struct fwnode_handle *fwnode, const char *propname, const char * const *array, size_t n) { const char *string; int ret; ret = fwnode_property_read_string(fwnode, propname, &string); if (ret) return ret; ret = match_string(array, n, string); if (ret < 0) ret = -ENOENT; return ret; } EXPORT_SYMBOL_GPL(fwnode_property_match_property_string); /** * fwnode_property_get_reference_args() - Find a reference with arguments * @fwnode: Firmware node where to look for the reference * @prop: The name of the property * @nargs_prop: The name of the property telling the number of * arguments in the referred node. NULL if @nargs is known, * otherwise @nargs is ignored. * @nargs: Number of arguments. Ignored if @nargs_prop is non-NULL. * @index: Index of the reference, from zero onwards. * @args: Result structure with reference and integer arguments. * May be NULL. * * Obtain a reference based on a named property in an fwnode, with * integer arguments. * * The caller is responsible for calling fwnode_handle_put() on the returned * @args->fwnode pointer. 
* * Return: %0 on success * %-ENOENT when the index is out of bounds, the index has an empty * reference or the property was not found * %-EINVAL on parse error * %-ENOTCONN when the remote firmware node exists but has not been * registered yet */ int fwnode_property_get_reference_args(const struct fwnode_handle *fwnode, const char *prop, const char *nargs_prop, unsigned int nargs, unsigned int index, struct fwnode_reference_args *args) { int ret; if (IS_ERR_OR_NULL(fwnode)) return -ENOENT; ret = fwnode_call_int_op(fwnode, get_reference_args, prop, nargs_prop, nargs, index, args); if (ret == 0) return ret; if (IS_ERR_OR_NULL(fwnode->secondary)) return ret; return fwnode_call_int_op(fwnode->secondary, get_reference_args, prop, nargs_prop, nargs, index, args); } EXPORT_SYMBOL_GPL(fwnode_property_get_reference_args); /** * fwnode_find_reference - Find named reference to a fwnode_handle * @fwnode: Firmware node where to look for the reference * @name: The name of the reference * @index: Index of the reference * * @index can be used when the named reference holds a table of references. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. * * Return: a pointer to the reference fwnode, when found. Otherwise, * returns an error pointer. */ struct fwnode_handle *fwnode_find_reference(const struct fwnode_handle *fwnode, const char *name, unsigned int index) { struct fwnode_reference_args args; int ret; ret = fwnode_property_get_reference_args(fwnode, name, NULL, 0, index, &args); return ret ? ERR_PTR(ret) : args.fwnode; } EXPORT_SYMBOL_GPL(fwnode_find_reference); /** * fwnode_get_name - Return the name of a node * @fwnode: The firmware node * * Return: a pointer to the node name, or %NULL. */ const char *fwnode_get_name(const struct fwnode_handle *fwnode) { return fwnode_call_ptr_op(fwnode, get_name); } EXPORT_SYMBOL_GPL(fwnode_get_name); /** * fwnode_get_name_prefix - Return the prefix of node for printing purposes * @fwnode: The firmware node * * Return: the prefix of a node, intended to be printed right before the node. * The prefix also works as a separator between the nodes. */ const char *fwnode_get_name_prefix(const struct fwnode_handle *fwnode) { return fwnode_call_ptr_op(fwnode, get_name_prefix); } /** * fwnode_name_eq - Return true if node name is equal * @fwnode: The firmware node * @name: The name to which to compare the node name * * Compare the name provided as an argument to the name of the node, stopping * the comparison at either NUL or '@' character, whichever comes first. This * function is generally used for comparing node names while ignoring the * possible unit address of the node. * * Return: true if the node name matches the name provided in the @name * argument, false otherwise. */ bool fwnode_name_eq(const struct fwnode_handle *fwnode, const char *name) { const char *node_name; ptrdiff_t len; node_name = fwnode_get_name(fwnode); if (!node_name) return false; len = strchrnul(node_name, '@') - node_name; return str_has_prefix(node_name, name) == len; } EXPORT_SYMBOL_GPL(fwnode_name_eq); /** * fwnode_get_parent - Return parent firmware node * @fwnode: Firmware node whose parent is retrieved * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. * * Return: parent firmware node of the given node if possible or %NULL if no * parent was available. 
*/ struct fwnode_handle *fwnode_get_parent(const struct fwnode_handle *fwnode) { return fwnode_call_ptr_op(fwnode, get_parent); } EXPORT_SYMBOL_GPL(fwnode_get_parent); /** * fwnode_get_next_parent - Iterate to the node's parent * @fwnode: Firmware whose parent is retrieved * * This is like fwnode_get_parent() except that it drops the refcount * on the passed node, making it suitable for iterating through a * node's parents. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. Note that this function also puts a reference to @fwnode * unconditionally. * * Return: parent firmware node of the given node if possible or %NULL if no * parent was available. */ struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode) { struct fwnode_handle *parent = fwnode_get_parent(fwnode); fwnode_handle_put(fwnode); return parent; } EXPORT_SYMBOL_GPL(fwnode_get_next_parent); /** * fwnode_count_parents - Return the number of parents a node has * @fwnode: The node the parents of which are to be counted * * Return: the number of parents a node has. */ unsigned int fwnode_count_parents(const struct fwnode_handle *fwnode) { struct fwnode_handle *parent; unsigned int count = 0; fwnode_for_each_parent_node(fwnode, parent) count++; return count; } EXPORT_SYMBOL_GPL(fwnode_count_parents); /** * fwnode_get_nth_parent - Return an nth parent of a node * @fwnode: The node the parent of which is requested * @depth: Distance of the parent from the node * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. * * Return: the nth parent of a node. If there is no parent at the requested * @depth, %NULL is returned. If @depth is 0, the functionality is equivalent to * fwnode_handle_get(). For @depth == 1, it is fwnode_get_parent() and so on. */ struct fwnode_handle *fwnode_get_nth_parent(struct fwnode_handle *fwnode, unsigned int depth) { struct fwnode_handle *parent; if (depth == 0) return fwnode_handle_get(fwnode); fwnode_for_each_parent_node(fwnode, parent) { if (--depth == 0) return parent; } return NULL; } EXPORT_SYMBOL_GPL(fwnode_get_nth_parent); /** * fwnode_get_next_child_node - Return the next child node handle for a node * @fwnode: Firmware node to find the next child node for. * @child: Handle to one of the node's child nodes or a %NULL handle. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. Note that this function also puts a reference to @child * unconditionally. */ struct fwnode_handle * fwnode_get_next_child_node(const struct fwnode_handle *fwnode, struct fwnode_handle *child) { struct fwnode_handle *next; if (IS_ERR_OR_NULL(fwnode)) return NULL; /* Try to find a child in primary fwnode */ next = fwnode_call_ptr_op(fwnode, get_next_child_node, child); if (next) return next; /* When no more children in primary, continue with secondary */ return fwnode_call_ptr_op(fwnode->secondary, get_next_child_node, child); } EXPORT_SYMBOL_GPL(fwnode_get_next_child_node); /** * fwnode_get_next_available_child_node - Return the next available child node handle for a node * @fwnode: Firmware node to find the next child node for. * @child: Handle to one of the node's child nodes or a %NULL handle. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. Note that this function also puts a reference to @child * unconditionally. 
*/ struct fwnode_handle * fwnode_get_next_available_child_node(const struct fwnode_handle *fwnode, struct fwnode_handle *child) { struct fwnode_handle *next_child = child; if (IS_ERR_OR_NULL(fwnode)) return NULL; do { next_child = fwnode_get_next_child_node(fwnode, next_child); if (!next_child) return NULL; } while (!fwnode_device_is_available(next_child)); return next_child; } EXPORT_SYMBOL_GPL(fwnode_get_next_available_child_node); /** * device_get_next_child_node - Return the next child node handle for a device * @dev: Device to find the next child node for. * @child: Handle to one of the device's child nodes or a %NULL handle. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. Note that this function also puts a reference to @child * unconditionally. */ struct fwnode_handle *device_get_next_child_node(const struct device *dev, struct fwnode_handle *child) { return fwnode_get_next_child_node(dev_fwnode(dev), child); } EXPORT_SYMBOL_GPL(device_get_next_child_node); /** * fwnode_get_named_child_node - Return first matching named child node handle * @fwnode: Firmware node to find the named child node for. * @childname: String to match child node name against. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. */ struct fwnode_handle * fwnode_get_named_child_node(const struct fwnode_handle *fwnode, const char *childname) { return fwnode_call_ptr_op(fwnode, get_named_child_node, childname); } EXPORT_SYMBOL_GPL(fwnode_get_named_child_node); /** * device_get_named_child_node - Return first matching named child node handle * @dev: Device to find the named child node for. * @childname: String to match child node name against. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. */ struct fwnode_handle *device_get_named_child_node(const struct device *dev, const char *childname) { return fwnode_get_named_child_node(dev_fwnode(dev), childname); } EXPORT_SYMBOL_GPL(device_get_named_child_node); /** * fwnode_handle_get - Obtain a reference to a device node * @fwnode: Pointer to the device node to obtain the reference to. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. * * Return: the fwnode handle. */ struct fwnode_handle *fwnode_handle_get(struct fwnode_handle *fwnode) { if (!fwnode_has_op(fwnode, get)) return fwnode; return fwnode_call_ptr_op(fwnode, get); } EXPORT_SYMBOL_GPL(fwnode_handle_get); /** * fwnode_device_is_available - check if a device is available for use * @fwnode: Pointer to the fwnode of the device. * * Return: true if device is available for use. Otherwise, returns false. * * For fwnode node types that don't implement the .device_is_available() * operation, this function returns true. */ bool fwnode_device_is_available(const struct fwnode_handle *fwnode) { if (IS_ERR_OR_NULL(fwnode)) return false; if (!fwnode_has_op(fwnode, device_is_available)) return true; return fwnode_call_bool_op(fwnode, device_is_available); } EXPORT_SYMBOL_GPL(fwnode_device_is_available); /** * fwnode_get_child_node_count - return the number of child nodes for a given firmware node * @fwnode: Pointer to the parent firmware node * * Return: the number of child nodes for a given firmware node. 
*/ unsigned int fwnode_get_child_node_count(const struct fwnode_handle *fwnode) { struct fwnode_handle *child; unsigned int count = 0; fwnode_for_each_child_node(fwnode, child) count++; return count; } EXPORT_SYMBOL_GPL(fwnode_get_child_node_count); /** * fwnode_get_named_child_node_count - number of child nodes with given name * @fwnode: Node whose child nodes are counted. * @name: String to match child node name against. * * Scan child nodes and count all the nodes with a specific name. A possible * unit-address suffix after the '@' sign in the scanned names is ignored. * E.g.:: * fwnode_get_named_child_node_count(fwnode, "channel"); * would match all the nodes:: * channel { }, channel@0 {}, channel@0xabba {}... * * Return: the number of child nodes with a matching name for the given * firmware node. */ unsigned int fwnode_get_named_child_node_count(const struct fwnode_handle *fwnode, const char *name) { struct fwnode_handle *child; unsigned int count = 0; fwnode_for_each_named_child_node(fwnode, child, name) count++; return count; } EXPORT_SYMBOL_GPL(fwnode_get_named_child_node_count); bool device_dma_supported(const struct device *dev) { return fwnode_call_bool_op(dev_fwnode(dev), device_dma_supported); } EXPORT_SYMBOL_GPL(device_dma_supported); enum dev_dma_attr device_get_dma_attr(const struct device *dev) { if (!fwnode_has_op(dev_fwnode(dev), device_get_dma_attr)) return DEV_DMA_NOT_SUPPORTED; return fwnode_call_int_op(dev_fwnode(dev), device_get_dma_attr); } EXPORT_SYMBOL_GPL(device_get_dma_attr); /** * fwnode_get_phy_mode - Get phy mode for given firmware node * @fwnode: Pointer to the given node * * The function gets the phy interface string from the 'phy-mode' or * 'phy-connection-type' property and returns its index in the phy_modes * table, or a negative errno on error. */ int fwnode_get_phy_mode(const struct fwnode_handle *fwnode) { const char *pm; int err, i; err = fwnode_property_read_string(fwnode, "phy-mode", &pm); if (err < 0) err = fwnode_property_read_string(fwnode, "phy-connection-type", &pm); if (err < 0) return err; for (i = 0; i < PHY_INTERFACE_MODE_MAX; i++) if (!strcasecmp(pm, phy_modes(i))) return i; return -ENODEV; } EXPORT_SYMBOL_GPL(fwnode_get_phy_mode); /** * device_get_phy_mode - Get phy mode for given device * @dev: Pointer to the given device * * The function gets the phy interface string from the 'phy-mode' or * 'phy-connection-type' property and returns its index in the phy_modes * table, or a negative errno on error. */ int device_get_phy_mode(struct device *dev) { return fwnode_get_phy_mode(dev_fwnode(dev)); } EXPORT_SYMBOL_GPL(device_get_phy_mode); /** * fwnode_iomap - Maps the memory mapped IO for a given fwnode * @fwnode: Pointer to the firmware node * @index: Index of the IO range * * Return: a pointer to the mapped memory. */ void __iomem *fwnode_iomap(struct fwnode_handle *fwnode, int index) { return fwnode_call_ptr_op(fwnode, iomap, index); } EXPORT_SYMBOL(fwnode_iomap); /** * fwnode_irq_get - Get IRQ directly from a fwnode * @fwnode: Pointer to the firmware node * @index: Zero-based index of the IRQ * * Return: Linux IRQ number on success. Negative errno on failure. 
*/ int fwnode_irq_get(const struct fwnode_handle *fwnode, unsigned int index) { int ret; ret = fwnode_call_int_op(fwnode, irq_get, index); /* We treat mapping errors as invalid case */ if (ret == 0) return -EINVAL; return ret; } EXPORT_SYMBOL(fwnode_irq_get); /** * fwnode_irq_get_byname - Get IRQ from a fwnode using its name * @fwnode: Pointer to the firmware node * @name: IRQ name * * Description: * Find a match to the string @name in the 'interrupt-names' string array * in _DSD for ACPI, or of_node for Device Tree. Then get the Linux IRQ * number of the IRQ resource corresponding to the index of the matched * string. * * Return: Linux IRQ number on success, or negative errno otherwise. */ int fwnode_irq_get_byname(const struct fwnode_handle *fwnode, const char *name) { int index; if (!name) return -EINVAL; index = fwnode_property_match_string(fwnode, "interrupt-names", name); if (index < 0) return index; return fwnode_irq_get(fwnode, index); } EXPORT_SYMBOL(fwnode_irq_get_byname); /** * fwnode_graph_get_next_endpoint - Get next endpoint firmware node * @fwnode: Pointer to the parent firmware node * @prev: Previous endpoint node or %NULL to get the first * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. Note that this function also puts a reference to @prev * unconditionally. * * Return: an endpoint firmware node pointer or %NULL if no more endpoints * are available. */ struct fwnode_handle * fwnode_graph_get_next_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle *prev) { struct fwnode_handle *ep, *port_parent = NULL; const struct fwnode_handle *parent; /* * If this function is in a loop and the previous iteration returned * an endpoint from fwnode->secondary, then we need to use the secondary * as parent rather than @fwnode. */ if (prev) { port_parent = fwnode_graph_get_port_parent(prev); parent = port_parent; } else { parent = fwnode; } if (IS_ERR_OR_NULL(parent)) return NULL; ep = fwnode_call_ptr_op(parent, graph_get_next_endpoint, prev); if (ep) goto out_put_port_parent; ep = fwnode_graph_get_next_endpoint(parent->secondary, NULL); out_put_port_parent: fwnode_handle_put(port_parent); return ep; } EXPORT_SYMBOL_GPL(fwnode_graph_get_next_endpoint); /** * fwnode_graph_get_port_parent - Return the device fwnode of a port endpoint * @endpoint: Endpoint firmware node of the port * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. * * Return: the firmware node of the device the @endpoint belongs to. */ struct fwnode_handle * fwnode_graph_get_port_parent(const struct fwnode_handle *endpoint) { struct fwnode_handle *port, *parent; port = fwnode_get_parent(endpoint); parent = fwnode_call_ptr_op(port, graph_get_port_parent); fwnode_handle_put(port); return parent; } EXPORT_SYMBOL_GPL(fwnode_graph_get_port_parent); /** * fwnode_graph_get_remote_port_parent - Return fwnode of a remote device * @fwnode: Endpoint firmware node pointing to the remote endpoint * * Extracts firmware node of a remote device the @fwnode points to. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. 
*/ struct fwnode_handle * fwnode_graph_get_remote_port_parent(const struct fwnode_handle *fwnode) { struct fwnode_handle *endpoint, *parent; endpoint = fwnode_graph_get_remote_endpoint(fwnode); parent = fwnode_graph_get_port_parent(endpoint); fwnode_handle_put(endpoint); return parent; } EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_port_parent); /** * fwnode_graph_get_remote_port - Return fwnode of a remote port * @fwnode: Endpoint firmware node pointing to the remote endpoint * * Extracts firmware node of a remote port the @fwnode points to. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. */ struct fwnode_handle * fwnode_graph_get_remote_port(const struct fwnode_handle *fwnode) { return fwnode_get_next_parent(fwnode_graph_get_remote_endpoint(fwnode)); } EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_port); /** * fwnode_graph_get_remote_endpoint - Return fwnode of a remote endpoint * @fwnode: Endpoint firmware node pointing to the remote endpoint * * Extracts firmware node of a remote endpoint the @fwnode points to. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. */ struct fwnode_handle * fwnode_graph_get_remote_endpoint(const struct fwnode_handle *fwnode) { return fwnode_call_ptr_op(fwnode, graph_get_remote_endpoint); } EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_endpoint); static bool fwnode_graph_remote_available(struct fwnode_handle *ep) { struct fwnode_handle *dev_node; bool available; dev_node = fwnode_graph_get_remote_port_parent(ep); available = fwnode_device_is_available(dev_node); fwnode_handle_put(dev_node); return available; } /** * fwnode_graph_get_endpoint_by_id - get endpoint by port and endpoint numbers * @fwnode: parent fwnode_handle containing the graph * @port: identifier of the port node * @endpoint: identifier of the endpoint node under the port node * @flags: fwnode lookup flags * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. * * Return: the fwnode handle of the local endpoint corresponding the port and * endpoint IDs or %NULL if not found. * * If FWNODE_GRAPH_ENDPOINT_NEXT is passed in @flags and the specified endpoint * has not been found, look for the closest endpoint ID greater than the * specified one and return the endpoint that corresponds to it, if present. * * Does not return endpoints that belong to disabled devices or endpoints that * are unconnected, unless FWNODE_GRAPH_DEVICE_DISABLED is passed in @flags. */ struct fwnode_handle * fwnode_graph_get_endpoint_by_id(const struct fwnode_handle *fwnode, u32 port, u32 endpoint, unsigned long flags) { struct fwnode_handle *ep, *best_ep = NULL; unsigned int best_ep_id = 0; bool endpoint_next = flags & FWNODE_GRAPH_ENDPOINT_NEXT; bool enabled_only = !(flags & FWNODE_GRAPH_DEVICE_DISABLED); fwnode_graph_for_each_endpoint(fwnode, ep) { struct fwnode_endpoint fwnode_ep = { 0 }; int ret; if (enabled_only && !fwnode_graph_remote_available(ep)) continue; ret = fwnode_graph_parse_endpoint(ep, &fwnode_ep); if (ret < 0) continue; if (fwnode_ep.port != port) continue; if (fwnode_ep.id == endpoint) return ep; if (!endpoint_next) continue; /* * If the endpoint that has just been found is not the first * matching one and the ID of the one found previously is closer * to the requested endpoint ID, skip it. 
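 *
 * For example, when looking for endpoint 3 with FWNODE_GRAPH_ENDPOINT_NEXT
 * set and the port only has endpoints 1, 4 and 5: endpoint 1 is skipped
 * here, endpoint 4 becomes the best match, and endpoint 5 is then skipped
 * because 4 is already closer to the requested ID.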
*/ if (fwnode_ep.id < endpoint || (best_ep && best_ep_id < fwnode_ep.id)) continue; fwnode_handle_put(best_ep); best_ep = fwnode_handle_get(ep); best_ep_id = fwnode_ep.id; } return best_ep; } EXPORT_SYMBOL_GPL(fwnode_graph_get_endpoint_by_id); /** * fwnode_graph_get_endpoint_count - Count endpoints on a device node * @fwnode: The node related to a device * @flags: fwnode lookup flags * Count endpoints in a device node. * * If FWNODE_GRAPH_DEVICE_DISABLED flag is specified, also unconnected endpoints * and endpoints connected to disabled devices are counted. */ unsigned int fwnode_graph_get_endpoint_count(const struct fwnode_handle *fwnode, unsigned long flags) { struct fwnode_handle *ep; unsigned int count = 0; fwnode_graph_for_each_endpoint(fwnode, ep) { if (flags & FWNODE_GRAPH_DEVICE_DISABLED || fwnode_graph_remote_available(ep)) count++; } return count; } EXPORT_SYMBOL_GPL(fwnode_graph_get_endpoint_count); /** * fwnode_graph_parse_endpoint - parse common endpoint node properties * @fwnode: pointer to endpoint fwnode_handle * @endpoint: pointer to the fwnode endpoint data structure * * Parse @fwnode representing a graph endpoint node and store the * information in @endpoint. The caller must hold a reference to * @fwnode. */ int fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode, struct fwnode_endpoint *endpoint) { memset(endpoint, 0, sizeof(*endpoint)); return fwnode_call_int_op(fwnode, graph_parse_endpoint, endpoint); } EXPORT_SYMBOL(fwnode_graph_parse_endpoint); const void *device_get_match_data(const struct device *dev) { return fwnode_call_ptr_op(dev_fwnode(dev), device_get_match_data, dev); } EXPORT_SYMBOL_GPL(device_get_match_data); static unsigned int fwnode_graph_devcon_matches(const struct fwnode_handle *fwnode, const char *con_id, void *data, devcon_match_fn_t match, void **matches, unsigned int matches_len) { struct fwnode_handle *node; struct fwnode_handle *ep; unsigned int count = 0; void *ret; fwnode_graph_for_each_endpoint(fwnode, ep) { if (matches && count >= matches_len) { fwnode_handle_put(ep); break; } node = fwnode_graph_get_remote_port_parent(ep); if (!fwnode_device_is_available(node)) { fwnode_handle_put(node); continue; } ret = match(node, con_id, data); fwnode_handle_put(node); if (ret) { if (matches) matches[count] = ret; count++; } } return count; } static unsigned int fwnode_devcon_matches(const struct fwnode_handle *fwnode, const char *con_id, void *data, devcon_match_fn_t match, void **matches, unsigned int matches_len) { struct fwnode_handle *node; unsigned int count = 0; unsigned int i; void *ret; for (i = 0; ; i++) { if (matches && count >= matches_len) break; node = fwnode_find_reference(fwnode, con_id, i); if (IS_ERR(node)) break; ret = match(node, NULL, data); fwnode_handle_put(node); if (ret) { if (matches) matches[count] = ret; count++; } } return count; } /** * fwnode_connection_find_match - Find connection from a device node * @fwnode: Device node with the connection * @con_id: Identifier for the connection * @data: Data for the match function * @match: Function to check and convert the connection description * * Find a connection with unique identifier @con_id between @fwnode and another * device node. @match will be used to convert the connection description to * data the caller is expecting to be returned. 
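 *
 * Example (an illustrative sketch; the match callback, the connection
 * identifier and my_lookup() are hypothetical)::
 *
 *	static void *my_match(const struct fwnode_handle *fwnode,
 *			      const char *id, void *data)
 *	{
 *		return my_lookup(fwnode); /* non-NULL when @fwnode matches */
 *	}
 *
 *	cookie = fwnode_connection_find_match(dev_fwnode(dev), "my-con",
 *					      NULL, my_match);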
*/ void *fwnode_connection_find_match(const struct fwnode_handle *fwnode, const char *con_id, void *data, devcon_match_fn_t match) { unsigned int count; void *ret; if (!fwnode || !match) return NULL; count = fwnode_graph_devcon_matches(fwnode, con_id, data, match, &ret, 1); if (count) return ret; count = fwnode_devcon_matches(fwnode, con_id, data, match, &ret, 1); return count ? ret : NULL; } EXPORT_SYMBOL_GPL(fwnode_connection_find_match); /** * fwnode_connection_find_matches - Find connections from a device node * @fwnode: Device node with the connection * @con_id: Identifier for the connection * @data: Data for the match function * @match: Function to check and convert the connection description * @matches: (Optional) array of pointers to fill with matches * @matches_len: Length of @matches * * Find up to @matches_len connections with unique identifier @con_id between * @fwnode and other device nodes. @match will be used to convert the * connection description to data the caller is expecting to be returned * through the @matches array. * * If @matches is %NULL @matches_len is ignored and the total number of resolved * matches is returned. * * Return: Number of matches resolved, or negative errno. */ int fwnode_connection_find_matches(const struct fwnode_handle *fwnode, const char *con_id, void *data, devcon_match_fn_t match, void **matches, unsigned int matches_len) { unsigned int count_graph; unsigned int count_ref; if (!fwnode || !match) return -EINVAL; count_graph = fwnode_graph_devcon_matches(fwnode, con_id, data, match, matches, matches_len); if (matches) { matches += count_graph; matches_len -= count_graph; } count_ref = fwnode_devcon_matches(fwnode, con_id, data, match, matches, matches_len); return count_graph + count_ref; } EXPORT_SYMBOL_GPL(fwnode_connection_find_matches); |
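/*
 * Example usage (an illustrative sketch, not part of the code above): resolve
 * up to four connections for a hypothetical "example-con" identifier. The
 * match criterion below is deliberately simplistic; a real driver would
 * typically convert the matching node into a driver-specific object instead.
 */
static void *example_match(const struct fwnode_handle *fwnode,
			   const char *id, void *data)
{
	/* Hypothetical criterion: any enabled node counts as a match. */
	return fwnode_device_is_available(fwnode) ? data : NULL;
}

static int example_count_connections(struct device *dev)
{
	void *matches[4];

	/* Returns the number of matches resolved, or a negative errno. */
	return fwnode_connection_find_matches(dev_fwnode(dev), "example-con",
					      dev, example_match,
					      matches, ARRAY_SIZE(matches));
}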
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_SCHED_GENERIC_H #define __NET_SCHED_GENERIC_H #include <linux/netdevice.h> #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/pkt_sched.h> #include <linux/pkt_cls.h> #include <linux/percpu.h> #include <linux/dynamic_queue_limits.h> #include <linux/list.h> #include <linux/refcount.h> #include <linux/workqueue.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/atomic.h> #include <linux/hashtable.h> #include <net/gen_stats.h> #include <net/rtnetlink.h> #include <net/flow_offload.h> #include <linux/xarray.h> #include <net/dropreason-qdisc.h> struct Qdisc_ops; struct qdisc_walker; struct
tcf_walker; struct module; struct bpf_flow_keys; struct Qdisc; struct netdev_queue; struct qdisc_rate_table { struct tc_ratespec rate; u32 data[256]; struct qdisc_rate_table *next; int refcnt; }; enum qdisc_state_t { __QDISC_STATE_SCHED, __QDISC_STATE_DEACTIVATED, __QDISC_STATE_MISSED, __QDISC_STATE_DRAINING, }; #define QDISC_STATE_MISSED BIT(__QDISC_STATE_MISSED) #define QDISC_STATE_DRAINING BIT(__QDISC_STATE_DRAINING) #define QDISC_STATE_NON_EMPTY (QDISC_STATE_MISSED | \ QDISC_STATE_DRAINING) struct qdisc_size_table { struct rcu_head rcu; struct list_head list; struct tc_sizespec szopts; int refcnt; u16 data[]; }; /* similar to sk_buff_head, but skb->prev pointer is undefined. */ struct qdisc_skb_head { struct sk_buff *head; struct sk_buff *tail; __u32 qlen; spinlock_t lock; }; struct Qdisc { int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free); struct sk_buff * (*dequeue)(struct Qdisc *sch); unsigned int flags; #define TCQ_F_BUILTIN 1 #define TCQ_F_INGRESS 2 #define TCQ_F_CAN_BYPASS 4 #define TCQ_F_MQROOT 8 #define TCQ_F_ONETXQUEUE 0x10 /* dequeue_skb() can assume all skbs are for * q->dev_queue : It can test * netif_xmit_frozen_or_stopped() before * dequeueing next packet. * Its true for MQ/MQPRIO slaves, or non * multiqueue device. */ #define TCQ_F_WARN_NONWC (1 << 16) #define TCQ_F_CPUSTATS 0x20 /* run using percpu statistics */ #define TCQ_F_NOPARENT 0x40 /* root of its hierarchy : * qdisc_tree_decrease_qlen() should stop. */ #define TCQ_F_INVISIBLE 0x80 /* invisible by default in dump */ #define TCQ_F_NOLOCK 0x100 /* qdisc does not require locking */ #define TCQ_F_OFFLOADED 0x200 /* qdisc is offloaded to HW */ #define TCQ_F_DEQUEUE_DROPS 0x400 /* ->dequeue() can drop packets in q->to_free */ u32 limit; const struct Qdisc_ops *ops; struct qdisc_size_table __rcu *stab; struct hlist_node hash; u32 handle; u32 parent; struct netdev_queue *dev_queue; struct net_rate_estimator __rcu *rate_est; struct gnet_stats_basic_sync __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; int pad; refcount_t refcnt; /* Cache line potentially dirtied in dequeue() or __netif_reschedule(). */ __cacheline_group_begin(Qdisc_read_mostly) ____cacheline_aligned; struct sk_buff_head gso_skb; struct Qdisc *next_sched; struct sk_buff_head skb_bad_txq; __cacheline_group_end(Qdisc_read_mostly); /* Fields dirtied in dequeue() fast path. */ __cacheline_group_begin(Qdisc_write) ____cacheline_aligned; struct qdisc_skb_head q; unsigned long state; struct gnet_stats_basic_sync bstats; bool running; /* must be written under qdisc spinlock */ /* Note : we only change qstats.backlog in fast path. */ struct gnet_stats_queue qstats; struct sk_buff *to_free; __cacheline_group_end(Qdisc_write); atomic_long_t defer_count ____cacheline_aligned_in_smp; struct llist_head defer_list; spinlock_t seqlock; struct rcu_head rcu; netdevice_tracker dev_tracker; struct lock_class_key root_lock_key; /* private data */ long privdata[] ____cacheline_aligned; }; static inline void qdisc_refcount_inc(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN) return; refcount_inc(&qdisc->refcnt); } static inline bool qdisc_refcount_dec_if_one(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN) return true; return refcount_dec_if_one(&qdisc->refcnt); } /* Intended to be used by unlocked users, when concurrent qdisc release is * possible. 
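 *
 * A typical pattern (an illustrative sketch; assumes the qdisc pointer was
 * read from an RCU-protected location and RTNL is not held):
 *
 *	rcu_read_lock();
 *	q = qdisc_refcount_inc_nz(rcu_dereference(txq->qdisc));
 *	rcu_read_unlock();
 *	if (q) {
 *		... use q ...
 *		qdisc_put_unlocked(q);
 *	}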
*/ static inline struct Qdisc *qdisc_refcount_inc_nz(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN) return qdisc; if (refcount_inc_not_zero(&qdisc->refcnt)) return qdisc; return NULL; } /* For !TCQ_F_NOLOCK qdisc: callers must either call this within a qdisc * root_lock section, or provide their own memory barriers -- ordering * against qdisc_run_begin/end() atomic bit operations. */ static inline bool qdisc_is_running(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) return spin_is_locked(&qdisc->seqlock); return READ_ONCE(qdisc->running); } static inline bool nolock_qdisc_is_empty(const struct Qdisc *qdisc) { return !(READ_ONCE(qdisc->state) & QDISC_STATE_NON_EMPTY); } static inline bool qdisc_is_percpu_stats(const struct Qdisc *q) { return q->flags & TCQ_F_CPUSTATS; } static inline bool qdisc_is_empty(const struct Qdisc *qdisc) { if (qdisc_is_percpu_stats(qdisc)) return nolock_qdisc_is_empty(qdisc); return !READ_ONCE(qdisc->q.qlen); } /* For !TCQ_F_NOLOCK qdisc, qdisc_run_begin/end() must be invoked with * the qdisc root lock acquired. */ static inline bool qdisc_run_begin(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) { if (spin_trylock(&qdisc->seqlock)) return true; /* No need to insist if the MISSED flag was already set. * Note that test_and_set_bit() also gives us memory ordering * guarantees wrt potential earlier enqueue() and below * spin_trylock(), both of which are necessary to prevent races */ if (test_and_set_bit(__QDISC_STATE_MISSED, &qdisc->state)) return false; /* Try to take the lock again to make sure that we will either * grab it or the CPU that still has it will see MISSED set * when testing it in qdisc_run_end() */ return spin_trylock(&qdisc->seqlock); } if (READ_ONCE(qdisc->running)) return false; WRITE_ONCE(qdisc->running, true); return true; } static inline struct sk_buff *qdisc_run_end(struct Qdisc *qdisc) { struct sk_buff *to_free = NULL; if (qdisc->flags & TCQ_F_NOLOCK) { spin_unlock(&qdisc->seqlock); /* spin_unlock() only has store-release semantic. The unlock * and test_bit() ordering is a store-load ordering, so a full * memory barrier is needed here. 
*/ smp_mb(); if (unlikely(test_bit(__QDISC_STATE_MISSED, &qdisc->state))) __netif_schedule(qdisc); return NULL; } if (qdisc->flags & TCQ_F_DEQUEUE_DROPS) { to_free = qdisc->to_free; if (to_free) qdisc->to_free = NULL; } WRITE_ONCE(qdisc->running, false); return to_free; } static inline bool qdisc_may_bulk(const struct Qdisc *qdisc) { return qdisc->flags & TCQ_F_ONETXQUEUE; } static inline int qdisc_avail_bulklimit(const struct netdev_queue *txq) { return netdev_queue_dql_avail(txq); } struct Qdisc_class_ops { unsigned int flags; /* Child qdisc manipulation */ struct netdev_queue * (*select_queue)(struct Qdisc *, struct tcmsg *); int (*graft)(struct Qdisc *, unsigned long cl, struct Qdisc *, struct Qdisc **, struct netlink_ext_ack *extack); struct Qdisc * (*leaf)(struct Qdisc *, unsigned long cl); void (*qlen_notify)(struct Qdisc *, unsigned long); /* Class manipulation routines */ unsigned long (*find)(struct Qdisc *, u32 classid); int (*change)(struct Qdisc *, u32, u32, struct nlattr **, unsigned long *, struct netlink_ext_ack *); int (*delete)(struct Qdisc *, unsigned long, struct netlink_ext_ack *); void (*walk)(struct Qdisc *, struct qdisc_walker * arg); /* Filter manipulation */ struct tcf_block * (*tcf_block)(struct Qdisc *sch, unsigned long arg, struct netlink_ext_ack *extack); unsigned long (*bind_tcf)(struct Qdisc *, unsigned long, u32 classid); void (*unbind_tcf)(struct Qdisc *, unsigned long); /* rtnetlink specific */ int (*dump)(struct Qdisc *, unsigned long, struct sk_buff *skb, struct tcmsg*); int (*dump_stats)(struct Qdisc *, unsigned long, struct gnet_dump *); }; /* Qdisc_class_ops flag values */ /* Implements API that doesn't require rtnl lock */ enum qdisc_class_ops_flags { QDISC_CLASS_OPS_DOIT_UNLOCKED = 1, }; struct Qdisc_ops { struct Qdisc_ops *next; const struct Qdisc_class_ops *cl_ops; char id[IFNAMSIZ]; int priv_size; unsigned int static_flags; int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free); struct sk_buff * (*dequeue)(struct Qdisc *); struct sk_buff * (*peek)(struct Qdisc *); int (*init)(struct Qdisc *sch, struct nlattr *arg, struct netlink_ext_ack *extack); void (*reset)(struct Qdisc *); void (*destroy)(struct Qdisc *); int (*change)(struct Qdisc *sch, struct nlattr *arg, struct netlink_ext_ack *extack); void (*attach)(struct Qdisc *sch); int (*change_tx_queue_len)(struct Qdisc *, unsigned int); void (*change_real_num_tx)(struct Qdisc *sch, unsigned int new_real_tx); int (*dump)(struct Qdisc *, struct sk_buff *); int (*dump_stats)(struct Qdisc *, struct gnet_dump *); void (*ingress_block_set)(struct Qdisc *sch, u32 block_index); void (*egress_block_set)(struct Qdisc *sch, u32 block_index); u32 (*ingress_block_get)(struct Qdisc *sch); u32 (*egress_block_get)(struct Qdisc *sch); struct module *owner; }; struct tcf_result { union { struct { unsigned long class; u32 classid; }; const struct tcf_proto *goto_tp; }; }; struct tcf_chain; struct tcf_proto_ops { struct list_head head; char kind[IFNAMSIZ]; int (*classify)(struct sk_buff *, const struct tcf_proto *, struct tcf_result *); int (*init)(struct tcf_proto*); void (*destroy)(struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack); void* (*get)(struct tcf_proto*, u32 handle); void (*put)(struct tcf_proto *tp, void *f); int (*change)(struct net *net, struct sk_buff *, struct tcf_proto*, unsigned long, u32 handle, struct nlattr **, void **, u32, struct netlink_ext_ack *); int (*delete)(struct tcf_proto *tp, void *arg, bool *last, bool rtnl_held, struct netlink_ext_ack *); 
bool (*delete_empty)(struct tcf_proto *tp); void (*walk)(struct tcf_proto *tp, struct tcf_walker *arg, bool rtnl_held); int (*reoffload)(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, void *cb_priv, struct netlink_ext_ack *extack); void (*hw_add)(struct tcf_proto *tp, void *type_data); void (*hw_del)(struct tcf_proto *tp, void *type_data); void (*bind_class)(void *, u32, unsigned long, void *, unsigned long); void * (*tmplt_create)(struct net *net, struct tcf_chain *chain, struct nlattr **tca, struct netlink_ext_ack *extack); void (*tmplt_destroy)(void *tmplt_priv); void (*tmplt_reoffload)(struct tcf_chain *chain, bool add, flow_setup_cb_t *cb, void *cb_priv); struct tcf_exts * (*get_exts)(const struct tcf_proto *tp, u32 handle); /* rtnetlink specific */ int (*dump)(struct net*, struct tcf_proto*, void *, struct sk_buff *skb, struct tcmsg*, bool); int (*terse_dump)(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t, bool rtnl_held); int (*tmplt_dump)(struct sk_buff *skb, struct net *net, void *tmplt_priv); struct module *owner; int flags; }; /* Classifiers setting TCF_PROTO_OPS_DOIT_UNLOCKED in tcf_proto_ops->flags * are expected to implement tcf_proto_ops->delete_empty(), otherwise race * conditions can occur when filters are inserted/deleted simultaneously. */ enum tcf_proto_ops_flags { TCF_PROTO_OPS_DOIT_UNLOCKED = 1, }; struct tcf_proto { /* Fast access part */ struct tcf_proto __rcu *next; void __rcu *root; /* called under RCU BH lock*/ int (*classify)(struct sk_buff *, const struct tcf_proto *, struct tcf_result *); __be16 protocol; /* All the rest */ u32 prio; void *data; const struct tcf_proto_ops *ops; struct tcf_chain *chain; /* Lock protects tcf_proto shared state and can be used by unlocked * classifiers to protect their private data. */ spinlock_t lock; bool deleting; bool counted; bool usesw; refcount_t refcnt; struct rcu_head rcu; struct hlist_node destroy_ht_node; }; struct qdisc_skb_cb { unsigned int pkt_len; u16 pkt_segs; u16 tc_classid; #define QDISC_CB_PRIV_LEN 20 unsigned char data[QDISC_CB_PRIV_LEN]; u16 slave_dev_queue_mapping; u8 post_ct:1; u8 post_ct_snat:1; u8 post_ct_dnat:1; }; typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv); struct tcf_chain { /* Protects filter_chain. */ struct mutex filter_chain_lock; struct tcf_proto __rcu *filter_chain; struct list_head list; struct tcf_block *block; u32 index; /* chain index */ unsigned int refcnt; unsigned int action_refcnt; bool explicitly_created; bool flushing; const struct tcf_proto_ops *tmplt_ops; void *tmplt_priv; struct rcu_head rcu; }; struct tcf_block { struct xarray ports; /* datapath accessible */ /* Lock protects tcf_block and lifetime-management data of chains * attached to the block (refcnt, action_refcnt, explicitly_created). */ struct mutex lock; struct list_head chain_list; u32 index; /* block index for shared blocks */ u32 classid; /* which class this block belongs to */ refcount_t refcnt; struct net *net; struct Qdisc *q; struct rw_semaphore cb_lock; /* protects cb_list and offload counters */ struct flow_block flow_block; struct list_head owner_list; bool keep_dst; atomic_t useswcnt; atomic_t offloadcnt; /* Number of oddloaded filters */ unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */ unsigned int lockeddevcnt; /* Number of devs that require rtnl lock. 
*/ struct { struct tcf_chain *chain; struct list_head filter_chain_list; } chain0; struct rcu_head rcu; DECLARE_HASHTABLE(proto_destroy_ht, 7); struct mutex proto_destroy_lock; /* Lock for proto_destroy hashtable. */ }; struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index); static inline bool lockdep_tcf_chain_is_locked(struct tcf_chain *chain) { return lockdep_is_held(&chain->filter_chain_lock); } static inline bool lockdep_tcf_proto_is_locked(struct tcf_proto *tp) { return lockdep_is_held(&tp->lock); } #define tcf_chain_dereference(p, chain) \ rcu_dereference_protected(p, lockdep_tcf_chain_is_locked(chain)) #define tcf_proto_dereference(p, tp) \ rcu_dereference_protected(p, lockdep_tcf_proto_is_locked(tp)) static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) { struct qdisc_skb_cb *qcb; BUILD_BUG_ON(sizeof(skb->cb) < sizeof(*qcb)); BUILD_BUG_ON(sizeof(qcb->data) < sz); } static inline int qdisc_qlen(const struct Qdisc *q) { return q->q.qlen; } static inline int qdisc_qlen_sum(const struct Qdisc *q) { __u32 qlen = q->qstats.qlen; int i; if (qdisc_is_percpu_stats(q)) { for_each_possible_cpu(i) qlen += per_cpu_ptr(q->cpu_qstats, i)->qlen; } else { qlen += q->q.qlen; } return qlen; } static inline struct qdisc_skb_cb *qdisc_skb_cb(const struct sk_buff *skb) { return (struct qdisc_skb_cb *)skb->cb; } static inline spinlock_t *qdisc_lock(struct Qdisc *qdisc) { return &qdisc->q.lock; } static inline struct Qdisc *qdisc_root(const struct Qdisc *qdisc) { struct Qdisc *q = rcu_dereference_rtnl(qdisc->dev_queue->qdisc); return q; } static inline struct Qdisc *qdisc_root_bh(const struct Qdisc *qdisc) { return rcu_dereference_bh(qdisc->dev_queue->qdisc); } static inline struct Qdisc *qdisc_root_sleeping(const struct Qdisc *qdisc) { return rcu_dereference_rtnl(qdisc->dev_queue->qdisc_sleeping); } static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc) { struct Qdisc *root = qdisc_root_sleeping(qdisc); ASSERT_RTNL(); return qdisc_lock(root); } static inline struct net_device *qdisc_dev(const struct Qdisc *qdisc) { return qdisc->dev_queue->dev; } static inline void sch_tree_lock(struct Qdisc *q) { if (q->flags & TCQ_F_MQROOT) spin_lock_bh(qdisc_lock(q)); else spin_lock_bh(qdisc_root_sleeping_lock(q)); } static inline void sch_tree_unlock(struct Qdisc *q) { if (q->flags & TCQ_F_MQROOT) spin_unlock_bh(qdisc_lock(q)); else spin_unlock_bh(qdisc_root_sleeping_lock(q)); } extern struct Qdisc noop_qdisc; extern struct Qdisc_ops noop_qdisc_ops; extern struct Qdisc_ops pfifo_fast_ops; extern const u8 sch_default_prio2band[TC_PRIO_MAX + 1]; extern struct Qdisc_ops mq_qdisc_ops; extern struct Qdisc_ops noqueue_qdisc_ops; extern const struct Qdisc_ops *default_qdisc_ops; static inline const struct Qdisc_ops * get_default_qdisc_ops(const struct net_device *dev, int ntx) { return ntx < dev->real_num_tx_queues ? 
default_qdisc_ops : &pfifo_fast_ops; } struct Qdisc_class_common { u32 classid; unsigned int filter_cnt; struct hlist_node hnode; }; struct Qdisc_class_hash { struct hlist_head *hash; unsigned int hashsize; unsigned int hashmask; unsigned int hashelems; }; static inline unsigned int qdisc_class_hash(u32 id, u32 mask) { id ^= id >> 8; id ^= id >> 4; return id & mask; } static inline struct Qdisc_class_common * qdisc_class_find(const struct Qdisc_class_hash *hash, u32 id) { struct Qdisc_class_common *cl; unsigned int h; if (!id) return NULL; h = qdisc_class_hash(id, hash->hashmask); hlist_for_each_entry(cl, &hash->hash[h], hnode) { if (cl->classid == id) return cl; } return NULL; } static inline bool qdisc_class_in_use(const struct Qdisc_class_common *cl) { return cl->filter_cnt > 0; } static inline void qdisc_class_get(struct Qdisc_class_common *cl) { unsigned int res; if (check_add_overflow(cl->filter_cnt, 1, &res)) WARN(1, "Qdisc class overflow"); cl->filter_cnt = res; } static inline void qdisc_class_put(struct Qdisc_class_common *cl) { unsigned int res; if (check_sub_overflow(cl->filter_cnt, 1, &res)) WARN(1, "Qdisc class underflow"); cl->filter_cnt = res; } static inline int tc_classid_to_hwtc(struct net_device *dev, u32 classid) { u32 hwtc = TC_H_MIN(classid) - TC_H_MIN_PRIORITY; return (hwtc < netdev_get_num_tc(dev)) ? hwtc : -EINVAL; } int qdisc_class_hash_init(struct Qdisc_class_hash *); void qdisc_class_hash_insert(struct Qdisc_class_hash *, struct Qdisc_class_common *); void qdisc_class_hash_remove(struct Qdisc_class_hash *, struct Qdisc_class_common *); void qdisc_class_hash_grow(struct Qdisc *, struct Qdisc_class_hash *); void qdisc_class_hash_destroy(struct Qdisc_class_hash *); int dev_qdisc_change_tx_queue_len(struct net_device *dev); void dev_qdisc_change_real_num_tx(struct net_device *dev, unsigned int new_real_tx); void dev_init_scheduler(struct net_device *dev); void dev_shutdown(struct net_device *dev); void dev_activate(struct net_device *dev); void dev_deactivate(struct net_device *dev, bool reset_needed); void dev_deactivate_many(struct list_head *head, bool reset_needed); struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, struct Qdisc *qdisc); void qdisc_reset(struct Qdisc *qdisc); void qdisc_destroy(struct Qdisc *qdisc); void qdisc_put(struct Qdisc *qdisc); void qdisc_put_unlocked(struct Qdisc *qdisc); void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, int n, int len); static inline void dev_reset_queue(struct net_device *dev, struct netdev_queue *dev_queue, void *_unused) { struct Qdisc *qdisc; bool nolock; qdisc = rtnl_dereference(dev_queue->qdisc_sleeping); if (!qdisc) return; nolock = qdisc->flags & TCQ_F_NOLOCK; if (nolock) spin_lock_bh(&qdisc->seqlock); spin_lock_bh(qdisc_lock(qdisc)); qdisc_reset(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); if (nolock) { clear_bit(__QDISC_STATE_MISSED, &qdisc->state); clear_bit(__QDISC_STATE_DRAINING, &qdisc->state); spin_unlock_bh(&qdisc->seqlock); } } #ifdef CONFIG_NET_SCHED int qdisc_offload_dump_helper(struct Qdisc *q, enum tc_setup_type type, void *type_data); void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch, struct Qdisc *new, struct Qdisc *old, enum tc_setup_type type, void *type_data, struct netlink_ext_ack *extack); #else static inline int qdisc_offload_dump_helper(struct Qdisc *q, enum tc_setup_type type, void *type_data) { q->flags &= ~TCQ_F_OFFLOADED; return 0; } static inline void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch, struct Qdisc *new, struct 
Qdisc *old, enum tc_setup_type type, void *type_data, struct netlink_ext_ack *extack) { } #endif void qdisc_offload_query_caps(struct net_device *dev, enum tc_setup_type type, void *caps, size_t caps_len); struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, struct netlink_ext_ack *extack); void qdisc_free(struct Qdisc *qdisc); struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, u32 parentid, struct netlink_ext_ack *extack); void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab); int skb_do_redirect(struct sk_buff *); static inline bool skb_at_tc_ingress(const struct sk_buff *skb) { #ifdef CONFIG_NET_XGRESS return skb->tc_at_ingress; #else return false; #endif } static inline bool skb_skip_tc_classify(struct sk_buff *skb) { #ifdef CONFIG_NET_CLS_ACT if (skb->tc_skip_classify) { skb->tc_skip_classify = 0; return true; } #endif return false; } /* Reset all TX qdiscs greater than index of a device. */ static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i) { struct Qdisc *qdisc; bool nolock; for (; i < dev->num_tx_queues; i++) { qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc); if (qdisc) { nolock = qdisc->flags & TCQ_F_NOLOCK; if (nolock) spin_lock_bh(&qdisc->seqlock); spin_lock_bh(qdisc_lock(qdisc)); qdisc_reset(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); if (nolock) { clear_bit(__QDISC_STATE_MISSED, &qdisc->state); clear_bit(__QDISC_STATE_DRAINING, &qdisc->state); spin_unlock_bh(&qdisc->seqlock); } } } } /* Are all TX queues of the device empty? */ static inline bool qdisc_all_tx_empty(const struct net_device *dev) { unsigned int i; rcu_read_lock(); for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); const struct Qdisc *q = rcu_dereference(txq->qdisc); if (!qdisc_is_empty(q)) { rcu_read_unlock(); return false; } } rcu_read_unlock(); return true; } /* Are any of the TX qdiscs changing? */ static inline bool qdisc_tx_changing(const struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); if (rcu_access_pointer(txq->qdisc) != rcu_access_pointer(txq->qdisc_sleeping)) return true; } return false; } /* "noqueue" qdisc identified by not having any enqueue, see noqueue_init() */ static inline bool qdisc_txq_has_no_queue(const struct netdev_queue *txq) { struct Qdisc *qdisc = rcu_access_pointer(txq->qdisc); return qdisc->enqueue == NULL; } /* Is the device using the noop qdisc on all queues? */ static inline bool qdisc_tx_is_noop(const struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); if (rcu_access_pointer(txq->qdisc) != &noop_qdisc) return false; } return true; } static inline unsigned int qdisc_pkt_len(const struct sk_buff *skb) { return qdisc_skb_cb(skb)->pkt_len; } static inline unsigned int qdisc_pkt_segs(const struct sk_buff *skb) { u32 pkt_segs = qdisc_skb_cb(skb)->pkt_segs; DEBUG_NET_WARN_ON_ONCE(pkt_segs != (skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1)); return pkt_segs; } /* additional qdisc xmit flags (NET_XMIT_MASK in linux/netdevice.h) */ enum net_xmit_qdisc_t { __NET_XMIT_STOLEN = 0x00010000, __NET_XMIT_BYPASS = 0x00020000, }; #ifdef CONFIG_NET_CLS_ACT #define net_xmit_drop_count(e) ((e) & __NET_XMIT_STOLEN ? 
0 : 1) #else #define net_xmit_drop_count(e) (1) #endif static inline void qdisc_calculate_pkt_len(struct sk_buff *skb, const struct Qdisc *sch) { #ifdef CONFIG_NET_SCHED struct qdisc_size_table *stab = rcu_dereference_bh(sch->stab); if (stab) __qdisc_calculate_pkt_len(skb, stab); #endif } static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { return sch->enqueue(skb, sch, to_free); } static inline void _bstats_update(struct gnet_stats_basic_sync *bstats, __u64 bytes, __u64 packets) { u64_stats_update_begin(&bstats->syncp); u64_stats_add(&bstats->bytes, bytes); u64_stats_add(&bstats->packets, packets); u64_stats_update_end(&bstats->syncp); } static inline void bstats_update(struct gnet_stats_basic_sync *bstats, const struct sk_buff *skb) { _bstats_update(bstats, qdisc_pkt_len(skb), qdisc_pkt_segs(skb)); } static inline void qdisc_bstats_cpu_update(struct Qdisc *sch, const struct sk_buff *skb) { bstats_update(this_cpu_ptr(sch->cpu_bstats), skb); } static inline void qdisc_bstats_update(struct Qdisc *sch, const struct sk_buff *skb) { bstats_update(&sch->bstats, skb); } static inline void qdisc_qstats_backlog_dec(struct Qdisc *sch, const struct sk_buff *skb) { sch->qstats.backlog -= qdisc_pkt_len(skb); } static inline void qdisc_qstats_cpu_backlog_dec(struct Qdisc *sch, const struct sk_buff *skb) { this_cpu_sub(sch->cpu_qstats->backlog, qdisc_pkt_len(skb)); } static inline void qdisc_qstats_backlog_inc(struct Qdisc *sch, const struct sk_buff *skb) { sch->qstats.backlog += qdisc_pkt_len(skb); } static inline void qdisc_qstats_cpu_backlog_inc(struct Qdisc *sch, const struct sk_buff *skb) { this_cpu_add(sch->cpu_qstats->backlog, qdisc_pkt_len(skb)); } static inline void qdisc_qstats_cpu_qlen_inc(struct Qdisc *sch) { this_cpu_inc(sch->cpu_qstats->qlen); } static inline void qdisc_qstats_cpu_qlen_dec(struct Qdisc *sch) { this_cpu_dec(sch->cpu_qstats->qlen); } static inline void qdisc_qstats_cpu_requeues_inc(struct Qdisc *sch) { this_cpu_inc(sch->cpu_qstats->requeues); } static inline void __qdisc_qstats_drop(struct Qdisc *sch, int count) { sch->qstats.drops += count; } static inline void qstats_drop_inc(struct gnet_stats_queue *qstats) { qstats->drops++; } static inline void qstats_overlimit_inc(struct gnet_stats_queue *qstats) { qstats->overlimits++; } static inline void qdisc_qstats_drop(struct Qdisc *sch) { qstats_drop_inc(&sch->qstats); } static inline void qdisc_qstats_cpu_drop(struct Qdisc *sch) { this_cpu_inc(sch->cpu_qstats->drops); } static inline void qdisc_qstats_overlimit(struct Qdisc *sch) { sch->qstats.overlimits++; } static inline int qdisc_qstats_copy(struct gnet_dump *d, struct Qdisc *sch) { __u32 qlen = qdisc_qlen_sum(sch); return gnet_stats_copy_queue(d, sch->cpu_qstats, &sch->qstats, qlen); } static inline void qdisc_qstats_qlen_backlog(struct Qdisc *sch, __u32 *qlen, __u32 *backlog) { struct gnet_stats_queue qstats = { 0 }; gnet_stats_add_queue(&qstats, sch->cpu_qstats, &sch->qstats); *qlen = qstats.qlen + qdisc_qlen(sch); *backlog = qstats.backlog; } static inline void qdisc_purge_queue(struct Qdisc *sch) { __u32 qlen, backlog; qdisc_qstats_qlen_backlog(sch, &qlen, &backlog); qdisc_reset(sch); qdisc_tree_reduce_backlog(sch, qlen, backlog); } static inline void __qdisc_enqueue_tail(struct sk_buff *skb, struct qdisc_skb_head *qh) { struct sk_buff *last = qh->tail; if (last) { skb->next = NULL; last->next = skb; qh->tail = skb; } else { qh->tail = skb; qh->head = skb; } qh->qlen++; } static inline int qdisc_enqueue_tail(struct sk_buff 
*skb, struct Qdisc *sch) { __qdisc_enqueue_tail(skb, &sch->q); qdisc_qstats_backlog_inc(sch, skb); return NET_XMIT_SUCCESS; } static inline void __qdisc_enqueue_head(struct sk_buff *skb, struct qdisc_skb_head *qh) { skb->next = qh->head; if (!qh->head) qh->tail = skb; qh->head = skb; qh->qlen++; } static inline struct sk_buff *__qdisc_dequeue_head(struct qdisc_skb_head *qh) { struct sk_buff *skb = qh->head; if (likely(skb != NULL)) { qh->head = skb->next; qh->qlen--; if (qh->head == NULL) qh->tail = NULL; skb->next = NULL; } return skb; } static inline struct sk_buff *qdisc_dequeue_internal(struct Qdisc *sch, bool direct) { struct sk_buff *skb; skb = __skb_dequeue(&sch->gso_skb); if (skb) { sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); return skb; } if (direct) { skb = __qdisc_dequeue_head(&sch->q); if (skb) qdisc_qstats_backlog_dec(sch, skb); return skb; } else { return sch->dequeue(sch); } } static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) { struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); if (likely(skb != NULL)) { qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); } return skb; } struct tc_skb_cb { struct qdisc_skb_cb qdisc_cb; u32 drop_reason; u16 zone; /* Only valid if qdisc_skb_cb(skb)->post_ct = true */ u16 mru; }; static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb) { struct tc_skb_cb *cb = (struct tc_skb_cb *)skb->cb; BUILD_BUG_ON(sizeof(*cb) > sizeof_field(struct sk_buff, cb)); return cb; } /* TC classifier accessors - use enum skb_drop_reason */ static inline enum skb_drop_reason tcf_get_drop_reason(const struct sk_buff *skb) { return (enum skb_drop_reason)tc_skb_cb(skb)->drop_reason; } static inline void tcf_set_drop_reason(const struct sk_buff *skb, enum skb_drop_reason reason) { tc_skb_cb(skb)->drop_reason = (enum qdisc_drop_reason)reason; } /* Qdisc accessors - use enum qdisc_drop_reason */ static inline enum qdisc_drop_reason tcf_get_qdisc_drop_reason(const struct sk_buff *skb) { return tc_skb_cb(skb)->drop_reason; } static inline void tcf_set_qdisc_drop_reason(const struct sk_buff *skb, enum qdisc_drop_reason reason) { tc_skb_cb(skb)->drop_reason = reason; } void __tcf_kfree_skb_list(struct sk_buff *skb, struct Qdisc *q, struct netdev_queue *txq, struct net_device *dev); static inline void tcf_kfree_skb_list(struct sk_buff *skb, struct Qdisc *q, struct netdev_queue *txq, struct net_device *dev) { if (unlikely(skb)) __tcf_kfree_skb_list(skb, q, txq, dev); } static inline void qdisc_dequeue_drop(struct Qdisc *q, struct sk_buff *skb, enum qdisc_drop_reason reason) { struct Qdisc *root; DEBUG_NET_WARN_ON_ONCE(!(q->flags & TCQ_F_DEQUEUE_DROPS)); DEBUG_NET_WARN_ON_ONCE(q->flags & TCQ_F_NOLOCK); rcu_read_lock(); root = qdisc_root_sleeping(q); if (root->flags & TCQ_F_DEQUEUE_DROPS) { tcf_set_qdisc_drop_reason(skb, reason); skb->next = root->to_free; root->to_free = skb; } else { kfree_skb_reason(skb, (enum skb_drop_reason)reason); } rcu_read_unlock(); } /* Instead of calling kfree_skb() while root qdisc lock is held, * queue the skb for future freeing at end of __dev_xmit_skb() */ static inline void __qdisc_drop(struct sk_buff *skb, struct sk_buff **to_free) { skb->next = *to_free; *to_free = skb; } static inline void __qdisc_drop_all(struct sk_buff *skb, struct sk_buff **to_free) { if (skb->prev) skb->prev->next = *to_free; else skb->next = *to_free; *to_free = skb; } static inline unsigned int __qdisc_queue_drop_head(struct Qdisc *sch, struct qdisc_skb_head *qh, struct sk_buff **to_free) { struct sk_buff *skb = 
__qdisc_dequeue_head(qh); if (likely(skb != NULL)) { unsigned int len = qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); __qdisc_drop(skb, to_free); return len; } return 0; } static inline struct sk_buff *qdisc_peek_head(struct Qdisc *sch) { const struct qdisc_skb_head *qh = &sch->q; return qh->head; } /* generic pseudo peek method for non-work-conserving qdisc */ static inline struct sk_buff *qdisc_peek_dequeued(struct Qdisc *sch) { struct sk_buff *skb = skb_peek(&sch->gso_skb); /* we can reuse ->gso_skb because peek isn't called for root qdiscs */ if (!skb) { skb = sch->dequeue(sch); if (skb) { __skb_queue_head(&sch->gso_skb, skb); /* it's still part of the queue */ qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; } } return skb; } static inline void qdisc_update_stats_at_dequeue(struct Qdisc *sch, struct sk_buff *skb) { if (qdisc_is_percpu_stats(sch)) { qdisc_qstats_cpu_backlog_dec(sch, skb); qdisc_bstats_cpu_update(sch, skb); qdisc_qstats_cpu_qlen_dec(sch); } else { qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); sch->q.qlen--; } } static inline void qdisc_update_stats_at_enqueue(struct Qdisc *sch, unsigned int pkt_len) { if (qdisc_is_percpu_stats(sch)) { qdisc_qstats_cpu_qlen_inc(sch); this_cpu_add(sch->cpu_qstats->backlog, pkt_len); } else { sch->qstats.backlog += pkt_len; sch->q.qlen++; } } /* use instead of qdisc->dequeue() for all qdiscs queried with ->peek() */ static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch) { struct sk_buff *skb = skb_peek(&sch->gso_skb); if (skb) { skb = __skb_dequeue(&sch->gso_skb); if (qdisc_is_percpu_stats(sch)) { qdisc_qstats_cpu_backlog_dec(sch, skb); qdisc_qstats_cpu_qlen_dec(sch); } else { qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; } } else { skb = sch->dequeue(sch); } return skb; } static inline void __qdisc_reset_queue(struct qdisc_skb_head *qh) { /* * We do not know the backlog in bytes of this list, it * is up to the caller to correct it */ ASSERT_RTNL(); if (qh->qlen) { rtnl_kfree_skbs(qh->head, qh->tail); qh->head = NULL; qh->tail = NULL; qh->qlen = 0; } } static inline void qdisc_reset_queue(struct Qdisc *sch) { __qdisc_reset_queue(&sch->q); } static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new, struct Qdisc **pold) { struct Qdisc *old; sch_tree_lock(sch); old = *pold; *pold = new; if (old != NULL) qdisc_purge_queue(old); sch_tree_unlock(sch); return old; } static inline void rtnl_qdisc_drop(struct sk_buff *skb, struct Qdisc *sch) { rtnl_kfree_skbs(skb, skb); qdisc_qstats_drop(sch); } static inline int qdisc_drop_cpu(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { __qdisc_drop(skb, to_free); qdisc_qstats_cpu_drop(sch); return NET_XMIT_DROP; } static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { __qdisc_drop(skb, to_free); qdisc_qstats_drop(sch); return NET_XMIT_DROP; } static inline int qdisc_drop_reason(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free, enum qdisc_drop_reason reason) { tcf_set_qdisc_drop_reason(skb, reason); return qdisc_drop(skb, sch, to_free); } static inline int qdisc_drop_all(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { __qdisc_drop_all(skb, to_free); qdisc_qstats_drop(sch); return NET_XMIT_DROP; } struct psched_ratecfg { u64 rate_bytes_ps; /* bytes per second */ u32 mult; u16 overhead; u16 mpu; u8 linklayer; u8 shift; }; static inline u64 psched_l2t_ns(const struct psched_ratecfg *r, unsigned int len) { len += r->overhead; if (len < r->mpu) 
len = r->mpu; if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) return ((u64)(DIV_ROUND_UP(len,48)*53) * r->mult) >> r->shift; return ((u64)len * r->mult) >> r->shift; } void psched_ratecfg_precompute(struct psched_ratecfg *r, const struct tc_ratespec *conf, u64 rate64); static inline void psched_ratecfg_getrate(struct tc_ratespec *res, const struct psched_ratecfg *r) { memset(res, 0, sizeof(*res)); /* legacy struct tc_ratespec has a 32bit @rate field * Qdisc using 64bit rate should add new attributes * in order to maintain compatibility. */ res->rate = min_t(u64, r->rate_bytes_ps, ~0U); res->overhead = r->overhead; res->mpu = r->mpu; res->linklayer = (r->linklayer & TC_LINKLAYER_MASK); } struct psched_pktrate { u64 rate_pkts_ps; /* packets per second */ u32 mult; u8 shift; }; static inline u64 psched_pkt2t_ns(const struct psched_pktrate *r, unsigned int pkt_num) { return ((u64)pkt_num * r->mult) >> r->shift; } void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64); /* Mini Qdisc serves for specific needs of ingress/clsact Qdisc. * The fast path only needs to access filter list and to update stats */ struct mini_Qdisc { struct tcf_proto *filter_list; struct tcf_block *block; struct gnet_stats_basic_sync __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; unsigned long rcu_state; }; static inline void mini_qdisc_bstats_cpu_update(struct mini_Qdisc *miniq, const struct sk_buff *skb) { bstats_update(this_cpu_ptr(miniq->cpu_bstats), skb); } static inline void mini_qdisc_qstats_cpu_drop(struct mini_Qdisc *miniq) { this_cpu_inc(miniq->cpu_qstats->drops); } struct mini_Qdisc_pair { struct mini_Qdisc miniq1; struct mini_Qdisc miniq2; struct mini_Qdisc __rcu **p_miniq; }; void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, struct tcf_proto *tp_head); void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc, struct mini_Qdisc __rcu **p_miniq); void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp, struct tcf_block *block); static inline bool mini_qdisc_pair_inited(struct mini_Qdisc_pair *miniqp) { return !!miniqp->p_miniq; } void mq_change_real_num_tx(struct Qdisc *sch, unsigned int new_real_tx); int sch_frag_xmit_hook(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb)); /* Make sure qdisc is no longer in SCHED state. */ static inline void qdisc_synchronize(const struct Qdisc *q) { while (test_bit(__QDISC_STATE_SCHED, &q->state)) msleep(1); } #endif |
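/*
 * Example (an illustrative sketch, not part of the header above): a minimal
 * FIFO-style qdisc assembled from the helpers declared in this header. The
 * ops name, the fixed queue limit and the lack of netlink configuration are
 * hypothetical simplifications; a real qdisc would also be registered with
 * register_qdisc().
 */
static int example_fifo_init(struct Qdisc *sch, struct nlattr *opt,
			     struct netlink_ext_ack *extack)
{
	sch->limit = 128;	/* hypothetical fixed limit, in packets */
	return 0;
}

static int example_fifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
				struct sk_buff **to_free)
{
	if (unlikely(qdisc_qlen(sch) >= READ_ONCE(sch->limit)))
		return qdisc_drop(skb, sch, to_free);	/* updates drop stats */

	return qdisc_enqueue_tail(skb, sch);	/* tail insert + backlog accounting */
}

static struct sk_buff *example_fifo_dequeue(struct Qdisc *sch)
{
	return qdisc_dequeue_head(sch);		/* head removal + stats update */
}

static struct Qdisc_ops example_fifo_qdisc_ops __read_mostly = {
	.id		= "example_fifo",
	.priv_size	= 0,
	.init		= example_fifo_init,
	.enqueue	= example_fifo_enqueue,
	.dequeue	= example_fifo_dequeue,
	.peek		= qdisc_peek_head,
	.reset		= qdisc_reset_queue,
	.owner		= THIS_MODULE,
};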
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/net/dsa.h - Driver for Distributed Switch Architecture switch chips * Copyright (c) 2008-2009 Marvell Semiconductor */ #ifndef __LINUX_NET_DSA_H #define __LINUX_NET_DSA_H #include <linux/if.h> #include <linux/if_ether.h> #include <linux/list.h> #include <linux/notifier.h> #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/of.h> #include <linux/ethtool.h> #include <linux/net_tstamp.h> #include <linux/phy.h> #include <linux/platform_data/dsa.h> #include <linux/phylink.h> #include <net/devlink.h> #include <net/switchdev.h> struct dsa_8021q_context; struct tc_action; #define DSA_TAG_PROTO_NONE_VALUE 0 #define DSA_TAG_PROTO_BRCM_VALUE 1 #define DSA_TAG_PROTO_BRCM_PREPEND_VALUE 2 #define DSA_TAG_PROTO_DSA_VALUE 3 #define DSA_TAG_PROTO_EDSA_VALUE 4 #define DSA_TAG_PROTO_GSWIP_VALUE 5 #define DSA_TAG_PROTO_KSZ9477_VALUE 6 #define DSA_TAG_PROTO_KSZ9893_VALUE 7 #define DSA_TAG_PROTO_LAN9303_VALUE 8 #define DSA_TAG_PROTO_MTK_VALUE 9 #define DSA_TAG_PROTO_QCA_VALUE 10 #define DSA_TAG_PROTO_TRAILER_VALUE 11 #define DSA_TAG_PROTO_8021Q_VALUE 12 #define
DSA_TAG_PROTO_SJA1105_VALUE 13 #define DSA_TAG_PROTO_KSZ8795_VALUE 14 #define DSA_TAG_PROTO_OCELOT_VALUE 15 #define DSA_TAG_PROTO_AR9331_VALUE 16 #define DSA_TAG_PROTO_RTL4_A_VALUE 17 #define DSA_TAG_PROTO_HELLCREEK_VALUE 18 #define DSA_TAG_PROTO_XRS700X_VALUE 19 #define DSA_TAG_PROTO_OCELOT_8021Q_VALUE 20 #define DSA_TAG_PROTO_SEVILLE_VALUE 21 #define DSA_TAG_PROTO_BRCM_LEGACY_VALUE 22 #define DSA_TAG_PROTO_SJA1110_VALUE 23 #define DSA_TAG_PROTO_RTL8_4_VALUE 24 #define DSA_TAG_PROTO_RTL8_4T_VALUE 25 #define DSA_TAG_PROTO_RZN1_A5PSW_VALUE 26 #define DSA_TAG_PROTO_LAN937X_VALUE 27 #define DSA_TAG_PROTO_VSC73XX_8021Q_VALUE 28 #define DSA_TAG_PROTO_BRCM_LEGACY_FCS_VALUE 29 #define DSA_TAG_PROTO_YT921X_VALUE 30 #define DSA_TAG_PROTO_MXL_GSW1XX_VALUE 31 #define DSA_TAG_PROTO_MXL862_VALUE 32 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, DSA_TAG_PROTO_BRCM = DSA_TAG_PROTO_BRCM_VALUE, DSA_TAG_PROTO_BRCM_LEGACY = DSA_TAG_PROTO_BRCM_LEGACY_VALUE, DSA_TAG_PROTO_BRCM_LEGACY_FCS = DSA_TAG_PROTO_BRCM_LEGACY_FCS_VALUE, DSA_TAG_PROTO_BRCM_PREPEND = DSA_TAG_PROTO_BRCM_PREPEND_VALUE, DSA_TAG_PROTO_DSA = DSA_TAG_PROTO_DSA_VALUE, DSA_TAG_PROTO_EDSA = DSA_TAG_PROTO_EDSA_VALUE, DSA_TAG_PROTO_GSWIP = DSA_TAG_PROTO_GSWIP_VALUE, DSA_TAG_PROTO_KSZ9477 = DSA_TAG_PROTO_KSZ9477_VALUE, DSA_TAG_PROTO_KSZ9893 = DSA_TAG_PROTO_KSZ9893_VALUE, DSA_TAG_PROTO_LAN9303 = DSA_TAG_PROTO_LAN9303_VALUE, DSA_TAG_PROTO_MTK = DSA_TAG_PROTO_MTK_VALUE, DSA_TAG_PROTO_QCA = DSA_TAG_PROTO_QCA_VALUE, DSA_TAG_PROTO_TRAILER = DSA_TAG_PROTO_TRAILER_VALUE, DSA_TAG_PROTO_8021Q = DSA_TAG_PROTO_8021Q_VALUE, DSA_TAG_PROTO_SJA1105 = DSA_TAG_PROTO_SJA1105_VALUE, DSA_TAG_PROTO_KSZ8795 = DSA_TAG_PROTO_KSZ8795_VALUE, DSA_TAG_PROTO_OCELOT = DSA_TAG_PROTO_OCELOT_VALUE, DSA_TAG_PROTO_AR9331 = DSA_TAG_PROTO_AR9331_VALUE, DSA_TAG_PROTO_RTL4_A = DSA_TAG_PROTO_RTL4_A_VALUE, DSA_TAG_PROTO_HELLCREEK = DSA_TAG_PROTO_HELLCREEK_VALUE, DSA_TAG_PROTO_XRS700X = DSA_TAG_PROTO_XRS700X_VALUE, DSA_TAG_PROTO_OCELOT_8021Q = DSA_TAG_PROTO_OCELOT_8021Q_VALUE, DSA_TAG_PROTO_SEVILLE = DSA_TAG_PROTO_SEVILLE_VALUE, DSA_TAG_PROTO_SJA1110 = DSA_TAG_PROTO_SJA1110_VALUE, DSA_TAG_PROTO_RTL8_4 = DSA_TAG_PROTO_RTL8_4_VALUE, DSA_TAG_PROTO_RTL8_4T = DSA_TAG_PROTO_RTL8_4T_VALUE, DSA_TAG_PROTO_RZN1_A5PSW = DSA_TAG_PROTO_RZN1_A5PSW_VALUE, DSA_TAG_PROTO_LAN937X = DSA_TAG_PROTO_LAN937X_VALUE, DSA_TAG_PROTO_VSC73XX_8021Q = DSA_TAG_PROTO_VSC73XX_8021Q_VALUE, DSA_TAG_PROTO_YT921X = DSA_TAG_PROTO_YT921X_VALUE, DSA_TAG_PROTO_MXL_GSW1XX = DSA_TAG_PROTO_MXL_GSW1XX_VALUE, DSA_TAG_PROTO_MXL862 = DSA_TAG_PROTO_MXL862_VALUE, }; struct dsa_switch; struct dsa_device_ops { struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev); struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev); void (*flow_dissect)(const struct sk_buff *skb, __be16 *proto, int *offset); int (*connect)(struct dsa_switch *ds); void (*disconnect)(struct dsa_switch *ds); unsigned int needed_headroom; unsigned int needed_tailroom; const char *name; enum dsa_tag_protocol proto; /* Some tagging protocols either mangle or shift the destination MAC * address, in which case the DSA conduit would drop packets on ingress * if what it understands out of the destination MAC address is not in * its RX filter. 
*/ bool promisc_on_conduit; }; struct dsa_lag { struct net_device *dev; unsigned int id; struct mutex fdb_lock; struct list_head fdbs; refcount_t refcount; }; struct dsa_switch_tree { struct list_head list; /* List of switch ports */ struct list_head ports; /* Notifier chain for switch-wide events */ struct raw_notifier_head nh; /* Tree identifier */ unsigned int index; /* Number of switches attached to this tree */ struct kref refcount; /* Maps offloaded LAG netdevs to a zero-based linear ID for * drivers that need it. */ struct dsa_lag **lags; /* Tagging protocol operations */ const struct dsa_device_ops *tag_ops; /* Default tagging protocol preferred by the switches in this * tree. */ enum dsa_tag_protocol default_proto; /* Has this tree been applied to the hardware? */ bool setup; /* * Configuration data for the platform device that owns * this dsa switch tree instance. */ struct dsa_platform_data *pd; /* List of DSA links composing the routing table */ struct list_head rtable; /* Length of "lags" array */ unsigned int lags_len; /* Track the largest switch index within a tree */ unsigned int last_switch; }; /* LAG IDs are one-based, the dst->lags array is zero-based */ #define dsa_lags_foreach_id(_id, _dst) \ for ((_id) = 1; (_id) <= (_dst)->lags_len; (_id)++) \ if ((_dst)->lags[(_id) - 1]) #define dsa_lag_foreach_port(_dp, _dst, _lag) \ list_for_each_entry((_dp), &(_dst)->ports, list) \ if (dsa_port_offloads_lag((_dp), (_lag))) #define dsa_hsr_foreach_port(_dp, _ds, _hsr) \ list_for_each_entry((_dp), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds) && (_dp)->hsr_dev == (_hsr)) static inline struct dsa_lag *dsa_lag_by_id(struct dsa_switch_tree *dst, unsigned int id) { /* DSA LAG IDs are one-based, dst->lags is zero-based */ return dst->lags[id - 1]; } static inline int dsa_lag_id(struct dsa_switch_tree *dst, struct net_device *lag_dev) { unsigned int id; dsa_lags_foreach_id(id, dst) { struct dsa_lag *lag = dsa_lag_by_id(dst, id); if (lag->dev == lag_dev) return lag->id; } return -ENODEV; } /* TC matchall action types */ enum dsa_port_mall_action_type { DSA_PORT_MALL_MIRROR, DSA_PORT_MALL_POLICER, }; /* TC mirroring entry */ struct dsa_mall_mirror_tc_entry { u8 to_local_port; bool ingress; }; /* TC matchall entry */ struct dsa_mall_tc_entry { struct list_head list; unsigned long cookie; enum dsa_port_mall_action_type type; union { struct dsa_mall_mirror_tc_entry mirror; struct flow_action_police policer; }; }; struct dsa_bridge { struct net_device *dev; unsigned int num; bool tx_fwd_offload; refcount_t refcount; }; struct dsa_port { /* A CPU port is physically connected to a conduit device. A user port * exposes a network device to user-space, called 'user' here. */ union { struct net_device *conduit; struct net_device *user; }; /* Copy of the tagging protocol operations, for quicker access * in the data path. Valid only for the CPU ports. */ const struct dsa_device_ops *tag_ops; /* Copies for faster access in conduit receive hot path */ struct dsa_switch_tree *dst; struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev); struct dsa_switch *ds; unsigned int index; enum { DSA_PORT_TYPE_UNUSED = 0, DSA_PORT_TYPE_CPU, DSA_PORT_TYPE_DSA, DSA_PORT_TYPE_USER, } type; const char *name; struct dsa_port *cpu_dp; u8 mac[ETH_ALEN]; u8 stp_state; /* Warning: the following bit fields are not atomic, and updating them * can only be done from code paths where concurrency is not possible * (probe time or under rtnl_lock). 
*/ u8 vlan_filtering:1; /* Managed by DSA on user ports and by drivers on CPU and DSA ports */ u8 learning:1; u8 lag_tx_enabled:1; /* conduit state bits, valid only on CPU ports */ u8 conduit_admin_up:1; u8 conduit_oper_up:1; /* Valid only on user ports */ u8 cpu_port_in_lag:1; u8 setup:1; struct device_node *dn; unsigned int ageing_time; struct dsa_bridge *bridge; struct devlink_port devlink_port; struct phylink *pl; struct phylink_config pl_config; netdevice_tracker conduit_tracker; struct dsa_lag *lag; struct net_device *hsr_dev; struct list_head list; /* * Original copy of the conduit netdev ethtool_ops */ const struct ethtool_ops *orig_ethtool_ops; /* List of MAC addresses that must be forwarded on this port. * These are only valid on CPU ports and DSA links. */ struct mutex addr_lists_lock; struct list_head fdbs; struct list_head mdbs; struct mutex vlans_lock; union { /* List of VLANs that CPU and DSA ports are members of. * Access to this is serialized by the sleepable @vlans_lock. */ struct list_head vlans; /* List of VLANs that user ports are members of. * Access to this is serialized by netif_addr_lock_bh(). */ struct list_head user_vlans; }; }; static inline struct dsa_port * dsa_phylink_to_port(struct phylink_config *config) { return container_of(config, struct dsa_port, pl_config); } /* TODO: ideally DSA ports would have a single dp->link_dp member, * and no dst->rtable nor this struct dsa_link would be needed, * but this would require some more complex tree walking, * so keep it stupid at the moment and list them all. */ struct dsa_link { struct dsa_port *dp; struct dsa_port *link_dp; struct list_head list; }; enum dsa_db_type { DSA_DB_PORT, DSA_DB_LAG, DSA_DB_BRIDGE, }; struct dsa_db { enum dsa_db_type type; union { const struct dsa_port *dp; struct dsa_lag lag; struct dsa_bridge bridge; }; }; struct dsa_mac_addr { unsigned char addr[ETH_ALEN]; u16 vid; refcount_t refcount; struct list_head list; struct dsa_db db; }; struct dsa_vlan { u16 vid; refcount_t refcount; struct list_head list; }; struct dsa_switch { struct device *dev; /* * Parent switch tree, and switch index. */ struct dsa_switch_tree *dst; unsigned int index; /* Warning: the following bit fields are not atomic, and updating them * can only be done from code paths where concurrency is not possible * (probe time or under rtnl_lock). */ u32 setup:1; /* Disallow bridge core from requesting different VLAN awareness * settings on ports if not hardware-supported */ u32 vlan_filtering_is_global:1; /* Keep VLAN filtering enabled on ports not offloading any upper */ u32 needs_standalone_vlan_filtering:1; /* Pass .port_vlan_add and .port_vlan_del to drivers even for bridges * that have vlan_filtering=0. All drivers should ideally set this (and * then the option would get removed), but it is unknown whether this * would break things or not. */ u32 configure_vlan_while_not_filtering:1; /* Pop the default_pvid of VLAN-unaware bridge ports from tagged frames. * DEPRECATED: Do NOT set this field in new drivers. Instead look at * the dsa_software_vlan_untag() comments. */ u32 untag_bridge_pvid:1; /* Pop the default_pvid of VLAN-aware bridge ports from tagged frames. * Useful if the switch cannot preserve the VLAN tag as seen on the * wire for user port ingress, and chooses to send all frames as * VLAN-tagged to the CPU, including those which were originally * untagged. */ u32 untag_vlan_aware_bridge_pvid:1; /* Let DSA manage the FDB entries towards the * CPU, based on the software bridge database. 
*/ u32 assisted_learning_on_cpu_port:1; /* In case vlan_filtering_is_global is set, the VLAN awareness state * should be retrieved from here and not from the per-port settings. */ u32 vlan_filtering:1; /* For switches that only have the MRU configurable. To ensure the * configured MTU is not exceeded, normalization of MRU on all bridged * interfaces is needed. */ u32 mtu_enforcement_ingress:1; /* Drivers that isolate the FDBs of multiple bridges must set this * to true to receive the bridge as an argument in .port_fdb_{add,del} * and .port_mdb_{add,del}. Otherwise, the bridge.num will always be * passed as zero. */ u32 fdb_isolation:1; /* Drivers that have global DSCP mapping settings must set this to * true to automatically apply the settings to all ports. */ u32 dscp_prio_mapping_is_global:1; /* Listener for switch fabric events */ struct notifier_block nb; /* * Give the switch driver somewhere to hang its private data * structure. */ void *priv; void *tagger_data; /* * Configuration data for this switch. */ struct dsa_chip_data *cd; /* * The switch operations. */ const struct dsa_switch_ops *ops; /* * Allow a DSA switch driver to override the phylink MAC ops */ const struct phylink_mac_ops *phylink_mac_ops; /* * User mii_bus and devices for the individual ports. */ u32 phys_mii_mask; struct mii_bus *user_mii_bus; /* Ageing Time limits in msecs */ unsigned int ageing_time_min; unsigned int ageing_time_max; /* Storage for drivers using tag_8021q */ struct dsa_8021q_context *tag_8021q_ctx; /* devlink used to represent this switch device */ struct devlink *devlink; /* Number of switch port queues */ unsigned int num_tx_queues; /* Drivers that benefit from having an ID associated with each * offloaded LAG should set this to the maximum number of * supported IDs. DSA will then maintain a mapping of _at * least_ these many IDs, accessible to drivers via * dsa_lag_id(). */ unsigned int num_lag_ids; /* Drivers that support bridge forwarding offload or FDB isolation * should set this to the maximum number of bridges spanning the same * switch tree (or all trees, in the case of cross-tree bridging * support) that can be offloaded. 
*/ unsigned int max_num_bridges; unsigned int num_ports; }; static inline struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) { struct dsa_switch_tree *dst = ds->dst; struct dsa_port *dp; list_for_each_entry(dp, &dst->ports, list) if (dp->ds == ds && dp->index == p) return dp; return NULL; } static inline bool dsa_port_is_dsa(struct dsa_port *port) { return port->type == DSA_PORT_TYPE_DSA; } static inline bool dsa_port_is_cpu(struct dsa_port *port) { return port->type == DSA_PORT_TYPE_CPU; } static inline bool dsa_port_is_user(struct dsa_port *dp) { return dp->type == DSA_PORT_TYPE_USER; } static inline bool dsa_port_is_unused(struct dsa_port *dp) { return dp->type == DSA_PORT_TYPE_UNUSED; } static inline bool dsa_port_conduit_is_operational(struct dsa_port *dp) { return dsa_port_is_cpu(dp) && dp->conduit_admin_up && dp->conduit_oper_up; } static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_UNUSED; } static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_CPU; } static inline bool dsa_is_dsa_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_DSA; } static inline bool dsa_is_user_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_USER; } #define dsa_tree_for_each_user_port(_dp, _dst) \ list_for_each_entry((_dp), &(_dst)->ports, list) \ if (dsa_port_is_user((_dp))) #define dsa_tree_for_each_user_port_continue_reverse(_dp, _dst) \ list_for_each_entry_continue_reverse((_dp), &(_dst)->ports, list) \ if (dsa_port_is_user((_dp))) #define dsa_tree_for_each_cpu_port(_dp, _dst) \ list_for_each_entry((_dp), &(_dst)->ports, list) \ if (dsa_port_is_cpu((_dp))) #define dsa_switch_for_each_port(_dp, _ds) \ list_for_each_entry((_dp), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds)) #define dsa_switch_for_each_port_safe(_dp, _next, _ds) \ list_for_each_entry_safe((_dp), (_next), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds)) #define dsa_switch_for_each_port_continue_reverse(_dp, _ds) \ list_for_each_entry_continue_reverse((_dp), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds)) #define dsa_switch_for_each_available_port(_dp, _ds) \ dsa_switch_for_each_port((_dp), (_ds)) \ if (!dsa_port_is_unused((_dp))) #define dsa_switch_for_each_user_port(_dp, _ds) \ dsa_switch_for_each_port((_dp), (_ds)) \ if (dsa_port_is_user((_dp))) #define dsa_switch_for_each_user_port_continue_reverse(_dp, _ds) \ dsa_switch_for_each_port_continue_reverse((_dp), (_ds)) \ if (dsa_port_is_user((_dp))) #define dsa_switch_for_each_cpu_port(_dp, _ds) \ dsa_switch_for_each_port((_dp), (_ds)) \ if (dsa_port_is_cpu((_dp))) #define dsa_switch_for_each_cpu_port_continue_reverse(_dp, _ds) \ dsa_switch_for_each_port_continue_reverse((_dp), (_ds)) \ if (dsa_port_is_cpu((_dp))) static inline u32 dsa_user_ports(struct dsa_switch *ds) { struct dsa_port *dp; u32 mask = 0; dsa_switch_for_each_user_port(dp, ds) mask |= BIT(dp->index); return mask; } static inline u32 dsa_cpu_ports(struct dsa_switch *ds) { struct dsa_port *cpu_dp; u32 mask = 0; dsa_switch_for_each_cpu_port(cpu_dp, ds) mask |= BIT(cpu_dp->index); return mask; } /* Return the local port used to reach an arbitrary switch device */ static inline unsigned int dsa_routing_port(struct dsa_switch *ds, int device) { struct dsa_switch_tree *dst = ds->dst; struct dsa_link *dl; list_for_each_entry(dl, &dst->rtable, list) if (dl->dp->ds == ds && dl->link_dp->ds->index == device) return 
dl->dp->index; return ds->num_ports; } /* Return the local port used to reach an arbitrary switch port */ static inline unsigned int dsa_towards_port(struct dsa_switch *ds, int device, int port) { if (device == ds->index) return port; else return dsa_routing_port(ds, device); } /* Return the local port used to reach the dedicated CPU port */ static inline unsigned int dsa_upstream_port(struct dsa_switch *ds, int port) { const struct dsa_port *dp = dsa_to_port(ds, port); const struct dsa_port *cpu_dp = dp->cpu_dp; if (!cpu_dp) return port; return dsa_towards_port(ds, cpu_dp->ds->index, cpu_dp->index); } /* Return true if this is the local port used to reach the CPU port */ static inline bool dsa_is_upstream_port(struct dsa_switch *ds, int port) { if (dsa_is_unused_port(ds, port)) return false; return port == dsa_upstream_port(ds, port); } /* Return true if this is a DSA port leading away from the CPU */ static inline bool dsa_is_downstream_port(struct dsa_switch *ds, int port) { return dsa_is_dsa_port(ds, port) && !dsa_is_upstream_port(ds, port); } /* Return the local port used to reach the CPU port */ static inline unsigned int dsa_switch_upstream_port(struct dsa_switch *ds) { struct dsa_port *dp; dsa_switch_for_each_available_port(dp, ds) { return dsa_upstream_port(ds, dp->index); } return ds->num_ports; } /* Return true if @upstream_ds is an upstream switch of @downstream_ds, meaning * that the routing port from @downstream_ds to @upstream_ds is also the port * which @downstream_ds uses to reach its dedicated CPU. */ static inline bool dsa_switch_is_upstream_of(struct dsa_switch *upstream_ds, struct dsa_switch *downstream_ds) { int routing_port; if (upstream_ds == downstream_ds) return true; routing_port = dsa_routing_port(downstream_ds, upstream_ds->index); return dsa_is_upstream_port(downstream_ds, routing_port); } static inline bool dsa_port_is_vlan_filtering(const struct dsa_port *dp) { const struct dsa_switch *ds = dp->ds; if (ds->vlan_filtering_is_global) return ds->vlan_filtering; else return dp->vlan_filtering; } static inline unsigned int dsa_port_lag_id_get(struct dsa_port *dp) { return dp->lag ? dp->lag->id : 0; } static inline struct net_device *dsa_port_lag_dev_get(struct dsa_port *dp) { return dp->lag ? dp->lag->dev : NULL; } static inline bool dsa_port_offloads_lag(struct dsa_port *dp, const struct dsa_lag *lag) { return dsa_port_lag_dev_get(dp) == lag->dev; } static inline struct net_device *dsa_port_to_conduit(const struct dsa_port *dp) { if (dp->cpu_port_in_lag) return dsa_port_lag_dev_get(dp->cpu_dp); return dp->cpu_dp->conduit; } static inline struct net_device *dsa_port_to_bridge_port(const struct dsa_port *dp) { if (!dp->bridge) return NULL; if (dp->lag) return dp->lag->dev; else if (dp->hsr_dev) return dp->hsr_dev; return dp->user; } static inline struct net_device * dsa_port_bridge_dev_get(const struct dsa_port *dp) { return dp->bridge ? dp->bridge->dev : NULL; } static inline unsigned int dsa_port_bridge_num_get(struct dsa_port *dp) { return dp->bridge ? dp->bridge->num : 0; } static inline bool dsa_port_bridge_same(const struct dsa_port *a, const struct dsa_port *b) { struct net_device *br_a = dsa_port_bridge_dev_get(a); struct net_device *br_b = dsa_port_bridge_dev_get(b); /* Standalone ports are not in the same bridge with one another */ return (!br_a || !br_b) ? 
false : (br_a == br_b); } static inline bool dsa_port_offloads_bridge_port(struct dsa_port *dp, const struct net_device *dev) { return dsa_port_to_bridge_port(dp) == dev; } static inline bool dsa_port_offloads_bridge_dev(struct dsa_port *dp, const struct net_device *bridge_dev) { /* DSA ports connected to a bridge, and event was emitted * for the bridge. */ return dsa_port_bridge_dev_get(dp) == bridge_dev; } static inline bool dsa_port_offloads_bridge(struct dsa_port *dp, const struct dsa_bridge *bridge) { return dsa_port_bridge_dev_get(dp) == bridge->dev; } /* Returns true if any port of this tree offloads the given net_device */ static inline bool dsa_tree_offloads_bridge_port(struct dsa_switch_tree *dst, const struct net_device *dev) { struct dsa_port *dp; list_for_each_entry(dp, &dst->ports, list) if (dsa_port_offloads_bridge_port(dp, dev)) return true; return false; } /* Returns true if any port of this tree offloads the given bridge */ static inline bool dsa_tree_offloads_bridge_dev(struct dsa_switch_tree *dst, const struct net_device *bridge_dev) { struct dsa_port *dp; list_for_each_entry(dp, &dst->ports, list) if (dsa_port_offloads_bridge_dev(dp, bridge_dev)) return true; return false; } #define dsa_switch_for_each_bridge_member(_dp, _ds, _bdev) \ dsa_switch_for_each_user_port(_dp, _ds) \ if (dsa_port_offloads_bridge_dev(_dp, _bdev)) static inline u32 dsa_bridge_ports(struct dsa_switch *ds, const struct net_device *bdev) { struct dsa_port *dp; u32 mask = 0; dsa_switch_for_each_bridge_member(dp, ds, bdev) mask |= BIT(dp->index); return mask; } static inline bool dsa_port_tree_same(const struct dsa_port *a, const struct dsa_port *b) { return a->ds->dst == b->ds->dst; } typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid, bool is_static, void *data); struct dsa_switch_ops { /* * Tagging protocol helpers called for the CPU ports and DSA links. * @get_tag_protocol retrieves the initial tagging protocol and is * mandatory. Switches which can operate using multiple tagging * protocols should implement @change_tag_protocol and report in * @get_tag_protocol the tagger in current use. */ enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds, int port, enum dsa_tag_protocol mprot); int (*change_tag_protocol)(struct dsa_switch *ds, enum dsa_tag_protocol proto); /* * Method for switch drivers to connect to the tagging protocol driver * in current use. The switch driver can provide handlers for certain * types of packets for switch management. */ int (*connect_tag_protocol)(struct dsa_switch *ds, enum dsa_tag_protocol proto); int (*port_change_conduit)(struct dsa_switch *ds, int port, struct net_device *conduit, struct netlink_ext_ack *extack); /* Optional switch-wide initialization and destruction methods */ int (*setup)(struct dsa_switch *ds); void (*teardown)(struct dsa_switch *ds); /* Per-port initialization and destruction methods. Mandatory if the * driver registers devlink port regions, optional otherwise. */ int (*port_setup)(struct dsa_switch *ds, int port); void (*port_teardown)(struct dsa_switch *ds, int port); u32 (*get_phy_flags)(struct dsa_switch *ds, int port); /* * Access to the switch's PHY registers. 
*/ int (*phy_read)(struct dsa_switch *ds, int port, int regnum); int (*phy_write)(struct dsa_switch *ds, int port, int regnum, u16 val); /* * PHYLINK integration */ void (*phylink_get_caps)(struct dsa_switch *ds, int port, struct phylink_config *config); void (*phylink_fixed_state)(struct dsa_switch *ds, int port, struct phylink_link_state *state); /* * Port statistics counters. */ void (*get_strings)(struct dsa_switch *ds, int port, u32 stringset, uint8_t *data); void (*get_ethtool_stats)(struct dsa_switch *ds, int port, uint64_t *data); int (*get_sset_count)(struct dsa_switch *ds, int port, int sset); void (*get_ethtool_phy_stats)(struct dsa_switch *ds, int port, uint64_t *data); void (*get_eth_phy_stats)(struct dsa_switch *ds, int port, struct ethtool_eth_phy_stats *phy_stats); void (*get_eth_mac_stats)(struct dsa_switch *ds, int port, struct ethtool_eth_mac_stats *mac_stats); void (*get_eth_ctrl_stats)(struct dsa_switch *ds, int port, struct ethtool_eth_ctrl_stats *ctrl_stats); void (*get_rmon_stats)(struct dsa_switch *ds, int port, struct ethtool_rmon_stats *rmon_stats, const struct ethtool_rmon_hist_range **ranges); void (*get_ts_stats)(struct dsa_switch *ds, int port, struct ethtool_ts_stats *ts_stats); void (*get_stats64)(struct dsa_switch *ds, int port, struct rtnl_link_stats64 *s); void (*get_pause_stats)(struct dsa_switch *ds, int port, struct ethtool_pause_stats *pause_stats); void (*self_test)(struct dsa_switch *ds, int port, struct ethtool_test *etest, u64 *data); /* * ethtool Wake-on-LAN */ void (*get_wol)(struct dsa_switch *ds, int port, struct ethtool_wolinfo *w); int (*set_wol)(struct dsa_switch *ds, int port, struct ethtool_wolinfo *w); /* * ethtool timestamp info */ int (*get_ts_info)(struct dsa_switch *ds, int port, struct kernel_ethtool_ts_info *ts); /* * ethtool MAC merge layer */ int (*get_mm)(struct dsa_switch *ds, int port, struct ethtool_mm_state *state); int (*set_mm)(struct dsa_switch *ds, int port, struct ethtool_mm_cfg *cfg, struct netlink_ext_ack *extack); void (*get_mm_stats)(struct dsa_switch *ds, int port, struct ethtool_mm_stats *stats); /* * DCB ops */ int (*port_get_default_prio)(struct dsa_switch *ds, int port); int (*port_set_default_prio)(struct dsa_switch *ds, int port, u8 prio); int (*port_get_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp); int (*port_add_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp, u8 prio); int (*port_del_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp, u8 prio); int (*port_set_apptrust)(struct dsa_switch *ds, int port, const u8 *sel, int nsel); int (*port_get_apptrust)(struct dsa_switch *ds, int port, u8 *sel, int *nsel); /* * Suspend and resume */ int (*suspend)(struct dsa_switch *ds); int (*resume)(struct dsa_switch *ds); /* * Port enable/disable */ int (*port_enable)(struct dsa_switch *ds, int port, struct phy_device *phy); void (*port_disable)(struct dsa_switch *ds, int port); /* * Notification for MAC address changes on user ports. Drivers can * currently only veto operations. They should not use the method to * program the hardware, since the operation is not rolled back in case * of other errors. */ int (*port_set_mac_address)(struct dsa_switch *ds, int port, const unsigned char *addr); /* * Compatibility between device trees defining multiple CPU ports and * drivers which are not OK to use by default the numerically smallest * CPU port of a switch for its local ports. This can return NULL, * meaning "don't know/don't care". 
*/ struct dsa_port *(*preferred_default_local_cpu_port)(struct dsa_switch *ds); /* * Port's MAC EEE settings */ bool (*support_eee)(struct dsa_switch *ds, int port); int (*set_mac_eee)(struct dsa_switch *ds, int port, struct ethtool_keee *e); /* EEPROM access */ int (*get_eeprom_len)(struct dsa_switch *ds); int (*get_eeprom)(struct dsa_switch *ds, struct ethtool_eeprom *eeprom, u8 *data); int (*set_eeprom)(struct dsa_switch *ds, struct ethtool_eeprom *eeprom, u8 *data); /* * Register access. */ int (*get_regs_len)(struct dsa_switch *ds, int port); void (*get_regs)(struct dsa_switch *ds, int port, struct ethtool_regs *regs, void *p); /* * Upper device tracking. */ int (*port_prechangeupper)(struct dsa_switch *ds, int port, struct netdev_notifier_changeupper_info *info); /* * Bridge integration */ int (*set_ageing_time)(struct dsa_switch *ds, unsigned int msecs); int (*port_bridge_join)(struct dsa_switch *ds, int port, struct dsa_bridge bridge, bool *tx_fwd_offload, struct netlink_ext_ack *extack); void (*port_bridge_leave)(struct dsa_switch *ds, int port, struct dsa_bridge bridge); void (*port_stp_state_set)(struct dsa_switch *ds, int port, u8 state); int (*port_mst_state_set)(struct dsa_switch *ds, int port, const struct switchdev_mst_state *state); void (*port_fast_age)(struct dsa_switch *ds, int port); int (*port_vlan_fast_age)(struct dsa_switch *ds, int port, u16 vid); int (*port_pre_bridge_flags)(struct dsa_switch *ds, int port, struct switchdev_brport_flags flags, struct netlink_ext_ack *extack); int (*port_bridge_flags)(struct dsa_switch *ds, int port, struct switchdev_brport_flags flags, struct netlink_ext_ack *extack); void (*port_set_host_flood)(struct dsa_switch *ds, int port, bool uc, bool mc); /* * VLAN support */ int (*port_vlan_filtering)(struct dsa_switch *ds, int port, bool vlan_filtering, struct netlink_ext_ack *extack); int (*port_vlan_add)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan, struct netlink_ext_ack *extack); int (*port_vlan_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); int (*vlan_msti_set)(struct dsa_switch *ds, struct dsa_bridge bridge, const struct switchdev_vlan_msti *msti); /* * Forwarding database */ int (*port_fdb_add)(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid, struct dsa_db db); int (*port_fdb_del)(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid, struct dsa_db db); int (*port_fdb_dump)(struct dsa_switch *ds, int port, dsa_fdb_dump_cb_t *cb, void *data); int (*lag_fdb_add)(struct dsa_switch *ds, struct dsa_lag lag, const unsigned char *addr, u16 vid, struct dsa_db db); int (*lag_fdb_del)(struct dsa_switch *ds, struct dsa_lag lag, const unsigned char *addr, u16 vid, struct dsa_db db); /* * Multicast database */ int (*port_mdb_add)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb, struct dsa_db db); int (*port_mdb_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb, struct dsa_db db); /* * RXNFC */ int (*get_rxnfc)(struct dsa_switch *ds, int port, struct ethtool_rxnfc *nfc, u32 *rule_locs); int (*set_rxnfc)(struct dsa_switch *ds, int port, struct ethtool_rxnfc *nfc); /* * TC integration */ int (*cls_flower_add)(struct dsa_switch *ds, int port, struct flow_cls_offload *cls, bool ingress); int (*cls_flower_del)(struct dsa_switch *ds, int port, struct flow_cls_offload *cls, bool ingress); int (*cls_flower_stats)(struct dsa_switch *ds, int port, struct flow_cls_offload *cls, bool ingress); int 
(*port_mirror_add)(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror, bool ingress, struct netlink_ext_ack *extack); void (*port_mirror_del)(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror); int (*port_policer_add)(struct dsa_switch *ds, int port, const struct flow_action_police *policer); void (*port_policer_del)(struct dsa_switch *ds, int port); int (*port_setup_tc)(struct dsa_switch *ds, int port, enum tc_setup_type type, void *type_data); /* * Cross-chip operations */ int (*crosschip_bridge_join)(struct dsa_switch *ds, int tree_index, int sw_index, int port, struct dsa_bridge bridge, struct netlink_ext_ack *extack); void (*crosschip_bridge_leave)(struct dsa_switch *ds, int tree_index, int sw_index, int port, struct dsa_bridge bridge); int (*crosschip_lag_change)(struct dsa_switch *ds, int sw_index, int port); int (*crosschip_lag_join)(struct dsa_switch *ds, int sw_index, int port, struct dsa_lag lag, struct netdev_lag_upper_info *info, struct netlink_ext_ack *extack); int (*crosschip_lag_leave)(struct dsa_switch *ds, int sw_index, int port, struct dsa_lag lag); /* * PTP functionality */ int (*port_hwtstamp_get)(struct dsa_switch *ds, int port, struct kernel_hwtstamp_config *config); int (*port_hwtstamp_set)(struct dsa_switch *ds, int port, struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack); void (*port_txtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb); bool (*port_rxtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb, unsigned int type); /* Devlink parameters, etc */ int (*devlink_param_get)(struct dsa_switch *ds, u32 id, struct devlink_param_gset_ctx *ctx); int (*devlink_param_set)(struct dsa_switch *ds, u32 id, struct devlink_param_gset_ctx *ctx); int (*devlink_info_get)(struct dsa_switch *ds, struct devlink_info_req *req, struct netlink_ext_ack *extack); int (*devlink_sb_pool_get)(struct dsa_switch *ds, unsigned int sb_index, u16 pool_index, struct devlink_sb_pool_info *pool_info); int (*devlink_sb_pool_set)(struct dsa_switch *ds, unsigned int sb_index, u16 pool_index, u32 size, enum devlink_sb_threshold_type threshold_type, struct netlink_ext_ack *extack); int (*devlink_sb_port_pool_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 pool_index, u32 *p_threshold); int (*devlink_sb_port_pool_set)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 pool_index, u32 threshold, struct netlink_ext_ack *extack); int (*devlink_sb_tc_pool_bind_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, u16 *p_pool_index, u32 *p_threshold); int (*devlink_sb_tc_pool_bind_set)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, u16 pool_index, u32 threshold, struct netlink_ext_ack *extack); int (*devlink_sb_occ_snapshot)(struct dsa_switch *ds, unsigned int sb_index); int (*devlink_sb_occ_max_clear)(struct dsa_switch *ds, unsigned int sb_index); int (*devlink_sb_occ_port_pool_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 pool_index, u32 *p_cur, u32 *p_max); int (*devlink_sb_occ_tc_port_bind_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, u32 *p_cur, u32 *p_max); /* * MTU change functionality. Switches can also adjust their MRU through * this method. By MTU, one understands the SDU (L2 payload) length. 
* If the switch needs to account for the DSA tag on the CPU port, this * method needs to do so privately. */ int (*port_change_mtu)(struct dsa_switch *ds, int port, int new_mtu); int (*port_max_mtu)(struct dsa_switch *ds, int port); /* * LAG integration */ int (*port_lag_change)(struct dsa_switch *ds, int port); int (*port_lag_join)(struct dsa_switch *ds, int port, struct dsa_lag lag, struct netdev_lag_upper_info *info, struct netlink_ext_ack *extack); int (*port_lag_leave)(struct dsa_switch *ds, int port, struct dsa_lag lag); /* * HSR integration */ int (*port_hsr_join)(struct dsa_switch *ds, int port, struct net_device *hsr, struct netlink_ext_ack *extack); int (*port_hsr_leave)(struct dsa_switch *ds, int port, struct net_device *hsr); /* * MRP integration */ int (*port_mrp_add)(struct dsa_switch *ds, int port, const struct switchdev_obj_mrp *mrp); int (*port_mrp_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_mrp *mrp); int (*port_mrp_add_ring_role)(struct dsa_switch *ds, int port, const struct switchdev_obj_ring_role_mrp *mrp); int (*port_mrp_del_ring_role)(struct dsa_switch *ds, int port, const struct switchdev_obj_ring_role_mrp *mrp); /* * tag_8021q operations */ int (*tag_8021q_vlan_add)(struct dsa_switch *ds, int port, u16 vid, u16 flags); int (*tag_8021q_vlan_del)(struct dsa_switch *ds, int port, u16 vid); /* * DSA conduit tracking operations */ void (*conduit_state_change)(struct dsa_switch *ds, const struct net_device *conduit, bool operational); }; #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes) \ DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes, \ dsa_devlink_param_get, dsa_devlink_param_set, NULL) int dsa_devlink_param_get(struct devlink *dl, u32 id, struct devlink_param_gset_ctx *ctx, struct netlink_ext_ack *extack); int dsa_devlink_param_set(struct devlink *dl, u32 id, struct devlink_param_gset_ctx *ctx, struct netlink_ext_ack *extack); int dsa_devlink_params_register(struct dsa_switch *ds, const struct devlink_param *params, size_t params_count); void dsa_devlink_params_unregister(struct dsa_switch *ds, const struct devlink_param *params, size_t params_count); int dsa_devlink_resource_register(struct dsa_switch *ds, const char *resource_name, u64 resource_size, u64 resource_id, u64 parent_resource_id, const struct devlink_resource_size_params *size_params); void dsa_devlink_resources_unregister(struct dsa_switch *ds); void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds, u64 resource_id, devlink_resource_occ_get_t *occ_get, void *occ_get_priv); void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds, u64 resource_id); struct devlink_region * dsa_devlink_region_create(struct dsa_switch *ds, const struct devlink_region_ops *ops, u32 region_max_snapshots, u64 region_size); struct devlink_region * dsa_devlink_port_region_create(struct dsa_switch *ds, int port, const struct devlink_port_region_ops *ops, u32 region_max_snapshots, u64 region_size); void dsa_devlink_region_destroy(struct devlink_region *region); struct dsa_port *dsa_port_from_netdev(struct net_device *netdev); struct dsa_devlink_priv { struct dsa_switch *ds; }; static inline struct dsa_switch *dsa_devlink_to_ds(struct devlink *dl) { struct dsa_devlink_priv *dl_priv = devlink_priv(dl); return dl_priv->ds; } static inline struct dsa_switch *dsa_devlink_port_to_ds(struct devlink_port *port) { struct devlink *dl = port->devlink; struct dsa_devlink_priv *dl_priv = devlink_priv(dl); return dl_priv->ds; } static inline int dsa_devlink_port_to_port(struct devlink_port 
*port) { return port->index; } bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid, struct dsa_db db); bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb, struct dsa_db db); int dsa_port_simple_hsr_validate(struct dsa_switch *ds, int port, struct net_device *hsr, struct netlink_ext_ack *extack); int dsa_port_simple_hsr_join(struct dsa_switch *ds, int port, struct net_device *hsr, struct netlink_ext_ack *extack); int dsa_port_simple_hsr_leave(struct dsa_switch *ds, int port, struct net_device *hsr); /* Keep inline for faster access in hot path */ static inline bool netdev_uses_dsa(const struct net_device *dev) { #if IS_ENABLED(CONFIG_NET_DSA) return dev->dsa_ptr && dev->dsa_ptr->rcv; #endif return false; } /* All DSA tags that push the EtherType to the right (basically all except tail * tags, which don't break dissection) can be treated the same from the * perspective of the flow dissector. * * We need to return: * - offset: the (B - A) difference between: * A. the position of the real EtherType and * B. the current skb->data (aka ETH_HLEN bytes into the frame, aka 2 bytes * after the normal EtherType was supposed to be) * The offset in bytes is exactly equal to the tagger overhead (and half of * that, in __be16 shorts). * * - proto: the value of the real EtherType. */ static inline void dsa_tag_generic_flow_dissect(const struct sk_buff *skb, __be16 *proto, int *offset) { #if IS_ENABLED(CONFIG_NET_DSA) const struct dsa_device_ops *ops = skb->dev->dsa_ptr->tag_ops; int tag_len = ops->needed_headroom; *offset = tag_len; *proto = ((__be16 *)skb->data)[(tag_len / 2) - 1]; #endif } void dsa_unregister_switch(struct dsa_switch *ds); int dsa_register_switch(struct dsa_switch *ds); void dsa_switch_shutdown(struct dsa_switch *ds); struct dsa_switch *dsa_switch_find(int tree_index, int sw_index); void dsa_flush_workqueue(void); #ifdef CONFIG_PM_SLEEP int dsa_switch_suspend(struct dsa_switch *ds); int dsa_switch_resume(struct dsa_switch *ds); #else static inline int dsa_switch_suspend(struct dsa_switch *ds) { return 0; } static inline int dsa_switch_resume(struct dsa_switch *ds) { return 0; } #endif /* CONFIG_PM_SLEEP */ #if IS_ENABLED(CONFIG_NET_DSA) bool dsa_user_dev_check(const struct net_device *dev); #else static inline bool dsa_user_dev_check(const struct net_device *dev) { return false; } #endif netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev); void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up); bool dsa_supports_eee(struct dsa_switch *ds, int port); #endif |
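The header above ends with dsa_tag_generic_flow_dissect() and the dsa_device_ops hooks it is meant to plug into. The following is a hedged illustration only; nothing in it comes from dsa.h itself. The foo_* names, the 4-byte tag length, and the no-op xmit/rcv stubs are assumptions, and the tagger registration glue from net/dsa/tag.h is omitted. With needed_headroom = 4, the generic helper reports *offset = 4 and reads the real EtherType two bytes past skb->data, matching the "offset equals tagger overhead" rule in the comment above.

/*
 * Hypothetical sketch, not part of dsa.h: a minimal dsa_device_ops for an
 * imaginary 4-byte header tag that reuses the generic flow dissector.
 */
#include <linux/skbuff.h>
#include <net/dsa.h>

static struct sk_buff *foo_tag_xmit(struct sk_buff *skb, struct net_device *dev)
{
	/* A real tagger would push its 4-byte tag in front of the EtherType
	 * here; this stub sends the frame unmodified.
	 */
	return skb;
}

static struct sk_buff *foo_tag_rcv(struct sk_buff *skb, struct net_device *dev)
{
	/* A real tagger would parse the tag, pick skb->dev from the source
	 * port it encodes, and strip the tag; this stub does none of that.
	 */
	return skb;
}

static const struct dsa_device_ops foo_tag_ops = {
	.name			= "foo",		/* hypothetical */
	.proto			= DSA_TAG_PROTO_NONE,	/* placeholder value */
	.xmit			= foo_tag_xmit,
	.rcv			= foo_tag_rcv,
	.flow_dissect		= dsa_tag_generic_flow_dissect,
	.needed_headroom	= 4,	/* assumed tag length in bytes */
};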
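On the switch-driver side, the sketch below shows roughly how the structures declared in this header fit together at probe time; it is not taken from any in-tree driver. The foo_* names, the port count, and the trivial setup body are assumptions; only struct dsa_switch, struct dsa_switch_ops, the per-port iteration macro, and dsa_register_switch() come from the header above.

/*
 * Hypothetical sketch: allocating and registering a dsa_switch with a
 * minimal dsa_switch_ops. Real drivers hang this off their bus probe.
 */
#include <linux/device.h>
#include <linux/slab.h>
#include <net/dsa.h>

struct foo_priv {
	void __iomem *base;	/* hypothetical register window */
};

static enum dsa_tag_protocol
foo_get_tag_protocol(struct dsa_switch *ds, int port,
		     enum dsa_tag_protocol mprot)
{
	/* Mandatory hook: report the tagging protocol the tagger should use.
	 * A real driver returns its own protocol here, not NONE.
	 */
	return DSA_TAG_PROTO_NONE;
}

static int foo_setup(struct dsa_switch *ds)
{
	struct dsa_port *dp;

	/* Per-port init typically iterates with the helpers from dsa.h. */
	dsa_switch_for_each_user_port(dp, ds)
		dev_dbg(ds->dev, "user port %d\n", dp->index);

	return 0;
}

static const struct dsa_switch_ops foo_switch_ops = {
	.get_tag_protocol	= foo_get_tag_protocol,
	.setup			= foo_setup,
};

static int foo_register(struct device *dev, struct foo_priv *priv)
{
	struct dsa_switch *ds;

	ds = devm_kzalloc(dev, sizeof(*ds), GFP_KERNEL);
	if (!ds)
		return -ENOMEM;

	ds->dev = dev;
	ds->num_ports = 5;		/* assumed port count */
	ds->ops = &foo_switch_ops;
	ds->priv = priv;

	return dsa_register_switch(ds);
}

On removal, such a driver would typically call dsa_unregister_switch(ds), and dsa_switch_shutdown(ds) from its bus ->shutdown hook, both of which are declared above.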
// SPDX-License-Identifier: GPL-2.0
/*
 * inode.c - part of debugfs, a tiny little debug file system
 *
 * Copyright (C) 2004,2019 Greg Kroah-Hartman <greg@kroah.com>
 * Copyright (C) 2004 IBM Inc.
 * Copyright (C) 2019 Linux Foundation <gregkh@linuxfoundation.org>
 *
 * debugfs is for people to use instead of /proc or /sys.
 * See ./Documentation/core-api/kernel-api.rst for more details.
 */

#define pr_fmt(fmt)	"debugfs: " fmt

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/kobject.h>
#include <linux/namei.h>
#include <linux/debugfs.h>
#include <linux/fsnotify.h>
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/security.h>

#include "internal.h"

#define DEBUGFS_DEFAULT_MODE	0700

static struct vfsmount *debugfs_mount;
static int debugfs_mount_count;
static bool debugfs_registered;
static bool debugfs_enabled __ro_after_init = IS_ENABLED(CONFIG_DEBUG_FS_ALLOW_ALL);

/*
 * Don't allow access attributes to be changed whilst the kernel is locked down
 * so that we can use the file mode as part of a heuristic to determine whether
 * to lock down individual files.
 */
static int debugfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			   struct iattr *ia)
{
	int ret;

	if (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) {
		ret = security_locked_down(LOCKDOWN_DEBUGFS);
		if (ret)
			return ret;
	}
	return simple_setattr(&nop_mnt_idmap, dentry, ia);
}

static const struct inode_operations debugfs_file_inode_operations = {
	.setattr	= debugfs_setattr,
};

static const struct inode_operations debugfs_dir_inode_operations = {
	.lookup		= simple_lookup,
	.setattr	= debugfs_setattr,
};

static const struct inode_operations debugfs_symlink_inode_operations = {
	.get_link	= simple_get_link,
	.setattr	= debugfs_setattr,
};

static struct inode *debugfs_get_inode(struct super_block *sb)
{
	struct inode *inode = new_inode(sb);

	if (inode) {
		inode->i_ino = get_next_ino();
		simple_inode_init_ts(inode);
	}
	return inode;
}

struct debugfs_fs_info {
	kuid_t uid;
	kgid_t gid;
	umode_t mode;
	/* Opt_* bitfield.
*/ unsigned int opts; }; enum { Opt_uid, Opt_gid, Opt_mode, Opt_source, }; static const struct fs_parameter_spec debugfs_param_specs[] = { fsparam_gid ("gid", Opt_gid), fsparam_u32oct ("mode", Opt_mode), fsparam_uid ("uid", Opt_uid), fsparam_string ("source", Opt_source), {} }; static int debugfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct debugfs_fs_info *opts = fc->s_fs_info; struct fs_parse_result result; int opt; opt = fs_parse(fc, debugfs_param_specs, param, &result); if (opt < 0) { /* * We might like to report bad mount options here; but * traditionally debugfs has ignored all mount options */ if (opt == -ENOPARAM) return 0; return opt; } switch (opt) { case Opt_uid: opts->uid = result.uid; break; case Opt_gid: opts->gid = result.gid; break; case Opt_mode: opts->mode = result.uint_32 & S_IALLUGO; break; case Opt_source: if (fc->source) return invalfc(fc, "Multiple sources specified"); fc->source = param->string; param->string = NULL; break; /* * We might like to report bad mount options here; * but traditionally debugfs has ignored all mount options */ } opts->opts |= BIT(opt); return 0; } static void _debugfs_apply_options(struct super_block *sb, bool remount) { struct debugfs_fs_info *fsi = sb->s_fs_info; struct inode *inode = d_inode(sb->s_root); /* * On remount, only reset mode/uid/gid if they were provided as mount * options. */ if (!remount || fsi->opts & BIT(Opt_mode)) { inode->i_mode &= ~S_IALLUGO; inode->i_mode |= fsi->mode; } if (!remount || fsi->opts & BIT(Opt_uid)) inode->i_uid = fsi->uid; if (!remount || fsi->opts & BIT(Opt_gid)) inode->i_gid = fsi->gid; } static void debugfs_apply_options(struct super_block *sb) { _debugfs_apply_options(sb, false); } static void debugfs_apply_options_remount(struct super_block *sb) { _debugfs_apply_options(sb, true); } static int debugfs_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct debugfs_fs_info *sb_opts = sb->s_fs_info; struct debugfs_fs_info *new_opts = fc->s_fs_info; if (!new_opts) return 0; sync_filesystem(sb); /* structure copy of new mount options to sb */ *sb_opts = *new_opts; debugfs_apply_options_remount(sb); return 0; } static int debugfs_show_options(struct seq_file *m, struct dentry *root) { struct debugfs_fs_info *fsi = root->d_sb->s_fs_info; if (!uid_eq(fsi->uid, GLOBAL_ROOT_UID)) seq_printf(m, ",uid=%u", from_kuid_munged(&init_user_ns, fsi->uid)); if (!gid_eq(fsi->gid, GLOBAL_ROOT_GID)) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, fsi->gid)); if (fsi->mode != DEBUGFS_DEFAULT_MODE) seq_printf(m, ",mode=%o", fsi->mode); return 0; } static struct kmem_cache *debugfs_inode_cachep __ro_after_init; static void init_once(void *foo) { struct debugfs_inode_info *info = foo; inode_init_once(&info->vfs_inode); } static struct inode *debugfs_alloc_inode(struct super_block *sb) { struct debugfs_inode_info *info; info = alloc_inode_sb(sb, debugfs_inode_cachep, GFP_KERNEL); if (!info) return NULL; return &info->vfs_inode; } static void debugfs_free_inode(struct inode *inode) { if (S_ISLNK(inode->i_mode)) kfree(inode->i_link); kmem_cache_free(debugfs_inode_cachep, DEBUGFS_I(inode)); } static const struct super_operations debugfs_super_operations = { .statfs = simple_statfs, .show_options = debugfs_show_options, .alloc_inode = debugfs_alloc_inode, .free_inode = debugfs_free_inode, }; static void debugfs_release_dentry(struct dentry *dentry) { struct debugfs_fsdata *fsd = dentry->d_fsdata; if (fsd) { WARN_ON(!list_empty(&fsd->cancellations)); 
mutex_destroy(&fsd->cancellations_mtx); } kfree(fsd); } static struct vfsmount *debugfs_automount(struct path *path) { struct inode *inode = path->dentry->d_inode; return DEBUGFS_I(inode)->automount(path->dentry, inode->i_private); } static const struct dentry_operations debugfs_dops = { .d_release = debugfs_release_dentry, .d_automount = debugfs_automount, }; static int debugfs_fill_super(struct super_block *sb, struct fs_context *fc) { static const struct tree_descr debug_files[] = {{""}}; int err; err = simple_fill_super(sb, DEBUGFS_MAGIC, debug_files); if (err) return err; sb->s_op = &debugfs_super_operations; set_default_d_op(sb, &debugfs_dops); sb->s_d_flags |= DCACHE_DONTCACHE; debugfs_apply_options(sb); return 0; } static int debugfs_get_tree(struct fs_context *fc) { int err; err = get_tree_single(fc, debugfs_fill_super); if (err) return err; return debugfs_reconfigure(fc); } static void debugfs_free_fc(struct fs_context *fc) { kfree(fc->s_fs_info); } static const struct fs_context_operations debugfs_context_ops = { .free = debugfs_free_fc, .parse_param = debugfs_parse_param, .get_tree = debugfs_get_tree, .reconfigure = debugfs_reconfigure, }; static int debugfs_init_fs_context(struct fs_context *fc) { struct debugfs_fs_info *fsi; fsi = kzalloc_obj(struct debugfs_fs_info); if (!fsi) return -ENOMEM; fsi->mode = DEBUGFS_DEFAULT_MODE; fc->s_fs_info = fsi; fc->ops = &debugfs_context_ops; return 0; } static struct file_system_type debug_fs_type = { .owner = THIS_MODULE, .name = "debugfs", .init_fs_context = debugfs_init_fs_context, .parameters = debugfs_param_specs, .kill_sb = kill_anon_super, }; MODULE_ALIAS_FS("debugfs"); /** * debugfs_lookup() - look up an existing debugfs file * @name: a pointer to a string containing the name of the file to look up. * @parent: a pointer to the parent dentry of the file. * * This function will return a pointer to a dentry if it succeeds. If the file * doesn't exist or an error occurs, %NULL will be returned. The returned * dentry must be passed to dput() when it is no longer needed. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. */ struct dentry *debugfs_lookup(const char *name, struct dentry *parent) { struct dentry *dentry; if (!debugfs_initialized() || IS_ERR_OR_NULL(name) || IS_ERR(parent)) return NULL; if (!parent) parent = debugfs_mount->mnt_root; dentry = lookup_noperm_positive_unlocked(&QSTR(name), parent); if (IS_ERR(dentry)) return NULL; return dentry; } EXPORT_SYMBOL_GPL(debugfs_lookup); static struct dentry *debugfs_start_creating(const char *name, struct dentry *parent) { struct dentry *dentry; int error; if (!debugfs_enabled) return ERR_PTR(-EPERM); if (!debugfs_initialized()) return ERR_PTR(-ENOENT); pr_debug("creating file '%s'\n", name); if (IS_ERR(parent)) return parent; error = simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count); if (error) { pr_err("Unable to pin filesystem for file '%s'\n", name); return ERR_PTR(error); } /* If the parent is not specified, we create it in the root. * We need the root dentry to do this, which is in the super * block. A pointer to that is in the struct vfsmount that we * have around. 
*/ if (!parent) parent = debugfs_mount->mnt_root; dentry = simple_start_creating(parent, name); if (IS_ERR(dentry)) { if (dentry == ERR_PTR(-EEXIST)) pr_err("'%s' already exists in '%pd'\n", name, parent); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } return dentry; } static struct dentry *debugfs_failed_creating(struct dentry *dentry) { simple_done_creating(dentry); simple_release_fs(&debugfs_mount, &debugfs_mount_count); return ERR_PTR(-ENOMEM); } static struct dentry *debugfs_end_creating(struct dentry *dentry) { simple_done_creating(dentry); return dentry; // borrowed } static struct dentry *__debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const void *aux, const struct file_operations *proxy_fops, const void *real_fops) { struct dentry *dentry; struct inode *inode; if (!(mode & S_IFMT)) mode |= S_IFREG; BUG_ON(!S_ISREG(mode)); dentry = debugfs_start_creating(name, parent); if (IS_ERR(dentry)) return dentry; inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) { pr_err("out of free dentries, can not create file '%s'\n", name); return debugfs_failed_creating(dentry); } inode->i_mode = mode; inode->i_private = data; inode->i_op = &debugfs_file_inode_operations; if (!real_fops) proxy_fops = &debugfs_noop_file_operations; inode->i_fop = proxy_fops; DEBUGFS_I(inode)->raw = real_fops; DEBUGFS_I(inode)->aux = (void *)aux; d_make_persistent(dentry, inode); fsnotify_create(d_inode(dentry->d_parent), dentry); return debugfs_end_creating(dentry); } struct dentry *debugfs_create_file_full(const char *name, umode_t mode, struct dentry *parent, void *data, const void *aux, const struct file_operations *fops) { return __debugfs_create_file(name, mode, parent, data, aux, &debugfs_full_proxy_file_operations, fops); } EXPORT_SYMBOL_GPL(debugfs_create_file_full); struct dentry *debugfs_create_file_short(const char *name, umode_t mode, struct dentry *parent, void *data, const void *aux, const struct debugfs_short_fops *fops) { return __debugfs_create_file(name, mode, parent, data, aux, &debugfs_full_short_proxy_file_operations, fops); } EXPORT_SYMBOL_GPL(debugfs_create_file_short); /** * debugfs_create_file_unsafe - create a file in the debugfs filesystem * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * file will be created in the root of the debugfs filesystem. * @data: a pointer to something that the caller will want to get to later * on. The inode.i_private pointer will point to this value on * the open() call. * @fops: a pointer to a struct file_operations that should be used for * this file. * * debugfs_create_file_unsafe() is completely analogous to * debugfs_create_file(), the only difference being that the fops * handed it will not get protected against file removals by the * debugfs core. * * It is your responsibility to protect your struct file_operation * methods against file removals by means of debugfs_file_get() * and debugfs_file_put(). ->open() is still protected by * debugfs though. * * Any struct file_operations defined by means of * DEFINE_DEBUGFS_ATTRIBUTE() is protected against file removals and * thus, may be used here. 
*/ struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { return __debugfs_create_file(name, mode, parent, data, NULL, &debugfs_open_proxy_file_operations, fops); } EXPORT_SYMBOL_GPL(debugfs_create_file_unsafe); /** * debugfs_create_file_size - create a file in the debugfs filesystem * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * file will be created in the root of the debugfs filesystem. * @data: a pointer to something that the caller will want to get to later * on. The inode.i_private pointer will point to this value on * the open() call. * @fops: a pointer to a struct file_operations that should be used for * this file. * @file_size: initial file size * * This is the basic "create a file" function for debugfs. It allows for a * wide range of flexibility in creating a file, or a directory (if you want * to create a directory, the debugfs_create_dir() function is * recommended to be used instead.) */ void debugfs_create_file_size(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops, loff_t file_size) { struct dentry *de = debugfs_create_file(name, mode, parent, data, fops); if (!IS_ERR(de)) d_inode(de)->i_size = file_size; } EXPORT_SYMBOL_GPL(debugfs_create_file_size); /** * debugfs_create_dir - create a directory in the debugfs filesystem * @name: a pointer to a string containing the name of the directory to * create. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * directory will be created in the root of the debugfs filesystem. * * This function creates a directory in debugfs with the given name. * * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. * * NOTE: it's expected that most callers should _ignore_ the errors returned * by this function. Other debugfs functions handle the fact that the "dentry" * passed to them could be an error and they don't crash in that case. * Drivers should generally work fine even if debugfs fails to init anyway. */ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) { struct dentry *dentry = debugfs_start_creating(name, parent); struct inode *inode; if (IS_ERR(dentry)) return dentry; inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) { pr_err("out of free dentries, can not create directory '%s'\n", name); return debugfs_failed_creating(dentry); } inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; inode->i_op = &debugfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." 
entry) */ inc_nlink(inode); d_make_persistent(dentry, inode); inc_nlink(d_inode(dentry->d_parent)); fsnotify_mkdir(d_inode(dentry->d_parent), dentry); return debugfs_end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_dir); /** * debugfs_create_automount - create automount point in the debugfs filesystem * @name: a pointer to a string containing the name of the file to create. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * file will be created in the root of the debugfs filesystem. * @f: function to be called when pathname resolution steps on that one. * @data: opaque argument to pass to f(). * * @f should return what ->d_automount() would. */ struct dentry *debugfs_create_automount(const char *name, struct dentry *parent, debugfs_automount_t f, void *data) { struct dentry *dentry = debugfs_start_creating(name, parent); struct inode *inode; if (IS_ERR(dentry)) return dentry; inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) { pr_err("out of free dentries, can not create automount '%s'\n", name); return debugfs_failed_creating(dentry); } make_empty_dir_inode(inode); inode->i_flags |= S_AUTOMOUNT; inode->i_private = data; DEBUGFS_I(inode)->automount = f; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); d_make_persistent(dentry, inode); inc_nlink(d_inode(dentry->d_parent)); fsnotify_mkdir(d_inode(dentry->d_parent), dentry); return debugfs_end_creating(dentry); } EXPORT_SYMBOL(debugfs_create_automount); /** * debugfs_create_symlink- create a symbolic link in the debugfs filesystem * @name: a pointer to a string containing the name of the symbolic link to * create. * @parent: a pointer to the parent dentry for this symbolic link. This * should be a directory dentry if set. If this parameter is NULL, * then the symbolic link will be created in the root of the debugfs * filesystem. * @target: a pointer to a string containing the path to the target of the * symbolic link. * * This function creates a symbolic link with the given name in debugfs that * links to the given target path. * * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the symbolic * link is to be removed (no automatic cleanup happens if your module is * unloaded, you are responsible here.) If an error occurs, ERR_PTR(-ERROR) * will be returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. 
*/ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, const char *target) { struct dentry *dentry; struct inode *inode; char *link = kstrdup(target, GFP_KERNEL); if (!link) return ERR_PTR(-ENOMEM); dentry = debugfs_start_creating(name, parent); if (IS_ERR(dentry)) { kfree(link); return dentry; } inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) { pr_err("out of free dentries, can not create symlink '%s'\n", name); kfree(link); return debugfs_failed_creating(dentry); } inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_op = &debugfs_symlink_inode_operations; inode->i_link = link; d_make_persistent(dentry, inode); return debugfs_end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_symlink); static void __debugfs_file_removed(struct dentry *dentry) { struct debugfs_fsdata *fsd; /* * Paired with the closing smp_mb() implied by a successful * cmpxchg() in debugfs_file_get(): either * debugfs_file_get() must see a dead dentry or we must see a * debugfs_fsdata instance at ->d_fsdata here (or both). */ smp_mb(); fsd = READ_ONCE(dentry->d_fsdata); if (!fsd) return; /* if this was the last reference, we're done */ if (refcount_dec_and_test(&fsd->active_users)) return; /* * If there's still a reference, the code that obtained it can * be in different states: * - The common case of not using cancellations, or already * after debugfs_leave_cancellation(), where we just need * to wait for debugfs_file_put() which signals the completion; * - inside a cancellation section, i.e. between * debugfs_enter_cancellation() and debugfs_leave_cancellation(), * in which case we need to trigger the ->cancel() function, * and then wait for debugfs_file_put() just like in the * previous case; * - before debugfs_enter_cancellation() (but obviously after * debugfs_file_get()), in which case we may not see the * cancellation in the list on the first round of the loop, * but debugfs_enter_cancellation() signals the completion * after adding it, so this code gets woken up to call the * ->cancel() function. */ while (refcount_read(&fsd->active_users)) { struct debugfs_cancellation *c; /* * Lock the cancellations. Note that the cancellations * structs are meant to be on the stack, so we need to * ensure we either use them here or don't touch them, * and debugfs_leave_cancellation() will wait for this * to be finished processing before exiting one. It may * of course win and remove the cancellation, but then * chances are we never even got into this bit, we only * do if the refcount isn't zero already. */ mutex_lock(&fsd->cancellations_mtx); while ((c = list_first_entry_or_null(&fsd->cancellations, typeof(*c), list))) { list_del_init(&c->list); c->cancel(dentry, c->cancel_data); } mutex_unlock(&fsd->cancellations_mtx); wait_for_completion(&fsd->active_users_drained); } } static void remove_one(struct dentry *victim) { if (d_is_reg(victim)) __debugfs_file_removed(victim); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } /** * debugfs_remove - recursively removes a directory * @dentry: a pointer to a the dentry of the directory to be removed. If this * parameter is NULL or an error value, nothing will be done. * * This function recursively removes a directory tree in debugfs that * was previously created with a call to another debugfs function * (like debugfs_create_file() or variants thereof.) * * This function is required to be called in order for the file to be * removed, no automatic cleanup of files will happen when a module is * removed, you are responsible here. 
*/ void debugfs_remove(struct dentry *dentry) { if (IS_ERR_OR_NULL(dentry)) return; simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count); simple_recursive_removal(dentry, remove_one); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } EXPORT_SYMBOL_GPL(debugfs_remove); /** * debugfs_lookup_and_remove - lookup a directory or file and recursively remove it * @name: a pointer to a string containing the name of the item to look up. * @parent: a pointer to the parent dentry of the item. * * This is the equlivant of doing something like * debugfs_remove(debugfs_lookup(..)) but with the proper reference counting * handled for the directory being looked up. */ void debugfs_lookup_and_remove(const char *name, struct dentry *parent) { struct dentry *dentry; dentry = debugfs_lookup(name, parent); if (!dentry) return; debugfs_remove(dentry); dput(dentry); } EXPORT_SYMBOL_GPL(debugfs_lookup_and_remove); /** * debugfs_change_name - rename a file/directory in the debugfs filesystem * @dentry: dentry of an object to be renamed. * @fmt: format for new name * * This function renames a file/directory in debugfs. The target must not * exist for rename to succeed. * * This function will return 0 on success and -E... on failure. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. */ int __printf(2, 3) debugfs_change_name(struct dentry *dentry, const char *fmt, ...) { int error = 0; const char *new_name; struct name_snapshot old_name; struct dentry *target; struct renamedata rd = {}; struct inode *dir; va_list ap; if (IS_ERR_OR_NULL(dentry)) return 0; va_start(ap, fmt); new_name = kvasprintf_const(GFP_KERNEL, fmt, ap); va_end(ap); if (!new_name) return -ENOMEM; rd.old_parent = dget_parent(dentry); rd.new_parent = rd.old_parent; rd.flags = RENAME_NOREPLACE; target = lookup_noperm_unlocked(&QSTR(new_name), rd.new_parent); if (IS_ERR(target)) { error = PTR_ERR(target); goto out_free; } error = start_renaming_two_dentries(&rd, dentry, target); if (error) { if (error == -EEXIST && target == dentry) /* it isn't an error to rename a thing to itself */ error = 0; goto out; } dir = d_inode(rd.old_parent); take_dentry_name_snapshot(&old_name, dentry); simple_rename_timestamp(dir, dentry, dir, rd.new_dentry); d_move(dentry, rd.new_dentry); fsnotify_move(dir, dir, &old_name.name, d_is_dir(dentry), NULL, dentry); release_dentry_name_snapshot(&old_name); end_renaming(&rd); out: dput(rd.old_parent); dput(target); out_free: kfree_const(new_name); return error; } EXPORT_SYMBOL_GPL(debugfs_change_name); /** * debugfs_initialized - Tells whether debugfs has been registered */ bool debugfs_initialized(void) { return debugfs_registered; } EXPORT_SYMBOL_GPL(debugfs_initialized); static int __init debugfs_kernel(char *str) { if (str) { if (!strcmp(str, "on")) debugfs_enabled = true; else if (!strcmp(str, "off")) debugfs_enabled = false; else if (!strcmp(str, "no-mount")) { pr_notice("debugfs=no-mount is a deprecated alias " "for debugfs=off\n"); debugfs_enabled = false; } } return 0; } early_param("debugfs", debugfs_kernel); static int __init debugfs_init(void) { int retval; if (!debugfs_enabled) return -EPERM; retval = sysfs_create_mount_point(kernel_kobj, "debug"); if (retval) return retval; debugfs_inode_cachep = kmem_cache_create("debugfs_inode_cache", sizeof(struct debugfs_inode_info), 0, SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, init_once); if (debugfs_inode_cachep == NULL) { sysfs_remove_mount_point(kernel_kobj, "debug"); return -ENOMEM; } retval = 
register_filesystem(&debug_fs_type);
	if (retval) { // Really not going to happen
		sysfs_remove_mount_point(kernel_kobj, "debug");
		kmem_cache_destroy(debugfs_inode_cachep);
		return retval;
	}
	debugfs_registered = true;
	return 0;
}
core_initcall(debugfs_init);
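/*
 * Usage sketch (illustrative, not part of the file above): how a driver
 * typically consumes the debugfs API implemented here. All "example_" names
 * are hypothetical; only the debugfs_*() helpers, DEFINE_SHOW_ATTRIBUTE()
 * and the module hooks are real kernel interfaces. Error returns from the
 * create helpers are deliberately ignored, as recommended above, and the
 * whole tree is torn down by a single debugfs_remove() on unload.
 */
#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static struct dentry *example_dir;
static u32 example_counter;

static int example_stats_show(struct seq_file *s, void *unused)
{
	seq_printf(s, "counter: %u\n", example_counter);
	return 0;
}
DEFINE_SHOW_ATTRIBUTE(example_stats);

static int __init example_debugfs_init(void)
{
	/* Safe even if debugfs failed to init: helpers accept ERR_PTR parents. */
	example_dir = debugfs_create_dir("example", NULL);
	debugfs_create_u32("counter", 0444, example_dir, &example_counter);
	debugfs_create_file("stats", 0444, example_dir, NULL,
			    &example_stats_fops);
	return 0;
}

static void __exit example_debugfs_exit(void)
{
	/* No automatic cleanup on module unload; remove the tree explicitly. */
	debugfs_remove(example_dir);
}

module_init(example_debugfs_init);
module_exit(example_debugfs_exit);
MODULE_LICENSE("GPL");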
| 3 3 3 1 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 | // SPDX-License-Identifier: GPL-2.0 /* * udc.c - Core UDC Framework * * Copyright (C) 2016 Intel Corporation * Author: Felipe Balbi <felipe.balbi@linux.intel.com> */ #undef TRACE_SYSTEM #define TRACE_SYSTEM gadget #if !defined(__UDC_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) #define __UDC_TRACE_H #include <linux/types.h> #include <linux/tracepoint.h> #include <asm/byteorder.h> #include <linux/usb/gadget.h> DECLARE_EVENT_CLASS(udc_log_gadget, TP_PROTO(struct usb_gadget *g, int ret), TP_ARGS(g, ret), TP_STRUCT__entry( __field(enum usb_device_speed, speed) __field(enum usb_device_speed, max_speed) __field(enum usb_device_state, state) __field(unsigned, mA) __field(unsigned, sg_supported) __field(unsigned, is_otg) __field(unsigned, is_a_peripheral) __field(unsigned, b_hnp_enable) __field(unsigned, a_hnp_support) __field(unsigned, hnp_polling_support) __field(unsigned, host_request_flag) __field(unsigned, quirk_ep_out_aligned_size) __field(unsigned, quirk_altset_not_supp) __field(unsigned, quirk_stall_not_supp) __field(unsigned, quirk_zlp_not_supp) __field(unsigned, is_selfpowered) __field(unsigned, deactivated) __field(unsigned, connected) __field(int, ret) ), TP_fast_assign( __entry->speed = g->speed; __entry->max_speed = g->max_speed; __entry->state = g->state; __entry->mA = g->mA; __entry->sg_supported = g->sg_supported; __entry->is_otg = g->is_otg; __entry->is_a_peripheral = g->is_a_peripheral; __entry->b_hnp_enable = g->b_hnp_enable; __entry->a_hnp_support = g->a_hnp_support; __entry->hnp_polling_support = g->hnp_polling_support; __entry->host_request_flag = g->host_request_flag; __entry->quirk_ep_out_aligned_size = g->quirk_ep_out_aligned_size; __entry->quirk_altset_not_supp = g->quirk_altset_not_supp; __entry->quirk_stall_not_supp = g->quirk_stall_not_supp; __entry->quirk_zlp_not_supp = g->quirk_zlp_not_supp; __entry->is_selfpowered = g->is_selfpowered; __entry->deactivated = g->deactivated; __entry->connected = g->connected; __entry->ret = ret; ), TP_printk("speed %d/%d state %d %dmA [%s%s%s%s%s%s%s%s%s%s%s%s%s%s] --> %d", __entry->speed, __entry->max_speed, __entry->state, __entry->mA, __entry->sg_supported ? "sg:" : "", __entry->is_otg ? "OTG:" : "", __entry->is_a_peripheral ? "a_peripheral:" : "", __entry->b_hnp_enable ? "b_hnp:" : "", __entry->a_hnp_support ? "a_hnp:" : "", __entry->hnp_polling_support ? "hnp_poll:" : "", __entry->host_request_flag ? 
"hostreq:" : "", __entry->quirk_ep_out_aligned_size ? "out_aligned:" : "", __entry->quirk_altset_not_supp ? "no_altset:" : "", __entry->quirk_stall_not_supp ? "no_stall:" : "", __entry->quirk_zlp_not_supp ? "no_zlp" : "", __entry->is_selfpowered ? "self-powered:" : "bus-powered:", __entry->deactivated ? "deactivated:" : "activated:", __entry->connected ? "connected" : "disconnected", __entry->ret) ); DEFINE_EVENT(udc_log_gadget, usb_gadget_set_state, TP_PROTO(struct usb_gadget *g, int ret), TP_ARGS(g, ret) ); DEFINE_EVENT(udc_log_gadget, usb_gadget_frame_number, TP_PROTO(struct usb_gadget *g, int ret), TP_ARGS(g, ret) ); DEFINE_EVENT(udc_log_gadget, usb_gadget_wakeup, TP_PROTO(struct usb_gadget *g, int ret), TP_ARGS(g, ret) ); DEFINE_EVENT(udc_log_gadget, usb_gadget_set_remote_wakeup, TP_PROTO(struct usb_gadget *g, int ret), TP_ARGS(g, ret) ); DEFINE_EVENT(udc_log_gadget, usb_gadget_set_selfpowered, TP_PROTO(struct usb_gadget *g, int ret), TP_ARGS(g, ret) ); DEFINE_EVENT(udc_log_gadget, usb_gadget_clear_selfpowered, TP_PROTO(struct usb_gadget *g, int ret), TP_ARGS(g, ret) ); DEFINE_EVENT(udc_log_gadget, usb_gadget_vbus_connect, TP_PROTO(struct usb_gadget *g, int ret), TP_ARGS(g, ret) ); DEFINE_EVENT(udc_log_gadget, usb_gadget_vbus_draw, TP_PROTO(struct usb_gadget *g, int ret), TP_ARGS(g, ret) ); DEFINE_EVENT(udc_log_gadget, usb_gadget_vbus_disconnect, TP_PROTO(struct usb_gadget *g, int ret), TP_ARGS(g, ret) ); DEFINE_EVENT(udc_log_gadget, usb_gadget_connect, TP_PROTO(struct usb_gadget *g, int ret), TP_ARGS(g, ret) ); DEFINE_EVENT(udc_log_gadget, usb_gadget_disconnect, TP_PROTO(struct usb_gadget *g, int ret), TP_ARGS(g, ret) ); DEFINE_EVENT(udc_log_gadget, usb_gadget_deactivate, TP_PROTO(struct usb_gadget *g, int ret), TP_ARGS(g, ret) ); DEFINE_EVENT(udc_log_gadget, usb_gadget_activate, TP_PROTO(struct usb_gadget *g, int ret), TP_ARGS(g, ret) ); DECLARE_EVENT_CLASS(udc_log_ep, TP_PROTO(struct usb_ep *ep, int ret), TP_ARGS(ep, ret), TP_STRUCT__entry( __string(name, ep->name) __field(unsigned, maxpacket) __field(unsigned, maxpacket_limit) __field(unsigned, max_streams) __field(unsigned, mult) __field(unsigned, maxburst) __field(u8, address) __field(bool, claimed) __field(bool, enabled) __field(int, ret) ), TP_fast_assign( __assign_str(name); __entry->maxpacket = ep->maxpacket; __entry->maxpacket_limit = ep->maxpacket_limit; __entry->max_streams = ep->max_streams; __entry->mult = ep->mult; __entry->maxburst = ep->maxburst; __entry->address = ep->address, __entry->claimed = ep->claimed; __entry->enabled = ep->enabled; __entry->ret = ret; ), TP_printk("%s: mps %d/%d streams %d mult %d burst %d addr %02x %s%s --> %d", __get_str(name), __entry->maxpacket, __entry->maxpacket_limit, __entry->max_streams, __entry->mult, __entry->maxburst, __entry->address, __entry->claimed ? "claimed:" : "released:", __entry->enabled ? 
"enabled" : "disabled", ret) ); DEFINE_EVENT(udc_log_ep, usb_ep_set_maxpacket_limit, TP_PROTO(struct usb_ep *ep, int ret), TP_ARGS(ep, ret) ); DEFINE_EVENT(udc_log_ep, usb_ep_enable, TP_PROTO(struct usb_ep *ep, int ret), TP_ARGS(ep, ret) ); DEFINE_EVENT(udc_log_ep, usb_ep_disable, TP_PROTO(struct usb_ep *ep, int ret), TP_ARGS(ep, ret) ); DEFINE_EVENT(udc_log_ep, usb_ep_set_halt, TP_PROTO(struct usb_ep *ep, int ret), TP_ARGS(ep, ret) ); DEFINE_EVENT(udc_log_ep, usb_ep_clear_halt, TP_PROTO(struct usb_ep *ep, int ret), TP_ARGS(ep, ret) ); DEFINE_EVENT(udc_log_ep, usb_ep_set_wedge, TP_PROTO(struct usb_ep *ep, int ret), TP_ARGS(ep, ret) ); DEFINE_EVENT(udc_log_ep, usb_ep_fifo_status, TP_PROTO(struct usb_ep *ep, int ret), TP_ARGS(ep, ret) ); DEFINE_EVENT(udc_log_ep, usb_ep_fifo_flush, TP_PROTO(struct usb_ep *ep, int ret), TP_ARGS(ep, ret) ); DECLARE_EVENT_CLASS(udc_log_req, TP_PROTO(struct usb_ep *ep, struct usb_request *req, int ret), TP_ARGS(ep, req, ret), TP_STRUCT__entry( __string(name, ep->name) __field(unsigned, length) __field(unsigned, actual) __field(unsigned, num_sgs) __field(unsigned, num_mapped_sgs) __field(unsigned, stream_id) __field(unsigned, no_interrupt) __field(unsigned, zero) __field(unsigned, short_not_ok) __field(int, status) __field(int, ret) __field(struct usb_request *, req) ), TP_fast_assign( __assign_str(name); __entry->length = req->length; __entry->actual = req->actual; __entry->num_sgs = req->num_sgs; __entry->num_mapped_sgs = req->num_mapped_sgs; __entry->stream_id = req->stream_id; __entry->no_interrupt = req->no_interrupt; __entry->zero = req->zero; __entry->short_not_ok = req->short_not_ok; __entry->status = req->status; __entry->ret = ret; __entry->req = req; ), TP_printk("%s: req %p length %d/%d sgs %d/%d stream %d %s%s%s status %d --> %d", __get_str(name),__entry->req, __entry->actual, __entry->length, __entry->num_mapped_sgs, __entry->num_sgs, __entry->stream_id, __entry->zero ? "Z" : "z", __entry->short_not_ok ? "S" : "s", __entry->no_interrupt ? "i" : "I", __entry->status, __entry->ret ) ); DEFINE_EVENT(udc_log_req, usb_ep_alloc_request, TP_PROTO(struct usb_ep *ep, struct usb_request *req, int ret), TP_ARGS(ep, req, ret) ); DEFINE_EVENT(udc_log_req, usb_ep_free_request, TP_PROTO(struct usb_ep *ep, struct usb_request *req, int ret), TP_ARGS(ep, req, ret) ); DEFINE_EVENT(udc_log_req, usb_ep_queue, TP_PROTO(struct usb_ep *ep, struct usb_request *req, int ret), TP_ARGS(ep, req, ret) ); DEFINE_EVENT(udc_log_req, usb_ep_dequeue, TP_PROTO(struct usb_ep *ep, struct usb_request *req, int ret), TP_ARGS(ep, req, ret) ); DEFINE_EVENT(udc_log_req, usb_gadget_giveback_request, TP_PROTO(struct usb_ep *ep, struct usb_request *req, int ret), TP_ARGS(ep, req, ret) ); #endif /* __UDC_TRACE_H */ /* this part has to be here */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h> |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PTRACE_H #define _ASM_X86_PTRACE_H #include <asm/segment.h> #include <asm/page_types.h> #include <uapi/asm/ptrace.h> #ifndef __ASSEMBLER__ #ifdef __i386__ struct pt_regs { /* * NB: 32-bit x86 CPUs are inconsistent as what happens in the * following cases (where %seg represents a segment register): * * - pushl %seg: some do a 16-bit write and leave the high * bits alone * - movl %seg, [mem]: some do a 16-bit write despite the movl * - IDT entry: some (e.g. 486) will leave the high bits of CS * and (if applicable) SS undefined. * * Fortunately, x86-32 doesn't read the high bits on POP or IRET, * so we can just treat all of the segment registers as 16-bit * values. */ unsigned long bx; unsigned long cx; unsigned long dx; unsigned long si; unsigned long di; unsigned long bp; unsigned long ax; unsigned short ds; unsigned short __dsh; unsigned short es; unsigned short __esh; unsigned short fs; unsigned short __fsh; /* * On interrupt, gs and __gsh store the vector number. They never * store gs any more. */ unsigned short gs; unsigned short __gsh; /* On interrupt, this is the error code. 
*/ unsigned long orig_ax; unsigned long ip; unsigned short cs; unsigned short __csh; unsigned long flags; unsigned long sp; unsigned short ss; unsigned short __ssh; }; #else /* __i386__ */ struct fred_cs { /* CS selector */ u64 cs : 16, /* Stack level at event time */ sl : 2, /* IBT in WAIT_FOR_ENDBRANCH state */ wfe : 1, : 45; }; struct fred_ss { /* SS selector */ u64 ss : 16, /* STI state */ sti : 1, /* Set if syscall, sysenter or INT n */ swevent : 1, /* Event is NMI type */ nmi : 1, : 13, /* Event vector */ vector : 8, : 8, /* Event type */ type : 4, : 4, /* Event was incident to enclave execution */ enclave : 1, /* CPU was in 64-bit mode */ l : 1, /* * Nested exception during FRED delivery, not set * for #DF. */ nested : 1, : 1, /* * The length of the instruction causing the event. * Only set for INTO, INT1, INT3, INT n, SYSCALL * and SYSENTER. 0 otherwise. */ insnlen : 4; }; struct pt_regs { /* * C ABI says these regs are callee-preserved. They aren't saved on * kernel entry unless syscall needs a complete, fully filled * "struct pt_regs". */ unsigned long r15; unsigned long r14; unsigned long r13; unsigned long r12; unsigned long bp; unsigned long bx; /* These regs are callee-clobbered. Always saved on kernel entry. */ unsigned long r11; unsigned long r10; unsigned long r9; unsigned long r8; unsigned long ax; unsigned long cx; unsigned long dx; unsigned long si; unsigned long di; /* * orig_ax is used on entry for: * - the syscall number (syscall, sysenter, int80) * - error_code stored by the CPU on traps and exceptions * - the interrupt number for device interrupts * * A FRED stack frame starts here: * 1) It _always_ includes an error code; * * 2) The return frame for ERET[US] starts here, but * the content of orig_ax is ignored. */ unsigned long orig_ax; /* The IRETQ return frame starts here */ unsigned long ip; union { /* CS selector */ u16 cs; /* The extended 64-bit data slot containing CS */ u64 csx; /* The FRED CS extension */ struct fred_cs fred_cs; }; unsigned long flags; unsigned long sp; union { /* SS selector */ u16 ss; /* The extended 64-bit data slot containing SS */ u64 ssx; /* The FRED SS extension */ struct fred_ss fred_ss; }; /* * Top of stack on IDT systems, while FRED systems have extra fields * defined above for storing exception related information, e.g. CR2 or * DR6. */ }; #endif /* !__i386__ */ #ifdef CONFIG_PARAVIRT #include <asm/paravirt-base.h> #endif #include <asm/proto.h> struct cpuinfo_x86; struct task_struct; extern unsigned long profile_pc(struct pt_regs *regs); extern unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); extern void send_sigtrap(struct pt_regs *regs, int error_code, int si_code); static __always_inline unsigned long regs_return_value(struct pt_regs *regs) { return regs->ax; } static __always_inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) { regs->ax = rc; } /* * user_mode(regs) determines whether a register set came from user * mode. On x86_32, this is true if V8086 mode was enabled OR if the * register set was from protected mode with RPL-3 CS value. This * tricky test checks that with one comparison. * * On x86_64, vm86 mode is mercifully nonexistent, and we don't need * the extra check. 
*/ static __always_inline int user_mode(struct pt_regs *regs) { #ifdef CONFIG_X86_32 return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >= USER_RPL; #else return !!(regs->cs & 3); #endif } static __always_inline int v8086_mode(struct pt_regs *regs) { #ifdef CONFIG_X86_32 return (regs->flags & X86_VM_MASK); #else return 0; /* No V86 mode support in long mode */ #endif } static inline bool user_64bit_mode(struct pt_regs *regs) { #ifdef CONFIG_X86_64 #ifndef CONFIG_PARAVIRT_XXL /* * On non-paravirt systems, this is the only long mode CPL 3 * selector. We do not allow long mode selectors in the LDT. */ return regs->cs == __USER_CS; #else /* Headers are too twisted for this to go in paravirt.h. */ return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs; #endif #else /* !CONFIG_X86_64 */ return false; #endif } /* * Determine whether the register set came from any context that is running in * 64-bit mode. */ static inline bool any_64bit_mode(struct pt_regs *regs) { #ifdef CONFIG_X86_64 return !user_mode(regs) || user_64bit_mode(regs); #else return false; #endif } #ifdef CONFIG_X86_64 #define current_user_stack_pointer() current_pt_regs()->sp #define compat_user_stack_pointer() current_pt_regs()->sp static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs) { bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 && regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack); ret = ret || (regs->ip >= (unsigned long)entry_SYSRETQ_unsafe_stack && regs->ip < (unsigned long)entry_SYSRETQ_end); #ifdef CONFIG_IA32_EMULATION ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_compat && regs->ip < (unsigned long)entry_SYSCALL_compat_safe_stack); ret = ret || (regs->ip >= (unsigned long)entry_SYSRETL_compat_unsafe_stack && regs->ip < (unsigned long)entry_SYSRETL_compat_end); #endif return ret; } #endif static __always_inline unsigned long kernel_stack_pointer(struct pt_regs *regs) { return regs->sp; } static __always_inline unsigned long instruction_pointer(struct pt_regs *regs) { return regs->ip; } static __always_inline void instruction_pointer_set(struct pt_regs *regs, unsigned long val) { regs->ip = val; } static __always_inline unsigned long frame_pointer(struct pt_regs *regs) { return regs->bp; } static __always_inline unsigned long user_stack_pointer(struct pt_regs *regs) { return regs->sp; } static __always_inline void user_stack_pointer_set(struct pt_regs *regs, unsigned long val) { regs->sp = val; } static __always_inline bool regs_irqs_disabled(struct pt_regs *regs) { return !(regs->flags & X86_EFLAGS_IF); } /* Query offset/name of register from its name/offset */ extern int regs_query_register_offset(const char *name); extern const char *regs_query_register_name(unsigned int offset); #define MAX_REG_OFFSET (offsetof(struct pt_regs, ss)) /** * regs_get_register() - get register value from its offset * @regs: pt_regs from which register value is gotten. * @offset: offset number of the register. * * regs_get_register returns the value of a register. The @offset is the * offset of the register in struct pt_regs address which specified by @regs. * If @offset is bigger than MAX_REG_OFFSET, this returns 0. */ static inline unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset) { if (unlikely(offset > MAX_REG_OFFSET)) return 0; #ifdef CONFIG_X86_32 /* The selector fields are 16-bit. 
*/ if (offset == offsetof(struct pt_regs, cs) || offset == offsetof(struct pt_regs, ss) || offset == offsetof(struct pt_regs, ds) || offset == offsetof(struct pt_regs, es) || offset == offsetof(struct pt_regs, fs) || offset == offsetof(struct pt_regs, gs)) { return *(u16 *)((unsigned long)regs + offset); } #endif return *(unsigned long *)((unsigned long)regs + offset); } /** * regs_within_kernel_stack() - check the address in the stack * @regs: pt_regs which contains kernel stack pointer. * @addr: address which is checked. * * regs_within_kernel_stack() checks @addr is within the kernel stack page(s). * If @addr is within the kernel stack, it returns true. If not, returns false. */ static inline int regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) { return ((addr & ~(THREAD_SIZE - 1)) == (regs->sp & ~(THREAD_SIZE - 1))); } /** * regs_get_kernel_stack_nth_addr() - get the address of the Nth entry on stack * @regs: pt_regs which contains kernel stack pointer. * @n: stack entry number. * * regs_get_kernel_stack_nth() returns the address of the @n th entry of the * kernel stack which is specified by @regs. If the @n th entry is NOT in * the kernel stack, this returns NULL. */ static inline unsigned long *regs_get_kernel_stack_nth_addr(struct pt_regs *regs, unsigned int n) { unsigned long *addr = (unsigned long *)regs->sp; addr += n; if (regs_within_kernel_stack(regs, (unsigned long)addr)) return addr; else return NULL; } /* To avoid include hell, we can't include uaccess.h */ extern long copy_from_kernel_nofault(void *dst, const void *src, size_t size); /** * regs_get_kernel_stack_nth() - get Nth entry of the stack * @regs: pt_regs which contains kernel stack pointer. * @n: stack entry number. * * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which * is specified by @regs. If the @n th entry is NOT in the kernel stack * this returns 0. */ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n) { unsigned long *addr; unsigned long val; long ret; addr = regs_get_kernel_stack_nth_addr(regs, n); if (addr) { ret = copy_from_kernel_nofault(&val, addr, sizeof(val)); if (!ret) return val; } return 0; } /** * regs_get_kernel_argument() - get Nth function argument in kernel * @regs: pt_regs of that context * @n: function argument number (start from 0) * * regs_get_argument() returns @n th argument of the function call. * Note that this chooses most probably assignment, in some case * it can be incorrect. * This is expected to be called from kprobes or ftrace with regs * where the top of stack is the return address. 
*/ static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs, unsigned int n) { static const unsigned int argument_offs[] = { #ifdef __i386__ offsetof(struct pt_regs, ax), offsetof(struct pt_regs, dx), offsetof(struct pt_regs, cx), #define NR_REG_ARGUMENTS 3 #else offsetof(struct pt_regs, di), offsetof(struct pt_regs, si), offsetof(struct pt_regs, dx), offsetof(struct pt_regs, cx), offsetof(struct pt_regs, r8), offsetof(struct pt_regs, r9), #define NR_REG_ARGUMENTS 6 #endif }; if (n >= NR_REG_ARGUMENTS) { n -= NR_REG_ARGUMENTS - 1; return regs_get_kernel_stack_nth(regs, n); } else return regs_get_register(regs, argument_offs[n]); } #define arch_has_single_step() (1) #ifdef CONFIG_X86_DEBUGCTLMSR #define arch_has_block_step() (1) #else #define arch_has_block_step() (boot_cpu_data.x86 >= 6) #endif #define ARCH_HAS_USER_SINGLE_STEP_REPORT struct user_desc; extern int do_get_thread_area(struct task_struct *p, int idx, struct user_desc __user *info); extern int do_set_thread_area(struct task_struct *p, int idx, struct user_desc __user *info, int can_allocate); #ifdef CONFIG_X86_64 # define do_set_thread_area_64(p, s, t) do_arch_prctl_64(p, s, t) #else # define do_set_thread_area_64(p, s, t) (0) #endif #endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_PTRACE_H */ |
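/*
 * Usage sketch (illustrative, not part of the header above): reading probed
 * function arguments with the helpers defined here from a kprobe pre-handler.
 * The probed symbol and all "example_" names are hypothetical;
 * regs_get_kernel_argument() and instruction_pointer() come from this header,
 * the kprobe plumbing from <linux/kprobes.h>.
 */
#include <linux/kprobes.h>
#include <linux/module.h>
#include <asm/ptrace.h>

static int example_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	/* Argument 0 is %rdi on x86-64 (%eax on i386), per argument_offs[]. */
	unsigned long arg0 = regs_get_kernel_argument(regs, 0);

	pr_info("%s hit at %px, arg0=%#lx\n",
		p->symbol_name, (void *)instruction_pointer(regs), arg0);
	return 0;
}

static struct kprobe example_kp = {
	.symbol_name	= "do_sys_openat2",	/* hypothetical probe target */
	.pre_handler	= example_pre_handler,
};

static int __init example_kprobe_init(void)
{
	return register_kprobe(&example_kp);
}

static void __exit example_kprobe_exit(void)
{
	unregister_kprobe(&example_kp);
}

module_init(example_kprobe_init);
module_exit(example_kprobe_exit);
MODULE_LICENSE("GPL");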
3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 | // SPDX-License-Identifier: GPL-2.0+ /* * (C) Copyright Linus Torvalds 1999 * (C) Copyright Johannes Erdfelt 1999-2001 * (C) Copyright Andreas Gal 1999 * (C) Copyright Gregory P. Smith 1999 * (C) Copyright Deti Fliegl 1999 * (C) Copyright Randy Dunlap 2000 * (C) Copyright David Brownell 2000-2002 */ #include <linux/bcd.h> #include <linux/module.h> #include <linux/version.h> #include <linux/kernel.h> #include <linux/sched/task_stack.h> #include <linux/slab.h> #include <linux/completion.h> #include <linux/utsname.h> #include <linux/mm.h> #include <asm/io.h> #include <linux/device.h> #include <linux/dma-mapping.h> #include <linux/mutex.h> #include <asm/irq.h> #include <asm/byteorder.h> #include <linux/unaligned.h> #include <linux/platform_device.h> #include <linux/workqueue.h> #include <linux/pm_runtime.h> #include <linux/types.h> #include <linux/genalloc.h> #include <linux/io.h> #include <linux/kcov.h> #include <linux/phy/phy.h> #include <linux/usb.h> #include <linux/usb/hcd.h> #include <linux/usb/otg.h> #include "usb.h" #include "phy.h" /*-------------------------------------------------------------------------*/ /* * USB Host Controller Driver framework * * Plugs into usbcore (usb_bus) and lets HCDs share code, minimizing * HCD-specific behaviors/bugs. * * This does error checks, tracks devices and urbs, and delegates to a * "hc_driver" only for code (and data) that really needs to know about * hardware differences. That includes root hub registers, i/o queues, * and so on ... but as little else as possible. * * Shared code includes most of the "root hub" code (these are emulated, * though each HC's hardware works differently) and PCI glue, plus request * tracking overhead. The HCD code should only block on spinlocks or on * hardware handshaking; blocking on software events (such as other kernel * threads releasing resources, or completing actions) is all generic. * * Happens the USB 2.0 spec says this would be invisible inside the "USBD", * and includes mostly a "HCDI" (HCD Interface) along with some APIs used * only by the hub driver ... and that neither should be seen or used by * usb client device drivers. * * Contributors of ideas or unattributed patches include: David Brownell, * Roman Weissgaerber, Rory Bolt, Greg Kroah-Hartman, ... * * HISTORY: * 2002-02-21 Pull in most of the usb_bus support from usb.c; some * associated cleanup. "usb_hcd" still != "usb_bus". * 2001-12-12 Initial patch version for Linux 2.5.1 kernel. 
*/ /*-------------------------------------------------------------------------*/ /* host controllers we manage */ DEFINE_IDR (usb_bus_idr); EXPORT_SYMBOL_GPL (usb_bus_idr); /* used when allocating bus numbers */ #define USB_MAXBUS 64 /* used when updating list of hcds */ DEFINE_MUTEX(usb_bus_idr_lock); /* exported only for usbfs */ EXPORT_SYMBOL_GPL (usb_bus_idr_lock); /* used for controlling access to virtual root hubs */ static DEFINE_SPINLOCK(hcd_root_hub_lock); /* used when updating an endpoint's URB list */ static DEFINE_SPINLOCK(hcd_urb_list_lock); /* used to protect against unlinking URBs after the device is gone */ static DEFINE_SPINLOCK(hcd_urb_unlink_lock); /* wait queue for synchronous unlinks */ DECLARE_WAIT_QUEUE_HEAD(usb_kill_urb_queue); /*-------------------------------------------------------------------------*/ /* * Sharable chunks of root hub code. */ /*-------------------------------------------------------------------------*/ #define KERNEL_REL bin2bcd(LINUX_VERSION_MAJOR) #define KERNEL_VER bin2bcd(LINUX_VERSION_PATCHLEVEL) /* usb 3.1 root hub device descriptor */ static const u8 usb31_rh_dev_descriptor[18] = { 0x12, /* __u8 bLength; */ USB_DT_DEVICE, /* __u8 bDescriptorType; Device */ 0x10, 0x03, /* __le16 bcdUSB; v3.1 */ 0x09, /* __u8 bDeviceClass; HUB_CLASSCODE */ 0x00, /* __u8 bDeviceSubClass; */ 0x03, /* __u8 bDeviceProtocol; USB 3 hub */ 0x09, /* __u8 bMaxPacketSize0; 2^9 = 512 Bytes */ 0x6b, 0x1d, /* __le16 idVendor; Linux Foundation 0x1d6b */ 0x03, 0x00, /* __le16 idProduct; device 0x0003 */ KERNEL_VER, KERNEL_REL, /* __le16 bcdDevice */ 0x03, /* __u8 iManufacturer; */ 0x02, /* __u8 iProduct; */ 0x01, /* __u8 iSerialNumber; */ 0x01 /* __u8 bNumConfigurations; */ }; /* usb 3.0 root hub device descriptor */ static const u8 usb3_rh_dev_descriptor[18] = { 0x12, /* __u8 bLength; */ USB_DT_DEVICE, /* __u8 bDescriptorType; Device */ 0x00, 0x03, /* __le16 bcdUSB; v3.0 */ 0x09, /* __u8 bDeviceClass; HUB_CLASSCODE */ 0x00, /* __u8 bDeviceSubClass; */ 0x03, /* __u8 bDeviceProtocol; USB 3.0 hub */ 0x09, /* __u8 bMaxPacketSize0; 2^9 = 512 Bytes */ 0x6b, 0x1d, /* __le16 idVendor; Linux Foundation 0x1d6b */ 0x03, 0x00, /* __le16 idProduct; device 0x0003 */ KERNEL_VER, KERNEL_REL, /* __le16 bcdDevice */ 0x03, /* __u8 iManufacturer; */ 0x02, /* __u8 iProduct; */ 0x01, /* __u8 iSerialNumber; */ 0x01 /* __u8 bNumConfigurations; */ }; /* usb 2.0 root hub device descriptor */ static const u8 usb2_rh_dev_descriptor[18] = { 0x12, /* __u8 bLength; */ USB_DT_DEVICE, /* __u8 bDescriptorType; Device */ 0x00, 0x02, /* __le16 bcdUSB; v2.0 */ 0x09, /* __u8 bDeviceClass; HUB_CLASSCODE */ 0x00, /* __u8 bDeviceSubClass; */ 0x00, /* __u8 bDeviceProtocol; [ usb 2.0 no TT ] */ 0x40, /* __u8 bMaxPacketSize0; 64 Bytes */ 0x6b, 0x1d, /* __le16 idVendor; Linux Foundation 0x1d6b */ 0x02, 0x00, /* __le16 idProduct; device 0x0002 */ KERNEL_VER, KERNEL_REL, /* __le16 bcdDevice */ 0x03, /* __u8 iManufacturer; */ 0x02, /* __u8 iProduct; */ 0x01, /* __u8 iSerialNumber; */ 0x01 /* __u8 bNumConfigurations; */ }; /* no usb 2.0 root hub "device qualifier" descriptor: one speed only */ /* usb 1.1 root hub device descriptor */ static const u8 usb11_rh_dev_descriptor[18] = { 0x12, /* __u8 bLength; */ USB_DT_DEVICE, /* __u8 bDescriptorType; Device */ 0x10, 0x01, /* __le16 bcdUSB; v1.1 */ 0x09, /* __u8 bDeviceClass; HUB_CLASSCODE */ 0x00, /* __u8 bDeviceSubClass; */ 0x00, /* __u8 bDeviceProtocol; [ low/full speeds only ] */ 0x40, /* __u8 bMaxPacketSize0; 64 Bytes */ 0x6b, 0x1d, /* __le16 idVendor; Linux Foundation 
0x1d6b */ 0x01, 0x00, /* __le16 idProduct; device 0x0001 */ KERNEL_VER, KERNEL_REL, /* __le16 bcdDevice */ 0x03, /* __u8 iManufacturer; */ 0x02, /* __u8 iProduct; */ 0x01, /* __u8 iSerialNumber; */ 0x01 /* __u8 bNumConfigurations; */ }; /*-------------------------------------------------------------------------*/ /* Configuration descriptors for our root hubs */ static const u8 fs_rh_config_descriptor[] = { /* one configuration */ 0x09, /* __u8 bLength; */ USB_DT_CONFIG, /* __u8 bDescriptorType; Configuration */ 0x19, 0x00, /* __le16 wTotalLength; */ 0x01, /* __u8 bNumInterfaces; (1) */ 0x01, /* __u8 bConfigurationValue; */ 0x00, /* __u8 iConfiguration; */ 0xc0, /* __u8 bmAttributes; Bit 7: must be set, 6: Self-powered, 5: Remote wakeup, 4..0: resvd */ 0x00, /* __u8 MaxPower; */ /* USB 1.1: * USB 2.0, single TT organization (mandatory): * one interface, protocol 0 * * USB 2.0, multiple TT organization (optional): * two interfaces, protocols 1 (like single TT) * and 2 (multiple TT mode) ... config is * sometimes settable * NOT IMPLEMENTED */ /* one interface */ 0x09, /* __u8 if_bLength; */ USB_DT_INTERFACE, /* __u8 if_bDescriptorType; Interface */ 0x00, /* __u8 if_bInterfaceNumber; */ 0x00, /* __u8 if_bAlternateSetting; */ 0x01, /* __u8 if_bNumEndpoints; */ 0x09, /* __u8 if_bInterfaceClass; HUB_CLASSCODE */ 0x00, /* __u8 if_bInterfaceSubClass; */ 0x00, /* __u8 if_bInterfaceProtocol; [usb1.1 or single tt] */ 0x00, /* __u8 if_iInterface; */ /* one endpoint (status change endpoint) */ 0x07, /* __u8 ep_bLength; */ USB_DT_ENDPOINT, /* __u8 ep_bDescriptorType; Endpoint */ 0x81, /* __u8 ep_bEndpointAddress; IN Endpoint 1 */ 0x03, /* __u8 ep_bmAttributes; Interrupt */ 0x02, 0x00, /* __le16 ep_wMaxPacketSize; 1 + (MAX_ROOT_PORTS / 8) */ 0xff /* __u8 ep_bInterval; (255ms -- usb 2.0 spec) */ }; static const u8 hs_rh_config_descriptor[] = { /* one configuration */ 0x09, /* __u8 bLength; */ USB_DT_CONFIG, /* __u8 bDescriptorType; Configuration */ 0x19, 0x00, /* __le16 wTotalLength; */ 0x01, /* __u8 bNumInterfaces; (1) */ 0x01, /* __u8 bConfigurationValue; */ 0x00, /* __u8 iConfiguration; */ 0xc0, /* __u8 bmAttributes; Bit 7: must be set, 6: Self-powered, 5: Remote wakeup, 4..0: resvd */ 0x00, /* __u8 MaxPower; */ /* USB 1.1: * USB 2.0, single TT organization (mandatory): * one interface, protocol 0 * * USB 2.0, multiple TT organization (optional): * two interfaces, protocols 1 (like single TT) * and 2 (multiple TT mode) ... config is * sometimes settable * NOT IMPLEMENTED */ /* one interface */ 0x09, /* __u8 if_bLength; */ USB_DT_INTERFACE, /* __u8 if_bDescriptorType; Interface */ 0x00, /* __u8 if_bInterfaceNumber; */ 0x00, /* __u8 if_bAlternateSetting; */ 0x01, /* __u8 if_bNumEndpoints; */ 0x09, /* __u8 if_bInterfaceClass; HUB_CLASSCODE */ 0x00, /* __u8 if_bInterfaceSubClass; */ 0x00, /* __u8 if_bInterfaceProtocol; [usb1.1 or single tt] */ 0x00, /* __u8 if_iInterface; */ /* one endpoint (status change endpoint) */ 0x07, /* __u8 ep_bLength; */ USB_DT_ENDPOINT, /* __u8 ep_bDescriptorType; Endpoint */ 0x81, /* __u8 ep_bEndpointAddress; IN Endpoint 1 */ 0x03, /* __u8 ep_bmAttributes; Interrupt */ /* __le16 ep_wMaxPacketSize; 1 + (MAX_ROOT_PORTS / 8) * see hub.c:hub_configure() for details. 
*/ (USB_MAXCHILDREN + 1 + 7) / 8, 0x00, 0x0c /* __u8 ep_bInterval; (256ms -- usb 2.0 spec) */ }; static const u8 ss_rh_config_descriptor[] = { /* one configuration */ 0x09, /* __u8 bLength; */ USB_DT_CONFIG, /* __u8 bDescriptorType; Configuration */ 0x1f, 0x00, /* __le16 wTotalLength; */ 0x01, /* __u8 bNumInterfaces; (1) */ 0x01, /* __u8 bConfigurationValue; */ 0x00, /* __u8 iConfiguration; */ 0xc0, /* __u8 bmAttributes; Bit 7: must be set, 6: Self-powered, 5: Remote wakeup, 4..0: resvd */ 0x00, /* __u8 MaxPower; */ /* one interface */ 0x09, /* __u8 if_bLength; */ USB_DT_INTERFACE, /* __u8 if_bDescriptorType; Interface */ 0x00, /* __u8 if_bInterfaceNumber; */ 0x00, /* __u8 if_bAlternateSetting; */ 0x01, /* __u8 if_bNumEndpoints; */ 0x09, /* __u8 if_bInterfaceClass; HUB_CLASSCODE */ 0x00, /* __u8 if_bInterfaceSubClass; */ 0x00, /* __u8 if_bInterfaceProtocol; */ 0x00, /* __u8 if_iInterface; */ /* one endpoint (status change endpoint) */ 0x07, /* __u8 ep_bLength; */ USB_DT_ENDPOINT, /* __u8 ep_bDescriptorType; Endpoint */ 0x81, /* __u8 ep_bEndpointAddress; IN Endpoint 1 */ 0x03, /* __u8 ep_bmAttributes; Interrupt */ /* __le16 ep_wMaxPacketSize; 1 + (MAX_ROOT_PORTS / 8) * see hub.c:hub_configure() for details. */ (USB_MAXCHILDREN + 1 + 7) / 8, 0x00, 0x0c, /* __u8 ep_bInterval; (256ms -- usb 2.0 spec) */ /* one SuperSpeed endpoint companion descriptor */ 0x06, /* __u8 ss_bLength */ USB_DT_SS_ENDPOINT_COMP, /* __u8 ss_bDescriptorType; SuperSpeed EP */ /* Companion */ 0x00, /* __u8 ss_bMaxBurst; allows 1 TX between ACKs */ 0x00, /* __u8 ss_bmAttributes; 1 packet per service interval */ 0x02, 0x00 /* __le16 ss_wBytesPerInterval; 15 bits for max 15 ports */ }; /* authorized_default behaviour: * -1 is authorized for all devices (leftover from wireless USB) * 0 is unauthorized for all devices * 1 is authorized for all devices * 2 is authorized for internal devices */ #define USB_AUTHORIZE_WIRED -1 #define USB_AUTHORIZE_NONE 0 #define USB_AUTHORIZE_ALL 1 #define USB_AUTHORIZE_INTERNAL 2 static int authorized_default = CONFIG_USB_DEFAULT_AUTHORIZATION_MODE; module_param(authorized_default, int, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(authorized_default, "Default USB device authorization: 0 is not authorized, 1 is authorized (default), 2 is authorized for internal devices, -1 is authorized (same as 1)"); /*-------------------------------------------------------------------------*/ /** * ascii2desc() - Helper routine for producing UTF-16LE string descriptors * @s: Null-terminated ASCII (actually ISO-8859-1) string * @buf: Buffer for USB string descriptor (header + UTF-16LE) * @len: Length (in bytes; may be odd) of descriptor buffer. * * Return: The number of bytes filled in: 2 + 2*strlen(s) or @len, * whichever is less. * * Note: * USB String descriptors can contain at most 126 characters; input * strings longer than that are truncated. 
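 *
 * As a worked example (not part of the function's contract), encoding the
 * two-character string "ab" into a buffer of at least 6 bytes returns 6 and
 * fills the buffer with { 0x06, USB_DT_STRING, 'a', 0x00, 'b', 0x00 }: a
 * length byte, the descriptor type, then each ISO-8859-1 character widened
 * to UTF-16LE.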
*/ static unsigned ascii2desc(char const *s, u8 *buf, unsigned len) { unsigned n, t = 2 + 2*strlen(s); if (t > 254) t = 254; /* Longest possible UTF string descriptor */ if (len > t) len = t; t += USB_DT_STRING << 8; /* Now t is first 16 bits to store */ n = len; while (n--) { *buf++ = t; if (!n--) break; *buf++ = t >> 8; t = (unsigned char)*s++; } return len; } /** * rh_string() - provides string descriptors for root hub * @id: the string ID number (0: langids, 1: serial #, 2: product, 3: vendor) * @hcd: the host controller for this root hub * @data: buffer for output packet * @len: length of the provided buffer * * Produces either a manufacturer, product or serial number string for the * virtual root hub device. * * Return: The number of bytes filled in: the length of the descriptor or * of the provided buffer, whichever is less. */ static unsigned rh_string(int id, struct usb_hcd const *hcd, u8 *data, unsigned len) { char buf[160]; char const *s; static char const langids[4] = {4, USB_DT_STRING, 0x09, 0x04}; /* language ids */ switch (id) { case 0: /* Array of LANGID codes (0x0409 is MSFT-speak for "en-us") */ /* See http://www.usb.org/developers/docs/USB_LANGIDs.pdf */ if (len > 4) len = 4; memcpy(data, langids, len); return len; case 1: /* Serial number */ s = hcd->self.bus_name; break; case 2: /* Product name */ s = hcd->product_desc; break; case 3: /* Manufacturer */ snprintf (buf, sizeof buf, "%s %s %s", init_utsname()->sysname, init_utsname()->release, hcd->driver->description); s = buf; break; default: /* Can't happen; caller guarantees it */ return 0; } return ascii2desc(s, data, len); } /* Root hub control transfers execute synchronously */ static int rh_call_control (struct usb_hcd *hcd, struct urb *urb) { struct usb_ctrlrequest *cmd; u16 typeReq, wValue, wIndex, wLength; u8 *ubuf = urb->transfer_buffer; unsigned len = 0; int status; u8 patch_wakeup = 0; u8 patch_protocol = 0; u16 tbuf_size; u8 *tbuf = NULL; const u8 *bufp; might_sleep(); spin_lock_irq(&hcd_root_hub_lock); status = usb_hcd_link_urb_to_ep(hcd, urb); spin_unlock_irq(&hcd_root_hub_lock); if (status) return status; urb->hcpriv = hcd; /* Indicate it's queued */ cmd = (struct usb_ctrlrequest *) urb->setup_packet; typeReq = (cmd->bRequestType << 8) | cmd->bRequest; wValue = le16_to_cpu (cmd->wValue); wIndex = le16_to_cpu (cmd->wIndex); wLength = le16_to_cpu (cmd->wLength); if (wLength > urb->transfer_buffer_length) goto error; /* * tbuf should be at least as big as the * USB hub descriptor. */ tbuf_size = max_t(u16, sizeof(struct usb_hub_descriptor), wLength); tbuf = kzalloc(tbuf_size, GFP_KERNEL); if (!tbuf) { status = -ENOMEM; goto err_alloc; } bufp = tbuf; urb->actual_length = 0; switch (typeReq) { /* DEVICE REQUESTS */ /* The root hub's remote wakeup enable bit is implemented using * driver model wakeup flags. If this system supports wakeup * through USB, userspace may change the default "allow wakeup" * policy through sysfs or these calls. * * Most root hubs support wakeup from downstream devices, for * runtime power management (disabling USB clocks and reducing * VBUS power usage). However, not all of them do so; silicon, * board, and BIOS bugs here are not uncommon, so these can't * be treated quite like external hubs. * * Likewise, not all root hubs will pass wakeup events upstream, * to wake up the whole system. So don't assume root hub and * controller capabilities are identical. 
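 *
 * (The sysfs policy mentioned above is the standard power/wakeup attribute;
 * for instance, "echo enabled > /sys/bus/usb/devices/usb1/power/wakeup"
 * for the first root hub. The exact path is illustrative and depends on
 * the assigned bus number.)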
*/ case DeviceRequest | USB_REQ_GET_STATUS: tbuf[0] = (device_may_wakeup(&hcd->self.root_hub->dev) << USB_DEVICE_REMOTE_WAKEUP) | (1 << USB_DEVICE_SELF_POWERED); tbuf[1] = 0; len = 2; break; case DeviceOutRequest | USB_REQ_CLEAR_FEATURE: if (wValue == USB_DEVICE_REMOTE_WAKEUP) device_set_wakeup_enable(&hcd->self.root_hub->dev, 0); else goto error; break; case DeviceOutRequest | USB_REQ_SET_FEATURE: if (device_can_wakeup(&hcd->self.root_hub->dev) && wValue == USB_DEVICE_REMOTE_WAKEUP) device_set_wakeup_enable(&hcd->self.root_hub->dev, 1); else goto error; break; case DeviceRequest | USB_REQ_GET_CONFIGURATION: tbuf[0] = 1; len = 1; fallthrough; case DeviceOutRequest | USB_REQ_SET_CONFIGURATION: break; case DeviceRequest | USB_REQ_GET_DESCRIPTOR: switch (wValue & 0xff00) { case USB_DT_DEVICE << 8: switch (hcd->speed) { case HCD_USB32: case HCD_USB31: bufp = usb31_rh_dev_descriptor; break; case HCD_USB3: bufp = usb3_rh_dev_descriptor; break; case HCD_USB2: bufp = usb2_rh_dev_descriptor; break; case HCD_USB11: bufp = usb11_rh_dev_descriptor; break; default: goto error; } len = 18; if (hcd->has_tt) patch_protocol = 1; break; case USB_DT_CONFIG << 8: switch (hcd->speed) { case HCD_USB32: case HCD_USB31: case HCD_USB3: bufp = ss_rh_config_descriptor; len = sizeof ss_rh_config_descriptor; break; case HCD_USB2: bufp = hs_rh_config_descriptor; len = sizeof hs_rh_config_descriptor; break; case HCD_USB11: bufp = fs_rh_config_descriptor; len = sizeof fs_rh_config_descriptor; break; default: goto error; } if (device_can_wakeup(&hcd->self.root_hub->dev)) patch_wakeup = 1; break; case USB_DT_STRING << 8: if ((wValue & 0xff) < 4) urb->actual_length = rh_string(wValue & 0xff, hcd, ubuf, wLength); else /* unsupported IDs --> "protocol stall" */ goto error; break; case USB_DT_BOS << 8: goto nongeneric; default: goto error; } break; case DeviceRequest | USB_REQ_GET_INTERFACE: tbuf[0] = 0; len = 1; fallthrough; case DeviceOutRequest | USB_REQ_SET_INTERFACE: break; case DeviceOutRequest | USB_REQ_SET_ADDRESS: /* wValue == urb->dev->devaddr */ dev_dbg (hcd->self.controller, "root hub device address %d\n", wValue); break; /* INTERFACE REQUESTS (no defined feature/status flags) */ /* ENDPOINT REQUESTS */ case EndpointRequest | USB_REQ_GET_STATUS: /* ENDPOINT_HALT flag */ tbuf[0] = 0; tbuf[1] = 0; len = 2; fallthrough; case EndpointOutRequest | USB_REQ_CLEAR_FEATURE: case EndpointOutRequest | USB_REQ_SET_FEATURE: dev_dbg (hcd->self.controller, "no endpoint features yet\n"); break; /* CLASS REQUESTS (and errors) */ default: nongeneric: /* non-generic request */ switch (typeReq) { case GetHubStatus: len = 4; break; case GetPortStatus: if (wValue == HUB_PORT_STATUS) len = 4; else /* other port status types return 8 bytes */ len = 8; break; case GetHubDescriptor: len = sizeof (struct usb_hub_descriptor); break; case DeviceRequest | USB_REQ_GET_DESCRIPTOR: /* len is returned by hub_control */ break; } status = hcd->driver->hub_control (hcd, typeReq, wValue, wIndex, tbuf, wLength); if (typeReq == GetHubDescriptor) usb_hub_adjust_deviceremovable(hcd->self.root_hub, (struct usb_hub_descriptor *)tbuf); break; error: /* "protocol stall" on error */ status = -EPIPE; } if (status < 0) { len = 0; if (status != -EPIPE) { dev_dbg (hcd->self.controller, "CTRL: TypeReq=0x%x val=0x%x " "idx=0x%x len=%d ==> %d\n", typeReq, wValue, wIndex, wLength, status); } } else if (status > 0) { /* hub_control may return the length of data copied. 
*/ len = status; status = 0; } if (len) { if (urb->transfer_buffer_length < len) len = urb->transfer_buffer_length; urb->actual_length = len; /* always USB_DIR_IN, toward host */ memcpy (ubuf, bufp, len); /* report whether RH hardware supports remote wakeup */ if (patch_wakeup && len > offsetof (struct usb_config_descriptor, bmAttributes)) ((struct usb_config_descriptor *)ubuf)->bmAttributes |= USB_CONFIG_ATT_WAKEUP; /* report whether RH hardware has an integrated TT */ if (patch_protocol && len > offsetof(struct usb_device_descriptor, bDeviceProtocol)) ((struct usb_device_descriptor *) ubuf)-> bDeviceProtocol = USB_HUB_PR_HS_SINGLE_TT; } kfree(tbuf); err_alloc: /* any errors get returned through the urb completion */ spin_lock_irq(&hcd_root_hub_lock); usb_hcd_unlink_urb_from_ep(hcd, urb); usb_hcd_giveback_urb(hcd, urb, status); spin_unlock_irq(&hcd_root_hub_lock); return 0; } /*-------------------------------------------------------------------------*/ /* * Root Hub interrupt transfers are polled using a timer if the * driver requests it; otherwise the driver is responsible for * calling usb_hcd_poll_rh_status() when an event occurs. * * Completion handler may not sleep. See usb_hcd_giveback_urb() for details. */ void usb_hcd_poll_rh_status(struct usb_hcd *hcd) { struct urb *urb; int length; int status; unsigned long flags; char buffer[6]; /* Any root hubs with > 31 ports? */ if (unlikely(!hcd->rh_pollable)) return; if (!hcd->uses_new_polling && !hcd->status_urb) return; length = hcd->driver->hub_status_data(hcd, buffer); if (length > 0) { /* try to complete the status urb */ spin_lock_irqsave(&hcd_root_hub_lock, flags); urb = hcd->status_urb; if (urb) { clear_bit(HCD_FLAG_POLL_PENDING, &hcd->flags); hcd->status_urb = NULL; if (urb->transfer_buffer_length >= length) { status = 0; } else { status = -EOVERFLOW; length = urb->transfer_buffer_length; } urb->actual_length = length; memcpy(urb->transfer_buffer, buffer, length); usb_hcd_unlink_urb_from_ep(hcd, urb); usb_hcd_giveback_urb(hcd, urb, status); } else { length = 0; set_bit(HCD_FLAG_POLL_PENDING, &hcd->flags); } spin_unlock_irqrestore(&hcd_root_hub_lock, flags); } /* The USB 2.0 spec says 256 ms. This is close enough and won't * exceed that limit if HZ is 100. The math is clunkier than one might * expect; this is to make sure that all timers for USB devices fire at * the same time, giving the CPU a break in between. */ if (hcd->uses_new_polling ?
HCD_POLL_RH(hcd) : (length == 0 && hcd->status_urb != NULL)) mod_timer (&hcd->rh_timer, (jiffies/(HZ/4) + 1) * (HZ/4)); } EXPORT_SYMBOL_GPL(usb_hcd_poll_rh_status); /* timer callback */ static void rh_timer_func (struct timer_list *t) { struct usb_hcd *_hcd = timer_container_of(_hcd, t, rh_timer); usb_hcd_poll_rh_status(_hcd); } /*-------------------------------------------------------------------------*/ static int rh_queue_status (struct usb_hcd *hcd, struct urb *urb) { int retval; unsigned long flags; unsigned len = 1 + (urb->dev->maxchild / 8); spin_lock_irqsave (&hcd_root_hub_lock, flags); if (hcd->status_urb || urb->transfer_buffer_length < len) { dev_dbg (hcd->self.controller, "not queuing rh status urb\n"); retval = -EINVAL; goto done; } retval = usb_hcd_link_urb_to_ep(hcd, urb); if (retval) goto done; hcd->status_urb = urb; urb->hcpriv = hcd; /* indicate it's queued */ if (!hcd->uses_new_polling) mod_timer(&hcd->rh_timer, (jiffies/(HZ/4) + 1) * (HZ/4)); /* If a status change has already occurred, report it ASAP */ else if (HCD_POLL_PENDING(hcd)) mod_timer(&hcd->rh_timer, jiffies); retval = 0; done: spin_unlock_irqrestore (&hcd_root_hub_lock, flags); return retval; } static int rh_urb_enqueue (struct usb_hcd *hcd, struct urb *urb) { if (usb_endpoint_xfer_int(&urb->ep->desc)) return rh_queue_status (hcd, urb); if (usb_endpoint_xfer_control(&urb->ep->desc)) return rh_call_control (hcd, urb); return -EINVAL; } /*-------------------------------------------------------------------------*/ /* Unlinks of root-hub control URBs are legal, but they don't do anything * since these URBs always execute synchronously. */ static int usb_rh_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) { unsigned long flags; int rc; spin_lock_irqsave(&hcd_root_hub_lock, flags); rc = usb_hcd_check_unlink_urb(hcd, urb, status); if (rc) goto done; if (usb_endpoint_num(&urb->ep->desc) == 0) { /* Control URB */ ; /* Do nothing */ } else { /* Status URB */ if (!hcd->uses_new_polling) timer_delete(&hcd->rh_timer); if (urb == hcd->status_urb) { hcd->status_urb = NULL; usb_hcd_unlink_urb_from_ep(hcd, urb); usb_hcd_giveback_urb(hcd, urb, status); } } done: spin_unlock_irqrestore(&hcd_root_hub_lock, flags); return rc; } /*-------------------------------------------------------------------------*/ /** * usb_bus_init - shared initialization code * @bus: the bus structure being initialized * * This code is used to initialize a usb_bus structure, memory for which is * separately managed. */ static void usb_bus_init (struct usb_bus *bus) { memset(&bus->devmap, 0, sizeof(bus->devmap)); bus->devnum_next = 1; bus->root_hub = NULL; bus->busnum = -1; bus->bandwidth_allocated = 0; bus->bandwidth_int_reqs = 0; bus->bandwidth_isoc_reqs = 0; mutex_init(&bus->devnum_next_mutex); } /*-------------------------------------------------------------------------*/ /** * usb_register_bus - registers the USB host controller with the usb core * @bus: pointer to the bus to register * * Context: task context, might sleep. * * Assigns a bus number, and links the controller into usbcore data * structures so that it can be seen by scanning the bus list. * * Return: 0 if successful. A negative error code otherwise. 
*/ static int usb_register_bus(struct usb_bus *bus) { int result = -E2BIG; int busnum; mutex_lock(&usb_bus_idr_lock); busnum = idr_alloc(&usb_bus_idr, bus, 1, USB_MAXBUS, GFP_KERNEL); if (busnum < 0) { pr_err("%s: failed to get bus number\n", usbcore_name); goto error_find_busnum; } bus->busnum = busnum; mutex_unlock(&usb_bus_idr_lock); usb_notify_add_bus(bus); dev_info (bus->controller, "new USB bus registered, assigned bus " "number %d\n", bus->busnum); return 0; error_find_busnum: mutex_unlock(&usb_bus_idr_lock); return result; } /** * usb_deregister_bus - deregisters the USB host controller * @bus: pointer to the bus to deregister * * Context: task context, might sleep. * * Recycles the bus number, and unlinks the controller from usbcore data * structures so that it won't be seen by scanning the bus list. */ static void usb_deregister_bus (struct usb_bus *bus) { dev_info (bus->controller, "USB bus %d deregistered\n", bus->busnum); /* * NOTE: make sure that all the devices are removed by the * controller code, as well as having it call this when cleaning * itself up */ mutex_lock(&usb_bus_idr_lock); idr_remove(&usb_bus_idr, bus->busnum); mutex_unlock(&usb_bus_idr_lock); usb_notify_remove_bus(bus); } /** * register_root_hub - called by usb_add_hcd() to register a root hub * @hcd: host controller for this root hub * * This function registers the root hub with the USB subsystem. It sets up * the device properly in the device tree and then calls usb_new_device() * to register the usb device. It also assigns the root hub's USB address * (always 1). * * Return: 0 if successful. A negative error code otherwise. */ static int register_root_hub(struct usb_hcd *hcd) { struct device *parent_dev = hcd->self.controller; struct usb_device *usb_dev = hcd->self.root_hub; struct usb_device_descriptor *descr; const int devnum = 1; int retval; usb_dev->devnum = devnum; usb_dev->bus->devnum_next = devnum + 1; set_bit(devnum, usb_dev->bus->devmap); usb_set_device_state(usb_dev, USB_STATE_ADDRESS); mutex_lock(&usb_bus_idr_lock); usb_dev->ep0.desc.wMaxPacketSize = cpu_to_le16(64); descr = usb_get_device_descriptor(usb_dev); if (IS_ERR(descr)) { retval = PTR_ERR(descr); mutex_unlock(&usb_bus_idr_lock); dev_dbg (parent_dev, "can't read %s device descriptor %d\n", dev_name(&usb_dev->dev), retval); return retval; } usb_dev->descriptor = *descr; kfree(descr); if (le16_to_cpu(usb_dev->descriptor.bcdUSB) >= 0x0201) { retval = usb_get_bos_descriptor(usb_dev); if (!retval) { usb_dev->lpm_capable = usb_device_supports_lpm(usb_dev); } else if (usb_dev->speed >= USB_SPEED_SUPER) { mutex_unlock(&usb_bus_idr_lock); dev_dbg(parent_dev, "can't read %s bos descriptor %d\n", dev_name(&usb_dev->dev), retval); return retval; } } retval = usb_new_device (usb_dev); if (retval) { dev_err (parent_dev, "can't register root hub for %s, %d\n", dev_name(&usb_dev->dev), retval); } else { spin_lock_irq (&hcd_root_hub_lock); hcd->rh_registered = 1; spin_unlock_irq (&hcd_root_hub_lock); /* Did the HC die before the root hub was registered? */ if (HCD_DEAD(hcd)) usb_hc_died (hcd); /* This time clean up */ } mutex_unlock(&usb_bus_idr_lock); return retval; } /* * usb_hcd_start_port_resume - a root-hub port is sending a resume signal * @bus: the bus which the root hub belongs to * @portnum: the port which is being resumed * * HCDs should call this function when they know that a resume signal is * being sent to a root-hub port. The root hub will be prevented from * going into autosuspend until usb_hcd_end_port_resume() is called. 
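 *
 * HCD hub-control code typically pairs the two calls around resume
 * signaling, roughly like this (illustrative sketch, not taken from
 * this file):
 *
 *	usb_hcd_start_port_resume(&hcd->self, portnum);
 *	... drive resume signaling for at least 20 ms ...
 *	usb_hcd_end_port_resume(&hcd->self, portnum);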
* * The bus's private lock must be held by the caller. */ void usb_hcd_start_port_resume(struct usb_bus *bus, int portnum) { unsigned bit = 1 << portnum; if (!(bus->resuming_ports & bit)) { bus->resuming_ports |= bit; pm_runtime_get_noresume(&bus->root_hub->dev); } } EXPORT_SYMBOL_GPL(usb_hcd_start_port_resume); /* * usb_hcd_end_port_resume - a root-hub port has stopped sending a resume signal * @bus: the bus which the root hub belongs to * @portnum: the port which is being resumed * * HCDs should call this function when they know that a resume signal has * stopped being sent to a root-hub port. The root hub will be allowed to * autosuspend again. * * The bus's private lock must be held by the caller. */ void usb_hcd_end_port_resume(struct usb_bus *bus, int portnum) { unsigned bit = 1 << portnum; if (bus->resuming_ports & bit) { bus->resuming_ports &= ~bit; pm_runtime_put_noidle(&bus->root_hub->dev); } } EXPORT_SYMBOL_GPL(usb_hcd_end_port_resume); /*-------------------------------------------------------------------------*/ /** * usb_calc_bus_time - approximate periodic transaction time in nanoseconds * @speed: from dev->speed; USB_SPEED_{LOW,FULL,HIGH} * @is_input: true iff the transaction sends data to the host * @isoc: true for isochronous transactions, false for interrupt ones * @bytecount: how many bytes in the transaction. * * Return: Approximate bus time in nanoseconds for a periodic transaction. * * Note: * See USB 2.0 spec section 5.11.3; only periodic transfers need to be * scheduled in software, this function is only used for such scheduling. */ long usb_calc_bus_time (int speed, int is_input, int isoc, int bytecount) { unsigned long tmp; switch (speed) { case USB_SPEED_LOW: /* INTR only */ if (is_input) { tmp = (67667L * (31L + 10L * BitTime (bytecount))) / 1000L; return 64060L + (2 * BW_HUB_LS_SETUP) + BW_HOST_DELAY + tmp; } else { tmp = (66700L * (31L + 10L * BitTime (bytecount))) / 1000L; return 64107L + (2 * BW_HUB_LS_SETUP) + BW_HOST_DELAY + tmp; } case USB_SPEED_FULL: /* ISOC or INTR */ if (isoc) { tmp = (8354L * (31L + 10L * BitTime (bytecount))) / 1000L; return ((is_input) ? 7268L : 6265L) + BW_HOST_DELAY + tmp; } else { tmp = (8354L * (31L + 10L * BitTime (bytecount))) / 1000L; return 9107L + BW_HOST_DELAY + tmp; } case USB_SPEED_HIGH: /* ISOC or INTR */ /* FIXME adjust for input vs output */ if (isoc) tmp = HS_NSECS_ISO (bytecount); else tmp = HS_NSECS (bytecount); return tmp; default: pr_debug ("%s: bogus device speed!\n", usbcore_name); return -1; } } EXPORT_SYMBOL_GPL(usb_calc_bus_time); /*-------------------------------------------------------------------------*/ /* * Generic HC operations. */ /*-------------------------------------------------------------------------*/ /** * usb_hcd_link_urb_to_ep - add an URB to its endpoint queue * @hcd: host controller to which @urb was submitted * @urb: URB being submitted * * Host controller drivers should call this routine in their enqueue() * method. The HCD's private spinlock must be held and interrupts must * be disabled. The actions carried out here are required for URB * submission, as well as for endpoint shutdown and for usb_kill_urb. * * Return: 0 for no error, otherwise a negative error code (in which case * the enqueue() method must fail). If no error occurs but enqueue() fails * anyway, it must call usb_hcd_unlink_urb_from_ep() before releasing * the private spinlock and returning. 
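 *
 * A typical enqueue() implementation therefore looks roughly like this
 * (illustrative sketch only; the driver lock and the xxx_hw_queue()
 * helper are placeholders, not taken from this file):
 *
 *	spin_lock_irqsave(&priv->lock, flags);
 *	rc = usb_hcd_link_urb_to_ep(hcd, urb);
 *	if (!rc) {
 *		rc = xxx_hw_queue(priv, urb, mem_flags);
 *		if (rc)
 *			usb_hcd_unlink_urb_from_ep(hcd, urb);
 *	}
 *	spin_unlock_irqrestore(&priv->lock, flags);
 *	return rc;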
*/ int usb_hcd_link_urb_to_ep(struct usb_hcd *hcd, struct urb *urb) { int rc = 0; spin_lock(&hcd_urb_list_lock); /* Check that the URB isn't being killed */ if (unlikely(atomic_read(&urb->reject))) { rc = -EPERM; goto done; } if (unlikely(!urb->ep->enabled)) { rc = -ENOENT; goto done; } if (unlikely(!urb->dev->can_submit)) { rc = -EHOSTUNREACH; goto done; } /* * Check the host controller's state and add the URB to the * endpoint's queue. */ if (HCD_RH_RUNNING(hcd)) { urb->unlinked = 0; list_add_tail(&urb->urb_list, &urb->ep->urb_list); } else { rc = -ESHUTDOWN; goto done; } done: spin_unlock(&hcd_urb_list_lock); return rc; } EXPORT_SYMBOL_GPL(usb_hcd_link_urb_to_ep); /** * usb_hcd_check_unlink_urb - check whether an URB may be unlinked * @hcd: host controller to which @urb was submitted * @urb: URB being checked for unlinkability * @status: error code to store in @urb if the unlink succeeds * * Host controller drivers should call this routine in their dequeue() * method. The HCD's private spinlock must be held and interrupts must * be disabled. The actions carried out here are required for making * sure that an unlink is valid. * * Return: 0 for no error, otherwise a negative error code (in which case * the dequeue() method must fail). The possible error codes are: * * -EIDRM: @urb was not submitted or has already completed. * The completion function may not have been called yet. * * -EBUSY: @urb has already been unlinked. */ int usb_hcd_check_unlink_urb(struct usb_hcd *hcd, struct urb *urb, int status) { struct list_head *tmp; /* insist the urb is still queued */ list_for_each(tmp, &urb->ep->urb_list) { if (tmp == &urb->urb_list) break; } if (tmp != &urb->urb_list) return -EIDRM; /* Any status except -EINPROGRESS means something already started to * unlink this URB from the hardware. So there's no more work to do. */ if (urb->unlinked) return -EBUSY; urb->unlinked = status; return 0; } EXPORT_SYMBOL_GPL(usb_hcd_check_unlink_urb); /** * usb_hcd_unlink_urb_from_ep - remove an URB from its endpoint queue * @hcd: host controller to which @urb was submitted * @urb: URB being unlinked * * Host controller drivers should call this routine before calling * usb_hcd_giveback_urb(). The HCD's private spinlock must be held and * interrupts must be disabled. The actions carried out here are required * for URB completion. */ void usb_hcd_unlink_urb_from_ep(struct usb_hcd *hcd, struct urb *urb) { /* clear all state linking urb to this dev (and hcd) */ spin_lock(&hcd_urb_list_lock); list_del_init(&urb->urb_list); spin_unlock(&hcd_urb_list_lock); } EXPORT_SYMBOL_GPL(usb_hcd_unlink_urb_from_ep); /* * Some usb host controllers can only perform dma using a small SRAM area, * or have restrictions on addressable DRAM. * The usb core itself is however optimized for host controllers that can dma * using regular system memory - like pci devices doing bus mastering. * * To support host controllers with limited dma capabilities we provide dma * bounce buffers. This feature can be enabled by initializing * hcd->localmem_pool using usb_hcd_setup_local_mem(). * * The initialized hcd->localmem_pool then tells the usb code to allocate all * data for dma using the genalloc API. * * So, to summarize... * * - We need "local" memory, canonical example being * a small SRAM on a discrete controller being the * only memory that the controller can read ...
* (a) "normal" kernel memory is no good, and * (b) there's not enough to share * * - So we use that, even though the primary requirement * is that the memory be "local" (hence addressable * by that device), not "coherent". * */ static int hcd_alloc_coherent(struct usb_bus *bus, gfp_t mem_flags, dma_addr_t *dma_handle, void **vaddr_handle, size_t size, enum dma_data_direction dir) { unsigned char *vaddr; if (*vaddr_handle == NULL) { WARN_ON_ONCE(1); return -EFAULT; } vaddr = hcd_buffer_alloc(bus, size + sizeof(unsigned long), mem_flags, dma_handle); if (!vaddr) return -ENOMEM; /* * Store the virtual address of the buffer at the end * of the allocated dma buffer. The size of the buffer * may be uneven so use unaligned functions instead * of just rounding up. It makes sense to optimize for * memory footprint over access speed since the amount * of memory available for dma may be limited. */ put_unaligned((unsigned long)*vaddr_handle, (unsigned long *)(vaddr + size)); if (dir == DMA_TO_DEVICE) memcpy(vaddr, *vaddr_handle, size); *vaddr_handle = vaddr; return 0; } static void hcd_free_coherent(struct usb_bus *bus, dma_addr_t *dma_handle, void **vaddr_handle, size_t size, enum dma_data_direction dir) { unsigned char *vaddr = *vaddr_handle; vaddr = (void *)get_unaligned((unsigned long *)(vaddr + size)); if (dir == DMA_FROM_DEVICE) memcpy(vaddr, *vaddr_handle, size); hcd_buffer_free(bus, size + sizeof(vaddr), *vaddr_handle, *dma_handle); *vaddr_handle = vaddr; *dma_handle = 0; } void usb_hcd_unmap_urb_setup_for_dma(struct usb_hcd *hcd, struct urb *urb) { if (IS_ENABLED(CONFIG_HAS_DMA) && (urb->transfer_flags & URB_SETUP_MAP_SINGLE)) dma_unmap_single(hcd->self.sysdev, urb->setup_dma, sizeof(struct usb_ctrlrequest), DMA_TO_DEVICE); else if (urb->transfer_flags & URB_SETUP_MAP_LOCAL) hcd_free_coherent(urb->dev->bus, &urb->setup_dma, (void **) &urb->setup_packet, sizeof(struct usb_ctrlrequest), DMA_TO_DEVICE); /* Make it safe to call this routine more than once */ urb->transfer_flags &= ~(URB_SETUP_MAP_SINGLE | URB_SETUP_MAP_LOCAL); } EXPORT_SYMBOL_GPL(usb_hcd_unmap_urb_setup_for_dma); static void unmap_urb_for_dma(struct usb_hcd *hcd, struct urb *urb) { if (hcd->driver->unmap_urb_for_dma) hcd->driver->unmap_urb_for_dma(hcd, urb); else usb_hcd_unmap_urb_for_dma(hcd, urb); } void usb_hcd_unmap_urb_for_dma(struct usb_hcd *hcd, struct urb *urb) { enum dma_data_direction dir; usb_hcd_unmap_urb_setup_for_dma(hcd, urb); dir = usb_urb_dir_in(urb) ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE; if (IS_ENABLED(CONFIG_HAS_DMA) && (urb->transfer_flags & URB_DMA_MAP_SG)) { dma_unmap_sg(hcd->self.sysdev, urb->sg, urb->num_sgs, dir); } else if (IS_ENABLED(CONFIG_HAS_DMA) && (urb->transfer_flags & URB_DMA_MAP_PAGE)) { dma_unmap_page(hcd->self.sysdev, urb->transfer_dma, urb->transfer_buffer_length, dir); } else if (IS_ENABLED(CONFIG_HAS_DMA) && (urb->transfer_flags & URB_DMA_MAP_SINGLE)) { dma_unmap_single(hcd->self.sysdev, urb->transfer_dma, urb->transfer_buffer_length, dir); } else if (urb->transfer_flags & URB_MAP_LOCAL) { hcd_free_coherent(urb->dev->bus, &urb->transfer_dma, &urb->transfer_buffer, urb->transfer_buffer_length, dir); } else if ((urb->transfer_flags & URB_NO_TRANSFER_DMA_MAP) && urb->sgt) { dma_sync_sgtable_for_cpu(hcd->self.sysdev, urb->sgt, dir); if (dir == DMA_FROM_DEVICE) invalidate_kernel_vmap_range(urb->transfer_buffer, urb->transfer_buffer_length); } /* Make it safe to call this routine more than once */ urb->transfer_flags &= ~(URB_DMA_MAP_SG | URB_DMA_MAP_PAGE | URB_DMA_MAP_SINGLE | URB_MAP_LOCAL); } EXPORT_SYMBOL_GPL(usb_hcd_unmap_urb_for_dma); static int map_urb_for_dma(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flags) { if (hcd->driver->map_urb_for_dma) return hcd->driver->map_urb_for_dma(hcd, urb, mem_flags); else return usb_hcd_map_urb_for_dma(hcd, urb, mem_flags); } int usb_hcd_map_urb_for_dma(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flags) { enum dma_data_direction dir; int ret = 0; /* Map the URB's buffers for DMA access. * Lower level HCD code should use *_dma exclusively, * unless it uses pio or talks to another transport, * or uses the provided scatter gather list for bulk. */ if (usb_endpoint_xfer_control(&urb->ep->desc)) { if (hcd->self.uses_pio_for_control) return ret; if (hcd->localmem_pool) { ret = hcd_alloc_coherent( urb->dev->bus, mem_flags, &urb->setup_dma, (void **)&urb->setup_packet, sizeof(struct usb_ctrlrequest), DMA_TO_DEVICE); if (ret) return ret; urb->transfer_flags |= URB_SETUP_MAP_LOCAL; } else if (hcd_uses_dma(hcd)) { if (object_is_on_stack(urb->setup_packet)) { WARN_ONCE(1, "setup packet is on stack\n"); return -EAGAIN; } urb->setup_dma = dma_map_single( hcd->self.sysdev, urb->setup_packet, sizeof(struct usb_ctrlrequest), DMA_TO_DEVICE); if (dma_mapping_error(hcd->self.sysdev, urb->setup_dma)) return -EAGAIN; urb->transfer_flags |= URB_SETUP_MAP_SINGLE; } } dir = usb_urb_dir_in(urb) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; if (urb->transfer_flags & URB_NO_TRANSFER_DMA_MAP) { if (!urb->sgt) return 0; if (dir == DMA_TO_DEVICE) flush_kernel_vmap_range(urb->transfer_buffer, urb->transfer_buffer_length); dma_sync_sgtable_for_device(hcd->self.sysdev, urb->sgt, dir); } else if (urb->transfer_buffer_length != 0) { if (hcd->localmem_pool) { ret = hcd_alloc_coherent( urb->dev->bus, mem_flags, &urb->transfer_dma, &urb->transfer_buffer, urb->transfer_buffer_length, dir); if (ret == 0) urb->transfer_flags |= URB_MAP_LOCAL; } else if (hcd_uses_dma(hcd)) { if (urb->num_sgs) { int n; /* We don't support sg for isoc transfers ! 
*/ if (usb_endpoint_xfer_isoc(&urb->ep->desc)) { WARN_ON(1); return -EINVAL; } n = dma_map_sg( hcd->self.sysdev, urb->sg, urb->num_sgs, dir); if (!n) ret = -EAGAIN; else urb->transfer_flags |= URB_DMA_MAP_SG; urb->num_mapped_sgs = n; if (n != urb->num_sgs) urb->transfer_flags |= URB_DMA_SG_COMBINED; } else if (urb->sg) { struct scatterlist *sg = urb->sg; urb->transfer_dma = dma_map_page( hcd->self.sysdev, sg_page(sg), sg->offset, urb->transfer_buffer_length, dir); if (dma_mapping_error(hcd->self.sysdev, urb->transfer_dma)) ret = -EAGAIN; else urb->transfer_flags |= URB_DMA_MAP_PAGE; } else if (object_is_on_stack(urb->transfer_buffer)) { WARN_ONCE(1, "transfer buffer is on stack\n"); ret = -EAGAIN; } else { urb->transfer_dma = dma_map_single( hcd->self.sysdev, urb->transfer_buffer, urb->transfer_buffer_length, dir); if (dma_mapping_error(hcd->self.sysdev, urb->transfer_dma)) ret = -EAGAIN; else urb->transfer_flags |= URB_DMA_MAP_SINGLE; } } if (ret && (urb->transfer_flags & (URB_SETUP_MAP_SINGLE | URB_SETUP_MAP_LOCAL))) usb_hcd_unmap_urb_for_dma(hcd, urb); } return ret; } EXPORT_SYMBOL_GPL(usb_hcd_map_urb_for_dma); /*-------------------------------------------------------------------------*/ /* may be called in any context with a valid urb->dev usecount * caller surrenders "ownership" of urb * expects usb_submit_urb() to have sanity checked and conditioned all * inputs in the urb */ int usb_hcd_submit_urb (struct urb *urb, gfp_t mem_flags) { int status; struct usb_hcd *hcd = bus_to_hcd(urb->dev->bus); /* increment urb's reference count as part of giving it to the HCD * (which will control it). HCD guarantees that it either returns * an error or calls giveback(), but not both. */ usb_get_urb(urb); atomic_inc(&urb->use_count); atomic_inc(&urb->dev->urbnum); usbmon_urb_submit(&hcd->self, urb); /* NOTE requirements on root-hub callers (usbfs and the hub * driver, for now): URBs' urb->transfer_buffer must be * valid and usb_buffer_{sync,unmap}() not be needed, since * they could clobber root hub response data. Also, control * URBs must be submitted in process context with interrupts * enabled. */ if (is_root_hub(urb->dev)) { status = rh_urb_enqueue(hcd, urb); } else { status = map_urb_for_dma(hcd, urb, mem_flags); if (likely(status == 0)) { status = hcd->driver->urb_enqueue(hcd, urb, mem_flags); if (unlikely(status)) unmap_urb_for_dma(hcd, urb); } } if (unlikely(status)) { usbmon_urb_submit_error(&hcd->self, urb, status); urb->hcpriv = NULL; INIT_LIST_HEAD(&urb->urb_list); atomic_dec(&urb->use_count); /* * Order the write of urb->use_count above before the read * of urb->reject below. Pairs with the memory barriers in * usb_kill_urb() and usb_poison_urb(). */ smp_mb__after_atomic(); atomic_dec(&urb->dev->urbnum); if (atomic_read(&urb->reject)) wake_up(&usb_kill_urb_queue); usb_put_urb(urb); } return status; } /*-------------------------------------------------------------------------*/ /* this makes the hcd giveback() the urb more quickly, by kicking it * off hardware queues (which may take a while) and returning it as * soon as practical. we've already set up the urb's return status, * but we can't know if the callback completed already. */ static int unlink1(struct usb_hcd *hcd, struct urb *urb, int status) { int value; if (is_root_hub(urb->dev)) value = usb_rh_urb_dequeue(hcd, urb, status); else { /* The only reason an HCD might fail this call is if * it has not yet fully queued the urb to begin with. * Such failures should be harmless. 
*/ value = hcd->driver->urb_dequeue(hcd, urb, status); } return value; } /* * called in any context * * caller guarantees urb won't be recycled till both unlink() * and the urb's completion function return */ int usb_hcd_unlink_urb (struct urb *urb, int status) { struct usb_hcd *hcd; struct usb_device *udev = urb->dev; int retval = -EIDRM; unsigned long flags; /* Prevent the device and bus from going away while * the unlink is carried out. If they are already gone * then urb->use_count must be 0, since disconnected * devices can't have any active URBs. */ spin_lock_irqsave(&hcd_urb_unlink_lock, flags); if (atomic_read(&urb->use_count) > 0) { retval = 0; usb_get_dev(udev); } spin_unlock_irqrestore(&hcd_urb_unlink_lock, flags); if (retval == 0) { hcd = bus_to_hcd(urb->dev->bus); retval = unlink1(hcd, urb, status); if (retval == 0) retval = -EINPROGRESS; else if (retval != -EIDRM && retval != -EBUSY) dev_dbg(&udev->dev, "hcd_unlink_urb %p fail %d\n", urb, retval); usb_put_dev(udev); } return retval; } /*-------------------------------------------------------------------------*/ static void __usb_hcd_giveback_urb(struct urb *urb) { struct usb_hcd *hcd = bus_to_hcd(urb->dev->bus); struct usb_anchor *anchor = urb->anchor; int status = urb->unlinked; urb->hcpriv = NULL; if (unlikely((urb->transfer_flags & URB_SHORT_NOT_OK) && urb->actual_length < urb->transfer_buffer_length && !status)) status = -EREMOTEIO; unmap_urb_for_dma(hcd, urb); usbmon_urb_complete(&hcd->self, urb, status); usb_anchor_suspend_wakeups(anchor); usb_unanchor_urb(urb); if (likely(status == 0)) usb_led_activity(USB_LED_EVENT_HOST); /* pass ownership to the completion handler */ urb->status = status; /* * This function can be called in task context inside another remote * coverage collection section, but kcov doesn't support that kind of * recursion yet. Only collect coverage in softirq context for now. */ kcov_remote_start_usb_softirq((u64)urb->dev->bus->busnum); urb->complete(urb); kcov_remote_stop_softirq(); usb_anchor_resume_wakeups(anchor); atomic_dec(&urb->use_count); /* * Order the write of urb->use_count above before the read * of urb->reject below. Pairs with the memory barriers in * usb_kill_urb() and usb_poison_urb(). */ smp_mb__after_atomic(); if (unlikely(atomic_read(&urb->reject))) wake_up(&usb_kill_urb_queue); usb_put_urb(urb); } static void usb_giveback_urb_bh(struct work_struct *work) { struct giveback_urb_bh *bh = container_of(work, struct giveback_urb_bh, bh); struct list_head local_list; spin_lock_irq(&bh->lock); bh->running = true; list_replace_init(&bh->head, &local_list); spin_unlock_irq(&bh->lock); while (!list_empty(&local_list)) { struct urb *urb; urb = list_entry(local_list.next, struct urb, urb_list); list_del_init(&urb->urb_list); bh->completing_ep = urb->ep; __usb_hcd_giveback_urb(urb); bh->completing_ep = NULL; } /* * giveback new URBs next time to prevent this function * from not exiting for a long time. */ spin_lock_irq(&bh->lock); if (!list_empty(&bh->head)) { if (bh->high_prio) queue_work(system_bh_highpri_wq, &bh->bh); else queue_work(system_bh_wq, &bh->bh); } bh->running = false; spin_unlock_irq(&bh->lock); } /** * usb_hcd_giveback_urb - return URB from HCD to device driver * @hcd: host controller returning the URB * @urb: urb being returned to the USB device driver. * @status: completion status code for the URB. * * Context: atomic. 
The completion callback is invoked either in a work queue * (BH) context or in the caller's context, depending on whether the HCD_BH * flag is set in the @hcd structure, except that URBs submitted to the * root hub always complete in BH context. * * This hands the URB from HCD to its USB device driver, using its * completion function. The HCD has freed all per-urb resources * (and is done using urb->hcpriv). It also released all HCD locks; * the device driver won't cause problems if it frees, modifies, * or resubmits this URB. * * If @urb was unlinked, the value of @status will be overridden by * @urb->unlinked. Erroneous short transfers are detected in case * the HCD hasn't checked for them. */ void usb_hcd_giveback_urb(struct usb_hcd *hcd, struct urb *urb, int status) { struct giveback_urb_bh *bh; bool running; /* pass status to BH via unlinked */ if (likely(!urb->unlinked)) urb->unlinked = status; if (!hcd_giveback_urb_in_bh(hcd) && !is_root_hub(urb->dev)) { __usb_hcd_giveback_urb(urb); return; } if (usb_pipeisoc(urb->pipe) || usb_pipeint(urb->pipe)) bh = &hcd->high_prio_bh; else bh = &hcd->low_prio_bh; spin_lock(&bh->lock); list_add_tail(&urb->urb_list, &bh->head); running = bh->running; spin_unlock(&bh->lock); if (running) ; else if (bh->high_prio) queue_work(system_bh_highpri_wq, &bh->bh); else queue_work(system_bh_wq, &bh->bh); } EXPORT_SYMBOL_GPL(usb_hcd_giveback_urb); /*-------------------------------------------------------------------------*/ /* Cancel all URBs pending on this endpoint and wait for the endpoint's * queue to drain completely. The caller must first insure that no more * URBs can be submitted for this endpoint. */ void usb_hcd_flush_endpoint(struct usb_device *udev, struct usb_host_endpoint *ep) { struct usb_hcd *hcd; struct urb *urb; if (!ep) return; might_sleep(); hcd = bus_to_hcd(udev->bus); /* No more submits can occur */ spin_lock_irq(&hcd_urb_list_lock); rescan: list_for_each_entry_reverse(urb, &ep->urb_list, urb_list) { int is_in; if (urb->unlinked) continue; usb_get_urb (urb); is_in = usb_urb_dir_in(urb); spin_unlock(&hcd_urb_list_lock); /* kick hcd */ unlink1(hcd, urb, -ESHUTDOWN); dev_dbg (hcd->self.controller, "shutdown urb %p ep%d%s-%s\n", urb, usb_endpoint_num(&ep->desc), is_in ? "in" : "out", usb_ep_type_string(usb_endpoint_type(&ep->desc))); usb_put_urb (urb); /* list contents may have changed */ spin_lock(&hcd_urb_list_lock); goto rescan; } spin_unlock_irq(&hcd_urb_list_lock); /* Wait until the endpoint queue is completely empty */ while (!list_empty (&ep->urb_list)) { spin_lock_irq(&hcd_urb_list_lock); /* The list may have changed while we acquired the spinlock */ urb = NULL; if (!list_empty (&ep->urb_list)) { urb = list_entry (ep->urb_list.prev, struct urb, urb_list); usb_get_urb (urb); } spin_unlock_irq(&hcd_urb_list_lock); if (urb) { usb_kill_urb (urb); usb_put_urb (urb); } } } /** * usb_hcd_alloc_bandwidth - check whether a new bandwidth setting exceeds * the bus bandwidth * @udev: target &usb_device * @new_config: new configuration to install * @cur_alt: the current alternate interface setting * @new_alt: alternate interface setting that is being installed * * To change configurations, pass in the new configuration in new_config, * and pass NULL for cur_alt and new_alt. * * To reset a device's configuration (put the device in the ADDRESSED state), * pass in NULL for new_config, cur_alt, and new_alt. 
* * To change alternate interface settings, pass in NULL for new_config, * pass in the current alternate interface setting in cur_alt, * and pass in the new alternate interface setting in new_alt. * * Return: An error if the requested bandwidth change exceeds the * bus bandwidth or host controller internal resources. */ int usb_hcd_alloc_bandwidth(struct usb_device *udev, struct usb_host_config *new_config, struct usb_host_interface *cur_alt, struct usb_host_interface *new_alt) { int num_intfs, i, j; struct usb_host_interface *alt = NULL; int ret = 0; struct usb_hcd *hcd; struct usb_host_endpoint *ep; hcd = bus_to_hcd(udev->bus); if (!hcd->driver->check_bandwidth) return 0; /* Configuration is being removed - set configuration 0 */ if (!new_config && !cur_alt) { for (i = 1; i < 16; ++i) { ep = udev->ep_out[i]; if (ep) hcd->driver->drop_endpoint(hcd, udev, ep); ep = udev->ep_in[i]; if (ep) hcd->driver->drop_endpoint(hcd, udev, ep); } hcd->driver->check_bandwidth(hcd, udev); return 0; } /* Check if the HCD says there's enough bandwidth. Enable all endpoints * each interface's alt setting 0 and ask the HCD to check the bandwidth * of the bus. There will always be bandwidth for endpoint 0, so it's * ok to exclude it. */ if (new_config) { num_intfs = new_config->desc.bNumInterfaces; /* Remove endpoints (except endpoint 0, which is always on the * schedule) from the old config from the schedule */ for (i = 1; i < 16; ++i) { ep = udev->ep_out[i]; if (ep) { ret = hcd->driver->drop_endpoint(hcd, udev, ep); if (ret < 0) goto reset; } ep = udev->ep_in[i]; if (ep) { ret = hcd->driver->drop_endpoint(hcd, udev, ep); if (ret < 0) goto reset; } } for (i = 0; i < num_intfs; ++i) { struct usb_host_interface *first_alt; int iface_num; first_alt = &new_config->intf_cache[i]->altsetting[0]; iface_num = first_alt->desc.bInterfaceNumber; /* Set up endpoints for alternate interface setting 0 */ alt = usb_find_alt_setting(new_config, iface_num, 0); if (!alt) /* No alt setting 0? Pick the first setting. */ alt = first_alt; for (j = 0; j < alt->desc.bNumEndpoints; j++) { ret = hcd->driver->add_endpoint(hcd, udev, &alt->endpoint[j]); if (ret < 0) goto reset; } } } if (cur_alt && new_alt) { struct usb_interface *iface = usb_ifnum_to_if(udev, cur_alt->desc.bInterfaceNumber); if (!iface) return -EINVAL; if (iface->resetting_device) { /* * The USB core just reset the device, so the xHCI host * and the device will think alt setting 0 is installed. * However, the USB core will pass in the alternate * setting installed before the reset as cur_alt. Dig * out the alternate setting 0 structure, or the first * alternate setting if a broken device doesn't have alt * setting 0. */ cur_alt = usb_altnum_to_altsetting(iface, 0); if (!cur_alt) cur_alt = &iface->altsetting[0]; } /* Drop all the endpoints in the current alt setting */ for (i = 0; i < cur_alt->desc.bNumEndpoints; i++) { ret = hcd->driver->drop_endpoint(hcd, udev, &cur_alt->endpoint[i]); if (ret < 0) goto reset; } /* Add all the endpoints in the new alt setting */ for (i = 0; i < new_alt->desc.bNumEndpoints; i++) { ret = hcd->driver->add_endpoint(hcd, udev, &new_alt->endpoint[i]); if (ret < 0) goto reset; } } ret = hcd->driver->check_bandwidth(hcd, udev); reset: if (ret < 0) hcd->driver->reset_bandwidth(hcd, udev); return ret; } /* Disables the endpoint: synchronizes with the hcd to make sure all * endpoint state is gone from hardware. usb_hcd_flush_endpoint() must * have been called previously. 
Use for set_configuration, set_interface, * driver removal, physical disconnect. * * example: a qh stored in ep->hcpriv, holding state related to endpoint * type, maxpacket size, toggle, halt status, and scheduling. */ void usb_hcd_disable_endpoint(struct usb_device *udev, struct usb_host_endpoint *ep) { struct usb_hcd *hcd; might_sleep(); hcd = bus_to_hcd(udev->bus); if (hcd->driver->endpoint_disable) hcd->driver->endpoint_disable(hcd, ep); } /** * usb_hcd_reset_endpoint - reset host endpoint state * @udev: USB device. * @ep: the endpoint to reset. * * Resets any host endpoint state such as the toggle bit, sequence * number and current window. */ void usb_hcd_reset_endpoint(struct usb_device *udev, struct usb_host_endpoint *ep) { struct usb_hcd *hcd = bus_to_hcd(udev->bus); if (hcd->driver->endpoint_reset) hcd->driver->endpoint_reset(hcd, ep); else { int epnum = usb_endpoint_num(&ep->desc); int is_out = usb_endpoint_dir_out(&ep->desc); int is_control = usb_endpoint_xfer_control(&ep->desc); usb_settoggle(udev, epnum, is_out, 0); if (is_control) usb_settoggle(udev, epnum, !is_out, 0); } } /** * usb_alloc_streams - allocate bulk endpoint stream IDs. * @interface: alternate setting that includes all endpoints. * @eps: array of endpoints that need streams. * @num_eps: number of endpoints in the array. * @num_streams: number of streams to allocate. * @mem_flags: flags hcd should use to allocate memory. * * Sets up a group of bulk endpoints to have @num_streams stream IDs available. * Drivers may queue multiple transfers to different stream IDs, which may * complete in a different order than they were queued. * * Return: On success, the number of allocated streams. On failure, a negative * error code. */ int usb_alloc_streams(struct usb_interface *interface, struct usb_host_endpoint **eps, unsigned int num_eps, unsigned int num_streams, gfp_t mem_flags) { struct usb_hcd *hcd; struct usb_device *dev; int i, ret; dev = interface_to_usbdev(interface); hcd = bus_to_hcd(dev->bus); if (!hcd->driver->alloc_streams || !hcd->driver->free_streams) return -EINVAL; if (dev->speed < USB_SPEED_SUPER) return -EINVAL; if (dev->state < USB_STATE_CONFIGURED) return -ENODEV; for (i = 0; i < num_eps; i++) { /* Streams only apply to bulk endpoints. */ if (!usb_endpoint_xfer_bulk(&eps[i]->desc)) return -EINVAL; /* Re-alloc is not allowed */ if (eps[i]->streams) return -EINVAL; } ret = hcd->driver->alloc_streams(hcd, dev, eps, num_eps, num_streams, mem_flags); if (ret < 0) return ret; for (i = 0; i < num_eps; i++) eps[i]->streams = ret; return ret; } EXPORT_SYMBOL_GPL(usb_alloc_streams); /** * usb_free_streams - free bulk endpoint stream IDs. * @interface: alternate setting that includes all endpoints. * @eps: array of endpoints to remove streams from. * @num_eps: number of endpoints in the array. * @mem_flags: flags hcd should use to allocate memory. * * Reverts a group of bulk endpoints back to not using stream IDs. * Can fail if we are given bad arguments, or HCD is broken. * * Return: 0 on success. On failure, a negative error code. 
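 *
 * Illustrative pairing with usb_alloc_streams() (sketch only; the endpoint
 * array, counts and stream IDs are placeholders, not taken from this file):
 *
 *	num = usb_alloc_streams(intf, eps, num_eps, 16, GFP_KERNEL);
 *	if (num < 0)
 *		return num;
 *	... submit bulk URBs with urb->stream_id set to 1..num ...
 *	usb_free_streams(intf, eps, num_eps, GFP_KERNEL);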
*/ int usb_free_streams(struct usb_interface *interface, struct usb_host_endpoint **eps, unsigned int num_eps, gfp_t mem_flags) { struct usb_hcd *hcd; struct usb_device *dev; int i, ret; dev = interface_to_usbdev(interface); hcd = bus_to_hcd(dev->bus); if (dev->speed < USB_SPEED_SUPER) return -EINVAL; /* Double-free is not allowed */ for (i = 0; i < num_eps; i++) if (!eps[i] || !eps[i]->streams) return -EINVAL; ret = hcd->driver->free_streams(hcd, dev, eps, num_eps, mem_flags); if (ret < 0) return ret; for (i = 0; i < num_eps; i++) eps[i]->streams = 0; return ret; } EXPORT_SYMBOL_GPL(usb_free_streams); /* Protect against drivers that try to unlink URBs after the device * is gone, by waiting until all unlinks for @udev are finished. * Since we don't currently track URBs by device, simply wait until * nothing is running in the locked region of usb_hcd_unlink_urb(). */ void usb_hcd_synchronize_unlinks(struct usb_device *udev) { spin_lock_irq(&hcd_urb_unlink_lock); spin_unlock_irq(&hcd_urb_unlink_lock); } /*-------------------------------------------------------------------------*/ /* called in any context */ int usb_hcd_get_frame_number (struct usb_device *udev) { struct usb_hcd *hcd = bus_to_hcd(udev->bus); if (!HCD_RH_RUNNING(hcd)) return -ESHUTDOWN; return hcd->driver->get_frame_number (hcd); } /*-------------------------------------------------------------------------*/ #ifdef CONFIG_USB_HCD_TEST_MODE static void usb_ehset_completion(struct urb *urb) { struct completion *done = urb->context; complete(done); } /* * Allocate and initialize a control URB. This request will be used by the * EHSET SINGLE_STEP_SET_FEATURE test in which the DATA and STATUS stages * of the GetDescriptor request are sent 15 seconds after the SETUP stage. * Return NULL if failed. 
*/ static struct urb *request_single_step_set_feature_urb( struct usb_device *udev, void *dr, void *buf, struct completion *done) { struct urb *urb; struct usb_hcd *hcd = bus_to_hcd(udev->bus); urb = usb_alloc_urb(0, GFP_KERNEL); if (!urb) return NULL; urb->pipe = usb_rcvctrlpipe(udev, 0); urb->ep = &udev->ep0; urb->dev = udev; urb->setup_packet = (void *)dr; urb->transfer_buffer = buf; urb->transfer_buffer_length = USB_DT_DEVICE_SIZE; urb->complete = usb_ehset_completion; urb->status = -EINPROGRESS; urb->actual_length = 0; urb->transfer_flags = URB_DIR_IN | URB_NO_TRANSFER_DMA_MAP; usb_get_urb(urb); atomic_inc(&urb->use_count); atomic_inc(&urb->dev->urbnum); if (map_urb_for_dma(hcd, urb, GFP_KERNEL)) { usb_put_urb(urb); usb_free_urb(urb); return NULL; } urb->context = done; return urb; } int ehset_single_step_set_feature(struct usb_hcd *hcd, int port) { int retval = -ENOMEM; struct usb_ctrlrequest *dr; struct urb *urb; struct usb_device *udev; struct usb_device_descriptor *buf; DECLARE_COMPLETION_ONSTACK(done); /* Obtain udev of the rhub's child port */ udev = usb_hub_find_child(hcd->self.root_hub, port); if (!udev) { dev_err(hcd->self.controller, "No device attached to the RootHub\n"); return -ENODEV; } buf = kmalloc(USB_DT_DEVICE_SIZE, GFP_KERNEL); if (!buf) return -ENOMEM; dr = kmalloc_obj(struct usb_ctrlrequest); if (!dr) { kfree(buf); return -ENOMEM; } /* Fill Setup packet for GetDescriptor */ dr->bRequestType = USB_DIR_IN; dr->bRequest = USB_REQ_GET_DESCRIPTOR; dr->wValue = cpu_to_le16(USB_DT_DEVICE << 8); dr->wIndex = 0; dr->wLength = cpu_to_le16(USB_DT_DEVICE_SIZE); urb = request_single_step_set_feature_urb(udev, dr, buf, &done); if (!urb) goto cleanup; /* Submit just the SETUP stage */ retval = hcd->driver->submit_single_step_set_feature(hcd, urb, 1); if (retval) goto out1; if (!wait_for_completion_timeout(&done, msecs_to_jiffies(2000))) { usb_kill_urb(urb); retval = -ETIMEDOUT; dev_err(hcd->self.controller, "%s SETUP stage timed out on ep0\n", __func__); goto out1; } msleep(15 * 1000); /* Complete remaining DATA and STATUS stages using the same URB */ urb->status = -EINPROGRESS; urb->transfer_flags &= ~URB_NO_TRANSFER_DMA_MAP; usb_get_urb(urb); atomic_inc(&urb->use_count); atomic_inc(&urb->dev->urbnum); if (map_urb_for_dma(hcd, urb, GFP_KERNEL)) { usb_put_urb(urb); goto out1; } retval = hcd->driver->submit_single_step_set_feature(hcd, urb, 0); if (!retval && !wait_for_completion_timeout(&done, msecs_to_jiffies(2000))) { usb_kill_urb(urb); retval = -ETIMEDOUT; dev_err(hcd->self.controller, "%s IN stage timed out on ep0\n", __func__); } out1: usb_free_urb(urb); cleanup: kfree(dr); kfree(buf); return retval; } EXPORT_SYMBOL_GPL(ehset_single_step_set_feature); #endif /* CONFIG_USB_HCD_TEST_MODE */ /*-------------------------------------------------------------------------*/ #ifdef CONFIG_PM int hcd_bus_suspend(struct usb_device *rhdev, pm_message_t msg) { struct usb_hcd *hcd = bus_to_hcd(rhdev->bus); int status; int old_state = hcd->state; dev_dbg(&rhdev->dev, "bus %ssuspend, wakeup %d\n", (PMSG_IS_AUTO(msg) ? 
"auto-" : ""), rhdev->do_remote_wakeup); if (HCD_DEAD(hcd)) { dev_dbg(&rhdev->dev, "skipped %s of dead bus\n", "suspend"); return 0; } if (!hcd->driver->bus_suspend) { status = -ENOENT; } else { clear_bit(HCD_FLAG_RH_RUNNING, &hcd->flags); hcd->state = HC_STATE_QUIESCING; status = hcd->driver->bus_suspend(hcd); } if (status == 0) { usb_set_device_state(rhdev, USB_STATE_SUSPENDED); hcd->state = HC_STATE_SUSPENDED; if (!PMSG_IS_AUTO(msg)) usb_phy_roothub_suspend(hcd->self.sysdev, hcd->phy_roothub); /* Did we race with a root-hub wakeup event? */ if (rhdev->do_remote_wakeup) { char buffer[6]; status = hcd->driver->hub_status_data(hcd, buffer); if (status != 0) { dev_dbg(&rhdev->dev, "suspend raced with wakeup event\n"); hcd_bus_resume(rhdev, PMSG_AUTO_RESUME); status = -EBUSY; } } } else { spin_lock_irq(&hcd_root_hub_lock); if (!HCD_DEAD(hcd)) { set_bit(HCD_FLAG_RH_RUNNING, &hcd->flags); hcd->state = old_state; } spin_unlock_irq(&hcd_root_hub_lock); dev_dbg(&rhdev->dev, "bus %s fail, err %d\n", "suspend", status); } return status; } int hcd_bus_resume(struct usb_device *rhdev, pm_message_t msg) { struct usb_hcd *hcd = bus_to_hcd(rhdev->bus); int status; int old_state = hcd->state; dev_dbg(&rhdev->dev, "usb %sresume\n", (PMSG_IS_AUTO(msg) ? "auto-" : "")); if (HCD_DEAD(hcd)) { dev_dbg(&rhdev->dev, "skipped %s of dead bus\n", "resume"); return 0; } if (!PMSG_IS_AUTO(msg)) { status = usb_phy_roothub_resume(hcd->self.sysdev, hcd->phy_roothub); if (status) return status; } if (!hcd->driver->bus_resume) return -ENOENT; if (HCD_RH_RUNNING(hcd)) return 0; hcd->state = HC_STATE_RESUMING; status = hcd->driver->bus_resume(hcd); clear_bit(HCD_FLAG_WAKEUP_PENDING, &hcd->flags); if (status == 0) status = usb_phy_roothub_calibrate(hcd->phy_roothub); if (status == 0) { struct usb_device *udev; int port1; spin_lock_irq(&hcd_root_hub_lock); if (!HCD_DEAD(hcd)) { usb_set_device_state(rhdev, rhdev->actconfig ? USB_STATE_CONFIGURED : USB_STATE_ADDRESS); set_bit(HCD_FLAG_RH_RUNNING, &hcd->flags); hcd->state = HC_STATE_RUNNING; } spin_unlock_irq(&hcd_root_hub_lock); /* * Check whether any of the enabled ports on the root hub are * unsuspended. If they are then a TRSMRCY delay is needed * (this is what the USB-2 spec calls a "global resume"). * Otherwise we can skip the delay. */ usb_hub_for_each_child(rhdev, port1, udev) { if (udev->state != USB_STATE_NOTATTACHED && !udev->port_is_suspended) { usleep_range(10000, 11000); /* TRSMRCY */ break; } } } else { hcd->state = old_state; usb_phy_roothub_suspend(hcd->self.sysdev, hcd->phy_roothub); dev_dbg(&rhdev->dev, "bus %s fail, err %d\n", "resume", status); if (status != -ESHUTDOWN) usb_hc_died(hcd); } return status; } /* Workqueue routine for root-hub remote wakeup */ static void hcd_resume_work(struct work_struct *work) { struct usb_hcd *hcd = container_of(work, struct usb_hcd, wakeup_work); struct usb_device *udev = hcd->self.root_hub; usb_remote_wakeup(udev); } /** * usb_hcd_resume_root_hub - called by HCD to resume its root hub * @hcd: host controller for this root hub * * The USB host controller calls this function when its root hub is * suspended (with the remote wakeup feature enabled) and a remote * wakeup request is received. The routine submits a workqueue request * to resume the root hub (that is, manage its downstream ports again). 
*/ void usb_hcd_resume_root_hub (struct usb_hcd *hcd) { unsigned long flags; spin_lock_irqsave (&hcd_root_hub_lock, flags); if (hcd->rh_registered) { pm_wakeup_event(&hcd->self.root_hub->dev, 0); set_bit(HCD_FLAG_WAKEUP_PENDING, &hcd->flags); queue_work(system_freezable_wq, &hcd->wakeup_work); } spin_unlock_irqrestore (&hcd_root_hub_lock, flags); } EXPORT_SYMBOL_GPL(usb_hcd_resume_root_hub); #endif /* CONFIG_PM */ /*-------------------------------------------------------------------------*/ #ifdef CONFIG_USB_OTG /** * usb_bus_start_enum - start immediate enumeration (for OTG) * @bus: the bus (must use hcd framework) * @port_num: 1-based number of port; usually bus->otg_port * Context: atomic * * Starts enumeration, with an immediate reset followed later by * hub_wq identifying and possibly configuring the device. * This is needed by OTG controller drivers, where it helps meet * HNP protocol timing requirements for starting a port reset. * * Return: 0 if successful. */ int usb_bus_start_enum(struct usb_bus *bus, unsigned port_num) { struct usb_hcd *hcd; int status = -EOPNOTSUPP; /* NOTE: since HNP can't start by grabbing the bus's address0_sem, * boards with root hubs hooked up to internal devices (instead of * just the OTG port) may need more attention to resetting... */ hcd = bus_to_hcd(bus); if (port_num && hcd->driver->start_port_reset) status = hcd->driver->start_port_reset(hcd, port_num); /* allocate hub_wq shortly after (first) root port reset finishes; * it may issue others, until at least 50 msecs have passed. */ if (status == 0) mod_timer(&hcd->rh_timer, jiffies + msecs_to_jiffies(10)); return status; } EXPORT_SYMBOL_GPL(usb_bus_start_enum); #endif /*-------------------------------------------------------------------------*/ /** * usb_hcd_irq - hook IRQs to HCD framework (bus glue) * @irq: the IRQ being raised * @__hcd: pointer to the HCD whose IRQ is being signaled * * If the controller isn't HALTed, calls the driver's irq handler. * Checks whether the controller is now dead. * * Return: %IRQ_HANDLED if the IRQ was handled. %IRQ_NONE otherwise. */ irqreturn_t usb_hcd_irq (int irq, void *__hcd) { struct usb_hcd *hcd = __hcd; irqreturn_t rc; if (unlikely(HCD_DEAD(hcd) || !HCD_HW_ACCESSIBLE(hcd))) rc = IRQ_NONE; else if (hcd->driver->irq(hcd) == IRQ_NONE) rc = IRQ_NONE; else rc = IRQ_HANDLED; return rc; } EXPORT_SYMBOL_GPL(usb_hcd_irq); /*-------------------------------------------------------------------------*/ /* Workqueue routine for when the root-hub has died. */ static void hcd_died_work(struct work_struct *work) { struct usb_hcd *hcd = container_of(work, struct usb_hcd, died_work); static char *env[] = { "ERROR=DEAD", NULL }; /* Notify user space that the host controller has died */ kobject_uevent_env(&hcd->self.root_hub->dev.kobj, KOBJ_OFFLINE, env); } /** * usb_hc_died - report abnormal shutdown of a host controller (bus glue) * @hcd: pointer to the HCD representing the controller * * This is called by bus glue to report a USB host controller that died * while operations may still have been pending. It's called automatically * by the PCI glue, so only glue for non-PCI busses should need to call it. * * Only call this function with the primary HCD. 
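 *
 * For example (sketch, not taken from this file), non-PCI bus glue might
 * report a fatal, unrecoverable controller error like this, where
 * fatal_hw_error is a placeholder condition:
 *
 *	if (fatal_hw_error) {
 *		dev_err(hcd->self.controller, "controller died\n");
 *		usb_hc_died(hcd);
 *	}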
*/ void usb_hc_died (struct usb_hcd *hcd) { unsigned long flags; dev_err (hcd->self.controller, "HC died; cleaning up\n"); spin_lock_irqsave (&hcd_root_hub_lock, flags); clear_bit(HCD_FLAG_RH_RUNNING, &hcd->flags); set_bit(HCD_FLAG_DEAD, &hcd->flags); if (hcd->rh_registered) { clear_bit(HCD_FLAG_POLL_RH, &hcd->flags); /* make hub_wq clean up old urbs and devices */ usb_set_device_state (hcd->self.root_hub, USB_STATE_NOTATTACHED); usb_kick_hub_wq(hcd->self.root_hub); } if (usb_hcd_is_primary_hcd(hcd) && hcd->shared_hcd) { hcd = hcd->shared_hcd; clear_bit(HCD_FLAG_RH_RUNNING, &hcd->flags); set_bit(HCD_FLAG_DEAD, &hcd->flags); if (hcd->rh_registered) { clear_bit(HCD_FLAG_POLL_RH, &hcd->flags); /* make hub_wq clean up old urbs and devices */ usb_set_device_state(hcd->self.root_hub, USB_STATE_NOTATTACHED); usb_kick_hub_wq(hcd->self.root_hub); } } /* Handle the case where this function gets called with a shared HCD */ if (usb_hcd_is_primary_hcd(hcd)) schedule_work(&hcd->died_work); else schedule_work(&hcd->primary_hcd->died_work); spin_unlock_irqrestore (&hcd_root_hub_lock, flags); /* Make sure that the other roothub is also deallocated. */ } EXPORT_SYMBOL_GPL (usb_hc_died); /*-------------------------------------------------------------------------*/ static void init_giveback_urb_bh(struct giveback_urb_bh *bh) { spin_lock_init(&bh->lock); INIT_LIST_HEAD(&bh->head); INIT_WORK(&bh->bh, usb_giveback_urb_bh); } struct usb_hcd *__usb_create_hcd(const struct hc_driver *driver, struct device *sysdev, struct device *dev, const char *bus_name, struct usb_hcd *primary_hcd) { struct usb_hcd *hcd; hcd = kzalloc(sizeof(*hcd) + driver->hcd_priv_size, GFP_KERNEL); if (!hcd) return NULL; if (primary_hcd == NULL) { hcd->address0_mutex = kmalloc_obj(*hcd->address0_mutex); if (!hcd->address0_mutex) { kfree(hcd); dev_dbg(dev, "hcd address0 mutex alloc failed\n"); return NULL; } mutex_init(hcd->address0_mutex); hcd->bandwidth_mutex = kmalloc_obj(*hcd->bandwidth_mutex); if (!hcd->bandwidth_mutex) { kfree(hcd->address0_mutex); kfree(hcd); dev_dbg(dev, "hcd bandwidth mutex alloc failed\n"); return NULL; } mutex_init(hcd->bandwidth_mutex); dev_set_drvdata(dev, hcd); } else { mutex_lock(&usb_port_peer_mutex); hcd->address0_mutex = primary_hcd->address0_mutex; hcd->bandwidth_mutex = primary_hcd->bandwidth_mutex; hcd->primary_hcd = primary_hcd; primary_hcd->primary_hcd = primary_hcd; hcd->shared_hcd = primary_hcd; primary_hcd->shared_hcd = hcd; mutex_unlock(&usb_port_peer_mutex); } kref_init(&hcd->kref); usb_bus_init(&hcd->self); hcd->self.controller = dev; hcd->self.sysdev = sysdev; hcd->self.bus_name = bus_name; timer_setup(&hcd->rh_timer, rh_timer_func, 0); #ifdef CONFIG_PM INIT_WORK(&hcd->wakeup_work, hcd_resume_work); #endif INIT_WORK(&hcd->died_work, hcd_died_work); hcd->driver = driver; hcd->speed = driver->flags & HCD_MASK; hcd->product_desc = (driver->product_desc) ? driver->product_desc : "USB Host Controller"; return hcd; } EXPORT_SYMBOL_GPL(__usb_create_hcd); /** * usb_create_shared_hcd - create and initialize an HCD structure * @driver: HC driver that will use this hcd * @dev: device for this HC, stored in hcd->self.controller * @bus_name: value to store in hcd->self.bus_name * @primary_hcd: a pointer to the usb_hcd structure that is sharing the * PCI device. Only allocate certain resources for the primary HCD * * Context: task context, might sleep. * * Allocate a struct usb_hcd, with extra space at the end for the * HC driver's private data. Initialize the generic members of the * hcd structure. 
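 *
 * As a hedged illustration (not from this file), a driver exposing two
 * roothubs on one controller might create the secondary bus roughly like
 * this, where example_hc_driver is a made-up name and "primary" is the
 * already-created primary HCD:
 *
 *	shared_hcd = usb_create_shared_hcd(&example_hc_driver, &pdev->dev,
 *					   dev_name(&pdev->dev), primary);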
* * Return: On success, a pointer to the created and initialized HCD structure. * On failure (e.g. if memory is unavailable), %NULL. */ struct usb_hcd *usb_create_shared_hcd(const struct hc_driver *driver, struct device *dev, const char *bus_name, struct usb_hcd *primary_hcd) { return __usb_create_hcd(driver, dev, dev, bus_name, primary_hcd); } EXPORT_SYMBOL_GPL(usb_create_shared_hcd); /** * usb_create_hcd - create and initialize an HCD structure * @driver: HC driver that will use this hcd * @dev: device for this HC, stored in hcd->self.controller * @bus_name: value to store in hcd->self.bus_name * * Context: task context, might sleep. * * Allocate a struct usb_hcd, with extra space at the end for the * HC driver's private data. Initialize the generic members of the * hcd structure. * * Return: On success, a pointer to the created and initialized HCD * structure. On failure (e.g. if memory is unavailable), %NULL. */ struct usb_hcd *usb_create_hcd(const struct hc_driver *driver, struct device *dev, const char *bus_name) { return __usb_create_hcd(driver, dev, dev, bus_name, NULL); } EXPORT_SYMBOL_GPL(usb_create_hcd); /* * Roothubs that share one PCI device must also share the bandwidth mutex. * Don't deallocate the bandwidth_mutex until the last shared usb_hcd is * deallocated. * * Make sure to deallocate the bandwidth_mutex only when the last HCD is * freed. When hcd_release() is called for either hcd in a peer set, * invalidate the peer's ->shared_hcd and ->primary_hcd pointers. */ static void hcd_release(struct kref *kref) { struct usb_hcd *hcd = container_of (kref, struct usb_hcd, kref); mutex_lock(&usb_port_peer_mutex); if (hcd->shared_hcd) { struct usb_hcd *peer = hcd->shared_hcd; peer->shared_hcd = NULL; peer->primary_hcd = NULL; } else { kfree(hcd->address0_mutex); kfree(hcd->bandwidth_mutex); } mutex_unlock(&usb_port_peer_mutex); kfree(hcd); } struct usb_hcd *usb_get_hcd(struct usb_hcd *hcd) { if (hcd) kref_get(&hcd->kref); return hcd; } EXPORT_SYMBOL_GPL(usb_get_hcd); void usb_put_hcd(struct usb_hcd *hcd) { if (hcd) kref_put(&hcd->kref, hcd_release); } EXPORT_SYMBOL_GPL(usb_put_hcd); int usb_hcd_is_primary_hcd(struct usb_hcd *hcd) { if (!hcd->primary_hcd) return 1; return hcd == hcd->primary_hcd; } EXPORT_SYMBOL_GPL(usb_hcd_is_primary_hcd); int usb_hcd_find_raw_port_number(struct usb_hcd *hcd, int port1) { if (!hcd->driver->find_raw_port_number) return port1; return hcd->driver->find_raw_port_number(hcd, port1); } static int usb_hcd_request_irqs(struct usb_hcd *hcd, unsigned int irqnum, unsigned long irqflags) { int retval; if (hcd->driver->irq) { snprintf(hcd->irq_descr, sizeof(hcd->irq_descr), "%s:usb%d", hcd->driver->description, hcd->self.busnum); retval = request_irq(irqnum, &usb_hcd_irq, irqflags, hcd->irq_descr, hcd); if (retval != 0) { dev_err(hcd->self.controller, "request interrupt %d failed\n", irqnum); return retval; } hcd->irq = irqnum; dev_info(hcd->self.controller, "irq %d, %s 0x%08llx\n", irqnum, (hcd->driver->flags & HCD_MEMORY) ? "io mem" : "io port", (unsigned long long)hcd->rsrc_start); } else { hcd->irq = 0; if (hcd->rsrc_start) dev_info(hcd->self.controller, "%s 0x%08llx\n", (hcd->driver->flags & HCD_MEMORY) ? 
"io mem" : "io port", (unsigned long long)hcd->rsrc_start); } return 0; } /* * Before we free this root hub, flush in-flight peering attempts * and disable peer lookups */ static void usb_put_invalidate_rhdev(struct usb_hcd *hcd) { struct usb_device *rhdev; mutex_lock(&usb_port_peer_mutex); rhdev = hcd->self.root_hub; hcd->self.root_hub = NULL; mutex_unlock(&usb_port_peer_mutex); usb_put_dev(rhdev); } /** * usb_stop_hcd - Halt the HCD * @hcd: the usb_hcd that has to be halted * * Stop the root-hub polling timer and invoke the HCD's ->stop callback. */ static void usb_stop_hcd(struct usb_hcd *hcd) { hcd->rh_pollable = 0; clear_bit(HCD_FLAG_POLL_RH, &hcd->flags); timer_delete_sync(&hcd->rh_timer); hcd->driver->stop(hcd); hcd->state = HC_STATE_HALT; /* In case the HCD restarted the timer, stop it again. */ clear_bit(HCD_FLAG_POLL_RH, &hcd->flags); timer_delete_sync(&hcd->rh_timer); } /** * usb_add_hcd - finish generic HCD structure initialization and register * @hcd: the usb_hcd structure to initialize * @irqnum: Interrupt line to allocate * @irqflags: Interrupt type flags * * Finish the remaining parts of generic HCD initialization: allocate the * buffers of consistent memory, register the bus, request the IRQ line, * and call the driver's reset() and start() routines. */ int usb_add_hcd(struct usb_hcd *hcd, unsigned int irqnum, unsigned long irqflags) { int retval; struct usb_device *rhdev; struct usb_hcd *shared_hcd; int skip_phy_initialization; if (usb_hcd_is_primary_hcd(hcd)) skip_phy_initialization = hcd->skip_phy_initialization; else skip_phy_initialization = hcd->primary_hcd->skip_phy_initialization; if (!skip_phy_initialization) { if (usb_hcd_is_primary_hcd(hcd)) { hcd->phy_roothub = usb_phy_roothub_alloc(hcd->self.sysdev); if (IS_ERR(hcd->phy_roothub)) return PTR_ERR(hcd->phy_roothub); } else { hcd->phy_roothub = usb_phy_roothub_alloc_usb3_phy(hcd->self.sysdev); if (IS_ERR(hcd->phy_roothub)) return PTR_ERR(hcd->phy_roothub); } retval = usb_phy_roothub_init(hcd->phy_roothub); if (retval) return retval; retval = usb_phy_roothub_set_mode(hcd->phy_roothub, PHY_MODE_USB_HOST_SS); if (retval) retval = usb_phy_roothub_set_mode(hcd->phy_roothub, PHY_MODE_USB_HOST); if (retval) goto err_usb_phy_roothub_power_on; retval = usb_phy_roothub_power_on(hcd->phy_roothub); if (retval) goto err_usb_phy_roothub_power_on; } dev_info(hcd->self.controller, "%s\n", hcd->product_desc); switch (authorized_default) { case USB_AUTHORIZE_NONE: hcd->dev_policy = USB_DEVICE_AUTHORIZE_NONE; break; case USB_AUTHORIZE_INTERNAL: hcd->dev_policy = USB_DEVICE_AUTHORIZE_INTERNAL; break; case USB_AUTHORIZE_ALL: case USB_AUTHORIZE_WIRED: default: hcd->dev_policy = USB_DEVICE_AUTHORIZE_ALL; break; } set_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags); /* per default all interfaces are authorized */ set_bit(HCD_FLAG_INTF_AUTHORIZED, &hcd->flags); /* HC is in reset state, but accessible. Now do the one-time init, * bottom up so that hcds can customize the root hubs before hub_wq * starts talking to them. (Note, bus id is assigned early too.) 
*/ retval = hcd_buffer_create(hcd); if (retval != 0) { dev_dbg(hcd->self.sysdev, "pool alloc failed\n"); goto err_create_buf; } retval = usb_register_bus(&hcd->self); if (retval < 0) goto err_register_bus; rhdev = usb_alloc_dev(NULL, &hcd->self, 0); if (rhdev == NULL) { dev_err(hcd->self.sysdev, "unable to allocate root hub\n"); retval = -ENOMEM; goto err_allocate_root_hub; } mutex_lock(&usb_port_peer_mutex); hcd->self.root_hub = rhdev; mutex_unlock(&usb_port_peer_mutex); rhdev->rx_lanes = 1; rhdev->tx_lanes = 1; rhdev->ssp_rate = USB_SSP_GEN_UNKNOWN; switch (hcd->speed) { case HCD_USB11: rhdev->speed = USB_SPEED_FULL; break; case HCD_USB2: rhdev->speed = USB_SPEED_HIGH; break; case HCD_USB3: rhdev->speed = USB_SPEED_SUPER; break; case HCD_USB32: rhdev->rx_lanes = 2; rhdev->tx_lanes = 2; rhdev->ssp_rate = USB_SSP_GEN_2x2; rhdev->speed = USB_SPEED_SUPER_PLUS; break; case HCD_USB31: rhdev->ssp_rate = USB_SSP_GEN_2x1; rhdev->speed = USB_SPEED_SUPER_PLUS; break; default: retval = -EINVAL; goto err_set_rh_speed; } /* wakeup flag init defaults to "everything works" for root hubs, * but drivers can override it in reset() if needed, along with * recording the overall controller's system wakeup capability. */ device_set_wakeup_capable(&rhdev->dev, 1); /* HCD_FLAG_RH_RUNNING doesn't matter until the root hub is * registered. But since the controller can die at any time, * let's initialize the flag before touching the hardware. */ set_bit(HCD_FLAG_RH_RUNNING, &hcd->flags); /* "reset" is misnamed; its role is now one-time init. the controller * should already have been reset (and boot firmware kicked off etc). */ if (hcd->driver->reset) { retval = hcd->driver->reset(hcd); if (retval < 0) { dev_err(hcd->self.controller, "can't setup: %d\n", retval); goto err_hcd_driver_setup; } } hcd->rh_pollable = 1; retval = usb_phy_roothub_calibrate(hcd->phy_roothub); if (retval) goto err_hcd_driver_setup; /* NOTE: root hub and controller capabilities may not be the same */ if (device_can_wakeup(hcd->self.controller) && device_can_wakeup(&hcd->self.root_hub->dev)) dev_dbg(hcd->self.controller, "supports USB remote wakeup\n"); /* initialize BHs */ init_giveback_urb_bh(&hcd->high_prio_bh); hcd->high_prio_bh.high_prio = true; init_giveback_urb_bh(&hcd->low_prio_bh); /* enable irqs just before we start the controller, * if the BIOS provides legacy PCI irqs. 
*/ if (usb_hcd_is_primary_hcd(hcd) && irqnum) { retval = usb_hcd_request_irqs(hcd, irqnum, irqflags); if (retval) goto err_request_irq; } hcd->state = HC_STATE_RUNNING; retval = hcd->driver->start(hcd); if (retval < 0) { dev_err(hcd->self.controller, "startup error %d\n", retval); goto err_hcd_driver_start; } /* starting here, usbcore will pay attention to the shared HCD roothub */ shared_hcd = hcd->shared_hcd; if (!usb_hcd_is_primary_hcd(hcd) && shared_hcd && HCD_DEFER_RH_REGISTER(shared_hcd)) { retval = register_root_hub(shared_hcd); if (retval != 0) goto err_register_root_hub; if (shared_hcd->uses_new_polling && HCD_POLL_RH(shared_hcd)) usb_hcd_poll_rh_status(shared_hcd); } /* starting here, usbcore will pay attention to this root hub */ if (!HCD_DEFER_RH_REGISTER(hcd)) { retval = register_root_hub(hcd); if (retval != 0) goto err_register_root_hub; if (hcd->uses_new_polling && HCD_POLL_RH(hcd)) usb_hcd_poll_rh_status(hcd); } return retval; err_register_root_hub: usb_stop_hcd(hcd); err_hcd_driver_start: if (usb_hcd_is_primary_hcd(hcd) && hcd->irq > 0) free_irq(irqnum, hcd); err_request_irq: err_hcd_driver_setup: err_set_rh_speed: usb_put_invalidate_rhdev(hcd); err_allocate_root_hub: usb_deregister_bus(&hcd->self); err_register_bus: hcd_buffer_destroy(hcd); err_create_buf: usb_phy_roothub_power_off(hcd->phy_roothub); err_usb_phy_roothub_power_on: usb_phy_roothub_exit(hcd->phy_roothub); return retval; } EXPORT_SYMBOL_GPL(usb_add_hcd); /** * usb_remove_hcd - shutdown processing for generic HCDs * @hcd: the usb_hcd structure to remove * * Context: task context, might sleep. * * Disconnects the root hub, then reverses the effects of usb_add_hcd(), * invoking the HCD's stop() method. */ void usb_remove_hcd(struct usb_hcd *hcd) { struct usb_device *rhdev; bool rh_registered; if (!hcd) { pr_debug("%s: hcd is NULL\n", __func__); return; } rhdev = hcd->self.root_hub; dev_info(hcd->self.controller, "remove, state %x\n", hcd->state); usb_get_dev(rhdev); clear_bit(HCD_FLAG_RH_RUNNING, &hcd->flags); if (HC_IS_RUNNING (hcd->state)) hcd->state = HC_STATE_QUIESCING; dev_dbg(hcd->self.controller, "roothub graceful disconnect\n"); spin_lock_irq (&hcd_root_hub_lock); rh_registered = hcd->rh_registered; hcd->rh_registered = 0; spin_unlock_irq (&hcd_root_hub_lock); #ifdef CONFIG_PM cancel_work_sync(&hcd->wakeup_work); #endif cancel_work_sync(&hcd->died_work); mutex_lock(&usb_bus_idr_lock); if (rh_registered) usb_disconnect(&rhdev); /* Sets rhdev to NULL */ mutex_unlock(&usb_bus_idr_lock); /* * flush_work() isn't needed here because: * - driver's disconnect() called from usb_disconnect() should * make sure its URBs are completed during the disconnect() * callback * * - it is too late to run complete() here since driver may have * been removed already now */ /* Prevent any more root-hub status calls from the timer. * The HCD might still restart the timer (if a port status change * interrupt occurs), but usb_hcd_poll_rh_status() won't invoke * the hub_status_data() callback. 
*/ usb_stop_hcd(hcd); if (usb_hcd_is_primary_hcd(hcd)) { if (hcd->irq > 0) free_irq(hcd->irq, hcd); } usb_deregister_bus(&hcd->self); hcd_buffer_destroy(hcd); usb_phy_roothub_power_off(hcd->phy_roothub); usb_phy_roothub_exit(hcd->phy_roothub); usb_put_invalidate_rhdev(hcd); hcd->flags = 0; } EXPORT_SYMBOL_GPL(usb_remove_hcd); void usb_hcd_platform_shutdown(struct platform_device *dev) { struct usb_hcd *hcd = platform_get_drvdata(dev); /* No need for pm_runtime_put(), we're shutting down */ pm_runtime_get_sync(&dev->dev); if (hcd->driver->shutdown) hcd->driver->shutdown(hcd); } EXPORT_SYMBOL_GPL(usb_hcd_platform_shutdown); int usb_hcd_setup_local_mem(struct usb_hcd *hcd, phys_addr_t phys_addr, dma_addr_t dma, size_t size) { int err; void *local_mem; hcd->localmem_pool = devm_gen_pool_create(hcd->self.sysdev, 4, dev_to_node(hcd->self.sysdev), dev_name(hcd->self.sysdev)); if (IS_ERR(hcd->localmem_pool)) return PTR_ERR(hcd->localmem_pool); /* * if a physical SRAM address was passed, map it, otherwise * allocate system memory as a buffer. */ if (phys_addr) local_mem = devm_memremap(hcd->self.sysdev, phys_addr, size, MEMREMAP_WC); else local_mem = dmam_alloc_attrs(hcd->self.sysdev, size, &dma, GFP_KERNEL, DMA_ATTR_WRITE_COMBINE); if (IS_ERR_OR_NULL(local_mem)) { if (!local_mem) return -ENOMEM; return PTR_ERR(local_mem); } /* * Here we pass a dma_addr_t but the arg type is a phys_addr_t. * It's not backed by system memory and thus there's no kernel mapping * for it. */ err = gen_pool_add_virt(hcd->localmem_pool, (unsigned long)local_mem, dma, size, dev_to_node(hcd->self.sysdev)); if (err < 0) { dev_err(hcd->self.sysdev, "gen_pool_add_virt failed with %d\n", err); return err; } return 0; } EXPORT_SYMBOL_GPL(usb_hcd_setup_local_mem); /*-------------------------------------------------------------------------*/ #if IS_ENABLED(CONFIG_USB_MON) const struct usb_mon_operations *mon_ops; /* * The registration is unlocked. * We do it this way because we do not want to lock in hot paths. * * Notice that the code is minimally error-proof. Because usbmon needs * symbols from usbcore, usbcore gets referenced and cannot be unloaded first. */ int usb_mon_register(const struct usb_mon_operations *ops) { if (mon_ops) return -EBUSY; mon_ops = ops; mb(); return 0; } EXPORT_SYMBOL_GPL (usb_mon_register); void usb_mon_deregister (void) { if (mon_ops == NULL) { printk(KERN_ERR "USB: monitor was not registered\n"); return; } mon_ops = NULL; mb(); } EXPORT_SYMBOL_GPL (usb_mon_deregister); #endif /* CONFIG_USB_MON || CONFIG_USB_MON_MODULE */ |
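/*
 * Illustrative sketch only, not part of the original file: the typical
 * pairing of usb_create_hcd()/usb_add_hcd() on probe with
 * usb_remove_hcd()/usb_put_hcd() on removal, as a hypothetical platform
 * bus glue driver might do it.  example_hc_driver, example_hcd_probe()
 * and example_hcd_remove() are made-up names.
 */
#include <linux/interrupt.h>
#include <linux/platform_device.h>
#include <linux/usb/hcd.h>

static const struct hc_driver example_hc_driver;	/* fields omitted for brevity */

static int example_hcd_probe(struct platform_device *pdev)
{
	struct usb_hcd *hcd;
	int irq, ret;

	irq = platform_get_irq(pdev, 0);
	if (irq < 0)
		return irq;

	hcd = usb_create_hcd(&example_hc_driver, &pdev->dev,
			     dev_name(&pdev->dev));
	if (!hcd)
		return -ENOMEM;

	ret = usb_add_hcd(hcd, irq, IRQF_SHARED);
	if (ret) {
		usb_put_hcd(hcd);	/* drop the reference from usb_create_hcd() */
		return ret;
	}

	platform_set_drvdata(pdev, hcd);
	return 0;
}

static void example_hcd_remove(struct platform_device *pdev)
{
	struct usb_hcd *hcd = platform_get_drvdata(pdev);

	usb_remove_hcd(hcd);	/* reverses usb_add_hcd() */
	usb_put_hcd(hcd);	/* frees the HCD once the last reference drops */
}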
// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> */ #include <linux/dcache.h> #include <linux/fs.h> #include <linux/gfp.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mount.h> #include <linux/srcu.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" /* * Clear all of the marks on an inode when it is being evicted from core */ void __fsnotify_inode_delete(struct inode *inode) { fsnotify_clear_marks_by_inode(inode); } EXPORT_SYMBOL_GPL(__fsnotify_inode_delete); void 
__fsnotify_vfsmount_delete(struct vfsmount *mnt) { fsnotify_clear_marks_by_mount(mnt); } void __fsnotify_mntns_delete(struct mnt_namespace *mntns) { fsnotify_clear_marks_by_mntns(mntns); } void fsnotify_sb_delete(struct super_block *sb) { struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); /* Were any marks ever added to any object on this sb? */ if (!sbinfo) return; fsnotify_unmount_inodes(sbinfo); fsnotify_clear_marks_by_sb(sb); /* Wait for outstanding object references from connectors */ wait_var_event(fsnotify_sb_watched_objects(sb), !atomic_long_read(fsnotify_sb_watched_objects(sb))); WARN_ON(fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT)); WARN_ON(fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_PRE_CONTENT)); } void fsnotify_sb_free(struct super_block *sb) { if (sb->s_fsnotify_info) { WARN_ON_ONCE(!list_empty(&sb->s_fsnotify_info->inode_conn_list)); kfree(sb->s_fsnotify_info); } } /* * Given an inode, first check if we care what happens to our children. Inotify * and dnotify both tell their parents about events. If we care about any event * on a child we run all of our children and set a dentry flag saying that the * parent cares. Thus when an event happens on a child it can quickly tell * if there is a need to find a parent and send the event to the parent. */ void fsnotify_set_children_dentry_flags(struct inode *inode) { struct dentry *alias; if (!S_ISDIR(inode->i_mode)) return; spin_lock(&inode->i_lock); /* run all of the dentries associated with this inode. Since this is a * directory, there damn well better only be one item on this list */ hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { struct dentry *child; /* run all of the children of the original inode and fix their * d_flags to indicate parental interest (their parent is the * original inode) */ spin_lock(&alias->d_lock); hlist_for_each_entry(child, &alias->d_children, d_sib) { if (!child->d_inode) continue; spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; spin_unlock(&child->d_lock); } spin_unlock(&alias->d_lock); } spin_unlock(&inode->i_lock); } /* * Lazily clear false positive PARENT_WATCHED flag for child whose parent had * stopped watching children. */ static void fsnotify_clear_child_dentry_flag(struct inode *pinode, struct dentry *dentry) { spin_lock(&dentry->d_lock); /* * d_lock is a sufficient barrier to prevent observing a non-watched * parent state from before the fsnotify_set_children_dentry_flags() * or fsnotify_update_flags() call that had set PARENT_WATCHED. */ if (!fsnotify_inode_watches_children(pinode)) dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; spin_unlock(&dentry->d_lock); } /* Are inode/sb/mount interested in parent and name info with this event? */ static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask, __u32 mask) { __u32 marks_mask = 0; /* We only send parent/name to inode/sb/mount for events on non-dir */ if (mask & FS_ISDIR) return false; /* * All events that are possible on child can also may be reported with * parent/name info to inode/sb/mount. Otherwise, a watching parent * could result in events reported with unexpected name info to sb/mount. */ BUILD_BUG_ON(FS_EVENTS_POSS_ON_CHILD & ~FS_EVENTS_POSS_TO_PARENT); /* Did either inode/sb/mount subscribe for events with parent/name? 
*/ marks_mask |= fsnotify_parent_needed_mask( READ_ONCE(inode->i_fsnotify_mask)); marks_mask |= fsnotify_parent_needed_mask( READ_ONCE(inode->i_sb->s_fsnotify_mask)); marks_mask |= fsnotify_parent_needed_mask(mnt_mask); /* Did they subscribe for this event with parent/name info? */ return mask & marks_mask; } /* Are there any inode/mount/sb objects that watch for these events? */ static inline __u32 fsnotify_object_watched(struct inode *inode, __u32 mnt_mask, __u32 mask) { __u32 marks_mask = READ_ONCE(inode->i_fsnotify_mask) | mnt_mask | READ_ONCE(inode->i_sb->s_fsnotify_mask); return mask & marks_mask & ALL_FSNOTIFY_EVENTS; } /* Report pre-content event with optional range info */ int fsnotify_pre_content(const struct path *path, const loff_t *ppos, size_t count) { struct file_range range; /* Report page aligned range only when pos is known */ if (!ppos) return fsnotify_path(path, FS_PRE_ACCESS); range.path = path; range.pos = PAGE_ALIGN_DOWN(*ppos); range.count = PAGE_ALIGN(*ppos + count) - range.pos; return fsnotify_parent(path->dentry, FS_PRE_ACCESS, &range, FSNOTIFY_EVENT_FILE_RANGE); } /* * Notify this dentry's parent about a child's events with child name info * if parent is watching or if inode/sb/mount are interested in events with * parent and name info. * * Notify only the child without name info if parent is not watching and * inode/sb/mount are not interested in events with parent and name info. */ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type) { const struct path *path = fsnotify_data_path(data, data_type); __u32 mnt_mask = path ? READ_ONCE(real_mount(path->mnt)->mnt_fsnotify_mask) : 0; struct inode *inode = d_inode(dentry); struct dentry *parent; bool parent_watched = dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED; bool parent_needed, parent_interested; __u32 p_mask; struct inode *p_inode = NULL; struct name_snapshot name; struct qstr *file_name = NULL; int ret = 0; /* Optimize the likely case of nobody watching this path */ if (likely(!parent_watched && !fsnotify_object_watched(inode, mnt_mask, mask))) return 0; parent = NULL; parent_needed = fsnotify_event_needs_parent(inode, mnt_mask, mask); if (!parent_watched && !parent_needed) goto notify; /* Does parent inode care about events on children? */ parent = dget_parent(dentry); p_inode = parent->d_inode; p_mask = fsnotify_inode_watches_children(p_inode); if (unlikely(parent_watched && !p_mask)) fsnotify_clear_child_dentry_flag(p_inode, dentry); /* * Include parent/name in notification either if some notification * groups require parent info or the parent is interested in this event. * The parent interest in ACCESS/MODIFY events does not apply to special * files, where read/write are not on the filesystem of the parent and * events can provide an undesirable side-channel for information * exfiltration. 
*/ parent_interested = mask & p_mask & ALL_FSNOTIFY_EVENTS && !(data_type == FSNOTIFY_EVENT_PATH && d_is_special(dentry) && (mask & (FS_ACCESS | FS_MODIFY))); if (parent_needed || parent_interested) { /* When notifying parent, child should be passed as data */ WARN_ON_ONCE(inode != fsnotify_data_inode(data, data_type)); /* Notify both parent and child with child name info */ take_dentry_name_snapshot(&name, dentry); file_name = &name.name; if (parent_interested) mask |= FS_EVENT_ON_CHILD; } notify: ret = fsnotify(mask, data, data_type, p_inode, file_name, inode, 0); if (file_name) release_dentry_name_snapshot(&name); dput(parent); return ret; } EXPORT_SYMBOL_GPL(__fsnotify_parent); static int fsnotify_handle_inode_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, u32 cookie) { const struct path *path = fsnotify_data_path(data, data_type); struct inode *inode = fsnotify_data_inode(data, data_type); const struct fsnotify_ops *ops = group->ops; if (WARN_ON_ONCE(!ops->handle_inode_event)) return 0; if (WARN_ON_ONCE(!inode && !dir)) return 0; if ((inode_mark->flags & FSNOTIFY_MARK_FLAG_EXCL_UNLINK) && path && d_unlinked(path->dentry)) return 0; /* Check interest of this mark in case event was sent with two marks */ if (!(mask & inode_mark->mask & ALL_FSNOTIFY_EVENTS)) return 0; return ops->handle_inode_event(inode_mark, mask, inode, dir, name, cookie); } static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); struct fsnotify_mark *parent_mark = fsnotify_iter_parent_mark(iter_info); int ret; if (WARN_ON_ONCE(fsnotify_iter_sb_mark(iter_info)) || WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info))) return 0; /* * For FS_RENAME, 'dir' is old dir and 'data' is new dentry. * The only ->handle_inode_event() backend that supports FS_RENAME is * dnotify, where it means file was renamed within same parent. */ if (mask & FS_RENAME) { struct dentry *moved = fsnotify_data_dentry(data, data_type); if (dir != moved->d_parent->d_inode) return 0; } if (parent_mark) { ret = fsnotify_handle_inode_event(group, parent_mark, mask, data, data_type, dir, name, 0); if (ret) return ret; } if (!inode_mark) return 0; /* * Some events can be sent on both parent dir and child marks (e.g. * FS_ATTRIB). If both parent dir and child are watching, report the * event once to parent dir with name (if interested) and once to child * without name (if interested). * * In any case regardless whether the parent is watching or not, the * child watcher is expecting an event without the FS_EVENT_ON_CHILD * flag. The file name is expected if and only if this is a directory * event. 
*/ mask &= ~FS_EVENT_ON_CHILD; if (!(mask & ALL_FSNOTIFY_DIRENT_EVENTS)) { dir = NULL; name = NULL; } return fsnotify_handle_inode_event(group, inode_mark, mask, data, data_type, dir, name, cookie); } static int send_to_group(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_group *group = NULL; __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS); __u32 marks_mask = 0; __u32 marks_ignore_mask = 0; bool is_dir = mask & FS_ISDIR; struct fsnotify_mark *mark; int type; if (!iter_info->report_mask) return 0; /* clear ignored on inode modification */ if (mask & FS_MODIFY) { fsnotify_foreach_iter_mark_type(iter_info, mark, type) { if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) mark->ignore_mask = 0; } } /* Are any of the group marks interested in this event? */ fsnotify_foreach_iter_mark_type(iter_info, mark, type) { group = mark->group; marks_mask |= mark->mask; marks_ignore_mask |= fsnotify_effective_ignore_mask(mark, is_dir, type); } pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignore_mask=%x data=%p data_type=%d dir=%p cookie=%d\n", __func__, group, mask, marks_mask, marks_ignore_mask, data, data_type, dir, cookie); if (!(test_mask & marks_mask & ~marks_ignore_mask)) return 0; if (group->ops->handle_event) { return group->ops->handle_event(group, mask, data, data_type, dir, file_name, cookie, iter_info); } return fsnotify_handle_event(group, mask, data, data_type, dir, file_name, cookie, iter_info); } static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector *const *connp) { struct fsnotify_mark_connector *conn; struct hlist_node *node = NULL; conn = srcu_dereference(*connp, &fsnotify_mark_srcu); if (conn) node = srcu_dereference(conn->list.first, &fsnotify_mark_srcu); return hlist_entry_safe(node, struct fsnotify_mark, obj_list); } static struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark) { struct hlist_node *node = NULL; if (mark) node = srcu_dereference(mark->obj_list.next, &fsnotify_mark_srcu); return hlist_entry_safe(node, struct fsnotify_mark, obj_list); } /* * iter_info is a multi head priority queue of marks. * Pick a subset of marks from queue heads, all with the same group * and set the report_mask to a subset of the selected marks. * Returns false if there are no more groups to iterate. */ static bool fsnotify_iter_select_report_types( struct fsnotify_iter_info *iter_info) { struct fsnotify_group *max_prio_group = NULL; struct fsnotify_mark *mark; int type; /* Choose max prio group among groups of all queue heads */ fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && fsnotify_compare_groups(max_prio_group, mark->group) > 0) max_prio_group = mark->group; } if (!max_prio_group) return false; /* Set the report mask for marks from same group as max prio group */ iter_info->current_group = max_prio_group; iter_info->report_mask = 0; fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && mark->group == iter_info->current_group) { /* * FSNOTIFY_ITER_TYPE_PARENT indicates that this inode * is watching children and interested in this event, * which is an event possible on child. * But is *this mark* watching children? 
*/ if (type == FSNOTIFY_ITER_TYPE_PARENT && !(mark->mask & FS_EVENT_ON_CHILD) && !(fsnotify_ignore_mask(mark) & FS_EVENT_ON_CHILD)) continue; fsnotify_iter_set_report_type(iter_info, type); } } return true; } /* * Pop from iter_info multi head queue, the marks that belong to the group of * current iteration step. */ static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *mark; int type; /* * We cannot use fsnotify_foreach_iter_mark_type() here because we * may need to advance a mark of type X that belongs to current_group * but was not selected for reporting. */ fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && mark->group == iter_info->current_group) iter_info->marks[type] = fsnotify_next_mark(iter_info->marks[type]); } } /* * fsnotify - This is the main call to fsnotify. * * The VFS calls into hook specific functions in linux/fsnotify.h. * Those functions then in turn call here. Here will call out to all of the * registered fsnotify_group. Those groups can then use the notification event * in whatever means they feel necessary. * * @mask: event type and flags * @data: object that event happened on * @data_type: type of object for fanotify_data_XXX() accessors * @dir: optional directory associated with event - * if @file_name is not NULL, this is the directory that * @file_name is relative to * @file_name: optional file name associated with event * @inode: optional inode associated with event - * If @dir and @inode are both non-NULL, event may be * reported to both. * @cookie: inotify rename cookie */ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, struct inode *inode, u32 cookie) { const struct path *path = fsnotify_data_path(data, data_type); struct super_block *sb = fsnotify_data_sb(data, data_type); const struct fsnotify_mnt *mnt_data = fsnotify_data_mnt(data, data_type); struct fsnotify_sb_info *sbinfo = sb ? fsnotify_sb_info(sb) : NULL; struct fsnotify_iter_info iter_info = {}; struct mount *mnt = NULL; struct inode *inode2 = NULL; struct dentry *moved; int inode2_type; int ret = 0; __u32 test_mask, marks_mask = 0; if (path) mnt = real_mount(path->mnt); if (!inode) { /* Dirent event - report on TYPE_INODE to dir */ inode = dir; /* For FS_RENAME, inode is old_dir and inode2 is new_dir */ if (mask & FS_RENAME) { moved = fsnotify_data_dentry(data, data_type); inode2 = moved->d_parent->d_inode; inode2_type = FSNOTIFY_ITER_TYPE_INODE2; } } else if (mask & FS_EVENT_ON_CHILD) { /* * Event on child - report on TYPE_PARENT to dir if it is * watching children and on TYPE_INODE to child. */ inode2 = dir; inode2_type = FSNOTIFY_ITER_TYPE_PARENT; } /* * Optimization: srcu_read_lock() has a memory barrier which can * be expensive. It protects walking the *_fsnotify_marks lists. * However, if we do not walk the lists, we do not have to do * SRCU because we have no references to any objects and do not * need SRCU to keep them "alive". 
*/ if ((!sbinfo || !sbinfo->sb_marks) && (!mnt || !mnt->mnt_fsnotify_marks) && (!inode || !inode->i_fsnotify_marks) && (!inode2 || !inode2->i_fsnotify_marks) && (!mnt_data || !mnt_data->ns->n_fsnotify_marks)) return 0; if (sb) marks_mask |= READ_ONCE(sb->s_fsnotify_mask); if (mnt) marks_mask |= READ_ONCE(mnt->mnt_fsnotify_mask); if (inode) marks_mask |= READ_ONCE(inode->i_fsnotify_mask); if (inode2) marks_mask |= READ_ONCE(inode2->i_fsnotify_mask); if (mnt_data) marks_mask |= READ_ONCE(mnt_data->ns->n_fsnotify_mask); /* * If this is a modify event we may need to clear some ignore masks. * In that case, the object with ignore masks will have the FS_MODIFY * event in its mask. * Otherwise, return if none of the marks care about this type of event. */ test_mask = (mask & ALL_FSNOTIFY_EVENTS); if (!(test_mask & marks_mask)) return 0; iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); if (sbinfo) { iter_info.marks[FSNOTIFY_ITER_TYPE_SB] = fsnotify_first_mark(&sbinfo->sb_marks); } if (mnt) { iter_info.marks[FSNOTIFY_ITER_TYPE_VFSMOUNT] = fsnotify_first_mark(&mnt->mnt_fsnotify_marks); } if (inode) { iter_info.marks[FSNOTIFY_ITER_TYPE_INODE] = fsnotify_first_mark(&inode->i_fsnotify_marks); } if (inode2) { iter_info.marks[inode2_type] = fsnotify_first_mark(&inode2->i_fsnotify_marks); } if (mnt_data) { iter_info.marks[FSNOTIFY_ITER_TYPE_MNTNS] = fsnotify_first_mark(&mnt_data->ns->n_fsnotify_marks); } /* * We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark * ignore masks are properly reflected for mount/sb mark notifications. * That's why this traversal is so complicated... */ while (fsnotify_iter_select_report_types(&iter_info)) { ret = send_to_group(mask, data, data_type, dir, file_name, cookie, &iter_info); if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) goto out; fsnotify_iter_next(&iter_info); } ret = 0; out: srcu_read_unlock(&fsnotify_mark_srcu, iter_info.srcu_idx); return ret; } EXPORT_SYMBOL_GPL(fsnotify); #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS /* * At open time we check fsnotify_sb_has_priority_watchers(), call the open perm * hook and set the FMODE_NONOTIFY_ mode bits accordignly. * Later, fsnotify permission hooks do not check if there are permission event * watches, but that there were permission event watches at open time. */ int fsnotify_open_perm_and_set_mode(struct file *file) { struct dentry *dentry = file->f_path.dentry, *parent; struct super_block *sb = dentry->d_sb; __u32 mnt_mask, p_mask = 0; /* Is it a file opened by fanotify? */ if (FMODE_FSNOTIFY_NONE(file->f_mode)) return 0; /* * Permission events is a super set of pre-content events, so if there * are no permission event watchers, there are also no pre-content event * watchers and this is implied from the single FMODE_NONOTIFY_PERM bit. */ if (likely(!fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT))) { file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM); return 0; } /* * OK, there are some permission event watchers. Check if anybody is * watching for permission events on *this* file. */ mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask); p_mask = fsnotify_object_watched(d_inode(dentry), mnt_mask, ALL_FSNOTIFY_PERM_EVENTS); if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) { parent = dget_parent(dentry); p_mask |= fsnotify_inode_watches_children(d_inode(parent)); dput(parent); } /* * Legacy FAN_ACCESS_PERM events have very high performance overhead, * so unlikely to be used in the wild. If they are used there will be * no optimizations at all. 
*/ if (unlikely(p_mask & FS_ACCESS_PERM)) { /* Enable all permission and pre-content events */ file_set_fsnotify_mode(file, 0); goto open_perm; } /* * Pre-content events are only supported on regular files. * If there are pre-content event watchers and no permission access * watchers, set FMODE_NONOTIFY | FMODE_NONOTIFY_PERM to indicate that. * That is the common case with HSM service. */ if (d_is_reg(dentry) && (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS)) { file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM); goto open_perm; } /* Nobody watching permission and pre-content events on this file */ file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM); open_perm: /* * Send open perm events depending on object masks and regardless of * FMODE_NONOTIFY_PERM. */ if (file->f_flags & __FMODE_EXEC && p_mask & FS_OPEN_EXEC_PERM) { int ret = fsnotify_path(&file->f_path, FS_OPEN_EXEC_PERM); if (ret) return ret; } if (p_mask & FS_OPEN_PERM) return fsnotify_path(&file->f_path, FS_OPEN_PERM); return 0; } #endif void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt) { struct fsnotify_mnt data = { .ns = ns, .mnt_id = real_mount(mnt)->mnt_id_unique, }; if (WARN_ON_ONCE(!ns)) return; /* * This is an optimization as well as making sure fsnotify_init() has * been called. */ if (!ns->n_fsnotify_marks) return; fsnotify(mask, &data, FSNOTIFY_EVENT_MNT, NULL, NULL, NULL, 0); } static __init int fsnotify_init(void) { int ret; BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 26); ret = init_srcu_struct(&fsnotify_mark_srcu); if (ret) panic("initializing fsnotify_mark_srcu"); fsnotify_init_connector_caches(); return 0; } core_initcall(fsnotify_init); |
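/*
 * Illustrative sketch, not part of the original file: how a VFS-layer hook
 * might report a directory-entry creation through fsnotify(), matching the
 * signature above.  The real hooks live in include/linux/fsnotify.h;
 * example_report_create() is a made-up name.
 */
static inline int example_report_create(struct inode *dir,
					struct dentry *dentry)
{
	/* Dirent event: reported on the parent dir with the child's name */
	return fsnotify(FS_CREATE, dentry, FSNOTIFY_EVENT_DENTRY,
			dir, &dentry->d_name, NULL, 0);
}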
// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016,2017 Facebook */ #include <linux/bpf.h> #include <linux/btf.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/filter.h> #include <linux/perf_event.h> #include <uapi/linux/btf.h> #include <linux/rcupdate_trace.h> #include <linux/btf_ids.h> #include <crypto/sha2.h> #include "map_in_map.h" #define ARRAY_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \ BPF_F_PRESERVE_ELEMS | BPF_F_INNER_MAP) static void bpf_array_free_percpu(struct bpf_array *array) { int i; for (i = 0; i < array->map.max_entries; i++) { free_percpu(array->pptrs[i]); cond_resched(); } } static int bpf_array_alloc_percpu(struct bpf_array *array) { void __percpu *ptr; int i; for (i = 0; i < array->map.max_entries; 
i++) { ptr = bpf_map_alloc_percpu(&array->map, array->elem_size, 8, GFP_USER | __GFP_NOWARN); if (!ptr) { bpf_array_free_percpu(array); return -ENOMEM; } array->pptrs[i] = ptr; cond_resched(); } return 0; } /* Called from syscall */ int array_map_alloc_check(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size == 0 || attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || !bpf_map_flags_access_ok(attr->map_flags) || (percpu && numa_node != NUMA_NO_NODE)) return -EINVAL; if (attr->map_type != BPF_MAP_TYPE_ARRAY && attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP)) return -EINVAL; if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY && attr->map_flags & BPF_F_PRESERVE_ELEMS) return -EINVAL; /* avoid overflow on round_up(map->value_size) */ if (attr->value_size > INT_MAX) return -E2BIG; /* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */ if (percpu && round_up(attr->value_size, 8) > PCPU_MIN_UNIT_SIZE) return -E2BIG; return 0; } static struct bpf_map *array_map_alloc(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); u32 elem_size, index_mask, max_entries; bool bypass_spec_v1 = bpf_bypass_spec_v1(NULL); u64 array_size, mask64; struct bpf_array *array; elem_size = round_up(attr->value_size, 8); max_entries = attr->max_entries; /* On 32 bit archs roundup_pow_of_two() with max_entries that has * upper most bit set in u32 space is undefined behavior due to * resulting 1U << 32, so do it manually here in u64 space. */ mask64 = fls_long(max_entries - 1); mask64 = 1ULL << mask64; mask64 -= 1; index_mask = mask64; if (!bypass_spec_v1) { /* round up array size to nearest power of 2, * since cpu will speculate within index_mask limits */ max_entries = index_mask + 1; /* Check for overflows. 
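		 * For example, attr->max_entries == 0x80000001 gives
		 * index_mask == 0xffffffff; the u32 assignment
		 * "max_entries = index_mask + 1" above then wraps to 0 and
		 * the check below rejects the map with -E2BIG.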
*/ if (max_entries < attr->max_entries) return ERR_PTR(-E2BIG); } array_size = sizeof(*array); if (percpu) { array_size += (u64) max_entries * sizeof(void *); } else { /* rely on vmalloc() to return page-aligned memory and * ensure array->value is exactly page-aligned */ if (attr->map_flags & BPF_F_MMAPABLE) { array_size = PAGE_ALIGN(array_size); array_size += PAGE_ALIGN((u64) max_entries * elem_size); } else { array_size += (u64) max_entries * elem_size; } } /* allocate all map elements and zero-initialize them */ if (attr->map_flags & BPF_F_MMAPABLE) { void *data; /* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */ data = bpf_map_area_mmapable_alloc(array_size, numa_node); if (!data) return ERR_PTR(-ENOMEM); array = data + PAGE_ALIGN(sizeof(struct bpf_array)) - offsetof(struct bpf_array, value); } else { array = bpf_map_area_alloc(array_size, numa_node); } if (!array) return ERR_PTR(-ENOMEM); array->index_mask = index_mask; array->map.bypass_spec_v1 = bypass_spec_v1; /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); array->elem_size = elem_size; if (percpu && bpf_array_alloc_percpu(array)) { bpf_map_area_free(array); return ERR_PTR(-ENOMEM); } return &array->map; } static void *array_map_elem_ptr(struct bpf_array* array, u32 index) { return array->value + (u64)array->elem_size * index; } /* Called from syscall or from eBPF program */ static void *array_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; if (unlikely(index >= array->map.max_entries)) return NULL; return array->value + (u64)array->elem_size * (index & array->index_mask); } static int array_map_get_hash(struct bpf_map *map, u32 hash_buf_size, void *hash_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); sha256(array->value, (u64)array->elem_size * array->map.max_entries, hash_buf); memcpy(array->map.sha, hash_buf, sizeof(array->map.sha)); return 0; } static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off) { struct bpf_array *array = container_of(map, struct bpf_array, map); if (map->max_entries != 1) return -ENOTSUPP; if (off >= map->value_size) return -EINVAL; *imm = (unsigned long)array->value; return 0; } static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm, u32 *off) { struct bpf_array *array = container_of(map, struct bpf_array, map); u64 base = (unsigned long)array->value; u64 range = array->elem_size; if (map->max_entries != 1) return -ENOTSUPP; if (imm < base || imm >= base + range) return -ENOENT; *off = imm - base; return 0; } /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; u32 elem_size = array->elem_size; const int ret = BPF_REG_0; const int map_ptr = BPF_REG_1; const int index = BPF_REG_2; if (map->map_flags & BPF_F_INNER_MAP) return -EOPNOTSUPP; *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); } if (is_power_of_2(elem_size)) { *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); } else { *insn++ = 
BPF_ALU64_IMM(BPF_MUL, ret, elem_size); } *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr); *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); *insn++ = BPF_MOV64_IMM(ret, 0); return insn - insn_buf; } /* Called from eBPF program */ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; if (unlikely(index >= array->map.max_entries)) return NULL; return this_cpu_ptr(array->pptrs[index & array->index_mask]); } /* emit BPF instructions equivalent to C code of percpu_array_map_lookup_elem() */ static int percpu_array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; if (!bpf_jit_supports_percpu_insn()) return -EOPNOTSUPP; if (map->map_flags & BPF_F_INNER_MAP) return -EOPNOTSUPP; BUILD_BUG_ON(offsetof(struct bpf_array, map) != 0); *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct bpf_array, pptrs)); *insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 0); if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 6); *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_0, array->index_mask); } else { *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 5); } *insn++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); *insn++ = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1); *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); *insn++ = BPF_MOV64_IMM(BPF_REG_0, 0); return insn - insn_buf; } static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; if (cpu >= nr_cpu_ids) return NULL; if (unlikely(index >= array->map.max_entries)) return NULL; return per_cpu_ptr(array->pptrs[index & array->index_mask], cpu); } int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; void __percpu *pptr; int cpu, off = 0; u32 size; if (unlikely(index >= array->map.max_entries)) return -ENOENT; /* per_cpu areas are zero-filled and bpf programs can only * access 'value_size' of them, so copying rounded areas * will not leak any kernel data */ size = array->elem_size; rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; if (map_flags & BPF_F_CPU) { cpu = map_flags >> 32; copy_map_value(map, value, per_cpu_ptr(pptr, cpu)); check_and_init_map_value(map, value); goto unlock; } for_each_possible_cpu(cpu) { copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu)); check_and_init_map_value(map, value + off); off += size; } unlock: rcu_read_unlock(); return 0; } /* Called from syscall */ int bpf_array_get_next_key(struct bpf_map *map, void *key, void *next_key) { u32 index = key ? 
*(u32 *)key : U32_MAX; u32 *next = (u32 *)next_key; if (index >= map->max_entries) { *next = 0; return 0; } if (index == map->max_entries - 1) return -ENOENT; *next = index + 1; return 0; } /* Called from syscall or from eBPF program */ static long array_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; char *val; if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) /* unknown flags */ return -EINVAL; if (unlikely(index >= array->map.max_entries)) /* all elements were pre-allocated, cannot insert a new one */ return -E2BIG; if (unlikely(map_flags & BPF_NOEXIST)) /* all elements already exist */ return -EEXIST; if (unlikely((map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { val = this_cpu_ptr(array->pptrs[index & array->index_mask]); copy_map_value(map, val, value); bpf_obj_free_fields(array->map.record, val); } else { val = array->value + (u64)array->elem_size * (index & array->index_mask); if (map_flags & BPF_F_LOCK) copy_map_value_locked(map, val, value, false); else copy_map_value(map, val, value); bpf_obj_free_fields(array->map.record, val); } return 0; } int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; void __percpu *pptr; void *ptr, *val; u32 size; int cpu; if (unlikely((map_flags & BPF_F_LOCK) || (u32)map_flags > BPF_F_ALL_CPUS)) /* unknown flags */ return -EINVAL; if (unlikely(index >= array->map.max_entries)) /* all elements were pre-allocated, cannot insert a new one */ return -E2BIG; if (unlikely(map_flags == BPF_NOEXIST)) /* all elements already exist */ return -EEXIST; /* the user space will provide round_up(value_size, 8) bytes that * will be copied into per-cpu area. bpf programs can only access * value_size of it. During lookup the same extra bytes will be * returned or zeros which were zero-filled by percpu_alloc, * so no kernel data leaks possible */ size = array->elem_size; rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; if (map_flags & BPF_F_CPU) { cpu = map_flags >> 32; ptr = per_cpu_ptr(pptr, cpu); copy_map_value(map, ptr, value); bpf_obj_free_fields(array->map.record, ptr); goto unlock; } for_each_possible_cpu(cpu) { ptr = per_cpu_ptr(pptr, cpu); val = (map_flags & BPF_F_ALL_CPUS) ? 
value : value + size * cpu; copy_map_value(map, ptr, val); bpf_obj_free_fields(array->map.record, ptr); } unlock: rcu_read_unlock(); return 0; } /* Called from syscall or from eBPF program */ static long array_map_delete_elem(struct bpf_map *map, void *key) { return -EINVAL; } static void *array_map_vmalloc_addr(struct bpf_array *array) { return (void *)round_down((unsigned long)array, PAGE_SIZE); } static void array_map_free_internal_structs(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; /* We only free internal structs on uref dropping to zero */ if (!bpf_map_has_internal_structs(map)) return; for (i = 0; i < array->map.max_entries; i++) bpf_map_free_internal_structs(map, array_map_elem_ptr(array, i)); } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void array_map_free(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; if (!IS_ERR_OR_NULL(map->record)) { if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { for (i = 0; i < array->map.max_entries; i++) { void __percpu *pptr = array->pptrs[i & array->index_mask]; int cpu; for_each_possible_cpu(cpu) { bpf_obj_free_fields(map->record, per_cpu_ptr(pptr, cpu)); cond_resched(); } } } else { for (i = 0; i < array->map.max_entries; i++) bpf_obj_free_fields(map->record, array_map_elem_ptr(array, i)); } } if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) bpf_array_free_percpu(array); if (array->map.map_flags & BPF_F_MMAPABLE) bpf_map_area_free(array_map_vmalloc_addr(array)); else bpf_map_area_free(array); } static void array_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { void *value; rcu_read_lock(); value = array_map_lookup_elem(map, key); if (!value) { rcu_read_unlock(); return; } if (map->btf_key_type_id) seq_printf(m, "%u: ", *(u32 *)key); btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); seq_putc(m, '\n'); rcu_read_unlock(); } static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; void __percpu *pptr; int cpu; rcu_read_lock(); seq_printf(m, "%u: {\n", *(u32 *)key); pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { seq_printf(m, "\tcpu%d: ", cpu); btf_type_seq_show(map->btf, map->btf_value_type_id, per_cpu_ptr(pptr, cpu), m); seq_putc(m, '\n'); } seq_puts(m, "}\n"); rcu_read_unlock(); } static int array_map_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { /* One exception for keyless BTF: .bss/.data/.rodata map */ if (btf_type_is_void(key_type)) { if (map->map_type != BPF_MAP_TYPE_ARRAY || map->max_entries != 1) return -EINVAL; if (BTF_INFO_KIND(value_type->info) != BTF_KIND_DATASEC) return -EINVAL; return 0; } /* * Bpf array can only take a u32 key. This check makes sure * that the btf matches the attr used during map_create. 
*/ if (!btf_type_is_i32(key_type)) return -EINVAL; return 0; } static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) { struct bpf_array *array = container_of(map, struct bpf_array, map); pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> PAGE_SHIFT; if (!(map->map_flags & BPF_F_MMAPABLE)) return -EINVAL; if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > PAGE_ALIGN((u64)array->map.max_entries * array->elem_size)) return -EINVAL; return remap_vmalloc_range(vma, array_map_vmalloc_addr(array), vma->vm_pgoff + pgoff); } static bool array_map_meta_equal(const struct bpf_map *meta0, const struct bpf_map *meta1) { if (!bpf_map_meta_equal(meta0, meta1)) return false; return meta0->map_flags & BPF_F_INNER_MAP ? true : meta0->max_entries == meta1->max_entries; } struct bpf_iter_seq_array_map_info { struct bpf_map *map; void *percpu_value_buf; u32 index; }; static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos) { struct bpf_iter_seq_array_map_info *info = seq->private; struct bpf_map *map = info->map; struct bpf_array *array; u32 index; if (info->index >= map->max_entries) return NULL; if (*pos == 0) ++*pos; array = container_of(map, struct bpf_array, map); index = info->index & array->index_mask; if (info->percpu_value_buf) return (void *)(uintptr_t)array->pptrs[index]; return array_map_elem_ptr(array, index); } static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bpf_iter_seq_array_map_info *info = seq->private; struct bpf_map *map = info->map; struct bpf_array *array; u32 index; ++*pos; ++info->index; if (info->index >= map->max_entries) return NULL; array = container_of(map, struct bpf_array, map); index = info->index & array->index_mask; if (info->percpu_value_buf) return (void *)(uintptr_t)array->pptrs[index]; return array_map_elem_ptr(array, index); } static int __bpf_array_map_seq_show(struct seq_file *seq, void *v) { struct bpf_iter_seq_array_map_info *info = seq->private; struct bpf_iter__bpf_map_elem ctx = {}; struct bpf_map *map = info->map; struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_iter_meta meta; struct bpf_prog *prog; int off = 0, cpu = 0; void __percpu *pptr; u32 size; meta.seq = seq; prog = bpf_iter_get_info(&meta, v == NULL); if (!prog) return 0; ctx.meta = &meta; ctx.map = info->map; if (v) { ctx.key = &info->index; if (!info->percpu_value_buf) { ctx.value = v; } else { pptr = (void __percpu *)(uintptr_t)v; size = array->elem_size; for_each_possible_cpu(cpu) { copy_map_value_long(map, info->percpu_value_buf + off, per_cpu_ptr(pptr, cpu)); check_and_init_map_value(map, info->percpu_value_buf + off); off += size; } ctx.value = info->percpu_value_buf; } } return bpf_iter_run_prog(prog, &ctx); } static int bpf_array_map_seq_show(struct seq_file *seq, void *v) { return __bpf_array_map_seq_show(seq, v); } static void bpf_array_map_seq_stop(struct seq_file *seq, void *v) { if (!v) (void)__bpf_array_map_seq_show(seq, NULL); } static int bpf_iter_init_array_map(void *priv_data, struct bpf_iter_aux_info *aux) { struct bpf_iter_seq_array_map_info *seq_info = priv_data; struct bpf_map *map = aux->map; struct bpf_array *array = container_of(map, struct bpf_array, map); void *value_buf; u32 buf_size; if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { buf_size = array->elem_size * num_possible_cpus(); value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN); if (!value_buf) return -ENOMEM; seq_info->percpu_value_buf = value_buf; } /* bpf_iter_attach_map() acquires a map uref, and 
the uref may be * released before or in the middle of iterating map elements, so * acquire an extra map uref for iterator. */ bpf_map_inc_with_uref(map); seq_info->map = map; return 0; } static void bpf_iter_fini_array_map(void *priv_data) { struct bpf_iter_seq_array_map_info *seq_info = priv_data; bpf_map_put_with_uref(seq_info->map); kfree(seq_info->percpu_value_buf); } static const struct seq_operations bpf_array_map_seq_ops = { .start = bpf_array_map_seq_start, .next = bpf_array_map_seq_next, .stop = bpf_array_map_seq_stop, .show = bpf_array_map_seq_show, }; static const struct bpf_iter_seq_info iter_seq_info = { .seq_ops = &bpf_array_map_seq_ops, .init_seq_private = bpf_iter_init_array_map, .fini_seq_private = bpf_iter_fini_array_map, .seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info), }; static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn, void *callback_ctx, u64 flags) { u32 i, key, num_elems = 0; struct bpf_array *array; bool is_percpu; u64 ret = 0; void *val; cant_migrate(); if (flags != 0) return -EINVAL; is_percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; array = container_of(map, struct bpf_array, map); for (i = 0; i < map->max_entries; i++) { if (is_percpu) val = this_cpu_ptr(array->pptrs[i]); else val = array_map_elem_ptr(array, i); num_elems++; key = i; ret = callback_fn((u64)(long)map, (u64)(long)&key, (u64)(long)val, (u64)(long)callback_ctx, 0); /* return value: 0 - continue, 1 - stop and return */ if (ret) break; } return num_elems; } static u64 array_map_mem_usage(const struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; u32 elem_size = array->elem_size; u64 entries = map->max_entries; u64 usage = sizeof(*array); if (percpu) { usage += entries * sizeof(void *); usage += entries * elem_size * num_possible_cpus(); } else { if (map->map_flags & BPF_F_MMAPABLE) { usage = PAGE_ALIGN(usage); usage += PAGE_ALIGN(entries * elem_size); } else { usage += entries * elem_size; } } return usage; } BTF_ID_LIST_SINGLE(array_map_btf_ids, struct, bpf_array) const struct bpf_map_ops array_map_ops = { .map_meta_equal = array_map_meta_equal, .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = bpf_array_get_next_key, .map_release_uref = array_map_free_internal_structs, .map_lookup_elem = array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, .map_gen_lookup = array_map_gen_lookup, .map_direct_value_addr = array_map_direct_value_addr, .map_direct_value_meta = array_map_direct_value_meta, .map_mmap = array_map_mmap, .map_seq_show_elem = array_map_seq_show_elem, .map_check_btf = array_map_check_btf, .map_lookup_batch = generic_map_lookup_batch, .map_update_batch = generic_map_update_batch, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_array_elem, .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], .iter_seq_info = &iter_seq_info, .map_get_hash = &array_map_get_hash, }; const struct bpf_map_ops percpu_array_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = percpu_array_map_lookup_elem, .map_gen_lookup = percpu_array_map_gen_lookup, .map_update_elem = array_map_update_elem, .map_delete_elem = 
array_map_delete_elem, .map_lookup_percpu_elem = percpu_array_map_lookup_percpu_elem, .map_seq_show_elem = percpu_array_map_seq_show_elem, .map_check_btf = array_map_check_btf, .map_lookup_batch = generic_map_lookup_batch, .map_update_batch = generic_map_update_batch, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_array_elem, .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; static int fd_array_map_alloc_check(union bpf_attr *attr) { /* only file descriptors can be stored in this type of map */ if (attr->value_size != sizeof(u32)) return -EINVAL; /* Program read-only/write-only not supported for special maps yet. */ if (attr->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) return -EINVAL; return array_map_alloc_check(attr); } static void fd_array_map_free(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; /* make sure it's empty */ for (i = 0; i < array->map.max_entries; i++) BUG_ON(array->ptrs[i] != NULL); bpf_map_area_free(array); } static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) { return ERR_PTR(-EOPNOTSUPP); } /* only called from syscall */ int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) { void **elem, *ptr; int ret = 0; if (!map->ops->map_fd_sys_lookup_elem) return -ENOTSUPP; rcu_read_lock(); elem = array_map_lookup_elem(map, key); if (elem && (ptr = READ_ONCE(*elem))) *value = map->ops->map_fd_sys_lookup_elem(ptr); else ret = -ENOENT; rcu_read_unlock(); return ret; } /* only called from syscall */ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); void *new_ptr, *old_ptr; u32 index = *(u32 *)key, ufd; if (map_flags != BPF_ANY) return -EINVAL; if (index >= array->map.max_entries) return -E2BIG; ufd = *(u32 *)value; new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); if (IS_ERR(new_ptr)) return PTR_ERR(new_ptr); if (map->ops->map_poke_run) { mutex_lock(&array->aux->poke_mutex); old_ptr = xchg(array->ptrs + index, new_ptr); map->ops->map_poke_run(map, index, old_ptr, new_ptr); mutex_unlock(&array->aux->poke_mutex); } else { old_ptr = xchg(array->ptrs + index, new_ptr); } if (old_ptr) map->ops->map_fd_put_ptr(map, old_ptr, true); return 0; } static long __fd_array_map_delete_elem(struct bpf_map *map, void *key, bool need_defer) { struct bpf_array *array = container_of(map, struct bpf_array, map); void *old_ptr; u32 index = *(u32 *)key; if (index >= array->map.max_entries) return -E2BIG; if (map->ops->map_poke_run) { mutex_lock(&array->aux->poke_mutex); old_ptr = xchg(array->ptrs + index, NULL); map->ops->map_poke_run(map, index, old_ptr, NULL); mutex_unlock(&array->aux->poke_mutex); } else { old_ptr = xchg(array->ptrs + index, NULL); } if (old_ptr) { map->ops->map_fd_put_ptr(map, old_ptr, need_defer); return 0; } else { return -ENOENT; } } static long fd_array_map_delete_elem(struct bpf_map *map, void *key) { return __fd_array_map_delete_elem(map, key, true); } static void *prog_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { struct bpf_prog *prog = bpf_prog_get(fd); bool is_extended; if (IS_ERR(prog)) return prog; if (prog->type == BPF_PROG_TYPE_EXT || !bpf_prog_map_compatible(map, prog)) { bpf_prog_put(prog); return ERR_PTR(-EINVAL); } mutex_lock(&prog->aux->ext_mutex); is_extended = prog->aux->is_extended; if 
(!is_extended) prog->aux->prog_array_member_cnt++; mutex_unlock(&prog->aux->ext_mutex); if (is_extended) { /* Extended prog can not be tail callee. It's to prevent a * potential infinite loop like: * tail callee prog entry -> tail callee prog subprog -> * freplace prog entry --tailcall-> tail callee prog entry. */ bpf_prog_put(prog); return ERR_PTR(-EBUSY); } return prog; } static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { struct bpf_prog *prog = ptr; mutex_lock(&prog->aux->ext_mutex); prog->aux->prog_array_member_cnt--; mutex_unlock(&prog->aux->ext_mutex); /* bpf_prog is freed after one RCU or tasks trace grace period */ bpf_prog_put(prog); } static u32 prog_fd_array_sys_lookup_elem(void *ptr) { return ((struct bpf_prog *)ptr)->aux->id; } /* decrement refcnt of all bpf_progs that are stored in this map */ static void bpf_fd_array_map_clear(struct bpf_map *map, bool need_defer) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; for (i = 0; i < array->map.max_entries; i++) { __fd_array_map_delete_elem(map, &i, need_defer); cond_resched(); } } static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { void **elem, *ptr; u32 prog_id; rcu_read_lock(); elem = array_map_lookup_elem(map, key); if (elem) { ptr = READ_ONCE(*elem); if (ptr) { seq_printf(m, "%u: ", *(u32 *)key); prog_id = prog_fd_array_sys_lookup_elem(ptr); btf_type_seq_show(map->btf, map->btf_value_type_id, &prog_id, m); seq_putc(m, '\n'); } } rcu_read_unlock(); } struct prog_poke_elem { struct list_head list; struct bpf_prog_aux *aux; }; static int prog_array_map_poke_track(struct bpf_map *map, struct bpf_prog_aux *prog_aux) { struct prog_poke_elem *elem; struct bpf_array_aux *aux; int ret = 0; aux = container_of(map, struct bpf_array, map)->aux; mutex_lock(&aux->poke_mutex); list_for_each_entry(elem, &aux->poke_progs, list) { if (elem->aux == prog_aux) goto out; } elem = kmalloc_obj(*elem); if (!elem) { ret = -ENOMEM; goto out; } INIT_LIST_HEAD(&elem->list); /* We must track the program's aux info at this point in time * since the program pointer itself may not be stable yet, see * also comment in prog_array_map_poke_run(). */ elem->aux = prog_aux; list_add_tail(&elem->list, &aux->poke_progs); out: mutex_unlock(&aux->poke_mutex); return ret; } static void prog_array_map_poke_untrack(struct bpf_map *map, struct bpf_prog_aux *prog_aux) { struct prog_poke_elem *elem, *tmp; struct bpf_array_aux *aux; aux = container_of(map, struct bpf_array, map)->aux; mutex_lock(&aux->poke_mutex); list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) { if (elem->aux == prog_aux) { list_del_init(&elem->list); kfree(elem); break; } } mutex_unlock(&aux->poke_mutex); } void __weak bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, struct bpf_prog *new, struct bpf_prog *old) { WARN_ON_ONCE(1); } static void prog_array_map_poke_run(struct bpf_map *map, u32 key, struct bpf_prog *old, struct bpf_prog *new) { struct prog_poke_elem *elem; struct bpf_array_aux *aux; aux = container_of(map, struct bpf_array, map)->aux; WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex)); list_for_each_entry(elem, &aux->poke_progs, list) { struct bpf_jit_poke_descriptor *poke; int i; for (i = 0; i < elem->aux->size_poke_tab; i++) { poke = &elem->aux->poke_tab[i]; /* Few things to be aware of: * * 1) We can only ever access aux in this context, but * not aux->prog since it might not be stable yet and * there could be danger of use after free otherwise. 
* 2) Initially when we start tracking aux, the program * is not JITed yet and also does not have a kallsyms * entry. We skip these as poke->tailcall_target_stable * is not active yet. The JIT will do the final fixup * before setting it stable. The various * poke->tailcall_target_stable are successively * activated, so tail call updates can arrive from here * while JIT is still finishing its final fixup for * non-activated poke entries. * 3) Also programs reaching refcount of zero while patching * is in progress is okay since we're protected under * poke_mutex and untrack the programs before the JIT * buffer is freed. */ if (!READ_ONCE(poke->tailcall_target_stable)) continue; if (poke->reason != BPF_POKE_REASON_TAIL_CALL) continue; if (poke->tail_call.map != map || poke->tail_call.key != key) continue; bpf_arch_poke_desc_update(poke, new, old); } } } static void prog_array_map_clear_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_array_aux, work)->map; bpf_fd_array_map_clear(map, true); bpf_map_put(map); } static void prog_array_map_clear(struct bpf_map *map) { struct bpf_array_aux *aux = container_of(map, struct bpf_array, map)->aux; bpf_map_inc(map); schedule_work(&aux->work); } static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) { struct bpf_array_aux *aux; struct bpf_map *map; aux = kzalloc_obj(*aux, GFP_KERNEL_ACCOUNT); if (!aux) return ERR_PTR(-ENOMEM); INIT_WORK(&aux->work, prog_array_map_clear_deferred); INIT_LIST_HEAD(&aux->poke_progs); mutex_init(&aux->poke_mutex); map = array_map_alloc(attr); if (IS_ERR(map)) { kfree(aux); return map; } container_of(map, struct bpf_array, map)->aux = aux; aux->map = map; return map; } static void prog_array_map_free(struct bpf_map *map) { struct prog_poke_elem *elem, *tmp; struct bpf_array_aux *aux; aux = container_of(map, struct bpf_array, map)->aux; list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) { list_del_init(&elem->list); kfree(elem); } kfree(aux); fd_array_map_free(map); } /* prog_array->aux->{type,jited} is a runtime binding. * Doing static check alone in the verifier is not enough. * Thus, prog_array_map cannot be used as an inner_map * and map_meta_equal is not implemented. 
*/ const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = prog_array_map_alloc, .map_free = prog_array_map_free, .map_poke_track = prog_array_map_poke_track, .map_poke_untrack = prog_array_map_poke_untrack, .map_poke_run = prog_array_map_poke_run, .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = prog_fd_array_get_ptr, .map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, .map_release_uref = prog_array_map_clear, .map_seq_show_elem = prog_array_map_seq_show_elem, .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, struct file *map_file) { struct bpf_event_entry *ee; ee = kzalloc_obj(*ee); if (ee) { ee->event = perf_file->private_data; ee->perf_file = perf_file; ee->map_file = map_file; } return ee; } static void __bpf_event_entry_free(struct rcu_head *rcu) { struct bpf_event_entry *ee; ee = container_of(rcu, struct bpf_event_entry, rcu); fput(ee->perf_file); kfree(ee); } static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee) { call_rcu(&ee->rcu, __bpf_event_entry_free); } static void *perf_event_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { struct bpf_event_entry *ee; struct perf_event *event; struct file *perf_file; u64 value; perf_file = perf_event_get(fd); if (IS_ERR(perf_file)) return perf_file; ee = ERR_PTR(-EOPNOTSUPP); event = perf_file->private_data; if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP) goto err_out; ee = bpf_event_entry_gen(perf_file, map_file); if (ee) return ee; ee = ERR_PTR(-ENOMEM); err_out: fput(perf_file); return ee; } static void perf_event_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { /* bpf_perf_event is freed after one RCU grace period */ bpf_event_entry_free_rcu(ptr); } static void perf_event_fd_array_release(struct bpf_map *map, struct file *map_file) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_event_entry *ee; int i; if (map->map_flags & BPF_F_PRESERVE_ELEMS) return; rcu_read_lock(); for (i = 0; i < array->map.max_entries; i++) { ee = READ_ONCE(array->ptrs[i]); if (ee && ee->map_file == map_file) __fd_array_map_delete_elem(map, &i, true); } rcu_read_unlock(); } static void perf_event_fd_array_map_free(struct bpf_map *map) { if (map->map_flags & BPF_F_PRESERVE_ELEMS) bpf_fd_array_map_clear(map, false); fd_array_map_free(map); } const struct bpf_map_ops perf_event_array_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = perf_event_fd_array_map_free, .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = perf_event_fd_array_get_ptr, .map_fd_put_ptr = perf_event_fd_array_put_ptr, .map_release = perf_event_fd_array_release, .map_check_btf = map_check_no_btf, .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], }; #ifdef CONFIG_CGROUPS static void *cgroup_fd_array_get_ptr(struct bpf_map *map, struct file *map_file /* not used */, int fd) { return cgroup_get_from_fd(fd); } static void cgroup_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { /* cgroup_put free cgrp after a rcu grace period */ cgroup_put(ptr); } static void 
cgroup_fd_array_free(struct bpf_map *map) { bpf_fd_array_map_clear(map, false); fd_array_map_free(map); } const struct bpf_map_ops cgroup_array_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = cgroup_fd_array_free, .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = cgroup_fd_array_get_ptr, .map_fd_put_ptr = cgroup_fd_array_put_ptr, .map_check_btf = map_check_no_btf, .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], }; #endif static struct bpf_map *array_of_map_alloc(union bpf_attr *attr) { struct bpf_map *map, *inner_map_meta; inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); if (IS_ERR(inner_map_meta)) return inner_map_meta; map = array_map_alloc(attr); if (IS_ERR(map)) { bpf_map_meta_free(inner_map_meta); return map; } map->inner_map_meta = inner_map_meta; return map; } static void array_of_map_free(struct bpf_map *map) { /* map->inner_map_meta is only accessed by syscall which * is protected by fdget/fdput. */ bpf_map_meta_free(map->inner_map_meta); bpf_fd_array_map_clear(map, false); fd_array_map_free(map); } static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_map **inner_map = array_map_lookup_elem(map, key); if (!inner_map) return NULL; return READ_ONCE(*inner_map); } static int array_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 elem_size = array->elem_size; struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; const int map_ptr = BPF_REG_1; const int index = BPF_REG_2; *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); } if (is_power_of_2(elem_size)) *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); else *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size); *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr); *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0); *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); *insn++ = BPF_MOV64_IMM(ret, 0); return insn - insn_buf; } const struct bpf_map_ops array_of_maps_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_of_map_alloc, .map_free = array_of_map_free, .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = array_of_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = bpf_map_fd_get_ptr, .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = array_of_map_gen_lookup, .map_lookup_batch = generic_map_lookup_batch, .map_update_batch = generic_map_update_batch, .map_check_btf = map_check_no_btf, .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], }; |
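/*
 * Illustrative sketch (not part of the sources above): a minimal userspace
 * view of the BPF_MAP_TYPE_ARRAY semantics implemented in arraymap.c,
 * assuming libbpf's <bpf/bpf.h> wrappers are available. The map name
 * "demo_array" and the values are made up; the behaviour shown (u32 index
 * keys, pre-allocated elements, BPF_NOEXIST -> -EEXIST, out-of-range index
 * -> -E2BIG, delete -> -EINVAL) follows array_map_update_elem() and
 * array_map_delete_elem() above.
 */
#include <stdio.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>

int main(void)
{
	__u32 key = 3, val = 42, out = 0;
	int fd, err;

	/* key_size must be sizeof(u32); all 16 elements are allocated up front */
	fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "demo_array",
			    sizeof(__u32), sizeof(__u32), 16, NULL);
	if (fd < 0)
		return 1;

	/* in-range update always succeeds; BPF_NOEXIST would fail (-EEXIST) */
	err = bpf_map_update_elem(fd, &key, &val, BPF_ANY);
	if (!err && !bpf_map_lookup_elem(fd, &key, &out))
		printf("index %u -> %u\n", key, out);	/* index 3 -> 42 */

	/* array slots can never be removed: delete is rejected (EINVAL) */
	if (bpf_map_delete_elem(fd, &key))
		printf("delete rejected, as array_map_delete_elem() dictates\n");

	return 0;
}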
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * VLAN		An implementation of 802.1Q VLAN tagging.
* * Authors: Ben Greear <greearb@candelatech.com> */ #ifndef _LINUX_IF_VLAN_H_ #define _LINUX_IF_VLAN_H_ #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/bug.h> #include <uapi/linux/if_vlan.h> #define VLAN_HLEN 4 /* The additional bytes required by VLAN * (in addition to the Ethernet header) */ #define VLAN_ETH_HLEN 18 /* Total octets in header. */ #define VLAN_ETH_ZLEN 64 /* Min. octets in frame sans FCS */ /* * According to 802.3ac, the packet can be 4 bytes longer. --Klika Jan */ #define VLAN_ETH_DATA_LEN 1500 /* Max. octets in payload */ #define VLAN_ETH_FRAME_LEN 1518 /* Max. octets in frame sans FCS */ #define VLAN_MAX_DEPTH 8 /* Max. number of nested VLAN tags parsed */ /* * struct vlan_hdr - vlan header * @h_vlan_TCI: priority and VLAN ID * @h_vlan_encapsulated_proto: packet type ID or len */ struct vlan_hdr { __be16 h_vlan_TCI; __be16 h_vlan_encapsulated_proto; }; /** * struct vlan_ethhdr - vlan ethernet header (ethhdr + vlan_hdr) * @h_dest: destination ethernet address * @h_source: source ethernet address * @h_vlan_proto: ethernet protocol * @h_vlan_TCI: priority and VLAN ID * @h_vlan_encapsulated_proto: packet type ID or len */ struct vlan_ethhdr { struct_group(addrs, unsigned char h_dest[ETH_ALEN]; unsigned char h_source[ETH_ALEN]; ); __be16 h_vlan_proto; __be16 h_vlan_TCI; __be16 h_vlan_encapsulated_proto; }; #include <linux/skbuff.h> static inline struct vlan_ethhdr *vlan_eth_hdr(const struct sk_buff *skb) { return (struct vlan_ethhdr *)skb_mac_header(skb); } /* Prefer this version in TX path, instead of * skb_reset_mac_header() + vlan_eth_hdr() */ static inline struct vlan_ethhdr *skb_vlan_eth_hdr(const struct sk_buff *skb) { return (struct vlan_ethhdr *)skb->data; } #define VLAN_PRIO_MASK 0xe000 /* Priority Code Point */ #define VLAN_PRIO_SHIFT 13 #define VLAN_CFI_MASK 0x1000 /* Canonical Format Indicator / Drop Eligible Indicator */ #define VLAN_VID_MASK 0x0fff /* VLAN Identifier */ #define VLAN_N_VID 4096 /* found in socket.c */ extern void vlan_ioctl_set(int (*hook)(struct net *, void __user *)); #define skb_vlan_tag_present(__skb) (!!(__skb)->vlan_all) #define skb_vlan_tag_get(__skb) ((__skb)->vlan_tci) #define skb_vlan_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK) #define skb_vlan_tag_get_cfi(__skb) (!!((__skb)->vlan_tci & VLAN_CFI_MASK)) #define skb_vlan_tag_get_prio(__skb) (((__skb)->vlan_tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT) static inline int vlan_get_rx_ctag_filter_info(struct net_device *dev) { ASSERT_RTNL(); return notifier_to_errno(call_netdevice_notifiers(NETDEV_CVLAN_FILTER_PUSH_INFO, dev)); } static inline void vlan_drop_rx_ctag_filter_info(struct net_device *dev) { ASSERT_RTNL(); call_netdevice_notifiers(NETDEV_CVLAN_FILTER_DROP_INFO, dev); } static inline int vlan_get_rx_stag_filter_info(struct net_device *dev) { ASSERT_RTNL(); return notifier_to_errno(call_netdevice_notifiers(NETDEV_SVLAN_FILTER_PUSH_INFO, dev)); } static inline void vlan_drop_rx_stag_filter_info(struct net_device *dev) { ASSERT_RTNL(); call_netdevice_notifiers(NETDEV_SVLAN_FILTER_DROP_INFO, dev); } /** * struct vlan_pcpu_stats - VLAN percpu rx/tx stats * @rx_packets: number of received packets * @rx_bytes: number of received bytes * @rx_multicast: number of received multicast packets * @tx_packets: number of transmitted packets * @tx_bytes: number of transmitted bytes * @syncp: synchronization point for 64bit counters * @rx_errors: number of rx errors * @tx_dropped: number of tx drops */ struct vlan_pcpu_stats { 
u64_stats_t rx_packets; u64_stats_t rx_bytes; u64_stats_t rx_multicast; u64_stats_t tx_packets; u64_stats_t tx_bytes; struct u64_stats_sync syncp; u32 rx_errors; u32 tx_dropped; }; #if IS_ENABLED(CONFIG_VLAN_8021Q) extern struct net_device *__vlan_find_dev_deep_rcu(struct net_device *real_dev, __be16 vlan_proto, u16 vlan_id); extern int vlan_for_each(struct net_device *dev, int (*action)(struct net_device *dev, int vid, void *arg), void *arg); extern struct net_device *vlan_dev_real_dev(const struct net_device *dev); extern u16 vlan_dev_vlan_id(const struct net_device *dev); extern __be16 vlan_dev_vlan_proto(const struct net_device *dev); /** * struct vlan_priority_tci_mapping - vlan egress priority mappings * @priority: skb priority * @vlan_qos: vlan priority: (skb->priority << 13) & 0xE000 * @next: pointer to next struct */ struct vlan_priority_tci_mapping { u32 priority; u16 vlan_qos; struct vlan_priority_tci_mapping *next; }; struct proc_dir_entry; struct netpoll; /** * struct vlan_dev_priv - VLAN private device data * @nr_ingress_mappings: number of ingress priority mappings * @ingress_priority_map: ingress priority mappings * @nr_egress_mappings: number of egress priority mappings * @egress_priority_map: hash of egress priority mappings * @vlan_proto: VLAN encapsulation protocol * @vlan_id: VLAN identifier * @flags: device flags * @real_dev: underlying netdevice * @dev_tracker: refcount tracker for @real_dev reference * @real_dev_addr: address of underlying netdevice * @dent: proc dir entry * @vlan_pcpu_stats: ptr to percpu rx stats * @netpoll: netpoll instance "propagated" down to @real_dev */ struct vlan_dev_priv { unsigned int nr_ingress_mappings; u32 ingress_priority_map[8]; unsigned int nr_egress_mappings; struct vlan_priority_tci_mapping *egress_priority_map[16]; __be16 vlan_proto; u16 vlan_id; u16 flags; struct net_device *real_dev; netdevice_tracker dev_tracker; unsigned char real_dev_addr[ETH_ALEN]; struct proc_dir_entry *dent; struct vlan_pcpu_stats __percpu *vlan_pcpu_stats; #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *netpoll; #endif }; static inline bool is_vlan_dev(const struct net_device *dev) { return dev->priv_flags & IFF_802_1Q_VLAN; } static inline struct vlan_dev_priv *vlan_dev_priv(const struct net_device *dev) { return netdev_priv(dev); } static inline u16 vlan_dev_get_egress_qos_mask(struct net_device *dev, u32 skprio) { struct vlan_priority_tci_mapping *mp; smp_rmb(); /* coupled with smp_wmb() in vlan_dev_set_egress_priority() */ mp = vlan_dev_priv(dev)->egress_priority_map[(skprio & 0xF)]; while (mp) { if (mp->priority == skprio) { return mp->vlan_qos; /* This should already be shifted * to mask correctly with the * VLAN's TCI */ } mp = mp->next; } return 0; } extern bool vlan_do_receive(struct sk_buff **skb); extern int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid); extern void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid); extern int vlan_vids_add_by_dev(struct net_device *dev, const struct net_device *by_dev); extern void vlan_vids_del_by_dev(struct net_device *dev, const struct net_device *by_dev); extern bool vlan_uses_dev(const struct net_device *dev); #else static inline bool is_vlan_dev(const struct net_device *dev) { return false; } static inline struct net_device * __vlan_find_dev_deep_rcu(struct net_device *real_dev, __be16 vlan_proto, u16 vlan_id) { return NULL; } static inline int vlan_for_each(struct net_device *dev, int (*action)(struct net_device *dev, int vid, void *arg), void *arg) { return 0; } static inline 
struct net_device *vlan_dev_real_dev(const struct net_device *dev) { WARN_ON_ONCE(1); return NULL; } static inline u16 vlan_dev_vlan_id(const struct net_device *dev) { WARN_ON_ONCE(1); return 0; } static inline __be16 vlan_dev_vlan_proto(const struct net_device *dev) { WARN_ON_ONCE(1); return 0; } static inline u16 vlan_dev_get_egress_qos_mask(struct net_device *dev, u32 skprio) { return 0; } static inline bool vlan_do_receive(struct sk_buff **skb) { return false; } static inline int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid) { return 0; } static inline void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid) { } static inline int vlan_vids_add_by_dev(struct net_device *dev, const struct net_device *by_dev) { return 0; } static inline void vlan_vids_del_by_dev(struct net_device *dev, const struct net_device *by_dev) { } static inline bool vlan_uses_dev(const struct net_device *dev) { return false; } #endif /** * eth_type_vlan - check for valid vlan ether type. * @ethertype: ether type to check * * Returns: true if the ether type is a vlan ether type. */ static inline bool eth_type_vlan(__be16 ethertype) { switch (ethertype) { case htons(ETH_P_8021Q): case htons(ETH_P_8021AD): return true; default: return false; } } static inline bool vlan_hw_offload_capable(netdev_features_t features, __be16 proto) { if (proto == htons(ETH_P_8021Q) && features & NETIF_F_HW_VLAN_CTAG_TX) return true; if (proto == htons(ETH_P_8021AD) && features & NETIF_F_HW_VLAN_STAG_TX) return true; return false; } /** * __vlan_insert_inner_tag - inner VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * @mac_len: MAC header length including outer vlan headers * * Inserts the VLAN tag into @skb as part of the payload at offset mac_len * Does not change skb->protocol so this function can be used during receive. * * Returns: error if skb_cow_head fails. */ static inline int __vlan_insert_inner_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci, unsigned int mac_len) { const u8 meta_len = mac_len > ETH_TLEN ? skb_metadata_len(skb) : 0; struct vlan_ethhdr *veth; if (skb_cow_head(skb, meta_len + VLAN_HLEN) < 0) return -ENOMEM; skb_push(skb, VLAN_HLEN); /* Move the mac header sans proto to the beginning of the new header. */ if (likely(mac_len > ETH_TLEN)) skb_postpush_data_move(skb, VLAN_HLEN, mac_len - ETH_TLEN); if (skb_mac_header_was_set(skb)) skb->mac_header -= VLAN_HLEN; veth = (struct vlan_ethhdr *)(skb->data + mac_len - ETH_HLEN); /* first, the ethernet type */ if (likely(mac_len >= ETH_TLEN)) { /* h_vlan_encapsulated_proto should already be populated, and * skb->data has space for h_vlan_proto */ veth->h_vlan_proto = vlan_proto; } else { /* h_vlan_encapsulated_proto should not be populated, and * skb->data has no space for h_vlan_proto */ veth->h_vlan_encapsulated_proto = skb->protocol; } /* now, the TCI */ veth->h_vlan_TCI = htons(vlan_tci); return 0; } /** * __vlan_insert_tag - regular VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * * Inserts the VLAN tag into @skb as part of the payload * Does not change skb->protocol so this function can be used during receive. * * Returns: error if skb_cow_head fails. 
*/ static inline int __vlan_insert_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { return __vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, ETH_HLEN); } /** * vlan_insert_inner_tag - inner VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * @mac_len: MAC header length including outer vlan headers * * Inserts the VLAN tag into @skb as part of the payload at offset mac_len * Returns a VLAN tagged skb. This might change skb->head. * * Following the skb_unshare() example, in case of error, the calling function * doesn't have to worry about freeing the original skb. * * Does not change skb->protocol so this function can be used during receive. * * Return: modified @skb on success, NULL on error (@skb is freed). */ static inline struct sk_buff *vlan_insert_inner_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci, unsigned int mac_len) { int err; err = __vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, mac_len); if (err) { dev_kfree_skb_any(skb); return NULL; } return skb; } /** * vlan_insert_tag - regular VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * * Inserts the VLAN tag into @skb as part of the payload * Returns a VLAN tagged skb. This might change skb->head. * * Following the skb_unshare() example, in case of error, the calling function * doesn't have to worry about freeing the original skb. * * Does not change skb->protocol so this function can be used during receive. * * Return: modified @skb on success, NULL on error (@skb is freed). */ static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { return vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, ETH_HLEN); } /** * vlan_insert_tag_set_proto - regular VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * * Inserts the VLAN tag into @skb as part of the payload * Returns a VLAN tagged skb. This might change skb->head. * * Following the skb_unshare() example, in case of error, the calling function * doesn't have to worry about freeing the original skb. * * Return: modified @skb on success, NULL on error (@skb is freed). */ static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { skb = vlan_insert_tag(skb, vlan_proto, vlan_tci); if (skb) skb->protocol = vlan_proto; return skb; } /** * __vlan_hwaccel_clear_tag - clear hardware accelerated VLAN info * @skb: skbuff to clear * * Clears the VLAN information from @skb */ static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb) { skb->vlan_all = 0; } /** * __vlan_hwaccel_copy_tag - copy hardware accelerated VLAN info from another skb * @dst: skbuff to copy to * @src: skbuff to copy from * * Copies VLAN information from @src to @dst (for branchless code) */ static inline void __vlan_hwaccel_copy_tag(struct sk_buff *dst, const struct sk_buff *src) { dst->vlan_all = src->vlan_all; } /* * __vlan_hwaccel_push_inside - pushes vlan tag to the payload * @skb: skbuff to tag * * Pushes the VLAN tag from @skb->vlan_tci inside to the payload. * * Following the skb_unshare() example, in case of error, the calling function * doesn't have to worry about freeing the original skb. 
*/ static inline struct sk_buff *__vlan_hwaccel_push_inside(struct sk_buff *skb) { skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, skb_vlan_tag_get(skb)); if (likely(skb)) __vlan_hwaccel_clear_tag(skb); return skb; } /** * __vlan_hwaccel_put_tag - hardware accelerated VLAN inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * * Puts the VLAN TCI in @skb->vlan_tci and lets the device do the rest */ static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { skb->vlan_proto = vlan_proto; skb->vlan_tci = vlan_tci; } /** * __vlan_get_tag - get the VLAN ID that is part of the payload * @skb: skbuff to query * @vlan_tci: buffer to store value * * Returns: error if the skb is not of VLAN type */ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { struct vlan_ethhdr *veth = skb_vlan_eth_hdr(skb); if (!eth_type_vlan(veth->h_vlan_proto)) return -ENODATA; *vlan_tci = ntohs(veth->h_vlan_TCI); return 0; } /** * __vlan_hwaccel_get_tag - get the VLAN ID that is in @skb->cb[] * @skb: skbuff to query * @vlan_tci: buffer to store value * * Returns: error if @skb->vlan_tci is not set correctly */ static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { if (skb_vlan_tag_present(skb)) { *vlan_tci = skb_vlan_tag_get(skb); return 0; } else { *vlan_tci = 0; return -ENODATA; } } /** * vlan_get_tag - get the VLAN ID from the skb * @skb: skbuff to query * @vlan_tci: buffer to store value * * Returns: error if the skb is not VLAN tagged */ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { if (skb->dev->features & NETIF_F_HW_VLAN_CTAG_TX) { return __vlan_hwaccel_get_tag(skb, vlan_tci); } else { return __vlan_get_tag(skb, vlan_tci); } } struct vlan_type_depth { __be16 type; u16 depth; }; struct vlan_type_depth __vlan_get_protocol_offset(const struct sk_buff *skb, __be16 type, int mac_offset); /** * vlan_get_protocol_offset_inline() - get protocol EtherType. * @skb: skbuff to query * @type: first vlan protocol * @mac_offset: MAC offset * @depth: buffer to store length of eth and vlan tags in bytes * * Returns: the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ static inline __be16 vlan_get_protocol_offset_inline(const struct sk_buff *skb, __be16 type, int mac_offset, int *depth) { if (eth_type_vlan(type)) { struct vlan_type_depth res; res = __vlan_get_protocol_offset(skb, type, mac_offset); if (depth && res.type) *depth = res.depth; return res.type; } if (depth) *depth = skb->mac_len; return type; } static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, int *depth) { return vlan_get_protocol_offset_inline(skb, type, 0, depth); } /** * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query * * Returns: the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. 
*/ static inline __be16 vlan_get_protocol(const struct sk_buff *skb) { return __vlan_get_protocol(skb, skb->protocol, NULL); } /* This version of __vlan_get_protocol() also pulls mac header in skb->head */ static inline __be16 vlan_get_protocol_and_depth(struct sk_buff *skb, __be16 type, int *depth) { int maclen; type = __vlan_get_protocol(skb, type, &maclen); if (type) { if (!pskb_may_pull(skb, maclen)) type = 0; else if (depth) *depth = maclen; } return type; } /* A getter for the SKB protocol field which will handle VLAN tags consistently * whether VLAN acceleration is enabled or not. */ static inline __be16 skb_protocol(const struct sk_buff *skb, bool skip_vlan) { if (!skip_vlan) /* VLAN acceleration strips the VLAN header from the skb and * moves it to skb->vlan_proto */ return skb_vlan_tag_present(skb) ? skb->vlan_proto : skb->protocol; return vlan_get_protocol(skb); } static inline void vlan_set_encap_proto(struct sk_buff *skb, struct vlan_hdr *vhdr) { __be16 proto; unsigned short *rawp; /* * Was a VLAN packet, grab the encapsulated protocol, which the layer * three protocols care about. */ proto = vhdr->h_vlan_encapsulated_proto; if (eth_proto_is_802_3(proto)) { skb->protocol = proto; return; } rawp = (unsigned short *)(vhdr + 1); if (*rawp == 0xFFFF) /* * This is a magic hack to spot IPX packets. Older Novell * breaks the protocol design and runs IPX over 802.3 without * an 802.2 LLC layer. We look for FFFF which isn't a used * 802.2 SSAP/DSAP. This won't work for fault tolerant netware * but does for the rest. */ skb->protocol = htons(ETH_P_802_3); else /* * Real 802.2 LLC */ skb->protocol = htons(ETH_P_802_2); } /** * vlan_remove_tag - remove outer VLAN tag from payload * @skb: skbuff to remove tag from * @vlan_tci: buffer to store value * * Expects the skb to contain a VLAN tag in the payload, and to have skb->data * pointing at the MAC header. */ static inline void vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci) { struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); *vlan_tci = ntohs(vhdr->h_vlan_TCI); vlan_set_encap_proto(skb, vhdr); __skb_pull(skb, VLAN_HLEN); skb_postpull_data_move(skb, VLAN_HLEN, 2 * ETH_ALEN); } /** * skb_vlan_tagged - check if skb is vlan tagged. * @skb: skbuff to query * * Returns: true if the skb is tagged, regardless of whether it is hardware * accelerated or not. */ static inline bool skb_vlan_tagged(const struct sk_buff *skb) { if (!skb_vlan_tag_present(skb) && likely(!eth_type_vlan(skb->protocol))) return false; return true; } /** * skb_vlan_tagged_multi - check if skb is vlan tagged with multiple headers. * @skb: skbuff to query * * Returns: true if the skb is tagged with multiple vlan headers, regardless * of whether it is hardware accelerated or not. */ static inline bool skb_vlan_tagged_multi(struct sk_buff *skb) { __be16 protocol = skb->protocol; if (!skb_vlan_tag_present(skb)) { struct vlan_ethhdr *veh; if (likely(!eth_type_vlan(protocol))) return false; if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN))) return false; veh = skb_vlan_eth_hdr(skb); protocol = veh->h_vlan_encapsulated_proto; } if (!eth_type_vlan(protocol)) return false; return true; } /** * vlan_features_check - drop unsafe features for skb with multiple tags. * @skb: skbuff to query * @features: features to be checked * * Returns: features without unsafe ones if the skb has multiple tags. 
 */
static inline netdev_features_t vlan_features_check(struct sk_buff *skb,
						    netdev_features_t features)
{
	if (skb_vlan_tagged_multi(skb)) {
		/* In the case of multi-tagged packets, use a direct mask
		 * instead of using netdev_intersect_features(), to make
		 * sure that only devices supporting NETIF_F_HW_CSUM will
		 * have checksum offloading support.
		 */
		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_HW_CSUM |
			    NETIF_F_FRAGLIST | NETIF_F_HW_VLAN_CTAG_TX |
			    NETIF_F_HW_VLAN_STAG_TX;
	}

	return features;
}

/**
 * compare_vlan_header - Compare two vlan headers
 * @h1: Pointer to vlan header
 * @h2: Pointer to vlan header
 *
 * Compare two vlan headers.
 *
 * Please note that alignment of h1 & h2 is only guaranteed to be 16 bits.
 *
 * Return: 0 if equal, arbitrary non-zero value if not equal.
 */
static inline unsigned long compare_vlan_header(const struct vlan_hdr *h1,
						const struct vlan_hdr *h2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
	return *(u32 *)h1 ^ *(u32 *)h2;
#else
	return ((__force u32)h1->h_vlan_TCI ^ (__force u32)h2->h_vlan_TCI) |
	       ((__force u32)h1->h_vlan_encapsulated_proto ^
		(__force u32)h2->h_vlan_encapsulated_proto);
#endif
}

#endif /* !(_LINUX_IF_VLAN_H_) */
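/*
 * Illustrative sketch (not part of this header): how a hypothetical driver
 * without hardware VLAN insertion might use the helpers above on its TX
 * path. my_dev_start_xmit() and its device are invented names; only
 * skb_vlan_tag_present(), vlan_hw_offload_capable(),
 * __vlan_hwaccel_push_inside() and vlan_get_protocol() come from this file.
 */
static netdev_tx_t my_dev_start_xmit(struct sk_buff *skb,
				     struct net_device *dev)
{
	/* The stack may leave the tag in skb->vlan_tci/vlan_proto. If this
	 * device cannot insert it in hardware, push it back into the
	 * payload; on allocation failure the skb has already been freed.
	 */
	if (skb_vlan_tag_present(skb) &&
	    !vlan_hw_offload_capable(dev->features, skb->vlan_proto)) {
		skb = __vlan_hwaccel_push_inside(skb);
		if (!skb)
			return NETDEV_TX_OK;
	}

	/* Regardless of how the frame is tagged, vlan_get_protocol() yields
	 * the encapsulated EtherType, which drivers typically use to pick
	 * checksum/TSO handling.
	 */
	if (vlan_get_protocol(skb) == htons(ETH_P_IP)) {
		/* e.g. program IPv4 checksum offload descriptors here */
	}

	/* ... queue the skb to the hardware ring and return ... */
	return NETDEV_TX_OK;
}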
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *	Berkeley style UIO structures	-	Alan Cox 1994.
 */
#ifndef __LINUX_UIO_H
#define __LINUX_UIO_H

#include <linux/kernel.h>
#include <linux/mm_types.h>
#include <linux/ucopysize.h>
#include <uapi/linux/uio.h>

struct page;
struct folio_queue;

typedef unsigned int __bitwise iov_iter_extraction_t;

struct kvec {
	void *iov_base; /* and that should *never* hold a userland pointer */
	size_t iov_len;
};

enum iter_type {
	/* iter types */
	ITER_UBUF,
	ITER_IOVEC,
	ITER_BVEC,
	ITER_KVEC,
	ITER_FOLIOQ,
	ITER_XARRAY,
	ITER_DISCARD,
};

#define ITER_SOURCE	1	// == WRITE
#define ITER_DEST	0	// == READ

struct iov_iter_state {
	size_t iov_offset;
	size_t count;
	unsigned long nr_segs;
};

struct iov_iter {
	u8 iter_type;
	bool nofault;
	bool data_source;
	size_t iov_offset;
	/*
	 * Hack alert: overlay ubuf_iovec with iovec + count, so
	 * that the members resolve correctly regardless of the type
	 * of iterator used. This means that you can use:
	 *
	 * &iter->__ubuf_iovec or iter->__iov
	 *
	 * interchangeably for the user_backed cases, hence simplifying
	 * some of the cases that need to deal with both.
	 */
	union {
		/*
		 * This really should be a const, but we cannot do that without
		 * also modifying any of the zero-filling iter init functions.
		 * Leave it non-const for now, but it should be treated as such.
*/ struct iovec __ubuf_iovec; struct { union { /* use iter_iov() to get the current vec */ const struct iovec *__iov; const struct kvec *kvec; const struct bio_vec *bvec; const struct folio_queue *folioq; struct xarray *xarray; void __user *ubuf; }; size_t count; }; }; union { unsigned long nr_segs; u8 folioq_slot; loff_t xarray_start; }; }; typedef __u16 uio_meta_flags_t; struct uio_meta { uio_meta_flags_t flags; u16 app_tag; u64 seed; struct iov_iter iter; }; static inline const struct iovec *iter_iov(const struct iov_iter *iter) { if (iter->iter_type == ITER_UBUF) return (const struct iovec *) &iter->__ubuf_iovec; return iter->__iov; } #define iter_iov_addr(iter) (iter_iov(iter)->iov_base + (iter)->iov_offset) static inline size_t iter_iov_len(const struct iov_iter *i) { if (i->iter_type == ITER_UBUF) return i->count; return iter_iov(i)->iov_len - i->iov_offset; } static inline enum iter_type iov_iter_type(const struct iov_iter *i) { return i->iter_type; } static inline void iov_iter_save_state(struct iov_iter *iter, struct iov_iter_state *state) { state->iov_offset = iter->iov_offset; state->count = iter->count; state->nr_segs = iter->nr_segs; } static inline bool iter_is_ubuf(const struct iov_iter *i) { return iov_iter_type(i) == ITER_UBUF; } static inline bool iter_is_iovec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_IOVEC; } static inline bool iov_iter_is_kvec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_KVEC; } static inline bool iov_iter_is_bvec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_BVEC; } static inline bool iov_iter_is_discard(const struct iov_iter *i) { return iov_iter_type(i) == ITER_DISCARD; } static inline bool iov_iter_is_folioq(const struct iov_iter *i) { return iov_iter_type(i) == ITER_FOLIOQ; } static inline bool iov_iter_is_xarray(const struct iov_iter *i) { return iov_iter_type(i) == ITER_XARRAY; } static inline unsigned char iov_iter_rw(const struct iov_iter *i) { return i->data_source ? WRITE : READ; } static inline bool user_backed_iter(const struct iov_iter *i) { return iter_is_ubuf(i) || iter_is_iovec(i); } /* * Total number of bytes covered by an iovec. * * NOTE that it is not safe to use this function until all the iovec's * segment lengths have been validated. Because the individual lengths can * overflow a size_t when added together. 
*/ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs) { unsigned long seg; size_t ret = 0; for (seg = 0; seg < nr_segs; seg++) ret += iov[seg].iov_len; return ret; } void iov_iter_advance(struct iov_iter *i, size_t bytes); void iov_iter_revert(struct iov_iter *i, size_t bytes); size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes); size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t bytes); size_t iov_iter_single_seg_count(const struct iov_iter *i); size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); size_t copy_folio_from_iter_atomic(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i); size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i); size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i); size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i); static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i) { return copy_page_to_iter(&folio->page, offset, bytes, i); } static inline size_t copy_folio_from_iter(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i) { return copy_page_from_iter(&folio->page, offset, bytes, i); } size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes, struct iov_iter *i); static __always_inline __must_check size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (check_copy_size(addr, bytes, true)) return _copy_to_iter(addr, bytes, i); return 0; } static __always_inline __must_check size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { if (check_copy_size(addr, bytes, false)) return _copy_from_iter(addr, bytes, i); return 0; } static __always_inline __must_check bool copy_to_iter_full(const void *addr, size_t bytes, struct iov_iter *i) { size_t copied = copy_to_iter(addr, bytes, i); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } static __always_inline __must_check bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i) { size_t copied = copy_from_iter(addr, bytes, i); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } static __always_inline __must_check size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { if (check_copy_size(addr, bytes, false)) return _copy_from_iter_nocache(addr, bytes, i); return 0; } static __always_inline __must_check bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i) { size_t copied = copy_from_iter_nocache(addr, bytes, i); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE /* * Note, users like pmem that depend on the stricter semantics of * _copy_from_iter_flushcache() than _copy_from_iter_nocache() must check for * IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) before assuming that the * destination is flushed from the cache on return. 
*/ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i); #else #define _copy_from_iter_flushcache _copy_from_iter_nocache #endif #ifdef CONFIG_ARCH_HAS_COPY_MC size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i); #else #define _copy_mc_to_iter _copy_to_iter #endif size_t iov_iter_zero(size_t bytes, struct iov_iter *); unsigned long iov_iter_alignment(const struct iov_iter *i); unsigned long iov_iter_gap_alignment(const struct iov_iter *i); void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, size_t count); void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec, unsigned long nr_segs, size_t count); void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count); void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count); void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction, const struct folio_queue *folioq, unsigned int first_slot, unsigned int offset, size_t count); void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray, loff_t start, size_t count); ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start); ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start); int iov_iter_npages(const struct iov_iter *i, int maxpages); void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags); static inline size_t iov_iter_count(const struct iov_iter *i) { return i->count; } /* * Cap the iov_iter by given limit; note that the second argument is * *not* the new size - it's upper limit for such. Passing it a value * greater than the amount of data in iov_iter is fine - it'll just do * nothing in that case. */ static inline void iov_iter_truncate(struct iov_iter *i, u64 count) { /* * count doesn't have to fit in size_t - comparison extends both * operands to u64 here and any value that would be truncated by * conversion in assignement is by definition greater than all * values of size_t, including old i->count. */ if (i->count > count) i->count = count; } /* * reexpand a previously truncated iterator; count must be no more than how much * we had shrunk it. 
 */
static inline void iov_iter_reexpand(struct iov_iter *i, size_t count)
{
	i->count = count;
}

static inline int
iov_iter_npages_cap(struct iov_iter *i, int maxpages, size_t max_bytes)
{
	size_t shorted = 0;
	int npages;

	if (iov_iter_count(i) > max_bytes) {
		shorted = iov_iter_count(i) - max_bytes;
		iov_iter_truncate(i, max_bytes);
	}
	npages = iov_iter_npages(i, maxpages);
	if (shorted)
		iov_iter_reexpand(i, iov_iter_count(i) + shorted);

	return npages;
}

struct iovec *iovec_from_user(const struct iovec __user *uvector,
		unsigned long nr_segs, unsigned long fast_segs,
		struct iovec *fast_iov, bool compat);
ssize_t import_iovec(int type, const struct iovec __user *uvec,
		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
		 struct iov_iter *i);
ssize_t __import_iovec(int type, const struct iovec __user *uvec,
		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
		 struct iov_iter *i, bool compat);
int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i);

static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
			void __user *buf, size_t count)
{
	WARN_ON(direction & ~(READ | WRITE));
	*i = (struct iov_iter) {
		.iter_type = ITER_UBUF,
		.data_source = direction,
		.ubuf = buf,
		.count = count,
		.nr_segs = 1
	};
}

/* Flags for iov_iter_get/extract_pages*() */
/* Allow P2PDMA on the extracted pages */
#define ITER_ALLOW_P2PDMA	((__force iov_iter_extraction_t)0x01)

ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages,
			       size_t maxsize, unsigned int maxpages,
			       iov_iter_extraction_t extraction_flags,
			       size_t *offset0);
ssize_t iov_iter_extract_bvecs(struct iov_iter *iter, struct bio_vec *bv,
			       size_t max_size, unsigned short *nr_vecs,
			       unsigned short max_vecs,
			       iov_iter_extraction_t extraction_flags);

/**
 * iov_iter_extract_will_pin - Indicate how pages from the iterator will be retained
 * @iter: The iterator
 *
 * Examine the iterator and indicate by returning true or false as to how, if
 * at all, pages extracted from the iterator will be retained by the extraction
 * function.
 *
 * %true indicates that the pages will have a pin placed in them that the
 * caller must unpin.  This must be done for DMA/async DIO to force fork()
 * to forcibly copy a page for the child (the parent must retain the original
 * page).
 *
 * %false indicates that no measures are taken and that it's up to the caller
 * to retain the pages.
 */
static inline bool iov_iter_extract_will_pin(const struct iov_iter *iter)
{
	return user_backed_iter(iter);
}

struct sg_table;
ssize_t extract_iter_to_sg(struct iov_iter *iter, size_t len,
			   struct sg_table *sgtable, unsigned int sg_max,
			   iov_iter_extraction_t extraction_flags);

#endif
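/*
 * Illustrative sketch, not part of the header above: a minimal example of
 * filling a kernel buffer through an iov_iter, using only interfaces declared
 * in this header.  The function name, buffer and length are hypothetical.
 */
static int example_fill_kvec(void *dst, size_t len, const void *src)
{
	struct kvec kv = { .iov_base = dst, .iov_len = len };
	struct iov_iter iter;

	/* ITER_DEST: data will be copied *into* the iterator's segment. */
	iov_iter_kvec(&iter, ITER_DEST, &kv, 1, len);

	/* copy_to_iter() returns the number of bytes actually copied. */
	if (copy_to_iter(src, len, &iter) != len)
		return -EFAULT;
	return 0;
}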
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * async.h: Asynchronous function calls for boot performance
 *
 * (C) Copyright 2009 Intel Corporation
 * Author: Arjan van de Ven <arjan@linux.intel.com>
 */
#ifndef __ASYNC_H__
#define __ASYNC_H__

#include <linux/types.h>
#include <linux/list.h>
#include <linux/numa.h>
#include <linux/device.h>

typedef u64 async_cookie_t;
typedef void (*async_func_t) (void *data, async_cookie_t cookie);

struct async_domain {
	struct list_head pending;
	unsigned registered:1;
};

/*
 * domain participates in global async_synchronize_full
 */
#define ASYNC_DOMAIN(_name) \
	struct async_domain _name = { .pending = LIST_HEAD_INIT(_name.pending), \
				      .registered = 1 }

/*
 * domain is free to go out of scope as soon as all pending work is
 * complete, this domain does not participate in async_synchronize_full
 */
#define ASYNC_DOMAIN_EXCLUSIVE(_name) \
	struct async_domain _name = { .pending = LIST_HEAD_INIT(_name.pending), \
				      .registered = 0 }

async_cookie_t async_schedule_node(async_func_t func, void *data, int node);
async_cookie_t async_schedule_node_domain(async_func_t func, void *data,
					   int node,
					   struct async_domain *domain);

/**
 * async_schedule - schedule a function for asynchronous execution
 * @func: function to execute asynchronously
 * @data: data pointer to pass to the function
 *
 * Returns an async_cookie_t that may be used for checkpointing later.
 * Note: This function may be called from atomic or non-atomic contexts.
 */
static inline async_cookie_t async_schedule(async_func_t func, void *data)
{
	return async_schedule_node(func, data, NUMA_NO_NODE);
}

/**
 * async_schedule_domain - schedule a function for asynchronous execution within a certain domain
 * @func: function to execute asynchronously
 * @data: data pointer to pass to the function
 * @domain: the domain
 *
 * Returns an async_cookie_t that may be used for checkpointing later.
 * @domain may be used in the async_synchronize_*_domain() functions to
 * wait within a certain synchronization domain rather than globally.
 * Note: This function may be called from atomic or non-atomic contexts.
 */
static inline async_cookie_t
async_schedule_domain(async_func_t func, void *data,
		      struct async_domain *domain)
{
	return async_schedule_node_domain(func, data, NUMA_NO_NODE, domain);
}

/**
 * async_schedule_dev - A device specific version of async_schedule
 * @func: function to execute asynchronously
 * @dev: device argument to be passed to function
 *
 * Returns an async_cookie_t that may be used for checkpointing later.
 * @dev is used as both the argument for the function and to provide NUMA
 * context for where to run the function. By doing this we can try to
 * provide for the best possible outcome by operating on the device on the
 * CPUs closest to the device.
 * Note: This function may be called from atomic or non-atomic contexts.
 */
static inline async_cookie_t
async_schedule_dev(async_func_t func, struct device *dev)
{
	return async_schedule_node(func, dev, dev_to_node(dev));
}

bool async_schedule_dev_nocall(async_func_t func, struct device *dev);

/**
 * async_schedule_dev_domain - A device specific version of async_schedule_domain
 * @func: function to execute asynchronously
 * @dev: device argument to be passed to function
 * @domain: the domain
 *
 * Returns an async_cookie_t that may be used for checkpointing later.
 * @dev is used as both the argument for the function and to provide NUMA
 * context for where to run the function. By doing this we can try to
 * provide for the best possible outcome by operating on the device on the
 * CPUs closest to the device.
 * @domain may be used in the async_synchronize_*_domain() functions to
 * wait within a certain synchronization domain rather than globally.
 * Note: This function may be called from atomic or non-atomic contexts.
 */
static inline async_cookie_t
async_schedule_dev_domain(async_func_t func, struct device *dev,
			  struct async_domain *domain)
{
	return async_schedule_node_domain(func, dev, dev_to_node(dev), domain);
}

extern void async_synchronize_full(void);
extern void async_synchronize_full_domain(struct async_domain *domain);
extern void async_synchronize_cookie(async_cookie_t cookie);
extern void async_synchronize_cookie_domain(async_cookie_t cookie,
					    struct async_domain *domain);
extern bool current_is_async(void);
extern void async_init(void);

#endif
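/*
 * Illustrative sketch, not part of the header above: scheduling work in a
 * private async domain and later waiting for it, using only interfaces
 * declared in this header.  The example_* names and the 'hw' argument are
 * hypothetical.
 */
static ASYNC_DOMAIN_EXCLUSIVE(example_async_domain);

static void example_async_probe(void *data, async_cookie_t cookie)
{
	/* slow, independent initialisation of 'data' would run here */
}

static void example_driver_init(void *hw)
{
	async_schedule_domain(example_async_probe, hw, &example_async_domain);

	/* ... other setup can proceed concurrently ... */

	/* Wait only for work queued in this domain, not for global async work. */
	async_synchronize_full_domain(&example_async_domain);
}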
| 32 33 30 32 29 24 26 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 | // SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> #include <linux/vmalloc.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/idr.h> #include <linux/namei.h> #include <linux/user_namespace.h> #include <linux/security.h> static bool bpf_ns_capable(struct user_namespace *ns, int cap) { return ns_capable(ns, cap) || (cap != CAP_SYS_ADMIN && ns_capable(ns, CAP_SYS_ADMIN)); } bool bpf_token_capable(const struct bpf_token *token, int cap) { struct user_namespace *userns; /* BPF token allows ns_capable() level of capabilities */ userns = token ? token->userns : &init_user_ns; if (!bpf_ns_capable(userns, cap)) return false; if (token && security_bpf_token_capable(token, cap) < 0) return false; return true; } void bpf_token_inc(struct bpf_token *token) { atomic64_inc(&token->refcnt); } static void bpf_token_free(struct bpf_token *token) { security_bpf_token_free(token); put_user_ns(token->userns); kfree(token); } static void bpf_token_put_deferred(struct work_struct *work) { struct bpf_token *token = container_of(work, struct bpf_token, work); bpf_token_free(token); } void bpf_token_put(struct bpf_token *token) { if (!token) return; if (!atomic64_dec_and_test(&token->refcnt)) return; INIT_WORK(&token->work, bpf_token_put_deferred); schedule_work(&token->work); } static int bpf_token_release(struct inode *inode, struct file *filp) { struct bpf_token *token = filp->private_data; bpf_token_put(token); return 0; } static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp) { struct bpf_token *token = filp->private_data; u64 mask; BUILD_BUG_ON(__MAX_BPF_CMD >= 64); mask = BIT_ULL(__MAX_BPF_CMD) - 1; if ((token->allowed_cmds & mask) == mask) seq_printf(m, "allowed_cmds:\tany\n"); else seq_printf(m, "allowed_cmds:\t0x%llx\n", token->allowed_cmds); BUILD_BUG_ON(__MAX_BPF_MAP_TYPE >= 64); mask = BIT_ULL(__MAX_BPF_MAP_TYPE) - 1; if ((token->allowed_maps & mask) == mask) seq_printf(m, "allowed_maps:\tany\n"); else seq_printf(m, "allowed_maps:\t0x%llx\n", token->allowed_maps); BUILD_BUG_ON(__MAX_BPF_PROG_TYPE >= 64); mask = BIT_ULL(__MAX_BPF_PROG_TYPE) - 1; if ((token->allowed_progs & mask) == mask) seq_printf(m, "allowed_progs:\tany\n"); else seq_printf(m, "allowed_progs:\t0x%llx\n", token->allowed_progs); BUILD_BUG_ON(__MAX_BPF_ATTACH_TYPE >= 64); mask = BIT_ULL(__MAX_BPF_ATTACH_TYPE) - 1; if ((token->allowed_attachs & mask) == mask) seq_printf(m, "allowed_attachs:\tany\n"); else seq_printf(m, "allowed_attachs:\t0x%llx\n", token->allowed_attachs); } #define BPF_TOKEN_INODE_NAME "bpf-token" static 
const struct inode_operations bpf_token_iops = { }; const struct file_operations bpf_token_fops = { .release = bpf_token_release, .show_fdinfo = bpf_token_show_fdinfo, }; int bpf_token_create(union bpf_attr *attr) { struct bpf_token *token __free(kfree) = NULL; struct bpf_mount_opts *mnt_opts; struct user_namespace *userns; struct inode *inode; CLASS(fd, f)(attr->token_create.bpffs_fd); struct path path; struct super_block *sb; umode_t mode; int err; if (fd_empty(f)) return -EBADF; path = fd_file(f)->f_path; sb = path.dentry->d_sb; if (path.dentry != sb->s_root) return -EINVAL; if (sb->s_op != &bpf_super_ops) return -EINVAL; err = path_permission(&path, MAY_ACCESS); if (err) return err; userns = sb->s_user_ns; /* * Enforce that creators of BPF tokens are in the same user * namespace as the BPF FS instance. This makes reasoning about * permissions a lot easier and we can always relax this later. */ if (current_user_ns() != userns) return -EPERM; if (!ns_capable(userns, CAP_BPF)) return -EPERM; /* Creating BPF token in init_user_ns doesn't make much sense. */ if (current_user_ns() == &init_user_ns) return -EOPNOTSUPP; mnt_opts = sb->s_fs_info; if (mnt_opts->delegate_cmds == 0 && mnt_opts->delegate_maps == 0 && mnt_opts->delegate_progs == 0 && mnt_opts->delegate_attachs == 0) return -ENOENT; /* no BPF token delegation is set up */ mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); inode = bpf_get_inode(sb, NULL, mode); if (IS_ERR(inode)) return PTR_ERR(inode); inode->i_op = &bpf_token_iops; inode->i_fop = &bpf_token_fops; clear_nlink(inode); /* make sure it is unlinked */ FD_PREPARE(fdf, O_CLOEXEC, alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops)); if (fdf.err) return fdf.err; token = kzalloc_obj(*token, GFP_USER); if (!token) return -ENOMEM; atomic64_set(&token->refcnt, 1); /* remember bpffs owning userns for future ns_capable() checks. 
*/ token->userns = userns; token->allowed_cmds = mnt_opts->delegate_cmds; token->allowed_maps = mnt_opts->delegate_maps; token->allowed_progs = mnt_opts->delegate_progs; token->allowed_attachs = mnt_opts->delegate_attachs; err = security_bpf_token_create(token, attr, &path); if (err) return err; get_user_ns(token->userns); fd_prepare_file(fdf)->private_data = no_free_ptr(token); return fd_publish(fdf); } int bpf_token_get_info_by_fd(struct bpf_token *token, const union bpf_attr *attr, union bpf_attr __user *uattr) { struct bpf_token_info __user *uinfo = u64_to_user_ptr(attr->info.info); struct bpf_token_info info; u32 info_len = attr->info.info_len; info_len = min_t(u32, info_len, sizeof(info)); memset(&info, 0, sizeof(info)); info.allowed_cmds = token->allowed_cmds; info.allowed_maps = token->allowed_maps; info.allowed_progs = token->allowed_progs; info.allowed_attachs = token->allowed_attachs; if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) return -EFAULT; return 0; } struct bpf_token *bpf_token_get_from_fd(u32 ufd) { CLASS(fd, f)(ufd); struct bpf_token *token; if (fd_empty(f)) return ERR_PTR(-EBADF); if (fd_file(f)->f_op != &bpf_token_fops) return ERR_PTR(-EINVAL); token = fd_file(f)->private_data; bpf_token_inc(token); return token; } bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd) { if (!token) return false; if (!(token->allowed_cmds & BIT_ULL(cmd))) return false; return security_bpf_token_cmd(token, cmd) == 0; } bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type) { if (!token || type >= __MAX_BPF_MAP_TYPE) return false; return token->allowed_maps & BIT_ULL(type); } bool bpf_token_allow_prog_type(const struct bpf_token *token, enum bpf_prog_type prog_type, enum bpf_attach_type attach_type) { if (!token || prog_type >= __MAX_BPF_PROG_TYPE || attach_type >= __MAX_BPF_ATTACH_TYPE) return false; return (token->allowed_progs & BIT_ULL(prog_type)) && (token->allowed_attachs & BIT_ULL(attach_type)); } |
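/*
 * Illustrative sketch, not part of the file above: how a hypothetical caller
 * could resolve a token fd and test whether BPF_PROG_LOAD has been delegated
 * through it, using only helpers defined in this file.
 */
static bool example_token_allows_prog_load(u32 token_fd)
{
	struct bpf_token *token;
	bool allowed;

	token = bpf_token_get_from_fd(token_fd);
	if (IS_ERR(token))
		return false;

	allowed = bpf_token_allow_cmd(token, BPF_PROG_LOAD);
	bpf_token_put(token);
	return allowed;
}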
| 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Definitions related to Power Management Quality of Service (PM QoS). * * Copyright (C) 2020 Intel Corporation * * Authors: * Mark Gross <mgross@linux.intel.com> * Rafael J. Wysocki <rafael.j.wysocki@intel.com> */ #ifndef _LINUX_PM_QOS_H #define _LINUX_PM_QOS_H #include <linux/plist.h> #include <linux/notifier.h> #include <linux/device.h> enum pm_qos_flags_status { PM_QOS_FLAGS_UNDEFINED = -1, PM_QOS_FLAGS_NONE, PM_QOS_FLAGS_SOME, PM_QOS_FLAGS_ALL, }; #define PM_QOS_DEFAULT_VALUE (-1) #define PM_QOS_LATENCY_ANY S32_MAX #define PM_QOS_LATENCY_ANY_NS ((s64)PM_QOS_LATENCY_ANY * NSEC_PER_USEC) #define PM_QOS_CPU_LATENCY_DEFAULT_VALUE (2000 * USEC_PER_SEC) #define PM_QOS_RESUME_LATENCY_DEFAULT_VALUE PM_QOS_LATENCY_ANY #define PM_QOS_RESUME_LATENCY_NO_CONSTRAINT PM_QOS_LATENCY_ANY #define PM_QOS_RESUME_LATENCY_NO_CONSTRAINT_NS PM_QOS_LATENCY_ANY_NS #define PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE 0 #define PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE 0 #define PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE FREQ_QOS_MAX_DEFAULT_VALUE #define PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT (-1) #define PM_QOS_FLAG_NO_POWER_OFF (1 << 0) enum pm_qos_type { PM_QOS_UNITIALIZED, PM_QOS_MAX, /* return the largest value */ PM_QOS_MIN, /* return the smallest value */ }; /* * Note: The lockless read path depends on the CPU accessing target_value * or effective_flags atomically. 
Atomic access is only guaranteed on all CPU * types linux supports for 32 bit quantites */ struct pm_qos_constraints { struct plist_head list; s32 target_value; /* Do not change to 64 bit */ s32 default_value; s32 no_constraint_value; enum pm_qos_type type; struct blocking_notifier_head *notifiers; }; struct pm_qos_request { struct plist_node node; struct pm_qos_constraints *qos; }; struct pm_qos_flags_request { struct list_head node; s32 flags; /* Do not change to 64 bit */ }; struct pm_qos_flags { struct list_head list; s32 effective_flags; /* Do not change to 64 bit */ }; #define FREQ_QOS_MIN_DEFAULT_VALUE 0 #define FREQ_QOS_MAX_DEFAULT_VALUE S32_MAX enum freq_qos_req_type { FREQ_QOS_MIN = 1, FREQ_QOS_MAX, }; struct freq_constraints { struct pm_qos_constraints min_freq; struct blocking_notifier_head min_freq_notifiers; struct pm_qos_constraints max_freq; struct blocking_notifier_head max_freq_notifiers; }; struct freq_qos_request { enum freq_qos_req_type type; struct plist_node pnode; struct freq_constraints *qos; }; enum dev_pm_qos_req_type { DEV_PM_QOS_RESUME_LATENCY = 1, DEV_PM_QOS_LATENCY_TOLERANCE, DEV_PM_QOS_MIN_FREQUENCY, DEV_PM_QOS_MAX_FREQUENCY, DEV_PM_QOS_FLAGS, }; struct dev_pm_qos_request { enum dev_pm_qos_req_type type; union { struct plist_node pnode; struct pm_qos_flags_request flr; struct freq_qos_request freq; } data; struct device *dev; }; struct dev_pm_qos { struct pm_qos_constraints resume_latency; struct pm_qos_constraints latency_tolerance; struct freq_constraints freq; struct pm_qos_flags flags; struct dev_pm_qos_request *resume_latency_req; struct dev_pm_qos_request *latency_tolerance_req; struct dev_pm_qos_request *flags_req; }; /* Action requested to pm_qos_update_target */ enum pm_qos_req_action { PM_QOS_ADD_REQ, /* Add a new request */ PM_QOS_UPDATE_REQ, /* Update an existing request */ PM_QOS_REMOVE_REQ /* Remove an existing request */ }; static inline int dev_pm_qos_request_active(struct dev_pm_qos_request *req) { return req->dev != NULL; } s32 pm_qos_read_value(struct pm_qos_constraints *c); int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, enum pm_qos_req_action action, int value); bool pm_qos_update_flags(struct pm_qos_flags *pqf, struct pm_qos_flags_request *req, enum pm_qos_req_action action, s32 val); #ifdef CONFIG_CPU_IDLE s32 cpu_latency_qos_limit(void); bool cpu_latency_qos_request_active(struct pm_qos_request *req); void cpu_latency_qos_add_request(struct pm_qos_request *req, s32 value); void cpu_latency_qos_update_request(struct pm_qos_request *req, s32 new_value); void cpu_latency_qos_remove_request(struct pm_qos_request *req); #else static inline s32 cpu_latency_qos_limit(void) { return INT_MAX; } static inline bool cpu_latency_qos_request_active(struct pm_qos_request *req) { return false; } static inline void cpu_latency_qos_add_request(struct pm_qos_request *req, s32 value) {} static inline void cpu_latency_qos_update_request(struct pm_qos_request *req, s32 new_value) {} static inline void cpu_latency_qos_remove_request(struct pm_qos_request *req) {} #endif #ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP s32 cpu_wakeup_latency_qos_limit(void); #else static inline s32 cpu_wakeup_latency_qos_limit(void) { return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; } #endif #ifdef CONFIG_PM enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask); enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask); s32 __dev_pm_qos_resume_latency(struct device *dev); s32 dev_pm_qos_read_value(struct device *dev, enum 
dev_pm_qos_req_type type); int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value); int dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value); int dev_pm_qos_remove_request(struct dev_pm_qos_request *req); int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type); int dev_pm_qos_remove_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type); void dev_pm_qos_constraints_init(struct device *dev); void dev_pm_qos_constraints_destroy(struct device *dev); int dev_pm_qos_add_ancestor_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value); int dev_pm_qos_expose_latency_limit(struct device *dev, s32 value); void dev_pm_qos_hide_latency_limit(struct device *dev); int dev_pm_qos_expose_flags(struct device *dev, s32 value); void dev_pm_qos_hide_flags(struct device *dev); int dev_pm_qos_update_flags(struct device *dev, s32 mask, bool set); s32 dev_pm_qos_get_user_latency_tolerance(struct device *dev); int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val); int dev_pm_qos_expose_latency_tolerance(struct device *dev); void dev_pm_qos_hide_latency_tolerance(struct device *dev); static inline s32 dev_pm_qos_requested_resume_latency(struct device *dev) { return dev->power.qos->resume_latency_req->data.pnode.prio; } static inline s32 dev_pm_qos_requested_flags(struct device *dev) { return dev->power.qos->flags_req->data.flr.flags; } static inline s32 dev_pm_qos_raw_resume_latency(struct device *dev) { return IS_ERR_OR_NULL(dev->power.qos) ? PM_QOS_RESUME_LATENCY_NO_CONSTRAINT : pm_qos_read_value(&dev->power.qos->resume_latency); } #else static inline enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask) { return PM_QOS_FLAGS_UNDEFINED; } static inline enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask) { return PM_QOS_FLAGS_UNDEFINED; } static inline s32 __dev_pm_qos_resume_latency(struct device *dev) { return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; } static inline s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type) { switch (type) { case DEV_PM_QOS_RESUME_LATENCY: return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; case DEV_PM_QOS_MIN_FREQUENCY: return PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE; case DEV_PM_QOS_MAX_FREQUENCY: return PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE; default: WARN_ON(1); return 0; } } static inline int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { return 0; } static inline int dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value) { return 0; } static inline int dev_pm_qos_remove_request(struct dev_pm_qos_request *req) { return 0; } static inline int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { return 0; } static inline int dev_pm_qos_remove_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { return 0; } static inline void dev_pm_qos_constraints_init(struct device *dev) { dev->power.power_state = PMSG_ON; } static inline void dev_pm_qos_constraints_destroy(struct device *dev) { dev->power.power_state = PMSG_INVALID; } static inline int dev_pm_qos_add_ancestor_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { return 0; } static inline int 
dev_pm_qos_expose_latency_limit(struct device *dev, s32 value) { return 0; } static inline void dev_pm_qos_hide_latency_limit(struct device *dev) {} static inline int dev_pm_qos_expose_flags(struct device *dev, s32 value) { return 0; } static inline void dev_pm_qos_hide_flags(struct device *dev) {} static inline int dev_pm_qos_update_flags(struct device *dev, s32 m, bool set) { return 0; } static inline s32 dev_pm_qos_get_user_latency_tolerance(struct device *dev) { return PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT; } static inline int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val) { return 0; } static inline int dev_pm_qos_expose_latency_tolerance(struct device *dev) { return 0; } static inline void dev_pm_qos_hide_latency_tolerance(struct device *dev) {} static inline s32 dev_pm_qos_requested_resume_latency(struct device *dev) { return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; } static inline s32 dev_pm_qos_requested_flags(struct device *dev) { return 0; } static inline s32 dev_pm_qos_raw_resume_latency(struct device *dev) { return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; } #endif static inline int freq_qos_request_active(struct freq_qos_request *req) { return !IS_ERR_OR_NULL(req->qos); } void freq_constraints_init(struct freq_constraints *qos); s32 freq_qos_read_value(struct freq_constraints *qos, enum freq_qos_req_type type); int freq_qos_add_request(struct freq_constraints *qos, struct freq_qos_request *req, enum freq_qos_req_type type, s32 value); int freq_qos_update_request(struct freq_qos_request *req, s32 new_value); int freq_qos_remove_request(struct freq_qos_request *req); int freq_qos_apply(struct freq_qos_request *req, enum pm_qos_req_action action, s32 value); int freq_qos_add_notifier(struct freq_constraints *qos, enum freq_qos_req_type type, struct notifier_block *notifier); int freq_qos_remove_notifier(struct freq_constraints *qos, enum freq_qos_req_type type, struct notifier_block *notifier); #endif |
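/*
 * Illustrative sketch, not part of the header above: a hypothetical driver
 * holding a CPU latency QoS request around a latency-critical phase, using
 * the cpu_latency_qos_* interfaces declared in this header.  The 20 usec
 * bound is an arbitrary example value.
 */
static struct pm_qos_request example_latency_req;

static void example_enter_low_latency(void)
{
	/* Ask cpuidle to keep CPU exit latency at or below 20 usec. */
	cpu_latency_qos_add_request(&example_latency_req, 20);
}

static void example_exit_low_latency(void)
{
	if (cpu_latency_qos_request_active(&example_latency_req))
		cpu_latency_qos_remove_request(&example_latency_req);
}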
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 | // SPDX-License-Identifier: GPL-2.0 /* Lock down the kernel * * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public Licence * as published by the Free Software Foundation; either version * 2 of the Licence, or (at your option) any later version. */ #include <linux/security.h> #include <linux/export.h> #include <linux/lsm_hooks.h> #include <uapi/linux/lsm.h> static enum lockdown_reason kernel_locked_down; static const enum lockdown_reason lockdown_levels[] = {LOCKDOWN_NONE, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX}; /* * Put the kernel into lock-down mode. */ static int lock_kernel_down(const char *where, enum lockdown_reason level) { if (kernel_locked_down >= level) return -EPERM; kernel_locked_down = level; pr_notice("Kernel is locked down from %s; see man kernel_lockdown.7\n", where); return 0; } static int __init lockdown_param(char *level) { if (!level) return -EINVAL; if (strcmp(level, "integrity") == 0) lock_kernel_down("command line", LOCKDOWN_INTEGRITY_MAX); else if (strcmp(level, "confidentiality") == 0) lock_kernel_down("command line", LOCKDOWN_CONFIDENTIALITY_MAX); else return -EINVAL; return 0; } early_param("lockdown", lockdown_param); /** * lockdown_is_locked_down - Find out if the kernel is locked down * @what: Tag to use in notice generated if lockdown is in effect */ static int lockdown_is_locked_down(enum lockdown_reason what) { if (WARN(what >= LOCKDOWN_CONFIDENTIALITY_MAX, "Invalid lockdown reason")) return -EPERM; if (kernel_locked_down >= what) { if (lockdown_reasons[what]) pr_notice_ratelimited("Lockdown: %s: %s is restricted; see man kernel_lockdown.7\n", current->comm, lockdown_reasons[what]); return -EPERM; } return 0; } static struct security_hook_list lockdown_hooks[] __ro_after_init = { LSM_HOOK_INIT(locked_down, lockdown_is_locked_down), }; static const struct lsm_id lockdown_lsmid = { .name = "lockdown", .id = LSM_ID_LOCKDOWN, }; static int __init lockdown_lsm_init(void) { #if defined(CONFIG_LOCK_DOWN_KERNEL_FORCE_INTEGRITY) lock_kernel_down("Kernel configuration", LOCKDOWN_INTEGRITY_MAX); #elif defined(CONFIG_LOCK_DOWN_KERNEL_FORCE_CONFIDENTIALITY) lock_kernel_down("Kernel configuration", LOCKDOWN_CONFIDENTIALITY_MAX); #endif security_add_hooks(lockdown_hooks, ARRAY_SIZE(lockdown_hooks), &lockdown_lsmid); return 0; } static ssize_t lockdown_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { char temp[80] = ""; int i, offset = 0; for (i = 0; i < ARRAY_SIZE(lockdown_levels); i++) { enum lockdown_reason level = lockdown_levels[i]; if (lockdown_reasons[level]) { const char *label = lockdown_reasons[level]; if (kernel_locked_down == level) offset += sprintf(temp+offset, "[%s] ", label); else offset += sprintf(temp+offset, "%s ", label); } } /* Convert the last space to a newline if 
needed. */ if (offset > 0) temp[offset-1] = '\n'; return simple_read_from_buffer(buf, count, ppos, temp, strlen(temp)); } static ssize_t lockdown_write(struct file *file, const char __user *buf, size_t n, loff_t *ppos) { char *state; int i, len, err = -EINVAL; state = memdup_user_nul(buf, n); if (IS_ERR(state)) return PTR_ERR(state); len = strlen(state); if (len && state[len-1] == '\n') { state[len-1] = '\0'; len--; } for (i = 0; i < ARRAY_SIZE(lockdown_levels); i++) { enum lockdown_reason level = lockdown_levels[i]; const char *label = lockdown_reasons[level]; if (label && !strcmp(state, label)) err = lock_kernel_down("securityfs", level); } kfree(state); return err ? err : n; } static const struct file_operations lockdown_ops = { .read = lockdown_read, .write = lockdown_write, }; static int __init lockdown_secfs_init(void) { struct dentry *dentry; dentry = securityfs_create_file("lockdown", 0644, NULL, NULL, &lockdown_ops); return PTR_ERR_OR_ZERO(dentry); } #ifdef CONFIG_SECURITY_LOCKDOWN_LSM_EARLY DEFINE_EARLY_LSM(lockdown) = { #else DEFINE_LSM(lockdown) = { #endif .id = &lockdown_lsmid, .init = lockdown_lsm_init, .initcall_core = lockdown_secfs_init, }; |
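/*
 * Illustrative sketch, not part of the file above: how a hypothetical kernel
 * caller consults the lockdown decision before a confidentiality-sensitive
 * operation.  security_locked_down() is the generic LSM entry point declared
 * in <linux/security.h>, which reaches lockdown_is_locked_down() above when
 * this LSM is enabled; LOCKDOWN_KCORE is one of the predefined reasons.
 */
static int example_expose_kernel_memory(void)
{
	int ret = security_locked_down(LOCKDOWN_KCORE);

	if (ret)	/* -EPERM once the kernel is locked down far enough */
		return ret;

	/* ... perform the sensitive operation here ... */
	return 0;
}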
| 1 1 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ * Copyright (C) 2021, Alibaba Cloud */ #include "internal.h" #include <linux/filelock.h> #include <linux/sched/mm.h> #include <trace/events/erofs.h> void erofs_unmap_metabuf(struct erofs_buf *buf) { if (!buf->base) return; kunmap_local(buf->base); buf->base = NULL; } void erofs_put_metabuf(struct erofs_buf *buf) { if (!buf->page) return; erofs_unmap_metabuf(buf); folio_put(page_folio(buf->page)); buf->page = NULL; } void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap) { pgoff_t index = (buf->off + offset) >> PAGE_SHIFT; struct folio *folio = NULL; loff_t fpos; int err; /* * Metadata access for file-backed mounts reuses page cache of backing * fs inodes (only folio data will be needed) to prevent double caching. * However, the data access range must be verified here in advance. 
*/ if (buf->file) { fpos = index << PAGE_SHIFT; err = rw_verify_area(READ, buf->file, &fpos, PAGE_SIZE); if (err < 0) return ERR_PTR(err); } if (buf->page) { folio = page_folio(buf->page); if (folio_file_page(folio, index) != buf->page) erofs_unmap_metabuf(buf); } if (!folio || !folio_contains(folio, index)) { erofs_put_metabuf(buf); folio = read_mapping_folio(buf->mapping, index, buf->file); if (IS_ERR(folio)) return folio; } buf->page = folio_file_page(folio, index); if (!need_kmap) return NULL; if (!buf->base) buf->base = kmap_local_page(buf->page); return buf->base + (offset & ~PAGE_MASK); } int erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb, bool in_metabox) { struct erofs_sb_info *sbi = EROFS_SB(sb); buf->file = NULL; if (in_metabox) { if (unlikely(!sbi->metabox_inode)) return -EFSCORRUPTED; buf->mapping = sbi->metabox_inode->i_mapping; return 0; } buf->off = sbi->dif0.fsoff; if (erofs_is_fileio_mode(sbi)) { buf->file = sbi->dif0.file; /* some fs like FUSE needs it */ buf->mapping = buf->file->f_mapping; } else if (erofs_is_fscache_mode(sb)) buf->mapping = sbi->dif0.fscache->inode->i_mapping; else buf->mapping = sb->s_bdev->bd_mapping; return 0; } void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, erofs_off_t offset, bool in_metabox) { int err; err = erofs_init_metabuf(buf, sb, in_metabox); if (err) return ERR_PTR(err); return erofs_bread(buf, offset, true); } int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) { struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct super_block *sb = inode->i_sb; unsigned int unit, blksz = sb->s_blocksize; struct erofs_inode *vi = EROFS_I(inode); struct erofs_inode_chunk_index *idx; erofs_blk_t startblk, addrmask; bool tailpacking; erofs_off_t pos; u64 chunknr; int err = 0; trace_erofs_map_blocks_enter(inode, map, 0); map->m_deviceid = 0; map->m_flags = 0; if (map->m_la >= inode->i_size) goto out; if (vi->datalayout != EROFS_INODE_CHUNK_BASED) { tailpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE); if (!tailpacking && vi->startblk == EROFS_NULL_ADDR) goto out; pos = erofs_pos(sb, erofs_iblks(inode) - tailpacking); map->m_flags = EROFS_MAP_MAPPED; if (map->m_la < pos) { map->m_pa = erofs_pos(sb, vi->startblk) + map->m_la; map->m_llen = pos - map->m_la; } else { map->m_pa = erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize + erofs_blkoff(sb, map->m_la); map->m_llen = inode->i_size - map->m_la; map->m_flags |= EROFS_MAP_META; } goto out; } if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES) unit = sizeof(*idx); /* chunk index */ else unit = EROFS_BLOCK_MAP_ENTRY_SIZE; /* block map */ chunknr = map->m_la >> vi->chunkbits; pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, unit) + unit * chunknr; idx = erofs_read_metabuf(&buf, sb, pos, erofs_inode_in_metabox(inode)); if (IS_ERR(idx)) { err = PTR_ERR(idx); goto out; } map->m_la = chunknr << vi->chunkbits; map->m_llen = min_t(erofs_off_t, 1UL << vi->chunkbits, round_up(inode->i_size - map->m_la, blksz)); if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES) { addrmask = (vi->chunkformat & EROFS_CHUNK_FORMAT_48BIT) ? 
BIT_ULL(48) - 1 : BIT_ULL(32) - 1; startblk = (((u64)le16_to_cpu(idx->startblk_hi) << 32) | le32_to_cpu(idx->startblk_lo)) & addrmask; if ((startblk ^ EROFS_NULL_ADDR) & addrmask) { map->m_deviceid = le16_to_cpu(idx->device_id) & EROFS_SB(sb)->device_id_mask; map->m_pa = erofs_pos(sb, startblk); map->m_flags = EROFS_MAP_MAPPED; } } else { startblk = le32_to_cpu(*(__le32 *)idx); if (startblk != (u32)EROFS_NULL_ADDR) { map->m_pa = erofs_pos(sb, startblk); map->m_flags = EROFS_MAP_MAPPED; } } erofs_put_metabuf(&buf); out: if (!err) { map->m_plen = map->m_llen; /* inline data should be located in the same meta block */ if ((map->m_flags & EROFS_MAP_META) && erofs_blkoff(sb, map->m_pa) + map->m_plen > blksz) { erofs_err(sb, "inline data across blocks @ nid %llu", vi->nid); DBG_BUGON(1); return -EFSCORRUPTED; } } trace_erofs_map_blocks_exit(inode, map, 0, err); return err; } static void erofs_fill_from_devinfo(struct erofs_map_dev *map, struct super_block *sb, struct erofs_device_info *dif) { map->m_sb = sb; map->m_dif = dif; map->m_bdev = NULL; if (dif->file && S_ISBLK(file_inode(dif->file)->i_mode)) map->m_bdev = file_bdev(dif->file); } int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) { struct erofs_dev_context *devs = EROFS_SB(sb)->devs; struct erofs_device_info *dif; erofs_off_t startoff; int id; erofs_fill_from_devinfo(map, sb, &EROFS_SB(sb)->dif0); map->m_bdev = sb->s_bdev; /* use s_bdev for the primary device */ if (map->m_deviceid) { down_read(&devs->rwsem); dif = idr_find(&devs->tree, map->m_deviceid - 1); if (!dif) { up_read(&devs->rwsem); return -ENODEV; } if (devs->flatdev) { map->m_pa += erofs_pos(sb, dif->uniaddr); up_read(&devs->rwsem); return 0; } erofs_fill_from_devinfo(map, sb, dif); up_read(&devs->rwsem); } else if (devs->extra_devices && !devs->flatdev) { down_read(&devs->rwsem); idr_for_each_entry(&devs->tree, dif, id) { if (!dif->uniaddr) continue; startoff = erofs_pos(sb, dif->uniaddr); if (map->m_pa >= startoff && map->m_pa < startoff + erofs_pos(sb, dif->blocks)) { map->m_pa -= startoff; erofs_fill_from_devinfo(map, sb, dif); break; } } up_read(&devs->rwsem); } return 0; } /* * bit 30: I/O error occurred on this folio * bit 29: CPU has dirty data in D-cache (needs aliasing handling); * bit 0 - 29: remaining parts to complete this folio */ #define EROFS_ONLINEFOLIO_EIO 30 #define EROFS_ONLINEFOLIO_DIRTY 29 void erofs_onlinefolio_init(struct folio *folio) { union { atomic_t o; void *v; } u = { .o = ATOMIC_INIT(1) }; folio->private = u.v; /* valid only if file-backed folio is locked */ } void erofs_onlinefolio_split(struct folio *folio) { atomic_inc((atomic_t *)&folio->private); } void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty) { int orig, v; do { orig = atomic_read((atomic_t *)&folio->private); DBG_BUGON(orig <= 0); v = dirty << EROFS_ONLINEFOLIO_DIRTY; v |= (orig - 1) | (!!err << EROFS_ONLINEFOLIO_EIO); } while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig); if (v & (BIT(EROFS_ONLINEFOLIO_DIRTY) - 1)) return; folio->private = 0; if (v & BIT(EROFS_ONLINEFOLIO_DIRTY)) flush_dcache_folio(folio); folio_end_read(folio, !(v & BIT(EROFS_ONLINEFOLIO_EIO))); } struct erofs_iomap_iter_ctx { struct page *page; void *base; struct inode *realinode; }; static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); struct erofs_iomap_iter_ctx *ctx = iter->private; struct inode 
*realinode = ctx ? ctx->realinode : inode; struct super_block *sb = realinode->i_sb; struct erofs_map_blocks map; struct erofs_map_dev mdev; int ret; map.m_la = offset; map.m_llen = length; ret = erofs_map_blocks(realinode, &map); if (ret < 0) return ret; iomap->offset = map.m_la; iomap->length = map.m_llen; iomap->flags = 0; iomap->addr = IOMAP_NULL_ADDR; if (!(map.m_flags & EROFS_MAP_MAPPED)) { iomap->type = IOMAP_HOLE; return 0; } if (!(map.m_flags & EROFS_MAP_META) || !erofs_inode_in_metabox(realinode)) { mdev = (struct erofs_map_dev) { .m_deviceid = map.m_deviceid, .m_pa = map.m_pa, }; ret = erofs_map_dev(sb, &mdev); if (ret) return ret; if (flags & IOMAP_DAX) iomap->dax_dev = mdev.m_dif->dax_dev; else iomap->bdev = mdev.m_bdev; iomap->addr = mdev.m_dif->fsoff + mdev.m_pa; if (flags & IOMAP_DAX) iomap->addr += mdev.m_dif->dax_part_off; } if (map.m_flags & EROFS_MAP_META) { iomap->type = IOMAP_INLINE; /* read context should read the inlined data */ if (ctx) { struct erofs_buf buf = __EROFS_BUF_INITIALIZER; void *ptr; ptr = erofs_read_metabuf(&buf, sb, map.m_pa, erofs_inode_in_metabox(realinode)); if (IS_ERR(ptr)) return PTR_ERR(ptr); iomap->inline_data = ptr; ctx->page = buf.page; ctx->base = buf.base; } } else { iomap->type = IOMAP_MAPPED; } return 0; } static int erofs_iomap_end(struct inode *inode, loff_t pos, loff_t length, ssize_t written, unsigned int flags, struct iomap *iomap) { struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); struct erofs_iomap_iter_ctx *ctx = iter->private; if (ctx && ctx->base) { struct erofs_buf buf = { .page = ctx->page, .base = ctx->base, }; DBG_BUGON(iomap->type != IOMAP_INLINE); erofs_put_metabuf(&buf); ctx->base = NULL; } return written; } static const struct iomap_ops erofs_iomap_ops = { .iomap_begin = erofs_iomap_begin, .iomap_end = erofs_iomap_end, }; int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) { if (!IS_ENABLED(CONFIG_EROFS_FS_ZIP)) return -EOPNOTSUPP; return iomap_fiemap(inode, fieinfo, start, len, &z_erofs_iomap_report_ops); } return iomap_fiemap(inode, fieinfo, start, len, &erofs_iomap_ops); } /* * since we dont have write or truncate flows, so no inode * locking needs to be held at the moment. 
*/ static int erofs_read_folio(struct file *file, struct folio *folio) { struct iomap_read_folio_ctx read_ctx = { .ops = &iomap_bio_read_ops, .cur_folio = folio, }; bool need_iput; struct erofs_iomap_iter_ctx iter_ctx = { .realinode = erofs_real_inode(folio_inode(folio), &need_iput), }; trace_erofs_read_folio(iter_ctx.realinode, folio, true); iomap_read_folio(&erofs_iomap_ops, &read_ctx, &iter_ctx); if (need_iput) iput(iter_ctx.realinode); return 0; } static void erofs_readahead(struct readahead_control *rac) { struct iomap_read_folio_ctx read_ctx = { .ops = &iomap_bio_read_ops, .rac = rac, }; bool need_iput; struct erofs_iomap_iter_ctx iter_ctx = { .realinode = erofs_real_inode(rac->mapping->host, &need_iput), }; trace_erofs_readahead(iter_ctx.realinode, readahead_index(rac), readahead_count(rac), true); iomap_readahead(&erofs_iomap_ops, &read_ctx, &iter_ctx); if (need_iput) iput(iter_ctx.realinode); } static sector_t erofs_bmap(struct address_space *mapping, sector_t block) { return iomap_bmap(mapping, block, &erofs_iomap_ops); } static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); /* no need taking (shared) inode lock since it's a ro filesystem */ if (!iov_iter_count(to)) return 0; if (IS_ENABLED(CONFIG_FS_DAX) && IS_DAX(inode)) return dax_iomap_rw(iocb, to, &erofs_iomap_ops); if ((iocb->ki_flags & IOCB_DIRECT) && inode->i_sb->s_bdev) { struct erofs_iomap_iter_ctx iter_ctx = { .realinode = inode, }; return iomap_dio_rw(iocb, to, &erofs_iomap_ops, NULL, 0, &iter_ctx, 0); } return filemap_read(iocb, to, 0); } /* for uncompressed (aligned) files and raw access for other files */ const struct address_space_operations erofs_aops = { .read_folio = erofs_read_folio, .readahead = erofs_readahead, .bmap = erofs_bmap, .direct_IO = noop_direct_IO, .release_folio = iomap_release_folio, .invalidate_folio = iomap_invalidate_folio, }; #ifdef CONFIG_FS_DAX static vm_fault_t erofs_dax_huge_fault(struct vm_fault *vmf, unsigned int order) { return dax_iomap_fault(vmf, order, NULL, NULL, &erofs_iomap_ops); } static vm_fault_t erofs_dax_fault(struct vm_fault *vmf) { return erofs_dax_huge_fault(vmf, 0); } static const struct vm_operations_struct erofs_dax_vm_ops = { .fault = erofs_dax_fault, .huge_fault = erofs_dax_huge_fault, }; static int erofs_file_mmap_prepare(struct vm_area_desc *desc) { if (!IS_DAX(file_inode(desc->file))) return generic_file_readonly_mmap_prepare(desc); if (vma_desc_test_all(desc, VMA_SHARED_BIT, VMA_MAYWRITE_BIT)) return -EINVAL; desc->vm_ops = &erofs_dax_vm_ops; vma_desc_set_flags(desc, VMA_HUGEPAGE_BIT); return 0; } #else #define erofs_file_mmap_prepare generic_file_readonly_mmap_prepare #endif static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; const struct iomap_ops *ops = &erofs_iomap_ops; if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) { if (!IS_ENABLED(CONFIG_EROFS_FS_ZIP)) return generic_file_llseek(file, offset, whence); ops = &z_erofs_iomap_report_ops; } if (whence == SEEK_HOLE) offset = iomap_seek_hole(inode, offset, ops); else if (whence == SEEK_DATA) offset = iomap_seek_data(inode, offset, ops); else return generic_file_llseek(file, offset, whence); if (offset < 0) return offset; return vfs_setpos(file, offset, inode->i_sb->s_maxbytes); } const struct file_operations erofs_file_fops = { .llseek = erofs_file_llseek, .read_iter = erofs_file_read_iter, .unlocked_ioctl = erofs_ioctl, #ifdef CONFIG_COMPAT 
	.compat_ioctl	= erofs_compat_ioctl,
#endif
	.mmap_prepare	= erofs_file_mmap_prepare,
	.get_unmapped_area = thp_get_unmapped_area,
	.splice_read	= filemap_splice_read,
	.setlease	= generic_setlease,
};
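/*
 * Illustrative sketch, not part of the file above: reading a piece of
 * on-disk metadata through the metabuf helpers defined in this file.  The
 * function name and the offset are hypothetical; 'false' means the offset
 * does not live inside the metabox inode.
 */
static int example_peek_metadata(struct super_block *sb, erofs_off_t off)
{
	struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
	void *ptr;

	ptr = erofs_read_metabuf(&buf, sb, off, false);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	/* ... parse the mapped metadata at 'ptr' here ... */

	erofs_put_metabuf(&buf);
	return 0;
}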
| 13 14 11 14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 | // SPDX-License-Identifier: GPL-2.0 /* * Functions related to generic timeout handling of requests. */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/blkdev.h> #include <linux/fault-inject.h> #include "blk.h" #include "blk-mq.h" #ifdef CONFIG_FAIL_IO_TIMEOUT static DECLARE_FAULT_ATTR(fail_io_timeout); static int __init setup_fail_io_timeout(char *str) { return setup_fault_attr(&fail_io_timeout, str); } __setup("fail_io_timeout=", setup_fail_io_timeout); bool __blk_should_fake_timeout(struct request_queue *q) { return should_fail(&fail_io_timeout, 1); } EXPORT_SYMBOL_GPL(__blk_should_fake_timeout); static int __init fail_io_timeout_debugfs(void) { struct dentry *dir = fault_create_debugfs_attr("fail_io_timeout", NULL, &fail_io_timeout); return PTR_ERR_OR_ZERO(dir); } late_initcall(fail_io_timeout_debugfs); ssize_t part_timeout_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); int set = test_bit(QUEUE_FLAG_FAIL_IO, &disk->queue->queue_flags); return sprintf(buf, "%d\n", set != 0); } ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct gendisk *disk = dev_to_disk(dev); int val; if (count) { struct request_queue *q = disk->queue; char *p = (char *) buf; val = simple_strtoul(p, &p, 10); if (val) blk_queue_flag_set(QUEUE_FLAG_FAIL_IO, q); else blk_queue_flag_clear(QUEUE_FLAG_FAIL_IO, q); } return count; } #endif /* CONFIG_FAIL_IO_TIMEOUT */ /** * blk_abort_request - Request recovery for the specified command * @req: pointer to the request of interest * * This function requests that the block layer start recovery for the * request by deleting the timer and calling the q's timeout function. * LLDDs who implement their own error recovery MAY ignore the timeout * event if they generated blk_abort_request. */ void blk_abort_request(struct request *req) { /* * All we need to ensure is that timeout scan takes place * immediately and that scan sees the new timeout value. * No need for fancy synchronizations. */ WRITE_ONCE(req->deadline, jiffies); kblockd_schedule_work(&req->q->timeout_work); } EXPORT_SYMBOL_GPL(blk_abort_request); static unsigned long blk_timeout_mask __read_mostly; static int __init blk_timeout_init(void) { blk_timeout_mask = roundup_pow_of_two(HZ) - 1; return 0; } late_initcall(blk_timeout_init); /* * Just a rough estimate, we don't care about specific values for timeouts. */ static inline unsigned long blk_round_jiffies(unsigned long j) { return (j + blk_timeout_mask) + 1; } unsigned long blk_rq_timeout(unsigned long timeout) { unsigned long maxt; maxt = blk_round_jiffies(jiffies + BLK_MAX_TIMEOUT); if (time_after(timeout, maxt)) timeout = maxt; return timeout; } /** * blk_add_timer - Start timeout timer for a single request * @req: request that is about to start running. * * Notes: * Each request has its own timer, and as it is added to the queue, we * set up the timer. 
When the request completes, we cancel the timer. */ void blk_add_timer(struct request *req) { struct request_queue *q = req->q; unsigned long expiry; /* * Some LLDs, like scsi, peek at the timeout to prevent a * command from being retried forever. */ if (!req->timeout) req->timeout = q->rq_timeout; req->rq_flags &= ~RQF_TIMED_OUT; expiry = jiffies + req->timeout; WRITE_ONCE(req->deadline, expiry); /* * If the timer isn't already pending or this timeout is earlier * than an existing one, modify the timer. Round up to next nearest * second. */ expiry = blk_rq_timeout(blk_round_jiffies(expiry)); if (!timer_pending(&q->timeout) || time_before(expiry, q->timeout.expires)) { unsigned long diff = q->timeout.expires - expiry; /* * Due to added timer slack to group timers, the timer * will often be a little in front of what we asked for. * So apply some tolerance here too, otherwise we keep * modifying the timer because expires for value X * will be X + something. */ if (!timer_pending(&q->timeout) || (diff >= HZ / 2)) mod_timer(&q->timeout, expiry); } } |
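/*
 * Illustrative sketch, not part of the file above: a hypothetical LLD error
 * handler using blk_abort_request(), exported above, to force an immediate
 * timeout scan for a request it already knows is dead.
 */
static void example_lld_force_timeout(struct request *rq)
{
	/*
	 * The queue's ->timeout() handler will be invoked for this request
	 * on the next (now immediate) timeout scan.
	 */
	blk_abort_request(rq);
}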
/* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ #ifndef _LINUX_BPF_H #define _LINUX_BPF_H 1 #include <uapi/linux/bpf.h> #include <uapi/linux/filter.h> #include <crypto/sha2.h> #include <linux/workqueue.h> #include <linux/file.h> #include <linux/percpu.h> #include <linux/err.h> #include <linux/rbtree_latch.h> #include <linux/numa.h> #include <linux/mm_types.h> #include <linux/wait.h> #include <linux/refcount.h> #include <linux/mutex.h> #include <linux/module.h> #include <linux/kallsyms.h> #include <linux/capability.h> #include <linux/sched/mm.h> #include <linux/slab.h> #include <linux/percpu-refcount.h> #include <linux/stddef.h> #include <linux/bpfptr.h> #include <linux/btf.h> #include <linux/rcupdate_trace.h> #include <linux/static_call.h> #include <linux/memcontrol.h> #include <linux/cfi.h> #include <asm/rqspinlock.h> struct bpf_verifier_env; struct bpf_verifier_log; struct perf_event; struct bpf_prog; struct bpf_prog_aux; struct bpf_map; struct bpf_arena; struct sock; struct seq_file; struct btf; struct btf_type; struct exception_table_entry; struct seq_operations; struct bpf_iter_aux_info; struct bpf_local_storage; struct bpf_local_storage_map; struct kobject; struct mem_cgroup; struct module; struct bpf_func_state; struct ftrace_ops; struct cgroup; struct bpf_token; struct user_namespace; struct super_block; struct inode; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; extern struct kobject *btf_kobj; extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma; extern bool bpf_global_ma_set; typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64); typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, struct bpf_iter_aux_info *aux); typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); typedef unsigned int (*bpf_func_t)(const void *, const struct bpf_insn *); struct bpf_iter_seq_info { const struct seq_operations *seq_ops; bpf_iter_init_seq_priv_t init_seq_private; bpf_iter_fini_seq_priv_t fini_seq_private; u32 seq_priv_size; }; /* map is generic key/value storage optionally accessible by eBPF programs */ struct bpf_map_ops { /* funcs callable from userspace (via syscall) */ int (*map_alloc_check)(union bpf_attr *attr); struct bpf_map *(*map_alloc)(union bpf_attr *attr); void (*map_release)(struct bpf_map *map, struct 
file *map_file); void (*map_free)(struct bpf_map *map); int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key); void (*map_release_uref)(struct bpf_map *map); void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key); int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); int (*map_lookup_and_delete_elem)(struct bpf_map *map, void *key, void *value, u64 flags); int (*map_lookup_and_delete_batch)(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); int (*map_update_batch)(struct bpf_map *map, struct file *map_file, const union bpf_attr *attr, union bpf_attr __user *uattr); int (*map_delete_batch)(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); /* funcs callable from userspace and from eBPF programs */ void *(*map_lookup_elem)(struct bpf_map *map, void *key); long (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags); long (*map_delete_elem)(struct bpf_map *map, void *key); long (*map_push_elem)(struct bpf_map *map, void *value, u64 flags); long (*map_pop_elem)(struct bpf_map *map, void *value); long (*map_peek_elem)(struct bpf_map *map, void *value); void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu); int (*map_get_hash)(struct bpf_map *map, u32 hash_buf_size, void *hash_buf); /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, int fd); /* If need_defer is true, the implementation should guarantee that * the to-be-put element is still alive before the bpf program, which * may manipulate it, exists. */ void (*map_fd_put_ptr)(struct bpf_map *map, void *ptr, bool need_defer); int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); int (*map_check_btf)(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); /* Prog poke tracking helpers. */ int (*map_poke_track)(struct bpf_map *map, struct bpf_prog_aux *aux); void (*map_poke_untrack)(struct bpf_map *map, struct bpf_prog_aux *aux); void (*map_poke_run)(struct bpf_map *map, u32 key, struct bpf_prog *old, struct bpf_prog *new); /* Direct value access helpers. */ int (*map_direct_value_addr)(const struct bpf_map *map, u64 *imm, u32 off); int (*map_direct_value_meta)(const struct bpf_map *map, u64 imm, u32 *off); int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma); __poll_t (*map_poll)(struct bpf_map *map, struct file *filp, struct poll_table_struct *pts); unsigned long (*map_get_unmapped_area)(struct file *filep, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); /* Functions called by bpf_local_storage maps */ int (*map_local_storage_charge)(struct bpf_local_storage_map *smap, void *owner, u32 size); void (*map_local_storage_uncharge)(struct bpf_local_storage_map *smap, void *owner, u32 size); struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner); /* Misc helpers.*/ long (*map_redirect)(struct bpf_map *map, u64 key, u64 flags); /* map_meta_equal must be implemented for maps that can be * used as an inner map. It is a runtime check to ensure * an inner map can be inserted to an outer map. * * Some properties of the inner map has been used during the * verification time. 
When inserting an inner map at the runtime, * map_meta_equal has to ensure the inserting map has the same * properties that the verifier has used earlier. */ bool (*map_meta_equal)(const struct bpf_map *meta0, const struct bpf_map *meta1); int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee); long (*map_for_each_callback)(struct bpf_map *map, bpf_callback_t callback_fn, void *callback_ctx, u64 flags); u64 (*map_mem_usage)(const struct bpf_map *map); /* BTF id of struct allocated by map_alloc */ int *map_btf_id; /* bpf_iter info used to open a seq_file */ const struct bpf_iter_seq_info *iter_seq_info; }; enum { /* Support at most 11 fields in a BTF type */ BTF_FIELDS_MAX = 11, }; enum btf_field_type { BPF_SPIN_LOCK = (1 << 0), BPF_TIMER = (1 << 1), BPF_KPTR_UNREF = (1 << 2), BPF_KPTR_REF = (1 << 3), BPF_KPTR_PERCPU = (1 << 4), BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF | BPF_KPTR_PERCPU, BPF_LIST_HEAD = (1 << 5), BPF_LIST_NODE = (1 << 6), BPF_RB_ROOT = (1 << 7), BPF_RB_NODE = (1 << 8), BPF_GRAPH_NODE = BPF_RB_NODE | BPF_LIST_NODE, BPF_GRAPH_ROOT = BPF_RB_ROOT | BPF_LIST_HEAD, BPF_REFCOUNT = (1 << 9), BPF_WORKQUEUE = (1 << 10), BPF_UPTR = (1 << 11), BPF_RES_SPIN_LOCK = (1 << 12), BPF_TASK_WORK = (1 << 13), }; enum bpf_cgroup_storage_type { BPF_CGROUP_STORAGE_SHARED, BPF_CGROUP_STORAGE_PERCPU, __BPF_CGROUP_STORAGE_MAX #define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX }; #ifdef CONFIG_CGROUP_BPF # define for_each_cgroup_storage_type(stype) \ for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++) #else # define for_each_cgroup_storage_type(stype) for (; false; ) #endif /* CONFIG_CGROUP_BPF */ typedef void (*btf_dtor_kfunc_t)(void *); struct btf_field_kptr { struct btf *btf; struct module *module; /* dtor used if btf_is_kernel(btf), otherwise the type is * program-allocated, dtor is NULL, and __bpf_obj_drop_impl is used */ btf_dtor_kfunc_t dtor; u32 btf_id; }; struct btf_field_graph_root { struct btf *btf; u32 value_btf_id; u32 node_offset; struct btf_record *value_rec; }; struct btf_field { u32 offset; u32 size; enum btf_field_type type; union { struct btf_field_kptr kptr; struct btf_field_graph_root graph_root; }; }; struct btf_record { u32 cnt; u32 field_mask; int spin_lock_off; int res_spin_lock_off; int timer_off; int wq_off; int refcount_off; int task_work_off; struct btf_field fields[]; }; /* Non-opaque version of bpf_rb_node in uapi/linux/bpf.h */ struct bpf_rb_node_kern { struct rb_node rb_node; void *owner; } __attribute__((aligned(8))); /* Non-opaque version of bpf_list_node in uapi/linux/bpf.h */ struct bpf_list_node_kern { struct list_head list_head; void *owner; } __attribute__((aligned(8))); /* 'Ownership' of program-containing map is claimed by the first program * that is going to use this map or by the first program which FD is * stored in the map to make sure that all callers and callees have the * same prog type, JITed flag and xdp_has_frags flag. 
*/ struct bpf_map_owner { enum bpf_prog_type type; bool jited; bool xdp_has_frags; bool sleepable; u64 storage_cookie[MAX_BPF_CGROUP_STORAGE_TYPE]; const struct btf_type *attach_func_proto; enum bpf_attach_type expected_attach_type; }; struct bpf_map { u8 sha[SHA256_DIGEST_SIZE]; const struct bpf_map_ops *ops; struct bpf_map *inner_map_meta; #ifdef CONFIG_SECURITY void *security; #endif enum bpf_map_type map_type; u32 key_size; u32 value_size; u32 max_entries; u64 map_extra; /* any per-map-type extra fields */ u32 map_flags; u32 id; struct btf_record *record; int numa_node; u32 btf_key_type_id; u32 btf_value_type_id; u32 btf_vmlinux_value_type_id; struct btf *btf; #ifdef CONFIG_MEMCG struct obj_cgroup *objcg; #endif char name[BPF_OBJ_NAME_LEN]; struct mutex freeze_mutex; atomic64_t refcnt; atomic64_t usercnt; /* rcu is used before freeing and work is only used during freeing */ union { struct work_struct work; struct rcu_head rcu; }; atomic64_t writecnt; spinlock_t owner_lock; struct bpf_map_owner *owner; bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ bool free_after_mult_rcu_gp; bool free_after_rcu_gp; atomic64_t sleepable_refcnt; s64 __percpu *elem_count; u64 cookie; /* write-once */ char *excl_prog_sha; }; static inline const char *btf_field_type_name(enum btf_field_type type) { switch (type) { case BPF_SPIN_LOCK: return "bpf_spin_lock"; case BPF_RES_SPIN_LOCK: return "bpf_res_spin_lock"; case BPF_TIMER: return "bpf_timer"; case BPF_WORKQUEUE: return "bpf_wq"; case BPF_KPTR_UNREF: case BPF_KPTR_REF: return "kptr"; case BPF_KPTR_PERCPU: return "percpu_kptr"; case BPF_UPTR: return "uptr"; case BPF_LIST_HEAD: return "bpf_list_head"; case BPF_LIST_NODE: return "bpf_list_node"; case BPF_RB_ROOT: return "bpf_rb_root"; case BPF_RB_NODE: return "bpf_rb_node"; case BPF_REFCOUNT: return "bpf_refcount"; case BPF_TASK_WORK: return "bpf_task_work"; default: WARN_ON_ONCE(1); return "unknown"; } } #if IS_ENABLED(CONFIG_DEBUG_KERNEL) #define BPF_WARN_ONCE(cond, format...) WARN_ONCE(cond, format) #else #define BPF_WARN_ONCE(cond, format...) 
BUILD_BUG_ON_INVALID(cond) #endif static inline u32 btf_field_type_size(enum btf_field_type type) { switch (type) { case BPF_SPIN_LOCK: return sizeof(struct bpf_spin_lock); case BPF_RES_SPIN_LOCK: return sizeof(struct bpf_res_spin_lock); case BPF_TIMER: return sizeof(struct bpf_timer); case BPF_WORKQUEUE: return sizeof(struct bpf_wq); case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: return sizeof(u64); case BPF_LIST_HEAD: return sizeof(struct bpf_list_head); case BPF_LIST_NODE: return sizeof(struct bpf_list_node); case BPF_RB_ROOT: return sizeof(struct bpf_rb_root); case BPF_RB_NODE: return sizeof(struct bpf_rb_node); case BPF_REFCOUNT: return sizeof(struct bpf_refcount); case BPF_TASK_WORK: return sizeof(struct bpf_task_work); default: WARN_ON_ONCE(1); return 0; } } static inline u32 btf_field_type_align(enum btf_field_type type) { switch (type) { case BPF_SPIN_LOCK: return __alignof__(struct bpf_spin_lock); case BPF_RES_SPIN_LOCK: return __alignof__(struct bpf_res_spin_lock); case BPF_TIMER: return __alignof__(struct bpf_timer); case BPF_WORKQUEUE: return __alignof__(struct bpf_wq); case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: return __alignof__(u64); case BPF_LIST_HEAD: return __alignof__(struct bpf_list_head); case BPF_LIST_NODE: return __alignof__(struct bpf_list_node); case BPF_RB_ROOT: return __alignof__(struct bpf_rb_root); case BPF_RB_NODE: return __alignof__(struct bpf_rb_node); case BPF_REFCOUNT: return __alignof__(struct bpf_refcount); case BPF_TASK_WORK: return __alignof__(struct bpf_task_work); default: WARN_ON_ONCE(1); return 0; } } static inline void bpf_obj_init_field(const struct btf_field *field, void *addr) { memset(addr, 0, field->size); switch (field->type) { case BPF_REFCOUNT: refcount_set((refcount_t *)addr, 1); break; case BPF_RB_NODE: RB_CLEAR_NODE((struct rb_node *)addr); break; case BPF_LIST_HEAD: case BPF_LIST_NODE: INIT_LIST_HEAD((struct list_head *)addr); break; case BPF_RB_ROOT: /* RB_ROOT_CACHED 0-inits, no need to do anything after memset */ case BPF_SPIN_LOCK: case BPF_RES_SPIN_LOCK: case BPF_TIMER: case BPF_WORKQUEUE: case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: case BPF_TASK_WORK: break; default: WARN_ON_ONCE(1); return; } } static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_field_type type) { if (IS_ERR_OR_NULL(rec)) return false; return rec->field_mask & type; } static inline void bpf_obj_init(const struct btf_record *rec, void *obj) { int i; if (IS_ERR_OR_NULL(rec)) return; for (i = 0; i < rec->cnt; i++) bpf_obj_init_field(&rec->fields[i], obj + rec->fields[i].offset); } /* 'dst' must be a temporary buffer and should not point to memory that is being * used in parallel by a bpf program or bpf syscall, otherwise the access from * the bpf program or bpf syscall may be corrupted by the reinitialization, * leading to weird problems. Even 'dst' is newly-allocated from bpf memory * allocator, it is still possible for 'dst' to be used in parallel by a bpf * program or bpf syscall. */ static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { bpf_obj_init(map->record, dst); } /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and * forced to use 'long' read/writes to try to atomically copy long counters. * Best-effort only. No barriers here, since it _will_ race with concurrent * updates from BPF programs. Called from bpf syscall and mostly used with * size 8 or 16 bytes, so ask compiler to inline it. 
*/ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) { const long *lsrc = src; long *ldst = dst; size /= sizeof(long); while (size--) data_race(*ldst++ = *lsrc++); } /* copy everything but bpf_spin_lock, bpf_timer, and kptrs. There could be one of each. */ static inline void bpf_obj_memcpy(struct btf_record *rec, void *dst, void *src, u32 size, bool long_memcpy) { u32 curr_off = 0; int i; if (IS_ERR_OR_NULL(rec)) { if (long_memcpy) bpf_long_memcpy(dst, src, round_up(size, 8)); else memcpy(dst, src, size); return; } for (i = 0; i < rec->cnt; i++) { u32 next_off = rec->fields[i].offset; u32 sz = next_off - curr_off; memcpy(dst + curr_off, src + curr_off, sz); curr_off += rec->fields[i].size + sz; } memcpy(dst + curr_off, src + curr_off, size - curr_off); } static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) { bpf_obj_memcpy(map->record, dst, src, map->value_size, false); } static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src) { bpf_obj_memcpy(map->record, dst, src, map->value_size, true); } static inline void bpf_obj_swap_uptrs(const struct btf_record *rec, void *dst, void *src) { unsigned long *src_uptr, *dst_uptr; const struct btf_field *field; int i; if (!btf_record_has_field(rec, BPF_UPTR)) return; for (i = 0, field = rec->fields; i < rec->cnt; i++, field++) { if (field->type != BPF_UPTR) continue; src_uptr = src + field->offset; dst_uptr = dst + field->offset; swap(*src_uptr, *dst_uptr); } } static inline void bpf_obj_memzero(struct btf_record *rec, void *dst, u32 size) { u32 curr_off = 0; int i; if (IS_ERR_OR_NULL(rec)) { memset(dst, 0, size); return; } for (i = 0; i < rec->cnt; i++) { u32 next_off = rec->fields[i].offset; u32 sz = next_off - curr_off; memset(dst + curr_off, 0, sz); curr_off += rec->fields[i].size + sz; } memset(dst + curr_off, 0, size - curr_off); } static inline void zero_map_value(struct bpf_map *map, void *dst) { bpf_obj_memzero(map->record, dst, map->value_size); } void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); void bpf_wq_cancel_and_free(void *timer); void bpf_task_work_cancel_and_free(void *timer); void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock); void bpf_rb_root_free(const struct btf_field *field, void *rb_root, struct bpf_spin_lock *spin_lock); u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena); u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena); int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size); struct bpf_offload_dev; struct bpf_offloaded_map; struct bpf_map_dev_ops { int (*map_get_next_key)(struct bpf_offloaded_map *map, void *key, void *next_key); int (*map_lookup_elem)(struct bpf_offloaded_map *map, void *key, void *value); int (*map_update_elem)(struct bpf_offloaded_map *map, void *key, void *value, u64 flags); int (*map_delete_elem)(struct bpf_offloaded_map *map, void *key); }; struct bpf_offloaded_map { struct bpf_map map; struct net_device *netdev; const struct bpf_map_dev_ops *dev_ops; void *dev_priv; struct list_head offloads; }; static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map) { return container_of(map, struct bpf_offloaded_map, map); } static inline bool bpf_map_offload_neutral(const struct bpf_map *map) { return map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY; } static inline bool bpf_map_support_seq_show(const struct bpf_map *map) { return (map->btf_value_type_id || 
map->btf_vmlinux_value_type_id) && map->ops->map_seq_show_elem; } int map_check_no_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); bool bpf_map_meta_equal(const struct bpf_map *meta0, const struct bpf_map *meta1); static inline bool bpf_map_has_internal_structs(struct bpf_map *map) { return btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK); } void bpf_map_free_internal_structs(struct bpf_map *map, void *obj); int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit); #if defined(CONFIG_MMU) && defined(CONFIG_64BIT) void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id, u64 flags); void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt); #else static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id, u64 flags) { return NULL; } static inline void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt) { } #endif extern const struct bpf_map_ops bpf_map_offload_ops; /* bpf_type_flag contains a set of flags that are applicable to the values of * arg_type, ret_type and reg_type. For example, a pointer value may be null, * or a memory is read-only. We classify types into two categories: base types * and extended types. Extended types are base types combined with a type flag. * * Currently there are no more than 32 base types in arg_type, ret_type and * reg_types. */ #define BPF_BASE_TYPE_BITS 8 enum bpf_type_flag { /* PTR may be NULL. */ PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS), /* MEM is read-only. When applied on bpf_arg, it indicates the arg is * compatible with both mutable and immutable memory. */ MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS), /* MEM points to BPF ring buffer reservation. */ MEM_RINGBUF = BIT(2 + BPF_BASE_TYPE_BITS), /* MEM is in user address space. */ MEM_USER = BIT(3 + BPF_BASE_TYPE_BITS), /* MEM is a percpu memory. MEM_PERCPU tags PTR_TO_BTF_ID. When tagged * with MEM_PERCPU, PTR_TO_BTF_ID _cannot_ be directly accessed. In * order to drop this tag, it must be passed into bpf_per_cpu_ptr() * or bpf_this_cpu_ptr(), which will return the pointer corresponding * to the specified cpu. */ MEM_PERCPU = BIT(4 + BPF_BASE_TYPE_BITS), /* Indicates that the argument will be released. */ OBJ_RELEASE = BIT(5 + BPF_BASE_TYPE_BITS), /* PTR is not trusted. This is only used with PTR_TO_BTF_ID, to mark * unreferenced and referenced kptr loaded from map value using a load * instruction, so that they can only be dereferenced but not escape the * BPF program into the kernel (i.e. cannot be passed as arguments to * kfunc or bpf helpers). */ PTR_UNTRUSTED = BIT(6 + BPF_BASE_TYPE_BITS), /* MEM can be uninitialized. */ MEM_UNINIT = BIT(7 + BPF_BASE_TYPE_BITS), /* DYNPTR points to memory local to the bpf program. */ DYNPTR_TYPE_LOCAL = BIT(8 + BPF_BASE_TYPE_BITS), /* DYNPTR points to a kernel-produced ringbuf record. */ DYNPTR_TYPE_RINGBUF = BIT(9 + BPF_BASE_TYPE_BITS), /* Size is known at compile time. */ MEM_FIXED_SIZE = BIT(10 + BPF_BASE_TYPE_BITS), /* MEM is of an allocated object of type in program BTF. This is used to * tag PTR_TO_BTF_ID allocated using bpf_obj_new. */ MEM_ALLOC = BIT(11 + BPF_BASE_TYPE_BITS), /* PTR was passed from the kernel in a trusted context, and may be * passed to kfuncs or BPF helper functions. * Confusingly, this is _not_ the opposite of PTR_UNTRUSTED above. 
* PTR_UNTRUSTED refers to a kptr that was read directly from a map * without invoking bpf_kptr_xchg(). What we really need to know is * whether a pointer is safe to pass to a kfunc or BPF helper function. * While PTR_UNTRUSTED pointers are unsafe to pass to kfuncs and BPF * helpers, they do not cover all possible instances of unsafe * pointers. For example, a pointer that was obtained from walking a * struct will _not_ get the PTR_UNTRUSTED type modifier, despite the * fact that it may be NULL, invalid, etc. This is due to backwards * compatibility requirements, as this was the behavior that was first * introduced when kptrs were added. The behavior is now considered * deprecated, and PTR_UNTRUSTED will eventually be removed. * * PTR_TRUSTED, on the other hand, is a pointer that the kernel * guarantees to be valid and safe to pass to kfuncs and BPF helpers. * For example, pointers passed to tracepoint arguments are considered * PTR_TRUSTED, as are pointers that are passed to struct_ops * callbacks. As alluded to above, pointers that are obtained from * walking PTR_TRUSTED pointers are _not_ trusted. For example, if a * struct task_struct *task is PTR_TRUSTED, then accessing * task->last_wakee will lose the PTR_TRUSTED modifier when it's stored * in a BPF register. Similarly, pointers passed to certain programs * types such as kretprobes are not guaranteed to be valid, as they may * for example contain an object that was recently freed. */ PTR_TRUSTED = BIT(12 + BPF_BASE_TYPE_BITS), /* MEM is tagged with rcu and memory access needs rcu_read_lock protection. */ MEM_RCU = BIT(13 + BPF_BASE_TYPE_BITS), /* Used to tag PTR_TO_BTF_ID | MEM_ALLOC references which are non-owning. * Currently only valid for linked-list and rbtree nodes. If the nodes * have a bpf_refcount_field, they must be tagged MEM_RCU as well. */ NON_OWN_REF = BIT(14 + BPF_BASE_TYPE_BITS), /* DYNPTR points to sk_buff */ DYNPTR_TYPE_SKB = BIT(15 + BPF_BASE_TYPE_BITS), /* DYNPTR points to xdp_buff */ DYNPTR_TYPE_XDP = BIT(16 + BPF_BASE_TYPE_BITS), /* Memory must be aligned on some architectures, used in combination with * MEM_FIXED_SIZE. */ MEM_ALIGNED = BIT(17 + BPF_BASE_TYPE_BITS), /* MEM is being written to, often combined with MEM_UNINIT. Non-presence * of MEM_WRITE means that MEM is only being read. MEM_WRITE without the * MEM_UNINIT means that memory needs to be initialized since it is also * read. */ MEM_WRITE = BIT(18 + BPF_BASE_TYPE_BITS), /* DYNPTR points to skb_metadata_end()-skb_metadata_len() */ DYNPTR_TYPE_SKB_META = BIT(19 + BPF_BASE_TYPE_BITS), /* DYNPTR points to file */ DYNPTR_TYPE_FILE = BIT(20 + BPF_BASE_TYPE_BITS), __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; #define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \ | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META | DYNPTR_TYPE_FILE) /* Max number of base types. */ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) /* Max number of all types. 
*/ #define BPF_TYPE_LIMIT (__BPF_TYPE_LAST_FLAG | (__BPF_TYPE_LAST_FLAG - 1)) /* function argument constraints */ enum bpf_arg_type { ARG_DONTCARE = 0, /* unused argument in helper function */ /* the following constraints used to prototype * bpf_map_lookup/update/delete_elem() functions */ ARG_CONST_MAP_PTR, /* const argument used as pointer to bpf_map */ ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ /* Used to prototype bpf_memcmp() and other functions that access data * on eBPF program stack */ ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */ ARG_PTR_TO_ARENA, ARG_CONST_SIZE, /* number of bytes accessed from memory */ ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */ ARG_PTR_TO_CTX, /* pointer to context */ ARG_ANYTHING, /* any (initialized) argument is ok */ ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */ ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ ARG_PTR_TO_RINGBUF_MEM, /* pointer to dynamically reserved ringbuf memory */ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ ARG_PTR_TO_FUNC, /* pointer to a bpf program function */ ARG_PTR_TO_STACK, /* pointer to stack */ ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ ARG_PTR_TO_TIMER, /* pointer to bpf_timer */ ARG_KPTR_XCHG_DEST, /* pointer to destination that kptrs are bpf_kptr_xchg'd into */ ARG_PTR_TO_DYNPTR, /* pointer to bpf_dynptr. See bpf_type_flag for dynptr type */ __BPF_ARG_TYPE_MAX, /* Extended arg_types. */ ARG_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MAP_VALUE, ARG_PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MEM, ARG_PTR_TO_CTX_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_CTX, ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET, ARG_PTR_TO_STACK_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_STACK, ARG_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, /* Pointer to memory does not need to be initialized, since helper function * fills all bytes or clears them in error case. */ ARG_PTR_TO_UNINIT_MEM = MEM_UNINIT | MEM_WRITE | ARG_PTR_TO_MEM, /* Pointer to valid memory of size known at compile time. */ ARG_PTR_TO_FIXED_SIZE_MEM = MEM_FIXED_SIZE | ARG_PTR_TO_MEM, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. */ __BPF_ARG_TYPE_LIMIT = BPF_TYPE_LIMIT, }; static_assert(__BPF_ARG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); /* type of values returned from helper functions */ enum bpf_return_type { RET_INTEGER, /* function returns integer */ RET_VOID, /* function doesn't return anything */ RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */ RET_PTR_TO_SOCKET, /* returns a pointer to a socket */ RET_PTR_TO_TCP_SOCK, /* returns a pointer to a tcp_sock */ RET_PTR_TO_SOCK_COMMON, /* returns a pointer to a sock_common */ RET_PTR_TO_MEM, /* returns a pointer to memory */ RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */ __BPF_RET_TYPE_MAX, /* Extended ret_types. 
*/ RET_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MAP_VALUE, RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET, RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, RET_PTR_TO_RINGBUF_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_RINGBUF | RET_PTR_TO_MEM, RET_PTR_TO_DYNPTR_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MEM, RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, RET_PTR_TO_BTF_ID_TRUSTED = PTR_TRUSTED | RET_PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. */ __BPF_RET_TYPE_LIMIT = BPF_TYPE_LIMIT, }; static_assert(__BPF_RET_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL * instructions after verifying */ struct bpf_func_proto { u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool gpl_only; bool pkt_access; bool might_sleep; /* set to true if helper follows contract for llvm * attribute bpf_fastcall: * - void functions do not scratch r0 * - functions taking N arguments scratch only registers r1-rN */ bool allow_fastcall; enum bpf_return_type ret_type; union { struct { enum bpf_arg_type arg1_type; enum bpf_arg_type arg2_type; enum bpf_arg_type arg3_type; enum bpf_arg_type arg4_type; enum bpf_arg_type arg5_type; }; enum bpf_arg_type arg_type[5]; }; union { struct { u32 *arg1_btf_id; u32 *arg2_btf_id; u32 *arg3_btf_id; u32 *arg4_btf_id; u32 *arg5_btf_id; }; u32 *arg_btf_id[5]; struct { size_t arg1_size; size_t arg2_size; size_t arg3_size; size_t arg4_size; size_t arg5_size; }; size_t arg_size[5]; }; int *ret_btf_id; /* return value btf_id */ bool (*allowed)(const struct bpf_prog *prog); }; /* bpf_context is intentionally undefined structure. Pointer to bpf_context is * the first argument to eBPF programs. * For socket filters: 'struct bpf_context *' == 'struct sk_buff *' */ struct bpf_context; enum bpf_access_type { BPF_READ = 1, BPF_WRITE = 2 }; /* types of values stored in eBPF registers */ /* Pointer types represent: * pointer * pointer + imm * pointer + (u16) var * pointer + (u16) var + imm * if (range > 0) then [ptr, ptr + range - off) is safe to access * if (id > 0) means that some 'var' was added * if (off > 0) means that 'imm' was added */ enum bpf_reg_type { NOT_INIT = 0, /* nothing was written into register */ SCALAR_VALUE, /* reg doesn't contain a valid pointer */ PTR_TO_CTX, /* reg points to bpf_context */ CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ PTR_TO_MAP_VALUE, /* reg points to map element value */ PTR_TO_MAP_KEY, /* reg points to a map element key */ PTR_TO_STACK, /* reg == frame_pointer + offset */ PTR_TO_PACKET_META, /* skb->data - meta_len */ PTR_TO_PACKET, /* reg points to skb->data */ PTR_TO_PACKET_END, /* skb->data + headlen */ PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */ PTR_TO_SOCKET, /* reg points to struct bpf_sock */ PTR_TO_SOCK_COMMON, /* reg points to sock_common */ PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ /* PTR_TO_BTF_ID points to a kernel struct that does not need * to be null checked by the BPF program. This does not imply the * pointer is _not_ null and in practice this can easily be a null * pointer when reading pointer chains. 
The assumption is program * context will handle null pointer dereference typically via fault * handling. The verifier must keep this in mind and can make no * assumptions about null or non-null when doing branch analysis. * Further, when passed into helpers the helpers can not, without * additional context, assume the value is non-null. */ PTR_TO_BTF_ID, PTR_TO_MEM, /* reg points to valid memory region */ PTR_TO_ARENA, PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_FUNC, /* reg points to a bpf program function */ PTR_TO_INSN, /* reg points to a bpf program instruction */ CONST_PTR_TO_DYNPTR, /* reg points to a const struct bpf_dynptr */ __BPF_REG_TYPE_MAX, /* Extended reg_types. */ PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCKET, PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON, PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | PTR_TO_TCP_SOCK, /* PTR_TO_BTF_ID_OR_NULL points to a kernel struct that has not * been checked for null. Used primarily to inform the verifier * an explicit null check is required for this struct. */ PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. */ __BPF_REG_TYPE_LIMIT = BPF_TYPE_LIMIT, }; static_assert(__BPF_REG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); /* The information passed from prog-specific *_is_valid_access * back to the verifier. */ struct bpf_insn_access_aux { enum bpf_reg_type reg_type; bool is_ldsx; union { int ctx_field_size; struct { struct btf *btf; u32 btf_id; u32 ref_obj_id; }; }; struct bpf_verifier_log *log; /* for verbose logs */ bool is_retval; /* is accessing function return value ? */ }; static inline void bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size) { aux->ctx_field_size = size; } static bool bpf_is_ldimm64(const struct bpf_insn *insn) { return insn->code == (BPF_LD | BPF_IMM | BPF_DW); } static inline bool bpf_pseudo_func(const struct bpf_insn *insn) { return bpf_is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC; } /* Given a BPF_ATOMIC instruction @atomic_insn, return true if it is an * atomic load or store, and false if it is a read-modify-write instruction. 
*/ static inline bool bpf_atomic_is_load_store(const struct bpf_insn *atomic_insn) { switch (atomic_insn->imm) { case BPF_LOAD_ACQ: case BPF_STORE_REL: return true; default: return false; } } struct bpf_prog_ops { int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); }; struct bpf_reg_state; struct bpf_verifier_ops { /* return eBPF function prototype for verification */ const struct bpf_func_proto * (*get_func_proto)(enum bpf_func_id func_id, const struct bpf_prog *prog); /* return true if 'size' wide access at offset 'off' within bpf_context * with 'type' (read or write) is allowed */ bool (*is_valid_access)(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, const struct bpf_prog *prog); int (*gen_epilogue)(struct bpf_insn *insn, const struct bpf_prog *prog, s16 ctx_stack_off); int (*gen_ld_abs)(const struct bpf_insn *orig, struct bpf_insn *insn_buf); u32 (*convert_ctx_access)(enum bpf_access_type type, const struct bpf_insn *src, struct bpf_insn *dst, struct bpf_prog *prog, u32 *target_size); int (*btf_struct_access)(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size); }; struct bpf_prog_offload_ops { /* verifier basic callbacks */ int (*insn_hook)(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx); int (*finalize)(struct bpf_verifier_env *env); /* verifier optimization callbacks (called after .finalize) */ int (*replace_insn)(struct bpf_verifier_env *env, u32 off, struct bpf_insn *insn); int (*remove_insns)(struct bpf_verifier_env *env, u32 off, u32 cnt); /* program management callbacks */ int (*prepare)(struct bpf_prog *prog); int (*translate)(struct bpf_prog *prog); void (*destroy)(struct bpf_prog *prog); }; struct bpf_prog_offload { struct bpf_prog *prog; struct net_device *netdev; struct bpf_offload_dev *offdev; void *dev_priv; struct list_head offloads; bool dev_state; bool opt_failed; void *jited_image; u32 jited_len; }; /* The longest tracepoint has 12 args. * See include/trace/bpf_probe.h */ #define MAX_BPF_FUNC_ARGS 12 /* The maximum number of arguments passed through registers * a single function may have. */ #define MAX_BPF_FUNC_REG_ARGS 5 /* The argument is a structure or a union. */ #define BTF_FMODEL_STRUCT_ARG BIT(0) /* The argument is signed. */ #define BTF_FMODEL_SIGNED_ARG BIT(1) struct btf_func_model { u8 ret_size; u8 ret_flags; u8 nr_args; u8 arg_size[MAX_BPF_FUNC_ARGS]; u8 arg_flags[MAX_BPF_FUNC_ARGS]; }; /* Restore arguments before returning from trampoline to let original function * continue executing. This flag is used for fentry progs when there are no * fexit progs. */ #define BPF_TRAMP_F_RESTORE_REGS BIT(0) /* Call original function after fentry progs, but before fexit progs. * Makes sense for fentry/fexit, normal calls and indirect calls. */ #define BPF_TRAMP_F_CALL_ORIG BIT(1) /* Skip current frame and return to parent. Makes sense for fentry/fexit * programs only. Should not be used with normal calls and indirect calls. */ #define BPF_TRAMP_F_SKIP_FRAME BIT(2) /* Store IP address of the caller on the trampoline stack, * so it's available for trampoline's programs. */ #define BPF_TRAMP_F_IP_ARG BIT(3) /* Return the return value of fentry prog. Only used by bpf_struct_ops. */ #define BPF_TRAMP_F_RET_FENTRY_RET BIT(4) /* Get original function from stack instead of from provided direct address. * Makes sense for trampolines with fexit or fmod_ret programs. 
*/ #define BPF_TRAMP_F_ORIG_STACK BIT(5) /* This trampoline is on a function with another ftrace_ops with IPMODIFY, * e.g., a live patch. This flag is set and cleared by ftrace call backs, */ #define BPF_TRAMP_F_SHARE_IPMODIFY BIT(6) /* Indicate that current trampoline is in a tail call context. Then, it has to * cache and restore tail_call_cnt to avoid infinite tail call loop. */ #define BPF_TRAMP_F_TAIL_CALL_CTX BIT(7) /* * Indicate the trampoline should be suitable to receive indirect calls; * without this indirectly calling the generated code can result in #UD/#CP, * depending on the CFI options. * * Used by bpf_struct_ops. * * Incompatible with FENTRY usage, overloads @func_addr argument. */ #define BPF_TRAMP_F_INDIRECT BIT(8) /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. */ enum { #if defined(__s390x__) BPF_MAX_TRAMP_LINKS = 27, #else BPF_MAX_TRAMP_LINKS = 38, #endif }; #define BPF_TRAMP_COOKIE_INDEX_SHIFT 8 #define BPF_TRAMP_IS_RETURN_SHIFT 63 struct bpf_tramp_links { struct bpf_tramp_link *links[BPF_MAX_TRAMP_LINKS]; int nr_links; }; struct bpf_tramp_run_ctx; /* Different use cases for BPF trampoline: * 1. replace nop at the function entry (kprobe equivalent) * flags = BPF_TRAMP_F_RESTORE_REGS * fentry = a set of programs to run before returning from trampoline * * 2. replace nop at the function entry (kprobe + kretprobe equivalent) * flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME * orig_call = fentry_ip + MCOUNT_INSN_SIZE * fentry = a set of program to run before calling original function * fexit = a set of program to run after original function * * 3. replace direct call instruction anywhere in the function body * or assign a function pointer for indirect call (like tcp_congestion_ops->cong_avoid) * With flags = 0 * fentry = a set of programs to run before returning from trampoline * With flags = BPF_TRAMP_F_CALL_ORIG * orig_call = original callback addr or direct function addr * fentry = a set of program to run before calling original function * fexit = a set of program to run after original function */ struct bpf_tramp_image; int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *func_addr); void *arch_alloc_bpf_trampoline(unsigned int size); void arch_free_bpf_trampoline(void *image, unsigned int size); int __must_check arch_protect_bpf_trampoline(void *image, unsigned int size); int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *func_addr); u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr); void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr); typedef u64 (*bpf_trampoline_enter_t)(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); typedef void (*bpf_trampoline_exit_t)(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx); bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog); bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP static inline bool bpf_trampoline_use_jmp(u64 flags) { return flags & BPF_TRAMP_F_CALL_ORIG && !(flags & BPF_TRAMP_F_SKIP_FRAME); } #else static inline bool bpf_trampoline_use_jmp(u64 flags) { return 
false; } #endif struct bpf_ksym { unsigned long start; unsigned long end; char name[KSYM_NAME_LEN]; struct list_head lnode; struct latch_tree_node tnode; bool prog; u32 fp_start; u32 fp_end; }; enum bpf_tramp_prog_type { BPF_TRAMP_FENTRY, BPF_TRAMP_FEXIT, BPF_TRAMP_MODIFY_RETURN, BPF_TRAMP_MAX, BPF_TRAMP_REPLACE, /* more than MAX */ BPF_TRAMP_FSESSION, }; struct bpf_tramp_image { void *image; int size; struct bpf_ksym ksym; struct percpu_ref pcref; void *ip_after_call; void *ip_epilogue; union { struct rcu_head rcu; struct work_struct work; }; }; struct bpf_trampoline { /* hlist for trampoline_key_table */ struct hlist_node hlist_key; /* hlist for trampoline_ip_table */ struct hlist_node hlist_ip; struct ftrace_ops *fops; /* serializes access to fields of this trampoline */ struct mutex mutex; refcount_t refcnt; u32 flags; u64 key; unsigned long ip; struct { struct btf_func_model model; void *addr; bool ftrace_managed; } func; /* if !NULL this is BPF_PROG_TYPE_EXT program that extends another BPF * program by replacing one of its functions. func.addr is the address * of the function it replaced. */ struct bpf_prog *extension_prog; /* list of BPF programs using this trampoline */ struct hlist_head progs_hlist[BPF_TRAMP_MAX]; /* Number of attached programs. A counter per kind. */ int progs_cnt[BPF_TRAMP_MAX]; /* Executable image of trampoline */ struct bpf_tramp_image *cur_image; }; struct bpf_attach_target_info { struct btf_func_model fmodel; long tgt_addr; struct module *tgt_mod; const char *tgt_name; const struct btf_type *tgt_type; }; #define BPF_DISPATCHER_MAX 48 /* Fits in 2048B */ struct bpf_dispatcher_prog { struct bpf_prog *prog; refcount_t users; }; struct bpf_dispatcher { /* dispatcher mutex */ struct mutex mutex; void *func; struct bpf_dispatcher_prog progs[BPF_DISPATCHER_MAX]; int num_progs; void *image; void *rw_image; u32 image_off; struct bpf_ksym ksym; #ifdef CONFIG_HAVE_STATIC_CALL struct static_call_key *sc_key; void *sc_tramp; #endif }; #ifndef __bpfcall #define __bpfcall __nocfi #endif static __always_inline __bpfcall unsigned int bpf_dispatcher_nop_func( const void *ctx, const struct bpf_insn *insnsi, bpf_func_t bpf_func) { return bpf_func(ctx, insnsi); } /* the implementation of the opaque uapi struct bpf_dynptr */ struct bpf_dynptr_kern { void *data; /* Size represents the number of usable bytes of dynptr data. * If for example the offset is at 4 for a local dynptr whose data is * of type u64, the number of usable bytes is 4. * * The upper 8 bits are reserved. 
It is as follows: * Bits 0 - 23 = size * Bits 24 - 30 = dynptr type * Bit 31 = whether dynptr is read-only */ u32 size; u32 offset; } __aligned(8); enum bpf_dynptr_type { BPF_DYNPTR_TYPE_INVALID, /* Points to memory that is local to the bpf program */ BPF_DYNPTR_TYPE_LOCAL, /* Underlying data is a ringbuf record */ BPF_DYNPTR_TYPE_RINGBUF, /* Underlying data is a sk_buff */ BPF_DYNPTR_TYPE_SKB, /* Underlying data is a xdp_buff */ BPF_DYNPTR_TYPE_XDP, /* Points to skb_metadata_end()-skb_metadata_len() */ BPF_DYNPTR_TYPE_SKB_META, /* Underlying data is a file */ BPF_DYNPTR_TYPE_FILE, }; int bpf_dynptr_check_size(u64 size); u64 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr); const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u64 len); void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u64 len); bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr); int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, void *src, u64 len, u64 flags); void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, void *buffer__nullable, u64 buffer__szk); static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u64 offset, u64 len) { u64 size = __bpf_dynptr_size(ptr); if (len > size || offset > size - len) return -E2BIG; return 0; } #ifdef CONFIG_BPF_JIT int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog); int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog); struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info); void bpf_trampoline_put(struct bpf_trampoline *tr); int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs); /* * When the architecture supports STATIC_CALL replace the bpf_dispatcher_fn * indirection with a direct call to the bpf program. If the architecture does * not have STATIC_CALL, avoid a double-indirection. 
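 *
 * As a rough illustration of the two cases handled by the macros below (not
 * a literal expansion), taking the in-tree "xdp" dispatcher as an example:
 *
 *	// with CONFIG_HAVE_STATIC_CALL: attaching a program patches the
 *	// call site, so the fast path becomes a direct call
 *	__BPF_DISPATCHER_UPDATE(d, new_image ?: &bpf_dispatcher_nop_func);
 *	ret = static_call(bpf_dispatcher_xdp_call)(ctx, insnsi, bpf_func);
 *
 *	// without CONFIG_HAVE_STATIC_CALL: fall back to one indirect call
 *	ret = bpf_func(ctx, insnsi);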
*/ #ifdef CONFIG_HAVE_STATIC_CALL #define __BPF_DISPATCHER_SC_INIT(_name) \ .sc_key = &STATIC_CALL_KEY(_name), \ .sc_tramp = STATIC_CALL_TRAMP_ADDR(_name), #define __BPF_DISPATCHER_SC(name) \ DEFINE_STATIC_CALL(bpf_dispatcher_##name##_call, bpf_dispatcher_nop_func) #define __BPF_DISPATCHER_CALL(name) \ static_call(bpf_dispatcher_##name##_call)(ctx, insnsi, bpf_func) #define __BPF_DISPATCHER_UPDATE(_d, _new) \ __static_call_update((_d)->sc_key, (_d)->sc_tramp, (_new)) #else #define __BPF_DISPATCHER_SC_INIT(name) #define __BPF_DISPATCHER_SC(name) #define __BPF_DISPATCHER_CALL(name) bpf_func(ctx, insnsi) #define __BPF_DISPATCHER_UPDATE(_d, _new) #endif #define BPF_DISPATCHER_INIT(_name) { \ .mutex = __MUTEX_INITIALIZER(_name.mutex), \ .func = &_name##_func, \ .progs = {}, \ .num_progs = 0, \ .image = NULL, \ .image_off = 0, \ .ksym = { \ .name = #_name, \ .lnode = LIST_HEAD_INIT(_name.ksym.lnode), \ }, \ __BPF_DISPATCHER_SC_INIT(_name##_call) \ } #define DEFINE_BPF_DISPATCHER(name) \ __BPF_DISPATCHER_SC(name); \ noinline __bpfcall unsigned int bpf_dispatcher_##name##_func( \ const void *ctx, \ const struct bpf_insn *insnsi, \ bpf_func_t bpf_func) \ { \ return __BPF_DISPATCHER_CALL(name); \ } \ EXPORT_SYMBOL(bpf_dispatcher_##name##_func); \ struct bpf_dispatcher bpf_dispatcher_##name = \ BPF_DISPATCHER_INIT(bpf_dispatcher_##name); #define DECLARE_BPF_DISPATCHER(name) \ unsigned int bpf_dispatcher_##name##_func( \ const void *ctx, \ const struct bpf_insn *insnsi, \ bpf_func_t bpf_func); \ extern struct bpf_dispatcher bpf_dispatcher_##name; #define BPF_DISPATCHER_FUNC(name) bpf_dispatcher_##name##_func #define BPF_DISPATCHER_PTR(name) (&bpf_dispatcher_##name) void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to); /* Called only from JIT-enabled code, so there's no need for stubs. 
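 *
 * A sketch of how a generated image might publish its symbol with the ksym
 * helpers below (loosely modelled on the trampoline code; the symbol name is
 * illustrative):
 *
 *	snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key);
 *	bpf_image_ksym_init(image, size, ksym);
 *	bpf_image_ksym_add(ksym);
 *	...
 *	bpf_image_ksym_del(ksym);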
*/ void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym); void bpf_image_ksym_add(struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); bool bpf_has_frame_pointer(unsigned long ip); int bpf_jit_charge_modmem(u32 size); void bpf_jit_uncharge_modmem(u32 size); bool bpf_prog_has_trampoline(const struct bpf_prog *prog); bool bpf_insn_is_indirect_target(const struct bpf_verifier_env *env, const struct bpf_prog *prog, int insn_idx); #else static inline int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { return -ENOTSUPP; } static inline int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { return -ENOTSUPP; } static inline struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info) { return NULL; } static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {} #define DEFINE_BPF_DISPATCHER(name) #define DECLARE_BPF_DISPATCHER(name) #define BPF_DISPATCHER_FUNC(name) bpf_dispatcher_nop_func #define BPF_DISPATCHER_PTR(name) NULL static inline void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to) {} static inline bool is_bpf_image_address(unsigned long address) { return false; } static inline bool bpf_prog_has_trampoline(const struct bpf_prog *prog) { return false; } #endif struct bpf_func_info_aux { u16 linkage; bool unreliable; bool called : 1; bool verified : 1; }; enum bpf_jit_poke_reason { BPF_POKE_REASON_TAIL_CALL, }; /* Descriptor of pokes pointing /into/ the JITed image. */ struct bpf_jit_poke_descriptor { void *tailcall_target; void *tailcall_bypass; void *bypass_addr; void *aux; union { struct { struct bpf_map *map; u32 key; } tail_call; }; bool tailcall_target_stable; u8 adj_off; u16 reason; u32 insn_idx; }; /* reg_type info for ctx arguments */ struct bpf_ctx_arg_aux { u32 offset; enum bpf_reg_type reg_type; struct btf *btf; u32 btf_id; u32 ref_obj_id; bool refcounted; }; struct btf_mod_pair { struct btf *btf; struct module *module; }; struct bpf_kfunc_desc_tab; enum bpf_stream_id { BPF_STDOUT = 1, BPF_STDERR = 2, }; struct bpf_stream_elem { struct llist_node node; int total_len; int consumed_len; char str[]; }; enum { /* 100k bytes */ BPF_STREAM_MAX_CAPACITY = 100000ULL, }; struct bpf_stream { atomic_t capacity; struct llist_head log; /* list of in-flight stream elements in LIFO order */ struct mutex lock; /* lock protecting backlog_{head,tail} */ struct llist_node *backlog_head; /* list of in-flight stream elements in FIFO order */ struct llist_node *backlog_tail; /* tail of the list above */ }; struct bpf_stream_stage { struct llist_head log; int len; }; struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; u32 used_btf_cnt; u32 max_ctx_offset; u32 max_pkt_offset; u32 max_tp_access; u32 stack_depth; u32 id; u32 func_cnt; /* used by non-func prog as the number of func progs */ u32 real_func_cnt; /* includes hidden progs, only used for JIT and freeing progs */ u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */ u32 attach_btf_id; /* in-kernel BTF type id to attach to */ u32 attach_st_ops_member_off; u32 ctx_arg_info_size; u32 max_rdonly_access; u32 max_rdwr_access; u32 subprog_start; struct btf *attach_btf; struct bpf_ctx_arg_aux *ctx_arg_info; void __percpu *priv_stack_ptr; struct mutex dst_mutex; /* protects dst_* pointers 
below, *after* prog becomes visible */ struct bpf_prog *dst_prog; struct bpf_trampoline *dst_trampoline; enum bpf_prog_type saved_dst_prog_type; enum bpf_attach_type saved_dst_attach_type; bool verifier_zext; /* Zero extensions has been inserted by verifier. */ bool dev_bound; /* Program is bound to the netdev. */ bool offload_requested; /* Program is bound and offloaded to the netdev. */ bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ bool attach_tracing_prog; /* true if tracing another tracing program */ bool func_proto_unreliable; bool tail_call_reachable; bool xdp_has_frags; bool exception_cb; bool exception_boundary; bool is_extended; /* true if extended by freplace program */ bool jits_use_priv_stack; bool priv_stack_requested; bool changes_pkt_data; bool might_sleep; bool kprobe_write_ctx; u64 prog_array_member_cnt; /* counts how many times as member of prog_array */ struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */ struct bpf_arena *arena; void (*recursion_detected)(struct bpf_prog *prog); /* callback if recursion is detected */ /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; /* function name for valid attach_btf_id */ const char *attach_func_name; struct bpf_prog **func; struct bpf_prog_aux *main_prog_aux; void *jit_data; /* JIT specific data. arch dependent */ struct bpf_jit_poke_descriptor *poke_tab; struct bpf_kfunc_desc_tab *kfunc_tab; struct bpf_kfunc_btf_tab *kfunc_btf_tab; u32 size_poke_tab; #ifdef CONFIG_FINEIBT struct bpf_ksym ksym_prefix; #endif struct bpf_ksym ksym; const struct bpf_prog_ops *ops; const struct bpf_struct_ops *st_ops; struct bpf_map **used_maps; struct mutex used_maps_mutex; /* mutex for used_maps and used_map_cnt */ struct btf_mod_pair *used_btfs; struct bpf_prog *prog; struct user_struct *user; u64 load_time; /* ns since boottime */ u32 verified_insns; int cgroup_atype; /* enum cgroup_bpf_attach_type */ struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; char name[BPF_OBJ_NAME_LEN]; u64 (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp, u64, u64); #ifdef CONFIG_SECURITY void *security; #endif struct bpf_token *token; struct bpf_prog_offload *offload; struct btf *btf; struct bpf_func_info *func_info; struct bpf_func_info_aux *func_info_aux; /* bpf_line_info loaded from userspace. linfo->insn_off * has the xlated insn offset. * Both the main and sub prog share the same linfo. * The subprog can access its first linfo by * using the linfo_idx. */ struct bpf_line_info *linfo; /* jited_linfo is the jited addr of the linfo. It has a * one to one mapping to linfo: * jited_linfo[i] is the jited addr for the linfo[i]->insn_off. * Both the main and sub prog share the same jited_linfo. * The subprog can access its first jited_linfo by * using the linfo_idx. */ void **jited_linfo; u32 func_info_cnt; u32 nr_linfo; /* subprog can use linfo_idx to access its first linfo and * jited_linfo. * main prog always has linfo_idx == 0 */ u32 linfo_idx; struct module *mod; u32 num_exentries; struct exception_table_entry *extable; union { struct work_struct work; struct rcu_head rcu; }; struct bpf_stream stream[2]; struct mutex st_ops_assoc_mutex; struct bpf_map __rcu *st_ops_assoc; }; #define BPF_NR_CONTEXTS 4 /* normal, softirq, hardirq, NMI */ struct bpf_prog { u16 pages; /* Number of allocated pages */ u16 jited:1, /* Is our filter JIT'ed? */ jit_requested:1,/* archs need to JIT the prog */ gpl_compatible:1, /* Is filter GPL compatible? 
*/ cb_access:1, /* Is control block accessed? */ dst_needed:1, /* Do we need dst entry? */ blinding_requested:1, /* needs constant blinding */ blinded:1, /* Was blinded */ is_func:1, /* program is a bpf function */ kprobe_override:1, /* Do we override a kprobe? */ has_callchain_buf:1, /* callchain buffer allocated? */ enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */ call_get_func_ip:1, /* Do we call get_func_ip() */ call_session_cookie:1, /* Do we call bpf_session_cookie() */ tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */ sleepable:1; /* BPF program is sleepable */ enum bpf_prog_type type; /* Type of BPF program */ enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ u32 jited_len; /* Size of jited insns in bytes */ union { u8 digest[SHA256_DIGEST_SIZE]; u8 tag[BPF_TAG_SIZE]; }; struct bpf_prog_stats __percpu *stats; u8 __percpu *active; /* u8[BPF_NR_CONTEXTS] for recursion protection */ unsigned int (*bpf_func)(const void *ctx, const struct bpf_insn *insn); struct bpf_prog_aux *aux; /* Auxiliary fields */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ /* Instructions for interpreter */ union { DECLARE_FLEX_ARRAY(struct sock_filter, insns); DECLARE_FLEX_ARRAY(struct bpf_insn, insnsi); }; }; struct bpf_array_aux { /* Programs with direct jumps into programs part of this array. */ struct list_head poke_progs; struct bpf_map *map; struct mutex poke_mutex; struct work_struct work; }; struct bpf_link { atomic64_t refcnt; u32 id; enum bpf_link_type type; const struct bpf_link_ops *ops; struct bpf_prog *prog; u32 flags; enum bpf_attach_type attach_type; /* rcu is used before freeing, work can be used to schedule that * RCU-based freeing before that, so they never overlap */ union { struct rcu_head rcu; struct work_struct work; }; /* whether BPF link itself has "sleepable" semantics, which can differ * from underlying BPF program having a "sleepable" semantics, as BPF * link's semantics is determined by target attach hook */ bool sleepable; }; struct bpf_link_ops { void (*release)(struct bpf_link *link); /* deallocate link resources callback, called without RCU grace period * waiting */ void (*dealloc)(struct bpf_link *link); /* deallocate link resources callback, called after RCU grace period; * if either the underlying BPF program is sleepable or BPF link's * target hook is sleepable, we'll go through tasks trace RCU GP and * then "classic" RCU GP; this need for chaining tasks trace and * classic RCU GPs is designated by setting bpf_link->sleepable flag * * For non-sleepable tracepoint links we go through SRCU gp instead, * since RCU is not used in that case. Sleepable tracepoints still * follow the scheme above. 
*/ void (*dealloc_deferred)(struct bpf_link *link); int (*detach)(struct bpf_link *link); int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog); void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq); int (*fill_link_info)(const struct bpf_link *link, struct bpf_link_info *info); int (*update_map)(struct bpf_link *link, struct bpf_map *new_map, struct bpf_map *old_map); __poll_t (*poll)(struct file *file, struct poll_table_struct *pts); }; struct bpf_tramp_link { struct bpf_link link; struct hlist_node tramp_hlist; u64 cookie; }; struct bpf_shim_tramp_link { struct bpf_tramp_link link; struct bpf_trampoline *trampoline; }; struct bpf_tracing_link { struct bpf_tramp_link link; struct bpf_trampoline *trampoline; struct bpf_prog *tgt_prog; }; struct bpf_fsession_link { struct bpf_tracing_link link; struct bpf_tramp_link fexit; }; struct bpf_raw_tp_link { struct bpf_link link; struct bpf_raw_event_map *btp; u64 cookie; }; struct bpf_link_primer { struct bpf_link *link; struct file *file; int fd; u32 id; }; struct bpf_mount_opts { kuid_t uid; kgid_t gid; umode_t mode; /* BPF token-related delegation options */ u64 delegate_cmds; u64 delegate_maps; u64 delegate_progs; u64 delegate_attachs; }; struct bpf_token { struct work_struct work; atomic64_t refcnt; struct user_namespace *userns; u64 allowed_cmds; u64 allowed_maps; u64 allowed_progs; u64 allowed_attachs; #ifdef CONFIG_SECURITY void *security; #endif }; struct bpf_struct_ops_value; struct btf_member; #define BPF_STRUCT_OPS_MAX_NR_MEMBERS 64 /** * struct bpf_struct_ops - A structure of callbacks allowing a subsystem to * define a BPF_MAP_TYPE_STRUCT_OPS map type composed * of BPF_PROG_TYPE_STRUCT_OPS progs. * @verifier_ops: A structure of callbacks that are invoked by the verifier * when determining whether the struct_ops progs in the * struct_ops map are valid. * @init: A callback that is invoked a single time, and before any other * callback, to initialize the structure. A nonzero return value means * the subsystem could not be initialized. * @check_member: When defined, a callback invoked by the verifier to allow * the subsystem to determine if an entry in the struct_ops map * is valid. A nonzero return value means that the map is * invalid and should be rejected by the verifier. * @init_member: A callback that is invoked for each member of the struct_ops * map to allow the subsystem to initialize the member. A nonzero * value means the member could not be initialized. This callback * is exclusive with the @type, @type_id, @value_type, and * @value_id fields. * @reg: A callback that is invoked when the struct_ops map has been * initialized and is being attached to. Zero means the struct_ops map * has been successfully registered and is live. A nonzero return value * means the struct_ops map could not be registered. * @unreg: A callback that is invoked when the struct_ops map should be * unregistered. * @update: A callback that is invoked when the live struct_ops map is being * updated to contain new values. This callback is only invoked when * the struct_ops map is loaded with BPF_F_LINK. If not defined, the * it is assumed that the struct_ops map cannot be updated. * @validate: A callback that is invoked after all of the members have been * initialized. This callback should perform static checks on the * map, meaning that it should either fail or succeed * deterministically. 
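 *		For example, a subsystem may use this hook to reject a map
 *		that leaves a mandatory operation NULL.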
A struct_ops map that has been validated may * not necessarily succeed in being registered if the call to @reg * fails. For example, a valid struct_ops map may be loaded, but * then fail to be registered due to there being another active * struct_ops map on the system in the subsystem already. For this * reason, if this callback is not defined, the check is skipped as * the struct_ops map will have final verification performed in * @reg. * @cfi_stubs: Pointer to a structure of stub functions for CFI. These stubs * provide the correct Control Flow Integrity hashes for the * trampolines generated by BPF struct_ops. * @owner: The module that owns this struct_ops. Used for module reference * counting to ensure the module providing the struct_ops cannot be * unloaded while in use. * @name: The name of the struct bpf_struct_ops object. * @func_models: Func models */ struct bpf_struct_ops { const struct bpf_verifier_ops *verifier_ops; int (*init)(struct btf *btf); int (*check_member)(const struct btf_type *t, const struct btf_member *member, const struct bpf_prog *prog); int (*init_member)(const struct btf_type *t, const struct btf_member *member, void *kdata, const void *udata); int (*reg)(void *kdata, struct bpf_link *link); void (*unreg)(void *kdata, struct bpf_link *link); int (*update)(void *kdata, void *old_kdata, struct bpf_link *link); int (*validate)(void *kdata); void *cfi_stubs; struct module *owner; const char *name; struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS]; }; /* Every member of a struct_ops type has an instance even a member is not * an operator (function pointer). The "info" field will be assigned to * prog->aux->ctx_arg_info of BPF struct_ops programs to provide the * argument information required by the verifier to verify the program. * * btf_ctx_access() will lookup prog->aux->ctx_arg_info to find the * corresponding entry for an given argument. */ struct bpf_struct_ops_arg_info { struct bpf_ctx_arg_aux *info; u32 cnt; }; struct bpf_struct_ops_desc { struct bpf_struct_ops *st_ops; const struct btf_type *type; const struct btf_type *value_type; u32 type_id; u32 value_id; /* Collection of argument information for each member */ struct bpf_struct_ops_arg_info *arg_info; }; enum bpf_struct_ops_state { BPF_STRUCT_OPS_STATE_INIT, BPF_STRUCT_OPS_STATE_INUSE, BPF_STRUCT_OPS_STATE_TOBEFREE, BPF_STRUCT_OPS_STATE_READY, }; struct bpf_struct_ops_common_value { refcount_t refcnt; enum bpf_struct_ops_state state; }; static inline bool bpf_prog_get_recursion_context(struct bpf_prog *prog) { #ifdef CONFIG_ARM64 u8 rctx = interrupt_context_level(); u8 *active = this_cpu_ptr(prog->active); u32 val; preempt_disable(); active[rctx]++; val = le32_to_cpu(*(__le32 *)active); preempt_enable(); if (val != BIT(rctx * 8)) return false; return true; #else return this_cpu_inc_return(*(int __percpu *)(prog->active)) == 1; #endif } static inline void bpf_prog_put_recursion_context(struct bpf_prog *prog) { #ifdef CONFIG_ARM64 u8 rctx = interrupt_context_level(); u8 *active = this_cpu_ptr(prog->active); preempt_disable(); active[rctx]--; preempt_enable(); #else this_cpu_dec(*(int __percpu *)(prog->active)); #endif } #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) /* This macro helps developer to register a struct_ops type and generate * type information correctly. Developers should use this macro to register * a struct_ops type instead of calling __register_bpf_struct_ops() directly. 
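 *
 * A registration typically looks roughly like the sketch below (modelled on
 * the in-tree tcp_congestion_ops user; the "my_ops" names are placeholders):
 *
 *	static struct bpf_struct_ops bpf_my_ops = {
 *		.verifier_ops	= &bpf_my_ops_verifier_ops,
 *		.init		= bpf_my_ops_init,
 *		.check_member	= bpf_my_ops_check_member,
 *		.init_member	= bpf_my_ops_init_member,
 *		.reg		= bpf_my_ops_reg,
 *		.unreg		= bpf_my_ops_unreg,
 *		.cfi_stubs	= &bpf_my_ops_cfi_stubs,
 *		.name		= "my_ops",
 *		.owner		= THIS_MODULE,
 *	};
 *
 *	static int __init bpf_my_ops_init_late(void)
 *	{
 *		return register_bpf_struct_ops(&bpf_my_ops, my_ops);
 *	}
 *	late_initcall(bpf_my_ops_init_late);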
*/ #define register_bpf_struct_ops(st_ops, type) \ ({ \ struct bpf_struct_ops_##type { \ struct bpf_struct_ops_common_value common; \ struct type data ____cacheline_aligned_in_smp; \ }; \ BTF_TYPE_EMIT(struct bpf_struct_ops_##type); \ __register_bpf_struct_ops(st_ops); \ }) #define BPF_MODULE_OWNER ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA)) bool bpf_struct_ops_get(const void *kdata); void bpf_struct_ops_put(const void *kdata); int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff); int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, void *value); int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, struct bpf_tramp_link *link, const struct btf_func_model *model, void *stub_func, void **image, u32 *image_off, bool allow_alloc); void bpf_struct_ops_image_free(void *image); static inline bool bpf_try_module_get(const void *data, struct module *owner) { if (owner == BPF_MODULE_OWNER) return bpf_struct_ops_get(data); else return try_module_get(owner); } static inline void bpf_module_put(const void *data, struct module *owner) { if (owner == BPF_MODULE_OWNER) bpf_struct_ops_put(data); else module_put(owner); } int bpf_struct_ops_link_create(union bpf_attr *attr); int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map); void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog); void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux); u32 bpf_struct_ops_id(const void *kdata); #ifdef CONFIG_NET /* Define it here to avoid the use of forward declaration */ struct bpf_dummy_ops_state { int val; }; struct bpf_dummy_ops { int (*test_1)(struct bpf_dummy_ops_state *cb); int (*test_2)(struct bpf_dummy_ops_state *cb, int a1, unsigned short a2, char a3, unsigned long a4); int (*test_sleepable)(struct bpf_dummy_ops_state *cb); }; int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); #endif int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, struct btf *btf, struct bpf_verifier_log *log); void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map); void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc); #else #define register_bpf_struct_ops(st_ops, type) ({ (void *)(st_ops); 0; }) static inline bool bpf_try_module_get(const void *data, struct module *owner) { return try_module_get(owner); } static inline void bpf_module_put(const void *data, struct module *owner) { module_put(owner); } static inline int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff) { return -ENOTSUPP; } static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, void *value) { return -EINVAL; } static inline int bpf_struct_ops_link_create(union bpf_attr *attr) { return -EOPNOTSUPP; } static inline int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map) { return -EOPNOTSUPP; } static inline void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog) { } static inline void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux) { return NULL; } static inline void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map) { } static inline void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc) { } #endif static inline int bpf_fsession_cnt(struct bpf_tramp_links *links) { struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY]; int cnt = 0; for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) { if 
(fentries.links[i]->link.prog->expected_attach_type == BPF_TRACE_FSESSION) cnt++; } return cnt; } static inline bool bpf_prog_calls_session_cookie(struct bpf_tramp_link *link) { return link->link.prog->call_session_cookie; } static inline int bpf_fsession_cookie_cnt(struct bpf_tramp_links *links) { struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY]; int cnt = 0; for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) { if (bpf_prog_calls_session_cookie(fentries.links[i])) cnt++; } return cnt; } int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog, const struct bpf_ctx_arg_aux *info, u32 cnt); #if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM) int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, int cgroup_atype, enum bpf_attach_type attach_type); void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog); #else static inline int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, int cgroup_atype, enum bpf_attach_type attach_type) { return -EOPNOTSUPP; } static inline void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog) { } #endif struct bpf_array { struct bpf_map map; u32 elem_size; u32 index_mask; struct bpf_array_aux *aux; union { DECLARE_FLEX_ARRAY(char, value) __aligned(8); DECLARE_FLEX_ARRAY(void *, ptrs) __aligned(8); DECLARE_FLEX_ARRAY(void __percpu *, pptrs) __aligned(8); }; }; /* * The bpf_array_get_next_key() function may be used for all array-like * maps, i.e., maps with u32 keys with range [0 ,..., max_entries) */ int bpf_array_get_next_key(struct bpf_map *map, void *key, void *next_key); #define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */ #define MAX_TAIL_CALL_CNT 33 /* Maximum number of loops for bpf_loop and bpf_iter_num. * It's enum to expose it (and thus make it discoverable) through BTF. */ enum { BPF_MAX_LOOPS = 8 * 1024 * 1024, BPF_MAX_TIMED_LOOPS = 0xffff, }; #define BPF_F_ACCESS_MASK (BPF_F_RDONLY | \ BPF_F_RDONLY_PROG | \ BPF_F_WRONLY | \ BPF_F_WRONLY_PROG) #define BPF_MAP_CAN_READ BIT(0) #define BPF_MAP_CAN_WRITE BIT(1) /* Maximum number of user-producer ring buffer samples that can be drained in * a call to bpf_user_ringbuf_drain(). */ #define BPF_MAX_USER_RINGBUF_SAMPLES (128 * 1024) static inline u32 bpf_map_flags_to_cap(struct bpf_map *map) { u32 access_flags = map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); /* Combination of BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG is * not possible. 
*/ if (access_flags & BPF_F_RDONLY_PROG) return BPF_MAP_CAN_READ; else if (access_flags & BPF_F_WRONLY_PROG) return BPF_MAP_CAN_WRITE; else return BPF_MAP_CAN_READ | BPF_MAP_CAN_WRITE; } static inline bool bpf_map_flags_access_ok(u32 access_flags) { return (access_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) != (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); } static inline struct bpf_map_owner *bpf_map_owner_alloc(struct bpf_map *map) { return kzalloc_obj(*map->owner, GFP_ATOMIC); } static inline void bpf_map_owner_free(struct bpf_map *map) { kfree(map->owner); } struct bpf_event_entry { struct perf_event *event; struct file *perf_file; struct file *map_file; struct rcu_head rcu; }; static inline bool map_type_contains_progs(struct bpf_map *map) { return map->map_type == BPF_MAP_TYPE_PROG_ARRAY || map->map_type == BPF_MAP_TYPE_DEVMAP || map->map_type == BPF_MAP_TYPE_CPUMAP; } bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp); int bpf_prog_calc_tag(struct bpf_prog *fp); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void); const struct bpf_func_proto *bpf_get_perf_event_read_value_proto(void); typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src, unsigned long off, unsigned long len); typedef u32 (*bpf_convert_ctx_access_t)(enum bpf_access_type type, const struct bpf_insn *src, struct bpf_insn *dst, struct bpf_prog *prog, u32 *target_size); u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy); /* an array of programs to be executed under rcu_lock. * * Typical usage: * ret = bpf_prog_run_array(rcu_dereference(&bpf_prog_array), ctx, bpf_prog_run); * * the structure returned by bpf_prog_array_alloc() should be populated * with program pointers and the last pointer must be NULL. * The user has to keep refcnt on the program and make sure the program * is removed from the array before bpf_prog_put(). * The 'struct bpf_prog_array *' should only be replaced with xchg() * since other cpus are walking the array of pointers in parallel. 
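 *
 * Replacing the array therefore usually follows a copy / publish / free
 * pattern, sketched here (loosely modelled on the perf event attach path;
 * 'progs' and 'some_lock' stand for the subsystem's own RCU-protected
 * pointer and the lock serializing its updates):
 *
 *	struct bpf_prog_array *old_array, *new_array;
 *	int ret;
 *
 *	old_array = rcu_dereference_protected(*progs, lockdep_is_held(&some_lock));
 *	ret = bpf_prog_array_copy(old_array, NULL, prog, bpf_cookie, &new_array);
 *	if (!ret) {
 *		rcu_assign_pointer(*progs, new_array);
 *		bpf_prog_array_free(old_array);
 *	}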
*/ struct bpf_prog_array_item { struct bpf_prog *prog; union { struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; u64 bpf_cookie; }; }; struct bpf_prog_array { struct rcu_head rcu; struct bpf_prog_array_item items[]; }; /* to avoid allocating empty bpf_prog_array for cgroups that * don't have bpf program attached use one global 'bpf_empty_prog_array' * It will not be modified the caller of bpf_prog_array_alloc() * (since caller requested prog_cnt == 0) * that pointer should be 'freed' by bpf_prog_array_free() */ extern struct bpf_prog_array bpf_empty_prog_array; struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); void bpf_prog_array_free(struct bpf_prog_array *progs); /* Use when traversal over the bpf_prog_array uses tasks_trace rcu */ void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs); int bpf_prog_array_length(struct bpf_prog_array *progs); bool bpf_prog_array_is_empty(struct bpf_prog_array *array); int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs, __u32 __user *prog_ids, u32 cnt); void bpf_prog_array_delete_safe(struct bpf_prog_array *progs, struct bpf_prog *old_prog); int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index); int bpf_prog_array_update_at(struct bpf_prog_array *array, int index, struct bpf_prog *prog); int bpf_prog_array_copy_info(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt, u32 *prog_cnt); int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, u64 bpf_cookie, struct bpf_prog_array **new_array); struct bpf_run_ctx {}; struct bpf_cg_run_ctx { struct bpf_run_ctx run_ctx; const struct bpf_prog_array_item *prog_item; int retval; }; struct bpf_trace_run_ctx { struct bpf_run_ctx run_ctx; u64 bpf_cookie; bool is_uprobe; }; struct bpf_tramp_run_ctx { struct bpf_run_ctx run_ctx; u64 bpf_cookie; struct bpf_run_ctx *saved_run_ctx; }; static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx) { struct bpf_run_ctx *old_ctx = NULL; #ifdef CONFIG_BPF_SYSCALL old_ctx = current->bpf_ctx; current->bpf_ctx = new_ctx; #endif return old_ctx; } static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx) { #ifdef CONFIG_BPF_SYSCALL current->bpf_ctx = old_ctx; #endif } /* BPF program asks to bypass CAP_NET_BIND_SERVICE in bind. */ #define BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE (1 << 0) /* BPF program asks to set CN on the packet. */ #define BPF_RET_SET_CN (1 << 0) typedef u32 (*bpf_prog_run_fn)(const struct bpf_prog *prog, const void *ctx); static __always_inline u32 bpf_prog_run_array(const struct bpf_prog_array *array, const void *ctx, bpf_prog_run_fn run_prog) { const struct bpf_prog_array_item *item; const struct bpf_prog *prog; struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; u32 ret = 1; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held"); if (unlikely(!array)) return ret; run_ctx.is_uprobe = false; migrate_disable(); old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); item = &array->items[0]; while ((prog = READ_ONCE(item->prog))) { run_ctx.bpf_cookie = item->bpf_cookie; ret &= run_prog(prog, ctx); item++; } bpf_reset_run_ctx(old_run_ctx); migrate_enable(); return ret; } /* Notes on RCU design for bpf_prog_arrays containing sleepable programs: * * We use the tasks_trace rcu flavor read section to protect the bpf_prog_array * overall. As a result, we must use the bpf_prog_array_free_sleepable * in order to use the tasks_trace rcu grace period. 
* * When a non-sleepable program is inside the array, we take the rcu read * section and disable preemption for that program alone, so it can access * rcu-protected dynamically sized maps. */ static __always_inline u32 bpf_prog_run_array_uprobe(const struct bpf_prog_array *array, const void *ctx, bpf_prog_run_fn run_prog) { const struct bpf_prog_array_item *item; const struct bpf_prog *prog; struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; u32 ret = 1; might_fault(); RCU_LOCKDEP_WARN(!rcu_read_lock_trace_held(), "no rcu lock held"); if (unlikely(!array)) return ret; migrate_disable(); run_ctx.is_uprobe = true; old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); item = &array->items[0]; while ((prog = READ_ONCE(item->prog))) { if (!prog->sleepable) rcu_read_lock(); run_ctx.bpf_cookie = item->bpf_cookie; ret &= run_prog(prog, ctx); item++; if (!prog->sleepable) rcu_read_unlock(); } bpf_reset_run_ctx(old_run_ctx); migrate_enable(); return ret; } bool bpf_jit_bypass_spec_v1(void); bool bpf_jit_bypass_spec_v4(void); #define bpf_rcu_lock_held() \ (rcu_read_lock_held() || rcu_read_lock_trace_held() || rcu_read_lock_bh_held()) #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); extern struct mutex bpf_stats_enabled_mutex; /* * Block execution of BPF programs attached to instrumentation (perf, * kprobes, tracepoints) to prevent deadlocks on map operations as any of * these events can happen inside a region which holds a map bucket lock * and can deadlock on it. */ static inline void bpf_disable_instrumentation(void) { migrate_disable(); this_cpu_inc(bpf_prog_active); } static inline void bpf_enable_instrumentation(void) { this_cpu_dec(bpf_prog_active); migrate_enable(); } extern const struct super_operations bpf_super_ops; extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; extern const struct file_operations bpf_iter_fops; extern const struct file_operations bpf_token_fops; #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ extern const struct bpf_prog_ops _name ## _prog_ops; \ extern const struct bpf_verifier_ops _name ## _verifier_ops; #define BPF_MAP_TYPE(_id, _ops) \ extern const struct bpf_map_ops _ops; #define BPF_LINK_TYPE(_id, _name) #include <linux/bpf_types.h> #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE #undef BPF_LINK_TYPE extern const struct bpf_prog_ops bpf_offload_prog_ops; extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops; extern const struct bpf_verifier_ops xdp_analyzer_ops; struct bpf_prog *bpf_prog_get(u32 ufd); struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv); void bpf_prog_add(struct bpf_prog *prog, int i); void bpf_prog_sub(struct bpf_prog *prog, int i); void bpf_prog_inc(struct bpf_prog *prog); struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); void bpf_prog_free_id(struct bpf_prog *prog); void bpf_map_free_id(struct bpf_map *map); struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset, u32 field_mask); void btf_record_free(struct btf_record *rec); void bpf_map_free_record(struct bpf_map *map); struct btf_record *btf_record_dup(const struct btf_record *rec); bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); void bpf_obj_free_timer(const struct btf_record *rec, void *obj); void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj); void bpf_obj_free_task_work(const struct btf_record *rec, void *obj); void 
bpf_obj_free_fields(const struct btf_record *rec, void *obj); void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu); struct bpf_map *bpf_map_get(u32 ufd); struct bpf_map *bpf_map_get_with_uref(u32 ufd); /* * The __bpf_map_get() and __btf_get_by_fd() functions parse a file * descriptor and return a corresponding map or btf object. * Their names are double underscored to emphasize the fact that they * do not increase refcnt. To also increase refcnt use corresponding * bpf_map_get() and btf_get_by_fd() functions. */ static inline struct bpf_map *__bpf_map_get(struct fd f) { if (fd_empty(f)) return ERR_PTR(-EBADF); if (unlikely(fd_file(f)->f_op != &bpf_map_fops)) return ERR_PTR(-EINVAL); return fd_file(f)->private_data; } static inline struct btf *__btf_get_by_fd(struct fd f) { if (fd_empty(f)) return ERR_PTR(-EBADF); if (unlikely(fd_file(f)->f_op != &btf_fops)) return ERR_PTR(-EINVAL); return fd_file(f)->private_data; } void bpf_map_inc(struct bpf_map *map); void bpf_map_inc_with_uref(struct bpf_map *map); struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref); struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); void *bpf_map_area_alloc(u64 size, int numa_node); void *bpf_map_area_mmapable_alloc(u64 size, int numa_node); void bpf_map_area_free(void *base); bool bpf_map_write_active(const struct bpf_map *map); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); int generic_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); int generic_map_update_batch(struct bpf_map *map, struct file *map_file, const union bpf_attr *attr, union bpf_attr __user *uattr); int generic_map_delete_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); struct bpf_map *bpf_map_get_curr_or_next(u32 *id); struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); int bpf_map_alloc_pages(const struct bpf_map *map, int nid, unsigned long nr_pages, struct page **page_array); #ifdef CONFIG_MEMCG void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg, struct mem_cgroup **new_memcg); void bpf_map_memcg_exit(struct mem_cgroup *old_memcg, struct mem_cgroup *memcg); void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node); void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags, int node); void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags); void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, gfp_t flags); void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, gfp_t flags); #else /* * These specialized allocators have to be macros for their allocations to be * accounted separately (to have separate alloc_tag). 
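 *
 * Map implementations call these like the plain kernel allocators, just with
 * the map passed as the first argument, e.g. (element type and map pointer
 * are placeholders):
 *
 *	struct my_elem *e;
 *
 *	e = bpf_map_kzalloc(&htab->map, sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
 *	if (!e)
 *		return -ENOMEM;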
*/ #define bpf_map_kmalloc_node(_map, _size, _flags, _node) \ kmalloc_node(_size, _flags, _node) #define bpf_map_kmalloc_nolock(_map, _size, _flags, _node) \ kmalloc_nolock(_size, _flags, _node) #define bpf_map_kzalloc(_map, _size, _flags) \ kzalloc(_size, _flags) #define bpf_map_kvcalloc(_map, _n, _size, _flags) \ kvcalloc(_n, _size, _flags) #define bpf_map_alloc_percpu(_map, _size, _align, _flags) \ __alloc_percpu_gfp(_size, _align, _flags) static inline void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg, struct mem_cgroup **new_memcg) { *new_memcg = NULL; *old_memcg = NULL; } static inline void bpf_map_memcg_exit(struct mem_cgroup *old_memcg, struct mem_cgroup *memcg) { } #endif static inline int bpf_map_init_elem_count(struct bpf_map *map) { size_t size = sizeof(*map->elem_count), align = size; gfp_t flags = GFP_USER | __GFP_NOWARN; map->elem_count = bpf_map_alloc_percpu(map, size, align, flags); if (!map->elem_count) return -ENOMEM; return 0; } static inline void bpf_map_free_elem_count(struct bpf_map *map) { free_percpu(map->elem_count); } static inline void bpf_map_inc_elem_count(struct bpf_map *map) { this_cpu_inc(*map->elem_count); } static inline void bpf_map_dec_elem_count(struct bpf_map *map) { this_cpu_dec(*map->elem_count); } extern int sysctl_unprivileged_bpf_disabled; bool bpf_token_capable(const struct bpf_token *token, int cap); static inline bool bpf_allow_ptr_leaks(const struct bpf_token *token) { return bpf_token_capable(token, CAP_PERFMON); } static inline bool bpf_allow_uninit_stack(const struct bpf_token *token) { return bpf_token_capable(token, CAP_PERFMON); } static inline bool bpf_bypass_spec_v1(const struct bpf_token *token) { return bpf_jit_bypass_spec_v1() || cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON); } static inline bool bpf_bypass_spec_v4(const struct bpf_token *token) { return bpf_jit_bypass_spec_v4() || cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON); } int bpf_map_new_fd(struct bpf_map *map, int flags); int bpf_prog_new_fd(struct bpf_prog *prog); void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog, enum bpf_attach_type attach_type); void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog, enum bpf_attach_type attach_type, bool sleepable); int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer); int bpf_link_settle(struct bpf_link_primer *primer); void bpf_link_cleanup(struct bpf_link_primer *primer); void bpf_link_inc(struct bpf_link *link); struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link); void bpf_link_put(struct bpf_link *link); int bpf_link_new_fd(struct bpf_link *link); struct bpf_link *bpf_link_get_from_fd(u32 ufd); struct bpf_link *bpf_link_get_curr_or_next(u32 *id); void bpf_token_inc(struct bpf_token *token); void bpf_token_put(struct bpf_token *token); int bpf_token_create(union bpf_attr *attr); struct bpf_token *bpf_token_get_from_fd(u32 ufd); int bpf_token_get_info_by_fd(struct bpf_token *token, const union bpf_attr *attr, union bpf_attr __user *uattr); bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd); bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type); bool bpf_token_allow_prog_type(const struct bpf_token *token, enum bpf_prog_type prog_type, enum bpf_attach_type attach_type); int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user 
*pathname); int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags); struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir, umode_t mode); #define BPF_ITER_FUNC_PREFIX "bpf_iter_" #define DEFINE_BPF_ITER_FUNC(target, args...) \ extern int bpf_iter_ ## target(args); \ int __init bpf_iter_ ## target(args) { return 0; } /* * The task type of iterators. * * For BPF task iterators, they can be parameterized with various * parameters to visit only some of tasks. * * BPF_TASK_ITER_ALL (default) * Iterate over resources of every task. * * BPF_TASK_ITER_TID * Iterate over resources of a task/tid. * * BPF_TASK_ITER_TGID * Iterate over resources of every task of a process / task group. */ enum bpf_iter_task_type { BPF_TASK_ITER_ALL = 0, BPF_TASK_ITER_TID, BPF_TASK_ITER_TGID, }; struct bpf_iter_aux_info { /* for map_elem iter */ struct bpf_map *map; /* for cgroup iter */ struct { struct cgroup *start; /* starting cgroup */ enum bpf_cgroup_iter_order order; } cgroup; struct { enum bpf_iter_task_type type; u32 pid; } task; }; typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux); typedef void (*bpf_iter_detach_target_t)(struct bpf_iter_aux_info *aux); typedef void (*bpf_iter_show_fdinfo_t) (const struct bpf_iter_aux_info *aux, struct seq_file *seq); typedef int (*bpf_iter_fill_link_info_t)(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info); typedef const struct bpf_func_proto * (*bpf_iter_get_func_proto_t)(enum bpf_func_id func_id, const struct bpf_prog *prog); enum bpf_iter_feature { BPF_ITER_RESCHED = BIT(0), }; #define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; bpf_iter_attach_target_t attach_target; bpf_iter_detach_target_t detach_target; bpf_iter_show_fdinfo_t show_fdinfo; bpf_iter_fill_link_info_t fill_link_info; bpf_iter_get_func_proto_t get_func_proto; u32 ctx_arg_info_size; u32 feature; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; const struct bpf_iter_seq_info *seq_info; }; struct bpf_iter_meta { __bpf_md_ptr(struct seq_file *, seq); u64 session_id; u64 seq_num; }; struct bpf_iter__bpf_map_elem { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct bpf_map *, map); __bpf_md_ptr(void *, key); __bpf_md_ptr(void *, value); }; int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info); int bpf_iter_prog_supported(struct bpf_prog *prog); const struct bpf_func_proto * bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_prog *prog); int bpf_iter_new_fd(struct bpf_link *link); bool bpf_link_is_iter(struct bpf_link *link); struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop); int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx); void bpf_iter_map_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq); int bpf_iter_map_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info); int map_set_for_each_callback_args(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_array_update(struct bpf_map *map, 
void *key, void *value, u64 flags); int bpf_stackmap_extract(struct bpf_map *map, void *key, void *value, bool delete); int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); int bpf_get_file_flag(int flags); int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size, size_t actual_size); /* verify correctness of eBPF program */ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size); #ifndef CONFIG_BPF_JIT_ALWAYS_ON void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); #endif struct btf *bpf_get_btf_vmlinux(void); /* Map specifics */ struct xdp_frame; struct sk_buff; struct bpf_dtab_netdev; struct bpf_cpu_map_entry; void __dev_flush(struct list_head *flush_list); int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx); int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx); int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, const struct bpf_prog *xdp_prog); int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, const struct bpf_prog *xdp_prog, struct bpf_map *map, bool exclude_ingress); void __cpu_map_flush(struct list_head *flush_list); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, struct net_device *dev_rx); int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, struct sk_buff *skb); /* Return map's numa specified by userspace */ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) { return (attr->map_flags & BPF_F_NUMA_NODE) ? 
attr->numa_node : NUMA_NO_NODE; } struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type); int array_map_alloc_check(union bpf_attr *attr); int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_tracing(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_nf(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); static inline bool bpf_tracing_ctx_access(int off, int size, enum bpf_access_type type) { if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) return false; if (type != BPF_READ) return false; if (off % size != 0) return false; return true; } static inline bool bpf_tracing_btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (!bpf_tracing_ctx_access(off, size, type)) return false; return btf_ctx_access(off, size, type, prog, info); } int btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size, enum bpf_access_type atype, u32 *next_btf_id, enum bpf_type_flag *flag, const char **field_name); bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *btf, u32 id, int off, const struct btf *need_btf, u32 need_type_id, bool strict); int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf *btf, const struct btf_type *func_proto, const char *func_name, struct btf_func_model *m); struct bpf_reg_state; int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, struct btf *btf, const struct btf_type *t); const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt, int comp_idx, const char *tag_key); int btf_find_next_decl_tag(const struct btf *btf, const struct btf_type *pt, int comp_idx, const char *tag_key, int last_id); struct bpf_prog *bpf_prog_by_id(u32 id); struct bpf_link *bpf_link_by_id(u32 id); const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); void bpf_task_storage_free(struct task_struct *task); void bpf_cgrp_storage_free(struct cgroup *cgroup); bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); const struct btf_func_model * bpf_jit_find_kfunc_model(const struct bpf_prog *prog, const struct bpf_insn *insn); int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id, u16 btf_fd_idx, u8 **func_addr); struct bpf_core_ctx { struct bpf_verifier_log *log; const struct btf *btf; }; bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, const char *field_name, u32 btf_id, const char *suffix); bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log, const struct btf *reg_btf, u32 reg_id, const struct btf *arg_btf, u32 arg_id); int bpf_core_apply(struct 
bpf_core_ctx *ctx, const struct bpf_core_relo *relo, int relo_idx, void *insn); static inline bool unprivileged_ebpf_enabled(void) { return !sysctl_unprivileged_bpf_disabled; } /* Not all bpf prog type has the bpf_ctx. * For the bpf prog type that has initialized the bpf_ctx, * this function can be used to decide if a kernel function * is called by a bpf program. */ static inline bool has_current_bpf_ctx(void) { return !!current->bpf_ctx; } void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog); void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size); void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr); void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { return ERR_PTR(-EOPNOTSUPP); } static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv) { return ERR_PTR(-EOPNOTSUPP); } static inline void bpf_prog_add(struct bpf_prog *prog, int i) { } static inline void bpf_prog_sub(struct bpf_prog *prog, int i) { } static inline void bpf_prog_put(struct bpf_prog *prog) { } static inline void bpf_prog_inc(struct bpf_prog *prog) { } static inline struct bpf_prog *__must_check bpf_prog_inc_not_zero(struct bpf_prog *prog) { return ERR_PTR(-EOPNOTSUPP); } static inline void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog, enum bpf_attach_type attach_type) { } static inline void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog, enum bpf_attach_type attach_type, bool sleepable) { } static inline int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) { return -EOPNOTSUPP; } static inline int bpf_link_settle(struct bpf_link_primer *primer) { return -EOPNOTSUPP; } static inline void bpf_link_cleanup(struct bpf_link_primer *primer) { } static inline void bpf_link_inc(struct bpf_link *link) { } static inline struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link) { return NULL; } static inline void bpf_link_put(struct bpf_link *link) { } static inline int bpf_obj_get_user(const char __user *pathname, int flags) { return -EOPNOTSUPP; } static inline bool bpf_token_capable(const struct bpf_token *token, int cap) { return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN)); } static inline void bpf_token_inc(struct bpf_token *token) { } static inline void bpf_token_put(struct bpf_token *token) { } static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd) { return ERR_PTR(-EOPNOTSUPP); } static inline int bpf_token_get_info_by_fd(struct bpf_token *token, const union bpf_attr *attr, union bpf_attr __user *uattr) { return -EOPNOTSUPP; } static inline void __dev_flush(struct list_head *flush_list) { } struct xdp_frame; struct bpf_dtab_netdev; struct bpf_cpu_map_entry; static inline int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) { return 0; } static inline int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx) { return 0; } static inline int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress) { return 0; } struct sk_buff; static inline int dev_map_generic_redirect(struct bpf_dtab_netdev 
*dst, struct sk_buff *skb, const struct bpf_prog *xdp_prog) { return 0; } static inline int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, const struct bpf_prog *xdp_prog, struct bpf_map *map, bool exclude_ingress) { return 0; } static inline void __cpu_map_flush(struct list_head *flush_list) { } static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, struct net_device *dev_rx) { return 0; } static inline int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, struct sk_buff *skb) { return -EOPNOTSUPP; } static inline struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) { return ERR_PTR(-EOPNOTSUPP); } static inline int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { return -ENOTSUPP; } static inline int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { return -ENOTSUPP; } static inline int bpf_prog_test_run_tracing(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { return -ENOTSUPP; } static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { return -ENOTSUPP; } static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { return -ENOTSUPP; } static inline void bpf_map_put(struct bpf_map *map) { } static inline struct bpf_prog *bpf_prog_by_id(u32 id) { return ERR_PTR(-ENOTSUPP); } static inline int btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size, enum bpf_access_type atype, u32 *next_btf_id, enum bpf_type_flag *flag, const char **field_name) { return -EACCES; } static inline const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { return NULL; } static inline void bpf_task_storage_free(struct task_struct *task) { } static inline bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog) { return false; } static inline const struct btf_func_model * bpf_jit_find_kfunc_model(const struct bpf_prog *prog, const struct bpf_insn *insn) { return NULL; } static inline int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id, u16 btf_fd_idx, u8 **func_addr) { return -ENOTSUPP; } static inline bool unprivileged_ebpf_enabled(void) { return false; } static inline bool has_current_bpf_ctx(void) { return false; } static inline void bpf_prog_inc_misses_counter(struct bpf_prog *prog) { } static inline void bpf_cgrp_storage_free(struct cgroup *cgroup) { } static inline void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size) { } static inline void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) { } static inline void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr) { } static inline void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip) { } #endif /* CONFIG_BPF_SYSCALL */ static inline bool bpf_net_capable(void) { return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN); } static __always_inline int bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr) { int ret = -EFAULT; if (IS_ENABLED(CONFIG_BPF_EVENTS)) ret = copy_from_kernel_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); return ret; } void __bpf_free_used_btfs(struct btf_mod_pair *used_btfs, u32 len); static inline struct bpf_prog 
*bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) { return bpf_prog_get_type_dev(ufd, type, false); } void __bpf_free_used_maps(struct bpf_prog_aux *aux, struct bpf_map **used_maps, u32 len); bool bpf_prog_get_ok(struct bpf_prog *, enum bpf_prog_type *, bool); int bpf_prog_offload_compile(struct bpf_prog *prog); void bpf_prog_dev_bound_destroy(struct bpf_prog *prog); int bpf_prog_offload_info_fill(struct bpf_prog_info *info, struct bpf_prog *prog); int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map); int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value); int bpf_map_offload_update_elem(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_map_offload_delete_elem(struct bpf_map *map, void *key); int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key); bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map); struct bpf_offload_dev * bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv); void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev); void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev); int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, struct net_device *netdev); void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, struct net_device *netdev); bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev); void unpriv_ebpf_notify(int new_state); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log, struct bpf_prog_aux *prog_aux); void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id); int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr); int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog, struct bpf_prog *old_prog); void bpf_dev_bound_netdev_unregister(struct net_device *dev); static inline bool bpf_prog_is_dev_bound(const struct bpf_prog_aux *aux) { return aux->dev_bound; } static inline bool bpf_prog_is_offloaded(const struct bpf_prog_aux *aux) { return aux->offload_requested; } bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs); static inline bool bpf_map_is_offloaded(struct bpf_map *map) { return unlikely(map->ops == &bpf_map_offload_ops); } struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr); void bpf_map_offload_map_free(struct bpf_map *map); u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map); int bpf_prog_test_run_syscall(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); int sock_map_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog); void sock_map_unhash(struct sock *sk); void sock_map_destroy(struct sock *sk); void sock_map_close(struct sock *sk, long timeout); #else static inline int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log, struct bpf_prog_aux *prog_aux) { return -EOPNOTSUPP; } static inline void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id) { return NULL; } static inline int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr) { return -EOPNOTSUPP; } static inline int bpf_prog_dev_bound_inherit(struct bpf_prog 
*new_prog, struct bpf_prog *old_prog) { return -EOPNOTSUPP; } static inline void bpf_dev_bound_netdev_unregister(struct net_device *dev) { } static inline bool bpf_prog_is_dev_bound(const struct bpf_prog_aux *aux) { return false; } static inline bool bpf_prog_is_offloaded(struct bpf_prog_aux *aux) { return false; } static inline bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs) { return false; } static inline bool bpf_map_is_offloaded(struct bpf_map *map) { return false; } static inline struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) { return ERR_PTR(-EOPNOTSUPP); } static inline void bpf_map_offload_map_free(struct bpf_map *map) { } static inline u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map) { return 0; } static inline int bpf_prog_test_run_syscall(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { return -ENOTSUPP; } #ifdef CONFIG_BPF_SYSCALL static inline int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) { return -EINVAL; } static inline int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { return -EOPNOTSUPP; } static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags) { return -EOPNOTSUPP; } static inline int sock_map_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { return -EINVAL; } static inline int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog) { return -EOPNOTSUPP; } #endif /* CONFIG_BPF_SYSCALL */ #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ static __always_inline void bpf_prog_inc_misses_counters(const struct bpf_prog_array *array) { const struct bpf_prog_array_item *item; struct bpf_prog *prog; if (unlikely(!array)) return; item = &array->items[0]; while ((prog = READ_ONCE(item->prog))) { bpf_prog_inc_misses_counter(prog); item++; } } #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) void bpf_sk_reuseport_detach(struct sock *sk); int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, void *value); int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags); #else static inline void bpf_sk_reuseport_detach(struct sock *sk) { } #ifdef CONFIG_BPF_SYSCALL static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, void *value) { return -EOPNOTSUPP; } static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return -EOPNOTSUPP; } #endif /* CONFIG_BPF_SYSCALL */ #endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */ #if defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL) struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags); struct bpf_key *bpf_lookup_system_key(u64 id); void bpf_key_put(struct bpf_key *bkey); int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, struct bpf_dynptr *sig_p, struct bpf_key *trusted_keyring); #else static inline struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags) { return NULL; } static inline struct bpf_key *bpf_lookup_system_key(u64 id) { return NULL; } static inline void bpf_key_put(struct bpf_key *bkey) { } static inline int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, struct bpf_dynptr *sig_p, struct bpf_key *trusted_keyring) { return -EOPNOTSUPP; } #endif /* defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL) */ /* verifier prototypes for helper functions called from eBPF programs */ extern const struct bpf_func_proto 
bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; extern const struct bpf_func_proto bpf_map_delete_elem_proto; extern const struct bpf_func_proto bpf_map_push_elem_proto; extern const struct bpf_func_proto bpf_map_pop_elem_proto; extern const struct bpf_func_proto bpf_map_peek_elem_proto; extern const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; extern const struct bpf_func_proto bpf_get_numa_node_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; extern const struct bpf_func_proto bpf_ktime_get_ns_proto; extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto; extern const struct bpf_func_proto bpf_ktime_get_tai_ns_proto; extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_get_stack_sleepable_proto; extern const struct bpf_func_proto bpf_get_task_stack_proto; extern const struct bpf_func_proto bpf_get_task_stack_sleepable_proto; extern const struct bpf_func_proto bpf_get_stackid_proto_pe; extern const struct bpf_func_proto bpf_get_stack_proto_pe; extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; extern const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto; extern const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto; extern const struct bpf_func_proto bpf_current_task_under_cgroup_proto; extern const struct bpf_func_proto bpf_msg_redirect_hash_proto; extern const struct bpf_func_proto bpf_msg_redirect_map_proto; extern const struct bpf_func_proto bpf_sk_redirect_hash_proto; extern const struct bpf_func_proto bpf_sk_redirect_map_proto; extern const struct bpf_func_proto bpf_spin_lock_proto; extern const struct bpf_func_proto bpf_spin_unlock_proto; extern const struct bpf_func_proto bpf_get_local_storage_proto; extern const struct bpf_func_proto bpf_strtol_proto; extern const struct bpf_func_proto bpf_strtoul_proto; extern const struct bpf_func_proto bpf_tcp_sock_proto; extern const struct bpf_func_proto bpf_jiffies64_proto; extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_event_output_data_proto; extern const struct bpf_func_proto bpf_ringbuf_output_proto; extern const struct bpf_func_proto bpf_ringbuf_reserve_proto; extern const struct bpf_func_proto bpf_ringbuf_submit_proto; extern const struct bpf_func_proto bpf_ringbuf_discard_proto; extern const struct bpf_func_proto bpf_ringbuf_query_proto; extern const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto; extern const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto; extern const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto; extern const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto 
bpf_skc_to_unix_sock_proto; extern const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; extern const struct bpf_func_proto bpf_snprintf_proto; extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; extern const struct bpf_func_proto bpf_sock_from_file_proto; extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; extern const struct bpf_func_proto bpf_task_storage_get_recur_proto; extern const struct bpf_func_proto bpf_task_storage_get_proto; extern const struct bpf_func_proto bpf_task_storage_delete_recur_proto; extern const struct bpf_func_proto bpf_task_storage_delete_proto; extern const struct bpf_func_proto bpf_for_each_map_elem_proto; extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; extern const struct bpf_func_proto bpf_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_sk_getsockopt_proto; extern const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto; extern const struct bpf_func_proto bpf_find_vma_proto; extern const struct bpf_func_proto bpf_loop_proto; extern const struct bpf_func_proto bpf_copy_from_user_task_proto; extern const struct bpf_func_proto bpf_set_retval_proto; extern const struct bpf_func_proto bpf_get_retval_proto; extern const struct bpf_func_proto bpf_user_ringbuf_drain_proto; extern const struct bpf_func_proto bpf_cgrp_storage_get_proto; extern const struct bpf_func_proto bpf_cgrp_storage_delete_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); /* Shared helpers among cBPF and eBPF. 
*/ void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); u64 bpf_get_raw_cpu_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); #if defined(CONFIG_NET) bool bpf_sock_common_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info); bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info); u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size); int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, struct bpf_dynptr *ptr); #else static inline bool bpf_sock_common_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { return false; } static inline bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { return false; } static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { return 0; } static inline int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, struct bpf_dynptr *ptr) { return -EOPNOTSUPP; } #endif #ifdef CONFIG_INET struct sk_reuseport_kern { struct sk_buff *skb; struct sock *sk; struct sock *selected_sk; struct sock *migrating_sk; void *data_end; u32 hash; u32 reuseport_id; bool bind_inany; }; bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info); u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size); bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info); u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size); #else static inline bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { return false; } static inline u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { return 0; } static inline bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { return false; } static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { return 0; } #endif /* CONFIG_INET */ enum bpf_text_poke_type { BPF_MOD_NOP, BPF_MOD_CALL, BPF_MOD_JUMP, }; int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t, enum bpf_text_poke_type new_t, void *old_addr, void *new_addr); void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, struct bpf_prog *new, struct bpf_prog *old); void *bpf_arch_text_copy(void *dst, void *src, size_t len); int bpf_arch_text_invalidate(void *dst, size_t len); struct btf_id_set; bool btf_id_set_contains(const struct btf_id_set *set, u32 id); #define MAX_BPRINTF_VARARGS 12 #define MAX_BPRINTF_BUF 1024 /* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary * arguments representation. 
*/ #define MAX_BPRINTF_BIN_ARGS 512 struct bpf_bprintf_buffers { char bin_args[MAX_BPRINTF_BIN_ARGS]; char buf[MAX_BPRINTF_BUF]; }; struct bpf_bprintf_data { u32 *bin_args; char *buf; bool get_bin_args; bool get_buf; }; int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args, u32 num_args, struct bpf_bprintf_data *data); void bpf_bprintf_cleanup(struct bpf_bprintf_data *data); int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs); void bpf_put_buffers(void); void bpf_prog_stream_init(struct bpf_prog *prog); void bpf_prog_stream_free(struct bpf_prog *prog); int bpf_prog_stream_read(struct bpf_prog *prog, enum bpf_stream_id stream_id, void __user *buf, int len); void bpf_stream_stage_init(struct bpf_stream_stage *ss); void bpf_stream_stage_free(struct bpf_stream_stage *ss); __printf(2, 3) int bpf_stream_stage_printk(struct bpf_stream_stage *ss, const char *fmt, ...); int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog, enum bpf_stream_id stream_id); int bpf_stream_stage_dump_stack(struct bpf_stream_stage *ss); #define bpf_stream_printk(ss, ...) bpf_stream_stage_printk(&ss, __VA_ARGS__) #define bpf_stream_dump_stack(ss) bpf_stream_stage_dump_stack(&ss) #define bpf_stream_stage(ss, prog, stream_id, expr) \ ({ \ bpf_stream_stage_init(&ss); \ (expr); \ bpf_stream_stage_commit(&ss, prog, stream_id); \ bpf_stream_stage_free(&ss); \ }) #ifdef CONFIG_BPF_LSM void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype); void bpf_cgroup_atype_put(int cgroup_atype); #else static inline void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) {} static inline void bpf_cgroup_atype_put(int cgroup_atype) {} #endif /* CONFIG_BPF_LSM */ struct key; #ifdef CONFIG_KEYS struct bpf_key { struct key *key; bool has_ref; }; #endif /* CONFIG_KEYS */ static inline bool type_is_alloc(u32 type) { return type & MEM_ALLOC; } static inline gfp_t bpf_memcg_flags(gfp_t flags) { if (memcg_bpf_enabled()) return flags | __GFP_ACCOUNT; return flags; } static inline bool bpf_is_subprog(const struct bpf_prog *prog) { return prog->aux->func_idx != 0; } const struct bpf_line_info *bpf_find_linfo(const struct bpf_prog *prog, u32 insn_off); void bpf_get_linfo_file_line(struct btf *btf, const struct bpf_line_info *linfo, const char **filep, const char **linep, int *nump); int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep, const char **linep, int *nump); struct bpf_prog *bpf_prog_find_from_stack(void); int bpf_insn_array_init(struct bpf_map *map, const struct bpf_prog *prog); int bpf_insn_array_ready(struct bpf_map *map); void bpf_insn_array_release(struct bpf_map *map); void bpf_insn_array_adjust(struct bpf_map *map, u32 off, u32 len); void bpf_insn_array_adjust_after_remove(struct bpf_map *map, u32 off, u32 len); #ifdef CONFIG_BPF_SYSCALL void bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image); #else static inline void bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image) { } #endif static inline bool bpf_map_supports_cpu_flags(enum bpf_map_type map_type) { switch (map_type) { case BPF_MAP_TYPE_PERCPU_ARRAY: case BPF_MAP_TYPE_PERCPU_HASH: case BPF_MAP_TYPE_LRU_PERCPU_HASH: case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: return true; default: return false; } } static inline int bpf_map_check_op_flags(struct bpf_map *map, u64 flags, u64 allowed_flags) { u32 cpu; if ((u32)flags & ~allowed_flags) return -EINVAL; if ((flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) return 
-EINVAL; if (!(flags & BPF_F_CPU) && flags >> 32) return -EINVAL; if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) { if (!bpf_map_supports_cpu_flags(map->map_type)) return -EINVAL; if ((flags & BPF_F_CPU) && (flags & BPF_F_ALL_CPUS)) return -EINVAL; cpu = flags >> 32; if ((flags & BPF_F_CPU) && cpu >= num_possible_cpus()) return -ERANGE; } return 0; } #endif /* _LINUX_BPF_H */
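/*
 * Illustrative sketch (not part of the header above): how a caller would
 * encode the target CPU for the BPF_F_CPU flag that bpf_map_check_op_flags()
 * validates.  Per the check above, the CPU index travels in the upper 32 bits
 * of the 64-bit flags word, BPF_F_CPU and BPF_F_ALL_CPUS are mutually
 * exclusive, and both are only accepted for the per-CPU map types listed in
 * bpf_map_supports_cpu_flags().  The helper name below is hypothetical.
 */
static inline u64 example_percpu_update_flags(u32 cpu)
{
	/* CPU index in bits 63..32, flag bit in the lower 32 bits */
	return ((u64)cpu << 32) | BPF_F_CPU;
}

/*
 * For a BPF_MAP_TYPE_PERCPU_ARRAY map, bpf_map_check_op_flags(map,
 * example_percpu_update_flags(3), BPF_F_CPU | BPF_F_ALL_CPUS) returns 0 when
 * CPU 3 is a possible CPU and -ERANGE otherwise; setting the upper 32 bits
 * without BPF_F_CPU fails with -EINVAL.
 */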
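/*
 * Illustrative sketch of the bpf_stream_stage() pattern declared above: stage
 * one or more bpf_stream_printk() calls and a stack dump, then commit them to
 * a program's stream.  The wrapper function is made up for the example, and
 * BPF_STDERR is assumed to be one of the enum bpf_stream_id values.
 */
static void example_report_failure(struct bpf_prog *prog, int err)
{
	struct bpf_stream_stage ss;

	bpf_stream_stage(ss, prog, BPF_STDERR, ({
		bpf_stream_printk(ss, "example: operation failed with err=%d\n", err);
		bpf_stream_dump_stack(ss);
	}));
}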
// SPDX-License-Identifier: GPL-2.0 /* CPU control. * (C) 2001, 2002, 2003, 2004 Rusty Russell */ #include <linux/sched/mm.h> #include <linux/proc_fs.h> #include <linux/smp.h> #include <linux/init.h> #include <linux/notifier.h> #include <linux/sched/signal.h> #include <linux/sched/hotplug.h> #include <linux/sched/isolation.h> #include <linux/sched/task.h> #include <linux/sched/smt.h> #include <linux/unistd.h> #include <linux/cpu.h> #include <linux/oom.h> #include <linux/rcupdate.h> #include <linux/delay.h> #include <linux/export.h> #include <linux/bug.h> #include <linux/kthread.h> #include <linux/stop_machine.h> #include <linux/mutex.h> #include <linux/gfp.h> #include <linux/suspend.h> #include <linux/lockdep.h> #include <linux/tick.h> #include <linux/irq.h> #include <linux/nmi.h> #include <linux/smpboot.h> #include <linux/relay.h> #include <linux/slab.h> #include <linux/scs.h> #include <linux/percpu-rwsem.h> #include <linux/cpuset.h> #include <linux/random.h> #include <linux/cc_platform.h> #include <linux/parser.h> #include <trace/events/power.h> #define CREATE_TRACE_POINTS #include <trace/events/cpuhp.h> #include "smpboot.h" /** * struct cpuhp_cpu_state - Per cpu hotplug state storage * @state: The current cpu state * @target: The target state * @fail: Current CPU hotplug callback state * @thread: Pointer to the hotplug thread * @should_run: Thread should execute * @rollback: Perform a rollback * @single: Single callback invocation * @bringup: Single callback bringup or teardown selector * @node: Remote CPU node; for multi-instance, do a * single entry callback for install/remove * @last: For multi-instance rollback, remember how far we got * @cb_state: The state for a single callback (install/uninstall) * @result: Result of the operation * @ap_sync_state: State for AP synchronization * @done_up: Signal completion to the issuer of the task for cpu-up * @done_down: Signal completion to the issuer of the task for cpu-down */ struct cpuhp_cpu_state { enum cpuhp_state state; enum cpuhp_state target; enum cpuhp_state fail; #ifdef CONFIG_SMP struct task_struct *thread; bool should_run; bool rollback; bool single; bool
bringup; struct hlist_node *node; struct hlist_node *last; enum cpuhp_state cb_state; int result; atomic_t ap_sync_state; struct completion done_up; struct completion done_down; #endif }; static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = { .fail = CPUHP_INVALID, }; #ifdef CONFIG_SMP cpumask_t cpus_booted_once_mask; #endif #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) static struct lockdep_map cpuhp_state_up_map = STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map); static struct lockdep_map cpuhp_state_down_map = STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map); static inline void cpuhp_lock_acquire(bool bringup) { lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); } static inline void cpuhp_lock_release(bool bringup) { lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); } #else static inline void cpuhp_lock_acquire(bool bringup) { } static inline void cpuhp_lock_release(bool bringup) { } #endif /** * struct cpuhp_step - Hotplug state machine step * @name: Name of the step * @startup: Startup function of the step * @teardown: Teardown function of the step * @cant_stop: Bringup/teardown can't be stopped at this step * @multi_instance: State has multiple instances which get added afterwards */ struct cpuhp_step { const char *name; union { int (*single)(unsigned int cpu); int (*multi)(unsigned int cpu, struct hlist_node *node); } startup; union { int (*single)(unsigned int cpu); int (*multi)(unsigned int cpu, struct hlist_node *node); } teardown; /* private: */ struct hlist_head list; /* public: */ bool cant_stop; bool multi_instance; }; static DEFINE_MUTEX(cpuhp_state_mutex); static struct cpuhp_step cpuhp_hp_states[]; static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) { return cpuhp_hp_states + state; } static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step) { return bringup ? !step->startup.single : !step->teardown.single; } /** * cpuhp_invoke_callback - Invoke the callbacks for a given state * @cpu: The cpu for which the callback should be invoked * @state: The state to do callbacks for * @bringup: True if the bringup callback should be invoked * @node: For multi-instance, do a single entry callback for install/remove * @lastp: For multi-instance rollback, remember how far we got * * Called from cpu hotplug and from the state register machinery. * * Return: %0 on success or a negative errno code */ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, bool bringup, struct hlist_node *node, struct hlist_node **lastp) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct cpuhp_step *step = cpuhp_get_step(state); int (*cbm)(unsigned int cpu, struct hlist_node *node); int (*cb)(unsigned int cpu); int ret, cnt; if (st->fail == state) { st->fail = CPUHP_INVALID; return -EAGAIN; } if (cpuhp_step_empty(bringup, step)) { WARN_ON_ONCE(1); return 0; } if (!step->multi_instance) { WARN_ON_ONCE(lastp && *lastp); cb = bringup ? step->startup.single : step->teardown.single; trace_cpuhp_enter(cpu, st->target, state, cb); ret = cb(cpu); trace_cpuhp_exit(cpu, st->state, state, ret); return ret; } cbm = bringup ? step->startup.multi : step->teardown.multi; /* Single invocation for instance add/remove */ if (node) { WARN_ON_ONCE(lastp && *lastp); trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); ret = cbm(cpu, node); trace_cpuhp_exit(cpu, st->state, state, ret); return ret; } /* State transition. 
Invoke on all instances */ cnt = 0; hlist_for_each(node, &step->list) { if (lastp && node == *lastp) break; trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); ret = cbm(cpu, node); trace_cpuhp_exit(cpu, st->state, state, ret); if (ret) { if (!lastp) goto err; *lastp = node; return ret; } cnt++; } if (lastp) *lastp = NULL; return 0; err: /* Rollback the instances if one failed */ cbm = !bringup ? step->startup.multi : step->teardown.multi; if (!cbm) return ret; hlist_for_each(node, &step->list) { if (!cnt--) break; trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); ret = cbm(cpu, node); trace_cpuhp_exit(cpu, st->state, state, ret); /* * Rollback must not fail, */ WARN_ON_ONCE(ret); } return ret; } /* * The former STARTING/DYING states, ran with IRQs disabled and must not fail. */ static bool cpuhp_is_atomic_state(enum cpuhp_state state) { return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE; } #ifdef CONFIG_SMP static bool cpuhp_is_ap_state(enum cpuhp_state state) { /* * The extra check for CPUHP_TEARDOWN_CPU is only for documentation * purposes as that state is handled explicitly in cpu_down. */ return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; } static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup) { struct completion *done = bringup ? &st->done_up : &st->done_down; wait_for_completion(done); } static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup) { struct completion *done = bringup ? &st->done_up : &st->done_down; complete(done); } /* Synchronization state management */ enum cpuhp_sync_state { SYNC_STATE_DEAD, SYNC_STATE_KICKED, SYNC_STATE_SHOULD_DIE, SYNC_STATE_ALIVE, SYNC_STATE_SHOULD_ONLINE, SYNC_STATE_ONLINE, }; #ifdef CONFIG_HOTPLUG_CORE_SYNC /** * cpuhp_ap_update_sync_state - Update synchronization state during bringup/teardown * @state: The synchronization state to set * * No synchronization point. Just update of the synchronization state, but implies * a full barrier so that the AP changes are visible before the control CPU proceeds. */ static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state) { atomic_t *st = this_cpu_ptr(&cpuhp_state.ap_sync_state); (void)atomic_xchg(st, state); } void __weak arch_cpuhp_sync_state_poll(void) { cpu_relax(); } static bool cpuhp_wait_for_sync_state(unsigned int cpu, enum cpuhp_sync_state state, enum cpuhp_sync_state next_state) { atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu); ktime_t now, end, start = ktime_get(); int sync; end = start + 10ULL * NSEC_PER_SEC; sync = atomic_read(st); while (1) { if (sync == state) { if (!atomic_try_cmpxchg(st, &sync, next_state)) continue; return true; } now = ktime_get(); if (now > end) { /* Timeout. Leave the state unchanged */ return false; } else if (now - start < NSEC_PER_MSEC) { /* Poll for one millisecond */ arch_cpuhp_sync_state_poll(); } else { usleep_range(USEC_PER_MSEC, 2 * USEC_PER_MSEC); } sync = atomic_read(st); } return true; } #else /* CONFIG_HOTPLUG_CORE_SYNC */ static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state) { } #endif /* !CONFIG_HOTPLUG_CORE_SYNC */ #ifdef CONFIG_HOTPLUG_CORE_SYNC_DEAD /** * cpuhp_ap_report_dead - Update synchronization state to DEAD * * No synchronization point. Just update of the synchronization state. */ void cpuhp_ap_report_dead(void) { cpuhp_ap_update_sync_state(SYNC_STATE_DEAD); } void __weak arch_cpuhp_cleanup_dead_cpu(unsigned int cpu) { } /* * Late CPU shutdown synchronization point. 
Cannot use cpuhp_state::done_down * because the AP cannot issue complete() at this stage. */ static void cpuhp_bp_sync_dead(unsigned int cpu) { atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu); int sync = atomic_read(st); do { /* CPU can have reported dead already. Don't overwrite that! */ if (sync == SYNC_STATE_DEAD) break; } while (!atomic_try_cmpxchg(st, &sync, SYNC_STATE_SHOULD_DIE)); if (cpuhp_wait_for_sync_state(cpu, SYNC_STATE_DEAD, SYNC_STATE_DEAD)) { /* CPU reached dead state. Invoke the cleanup function */ arch_cpuhp_cleanup_dead_cpu(cpu); return; } /* No further action possible. Emit message and give up. */ pr_err("CPU%u failed to report dead state\n", cpu); } #else /* CONFIG_HOTPLUG_CORE_SYNC_DEAD */ static inline void cpuhp_bp_sync_dead(unsigned int cpu) { } #endif /* !CONFIG_HOTPLUG_CORE_SYNC_DEAD */ #ifdef CONFIG_HOTPLUG_CORE_SYNC_FULL /** * cpuhp_ap_sync_alive - Synchronize AP with the control CPU once it is alive * * Updates the AP synchronization state to SYNC_STATE_ALIVE and waits * for the BP to release it. */ void cpuhp_ap_sync_alive(void) { atomic_t *st = this_cpu_ptr(&cpuhp_state.ap_sync_state); cpuhp_ap_update_sync_state(SYNC_STATE_ALIVE); /* Wait for the control CPU to release it. */ while (atomic_read(st) != SYNC_STATE_SHOULD_ONLINE) cpu_relax(); } static bool cpuhp_can_boot_ap(unsigned int cpu) { atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu); int sync = atomic_read(st); again: switch (sync) { case SYNC_STATE_DEAD: /* CPU is properly dead */ break; case SYNC_STATE_KICKED: /* CPU did not come up in previous attempt */ break; case SYNC_STATE_ALIVE: /* CPU is stuck cpuhp_ap_sync_alive(). */ break; default: /* CPU failed to report online or dead and is in limbo state. */ return false; } /* Prepare for booting */ if (!atomic_try_cmpxchg(st, &sync, SYNC_STATE_KICKED)) goto again; return true; } void __weak arch_cpuhp_cleanup_kick_cpu(unsigned int cpu) { } /* * Early CPU bringup synchronization point. Cannot use cpuhp_state::done_up * because the AP cannot issue complete() so early in the bringup. */ static int cpuhp_bp_sync_alive(unsigned int cpu) { int ret = 0; if (!IS_ENABLED(CONFIG_HOTPLUG_CORE_SYNC_FULL)) return 0; if (!cpuhp_wait_for_sync_state(cpu, SYNC_STATE_ALIVE, SYNC_STATE_SHOULD_ONLINE)) { pr_err("CPU%u failed to report alive state\n", cpu); ret = -EIO; } /* Let the architecture cleanup the kick alive mechanics. */ arch_cpuhp_cleanup_kick_cpu(cpu); return ret; } #else /* CONFIG_HOTPLUG_CORE_SYNC_FULL */ static inline int cpuhp_bp_sync_alive(unsigned int cpu) { return 0; } static inline bool cpuhp_can_boot_ap(unsigned int cpu) { return true; } #endif /* !CONFIG_HOTPLUG_CORE_SYNC_FULL */ /* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); bool cpuhp_tasks_frozen; EXPORT_SYMBOL_GPL(cpuhp_tasks_frozen); /* * The following two APIs (cpu_maps_update_begin/done) must be used when * attempting to serialize the updates to cpu_online_mask & cpu_present_mask. */ void cpu_maps_update_begin(void) { mutex_lock(&cpu_add_remove_lock); } void cpu_maps_update_done(void) { mutex_unlock(&cpu_add_remove_lock); } /* * If set, cpu_up and cpu_down will return -EBUSY and do nothing. 
* Should always be manipulated under cpu_add_remove_lock */ static int cpu_hotplug_disabled; #ifdef CONFIG_HOTPLUG_CPU DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock); static bool cpu_hotplug_offline_disabled __ro_after_init; void cpus_read_lock(void) { percpu_down_read(&cpu_hotplug_lock); } EXPORT_SYMBOL_GPL(cpus_read_lock); int cpus_read_trylock(void) { return percpu_down_read_trylock(&cpu_hotplug_lock); } EXPORT_SYMBOL_GPL(cpus_read_trylock); void cpus_read_unlock(void) { percpu_up_read(&cpu_hotplug_lock); } EXPORT_SYMBOL_GPL(cpus_read_unlock); void cpus_write_lock(void) { percpu_down_write(&cpu_hotplug_lock); } void cpus_write_unlock(void) { percpu_up_write(&cpu_hotplug_lock); } void lockdep_assert_cpus_held(void) { /* * We can't have hotplug operations before userspace starts running, * and some init codepaths will knowingly not take the hotplug lock. * This is all valid, so mute lockdep until it makes sense to report * unheld locks. */ if (system_state < SYSTEM_RUNNING) return; percpu_rwsem_assert_held(&cpu_hotplug_lock); } EXPORT_SYMBOL_GPL(lockdep_assert_cpus_held); #ifdef CONFIG_LOCKDEP int lockdep_is_cpus_held(void) { return percpu_rwsem_is_held(&cpu_hotplug_lock); } int lockdep_is_cpus_write_held(void) { return percpu_rwsem_is_write_held(&cpu_hotplug_lock); } #endif static void lockdep_acquire_cpus_lock(void) { rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_); } static void lockdep_release_cpus_lock(void) { rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_); } /* Declare CPU offlining not supported */ void cpu_hotplug_disable_offlining(void) { cpu_maps_update_begin(); cpu_hotplug_offline_disabled = true; cpu_maps_update_done(); } /* * Wait for currently running CPU hotplug operations to complete (if any) and * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the * hotplug path before performing hotplug operations. So acquiring that lock * guarantees mutual exclusion from any currently running hotplug operations. */ void cpu_hotplug_disable(void) { cpu_maps_update_begin(); cpu_hotplug_disabled++; cpu_maps_update_done(); } EXPORT_SYMBOL_GPL(cpu_hotplug_disable); static void __cpu_hotplug_enable(void) { if (WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n")) return; cpu_hotplug_disabled--; } void cpu_hotplug_enable(void) { cpu_maps_update_begin(); __cpu_hotplug_enable(); cpu_maps_update_done(); } EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #else static void lockdep_acquire_cpus_lock(void) { } static void lockdep_release_cpus_lock(void) { } #endif /* CONFIG_HOTPLUG_CPU */ /* * Architectures that need SMT-specific errata handling during SMT hotplug * should override this. */ void __weak arch_smt_update(void) { } #ifdef CONFIG_HOTPLUG_SMT enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; static unsigned int cpu_smt_max_threads __ro_after_init; unsigned int cpu_smt_num_threads __read_mostly = UINT_MAX; void __init cpu_smt_disable(bool force) { if (!cpu_smt_possible()) return; if (force) { pr_info("SMT: Force disabled\n"); cpu_smt_control = CPU_SMT_FORCE_DISABLED; } else { pr_info("SMT: disabled\n"); cpu_smt_control = CPU_SMT_DISABLED; } cpu_smt_num_threads = 1; } /* * The decision whether SMT is supported can only be done after the full * CPU identification. Called from architecture code. 
*/ void __init cpu_smt_set_num_threads(unsigned int num_threads, unsigned int max_threads) { WARN_ON(!num_threads || (num_threads > max_threads)); if (max_threads == 1) cpu_smt_control = CPU_SMT_NOT_SUPPORTED; cpu_smt_max_threads = max_threads; /* * If SMT has been disabled via the kernel command line or SMT is * not supported, set cpu_smt_num_threads to 1 for consistency. * If enabled, take the architecture requested number of threads * to bring up into account. */ if (cpu_smt_control != CPU_SMT_ENABLED) cpu_smt_num_threads = 1; else if (num_threads < cpu_smt_num_threads) cpu_smt_num_threads = num_threads; } static int __init smt_cmdline_disable(char *str) { cpu_smt_disable(str && !strcmp(str, "force")); return 0; } early_param("nosmt", smt_cmdline_disable); /* * For Archicture supporting partial SMT states check if the thread is allowed. * Otherwise this has already been checked through cpu_smt_max_threads when * setting the SMT level. */ static inline bool cpu_smt_thread_allowed(unsigned int cpu) { #ifdef CONFIG_SMT_NUM_THREADS_DYNAMIC return topology_smt_thread_allowed(cpu); #else return true; #endif } static inline bool cpu_bootable(unsigned int cpu) { if (cpu_smt_control == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu)) return true; /* All CPUs are bootable if controls are not configured */ if (cpu_smt_control == CPU_SMT_NOT_IMPLEMENTED) return true; /* All CPUs are bootable if CPU is not SMT capable */ if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED) return true; if (topology_is_primary_thread(cpu)) return true; /* * On x86 it's required to boot all logical CPUs at least once so * that the init code can get a chance to set CR4.MCE on each * CPU. Otherwise, a broadcasted MCE observing CR4.MCE=0b on any * core will shutdown the machine. */ return !cpumask_test_cpu(cpu, &cpus_booted_once_mask); } /* Returns true if SMT is supported and not forcefully (irreversibly) disabled */ bool cpu_smt_possible(void) { return cpu_smt_control != CPU_SMT_FORCE_DISABLED && cpu_smt_control != CPU_SMT_NOT_SUPPORTED; } EXPORT_SYMBOL_GPL(cpu_smt_possible); #else static inline bool cpu_bootable(unsigned int cpu) { return true; } #endif static inline enum cpuhp_state cpuhp_set_state(int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target) { enum cpuhp_state prev_state = st->state; bool bringup = st->state < target; st->rollback = false; st->last = NULL; st->target = target; st->single = false; st->bringup = bringup; if (cpu_dying(cpu) != !bringup) set_cpu_dying(cpu, !bringup); return prev_state; } static inline void cpuhp_reset_state(int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state prev_state) { bool bringup = !st->bringup; st->target = prev_state; /* * Already rolling back. No need invert the bringup value or to change * the current state. */ if (st->rollback) return; st->rollback = true; /* * If we have st->last we need to undo partial multi_instance of this * state first. Otherwise start undo at the previous state. */ if (!st->last) { if (st->bringup) st->state--; else st->state++; } st->bringup = bringup; if (cpu_dying(cpu) != !bringup) set_cpu_dying(cpu, !bringup); } /* Regular hotplug invocation of the AP hotplug thread */ static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st) { if (!st->single && st->state == st->target) return; st->result = 0; /* * Make sure the above stores are visible before should_run becomes * true. 
Paired with the mb() above in cpuhp_thread_fun() */ smp_mb(); st->should_run = true; wake_up_process(st->thread); wait_for_ap_thread(st, st->bringup); } static int cpuhp_kick_ap(int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target) { enum cpuhp_state prev_state; int ret; prev_state = cpuhp_set_state(cpu, st, target); __cpuhp_kick_ap(st); if ((ret = st->result)) { cpuhp_reset_state(cpu, st, prev_state); __cpuhp_kick_ap(st); } return ret; } static int bringup_wait_for_ap_online(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */ wait_for_ap_thread(st, true); if (WARN_ON_ONCE((!cpu_online(cpu)))) return -ECANCELED; /* Unpark the hotplug thread of the target cpu */ kthread_unpark(st->thread); /* * SMT soft disabling on X86 requires to bring the CPU out of the * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The * CPU marked itself as booted_once in notify_cpu_starting() so the * cpu_bootable() check will now return false if this is not the * primary sibling. */ if (!cpu_bootable(cpu)) return -ECANCELED; return 0; } #ifdef CONFIG_HOTPLUG_SPLIT_STARTUP static int cpuhp_kick_ap_alive(unsigned int cpu) { if (!cpuhp_can_boot_ap(cpu)) return -EAGAIN; return arch_cpuhp_kick_ap_alive(cpu, idle_thread_get(cpu)); } static int cpuhp_bringup_ap(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int ret; /* * Some architectures have to walk the irq descriptors to * setup the vector space for the cpu which comes online. * Prevent irq alloc/free across the bringup. */ irq_lock_sparse(); ret = cpuhp_bp_sync_alive(cpu); if (ret) goto out_unlock; ret = bringup_wait_for_ap_online(cpu); if (ret) goto out_unlock; irq_unlock_sparse(); if (st->target <= CPUHP_AP_ONLINE_IDLE) return 0; return cpuhp_kick_ap(cpu, st, st->target); out_unlock: irq_unlock_sparse(); return ret; } #else static int bringup_cpu(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct task_struct *idle = idle_thread_get(cpu); int ret; if (!cpuhp_can_boot_ap(cpu)) return -EAGAIN; /* * Some architectures have to walk the irq descriptors to * setup the vector space for the cpu which comes online. * * Prevent irq alloc/free across the bringup by acquiring the * sparse irq lock. Hold it until the upcoming CPU completes the * startup in cpuhp_online_idle() which allows to avoid * intermediate synchronization points in the architecture code. */ irq_lock_sparse(); ret = __cpu_up(cpu, idle); if (ret) goto out_unlock; ret = cpuhp_bp_sync_alive(cpu); if (ret) goto out_unlock; ret = bringup_wait_for_ap_online(cpu); if (ret) goto out_unlock; irq_unlock_sparse(); if (st->target <= CPUHP_AP_ONLINE_IDLE) return 0; return cpuhp_kick_ap(cpu, st, st->target); out_unlock: irq_unlock_sparse(); return ret; } #endif static int finish_cpu(unsigned int cpu) { struct task_struct *idle = idle_thread_get(cpu); struct mm_struct *mm = idle->active_mm; /* * sched_force_init_mm() ensured the use of &init_mm, * drop that refcount now that the CPU has stopped. */ WARN_ON(mm != &init_mm); idle->active_mm = NULL; mmdrop_lazy_tlb(mm); return 0; } /* * Hotplug state machine related functions */ /* * Get the next state to run. Empty ones will be skipped. Returns true if a * state must be run. * * st->state will be modified ahead of time, to match state_to_run, as if it * has already ran. 
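 *
 * Worked example (values assumed for illustration): bringing a CPU from
 * CPUHP_OFFLINE towards CPUHP_ONLINE, the first iteration advances st->state
 * to the first non-empty step past CPUHP_OFFLINE and returns it in
 * *state_to_run. If that step's callback later fails, st->state already
 * names the failed step, which is what the error message in
 * cpuhp_up_callbacks() and cpuhp_reset_state() work from.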
*/ static bool cpuhp_next_state(bool bringup, enum cpuhp_state *state_to_run, struct cpuhp_cpu_state *st, enum cpuhp_state target) { do { if (bringup) { if (st->state >= target) return false; *state_to_run = ++st->state; } else { if (st->state <= target) return false; *state_to_run = st->state--; } if (!cpuhp_step_empty(bringup, cpuhp_get_step(*state_to_run))) break; } while (true); return true; } static int __cpuhp_invoke_callback_range(bool bringup, unsigned int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target, bool nofail) { enum cpuhp_state state; int ret = 0; while (cpuhp_next_state(bringup, &state, st, target)) { int err; err = cpuhp_invoke_callback(cpu, state, bringup, NULL, NULL); if (!err) continue; if (nofail) { pr_warn("CPU %u %s state %s (%d) failed (%d)\n", cpu, bringup ? "UP" : "DOWN", cpuhp_get_step(st->state)->name, st->state, err); ret = -1; } else { ret = err; break; } } return ret; } static inline int cpuhp_invoke_callback_range(bool bringup, unsigned int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target) { return __cpuhp_invoke_callback_range(bringup, cpu, st, target, false); } static inline void cpuhp_invoke_callback_range_nofail(bool bringup, unsigned int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target) { __cpuhp_invoke_callback_range(bringup, cpu, st, target, true); } static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st) { if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) return true; /* * When CPU hotplug is disabled, then taking the CPU down is not * possible because takedown_cpu() and the architecture and * subsystem specific mechanisms are not available. So the CPU * which would be completely unplugged again needs to stay around * in the current state. */ return st->state <= CPUHP_BRINGUP_CPU; } static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target) { enum cpuhp_state prev_state = st->state; int ret = 0; ret = cpuhp_invoke_callback_range(true, cpu, st, target); if (ret) { pr_debug("CPU UP failed (%d) CPU %u state %s (%d)\n", ret, cpu, cpuhp_get_step(st->state)->name, st->state); cpuhp_reset_state(cpu, st, prev_state); if (can_rollback_cpu(st)) WARN_ON(cpuhp_invoke_callback_range(false, cpu, st, prev_state)); } return ret; } /* * The cpu hotplug threads manage the bringup and teardown of the cpus */ static int cpuhp_should_run(unsigned int cpu) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); return st->should_run; } /* * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke * callbacks when a state gets [un]installed at runtime. * * Each invocation of this function by the smpboot thread does a single AP * state callback. * * It has 3 modes of operation: * - single: runs st->cb_state * - up: runs ++st->state, while st->state < st->target * - down: runs st->state--, while st->state > st->target * * When complete or on error, should_run is cleared and the completion is fired. */ static void cpuhp_thread_fun(unsigned int cpu) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); bool bringup = st->bringup; enum cpuhp_state state; if (WARN_ON_ONCE(!st->should_run)) return; /* * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures * that if we see ->should_run we also see the rest of the state. */ smp_mb(); /* * The BP holds the hotplug lock, but we're now running on the AP, * ensure that anybody asserting the lock is held, will actually find * it so. 
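 *
 * Illustrative sketch (an assumed callback, not from this file): an AP
 * online callback such as
 *
 *	static int foo_cpu_online(unsigned int cpu)
 *	{
 *		lockdep_assert_cpus_held();
 *		return 0;
 *	}
 *
 * runs in this thread, so the lockdep acquire below makes that assertion
 * hold even though the percpu rwsem is actually held by the BP.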
*/ lockdep_acquire_cpus_lock(); cpuhp_lock_acquire(bringup); if (st->single) { state = st->cb_state; st->should_run = false; } else { st->should_run = cpuhp_next_state(bringup, &state, st, st->target); if (!st->should_run) goto end; } WARN_ON_ONCE(!cpuhp_is_ap_state(state)); if (cpuhp_is_atomic_state(state)) { local_irq_disable(); st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last); local_irq_enable(); /* * STARTING/DYING must not fail! */ WARN_ON_ONCE(st->result); } else { st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last); } if (st->result) { /* * If we fail on a rollback, we're up a creek without no * paddle, no way forward, no way back. We loose, thanks for * playing. */ WARN_ON_ONCE(st->rollback); st->should_run = false; } end: cpuhp_lock_release(bringup); lockdep_release_cpus_lock(); if (!st->should_run) complete_ap_thread(st, bringup); } /* Invoke a single callback on a remote cpu */ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, struct hlist_node *node) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int ret; if (!cpu_online(cpu)) return 0; cpuhp_lock_acquire(false); cpuhp_lock_release(false); cpuhp_lock_acquire(true); cpuhp_lock_release(true); /* * If we are up and running, use the hotplug thread. For early calls * we invoke the thread function directly. */ if (!st->thread) return cpuhp_invoke_callback(cpu, state, bringup, node, NULL); st->rollback = false; st->last = NULL; st->node = node; st->bringup = bringup; st->cb_state = state; st->single = true; __cpuhp_kick_ap(st); /* * If we failed and did a partial, do a rollback. */ if ((ret = st->result) && st->last) { st->rollback = true; st->bringup = !bringup; __cpuhp_kick_ap(st); } /* * Clean up the leftovers so the next hotplug operation wont use stale * data. */ st->node = st->last = NULL; return ret; } static int cpuhp_kick_ap_work(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); enum cpuhp_state prev_state = st->state; int ret; cpuhp_lock_acquire(false); cpuhp_lock_release(false); cpuhp_lock_acquire(true); cpuhp_lock_release(true); trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work); ret = cpuhp_kick_ap(cpu, st, st->target); trace_cpuhp_exit(cpu, st->state, prev_state, ret); return ret; } static struct smp_hotplug_thread cpuhp_threads = { .store = &cpuhp_state.thread, .thread_should_run = cpuhp_should_run, .thread_fn = cpuhp_thread_fun, .thread_comm = "cpuhp/%u", .selfparking = true, }; static __init void cpuhp_init_state(void) { struct cpuhp_cpu_state *st; int cpu; for_each_possible_cpu(cpu) { st = per_cpu_ptr(&cpuhp_state, cpu); init_completion(&st->done_up); init_completion(&st->done_down); } } void __init cpuhp_threads_init(void) { cpuhp_init_state(); BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads)); kthread_unpark(this_cpu_read(cpuhp_state.thread)); } #ifdef CONFIG_HOTPLUG_CPU #ifndef arch_clear_mm_cpumask_cpu #define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm)) #endif /** * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU * @cpu: a CPU id * * This function walks all processes, finds a valid mm struct for each one and * then clears a corresponding bit in mm's cpumask. While this all sounds * trivial, there are various non-obvious corner cases, which this function * tries to solve in a safe manner. * * Also note that the function uses a somewhat relaxed locking scheme, so it may * be called only for an already offlined CPU. 
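 *
 * Illustrative sketch (assumed architecture teardown code): typically this
 * is invoked only after the CPU has been marked offline, e.g.
 *
 *	set_cpu_online(cpu, false);
 *	...
 *	clear_tasks_mm_cpumask(cpu);
 *
 * so no new task can race a bit back into its mm_cpumask.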
*/ void clear_tasks_mm_cpumask(int cpu) { struct task_struct *p; /* * This function is called after the cpu is taken down and marked * offline, so its not like new tasks will ever get this cpu set in * their mm mask. -- Peter Zijlstra * Thus, we may use rcu_read_lock() here, instead of grabbing * full-fledged tasklist_lock. */ WARN_ON(cpu_online(cpu)); rcu_read_lock(); for_each_process(p) { struct task_struct *t; /* * Main thread might exit, but other threads may still have * a valid mm. Find one. */ t = find_lock_task_mm(p); if (!t) continue; arch_clear_mm_cpumask_cpu(cpu, t->mm); task_unlock(t); } rcu_read_unlock(); } /* Take this CPU down. */ static int take_cpu_down(void *_param) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE); int err, cpu = smp_processor_id(); /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); if (err < 0) return err; /* * Must be called from CPUHP_TEARDOWN_CPU, which means, as we are going * down, that the current state is CPUHP_TEARDOWN_CPU - 1. */ WARN_ON(st->state != (CPUHP_TEARDOWN_CPU - 1)); /* * Invoke the former CPU_DYING callbacks. DYING must not fail! */ cpuhp_invoke_callback_range_nofail(false, cpu, st, target); /* Park the stopper thread */ stop_machine_park(cpu); return 0; } static int takedown_cpu(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int err; /* Park the smpboot threads */ kthread_park(st->thread); /* * Prevent irq alloc/free while the dying cpu reorganizes the * interrupt affinities. */ irq_lock_sparse(); err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu)); if (err) { /* CPU refused to die */ irq_unlock_sparse(); /* Unpark the hotplug thread so we can rollback there */ kthread_unpark(st->thread); return err; } BUG_ON(cpu_online(cpu)); /* * The teardown callback for CPUHP_AP_SCHED_STARTING will have removed * all runnable tasks from the CPU, there's only the idle task left now * that the migration thread is done doing the stop_machine thing. * * Wait for the stop thread to go away. */ wait_for_ap_thread(st, false); BUG_ON(st->state != CPUHP_AP_IDLE_DEAD); /* Interrupts are moved away from the dying cpu, reenable alloc/free */ irq_unlock_sparse(); hotplug_cpu__broadcast_tick_pull(cpu); /* This actually kills the CPU. */ __cpu_die(cpu); cpuhp_bp_sync_dead(cpu); lockdep_cleanup_dead_cpu(cpu, idle_thread_get(cpu)); /* * Callbacks must be re-integrated right away to the RCU state machine. * Otherwise an RCU callback could block a further teardown function * waiting for its completion. */ rcutree_migrate_callbacks(cpu); return 0; } static void cpuhp_complete_idle_dead(void *arg) { struct cpuhp_cpu_state *st = arg; complete_ap_thread(st, false); } void cpuhp_report_idle_dead(void) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); BUG_ON(st->state != CPUHP_AP_OFFLINE); tick_assert_timekeeping_handover(); rcutree_report_cpu_dead(); st->state = CPUHP_AP_IDLE_DEAD; /* * We cannot call complete after rcutree_report_cpu_dead() so we delegate it * to an online cpu. 
*/ smp_call_function_single(cpumask_first(cpu_online_mask), cpuhp_complete_idle_dead, st, 0); } static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target) { enum cpuhp_state prev_state = st->state; int ret = 0; ret = cpuhp_invoke_callback_range(false, cpu, st, target); if (ret) { pr_debug("CPU DOWN failed (%d) CPU %u state %s (%d)\n", ret, cpu, cpuhp_get_step(st->state)->name, st->state); cpuhp_reset_state(cpu, st, prev_state); if (st->state < prev_state) WARN_ON(cpuhp_invoke_callback_range(true, cpu, st, prev_state)); } return ret; } /* Requires cpu_add_remove_lock to be held */ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int prev_state, ret = 0; if (num_online_cpus() == 1) return -EBUSY; if (!cpu_present(cpu)) return -EINVAL; cpus_write_lock(); /* * Keep at least one housekeeping cpu onlined to avoid generating * an empty sched_domain span. */ if (cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)) >= nr_cpu_ids) { ret = -EBUSY; goto out; } cpuhp_tasks_frozen = tasks_frozen; prev_state = cpuhp_set_state(cpu, st, target); /* * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread. */ if (st->state > CPUHP_TEARDOWN_CPU) { st->target = max((int)target, CPUHP_TEARDOWN_CPU); ret = cpuhp_kick_ap_work(cpu); /* * The AP side has done the error rollback already. Just * return the error code.. */ if (ret) goto out; /* * We might have stopped still in the range of the AP hotplug * thread. Nothing to do anymore. */ if (st->state > CPUHP_TEARDOWN_CPU) goto out; st->target = target; } /* * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need * to do the further cleanups. */ ret = cpuhp_down_callbacks(cpu, st, target); if (ret && st->state < prev_state) { if (st->state == CPUHP_TEARDOWN_CPU) { cpuhp_reset_state(cpu, st, prev_state); __cpuhp_kick_ap(st); } else { WARN(1, "DEAD callback error for CPU%d", cpu); } } out: cpus_write_unlock(); arch_smt_update(); return ret; } static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target) { /* * If the platform does not support hotplug, report it explicitly to * differentiate it from a transient offlining failure. */ if (cpu_hotplug_offline_disabled) return -EOPNOTSUPP; if (cpu_hotplug_disabled) return -EBUSY; return _cpu_down(cpu, 0, target); } static int cpu_down(unsigned int cpu, enum cpuhp_state target) { int err; cpu_maps_update_begin(); err = cpu_down_maps_locked(cpu, target); cpu_maps_update_done(); return err; } /** * cpu_device_down - Bring down a cpu device * @dev: Pointer to the cpu device to offline * * This function is meant to be used by device core cpu subsystem only. * * Other subsystems should use remove_cpu() instead. * * Return: %0 on success or a negative errno code */ int cpu_device_down(struct device *dev) { return cpu_down(dev->id, CPUHP_OFFLINE); } int remove_cpu(unsigned int cpu) { int ret; lock_device_hotplug(); ret = device_offline(get_cpu_device(cpu)); unlock_device_hotplug(); return ret; } EXPORT_SYMBOL_GPL(remove_cpu); void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) { unsigned int cpu; int error; cpu_maps_update_begin(); /* * Make certain the cpu I'm about to reboot on is online. * * This is inline to what migrate_to_reboot_cpu() already do. 
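 *
 * Example of why this matters (illustrative): if the requested reboot CPU
 * was taken down via sysfs before kexec, the code below silently falls back
 * to the first online CPU instead of failing the shutdown.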
*/ if (!cpu_online(primary_cpu)) primary_cpu = cpumask_first(cpu_online_mask); for_each_online_cpu(cpu) { if (cpu == primary_cpu) continue; error = cpu_down_maps_locked(cpu, CPUHP_OFFLINE); if (error) { pr_err("Failed to offline CPU%d - error=%d", cpu, error); break; } } /* * Ensure all but the reboot CPU are offline. */ BUG_ON(num_online_cpus() > 1); /* * Make sure the CPUs won't be enabled by someone else after this * point. Kexec will reboot to a new kernel shortly resetting * everything along the way. */ cpu_hotplug_disabled++; cpu_maps_update_done(); } #else #define takedown_cpu NULL #endif /*CONFIG_HOTPLUG_CPU*/ /** * notify_cpu_starting(cpu) - Invoke the callbacks on the starting CPU * @cpu: cpu that just started * * It must be called by the arch code on the new cpu, before the new cpu * enables interrupts and before the "boot" cpu returns from __cpu_up(). */ void notify_cpu_starting(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); rcutree_report_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ cpumask_set_cpu(cpu, &cpus_booted_once_mask); /* * STARTING must not fail! */ cpuhp_invoke_callback_range_nofail(true, cpu, st, target); } /* * Called from the idle task. Wake up the controlling task which brings the * hotplug thread of the upcoming CPU up and then delegates the rest of the * online bringup to the hotplug thread. */ void cpuhp_online_idle(enum cpuhp_state state) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); /* Happens for the boot cpu */ if (state != CPUHP_AP_ONLINE_IDLE) return; cpuhp_ap_update_sync_state(SYNC_STATE_ONLINE); /* * Unpark the stopper thread before we start the idle loop (and start * scheduling); this ensures the stopper task is always available. */ stop_machine_unpark(smp_processor_id()); st->state = CPUHP_AP_ONLINE_IDLE; complete_ap_thread(st, true); } /* Requires cpu_add_remove_lock to be held */ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct task_struct *idle; int ret = 0; cpus_write_lock(); if (!cpu_present(cpu)) { ret = -EINVAL; goto out; } /* * The caller of cpu_up() might have raced with another * caller. Nothing to do. */ if (st->state >= target) goto out; if (st->state == CPUHP_OFFLINE) { /* Let it fail before we try to bring the cpu up */ idle = idle_thread_get(cpu); if (IS_ERR(idle)) { ret = PTR_ERR(idle); goto out; } /* * Reset stale stack state from the last time this CPU was online. */ scs_task_reset(idle); kasan_unpoison_task_stack(idle); } cpuhp_tasks_frozen = tasks_frozen; cpuhp_set_state(cpu, st, target); /* * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread once more. */ if (st->state > CPUHP_BRINGUP_CPU) { ret = cpuhp_kick_ap_work(cpu); /* * The AP side has done the error rollback already. Just * return the error code.. */ if (ret) goto out; } /* * Try to reach the target state. We max out on the BP at * CPUHP_BRINGUP_CPU. After that the AP hotplug thread is * responsible for bringing it up to the target state. 
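 *
 * Simplified illustration (assuming target == CPUHP_ONLINE): the BP runs
 * the PREPARE steps up to and including CPUHP_BRINGUP_CPU below, the new
 * CPU runs its STARTING steps itself, and the per-CPU hotplug thread then
 * finishes the remaining AP states up to CPUHP_ONLINE.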
*/ target = min((int)target, CPUHP_BRINGUP_CPU); ret = cpuhp_up_callbacks(cpu, st, target); out: cpus_write_unlock(); arch_smt_update(); return ret; } static int cpu_up(unsigned int cpu, enum cpuhp_state target) { int err = 0; if (!cpu_possible(cpu)) { pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n", cpu); return -EINVAL; } err = try_online_node(cpu_to_node(cpu)); if (err) return err; cpu_maps_update_begin(); if (cpu_hotplug_disabled) { err = -EBUSY; goto out; } if (!cpu_bootable(cpu)) { err = -EPERM; goto out; } err = _cpu_up(cpu, 0, target); out: cpu_maps_update_done(); return err; } /** * cpu_device_up - Bring up a cpu device * @dev: Pointer to the cpu device to online * * This function is meant to be used by device core cpu subsystem only. * * Other subsystems should use add_cpu() instead. * * Return: %0 on success or a negative errno code */ int cpu_device_up(struct device *dev) { return cpu_up(dev->id, CPUHP_ONLINE); } int add_cpu(unsigned int cpu) { int ret; lock_device_hotplug(); ret = device_online(get_cpu_device(cpu)); unlock_device_hotplug(); return ret; } EXPORT_SYMBOL_GPL(add_cpu); /** * bringup_hibernate_cpu - Bring up the CPU that we hibernated on * @sleep_cpu: The cpu we hibernated on and should be brought up. * * On some architectures like arm64, we can hibernate on any CPU, but on * wake up the CPU we hibernated on might be offline as a side effect of * using maxcpus= for example. * * Return: %0 on success or a negative errno code */ int bringup_hibernate_cpu(unsigned int sleep_cpu) { int ret; if (!cpu_online(sleep_cpu)) { pr_info("Hibernated on a CPU that is offline! Bringing CPU up.\n"); ret = cpu_up(sleep_cpu, CPUHP_ONLINE); if (ret) { pr_err("Failed to bring hibernate-CPU up!\n"); return ret; } } return 0; } static void __init cpuhp_bringup_mask(const struct cpumask *mask, unsigned int ncpus, enum cpuhp_state target) { unsigned int cpu; for_each_cpu(cpu, mask) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); if (cpu_up(cpu, target) && can_rollback_cpu(st)) { /* * If this failed then cpu_up() might have only * rolled back to CPUHP_BP_KICK_AP for the final * online. Clean it up. NOOP if already rolled back. */ WARN_ON(cpuhp_invoke_callback_range(false, cpu, st, CPUHP_OFFLINE)); } if (!--ncpus) break; } } #ifdef CONFIG_HOTPLUG_PARALLEL static bool __cpuhp_parallel_bringup __ro_after_init = true; static int __init parallel_bringup_parse_param(char *arg) { return kstrtobool(arg, &__cpuhp_parallel_bringup); } early_param("cpuhp.parallel", parallel_bringup_parse_param); #ifdef CONFIG_HOTPLUG_SMT static inline bool cpuhp_smt_aware(void) { return cpu_smt_max_threads > 1; } static inline const struct cpumask *cpuhp_get_primary_thread_mask(void) { return cpu_primary_thread_mask; } #else static inline bool cpuhp_smt_aware(void) { return false; } static inline const struct cpumask *cpuhp_get_primary_thread_mask(void) { return cpu_none_mask; } #endif bool __weak arch_cpuhp_init_parallel_bringup(void) { return true; } /* * On architectures which have enabled parallel bringup this invokes all BP * prepare states for each of the to be onlined APs first. The last state * sends the startup IPI to the APs. The APs proceed through the low level * bringup code in parallel and then wait for the control CPU to release * them one by one for the final onlining procedure. * * This avoids waiting for each AP to respond to the startup IPI in * CPUHP_BRINGUP_CPU. 
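 *
 * Illustrative usage (kernel command line): the optimization can be turned
 * off with
 *
 *	cpuhp.parallel=0
 *
 * in which case bringup_nonboot_cpus() uses the fully serialized path.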
*/ static bool __init cpuhp_bringup_cpus_parallel(unsigned int ncpus) { const struct cpumask *mask = cpu_present_mask; if (__cpuhp_parallel_bringup) __cpuhp_parallel_bringup = arch_cpuhp_init_parallel_bringup(); if (!__cpuhp_parallel_bringup) return false; if (cpuhp_smt_aware()) { const struct cpumask *pmask = cpuhp_get_primary_thread_mask(); static struct cpumask tmp_mask __initdata; /* * X86 requires to prevent that SMT siblings stopped while * the primary thread does a microcode update for various * reasons. Bring the primary threads up first. */ cpumask_and(&tmp_mask, mask, pmask); cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_BP_KICK_AP); cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_ONLINE); /* Account for the online CPUs */ ncpus -= num_online_cpus(); if (!ncpus) return true; /* Create the mask for secondary CPUs */ cpumask_andnot(&tmp_mask, mask, pmask); mask = &tmp_mask; } /* Bring the not-yet started CPUs up */ cpuhp_bringup_mask(mask, ncpus, CPUHP_BP_KICK_AP); cpuhp_bringup_mask(mask, ncpus, CPUHP_ONLINE); return true; } #else static inline bool cpuhp_bringup_cpus_parallel(unsigned int ncpus) { return false; } #endif /* CONFIG_HOTPLUG_PARALLEL */ void __init bringup_nonboot_cpus(unsigned int max_cpus) { if (!max_cpus) return; /* Try parallel bringup optimization if enabled */ if (cpuhp_bringup_cpus_parallel(max_cpus)) return; /* Full per CPU serialized bringup */ cpuhp_bringup_mask(cpu_present_mask, max_cpus, CPUHP_ONLINE); } #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus; int freeze_secondary_cpus(int primary) { int cpu, error = 0; cpu_maps_update_begin(); if (primary == -1) { primary = cpumask_first(cpu_online_mask); if (!housekeeping_cpu(primary, HK_TYPE_TIMER)) primary = housekeeping_any_cpu(HK_TYPE_TIMER); } else { if (!cpu_online(primary)) primary = cpumask_first(cpu_online_mask); } /* * We take down all of the non-boot CPUs in one shot to avoid races * with the userspace trying to use the CPU hotplug at the same time */ cpumask_clear(frozen_cpus); pr_info("Disabling non-boot CPUs ...\n"); for (cpu = nr_cpu_ids - 1; cpu >= 0; cpu--) { if (!cpu_online(cpu) || cpu == primary) continue; if (pm_wakeup_pending()) { pr_info("Wakeup pending. Abort CPU freeze\n"); error = -EBUSY; break; } trace_suspend_resume(TPS("CPU_OFF"), cpu, true); error = _cpu_down(cpu, 1, CPUHP_OFFLINE); trace_suspend_resume(TPS("CPU_OFF"), cpu, false); if (!error) cpumask_set_cpu(cpu, frozen_cpus); else { pr_err("Error taking CPU%d down: %d\n", cpu, error); break; } } if (!error) BUG_ON(num_online_cpus() > 1); else pr_err("Non-boot CPUs are not disabled\n"); /* * Make sure the CPUs won't be enabled by someone else. We need to do * this even in case of failure as all freeze_secondary_cpus() users are * supposed to do thaw_secondary_cpus() on the failure path. 
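 *
 * Illustrative pairing (assumed caller, simplified): suspend-side code is
 * expected to do
 *
 *	error = freeze_secondary_cpus(-1);
 *	...
 *	thaw_secondary_cpus();
 *
 * on the failure path as well, which re-balances the cpu_hotplug_disabled
 * count taken below.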
*/ cpu_hotplug_disabled++; cpu_maps_update_done(); return error; } void __weak arch_thaw_secondary_cpus_begin(void) { } void __weak arch_thaw_secondary_cpus_end(void) { } void thaw_secondary_cpus(void) { int cpu, error; /* Allow everyone to use the CPU hotplug again */ cpu_maps_update_begin(); __cpu_hotplug_enable(); if (cpumask_empty(frozen_cpus)) goto out; pr_info("Enabling non-boot CPUs ...\n"); arch_thaw_secondary_cpus_begin(); for_each_cpu(cpu, frozen_cpus) { trace_suspend_resume(TPS("CPU_ON"), cpu, true); error = _cpu_up(cpu, 1, CPUHP_ONLINE); trace_suspend_resume(TPS("CPU_ON"), cpu, false); if (!error) { pr_info("CPU%d is up\n", cpu); continue; } pr_warn("Error taking CPU%d up: %d\n", cpu, error); } arch_thaw_secondary_cpus_end(); cpumask_clear(frozen_cpus); out: cpu_maps_update_done(); } static int __init alloc_frozen_cpus(void) { if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO)) return -ENOMEM; return 0; } core_initcall(alloc_frozen_cpus); /* * When callbacks for CPU hotplug notifications are being executed, we must * ensure that the state of the system with respect to the tasks being frozen * or not, as reported by the notification, remains unchanged *throughout the * duration* of the execution of the callbacks. * Hence we need to prevent the freezer from racing with regular CPU hotplug. * * This synchronization is implemented by mutually excluding regular CPU * hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/ * Hibernate notifications. */ static int cpu_hotplug_pm_callback(struct notifier_block *nb, unsigned long action, void *ptr) { switch (action) { case PM_SUSPEND_PREPARE: case PM_HIBERNATION_PREPARE: cpu_hotplug_disable(); break; case PM_POST_SUSPEND: case PM_POST_HIBERNATION: cpu_hotplug_enable(); break; default: return NOTIFY_DONE; } return NOTIFY_OK; } static int __init cpu_hotplug_pm_sync_init(void) { /* * cpu_hotplug_pm_callback has higher priority than x86 * bsp_pm_callback which depends on cpu_hotplug_pm_callback * to disable cpu hotplug to avoid cpu hotplug race. */ pm_notifier(cpu_hotplug_pm_callback, 0); return 0; } core_initcall(cpu_hotplug_pm_sync_init); #endif /* CONFIG_PM_SLEEP_SMP */ int __boot_cpu_id; #endif /* CONFIG_SMP */ /* Boot processor state steps */ static struct cpuhp_step cpuhp_hp_states[] = { [CPUHP_OFFLINE] = { .name = "offline", .startup.single = NULL, .teardown.single = NULL, }, #ifdef CONFIG_SMP [CPUHP_CREATE_THREADS]= { .name = "threads:prepare", .startup.single = smpboot_create_threads, .teardown.single = NULL, .cant_stop = true, }, [CPUHP_RANDOM_PREPARE] = { .name = "random:prepare", .startup.single = random_prepare_cpu, .teardown.single = NULL, }, [CPUHP_WORKQUEUE_PREP] = { .name = "workqueue:prepare", .startup.single = workqueue_prepare_cpu, .teardown.single = NULL, }, [CPUHP_HRTIMERS_PREPARE] = { .name = "hrtimers:prepare", .startup.single = hrtimers_prepare_cpu, .teardown.single = NULL, }, [CPUHP_SMPCFD_PREPARE] = { .name = "smpcfd:prepare", .startup.single = smpcfd_prepare_cpu, .teardown.single = smpcfd_dead_cpu, }, [CPUHP_RELAY_PREPARE] = { .name = "relay:prepare", .startup.single = relay_prepare_cpu, .teardown.single = NULL, }, [CPUHP_RCUTREE_PREP] = { .name = "RCU/tree:prepare", .startup.single = rcutree_prepare_cpu, .teardown.single = rcutree_dead_cpu, }, /* * On the tear-down path, timers_dead_cpu() must be invoked * before blk_mq_queue_reinit_notify() from notify_dead(), * otherwise a RCU stall occurs. 
*/ [CPUHP_TIMERS_PREPARE] = { .name = "timers:prepare", .startup.single = timers_prepare_cpu, .teardown.single = timers_dead_cpu, }, #ifdef CONFIG_HOTPLUG_SPLIT_STARTUP /* * Kicks the AP alive. AP will wait in cpuhp_ap_sync_alive() until * the next step will release it. */ [CPUHP_BP_KICK_AP] = { .name = "cpu:kick_ap", .startup.single = cpuhp_kick_ap_alive, }, /* * Waits for the AP to reach cpuhp_ap_sync_alive() and then * releases it for the complete bringup. */ [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", .startup.single = cpuhp_bringup_ap, .teardown.single = finish_cpu, .cant_stop = true, }, #else /* * All-in-one CPU bringup state which includes the kick alive. */ [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", .startup.single = bringup_cpu, .teardown.single = finish_cpu, .cant_stop = true, }, #endif /* Final state before CPU kills itself */ [CPUHP_AP_IDLE_DEAD] = { .name = "idle:dead", }, /* * Last state before CPU enters the idle loop to die. Transient state * for synchronization. */ [CPUHP_AP_OFFLINE] = { .name = "ap:offline", .cant_stop = true, }, /* First state is scheduler control. Interrupts are disabled */ [CPUHP_AP_SCHED_STARTING] = { .name = "sched:starting", .startup.single = sched_cpu_starting, .teardown.single = sched_cpu_dying, }, [CPUHP_AP_RCUTREE_DYING] = { .name = "RCU/tree:dying", .startup.single = NULL, .teardown.single = rcutree_dying_cpu, }, [CPUHP_AP_SMPCFD_DYING] = { .name = "smpcfd:dying", .startup.single = NULL, .teardown.single = smpcfd_dying_cpu, }, [CPUHP_AP_HRTIMERS_DYING] = { .name = "hrtimers:dying", .startup.single = hrtimers_cpu_starting, .teardown.single = hrtimers_cpu_dying, }, [CPUHP_AP_TICK_DYING] = { .name = "tick:dying", .startup.single = NULL, .teardown.single = tick_cpu_dying, }, /* Entry state on starting. Interrupts enabled from here on. Transient * state for synchronsization */ [CPUHP_AP_ONLINE] = { .name = "ap:online", }, /* * Handled on control processor until the plugged processor manages * this itself. 
*/ [CPUHP_TEARDOWN_CPU] = { .name = "cpu:teardown", .startup.single = NULL, .teardown.single = takedown_cpu, .cant_stop = true, }, [CPUHP_AP_SCHED_WAIT_EMPTY] = { .name = "sched:waitempty", .startup.single = NULL, .teardown.single = sched_cpu_wait_empty, }, /* Handle smpboot threads park/unpark */ [CPUHP_AP_SMPBOOT_THREADS] = { .name = "smpboot/threads:online", .startup.single = smpboot_unpark_threads, .teardown.single = smpboot_park_threads, }, [CPUHP_AP_IRQ_AFFINITY_ONLINE] = { .name = "irq/affinity:online", .startup.single = irq_affinity_online_cpu, .teardown.single = NULL, }, [CPUHP_AP_PERF_ONLINE] = { .name = "perf:online", .startup.single = perf_event_init_cpu, .teardown.single = perf_event_exit_cpu, }, [CPUHP_AP_WATCHDOG_ONLINE] = { .name = "lockup_detector:online", .startup.single = lockup_detector_online_cpu, .teardown.single = lockup_detector_offline_cpu, }, [CPUHP_AP_WORKQUEUE_ONLINE] = { .name = "workqueue:online", .startup.single = workqueue_online_cpu, .teardown.single = workqueue_offline_cpu, }, [CPUHP_AP_RANDOM_ONLINE] = { .name = "random:online", .startup.single = random_online_cpu, .teardown.single = NULL, }, [CPUHP_AP_RCUTREE_ONLINE] = { .name = "RCU/tree:online", .startup.single = rcutree_online_cpu, .teardown.single = rcutree_offline_cpu, }, #endif /* * The dynamically registered state space is here */ #ifdef CONFIG_SMP /* Last state is scheduler control setting the cpu active */ [CPUHP_AP_ACTIVE] = { .name = "sched:active", .startup.single = sched_cpu_activate, .teardown.single = sched_cpu_deactivate, }, #endif /* CPU is fully up and running. */ [CPUHP_ONLINE] = { .name = "online", .startup.single = NULL, .teardown.single = NULL, }, }; /* Sanity check for callbacks */ static int cpuhp_cb_check(enum cpuhp_state state) { if (state <= CPUHP_OFFLINE || state >= CPUHP_ONLINE) return -EINVAL; return 0; } /* * Returns a free for dynamic slot assignment of the Online state. The states * are protected by the cpuhp_slot_states mutex and an empty slot is identified * by having no name assigned. */ static int cpuhp_reserve_state(enum cpuhp_state state) { enum cpuhp_state i, end; struct cpuhp_step *step; switch (state) { case CPUHP_AP_ONLINE_DYN: step = cpuhp_hp_states + CPUHP_AP_ONLINE_DYN; end = CPUHP_AP_ONLINE_DYN_END; break; case CPUHP_BP_PREPARE_DYN: step = cpuhp_hp_states + CPUHP_BP_PREPARE_DYN; end = CPUHP_BP_PREPARE_DYN_END; break; default: return -EINVAL; } for (i = state; i <= end; i++, step++) { if (!step->name) return i; } WARN(1, "No more dynamic states available for CPU hotplug\n"); return -ENOSPC; } static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name, int (*startup)(unsigned int cpu), int (*teardown)(unsigned int cpu), bool multi_instance) { /* (Un)Install the callbacks for further cpu hotplug operations */ struct cpuhp_step *sp; int ret = 0; /* * If name is NULL, then the state gets removed. * * CPUHP_AP_ONLINE_DYN and CPUHP_BP_PREPARE_DYN are handed out on * the first allocation from these dynamic ranges, so the removal * would trigger a new allocation and clear the wrong (already * empty) state, leaving the callbacks of the to be cleared state * dangling, which causes wreckage on the next hotplug operation. 
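 *
 * Concrete illustration (state number assumed): a caller that registered
 * with CPUHP_AP_ONLINE_DYN and was handed state 215 must later remove
 * state 215, not CPUHP_AP_ONLINE_DYN itself; passing the latter again would
 * just reserve a fresh empty slot and wipe that one instead.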
*/ if (name && (state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN)) { ret = cpuhp_reserve_state(state); if (ret < 0) return ret; state = ret; } sp = cpuhp_get_step(state); if (name && sp->name) return -EBUSY; sp->startup.single = startup; sp->teardown.single = teardown; sp->name = name; sp->multi_instance = multi_instance; INIT_HLIST_HEAD(&sp->list); return ret; } static void *cpuhp_get_teardown_cb(enum cpuhp_state state) { return cpuhp_get_step(state)->teardown.single; } /* * Call the startup/teardown function for a step either on the AP or * on the current CPU. */ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup, struct hlist_node *node) { struct cpuhp_step *sp = cpuhp_get_step(state); int ret; /* * If there's nothing to do, we done. * Relies on the union for multi_instance. */ if (cpuhp_step_empty(bringup, sp)) return 0; /* * The non AP bound callbacks can fail on bringup. On teardown * e.g. module removal we crash for now. */ #ifdef CONFIG_SMP if (cpuhp_is_ap_state(state)) ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node); else ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL); #else if (cpuhp_is_atomic_state(state)) { guard(irqsave)(); ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL); /* STARTING/DYING must not fail! */ WARN_ON_ONCE(ret); } else { ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL); } #endif BUG_ON(ret && !bringup); return ret; } /* * Called from __cpuhp_setup_state on a recoverable failure. * * Note: The teardown callbacks for rollback are not allowed to fail! */ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, struct hlist_node *node) { int cpu; /* Roll back the already executed steps on the other cpus */ for_each_present_cpu(cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int cpustate = st->state; if (cpu >= failedcpu) break; /* Did we invoke the startup call on that cpu ? */ if (cpustate >= state) cpuhp_issue_call(cpu, state, false, node); } } int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state, struct hlist_node *node, bool invoke) { struct cpuhp_step *sp; int cpu; int ret; lockdep_assert_cpus_held(); sp = cpuhp_get_step(state); if (sp->multi_instance == false) return -EINVAL; mutex_lock(&cpuhp_state_mutex); if (!invoke || !sp->startup.multi) goto add_node; /* * Try to call the startup callback for each present cpu * depending on the hotplug state of the cpu. */ for_each_present_cpu(cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int cpustate = st->state; if (cpustate < state) continue; ret = cpuhp_issue_call(cpu, state, true, node); if (ret) { if (sp->teardown.multi) cpuhp_rollback_install(cpu, state, node); goto unlock; } } add_node: ret = 0; hlist_add_head(node, &sp->list); unlock: mutex_unlock(&cpuhp_state_mutex); return ret; } int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, bool invoke) { int ret; cpus_read_lock(); ret = __cpuhp_state_add_instance_cpuslocked(state, node, invoke); cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); /** * __cpuhp_setup_state_cpuslocked - Setup the callbacks for an hotplug machine state * @state: The state to setup * @name: Name of the step * @invoke: If true, the startup function is invoked for cpus where * cpu state >= @state * @startup: startup callback function * @teardown: teardown callback function * @multi_instance: State is set up for multiple instances which get * added afterwards. 
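 *
 * Illustrative usage (assumed subsystem callbacks, via the non-cpuslocked
 * wrapper):
 *
 *	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "subsys/foo:online",
 *				foo_cpu_online, foo_cpu_offline);
 *	if (ret < 0)
 *		return ret;
 *	foo_hp_state = ret;	(dynamically allocated state number)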
* * The caller needs to hold cpus read locked while calling this function. * Return: * On success: * Positive state number if @state is CPUHP_AP_ONLINE_DYN or CPUHP_BP_PREPARE_DYN; * 0 for all other states * On failure: proper (negative) error code */ int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, const char *name, bool invoke, int (*startup)(unsigned int cpu), int (*teardown)(unsigned int cpu), bool multi_instance) { int cpu, ret = 0; bool dynstate; lockdep_assert_cpus_held(); if (cpuhp_cb_check(state) || !name) return -EINVAL; mutex_lock(&cpuhp_state_mutex); ret = cpuhp_store_callbacks(state, name, startup, teardown, multi_instance); dynstate = state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN; if (ret > 0 && dynstate) { state = ret; ret = 0; } if (ret || !invoke || !startup) goto out; /* * Try to call the startup callback for each present cpu * depending on the hotplug state of the cpu. */ for_each_present_cpu(cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int cpustate = st->state; if (cpustate < state) continue; ret = cpuhp_issue_call(cpu, state, true, NULL); if (ret) { if (teardown) cpuhp_rollback_install(cpu, state, NULL); cpuhp_store_callbacks(state, NULL, NULL, NULL, false); goto out; } } out: mutex_unlock(&cpuhp_state_mutex); /* * If the requested state is CPUHP_AP_ONLINE_DYN or CPUHP_BP_PREPARE_DYN, * return the dynamically allocated state in case of success. */ if (!ret && dynstate) return state; return ret; } EXPORT_SYMBOL(__cpuhp_setup_state_cpuslocked); int __cpuhp_setup_state(enum cpuhp_state state, const char *name, bool invoke, int (*startup)(unsigned int cpu), int (*teardown)(unsigned int cpu), bool multi_instance) { int ret; cpus_read_lock(); ret = __cpuhp_setup_state_cpuslocked(state, name, invoke, startup, teardown, multi_instance); cpus_read_unlock(); return ret; } EXPORT_SYMBOL(__cpuhp_setup_state); int __cpuhp_state_remove_instance(enum cpuhp_state state, struct hlist_node *node, bool invoke) { struct cpuhp_step *sp = cpuhp_get_step(state); int cpu; BUG_ON(cpuhp_cb_check(state)); if (!sp->multi_instance) return -EINVAL; cpus_read_lock(); mutex_lock(&cpuhp_state_mutex); if (!invoke || !cpuhp_get_teardown_cb(state)) goto remove; /* * Call the teardown callback for each present cpu depending * on the hotplug state of the cpu. This function is not * allowed to fail currently! */ for_each_present_cpu(cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int cpustate = st->state; if (cpustate >= state) cpuhp_issue_call(cpu, state, false, node); } remove: hlist_del(node); mutex_unlock(&cpuhp_state_mutex); cpus_read_unlock(); return 0; } EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance); /** * __cpuhp_remove_state_cpuslocked - Remove the callbacks for an hotplug machine state * @state: The state to remove * @invoke: If true, the teardown function is invoked for cpus where * cpu state >= @state * * The caller needs to hold cpus read locked while calling this function. * The teardown callback is currently not allowed to fail. Think * about module removal! 
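 *
 * Illustrative usage (assumed module exit path, via the non-cpuslocked
 * wrapper): a module that stored the dynamic state at setup time undoes it
 * with
 *
 *	cpuhp_remove_state(foo_hp_state);
 *
 * which invokes the teardown callback on every present CPU that is at or
 * above the state.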
*/ void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke) { struct cpuhp_step *sp = cpuhp_get_step(state); int cpu; BUG_ON(cpuhp_cb_check(state)); lockdep_assert_cpus_held(); mutex_lock(&cpuhp_state_mutex); if (sp->multi_instance) { WARN(!hlist_empty(&sp->list), "Error: Removing state %d which has instances left.\n", state); goto remove; } if (!invoke || !cpuhp_get_teardown_cb(state)) goto remove; /* * Call the teardown callback for each present cpu depending * on the hotplug state of the cpu. This function is not * allowed to fail currently! */ for_each_present_cpu(cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int cpustate = st->state; if (cpustate >= state) cpuhp_issue_call(cpu, state, false, NULL); } remove: cpuhp_store_callbacks(state, NULL, NULL, NULL, false); mutex_unlock(&cpuhp_state_mutex); } EXPORT_SYMBOL(__cpuhp_remove_state_cpuslocked); void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) { cpus_read_lock(); __cpuhp_remove_state_cpuslocked(state, invoke); cpus_read_unlock(); } EXPORT_SYMBOL(__cpuhp_remove_state); #ifdef CONFIG_HOTPLUG_SMT static void cpuhp_offline_cpu_device(unsigned int cpu) { struct device *dev = get_cpu_device(cpu); dev->offline = true; /* Tell user space about the state change */ kobject_uevent(&dev->kobj, KOBJ_OFFLINE); } static void cpuhp_online_cpu_device(unsigned int cpu) { struct device *dev = get_cpu_device(cpu); dev->offline = false; /* Tell user space about the state change */ kobject_uevent(&dev->kobj, KOBJ_ONLINE); } int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { int cpu, ret = 0; cpu_maps_update_begin(); for_each_online_cpu(cpu) { if (topology_is_primary_thread(cpu)) continue; /* * Disable can be called with CPU_SMT_ENABLED when changing * from a higher to lower number of SMT threads per core. */ if (ctrlval == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu)) continue; ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE); if (ret) break; /* * As this needs to hold the cpu maps lock it's impossible * to call device_offline() because that ends up calling * cpu_down() which takes cpu maps lock. cpu maps lock * needs to be held as this might race against in kernel * abusers of the hotplug machinery (thermal management). * * So nothing would update device:offline state. That would * leave the sysfs entry stale and prevent onlining after * smt control has been changed to 'off' again. This is * called under the sysfs hotplug lock, so it is properly * serialized against the regular offline usage. 
*/ cpuhp_offline_cpu_device(cpu); } if (!ret) cpu_smt_control = ctrlval; cpu_maps_update_done(); return ret; } /* Check if the core a CPU belongs to is online */ #if !defined(topology_is_core_online) static inline bool topology_is_core_online(unsigned int cpu) { return true; } #endif int cpuhp_smt_enable(void) { int cpu, ret = 0; cpu_maps_update_begin(); cpu_smt_control = CPU_SMT_ENABLED; for_each_present_cpu(cpu) { /* Skip online CPUs and CPUs on offline nodes */ if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) continue; if (!cpu_smt_thread_allowed(cpu) || !topology_is_core_online(cpu)) continue; ret = _cpu_up(cpu, 0, CPUHP_ONLINE); if (ret) break; /* See comment in cpuhp_smt_disable() */ cpuhp_online_cpu_device(cpu); } cpu_maps_update_done(); return ret; } #endif #if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU) static ssize_t state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); return sprintf(buf, "%d\n", st->state); } static DEVICE_ATTR_RO(state); static ssize_t target_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); struct cpuhp_step *sp; int target, ret; ret = kstrtoint(buf, 10, &target); if (ret) return ret; #ifdef CONFIG_CPU_HOTPLUG_STATE_CONTROL if (target < CPUHP_OFFLINE || target > CPUHP_ONLINE) return -EINVAL; #else if (target != CPUHP_OFFLINE && target != CPUHP_ONLINE) return -EINVAL; #endif ret = lock_device_hotplug_sysfs(); if (ret) return ret; mutex_lock(&cpuhp_state_mutex); sp = cpuhp_get_step(target); ret = !sp->name || sp->cant_stop ? -EINVAL : 0; mutex_unlock(&cpuhp_state_mutex); if (ret) goto out; if (st->state < target) ret = cpu_up(dev->id, target); else if (st->state > target) ret = cpu_down(dev->id, target); else if (WARN_ON(st->target != target)) st->target = target; out: unlock_device_hotplug(); return ret ? ret : count; } static ssize_t target_show(struct device *dev, struct device_attribute *attr, char *buf) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); return sprintf(buf, "%d\n", st->target); } static DEVICE_ATTR_RW(target); static ssize_t fail_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); struct cpuhp_step *sp; int fail, ret; ret = kstrtoint(buf, 10, &fail); if (ret) return ret; if (fail == CPUHP_INVALID) { st->fail = fail; return count; } if (fail < CPUHP_OFFLINE || fail > CPUHP_ONLINE) return -EINVAL; /* * Cannot fail STARTING/DYING callbacks. */ if (cpuhp_is_atomic_state(fail)) return -EINVAL; /* * DEAD callbacks cannot fail... * ... neither can CPUHP_BRINGUP_CPU during hotunplug. The latter * triggering STARTING callbacks, a failure in this state would * hinder rollback. */ if (fail <= CPUHP_BRINGUP_CPU && st->state > CPUHP_BRINGUP_CPU) return -EINVAL; /* * Cannot fail anything that doesn't have callbacks. 
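 *
 * Illustrative usage (assumed sysfs paths and state number, for testing
 * rollback):
 *
 *	echo 150 > /sys/devices/system/cpu/cpu1/hotplug/fail
 *	echo 0   > /sys/devices/system/cpu/cpu1/hotplug/target
 *
 * makes the next offline of CPU1 fail at the assumed state 150 so the
 * rollback paths can be exercised; writing CPUHP_INVALID (-1) clears it.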
*/ mutex_lock(&cpuhp_state_mutex); sp = cpuhp_get_step(fail); if (!sp->startup.single && !sp->teardown.single) ret = -EINVAL; mutex_unlock(&cpuhp_state_mutex); if (ret) return ret; st->fail = fail; return count; } static ssize_t fail_show(struct device *dev, struct device_attribute *attr, char *buf) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); return sprintf(buf, "%d\n", st->fail); } static DEVICE_ATTR_RW(fail); static struct attribute *cpuhp_cpu_attrs[] = { &dev_attr_state.attr, &dev_attr_target.attr, &dev_attr_fail.attr, NULL }; static const struct attribute_group cpuhp_cpu_attr_group = { .attrs = cpuhp_cpu_attrs, .name = "hotplug", }; static ssize_t states_show(struct device *dev, struct device_attribute *attr, char *buf) { ssize_t cur, res = 0; int i; mutex_lock(&cpuhp_state_mutex); for (i = CPUHP_OFFLINE; i <= CPUHP_ONLINE; i++) { struct cpuhp_step *sp = cpuhp_get_step(i); if (sp->name) { cur = sprintf(buf, "%3d: %s\n", i, sp->name); buf += cur; res += cur; } } mutex_unlock(&cpuhp_state_mutex); return res; } static DEVICE_ATTR_RO(states); static struct attribute *cpuhp_cpu_root_attrs[] = { &dev_attr_states.attr, NULL }; static const struct attribute_group cpuhp_cpu_root_attr_group = { .attrs = cpuhp_cpu_root_attrs, .name = "hotplug", }; #ifdef CONFIG_HOTPLUG_SMT static bool cpu_smt_num_threads_valid(unsigned int threads) { if (IS_ENABLED(CONFIG_SMT_NUM_THREADS_DYNAMIC)) return threads >= 1 && threads <= cpu_smt_max_threads; return threads == 1 || threads == cpu_smt_max_threads; } static ssize_t __store_smt_control(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int ctrlval, ret, num_threads, orig_threads; bool force_off; if (cpu_smt_control == CPU_SMT_FORCE_DISABLED) return -EPERM; if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED) return -ENODEV; if (sysfs_streq(buf, "on")) { ctrlval = CPU_SMT_ENABLED; num_threads = cpu_smt_max_threads; } else if (sysfs_streq(buf, "off")) { ctrlval = CPU_SMT_DISABLED; num_threads = 1; } else if (sysfs_streq(buf, "forceoff")) { ctrlval = CPU_SMT_FORCE_DISABLED; num_threads = 1; } else if (kstrtoint(buf, 10, &num_threads) == 0) { if (num_threads == 1) ctrlval = CPU_SMT_DISABLED; else if (cpu_smt_num_threads_valid(num_threads)) ctrlval = CPU_SMT_ENABLED; else return -EINVAL; } else { return -EINVAL; } ret = lock_device_hotplug_sysfs(); if (ret) return ret; orig_threads = cpu_smt_num_threads; cpu_smt_num_threads = num_threads; force_off = ctrlval != cpu_smt_control && ctrlval == CPU_SMT_FORCE_DISABLED; if (num_threads > orig_threads) ret = cpuhp_smt_enable(); else if (num_threads < orig_threads || force_off) ret = cpuhp_smt_disable(ctrlval); unlock_device_hotplug(); return ret ? ret : count; } #else /* !CONFIG_HOTPLUG_SMT */ static ssize_t __store_smt_control(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { return -ENODEV; } #endif /* CONFIG_HOTPLUG_SMT */ static const char *smt_states[] = { [CPU_SMT_ENABLED] = "on", [CPU_SMT_DISABLED] = "off", [CPU_SMT_FORCE_DISABLED] = "forceoff", [CPU_SMT_NOT_SUPPORTED] = "notsupported", [CPU_SMT_NOT_IMPLEMENTED] = "notimplemented", }; static ssize_t control_show(struct device *dev, struct device_attribute *attr, char *buf) { const char *state = smt_states[cpu_smt_control]; #ifdef CONFIG_HOTPLUG_SMT /* * If SMT is enabled but not all threads are enabled then show the * number of threads. If all threads are enabled show "on". Otherwise * show the state name. 
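 *
 * Illustrative usage (assumed sysfs path):
 *
 *	echo off > /sys/devices/system/cpu/smt/control
 *	echo on  > /sys/devices/system/cpu/smt/control
 *	echo 2   > /sys/devices/system/cpu/smt/control
 *
 * where a numeric thread count is only meaningful on architectures with
 * CONFIG_SMT_NUM_THREADS_DYNAMIC (otherwise it must be 1 or the maximum).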
*/ if (cpu_smt_control == CPU_SMT_ENABLED && cpu_smt_num_threads != cpu_smt_max_threads) return sysfs_emit(buf, "%d\n", cpu_smt_num_threads); #endif return sysfs_emit(buf, "%s\n", state); } static ssize_t control_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { return __store_smt_control(dev, attr, buf, count); } static DEVICE_ATTR_RW(control); static ssize_t active_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", sched_smt_active()); } static DEVICE_ATTR_RO(active); static struct attribute *cpuhp_smt_attrs[] = { &dev_attr_control.attr, &dev_attr_active.attr, NULL }; static const struct attribute_group cpuhp_smt_attr_group = { .attrs = cpuhp_smt_attrs, .name = "smt", }; static int __init cpu_smt_sysfs_init(void) { struct device *dev_root; int ret = -ENODEV; dev_root = bus_get_dev_root(&cpu_subsys); if (dev_root) { ret = sysfs_create_group(&dev_root->kobj, &cpuhp_smt_attr_group); put_device(dev_root); } return ret; } static int __init cpuhp_sysfs_init(void) { struct device *dev_root; int cpu, ret; ret = cpu_smt_sysfs_init(); if (ret) return ret; dev_root = bus_get_dev_root(&cpu_subsys); if (dev_root) { ret = sysfs_create_group(&dev_root->kobj, &cpuhp_cpu_root_attr_group); put_device(dev_root); if (ret) return ret; } for_each_possible_cpu(cpu) { struct device *dev = get_cpu_device(cpu); if (!dev) continue; ret = sysfs_create_group(&dev->kobj, &cpuhp_cpu_attr_group); if (ret) return ret; } return 0; } device_initcall(cpuhp_sysfs_init); #endif /* CONFIG_SYSFS && CONFIG_HOTPLUG_CPU */ /* * cpu_bit_bitmap[] is a special, "compressed" data structure that * represents all NR_CPUS bits binary values of 1<<nr. * * It is used by cpumask_of() to get a constant address to a CPU * mask value that has a single bit set only. 
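 *
 * Illustrative lookup (simplified sketch of what cpumask_of() does):
 *
 *	p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG];
 *	p -= cpu / BITS_PER_LONG;
 *
 * so e.g. CPU 3 resolves to the row holding 1UL << 3, and higher CPUs step
 * the pointer back by whole longs, which is why row 0 is left empty below.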
*/ /* cpu_bit_bitmap[0] is empty - so we can back into it */ #define MASK_DECLARE_1(x) [x+1][0] = (1UL << (x)) #define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1) #define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2) #define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4) const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = { MASK_DECLARE_8(0), MASK_DECLARE_8(8), MASK_DECLARE_8(16), MASK_DECLARE_8(24), #if BITS_PER_LONG > 32 MASK_DECLARE_8(32), MASK_DECLARE_8(40), MASK_DECLARE_8(48), MASK_DECLARE_8(56), #endif }; EXPORT_SYMBOL_GPL(cpu_bit_bitmap); const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; EXPORT_SYMBOL(cpu_all_bits); #ifdef CONFIG_INIT_ALL_POSSIBLE struct cpumask __cpu_possible_mask __ro_after_init = {CPU_BITS_ALL}; unsigned int __num_possible_cpus __ro_after_init = NR_CPUS; #else struct cpumask __cpu_possible_mask __ro_after_init; unsigned int __num_possible_cpus __ro_after_init; #endif EXPORT_SYMBOL(__cpu_possible_mask); EXPORT_SYMBOL(__num_possible_cpus); struct cpumask __cpu_online_mask __read_mostly; EXPORT_SYMBOL(__cpu_online_mask); struct cpumask __cpu_enabled_mask __read_mostly; EXPORT_SYMBOL(__cpu_enabled_mask); struct cpumask __cpu_present_mask __read_mostly; EXPORT_SYMBOL(__cpu_present_mask); struct cpumask __cpu_active_mask __read_mostly; EXPORT_SYMBOL(__cpu_active_mask); struct cpumask __cpu_dying_mask __read_mostly; EXPORT_SYMBOL(__cpu_dying_mask); atomic_t __num_online_cpus __read_mostly; EXPORT_SYMBOL(__num_online_cpus); void init_cpu_present(const struct cpumask *src) { cpumask_copy(&__cpu_present_mask, src); } void init_cpu_possible(const struct cpumask *src) { cpumask_copy(&__cpu_possible_mask, src); __num_possible_cpus = cpumask_weight(&__cpu_possible_mask); } void set_cpu_online(unsigned int cpu, bool online) { /* * atomic_inc/dec() is required to handle the horrid abuse of this * function by the reboot and kexec code which invoke it from * IPI/NMI broadcasts when shutting down CPUs. Invocation from * regular CPU hotplug is properly serialized. * * Note, that the fact that __num_online_cpus is of type atomic_t * does not protect readers which are not serialized against * concurrent hotplug operations. */ if (online) { if (!cpumask_test_and_set_cpu(cpu, &__cpu_online_mask)) atomic_inc(&__num_online_cpus); } else { if (cpumask_test_and_clear_cpu(cpu, &__cpu_online_mask)) atomic_dec(&__num_online_cpus); } } /* * This should be marked __init, but there is a boatload of call sites * which need to be fixed up to do so. Sigh... */ void set_cpu_possible(unsigned int cpu, bool possible) { if (possible) { if (!cpumask_test_and_set_cpu(cpu, &__cpu_possible_mask)) __num_possible_cpus++; } else { if (cpumask_test_and_clear_cpu(cpu, &__cpu_possible_mask)) __num_possible_cpus--; } } /* * Activate the first processor. 
*/ void __init boot_cpu_init(void) { int cpu = smp_processor_id(); /* Mark the boot cpu "present", "online" etc for SMP and UP case */ set_cpu_online(cpu, true); set_cpu_active(cpu, true); set_cpu_present(cpu, true); set_cpu_possible(cpu, true); #ifdef CONFIG_SMP __boot_cpu_id = cpu; #endif } /* * Must be called _AFTER_ setting up the per_cpu areas */ void __init boot_cpu_hotplug_init(void) { #ifdef CONFIG_SMP cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask); atomic_set(this_cpu_ptr(&cpuhp_state.ap_sync_state), SYNC_STATE_ONLINE); #endif this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); this_cpu_write(cpuhp_state.target, CPUHP_ONLINE); } #ifdef CONFIG_CPU_MITIGATIONS /* * All except the cross-thread attack vector are mitigated by default. * Cross-thread mitigation often requires disabling SMT which is expensive * so cross-thread mitigations are only partially enabled by default. * * Guest-to-Host and Guest-to-Guest vectors are only needed if KVM support is * present. */ static bool attack_vectors[NR_CPU_ATTACK_VECTORS] __ro_after_init = { [CPU_MITIGATE_USER_KERNEL] = true, [CPU_MITIGATE_USER_USER] = true, [CPU_MITIGATE_GUEST_HOST] = IS_ENABLED(CONFIG_KVM), [CPU_MITIGATE_GUEST_GUEST] = IS_ENABLED(CONFIG_KVM), }; bool cpu_attack_vector_mitigated(enum cpu_attack_vectors v) { if (v < NR_CPU_ATTACK_VECTORS) return attack_vectors[v]; WARN_ONCE(1, "Invalid attack vector %d\n", v); return false; } /* * There are 3 global options, 'off', 'auto', 'auto,nosmt'. These may optionally * be combined with attack-vector disables which follow them. * * Examples: * mitigations=auto,no_user_kernel,no_user_user,no_cross_thread * mitigations=auto,nosmt,no_guest_host,no_guest_guest * * mitigations=off is equivalent to disabling all attack vectors. */ enum cpu_mitigations { CPU_MITIGATIONS_OFF, CPU_MITIGATIONS_AUTO, CPU_MITIGATIONS_AUTO_NOSMT, }; enum { NO_USER_KERNEL, NO_USER_USER, NO_GUEST_HOST, NO_GUEST_GUEST, NO_CROSS_THREAD, NR_VECTOR_PARAMS, }; enum smt_mitigations smt_mitigations __ro_after_init = SMT_MITIGATIONS_AUTO; static enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO; static const match_table_t global_mitigations = { { CPU_MITIGATIONS_AUTO_NOSMT, "auto,nosmt"}, { CPU_MITIGATIONS_AUTO, "auto"}, { CPU_MITIGATIONS_OFF, "off"}, }; static const match_table_t vector_mitigations = { { NO_USER_KERNEL, "no_user_kernel"}, { NO_USER_USER, "no_user_user"}, { NO_GUEST_HOST, "no_guest_host"}, { NO_GUEST_GUEST, "no_guest_guest"}, { NO_CROSS_THREAD, "no_cross_thread"}, { NR_VECTOR_PARAMS, NULL}, }; static int __init mitigations_parse_global_opt(char *arg) { int i; for (i = 0; i < ARRAY_SIZE(global_mitigations); i++) { const char *pattern = global_mitigations[i].pattern; if (!strncmp(arg, pattern, strlen(pattern))) { cpu_mitigations = global_mitigations[i].token; return strlen(pattern); } } return 0; } static int __init mitigations_parse_cmdline(char *arg) { char *s, *p; int len; len = mitigations_parse_global_opt(arg); if (cpu_mitigations_off()) { memset(attack_vectors, 0, sizeof(attack_vectors)); smt_mitigations = SMT_MITIGATIONS_OFF; } else if (cpu_mitigations_auto_nosmt()) { smt_mitigations = SMT_MITIGATIONS_ON; } p = arg + len; if (!*p) return 0; /* Attack vector controls may come after the ',' */ if (*p++ != ',' || !IS_ENABLED(CONFIG_ARCH_HAS_CPU_ATTACK_VECTORS)) { pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n", arg); return 0; } while ((s = strsep(&p, ",")) != NULL) { switch (match_token(s, vector_mitigations, NULL)) { case NO_USER_KERNEL: 
attack_vectors[CPU_MITIGATE_USER_KERNEL] = false; break; case NO_USER_USER: attack_vectors[CPU_MITIGATE_USER_USER] = false; break; case NO_GUEST_HOST: attack_vectors[CPU_MITIGATE_GUEST_HOST] = false; break; case NO_GUEST_GUEST: attack_vectors[CPU_MITIGATE_GUEST_GUEST] = false; break; case NO_CROSS_THREAD: smt_mitigations = SMT_MITIGATIONS_OFF; break; default: pr_crit("Unsupported mitigations options %s\n", s); return 0; } } return 0; } /* mitigations=off */ bool cpu_mitigations_off(void) { return cpu_mitigations == CPU_MITIGATIONS_OFF; } EXPORT_SYMBOL_GPL(cpu_mitigations_off); /* mitigations=auto,nosmt */ bool cpu_mitigations_auto_nosmt(void) { return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT; } EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt); #else static int __init mitigations_parse_cmdline(char *arg) { pr_crit("Kernel compiled without mitigations, ignoring 'mitigations'; system may still be vulnerable\n"); return 0; } #endif early_param("mitigations", mitigations_parse_cmdline); |
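The parsing above only records policy; architecture mitigation-selection code is expected to query it. For instance, booting with mitigations=auto,no_guest_host,no_guest_guest keeps the user-space vectors mitigated while opting out of the KVM-related ones. Below is a minimal, hypothetical sketch of such a consumer: the function and messages are invented, and only cpu_attack_vector_mitigated(), the CPU_MITIGATE_* vectors and smt_mitigations come from the code above.

/* Hypothetical arch-side consumer of the attack-vector controls above. */
static void example_select_mitigation(void)
{
	/* Skip the mitigation entirely if no relevant vector is enabled. */
	if (!cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) &&
	    !cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST)) {
		pr_info("example: no enabled attack vector, skipping\n");
		return;
	}

	/* Cross-thread protection may additionally mean giving up SMT. */
	if (smt_mitigations == SMT_MITIGATIONS_ON)
		pr_info("example: SMT would also be disabled\n");
}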
/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2005,2006,2007,2008 IBM Corporation * * Authors: * Reiner Sailer <sailer@watson.ibm.com> * Mimi Zohar <zohar@us.ibm.com> * * File: ima.h * internal Integrity Measurement Architecture (IMA) definitions */ #ifndef __LINUX_IMA_H #define __LINUX_IMA_H #include <linux/types.h> #include <linux/crypto.h> #include <linux/fs.h> #include <linux/security.h> #include <linux/hash.h> #include <linux/tpm.h> #include <linux/audit.h> #include <crypto/hash_info.h> #include "../integrity.h" enum ima_show_type { IMA_SHOW_BINARY, IMA_SHOW_BINARY_NO_FIELD_LEN, IMA_SHOW_BINARY_OLD_STRING_FMT, IMA_SHOW_ASCII }; enum tpm_pcrs { TPM_PCR0 = 0, TPM_PCR8 = 8, TPM_PCR10 = 10 }; /* digest size for IMA, fits SHA1 or MD5 */ #define IMA_DIGEST_SIZE SHA1_DIGEST_SIZE #define IMA_EVENT_NAME_LEN_MAX 255 #define IMA_HASH_BITS 10 #define IMA_MEASURE_HTABLE_SIZE (1 << IMA_HASH_BITS) #define IMA_TEMPLATE_FIELD_ID_MAX_LEN 16 #define IMA_TEMPLATE_NUM_FIELDS_MAX 15 #define IMA_TEMPLATE_IMA_NAME "ima" #define IMA_TEMPLATE_IMA_FMT "d|n" #define NR_BANKS(chip) ((chip != NULL) ?
chip->nr_allocated_banks : 0) /* current content of the policy */ extern int ima_policy_flag; /* bitset of digests algorithms allowed in the setxattr hook */ extern atomic_t ima_setxattr_allowed_hash_algorithms; /* IMA hash algorithm description */ struct ima_algo_desc { struct crypto_shash *tfm; enum hash_algo algo; unsigned int digest_size; }; /* set during initialization */ extern int ima_hash_algo __ro_after_init; extern int ima_sha1_idx __ro_after_init; extern int ima_hash_algo_idx __ro_after_init; extern int ima_extra_slots __ro_after_init; extern struct ima_algo_desc *ima_algo_array __ro_after_init; extern int ima_appraise; extern struct tpm_chip *ima_tpm_chip; extern const char boot_aggregate_name[]; /* IMA event related data */ struct ima_event_data { struct ima_iint_cache *iint; struct file *file; const unsigned char *filename; struct evm_ima_xattr_data *xattr_value; int xattr_len; const struct modsig *modsig; const char *violation; const void *buf; int buf_len; }; /* IMA template field data definition */ struct ima_field_data { u8 *data; u32 len; }; /* IMA template field definition */ struct ima_template_field { const char field_id[IMA_TEMPLATE_FIELD_ID_MAX_LEN]; int (*field_init)(struct ima_event_data *event_data, struct ima_field_data *field_data); void (*field_show)(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data); }; /* IMA template descriptor definition */ struct ima_template_desc { struct list_head list; char *name; char *fmt; int num_fields; const struct ima_template_field **fields; }; struct ima_template_entry { int pcr; struct tpm_digest *digests; struct ima_template_desc *template_desc; /* template descriptor */ u32 template_data_len; struct ima_field_data template_data[]; /* template related data */ }; struct ima_queue_entry { struct hlist_node hnext; /* place in hash collision list */ struct list_head later; /* place in ima_measurements list */ struct ima_template_entry *entry; }; extern struct list_head ima_measurements; /* list of all measurements */ /* Some details preceding the binary serialized measurement list */ struct ima_kexec_hdr { u16 version; u16 _reserved0; u32 _reserved1; u64 buffer_size; u64 count; }; /* IMA iint action cache flags */ #define IMA_MEASURE 0x00000001 #define IMA_MEASURED 0x00000002 #define IMA_APPRAISE 0x00000004 #define IMA_APPRAISED 0x00000008 /*#define IMA_COLLECT 0x00000010 do not use this flag */ #define IMA_COLLECTED 0x00000020 #define IMA_AUDIT 0x00000040 #define IMA_AUDITED 0x00000080 #define IMA_HASH 0x00000100 #define IMA_HASHED 0x00000200 /* IMA iint policy rule cache flags */ #define IMA_NONACTION_FLAGS 0xff000000 #define IMA_DIGSIG_REQUIRED 0x01000000 #define IMA_PERMIT_DIRECTIO 0x02000000 #define IMA_NEW_FILE 0x04000000 #define IMA_SIGV3_REQUIRED 0x08000000 #define IMA_FAIL_UNVERIFIABLE_SIGS 0x10000000 #define IMA_MODSIG_ALLOWED 0x20000000 #define IMA_CHECK_BLACKLIST 0x40000000 #define IMA_VERITY_REQUIRED 0x80000000 /* Exclude non-action flags which are not rule-specific. 
*/ #define IMA_NONACTION_RULE_FLAGS (IMA_NONACTION_FLAGS & ~IMA_NEW_FILE) #define IMA_DO_MASK (IMA_MEASURE | IMA_APPRAISE | IMA_AUDIT | \ IMA_HASH | IMA_APPRAISE_SUBMASK) #define IMA_DONE_MASK (IMA_MEASURED | IMA_APPRAISED | IMA_AUDITED | \ IMA_HASHED | IMA_COLLECTED | \ IMA_APPRAISED_SUBMASK) /* IMA iint subaction appraise cache flags */ #define IMA_FILE_APPRAISE 0x00001000 #define IMA_FILE_APPRAISED 0x00002000 #define IMA_MMAP_APPRAISE 0x00004000 #define IMA_MMAP_APPRAISED 0x00008000 #define IMA_BPRM_APPRAISE 0x00010000 #define IMA_BPRM_APPRAISED 0x00020000 #define IMA_READ_APPRAISE 0x00040000 #define IMA_READ_APPRAISED 0x00080000 #define IMA_CREDS_APPRAISE 0x00100000 #define IMA_CREDS_APPRAISED 0x00200000 #define IMA_APPRAISE_SUBMASK (IMA_FILE_APPRAISE | IMA_MMAP_APPRAISE | \ IMA_BPRM_APPRAISE | IMA_READ_APPRAISE | \ IMA_CREDS_APPRAISE) #define IMA_APPRAISED_SUBMASK (IMA_FILE_APPRAISED | IMA_MMAP_APPRAISED | \ IMA_BPRM_APPRAISED | IMA_READ_APPRAISED | \ IMA_CREDS_APPRAISED) /* * IMA iint cache atomic_flags * * IMA_CHANGE_ATTR - indicates that chATTR() was called (chmod, chown, chgrp) * and file attributes have changed. On file open, it causes IMA to clear * iint->flags to re-evaluate policy and perform IMA functions again. * * IMA_CHANGE_XATTR - indicates that setxattr or removexattr was called and * extended attributes have changed. On file open, it causes IMA to clear * iint->flags IMA_DONE_MASK to re-appraise. * * IMA_UPDATE_XATTR - indicates that security.ima needs to be updated. It is * cleared if file policy changes and no update is needed. * * IMA_DIGSIG - indicates that file security.ima has signature and file * security.ima must not update on file close. * * IMA_MAY_EMIT_TOMTOU - indicates to add Time-of-Measure-Time-of-Use (ToMToU) * integrity violation (a file that is already opened for read is opened for * write) to the measurement list and to also emit an audit message. * * IMA_EMITTED_OPENWRITERS - indicates to add open-writers integrity violation * (a file that is already opened for write is opened for read) to the * measurement list and to also emit an audit message. 
* */ #define IMA_CHANGE_XATTR 0 #define IMA_UPDATE_XATTR 1 #define IMA_CHANGE_ATTR 2 #define IMA_DIGSIG 3 #define IMA_MAY_EMIT_TOMTOU 4 #define IMA_EMITTED_OPENWRITERS 5 /* IMA integrity metadata associated with an inode */ struct ima_iint_cache { struct mutex mutex; /* protects: version, flags, digest */ struct integrity_inode_attributes real_inode; unsigned long flags; unsigned long measured_pcrs; unsigned long atomic_flags; enum integrity_status ima_file_status:4; enum integrity_status ima_mmap_status:4; enum integrity_status ima_bprm_status:4; enum integrity_status ima_read_status:4; enum integrity_status ima_creds_status:4; struct ima_digest_data *ima_hash; }; extern struct lsm_blob_sizes ima_blob_sizes; static inline struct ima_iint_cache * ima_inode_get_iint(const struct inode *inode) { struct ima_iint_cache **iint_sec; if (unlikely(!inode->i_security)) return NULL; iint_sec = inode->i_security + ima_blob_sizes.lbs_inode; return *iint_sec; } static inline void ima_inode_set_iint(const struct inode *inode, struct ima_iint_cache *iint) { struct ima_iint_cache **iint_sec; if (unlikely(!inode->i_security)) return; iint_sec = inode->i_security + ima_blob_sizes.lbs_inode; *iint_sec = iint; } struct ima_iint_cache *ima_iint_find(struct inode *inode); struct ima_iint_cache *ima_inode_get(struct inode *inode); void ima_inode_free_rcu(void *inode_security); void __init ima_iintcache_init(void); extern const int read_idmap[]; #ifdef CONFIG_HAVE_IMA_KEXEC void ima_load_kexec_buffer(void); #else static inline void ima_load_kexec_buffer(void) {} #endif /* CONFIG_HAVE_IMA_KEXEC */ #ifdef CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS void ima_post_key_create_or_update(struct key *keyring, struct key *key, const void *payload, size_t plen, unsigned long flags, bool create); #endif #ifdef CONFIG_IMA_KEXEC void ima_measure_kexec_event(const char *event_name); #else static inline void ima_measure_kexec_event(const char *event_name) {} #endif /* * The default binary_runtime_measurements list format is defined as the * platform native format. The canonical format is defined as little-endian. 
*/ extern bool ima_canonical_fmt; /* Internal IMA function definitions */ int ima_init(void); int ima_fs_init(void); int ima_add_template_entry(struct ima_template_entry *entry, int violation, const char *op, struct inode *inode, const unsigned char *filename); int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash); int ima_calc_buffer_hash(const void *buf, loff_t len, struct ima_digest_data *hash); int ima_calc_field_array_hash(struct ima_field_data *field_data, struct ima_template_entry *entry); int ima_calc_boot_aggregate(struct ima_digest_data *hash); void ima_add_violation(struct file *file, const unsigned char *filename, struct ima_iint_cache *iint, const char *op, const char *cause); int ima_init_crypto(void); void ima_putc(struct seq_file *m, void *data, int datalen); void ima_print_digest(struct seq_file *m, u8 *digest, u32 size); int template_desc_init_fields(const char *template_fmt, const struct ima_template_field ***fields, int *num_fields); struct ima_template_desc *ima_template_desc_current(void); struct ima_template_desc *ima_template_desc_buf(void); struct ima_template_desc *lookup_template_desc(const char *name); bool ima_template_has_modsig(const struct ima_template_desc *ima_template); int ima_restore_measurement_entry(struct ima_template_entry *entry); int ima_restore_measurement_list(loff_t bufsize, void *buf); int ima_measurements_show(struct seq_file *m, void *v); unsigned long ima_get_binary_runtime_size(void); int ima_init_template(void); void ima_init_template_list(void); int __init ima_init_digests(void); void __init ima_init_reboot_notifier(void); int ima_lsm_policy_change(struct notifier_block *nb, unsigned long event, void *lsm_data); /* * used to protect h_table and sha_table */ extern spinlock_t ima_queue_lock; struct ima_h_table { atomic_long_t len; /* number of stored measurements in the list */ atomic_long_t violations; struct hlist_head queue[IMA_MEASURE_HTABLE_SIZE]; }; extern struct ima_h_table ima_htable; static inline unsigned int ima_hash_key(u8 *digest) { /* there is no point in taking a hash of part of a digest */ return (digest[0] | digest[1] << 8) % IMA_MEASURE_HTABLE_SIZE; } #define __ima_hooks(hook) \ hook(NONE, none) \ hook(FILE_CHECK, file) \ hook(MMAP_CHECK, mmap) \ hook(MMAP_CHECK_REQPROT, mmap_reqprot) \ hook(BPRM_CHECK, bprm) \ hook(CREDS_CHECK, creds) \ hook(POST_SETATTR, post_setattr) \ hook(MODULE_CHECK, module) \ hook(FIRMWARE_CHECK, firmware) \ hook(KEXEC_KERNEL_CHECK, kexec_kernel) \ hook(KEXEC_INITRAMFS_CHECK, kexec_initramfs) \ hook(POLICY_CHECK, policy) \ hook(KEXEC_CMDLINE, kexec_cmdline) \ hook(KEY_CHECK, key) \ hook(CRITICAL_DATA, critical_data) \ hook(SETXATTR_CHECK, setxattr_check) \ hook(MAX_CHECK, none) #define __ima_hook_enumify(ENUM, str) ENUM, #define __ima_stringify(arg) (#arg) #define __ima_hook_measuring_stringify(ENUM, str) \ (__ima_stringify(measuring_ ##str)), enum ima_hooks { __ima_hooks(__ima_hook_enumify) }; static const char * const ima_hooks_measure_str[] = { __ima_hooks(__ima_hook_measuring_stringify) }; static inline const char *func_measure_str(enum ima_hooks func) { if (func >= MAX_CHECK) return ima_hooks_measure_str[NONE]; return ima_hooks_measure_str[func]; } extern const char *const func_tokens[]; struct modsig; #ifdef CONFIG_IMA_QUEUE_EARLY_BOOT_KEYS /* * To track keys that need to be measured. 
*/ struct ima_key_entry { struct list_head list; void *payload; size_t payload_len; char *keyring_name; }; void ima_init_key_queue(void); bool ima_should_queue_key(void); bool ima_queue_key(struct key *keyring, const void *payload, size_t payload_len); void ima_process_queued_keys(void); #else static inline void ima_init_key_queue(void) {} static inline bool ima_should_queue_key(void) { return false; } static inline bool ima_queue_key(struct key *keyring, const void *payload, size_t payload_len) { return false; } static inline void ima_process_queued_keys(void) {} #endif /* CONFIG_IMA_QUEUE_EARLY_BOOT_KEYS */ /* LIM API function definitions */ int ima_get_action(struct mnt_idmap *idmap, struct inode *inode, const struct cred *cred, struct lsm_prop *prop, int mask, enum ima_hooks func, int *pcr, struct ima_template_desc **template_desc, const char *func_data, unsigned int *allowed_algos); int ima_must_measure(struct inode *inode, int mask, enum ima_hooks func); int ima_collect_measurement(struct ima_iint_cache *iint, struct file *file, void *buf, loff_t size, enum hash_algo algo, struct modsig *modsig); void ima_store_measurement(struct ima_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig, int pcr, struct ima_template_desc *template_desc); int process_buffer_measurement(struct mnt_idmap *idmap, struct inode *inode, const void *buf, int size, const char *eventname, enum ima_hooks func, int pcr, const char *func_data, bool buf_hash, u8 *digest, size_t digest_len); void ima_audit_measurement(struct ima_iint_cache *iint, const unsigned char *filename); int ima_alloc_init_template(struct ima_event_data *event_data, struct ima_template_entry **entry, struct ima_template_desc *template_desc); int ima_store_template(struct ima_template_entry *entry, int violation, struct inode *inode, const unsigned char *filename, int pcr); void ima_free_template_entry(struct ima_template_entry *entry); const char *ima_d_path(const struct path *path, char **pathbuf, char *filename); /* IMA policy related functions */ int ima_match_policy(struct mnt_idmap *idmap, struct inode *inode, const struct cred *cred, struct lsm_prop *prop, enum ima_hooks func, int mask, int flags, int *pcr, struct ima_template_desc **template_desc, const char *func_data, unsigned int *allowed_algos); void ima_init_policy(void); void ima_update_policy(void); void ima_update_policy_flags(void); ssize_t ima_parse_add_rule(char *); void ima_delete_rules(void); int ima_check_policy(void); void *ima_policy_start(struct seq_file *m, loff_t *pos); void *ima_policy_next(struct seq_file *m, void *v, loff_t *pos); void ima_policy_stop(struct seq_file *m, void *v); int ima_policy_show(struct seq_file *m, void *v); /* Appraise integrity measurements */ #define IMA_APPRAISE_ENFORCE 0x01 #define IMA_APPRAISE_FIX 0x02 #define IMA_APPRAISE_LOG 0x04 #define IMA_APPRAISE_MODULES 0x08 #define IMA_APPRAISE_FIRMWARE 0x10 #define IMA_APPRAISE_POLICY 0x20 #define IMA_APPRAISE_KEXEC 0x40 #ifdef CONFIG_IMA_APPRAISE int ima_check_blacklist(struct ima_iint_cache *iint, const struct modsig *modsig, int pcr); int ima_appraise_measurement(enum ima_hooks func, struct ima_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig, bool bprm_is_check); int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode, int mask, enum ima_hooks func); void ima_update_xattr(struct 
ima_iint_cache *iint, struct file *file); enum integrity_status ima_get_cache_status(struct ima_iint_cache *iint, enum ima_hooks func); enum hash_algo ima_get_hash_algo(const struct evm_ima_xattr_data *xattr_value, int xattr_len); int ima_read_xattr(struct dentry *dentry, struct evm_ima_xattr_data **xattr_value, int xattr_len); void __init init_ima_appraise_lsm(const struct lsm_id *lsmid); #else static inline int ima_check_blacklist(struct ima_iint_cache *iint, const struct modsig *modsig, int pcr) { return 0; } static inline int ima_appraise_measurement(enum ima_hooks func, struct ima_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig, bool bprm_is_check) { return INTEGRITY_UNKNOWN; } static inline int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode, int mask, enum ima_hooks func) { return 0; } static inline void ima_update_xattr(struct ima_iint_cache *iint, struct file *file) { } static inline enum integrity_status ima_get_cache_status(struct ima_iint_cache *iint, enum ima_hooks func) { return INTEGRITY_UNKNOWN; } static inline enum hash_algo ima_get_hash_algo(struct evm_ima_xattr_data *xattr_value, int xattr_len) { return ima_hash_algo; } static inline int ima_read_xattr(struct dentry *dentry, struct evm_ima_xattr_data **xattr_value, int xattr_len) { return 0; } static inline void __init init_ima_appraise_lsm(const struct lsm_id *lsmid) { } #endif /* CONFIG_IMA_APPRAISE */ #ifdef CONFIG_IMA_APPRAISE_MODSIG int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len, struct modsig **modsig); void ima_collect_modsig(struct modsig *modsig, const void *buf, loff_t size); int ima_get_modsig_digest(const struct modsig *modsig, enum hash_algo *algo, const u8 **digest, u32 *digest_size); int ima_get_raw_modsig(const struct modsig *modsig, const void **data, u32 *data_len); void ima_free_modsig(struct modsig *modsig); #else static inline int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len, struct modsig **modsig) { return -EOPNOTSUPP; } static inline void ima_collect_modsig(struct modsig *modsig, const void *buf, loff_t size) { } static inline int ima_get_modsig_digest(const struct modsig *modsig, enum hash_algo *algo, const u8 **digest, u32 *digest_size) { return -EOPNOTSUPP; } static inline int ima_get_raw_modsig(const struct modsig *modsig, const void **data, u32 *data_len) { return -EOPNOTSUPP; } static inline void ima_free_modsig(struct modsig *modsig) { } #endif /* CONFIG_IMA_APPRAISE_MODSIG */ /* LSM based policy rules require audit */ #ifdef CONFIG_IMA_LSM_RULES #define ima_filter_rule_init security_audit_rule_init #define ima_filter_rule_free security_audit_rule_free #define ima_filter_rule_match security_audit_rule_match #else static inline int ima_filter_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule, gfp_t gfp) { return -EINVAL; } static inline void ima_filter_rule_free(void *lsmrule) { } static inline int ima_filter_rule_match(struct lsm_prop *prop, u32 field, u32 op, void *lsmrule) { return -EINVAL; } #endif /* CONFIG_IMA_LSM_RULES */ #ifdef CONFIG_IMA_READ_POLICY #define POLICY_FILE_FLAGS (S_IWUSR | S_IRUSR) #else #define POLICY_FILE_FLAGS S_IWUSR #endif /* CONFIG_IMA_READ_POLICY */ #endif /* __LINUX_IMA_H */ |
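To make the measurement-list hashing declared above concrete, here is an illustrative (not quoted) lookup that maps a digest to a bucket with ima_hash_key() and walks the collision chain in ima_htable. The function name is made up, the locking used by the real queue code (spinlock/RCU) is omitted for brevity, and hash_digest_size[] comes from <crypto/hash_info.h>.

/* Illustration only: bucket selection and collision-chain walk. */
static struct ima_queue_entry *example_lookup_digest(const u8 *digest_value,
						     int pcr)
{
	unsigned int key = ima_hash_key((u8 *)digest_value);
	struct ima_queue_entry *qe;

	hlist_for_each_entry(qe, &ima_htable.queue[key], hnext) {
		if (qe->entry->pcr != pcr)
			continue;
		if (!memcmp(qe->entry->digests[ima_hash_algo_idx].digest,
			    digest_value, hash_digest_size[ima_hash_algo]))
			return qe;	/* already measured */
	}
	return NULL;
}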
/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM exceptions #if !defined(_TRACE_PAGE_FAULT_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_PAGE_FAULT_H #include <linux/tracepoint.h> DECLARE_EVENT_CLASS(exceptions, TP_PROTO(unsigned long address, struct pt_regs *regs, unsigned long error_code), TP_ARGS(address, regs, error_code), TP_STRUCT__entry( __field( unsigned long, address ) __field( unsigned long, ip ) __field( unsigned long, error_code ) ), TP_fast_assign( __entry->address = address; __entry->ip = instruction_pointer(regs); __entry->error_code = error_code; ), TP_printk("address=%ps ip=%ps error_code=0x%lx", (void *)__entry->address, (void *)__entry->ip, __entry->error_code) ); DEFINE_EVENT(exceptions, page_fault_user, TP_PROTO(unsigned long address, struct pt_regs *regs, unsigned long error_code), TP_ARGS(address, regs, error_code)); DEFINE_EVENT(exceptions, page_fault_kernel, TP_PROTO(unsigned long address, struct pt_regs *regs, unsigned long error_code), TP_ARGS(address, regs, error_code)); #endif /* _TRACE_PAGE_FAULT_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
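Each DEFINE_EVENT() above generates a trace_<event>() helper, so fault-handling code that has this trace header in scope can emit the events roughly as sketched below. The wrapper function is an assumption for illustration; only trace_page_fault_user() and trace_page_fault_kernel() follow from the definitions above.

/* Hypothetical call site in a fault handler. */
static void example_trace_fault(struct pt_regs *regs, unsigned long error_code,
				unsigned long address)
{
	if (user_mode(regs))
		trace_page_fault_user(address, regs, error_code);
	else
		trace_page_fault_kernel(address, regs, error_code);
}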
// SPDX-License-Identifier: GPL-2.0 /* * Provides code common for host and device side USB. * * If either host side (ie. CONFIG_USB=y) or device side USB stack * (ie. CONFIG_USB_GADGET=y) is compiled in the kernel, this module is * compiled-in as well. Otherwise, if either of the two stacks is * compiled as module, this file is compiled as module as well. */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/of.h> #include <linux/platform_device.h> #include <linux/usb/ch9.h> #include <linux/usb/of.h> #include <linux/usb/otg.h> #include <linux/of_platform.h> #include <linux/debugfs.h> #include "common.h" static const char *const ep_type_names[] = { [USB_ENDPOINT_XFER_CONTROL] = "ctrl", [USB_ENDPOINT_XFER_ISOC] = "isoc", [USB_ENDPOINT_XFER_BULK] = "bulk", [USB_ENDPOINT_XFER_INT] = "intr", }; /** * usb_ep_type_string() - Returns human readable-name of the endpoint type. * @ep_type: The endpoint type to return human-readable name for. If it's not * any of the types: USB_ENDPOINT_XFER_{CONTROL, ISOC, BULK, INT}, * usually got by usb_endpoint_type(), the string 'unknown' will be returned. */ const char *usb_ep_type_string(int ep_type) { if (ep_type < 0 || ep_type >= ARRAY_SIZE(ep_type_names)) return "unknown"; return ep_type_names[ep_type]; } EXPORT_SYMBOL_GPL(usb_ep_type_string); /** * usb_otg_state_string() - returns human readable name of OTG state. * @state: the OTG state to return the human readable name of. If it's not * any of the states defined in usb_otg_state enum, 'UNDEFINED' will be * returned.
*/ const char *usb_otg_state_string(enum usb_otg_state state) { static const char *const names[] = { [OTG_STATE_A_IDLE] = "a_idle", [OTG_STATE_A_WAIT_VRISE] = "a_wait_vrise", [OTG_STATE_A_WAIT_BCON] = "a_wait_bcon", [OTG_STATE_A_HOST] = "a_host", [OTG_STATE_A_SUSPEND] = "a_suspend", [OTG_STATE_A_PERIPHERAL] = "a_peripheral", [OTG_STATE_A_WAIT_VFALL] = "a_wait_vfall", [OTG_STATE_A_VBUS_ERR] = "a_vbus_err", [OTG_STATE_B_IDLE] = "b_idle", [OTG_STATE_B_SRP_INIT] = "b_srp_init", [OTG_STATE_B_PERIPHERAL] = "b_peripheral", [OTG_STATE_B_WAIT_ACON] = "b_wait_acon", [OTG_STATE_B_HOST] = "b_host", }; if (state < 0 || state >= ARRAY_SIZE(names)) return "UNDEFINED"; return names[state]; } EXPORT_SYMBOL_GPL(usb_otg_state_string); static const char *const speed_names[] = { [USB_SPEED_UNKNOWN] = "UNKNOWN", [USB_SPEED_LOW] = "low-speed", [USB_SPEED_FULL] = "full-speed", [USB_SPEED_HIGH] = "high-speed", [USB_SPEED_WIRELESS] = "wireless", [USB_SPEED_SUPER] = "super-speed", [USB_SPEED_SUPER_PLUS] = "super-speed-plus", }; static const char *const ssp_rate[] = { [USB_SSP_GEN_UNKNOWN] = "UNKNOWN", [USB_SSP_GEN_2x1] = "super-speed-plus-gen2x1", [USB_SSP_GEN_1x2] = "super-speed-plus-gen1x2", [USB_SSP_GEN_2x2] = "super-speed-plus-gen2x2", }; /** * usb_speed_string() - Returns human readable-name of the speed. * @speed: The speed to return human-readable name for. If it's not * any of the speeds defined in usb_device_speed enum, string for * USB_SPEED_UNKNOWN will be returned. */ const char *usb_speed_string(enum usb_device_speed speed) { if (speed < 0 || speed >= ARRAY_SIZE(speed_names)) speed = USB_SPEED_UNKNOWN; return speed_names[speed]; } EXPORT_SYMBOL_GPL(usb_speed_string); /** * usb_get_maximum_speed - Get maximum requested speed for a given USB * controller. * @dev: Pointer to the given USB controller device * * The function gets the maximum speed string from property "maximum-speed", * and returns the corresponding enum usb_device_speed. */ enum usb_device_speed usb_get_maximum_speed(struct device *dev) { const char *p = "maximum-speed"; int ret; ret = device_property_match_property_string(dev, p, ssp_rate, ARRAY_SIZE(ssp_rate)); if (ret > 0) return USB_SPEED_SUPER_PLUS; ret = device_property_match_property_string(dev, p, speed_names, ARRAY_SIZE(speed_names)); if (ret > 0) return ret; return USB_SPEED_UNKNOWN; } EXPORT_SYMBOL_GPL(usb_get_maximum_speed); /** * usb_get_maximum_ssp_rate - Get the signaling rate generation and lane count * of a SuperSpeed Plus capable device. * @dev: Pointer to the given USB controller device * * If the string from "maximum-speed" property is super-speed-plus-genXxY where * 'X' is the generation number and 'Y' is the number of lanes, then this * function returns the corresponding enum usb_ssp_rate. */ enum usb_ssp_rate usb_get_maximum_ssp_rate(struct device *dev) { const char *maximum_speed; int ret; ret = device_property_read_string(dev, "maximum-speed", &maximum_speed); if (ret < 0) return USB_SSP_GEN_UNKNOWN; ret = match_string(ssp_rate, ARRAY_SIZE(ssp_rate), maximum_speed); return (ret < 0) ? USB_SSP_GEN_UNKNOWN : ret; } EXPORT_SYMBOL_GPL(usb_get_maximum_ssp_rate); /** * usb_state_string - Returns human readable name for the state. * @state: The state to return a human-readable name for. If it's not * any of the states devices in usb_device_state_string enum, * the string UNKNOWN will be returned. 
*/ const char *usb_state_string(enum usb_device_state state) { static const char *const names[] = { [USB_STATE_NOTATTACHED] = "not attached", [USB_STATE_ATTACHED] = "attached", [USB_STATE_POWERED] = "powered", [USB_STATE_RECONNECTING] = "reconnecting", [USB_STATE_UNAUTHENTICATED] = "unauthenticated", [USB_STATE_DEFAULT] = "default", [USB_STATE_ADDRESS] = "addressed", [USB_STATE_CONFIGURED] = "configured", [USB_STATE_SUSPENDED] = "suspended", }; if (state < 0 || state >= ARRAY_SIZE(names)) return "UNKNOWN"; return names[state]; } EXPORT_SYMBOL_GPL(usb_state_string); static const char *const usb_dr_modes[] = { [USB_DR_MODE_UNKNOWN] = "", [USB_DR_MODE_HOST] = "host", [USB_DR_MODE_PERIPHERAL] = "peripheral", [USB_DR_MODE_OTG] = "otg", }; /** * usb_get_dr_mode_from_string() - Get dual role mode for given string * @str: String to find the corresponding dual role mode for * * This function performs a lookup for the given string and returns the * corresponding enum usb_dr_mode. If no match for the string could be found, * 'USB_DR_MODE_UNKNOWN' is returned. */ static enum usb_dr_mode usb_get_dr_mode_from_string(const char *str) { int ret; ret = match_string(usb_dr_modes, ARRAY_SIZE(usb_dr_modes), str); return (ret < 0) ? USB_DR_MODE_UNKNOWN : ret; } enum usb_dr_mode usb_get_dr_mode(struct device *dev) { const char *dr_mode; int err; err = device_property_read_string(dev, "dr_mode", &dr_mode); if (err < 0) return USB_DR_MODE_UNKNOWN; return usb_get_dr_mode_from_string(dr_mode); } EXPORT_SYMBOL_GPL(usb_get_dr_mode); /** * usb_get_role_switch_default_mode - Get default mode for given device * @dev: Pointer to the given device * * The function gets string from property 'role-switch-default-mode', * and returns the corresponding enum usb_dr_mode. */ enum usb_dr_mode usb_get_role_switch_default_mode(struct device *dev) { const char *str; int ret; ret = device_property_read_string(dev, "role-switch-default-mode", &str); if (ret < 0) return USB_DR_MODE_UNKNOWN; return usb_get_dr_mode_from_string(str); } EXPORT_SYMBOL_GPL(usb_get_role_switch_default_mode); /** * usb_decode_interval - Decode bInterval into the time expressed in 1us unit * @epd: The descriptor of the endpoint * @speed: The speed that the endpoint works as * * Function returns the interval expressed in 1us unit for servicing * endpoint for data transfers. */ unsigned int usb_decode_interval(const struct usb_endpoint_descriptor *epd, enum usb_device_speed speed) { unsigned int interval = 0; switch (usb_endpoint_type(epd)) { case USB_ENDPOINT_XFER_CONTROL: /* uframes per NAK */ if (speed == USB_SPEED_HIGH) interval = epd->bInterval; break; case USB_ENDPOINT_XFER_ISOC: interval = 1 << (epd->bInterval - 1); break; case USB_ENDPOINT_XFER_BULK: /* uframes per NAK */ if (speed == USB_SPEED_HIGH && usb_endpoint_dir_out(epd)) interval = epd->bInterval; break; case USB_ENDPOINT_XFER_INT: if (speed >= USB_SPEED_HIGH) interval = 1 << (epd->bInterval - 1); else interval = epd->bInterval; break; } interval *= (speed >= USB_SPEED_HIGH) ? 125 : 1000; return interval; } EXPORT_SYMBOL_GPL(usb_decode_interval); #ifdef CONFIG_OF /** * of_usb_get_dr_mode_by_phy - Get dual role mode for the controller device * which is associated with the given phy device_node * @np: Pointer to the given phy device_node * @arg0: phandle args[0] for phy's with #phy-cells >= 1, or -1 for * phys which do not have phy-cells * * In dts a usb controller associates with phy devices. 
The function gets * the string from property 'dr_mode' of the controller associated with the * given phy device node, and returns the correspondig enum usb_dr_mode. */ enum usb_dr_mode of_usb_get_dr_mode_by_phy(struct device_node *np, int arg0) { struct device_node *controller; struct of_phandle_args args; const char *dr_mode; int index; int err; for_each_node_with_property(controller, "phys") { if (!of_device_is_available(controller)) continue; index = 0; do { if (arg0 == -1) { args.np = of_parse_phandle(controller, "phys", index); args.args_count = 0; } else { err = of_parse_phandle_with_args(controller, "phys", "#phy-cells", index, &args); if (err) break; } of_node_put(args.np); if (args.np == np && (args.args_count == 0 || args.args[0] == arg0)) goto finish; index++; } while (args.np); } finish: err = of_property_read_string(controller, "dr_mode", &dr_mode); of_node_put(controller); if (err < 0) return USB_DR_MODE_UNKNOWN; return usb_get_dr_mode_from_string(dr_mode); } EXPORT_SYMBOL_GPL(of_usb_get_dr_mode_by_phy); /** * of_usb_host_tpl_support - to get if Targeted Peripheral List is supported * for given targeted hosts (non-PC hosts) * @np: Pointer to the given device_node * * The function gets if the targeted hosts support TPL or not */ bool of_usb_host_tpl_support(struct device_node *np) { return of_property_read_bool(np, "tpl-support"); } EXPORT_SYMBOL_GPL(of_usb_host_tpl_support); /** * of_usb_update_otg_caps - to update usb otg capabilities according to * the passed properties in DT. * @np: Pointer to the given device_node * @otg_caps: Pointer to the target usb_otg_caps to be set * * The function updates the otg capabilities */ int of_usb_update_otg_caps(struct device_node *np, struct usb_otg_caps *otg_caps) { u32 otg_rev; if (!otg_caps) return -EINVAL; if (!of_property_read_u32(np, "otg-rev", &otg_rev)) { switch (otg_rev) { case 0x0100: case 0x0120: case 0x0130: case 0x0200: /* Choose the lesser one if it's already been set */ if (otg_caps->otg_rev) otg_caps->otg_rev = min_t(u16, otg_rev, otg_caps->otg_rev); else otg_caps->otg_rev = otg_rev; break; default: pr_err("%pOF: unsupported otg-rev: 0x%x\n", np, otg_rev); return -EINVAL; } } else { /* * otg-rev is mandatory for otg properties, if not passed * we set it to be 0 and assume it's a legacy otg device. * Non-dt platform can set it afterwards. */ otg_caps->otg_rev = 0; } if (of_property_read_bool(np, "hnp-disable")) otg_caps->hnp_support = false; if (of_property_read_bool(np, "srp-disable")) otg_caps->srp_support = false; if (of_property_read_bool(np, "adp-disable") || (otg_caps->otg_rev < 0x0200)) otg_caps->adp_support = false; return 0; } EXPORT_SYMBOL_GPL(of_usb_update_otg_caps); /** * usb_of_get_companion_dev - Find the companion device * @dev: the device pointer to find a companion * * Find the companion device from platform bus. * * Takes a reference to the returned struct device which needs to be dropped * after use. * * Return: On success, a pointer to the companion device, %NULL on failure. */ struct device *usb_of_get_companion_dev(struct device *dev) { struct device_node *node; struct platform_device *pdev = NULL; node = of_parse_phandle(dev->of_node, "companion", 0); if (node) pdev = of_find_device_by_node(node); of_node_put(node); return pdev ? 
&pdev->dev : NULL; } EXPORT_SYMBOL_GPL(usb_of_get_companion_dev); #endif struct dentry *usb_debug_root; EXPORT_SYMBOL_GPL(usb_debug_root); DEFINE_MUTEX(usb_dynids_lock); EXPORT_SYMBOL_GPL(usb_dynids_lock); static int __init usb_common_init(void) { usb_debug_root = debugfs_create_dir("usb", NULL); ledtrig_usb_init(); return 0; } static void __exit usb_common_exit(void) { ledtrig_usb_exit(); debugfs_remove_recursive(usb_debug_root); } subsys_initcall(usb_common_init); module_exit(usb_common_exit); MODULE_DESCRIPTION("Common code for host and device side USB"); MODULE_LICENSE("GPL");
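A worked example of the bInterval decoding implemented by usb_decode_interval() above, with hypothetical descriptor values: for an interrupt endpoint with bInterval == 4, high speed yields 2^(4-1) = 8 microframes, i.e. 1000us, while full speed counts 1ms frames, i.e. 4000us.

/* Illustration with made-up descriptor values. */
static void example_interval_decode(void)
{
	struct usb_endpoint_descriptor epd = {
		.bmAttributes	= USB_ENDPOINT_XFER_INT,
		.bInterval	= 4,
	};

	unsigned int hs = usb_decode_interval(&epd, USB_SPEED_HIGH); /* 1000us */
	unsigned int fs = usb_decode_interval(&epd, USB_SPEED_FULL); /* 4000us */

	pr_info("example: high-speed %uus, full-speed %uus\n", hs, fs);
}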
/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2001-2003 Patrick Mochel <mochel@osdl.org> * Copyright (c) 2004-2009 Greg Kroah-Hartman <gregkh@suse.de> * Copyright (c) 2008-2012 Novell Inc. * Copyright (c) 2012-2019 Greg Kroah-Hartman <gregkh@linuxfoundation.org> * Copyright (c) 2012-2019 Linux Foundation * * Core driver model functions and structures that should not be * shared outside of the drivers/base/ directory. * */ #include <linux/notifier.h> /** * struct subsys_private - structure to hold the private to the driver core * portions of the bus_type/class structure. * @subsys: the struct kset that defines this subsystem * @devices_kset: the subsystem's 'devices' directory * @interfaces: list of subsystem interfaces associated * @mutex: protect the devices, and interfaces lists. * @drivers_kset: the list of drivers associated * @klist_devices: the klist to iterate over the @devices_kset * @klist_drivers: the klist to iterate over the @drivers_kset * @bus_notifier: the bus notifier list for anything that cares about things * on this bus. * @drivers_autoprobe: gate whether new devices are automatically attached to * registered drivers, or new drivers automatically attach * to existing devices. * @bus: pointer back to the struct bus_type that this structure is associated * with. * @dev_root: Default device to use as the parent. * @glue_dirs: "glue" directory to put in-between the parent device to * avoid namespace conflicts * @class: pointer back to the struct class that this structure is associated * with. * @lock_key: Lock class key for use by the lock validator * * This structure is the one that is the actual kobject allowing struct * bus_type/class to be statically allocated safely. Nothing outside of the * driver core should ever touch these fields.
*/ struct subsys_private { struct kset subsys; struct kset *devices_kset; struct list_head interfaces; struct mutex mutex; struct kset *drivers_kset; struct klist klist_devices; struct klist klist_drivers; struct blocking_notifier_head bus_notifier; unsigned int drivers_autoprobe:1; const struct bus_type *bus; struct device *dev_root; struct kset glue_dirs; const struct class *class; struct lock_class_key lock_key; }; #define to_subsys_private(obj) container_of_const(obj, struct subsys_private, subsys.kobj) static inline struct subsys_private *subsys_get(struct subsys_private *sp) { if (sp) kset_get(&sp->subsys); return sp; } static inline void subsys_put(struct subsys_private *sp) { if (sp) kset_put(&sp->subsys); } struct subsys_private *bus_to_subsys(const struct bus_type *bus); struct subsys_private *class_to_subsys(const struct class *class); struct driver_private { struct kobject kobj; struct klist klist_devices; struct klist_node knode_bus; struct module_kobject *mkobj; struct device_driver *driver; }; #define to_driver(obj) container_of(obj, struct driver_private, kobj) #ifdef CONFIG_RUST /** * struct driver_type - Representation of a Rust driver type. */ struct driver_type { /** * @id: Representation of core::any::TypeId. */ u8 id[16]; } __packed; #endif /** * struct device_private - structure to hold the private to the driver core * portions of the device structure. * @klist_children: klist containing all children of this device * @knode_parent: node in sibling list * @knode_driver: node in driver list * @knode_bus: node in bus list * @knode_class: node in class list * @deferred_probe: entry in deferred_probe_list which is used to retry the * binding of drivers which were unable to get all the * resources needed by the device; typically because it depends * on another driver getting probed first. * @async_driver: pointer to device driver awaiting probe via async_probe * @deferred_probe_reason: capture the -EPROBE_DEFER message emitted with * dev_err_probe() for later retrieval via debugfs * @device: pointer back to the struct device that this structure is * associated with. * @driver_type: The type of the bound Rust driver. * @dead: This device is currently either in the process of or has been * removed from the system. Any asynchronous events scheduled for this * device should exit without taking any action. * * Nothing outside of the driver core should ever touch these fields. 
*/ struct device_private { struct klist klist_children; struct klist_node knode_parent; struct klist_node knode_driver; struct klist_node knode_bus; struct klist_node knode_class; struct list_head deferred_probe; const struct device_driver *async_driver; char *deferred_probe_reason; struct device *device; #ifdef CONFIG_RUST struct driver_type driver_type; #endif u8 dead:1; }; #define to_device_private_parent(obj) \ container_of(obj, struct device_private, knode_parent) #define to_device_private_driver(obj) \ container_of(obj, struct device_private, knode_driver) #define to_device_private_bus(obj) \ container_of(obj, struct device_private, knode_bus) #define to_device_private_class(obj) \ container_of(obj, struct device_private, knode_class) /* initialisation functions */ int devices_init(void); int buses_init(void); int classes_init(void); int firmware_init(void); #ifdef CONFIG_SYS_HYPERVISOR int hypervisor_init(void); #else static inline int hypervisor_init(void) { return 0; } #endif int platform_bus_init(void); int faux_bus_init(void); void cpu_dev_init(void); void container_dev_init(void); #ifdef CONFIG_AUXILIARY_BUS void auxiliary_bus_init(void); #else static inline void auxiliary_bus_init(void) { } #endif struct kobject *virtual_device_parent(void); int bus_add_device(struct device *dev); void bus_probe_device(struct device *dev); void bus_remove_device(struct device *dev); void bus_notify(struct device *dev, enum bus_notifier_event value); bool bus_is_registered(const struct bus_type *bus); int bus_add_driver(struct device_driver *drv); void bus_remove_driver(struct device_driver *drv); void device_release_driver_internal(struct device *dev, const struct device_driver *drv, struct device *parent); void driver_detach(const struct device_driver *drv); void driver_deferred_probe_del(struct device *dev); void device_set_deferred_probe_reason(const struct device *dev, struct va_format *vaf); static inline int driver_match_device(const struct device_driver *drv, struct device *dev) { return drv->bus->match ? drv->bus->match(dev, drv) : 1; } static inline void dev_sync_state(struct device *dev) { if (dev->bus->sync_state) dev->bus->sync_state(dev); else if (dev->driver && dev->driver->sync_state) dev->driver->sync_state(dev); } int driver_add_groups(const struct device_driver *drv, const struct attribute_group **groups); void driver_remove_groups(const struct device_driver *drv, const struct attribute_group **groups); void device_driver_detach(struct device *dev); static inline void device_set_driver(struct device *dev, const struct device_driver *drv) { /* * Majority (all?) read accesses to dev->driver happens either * while holding device lock or in bus/driver code that is only * invoked when the device is bound to a driver and there is no * concern of the pointer being changed while it is being read. * However when reading device's uevent file we read driver pointer * without taking device lock (so we do not block there for * arbitrary amount of time). We use WRITE_ONCE() here to prevent * tearing so that READ_ONCE() can safely be used in uevent code. 
*/ // FIXME - this cast should not be needed "soon" WRITE_ONCE(dev->driver, (struct device_driver *)drv); } struct devres_node; typedef void (*dr_node_release_t)(struct device *dev, struct devres_node *node); typedef void (*dr_node_free_t)(struct devres_node *node); struct devres_node { struct list_head entry; dr_node_release_t release; dr_node_free_t free_node; const char *name; size_t size; }; void devres_node_init(struct devres_node *node, dr_node_release_t release, dr_node_free_t free_node); void devres_node_add(struct device *dev, struct devres_node *node); bool devres_node_remove(struct device *dev, struct devres_node *node); void devres_set_node_dbginfo(struct devres_node *node, const char *name, size_t size); void devres_for_each_res(struct device *dev, dr_release_t release, dr_match_t match, void *match_data, void (*fn)(struct device *, void *, void *), void *data); int devres_release_all(struct device *dev); void device_block_probing(void); void device_unblock_probing(void); void deferred_probe_extend_timeout(void); void driver_deferred_probe_trigger(void); const char *device_get_devnode(const struct device *dev, umode_t *mode, kuid_t *uid, kgid_t *gid, const char **tmp); /* /sys/devices directory */ extern struct kset *devices_kset; void devices_kset_move_last(struct device *dev); #if defined(CONFIG_MODULES) && defined(CONFIG_SYSFS) int module_add_driver(struct module *mod, const struct device_driver *drv); void module_remove_driver(const struct device_driver *drv); #else static inline int module_add_driver(struct module *mod, struct device_driver *drv) { return 0; } static inline void module_remove_driver(struct device_driver *drv) { } #endif #ifdef CONFIG_DEVTMPFS int devtmpfs_init(void); #else static inline int devtmpfs_init(void) { return 0; } #endif #ifdef CONFIG_BLOCK extern const struct class block_class; static inline bool is_blockdev(struct device *dev) { return dev->class == &block_class; } #else static inline bool is_blockdev(struct device *dev) { return false; } #endif /* Device links support */ int device_links_read_lock(void); void device_links_read_unlock(int idx); int device_links_read_lock_held(void); int device_links_check_suppliers(struct device *dev); void device_links_force_bind(struct device *dev); void device_links_driver_bound(struct device *dev); void device_links_driver_cleanup(struct device *dev); void device_links_no_driver(struct device *dev); bool device_links_busy(struct device *dev); void device_links_unbind_consumers(struct device *dev); bool device_link_flag_is_sync_state_only(u32 flags); void fw_devlink_drivers_done(void); void fw_devlink_probing_done(void); #define dev_for_each_link_to_supplier(__link, __dev) \ list_for_each_entry_srcu(__link, &(__dev)->links.suppliers, c_node, \ device_links_read_lock_held()) #define dev_for_each_link_to_consumer(__link, __dev) \ list_for_each_entry_srcu(__link, &(__dev)->links.consumers, s_node, \ device_links_read_lock_held()) /* device pm support */ void device_pm_move_to_tail(struct device *dev); #ifdef CONFIG_DEVTMPFS int devtmpfs_create_node(struct device *dev); int devtmpfs_delete_node(struct device *dev); #else static inline int devtmpfs_create_node(struct device *dev) { return 0; } static inline int devtmpfs_delete_node(struct device *dev) { return 0; } #endif void software_node_init(void); void software_node_notify(struct device *dev); void software_node_notify_remove(struct device *dev); #ifdef CONFIG_PINCTRL int pinctrl_bind_pins(struct device *dev); #else static inline int 
pinctrl_bind_pins(struct device *dev) { return 0; } #endif /* CONFIG_PINCTRL */
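As an illustration of how the to_device_private_*() accessors above are used, the sketch below mirrors the common pattern for walking a bus's device klist: the klist node is mapped back to its struct device_private and from there to the struct device. This is a sketch of the pattern, not a quotation of any particular call site.

/* Map a bus-list klist node back to its struct device. */
static struct device *example_next_device(struct klist_iter *i)
{
	struct klist_node *n = klist_next(i);
	struct device_private *dev_prv;

	if (!n)
		return NULL;

	dev_prv = to_device_private_bus(n);
	return dev_prv->device;
}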
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CLOCK_INLINED_H #define _ASM_X86_CLOCK_INLINED_H #include <asm/tsc.h> struct clocksource; static __always_inline u64 arch_inlined_clocksource_read(struct clocksource *cs) { return (u64)rdtsc_ordered(); } struct clock_event_device; static __always_inline void arch_inlined_clockevent_set_next_coupled(u64 cycles, struct clock_event_device *evt) { native_wrmsrq(MSR_IA32_TSC_DEADLINE, cycles); } #endif
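A hedged sketch of how the two inlines above pair up: read the current TSC value, add a delta expressed in TSC cycles, and arm the TSC-deadline timer for that absolute count. The wrapper function and the assumption that the delta is already in TSC cycles are illustrative only.

/* Hypothetical caller: arm the deadline 'delta_cycles' TSC ticks from now. */
static inline void example_arm_tsc_deadline(struct clock_event_device *evt,
					    u64 delta_cycles)
{
	u64 now = arch_inlined_clocksource_read(NULL);	/* current TSC count */

	arch_inlined_clockevent_set_next_coupled(now + delta_cycles, evt);
}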
// SPDX-License-Identifier: GPL-2.0 /* Device wakeirq helper functions */ #include <linux/device.h> #include <linux/interrupt.h> #include <linux/irq.h> #include <linux/slab.h> #include <linux/pm_runtime.h> #include <linux/pm_wakeirq.h> #include "power.h" /** * dev_pm_attach_wake_irq - Attach device interrupt as a wake IRQ * @dev: Device entry * @wirq: Wake irq specific data * * Internal function to attach a dedicated wake-up interrupt as a wake IRQ. */ static int dev_pm_attach_wake_irq(struct device *dev, struct wake_irq *wirq) { unsigned long flags; if (!dev || !wirq) return -EINVAL; spin_lock_irqsave(&dev->power.lock, flags); if (dev_WARN_ONCE(dev, dev->power.wakeirq, "wake irq already initialized\n")) { spin_unlock_irqrestore(&dev->power.lock, flags); return -EEXIST; } dev->power.wakeirq = wirq; device_wakeup_attach_irq(dev, wirq); spin_unlock_irqrestore(&dev->power.lock, flags); return 0; } /** * dev_pm_set_wake_irq - Attach device IO interrupt as wake IRQ * @dev: Device entry * @irq: Device IO interrupt * * Attach a device IO interrupt as a wake IRQ. The wake IRQ gets * automatically configured for wake-up from suspend based * on the device specific sysfs wakeup entry. Typically called * during driver probe after calling device_init_wakeup(). */ int dev_pm_set_wake_irq(struct device *dev, int irq) { struct wake_irq *wirq; int err; if (irq < 0) return -EINVAL; wirq = kzalloc_obj(*wirq); if (!wirq) return -ENOMEM; wirq->dev = dev; wirq->irq = irq; err = dev_pm_attach_wake_irq(dev, wirq); if (err) kfree(wirq); return err; } EXPORT_SYMBOL_GPL(dev_pm_set_wake_irq); /** * dev_pm_clear_wake_irq - Detach a device IO interrupt wake IRQ * @dev: Device entry * * Detach a device wake IRQ and free resources. * * Note that it's OK for drivers to call this without calling * dev_pm_set_wake_irq() as all the driver instances may not have * a wake IRQ configured. This avoids adding wake IRQ specific * checks into the drivers.
*/ void dev_pm_clear_wake_irq(struct device *dev) { struct wake_irq *wirq; unsigned long flags; spin_lock_irqsave(&dev->power.lock, flags); wirq = dev->power.wakeirq; if (!wirq) { spin_unlock_irqrestore(&dev->power.lock, flags); return; } device_wakeup_detach_irq(dev); dev->power.wakeirq = NULL; spin_unlock_irqrestore(&dev->power.lock, flags); if (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED) { free_irq(wirq->irq, wirq); wirq->status &= ~WAKE_IRQ_DEDICATED_MASK; } kfree(wirq->name); kfree(wirq); } EXPORT_SYMBOL_GPL(dev_pm_clear_wake_irq); static void devm_pm_clear_wake_irq(void *dev) { dev_pm_clear_wake_irq(dev); } /** * devm_pm_set_wake_irq - device-managed variant of dev_pm_set_wake_irq * @dev: Device entry * @irq: Device IO interrupt * * * Attach a device IO interrupt as a wake IRQ, same with dev_pm_set_wake_irq, * but the device will be auto clear wake capability on driver detach. */ int devm_pm_set_wake_irq(struct device *dev, int irq) { int ret; ret = dev_pm_set_wake_irq(dev, irq); if (ret) return ret; return devm_add_action_or_reset(dev, devm_pm_clear_wake_irq, dev); } EXPORT_SYMBOL_GPL(devm_pm_set_wake_irq); /** * handle_threaded_wake_irq - Handler for dedicated wake-up interrupts * @irq: Device specific dedicated wake-up interrupt * @_wirq: Wake IRQ data * * Some devices have a separate wake-up interrupt in addition to the * device IO interrupt. The wake-up interrupt signals that a device * should be woken up from it's idle state. This handler uses device * specific pm_runtime functions to wake the device, and then it's * up to the device to do whatever it needs to. Note that as the * device may need to restore context and start up regulators, we * use a threaded IRQ. * * Also note that we are not resending the lost device interrupts. * We assume that the wake-up interrupt just needs to wake-up the * device, and then device's pm_runtime_resume() can deal with the * situation. */ static irqreturn_t handle_threaded_wake_irq(int irq, void *_wirq) { struct wake_irq *wirq = _wirq; int res; /* Maybe abort suspend? */ if (irqd_is_wakeup_set(irq_get_irq_data(irq))) { pm_wakeup_event(wirq->dev, 0); return IRQ_HANDLED; } /* We don't want RPM_ASYNC or RPM_NOWAIT here */ res = pm_runtime_resume(wirq->dev); if (res < 0) dev_warn(wirq->dev, "wake IRQ with no resume: %i\n", res); return IRQ_HANDLED; } static int __dev_pm_set_dedicated_wake_irq(struct device *dev, int irq, unsigned int flag) { struct wake_irq *wirq; int err; if (irq < 0) return -EINVAL; wirq = kzalloc_obj(*wirq); if (!wirq) return -ENOMEM; wirq->name = kasprintf(GFP_KERNEL, "%s:wakeup", dev_name(dev)); if (!wirq->name) { err = -ENOMEM; goto err_free; } wirq->dev = dev; wirq->irq = irq; /* Prevent deferred spurious wakeirqs with disable_irq_nosync() */ irq_set_status_flags(irq, IRQ_DISABLE_UNLAZY); /* * Consumer device may need to power up and restore state * so we use a threaded irq. */ err = request_threaded_irq(irq, NULL, handle_threaded_wake_irq, IRQF_ONESHOT | IRQF_NO_AUTOEN, wirq->name, wirq); if (err) goto err_free_name; err = dev_pm_attach_wake_irq(dev, wirq); if (err) goto err_free_irq; wirq->status = WAKE_IRQ_DEDICATED_ALLOCATED | flag; return err; err_free_irq: free_irq(irq, wirq); err_free_name: kfree(wirq->name); err_free: kfree(wirq); return err; } /** * dev_pm_set_dedicated_wake_irq - Request a dedicated wake-up interrupt * @dev: Device entry * @irq: Device wake-up interrupt * * Unless your hardware has separate wake-up interrupts in addition * to the device IO interrupts, you don't need this. 
* * Sets up a threaded interrupt handler for a device that has * a dedicated wake-up interrupt in addition to the device IO * interrupt. */ int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq) { return __dev_pm_set_dedicated_wake_irq(dev, irq, 0); } EXPORT_SYMBOL_GPL(dev_pm_set_dedicated_wake_irq); /** * dev_pm_set_dedicated_wake_irq_reverse - Request a dedicated wake-up interrupt * with reverse enable ordering * @dev: Device entry * @irq: Device wake-up interrupt * * Unless your hardware has separate wake-up interrupts in addition * to the device IO interrupts, you don't need this. * * Sets up a threaded interrupt handler for a device that has a dedicated * wake-up interrupt in addition to the device IO interrupt. It sets * the status of WAKE_IRQ_DEDICATED_REVERSE to tell rpm_suspend() * to enable dedicated wake-up interrupt after running the runtime suspend * callback for @dev. */ int dev_pm_set_dedicated_wake_irq_reverse(struct device *dev, int irq) { return __dev_pm_set_dedicated_wake_irq(dev, irq, WAKE_IRQ_DEDICATED_REVERSE); } EXPORT_SYMBOL_GPL(dev_pm_set_dedicated_wake_irq_reverse); /** * dev_pm_enable_wake_irq_check - Checks and enables wake-up interrupt * @dev: Device * @can_change_status: Can change wake-up interrupt status * * Enables wakeirq conditionally. We need to enable wake-up interrupt * lazily on the first rpm_suspend(). This is needed as the consumer device * starts in RPM_SUSPENDED state, and the first pm_runtime_get() would * otherwise try to disable already disabled wakeirq. The wake-up interrupt * starts disabled with IRQ_NOAUTOEN set. * * Should be called from rpm_suspend(), rpm_resume(), * pm_runtime_force_suspend() or pm_runtime_force_resume(). * Caller must hold &dev->power.lock or disable runtime PM to change * wirq->status. */ void dev_pm_enable_wake_irq_check(struct device *dev, bool can_change_status) { struct wake_irq *wirq = dev->power.wakeirq; if (!wirq || !(wirq->status & WAKE_IRQ_DEDICATED_MASK)) return; if (likely(wirq->status & WAKE_IRQ_DEDICATED_MANAGED)) { goto enable; } else if (can_change_status) { wirq->status |= WAKE_IRQ_DEDICATED_MANAGED; goto enable; } return; enable: if (!can_change_status || !(wirq->status & WAKE_IRQ_DEDICATED_REVERSE)) { enable_irq(wirq->irq); wirq->status |= WAKE_IRQ_DEDICATED_ENABLED; } } /** * dev_pm_disable_wake_irq_check - Checks and disables wake-up interrupt * @dev: Device * @cond_disable: if set, also check WAKE_IRQ_DEDICATED_REVERSE * * Disables wake-up interrupt conditionally based on status. * Should be called from rpm_suspend(), rpm_resume(), * pm_runtime_force_suspend() or pm_runtime_force_resume(). */ void dev_pm_disable_wake_irq_check(struct device *dev, bool cond_disable) { struct wake_irq *wirq = dev->power.wakeirq; if (!wirq || !(wirq->status & WAKE_IRQ_DEDICATED_MASK)) return; if (cond_disable && (wirq->status & WAKE_IRQ_DEDICATED_REVERSE)) return; if (wirq->status & WAKE_IRQ_DEDICATED_MANAGED) { wirq->status &= ~WAKE_IRQ_DEDICATED_ENABLED; disable_irq_nosync(wirq->irq); } } /** * dev_pm_enable_wake_irq_complete - enable wake IRQ not enabled before * @dev: Device using the wake IRQ * * Enable wake IRQ conditionally based on status, mainly used if want to * enable wake IRQ after running ->runtime_suspend() which depends on * WAKE_IRQ_DEDICATED_REVERSE. * * Should be called from rpm_suspend() or pm_runtime_force_suspend(). 
*/ void dev_pm_enable_wake_irq_complete(struct device *dev) { struct wake_irq *wirq = dev->power.wakeirq; if (!wirq || !(wirq->status & WAKE_IRQ_DEDICATED_MASK)) return; if (wirq->status & WAKE_IRQ_DEDICATED_MANAGED && wirq->status & WAKE_IRQ_DEDICATED_REVERSE) { enable_irq(wirq->irq); wirq->status |= WAKE_IRQ_DEDICATED_ENABLED; } } /** * dev_pm_arm_wake_irq - Arm device wake-up * @wirq: Device wake-up interrupt * * Sets up the wake-up event conditionally based on * device_may_wakeup(). */ void dev_pm_arm_wake_irq(struct wake_irq *wirq) { if (!wirq) return; if (device_may_wakeup(wirq->dev)) { if (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED && !(wirq->status & WAKE_IRQ_DEDICATED_ENABLED)) enable_irq(wirq->irq); enable_irq_wake(wirq->irq); } } /** * dev_pm_disarm_wake_irq - Disarm device wake-up * @wirq: Device wake-up interrupt * * Clears up the wake-up event conditionally based on * device_may_wakeup(). */ void dev_pm_disarm_wake_irq(struct wake_irq *wirq) { if (!wirq) return; if (device_may_wakeup(wirq->dev)) { disable_irq_wake(wirq->irq); if (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED && !(wirq->status & WAKE_IRQ_DEDICATED_ENABLED)) disable_irq_nosync(wirq->irq); } }
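/*
 * Illustrative usage sketch (not part of the wakeirq code above): roughly how
 * a consumer driver could wire up the wake IRQ helpers from its probe path.
 * The driver function, device layout and the "wakeup" interrupt name are
 * hypothetical, and error handling is abbreviated; treat this as a sketch
 * under those assumptions, not code taken from an in-tree driver.
 */
#include <linux/platform_device.h>
#include <linux/pm_runtime.h>
#include <linux/pm_wakeup.h>
#include <linux/pm_wakeirq.h>

static int example_wakeup_probe(struct platform_device *pdev)
{
	struct device *dev = &pdev->dev;
	int irq, ret;

	irq = platform_get_irq(pdev, 0);
	if (irq < 0)
		return irq;

	/* The device must be wakeup capable before a wake IRQ is attached. */
	ret = device_init_wakeup(dev, true);
	if (ret)
		return ret;

	/*
	 * Reuse the device IO interrupt as the wake IRQ. The devm_ variant
	 * clears it automatically on driver detach via devm_pm_clear_wake_irq().
	 */
	ret = devm_pm_set_wake_irq(dev, irq);
	if (ret)
		return ret;

	/*
	 * Hardware with a separate dedicated wake-up line would instead do
	 * something like:
	 *
	 *	wakeirq = platform_get_irq_byname(pdev, "wakeup");
	 *	ret = dev_pm_set_dedicated_wake_irq(dev, wakeirq);
	 *
	 * after which rpm_suspend()/rpm_resume() manage the dedicated IRQ
	 * through dev_pm_enable_wake_irq_check() and friends, as documented
	 * above.
	 */

	pm_runtime_enable(dev);
	return 0;
}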
/* SPDX-License-Identifier: GPL-2.0 */ /* * Operations on the network namespace */ #ifndef __NET_NET_NAMESPACE_H #define __NET_NET_NAMESPACE_H #include <linux/atomic.h> #include <linux/refcount.h> #include <linux/workqueue.h> #include <linux/list.h> #include <linux/sysctl.h> #include <linux/uidgid.h> #include <net/flow.h> #include <net/netns/core.h> #include <net/netns/mib.h> #include <net/netns/unix.h> #include <net/netns/packet.h> #include <net/netns/ipv4.h> #include <net/netns/ipv6.h> #include <net/netns/nexthop.h> #include <net/netns/ieee802154_6lowpan.h> #include <net/netns/sctp.h> #include <net/netns/netfilter.h> #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) #include <net/netns/conntrack.h> #endif #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) #include <net/netns/flow_table.h> #endif #include <net/netns/nftables.h> #include <net/netns/xfrm.h> #include <net/netns/mpls.h> #include <net/netns/can.h> #include <net/netns/xdp.h> #include <net/netns/smc.h> #include <net/netns/bpf.h> #include <net/netns/mctp.h> #include <net/netns/vsock.h> #include <net/net_trackers.h> #include <linux/ns_common.h> #include <linux/idr.h> #include <linux/skbuff.h> #include <linux/notifier.h> #include <linux/xarray.h> struct user_namespace; struct 
proc_dir_entry; struct net_device; struct sock; struct ctl_table_header; struct net_generic; struct uevent_sock; struct netns_ipvs; struct bpf_prog; #define NETDEV_HASHBITS 8 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS) struct net { /* First cache line can be often dirtied. * Do not place here read-mostly fields. */ refcount_t passive; /* To decide when the network * namespace should be freed. */ spinlock_t rules_mod_lock; unsigned int dev_base_seq; /* protected by rtnl_mutex */ u32 ifindex; spinlock_t nsid_lock; atomic_t fnhe_genid; struct list_head list; /* list of network namespaces */ struct list_head exit_list; /* To linked to call pernet exit * methods on dead net ( * pernet_ops_rwsem read locked), * or to unregister pernet ops * (pernet_ops_rwsem write locked). */ struct llist_node defer_free_list; struct llist_node cleanup_list; /* namespaces on death row */ struct list_head ptype_all; struct list_head ptype_specific; #ifdef CONFIG_KEYS struct key_tag *key_domain; /* Key domain of operation tag */ #endif struct user_namespace *user_ns; /* Owning user namespace */ struct ucounts *ucounts; struct idr netns_ids; struct ns_common ns; struct ref_tracker_dir refcnt_tracker; struct ref_tracker_dir notrefcnt_tracker; /* tracker for objects not * refcounted against netns */ struct list_head dev_base_head; struct proc_dir_entry *proc_net; struct proc_dir_entry *proc_net_stat; #ifdef CONFIG_SYSCTL struct ctl_table_set sysctls; #endif struct sock *rtnl; /* rtnetlink socket */ struct sock *genl_sock; struct uevent_sock *uevent_sock; /* uevent socket */ struct hlist_head *dev_name_head; struct hlist_head *dev_index_head; struct xarray dev_by_index; struct raw_notifier_head netdev_chain; /* Note that @hash_mix can be read millions times per second, * it is critical that it is on a read_mostly cache line. */ u32 hash_mix; bool is_dying; struct net_device *loopback_dev; /* The loopback */ /* core fib_rules */ struct list_head rules_ops; struct netns_core core; struct netns_mib mib; struct netns_packet packet; #if IS_ENABLED(CONFIG_UNIX) struct netns_unix unx; #endif struct netns_nexthop nexthop; struct netns_ipv4 ipv4; #if IS_ENABLED(CONFIG_IPV6) struct netns_ipv6 ipv6; #endif #if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN) struct netns_ieee802154_lowpan ieee802154_lowpan; #endif #if defined(CONFIG_IP_SCTP) || defined(CONFIG_IP_SCTP_MODULE) struct netns_sctp sctp; #endif #ifdef CONFIG_NETFILTER struct netns_nf nf; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct netns_ct ct; #endif #if defined(CONFIG_NF_TABLES) || defined(CONFIG_NF_TABLES_MODULE) struct netns_nftables nft; #endif #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) struct netns_ft ft; #endif #endif #ifdef CONFIG_WEXT_CORE struct sk_buff_head wext_nlevents; #endif struct net_generic __rcu *gen; /* Used to store attached BPF programs */ struct netns_bpf bpf; /* Note : following structs are cache line aligned */ #ifdef CONFIG_XFRM struct netns_xfrm xfrm; #endif u64 net_cookie; /* written once */ #if IS_ENABLED(CONFIG_IP_VS) struct netns_ipvs *ipvs; #endif #if IS_ENABLED(CONFIG_MPLS) struct netns_mpls mpls; #endif #if IS_ENABLED(CONFIG_CAN) struct netns_can can; #endif #ifdef CONFIG_XDP_SOCKETS struct netns_xdp xdp; #endif #if IS_ENABLED(CONFIG_MCTP) struct netns_mctp mctp; #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) struct sock *crypto_nlsk; #endif struct sock *diag_nlsk; #if IS_ENABLED(CONFIG_SMC) struct netns_smc smc; #endif #ifdef CONFIG_DEBUG_NET_SMALL_RTNL /* Move to a better place when the config guard is removed. 
*/ struct mutex rtnl_mutex; #endif #if IS_ENABLED(CONFIG_VSOCKETS) struct netns_vsock vsock; #endif } __randomize_layout; #include <linux/seq_file_net.h> /* Init's network namespace */ extern struct net init_net; #ifdef CONFIG_NET_NS struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net); void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid); void net_ns_barrier(void); struct ns_common *get_net_ns(struct ns_common *ns); struct net *get_net_ns_by_fd(int fd); extern struct task_struct *cleanup_net_task; #else /* CONFIG_NET_NS */ #include <linux/sched.h> #include <linux/nsproxy.h> static inline struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net) { if (flags & CLONE_NEWNET) return ERR_PTR(-EINVAL); return old_net; } static inline void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid) { *uid = GLOBAL_ROOT_UID; *gid = GLOBAL_ROOT_GID; } static inline void net_ns_barrier(void) {} static inline struct ns_common *get_net_ns(struct ns_common *ns) { return ERR_PTR(-EINVAL); } static inline struct net *get_net_ns_by_fd(int fd) { return ERR_PTR(-EINVAL); } #endif /* CONFIG_NET_NS */ extern struct list_head net_namespace_list; struct net *get_net_ns_by_pid(pid_t pid); #ifdef CONFIG_SYSCTL void ipx_register_sysctl(void); void ipx_unregister_sysctl(void); #else #define ipx_register_sysctl() #define ipx_unregister_sysctl() #endif static inline struct net *to_net_ns(struct ns_common *ns) { return container_of(ns, struct net, ns); } #ifdef CONFIG_NET_NS void __put_net(struct net *net); /* Try using get_net_track() instead */ static inline struct net *get_net(struct net *net) { ns_ref_inc(net); return net; } static inline struct net *maybe_get_net(struct net *net) { /* Used when we know struct net exists but we * aren't guaranteed a previous reference count * exists. If the reference count is zero this * function fails and returns NULL. */ if (!ns_ref_get(net)) net = NULL; return net; } /* Try using put_net_track() instead */ static inline void put_net(struct net *net) { if (ns_ref_put(net)) __put_net(net); } static inline int net_eq(const struct net *net1, const struct net *net2) { return net1 == net2; } static inline int check_net(const struct net *net) { return ns_ref_read(net) != 0; } void net_drop_ns(struct ns_common *); void net_passive_dec(struct net *net); #else static inline struct net *get_net(struct net *net) { return net; } static inline void put_net(struct net *net) { } static inline struct net *maybe_get_net(struct net *net) { return net; } static inline int net_eq(const struct net *net1, const struct net *net2) { return 1; } static inline int check_net(const struct net *net) { return 1; } #define net_drop_ns NULL static inline void net_passive_dec(struct net *net) { refcount_dec(&net->passive); } #endif static inline void net_passive_inc(struct net *net) { refcount_inc(&net->passive); } /* Returns true if the netns initialization is completed successfully */ static inline bool net_initialized(const struct net *net) { return READ_ONCE(net->list.next); } static inline void __netns_tracker_alloc(struct net *net, netns_tracker *tracker, bool refcounted, gfp_t gfp) { #ifdef CONFIG_NET_NS_REFCNT_TRACKER ref_tracker_alloc(refcounted ? 
&net->refcnt_tracker : &net->notrefcnt_tracker, tracker, gfp); #endif } static inline void netns_tracker_alloc(struct net *net, netns_tracker *tracker, gfp_t gfp) { __netns_tracker_alloc(net, tracker, true, gfp); } static inline void __netns_tracker_free(struct net *net, netns_tracker *tracker, bool refcounted) { #ifdef CONFIG_NET_NS_REFCNT_TRACKER ref_tracker_free(refcounted ? &net->refcnt_tracker : &net->notrefcnt_tracker, tracker); #endif } static inline struct net *get_net_track(struct net *net, netns_tracker *tracker, gfp_t gfp) { get_net(net); netns_tracker_alloc(net, tracker, gfp); return net; } static inline void put_net_track(struct net *net, netns_tracker *tracker) { __netns_tracker_free(net, tracker, true); put_net(net); } typedef struct { #ifdef CONFIG_NET_NS struct net __rcu *net; #endif } possible_net_t; static inline void write_pnet(possible_net_t *pnet, struct net *net) { #ifdef CONFIG_NET_NS rcu_assign_pointer(pnet->net, net); #endif } static inline struct net *read_pnet(const possible_net_t *pnet) { #ifdef CONFIG_NET_NS return rcu_dereference_protected(pnet->net, true); #else return &init_net; #endif } static inline struct net *read_pnet_rcu(const possible_net_t *pnet) { #ifdef CONFIG_NET_NS return rcu_dereference(pnet->net); #else return &init_net; #endif } /* Protected by net_rwsem */ #define for_each_net(VAR) \ list_for_each_entry(VAR, &net_namespace_list, list) #define for_each_net_continue_reverse(VAR) \ list_for_each_entry_continue_reverse(VAR, &net_namespace_list, list) #define for_each_net_rcu(VAR) \ list_for_each_entry_rcu(VAR, &net_namespace_list, list) #ifdef CONFIG_NET_NS #define __net_init #define __net_exit #define __net_initdata #define __net_initconst #else #define __net_init __init #define __net_exit __ref #define __net_initdata __initdata #define __net_initconst __initconst #endif int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp); int peernet2id(const struct net *net, struct net *peer); bool peernet_has_id(const struct net *net, struct net *peer); struct net *get_net_ns_by_id(const struct net *net, int id); struct pernet_operations { struct list_head list; /* * Below methods are called without any exclusive locks. * More than one net may be constructed and destructed * in parallel on several cpus. Every pernet_operations * have to keep in mind all other pernet_operations and * to introduce a locking, if they share common resources. * * The only time they are called with exclusive lock is * from register_pernet_subsys(), unregister_pernet_subsys() * register_pernet_device() and unregister_pernet_device(). * * Exit methods using blocking RCU primitives, such as * synchronize_rcu(), should be implemented via exit_batch. * Then, destruction of a group of net requires single * synchronize_rcu() related to these pernet_operations, * instead of separate synchronize_rcu() for every net. * Please, avoid synchronize_rcu() at all, where it's possible. * * Note that a combination of pre_exit() and exit() can * be used, since a synchronize_rcu() is guaranteed between * the calls. */ int (*init)(struct net *net); void (*pre_exit)(struct net *net); void (*exit)(struct net *net); void (*exit_batch)(struct list_head *net_exit_list); /* Following method is called with RTNL held. */ void (*exit_rtnl)(struct net *net, struct list_head *dev_kill_list); unsigned int * const id; const size_t size; }; /* * Use these carefully. 
If you implement a network device and it * needs per network namespace operations use device pernet operations, * otherwise use pernet subsys operations. * * Network interfaces need to be removed from a dying netns _before_ * subsys notifiers can be called, as most of the network code cleanup * (which is done from subsys notifiers) runs with the assumption that * dev_remove_pack has been called so no new packets will arrive during * and after the cleanup functions have been called. dev_remove_pack * is not per namespace so instead the guarantee of no more packets * arriving in a network namespace is provided by ensuring that all * network devices and all sockets have left the network namespace * before the cleanup methods are called. * * For the longest time the ipv4 icmp code was registered as a pernet * device which caused kernel oops, and panics during network * namespace cleanup. So please don't get this wrong. */ int register_pernet_subsys(struct pernet_operations *); void unregister_pernet_subsys(struct pernet_operations *); int register_pernet_device(struct pernet_operations *); void unregister_pernet_device(struct pernet_operations *); struct ctl_table; #define register_net_sysctl(net, path, table) \ register_net_sysctl_sz(net, path, table, ARRAY_SIZE(table)) #ifdef CONFIG_SYSCTL int net_sysctl_init(void); struct ctl_table_header *register_net_sysctl_sz(struct net *net, const char *path, struct ctl_table *table, size_t table_size); void unregister_net_sysctl_table(struct ctl_table_header *header); #else static inline int net_sysctl_init(void) { return 0; } static inline struct ctl_table_header *register_net_sysctl_sz(struct net *net, const char *path, struct ctl_table *table, size_t table_size) { return NULL; } static inline void unregister_net_sysctl_table(struct ctl_table_header *header) { } #endif static inline int rt_genid_ipv4(const struct net *net) { return atomic_read(&net->ipv4.rt_genid); } #if IS_ENABLED(CONFIG_IPV6) static inline int rt_genid_ipv6(const struct net *net) { return atomic_read(&net->ipv6.fib6_sernum); } #endif static inline void rt_genid_bump_ipv4(struct net *net) { atomic_inc(&net->ipv4.rt_genid); } extern void (*__fib6_flush_trees)(struct net *net); static inline void rt_genid_bump_ipv6(struct net *net) { if (__fib6_flush_trees) __fib6_flush_trees(net); } #if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN) static inline struct netns_ieee802154_lowpan * net_ieee802154_lowpan(struct net *net) { return &net->ieee802154_lowpan; } #endif /* For callers who don't really care about whether it's IPv4 or IPv6 */ static inline void rt_genid_bump_all(struct net *net) { rt_genid_bump_ipv4(net); rt_genid_bump_ipv6(net); } static inline int fnhe_genid(const struct net *net) { return atomic_read(&net->fnhe_genid); } static inline void fnhe_genid_bump(struct net *net) { atomic_inc(&net->fnhe_genid); } #ifdef CONFIG_NET void net_ns_init(void); #else static inline void net_ns_init(void) {} #endif #endif /* __NET_NET_NAMESPACE_H */ |
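/*
 * Illustrative usage sketch (not part of the header above): how the struct net
 * refcounting and possible_net_t helpers declared above are typically used by
 * networking code. "example_obj" and its functions are hypothetical names
 * introduced only for this sketch.
 */
#include <net/net_namespace.h>
#include <linux/slab.h>

struct example_obj {
	possible_net_t net;	/* empty when CONFIG_NET_NS is disabled */
};

static struct example_obj *example_obj_create(struct net *net, gfp_t gfp)
{
	struct example_obj *obj = kzalloc(sizeof(*obj), gfp);

	if (!obj)
		return NULL;

	/* Record the owning namespace; write_pnet() takes no reference. */
	write_pnet(&obj->net, net);
	return obj;
}

static bool example_obj_in_net(const struct example_obj *obj,
			       const struct net *net)
{
	/* net_eq() compiles down to "1" when namespaces are disabled. */
	return net_eq(read_pnet(&obj->net), net);
}

static void example_use_net_later(struct net *net)
{
	/*
	 * Long-lived users take a real reference so the namespace cannot go
	 * away underneath them; maybe_get_net() fails if the refcount has
	 * already dropped to zero (e.g. the netns is being torn down).
	 */
	net = maybe_get_net(net);
	if (!net)
		return;

	/* ... hand @net off to deferred work ... */

	put_net(net);
}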
// SPDX-License-Identifier: GPL-2.0 // Generated by scripts/atomic/gen-atomic-instrumented.sh // DO NOT MODIFY THIS FILE DIRECTLY /* * This file provides atomic operations with explicit instrumentation (e.g. * KASAN, KCSAN), which should be used unless it is necessary to avoid * instrumentation. Where it is necessary to avoid instrumentation, the * raw_atomic*() operations should be used. 
*/ #ifndef _LINUX_ATOMIC_INSTRUMENTED_H #define _LINUX_ATOMIC_INSTRUMENTED_H #include <linux/build_bug.h> #include <linux/compiler.h> #include <linux/instrumented.h> /** * atomic_read() - atomic load with relaxed ordering * @v: pointer to atomic_t * * Atomically loads the value of @v with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_read() there. * * Return: The value loaded from @v. */ static __always_inline int atomic_read(const atomic_t *v) { instrument_atomic_read(v, sizeof(*v)); return raw_atomic_read(v); } /** * atomic_read_acquire() - atomic load with acquire ordering * @v: pointer to atomic_t * * Atomically loads the value of @v with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_read_acquire() there. * * Return: The value loaded from @v. */ static __always_inline int atomic_read_acquire(const atomic_t *v) { instrument_atomic_read(v, sizeof(*v)); return raw_atomic_read_acquire(v); } /** * atomic_set() - atomic set with relaxed ordering * @v: pointer to atomic_t * @i: int value to assign * * Atomically sets @v to @i with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_set() there. * * Return: Nothing. */ static __always_inline void atomic_set(atomic_t *v, int i) { instrument_atomic_write(v, sizeof(*v)); raw_atomic_set(v, i); } /** * atomic_set_release() - atomic set with release ordering * @v: pointer to atomic_t * @i: int value to assign * * Atomically sets @v to @i with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_set_release() there. * * Return: Nothing. */ static __always_inline void atomic_set_release(atomic_t *v, int i) { kcsan_release(); instrument_atomic_write(v, sizeof(*v)); raw_atomic_set_release(v, i); } /** * atomic_add() - atomic add with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_add() there. * * Return: Nothing. */ static __always_inline void atomic_add(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_add(i, v); } /** * atomic_add_return() - atomic add with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_return() there. * * Return: The updated value of @v. */ static __always_inline int atomic_add_return(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_return(i, v); } /** * atomic_add_return_acquire() - atomic add with acquire ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline int atomic_add_return_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_return_acquire(i, v); } /** * atomic_add_return_release() - atomic add with release ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_return_release() there. * * Return: The updated value of @v. 
*/ static __always_inline int atomic_add_return_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_return_release(i, v); } /** * atomic_add_return_relaxed() - atomic add with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline int atomic_add_return_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_return_relaxed(i, v); } /** * atomic_fetch_add() - atomic add with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_add() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_add(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_add(i, v); } /** * atomic_fetch_add_acquire() - atomic add with acquire ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_add_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_add_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_add_acquire(i, v); } /** * atomic_fetch_add_release() - atomic add with release ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_add_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_add_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_add_release(i, v); } /** * atomic_fetch_add_relaxed() - atomic add with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_add_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_add_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_add_relaxed(i, v); } /** * atomic_sub() - atomic subtract with relaxed ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_sub() there. * * Return: Nothing. */ static __always_inline void atomic_sub(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_sub(i, v); } /** * atomic_sub_return() - atomic subtract with full ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_sub_return() there. * * Return: The updated value of @v. */ static __always_inline int atomic_sub_return(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_sub_return(i, v); } /** * atomic_sub_return_acquire() - atomic subtract with acquire ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with acquire ordering. 
* * Unsafe to use in noinstr code; use raw_atomic_sub_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline int atomic_sub_return_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_sub_return_acquire(i, v); } /** * atomic_sub_return_release() - atomic subtract with release ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_sub_return_release() there. * * Return: The updated value of @v. */ static __always_inline int atomic_sub_return_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_sub_return_release(i, v); } /** * atomic_sub_return_relaxed() - atomic subtract with relaxed ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_sub_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline int atomic_sub_return_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_sub_return_relaxed(i, v); } /** * atomic_fetch_sub() - atomic subtract with full ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_sub() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_sub(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_sub(i, v); } /** * atomic_fetch_sub_acquire() - atomic subtract with acquire ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_sub_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_sub_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_sub_acquire(i, v); } /** * atomic_fetch_sub_release() - atomic subtract with release ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_sub_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_sub_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_sub_release(i, v); } /** * atomic_fetch_sub_relaxed() - atomic subtract with relaxed ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_sub_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_sub_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_sub_relaxed(i, v); } /** * atomic_inc() - atomic increment with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_inc() there. * * Return: Nothing. 
*/ static __always_inline void atomic_inc(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_inc(v); } /** * atomic_inc_return() - atomic increment with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_inc_return() there. * * Return: The updated value of @v. */ static __always_inline int atomic_inc_return(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_inc_return(v); } /** * atomic_inc_return_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_inc_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline int atomic_inc_return_acquire(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_inc_return_acquire(v); } /** * atomic_inc_return_release() - atomic increment with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_inc_return_release() there. * * Return: The updated value of @v. */ static __always_inline int atomic_inc_return_release(atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_inc_return_release(v); } /** * atomic_inc_return_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_inc_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline int atomic_inc_return_relaxed(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_inc_return_relaxed(v); } /** * atomic_fetch_inc() - atomic increment with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_inc() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_inc(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_inc(v); } /** * atomic_fetch_inc_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_inc_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_inc_acquire(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_inc_acquire(v); } /** * atomic_fetch_inc_release() - atomic increment with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_inc_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_inc_release(atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_inc_release(v); } /** * atomic_fetch_inc_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_inc_relaxed() there. * * Return: The original value of @v. 
*/ static __always_inline int atomic_fetch_inc_relaxed(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_inc_relaxed(v); } /** * atomic_dec() - atomic decrement with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_dec() there. * * Return: Nothing. */ static __always_inline void atomic_dec(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_dec(v); } /** * atomic_dec_return() - atomic decrement with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_dec_return() there. * * Return: The updated value of @v. */ static __always_inline int atomic_dec_return(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_dec_return(v); } /** * atomic_dec_return_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_dec_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline int atomic_dec_return_acquire(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_dec_return_acquire(v); } /** * atomic_dec_return_release() - atomic decrement with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_dec_return_release() there. * * Return: The updated value of @v. */ static __always_inline int atomic_dec_return_release(atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_dec_return_release(v); } /** * atomic_dec_return_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_dec_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline int atomic_dec_return_relaxed(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_dec_return_relaxed(v); } /** * atomic_fetch_dec() - atomic decrement with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_dec() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_dec(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_dec(v); } /** * atomic_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_dec_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_dec_acquire(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_dec_acquire(v); } /** * atomic_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_dec_release() there. * * Return: The original value of @v. 
*/ static __always_inline int atomic_fetch_dec_release(atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_dec_release(v); } /** * atomic_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_dec_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_dec_relaxed(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_dec_relaxed(v); } /** * atomic_and() - atomic bitwise AND with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_and() there. * * Return: Nothing. */ static __always_inline void atomic_and(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_and(i, v); } /** * atomic_fetch_and() - atomic bitwise AND with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_and() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_and(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_and(i, v); } /** * atomic_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_and_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_and_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_and_acquire(i, v); } /** * atomic_fetch_and_release() - atomic bitwise AND with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_and_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_and_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_and_release(i, v); } /** * atomic_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_and_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_and_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_and_relaxed(i, v); } /** * atomic_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_andnot() there. * * Return: Nothing. */ static __always_inline void atomic_andnot(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_andnot(i, v); } /** * atomic_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_andnot() there. * * Return: The original value of @v. 
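 *
 * Illustrative sketch, with a hypothetical @flags word and PENDING bit:
 * fetch_andnot() clears the given bits and reports whether they were set
 * beforehand, so a "consume pending work" check needs only one atomic op.
 *
 *	#define PENDING	0x1
 *
 *	if (atomic_fetch_andnot(PENDING, &flags) & PENDING) {
 *		/* a request was pending and we just cleared it */
 *	}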
*/ static __always_inline int atomic_fetch_andnot(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_andnot(i, v); } /** * atomic_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_andnot_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_andnot_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_andnot_acquire(i, v); } /** * atomic_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_andnot_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_andnot_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_andnot_release(i, v); } /** * atomic_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_andnot_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_andnot_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_andnot_relaxed(i, v); } /** * atomic_or() - atomic bitwise OR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_or() there. * * Return: Nothing. */ static __always_inline void atomic_or(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_or(i, v); } /** * atomic_fetch_or() - atomic bitwise OR with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_or() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_or(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_or(i, v); } /** * atomic_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_or_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_or_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_or_acquire(i, v); } /** * atomic_fetch_or_release() - atomic bitwise OR with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_or_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_or_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_or_release(i, v); } /** * atomic_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with relaxed ordering. 
* * Unsafe to use in noinstr code; use raw_atomic_fetch_or_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_or_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_or_relaxed(i, v); } /** * atomic_xor() - atomic bitwise XOR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_xor() there. * * Return: Nothing. */ static __always_inline void atomic_xor(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_xor(i, v); } /** * atomic_fetch_xor() - atomic bitwise XOR with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_xor() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_xor(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_xor(i, v); } /** * atomic_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_xor_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_xor_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_xor_acquire(i, v); } /** * atomic_fetch_xor_release() - atomic bitwise XOR with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_xor_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_xor_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_xor_release(i, v); } /** * atomic_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_fetch_xor_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_fetch_xor_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_xor_relaxed(i, v); } /** * atomic_xchg() - atomic exchange with full ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_xchg() there. * * Return: The original value of @v. */ static __always_inline int atomic_xchg(atomic_t *v, int new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_xchg(v, new); } /** * atomic_xchg_acquire() - atomic exchange with acquire ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_xchg_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_xchg_acquire(atomic_t *v, int new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_xchg_acquire(v, new); } /** * atomic_xchg_release() - atomic exchange with release ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with release ordering. 
* * Unsafe to use in noinstr code; use raw_atomic_xchg_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_xchg_release(atomic_t *v, int new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_xchg_release(v, new); } /** * atomic_xchg_relaxed() - atomic exchange with relaxed ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_xchg_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_xchg_relaxed(atomic_t *v, int new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_xchg_relaxed(v, new); } /** * atomic_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_cmpxchg() there. * * Return: The original value of @v. */ static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_cmpxchg(v, old, new); } /** * atomic_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_cmpxchg_acquire() there. * * Return: The original value of @v. */ static __always_inline int atomic_cmpxchg_acquire(atomic_t *v, int old, int new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_cmpxchg_acquire(v, old, new); } /** * atomic_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_cmpxchg_release() there. * * Return: The original value of @v. */ static __always_inline int atomic_cmpxchg_release(atomic_t *v, int old, int new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_cmpxchg_release(v, old, new); } /** * atomic_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_cmpxchg_relaxed() there. * * Return: The original value of @v. */ static __always_inline int atomic_cmpxchg_relaxed(atomic_t *v, int old, int new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_cmpxchg_relaxed(v, old, new); } /** * atomic_try_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. 
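 *
 * Illustrative sketch of the usual try_cmpxchg() loop (hypothetical
 * saturating increment): on failure @old is refreshed from @v, so the
 * loop needs no explicit re-read of the counter.
 *
 *	int old = atomic_read(v);
 *
 *	do {
 *		if (old == INT_MAX)
 *			break;			/* already saturated */
 *	} while (!atomic_try_cmpxchg(v, &old, old + 1));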
* * Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg() there. * * Return: @true if the exchange occurred, @false otherwise. */ static __always_inline bool atomic_try_cmpxchg(atomic_t *v, int *old, int new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); instrument_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg(v, old, new); } /** * atomic_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg_acquire() there. * * Return: @true if the exchange occurred, @false otherwise. */ static __always_inline bool atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg_acquire(v, old, new); } /** * atomic_try_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg_release() there. * * Return: @true if the exchange occurred, @false otherwise. */ static __always_inline bool atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); instrument_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg_release(v, old, new); } /** * atomic_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg_relaxed() there. * * Return: @true if the exchange occurred, @false otherwise. */ static __always_inline bool atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg_relaxed(v, old, new); } /** * atomic_sub_and_test() - atomic subtract and test if zero with full ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_sub_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic_sub_and_test(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_sub_and_test(i, v); } /** * atomic_dec_and_test() - atomic decrement and test if zero with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_dec_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. 
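 *
 * Illustrative sketch, with a hypothetical @obj carrying an embedded
 * reference count: the full ordering of dec_and_test() makes it safe to
 * tear the object down when the test succeeds.
 *
 *	if (atomic_dec_and_test(&obj->refs))
 *		kfree(obj);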
*/ static __always_inline bool atomic_dec_and_test(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_dec_and_test(v); } /** * atomic_inc_and_test() - atomic increment and test if zero with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_inc_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic_inc_and_test(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_inc_and_test(v); } /** * atomic_add_negative() - atomic add and test if negative with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_negative() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic_add_negative(int i, atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_negative(i, v); } /** * atomic_add_negative_acquire() - atomic add and test if negative with acquire ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_negative_acquire() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic_add_negative_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_negative_acquire(i, v); } /** * atomic_add_negative_release() - atomic add and test if negative with release ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_negative_release() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic_add_negative_release(int i, atomic_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_negative_release(i, v); } /** * atomic_add_negative_relaxed() - atomic add and test if negative with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_add_negative_relaxed() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic_add_negative_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_negative_relaxed(i, v); } /** * atomic_fetch_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_t * @a: int value to add * @u: int value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_fetch_add_unless() there. * * Return: The original value of @v. 
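 *
 * Illustrative sketch of a lookup-side reference grab (hypothetical @obj):
 * adding 1 unless the count is already 0 is the building block behind
 * atomic_inc_not_zero().
 *
 *	if (atomic_fetch_add_unless(&obj->refs, 1, 0) == 0)
 *		return NULL;	/* object is already on its way to being freed */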
*/ static __always_inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_fetch_add_unless(v, a, u); } /** * atomic_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_t * @a: int value to add * @u: int value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_add_unless() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic_add_unless(atomic_t *v, int a, int u) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_add_unless(v, a, u); } /** * atomic_inc_not_zero() - atomic increment unless zero with full ordering * @v: pointer to atomic_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_inc_not_zero() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic_inc_not_zero(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_inc_not_zero(v); } /** * atomic_inc_unless_negative() - atomic increment unless negative with full ordering * @v: pointer to atomic_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_inc_unless_negative() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic_inc_unless_negative(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_inc_unless_negative(v); } /** * atomic_dec_unless_positive() - atomic decrement unless positive with full ordering * @v: pointer to atomic_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_dec_unless_positive() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic_dec_unless_positive(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_dec_unless_positive(v); } /** * atomic_dec_if_positive() - atomic decrement if positive with full ordering * @v: pointer to atomic_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_dec_if_positive() there. * * Return: The old value of (@v - 1), regardless of whether @v was updated. */ static __always_inline int atomic_dec_if_positive(atomic_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_dec_if_positive(v); } /** * atomic64_read() - atomic load with relaxed ordering * @v: pointer to atomic64_t * * Atomically loads the value of @v with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_read() there. * * Return: The value loaded from @v. */ static __always_inline s64 atomic64_read(const atomic64_t *v) { instrument_atomic_read(v, sizeof(*v)); return raw_atomic64_read(v); } /** * atomic64_read_acquire() - atomic load with acquire ordering * @v: pointer to atomic64_t * * Atomically loads the value of @v with acquire ordering. 
* * Unsafe to use in noinstr code; use raw_atomic64_read_acquire() there. * * Return: The value loaded from @v. */ static __always_inline s64 atomic64_read_acquire(const atomic64_t *v) { instrument_atomic_read(v, sizeof(*v)); return raw_atomic64_read_acquire(v); } /** * atomic64_set() - atomic set with relaxed ordering * @v: pointer to atomic64_t * @i: s64 value to assign * * Atomically sets @v to @i with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_set() there. * * Return: Nothing. */ static __always_inline void atomic64_set(atomic64_t *v, s64 i) { instrument_atomic_write(v, sizeof(*v)); raw_atomic64_set(v, i); } /** * atomic64_set_release() - atomic set with release ordering * @v: pointer to atomic64_t * @i: s64 value to assign * * Atomically sets @v to @i with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_set_release() there. * * Return: Nothing. */ static __always_inline void atomic64_set_release(atomic64_t *v, s64 i) { kcsan_release(); instrument_atomic_write(v, sizeof(*v)); raw_atomic64_set_release(v, i); } /** * atomic64_add() - atomic add with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add() there. * * Return: Nothing. */ static __always_inline void atomic64_add(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_add(i, v); } /** * atomic64_add_return() - atomic add with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_return() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_add_return(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_return(i, v); } /** * atomic64_add_return_acquire() - atomic add with acquire ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_add_return_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_return_acquire(i, v); } /** * atomic64_add_return_release() - atomic add with release ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_return_release() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_add_return_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_return_release(i, v); } /** * atomic64_add_return_relaxed() - atomic add with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_add_return_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_return_relaxed(i, v); } /** * atomic64_fetch_add() - atomic add with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with full ordering. 
* * Unsafe to use in noinstr code; use raw_atomic64_fetch_add() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_add(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_add(i, v); } /** * atomic64_fetch_add_acquire() - atomic add with acquire ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_add_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_add_acquire(i, v); } /** * atomic64_fetch_add_release() - atomic add with release ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_add_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_add_release(i, v); } /** * atomic64_fetch_add_relaxed() - atomic add with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_add_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_add_relaxed(i, v); } /** * atomic64_sub() - atomic subtract with relaxed ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_sub() there. * * Return: Nothing. */ static __always_inline void atomic64_sub(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_sub(i, v); } /** * atomic64_sub_return() - atomic subtract with full ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_sub_return() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_sub_return(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_sub_return(i, v); } /** * atomic64_sub_return_acquire() - atomic subtract with acquire ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_sub_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_sub_return_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_sub_return_acquire(i, v); } /** * atomic64_sub_return_release() - atomic subtract with release ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_sub_return_release() there. * * Return: The updated value of @v. 
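 *
 * Illustrative sketch, with a hypothetical @bytes_in_flight counter and
 * wake_up_consumer() helper: the *_return form hands back the updated
 * count, so the caller can act on the remaining amount in one step.
 *
 *	if (atomic64_sub_return_release(len, &bytes_in_flight) == 0)
 *		wake_up_consumer();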
*/ static __always_inline s64 atomic64_sub_return_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_sub_return_release(i, v); } /** * atomic64_sub_return_relaxed() - atomic subtract with relaxed ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_sub_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_sub_return_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_sub_return_relaxed(i, v); } /** * atomic64_fetch_sub() - atomic subtract with full ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_sub(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_sub(i, v); } /** * atomic64_fetch_sub_acquire() - atomic subtract with acquire ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_sub_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_sub_acquire(i, v); } /** * atomic64_fetch_sub_release() - atomic subtract with release ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_sub_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_sub_release(i, v); } /** * atomic64_fetch_sub_relaxed() - atomic subtract with relaxed ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_sub_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_sub_relaxed(i, v); } /** * atomic64_inc() - atomic increment with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_inc() there. * * Return: Nothing. */ static __always_inline void atomic64_inc(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_inc(v); } /** * atomic64_inc_return() - atomic increment with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_inc_return() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_inc_return(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_inc_return(v); } /** * atomic64_inc_return_acquire() - atomic increment with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with acquire ordering. 
* * Unsafe to use in noinstr code; use raw_atomic64_inc_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_inc_return_acquire(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_inc_return_acquire(v); } /** * atomic64_inc_return_release() - atomic increment with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_inc_return_release() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_inc_return_release(atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_inc_return_release(v); } /** * atomic64_inc_return_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_inc_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_inc_return_relaxed(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_inc_return_relaxed(v); } /** * atomic64_fetch_inc() - atomic increment with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_inc() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_inc(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_inc(v); } /** * atomic64_fetch_inc_acquire() - atomic increment with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_inc_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_inc_acquire(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_inc_acquire(v); } /** * atomic64_fetch_inc_release() - atomic increment with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_inc_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_inc_release(atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_inc_release(v); } /** * atomic64_fetch_inc_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_inc_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_inc_relaxed(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_inc_relaxed(v); } /** * atomic64_dec() - atomic decrement with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_dec() there. * * Return: Nothing. */ static __always_inline void atomic64_dec(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_dec(v); } /** * atomic64_dec_return() - atomic decrement with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with full ordering. 
* * Unsafe to use in noinstr code; use raw_atomic64_dec_return() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_dec_return(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_dec_return(v); } /** * atomic64_dec_return_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_dec_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_dec_return_acquire(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_dec_return_acquire(v); } /** * atomic64_dec_return_release() - atomic decrement with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_dec_return_release() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_dec_return_release(atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_dec_return_release(v); } /** * atomic64_dec_return_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_dec_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline s64 atomic64_dec_return_relaxed(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_dec_return_relaxed(v); } /** * atomic64_fetch_dec() - atomic decrement with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_dec() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_dec(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_dec(v); } /** * atomic64_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_dec_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_dec_acquire(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_dec_acquire(v); } /** * atomic64_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_dec_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_dec_release(atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_dec_release(v); } /** * atomic64_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_dec_relaxed() there. * * Return: The original value of @v. 
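 *
 * Illustrative sketch, with a hypothetical @credits counter: the
 * atomic64_*() ops mirror their atomic_*() counterparts but operate on
 * s64, so values may exceed the range of a 32-bit int.
 *
 *	s64 before = atomic64_fetch_dec_relaxed(&credits);	/* old value */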
*/ static __always_inline s64 atomic64_fetch_dec_relaxed(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_dec_relaxed(v); } /** * atomic64_and() - atomic bitwise AND with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_and() there. * * Return: Nothing. */ static __always_inline void atomic64_and(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_and(i, v); } /** * atomic64_fetch_and() - atomic bitwise AND with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_and() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_and(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_and(i, v); } /** * atomic64_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_and_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_and_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_and_acquire(i, v); } /** * atomic64_fetch_and_release() - atomic bitwise AND with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_and_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_and_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_and_release(i, v); } /** * atomic64_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_and_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_and_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_and_relaxed(i, v); } /** * atomic64_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_andnot() there. * * Return: Nothing. */ static __always_inline void atomic64_andnot(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_andnot(i, v); } /** * atomic64_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_andnot() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_andnot(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_andnot(i, v); } /** * atomic64_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. 
* * Unsafe to use in noinstr code; use raw_atomic64_fetch_andnot_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_andnot_acquire(i, v); } /** * atomic64_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_andnot_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_andnot_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_andnot_release(i, v); } /** * atomic64_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_andnot_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_andnot_relaxed(i, v); } /** * atomic64_or() - atomic bitwise OR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_or() there. * * Return: Nothing. */ static __always_inline void atomic64_or(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_or(i, v); } /** * atomic64_fetch_or() - atomic bitwise OR with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_or() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_or(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_or(i, v); } /** * atomic64_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_or_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_or_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_or_acquire(i, v); } /** * atomic64_fetch_or_release() - atomic bitwise OR with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_or_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_or_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_or_release(i, v); } /** * atomic64_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_or_relaxed() there. * * Return: The original value of @v. 
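 *
 * Illustrative sketch, with a hypothetical 64-bit @seen_mask (assuming
 * fewer than 64 entries are tracked): fetch_or() sets new bits while
 * reporting which bits were already set.
 *
 *	if (!(atomic64_fetch_or_relaxed(BIT_ULL(cpu), &seen_mask) & BIT_ULL(cpu))) {
 *		/* first time this CPU was recorded */
 *	}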
*/ static __always_inline s64 atomic64_fetch_or_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_or_relaxed(i, v); } /** * atomic64_xor() - atomic bitwise XOR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_xor() there. * * Return: Nothing. */ static __always_inline void atomic64_xor(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic64_xor(i, v); } /** * atomic64_fetch_xor() - atomic bitwise XOR with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_xor() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_xor(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_xor(i, v); } /** * atomic64_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_xor_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_xor_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_xor_acquire(i, v); } /** * atomic64_fetch_xor_release() - atomic bitwise XOR with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_xor_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_xor_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_xor_release(i, v); } /** * atomic64_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_xor_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_xor_relaxed(i, v); } /** * atomic64_xchg() - atomic exchange with full ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_xchg() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_xchg(atomic64_t *v, s64 new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_xchg(v, new); } /** * atomic64_xchg_acquire() - atomic exchange with acquire ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_xchg_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_xchg_acquire(atomic64_t *v, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_xchg_acquire(v, new); } /** * atomic64_xchg_release() - atomic exchange with release ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with release ordering. 
* * Unsafe to use in noinstr code; use raw_atomic64_xchg_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_xchg_release(atomic64_t *v, s64 new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_xchg_release(v, new); } /** * atomic64_xchg_relaxed() - atomic exchange with relaxed ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_xchg_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_xchg_relaxed(atomic64_t *v, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_xchg_relaxed(v, new); } /** * atomic64_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_cmpxchg(v, old, new); } /** * atomic64_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg_acquire() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_cmpxchg_acquire(v, old, new); } /** * atomic64_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg_release() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_cmpxchg_release(v, old, new); } /** * atomic64_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg_relaxed() there. * * Return: The original value of @v. */ static __always_inline s64 atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_cmpxchg_relaxed(v, old, new); } /** * atomic64_try_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. 
* Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg() there. * * Return: @true if the exchange occurred, @false otherwise. */ static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); instrument_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg(v, old, new); } /** * atomic64_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg_acquire() there. * * Return: @true if the exchange occurred, @false otherwise. */ static __always_inline bool atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg_acquire(v, old, new); } /** * atomic64_try_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg_release() there. * * Return: @true if the exchange occurred, @false otherwise. */ static __always_inline bool atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); instrument_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg_release(v, old, new); } /** * atomic64_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg_relaxed() there. * * Return: @true if the exchange occurred, @false otherwise. */ static __always_inline bool atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg_relaxed(v, old, new); } /** * atomic64_sub_and_test() - atomic subtract and test if zero with full ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_sub_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic64_sub_and_test(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_sub_and_test(i, v); } /** * atomic64_dec_and_test() - atomic decrement and test if zero with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_dec_and_test() there. 
* * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic64_dec_and_test(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_dec_and_test(v); } /** * atomic64_inc_and_test() - atomic increment and test if zero with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_inc_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic64_inc_and_test(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_inc_and_test(v); } /** * atomic64_add_negative() - atomic add and test if negative with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_negative() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic64_add_negative(s64 i, atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_negative(i, v); } /** * atomic64_add_negative_acquire() - atomic add and test if negative with acquire ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_negative_acquire() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic64_add_negative_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_negative_acquire(i, v); } /** * atomic64_add_negative_release() - atomic add and test if negative with release ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_negative_release() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic64_add_negative_release(s64 i, atomic64_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_negative_release(i, v); } /** * atomic64_add_negative_relaxed() - atomic add and test if negative with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic64_add_negative_relaxed() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic64_add_negative_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_negative_relaxed(i, v); } /** * atomic64_fetch_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic64_t * @a: s64 value to add * @u: s64 value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_unless() there. * * Return: The original value of @v. 
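 *
 * Hedged usage sketch (illustration only, not part of the generated
 * header): a saturating 64-bit event counter built on the semantics
 * documented above; "events" is a hypothetical atomic64_t.
 *
 *	if (atomic64_fetch_add_unless(&events, 1, S64_MAX) == S64_MAX)
 *		pr_warn_once("event counter saturated\n");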
*/ static __always_inline s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_fetch_add_unless(v, a, u); } /** * atomic64_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic64_t * @a: s64 value to add * @u: s64 value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_add_unless() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic64_add_unless(atomic64_t *v, s64 a, s64 u) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_add_unless(v, a, u); } /** * atomic64_inc_not_zero() - atomic increment unless zero with full ordering * @v: pointer to atomic64_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_inc_not_zero() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic64_inc_not_zero(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_inc_not_zero(v); } /** * atomic64_inc_unless_negative() - atomic increment unless negative with full ordering * @v: pointer to atomic64_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_inc_unless_negative() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic64_inc_unless_negative(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_inc_unless_negative(v); } /** * atomic64_dec_unless_positive() - atomic decrement unless positive with full ordering * @v: pointer to atomic64_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_dec_unless_positive() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic64_dec_unless_positive(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_dec_unless_positive(v); } /** * atomic64_dec_if_positive() - atomic decrement if positive with full ordering * @v: pointer to atomic64_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_dec_if_positive() there. * * Return: The old value of (@v - 1), regardless of whether @v was updated. */ static __always_inline s64 atomic64_dec_if_positive(atomic64_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic64_dec_if_positive(v); } /** * atomic_long_read() - atomic load with relaxed ordering * @v: pointer to atomic_long_t * * Atomically loads the value of @v with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_read() there. * * Return: The value loaded from @v. 
*/ static __always_inline long atomic_long_read(const atomic_long_t *v) { instrument_atomic_read(v, sizeof(*v)); return raw_atomic_long_read(v); } /** * atomic_long_read_acquire() - atomic load with acquire ordering * @v: pointer to atomic_long_t * * Atomically loads the value of @v with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_read_acquire() there. * * Return: The value loaded from @v. */ static __always_inline long atomic_long_read_acquire(const atomic_long_t *v) { instrument_atomic_read(v, sizeof(*v)); return raw_atomic_long_read_acquire(v); } /** * atomic_long_set() - atomic set with relaxed ordering * @v: pointer to atomic_long_t * @i: long value to assign * * Atomically sets @v to @i with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_set() there. * * Return: Nothing. */ static __always_inline void atomic_long_set(atomic_long_t *v, long i) { instrument_atomic_write(v, sizeof(*v)); raw_atomic_long_set(v, i); } /** * atomic_long_set_release() - atomic set with release ordering * @v: pointer to atomic_long_t * @i: long value to assign * * Atomically sets @v to @i with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_set_release() there. * * Return: Nothing. */ static __always_inline void atomic_long_set_release(atomic_long_t *v, long i) { kcsan_release(); instrument_atomic_write(v, sizeof(*v)); raw_atomic_long_set_release(v, i); } /** * atomic_long_add() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add() there. * * Return: Nothing. */ static __always_inline void atomic_long_add(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_add(i, v); } /** * atomic_long_add_return() - atomic add with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_return() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_add_return(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_return(i, v); } /** * atomic_long_add_return_acquire() - atomic add with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_add_return_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_return_acquire(i, v); } /** * atomic_long_add_return_release() - atomic add with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_return_release() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_add_return_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_return_release(i, v); } /** * atomic_long_add_return_relaxed() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. 
* * Unsafe to use in noinstr code; use raw_atomic_long_add_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_add_return_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_return_relaxed(i, v); } /** * atomic_long_fetch_add() - atomic add with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_add(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_add(i, v); } /** * atomic_long_fetch_add_acquire() - atomic add with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_add_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_add_acquire(i, v); } /** * atomic_long_fetch_add_release() - atomic add with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_add_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_add_release(i, v); } /** * atomic_long_fetch_add_relaxed() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_add_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_add_relaxed(i, v); } /** * atomic_long_sub() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_sub() there. * * Return: Nothing. */ static __always_inline void atomic_long_sub(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_sub(i, v); } /** * atomic_long_sub_return() - atomic subtract with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_sub_return() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_sub_return(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_sub_return(i, v); } /** * atomic_long_sub_return_acquire() - atomic subtract with acquire ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_sub_return_acquire() there. * * Return: The updated value of @v. 
*/ static __always_inline long atomic_long_sub_return_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_sub_return_acquire(i, v); } /** * atomic_long_sub_return_release() - atomic subtract with release ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_sub_return_release() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_sub_return_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_sub_return_release(i, v); } /** * atomic_long_sub_return_relaxed() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_sub_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_sub_return_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_sub_return_relaxed(i, v); } /** * atomic_long_fetch_sub() - atomic subtract with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_sub() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_sub(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_sub(i, v); } /** * atomic_long_fetch_sub_acquire() - atomic subtract with acquire ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_sub_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_sub_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_sub_acquire(i, v); } /** * atomic_long_fetch_sub_release() - atomic subtract with release ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_sub_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_sub_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_sub_release(i, v); } /** * atomic_long_fetch_sub_relaxed() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_sub_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_sub_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_sub_relaxed(i, v); } /** * atomic_long_inc() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_inc() there. * * Return: Nothing. 
*/ static __always_inline void atomic_long_inc(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_inc(v); } /** * atomic_long_inc_return() - atomic increment with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_return() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_inc_return(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_return(v); } /** * atomic_long_inc_return_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_inc_return_acquire(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_return_acquire(v); } /** * atomic_long_inc_return_release() - atomic increment with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_return_release() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_inc_return_release(atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_return_release(v); } /** * atomic_long_inc_return_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_inc_return_relaxed(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_return_relaxed(v); } /** * atomic_long_fetch_inc() - atomic increment with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_inc() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_inc(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_inc(v); } /** * atomic_long_fetch_inc_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_inc_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_inc_acquire(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_inc_acquire(v); } /** * atomic_long_fetch_inc_release() - atomic increment with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_inc_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_inc_release(atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_inc_release(v); } /** * atomic_long_fetch_inc_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. 
* * Unsafe to use in noinstr code; use raw_atomic_long_fetch_inc_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_inc_relaxed(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_inc_relaxed(v); } /** * atomic_long_dec() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_dec() there. * * Return: Nothing. */ static __always_inline void atomic_long_dec(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_dec(v); } /** * atomic_long_dec_return() - atomic decrement with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_return() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_dec_return(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_return(v); } /** * atomic_long_dec_return_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_return_acquire() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_dec_return_acquire(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_return_acquire(v); } /** * atomic_long_dec_return_release() - atomic decrement with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_return_release() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_dec_return_release(atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_return_release(v); } /** * atomic_long_dec_return_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_return_relaxed() there. * * Return: The updated value of @v. */ static __always_inline long atomic_long_dec_return_relaxed(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_return_relaxed(v); } /** * atomic_long_fetch_dec() - atomic decrement with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_dec() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_dec(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_dec(v); } /** * atomic_long_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_dec_acquire() there. * * Return: The original value of @v. 
*/ static __always_inline long atomic_long_fetch_dec_acquire(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_dec_acquire(v); } /** * atomic_long_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_dec_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_dec_release(atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_dec_release(v); } /** * atomic_long_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_dec_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_dec_relaxed(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_dec_relaxed(v); } /** * atomic_long_and() - atomic bitwise AND with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_and() there. * * Return: Nothing. */ static __always_inline void atomic_long_and(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_and(i, v); } /** * atomic_long_fetch_and() - atomic bitwise AND with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_and() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_and(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_and(i, v); } /** * atomic_long_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_and_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_and_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_and_acquire(i, v); } /** * atomic_long_fetch_and_release() - atomic bitwise AND with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_and_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_and_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_and_release(i, v); } /** * atomic_long_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_and_relaxed() there. * * Return: The original value of @v. 
*/ static __always_inline long atomic_long_fetch_and_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_and_relaxed(i, v); } /** * atomic_long_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_andnot() there. * * Return: Nothing. */ static __always_inline void atomic_long_andnot(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_andnot(i, v); } /** * atomic_long_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_andnot() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_andnot(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_andnot(i, v); } /** * atomic_long_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_andnot_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_andnot_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_andnot_acquire(i, v); } /** * atomic_long_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_andnot_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_andnot_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_andnot_release(i, v); } /** * atomic_long_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_andnot_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_andnot_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_andnot_relaxed(i, v); } /** * atomic_long_or() - atomic bitwise OR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_or() there. * * Return: Nothing. */ static __always_inline void atomic_long_or(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_or(i, v); } /** * atomic_long_fetch_or() - atomic bitwise OR with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_or() there. * * Return: The original value of @v. 
*/ static __always_inline long atomic_long_fetch_or(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_or(i, v); } /** * atomic_long_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_or_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_or_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_or_acquire(i, v); } /** * atomic_long_fetch_or_release() - atomic bitwise OR with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_or_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_or_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_or_release(i, v); } /** * atomic_long_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_or_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_or_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_or_relaxed(i, v); } /** * atomic_long_xor() - atomic bitwise XOR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_xor() there. * * Return: Nothing. */ static __always_inline void atomic_long_xor(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); raw_atomic_long_xor(i, v); } /** * atomic_long_fetch_xor() - atomic bitwise XOR with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_xor() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_xor(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_xor(i, v); } /** * atomic_long_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_xor_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_xor_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_xor_acquire(i, v); } /** * atomic_long_fetch_xor_release() - atomic bitwise XOR with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_xor_release() there. * * Return: The original value of @v. 
*/ static __always_inline long atomic_long_fetch_xor_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_xor_release(i, v); } /** * atomic_long_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_xor_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_xor_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_xor_relaxed(i, v); } /** * atomic_long_xchg() - atomic exchange with full ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_xchg() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_xchg(atomic_long_t *v, long new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_xchg(v, new); } /** * atomic_long_xchg_acquire() - atomic exchange with acquire ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_xchg_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_xchg_acquire(atomic_long_t *v, long new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_xchg_acquire(v, new); } /** * atomic_long_xchg_release() - atomic exchange with release ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_xchg_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_xchg_release(atomic_long_t *v, long new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_xchg_release(v, new); } /** * atomic_long_xchg_relaxed() - atomic exchange with relaxed ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_xchg_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_xchg_relaxed(atomic_long_t *v, long new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_xchg_relaxed(v, new); } /** * atomic_long_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_cmpxchg(atomic_long_t *v, long old, long new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_cmpxchg(v, old, new); } /** * atomic_long_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. 
* Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg_acquire() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_cmpxchg_acquire(atomic_long_t *v, long old, long new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_cmpxchg_acquire(v, old, new); } /** * atomic_long_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg_release() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_cmpxchg_release(atomic_long_t *v, long old, long new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_cmpxchg_release(v, old, new); } /** * atomic_long_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg_relaxed() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_cmpxchg_relaxed(atomic_long_t *v, long old, long new) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_cmpxchg_relaxed(v, old, new); } /** * atomic_long_try_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg() there. * * Return: @true if the exchange occurred, @false otherwise. */ static __always_inline bool atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); instrument_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg(v, old, new); } /** * atomic_long_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg_acquire() there. * * Return: @true if the exchange occurred, @false otherwise. */ static __always_inline bool atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg_acquire(v, old, new); } /** * atomic_long_try_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. 
* Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg_release() there. * * Return: @true if the exchange occurred, @false otherwise. */ static __always_inline bool atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); instrument_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg_release(v, old, new); } /** * atomic_long_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg_relaxed() there. * * Return: @true if the exchange occurred, @false otherwise. */ static __always_inline bool atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg_relaxed(v, old, new); } /** * atomic_long_sub_and_test() - atomic subtract and test if zero with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_sub_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic_long_sub_and_test(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_sub_and_test(i, v); } /** * atomic_long_dec_and_test() - atomic decrement and test if zero with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic_long_dec_and_test(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_and_test(v); } /** * atomic_long_inc_and_test() - atomic increment and test if zero with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_and_test() there. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool atomic_long_inc_and_test(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_and_test(v); } /** * atomic_long_add_negative() - atomic add and test if negative with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_negative() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. 
*/ static __always_inline bool atomic_long_add_negative(long i, atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_negative(i, v); } /** * atomic_long_add_negative_acquire() - atomic add and test if negative with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_negative_acquire() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic_long_add_negative_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_negative_acquire(i, v); } /** * atomic_long_add_negative_release() - atomic add and test if negative with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_negative_release() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic_long_add_negative_release(long i, atomic_long_t *v) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_negative_release(i, v); } /** * atomic_long_add_negative_relaxed() - atomic add and test if negative with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Unsafe to use in noinstr code; use raw_atomic_long_add_negative_relaxed() there. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool atomic_long_add_negative_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_negative_relaxed(i, v); } /** * atomic_long_fetch_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_long_t * @a: long value to add * @u: long value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add_unless() there. * * Return: The original value of @v. */ static __always_inline long atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_fetch_add_unless(v, a, u); } /** * atomic_long_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_long_t * @a: long value to add * @u: long value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_add_unless() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic_long_add_unless(atomic_long_t *v, long a, long u) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_add_unless(v, a, u); } /** * atomic_long_inc_not_zero() - atomic increment unless zero with full ordering * @v: pointer to atomic_long_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_not_zero() there. * * Return: @true if @v was updated, @false otherwise. 
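 *
 * Hedged usage sketch (illustration only, not part of the generated
 * header): the usual "take a reference only if the object is still live"
 * pattern; "obj->refs" is a hypothetical atomic_long_t refcount.
 *
 *	if (!atomic_long_inc_not_zero(&obj->refs))
 *		return NULL;	// object already released; do not touch it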
*/ static __always_inline bool atomic_long_inc_not_zero(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_not_zero(v); } /** * atomic_long_inc_unless_negative() - atomic increment unless negative with full ordering * @v: pointer to atomic_long_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_unless_negative() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic_long_inc_unless_negative(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_inc_unless_negative(v); } /** * atomic_long_dec_unless_positive() - atomic decrement unless positive with full ordering * @v: pointer to atomic_long_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_unless_positive() there. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool atomic_long_dec_unless_positive(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_unless_positive(v); } /** * atomic_long_dec_if_positive() - atomic decrement if positive with full ordering * @v: pointer to atomic_long_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_if_positive() there. * * Return: The old value of (@v - 1), regardless of whether @v was updated. */ static __always_inline long atomic_long_dec_if_positive(atomic_long_t *v) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return raw_atomic_long_dec_if_positive(v); } #define xchg(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_xchg(__ai_ptr, __VA_ARGS__); \ }) #define xchg_acquire(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_xchg_acquire(__ai_ptr, __VA_ARGS__); \ }) #define xchg_release(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_xchg_release(__ai_ptr, __VA_ARGS__); \ }) #define xchg_relaxed(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_xchg_relaxed(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg_acquire(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg_acquire(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg_release(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg_release(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg_relaxed(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg_relaxed(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg64(ptr, ...) 
\ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg64(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg64_acquire(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg64_acquire(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg64_release(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg64_release(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg64_relaxed(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg64_relaxed(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg128(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg128(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg128_acquire(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg128_acquire(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg128_release(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg128_release(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg128_relaxed(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg128_relaxed(__ai_ptr, __VA_ARGS__); \ }) #define try_cmpxchg(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg_acquire(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg_release(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg_relaxed(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg64(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg64(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg64_acquire(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg64_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg64_release(ptr, oldp, ...) 
\ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg64_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg64_relaxed(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg64_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg128(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg128(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg128_acquire(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg128_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg128_release(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ kcsan_release(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg128_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg128_relaxed(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg128_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define cmpxchg_local(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg_local(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg64_local(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg64_local(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg128_local(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_cmpxchg128_local(__ai_ptr, __VA_ARGS__); \ }) #define sync_cmpxchg(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ kcsan_mb(); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ raw_sync_cmpxchg(__ai_ptr, __VA_ARGS__); \ }) #define try_cmpxchg_local(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg64_local(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg64_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg128_local(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \ raw_try_cmpxchg128_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define sync_try_cmpxchg(ptr, ...) 
	\
({ \
	typeof(ptr) __ai_ptr = (ptr); \
	kcsan_mb(); \
	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
	raw_sync_try_cmpxchg(__ai_ptr, __VA_ARGS__); \
})

#endif /* _LINUX_ATOMIC_INSTRUMENTED_H */
// 9dd948d3012b22c4e75933a5172983f912e46439
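For orientation, a brief usage sketch (not part of the generated header above; the identifiers example_flags and example_set_bits are invented for illustration): the instrumented wrappers keep the calling convention of the raw_ primitives, so the usual compare-and-exchange retry loop is written exactly as with the raw forms, and the KCSAN/KASAN instrumentation happens inside the wrapper rather than at the call site.

static unsigned long example_flags;

static void example_set_bits(unsigned long mask)
{
	unsigned long old = READ_ONCE(example_flags);

	/*
	 * try_cmpxchg() returns true on success; on failure it refreshes
	 * 'old' with the current value of the target, so the loop simply
	 * retries with the updated expectation.
	 */
	while (!try_cmpxchg(&example_flags, &old, old | mask))
		;
}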
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM mmap_lock

#if !defined(_TRACE_MMAP_LOCK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_MMAP_LOCK_H

#include <linux/memcontrol.h>
#include <linux/tracepoint.h>
#include <linux/types.h>

struct mm_struct;

DECLARE_EVENT_CLASS(mmap_lock,

	TP_PROTO(struct mm_struct *mm, bool write),

	TP_ARGS(mm, write),

	TP_STRUCT__entry(
		__field(struct mm_struct *, mm)
		__field(u64, memcg_id)
		__field(bool, write)
	),

	TP_fast_assign(
		__entry->mm = mm;
		__entry->memcg_id = cgroup_id_from_mm(mm);
		__entry->write = write;
	),

	TP_printk(
		"mm=%p memcg_id=%llu write=%s",
		__entry->mm, __entry->memcg_id,
		__entry->write ? "true" : "false"
	)
);

#define DEFINE_MMAP_LOCK_EVENT(name)				\
	DEFINE_EVENT(mmap_lock, name,				\
		TP_PROTO(struct mm_struct *mm, bool write),	\
		TP_ARGS(mm, write))

DEFINE_MMAP_LOCK_EVENT(mmap_lock_start_locking);
DEFINE_MMAP_LOCK_EVENT(mmap_lock_released);

TRACE_EVENT(mmap_lock_acquire_returned,

	TP_PROTO(struct mm_struct *mm, bool write, bool success),

	TP_ARGS(mm, write, success),

	TP_STRUCT__entry(
		__field(struct mm_struct *, mm)
		__field(u64, memcg_id)
		__field(bool, write)
		__field(bool, success)
	),

	TP_fast_assign(
		__entry->mm = mm;
		__entry->memcg_id = cgroup_id_from_mm(mm);
		__entry->write = write;
		__entry->success = success;
	),

	TP_printk(
		"mm=%p memcg_id=%llu write=%s success=%s",
		__entry->mm, __entry->memcg_id,
		__entry->write ? "true" : "false",
		__entry->success ? "true" : "false"
	)
);

#endif /* _TRACE_MMAP_LOCK_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
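As a rough, hedged sketch of how these events are meant to be consumed (this is not the kernel's actual wrapper, which lives behind the mmap_lock.h helpers; example_mmap_write_lock is a made-up name), a write-side acquisition would bracket the lock roughly like this:

#include <linux/mmap_lock.h>

static inline void example_mmap_write_lock(struct mm_struct *mm)
{
	trace_mmap_lock_start_locking(mm, true);	/* write == true */
	mmap_write_lock(mm);
	/* write == true, success == true: the blocking variant cannot fail */
	trace_mmap_lock_acquire_returned(mm, true, true);
}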
// SPDX-License-Identifier: GPL-2.0-only
/*
 * IPv6 library code, needed by static components when full IPv6 support is
 * not configured or static.
 */

#include <linux/export.h>
#include <net/ipv6.h>
#include <net/addrconf.h>
#include <net/ip.h>

/* If the ipv6 module registers this function, it is used by xfrm to force
 * all sockets to re-look up their nodes - this is fairly expensive, be
 * careful.
 */
void (*__fib6_flush_trees)(struct net *);
EXPORT_SYMBOL(__fib6_flush_trees);

#define IPV6_ADDR_SCOPE_TYPE(scope)	((scope) << 16)

static inline unsigned int ipv6_addr_scope2type(unsigned int scope)
{
	switch (scope) {
	case IPV6_ADDR_SCOPE_NODELOCAL:
		return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_NODELOCAL) |
			IPV6_ADDR_LOOPBACK);
	case IPV6_ADDR_SCOPE_LINKLOCAL:
		return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL) |
			IPV6_ADDR_LINKLOCAL);
	case IPV6_ADDR_SCOPE_SITELOCAL:
		return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL) |
			IPV6_ADDR_SITELOCAL);
	}
	return IPV6_ADDR_SCOPE_TYPE(scope);
}

int __ipv6_addr_type(const struct in6_addr *addr)
{
	__be32 st;

	st = addr->s6_addr32[0];

	/* Consider all addresses with the first three bits different from
	 * 000 and 111 as unicasts.
*/ if ((st & htonl(0xE0000000)) != htonl(0x00000000) && (st & htonl(0xE0000000)) != htonl(0xE0000000)) return (IPV6_ADDR_UNICAST | IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); if ((st & htonl(0xFF000000)) == htonl(0xFF000000)) { /* multicast */ /* addr-select 3.1 */ return (IPV6_ADDR_MULTICAST | ipv6_addr_scope2type(IPV6_ADDR_MC_SCOPE(addr))); } if ((st & htonl(0xFFC00000)) == htonl(0xFE800000)) return (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST | IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.1 */ if ((st & htonl(0xFFC00000)) == htonl(0xFEC00000)) return (IPV6_ADDR_SITELOCAL | IPV6_ADDR_UNICAST | IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL)); /* addr-select 3.1 */ if ((st & htonl(0xFE000000)) == htonl(0xFC000000)) return (IPV6_ADDR_UNICAST | IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* RFC 4193 */ if ((addr->s6_addr32[0] | addr->s6_addr32[1]) == 0) { if (addr->s6_addr32[2] == 0) { if (addr->s6_addr32[3] == 0) return IPV6_ADDR_ANY; if (addr->s6_addr32[3] == htonl(0x00000001)) return (IPV6_ADDR_LOOPBACK | IPV6_ADDR_UNICAST | IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.4 */ return (IPV6_ADDR_COMPATv4 | IPV6_ADDR_UNICAST | IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */ } if (addr->s6_addr32[2] == htonl(0x0000ffff)) return (IPV6_ADDR_MAPPED | IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */ } return (IPV6_ADDR_UNICAST | IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.4 */ } EXPORT_SYMBOL(__ipv6_addr_type); static ATOMIC_NOTIFIER_HEAD(inet6addr_chain); static BLOCKING_NOTIFIER_HEAD(inet6addr_validator_chain); int register_inet6addr_notifier(struct notifier_block *nb) { return atomic_notifier_chain_register(&inet6addr_chain, nb); } EXPORT_SYMBOL(register_inet6addr_notifier); int unregister_inet6addr_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(&inet6addr_chain, nb); } EXPORT_SYMBOL(unregister_inet6addr_notifier); int inet6addr_notifier_call_chain(unsigned long val, void *v) { return atomic_notifier_call_chain(&inet6addr_chain, val, v); } EXPORT_SYMBOL(inet6addr_notifier_call_chain); int register_inet6addr_validator_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&inet6addr_validator_chain, nb); } EXPORT_SYMBOL(register_inet6addr_validator_notifier); int unregister_inet6addr_validator_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&inet6addr_validator_chain, nb); } EXPORT_SYMBOL(unregister_inet6addr_validator_notifier); int inet6addr_validator_notifier_call_chain(unsigned long val, void *v) { return blocking_notifier_call_chain(&inet6addr_validator_chain, val, v); } EXPORT_SYMBOL(inet6addr_validator_notifier_call_chain); /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ const struct in6_addr in6addr_loopback __aligned(BITS_PER_LONG/8) = IN6ADDR_LOOPBACK_INIT; EXPORT_SYMBOL(in6addr_loopback); const struct in6_addr in6addr_any __aligned(BITS_PER_LONG/8) = IN6ADDR_ANY_INIT; EXPORT_SYMBOL(in6addr_any); const struct in6_addr in6addr_linklocal_allnodes __aligned(BITS_PER_LONG/8) = IN6ADDR_LINKLOCAL_ALLNODES_INIT; EXPORT_SYMBOL(in6addr_linklocal_allnodes); const struct in6_addr in6addr_linklocal_allrouters __aligned(BITS_PER_LONG/8) = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; EXPORT_SYMBOL(in6addr_linklocal_allrouters); const struct in6_addr in6addr_interfacelocal_allnodes __aligned(BITS_PER_LONG/8) = IN6ADDR_INTERFACELOCAL_ALLNODES_INIT; EXPORT_SYMBOL(in6addr_interfacelocal_allnodes); const 
struct in6_addr in6addr_interfacelocal_allrouters __aligned(BITS_PER_LONG/8) = IN6ADDR_INTERFACELOCAL_ALLROUTERS_INIT; EXPORT_SYMBOL(in6addr_interfacelocal_allrouters); const struct in6_addr in6addr_sitelocal_allrouters __aligned(BITS_PER_LONG/8) = IN6ADDR_SITELOCAL_ALLROUTERS_INIT; EXPORT_SYMBOL(in6addr_sitelocal_allrouters); static void snmp6_free_dev(struct inet6_dev *idev) { kfree(idev->stats.icmpv6msgdev); kfree(idev->stats.icmpv6dev); free_percpu(idev->stats.ipv6); } static void in6_dev_finish_destroy_rcu(struct rcu_head *head) { struct inet6_dev *idev = container_of(head, struct inet6_dev, rcu); snmp6_free_dev(idev); kfree(idev); } /* Nobody refers to this device, we may destroy it. */ void in6_dev_finish_destroy(struct inet6_dev *idev) { struct net_device *dev = idev->dev; WARN_ON(!list_empty(&idev->addr_list)); WARN_ON(rcu_access_pointer(idev->mc_list)); WARN_ON(timer_pending(&idev->rs_timer)); #ifdef NET_REFCNT_DEBUG pr_debug("%s: %s\n", __func__, dev ? dev->name : "NIL"); #endif netdev_put(dev, &idev->dev_tracker); if (!idev->dead) { pr_warn("Freeing alive inet6 device %p\n", idev); return; } call_rcu(&idev->rcu, in6_dev_finish_destroy_rcu); } EXPORT_SYMBOL(in6_dev_finish_destroy); |
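A short, hedged usage sketch (not taken from the file above; example_inet6addr_event and example_nb are invented names, and the event/payload pairing is assumed from how addrconf invokes this chain): a module interested in IPv6 address changes registers on the inet6addr notifier chain and can classify the reported address with __ipv6_addr_type().

#include <linux/netdevice.h>
#include <linux/notifier.h>
#include <net/if_inet6.h>
#include <net/ipv6.h>

static int example_inet6addr_event(struct notifier_block *nb,
				   unsigned long event, void *ptr)
{
	struct inet6_ifaddr *ifa = ptr;	/* assumed payload of this chain */

	if (event == NETDEV_UP &&
	    (__ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL))
		pr_info("new link-local address on %s\n",
			ifa->idev->dev->name);

	return NOTIFY_DONE;
}

static struct notifier_block example_nb = {
	.notifier_call = example_inet6addr_event,
};

/* register_inet6addr_notifier(&example_nb) at init time,
 * unregister_inet6addr_notifier(&example_nb) at exit time. */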
// SPDX-License-Identifier: GPL-2.0
/*
 * Generic process-grouping system.
 *
 * Based originally on the cpuset system, extracted by Paul Menage
 * Copyright (C) 2006 Google, Inc
 *
 * Notifications support
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Copyright notices from the original cpuset code:
 * --------------------------------------------------
 * Copyright (C) 2003 BULL SA.
 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 * Portions derived from Patrick Mochel's sysfs code.
 * sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 * 2003-10-10 Written by Simon Derr.
 * 2003-10-22 Updates by Stephen Hemminger.
 * 2004 May-July Rework by Paul Jackson.
 * ---------------------------------------------------
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "cgroup-internal.h"

#include <linux/bpf-cgroup.h>
#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/magic.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/percpu-rwsem.h>
#include <linux/string.h>
#include <linux/hashtable.h>
#include <linux/idr.h>
#include <linux/kthread.h>
#include <linux/atomic.h>
#include <linux/cpuset.h>
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#include <linux/file.h>
#include <linux/fs_parser.h>
#include <linux/sched/cputime.h>
#include <linux/sched/deadline.h>
#include <linux/psi.h>
#include <linux/nstree.h>
#include <linux/irq_work.h>
#include <net/sock.h>

#define CREATE_TRACE_POINTS
#include <trace/events/cgroup.h>

#define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
					 MAX_CFTYPE_NAME + 2)

/* let's not notify more than 100 times per second */
#define CGROUP_FILE_NOTIFY_MIN_INTV	DIV_ROUND_UP(HZ, 100)

/*
 * cgroup_mutex is the master lock. Any modification to cgroup or its
 * hierarchy must be performed while holding it.
 *
 * css_set_lock protects task->cgroups pointer, the list of css_set
 * objects, and the chain of tasks off each css_set.
 *
 * These locks are exported if CONFIG_PROVE_RCU so that accessors in
 * cgroup.h can use them for lockdep annotations.
*/ DEFINE_MUTEX(cgroup_mutex); DEFINE_SPINLOCK(css_set_lock); #if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP) EXPORT_SYMBOL_GPL(cgroup_mutex); EXPORT_SYMBOL_GPL(css_set_lock); #endif struct blocking_notifier_head cgroup_lifetime_notifier = BLOCKING_NOTIFIER_INIT(cgroup_lifetime_notifier); DEFINE_SPINLOCK(trace_cgroup_path_lock); char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; static bool cgroup_debug __read_mostly; /* * Protects cgroup_idr and css_idr so that IDs can be released without * grabbing cgroup_mutex. */ static DEFINE_SPINLOCK(cgroup_idr_lock); DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); #define cgroup_assert_mutex_or_rcu_locked() \ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&cgroup_mutex), \ "cgroup_mutex or RCU read lock required"); /* * cgroup destruction makes heavy use of work items and there can be a lot * of concurrent destructions. Use a separate workqueue so that cgroup * destruction work items don't end up filling up max_active of system_percpu_wq * which may lead to deadlock. * * A cgroup destruction should enqueue work sequentially to: * cgroup_offline_wq: use for css offline work * cgroup_release_wq: use for css release work * cgroup_free_wq: use for free work * * Rationale for using separate workqueues: * The cgroup root free work may depend on completion of other css offline * operations. If all tasks were enqueued to a single workqueue, this could * create a deadlock scenario where: * - Free work waits for other css offline work to complete. * - But other css offline work is queued after free work in the same queue. * * Example deadlock scenario with single workqueue (cgroup_destroy_wq): * 1. umount net_prio * 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx) * 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx) * 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline. * 5. net_prio root destruction blocks waiting for perf_event CSS A offline, * which can never complete as it's behind in the same queue and * workqueue's max_active is 1. 
*/ static struct workqueue_struct *cgroup_offline_wq; static struct workqueue_struct *cgroup_release_wq; static struct workqueue_struct *cgroup_free_wq; /* generate an array of cgroup subsystem pointers */ #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, struct cgroup_subsys *cgroup_subsys[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS /* array of cgroup subsystem names */ #define SUBSYS(_x) [_x ## _cgrp_id] = #_x, static const char *cgroup_subsys_name[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS /* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */ #define SUBSYS(_x) \ DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \ DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \ EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \ EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key); #include <linux/cgroup_subsys.h> #undef SUBSYS #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key, static struct static_key_true *cgroup_subsys_enabled_key[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key, static struct static_key_true *cgroup_subsys_on_dfl_key[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS static DEFINE_PER_CPU(struct css_rstat_cpu, root_rstat_cpu); static DEFINE_PER_CPU(struct cgroup_rstat_base_cpu, root_rstat_base_cpu); /* the default hierarchy */ struct cgroup_root cgrp_dfl_root = { .cgrp.self.rstat_cpu = &root_rstat_cpu, .cgrp.rstat_base_cpu = &root_rstat_base_cpu, }; EXPORT_SYMBOL_GPL(cgrp_dfl_root); /* * The default hierarchy always exists but is hidden until mounted for the * first time. This is for backward compatibility. */ bool cgrp_dfl_visible; /* some controllers are not supported in the default hierarchy */ static u32 cgrp_dfl_inhibit_ss_mask; /* some controllers are implicitly enabled on the default hierarchy */ static u32 cgrp_dfl_implicit_ss_mask; /* some controllers can be threaded on the default hierarchy */ static u32 cgrp_dfl_threaded_ss_mask; /* The list of hierarchy roots */ LIST_HEAD(cgroup_roots); static int cgroup_root_count; /* hierarchy ID allocation and mapping, protected by cgroup_mutex */ static DEFINE_IDR(cgroup_hierarchy_idr); /* * Assign a monotonically increasing serial number to csses. It guarantees * cgroups with bigger numbers are newer than those with smaller numbers. * Also, as csses are always appended to the parent's ->children list, it * guarantees that sibling csses are always sorted in the ascending serial * number order on the list. Protected by cgroup_mutex. */ static u64 css_serial_nr_next = 1; /* * These bitmasks identify subsystems with specific features to avoid * having to do iterative checks repeatedly. */ static u32 have_fork_callback __read_mostly; static u32 have_exit_callback __read_mostly; static u32 have_release_callback __read_mostly; static u32 have_canfork_callback __read_mostly; static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS); /* * Write protected by cgroup_mutex and write-lock of cgroup_threadgroup_rwsem, * read protected by either. * * Can only be turned on, but not turned off. 
*/ bool cgroup_enable_per_threadgroup_rwsem __read_mostly; /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { .ns = NS_COMMON_INIT(init_cgroup_ns), .user_ns = &init_user_ns, .root_cset = &init_css_set, }; static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_base_files[]; static struct cftype cgroup_psi_files[]; /* cgroup optional features */ enum cgroup_opt_features { #ifdef CONFIG_PSI OPT_FEATURE_PRESSURE, #endif OPT_FEATURE_COUNT }; static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = { #ifdef CONFIG_PSI "pressure", #endif }; static u16 cgroup_feature_disable_mask __read_mostly; static int cgroup_apply_control(struct cgroup *cgrp); static void cgroup_finalize_control(struct cgroup *cgrp, int ret); static void css_task_iter_skip(struct css_task_iter *it, struct task_struct *task); static int cgroup_destroy_locked(struct cgroup *cgrp); static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss); static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add); static void cgroup_rt_init(void); #ifdef CONFIG_DEBUG_CGROUP_REF #define CGROUP_REF_FN_ATTRS noinline #define CGROUP_REF_EXPORT(fn) EXPORT_SYMBOL_GPL(fn); #include <linux/cgroup_refcnt.h> #endif /** * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID * @ssid: subsys ID of interest * * cgroup_subsys_enabled() can only be used with literal subsys names which * is fine for individual subsystems but unsuitable for cgroup core. This * is slower static_key_enabled() based test indexed by @ssid. */ bool cgroup_ssid_enabled(int ssid) { if (!CGROUP_HAS_SUBSYS_CONFIG) return false; return static_key_enabled(cgroup_subsys_enabled_key[ssid]); } /** * cgroup_on_dfl - test whether a cgroup is on the default hierarchy * @cgrp: the cgroup of interest * * The default hierarchy is the v2 interface of cgroup and this function * can be used to test whether a cgroup is on the default hierarchy for * cases where a subsystem should behave differently depending on the * interface version. * * List of changed behaviors: * * - Mount options "noprefix", "xattr", "clone_children", "release_agent" * and "name" are disallowed. * * - When mounting an existing superblock, mount options should match. * * - rename(2) is disallowed. * * - "tasks" is removed. Everything should be at process granularity. Use * "cgroup.procs" instead. * * - "cgroup.procs" is not sorted. pids will be unique unless they got * recycled in-between reads. * * - "release_agent" and "notify_on_release" are removed. Replacement * notification mechanism will be implemented. * * - "cgroup.clone_children" is removed. * * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup * and its descendants contain no task; otherwise, 1. The file also * generates kernfs notification which can be monitored through poll and * [di]notify when the value of the file changes. * * - cpuset: tasks will be kept in empty cpusets when hotplug happens and * take masks of ancestors with non-empty cpus/mems, instead of being * moved to an ancestor. * * - cpuset: a task can be moved into an empty cpuset, and again it takes * masks of ancestors. * * - blkcg: blk-throttle becomes properly hierarchical. 
*/ bool cgroup_on_dfl(const struct cgroup *cgrp) { return cgrp->root == &cgrp_dfl_root; } /* IDR wrappers which synchronize using cgroup_idr_lock */ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask) { int ret; idr_preload(gfp_mask); spin_lock_bh(&cgroup_idr_lock); ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM); spin_unlock_bh(&cgroup_idr_lock); idr_preload_end(); return ret; } static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id) { void *ret; spin_lock_bh(&cgroup_idr_lock); ret = idr_replace(idr, ptr, id); spin_unlock_bh(&cgroup_idr_lock); return ret; } static void cgroup_idr_remove(struct idr *idr, int id) { spin_lock_bh(&cgroup_idr_lock); idr_remove(idr, id); spin_unlock_bh(&cgroup_idr_lock); } static bool cgroup_has_tasks(struct cgroup *cgrp) { return cgrp->nr_populated_csets; } static bool cgroup_is_threaded(struct cgroup *cgrp) { return cgrp->dom_cgrp != cgrp; } /* can @cgrp host both domain and threaded children? */ static bool cgroup_is_mixable(struct cgroup *cgrp) { /* * Root isn't under domain level resource control exempting it from * the no-internal-process constraint, so it can serve as a thread * root and a parent of resource domains at the same time. */ return !cgroup_parent(cgrp); } /* can @cgrp become a thread root? Should always be true for a thread root */ static bool cgroup_can_be_thread_root(struct cgroup *cgrp) { /* mixables don't care */ if (cgroup_is_mixable(cgrp)) return true; /* domain roots can't be nested under threaded */ if (cgroup_is_threaded(cgrp)) return false; /* can only have either domain or threaded children */ if (cgrp->nr_populated_domain_children) return false; /* and no domain controllers can be enabled */ if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask) return false; return true; } /* is @cgrp root of a threaded subtree? */ static bool cgroup_is_thread_root(struct cgroup *cgrp) { /* thread root should be a domain */ if (cgroup_is_threaded(cgrp)) return false; /* a domain w/ threaded children is a thread root */ if (cgrp->nr_threaded_children) return true; /* * A domain which has tasks and explicit threaded controllers * enabled is a thread root. 
*/ if (cgroup_has_tasks(cgrp) && (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask)) return true; return false; } /* a domain which isn't connected to the root w/o breakage can't be used */ static bool cgroup_is_valid_domain(struct cgroup *cgrp) { /* the cgroup itself can be a thread root */ if (cgroup_is_threaded(cgrp)) return false; /* but the ancestors can't be unless mixable */ while ((cgrp = cgroup_parent(cgrp))) { if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp)) return false; if (cgroup_is_threaded(cgrp)) return false; } return true; } /* subsystems visibly enabled on a cgroup */ static u32 cgroup_control(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); u32 root_ss_mask = cgrp->root->subsys_mask; if (parent) { u32 ss_mask = parent->subtree_control; /* threaded cgroups can only have threaded controllers */ if (cgroup_is_threaded(cgrp)) ss_mask &= cgrp_dfl_threaded_ss_mask; return ss_mask; } if (cgroup_on_dfl(cgrp)) root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask | cgrp_dfl_implicit_ss_mask); return root_ss_mask; } /* subsystems enabled on a cgroup */ static u32 cgroup_ss_mask(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); if (parent) { u32 ss_mask = parent->subtree_ss_mask; /* threaded cgroups can only have threaded controllers */ if (cgroup_is_threaded(cgrp)) ss_mask &= cgrp_dfl_threaded_ss_mask; return ss_mask; } return cgrp->root->subsys_mask; } /** * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Similar to cgroup_css() but returns the effective css, which is defined * as the matching css of the nearest ancestor including self which has @ss * enabled. If @ss is associated with the hierarchy @cgrp is on, this * function is guaranteed to return non-NULL css. */ static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp, struct cgroup_subsys *ss) { lockdep_assert_held(&cgroup_mutex); if (!ss) return &cgrp->self; /* * This function is used while updating css associations and thus * can't test the csses directly. Test ss_mask. */ while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) { cgrp = cgroup_parent(cgrp); if (!cgrp) return NULL; } return cgroup_css(cgrp, ss); } /** * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest * * Find and get the effective css of @cgrp for @ss. The effective css is * defined as the matching css of the nearest ancestor including self which * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, * the root css is returned, so this function always returns a valid css. * * The returned css is not guaranteed to be online, and therefore it is the * caller's responsibility to try to get a reference for it. */ struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; if (!CGROUP_HAS_SUBSYS_CONFIG) return NULL; do { css = cgroup_css(cgrp, ss); if (css) return css; cgrp = cgroup_parent(cgrp); } while (cgrp); return init_css_set.subsys[ss->id]; } /** * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest * * Find and get the effective css of @cgrp for @ss. The effective css is * defined as the matching css of the nearest ancestor including self which * has @ss enabled.
If @ss is not mounted on the hierarchy @cgrp is on, * the root css is returned, so this function always returns a valid css. * The returned css must be put using css_put(). */ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; if (!CGROUP_HAS_SUBSYS_CONFIG) return NULL; rcu_read_lock(); do { css = cgroup_css(cgrp, ss); if (css && css_tryget_online(css)) goto out_unlock; cgrp = cgroup_parent(cgrp); } while (cgrp); css = init_css_set.subsys[ss->id]; css_get(css); out_unlock: rcu_read_unlock(); return css; } EXPORT_SYMBOL_GPL(cgroup_get_e_css); static void cgroup_get_live(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); cgroup_get(cgrp); } /** * __cgroup_task_count - count the number of tasks in a cgroup. The caller * is responsible for taking the css_set_lock. * @cgrp: the cgroup in question */ int __cgroup_task_count(const struct cgroup *cgrp) { int count = 0; struct cgrp_cset_link *link; lockdep_assert_held(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) count += link->cset->nr_tasks; return count; } /** * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question */ int cgroup_task_count(const struct cgroup *cgrp) { int count; spin_lock_irq(&css_set_lock); count = __cgroup_task_count(cgrp); spin_unlock_irq(&css_set_lock); return count; } static struct cgroup *kn_priv(struct kernfs_node *kn) { struct kernfs_node *parent; /* * The parent can not be replaced due to KERNFS_ROOT_INVARIANT_PARENT. * Therefore it is always safe to dereference this pointer outside of a * RCU section. */ parent = rcu_dereference_check(kn->__parent, kernfs_root_flags(kn) & KERNFS_ROOT_INVARIANT_PARENT); return parent->priv; } struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = kn_priv(of->kn); struct cftype *cft = of_cft(of); /* * This is open and unprotected implementation of cgroup_css(). * seq_css() is only called from a kernfs file operation which has * an active reference on the file. Because all the subsystem * files are drained before a css is disassociated with a cgroup, * the matching css from the cgroup's subsys table is guaranteed to * be and stay valid until the enclosing operation is complete. */ if (CGROUP_HAS_SUBSYS_CONFIG && cft->ss) return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); else return &cgrp->self; } EXPORT_SYMBOL_GPL(of_css); /** * for_each_css - iterate all css's of a cgroup * @css: the iteration cursor * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end * @cgrp: the target cgroup to iterate css's of * * Should be called under cgroup_mutex. */ #define for_each_css(css, ssid, cgrp) \ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ if (!((css) = rcu_dereference_check( \ (cgrp)->subsys[(ssid)], \ lockdep_is_held(&cgroup_mutex)))) { } \ else /** * do_each_subsys_mask - filter for_each_subsys with a bitmask * @ss: the iteration cursor * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end * @ss_mask: the bitmask * * The block will only run for cases where the ssid-th bit (1 << ssid) of * @ss_mask is set. 
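 *
 * Usage sketch (hedged; it mirrors how the macro pair is used later in
 * this file by cgroup_calc_subtree_ss_mask()):
 *
 *	struct cgroup_subsys *ss;
 *	u32 collected = 0;
 *	int ssid;
 *
 *	do_each_subsys_mask(ss, ssid, ss_mask) {
 *		collected |= ss->depends_on;
 *	} while_each_subsys_mask();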
*/ #define do_each_subsys_mask(ss, ssid, ss_mask) do { \ unsigned long __ss_mask = (ss_mask); \ if (!CGROUP_HAS_SUBSYS_CONFIG) { \ (ssid) = 0; \ break; \ } \ for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \ (ss) = cgroup_subsys[ssid]; \ { #define while_each_subsys_mask() \ } \ } \ } while (false) /* * The default css_set - used by init and its children prior to any * hierarchies being mounted. It contains a pointer to the root state * for each subsystem. Also used to anchor the list of css_sets. Not * reference-counted, to improve performance when child cgroups * haven't been created. */ struct css_set init_css_set = { .refcount = REFCOUNT_INIT(1), .dom_cset = &init_css_set, .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks), .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), .mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node), .mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), /* * The following field is re-initialized when this cset gets linked * in cgroup_init(). However, let's initialize the field * statically too so that the default cgroup can be accessed safely * early during boot. */ .dfl_cgrp = &cgrp_dfl_root.cgrp, }; static int css_set_count = 1; /* 1 for init_css_set */ static bool css_set_threaded(struct css_set *cset) { return cset->dom_cset != cset; } /** * css_set_populated - does a css_set contain any tasks? * @cset: target css_set * * css_set_populated() should be the same as !!cset->nr_tasks at steady * state. However, css_set_populated() can be called while a task is being * added to or removed from the linked list before the nr_tasks is * properly updated. Hence, we can't just look at ->nr_tasks here. */ static bool css_set_populated(struct css_set *cset) { lockdep_assert_held(&css_set_lock); return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks); } /** * cgroup_update_populated - update the populated count of a cgroup * @cgrp: the target cgroup * @populated: inc or dec populated count * * One of the css_sets associated with @cgrp is either getting its first * task or losing the last. Update @cgrp->nr_populated_* accordingly. The * count is propagated towards root so that a given cgroup's * nr_populated_children is zero iff none of its descendants contain any * tasks. * * @cgrp's interface file "cgroup.populated" is zero if both * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and * 1 otherwise. When the sum changes from or to zero, userland is notified * that the content of the interface file has changed. This can be used to * detect when @cgrp and its descendants become populated or empty. */ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) { struct cgroup *child = NULL; int adj = populated ? 
1 : -1; lockdep_assert_held(&css_set_lock); do { bool was_populated = cgroup_is_populated(cgrp); if (!child) { cgrp->nr_populated_csets += adj; } else { if (cgroup_is_threaded(child)) cgrp->nr_populated_threaded_children += adj; else cgrp->nr_populated_domain_children += adj; } if (was_populated == cgroup_is_populated(cgrp)) break; cgroup1_check_for_release(cgrp); TRACE_CGROUP_PATH(notify_populated, cgrp, cgroup_is_populated(cgrp)); cgroup_file_notify(&cgrp->events_file); child = cgrp; cgrp = cgroup_parent(cgrp); } while (cgrp); } /** * css_set_update_populated - update populated state of a css_set * @cset: target css_set * @populated: whether @cset is populated or depopulated * * @cset is either getting the first task or losing the last. Update the * populated counters of all associated cgroups accordingly. */ static void css_set_update_populated(struct css_set *cset, bool populated) { struct cgrp_cset_link *link; lockdep_assert_held(&css_set_lock); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) cgroup_update_populated(link->cgrp, populated); } /* * @task is leaving, advance task iterators which are pointing to it so * that they can resume at the next position. Advancing an iterator might * remove it from the list, use safe walk. See css_task_iter_skip() for * details. */ static void css_set_skip_task_iters(struct css_set *cset, struct task_struct *task) { struct css_task_iter *it, *pos; list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node) css_task_iter_skip(it, task); } /** * css_set_move_task - move a task from one css_set to another * @task: task being moved * @from_cset: css_set @task currently belongs to (may be NULL) * @to_cset: new css_set @task is being moved to (may be NULL) * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks * * Move @task from @from_cset to @to_cset. If @task didn't belong to any * css_set, @from_cset can be NULL. If @task is being disassociated * instead of moved, @to_cset can be NULL. * * This function automatically handles populated counter updates and * css_task_iter adjustments but the caller is responsible for managing * @from_cset and @to_cset's reference counts. */ static void css_set_move_task(struct task_struct *task, struct css_set *from_cset, struct css_set *to_cset, bool use_mg_tasks) { lockdep_assert_held(&css_set_lock); if (to_cset && !css_set_populated(to_cset)) css_set_update_populated(to_cset, true); if (from_cset) { WARN_ON_ONCE(list_empty(&task->cg_list)); css_set_skip_task_iters(from_cset, task); list_del_init(&task->cg_list); if (!css_set_populated(from_cset)) css_set_update_populated(from_cset, false); } else { WARN_ON_ONCE(!list_empty(&task->cg_list)); } if (to_cset) { /* * We are synchronized through cgroup_threadgroup_rwsem * against PF_EXITING setting such that we can't race * against cgroup_task_dead()/cgroup_task_free() dropping * the css_set. */ WARN_ON_ONCE(task->flags & PF_EXITING); cgroup_move_task(task, to_cset); list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : &to_cset->tasks); } } /* * hash table for cgroup groups. This improves the performance to find * an existing css_set. This hash doesn't (currently) take into * account cgroups in empty hierarchies. 
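 *
 * Lookup sketch (hedged; find_existing_css_set() below follows this
 * pattern):
 *
 *	unsigned long key = css_set_hash(template);
 *	struct css_set *cset;
 *
 *	hash_for_each_possible(css_set_table, cset, hlist, key)
 *		if (compare_css_sets(cset, old_cset, cgrp, template))
 *			return cset;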
*/ #define CSS_SET_HASH_BITS 7 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); static unsigned long css_set_hash(struct cgroup_subsys_state **css) { unsigned long key = 0UL; struct cgroup_subsys *ss; int i; for_each_subsys(ss, i) key += (unsigned long)css[i]; key = (key >> 16) ^ key; return key; } void put_css_set_locked(struct css_set *cset) { struct cgrp_cset_link *link, *tmp_link; struct cgroup_subsys *ss; int ssid; lockdep_assert_held(&css_set_lock); if (!refcount_dec_and_test(&cset->refcount)) return; WARN_ON_ONCE(!list_empty(&cset->threaded_csets)); /* This css_set is dead. Unlink it and release cgroup and css refs */ for_each_subsys(ss, ssid) { list_del(&cset->e_cset_node[ssid]); css_put(cset->subsys[ssid]); } hash_del(&cset->hlist); css_set_count--; list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) { list_del(&link->cset_link); list_del(&link->cgrp_link); if (cgroup_parent(link->cgrp)) cgroup_put(link->cgrp); kfree(link); } if (css_set_threaded(cset)) { list_del(&cset->threaded_csets_node); put_css_set_locked(cset->dom_cset); } kfree_rcu(cset, rcu_head); } /** * compare_css_sets - helper function for find_existing_css_set(). * @cset: candidate css_set being tested * @old_cset: existing css_set for a task * @new_cgrp: cgroup that's being entered by the task * @template: desired set of css pointers in css_set (pre-calculated) * * Returns true if "cset" matches "old_cset" except for the hierarchy * which "new_cgrp" belongs to, for which it should match "new_cgrp". */ static bool compare_css_sets(struct css_set *cset, struct css_set *old_cset, struct cgroup *new_cgrp, struct cgroup_subsys_state *template[]) { struct cgroup *new_dfl_cgrp; struct list_head *l1, *l2; /* * On the default hierarchy, there can be csets which are * associated with the same set of cgroups but different csses. * Let's first ensure that csses match. */ if (memcmp(template, cset->subsys, sizeof(cset->subsys))) return false; /* @cset's domain should match the default cgroup's */ if (cgroup_on_dfl(new_cgrp)) new_dfl_cgrp = new_cgrp; else new_dfl_cgrp = old_cset->dfl_cgrp; if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp) return false; /* * Compare cgroup pointers in order to distinguish between * different cgroups in hierarchies. As different cgroups may * share the same effective css, this comparison is always * necessary. */ l1 = &cset->cgrp_links; l2 = &old_cset->cgrp_links; while (1) { struct cgrp_cset_link *link1, *link2; struct cgroup *cgrp1, *cgrp2; l1 = l1->next; l2 = l2->next; /* See if we reached the end - both lists are equal length. */ if (l1 == &cset->cgrp_links) { BUG_ON(l2 != &old_cset->cgrp_links); break; } else { BUG_ON(l2 == &old_cset->cgrp_links); } /* Locate the cgroups associated with these links. */ link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link); link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link); cgrp1 = link1->cgrp; cgrp2 = link2->cgrp; /* Hierarchies should be linked in the same order. */ BUG_ON(cgrp1->root != cgrp2->root); /* * If this hierarchy is the hierarchy of the cgroup * that's changing, then we need to check that this * css_set points to the new cgroup; if it's any other * hierarchy, then this css_set should point to the * same cgroup as the old css_set. 
*/ if (cgrp1->root == new_cgrp->root) { if (cgrp1 != new_cgrp) return false; } else { if (cgrp1 != cgrp2) return false; } } return true; } /** * find_existing_css_set - init css array and find the matching css_set * @old_cset: the css_set that we're using before the cgroup transition * @cgrp: the cgroup that we're moving into * @template: out param for the new set of csses, should be clear on entry */ static struct css_set *find_existing_css_set(struct css_set *old_cset, struct cgroup *cgrp, struct cgroup_subsys_state **template) { struct cgroup_root *root = cgrp->root; struct cgroup_subsys *ss; struct css_set *cset; unsigned long key; int i; /* * Build the set of subsystem state objects that we want to see in the * new css_set. While subsystems can change globally, the entries here * won't change, so no need for locking. */ for_each_subsys(ss, i) { if (root->subsys_mask & (1UL << i)) { /* * @ss is in this hierarchy, so we want the * effective css from @cgrp. */ template[i] = cgroup_e_css_by_mask(cgrp, ss); } else { /* * @ss is not in this hierarchy, so we don't want * to change the css. */ template[i] = old_cset->subsys[i]; } } key = css_set_hash(template); hash_for_each_possible(css_set_table, cset, hlist, key) { if (!compare_css_sets(cset, old_cset, cgrp, template)) continue; /* This css_set matches what we need */ return cset; } /* No existing cgroup group matched */ return NULL; } static void free_cgrp_cset_links(struct list_head *links_to_free) { struct cgrp_cset_link *link, *tmp_link; list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) { list_del(&link->cset_link); kfree(link); } } /** * allocate_cgrp_cset_links - allocate cgrp_cset_links * @count: the number of links to allocate * @tmp_links: list_head the allocated links are put on * * Allocate @count cgrp_cset_link structures and chain them on @tmp_links * through ->cset_link. Returns 0 on success or -errno. */ static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links) { struct cgrp_cset_link *link; int i; INIT_LIST_HEAD(tmp_links); for (i = 0; i < count; i++) { link = kzalloc_obj(*link); if (!link) { free_cgrp_cset_links(tmp_links); return -ENOMEM; } list_add(&link->cset_link, tmp_links); } return 0; } /** * link_css_set - a helper function to link a css_set to a cgroup * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links() * @cset: the css_set to be linked * @cgrp: the destination cgroup */ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, struct cgroup *cgrp) { struct cgrp_cset_link *link; BUG_ON(list_empty(tmp_links)); if (cgroup_on_dfl(cgrp)) cset->dfl_cgrp = cgrp; link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); link->cset = cset; link->cgrp = cgrp; /* * Always add links to the tail of the lists so that the lists are * in chronological order. */ list_move_tail(&link->cset_link, &cgrp->cset_links); list_add_tail(&link->cgrp_link, &cset->cgrp_links); if (cgroup_parent(cgrp)) cgroup_get_live(cgrp); } /** * find_css_set - return a new css_set with one cgroup updated * @old_cset: the baseline css_set * @cgrp: the cgroup to be updated * * Return a new css_set that's equivalent to @old_cset, but with @cgrp * substituted into the appropriate hierarchy. 
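 *
 * Call pattern (a hedged sketch; cgroup_migrate_prepare_dst() later in
 * this file uses it this way):
 *
 *	struct css_set *dst_cset;
 *
 *	dst_cset = find_css_set(src_cset, dst_cgrp);
 *	if (!dst_cset)
 *		return -ENOMEM;
 *	... use dst_cset and drop the reference with put_css_set() when done ...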
*/ static struct css_set *find_css_set(struct css_set *old_cset, struct cgroup *cgrp) { struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { }; struct css_set *cset; struct list_head tmp_links; struct cgrp_cset_link *link; struct cgroup_subsys *ss; unsigned long key; int ssid; lockdep_assert_held(&cgroup_mutex); /* First see if we already have a cgroup group that matches * the desired set */ spin_lock_irq(&css_set_lock); cset = find_existing_css_set(old_cset, cgrp, template); if (cset) get_css_set(cset); spin_unlock_irq(&css_set_lock); if (cset) return cset; cset = kzalloc_obj(*cset); if (!cset) return NULL; /* Allocate all the cgrp_cset_link objects that we'll need */ if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) { kfree(cset); return NULL; } refcount_set(&cset->refcount, 1); cset->dom_cset = cset; INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); INIT_LIST_HEAD(&cset->dying_tasks); INIT_LIST_HEAD(&cset->task_iters); INIT_LIST_HEAD(&cset->threaded_csets); INIT_HLIST_NODE(&cset->hlist); INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->mg_src_preload_node); INIT_LIST_HEAD(&cset->mg_dst_preload_node); INIT_LIST_HEAD(&cset->mg_node); /* Copy the set of subsystem state objects generated in * find_existing_css_set() */ memcpy(cset->subsys, template, sizeof(cset->subsys)); spin_lock_irq(&css_set_lock); /* Add reference counts and links from the new css_set. */ list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; if (c->root == cgrp->root) c = cgrp; link_css_set(&tmp_links, cset, c); } BUG_ON(!list_empty(&tmp_links)); css_set_count++; /* Add @cset to the hash table */ key = css_set_hash(cset->subsys); hash_add(css_set_table, &cset->hlist, key); for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cset->subsys[ssid]; list_add_tail(&cset->e_cset_node[ssid], &css->cgroup->e_csets[ssid]); css_get(css); } spin_unlock_irq(&css_set_lock); /* * If @cset should be threaded, look up the matching dom_cset and * link them up. We first fully initialize @cset then look for the * dom_cset. It's simpler this way and safe as @cset is guaranteed * to stay empty until we return. */ if (cgroup_is_threaded(cset->dfl_cgrp)) { struct css_set *dcset; dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp); if (!dcset) { put_css_set(cset); return NULL; } spin_lock_irq(&css_set_lock); cset->dom_cset = dcset; list_add_tail(&cset->threaded_csets_node, &dcset->threaded_csets); spin_unlock_irq(&css_set_lock); } return cset; } struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { struct cgroup *root_cgrp = kernfs_root_to_node(kf_root)->priv; return root_cgrp->root; } void cgroup_favor_dynmods(struct cgroup_root *root, bool favor) { bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS; /* * See the comment above CGRP_ROOT_FAVOR_DYNMODS definition. * favordynmods can flip while a task is between * cgroup_threadgroup_change_begin() and end(), so down_write the global * cgroup_threadgroup_rwsem to synchronize them. * * Once cgroup_enable_per_threadgroup_rwsem is enabled, holding * cgroup_threadgroup_rwsem doesn't exclude tasks between * cgroup_threadgroup_change_begin() and end() and thus it's unsafe to * turn off. As the scenario is unlikely, simply disallow disabling once * enabled and print out a warning.
*/ percpu_down_write(&cgroup_threadgroup_rwsem); if (favor && !favoring) { cgroup_enable_per_threadgroup_rwsem = true; rcu_sync_enter(&cgroup_threadgroup_rwsem.rss); root->flags |= CGRP_ROOT_FAVOR_DYNMODS; } else if (!favor && favoring) { if (cgroup_enable_per_threadgroup_rwsem) pr_warn_once("cgroup favordynmods: per threadgroup rwsem mechanism can't be disabled\n"); rcu_sync_exit(&cgroup_threadgroup_rwsem.rss); root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; } percpu_up_write(&cgroup_threadgroup_rwsem); } static int cgroup_init_root_id(struct cgroup_root *root) { int id; lockdep_assert_held(&cgroup_mutex); id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL); if (id < 0) return id; root->hierarchy_id = id; return 0; } static void cgroup_exit_root_id(struct cgroup_root *root) { lockdep_assert_held(&cgroup_mutex); idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); } void cgroup_free_root(struct cgroup_root *root) { kfree_rcu(root, rcu); } static void cgroup_destroy_root(struct cgroup_root *root) { struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; int ret; trace_cgroup_destroy_root(root); cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); BUG_ON(atomic_read(&root->nr_cgrps)); BUG_ON(!list_empty(&cgrp->self.children)); ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier, CGROUP_LIFETIME_OFFLINE, cgrp); WARN_ON_ONCE(notifier_to_errno(ret)); /* Rebind all subsystems back to the default hierarchy */ WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask)); /* * Release all the links from cset_links to this hierarchy's * root cgroup */ spin_lock_irq(&css_set_lock); list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { list_del(&link->cset_link); list_del(&link->cgrp_link); kfree(link); } spin_unlock_irq(&css_set_lock); WARN_ON_ONCE(list_empty(&root->root_list)); list_del_rcu(&root->root_list); cgroup_root_count--; if (!have_favordynmods) cgroup_favor_dynmods(root, false); cgroup_exit_root_id(root); cgroup_unlock(); kernfs_destroy_root(root->kf_root); cgroup_free_root(root); } /* * Returned cgroup is without refcount but it's valid as long as cset pins it. */ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { struct cgroup *res_cgroup = NULL; if (cset == &init_css_set) { res_cgroup = &root->cgrp; } else if (root == &cgrp_dfl_root) { res_cgroup = cset->dfl_cgrp; } else { struct cgrp_cset_link *link; lockdep_assert_held(&css_set_lock); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; if (c->root == root) { res_cgroup = c; break; } } } /* * If cgroup_mutex is not held, the cgrp_cset_link will be freed * before we remove the cgroup root from the root_list. Consequently, * when accessing a cgroup root, the cset_link may have already been * freed, resulting in a NULL res_cgroup. However, by holding the * cgroup_mutex, we ensure that res_cgroup can't be NULL. * If we don't hold cgroup_mutex in the caller, we must do the NULL * check. */ return res_cgroup; } /* * look up cgroup associated with current task's cgroup namespace on the * specified hierarchy */ static struct cgroup * current_cgns_cgroup_from_root(struct cgroup_root *root) { struct cgroup *res = NULL; struct css_set *cset; lockdep_assert_held(&css_set_lock); rcu_read_lock(); cset = current->nsproxy->cgroup_ns->root_cset; res = __cset_cgroup_from_root(cset, root); rcu_read_unlock(); /* * The namespace_sem is held by current, so the root cgroup can't * be umounted. 
Therefore, we can ensure that the res is non-NULL. */ WARN_ON_ONCE(!res); return res; } /* * Look up cgroup associated with current task's cgroup namespace on the default * hierarchy. * * Unlike current_cgns_cgroup_from_root(), this doesn't need locks: * - Internal rcu_read_lock is unnecessary because we don't dereference any rcu * pointers. * - css_set_lock is not needed because we just read cset->dfl_cgrp. * - As a bonus returned cgrp is pinned with the current because it cannot * switch cgroup_ns asynchronously. */ static struct cgroup *current_cgns_cgroup_dfl(void) { struct css_set *cset; if (current->nsproxy) { cset = current->nsproxy->cgroup_ns->root_cset; return __cset_cgroup_from_root(cset, &cgrp_dfl_root); } else { /* * NOTE: This function may be called from bpf_cgroup_from_id() * on a task which has already passed exit_nsproxy_namespaces() * and nsproxy == NULL. Fall back to cgrp_dfl_root which will * make all cgroups visible for lookups. */ return &cgrp_dfl_root.cgrp; } } /* look up cgroup associated with given css_set on the specified hierarchy */ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { lockdep_assert_held(&css_set_lock); return __cset_cgroup_from_root(cset, root); } /* * Return the cgroup for "task" from the given hierarchy. Must be * called with css_set_lock held to prevent task's groups from being modified. * Must be called with either cgroup_mutex or rcu read lock to prevent the * cgroup root from being destroyed. */ struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroup_root *root) { /* * No need to lock the task - since we hold css_set_lock the * task can't change groups. */ return cset_cgroup_from_root(task_css_set(task), root); } /* * A task must hold cgroup_mutex to modify cgroups. * * Any task can increment and decrement the count field without lock. * So in general, code holding cgroup_mutex can't rely on the count * field not changing. However, if the count goes to zero, then only * cgroup_attach_task() can increment it again. Because a count of zero * means that no tasks are currently attached, therefore there is no * way a task attached to that cgroup can fork (the other way to * increment the count). So code holding cgroup_mutex can safely * assume that if the count is zero, it will stay zero. Similarly, if * a task holds cgroup_mutex on a cgroup with zero count, it * knows that the cgroup won't be removed, as cgroup_rmdir() * needs that mutex. * * A cgroup can only be deleted if both its 'count' of using tasks * is zero, and its list of 'children' cgroups is empty. Since all * tasks in the system use _some_ cgroup, and since there is always at * least one task in the system (init, pid == 1), therefore, root cgroup * always has either children cgroups and/or using tasks. So we don't * need a special hack to ensure that root cgroup cannot be deleted. * * P.S. One more locking exception. RCU is used to guard the * update of a tasks cgroup pointer by cgroup_attach_task() */ static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, char *buf) { struct cgroup_subsys *ss = cft->ss; if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : ""; snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s", dbg, cgroup_on_dfl(cgrp) ? 
ss->name : ss->legacy_name, cft->name); } else { strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); } return buf; } /** * cgroup_file_mode - deduce file mode of a control file * @cft: the control file in question * * S_IRUGO for read, S_IWUSR for write. */ static umode_t cgroup_file_mode(const struct cftype *cft) { umode_t mode = 0; if (cft->read_u64 || cft->read_s64 || cft->seq_show) mode |= S_IRUGO; if (cft->write_u64 || cft->write_s64 || cft->write) { if (cft->flags & CFTYPE_WORLD_WRITABLE) mode |= S_IWUGO; else mode |= S_IWUSR; } return mode; } /** * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask * @subtree_control: the new subtree_control mask to consider * @this_ss_mask: available subsystems * * On the default hierarchy, a subsystem may request other subsystems to be * enabled together through its ->depends_on mask. In such cases, more * subsystems than specified in "cgroup.subtree_control" may be enabled. * * This function calculates which subsystems need to be enabled if * @subtree_control is to be applied while restricted to @this_ss_mask. */ static u32 cgroup_calc_subtree_ss_mask(u32 subtree_control, u32 this_ss_mask) { u32 cur_ss_mask = subtree_control; struct cgroup_subsys *ss; int ssid; lockdep_assert_held(&cgroup_mutex); cur_ss_mask |= cgrp_dfl_implicit_ss_mask; while (true) { u32 new_ss_mask = cur_ss_mask; do_each_subsys_mask(ss, ssid, cur_ss_mask) { new_ss_mask |= ss->depends_on; } while_each_subsys_mask(); /* * Mask out subsystems which aren't available. This can * happen only if some depended-upon subsystems were bound * to non-default hierarchies. */ new_ss_mask &= this_ss_mask; if (new_ss_mask == cur_ss_mask) break; cur_ss_mask = new_ss_mask; } return cur_ss_mask; } /** * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced * * This helper undoes cgroup_kn_lock_live() and should be invoked before * the method finishes if locking succeeded. Note that once this function * returns the cgroup returned by cgroup_kn_lock_live() may become * inaccessible any time. If the caller intends to continue to access the * cgroup, it should pin it before invoking this function. */ void cgroup_kn_unlock(struct kernfs_node *kn) { struct cgroup *cgrp; if (kernfs_type(kn) == KERNFS_DIR) cgrp = kn->priv; else cgrp = kn_priv(kn); cgroup_unlock(); kernfs_unbreak_active_protection(kn); cgroup_put(cgrp); } /** * cgroup_kn_lock_live - locking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced * @drain_offline: perform offline draining on the cgroup * * This helper is to be used by a cgroup kernfs method currently servicing * @kn. It breaks the active protection, performs cgroup locking and * verifies that the associated cgroup is alive. Returns the cgroup if * alive; otherwise, %NULL. A successful return should be undone by a * matching cgroup_kn_unlock() invocation. If @drain_offline is %true, the * cgroup is drained of offlining csses before return. * * Any cgroup kernfs method implementation which requires locking the * associated cgroup should use this helper. It avoids nesting cgroup * locking under kernfs active protection and allows all kernfs operations * including self-removal. */ struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline) { struct cgroup *cgrp; if (kernfs_type(kn) == KERNFS_DIR) cgrp = kn->priv; else cgrp = kn_priv(kn); /* * We're gonna grab cgroup_mutex which nests outside kernfs * active_ref. cgroup liveliness check alone provides enough * protection against removal. 
Ensure @cgrp stays accessible and * break the active_ref protection. */ if (!cgroup_tryget(cgrp)) return NULL; kernfs_break_active_protection(kn); if (drain_offline) cgroup_lock_and_drain_offline(cgrp); else cgroup_lock(); if (!cgroup_is_dead(cgrp)) return cgrp; cgroup_kn_unlock(kn); return NULL; } static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; lockdep_assert_held(&cgroup_mutex); if (cft->file_offset) { struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss); struct cgroup_file *cfile = (void *)css + cft->file_offset; spin_lock_irq(&cfile->lock); WRITE_ONCE(cfile->kn, NULL); spin_unlock_irq(&cfile->lock); timer_delete_sync(&cfile->notify_timer); } kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } /** * css_clear_dir - remove subsys files in a cgroup directory * @css: target css */ static void css_clear_dir(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; struct cftype *cfts; if (!(css->flags & CSS_VISIBLE)) return; css->flags &= ~CSS_VISIBLE; if (css_is_self(css)) { if (cgroup_on_dfl(cgrp)) { cgroup_addrm_files(css, cgrp, cgroup_base_files, false); if (cgroup_psi_enabled()) cgroup_addrm_files(css, cgrp, cgroup_psi_files, false); } else { cgroup_addrm_files(css, cgrp, cgroup1_base_files, false); } } else { list_for_each_entry(cfts, &css->ss->cfts, node) cgroup_addrm_files(css, cgrp, cfts, false); } } /** * css_populate_dir - create subsys files in a cgroup directory * @css: target css * * On failure, no file is added. */ static int css_populate_dir(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; struct cftype *cfts, *failed_cfts; int ret; if (css->flags & CSS_VISIBLE) return 0; if (css_is_self(css)) { if (cgroup_on_dfl(cgrp)) { ret = cgroup_addrm_files(css, cgrp, cgroup_base_files, true); if (ret < 0) return ret; if (cgroup_psi_enabled()) { ret = cgroup_addrm_files(css, cgrp, cgroup_psi_files, true); if (ret < 0) { cgroup_addrm_files(css, cgrp, cgroup_base_files, false); return ret; } } } else { ret = cgroup_addrm_files(css, cgrp, cgroup1_base_files, true); if (ret < 0) return ret; } } else { list_for_each_entry(cfts, &css->ss->cfts, node) { ret = cgroup_addrm_files(css, cgrp, cfts, true); if (ret < 0) { failed_cfts = cfts; goto err; } } } css->flags |= CSS_VISIBLE; return 0; err: list_for_each_entry(cfts, &css->ss->cfts, node) { if (cfts == failed_cfts) break; cgroup_addrm_files(css, cgrp, cfts, false); } return ret; } int rebind_subsystems(struct cgroup_root *dst_root, u32 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; int ssid, ret; u32 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); do_each_subsys_mask(ss, ssid, ss_mask) { /* * If @ss has non-root csses attached to it, can't move. * If @ss is an implicit controller, it is exempt from this * rule and can be stolen. */ if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) && !ss->implicit_on_dfl) return -EBUSY; /* can't move between two non-dummy roots either */ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY; /* * Collect ssid's that need to be disabled from default * hierarchy. */ if (ss->root == &cgrp_dfl_root) dfl_disable_ss_mask |= 1 << ssid; } while_each_subsys_mask(); if (dfl_disable_ss_mask) { struct cgroup *scgrp = &cgrp_dfl_root.cgrp; /* * Controllers from default hierarchy that need to be rebound * are all disabled together in one go. 
*/ cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask; WARN_ON(cgroup_apply_control(scgrp)); cgroup_finalize_control(scgrp, 0); } do_each_subsys_mask(ss, ssid, ss_mask) { struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); struct css_set *cset, *cset_pos; struct css_task_iter *it; WARN_ON(!css || cgroup_css(dcgrp, ss)); if (src_root != &cgrp_dfl_root) { /* disable from the source */ src_root->subsys_mask &= ~(1 << ssid); WARN_ON(cgroup_apply_control(scgrp)); cgroup_finalize_control(scgrp, 0); } /* rebind */ RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); rcu_assign_pointer(dcgrp->subsys[ssid], css); ss->root = dst_root; spin_lock_irq(&css_set_lock); css->cgroup = dcgrp; WARN_ON(!list_empty(&dcgrp->e_csets[ss->id])); list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id], e_cset_node[ss->id]) { list_move_tail(&cset->e_cset_node[ss->id], &dcgrp->e_csets[ss->id]); /* * all css_sets of scgrp together in same order to dcgrp, * patch in-flight iterators to preserve correct iteration. * since the iterator is always advanced right away and * finished when it->cset_pos meets it->cset_head, so only * update it->cset_head is enough here. */ list_for_each_entry(it, &cset->task_iters, iters_node) if (it->cset_head == &scgrp->e_csets[ss->id]) it->cset_head = &dcgrp->e_csets[ss->id]; } spin_unlock_irq(&css_set_lock); /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; if (dst_root == &cgrp_dfl_root) { static_branch_enable(cgroup_subsys_on_dfl_key[ssid]); } else { dcgrp->subtree_control |= 1 << ssid; static_branch_disable(cgroup_subsys_on_dfl_key[ssid]); } ret = cgroup_apply_control(dcgrp); if (ret) pr_warn("partial failure to rebind %s controller (err=%d)\n", ss->name, ret); if (ss->bind) ss->bind(css); } while_each_subsys_mask(); kernfs_activate(dcgrp->kn); return 0; } int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, struct kernfs_root *kf_root) { int len = 0; char *buf = NULL; struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root); struct cgroup *ns_cgroup; buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) return -ENOMEM; spin_lock_irq(&css_set_lock); ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot); len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); spin_unlock_irq(&css_set_lock); if (len == -E2BIG) len = -ERANGE; else if (len > 0) { seq_escape(sf, buf, " \t\n\\"); len = 0; } kfree(buf); return len; } enum cgroup2_param { Opt_nsdelegate, Opt_favordynmods, Opt_memory_localevents, Opt_memory_recursiveprot, Opt_memory_hugetlb_accounting, Opt_pids_localevents, nr__cgroup2_params }; static const struct fs_parameter_spec cgroup2_fs_parameters[] = { fsparam_flag("nsdelegate", Opt_nsdelegate), fsparam_flag("favordynmods", Opt_favordynmods), fsparam_flag("memory_localevents", Opt_memory_localevents), fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot), fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting), fsparam_flag("pids_localevents", Opt_pids_localevents), {} }; static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct fs_parse_result result; int opt; opt = fs_parse(fc, cgroup2_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_nsdelegate: ctx->flags |= CGRP_ROOT_NS_DELEGATE; return 0; case Opt_favordynmods: ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; return 0; case 
Opt_memory_localevents: ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; return 0; case Opt_memory_recursiveprot: ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; return 0; case Opt_memory_hugetlb_accounting: ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; return 0; case Opt_pids_localevents: ctx->flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS; return 0; } return -EINVAL; } struct cgroup_of_peak *of_peak(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; return &ctx->peak; } static void apply_cgroup_root_flags(unsigned int root_flags) { if (current->nsproxy->cgroup_ns == &init_cgroup_ns) { if (root_flags & CGRP_ROOT_NS_DELEGATE) cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; else cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; cgroup_favor_dynmods(&cgrp_dfl_root, root_flags & CGRP_ROOT_FAVOR_DYNMODS); if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; else cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS; if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; else cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT; if (root_flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; else cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; if (root_flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) cgrp_dfl_root.flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS; else cgrp_dfl_root.flags &= ~CGRP_ROOT_PIDS_LOCAL_EVENTS; } } static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) seq_puts(seq, ",nsdelegate"); if (cgrp_dfl_root.flags & CGRP_ROOT_FAVOR_DYNMODS) seq_puts(seq, ",favordynmods"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) seq_puts(seq, ",memory_localevents"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT) seq_puts(seq, ",memory_recursiveprot"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING) seq_puts(seq, ",memory_hugetlb_accounting"); if (cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) seq_puts(seq, ",pids_localevents"); return 0; } static int cgroup_reconfigure(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); apply_cgroup_root_flags(ctx->flags); return 0; } static void init_cgroup_housekeeping(struct cgroup *cgrp) { struct cgroup_subsys *ss; int ssid; INIT_LIST_HEAD(&cgrp->self.sibling); INIT_LIST_HEAD(&cgrp->self.children); INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->self.cgroup = cgrp; cgrp->self.flags |= CSS_ONLINE; cgrp->dom_cgrp = cgrp; cgrp->max_descendants = INT_MAX; cgrp->max_depth = INT_MAX; prev_cputime_init(&cgrp->prev_cputime); for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); #ifdef CONFIG_CGROUP_BPF for (int i = 0; i < ARRAY_SIZE(cgrp->bpf.revisions); i++) cgrp->bpf.revisions[i] = 1; #endif init_waitqueue_head(&cgrp->offline_waitq); init_waitqueue_head(&cgrp->dying_populated_waitq); INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } void init_cgroup_root(struct cgroup_fs_context *ctx) { struct cgroup_root *root = ctx->root; struct cgroup *cgrp = &root->cgrp; INIT_LIST_HEAD_RCU(&root->root_list); atomic_set(&root->nr_cgrps, 1); cgrp->root = root; init_cgroup_housekeeping(cgrp); /* DYNMODS must be modified through cgroup_favor_dynmods() */ root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS; if (ctx->release_agent) strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX); if 
(ctx->name) strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN); if (ctx->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } int cgroup_setup_root(struct cgroup_root *root, u32 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; struct kernfs_syscall_ops *kf_sops; struct css_set *cset; int i, ret; lockdep_assert_held(&cgroup_mutex); ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) goto out; /* * We're accessing css_set_count without locking css_set_lock here, * but that's OK - it can only be increased by someone holding * cgroup_lock, and that's us. Later rebinding may disable * controllers on the default hierarchy and thus create new csets, * which can't be more than the existing ones. Allocate 2x. */ ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links); if (ret) goto cancel_ref; ret = cgroup_init_root_id(root); if (ret) goto cancel_ref; kf_sops = root == &cgrp_dfl_root ? &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops; root->kf_root = kernfs_create_root(kf_sops, KERNFS_ROOT_CREATE_DEACTIVATED | KERNFS_ROOT_SUPPORT_EXPORTOP | KERNFS_ROOT_SUPPORT_USER_XATTR | KERNFS_ROOT_INVARIANT_PARENT, root_cgrp); if (IS_ERR(root->kf_root)) { ret = PTR_ERR(root->kf_root); goto exit_root_id; } root_cgrp->kn = kernfs_root_to_node(root->kf_root); WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1); root_cgrp->ancestors[0] = root_cgrp; ret = css_populate_dir(&root_cgrp->self); if (ret) goto destroy_root; ret = css_rstat_init(&root_cgrp->self); if (ret) goto destroy_root; ret = rebind_subsystems(root, ss_mask); if (ret) goto exit_stats; ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier, CGROUP_LIFETIME_ONLINE, root_cgrp); WARN_ON_ONCE(notifier_to_errno(ret)); trace_cgroup_setup_root(root); /* * There must be no failure case after here, since rebinding takes * care of subsystems' refcounts, which are explicitly dropped in * the failure exit path. */ list_add_rcu(&root->root_list, &cgroup_roots); cgroup_root_count++; /* * Link the root cgroup in this hierarchy into all the css_set * objects. */ spin_lock_irq(&css_set_lock); hash_for_each(css_set_table, i, cset, hlist) { link_css_set(&tmp_links, cset, root_cgrp); if (css_set_populated(cset)) cgroup_update_populated(root_cgrp, true); } spin_unlock_irq(&css_set_lock); BUG_ON(!list_empty(&root_cgrp->self.children)); BUG_ON(atomic_read(&root->nr_cgrps) != 1); ret = 0; goto out; exit_stats: css_rstat_exit(&root_cgrp->self); destroy_root: kernfs_destroy_root(root->kf_root); root->kf_root = NULL; exit_root_id: cgroup_exit_root_id(root); cancel_ref: percpu_ref_exit(&root_cgrp->self.refcnt); out: free_cgrp_cset_links(&tmp_links); return ret; } int cgroup_do_get_tree(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); int ret; ctx->kfc.root = ctx->root->kf_root; if (fc->fs_type == &cgroup2_fs_type) ctx->kfc.magic = CGROUP2_SUPER_MAGIC; else ctx->kfc.magic = CGROUP_SUPER_MAGIC; ret = kernfs_get_tree(fc); /* * In non-init cgroup namespace, instead of root cgroup's dentry, * we return the dentry corresponding to the cgroupns->root_cgrp. 
*/ if (!ret && ctx->ns != &init_cgroup_ns) { struct dentry *nsdentry; struct super_block *sb = fc->root->d_sb; struct cgroup *cgrp; cgroup_lock(); spin_lock_irq(&css_set_lock); cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root); spin_unlock_irq(&css_set_lock); cgroup_unlock(); nsdentry = kernfs_node_dentry(cgrp->kn, sb); dput(fc->root); if (IS_ERR(nsdentry)) { deactivate_locked_super(sb); ret = PTR_ERR(nsdentry); nsdentry = NULL; } fc->root = nsdentry; } if (!ctx->kfc.new_sb_created) cgroup_put(&ctx->root->cgrp); return ret; } /* * Destroy a cgroup filesystem context. */ static void cgroup_fs_context_free(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); kfree(ctx->name); kfree(ctx->release_agent); put_cgroup_ns(ctx->ns); kernfs_free_fs_context(fc); kfree(ctx); } static int cgroup_get_tree(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); int ret; WRITE_ONCE(cgrp_dfl_visible, true); cgroup_get_live(&cgrp_dfl_root.cgrp); ctx->root = &cgrp_dfl_root; ret = cgroup_do_get_tree(fc); if (!ret) apply_cgroup_root_flags(ctx->flags); return ret; } static const struct fs_context_operations cgroup_fs_context_ops = { .free = cgroup_fs_context_free, .parse_param = cgroup2_parse_param, .get_tree = cgroup_get_tree, .reconfigure = cgroup_reconfigure, }; static const struct fs_context_operations cgroup1_fs_context_ops = { .free = cgroup_fs_context_free, .parse_param = cgroup1_parse_param, .get_tree = cgroup1_get_tree, .reconfigure = cgroup1_reconfigure, }; /* * Initialise the cgroup filesystem creation/reconfiguration context. Notably, * we select the namespace we're going to use. */ static int cgroup_init_fs_context(struct fs_context *fc) { struct cgroup_fs_context *ctx; ctx = kzalloc_obj(struct cgroup_fs_context); if (!ctx) return -ENOMEM; ctx->ns = current->nsproxy->cgroup_ns; get_cgroup_ns(ctx->ns); fc->fs_private = &ctx->kfc; if (fc->fs_type == &cgroup2_fs_type) fc->ops = &cgroup_fs_context_ops; else fc->ops = &cgroup1_fs_context_ops; put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(ctx->ns->user_ns); fc->global = true; if (have_favordynmods) ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; return 0; } static void cgroup_kill_sb(struct super_block *sb) { struct kernfs_root *kf_root = kernfs_root_from_sb(sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); /* * If @root doesn't have any children, start killing it. * This prevents new mounts by disabling percpu_ref_tryget_live(). * * And don't kill the default root. 
*/ if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root && !percpu_ref_is_dying(&root->cgrp.self.refcnt)) percpu_ref_kill(&root->cgrp.self.refcnt); cgroup_put(&root->cgrp); kernfs_kill_sb(sb); } struct file_system_type cgroup_fs_type = { .name = "cgroup", .init_fs_context = cgroup_init_fs_context, .parameters = cgroup1_fs_parameters, .kill_sb = cgroup_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; static struct file_system_type cgroup2_fs_type = { .name = "cgroup2", .init_fs_context = cgroup_init_fs_context, .parameters = cgroup2_fs_parameters, .kill_sb = cgroup_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; #ifdef CONFIG_CPUSETS_V1 enum cpuset_param { Opt_cpuset_v2_mode, }; static const struct fs_parameter_spec cpuset_fs_parameters[] = { fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode), {} }; static int cpuset_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct fs_parse_result result; int opt; opt = fs_parse(fc, cpuset_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_cpuset_v2_mode: ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE; return 0; } return -EINVAL; } static const struct fs_context_operations cpuset_fs_context_ops = { .get_tree = cgroup1_get_tree, .free = cgroup_fs_context_free, .parse_param = cpuset_parse_param, }; /* * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we * silently switch it to mount "cgroup" instead */ static int cpuset_init_fs_context(struct fs_context *fc) { char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER); struct cgroup_fs_context *ctx; int err; err = cgroup_init_fs_context(fc); if (err) { kfree(agent); return err; } fc->ops = &cpuset_fs_context_ops; ctx = cgroup_fc2context(fc); ctx->subsys_mask = 1 << cpuset_cgrp_id; ctx->flags |= CGRP_ROOT_NOPREFIX; ctx->release_agent = agent; get_filesystem(&cgroup_fs_type); put_filesystem(fc->fs_type); fc->fs_type = &cgroup_fs_type; return 0; } static struct file_system_type cpuset_fs_type = { .name = "cpuset", .init_fs_context = cpuset_init_fs_context, .parameters = cpuset_fs_parameters, .fs_flags = FS_USERNS_MOUNT, }; #endif int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns) { struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); } int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns) { int ret; cgroup_lock(); spin_lock_irq(&css_set_lock); ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns); spin_unlock_irq(&css_set_lock); cgroup_unlock(); return ret; } EXPORT_SYMBOL_GPL(cgroup_path_ns); /** * cgroup_attach_lock - Lock for ->attach() * @lock_mode: whether acquire and acquire which rwsem * @tsk: thread group to lock * * cgroup migration sometimes needs to stabilize threadgroups against forks and * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach() * implementations (e.g. cpuset), also need to disable CPU hotplug. * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can * lead to deadlocks. * * Bringing up a CPU may involve creating and destroying tasks which requires * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside * cpus_read_lock(). 
If we call an ->attach() which acquires the cpus lock while * write-locking threadgroup_rwsem, the locking order is reversed and we end up * waiting for an on-going CPU hotplug operation which in turn is waiting for * the threadgroup_rwsem to be released to create new tasks. For more details: * * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu * * Resolve the situation by always acquiring cpus_read_lock() before optionally * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that * CPU hotplug is disabled on entry. * * When favordynmods is enabled, take per threadgroup rwsem to reduce overhead * on dynamic cgroup modifications. see the comment above * CGRP_ROOT_FAVOR_DYNMODS definition. * * tsk is not NULL only when writing to cgroup.procs. */ void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode, struct task_struct *tsk) { cpus_read_lock(); switch (lock_mode) { case CGRP_ATTACH_LOCK_NONE: break; case CGRP_ATTACH_LOCK_GLOBAL: percpu_down_write(&cgroup_threadgroup_rwsem); break; case CGRP_ATTACH_LOCK_PER_THREADGROUP: down_write(&tsk->signal->cgroup_threadgroup_rwsem); break; default: pr_warn("cgroup: Unexpected attach lock mode."); break; } } /** * cgroup_attach_unlock - Undo cgroup_attach_lock() * @lock_mode: whether release and release which rwsem * @tsk: thread group to lock */ void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode, struct task_struct *tsk) { switch (lock_mode) { case CGRP_ATTACH_LOCK_NONE: break; case CGRP_ATTACH_LOCK_GLOBAL: percpu_up_write(&cgroup_threadgroup_rwsem); break; case CGRP_ATTACH_LOCK_PER_THREADGROUP: up_write(&tsk->signal->cgroup_threadgroup_rwsem); break; default: pr_warn("cgroup: Unexpected attach lock mode."); break; } cpus_read_unlock(); } /** * cgroup_migrate_add_task - add a migration target task to a migration context * @task: target task * @mgctx: target migration context * * Add @task, which is a migration target, to @mgctx->tset. This function * becomes noop if @task doesn't need to be migrated. @task's css_set * should have been added as a migration source and @task->cg_list will be * moved from the css_set's tasks list to mg_tasks one. */ static void cgroup_migrate_add_task(struct task_struct *task, struct cgroup_mgctx *mgctx) { struct css_set *cset; lockdep_assert_held(&css_set_lock); /* @task either already exited or can't exit until the end */ if (task->flags & PF_EXITING) return; /* cgroup_threadgroup_rwsem protects racing against forks */ WARN_ON_ONCE(list_empty(&task->cg_list)); cset = task_css_set(task); if (!cset->mg_src_cgrp) return; mgctx->tset.nr_tasks++; css_set_skip_task_iters(cset, task); list_move_tail(&task->cg_list, &cset->mg_tasks); if (list_empty(&cset->mg_node)) list_add_tail(&cset->mg_node, &mgctx->tset.src_csets); if (list_empty(&cset->mg_dst_cset->mg_node)) list_add_tail(&cset->mg_dst_cset->mg_node, &mgctx->tset.dst_csets); } /** * cgroup_taskset_first - reset taskset and return the first task * @tset: taskset of interest * @dst_cssp: output variable for the destination css * * @tset iteration is initialized and the first task is returned. 
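 *
 * Iteration sketch (hedged; this is the pattern behind the
 * cgroup_taskset_for_each() helper used by controller ->attach()
 * implementations):
 *
 *	struct cgroup_subsys_state *dst_css;
 *	struct task_struct *task;
 *
 *	for (task = cgroup_taskset_first(tset, &dst_css); task;
 *	     task = cgroup_taskset_next(tset, &dst_css)) {
 *		... act on @task bound for @dst_css ...
 *	}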
*/ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp) { tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node); tset->cur_task = NULL; return cgroup_taskset_next(tset, dst_cssp); } /** * cgroup_taskset_next - iterate to the next task in taskset * @tset: taskset of interest * @dst_cssp: output variable for the destination css * * Return the next task in @tset. Iteration must have been initialized * with cgroup_taskset_first(). */ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp) { struct css_set *cset = tset->cur_cset; struct task_struct *task = tset->cur_task; while (CGROUP_HAS_SUBSYS_CONFIG && &cset->mg_node != tset->csets) { if (!task) task = list_first_entry(&cset->mg_tasks, struct task_struct, cg_list); else task = list_next_entry(task, cg_list); if (&task->cg_list != &cset->mg_tasks) { tset->cur_cset = cset; tset->cur_task = task; /* * This function may be called both before and * after cgroup_migrate_execute(). The two cases * can be distinguished by looking at whether @cset * has its ->mg_dst_cset set. */ if (cset->mg_dst_cset) *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid]; else *dst_cssp = cset->subsys[tset->ssid]; return task; } cset = list_next_entry(cset, mg_node); task = NULL; } return NULL; } /** * cgroup_migrate_execute - migrate a taskset * @mgctx: migration context * * Migrate tasks in @mgctx as setup by migration preparation functions. * This function fails iff one of the ->can_attach callbacks fails and * guarantees that either all or none of the tasks in @mgctx are migrated. * @mgctx is consumed regardless of success. */ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) { struct cgroup_taskset *tset = &mgctx->tset; struct cgroup_subsys *ss; struct task_struct *task, *tmp_task; struct css_set *cset, *tmp_cset; int ssid, failed_ssid, ret; /* check that we can legitimately attach to the cgroup */ if (tset->nr_tasks) { do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ss->can_attach) { tset->ssid = ssid; ret = ss->can_attach(tset); if (ret) { failed_ssid = ssid; goto out_cancel_attach; } } } while_each_subsys_mask(); } /* * Now that we're guaranteed success, proceed to move all tasks to * the new cgroup. There are no failure cases after here, so this * is the commit point. */ spin_lock_irq(&css_set_lock); list_for_each_entry(cset, &tset->src_csets, mg_node) { list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) { struct css_set *from_cset = task_css_set(task); struct css_set *to_cset = cset->mg_dst_cset; get_css_set(to_cset); to_cset->nr_tasks++; css_set_move_task(task, from_cset, to_cset, true); from_cset->nr_tasks--; /* * If the source or destination cgroup is frozen, * the task might require to change its state. */ cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp, to_cset->dfl_cgrp); put_css_set_locked(from_cset); } } spin_unlock_irq(&css_set_lock); /* * Migration is committed, all target tasks are now on dst_csets. * Nothing is sensitive to fork() after this point. Notify * controllers that migration is complete. 
*/ tset->csets = &tset->dst_csets; if (tset->nr_tasks) { do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ss->attach) { tset->ssid = ssid; ss->attach(tset); } } while_each_subsys_mask(); } ret = 0; goto out_release_tset; out_cancel_attach: if (tset->nr_tasks) { do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ssid == failed_ssid) break; if (ss->cancel_attach) { tset->ssid = ssid; ss->cancel_attach(tset); } } while_each_subsys_mask(); } out_release_tset: spin_lock_irq(&css_set_lock); list_splice_init(&tset->dst_csets, &tset->src_csets); list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) { list_splice_tail_init(&cset->mg_tasks, &cset->tasks); list_del_init(&cset->mg_node); } spin_unlock_irq(&css_set_lock); /* * Re-initialize the cgroup_taskset structure in case it is reused * again in another cgroup_migrate_add_task()/cgroup_migrate_execute() * iteration. */ tset->nr_tasks = 0; tset->csets = &tset->src_csets; return ret; } /** * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination * @dst_cgrp: destination cgroup to test * * On the default hierarchy, except for the mixable, (possible) thread root * and threaded cgroups, subtree_control must be zero for migration * destination cgroups with tasks so that child cgroups don't compete * against tasks. */ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp) { /* v1 doesn't have any restriction */ if (!cgroup_on_dfl(dst_cgrp)) return 0; /* verify @dst_cgrp can host resources */ if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp)) return -EOPNOTSUPP; /* * If @dst_cgrp is already or can become a thread root or is * threaded, it doesn't matter. */ if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp)) return 0; /* apply no-internal-process constraint */ if (dst_cgrp->subtree_control) return -EBUSY; return 0; } /** * cgroup_migrate_finish - cleanup after attach * @mgctx: migration context * * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See * those functions for details. */ void cgroup_migrate_finish(struct cgroup_mgctx *mgctx) { struct css_set *cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets, mg_src_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_src_preload_node); put_css_set_locked(cset); } list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets, mg_dst_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_dst_preload_node); put_css_set_locked(cset); } spin_unlock_irq(&css_set_lock); } /** * cgroup_migrate_add_src - add a migration source css_set * @src_cset: the source css_set to add * @dst_cgrp: the destination cgroup * @mgctx: migration context * * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin * @src_cset and add it to @mgctx->src_csets, which should later be cleaned * up by cgroup_migrate_finish(). * * This function may be called without holding cgroup_threadgroup_rwsem * even if the target is a process. Threads may be created and destroyed * but as long as cgroup_mutex is not dropped, no new css_set can be put * into play and the preloaded css_sets are guaranteed to cover all * migrations. 
*/ void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, struct cgroup_mgctx *mgctx) { struct cgroup *src_cgrp; lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); /* * If ->dead, @src_set is associated with one or more dead cgroups * and doesn't contain any migratable tasks. Ignore it early so * that the rest of migration path doesn't get confused by it. */ if (src_cset->dead) return; if (!list_empty(&src_cset->mg_src_preload_node)) return; src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); WARN_ON(src_cset->mg_src_cgrp); WARN_ON(src_cset->mg_dst_cgrp); WARN_ON(!list_empty(&src_cset->mg_tasks)); WARN_ON(!list_empty(&src_cset->mg_node)); src_cset->mg_src_cgrp = src_cgrp; src_cset->mg_dst_cgrp = dst_cgrp; get_css_set(src_cset); list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets); } /** * cgroup_migrate_prepare_dst - prepare destination css_sets for migration * @mgctx: migration context * * Tasks are about to be moved and all the source css_sets have been * preloaded to @mgctx->preloaded_src_csets. This function looks up and * pins all destination css_sets, links each to its source, and append them * to @mgctx->preloaded_dst_csets. * * This function must be called after cgroup_migrate_add_src() has been * called on each migration source css_set. After migration is performed * using cgroup_migrate(), cgroup_migrate_finish() must be called on * @mgctx. */ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) { struct css_set *src_cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); /* look up the dst cset for each src cset and link it to src */ list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets, mg_src_preload_node) { struct css_set *dst_cset; struct cgroup_subsys *ss; int ssid; dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); if (!dst_cset) return -ENOMEM; WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); /* * If src cset equals dst, it's noop. Drop the src. * cgroup_migrate() will skip the cset too. Note that we * can't handle src == dst as some nodes are used by both. */ if (src_cset == dst_cset) { src_cset->mg_src_cgrp = NULL; src_cset->mg_dst_cgrp = NULL; list_del_init(&src_cset->mg_src_preload_node); put_css_set(src_cset); put_css_set(dst_cset); continue; } src_cset->mg_dst_cset = dst_cset; if (list_empty(&dst_cset->mg_dst_preload_node)) list_add_tail(&dst_cset->mg_dst_preload_node, &mgctx->preloaded_dst_csets); else put_css_set(dst_cset); for_each_subsys(ss, ssid) if (src_cset->subsys[ssid] != dst_cset->subsys[ssid]) mgctx->ss_mask |= 1 << ssid; } return 0; } /** * cgroup_migrate - migrate a process or task to a cgroup * @leader: the leader of the process or the task to migrate * @threadgroup: whether @leader points to the whole process or a single task * @mgctx: migration context * * Migrate a process or task denoted by @leader. If migrating a process, * the caller must be holding cgroup_threadgroup_rwsem. The caller is also * responsible for invoking cgroup_migrate_add_src() and * cgroup_migrate_prepare_dst() on the targets before invoking this * function and following up with cgroup_migrate_finish(). * * As long as a controller's ->can_attach() doesn't fail, this function is * guaranteed to succeed. This means that, excluding ->can_attach() * failure, when migrating multiple targets, the success or failure can be * decided for all targets by invoking group_migrate_prepare_dst() before * actually starting migrating. 
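 *
 * The canonical call sequence, mirroring cgroup_attach_task() below (sketch):
 *
 *	DEFINE_CGROUP_MGCTX(mgctx);
 *
 *	spin_lock_irq(&css_set_lock);
 *	cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
 *	spin_unlock_irq(&css_set_lock);
 *
 *	ret = cgroup_migrate_prepare_dst(&mgctx);
 *	if (!ret)
 *		ret = cgroup_migrate(task, threadgroup, &mgctx);
 *	cgroup_migrate_finish(&mgctx);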
*/ int cgroup_migrate(struct task_struct *leader, bool threadgroup, struct cgroup_mgctx *mgctx) { struct task_struct *task; /* * The following thread iteration should be inside an RCU critical * section to prevent tasks from being freed while taking the snapshot. * spin_lock_irq() implies RCU critical section here. */ spin_lock_irq(&css_set_lock); task = leader; do { cgroup_migrate_add_task(task, mgctx); if (!threadgroup) break; } while_each_thread(leader, task); spin_unlock_irq(&css_set_lock); return cgroup_migrate_execute(mgctx); } /** * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup * @dst_cgrp: the cgroup to attach to * @leader: the task or the leader of the threadgroup to be attached * @threadgroup: attach the whole threadgroup? * * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. */ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup) { DEFINE_CGROUP_MGCTX(mgctx); struct task_struct *task; int ret = 0; /* look up all src csets */ spin_lock_irq(&css_set_lock); task = leader; do { cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx); if (!threadgroup) break; } while_each_thread(leader, task); spin_unlock_irq(&css_set_lock); /* prepare dst csets and commit */ ret = cgroup_migrate_prepare_dst(&mgctx); if (!ret) ret = cgroup_migrate(leader, threadgroup, &mgctx); cgroup_migrate_finish(&mgctx); if (!ret) TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup); return ret; } struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, enum cgroup_attach_lock_mode *lock_mode) { struct task_struct *tsk; pid_t pid; if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return ERR_PTR(-EINVAL); retry_find_task: rcu_read_lock(); if (pid) { tsk = find_task_by_vpid(pid); if (!tsk) { tsk = ERR_PTR(-ESRCH); goto out_unlock_rcu; } } else { tsk = current; } if (threadgroup) tsk = tsk->group_leader; /* * kthreads may acquire PF_NO_SETAFFINITY during initialization. * If userland migrates such a kthread to a non-root cgroup, it can * become trapped in a cpuset, or RT kthread may be born in a * cgroup with no rt_runtime allocated. Just say no. */ if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) { tsk = ERR_PTR(-EINVAL); goto out_unlock_rcu; } get_task_struct(tsk); rcu_read_unlock(); /* * If we migrate a single thread, we don't care about threadgroup * stability. If the thread is `current`, it won't exit(2) under our * hands or change PID through exec(2). We exclude * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write callers * by cgroup_mutex. Therefore, we can skip the global lock. */ lockdep_assert_held(&cgroup_mutex); if (pid || threadgroup) { if (cgroup_enable_per_threadgroup_rwsem) *lock_mode = CGRP_ATTACH_LOCK_PER_THREADGROUP; else *lock_mode = CGRP_ATTACH_LOCK_GLOBAL; } else { *lock_mode = CGRP_ATTACH_LOCK_NONE; } cgroup_attach_lock(*lock_mode, tsk); if (threadgroup) { if (!thread_group_leader(tsk)) { /* * A race with de_thread from another thread's exec() * may strip us of our leadership. If this happens, * throw this task away and try again. 
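 * Both the attach lock and the task reference taken above are
 * dropped before retrying so the retry starts from a clean slate.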
*/ cgroup_attach_unlock(*lock_mode, tsk); put_task_struct(tsk); goto retry_find_task; } } return tsk; out_unlock_rcu: rcu_read_unlock(); return tsk; } void cgroup_procs_write_finish(struct task_struct *task, enum cgroup_attach_lock_mode lock_mode) { cgroup_attach_unlock(lock_mode, task); /* release reference from cgroup_procs_write_start() */ put_task_struct(task); } static void cgroup_print_ss_mask(struct seq_file *seq, u32 ss_mask) { struct cgroup_subsys *ss; bool printed = false; int ssid; do_each_subsys_mask(ss, ssid, ss_mask) { if (printed) seq_putc(seq, ' '); seq_puts(seq, ss->name); printed = true; } while_each_subsys_mask(); if (printed) seq_putc(seq, '\n'); } /* show controllers which are enabled from the parent */ static int cgroup_controllers_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; cgroup_print_ss_mask(seq, cgroup_control(cgrp)); return 0; } /* show controllers which are enabled for a given cgroup's children */ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; cgroup_print_ss_mask(seq, cgrp->subtree_control); return 0; } /** * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy * @cgrp: root of the subtree to update csses for * * @cgrp's control masks have changed and its subtree's css associations * need to be updated accordingly. This function looks up all css_sets * which are attached to the subtree, creates the matching updated css_sets * and migrates the tasks to the new ones. */ static int cgroup_update_dfl_csses(struct cgroup *cgrp) { DEFINE_CGROUP_MGCTX(mgctx); struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; enum cgroup_attach_lock_mode lock_mode; bool has_tasks; int ret; lockdep_assert_held(&cgroup_mutex); /* look up all csses currently attached to @cgrp's subtree */ spin_lock_irq(&css_set_lock); cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { struct cgrp_cset_link *link; /* * As cgroup_update_dfl_csses() is only called by * cgroup_apply_control(). The csses associated with the * given cgrp will not be affected by changes made to * its subtree_control file. We can skip them. */ if (dsct == cgrp) continue; list_for_each_entry(link, &dsct->cset_links, cset_link) cgroup_migrate_add_src(link->cset, dsct, &mgctx); } spin_unlock_irq(&css_set_lock); /* * We need to write-lock threadgroup_rwsem while migrating tasks. * However, if there are no source csets for @cgrp, changing its * controllers isn't gonna produce any task migrations and the * write-locking can be skipped safely. 
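 * (CGRP_ATTACH_LOCK_NONE still takes cpus_read_lock() via
 * cgroup_attach_lock(); only the rwsem write-locking is skipped.)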
*/ has_tasks = !list_empty(&mgctx.preloaded_src_csets); if (has_tasks) lock_mode = CGRP_ATTACH_LOCK_GLOBAL; else lock_mode = CGRP_ATTACH_LOCK_NONE; cgroup_attach_lock(lock_mode, NULL); /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&mgctx); if (ret) goto out_finish; spin_lock_irq(&css_set_lock); list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_src_preload_node) { struct task_struct *task, *ntask; /* all tasks in src_csets need to be migrated */ list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) cgroup_migrate_add_task(task, &mgctx); } spin_unlock_irq(&css_set_lock); ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); cgroup_attach_unlock(lock_mode, NULL); return ret; } /** * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses * @cgrp: root of the target subtree * * Because css offlining is asynchronous, userland may try to re-enable a * controller while the previous css is still around. This function grabs * cgroup_mutex and drains the previous css instances of @cgrp's subtree. */ void cgroup_lock_and_drain_offline(struct cgroup *cgrp) __acquires(&cgroup_mutex) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid; restart: cgroup_lock(); cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); DEFINE_WAIT(wait); if (!css || !percpu_ref_is_dying(&css->refcnt)) continue; cgroup_get_live(dsct); prepare_to_wait(&dsct->offline_waitq, &wait, TASK_UNINTERRUPTIBLE); cgroup_unlock(); schedule(); finish_wait(&dsct->offline_waitq, &wait); cgroup_put(dsct); goto restart; } } } /** * cgroup_save_control - save control masks and dom_cgrp of a subtree * @cgrp: root of the target subtree * * Save ->subtree_control, ->subtree_ss_mask and ->dom_cgrp to the * respective old_ prefixed fields for @cgrp's subtree including @cgrp * itself. */ static void cgroup_save_control(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { dsct->old_subtree_control = dsct->subtree_control; dsct->old_subtree_ss_mask = dsct->subtree_ss_mask; dsct->old_dom_cgrp = dsct->dom_cgrp; } } /** * cgroup_propagate_control - refresh control masks of a subtree * @cgrp: root of the target subtree * * For @cgrp and its subtree, ensure ->subtree_ss_mask matches * ->subtree_control and propagate controller availability through the * subtree so that descendants don't have unavailable controllers enabled. */ static void cgroup_propagate_control(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { dsct->subtree_control &= cgroup_control(dsct); dsct->subtree_ss_mask = cgroup_calc_subtree_ss_mask(dsct->subtree_control, cgroup_ss_mask(dsct)); } } /** * cgroup_restore_control - restore control masks and dom_cgrp of a subtree * @cgrp: root of the target subtree * * Restore ->subtree_control, ->subtree_ss_mask and ->dom_cgrp from the * respective old_ prefixed fields for @cgrp's subtree including @cgrp * itself. 
*/ static void cgroup_restore_control(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { dsct->subtree_control = dsct->old_subtree_control; dsct->subtree_ss_mask = dsct->old_subtree_ss_mask; dsct->dom_cgrp = dsct->old_dom_cgrp; } } static bool css_visible(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; if (cgroup_control(cgrp) & (1 << ss->id)) return true; if (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) return false; return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl; } /** * cgroup_apply_control_enable - enable or show csses according to control * @cgrp: root of the target subtree * * Walk @cgrp's subtree and create new csses or make the existing ones * visible. A css is created invisible if it's being implicitly enabled * through dependency. An invisible css is made visible when the userland * explicitly enables it. * * Returns 0 on success, -errno on failure. On failure, csses which have * been processed already aren't cleaned up. The caller is responsible for * cleaning up with cgroup_apply_control_disable(). */ static int cgroup_apply_control_enable(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid, ret; cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) continue; if (!css) { css = css_create(dsct, ss); if (IS_ERR(css)) return PTR_ERR(css); } WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt)); if (css_visible(css)) { ret = css_populate_dir(css); if (ret) return ret; } } } return 0; } /** * cgroup_apply_control_disable - kill or hide csses according to control * @cgrp: root of the target subtree * * Walk @cgrp's subtree and kill and hide csses so that they match * cgroup_ss_mask() and cgroup_visible_mask(). * * A css is hidden when the userland requests it to be disabled while other * subsystems are still depending on it. The css must not actively control * resources and be in the vanilla state if it's made visible again later. * Controllers which may be depended upon should provide ->css_reset() for * this purpose. */ static void cgroup_apply_control_disable(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid; cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); if (!css) continue; WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt)); if (css->parent && !(cgroup_ss_mask(dsct) & (1 << ss->id))) { kill_css(css); } else if (!css_visible(css)) { css_clear_dir(css); if (ss->css_reset) ss->css_reset(css); } } } } /** * cgroup_apply_control - apply control mask updates to the subtree * @cgrp: root of the target subtree * * subsystems can be enabled and disabled in a subtree using the following * steps. * * 1. Call cgroup_save_control() to stash the current state. * 2. Update ->subtree_control masks in the subtree as desired. * 3. Call cgroup_apply_control() to apply the changes. * 4. Optionally perform other related operations. * 5. Call cgroup_finalize_control() to finish up. * * This function implements step 3 and propagates the mask changes * throughout @cgrp's subtree, updates csses accordingly and perform * process migrations. 
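 *
 * cgroup_subtree_control_write() below is the canonical user; roughly
 * (sketch):
 *
 *	cgroup_save_control(cgrp);
 *	cgrp->subtree_control |= enable;
 *	cgrp->subtree_control &= ~disable;
 *	ret = cgroup_apply_control(cgrp);
 *	cgroup_finalize_control(cgrp, ret);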
*/ static int cgroup_apply_control(struct cgroup *cgrp) { int ret; cgroup_propagate_control(cgrp); ret = cgroup_apply_control_enable(cgrp); if (ret) return ret; /* * At this point, cgroup_e_css_by_mask() results reflect the new csses * making the following cgroup_update_dfl_csses() properly update * css associations of all tasks in the subtree. */ return cgroup_update_dfl_csses(cgrp); } /** * cgroup_finalize_control - finalize control mask update * @cgrp: root of the target subtree * @ret: the result of the update * * Finalize control mask update. See cgroup_apply_control() for more info. */ static void cgroup_finalize_control(struct cgroup *cgrp, int ret) { if (ret) { cgroup_restore_control(cgrp); cgroup_propagate_control(cgrp); } cgroup_apply_control_disable(cgrp); } static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u32 enable) { u32 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask; /* if nothing is getting enabled, nothing to worry about */ if (!enable) return 0; /* can @cgrp host any resources? */ if (!cgroup_is_valid_domain(cgrp->dom_cgrp)) return -EOPNOTSUPP; /* mixables don't care */ if (cgroup_is_mixable(cgrp)) return 0; if (domain_enable) { /* can't enable domain controllers inside a thread subtree */ if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp)) return -EOPNOTSUPP; } else { /* * Threaded controllers can handle internal competitions * and are always allowed inside a (prospective) thread * subtree. */ if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp)) return 0; } /* * Controllers can't be enabled for a cgroup with tasks to avoid * child cgroups competing against tasks. */ if (cgroup_has_tasks(cgrp)) return -EBUSY; return 0; } /* change the enabled child controllers for a cgroup in the default hierarchy */ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { u32 enable = 0, disable = 0; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; int ssid, ret; /* * Parse input - space separated list of subsystem names prefixed * with either + or -. */ buf = strstrip(buf); while ((tok = strsep(&buf, " "))) { if (tok[0] == '\0') continue; do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) { if (!cgroup_ssid_enabled(ssid) || strcmp(tok + 1, ss->name)) continue; if (*tok == '+') { enable |= 1 << ssid; disable &= ~(1 << ssid); } else if (*tok == '-') { disable |= 1 << ssid; enable &= ~(1 << ssid); } else { return -EINVAL; } break; } while_each_subsys_mask(); if (ssid == CGROUP_SUBSYS_COUNT) return -EINVAL; } cgrp = cgroup_kn_lock_live(of->kn, true); if (!cgrp) return -ENODEV; for_each_subsys(ss, ssid) { if (enable & (1 << ssid)) { if (cgrp->subtree_control & (1 << ssid)) { enable &= ~(1 << ssid); continue; } if (!(cgroup_control(cgrp) & (1 << ssid))) { ret = -ENOENT; goto out_unlock; } } else if (disable & (1 << ssid)) { if (!(cgrp->subtree_control & (1 << ssid))) { disable &= ~(1 << ssid); continue; } /* a child has it enabled? 
*/ cgroup_for_each_live_child(child, cgrp) { if (child->subtree_control & (1 << ssid)) { ret = -EBUSY; goto out_unlock; } } } } if (!enable && !disable) { ret = 0; goto out_unlock; } ret = cgroup_vet_subtree_control_enable(cgrp, enable); if (ret) goto out_unlock; /* save and update control masks and prepare csses */ cgroup_save_control(cgrp); cgrp->subtree_control |= enable; cgrp->subtree_control &= ~disable; ret = cgroup_apply_control(cgrp); cgroup_finalize_control(cgrp, ret); if (ret) goto out_unlock; kernfs_activate(cgrp->kn); out_unlock: cgroup_kn_unlock(of->kn); return ret ?: nbytes; } /** * cgroup_enable_threaded - make @cgrp threaded * @cgrp: the target cgroup * * Called when "threaded" is written to the cgroup.type interface file and * tries to make @cgrp threaded and join the parent's resource domain. * This function is never called on the root cgroup as cgroup.type doesn't * exist on it. */ static int cgroup_enable_threaded(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup *dom_cgrp = parent->dom_cgrp; struct cgroup *dsct; struct cgroup_subsys_state *d_css; int ret; lockdep_assert_held(&cgroup_mutex); /* noop if already threaded */ if (cgroup_is_threaded(cgrp)) return 0; /* * If @cgroup is populated or has domain controllers enabled, it * can't be switched. While the below cgroup_can_be_thread_root() * test can catch the same conditions, that's only when @parent is * not mixable, so let's check it explicitly. */ if (cgroup_is_populated(cgrp) || cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask) return -EOPNOTSUPP; /* we're joining the parent's domain, ensure its validity */ if (!cgroup_is_valid_domain(dom_cgrp) || !cgroup_can_be_thread_root(dom_cgrp)) return -EOPNOTSUPP; /* * The following shouldn't cause actual migrations and should * always succeed. 
*/ cgroup_save_control(cgrp); cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) if (dsct == cgrp || cgroup_is_threaded(dsct)) dsct->dom_cgrp = dom_cgrp; ret = cgroup_apply_control(cgrp); if (!ret) parent->nr_threaded_children++; cgroup_finalize_control(cgrp, ret); return ret; } static int cgroup_type_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; if (cgroup_is_threaded(cgrp)) seq_puts(seq, "threaded\n"); else if (!cgroup_is_valid_domain(cgrp)) seq_puts(seq, "domain invalid\n"); else if (cgroup_is_thread_root(cgrp)) seq_puts(seq, "domain threaded\n"); else seq_puts(seq, "domain\n"); return 0; } static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; int ret; /* only switching to threaded mode is supported */ if (strcmp(strstrip(buf), "threaded")) return -EINVAL; /* drain dying csses before we re-apply (threaded) subtree control */ cgrp = cgroup_kn_lock_live(of->kn, true); if (!cgrp) return -ENOENT; /* threaded can only be enabled */ ret = cgroup_enable_threaded(cgrp); cgroup_kn_unlock(of->kn); return ret ?: nbytes; } static int cgroup_max_descendants_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; int descendants = READ_ONCE(cgrp->max_descendants); if (descendants == INT_MAX) seq_puts(seq, "max\n"); else seq_printf(seq, "%d\n", descendants); return 0; } static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; int descendants; ssize_t ret; buf = strstrip(buf); if (!strcmp(buf, "max")) { descendants = INT_MAX; } else { ret = kstrtoint(buf, 0, &descendants); if (ret) return ret; } if (descendants < 0) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; cgrp->max_descendants = descendants; cgroup_kn_unlock(of->kn); return nbytes; } static int cgroup_max_depth_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; int depth = READ_ONCE(cgrp->max_depth); if (depth == INT_MAX) seq_puts(seq, "max\n"); else seq_printf(seq, "%d\n", depth); return 0; } static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; ssize_t ret; int depth; buf = strstrip(buf); if (!strcmp(buf, "max")) { depth = INT_MAX; } else { ret = kstrtoint(buf, 0, &depth); if (ret) return ret; } if (depth < 0) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; cgrp->max_depth = depth; cgroup_kn_unlock(of->kn); return nbytes; } static int cgroup_events_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp)); seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags)); return 0; } static int cgroup_stat_show(struct seq_file *seq, void *v) { struct cgroup *cgroup = seq_css(seq)->cgroup; struct cgroup_subsys_state *css; int dying_cnt[CGROUP_SUBSYS_COUNT]; int ssid; seq_printf(seq, "nr_descendants %d\n", cgroup->nr_descendants); /* * Show the number of live and dying csses associated with each of * non-inhibited cgroup subsystems that is bound to cgroup v2. * * Without proper lock protection, racing is possible. So the * numbers may not be consistent when that happens. 
*/ rcu_read_lock(); for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) { dying_cnt[ssid] = -1; if ((BIT(ssid) & cgrp_dfl_inhibit_ss_mask) || (cgroup_subsys[ssid]->root != &cgrp_dfl_root)) continue; css = rcu_dereference_raw(cgroup->subsys[ssid]); dying_cnt[ssid] = cgroup->nr_dying_subsys[ssid]; seq_printf(seq, "nr_subsys_%s %d\n", cgroup_subsys[ssid]->name, css ? (css->nr_descendants + 1) : 0); } seq_printf(seq, "nr_dying_descendants %d\n", cgroup->nr_dying_descendants); for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) { if (dying_cnt[ssid] >= 0) seq_printf(seq, "nr_dying_subsys_%s %d\n", cgroup_subsys[ssid]->name, dying_cnt[ssid]); } rcu_read_unlock(); return 0; } static int cgroup_core_local_stat_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; unsigned int sequence; u64 freeze_time; do { sequence = read_seqcount_begin(&cgrp->freezer.freeze_seq); freeze_time = cgrp->freezer.frozen_nsec; /* Add in current freezer interval if the cgroup is freezing. */ if (test_bit(CGRP_FREEZE, &cgrp->flags)) freeze_time += (ktime_get_ns() - cgrp->freezer.freeze_start_nsec); } while (read_seqcount_retry(&cgrp->freezer.freeze_seq, sequence)); do_div(freeze_time, NSEC_PER_USEC); seq_printf(seq, "frozen_usec %llu\n", freeze_time); return 0; } #ifdef CONFIG_CGROUP_SCHED /** * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest * * Find and get @cgrp's css associated with @ss. If the css doesn't exist * or is offline, %NULL is returned. */ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; rcu_read_lock(); css = cgroup_css(cgrp, ss); if (css && !css_tryget_online(css)) css = NULL; rcu_read_unlock(); return css; } static int cgroup_extra_stat_show(struct seq_file *seq, int ssid) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct cgroup_subsys *ss = cgroup_subsys[ssid]; struct cgroup_subsys_state *css; int ret; if (!ss->css_extra_stat_show) return 0; css = cgroup_tryget_css(cgrp, ss); if (!css) return 0; ret = ss->css_extra_stat_show(seq, css); css_put(css); return ret; } static int cgroup_local_stat_show(struct seq_file *seq, struct cgroup *cgrp, int ssid) { struct cgroup_subsys *ss = cgroup_subsys[ssid]; struct cgroup_subsys_state *css; int ret; if (!ss->css_local_stat_show) return 0; css = cgroup_tryget_css(cgrp, ss); if (!css) return 0; ret = ss->css_local_stat_show(seq, css); css_put(css); return ret; } #endif static int cpu_stat_show(struct seq_file *seq, void *v) { int ret = 0; cgroup_base_stat_cputime_show(seq); #ifdef CONFIG_CGROUP_SCHED ret = cgroup_extra_stat_show(seq, cpu_cgrp_id); #endif return ret; } static int cpu_local_stat_show(struct seq_file *seq, void *v) { struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; int ret = 0; #ifdef CONFIG_CGROUP_SCHED ret = cgroup_local_stat_show(seq, cgrp, cpu_cgrp_id); #endif return ret; } #ifdef CONFIG_PSI static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); return 
psi_show(seq, psi, PSI_CPU); } static ssize_t pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, enum psi_res res) { struct cgroup_file_ctx *ctx = of->priv; struct psi_trigger *new; struct cgroup *cgrp; struct psi_group *psi; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; cgroup_get(cgrp); cgroup_kn_unlock(of->kn); /* Allow only one trigger per file descriptor */ if (ctx->psi.trigger) { cgroup_put(cgrp); return -EBUSY; } psi = cgroup_psi(cgrp); new = psi_trigger_create(psi, buf, res, of->file, of); if (IS_ERR(new)) { cgroup_put(cgrp); return PTR_ERR(new); } smp_store_release(&ctx->psi.trigger, new); cgroup_put(cgrp); return nbytes; } static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return pressure_write(of, buf, nbytes, PSI_IO); } static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return pressure_write(of, buf, nbytes, PSI_MEM); } static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return pressure_write(of, buf, nbytes, PSI_CPU); } #ifdef CONFIG_IRQ_TIME_ACCOUNTING static int cgroup_irq_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_IRQ); } static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return pressure_write(of, buf, nbytes, PSI_IRQ); } #endif static int cgroup_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); seq_printf(seq, "%d\n", psi->enabled); return 0; } static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { ssize_t ret; int enable; struct cgroup *cgrp; struct psi_group *psi; ret = kstrtoint(strstrip(buf), 0, &enable); if (ret) return ret; if (enable < 0 || enable > 1) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; psi = cgroup_psi(cgrp); if (psi->enabled != enable) { int i; /* show or hide {cpu,memory,io,irq}.pressure files */ for (i = 0; i < NR_PSI_RESOURCES; i++) cgroup_file_show(&cgrp->psi_files[i], enable); psi->enabled = enable; if (enable) psi_cgroup_restart(psi); } cgroup_kn_unlock(of->kn); return nbytes; } static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, poll_table *pt) { struct cgroup_file_ctx *ctx = of->priv; return psi_trigger_poll(&ctx->psi.trigger, of->file, pt); } static void cgroup_pressure_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; psi_trigger_destroy(ctx->psi.trigger); } bool cgroup_psi_enabled(void) { if (static_branch_likely(&psi_disabled)) return false; return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0; } #else /* CONFIG_PSI */ bool cgroup_psi_enabled(void) { return false; } #endif /* CONFIG_PSI */ static int cgroup_freeze_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; seq_printf(seq, "%d\n", cgrp->freezer.freeze); return 0; } static ssize_t cgroup_freeze_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; ssize_t ret; int freeze; ret = kstrtoint(strstrip(buf), 0, &freeze); if (ret) return ret; if (freeze < 0 || freeze > 1) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; cgroup_freeze(cgrp, freeze); 
cgroup_kn_unlock(of->kn); return nbytes; } static void __cgroup_kill(struct cgroup *cgrp) { struct css_task_iter it; struct task_struct *task; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); cgrp->kill_seq++; spin_unlock_irq(&css_set_lock); css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it); while ((task = css_task_iter_next(&it))) { /* Ignore kernel threads here. */ if (task->flags & PF_KTHREAD) continue; /* Skip tasks that are already dying. */ if (__fatal_signal_pending(task)) continue; send_sig(SIGKILL, task, 0); } css_task_iter_end(&it); } static void cgroup_kill(struct cgroup *cgrp) { struct cgroup_subsys_state *css; struct cgroup *dsct; lockdep_assert_held(&cgroup_mutex); cgroup_for_each_live_descendant_pre(dsct, css, cgrp) __cgroup_kill(dsct); } static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { ssize_t ret = 0; int kill; struct cgroup *cgrp; ret = kstrtoint(strstrip(buf), 0, &kill); if (ret) return ret; if (kill != 1) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; /* * Killing is a process directed operation, i.e. the whole thread-group * is taken down so act like we do for cgroup.procs and only make this * writable in non-threaded cgroups. */ if (cgroup_is_threaded(cgrp)) ret = -EOPNOTSUPP; else cgroup_kill(cgrp); cgroup_kn_unlock(of->kn); return ret ?: nbytes; } static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of_cft(of); struct cgroup_file_ctx *ctx; int ret; ctx = kzalloc_obj(*ctx); if (!ctx) return -ENOMEM; ctx->ns = current->nsproxy->cgroup_ns; get_cgroup_ns(ctx->ns); of->priv = ctx; if (!cft->open) return 0; ret = cft->open(of); if (ret) { put_cgroup_ns(ctx->ns); kfree(ctx); } return ret; } static void cgroup_file_release(struct kernfs_open_file *of) { struct cftype *cft = of_cft(of); struct cgroup_file_ctx *ctx = of->priv; if (cft->release) cft->release(of); put_cgroup_ns(ctx->ns); kfree(ctx); of->priv = NULL; } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = kn_priv(of->kn); struct cftype *cft = of_cft(of); struct cgroup_subsys_state *css; int ret; if (!nbytes) return 0; /* * If namespaces are delegation boundaries, disallow writes to * files in an non-init namespace root from inside the namespace * except for the files explicitly marked delegatable - * eg. cgroup.procs, cgroup.threads and cgroup.subtree_control. */ if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && !(cft->flags & CFTYPE_NS_DELEGATABLE) && ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp) return -EPERM; if (cft->write) return cft->write(of, buf, nbytes, off); /* * kernfs guarantees that a file isn't deleted with operations in * flight, which means that the matching css is and stays alive and * doesn't need to be pinned. The RCU locking is not necessary * either. It's just for the convenience of using cgroup_css(). 
*/ rcu_read_lock(); css = cgroup_css(cgrp, cft->ss); rcu_read_unlock(); if (cft->write_u64) { unsigned long long v; ret = kstrtoull(buf, 0, &v); if (!ret) ret = cft->write_u64(css, cft, v); } else if (cft->write_s64) { long long v; ret = kstrtoll(buf, 0, &v); if (!ret) ret = cft->write_s64(css, cft, v); } else { ret = -EINVAL; } return ret ?: nbytes; } static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt) { struct cftype *cft = of_cft(of); if (cft->poll) return cft->poll(of, pt); return kernfs_generic_poll(of, pt); } static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) { return seq_cft(seq)->seq_start(seq, ppos); } static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) { return seq_cft(seq)->seq_next(seq, v, ppos); } static void cgroup_seqfile_stop(struct seq_file *seq, void *v) { if (seq_cft(seq)->seq_stop) seq_cft(seq)->seq_stop(seq, v); } static int cgroup_seqfile_show(struct seq_file *m, void *arg) { struct cftype *cft = seq_cft(m); struct cgroup_subsys_state *css = seq_css(m); if (cft->seq_show) return cft->seq_show(m, arg); if (cft->read_u64) seq_printf(m, "%llu\n", cft->read_u64(css, cft)); else if (cft->read_s64) seq_printf(m, "%lld\n", cft->read_s64(css, cft)); else return -EINVAL; return 0; } static struct kernfs_ops cgroup_kf_single_ops = { .atomic_write_len = PAGE_SIZE, .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, .poll = cgroup_file_poll, .seq_show = cgroup_seqfile_show, }; static struct kernfs_ops cgroup_kf_ops = { .atomic_write_len = PAGE_SIZE, .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, .poll = cgroup_file_poll, .seq_start = cgroup_seqfile_start, .seq_next = cgroup_seqfile_next, .seq_stop = cgroup_seqfile_stop, .seq_show = cgroup_seqfile_show, }; static void cgroup_file_notify_timer(struct timer_list *timer) { cgroup_file_notify(container_of(timer, struct cgroup_file, notify_timer)); } static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; struct lock_class_key *key = NULL; #ifdef CONFIG_DEBUG_LOCK_ALLOC key = &cft->lockdep_key; #endif kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), cgroup_file_mode(cft), current_fsuid(), current_fsgid(), 0, cft->kf_ops, cft, NULL, key); if (IS_ERR(kn)) return PTR_ERR(kn); if (cft->file_offset) { struct cgroup_file *cfile = (void *)css + cft->file_offset; timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0); spin_lock_init(&cfile->lock); cfile->kn = kn; } return 0; } /** * cgroup_addrm_files - add or remove files to a cgroup directory * @css: the target css * @cgrp: the target cgroup (usually css->cgroup) * @cfts: array of cftypes to be added * @is_add: whether to add or remove * * Depending on @is_add, add or remove files defined by @cfts on @cgrp. * For removals, this function never fails. */ static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add) { struct cftype *cft, *cft_end = NULL; int ret = 0; lockdep_assert_held(&cgroup_mutex); restart: for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? 
*/ if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) continue; if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp)) continue; if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) continue; if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp)) continue; if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug) continue; if (is_add) { ret = cgroup_add_file(css, cgrp, cft); if (ret) { pr_warn("%s: failed to add %s, err=%d\n", __func__, cft->name, ret); cft_end = cft; is_add = false; goto restart; } } else { cgroup_rm_file(cgrp, cft); } } return ret; } static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) { struct cgroup_subsys *ss = cfts[0].ss; struct cgroup *root = &ss->root->cgrp; struct cgroup_subsys_state *css; int ret = 0; lockdep_assert_held(&cgroup_mutex); /* add/rm files for all cgroups created before */ css_for_each_descendant_pre(css, cgroup_css(root, ss)) { struct cgroup *cgrp = css->cgroup; if (!(css->flags & CSS_VISIBLE)) continue; ret = cgroup_addrm_files(css, cgrp, cfts, is_add); if (ret) break; } if (is_add && !ret) kernfs_activate(root->kn); return ret; } static void cgroup_exit_cftypes(struct cftype *cfts) { struct cftype *cft; for (cft = cfts; cft->name[0] != '\0'; cft++) { /* free copy for custom atomic_write_len, see init_cftypes() */ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) kfree(cft->kf_ops); cft->kf_ops = NULL; cft->ss = NULL; /* revert flags set by cgroup core while adding @cfts */ cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL | __CFTYPE_ADDED); } } static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; int ret = 0; for (cft = cfts; cft->name[0] != '\0'; cft++) { struct kernfs_ops *kf_ops; WARN_ON(cft->ss || cft->kf_ops); if (cft->flags & __CFTYPE_ADDED) { ret = -EBUSY; break; } if (cft->seq_start) kf_ops = &cgroup_kf_ops; else kf_ops = &cgroup_kf_single_ops; /* * Ugh... if @cft wants a custom max_write_len, we need to * make a copy of kf_ops to set its atomic_write_len. */ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) { kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL); if (!kf_ops) { ret = -ENOMEM; break; } kf_ops->atomic_write_len = cft->max_write_len; } cft->kf_ops = kf_ops; cft->ss = ss; cft->flags |= __CFTYPE_ADDED; } if (ret) cgroup_exit_cftypes(cfts); return ret; } static void cgroup_rm_cftypes_locked(struct cftype *cfts) { lockdep_assert_held(&cgroup_mutex); list_del(&cfts->node); cgroup_apply_cftypes(cfts, false); cgroup_exit_cftypes(cfts); } /** * cgroup_rm_cftypes - remove an array of cftypes from a subsystem * @cfts: zero-length name terminated array of cftypes * * Unregister @cfts. Files described by @cfts are removed from all * existing cgroups and all future cgroups won't have them either. This * function can be called anytime whether @cfts' subsys is attached or not. * * Returns 0 on successful unregistration, -ENOENT if @cfts is not * registered. */ int cgroup_rm_cftypes(struct cftype *cfts) { if (!cfts || cfts[0].name[0] == '\0') return 0; if (!(cfts[0].flags & __CFTYPE_ADDED)) return -ENOENT; cgroup_lock(); cgroup_rm_cftypes_locked(cfts); cgroup_unlock(); return 0; } /** * cgroup_add_cftypes - add an array of cftypes to a subsystem * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * * Register @cfts to @ss. Files described by @cfts are created for all * existing cgroups to which @ss is attached and all future cgroups will * have them too. 
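 * A registration might look like this (illustrative sketch; the names are
 * made up):
 *
 *	static struct cftype my_files[] = {
 *		{ .name = "stat", .seq_show = my_stat_show, },
 *		{ },
 *	};
 *
 *	cgroup_add_cftypes(&my_subsys, my_files);
 *
 * (the zero-named entry terminates the array)
 *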
This function can be called anytime whether @ss is * attached or not. * * Returns 0 on successful registration, -errno on failure. Note that this * function currently returns 0 as long as @cfts registration is successful * even if some file creation attempts on existing cgroups fail. */ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { int ret; if (!cgroup_ssid_enabled(ss->id)) return 0; if (!cfts || cfts[0].name[0] == '\0') return 0; ret = cgroup_init_cftypes(ss, cfts); if (ret) return ret; cgroup_lock(); list_add_tail(&cfts->node, &ss->cfts); ret = cgroup_apply_cftypes(cfts, true); if (ret) cgroup_rm_cftypes_locked(cfts); cgroup_unlock(); return ret; } /** * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * * Similar to cgroup_add_cftypes() but the added files are only used for * the default hierarchy. */ int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; for (cft = cfts; cft && cft->name[0] != '\0'; cft++) cft->flags |= __CFTYPE_ONLY_ON_DFL; return cgroup_add_cftypes(ss, cfts); } /** * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * * Similar to cgroup_add_cftypes() but the added files are only used for * the legacy hierarchies. */ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; for (cft = cfts; cft && cft->name[0] != '\0'; cft++) cft->flags |= __CFTYPE_NOT_ON_DFL; return cgroup_add_cftypes(ss, cfts); } /** * cgroup_file_notify - generate a file modified event for a cgroup_file * @cfile: target cgroup_file * * @cfile must have been obtained by setting cftype->file_offset. */ void cgroup_file_notify(struct cgroup_file *cfile) { unsigned long flags, last, next; struct kernfs_node *kn = NULL; if (!READ_ONCE(cfile->kn)) return; last = READ_ONCE(cfile->notified_at); next = last + CGROUP_FILE_NOTIFY_MIN_INTV; if (time_in_range(jiffies, last, next)) { timer_reduce(&cfile->notify_timer, next); if (timer_pending(&cfile->notify_timer)) return; } spin_lock_irqsave(&cfile->lock, flags); if (cfile->kn) { kn = cfile->kn; kernfs_get(kn); WRITE_ONCE(cfile->notified_at, jiffies); } spin_unlock_irqrestore(&cfile->lock, flags); if (kn) { kernfs_notify(kn); kernfs_put(kn); } } EXPORT_SYMBOL_GPL(cgroup_file_notify); /** * cgroup_file_show - show or hide a hidden cgroup file * @cfile: target cgroup_file obtained by setting cftype->file_offset * @show: whether to show or hide */ void cgroup_file_show(struct cgroup_file *cfile, bool show) { struct kernfs_node *kn; spin_lock_irq(&cfile->lock); kn = cfile->kn; kernfs_get(kn); spin_unlock_irq(&cfile->lock); if (kn) kernfs_show(kn, show); kernfs_put(kn); } /** * css_next_child - find the next child of a given css * @pos: the current position (%NULL to initiate traversal) * @parent: css whose children to walk * * This function returns the next child of @parent and should be called * under either cgroup_mutex or RCU read lock. The only requirement is * that @parent and @pos are accessible. The next sibling is guaranteed to * be returned regardless of their states. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. 
* A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *parent) { struct cgroup_subsys_state *next; cgroup_assert_mutex_or_rcu_locked(); /* * @pos could already have been unlinked from the sibling list. * Once a cgroup is removed, its ->sibling.next is no longer * updated when its next sibling changes. CSS_RELEASED is set when * @pos is taken off list, at which time its next pointer is valid, * and, as releases are serialized, the one pointed to by the next * pointer is guaranteed to not have started release yet. This * implies that if we observe !CSS_RELEASED on @pos in this RCU * critical section, the one pointed to by its next pointer is * guaranteed to not have finished its RCU grace period even if we * have dropped rcu_read_lock() in-between iterations. * * If @pos has CSS_RELEASED set, its next pointer can't be * dereferenced; however, as each css is given a monotonically * increasing unique serial number and always appended to the * sibling list, the next one can be found by walking the parent's * children until the first css with higher serial number than * @pos's. While this path can be slower, it happens iff iteration * races against release and the race window is very small. */ if (!pos) { next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling); } else if (likely(!(pos->flags & CSS_RELEASED))) { next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling); } else { list_for_each_entry_rcu(next, &parent->children, sibling, lockdep_is_held(&cgroup_mutex)) if (next->serial_nr > pos->serial_nr) break; } /* * @next, if not pointing to the head, can be dereferenced and is * the next sibling. */ if (&next->sibling != &parent->children) return next; return NULL; } /** * css_next_descendant_pre - find the next descendant for pre-order walk * @pos: the current position (%NULL to initiate traversal) * @root: css whose descendants to walk * * To be used by css_for_each_descendant_pre(). Find the next descendant * to visit for pre-order traversal of @root's descendants. @root is * included in the iteration and the first node to be visited. * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical * section. Additionally, it isn't necessary to hold onto a reference to @pos. * This function will return the correct next descendant as long as both @pos * and @root are accessible and @pos is a descendant of @root. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. 
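 *
 * Most callers use the css_for_each_descendant_pre() wrapper rather than
 * calling this directly, e.g. (sketch):
 *
 *	rcu_read_lock();
 *	css_for_each_descendant_pre(pos, root_css)
 *		do_something(pos);
 *	rcu_read_unlock();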
*/ struct cgroup_subsys_state * css_next_descendant_pre(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *root) { struct cgroup_subsys_state *next; cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit @root */ if (!pos) return root; /* visit the first child if exists */ next = css_next_child(NULL, pos); if (next) return next; /* no child, visit my or the closest ancestor's next sibling */ while (pos != root) { next = css_next_child(pos, pos->parent); if (next) return next; pos = pos->parent; } return NULL; } EXPORT_SYMBOL_GPL(css_next_descendant_pre); /** * css_rightmost_descendant - return the rightmost descendant of a css * @pos: css of interest * * Return the rightmost descendant of @pos. If there's no descendant, @pos * is returned. This can be used during pre-order traversal to skip * subtree of @pos. * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical * section. Additionally, it isn't necessary to hold onto a reference to @pos. * This function will return the correct rightmost descendant as long as @pos * is accessible. */ struct cgroup_subsys_state * css_rightmost_descendant(struct cgroup_subsys_state *pos) { struct cgroup_subsys_state *last, *tmp; cgroup_assert_mutex_or_rcu_locked(); do { last = pos; /* ->prev isn't RCU safe, walk ->next till the end */ pos = NULL; css_for_each_child(tmp, last) pos = tmp; } while (pos); return last; } static struct cgroup_subsys_state * css_leftmost_descendant(struct cgroup_subsys_state *pos) { struct cgroup_subsys_state *last; do { last = pos; pos = css_next_child(NULL, pos); } while (pos); return last; } /** * css_next_descendant_post - find the next descendant for post-order walk * @pos: the current position (%NULL to initiate traversal) * @root: css whose descendants to walk * * To be used by css_for_each_descendant_post(). Find the next descendant * to visit for post-order traversal of @root's descendants. @root is * included in the iteration and the last node to be visited. * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical * section. Additionally, it isn't necessary to hold onto a reference to @pos. * This function will return the correct next descendant as long as both @pos * and @cgroup are accessible and @pos is a descendant of @cgroup. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. 
*/ struct cgroup_subsys_state * css_next_descendant_post(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *root) { struct cgroup_subsys_state *next; cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit leftmost descendant which may be @root */ if (!pos) return css_leftmost_descendant(root); /* if we visited @root, we're done */ if (pos == root) return NULL; /* if there's an unvisited sibling, visit its leftmost descendant */ next = css_next_child(pos, pos->parent); if (next) return css_leftmost_descendant(next); /* no sibling left, visit parent */ return pos->parent; } /** * css_has_online_children - does a css have online children * @css: the target css * * Returns %true if @css has any online children; otherwise, %false. This * function can be called from any context but the caller is responsible * for synchronizing against on/offlining as necessary. */ bool css_has_online_children(struct cgroup_subsys_state *css) { struct cgroup_subsys_state *child; bool ret = false; rcu_read_lock(); css_for_each_child(child, css) { if (css_is_online(child)) { ret = true; break; } } rcu_read_unlock(); return ret; } static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it) { struct list_head *l; struct cgrp_cset_link *link; struct css_set *cset; lockdep_assert_held(&css_set_lock); /* find the next threaded cset */ if (it->tcset_pos) { l = it->tcset_pos->next; if (l != it->tcset_head) { it->tcset_pos = l; return container_of(l, struct css_set, threaded_csets_node); } it->tcset_pos = NULL; } /* find the next cset */ l = it->cset_pos; l = l->next; if (l == it->cset_head) { it->cset_pos = NULL; return NULL; } if (it->ss) { cset = container_of(l, struct css_set, e_cset_node[it->ss->id]); } else { link = list_entry(l, struct cgrp_cset_link, cset_link); cset = link->cset; } it->cset_pos = l; /* initialize threaded css_set walking */ if (it->flags & CSS_TASK_ITER_THREADED) { if (it->cur_dcset) put_css_set_locked(it->cur_dcset); it->cur_dcset = cset; get_css_set(cset); it->tcset_head = &cset->threaded_csets; it->tcset_pos = &cset->threaded_csets; } return cset; } /** * css_task_iter_advance_css_set - advance a task iterator to the next css_set * @it: the iterator to advance * * Advance @it to the next css_set to walk. */ static void css_task_iter_advance_css_set(struct css_task_iter *it) { struct css_set *cset; lockdep_assert_held(&css_set_lock); /* Advance to the next non-empty css_set and find first non-empty tasks list*/ while ((cset = css_task_iter_next_css_set(it))) { if (!list_empty(&cset->tasks)) { it->cur_tasks_head = &cset->tasks; break; } else if (!list_empty(&cset->mg_tasks)) { it->cur_tasks_head = &cset->mg_tasks; break; } else if (!list_empty(&cset->dying_tasks)) { it->cur_tasks_head = &cset->dying_tasks; break; } } if (!cset) { it->task_pos = NULL; return; } it->task_pos = it->cur_tasks_head->next; /* * We don't keep css_sets locked across iteration steps and thus * need to take steps to ensure that iteration can be resumed after * the lock is re-acquired. Iteration is performed at two levels - * css_sets and tasks in them. * * Once created, a css_set never leaves its cgroup lists, so a * pinned css_set is guaranteed to stay put and we can resume * iteration afterwards. * * Tasks may leave @cset across iteration steps. This is resolved * by registering each iterator with the css_set currently being * walked and making css_set_move_task() advance iterators whose * next task is leaving. 
*/ if (it->cur_cset) { list_del(&it->iters_node); put_css_set_locked(it->cur_cset); } get_css_set(cset); it->cur_cset = cset; list_add(&it->iters_node, &cset->task_iters); } static void css_task_iter_skip(struct css_task_iter *it, struct task_struct *task) { lockdep_assert_held(&css_set_lock); if (it->task_pos == &task->cg_list) { it->task_pos = it->task_pos->next; it->flags |= CSS_TASK_ITER_SKIPPED; } } static void css_task_iter_advance(struct css_task_iter *it) { struct task_struct *task; lockdep_assert_held(&css_set_lock); repeat: if (it->task_pos) { /* * Advance iterator to find next entry. We go through cset * tasks, mg_tasks and dying_tasks, when consumed we move onto * the next cset. */ if (it->flags & CSS_TASK_ITER_SKIPPED) it->flags &= ~CSS_TASK_ITER_SKIPPED; else it->task_pos = it->task_pos->next; if (it->task_pos == &it->cur_cset->tasks) { it->cur_tasks_head = &it->cur_cset->mg_tasks; it->task_pos = it->cur_tasks_head->next; } if (it->task_pos == &it->cur_cset->mg_tasks) { it->cur_tasks_head = &it->cur_cset->dying_tasks; it->task_pos = it->cur_tasks_head->next; } if (it->task_pos == &it->cur_cset->dying_tasks) css_task_iter_advance_css_set(it); } else { /* called from start, proceed to the first cset */ css_task_iter_advance_css_set(it); } if (!it->task_pos) return; task = list_entry(it->task_pos, struct task_struct, cg_list); /* * Hide tasks that are exiting but not yet removed. Keep zombie * leaders with live threads visible. */ if ((task->flags & PF_EXITING) && !atomic_read(&task->signal->live)) goto repeat; if (it->flags & CSS_TASK_ITER_PROCS) { /* if PROCS, skip over tasks which aren't group leaders */ if (!thread_group_leader(task)) goto repeat; /* and dying leaders w/o live member threads */ if (it->cur_tasks_head == &it->cur_cset->dying_tasks && !atomic_read(&task->signal->live)) goto repeat; } else { /* skip all dying ones */ if (it->cur_tasks_head == &it->cur_cset->dying_tasks) goto repeat; } } /** * css_task_iter_start - initiate task iteration * @css: the css to walk tasks of * @flags: CSS_TASK_ITER_* flags * @it: the task iterator to use * * Initiate iteration through the tasks of @css. The caller can call * css_task_iter_next() to walk through the tasks until the function * returns NULL. On completion of iteration, css_task_iter_end() must be * called. */ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it) { unsigned long irqflags; memset(it, 0, sizeof(*it)); spin_lock_irqsave(&css_set_lock, irqflags); it->ss = css->ss; it->flags = flags; if (CGROUP_HAS_SUBSYS_CONFIG && it->ss) it->cset_pos = &css->cgroup->e_csets[css->ss->id]; else it->cset_pos = &css->cgroup->cset_links; it->cset_head = it->cset_pos; css_task_iter_advance(it); spin_unlock_irqrestore(&css_set_lock, irqflags); } /** * css_task_iter_next - return the next task for the iterator * @it: the task iterator being iterated * * The "next" function for task iteration. @it should have been * initialized via css_task_iter_start(). Returns NULL when the iteration * reaches the end. 
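 *
 * Editorial sketch (not part of the original comment): the usual calling
 * pattern, paired with css_task_iter_start() and css_task_iter_end();
 * do_something() stands in for the caller's per-task work:
 *
 *	struct css_task_iter it;
 *	struct task_struct *task;
 *
 *	css_task_iter_start(css, CSS_TASK_ITER_PROCS, &it);
 *	while ((task = css_task_iter_next(&it)))
 *		do_something(task);
 *	css_task_iter_end(&it);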
*/ struct task_struct *css_task_iter_next(struct css_task_iter *it) { unsigned long irqflags; if (it->cur_task) { put_task_struct(it->cur_task); it->cur_task = NULL; } spin_lock_irqsave(&css_set_lock, irqflags); /* @it may be half-advanced by skips, finish advancing */ if (it->flags & CSS_TASK_ITER_SKIPPED) css_task_iter_advance(it); if (it->task_pos) { it->cur_task = list_entry(it->task_pos, struct task_struct, cg_list); get_task_struct(it->cur_task); css_task_iter_advance(it); } spin_unlock_irqrestore(&css_set_lock, irqflags); return it->cur_task; } /** * css_task_iter_end - finish task iteration * @it: the task iterator to finish * * Finish task iteration started by css_task_iter_start(). */ void css_task_iter_end(struct css_task_iter *it) { unsigned long irqflags; if (it->cur_cset) { spin_lock_irqsave(&css_set_lock, irqflags); list_del(&it->iters_node); put_css_set_locked(it->cur_cset); spin_unlock_irqrestore(&css_set_lock, irqflags); } if (it->cur_dcset) put_css_set(it->cur_dcset); if (it->cur_task) put_task_struct(it->cur_task); } static void cgroup_procs_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; if (ctx->procs.started) css_task_iter_end(&ctx->procs.iter); } static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; struct cgroup_file_ctx *ctx = of->priv; if (pos) (*pos)++; return css_task_iter_next(&ctx->procs.iter); } static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, unsigned int iter_flags) { struct kernfs_open_file *of = s->private; struct cgroup *cgrp = seq_css(s)->cgroup; struct cgroup_file_ctx *ctx = of->priv; struct css_task_iter *it = &ctx->procs.iter; /* * When a seq_file is seeked, it's always traversed sequentially * from position 0, so we can simply keep iterating on !0 *pos. */ if (!ctx->procs.started) { if (WARN_ON_ONCE((*pos))) return ERR_PTR(-EINVAL); css_task_iter_start(&cgrp->self, iter_flags, it); ctx->procs.started = true; } else if (!(*pos)) { css_task_iter_end(it); css_task_iter_start(&cgrp->self, iter_flags, it); } else return it->cur_task; return cgroup_procs_next(s, NULL, NULL); } static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) { struct cgroup *cgrp = seq_css(s)->cgroup; /* * All processes of a threaded subtree belong to the domain cgroup * of the subtree. Only threads can be distributed across the * subtree. Reject reads on cgroup.procs in the subtree proper. * They're always empty anyway. 
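	 *
	 * (By contrast, cgroup.threads remains readable inside the threaded
	 * subtree; cgroup_threads_start() below starts the same iterator
	 * without the PROCS/THREADED flags.)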
*/ if (cgroup_is_threaded(cgrp)) return ERR_PTR(-EOPNOTSUPP); return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED); } static int cgroup_procs_show(struct seq_file *s, void *v) { seq_printf(s, "%d\n", task_pid_vnr(v)); return 0; } static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb) { int ret; struct inode *inode; lockdep_assert_held(&cgroup_mutex); inode = kernfs_get_inode(sb, cgrp->procs_file.kn); if (!inode) return -ENOMEM; ret = inode_permission(&nop_mnt_idmap, inode, MAY_WRITE); iput(inode); return ret; } static int cgroup_procs_write_permission(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, struct super_block *sb, struct cgroup_namespace *ns) { struct cgroup *com_cgrp = src_cgrp; int ret; lockdep_assert_held(&cgroup_mutex); /* find the common ancestor */ while (!cgroup_is_descendant(dst_cgrp, com_cgrp)) com_cgrp = cgroup_parent(com_cgrp); /* %current should be authorized to migrate to the common ancestor */ ret = cgroup_may_write(com_cgrp, sb); if (ret) return ret; /* * If namespaces are delegation boundaries, %current must be able * to see both source and destination cgroups from its namespace. */ if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) && (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) || !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp))) return -ENOENT; return 0; } static int cgroup_attach_permissions(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, struct super_block *sb, bool threadgroup, struct cgroup_namespace *ns) { int ret = 0; ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns); if (ret) return ret; ret = cgroup_migrate_vet_dst(dst_cgrp); if (ret) return ret; if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)) ret = -EOPNOTSUPP; return ret; } static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, bool threadgroup) { struct cgroup_file_ctx *ctx = of->priv; struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; ssize_t ret; enum cgroup_attach_lock_mode lock_mode; dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; task = cgroup_procs_write_start(buf, threadgroup, &lock_mode); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; /* find the source cgroup */ spin_lock_irq(&css_set_lock); src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); spin_unlock_irq(&css_set_lock); /* * Process and thread migrations follow same delegation rule. Check * permissions using the credentials from file open to protect against * inherited fd attacks. 
*/ scoped_with_creds(of->file->f_cred) ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, of->file->f_path.dentry->d_sb, threadgroup, ctx->ns); if (ret) goto out_finish; ret = cgroup_attach_task(dst_cgrp, task, threadgroup); out_finish: cgroup_procs_write_finish(task, lock_mode); out_unlock: cgroup_kn_unlock(of->kn); return ret; } static ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return __cgroup_procs_write(of, buf, true) ?: nbytes; } static void *cgroup_threads_start(struct seq_file *s, loff_t *pos) { return __cgroup_procs_start(s, pos, 0); } static ssize_t cgroup_threads_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return __cgroup_procs_write(of, buf, false) ?: nbytes; } /* cgroup core interface files for the default hierarchy */ static struct cftype cgroup_base_files[] = { { .name = "cgroup.type", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_type_show, .write = cgroup_type_write, }, { .name = "cgroup.procs", .flags = CFTYPE_NS_DELEGATABLE, .file_offset = offsetof(struct cgroup, procs_file), .release = cgroup_procs_release, .seq_start = cgroup_procs_start, .seq_next = cgroup_procs_next, .seq_show = cgroup_procs_show, .write = cgroup_procs_write, }, { .name = "cgroup.threads", .flags = CFTYPE_NS_DELEGATABLE, .release = cgroup_procs_release, .seq_start = cgroup_threads_start, .seq_next = cgroup_procs_next, .seq_show = cgroup_procs_show, .write = cgroup_threads_write, }, { .name = "cgroup.controllers", .seq_show = cgroup_controllers_show, }, { .name = "cgroup.subtree_control", .flags = CFTYPE_NS_DELEGATABLE, .seq_show = cgroup_subtree_control_show, .write = cgroup_subtree_control_write, }, { .name = "cgroup.events", .flags = CFTYPE_NOT_ON_ROOT, .file_offset = offsetof(struct cgroup, events_file), .seq_show = cgroup_events_show, }, { .name = "cgroup.max.descendants", .seq_show = cgroup_max_descendants_show, .write = cgroup_max_descendants_write, }, { .name = "cgroup.max.depth", .seq_show = cgroup_max_depth_show, .write = cgroup_max_depth_write, }, { .name = "cgroup.stat", .seq_show = cgroup_stat_show, }, { .name = "cgroup.stat.local", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_core_local_stat_show, }, { .name = "cgroup.freeze", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_freeze_show, .write = cgroup_freeze_write, }, { .name = "cgroup.kill", .flags = CFTYPE_NOT_ON_ROOT, .write = cgroup_kill_write, }, { .name = "cpu.stat", .seq_show = cpu_stat_show, }, { .name = "cpu.stat.local", .seq_show = cpu_local_stat_show, }, { } /* terminate */ }; static struct cftype cgroup_psi_files[] = { #ifdef CONFIG_PSI { .name = "io.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]), .seq_show = cgroup_io_pressure_show, .write = cgroup_io_pressure_write, .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, { .name = "memory.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]), .seq_show = cgroup_memory_pressure_show, .write = cgroup_memory_pressure_write, .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, { .name = "cpu.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]), .seq_show = cgroup_cpu_pressure_show, .write = cgroup_cpu_pressure_write, .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, #ifdef CONFIG_IRQ_TIME_ACCOUNTING { .name = "irq.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]), .seq_show = cgroup_irq_pressure_show, .write = cgroup_irq_pressure_write, .poll = cgroup_pressure_poll, 
.release = cgroup_pressure_release, }, #endif { .name = "cgroup.pressure", .seq_show = cgroup_pressure_show, .write = cgroup_pressure_write, }, #endif /* CONFIG_PSI */ { } /* terminate */ }; /* * css destruction is four-stage process. * * 1. Destruction starts. Killing of the percpu_ref is initiated. * Implemented in kill_css(). * * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs * and thus css_tryget_online() is guaranteed to fail, the css can be * offlined by invoking offline_css(). After offlining, the base ref is * put. Implemented in css_killed_work_fn(). * * 3. When the percpu_ref reaches zero, the only possible remaining * accessors are inside RCU read sections. css_release() schedules the * RCU callback. * * 4. After the grace period, the css can be freed. Implemented in * css_free_rwork_fn(). * * It is actually hairier because both step 2 and 4 require process context * and thus involve punting to css->destroy_work adding two additional * steps to the already complex sequence. */ static void css_free_rwork_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(to_rcu_work(work), struct cgroup_subsys_state, destroy_rwork); struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; percpu_ref_exit(&css->refcnt); css_rstat_exit(css); if (!css_is_self(css)) { /* css free path */ struct cgroup_subsys_state *parent = css->parent; int id = css->id; ss->css_free(css); cgroup_idr_remove(&ss->css_idr, id); cgroup_put(cgrp); if (parent) css_put(parent); } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); if (!cgroup_on_dfl(cgrp)) cgroup1_pidlist_destroy_all(cgrp); cancel_work_sync(&cgrp->release_agent_work); bpf_cgrp_storage_free(cgrp); if (cgroup_parent(cgrp)) { /* * We get a ref to the parent, and put the ref when * this cgroup is being freed, so it's guaranteed * that the parent won't be destroyed before its * children. */ cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); psi_cgroup_free(cgrp); kfree(cgrp); } else { /* * This is root cgroup's refcnt reaching zero, * which indicates that the root should be * released. */ cgroup_destroy_root(cgrp->root); } } } static void css_release_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; cgroup_lock(); css->flags |= CSS_RELEASED; list_del_rcu(&css->sibling); if (!css_is_self(css)) { struct cgroup *parent_cgrp; css_rstat_flush(css); cgroup_idr_replace(&ss->css_idr, NULL, css->id); if (ss->css_released) ss->css_released(css); cgrp->nr_dying_subsys[ss->id]--; /* * When a css is released and ready to be freed, its * nr_descendants must be zero. However, the corresponding * cgrp->nr_dying_subsys[ss->id] may not be 0 if a subsystem * is activated and deactivated multiple times with one or * more of its previous activation leaving behind dying csses. 
*/ WARN_ON_ONCE(css->nr_descendants); parent_cgrp = cgroup_parent(cgrp); while (parent_cgrp) { parent_cgrp->nr_dying_subsys[ss->id]--; parent_cgrp = cgroup_parent(parent_cgrp); } } else { struct cgroup *tcgrp; /* cgroup release path */ TRACE_CGROUP_PATH(release, cgrp); css_rstat_flush(&cgrp->self); spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) tcgrp->nr_dying_descendants--; spin_unlock_irq(&css_set_lock); /* * There are two control paths which try to determine * cgroup from dentry without going through kernfs - * cgroupstats_build() and css_tryget_online_from_dir(). * Those are supported by RCU protecting clearing of * cgrp->kn->priv backpointer. */ if (cgrp->kn) RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); } cgroup_unlock(); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); } static void css_release(struct percpu_ref *ref) { struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); INIT_WORK(&css->destroy_work, css_release_work_fn); queue_work(cgroup_release_wq, &css->destroy_work); } static void init_and_link_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, struct cgroup *cgrp) { lockdep_assert_held(&cgroup_mutex); cgroup_get_live(cgrp); memset(css, 0, sizeof(*css)); css->cgroup = cgrp; css->ss = ss; css->id = -1; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); css->serial_nr = css_serial_nr_next++; atomic_set(&css->online_cnt, 0); if (cgroup_parent(cgrp)) { css->parent = cgroup_css(cgroup_parent(cgrp), ss); css_get(css->parent); } BUG_ON(cgroup_css(cgrp, ss)); } /* invoke ->css_online() on a new CSS and mark it online if successful */ static int online_css(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; int ret = 0; lockdep_assert_held(&cgroup_mutex); if (ss->css_online) ret = ss->css_online(css); if (!ret) { css->flags |= CSS_ONLINE; rcu_assign_pointer(css->cgroup->subsys[ss->id], css); atomic_inc(&css->online_cnt); if (css->parent) { atomic_inc(&css->parent->online_cnt); while ((css = css->parent)) css->nr_descendants++; } } return ret; } /* if the CSS is online, invoke ->css_offline() on it and mark it offline */ static void offline_css(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; lockdep_assert_held(&cgroup_mutex); if (!css_is_online(css)) return; if (ss->css_offline) ss->css_offline(css); css->flags &= ~CSS_ONLINE; RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); wake_up_all(&css->cgroup->offline_waitq); css->cgroup->nr_dying_subsys[ss->id]++; /* * Parent css and cgroup cannot be freed until after the freeing * of child css, see css_free_rwork_fn(). */ while ((css = css->parent)) { css->nr_descendants--; css->cgroup->nr_dying_subsys[ss->id]++; } } /** * css_create - create a cgroup_subsys_state * @cgrp: the cgroup new css will be associated with * @ss: the subsys of new css * * Create a new css associated with @cgrp - @ss pair. On success, the new * css is online and installed in @cgrp. This function doesn't create the * interface files. Returns 0 on success, -errno on failure. 
*/ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); struct cgroup_subsys_state *css; int err; lockdep_assert_held(&cgroup_mutex); css = ss->css_alloc(parent_css); if (!css) css = ERR_PTR(-ENOMEM); if (IS_ERR(css)) return css; init_and_link_css(css, ss, cgrp); err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL); if (err) goto err_free_css; err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL); if (err < 0) goto err_free_css; css->id = err; err = css_rstat_init(css); if (err) goto err_free_css; /* @css is ready to be brought online now, make it visible */ list_add_tail_rcu(&css->sibling, &parent_css->children); cgroup_idr_replace(&ss->css_idr, css, css->id); err = online_css(css); if (err) goto err_list_del; return css; err_list_del: list_del_rcu(&css->sibling); err_free_css: INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); return ERR_PTR(err); } /* * The returned cgroup is fully initialized including its control mask, but * it doesn't have the control mask applied. */ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, umode_t mode) { struct cgroup_root *root = parent->root; struct cgroup *cgrp, *tcgrp; struct kernfs_node *kn; int i, level = parent->level + 1; int ret; /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc_flex(*cgrp, _low_ancestors, level); if (!cgrp) return ERR_PTR(-ENOMEM); ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) goto out_free_cgrp; /* create the directory */ kn = kernfs_create_dir_ns(parent->kn, name, mode, current_fsuid(), current_fsgid(), cgrp, NULL); if (IS_ERR(kn)) { ret = PTR_ERR(kn); goto out_cancel_ref; } cgrp->kn = kn; init_cgroup_housekeeping(cgrp); cgrp->self.parent = &parent->self; cgrp->root = root; cgrp->level = level; /* * Now that init_cgroup_housekeeping() has been called and cgrp->self * is setup, it is safe to perform rstat initialization on it. */ ret = css_rstat_init(&cgrp->self); if (ret) goto out_kernfs_remove; ret = psi_cgroup_alloc(cgrp); if (ret) goto out_stat_exit; for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) cgrp->ancestors[tcgrp->level] = tcgrp; /* * New cgroup inherits effective freeze counter, and * if the parent has to be frozen, the child has too. */ cgrp->freezer.e_freeze = parent->freezer.e_freeze; seqcount_spinlock_init(&cgrp->freezer.freeze_seq, &css_set_lock); if (cgrp->freezer.e_freeze) { /* * Set the CGRP_FREEZE flag, so when a process will be * attached to the child cgroup, it will become frozen. * At this point the new cgroup is unpopulated, so we can * consider it frozen immediately. 
*/ set_bit(CGRP_FREEZE, &cgrp->flags); cgrp->freezer.freeze_start_nsec = ktime_get_ns(); set_bit(CGRP_FROZEN, &cgrp->flags); } if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); cgrp->self.serial_nr = css_serial_nr_next++; ret = blocking_notifier_call_chain_robust(&cgroup_lifetime_notifier, CGROUP_LIFETIME_ONLINE, CGROUP_LIFETIME_OFFLINE, cgrp); ret = notifier_to_errno(ret); if (ret) goto out_psi_free; /* allocation complete, commit to creation */ spin_lock_irq(&css_set_lock); for (i = 0; i < level; i++) { tcgrp = cgrp->ancestors[i]; tcgrp->nr_descendants++; /* * If the new cgroup is frozen, all ancestor cgroups get a new * frozen descendant, but their state can't change because of * this. */ if (cgrp->freezer.e_freeze) tcgrp->freezer.nr_frozen_descendants++; } spin_unlock_irq(&css_set_lock); list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); atomic_inc(&root->nr_cgrps); cgroup_get_live(parent); /* * On the default hierarchy, a child doesn't automatically inherit * subtree_control from the parent. Each is configured manually. */ if (!cgroup_on_dfl(cgrp)) cgrp->subtree_control = cgroup_control(cgrp); cgroup_propagate_control(cgrp); return cgrp; out_psi_free: psi_cgroup_free(cgrp); out_stat_exit: css_rstat_exit(&cgrp->self); out_kernfs_remove: kernfs_remove(cgrp->kn); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: kfree(cgrp); return ERR_PTR(ret); } static bool cgroup_check_hierarchy_limits(struct cgroup *parent) { struct cgroup *cgroup; int ret = false; int level = 0; lockdep_assert_held(&cgroup_mutex); for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) { if (cgroup->nr_descendants >= cgroup->max_descendants) goto fail; if (level >= cgroup->max_depth) goto fail; level++; } ret = true; fail: return ret; } int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { struct cgroup *parent, *cgrp; int ret; /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ if (strchr(name, '\n')) return -EINVAL; parent = cgroup_kn_lock_live(parent_kn, false); if (!parent) return -ENODEV; if (!cgroup_check_hierarchy_limits(parent)) { ret = -EAGAIN; goto out_unlock; } cgrp = cgroup_create(parent, name, mode); if (IS_ERR(cgrp)) { ret = PTR_ERR(cgrp); goto out_unlock; } /* * This extra ref will be put in css_free_rwork_fn() and guarantees * that @cgrp->kn is always accessible. */ kernfs_get(cgrp->kn); ret = css_populate_dir(&cgrp->self); if (ret) goto out_destroy; ret = cgroup_apply_control_enable(cgrp); if (ret) goto out_destroy; TRACE_CGROUP_PATH(mkdir, cgrp); /* let's create and online css's */ kernfs_activate(cgrp->kn); ret = 0; goto out_unlock; out_destroy: cgroup_destroy_locked(cgrp); out_unlock: cgroup_kn_unlock(parent_kn); return ret; } /* * This is called when the refcnt of a css is confirmed to be killed. * css_tryget_online() is now guaranteed to fail. Tell the subsystem to * initiate destruction and put the css ref from kill_css(). 
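 *
 * Note: the work function also walks up the parent chain; an ancestor css
 * that was itself killed and whose ->online_cnt drops to zero with this
 * child's offlining is offlined in the same pass.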
*/ static void css_killed_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css; css = container_of(to_rcu_work(work), struct cgroup_subsys_state, destroy_rwork); cgroup_lock(); do { offline_css(css); css_put(css); /* @css can't go away while we're holding cgroup_mutex */ css = css->parent; } while (css && atomic_dec_and_test(&css->online_cnt)); cgroup_unlock(); } /* css kill confirmation processing requires process context, bounce */ static void css_killed_ref_fn(struct percpu_ref *ref) { struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); if (atomic_dec_and_test(&css->online_cnt)) { INIT_RCU_WORK(&css->destroy_rwork, css_killed_work_fn); queue_rcu_work(cgroup_offline_wq, &css->destroy_rwork); } } /** * kill_css - destroy a css * @css: css to destroy * * This function initiates destruction of @css by removing cgroup interface * files and putting its base reference. ->css_offline() will be invoked * asynchronously once css_tryget_online() is guaranteed to fail and when * the reference count reaches zero, @css will be released. */ static void kill_css(struct cgroup_subsys_state *css) { lockdep_assert_held(&cgroup_mutex); if (css->flags & CSS_DYING) return; /* * Call css_killed(), if defined, before setting the CSS_DYING flag */ if (css->ss->css_killed) css->ss->css_killed(css); css->flags |= CSS_DYING; /* * This must happen before css is disassociated with its cgroup. * See seq_css() for details. */ css_clear_dir(css); /* * Killing would put the base ref, but we need to keep it alive * until after ->css_offline(). */ css_get(css); /* * cgroup core guarantees that, by the time ->css_offline() is * invoked, no new css reference will be given out via * css_tryget_online(). We can't simply call percpu_ref_kill() and * proceed to offlining css's because percpu_ref_kill() doesn't * guarantee that the ref is seen as killed on all CPUs on return. * * Use percpu_ref_kill_and_confirm() to get notifications as each * css is confirmed to be seen as killed on all CPUs. */ percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); } /** * cgroup_destroy_locked - the first stage of cgroup destruction * @cgrp: cgroup to be destroyed * * css's make use of percpu refcnts whose killing latency shouldn't be * exposed to userland and are RCU protected. Also, cgroup core needs to * guarantee that css_tryget_online() won't succeed by the time * ->css_offline() is invoked. To satisfy all the requirements, * destruction is implemented in the following two steps. * * s1. Verify @cgrp can be destroyed and mark it dying. Remove all * userland visible parts and start killing the percpu refcnts of * css's. Set up so that the next stage will be kicked off once all * the percpu refcnts are confirmed to be killed. * * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the * rest of destruction. Once all cgroup references are gone, the * cgroup is RCU-freed. * * This function implements s1. After this step, @cgrp is gone as far as * the userland is concerned and a new cgroup with the same name may be * created. As cgroup doesn't care about the names internally, this * doesn't cause any problem. */ static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct cgroup *tcgrp, *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; struct cgrp_cset_link *link; int ssid, ret; lockdep_assert_held(&cgroup_mutex); /* * Only migration can raise populated from zero and we're already * holding cgroup_mutex. 
*/ if (cgroup_is_populated(cgrp)) return -EBUSY; /* * Make sure there's no live children. We can't test emptiness of * ->self.children as dead children linger on it while being * drained; otherwise, "rmdir parent/child parent" may fail. */ if (css_has_online_children(&cgrp->self)) return -EBUSY; /* * Mark @cgrp and the associated csets dead. The former prevents * further task migration and child creation by disabling * cgroup_kn_lock_live(). The latter makes the csets ignored by * the migration path. */ cgrp->self.flags &= ~CSS_ONLINE; spin_lock_irq(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) link->cset->dead = true; spin_unlock_irq(&css_set_lock); /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) kill_css(css); /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */ css_clear_dir(&cgrp->self); kernfs_remove(cgrp->kn); if (cgroup_is_threaded(cgrp)) parent->nr_threaded_children--; spin_lock_irq(&css_set_lock); for (tcgrp = parent; tcgrp; tcgrp = cgroup_parent(tcgrp)) { tcgrp->nr_descendants--; tcgrp->nr_dying_descendants++; /* * If the dying cgroup is frozen, decrease frozen descendants * counters of ancestor cgroups. */ if (test_bit(CGRP_FROZEN, &cgrp->flags)) tcgrp->freezer.nr_frozen_descendants--; } spin_unlock_irq(&css_set_lock); cgroup1_check_for_release(parent); ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier, CGROUP_LIFETIME_OFFLINE, cgrp); WARN_ON_ONCE(notifier_to_errno(ret)); /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); return 0; }; /** * cgroup_drain_dying - wait for dying tasks to leave before rmdir * @cgrp: the cgroup being removed * * cgroup.procs and cgroup.threads use css_task_iter which filters out * PF_EXITING tasks so that userspace doesn't see tasks that have already been * reaped via waitpid(). However, cgroup_has_tasks() - which tests whether the * cgroup has non-empty css_sets - is only updated when dying tasks pass through * cgroup_task_dead() in finish_task_switch(). This creates a window where * cgroup.procs reads empty but cgroup_has_tasks() is still true, making rmdir * fail with -EBUSY from cgroup_destroy_locked() even though userspace sees no * tasks. * * This function aligns cgroup_has_tasks() with what userspace can observe. If * cgroup_has_tasks() but the task iterator sees nothing (all remaining tasks are * PF_EXITING), we wait for cgroup_task_dead() to finish processing them. As the * window between PF_EXITING and cgroup_task_dead() is short, the wait is brief. * * This function only concerns itself with this cgroup's own dying tasks. * Whether the cgroup has children is cgroup_destroy_locked()'s problem. * * Each cgroup_task_dead() kicks the waitqueue via cset->cgrp_links, and we * retry the full check from scratch. * * Must be called with cgroup_mutex held. */ static int cgroup_drain_dying(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct css_task_iter it; struct task_struct *task; DEFINE_WAIT(wait); lockdep_assert_held(&cgroup_mutex); retry: if (!cgroup_has_tasks(cgrp)) return 0; /* Same iterator as cgroup.threads - if any task is visible, it's busy */ css_task_iter_start(&cgrp->self, 0, &it); task = css_task_iter_next(&it); css_task_iter_end(&it); if (task) return -EBUSY; /* * All remaining tasks are PF_EXITING and will pass through * cgroup_task_dead() shortly. Wait for a kick and retry. 
* * cgroup_has_tasks() can't transition from false to true while we're * holding cgroup_mutex, but the true to false transition happens * under css_set_lock (via cgroup_task_dead()). We must retest and * prepare_to_wait() under css_set_lock. Otherwise, the transition * can happen between our first test and prepare_to_wait(), and we * sleep with no one to wake us. */ spin_lock_irq(&css_set_lock); if (!cgroup_has_tasks(cgrp)) { spin_unlock_irq(&css_set_lock); return 0; } prepare_to_wait(&cgrp->dying_populated_waitq, &wait, TASK_UNINTERRUPTIBLE); spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); schedule(); finish_wait(&cgrp->dying_populated_waitq, &wait); mutex_lock(&cgroup_mutex); goto retry; } int cgroup_rmdir(struct kernfs_node *kn) { struct cgroup *cgrp; int ret = 0; cgrp = cgroup_kn_lock_live(kn, false); if (!cgrp) return 0; ret = cgroup_drain_dying(cgrp); if (!ret) { ret = cgroup_destroy_locked(cgrp); if (!ret) TRACE_CGROUP_PATH(rmdir, cgrp); } cgroup_kn_unlock(kn); return ret; } static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .show_options = cgroup_show_options, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .show_path = cgroup_show_path, }; static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) { struct cgroup_subsys_state *css; pr_debug("Initializing cgroup subsys %s\n", ss->name); cgroup_lock(); idr_init(&ss->css_idr); INIT_LIST_HEAD(&ss->cfts); /* Create the root cgroup state for this subsystem */ ss->root = &cgrp_dfl_root; css = ss->css_alloc(NULL); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); /* * Root csses are never destroyed and we can't initialize * percpu_ref during early init. Disable refcnting. */ css->flags |= CSS_NO_REF; if (early) { /* allocation can't be done safely during early init */ css->id = 1; } else { css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); BUG_ON(ss_rstat_init(ss)); BUG_ON(css_rstat_init(css)); } /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is * newly registered, all tasks and hence the * init_css_set is in the subsystem's root cgroup. */ init_css_set.subsys[ss->id] = css; have_fork_callback |= (bool)ss->fork << ss->id; have_exit_callback |= (bool)ss->exit << ss->id; have_release_callback |= (bool)ss->release << ss->id; have_canfork_callback |= (bool)ss->can_fork << ss->id; /* At system boot, before all subsystems have been * registered, no tasks have been forked, so we don't * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); BUG_ON(online_css(css)); cgroup_unlock(); } /** * cgroup_init_early - cgroup initialization at system boot * * Initialize cgroups at system boot, and initialize any * subsystems that request early init. 
*/ int __init cgroup_init_early(void) { static struct cgroup_fs_context __initdata ctx; struct cgroup_subsys *ss; int i; ctx.root = &cgrp_dfl_root; init_cgroup_root(&ctx); cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF; RCU_INIT_POINTER(init_task.cgroups, &init_css_set); for_each_subsys(ss, i) { WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id, "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n", i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free, ss->id, ss->name); WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]); WARN(ss->early_init && ss->css_rstat_flush, "cgroup rstat cannot be used with early init subsystem\n"); ss->id = i; ss->name = cgroup_subsys_name[i]; if (!ss->legacy_name) ss->legacy_name = cgroup_subsys_name[i]; if (ss->early_init) cgroup_init_subsys(ss, true); } return 0; } /** * cgroup_init - cgroup initialization * * Register cgroup filesystem and /proc file, and initialize * any subsystems that didn't request early init. */ int __init cgroup_init(void) { struct cgroup_subsys *ss; int ssid; BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 32); BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); BUG_ON(ss_rstat_init(NULL)); get_user_ns(init_cgroup_ns.user_ns); cgroup_rt_init(); cgroup_lock(); /* * Add init_css_set to the hash table so that dfl_root can link to * it during init. */ hash_add(css_set_table, &init_css_set.hlist, css_set_hash(init_css_set.subsys)); cgroup_bpf_lifetime_notifier_init(); BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); cgroup_unlock(); for_each_subsys(ss, ssid) { if (ss->early_init) { struct cgroup_subsys_state *css = init_css_set.subsys[ss->id]; css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); } else { cgroup_init_subsys(ss, false); } list_add_tail(&init_css_set.e_cset_node[ssid], &cgrp_dfl_root.cgrp.e_csets[ssid]); /* * Setting dfl_root subsys_mask needs to consider the * disabled flag and cftype registration needs kmalloc, * both of which aren't available during early_init. 
*/ if (!cgroup_ssid_enabled(ssid)) continue; if (cgroup1_ssid_disabled(ssid)) pr_info("Disabling %s control group subsystem in v1 mounts\n", ss->legacy_name); cgrp_dfl_root.subsys_mask |= 1 << ss->id; /* implicit controllers must be threaded too */ WARN_ON(ss->implicit_on_dfl && !ss->threaded); if (ss->implicit_on_dfl) cgrp_dfl_implicit_ss_mask |= 1 << ss->id; else if (!ss->dfl_cftypes) cgrp_dfl_inhibit_ss_mask |= 1 << ss->id; if (ss->threaded) cgrp_dfl_threaded_ss_mask |= 1 << ss->id; if (ss->dfl_cftypes == ss->legacy_cftypes) { WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); } else { WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes)); } if (ss->bind) ss->bind(init_css_set.subsys[ssid]); cgroup_lock(); css_populate_dir(init_css_set.subsys[ssid]); cgroup_unlock(); } /* init_css_set.subsys[] has been updated, re-hash */ hash_del(&init_css_set.hlist); hash_add(css_set_table, &init_css_set.hlist, css_set_hash(init_css_set.subsys)); WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); WARN_ON(register_filesystem(&cgroup_fs_type)); WARN_ON(register_filesystem(&cgroup2_fs_type)); WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show)); #ifdef CONFIG_CPUSETS_V1 WARN_ON(register_filesystem(&cpuset_fs_type)); #endif ns_tree_add(&init_cgroup_ns); return 0; } static int __init cgroup_wq_init(void) { /* * There isn't much point in executing destruction path in * parallel. Good chunk is serialized with cgroup_mutex anyway. * Use 1 for @max_active. * * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after. */ cgroup_offline_wq = alloc_workqueue("cgroup_offline", WQ_PERCPU, 1); BUG_ON(!cgroup_offline_wq); cgroup_release_wq = alloc_workqueue("cgroup_release", WQ_PERCPU, 1); BUG_ON(!cgroup_release_wq); cgroup_free_wq = alloc_workqueue("cgroup_free", WQ_PERCPU, 1); BUG_ON(!cgroup_free_wq); return 0; } core_initcall(cgroup_wq_init); void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) { struct kernfs_node *kn; kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) return; kernfs_path(kn, buf, buflen); kernfs_put(kn); } /* * __cgroup_get_from_id : get the cgroup associated with cgroup id * @id: cgroup id * On success return the cgrp or ERR_PTR on failure * There are no cgroup NS restrictions. */ struct cgroup *__cgroup_get_from_id(u64 id) { struct kernfs_node *kn; struct cgroup *cgrp; kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) return ERR_PTR(-ENOENT); if (kernfs_type(kn) != KERNFS_DIR) { kernfs_put(kn); return ERR_PTR(-ENOENT); } rcu_read_lock(); cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (cgrp && !cgroup_tryget(cgrp)) cgrp = NULL; rcu_read_unlock(); kernfs_put(kn); if (!cgrp) return ERR_PTR(-ENOENT); return cgrp; } /* * cgroup_get_from_id : get the cgroup associated with cgroup id * @id: cgroup id * On success return the cgrp or ERR_PTR on failure * Only cgroups within current task's cgroup NS are valid. */ struct cgroup *cgroup_get_from_id(u64 id) { struct cgroup *cgrp, *root_cgrp; cgrp = __cgroup_get_from_id(id); if (IS_ERR(cgrp)) return cgrp; root_cgrp = current_cgns_cgroup_dfl(); if (!cgroup_is_descendant(cgrp, root_cgrp)) { cgroup_put(cgrp); return ERR_PTR(-ENOENT); } return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_id); /* * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy * - Used for /proc/<pid>/cgroup. 
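 *
 * Editorial example of the output format (paths are illustrative): the
 * default hierarchy prints "0::<path>" while v1 hierarchies print
 * "<id>:<controllers>:<path>", e.g.:
 *
 *	0::/user.slice/session-1.scope
 *	4:cpu,cpuacct:/background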
*/ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk) { char *buf; int retval; struct cgroup_root *root; retval = -ENOMEM; buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) goto out; rcu_read_lock(); spin_lock_irq(&css_set_lock); for_each_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; int ssid, count = 0; if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible)) continue; cgrp = task_cgroup_from_root(tsk, root); /* The root has already been unmounted. */ if (!cgrp) continue; seq_printf(m, "%d:", root->hierarchy_id); if (root != &cgrp_dfl_root) for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) seq_printf(m, "%s%s", count++ ? "," : "", ss->legacy_name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? "," : "", root->name); seq_putc(m, ':'); /* * On traditional hierarchies, all zombie tasks show up as * belonging to the root cgroup. On the default hierarchy, * while a zombie doesn't show up in "cgroup.procs" and * thus can't be migrated, its /proc/PID/cgroup keeps * reporting the cgroup it belonged to before exiting. If * the cgroup is removed before the zombie is reaped, * " (deleted)" is appended to the cgroup path. */ if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, current->nsproxy->cgroup_ns); if (retval == -E2BIG) retval = -ENAMETOOLONG; if (retval < 0) goto out_unlock; seq_puts(m, buf); } else { seq_puts(m, "/"); } if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) seq_puts(m, " (deleted)\n"); else seq_putc(m, '\n'); } retval = 0; out_unlock: spin_unlock_irq(&css_set_lock); rcu_read_unlock(); kfree(buf); out: return retval; } /** * cgroup_fork - initialize cgroup related fields during copy_process() * @child: pointer to task_struct of forking parent process. * * A task is associated with the init_css_set until cgroup_post_fork() * attaches it to the target css_set. */ void cgroup_fork(struct task_struct *child) { RCU_INIT_POINTER(child->cgroups, &init_css_set); INIT_LIST_HEAD(&child->cg_list); } /** * cgroup_v1v2_get_from_file - get a cgroup pointer from a file pointer * @f: file corresponding to cgroup_dir * * Find the cgroup from a file pointer associated with a cgroup directory. * Returns a pointer to the cgroup on success. ERR_PTR is returned if the * cgroup cannot be found. */ static struct cgroup *cgroup_v1v2_get_from_file(struct file *f) { struct cgroup_subsys_state *css; css = css_tryget_online_from_dir(f->f_path.dentry, NULL); if (IS_ERR(css)) return ERR_CAST(css); return css->cgroup; } /** * cgroup_get_from_file - same as cgroup_v1v2_get_from_file, but only supports * cgroup2. * @f: file corresponding to cgroup2_dir */ static struct cgroup *cgroup_get_from_file(struct file *f) { struct cgroup *cgrp = cgroup_v1v2_get_from_file(f); if (IS_ERR(cgrp)) return ERR_CAST(cgrp); if (!cgroup_on_dfl(cgrp)) { cgroup_put(cgrp); return ERR_PTR(-EBADF); } return cgrp; } /** * cgroup_css_set_fork - find or create a css_set for a child process * @kargs: the arguments passed to create the child process * * This functions finds or creates a new css_set which the child * process will be attached to in cgroup_post_fork(). By default, * the child process will be given the same css_set as its parent. * * If CLONE_INTO_CGROUP is specified this function will try to find an * existing css_set which includes the requested cgroup and if not create * a new css_set that the child will be attached to later. 
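 *
 * Editorial sketch (not part of the original comment): userspace reaches
 * cgroup_css_set_fork() via clone3() with CLONE_INTO_CGROUP and a file
 * descriptor of the target cgroup directory; target_cgroup_dir_fd below
 * is illustrative:
 *
 *	struct clone_args args = {
 *		.flags		= CLONE_INTO_CGROUP,
 *		.exit_signal	= SIGCHLD,
 *		.cgroup		= target_cgroup_dir_fd,
 *	};
 *	pid = syscall(__NR_clone3, &args, sizeof(args));
 *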
If this function * succeeds it will hold cgroup_threadgroup_rwsem on return. If * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex * before grabbing cgroup_threadgroup_rwsem and will hold a reference * to the target cgroup. */ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem) { int ret; struct cgroup *dst_cgrp = NULL; struct css_set *cset; struct super_block *sb; if (kargs->flags & CLONE_INTO_CGROUP) cgroup_lock(); cgroup_threadgroup_change_begin(current); spin_lock_irq(&css_set_lock); cset = task_css_set(current); get_css_set(cset); if (kargs->cgrp) kargs->kill_seq = kargs->cgrp->kill_seq; else kargs->kill_seq = cset->dfl_cgrp->kill_seq; spin_unlock_irq(&css_set_lock); if (!(kargs->flags & CLONE_INTO_CGROUP)) { kargs->cset = cset; return 0; } CLASS(fd_raw, f)(kargs->cgroup); if (fd_empty(f)) { ret = -EBADF; goto err; } sb = fd_file(f)->f_path.dentry->d_sb; dst_cgrp = cgroup_get_from_file(fd_file(f)); if (IS_ERR(dst_cgrp)) { ret = PTR_ERR(dst_cgrp); dst_cgrp = NULL; goto err; } if (cgroup_is_dead(dst_cgrp)) { ret = -ENODEV; goto err; } /* * Verify that we the target cgroup is writable for us. This is * usually done by the vfs layer but since we're not going through * the vfs layer here we need to do it "manually". */ ret = cgroup_may_write(dst_cgrp, sb); if (ret) goto err; /* * Spawning a task directly into a cgroup works by passing a file * descriptor to the target cgroup directory. This can even be an O_PATH * file descriptor. But it can never be a cgroup.procs file descriptor. * This was done on purpose so spawning into a cgroup could be * conceptualized as an atomic * * fd = openat(dfd_cgroup, "cgroup.procs", ...); * write(fd, <child-pid>, ...); * * sequence, i.e. it's a shorthand for the caller opening and writing * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us * to always use the caller's credentials. */ ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb, !(kargs->flags & CLONE_THREAD), current->nsproxy->cgroup_ns); if (ret) goto err; kargs->cset = find_css_set(cset, dst_cgrp); if (!kargs->cset) { ret = -ENOMEM; goto err; } put_css_set(cset); kargs->cgrp = dst_cgrp; return ret; err: cgroup_threadgroup_change_end(current); cgroup_unlock(); if (dst_cgrp) cgroup_put(dst_cgrp); put_css_set(cset); if (kargs->cset) put_css_set(kargs->cset); return ret; } /** * cgroup_css_set_put_fork - drop references we took during fork * @kargs: the arguments passed to create the child process * * Drop references to the prepared css_set and target cgroup if * CLONE_INTO_CGROUP was requested. */ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs) __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { struct cgroup *cgrp = kargs->cgrp; struct css_set *cset = kargs->cset; cgroup_threadgroup_change_end(current); if (cset) { put_css_set(cset); kargs->cset = NULL; } if (kargs->flags & CLONE_INTO_CGROUP) { cgroup_unlock(); if (cgrp) { cgroup_put(cgrp); kargs->cgrp = NULL; } } } /** * cgroup_can_fork - called on a new task before the process is exposed * @child: the child process * @kargs: the arguments passed to create the child process * * This prepares a new css_set for the child process which the child will * be attached to in cgroup_post_fork(). * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork() * callback returns an error, the fork aborts with that error code. 
This * allows for a cgroup subsystem to conditionally allow or deny new forks. */ int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs) { struct cgroup_subsys *ss; int i, j, ret; ret = cgroup_css_set_fork(kargs); if (ret) return ret; do_each_subsys_mask(ss, i, have_canfork_callback) { ret = ss->can_fork(child, kargs->cset); if (ret) goto out_revert; } while_each_subsys_mask(); return 0; out_revert: for_each_subsys(ss, j) { if (j >= i) break; if (ss->cancel_fork) ss->cancel_fork(child, kargs->cset); } cgroup_css_set_put_fork(kargs); return ret; } /** * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() * @child: the child process * @kargs: the arguments passed to create the child process * * This calls the cancel_fork() callbacks if a fork failed *after* * cgroup_can_fork() succeeded and cleans up references we took to * prepare a new css_set for the child process in cgroup_can_fork(). */ void cgroup_cancel_fork(struct task_struct *child, struct kernel_clone_args *kargs) { struct cgroup_subsys *ss; int i; for_each_subsys(ss, i) if (ss->cancel_fork) ss->cancel_fork(child, kargs->cset); cgroup_css_set_put_fork(kargs); } /** * cgroup_post_fork - finalize cgroup setup for the child process * @child: the child process * @kargs: the arguments passed to create the child process * * Attach the child process to its css_set calling the subsystem fork() * callbacks. */ void cgroup_post_fork(struct task_struct *child, struct kernel_clone_args *kargs) __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { unsigned int cgrp_kill_seq = 0; unsigned long cgrp_flags = 0; bool kill = false; struct cgroup_subsys *ss; struct css_set *cset; int i; cset = kargs->cset; kargs->cset = NULL; spin_lock_irq(&css_set_lock); /* init tasks are special, only link regular threads */ if (likely(child->pid)) { if (kargs->cgrp) { cgrp_flags = kargs->cgrp->flags; cgrp_kill_seq = kargs->cgrp->kill_seq; } else { cgrp_flags = cset->dfl_cgrp->flags; cgrp_kill_seq = cset->dfl_cgrp->kill_seq; } WARN_ON_ONCE(!list_empty(&child->cg_list)); cset->nr_tasks++; css_set_move_task(child, NULL, cset, false); } else { put_css_set(cset); cset = NULL; } if (!(child->flags & PF_KTHREAD)) { if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) { /* * If the cgroup has to be frozen, the new task has * too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to * get the task into the frozen state. */ spin_lock(&child->sighand->siglock); WARN_ON_ONCE(child->frozen); child->jobctl |= JOBCTL_TRAP_FREEZE; spin_unlock(&child->sighand->siglock); /* * Calling cgroup_update_frozen() isn't required here, * because it will be called anyway a bit later from * do_freezer_trap(). So we avoid cgroup's transient * switch from the frozen state and back. */ } /* * If the cgroup is to be killed notice it now and take the * child down right after we finished preparing it for * userspace. */ kill = kargs->kill_seq != cgrp_kill_seq; } spin_unlock_irq(&css_set_lock); /* * Call ss->fork(). This must happen after @child is linked on * css_set; otherwise, @child might change state between ->fork() * and addition to css_set. */ do_each_subsys_mask(ss, i, have_fork_callback) { ss->fork(child); } while_each_subsys_mask(); /* Make the new cset the root_cset of the new cgroup namespace. */ if (kargs->flags & CLONE_NEWCGROUP) { struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset; get_css_set(cset); child->nsproxy->cgroup_ns->root_cset = cset; put_css_set(rcset); } /* Cgroup has to be killed so take down child immediately. 
*/ if (unlikely(kill)) do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID); cgroup_css_set_put_fork(kargs); } /** * cgroup_task_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process * * Description: Detach cgroup from @tsk. * */ void cgroup_task_exit(struct task_struct *tsk) { struct cgroup_subsys *ss; int i; /* see cgroup_post_fork() for details */ do_each_subsys_mask(ss, i, have_exit_callback) { ss->exit(tsk); } while_each_subsys_mask(); } static void do_cgroup_task_dead(struct task_struct *tsk) { struct cgrp_cset_link *link; struct css_set *cset; unsigned long flags; spin_lock_irqsave(&css_set_lock, flags); WARN_ON_ONCE(list_empty(&tsk->cg_list)); cset = task_css_set(tsk); css_set_move_task(tsk, cset, NULL, false); cset->nr_tasks--; /* matches the signal->live check in css_task_iter_advance() */ if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live)) list_add_tail(&tsk->cg_list, &cset->dying_tasks); /* kick cgroup_drain_dying() waiters, see cgroup_rmdir() */ list_for_each_entry(link, &cset->cgrp_links, cgrp_link) if (waitqueue_active(&link->cgrp->dying_populated_waitq)) wake_up(&link->cgrp->dying_populated_waitq); if (dl_task(tsk)) dec_dl_tasks_cs(tsk); WARN_ON_ONCE(cgroup_task_frozen(tsk)); if (unlikely(!(tsk->flags & PF_KTHREAD) && test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags))) cgroup_update_frozen(task_dfl_cgroup(tsk)); spin_unlock_irqrestore(&css_set_lock, flags); } #ifdef CONFIG_PREEMPT_RT /* * cgroup_task_dead() is called from finish_task_switch() which doesn't allow * scheduling even in RT. As the task_dead path requires grabbing css_set_lock, * this lead to sleeping in the invalid context warning bug. css_set_lock is too * big to become a raw_spinlock. The task_dead path doesn't need to run * synchronously but can't be delayed indefinitely either as the dead task pins * the cgroup and task_struct can be pinned indefinitely. Bounce through lazy * irq_work to allow batching while ensuring timely completion. 
*/ static DEFINE_PER_CPU(struct llist_head, cgrp_dead_tasks); static DEFINE_PER_CPU(struct irq_work, cgrp_dead_tasks_iwork); static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork) { struct llist_node *lnode; struct task_struct *task, *next; lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks)); llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) { do_cgroup_task_dead(task); put_task_struct(task); } } static void __init cgroup_rt_init(void) { int cpu; for_each_possible_cpu(cpu) { init_llist_head(per_cpu_ptr(&cgrp_dead_tasks, cpu)); per_cpu(cgrp_dead_tasks_iwork, cpu) = IRQ_WORK_INIT_LAZY(cgrp_dead_tasks_iwork_fn); } } void cgroup_task_dead(struct task_struct *task) { get_task_struct(task); llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks)); irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork)); } #else /* CONFIG_PREEMPT_RT */ static void __init cgroup_rt_init(void) {} void cgroup_task_dead(struct task_struct *task) { do_cgroup_task_dead(task); } #endif /* CONFIG_PREEMPT_RT */ void cgroup_task_release(struct task_struct *task) { struct cgroup_subsys *ss; int ssid; do_each_subsys_mask(ss, ssid, have_release_callback) { ss->release(task); } while_each_subsys_mask(); } void cgroup_task_free(struct task_struct *task) { struct css_set *cset = task_css_set(task); if (!list_empty(&task->cg_list)) { spin_lock_irq(&css_set_lock); css_set_skip_task_iters(task_css_set(task), task); list_del_init(&task->cg_list); spin_unlock_irq(&css_set_lock); } put_css_set(cset); } static int __init cgroup_disable(char *str) { struct cgroup_subsys *ss; char *token; int i; while ((token = strsep(&str, ",")) != NULL) { if (!*token) continue; for_each_subsys(ss, i) { if (strcmp(token, ss->name) && strcmp(token, ss->legacy_name)) continue; static_branch_disable(cgroup_subsys_enabled_key[i]); pr_info("Disabling %s control group subsystem\n", ss->name); } for (i = 0; i < OPT_FEATURE_COUNT; i++) { if (strcmp(token, cgroup_opt_feature_names[i])) continue; cgroup_feature_disable_mask |= 1 << i; pr_info("Disabling %s control group feature\n", cgroup_opt_feature_names[i]); break; } } return 1; } __setup("cgroup_disable=", cgroup_disable); void __init __weak enable_debug_cgroup(void) { } static int __init enable_cgroup_debug(char *str) { cgroup_debug = true; enable_debug_cgroup(); return 1; } __setup("cgroup_debug", enable_cgroup_debug); static int __init cgroup_favordynmods_setup(char *str) { return (kstrtobool(str, &have_favordynmods) == 0); } __setup("cgroup_favordynmods=", cgroup_favordynmods_setup); /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest * @ss: subsystem of interest * * If @dentry is a directory for a cgroup which has @ss enabled on it, try * to get the corresponding css and return it. If such css doesn't exist * or can't be pinned, an ERR_PTR value is returned. */ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss) { struct kernfs_node *kn = kernfs_node_from_dentry(dentry); struct file_system_type *s_type = dentry->d_sb->s_type; struct cgroup_subsys_state *css = NULL; struct cgroup *cgrp; /* is @dentry a cgroup dir? */ if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) || !kn || kernfs_type(kn) != KERNFS_DIR) return ERR_PTR(-EBADF); rcu_read_lock(); /* * This path doesn't originate from kernfs and @kn could already * have been or be removed at any point. @kn->priv is RCU * protected for this access. See css_release_work_fn() for details. 
*/ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (cgrp) css = cgroup_css(cgrp, ss); if (!css || !css_tryget_online(css)) css = ERR_PTR(-ENOENT); rcu_read_unlock(); return css; } /** * css_from_id - lookup css by id * @id: the cgroup id * @ss: cgroup subsys to be looked into * * Returns the css if there's valid one with @id, otherwise returns NULL. * Should be called under rcu_read_lock(). */ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { WARN_ON_ONCE(!rcu_read_lock_held()); return idr_find(&ss->css_idr, id); } /** * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path * @path: path on the default hierarchy * * Find the cgroup at @path on the default hierarchy, increment its * reference count and return it. Returns pointer to the found cgroup on * success, ERR_PTR(-ENOENT) if @path doesn't exist or if the cgroup has already * been released and ERR_PTR(-ENOTDIR) if @path points to a non-directory. */ struct cgroup *cgroup_get_from_path(const char *path) { struct kernfs_node *kn; struct cgroup *cgrp = ERR_PTR(-ENOENT); struct cgroup *root_cgrp; root_cgrp = current_cgns_cgroup_dfl(); kn = kernfs_walk_and_get(root_cgrp->kn, path); if (!kn) goto out; if (kernfs_type(kn) != KERNFS_DIR) { cgrp = ERR_PTR(-ENOTDIR); goto out_kernfs; } rcu_read_lock(); cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (!cgrp || !cgroup_tryget(cgrp)) cgrp = ERR_PTR(-ENOENT); rcu_read_unlock(); out_kernfs: kernfs_put(kn); out: return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_path); /** * cgroup_v1v2_get_from_fd - get a cgroup pointer from a fd * @fd: fd obtained by open(cgroup_dir) * * Find the cgroup from a fd which should be obtained * by opening a cgroup directory. Returns a pointer to the * cgroup on success. ERR_PTR is returned if the cgroup * cannot be found. */ struct cgroup *cgroup_v1v2_get_from_fd(int fd) { CLASS(fd_raw, f)(fd); if (fd_empty(f)) return ERR_PTR(-EBADF); return cgroup_v1v2_get_from_file(fd_file(f)); } /** * cgroup_get_from_fd - same as cgroup_v1v2_get_from_fd, but only supports * cgroup2. * @fd: fd obtained by open(cgroup2_dir) */ struct cgroup *cgroup_get_from_fd(int fd) { struct cgroup *cgrp = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgrp)) return ERR_CAST(cgrp); if (!cgroup_on_dfl(cgrp)) { cgroup_put(cgrp); return ERR_PTR(-EBADF); } return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_fd); static u64 power_of_ten(int power) { u64 v = 1; while (power--) v *= 10; return v; } /** * cgroup_parse_float - parse a floating number * @input: input string * @dec_shift: number of decimal digits to shift * @v: output * * Parse a decimal floating point number in @input and store the result in * @v with decimal point right shifted @dec_shift times. For example, if * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12345. * Returns 0 on success, -errno otherwise. * * There's nothing cgroup specific about this function except that it's * currently the only user. */ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v) { s64 whole, frac = 0; int fstart = 0, fend = 0, flen; if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend)) return -EINVAL; if (frac < 0) return -EINVAL; flen = fend > fstart ? fend - fstart : 0; if (flen < dec_shift) frac *= power_of_ten(dec_shift - flen); else frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift)); *v = whole * power_of_ten(dec_shift) + frac; return 0; } /* * sock->sk_cgrp_data handling. 
For more info, see sock_cgroup_data * definition in cgroup-defs.h. */ #ifdef CONFIG_SOCK_CGROUP_DATA void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { struct cgroup *cgroup; rcu_read_lock(); /* Don't associate the sock with unrelated interrupted task's cgroup. */ if (in_interrupt()) { cgroup = &cgrp_dfl_root.cgrp; cgroup_get(cgroup); goto out; } while (true) { struct css_set *cset; cset = task_css_set(current); if (likely(cgroup_tryget(cset->dfl_cgrp))) { cgroup = cset->dfl_cgrp; break; } cpu_relax(); } out: skcd->cgroup = cgroup; cgroup_bpf_get(cgroup); rcu_read_unlock(); } void cgroup_sk_clone(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); /* * We might be cloning a socket which is left in an empty * cgroup and the cgroup might have already been rmdir'd. * Don't use cgroup_get_live(). */ cgroup_get(cgrp); cgroup_bpf_get(cgrp); } void cgroup_sk_free(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); cgroup_bpf_put(cgrp); cgroup_put(cgrp); } #endif /* CONFIG_SOCK_CGROUP_DATA */ #ifdef CONFIG_SYSFS static ssize_t show_delegatable_files(struct cftype *files, char *buf, ssize_t size, const char *prefix) { struct cftype *cft; ssize_t ret = 0; for (cft = files; cft && cft->name[0] != '\0'; cft++) { if (!(cft->flags & CFTYPE_NS_DELEGATABLE)) continue; if (prefix) ret += snprintf(buf + ret, size - ret, "%s.", prefix); ret += snprintf(buf + ret, size - ret, "%s\n", cft->name); if (WARN_ON(ret >= size)) break; } return ret; } static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct cgroup_subsys *ss; int ssid; ssize_t ret = 0; ret = show_delegatable_files(cgroup_base_files, buf + ret, PAGE_SIZE - ret, NULL); if (cgroup_psi_enabled()) ret += show_delegatable_files(cgroup_psi_files, buf + ret, PAGE_SIZE - ret, NULL); for_each_subsys(ss, ssid) ret += show_delegatable_files(ss->dfl_cftypes, buf + ret, PAGE_SIZE - ret, cgroup_subsys_name[ssid]); return ret; } static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate); static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return snprintf(buf, PAGE_SIZE, "nsdelegate\n" "favordynmods\n" "memory_localevents\n" "memory_recursiveprot\n" "memory_hugetlb_accounting\n" "pids_localevents\n"); } static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); static struct attribute *cgroup_sysfs_attrs[] = { &cgroup_delegate_attr.attr, &cgroup_features_attr.attr, NULL, }; static const struct attribute_group cgroup_sysfs_attr_group = { .attrs = cgroup_sysfs_attrs, .name = "cgroup", }; static int __init cgroup_sysfs_init(void) { return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group); } subsys_initcall(cgroup_sysfs_init); #endif /* CONFIG_SYSFS */ |
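/*
 * Illustrative sketch (not part of the kernel sources above; assumes a plain
 * userspace toolchain): the kernel-doc for cgroup_parse_float() describes a
 * decimal-shift rule for turning "whole.frac" strings into fixed-point
 * integers.  The standalone program below mirrors that arithmetic -- pad the
 * fraction when it is shorter than the shift, round it when it is longer --
 * so the rule can be tried outside the kernel.  The names parse_fixed_point()
 * and pow10ll() are invented for this sketch.
 */
#include <stdio.h>

static long long pow10ll(int power)
{
	long long v = 1;

	while (power-- > 0)
		v *= 10;
	return v;
}

/* Parse "whole.frac" and return the value scaled by 10^dec_shift. */
static int parse_fixed_point(const char *input, int dec_shift, long long *out)
{
	long long whole, frac = 0;
	int fstart = 0, fend = 0, flen;

	if (sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend) < 1)
		return -1;
	if (frac < 0)
		return -1;

	flen = fend > fstart ? fend - fstart : 0;
	if (flen < dec_shift)
		frac *= pow10ll(dec_shift - flen);	/* pad short fractions */
	else
		frac = (frac + pow10ll(flen - dec_shift) / 2) /
		       pow10ll(flen - dec_shift);	/* round long fractions */

	*out = whole * pow10ll(dec_shift) + frac;
	return 0;
}

int main(void)
{
	long long v;

	if (parse_fixed_point("12.34", 3, &v) == 0)
		printf("12.34  -> %lld\n", v);	/* 12340: fraction padded */
	if (parse_fixed_point("0.6789", 2, &v) == 0)
		printf("0.6789 -> %lld\n", v);	/* 68: fraction rounded */
	return 0;
}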
// SPDX-License-Identifier: GPL-2.0+ /* * NILFS inode operations. * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * * Written by Ryusuke Konishi. * */ #include <linux/buffer_head.h> #include <linux/gfp.h> #include <linux/mpage.h> #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/uio.h> #include <linux/fiemap.h> #include <linux/random.h> #include "nilfs.h" #include "btnode.h" #include "segment.h" #include "page.h" #include "mdt.h" #include "cpfile.h" #include "ifile.h" /** * struct nilfs_iget_args - arguments used during comparison between inodes * @ino: inode number * @cno: checkpoint number * @root: pointer on NILFS root object (mounted checkpoint) * @type: inode type */ struct nilfs_iget_args { u64 ino; __u64 cno; struct nilfs_root *root; unsigned int type; }; static int nilfs_iget_test(struct inode *inode, void *opaque); void nilfs_inode_add_blocks(struct inode *inode, int n) { struct nilfs_root *root = NILFS_I(inode)->i_root; inode_add_bytes(inode, i_blocksize(inode) * n); if (root) atomic64_add(n, &root->blocks_count); } void nilfs_inode_sub_blocks(struct inode *inode, int n) { struct nilfs_root *root = NILFS_I(inode)->i_root; inode_sub_bytes(inode, i_blocksize(inode) * n); if (root) atomic64_sub(n, &root->blocks_count); } /** * nilfs_get_block() - get a file block on the filesystem (callback function) * @inode: inode struct of the target file * @blkoff: file block number * @bh_result: buffer head to be mapped on * @create: indicate whether allocating the block or not when it has not * been allocated yet. * * This function does not issue actual read request of the specified data * block. It is done by VFS. * * Return: 0 on success, or a negative error code on failure.
*/ int nilfs_get_block(struct inode *inode, sector_t blkoff, struct buffer_head *bh_result, int create) { struct nilfs_inode_info *ii = NILFS_I(inode); struct the_nilfs *nilfs = inode->i_sb->s_fs_info; __u64 blknum = 0; int err = 0, ret; unsigned int maxblocks = bh_result->b_size >> inode->i_blkbits; down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks); up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); if (ret >= 0) { /* found */ map_bh(bh_result, inode->i_sb, blknum); if (ret > 0) bh_result->b_size = (ret << inode->i_blkbits); goto out; } /* data block was not found */ if (ret == -ENOENT && create) { struct nilfs_transaction_info ti; bh_result->b_blocknr = 0; err = nilfs_transaction_begin(inode->i_sb, &ti, 1); if (unlikely(err)) goto out; err = nilfs_bmap_insert(ii->i_bmap, blkoff, (unsigned long)bh_result); if (unlikely(err != 0)) { if (err == -EEXIST) { /* * The get_block() function could be called * from multiple callers for an inode. * However, the page having this block must * be locked in this case. */ nilfs_warn(inode->i_sb, "%s (ino=%llu): a race condition while inserting a data block at offset=%llu", __func__, inode->i_ino, (unsigned long long)blkoff); err = -EAGAIN; } nilfs_transaction_abort(inode->i_sb); goto out; } nilfs_mark_inode_dirty_sync(inode); nilfs_transaction_commit(inode->i_sb); /* never fails */ /* Error handling should be detailed */ set_buffer_new(bh_result); set_buffer_delay(bh_result); map_bh(bh_result, inode->i_sb, 0); /* Disk block number must be changed to proper value */ } else if (ret == -ENOENT) { /* * not found is not error (e.g. hole); must return without * the mapped state flag. */ ; } else { err = ret; } out: return err; } /** * nilfs_read_folio() - implement read_folio() method of nilfs_aops {} * address_space_operations. * @file: file struct of the file to be read * @folio: the folio to be read * * Return: 0 on success, or a negative error code on failure. 
*/ static int nilfs_read_folio(struct file *file, struct folio *folio) { return mpage_read_folio(folio, nilfs_get_block); } static void nilfs_readahead(struct readahead_control *rac) { mpage_readahead(rac, nilfs_get_block); } static int nilfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; int err = 0; if (sb_rdonly(inode->i_sb)) { nilfs_clear_dirty_pages(mapping); return -EROFS; } if (wbc->sync_mode == WB_SYNC_ALL) err = nilfs_construct_dsync_segment(inode->i_sb, inode, wbc->range_start, wbc->range_end); return err; } static bool nilfs_dirty_folio(struct address_space *mapping, struct folio *folio) { struct inode *inode = mapping->host; struct buffer_head *head; unsigned int nr_dirty = 0; bool ret = filemap_dirty_folio(mapping, folio); /* * The page may not be locked, eg if called from try_to_unmap_one() */ spin_lock(&mapping->i_private_lock); head = folio_buffers(folio); if (head) { struct buffer_head *bh = head; do { /* Do not mark hole blocks dirty */ if (buffer_dirty(bh) || !buffer_mapped(bh)) continue; set_buffer_dirty(bh); nr_dirty++; } while (bh = bh->b_this_page, bh != head); } else if (ret) { nr_dirty = 1 << (folio_shift(folio) - inode->i_blkbits); } spin_unlock(&mapping->i_private_lock); if (nr_dirty) nilfs_set_file_dirty(inode, nr_dirty); return ret; } void nilfs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; if (to > inode->i_size) { truncate_pagecache(inode, inode->i_size); nilfs_truncate(inode); } } static int nilfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; int err = nilfs_transaction_begin(inode->i_sb, NULL, 1); if (unlikely(err)) return err; err = block_write_begin(mapping, pos, len, foliop, nilfs_get_block); if (unlikely(err)) { nilfs_write_failed(mapping, pos + len); nilfs_transaction_abort(inode->i_sb); } return err; } static int nilfs_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; unsigned int start = pos & (PAGE_SIZE - 1); unsigned int nr_dirty; int err; nr_dirty = nilfs_page_count_clean_buffers(folio, start, start + copied); copied = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); nilfs_set_file_dirty(inode, nr_dirty); err = nilfs_transaction_commit(inode->i_sb); return err ? 
: copied; } static ssize_t nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct inode *inode = file_inode(iocb->ki_filp); if (iov_iter_rw(iter) == WRITE) return 0; /* Needs synchronization with the cleaner */ return blockdev_direct_IO(iocb, inode, iter, nilfs_get_block); } const struct address_space_operations nilfs_aops = { .read_folio = nilfs_read_folio, .writepages = nilfs_writepages, .dirty_folio = nilfs_dirty_folio, .readahead = nilfs_readahead, .write_begin = nilfs_write_begin, .write_end = nilfs_write_end, .invalidate_folio = block_invalidate_folio, .direct_IO = nilfs_direct_IO, .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, }; const struct address_space_operations nilfs_buffer_cache_aops = { .invalidate_folio = block_invalidate_folio, }; static int nilfs_insert_inode_locked(struct inode *inode, struct nilfs_root *root, unsigned long ino) { struct nilfs_iget_args args = { .ino = ino, .root = root, .cno = 0, .type = NILFS_I_TYPE_NORMAL }; return insert_inode_locked4(inode, ino, nilfs_iget_test, &args); } struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) { struct super_block *sb = dir->i_sb; struct inode *inode; struct nilfs_inode_info *ii; struct nilfs_root *root; struct buffer_head *bh; int err = -ENOMEM; ino_t ino; inode = new_inode(sb); if (unlikely(!inode)) goto failed; mapping_set_gfp_mask(inode->i_mapping, mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS)); root = NILFS_I(dir)->i_root; ii = NILFS_I(inode); ii->i_state = BIT(NILFS_I_NEW); ii->i_type = NILFS_I_TYPE_NORMAL; ii->i_root = root; err = nilfs_ifile_create_inode(root->ifile, &ino, &bh); if (unlikely(err)) goto failed_ifile_create_inode; /* reference count of i_bh inherits from nilfs_mdt_read_block() */ ii->i_bh = bh; atomic64_inc(&root->inodes_count); inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = ino; simple_inode_init_ts(inode); if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { err = nilfs_bmap_read(ii->i_bmap, NULL); if (err < 0) goto failed_after_creation; set_bit(NILFS_I_BMAP, &ii->i_state); /* No lock is needed; iget() ensures it. */ } ii->i_flags = nilfs_mask_flags( mode, NILFS_I(dir)->i_flags & NILFS_FL_INHERITED); /* ii->i_file_acl = 0; */ /* ii->i_dir_acl = 0; */ ii->i_dir_start_lookup = 0; nilfs_set_inode_flags(inode); inode->i_generation = get_random_u32(); if (nilfs_insert_inode_locked(inode, root, ino) < 0) { err = -EIO; goto failed_after_creation; } err = nilfs_init_acl(inode, dir); if (unlikely(err)) /* * Never occur. When supporting nilfs_init_acl(), * proper cancellation of above jobs should be considered. */ goto failed_after_creation; return inode; failed_after_creation: clear_nlink(inode); if (inode_state_read_once(inode) & I_NEW) unlock_new_inode(inode); iput(inode); /* * raw_inode will be deleted through * nilfs_evict_inode(). 
*/ goto failed; failed_ifile_create_inode: make_bad_inode(inode); iput(inode); failed: return ERR_PTR(err); } void nilfs_set_inode_flags(struct inode *inode) { unsigned int flags = NILFS_I(inode)->i_flags; unsigned int new_fl = 0; if (flags & FS_SYNC_FL) new_fl |= S_SYNC; if (flags & FS_APPEND_FL) new_fl |= S_APPEND; if (flags & FS_IMMUTABLE_FL) new_fl |= S_IMMUTABLE; if (flags & FS_NOATIME_FL) new_fl |= S_NOATIME; if (flags & FS_DIRSYNC_FL) new_fl |= S_DIRSYNC; inode_set_flags(inode, new_fl, S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC); } int nilfs_read_inode_common(struct inode *inode, struct nilfs_inode *raw_inode) { struct nilfs_inode_info *ii = NILFS_I(inode); int err; inode->i_mode = le16_to_cpu(raw_inode->i_mode); i_uid_write(inode, le32_to_cpu(raw_inode->i_uid)); i_gid_write(inode, le32_to_cpu(raw_inode->i_gid)); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le64_to_cpu(raw_inode->i_size); inode_set_atime(inode, le64_to_cpu(raw_inode->i_mtime), le32_to_cpu(raw_inode->i_mtime_nsec)); inode_set_ctime(inode, le64_to_cpu(raw_inode->i_ctime), le32_to_cpu(raw_inode->i_ctime_nsec)); inode_set_mtime(inode, le64_to_cpu(raw_inode->i_mtime), le32_to_cpu(raw_inode->i_mtime_nsec)); if (nilfs_is_metadata_file_inode(inode) && !S_ISREG(inode->i_mode)) return -EIO; /* this inode is for metadata and corrupted */ if (inode->i_nlink == 0) return -ESTALE; /* this inode is deleted */ inode->i_blocks = le64_to_cpu(raw_inode->i_blocks); ii->i_flags = le32_to_cpu(raw_inode->i_flags); #if 0 ii->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); ii->i_dir_acl = S_ISREG(inode->i_mode) ? 0 : le32_to_cpu(raw_inode->i_dir_acl); #endif ii->i_dir_start_lookup = 0; inode->i_generation = le32_to_cpu(raw_inode->i_generation); if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { err = nilfs_bmap_read(ii->i_bmap, raw_inode); if (err < 0) return err; set_bit(NILFS_I_BMAP, &ii->i_state); /* No lock is needed; iget() ensures it. 
*/ } return 0; } static int __nilfs_read_inode(struct super_block *sb, struct nilfs_root *root, unsigned long ino, struct inode *inode) { struct the_nilfs *nilfs = sb->s_fs_info; struct buffer_head *bh; struct nilfs_inode *raw_inode; int err; down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh); if (unlikely(err)) goto bad_inode; raw_inode = nilfs_ifile_map_inode(root->ifile, ino, bh); err = nilfs_read_inode_common(inode, raw_inode); if (err) goto failed_unmap; if (S_ISREG(inode->i_mode)) { inode->i_op = &nilfs_file_inode_operations; inode->i_fop = &nilfs_file_operations; inode->i_mapping->a_ops = &nilfs_aops; } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &nilfs_dir_inode_operations; inode->i_fop = &nilfs_dir_operations; inode->i_mapping->a_ops = &nilfs_aops; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &nilfs_symlink_inode_operations; inode_nohighmem(inode); inode->i_mapping->a_ops = &nilfs_aops; } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { inode->i_op = &nilfs_special_inode_operations; init_special_inode( inode, inode->i_mode, huge_decode_dev(le64_to_cpu(raw_inode->i_device_code))); } else { nilfs_error(sb, "invalid file type bits in mode 0%o for inode %lu", inode->i_mode, ino); err = -EIO; goto failed_unmap; } nilfs_ifile_unmap_inode(raw_inode); brelse(bh); up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); nilfs_set_inode_flags(inode); mapping_set_gfp_mask(inode->i_mapping, mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS)); return 0; failed_unmap: nilfs_ifile_unmap_inode(raw_inode); brelse(bh); bad_inode: up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); return err; } static int nilfs_iget_test(struct inode *inode, void *opaque) { struct nilfs_iget_args *args = opaque; struct nilfs_inode_info *ii; if (args->ino != inode->i_ino || args->root != NILFS_I(inode)->i_root) return 0; ii = NILFS_I(inode); if (ii->i_type != args->type) return 0; return !(args->type & NILFS_I_TYPE_GC) || args->cno == ii->i_cno; } static int nilfs_iget_set(struct inode *inode, void *opaque) { struct nilfs_iget_args *args = opaque; inode->i_ino = args->ino; NILFS_I(inode)->i_cno = args->cno; NILFS_I(inode)->i_root = args->root; NILFS_I(inode)->i_type = args->type; if (args->root && args->ino == NILFS_ROOT_INO) nilfs_get_root(args->root); return 0; } struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root, unsigned long ino) { struct nilfs_iget_args args = { .ino = ino, .root = root, .cno = 0, .type = NILFS_I_TYPE_NORMAL }; return ilookup5(sb, ino, nilfs_iget_test, &args); } struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root, unsigned long ino) { struct nilfs_iget_args args = { .ino = ino, .root = root, .cno = 0, .type = NILFS_I_TYPE_NORMAL }; return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args); } struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root, unsigned long ino) { struct inode *inode; int err; inode = nilfs_iget_locked(sb, root, ino); if (unlikely(!inode)) return ERR_PTR(-ENOMEM); if (!(inode_state_read_once(inode) & I_NEW)) { if (!inode->i_nlink) { iput(inode); return ERR_PTR(-ESTALE); } return inode; } err = __nilfs_read_inode(sb, root, ino, inode); if (unlikely(err)) { iget_failed(inode); return ERR_PTR(err); } unlock_new_inode(inode); return inode; } struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, __u64 cno) { struct nilfs_iget_args args = { .ino = ino, .root = NULL, .cno 
= cno, .type = NILFS_I_TYPE_GC }; struct inode *inode; int err; inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args); if (unlikely(!inode)) return ERR_PTR(-ENOMEM); if (!(inode_state_read_once(inode) & I_NEW)) return inode; err = nilfs_init_gcinode(inode); if (unlikely(err)) { iget_failed(inode); return ERR_PTR(err); } unlock_new_inode(inode); return inode; } /** * nilfs_attach_btree_node_cache - attach a B-tree node cache to the inode * @inode: inode object * * nilfs_attach_btree_node_cache() attaches a B-tree node cache to @inode, * or does nothing if the inode already has it. This function allocates * an additional inode to maintain page cache of B-tree nodes one-on-one. * * Return: 0 on success, or %-ENOMEM if memory is insufficient. */ int nilfs_attach_btree_node_cache(struct inode *inode) { struct nilfs_inode_info *ii = NILFS_I(inode); struct inode *btnc_inode; struct nilfs_iget_args args; if (ii->i_assoc_inode) return 0; args.ino = inode->i_ino; args.root = ii->i_root; args.cno = ii->i_cno; args.type = ii->i_type | NILFS_I_TYPE_BTNC; btnc_inode = iget5_locked(inode->i_sb, inode->i_ino, nilfs_iget_test, nilfs_iget_set, &args); if (unlikely(!btnc_inode)) return -ENOMEM; if (inode_state_read_once(btnc_inode) & I_NEW) { nilfs_init_btnc_inode(btnc_inode); unlock_new_inode(btnc_inode); } NILFS_I(btnc_inode)->i_assoc_inode = inode; NILFS_I(btnc_inode)->i_bmap = ii->i_bmap; ii->i_assoc_inode = btnc_inode; return 0; } /** * nilfs_detach_btree_node_cache - detach the B-tree node cache from the inode * @inode: inode object * * nilfs_detach_btree_node_cache() detaches the B-tree node cache and its * holder inode bound to @inode, or does nothing if @inode doesn't have it. */ void nilfs_detach_btree_node_cache(struct inode *inode) { struct nilfs_inode_info *ii = NILFS_I(inode); struct inode *btnc_inode = ii->i_assoc_inode; if (btnc_inode) { NILFS_I(btnc_inode)->i_assoc_inode = NULL; ii->i_assoc_inode = NULL; iput(btnc_inode); } } /** * nilfs_iget_for_shadow - obtain inode for shadow mapping * @inode: inode object that uses shadow mapping * * nilfs_iget_for_shadow() allocates a pair of inodes that holds page * caches for shadow mapping. The page cache for data pages is set up * in one inode and the one for b-tree node pages is set up in the * other inode, which is attached to the former inode. * * Return: a pointer to the inode for data pages on success, or %-ENOMEM * if memory is insufficient. */ struct inode *nilfs_iget_for_shadow(struct inode *inode) { struct nilfs_iget_args args = { .ino = inode->i_ino, .root = NULL, .cno = 0, .type = NILFS_I_TYPE_SHADOW }; struct inode *s_inode; int err; s_inode = iget5_locked(inode->i_sb, inode->i_ino, nilfs_iget_test, nilfs_iget_set, &args); if (unlikely(!s_inode)) return ERR_PTR(-ENOMEM); if (!(inode_state_read_once(s_inode) & I_NEW)) return inode; NILFS_I(s_inode)->i_flags = 0; memset(NILFS_I(s_inode)->i_bmap, 0, sizeof(struct nilfs_bmap)); mapping_set_gfp_mask(s_inode->i_mapping, GFP_NOFS); s_inode->i_mapping->a_ops = &nilfs_buffer_cache_aops; err = nilfs_attach_btree_node_cache(s_inode); if (unlikely(err)) { iget_failed(s_inode); return ERR_PTR(err); } unlock_new_inode(s_inode); return s_inode; } /** * nilfs_write_inode_common - export common inode information to on-disk inode * @inode: inode object * @raw_inode: on-disk inode * * This function writes standard information from the on-memory inode @inode * to @raw_inode on ifile, cpfile or a super root block. 
Since inode bmap * data is not exported, nilfs_bmap_write() must be called separately during * log writing. */ void nilfs_write_inode_common(struct inode *inode, struct nilfs_inode *raw_inode) { struct nilfs_inode_info *ii = NILFS_I(inode); raw_inode->i_mode = cpu_to_le16(inode->i_mode); raw_inode->i_uid = cpu_to_le32(i_uid_read(inode)); raw_inode->i_gid = cpu_to_le32(i_gid_read(inode)); raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); raw_inode->i_size = cpu_to_le64(inode->i_size); raw_inode->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); raw_inode->i_mtime = cpu_to_le64(inode_get_mtime_sec(inode)); raw_inode->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); raw_inode->i_mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode)); raw_inode->i_blocks = cpu_to_le64(inode->i_blocks); raw_inode->i_flags = cpu_to_le32(ii->i_flags); raw_inode->i_generation = cpu_to_le32(inode->i_generation); /* * When extending inode, nilfs->ns_inode_size should be checked * for substitutions of appended fields. */ } void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags) { ino_t ino = inode->i_ino; struct nilfs_inode_info *ii = NILFS_I(inode); struct inode *ifile = ii->i_root->ifile; struct nilfs_inode *raw_inode; raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh); if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state)) memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size); if (flags & I_DIRTY_DATASYNC) set_bit(NILFS_I_INODE_SYNC, &ii->i_state); nilfs_write_inode_common(inode, raw_inode); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) raw_inode->i_device_code = cpu_to_le64(huge_encode_dev(inode->i_rdev)); nilfs_ifile_unmap_inode(raw_inode); } #define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */ static void nilfs_truncate_bmap(struct nilfs_inode_info *ii, unsigned long from) { __u64 b; int ret; if (!test_bit(NILFS_I_BMAP, &ii->i_state)) return; repeat: ret = nilfs_bmap_last_key(ii->i_bmap, &b); if (ret == -ENOENT) return; else if (ret < 0) goto failed; if (b < from) return; b -= min_t(__u64, NILFS_MAX_TRUNCATE_BLOCKS, b - from); ret = nilfs_bmap_truncate(ii->i_bmap, b); nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb); if (!ret || (ret == -ENOMEM && nilfs_bmap_truncate(ii->i_bmap, b) == 0)) goto repeat; failed: nilfs_warn(ii->vfs_inode.i_sb, "error %d truncating bmap (ino=%llu)", ret, ii->vfs_inode.i_ino); } void nilfs_truncate(struct inode *inode) { unsigned long blkoff; unsigned int blocksize; struct nilfs_transaction_info ti; struct super_block *sb = inode->i_sb; struct nilfs_inode_info *ii = NILFS_I(inode); if (!test_bit(NILFS_I_BMAP, &ii->i_state)) return; if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return; blocksize = sb->s_blocksize; blkoff = (inode->i_size + blocksize - 1) >> sb->s_blocksize_bits; nilfs_transaction_begin(sb, &ti, 0); /* never fails */ block_truncate_page(inode->i_mapping, inode->i_size, nilfs_get_block); nilfs_truncate_bmap(ii, blkoff); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (IS_SYNC(inode)) nilfs_set_transaction_flag(NILFS_TI_SYNC); nilfs_mark_inode_dirty(inode); nilfs_set_file_dirty(inode, 0); nilfs_transaction_commit(sb); /* * May construct a logical segment and may fail in sync mode. * But truncate has no return value. */ } static void nilfs_clear_inode(struct inode *inode) { struct nilfs_inode_info *ii = NILFS_I(inode); /* * Free resources allocated in nilfs_read_inode(), here. 
*/ BUG_ON(!list_empty(&ii->i_dirty)); brelse(ii->i_bh); ii->i_bh = NULL; if (nilfs_is_metadata_file_inode(inode)) nilfs_mdt_clear(inode); if (test_bit(NILFS_I_BMAP, &ii->i_state)) nilfs_bmap_clear(ii->i_bmap); if (!(ii->i_type & NILFS_I_TYPE_BTNC)) nilfs_detach_btree_node_cache(inode); if (ii->i_root && inode->i_ino == NILFS_ROOT_INO) nilfs_put_root(ii->i_root); } void nilfs_evict_inode(struct inode *inode) { struct nilfs_transaction_info ti; struct super_block *sb = inode->i_sb; struct nilfs_inode_info *ii = NILFS_I(inode); struct the_nilfs *nilfs; int ret; if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) { truncate_inode_pages_final(&inode->i_data); clear_inode(inode); nilfs_clear_inode(inode); return; } nilfs_transaction_begin(sb, &ti, 0); /* never fails */ truncate_inode_pages_final(&inode->i_data); nilfs = sb->s_fs_info; if (unlikely(sb_rdonly(sb) || !nilfs->ns_writer)) { /* * If this inode is about to be disposed after the file system * has been degraded to read-only due to file system corruption * or after the writer has been detached, do not make any * changes that cause writes, just clear it. * Do this check after read-locking ns_segctor_sem by * nilfs_transaction_begin() in order to avoid a race with * the writer detach operation. */ clear_inode(inode); nilfs_clear_inode(inode); nilfs_transaction_abort(sb); return; } /* TODO: some of the following operations may fail. */ nilfs_truncate_bmap(ii, 0); nilfs_mark_inode_dirty(inode); clear_inode(inode); ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino); if (!ret) atomic64_dec(&ii->i_root->inodes_count); nilfs_clear_inode(inode); if (IS_SYNC(inode)) nilfs_set_transaction_flag(NILFS_TI_SYNC); nilfs_transaction_commit(sb); /* * May construct a logical segment and may fail in sync mode. * But delete_inode has no return value. 
*/ } int nilfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct nilfs_transaction_info ti; struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; int err; err = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (err) return err; err = nilfs_transaction_begin(sb, &ti, 0); if (unlikely(err)) return err; if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size != i_size_read(inode)) { inode_dio_wait(inode); truncate_setsize(inode, iattr->ia_size); nilfs_truncate(inode); } setattr_copy(&nop_mnt_idmap, inode, iattr); mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) { err = nilfs_acl_chmod(inode); if (unlikely(err)) goto out_err; } return nilfs_transaction_commit(sb); out_err: nilfs_transaction_abort(sb); return err; } int nilfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct nilfs_root *root = NILFS_I(inode)->i_root; if ((mask & MAY_WRITE) && root && root->cno != NILFS_CPTREE_CURRENT_CNO) return -EROFS; /* snapshot is not writable */ return generic_permission(&nop_mnt_idmap, inode, mask); } int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh) { struct the_nilfs *nilfs = inode->i_sb->s_fs_info; struct nilfs_inode_info *ii = NILFS_I(inode); int err; spin_lock(&nilfs->ns_inode_lock); if (ii->i_bh == NULL || unlikely(!buffer_uptodate(ii->i_bh))) { spin_unlock(&nilfs->ns_inode_lock); err = nilfs_ifile_get_inode_block(ii->i_root->ifile, inode->i_ino, pbh); if (unlikely(err)) return err; spin_lock(&nilfs->ns_inode_lock); if (ii->i_bh == NULL) ii->i_bh = *pbh; else if (unlikely(!buffer_uptodate(ii->i_bh))) { __brelse(ii->i_bh); ii->i_bh = *pbh; } else { brelse(*pbh); *pbh = ii->i_bh; } } else *pbh = ii->i_bh; get_bh(*pbh); spin_unlock(&nilfs->ns_inode_lock); return 0; } int nilfs_inode_dirty(struct inode *inode) { struct nilfs_inode_info *ii = NILFS_I(inode); struct the_nilfs *nilfs = inode->i_sb->s_fs_info; int ret = 0; if (!list_empty(&ii->i_dirty)) { spin_lock(&nilfs->ns_inode_lock); ret = test_bit(NILFS_I_DIRTY, &ii->i_state) || test_bit(NILFS_I_BUSY, &ii->i_state); spin_unlock(&nilfs->ns_inode_lock); } return ret; } int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty) { struct nilfs_inode_info *ii = NILFS_I(inode); struct the_nilfs *nilfs = inode->i_sb->s_fs_info; atomic_add(nr_dirty, &nilfs->ns_ndirtyblks); if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state)) return 0; spin_lock(&nilfs->ns_inode_lock); if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && !test_bit(NILFS_I_BUSY, &ii->i_state)) { /* * Because this routine may race with nilfs_dispose_list(), * we have to check NILFS_I_QUEUED here, too. */ if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) { /* * This will happen when somebody is freeing * this inode. */ nilfs_warn(inode->i_sb, "cannot set file dirty (ino=%llu): the file is being freed", inode->i_ino); spin_unlock(&nilfs->ns_inode_lock); return -EINVAL; /* * NILFS_I_DIRTY may remain for * freeing inode. */ } list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files); set_bit(NILFS_I_QUEUED, &ii->i_state); } spin_unlock(&nilfs->ns_inode_lock); return 0; } int __nilfs_mark_inode_dirty(struct inode *inode, int flags) { struct the_nilfs *nilfs = inode->i_sb->s_fs_info; struct buffer_head *ibh; int err; /* * Do not dirty inodes after the log writer has been detached * and its nilfs_root struct has been freed. 
*/ if (unlikely(nilfs_purging(nilfs))) return 0; err = nilfs_load_inode_block(inode, &ibh); if (unlikely(err)) { nilfs_warn(inode->i_sb, "cannot mark inode dirty (ino=%llu): error %d loading inode block", inode->i_ino, err); return err; } nilfs_update_inode(inode, ibh, flags); mark_buffer_dirty(ibh); nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile); brelse(ibh); return 0; } /** * nilfs_dirty_inode - reflect changes on given inode to an inode block. * @inode: inode of the file to be registered. * @flags: flags to determine the dirty state of the inode * * nilfs_dirty_inode() loads a inode block containing the specified * @inode and copies data from a nilfs_inode to a corresponding inode * entry in the inode block. This operation is excluded from the segment * construction. This function can be called both as a single operation * and as a part of indivisible file operations. */ void nilfs_dirty_inode(struct inode *inode, int flags) { struct nilfs_transaction_info ti; struct nilfs_mdt_info *mdi = NILFS_MDT(inode); if (is_bad_inode(inode)) { nilfs_warn(inode->i_sb, "tried to mark bad_inode dirty. ignored."); dump_stack(); return; } if (mdi) { nilfs_mdt_mark_dirty(inode); return; } nilfs_transaction_begin(inode->i_sb, &ti, 0); __nilfs_mark_inode_dirty(inode, flags); nilfs_transaction_commit(inode->i_sb); /* never fails */ } int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { struct the_nilfs *nilfs = inode->i_sb->s_fs_info; __u64 logical = 0, phys = 0, size = 0; __u32 flags = 0; loff_t isize; sector_t blkoff, end_blkoff; sector_t delalloc_blkoff; unsigned long delalloc_blklen; unsigned int blkbits = inode->i_blkbits; int ret, n; ret = fiemap_prep(inode, fieinfo, start, &len, 0); if (ret) return ret; inode_lock(inode); isize = i_size_read(inode); blkoff = start >> blkbits; end_blkoff = (start + len - 1) >> blkbits; delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff, &delalloc_blkoff); do { __u64 blkphy; unsigned int maxblocks; if (delalloc_blklen && blkoff == delalloc_blkoff) { if (size) { /* End of the current extent */ ret = fiemap_fill_next_extent( fieinfo, logical, phys, size, flags); if (ret) break; } if (blkoff > end_blkoff) break; flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC; logical = blkoff << blkbits; phys = 0; size = delalloc_blklen << blkbits; blkoff = delalloc_blkoff + delalloc_blklen; delalloc_blklen = nilfs_find_uncommitted_extent( inode, blkoff, &delalloc_blkoff); continue; } /* * Limit the number of blocks that we look up so as * not to get into the next delayed allocation extent. 
*/ maxblocks = INT_MAX; if (delalloc_blklen) maxblocks = min_t(sector_t, delalloc_blkoff - blkoff, maxblocks); blkphy = 0; down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); n = nilfs_bmap_lookup_contig( NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks); up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); if (n < 0) { int past_eof; if (unlikely(n != -ENOENT)) break; /* error */ /* HOLE */ blkoff++; past_eof = ((blkoff << blkbits) >= isize); if (size) { /* End of the current extent */ if (past_eof) flags |= FIEMAP_EXTENT_LAST; ret = fiemap_fill_next_extent( fieinfo, logical, phys, size, flags); if (ret) break; size = 0; } if (blkoff > end_blkoff || past_eof) break; } else { if (size) { if (phys && blkphy << blkbits == phys + size) { /* The current extent goes on */ size += (u64)n << blkbits; } else { /* Terminate the current extent */ ret = fiemap_fill_next_extent( fieinfo, logical, phys, size, flags); if (ret || blkoff > end_blkoff) break; /* Start another extent */ flags = FIEMAP_EXTENT_MERGED; logical = blkoff << blkbits; phys = blkphy << blkbits; size = (u64)n << blkbits; } } else { /* Start a new extent */ flags = FIEMAP_EXTENT_MERGED; logical = blkoff << blkbits; phys = blkphy << blkbits; size = (u64)n << blkbits; } blkoff += n; } cond_resched(); } while (true); /* If ret is 1 then we just hit the end of the extent array */ if (ret == 1) ret = 0; inode_unlock(inode); return ret; } |
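/*
 * Illustrative sketch (not part of fs/nilfs2): nilfs_truncate_bmap() above
 * removes blocks from the tail of the mapping in bounded passes of at most
 * NILFS_MAX_TRUNCATE_BLOCKS (16384 blocks, i.e. 64MB with 4KB blocks) so a
 * single huge truncate is split into smaller operations.  The standalone
 * model below reproduces only that chunking arithmetic; fake_bmap_last_key()
 * and fake_bmap_truncate() are invented stand-ins for the real bmap calls.
 */
#include <stdio.h>
#include <stdint.h>

#define MAX_TRUNCATE_BLOCKS 16384ULL	/* mirrors NILFS_MAX_TRUNCATE_BLOCKS */

static uint64_t nblocks;	/* number of mapped blocks in the fake bmap */

static int fake_bmap_last_key(uint64_t *key)
{
	if (nblocks == 0)
		return -1;	/* nothing mapped, like -ENOENT above */
	*key = nblocks - 1;
	return 0;
}

/* Drop every mapped block with key >= b, like nilfs_bmap_truncate(). */
static void fake_bmap_truncate(uint64_t b)
{
	if (b < nblocks)
		nblocks = b;
}

static void truncate_in_chunks(uint64_t from)
{
	uint64_t b, step;

	while (fake_bmap_last_key(&b) == 0 && b >= from) {
		/* Step back by at most one chunk, but never below 'from'. */
		step = b - from;
		if (step > MAX_TRUNCATE_BLOCKS)
			step = MAX_TRUNCATE_BLOCKS;
		fake_bmap_truncate(b - step);
		printf("pass: %llu blocks remain\n",
		       (unsigned long long)nblocks);
	}
}

int main(void)
{
	nblocks = 50000;	/* pretend the file maps 50000 blocks */
	truncate_in_chunks(10);	/* keep blocks 0..9: three full chunks, then the rest */
	return 0;
}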
/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM printk #if !defined(_TRACE_PRINTK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_PRINTK_H #include <linux/tracepoint.h> TRACE_EVENT(console, TP_PROTO(const char *text, size_t len), TP_ARGS(text, len), TP_STRUCT__entry( __dynamic_array(char, msg, len + 1) ), TP_fast_assign( /* * Each trace entry is printed in a new line. * If the msg finishes with '\n', cut it off * to avoid blank lines in the trace. */ if ((len > 0) && (text[len-1] == '\n')) len -= 1; memcpy(__get_str(msg), text, len); __get_str(msg)[len] = 0; ), TP_printk("%s", __get_str(msg)) ); #endif /* _TRACE_PRINTK_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
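/*
 * Illustrative sketch (not part of the trace header above; plain userspace
 * C): TP_fast_assign() above copies the console text into the dynamically
 * sized "msg" string and drops a single trailing '\n' so each trace entry
 * stays on one line.  The helper below performs the same trim-and-terminate
 * step on an ordinary buffer; copy_console_text() is an invented name and
 * only models the copy, not the tracepoint machinery.
 */
#include <stdio.h>
#include <string.h>

/* Copy len bytes of console text into msg, dropping one trailing newline. */
static void copy_console_text(char *msg, const char *text, size_t len)
{
	if (len > 0 && text[len - 1] == '\n')
		len -= 1;
	memcpy(msg, text, len);
	msg[len] = '\0';
}

int main(void)
{
	char msg[64];

	copy_console_text(msg, "hello from printk\n", 18);
	printf("[%s]\n", msg);	/* prints "[hello from printk]", no blank line */
	return 0;
}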
/* SPDX-License-Identifier: GPL-2.0+ */ /* * Buffer/page management specific to NILFS * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * * Written by Ryusuke Konishi and Seiji Kihara. */ #ifndef _NILFS_PAGE_H #define _NILFS_PAGE_H #include <linux/buffer_head.h> #include "nilfs.h" /* * Extended buffer state bits */ enum { BH_NILFS_Allocated = BH_PrivateStart, BH_NILFS_Node, BH_NILFS_Volatile, BH_NILFS_Checked, BH_NILFS_Redirected, }; BUFFER_FNS(NILFS_Node, nilfs_node) /* nilfs node buffers */ BUFFER_FNS(NILFS_Volatile, nilfs_volatile) BUFFER_FNS(NILFS_Checked, nilfs_checked) /* buffer is verified */ BUFFER_FNS(NILFS_Redirected, nilfs_redirected) /* redirected to a copy */ void __nilfs_clear_folio_dirty(struct folio *); struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *, unsigned long, unsigned long); void nilfs_forget_buffer(struct buffer_head *); void nilfs_copy_buffer(struct buffer_head *, struct buffer_head *); bool nilfs_folio_buffers_clean(struct folio *); void nilfs_folio_bug(struct folio *); int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); void nilfs_copy_back_pages(struct address_space *, struct address_space *); void nilfs_clear_folio_dirty(struct folio *folio); void nilfs_clear_dirty_pages(struct address_space *mapping); unsigned int nilfs_page_count_clean_buffers(struct folio *folio, unsigned int from, unsigned int to); unsigned long nilfs_find_uncommitted_extent(struct inode *inode, sector_t start_blk, sector_t *blkoff); #define NILFS_FOLIO_BUG(folio, m, a...) \ do { nilfs_folio_bug(folio); BUG(); } while (0) #endif /* _NILFS_PAGE_H */
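/*
 * Illustrative, hedged sketch (not part of nilfs page.h): the BUFFER_FNS()
 * invocations above use the generic macro from <linux/buffer_head.h>, which
 * expands each (bit, name) pair into a small family of inline accessors on
 * bh->b_state.  The expansion below is an approximation of what
 * BUFFER_FNS(NILFS_Node, nilfs_node) produces -- the real macro adds
 * __always_inline and a few refinements -- shown only to make names such as
 * buffer_nilfs_node() used elsewhere in nilfs readable.
 */
static inline void set_buffer_nilfs_node(struct buffer_head *bh)
{
	set_bit(BH_NILFS_Node, &bh->b_state);
}

static inline void clear_buffer_nilfs_node(struct buffer_head *bh)
{
	clear_bit(BH_NILFS_Node, &bh->b_state);
}

static inline int buffer_nilfs_node(const struct buffer_head *bh)
{
	return test_bit(BH_NILFS_Node, &bh->b_state);
}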
// SPDX-License-Identifier: GPL-2.0-only /* * Support for polling mode for input devices. */ #include <linux/device.h> #include <linux/export.h> #include <linux/input.h> #include <linux/jiffies.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/workqueue.h> #include "input-poller.h" struct input_dev_poller { void (*poll)(struct input_dev *dev); unsigned int poll_interval; /* msec */ unsigned int poll_interval_max; /* msec */ unsigned int poll_interval_min; /* msec */ struct input_dev *input; struct delayed_work work; }; static void input_dev_poller_queue_work(struct input_dev_poller *poller) { unsigned long delay; delay = msecs_to_jiffies(poller->poll_interval); if (delay >= HZ) delay = round_jiffies_relative(delay); queue_delayed_work(system_freezable_wq, &poller->work, delay); } static void input_dev_poller_work(struct work_struct *work) { struct input_dev_poller *poller = container_of(work, struct input_dev_poller, work.work); poller->poll(poller->input); input_dev_poller_queue_work(poller); } void input_dev_poller_finalize(struct input_dev_poller *poller) { if (!poller->poll_interval) poller->poll_interval = 500; if (!poller->poll_interval_max) poller->poll_interval_max = poller->poll_interval; } void input_dev_poller_start(struct input_dev_poller *poller) { /* Only start polling if polling is enabled */ if (poller->poll_interval > 0) { poller->poll(poller->input); input_dev_poller_queue_work(poller); } } void input_dev_poller_stop(struct input_dev_poller *poller) { cancel_delayed_work_sync(&poller->work); } int input_setup_polling(struct input_dev *dev, void (*poll_fn)(struct input_dev *dev)) { struct input_dev_poller *poller; poller = kzalloc_obj(*poller); if (!poller) { /* * We want to show message even though kzalloc() may have * printed backtrace as knowing what instance of input * device we were dealing with is helpful.
*/ dev_err(dev->dev.parent ?: &dev->dev, "%s: unable to allocate poller structure\n", __func__); return -ENOMEM; } INIT_DELAYED_WORK(&poller->work, input_dev_poller_work); poller->input = dev; poller->poll = poll_fn; dev->poller = poller; return 0; } EXPORT_SYMBOL(input_setup_polling); static bool input_dev_ensure_poller(struct input_dev *dev) { if (!dev->poller) { dev_err(dev->dev.parent ?: &dev->dev, "poller structure has not been set up\n"); return false; } return true; } void input_set_poll_interval(struct input_dev *dev, unsigned int interval) { if (input_dev_ensure_poller(dev)) dev->poller->poll_interval = interval; } EXPORT_SYMBOL(input_set_poll_interval); void input_set_min_poll_interval(struct input_dev *dev, unsigned int interval) { if (input_dev_ensure_poller(dev)) dev->poller->poll_interval_min = interval; } EXPORT_SYMBOL(input_set_min_poll_interval); void input_set_max_poll_interval(struct input_dev *dev, unsigned int interval) { if (input_dev_ensure_poller(dev)) dev->poller->poll_interval_max = interval; } EXPORT_SYMBOL(input_set_max_poll_interval); int input_get_poll_interval(struct input_dev *dev) { if (!dev->poller) return -EINVAL; return dev->poller->poll_interval; } EXPORT_SYMBOL(input_get_poll_interval); /* SYSFS interface */ static ssize_t input_dev_get_poll_interval(struct device *dev, struct device_attribute *attr, char *buf) { struct input_dev *input = to_input_dev(dev); return sprintf(buf, "%d\n", input->poller->poll_interval); } static ssize_t input_dev_set_poll_interval(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct input_dev *input = to_input_dev(dev); struct input_dev_poller *poller = input->poller; unsigned int interval; int err; err = kstrtouint(buf, 0, &interval); if (err) return err; if (interval < poller->poll_interval_min) return -EINVAL; if (interval > poller->poll_interval_max) return -EINVAL; guard(mutex)(&input->mutex); poller->poll_interval = interval; if (input_device_enabled(input)) { cancel_delayed_work_sync(&poller->work); if (poller->poll_interval > 0) input_dev_poller_queue_work(poller); } return count; } static DEVICE_ATTR(poll, 0644, input_dev_get_poll_interval, input_dev_set_poll_interval); static ssize_t input_dev_get_poll_max(struct device *dev, struct device_attribute *attr, char *buf) { struct input_dev *input = to_input_dev(dev); return sprintf(buf, "%d\n", input->poller->poll_interval_max); } static DEVICE_ATTR(max, 0444, input_dev_get_poll_max, NULL); static ssize_t input_dev_get_poll_min(struct device *dev, struct device_attribute *attr, char *buf) { struct input_dev *input = to_input_dev(dev); return sprintf(buf, "%d\n", input->poller->poll_interval_min); } static DEVICE_ATTR(min, 0444, input_dev_get_poll_min, NULL); static umode_t input_poller_attrs_visible(struct kobject *kobj, struct attribute *attr, int n) { struct device *dev = kobj_to_dev(kobj); struct input_dev *input = to_input_dev(dev); return input->poller ? attr->mode : 0; } static struct attribute *input_poller_attrs[] = { &dev_attr_poll.attr, &dev_attr_max.attr, &dev_attr_min.attr, NULL }; struct attribute_group input_poller_attribute_group = { .is_visible = input_poller_attrs_visible, .attrs = input_poller_attrs, }; |
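/*
 * Illustrative sketch (not part of input-poller.c): a driver consumes the
 * helpers above by registering a poll callback and interval range before
 * input_register_device().  This is a hedged, minimal fragment: demo_poll(),
 * demo_register() and the 100/10/1000 ms values are invented for the
 * example, and driver/module boilerplate is omitted.
 */
#include <linux/device.h>
#include <linux/input.h>

static void demo_poll(struct input_dev *dev)
{
	/* Sample the hardware here and report the result to the input core. */
	input_report_key(dev, KEY_WAKEUP, 0);
	input_sync(dev);
}

static int demo_register(struct device *parent)
{
	struct input_dev *dev;
	int err;

	dev = devm_input_allocate_device(parent);
	if (!dev)
		return -ENOMEM;

	dev->name = "demo polled device";
	input_set_capability(dev, EV_KEY, KEY_WAKEUP);

	/* Attach the poller; the poll/min/max sysfs attributes become visible. */
	err = input_setup_polling(dev, demo_poll);
	if (err)
		return err;

	input_set_poll_interval(dev, 100);	/* default polling period, in ms */
	input_set_min_poll_interval(dev, 10);	/* bounds enforced by the sysfs store */
	input_set_max_poll_interval(dev, 1000);

	return input_register_device(dev);
}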
3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 
3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 
4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 
5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 
5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 
6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 
7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 
8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 
8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 
9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 
10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710 10711 10712 10713 10714 
10715 10716 10717 10718 10719 10720 10721 10722 10723 10724 10725 10726 10727 10728 10729 10730 10731 10732 10733 10734 10735 10736 10737 10738 10739 10740 10741 10742 10743 10744 10745 10746 10747 10748 10749 10750 10751 10752 10753 10754 10755 10756 10757 10758 10759 10760 10761 10762 10763 10764 10765 10766 10767 10768 10769 10770 10771 10772 10773 10774 10775 10776 10777 10778 10779 10780 10781 10782 10783 10784 10785 10786 10787 10788 10789 10790 10791 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 10802 10803 10804 10805 10806 10807 10808 10809 10810 10811 10812 10813 10814 10815 10816 10817 10818 10819 10820 10821 10822 10823 10824 10825 10826 10827 10828 10829 10830 10831 10832 10833 10834 10835 10836 10837 10838 10839 10840 10841 10842 10843 10844 10845 10846 10847 10848 10849 10850 10851 10852 10853 10854 10855 10856 10857 10858 10859 10860 10861 10862 10863 10864 10865 10866 10867 10868 10869 10870 10871 10872 10873 10874 10875 10876 10877 10878 10879 10880 10881 10882 10883 10884 10885 10886 10887 10888 10889 10890 10891 10892 10893 10894 10895 10896 10897 10898 10899 10900 10901 10902 10903 10904 10905 10906 10907 10908 10909 10910 10911 10912 10913 10914 10915 10916 10917 10918 10919 10920 10921 10922 10923 10924 10925 10926 10927 10928 10929 10930 10931 10932 10933 10934 10935 10936 10937 10938 10939 10940 10941 10942 10943 10944 10945 10946 10947 10948 10949 10950 10951 10952 10953 10954 10955 10956 10957 10958 10959 10960 10961 10962 10963 10964 10965 10966 10967 10968 10969 10970 10971 10972 10973 10974 10975 10976 10977 10978 10979 10980 10981 10982 10983 10984 10985 10986 10987 10988 10989 10990 10991 10992 10993 10994 10995 10996 10997 10998 10999 11000 11001 11002 11003 11004 11005 11006 11007 11008 11009 11010 11011 11012 11013 11014 11015 11016 11017 11018 11019 11020 11021 11022 11023 11024 11025 11026 11027 11028 11029 11030 11031 11032 11033 11034 11035 11036 11037 11038 11039 11040 11041 11042 11043 11044 11045 11046 11047 11048 11049 11050 11051 11052 11053 11054 11055 11056 11057 11058 11059 11060 11061 11062 11063 11064 11065 11066 11067 11068 11069 11070 11071 11072 11073 11074 11075 11076 11077 11078 11079 11080 11081 11082 11083 11084 11085 11086 11087 11088 11089 11090 11091 11092 11093 11094 11095 11096 11097 11098 11099 11100 11101 11102 11103 11104 11105 11106 11107 11108 11109 11110 11111 11112 11113 11114 11115 11116 11117 11118 11119 11120 11121 11122 11123 11124 11125 11126 11127 11128 11129 11130 11131 11132 11133 11134 11135 11136 11137 11138 11139 11140 11141 11142 11143 11144 11145 11146 11147 11148 11149 11150 11151 11152 11153 11154 11155 11156 11157 11158 11159 11160 11161 11162 11163 11164 11165 11166 11167 11168 11169 11170 11171 11172 11173 11174 11175 11176 11177 11178 11179 11180 11181 11182 11183 11184 11185 11186 11187 11188 11189 11190 11191 11192 11193 11194 11195 11196 11197 11198 11199 11200 11201 11202 11203 11204 11205 11206 11207 11208 11209 11210 11211 11212 11213 11214 11215 11216 11217 11218 11219 11220 11221 11222 11223 11224 11225 11226 11227 11228 11229 11230 11231 11232 11233 11234 11235 11236 11237 11238 11239 11240 11241 11242 11243 11244 11245 11246 11247 11248 11249 11250 11251 11252 11253 11254 11255 11256 11257 11258 11259 11260 11261 11262 11263 11264 11265 11266 11267 11268 11269 11270 11271 11272 11273 11274 11275 11276 11277 11278 11279 11280 11281 11282 11283 11284 11285 11286 11287 11288 11289 11290 11291 11292 11293 11294 11295 11296 11297 11298 11299 11300 11301 11302 11303 11304 11305 11306 
11307 11308 11309 11310 11311 11312 11313 11314 11315 11316 11317 11318 11319 11320 11321 11322 11323 11324 11325 11326 11327 11328 11329 11330 11331 11332 11333 11334 11335 11336 11337 11338 11339 11340 11341 11342 11343 11344 11345 11346 11347 11348 11349 11350 11351 11352 11353 11354 11355 11356 11357 11358 11359 11360 11361 11362 11363 11364 11365 11366 11367 11368 11369 11370 11371 11372 11373 11374 11375 11376 11377 11378 11379 11380 11381 11382 11383 11384 11385 11386 11387 11388 11389 11390 11391 11392 11393 11394 11395 11396 11397 11398 11399 11400 11401 11402 11403 11404 11405 11406 11407 11408 11409 11410 11411 11412 11413 11414 11415 11416 11417 11418 11419 11420 11421 11422 11423 11424 11425 11426 11427 11428 11429 11430 11431 11432 11433 11434 11435 11436 11437 11438 11439 11440 11441 11442 11443 11444 11445 11446 11447 11448 11449 11450 11451 11452 11453 11454 11455 11456 11457 11458 11459 11460 11461 11462 11463 11464 11465 11466 11467 11468 11469 11470 11471 11472 11473 11474 11475 11476 11477 11478 11479 11480 11481 11482 11483 11484 11485 11486 11487 11488 11489 11490 11491 11492 11493 11494 11495 11496 11497 11498 11499 11500 11501 11502 11503 11504 11505 11506 11507 11508 11509 11510 11511 11512 11513 11514 11515 11516 11517 11518 11519 11520 11521 11522 11523 11524 11525 11526 11527 11528 11529 11530 11531 11532 11533 11534 11535 11536 11537 11538 11539 11540 11541 11542 11543 11544 11545 11546 11547 11548 11549 11550 11551 11552 11553 11554 11555 11556 11557 11558 11559 11560 11561 11562 11563 11564 11565 11566 11567 11568 11569 11570 11571 11572 11573 11574 11575 11576 11577 11578 11579 11580 11581 11582 11583 11584 11585 11586 11587 11588 11589 11590 11591 11592 11593 11594 11595 11596 11597 11598 11599 11600 11601 11602 11603 11604 11605 11606 11607 11608 11609 11610 11611 11612 11613 11614 11615 11616 11617 11618 11619 11620 11621 11622 11623 11624 11625 11626 11627 11628 11629 11630 11631 11632 11633 11634 11635 11636 11637 11638 11639 11640 11641 11642 11643 11644 11645 11646 11647 11648 11649 11650 11651 11652 11653 11654 11655 11656 11657 11658 11659 11660 11661 11662 11663 11664 11665 11666 11667 11668 11669 11670 11671 11672 11673 11674 11675 11676 11677 11678 11679 11680 11681 11682 11683 11684 11685 11686 11687 11688 11689 11690 11691 11692 11693 11694 11695 11696 11697 11698 11699 11700 11701 11702 11703 11704 11705 11706 11707 11708 11709 11710 11711 11712 11713 11714 11715 11716 11717 11718 11719 11720 11721 11722 11723 11724 11725 11726 11727 11728 11729 11730 11731 11732 11733 11734 11735 11736 11737 11738 11739 11740 11741 11742 11743 11744 11745 11746 11747 11748 11749 11750 11751 11752 11753 11754 11755 11756 11757 11758 11759 11760 11761 11762 11763 11764 11765 11766 11767 11768 11769 11770 11771 11772 11773 11774 11775 11776 11777 11778 11779 11780 11781 11782 11783 11784 11785 11786 11787 11788 11789 11790 11791 11792 11793 11794 11795 11796 11797 11798 11799 11800 11801 11802 11803 11804 11805 11806 11807 11808 11809 11810 11811 11812 11813 11814 11815 11816 11817 11818 11819 11820 11821 11822 11823 11824 11825 11826 11827 11828 11829 11830 11831 11832 11833 11834 11835 11836 11837 11838 11839 11840 11841 11842 11843 11844 11845 11846 11847 11848 11849 11850 11851 11852 11853 11854 11855 11856 11857 11858 11859 11860 11861 11862 11863 11864 11865 11866 11867 11868 11869 11870 11871 11872 11873 11874 11875 11876 11877 11878 11879 11880 11881 11882 11883 11884 11885 11886 11887 11888 11889 11890 11891 11892 11893 11894 11895 11896 11897 11898 
11899 11900 11901 11902 11903 11904 11905 11906 11907 11908 11909 11910 11911 11912 11913 11914 11915 11916 11917 11918 11919 11920 11921 11922 11923 11924 11925 11926 11927 11928 11929 11930 11931 11932 11933 11934 11935 11936 11937 11938 11939 11940 11941 11942 11943 11944 11945 11946 11947 11948 11949 11950 11951 11952 11953 11954 11955 11956 11957 11958 11959 11960 11961 11962 11963 11964 11965 11966 11967 11968 11969 11970 11971 11972 11973 11974 11975 11976 11977 11978 11979 11980 11981 11982 11983 11984 11985 11986 11987 11988 11989 11990 11991 11992 11993 11994 11995 11996 11997 11998 11999 12000 12001 12002 12003 12004 12005 12006 12007 12008 12009 12010 12011 12012 12013 12014 12015 12016 12017 12018 12019 12020 12021 12022 12023 12024 12025 12026 12027 12028 12029 12030 12031 12032 12033 12034 12035 12036 12037 12038 12039 12040 12041 12042 12043 12044 12045 12046 12047 12048 12049 12050 12051 12052 12053 12054 12055 12056 12057 12058 12059 12060 12061 12062 12063 12064 12065 12066 12067 12068 12069 12070 12071 12072 12073 12074 12075 12076 12077 12078 12079 12080 12081 12082 12083 12084 12085 12086 12087 12088 12089 12090 12091 12092 12093 12094 12095 12096 12097 12098 12099 12100 12101 12102 12103 12104 12105 12106 12107 12108 12109 12110 12111 12112 12113 12114 12115 12116 12117 12118 12119 12120 12121 12122 12123 12124 12125 12126 12127 12128 12129 12130 12131 12132 12133 12134 12135 12136 12137 12138 12139 12140 12141 12142 12143 12144 12145 12146 12147 12148 12149 12150 12151 12152 12153 12154 12155 12156 12157 12158 12159 12160 12161 12162 12163 12164 12165 12166 12167 12168 12169 12170 12171 12172 12173 12174 12175 12176 12177 12178 12179 12180 12181 12182 12183 12184 12185 12186 12187 12188 12189 12190 12191 12192 12193 12194 12195 12196 12197 12198 12199 12200 12201 12202 12203 12204 12205 12206 12207 12208 12209 12210 12211 12212 12213 12214 12215 12216 12217 12218 12219 12220 12221 12222 12223 12224 12225 12226 12227 12228 12229 12230 12231 12232 12233 12234 12235 12236 12237 12238 12239 12240 12241 12242 12243 12244 12245 12246 12247 12248 12249 12250 12251 12252 12253 12254 12255 12256 12257 12258 12259 12260 12261 12262 12263 12264 12265 12266 12267 12268 12269 12270 12271 12272 12273 12274 12275 12276 12277 12278 12279 12280 12281 12282 12283 12284 12285 12286 12287 12288 12289 12290 12291 12292 12293 12294 12295 12296 12297 12298 12299 12300 12301 12302 12303 12304 12305 12306 12307 12308 12309 12310 12311 12312 12313 12314 12315 12316 12317 12318 12319 12320 12321 12322 12323 12324 12325 12326 12327 12328 12329 12330 12331 12332 12333 12334 12335 12336 12337 12338 12339 12340 12341 12342 12343 12344 12345 12346 12347 12348 12349 12350 12351 12352 12353 12354 12355 12356 12357 12358 12359 12360 12361 12362 12363 12364 12365 12366 12367 12368 12369 12370 12371 12372 12373 12374 12375 12376 12377 12378 12379 12380 12381 12382 12383 12384 12385 12386 12387 12388 12389 12390 12391 12392 12393 12394 12395 12396 12397 12398 12399 12400 12401 12402 12403 12404 12405 12406 12407 12408 12409 12410 12411 12412 12413 12414 12415 12416 12417 12418 12419 12420 12421 12422 12423 12424 12425 12426 12427 12428 12429 12430 12431 12432 12433 12434 12435 12436 12437 12438 12439 12440 12441 12442 12443 12444 12445 12446 12447 12448 12449 12450 12451 12452 12453 12454 12455 12456 12457 12458 12459 12460 12461 12462 12463 12464 12465 12466 12467 12468 12469 12470 12471 12472 12473 12474 12475 12476 12477 12478 12479 12480 12481 12482 12483 12484 12485 12486 12487 12488 12489 12490 
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Linux Socket Filter - Kernel level socket filtering
 *
 * Based on the design of the Berkeley Packet Filter. The new
 * internal format has been designed by PLUMgrid:
 *
 *	Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
 *
 * Authors:
 *
 *	Jay Schulist <jschlst@samba.org>
 *	Alexei Starovoitov <ast@plumgrid.com>
 *	Daniel Borkmann <dborkman@redhat.com>
 *
 * Andi Kleen - Fix a few bad bugs and races.
 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
 */

#include <linux/atomic.h>
#include <linux/bpf_verifier.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/sock_diag.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/if_arp.h>
#include <linux/gfp.h>
#include <net/inet_common.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
#include <linux/skmsg.h>
#include <net/sock.h>
#include <net/flow_dissector.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <linux/unaligned.h>
#include <linux/filter.h>
#include <linux/ratelimit.h>
#include <linux/seccomp.h>
#include <linux/if_vlan.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <net/sch_generic.h>
#include <net/cls_cgroup.h>
#include <net/dst_metadata.h>
#include <net/dst.h>
#include <net/sock_reuseport.h>
#include <net/busy_poll.h>
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/udp.h>
#include <linux/bpf_trace.h>
#include <net/xdp_sock.h>
#include <linux/inetdevice.h>
#include <net/inet_hashtables.h>
#include <net/inet6_hashtables.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/flow.h>
#include <net/arp.h>
#include <net/ipv6.h>
#include <net/net_namespace.h>
#include <linux/seg6_local.h>
#include <net/seg6.h>
#include <net/seg6_local.h>
#include <net/lwtunnel.h>
#include <net/bpf_sk_storage.h>
#include <net/transp_v6.h>
#include <linux/btf_ids.h>
#include <net/tls.h>
#include <net/xdp.h>
#include <net/mptcp.h>
#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netkit.h>
#include <linux/un.h>
#include <net/xdp_sock_drv.h>
#include <net/inet_dscp.h>

#include "dev.h"

/* Keep the struct bpf_fib_lookup small so that it fits into a cacheline */
static_assert(sizeof(struct bpf_fib_lookup) == 64, "struct bpf_fib_lookup size check");

static const struct bpf_func_proto *
bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);

int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len)
{
	if (in_compat_syscall()) {
		struct compat_sock_fprog f32;

		if (len != sizeof(f32))
			return -EINVAL;
		if (copy_from_sockptr(&f32, src, sizeof(f32)))
			return -EFAULT;
		memset(dst, 0, sizeof(*dst));
		dst->len = f32.len;
		dst->filter = compat_ptr(f32.filter);
	} else {
		if (len != sizeof(*dst))
			return -EINVAL;
		if (copy_from_sockptr(dst, src,
				      sizeof(*dst)))
			return -EFAULT;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user);

/**
 *	sk_filter_trim_cap - run a packet through a socket filter
 *	@sk: sock associated with &sk_buff
 *	@skb: buffer to filter
 *	@cap: limit on how short the eBPF program may trim the packet
 *
 * Run the eBPF program and then cut skb->data to correct size returned by
 * the program. If pkt_len is 0 we toss packet. If skb->len is smaller
 * than pkt_len we keep whole skb->data. This is the socket level
 * wrapper to bpf_prog_run. It returns 0 if the packet should
 * be accepted or a drop_reason if the packet should be tossed.
 *
 */
enum skb_drop_reason sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb,
					unsigned int cap)
{
	enum skb_drop_reason drop_reason;
	struct sk_filter *filter;
	int err;

	/*
	 * If the skb was allocated from pfmemalloc reserves, only
	 * allow SOCK_MEMALLOC sockets to use it as this socket is
	 * helping free memory
	 */
	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
		return SKB_DROP_REASON_PFMEMALLOC;
	}
	err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
	if (err)
		return SKB_DROP_REASON_SOCKET_FILTER;

	err = security_sock_rcv_skb(sk, skb);
	if (err)
		return SKB_DROP_REASON_SECURITY_HOOK;

	drop_reason = 0;
	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter) {
		struct sock *save_sk = skb->sk;
		unsigned int pkt_len;

		skb->sk = sk;
		pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
		skb->sk = save_sk;
		err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
		if (err)
			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
	}
	rcu_read_unlock();

	return drop_reason;
}
EXPORT_SYMBOL(sk_filter_trim_cap);

BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb)
{
	return skb_get_poff(skb);
}

BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
{
	struct nlattr *nla;

	if (skb_is_nonlinear(skb))
		return 0;

	if (skb->len < sizeof(struct nlattr))
		return 0;

	if (a > skb->len - sizeof(struct nlattr))
		return 0;

	nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
	if (nla)
		return (void *) nla - (void *) skb->data;

	return 0;
}

BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
{
	struct nlattr *nla;

	if (skb_is_nonlinear(skb))
		return 0;

	if (skb->len < sizeof(struct nlattr))
		return 0;

	if (a > skb->len - sizeof(struct nlattr))
		return 0;

	nla = (struct nlattr *) &skb->data[a];
	if (!nla_ok(nla, skb->len - a))
		return 0;

	nla = nla_find_nested(nla, x);
	if (nla)
		return (void *) nla - (void *) skb->data;

	return 0;
}

static int bpf_skb_load_helper_convert_offset(const struct sk_buff *skb, int offset)
{
	if (likely(offset >= 0))
		return offset;

	if (offset >= SKF_NET_OFF)
		return offset - SKF_NET_OFF + skb_network_offset(skb);

	if (offset >= SKF_LL_OFF && skb_mac_header_was_set(skb))
		return offset - SKF_LL_OFF + skb_mac_offset(skb);

	return INT_MIN;
}

BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb,
	   const void *, data, int, headlen, int, offset)
{
	u8 tmp;
	const int len = sizeof(tmp);

	offset = bpf_skb_load_helper_convert_offset(skb, offset);
	if (offset == INT_MIN)
		return -EFAULT;

	if (headlen - offset >= len)
		return *(u8 *)(data + offset);
	if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
		return tmp;
	else
		return -EFAULT;
}

BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
	   int, offset)
{
	return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len,
					 offset);
}

BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb,
	   const void *, data, int, headlen, int, offset)
{
__be16 tmp; const int len = sizeof(tmp); offset = bpf_skb_load_helper_convert_offset(skb, offset); if (offset == INT_MIN) return -EFAULT; if (headlen - offset >= len) return get_unaligned_be16(data + offset); if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) return be16_to_cpu(tmp); else return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, int, offset) { return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len, offset); } BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { __be32 tmp; const int len = sizeof(tmp); offset = bpf_skb_load_helper_convert_offset(skb, offset); if (offset == INT_MIN) return -EFAULT; if (headlen - offset >= len) return get_unaligned_be32(data + offset); if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) return be32_to_cpu(tmp); else return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, int, offset) { return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len, offset); } static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; switch (skb_field) { case SKF_AD_MARK: BUILD_BUG_ON(sizeof_field(struct sk_buff, mark) != 4); *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, offsetof(struct sk_buff, mark)); break; case SKF_AD_PKTTYPE: *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX); #ifdef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5); #endif break; case SKF_AD_QUEUE: BUILD_BUG_ON(sizeof_field(struct sk_buff, queue_mapping) != 2); *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, offsetof(struct sk_buff, queue_mapping)); break; case SKF_AD_VLAN_TAG: BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_tci) != 2); /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, offsetof(struct sk_buff, vlan_tci)); break; case SKF_AD_VLAN_TAG_PRESENT: BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_all) != 4); *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, offsetof(struct sk_buff, vlan_all)); *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); *insn++ = BPF_ALU32_IMM(BPF_MOV, dst_reg, 1); break; } return insn - insn_buf; } static bool convert_bpf_extensions(struct sock_filter *fp, struct bpf_insn **insnp) { struct bpf_insn *insn = *insnp; u32 cnt; switch (fp->k) { case SKF_AD_OFF + SKF_AD_PROTOCOL: BUILD_BUG_ON(sizeof_field(struct sk_buff, protocol) != 2); /* A = *(u16 *) (CTX + offsetof(protocol)) */ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, offsetof(struct sk_buff, protocol)); /* A = ntohs(A) [emitting a nop or swap16] */ *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); break; case SKF_AD_OFF + SKF_AD_PKTTYPE: cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_IFINDEX: case SKF_AD_OFF + SKF_AD_HATYPE: BUILD_BUG_ON(sizeof_field(struct net_device, ifindex) != 4); BUILD_BUG_ON(sizeof_field(struct net_device, type) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), BPF_REG_TMP, BPF_REG_CTX, offsetof(struct sk_buff, dev)); /* if (tmp != 0) goto pc + 1 */ *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1); *insn++ = BPF_EXIT_INSN(); if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX) *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP, offsetof(struct net_device, ifindex)); else *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP, 
offsetof(struct net_device, type)); break; case SKF_AD_OFF + SKF_AD_MARK: cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_RXHASH: BUILD_BUG_ON(sizeof_field(struct sk_buff, hash) != 4); *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, offsetof(struct sk_buff, hash)); break; case SKF_AD_OFF + SKF_AD_QUEUE: cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_VLAN_TAG: cnt = convert_skb_access(SKF_AD_VLAN_TAG, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT: cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_VLAN_TPID: BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_proto) != 2); /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, offsetof(struct sk_buff, vlan_proto)); /* A = ntohs(A) [emitting a nop or swap16] */ *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); break; case SKF_AD_OFF + SKF_AD_PAY_OFFSET: case SKF_AD_OFF + SKF_AD_NLATTR: case SKF_AD_OFF + SKF_AD_NLATTR_NEST: case SKF_AD_OFF + SKF_AD_CPU: case SKF_AD_OFF + SKF_AD_RANDOM: /* arg1 = CTX */ *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); /* arg2 = A */ *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A); /* arg3 = X */ *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X); /* Emit call(arg1=CTX, arg2=A, arg3=X) */ switch (fp->k) { case SKF_AD_OFF + SKF_AD_PAY_OFFSET: *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset); break; case SKF_AD_OFF + SKF_AD_NLATTR: *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr); break; case SKF_AD_OFF + SKF_AD_NLATTR_NEST: *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest); break; case SKF_AD_OFF + SKF_AD_CPU: *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id); break; case SKF_AD_OFF + SKF_AD_RANDOM: *insn = BPF_EMIT_CALL(bpf_user_rnd_u32); bpf_user_rnd_init_once(); break; } break; case SKF_AD_OFF + SKF_AD_ALU_XOR_X: /* A ^= X */ *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X); break; default: /* This is just a dummy call to avoid letting the compiler * evict __bpf_call_base() as an optimization. Placed here * where no-one bothers. 
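 * (Note: __bpf_call_base() simply returns 0, so the BUG_ON() below can
 * never trigger; the call only keeps a live reference to the symbol
 * that the BPF_EMIT_CALL() relocations above resolve against.)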
*/ BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0); return false; } *insnp = insn; return true; } static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) { const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS); int size = bpf_size_to_bytes(BPF_SIZE(fp->code)); bool endian = BPF_SIZE(fp->code) == BPF_H || BPF_SIZE(fp->code) == BPF_W; bool indirect = BPF_MODE(fp->code) == BPF_IND; const int ip_align = NET_IP_ALIGN; struct bpf_insn *insn = *insnp; int offset = fp->k; if (!indirect && ((unaligned_ok && offset >= 0) || (!unaligned_ok && offset >= 0 && offset + ip_align >= 0 && (offset + ip_align) % size == 0))) { bool ldx_off_ok = offset <= S16_MAX; *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); if (offset) *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian + (!ldx_off_ok * 2)); if (ldx_off_ok) { *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_D, offset); } else { *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D); *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset); *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_TMP, 0); } if (endian) *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8); *insn++ = BPF_JMP_A(8); } *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D); *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H); if (!indirect) { *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset); } else { *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X); if (fp->k) *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset); } switch (BPF_SIZE(fp->code)) { case BPF_B: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8); break; case BPF_H: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16); break; case BPF_W: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32); break; default: return false; } *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2); *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); *insn = BPF_EXIT_INSN(); *insnp = insn; return true; } /** * bpf_convert_filter - convert filter program * @prog: the user passed filter program * @len: the length of the user passed filter program * @new_prog: allocated 'struct bpf_prog' or NULL * @new_len: pointer to store length of converted program * @seen_ld_abs: bool whether we've seen ld_abs/ind * * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn' * style extended BPF (eBPF). * Conversion workflow: * * 1) First pass for calculating the new program length: * bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs) * * 2) 2nd pass to remap in two passes: 1st pass finds new * jump offsets, 2nd pass remapping: * bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs) */ static int bpf_convert_filter(struct sock_filter *prog, int len, struct bpf_prog *new_prog, int *new_len, bool *seen_ld_abs) { int new_flen = 0, pass = 0, target, i, stack_off; struct bpf_insn *new_insn, *first_insn = NULL; struct sock_filter *fp; int *addrs = NULL; u8 bpf_src; BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK); BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); if (len <= 0 || len > BPF_MAXINSNS) return -EINVAL; if (new_prog) { first_insn = new_prog->insnsi; addrs = kzalloc_objs(*addrs, len, GFP_KERNEL | __GFP_NOWARN); if (!addrs) return -ENOMEM; } do_pass: new_insn = first_insn; fp = prog; /* Classic BPF related prologue emission. */ if (new_prog) { /* Classic BPF expects A and X to be reset first. These need * to be guaranteed to be the first two instructions. 
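 * (The reset below uses a 32-bit XOR of each register with itself;
 * since 32-bit ALU ops zero-extend their result in eBPF, the full
 * 64-bit registers end up cleared.)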
*/ *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); /* All programs must keep CTX in callee saved BPF_REG_CTX. * In eBPF case it's done by the compiler, here we need to * do this ourself. Initial CTX is present in BPF_REG_ARG1. */ *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); if (*seen_ld_abs) { /* For packet access in classic BPF, cache skb->data * in callee-saved BPF R8 and skb->len - skb->data_len * (headlen) in BPF R9. Since classic BPF is read-only * on CTX, we only need to cache it once. */ *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), BPF_REG_D, BPF_REG_CTX, offsetof(struct sk_buff, data)); *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX, offsetof(struct sk_buff, len)); *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX, offsetof(struct sk_buff, data_len)); *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP); } } else { new_insn += 3; } for (i = 0; i < len; fp++, i++) { struct bpf_insn tmp_insns[32] = { }; struct bpf_insn *insn = tmp_insns; if (addrs) addrs[i] = new_insn - first_insn; switch (fp->code) { /* All arithmetic insns and skb loads map as-is. */ case BPF_ALU | BPF_ADD | BPF_X: case BPF_ALU | BPF_ADD | BPF_K: case BPF_ALU | BPF_SUB | BPF_X: case BPF_ALU | BPF_SUB | BPF_K: case BPF_ALU | BPF_AND | BPF_X: case BPF_ALU | BPF_AND | BPF_K: case BPF_ALU | BPF_OR | BPF_X: case BPF_ALU | BPF_OR | BPF_K: case BPF_ALU | BPF_LSH | BPF_X: case BPF_ALU | BPF_LSH | BPF_K: case BPF_ALU | BPF_RSH | BPF_X: case BPF_ALU | BPF_RSH | BPF_K: case BPF_ALU | BPF_XOR | BPF_X: case BPF_ALU | BPF_XOR | BPF_K: case BPF_ALU | BPF_MUL | BPF_X: case BPF_ALU | BPF_MUL | BPF_K: case BPF_ALU | BPF_DIV | BPF_X: case BPF_ALU | BPF_DIV | BPF_K: case BPF_ALU | BPF_MOD | BPF_X: case BPF_ALU | BPF_MOD | BPF_K: case BPF_ALU | BPF_NEG: case BPF_LD | BPF_ABS | BPF_W: case BPF_LD | BPF_ABS | BPF_H: case BPF_LD | BPF_ABS | BPF_B: case BPF_LD | BPF_IND | BPF_W: case BPF_LD | BPF_IND | BPF_H: case BPF_LD | BPF_IND | BPF_B: /* Check for overloaded BPF extension and * directly convert it if found, otherwise * just move on with mapping. */ if (BPF_CLASS(fp->code) == BPF_LD && BPF_MODE(fp->code) == BPF_ABS && convert_bpf_extensions(fp, &insn)) break; if (BPF_CLASS(fp->code) == BPF_LD && convert_bpf_ld_abs(fp, &insn)) { *seen_ld_abs = true; break; } if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X); /* Error with exception code on div/mod by 0. * For cBPF programs, this was always return 0. */ *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2); *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); *insn++ = BPF_EXIT_INSN(); } *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); break; /* Jump transformation cannot use BPF block macros * everywhere as offset calculation and target updates * require a bit more work than the rest, i.e. jump * opcodes map as-is, but offsets need adjustment. */ #define BPF_EMIT_JMP \ do { \ const s32 off_min = S16_MIN, off_max = S16_MAX; \ s32 off; \ \ if (target >= len || target < 0) \ goto err; \ off = addrs ? addrs[target] - addrs[i] - 1 : 0; \ /* Adjust pc relative offset for 2nd or 3rd insn. */ \ off -= insn - tmp_insns; \ /* Reject anything not fitting into insn->off. 
*/ \ if (off < off_min || off > off_max) \ goto err; \ insn->off = off; \ } while (0) case BPF_JMP | BPF_JA: target = i + fp->k + 1; insn->code = fp->code; BPF_EMIT_JMP; break; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JSET | BPF_K: case BPF_JMP | BPF_JSET | BPF_X: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JGT | BPF_X: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JGE | BPF_X: if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) { /* BPF immediates are signed, zero extend * immediate into tmp register and use it * in compare insn. */ *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k); insn->dst_reg = BPF_REG_A; insn->src_reg = BPF_REG_TMP; bpf_src = BPF_X; } else { insn->dst_reg = BPF_REG_A; insn->imm = fp->k; bpf_src = BPF_SRC(fp->code); insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0; } /* Common case where 'jump_false' is next insn. */ if (fp->jf == 0) { insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; target = i + fp->jt + 1; BPF_EMIT_JMP; break; } /* Convert some jumps when 'jump_true' is next insn. */ if (fp->jt == 0) { switch (BPF_OP(fp->code)) { case BPF_JEQ: insn->code = BPF_JMP | BPF_JNE | bpf_src; break; case BPF_JGT: insn->code = BPF_JMP | BPF_JLE | bpf_src; break; case BPF_JGE: insn->code = BPF_JMP | BPF_JLT | bpf_src; break; default: goto jmp_rest; } target = i + fp->jf + 1; BPF_EMIT_JMP; break; } jmp_rest: /* Other jumps are mapped into two insns: Jxx and JA. */ target = i + fp->jt + 1; insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; BPF_EMIT_JMP; insn++; insn->code = BPF_JMP | BPF_JA; target = i + fp->jf + 1; BPF_EMIT_JMP; break; /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */ case BPF_LDX | BPF_MSH | BPF_B: { struct sock_filter tmp = { .code = BPF_LD | BPF_ABS | BPF_B, .k = fp->k, }; *seen_ld_abs = true; /* X = A */ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = BPF_R0 = *(u8 *) (skb->data + K) */ convert_bpf_ld_abs(&tmp, &insn); insn++; /* A &= 0xf */ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); /* A <<= 2 */ *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); /* tmp = X */ *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X); /* X = A */ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = tmp */ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); break; } /* RET_K is remapped into 2 insns. RET_A case doesn't need an * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. */ case BPF_RET | BPF_A: case BPF_RET | BPF_K: if (BPF_RVAL(fp->code) == BPF_K) *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0, 0, fp->k); *insn = BPF_EXIT_INSN(); break; /* Store to stack. */ case BPF_ST: case BPF_STX: stack_off = fp->k * 4 + 4; *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) == BPF_ST ? BPF_REG_A : BPF_REG_X, -stack_off); /* check_load_and_stores() verifies that classic BPF can * load from stack only after write, so tracking * stack_depth for ST|STX insns is enough */ if (new_prog && new_prog->aux->stack_depth < stack_off) new_prog->aux->stack_depth = stack_off; break; /* Load from stack. */ case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: stack_off = fp->k * 4 + 4; *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? BPF_REG_A : BPF_REG_X, BPF_REG_FP, -stack_off); break; /* A = K or X = K */ case BPF_LD | BPF_IMM: case BPF_LDX | BPF_IMM: *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ? 
BPF_REG_A : BPF_REG_X, fp->k); break; /* X = A */ case BPF_MISC | BPF_TAX: *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); break; /* A = X */ case BPF_MISC | BPF_TXA: *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X); break; /* A = skb->len or X = skb->len */ case BPF_LD | BPF_W | BPF_LEN: case BPF_LDX | BPF_W | BPF_LEN: *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? BPF_REG_A : BPF_REG_X, BPF_REG_CTX, offsetof(struct sk_buff, len)); break; /* Access seccomp_data fields. */ case BPF_LDX | BPF_ABS | BPF_W: /* A = *(u32 *) (ctx + K) */ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k); break; /* Unknown instruction. */ default: goto err; } insn++; if (new_prog) memcpy(new_insn, tmp_insns, sizeof(*insn) * (insn - tmp_insns)); new_insn += insn - tmp_insns; } if (!new_prog) { /* Only calculating new length. */ *new_len = new_insn - first_insn; if (*seen_ld_abs) *new_len += 4; /* Prologue bits. */ return 0; } pass++; if (new_flen != new_insn - first_insn) { new_flen = new_insn - first_insn; if (pass > 2) goto err; goto do_pass; } kfree(addrs); BUG_ON(*new_len != new_flen); return 0; err: kfree(addrs); return -EINVAL; } /* Security: * * As we dont want to clear mem[] array for each packet going through * __bpf_prog_run(), we check that filter loaded by user never try to read * a cell if not previously written, and we check all branches to be sure * a malicious user doesn't try to abuse us. */ static int check_load_and_stores(const struct sock_filter *filter, int flen) { u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */ int pc, ret = 0; BUILD_BUG_ON(BPF_MEMWORDS > 16); masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL); if (!masks) return -ENOMEM; memset(masks, 0xff, flen * sizeof(*masks)); for (pc = 0; pc < flen; pc++) { memvalid &= masks[pc]; switch (filter[pc].code) { case BPF_ST: case BPF_STX: memvalid |= (1 << filter[pc].k); break; case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: if (!(memvalid & (1 << filter[pc].k))) { ret = -EINVAL; goto error; } break; case BPF_JMP | BPF_JA: /* A jump must set masks on target */ masks[pc + 1 + filter[pc].k] &= memvalid; memvalid = ~0; break; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JGE | BPF_X: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JGT | BPF_X: case BPF_JMP | BPF_JSET | BPF_K: case BPF_JMP | BPF_JSET | BPF_X: /* A jump must set masks on targets */ masks[pc + 1 + filter[pc].jt] &= memvalid; masks[pc + 1 + filter[pc].jf] &= memvalid; memvalid = ~0; break; } } error: kfree(masks); return ret; } static bool chk_code_allowed(u16 code_to_probe) { static const bool codes[] = { /* 32 bit ALU operations */ [BPF_ALU | BPF_ADD | BPF_K] = true, [BPF_ALU | BPF_ADD | BPF_X] = true, [BPF_ALU | BPF_SUB | BPF_K] = true, [BPF_ALU | BPF_SUB | BPF_X] = true, [BPF_ALU | BPF_MUL | BPF_K] = true, [BPF_ALU | BPF_MUL | BPF_X] = true, [BPF_ALU | BPF_DIV | BPF_K] = true, [BPF_ALU | BPF_DIV | BPF_X] = true, [BPF_ALU | BPF_MOD | BPF_K] = true, [BPF_ALU | BPF_MOD | BPF_X] = true, [BPF_ALU | BPF_AND | BPF_K] = true, [BPF_ALU | BPF_AND | BPF_X] = true, [BPF_ALU | BPF_OR | BPF_K] = true, [BPF_ALU | BPF_OR | BPF_X] = true, [BPF_ALU | BPF_XOR | BPF_K] = true, [BPF_ALU | BPF_XOR | BPF_X] = true, [BPF_ALU | BPF_LSH | BPF_K] = true, [BPF_ALU | BPF_LSH | BPF_X] = true, [BPF_ALU | BPF_RSH | BPF_K] = true, [BPF_ALU | BPF_RSH | BPF_X] = true, [BPF_ALU | BPF_NEG] = true, /* Load instructions */ [BPF_LD | BPF_W | BPF_ABS] = true, [BPF_LD | BPF_H | BPF_ABS] = true, [BPF_LD | BPF_B | BPF_ABS] 
= true, [BPF_LD | BPF_W | BPF_LEN] = true, [BPF_LD | BPF_W | BPF_IND] = true, [BPF_LD | BPF_H | BPF_IND] = true, [BPF_LD | BPF_B | BPF_IND] = true, [BPF_LD | BPF_IMM] = true, [BPF_LD | BPF_MEM] = true, [BPF_LDX | BPF_W | BPF_LEN] = true, [BPF_LDX | BPF_B | BPF_MSH] = true, [BPF_LDX | BPF_IMM] = true, [BPF_LDX | BPF_MEM] = true, /* Store instructions */ [BPF_ST] = true, [BPF_STX] = true, /* Misc instructions */ [BPF_MISC | BPF_TAX] = true, [BPF_MISC | BPF_TXA] = true, /* Return instructions */ [BPF_RET | BPF_K] = true, [BPF_RET | BPF_A] = true, /* Jump instructions */ [BPF_JMP | BPF_JA] = true, [BPF_JMP | BPF_JEQ | BPF_K] = true, [BPF_JMP | BPF_JEQ | BPF_X] = true, [BPF_JMP | BPF_JGE | BPF_K] = true, [BPF_JMP | BPF_JGE | BPF_X] = true, [BPF_JMP | BPF_JGT | BPF_K] = true, [BPF_JMP | BPF_JGT | BPF_X] = true, [BPF_JMP | BPF_JSET | BPF_K] = true, [BPF_JMP | BPF_JSET | BPF_X] = true, }; if (code_to_probe >= ARRAY_SIZE(codes)) return false; return codes[code_to_probe]; } static bool bpf_check_basics_ok(const struct sock_filter *filter, unsigned int flen) { if (filter == NULL) return false; if (flen == 0 || flen > BPF_MAXINSNS) return false; return true; } /** * bpf_check_classic - verify socket filter code * @filter: filter to verify * @flen: length of filter * * Check the user's filter code. If we let some ugly * filter code slip through kaboom! The filter must contain * no references or jumps that are out of range, no illegal * instructions, and must end with a RET instruction. * * All jumps are forward as they are not signed. * * Returns 0 if the rule set is legal or -EINVAL if not. */ static int bpf_check_classic(const struct sock_filter *filter, unsigned int flen) { bool anc_found; int pc; /* Check the filter code now */ for (pc = 0; pc < flen; pc++) { const struct sock_filter *ftest = &filter[pc]; /* May we actually operate on this code? */ if (!chk_code_allowed(ftest->code)) return -EINVAL; /* Some instructions need special checks */ switch (ftest->code) { case BPF_ALU | BPF_DIV | BPF_K: case BPF_ALU | BPF_MOD | BPF_K: /* Check for division by zero */ if (ftest->k == 0) return -EINVAL; break; case BPF_ALU | BPF_LSH | BPF_K: case BPF_ALU | BPF_RSH | BPF_K: if (ftest->k >= 32) return -EINVAL; break; case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: case BPF_ST: case BPF_STX: /* Check for invalid memory addresses */ if (ftest->k >= BPF_MEMWORDS) return -EINVAL; break; case BPF_JMP | BPF_JA: /* Note, the large ftest->k might cause loops. * Compare this with conditional jumps below, * where offsets are limited. 
--ANK (981016) */ if (ftest->k >= (unsigned int)(flen - pc - 1)) return -EINVAL; break; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JGE | BPF_X: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JGT | BPF_X: case BPF_JMP | BPF_JSET | BPF_K: case BPF_JMP | BPF_JSET | BPF_X: /* Both conditionals must be safe */ if (pc + ftest->jt + 1 >= flen || pc + ftest->jf + 1 >= flen) return -EINVAL; break; case BPF_LD | BPF_W | BPF_ABS: case BPF_LD | BPF_H | BPF_ABS: case BPF_LD | BPF_B | BPF_ABS: anc_found = false; if (bpf_anc_helper(ftest) & BPF_ANC) anc_found = true; /* Ancillary operation unknown or unsupported */ if (anc_found == false && ftest->k >= SKF_AD_OFF) return -EINVAL; } } /* Last instruction must be a RET code */ switch (filter[flen - 1].code) { case BPF_RET | BPF_K: case BPF_RET | BPF_A: return check_load_and_stores(filter, flen); } return -EINVAL; } static int bpf_prog_store_orig_filter(struct bpf_prog *fp, const struct sock_fprog *fprog) { unsigned int fsize = bpf_classic_proglen(fprog); struct sock_fprog_kern *fkprog; fp->orig_prog = kmalloc_obj(*fkprog); if (!fp->orig_prog) return -ENOMEM; fkprog = fp->orig_prog; fkprog->len = fprog->len; fkprog->filter = kmemdup(fp->insns, fsize, GFP_KERNEL | __GFP_NOWARN); if (!fkprog->filter) { kfree(fp->orig_prog); return -ENOMEM; } return 0; } static void bpf_release_orig_filter(struct bpf_prog *fp) { struct sock_fprog_kern *fprog = fp->orig_prog; if (fprog) { kfree(fprog->filter); kfree(fprog); } } static void __bpf_prog_release(struct bpf_prog *prog) { if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) { bpf_prog_put(prog); } else { bpf_release_orig_filter(prog); bpf_prog_free(prog); } } static void __sk_filter_release(struct sk_filter *fp) { __bpf_prog_release(fp->prog); kfree(fp); } /** * sk_filter_release_rcu - Release a socket filter by rcu_head * @rcu: rcu_head that contains the sk_filter to free */ static void sk_filter_release_rcu(struct rcu_head *rcu) { struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); __sk_filter_release(fp); } /** * sk_filter_release - release a socket filter * @fp: filter to remove * * Remove a filter from a socket and release its resources. 
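 * The actual freeing is deferred to an RCU callback once the last
 * reference is dropped, so lockless readers of sk->sk_filter under
 * rcu_read_lock() remain safe.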
*/ static void sk_filter_release(struct sk_filter *fp) { if (refcount_dec_and_test(&fp->refcnt)) call_rcu(&fp->rcu, sk_filter_release_rcu); } void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) { u32 filter_size = bpf_prog_size(fp->prog->len); atomic_sub(filter_size, &sk->sk_omem_alloc); sk_filter_release(fp); } /* try to charge the socket memory if there is space available * return true on success */ static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) { int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); u32 filter_size = bpf_prog_size(fp->prog->len); /* same check as in sock_kmalloc() */ if (filter_size <= optmem_max && atomic_read(&sk->sk_omem_alloc) + filter_size < optmem_max) { atomic_add(filter_size, &sk->sk_omem_alloc); return true; } return false; } bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) { if (!refcount_inc_not_zero(&fp->refcnt)) return false; if (!__sk_filter_charge(sk, fp)) { sk_filter_release(fp); return false; } return true; } static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) { struct sock_filter *old_prog; struct bpf_prog *old_fp; int err, new_len, old_len = fp->len; bool seen_ld_abs = false; /* We are free to overwrite insns et al right here as it won't be used at * this point in time anymore internally after the migration to the eBPF * instruction representation. */ BUILD_BUG_ON(sizeof(struct sock_filter) != sizeof(struct bpf_insn)); /* Conversion cannot happen on overlapping memory areas, * so we need to keep the user BPF around until the 2nd * pass. At this time, the user BPF is stored in fp->insns. */ old_prog = kmemdup_array(fp->insns, old_len, sizeof(struct sock_filter), GFP_KERNEL | __GFP_NOWARN); if (!old_prog) { err = -ENOMEM; goto out_err; } /* 1st pass: calculate the new program length. */ err = bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs); if (err) goto out_err_free; /* Expand fp for appending the new filter representation. */ old_fp = fp; fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0); if (!fp) { /* The old_fp is still around in case we couldn't * allocate new memory, so uncharge on that one. */ fp = old_fp; err = -ENOMEM; goto out_err_free; } fp->len = new_len; /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ err = bpf_convert_filter(old_prog, old_len, fp, &new_len, &seen_ld_abs); if (err) /* 2nd bpf_convert_filter() can fail only if it fails * to allocate memory, remapping must succeed. Note, * that at this time old_fp has already been released * by krealloc(). */ goto out_err_free; fp = bpf_prog_select_runtime(fp, &err); if (err) goto out_err_free; kfree(old_prog); return fp; out_err_free: kfree(old_prog); out_err: __bpf_prog_release(fp); return ERR_PTR(err); } static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, bpf_aux_classic_check_t trans) { int err; fp->bpf_func = NULL; fp->jited = 0; err = bpf_check_classic(fp->insns, fp->len); if (err) { __bpf_prog_release(fp); return ERR_PTR(err); } /* There might be additional checks and transformations * needed on classic filters, f.e. in case of seccomp. */ if (trans) { err = trans(fp->insns, fp->len); if (err) { __bpf_prog_release(fp); return ERR_PTR(err); } } /* Probe if we can JIT compile the filter and if so, do * the compilation of the filter. */ bpf_jit_compile(fp); /* JIT compiler couldn't process this filter, so do the eBPF translation * for the optimized interpreter. 
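 * bpf_migrate_filter() rewrites the classic insns via
 * bpf_convert_filter() and then selects the runtime with
 * bpf_prog_select_runtime().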
*/ if (!fp->jited) fp = bpf_migrate_filter(fp); return fp; } /** * bpf_prog_create - create an unattached filter * @pfp: the unattached filter that is created * @fprog: the filter program * * Create a filter independent of any socket. We first run some * sanity checks on it to make sure it does not explode on us later. * If an error occurs or there is insufficient memory for the filter * a negative errno code is returned. On success the return is zero. */ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) { unsigned int fsize = bpf_classic_proglen(fprog); struct bpf_prog *fp; /* Make sure new filter is there and in the right amounts. */ if (!bpf_check_basics_ok(fprog->filter, fprog->len)) return -EINVAL; fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!fp) return -ENOMEM; memcpy(fp->insns, fprog->filter, fsize); fp->len = fprog->len; /* Since unattached filters are not copied back to user * space through sk_get_filter(), we do not need to hold * a copy here, and can spare us the work. */ fp->orig_prog = NULL; /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ fp = bpf_prepare_filter(fp, NULL); if (IS_ERR(fp)) return PTR_ERR(fp); *pfp = fp; return 0; } EXPORT_SYMBOL_GPL(bpf_prog_create); /** * bpf_prog_create_from_user - create an unattached filter from user buffer * @pfp: the unattached filter that is created * @fprog: the filter program * @trans: post-classic verifier transformation handler * @save_orig: save classic BPF program * * This function effectively does the same as bpf_prog_create(), only * that it builds up its insns buffer from user space provided buffer. * It also allows for passing a bpf_aux_classic_check_t handler. */ int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, bpf_aux_classic_check_t trans, bool save_orig) { unsigned int fsize = bpf_classic_proglen(fprog); struct bpf_prog *fp; int err; /* Make sure new filter is there and in the right amounts. */ if (!bpf_check_basics_ok(fprog->filter, fprog->len)) return -EINVAL; fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!fp) return -ENOMEM; if (copy_from_user(fp->insns, fprog->filter, fsize)) { __bpf_prog_free(fp); return -EFAULT; } fp->len = fprog->len; fp->orig_prog = NULL; if (save_orig) { err = bpf_prog_store_orig_filter(fp, fprog); if (err) { __bpf_prog_free(fp); return -ENOMEM; } } /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ fp = bpf_prepare_filter(fp, trans); if (IS_ERR(fp)) return PTR_ERR(fp); *pfp = fp; return 0; } EXPORT_SYMBOL_GPL(bpf_prog_create_from_user); void bpf_prog_destroy(struct bpf_prog *fp) { __bpf_prog_release(fp); } EXPORT_SYMBOL_GPL(bpf_prog_destroy); static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) { struct sk_filter *fp, *old_fp; fp = kmalloc_obj(*fp); if (!fp) return -ENOMEM; fp->prog = prog; if (!__sk_filter_charge(sk, fp)) { kfree(fp); return -ENOMEM; } refcount_set(&fp->refcnt, 1); old_fp = rcu_dereference_protected(sk->sk_filter, lockdep_sock_is_held(sk)); rcu_assign_pointer(sk->sk_filter, fp); if (old_fp) sk_filter_uncharge(sk, old_fp); return 0; } static struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) { unsigned int fsize = bpf_classic_proglen(fprog); struct bpf_prog *prog; int err; if (sock_flag(sk, SOCK_FILTER_LOCKED)) return ERR_PTR(-EPERM); /* Make sure new filter is there and in the right amounts. 
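 * bpf_check_basics_ok() only validates the filter pointer and that the
 * instruction count lies in 1..BPF_MAXINSNS; per-instruction checks are
 * done later by bpf_check_classic() from bpf_prepare_filter().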
*/ if (!bpf_check_basics_ok(fprog->filter, fprog->len)) return ERR_PTR(-EINVAL); prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!prog) return ERR_PTR(-ENOMEM); if (copy_from_user(prog->insns, fprog->filter, fsize)) { __bpf_prog_free(prog); return ERR_PTR(-EFAULT); } prog->len = fprog->len; err = bpf_prog_store_orig_filter(prog, fprog); if (err) { __bpf_prog_free(prog); return ERR_PTR(-ENOMEM); } /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ return bpf_prepare_filter(prog, NULL); } /** * sk_attach_filter - attach a socket filter * @fprog: the filter program * @sk: the socket to use * * Attach the user's filter code. We first run some sanity checks on * it to make sure it does not explode on us later. If an error * occurs or there is insufficient memory for the filter a negative * errno code is returned. On success the return is zero. */ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) { struct bpf_prog *prog = __get_filter(fprog, sk); int err; if (IS_ERR(prog)) return PTR_ERR(prog); err = __sk_attach_prog(prog, sk); if (err < 0) { __bpf_prog_release(prog); return err; } return 0; } EXPORT_SYMBOL_GPL(sk_attach_filter); int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) { struct bpf_prog *prog = __get_filter(fprog, sk); int err, optmem_max; if (IS_ERR(prog)) return PTR_ERR(prog); optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); if (bpf_prog_size(prog->len) > optmem_max) err = -ENOMEM; else err = reuseport_attach_prog(sk, prog); if (err) __bpf_prog_release(prog); return err; } static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) { if (sock_flag(sk, SOCK_FILTER_LOCKED)) return ERR_PTR(-EPERM); return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); } int sk_attach_bpf(u32 ufd, struct sock *sk) { struct bpf_prog *prog = __get_bpf(ufd, sk); int err; if (IS_ERR(prog)) return PTR_ERR(prog); err = __sk_attach_prog(prog, sk); if (err < 0) { bpf_prog_put(prog); return err; } return 0; } int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) { struct bpf_prog *prog; int err, optmem_max; if (sock_flag(sk, SOCK_FILTER_LOCKED)) return -EPERM; prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); if (PTR_ERR(prog) == -EINVAL) prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT); if (IS_ERR(prog)) return PTR_ERR(prog); if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) { /* Like other non BPF_PROG_TYPE_SOCKET_FILTER * bpf prog (e.g. sockmap). It depends on the * limitation imposed by bpf_prog_load(). * Hence, sysctl_optmem_max is not checked. 
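 * Only TCP or UDP sockets over IPv4/IPv6 may attach a
 * BPF_PROG_TYPE_SK_REUSEPORT program; anything else is rejected
 * with -ENOTSUPP below.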
*/ if ((sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) || (sk->sk_protocol != IPPROTO_UDP && sk->sk_protocol != IPPROTO_TCP) || (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)) { err = -ENOTSUPP; goto err_prog_put; } } else { /* BPF_PROG_TYPE_SOCKET_FILTER */ optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); if (bpf_prog_size(prog->len) > optmem_max) { err = -ENOMEM; goto err_prog_put; } } err = reuseport_attach_prog(sk, prog); err_prog_put: if (err) bpf_prog_put(prog); return err; } void sk_reuseport_prog_free(struct bpf_prog *prog) { if (!prog) return; if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) bpf_prog_put(prog); else bpf_prog_destroy(prog); } static inline int __bpf_try_make_writable(struct sk_buff *skb, unsigned int write_len) { #ifdef CONFIG_DEBUG_NET /* Avoid a splat in pskb_may_pull_reason() */ if (write_len > INT_MAX) return -EINVAL; #endif return skb_ensure_writable(skb, write_len); } static inline int bpf_try_make_writable(struct sk_buff *skb, unsigned int write_len) { int err = __bpf_try_make_writable(skb, write_len); bpf_compute_data_pointers(skb); return err; } static int bpf_try_make_head_writable(struct sk_buff *skb) { return bpf_try_make_writable(skb, skb_headlen(skb)); } static inline void bpf_push_mac_rcsum(struct sk_buff *skb) { if (skb_at_tc_ingress(skb)) skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len); } static inline void bpf_pull_mac_rcsum(struct sk_buff *skb) { if (skb_at_tc_ingress(skb)) skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len); } BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset, const void *, from, u32, len, u64, flags) { void *ptr; if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) return -EINVAL; if (unlikely(offset > INT_MAX)) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + len))) return -EFAULT; ptr = skb->data + offset; if (flags & BPF_F_RECOMPUTE_CSUM) __skb_postpull_rcsum(skb, ptr, len, offset); memcpy(ptr, from, len); if (flags & BPF_F_RECOMPUTE_CSUM) __skb_postpush_rcsum(skb, ptr, len, offset); if (flags & BPF_F_INVALIDATE_HASH) skb_clear_hash(skb); return 0; } static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .func = bpf_skb_store_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) { return ____bpf_skb_store_bytes(skb, offset, from, len, flags); } BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset, void *, to, u32, len) { void *ptr; if (unlikely(offset > INT_MAX)) goto err_clear; ptr = skb_header_pointer(skb, offset, len, to); if (unlikely(!ptr)) goto err_clear; if (ptr != to) memcpy(to, ptr, len); return 0; err_clear: memset(to, 0, len); return -EFAULT; } static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .func = bpf_skb_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) { return ____bpf_skb_load_bytes(skb, offset, to, len); } BPF_CALL_4(bpf_flow_dissector_load_bytes, const struct bpf_flow_dissector *, ctx, u32, offset, void *, to, u32, len) { void *ptr; if (unlikely(offset > 0xffff)) goto err_clear; if (unlikely(!ctx->skb)) goto 
err_clear; ptr = skb_header_pointer(ctx->skb, offset, len, to); if (unlikely(!ptr)) goto err_clear; if (ptr != to) memcpy(to, ptr, len); return 0; err_clear: memset(to, 0, len); return -EFAULT; } static const struct bpf_func_proto bpf_flow_dissector_load_bytes_proto = { .func = bpf_flow_dissector_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb, u32, offset, void *, to, u32, len, u32, start_header) { u8 *end = skb_tail_pointer(skb); u8 *start, *ptr; if (unlikely(offset > 0xffff)) goto err_clear; switch (start_header) { case BPF_HDR_START_MAC: if (unlikely(!skb_mac_header_was_set(skb))) goto err_clear; start = skb_mac_header(skb); break; case BPF_HDR_START_NET: start = skb_network_header(skb); break; default: goto err_clear; } ptr = start + offset; if (likely(ptr + len <= end)) { memcpy(to, ptr, len); return 0; } err_clear: memset(to, 0, len); return -EFAULT; } static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = { .func = bpf_skb_load_bytes_relative, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) { /* Idea is the following: should the needed direct read/write * test fail during runtime, we can pull in more data and redo * again, since implicitly, we invalidate previous checks here. * * Or, since we know how much we need to make read/writeable, * this can be done once at the program beginning for direct * access case. By this we overcome limitations of only current * headroom being accessible. */ return bpf_try_make_writable(skb, len ? : skb_headlen(skb)); } static const struct bpf_func_proto bpf_skb_pull_data_proto = { .func = bpf_skb_pull_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk) { return sk_fullsock(sk) ? (unsigned long)sk : (unsigned long)NULL; } static const struct bpf_func_proto bpf_sk_fullsock_proto = { .func = bpf_sk_fullsock, .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_SOCK_COMMON, }; static inline int sk_skb_try_make_writable(struct sk_buff *skb, unsigned int write_len) { return __bpf_try_make_writable(skb, write_len); } BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len) { /* Idea is the following: should the needed direct read/write * test fail during runtime, we can pull in more data and redo * again, since implicitly, we invalidate previous checks here. * * Or, since we know how much we need to make read/writeable, * this can be done once at the program beginning for direct * access case. By this we overcome limitations of only current * headroom being accessible. */ return sk_skb_try_make_writable(skb, len ? 
: skb_headlen(skb)); } static const struct bpf_func_proto sk_skb_pull_data_proto = { .func = sk_skb_pull_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, u64, from, u64, to, u64, flags) { __sum16 *ptr; if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK))) return -EINVAL; if (unlikely(offset > 0xffff || offset & 1)) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) return -EFAULT; ptr = (__sum16 *)(skb->data + offset); switch (flags & BPF_F_HDR_FIELD_MASK) { case 0: if (unlikely(from != 0)) return -EINVAL; csum_replace_by_diff(ptr, to); break; case 2: csum_replace2(ptr, from, to); break; case 4: csum_replace4(ptr, from, to); break; default: return -EINVAL; } return 0; } static const struct bpf_func_proto bpf_l3_csum_replace_proto = { .func = bpf_l3_csum_replace, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, u64, from, u64, to, u64, flags) { bool is_pseudo = flags & BPF_F_PSEUDO_HDR; bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; bool do_mforce = flags & BPF_F_MARK_ENFORCE; bool is_ipv6 = flags & BPF_F_IPV6; __sum16 *ptr; if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE | BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK | BPF_F_IPV6))) return -EINVAL; if (unlikely(offset > 0xffff || offset & 1)) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) return -EFAULT; ptr = (__sum16 *)(skb->data + offset); if (is_mmzero && !do_mforce && !*ptr) return 0; switch (flags & BPF_F_HDR_FIELD_MASK) { case 0: if (unlikely(from != 0)) return -EINVAL; inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo, is_ipv6); break; case 2: inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); break; case 4: inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo); break; default: return -EINVAL; } if (is_mmzero && !*ptr) *ptr = CSUM_MANGLED_0; return 0; } static const struct bpf_func_proto bpf_l4_csum_replace_proto = { .func = bpf_l4_csum_replace, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, __be32 *, to, u32, to_size, __wsum, seed) { /* This is quite flexible, some examples: * * from_size == 0, to_size > 0, seed := csum --> pushing data * from_size > 0, to_size == 0, seed := csum --> pulling data * from_size > 0, to_size > 0, seed := 0 --> diffing data * * Even for diffing, from_size and to_size don't need to be equal. 
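 * A common pattern is to diff the old and new bytes of a rewritten
 * header field here and feed the resulting delta into
 * bpf_l3_csum_replace() or bpf_l4_csum_replace().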
*/ __wsum ret = seed; if (from_size && to_size) ret = csum_sub(csum_partial(to, to_size, ret), csum_partial(from, from_size, 0)); else if (to_size) ret = csum_partial(to, to_size, ret); else if (from_size) ret = ~csum_partial(from, from_size, ~ret); return csum_from32to16((__force unsigned int)ret); } static const struct bpf_func_proto bpf_csum_diff_proto = { .func = bpf_csum_diff, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum) { /* The interface is to be used in combination with bpf_csum_diff() * for direct packet writes. csum rotation for alignment as well * as emulating csum_sub() can be done from the eBPF program. */ if (skb->ip_summed == CHECKSUM_COMPLETE) return (skb->csum = csum_add(skb->csum, csum)); return -ENOTSUPP; } static const struct bpf_func_proto bpf_csum_update_proto = { .func = bpf_csum_update, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level) { /* The interface is to be used in combination with bpf_skb_adjust_room() * for encap/decap of packet headers when BPF_F_ADJ_ROOM_NO_CSUM_RESET * is passed as flags, for example. */ switch (level) { case BPF_CSUM_LEVEL_INC: __skb_incr_checksum_unnecessary(skb); break; case BPF_CSUM_LEVEL_DEC: __skb_decr_checksum_unnecessary(skb); break; case BPF_CSUM_LEVEL_RESET: __skb_reset_checksum_unnecessary(skb); break; case BPF_CSUM_LEVEL_QUERY: return skb->ip_summed == CHECKSUM_UNNECESSARY ? skb->csum_level : -EACCES; default: return -EINVAL; } return 0; } static const struct bpf_func_proto bpf_csum_level_proto = { .func = bpf_csum_level, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) { return dev_forward_skb_nomtu(dev, skb); } static inline int __bpf_rx_skb_no_mac(struct net_device *dev, struct sk_buff *skb) { int ret = ____dev_forward_skb(dev, skb, false); if (likely(!ret)) { skb->dev = dev; ret = netif_rx(skb); } return ret; } static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) { int ret; if (dev_xmit_recursion()) { net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); kfree_skb(skb); return -ENETDOWN; } skb->dev = dev; skb_set_redirected_noclear(skb, skb_at_tc_ingress(skb)); skb_clear_tstamp(skb); dev_xmit_recursion_inc(); ret = dev_queue_xmit(skb); dev_xmit_recursion_dec(); return ret; } static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, u32 flags) { unsigned int mlen = skb_network_offset(skb); if (unlikely(skb->len <= mlen)) { kfree_skb(skb); return -ERANGE; } if (mlen) { __skb_pull(skb, mlen); /* At ingress, the mac header has already been pulled once. * At egress, skb_pospull_rcsum has to be done in case that * the skb is originated from ingress (i.e. a forwarded skb) * to ensure that rcsum starts at net header. */ if (!skb_at_tc_ingress(skb)) skb_postpull_rcsum(skb, skb_mac_header(skb), mlen); } skb_pop_mac_header(skb); skb_reset_mac_len(skb); return flags & BPF_F_INGRESS ? 
__bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb); } static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, u32 flags) { /* Verify that a link layer header is carried */ if (unlikely(skb->mac_header >= skb->network_header || skb->len == 0)) { kfree_skb(skb); return -ERANGE; } bpf_push_mac_rcsum(skb); return flags & BPF_F_INGRESS ? __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); } static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, u32 flags) { if (dev_is_mac_header_xmit(dev)) return __bpf_redirect_common(skb, dev, flags); else return __bpf_redirect_no_mac(skb, dev, flags); } #if IS_ENABLED(CONFIG_IPV6) static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { u32 hh_len = LL_RESERVED_SPACE(dev); const struct in6_addr *nexthop; struct dst_entry *dst = NULL; struct neighbour *neigh; if (dev_xmit_recursion()) { net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); goto out_drop; } skb->dev = dev; skb_clear_tstamp(skb); if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); if (!skb) return -ENOMEM; } if (unlikely(!ipv6_mod_enabled())) goto out_drop; rcu_read_lock(); if (!nh) { dst = skb_dst(skb); nexthop = rt6_nexthop(dst_rt6_info(dst), &ipv6_hdr(skb)->daddr); } else { nexthop = &nh->ipv6_nh; } neigh = ip_neigh_gw6(dev, nexthop); if (likely(!IS_ERR(neigh))) { int ret; sock_confirm_neigh(skb, neigh); local_bh_disable(); dev_xmit_recursion_inc(); ret = neigh_output(neigh, skb, false); dev_xmit_recursion_dec(); local_bh_enable(); rcu_read_unlock(); return ret; } rcu_read_unlock(); if (dst) IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); out_drop: kfree_skb(skb); return -ENETDOWN; } static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct net *net = dev_net(dev); int err, ret = NET_XMIT_DROP; if (!nh) { struct dst_entry *dst; struct flowi6 fl6 = { .flowi6_flags = FLOWI_FLAG_ANYSRC, .flowi6_mark = skb->mark, .flowlabel = ip6_flowinfo(ip6h), .flowi6_oif = dev->ifindex, .flowi6_proto = ip6h->nexthdr, .daddr = ip6h->daddr, .saddr = ip6h->saddr, }; dst = ip6_dst_lookup_flow(net, NULL, &fl6, NULL); if (IS_ERR(dst)) goto out_drop; skb_dst_drop(skb); skb_dst_set(skb, dst); } else if (nh->nh_family != AF_INET6) { goto out_drop; } err = bpf_out_neigh_v6(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) dev_core_stats_tx_dropped_inc(dev); else ret = NET_XMIT_SUCCESS; goto out_xmit; out_drop: dev_core_stats_tx_dropped_inc(dev); kfree_skb(skb); out_xmit: return ret; } #else static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { kfree_skb(skb); return NET_XMIT_DROP; } #endif /* CONFIG_IPV6 */ #if IS_ENABLED(CONFIG_INET) static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { u32 hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; bool is_v6gw = false; if (dev_xmit_recursion()) { net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); goto out_drop; } skb->dev = dev; skb_clear_tstamp(skb); if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); if (!skb) return -ENOMEM; } rcu_read_lock(); if (!nh) { struct rtable *rt = skb_rtable(skb); neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); } else if (nh->nh_family == AF_INET6) 
{ if (unlikely(!ipv6_mod_enabled())) { rcu_read_unlock(); goto out_drop; } neigh = ip_neigh_gw6(dev, &nh->ipv6_nh); is_v6gw = true; } else if (nh->nh_family == AF_INET) { neigh = ip_neigh_gw4(dev, nh->ipv4_nh); } else { rcu_read_unlock(); goto out_drop; } if (likely(!IS_ERR(neigh))) { int ret; sock_confirm_neigh(skb, neigh); local_bh_disable(); dev_xmit_recursion_inc(); ret = neigh_output(neigh, skb, is_v6gw); dev_xmit_recursion_dec(); local_bh_enable(); rcu_read_unlock(); return ret; } rcu_read_unlock(); out_drop: kfree_skb(skb); return -ENETDOWN; } static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { const struct iphdr *ip4h = ip_hdr(skb); struct net *net = dev_net(dev); int err, ret = NET_XMIT_DROP; if (!nh) { struct flowi4 fl4 = { .flowi4_flags = FLOWI_FLAG_ANYSRC, .flowi4_mark = skb->mark, .flowi4_dscp = ip4h_dscp(ip4h), .flowi4_oif = dev->ifindex, .flowi4_proto = ip4h->protocol, .daddr = ip4h->daddr, .saddr = ip4h->saddr, }; struct rtable *rt; rt = ip_route_output_flow(net, &fl4, NULL); if (IS_ERR(rt)) goto out_drop; if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { ip_rt_put(rt); goto out_drop; } skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); } err = bpf_out_neigh_v4(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) dev_core_stats_tx_dropped_inc(dev); else ret = NET_XMIT_SUCCESS; goto out_xmit; out_drop: dev_core_stats_tx_dropped_inc(dev); kfree_skb(skb); out_xmit: return ret; } #else static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { kfree_skb(skb); return NET_XMIT_DROP; } #endif /* CONFIG_INET */ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { struct ethhdr *ethh = eth_hdr(skb); if (unlikely(skb->mac_header >= skb->network_header)) goto out; bpf_push_mac_rcsum(skb); if (is_multicast_ether_addr(ethh->h_dest)) goto out; skb_pull(skb, sizeof(*ethh)); skb_unset_mac_header(skb); skb_reset_network_header(skb); if (skb->protocol == htons(ETH_P_IP)) return __bpf_redirect_neigh_v4(skb, dev, nh); else if (skb->protocol == htons(ETH_P_IPV6)) return __bpf_redirect_neigh_v6(skb, dev, nh); out: kfree_skb(skb); return -ENOTSUPP; } /* Internal, non-exposed redirect flags. */ enum { BPF_F_NEIGH = (1ULL << 16), BPF_F_PEER = (1ULL << 17), BPF_F_NEXTHOP = (1ULL << 18), #define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP) }; BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) { struct net_device *dev; struct sk_buff *clone; int ret; BUILD_BUG_ON(BPF_F_REDIRECT_INTERNAL & BPF_F_REDIRECT_FLAGS); if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) return -EINVAL; /* BPF test infra's convert___skb_to_skb() can create type-less * GSO packets. gso_features_check() will detect this as a bad * offload. However, lets not leak them out in the first place. */ if (unlikely(skb_is_gso(skb) && !skb_shinfo(skb)->gso_type)) return -EBADMSG; dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); if (unlikely(!dev)) return -EINVAL; clone = skb_clone(skb, GFP_ATOMIC); if (unlikely(!clone)) return -ENOMEM; /* For direct write, we need to keep the invariant that the skbs * we're dealing with need to be uncloned. Should uncloning fail * here, we need to free the just generated clone to unclone once * again. 
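 * (bpf_try_make_head_writable() below does that uncloning of the
 * original skb; if it fails, the freshly created clone is freed and
 * -ENOMEM is returned.)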
*/ ret = bpf_try_make_head_writable(skb); if (unlikely(ret)) { kfree_skb(clone); return -ENOMEM; } return __bpf_redirect(clone, dev, flags); } static const struct bpf_func_proto bpf_clone_redirect_proto = { .func = bpf_clone_redirect, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; static struct net_device *skb_get_peer_dev(struct net_device *dev) { const struct net_device_ops *ops = dev->netdev_ops; if (likely(ops->ndo_get_peer_dev)) return INDIRECT_CALL_1(ops->ndo_get_peer_dev, netkit_peer_dev, dev); return NULL; } int skb_do_redirect(struct sk_buff *skb) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct net *net = dev_net(skb->dev); struct net_device *dev; u32 flags = ri->flags; dev = dev_get_by_index_rcu(net, ri->tgt_index); ri->tgt_index = 0; ri->flags = 0; if (unlikely(!dev)) goto out_drop; if (flags & BPF_F_PEER) { if (unlikely(!skb_at_tc_ingress(skb))) goto out_drop; dev = skb_get_peer_dev(dev); if (unlikely(!dev || !(dev->flags & IFF_UP) || net_eq(net, dev_net(dev)))) goto out_drop; skb->dev = dev; dev_sw_netstats_rx_add(dev, skb->len); skb_scrub_packet(skb, false); return -EAGAIN; } return flags & BPF_F_NEIGH ? __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ? &ri->nh : NULL) : __bpf_redirect(skb, dev, flags); out_drop: kfree_skb(skb); return -EINVAL; } BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) return TC_ACT_SHOT; ri->flags = flags; ri->tgt_index = ifindex; return TC_ACT_REDIRECT; } static const struct bpf_func_proto bpf_redirect_proto = { .func = bpf_redirect, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags)) return TC_ACT_SHOT; ri->flags = BPF_F_PEER; ri->tgt_index = ifindex; return TC_ACT_REDIRECT; } static const struct bpf_func_proto bpf_redirect_peer_proto = { .func = bpf_redirect_peer, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params, int, plen, u64, flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely((plen && plen < sizeof(*params)) || flags)) return TC_ACT_SHOT; ri->flags = BPF_F_NEIGH | (plen ? 
BPF_F_NEXTHOP : 0); ri->tgt_index = ifindex; BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params)); if (plen) memcpy(&ri->nh, params, sizeof(ri->nh)); return TC_ACT_REDIRECT; } static const struct bpf_func_proto bpf_redirect_neigh_proto = { .func = bpf_redirect_neigh, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes) { msg->apply_bytes = bytes; return 0; } static const struct bpf_func_proto bpf_msg_apply_bytes_proto = { .func = bpf_msg_apply_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes) { msg->cork_bytes = bytes; return 0; } static void sk_msg_reset_curr(struct sk_msg *msg) { if (!msg->sg.size) { msg->sg.curr = msg->sg.start; msg->sg.copybreak = 0; } else { u32 i = msg->sg.end; sk_msg_iter_var_prev(i); msg->sg.curr = i; msg->sg.copybreak = msg->sg.data[i].length; } } static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { .func = bpf_msg_cork_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, u32, end, u64, flags) { u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start; u32 first_sge, last_sge, i, shift, bytes_sg_total; struct scatterlist *sge; u8 *raw, *to, *from; struct page *page; if (unlikely(flags || end <= start)) return -EINVAL; /* First find the starting scatterlist element */ i = msg->sg.start; do { offset += len; len = sk_msg_elem(msg, i)->length; if (start < offset + len) break; sk_msg_iter_var_next(i); } while (i != msg->sg.end); if (unlikely(start >= offset + len)) return -EINVAL; first_sge = i; /* The start may point into the sg element so we need to also * account for the headroom. */ bytes_sg_total = start - offset + bytes; if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len) goto out; /* At this point we need to linearize multiple scatterlist * elements or a single shared page. Either way we need to * copy into a linear buffer exclusively owned by BPF. Then * place the buffer in the scatterlist and fixup the original * entries by removing the entries now in the linear buffer * and shifting the remaining entries. For now we do not try * to copy partial entries to avoid complexity of running out * of sg_entry slots. The downside is reading a single byte * will copy the entire sg entry. */ do { copy += sk_msg_elem(msg, i)->length; sk_msg_iter_var_next(i); if (bytes_sg_total <= copy) break; } while (i != msg->sg.end); last_sge = i; if (unlikely(bytes_sg_total > copy)) return -EINVAL; page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP, get_order(copy)); if (unlikely(!page)) return -ENOMEM; raw = page_address(page); i = first_sge; do { sge = sk_msg_elem(msg, i); from = sg_virt(sge); len = sge->length; to = raw + poffset; memcpy(to, from, len); poffset += len; sge->length = 0; put_page(sg_page(sge)); sk_msg_iter_var_next(i); } while (i != last_sge); sg_set_page(&msg->sg.data[first_sge], page, copy, 0); /* To repair sg ring we need to shift entries. If we only * had a single entry though we can just replace it and * be done. Otherwise walk the ring and shift the entries. */ WARN_ON_ONCE(last_sge == first_sge); shift = last_sge > first_sge ? 
last_sge - first_sge - 1 : NR_MSG_FRAG_IDS - first_sge + last_sge - 1; if (!shift) goto out; i = first_sge; sk_msg_iter_var_next(i); do { u32 move_from; if (i + shift >= NR_MSG_FRAG_IDS) move_from = i + shift - NR_MSG_FRAG_IDS; else move_from = i + shift; if (move_from == msg->sg.end) break; msg->sg.data[i] = msg->sg.data[move_from]; msg->sg.data[move_from].length = 0; msg->sg.data[move_from].page_link = 0; msg->sg.data[move_from].offset = 0; sk_msg_iter_var_next(i); } while (1); msg->sg.end = msg->sg.end - shift > msg->sg.end ? msg->sg.end - shift + NR_MSG_FRAG_IDS : msg->sg.end - shift; out: sk_msg_reset_curr(msg); msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset; msg->data_end = msg->data + bytes; return 0; } static const struct bpf_func_proto bpf_msg_pull_data_proto = { .func = bpf_msg_pull_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, u32, len, u64, flags) { struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge; u32 new, i = 0, l = 0, space, copy = 0, offset = 0; u8 *raw, *to, *from; struct page *page; if (unlikely(flags)) return -EINVAL; if (unlikely(len == 0)) return 0; /* First find the starting scatterlist element */ i = msg->sg.start; do { offset += l; l = sk_msg_elem(msg, i)->length; if (start < offset + l) break; sk_msg_iter_var_next(i); } while (i != msg->sg.end); if (start > offset + l) return -EINVAL; space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); /* If no space available will fallback to copy, we need at * least one scatterlist elem available to push data into * when start aligns to the beginning of an element or two * when it falls inside an element. We handle the start equals * offset case because its the common case for inserting a * header. 
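 *
 * As a rough sketch (not from an in-tree program), an SK_MSG program
 * that wants to prepend an application header could reserve the room
 * with this helper and then fill it in through msg->data, e.g.:
 *
 *   SEC("sk_msg")
 *   int msg_prog(struct sk_msg_md *msg)
 *   {
 *           const __u32 hdr_len = 8;        // hypothetical header size
 *
 *           if (bpf_msg_push_data(msg, 0, hdr_len, 0))
 *                   return SK_DROP;
 *           return SK_PASS;
 *   }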
*/ if (!space || (space == 1 && start != offset)) copy = msg->sg.data[i].length; page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP, get_order(copy + len)); if (unlikely(!page)) return -ENOMEM; if (copy) { int front, back; raw = page_address(page); if (i == msg->sg.end) sk_msg_iter_var_prev(i); psge = sk_msg_elem(msg, i); front = start - offset; back = psge->length - front; from = sg_virt(psge); if (front) memcpy(raw, from, front); if (back) { from += front; to = raw + front + len; memcpy(to, from, back); } put_page(sg_page(psge)); new = i; goto place_new; } if (start - offset) { if (i == msg->sg.end) sk_msg_iter_var_prev(i); psge = sk_msg_elem(msg, i); rsge = sk_msg_elem_cpy(msg, i); psge->length = start - offset; rsge.length -= psge->length; rsge.offset += start; sk_msg_iter_var_next(i); sg_unmark_end(psge); sg_unmark_end(&rsge); } /* Slot(s) to place newly allocated data */ sk_msg_iter_next(msg, end); new = i; sk_msg_iter_var_next(i); if (i == msg->sg.end) { if (!rsge.length) goto place_new; sk_msg_iter_next(msg, end); goto place_new; } /* Shift one or two slots as needed */ sge = sk_msg_elem_cpy(msg, new); sg_unmark_end(&sge); nsge = sk_msg_elem_cpy(msg, i); if (rsge.length) { sk_msg_iter_var_next(i); nnsge = sk_msg_elem_cpy(msg, i); sk_msg_iter_next(msg, end); } while (i != msg->sg.end) { msg->sg.data[i] = sge; sge = nsge; sk_msg_iter_var_next(i); if (rsge.length) { nsge = nnsge; nnsge = sk_msg_elem_cpy(msg, i); } else { nsge = sk_msg_elem_cpy(msg, i); } } place_new: /* Place newly allocated data buffer */ sk_mem_charge(msg->sk, len); msg->sg.size += len; __clear_bit(new, msg->sg.copy); sg_set_page(&msg->sg.data[new], page, len + copy, 0); if (rsge.length) { get_page(sg_page(&rsge)); sk_msg_iter_var_next(new); msg->sg.data[new] = rsge; } sk_msg_reset_curr(msg); sk_msg_compute_data_pointers(msg); return 0; } static const struct bpf_func_proto bpf_msg_push_data_proto = { .func = bpf_msg_push_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; static void sk_msg_shift_left(struct sk_msg *msg, int i) { struct scatterlist *sge = sk_msg_elem(msg, i); int prev; put_page(sg_page(sge)); do { prev = i; sk_msg_iter_var_next(i); msg->sg.data[prev] = msg->sg.data[i]; } while (i != msg->sg.end); sk_msg_iter_prev(msg, end); } static void sk_msg_shift_right(struct sk_msg *msg, int i) { struct scatterlist tmp, sge; sk_msg_iter_next(msg, end); sge = sk_msg_elem_cpy(msg, i); sk_msg_iter_var_next(i); tmp = sk_msg_elem_cpy(msg, i); while (i != msg->sg.end) { msg->sg.data[i] = sge; sk_msg_iter_var_next(i); sge = tmp; tmp = sk_msg_elem_cpy(msg, i); } } BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, u32, len, u64, flags) { u32 i = 0, l = 0, space, offset = 0; u64 last = start + len; int pop; if (unlikely(flags)) return -EINVAL; if (unlikely(len == 0)) return 0; /* First find the starting scatterlist element */ i = msg->sg.start; do { offset += l; l = sk_msg_elem(msg, i)->length; if (start < offset + l) break; sk_msg_iter_var_next(i); } while (i != msg->sg.end); /* Bounds checks: start and pop must be inside message */ if (start >= offset + l || last > msg->sg.size) return -EINVAL; space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); pop = len; /* --------------| offset * -| start |-------- len -------| * * |----- a ----|-------- pop -------|----- b ----| * |______________________________________________| length * * * a: region at front of scatter element to save * b: region at back 
of scatter element to save when length > A + pop * pop: region to pop from element, same as input 'pop' here will be * decremented below per iteration. * * Two top-level cases to handle when start != offset, first B is non * zero and second B is zero corresponding to when a pop includes more * than one element. * * Then if B is non-zero AND there is no space allocate space and * compact A, B regions into page. If there is space shift ring to * the right free'ing the next element in ring to place B, leaving * A untouched except to reduce length. */ if (start != offset) { struct scatterlist *nsge, *sge = sk_msg_elem(msg, i); int a = start - offset; int b = sge->length - pop - a; sk_msg_iter_var_next(i); if (b > 0) { if (space) { sge->length = a; sk_msg_shift_right(msg, i); nsge = sk_msg_elem(msg, i); get_page(sg_page(sge)); sg_set_page(nsge, sg_page(sge), b, sge->offset + pop + a); } else { struct page *page, *orig; u8 *to, *from; page = alloc_pages(__GFP_NOWARN | __GFP_COMP | GFP_ATOMIC, get_order(a + b)); if (unlikely(!page)) return -ENOMEM; orig = sg_page(sge); from = sg_virt(sge); to = page_address(page); memcpy(to, from, a); memcpy(to + a, from + a + pop, b); sg_set_page(sge, page, a + b, 0); put_page(orig); } pop = 0; } else { pop -= (sge->length - a); sge->length = a; } } /* From above the current layout _must_ be as follows, * * -| offset * -| start * * |---- pop ---|---------------- b ------------| * |____________________________________________| length * * Offset and start of the current msg elem are equal because in the * previous case we handled offset != start and either consumed the * entire element and advanced to the next element OR pop == 0. * * Two cases to handle here are first pop is less than the length * leaving some remainder b above. Simply adjust the element's layout * in this case. Or pop >= length of the element so that b = 0. In this * case advance to next element decrementing pop. 
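 *
 * In other words: if the remaining pop fits within the current element,
 * adjusting its offset and length is enough and we are done; otherwise
 * the whole element is consumed, the ring is shifted left over it, and
 * the loop continues on the next element with the reduced pop.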
*/ while (pop) { struct scatterlist *sge = sk_msg_elem(msg, i); if (pop < sge->length) { sge->length -= pop; sge->offset += pop; pop = 0; } else { pop -= sge->length; sk_msg_shift_left(msg, i); } } sk_mem_uncharge(msg->sk, len - pop); msg->sg.size -= (len - pop); sk_msg_reset_curr(msg); sk_msg_compute_data_pointers(msg); return 0; } static const struct bpf_func_proto bpf_msg_pop_data_proto = { .func = bpf_msg_pop_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; #ifdef CONFIG_CGROUP_NET_CLASSID BPF_CALL_0(bpf_get_cgroup_classid_curr) { return __task_get_classid(current); } const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = { .func = bpf_get_cgroup_classid_curr, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_1(bpf_skb_cgroup_classid, const struct sk_buff *, skb) { struct sock *sk = skb_to_full_sk(skb); if (!sk || !sk_fullsock(sk)) return 0; return sock_cgroup_classid(&sk->sk_cgrp_data); } static const struct bpf_func_proto bpf_skb_cgroup_classid_proto = { .func = bpf_skb_cgroup_classid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; #endif BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); } static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { .func = bpf_get_cgroup_classid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb) { return dst_tclassid(skb); } static const struct bpf_func_proto bpf_get_route_realm_proto = { .func = bpf_get_route_realm, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb) { /* If skb_clear_hash() was called due to mangling, we can * trigger SW recalculation here. Later access to hash * can then use the inline skb->hash via context directly * instead of calling this helper again. */ return skb_get_hash(skb); } static const struct bpf_func_proto bpf_get_hash_recalc_proto = { .func = bpf_get_hash_recalc, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb) { /* After all direct packet write, this can be used once for * triggering a lazy recalc on next skb_get_hash() invocation. */ skb_clear_hash(skb); return 0; } static const struct bpf_func_proto bpf_set_hash_invalid_proto = { .func = bpf_set_hash_invalid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash) { /* Set user specified hash as L4(+), so that it gets returned * on skb_get_hash() call unless BPF prog later on triggers a * skb_clear_hash(). 
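 *
 * Internally the value is stored via __skb_set_sw_hash() with is_l4
 * set, i.e. as a software-computed L4 hash, so a later skb_get_hash()
 * returns it directly instead of re-running the flow dissector.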
*/ __skb_set_sw_hash(skb, hash, true); return 0; } static const struct bpf_func_proto bpf_set_hash_proto = { .func = bpf_set_hash, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, u16, vlan_tci) { int ret; if (unlikely(vlan_proto != htons(ETH_P_8021Q) && vlan_proto != htons(ETH_P_8021AD))) vlan_proto = htons(ETH_P_8021Q); bpf_push_mac_rcsum(skb); ret = skb_vlan_push(skb, vlan_proto, vlan_tci); bpf_pull_mac_rcsum(skb); skb_reset_mac_len(skb); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_vlan_push_proto = { .func = bpf_skb_vlan_push, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) { int ret; bpf_push_mac_rcsum(skb); ret = skb_vlan_pop(skb); bpf_pull_mac_rcsum(skb); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_vlan_pop_proto = { .func = bpf_skb_vlan_pop, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) { /* Caller already did skb_cow() with meta_len+len as headroom, * so no need to do it here. */ skb_push(skb, len); skb_postpush_data_move(skb, len, off); memset(skb->data + off, 0, len); /* No skb_postpush_rcsum(skb, skb->data + off, len) * needed here as it does not change the skb->csum * result for checksum complete when summing over * zeroed blocks. */ return 0; } static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len) { void *old_data; /* skb_ensure_writable() is not needed here, as we're * already working on an uncloned skb. */ if (unlikely(!pskb_may_pull(skb, off + len))) return -ENOMEM; old_data = skb->data; __skb_pull(skb, len); skb_postpull_rcsum(skb, old_data + off, len); skb_postpull_data_move(skb, len, off); return 0; } static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len) { bool trans_same = skb->transport_header == skb->network_header; int ret; /* There's no need for __skb_push()/__skb_pull() pair to * get to the start of the mac header as we're guaranteed * to always start from here under eBPF. */ ret = bpf_skb_generic_push(skb, off, len); if (likely(!ret)) { skb->mac_header -= len; skb->network_header -= len; if (trans_same) skb->transport_header = skb->network_header; } return ret; } static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len) { bool trans_same = skb->transport_header == skb->network_header; int ret; /* Same here, __skb_push()/__skb_pull() pair not needed. */ ret = bpf_skb_generic_pop(skb, off, len); if (likely(!ret)) { skb->mac_header += len; skb->network_header += len; if (trans_same) skb->transport_header = skb->network_header; } return ret; } static int bpf_skb_proto_4_to_6(struct sk_buff *skb) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); const u8 meta_len = skb_metadata_len(skb); u32 off = skb_mac_header_len(skb); int ret; ret = skb_cow(skb, meta_len + len_diff); if (unlikely(ret < 0)) return ret; ret = bpf_skb_net_hdr_push(skb, off, len_diff); if (unlikely(ret < 0)) return ret; if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); /* SKB_GSO_TCPV4 needs to be changed into SKB_GSO_TCPV6. 
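 * SKB_GSO_DODGY is also set below so that the GSO layer re-validates
 * the rewritten headers before segmenting the skb.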
*/ if (shinfo->gso_type & SKB_GSO_TCPV4) { shinfo->gso_type &= ~SKB_GSO_TCPV4; shinfo->gso_type |= SKB_GSO_TCPV6; } shinfo->gso_type |= SKB_GSO_DODGY; } skb->protocol = htons(ETH_P_IPV6); skb_clear_hash(skb); return 0; } static int bpf_skb_proto_6_to_4(struct sk_buff *skb) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); u32 off = skb_mac_header_len(skb); int ret; ret = skb_unclone(skb, GFP_ATOMIC); if (unlikely(ret < 0)) return ret; ret = bpf_skb_net_hdr_pop(skb, off, len_diff); if (unlikely(ret < 0)) return ret; if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); /* SKB_GSO_TCPV6 needs to be changed into SKB_GSO_TCPV4. */ if (shinfo->gso_type & SKB_GSO_TCPV6) { shinfo->gso_type &= ~SKB_GSO_TCPV6; shinfo->gso_type |= SKB_GSO_TCPV4; } shinfo->gso_type |= SKB_GSO_DODGY; } skb->protocol = htons(ETH_P_IP); skb_clear_hash(skb); return 0; } static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) { __be16 from_proto = skb->protocol; if (from_proto == htons(ETH_P_IP) && to_proto == htons(ETH_P_IPV6)) return bpf_skb_proto_4_to_6(skb); if (from_proto == htons(ETH_P_IPV6) && to_proto == htons(ETH_P_IP)) return bpf_skb_proto_6_to_4(skb); return -ENOTSUPP; } BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, u64, flags) { int ret; if (unlikely(flags)) return -EINVAL; /* General idea is that this helper does the basic groundwork * needed for changing the protocol, and eBPF program fills the * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace() * and other helpers, rather than passing a raw buffer here. * * The rationale is to keep this minimal and without a need to * deal with raw packet data. F.e. even if we would pass buffers * here, the program still needs to call the bpf_lX_csum_replace() * helpers anyway. Plus, this way we keep also separation of * concerns, since f.e. bpf_skb_store_bytes() should only take * care of stores. * * Currently, additional options and extension header space are * not supported, but flags register is reserved so we can adapt * that. For offloads, we mark packet as dodgy, so that headers * need to be verified first. */ ret = bpf_skb_proto_xlat(skb, proto); bpf_compute_data_pointers(skb); if (ret) return ret; if (skb_valid_dst(skb)) skb_dst_drop(skb); return 0; } static const struct bpf_func_proto bpf_skb_change_proto_proto = { .func = bpf_skb_change_proto, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type) { /* We only allow a restricted subset to be changed for now. 
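 * skb_pkt_type_ok() only accepts values up to and including
 * PACKET_OTHERHOST, so e.g. PACKET_OUTGOING and the loopback/kernel
 * types cannot be set from a BPF program.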
*/ if (unlikely(!skb_pkt_type_ok(skb->pkt_type) || !skb_pkt_type_ok(pkt_type))) return -EINVAL; skb->pkt_type = pkt_type; return 0; } static const struct bpf_func_proto bpf_skb_change_type_proto = { .func = bpf_skb_change_type, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; static u32 bpf_skb_net_base_len(const struct sk_buff *skb) { switch (skb->protocol) { case htons(ETH_P_IP): return sizeof(struct iphdr); case htons(ETH_P_IPV6): return sizeof(struct ipv6hdr); default: return ~0U; } } #define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \ BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) #define BPF_F_ADJ_ROOM_DECAP_L3_MASK (BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | \ BPF_F_ADJ_ROOM_DECAP_L3_IPV6) #define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \ BPF_F_ADJ_ROOM_ENCAP_L2( \ BPF_ADJ_ROOM_ENCAP_L2_MASK) | \ BPF_F_ADJ_ROOM_DECAP_L3_MASK) static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) { u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT; bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK; u16 mac_len = 0, inner_net = 0, inner_trans = 0; const u8 meta_len = skb_metadata_len(skb); unsigned int gso_type = SKB_GSO_DODGY; int ret; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { /* udp gso_size delineates datagrams, only allow if fixed */ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || !(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) return -ENOTSUPP; } ret = skb_cow_head(skb, meta_len + len_diff); if (unlikely(ret < 0)) return ret; if (encap) { if (skb->protocol != htons(ETH_P_IP) && skb->protocol != htons(ETH_P_IPV6)) return -ENOTSUPP; if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 && flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) return -EINVAL; if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE && flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) return -EINVAL; if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH && inner_mac_len < ETH_HLEN) return -EINVAL; if (skb->encapsulation) return -EALREADY; mac_len = skb->network_header - skb->mac_header; inner_net = skb->network_header; if (inner_mac_len > len_diff) return -EINVAL; inner_trans = skb->transport_header; } ret = bpf_skb_net_hdr_push(skb, off, len_diff); if (unlikely(ret < 0)) return ret; if (encap) { skb->inner_mac_header = inner_net - inner_mac_len; skb->inner_network_header = inner_net; skb->inner_transport_header = inner_trans; if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH) skb_set_inner_protocol(skb, htons(ETH_P_TEB)); else skb_set_inner_protocol(skb, skb->protocol); skb->encapsulation = 1; skb_set_network_header(skb, mac_len); if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) gso_type |= SKB_GSO_UDP_TUNNEL; else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE) gso_type |= SKB_GSO_GRE; else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) gso_type |= SKB_GSO_IPXIP6; else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4) gso_type |= SKB_GSO_IPXIP4; if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE || flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) { int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ? 
sizeof(struct ipv6hdr) : sizeof(struct iphdr); skb_set_transport_header(skb, mac_len + nh_len); } /* Match skb->protocol to new outer l3 protocol */ if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) skb->protocol = htons(ETH_P_IPV6); else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4) skb->protocol = htons(ETH_P_IP); if (skb_valid_dst(skb)) skb_dst_drop(skb); } if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= gso_type; shinfo->gso_segs = 0; /* Due to header growth, MSS needs to be downgraded. * There is a BUG_ON() when segmenting the frag_list with * head_frag true, so linearize the skb after downgrading * the MSS. */ if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) { skb_decrease_gso_size(shinfo, len_diff); if (shinfo->frag_list) return skb_linearize(skb); } } return 0; } static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) { bool decap = flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK; int ret; if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_DECAP_L3_MASK | BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { /* udp gso_size delineates datagrams, only allow if fixed */ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || !(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) return -ENOTSUPP; } ret = skb_unclone(skb, GFP_ATOMIC); if (unlikely(ret < 0)) return ret; ret = bpf_skb_net_hdr_pop(skb, off, len_diff); if (unlikely(ret < 0)) return ret; if (decap) { /* Match skb->protocol to new outer l3 protocol */ if (flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6) skb->protocol = htons(ETH_P_IPV6); else if (flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4) skb->protocol = htons(ETH_P_IP); if (skb_valid_dst(skb)) skb_dst_drop(skb); } if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); /* Due to header shrink, MSS can be upgraded. */ if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) skb_increase_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. 
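 * Setting gso_segs to zero leaves it to the segmentation path to derive
 * a fresh segment count from gso_size, rather than trusting the stale
 * value computed before the header was grown.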
*/ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; } return 0; } #define BPF_SKB_MAX_LEN SKB_MAX_ALLOC BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32, mode, u64, flags) { u32 len_diff_abs = abs(len_diff); bool shrink = len_diff < 0; int ret = 0; if (unlikely(flags || mode)) return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; if (!shrink) { ret = skb_cow(skb, len_diff); if (unlikely(ret < 0)) return ret; __skb_push(skb, len_diff_abs); memset(skb->data, 0, len_diff_abs); } else { if (unlikely(!pskb_may_pull(skb, len_diff_abs))) return -ENOMEM; __skb_pull(skb, len_diff_abs); } if (tls_sw_has_ctx_rx(skb->sk)) { struct strp_msg *rxm = strp_msg(skb); rxm->full_len += len_diff; } return ret; } static const struct bpf_func_proto sk_skb_adjust_room_proto = { .func = sk_skb_adjust_room, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32, mode, u64, flags) { u32 len_cur, len_diff_abs = abs(len_diff); u32 len_min = bpf_skb_net_base_len(skb); u32 len_max = BPF_SKB_MAX_LEN; __be16 proto = skb->protocol; bool shrink = len_diff < 0; u32 off; int ret; if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK | BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; if (unlikely(proto != htons(ETH_P_IP) && proto != htons(ETH_P_IPV6))) return -ENOTSUPP; off = skb_mac_header_len(skb); switch (mode) { case BPF_ADJ_ROOM_NET: off += bpf_skb_net_base_len(skb); break; case BPF_ADJ_ROOM_MAC: break; default: return -ENOTSUPP; } if (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) { if (!shrink) return -EINVAL; switch (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) { case BPF_F_ADJ_ROOM_DECAP_L3_IPV4: len_min = sizeof(struct iphdr); break; case BPF_F_ADJ_ROOM_DECAP_L3_IPV6: len_min = sizeof(struct ipv6hdr); break; default: return -EINVAL; } } len_cur = skb->len - skb_network_offset(skb); if ((shrink && (len_diff_abs >= len_cur || len_cur - len_diff_abs < len_min)) || (!shrink && (skb->len + len_diff_abs > len_max && !skb_is_gso(skb)))) return -ENOTSUPP; ret = shrink ? 
bpf_skb_net_shrink(skb, off, len_diff_abs, flags) : bpf_skb_net_grow(skb, off, len_diff_abs, flags); if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET)) __skb_reset_checksum_unnecessary(skb); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_adjust_room_proto = { .func = bpf_skb_adjust_room, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; static u32 __bpf_skb_min_len(const struct sk_buff *skb) { int offset = skb_network_offset(skb); u32 min_len = 0; if (offset > 0) min_len = offset; if (skb_transport_header_was_set(skb)) { offset = skb_transport_offset(skb); if (offset > 0) min_len = offset; } if (skb->ip_summed == CHECKSUM_PARTIAL) { offset = skb_checksum_start_offset(skb) + skb->csum_offset + sizeof(__sum16); if (offset > 0) min_len = offset; } return min_len; } static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len) { unsigned int old_len = skb->len; int ret; ret = __skb_grow_rcsum(skb, new_len); if (!ret) memset(skb->data + old_len, 0, new_len - old_len); return ret; } static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) { return __skb_trim_rcsum(skb, new_len); } static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len, u64 flags) { u32 max_len = BPF_SKB_MAX_LEN; u32 min_len = __bpf_skb_min_len(skb); int ret; if (unlikely(flags || new_len > max_len || new_len < min_len)) return -EINVAL; if (skb->encapsulation) return -ENOTSUPP; /* The basic idea of this helper is that it's performing the * needed work to either grow or trim an skb, and eBPF program * rewrites the rest via helpers like bpf_skb_store_bytes(), * bpf_lX_csum_replace() and others rather than passing a raw * buffer here. This one is a slow path helper and intended * for replies with control messages. * * Like in bpf_skb_change_proto(), we want to keep this rather * minimal and without protocol specifics so that we are able * to separate concerns as in bpf_skb_store_bytes() should only * be the one responsible for writing buffers. * * It's really expected to be a slow path operation here for * control message replies, so we're implicitly linearizing, * uncloning and drop offloads from the skb by this. 
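 *
 * As a rough usage sketch (the sizes are hypothetical), a tc program
 * that turns a request into a small reply could do:
 *
 *   if (bpf_skb_change_tail(skb, 64, 0))
 *           return TC_ACT_SHOT;
 *   // then rewrite headers/payload via bpf_skb_store_bytes() etc.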
*/ ret = __bpf_try_make_writable(skb, skb->len); if (!ret) { if (new_len > skb->len) ret = bpf_skb_grow_rcsum(skb, new_len); else if (new_len < skb->len) ret = bpf_skb_trim_rcsum(skb, new_len); if (!ret && skb_is_gso(skb)) skb_gso_reset(skb); } return ret; } BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, u64, flags) { int ret = __bpf_skb_change_tail(skb, new_len, flags); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_change_tail_proto = { .func = bpf_skb_change_tail, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len, u64, flags) { return __bpf_skb_change_tail(skb, new_len, flags); } static const struct bpf_func_proto sk_skb_change_tail_proto = { .func = sk_skb_change_tail, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, u64 flags) { const u8 meta_len = skb_metadata_len(skb); u32 max_len = BPF_SKB_MAX_LEN; u32 new_len = skb->len + head_room; int ret; if (unlikely(flags || (int)head_room < 0 || (!skb_is_gso(skb) && new_len > max_len) || new_len < skb->len)) return -EINVAL; ret = skb_cow(skb, meta_len + head_room); if (likely(!ret)) { /* Idea for this helper is that we currently only * allow to expand on mac header. This means that * skb->protocol network header, etc, stay as is. * Compared to bpf_skb_change_tail(), we're more * flexible due to not needing to linearize or * reset GSO. Intention for this helper is to be * used by an L3 skb that needs to push mac header * for redirection into L2 device. */ __skb_push(skb, head_room); skb_postpush_data_move(skb, head_room, 0); memset(skb->data, 0, head_room); skb_reset_mac_header(skb); skb_reset_mac_len(skb); } return ret; } BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, u64, flags) { int ret = __bpf_skb_change_head(skb, head_room, flags); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_change_head_proto = { .func = bpf_skb_change_head, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room, u64, flags) { return __bpf_skb_change_head(skb, head_room, flags); } static const struct bpf_func_proto sk_skb_change_head_proto = { .func = sk_skb_change_head, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp) { return xdp_get_buff_len(xdp); } static const struct bpf_func_proto bpf_xdp_get_buff_len_proto = { .func = bpf_xdp_get_buff_len, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BTF_ID_LIST_SINGLE(bpf_xdp_get_buff_len_bpf_ids, struct, xdp_buff) const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto = { .func = bpf_xdp_get_buff_len, .gpl_only = false, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_xdp_get_buff_len_bpf_ids[0], }; static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) { return xdp_data_meta_unsupported(xdp) ? 
0 : xdp->data - xdp->data_meta; } BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) { void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame); unsigned long metalen = xdp_get_metalen(xdp); void *data_start = xdp_frame_end + metalen; void *data = xdp->data + offset; if (unlikely(data < data_start || data > xdp->data_end - ETH_HLEN)) return -EINVAL; if (metalen) memmove(xdp->data_meta + offset, xdp->data_meta, metalen); xdp->data_meta += offset; xdp->data = data; return 0; } static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .func = bpf_xdp_adjust_head, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf, unsigned long len, bool flush) { unsigned long ptr_len, ptr_off = 0; skb_frag_t *next_frag, *end_frag; struct skb_shared_info *sinfo; void *src, *dst; u8 *ptr_buf; if (likely(xdp->data_end - xdp->data >= off + len)) { src = flush ? buf : xdp->data + off; dst = flush ? xdp->data + off : buf; memcpy(dst, src, len); return; } sinfo = xdp_get_shared_info_from_buff(xdp); end_frag = &sinfo->frags[sinfo->nr_frags]; next_frag = &sinfo->frags[0]; ptr_len = xdp->data_end - xdp->data; ptr_buf = xdp->data; while (true) { if (off < ptr_off + ptr_len) { unsigned long copy_off = off - ptr_off; unsigned long copy_len = min(len, ptr_len - copy_off); src = flush ? buf : ptr_buf + copy_off; dst = flush ? ptr_buf + copy_off : buf; memcpy(dst, src, copy_len); off += copy_len; len -= copy_len; buf += copy_len; } if (!len || next_frag == end_frag) break; ptr_off += ptr_len; ptr_buf = skb_frag_address(next_frag); ptr_len = skb_frag_size(next_frag); next_frag++; } } void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len) { u32 size = xdp->data_end - xdp->data; struct skb_shared_info *sinfo; void *addr = xdp->data; int i; if (unlikely(offset > 0xffff || len > 0xffff)) return ERR_PTR(-EFAULT); if (unlikely(offset + len > xdp_get_buff_len(xdp))) return ERR_PTR(-EINVAL); if (likely(offset < size)) /* linear area */ goto out; sinfo = xdp_get_shared_info_from_buff(xdp); offset -= size; for (i = 0; i < sinfo->nr_frags; i++) { /* paged area */ u32 frag_size = skb_frag_size(&sinfo->frags[i]); if (offset < frag_size) { addr = skb_frag_address(&sinfo->frags[i]); size = frag_size; break; } offset -= frag_size; } out: return offset + len <= size ? 
addr + offset : NULL; } BPF_CALL_4(bpf_xdp_load_bytes, struct xdp_buff *, xdp, u32, offset, void *, buf, u32, len) { void *ptr; ptr = bpf_xdp_pointer(xdp, offset, len); if (IS_ERR(ptr)) return PTR_ERR(ptr); if (!ptr) bpf_xdp_copy_buf(xdp, offset, buf, len, false); else memcpy(buf, ptr, len); return 0; } static const struct bpf_func_proto bpf_xdp_load_bytes_proto = { .func = bpf_xdp_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) { return ____bpf_xdp_load_bytes(xdp, offset, buf, len); } BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset, void *, buf, u32, len) { void *ptr; ptr = bpf_xdp_pointer(xdp, offset, len); if (IS_ERR(ptr)) return PTR_ERR(ptr); if (!ptr) bpf_xdp_copy_buf(xdp, offset, buf, len, true); else memcpy(ptr, buf, len); return 0; } static const struct bpf_func_proto bpf_xdp_store_bytes_proto = { .func = bpf_xdp_store_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE, }; int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) { return ____bpf_xdp_store_bytes(xdp, offset, buf, len); } static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1]; struct xdp_rxq_info *rxq = xdp->rxq; int tailroom; if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz) return -EOPNOTSUPP; tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag) % rxq->frag_size; WARN_ON_ONCE(tailroom < 0); if (unlikely(offset > tailroom)) return -EINVAL; memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset); skb_frag_size_add(frag, offset); sinfo->xdp_frags_size += offset; if (rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) xsk_buff_get_tail(xdp)->data_end += offset; return 0; } static struct xdp_buff *bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, bool tail, bool release) { struct xdp_buff *zc_frag = tail ? 
xsk_buff_get_tail(xdp) : xsk_buff_get_head(xdp); if (release) { xsk_buff_del_frag(zc_frag); } else { if (tail) zc_frag->data_end -= shrink; else zc_frag->data += shrink; } return zc_frag; } static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink, bool tail) { enum xdp_mem_type mem_type = xdp->rxq->mem.type; bool release = skb_frag_size(frag) == shrink; netmem_ref netmem = skb_frag_netmem(frag); struct xdp_buff *zc_frag = NULL; if (mem_type == MEM_TYPE_XSK_BUFF_POOL) { netmem = 0; zc_frag = bpf_xdp_shrink_data_zc(xdp, shrink, tail, release); } if (release) { __xdp_return(netmem, mem_type, false, zc_frag); } else { if (!tail) skb_frag_off_add(frag, shrink); skb_frag_size_sub(frag, shrink); } return release; } static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); int i, n_frags_free = 0, len_free = 0; if (unlikely(offset > (int)xdp_get_buff_len(xdp) - ETH_HLEN)) return -EINVAL; for (i = sinfo->nr_frags - 1; i >= 0 && offset > 0; i--) { skb_frag_t *frag = &sinfo->frags[i]; int shrink = min_t(int, offset, skb_frag_size(frag)); len_free += shrink; offset -= shrink; if (bpf_xdp_shrink_data(xdp, frag, shrink, true)) n_frags_free++; } sinfo->nr_frags -= n_frags_free; sinfo->xdp_frags_size -= len_free; if (unlikely(!sinfo->nr_frags)) { xdp_buff_clear_frags_flag(xdp); xdp_buff_clear_frag_pfmemalloc(xdp); xdp->data_end -= offset; } return 0; } BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) { void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */ void *data_end = xdp->data_end + offset; if (unlikely(xdp_buff_has_frags(xdp))) { /* non-linear xdp buff */ if (offset < 0) return bpf_xdp_frags_shrink_tail(xdp, -offset); return bpf_xdp_frags_increase_tail(xdp, offset); } /* Notice that xdp_data_hard_end have reserved some tailroom */ if (unlikely(data_end > data_hard_end)) return -EINVAL; if (unlikely(data_end < xdp->data + ETH_HLEN)) return -EINVAL; /* Clear memory area on grow, can contain uninit kernel memory */ if (offset > 0) memset(xdp->data_end, 0, offset); xdp->data_end = data_end; return 0; } static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = { .func = bpf_xdp_adjust_tail, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) { void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame); void *meta = xdp->data_meta + offset; unsigned long metalen = xdp->data - meta; if (xdp_data_meta_unsupported(xdp)) return -ENOTSUPP; if (unlikely(meta < xdp_frame_end || meta > xdp->data)) return -EINVAL; if (unlikely(xdp_metalen_invalid(metalen))) return -EACCES; xdp->data_meta = meta; return 0; } static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { .func = bpf_xdp_adjust_meta, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; /** * DOC: xdp redirect * * XDP_REDIRECT works by a three-step process, implemented in the functions * below: * * 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target * of the redirect and store it (along with some other metadata) in a per-CPU * struct bpf_redirect_info. * * 2. When the program returns the XDP_REDIRECT return code, the driver will * call xdp_do_redirect() which will use the information in struct * bpf_redirect_info to actually enqueue the frame into a map type-specific * bulk queue structure. * * 3. 
Before exiting its NAPI poll loop, the driver will call * xdp_do_flush(), which will flush all the different bulk queues, * thus completing the redirect. Note that xdp_do_flush() must be * called before napi_complete_done() in the driver, as the * XDP_REDIRECT logic relies on being inside a single NAPI instance * through to the xdp_do_flush() call for RCU protection of all * in-kernel data structures. */ /* * Pointers to the map entries will be kept around for this whole sequence of * steps, protected by RCU. However, there is no top-level rcu_read_lock() in * the core code; instead, the RCU protection relies on everything happening * inside a single NAPI poll sequence, which means it's between a pair of calls * to local_bh_disable()/local_bh_enable(). * * The map entries are marked as __rcu and the map code makes sure to * dereference those pointers with rcu_dereference_check() in a way that works * for both sections that to hold an rcu_read_lock() and sections that are * called from NAPI without a separate rcu_read_lock(). The code below does not * use RCU annotations, but relies on those in the map code. */ void xdp_do_flush(void) { struct list_head *lh_map, *lh_dev, *lh_xsk; bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk); if (lh_dev) __dev_flush(lh_dev); if (lh_map) __cpu_map_flush(lh_map); if (lh_xsk) __xsk_map_flush(lh_xsk); } EXPORT_SYMBOL_GPL(xdp_do_flush); #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) void xdp_do_check_flushed(struct napi_struct *napi) { struct list_head *lh_map, *lh_dev, *lh_xsk; bool missed = false; bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk); if (lh_dev) { __dev_flush(lh_dev); missed = true; } if (lh_map) { __cpu_map_flush(lh_map); missed = true; } if (lh_xsk) { __xsk_map_flush(lh_xsk); missed = true; } WARN_ONCE(missed, "Missing xdp_do_flush() invocation after NAPI by %ps\n", napi->poll); } #endif DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key); EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key); u32 xdp_master_redirect(struct xdp_buff *xdp) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct net_device *master, *slave; master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev); if (unlikely(!(master->flags & IFF_UP))) return XDP_ABORTED; slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp); if (slave && slave != xdp->rxq->dev) { /* The target device is different from the receiving device, so * redirect it to the new device. * Using XDP_REDIRECT gets the correct behaviour from XDP enabled * drivers to unmap the packet from their rx ring. 
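 *
 * This conversion is used when the program is attached to an upper
 * device such as a bonding master: the frame was received on a slave,
 * and ndo_xdp_get_xmit_slave() selects the slave that should actually
 * transmit it, turning the XDP_TX verdict into a redirect.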
*/ ri->tgt_index = slave->ifindex; ri->map_id = INT_MAX; ri->map_type = BPF_MAP_TYPE_UNSPEC; return XDP_REDIRECT; } return XDP_TX; } EXPORT_SYMBOL_GPL(xdp_master_redirect); static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri, const struct net_device *dev, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog) { enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ ri->map_type = BPF_MAP_TYPE_UNSPEC; err = __xsk_map_redirect(fwd, xdp); if (unlikely(err)) goto err; _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); return 0; err: _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, struct net_device *dev, struct xdp_frame *xdpf, const struct bpf_prog *xdp_prog) { enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; u32 flags = ri->flags; struct bpf_map *map; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ ri->flags = 0; ri->map_type = BPF_MAP_TYPE_UNSPEC; if (unlikely(!xdpf)) { err = -EOVERFLOW; goto err; } switch (map_type) { case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: if (unlikely(flags & BPF_F_BROADCAST)) { map = READ_ONCE(ri->map); /* The map pointer is cleared when the map is being torn * down by dev_map_free() */ if (unlikely(!map)) { err = -ENOENT; break; } WRITE_ONCE(ri->map, NULL); err = dev_map_enqueue_multi(xdpf, dev, map, flags & BPF_F_EXCLUDE_INGRESS); } else { err = dev_map_enqueue(fwd, xdpf, dev); } break; case BPF_MAP_TYPE_CPUMAP: err = cpu_map_enqueue(fwd, xdpf, dev); break; case BPF_MAP_TYPE_UNSPEC: if (map_id == INT_MAX) { fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index); if (unlikely(!fwd)) { err = -EINVAL; break; } err = dev_xdp_enqueue(fwd, xdpf, dev); break; } fallthrough; default: err = -EBADRQC; } if (unlikely(err)) goto err; _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); return 0; err: _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; if (map_type == BPF_MAP_TYPE_XSKMAP) return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp), xdp_prog); } EXPORT_SYMBOL_GPL(xdp_do_redirect); int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, struct xdp_frame *xdpf, const struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; if (map_type == BPF_MAP_TYPE_XSKMAP) return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); return __xdp_do_redirect_frame(ri, dev, xdpf, xdp_prog); } EXPORT_SYMBOL_GPL(xdp_do_redirect_frame); static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog, void *fwd, enum bpf_map_type map_type, u32 map_id, u32 flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct bpf_map *map; int err; switch (map_type) { case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: if (unlikely(flags & BPF_F_BROADCAST)) { map = READ_ONCE(ri->map); /* The map pointer is cleared when 
the map is being torn * down by dev_map_free() */ if (unlikely(!map)) { err = -ENOENT; break; } WRITE_ONCE(ri->map, NULL); err = dev_map_redirect_multi(dev, skb, xdp_prog, map, flags & BPF_F_EXCLUDE_INGRESS); } else { err = dev_map_generic_redirect(fwd, skb, xdp_prog); } if (unlikely(err)) goto err; break; case BPF_MAP_TYPE_XSKMAP: err = xsk_generic_rcv(fwd, xdp); if (err) goto err; consume_skb(skb); break; case BPF_MAP_TYPE_CPUMAP: err = cpu_map_generic_redirect(fwd, skb); if (unlikely(err)) goto err; break; default: err = -EBADRQC; goto err; } _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); return 0; err: _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; u32 flags = ri->flags; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ ri->flags = 0; ri->map_type = BPF_MAP_TYPE_UNSPEC; if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index); if (unlikely(!fwd)) { err = -EINVAL; goto err; } err = xdp_ok_fwd_dev(fwd, skb->len); if (unlikely(err)) goto err; skb->dev = fwd; _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index); generic_xdp_tx(skb, xdp_prog); return 0; } return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id, flags); err: _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err); return err; } BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags)) return XDP_ABORTED; /* NB! Map type UNSPEC and map_id == INT_MAX (never generated * by map_idr) is used for ifindex based XDP redirect. 
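 *
 * Minimal usage sketch from the BPF side (TARGET_IFINDEX is a
 * placeholder the program would normally obtain from configuration or
 * a map):
 *
 *   SEC("xdp")
 *   int xdp_fwd(struct xdp_md *ctx)
 *   {
 *           return bpf_redirect(TARGET_IFINDEX, 0);
 *   }
 *
 * The resulting XDP_REDIRECT verdict is then acted upon by the driver
 * via xdp_do_redirect() and xdp_do_flush() as described above.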
*/ ri->tgt_index = ifindex; ri->map_id = INT_MAX; ri->map_type = BPF_MAP_TYPE_UNSPEC; return XDP_REDIRECT; } static const struct bpf_func_proto bpf_xdp_redirect_proto = { .func = bpf_xdp_redirect, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u64, key, u64, flags) { return map->ops->map_redirect(map, key, flags); } static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { .func = bpf_xdp_redirect_map, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, unsigned long off, unsigned long len) { void *ptr = skb_header_pointer(skb, off, len, dst_buff); if (unlikely(!ptr)) return len; if (ptr != dst_buff) memcpy(dst_buff, ptr, len); return 0; } BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map, u64, flags, void *, meta, u64, meta_size) { u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32; if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; if (unlikely(!skb || skb_size > skb->len)) return -EFAULT; return bpf_event_output(map, flags, meta, meta_size, skb, skb_size, bpf_skb_copy); } static const struct bpf_func_proto bpf_skb_event_output_proto = { .func = bpf_skb_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BTF_ID_LIST_SINGLE(bpf_skb_output_btf_ids, struct, sk_buff) const struct bpf_func_proto bpf_skb_output_proto = { .func = bpf_skb_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_skb_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; static unsigned short bpf_tunnel_key_af(u64 flags) { return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET; } BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to, u32, size, u64, flags) { const struct ip_tunnel_info *info = skb_tunnel_info(skb); u8 compat[sizeof(struct bpf_tunnel_key)]; void *to_orig = to; int err; if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_TUNINFO_FLAGS)))) { err = -EINVAL; goto err_clear; } if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) { err = -EPROTO; goto err_clear; } if (unlikely(size != sizeof(struct bpf_tunnel_key))) { err = -EINVAL; switch (size) { case offsetof(struct bpf_tunnel_key, local_ipv6[0]): case offsetof(struct bpf_tunnel_key, tunnel_label): case offsetof(struct bpf_tunnel_key, tunnel_ext): goto set_compat; case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): /* Fixup deprecated structure layouts here, so we have * a common path later on. 
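 * The shorter sizes accepted above are exactly the sizes of older
 * versions of struct bpf_tunnel_key, so programs compiled against a
 * previous layout of the struct keep working unchanged.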
*/ if (ip_tunnel_info_af(info) != AF_INET) goto err_clear; set_compat: to = (struct bpf_tunnel_key *)compat; break; default: goto err_clear; } } to->tunnel_id = be64_to_cpu(info->key.tun_id); to->tunnel_tos = info->key.tos; to->tunnel_ttl = info->key.ttl; if (flags & BPF_F_TUNINFO_FLAGS) to->tunnel_flags = ip_tunnel_flags_to_be16(info->key.tun_flags); else to->tunnel_ext = 0; if (flags & BPF_F_TUNINFO_IPV6) { memcpy(to->remote_ipv6, &info->key.u.ipv6.src, sizeof(to->remote_ipv6)); memcpy(to->local_ipv6, &info->key.u.ipv6.dst, sizeof(to->local_ipv6)); to->tunnel_label = be32_to_cpu(info->key.label); } else { to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); to->local_ipv4 = be32_to_cpu(info->key.u.ipv4.dst); memset(&to->local_ipv6[1], 0, sizeof(__u32) * 3); to->tunnel_label = 0; } if (unlikely(size != sizeof(struct bpf_tunnel_key))) memcpy(to_orig, to, size); return 0; err_clear: memset(to_orig, 0, size); return err; } static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .func = bpf_skb_get_tunnel_key, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size) { const struct ip_tunnel_info *info = skb_tunnel_info(skb); int err; if (unlikely(!info || !ip_tunnel_is_options_present(info->key.tun_flags))) { err = -ENOENT; goto err_clear; } if (unlikely(size < info->options_len)) { err = -ENOMEM; goto err_clear; } ip_tunnel_info_opts_get(to, info); if (size > info->options_len) memset(to + info->options_len, 0, size - info->options_len); return info->options_len; err_clear: memset(to, 0, size); return err; } static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { .func = bpf_skb_get_tunnel_opt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE, }; static struct metadata_dst __percpu *md_dst; BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, const struct bpf_tunnel_key *, from, u32, size, u64, flags) { struct metadata_dst *md = this_cpu_ptr(md_dst); u8 compat[sizeof(struct bpf_tunnel_key)]; struct ip_tunnel_info *info; if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER | BPF_F_NO_TUNNEL_KEY))) return -EINVAL; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { case offsetof(struct bpf_tunnel_key, local_ipv6[0]): case offsetof(struct bpf_tunnel_key, tunnel_label): case offsetof(struct bpf_tunnel_key, tunnel_ext): case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): /* Fixup deprecated structure layouts here, so we have * a common path later on. 
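 *
 * Typical use from a tc program is to populate the key and then
 * redirect to a metadata-mode (collect_md) tunnel device, roughly:
 *
 *   struct bpf_tunnel_key key = {};
 *
 *   key.tunnel_id = 42;                     // hypothetical VNI
 *   key.remote_ipv4 = 0x0a000001;           // hypothetical 10.0.0.1
 *   if (bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0))
 *           return TC_ACT_SHOT;
 *   return bpf_redirect(tunnel_ifindex, 0); // placeholder ifindex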
*/ memcpy(compat, from, size); memset(compat + size, 0, sizeof(compat) - size); from = (const struct bpf_tunnel_key *) compat; break; default: return -EINVAL; } } if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) || from->tunnel_ext)) return -EINVAL; skb_dst_drop(skb); dst_hold((struct dst_entry *) md); skb_dst_set(skb, (struct dst_entry *) md); info = &md->u.tun_info; memset(info, 0, sizeof(*info)); info->mode = IP_TUNNEL_INFO_TX; __set_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags); __assign_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags, flags & BPF_F_DONT_FRAGMENT); __assign_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags, !(flags & BPF_F_ZERO_CSUM_TX)); __assign_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags, flags & BPF_F_SEQ_NUMBER); __assign_bit(IP_TUNNEL_KEY_BIT, info->key.tun_flags, !(flags & BPF_F_NO_TUNNEL_KEY)); info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; info->key.ttl = from->tunnel_ttl; if (flags & BPF_F_TUNINFO_IPV6) { info->mode |= IP_TUNNEL_INFO_IPV6; memcpy(&info->key.u.ipv6.dst, from->remote_ipv6, sizeof(from->remote_ipv6)); memcpy(&info->key.u.ipv6.src, from->local_ipv6, sizeof(from->local_ipv6)); info->key.label = cpu_to_be32(from->tunnel_label) & IPV6_FLOWLABEL_MASK; } else { info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); info->key.u.ipv4.src = cpu_to_be32(from->local_ipv4); info->key.flow_flags = FLOWI_FLAG_ANYSRC; } return 0; } static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .func = bpf_skb_set_tunnel_key, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, const u8 *, from, u32, size) { struct ip_tunnel_info *info = skb_tunnel_info(skb); const struct metadata_dst *md = this_cpu_ptr(md_dst); IP_TUNNEL_DECLARE_FLAGS(present) = { }; if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1)))) return -EINVAL; if (unlikely(size > IP_TUNNEL_OPTS_MAX)) return -ENOMEM; ip_tunnel_set_options_present(present); ip_tunnel_info_opts_set(info, from, size, present); return 0; } static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { .func = bpf_skb_set_tunnel_opt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; static const struct bpf_func_proto * bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) { if (!md_dst) { struct metadata_dst __percpu *tmp; tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, METADATA_IP_TUNNEL, GFP_KERNEL); if (!tmp) return NULL; if (cmpxchg(&md_dst, NULL, tmp)) metadata_dst_free_percpu(tmp); } switch (which) { case BPF_FUNC_skb_set_tunnel_key: return &bpf_skb_set_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_opt: return &bpf_skb_set_tunnel_opt_proto; default: return NULL; } } BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map, u32, idx) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct cgroup *cgrp; struct sock *sk; sk = skb_to_full_sk(skb); if (!sk || !sk_fullsock(sk)) return -ENOENT; if (unlikely(idx >= array->map.max_entries)) return -E2BIG; cgrp = READ_ONCE(array->ptrs[idx]); if (unlikely(!cgrp)) return -EAGAIN; return sk_under_cgroup_hierarchy(sk, cgrp); } static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { .func = bpf_skb_under_cgroup, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = 
ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; #ifdef CONFIG_SOCK_CGROUP_DATA static inline u64 __bpf_sk_cgroup_id(struct sock *sk) { struct cgroup *cgrp; sk = sk_to_full_sk(sk); if (!sk || !sk_fullsock(sk)) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); return cgroup_id(cgrp); } BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) { return __bpf_sk_cgroup_id(skb->sk); } static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { .func = bpf_skb_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk, int ancestor_level) { struct cgroup *ancestor; struct cgroup *cgrp; sk = sk_to_full_sk(sk); if (!sk || !sk_fullsock(sk)) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); ancestor = cgroup_ancestor(cgrp, ancestor_level); if (!ancestor) return 0; return cgroup_id(ancestor); } BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, ancestor_level) { return __bpf_sk_ancestor_cgroup_id(skb->sk, ancestor_level); } static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { .func = bpf_skb_ancestor_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk) { return __bpf_sk_cgroup_id(sk); } static const struct bpf_func_proto bpf_sk_cgroup_id_proto = { .func = bpf_sk_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, }; BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level) { return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level); } static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = { .func = bpf_sk_ancestor_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, }; #endif static unsigned long bpf_xdp_copy(void *dst, const void *ctx, unsigned long off, unsigned long len) { struct xdp_buff *xdp = (struct xdp_buff *)ctx; bpf_xdp_copy_buf(xdp, off, dst, len, false); return 0; } BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, u64, flags, void *, meta, u64, meta_size) { u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32; if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; if (unlikely(!xdp || xdp_size > xdp_get_buff_len(xdp))) return -EFAULT; return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size, bpf_xdp_copy); } static const struct bpf_func_proto bpf_xdp_event_output_proto = { .func = bpf_xdp_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BTF_ID_LIST_SINGLE(bpf_xdp_output_btf_ids, struct, xdp_buff) const struct bpf_func_proto bpf_xdp_output_proto = { .func = bpf_xdp_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_xdp_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) { return skb->sk ? 
__sock_gen_cookie(skb->sk) : 0; } static const struct bpf_func_proto bpf_get_socket_cookie_proto = { .func = bpf_get_socket_cookie, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx) { return __sock_gen_cookie(ctx->sk); } static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = { .func = bpf_get_socket_cookie_sock_addr, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx) { return __sock_gen_cookie(ctx); } static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = { .func = bpf_get_socket_cookie_sock, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk) { return sk ? sock_gen_cookie(sk) : 0; } const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = { .func = bpf_get_socket_ptr_cookie, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | PTR_MAYBE_NULL, }; BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) { return __sock_gen_cookie(ctx->sk); } static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = { .func = bpf_get_socket_cookie_sock_ops, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static u64 __bpf_get_netns_cookie(struct sock *sk) { const struct net *net = sk ? sock_net(sk) : &init_net; return net->net_cookie; } BPF_CALL_1(bpf_get_netns_cookie, struct sk_buff *, skb) { return __bpf_get_netns_cookie(skb && skb->sk ? skb->sk : NULL); } static const struct bpf_func_proto bpf_get_netns_cookie_proto = { .func = bpf_get_netns_cookie, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx) { return __bpf_get_netns_cookie(ctx); } static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = { .func = bpf_get_netns_cookie_sock, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx) { return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); } static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = { .func = bpf_get_netns_cookie_sock_addr, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; BPF_CALL_1(bpf_get_netns_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) { return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); } static const struct bpf_func_proto bpf_get_netns_cookie_sock_ops_proto = { .func = bpf_get_netns_cookie_sock_ops, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; BPF_CALL_1(bpf_get_netns_cookie_sk_msg, struct sk_msg *, ctx) { return __bpf_get_netns_cookie(ctx ? 
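/*
 * Editorial note (not part of the kernel sources): a small sketch of how a
 * sockops program might consume the socket/netns cookie helpers above; the
 * bpf_printk() use is purely illustrative.
 *
 *	SEC("sockops")
 *	int log_cookies(struct bpf_sock_ops *skops)
 *	{
 *		__u64 sk_cookie = bpf_get_socket_cookie(skops);
 *		__u64 net_cookie = bpf_get_netns_cookie(skops);
 *
 *		bpf_printk("sk %llu netns %llu", sk_cookie, net_cookie);
 *		return 1;
 *	}
 */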
ctx->sk : NULL); } static const struct bpf_func_proto bpf_get_netns_cookie_sk_msg_proto = { .func = bpf_get_netns_cookie_sk_msg, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) { struct sock *sk = sk_to_full_sk(skb->sk); kuid_t kuid; if (!sk || !sk_fullsock(sk)) return overflowuid; kuid = sock_net_uid(sock_net(sk), sk); return from_kuid_munged(sock_net(sk)->user_ns, kuid); } static const struct bpf_func_proto bpf_get_socket_uid_proto = { .func = bpf_get_socket_uid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static int sk_bpf_set_get_cb_flags(struct sock *sk, char *optval, bool getopt) { u32 sk_bpf_cb_flags; if (getopt) { *(u32 *)optval = sk->sk_bpf_cb_flags; return 0; } sk_bpf_cb_flags = *(u32 *)optval; if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK) return -EINVAL; sk->sk_bpf_cb_flags = sk_bpf_cb_flags; return 0; } static int sol_socket_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { switch (optname) { case SO_REUSEADDR: case SO_SNDBUF: case SO_RCVBUF: case SO_KEEPALIVE: case SO_PRIORITY: case SO_REUSEPORT: case SO_RCVLOWAT: case SO_MARK: case SO_MAX_PACING_RATE: case SO_BINDTOIFINDEX: case SO_TXREHASH: case SK_BPF_CB_FLAGS: if (*optlen != sizeof(int)) return -EINVAL; break; case SO_BINDTODEVICE: break; default: return -EINVAL; } if (optname == SK_BPF_CB_FLAGS) return sk_bpf_set_get_cb_flags(sk, optval, getopt); if (getopt) { if (optname == SO_BINDTODEVICE) return -EINVAL; return sk_getsockopt(sk, SOL_SOCKET, optname, KERNEL_SOCKPTR(optval), KERNEL_SOCKPTR(optlen)); } return sk_setsockopt(sk, SOL_SOCKET, optname, KERNEL_SOCKPTR(optval), *optlen); } static int bpf_sol_tcp_getsockopt(struct sock *sk, int optname, char *optval, int optlen) { if (optlen != sizeof(int)) return -EINVAL; switch (optname) { case TCP_BPF_SOCK_OPS_CB_FLAGS: { int cb_flags = tcp_sk(sk)->bpf_sock_ops_cb_flags; memcpy(optval, &cb_flags, optlen); break; } case TCP_BPF_RTO_MIN: { int rto_min_us = jiffies_to_usecs(inet_csk(sk)->icsk_rto_min); memcpy(optval, &rto_min_us, optlen); break; } case TCP_BPF_DELACK_MAX: { int delack_max_us = jiffies_to_usecs(inet_csk(sk)->icsk_delack_max); memcpy(optval, &delack_max_us, optlen); break; } default: return -EINVAL; } return 0; } static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, char *optval, int optlen) { struct tcp_sock *tp = tcp_sk(sk); unsigned long timeout; int val; if (optlen != sizeof(int)) return -EINVAL; val = *(int *)optval; /* Only some options are supported */ switch (optname) { case TCP_BPF_IW: if (val <= 0 || tp->data_segs_out > tp->syn_data) return -EINVAL; tcp_snd_cwnd_set(tp, val); break; case TCP_BPF_SNDCWND_CLAMP: if (val <= 0) return -EINVAL; tp->snd_cwnd_clamp = val; tp->snd_ssthresh = val; break; case TCP_BPF_DELACK_MAX: timeout = usecs_to_jiffies(val); if (timeout > TCP_DELACK_MAX || timeout < TCP_TIMEOUT_MIN) return -EINVAL; inet_csk(sk)->icsk_delack_max = timeout; break; case TCP_BPF_RTO_MIN: timeout = usecs_to_jiffies(val); if (timeout > TCP_RTO_MIN || timeout < TCP_TIMEOUT_MIN) return -EINVAL; inet_csk(sk)->icsk_rto_min = timeout; break; case TCP_BPF_SOCK_OPS_CB_FLAGS: if (val & ~(BPF_SOCK_OPS_ALL_CB_FLAGS)) return -EINVAL; tp->bpf_sock_ops_cb_flags = val; break; default: return -EINVAL; } return 0; } static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval, int *optlen, bool getopt) { struct tcp_sock *tp; int ret; if (*optlen < 2) return -EINVAL; if (getopt) { if 
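/*
 * Editorial note (not part of the kernel sources): a sketch of the BPF side
 * of this path; a sockops program switching congestion control, which lands
 * in sol_tcp_sockopt_congestion() via _bpf_setsockopt(). SOL_TCP and
 * TCP_CONGESTION come from the usual uapi headers (includes omitted here).
 *
 *	SEC("sockops")
 *	int switch_cc(struct bpf_sock_ops *skops)
 *	{
 *		char cc[] = "bbr";
 *
 *		if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB)
 *			bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION,
 *				       cc, sizeof(cc));
 *		return 1;
 *	}
 */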
(!inet_csk(sk)->icsk_ca_ops) return -EINVAL; /* BPF expects NULL-terminated tcp-cc string */ optval[--(*optlen)] = '\0'; return do_tcp_getsockopt(sk, SOL_TCP, TCP_CONGESTION, KERNEL_SOCKPTR(optval), KERNEL_SOCKPTR(optlen)); } /* "cdg" is the only cc that allocates a ptr * in the inet_csk_ca area. The bpf-tcp-cc may * overwrite this ptr after switching to cdg. */ if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen)) return -ENOTSUPP; /* This check stops the following loop: * * .init => bpf_setsockopt(tcp_cc) => .init => * bpf_setsockopt(tcp_cc) => .init => .... * * The second bpf_setsockopt(tcp_cc) is not allowed * in order to break the loop when both .init * are the same bpf prog. * * This applies even if the second bpf_setsockopt(tcp_cc) * would not cause a loop. It means only the first * '.init' may call bpf_setsockopt(TCP_CONGESTION) to * pick a fallback cc (e.g. the peer does not support ECN), * and the second '.init' cannot fall back to * another one. */ tp = tcp_sk(sk); if (tp->bpf_chg_cc_inprogress) return -EBUSY; tp->bpf_chg_cc_inprogress = 1; ret = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION, KERNEL_SOCKPTR(optval), *optlen); tp->bpf_chg_cc_inprogress = 0; return ret; } static int sol_tcp_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { if (sk->sk_protocol != IPPROTO_TCP) return -EINVAL; switch (optname) { case TCP_NODELAY: case TCP_MAXSEG: case TCP_KEEPIDLE: case TCP_KEEPINTVL: case TCP_KEEPCNT: case TCP_SYNCNT: case TCP_WINDOW_CLAMP: case TCP_THIN_LINEAR_TIMEOUTS: case TCP_USER_TIMEOUT: case TCP_NOTSENT_LOWAT: case TCP_SAVE_SYN: case TCP_RTO_MAX_MS: if (*optlen != sizeof(int)) return -EINVAL; break; case TCP_CONGESTION: return sol_tcp_sockopt_congestion(sk, optval, optlen, getopt); case TCP_SAVED_SYN: if (*optlen < 1) return -EINVAL; break; default: if (getopt) return bpf_sol_tcp_getsockopt(sk, optname, optval, *optlen); return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen); } if (getopt) { if (optname == TCP_SAVED_SYN) { struct tcp_sock *tp = tcp_sk(sk); if (!tp->saved_syn || *optlen > tcp_saved_syn_len(tp->saved_syn)) return -EINVAL; memcpy(optval, tp->saved_syn->data, *optlen); /* tp->saved_syn cannot be freed here because it is * not known whether user space still needs it. 
*/ return 0; } return do_tcp_getsockopt(sk, SOL_TCP, optname, KERNEL_SOCKPTR(optval), KERNEL_SOCKPTR(optlen)); } return do_tcp_setsockopt(sk, SOL_TCP, optname, KERNEL_SOCKPTR(optval), *optlen); } static int sol_ip_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { if (sk->sk_family != AF_INET) return -EINVAL; switch (optname) { case IP_TOS: if (*optlen != sizeof(int)) return -EINVAL; break; default: return -EINVAL; } if (getopt) return do_ip_getsockopt(sk, SOL_IP, optname, KERNEL_SOCKPTR(optval), KERNEL_SOCKPTR(optlen)); return do_ip_setsockopt(sk, SOL_IP, optname, KERNEL_SOCKPTR(optval), *optlen); } static int sol_ipv6_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { if (sk->sk_family != AF_INET6) return -EINVAL; switch (optname) { case IPV6_TCLASS: case IPV6_AUTOFLOWLABEL: if (*optlen != sizeof(int)) return -EINVAL; break; default: return -EINVAL; } if (getopt) return do_ipv6_getsockopt(sk, SOL_IPV6, optname, KERNEL_SOCKPTR(optval), KERNEL_SOCKPTR(optlen)); return do_ipv6_setsockopt(sk, SOL_IPV6, optname, KERNEL_SOCKPTR(optval), *optlen); } static int __bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { if (!sk_fullsock(sk)) return -EINVAL; if (level == SOL_SOCKET) return sol_socket_sockopt(sk, optname, optval, &optlen, false); else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) return sol_ip_sockopt(sk, optname, optval, &optlen, false); else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) return sol_ipv6_sockopt(sk, optname, optval, &optlen, false); else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) return sol_tcp_sockopt(sk, optname, optval, &optlen, false); return -EINVAL; } static bool is_locked_tcp_sock_ops(struct bpf_sock_ops_kern *bpf_sock) { return bpf_sock->op <= BPF_SOCK_OPS_WRITE_HDR_OPT_CB; } static int _bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { if (sk_fullsock(sk)) sock_owned_by_me(sk); return __bpf_setsockopt(sk, level, optname, optval, optlen); } static int __bpf_getsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { int err, saved_optlen = optlen; if (!sk_fullsock(sk)) { err = -EINVAL; goto done; } if (level == SOL_SOCKET) err = sol_socket_sockopt(sk, optname, optval, &optlen, true); else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) err = sol_tcp_sockopt(sk, optname, optval, &optlen, true); else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) err = sol_ip_sockopt(sk, optname, optval, &optlen, true); else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) err = sol_ipv6_sockopt(sk, optname, optval, &optlen, true); else err = -EINVAL; done: if (err) optlen = 0; if (optlen < saved_optlen) memset(optval + optlen, 0, saved_optlen - optlen); return err; } static int _bpf_getsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { if (sk_fullsock(sk)) sock_owned_by_me(sk); return __bpf_getsockopt(sk, level, optname, optval, optlen); } BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { return _bpf_setsockopt(sk, level, optname, optval, optlen); } const struct bpf_func_proto bpf_sk_setsockopt_proto = { .func = bpf_sk_setsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_sk_getsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, 
optlen) { return _bpf_getsockopt(sk, level, optname, optval, optlen); } const struct bpf_func_proto bpf_sk_getsockopt_proto = { .func = bpf_sk_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_UNINIT_MEM, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { return __bpf_setsockopt(sk, level, optname, optval, optlen); } const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto = { .func = bpf_unlocked_sk_setsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_unlocked_sk_getsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { return __bpf_getsockopt(sk, level, optname, optval, optlen); } const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto = { .func = bpf_unlocked_sk_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_UNINIT_MEM, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, int, level, int, optname, char *, optval, int, optlen) { return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { .func = bpf_sock_addr_setsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx, int, level, int, optname, char *, optval, int, optlen) { return _bpf_getsockopt(ctx->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { .func = bpf_sock_addr_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_UNINIT_MEM, .arg5_type = ARG_CONST_SIZE, }; static int sk_bpf_set_get_bypass_prot_mem(struct sock *sk, char *optval, int optlen, bool getopt) { int val; if (optlen != sizeof(int)) return -EINVAL; if (!sk_has_account(sk)) return -EOPNOTSUPP; if (getopt) { *(int *)optval = sk->sk_bypass_prot_mem; return 0; } val = *(int *)optval; if (val < 0 || val > 1) return -EINVAL; sk->sk_bypass_prot_mem = val; return 0; } BPF_CALL_5(bpf_sock_create_setsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM) return sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, false); return __bpf_setsockopt(sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_create_setsockopt_proto = { .func = bpf_sock_create_setsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_sock_create_getsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM) { int err = sk_bpf_set_get_bypass_prot_mem(sk, 
optval, optlen, true); if (err) memset(optval, 0, optlen); return err; } return __bpf_getsockopt(sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_create_getsockopt_proto = { .func = bpf_sock_create_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_UNINIT_MEM, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { if (!is_locked_tcp_sock_ops(bpf_sock)) return -EOPNOTSUPP; return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { .func = bpf_sock_ops_setsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock, int optname, const u8 **start) { struct sk_buff *syn_skb = bpf_sock->syn_skb; const u8 *hdr_start; int ret; if (syn_skb) { /* sk is a request_sock here */ if (optname == TCP_BPF_SYN) { hdr_start = syn_skb->data; ret = tcp_hdrlen(syn_skb); } else if (optname == TCP_BPF_SYN_IP) { hdr_start = skb_network_header(syn_skb); ret = skb_network_header_len(syn_skb) + tcp_hdrlen(syn_skb); } else { /* optname == TCP_BPF_SYN_MAC */ hdr_start = skb_mac_header(syn_skb); ret = skb_mac_header_len(syn_skb) + skb_network_header_len(syn_skb) + tcp_hdrlen(syn_skb); } } else { struct sock *sk = bpf_sock->sk; struct saved_syn *saved_syn; if (sk->sk_state == TCP_NEW_SYN_RECV) /* synack retransmit. bpf_sock->syn_skb will * not be available. It has to resort to * saved_syn (if it is saved). 
*/ saved_syn = inet_reqsk(sk)->saved_syn; else saved_syn = tcp_sk(sk)->saved_syn; if (!saved_syn) return -ENOENT; if (optname == TCP_BPF_SYN) { hdr_start = saved_syn->data + saved_syn->mac_hdrlen + saved_syn->network_hdrlen; ret = saved_syn->tcp_hdrlen; } else if (optname == TCP_BPF_SYN_IP) { hdr_start = saved_syn->data + saved_syn->mac_hdrlen; ret = saved_syn->network_hdrlen + saved_syn->tcp_hdrlen; } else { /* optname == TCP_BPF_SYN_MAC */ /* TCP_SAVE_SYN may not have saved the mac hdr */ if (!saved_syn->mac_hdrlen) return -ENOENT; hdr_start = saved_syn->data; ret = saved_syn->mac_hdrlen + saved_syn->network_hdrlen + saved_syn->tcp_hdrlen; } } *start = hdr_start; return ret; } BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { if (!is_locked_tcp_sock_ops(bpf_sock)) return -EOPNOTSUPP; if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP && optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) { int ret, copy_len = 0; const u8 *start; ret = bpf_sock_ops_get_syn(bpf_sock, optname, &start); if (ret > 0) { copy_len = ret; if (optlen < copy_len) { copy_len = optlen; ret = -ENOSPC; } memcpy(optval, start, copy_len); } /* Zero out unused buffer at the end */ memset(optval + copy_len, 0, optlen - copy_len); return ret; } return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = { .func = bpf_sock_ops_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_UNINIT_MEM, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, int, argval) { struct sock *sk = bpf_sock->sk; int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; if (!is_locked_tcp_sock_ops(bpf_sock)) return -EOPNOTSUPP; if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk)) return -EINVAL; tcp_sk(sk)->bpf_sock_ops_cb_flags = val; return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS); } static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { .func = bpf_sock_ops_cb_flags_set, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, int, addr_len) { #ifdef CONFIG_INET struct sock *sk = ctx->sk; u32 flags = BIND_FROM_BPF; int err; err = -EINVAL; if (addr_len < offsetofend(struct sockaddr, sa_family)) return err; if (addr->sa_family == AF_INET) { if (addr_len < sizeof(struct sockaddr_in)) return err; if (((struct sockaddr_in *)addr)->sin_port == htons(0)) flags |= BIND_FORCE_ADDRESS_NO_PORT; return __inet_bind(sk, (struct sockaddr_unsized *)addr, addr_len, flags); #if IS_ENABLED(CONFIG_IPV6) } else if (addr->sa_family == AF_INET6) { if (addr_len < SIN6_LEN_RFC2133) return err; if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0)) flags |= BIND_FORCE_ADDRESS_NO_PORT; return __inet6_bind(sk, (struct sockaddr_unsized *)addr, addr_len, flags); #endif /* CONFIG_IPV6 */ } #endif /* CONFIG_INET */ return -EAFNOSUPPORT; } static const struct bpf_func_proto bpf_bind_proto = { .func = bpf_bind, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; #ifdef CONFIG_XFRM #if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) 
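/*
 * Editorial note (not part of the kernel sources): a minimal sketch of a
 * cgroup/connect4 program using bpf_bind() above to pin the source address
 * before the connect proceeds; the address is illustrative.
 *
 *	SEC("cgroup/connect4")
 *	int bind_src_addr(struct bpf_sock_addr *ctx)
 *	{
 *		struct sockaddr_in sa = {
 *			.sin_family	 = AF_INET,
 *			.sin_addr.s_addr = bpf_htonl(0x0a000001),	// 10.0.0.1
 *		};
 *
 *		// sin_port left at 0: BIND_FORCE_ADDRESS_NO_PORT is set above
 *		bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa));
 *		return 1;
 *	}
 */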
struct metadata_dst __percpu *xfrm_bpf_md_dst; EXPORT_SYMBOL_GPL(xfrm_bpf_md_dst); #endif BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index, struct bpf_xfrm_state *, to, u32, size, u64, flags) { const struct sec_path *sp = skb_sec_path(skb); const struct xfrm_state *x; if (!sp || unlikely(index >= sp->len || flags)) goto err_clear; x = sp->xvec[index]; if (unlikely(size != sizeof(struct bpf_xfrm_state))) goto err_clear; to->reqid = x->props.reqid; to->spi = x->id.spi; to->family = x->props.family; to->ext = 0; if (to->family == AF_INET6) { memcpy(to->remote_ipv6, x->props.saddr.a6, sizeof(to->remote_ipv6)); } else { to->remote_ipv4 = x->props.saddr.a4; memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); } return 0; err_clear: memset(to, 0, size); return -EINVAL; } static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = { .func = bpf_skb_get_xfrm_state, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; #endif #if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6) static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, u32 mtu) { params->h_vlan_TCI = 0; params->h_vlan_proto = 0; if (mtu) params->mtu_result = mtu; /* union with tot_len */ return 0; } #endif #if IS_ENABLED(CONFIG_INET) static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, u32 flags, bool check_mtu) { struct neighbour *neigh = NULL; struct fib_nh_common *nhc; struct in_device *in_dev; struct net_device *dev; struct fib_result res; struct flowi4 fl4; u32 mtu = 0; int err; dev = dev_get_by_index_rcu(net, params->ifindex); if (unlikely(!dev)) return -ENODEV; /* verify forwarding is enabled on this interface */ in_dev = __in_dev_get_rcu(dev); if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev))) return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { fl4.flowi4_iif = 1; fl4.flowi4_oif = params->ifindex; } else { fl4.flowi4_iif = params->ifindex; fl4.flowi4_oif = 0; } fl4.flowi4_dscp = inet_dsfield_to_dscp(params->tos); fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = 0; fl4.flowi4_proto = params->l4_protocol; fl4.daddr = params->ipv4_dst; fl4.saddr = params->ipv4_src; fl4.fl4_sport = params->sport; fl4.fl4_dport = params->dport; fl4.flowi4_multipath_hash = 0; if (flags & BPF_FIB_LOOKUP_DIRECT) { u32 tbid = l3mdev_fib_table_rcu(dev) ? 
: RT_TABLE_MAIN; struct fib_table *tb; if (flags & BPF_FIB_LOOKUP_TBID) { tbid = params->tbid; /* zero out for vlan output */ params->tbid = 0; } tb = fib_get_table(net, tbid); if (unlikely(!tb)) return BPF_FIB_LKUP_RET_NOT_FWDED; err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); } else { if (flags & BPF_FIB_LOOKUP_MARK) fl4.flowi4_mark = params->mark; else fl4.flowi4_mark = 0; fl4.flowi4_secid = 0; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_uid = sock_net_uid(net, NULL); err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); } if (err) { /* map fib lookup errors to RTN_ type */ if (err == -EINVAL) return BPF_FIB_LKUP_RET_BLACKHOLE; if (err == -EHOSTUNREACH) return BPF_FIB_LKUP_RET_UNREACHABLE; if (err == -EACCES) return BPF_FIB_LKUP_RET_PROHIBIT; return BPF_FIB_LKUP_RET_NOT_FWDED; } if (res.type != RTN_UNICAST) return BPF_FIB_LKUP_RET_NOT_FWDED; if (fib_info_num_path(res.fi) > 1) fib_select_path(net, &res, &fl4, NULL); if (check_mtu) { mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); if (params->tot_len > mtu) { params->mtu_result = mtu; /* union with tot_len */ return BPF_FIB_LKUP_RET_FRAG_NEEDED; } } nhc = res.nhc; /* do not handle lwt encaps right now */ if (nhc->nhc_lwtstate) return BPF_FIB_LKUP_RET_UNSUPP_LWT; dev = nhc->nhc_dev; params->rt_metric = res.fi->fib_priority; params->ifindex = dev->ifindex; if (flags & BPF_FIB_LOOKUP_SRC) params->ipv4_src = fib_result_prefsrc(net, &res); /* xdp and cls_bpf programs are run in RCU-bh so * rcu_read_lock_bh is not needed here */ if (likely(nhc->nhc_gw_family != AF_INET6)) { if (nhc->nhc_gw_family) params->ipv4_dst = nhc->nhc_gw.ipv4; } else { struct in6_addr *dst = (struct in6_addr *)params->ipv6_dst; params->family = AF_INET6; *dst = nhc->nhc_gw.ipv6; } if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH) goto set_fwd_params; if (likely(nhc->nhc_gw_family != AF_INET6)) neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst); else if (IS_ENABLED(CONFIG_IPV6)) neigh = __ipv6_neigh_lookup_noref(dev, params->ipv6_dst); if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID)) return BPF_FIB_LKUP_RET_NO_NEIGH; memcpy(params->dmac, neigh->ha, ETH_ALEN); memcpy(params->smac, dev->dev_addr, ETH_ALEN); set_fwd_params: return bpf_fib_set_fwd_params(params, mtu); } #endif #if IS_ENABLED(CONFIG_IPV6) static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, u32 flags, bool check_mtu) { struct in6_addr *src = (struct in6_addr *) params->ipv6_src; struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst; struct fib6_result res = {}; struct neighbour *neigh; struct net_device *dev; struct inet6_dev *idev; struct flowi6 fl6; int strict = 0; int oif, err; u32 mtu = 0; /* link local addresses are never forwarded */ if (rt6_need_strict(dst) || rt6_need_strict(src)) return BPF_FIB_LKUP_RET_NOT_FWDED; dev = dev_get_by_index_rcu(net, params->ifindex); if (unlikely(!dev)) return -ENODEV; idev = __in6_dev_get_safely(dev); if (unlikely(!idev || !READ_ONCE(idev->cnf.forwarding))) return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { fl6.flowi6_iif = 1; oif = fl6.flowi6_oif = params->ifindex; } else { oif = fl6.flowi6_iif = params->ifindex; fl6.flowi6_oif = 0; strict = RT6_LOOKUP_F_HAS_SADDR; } fl6.flowlabel = params->flowinfo; fl6.flowi6_scope = 0; fl6.flowi6_flags = 0; fl6.mp_hash = 0; fl6.flowi6_proto = params->l4_protocol; fl6.daddr = *dst; fl6.saddr = *src; fl6.fl6_sport = params->sport; fl6.fl6_dport = params->dport; if (flags & BPF_FIB_LOOKUP_DIRECT) { u32 tbid = l3mdev_fib_table_rcu(dev) ? 
: RT_TABLE_MAIN; struct fib6_table *tb; if (flags & BPF_FIB_LOOKUP_TBID) { tbid = params->tbid; /* zero out for vlan output */ params->tbid = 0; } tb = fib6_get_table(net, tbid); if (unlikely(!tb)) return BPF_FIB_LKUP_RET_NOT_FWDED; err = fib6_table_lookup(net, tb, oif, &fl6, &res, strict); } else { if (flags & BPF_FIB_LOOKUP_MARK) fl6.flowi6_mark = params->mark; else fl6.flowi6_mark = 0; fl6.flowi6_secid = 0; fl6.flowi6_tun_key.tun_id = 0; fl6.flowi6_uid = sock_net_uid(net, NULL); err = fib6_lookup(net, oif, &fl6, &res, strict); } if (unlikely(err || IS_ERR_OR_NULL(res.f6i) || res.f6i == net->ipv6.fib6_null_entry)) return BPF_FIB_LKUP_RET_NOT_FWDED; switch (res.fib6_type) { /* only unicast is forwarded */ case RTN_UNICAST: break; case RTN_BLACKHOLE: return BPF_FIB_LKUP_RET_BLACKHOLE; case RTN_UNREACHABLE: return BPF_FIB_LKUP_RET_UNREACHABLE; case RTN_PROHIBIT: return BPF_FIB_LKUP_RET_PROHIBIT; default: return BPF_FIB_LKUP_RET_NOT_FWDED; } fib6_select_path(net, &res, &fl6, fl6.flowi6_oif, fl6.flowi6_oif != 0, NULL, strict); if (check_mtu) { mtu = ip6_mtu_from_fib6(&res, dst, src); if (params->tot_len > mtu) { params->mtu_result = mtu; /* union with tot_len */ return BPF_FIB_LKUP_RET_FRAG_NEEDED; } } if (res.nh->fib_nh_lws) return BPF_FIB_LKUP_RET_UNSUPP_LWT; if (res.nh->fib_nh_gw_family) *dst = res.nh->fib_nh_gw6; dev = res.nh->fib_nh_dev; params->rt_metric = res.f6i->fib6_metric; params->ifindex = dev->ifindex; if (flags & BPF_FIB_LOOKUP_SRC) { if (res.f6i->fib6_prefsrc.plen) { *src = res.f6i->fib6_prefsrc.addr; } else { err = ipv6_dev_get_saddr(net, dev, &fl6.daddr, 0, src); if (err) return BPF_FIB_LKUP_RET_NO_SRC_ADDR; } } if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH) goto set_fwd_params; /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is * not needed here. 
*/ neigh = __ipv6_neigh_lookup_noref(dev, dst); if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID)) return BPF_FIB_LKUP_RET_NO_NEIGH; memcpy(params->dmac, neigh->ha, ETH_ALEN); memcpy(params->smac, dev->dev_addr, ETH_ALEN); set_fwd_params: return bpf_fib_set_fwd_params(params, mtu); } #endif #define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \ BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \ BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_MARK) BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, struct bpf_fib_lookup *, params, int, plen, u32, flags) { if (plen < sizeof(*params)) return -EINVAL; if (flags & ~BPF_FIB_LOOKUP_MASK) return -EINVAL; switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET: return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params, flags, true); #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params, flags, true); #endif } return -EAFNOSUPPORT; } static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = { .func = bpf_xdp_fib_lookup, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, struct bpf_fib_lookup *, params, int, plen, u32, flags) { struct net *net = dev_net(skb->dev); int rc = -EAFNOSUPPORT; bool check_mtu = false; if (plen < sizeof(*params)) return -EINVAL; if (flags & ~BPF_FIB_LOOKUP_MASK) return -EINVAL; if (params->tot_len) check_mtu = true; switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET: rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu); break; #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu); break; #endif } if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) { struct net_device *dev; /* When tot_len isn't provided by user, check skb * against MTU of FIB lookup resulting net_device */ dev = dev_get_by_index_rcu(net, params->ifindex); if (!is_skb_forwardable(dev, skb)) rc = BPF_FIB_LKUP_RET_FRAG_NEEDED; params->mtu_result = dev->mtu; /* union with tot_len */ } return rc; } static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { .func = bpf_skb_fib_lookup, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; static struct net_device *__dev_via_ifindex(struct net_device *dev_curr, u32 ifindex) { struct net *netns = dev_net(dev_curr); /* Non-redirect use-cases can use ifindex=0 and save ifindex lookup */ if (ifindex == 0) return dev_curr; return dev_get_by_index_rcu(netns, ifindex); } BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb, u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags) { int ret = BPF_MTU_CHK_RET_FRAG_NEEDED; struct net_device *dev = skb->dev; int mtu, dev_len, skb_len; if (unlikely(flags & ~(BPF_MTU_CHK_SEGS))) return -EINVAL; if (unlikely(flags & BPF_MTU_CHK_SEGS && (len_diff || *mtu_len))) return -EINVAL; dev = __dev_via_ifindex(dev, ifindex); if (unlikely(!dev)) return -ENODEV; mtu = READ_ONCE(dev->mtu); dev_len = mtu + dev->hard_header_len; /* If set use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */ skb_len = *mtu_len ? 
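/*
 * Editorial note (not part of the kernel sources): a sketch of an XDP
 * forwarder built on bpf_fib_lookup() above; TTL update and IPv6 handling
 * are omitted for brevity.
 *
 *	SEC("xdp")
 *	int xdp_fwd(struct xdp_md *ctx)
 *	{
 *		void *data_end = (void *)(long)ctx->data_end;
 *		void *data = (void *)(long)ctx->data;
 *		struct ethhdr *eth = data;
 *		struct iphdr *iph = data + sizeof(*eth);
 *		struct bpf_fib_lookup fib = {};
 *
 *		if ((void *)(iph + 1) > data_end ||
 *		    eth->h_proto != bpf_htons(ETH_P_IP))
 *			return XDP_PASS;
 *
 *		fib.family	= AF_INET;
 *		fib.l4_protocol	= iph->protocol;
 *		fib.tot_len	= bpf_ntohs(iph->tot_len);
 *		fib.ipv4_src	= iph->saddr;
 *		fib.ipv4_dst	= iph->daddr;
 *		fib.ifindex	= ctx->ingress_ifindex;
 *
 *		if (bpf_fib_lookup(ctx, &fib, sizeof(fib), 0) !=
 *		    BPF_FIB_LKUP_RET_SUCCESS)
 *			return XDP_PASS;
 *
 *		__builtin_memcpy(eth->h_dest, fib.dmac, ETH_ALEN);
 *		__builtin_memcpy(eth->h_source, fib.smac, ETH_ALEN);
 *		return bpf_redirect(fib.ifindex, 0);
 *	}
 */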
*mtu_len + dev->hard_header_len : skb->len; skb_len += len_diff; /* minus result pass check */ if (skb_len <= dev_len) { ret = BPF_MTU_CHK_RET_SUCCESS; goto out; } /* At this point, skb->len exceed MTU, but as it include length of all * segments, it can still be below MTU. The SKB can possibly get * re-segmented in transmit path (see validate_xmit_skb). Thus, user * must choose if segs are to be MTU checked. */ if (skb_is_gso(skb)) { ret = BPF_MTU_CHK_RET_SUCCESS; if (flags & BPF_MTU_CHK_SEGS) { if (!skb_transport_header_was_set(skb)) return -EINVAL; if (!skb_gso_validate_network_len(skb, mtu)) ret = BPF_MTU_CHK_RET_SEGS_TOOBIG; } } out: *mtu_len = mtu; return ret; } BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp, u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags) { struct net_device *dev = xdp->rxq->dev; int xdp_len = xdp->data_end - xdp->data; int ret = BPF_MTU_CHK_RET_SUCCESS; int mtu, dev_len; /* XDP variant doesn't support multi-buffer segment check (yet) */ if (unlikely(flags)) return -EINVAL; dev = __dev_via_ifindex(dev, ifindex); if (unlikely(!dev)) return -ENODEV; mtu = READ_ONCE(dev->mtu); dev_len = mtu + dev->hard_header_len; /* Use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */ if (*mtu_len) xdp_len = *mtu_len + dev->hard_header_len; xdp_len += len_diff; /* minus result pass check */ if (xdp_len > dev_len) ret = BPF_MTU_CHK_RET_FRAG_NEEDED; *mtu_len = mtu; return ret; } static const struct bpf_func_proto bpf_skb_check_mtu_proto = { .func = bpf_skb_check_mtu, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED, .arg3_size = sizeof(u32), .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; static const struct bpf_func_proto bpf_xdp_check_mtu_proto = { .func = bpf_xdp_check_mtu, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED, .arg3_size = sizeof(u32), .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) { int err; struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr; if (!seg6_validate_srh(srh, len, false)) return -EINVAL; switch (type) { case BPF_LWT_ENCAP_SEG6_INLINE: if (skb->protocol != htons(ETH_P_IPV6)) return -EBADMSG; err = seg6_do_srh_inline(skb, srh); break; case BPF_LWT_ENCAP_SEG6: skb_reset_inner_headers(skb); skb->encapsulation = 1; err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6); break; default: return -EINVAL; } bpf_compute_data_pointers(skb); if (err) return err; skb_set_transport_header(skb, sizeof(struct ipv6hdr)); return seg6_lookup_nexthop(skb, NULL, 0); } #endif /* CONFIG_IPV6_SEG6_BPF */ #if IS_ENABLED(CONFIG_LWTUNNEL_BPF) static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress) { return bpf_lwt_push_ip_encap(skb, hdr, len, ingress); } #endif BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, u32, len) { switch (type) { #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) case BPF_LWT_ENCAP_SEG6: case BPF_LWT_ENCAP_SEG6_INLINE: return bpf_push_seg6_encap(skb, type, hdr, len); #endif #if IS_ENABLED(CONFIG_LWTUNNEL_BPF) case BPF_LWT_ENCAP_IP: return bpf_push_ip_encap(skb, hdr, len, true /* ingress */); #endif default: return -EINVAL; } } BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, u32, len) { switch (type) { 
#if IS_ENABLED(CONFIG_LWTUNNEL_BPF) case BPF_LWT_ENCAP_IP: return bpf_push_ip_encap(skb, hdr, len, false /* egress */); #endif default: return -EINVAL; } } static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = { .func = bpf_lwt_in_push_encap, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = { .func = bpf_lwt_xmit_push_encap, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, const void *, from, u32, len) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); struct ipv6_sr_hdr *srh = srh_state->srh; void *srh_tlvs, *srh_end, *ptr; int srhoff = 0; lockdep_assert_held(&srh_state->bh_lock); if (srh == NULL) return -EINVAL; srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4)); srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen); ptr = skb->data + offset; if (ptr >= srh_tlvs && ptr + len <= srh_end) srh_state->valid = false; else if (ptr < (void *)&srh->flags || ptr + len > (void *)&srh->segments) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + len))) return -EFAULT; if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) return -EINVAL; srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); memcpy(skb->data + offset, from, len); return 0; } static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { .func = bpf_lwt_seg6_store_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; static void bpf_update_srh_state(struct sk_buff *skb) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); int srhoff = 0; if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) { srh_state->srh = NULL; } else { srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); srh_state->hdrlen = srh_state->srh->hdrlen << 3; srh_state->valid = true; } } BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, u32, action, void *, param, u32, param_len) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); int hdroff = 0; int err; lockdep_assert_held(&srh_state->bh_lock); switch (action) { case SEG6_LOCAL_ACTION_END_X: if (!seg6_bpf_has_valid_srh(skb)) return -EBADMSG; if (param_len != sizeof(struct in6_addr)) return -EINVAL; return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0); case SEG6_LOCAL_ACTION_END_T: if (!seg6_bpf_has_valid_srh(skb)) return -EBADMSG; if (param_len != sizeof(int)) return -EINVAL; return seg6_lookup_nexthop(skb, NULL, *(int *)param); case SEG6_LOCAL_ACTION_END_DT6: if (!seg6_bpf_has_valid_srh(skb)) return -EBADMSG; if (param_len != sizeof(int)) return -EINVAL; if (ipv6_find_hdr(skb, &hdroff, IPPROTO_IPV6, NULL, NULL) < 0) return -EBADMSG; if (!pskb_pull(skb, hdroff)) return -EBADMSG; skb_postpull_rcsum(skb, skb_network_header(skb), hdroff); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb->encapsulation = 0; bpf_compute_data_pointers(skb); bpf_update_srh_state(skb); return seg6_lookup_nexthop(skb, NULL, *(int *)param); case SEG6_LOCAL_ACTION_END_B6: if (srh_state->srh && 
!seg6_bpf_has_valid_srh(skb)) return -EBADMSG; err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE, param, param_len); if (!err) bpf_update_srh_state(skb); return err; case SEG6_LOCAL_ACTION_END_B6_ENCAP: if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) return -EBADMSG; err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6, param, param_len); if (!err) bpf_update_srh_state(skb); return err; default: return -EINVAL; } } static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { .func = bpf_lwt_seg6_action, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, s32, len) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); struct ipv6_sr_hdr *srh = srh_state->srh; void *srh_end, *srh_tlvs, *ptr; struct ipv6hdr *hdr; int srhoff = 0; int ret; lockdep_assert_held(&srh_state->bh_lock); if (unlikely(srh == NULL)) return -EINVAL; srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) + ((srh->first_segment + 1) << 4)); srh_end = (void *)((unsigned char *)srh + sizeof(*srh) + srh_state->hdrlen); ptr = skb->data + offset; if (unlikely(ptr < srh_tlvs || ptr > srh_end)) return -EFAULT; if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end)) return -EFAULT; if (len > 0) { ret = skb_cow_head(skb, len); if (unlikely(ret < 0)) return ret; ret = bpf_skb_net_hdr_push(skb, offset, len); } else { ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len); } bpf_compute_data_pointers(skb); if (unlikely(ret < 0)) return ret; hdr = (struct ipv6hdr *)skb->data; hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) return -EINVAL; srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); srh_state->hdrlen += len; srh_state->valid = false; return 0; } static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { .func = bpf_lwt_seg6_adjust_srh, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; #endif /* CONFIG_IPV6_SEG6_BPF */ #ifdef CONFIG_INET static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, int dif, int sdif, u8 family, u8 proto) { bool refcounted = false; struct sock *sk = NULL; if (family == AF_INET) { __be32 src4 = tuple->ipv4.saddr; __be32 dst4 = tuple->ipv4.daddr; if (proto == IPPROTO_TCP) sk = __inet_lookup(net, NULL, 0, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, &refcounted); else sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, NULL); #if IS_ENABLED(CONFIG_IPV6) } else { struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; if (proto == IPPROTO_TCP) sk = __inet6_lookup(net, NULL, 0, src6, tuple->ipv6.sport, dst6, ntohs(tuple->ipv6.dport), dif, sdif, &refcounted); else if (likely(ipv6_mod_enabled())) sk = __udp6_lib_lookup(net, src6, tuple->ipv6.sport, dst6, tuple->ipv6.dport, dif, sdif, NULL); #endif } if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) { WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); sk = NULL; } return sk; } /* bpf_skc_lookup performs the core lookup for different types of sockets, * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE. 
*/ static struct sock * __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, u64 flags, int sdif) { struct sock *sk = NULL; struct net *net; u8 family; if (len == sizeof(tuple->ipv4)) family = AF_INET; else if (len == sizeof(tuple->ipv6)) family = AF_INET6; else return NULL; if (unlikely(flags || !((s32)netns_id < 0 || netns_id <= S32_MAX))) goto out; if (sdif < 0) { if (family == AF_INET) sdif = inet_sdif(skb); else sdif = inet6_sdif(skb); } if ((s32)netns_id < 0) { net = caller_net; sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); } else { net = get_net_ns_by_id(caller_net, netns_id); if (unlikely(!net)) goto out; sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); put_net(net); } out: return sk; } static struct sock * __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, u64 flags, int sdif) { struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto, netns_id, flags, sdif); if (sk) { struct sock *sk2 = sk_to_full_sk(sk); /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk * sock refcnt is decremented to prevent a request_sock leak. */ if (sk2 != sk) { sock_gen_put(sk); /* Ensure there is no need to bump sk2 refcnt */ if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) { WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); return NULL; } sk = sk2; } } return sk; } static struct sock * bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, u8 proto, u64 netns_id, u64 flags) { struct net *caller_net; int ifindex; if (skb->dev) { caller_net = dev_net(skb->dev); ifindex = skb->dev->ifindex; } else { caller_net = sock_net(skb->sk); ifindex = 0; } return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto, netns_id, flags, -1); } static struct sock * bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, u8 proto, u64 netns_id, u64 flags) { struct sock *sk = bpf_skc_lookup(skb, tuple, len, proto, netns_id, flags); if (sk) { struct sock *sk2 = sk_to_full_sk(sk); /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk * sock refcnt is decremented to prevent a request_sock leak. 
*/ if (sk2 != sk) { sock_gen_put(sk); /* Ensure there is no need to bump sk2 refcnt */ if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) { WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); return NULL; } sk = sk2; } } return sk; } BPF_CALL_5(bpf_skc_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)bpf_skc_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags); } static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = { .func = bpf_skc_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags); } static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { .func = bpf_sk_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags); } static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { .func = bpf_sk_lookup_udp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_tc_skc_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { struct net_device *dev = skb->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = { .func = bpf_tc_skc_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_tc_sk_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { struct net_device *dev = skb->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_tc_sk_lookup_tcp_proto = { .func = bpf_tc_sk_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_tc_sk_lookup_udp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { struct net_device *dev = skb->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return 
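/*
 * Editorial note (not part of the kernel sources): a sketch of how a tc
 * program uses the socket lookup helpers above; the verifier enforces that
 * any socket returned by bpf_sk_lookup_tcp() is released again with
 * bpf_sk_release() (defined just below). Tuple values are illustrative.
 *
 *	SEC("tc")
 *	int probe_listener(struct __sk_buff *skb)
 *	{
 *		struct bpf_sock_tuple tuple = {
 *			.ipv4.daddr = bpf_htonl(0x7f000001),	// 127.0.0.1
 *			.ipv4.dport = bpf_htons(80),
 *		};
 *		struct bpf_sock *sk;
 *
 *		sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
 *				       BPF_F_CURRENT_NETNS, 0);
 *		if (sk)
 *			bpf_sk_release(sk);
 *		return TC_ACT_OK;
 *	}
 */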
(unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net, ifindex, IPPROTO_UDP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_tc_sk_lookup_udp_proto = { .func = bpf_tc_sk_lookup_udp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_sk_release, struct sock *, sk) { if (sk && sk_is_refcounted(sk)) sock_gen_put(sk); return 0; } static const struct bpf_func_proto bpf_sk_release_proto = { .func = bpf_sk_release, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | OBJ_RELEASE, }; BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) { struct net_device *dev = ctx->rxq->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, IPPROTO_UDP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { .func = bpf_xdp_sk_lookup_udp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) { struct net_device *dev = ctx->rxq->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = { .func = bpf_xdp_skc_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) { struct net_device *dev = ctx->rxq->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { .func = bpf_xdp_sk_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, IPPROTO_TCP, netns_id, flags, -1); } static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = { .func = bpf_sock_addr_skc_lookup_tcp, .gpl_only = false, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct 
bpf_sock_addr_kern *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, IPPROTO_TCP, netns_id, flags, -1); } static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { .func = bpf_sock_addr_sk_lookup_tcp, .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, IPPROTO_UDP, netns_id, flags, -1); } static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { .func = bpf_sock_addr_sk_lookup_udp, .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, icsk_retransmits)) return false; if (off % size != 0) return false; switch (off) { case offsetof(struct bpf_tcp_sock, bytes_received): case offsetof(struct bpf_tcp_sock, bytes_acked): return size == sizeof(__u64); default: return size == sizeof(__u32); } } u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; #define BPF_TCP_SOCK_GET_COMMON(FIELD) \ do { \ BUILD_BUG_ON(sizeof_field(struct tcp_sock, FIELD) > \ sizeof_field(struct bpf_tcp_sock, FIELD)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\ si->dst_reg, si->src_reg, \ offsetof(struct tcp_sock, FIELD)); \ } while (0) #define BPF_INET_SOCK_GET_COMMON(FIELD) \ do { \ BUILD_BUG_ON(sizeof_field(struct inet_connection_sock, \ FIELD) > \ sizeof_field(struct bpf_tcp_sock, FIELD)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct inet_connection_sock, \ FIELD), \ si->dst_reg, si->src_reg, \ offsetof( \ struct inet_connection_sock, \ FIELD)); \ } while (0) BTF_TYPE_EMIT(struct bpf_tcp_sock); switch (si->off) { case offsetof(struct bpf_tcp_sock, rtt_min): BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) != sizeof(struct minmax)); BUILD_BUG_ON(sizeof(struct minmax) < sizeof(struct minmax_sample)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct tcp_sock, rtt_min) + offsetof(struct minmax_sample, v)); break; case offsetof(struct bpf_tcp_sock, snd_cwnd): BPF_TCP_SOCK_GET_COMMON(snd_cwnd); break; case offsetof(struct bpf_tcp_sock, srtt_us): BPF_TCP_SOCK_GET_COMMON(srtt_us); break; case offsetof(struct bpf_tcp_sock, snd_ssthresh): BPF_TCP_SOCK_GET_COMMON(snd_ssthresh); break; case offsetof(struct bpf_tcp_sock, rcv_nxt): BPF_TCP_SOCK_GET_COMMON(rcv_nxt); break; case offsetof(struct bpf_tcp_sock, snd_nxt): BPF_TCP_SOCK_GET_COMMON(snd_nxt); break; case offsetof(struct bpf_tcp_sock, snd_una): BPF_TCP_SOCK_GET_COMMON(snd_una); break; case offsetof(struct bpf_tcp_sock, mss_cache): BPF_TCP_SOCK_GET_COMMON(mss_cache); break; case offsetof(struct bpf_tcp_sock, ecn_flags): BPF_TCP_SOCK_GET_COMMON(ecn_flags); break; case offsetof(struct bpf_tcp_sock, rate_delivered): 
BPF_TCP_SOCK_GET_COMMON(rate_delivered); break; case offsetof(struct bpf_tcp_sock, rate_interval_us): BPF_TCP_SOCK_GET_COMMON(rate_interval_us); break; case offsetof(struct bpf_tcp_sock, packets_out): BPF_TCP_SOCK_GET_COMMON(packets_out); break; case offsetof(struct bpf_tcp_sock, retrans_out): BPF_TCP_SOCK_GET_COMMON(retrans_out); break; case offsetof(struct bpf_tcp_sock, total_retrans): BPF_TCP_SOCK_GET_COMMON(total_retrans); break; case offsetof(struct bpf_tcp_sock, segs_in): BPF_TCP_SOCK_GET_COMMON(segs_in); break; case offsetof(struct bpf_tcp_sock, data_segs_in): BPF_TCP_SOCK_GET_COMMON(data_segs_in); break; case offsetof(struct bpf_tcp_sock, segs_out): BPF_TCP_SOCK_GET_COMMON(segs_out); break; case offsetof(struct bpf_tcp_sock, data_segs_out): BPF_TCP_SOCK_GET_COMMON(data_segs_out); break; case offsetof(struct bpf_tcp_sock, lost_out): BPF_TCP_SOCK_GET_COMMON(lost_out); break; case offsetof(struct bpf_tcp_sock, sacked_out): BPF_TCP_SOCK_GET_COMMON(sacked_out); break; case offsetof(struct bpf_tcp_sock, bytes_received): BPF_TCP_SOCK_GET_COMMON(bytes_received); break; case offsetof(struct bpf_tcp_sock, bytes_acked): BPF_TCP_SOCK_GET_COMMON(bytes_acked); break; case offsetof(struct bpf_tcp_sock, dsack_dups): BPF_TCP_SOCK_GET_COMMON(dsack_dups); break; case offsetof(struct bpf_tcp_sock, delivered): BPF_TCP_SOCK_GET_COMMON(delivered); break; case offsetof(struct bpf_tcp_sock, delivered_ce): BPF_TCP_SOCK_GET_COMMON(delivered_ce); break; case offsetof(struct bpf_tcp_sock, icsk_retransmits): BPF_INET_SOCK_GET_COMMON(icsk_retransmits); break; } return insn - insn_buf; } BPF_CALL_1(bpf_tcp_sock, struct sock *, sk) { if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) return (unsigned long)sk; return (unsigned long)NULL; } const struct bpf_func_proto bpf_tcp_sock_proto = { .func = bpf_tcp_sock, .gpl_only = false, .ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL, .arg1_type = ARG_PTR_TO_SOCK_COMMON, }; BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk) { sk = sk_to_full_sk(sk); if (sk && sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE)) return (unsigned long)sk; return (unsigned long)NULL; } static const struct bpf_func_proto bpf_get_listener_sock_proto = { .func = bpf_get_listener_sock, .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_SOCK_COMMON, }; BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) { unsigned int iphdr_len; switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): iphdr_len = sizeof(struct iphdr); break; case cpu_to_be16(ETH_P_IPV6): iphdr_len = sizeof(struct ipv6hdr); break; default: return 0; } if (skb_headlen(skb) < iphdr_len) return 0; if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len)) return 0; return INET_ECN_set_ce(skb); } bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { if (off < 0 || off >= offsetofend(struct bpf_xdp_sock, queue_id)) return false; if (off % size != 0) return false; switch (off) { default: return size == sizeof(__u32); } } u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; #define BPF_XDP_SOCK_GET(FIELD) \ do { \ BUILD_BUG_ON(sizeof_field(struct xdp_sock, FIELD) > \ sizeof_field(struct bpf_xdp_sock, FIELD)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\ si->dst_reg, si->src_reg, \ offsetof(struct xdp_sock, FIELD)); \ } while (0) BTF_TYPE_EMIT(struct bpf_xdp_sock); switch 
(si->off) { case offsetof(struct bpf_xdp_sock, queue_id): BPF_XDP_SOCK_GET(queue_id); break; } return insn - insn_buf; } static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = { .func = bpf_skb_ecn_set_ce, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len, struct tcphdr *, th, u32, th_len) { #ifdef CONFIG_SYN_COOKIES int ret; if (unlikely(!sk || th_len < sizeof(*th))) return -EINVAL; /* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. */ if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) return -EINVAL; if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies)) return -EINVAL; if (!th->ack || th->rst || th->syn) return -ENOENT; if (unlikely(iph_len < sizeof(struct iphdr))) return -EINVAL; if (tcp_synq_no_recent_overflow(sk)) return -ENOENT; /* Both struct iphdr and struct ipv6hdr have the version field at the * same offset so we can cast to the shorter header (struct iphdr). */ switch (((struct iphdr *)iph)->version) { case 4: if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk)) return -EINVAL; ret = __cookie_v4_check((struct iphdr *)iph, th); break; #if IS_ENABLED(CONFIG_IPV6) case 6: if (unlikely(iph_len < sizeof(struct ipv6hdr))) return -EINVAL; if (sk->sk_family != AF_INET6) return -EINVAL; ret = __cookie_v6_check((struct ipv6hdr *)iph, th); break; #endif /* CONFIG_IPV6 */ default: return -EPROTONOSUPPORT; } if (ret > 0) return 0; return -ENOENT; #else return -ENOTSUPP; #endif } static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = { .func = bpf_tcp_check_syncookie, .gpl_only = true, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len, struct tcphdr *, th, u32, th_len) { #ifdef CONFIG_SYN_COOKIES u32 cookie; u16 mss; if (unlikely(!sk || th_len < sizeof(*th) || th_len != th->doff * 4)) return -EINVAL; if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) return -EINVAL; if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies)) return -ENOENT; if (!th->syn || th->ack || th->fin || th->rst) return -EINVAL; if (unlikely(iph_len < sizeof(struct iphdr))) return -EINVAL; /* Both struct iphdr and struct ipv6hdr have the version field at the * same offset so we can cast to the shorter header (struct iphdr). 
*/ switch (((struct iphdr *)iph)->version) { case 4: if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk)) return -EINVAL; mss = tcp_v4_get_syncookie(sk, iph, th, &cookie); break; #if IS_ENABLED(CONFIG_IPV6) case 6: if (unlikely(iph_len < sizeof(struct ipv6hdr))) return -EINVAL; if (sk->sk_family != AF_INET6) return -EINVAL; mss = tcp_v6_get_syncookie(sk, iph, th, &cookie); break; #endif /* CONFIG_IPV6 */ default: return -EPROTONOSUPPORT; } if (mss == 0) return -ENOENT; return cookie | ((u64)mss << 32); #else return -EOPNOTSUPP; #endif /* CONFIG_SYN_COOKIES */ } static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { .func = bpf_tcp_gen_syncookie, .gpl_only = true, /* __cookie_v*_init_sequence() is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags) { if (!sk || flags != 0) return -EINVAL; if (!skb_at_tc_ingress(skb)) return -EOPNOTSUPP; if (unlikely(dev_net(skb->dev) != sock_net(sk))) return -ENETUNREACH; if (sk_unhashed(sk)) return -EOPNOTSUPP; if (sk_is_refcounted(sk) && unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) return -ENOENT; skb_orphan(skb); skb->sk = sk; skb->destructor = sock_pfree; return 0; } static const struct bpf_func_proto bpf_sk_assign_proto = { .func = bpf_sk_assign, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg3_type = ARG_ANYTHING, }; static const u8 *bpf_search_tcp_opt(const u8 *op, const u8 *opend, u8 search_kind, const u8 *magic, u8 magic_len, bool *eol) { u8 kind, kind_len; *eol = false; while (op < opend) { kind = op[0]; if (kind == TCPOPT_EOL) { *eol = true; return ERR_PTR(-ENOMSG); } else if (kind == TCPOPT_NOP) { op++; continue; } if (opend - op < 2 || opend - op < op[1] || op[1] < 2) /* Something is wrong in the received header. * Follow the TCP stack's tcp_parse_options() * and just bail here. */ return ERR_PTR(-EFAULT); kind_len = op[1]; if (search_kind == kind) { if (!magic_len) return op; if (magic_len > kind_len - 2) return ERR_PTR(-ENOMSG); if (!memcmp(&op[2], magic, magic_len)) return op; } op += kind_len; } return ERR_PTR(-ENOMSG); } BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, void *, search_res, u32, len, u64, flags) { bool eol, load_syn = flags & BPF_LOAD_HDR_OPT_TCP_SYN; const u8 *op, *opend, *magic, *search = search_res; u8 search_kind, search_len, copy_len, magic_len; int ret; if (!is_locked_tcp_sock_ops(bpf_sock)) return -EOPNOTSUPP; /* 2 byte is the minimal option len except TCPOPT_NOP and * TCPOPT_EOL which are useless for the bpf prog to learn * and this helper disallow loading them also. */ if (len < 2 || flags & ~BPF_LOAD_HDR_OPT_TCP_SYN) return -EINVAL; search_kind = search[0]; search_len = search[1]; if (search_len > len || search_kind == TCPOPT_NOP || search_kind == TCPOPT_EOL) return -EINVAL; if (search_kind == TCPOPT_EXP || search_kind == 253) { /* 16 or 32 bit magic. 
+2 for kind and kind length */ if (search_len != 4 && search_len != 6) return -EINVAL; magic = &search[2]; magic_len = search_len - 2; } else { if (search_len) return -EINVAL; magic = NULL; magic_len = 0; } if (load_syn) { ret = bpf_sock_ops_get_syn(bpf_sock, TCP_BPF_SYN, &op); if (ret < 0) return ret; opend = op + ret; op += sizeof(struct tcphdr); } else { if (!bpf_sock->skb || bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB) /* This bpf_sock->op cannot call this helper */ return -EPERM; opend = bpf_sock->skb_data_end; op = bpf_sock->skb->data + sizeof(struct tcphdr); } op = bpf_search_tcp_opt(op, opend, search_kind, magic, magic_len, &eol); if (IS_ERR(op)) return PTR_ERR(op); copy_len = op[1]; ret = copy_len; if (copy_len > len) { ret = -ENOSPC; copy_len = len; } memcpy(search_res, op, copy_len); return ret; } static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = { .func = bpf_sock_ops_load_hdr_opt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, const void *, from, u32, len, u64, flags) { u8 new_kind, new_kind_len, magic_len = 0, *opend; const u8 *op, *new_op, *magic = NULL; struct sk_buff *skb; bool eol; if (bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB) return -EPERM; if (len < 2 || flags) return -EINVAL; new_op = from; new_kind = new_op[0]; new_kind_len = new_op[1]; if (new_kind_len > len || new_kind == TCPOPT_NOP || new_kind == TCPOPT_EOL) return -EINVAL; if (new_kind_len > bpf_sock->remaining_opt_len) return -ENOSPC; /* 253 is another experimental kind */ if (new_kind == TCPOPT_EXP || new_kind == 253) { if (new_kind_len < 4) return -EINVAL; /* Match for the 2 byte magic also. * RFC 6994: the magic could be 2 or 4 bytes. * Hence, matching by 2 byte only is on the * conservative side but it is the right * thing to do for the 'search-for-duplication' * purpose. */ magic = &new_op[2]; magic_len = 2; } /* Check for duplication */ skb = bpf_sock->skb; op = skb->data + sizeof(struct tcphdr); opend = bpf_sock->skb_data_end; op = bpf_search_tcp_opt(op, opend, new_kind, magic, magic_len, &eol); if (!IS_ERR(op)) return -EEXIST; if (PTR_ERR(op) != -ENOMSG) return PTR_ERR(op); if (eol) /* The option has been ended. Treat it as no more * header option can be written. */ return -ENOSPC; /* No duplication found. Store the header option. 
*/ memcpy(opend, from, new_kind_len); bpf_sock->remaining_opt_len -= new_kind_len; bpf_sock->skb_data_end += new_kind_len; return 0; } static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = { .func = bpf_sock_ops_store_hdr_opt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_sock_ops_reserve_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, u32, len, u64, flags) { if (bpf_sock->op != BPF_SOCK_OPS_HDR_OPT_LEN_CB) return -EPERM; if (flags || len < 2) return -EINVAL; if (len > bpf_sock->remaining_opt_len) return -ENOSPC; bpf_sock->remaining_opt_len -= len; return 0; } static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = { .func = bpf_sock_ops_reserve_hdr_opt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb, u64, tstamp, u32, tstamp_type) { /* skb_clear_delivery_time() is done for inet protocol */ if (skb->protocol != htons(ETH_P_IP) && skb->protocol != htons(ETH_P_IPV6)) return -EOPNOTSUPP; switch (tstamp_type) { case BPF_SKB_CLOCK_REALTIME: skb->tstamp = tstamp; skb->tstamp_type = SKB_CLOCK_REALTIME; break; case BPF_SKB_CLOCK_MONOTONIC: if (!tstamp) return -EINVAL; skb->tstamp = tstamp; skb->tstamp_type = SKB_CLOCK_MONOTONIC; break; case BPF_SKB_CLOCK_TAI: if (!tstamp) return -EINVAL; skb->tstamp = tstamp; skb->tstamp_type = SKB_CLOCK_TAI; break; default: return -EINVAL; } return 0; } static const struct bpf_func_proto bpf_skb_set_tstamp_proto = { .func = bpf_skb_set_tstamp, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; #ifdef CONFIG_SYN_COOKIES BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv4, struct iphdr *, iph, struct tcphdr *, th, u32, th_len) { u32 cookie; u16 mss; if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) return -EINVAL; mss = tcp_parse_mss_option(th, 0) ?: TCP_MSS_DEFAULT; cookie = __cookie_v4_init_sequence(iph, th, &mss); return cookie | ((u64)mss << 32); } static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = { .func = bpf_tcp_raw_gen_syncookie_ipv4, .gpl_only = true, /* __cookie_v4_init_sequence() is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, .arg1_size = sizeof(struct iphdr), .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv6, struct ipv6hdr *, iph, struct tcphdr *, th, u32, th_len) { #if IS_ENABLED(CONFIG_IPV6) const u16 mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); u32 cookie; u16 mss; if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) return -EINVAL; mss = tcp_parse_mss_option(th, 0) ?: mss_clamp; cookie = __cookie_v6_init_sequence(iph, th, &mss); return cookie | ((u64)mss << 32); #else return -EPROTONOSUPPORT; #endif } static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = { .func = bpf_tcp_raw_gen_syncookie_ipv6, .gpl_only = true, /* __cookie_v6_init_sequence() is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, .arg1_size = sizeof(struct ipv6hdr), .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph, struct tcphdr *, th) 
{ if (__cookie_v4_check(iph, th) > 0) return 0; return -EACCES; } static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv4_proto = { .func = bpf_tcp_raw_check_syncookie_ipv4, .gpl_only = true, /* __cookie_v4_check is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, .arg1_size = sizeof(struct iphdr), .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, .arg2_size = sizeof(struct tcphdr), }; BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv6, struct ipv6hdr *, iph, struct tcphdr *, th) { #if IS_ENABLED(CONFIG_IPV6) if (__cookie_v6_check(iph, th) > 0) return 0; return -EACCES; #else return -EPROTONOSUPPORT; #endif } static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = { .func = bpf_tcp_raw_check_syncookie_ipv6, .gpl_only = true, /* __cookie_v6_check is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, .arg1_size = sizeof(struct ipv6hdr), .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, .arg2_size = sizeof(struct tcphdr), }; #endif /* CONFIG_SYN_COOKIES */ #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id) { switch (func_id) { case BPF_FUNC_clone_redirect: case BPF_FUNC_l3_csum_replace: case BPF_FUNC_l4_csum_replace: case BPF_FUNC_lwt_push_encap: case BPF_FUNC_lwt_seg6_action: case BPF_FUNC_lwt_seg6_adjust_srh: case BPF_FUNC_lwt_seg6_store_bytes: case BPF_FUNC_msg_pop_data: case BPF_FUNC_msg_pull_data: case BPF_FUNC_msg_push_data: case BPF_FUNC_skb_adjust_room: case BPF_FUNC_skb_change_head: case BPF_FUNC_skb_change_proto: case BPF_FUNC_skb_change_tail: case BPF_FUNC_skb_pull_data: case BPF_FUNC_skb_store_bytes: case BPF_FUNC_skb_vlan_pop: case BPF_FUNC_skb_vlan_push: case BPF_FUNC_store_hdr_opt: case BPF_FUNC_xdp_adjust_head: case BPF_FUNC_xdp_adjust_meta: case BPF_FUNC_xdp_adjust_tail: /* tail-called program could call any of the above */ case BPF_FUNC_tail_call: return true; default: return false; } } const struct bpf_func_proto bpf_event_output_data_proto __weak; const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak; static const struct bpf_func_proto * sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; func_proto = cgroup_common_func_proto(func_id, prog); if (func_proto) return func_proto; switch (func_id) { case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_cg_sock_proto; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; case BPF_FUNC_setsockopt: switch (prog->expected_attach_type) { case BPF_CGROUP_INET_SOCK_CREATE: return &bpf_sock_create_setsockopt_proto; default: return NULL; } case BPF_FUNC_getsockopt: switch (prog->expected_attach_type) { case BPF_CGROUP_INET_SOCK_CREATE: return &bpf_sock_create_getsockopt_proto; default: return NULL; } default: return bpf_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; func_proto = cgroup_common_func_proto(func_id, prog); if (func_proto) return func_proto; switch (func_id) { case BPF_FUNC_bind: switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: return 
&bpf_bind_proto; default: return NULL; } case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_addr_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_addr_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sock_addr_sk_lookup_tcp_proto; case BPF_FUNC_sk_lookup_udp: return &bpf_sock_addr_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_sock_addr_skc_lookup_tcp_proto; #endif /* CONFIG_INET */ case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_setsockopt: switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UNIX_GETSOCKNAME: return &bpf_sock_addr_setsockopt_proto; default: return NULL; } case BPF_FUNC_getsockopt: switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UNIX_GETSOCKNAME: return &bpf_sock_addr_getsockopt_proto; default: return NULL; } default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_load_bytes_relative: return &bpf_skb_load_bytes_relative_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; default: return bpf_sk_base_func_proto(func_id, prog); } } const struct bpf_func_proto bpf_sk_storage_get_proto __weak; const struct bpf_func_proto bpf_sk_storage_delete_proto __weak; static const struct bpf_func_proto * cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; func_proto = cgroup_common_func_proto(func_id, prog); if (func_proto) return func_proto; switch (func_id) { case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; #ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_cgroup_id: return &bpf_skb_cgroup_id_proto; case BPF_FUNC_skb_ancestor_cgroup_id: return &bpf_skb_ancestor_cgroup_id_proto; case 
BPF_FUNC_sk_cgroup_id: return &bpf_sk_cgroup_id_proto; case BPF_FUNC_sk_ancestor_cgroup_id: return &bpf_sk_ancestor_cgroup_id_proto; #endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sk_lookup_tcp_proto; case BPF_FUNC_sk_lookup_udp: return &bpf_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_skc_lookup_tcp_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; case BPF_FUNC_get_listener_sock: return &bpf_get_listener_sock_proto; case BPF_FUNC_skb_ecn_set_ce: return &bpf_skb_ecn_set_ce_proto; #endif default: return sk_filter_func_proto(func_id, prog); } } static const struct bpf_func_proto * tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_store_bytes: return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_load_bytes_relative: return &bpf_skb_load_bytes_relative_proto; case BPF_FUNC_skb_pull_data: return &bpf_skb_pull_data_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; case BPF_FUNC_csum_level: return &bpf_csum_level_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: return &bpf_l4_csum_replace_proto; case BPF_FUNC_clone_redirect: return &bpf_clone_redirect_proto; case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_proto; case BPF_FUNC_skb_vlan_push: return &bpf_skb_vlan_push_proto; case BPF_FUNC_skb_vlan_pop: return &bpf_skb_vlan_pop_proto; case BPF_FUNC_skb_change_proto: return &bpf_skb_change_proto_proto; case BPF_FUNC_skb_change_type: return &bpf_skb_change_type_proto; case BPF_FUNC_skb_adjust_room: return &bpf_skb_adjust_room_proto; case BPF_FUNC_skb_change_tail: return &bpf_skb_change_tail_proto; case BPF_FUNC_skb_change_head: return &bpf_skb_change_head_proto; case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_skb_get_tunnel_opt: return &bpf_skb_get_tunnel_opt_proto; case BPF_FUNC_skb_set_tunnel_opt: return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_redirect: return &bpf_redirect_proto; case BPF_FUNC_redirect_neigh: return &bpf_redirect_neigh_proto; case BPF_FUNC_redirect_peer: return &bpf_redirect_peer_proto; case BPF_FUNC_get_route_realm: return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: return &bpf_get_hash_recalc_proto; case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; case BPF_FUNC_set_hash: return &bpf_set_hash_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case BPF_FUNC_fib_lookup: return &bpf_skb_fib_lookup_proto; case BPF_FUNC_check_mtu: return &bpf_skb_check_mtu_proto; case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; #ifdef CONFIG_XFRM case BPF_FUNC_skb_get_xfrm_state: return &bpf_skb_get_xfrm_state_proto; #endif #ifdef CONFIG_CGROUP_NET_CLASSID case 
BPF_FUNC_skb_cgroup_classid: return &bpf_skb_cgroup_classid_proto; #endif #ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_cgroup_id: return &bpf_skb_cgroup_id_proto; case BPF_FUNC_skb_ancestor_cgroup_id: return &bpf_skb_ancestor_cgroup_id_proto; #endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_tc_sk_lookup_tcp_proto; case BPF_FUNC_sk_lookup_udp: return &bpf_tc_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; case BPF_FUNC_get_listener_sock: return &bpf_get_listener_sock_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_tc_skc_lookup_tcp_proto; case BPF_FUNC_tcp_check_syncookie: return &bpf_tcp_check_syncookie_proto; case BPF_FUNC_skb_ecn_set_ce: return &bpf_skb_ecn_set_ce_proto; case BPF_FUNC_tcp_gen_syncookie: return &bpf_tcp_gen_syncookie_proto; case BPF_FUNC_sk_assign: return &bpf_sk_assign_proto; case BPF_FUNC_skb_set_tstamp: return &bpf_skb_set_tstamp_proto; #ifdef CONFIG_SYN_COOKIES case BPF_FUNC_tcp_raw_gen_syncookie_ipv4: return &bpf_tcp_raw_gen_syncookie_ipv4_proto; case BPF_FUNC_tcp_raw_gen_syncookie_ipv6: return &bpf_tcp_raw_gen_syncookie_ipv6_proto; case BPF_FUNC_tcp_raw_check_syncookie_ipv4: return &bpf_tcp_raw_check_syncookie_ipv4_proto; case BPF_FUNC_tcp_raw_check_syncookie_ipv6: return &bpf_tcp_raw_check_syncookie_ipv6_proto; #endif #endif default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_xdp_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; case BPF_FUNC_xdp_adjust_head: return &bpf_xdp_adjust_head_proto; case BPF_FUNC_xdp_adjust_meta: return &bpf_xdp_adjust_meta_proto; case BPF_FUNC_redirect: return &bpf_xdp_redirect_proto; case BPF_FUNC_redirect_map: return &bpf_xdp_redirect_map_proto; case BPF_FUNC_xdp_adjust_tail: return &bpf_xdp_adjust_tail_proto; case BPF_FUNC_xdp_get_buff_len: return &bpf_xdp_get_buff_len_proto; case BPF_FUNC_xdp_load_bytes: return &bpf_xdp_load_bytes_proto; case BPF_FUNC_xdp_store_bytes: return &bpf_xdp_store_bytes_proto; case BPF_FUNC_fib_lookup: return &bpf_xdp_fib_lookup_proto; case BPF_FUNC_check_mtu: return &bpf_xdp_check_mtu_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_udp: return &bpf_xdp_sk_lookup_udp_proto; case BPF_FUNC_sk_lookup_tcp: return &bpf_xdp_sk_lookup_tcp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_xdp_skc_lookup_tcp_proto; case BPF_FUNC_tcp_check_syncookie: return &bpf_tcp_check_syncookie_proto; case BPF_FUNC_tcp_gen_syncookie: return &bpf_tcp_gen_syncookie_proto; #ifdef CONFIG_SYN_COOKIES case BPF_FUNC_tcp_raw_gen_syncookie_ipv4: return &bpf_tcp_raw_gen_syncookie_ipv4_proto; case BPF_FUNC_tcp_raw_gen_syncookie_ipv6: return &bpf_tcp_raw_gen_syncookie_ipv6_proto; case BPF_FUNC_tcp_raw_check_syncookie_ipv4: return &bpf_tcp_raw_check_syncookie_ipv4_proto; case BPF_FUNC_tcp_raw_check_syncookie_ipv6: return &bpf_tcp_raw_check_syncookie_ipv6_proto; #endif #endif default: return bpf_sk_base_func_proto(func_id, prog); } #if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES) /* The nf_conn___init type is used in the NF_CONNTRACK kfuncs. The * kfuncs are defined in two different modules, and we want to be able * to use them interchangeably with the same BTF type ID. 
Because modules * can't de-duplicate BTF IDs between each other, we need the type to be * referenced in the vmlinux BTF or the verifier will get confused about * the different types. So we add this dummy type reference which will * be included in vmlinux BTF, allowing both modules to refer to the * same type ID. */ BTF_TYPE_EMIT(struct nf_conn___init); #endif } const struct bpf_func_proto bpf_sock_map_update_proto __weak; const struct bpf_func_proto bpf_sock_hash_update_proto __weak; static const struct bpf_func_proto * sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; func_proto = cgroup_common_func_proto(func_id, prog); if (func_proto) return func_proto; switch (func_id) { case BPF_FUNC_setsockopt: return &bpf_sock_ops_setsockopt_proto; case BPF_FUNC_getsockopt: return &bpf_sock_ops_getsockopt_proto; case BPF_FUNC_sock_ops_cb_flags_set: return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: return &bpf_sock_map_update_proto; case BPF_FUNC_sock_hash_update: return &bpf_sock_hash_update_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_ops_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_ops_proto; #ifdef CONFIG_INET case BPF_FUNC_load_hdr_opt: return &bpf_sock_ops_load_hdr_opt_proto; case BPF_FUNC_store_hdr_opt: return &bpf_sock_ops_store_hdr_opt_proto; case BPF_FUNC_reserve_hdr_opt: return &bpf_sock_ops_reserve_hdr_opt_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; #endif /* CONFIG_INET */ default: return bpf_sk_base_func_proto(func_id, prog); } } const struct bpf_func_proto bpf_msg_redirect_map_proto __weak; const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak; static const struct bpf_func_proto * sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_msg_redirect_map: return &bpf_msg_redirect_map_proto; case BPF_FUNC_msg_redirect_hash: return &bpf_msg_redirect_hash_proto; case BPF_FUNC_msg_apply_bytes: return &bpf_msg_apply_bytes_proto; case BPF_FUNC_msg_cork_bytes: return &bpf_msg_cork_bytes_proto; case BPF_FUNC_msg_pull_data: return &bpf_msg_pull_data_proto; case BPF_FUNC_msg_push_data: return &bpf_msg_push_data_proto; case BPF_FUNC_msg_pop_data: return &bpf_msg_pop_data_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sk_msg_proto; default: return bpf_sk_base_func_proto(func_id, prog); } } const struct bpf_func_proto bpf_sk_redirect_map_proto __weak; const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak; static const struct bpf_func_proto * sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_store_bytes: return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_pull_data: return &sk_skb_pull_data_proto; case BPF_FUNC_skb_change_tail: return &sk_skb_change_tail_proto; case BPF_FUNC_skb_change_head: return &sk_skb_change_head_proto; case BPF_FUNC_skb_adjust_room: return &sk_skb_adjust_room_proto; case BPF_FUNC_get_socket_cookie: return 
&bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case BPF_FUNC_sk_redirect_map: return &bpf_sk_redirect_map_proto; case BPF_FUNC_sk_redirect_hash: return &bpf_sk_redirect_hash_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sk_lookup_tcp_proto; case BPF_FUNC_sk_lookup_udp: return &bpf_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_skc_lookup_tcp_proto; #endif default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: return &bpf_flow_dissector_load_bytes_proto; default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_pull_data: return &bpf_skb_pull_data_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_proto; case BPF_FUNC_get_route_realm: return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: return &bpf_get_hash_recalc_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_lwt_push_encap: return &bpf_lwt_in_push_encap_proto; default: return lwt_out_func_proto(func_id, prog); } } static const struct bpf_func_proto * lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_skb_get_tunnel_opt: return &bpf_skb_get_tunnel_opt_proto; case BPF_FUNC_skb_set_tunnel_opt: return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_redirect: return &bpf_redirect_proto; case BPF_FUNC_clone_redirect: return &bpf_clone_redirect_proto; case BPF_FUNC_skb_change_tail: return &bpf_skb_change_tail_proto; case BPF_FUNC_skb_change_head: return &bpf_skb_change_head_proto; case BPF_FUNC_skb_store_bytes: return &bpf_skb_store_bytes_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; case BPF_FUNC_csum_level: return &bpf_csum_level_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: return &bpf_l4_csum_replace_proto; case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; case BPF_FUNC_lwt_push_encap: return &bpf_lwt_xmit_push_encap_proto; default: return lwt_out_func_proto(func_id, prog); } } static const struct bpf_func_proto * lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) case BPF_FUNC_lwt_seg6_store_bytes: return &bpf_lwt_seg6_store_bytes_proto; case BPF_FUNC_lwt_seg6_action: return &bpf_lwt_seg6_action_proto; case BPF_FUNC_lwt_seg6_adjust_srh: return &bpf_lwt_seg6_adjust_srh_proto; #endif default: return 
lwt_out_func_proto(func_id, prog); } } static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct __sk_buff)) return false; /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; switch (off) { case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): if (off + size > offsetofend(struct __sk_buff, cb[4])) return false; break; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): if (info->is_ldsx || size != size_default) return false; break; case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]): case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]): case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4): case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): if (size != size_default) return false; break; case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): return false; case bpf_ctx_range(struct __sk_buff, hwtstamp): if (type == BPF_WRITE || size != sizeof(__u64)) return false; break; case bpf_ctx_range(struct __sk_buff, tstamp): if (size != sizeof(__u64)) return false; break; case bpf_ctx_range_ptr(struct __sk_buff, sk): if (type == BPF_WRITE || size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; break; case offsetof(struct __sk_buff, tstamp_type): return false; case offsetofend(struct __sk_buff, tstamp_type) ... offsetof(struct __sk_buff, hwtstamp) - 1: /* Explicitly prohibit access to padding in __sk_buff. */ return false; default: /* Only narrow read access allowed for now. */ if (type == BPF_WRITE) { if (size != size_default) return false; } else { bpf_ctx_record_field_size(info, size_default); if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; } } return true; } static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, wire_len): case bpf_ctx_range(struct __sk_buff, hwtstamp): return false; } if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; default: return false; } } return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool cg_skb_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, wire_len): return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return false; break; } if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; case bpf_ctx_range(struct __sk_buff, tstamp): if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return false; break; default: 
return false; } } switch (off) { case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; } return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool lwt_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, wire_len): case bpf_ctx_range(struct __sk_buff, hwtstamp): return false; } if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; default: return false; } } switch (off) { case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; } return bpf_skb_is_valid_access(off, size, type, prog, info); } /* Attach type specific accesses */ static bool __sock_filter_check_attach_type(int off, enum bpf_access_type access_type, enum bpf_attach_type attach_type) { switch (off) { case offsetof(struct bpf_sock, bound_dev_if): case offsetof(struct bpf_sock, mark): case offsetof(struct bpf_sock, priority): switch (attach_type) { case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET_SOCK_RELEASE: goto full_access; default: return false; } case bpf_ctx_range(struct bpf_sock, src_ip4): switch (attach_type) { case BPF_CGROUP_INET4_POST_BIND: goto read_only; default: return false; } case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): switch (attach_type) { case BPF_CGROUP_INET6_POST_BIND: goto read_only; default: return false; } case bpf_ctx_range(struct bpf_sock, src_port): switch (attach_type) { case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: goto read_only; default: return false; } } read_only: return access_type == BPF_READ; full_access: return true; } bool bpf_sock_common_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range_till(struct bpf_sock, type, priority): return false; default: return bpf_sock_is_valid_access(off, size, type, info); } } bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); int field_size; if (off < 0 || off >= sizeof(struct bpf_sock)) return false; if (off % size != 0) return false; switch (off) { case offsetof(struct bpf_sock, state): case offsetof(struct bpf_sock, family): case offsetof(struct bpf_sock, type): case offsetof(struct bpf_sock, protocol): case offsetof(struct bpf_sock, src_port): case offsetof(struct bpf_sock, rx_queue_mapping): case bpf_ctx_range(struct bpf_sock, src_ip4): case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): case bpf_ctx_range(struct bpf_sock, dst_ip4): case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); case bpf_ctx_range(struct bpf_sock, dst_port): field_size = size == size_default ? 
size_default : sizeof_field(struct bpf_sock, dst_port); bpf_ctx_record_field_size(info, field_size); return bpf_ctx_narrow_access_ok(off, size, field_size); case offsetofend(struct bpf_sock, dst_port) ... offsetof(struct bpf_sock, dst_ip4) - 1: return false; } return size == size_default; } static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (!bpf_sock_is_valid_access(off, size, type, info)) return false; return __sock_filter_check_attach_type(off, type, prog->expected_attach_type); } static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { /* Neither direct read nor direct write requires any preliminary * action. */ return 0; } static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog, int drop_verdict) { struct bpf_insn *insn = insn_buf; if (!direct_write) return 0; /* if (!skb->cloned) * goto start; * * (Fast-path, otherwise approximation that we might be * a clone, do the rest in helper.) */ *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK); *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7); /* ret = bpf_skb_pull_data(skb, 0); */ *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2); *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_pull_data); /* if (!ret) * goto restore; * return TC_ACT_SHOT; */ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2); *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, drop_verdict); *insn++ = BPF_EXIT_INSN(); /* restore: */ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); /* start: */ *insn++ = prog->insnsi[0]; return insn - insn_buf; } static int bpf_gen_ld_abs(const struct bpf_insn *orig, struct bpf_insn *insn_buf) { bool indirect = BPF_MODE(orig->code) == BPF_IND; struct bpf_insn *insn = insn_buf; if (!indirect) { *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm); } else { *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg); if (orig->imm) *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm); } /* We're guaranteed here that CTX is in R6. 
*/ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX); switch (BPF_SIZE(orig->code)) { case BPF_B: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache); break; case BPF_H: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache); break; case BPF_W: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache); break; } *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2); *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0); *insn++ = BPF_EXIT_INSN(); return insn - insn_buf; } static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { return bpf_unclone_prologue(insn_buf, direct_write, prog, TC_ACT_SHOT); } static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): case bpf_ctx_range(struct __sk_buff, tc_index): case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, queue_mapping): break; default: return false; } } switch (off) { case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range(struct __sk_buff, data_meta): info->reg_type = PTR_TO_PACKET_META; break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; case offsetof(struct __sk_buff, tstamp_type): /* The convert_ctx_access() on reading and writing * __sk_buff->tstamp depends on whether the bpf prog * has used __sk_buff->tstamp_type or not. * Thus, we need to set prog->tstamp_type_access * earlier during is_valid_access() here. 
*/ ((struct bpf_prog *)prog)->tstamp_type_access = 1; return size == sizeof(__u8); } return bpf_skb_is_valid_access(off, size, type, prog, info); } DEFINE_MUTEX(nf_conn_btf_access_lock); EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock); int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size); EXPORT_SYMBOL_GPL(nfct_btf_struct_access); static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size) { int ret = -EACCES; mutex_lock(&nf_conn_btf_access_lock); if (nfct_btf_struct_access) ret = nfct_btf_struct_access(log, reg, off, size); mutex_unlock(&nf_conn_btf_access_lock); return ret; } static bool __is_valid_xdp_access(int off, int size) { if (off < 0 || off >= sizeof(struct xdp_md)) return false; if (off % size != 0) return false; if (size != sizeof(__u32)) return false; return true; } static bool xdp_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (prog->expected_attach_type != BPF_XDP_DEVMAP) { switch (off) { case offsetof(struct xdp_md, egress_ifindex): return false; } } if (type == BPF_WRITE) { if (bpf_prog_is_offloaded(prog->aux)) { switch (off) { case offsetof(struct xdp_md, rx_queue_index): return __is_valid_xdp_access(off, size); } } return false; } else { switch (off) { case offsetof(struct xdp_md, data_meta): case offsetof(struct xdp_md, data): case offsetof(struct xdp_md, data_end): if (info->is_ldsx) return false; } } switch (off) { case offsetof(struct xdp_md, data): info->reg_type = PTR_TO_PACKET; break; case offsetof(struct xdp_md, data_meta): info->reg_type = PTR_TO_PACKET_META; break; case offsetof(struct xdp_md, data_end): info->reg_type = PTR_TO_PACKET_END; break; } return __is_valid_xdp_access(off, size); } void bpf_warn_invalid_xdp_action(const struct net_device *dev, const struct bpf_prog *prog, u32 act) { const u32 act_max = XDP_REDIRECT; pr_warn_once("%s XDP return value %u on prog %s (id %d) dev %s, expect packet loss!\n", act > act_max ? "Illegal" : "Driver unsupported", act, prog->aux->name, prog->aux->id, dev ? dev->name : "N/A"); } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); static int xdp_btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size) { int ret = -EACCES; mutex_lock(&nf_conn_btf_access_lock); if (nfct_btf_struct_access) ret = nfct_btf_struct_access(log, reg, off, size); mutex_unlock(&nf_conn_btf_access_lock); return ret; } static bool sock_addr_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct bpf_sock_addr)) return false; if (off % size != 0) return false; /* Disallow access to fields not belonging to the attach type's address * family. 
*/ switch (off) { case bpf_ctx_range(struct bpf_sock_addr, user_ip4): switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: break; default: return false; } break; case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): switch (prog->expected_attach_type) { case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP6_RECVMSG: break; default: return false; } break; case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): switch (prog->expected_attach_type) { case BPF_CGROUP_UDP4_SENDMSG: break; default: return false; } break; case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): switch (prog->expected_attach_type) { case BPF_CGROUP_UDP6_SENDMSG: break; default: return false; } break; } switch (off) { case bpf_ctx_range(struct bpf_sock_addr, user_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): case bpf_ctx_range(struct bpf_sock_addr, user_port): if (type == BPF_READ) { bpf_ctx_record_field_size(info, size_default); if (bpf_ctx_wide_access_ok(off, size, struct bpf_sock_addr, user_ip6)) return true; if (bpf_ctx_wide_access_ok(off, size, struct bpf_sock_addr, msg_src_ip6)) return true; if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; } else { if (bpf_ctx_wide_access_ok(off, size, struct bpf_sock_addr, user_ip6)) return true; if (bpf_ctx_wide_access_ok(off, size, struct bpf_sock_addr, msg_src_ip6)) return true; if (size != size_default) return false; } break; case bpf_ctx_range_ptr(struct bpf_sock_addr, sk): if (type != BPF_READ) return false; if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET; break; case bpf_ctx_range(struct bpf_sock_addr, user_family): case bpf_ctx_range(struct bpf_sock_addr, family): case bpf_ctx_range(struct bpf_sock_addr, type): case bpf_ctx_range(struct bpf_sock_addr, protocol): if (type != BPF_READ) return false; if (size != size_default) return false; break; default: return false; } return true; } static bool sock_ops_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct bpf_sock_ops)) return false; /* The verifier guarantees that size > 0. 
*/ if (off % size != 0) return false; if (type == BPF_WRITE) { switch (off) { case offsetof(struct bpf_sock_ops, reply): case offsetof(struct bpf_sock_ops, sk_txhash): if (size != size_default) return false; break; default: return false; } } else { switch (off) { case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received, bytes_acked): if (size != sizeof(__u64)) return false; break; case bpf_ctx_range_ptr(struct bpf_sock_ops, sk): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET_OR_NULL; break; case bpf_ctx_range_ptr(struct bpf_sock_ops, skb_data): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range_ptr(struct bpf_sock_ops, skb_data_end): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_PACKET_END; break; case offsetof(struct bpf_sock_ops, skb_tcp_flags): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); case bpf_ctx_range(struct bpf_sock_ops, skb_hwtstamp): if (size != sizeof(__u64)) return false; break; default: if (size != size_default) return false; break; } } return true; } static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { return bpf_unclone_prologue(insn_buf, direct_write, prog, SK_DROP); } static bool sk_skb_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, wire_len): case bpf_ctx_range(struct __sk_buff, hwtstamp): return false; } if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_index): case bpf_ctx_range(struct __sk_buff, priority): break; default: return false; } } switch (off) { case bpf_ctx_range(struct __sk_buff, mark): return false; case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; } return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool sk_msg_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) return false; if (off % size != 0) return false; switch (off) { case bpf_ctx_range_ptr(struct sk_msg_md, data): info->reg_type = PTR_TO_PACKET; if (size != sizeof(__u64)) return false; break; case bpf_ctx_range_ptr(struct sk_msg_md, data_end): info->reg_type = PTR_TO_PACKET_END; if (size != sizeof(__u64)) return false; break; case bpf_ctx_range_ptr(struct sk_msg_md, sk): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET; break; case bpf_ctx_range(struct sk_msg_md, family): case bpf_ctx_range(struct sk_msg_md, remote_ip4): case bpf_ctx_range(struct sk_msg_md, local_ip4): case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]): case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]): case bpf_ctx_range(struct sk_msg_md, remote_port): case bpf_ctx_range(struct sk_msg_md, local_port): case bpf_ctx_range(struct sk_msg_md, size): if (size != sizeof(__u32)) return false; break; default: return false; } return true; } static bool flow_dissector_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if 
(off < 0 || off >= sizeof(struct __sk_buff)) return false; if (off % size != 0) return false; if (type == BPF_WRITE) return false; switch (off) { case bpf_ctx_range(struct __sk_buff, data): if (info->is_ldsx || size != size_default) return false; info->reg_type = PTR_TO_PACKET; return true; case bpf_ctx_range(struct __sk_buff, data_end): if (info->is_ldsx || size != size_default) return false; info->reg_type = PTR_TO_PACKET_END; return true; case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_FLOW_KEYS; return true; default: return false; } } static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct __sk_buff, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data), si->dst_reg, si->src_reg, offsetof(struct bpf_flow_dissector, data)); break; case offsetof(struct __sk_buff, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data_end), si->dst_reg, si->src_reg, offsetof(struct bpf_flow_dissector, data_end)); break; case offsetof(struct __sk_buff, flow_keys): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, flow_keys), si->dst_reg, si->src_reg, offsetof(struct bpf_flow_dissector, flow_keys)); break; } return insn - insn_buf; } static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si, struct bpf_insn *insn) { __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; BUILD_BUG_ON(__SKB_CLOCK_MAX != (int)BPF_SKB_CLOCK_TAI); BUILD_BUG_ON(SKB_CLOCK_REALTIME != (int)BPF_SKB_CLOCK_REALTIME); BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != (int)BPF_SKB_CLOCK_MONOTONIC); BUILD_BUG_ON(SKB_CLOCK_TAI != (int)BPF_SKB_CLOCK_TAI); *insn++ = BPF_LDX_MEM(BPF_B, value_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, value_reg, SKB_TSTAMP_TYPE_MASK); #ifdef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_RSH, value_reg, SKB_TSTAMP_TYPE_RSHIFT); #else BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1)); #endif return insn; } static struct bpf_insn *bpf_convert_shinfo_access(__u8 dst_reg, __u8 skb_reg, struct bpf_insn *insn) { /* si->dst_reg = skb_shinfo(SKB); */ #ifdef NET_SKBUFF_DATA_USES_OFFSET *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), BPF_REG_AX, skb_reg, offsetof(struct sk_buff, end)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head), dst_reg, skb_reg, offsetof(struct sk_buff, head)); *insn++ = BPF_ALU64_REG(BPF_ADD, dst_reg, BPF_REG_AX); #else *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), dst_reg, skb_reg, offsetof(struct sk_buff, end)); #endif return insn; } static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, const struct bpf_insn *si, struct bpf_insn *insn) { __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; #ifdef CONFIG_NET_XGRESS /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. * Thus, read skb->tstamp as is if tstamp_type_access is true. 
*/ if (!prog->tstamp_type_access) { /* AX is needed because src_reg and dst_reg could be the same */ __u8 tmp_reg = BPF_REG_AX; *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); /* check if ingress mask bits is set */ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); *insn++ = BPF_JMP_A(4); *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1); *insn++ = BPF_JMP_A(2); /* skb->tc_at_ingress && skb->tstamp_type, * read 0 as the (rcv) timestamp. */ *insn++ = BPF_MOV64_IMM(value_reg, 0); *insn++ = BPF_JMP_A(1); } #endif *insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg, offsetof(struct sk_buff, tstamp)); return insn; } static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, const struct bpf_insn *si, struct bpf_insn *insn) { __u8 value_reg = si->src_reg; __u8 skb_reg = si->dst_reg; #ifdef CONFIG_NET_XGRESS /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. * Thus, write skb->tstamp as is if tstamp_type_access is true. * Otherwise, writing at ingress will have to clear the * skb->tstamp_type bit also. */ if (!prog->tstamp_type_access) { __u8 tmp_reg = BPF_REG_AX; *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); /* Writing __sk_buff->tstamp as ingress, goto <clear> */ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); /* goto <store> */ *insn++ = BPF_JMP_A(2); /* <clear>: skb->tstamp_type */ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK); *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET); } #endif /* <store>: skb->tstamp = tstamp */ *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_DW | BPF_MEM, skb_reg, value_reg, offsetof(struct sk_buff, tstamp), si->imm); return insn; } #define BPF_EMIT_STORE(size, si, off) \ BPF_RAW_INSN(BPF_CLASS((si)->code) | (size) | BPF_MEM, \ (si)->dst_reg, (si)->src_reg, (off), (si)->imm) static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; switch (si->off) { case offsetof(struct __sk_buff, len): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, len, 4, target_size)); break; case offsetof(struct __sk_buff, protocol): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, protocol, 2, target_size)); break; case offsetof(struct __sk_buff, vlan_proto): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, vlan_proto, 2, target_size)); break; case offsetof(struct __sk_buff, priority): if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, bpf_target_off(struct sk_buff, priority, 4, target_size)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, priority, 4, target_size)); break; case offsetof(struct __sk_buff, ingress_ifindex): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, skb_iif, 4, target_size)); break; case offsetof(struct __sk_buff, ifindex): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, bpf_target_off(struct net_device, ifindex, 4, target_size)); break; case offsetof(struct __sk_buff, hash): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, 
hash, 4, target_size)); break; case offsetof(struct __sk_buff, mark): if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, bpf_target_off(struct sk_buff, mark, 4, target_size)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, mark, 4, target_size)); break; case offsetof(struct __sk_buff, pkt_type): *target_size = 1; *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, PKT_TYPE_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX); #ifdef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5); #endif break; case offsetof(struct __sk_buff, queue_mapping): if (type == BPF_WRITE) { u32 offset = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size); if (BPF_CLASS(si->code) == BPF_ST && si->imm >= NO_QUEUE_MAPPING) { *insn++ = BPF_JMP_A(0); /* noop */ break; } if (BPF_CLASS(si->code) == BPF_STX) *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1); *insn++ = BPF_EMIT_STORE(BPF_H, si, offset); } else { *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, queue_mapping, 2, target_size)); } break; case offsetof(struct __sk_buff, vlan_present): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, vlan_all, 4, target_size)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_ALU32_IMM(BPF_MOV, si->dst_reg, 1); break; case offsetof(struct __sk_buff, vlan_tci): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, vlan_tci, 2, target_size)); break; case offsetof(struct __sk_buff, cb[0]) ... offsetofend(struct __sk_buff, cb[4]) - 1: BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, data) < 20); BUILD_BUG_ON((offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb, data)) % sizeof(__u64)); prog->cb_access = 1; off = si->off; off -= offsetof(struct __sk_buff, cb[0]); off += offsetof(struct sk_buff, cb); off += offsetof(struct qdisc_skb_cb, data); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off); else *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, tc_classid): BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, tc_classid) != 2); off = si->off; off -= offsetof(struct __sk_buff, tc_classid); off += offsetof(struct sk_buff, cb); off += offsetof(struct qdisc_skb_cb, tc_classid); *target_size = 2; if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_H, si, off); else *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), si->dst_reg, si->src_reg, offsetof(struct sk_buff, data)); break; case offsetof(struct __sk_buff, data_meta): off = si->off; off -= offsetof(struct __sk_buff, data_meta); off += offsetof(struct sk_buff, cb); off += offsetof(struct bpf_skb_data_end, data_meta); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, data_end): off = si->off; off -= offsetof(struct __sk_buff, data_end); off += offsetof(struct sk_buff, cb); off += offsetof(struct bpf_skb_data_end, data_end); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, tc_index): #ifdef CONFIG_NET_SCHED if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_H, si, bpf_target_off(struct sk_buff, tc_index, 2, target_size)); else *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, 
tc_index, 2, target_size)); #else *target_size = 2; if (type == BPF_WRITE) *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg); else *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #endif break; case offsetof(struct __sk_buff, napi_id): #if defined(CONFIG_NET_RX_BUSY_POLL) *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, napi_id, 4, target_size)); *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1); *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #else *target_size = 4; *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #endif break; case offsetof(struct __sk_buff, family): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_family, 2, target_size)); break; case offsetof(struct __sk_buff, remote_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_daddr, 4, target_size)); break; case offsetof(struct __sk_buff, local_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_rcv_saddr, 4, target_size)); break; case offsetof(struct __sk_buff, remote_ip6[0]) ... offsetof(struct __sk_buff, remote_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_daddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct __sk_buff, remote_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_daddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct __sk_buff, local_ip6[0]) ... 
offsetof(struct __sk_buff, local_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct __sk_buff, local_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct __sk_buff, remote_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_dport, 2, target_size)); #ifndef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); #endif break; case offsetof(struct __sk_buff, local_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_num, 2, target_size)); break; case offsetof(struct __sk_buff, tstamp): BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8); if (type == BPF_WRITE) insn = bpf_convert_tstamp_write(prog, si, insn); else insn = bpf_convert_tstamp_read(prog, si, insn); break; case offsetof(struct __sk_buff, tstamp_type): insn = bpf_convert_tstamp_type_read(si, insn); break; case offsetof(struct __sk_buff, gso_segs): insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs), si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, gso_segs, 2, target_size)); break; case offsetof(struct __sk_buff, gso_size): insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size), si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, gso_size, 2, target_size)); break; case offsetof(struct __sk_buff, wire_len): BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, pkt_len) != 4); off = si->off; off -= offsetof(struct __sk_buff, wire_len); off += offsetof(struct sk_buff, cb); off += offsetof(struct qdisc_skb_cb, pkt_len); *target_size = 4; *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, sk): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); break; case offsetof(struct __sk_buff, hwtstamp): BUILD_BUG_ON(sizeof_field(struct skb_shared_hwtstamps, hwtstamp) != 8); BUILD_BUG_ON(offsetof(struct skb_shared_hwtstamps, hwtstamp) != 0); insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn); *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, hwtstamps, 8, target_size)); break; } return insn - insn_buf; } u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; switch (si->off) { case offsetof(struct bpf_sock, bound_dev_if): BUILD_BUG_ON(sizeof_field(struct sock, sk_bound_dev_if) != 4); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, offsetof(struct sock, sk_bound_dev_if)); else *insn++ = 
BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_bound_dev_if)); break; case offsetof(struct bpf_sock, mark): BUILD_BUG_ON(sizeof_field(struct sock, sk_mark) != 4); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, offsetof(struct sock, sk_mark)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_mark)); break; case offsetof(struct bpf_sock, priority): BUILD_BUG_ON(sizeof_field(struct sock, sk_priority) != 4); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, offsetof(struct sock, sk_priority)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_priority)); break; case offsetof(struct bpf_sock, family): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock_common, skc_family), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_family, sizeof_field(struct sock_common, skc_family), target_size)); break; case offsetof(struct bpf_sock, type): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock, sk_type), si->dst_reg, si->src_reg, bpf_target_off(struct sock, sk_type, sizeof_field(struct sock, sk_type), target_size)); break; case offsetof(struct bpf_sock, protocol): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock, sk_protocol), si->dst_reg, si->src_reg, bpf_target_off(struct sock, sk_protocol, sizeof_field(struct sock, sk_protocol), target_size)); break; case offsetof(struct bpf_sock, src_ip4): *insn++ = BPF_LDX_MEM( BPF_SIZE(si->code), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_rcv_saddr, sizeof_field(struct sock_common, skc_rcv_saddr), target_size)); break; case offsetof(struct bpf_sock, dst_ip4): *insn++ = BPF_LDX_MEM( BPF_SIZE(si->code), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_daddr, sizeof_field(struct sock_common, skc_daddr), target_size)); break; case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) off = si->off; off -= offsetof(struct bpf_sock, src_ip6[0]); *insn++ = BPF_LDX_MEM( BPF_SIZE(si->code), si->dst_reg, si->src_reg, bpf_target_off( struct sock_common, skc_v6_rcv_saddr.s6_addr32[0], sizeof_field(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]), target_size) + off); #else (void)off; *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) off = si->off; off -= offsetof(struct bpf_sock, dst_ip6[0]); *insn++ = BPF_LDX_MEM( BPF_SIZE(si->code), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_v6_daddr.s6_addr32[0], sizeof_field(struct sock_common, skc_v6_daddr.s6_addr32[0]), target_size) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); *target_size = 4; #endif break; case offsetof(struct bpf_sock, src_port): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock_common, skc_num), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_num, sizeof_field(struct sock_common, skc_num), target_size)); break; case offsetof(struct bpf_sock, dst_port): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock_common, skc_dport), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_dport, sizeof_field(struct sock_common, skc_dport), target_size)); break; case offsetof(struct bpf_sock, state): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock_common, skc_state), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_state, sizeof_field(struct sock_common, skc_state), target_size)); break; case offsetof(struct bpf_sock, rx_queue_mapping): #ifdef 
CONFIG_SOCK_RX_QUEUE_MAPPING *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping), si->dst_reg, si->src_reg, bpf_target_off(struct sock, sk_rx_queue_mapping, sizeof_field(struct sock, sk_rx_queue_mapping), target_size)); *insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, NO_QUEUE_MAPPING, 1); *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); #else *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); *target_size = 2; #endif break; } return insn - insn_buf; } static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct __sk_buff, ifindex): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, bpf_target_off(struct net_device, ifindex, 4, target_size)); break; default: return bpf_convert_ctx_access(type, si, insn_buf, prog, target_size); } return insn - insn_buf; } static u32 xdp_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct xdp_md, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data)); break; case offsetof(struct xdp_md, data_meta): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data_meta)); break; case offsetof(struct xdp_md, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data_end)); break; case offsetof(struct xdp_md, ingress_ifindex): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, rxq)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev), si->dst_reg, si->dst_reg, offsetof(struct xdp_rxq_info, dev)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct net_device, ifindex)); break; case offsetof(struct xdp_md, rx_queue_index): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, rxq)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct xdp_rxq_info, queue_index)); break; case offsetof(struct xdp_md, egress_ifindex): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, txq)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev), si->dst_reg, si->dst_reg, offsetof(struct xdp_txq_info, dev)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct net_device, ifindex)); break; } return insn - insn_buf; } /* SOCK_ADDR_LOAD_NESTED_FIELD() loads Nested Field S.F.NF where S is type of * context Structure, F is Field in context structure that contains a pointer * to Nested Structure of type NS that has the field NF. * * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to caller to make * sure that SIZE is not greater than actual size of S.F.NF. * * If offset OFF is provided, the load happens from that offset relative to * offset of NF. 
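 *
 * As an example of the expansion (a sketch based on the user_family case
 * below), SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
 * struct sockaddr, uaddr, sa_family) emits two loads:
 *
 *	dst_reg = ((struct bpf_sock_addr_kern *)src_reg)->uaddr;
 *	dst_reg = ((struct sockaddr *)dst_reg)->sa_family;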
*/ #define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF) \ do { \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg, \ si->src_reg, offsetof(S, F)); \ *insn++ = BPF_LDX_MEM( \ SIZE, si->dst_reg, si->dst_reg, \ bpf_target_off(NS, NF, sizeof_field(NS, NF), \ target_size) \ + OFF); \ } while (0) #define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF) \ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, \ BPF_FIELD_SIZEOF(NS, NF), 0) /* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation. * * In addition it uses Temporary Field TF (member of struct S) as the 3rd * "register" since two registers available in convert_ctx_access are not * enough: we can't override neither SRC, since it contains value to store, nor * DST since it contains pointer to context that may be used by later * instructions. But we need a temporary place to save pointer to nested * structure whose field we want to store to. */ #define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, OFF, TF) \ do { \ int tmp_reg = BPF_REG_9; \ if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ --tmp_reg; \ if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ --tmp_reg; \ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg, \ offsetof(S, TF)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \ si->dst_reg, offsetof(S, F)); \ *insn++ = BPF_RAW_INSN(SIZE | BPF_MEM | BPF_CLASS(si->code), \ tmp_reg, si->src_reg, \ bpf_target_off(NS, NF, sizeof_field(NS, NF), \ target_size) \ + OFF, \ si->imm); \ *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg, \ offsetof(S, TF)); \ } while (0) #define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \ TF) \ do { \ if (type == BPF_WRITE) { \ SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, \ OFF, TF); \ } else { \ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \ S, NS, F, NF, SIZE, OFF); \ } \ } while (0) static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { int off, port_size = sizeof_field(struct sockaddr_in6, sin6_port); struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct bpf_sock_addr, user_family): SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, struct sockaddr, uaddr, sa_family); break; case offsetof(struct bpf_sock_addr, user_ip4): SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct sockaddr_in, uaddr, sin_addr, BPF_SIZE(si->code), 0, tmp_reg); break; case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): off = si->off; off -= offsetof(struct bpf_sock_addr, user_ip6[0]); SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, sin6_addr.s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg); break; case offsetof(struct bpf_sock_addr, user_port): /* To get port we need to know sa_family first and then treat * sockaddr as either sockaddr_in or sockaddr_in6. * Though we can simplify since port field has same offset and * size in both structures. * Here we check this invariant and use just one of the * structures if it's true. */ BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) != offsetof(struct sockaddr_in6, sin6_port)); BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) != sizeof_field(struct sockaddr_in6, sin6_port)); /* Account for sin6_port being smaller than user_port. 
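 * user_port is exposed to BPF programs as a 4-byte field, while sin6_port
 * is only 2 bytes, so the emitted load is clamped to at most the 2-byte
 * port field even when the program accesses all of user_port.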
*/ port_size = min(port_size, BPF_LDST_BYTES(si)); SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, sin6_port, bytes_to_bpf_size(port_size), 0, tmp_reg); break; case offsetof(struct bpf_sock_addr, family): SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, struct sock, sk, sk_family); break; case offsetof(struct bpf_sock_addr, type): SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, struct sock, sk, sk_type); break; case offsetof(struct bpf_sock_addr, protocol): SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, struct sock, sk, sk_protocol); break; case offsetof(struct bpf_sock_addr, msg_src_ip4): /* Treat t_ctx as struct in_addr for msg_src_ip4. */ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct in_addr, t_ctx, s_addr, BPF_SIZE(si->code), 0, tmp_reg); break; case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): off = si->off; off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]); /* Treat t_ctx as struct in6_addr for msg_src_ip6. */ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct in6_addr, t_ctx, s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg); break; case offsetof(struct bpf_sock_addr, sk): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_addr_kern, sk)); break; } return insn - insn_buf; } static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; /* Helper macro for adding read access to tcp_sock or sock fields. */ #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ do { \ int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2; \ BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \ sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == si->src_reg) { \ *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ fullsock_reg = reg; \ jmp += 2; \ } \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_locked_tcp_sock), \ fullsock_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ is_locked_tcp_sock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \ if (si->dst_reg == si->src_reg) \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, sk),\ si->dst_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \ OBJ_FIELD), \ si->dst_reg, si->dst_reg, \ offsetof(OBJ, OBJ_FIELD)); \ if (si->dst_reg == si->src_reg) { \ *insn++ = BPF_JMP_A(2); \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); \ } \ } while (0) #define SOCK_OPS_GET_SK() \ do { \ int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 1; \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == si->src_reg) { \ *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ fullsock_reg = reg; \ jmp += 2; \ } \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_fullsock), \ fullsock_reg, 
si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ is_fullsock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \ if (si->dst_reg == si->src_reg) \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, sk),\ si->dst_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ if (si->dst_reg == si->src_reg) { \ *insn++ = BPF_JMP_A(2); \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); \ } \ } while (0) #define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \ SOCK_OPS_GET_FIELD(FIELD, FIELD, struct tcp_sock) /* Helper macro for adding write access to tcp_sock or sock fields. * The macro is called with two registers, dst_reg which contains a pointer * to ctx (context) and src_reg which contains the value that should be * stored. However, we need an additional register since we cannot overwrite * dst_reg because it may be used later in the program. * Instead we "borrow" one of the other register. We first save its value * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore * it at the end of the macro. */ #define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ do { \ int reg = BPF_REG_9; \ BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \ sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_locked_tcp_sock), \ reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, \ is_locked_tcp_sock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, sk),\ reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ *insn++ = BPF_RAW_INSN(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD) | \ BPF_MEM | BPF_CLASS(si->code), \ reg, si->src_reg, \ offsetof(OBJ, OBJ_FIELD), \ si->imm); \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ } while (0) #define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \ do { \ if (TYPE == BPF_WRITE) \ SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ else \ SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ } while (0) switch (si->off) { case offsetof(struct bpf_sock_ops, op): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, op), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, op)); break; case offsetof(struct bpf_sock_ops, replylong[0]) ... 
offsetof(struct bpf_sock_ops, replylong[3]): BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, reply) != sizeof_field(struct bpf_sock_ops_kern, reply)); BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, replylong) != sizeof_field(struct bpf_sock_ops_kern, replylong)); off = si->off; off -= offsetof(struct bpf_sock_ops, replylong[0]); off += offsetof(struct bpf_sock_ops_kern, replylong[0]); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, off); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); break; case offsetof(struct bpf_sock_ops, family): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_family)); break; case offsetof(struct bpf_sock_ops, remote_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_daddr)); break; case offsetof(struct bpf_sock_ops, local_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_rcv_saddr)); break; case offsetof(struct bpf_sock_ops, remote_ip6[0]) ... offsetof(struct bpf_sock_ops, remote_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_daddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct bpf_sock_ops, remote_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_daddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct bpf_sock_ops, local_ip6[0]) ... 
offsetof(struct bpf_sock_ops, local_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct bpf_sock_ops, local_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct bpf_sock_ops, remote_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_dport)); #ifndef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); #endif break; case offsetof(struct bpf_sock_ops, local_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; case offsetof(struct bpf_sock_ops, is_fullsock): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, is_fullsock), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, is_fullsock)); break; case offsetof(struct bpf_sock_ops, state): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_state) != 1); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_state)); break; case offsetof(struct bpf_sock_ops, rtt_min): BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) != sizeof(struct minmax)); BUILD_BUG_ON(sizeof(struct minmax) < sizeof(struct minmax_sample)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct tcp_sock, rtt_min) + sizeof_field(struct minmax_sample, t)); break; case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags): SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, sk_txhash): SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, struct sock, type); break; case offsetof(struct bpf_sock_ops, snd_cwnd): SOCK_OPS_GET_TCP_SOCK_FIELD(snd_cwnd); break; case offsetof(struct bpf_sock_ops, srtt_us): SOCK_OPS_GET_TCP_SOCK_FIELD(srtt_us); break; case offsetof(struct bpf_sock_ops, snd_ssthresh): SOCK_OPS_GET_TCP_SOCK_FIELD(snd_ssthresh); break; case offsetof(struct bpf_sock_ops, rcv_nxt): SOCK_OPS_GET_TCP_SOCK_FIELD(rcv_nxt); break; case offsetof(struct bpf_sock_ops, snd_nxt): SOCK_OPS_GET_TCP_SOCK_FIELD(snd_nxt); break; case offsetof(struct bpf_sock_ops, snd_una): SOCK_OPS_GET_TCP_SOCK_FIELD(snd_una); break; case offsetof(struct bpf_sock_ops, mss_cache): SOCK_OPS_GET_TCP_SOCK_FIELD(mss_cache); break; case offsetof(struct bpf_sock_ops, ecn_flags): SOCK_OPS_GET_TCP_SOCK_FIELD(ecn_flags); break; case offsetof(struct bpf_sock_ops, rate_delivered): SOCK_OPS_GET_TCP_SOCK_FIELD(rate_delivered); break; case offsetof(struct bpf_sock_ops, rate_interval_us): 
SOCK_OPS_GET_TCP_SOCK_FIELD(rate_interval_us); break; case offsetof(struct bpf_sock_ops, packets_out): SOCK_OPS_GET_TCP_SOCK_FIELD(packets_out); break; case offsetof(struct bpf_sock_ops, retrans_out): SOCK_OPS_GET_TCP_SOCK_FIELD(retrans_out); break; case offsetof(struct bpf_sock_ops, total_retrans): SOCK_OPS_GET_TCP_SOCK_FIELD(total_retrans); break; case offsetof(struct bpf_sock_ops, segs_in): SOCK_OPS_GET_TCP_SOCK_FIELD(segs_in); break; case offsetof(struct bpf_sock_ops, data_segs_in): SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_in); break; case offsetof(struct bpf_sock_ops, segs_out): SOCK_OPS_GET_TCP_SOCK_FIELD(segs_out); break; case offsetof(struct bpf_sock_ops, data_segs_out): SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_out); break; case offsetof(struct bpf_sock_ops, lost_out): SOCK_OPS_GET_TCP_SOCK_FIELD(lost_out); break; case offsetof(struct bpf_sock_ops, sacked_out): SOCK_OPS_GET_TCP_SOCK_FIELD(sacked_out); break; case offsetof(struct bpf_sock_ops, bytes_received): SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_received); break; case offsetof(struct bpf_sock_ops, bytes_acked): SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked); break; case offsetof(struct bpf_sock_ops, sk): SOCK_OPS_GET_SK(); break; case offsetof(struct bpf_sock_ops, skb_data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb_data_end), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, skb_data_end)); break; case offsetof(struct bpf_sock_ops, skb_data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, skb)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), si->dst_reg, si->dst_reg, offsetof(struct sk_buff, data)); break; case offsetof(struct bpf_sock_ops, skb_len): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, skb)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), si->dst_reg, si->dst_reg, offsetof(struct sk_buff, len)); break; case offsetof(struct bpf_sock_ops, skb_tcp_flags): off = offsetof(struct sk_buff, cb); off += offsetof(struct tcp_skb_cb, tcp_flags); *target_size = sizeof_field(struct tcp_skb_cb, tcp_flags); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, skb)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_skb_cb, tcp_flags), si->dst_reg, si->dst_reg, off); break; case offsetof(struct bpf_sock_ops, skb_hwtstamp): { struct bpf_insn *jmp_on_null_skb; *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, skb)); /* Reserve one insn to test skb == NULL */ jmp_on_null_skb = insn++; insn = bpf_convert_shinfo_access(si->dst_reg, si->dst_reg, insn); *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, hwtstamps, 8, target_size)); *jmp_on_null_skb = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, insn - jmp_on_null_skb - 1); break; } } return insn - insn_buf; } /* data_end = skb->data + skb_headlen() */ static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si, struct bpf_insn *insn) { int reg; int temp_reg_off = offsetof(struct sk_buff, cb) + offsetof(struct sk_skb_cb, temp_reg); if (si->src_reg == si->dst_reg) { /* We need an extra register, choose and save a 
register. */ reg = BPF_REG_9; if (si->src_reg == reg || si->dst_reg == reg) reg--; if (si->src_reg == reg || si->dst_reg == reg) reg--; *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, temp_reg_off); } else { reg = si->dst_reg; } /* reg = skb->data */ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), reg, si->src_reg, offsetof(struct sk_buff, data)); /* AX = skb->len */ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), BPF_REG_AX, si->src_reg, offsetof(struct sk_buff, len)); /* reg = skb->data + skb->len */ *insn++ = BPF_ALU64_REG(BPF_ADD, reg, BPF_REG_AX); /* AX = skb->data_len */ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len), BPF_REG_AX, si->src_reg, offsetof(struct sk_buff, data_len)); /* reg = skb->data + skb->len - skb->data_len */ *insn++ = BPF_ALU64_REG(BPF_SUB, reg, BPF_REG_AX); if (si->src_reg == si->dst_reg) { /* Restore the saved register */ *insn++ = BPF_MOV64_REG(BPF_REG_AX, si->src_reg); *insn++ = BPF_MOV64_REG(si->dst_reg, reg); *insn++ = BPF_LDX_MEM(BPF_DW, reg, BPF_REG_AX, temp_reg_off); } return insn; } static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; switch (si->off) { case offsetof(struct __sk_buff, data_end): insn = bpf_convert_data_end_access(si, insn); break; case offsetof(struct __sk_buff, cb[0]) ... offsetofend(struct __sk_buff, cb[4]) - 1: BUILD_BUG_ON(sizeof_field(struct sk_skb_cb, data) < 20); BUILD_BUG_ON((offsetof(struct sk_buff, cb) + offsetof(struct sk_skb_cb, data)) % sizeof(__u64)); prog->cb_access = 1; off = si->off; off -= offsetof(struct __sk_buff, cb[0]); off += offsetof(struct sk_buff, cb); off += offsetof(struct sk_skb_cb, data); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off); else *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, si->src_reg, off); break; default: return bpf_convert_ctx_access(type, si, insn_buf, prog, target_size); } return insn - insn_buf; } static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; #if IS_ENABLED(CONFIG_IPV6) int off; #endif /* convert ctx uses the fact sg element is first in struct */ BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0); switch (si->off) { case offsetof(struct sk_msg_md, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data), si->dst_reg, si->src_reg, offsetof(struct sk_msg, data)); break; case offsetof(struct sk_msg_md, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end), si->dst_reg, si->src_reg, offsetof(struct sk_msg, data_end)); break; case offsetof(struct sk_msg_md, family): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_family)); break; case offsetof(struct sk_msg_md, remote_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_daddr)); break; case offsetof(struct sk_msg_md, local_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_rcv_saddr) != 4); *insn++ = 
BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_rcv_saddr)); break; case offsetof(struct sk_msg_md, remote_ip6[0]) ... offsetof(struct sk_msg_md, remote_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_daddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct sk_msg_md, remote_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_daddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct sk_msg_md, local_ip6[0]) ... offsetof(struct sk_msg_md, local_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct sk_msg_md, local_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct sk_msg_md, remote_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_dport)); #ifndef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); #endif break; case offsetof(struct sk_msg_md, local_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; case offsetof(struct sk_msg_md, size): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_sg, size), si->dst_reg, si->src_reg, offsetof(struct sk_msg_sg, size)); break; case offsetof(struct sk_msg_md, sk): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); break; } return insn - insn_buf; } const struct bpf_verifier_ops sk_filter_verifier_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, .gen_ld_abs = bpf_gen_ld_abs, }; const struct bpf_prog_ops sk_filter_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .get_func_proto = tc_cls_act_func_proto, .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, .gen_ld_abs = bpf_gen_ld_abs, .btf_struct_access = tc_cls_act_btf_struct_access, }; const struct bpf_prog_ops tc_cls_act_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops xdp_verifier_ops = { .get_func_proto = xdp_func_proto, .is_valid_access = xdp_is_valid_access, .convert_ctx_access = xdp_convert_ctx_access, .gen_prologue = bpf_noop_prologue, .btf_struct_access = xdp_btf_struct_access, }; const struct bpf_prog_ops xdp_prog_ops = { .test_run = bpf_prog_test_run_xdp, }; const struct bpf_verifier_ops cg_skb_verifier_ops = { 
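	/* cgroup/skb programs see the generic __sk_buff layout, so they
	 * reuse bpf_convert_ctx_access() for context rewriting.
	 */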
	.get_func_proto = cg_skb_func_proto,
	.is_valid_access = cg_skb_is_valid_access,
	.convert_ctx_access = bpf_convert_ctx_access,
};

const struct bpf_prog_ops cg_skb_prog_ops = {
	.test_run = bpf_prog_test_run_skb,
};

const struct bpf_verifier_ops lwt_in_verifier_ops = {
	.get_func_proto = lwt_in_func_proto,
	.is_valid_access = lwt_is_valid_access,
	.convert_ctx_access = bpf_convert_ctx_access,
};

const struct bpf_prog_ops lwt_in_prog_ops = {
	.test_run = bpf_prog_test_run_skb,
};

const struct bpf_verifier_ops lwt_out_verifier_ops = {
	.get_func_proto = lwt_out_func_proto,
	.is_valid_access = lwt_is_valid_access,
	.convert_ctx_access = bpf_convert_ctx_access,
};

const struct bpf_prog_ops lwt_out_prog_ops = {
	.test_run = bpf_prog_test_run_skb,
};

const struct bpf_verifier_ops lwt_xmit_verifier_ops = {
	.get_func_proto = lwt_xmit_func_proto,
	.is_valid_access = lwt_is_valid_access,
	.convert_ctx_access = bpf_convert_ctx_access,
	.gen_prologue = tc_cls_act_prologue,
};

const struct bpf_prog_ops lwt_xmit_prog_ops = {
	.test_run = bpf_prog_test_run_skb,
};

const struct bpf_verifier_ops lwt_seg6local_verifier_ops = {
	.get_func_proto = lwt_seg6local_func_proto,
	.is_valid_access = lwt_is_valid_access,
	.convert_ctx_access = bpf_convert_ctx_access,
};

const struct bpf_prog_ops lwt_seg6local_prog_ops = {
};

const struct bpf_verifier_ops cg_sock_verifier_ops = {
	.get_func_proto = sock_filter_func_proto,
	.is_valid_access = sock_filter_is_valid_access,
	.convert_ctx_access = bpf_sock_convert_ctx_access,
};

const struct bpf_prog_ops cg_sock_prog_ops = {
};

const struct bpf_verifier_ops cg_sock_addr_verifier_ops = {
	.get_func_proto = sock_addr_func_proto,
	.is_valid_access = sock_addr_is_valid_access,
	.convert_ctx_access = sock_addr_convert_ctx_access,
};

const struct bpf_prog_ops cg_sock_addr_prog_ops = {
};

const struct bpf_verifier_ops sock_ops_verifier_ops = {
	.get_func_proto = sock_ops_func_proto,
	.is_valid_access = sock_ops_is_valid_access,
	.convert_ctx_access = sock_ops_convert_ctx_access,
};

const struct bpf_prog_ops sock_ops_prog_ops = {
};

const struct bpf_verifier_ops sk_skb_verifier_ops = {
	.get_func_proto = sk_skb_func_proto,
	.is_valid_access = sk_skb_is_valid_access,
	.convert_ctx_access = sk_skb_convert_ctx_access,
	.gen_prologue = sk_skb_prologue,
};

const struct bpf_prog_ops sk_skb_prog_ops = {
};

const struct bpf_verifier_ops sk_msg_verifier_ops = {
	.get_func_proto = sk_msg_func_proto,
	.is_valid_access = sk_msg_is_valid_access,
	.convert_ctx_access = sk_msg_convert_ctx_access,
	.gen_prologue = bpf_noop_prologue,
};

const struct bpf_prog_ops sk_msg_prog_ops = {
};

const struct bpf_verifier_ops flow_dissector_verifier_ops = {
	.get_func_proto = flow_dissector_func_proto,
	.is_valid_access = flow_dissector_is_valid_access,
	.convert_ctx_access = flow_dissector_convert_ctx_access,
};

const struct bpf_prog_ops flow_dissector_prog_ops = {
	.test_run = bpf_prog_test_run_flow_dissector,
};

int sk_detach_filter(struct sock *sk)
{
	int ret = -ENOENT;
	struct sk_filter *filter;

	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return -EPERM;

	filter = rcu_dereference_protected(sk->sk_filter, lockdep_sock_is_held(sk));
	if (filter) {
		RCU_INIT_POINTER(sk->sk_filter, NULL);
		sk_filter_uncharge(sk, filter);
		ret = 0;
	}

	return ret;
}
EXPORT_SYMBOL_GPL(sk_detach_filter);

int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len)
{
	struct sock_fprog_kern *fprog;
	struct sk_filter *filter;
	int ret = 0;

	sockopt_lock_sock(sk);
	filter = rcu_dereference_protected(sk->sk_filter, lockdep_sock_is_held(sk));
	if (!filter)
		goto out;

	/* We're
copying the filter that has been originally attached, * so no conversion/decode needed anymore. eBPF programs that * have no original program cannot be dumped through this. */ ret = -EACCES; fprog = filter->prog->orig_prog; if (!fprog) goto out; ret = fprog->len; if (!len) /* User space only enquires number of filter blocks. */ goto out; ret = -EINVAL; if (len < fprog->len) goto out; ret = -EFAULT; if (copy_to_sockptr(optval, fprog->filter, bpf_classic_proglen(fprog))) goto out; /* Instead of bytes, the API requests to return the number * of filter blocks. */ ret = fprog->len; out: sockopt_release_sock(sk); return ret; } #ifdef CONFIG_INET static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, struct sock_reuseport *reuse, struct sock *sk, struct sk_buff *skb, struct sock *migrating_sk, u32 hash) { reuse_kern->skb = skb; reuse_kern->sk = sk; reuse_kern->selected_sk = NULL; reuse_kern->migrating_sk = migrating_sk; reuse_kern->data_end = skb->data + skb_headlen(skb); reuse_kern->hash = hash; reuse_kern->reuseport_id = reuse->reuseport_id; reuse_kern->bind_inany = reuse->bind_inany; } struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, struct sock *migrating_sk, u32 hash) { struct sk_reuseport_kern reuse_kern; enum sk_action action; bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash); action = bpf_prog_run(prog, &reuse_kern); if (action == SK_PASS) return reuse_kern.selected_sk; else return ERR_PTR(-ECONNREFUSED); } BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, struct bpf_map *, map, void *, key, u32, flags) { bool is_sockarray = map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY; struct sock_reuseport *reuse; struct sock *selected_sk; int err; selected_sk = map->ops->map_lookup_elem(map, key); if (!selected_sk) return -ENOENT; reuse = rcu_dereference(selected_sk->sk_reuseport_cb); if (!reuse) { /* reuseport_array has only sk with non NULL sk_reuseport_cb. * The only (!reuse) case here is - the sk has already been * unhashed (e.g. by close()), so treat it as -ENOENT. * * Other maps (e.g. sock_map) do not provide this guarantee and * the sk may never be in the reuseport group to begin with. */ err = is_sockarray ? -ENOENT : -EINVAL; goto error; } if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) { struct sock *sk = reuse_kern->sk; if (sk->sk_protocol != selected_sk->sk_protocol) { err = -EPROTOTYPE; } else if (sk->sk_family != selected_sk->sk_family) { err = -EAFNOSUPPORT; } else { /* Catch all. Likely bound to a different sockaddr. */ err = -EBADFD; } goto error; } reuse_kern->selected_sk = selected_sk; return 0; error: /* Lookup in sock_map can return TCP ESTABLISHED sockets. 
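 * The map lookup takes a reference on such refcounted sockets, so the
 * reference has to be dropped on this error path before returning.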
*/ if (sk_is_refcounted(selected_sk)) sock_put(selected_sk); return err; } static const struct bpf_func_proto sk_select_reuseport_proto = { .func = sk_select_reuseport, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(sk_reuseport_load_bytes, const struct sk_reuseport_kern *, reuse_kern, u32, offset, void *, to, u32, len) { return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len); } static const struct bpf_func_proto sk_reuseport_load_bytes_proto = { .func = sk_reuseport_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; BPF_CALL_5(sk_reuseport_load_bytes_relative, const struct sk_reuseport_kern *, reuse_kern, u32, offset, void *, to, u32, len, u32, start_header) { return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to, len, start_header); } static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = { .func = sk_reuseport_load_bytes_relative, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; static const struct bpf_func_proto * sk_reuseport_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_sk_select_reuseport: return &sk_select_reuseport_proto; case BPF_FUNC_skb_load_bytes: return &sk_reuseport_load_bytes_proto; case BPF_FUNC_skb_load_bytes_relative: return &sk_reuseport_load_bytes_relative_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_ptr_cookie_proto; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: return bpf_base_func_proto(func_id, prog); } } static bool sk_reuseport_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const u32 size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct sk_reuseport_md) || off % size || type != BPF_READ) return false; switch (off) { case offsetof(struct sk_reuseport_md, data): info->reg_type = PTR_TO_PACKET; return size == sizeof(__u64); case offsetof(struct sk_reuseport_md, data_end): info->reg_type = PTR_TO_PACKET_END; return size == sizeof(__u64); case offsetof(struct sk_reuseport_md, hash): return size == size_default; case offsetof(struct sk_reuseport_md, sk): info->reg_type = PTR_TO_SOCKET; return size == sizeof(__u64); case offsetof(struct sk_reuseport_md, migrating_sk): info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; return size == sizeof(__u64); /* Fields that allow narrowing */ case bpf_ctx_range(struct sk_reuseport_md, eth_protocol): if (size < sizeof_field(struct sk_buff, protocol)) return false; fallthrough; case bpf_ctx_range(struct sk_reuseport_md, ip_protocol): case bpf_ctx_range(struct sk_reuseport_md, bind_inany): case bpf_ctx_range(struct sk_reuseport_md, len): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); default: return false; } } #define SK_REUSEPORT_LOAD_FIELD(F) ({ \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \ si->dst_reg, si->src_reg, \ bpf_target_off(struct sk_reuseport_kern, F, \ sizeof_field(struct sk_reuseport_kern, F), \ target_size)); \ }) #define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD) \ SOCK_ADDR_LOAD_NESTED_FIELD(struct 
sk_reuseport_kern, \ struct sk_buff, \ skb, \ SKB_FIELD) #define SK_REUSEPORT_LOAD_SK_FIELD(SK_FIELD) \ SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \ struct sock, \ sk, \ SK_FIELD) static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct sk_reuseport_md, data): SK_REUSEPORT_LOAD_SKB_FIELD(data); break; case offsetof(struct sk_reuseport_md, len): SK_REUSEPORT_LOAD_SKB_FIELD(len); break; case offsetof(struct sk_reuseport_md, eth_protocol): SK_REUSEPORT_LOAD_SKB_FIELD(protocol); break; case offsetof(struct sk_reuseport_md, ip_protocol): SK_REUSEPORT_LOAD_SK_FIELD(sk_protocol); break; case offsetof(struct sk_reuseport_md, data_end): SK_REUSEPORT_LOAD_FIELD(data_end); break; case offsetof(struct sk_reuseport_md, hash): SK_REUSEPORT_LOAD_FIELD(hash); break; case offsetof(struct sk_reuseport_md, bind_inany): SK_REUSEPORT_LOAD_FIELD(bind_inany); break; case offsetof(struct sk_reuseport_md, sk): SK_REUSEPORT_LOAD_FIELD(sk); break; case offsetof(struct sk_reuseport_md, migrating_sk): SK_REUSEPORT_LOAD_FIELD(migrating_sk); break; } return insn - insn_buf; } const struct bpf_verifier_ops sk_reuseport_verifier_ops = { .get_func_proto = sk_reuseport_func_proto, .is_valid_access = sk_reuseport_is_valid_access, .convert_ctx_access = sk_reuseport_convert_ctx_access, }; const struct bpf_prog_ops sk_reuseport_prog_ops = { }; DEFINE_STATIC_KEY_FALSE(bpf_sk_lookup_enabled); EXPORT_SYMBOL(bpf_sk_lookup_enabled); BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx, struct sock *, sk, u64, flags) { if (unlikely(flags & ~(BPF_SK_LOOKUP_F_REPLACE | BPF_SK_LOOKUP_F_NO_REUSEPORT))) return -EINVAL; if (unlikely(sk && sk_is_refcounted(sk))) return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */ if (unlikely(sk && sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN)) return -ESOCKTNOSUPPORT; /* only accept TCP socket in LISTEN */ if (unlikely(sk && sk_is_udp(sk) && sk->sk_state != TCP_CLOSE)) return -ESOCKTNOSUPPORT; /* only accept UDP socket in CLOSE */ /* Check if socket is suitable for packet L3/L4 protocol */ if (sk && sk->sk_protocol != ctx->protocol) return -EPROTOTYPE; if (sk && sk->sk_family != ctx->family && (sk->sk_family == AF_INET || ipv6_only_sock(sk))) return -EAFNOSUPPORT; if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE)) return -EEXIST; /* Select socket as lookup result */ ctx->selected_sk = sk; ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT; return 0; } static const struct bpf_func_proto bpf_sk_lookup_assign_proto = { .func = bpf_sk_lookup_assign, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_SOCKET_OR_NULL, .arg3_type = ARG_ANYTHING, }; static const struct bpf_func_proto * sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_sk_assign: return &bpf_sk_lookup_assign_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; default: return bpf_sk_base_func_proto(func_id, prog); } } static bool sk_lookup_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct bpf_sk_lookup)) return false; if (off % size != 0) return false; if (type != BPF_READ) return false; switch (off) { case 
bpf_ctx_range_ptr(struct bpf_sk_lookup, sk): info->reg_type = PTR_TO_SOCKET_OR_NULL; return size == sizeof(__u64); case bpf_ctx_range(struct bpf_sk_lookup, family): case bpf_ctx_range(struct bpf_sk_lookup, protocol): case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4): case bpf_ctx_range(struct bpf_sk_lookup, local_ip4): case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]): case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]): case bpf_ctx_range(struct bpf_sk_lookup, local_port): case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex): bpf_ctx_record_field_size(info, sizeof(__u32)); return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32)); case bpf_ctx_range(struct bpf_sk_lookup, remote_port): /* Allow 4-byte access to 2-byte field for backward compatibility */ if (size == sizeof(__u32)) return true; bpf_ctx_record_field_size(info, sizeof(__be16)); return bpf_ctx_narrow_access_ok(off, size, sizeof(__be16)); case offsetofend(struct bpf_sk_lookup, remote_port) ... offsetof(struct bpf_sk_lookup, local_ip4) - 1: /* Allow access to zero padding for backward compatibility */ bpf_ctx_record_field_size(info, sizeof(__u16)); return bpf_ctx_narrow_access_ok(off, size, sizeof(__u16)); default: return false; } } static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct bpf_sk_lookup, sk): *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, offsetof(struct bpf_sk_lookup_kern, selected_sk)); break; case offsetof(struct bpf_sk_lookup, family): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, family, 2, target_size)); break; case offsetof(struct bpf_sk_lookup, protocol): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, protocol, 2, target_size)); break; case offsetof(struct bpf_sk_lookup, remote_ip4): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, v4.saddr, 4, target_size)); break; case offsetof(struct bpf_sk_lookup, local_ip4): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, v4.daddr, 4, target_size)); break; case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]): { #if IS_ENABLED(CONFIG_IPV6) int off = si->off; off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]); off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, offsetof(struct bpf_sk_lookup_kern, v6.saddr)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; } case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]): { #if IS_ENABLED(CONFIG_IPV6) int off = si->off; off -= offsetof(struct bpf_sk_lookup, local_ip6[0]); off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, offsetof(struct bpf_sk_lookup_kern, v6.daddr)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; } case offsetof(struct bpf_sk_lookup, remote_port): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, 
bpf_target_off(struct bpf_sk_lookup_kern, sport, 2, target_size)); break; case offsetofend(struct bpf_sk_lookup, remote_port): *target_size = 2; *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); break; case offsetof(struct bpf_sk_lookup, local_port): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, dport, 2, target_size)); break; case offsetof(struct bpf_sk_lookup, ingress_ifindex): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, ingress_ifindex, 4, target_size)); break; } return insn - insn_buf; } const struct bpf_prog_ops sk_lookup_prog_ops = { .test_run = bpf_prog_test_run_sk_lookup, }; const struct bpf_verifier_ops sk_lookup_verifier_ops = { .get_func_proto = sk_lookup_func_proto, .is_valid_access = sk_lookup_is_valid_access, .convert_ctx_access = sk_lookup_convert_ctx_access, }; #endif /* CONFIG_INET */ DEFINE_BPF_DISPATCHER(xdp) void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) { bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog); } BTF_ID_LIST_GLOBAL(btf_sock_ids, MAX_BTF_SOCK_TYPE) #define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type) BTF_SOCK_TYPE_xxx #undef BTF_SOCK_TYPE BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk) { /* tcp6_sock type is not generated in dwarf and hence btf, * trigger an explicit type generation here. */ BTF_TYPE_EMIT(struct tcp6_sock); if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && sk->sk_family == AF_INET6) return (unsigned long)sk; return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = { .func = bpf_skc_to_tcp6_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6], }; BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk) { if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) return (unsigned long)sk; return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = { .func = bpf_skc_to_tcp_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP], }; BPF_CALL_1(bpf_skc_to_tcp_timewait_sock, struct sock *, sk) { /* BTF types for tcp_timewait_sock and inet_timewait_sock are not * generated if CONFIG_INET=n. Trigger an explicit generation here. 
*/ BTF_TYPE_EMIT(struct inet_timewait_sock); BTF_TYPE_EMIT(struct tcp_timewait_sock); #ifdef CONFIG_INET if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_TIME_WAIT) return (unsigned long)sk; #endif #if IS_ENABLED(CONFIG_IPV6) if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_TIME_WAIT) return (unsigned long)sk; #endif return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = { .func = bpf_skc_to_tcp_timewait_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW], }; BPF_CALL_1(bpf_skc_to_tcp_request_sock, struct sock *, sk) { #ifdef CONFIG_INET if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_NEW_SYN_RECV) return (unsigned long)sk; #endif #if IS_ENABLED(CONFIG_IPV6) if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_NEW_SYN_RECV) return (unsigned long)sk; #endif return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = { .func = bpf_skc_to_tcp_request_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ], }; BPF_CALL_1(bpf_skc_to_udp6_sock, struct sock *, sk) { /* udp6_sock type is not generated in dwarf and hence btf, * trigger an explicit type generation here. */ BTF_TYPE_EMIT(struct udp6_sock); if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_UDP && sk->sk_type == SOCK_DGRAM && sk->sk_family == AF_INET6) return (unsigned long)sk; return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = { .func = bpf_skc_to_udp6_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6], }; BPF_CALL_1(bpf_skc_to_unix_sock, struct sock *, sk) { /* unix_sock type is not generated in dwarf and hence btf, * trigger an explicit type generation here. 
*/ BTF_TYPE_EMIT(struct unix_sock); if (sk && sk_is_unix(sk)) return (unsigned long)sk; return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_unix_sock_proto = { .func = bpf_skc_to_unix_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UNIX], }; BPF_CALL_1(bpf_skc_to_mptcp_sock, struct sock *, sk) { BTF_TYPE_EMIT(struct mptcp_sock); return (unsigned long)bpf_mptcp_sock_from_subflow(sk); } const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto = { .func = bpf_skc_to_mptcp_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_MPTCP], }; BPF_CALL_1(bpf_sock_from_file, struct file *, file) { return (unsigned long)sock_from_file(file); } BTF_ID_LIST(bpf_sock_from_file_btf_ids) BTF_ID(struct, socket) BTF_ID(struct, file) const struct bpf_func_proto bpf_sock_from_file_proto = { .func = bpf_sock_from_file, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .ret_btf_id = &bpf_sock_from_file_btf_ids[0], .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_sock_from_file_btf_ids[1], }; static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func; switch (func_id) { case BPF_FUNC_skc_to_tcp6_sock: func = &bpf_skc_to_tcp6_sock_proto; break; case BPF_FUNC_skc_to_tcp_sock: func = &bpf_skc_to_tcp_sock_proto; break; case BPF_FUNC_skc_to_tcp_timewait_sock: func = &bpf_skc_to_tcp_timewait_sock_proto; break; case BPF_FUNC_skc_to_tcp_request_sock: func = &bpf_skc_to_tcp_request_sock_proto; break; case BPF_FUNC_skc_to_udp6_sock: func = &bpf_skc_to_udp6_sock_proto; break; case BPF_FUNC_skc_to_unix_sock: func = &bpf_skc_to_unix_sock_proto; break; case BPF_FUNC_skc_to_mptcp_sock: func = &bpf_skc_to_mptcp_sock_proto; break; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: return bpf_base_func_proto(func_id, prog); } if (!bpf_token_capable(prog->aux->token, CAP_PERFMON)) return NULL; return func; } /** * bpf_skb_meta_pointer() - Gets a mutable pointer within the skb metadata area. * @skb: socket buffer carrying the metadata * @offset: offset into the metadata area, must be <= skb_metadata_len() */ void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset) { return skb_metadata_end(skb) - skb_metadata_len(skb) + offset; } int __bpf_skb_meta_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) { if (unlikely(flags)) return -EINVAL; if (unlikely(bpf_try_make_writable(skb, 0))) return -EFAULT; memmove(bpf_skb_meta_pointer(skb, offset), from, len); return 0; } __bpf_kfunc_start_defs(); __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, struct bpf_dynptr *ptr__uninit) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; struct sk_buff *skb = (struct sk_buff *)s; if (flags) { bpf_dynptr_set_null(ptr); return -EINVAL; } bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len); return 0; } /** * bpf_dynptr_from_skb_meta() - Initialize a dynptr to the skb metadata area. * @skb_: socket buffer carrying the metadata * @flags: future use, must be zero * @ptr__uninit: dynptr to initialize * * Set up a dynptr for access to the metadata area earlier allocated from the * XDP context with bpf_xdp_adjust_meta(). Serves as an alternative to * &__sk_buff->data_meta. 
* * Return: * * %0 - dynptr ready to use * * %-EINVAL - invalid flags, dynptr set to null */ __bpf_kfunc int bpf_dynptr_from_skb_meta(struct __sk_buff *skb_, u64 flags, struct bpf_dynptr *ptr__uninit) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; struct sk_buff *skb = (struct sk_buff *)skb_; if (flags) { bpf_dynptr_set_null(ptr); return -EINVAL; } bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB_META, 0, skb_metadata_len(skb)); return 0; } __bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags, struct bpf_dynptr *ptr__uninit) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; struct xdp_buff *xdp = (struct xdp_buff *)x; if (flags) { bpf_dynptr_set_null(ptr); return -EINVAL; } bpf_dynptr_init(ptr, xdp, BPF_DYNPTR_TYPE_XDP, 0, xdp_get_buff_len(xdp)); return 0; } __bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern, const u8 *sun_path, u32 sun_path__sz) { struct sockaddr_un *un; if (sa_kern->sk->sk_family != AF_UNIX) return -EINVAL; /* We do not allow changing the address to unnamed or larger than the * maximum allowed address size for a unix sockaddr. */ if (sun_path__sz == 0 || sun_path__sz > UNIX_PATH_MAX) return -EINVAL; un = (struct sockaddr_un *)sa_kern->uaddr; memcpy(un->sun_path, sun_path, sun_path__sz); sa_kern->uaddrlen = offsetof(struct sockaddr_un, sun_path) + sun_path__sz; return 0; } __bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct __sk_buff *s, struct sock *sk, struct bpf_tcp_req_attrs *attrs, int attrs__sz) { #if IS_ENABLED(CONFIG_SYN_COOKIES) struct sk_buff *skb = (struct sk_buff *)s; const struct request_sock_ops *ops; struct inet_request_sock *ireq; struct tcp_request_sock *treq; struct request_sock *req; struct net *net; __u16 min_mss; u32 tsoff = 0; if (attrs__sz != sizeof(*attrs) || attrs->reserved[0] || attrs->reserved[1] || attrs->reserved[2]) return -EINVAL; if (!skb_at_tc_ingress(skb)) return -EINVAL; net = dev_net(skb->dev); if (net != sock_net(sk)) return -ENETUNREACH; switch (skb->protocol) { case htons(ETH_P_IP): ops = &tcp_request_sock_ops; min_mss = 536; break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): ops = &tcp6_request_sock_ops; min_mss = IPV6_MIN_MTU - 60; break; #endif default: return -EINVAL; } if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_LISTEN || sk_is_mptcp(sk)) return -EINVAL; if (attrs->mss < min_mss) return -EINVAL; if (attrs->wscale_ok) { if (!READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) return -EINVAL; if (attrs->snd_wscale > TCP_MAX_WSCALE || attrs->rcv_wscale > TCP_MAX_WSCALE) return -EINVAL; } if (attrs->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack)) return -EINVAL; if (attrs->tstamp_ok) { if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps)) return -EINVAL; tsoff = attrs->rcv_tsecr - tcp_ns_to_ts(attrs->usec_ts_ok, tcp_clock_ns()); } req = inet_reqsk_alloc(ops, sk, false); if (!req) return -ENOMEM; ireq = inet_rsk(req); treq = tcp_rsk(req); req->rsk_listener = sk; req->syncookie = 1; req->mss = attrs->mss; req->ts_recent = attrs->rcv_tsval; ireq->snd_wscale = attrs->snd_wscale; ireq->rcv_wscale = attrs->rcv_wscale; ireq->tstamp_ok = !!attrs->tstamp_ok; ireq->sack_ok = !!attrs->sack_ok; ireq->wscale_ok = !!attrs->wscale_ok; ireq->ecn_ok = !!attrs->ecn_ok; treq->req_usec_ts = !!attrs->usec_ts_ok; treq->ts_off = tsoff; skb_orphan(skb); skb->sk = req_to_sk(req); skb->destructor = sock_pfree; return 0; #else return -EOPNOTSUPP; #endif } __bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops, u64 flags) { struct sk_buff *skb; if 
(skops->op != BPF_SOCK_OPS_TSTAMP_SENDMSG_CB) return -EOPNOTSUPP; if (flags) return -EINVAL; skb = skops->skb; skb_shinfo(skb)->tx_flags |= SKBTX_BPF; TCP_SKB_CB(skb)->txstamp_ack |= TSTAMP_ACK_BPF; skb_shinfo(skb)->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; return 0; } /** * bpf_xdp_pull_data() - Pull in non-linear xdp data. * @x: &xdp_md associated with the XDP buffer * @len: length of data to be made directly accessible in the linear part * * Pull in data in case the XDP buffer associated with @x is non-linear and * not all @len are in the linear data area. * * Direct packet access allows reading and writing linear XDP data through * packet pointers (i.e., &xdp_md->data + offsets). The amount of data which * ends up in the linear part of the xdp_buff depends on the NIC and its * configuration. When a frag-capable XDP program wants to directly access * headers that may be in the non-linear area, call this kfunc to make sure * the data is available in the linear area. Alternatively, use dynptr or * bpf_xdp_{load,store}_bytes() to access data without pulling. * * This kfunc can also be used with bpf_xdp_adjust_head() to decapsulate * headers in the non-linear data area. * * A call to this kfunc may reduce headroom. If there is not enough tailroom * in the linear data area, metadata and data will be shifted down. * * A call to this kfunc is susceptible to change the buffer geometry. * Therefore, at load time, all checks on pointers previously done by the * verifier are invalidated and must be performed again, if the kfunc is used * in combination with direct packet access. * * Return: * * %0 - success * * %-EINVAL - invalid len */ __bpf_kfunc int bpf_xdp_pull_data(struct xdp_md *x, u32 len) { struct xdp_buff *xdp = (struct xdp_buff *)x; struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); int i, delta, shift, headroom, tailroom, n_frags_free = 0; void *data_hard_end = xdp_data_hard_end(xdp); int data_len = xdp->data_end - xdp->data; void *start; if (len <= data_len) return 0; if (unlikely(len > xdp_get_buff_len(xdp))) return -EINVAL; start = xdp_data_meta_unsupported(xdp) ? 
xdp->data : xdp->data_meta; headroom = start - xdp->data_hard_start - sizeof(struct xdp_frame); tailroom = data_hard_end - xdp->data_end; delta = len - data_len; if (unlikely(delta > tailroom + headroom)) return -EINVAL; shift = delta - tailroom; if (shift > 0) { memmove(start - shift, start, xdp->data_end - start); xdp->data_meta -= shift; xdp->data -= shift; xdp->data_end -= shift; } for (i = 0; i < sinfo->nr_frags && delta; i++) { skb_frag_t *frag = &sinfo->frags[i]; u32 shrink = min_t(u32, delta, skb_frag_size(frag)); memcpy(xdp->data_end, skb_frag_address(frag), shrink); xdp->data_end += shrink; sinfo->xdp_frags_size -= shrink; delta -= shrink; if (bpf_xdp_shrink_data(xdp, frag, shrink, false)) n_frags_free++; } if (unlikely(n_frags_free)) { memmove(sinfo->frags, sinfo->frags + n_frags_free, (sinfo->nr_frags - n_frags_free) * sizeof(skb_frag_t)); sinfo->nr_frags -= n_frags_free; if (!sinfo->nr_frags) { xdp_buff_clear_frags_flag(xdp); xdp_buff_clear_frag_pfmemalloc(xdp); } } return 0; } __bpf_kfunc_end_defs(); int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, struct bpf_dynptr *ptr__uninit) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; int err; err = bpf_dynptr_from_skb(skb, flags, ptr__uninit); if (err) return err; bpf_dynptr_set_rdonly(ptr); return 0; } BTF_KFUNCS_START(bpf_kfunc_check_set_skb) BTF_ID_FLAGS(func, bpf_dynptr_from_skb) BTF_KFUNCS_END(bpf_kfunc_check_set_skb) BTF_KFUNCS_START(bpf_kfunc_check_set_skb_meta) BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta) BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta) BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) BTF_ID_FLAGS(func, bpf_dynptr_from_xdp) BTF_ID_FLAGS(func, bpf_xdp_pull_data) BTF_KFUNCS_END(bpf_kfunc_check_set_xdp) BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr) BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path) BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr) BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk) BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk) BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk) BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops) BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp) BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops) static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_skb, }; static const struct btf_kfunc_id_set bpf_kfunc_set_skb_meta = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_skb_meta, }; static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_xdp, }; static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_sock_addr, }; static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_tcp_reqsk, }; static const struct btf_kfunc_id_set bpf_kfunc_set_sock_ops = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_sock_ops, }; static int __init bpf_kfunc_init(void) { int ret; ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SK_SKB, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCKET_FILTER, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_OUT, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_IN, &bpf_kfunc_set_skb); ret = ret ?: 
register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb_meta); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb_meta); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, &bpf_kfunc_set_sock_addr); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk); return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCK_OPS, &bpf_kfunc_set_sock_ops); } late_initcall(bpf_kfunc_init); __bpf_kfunc_start_defs(); /* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code. * * The function expects a non-NULL pointer to a socket, and invokes the * protocol specific socket destroy handlers. * * The helper can only be called from BPF contexts that have acquired the socket * locks. * * Parameters: * @sock: Pointer to socket to be destroyed * * Return: * On error, may return EPROTONOSUPPORT, EINVAL. * EPROTONOSUPPORT if protocol specific destroy handler is not supported. * 0 otherwise */ __bpf_kfunc int bpf_sock_destroy(struct sock_common *sock) { struct sock *sk = (struct sock *)sock; /* The locking semantics that allow for synchronous execution of the * destroy handlers are only supported for TCP and UDP. * Supporting protocols will need to acquire sock lock in the BPF context * prior to invoking this kfunc. */ if (!sk->sk_prot->diag_destroy || (sk->sk_protocol != IPPROTO_TCP && sk->sk_protocol != IPPROTO_UDP)) return -EOPNOTSUPP; return sk->sk_prot->diag_destroy(sk, ECONNABORTED); } __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids) BTF_ID_FLAGS(func, bpf_sock_destroy) BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids) static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id) { if (btf_id_set8_contains(&bpf_sk_iter_kfunc_ids, kfunc_id) && prog->expected_attach_type != BPF_TRACE_ITER) return -EACCES; return 0; } static const struct btf_kfunc_id_set bpf_sk_iter_kfunc_set = { .owner = THIS_MODULE, .set = &bpf_sk_iter_kfunc_ids, .filter = tracing_iter_filter, }; static int init_subsystem(void) { return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sk_iter_kfunc_set); } late_initcall(init_subsystem); |
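A minimal usage sketch of the sk_lookup machinery defined above (sk_lookup_verifier_ops, bpf_sk_lookup_assign): a BPF-side program that steers new connections on one local port to a pre-inserted listening socket via the bpf_sk_assign() helper. This is an illustrative sketch, not part of the kernel sources; it assumes a libbpf-style build with <bpf/bpf_helpers.h> and a user-space loader that places the listening socket's FD into the sockmap. The map name "service_socket", the function name, and port 7007 are arbitrary examples.

/*
 * Illustrative sk_lookup program (sketch). The BPF_FUNC_sk_assign helper it
 * calls is serviced by bpf_sk_lookup_assign() shown earlier: with flags == 0,
 * assigning over an already-selected socket is rejected with -EEXIST.
 */
#include <linux/bpf.h>
#include <linux/in.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_SOCKMAP);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, __u64);
} service_socket SEC(".maps");

SEC("sk_lookup")
int steer_to_service(struct bpf_sk_lookup *ctx)
{
	struct bpf_sock *sk;
	__u32 key = 0;
	long err;

	/* Only steer TCP lookups destined to the example port. */
	if (ctx->protocol != IPPROTO_TCP || ctx->local_port != 7007)
		return SK_PASS;

	sk = bpf_map_lookup_elem(&service_socket, &key);
	if (!sk)
		return SK_PASS;

	/* Select the socket as the lookup result, then drop our reference. */
	err = bpf_sk_assign(ctx, sk, 0);
	bpf_sk_release(sk);
	return err ? SK_DROP : SK_PASS;
}

char _license[] SEC("license") = "GPL";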
/* SPDX-License-Identifier: GPL-2.0 */ /* * Written by Mark Hemment, 1996 (markhe@nextd.demon.co.uk). * * (C) SGI 2006, Christoph Lameter * Cleaned up and restructured to ease the addition of alternative * implementations of SLAB allocators. * (C) Linux Foundation 2008-2013 * Unified interface for all slab allocators */ #ifndef _LINUX_SLAB_H #define _LINUX_SLAB_H #include <linux/bug.h> #include <linux/cache.h> #include <linux/gfp.h> #include <linux/overflow.h> #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/workqueue.h> #include <linux/percpu-refcount.h> #include <linux/cleanup.h> #include <linux/hash.h> enum _slab_flag_bits { _SLAB_CONSISTENCY_CHECKS, _SLAB_RED_ZONE, _SLAB_POISON, _SLAB_KMALLOC, _SLAB_HWCACHE_ALIGN, _SLAB_CACHE_DMA, _SLAB_CACHE_DMA32, _SLAB_STORE_USER, _SLAB_PANIC, _SLAB_TYPESAFE_BY_RCU, _SLAB_TRACE, #ifdef CONFIG_DEBUG_OBJECTS _SLAB_DEBUG_OBJECTS, #endif _SLAB_NOLEAKTRACE, _SLAB_NO_MERGE, #ifdef CONFIG_FAILSLAB _SLAB_FAILSLAB, #endif #ifdef CONFIG_MEMCG _SLAB_ACCOUNT, #endif #ifdef CONFIG_KASAN_GENERIC _SLAB_KASAN, #endif _SLAB_NO_USER_FLAGS, #ifdef CONFIG_KFENCE _SLAB_SKIP_KFENCE, #endif #ifndef CONFIG_SLUB_TINY _SLAB_RECLAIM_ACCOUNT, #endif _SLAB_OBJECT_POISON, _SLAB_CMPXCHG_DOUBLE, _SLAB_NO_OBJ_EXT, #if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT) _SLAB_OBJ_EXT_IN_OBJ, #endif _SLAB_FLAGS_LAST_BIT }; #define __SLAB_FLAG_BIT(nr) ((slab_flags_t __force)(1U << (nr))) #define __SLAB_FLAG_UNUSED ((slab_flags_t __force)(0U)) /* * Flags to pass to kmem_cache_create().
* The ones marked DEBUG need CONFIG_SLUB_DEBUG enabled, otherwise are no-op */ /* DEBUG: Perform (expensive) checks on alloc/free */ #define SLAB_CONSISTENCY_CHECKS __SLAB_FLAG_BIT(_SLAB_CONSISTENCY_CHECKS) /* DEBUG: Red zone objs in a cache */ #define SLAB_RED_ZONE __SLAB_FLAG_BIT(_SLAB_RED_ZONE) /* DEBUG: Poison objects */ #define SLAB_POISON __SLAB_FLAG_BIT(_SLAB_POISON) /* Indicate a kmalloc slab */ #define SLAB_KMALLOC __SLAB_FLAG_BIT(_SLAB_KMALLOC) /** * define SLAB_HWCACHE_ALIGN - Align objects on cache line boundaries. * * Sufficiently large objects are aligned on cache line boundary. For object * size smaller than a half of cache line size, the alignment is on the half of * cache line size. In general, if object size is smaller than 1/2^n of cache * line size, the alignment is adjusted to 1/2^n. * * If explicit alignment is also requested by the respective * &struct kmem_cache_args field, the greater of both is alignments is applied. */ #define SLAB_HWCACHE_ALIGN __SLAB_FLAG_BIT(_SLAB_HWCACHE_ALIGN) /* Use GFP_DMA memory */ #define SLAB_CACHE_DMA __SLAB_FLAG_BIT(_SLAB_CACHE_DMA) /* Use GFP_DMA32 memory */ #define SLAB_CACHE_DMA32 __SLAB_FLAG_BIT(_SLAB_CACHE_DMA32) /* DEBUG: Store the last owner for bug hunting */ #define SLAB_STORE_USER __SLAB_FLAG_BIT(_SLAB_STORE_USER) /* Panic if kmem_cache_create() fails */ #define SLAB_PANIC __SLAB_FLAG_BIT(_SLAB_PANIC) /** * define SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS! * * This delays freeing the SLAB page by a grace period, it does _NOT_ * delay object freeing. This means that if you do kmem_cache_free() * that memory location is free to be reused at any time. Thus it may * be possible to see another object there in the same RCU grace period. * * This feature only ensures the memory location backing the object * stays valid, the trick to using this is relying on an independent * object validation pass. Something like: * * :: * * begin: * rcu_read_lock(); * obj = lockless_lookup(key); * if (obj) { * if (!try_get_ref(obj)) // might fail for free objects * rcu_read_unlock(); * goto begin; * * if (obj->key != key) { // not the object we expected * put_ref(obj); * rcu_read_unlock(); * goto begin; * } * } * rcu_read_unlock(); * * This is useful if we need to approach a kernel structure obliquely, * from its address obtained without the usual locking. We can lock * the structure to stabilize it and check it's still at the given address, * only if we can be sure that the memory has not been meanwhile reused * for some other kind of object (which our subsystem's lock might corrupt). * * rcu_read_lock before reading the address, then rcu_read_unlock after * taking the spinlock within the structure expected at that address. * * Note that object identity check has to be done *after* acquiring a * reference, therefore user has to ensure proper ordering for loads. * Similarly, when initializing objects allocated with SLAB_TYPESAFE_BY_RCU, * the newly allocated object has to be fully initialized *before* its * refcount gets initialized and proper ordering for stores is required. * refcount_{add|inc}_not_zero_acquire() and refcount_set_release() are * designed with the proper fences required for reference counting objects * allocated with SLAB_TYPESAFE_BY_RCU. * * Note that it is not possible to acquire a lock within a structure * allocated with SLAB_TYPESAFE_BY_RCU without first acquiring a reference * as described above. 
The reason is that SLAB_TYPESAFE_BY_RCU pages * are not zeroed before being given to the slab, which means that any * locks must be initialized after each and every kmem_struct_alloc(). * Alternatively, make the ctor passed to kmem_cache_create() initialize * the locks at page-allocation time, as is done in __i915_request_ctor(), * sighand_ctor(), and anon_vma_ctor(). Such a ctor permits readers * to safely acquire those ctor-initialized locks under rcu_read_lock() * protection. * * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU. */ #define SLAB_TYPESAFE_BY_RCU __SLAB_FLAG_BIT(_SLAB_TYPESAFE_BY_RCU) /* Trace allocations and frees */ #define SLAB_TRACE __SLAB_FLAG_BIT(_SLAB_TRACE) /* Flag to prevent checks on free */ #ifdef CONFIG_DEBUG_OBJECTS # define SLAB_DEBUG_OBJECTS __SLAB_FLAG_BIT(_SLAB_DEBUG_OBJECTS) #else # define SLAB_DEBUG_OBJECTS __SLAB_FLAG_UNUSED #endif /* Avoid kmemleak tracing */ #define SLAB_NOLEAKTRACE __SLAB_FLAG_BIT(_SLAB_NOLEAKTRACE) /* * Prevent merging with compatible kmem caches. This flag should be used * cautiously. Valid use cases: * * - caches created for self-tests (e.g. kunit) * - general caches created and used by a subsystem, only when a * (subsystem-specific) debug option is enabled * - performance critical caches, should be very rare and consulted with slab * maintainers, and not used together with CONFIG_SLUB_TINY */ #define SLAB_NO_MERGE __SLAB_FLAG_BIT(_SLAB_NO_MERGE) /* Fault injection mark */ #ifdef CONFIG_FAILSLAB # define SLAB_FAILSLAB __SLAB_FLAG_BIT(_SLAB_FAILSLAB) #else # define SLAB_FAILSLAB __SLAB_FLAG_UNUSED #endif /** * define SLAB_ACCOUNT - Account allocations to memcg. * * All object allocations from this cache will be memcg accounted, regardless of * __GFP_ACCOUNT being or not being passed to individual allocations. */ #ifdef CONFIG_MEMCG # define SLAB_ACCOUNT __SLAB_FLAG_BIT(_SLAB_ACCOUNT) #else # define SLAB_ACCOUNT __SLAB_FLAG_UNUSED #endif #ifdef CONFIG_KASAN_GENERIC #define SLAB_KASAN __SLAB_FLAG_BIT(_SLAB_KASAN) #else #define SLAB_KASAN __SLAB_FLAG_UNUSED #endif /* * Ignore user specified debugging flags. * Intended for caches created for self-tests so they have only flags * specified in the code and other flags are ignored. */ #define SLAB_NO_USER_FLAGS __SLAB_FLAG_BIT(_SLAB_NO_USER_FLAGS) #ifdef CONFIG_KFENCE #define SLAB_SKIP_KFENCE __SLAB_FLAG_BIT(_SLAB_SKIP_KFENCE) #else #define SLAB_SKIP_KFENCE __SLAB_FLAG_UNUSED #endif /* The following flags affect the page allocator grouping pages by mobility */ /** * define SLAB_RECLAIM_ACCOUNT - Objects are reclaimable. * * Use this flag for caches that have an associated shrinker. As a result, slab * pages are allocated with __GFP_RECLAIMABLE, which affects grouping pages by * mobility, and are accounted in SReclaimable counter in /proc/meminfo */ #ifndef CONFIG_SLUB_TINY #define SLAB_RECLAIM_ACCOUNT __SLAB_FLAG_BIT(_SLAB_RECLAIM_ACCOUNT) #else #define SLAB_RECLAIM_ACCOUNT __SLAB_FLAG_UNUSED #endif #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ /* Slab created using create_boot_cache */ #define SLAB_NO_OBJ_EXT __SLAB_FLAG_BIT(_SLAB_NO_OBJ_EXT) #if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT) #define SLAB_OBJ_EXT_IN_OBJ __SLAB_FLAG_BIT(_SLAB_OBJ_EXT_IN_OBJ) #else #define SLAB_OBJ_EXT_IN_OBJ __SLAB_FLAG_UNUSED #endif /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. * * Dereferencing ZERO_SIZE_PTR will lead to a distinct access fault. * * ZERO_SIZE_PTR can be passed to kfree though in the same way that NULL can. 
* Both make kfree a no-op. */ #define ZERO_SIZE_PTR ((void *)16) #define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \ (unsigned long)ZERO_SIZE_PTR) #include <linux/kasan.h> struct list_lru; struct mem_cgroup; /* * struct kmem_cache related prototypes */ bool slab_is_available(void); /** * struct kmem_cache_args - Less common arguments for kmem_cache_create() * * Any uninitialized fields of the structure are interpreted as unused. The * exception is @freeptr_offset where %0 is a valid value, so * @use_freeptr_offset must be also set to %true in order to interpret the field * as used. For @useroffset %0 is also valid, but only with non-%0 * @usersize. * * When %NULL args is passed to kmem_cache_create(), it is equivalent to all * fields unused. */ struct kmem_cache_args { /** * @align: The required alignment for the objects. * * %0 means no specific alignment is requested. */ unsigned int align; /** * @useroffset: Usercopy region offset. * * %0 is a valid offset, when @usersize is non-%0 */ unsigned int useroffset; /** * @usersize: Usercopy region size. * * %0 means no usercopy region is specified. */ unsigned int usersize; /** * @freeptr_offset: Custom offset for the free pointer * in caches with &SLAB_TYPESAFE_BY_RCU or @ctor * * By default, &SLAB_TYPESAFE_BY_RCU and @ctor caches place the free * pointer outside of the object. This might cause the object to grow * in size. Cache creators that have a reason to avoid this can specify * a custom free pointer offset in their data structure where the free * pointer will be placed. * * For caches with &SLAB_TYPESAFE_BY_RCU, the caller must ensure that * the free pointer does not overlay fields required to guard against * object recycling (See &SLAB_TYPESAFE_BY_RCU for details). * * For caches with @ctor, the caller must ensure that the free pointer * does not overlay fields initialized by the constructor. * * Currently, only caches with &SLAB_TYPESAFE_BY_RCU or @ctor * may specify @freeptr_offset. * * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset * is specified, @use_freeptr_offset must be set %true. */ unsigned int freeptr_offset; /** * @use_freeptr_offset: Whether a @freeptr_offset is used. */ bool use_freeptr_offset; /** * @ctor: A constructor for the objects. * * The constructor is invoked for each object in a newly allocated slab * page. It is the cache user's responsibility to free object in the * same state as after calling the constructor, or deal appropriately * with any differences between a freshly constructed and a reallocated * object. * * %NULL means no constructor. */ void (*ctor)(void *); /** * @sheaf_capacity: Enable sheaves of given capacity for the cache. * * With a non-zero value, allocations from the cache go through caching * arrays called sheaves. Each cpu has a main sheaf that's always * present, and a spare sheaf that may be not present. When both become * empty, there's an attempt to replace an empty sheaf with a full sheaf * from the per-node barn. * * When no full sheaf is available, and gfp flags allow blocking, a * sheaf is allocated and filled from slab(s) using bulk allocation. * Otherwise the allocation falls back to the normal operation * allocating a single object from a slab. * * Analogically when freeing and both percpu sheaves are full, the barn * may replace it with an empty sheaf, unless it's over capacity. In * that case a sheaf is bulk freed to slab pages. 
* * The sheaves do not enforce NUMA placement of objects, so allocations * via kmem_cache_alloc_node() with a node specified other than * NUMA_NO_NODE will bypass them. * * Bulk allocation and free operations also try to use the cpu sheaves * and barn, but fallback to using slab pages directly. * * When slub_debug is enabled for the cache, the sheaf_capacity argument * is ignored. * * %0 means no sheaves will be created. */ unsigned int sheaf_capacity; }; struct kmem_cache *__kmem_cache_create_args(const char *name, unsigned int object_size, struct kmem_cache_args *args, slab_flags_t flags); static inline struct kmem_cache * __kmem_cache_create(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)) { struct kmem_cache_args kmem_args = { .align = align, .ctor = ctor, }; return __kmem_cache_create_args(name, size, &kmem_args, flags); } /** * kmem_cache_create_usercopy - Create a kmem cache with a region suitable * for copying to userspace. * @name: A string which is used in /proc/slabinfo to identify this cache. * @size: The size of objects to be created in this cache. * @align: The required alignment for the objects. * @flags: SLAB flags * @useroffset: Usercopy region offset * @usersize: Usercopy region size * @ctor: A constructor for the objects, or %NULL. * * This is a legacy wrapper, new code should use either KMEM_CACHE_USERCOPY() * if whitelisting a single field is sufficient, or kmem_cache_create() with * the necessary parameters passed via the args parameter (see * &struct kmem_cache_args) * * Return: a pointer to the cache on success, NULL on failure. */ static inline struct kmem_cache * kmem_cache_create_usercopy(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, unsigned int useroffset, unsigned int usersize, void (*ctor)(void *)) { struct kmem_cache_args kmem_args = { .align = align, .ctor = ctor, .useroffset = useroffset, .usersize = usersize, }; return __kmem_cache_create_args(name, size, &kmem_args, flags); } /* If NULL is passed for @args, use this variant with default arguments. */ static inline struct kmem_cache * __kmem_cache_default_args(const char *name, unsigned int size, struct kmem_cache_args *args, slab_flags_t flags) { struct kmem_cache_args kmem_default_args = {}; /* Make sure we don't get passed garbage. */ if (WARN_ON_ONCE(args)) return ERR_PTR(-EINVAL); return __kmem_cache_create_args(name, size, &kmem_default_args, flags); } /** * kmem_cache_create - Create a kmem cache. * @__name: A string which is used in /proc/slabinfo to identify this cache. * @__object_size: The size of objects to be created in this cache. * @__args: Optional arguments, see &struct kmem_cache_args. Passing %NULL * means defaults will be used for all the arguments. * * This is currently implemented as a macro using ``_Generic()`` to call * either the new variant of the function, or a legacy one. * * The new variant has 4 parameters: * ``kmem_cache_create(name, object_size, args, flags)`` * * See __kmem_cache_create_args() which implements this. * * The legacy variant has 5 parameters: * ``kmem_cache_create(name, object_size, align, flags, ctor)`` * * The align and ctor parameters map to the respective fields of * &struct kmem_cache_args * * Context: Cannot be called within a interrupt, but can be interrupted. * * Return: a pointer to the cache on success, NULL on failure. */ #define kmem_cache_create(__name, __object_size, __args, ...) 
\ _Generic((__args), \ struct kmem_cache_args *: __kmem_cache_create_args, \ void *: __kmem_cache_default_args, \ default: __kmem_cache_create)(__name, __object_size, __args, __VA_ARGS__) void kmem_cache_destroy(struct kmem_cache *s); int kmem_cache_shrink(struct kmem_cache *s); /* * Please use this macro to create slab caches. Simply specify the * name of the structure and maybe some flags that are listed above. * * The alignment of the struct determines object alignment. If you * f.e. add ____cacheline_aligned_in_smp to the struct declaration * then the objects will be properly aligned in SMP configurations. */ #define KMEM_CACHE(__struct, __flags) \ __kmem_cache_create_args(#__struct, sizeof(struct __struct), \ &(struct kmem_cache_args) { \ .align = __alignof__(struct __struct), \ }, (__flags)) /* * To whitelist a single field for copying to/from usercopy, use this * macro instead for KMEM_CACHE() above. */ #define KMEM_CACHE_USERCOPY(__struct, __flags, __field) \ __kmem_cache_create_args(#__struct, sizeof(struct __struct), \ &(struct kmem_cache_args) { \ .align = __alignof__(struct __struct), \ .useroffset = offsetof(struct __struct, __field), \ .usersize = sizeof_field(struct __struct, __field), \ }, (__flags)) /* * Common kmalloc functions provided by all allocators */ void * __must_check krealloc_node_align_noprof(const void *objp, size_t new_size, unsigned long align, gfp_t flags, int nid) __realloc_size(2); #define krealloc_noprof(_o, _s, _f) krealloc_node_align_noprof(_o, _s, 1, _f, NUMA_NO_NODE) #define krealloc_node_align(...) alloc_hooks(krealloc_node_align_noprof(__VA_ARGS__)) #define krealloc_node(_o, _s, _f, _n) krealloc_node_align(_o, _s, 1, _f, _n) #define krealloc(...) krealloc_node(__VA_ARGS__, NUMA_NO_NODE) void kfree(const void *objp); void kfree_nolock(const void *objp); void kfree_sensitive(const void *objp); DEFINE_FREE(kfree, void *, if (!IS_ERR_OR_NULL(_T)) kfree(_T)) DEFINE_FREE(kfree_sensitive, void *, if (_T) kfree_sensitive(_T)) size_t ksize(const void *objp); #ifdef CONFIG_PRINTK bool kmem_dump_obj(void *object); #else static inline bool kmem_dump_obj(void *object) { return false; } #endif /* * Some archs want to perform DMA into kmalloc caches and need a guaranteed * alignment larger than the alignment of a 64-bit integer. * Setting ARCH_DMA_MINALIGN in arch headers allows that. */ #ifdef ARCH_HAS_DMA_MINALIGN #if ARCH_DMA_MINALIGN > 8 && !defined(ARCH_KMALLOC_MINALIGN) #define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN #endif #endif #ifndef ARCH_KMALLOC_MINALIGN #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) #elif ARCH_KMALLOC_MINALIGN > 8 #define KMALLOC_MIN_SIZE ARCH_KMALLOC_MINALIGN #define KMALLOC_SHIFT_LOW ilog2(KMALLOC_MIN_SIZE) #endif /* * Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment. * Intended for arches that get misalignment faults even for 64 bit integer * aligned buffers. */ #ifndef ARCH_SLAB_MINALIGN #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) #endif /* * Arches can define this function if they want to decide the minimum slab * alignment at runtime. The value returned by the function must be a power * of two and >= ARCH_SLAB_MINALIGN. */ #ifndef arch_slab_minalign static inline unsigned int arch_slab_minalign(void) { return ARCH_SLAB_MINALIGN; } #endif /* * kmem_cache_alloc and friends return pointers aligned to ARCH_SLAB_MINALIGN. * kmalloc and friends return pointers aligned to both ARCH_KMALLOC_MINALIGN * and ARCH_SLAB_MINALIGN, but here we only assume the former alignment. 
*/ #define __assume_kmalloc_alignment __assume_aligned(ARCH_KMALLOC_MINALIGN) #define __assume_slab_alignment __assume_aligned(ARCH_SLAB_MINALIGN) #define __assume_page_alignment __assume_aligned(PAGE_SIZE) /* * Kmalloc array related definitions */ /* * SLUB directly allocates requests fitting in to an order-1 page * (PAGE_SIZE*2). Larger requests are passed to the page allocator. */ #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) #define KMALLOC_SHIFT_MAX (MAX_PAGE_ORDER + PAGE_SHIFT) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif /* Maximum allocatable size */ #define KMALLOC_MAX_SIZE (1UL << KMALLOC_SHIFT_MAX) /* Maximum size for which we actually use a slab cache */ #define KMALLOC_MAX_CACHE_SIZE (1UL << KMALLOC_SHIFT_HIGH) /* Maximum order allocatable via the slab allocator */ #define KMALLOC_MAX_ORDER (KMALLOC_SHIFT_MAX - PAGE_SHIFT) /* * Kmalloc subsystem. */ #ifndef KMALLOC_MIN_SIZE #define KMALLOC_MIN_SIZE (1 << KMALLOC_SHIFT_LOW) #endif /* * This restriction comes from byte sized index implementation. * Page size is normally 2^12 bytes and, in this case, if we want to use * byte sized index which can represent 2^8 entries, the size of the object * should be equal or greater to 2^12 / 2^8 = 2^4 = 16. * If minimum size of kmalloc is less than 16, we use it as minimum object * size and give up to use byte sized index. */ #define SLAB_OBJ_MIN_SIZE (KMALLOC_MIN_SIZE < 16 ? \ (KMALLOC_MIN_SIZE) : 16) #ifdef CONFIG_RANDOM_KMALLOC_CACHES #define RANDOM_KMALLOC_CACHES_NR 15 // # of cache copies #else #define RANDOM_KMALLOC_CACHES_NR 0 #endif /* * Whenever changing this, take care of that kmalloc_type() and * create_kmalloc_caches() still work as intended. * * KMALLOC_NORMAL can contain only unaccounted objects whereas KMALLOC_CGROUP * is for accounted but unreclaimable and non-dma objects. All the other * kmem caches can have both accounted and unaccounted objects. */ enum kmalloc_cache_type { KMALLOC_NORMAL = 0, #ifndef CONFIG_ZONE_DMA KMALLOC_DMA = KMALLOC_NORMAL, #endif #ifndef CONFIG_MEMCG KMALLOC_CGROUP = KMALLOC_NORMAL, #endif KMALLOC_RANDOM_START = KMALLOC_NORMAL, KMALLOC_RANDOM_END = KMALLOC_RANDOM_START + RANDOM_KMALLOC_CACHES_NR, #ifdef CONFIG_SLUB_TINY KMALLOC_RECLAIM = KMALLOC_NORMAL, #else KMALLOC_RECLAIM, #endif #ifdef CONFIG_ZONE_DMA KMALLOC_DMA, #endif #ifdef CONFIG_MEMCG KMALLOC_CGROUP, #endif NR_KMALLOC_TYPES }; typedef struct kmem_cache * kmem_buckets[KMALLOC_SHIFT_HIGH + 1]; extern kmem_buckets kmalloc_caches[NR_KMALLOC_TYPES]; /* * Define gfp bits that should not be set for KMALLOC_NORMAL. */ #define KMALLOC_NOT_NORMAL_BITS \ (__GFP_RECLAIMABLE | \ (IS_ENABLED(CONFIG_ZONE_DMA) ? __GFP_DMA : 0) | \ (IS_ENABLED(CONFIG_MEMCG) ? __GFP_ACCOUNT : 0)) extern unsigned long random_kmalloc_seed; static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags, unsigned long caller) { /* * The most common case is KMALLOC_NORMAL, so test for it * with a single branch for all the relevant flags. */ if (likely((flags & KMALLOC_NOT_NORMAL_BITS) == 0)) #ifdef CONFIG_RANDOM_KMALLOC_CACHES /* RANDOM_KMALLOC_CACHES_NR (=15) copies + the KMALLOC_NORMAL */ return KMALLOC_RANDOM_START + hash_64(caller ^ random_kmalloc_seed, ilog2(RANDOM_KMALLOC_CACHES_NR + 1)); #else return KMALLOC_NORMAL; #endif /* * At least one of the flags has to be set. 
Their priorities in * decreasing order are: * 1) __GFP_DMA * 2) __GFP_RECLAIMABLE * 3) __GFP_ACCOUNT */ if (IS_ENABLED(CONFIG_ZONE_DMA) && (flags & __GFP_DMA)) return KMALLOC_DMA; if (!IS_ENABLED(CONFIG_MEMCG) || (flags & __GFP_RECLAIMABLE)) return KMALLOC_RECLAIM; else return KMALLOC_CGROUP; } /* * Figure out which kmalloc slab an allocation of a certain size * belongs to. * 0 = zero alloc * 1 = 65 .. 96 bytes * 2 = 129 .. 192 bytes * n = 2^(n-1)+1 .. 2^n * * Note: __kmalloc_index() is compile-time optimized, and not runtime optimized; * typical usage is via kmalloc_index() and therefore evaluated at compile-time. * Callers where !size_is_constant should only be test modules, where runtime * overheads of __kmalloc_index() can be tolerated. Also see kmalloc_slab(). */ static __always_inline unsigned int __kmalloc_index(size_t size, bool size_is_constant) { if (!size) return 0; if (size <= KMALLOC_MIN_SIZE) return KMALLOC_SHIFT_LOW; if (KMALLOC_MIN_SIZE <= 32 && size > 64 && size <= 96) return 1; if (KMALLOC_MIN_SIZE <= 64 && size > 128 && size <= 192) return 2; if (size <= 8) return 3; if (size <= 16) return 4; if (size <= 32) return 5; if (size <= 64) return 6; if (size <= 128) return 7; if (size <= 256) return 8; if (size <= 512) return 9; if (size <= 1024) return 10; if (size <= 2 * 1024) return 11; if (size <= 4 * 1024) return 12; if (size <= 8 * 1024) return 13; if (size <= 16 * 1024) return 14; if (size <= 32 * 1024) return 15; if (size <= 64 * 1024) return 16; if (size <= 128 * 1024) return 17; if (size <= 256 * 1024) return 18; if (size <= 512 * 1024) return 19; if (size <= 1024 * 1024) return 20; if (size <= 2 * 1024 * 1024) return 21; if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && size_is_constant) BUILD_BUG_ON_MSG(1, "unexpected size in kmalloc_index()"); else BUG(); /* Will never be reached. Needed because the compiler may complain */ return -1; } static_assert(PAGE_SHIFT <= 20); #define kmalloc_index(s) __kmalloc_index(s, true) #include <linux/alloc_tag.h> /** * kmem_cache_alloc - Allocate an object * @cachep: The cache to allocate from. * @flags: See kmalloc(). * * Allocate an object from this cache. * See kmem_cache_zalloc() for a shortcut of adding __GFP_ZERO to flags. * * Return: pointer to the new object or %NULL in case of error */ void *kmem_cache_alloc_noprof(struct kmem_cache *cachep, gfp_t flags) __assume_slab_alignment __malloc; #define kmem_cache_alloc(...) alloc_hooks(kmem_cache_alloc_noprof(__VA_ARGS__)) void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru, gfp_t gfpflags) __assume_slab_alignment __malloc; #define kmem_cache_alloc_lru(...) alloc_hooks(kmem_cache_alloc_lru_noprof(__VA_ARGS__)) /** * kmem_cache_charge - memcg charge an already allocated slab memory * @objp: address of the slab object to memcg charge * @gfpflags: describe the allocation context * * kmem_cache_charge allows charging a slab object to the current memcg, * primarily in cases where charging at allocation time might not be possible * because the target memcg is not known (i.e. softirq context) * * The objp should be pointer returned by the slab allocator functions like * kmalloc (with __GFP_ACCOUNT in flags) or kmem_cache_alloc. The memcg charge * behavior can be controlled through gfpflags parameter, which affects how the * necessary internal metadata can be allocated. Including __GFP_NOFAIL denotes * that overcharging is requested instead of failure, but is not applied for the * internal metadata allocation. 
* * There are several cases where it will return true even if the charging was * not done: * More specifically: * * 1. For !CONFIG_MEMCG or cgroup_disable=memory systems. * 2. Already charged slab objects. * 3. For slab objects from KMALLOC_NORMAL caches - allocated by kmalloc() * without __GFP_ACCOUNT * 4. Allocating internal metadata has failed * * Return: true if charge was successful otherwise false. */ bool kmem_cache_charge(void *objp, gfp_t gfpflags); void kmem_cache_free(struct kmem_cache *s, void *objp); kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags, unsigned int useroffset, unsigned int usersize, void (*ctor)(void *)); /* * Bulk allocation and freeing operations. These are accelerated in an * allocator specific way to avoid taking locks repeatedly or building * metadata structures unnecessarily. * * Note that interrupts must be enabled when calling these functions. */ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p); int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, void **p); #define kmem_cache_alloc_bulk(...) alloc_hooks(kmem_cache_alloc_bulk_noprof(__VA_ARGS__)) static __always_inline void kfree_bulk(size_t size, void **p) { kmem_cache_free_bulk(NULL, size, p); } void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment __malloc; #define kmem_cache_alloc_node(...) alloc_hooks(kmem_cache_alloc_node_noprof(__VA_ARGS__)) struct slab_sheaf * kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size); int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp, struct slab_sheaf **sheafp, unsigned int size); void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp, struct slab_sheaf *sheaf); void *kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *cachep, gfp_t gfp, struct slab_sheaf *sheaf) __assume_slab_alignment __malloc; #define kmem_cache_alloc_from_sheaf(...) \ alloc_hooks(kmem_cache_alloc_from_sheaf_noprof(__VA_ARGS__)) unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf); /* * These macros allow declaring a kmem_buckets * parameter alongside size, which * can be compiled out with CONFIG_SLAB_BUCKETS=n so that a large number of call * sites don't have to pass NULL. */ #ifdef CONFIG_SLAB_BUCKETS #define DECL_BUCKET_PARAMS(_size, _b) size_t (_size), kmem_buckets *(_b) #define PASS_BUCKET_PARAMS(_size, _b) (_size), (_b) #define PASS_BUCKET_PARAM(_b) (_b) #else #define DECL_BUCKET_PARAMS(_size, _b) size_t (_size) #define PASS_BUCKET_PARAMS(_size, _b) (_size) #define PASS_BUCKET_PARAM(_b) NULL #endif /* * The following functions are not to be used directly and are intended only * for internal use from kmalloc() and kmalloc_node() * with the exception of kunit tests */ void *__kmalloc_noprof(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1); void *__kmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) __assume_kmalloc_alignment __alloc_size(1); void *__kmalloc_cache_noprof(struct kmem_cache *s, gfp_t flags, size_t size) __assume_kmalloc_alignment __alloc_size(3); void *__kmalloc_cache_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int node, size_t size) __assume_kmalloc_alignment __alloc_size(4); void *__kmalloc_large_noprof(size_t size, gfp_t flags) __assume_page_alignment __alloc_size(1); void *__kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) __assume_page_alignment __alloc_size(1); /** * kmalloc - allocate kernel memory * @size: how many bytes of memory are required. 
* @flags: describe the allocation context * * kmalloc is the normal method of allocating memory * for objects smaller than page size in the kernel. * * The allocated object address is aligned to at least ARCH_KMALLOC_MINALIGN * bytes. For @size of power of two bytes, the alignment is also guaranteed * to be at least to the size. For other sizes, the alignment is guaranteed to * be at least the largest power-of-two divisor of @size. * * The @flags argument may be one of the GFP flags defined at * include/linux/gfp_types.h and described at * :ref:`Documentation/core-api/mm-api.rst <mm-api-gfp-flags>` * * The recommended usage of the @flags is described at * :ref:`Documentation/core-api/memory-allocation.rst <memory_allocation>` * * Below is a brief outline of the most useful GFP flags * * %GFP_KERNEL * Allocate normal kernel ram. May sleep. * * %GFP_NOWAIT * Allocation will not sleep. * * %GFP_ATOMIC * Allocation will not sleep. May use emergency pools. * * Also it is possible to set different flags by OR'ing * in one or more of the following additional @flags: * * %__GFP_ZERO * Zero the allocated memory before returning. Also see kzalloc(). * * %__GFP_HIGH * This allocation has high priority and may use emergency pools. * * %__GFP_NOFAIL * Indicate that this allocation is in no way allowed to fail * (think twice before using). * * %__GFP_NORETRY * If memory is not immediately available, * then give up at once. * * %__GFP_NOWARN * If allocation fails, don't issue any warnings. * * %__GFP_RETRY_MAYFAIL * Try really hard to succeed the allocation but fail * eventually. */ static __always_inline __alloc_size(1) void *kmalloc_noprof(size_t size, gfp_t flags) { if (__builtin_constant_p(size) && size) { unsigned int index; if (size > KMALLOC_MAX_CACHE_SIZE) return __kmalloc_large_noprof(size, flags); index = kmalloc_index(size); return __kmalloc_cache_noprof( kmalloc_caches[kmalloc_type(flags, _RET_IP_)][index], flags, size); } return __kmalloc_noprof(size, flags); } #define kmalloc(...) alloc_hooks(kmalloc_noprof(__VA_ARGS__)) void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node); #define kmalloc_nolock(...) alloc_hooks(kmalloc_nolock_noprof(__VA_ARGS__)) /** * __alloc_objs - Allocate objects of a given type using * @KMALLOC: which size-based kmalloc wrapper to allocate with. * @GFP: GFP flags for the allocation. * @TYPE: type to allocate space for. * @COUNT: how many @TYPE objects to allocate. * * Returns: Newly allocated pointer to (first) @TYPE of @COUNT-many * allocated @TYPE objects, or NULL on failure. */ #define __alloc_objs(KMALLOC, GFP, TYPE, COUNT) \ ({ \ const size_t __obj_size = size_mul(sizeof(TYPE), COUNT); \ (TYPE *)KMALLOC(__obj_size, GFP); \ }) /** * __alloc_flex - Allocate an object that has a trailing flexible array * @KMALLOC: kmalloc wrapper function to use for allocation. * @GFP: GFP flags for the allocation. * @TYPE: type of structure to allocate space for. * @FAM: The name of the flexible array member of @TYPE structure. * @COUNT: how many @FAM elements to allocate space for. * * Returns: Newly allocated pointer to @TYPE with @COUNT-many trailing * @FAM elements, or NULL on failure or if @COUNT cannot be represented * by the member of @TYPE that counts the @FAM elements (annotated via * __counted_by()). 
*/ #define __alloc_flex(KMALLOC, GFP, TYPE, FAM, COUNT) \ ({ \ const size_t __count = (COUNT); \ const size_t __obj_size = struct_size_t(TYPE, FAM, __count); \ TYPE *__obj_ptr = KMALLOC(__obj_size, GFP); \ if (__obj_ptr) \ __set_flex_counter(__obj_ptr->FAM, __count); \ __obj_ptr; \ }) /** * kmalloc_obj - Allocate a single instance of the given type * @VAR_OR_TYPE: Variable or type to allocate. * @GFP: GFP flags for the allocation. * * Returns: newly allocated pointer to a @VAR_OR_TYPE on success, or NULL * on failure. */ #define kmalloc_obj(VAR_OR_TYPE, ...) \ __alloc_objs(kmalloc, default_gfp(__VA_ARGS__), typeof(VAR_OR_TYPE), 1) /** * kmalloc_objs - Allocate an array of the given type * @VAR_OR_TYPE: Variable or type to allocate an array of. * @COUNT: How many elements in the array. * @GFP: GFP flags for the allocation. * * Returns: newly allocated pointer to array of @VAR_OR_TYPE on success, * or NULL on failure. */ #define kmalloc_objs(VAR_OR_TYPE, COUNT, ...) \ __alloc_objs(kmalloc, default_gfp(__VA_ARGS__), typeof(VAR_OR_TYPE), COUNT) /** * kmalloc_flex - Allocate a single instance of the given flexible structure * @VAR_OR_TYPE: Variable or type to allocate (with its flex array). * @FAM: The name of the flexible array member of the structure. * @COUNT: How many flexible array member elements are desired. * @GFP: GFP flags for the allocation. * * Returns: newly allocated pointer to @VAR_OR_TYPE on success, NULL on * failure. If @FAM has been annotated with __counted_by(), the allocation * will immediately fail if @COUNT is larger than what the type of the * struct's counter variable can represent. */ #define kmalloc_flex(VAR_OR_TYPE, FAM, COUNT, ...) \ __alloc_flex(kmalloc, default_gfp(__VA_ARGS__), typeof(VAR_OR_TYPE), FAM, COUNT) /* All kzalloc aliases for kmalloc_(obj|objs|flex). */ #define kzalloc_obj(P, ...) \ __alloc_objs(kzalloc, default_gfp(__VA_ARGS__), typeof(P), 1) #define kzalloc_objs(P, COUNT, ...) \ __alloc_objs(kzalloc, default_gfp(__VA_ARGS__), typeof(P), COUNT) #define kzalloc_flex(P, FAM, COUNT, ...) \ __alloc_flex(kzalloc, default_gfp(__VA_ARGS__), typeof(P), FAM, COUNT) /* All kvmalloc aliases for kmalloc_(obj|objs|flex). */ #define kvmalloc_obj(P, ...) \ __alloc_objs(kvmalloc, default_gfp(__VA_ARGS__), typeof(P), 1) #define kvmalloc_objs(P, COUNT, ...) \ __alloc_objs(kvmalloc, default_gfp(__VA_ARGS__), typeof(P), COUNT) #define kvmalloc_flex(P, FAM, COUNT, ...) \ __alloc_flex(kvmalloc, default_gfp(__VA_ARGS__), typeof(P), FAM, COUNT) /* All kvzalloc aliases for kmalloc_(obj|objs|flex). */ #define kvzalloc_obj(P, ...) \ __alloc_objs(kvzalloc, default_gfp(__VA_ARGS__), typeof(P), 1) #define kvzalloc_objs(P, COUNT, ...) \ __alloc_objs(kvzalloc, default_gfp(__VA_ARGS__), typeof(P), COUNT) #define kvzalloc_flex(P, FAM, COUNT, ...) 
\ __alloc_flex(kvzalloc, default_gfp(__VA_ARGS__), typeof(P), FAM, COUNT) #define kmem_buckets_alloc(_b, _size, _flags) \ alloc_hooks(__kmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE)) #define kmem_buckets_alloc_track_caller(_b, _size, _flags) \ alloc_hooks(__kmalloc_node_track_caller_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE, _RET_IP_)) static __always_inline __alloc_size(1) void *kmalloc_node_noprof(size_t size, gfp_t flags, int node) { if (__builtin_constant_p(size) && size) { unsigned int index; if (size > KMALLOC_MAX_CACHE_SIZE) return __kmalloc_large_node_noprof(size, flags, node); index = kmalloc_index(size); return __kmalloc_cache_node_noprof( kmalloc_caches[kmalloc_type(flags, _RET_IP_)][index], flags, node, size); } return __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, NULL), flags, node); } #define kmalloc_node(...) alloc_hooks(kmalloc_node_noprof(__VA_ARGS__)) /** * kmalloc_array - allocate memory for an array. * @n: number of elements. * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ static inline __alloc_size(1, 2) void *kmalloc_array_noprof(size_t n, size_t size, gfp_t flags) { size_t bytes; if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; return kmalloc_noprof(bytes, flags); } #define kmalloc_array(...) alloc_hooks(kmalloc_array_noprof(__VA_ARGS__)) /** * krealloc_array - reallocate memory for an array. * @p: pointer to the memory chunk to reallocate * @new_n: new number of elements to alloc * @new_size: new size of a single member of the array * @flags: the type of memory to allocate (see kmalloc) * * If __GFP_ZERO logic is requested, callers must ensure that, starting with the * initial memory allocation, every subsequent call to this API for the same * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that * __GFP_ZERO is not fully honored by this API. * * See krealloc_noprof() for further details. * * In any case, the contents of the object pointed to are preserved up to the * lesser of the new and old sizes. */ static inline __realloc_size(2, 3) void * __must_check krealloc_array_noprof(void *p, size_t new_n, size_t new_size, gfp_t flags) { size_t bytes; if (unlikely(check_mul_overflow(new_n, new_size, &bytes))) return NULL; return krealloc_noprof(p, bytes, flags); } #define krealloc_array(...) alloc_hooks(krealloc_array_noprof(__VA_ARGS__)) /** * kcalloc - allocate memory for an array. The memory is set to zero. * @n: number of elements. * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ #define kcalloc(n, size, flags) kmalloc_array(n, size, (flags) | __GFP_ZERO) void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node, unsigned long caller) __alloc_size(1); #define kmalloc_node_track_caller_noprof(size, flags, node, caller) \ __kmalloc_node_track_caller_noprof(PASS_BUCKET_PARAMS(size, NULL), flags, node, caller) #define kmalloc_node_track_caller(...) \ alloc_hooks(kmalloc_node_track_caller_noprof(__VA_ARGS__, _RET_IP_)) /* * kmalloc_track_caller is a special version of kmalloc that records the * calling function of the routine calling it for slab leak tracking instead * of just the calling function (confusing, eh?). * It's useful when the call to kmalloc comes from a widely-used standard * allocator where we care about the real place the memory allocation * request comes from. */ #define kmalloc_track_caller(...) 
kmalloc_node_track_caller(__VA_ARGS__, NUMA_NO_NODE) #define kmalloc_track_caller_noprof(...) \ kmalloc_node_track_caller_noprof(__VA_ARGS__, NUMA_NO_NODE, _RET_IP_) static inline __alloc_size(1, 2) void *kmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node) { size_t bytes; if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; if (__builtin_constant_p(n) && __builtin_constant_p(size)) return kmalloc_node_noprof(bytes, flags, node); return __kmalloc_node_noprof(PASS_BUCKET_PARAMS(bytes, NULL), flags, node); } #define kmalloc_array_node(...) alloc_hooks(kmalloc_array_node_noprof(__VA_ARGS__)) #define kcalloc_node(_n, _size, _flags, _node) \ kmalloc_array_node(_n, _size, (_flags) | __GFP_ZERO, _node) /* * Shortcuts */ #define kmem_cache_zalloc(_k, _flags) kmem_cache_alloc(_k, (_flags)|__GFP_ZERO) /** * kzalloc - allocate memory. The memory is set to zero. * @size: how many bytes of memory are required. * @flags: the type of memory to allocate (see kmalloc). */ static inline __alloc_size(1) void *kzalloc_noprof(size_t size, gfp_t flags) { return kmalloc_noprof(size, flags | __GFP_ZERO); } #define kzalloc(...) alloc_hooks(kzalloc_noprof(__VA_ARGS__)) #define kzalloc_node(_size, _flags, _node) kmalloc_node(_size, (_flags)|__GFP_ZERO, _node) void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align, gfp_t flags, int node) __alloc_size(1); #define kvmalloc_node_align_noprof(_size, _align, _flags, _node) \ __kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, NULL), _align, _flags, _node) #define kvmalloc_node_align(...) \ alloc_hooks(kvmalloc_node_align_noprof(__VA_ARGS__)) #define kvmalloc_node(_s, _f, _n) kvmalloc_node_align(_s, 1, _f, _n) #define kvmalloc(...) kvmalloc_node(__VA_ARGS__, NUMA_NO_NODE) #define kvzalloc(_size, _flags) kvmalloc(_size, (_flags)|__GFP_ZERO) #define kvzalloc_node(_size, _flags, _node) kvmalloc_node(_size, (_flags)|__GFP_ZERO, _node) #define kmem_buckets_valloc(_b, _size, _flags) \ alloc_hooks(__kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), 1, _flags, NUMA_NO_NODE)) static inline __alloc_size(1, 2) void * kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node) { size_t bytes; if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; return kvmalloc_node_align_noprof(bytes, 1, flags, node); } #define kvmalloc_array_noprof(...) kvmalloc_array_node_noprof(__VA_ARGS__, NUMA_NO_NODE) #define kvcalloc_node_noprof(_n,_s,_f,_node) kvmalloc_array_node_noprof(_n,_s,(_f)|__GFP_ZERO,_node) #define kvcalloc_noprof(...) kvcalloc_node_noprof(__VA_ARGS__, NUMA_NO_NODE) #define kvmalloc_array(...) alloc_hooks(kvmalloc_array_noprof(__VA_ARGS__)) #define kvcalloc_node(...) alloc_hooks(kvcalloc_node_noprof(__VA_ARGS__)) #define kvcalloc(...) alloc_hooks(kvcalloc_noprof(__VA_ARGS__)) void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, gfp_t flags, int nid) __realloc_size(2); #define kvrealloc_node_align(...) \ alloc_hooks(kvrealloc_node_align_noprof(__VA_ARGS__)) #define kvrealloc_node(_p, _s, _f, _n) kvrealloc_node_align(_p, _s, 1, _f, _n) #define kvrealloc(...) 
kvrealloc_node(__VA_ARGS__, NUMA_NO_NODE) extern void kvfree(const void *addr); DEFINE_FREE(kvfree, void *, if (!IS_ERR_OR_NULL(_T)) kvfree(_T)) extern void kvfree_sensitive(const void *addr, size_t len); unsigned int kmem_cache_size(struct kmem_cache *s); #ifndef CONFIG_KVFREE_RCU_BATCHED static inline void kvfree_rcu_barrier(void) { rcu_barrier(); } static inline void kvfree_rcu_barrier_on_cache(struct kmem_cache *s) { rcu_barrier(); } static inline void kfree_rcu_scheduler_running(void) { } #else void kvfree_rcu_barrier(void); void kvfree_rcu_barrier_on_cache(struct kmem_cache *s); void kfree_rcu_scheduler_running(void); #endif /** * kmalloc_size_roundup - Report allocation bucket size for the given size * * @size: Number of bytes to round up from. * * This returns the number of bytes that would be available in a kmalloc() * allocation of @size bytes. For example, a 126 byte request would be * rounded up to the next sized kmalloc bucket, 128 bytes. (This is strictly * for the general-purpose kmalloc()-based allocations, and is not for the * pre-sized kmem_cache_alloc()-based allocations.) * * Use this to kmalloc() the full bucket size ahead of time instead of using * ksize() to query the size after an allocation. */ size_t kmalloc_size_roundup(size_t size); void __init kmem_cache_init_late(void); void __init kvfree_rcu_init(void); #endif /* _LINUX_SLAB_H */ |
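To make the kmalloc()/kzalloc()/kcalloc() relationships above concrete, here is a minimal, hypothetical usage sketch (struct demo_item, demo_alloc_table() and demo_free_table() are illustrative names, not part of the API):

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/types.h>

struct demo_item {
	u32 id;
	u64 payload;
};

static struct demo_item *demo_alloc_table(size_t count)
{
	struct demo_item *table;

	/*
	 * kcalloc() expands to kmalloc_array() with __GFP_ZERO: the
	 * count * sizeof(*table) multiplication is overflow-checked and
	 * the returned memory is zeroed.
	 */
	table = kcalloc(count, sizeof(*table), GFP_KERNEL);
	if (!table)
		return ERR_PTR(-ENOMEM);

	return table;
}

static void demo_free_table(struct demo_item *table)
{
	kfree(table);		/* kfree(NULL) is a no-op */
}

For buffers that are later filled to capacity, kmalloc_size_roundup() (declared above) can be used to request the full bucket size up front instead of querying ksize() after the allocation.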
| 13 14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 | // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/of.h> #include <linux/of_device.h> #include <linux/of_address.h> #include <linux/of_iommu.h> #include <linux/of_reserved_mem.h> #include <linux/dma-direct.h> /* for bus_dma_region */ #include <linux/dma-map-ops.h> #include <linux/init.h> #include <linux/mod_devicetable.h> #include <linux/slab.h> #include <linux/platform_device.h> #include <asm/errno.h> #include "of_private.h" /** * of_match_device - Tell if a struct device matches an of_device_id list * @matches: array of of_device_id match structures to search in * @dev: the OF device structure to match against * * Used by a driver to check whether an platform_device present in the * system is in its list of supported devices. */ const struct of_device_id *of_match_device(const struct of_device_id *matches, const struct device *dev) { if (!matches || !dev->of_node || dev->of_node_reused) return NULL; return of_match_node(matches, dev->of_node); } EXPORT_SYMBOL(of_match_device); static void of_dma_set_restricted_buffer(struct device *dev, struct device_node *np) { struct device_node *of_node = dev->of_node; struct of_phandle_iterator it; int rc, i = 0; if (!IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL)) return; /* * If dev->of_node doesn't exist or doesn't contain memory-region, try * the OF node having DMA configuration. */ if (!of_property_present(of_node, "memory-region")) of_node = np; of_for_each_phandle(&it, rc, of_node, "memory-region", NULL, 0) { /* * There might be multiple memory regions, but only one * restricted-dma-pool region is allowed. */ if (of_device_is_compatible(it.node, "restricted-dma-pool") && of_device_is_available(it.node)) { if (of_reserved_mem_device_init_by_idx(dev, of_node, i)) dev_warn(dev, "failed to initialise \"restricted-dma-pool\" memory node\n"); of_node_put(it.node); break; } i++; } } /** * of_dma_configure_id - Setup DMA configuration * @dev: Device to apply DMA configuration * @np: Pointer to OF node having DMA configuration * @force_dma: Whether device is to be set up by of_dma_configure() even if * DMA capability is not explicitly described by firmware. * @id: Optional const pointer value input id * * Try to get devices's DMA configuration from DT and update it * accordingly. 
* * If platform code needs to use its own special DMA configuration, it * can use a platform bus notifier and handle BUS_NOTIFY_ADD_DEVICE events * to fix up DMA configuration. */ int of_dma_configure_id(struct device *dev, struct device_node *np, bool force_dma, const u32 *id) { const struct bus_dma_region *map = NULL; struct device_node *bus_np; u64 mask, end = 0; bool coherent, set_map = false; int ret; if (dev->dma_range_map) { dev_dbg(dev, "dma_range_map already set\n"); goto skip_map; } if (np == dev->of_node) bus_np = __of_get_dma_parent(np); else bus_np = of_node_get(np); ret = of_dma_get_range(bus_np, &map); of_node_put(bus_np); if (ret < 0) { /* * For legacy reasons, we have to assume some devices need * DMA configuration regardless of whether "dma-ranges" is * correctly specified or not. */ if (!force_dma) return ret == -ENODEV ? 0 : ret; } else { /* Determine the overall bounds of all DMA regions */ end = dma_range_map_max(map); set_map = true; } skip_map: /* * If @dev is expected to be DMA-capable then the bus code that created * it should have initialised its dma_mask pointer by this point. For * now, we'll continue the legacy behaviour of coercing it to the * coherent mask if not, but we'll no longer do so quietly. */ if (!dev->dma_mask) { dev_warn(dev, "DMA mask not set\n"); dev->dma_mask = &dev->coherent_dma_mask; } if (!end && dev->coherent_dma_mask) end = dev->coherent_dma_mask; else if (!end) end = (1ULL << 32) - 1; /* * Limit coherent and dma mask based on size and default mask * set by the driver. */ mask = DMA_BIT_MASK(ilog2(end) + 1); dev->coherent_dma_mask &= mask; *dev->dma_mask &= mask; /* ...but only set bus limit and range map if we found valid dma-ranges earlier */ if (set_map) { dev->bus_dma_limit = end; dev->dma_range_map = map; } coherent = of_dma_is_coherent(np); dev_dbg(dev, "device is%sdma coherent\n", coherent ? " " : " not "); ret = of_iommu_configure(dev, np, id); if (ret == -EPROBE_DEFER) { /* Don't touch range map if it wasn't set from a valid dma-ranges */ if (set_map) dev->dma_range_map = NULL; kfree(map); return -EPROBE_DEFER; } /* Take all other IOMMU errors to mean we'll just carry on without it */ dev_dbg(dev, "device is%sbehind an iommu\n", !ret ? 
" " : " not "); arch_setup_dma_ops(dev, coherent); if (ret) of_dma_set_restricted_buffer(dev, np); return 0; } EXPORT_SYMBOL_GPL(of_dma_configure_id); const void *of_device_get_match_data(const struct device *dev) { const struct of_device_id *match; match = of_match_device(dev->driver->of_match_table, dev); if (!match) return NULL; return match->data; } EXPORT_SYMBOL(of_device_get_match_data); /** * of_device_modalias - Fill buffer with newline terminated modalias string * @dev: Calling device * @str: Modalias string * @len: Size of @str */ ssize_t of_device_modalias(struct device *dev, char *str, ssize_t len) { ssize_t sl; if (!dev || !dev->of_node || dev->of_node_reused) return -ENODEV; sl = of_modalias(dev->of_node, str, len - 2); if (sl < 0) return sl; if (sl > len - 2) return -ENOMEM; str[sl++] = '\n'; str[sl] = 0; return sl; } EXPORT_SYMBOL_GPL(of_device_modalias); /** * of_device_uevent - Display OF related uevent information * @dev: Device to display the uevent information for * @env: Kernel object's userspace event reference to fill up */ void of_device_uevent(const struct device *dev, struct kobj_uevent_env *env) { const char *compat, *type; struct alias_prop *app; struct property *p; int seen = 0; if ((!dev) || (!dev->of_node)) return; add_uevent_var(env, "OF_NAME=%pOFn", dev->of_node); add_uevent_var(env, "OF_FULLNAME=%pOF", dev->of_node); type = of_node_get_device_type(dev->of_node); if (type) add_uevent_var(env, "OF_TYPE=%s", type); /* Since the compatible field can contain pretty much anything * it's not really legal to split it out with commas. We split it * up using a number of environment variables instead. */ of_property_for_each_string(dev->of_node, "compatible", p, compat) { add_uevent_var(env, "OF_COMPATIBLE_%d=%s", seen, compat); seen++; } add_uevent_var(env, "OF_COMPATIBLE_N=%d", seen); seen = 0; mutex_lock(&of_mutex); list_for_each_entry(app, &aliases_lookup, link) { if (dev->of_node == app->np) { add_uevent_var(env, "OF_ALIAS_%d=%s", seen, app->alias); seen++; } } mutex_unlock(&of_mutex); } EXPORT_SYMBOL_GPL(of_device_uevent); int of_device_uevent_modalias(const struct device *dev, struct kobj_uevent_env *env) { int sl; if ((!dev) || (!dev->of_node) || dev->of_node_reused) return -ENODEV; /* Devicetree modalias is tricky, we add it in 2 steps */ if (add_uevent_var(env, "MODALIAS=")) return -ENOMEM; sl = of_modalias(dev->of_node, &env->buf[env->buflen-1], sizeof(env->buf) - env->buflen); if (sl < 0) return sl; if (sl >= (sizeof(env->buf) - env->buflen)) return -ENOMEM; env->buflen += sl; return 0; } EXPORT_SYMBOL_GPL(of_device_uevent_modalias); /** * of_device_make_bus_id - Use the device node data to assign a unique name * @dev: pointer to device structure that is linked to a device tree node * * This routine will first try using the translated bus address to * derive a unique name. If it cannot, then it will prepend names from * parent nodes until a unique name can be derived. */ void of_device_make_bus_id(struct device *dev) { struct device_node *node = dev->of_node; const __be32 *reg; u64 addr; u32 mask; /* Construct the name, using parent nodes if necessary to ensure uniqueness */ while (node->parent) { /* * If the address can be translated, then that is as much * uniqueness as we need. Make it the first component and return */ reg = of_get_property(node, "reg", NULL); if (reg && (addr = of_translate_address(node, reg)) != OF_BAD_ADDR) { if (!of_property_read_u32(node, "mask", &mask)) dev_set_name(dev, dev_name(dev) ? 
"%llx.%x.%pOFn:%s" : "%llx.%x.%pOFn", addr, ffs(mask) - 1, node, dev_name(dev)); else dev_set_name(dev, dev_name(dev) ? "%llx.%pOFn:%s" : "%llx.%pOFn", addr, node, dev_name(dev)); return; } /* format arguments only used if dev_name() resolves to NULL */ dev_set_name(dev, dev_name(dev) ? "%s:%s" : "%s", kbasename(node->full_name), dev_name(dev)); node = node->parent; } } EXPORT_SYMBOL_GPL(of_device_make_bus_id); |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_DELAY_H #define _LINUX_DELAY_H /* * Copyright (C) 1993 Linus Torvalds * * Delay routines, using a pre-computed "loops_per_jiffy" value. * Sleep routines using timer list timers or hrtimers. */ #include <linux/math.h> #include <linux/sched.h> #include <linux/jiffies.h> extern unsigned long loops_per_jiffy; #include <asm/delay.h> /* * Using udelay() for intervals greater than a few milliseconds can * risk overflow for high loops_per_jiffy (high bogomips) machines. The * mdelay() provides a wrapper to prevent this. For delays greater * than MAX_UDELAY_MS milliseconds, the wrapper is used. Architecture * specific values can be defined in asm-???/delay.h as an override. * The 2nd mdelay() definition ensures GCC will optimize away the * while loop for the common cases where n <= MAX_UDELAY_MS -- Paul G. */ #ifndef MAX_UDELAY_MS #define MAX_UDELAY_MS 5 #endif #ifndef mdelay /** * mdelay - Inserting a delay based on milliseconds with busy waiting * @n: requested delay in milliseconds * * See udelay() for basic information about mdelay() and it's variants. * * Please double check, whether mdelay() is the right way to go or whether a * refactoring of the code is the better variant to be able to use msleep() * instead. */ #define mdelay(n) (\ (__builtin_constant_p(n) && (n)<=MAX_UDELAY_MS) ? udelay((n)*1000) : \ ({unsigned long __ms=(n); while (__ms--) udelay(1000);})) #endif #ifndef ndelay static inline void ndelay(unsigned long x) { udelay(DIV_ROUND_UP(x, 1000)); } #define ndelay(x) ndelay(x) #endif extern unsigned long lpj_fine; void calibrate_delay(void); unsigned long calibrate_delay_is_known(void); void __attribute__((weak)) calibration_delay_done(void); void msleep(unsigned int msecs); unsigned long msleep_interruptible(unsigned int msecs); void usleep_range_state(unsigned long min, unsigned long max, unsigned int state); /** * usleep_range - Sleep for an approximate time * @min: Minimum time in microseconds to sleep * @max: Maximum time in microseconds to sleep * * For basic information please refer to usleep_range_state(). * * The task will be in the state TASK_UNINTERRUPTIBLE during the sleep. */ static inline void usleep_range(unsigned long min, unsigned long max) { usleep_range_state(min, max, TASK_UNINTERRUPTIBLE); } /** * usleep_range_idle - Sleep for an approximate time with idle time accounting * @min: Minimum time in microseconds to sleep * @max: Maximum time in microseconds to sleep * * For basic information please refer to usleep_range_state(). * * The sleeping task has the state TASK_IDLE during the sleep to prevent * contribution to the load average. */ static inline void usleep_range_idle(unsigned long min, unsigned long max) { usleep_range_state(min, max, TASK_IDLE); } /** * ssleep - wrapper for seconds around msleep * @seconds: Requested sleep duration in seconds * * Please refer to msleep() for detailed information. 
 */
static inline void ssleep(unsigned int seconds)
{
	msleep(seconds * 1000);
}

static const unsigned int max_slack_shift = 2;
#define USLEEP_RANGE_UPPER_BOUND ((TICK_NSEC << max_slack_shift) / NSEC_PER_USEC)

/**
 * fsleep - flexible sleep which autoselects the best mechanism
 * @usecs: requested sleep duration in microseconds
 *
 * fsleep() selects the best mechanism that will provide maximum 25% slack
 * to the requested sleep duration. Therefore it uses:
 *
 * * a udelay() loop for sleep durations <= 10 microseconds, to avoid hrtimer
 *   overhead for really short sleep durations.
 * * usleep_range() for sleep durations where msleep() would result in a
 *   slack larger than 25%. This depends on the granularity of jiffies.
 * * msleep() for all other sleep durations.
 *
 * Note: When %CONFIG_HIGH_RES_TIMERS is not set, all sleeps are processed with
 * the granularity of jiffies and the slack might exceed 25%, especially for
 * short sleep durations.
 */
static inline void fsleep(unsigned long usecs)
{
	if (usecs <= 10)
		udelay(usecs);
	else if (usecs < USLEEP_RANGE_UPPER_BOUND)
		usleep_range(usecs, usecs + (usecs >> max_slack_shift));
	else
		msleep(DIV_ROUND_UP(usecs, USEC_PER_MSEC));
}

#endif /* defined(_LINUX_DELAY_H) */
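A small, hypothetical helper illustrating which primitive the guidance above points to for different durations (demo_delays() exists only for this sketch; every call except udelay() requires sleepable context):

#include <linux/delay.h>

static void demo_delays(void)
{
	udelay(3);			/* < 10us busy-wait; also usable in atomic context */
	fsleep(150);			/* lets fsleep() pick usleep_range() for this range */
	usleep_range(500, 1000);	/* explicit min/max, hrtimer based */
	msleep(20);			/* long sleep, jiffies granularity is enough */
}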
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PGALLOC_TRACK_H
#define _LINUX_PGALLOC_TRACK_H

#if defined(CONFIG_MMU)
static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd,
				     unsigned long address,
				     pgtbl_mod_mask *mod_mask)
{
	if (unlikely(pgd_none(*pgd))) {
		if (__p4d_alloc(mm, pgd, address))
			return NULL;
		*mod_mask |= PGTBL_PGD_MODIFIED;
	}

	return p4d_offset(pgd, address);
}

static inline pud_t *pud_alloc_track(struct mm_struct *mm, p4d_t *p4d,
				     unsigned long address,
				     pgtbl_mod_mask *mod_mask)
{
	if (unlikely(p4d_none(*p4d))) {
		if (__pud_alloc(mm, p4d, address))
			return NULL;
		*mod_mask |= PGTBL_P4D_MODIFIED;
	}

	return pud_offset(p4d, address);
}

static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud,
				     unsigned long address,
				     pgtbl_mod_mask *mod_mask)
{
	if (unlikely(pud_none(*pud))) {
		if (__pmd_alloc(mm, pud, address))
			return NULL;
		*mod_mask |= PGTBL_PUD_MODIFIED;
	}

	return pmd_offset(pud, address);
}
#endif /* CONFIG_MMU */

#define pte_alloc_kernel_track(pmd, address, mask)			\
	((unlikely(pmd_none(*(pmd))) &&					\
	  (__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\
		NULL: pte_offset_kernel(pmd, address))

#endif /* _LINUX_PGALLOC_TRACK_H */
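These helpers are meant to be chained while populating a kernel page table, accumulating the modified levels in a pgtbl_mod_mask for a later flush. A simplified, hypothetical caller (error handling and the PTE loop are elided) might look like:

static int demo_populate(struct mm_struct *mm, pgd_t *pgd,
			 unsigned long addr, pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	p4d = p4d_alloc_track(mm, pgd, addr, mask);
	if (!p4d)
		return -ENOMEM;

	pud = pud_alloc_track(mm, p4d, addr, mask);
	if (!pud)
		return -ENOMEM;

	pmd = pmd_alloc_track(mm, pud, addr, mask);
	if (!pmd)
		return -ENOMEM;

	/* A real caller would now install PTEs and flush based on *mask. */
	return 0;
}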
| 14 10 4 14 12 29 20 12 13 12 13 1 12 13 14 13 14 13 26 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 | // SPDX-License-Identifier: GPL-2.0 /* * A fast, small, non-recursive O(n log n) sort for the Linux kernel * * This performs n*log2(n) + 0.37*n + o(n) comparisons on average, * and 1.5*n*log2(n) + O(n) in the (very contrived) worst case. * * Quicksort manages n*log2(n) - 1.26*n for random inputs (1.63*n * better) at the expense of stack usage and much larger code to avoid * quicksort's O(n^2) worst case. */ #include <linux/types.h> #include <linux/export.h> #include <linux/sort.h> /** * is_aligned - is this pointer & size okay for word-wide copying? * @base: pointer to data * @size: size of each element * @align: required alignment (typically 4 or 8) * * Returns true if elements can be copied using word loads and stores. * The size must be a multiple of the alignment, and the base address must * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. * * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)" * to "if ((a | b) & mask)", so we do that by hand. */ __attribute_const__ __always_inline static bool is_aligned(const void *base, size_t size, unsigned char align) { unsigned char lsbits = (unsigned char)size; (void)base; #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS lsbits |= (unsigned char)(uintptr_t)base; #endif return (lsbits & (align - 1)) == 0; } /** * swap_words_32 - swap two elements in 32-bit chunks * @a: pointer to the first element to swap * @b: pointer to the second element to swap * @n: element size (must be a multiple of 4) * * Exchange the two objects in memory. This exploits base+index addressing, * which basically all CPUs have, to minimize loop overhead computations. * * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the * bottom of the loop, even though the zero flag is still valid from the * subtract (since the intervening mov instructions don't alter the flags). * Gcc 8.1.0 doesn't have that problem. 
*/ static void swap_words_32(void *a, void *b, size_t n) { do { u32 t = *(u32 *)(a + (n -= 4)); *(u32 *)(a + n) = *(u32 *)(b + n); *(u32 *)(b + n) = t; } while (n); } /** * swap_words_64 - swap two elements in 64-bit chunks * @a: pointer to the first element to swap * @b: pointer to the second element to swap * @n: element size (must be a multiple of 8) * * Exchange the two objects in memory. This exploits base+index * addressing, which basically all CPUs have, to minimize loop overhead * computations. * * We'd like to use 64-bit loads if possible. If they're not, emulating * one requires base+index+4 addressing which x86 has but most other * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads, * but it's possible to have 64-bit loads without 64-bit pointers (e.g. * x32 ABI). Are there any cases the kernel needs to worry about? */ static void swap_words_64(void *a, void *b, size_t n) { do { #ifdef CONFIG_64BIT u64 t = *(u64 *)(a + (n -= 8)); *(u64 *)(a + n) = *(u64 *)(b + n); *(u64 *)(b + n) = t; #else /* Use two 32-bit transfers to avoid base+index+4 addressing */ u32 t = *(u32 *)(a + (n -= 4)); *(u32 *)(a + n) = *(u32 *)(b + n); *(u32 *)(b + n) = t; t = *(u32 *)(a + (n -= 4)); *(u32 *)(a + n) = *(u32 *)(b + n); *(u32 *)(b + n) = t; #endif } while (n); } /** * swap_bytes - swap two elements a byte at a time * @a: pointer to the first element to swap * @b: pointer to the second element to swap * @n: element size * * This is the fallback if alignment doesn't allow using larger chunks. */ static void swap_bytes(void *a, void *b, size_t n) { do { char t = ((char *)a)[--n]; ((char *)a)[n] = ((char *)b)[n]; ((char *)b)[n] = t; } while (n); } /* * The values are arbitrary as long as they can't be confused with * a pointer, but small integers make for the smallest compare * instructions. */ #define SWAP_WORDS_64 (swap_r_func_t)0 #define SWAP_WORDS_32 (swap_r_func_t)1 #define SWAP_BYTES (swap_r_func_t)2 #define SWAP_WRAPPER (swap_r_func_t)3 struct wrapper { cmp_func_t cmp; swap_func_t swap; }; /* * The function pointer is last to make tail calls most efficient if the * compiler decides not to inline this function. */ static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv) { if (swap_func == SWAP_WRAPPER) { ((const struct wrapper *)priv)->swap(a, b, (int)size); return; } if (swap_func == SWAP_WORDS_64) swap_words_64(a, b, size); else if (swap_func == SWAP_WORDS_32) swap_words_32(a, b, size); else if (swap_func == SWAP_BYTES) swap_bytes(a, b, size); else swap_func(a, b, (int)size, priv); } #define _CMP_WRAPPER ((cmp_r_func_t)0L) static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv) { if (cmp == _CMP_WRAPPER) return ((const struct wrapper *)priv)->cmp(a, b); return cmp(a, b, priv); } /** * parent - given the offset of the child, find the offset of the parent. * @i: the offset of the heap element whose parent is sought. Non-zero. * @lsbit: a precomputed 1-bit mask, equal to "size & -size" * @size: size of each element * * In terms of array indexes, the parent of element j = @i/@size is simply * (j-1)/2. But when working in byte offsets, we can't use implicit * truncation of integer divides. * * Fortunately, we only need one bit of the quotient, not the full divide. * @size has a least significant bit. That bit will be clear if @i is * an even multiple of @size, and set if it's an odd multiple. 
* * Logically, we're doing "if (i & lsbit) i -= size;", but since the * branch is unpredictable, it's done with a bit of clever branch-free * code instead. */ __attribute_const__ __always_inline static size_t parent(size_t i, unsigned int lsbit, size_t size) { i -= size; i -= size & -(i & lsbit); return i / 2; } #include <linux/sched.h> static void __sort_r(void *base, size_t num, size_t size, cmp_r_func_t cmp_func, swap_r_func_t swap_func, const void *priv, bool may_schedule) { /* pre-scale counters for performance */ size_t n = num * size, a = (num/2) * size; const unsigned int lsbit = size & -size; /* Used to find parent */ size_t shift = 0; if (!a) /* num < 2 || size == 0 */ return; /* called from 'sort' without swap function, let's pick the default */ if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap) swap_func = NULL; if (!swap_func) { if (is_aligned(base, size, 8)) swap_func = SWAP_WORDS_64; else if (is_aligned(base, size, 4)) swap_func = SWAP_WORDS_32; else swap_func = SWAP_BYTES; } /* * Loop invariants: * 1. elements [a,n) satisfy the heap property (compare greater than * all of their children), * 2. elements [n,num*size) are sorted, and * 3. a <= b <= c <= d <= n (whenever they are valid). */ for (;;) { size_t b, c, d; if (a) /* Building heap: sift down a */ a -= size << shift; else if (n > 3 * size) { /* Sorting: Extract two largest elements */ n -= size; do_swap(base, base + n, size, swap_func, priv); shift = do_cmp(base + size, base + 2 * size, cmp_func, priv) <= 0; a = size << shift; n -= size; do_swap(base + a, base + n, size, swap_func, priv); } else { /* Sort complete */ break; } /* * Sift element at "a" down into heap. This is the * "bottom-up" variant, which significantly reduces * calls to cmp_func(): we find the sift-down path all * the way to the leaves (one compare per level), then * backtrack to find where to insert the target element. * * Because elements tend to sift down close to the leaves, * this uses fewer compares than doing two per level * on the way down. (A bit more than half as many on * average, 3/4 worst-case.) */ for (b = a; c = 2*b + size, (d = c + size) < n;) b = do_cmp(base + c, base + d, cmp_func, priv) > 0 ? c : d; if (d == n) /* Special case last leaf with no sibling */ b = c; /* Now backtrack from "b" to the correct location for "a" */ while (b != a && do_cmp(base + a, base + b, cmp_func, priv) >= 0) b = parent(b, lsbit, size); c = b; /* Where "a" belongs */ while (b != a) { /* Shift it into place */ b = parent(b, lsbit, size); do_swap(base + b, base + c, size, swap_func, priv); } if (may_schedule) cond_resched(); } n -= size; do_swap(base, base + n, size, swap_func, priv); if (n == size * 2 && do_cmp(base, base + size, cmp_func, priv) > 0) do_swap(base, base + size, size, swap_func, priv); } /** * sort_r - sort an array of elements * @base: pointer to data to sort * @num: number of elements * @size: size of each element * @cmp_func: pointer to comparison function * @swap_func: pointer to swap function or NULL * @priv: third argument passed to comparison function * * This function does a heapsort on the given array. You may provide * a swap_func function if you need to do something more than a memory * copy (e.g. fix up pointers or auxiliary data), but the built-in swap * avoids a slow retpoline and so is significantly faster. * * The comparison function must adhere to specific mathematical * properties to ensure correct and stable sorting: * - Antisymmetry: cmp_func(a, b) must return the opposite sign of * cmp_func(b, a). 
* - Transitivity: if cmp_func(a, b) <= 0 and cmp_func(b, c) <= 0, then * cmp_func(a, c) <= 0. * * Sorting time is O(n log n) both on average and worst-case. While * quicksort is slightly faster on average, it suffers from exploitable * O(n*n) worst-case behavior and extra memory requirements that make * it less suitable for kernel use. */ void sort_r(void *base, size_t num, size_t size, cmp_r_func_t cmp_func, swap_r_func_t swap_func, const void *priv) { __sort_r(base, num, size, cmp_func, swap_func, priv, false); } EXPORT_SYMBOL(sort_r); /** * sort_r_nonatomic - sort an array of elements, with cond_resched * @base: pointer to data to sort * @num: number of elements * @size: size of each element * @cmp_func: pointer to comparison function * @swap_func: pointer to swap function or NULL * @priv: third argument passed to comparison function * * Same as sort_r, but preferred for larger arrays as it does a periodic * cond_resched(). */ void sort_r_nonatomic(void *base, size_t num, size_t size, cmp_r_func_t cmp_func, swap_r_func_t swap_func, const void *priv) { __sort_r(base, num, size, cmp_func, swap_func, priv, true); } EXPORT_SYMBOL(sort_r_nonatomic); void sort(void *base, size_t num, size_t size, cmp_func_t cmp_func, swap_func_t swap_func) { struct wrapper w = { .cmp = cmp_func, .swap = swap_func, }; return __sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w, false); } EXPORT_SYMBOL(sort); void sort_nonatomic(void *base, size_t num, size_t size, cmp_func_t cmp_func, swap_func_t swap_func) { struct wrapper w = { .cmp = cmp_func, .swap = swap_func, }; return __sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w, true); } EXPORT_SYMBOL(sort_nonatomic); |
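A minimal usage sketch for sort() with a comparison callback that respects the antisymmetry and transitivity rules spelled out above (the demo_* names are illustrative):

#include <linux/sort.h>
#include <linux/types.h>

static int demo_cmp_u32(const void *a, const void *b)
{
	u32 x = *(const u32 *)a;
	u32 y = *(const u32 *)b;

	/* Avoid "x - y": the subtraction can wrap and break antisymmetry. */
	if (x < y)
		return -1;
	if (x > y)
		return 1;
	return 0;
}

static void demo_sort(u32 *vals, size_t count)
{
	/* NULL swap_func: the built-in word-wise swap above is used. */
	sort(vals, count, sizeof(*vals), demo_cmp_u32, NULL);
}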
| 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Functions for initializing, allocating, freeing and duplicating VMAs. Shared * between CONFIG_MMU and non-CONFIG_MMU kernel configurations. */ #include "vma_internal.h" #include "vma.h" /* SLAB cache for vm_area_struct structures */ static struct kmem_cache *vm_area_cachep; void __init vma_state_init(void) { struct kmem_cache_args args = { .use_freeptr_offset = true, .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr), .sheaf_capacity = 32, }; vm_area_cachep = kmem_cache_create("vm_area_struct", sizeof(struct vm_area_struct), &args, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| SLAB_ACCOUNT); } struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) { struct vm_area_struct *vma; vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!vma) return NULL; vma_init(vma, mm); return vma; } static void vm_area_init_from(const struct vm_area_struct *src, struct vm_area_struct *dest) { dest->vm_mm = src->vm_mm; dest->vm_ops = src->vm_ops; dest->vm_start = src->vm_start; dest->vm_end = src->vm_end; dest->anon_vma = src->anon_vma; dest->vm_pgoff = src->vm_pgoff; dest->vm_file = src->vm_file; dest->vm_private_data = src->vm_private_data; vm_flags_init(dest, src->vm_flags); memcpy(&dest->vm_page_prot, &src->vm_page_prot, sizeof(dest->vm_page_prot)); /* * src->shared.rb may be modified concurrently when called from * dup_mmap(), but the clone will reinitialize it. */ data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared))); memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx, sizeof(dest->vm_userfaultfd_ctx)); #ifdef CONFIG_ANON_VMA_NAME dest->anon_name = src->anon_name; #endif #ifdef CONFIG_SWAP memcpy(&dest->swap_readahead_info, &src->swap_readahead_info, sizeof(dest->swap_readahead_info)); #endif #ifndef CONFIG_MMU dest->vm_region = src->vm_region; #endif #ifdef CONFIG_NUMA dest->vm_policy = src->vm_policy; #endif #ifdef __HAVE_PFNMAP_TRACKING dest->pfnmap_track_ctx = NULL; #endif } #ifdef __HAVE_PFNMAP_TRACKING static inline int vma_pfnmap_track_ctx_dup(struct vm_area_struct *orig, struct vm_area_struct *new) { struct pfnmap_track_ctx *ctx = orig->pfnmap_track_ctx; if (likely(!ctx)) return 0; /* * We don't expect to ever hit this. If ever required, we would have * to duplicate the tracking. 
*/ if (unlikely(kref_read(&ctx->kref) >= REFCOUNT_MAX)) return -ENOMEM; kref_get(&ctx->kref); new->pfnmap_track_ctx = ctx; return 0; } static inline void vma_pfnmap_track_ctx_release(struct vm_area_struct *vma) { struct pfnmap_track_ctx *ctx = vma->pfnmap_track_ctx; if (likely(!ctx)) return; kref_put(&ctx->kref, pfnmap_track_ctx_release); vma->pfnmap_track_ctx = NULL; } #else static inline int vma_pfnmap_track_ctx_dup(struct vm_area_struct *orig, struct vm_area_struct *new) { return 0; } static inline void vma_pfnmap_track_ctx_release(struct vm_area_struct *vma) { } #endif struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) { struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!new) return NULL; ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); ASSERT_EXCLUSIVE_WRITER(orig->vm_file); vm_area_init_from(orig, new); if (vma_pfnmap_track_ctx_dup(orig, new)) { kmem_cache_free(vm_area_cachep, new); return NULL; } vma_lock_init(new, true); INIT_LIST_HEAD(&new->anon_vma_chain); vma_numab_state_init(new); dup_anon_vma_name(orig, new); return new; } void vm_area_free(struct vm_area_struct *vma) { /* The vma should be detached while being destroyed. */ vma_assert_detached(vma); vma_numab_state_free(vma); free_anon_vma_name(vma); vma_pfnmap_track_ctx_release(vma); kmem_cache_free(vm_area_cachep, vma); } |
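The cache-creation pattern used for vm_area_struct above can be mirrored by other subsystems. Below is a hypothetical sketch (struct demo_node and its cache are made-up names); as in the vm_area_struct case, the out-of-line free pointer is paired with SLAB_TYPESAFE_BY_RCU so the freelist pointer never overlaps live object data:

#include <linux/init.h>
#include <linux/slab.h>
#include <linux/stddef.h>

struct demo_node {
	unsigned long key;
	void *payload;
	freeptr_t free;			/* slot for the out-of-line free pointer */
};

static struct kmem_cache *demo_cachep;

static int __init demo_cache_init(void)
{
	struct kmem_cache_args args = {
		.use_freeptr_offset	= true,
		.freeptr_offset		= offsetof(struct demo_node, free),
	};

	demo_cachep = kmem_cache_create("demo_node", sizeof(struct demo_node),
					&args,
					SLAB_HWCACHE_ALIGN | SLAB_TYPESAFE_BY_RCU |
					SLAB_ACCOUNT);
	return demo_cachep ? 0 : -ENOMEM;
}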
| 1 35 13 35 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 | /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _LINUX_FILE_REF_H #define _LINUX_FILE_REF_H #include <linux/atomic.h> #include <linux/preempt.h> #include <linux/types.h> /* * file_ref is a reference count implementation specifically for use by * files. It takes inspiration from rcuref but differs in key aspects * such as support for SLAB_TYPESAFE_BY_RCU type caches. * * FILE_REF_ONEREF FILE_REF_MAXREF * 0x0000000000000000UL 0x7FFFFFFFFFFFFFFFUL * <-------------------valid -------------------> * * FILE_REF_SATURATED * 0x8000000000000000UL 0xA000000000000000UL 0xBFFFFFFFFFFFFFFFUL * <-----------------------saturation zone----------------------> * * FILE_REF_RELEASED FILE_REF_DEAD * 0xC000000000000000UL 0xE000000000000000UL * <-------------------dead zone-------------------> * * FILE_REF_NOREF * 0xFFFFFFFFFFFFFFFFUL */ #ifdef CONFIG_64BIT #define FILE_REF_ONEREF 0x0000000000000000UL #define FILE_REF_MAXREF 0x7FFFFFFFFFFFFFFFUL #define FILE_REF_SATURATED 0xA000000000000000UL #define FILE_REF_RELEASED 0xC000000000000000UL #define FILE_REF_DEAD 0xE000000000000000UL #define FILE_REF_NOREF 0xFFFFFFFFFFFFFFFFUL #else #define FILE_REF_ONEREF 0x00000000U #define FILE_REF_MAXREF 0x7FFFFFFFU #define FILE_REF_SATURATED 0xA0000000U #define FILE_REF_RELEASED 0xC0000000U #define FILE_REF_DEAD 0xE0000000U #define FILE_REF_NOREF 0xFFFFFFFFU #endif typedef struct { #ifdef CONFIG_64BIT atomic64_t refcnt; #else atomic_t refcnt; #endif } file_ref_t; /** * file_ref_init - Initialize a file reference count * @ref: Pointer to the reference count * @cnt: The initial reference count typically '1' */ static inline void file_ref_init(file_ref_t *ref, unsigned long cnt) { atomic_long_set(&ref->refcnt, cnt - 1); } bool __file_ref_put(file_ref_t *ref, unsigned long cnt); /** * file_ref_get - Acquire one reference on a file * @ref: Pointer to the reference count * * Similar to atomic_inc_not_zero() but saturates at FILE_REF_MAXREF. * * Provides full memory ordering. * * Return: False if the attempt to acquire a reference failed. This happens * when the last reference has been put already. True if a reference * was successfully acquired */ static __always_inline __must_check bool file_ref_get(file_ref_t *ref) { /* * Unconditionally increase the reference count with full * ordering. The saturation and dead zones provide enough * tolerance for this. * * If this indicates negative the file in question the fail can * be freed and immediately reused due to SLAB_TYPSAFE_BY_RCU. * Hence, unconditionally altering the file reference count to * e.g., reset the file reference count back to the middle of * the deadzone risk end up marking someone else's file as dead * behind their back. 
* * It would be possible to do a careful: * * cnt = atomic_long_inc_return(); * if (likely(cnt >= 0)) * return true; * * and then something like: * * if (cnt >= FILE_REF_RELEASE) * atomic_long_try_cmpxchg(&ref->refcnt, &cnt, FILE_REF_DEAD), * * to set the value back to the middle of the deadzone. But it's * practically impossible to go from FILE_REF_DEAD to * FILE_REF_ONEREF. It would need 2305843009213693952/2^61 * file_ref_get()s to resurrect such a dead file. */ return !atomic_long_add_negative(1, &ref->refcnt); } /** * file_ref_inc - Acquire one reference on a file * @ref: Pointer to the reference count * * Acquire an additional reference on a file. Warns if the caller didn't * already hold a reference. */ static __always_inline void file_ref_inc(file_ref_t *ref) { long prior = atomic_long_fetch_inc_relaxed(&ref->refcnt); WARN_ONCE(prior < 0, "file_ref_inc() on a released file reference"); } /** * file_ref_put -- Release a file reference * @ref: Pointer to the reference count * * Provides release memory ordering, such that prior loads and stores * are done before, and provides an acquire ordering on success such * that free() must come after. * * Return: True if this was the last reference with no future references * possible. This signals the caller that it can safely release * the object which is protected by the reference counter. * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * release the protected object. */ static __always_inline __must_check bool file_ref_put(file_ref_t *ref) { long cnt; /* * While files are SLAB_TYPESAFE_BY_RCU and thus file_ref_put() * calls don't risk UAFs when a file is recyclyed, it is still * vulnerable to UAFs caused by freeing the whole slab page once * it becomes unused. Prevent file_ref_put() from being * preempted protects against this. */ guard(preempt)(); /* * Unconditionally decrease the reference count. The saturation * and dead zones provide enough tolerance for this. If this * fails then we need to handle the last reference drop and * cases inside the saturation and dead zones. */ cnt = atomic_long_dec_return(&ref->refcnt); if (cnt >= 0) return false; return __file_ref_put(ref, cnt); } /** * file_ref_put_close - drop a reference expecting it would transition to FILE_REF_NOREF * @ref: Pointer to the reference count * * Semantically it is equivalent to calling file_ref_put(), but it trades lower * performance in face of other CPUs also modifying the refcount for higher * performance when this happens to be the last reference. * * For the last reference file_ref_put() issues 2 atomics. One to drop the * reference and another to transition it to FILE_REF_DEAD. This routine does * the work in one step, but in order to do it has to pre-read the variable which * decreases scalability. * * Use with close() et al, stick to file_ref_put() by default. */ static __always_inline __must_check bool file_ref_put_close(file_ref_t *ref) { long old; old = atomic_long_read(&ref->refcnt); if (likely(old == FILE_REF_ONEREF)) { if (likely(atomic_long_try_cmpxchg(&ref->refcnt, &old, FILE_REF_DEAD))) return true; } return file_ref_put(ref); } /** * file_ref_read - Read the number of file references * @ref: Pointer to the reference count * * Return: The number of held references (0 ... N) */ static inline unsigned long file_ref_read(file_ref_t *ref) { unsigned long c = atomic_long_read(&ref->refcnt); /* Return 0 if within the DEAD zone. */ return c >= FILE_REF_RELEASED ? 
		0 : c + 1;
}

/*
 * __file_ref_read_raw - Return the value stored in ref->refcnt
 * @ref: Pointer to the reference count
 *
 * Return: The raw value found in the counter
 *
 * A hack for file_needs_f_pos_lock(); you probably want to use
 * file_ref_read() instead.
 */
static inline unsigned long __file_ref_read_raw(file_ref_t *ref)
{
	return atomic_long_read(&ref->refcnt);
}

#endif
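A lifecycle sketch for the API above. Note that the header is written specifically for struct file and its SLAB_TYPESAFE_BY_RCU cache; the hypothetical demo_obj below only illustrates the call pattern:

#include <linux/file_ref.h>
#include <linux/slab.h>

struct demo_obj {
	file_ref_t ref;
	int payload;
};

static struct demo_obj *demo_obj_new(void)
{
	struct demo_obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

	if (o)
		file_ref_init(&o->ref, 1);	/* start with one reference */
	return o;
}

static bool demo_obj_tryget(struct demo_obj *o)
{
	return file_ref_get(&o->ref);		/* fails once the last ref is gone */
}

static void demo_obj_put(struct demo_obj *o)
{
	if (file_ref_put(&o->ref))		/* true only for the final put */
		kfree(o);
}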
| 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 | // SPDX-License-Identifier: GPL-2.0-only /* * 32bit compatibility wrappers for the input subsystem. * * Very heavily based on evdev.c - Copyright (c) 1999-2002 Vojtech Pavlik */ #include <linux/export.h> #include <linux/sprintf.h> #include <linux/uaccess.h> #include "input-compat.h" #ifdef CONFIG_COMPAT int input_event_from_user(const char __user *buffer, struct input_event *event) { if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { struct input_event_compat compat_event; if (copy_from_user(&compat_event, buffer, sizeof(struct input_event_compat))) return -EFAULT; event->input_event_sec = compat_event.sec; event->input_event_usec = compat_event.usec; event->type = compat_event.type; event->code = compat_event.code; event->value = compat_event.value; } else { if (copy_from_user(event, buffer, sizeof(struct input_event))) return -EFAULT; } return 0; } int input_event_to_user(char __user *buffer, const struct input_event *event) { if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { struct input_event_compat compat_event; compat_event.sec = event->input_event_sec; compat_event.usec = event->input_event_usec; compat_event.type = event->type; compat_event.code = event->code; compat_event.value = event->value; if (copy_to_user(buffer, &compat_event, sizeof(struct input_event_compat))) return -EFAULT; } else { if (copy_to_user(buffer, event, sizeof(struct input_event))) return -EFAULT; } return 0; } int input_ff_effect_from_user(const char __user *buffer, size_t size, struct ff_effect *effect) { if (in_compat_syscall()) { struct ff_effect_compat *compat_effect; if (size != sizeof(struct ff_effect_compat)) return -EINVAL; /* * It so happens that the pointer which needs to be changed * is the last field in the structure, so we can retrieve the * whole thing and replace just the pointer. 
*/ compat_effect = (struct ff_effect_compat *)effect; if (copy_from_user(compat_effect, buffer, sizeof(struct ff_effect_compat))) return -EFAULT; if (compat_effect->type == FF_PERIODIC && compat_effect->u.periodic.waveform == FF_CUSTOM) effect->u.periodic.custom_data = compat_ptr(compat_effect->u.periodic.custom_data); } else { if (size != sizeof(struct ff_effect)) return -EINVAL; if (copy_from_user(effect, buffer, sizeof(struct ff_effect))) return -EFAULT; } return 0; } int input_bits_to_string(char *buf, int buf_size, unsigned long bits, bool skip_empty) { int len = 0; if (in_compat_syscall()) { u32 dword = bits >> 32; if (dword || !skip_empty) len += snprintf(buf, buf_size, "%x ", dword); dword = bits & 0xffffffffUL; if (dword || !skip_empty || len) len += snprintf(buf + len, max(buf_size - len, 0), "%x", dword); } else { if (bits || !skip_empty) len += snprintf(buf, buf_size, "%lx", bits); } return len; } #else int input_event_from_user(const char __user *buffer, struct input_event *event) { if (copy_from_user(event, buffer, sizeof(struct input_event))) return -EFAULT; return 0; } int input_event_to_user(char __user *buffer, const struct input_event *event) { if (copy_to_user(buffer, event, sizeof(struct input_event))) return -EFAULT; return 0; } int input_ff_effect_from_user(const char __user *buffer, size_t size, struct ff_effect *effect) { if (size != sizeof(struct ff_effect)) return -EINVAL; if (copy_from_user(effect, buffer, sizeof(struct ff_effect))) return -EFAULT; return 0; } int input_bits_to_string(char *buf, int buf_size, unsigned long bits, bool skip_empty) { return bits || !skip_empty ? snprintf(buf, buf_size, "%lx", bits) : 0; } #endif /* CONFIG_COMPAT */ EXPORT_SYMBOL_GPL(input_event_from_user); EXPORT_SYMBOL_GPL(input_event_to_user); EXPORT_SYMBOL_GPL(input_ff_effect_from_user); |
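A sketch of how an input client's read() path would typically hand a single event to userspace using the helpers above. demo_read_one() is hypothetical, and input_event_size() is assumed to come from the companion input-compat.h header, returning the compat or native event size for the calling task:

#include <linux/input.h>
#include <linux/uaccess.h>

#include "input-compat.h"

static ssize_t demo_read_one(char __user *buffer, size_t count,
			     const struct input_event *ev)
{
	if (count < input_event_size())
		return -EINVAL;

	/* Picks the compat or native layout for the calling task. */
	if (input_event_to_user(buffer, ev))
		return -EFAULT;

	return input_event_size();
}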
| 22 23 23 14 14 12 13 13 14 13 12 14 4 3 3 2 2 3 4 4 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/bitmap.h> #include <linux/bug.h> #include <linux/export.h> #include <linux/idr.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/xarray.h> /** * idr_alloc_u32() - Allocate an ID. * @idr: IDR handle. * @ptr: Pointer to be associated with the new ID. * @nextid: Pointer to an ID. * @max: The maximum ID to allocate (inclusive). * @gfp: Memory allocation flags. * * Allocates an unused ID in the range specified by @nextid and @max. * Note that @max is inclusive whereas the @end parameter to idr_alloc() * is exclusive. The new ID is assigned to @nextid before the pointer * is inserted into the IDR, so if @nextid points into the object pointed * to by @ptr, a concurrent lookup will not find an uninitialised ID. * * The caller should provide their own locking to ensure that two * concurrent modifications to the IDR are not possible. 
Read-only * accesses to the IDR may be done under the RCU read lock or may * exclude simultaneous writers. * * Return: 0 if an ID was allocated, -ENOMEM if memory allocation failed, * or -ENOSPC if no free IDs could be found. If an error occurred, * @nextid is unchanged. */ int idr_alloc_u32(struct idr *idr, void *ptr, u32 *nextid, unsigned long max, gfp_t gfp) { struct radix_tree_iter iter; void __rcu **slot; unsigned int base = idr->idr_base; unsigned int id = *nextid; if (WARN_ON_ONCE(!(idr->idr_rt.xa_flags & ROOT_IS_IDR))) idr->idr_rt.xa_flags |= IDR_RT_MARKER; if (max < base) return -ENOSPC; id = (id < base) ? 0 : id - base; radix_tree_iter_init(&iter, id); slot = idr_get_free(&idr->idr_rt, &iter, gfp, max - base); if (IS_ERR(slot)) return PTR_ERR(slot); *nextid = iter.index + base; /* there is a memory barrier inside radix_tree_iter_replace() */ radix_tree_iter_replace(&idr->idr_rt, &iter, slot, ptr); radix_tree_iter_tag_clear(&idr->idr_rt, &iter, IDR_FREE); return 0; } EXPORT_SYMBOL_GPL(idr_alloc_u32); /** * idr_alloc() - Allocate an ID. * @idr: IDR handle. * @ptr: Pointer to be associated with the new ID. * @start: The minimum ID (inclusive). * @end: The maximum ID (exclusive). * @gfp: Memory allocation flags. * * Allocates an unused ID in the range specified by @start and @end. If * @end is <= 0, it is treated as one larger than %INT_MAX. This allows * callers to use @start + N as @end as long as N is within integer range. * * The caller should provide their own locking to ensure that two * concurrent modifications to the IDR are not possible. Read-only * accesses to the IDR may be done under the RCU read lock or may * exclude simultaneous writers. * * Return: The newly allocated ID, -ENOMEM if memory allocation failed, * or -ENOSPC if no free IDs could be found. */ int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp) { u32 id = start; int ret; if (WARN_ON_ONCE(start < 0)) return -EINVAL; ret = idr_alloc_u32(idr, ptr, &id, end > 0 ? end - 1 : INT_MAX, gfp); if (ret) return ret; return id; } EXPORT_SYMBOL_GPL(idr_alloc); /** * idr_alloc_cyclic() - Allocate an ID cyclically. * @idr: IDR handle. * @ptr: Pointer to be associated with the new ID. * @start: The minimum ID (inclusive). * @end: The maximum ID (exclusive). * @gfp: Memory allocation flags. * * Allocates an unused ID in the range specified by @start and @end. If * @end is <= 0, it is treated as one larger than %INT_MAX. This allows * callers to use @start + N as @end as long as N is within integer range. * The search for an unused ID will start at the last ID allocated and will * wrap around to @start if no free IDs are found before reaching @end. * * The caller should provide their own locking to ensure that two * concurrent modifications to the IDR are not possible. Read-only * accesses to the IDR may be done under the RCU read lock or may * exclude simultaneous writers. * * Return: The newly allocated ID, -ENOMEM if memory allocation failed, * or -ENOSPC if no free IDs could be found. */ int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, gfp_t gfp) { u32 id = idr->idr_next; int err, max = end > 0 ? end - 1 : INT_MAX; if ((int)id < start) id = start; err = idr_alloc_u32(idr, ptr, &id, max, gfp); if ((err == -ENOSPC) && (id > start)) { id = start; err = idr_alloc_u32(idr, ptr, &id, max, gfp); } if (err) return err; idr->idr_next = id + 1; return id; } EXPORT_SYMBOL(idr_alloc_cyclic); /** * idr_remove() - Remove an ID from the IDR. * @idr: IDR handle. * @id: Pointer ID. 
* * Removes this ID from the IDR. If the ID was not previously in the IDR, * this function returns %NULL. * * Since this function modifies the IDR, the caller should provide their * own locking to ensure that concurrent modification of the same IDR is * not possible. * * Return: The pointer formerly associated with this ID. */ void *idr_remove(struct idr *idr, unsigned long id) { return radix_tree_delete_item(&idr->idr_rt, id - idr->idr_base, NULL); } EXPORT_SYMBOL_GPL(idr_remove); /** * idr_find() - Return pointer for given ID. * @idr: IDR handle. * @id: Pointer ID. * * Looks up the pointer associated with this ID. A %NULL pointer may * indicate that @id is not allocated or that the %NULL pointer was * associated with this ID. * * This function can be called under rcu_read_lock(), given that the leaf * pointers lifetimes are correctly managed. * * Return: The pointer associated with this ID. */ void *idr_find(const struct idr *idr, unsigned long id) { return radix_tree_lookup(&idr->idr_rt, id - idr->idr_base); } EXPORT_SYMBOL_GPL(idr_find); /** * idr_for_each() - Iterate through all stored pointers. * @idr: IDR handle. * @fn: Function to be called for each pointer. * @data: Data passed to callback function. * * The callback function will be called for each entry in @idr, passing * the ID, the entry and @data. * * If @fn returns anything other than %0, the iteration stops and that * value is returned from this function. * * idr_for_each() can be called concurrently with idr_alloc() and * idr_remove() if protected by RCU. Newly added entries may not be * seen and deleted entries may be seen, but adding and removing entries * will not cause other entries to be skipped, nor spurious ones to be seen. */ int idr_for_each(const struct idr *idr, int (*fn)(int id, void *p, void *data), void *data) { struct radix_tree_iter iter; void __rcu **slot; int base = idr->idr_base; radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, 0) { int ret; unsigned long id = iter.index + base; if (WARN_ON_ONCE(id > INT_MAX)) break; ret = fn(id, rcu_dereference_raw(*slot), data); if (ret) return ret; } return 0; } EXPORT_SYMBOL(idr_for_each); /** * idr_get_next_ul() - Find next populated entry. * @idr: IDR handle. * @nextid: Pointer to an ID. * * Returns the next populated entry in the tree with an ID greater than * or equal to the value pointed to by @nextid. On exit, @nextid is updated * to the ID of the found value. To use in a loop, the value pointed to by * nextid must be incremented by the user. */ void *idr_get_next_ul(struct idr *idr, unsigned long *nextid) { struct radix_tree_iter iter; void __rcu **slot; void *entry = NULL; unsigned long base = idr->idr_base; unsigned long id = *nextid; id = (id < base) ? 0 : id - base; radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, id) { entry = rcu_dereference_raw(*slot); if (!entry) continue; if (!xa_is_internal(entry)) break; if (slot != &idr->idr_rt.xa_head && !xa_is_retry(entry)) break; slot = radix_tree_iter_retry(&iter); } if (!slot) return NULL; *nextid = iter.index + base; return entry; } EXPORT_SYMBOL(idr_get_next_ul); /** * idr_get_next() - Find next populated entry. * @idr: IDR handle. * @nextid: Pointer to an ID. * * Returns the next populated entry in the tree with an ID greater than * or equal to the value pointed to by @nextid. On exit, @nextid is updated * to the ID of the found value. To use in a loop, the value pointed to by * nextid must be incremented by the user. 
*/ void *idr_get_next(struct idr *idr, int *nextid) { unsigned long id = *nextid; void *entry = idr_get_next_ul(idr, &id); if (WARN_ON_ONCE(id > INT_MAX)) return NULL; *nextid = id; return entry; } EXPORT_SYMBOL(idr_get_next); /** * idr_replace() - replace pointer for given ID. * @idr: IDR handle. * @ptr: New pointer to associate with the ID. * @id: ID to change. * * Replace the pointer registered with an ID and return the old value. * This function can be called under the RCU read lock concurrently with * idr_alloc() and idr_remove() (as long as the ID being removed is not * the one being replaced!). * * Returns: the old value on success. %-ENOENT indicates that @id was not * found. %-EINVAL indicates that @ptr was not valid. */ void *idr_replace(struct idr *idr, void *ptr, unsigned long id) { struct radix_tree_node *node; void __rcu **slot = NULL; void *entry; id -= idr->idr_base; entry = __radix_tree_lookup(&idr->idr_rt, id, &node, &slot); if (!slot || radix_tree_tag_get(&idr->idr_rt, id, IDR_FREE)) return ERR_PTR(-ENOENT); __radix_tree_replace(&idr->idr_rt, node, slot, ptr); return entry; } EXPORT_SYMBOL(idr_replace); /** * DOC: IDA description * * The IDA is an ID allocator which does not provide the ability to * associate an ID with a pointer. As such, it only needs to store one * bit per ID, and so is more space efficient than an IDR. To use an IDA, * define it using DEFINE_IDA() (or embed a &struct ida in a data structure, * then initialise it using ida_init()). To allocate a new ID, call * ida_alloc(), ida_alloc_min(), ida_alloc_max() or ida_alloc_range(). * To free an ID, call ida_free(). * * ida_destroy() can be used to dispose of an IDA without needing to * free the individual IDs in it. You can use ida_is_empty() to find * out whether the IDA has any IDs currently allocated. * * The IDA handles its own locking. It is safe to call any of the IDA * functions without synchronisation in your code. * * IDs are currently limited to the range [0-INT_MAX]. If this is an awkward * limitation, it should be quite straightforward to raise the maximum. */ /* * Developer's notes: * * The IDA uses the functionality provided by the XArray to store bitmaps in * each entry. The XA_FREE_MARK is only cleared when all bits in the bitmap * have been set. * * I considered telling the XArray that each slot is an order-10 node * and indexing by bit number, but the XArray can't allow a single multi-index * entry in the head, which would significantly increase memory consumption * for the IDA. So instead we divide the index by the number of bits in the * leaf bitmap before doing a radix tree lookup. * * As an optimisation, if there are only a few low bits set in any given * leaf, instead of allocating a 128-byte bitmap, we store the bits * as a value entry. Value entries never have the XA_FREE_MARK cleared * because we can always convert them into a bitmap entry. * * It would be possible to optimise further; once we've run out of a * single 128-byte bitmap, we currently switch to a 576-byte node, put * the 128-byte bitmap in the first entry and then start allocating extra * 128-byte entries. We could instead use the 512 bytes of the node's * data as a bitmap before moving to that scheme. I do not believe this * is a worthwhile optimisation; Rasmus Villemoes surveyed the current * users of the IDA and almost none of them use more than 1024 entries. * Those that do use more than the 8192 IDs that the 512 bytes would * provide. * * The IDA always uses a lock to alloc/free. 
If we add a 'test_bit' * equivalent, it will still need locking. Going to RCU lookup would require * using RCU to free bitmaps, and that's not trivial without embedding an * RCU head in the bitmap, which adds a 2-pointer overhead to each 128-byte * bitmap, which is excessive. */ /** * ida_alloc_range() - Allocate an unused ID. * @ida: IDA handle. * @min: Lowest ID to allocate. * @max: Highest ID to allocate. * @gfp: Memory allocation flags. * * Allocate an ID between @min and @max, inclusive. The allocated ID will * not exceed %INT_MAX, even if @max is larger. * * Context: Any context. It is safe to call this function without * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ int ida_alloc_range(struct ida *ida, unsigned int min, unsigned int max, gfp_t gfp) { XA_STATE(xas, &ida->xa, min / IDA_BITMAP_BITS); unsigned bit = min % IDA_BITMAP_BITS; unsigned long flags; struct ida_bitmap *bitmap, *alloc = NULL; if ((int)min < 0) return -ENOSPC; if ((int)max < 0) max = INT_MAX; retry: xas_lock_irqsave(&xas, flags); next: bitmap = xas_find_marked(&xas, max / IDA_BITMAP_BITS, XA_FREE_MARK); if (xas.xa_index > min / IDA_BITMAP_BITS) bit = 0; if (xas.xa_index * IDA_BITMAP_BITS + bit > max) goto nospc; if (xa_is_value(bitmap)) { unsigned long tmp = xa_to_value(bitmap); if (bit < BITS_PER_XA_VALUE) { bit = find_next_zero_bit(&tmp, BITS_PER_XA_VALUE, bit); if (xas.xa_index * IDA_BITMAP_BITS + bit > max) goto nospc; if (bit < BITS_PER_XA_VALUE) { tmp |= 1UL << bit; xas_store(&xas, xa_mk_value(tmp)); goto out; } } bitmap = alloc; if (!bitmap) bitmap = kzalloc_obj(*bitmap, GFP_NOWAIT); if (!bitmap) goto alloc; bitmap->bitmap[0] = tmp; xas_store(&xas, bitmap); if (xas_error(&xas)) { bitmap->bitmap[0] = 0; goto out; } } if (bitmap) { bit = find_next_zero_bit(bitmap->bitmap, IDA_BITMAP_BITS, bit); if (xas.xa_index * IDA_BITMAP_BITS + bit > max) goto nospc; if (bit == IDA_BITMAP_BITS) goto next; __set_bit(bit, bitmap->bitmap); if (bitmap_full(bitmap->bitmap, IDA_BITMAP_BITS)) xas_clear_mark(&xas, XA_FREE_MARK); } else { if (bit < BITS_PER_XA_VALUE) { bitmap = xa_mk_value(1UL << bit); } else { bitmap = alloc; if (!bitmap) bitmap = kzalloc_obj(*bitmap, GFP_NOWAIT); if (!bitmap) goto alloc; __set_bit(bit, bitmap->bitmap); } xas_store(&xas, bitmap); } out: xas_unlock_irqrestore(&xas, flags); if (xas_nomem(&xas, gfp)) { xas.xa_index = min / IDA_BITMAP_BITS; bit = min % IDA_BITMAP_BITS; goto retry; } if (bitmap != alloc) kfree(alloc); if (xas_error(&xas)) return xas_error(&xas); return xas.xa_index * IDA_BITMAP_BITS + bit; alloc: xas_unlock_irqrestore(&xas, flags); alloc = kzalloc_obj(*bitmap, gfp); if (!alloc) return -ENOMEM; xas_set(&xas, min / IDA_BITMAP_BITS); bit = min % IDA_BITMAP_BITS; goto retry; nospc: xas_unlock_irqrestore(&xas, flags); kfree(alloc); return -ENOSPC; } EXPORT_SYMBOL(ida_alloc_range); /** * ida_find_first_range - Get the lowest used ID. * @ida: IDA handle. * @min: Lowest ID to get. * @max: Highest ID to get. * * Get the lowest used ID between @min and @max, inclusive. The returned * ID will not exceed %INT_MAX, even if @max is larger. * * Context: Any context. Takes and releases the xa_lock. * Return: The lowest used ID, or errno if no used ID is found. 
*/ int ida_find_first_range(struct ida *ida, unsigned int min, unsigned int max) { unsigned long index = min / IDA_BITMAP_BITS; unsigned int offset = min % IDA_BITMAP_BITS; unsigned long *addr, size, bit; unsigned long tmp = 0; unsigned long flags; void *entry; int ret; if ((int)min < 0) return -EINVAL; if ((int)max < 0) max = INT_MAX; xa_lock_irqsave(&ida->xa, flags); entry = xa_find(&ida->xa, &index, max / IDA_BITMAP_BITS, XA_PRESENT); if (!entry) { ret = -ENOENT; goto err_unlock; } if (index > min / IDA_BITMAP_BITS) offset = 0; if (index * IDA_BITMAP_BITS + offset > max) { ret = -ENOENT; goto err_unlock; } if (xa_is_value(entry)) { tmp = xa_to_value(entry); addr = &tmp; size = BITS_PER_XA_VALUE; } else { addr = ((struct ida_bitmap *)entry)->bitmap; size = IDA_BITMAP_BITS; } bit = find_next_bit(addr, size, offset); xa_unlock_irqrestore(&ida->xa, flags); if (bit == size || index * IDA_BITMAP_BITS + bit > max) return -ENOENT; return index * IDA_BITMAP_BITS + bit; err_unlock: xa_unlock_irqrestore(&ida->xa, flags); return ret; } EXPORT_SYMBOL(ida_find_first_range); /** * ida_free() - Release an allocated ID. * @ida: IDA handle. * @id: Previously allocated ID. * * Context: Any context. It is safe to call this function without * locking in your code. */ void ida_free(struct ida *ida, unsigned int id) { XA_STATE(xas, &ida->xa, id / IDA_BITMAP_BITS); unsigned bit = id % IDA_BITMAP_BITS; struct ida_bitmap *bitmap; unsigned long flags; if ((int)id < 0) return; xas_lock_irqsave(&xas, flags); bitmap = xas_load(&xas); if (xa_is_value(bitmap)) { unsigned long v = xa_to_value(bitmap); if (bit >= BITS_PER_XA_VALUE) goto err; if (!(v & (1UL << bit))) goto err; v &= ~(1UL << bit); if (!v) goto delete; xas_store(&xas, xa_mk_value(v)); } else { if (!bitmap || !test_bit(bit, bitmap->bitmap)) goto err; __clear_bit(bit, bitmap->bitmap); xas_set_mark(&xas, XA_FREE_MARK); if (bitmap_empty(bitmap->bitmap, IDA_BITMAP_BITS)) { kfree(bitmap); delete: xas_store(&xas, NULL); } } xas_unlock_irqrestore(&xas, flags); return; err: xas_unlock_irqrestore(&xas, flags); WARN(1, "ida_free called for id=%d which is not allocated.\n", id); } EXPORT_SYMBOL(ida_free); /** * ida_destroy() - Free all IDs. * @ida: IDA handle. * * Calling this function frees all IDs and releases all resources used * by an IDA. When this call returns, the IDA is empty and can be reused * or freed. If the IDA is already empty, there is no need to call this * function. * * Context: Any context. It is safe to call this function without * locking in your code. 
*/ void ida_destroy(struct ida *ida) { XA_STATE(xas, &ida->xa, 0); struct ida_bitmap *bitmap; unsigned long flags; xas_lock_irqsave(&xas, flags); xas_for_each(&xas, bitmap, ULONG_MAX) { if (!xa_is_value(bitmap)) kfree(bitmap); xas_store(&xas, NULL); } xas_unlock_irqrestore(&xas, flags); } EXPORT_SYMBOL(ida_destroy); #ifndef __KERNEL__ extern void xa_dump_index(unsigned long index, unsigned int shift); #define IDA_CHUNK_SHIFT ilog2(IDA_BITMAP_BITS) static void ida_dump_entry(void *entry, unsigned long index) { unsigned long i; if (!entry) return; if (xa_is_node(entry)) { struct xa_node *node = xa_to_node(entry); unsigned int shift = node->shift + IDA_CHUNK_SHIFT + XA_CHUNK_SHIFT; xa_dump_index(index * IDA_BITMAP_BITS, shift); xa_dump_node(node); for (i = 0; i < XA_CHUNK_SIZE; i++) ida_dump_entry(node->slots[i], index | (i << node->shift)); } else if (xa_is_value(entry)) { xa_dump_index(index * IDA_BITMAP_BITS, ilog2(BITS_PER_LONG)); pr_cont("value: data %lx [%px]\n", xa_to_value(entry), entry); } else { struct ida_bitmap *bitmap = entry; xa_dump_index(index * IDA_BITMAP_BITS, IDA_CHUNK_SHIFT); pr_cont("bitmap: %p data", bitmap); for (i = 0; i < IDA_BITMAP_LONGS; i++) pr_cont(" %lx", bitmap->bitmap[i]); pr_cont("\n"); } } static void ida_dump(struct ida *ida) { struct xarray *xa = &ida->xa; pr_debug("ida: %p node %p free %d\n", ida, xa->xa_head, xa->xa_flags >> ROOT_TAG_SHIFT); ida_dump_entry(xa->xa_head, 0); } #endif |
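To make the developer's notes above concrete, here is a small user-space sketch (not kernel code) of how an IDA id maps onto the XArray slot and bit position used in ida_alloc_range()/ida_free(). The constants assume a 64-bit build: 128-byte bitmaps (1024 ids per slot) and 63 usable bits in an inline value entry.

/*
 * Illustrative sketch: IDA id -> XArray slot index + bit position,
 * and whether the slot could still be kept as an inline value entry.
 */
#include <stdio.h>

#define IDA_BITMAP_BYTES	128
#define IDA_BITMAP_BITS		(IDA_BITMAP_BYTES * 8)	/* 1024 on 64-bit */
#define BITS_PER_XA_VALUE	63			/* 64-bit long minus the tag bit */

int main(void)
{
	unsigned int ids[] = { 0, 62, 63, 1023, 1024, 5000 };

	for (unsigned int i = 0; i < sizeof(ids) / sizeof(ids[0]); i++) {
		unsigned int id = ids[i];
		unsigned long index = id / IDA_BITMAP_BITS;	/* XArray slot */
		unsigned int bit = id % IDA_BITMAP_BITS;	/* bit within slot */

		printf("id %4u -> slot %lu, bit %4u, %s\n", id, index, bit,
		       bit < BITS_PER_XA_VALUE ?
		       "fits in a value entry" : "needs a bitmap");
	}
	return 0;
}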
#undef TRACE_SYSTEM #define TRACE_SYSTEM qdisc #if !defined(_TRACE_QDISC_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_QDISC_H #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/tracepoint.h> #include <linux/ftrace.h> #include <linux/pkt_sched.h> #include <net/sch_generic.h> TRACE_EVENT(qdisc_dequeue, TP_PROTO(struct Qdisc *qdisc, const struct netdev_queue *txq, int packets, struct sk_buff *skb), TP_ARGS(qdisc, txq, packets, skb), TP_STRUCT__entry( __field( struct Qdisc *, qdisc ) __field(const struct netdev_queue *, txq ) __field( int, packets ) __field( void *, skbaddr ) __field( int, ifindex ) __field( u32, handle ) __field( u32, parent ) __field( unsigned long, txq_state) ), /* skb==NULL indicates packets dequeued was 0, even when packets==1 */ TP_fast_assign( __entry->qdisc = qdisc; __entry->txq = txq; __entry->packets = skb ? packets : 0; __entry->skbaddr = skb; __entry->ifindex = txq->dev ? txq->dev->ifindex : 0; __entry->handle = qdisc->handle; __entry->parent = qdisc->parent; __entry->txq_state = txq->state; ), TP_printk("dequeue ifindex=%d qdisc handle=0x%X parent=0x%X txq_state=0x%lX packets=%d skbaddr=%p", __entry->ifindex, __entry->handle, __entry->parent, __entry->txq_state, __entry->packets, __entry->skbaddr ) ); TRACE_EVENT(qdisc_enqueue, TP_PROTO(struct Qdisc *qdisc, const struct netdev_queue *txq, struct sk_buff *skb), TP_ARGS(qdisc, txq, skb), TP_STRUCT__entry( __field(struct Qdisc *, qdisc) __field(const struct netdev_queue *, txq) __field(void *, skbaddr) __field(int, ifindex) __field(u32, handle) __field(u32, parent) ), TP_fast_assign( __entry->qdisc = qdisc; __entry->txq = txq; __entry->skbaddr = skb; __entry->ifindex = txq->dev ? txq->dev->ifindex : 0; __entry->handle = qdisc->handle; __entry->parent = qdisc->parent; ), TP_printk("enqueue ifindex=%d qdisc handle=0x%X parent=0x%X skbaddr=%p", __entry->ifindex, __entry->handle, __entry->parent, __entry->skbaddr) ); #undef FN #undef FNe #define FN(reason) TRACE_DEFINE_ENUM(QDISC_DROP_##reason); #define FNe(reason) TRACE_DEFINE_ENUM(QDISC_DROP_##reason); DEFINE_QDISC_DROP_REASON(FN, FNe) #undef FN #undef FNe #define FN(reason) { QDISC_DROP_##reason, #reason }, #define FNe(reason) { QDISC_DROP_##reason, #reason } TRACE_EVENT(qdisc_drop, TP_PROTO(struct Qdisc *qdisc, const struct netdev_queue *txq, struct net_device *dev, struct sk_buff *skb, enum qdisc_drop_reason reason), TP_ARGS(qdisc, txq, dev, skb, reason), TP_STRUCT__entry( __field(struct Qdisc *, qdisc) __field(const struct netdev_queue *, txq) __field(void *, skbaddr) __field(int, ifindex) __field(u32, handle) __field(u32, parent) __field(enum qdisc_drop_reason, reason) __string(kind, qdisc->ops->id) ), TP_fast_assign( __entry->qdisc = qdisc; __entry->txq = txq; __entry->skbaddr = skb; __entry->ifindex = dev ?
dev->ifindex : 0; __entry->handle = qdisc->handle; __entry->parent = qdisc->parent; __entry->reason = reason; __assign_str(kind); ), TP_printk("drop ifindex=%d kind=%s handle=0x%X parent=0x%X skbaddr=%p reason=%s", __entry->ifindex, __get_str(kind), __entry->handle, __entry->parent, __entry->skbaddr, __print_symbolic(__entry->reason, DEFINE_QDISC_DROP_REASON(FN, FNe))) ); #undef FN #undef FNe TRACE_EVENT(qdisc_reset, TP_PROTO(struct Qdisc *q), TP_ARGS(q), TP_STRUCT__entry( __string( dev, qdisc_dev(q) ? qdisc_dev(q)->name : "(null)" ) __string( kind, q->ops->id ) __field( u32, parent ) __field( u32, handle ) ), TP_fast_assign( __assign_str(dev); __assign_str(kind); __entry->parent = q->parent; __entry->handle = q->handle; ), TP_printk("dev=%s kind=%s parent=%x:%x handle=%x:%x", __get_str(dev), __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent), TC_H_MAJ(__entry->handle) >> 16, TC_H_MIN(__entry->handle)) ); TRACE_EVENT(qdisc_destroy, TP_PROTO(struct Qdisc *q), TP_ARGS(q), TP_STRUCT__entry( __string( dev, qdisc_dev(q)->name ) __string( kind, q->ops->id ) __field( u32, parent ) __field( u32, handle ) ), TP_fast_assign( __assign_str(dev); __assign_str(kind); __entry->parent = q->parent; __entry->handle = q->handle; ), TP_printk("dev=%s kind=%s parent=%x:%x handle=%x:%x", __get_str(dev), __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent), TC_H_MAJ(__entry->handle) >> 16, TC_H_MIN(__entry->handle)) ); TRACE_EVENT(qdisc_create, TP_PROTO(const struct Qdisc_ops *ops, struct net_device *dev, u32 parent), TP_ARGS(ops, dev, parent), TP_STRUCT__entry( __string( dev, dev->name ) __string( kind, ops->id ) __field( u32, parent ) ), TP_fast_assign( __assign_str(dev); __assign_str(kind); __entry->parent = parent; ), TP_printk("dev=%s kind=%s parent=%x:%x", __get_str(dev), __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent)) ); #endif /* _TRACE_QDISC_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
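The TP_printk() formats above render qdisc handles and parents in the usual major:minor notation. Below is a small user-space sketch (not kernel code) of that split; the TC_H_MAJ()/TC_H_MIN() masks are copied here from the uapi <linux/pkt_sched.h> definitions as an assumption.

/*
 * Illustrative sketch: turn a 32-bit tc handle into "major:minor",
 * matching the "%x:%x" TP_printk() output above.
 */
#include <stdint.h>
#include <stdio.h>

#define TC_H_MAJ(h)	((h) & 0xFFFF0000U)
#define TC_H_MIN(h)	((h) & 0x0000FFFFU)

static void print_handle(const char *what, uint32_t h)
{
	/* Matches the trace format: e.g. "parent=%x:%x handle=%x:%x" */
	printf("%s=%x:%x\n", what, TC_H_MAJ(h) >> 16, TC_H_MIN(h));
}

int main(void)
{
	print_handle("handle", 0x8001U << 16);	/* qdisc "8001:" -> "8001:0" */
	print_handle("parent", 0xFFFFFFFFU);	/* TC_H_ROOT -> "ffff:ffff" */
	return 0;
}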
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CPUFEATURE_H #define _ASM_X86_CPUFEATURE_H #include <asm/processor.h> #if defined(__KERNEL__) && !defined(__ASSEMBLER__) #include <asm/asm.h> #include <linux/bitops.h> #include <asm/alternative.h> #include <asm/cpufeaturemasks.h> enum cpuid_leafs { CPUID_1_EDX = 0, CPUID_8000_0001_EDX, CPUID_8086_0001_EDX, CPUID_LNX_1, CPUID_1_ECX, CPUID_C000_0001_EDX, CPUID_8000_0001_ECX, CPUID_LNX_2, CPUID_LNX_3, CPUID_7_0_EBX, CPUID_D_1_EAX, CPUID_LNX_4, CPUID_7_1_EAX, CPUID_8000_0008_EBX, CPUID_6_EAX, CPUID_8000_000A_EDX, CPUID_7_ECX, CPUID_LNX_6, CPUID_7_EDX, CPUID_8000_001F_EAX, CPUID_8000_0021_EAX, CPUID_LNX_5, NR_CPUID_WORDS, }; extern const char * const x86_cap_flags[NCAPINTS*32]; extern const char * const x86_power_flags[32]; /* * In order to save room, we index into this array by doing * X86_BUG_<name> - NCAPINTS*32. */ extern const char * const x86_bug_flags[NBUGINTS*32]; #define x86_bug_flag(flag) x86_bug_flags[flag] #define test_cpu_cap(c, bit) \ arch_test_bit(bit, (unsigned long *)((c)->x86_capability)) #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ test_cpu_cap(c, bit)) #define this_cpu_has(bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ x86_this_cpu_test_bit(bit, cpu_info.x86_capability)) /* * This is the default CPU features testing macro to use in code. * * It is for detection of features which need kernel infrastructure to be * used. It may *not* directly test the CPU itself. Use the cpu_has() family * if you want true runtime testing of CPU features, like in hypervisor code * where you are supporting a possible guest feature where host support for it * is not relevant. */ #define cpu_feature_enabled(bit) \ (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit)) #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) #define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) extern void setup_clear_cpu_cap(unsigned int bit); extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); void check_cpufeature_deps(struct cpuinfo_x86 *c); #define setup_force_cpu_cap(bit) do { \ \ if (!boot_cpu_has(bit)) \ WARN_ON(alternatives_patched); \ \ set_cpu_cap(&boot_cpu_data, bit); \ set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) #define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) /* * Do not use an "m" constraint for [cap_byte] here: gcc doesn't know * that this is only used on a fallback path and will sometimes cause * it to manifest the address of boot_cpu_data in a register, fouling * the mainline (post-initialization) code.
*/ static __always_inline bool _static_cpu_has(u16 bit) { asm goto(ALTERNATIVE_TERNARY("jmp 6f", %c[feature], "", "jmp %l[t_no]") ".pushsection .altinstr_aux,\"ax\"\n" "6:\n" ANNOTATE_DATA_SPECIAL "\n" " testb %[bitnum], %a[cap_byte]\n" " jnz %l[t_yes]\n" " jmp %l[t_no]\n" ".popsection\n" : : [feature] "i" (bit), [bitnum] "i" (1 << (bit & 7)), [cap_byte] "i" (&((const char *)boot_cpu_data.x86_capability)[bit >> 3]) : : t_yes, t_no); t_yes: return true; t_no: return false; } #define static_cpu_has(bit) \ ( \ __builtin_constant_p(boot_cpu_has(bit)) ? \ boot_cpu_has(bit) : \ _static_cpu_has(bit) \ ) #define cpu_has_bug(c, bit) cpu_has(c, (bit)) #define set_cpu_bug(c, bit) set_cpu_cap(c, (bit)) #define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit)) #define static_cpu_has_bug(bit) static_cpu_has((bit)) #define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit)) #define boot_cpu_set_bug(bit) set_cpu_cap(&boot_cpu_data, (bit)) #define MAX_CPU_FEATURES (NCAPINTS * 32) #define cpu_have_feature boot_cpu_has #define CPU_FEATURE_TYPEFMT "x86,ven%04Xfam%04Xmod%04X" #define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \ boot_cpu_data.x86_model #endif /* defined(__KERNEL__) && !defined(__ASSEMBLER__) */ #endif /* _ASM_X86_CPUFEATURE_H */ |
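As a rough illustration of the two addressing schemes in this header, the user-space sketch below (not kernel code) sets a capability bit the way set_cpu_cap() does (32-bit words, bit / 32 and bit % 32) and tests it the way the _static_cpu_has() asm does (bytes, bit >> 3 and 1 << (bit & 7)). The NCAPINTS value and the feature number are assumptions made up for the example, and the byte-level view relies on a little-endian layout as on x86.

/*
 * Illustrative sketch: word-granular vs. byte-granular addressing of
 * the same capability bitmap.  Assumes little-endian storage.
 */
#include <stdint.h>
#include <stdio.h>

#define NCAPINTS	22	/* assumption for this sketch only */

int main(void)
{
	uint32_t x86_capability[NCAPINTS] = { 0 };
	unsigned int bit = 9 * 32 + 20;		/* hypothetical: word 9, bit 20 */

	/* word-granular, as set_cpu_cap() effectively does */
	x86_capability[bit / 32] |= 1U << (bit % 32);

	/* byte-granular, as the _static_cpu_has() fallback asm does */
	const unsigned char *bytes = (const unsigned char *)x86_capability;
	int set = bytes[bit >> 3] & (1U << (bit & 7));

	printf("word %u bit %u -> byte %u mask 0x%02x -> %s\n",
	       bit / 32, bit % 32, bit >> 3, 1U << (bit & 7),
	       set ? "set" : "clear");
	return 0;
}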
// SPDX-License-Identifier: GPL-2.0-or-later /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 * Phillip Lougher <phillip@squashfs.org.uk> * * inode.c */ /* * This file implements code to create and read inodes from disk. * * Inodes in Squashfs are identified by a 48-bit inode which encodes the * location of the compressed metadata block containing the inode, and the byte * offset into that block where the inode is placed (<block, offset>). * * To maximise compression there are different inodes for each file type * (regular file, directory, device, etc.), the inode contents and length * varying with the type. * * To further maximise compression, two types of regular file inode and * directory inode are defined: inodes optimised for frequently occurring * regular files and directories, and extended types where extra * information has to be stored. */ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/xattr.h> #include <linux/pagemap.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" #include "xattr.h" /* * Initialise VFS inode with the base inode information common to all * Squashfs inode types. Sqsh_ino contains the unswapped base inode * off disk.
*/ static int squashfs_new_inode(struct super_block *sb, struct inode *inode, struct squashfs_base_inode *sqsh_ino) { uid_t i_uid; gid_t i_gid; int err; inode->i_ino = le32_to_cpu(sqsh_ino->inode_number); if (inode->i_ino == 0) return -EINVAL; err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &i_uid); if (err) return err; err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->guid), &i_gid); if (err) return err; i_uid_write(inode, i_uid); i_gid_write(inode, i_gid); inode_set_mtime(inode, le32_to_cpu(sqsh_ino->mtime), 0); inode_set_atime(inode, inode_get_mtime_sec(inode), 0); inode_set_ctime(inode, inode_get_mtime_sec(inode), 0); inode->i_mode = le16_to_cpu(sqsh_ino->mode); inode->i_size = 0; /* File type must not be set at this moment, for it will later be set by the caller. */ if (inode->i_mode & S_IFMT) err = -EIO; return err; } struct inode *squashfs_iget(struct super_block *sb, long long ino, unsigned int ino_number) { struct inode *inode = iget_locked(sb, ino_number); int err; TRACE("Entered squashfs_iget\n"); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode_state_read_once(inode) & I_NEW)) return inode; err = squashfs_read_inode(inode, ino); if (err) { iget_failed(inode); return ERR_PTR(err); } unlock_new_inode(inode); return inode; } /* * Initialise VFS inode by reading inode from inode table (compressed * metadata). The format and amount of data read depends on type. */ int squashfs_read_inode(struct inode *inode, long long ino) { struct super_block *sb = inode->i_sb; struct squashfs_sb_info *msblk = sb->s_fs_info; u64 block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table; int err, type, offset = SQUASHFS_INODE_OFFSET(ino); union squashfs_inode squashfs_ino; struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base; int xattr_id = SQUASHFS_INVALID_XATTR; TRACE("Entered squashfs_read_inode\n"); /* * Read inode base common to all inode types. 
*/ err = squashfs_read_metadata(sb, sqshb_ino, &block, &offset, sizeof(*sqshb_ino)); if (err < 0) goto failed_read; err = squashfs_new_inode(sb, inode, sqshb_ino); if (err) goto failed_read; block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table; offset = SQUASHFS_INODE_OFFSET(ino); type = le16_to_cpu(sqshb_ino->inode_type); switch (type) { case SQUASHFS_REG_TYPE: { unsigned int frag_offset, frag; int frag_size; u64 frag_blk; struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; inode->i_size = le32_to_cpu(sqsh_ino->file_size); frag = le32_to_cpu(sqsh_ino->fragment); if (frag != SQUASHFS_INVALID_FRAG) { /* * the file cannot have a fragment (tailend) and have a * file size a multiple of the block size */ if ((inode->i_size & (msblk->block_size - 1)) == 0) { err = -EINVAL; goto failed_read; } frag_offset = le32_to_cpu(sqsh_ino->offset); frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); if (frag_size < 0) { err = frag_size; goto failed_read; } } else { frag_blk = SQUASHFS_INVALID_BLK; frag_size = 0; frag_offset = 0; } set_nlink(inode, 1); inode->i_fop = &squashfs_file_operations; inode->i_mode |= S_IFREG; inode->i_blocks = ((inode->i_size - 1) >> 9) + 1; squashfs_i(inode)->fragment_block = frag_blk; squashfs_i(inode)->fragment_size = frag_size; squashfs_i(inode)->fragment_offset = frag_offset; squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); squashfs_i(inode)->block_list_start = block; squashfs_i(inode)->offset = offset; squashfs_i(inode)->parent = 0; inode->i_data.a_ops = &squashfs_aops; TRACE("File inode %x:%x, start_block %llx, block_list_start " "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino), offset, squashfs_i(inode)->start, block, offset); break; } case SQUASHFS_LREG_TYPE: { unsigned int frag_offset, frag; int frag_size; u64 frag_blk; struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; inode->i_size = le64_to_cpu(sqsh_ino->file_size); if (inode->i_size < 0) { err = -EINVAL; goto failed_read; } frag = le32_to_cpu(sqsh_ino->fragment); if (frag != SQUASHFS_INVALID_FRAG) { /* * the file cannot have a fragment (tailend) and have a * file size a multiple of the block size */ if ((inode->i_size & (msblk->block_size - 1)) == 0) { err = -EINVAL; goto failed_read; } frag_offset = le32_to_cpu(sqsh_ino->offset); frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); if (frag_size < 0) { err = frag_size; goto failed_read; } } else { frag_blk = SQUASHFS_INVALID_BLK; frag_size = 0; frag_offset = 0; } xattr_id = le32_to_cpu(sqsh_ino->xattr); set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_op = &squashfs_inode_ops; inode->i_fop = &squashfs_file_operations; inode->i_mode |= S_IFREG; inode->i_blocks = (inode->i_size - le64_to_cpu(sqsh_ino->sparse) + 511) >> 9; squashfs_i(inode)->fragment_block = frag_blk; squashfs_i(inode)->fragment_size = frag_size; squashfs_i(inode)->fragment_offset = frag_offset; squashfs_i(inode)->start = le64_to_cpu(sqsh_ino->start_block); squashfs_i(inode)->block_list_start = block; squashfs_i(inode)->offset = offset; squashfs_i(inode)->parent = 0; inode->i_data.a_ops = &squashfs_aops; TRACE("File inode %x:%x, start_block %llx, block_list_start " "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino), offset, squashfs_i(inode)->start, block, offset); break; } case SQUASHFS_DIR_TYPE: { struct squashfs_dir_inode *sqsh_ino = &squashfs_ino.dir; 
err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le16_to_cpu(sqsh_ino->file_size); inode->i_op = &squashfs_dir_inode_ops; inode->i_fop = &squashfs_dir_ops; inode->i_mode |= S_IFDIR; squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset); squashfs_i(inode)->dir_idx_cnt = 0; squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode); TRACE("Directory inode %x:%x, start_block %llx, offset %x\n", SQUASHFS_INODE_BLK(ino), offset, squashfs_i(inode)->start, le16_to_cpu(sqsh_ino->offset)); break; } case SQUASHFS_LDIR_TYPE: { struct squashfs_ldir_inode *sqsh_ino = &squashfs_ino.ldir; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; xattr_id = le32_to_cpu(sqsh_ino->xattr); set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le32_to_cpu(sqsh_ino->file_size); inode->i_op = &squashfs_dir_inode_ops; inode->i_fop = &squashfs_dir_ops; inode->i_mode |= S_IFDIR; squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset); squashfs_i(inode)->dir_idx_start = block; squashfs_i(inode)->dir_idx_offset = offset; squashfs_i(inode)->dir_idx_cnt = le16_to_cpu(sqsh_ino->i_count); squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode); TRACE("Long directory inode %x:%x, start_block %llx, offset " "%x\n", SQUASHFS_INODE_BLK(ino), offset, squashfs_i(inode)->start, le16_to_cpu(sqsh_ino->offset)); break; } case SQUASHFS_SYMLINK_TYPE: case SQUASHFS_LSYMLINK_TYPE: { struct squashfs_symlink_inode *sqsh_ino = &squashfs_ino.symlink; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); if (inode->i_size > PAGE_SIZE) { ERROR("Corrupted symlink\n"); return -EINVAL; } set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_op = &squashfs_symlink_inode_ops; inode_nohighmem(inode); inode->i_data.a_ops = &squashfs_symlink_aops; inode->i_mode |= S_IFLNK; squashfs_i(inode)->start = block; squashfs_i(inode)->offset = offset; squashfs_i(inode)->parent = 0; if (type == SQUASHFS_LSYMLINK_TYPE) { __le32 xattr; err = squashfs_read_metadata(sb, NULL, &block, &offset, inode->i_size); if (err < 0) goto failed_read; err = squashfs_read_metadata(sb, &xattr, &block, &offset, sizeof(xattr)); if (err < 0) goto failed_read; xattr_id = le32_to_cpu(xattr); } TRACE("Symbolic link inode %x:%x, start_block %llx, offset " "%x\n", SQUASHFS_INODE_BLK(ino), offset, block, offset); break; } case SQUASHFS_BLKDEV_TYPE: case SQUASHFS_CHRDEV_TYPE: { struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev; unsigned int rdev; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; if (type == SQUASHFS_CHRDEV_TYPE) inode->i_mode |= S_IFCHR; else inode->i_mode |= S_IFBLK; set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); rdev = le32_to_cpu(sqsh_ino->rdev); init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); squashfs_i(inode)->parent = 0; TRACE("Device inode %x:%x, rdev %x\n", SQUASHFS_INODE_BLK(ino), offset, rdev); break; } case SQUASHFS_LBLKDEV_TYPE: case SQUASHFS_LCHRDEV_TYPE: { struct squashfs_ldev_inode *sqsh_ino = &squashfs_ino.ldev; unsigned int rdev; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto 
failed_read; if (type == SQUASHFS_LCHRDEV_TYPE) inode->i_mode |= S_IFCHR; else inode->i_mode |= S_IFBLK; xattr_id = le32_to_cpu(sqsh_ino->xattr); inode->i_op = &squashfs_inode_ops; set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); rdev = le32_to_cpu(sqsh_ino->rdev); init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); squashfs_i(inode)->parent = 0; TRACE("Device inode %x:%x, rdev %x\n", SQUASHFS_INODE_BLK(ino), offset, rdev); break; } case SQUASHFS_FIFO_TYPE: case SQUASHFS_SOCKET_TYPE: { struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; if (type == SQUASHFS_FIFO_TYPE) inode->i_mode |= S_IFIFO; else inode->i_mode |= S_IFSOCK; set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); init_special_inode(inode, inode->i_mode, 0); squashfs_i(inode)->parent = 0; break; } case SQUASHFS_LFIFO_TYPE: case SQUASHFS_LSOCKET_TYPE: { struct squashfs_lipc_inode *sqsh_ino = &squashfs_ino.lipc; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; if (type == SQUASHFS_LFIFO_TYPE) inode->i_mode |= S_IFIFO; else inode->i_mode |= S_IFSOCK; xattr_id = le32_to_cpu(sqsh_ino->xattr); inode->i_op = &squashfs_inode_ops; set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); init_special_inode(inode, inode->i_mode, 0); squashfs_i(inode)->parent = 0; break; } default: ERROR("Unknown inode type %d in squashfs_iget!\n", type); return -EINVAL; } if (xattr_id != SQUASHFS_INVALID_XATTR && msblk->xattr_id_table) { err = squashfs_xattr_lookup(sb, xattr_id, &squashfs_i(inode)->xattr_count, &squashfs_i(inode)->xattr_size, &squashfs_i(inode)->xattr); if (err < 0) goto failed_read; inode->i_blocks += ((squashfs_i(inode)->xattr_size - 1) >> 9) + 1; } else squashfs_i(inode)->xattr_count = 0; return 0; failed_read: ERROR("Unable to read inode 0x%llx\n", ino); return err; } const struct inode_operations squashfs_inode_ops = { .listxattr = squashfs_listxattr }; |
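A tiny user-space sketch (not kernel code) of the <block, offset> encoding mentioned in the file header: the 48-bit inode reference passed to squashfs_iget()/squashfs_read_inode() carries the metadata block location in the upper bits and the byte offset within that block in the low 16 bits. The macros mirror SQUASHFS_INODE_BLK()/SQUASHFS_INODE_OFFSET() from squashfs_fs.h; the sample value is made up.

/*
 * Illustrative sketch: split a squashfs inode reference into its
 * metadata block and byte offset.
 */
#include <stdint.h>
#include <stdio.h>

#define SQUASHFS_INODE_BLK(a)		((unsigned int)((a) >> 16))
#define SQUASHFS_INODE_OFFSET(a)	((unsigned int)((a) & 0xffff))

int main(void)
{
	/* hypothetical reference: block 0x1a2b3c, offset 0x0123 */
	uint64_t ino = ((uint64_t)0x1a2b3c << 16) | 0x0123;

	printf("ino 0x%llx -> metadata block 0x%x, offset 0x%x\n",
	       (unsigned long long)ino,
	       SQUASHFS_INODE_BLK(ino), SQUASHFS_INODE_OFFSET(ino));
	return 0;
}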
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RANDOM_H #define _LINUX_RANDOM_H #include <linux/bug.h> #include <linux/kernel.h> #include <linux/list.h> #include <uapi/linux/random.h> struct notifier_block; void add_device_randomness(const void *buf, size_t len); void __init add_bootloader_randomness(const void *buf, size_t len); void add_input_randomness(unsigned int type, unsigned int code, unsigned int value) __latent_entropy; void add_interrupt_randomness(int irq) __latent_entropy; void add_hwgenerator_randomness(const void *buf, size_t len, size_t entropy, bool sleep_after); static inline void add_latent_entropy(void) { #if defined(LATENT_ENTROPY_PLUGIN) && !defined(__CHECKER__) add_device_randomness((const void *)&latent_entropy, sizeof(latent_entropy)); #else add_device_randomness(NULL, 0); #endif } #if IS_ENABLED(CONFIG_VMGENID) void add_vmfork_randomness(const void *unique_vm_id, size_t len); int register_random_vmfork_notifier(struct notifier_block *nb); int unregister_random_vmfork_notifier(struct notifier_block *nb); #else static inline int register_random_vmfork_notifier(struct notifier_block *nb) { return 0; } static inline int unregister_random_vmfork_notifier(struct notifier_block *nb) { return 0; } #endif void get_random_bytes(void *buf, size_t len); u8 get_random_u8(void); u16 get_random_u16(void); u32 get_random_u32(void); u64 get_random_u64(void); static inline unsigned long get_random_long(void) { #if BITS_PER_LONG == 64 return get_random_u64(); #else return get_random_u32(); #endif } u32 __get_random_u32_below(u32 ceil); /* * Returns a random integer in the interval [0, ceil), with uniform * distribution, suitable for all uses. Fastest when ceil is a constant, but * still fast for variable ceil as well. */ static inline u32 get_random_u32_below(u32 ceil) { if (!__builtin_constant_p(ceil)) return __get_random_u32_below(ceil); /* * For the fast path, below, all operations on ceil are precomputed by * the compiler, so this incurs no overhead for checking pow2, doing * divisions, or branching based on integer size. The resultant * algorithm does traditional reciprocal multiplication (typically * optimized by the compiler into shifts and adds), rejecting samples * whose lower half would indicate a range indivisible by ceil. */ BUILD_BUG_ON_MSG(!ceil, "get_random_u32_below() must take ceil > 0"); if (ceil <= 1) return 0; for (;;) { if (ceil <= 1U << 8) { u32 mult = ceil * get_random_u8(); if (likely(is_power_of_2(ceil) || (u8)mult >= (1U << 8) % ceil)) return mult >> 8; } else if (ceil <= 1U << 16) { u32 mult = ceil * get_random_u16(); if (likely(is_power_of_2(ceil) || (u16)mult >= (1U << 16) % ceil)) return mult >> 16; } else { u64 mult = (u64)ceil * get_random_u32(); if (likely(is_power_of_2(ceil) || (u32)mult >= -ceil % ceil)) return mult >> 32; } } } /* * Returns a random integer in the interval (floor, U32_MAX], with uniform * distribution, suitable for all uses. Fastest when floor is a constant, but * still fast for variable floor as well.
*/ static inline u32 get_random_u32_above(u32 floor) { BUILD_BUG_ON_MSG(__builtin_constant_p(floor) && floor == U32_MAX, "get_random_u32_above() must take floor < U32_MAX"); return floor + 1 + get_random_u32_below(U32_MAX - floor); } /* * Returns a random integer in the interval [floor, ceil], with uniform * distribution, suitable for all uses. Fastest when floor and ceil are * constant, but still fast for variable floor and ceil as well. */ static inline u32 get_random_u32_inclusive(u32 floor, u32 ceil) { BUILD_BUG_ON_MSG(__builtin_constant_p(floor) && __builtin_constant_p(ceil) && (floor > ceil || ceil - floor == U32_MAX), "get_random_u32_inclusive() must take floor <= ceil"); return floor + get_random_u32_below(ceil - floor + 1); } void __init random_init_early(const char *command_line); void __init random_init(void); bool rng_is_initialized(void); int wait_for_random_bytes(void); int execute_with_initialized_rng(struct notifier_block *nb); /* Calls wait_for_random_bytes() and then calls get_random_bytes(buf, nbytes). * Returns the result of the call to wait_for_random_bytes. */ static inline int get_random_bytes_wait(void *buf, size_t nbytes) { int ret = wait_for_random_bytes(); get_random_bytes(buf, nbytes); return ret; } #ifdef CONFIG_SMP int random_prepare_cpu(unsigned int cpu); int random_online_cpu(unsigned int cpu); #endif #ifndef MODULE extern const struct file_operations random_fops, urandom_fops; #endif #endif /* _LINUX_RANDOM_H */ |
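A user-space sketch (not kernel code) of the multiply-and-reject step used by get_random_u32_below() for the 32-bit case. rand32() is a stand-in for get_random_u32() and is not a good random source; only the arithmetic matters here. Rejecting samples whose low 32 bits fall below 2^32 mod ceil keeps the result exactly uniform.

/*
 * Illustrative sketch: bounded random number via reciprocal
 * multiplication with rejection, as in the 32-bit branch above.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint32_t rand32(void)
{
	/* stand-in RNG built from rand(); quality is irrelevant here */
	return ((uint32_t)(rand() & 0xffff) << 16) | (rand() & 0xffff);
}

static uint32_t random_below(uint32_t ceil)
{
	/* ceil must be non-zero */
	for (;;) {
		uint64_t mult = (uint64_t)ceil * rand32();

		/*
		 * (-ceil) % ceil equals 2^32 % ceil in 32-bit arithmetic;
		 * samples whose low half is below that threshold landed in
		 * the short partial bucket and are rejected.
		 */
		if ((uint32_t)mult >= (-ceil) % ceil)
			return mult >> 32;
	}
}

int main(void)
{
	unsigned int counts[6] = { 0 };

	for (int i = 0; i < 600000; i++)
		counts[random_below(6)]++;
	for (int i = 0; i < 6; i++)
		printf("%d: %u\n", i, counts[i]);	/* roughly 100000 each */
	return 0;
}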
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/indirect.c * * from * * linux/fs/ext4/inode.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/inode.c * * Copyright (C) 1991, 1992 Linus Torvalds * * Goal-directed block allocation by Stephen Tweedie * (sct@redhat.com), 1993, 1998 */ #include "ext4_jbd2.h" #include "truncate.h" #include <linux/dax.h> #include <linux/uio.h> #include <trace/events/ext4.h> typedef struct { __le32 *p; __le32 key; struct buffer_head *bh; } Indirect; static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) { p->key = *(p->p = v); p->bh = bh; } /** * ext4_block_to_path - parse the block number into array of offsets * @inode: inode in
question (we are only interested in its superblock) * @i_block: block number to be parsed * @offsets: array to store the offsets in * @boundary: set this non-zero if the referred-to block is likely to be * followed (on disk) by an indirect block. * * To store the locations of file's data ext4 uses a data structure common * for UNIX filesystems - tree of pointers anchored in the inode, with * data blocks at leaves and indirect blocks in intermediate nodes. * This function translates the block number into path in that tree - * return value is the path length and @offsets[n] is the offset of * pointer to (n+1)th node in the nth one. If @block is out of range * (negative or too large) warning is printed and zero returned. * * Note: function doesn't find node addresses, so no IO is needed. All * we need to know is the capacity of indirect blocks (taken from the * inode->i_sb). */ /* * Portability note: the last comparison (check that we fit into triple * indirect block) is spelled differently, because otherwise on an * architecture with 32-bit longs and 8Kb pages we might get into trouble * if our filesystem had 8Kb blocks. We might use long long, but that would * kill us on x86. Oh, well, at least the sign propagation does not matter - * i_block would have to be negative in the very beginning, so we would not * get there at all. */ static int ext4_block_to_path(struct inode *inode, ext4_lblk_t i_block, ext4_lblk_t offsets[4], int *boundary) { int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); const long direct_blocks = EXT4_NDIR_BLOCKS, indirect_blocks = ptrs, double_blocks = (1 << (ptrs_bits * 2)); int n = 0; int final = 0; if (i_block < direct_blocks) { offsets[n++] = i_block; final = direct_blocks; } else if ((i_block -= direct_blocks) < indirect_blocks) { offsets[n++] = EXT4_IND_BLOCK; offsets[n++] = i_block; final = ptrs; } else if ((i_block -= indirect_blocks) < double_blocks) { offsets[n++] = EXT4_DIND_BLOCK; offsets[n++] = i_block >> ptrs_bits; offsets[n++] = i_block & (ptrs - 1); final = ptrs; } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { offsets[n++] = EXT4_TIND_BLOCK; offsets[n++] = i_block >> (ptrs_bits * 2); offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); offsets[n++] = i_block & (ptrs - 1); final = ptrs; } else { ext4_warning(inode->i_sb, "block %lu > max in inode %llu", i_block + direct_blocks + indirect_blocks + double_blocks, inode->i_ino); } if (boundary) *boundary = final - 1 - (i_block & (ptrs - 1)); return n; } /** * ext4_get_branch - read the chain of indirect blocks leading to data * @inode: inode in question * @depth: depth of the chain (1 - direct pointer, etc.) * @offsets: offsets of pointers in inode/indirect blocks * @chain: place to store the result * @err: here we store the error value * * Function fills the array of triples <key, p, bh> and returns %NULL * if everything went OK or the pointer to the last filled triple * (incomplete one) otherwise. Upon the return chain[i].key contains * the number of (i+1)-th block in the chain (as it is stored in memory, * i.e. little-endian 32-bit), chain[i].p contains the address of that * number (it points into struct inode for i==0 and into the bh->b_data * for i>0) and chain[i].bh points to the buffer_head of i-th indirect * block for i>0 and NULL for i==0. In other words, it holds the block * numbers of the chain, addresses they were taken from (and where we can * verify that chain did not change) and buffer_heads hosting these * numbers. 
* * Function stops when it stumbles upon zero pointer (absent block) * (pointer to last triple returned, *@err == 0) * or when it gets an IO error reading an indirect block * (ditto, *@err == -EIO) * or when it reads all @depth-1 indirect blocks successfully and finds * the whole chain, all way to the data (returns %NULL, *err == 0). * * Need to be called with * down_read(&EXT4_I(inode)->i_data_sem) */ static Indirect *ext4_get_branch(struct inode *inode, int depth, ext4_lblk_t *offsets, Indirect chain[4], int *err) { struct super_block *sb = inode->i_sb; Indirect *p = chain; struct buffer_head *bh; unsigned int key; int ret = -EIO; *err = 0; /* i_data is not going away, no lock needed */ add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); if (!p->key) goto no_block; while (--depth) { key = le32_to_cpu(p->key); if (key > ext4_blocks_count(EXT4_SB(sb)->s_es)) { /* the block was out of range */ ret = -EFSCORRUPTED; goto failure; } bh = sb_getblk(sb, key); if (unlikely(!bh)) { ret = -ENOMEM; goto failure; } if (!bh_uptodate_or_lock(bh)) { if (ext4_read_bh(bh, 0, NULL, false) < 0) { put_bh(bh); goto failure; } /* validate block references */ if (ext4_check_indirect_blockref(inode, bh)) { put_bh(bh); goto failure; } } add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); /* Reader: end */ if (!p->key) goto no_block; } return NULL; failure: *err = ret; no_block: return p; } /** * ext4_find_near - find a place for allocation with sufficient locality * @inode: owner * @ind: descriptor of indirect block. * * This function returns the preferred place for block allocation. * It is used when heuristic for sequential allocation fails. * Rules are: * + if there is a block to the left of our position - allocate near it. * + if pointer will live in indirect block - allocate near that block. * + if pointer will live in inode - allocate in the same * cylinder group. * * In the latter case we colour the starting block by the callers PID to * prevent it from clashing with concurrent allocations for a different inode * in the same block group. The PID is used here so that functionally related * files will be close-by on-disk. * * Caller must make sure that @ind is valid and will stay that way. */ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) { struct ext4_inode_info *ei = EXT4_I(inode); __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; __le32 *p; /* Try to find previous block */ for (p = ind->p - 1; p >= start; p--) { if (*p) return le32_to_cpu(*p); } /* No such thing, so let's try location of indirect block */ if (ind->bh) return ind->bh->b_blocknr; /* * It is going to be referred to from the inode itself? OK, just put it * into the same cylinder group then. */ return ext4_inode_to_goal_block(inode); } /** * ext4_find_goal - find a preferred place for allocation. * @inode: owner * @block: block we want * @partial: pointer to the last triple within a chain * * Normally this function find the preferred place for block allocation, * returns it. * Because this is only used for non-extent files, we limit the block nr * to 32 bits. */ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, Indirect *partial) { ext4_fsblk_t goal; /* * XXX need to get goal block from mballoc's data structures */ goal = ext4_find_near(inode, partial); goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; return goal; } /** * ext4_blks_to_allocate - Look up the block map and count the number * of direct blocks need to be allocated for the given branch. 
* * @branch: chain of indirect blocks * @k: number of blocks need for indirect blocks * @blks: number of data blocks to be mapped. * @blocks_to_boundary: the offset in the indirect block * * return the total number of blocks to be allocate, including the * direct and indirect blocks. */ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, int blocks_to_boundary) { unsigned int count = 0; /* * Simple case, [t,d]Indirect block(s) has not allocated yet * then it's clear blocks on that path have not allocated */ if (k > 0) { /* right now we don't handle cross boundary allocation */ if (blks < blocks_to_boundary + 1) count += blks; else count += blocks_to_boundary + 1; return count; } count++; while (count < blks && count <= blocks_to_boundary && le32_to_cpu(*(branch[0].p + count)) == 0) { count++; } return count; } /** * ext4_alloc_branch() - allocate and set up a chain of blocks * @handle: handle for this transaction * @ar: structure describing the allocation request * @indirect_blks: number of allocated indirect blocks * @offsets: offsets (in the blocks) to store the pointers to next. * @branch: place to store the chain in. * * This function allocates blocks, zeroes out all but the last one, * links them into chain and (if we are synchronous) writes them to disk. * In other words, it prepares a branch that can be spliced onto the * inode. It stores the information about that chain in the branch[], in * the same format as ext4_get_branch() would do. We are calling it after * we had read the existing part of chain and partial points to the last * triple of that (one with zero ->key). Upon the exit we have the same * picture as after the successful ext4_get_block(), except that in one * place chain is disconnected - *branch->p is still zero (we did not * set the last link), but branch->key contains the number that should * be placed into *branch->p to fill that gap. * * If allocation fails we free all blocks we've allocated (and forget * their buffer_heads) and return the error value the from failed * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain * as described above and return 0. */ static int ext4_alloc_branch(handle_t *handle, struct ext4_allocation_request *ar, int indirect_blks, ext4_lblk_t *offsets, Indirect *branch) { struct buffer_head * bh; ext4_fsblk_t b, new_blocks[4]; __le32 *p; int i, j, err, len = 1; for (i = 0; i <= indirect_blks; i++) { if (i == indirect_blks) { new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err); } else { ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle, ar->inode, ar->goal, ar->flags & EXT4_MB_DELALLOC_RESERVED, NULL, &err); /* Simplify error cleanup... 
*/ branch[i+1].bh = NULL; } if (err) { i--; goto failed; } branch[i].key = cpu_to_le32(new_blocks[i]); if (i == 0) continue; bh = branch[i].bh = sb_getblk(ar->inode->i_sb, new_blocks[i-1]); if (unlikely(!bh)) { err = -ENOMEM; goto failed; } lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); err = ext4_journal_get_create_access(handle, ar->inode->i_sb, bh, EXT4_JTR_NONE); if (err) { unlock_buffer(bh); goto failed; } memset(bh->b_data, 0, bh->b_size); p = branch[i].p = (__le32 *) bh->b_data + offsets[i]; b = new_blocks[i]; if (i == indirect_blks) len = ar->len; for (j = 0; j < len; j++) *p++ = cpu_to_le32(b++); BUFFER_TRACE(bh, "marking uptodate"); set_buffer_uptodate(bh); unlock_buffer(bh); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, ar->inode, bh); if (err) goto failed; } return 0; failed: if (i == indirect_blks) { /* Free data blocks */ ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i], ar->len, 0); i--; } for (; i >= 0; i--) { /* * We want to ext4_forget() only freshly allocated indirect * blocks. Buffer for new_blocks[i] is at branch[i+1].bh * (buffer at branch[0].bh is indirect block / inode already * existing before ext4_alloc_branch() was called). Also * because blocks are freshly allocated, we don't need to * revoke them which is why we don't set * EXT4_FREE_BLOCKS_METADATA. */ ext4_free_blocks(handle, ar->inode, branch[i+1].bh, new_blocks[i], 1, branch[i+1].bh ? EXT4_FREE_BLOCKS_FORGET : 0); } return err; } /** * ext4_splice_branch() - splice the allocated branch onto inode. * @handle: handle for this transaction * @ar: structure describing the allocation request * @where: location of missing link * @num: number of indirect blocks we are adding * * This function fills the missing link and does all housekeeping needed in * inode (->i_blocks, etc.). In case of success we end up with the full * chain to new block and return 0. */ static int ext4_splice_branch(handle_t *handle, struct ext4_allocation_request *ar, Indirect *where, int num) { int i; int err = 0; ext4_fsblk_t current_block; /* * If we're splicing into a [td]indirect block (as opposed to the * inode) then we need to get write access to the [td]indirect block * before the splice. */ if (where->bh) { BUFFER_TRACE(where->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, ar->inode->i_sb, where->bh, EXT4_JTR_NONE); if (err) goto err_out; } /* That's it */ *where->p = where->key; /* * Update the host buffer_head or inode to point to more just allocated * direct blocks blocks */ if (num == 0 && ar->len > 1) { current_block = le32_to_cpu(where->key) + 1; for (i = 1; i < ar->len; i++) *(where->p + i) = cpu_to_le32(current_block++); } /* We are done with atomic stuff, now do the rest of housekeeping */ /* had we spliced it onto indirect block? */ if (where->bh) { /* * If we spliced it onto an indirect block, we haven't * altered the inode. Note however that if it is being spliced * onto an indirect block at the very end of the file (the * file is growing) then we *will* alter the inode to reflect * the new i_size. But that is not done here - it is done in * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. */ ext4_debug("splicing indirect only\n"); BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh); if (err) goto err_out; } else { /* * OK, we spliced it into the inode itself on a direct block. 
*/ err = ext4_mark_inode_dirty(handle, ar->inode); if (unlikely(err)) goto err_out; ext4_debug("splicing direct\n"); } return err; err_out: for (i = 1; i <= num; i++) { /* * branch[i].bh is newly allocated, so there is no * need to revoke the block, which is why we don't * need to set EXT4_FREE_BLOCKS_METADATA. */ ext4_free_blocks(handle, ar->inode, where[i].bh, 0, 1, EXT4_FREE_BLOCKS_FORGET); } ext4_free_blocks(handle, ar->inode, NULL, le32_to_cpu(where[num].key), ar->len, 0); return err; } /* * The ext4_ind_map_blocks() function handles non-extents inodes * (i.e., using the traditional indirect/double-indirect i_blocks * scheme) for ext4_map_blocks(). * * Allocation strategy is simple: if we have to allocate something, we will * have to go the whole way to leaf. So let's do it before attaching anything * to tree, set linkage between the newborn blocks, write them if sync is * required, recheck the path, free and repeat if check fails, otherwise * set the last missing link (that will protect us from any truncate-generated * removals - all blocks on the path are immune now) and possibly force the * write on the parent block. * That has a nice additional property: no special recovery from the failed * allocations is needed - we simply release blocks and do not touch anything * reachable from inode. * * `handle' can be NULL if create == 0. * * return > 0, # of blocks mapped or allocated. * return = 0, if plain lookup failed. * return < 0, error case. * * The ext4_ind_get_blocks() function should be called with * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system * blocks. */ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct ext4_allocation_request ar; int err = -EIO; ext4_lblk_t offsets[4]; Indirect chain[4]; Indirect *partial; int indirect_blks; int blocks_to_boundary = 0; int depth; u64 count = 0; ext4_fsblk_t first_block = 0; trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); depth = ext4_block_to_path(inode, map->m_lblk, offsets, &blocks_to_boundary); if (depth == 0) goto out; partial = ext4_get_branch(inode, depth, offsets, chain, &err); /* Simplest case - block found, no allocation needed */ if (!partial) { first_block = le32_to_cpu(chain[depth - 1].key); count++; /*map more blocks*/ while (count < map->m_len && count <= blocks_to_boundary) { ext4_fsblk_t blk; blk = le32_to_cpu(*(chain[depth-1].p + count)); if (blk == first_block + count) count++; else break; } goto got_it; } /* Next simple case - plain lookup failed */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { unsigned epb = inode->i_sb->s_blocksize / sizeof(u32); int i; /* * Count number blocks in a subtree under 'partial'. At each * level we count number of complete empty subtrees beyond * current offset and then descend into the subtree only * partially beyond current offset. */ count = 0; for (i = partial - chain + 1; i < depth; i++) count = count * epb + (epb - offsets[i] - 1); count++; /* Fill in size of a hole we found */ map->m_pblk = 0; map->m_len = umin(map->m_len, count); goto cleanup; } /* Failed read of indirect block */ if (err == -EIO) goto cleanup; /* * Okay, we need to do block allocation. 
*/ if (ext4_has_feature_bigalloc(inode->i_sb)) { EXT4_ERROR_INODE(inode, "Can't allocate blocks for " "non-extent mapped inodes with bigalloc"); err = -EFSCORRUPTED; goto out; } /* Set up for the direct block allocation */ memset(&ar, 0, sizeof(ar)); ar.inode = inode; ar.logical = map->m_lblk; if (S_ISREG(inode->i_mode)) ar.flags = EXT4_MB_HINT_DATA; if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ar.flags |= EXT4_MB_DELALLOC_RESERVED; if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) ar.flags |= EXT4_MB_USE_RESERVED; ar.goal = ext4_find_goal(inode, map->m_lblk, partial); /* the number of blocks need to allocate for [d,t]indirect blocks */ indirect_blks = (chain + depth) - partial - 1; /* * Next look up the indirect map to count the totoal number of * direct blocks to allocate for this branch. */ ar.len = ext4_blks_to_allocate(partial, indirect_blks, map->m_len, blocks_to_boundary); /* * Block out ext4_truncate while we alter the tree */ err = ext4_alloc_branch(handle, &ar, indirect_blks, offsets + (partial - chain), partial); /* * The ext4_splice_branch call will free and forget any buffers * on the new chain if there is a failure, but that risks using * up transaction credits, especially for bitmaps where the * credits cannot be returned. Can we handle this somehow? We * may need to return -EAGAIN upwards in the worst case. --sct */ if (!err) err = ext4_splice_branch(handle, &ar, partial, indirect_blks); if (err) goto cleanup; map->m_flags |= EXT4_MAP_NEW; ext4_update_inode_fsync_trans(handle, inode, 1); count = ar.len; got_it: map->m_flags |= EXT4_MAP_MAPPED; map->m_pblk = le32_to_cpu(chain[depth-1].key); map->m_len = count; if (count > blocks_to_boundary) map->m_flags |= EXT4_MAP_BOUNDARY; err = count; /* Clean up and exit */ partial = chain + depth - 1; /* the whole chain */ cleanup: while (partial > chain) { BUFFER_TRACE(partial->bh, "call brelse"); brelse(partial->bh); partial--; } out: trace_ext4_ind_map_blocks_exit(inode, flags, map, err); return err; } /* * Calculate number of indirect blocks touched by mapping @nrblocks logically * contiguous blocks */ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks) { /* * With N contiguous data blocks, we need at most * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, * 2 dindirect blocks, and 1 tindirect block */ return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; } static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode, struct buffer_head *bh, int *dropped) { int err; if (bh) { BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, inode, bh); if (unlikely(err)) return err; } err = ext4_mark_inode_dirty(handle, inode); if (unlikely(err)) return err; /* * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this * moment, get_block can be called only for blocks inside i_size since * page cache has been already dropped and writes are blocked by * i_rwsem. So we can safely drop the i_data_sem here. */ BUG_ON(EXT4_JOURNAL(inode) == NULL); ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); *dropped = 1; return 0; } /* * Truncate transactions can be complex and absolutely huge. So we need to * be able to restart the transaction at a convenient checkpoint to make * sure we don't overflow the journal. * * Try to extend this transaction for the purposes of truncation. If * extend fails, we restart transaction. 
*/ static int ext4_ind_truncate_ensure_credits(handle_t *handle, struct inode *inode, struct buffer_head *bh, int revoke_creds) { int ret; int dropped = 0; ret = ext4_journal_ensure_credits_fn(handle, EXT4_RESERVE_TRANS_BLOCKS, ext4_blocks_for_truncate(inode), revoke_creds, ext4_ind_trunc_restart_fn(handle, inode, bh, &dropped)); if (dropped) down_write(&EXT4_I(inode)->i_data_sem); if (ret <= 0) return ret; if (bh) { BUFFER_TRACE(bh, "retaking write access"); ret = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (unlikely(ret)) return ret; } return 0; } /* * Probably it should be a library function... search for first non-zero word * or memcmp with zero_page, whatever is better for particular architecture. * Linus? */ static inline int all_zeroes(__le32 *p, __le32 *q) { while (p < q) if (*p++) return 0; return 1; } /** * ext4_find_shared - find the indirect blocks for partial truncation. * @inode: inode in question * @depth: depth of the affected branch * @offsets: offsets of pointers in that branch (see ext4_block_to_path) * @chain: place to store the pointers to partial indirect blocks * @top: place to the (detached) top of branch * * This is a helper function used by ext4_truncate(). * * When we do truncate() we may have to clean the ends of several * indirect blocks but leave the blocks themselves alive. Block is * partially truncated if some data below the new i_size is referred * from it (and it is on the path to the first completely truncated * data block, indeed). We have to free the top of that path along * with everything to the right of the path. Since no allocation * past the truncation point is possible until ext4_truncate() * finishes, we may safely do the latter, but top of branch may * require special attention - pageout below the truncation point * might try to populate it. * * We atomically detach the top of branch from the tree, store the * block number of its root in *@top, pointers to buffer_heads of * partially truncated blocks - in @chain[].bh and pointers to * their last elements that should not be removed - in * @chain[].p. Return value is the pointer to last filled element * of @chain. * * The work left to caller to do the actual freeing of subtrees: * a) free the subtree starting from *@top * b) free the subtrees whose roots are stored in * (@chain[i].p+1 .. end of @chain[i].bh->b_data) * c) free the subtrees growing from the inode past the @chain[0]. * (no partially truncated stuff there). */ static Indirect *ext4_find_shared(struct inode *inode, int depth, ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top) { Indirect *partial, *p; int k, err; *top = 0; /* Make k index the deepest non-null offset + 1 */ for (k = depth; k > 1 && !offsets[k-1]; k--) ; partial = ext4_get_branch(inode, k, offsets, chain, &err); /* Writer: pointers */ if (!partial) partial = chain + k-1; /* * If the branch acquired continuation since we've looked at it - * fine, it should all survive and (new) top doesn't belong to us. */ if (!partial->key && *partial->p) /* Writer: end */ goto no_top; for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) ; /* * OK, we've found the last block that must survive. The rest of our * branch should be detached before unlocking. However, if that rest * of branch is all ours and does not grow immediately from the inode * it's easier to cheat and just decrement partial->p. */ if (p == chain + k - 1 && p > chain) { p->p--; } else { *top = *p->p; /* Nope, don't do this in ext4. 
Must leave the tree intact */ #if 0 *p->p = 0; #endif } /* Writer: end */ while (partial > p) { brelse(partial->bh); partial--; } no_top: return partial; } /* * Zero a number of block pointers in either an inode or an indirect block. * If we restart the transaction we must again get write access to the * indirect block for further modification. * * We release `count' blocks on disk, but (last - first) may be greater * than `count' because there can be holes in there. * * Return 0 on success, 1 on invalid block range * and < 0 on fatal error. */ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block_to_free, unsigned long count, __le32 *first, __le32 *last) { __le32 *p; int flags = EXT4_FREE_BLOCKS_VALIDATED; int err; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE)) flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA; else if (ext4_should_journal_data(inode)) flags |= EXT4_FREE_BLOCKS_FORGET; if (!ext4_inode_block_valid(inode, block_to_free, count)) { EXT4_ERROR_INODE(inode, "attempt to clear invalid " "blocks %llu len %lu", (unsigned long long) block_to_free, count); return 1; } err = ext4_ind_truncate_ensure_credits(handle, inode, bh, ext4_free_data_revoke_credits(inode, count)); if (err < 0) goto out_err; for (p = first; p < last; p++) *p = 0; ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); return 0; out_err: ext4_std_error(inode->i_sb, err); return err; } /** * ext4_free_data - free a list of data blocks * @handle: handle for this transaction * @inode: inode we are dealing with * @this_bh: indirect buffer_head which contains *@first and *@last * @first: array of block numbers * @last: points immediately past the end of array * * We are freeing all blocks referred from that array (numbers are stored as * little-endian 32-bit) and updating @inode->i_blocks appropriately. * * We accumulate contiguous runs of blocks to free. Conveniently, if these * blocks are contiguous then releasing them at one time will only affect one * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't * actually use a lot of journal space. * * @this_bh will be %NULL if @first and @last point into the inode's direct * block pointers. */ static void ext4_free_data(handle_t *handle, struct inode *inode, struct buffer_head *this_bh, __le32 *first, __le32 *last) { ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ unsigned long count = 0; /* Number of blocks in the run */ __le32 *block_to_free_p = NULL; /* Pointer into inode/ind corresponding to block_to_free */ ext4_fsblk_t nr; /* Current block # */ __le32 *p; /* Pointer into inode/ind for current block */ int err = 0; if (this_bh) { /* For indirect block */ BUFFER_TRACE(this_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, this_bh, EXT4_JTR_NONE); /* Important: if we can't update the indirect pointers * to the blocks, we can't free them. 
*/ if (err) return; } for (p = first; p < last; p++) { nr = le32_to_cpu(*p); if (nr) { /* accumulate blocks to free if they're contiguous */ if (count == 0) { block_to_free = nr; block_to_free_p = p; count = 1; } else if (nr == block_to_free + count) { count++; } else { err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, count, block_to_free_p, p); if (err) break; block_to_free = nr; block_to_free_p = p; count = 1; } } } if (!err && count > 0) err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, count, block_to_free_p, p); if (err < 0) /* fatal error */ return; if (this_bh) { BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); /* * The buffer head should have an attached journal head at this * point. However, if the data is corrupted and an indirect * block pointed to itself, it would have been detached when * the block was cleared. Check for this instead of OOPSing. */ if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) ext4_handle_dirty_metadata(handle, inode, this_bh); else EXT4_ERROR_INODE(inode, "circular indirect block detected at " "block %llu", (unsigned long long) this_bh->b_blocknr); } } /** * ext4_free_branches - free an array of branches * @handle: JBD handle for this transaction * @inode: inode we are dealing with * @parent_bh: the buffer_head which contains *@first and *@last * @first: array of block numbers * @last: pointer immediately past the end of array * @depth: depth of the branches to free * * We are freeing all blocks referred from these branches (numbers are * stored as little-endian 32-bit) and updating @inode->i_blocks * appropriately. */ static void ext4_free_branches(handle_t *handle, struct inode *inode, struct buffer_head *parent_bh, __le32 *first, __le32 *last, int depth) { ext4_fsblk_t nr; __le32 *p; if (ext4_handle_is_aborted(handle)) return; if (depth--) { struct buffer_head *bh; int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); p = last; while (--p >= first) { nr = le32_to_cpu(*p); if (!nr) continue; /* A hole */ if (!ext4_inode_block_valid(inode, nr, 1)) { EXT4_ERROR_INODE(inode, "invalid indirect mapped " "block %lu (level %d)", (unsigned long) nr, depth); break; } /* Go read the buffer for the next level down */ bh = ext4_sb_bread_nofail(inode->i_sb, nr); /* * A read failure? Report error and clear slot * (should be rare). */ if (IS_ERR(bh)) { ext4_error_inode_block(inode, nr, -PTR_ERR(bh), "Read failure"); continue; } /* This zaps the entire block. Bottom up. */ BUFFER_TRACE(bh, "free child branches"); ext4_free_branches(handle, inode, bh, (__le32 *) bh->b_data, (__le32 *) bh->b_data + addr_per_block, depth); brelse(bh); /* * Everything below this pointer has been * released. Now let this top-of-subtree go. * * We want the freeing of this indirect block to be * atomic in the journal with the updating of the * bitmap block which owns it. So make some room in * the journal. * * We zero the parent pointer *after* freeing its * pointee in the bitmaps, so if extend_transaction() * for some reason fails to put the bitmap changes and * the release into the same transaction, recovery * will merely complain about releasing a free block, * rather than leaking blocks. 
*/ if (ext4_handle_is_aborted(handle)) return; if (ext4_ind_truncate_ensure_credits(handle, inode, NULL, ext4_free_metadata_revoke_credits( inode->i_sb, 1)) < 0) return; /* * The forget flag here is critical because if * we are journaling (and not doing data * journaling), we have to make sure a revoke * record is written to prevent the journal * replay from overwriting the (former) * indirect block if it gets reallocated as a * data block. This must happen in the same * transaction where the data blocks are * actually freed. */ ext4_free_blocks(handle, inode, NULL, nr, 1, EXT4_FREE_BLOCKS_METADATA| EXT4_FREE_BLOCKS_FORGET); if (parent_bh) { /* * The block which we have just freed is * pointed to by an indirect block: journal it */ BUFFER_TRACE(parent_bh, "get_write_access"); if (!ext4_journal_get_write_access(handle, inode->i_sb, parent_bh, EXT4_JTR_NONE)) { *p = 0; BUFFER_TRACE(parent_bh, "call ext4_handle_dirty_metadata"); ext4_handle_dirty_metadata(handle, inode, parent_bh); } } } } else { /* We have reached the bottom of the tree. */ BUFFER_TRACE(parent_bh, "free data blocks"); ext4_free_data(handle, inode, parent_bh, first, last); } } void ext4_ind_truncate(handle_t *handle, struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); __le32 *i_data = ei->i_data; int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); ext4_lblk_t offsets[4]; Indirect chain[4]; Indirect *partial; __le32 nr = 0; int n = 0; ext4_lblk_t last_block, max_block; unsigned blocksize = inode->i_sb->s_blocksize; last_block = (inode->i_size + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); if (last_block != max_block) { n = ext4_block_to_path(inode, last_block, offsets, NULL); if (n == 0) return; } ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); /* * The orphan list entry will now protect us from any crash which * occurs before the truncate completes, so it is now safe to propagate * the new, shorter inode size (held for now in i_size) into the * on-disk inode. We do this via i_disksize, which is the value which * ext4 *really* writes onto the disk inode. */ ei->i_disksize = inode->i_size; if (last_block == max_block) { /* * It is unnecessary to free any data blocks if last_block is * equal to the indirect block limit. */ return; } else if (n == 1) { /* direct blocks */ ext4_free_data(handle, inode, NULL, i_data+offsets[0], i_data + EXT4_NDIR_BLOCKS); goto do_indirects; } partial = ext4_find_shared(inode, n, offsets, chain, &nr); /* Kill the top of shared branch (not detached) */ if (nr) { if (partial == chain) { /* Shared branch grows from the inode */ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, (chain+n-1) - partial); *partial->p = 0; /* * We mark the inode dirty prior to restart, * and prior to stop. No need for it here. 
*/ } else { /* Shared branch grows from an indirect block */ BUFFER_TRACE(partial->bh, "get_write_access"); ext4_free_branches(handle, inode, partial->bh, partial->p, partial->p+1, (chain+n-1) - partial); } } /* Clear the ends of indirect blocks on the shared branch */ while (partial > chain) { ext4_free_branches(handle, inode, partial->bh, partial->p + 1, (__le32*)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); BUFFER_TRACE(partial->bh, "call brelse"); brelse(partial->bh); partial--; } do_indirects: /* Kill the remaining (whole) subtrees */ switch (offsets[0]) { default: nr = i_data[EXT4_IND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); i_data[EXT4_IND_BLOCK] = 0; } fallthrough; case EXT4_IND_BLOCK: nr = i_data[EXT4_DIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); i_data[EXT4_DIND_BLOCK] = 0; } fallthrough; case EXT4_DIND_BLOCK: nr = i_data[EXT4_TIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); i_data[EXT4_TIND_BLOCK] = 0; } fallthrough; case EXT4_TIND_BLOCK: ; } } /** * ext4_ind_remove_space - remove space from the range * @handle: JBD handle for this transaction * @inode: inode we are dealing with * @start: First block to remove * @end: One block after the last block to remove (exclusive) * * Free the blocks in the defined range (end is exclusive endpoint of * range). This is used by ext4_punch_hole(). */ int ext4_ind_remove_space(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) { struct ext4_inode_info *ei = EXT4_I(inode); __le32 *i_data = ei->i_data; int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); ext4_lblk_t offsets[4], offsets2[4]; Indirect chain[4], chain2[4]; Indirect *partial, *partial2; Indirect *p = NULL, *p2 = NULL; ext4_lblk_t max_block; __le32 nr = 0, nr2 = 0; int n = 0, n2 = 0; unsigned blocksize = inode->i_sb->s_blocksize; max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); if (end >= max_block) end = max_block; if ((start >= end) || (start > max_block)) return 0; n = ext4_block_to_path(inode, start, offsets, NULL); n2 = ext4_block_to_path(inode, end, offsets2, NULL); BUG_ON(n > n2); if ((n == 1) && (n == n2)) { /* We're punching only within direct block range */ ext4_free_data(handle, inode, NULL, i_data + offsets[0], i_data + offsets2[0]); return 0; } else if (n2 > n) { /* * Start and end are on a different levels so we're going to * free partial block at start, and partial block at end of * the range. If there are some levels in between then * do_indirects label will take care of that. */ if (n == 1) { /* * Start is at the direct block level, free * everything to the end of the level. 
*/ ext4_free_data(handle, inode, NULL, i_data + offsets[0], i_data + EXT4_NDIR_BLOCKS); goto end_range; } partial = p = ext4_find_shared(inode, n, offsets, chain, &nr); if (nr) { if (partial == chain) { /* Shared branch grows from the inode */ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, (chain+n-1) - partial); *partial->p = 0; } else { /* Shared branch grows from an indirect block */ BUFFER_TRACE(partial->bh, "get_write_access"); ext4_free_branches(handle, inode, partial->bh, partial->p, partial->p+1, (chain+n-1) - partial); } } /* * Clear the ends of indirect blocks on the shared branch * at the start of the range */ while (partial > chain) { ext4_free_branches(handle, inode, partial->bh, partial->p + 1, (__le32 *)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); partial--; } end_range: partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); if (nr2) { if (partial2 == chain2) { /* * Remember, end is exclusive so here we're at * the start of the next level we're not going * to free. Everything was covered by the start * of the range. */ goto do_indirects; } } else { /* * ext4_find_shared returns Indirect structure which * points to the last element which should not be * removed by truncate. But this is end of the range * in punch_hole so we need to point to the next element */ partial2->p++; } /* * Clear the ends of indirect blocks on the shared branch * at the end of the range */ while (partial2 > chain2) { ext4_free_branches(handle, inode, partial2->bh, (__le32 *)partial2->bh->b_data, partial2->p, (chain2+n2-1) - partial2); partial2--; } goto do_indirects; } /* Punch happened within the same level (n == n2) */ partial = p = ext4_find_shared(inode, n, offsets, chain, &nr); partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); /* Free top, but only if partial2 isn't its subtree. */ if (nr) { int level = min(partial - chain, partial2 - chain2); int i; int subtree = 1; for (i = 0; i <= level; i++) { if (offsets[i] != offsets2[i]) { subtree = 0; break; } } if (!subtree) { if (partial == chain) { /* Shared branch grows from the inode */ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, (chain+n-1) - partial); *partial->p = 0; } else { /* Shared branch grows from an indirect block */ BUFFER_TRACE(partial->bh, "get_write_access"); ext4_free_branches(handle, inode, partial->bh, partial->p, partial->p+1, (chain+n-1) - partial); } } } if (!nr2) { /* * ext4_find_shared returns Indirect structure which * points to the last element which should not be * removed by truncate. But this is end of the range * in punch_hole so we need to point to the next element */ partial2->p++; } while (partial > chain || partial2 > chain2) { int depth = (chain+n-1) - partial; int depth2 = (chain2+n2-1) - partial2; if (partial > chain && partial2 > chain2 && partial->bh->b_blocknr == partial2->bh->b_blocknr) { /* * We've converged on the same block. Clear the range, * then we're done. */ ext4_free_branches(handle, inode, partial->bh, partial->p + 1, partial2->p, (chain+n-1) - partial); goto cleanup; } /* * The start and end partial branches may not be at the same * level even though the punch happened within one level. So, we * give them a chance to arrive at the same level, then walk * them in step with each other until we converge on the same * block. 
*/ if (partial > chain && depth <= depth2) { ext4_free_branches(handle, inode, partial->bh, partial->p + 1, (__le32 *)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); partial--; } if (partial2 > chain2 && depth2 <= depth) { ext4_free_branches(handle, inode, partial2->bh, (__le32 *)partial2->bh->b_data, partial2->p, (chain2+n2-1) - partial2); partial2--; } } cleanup: while (p && p > chain) { BUFFER_TRACE(p->bh, "call brelse"); brelse(p->bh); p--; } while (p2 && p2 > chain2) { BUFFER_TRACE(p2->bh, "call brelse"); brelse(p2->bh); p2--; } return 0; do_indirects: /* Kill the remaining (whole) subtrees */ switch (offsets[0]) { default: if (++n >= n2) break; nr = i_data[EXT4_IND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); i_data[EXT4_IND_BLOCK] = 0; } fallthrough; case EXT4_IND_BLOCK: if (++n >= n2) break; nr = i_data[EXT4_DIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); i_data[EXT4_DIND_BLOCK] = 0; } fallthrough; case EXT4_DIND_BLOCK: if (++n >= n2) break; nr = i_data[EXT4_TIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); i_data[EXT4_TIND_BLOCK] = 0; } fallthrough; case EXT4_TIND_BLOCK: ; } goto cleanup; } |
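/*
 * Editor's illustration, not part of fs/ext4/indirect.c above: a minimal,
 * self-contained userspace sketch of the offset arithmetic performed by
 * ext4_block_to_path(), assuming 4 KiB blocks (1024 block pointers per
 * indirect block) and the usual 12 direct slots.  The slot indices 12, 13
 * and 14 stand in for EXT4_IND_BLOCK, EXT4_DIND_BLOCK and EXT4_TIND_BLOCK;
 * the real function additionally validates the upper bound and reports the
 * distance to the next indirect-block boundary.
 */
#include <stdio.h>

#define NDIR      12		/* direct pointers held in the inode   */
#define PTRS      1024		/* pointers per 4 KiB indirect block   */
#define PTRS_BITS 10		/* log2(PTRS)                          */

/* Decompose a logical block number into per-level offsets; returns depth. */
static int block_to_path(unsigned long i_block, unsigned long offsets[4])
{
	const unsigned long double_blocks = 1UL << (PTRS_BITS * 2);
	int n = 0;

	if (i_block < NDIR) {
		offsets[n++] = i_block;
	} else if ((i_block -= NDIR) < PTRS) {
		offsets[n++] = 12;			/* EXT4_IND_BLOCK slot  */
		offsets[n++] = i_block;
	} else if ((i_block -= PTRS) < double_blocks) {
		offsets[n++] = 13;			/* EXT4_DIND_BLOCK slot */
		offsets[n++] = i_block >> PTRS_BITS;
		offsets[n++] = i_block & (PTRS - 1);
	} else {
		i_block -= double_blocks;
		offsets[n++] = 14;			/* EXT4_TIND_BLOCK slot */
		offsets[n++] = i_block >> (PTRS_BITS * 2);
		offsets[n++] = (i_block >> PTRS_BITS) & (PTRS - 1);
		offsets[n++] = i_block & (PTRS - 1);
	}
	return n;
}

int main(void)
{
	unsigned long blocks[] = { 5, 12, 1036, 1049620 };

	for (unsigned int i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++) {
		unsigned long off[4];
		int depth = block_to_path(blocks[i], off);

		printf("block %8lu -> depth %d:", blocks[i], depth);
		for (int j = 0; j < depth; j++)
			printf(" %lu", off[j]);
		printf("\n");
	}
	return 0;
}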
// SPDX-License-Identifier: GPL-2.0+
/*
 * 2002-10-15  Posix Clocks & timers
 *             by George Anzinger george@mvista.com
 *             Copyright (C) 2002 2003 by MontaVista Software.
 *
 * 2004-06-01  Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug.
* Copyright (C) 2004 Boris Hu * * These are all the functions necessary to implement POSIX clocks & timers */ #include <linux/compat.h> #include <linux/compiler.h> #include <linux/init.h> #include <linux/jhash.h> #include <linux/interrupt.h> #include <linux/list.h> #include <linux/memblock.h> #include <linux/nospec.h> #include <linux/posix-clock.h> #include <linux/posix-timers.h> #include <linux/prctl.h> #include <linux/sched/task.h> #include <linux/slab.h> #include <linux/syscalls.h> #include <linux/time.h> #include <linux/time_namespace.h> #include <linux/uaccess.h> #include "timekeeping.h" #include "posix-timers.h" /* * Timers are managed in a hash table for lockless lookup. The hash key is * constructed from current::signal and the timer ID and the timer is * matched against current::signal and the timer ID when walking the hash * bucket list. * * This allows checkpoint/restore to reconstruct the exact timer IDs for * a process. */ struct timer_hash_bucket { spinlock_t lock; struct hlist_head head; }; static struct { struct timer_hash_bucket *buckets; unsigned long mask; struct kmem_cache *cache; } __timer_data __ro_after_init __aligned(4*sizeof(long)); #define timer_buckets (__timer_data.buckets) #define timer_hashmask (__timer_data.mask) #define posix_timers_cache (__timer_data.cache) static const struct k_clock * const posix_clocks[]; static const struct k_clock *clockid_to_kclock(const clockid_t id); static const struct k_clock clock_realtime, clock_monotonic; #define TIMER_ANY_ID INT_MIN /* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */ #if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" #endif static struct k_itimer *lock_timer(timer_t timer_id); static inline void unlock_timer(struct k_itimer *timr) { if (likely((timr))) spin_unlock_irq(&timr->it_lock); } #define scoped_timer_get_or_fail(_id) \ scoped_cond_guard(lock_timer, return -EINVAL, _id) #define scoped_timer (scope) DEFINE_CLASS(lock_timer, struct k_itimer *, unlock_timer(_T), lock_timer(id), timer_t id); DEFINE_CLASS_IS_COND_GUARD(lock_timer); static struct timer_hash_bucket *hash_bucket(struct signal_struct *sig, unsigned int nr) { return &timer_buckets[jhash2((u32 *)&sig, sizeof(sig) / sizeof(u32), nr) & timer_hashmask]; } static struct k_itimer *posix_timer_by_id(timer_t id) { struct signal_struct *sig = current->signal; struct timer_hash_bucket *bucket = hash_bucket(sig, id); struct k_itimer *timer; hlist_for_each_entry_rcu(timer, &bucket->head, t_hash) { /* timer->it_signal can be set concurrently */ if ((READ_ONCE(timer->it_signal) == sig) && (timer->it_id == id)) return timer; } return NULL; } static inline struct signal_struct *posix_sig_owner(const struct k_itimer *timer) { unsigned long val = (unsigned long)timer->it_signal; /* * Mask out bit 0, which acts as invalid marker to prevent * posix_timer_by_id() detecting it as valid. 
*/ return (struct signal_struct *)(val & ~1UL); } static bool posix_timer_hashed(struct timer_hash_bucket *bucket, struct signal_struct *sig, timer_t id) { struct hlist_head *head = &bucket->head; struct k_itimer *timer; hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&bucket->lock)) { if ((posix_sig_owner(timer) == sig) && (timer->it_id == id)) return true; } return false; } static bool posix_timer_add_at(struct k_itimer *timer, struct signal_struct *sig, unsigned int id) { struct timer_hash_bucket *bucket = hash_bucket(sig, id); scoped_guard (spinlock, &bucket->lock) { /* * Validate under the lock as this could have raced against * another thread ending up with the same ID, which is * highly unlikely, but possible. */ if (!posix_timer_hashed(bucket, sig, id)) { /* * Set the timer ID and the signal pointer to make * it identifiable in the hash table. The signal * pointer has bit 0 set to indicate that it is not * yet fully initialized. posix_timer_hashed() * masks this bit out, but the syscall lookup fails * to match due to it being set. This guarantees * that there can't be duplicate timer IDs handed * out. */ timer->it_id = (timer_t)id; timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL); hlist_add_head_rcu(&timer->t_hash, &bucket->head); return true; } } return false; } static int posix_timer_add(struct k_itimer *timer, int req_id) { struct signal_struct *sig = current->signal; if (unlikely(req_id != TIMER_ANY_ID)) { if (!posix_timer_add_at(timer, sig, req_id)) return -EBUSY; /* * Move the ID counter past the requested ID, so that after * switching back to normal mode the IDs are outside of the * exact allocated region. That avoids ID collisions on the * next regular timer_create() invocations. */ atomic_set(&sig->next_posix_timer_id, req_id + 1); return req_id; } for (unsigned int cnt = 0; cnt <= INT_MAX; cnt++) { /* Get the next timer ID and clamp it to positive space */ unsigned int id = atomic_fetch_inc(&sig->next_posix_timer_id) & INT_MAX; if (posix_timer_add_at(timer, sig, id)) return id; cond_resched(); } /* POSIX return code when no timer ID could be allocated */ return -EAGAIN; } static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_real_ts64(tp); return 0; } static ktime_t posix_get_realtime_ktime(clockid_t which_clock) { return ktime_get_real(); } static int posix_clock_realtime_set(const clockid_t which_clock, const struct timespec64 *tp) { return do_sys_settimeofday64(tp, NULL); } static int posix_clock_realtime_adj(const clockid_t which_clock, struct __kernel_timex *t) { return do_adjtimex(t); } static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_ts64(tp); timens_add_monotonic(tp); return 0; } static ktime_t posix_get_monotonic_ktime(clockid_t which_clock) { return ktime_get(); } static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp) { ktime_get_raw_ts64(tp); timens_add_monotonic(tp); return 0; } static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp) { ktime_get_coarse_real_ts64(tp); return 0; } static int posix_get_monotonic_coarse(clockid_t which_clock, struct timespec64 *tp) { ktime_get_coarse_ts64(tp); timens_add_monotonic(tp); return 0; } static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *tp) { *tp = ktime_to_timespec64(KTIME_LOW_RES); return 0; } static int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp) { ktime_get_boottime_ts64(tp); 
timens_add_boottime(tp); return 0; } static ktime_t posix_get_boottime_ktime(const clockid_t which_clock) { return ktime_get_boottime(); } static int posix_get_tai_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_clocktai_ts64(tp); return 0; } static ktime_t posix_get_tai_ktime(clockid_t which_clock) { return ktime_get_clocktai(); } static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) { tp->tv_sec = 0; tp->tv_nsec = hrtimer_resolution; return 0; } /* * The siginfo si_overrun field and the return value of timer_getoverrun(2) * are of type int. Clamp the overrun value to INT_MAX */ static inline int timer_overrun_to_int(struct k_itimer *timr) { if (timr->it_overrun_last > (s64)INT_MAX) return INT_MAX; return (int)timr->it_overrun_last; } static void common_hrtimer_rearm(struct k_itimer *timr) { struct hrtimer *timer = &timr->it.real.timer; timr->it_overrun += hrtimer_forward_now(timer, timr->it_interval); hrtimer_restart(timer); } static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_itimer *timr) { guard(spinlock)(&timr->it_lock); /* * Check if the timer is still alive or whether it got modified * since the signal was queued. In either case, don't rearm and * drop the signal. */ if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!posixtimer_valid(timr))) return false; if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING)) return true; timr->kclock->timer_rearm(timr); timr->it_status = POSIX_TIMER_ARMED; timr->it_overrun_last = timr->it_overrun; timr->it_overrun = -1LL; ++timr->it_signal_seq; info->si_overrun = timer_overrun_to_int(timr); return true; } /* * This function is called from the signal delivery code. It decides * whether the signal should be dropped and rearms interval timers. The * timer can be unconditionally accessed as there is a reference held on * it. */ bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq) { struct k_itimer *timr = container_of(timer_sigq, struct k_itimer, sigq); bool ret; /* * Release siglock to ensure proper locking order versus * timr::it_lock. Keep interrupts disabled. */ spin_unlock(¤t->sighand->siglock); ret = __posixtimer_deliver_signal(info, timr); /* Drop the reference which was acquired when the signal was queued */ posixtimer_putref(timr); spin_lock(¤t->sighand->siglock); return ret; } void posix_timer_queue_signal(struct k_itimer *timr) { lockdep_assert_held(&timr->it_lock); if (!posixtimer_valid(timr)) return; timr->it_status = timr->it_interval ? POSIX_TIMER_REQUEUE_PENDING : POSIX_TIMER_DISARMED; posixtimer_send_sigqueue(timr); } /* * This function gets called when a POSIX.1b interval timer expires from * the HRTIMER interrupt (soft interrupt on RT kernels). * * Handles CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME and CLOCK_TAI * based timers. 
*/ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) { struct k_itimer *timr = container_of(timer, struct k_itimer, it.real.timer); guard(spinlock_irqsave)(&timr->it_lock); posix_timer_queue_signal(timr); return HRTIMER_NORESTART; } long posixtimer_create_prctl(unsigned long ctrl) { switch (ctrl) { case PR_TIMER_CREATE_RESTORE_IDS_OFF: current->signal->timer_create_restore_ids = 0; return 0; case PR_TIMER_CREATE_RESTORE_IDS_ON: current->signal->timer_create_restore_ids = 1; return 0; case PR_TIMER_CREATE_RESTORE_IDS_GET: return current->signal->timer_create_restore_ids; } return -EINVAL; } static struct pid *good_sigevent(sigevent_t * event) { struct pid *pid = task_tgid(current); struct task_struct *rtn; switch (event->sigev_notify) { case SIGEV_SIGNAL | SIGEV_THREAD_ID: pid = find_vpid(event->sigev_notify_thread_id); rtn = pid_task(pid, PIDTYPE_PID); if (!rtn || !same_thread_group(rtn, current)) return NULL; fallthrough; case SIGEV_SIGNAL: case SIGEV_THREAD: if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX) return NULL; fallthrough; case SIGEV_NONE: return pid; default: return NULL; } } static struct k_itimer *alloc_posix_timer(void) { struct k_itimer *tmr; if (unlikely(!posix_timers_cache)) return NULL; tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); if (!tmr) return tmr; if (unlikely(!posixtimer_init_sigqueue(&tmr->sigq))) { kmem_cache_free(posix_timers_cache, tmr); return NULL; } rcuref_init(&tmr->rcuref, 1); return tmr; } void posixtimer_free_timer(struct k_itimer *tmr) { put_pid(tmr->it_pid); if (tmr->sigq.ucounts) dec_rlimit_put_ucounts(tmr->sigq.ucounts, UCOUNT_RLIMIT_SIGPENDING); kfree_rcu(tmr, rcu); } static void posix_timer_unhash_and_free(struct k_itimer *tmr) { struct timer_hash_bucket *bucket = hash_bucket(posix_sig_owner(tmr), tmr->it_id); scoped_guard (spinlock, &bucket->lock) hlist_del_rcu(&tmr->t_hash); posixtimer_putref(tmr); } static int common_timer_create(struct k_itimer *new_timer) { hrtimer_setup(&new_timer->it.real.timer, posix_timer_fn, new_timer->it_clock, 0); return 0; } /* Create a POSIX.1b interval timer. */ static int do_timer_create(clockid_t which_clock, struct sigevent *event, timer_t __user *created_timer_id) { const struct k_clock *kc = clockid_to_kclock(which_clock); timer_t req_id = TIMER_ANY_ID; struct k_itimer *new_timer; int error, new_timer_id; if (!kc) return -EINVAL; if (!kc->timer_create) return -EOPNOTSUPP; /* Special case for CRIU to restore timers with a given timer ID. */ if (unlikely(current->signal->timer_create_restore_ids)) { if (copy_from_user(&req_id, created_timer_id, sizeof(req_id))) return -EFAULT; /* Valid IDs are 0..INT_MAX */ if ((unsigned int)req_id > INT_MAX) return -EINVAL; } new_timer = alloc_posix_timer(); if (unlikely(!new_timer)) return -EAGAIN; spin_lock_init(&new_timer->it_lock); /* * Add the timer to the hash table. The timer is not yet valid * after insertion, but has a unique ID allocated. 
*/ new_timer_id = posix_timer_add(new_timer, req_id); if (new_timer_id < 0) { posixtimer_free_timer(new_timer); return new_timer_id; } new_timer->it_clock = which_clock; new_timer->kclock = kc; new_timer->it_overrun = -1LL; if (event) { scoped_guard (rcu) new_timer->it_pid = get_pid(good_sigevent(event)); if (!new_timer->it_pid) { error = -EINVAL; goto out; } new_timer->it_sigev_notify = event->sigev_notify; new_timer->sigq.info.si_signo = event->sigev_signo; new_timer->sigq.info.si_value = event->sigev_value; } else { new_timer->it_sigev_notify = SIGEV_SIGNAL; new_timer->sigq.info.si_signo = SIGALRM; new_timer->sigq.info.si_value.sival_int = new_timer->it_id; new_timer->it_pid = get_pid(task_tgid(current)); } if (new_timer->it_sigev_notify & SIGEV_THREAD_ID) new_timer->it_pid_type = PIDTYPE_PID; else new_timer->it_pid_type = PIDTYPE_TGID; new_timer->sigq.info.si_tid = new_timer->it_id; new_timer->sigq.info.si_code = SI_TIMER; if (copy_to_user(created_timer_id, &new_timer_id, sizeof (new_timer_id))) { error = -EFAULT; goto out; } /* * After successful copy out, the timer ID is visible to user space * now but not yet valid because new_timer::signal low order bit is 1. * * Complete the initialization with the clock specific create * callback. */ error = kc->timer_create(new_timer); if (error) goto out; /* * timer::it_lock ensures that __lock_timer() observes a fully * initialized timer when it observes a valid timer::it_signal. * * sighand::siglock is required to protect signal::posix_timers. */ scoped_guard (spinlock_irq, &new_timer->it_lock) { guard(spinlock)(¤t->sighand->siglock); /* * new_timer::it_signal contains the signal pointer with * bit 0 set, which makes it invalid for syscall operations. * Store the unmodified signal pointer to make it valid. */ WRITE_ONCE(new_timer->it_signal, current->signal); hlist_add_head_rcu(&new_timer->list, ¤t->signal->posix_timers); } /* * After unlocking @new_timer is subject to concurrent removal and * cannot be touched anymore */ return 0; out: posix_timer_unhash_and_free(new_timer); return error; } SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, struct sigevent __user *, timer_event_spec, timer_t __user *, created_timer_id) { if (timer_event_spec) { sigevent_t event; if (copy_from_user(&event, timer_event_spec, sizeof (event))) return -EFAULT; return do_timer_create(which_clock, &event, created_timer_id); } return do_timer_create(which_clock, NULL, created_timer_id); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, struct compat_sigevent __user *, timer_event_spec, timer_t __user *, created_timer_id) { if (timer_event_spec) { sigevent_t event; if (get_compat_sigevent(&event, timer_event_spec)) return -EFAULT; return do_timer_create(which_clock, &event, created_timer_id); } return do_timer_create(which_clock, NULL, created_timer_id); } #endif static struct k_itimer *lock_timer(timer_t timer_id) { struct k_itimer *timr; /* * timer_t could be any type >= int and we want to make sure any * @timer_id outside positive int range fails lookup. */ if ((unsigned long long)timer_id > INT_MAX) return NULL; /* * The hash lookup and the timers are RCU protected. * * Timers are added to the hash in invalid state where * timr::it_signal is marked invalid. timer::it_signal is only set * after the rest of the initialization succeeded. 
* * Timer destruction happens in steps: * 1) Set timr::it_signal marked invalid with timr::it_lock held * 2) Release timr::it_lock * 3) Remove from the hash under hash_lock * 4) Put the reference count. * * The reference count might not drop to zero if timr::sigq is * queued. In that case the signal delivery or flush will put the * last reference count. * * When the reference count reaches zero, the timer is scheduled * for RCU removal after the grace period. * * Holding rcu_read_lock() across the lookup ensures that * the timer cannot be freed. * * The lookup validates locklessly that timr::it_signal == * current::it_signal and timr::it_id == @timer_id. timr::it_id * can't change, but timr::it_signal can become invalid during * destruction, which makes the locked check fail. */ guard(rcu)(); timr = posix_timer_by_id(timer_id); if (timr) { spin_lock_irq(&timr->it_lock); /* * Validate under timr::it_lock that timr::it_signal is * still valid. Pairs with #1 above. */ if (timr->it_signal == current->signal) return timr; spin_unlock_irq(&timr->it_lock); } return NULL; } static ktime_t common_hrtimer_remaining(struct k_itimer *timr, ktime_t now) { struct hrtimer *timer = &timr->it.real.timer; return __hrtimer_expires_remaining_adjusted(timer, now); } static s64 common_hrtimer_forward(struct k_itimer *timr, ktime_t now) { struct hrtimer *timer = &timr->it.real.timer; return hrtimer_forward(timer, now, timr->it_interval); } /* * Get the time remaining on a POSIX.1b interval timer. * * Two issues to handle here: * * 1) The timer has a requeue pending. The return value must appear as * if the timer has been requeued right now. * * 2) The timer is a SIGEV_NONE timer. These timers are never enqueued * into the hrtimer queue and therefore never expired. Emulate expiry * here taking #1 into account. */ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) { const struct k_clock *kc = timr->kclock; ktime_t now, remaining, iv; bool sig_none; sig_none = timr->it_sigev_notify == SIGEV_NONE; iv = timr->it_interval; /* interval timer ? */ if (iv) { cur_setting->it_interval = ktime_to_timespec64(iv); } else if (timr->it_status == POSIX_TIMER_DISARMED) { /* * SIGEV_NONE oneshot timers are never queued and therefore * timr->it_status is always DISARMED. The check below * vs. remaining time will handle this case. * * For all other timers there is nothing to update here, so * return. */ if (!sig_none) return; } now = kc->clock_get_ktime(timr->it_clock); /* * If this is an interval timer and either has requeue pending or * is a SIGEV_NONE timer move the expiry time forward by intervals, * so expiry is > now. */ if (iv && timr->it_status != POSIX_TIMER_ARMED) timr->it_overrun += kc->timer_forward(timr, now); remaining = kc->timer_remaining(timr, now); /* * As @now is retrieved before a possible timer_forward() and * cannot be reevaluated by the compiler @remaining is based on the * same @now value. Therefore @remaining is consistent vs. @now. * * Consequently all interval timers, i.e. @iv > 0, cannot have a * remaining time <= 0 because timer_forward() guarantees to move * them forward so that the next timer expiry is > @now. */ if (remaining <= 0) { /* * A single shot SIGEV_NONE timer must return 0, when it is * expired! Timers which have a real signal delivery mode * must return a remaining time greater than 0 because the * signal has not yet been delivered. 
*/ if (!sig_none) cur_setting->it_value.tv_nsec = 1; } else { cur_setting->it_value = ktime_to_timespec64(remaining); } } static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting) { memset(setting, 0, sizeof(*setting)); scoped_timer_get_or_fail(timer_id) scoped_timer->kclock->timer_get(scoped_timer, setting); return 0; } /* Get the time remaining on a POSIX.1b interval timer. */ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, struct __kernel_itimerspec __user *, setting) { struct itimerspec64 cur_setting; int ret = do_timer_gettime(timer_id, &cur_setting); if (!ret) { if (put_itimerspec64(&cur_setting, setting)) ret = -EFAULT; } return ret; } #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id, struct old_itimerspec32 __user *, setting) { struct itimerspec64 cur_setting; int ret = do_timer_gettime(timer_id, &cur_setting); if (!ret) { if (put_old_itimerspec32(&cur_setting, setting)) ret = -EFAULT; } return ret; } #endif /** * sys_timer_getoverrun - Get the number of overruns of a POSIX.1b interval timer * @timer_id: The timer ID which identifies the timer * * The "overrun count" of a timer is one plus the number of expiration * intervals which have elapsed between the first expiry, which queues the * signal and the actual signal delivery. On signal delivery the "overrun * count" is calculated and cached, so it can be returned directly here. * * As this is relative to the last queued signal the returned overrun count * is meaningless outside of the signal delivery path and even there it * does not accurately reflect the current state when user space evaluates * it. * * Returns: * -EINVAL @timer_id is invalid * 1..INT_MAX The number of overruns related to the last delivered signal */ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) { scoped_timer_get_or_fail(timer_id) return timer_overrun_to_int(scoped_timer); } static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, bool absolute, bool sigev_none) { struct hrtimer *timer = &timr->it.real.timer; enum hrtimer_mode mode; mode = absolute ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; /* * Posix magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they become CLOCK_MONOTONIC based under the * hood. See hrtimer_setup(). Update timr->kclock, so the generic * functions which use timr->kclock->clock_get_*() work. * * Note: it_clock stays unmodified, because the next timer_set() might * use ABSTIME, so it needs to switch back. */ if (timr->it_clock == CLOCK_REALTIME) timr->kclock = absolute ? &clock_realtime : &clock_monotonic; hrtimer_setup(&timr->it.real.timer, posix_timer_fn, timr->it_clock, mode); if (!absolute) expires = ktime_add_safe(expires, hrtimer_cb_get_time(timer)); hrtimer_set_expires(timer, expires); if (!sigev_none) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } static int common_hrtimer_try_to_cancel(struct k_itimer *timr) { return hrtimer_try_to_cancel(&timr->it.real.timer); } static void common_timer_wait_running(struct k_itimer *timer) { hrtimer_cancel_wait_running(&timer->it.real.timer); } /* * On PREEMPT_RT this prevents priority inversion and a potential livelock * against the ksoftirqd thread in case that ksoftirqd gets preempted while * executing a hrtimer callback. * * See the comments in hrtimer_cancel_wait_running(). For PREEMPT_RT=n this * just results in a cpu_relax(). * * For POSIX CPU timers with CONFIG_POSIX_CPU_TIMERS_TASK_WORK=n this is * just a cpu_relax(). 
With CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y this * prevents spinning on an eventually scheduled out task and a livelock * when the task which tries to delete or disarm the timer has preempted * the task which runs the expiry in task work context. */ static void timer_wait_running(struct k_itimer *timer) { /* * kc->timer_wait_running() might drop RCU lock. So @timer * cannot be touched anymore after the function returns! */ timer->kclock->timer_wait_running(timer); } /* * Set up the new interval and reset the signal delivery data */ void posix_timer_set_common(struct k_itimer *timer, struct itimerspec64 *new_setting) { if (new_setting->it_value.tv_sec || new_setting->it_value.tv_nsec) timer->it_interval = timespec64_to_ktime(new_setting->it_interval); else timer->it_interval = 0; /* Reset overrun accounting */ timer->it_overrun_last = 0; timer->it_overrun = -1LL; } /* Set a POSIX.1b interval timer. */ int common_timer_set(struct k_itimer *timr, int flags, struct itimerspec64 *new_setting, struct itimerspec64 *old_setting) { const struct k_clock *kc = timr->kclock; bool sigev_none; ktime_t expires; if (old_setting) common_timer_get(timr, old_setting); /* * Careful here. On SMP systems the timer expiry function could be * active and spinning on timr->it_lock. */ if (kc->timer_try_to_cancel(timr) < 0) return TIMER_RETRY; timr->it_status = POSIX_TIMER_DISARMED; posix_timer_set_common(timr, new_setting); /* Keep timer disarmed when it_value is zero */ if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) return 0; expires = timespec64_to_ktime(new_setting->it_value); if (flags & TIMER_ABSTIME) expires = timens_ktime_to_host(timr->it_clock, expires); sigev_none = timr->it_sigev_notify == SIGEV_NONE; kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); if (!sigev_none) timr->it_status = POSIX_TIMER_ARMED; return 0; } static int do_timer_settime(timer_t timer_id, int tmr_flags, struct itimerspec64 *new_spec64, struct itimerspec64 *old_spec64) { if (!timespec64_valid(&new_spec64->it_interval) || !timespec64_valid(&new_spec64->it_value)) return -EINVAL; if (old_spec64) memset(old_spec64, 0, sizeof(*old_spec64)); for (; ; old_spec64 = NULL) { struct k_itimer *timr; scoped_timer_get_or_fail(timer_id) { timr = scoped_timer; if (old_spec64) old_spec64->it_interval = ktime_to_timespec64(timr->it_interval); /* Prevent signal delivery and rearming. */ timr->it_signal_seq++; int ret = timr->kclock->timer_set(timr, tmr_flags, new_spec64, old_spec64); if (ret != TIMER_RETRY) return ret; /* Protect the timer from being freed when leaving the lock scope */ rcu_read_lock(); } timer_wait_running(timr); rcu_read_unlock(); } } /* Set a POSIX.1b interval timer */ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, const struct __kernel_itimerspec __user *, new_setting, struct __kernel_itimerspec __user *, old_setting) { struct itimerspec64 new_spec, old_spec, *rtn; int error = 0; if (!new_setting) return -EINVAL; if (get_itimerspec64(&new_spec, new_setting)) return -EFAULT; rtn = old_setting ? &old_spec : NULL; error = do_timer_settime(timer_id, flags, &new_spec, rtn); if (!error && old_setting) { if (put_itimerspec64(&old_spec, old_setting)) error = -EFAULT; } return error; } #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE4(timer_settime32, timer_t, timer_id, int, flags, struct old_itimerspec32 __user *, new, struct old_itimerspec32 __user *, old) { struct itimerspec64 new_spec, old_spec; struct itimerspec64 *rtn = old ? 
&old_spec : NULL; int error = 0; if (!new) return -EINVAL; if (get_old_itimerspec32(&new_spec, new)) return -EFAULT; error = do_timer_settime(timer_id, flags, &new_spec, rtn); if (!error && old) { if (put_old_itimerspec32(&old_spec, old)) error = -EFAULT; } return error; } #endif int common_timer_del(struct k_itimer *timer) { const struct k_clock *kc = timer->kclock; if (kc->timer_try_to_cancel(timer) < 0) return TIMER_RETRY; timer->it_status = POSIX_TIMER_DISARMED; return 0; } /* * If the deleted timer is on the ignored list, remove it and * drop the associated reference. */ static inline void posix_timer_cleanup_ignored(struct k_itimer *tmr) { if (!hlist_unhashed(&tmr->ignored_list)) { hlist_del_init(&tmr->ignored_list); posixtimer_putref(tmr); } } static void posix_timer_delete(struct k_itimer *timer) { /* * Invalidate the timer, remove it from the linked list and remove * it from the ignored list if pending. * * The invalidation must be written with siglock held so that the * signal code observes the invalidated timer::it_signal in * do_sigaction(), which prevents it from moving a pending signal * of a deleted timer to the ignore list. * * The invalidation also prevents signal queueing, signal delivery * and therefore rearming from the signal delivery path. * * A concurrent lookup can still find the timer in the hash, but it * will check timer::it_signal with timer::it_lock held and observe * bit 0 set, which invalidates it. That also prevents the timer ID * from being handed out before this timer is completely gone. */ timer->it_signal_seq++; scoped_guard (spinlock, ¤t->sighand->siglock) { unsigned long sig = (unsigned long)timer->it_signal | 1UL; WRITE_ONCE(timer->it_signal, (struct signal_struct *)sig); hlist_del_rcu(&timer->list); posix_timer_cleanup_ignored(timer); } while (timer->kclock->timer_del(timer) == TIMER_RETRY) { guard(rcu)(); spin_unlock_irq(&timer->it_lock); timer_wait_running(timer); spin_lock_irq(&timer->it_lock); } } /* Delete a POSIX.1b interval timer. */ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) { struct k_itimer *timer; scoped_timer_get_or_fail(timer_id) { timer = scoped_timer; posix_timer_delete(timer); } /* Remove it from the hash, which frees up the timer ID */ posix_timer_unhash_and_free(timer); return 0; } /* * Invoked from do_exit() when the last thread of a thread group exits. * At that point no other task can access the timers of the dying * task anymore. */ void exit_itimers(struct task_struct *tsk) { struct hlist_head timers; struct hlist_node *next; struct k_itimer *timer; /* Clear restore mode for exec() */ tsk->signal->timer_create_restore_ids = 0; if (hlist_empty(&tsk->signal->posix_timers)) return; /* Protect against concurrent read via /proc/$PID/timers */ scoped_guard (spinlock_irq, &tsk->sighand->siglock) hlist_move_list(&tsk->signal->posix_timers, &timers); /* The timers are not longer accessible via tsk::signal */ hlist_for_each_entry_safe(timer, next, &timers, list) { scoped_guard (spinlock_irq, &timer->it_lock) posix_timer_delete(timer); posix_timer_unhash_and_free(timer); cond_resched(); } /* * There should be no timers on the ignored list. posix_timer_delete() has * mopped them up. 
*/ if (!WARN_ON_ONCE(!hlist_empty(&tsk->signal->ignored_posix_timers))) return; hlist_move_list(&tsk->signal->ignored_posix_timers, &timers); while (!hlist_empty(&timers)) { posix_timer_cleanup_ignored(hlist_entry(timers.first, struct k_itimer, ignored_list)); } } SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct __kernel_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 new_tp; if (!kc || !kc->clock_set) return -EINVAL; if (get_timespec64(&new_tp, tp)) return -EFAULT; /* * Permission checks have to be done inside the clock specific * setter callback. */ return kc->clock_set(which_clock, &new_tp); } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, struct __kernel_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 kernel_tp; int error; if (!kc) return -EINVAL; error = kc->clock_get_timespec(which_clock, &kernel_tp); if (!error && put_timespec64(&kernel_tp, tp)) error = -EFAULT; return error; } int do_clock_adjtime(const clockid_t which_clock, struct __kernel_timex * ktx) { const struct k_clock *kc = clockid_to_kclock(which_clock); if (!kc) return -EINVAL; if (!kc->clock_adj) return -EOPNOTSUPP; return kc->clock_adj(which_clock, ktx); } SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, struct __kernel_timex __user *, utx) { struct __kernel_timex ktx; int err; if (copy_from_user(&ktx, utx, sizeof(ktx))) return -EFAULT; err = do_clock_adjtime(which_clock, &ktx); if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx))) return -EFAULT; return err; } /** * sys_clock_getres - Get the resolution of a clock * @which_clock: The clock to get the resolution for * @tp: Pointer to a a user space timespec64 for storage * * POSIX defines: * * "The clock_getres() function shall return the resolution of any * clock. Clock resolutions are implementation-defined and cannot be set by * a process. If the argument res is not NULL, the resolution of the * specified clock shall be stored in the location pointed to by res. If * res is NULL, the clock resolution is not returned. If the time argument * of clock_settime() is not a multiple of res, then the value is truncated * to a multiple of res." * * Due to the various hardware constraints the real resolution can vary * wildly and even change during runtime when the underlying devices are * replaced. The kernel also can use hardware devices with different * resolutions for reading the time and for arming timers. * * The kernel therefore deviates from the POSIX spec in various aspects: * * 1) The resolution returned to user space * * For CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME, CLOCK_TAI, * CLOCK_REALTIME_ALARM, CLOCK_BOOTTIME_ALAREM and CLOCK_MONOTONIC_RAW * the kernel differentiates only two cases: * * I) Low resolution mode: * * When high resolution timers are disabled at compile or runtime * the resolution returned is nanoseconds per tick, which represents * the precision at which timers expire. * * II) High resolution mode: * * When high resolution timers are enabled the resolution returned * is always one nanosecond independent of the actual resolution of * the underlying hardware devices. * * For CLOCK_*_ALARM the actual resolution depends on system * state. When system is running the resolution is the same as the * resolution of the other clocks. 
During suspend the actual * resolution is the resolution of the underlying RTC device which * might be way less precise than the clockevent device used during * running state. * * For CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE the resolution * returned is always nanoseconds per tick. * * For CLOCK_PROCESS_CPUTIME and CLOCK_THREAD_CPUTIME the resolution * returned is always one nanosecond under the assumption that the * underlying scheduler clock has a better resolution than nanoseconds * per tick. * * For dynamic POSIX clocks (PTP devices) the resolution returned is * always one nanosecond. * * 2) Affect on sys_clock_settime() * * The kernel does not truncate the time which is handed in to * sys_clock_settime(). The kernel internal timekeeping is always using * nanoseconds precision independent of the clocksource device which is * used to read the time from. The resolution of that device only * affects the precision of the time returned by sys_clock_gettime(). * * Returns: * 0 Success. @tp contains the resolution * -EINVAL @which_clock is not a valid clock ID * -EFAULT Copying the resolution to @tp faulted * -ENODEV Dynamic POSIX clock is not backed by a device * -EOPNOTSUPP Dynamic POSIX clock does not support getres() */ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct __kernel_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 rtn_tp; int error; if (!kc) return -EINVAL; error = kc->clock_getres(which_clock, &rtn_tp); if (!error && tp && put_timespec64(&rtn_tp, tp)) error = -EFAULT; return error; } #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE2(clock_settime32, clockid_t, which_clock, struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; if (!kc || !kc->clock_set) return -EINVAL; if (get_old_timespec32(&ts, tp)) return -EFAULT; return kc->clock_set(which_clock, &ts); } SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock, struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; int err; if (!kc) return -EINVAL; err = kc->clock_get_timespec(which_clock, &ts); if (!err && put_old_timespec32(&ts, tp)) err = -EFAULT; return err; } SYSCALL_DEFINE2(clock_adjtime32, clockid_t, which_clock, struct old_timex32 __user *, utp) { struct __kernel_timex ktx; int err; err = get_old_timex32(&ktx, utp); if (err) return err; err = do_clock_adjtime(which_clock, &ktx); if (err >= 0 && put_old_timex32(utp, &ktx)) return -EFAULT; return err; } SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock, struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; int err; if (!kc) return -EINVAL; err = kc->clock_getres(which_clock, &ts); if (!err && tp && put_old_timespec32(&ts, tp)) return -EFAULT; return err; } #endif /* * sys_clock_nanosleep() for CLOCK_REALTIME and CLOCK_TAI */ static int common_nsleep(const clockid_t which_clock, int flags, const struct timespec64 *rqtp) { ktime_t texp = timespec64_to_ktime(*rqtp); return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } /* * sys_clock_nanosleep() for CLOCK_MONOTONIC and CLOCK_BOOTTIME * * Absolute nanosleeps for these clocks are time-namespace adjusted. 
*/ static int common_nsleep_timens(const clockid_t which_clock, int flags, const struct timespec64 *rqtp) { ktime_t texp = timespec64_to_ktime(*rqtp); if (flags & TIMER_ABSTIME) texp = timens_ktime_to_host(which_clock, texp); return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, const struct __kernel_timespec __user *, rqtp, struct __kernel_timespec __user *, rmtp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 t; if (!kc) return -EINVAL; if (!kc->nsleep) return -EOPNOTSUPP; if (get_timespec64(&t, rqtp)) return -EFAULT; if (!timespec64_valid(&t)) return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; return kc->nsleep(which_clock, flags, &t); } #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, struct old_timespec32 __user *, rqtp, struct old_timespec32 __user *, rmtp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 t; if (!kc) return -EINVAL; if (!kc->nsleep) return -EOPNOTSUPP; if (get_old_timespec32(&t, rqtp)) return -EFAULT; if (!timespec64_valid(&t)) return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; return kc->nsleep(which_clock, flags, &t); } #endif static const struct k_clock clock_realtime = { .clock_getres = posix_get_hrtimer_res, .clock_get_timespec = posix_get_realtime_timespec, .clock_get_ktime = posix_get_realtime_ktime, .clock_set = posix_clock_realtime_set, .clock_adj = posix_clock_realtime_adj, .nsleep = common_nsleep, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, .timer_del = common_timer_del, .timer_rearm = common_hrtimer_rearm, .timer_forward = common_hrtimer_forward, .timer_remaining = common_hrtimer_remaining, .timer_try_to_cancel = common_hrtimer_try_to_cancel, .timer_wait_running = common_timer_wait_running, .timer_arm = common_hrtimer_arm, }; static const struct k_clock clock_monotonic = { .clock_getres = posix_get_hrtimer_res, .clock_get_timespec = posix_get_monotonic_timespec, .clock_get_ktime = posix_get_monotonic_ktime, .nsleep = common_nsleep_timens, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, .timer_del = common_timer_del, .timer_rearm = common_hrtimer_rearm, .timer_forward = common_hrtimer_forward, .timer_remaining = common_hrtimer_remaining, .timer_try_to_cancel = common_hrtimer_try_to_cancel, .timer_wait_running = common_timer_wait_running, .timer_arm = common_hrtimer_arm, }; static const struct k_clock clock_monotonic_raw = { .clock_getres = posix_get_hrtimer_res, .clock_get_timespec = posix_get_monotonic_raw, }; static const struct k_clock clock_realtime_coarse = { .clock_getres = posix_get_coarse_res, .clock_get_timespec = posix_get_realtime_coarse, }; static const struct k_clock clock_monotonic_coarse = { .clock_getres = posix_get_coarse_res, .clock_get_timespec = posix_get_monotonic_coarse, }; static const struct k_clock clock_tai = { .clock_getres = posix_get_hrtimer_res, .clock_get_ktime = posix_get_tai_ktime, .clock_get_timespec = posix_get_tai_timespec, .nsleep 
= common_nsleep, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, .timer_del = common_timer_del, .timer_rearm = common_hrtimer_rearm, .timer_forward = common_hrtimer_forward, .timer_remaining = common_hrtimer_remaining, .timer_try_to_cancel = common_hrtimer_try_to_cancel, .timer_wait_running = common_timer_wait_running, .timer_arm = common_hrtimer_arm, }; static const struct k_clock clock_boottime = { .clock_getres = posix_get_hrtimer_res, .clock_get_ktime = posix_get_boottime_ktime, .clock_get_timespec = posix_get_boottime_timespec, .nsleep = common_nsleep_timens, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, .timer_del = common_timer_del, .timer_rearm = common_hrtimer_rearm, .timer_forward = common_hrtimer_forward, .timer_remaining = common_hrtimer_remaining, .timer_try_to_cancel = common_hrtimer_try_to_cancel, .timer_wait_running = common_timer_wait_running, .timer_arm = common_hrtimer_arm, }; static const struct k_clock * const posix_clocks[] = { [CLOCK_REALTIME] = &clock_realtime, [CLOCK_MONOTONIC] = &clock_monotonic, [CLOCK_PROCESS_CPUTIME_ID] = &clock_process, [CLOCK_THREAD_CPUTIME_ID] = &clock_thread, [CLOCK_MONOTONIC_RAW] = &clock_monotonic_raw, [CLOCK_REALTIME_COARSE] = &clock_realtime_coarse, [CLOCK_MONOTONIC_COARSE] = &clock_monotonic_coarse, [CLOCK_BOOTTIME] = &clock_boottime, [CLOCK_REALTIME_ALARM] = &alarm_clock, [CLOCK_BOOTTIME_ALARM] = &alarm_clock, [CLOCK_TAI] = &clock_tai, #ifdef CONFIG_POSIX_AUX_CLOCKS [CLOCK_AUX ... CLOCK_AUX_LAST] = &clock_aux, #endif }; static const struct k_clock *clockid_to_kclock(const clockid_t id) { clockid_t idx = id; if (id < 0) { return (id & CLOCKFD_MASK) == CLOCKFD ? &clock_posix_dynamic : &clock_posix_cpu; } if (id >= ARRAY_SIZE(posix_clocks)) return NULL; return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))]; } static int __init posixtimer_init(void) { unsigned long i, size; unsigned int shift; posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof(struct k_itimer), __alignof__(struct k_itimer), SLAB_ACCOUNT, NULL); if (IS_ENABLED(CONFIG_BASE_SMALL)) size = 512; else size = roundup_pow_of_two(512 * num_possible_cpus()); timer_buckets = alloc_large_system_hash("posixtimers", sizeof(*timer_buckets), size, 0, 0, &shift, NULL, size, size); size = 1UL << shift; timer_hashmask = size - 1; for (i = 0; i < size; i++) { spin_lock_init(&timer_buckets[i].lock); INIT_HLIST_HEAD(&timer_buckets[i].head); } return 0; } core_initcall(posixtimer_init); |
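/*
 * Illustrative userspace sketch (not part of the kernel file above): how the
 * timer_create()/timer_settime()/timer_getoverrun()/timer_delete() syscalls
 * implemented above are typically consumed through the glibc wrappers. The
 * clock, signal number, interval and the example_* names are arbitrary
 * choices made for this example.
 */
#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static volatile sig_atomic_t expirations;

static void example_handler(int sig, siginfo_t *si, void *uctx)
{
	(void)sig; (void)uctx;
	/* si_value carries the sigev_value handed to timer_create() */
	if (si->si_code == SI_TIMER)
		expirations++;
}

int example_posix_timer(void)
{
	struct sigaction sa = { .sa_flags = SA_SIGINFO,
				.sa_sigaction = example_handler };
	struct sigevent sev = { .sigev_notify = SIGEV_SIGNAL,
				.sigev_signo = SIGRTMIN,
				.sigev_value.sival_int = 42 };
	struct itimerspec its = {
		.it_value    = { .tv_sec = 1 },	/* first expiry after 1s */
		.it_interval = { .tv_sec = 1 },	/* then periodic, every 1s */
	};
	timer_t tid;

	sigemptyset(&sa.sa_mask);
	if (sigaction(SIGRTMIN, &sa, NULL) ||
	    timer_create(CLOCK_MONOTONIC, &sev, &tid) ||
	    timer_settime(tid, 0, &its, NULL))
		return -1;

	pause();	/* wait for at least one expiry signal */
	printf("expirations: %d, last overrun count: %d\n",
	       (int)expirations, timer_getoverrun(tid));
	return timer_delete(tid);
}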
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_FLOW_DISSECTOR_H #define _NET_FLOW_DISSECTOR_H #include <linux/types.h> #include <linux/in6.h> #include <linux/siphash.h> #include <linux/string.h> #include <uapi/linux/if_ether.h> #include <uapi/linux/pkt_cls.h> struct bpf_prog; struct net; struct sk_buff; /** * struct flow_dissector_key_control: * @thoff: Transport header offset * @addr_type: Type of key. One of FLOW_DISSECTOR_KEY_* * @flags: Key flags. * Any of FLOW_DIS_(IS_FRAGMENT|FIRST_FRAG|ENCAPSULATION|F_*) */ struct flow_dissector_key_control { u16 thoff; u16 addr_type; u32 flags; }; /* The control flags are kept in sync with TCA_FLOWER_KEY_FLAGS_*, as those * flags are exposed to userspace in some error paths, ie. unsupported flags. */ enum flow_dissector_ctrl_flags { FLOW_DIS_IS_FRAGMENT = TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_FIRST_FRAG = TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST, FLOW_DIS_F_TUNNEL_CSUM = TCA_FLOWER_KEY_FLAGS_TUNNEL_CSUM, FLOW_DIS_F_TUNNEL_DONT_FRAGMENT = TCA_FLOWER_KEY_FLAGS_TUNNEL_DONT_FRAGMENT, FLOW_DIS_F_TUNNEL_OAM = TCA_FLOWER_KEY_FLAGS_TUNNEL_OAM, FLOW_DIS_F_TUNNEL_CRIT_OPT = TCA_FLOWER_KEY_FLAGS_TUNNEL_CRIT_OPT, /* These flags are internal to the kernel */ FLOW_DIS_ENCAPSULATION = (TCA_FLOWER_KEY_FLAGS_MAX << 1), }; enum flow_dissect_ret { FLOW_DISSECT_RET_OUT_GOOD, FLOW_DISSECT_RET_OUT_BAD, FLOW_DISSECT_RET_PROTO_AGAIN, FLOW_DISSECT_RET_IPPROTO_AGAIN, FLOW_DISSECT_RET_CONTINUE, }; /** * struct flow_dissector_key_basic: * @n_proto: Network header protocol (eg. IPv4/IPv6) * @ip_proto: Transport header protocol (eg.
TCP/UDP) * @padding: Unused */ struct flow_dissector_key_basic { __be16 n_proto; u8 ip_proto; u8 padding; }; struct flow_dissector_key_tags { u32 flow_label; }; struct flow_dissector_key_vlan { union { struct { u16 vlan_id:12, vlan_dei:1, vlan_priority:3; }; __be16 vlan_tci; }; __be16 vlan_tpid; __be16 vlan_eth_type; u16 padding; }; struct flow_dissector_mpls_lse { u32 mpls_ttl:8, mpls_bos:1, mpls_tc:3, mpls_label:20; }; #define FLOW_DIS_MPLS_MAX 7 struct flow_dissector_key_mpls { struct flow_dissector_mpls_lse ls[FLOW_DIS_MPLS_MAX]; /* Label Stack */ u8 used_lses; /* One bit set for each Label Stack Entry in use */ }; static inline void dissector_set_mpls_lse(struct flow_dissector_key_mpls *mpls, int lse_index) { mpls->used_lses |= 1 << lse_index; } #define FLOW_DIS_TUN_OPTS_MAX 255 /** * struct flow_dissector_key_enc_opts: * @data: tunnel option data * @len: length of tunnel option data * @dst_opt_type: tunnel option type */ struct flow_dissector_key_enc_opts { u8 data[FLOW_DIS_TUN_OPTS_MAX]; /* Using IP_TUNNEL_OPTS_MAX is desired * here but seems difficult to #include */ u8 len; u32 dst_opt_type; }; struct flow_dissector_key_keyid { __be32 keyid; }; /** * struct flow_dissector_key_ipv4_addrs: * @src: source ip address * @dst: destination ip address */ struct flow_dissector_key_ipv4_addrs { /* (src,dst) must be grouped, in the same way than in IP header */ __be32 src; __be32 dst; }; /** * struct flow_dissector_key_ipv6_addrs: * @src: source ip address * @dst: destination ip address */ struct flow_dissector_key_ipv6_addrs { /* (src,dst) must be grouped, in the same way than in IP header */ struct in6_addr src; struct in6_addr dst; }; /** * struct flow_dissector_key_tipc: * @key: source node address combined with selector */ struct flow_dissector_key_tipc { __be32 key; }; /** * struct flow_dissector_key_addrs: * @v4addrs: IPv4 addresses * @v6addrs: IPv6 addresses * @tipckey: TIPC key */ struct flow_dissector_key_addrs { union { struct flow_dissector_key_ipv4_addrs v4addrs; struct flow_dissector_key_ipv6_addrs v6addrs; struct flow_dissector_key_tipc tipckey; }; }; /** * struct flow_dissector_key_arp: * @sip: Sender IP address * @tip: Target IP address * @op: Operation * @sha: Sender hardware address * @tha: Target hardware address */ struct flow_dissector_key_arp { __u32 sip; __u32 tip; __u8 op; unsigned char sha[ETH_ALEN]; unsigned char tha[ETH_ALEN]; }; /** * struct flow_dissector_key_ports: * @ports: port numbers of Transport header * @src: source port number * @dst: destination port number */ struct flow_dissector_key_ports { union { __be32 ports; struct { __be16 src; __be16 dst; }; }; }; /** * struct flow_dissector_key_ports_range * @tp: port number from packet * @tp_min: min port number in range * @tp_max: max port number in range */ struct flow_dissector_key_ports_range { union { struct flow_dissector_key_ports tp; struct { struct flow_dissector_key_ports tp_min; struct flow_dissector_key_ports tp_max; }; }; }; /** * struct flow_dissector_key_icmp: * @type: ICMP type * @code: ICMP code * @id: Session identifier */ struct flow_dissector_key_icmp { struct { u8 type; u8 code; }; u16 id; }; /** * struct flow_dissector_key_eth_addrs: * @src: source Ethernet address * @dst: destination Ethernet address */ struct flow_dissector_key_eth_addrs { /* (dst,src) must be grouped, in the same way than in ETH header */ unsigned char dst[ETH_ALEN]; unsigned char src[ETH_ALEN]; }; /** * struct flow_dissector_key_tcp: * @flags: flags */ struct flow_dissector_key_tcp { __be16 flags; }; /** * struct 
flow_dissector_key_ip: * @tos: tos * @ttl: ttl */ struct flow_dissector_key_ip { __u8 tos; __u8 ttl; }; /** * struct flow_dissector_key_meta: * @ingress_ifindex: ingress ifindex * @ingress_iftype: ingress interface type * @l2_miss: packet did not match an L2 entry during forwarding */ struct flow_dissector_key_meta { int ingress_ifindex; u16 ingress_iftype; u8 l2_miss; }; /** * struct flow_dissector_key_ct: * @ct_state: conntrack state after converting with map * @ct_mark: conttrack mark * @ct_zone: conntrack zone * @ct_labels: conntrack labels */ struct flow_dissector_key_ct { u16 ct_state; u16 ct_zone; u32 ct_mark; u32 ct_labels[4]; }; /** * struct flow_dissector_key_hash: * @hash: hash value */ struct flow_dissector_key_hash { u32 hash; }; /** * struct flow_dissector_key_num_of_vlans: * @num_of_vlans: num_of_vlans value */ struct flow_dissector_key_num_of_vlans { u8 num_of_vlans; }; /** * struct flow_dissector_key_pppoe: * @session_id: pppoe session id * @ppp_proto: ppp protocol * @type: pppoe eth type */ struct flow_dissector_key_pppoe { __be16 session_id; __be16 ppp_proto; __be16 type; }; /** * struct flow_dissector_key_l2tpv3: * @session_id: identifier for a l2tp session */ struct flow_dissector_key_l2tpv3 { __be32 session_id; }; /** * struct flow_dissector_key_ipsec: * @spi: identifier for a ipsec connection */ struct flow_dissector_key_ipsec { __be32 spi; }; /** * struct flow_dissector_key_cfm * @mdl_ver: maintenance domain level (mdl) and cfm protocol version * @opcode: code specifying a type of cfm protocol packet * * See 802.1ag, ITU-T G.8013/Y.1731 * 1 2 * |7 6 5 4 3 2 1 0|7 6 5 4 3 2 1 0| * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | mdl | version | opcode | * +-----+---------+-+-+-+-+-+-+-+-+ */ struct flow_dissector_key_cfm { u8 mdl_ver; u8 opcode; }; #define FLOW_DIS_CFM_MDL_MASK GENMASK(7, 5) #define FLOW_DIS_CFM_MDL_MAX 7 enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */ FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */ FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */ FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */ FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_PORTS_RANGE, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_ICMP, /* struct flow_dissector_key_icmp */ FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */ FLOW_DISSECTOR_KEY_TIPC, /* struct flow_dissector_key_tipc */ FLOW_DISSECTOR_KEY_ARP, /* struct flow_dissector_key_arp */ FLOW_DISSECTOR_KEY_VLAN, /* struct flow_dissector_key_vlan */ FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_tags */ FLOW_DISSECTOR_KEY_GRE_KEYID, /* struct flow_dissector_key_keyid */ FLOW_DISSECTOR_KEY_MPLS_ENTROPY, /* struct flow_dissector_key_keyid */ FLOW_DISSECTOR_KEY_ENC_KEYID, /* struct flow_dissector_key_keyid */ FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */ FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */ FLOW_DISSECTOR_KEY_ENC_CONTROL, /* struct flow_dissector_key_control */ FLOW_DISSECTOR_KEY_ENC_PORTS, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_MPLS, /* struct flow_dissector_key_mpls */ FLOW_DISSECTOR_KEY_TCP, /* struct flow_dissector_key_tcp */ FLOW_DISSECTOR_KEY_IP, /* struct flow_dissector_key_ip */ FLOW_DISSECTOR_KEY_CVLAN, /* struct flow_dissector_key_vlan */ FLOW_DISSECTOR_KEY_ENC_IP, /* struct flow_dissector_key_ip */ FLOW_DISSECTOR_KEY_ENC_OPTS, /* struct 
flow_dissector_key_enc_opts */ FLOW_DISSECTOR_KEY_META, /* struct flow_dissector_key_meta */ FLOW_DISSECTOR_KEY_CT, /* struct flow_dissector_key_ct */ FLOW_DISSECTOR_KEY_HASH, /* struct flow_dissector_key_hash */ FLOW_DISSECTOR_KEY_NUM_OF_VLANS, /* struct flow_dissector_key_num_of_vlans */ FLOW_DISSECTOR_KEY_PPPOE, /* struct flow_dissector_key_pppoe */ FLOW_DISSECTOR_KEY_L2TPV3, /* struct flow_dissector_key_l2tpv3 */ FLOW_DISSECTOR_KEY_CFM, /* struct flow_dissector_key_cfm */ FLOW_DISSECTOR_KEY_IPSEC, /* struct flow_dissector_key_ipsec */ FLOW_DISSECTOR_KEY_MAX, }; #define FLOW_DISSECTOR_F_PARSE_1ST_FRAG BIT(0) #define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL BIT(1) #define FLOW_DISSECTOR_F_STOP_AT_ENCAP BIT(2) #define FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP BIT(3) struct flow_dissector_key { enum flow_dissector_key_id key_id; size_t offset; /* offset of struct flow_dissector_key_* in target the struct */ }; struct flow_dissector { unsigned long long used_keys; /* each bit represents presence of one key id */ unsigned short int offset[FLOW_DISSECTOR_KEY_MAX]; }; struct flow_keys_basic { struct flow_dissector_key_control control; struct flow_dissector_key_basic basic; }; struct flow_keys { struct flow_dissector_key_control control; #define FLOW_KEYS_HASH_START_FIELD basic struct flow_dissector_key_basic basic __aligned(SIPHASH_ALIGNMENT); struct flow_dissector_key_tags tags; struct flow_dissector_key_vlan vlan; struct flow_dissector_key_vlan cvlan; struct flow_dissector_key_keyid keyid; struct flow_dissector_key_ports ports; struct flow_dissector_key_icmp icmp; /* 'addrs' must be the last member */ struct flow_dissector_key_addrs addrs; }; #define FLOW_KEYS_HASH_OFFSET \ offsetof(struct flow_keys, FLOW_KEYS_HASH_START_FIELD) __be32 flow_get_u32_src(const struct flow_keys *flow); __be32 flow_get_u32_dst(const struct flow_keys *flow); extern struct flow_dissector flow_keys_dissector; extern struct flow_dissector flow_keys_basic_dissector; /* struct flow_keys_digest: * * This structure is used to hold a digest of the full flow keys. This is a * larger "hash" of a flow to allow definitively matching specific flows where * the 32 bit skb->hash is not large enough. The size is limited to 16 bytes so * that it can be used in CB of skb (see sch_choke for an example). 
*/ #define FLOW_KEYS_DIGEST_LEN 16 struct flow_keys_digest { u8 data[FLOW_KEYS_DIGEST_LEN]; }; void make_flow_keys_digest(struct flow_keys_digest *digest, const struct flow_keys *flow); static inline bool flow_keys_have_l4(const struct flow_keys *keys) { return (keys->ports.ports || keys->tags.flow_label); } u32 flow_hash_from_keys(struct flow_keys *keys); u32 flow_hash_from_keys_seed(struct flow_keys *keys, const siphash_key_t *keyval); void skb_flow_get_icmp_tci(const struct sk_buff *skb, struct flow_dissector_key_icmp *key_icmp, const void *data, int thoff, int hlen); static inline bool dissector_uses_key(const struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) { return flow_dissector->used_keys & (1ULL << key_id); } static inline void *skb_flow_dissector_target(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id, void *target_container) { return ((char *)target_container) + flow_dissector->offset[key_id]; } struct bpf_flow_dissector { struct bpf_flow_keys *flow_keys; const struct sk_buff *skb; const void *data; const void *data_end; }; static inline void flow_dissector_init_keys(struct flow_dissector_key_control *key_control, struct flow_dissector_key_basic *key_basic) { memset(key_control, 0, sizeof(*key_control)); memset(key_basic, 0, sizeof(*key_basic)); } #ifdef CONFIG_BPF_SYSCALL int flow_dissector_bpf_prog_attach_check(struct net *net, struct bpf_prog *prog); #endif /* CONFIG_BPF_SYSCALL */ #endif |
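/*
 * Illustrative sketch (not part of the header above): the usual consumer
 * pattern for the accessors declared here. After the flow dissector has
 * filled @target_container according to @dissector, a user checks for a key
 * with dissector_uses_key() and then locates it with
 * skb_flow_dissector_target(). The function name and parameters below are
 * made up for the example.
 */
static inline __be16 example_dissected_dst_port(struct flow_dissector *dissector,
						void *target_container)
{
	struct flow_dissector_key_ports *ports;

	if (!dissector_uses_key(dissector, FLOW_DISSECTOR_KEY_PORTS))
		return 0;

	/* Resolve the offset of the ports key inside the caller's container */
	ports = skb_flow_dissector_target(dissector, FLOW_DISSECTOR_KEY_PORTS,
					  target_container);
	return ports->dst;	/* destination port in network byte order */
}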
// SPDX-License-Identifier: GPL-2.0-only /* * QNX6 file system, Linux implementation. * * Version : 1.0.0 * * History : * * 01-02-2012 by Kai Bankett (chaosman@ontika.net) : first release.
* 16-02-2012 pagemap extension by Al Viro * */ #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/highuid.h> #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/statfs.h> #include <linux/seq_file.h> #include <linux/crc32.h> #include <linux/mpage.h> #include <linux/fs_parser.h> #include <linux/fs_context.h> #include "qnx6.h" static const struct super_operations qnx6_sops; static void qnx6_put_super(struct super_block *sb); static struct inode *qnx6_alloc_inode(struct super_block *sb); static void qnx6_free_inode(struct inode *inode); static int qnx6_reconfigure(struct fs_context *fc); static int qnx6_statfs(struct dentry *dentry, struct kstatfs *buf); static int qnx6_show_options(struct seq_file *seq, struct dentry *root); static const struct super_operations qnx6_sops = { .alloc_inode = qnx6_alloc_inode, .free_inode = qnx6_free_inode, .put_super = qnx6_put_super, .statfs = qnx6_statfs, .show_options = qnx6_show_options, }; static int qnx6_show_options(struct seq_file *seq, struct dentry *root) { struct super_block *sb = root->d_sb; struct qnx6_sb_info *sbi = QNX6_SB(sb); if (sbi->s_mount_opt & QNX6_MOUNT_MMI_FS) seq_puts(seq, ",mmi_fs"); return 0; } static int qnx6_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; sync_filesystem(sb); fc->sb_flags |= SB_RDONLY; return 0; } static unsigned qnx6_get_devblock(struct super_block *sb, __fs32 block) { struct qnx6_sb_info *sbi = QNX6_SB(sb); return fs32_to_cpu(sbi, block) + sbi->s_blks_off; } static unsigned qnx6_block_map(struct inode *inode, unsigned iblock); static int qnx6_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { unsigned phys; pr_debug("qnx6_get_block inode=[%llu] iblock=[%ld]\n", inode->i_ino, (unsigned long)iblock); phys = qnx6_block_map(inode, iblock); if (phys) { /* logical block is before EOF */ map_bh(bh, inode->i_sb, phys); } return 0; } static int qnx6_check_blockptr(__fs32 ptr) { if (ptr == ~(__fs32)0) { pr_err("hit unused blockpointer.\n"); return 0; } return 1; } static int qnx6_read_folio(struct file *file, struct folio *folio) { return mpage_read_folio(folio, qnx6_get_block); } static void qnx6_readahead(struct readahead_control *rac) { mpage_readahead(rac, qnx6_get_block); } /* * returns the block number for the no-th element in the tree * inodebits requred as there are multiple inodes in one inode block */ static unsigned qnx6_block_map(struct inode *inode, unsigned no) { struct super_block *s = inode->i_sb; struct qnx6_sb_info *sbi = QNX6_SB(s); struct qnx6_inode_info *ei = QNX6_I(inode); unsigned block = 0; struct buffer_head *bh; __fs32 ptr; int levelptr; int ptrbits = sbi->s_ptrbits; int bitdelta; u32 mask = (1 << ptrbits) - 1; int depth = ei->di_filelevels; int i; bitdelta = ptrbits * depth; levelptr = no >> bitdelta; if (levelptr > QNX6_NO_DIRECT_POINTERS - 1) { pr_err("Requested file block number (%u) too big.", no); return 0; } block = qnx6_get_devblock(s, ei->di_block_ptr[levelptr]); for (i = 0; i < depth; i++) { bh = sb_bread(s, block); if (!bh) { pr_err("Error reading block (%u)\n", block); return 0; } bitdelta -= ptrbits; levelptr = (no >> bitdelta) & mask; ptr = ((__fs32 *)bh->b_data)[levelptr]; if (!qnx6_check_blockptr(ptr)) return 0; block = qnx6_get_devblock(s, ptr); brelse(bh); } return block; } static int qnx6_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct qnx6_sb_info *sbi = QNX6_SB(sb); u64 id = 
huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = sb->s_magic; buf->f_bsize = sb->s_blocksize; buf->f_blocks = fs32_to_cpu(sbi, sbi->sb->sb_num_blocks); buf->f_bfree = fs32_to_cpu(sbi, sbi->sb->sb_free_blocks); buf->f_files = fs32_to_cpu(sbi, sbi->sb->sb_num_inodes); buf->f_ffree = fs32_to_cpu(sbi, sbi->sb->sb_free_inodes); buf->f_bavail = buf->f_bfree; buf->f_namelen = QNX6_LONG_NAME_MAX; buf->f_fsid = u64_to_fsid(id); return 0; } /* * Check the root directory of the filesystem to make sure * it really _is_ a qnx6 filesystem, and to check the size * of the directory entry. */ static const char *qnx6_checkroot(struct super_block *s) { int error = 0; struct qnx6_dir_entry *dir_entry; struct inode *root = d_inode(s->s_root); struct address_space *mapping = root->i_mapping; struct folio *folio = read_mapping_folio(mapping, 0, NULL); if (IS_ERR(folio)) return "error reading root directory"; dir_entry = kmap_local_folio(folio, 0); if (memcmp(dir_entry[0].de_fname, ".", 2) || memcmp(dir_entry[1].de_fname, "..", 3)) error = 1; folio_release_kmap(folio, dir_entry); if (error) return "error reading root directory."; return NULL; } #ifdef CONFIG_QNX6FS_DEBUG void qnx6_superblock_debug(struct qnx6_super_block *sb, struct super_block *s) { struct qnx6_sb_info *sbi = QNX6_SB(s); pr_debug("magic: %08x\n", fs32_to_cpu(sbi, sb->sb_magic)); pr_debug("checksum: %08x\n", fs32_to_cpu(sbi, sb->sb_checksum)); pr_debug("serial: %llx\n", fs64_to_cpu(sbi, sb->sb_serial)); pr_debug("flags: %08x\n", fs32_to_cpu(sbi, sb->sb_flags)); pr_debug("blocksize: %08x\n", fs32_to_cpu(sbi, sb->sb_blocksize)); pr_debug("num_inodes: %08x\n", fs32_to_cpu(sbi, sb->sb_num_inodes)); pr_debug("free_inodes: %08x\n", fs32_to_cpu(sbi, sb->sb_free_inodes)); pr_debug("num_blocks: %08x\n", fs32_to_cpu(sbi, sb->sb_num_blocks)); pr_debug("free_blocks: %08x\n", fs32_to_cpu(sbi, sb->sb_free_blocks)); pr_debug("inode_levels: %02x\n", sb->Inode.levels); } #endif enum { Opt_mmifs }; struct qnx6_context { unsigned long s_mount_opts; }; static const struct fs_parameter_spec qnx6_param_spec[] = { fsparam_flag ("mmi_fs", Opt_mmifs), {} }; static int qnx6_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct qnx6_context *ctx = fc->fs_private; struct fs_parse_result result; int opt; opt = fs_parse(fc, qnx6_param_spec, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_mmifs: ctx->s_mount_opts |= QNX6_MOUNT_MMI_FS; break; default: return -EINVAL; } return 0; } static struct buffer_head *qnx6_check_first_superblock(struct super_block *s, int offset, int silent) { struct qnx6_sb_info *sbi = QNX6_SB(s); struct buffer_head *bh; struct qnx6_super_block *sb; /* Check the superblock signatures start with the first superblock */ bh = sb_bread(s, offset); if (!bh) { pr_err("unable to read the first superblock\n"); return NULL; } sb = (struct qnx6_super_block *)bh->b_data; if (fs32_to_cpu(sbi, sb->sb_magic) != QNX6_SUPER_MAGIC) { sbi->s_bytesex = BYTESEX_BE; if (fs32_to_cpu(sbi, sb->sb_magic) == QNX6_SUPER_MAGIC) { /* we got a big endian fs */ pr_debug("fs got different endianness.\n"); return bh; } else sbi->s_bytesex = BYTESEX_LE; if (!silent) { if (offset == 0) { pr_err("wrong signature (magic) in superblock #1.\n"); } else { pr_info("wrong signature (magic) at position (0x%lx) - will try alternative position (0x0000).\n", offset * s->s_blocksize); } } brelse(bh); return NULL; } return bh; } static struct inode *qnx6_private_inode(struct super_block *s, struct qnx6_root_node *p); static int qnx6_fill_super(struct super_block 
*s, struct fs_context *fc) { struct buffer_head *bh1 = NULL, *bh2 = NULL; struct qnx6_super_block *sb1 = NULL, *sb2 = NULL; struct qnx6_sb_info *sbi; struct qnx6_context *ctx = fc->fs_private; struct inode *root; const char *errmsg; struct qnx6_sb_info *qs; int ret = -EINVAL; u64 offset; int bootblock_offset = QNX6_BOOTBLOCK_SIZE; int silent = fc->sb_flags & SB_SILENT; qs = kzalloc_obj(struct qnx6_sb_info); if (!qs) return -ENOMEM; s->s_fs_info = qs; qs->s_mount_opt = ctx->s_mount_opts; /* Superblock always is 512 Byte long */ if (!sb_set_blocksize(s, QNX6_SUPERBLOCK_SIZE)) { pr_err("unable to set blocksize\n"); goto outnobh; } if (qs->s_mount_opt == QNX6_MOUNT_MMI_FS) { sb1 = qnx6_mmi_fill_super(s, silent); if (sb1) goto mmi_success; else goto outnobh; } sbi = QNX6_SB(s); sbi->s_bytesex = BYTESEX_LE; /* Check the superblock signatures start with the first superblock */ bh1 = qnx6_check_first_superblock(s, bootblock_offset / QNX6_SUPERBLOCK_SIZE, silent); if (!bh1) { /* try again without bootblock offset */ bh1 = qnx6_check_first_superblock(s, 0, silent); if (!bh1) { pr_err("unable to read the first superblock\n"); goto outnobh; } /* seems that no bootblock at partition start */ bootblock_offset = 0; } sb1 = (struct qnx6_super_block *)bh1->b_data; #ifdef CONFIG_QNX6FS_DEBUG qnx6_superblock_debug(sb1, s); #endif /* checksum check - start at byte 8 and end at byte 512 */ if (fs32_to_cpu(sbi, sb1->sb_checksum) != crc32_be(0, (char *)(bh1->b_data + 8), 504)) { pr_err("superblock #1 checksum error\n"); goto out; } /* set new blocksize */ if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) { pr_err("unable to set blocksize\n"); goto out; } /* blocksize invalidates bh - pull it back in */ brelse(bh1); bh1 = sb_bread(s, bootblock_offset >> s->s_blocksize_bits); if (!bh1) goto outnobh; sb1 = (struct qnx6_super_block *)bh1->b_data; /* calculate second superblock blocknumber */ offset = fs32_to_cpu(sbi, sb1->sb_num_blocks) + (bootblock_offset >> s->s_blocksize_bits) + (QNX6_SUPERBLOCK_AREA >> s->s_blocksize_bits); /* set bootblock offset */ sbi->s_blks_off = (bootblock_offset >> s->s_blocksize_bits) + (QNX6_SUPERBLOCK_AREA >> s->s_blocksize_bits); /* next the second superblock */ bh2 = sb_bread(s, offset); if (!bh2) { pr_err("unable to read the second superblock\n"); goto out; } sb2 = (struct qnx6_super_block *)bh2->b_data; if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) { if (!silent) pr_err("wrong signature (magic) in superblock #2.\n"); goto out; } /* checksum check - start at byte 8 and end at byte 512 */ if (fs32_to_cpu(sbi, sb2->sb_checksum) != crc32_be(0, (char *)(bh2->b_data + 8), 504)) { pr_err("superblock #2 checksum error\n"); goto out; } if (fs64_to_cpu(sbi, sb1->sb_serial) >= fs64_to_cpu(sbi, sb2->sb_serial)) { /* superblock #1 active */ sbi->sb_buf = bh1; sbi->sb = (struct qnx6_super_block *)bh1->b_data; brelse(bh2); pr_info("superblock #1 active\n"); } else { /* superblock #2 active */ sbi->sb_buf = bh2; sbi->sb = (struct qnx6_super_block *)bh2->b_data; brelse(bh1); pr_info("superblock #2 active\n"); } mmi_success: /* sanity check - limit maximum indirect pointer levels */ if (sb1->Inode.levels > QNX6_PTR_MAX_LEVELS) { pr_err("too many inode levels (max %i, sb %i)\n", QNX6_PTR_MAX_LEVELS, sb1->Inode.levels); goto out; } if (sb1->Longfile.levels > QNX6_PTR_MAX_LEVELS) { pr_err("too many longfilename levels (max %i, sb %i)\n", QNX6_PTR_MAX_LEVELS, sb1->Longfile.levels); goto out; } s->s_op = &qnx6_sops; s->s_magic = QNX6_SUPER_MAGIC; s->s_flags |= SB_RDONLY; /* Yup, 
read-only yet */ s->s_time_min = 0; s->s_time_max = U32_MAX; /* ease the later tree level calculations */ sbi = QNX6_SB(s); sbi->s_ptrbits = ilog2(s->s_blocksize / 4); sbi->inodes = qnx6_private_inode(s, &sb1->Inode); if (!sbi->inodes) goto out; sbi->longfile = qnx6_private_inode(s, &sb1->Longfile); if (!sbi->longfile) goto out1; /* prefetch root inode */ root = qnx6_iget(s, QNX6_ROOT_INO); if (IS_ERR(root)) { pr_err("get inode failed\n"); ret = PTR_ERR(root); goto out2; } ret = -ENOMEM; s->s_root = d_make_root(root); if (!s->s_root) goto out2; ret = -EINVAL; errmsg = qnx6_checkroot(s); if (errmsg != NULL) { if (!silent) pr_err("%s\n", errmsg); goto out3; } return 0; out3: dput(s->s_root); s->s_root = NULL; out2: iput(sbi->longfile); out1: iput(sbi->inodes); out: brelse(bh1); brelse(bh2); outnobh: kfree(qs); s->s_fs_info = NULL; return ret; } static void qnx6_put_super(struct super_block *sb) { struct qnx6_sb_info *qs = QNX6_SB(sb); brelse(qs->sb_buf); iput(qs->longfile); iput(qs->inodes); kfree(qs); sb->s_fs_info = NULL; return; } static sector_t qnx6_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping, block, qnx6_get_block); } static const struct address_space_operations qnx6_aops = { .read_folio = qnx6_read_folio, .readahead = qnx6_readahead, .bmap = qnx6_bmap }; static struct inode *qnx6_private_inode(struct super_block *s, struct qnx6_root_node *p) { struct inode *inode = new_inode(s); if (inode) { struct qnx6_inode_info *ei = QNX6_I(inode); struct qnx6_sb_info *sbi = QNX6_SB(s); inode->i_size = fs64_to_cpu(sbi, p->size); memcpy(ei->di_block_ptr, p->ptr, sizeof(p->ptr)); ei->di_filelevels = p->levels; inode->i_mode = S_IFREG | S_IRUSR; /* probably wrong */ inode->i_mapping->a_ops = &qnx6_aops; } return inode; } struct inode *qnx6_iget(struct super_block *sb, unsigned ino) { struct qnx6_sb_info *sbi = QNX6_SB(sb); struct qnx6_inode_entry *raw_inode; struct inode *inode; struct qnx6_inode_info *ei; struct address_space *mapping; struct folio *folio; u32 n, offs; inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode_state_read_once(inode) & I_NEW)) return inode; ei = QNX6_I(inode); inode->i_mode = 0; if (ino == 0) { pr_err("bad inode number on dev %s: %u is out of range\n", sb->s_id, ino); iget_failed(inode); return ERR_PTR(-EIO); } n = (ino - 1) >> (PAGE_SHIFT - QNX6_INODE_SIZE_BITS); mapping = sbi->inodes->i_mapping; folio = read_mapping_folio(mapping, n, NULL); if (IS_ERR(folio)) { pr_err("major problem: unable to read inode from dev %s\n", sb->s_id); iget_failed(inode); return ERR_CAST(folio); } offs = offset_in_folio(folio, (ino - 1) << QNX6_INODE_SIZE_BITS); raw_inode = kmap_local_folio(folio, offs); inode->i_mode = fs16_to_cpu(sbi, raw_inode->di_mode); i_uid_write(inode, (uid_t)fs32_to_cpu(sbi, raw_inode->di_uid)); i_gid_write(inode, (gid_t)fs32_to_cpu(sbi, raw_inode->di_gid)); inode->i_size = fs64_to_cpu(sbi, raw_inode->di_size); inode_set_mtime(inode, fs32_to_cpu(sbi, raw_inode->di_mtime), 0); inode_set_atime(inode, fs32_to_cpu(sbi, raw_inode->di_atime), 0); inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->di_ctime), 0); /* calc blocks based on 512 byte blocksize */ inode->i_blocks = (inode->i_size + 511) >> 9; memcpy(&ei->di_block_ptr, &raw_inode->di_block_ptr, sizeof(raw_inode->di_block_ptr)); ei->di_filelevels = raw_inode->di_filelevels; if (S_ISREG(inode->i_mode)) { inode->i_fop = &generic_ro_fops; inode->i_mapping->a_ops = &qnx6_aops; } else if (S_ISDIR(inode->i_mode)) { inode->i_op = 
&qnx6_dir_inode_operations; inode->i_fop = &qnx6_dir_operations; inode->i_mapping->a_ops = &qnx6_aops; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &page_symlink_inode_operations; inode_nohighmem(inode); inode->i_mapping->a_ops = &qnx6_aops; } else init_special_inode(inode, inode->i_mode, 0); folio_release_kmap(folio, raw_inode); unlock_new_inode(inode); return inode; } static struct kmem_cache *qnx6_inode_cachep; static struct inode *qnx6_alloc_inode(struct super_block *sb) { struct qnx6_inode_info *ei; ei = alloc_inode_sb(sb, qnx6_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; } static void qnx6_free_inode(struct inode *inode) { kmem_cache_free(qnx6_inode_cachep, QNX6_I(inode)); } static void init_once(void *foo) { struct qnx6_inode_info *ei = (struct qnx6_inode_info *) foo; inode_init_once(&ei->vfs_inode); } static int init_inodecache(void) { qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache", sizeof(struct qnx6_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_ACCOUNT), init_once); if (!qnx6_inode_cachep) return -ENOMEM; return 0; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(qnx6_inode_cachep); } static int qnx6_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, qnx6_fill_super); } static void qnx6_free_fc(struct fs_context *fc) { kfree(fc->fs_private); } static const struct fs_context_operations qnx6_context_ops = { .parse_param = qnx6_parse_param, .get_tree = qnx6_get_tree, .reconfigure = qnx6_reconfigure, .free = qnx6_free_fc, }; static int qnx6_init_fs_context(struct fs_context *fc) { struct qnx6_context *ctx; ctx = kzalloc_obj(struct qnx6_context); if (!ctx) return -ENOMEM; fc->ops = &qnx6_context_ops; fc->fs_private = ctx; return 0; } static struct file_system_type qnx6_fs_type = { .owner = THIS_MODULE, .name = "qnx6", .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, .init_fs_context = qnx6_init_fs_context, .parameters = qnx6_param_spec, }; MODULE_ALIAS_FS("qnx6"); static int __init init_qnx6_fs(void) { int err; err = init_inodecache(); if (err) return err; err = register_filesystem(&qnx6_fs_type); if (err) { destroy_inodecache(); return err; } pr_info("QNX6 filesystem 1.0.0 registered.\n"); return 0; } static void __exit exit_qnx6_fs(void) { unregister_filesystem(&qnx6_fs_type); destroy_inodecache(); } module_init(init_qnx6_fs) module_exit(exit_qnx6_fs) MODULE_DESCRIPTION("QNX6 file system"); MODULE_LICENSE("GPL"); |
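The fill_super path above mounts whichever of the two 512-byte superblocks carries the higher serial number, after verifying each copy's CRC-32 over bytes 8-511 (crc32_be(0, data + 8, 504)). The following is a minimal userspace-style sketch of that selection rule, not the driver's code: the struct layout, field names and helpers are illustrative only, and crc32_be_sketch merely mimics the kernel's big-endian CRC-32 (polynomial 0x04C11DB7, caller-supplied seed, no final inversion).

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Bitwise big-endian CRC-32 (poly 0x04C11DB7, no final inversion). */
static uint32_t crc32_be_sketch(uint32_t crc, const uint8_t *p, size_t len)
{
        while (len--) {
                crc ^= (uint32_t)*p++ << 24;
                for (int bit = 0; bit < 8; bit++)
                        crc = (crc & 0x80000000u) ? (crc << 1) ^ 0x04C11DB7u
                                                  : crc << 1;
        }
        return crc;
}

/* Illustrative 512-byte superblock: magic, checksum and serial up front. */
struct sb_sketch {
        uint32_t magic;
        uint32_t checksum;      /* covers bytes 8..511 */
        uint64_t serial;        /* bumped on every commit */
        uint8_t  rest[496];
};

static bool sb_checksum_ok(const struct sb_sketch *sb)
{
        const uint8_t *raw = (const uint8_t *)sb;

        /* Same range as crc32_be(0, bh->b_data + 8, 504) above. */
        return sb->checksum == crc32_be_sketch(0, raw + 8, 504);
}

/*
 * Mirror the policy above: refuse the filesystem if either copy fails its
 * checksum, otherwise the copy with the higher serial is the active one
 * (ties go to superblock #1).
 */
static const struct sb_sketch *pick_active_sb(const struct sb_sketch *sb1,
                                              const struct sb_sketch *sb2)
{
        if (!sb_checksum_ok(sb1) || !sb_checksum_ok(sb2))
                return NULL;
        return sb1->serial >= sb2->serial ? sb1 : sb2;
}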
// SPDX-License-Identifier: GPL-2.0 #include <linux/err.h> #include <linux/bug.h> #include <linux/atomic.h> #include <linux/errseq.h> #include <linux/log2.h> /* * An errseq_t is a way of recording errors in one place, and allowing any * number of "subscribers" to tell whether it has changed since a previous * point where it was sampled. * * It's implemented as an unsigned 32-bit value. The low order bits are * designated to hold an error code (between 0 and -MAX_ERRNO). The upper bits * are used as a counter. This is done with atomics instead of locking so that * these functions can be called from any context. * * The general idea is for consumers to sample an errseq_t value. That value * can later be used to tell whether any new errors have occurred since that * sampling was done. * * Note that there is a risk of collisions if new errors are being recorded * frequently, since we have so few bits to use as a counter. * * To mitigate this, one bit is used as a flag to tell whether the value has * been sampled since a new value was recorded. That allows us to avoid bumping * the counter if no one has sampled it since the last time an error was * recorded. * * A new errseq_t should always be zeroed out. An errseq_t value of all zeroes * is the special (but common) case where there has never been an error. An all * zero value thus serves as the "epoch" if one wishes to know whether there * has ever been an error set since it was first initialized. */ /* The low bits are designated for error code (max of MAX_ERRNO) */ #define ERRSEQ_SHIFT (ilog2(MAX_ERRNO) + 1) /* This bit is used as a flag to indicate whether the value has been seen */ #define ERRSEQ_SEEN (1 << ERRSEQ_SHIFT) /* Leverage macro ERRSEQ_SEEN to define errno mask macro here */ #define ERRNO_MASK (ERRSEQ_SEEN - 1) /* The lowest bit of the counter */ #define ERRSEQ_CTR_INC (1 << (ERRSEQ_SHIFT + 1)) /** * errseq_set - set an errseq_t for later reporting * @eseq: errseq_t field that should be set * @err: error to set (must be between -1 and -MAX_ERRNO) * * This function sets the error in @eseq, and increments the sequence counter * if the last sequence was sampled at some point in the past. * * Any error set will always overwrite an existing error. * * Return: The previous value, primarily for debugging purposes. The * return value should not be used as a previously sampled value in later * calls as it will not have the SEEN flag set. */ errseq_t errseq_set(errseq_t *eseq, int err) { errseq_t cur, old; /* * Ensure the error code actually fits where we want it to go. If it * doesn't then just throw a warning and don't record anything. We * also don't accept zero here as that would effectively clear a * previous error.
*/ old = READ_ONCE(*eseq); if (WARN(unlikely(err == 0 || (unsigned int)-err > MAX_ERRNO), "err = %d\n", err)) return old; for (;;) { errseq_t new; /* Clear out error bits and set new error */ new = (old & ~(ERRNO_MASK | ERRSEQ_SEEN)) | -err; /* Only increment if someone has looked at it */ if (old & ERRSEQ_SEEN) new += ERRSEQ_CTR_INC; /* If there would be no change, then call it done */ if (new == old) { cur = new; break; } /* Try to swap the new value into place */ cur = cmpxchg(eseq, old, new); /* * Call it success if we did the swap or someone else beat us * to it for the same value. */ if (likely(cur == old || cur == new)) break; /* Raced with an update, try again */ old = cur; } return cur; } EXPORT_SYMBOL(errseq_set); /** * errseq_sample() - Grab current errseq_t value. * @eseq: Pointer to errseq_t to be sampled. * * This function allows callers to initialise their errseq_t variable. * If the error has been "seen", new callers will not see an old error. * If there is an unseen error in @eseq, the caller of this function will * see it the next time it checks for an error. * * Context: Any context. * Return: The current errseq value. */ errseq_t errseq_sample(errseq_t *eseq) { errseq_t old = READ_ONCE(*eseq); /* If nobody has seen this error yet, then we can be the first. */ if (!(old & ERRSEQ_SEEN)) old = 0; return old; } EXPORT_SYMBOL(errseq_sample); /** * errseq_check() - Has an error occurred since a particular sample point? * @eseq: Pointer to errseq_t value to be checked. * @since: Previously-sampled errseq_t from which to check. * * Grab the value that eseq points to, and see if it has changed @since * the given value was sampled. The @since value is not advanced, so there * is no need to mark the value as seen. * * Return: The latest error set in the errseq_t or 0 if it hasn't changed. */ int errseq_check(errseq_t *eseq, errseq_t since) { errseq_t cur = READ_ONCE(*eseq); if (likely(cur == since)) return 0; return -(cur & ERRNO_MASK); } EXPORT_SYMBOL(errseq_check); /** * errseq_check_and_advance() - Check an errseq_t and advance to current value. * @eseq: Pointer to value being checked and reported. * @since: Pointer to previously-sampled errseq_t to check against and advance. * * Grab the eseq value, and see whether it matches the value that @since * points to. If it does, then just return 0. * * If it doesn't, then the value has changed. Set the "seen" flag, and try to * swap it into place as the new eseq value. Then, set that value as the new * "since" value, and return whatever the error portion is set to. * * Note that no locking is provided here for concurrent updates to the "since" * value. The caller must provide that if necessary. Because of this, callers * may want to do a lockless errseq_check before taking the lock and calling * this. * * Return: Negative errno if one has been stored, or 0 if no new error has * occurred. */ int errseq_check_and_advance(errseq_t *eseq, errseq_t *since) { int err = 0; errseq_t old, new; /* * Most callers will want to use the inline wrapper to check this, * so that the common case of no error is handled without needing * to take the lock that protects the "since" value. */ old = READ_ONCE(*eseq); if (old != *since) { /* * Set the flag and try to swap it into place if it has * changed. * * We don't care about the outcome of the swap here. 
If the * swap doesn't occur, then it has either been updated by a * writer who is altering the value in some way (updating * counter or resetting the error), or another reader who is * just setting the "seen" flag. Either outcome is OK, and we * can advance "since" and return an error based on what we * have. */ new = old | ERRSEQ_SEEN; if (new != old) cmpxchg(eseq, old, new); *since = new; err = -(new & ERRNO_MASK); } return err; } EXPORT_SYMBOL(errseq_check_and_advance); |
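The comment block at the top of this file describes a write-once, read-by-many-subscribers pattern. The sketch below is kernel-style illustration code, not something in the tree: the wb_err_cursor struct and the function names around it are made up, but the calls are the API defined above, used the way a per-file writeback-error cursor would use it. Each subscriber samples the shared errseq_t once, and errseq_check_and_advance() then reports any later error to every subscriber exactly once.

#include <linux/errseq.h>       /* errseq_t and the helpers defined above */

/* Hypothetical per-subscriber state, e.g. one per open file description. */
struct wb_err_cursor {
        errseq_t sampled;
};

static errseq_t shared_wb_err;  /* zero-initialised: "no error yet" */

static void subscriber_open(struct wb_err_cursor *c)
{
        /* Errors recorded before this point are deliberately ignored. */
        c->sampled = errseq_sample(&shared_wb_err);
}

static void writeback_failed(int err)
{
        /*
         * Writer side: record e.g. -EIO; the counter only advances if
         * someone has already seen the previous value.
         */
        errseq_set(&shared_wb_err, err);
}

static int subscriber_fsync(struct wb_err_cursor *c)
{
        /*
         * Returns the new error once per subscriber, then 0 again until
         * the next errseq_set().
         */
        return errseq_check_and_advance(&shared_wb_err, &c->sampled);
}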
// SPDX-License-Identifier: GPL-2.0-or-later /* * NET3: Implementation of the ICMP protocol layer. * * Alan Cox, <alan@lxorguk.ukuu.org.uk> * * Some of the function names and the icmp unreach table for this * module were derived from [icmp.c 1.0.11 06/02/93] by * Ross Biro, Fred N. van Kempen, Mark Evans, Alan Cox, Gerhard Koerting. * Other than that this module is a complete rewrite. * * Fixes: * Clemens Fruhwirth : introduce global icmp rate limiting * with icmp type masking ability instead * of broken per type icmp timeouts. * Mike Shaver : RFC1122 checks. * Alan Cox : Multicast ping reply as self. * Alan Cox : Fix atomicity lockup in ip_build_xmit * call. * Alan Cox : Added 216,128 byte paths to the MTU * code. * Martin Mares : RFC1812 checks. * Martin Mares : Can be configured to follow redirects * if acting as a router _without_ a * routing protocol (RFC 1812). * Martin Mares : Echo requests may be configured to * be ignored (RFC 1812). * Martin Mares : Limitation of ICMP error message * transmit rate (RFC 1812). * Martin Mares : TOS and Precedence set correctly * (RFC 1812). * Martin Mares : Now copying as much data from the * original packet as we can without * exceeding 576 bytes (RFC 1812). * Willy Konynenberg : Transparent proxying support. * Keith Owens : RFC1191 correction for 4.2BSD based * path MTU bug. * Thomas Quinot : ICMP Dest Unreach codes up to 15 are * valid (RFC 1812). * Andi Kleen : Check all packet lengths properly * and moved all kfree_skb() up to * icmp_rcv. * Andi Kleen : Move the rate limit bookkeeping * into the dest entry and use a token * bucket filter (thanks to ANK). Make * the rates sysctl configurable. * Yu Tianli : Fixed two ugly bugs in icmp_send * - IP option length was accounted wrongly * - ICMP header length was not accounted * at all. * Tristan Greaves : Added sysctl option to ignore bogus * broadcast responses from broken routers. * * To Fix: * * - Should use skb_pull() instead of all the manual checking. * This would also greatly simplify some upper layer error handlers.
--AK */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/fcntl.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/string.h> #include <linux/netfilter_ipv4.h> #include <linux/slab.h> #include <net/flow.h> #include <net/snmp.h> #include <net/ip.h> #include <net/route.h> #include <net/protocol.h> #include <net/icmp.h> #include <net/tcp.h> #include <net/udp.h> #include <net/raw.h> #include <net/ping.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/errno.h> #include <linux/timer.h> #include <linux/init.h> #include <linux/uaccess.h> #include <net/checksum.h> #include <net/xfrm.h> #include <net/inet_common.h> #include <net/ip_fib.h> #include <net/l3mdev.h> #include <net/addrconf.h> #include <net/inet_dscp.h> #define CREATE_TRACE_POINTS #include <trace/events/icmp.h> /* * Build xmit assembly blocks */ struct icmp_bxm { struct sk_buff *skb; int offset; int data_len; struct { struct icmphdr icmph; __be32 times[3]; } data; int head_len; /* Must be last as it ends in a flexible-array member. */ struct ip_options_rcu replyopts; }; /* An array of errno for error messages from dest unreach. */ /* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */ const struct icmp_err icmp_err_convert[] = { { .errno = ENETUNREACH, /* ICMP_NET_UNREACH */ .fatal = 0, }, { .errno = EHOSTUNREACH, /* ICMP_HOST_UNREACH */ .fatal = 0, }, { .errno = ENOPROTOOPT /* ICMP_PROT_UNREACH */, .fatal = 1, }, { .errno = ECONNREFUSED, /* ICMP_PORT_UNREACH */ .fatal = 1, }, { .errno = EMSGSIZE, /* ICMP_FRAG_NEEDED */ .fatal = 0, }, { .errno = EOPNOTSUPP, /* ICMP_SR_FAILED */ .fatal = 0, }, { .errno = ENETUNREACH, /* ICMP_NET_UNKNOWN */ .fatal = 1, }, { .errno = EHOSTDOWN, /* ICMP_HOST_UNKNOWN */ .fatal = 1, }, { .errno = ENONET, /* ICMP_HOST_ISOLATED */ .fatal = 1, }, { .errno = ENETUNREACH, /* ICMP_NET_ANO */ .fatal = 1, }, { .errno = EHOSTUNREACH, /* ICMP_HOST_ANO */ .fatal = 1, }, { .errno = ENETUNREACH, /* ICMP_NET_UNR_TOS */ .fatal = 0, }, { .errno = EHOSTUNREACH, /* ICMP_HOST_UNR_TOS */ .fatal = 0, }, { .errno = EHOSTUNREACH, /* ICMP_PKT_FILTERED */ .fatal = 1, }, { .errno = EHOSTUNREACH, /* ICMP_PREC_VIOLATION */ .fatal = 1, }, { .errno = EHOSTUNREACH, /* ICMP_PREC_CUTOFF */ .fatal = 1, }, }; EXPORT_SYMBOL(icmp_err_convert); /* * ICMP control array. This specifies what to do with each ICMP. */ struct icmp_control { enum skb_drop_reason (*handler)(struct sk_buff *skb); short error; /* This ICMP is classed as an error message */ }; static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; static DEFINE_PER_CPU(struct sock *, ipv4_icmp_sk); /* Called with BH disabled */ static inline struct sock *icmp_xmit_lock(struct net *net) { struct sock *sk; sk = this_cpu_read(ipv4_icmp_sk); if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { /* This can happen if the output path signals a * dst_link_failure() for an outgoing ICMP packet. */ return NULL; } sock_net_set(sk, net); return sk; } static inline void icmp_xmit_unlock(struct sock *sk) { sock_net_set(sk, &init_net); spin_unlock(&sk->sk_lock.slock); } /** * icmp_global_allow - Are we allowed to send one more ICMP message ? * @net: network namespace * * Uses a token bucket to limit our ICMP messages to ~sysctl_icmp_msgs_per_sec. * Returns false if we reached the limit and can not send another packet. 
* Works in tandem with icmp_global_consume(). */ bool icmp_global_allow(struct net *net) { u32 delta, now, oldstamp; int incr, new, old; /* Note: many cpus could find this condition true. * Then later icmp_global_consume() could consume more credits, * this is an acceptable race. */ if (atomic_read(&net->ipv4.icmp_global_credit) > 0) return true; now = jiffies; oldstamp = READ_ONCE(net->ipv4.icmp_global_stamp); delta = min_t(u32, now - oldstamp, HZ); if (delta < HZ / 50) return false; incr = READ_ONCE(net->ipv4.sysctl_icmp_msgs_per_sec); incr = div_u64((u64)incr * delta, HZ); if (!incr) return false; if (cmpxchg(&net->ipv4.icmp_global_stamp, oldstamp, now) == oldstamp) { old = atomic_read(&net->ipv4.icmp_global_credit); do { new = min(old + incr, READ_ONCE(net->ipv4.sysctl_icmp_msgs_burst)); } while (!atomic_try_cmpxchg(&net->ipv4.icmp_global_credit, &old, new)); } return true; } void icmp_global_consume(struct net *net) { int credits = get_random_u32_below(3); /* Note: this might make icmp_global.credit negative. */ if (credits) atomic_sub(credits, &net->ipv4.icmp_global_credit); } static bool icmpv4_mask_allow(struct net *net, int type, int code) { if (type > NR_ICMP_TYPES) return true; /* Don't limit PMTU discovery. */ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) return true; /* Limit if icmp type is enabled in ratemask. */ if (!((1 << type) & READ_ONCE(net->ipv4.sysctl_icmp_ratemask))) return true; return false; } static bool icmpv4_global_allow(struct net *net, int type, int code, bool *apply_ratelimit) { if (icmpv4_mask_allow(net, type, code)) return true; if (icmp_global_allow(net)) { *apply_ratelimit = true; return true; } __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL); return false; } /* * Send an ICMP frame. */ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, struct flowi4 *fl4, int type, int code, bool apply_ratelimit) { struct dst_entry *dst = &rt->dst; struct inet_peer *peer; struct net_device *dev; int peer_timeout; bool rc = true; if (!apply_ratelimit) return true; peer_timeout = READ_ONCE(net->ipv4.sysctl_icmp_ratelimit); if (!peer_timeout) goto out; /* No rate limit on loopback */ rcu_read_lock(); dev = dst_dev_rcu(dst); if (dev && (dev->flags & IFF_LOOPBACK)) goto out_unlock; peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, l3mdev_master_ifindex_rcu(dev)); rc = inet_peer_xrlim_allow(peer, peer_timeout); out_unlock: rcu_read_unlock(); out: if (!rc) __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST); else icmp_global_consume(net); return rc; } /* * Maintain the counters used in the SNMP statistics for outgoing ICMP */ void icmp_out_count(struct net *net, unsigned char type) { ICMPMSGOUT_INC_STATS(net, type); ICMP_INC_STATS(net, ICMP_MIB_OUTMSGS); } /* * Checksum each fragment, and on the first include the headers and final * checksum. 
*/ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data, IP_OPTIONS_DATA_FIXED_SIZE); __wsum csum; icmp_param = from; csum = skb_copy_and_csum_bits(icmp_param->skb, icmp_param->offset + offset, to, len); skb->csum = csum_block_add(skb->csum, csum, odd); if (icmp_pointers[icmp_param->data.icmph.type].error) nf_ct_attach(skb, icmp_param->skb); return 0; } static void icmp_push_reply(struct sock *sk, struct icmp_bxm *icmp_param, struct flowi4 *fl4, struct ipcm_cookie *ipc, struct rtable **rt) { struct sk_buff *skb; if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param, icmp_param->data_len+icmp_param->head_len, icmp_param->head_len, ipc, rt, MSG_DONTWAIT) < 0) { __ICMP_INC_STATS(sock_net(sk), ICMP_MIB_OUTERRORS); ip_flush_pending_frames(sk); } else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { struct icmphdr *icmph = icmp_hdr(skb); __wsum csum; struct sk_buff *skb1; csum = csum_partial_copy_nocheck((void *)&icmp_param->data, (char *)icmph, icmp_param->head_len); skb_queue_walk(&sk->sk_write_queue, skb1) { csum = csum_add(csum, skb1->csum); } icmph->checksum = csum_fold(csum); skb->ip_summed = CHECKSUM_NONE; ip_push_pending_frames(sk, fl4); } } /* * Driving logic for building and sending ICMP messages. */ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct net *net = dev_net_rcu(rt->dst.dev); bool apply_ratelimit = false; struct ipcm_cookie ipc; struct flowi4 fl4; struct sock *sk; __be32 daddr, saddr; u32 mark = IP4_REPLY_MARK(net, skb->mark); int type = icmp_param->data.icmph.type; int code = icmp_param->data.icmph.code; if (ip_options_echo(net, &icmp_param->replyopts.opt, skb)) return; /* Needed by both icmpv4_global_allow and icmp_xmit_lock */ local_bh_disable(); /* is global icmp_msgs_per_sec exhausted ? */ if (!icmpv4_global_allow(net, type, code, &apply_ratelimit)) goto out_bh_enable; sk = icmp_xmit_lock(net); if (!sk) goto out_bh_enable; icmp_param->data.icmph.checksum = 0; ipcm_init(&ipc); ipc.tos = ip_hdr(skb)->tos; ipc.sockc.mark = mark; daddr = ipc.addr = ip_hdr(skb)->saddr; saddr = fib_compute_spec_dst(skb); if (icmp_param->replyopts.opt.optlen) { ipc.opt = &icmp_param->replyopts; if (ipc.opt->opt.srr) daddr = icmp_param->replyopts.opt.faddr; } memset(&fl4, 0, sizeof(fl4)); fl4.daddr = daddr; fl4.saddr = saddr; fl4.flowi4_mark = mark; fl4.flowi4_uid = sock_net_uid(net, NULL); fl4.flowi4_dscp = ip4h_dscp(ip_hdr(skb)); fl4.flowi4_proto = IPPROTO_ICMP; fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev); security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) goto out_unlock; if (icmpv4_xrlim_allow(net, rt, &fl4, type, code, apply_ratelimit)) icmp_push_reply(sk, icmp_param, &fl4, &ipc, &rt); ip_rt_put(rt); out_unlock: icmp_xmit_unlock(sk); out_bh_enable: local_bh_enable(); } /* * The device used for looking up which routing table to use for sending an ICMP * error is preferably the source whenever it is set, which should ensure the * icmp error can be sent to the source host, else lookup using the routing * table of the destination device, else use the main routing table (index 0). */ static struct net_device *icmp_get_route_lookup_dev(struct sk_buff *skb) { struct net_device *dev = skb->dev; const struct dst_entry *dst; if (dev) return dev; dst = skb_dst(skb); return dst ? 
dst_dev(dst) : NULL; } static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, struct sk_buff *skb_in, const struct iphdr *iph, __be32 saddr, dscp_t dscp, u32 mark, int type, int code, struct icmp_bxm *param) { struct net_device *route_lookup_dev; struct dst_entry *dst, *dst2; struct rtable *rt, *rt2; struct flowi4 fl4_dec; int err; memset(fl4, 0, sizeof(*fl4)); fl4->daddr = (param->replyopts.opt.srr ? param->replyopts.opt.faddr : iph->saddr); fl4->saddr = saddr; fl4->flowi4_mark = mark; fl4->flowi4_uid = sock_net_uid(net, NULL); fl4->flowi4_dscp = dscp; fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; route_lookup_dev = icmp_get_route_lookup_dev(skb_in); fl4->flowi4_oif = l3mdev_master_ifindex(route_lookup_dev); security_skb_classify_flow(skb_in, flowi4_to_flowi_common(fl4)); rt = ip_route_output_key_hash(net, fl4, skb_in); if (IS_ERR(rt)) return rt; /* No need to clone since we're just using its address. */ rt2 = rt; dst = xfrm_lookup(net, &rt->dst, flowi4_to_flowi(fl4), NULL, 0); rt = dst_rtable(dst); if (!IS_ERR(dst)) { if (rt != rt2) return rt; if (inet_addr_type_dev_table(net, route_lookup_dev, fl4->daddr) == RTN_LOCAL) return rt; } else if (PTR_ERR(dst) == -EPERM) { rt = NULL; } else { return rt; } err = xfrm_decode_session_reverse(net, skb_in, flowi4_to_flowi(&fl4_dec), AF_INET); if (err) goto relookup_failed; if (inet_addr_type_dev_table(net, route_lookup_dev, fl4_dec.saddr) == RTN_LOCAL) { rt2 = __ip_route_output_key(net, &fl4_dec); if (IS_ERR(rt2)) err = PTR_ERR(rt2); } else { struct flowi4 fl4_2 = {}; unsigned long orefdst; fl4_2.daddr = fl4_dec.saddr; rt2 = ip_route_output_key(net, &fl4_2); if (IS_ERR(rt2)) { err = PTR_ERR(rt2); goto relookup_failed; } /* Ugh! */ orefdst = skb_dstref_steal(skb_in); err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr, dscp, rt2->dst.dev) ? -EINVAL : 0; dst_release(&rt2->dst); rt2 = skb_rtable(skb_in); /* steal dst entry from skb_in, don't drop refcnt */ skb_dstref_steal(skb_in); skb_dstref_restore(skb_in, orefdst); /* * At this point, fl4_dec.daddr should NOT be local (we * checked fl4_dec.saddr above). However, a race condition * may occur if the address is added to the interface * concurrently. In that case, ip_route_input() returns a * LOCAL route with dst.output=ip_rt_bug, which must not * be used for output. */ if (!err && rt2 && rt2->rt_type == RTN_LOCAL) { net_warn_ratelimited("detected local route for %pI4 during ICMP sending, src %pI4\n", &fl4_dec.daddr, &fl4_dec.saddr); dst_release(&rt2->dst); err = -EINVAL; } } if (err) goto relookup_failed; dst2 = xfrm_lookup(net, &rt2->dst, flowi4_to_flowi(&fl4_dec), NULL, XFRM_LOOKUP_ICMP); rt2 = dst_rtable(dst2); if (!IS_ERR(dst2)) { dst_release(&rt->dst); rt = rt2; } else if (PTR_ERR(dst2) == -EPERM) { if (rt) dst_release(&rt->dst); return rt2; } else { err = PTR_ERR(dst2); goto relookup_failed; } return rt; relookup_failed: if (rt) return rt; return ERR_PTR(err); } struct icmp_ext_iio_addr4_subobj { __be16 afi; __be16 reserved; __be32 addr4; }; static unsigned int icmp_ext_iio_len(void) { return sizeof(struct icmp_extobj_hdr) + /* ifIndex */ sizeof(__be32) + /* Interface Address Sub-Object */ sizeof(struct icmp_ext_iio_addr4_subobj) + /* Interface Name Sub-Object. Length must be a multiple of 4 * bytes. 
*/ ALIGN(sizeof(struct icmp_ext_iio_name_subobj), 4) + /* MTU */ sizeof(__be32); } static unsigned int icmp_ext_max_len(u8 ext_objs) { unsigned int ext_max_len; ext_max_len = sizeof(struct icmp_ext_hdr); if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF)) ext_max_len += icmp_ext_iio_len(); return ext_max_len; } static __be32 icmp_ext_iio_addr4_find(const struct net_device *dev) { struct in_device *in_dev; struct in_ifaddr *ifa; in_dev = __in_dev_get_rcu(dev); if (!in_dev) return 0; /* It is unclear from RFC 5837 which IP address should be chosen, but * it makes sense to choose a global unicast address. */ in_dev_for_each_ifa_rcu(ifa, in_dev) { if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY) continue; if (ifa->ifa_scope != RT_SCOPE_UNIVERSE || ipv4_is_multicast(ifa->ifa_address)) continue; return ifa->ifa_address; } return 0; } static void icmp_ext_iio_iif_append(struct net *net, struct sk_buff *skb, int iif) { struct icmp_ext_iio_name_subobj *name_subobj; struct icmp_extobj_hdr *objh; struct net_device *dev; __be32 data; if (!iif) return; /* Add the fields in the order specified by RFC 5837. */ objh = skb_put(skb, sizeof(*objh)); objh->class_num = ICMP_EXT_OBJ_CLASS_IIO; objh->class_type = ICMP_EXT_CTYPE_IIO_ROLE(ICMP_EXT_CTYPE_IIO_ROLE_IIF); data = htonl(iif); skb_put_data(skb, &data, sizeof(__be32)); objh->class_type |= ICMP_EXT_CTYPE_IIO_IFINDEX; rcu_read_lock(); dev = dev_get_by_index_rcu(net, iif); if (!dev) goto out; data = icmp_ext_iio_addr4_find(dev); if (data) { struct icmp_ext_iio_addr4_subobj *addr4_subobj; addr4_subobj = skb_put_zero(skb, sizeof(*addr4_subobj)); addr4_subobj->afi = htons(ICMP_AFI_IP); addr4_subobj->addr4 = data; objh->class_type |= ICMP_EXT_CTYPE_IIO_IPADDR; } name_subobj = skb_put_zero(skb, ALIGN(sizeof(*name_subobj), 4)); name_subobj->len = ALIGN(sizeof(*name_subobj), 4); netdev_copy_name(dev, name_subobj->name); objh->class_type |= ICMP_EXT_CTYPE_IIO_NAME; data = htonl(READ_ONCE(dev->mtu)); skb_put_data(skb, &data, sizeof(__be32)); objh->class_type |= ICMP_EXT_CTYPE_IIO_MTU; out: rcu_read_unlock(); objh->length = htons(skb_tail_pointer(skb) - (unsigned char *)objh); } static void icmp_ext_objs_append(struct net *net, struct sk_buff *skb, u8 ext_objs, int iif) { if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF)) icmp_ext_iio_iif_append(net, skb, iif); } static struct sk_buff * icmp_ext_append(struct net *net, struct sk_buff *skb_in, struct icmphdr *icmph, unsigned int room, int iif) { unsigned int payload_len, ext_max_len, ext_len; struct icmp_ext_hdr *ext_hdr; struct sk_buff *skb; u8 ext_objs; int nhoff; switch (icmph->type) { case ICMP_DEST_UNREACH: case ICMP_TIME_EXCEEDED: case ICMP_PARAMETERPROB: break; default: return NULL; } ext_objs = READ_ONCE(net->ipv4.sysctl_icmp_errors_extension_mask); if (!ext_objs) return NULL; ext_max_len = icmp_ext_max_len(ext_objs); if (ICMP_EXT_ORIG_DGRAM_MIN_LEN + ext_max_len > room) return NULL; skb = skb_clone(skb_in, GFP_ATOMIC); if (!skb) return NULL; nhoff = skb_network_offset(skb); payload_len = min(skb->len - nhoff, ICMP_EXT_ORIG_DGRAM_MIN_LEN); if (!pskb_network_may_pull(skb, payload_len)) goto free_skb; if (pskb_trim(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN) || __skb_put_padto(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN, false)) goto free_skb; if (pskb_expand_head(skb, 0, ext_max_len, GFP_ATOMIC)) goto free_skb; ext_hdr = skb_put_zero(skb, sizeof(*ext_hdr)); ext_hdr->version = ICMP_EXT_VERSION_2; icmp_ext_objs_append(net, skb, ext_objs, iif); /* Do not send an empty extension structure. 
*/ ext_len = skb_tail_pointer(skb) - (unsigned char *)ext_hdr; if (ext_len == sizeof(*ext_hdr)) goto free_skb; ext_hdr->checksum = ip_compute_csum(ext_hdr, ext_len); /* The length of the original datagram in 32-bit words (RFC 4884). */ icmph->un.reserved[1] = ICMP_EXT_ORIG_DGRAM_MIN_LEN / sizeof(u32); return skb; free_skb: consume_skb(skb); return NULL; } /* * Send an ICMP message in response to a situation * * RFC 1122: 3.2.2 MUST send at least the IP header and 8 bytes of header. * MAY send more (we do). * MUST NOT change this header information. * MUST NOT reply to a multicast/broadcast IP address. * MUST NOT reply to a multicast/broadcast MAC address. * MUST reply to only the first fragment. */ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, const struct inet_skb_parm *parm) { DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data, IP_OPTIONS_DATA_FIXED_SIZE); struct iphdr *iph; int room; struct rtable *rt = skb_rtable(skb_in); bool apply_ratelimit = false; struct sk_buff *ext_skb; struct ipcm_cookie ipc; struct flowi4 fl4; __be32 saddr; u8 tos; u32 mark; struct net *net; struct sock *sk; if (!rt) return; rcu_read_lock(); if (rt->dst.dev) net = dev_net_rcu(rt->dst.dev); else if (skb_in->dev) net = dev_net_rcu(skb_in->dev); else goto out; /* * Find the original header. It is expected to be valid, of course. * Check this, icmp_send is called from the most obscure devices * sometimes. */ iph = ip_hdr(skb_in); if ((u8 *)iph < skb_in->head || (skb_network_header(skb_in) + sizeof(*iph)) > skb_tail_pointer(skb_in)) goto out; /* * No replies to physical multicast/broadcast */ if (skb_in->pkt_type != PACKET_HOST) goto out; /* * Now check at the protocol level */ if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) goto out; /* * Only reply to fragment 0. We byte re-order the constant * mask for efficiency. */ if (iph->frag_off & htons(IP_OFFSET)) goto out; /* * If we send an ICMP error to an ICMP error a mess would result.. */ if (icmp_pointers[type].error) { /* * We are an error, check if we are replying to an * ICMP error */ if (iph->protocol == IPPROTO_ICMP) { u8 _inner_type, *itp; itp = skb_header_pointer(skb_in, skb_network_header(skb_in) + (iph->ihl << 2) + offsetof(struct icmphdr, type) - skb_in->data, sizeof(_inner_type), &_inner_type); if (!itp) goto out; /* * Assume any unknown ICMP type is an error. This * isn't specified by the RFC, but think about it.. */ if (*itp > NR_ICMP_TYPES || icmp_pointers[*itp].error) goto out; } } /* Needed by both icmpv4_global_allow and icmp_xmit_lock */ local_bh_disable(); /* Check global sysctl_icmp_msgs_per_sec ratelimit, unless * incoming dev is loopback. If outgoing dev change to not be * loopback, then peer ratelimit still work (in icmpv4_xrlim_allow) */ if (!(skb_in->dev && (skb_in->dev->flags&IFF_LOOPBACK)) && !icmpv4_global_allow(net, type, code, &apply_ratelimit)) goto out_bh_enable; sk = icmp_xmit_lock(net); if (!sk) goto out_bh_enable; /* * Construct source address and options. */ saddr = iph->daddr; if (!(rt->rt_flags & RTCF_LOCAL)) { struct net_device *dev = NULL; rcu_read_lock(); if (rt_is_input_route(rt) && READ_ONCE(net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)) dev = dev_get_by_index_rcu(net, parm->iif ? parm->iif : inet_iif(skb_in)); if (dev) saddr = inet_select_addr(dev, iph->saddr, RT_SCOPE_LINK); else saddr = 0; rcu_read_unlock(); } tos = icmp_pointers[type].error ? 
(RT_TOS(iph->tos) | IPTOS_PREC_INTERNETCONTROL) : iph->tos; mark = IP4_REPLY_MARK(net, skb_in->mark); if (__ip_options_echo(net, &icmp_param->replyopts.opt, skb_in, &parm->opt)) goto out_unlock; /* * Prepare data for ICMP header. */ icmp_param->data.icmph.type = type; icmp_param->data.icmph.code = code; icmp_param->data.icmph.un.gateway = info; icmp_param->data.icmph.checksum = 0; icmp_param->skb = skb_in; icmp_param->offset = skb_network_offset(skb_in); ipcm_init(&ipc); ipc.tos = tos; ipc.addr = iph->saddr; ipc.opt = &icmp_param->replyopts; ipc.sockc.mark = mark; rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, inet_dsfield_to_dscp(tos), mark, type, code, icmp_param); if (IS_ERR(rt)) goto out_unlock; /* peer icmp_ratelimit */ if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code, apply_ratelimit)) goto ende; /* RFC says return as much as we can without exceeding 576 bytes. */ room = dst4_mtu(&rt->dst); if (room > 576) room = 576; room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.optlen; room -= sizeof(struct icmphdr); /* Guard against tiny mtu. We need to include at least one * IP network header for this message to make any sense. */ if (room <= (int)sizeof(struct iphdr)) goto ende; ext_skb = icmp_ext_append(net, skb_in, &icmp_param->data.icmph, room, parm->iif); if (ext_skb) icmp_param->skb = ext_skb; icmp_param->data_len = icmp_param->skb->len - icmp_param->offset; if (icmp_param->data_len > room) icmp_param->data_len = room; icmp_param->head_len = sizeof(struct icmphdr); /* if we don't have a source address at this point, fall back to the * dummy address instead of sending out a packet with a source address * of 0.0.0.0 */ if (!fl4.saddr) fl4.saddr = htonl(INADDR_DUMMY); trace_icmp_send(skb_in, type, code); icmp_push_reply(sk, icmp_param, &fl4, &ipc, &rt); if (ext_skb) consume_skb(ext_skb); ende: ip_rt_put(rt); out_unlock: icmp_xmit_unlock(sk); out_bh_enable: local_bh_enable(); out: rcu_read_unlock(); } EXPORT_SYMBOL(__icmp_send); #if IS_ENABLED(CONFIG_NF_NAT) #include <net/netfilter/nf_conntrack.h> void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) { struct sk_buff *cloned_skb = NULL; enum ip_conntrack_info ctinfo; enum ip_conntrack_dir dir; struct inet_skb_parm parm; struct nf_conn *ct; __be32 orig_ip; memset(&parm, 0, sizeof(parm)); ct = nf_ct_get(skb_in, &ctinfo); if (!ct || !(READ_ONCE(ct->status) & IPS_NAT_MASK)) { __icmp_send(skb_in, type, code, info, &parm); return; } if (skb_shared(skb_in)) skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC); if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head || (skb_network_header(skb_in) + sizeof(struct iphdr)) > skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in, skb_network_offset(skb_in) + sizeof(struct iphdr)))) goto out; orig_ip = ip_hdr(skb_in)->saddr; dir = CTINFO2DIR(ctinfo); ip_hdr(skb_in)->saddr = ct->tuplehash[dir].tuple.src.u3.ip; __icmp_send(skb_in, type, code, info, &parm); ip_hdr(skb_in)->saddr = orig_ip; out: consume_skb(cloned_skb); } EXPORT_SYMBOL(icmp_ndo_send); #endif static void icmp_socket_deliver(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; const struct net_protocol *ipprot; int protocol = iph->protocol; /* Checkin full IP header plus 8 bytes of protocol to * avoid additional coding at protocol handlers. */ if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) goto out; /* IPPROTO_RAW sockets are not supposed to receive anything. 
*/ if (protocol == IPPROTO_RAW) goto out; raw_icmp_error(skb, protocol, info); ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot && ipprot->err_handler) ipprot->err_handler(skb, info); return; out: __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS); } static bool icmp_tag_validation(int proto) { const struct net_protocol *ipprot; bool ok; rcu_read_lock(); ipprot = rcu_dereference(inet_protos[proto]); ok = ipprot ? ipprot->icmp_strict_tag_validation : false; rcu_read_unlock(); return ok; } /* * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEEDED, ICMP_QUENCH, and * ICMP_PARAMETERPROB. */ static enum skb_drop_reason icmp_unreach(struct sk_buff *skb) { enum skb_drop_reason reason = SKB_NOT_DROPPED_YET; const struct iphdr *iph; struct icmphdr *icmph; struct net *net; u32 info = 0; net = skb_dst_dev_net_rcu(skb); /* * Incomplete header ? * Only checks for the IP header, there should be an * additional check for longer headers in upper levels. */ if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto out_err; icmph = icmp_hdr(skb); iph = (const struct iphdr *)skb->data; if (iph->ihl < 5) { /* Mangled header, drop. */ reason = SKB_DROP_REASON_IP_INHDR; goto out_err; } switch (icmph->type) { case ICMP_DEST_UNREACH: switch (icmph->code & 15) { case ICMP_NET_UNREACH: case ICMP_HOST_UNREACH: case ICMP_PROT_UNREACH: case ICMP_PORT_UNREACH: break; case ICMP_FRAG_NEEDED: /* for documentation of the ip_no_pmtu_disc * values please see * Documentation/networking/ip-sysctl.rst */ switch (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc)) { default: net_dbg_ratelimited("%pI4: fragmentation needed and DF set\n", &iph->daddr); break; case 2: goto out; case 3: if (!icmp_tag_validation(iph->protocol)) goto out; fallthrough; case 0: info = ntohs(icmph->un.frag.mtu); } break; case ICMP_SR_FAILED: net_dbg_ratelimited("%pI4: Source Route Failed\n", &iph->daddr); break; default: break; } if (icmph->code > NR_ICMP_UNREACH) goto out; break; case ICMP_PARAMETERPROB: info = ntohl(icmph->un.gateway) >> 24; break; case ICMP_TIME_EXCEEDED: __ICMP_INC_STATS(net, ICMP_MIB_INTIMEEXCDS); if (icmph->code == ICMP_EXC_FRAGTIME) goto out; break; } /* * Throw it at our lower layers * * RFC 1122: 3.2.2 MUST extract the protocol ID from the passed * header. * RFC 1122: 3.2.2.1 MUST pass ICMP unreach messages to the * transport layer. * RFC 1122: 3.2.2.2 MUST pass ICMP time expired messages to * transport layer. */ /* * Check the other end isn't violating RFC 1122. Some routers send * bogus responses to broadcast frames. If you see this message * first check your netmask matches at both ends, if it does then * get the other vendor to fix their kit. */ if (!READ_ONCE(net->ipv4.sysctl_icmp_ignore_bogus_error_responses) && inet_addr_type_dev_table(net, skb->dev, iph->daddr) == RTN_BROADCAST) { net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n", &ip_hdr(skb)->saddr, icmph->type, icmph->code, &iph->daddr, skb->dev->name); goto out; } icmp_socket_deliver(skb, info); out: return reason; out_err: __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return reason ?: SKB_DROP_REASON_NOT_SPECIFIED; } /* * Handle ICMP_REDIRECT. 
*/ static enum skb_drop_reason icmp_redirect(struct sk_buff *skb) { if (skb->len < sizeof(struct iphdr)) { __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS); return SKB_DROP_REASON_PKT_TOO_SMALL; } if (!pskb_may_pull(skb, sizeof(struct iphdr))) { /* there aught to be a stat */ return SKB_DROP_REASON_NOMEM; } icmp_socket_deliver(skb, ntohl(icmp_hdr(skb)->un.gateway)); return SKB_NOT_DROPPED_YET; } /* * Handle ICMP_ECHO ("ping") and ICMP_EXT_ECHO ("PROBE") requests. * * RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo * requests. * RFC 1122: 3.2.2.6 Data received in the ICMP_ECHO request MUST be * included in the reply. * RFC 1812: 4.3.3.6 SHOULD have a config option for silently ignoring * echo requests, MUST have default=NOT. * RFC 8335: 8 MUST have a config option to enable/disable ICMP * Extended Echo Functionality, MUST be disabled by default * See also WRT handling of options once they are done and working. */ static enum skb_drop_reason icmp_echo(struct sk_buff *skb) { DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data, IP_OPTIONS_DATA_FIXED_SIZE); struct net *net; net = skb_dst_dev_net_rcu(skb); /* should there be an ICMP stat for ignored echos? */ if (READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_all)) return SKB_NOT_DROPPED_YET; icmp_param->data.icmph = *icmp_hdr(skb); icmp_param->skb = skb; icmp_param->offset = 0; icmp_param->data_len = skb->len; icmp_param->head_len = sizeof(struct icmphdr); if (icmp_param->data.icmph.type == ICMP_ECHO) icmp_param->data.icmph.type = ICMP_ECHOREPLY; else if (!icmp_build_probe(skb, &icmp_param->data.icmph)) return SKB_NOT_DROPPED_YET; icmp_reply(icmp_param, skb); return SKB_NOT_DROPPED_YET; } /* Helper for icmp_echo and icmpv6_echo_reply. * Searches for net_device that matches PROBE interface identifier * and builds PROBE reply message in icmphdr. * * Returns false if PROBE responses are disabled via sysctl */ bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr) { struct net *net = dev_net_rcu(skb->dev); struct icmp_ext_hdr *ext_hdr, _ext_hdr; struct icmp_ext_echo_iio *iio, _iio; struct inet6_dev *in6_dev; struct in_device *in_dev; struct net_device *dev; char buff[IFNAMSIZ]; u16 ident_len; u8 status; if (!READ_ONCE(net->ipv4.sysctl_icmp_echo_enable_probe)) return false; /* We currently only support probing interfaces on the proxy node * Check to ensure L-bit is set */ if (!(ntohs(icmphdr->un.echo.sequence) & 1)) return false; /* Clear status bits in reply message */ icmphdr->un.echo.sequence &= htons(0xFF00); if (icmphdr->type == ICMP_EXT_ECHO) icmphdr->type = ICMP_EXT_ECHOREPLY; else icmphdr->type = ICMPV6_EXT_ECHO_REPLY; ext_hdr = skb_header_pointer(skb, 0, sizeof(_ext_hdr), &_ext_hdr); /* Size of iio is class_type dependent. 
* Only check header here and assign length based on ctype in the switch statement */ iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr), &_iio); if (!ext_hdr || !iio) goto send_mal_query; if (ntohs(iio->extobj_hdr.length) <= sizeof(iio->extobj_hdr) || ntohs(iio->extobj_hdr.length) > sizeof(_iio)) goto send_mal_query; ident_len = ntohs(iio->extobj_hdr.length) - sizeof(iio->extobj_hdr); iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr) + ident_len, &_iio); if (!iio) goto send_mal_query; status = 0; dev = NULL; switch (iio->extobj_hdr.class_type) { case ICMP_EXT_ECHO_CTYPE_NAME: if (ident_len >= IFNAMSIZ) goto send_mal_query; memset(buff, 0, sizeof(buff)); memcpy(buff, &iio->ident.name, ident_len); dev = dev_get_by_name(net, buff); break; case ICMP_EXT_ECHO_CTYPE_INDEX: if (ident_len != sizeof(iio->ident.ifindex)) goto send_mal_query; dev = dev_get_by_index(net, ntohl(iio->ident.ifindex)); break; case ICMP_EXT_ECHO_CTYPE_ADDR: if (ident_len < sizeof(iio->ident.addr.ctype3_hdr) || ident_len != sizeof(iio->ident.addr.ctype3_hdr) + iio->ident.addr.ctype3_hdr.addrlen) goto send_mal_query; switch (ntohs(iio->ident.addr.ctype3_hdr.afi)) { case ICMP_AFI_IP: if (iio->ident.addr.ctype3_hdr.addrlen != sizeof(struct in_addr)) goto send_mal_query; dev = ip_dev_find(net, iio->ident.addr.ip_addr.ipv4_addr); break; #if IS_ENABLED(CONFIG_IPV6) case ICMP_AFI_IP6: if (iio->ident.addr.ctype3_hdr.addrlen != sizeof(struct in6_addr)) goto send_mal_query; dev = ipv6_dev_find(net, &iio->ident.addr.ip_addr.ipv6_addr, dev); dev_hold(dev); break; #endif default: goto send_mal_query; } break; default: goto send_mal_query; } if (!dev) { icmphdr->code = ICMP_EXT_CODE_NO_IF; return true; } /* Fill bits in reply message */ if (dev->flags & IFF_UP) status |= ICMP_EXT_ECHOREPLY_ACTIVE; in_dev = __in_dev_get_rcu(dev); if (in_dev && rcu_access_pointer(in_dev->ifa_list)) status |= ICMP_EXT_ECHOREPLY_IPV4; in6_dev = __in6_dev_get(dev); if (in6_dev && !list_empty(&in6_dev->addr_list)) status |= ICMP_EXT_ECHOREPLY_IPV6; dev_put(dev); icmphdr->un.echo.sequence |= htons(status); return true; send_mal_query: icmphdr->code = ICMP_EXT_CODE_MAL_QUERY; return true; } /* * Handle ICMP Timestamp requests. * RFC 1122: 3.2.2.8 MAY implement ICMP timestamp requests. * SHOULD be in the kernel for minimum random latency. * MUST be accurate to a few minutes. * MUST be updated at least at 15Hz. */ static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb) { DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data, IP_OPTIONS_DATA_FIXED_SIZE); /* * Too short. */ if (skb->len < 4) goto out_err; /* * Fill in the current time as ms since midnight UT: */ icmp_param->data.times[1] = inet_current_timestamp(); icmp_param->data.times[2] = icmp_param->data.times[1]; BUG_ON(skb_copy_bits(skb, 0, &icmp_param->data.times[0], 4)); icmp_param->data.icmph = *icmp_hdr(skb); icmp_param->data.icmph.type = ICMP_TIMESTAMPREPLY; icmp_param->data.icmph.code = 0; icmp_param->skb = skb; icmp_param->offset = 0; icmp_param->data_len = 0; icmp_param->head_len = sizeof(struct icmphdr) + 12; icmp_reply(icmp_param, skb); return SKB_NOT_DROPPED_YET; out_err: __ICMP_INC_STATS(skb_dst_dev_net_rcu(skb), ICMP_MIB_INERRORS); return SKB_DROP_REASON_PKT_TOO_SMALL; } static enum skb_drop_reason icmp_discard(struct sk_buff *skb) { /* pretend it was a success */ return SKB_NOT_DROPPED_YET; } /* * Deal with incoming ICMP packets. 
*/ int icmp_rcv(struct sk_buff *skb) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct rtable *rt = skb_rtable(skb); struct net *net = dev_net_rcu(rt->dst.dev); struct icmphdr *icmph; if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { struct sec_path *sp = skb_sec_path(skb); int nh; if (!(sp && sp->xvec[sp->len - 1]->props.flags & XFRM_STATE_ICMP)) { reason = SKB_DROP_REASON_XFRM_POLICY; goto drop; } if (!pskb_may_pull(skb, sizeof(*icmph) + sizeof(struct iphdr))) goto drop; nh = skb_network_offset(skb); skb_set_network_header(skb, sizeof(*icmph)); if (!xfrm4_policy_check_reverse(NULL, XFRM_POLICY_IN, skb)) { reason = SKB_DROP_REASON_XFRM_POLICY; goto drop; } skb_set_network_header(skb, nh); } __ICMP_INC_STATS(net, ICMP_MIB_INMSGS); if (skb_checksum_simple_validate(skb)) goto csum_error; if (!pskb_pull(skb, sizeof(*icmph))) goto error; icmph = icmp_hdr(skb); ICMPMSGIN_INC_STATS(net, icmph->type); /* Check for ICMP Extended Echo (PROBE) messages */ if (icmph->type == ICMP_EXT_ECHO) { /* We can't use icmp_pointers[].handler() because it is an array of * size NR_ICMP_TYPES + 1 (19 elements) and PROBE has code 42. */ reason = icmp_echo(skb); goto reason_check; } /* * Parse the ICMP message */ if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { /* * RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be * silently ignored (we let user decide with a sysctl). * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently * discarded if to broadcast/multicast. */ if ((icmph->type == ICMP_ECHO || icmph->type == ICMP_TIMESTAMP) && READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_broadcasts)) { reason = SKB_DROP_REASON_INVALID_PROTO; goto error; } if (icmph->type != ICMP_ECHO && icmph->type != ICMP_TIMESTAMP && icmph->type != ICMP_ADDRESS && icmph->type != ICMP_ADDRESSREPLY) { reason = SKB_DROP_REASON_INVALID_PROTO; goto error; } } if (icmph->type == ICMP_EXT_ECHOREPLY || icmph->type == ICMP_ECHOREPLY) { reason = ping_rcv(skb); return reason ? NET_RX_DROP : NET_RX_SUCCESS; } /* * 18 is the highest 'known' ICMP type. Anything else is a mystery * * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently * discarded. 
*/ if (icmph->type > NR_ICMP_TYPES) { reason = SKB_DROP_REASON_UNHANDLED_PROTO; goto error; } reason = icmp_pointers[icmph->type].handler(skb); reason_check: if (!reason) { consume_skb(skb); return NET_RX_SUCCESS; } drop: kfree_skb_reason(skb, reason); return NET_RX_DROP; csum_error: reason = SKB_DROP_REASON_ICMP_CSUM; __ICMP_INC_STATS(net, ICMP_MIB_CSUMERRORS); error: __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); goto drop; } static bool ip_icmp_error_rfc4884_validate(const struct sk_buff *skb, int off) { struct icmp_extobj_hdr *objh, _objh; struct icmp_ext_hdr *exth, _exth; u16 olen; exth = skb_header_pointer(skb, off, sizeof(_exth), &_exth); if (!exth) return false; if (exth->version != 2) return true; if (exth->checksum && csum_fold(skb_checksum(skb, off, skb->len - off, 0))) return false; off += sizeof(_exth); while (off < skb->len) { objh = skb_header_pointer(skb, off, sizeof(_objh), &_objh); if (!objh) return false; olen = ntohs(objh->length); if (olen < sizeof(_objh)) return false; off += olen; if (off > skb->len) return false; } return true; } void ip_icmp_error_rfc4884(const struct sk_buff *skb, struct sock_ee_data_rfc4884 *out, int thlen, int off) { int hlen; /* original datagram headers: end of icmph to payload (skb->data) */ hlen = -skb_transport_offset(skb) - thlen; /* per rfc 4884: minimal datagram length of 128 bytes */ if (off < 128 || off < hlen) return; /* kernel has stripped headers: return payload offset in bytes */ off -= hlen; if (off + sizeof(struct icmp_ext_hdr) > skb->len) return; out->len = off; if (!ip_icmp_error_rfc4884_validate(skb, off)) out->flags |= SO_EE_RFC4884_FLAG_INVALID; } int icmp_err(struct sk_buff *skb, u32 info) { struct iphdr *iph = (struct iphdr *)skb->data; int offset = iph->ihl<<2; struct icmphdr *icmph = (struct icmphdr *)(skb->data + offset); struct net *net = dev_net_rcu(skb->dev); int type = icmp_hdr(skb)->type; int code = icmp_hdr(skb)->code; /* * Use ping_err to handle all icmp errors except those * triggered by ICMP_ECHOREPLY which sent from kernel. */ if (icmph->type != ICMP_ECHOREPLY) { ping_err(skb, offset, info); return 0; } if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ICMP); else if (type == ICMP_REDIRECT) ipv4_redirect(skb, net, 0, IPPROTO_ICMP); return 0; } /* * This table is the definition of how we handle ICMP. */ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { [ICMP_ECHOREPLY] = { .handler = ping_rcv, }, [1] = { .handler = icmp_discard, .error = 1, }, [2] = { .handler = icmp_discard, .error = 1, }, [ICMP_DEST_UNREACH] = { .handler = icmp_unreach, .error = 1, }, [ICMP_SOURCE_QUENCH] = { .handler = icmp_unreach, .error = 1, }, [ICMP_REDIRECT] = { .handler = icmp_redirect, .error = 1, }, [6] = { .handler = icmp_discard, .error = 1, }, [7] = { .handler = icmp_discard, .error = 1, }, [ICMP_ECHO] = { .handler = icmp_echo, }, [9] = { .handler = icmp_discard, .error = 1, }, [10] = { .handler = icmp_discard, .error = 1, }, [ICMP_TIME_EXCEEDED] = { .handler = icmp_unreach, .error = 1, }, [ICMP_PARAMETERPROB] = { .handler = icmp_unreach, .error = 1, }, [ICMP_TIMESTAMP] = { .handler = icmp_timestamp, }, [ICMP_TIMESTAMPREPLY] = { .handler = icmp_discard, }, [ICMP_INFO_REQUEST] = { .handler = icmp_discard, }, [ICMP_INFO_REPLY] = { .handler = icmp_discard, }, [ICMP_ADDRESS] = { .handler = icmp_discard, }, [ICMP_ADDRESSREPLY] = { .handler = icmp_discard, }, }; static int __net_init icmp_sk_init(struct net *net) { /* Control parameters for ECHO replies. 
*/ net->ipv4.sysctl_icmp_echo_ignore_all = 0; net->ipv4.sysctl_icmp_echo_enable_probe = 0; net->ipv4.sysctl_icmp_echo_ignore_broadcasts = 1; /* Control parameter - ignore bogus broadcast responses? */ net->ipv4.sysctl_icmp_ignore_bogus_error_responses = 1; /* * Configurable global rate limit. * * ratelimit defines tokens/packet consumed for dst->rate_token * bucket ratemask defines which icmp types are ratelimited by * setting its bit position. * * default: * dest unreachable (3), source quench (4), * time exceeded (11), parameter problem (12) */ net->ipv4.sysctl_icmp_ratelimit = 1 * HZ; net->ipv4.sysctl_icmp_ratemask = 0x1818; net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0; net->ipv4.sysctl_icmp_errors_extension_mask = 0; net->ipv4.sysctl_icmp_msgs_per_sec = 1000; net->ipv4.sysctl_icmp_msgs_burst = 50; return 0; } static struct pernet_operations __net_initdata icmp_sk_ops = { .init = icmp_sk_init, }; int __init icmp_init(void) { int err, i; for_each_possible_cpu(i) { struct sock *sk; err = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, IPPROTO_ICMP, &init_net); if (err < 0) return err; per_cpu(ipv4_icmp_sk, i) = sk; /* Enough space for 2 64K ICMP packets, including * sk_buff/skb_shared_info struct overhead. */ sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024); /* * Speedup sock_wfree() */ sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT; } return register_pernet_subsys(&icmp_sk_ops); }
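The ratemask default set in icmp_sk_init() is easier to read bit by bit: 0x1818 is 0001 1000 0001 1000 in binary, i.e. bits 3, 4, 11 and 12, exactly the four error types named in the comment, with "bit position == ICMP type". A minimal stand-alone sketch of that mapping (plain user-space C, not part of icmp.c; the icmp_type_is_ratelimited() helper is purely illustrative):

#include <stdio.h>

/* Illustrative only: mirrors the "bit position == ICMP type" convention
 * used by net.ipv4.icmp_ratemask; 0x1818 is the kernel default above.
 */
static int icmp_type_is_ratelimited(unsigned long ratemask, int type)
{
	return (ratemask >> type) & 1;
}

int main(void)
{
	const unsigned long ratemask = 0x1818;
	const int types[] = { 0, 3, 4, 8, 11, 12 };

	for (unsigned int i = 0; i < sizeof(types) / sizeof(types[0]); i++)
		printf("ICMP type %2d rate limited: %s\n", types[i],
		       icmp_type_is_ratelimited(ratemask, types[i]) ? "yes" : "no");
	return 0;
}

Running this prints "yes" only for types 3, 4, 11 and 12; echo (8) and echo reply (0) are governed by the separate icmp_msgs_per_sec/icmp_msgs_burst global limiter rather than the per-destination ratemask.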
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_USER_H
#define _LINUX_SCHED_USER_H

#include <linux/uidgid.h>
#include <linux/atomic.h>
#include <linux/percpu_counter.h>
#include <linux/refcount.h>
#include <linux/ratelimit.h>

/*
 * Some day this will be a full-fledged user tracking system..
 */
struct user_struct {
	refcount_t __count;	/* reference count */
#ifdef CONFIG_EPOLL
	struct percpu_counter epoll_watches; /* The number of file descriptors currently watched */
#endif
	unsigned long unix_inflight;	/* How many files in flight in unix sockets */
	atomic_long_t pipe_bufs;	/* how many pages are allocated in pipe buffers */

	/* Hash table maintenance information */
	struct hlist_node uidhash_node;
	kuid_t uid;

#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \
	defined(CONFIG_NET) || defined(CONFIG_IO_URING) || \
	defined(CONFIG_VFIO_PCI_ZDEV_KVM) || IS_ENABLED(CONFIG_IOMMUFD)
	atomic_long_t locked_vm;
#endif
#ifdef CONFIG_WATCH_QUEUE
	atomic_t nr_watches;	/* The number of watches this user currently has */
#endif

	/* Miscellaneous per-user rate limit */
	struct ratelimit_state ratelimit;
};

extern int uids_sysfs_init(void);

extern struct user_struct *find_user(kuid_t);

extern struct user_struct root_user;
#define INIT_USER (&root_user)

/* per-UID process charging. */
extern struct user_struct * alloc_uid(kuid_t);
static inline struct user_struct *get_uid(struct user_struct *u)
{
	refcount_inc(&u->__count);
	return u;
}
extern void free_uid(struct user_struct *);

#endif /* _LINUX_SCHED_USER_H */
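As a usage note for the header above: alloc_uid() and find_user() hand back a counted reference to the per-UID structure, get_uid() takes an additional reference, and each reference is eventually dropped with free_uid(). A minimal sketch of that pattern, assuming kernel context; track_uid() is a hypothetical helper and not part of this header:

#include <linux/sched/user.h>
#include <linux/errno.h>

/* Hypothetical helper: look up (or create) the user_struct for @uid and
 * keep a reference while accounting against it.
 */
static int track_uid(kuid_t uid)
{
	struct user_struct *u = alloc_uid(uid);	/* new or existing entry, +1 reference */

	if (!u)
		return -ENOMEM;

	get_uid(u);	/* extra reference, e.g. stashed in a long-lived object */
	/* ... account against u (unix_inflight, pipe_bufs, ...) ... */
	free_uid(u);	/* drop the stashed reference */
	free_uid(u);	/* drop the alloc_uid() reference */
	return 0;
}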
// SPDX-License-Identifier: GPL-2.0 /* * drivers/base/core.c - core driver model code (device registration, etc) * * Copyright (c) 2002-3 Patrick Mochel * Copyright (c) 2002-3 Open Source Development Labs * Copyright (c) 2006 Greg Kroah-Hartman <gregkh@suse.de> * Copyright (c) 2006 Novell, Inc. */ #include <linux/acpi.h> #include <linux/blkdev.h> #include <linux/cleanup.h> #include <linux/cpufreq.h> #include <linux/device.h> #include <linux/dma-map-ops.h> /* for dma_default_coherent */ #include <linux/err.h> #include <linux/fwnode.h> #include <linux/init.h> #include <linux/kdev_t.h> #include <linux/kstrtox.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/notifier.h> #include <linux/of.h> #include <linux/of_device.h> #include <linux/pm_runtime.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/string_helpers.h> #include <linux/swiotlb.h> #include <linux/sysfs.h> #include "base.h" #include "physical_location.h" #include "power/power.h" /* Device links support. */ static LIST_HEAD(deferred_sync); static unsigned int defer_sync_state_count = 1; static DEFINE_MUTEX(fwnode_link_lock); static bool fw_devlink_is_permissive(void); static void __fw_devlink_link_to_consumers(struct device *dev); static bool fw_devlink_drv_reg_done; static bool fw_devlink_best_effort; static struct workqueue_struct *device_link_wq; /** * __fwnode_link_add - Create a link between two fwnode_handles. * @con: Consumer end of the link. * @sup: Supplier end of the link. * @flags: Link flags. * * Create a fwnode link between fwnode handles @con and @sup. The fwnode link * represents the detail that the firmware lists @sup fwnode as supplying a * resource to @con. * * The driver core will use the fwnode link to create a device link between the * two device objects corresponding to @con and @sup when they are created. The * driver core will automatically delete the fwnode link between @con and @sup * after doing that. * * Attempts to create duplicate links between the same pair of fwnode handles * are ignored and there is no reference counting.
*/ static int __fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup, u8 flags) { struct fwnode_link *link; list_for_each_entry(link, &sup->consumers, s_hook) if (link->consumer == con) { link->flags |= flags; return 0; } link = kzalloc_obj(*link); if (!link) return -ENOMEM; link->supplier = sup; INIT_LIST_HEAD(&link->s_hook); link->consumer = con; INIT_LIST_HEAD(&link->c_hook); link->flags = flags; list_add(&link->s_hook, &sup->consumers); list_add(&link->c_hook, &con->suppliers); pr_debug("%pfwf Linked as a fwnode consumer to %pfwf\n", con, sup); return 0; } int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup, u8 flags) { guard(mutex)(&fwnode_link_lock); return __fwnode_link_add(con, sup, flags); } /** * __fwnode_link_del - Delete a link between two fwnode_handles. * @link: the fwnode_link to be deleted * * The fwnode_link_lock needs to be held when this function is called. */ static void __fwnode_link_del(struct fwnode_link *link) { pr_debug("%pfwf Dropping the fwnode link to %pfwf\n", link->consumer, link->supplier); list_del(&link->s_hook); list_del(&link->c_hook); kfree(link); } /** * __fwnode_link_cycle - Mark a fwnode link as being part of a cycle. * @link: the fwnode_link to be marked * * The fwnode_link_lock needs to be held when this function is called. */ static void __fwnode_link_cycle(struct fwnode_link *link) { pr_debug("%pfwf: cycle: depends on %pfwf\n", link->consumer, link->supplier); link->flags |= FWLINK_FLAG_CYCLE; } /** * fwnode_links_purge_suppliers - Delete all supplier links of fwnode_handle. * @fwnode: fwnode whose supplier links need to be deleted * * Deletes all supplier links connecting directly to @fwnode. */ static void fwnode_links_purge_suppliers(struct fwnode_handle *fwnode) { struct fwnode_link *link, *tmp; guard(mutex)(&fwnode_link_lock); list_for_each_entry_safe(link, tmp, &fwnode->suppliers, c_hook) __fwnode_link_del(link); } /** * fwnode_links_purge_consumers - Delete all consumer links of fwnode_handle. * @fwnode: fwnode whose consumer links need to be deleted * * Deletes all consumer links connecting directly to @fwnode. */ static void fwnode_links_purge_consumers(struct fwnode_handle *fwnode) { struct fwnode_link *link, *tmp; guard(mutex)(&fwnode_link_lock); list_for_each_entry_safe(link, tmp, &fwnode->consumers, s_hook) __fwnode_link_del(link); } /** * fwnode_links_purge - Delete all links connected to a fwnode_handle. * @fwnode: fwnode whose links needs to be deleted * * Deletes all links connecting directly to a fwnode. */ void fwnode_links_purge(struct fwnode_handle *fwnode) { fwnode_links_purge_suppliers(fwnode); fwnode_links_purge_consumers(fwnode); } void fw_devlink_purge_absent_suppliers(struct fwnode_handle *fwnode) { struct fwnode_handle *child; /* Don't purge consumer links of an added child */ if (fwnode->dev) return; fwnode_set_flag(fwnode, FWNODE_FLAG_NOT_DEVICE); fwnode_links_purge_consumers(fwnode); fwnode_for_each_available_child_node(fwnode, child) fw_devlink_purge_absent_suppliers(child); } EXPORT_SYMBOL_GPL(fw_devlink_purge_absent_suppliers); /** * __fwnode_links_move_consumers - Move consumer from @from to @to fwnode_handle * @from: move consumers away from this fwnode * @to: move consumers to this fwnode * * Move all consumer links from @from fwnode to @to fwnode. 
*/ static void __fwnode_links_move_consumers(struct fwnode_handle *from, struct fwnode_handle *to) { struct fwnode_link *link, *tmp; list_for_each_entry_safe(link, tmp, &from->consumers, s_hook) { __fwnode_link_add(link->consumer, to, link->flags); __fwnode_link_del(link); } } /** * __fw_devlink_pickup_dangling_consumers - Pick up dangling consumers * @fwnode: fwnode from which to pick up dangling consumers * @new_sup: fwnode of new supplier * * If the @fwnode has a corresponding struct device and the device supports * probing (that is, added to a bus), then we want to let fw_devlink create * MANAGED device links to this device, so leave @fwnode and its descendant's * fwnode links alone. * * Otherwise, move its consumers to the new supplier @new_sup. */ static void __fw_devlink_pickup_dangling_consumers(struct fwnode_handle *fwnode, struct fwnode_handle *new_sup) { struct fwnode_handle *child; if (fwnode->dev && fwnode->dev->bus) return; fwnode_set_flag(fwnode, FWNODE_FLAG_NOT_DEVICE); __fwnode_links_move_consumers(fwnode, new_sup); fwnode_for_each_available_child_node(fwnode, child) __fw_devlink_pickup_dangling_consumers(child, new_sup); } static DEFINE_MUTEX(device_links_lock); DEFINE_STATIC_SRCU(device_links_srcu); static inline void device_links_write_lock(void) { mutex_lock(&device_links_lock); } static inline void device_links_write_unlock(void) { mutex_unlock(&device_links_lock); } int device_links_read_lock(void) __acquires(&device_links_srcu) { return srcu_read_lock(&device_links_srcu); } void device_links_read_unlock(int idx) __releases(&device_links_srcu) { srcu_read_unlock(&device_links_srcu, idx); } int device_links_read_lock_held(void) { return srcu_read_lock_held(&device_links_srcu); } static void device_link_synchronize_removal(void) { synchronize_srcu(&device_links_srcu); } static void device_link_remove_from_lists(struct device_link *link) { list_del_rcu(&link->s_node); list_del_rcu(&link->c_node); } static bool device_is_ancestor(struct device *dev, struct device *target) { while (target->parent) { target = target->parent; if (dev == target) return true; } return false; } #define DL_MARKER_FLAGS (DL_FLAG_INFERRED | \ DL_FLAG_CYCLE | \ DL_FLAG_MANAGED) bool device_link_flag_is_sync_state_only(u32 flags) { return (flags & ~DL_MARKER_FLAGS) == DL_FLAG_SYNC_STATE_ONLY; } /** * device_is_dependent - Check if one device depends on another one * @dev: Device to check dependencies for. * @target: Device to check against. * * Check if @target depends on @dev or any device dependent on it (its child or * its consumer etc). Return 1 if that is the case or 0 otherwise. */ static int device_is_dependent(struct device *dev, void *target) { struct device_link *link; int ret; /* * The "ancestors" check is needed to catch the case when the target * device has not been completely initialized yet and it is still * missing from the list of children of its parent device. 
*/ if (dev == target || device_is_ancestor(dev, target)) return 1; ret = device_for_each_child(dev, target, device_is_dependent); if (ret) return ret; list_for_each_entry(link, &dev->links.consumers, s_node) { if (device_link_flag_is_sync_state_only(link->flags)) continue; if (link->consumer == target) return 1; ret = device_is_dependent(link->consumer, target); if (ret) break; } return ret; } static void device_link_init_status(struct device_link *link, struct device *consumer, struct device *supplier) { switch (supplier->links.status) { case DL_DEV_PROBING: switch (consumer->links.status) { case DL_DEV_PROBING: /* * A consumer driver can create a link to a supplier * that has not completed its probing yet as long as it * knows that the supplier is already functional (for * example, it has just acquired some resources from the * supplier). */ link->status = DL_STATE_CONSUMER_PROBE; break; default: link->status = DL_STATE_DORMANT; break; } break; case DL_DEV_DRIVER_BOUND: switch (consumer->links.status) { case DL_DEV_PROBING: link->status = DL_STATE_CONSUMER_PROBE; break; case DL_DEV_DRIVER_BOUND: link->status = DL_STATE_ACTIVE; break; default: link->status = DL_STATE_AVAILABLE; break; } break; case DL_DEV_UNBINDING: link->status = DL_STATE_SUPPLIER_UNBIND; break; default: link->status = DL_STATE_DORMANT; break; } } static int device_reorder_to_tail(struct device *dev, void *not_used) { struct device_link *link; /* * Devices that have not been registered yet will be put to the ends * of the lists during the registration, so skip them here. */ if (device_is_registered(dev)) devices_kset_move_last(dev); if (device_pm_initialized(dev)) device_pm_move_last(dev); device_for_each_child(dev, NULL, device_reorder_to_tail); list_for_each_entry(link, &dev->links.consumers, s_node) { if (device_link_flag_is_sync_state_only(link->flags)) continue; device_reorder_to_tail(link->consumer, NULL); } return 0; } /** * device_pm_move_to_tail - Move set of devices to the end of device lists * @dev: Device to move * * This is a device_reorder_to_tail() wrapper taking the requisite locks. * * It moves the @dev along with all of its children and all of its consumers * to the ends of the device_kset and dpm_list, recursively. 
*/ void device_pm_move_to_tail(struct device *dev) { int idx; idx = device_links_read_lock(); device_pm_lock(); device_reorder_to_tail(dev, NULL); device_pm_unlock(); device_links_read_unlock(idx); } #define to_devlink(dev) container_of((dev), struct device_link, link_dev) static ssize_t status_show(struct device *dev, struct device_attribute *attr, char *buf) { const char *output; switch (to_devlink(dev)->status) { case DL_STATE_NONE: output = "not tracked"; break; case DL_STATE_DORMANT: output = "dormant"; break; case DL_STATE_AVAILABLE: output = "available"; break; case DL_STATE_CONSUMER_PROBE: output = "consumer probing"; break; case DL_STATE_ACTIVE: output = "active"; break; case DL_STATE_SUPPLIER_UNBIND: output = "supplier unbinding"; break; default: output = "unknown"; break; } return sysfs_emit(buf, "%s\n", output); } static DEVICE_ATTR_RO(status); static ssize_t auto_remove_on_show(struct device *dev, struct device_attribute *attr, char *buf) { struct device_link *link = to_devlink(dev); const char *output; if (device_link_test(link, DL_FLAG_AUTOREMOVE_SUPPLIER)) output = "supplier unbind"; else if (device_link_test(link, DL_FLAG_AUTOREMOVE_CONSUMER)) output = "consumer unbind"; else output = "never"; return sysfs_emit(buf, "%s\n", output); } static DEVICE_ATTR_RO(auto_remove_on); static ssize_t runtime_pm_show(struct device *dev, struct device_attribute *attr, char *buf) { struct device_link *link = to_devlink(dev); return sysfs_emit(buf, "%d\n", device_link_test(link, DL_FLAG_PM_RUNTIME)); } static DEVICE_ATTR_RO(runtime_pm); static ssize_t sync_state_only_show(struct device *dev, struct device_attribute *attr, char *buf) { struct device_link *link = to_devlink(dev); return sysfs_emit(buf, "%d\n", device_link_test(link, DL_FLAG_SYNC_STATE_ONLY)); } static DEVICE_ATTR_RO(sync_state_only); static struct attribute *devlink_attrs[] = { &dev_attr_status.attr, &dev_attr_auto_remove_on.attr, &dev_attr_runtime_pm.attr, &dev_attr_sync_state_only.attr, NULL, }; ATTRIBUTE_GROUPS(devlink); static void device_link_release_fn(struct work_struct *work) { struct device_link *link = container_of(work, struct device_link, rm_work); /* Ensure that all references to the link object have been dropped. */ device_link_synchronize_removal(); pm_runtime_release_supplier(link); /* * If supplier_preactivated is set, the link has been dropped between * the pm_runtime_get_suppliers() and pm_runtime_put_suppliers() calls * in __driver_probe_device(). In that case, drop the supplier's * PM-runtime usage counter to remove the reference taken by * pm_runtime_get_suppliers(). */ if (link->supplier_preactivated) pm_runtime_put_noidle(link->supplier); pm_request_idle(link->supplier); put_device(link->consumer); put_device(link->supplier); kfree(link); } static void devlink_dev_release(struct device *dev) { struct device_link *link = to_devlink(dev); INIT_WORK(&link->rm_work, device_link_release_fn); /* * It may take a while to complete this work because of the SRCU * synchronization in device_link_release_fn() and if the consumer or * supplier devices get deleted when it runs, so put it into the * dedicated workqueue. */ queue_work(device_link_wq, &link->rm_work); } /** * device_link_wait_removal - Wait for ongoing devlink removal jobs to terminate */ void device_link_wait_removal(void) { /* * devlink removal jobs are queued in the dedicated work queue. * To be sure that all removal jobs are terminated, ensure that any * scheduled work has run to completion. 
*/ flush_workqueue(device_link_wq); } EXPORT_SYMBOL_GPL(device_link_wait_removal); static const struct class devlink_class = { .name = "devlink", .dev_groups = devlink_groups, .dev_release = devlink_dev_release, }; static int devlink_add_symlinks(struct device *dev) { char *buf_con __free(kfree) = NULL, *buf_sup __free(kfree) = NULL; int ret; struct device_link *link = to_devlink(dev); struct device *sup = link->supplier; struct device *con = link->consumer; ret = sysfs_create_link(&link->link_dev.kobj, &sup->kobj, "supplier"); if (ret) goto out; ret = sysfs_create_link(&link->link_dev.kobj, &con->kobj, "consumer"); if (ret) goto err_con; buf_con = kasprintf(GFP_KERNEL, "consumer:%s:%s", dev_bus_name(con), dev_name(con)); if (!buf_con) { ret = -ENOMEM; goto err_con_dev; } ret = sysfs_create_link(&sup->kobj, &link->link_dev.kobj, buf_con); if (ret) goto err_con_dev; buf_sup = kasprintf(GFP_KERNEL, "supplier:%s:%s", dev_bus_name(sup), dev_name(sup)); if (!buf_sup) { ret = -ENOMEM; goto err_sup_dev; } ret = sysfs_create_link(&con->kobj, &link->link_dev.kobj, buf_sup); if (ret) goto err_sup_dev; goto out; err_sup_dev: sysfs_remove_link(&sup->kobj, buf_con); err_con_dev: sysfs_remove_link(&link->link_dev.kobj, "consumer"); err_con: sysfs_remove_link(&link->link_dev.kobj, "supplier"); out: return ret; } static void devlink_remove_symlinks(struct device *dev) { char *buf_con __free(kfree) = NULL, *buf_sup __free(kfree) = NULL; struct device_link *link = to_devlink(dev); struct device *sup = link->supplier; struct device *con = link->consumer; sysfs_remove_link(&link->link_dev.kobj, "consumer"); sysfs_remove_link(&link->link_dev.kobj, "supplier"); if (device_is_registered(con)) { buf_sup = kasprintf(GFP_KERNEL, "supplier:%s:%s", dev_bus_name(sup), dev_name(sup)); if (!buf_sup) goto out; sysfs_remove_link(&con->kobj, buf_sup); } buf_con = kasprintf(GFP_KERNEL, "consumer:%s:%s", dev_bus_name(con), dev_name(con)); if (!buf_con) goto out; sysfs_remove_link(&sup->kobj, buf_con); return; out: WARN(1, "Unable to properly free device link symlinks!\n"); } static struct class_interface devlink_class_intf = { .class = &devlink_class, .add_dev = devlink_add_symlinks, .remove_dev = devlink_remove_symlinks, }; static int __init devlink_class_init(void) { int ret; ret = class_register(&devlink_class); if (ret) return ret; ret = class_interface_register(&devlink_class_intf); if (ret) class_unregister(&devlink_class); return ret; } postcore_initcall(devlink_class_init); #define DL_MANAGED_LINK_FLAGS (DL_FLAG_AUTOREMOVE_CONSUMER | \ DL_FLAG_AUTOREMOVE_SUPPLIER | \ DL_FLAG_AUTOPROBE_CONSUMER | \ DL_FLAG_SYNC_STATE_ONLY | \ DL_FLAG_INFERRED | \ DL_FLAG_CYCLE) #define DL_ADD_VALID_FLAGS (DL_MANAGED_LINK_FLAGS | DL_FLAG_STATELESS | \ DL_FLAG_PM_RUNTIME | DL_FLAG_RPM_ACTIVE) /** * device_link_add - Create a link between two devices. * @consumer: Consumer end of the link. * @supplier: Supplier end of the link. * @flags: Link flags. * * Return: On success, a device_link struct will be returned. * On error or invalid flag settings, NULL will be returned. * * The caller is responsible for the proper synchronization of the link creation * with runtime PM. First, setting the DL_FLAG_PM_RUNTIME flag will cause the * runtime PM framework to take the link into account. Second, if the * DL_FLAG_RPM_ACTIVE flag is set in addition to it, the supplier devices will * be forced into the active meta state and reference-counted upon the creation * of the link. If DL_FLAG_PM_RUNTIME is not set, DL_FLAG_RPM_ACTIVE will be * ignored. 
* * If DL_FLAG_STATELESS is set in @flags, the caller of this function is * expected to release the link returned by it directly with the help of either * device_link_del() or device_link_remove(). * * If that flag is not set, however, the caller of this function is handing the * management of the link over to the driver core entirely and its return value * can only be used to check whether or not the link is present. In that case, * the DL_FLAG_AUTOREMOVE_CONSUMER and DL_FLAG_AUTOREMOVE_SUPPLIER device link * flags can be used to indicate to the driver core when the link can be safely * deleted. Namely, setting one of them in @flags indicates to the driver core * that the link is not going to be used (by the given caller of this function) * after unbinding the consumer or supplier driver, respectively, from its * device, so the link can be deleted at that point. If none of them is set, * the link will be maintained until one of the devices pointed to by it (either * the consumer or the supplier) is unregistered. * * Also, if DL_FLAG_STATELESS, DL_FLAG_AUTOREMOVE_CONSUMER and * DL_FLAG_AUTOREMOVE_SUPPLIER are not set in @flags (that is, a persistent * managed device link is being added), the DL_FLAG_AUTOPROBE_CONSUMER flag can * be used to request the driver core to automatically probe for a consumer * driver after successfully binding a driver to the supplier device. * * The combination of DL_FLAG_STATELESS and one of DL_FLAG_AUTOREMOVE_CONSUMER, * DL_FLAG_AUTOREMOVE_SUPPLIER, or DL_FLAG_AUTOPROBE_CONSUMER set in @flags at * the same time is invalid and will cause NULL to be returned upfront. * However, if a device link between the given @consumer and @supplier pair * exists already when this function is called for them, the existing link will * be returned regardless of its current type and status (the link's flags may * be modified then). The caller of this function is then expected to treat * the link as though it has just been created, so (in particular) if * DL_FLAG_STATELESS was passed in @flags, the link needs to be released * explicitly when not needed any more (as stated above). * * A side effect of the link creation is re-ordering of dpm_list and the * devices_kset list by moving the consumer device and all devices depending * on it to the ends of these lists (that does not happen to devices that have * not been registered when this function is called). * * The supplier device is required to be registered when this function is called * and NULL will be returned if that is not the case. The consumer device need * not be registered, however. */ struct device_link *device_link_add(struct device *consumer, struct device *supplier, u32 flags) { struct device_link *link; if (!consumer || !supplier || consumer == supplier || flags & ~DL_ADD_VALID_FLAGS || (flags & DL_FLAG_STATELESS && flags & DL_MANAGED_LINK_FLAGS) || (flags & DL_FLAG_AUTOPROBE_CONSUMER && flags & (DL_FLAG_AUTOREMOVE_CONSUMER | DL_FLAG_AUTOREMOVE_SUPPLIER))) return NULL; if (flags & DL_FLAG_PM_RUNTIME && flags & DL_FLAG_RPM_ACTIVE) { if (pm_runtime_get_sync(supplier) < 0) { pm_runtime_put_noidle(supplier); return NULL; } } if (!(flags & DL_FLAG_STATELESS)) flags |= DL_FLAG_MANAGED; if (flags & DL_FLAG_SYNC_STATE_ONLY && !device_link_flag_is_sync_state_only(flags)) return NULL; device_links_write_lock(); device_pm_lock(); /* * If the supplier has not been fully registered yet or there is a * reverse (non-SYNC_STATE_ONLY) dependency between the consumer and * the supplier already in the graph, return NULL. 
If the link is a * SYNC_STATE_ONLY link, we don't check for reverse dependencies * because it only affects sync_state() callbacks. */ if (!device_pm_initialized(supplier) || (!(flags & DL_FLAG_SYNC_STATE_ONLY) && device_is_dependent(consumer, supplier))) { link = NULL; goto out; } /* * SYNC_STATE_ONLY links are useless once a consumer device has probed. * So, only create it if the consumer hasn't probed yet. */ if (flags & DL_FLAG_SYNC_STATE_ONLY && consumer->links.status != DL_DEV_NO_DRIVER && consumer->links.status != DL_DEV_PROBING) { link = NULL; goto out; } /* * DL_FLAG_AUTOREMOVE_SUPPLIER indicates that the link will be needed * longer than for DL_FLAG_AUTOREMOVE_CONSUMER and setting them both * together doesn't make sense, so prefer DL_FLAG_AUTOREMOVE_SUPPLIER. */ if (flags & DL_FLAG_AUTOREMOVE_SUPPLIER) flags &= ~DL_FLAG_AUTOREMOVE_CONSUMER; list_for_each_entry(link, &supplier->links.consumers, s_node) { if (link->consumer != consumer) continue; if (device_link_test(link, DL_FLAG_INFERRED) && !(flags & DL_FLAG_INFERRED)) link->flags &= ~DL_FLAG_INFERRED; if (flags & DL_FLAG_PM_RUNTIME) { if (!device_link_test(link, DL_FLAG_PM_RUNTIME)) { pm_runtime_new_link(consumer); link->flags |= DL_FLAG_PM_RUNTIME; } if (flags & DL_FLAG_RPM_ACTIVE) refcount_inc(&link->rpm_active); } if (flags & DL_FLAG_STATELESS) { kref_get(&link->kref); if (device_link_test(link, DL_FLAG_SYNC_STATE_ONLY) && !device_link_test(link, DL_FLAG_STATELESS)) { link->flags |= DL_FLAG_STATELESS; goto reorder; } else { link->flags |= DL_FLAG_STATELESS; goto out; } } /* * If the life time of the link following from the new flags is * longer than indicated by the flags of the existing link, * update the existing link to stay around longer. */ if (flags & DL_FLAG_AUTOREMOVE_SUPPLIER) { if (device_link_test(link, DL_FLAG_AUTOREMOVE_CONSUMER)) { link->flags &= ~DL_FLAG_AUTOREMOVE_CONSUMER; link->flags |= DL_FLAG_AUTOREMOVE_SUPPLIER; } } else if (!(flags & DL_FLAG_AUTOREMOVE_CONSUMER)) { link->flags &= ~(DL_FLAG_AUTOREMOVE_CONSUMER | DL_FLAG_AUTOREMOVE_SUPPLIER); } if (!device_link_test(link, DL_FLAG_MANAGED)) { kref_get(&link->kref); link->flags |= DL_FLAG_MANAGED; device_link_init_status(link, consumer, supplier); } if (device_link_test(link, DL_FLAG_SYNC_STATE_ONLY) && !(flags & DL_FLAG_SYNC_STATE_ONLY)) { link->flags &= ~DL_FLAG_SYNC_STATE_ONLY; goto reorder; } goto out; } link = kzalloc_obj(*link); if (!link) goto out; refcount_set(&link->rpm_active, 1); get_device(supplier); link->supplier = supplier; INIT_LIST_HEAD(&link->s_node); get_device(consumer); link->consumer = consumer; INIT_LIST_HEAD(&link->c_node); link->flags = flags; kref_init(&link->kref); link->link_dev.class = &devlink_class; device_set_pm_not_required(&link->link_dev); dev_set_name(&link->link_dev, "%s:%s--%s:%s", dev_bus_name(supplier), dev_name(supplier), dev_bus_name(consumer), dev_name(consumer)); if (device_register(&link->link_dev)) { put_device(&link->link_dev); link = NULL; goto out; } if (flags & DL_FLAG_PM_RUNTIME) { if (flags & DL_FLAG_RPM_ACTIVE) refcount_inc(&link->rpm_active); pm_runtime_new_link(consumer); } /* Determine the initial link state. */ if (flags & DL_FLAG_STATELESS) link->status = DL_STATE_NONE; else device_link_init_status(link, consumer, supplier); /* * Some callers expect the link creation during consumer driver probe to * resume the supplier even without DL_FLAG_RPM_ACTIVE. 
*/ if (link->status == DL_STATE_CONSUMER_PROBE && flags & DL_FLAG_PM_RUNTIME) pm_runtime_resume(supplier); list_add_tail_rcu(&link->s_node, &supplier->links.consumers); list_add_tail_rcu(&link->c_node, &consumer->links.suppliers); if (flags & DL_FLAG_SYNC_STATE_ONLY) { dev_dbg(consumer, "Linked as a sync state only consumer to %s\n", dev_name(supplier)); goto out; } reorder: /* * Move the consumer and all of the devices depending on it to the end * of dpm_list and the devices_kset list. * * It is necessary to hold dpm_list locked throughout all that or else * we may end up suspending with a wrong ordering of it. */ device_reorder_to_tail(consumer, NULL); dev_dbg(consumer, "Linked as a consumer to %s\n", dev_name(supplier)); out: device_pm_unlock(); device_links_write_unlock(); if ((flags & DL_FLAG_PM_RUNTIME && flags & DL_FLAG_RPM_ACTIVE) && !link) pm_runtime_put(supplier); return link; } EXPORT_SYMBOL_GPL(device_link_add); static void __device_link_del(struct kref *kref) { struct device_link *link = container_of(kref, struct device_link, kref); dev_dbg(link->consumer, "Dropping the link to %s\n", dev_name(link->supplier)); pm_runtime_drop_link(link); device_link_remove_from_lists(link); device_unregister(&link->link_dev); } static void device_link_put_kref(struct device_link *link) { if (device_link_test(link, DL_FLAG_STATELESS)) kref_put(&link->kref, __device_link_del); else if (!device_is_registered(link->consumer)) __device_link_del(&link->kref); else WARN(1, "Unable to drop a managed device link reference\n"); } /** * device_link_del - Delete a stateless link between two devices. * @link: Device link to delete. * * The caller must ensure proper synchronization of this function with runtime * PM. If the link was added multiple times, it needs to be deleted as often. * Care is required for hotplugged devices: Their links are purged on removal * and calling device_link_del() is then no longer allowed. */ void device_link_del(struct device_link *link) { device_links_write_lock(); device_link_put_kref(link); device_links_write_unlock(); } EXPORT_SYMBOL_GPL(device_link_del); /** * device_link_remove - Delete a stateless link between two devices. * @consumer: Consumer end of the link. * @supplier: Supplier end of the link. * * The caller must ensure proper synchronization of this function with runtime * PM. 
*/ void device_link_remove(void *consumer, struct device *supplier) { struct device_link *link; if (WARN_ON(consumer == supplier)) return; device_links_write_lock(); list_for_each_entry(link, &supplier->links.consumers, s_node) { if (link->consumer == consumer) { device_link_put_kref(link); break; } } device_links_write_unlock(); } EXPORT_SYMBOL_GPL(device_link_remove); static void device_links_missing_supplier(struct device *dev) { struct device_link *link; list_for_each_entry(link, &dev->links.suppliers, c_node) { if (link->status != DL_STATE_CONSUMER_PROBE) continue; if (link->supplier->links.status == DL_DEV_DRIVER_BOUND) { WRITE_ONCE(link->status, DL_STATE_AVAILABLE); } else { WARN_ON(!device_link_test(link, DL_FLAG_SYNC_STATE_ONLY)); WRITE_ONCE(link->status, DL_STATE_DORMANT); } } } static bool dev_is_best_effort(struct device *dev) { return (fw_devlink_best_effort && dev->can_match) || (dev->fwnode && fwnode_test_flag(dev->fwnode, FWNODE_FLAG_BEST_EFFORT)); } static struct fwnode_handle *fwnode_links_check_suppliers( struct fwnode_handle *fwnode) { struct fwnode_link *link; if (!fwnode || fw_devlink_is_permissive()) return NULL; list_for_each_entry(link, &fwnode->suppliers, c_hook) if (!(link->flags & (FWLINK_FLAG_CYCLE | FWLINK_FLAG_IGNORE))) return link->supplier; return NULL; } /** * device_links_check_suppliers - Check presence of supplier drivers. * @dev: Consumer device. * * Check links from this device to any suppliers. Walk the list of the device's * links to suppliers and see if all of them are available. If not, simply * return -EPROBE_DEFER. * * We need to guarantee that the supplier will not go away after the check has * been positive here. It only can go away in __device_release_driver() and * that function checks the device's links to consumers. This means we need to * mark the link as "consumer probe in progress" to make the supplier removal * wait for us to complete (or bad things may happen). * * Links without the DL_FLAG_MANAGED flag set are ignored. */ int device_links_check_suppliers(struct device *dev) { struct device_link *link; int ret = 0, fwnode_ret = 0; struct fwnode_handle *sup_fw; /* * Device waiting for supplier to become available is not allowed to * probe. */ scoped_guard(mutex, &fwnode_link_lock) { sup_fw = fwnode_links_check_suppliers(dev->fwnode); if (sup_fw) { if (dev_is_best_effort(dev)) fwnode_ret = -EAGAIN; else return dev_err_probe(dev, -EPROBE_DEFER, "wait for supplier %pfwf\n", sup_fw); } } device_links_write_lock(); list_for_each_entry(link, &dev->links.suppliers, c_node) { if (!device_link_test(link, DL_FLAG_MANAGED)) continue; if (link->status != DL_STATE_AVAILABLE && !device_link_test(link, DL_FLAG_SYNC_STATE_ONLY)) { if (dev_is_best_effort(dev) && device_link_test(link, DL_FLAG_INFERRED) && !link->supplier->can_match) { ret = -EAGAIN; continue; } device_links_missing_supplier(dev); ret = dev_err_probe(dev, -EPROBE_DEFER, "supplier %s not ready\n", dev_name(link->supplier)); break; } WRITE_ONCE(link->status, DL_STATE_CONSUMER_PROBE); } dev->links.status = DL_DEV_PROBING; device_links_write_unlock(); return ret ? ret : fwnode_ret; } /** * __device_links_queue_sync_state - Queue a device for sync_state() callback * @dev: Device to call sync_state() on * @list: List head to queue the @dev on * * Queues a device for a sync_state() callback when the device links write lock * isn't held. This allows the sync_state() execution flow to use device links * APIs. The caller must ensure this function is called with * device_links_write_lock() held. 
* * This function does a get_device() to make sure the device is not freed while * on this list. * * So the caller must also ensure that device_links_flush_sync_list() is called * as soon as the caller releases device_links_write_lock(). This is necessary * to make sure the sync_state() is called in a timely fashion and the * put_device() is called on this device. */ static void __device_links_queue_sync_state(struct device *dev, struct list_head *list) { struct device_link *link; if (!dev_has_sync_state(dev)) return; if (dev->state_synced) return; list_for_each_entry(link, &dev->links.consumers, s_node) { if (!device_link_test(link, DL_FLAG_MANAGED)) continue; if (link->status != DL_STATE_ACTIVE) return; } /* * Set the flag here to avoid adding the same device to a list more * than once. This can happen if new consumers get added to the device * and probed before the list is flushed. */ dev->state_synced = true; if (WARN_ON(!list_empty(&dev->links.defer_sync))) return; get_device(dev); list_add_tail(&dev->links.defer_sync, list); } /** * device_links_flush_sync_list - Call sync_state() on a list of devices * @list: List of devices to call sync_state() on * @dont_lock_dev: Device for which lock is already held by the caller * * Calls sync_state() on all the devices that have been queued for it. This * function is used in conjunction with __device_links_queue_sync_state(). The * @dont_lock_dev parameter is useful when this function is called from a * context where a device lock is already held. */ static void device_links_flush_sync_list(struct list_head *list, struct device *dont_lock_dev) { struct device *dev, *tmp; list_for_each_entry_safe(dev, tmp, list, links.defer_sync) { list_del_init(&dev->links.defer_sync); if (dev != dont_lock_dev) device_lock(dev); dev_sync_state(dev); if (dev != dont_lock_dev) device_unlock(dev); put_device(dev); } } void device_links_supplier_sync_state_pause(void) { device_links_write_lock(); defer_sync_state_count++; device_links_write_unlock(); } void device_links_supplier_sync_state_resume(void) { struct device *dev, *tmp; LIST_HEAD(sync_list); device_links_write_lock(); if (!defer_sync_state_count) { WARN(true, "Unmatched sync_state pause/resume!"); goto out; } defer_sync_state_count--; if (defer_sync_state_count) goto out; list_for_each_entry_safe(dev, tmp, &deferred_sync, links.defer_sync) { /* * Delete from deferred_sync list before queuing it to * sync_list because defer_sync is used for both lists. 
*/ list_del_init(&dev->links.defer_sync); __device_links_queue_sync_state(dev, &sync_list); } out: device_links_write_unlock(); device_links_flush_sync_list(&sync_list, NULL); } static int sync_state_resume_initcall(void) { device_links_supplier_sync_state_resume(); return 0; } late_initcall(sync_state_resume_initcall); static void __device_links_supplier_defer_sync(struct device *sup) { if (list_empty(&sup->links.defer_sync) && dev_has_sync_state(sup)) list_add_tail(&sup->links.defer_sync, &deferred_sync); } static void device_link_drop_managed(struct device_link *link) { link->flags &= ~DL_FLAG_MANAGED; WRITE_ONCE(link->status, DL_STATE_NONE); kref_put(&link->kref, __device_link_del); } static ssize_t waiting_for_supplier_show(struct device *dev, struct device_attribute *attr, char *buf) { bool val; device_lock(dev); scoped_guard(mutex, &fwnode_link_lock) val = !!fwnode_links_check_suppliers(dev->fwnode); device_unlock(dev); return sysfs_emit(buf, "%u\n", val); } static DEVICE_ATTR_RO(waiting_for_supplier); /** * device_links_force_bind - Prepares device to be force bound * @dev: Consumer device. * * device_bind_driver() force binds a device to a driver without calling any * driver probe functions. So the consumer really isn't going to wait for any * supplier before it's bound to the driver. We still want the device link * states to be sensible when this happens. * * In preparation for device_bind_driver(), this function goes through each * supplier device links and checks if the supplier is bound. If it is, then * the device link status is set to CONSUMER_PROBE. Otherwise, the device link * is dropped. Links without the DL_FLAG_MANAGED flag set are ignored. */ void device_links_force_bind(struct device *dev) { struct device_link *link, *ln; device_links_write_lock(); list_for_each_entry_safe(link, ln, &dev->links.suppliers, c_node) { if (!device_link_test(link, DL_FLAG_MANAGED)) continue; if (link->status != DL_STATE_AVAILABLE) { device_link_drop_managed(link); continue; } WRITE_ONCE(link->status, DL_STATE_CONSUMER_PROBE); } dev->links.status = DL_DEV_PROBING; device_links_write_unlock(); } /** * device_links_driver_bound - Update device links after probing its driver. * @dev: Device to update the links for. * * The probe has been successful, so update links from this device to any * consumers by changing their status to "available". * * Also change the status of @dev's links to suppliers to "active". * * Links without the DL_FLAG_MANAGED flag set are ignored. */ void device_links_driver_bound(struct device *dev) { struct device_link *link, *ln; LIST_HEAD(sync_list); /* * If a device binds successfully, it's expected to have created all * the device links it needs to or make new device links as it needs * them. So, fw_devlink no longer needs to create device links to any * of the device's suppliers. * * Also, if a child firmware node of this bound device is not added as a * device by now, assume it is never going to be added. Make this bound * device the fallback supplier to the dangling consumers of the child * firmware node because this bound device is probably implementing the * child firmware node functionality and we don't want the dangling * consumers to defer probe indefinitely waiting for a device for the * child firmware node. 
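	 *
	 * (Note: the block right below only does this when @dev is the
	 * primary device of its firmware node, i.e. when dev->fwnode->dev
	 * points back to @dev, presumably so that only one device picks up
	 * those dangling consumers.)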
	 */
	if (dev->fwnode && dev->fwnode->dev == dev) {
		struct fwnode_handle *child;

		fwnode_links_purge_suppliers(dev->fwnode);

		guard(mutex)(&fwnode_link_lock);

		fwnode_for_each_available_child_node(dev->fwnode, child)
			__fw_devlink_pickup_dangling_consumers(child,
							       dev->fwnode);
		__fw_devlink_link_to_consumers(dev);
	}
	device_remove_file(dev, &dev_attr_waiting_for_supplier);

	device_links_write_lock();

	list_for_each_entry(link, &dev->links.consumers, s_node) {
		if (!device_link_test(link, DL_FLAG_MANAGED))
			continue;

		/*
		 * Links created during consumer probe may be in the "consumer
		 * probe" state to start with if the supplier is still probing
		 * when they are created and they may become "active" if the
		 * consumer probe returns first. Skip them here.
		 */
		if (link->status == DL_STATE_CONSUMER_PROBE ||
		    link->status == DL_STATE_ACTIVE)
			continue;

		WARN_ON(link->status != DL_STATE_DORMANT);
		WRITE_ONCE(link->status, DL_STATE_AVAILABLE);

		if (device_link_test(link, DL_FLAG_AUTOPROBE_CONSUMER))
			driver_deferred_probe_add(link->consumer);
	}

	if (defer_sync_state_count)
		__device_links_supplier_defer_sync(dev);
	else
		__device_links_queue_sync_state(dev, &sync_list);

	list_for_each_entry_safe(link, ln, &dev->links.suppliers, c_node) {
		struct device *supplier;

		if (!device_link_test(link, DL_FLAG_MANAGED))
			continue;

		supplier = link->supplier;
		if (device_link_test(link, DL_FLAG_SYNC_STATE_ONLY)) {
			/*
			 * When DL_FLAG_SYNC_STATE_ONLY is set, it means no
			 * other DL_MANAGED_LINK_FLAGS have been set. So, it's
			 * safe to drop the managed link completely.
			 */
			device_link_drop_managed(link);
		} else if (dev_is_best_effort(dev) &&
			   device_link_test(link, DL_FLAG_INFERRED) &&
			   link->status != DL_STATE_CONSUMER_PROBE &&
			   !link->supplier->can_match) {
			/*
			 * When dev_is_best_effort() is true, we ignore device
			 * links to suppliers that don't have a driver. If the
			 * consumer device still managed to probe, there's no
			 * point in maintaining a device link in a weird state
			 * (consumer probed before supplier). So delete it.
			 */
			device_link_drop_managed(link);
		} else {
			WARN_ON(link->status != DL_STATE_CONSUMER_PROBE);
			WRITE_ONCE(link->status, DL_STATE_ACTIVE);
		}

		/*
		 * This needs to be done even for the deleted
		 * DL_FLAG_SYNC_STATE_ONLY device link in case it was the last
		 * device link that was preventing the supplier from getting a
		 * sync_state() call.
		 */
		if (defer_sync_state_count)
			__device_links_supplier_defer_sync(supplier);
		else
			__device_links_queue_sync_state(supplier, &sync_list);
	}

	dev->links.status = DL_DEV_DRIVER_BOUND;

	device_links_write_unlock();

	device_links_flush_sync_list(&sync_list, dev);
}

/**
 * __device_links_no_driver - Update links of a device without a driver.
 * @dev: Device without a driver.
 *
 * Delete all non-persistent links from this device to any suppliers.
 *
 * Persistent links stay around, but their status is changed to "available",
 * unless they already are in the "supplier unbind in progress" state in which
 * case they need not be updated.
 *
 * Links without the DL_FLAG_MANAGED flag set are ignored.
*/ static void __device_links_no_driver(struct device *dev) { struct device_link *link, *ln; list_for_each_entry_safe_reverse(link, ln, &dev->links.suppliers, c_node) { if (!device_link_test(link, DL_FLAG_MANAGED)) continue; if (device_link_test(link, DL_FLAG_AUTOREMOVE_CONSUMER)) { device_link_drop_managed(link); continue; } if (link->status != DL_STATE_CONSUMER_PROBE && link->status != DL_STATE_ACTIVE) continue; if (link->supplier->links.status == DL_DEV_DRIVER_BOUND) { WRITE_ONCE(link->status, DL_STATE_AVAILABLE); } else { WARN_ON(!device_link_test(link, DL_FLAG_SYNC_STATE_ONLY)); WRITE_ONCE(link->status, DL_STATE_DORMANT); } } dev->links.status = DL_DEV_NO_DRIVER; } /** * device_links_no_driver - Update links after failing driver probe. * @dev: Device whose driver has just failed to probe. * * Clean up leftover links to consumers for @dev and invoke * %__device_links_no_driver() to update links to suppliers for it as * appropriate. * * Links without the DL_FLAG_MANAGED flag set are ignored. */ void device_links_no_driver(struct device *dev) { struct device_link *link; device_links_write_lock(); list_for_each_entry(link, &dev->links.consumers, s_node) { if (!device_link_test(link, DL_FLAG_MANAGED)) continue; /* * The probe has failed, so if the status of the link is * "consumer probe" or "active", it must have been added by * a probing consumer while this device was still probing. * Change its state to "dormant", as it represents a valid * relationship, but it is not functionally meaningful. */ if (link->status == DL_STATE_CONSUMER_PROBE || link->status == DL_STATE_ACTIVE) WRITE_ONCE(link->status, DL_STATE_DORMANT); } __device_links_no_driver(dev); device_links_write_unlock(); } /** * device_links_driver_cleanup - Update links after driver removal. * @dev: Device whose driver has just gone away. * * Update links to consumers for @dev by changing their status to "dormant" and * invoke %__device_links_no_driver() to update links to suppliers for it as * appropriate. * * Links without the DL_FLAG_MANAGED flag set are ignored. */ void device_links_driver_cleanup(struct device *dev) { struct device_link *link, *ln; device_links_write_lock(); list_for_each_entry_safe(link, ln, &dev->links.consumers, s_node) { if (!device_link_test(link, DL_FLAG_MANAGED)) continue; WARN_ON(device_link_test(link, DL_FLAG_AUTOREMOVE_CONSUMER)); WARN_ON(link->status != DL_STATE_SUPPLIER_UNBIND); /* * autoremove the links between this @dev and its consumer * devices that are not active, i.e. where the link state * has moved to DL_STATE_SUPPLIER_UNBIND. */ if (link->status == DL_STATE_SUPPLIER_UNBIND && device_link_test(link, DL_FLAG_AUTOREMOVE_SUPPLIER)) device_link_drop_managed(link); WRITE_ONCE(link->status, DL_STATE_DORMANT); } list_del_init(&dev->links.defer_sync); __device_links_no_driver(dev); device_links_write_unlock(); } /** * device_links_busy - Check if there are any busy links to consumers. * @dev: Device to check. * * Check each consumer of the device and return 'true' if its link's status * is one of "consumer probe" or "active" (meaning that the given consumer is * probing right now or its driver is present). Otherwise, change the link * state to "supplier unbind" to prevent the consumer from being probed * successfully going forward. * * Return 'false' if there are no probing or active consumers. * * Links without the DL_FLAG_MANAGED flag set are ignored. 
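 *
 * A hedged sketch of the intended pairing with device_links_unbind_consumers()
 * (the real call site lives in the driver unbind path and differs in detail,
 * e.g. it re-checks that the driver is still bound after relocking):
 *
 *        if (device_links_busy(dev)) {
 *                device_unlock(dev);
 *                device_links_unbind_consumers(dev);
 *                device_lock(dev);
 *        }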
*/ bool device_links_busy(struct device *dev) { struct device_link *link; bool ret = false; device_links_write_lock(); list_for_each_entry(link, &dev->links.consumers, s_node) { if (!device_link_test(link, DL_FLAG_MANAGED)) continue; if (link->status == DL_STATE_CONSUMER_PROBE || link->status == DL_STATE_ACTIVE) { ret = true; break; } WRITE_ONCE(link->status, DL_STATE_SUPPLIER_UNBIND); } dev->links.status = DL_DEV_UNBINDING; device_links_write_unlock(); return ret; } /** * device_links_unbind_consumers - Force unbind consumers of the given device. * @dev: Device to unbind the consumers of. * * Walk the list of links to consumers for @dev and if any of them is in the * "consumer probe" state, wait for all device probes in progress to complete * and start over. * * If that's not the case, change the status of the link to "supplier unbind" * and check if the link was in the "active" state. If so, force the consumer * driver to unbind and start over (the consumer will not re-probe as we have * changed the state of the link already). * * Links without the DL_FLAG_MANAGED flag set are ignored. */ void device_links_unbind_consumers(struct device *dev) { struct device_link *link; start: device_links_write_lock(); list_for_each_entry(link, &dev->links.consumers, s_node) { enum device_link_state status; if (!device_link_test(link, DL_FLAG_MANAGED) || device_link_test(link, DL_FLAG_SYNC_STATE_ONLY)) continue; status = link->status; if (status == DL_STATE_CONSUMER_PROBE) { device_links_write_unlock(); wait_for_device_probe(); goto start; } WRITE_ONCE(link->status, DL_STATE_SUPPLIER_UNBIND); if (status == DL_STATE_ACTIVE) { struct device *consumer = link->consumer; get_device(consumer); device_links_write_unlock(); device_release_driver_internal(consumer, NULL, consumer->parent); put_device(consumer); goto start; } } device_links_write_unlock(); } /** * device_links_purge - Delete existing links to other devices. * @dev: Target device. */ static void device_links_purge(struct device *dev) { struct device_link *link, *ln; if (dev->class == &devlink_class) return; /* * Delete all of the remaining links from this device to any other * devices (either consumers or suppliers). 
*/ device_links_write_lock(); list_for_each_entry_safe_reverse(link, ln, &dev->links.suppliers, c_node) { WARN_ON(link->status == DL_STATE_ACTIVE); __device_link_del(&link->kref); } list_for_each_entry_safe_reverse(link, ln, &dev->links.consumers, s_node) { WARN_ON(link->status != DL_STATE_DORMANT && link->status != DL_STATE_NONE); __device_link_del(&link->kref); } device_links_write_unlock(); } #define FW_DEVLINK_FLAGS_PERMISSIVE (DL_FLAG_INFERRED | \ DL_FLAG_SYNC_STATE_ONLY) #define FW_DEVLINK_FLAGS_ON (DL_FLAG_INFERRED | \ DL_FLAG_AUTOPROBE_CONSUMER) #define FW_DEVLINK_FLAGS_RPM (FW_DEVLINK_FLAGS_ON | \ DL_FLAG_PM_RUNTIME) static u32 fw_devlink_flags = FW_DEVLINK_FLAGS_RPM; static int __init fw_devlink_setup(char *arg) { if (!arg) return -EINVAL; if (strcmp(arg, "off") == 0) { fw_devlink_flags = 0; } else if (strcmp(arg, "permissive") == 0) { fw_devlink_flags = FW_DEVLINK_FLAGS_PERMISSIVE; } else if (strcmp(arg, "on") == 0) { fw_devlink_flags = FW_DEVLINK_FLAGS_ON; } else if (strcmp(arg, "rpm") == 0) { fw_devlink_flags = FW_DEVLINK_FLAGS_RPM; } return 0; } early_param("fw_devlink", fw_devlink_setup); static bool fw_devlink_strict; static int __init fw_devlink_strict_setup(char *arg) { return kstrtobool(arg, &fw_devlink_strict); } early_param("fw_devlink.strict", fw_devlink_strict_setup); #define FW_DEVLINK_SYNC_STATE_STRICT 0 #define FW_DEVLINK_SYNC_STATE_TIMEOUT 1 #ifndef CONFIG_FW_DEVLINK_SYNC_STATE_TIMEOUT static int fw_devlink_sync_state; #else static int fw_devlink_sync_state = FW_DEVLINK_SYNC_STATE_TIMEOUT; #endif static int __init fw_devlink_sync_state_setup(char *arg) { if (!arg) return -EINVAL; if (strcmp(arg, "strict") == 0) { fw_devlink_sync_state = FW_DEVLINK_SYNC_STATE_STRICT; return 0; } else if (strcmp(arg, "timeout") == 0) { fw_devlink_sync_state = FW_DEVLINK_SYNC_STATE_TIMEOUT; return 0; } return -EINVAL; } early_param("fw_devlink.sync_state", fw_devlink_sync_state_setup); static inline u32 fw_devlink_get_flags(u8 fwlink_flags) { if (fwlink_flags & FWLINK_FLAG_CYCLE) return FW_DEVLINK_FLAGS_PERMISSIVE | DL_FLAG_CYCLE; return fw_devlink_flags; } static bool fw_devlink_is_permissive(void) { return fw_devlink_flags == FW_DEVLINK_FLAGS_PERMISSIVE; } bool fw_devlink_is_strict(void) { return fw_devlink_strict && !fw_devlink_is_permissive(); } static void fw_devlink_parse_fwnode(struct fwnode_handle *fwnode) { if (fwnode_test_flag(fwnode, FWNODE_FLAG_LINKS_ADDED)) return; fwnode_call_int_op(fwnode, add_links); fwnode_set_flag(fwnode, FWNODE_FLAG_LINKS_ADDED); } static void fw_devlink_parse_fwtree(struct fwnode_handle *fwnode) { struct fwnode_handle *child = NULL; fw_devlink_parse_fwnode(fwnode); while ((child = fwnode_get_next_available_child_node(fwnode, child))) fw_devlink_parse_fwtree(child); } static void fw_devlink_relax_link(struct device_link *link) { if (!device_link_test(link, DL_FLAG_INFERRED)) return; if (device_link_flag_is_sync_state_only(link->flags)) return; pm_runtime_drop_link(link); link->flags = DL_FLAG_MANAGED | FW_DEVLINK_FLAGS_PERMISSIVE; dev_dbg(link->consumer, "Relaxing link with %s\n", dev_name(link->supplier)); } static int fw_devlink_no_driver(struct device *dev, void *data) { struct device_link *link = to_devlink(dev); if (!link->supplier->can_match) fw_devlink_relax_link(link); return 0; } void fw_devlink_drivers_done(void) { fw_devlink_drv_reg_done = true; device_links_write_lock(); class_for_each_device(&devlink_class, NULL, NULL, fw_devlink_no_driver); device_links_write_unlock(); } static int fw_devlink_dev_sync_state(struct device *dev, void 
*data)
{
	struct device_link *link = to_devlink(dev);
	struct device *sup = link->supplier;

	if (!device_link_test(link, DL_FLAG_MANAGED) ||
	    link->status == DL_STATE_ACTIVE || sup->state_synced ||
	    !dev_has_sync_state(sup))
		return 0;

	if (fw_devlink_sync_state == FW_DEVLINK_SYNC_STATE_STRICT) {
		dev_info(sup, "sync_state() pending due to %s\n",
			 dev_name(link->consumer));
		return 0;
	}

	if (!list_empty(&sup->links.defer_sync))
		return 0;

	dev_warn(sup, "Timed out. Forcing sync_state()\n");
	sup->state_synced = true;
	get_device(sup);
	list_add_tail(&sup->links.defer_sync, data);

	return 0;
}

void fw_devlink_probing_done(void)
{
	LIST_HEAD(sync_list);

	device_links_write_lock();
	class_for_each_device(&devlink_class, NULL, &sync_list,
			      fw_devlink_dev_sync_state);
	device_links_write_unlock();
	device_links_flush_sync_list(&sync_list, NULL);
}

/**
 * wait_for_init_devices_probe - Try to probe any device needed for init
 *
 * Some devices might need to be probed and bound successfully before the kernel
 * boot sequence can finish and move on to init/userspace. For example, a
 * network interface might need to be bound to be able to mount an NFS rootfs.
 *
 * With fw_devlink=on by default, some of these devices might be blocked from
 * probing because they are waiting on an optional supplier that doesn't have a
 * driver. While fw_devlink will eventually identify such devices and unblock
 * the probing automatically, it might be too late by the time it unblocks the
 * probing of devices. For example, the IP4 autoconfig might time out before
 * fw_devlink unblocks probing of the network interface.
 *
 * This function is available to temporarily try and probe all devices that have
 * a driver even if some of their suppliers haven't been added or don't have
 * drivers.
 *
 * The drivers can then decide which of the suppliers are optional vs mandatory
 * and probe the device if possible. By the time this function returns, all such
 * "best effort" probes are guaranteed to be completed. If a device successfully
 * probes in this mode, we delete all fw_devlink discovered dependencies of that
 * device where the supplier hasn't yet probed successfully because they have to
 * be optional dependencies.
 *
 * Any devices that didn't successfully probe go back to being treated as if
 * this function was never called.
 *
 * This also means that some devices that aren't needed for init and could have
 * waited for their optional supplier to probe (when the supplier's module is
 * loaded later on) would end up probing prematurely with limited functionality.
 * So call this function only when boot would fail without it.
 */
void __init wait_for_init_devices_probe(void)
{
	if (!fw_devlink_flags || fw_devlink_is_permissive())
		return;

	/*
	 * Wait for all ongoing probes to finish so that the "best effort" is
	 * only applied to devices that can't probe otherwise.
	 */
	wait_for_device_probe();

	pr_info("Trying to probe devices needed for running init ...\n");
	fw_devlink_best_effort = true;
	driver_deferred_probe_trigger();

	/*
	 * Wait for all "best effort" probes to finish before going back to
	 * normal enforcement.
	 */
	wait_for_device_probe();
	fw_devlink_best_effort = false;
}

static void fw_devlink_unblock_consumers(struct device *dev)
{
	struct device_link *link;

	if (!fw_devlink_flags || fw_devlink_is_permissive())
		return;

	device_links_write_lock();
	list_for_each_entry(link, &dev->links.consumers, s_node)
		fw_devlink_relax_link(link);
	device_links_write_unlock();
}

static bool fwnode_init_without_drv(struct fwnode_handle *fwnode)
{
	struct device *dev;
	bool ret;

	if (!fwnode_test_flag(fwnode, FWNODE_FLAG_INITIALIZED))
		return false;

	dev = get_dev_from_fwnode(fwnode);
	ret = !dev || dev->links.status == DL_DEV_NO_DRIVER;
	put_device(dev);

	return ret;
}

static bool fwnode_ancestor_init_without_drv(struct fwnode_handle *fwnode)
{
	struct fwnode_handle *parent;

	fwnode_for_each_parent_node(fwnode, parent) {
		if (fwnode_init_without_drv(parent)) {
			fwnode_handle_put(parent);
			return true;
		}
	}

	return false;
}

/**
 * fwnode_is_ancestor_of - Test if @ancestor is ancestor of @child
 * @ancestor: Firmware node which is tested for being an ancestor
 * @child: Firmware node which is tested for being the child
 *
 * A node is considered an ancestor of itself too.
 *
 * Return: true if @ancestor is an ancestor of @child. Otherwise, returns false.
 */
static bool fwnode_is_ancestor_of(const struct fwnode_handle *ancestor,
				  const struct fwnode_handle *child)
{
	struct fwnode_handle *parent;

	if (IS_ERR_OR_NULL(ancestor))
		return false;

	if (child == ancestor)
		return true;

	fwnode_for_each_parent_node(child, parent) {
		if (parent == ancestor) {
			fwnode_handle_put(parent);
			return true;
		}
	}

	return false;
}

/**
 * fwnode_get_next_parent_dev - Find device of closest ancestor fwnode
 * @fwnode: firmware node
 *
 * Given a firmware node (@fwnode), this function finds its closest ancestor
 * firmware node that has a corresponding struct device and returns that struct
 * device.
 *
 * The caller is responsible for calling put_device() on the returned device
 * pointer.
 *
 * Return: a pointer to the device of the @fwnode's closest ancestor.
 */
static struct device *fwnode_get_next_parent_dev(const struct fwnode_handle *fwnode)
{
	struct fwnode_handle *parent;
	struct device *dev;

	fwnode_for_each_parent_node(fwnode, parent) {
		dev = get_dev_from_fwnode(parent);
		if (dev) {
			fwnode_handle_put(parent);
			return dev;
		}
	}

	return NULL;
}

/**
 * __fw_devlink_relax_cycles - Relax and mark dependency cycles.
 * @con_handle: Potential consumer device fwnode.
 * @sup_handle: Potential supplier's fwnode.
 *
 * Needs to be called with fwnode_lock and device link lock held.
 *
 * Check if @sup_handle or any of its ancestors or suppliers directly or
 * indirectly depend on @con_handle. This function can detect multiple cycles
 * between @sup_handle and @con_handle. When such dependency cycles are found,
 * convert all device links created solely by fw_devlink into SYNC_STATE_ONLY
 * device links. Also, mark all fwnode links in the cycle with
 * FWLINK_FLAG_CYCLE so that when they are converted into a device link in the
 * future, they are created as SYNC_STATE_ONLY device links. This is the
 * equivalent of doing fw_devlink=permissive just between the devices in the
 * cycle. We need to do this because, at this point, fw_devlink can't tell
 * which of these dependencies is not a real dependency.
 *
 * Return true if one or more cycles were found. Otherwise, return false.
*/ static bool __fw_devlink_relax_cycles(struct fwnode_handle *con_handle, struct fwnode_handle *sup_handle) { struct device *sup_dev = NULL, *par_dev = NULL, *con_dev = NULL; struct fwnode_link *link; struct device_link *dev_link; bool ret = false; if (!sup_handle) return false; /* * We aren't trying to find all cycles. Just a cycle between con and * sup_handle. */ if (fwnode_test_flag(sup_handle, FWNODE_FLAG_VISITED)) return false; fwnode_set_flag(sup_handle, FWNODE_FLAG_VISITED); /* Termination condition. */ if (sup_handle == con_handle) { pr_debug("----- cycle: start -----\n"); ret = true; goto out; } sup_dev = get_dev_from_fwnode(sup_handle); con_dev = get_dev_from_fwnode(con_handle); /* * If sup_dev is bound to a driver and @con hasn't started binding to a * driver, sup_dev can't be a consumer of @con. So, no need to check * further. */ if (sup_dev && sup_dev->links.status == DL_DEV_DRIVER_BOUND && con_dev && con_dev->links.status == DL_DEV_NO_DRIVER) { ret = false; goto out; } list_for_each_entry(link, &sup_handle->suppliers, c_hook) { if (link->flags & FWLINK_FLAG_IGNORE) continue; if (__fw_devlink_relax_cycles(con_handle, link->supplier)) { __fwnode_link_cycle(link); ret = true; } } /* * Give priority to device parent over fwnode parent to account for any * quirks in how fwnodes are converted to devices. */ if (sup_dev) par_dev = get_device(sup_dev->parent); else par_dev = fwnode_get_next_parent_dev(sup_handle); if (par_dev && __fw_devlink_relax_cycles(con_handle, par_dev->fwnode)) { pr_debug("%pfwf: cycle: child of %pfwf\n", sup_handle, par_dev->fwnode); ret = true; } if (!sup_dev) goto out; list_for_each_entry(dev_link, &sup_dev->links.suppliers, c_node) { /* * Ignore a SYNC_STATE_ONLY flag only if it wasn't marked as * such due to a cycle. */ if (device_link_flag_is_sync_state_only(dev_link->flags) && !device_link_test(dev_link, DL_FLAG_CYCLE)) continue; if (__fw_devlink_relax_cycles(con_handle, dev_link->supplier->fwnode)) { pr_debug("%pfwf: cycle: depends on %pfwf\n", sup_handle, dev_link->supplier->fwnode); fw_devlink_relax_link(dev_link); dev_link->flags |= DL_FLAG_CYCLE; ret = true; } } out: fwnode_clear_flag(sup_handle, FWNODE_FLAG_VISITED); put_device(sup_dev); put_device(con_dev); put_device(par_dev); return ret; } /** * fw_devlink_create_devlink - Create a device link from a consumer to fwnode * @con: consumer device for the device link * @sup_handle: fwnode handle of supplier * @link: fwnode link that's being converted to a device link * * This function will try to create a device link between the consumer device * @con and the supplier device represented by @sup_handle. * * The supplier has to be provided as a fwnode because incorrect cycles in * fwnode links can sometimes cause the supplier device to never be created. * This function detects such cases and returns an error if it cannot create a * device link from the consumer to a missing supplier. * * Returns, * 0 on successfully creating a device link * -EINVAL if the device link cannot be created as expected * -EAGAIN if the device link cannot be created right now, but it may be * possible to do that in the future */ static int fw_devlink_create_devlink(struct device *con, struct fwnode_handle *sup_handle, struct fwnode_link *link) { struct device *sup_dev; int ret = 0; u32 flags; if (link->flags & FWLINK_FLAG_IGNORE) return 0; /* * In some cases, a device P might also be a supplier to its child node * C. However, this would defer the probe of C until the probe of P * completes successfully. 
This is perfectly fine in the device driver * model. device_add() doesn't guarantee probe completion of the device * by the time it returns. * * However, there are a few drivers that assume C will finish probing * as soon as it's added and before P finishes probing. So, we provide * a flag to let fw_devlink know not to delay the probe of C until the * probe of P completes successfully. * * When such a flag is set, we can't create device links where P is the * supplier of C as that would delay the probe of C. */ if (fwnode_test_flag(sup_handle, FWNODE_FLAG_NEEDS_CHILD_BOUND_ON_ADD) && fwnode_is_ancestor_of(sup_handle, con->fwnode)) return -EINVAL; /* * Don't try to optimize by not calling the cycle detection logic under * certain conditions. There's always some corner case that won't get * detected. */ device_links_write_lock(); if (__fw_devlink_relax_cycles(link->consumer, sup_handle)) { __fwnode_link_cycle(link); pr_debug("----- cycle: end -----\n"); pr_info("%pfwf: Fixed dependency cycle(s) with %pfwf\n", link->consumer, sup_handle); } device_links_write_unlock(); if (con->fwnode == link->consumer) flags = fw_devlink_get_flags(link->flags); else flags = FW_DEVLINK_FLAGS_PERMISSIVE; if (fwnode_test_flag(sup_handle, FWNODE_FLAG_NOT_DEVICE)) sup_dev = fwnode_get_next_parent_dev(sup_handle); else sup_dev = get_dev_from_fwnode(sup_handle); if (sup_dev) { /* * If it's one of those drivers that don't actually bind to * their device using driver core, then don't wait on this * supplier device indefinitely. */ if (sup_dev->links.status == DL_DEV_NO_DRIVER && fwnode_test_flag(sup_handle, FWNODE_FLAG_INITIALIZED)) { dev_dbg(con, "Not linking %pfwf - dev might never probe\n", sup_handle); ret = -EINVAL; goto out; } if (con != sup_dev && !device_link_add(con, sup_dev, flags)) { dev_err(con, "Failed to create device link (0x%x) with supplier %s for %pfwf\n", flags, dev_name(sup_dev), link->consumer); ret = -EINVAL; } goto out; } /* * Supplier or supplier's ancestor already initialized without a struct * device or being probed by a driver. */ if (fwnode_init_without_drv(sup_handle) || fwnode_ancestor_init_without_drv(sup_handle)) { dev_dbg(con, "Not linking %pfwf - might never become dev\n", sup_handle); return -EINVAL; } ret = -EAGAIN; out: put_device(sup_dev); return ret; } /** * __fw_devlink_link_to_consumers - Create device links to consumers of a device * @dev: Device that needs to be linked to its consumers * * This function looks at all the consumer fwnodes of @dev and creates device * links between the consumer device and @dev (supplier). * * If the consumer device has not been added yet, then this function creates a * SYNC_STATE_ONLY link between @dev (supplier) and the closest ancestor device * of the consumer fwnode. This is necessary to make sure @dev doesn't get a * sync_state() callback before the real consumer device gets to be added and * then probed. * * Once device links are created from the real consumer to @dev (supplier), the * fwnode links are deleted. */ static void __fw_devlink_link_to_consumers(struct device *dev) { struct fwnode_handle *fwnode = dev->fwnode; struct fwnode_link *link, *tmp; list_for_each_entry_safe(link, tmp, &fwnode->consumers, s_hook) { struct device *con_dev; bool own_link = true; int ret; con_dev = get_dev_from_fwnode(link->consumer); /* * If consumer device is not available yet, make a "proxy" * SYNC_STATE_ONLY link from the consumer's parent device to * the supplier device. 
This is necessary to make sure the
		 * supplier doesn't get a sync_state() callback before the real
		 * consumer can create a device link to the supplier.
		 *
		 * This proxy link step is needed to handle the case where the
		 * consumer's parent device is added before the supplier.
		 */
		if (!con_dev) {
			con_dev = fwnode_get_next_parent_dev(link->consumer);
			/*
			 * However, if the consumer's parent device is also the
			 * parent of the supplier, don't create a
			 * consumer-supplier link from the parent to its child
			 * device. Such a dependency is impossible.
			 */
			if (con_dev &&
			    fwnode_is_ancestor_of(con_dev->fwnode, fwnode)) {
				put_device(con_dev);
				con_dev = NULL;
			} else {
				own_link = false;
			}
		}

		if (!con_dev)
			continue;

		ret = fw_devlink_create_devlink(con_dev, fwnode, link);
		put_device(con_dev);
		if (!own_link || ret == -EAGAIN)
			continue;

		__fwnode_link_del(link);
	}
}

/**
 * __fw_devlink_link_to_suppliers - Create device links to suppliers of a device
 * @dev: The consumer device that needs to be linked to its suppliers
 * @fwnode: Root of the fwnode tree that is used to create device links
 *
 * This function looks at all the supplier fwnodes of fwnode tree rooted at
 * @fwnode and creates device links between @dev (consumer) and all the
 * supplier devices of the entire fwnode tree at @fwnode.
 *
 * The function creates normal (non-SYNC_STATE_ONLY) device links between @dev
 * and the real suppliers of @dev. Once these device links are created, the
 * fwnode links are deleted.
 *
 * In addition, it also looks at all the suppliers of the entire fwnode tree
 * because some of the child devices of @dev that have not been added yet
 * (because @dev hasn't probed) might already have their suppliers added to
 * driver core. So, this function creates SYNC_STATE_ONLY device links between
 * @dev (consumer) and these suppliers to make sure they don't execute their
 * sync_state() callbacks before these child devices have a chance to create
 * their device links. The fwnode links that correspond to the child devices
 * aren't deleted because they are needed later to create the device links
 * between the real consumer and supplier devices.
 */
static void __fw_devlink_link_to_suppliers(struct device *dev,
					   struct fwnode_handle *fwnode)
{
	bool own_link = (dev->fwnode == fwnode);
	struct fwnode_link *link, *tmp;
	struct fwnode_handle *child = NULL;

	list_for_each_entry_safe(link, tmp, &fwnode->suppliers, c_hook) {
		int ret;
		struct fwnode_handle *sup = link->supplier;

		ret = fw_devlink_create_devlink(dev, sup, link);
		if (!own_link || ret == -EAGAIN)
			continue;

		__fwnode_link_del(link);
	}

	/*
	 * Make "proxy" SYNC_STATE_ONLY device links to represent the needs of
	 * all the descendants. This proxy link step is needed to handle the
	 * case where the supplier is added before the consumer's parent device
	 * (@dev).
	 */
	while ((child = fwnode_get_next_available_child_node(fwnode, child)))
		__fw_devlink_link_to_suppliers(dev, child);
}

static void fw_devlink_link_device(struct device *dev)
{
	struct fwnode_handle *fwnode = dev->fwnode;

	if (!fw_devlink_flags)
		return;

	fw_devlink_parse_fwtree(fwnode);

	guard(mutex)(&fwnode_link_lock);

	__fw_devlink_link_to_consumers(dev);
	__fw_devlink_link_to_suppliers(dev, fwnode);
}

/* Device links support end.
*/ static struct kobject *dev_kobj; /* /sys/dev/char */ static struct kobject *sysfs_dev_char_kobj; /* /sys/dev/block */ static struct kobject *sysfs_dev_block_kobj; static DEFINE_MUTEX(device_hotplug_lock); void lock_device_hotplug(void) { mutex_lock(&device_hotplug_lock); } void unlock_device_hotplug(void) { mutex_unlock(&device_hotplug_lock); } int lock_device_hotplug_sysfs(void) { if (mutex_trylock(&device_hotplug_lock)) return 0; /* Avoid busy looping (5 ms of sleep should do). */ msleep(5); return restart_syscall(); } #ifdef CONFIG_BLOCK static inline int device_is_not_partition(struct device *dev) { return !(dev->type == &part_type); } #else static inline int device_is_not_partition(struct device *dev) { return 1; } #endif static void device_platform_notify(struct device *dev) { acpi_device_notify(dev); software_node_notify(dev); } static void device_platform_notify_remove(struct device *dev) { software_node_notify_remove(dev); acpi_device_notify_remove(dev); } /** * dev_driver_string - Return a device's driver name, if at all possible * @dev: struct device to get the name of * * Will return the device's driver's name if it is bound to a device. If * the device is not bound to a driver, it will return the name of the bus * it is attached to. If it is not attached to a bus either, an empty * string will be returned. */ const char *dev_driver_string(const struct device *dev) { struct device_driver *drv; /* dev->driver can change to NULL underneath us because of unbinding, * so be careful about accessing it. dev->bus and dev->class should * never change once they are set, so they don't need special care. */ drv = READ_ONCE(dev->driver); return drv ? drv->name : dev_bus_name(dev); } EXPORT_SYMBOL(dev_driver_string); #define to_dev_attr(_attr) container_of(_attr, struct device_attribute, attr) static ssize_t dev_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct device_attribute *dev_attr = to_dev_attr(attr); struct device *dev = kobj_to_dev(kobj); ssize_t ret = -EIO; if (dev_attr->show) ret = dev_attr->show(dev, dev_attr, buf); if (ret >= (ssize_t)PAGE_SIZE) { printk("dev_attr_show: %pS returned bad count\n", dev_attr->show); } return ret; } static ssize_t dev_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct device_attribute *dev_attr = to_dev_attr(attr); struct device *dev = kobj_to_dev(kobj); ssize_t ret = -EIO; if (dev_attr->store) ret = dev_attr->store(dev, dev_attr, buf, count); return ret; } static const struct sysfs_ops dev_sysfs_ops = { .show = dev_attr_show, .store = dev_attr_store, }; #define to_ext_attr(x) container_of(x, struct dev_ext_attribute, attr) ssize_t device_store_ulong(struct device *dev, struct device_attribute *attr, const char *buf, size_t size) { struct dev_ext_attribute *ea = to_ext_attr(attr); int ret; unsigned long new; ret = kstrtoul(buf, 0, &new); if (ret) return ret; *(unsigned long *)(ea->var) = new; /* Always return full write size even if we didn't consume all */ return size; } EXPORT_SYMBOL_GPL(device_store_ulong); ssize_t device_show_ulong(struct device *dev, struct device_attribute *attr, char *buf) { struct dev_ext_attribute *ea = to_ext_attr(attr); return sysfs_emit(buf, "%lx\n", *(unsigned long *)(ea->var)); } EXPORT_SYMBOL_GPL(device_show_ulong); ssize_t device_store_int(struct device *dev, struct device_attribute *attr, const char *buf, size_t size) { struct dev_ext_attribute *ea = to_ext_attr(attr); int ret; long new; ret = kstrtol(buf, 0, &new); if (ret) return ret; if 
(new > INT_MAX || new < INT_MIN) return -EINVAL; *(int *)(ea->var) = new; /* Always return full write size even if we didn't consume all */ return size; } EXPORT_SYMBOL_GPL(device_store_int); ssize_t device_show_int(struct device *dev, struct device_attribute *attr, char *buf) { struct dev_ext_attribute *ea = to_ext_attr(attr); return sysfs_emit(buf, "%d\n", *(int *)(ea->var)); } EXPORT_SYMBOL_GPL(device_show_int); ssize_t device_store_bool(struct device *dev, struct device_attribute *attr, const char *buf, size_t size) { struct dev_ext_attribute *ea = to_ext_attr(attr); if (kstrtobool(buf, ea->var) < 0) return -EINVAL; return size; } EXPORT_SYMBOL_GPL(device_store_bool); ssize_t device_show_bool(struct device *dev, struct device_attribute *attr, char *buf) { struct dev_ext_attribute *ea = to_ext_attr(attr); return sysfs_emit(buf, "%d\n", *(bool *)(ea->var)); } EXPORT_SYMBOL_GPL(device_show_bool); ssize_t device_show_string(struct device *dev, struct device_attribute *attr, char *buf) { struct dev_ext_attribute *ea = to_ext_attr(attr); return sysfs_emit(buf, "%s\n", (char *)ea->var); } EXPORT_SYMBOL_GPL(device_show_string); /** * device_release - free device structure. * @kobj: device's kobject. * * This is called once the reference count for the object * reaches 0. We forward the call to the device's release * method, which should handle actually freeing the structure. */ static void device_release(struct kobject *kobj) { struct device *dev = kobj_to_dev(kobj); struct device_private *p = dev->p; /* * Some platform devices are driven without driver attached * and managed resources may have been acquired. Make sure * all resources are released. * * Drivers still can add resources into device after device * is deleted but alive, so release devres here to avoid * possible memory leak. */ devres_release_all(dev); kfree(dev->dma_range_map); kfree(dev->driver_override.name); if (dev->release) dev->release(dev); else if (dev->type && dev->type->release) dev->type->release(dev); else if (dev->class && dev->class->dev_release) dev->class->dev_release(dev); else WARN(1, KERN_ERR "Device '%s' does not have a release() function, it is broken and must be fixed. See Documentation/core-api/kobject.rst.\n", dev_name(dev)); kfree(p); } static const struct ns_common *device_namespace(const struct kobject *kobj) { const struct device *dev = kobj_to_dev(kobj); if (dev->class && dev->class->namespace) return dev->class->namespace(dev); return NULL; } static void device_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid) { const struct device *dev = kobj_to_dev(kobj); if (dev->class && dev->class->get_ownership) dev->class->get_ownership(dev, uid, gid); } static const struct kobj_type device_ktype = { .release = device_release, .sysfs_ops = &dev_sysfs_ops, .namespace = device_namespace, .get_ownership = device_get_ownership, }; static int dev_uevent_filter(const struct kobject *kobj) { const struct kobj_type *ktype = get_ktype(kobj); if (ktype == &device_ktype) { const struct device *dev = kobj_to_dev(kobj); if (dev->bus) return 1; if (dev->class) return 1; } return 0; } static const char *dev_uevent_name(const struct kobject *kobj) { const struct device *dev = kobj_to_dev(kobj); if (dev->bus) return dev->bus->name; if (dev->class) return dev->class->name; return NULL; } /* * Try filling "DRIVER=<name>" uevent variable for a device. Because this * function may race with binding and unbinding the device from a driver, * we need to be careful. 
Binding is generally safe, at worst we miss the * fact that the device is already bound to a driver (but the driver * information that is delivered through uevents is best-effort, it may * become obsolete as soon as it is generated anyways). Unbinding is more * risky as driver pointer is transitioning to NULL, so READ_ONCE() should * be used to make sure we are dealing with the same pointer, and to * ensure that driver structure is not going to disappear from under us * we take bus' drivers klist lock. The assumption that only registered * driver can be bound to a device, and to unregister a driver bus code * will take the same lock. */ static void dev_driver_uevent(const struct device *dev, struct kobj_uevent_env *env) { struct subsys_private *sp = bus_to_subsys(dev->bus); if (sp) { scoped_guard(spinlock, &sp->klist_drivers.k_lock) { struct device_driver *drv = READ_ONCE(dev->driver); if (drv) add_uevent_var(env, "DRIVER=%s", drv->name); } subsys_put(sp); } } static int dev_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) { const struct device *dev = kobj_to_dev(kobj); int retval = 0; /* add device node properties if present */ if (MAJOR(dev->devt)) { const char *tmp; const char *name; umode_t mode = 0; kuid_t uid = GLOBAL_ROOT_UID; kgid_t gid = GLOBAL_ROOT_GID; add_uevent_var(env, "MAJOR=%u", MAJOR(dev->devt)); add_uevent_var(env, "MINOR=%u", MINOR(dev->devt)); name = device_get_devnode(dev, &mode, &uid, &gid, &tmp); if (name) { add_uevent_var(env, "DEVNAME=%s", name); if (mode) add_uevent_var(env, "DEVMODE=%#o", mode & 0777); if (!uid_eq(uid, GLOBAL_ROOT_UID)) add_uevent_var(env, "DEVUID=%u", from_kuid(&init_user_ns, uid)); if (!gid_eq(gid, GLOBAL_ROOT_GID)) add_uevent_var(env, "DEVGID=%u", from_kgid(&init_user_ns, gid)); kfree(tmp); } } if (dev->type && dev->type->name) add_uevent_var(env, "DEVTYPE=%s", dev->type->name); /* Add "DRIVER=%s" variable if the device is bound to a driver */ dev_driver_uevent(dev, env); /* Add common DT information about the device */ of_device_uevent(dev, env); /* have the bus specific function add its stuff */ if (dev->bus && dev->bus->uevent) { retval = dev->bus->uevent(dev, env); if (retval) pr_debug("device: '%s': %s: bus uevent() returned %d\n", dev_name(dev), __func__, retval); } /* have the class specific function add its stuff */ if (dev->class && dev->class->dev_uevent) { retval = dev->class->dev_uevent(dev, env); if (retval) pr_debug("device: '%s': %s: class uevent() " "returned %d\n", dev_name(dev), __func__, retval); } /* have the device type specific function add its stuff */ if (dev->type && dev->type->uevent) { retval = dev->type->uevent(dev, env); if (retval) pr_debug("device: '%s': %s: dev_type uevent() " "returned %d\n", dev_name(dev), __func__, retval); } return retval; } static const struct kset_uevent_ops device_uevent_ops = { .filter = dev_uevent_filter, .name = dev_uevent_name, .uevent = dev_uevent, }; static ssize_t uevent_show(struct device *dev, struct device_attribute *attr, char *buf) { struct kobject *top_kobj; struct kset *kset; struct kobj_uevent_env *env = NULL; int i; int len = 0; int retval; /* search the kset, the device belongs to */ top_kobj = &dev->kobj; while (!top_kobj->kset && top_kobj->parent) top_kobj = top_kobj->parent; if (!top_kobj->kset) goto out; kset = top_kobj->kset; if (!kset->uevent_ops || !kset->uevent_ops->uevent) goto out; /* respect filter */ if (kset->uevent_ops && kset->uevent_ops->filter) if (!kset->uevent_ops->filter(&dev->kobj)) goto out; env = kzalloc_obj(struct kobj_uevent_env); 
if (!env) return -ENOMEM; /* let the kset specific function add its keys */ retval = kset->uevent_ops->uevent(&dev->kobj, env); if (retval) goto out; /* copy keys to file */ for (i = 0; i < env->envp_idx; i++) len += sysfs_emit_at(buf, len, "%s\n", env->envp[i]); out: kfree(env); return len; } static ssize_t uevent_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int rc; rc = kobject_synth_uevent(&dev->kobj, buf, count); if (rc) { dev_err(dev, "uevent: failed to send synthetic uevent: %d\n", rc); return rc; } return count; } static DEVICE_ATTR_RW(uevent); static ssize_t online_show(struct device *dev, struct device_attribute *attr, char *buf) { bool val; device_lock(dev); val = !dev->offline; device_unlock(dev); return sysfs_emit(buf, "%u\n", val); } static ssize_t online_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { bool val; int ret; ret = kstrtobool(buf, &val); if (ret < 0) return ret; ret = lock_device_hotplug_sysfs(); if (ret) return ret; ret = val ? device_online(dev) : device_offline(dev); unlock_device_hotplug(); return ret < 0 ? ret : count; } static DEVICE_ATTR_RW(online); static ssize_t removable_show(struct device *dev, struct device_attribute *attr, char *buf) { const char *loc; switch (dev->removable) { case DEVICE_REMOVABLE: loc = "removable"; break; case DEVICE_FIXED: loc = "fixed"; break; default: loc = "unknown"; } return sysfs_emit(buf, "%s\n", loc); } static DEVICE_ATTR_RO(removable); int device_add_groups(struct device *dev, const struct attribute_group *const *groups) { return sysfs_create_groups(&dev->kobj, groups); } EXPORT_SYMBOL_GPL(device_add_groups); void device_remove_groups(struct device *dev, const struct attribute_group *const *groups) { sysfs_remove_groups(&dev->kobj, groups); } EXPORT_SYMBOL_GPL(device_remove_groups); union device_attr_group_devres { const struct attribute_group *group; const struct attribute_group **groups; }; static void devm_attr_group_remove(struct device *dev, void *res) { union device_attr_group_devres *devres = res; const struct attribute_group *group = devres->group; dev_dbg(dev, "%s: removing group %p\n", __func__, group); sysfs_remove_group(&dev->kobj, group); } /** * devm_device_add_group - given a device, create a managed attribute group * @dev: The device to create the group for * @grp: The attribute group to create * * This function creates a group for the first time. It will explicitly * warn and error if any of the attribute files being created already exist. * * Returns 0 on success or error code on failure. 
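 *
 * A minimal sketch of typical use from a driver's probe path (the "foo"
 * names are placeholders, not an existing interface):
 *
 *        static ssize_t foo_show(struct device *dev,
 *                                struct device_attribute *attr, char *buf)
 *        {
 *                return sysfs_emit(buf, "example\n");
 *        }
 *        static DEVICE_ATTR_RO(foo);
 *
 *        static struct attribute *foo_attrs[] = {
 *                &dev_attr_foo.attr,
 *                NULL
 *        };
 *        static const struct attribute_group foo_group = {
 *                .attrs = foo_attrs,
 *        };
 *
 *        err = devm_device_add_group(dev, &foo_group);
 *        if (err)
 *                return err;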
*/ int devm_device_add_group(struct device *dev, const struct attribute_group *grp) { union device_attr_group_devres *devres; int error; devres = devres_alloc(devm_attr_group_remove, sizeof(*devres), GFP_KERNEL); if (!devres) return -ENOMEM; error = sysfs_create_group(&dev->kobj, grp); if (error) { devres_free(devres); return error; } devres->group = grp; devres_add(dev, devres); return 0; } EXPORT_SYMBOL_GPL(devm_device_add_group); static int device_add_attrs(struct device *dev) { const struct class *class = dev->class; const struct device_type *type = dev->type; int error; if (class) { error = device_add_groups(dev, class->dev_groups); if (error) return error; } if (type) { error = device_add_groups(dev, type->groups); if (error) goto err_remove_class_groups; } error = device_add_groups(dev, dev->groups); if (error) goto err_remove_type_groups; if (device_supports_offline(dev) && !dev->offline_disabled) { error = device_create_file(dev, &dev_attr_online); if (error) goto err_remove_dev_groups; } if (fw_devlink_flags && !fw_devlink_is_permissive() && dev->fwnode) { error = device_create_file(dev, &dev_attr_waiting_for_supplier); if (error) goto err_remove_dev_online; } if (dev_removable_is_valid(dev)) { error = device_create_file(dev, &dev_attr_removable); if (error) goto err_remove_dev_waiting_for_supplier; } if (dev_add_physical_location(dev)) { error = device_add_group(dev, &dev_attr_physical_location_group); if (error) goto err_remove_dev_removable; } return 0; err_remove_dev_removable: device_remove_file(dev, &dev_attr_removable); err_remove_dev_waiting_for_supplier: device_remove_file(dev, &dev_attr_waiting_for_supplier); err_remove_dev_online: device_remove_file(dev, &dev_attr_online); err_remove_dev_groups: device_remove_groups(dev, dev->groups); err_remove_type_groups: if (type) device_remove_groups(dev, type->groups); err_remove_class_groups: if (class) device_remove_groups(dev, class->dev_groups); return error; } static void device_remove_attrs(struct device *dev) { const struct class *class = dev->class; const struct device_type *type = dev->type; if (dev->physical_location) { device_remove_group(dev, &dev_attr_physical_location_group); kfree(dev->physical_location); } device_remove_file(dev, &dev_attr_removable); device_remove_file(dev, &dev_attr_waiting_for_supplier); device_remove_file(dev, &dev_attr_online); device_remove_groups(dev, dev->groups); if (type) device_remove_groups(dev, type->groups); if (class) device_remove_groups(dev, class->dev_groups); } static ssize_t dev_show(struct device *dev, struct device_attribute *attr, char *buf) { return print_dev_t(buf, dev->devt); } static DEVICE_ATTR_RO(dev); /* /sys/devices/ */ struct kset *devices_kset; /** * devices_kset_move_before - Move device in the devices_kset's list. * @deva: Device to move. * @devb: Device @deva should come before. */ static void devices_kset_move_before(struct device *deva, struct device *devb) { if (!devices_kset) return; pr_debug("devices_kset: Moving %s before %s\n", dev_name(deva), dev_name(devb)); spin_lock(&devices_kset->list_lock); list_move_tail(&deva->kobj.entry, &devb->kobj.entry); spin_unlock(&devices_kset->list_lock); } /** * devices_kset_move_after - Move device in the devices_kset's list. * @deva: Device to move * @devb: Device @deva should come after. 
*/ static void devices_kset_move_after(struct device *deva, struct device *devb) { if (!devices_kset) return; pr_debug("devices_kset: Moving %s after %s\n", dev_name(deva), dev_name(devb)); spin_lock(&devices_kset->list_lock); list_move(&deva->kobj.entry, &devb->kobj.entry); spin_unlock(&devices_kset->list_lock); } /** * devices_kset_move_last - move the device to the end of devices_kset's list. * @dev: device to move */ void devices_kset_move_last(struct device *dev) { if (!devices_kset) return; pr_debug("devices_kset: Moving %s to end of list\n", dev_name(dev)); spin_lock(&devices_kset->list_lock); list_move_tail(&dev->kobj.entry, &devices_kset->list); spin_unlock(&devices_kset->list_lock); } /** * device_create_file - create sysfs attribute file for device. * @dev: device. * @attr: device attribute descriptor. */ int device_create_file(struct device *dev, const struct device_attribute *attr) { int error = 0; if (dev) { WARN(((attr->attr.mode & S_IWUGO) && !attr->store), "Attribute %s: write permission without 'store'\n", attr->attr.name); WARN(((attr->attr.mode & S_IRUGO) && !attr->show), "Attribute %s: read permission without 'show'\n", attr->attr.name); error = sysfs_create_file(&dev->kobj, &attr->attr); } return error; } EXPORT_SYMBOL_GPL(device_create_file); /** * device_remove_file - remove sysfs attribute file. * @dev: device. * @attr: device attribute descriptor. */ void device_remove_file(struct device *dev, const struct device_attribute *attr) { if (dev) sysfs_remove_file(&dev->kobj, &attr->attr); } EXPORT_SYMBOL_GPL(device_remove_file); /** * device_remove_file_self - remove sysfs attribute file from its own method. * @dev: device. * @attr: device attribute descriptor. * * See kernfs_remove_self() for details. */ bool device_remove_file_self(struct device *dev, const struct device_attribute *attr) { if (dev) return sysfs_remove_file_self(&dev->kobj, &attr->attr); else return false; } EXPORT_SYMBOL_GPL(device_remove_file_self); /** * device_create_bin_file - create sysfs binary attribute file for device. * @dev: device. * @attr: device binary attribute descriptor. */ int device_create_bin_file(struct device *dev, const struct bin_attribute *attr) { int error = -EINVAL; if (dev) error = sysfs_create_bin_file(&dev->kobj, attr); return error; } EXPORT_SYMBOL_GPL(device_create_bin_file); /** * device_remove_bin_file - remove sysfs binary attribute file * @dev: device. * @attr: device binary attribute descriptor. */ void device_remove_bin_file(struct device *dev, const struct bin_attribute *attr) { if (dev) sysfs_remove_bin_file(&dev->kobj, attr); } EXPORT_SYMBOL_GPL(device_remove_bin_file); static void klist_children_get(struct klist_node *n) { struct device_private *p = to_device_private_parent(n); struct device *dev = p->device; get_device(dev); } static void klist_children_put(struct klist_node *n) { struct device_private *p = to_device_private_parent(n); struct device *dev = p->device; put_device(dev); } /** * device_initialize - init device structure. * @dev: device. * * This prepares the device for use by other layers by initializing * its fields. * It is the first half of device_register(), if called by * that function, though it can also be called separately, so one * may use @dev's fields. In particular, get_device()/put_device() * may be used for reference counting of @dev after calling this * function. * * All fields in @dev must be initialized by the caller to 0, except * for those explicitly set to some other value. 
The simplest * approach is to use kzalloc() to allocate the structure containing * @dev. * * NOTE: Use put_device() to give up your reference instead of freeing * @dev directly once you have called this function. */ void device_initialize(struct device *dev) { dev->kobj.kset = devices_kset; kobject_init(&dev->kobj, &device_ktype); INIT_LIST_HEAD(&dev->dma_pools); mutex_init(&dev->mutex); spin_lock_init(&dev->driver_override.lock); lockdep_set_novalidate_class(&dev->mutex); spin_lock_init(&dev->devres_lock); INIT_LIST_HEAD(&dev->devres_head); device_pm_init(dev); set_dev_node(dev, NUMA_NO_NODE); INIT_LIST_HEAD(&dev->links.consumers); INIT_LIST_HEAD(&dev->links.suppliers); INIT_LIST_HEAD(&dev->links.defer_sync); dev->links.status = DL_DEV_NO_DRIVER; #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) dev->dma_coherent = dma_default_coherent; #endif swiotlb_dev_init(dev); } EXPORT_SYMBOL_GPL(device_initialize); struct kobject *virtual_device_parent(void) { static struct kobject *virtual_dir = NULL; if (!virtual_dir) virtual_dir = kobject_create_and_add("virtual", &devices_kset->kobj); return virtual_dir; } struct class_dir { struct kobject kobj; const struct class *class; }; #define to_class_dir(obj) container_of(obj, struct class_dir, kobj) static void class_dir_release(struct kobject *kobj) { struct class_dir *dir = to_class_dir(kobj); kfree(dir); } static const struct kobj_ns_type_operations *class_dir_child_ns_type(const struct kobject *kobj) { const struct class_dir *dir = to_class_dir(kobj); return dir->class->ns_type; } static const struct kobj_type class_dir_ktype = { .release = class_dir_release, .sysfs_ops = &kobj_sysfs_ops, .child_ns_type = class_dir_child_ns_type }; static struct kobject *class_dir_create_and_add(struct subsys_private *sp, struct kobject *parent_kobj) { struct class_dir *dir; int retval; dir = kzalloc_obj(*dir); if (!dir) return ERR_PTR(-ENOMEM); dir->class = sp->class; kobject_init(&dir->kobj, &class_dir_ktype); dir->kobj.kset = &sp->glue_dirs; retval = kobject_add(&dir->kobj, parent_kobj, "%s", sp->class->name); if (retval < 0) { kobject_put(&dir->kobj); return ERR_PTR(retval); } return &dir->kobj; } static DEFINE_MUTEX(gdp_mutex); static struct kobject *get_device_parent(struct device *dev, struct device *parent) { struct subsys_private *sp = class_to_subsys(dev->class); struct kobject *kobj = NULL; if (sp) { struct kobject *parent_kobj; struct kobject *k; /* * If we have no parent, we live in "virtual". * Class-devices with a non class-device as parent, live * in a "glue" directory to prevent namespace collisions. 
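		 *
		 * As a rough illustration (paths are examples, not guaranteed
		 * layouts): an input-class device whose parent is a serio
		 * port ends up under .../serio0/input/inputN, where the
		 * intermediate "input" directory is such a glue dir.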
*/ if (parent == NULL) parent_kobj = virtual_device_parent(); else if (parent->class && !dev->class->ns_type) { subsys_put(sp); return &parent->kobj; } else { parent_kobj = &parent->kobj; } mutex_lock(&gdp_mutex); /* find our class-directory at the parent and reference it */ spin_lock(&sp->glue_dirs.list_lock); list_for_each_entry(k, &sp->glue_dirs.list, entry) if (k->parent == parent_kobj) { kobj = kobject_get(k); break; } spin_unlock(&sp->glue_dirs.list_lock); if (kobj) { mutex_unlock(&gdp_mutex); subsys_put(sp); return kobj; } /* or create a new class-directory at the parent device */ k = class_dir_create_and_add(sp, parent_kobj); /* do not emit an uevent for this simple "glue" directory */ mutex_unlock(&gdp_mutex); subsys_put(sp); return k; } /* subsystems can specify a default root directory for their devices */ if (!parent && dev->bus) { struct device *dev_root = bus_get_dev_root(dev->bus); if (dev_root) { kobj = &dev_root->kobj; put_device(dev_root); return kobj; } } if (parent) return &parent->kobj; return NULL; } static inline bool live_in_glue_dir(struct kobject *kobj, struct device *dev) { struct subsys_private *sp; bool retval; if (!kobj || !dev->class) return false; sp = class_to_subsys(dev->class); if (!sp) return false; if (kobj->kset == &sp->glue_dirs) retval = true; else retval = false; subsys_put(sp); return retval; } static inline struct kobject *get_glue_dir(struct device *dev) { return dev->kobj.parent; } /** * kobject_has_children - Returns whether a kobject has children. * @kobj: the object to test * * This will return whether a kobject has other kobjects as children. * * It does NOT account for the presence of attribute files, only sub * directories. It also assumes there is no concurrent addition or * removal of such children, and thus relies on external locking. */ static inline bool kobject_has_children(struct kobject *kobj) { WARN_ON_ONCE(kref_read(&kobj->kref) == 0); return kobj->sd && kobj->sd->dir.subdirs; } /* * make sure cleaning up dir as the last step, we need to make * sure .release handler of kobject is run with holding the * global lock */ static void cleanup_glue_dir(struct device *dev, struct kobject *glue_dir) { unsigned int ref; /* see if we live in a "glue" directory */ if (!live_in_glue_dir(glue_dir, dev)) return; mutex_lock(&gdp_mutex); /** * There is a race condition between removing glue directory * and adding a new device under the glue directory. * * CPU1: CPU2: * * device_add() * get_device_parent() * class_dir_create_and_add() * kobject_add_internal() * create_dir() // create glue_dir * * device_add() * get_device_parent() * kobject_get() // get glue_dir * * device_del() * cleanup_glue_dir() * kobject_del(glue_dir) * * kobject_add() * kobject_add_internal() * create_dir() // in glue_dir * sysfs_create_dir_ns() * kernfs_create_dir_ns(sd) * * sysfs_remove_dir() // glue_dir->sd=NULL * sysfs_put() // free glue_dir->sd * * // sd is freed * kernfs_new_node(sd) * kernfs_get(glue_dir) * kernfs_add_one() * kernfs_put() * * Before CPU1 remove last child device under glue dir, if CPU2 add * a new device under glue dir, the glue_dir kobject reference count * will be increase to 2 in kobject_get(k). And CPU2 has been called * kernfs_create_dir_ns(). Meanwhile, CPU1 call sysfs_remove_dir() * and sysfs_put(). This result in glue_dir->sd is freed. * * Then the CPU2 will see a stale "empty" but still potentially used * glue dir around in kernfs_new_node(). 
* * In order to avoid this happening, we also should make sure that * kernfs_node for glue_dir is released in CPU1 only when refcount * for glue_dir kobj is 1. */ ref = kref_read(&glue_dir->kref); if (!kobject_has_children(glue_dir) && !--ref) kobject_del(glue_dir); kobject_put(glue_dir); mutex_unlock(&gdp_mutex); } static int device_add_class_symlinks(struct device *dev) { struct device_node *of_node = dev_of_node(dev); struct subsys_private *sp; int error; if (of_node) { error = sysfs_create_link(&dev->kobj, of_node_kobj(of_node), "of_node"); if (error) dev_warn(dev, "Error %d creating of_node link\n",error); /* An error here doesn't warrant bringing down the device */ } sp = class_to_subsys(dev->class); if (!sp) return 0; error = sysfs_create_link(&dev->kobj, &sp->subsys.kobj, "subsystem"); if (error) goto out_devnode; if (dev->parent && device_is_not_partition(dev)) { error = sysfs_create_link(&dev->kobj, &dev->parent->kobj, "device"); if (error) goto out_subsys; } /* link in the class directory pointing to the device */ error = sysfs_create_link(&sp->subsys.kobj, &dev->kobj, dev_name(dev)); if (error) goto out_device; goto exit; out_device: sysfs_remove_link(&dev->kobj, "device"); out_subsys: sysfs_remove_link(&dev->kobj, "subsystem"); out_devnode: sysfs_remove_link(&dev->kobj, "of_node"); exit: subsys_put(sp); return error; } static void device_remove_class_symlinks(struct device *dev) { struct subsys_private *sp = class_to_subsys(dev->class); if (dev_of_node(dev)) sysfs_remove_link(&dev->kobj, "of_node"); if (!sp) return; if (dev->parent && device_is_not_partition(dev)) sysfs_remove_link(&dev->kobj, "device"); sysfs_remove_link(&dev->kobj, "subsystem"); sysfs_delete_link(&sp->subsys.kobj, &dev->kobj, dev_name(dev)); subsys_put(sp); } /** * dev_set_name - set a device name * @dev: device * @fmt: format string for the device's name */ int dev_set_name(struct device *dev, const char *fmt, ...) { va_list vargs; int err; va_start(vargs, fmt); err = kobject_set_name_vargs(&dev->kobj, fmt, vargs); va_end(vargs); return err; } EXPORT_SYMBOL_GPL(dev_set_name); /* select a /sys/dev/ directory for the device */ static struct kobject *device_to_dev_kobj(struct device *dev) { if (is_blockdev(dev)) return sysfs_dev_block_kobj; else return sysfs_dev_char_kobj; } static int device_create_sys_dev_entry(struct device *dev) { struct kobject *kobj = device_to_dev_kobj(dev); int error = 0; char devt_str[15]; if (kobj) { format_dev_t(devt_str, dev->devt); error = sysfs_create_link(kobj, &dev->kobj, devt_str); } return error; } static void device_remove_sys_dev_entry(struct device *dev) { struct kobject *kobj = device_to_dev_kobj(dev); char devt_str[15]; if (kobj) { format_dev_t(devt_str, dev->devt); sysfs_remove_link(kobj, devt_str); } } static int device_private_init(struct device *dev) { dev->p = kzalloc_obj(*dev->p); if (!dev->p) return -ENOMEM; dev->p->device = dev; klist_init(&dev->p->klist_children, klist_children_get, klist_children_put); INIT_LIST_HEAD(&dev->p->deferred_probe); return 0; } /** * device_add - add device to device hierarchy. * @dev: device. * * This is part 2 of device_register(), though may be called * separately _iff_ device_initialize() has been called separately. * * This adds @dev to the kobject hierarchy via kobject_add(), adds it * to the global and sibling lists for the device, then * adds it to the other relevant subsystems of the driver model. * * Do not call this routine or device_register() more than once for * any device structure. 
The driver model core is not designed to work * with devices that get unregistered and then spring back to life. * (Among other things, it's very hard to guarantee that all references * to the previous incarnation of @dev have been dropped.) Allocate * and register a fresh new struct device instead. * * NOTE: _Never_ directly free @dev after calling this function, even * if it returned an error! Always use put_device() to give up your * reference instead. * * Rule of thumb is: if device_add() succeeds, you should call * device_del() when you want to get rid of it. If device_add() has * *not* succeeded, use *only* put_device() to drop the reference * count. */ int device_add(struct device *dev) { struct subsys_private *sp; struct device *parent; struct kobject *kobj; struct class_interface *class_intf; int error = -EINVAL; struct kobject *glue_dir = NULL; dev = get_device(dev); if (!dev) goto done; if (!dev->p) { error = device_private_init(dev); if (error) goto done; } /* * for statically allocated devices, which should all be converted * some day, we need to initialize the name. We prevent reading back * the name, and force the use of dev_name() */ if (dev->init_name) { error = dev_set_name(dev, "%s", dev->init_name); dev->init_name = NULL; } if (dev_name(dev)) error = 0; /* subsystems can specify simple device enumeration */ else if (dev->bus && dev->bus->dev_name) error = dev_set_name(dev, "%s%u", dev->bus->dev_name, dev->id); else error = -EINVAL; if (error) goto name_error; pr_debug("device: '%s': %s\n", dev_name(dev), __func__); parent = get_device(dev->parent); kobj = get_device_parent(dev, parent); if (IS_ERR(kobj)) { error = PTR_ERR(kobj); goto parent_error; } if (kobj) dev->kobj.parent = kobj; /* use parent numa_node */ if (parent && (dev_to_node(dev) == NUMA_NO_NODE)) set_dev_node(dev, dev_to_node(parent)); /* first, register with generic layer. */ /* we require the name to be set before, and pass NULL */ error = kobject_add(&dev->kobj, dev->kobj.parent, NULL); if (error) { glue_dir = kobj; goto Error; } /* notify platform of device entry */ device_platform_notify(dev); error = device_create_file(dev, &dev_attr_uevent); if (error) goto attrError; error = device_add_class_symlinks(dev); if (error) goto SymlinkError; error = device_add_attrs(dev); if (error) goto AttrsError; error = bus_add_device(dev); if (error) goto BusError; error = dpm_sysfs_add(dev); if (error) goto DPMError; device_pm_add(dev); if (MAJOR(dev->devt)) { error = device_create_file(dev, &dev_attr_dev); if (error) goto DevAttrError; error = device_create_sys_dev_entry(dev); if (error) goto SysEntryError; devtmpfs_create_node(dev); } /* Notify clients of device addition. This call must come * after dpm_sysfs_add() and before kobject_uevent(). */ bus_notify(dev, BUS_NOTIFY_ADD_DEVICE); kobject_uevent(&dev->kobj, KOBJ_ADD); /* * Check if any of the other devices (consumers) have been waiting for * this device (supplier) to be added so that they can create a device * link to it. * * This needs to happen after device_pm_add() because device_link_add() * requires the supplier be registered before it's called. * * But this also needs to happen before bus_probe_device() to make sure * waiting consumers can link to it before the driver is bound to the * device and the driver sync_state callback is called for this device. 
*/ if (dev->fwnode && !dev->fwnode->dev) { dev->fwnode->dev = dev; fw_devlink_link_device(dev); } /* * The moment the device was linked into the bus's "klist_devices" in * bus_add_device() then it's possible that probe could have been * attempted in a different thread via userspace loading a driver * matching the device. "ready_to_probe" being unset would have * blocked those attempts. Now that all of the above initialization has * happened, unblock probe. If probe happens through another thread * after this point but before bus_probe_device() runs then it's fine. * bus_probe_device() -> device_initial_probe() -> __device_attach() * will notice (under device_lock) that the device is already bound. */ device_lock(dev); dev_set_ready_to_probe(dev); device_unlock(dev); bus_probe_device(dev); /* * If all driver registration is done and a newly added device doesn't * match with any driver, don't block its consumers from probing in * case the consumer device is able to operate without this supplier. */ if (dev->fwnode && fw_devlink_drv_reg_done && !dev->can_match) fw_devlink_unblock_consumers(dev); if (parent) klist_add_tail(&dev->p->knode_parent, &parent->p->klist_children); sp = class_to_subsys(dev->class); if (sp) { mutex_lock(&sp->mutex); /* tie the class to the device */ klist_add_tail(&dev->p->knode_class, &sp->klist_devices); /* notify any interfaces that the device is here */ list_for_each_entry(class_intf, &sp->interfaces, node) if (class_intf->add_dev) class_intf->add_dev(dev); mutex_unlock(&sp->mutex); subsys_put(sp); } done: put_device(dev); return error; SysEntryError: if (MAJOR(dev->devt)) device_remove_file(dev, &dev_attr_dev); DevAttrError: device_pm_remove(dev); dpm_sysfs_remove(dev); DPMError: device_set_driver(dev, NULL); bus_remove_device(dev); BusError: device_remove_attrs(dev); AttrsError: device_remove_class_symlinks(dev); SymlinkError: device_remove_file(dev, &dev_attr_uevent); attrError: device_platform_notify_remove(dev); kobject_uevent(&dev->kobj, KOBJ_REMOVE); glue_dir = get_glue_dir(dev); kobject_del(&dev->kobj); Error: cleanup_glue_dir(dev, glue_dir); parent_error: put_device(parent); name_error: kfree(dev->p); dev->p = NULL; goto done; } EXPORT_SYMBOL_GPL(device_add); /** * device_register - register a device with the system. * @dev: pointer to the device structure * * This happens in two clean steps - initialize the device * and add it to the system. The two steps can be called * separately, but this is the easiest and most common. * I.e. you should only call the two helpers separately if * have a clearly defined need to use and refcount the device * before it is added to the hierarchy. * * For more information, see the kerneldoc for device_initialize() * and device_add(). * * NOTE: _Never_ directly free @dev after calling this function, even * if it returned an error! Always use put_device() to give up the * reference initialized in this function instead. */ int device_register(struct device *dev) { device_initialize(dev); return device_add(dev); } EXPORT_SYMBOL_GPL(device_register); /** * get_device - increment reference count for device. * @dev: device. * * This simply forwards the call to kobject_get(), though * we do take care to provide for the case that we get a NULL * pointer passed in. */ struct device *get_device(struct device *dev) { return dev ? kobj_to_dev(kobject_get(&dev->kobj)) : NULL; } EXPORT_SYMBOL_GPL(get_device); /** * put_device - decrement reference count. * @dev: device in question. 
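 *
 * A minimal illustrative sketch (not from any real driver; "foo" is a
 * hypothetical structure embedding a struct device) of giving up the last
 * reference after a failed two-step registration -- the structure must
 * never be freed directly at this point::
 *
 *	device_initialize(&foo->dev);
 *	err = device_add(&foo->dev);
 *	if (err) {
 *		put_device(&foo->dev);
 *		return err;
 *	}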
*/ void put_device(struct device *dev) { /* might_sleep(); */ if (dev) kobject_put(&dev->kobj); } EXPORT_SYMBOL_GPL(put_device); bool kill_device(struct device *dev) { /* * Require the device lock and set the "dead" flag to guarantee that * the update behavior is consistent with the other bitfields near * it and that we cannot have an asynchronous probe routine trying * to run while we are tearing out the bus/class/sysfs from * underneath the device. */ device_lock_assert(dev); if (dev->p->dead) return false; dev->p->dead = true; return true; } EXPORT_SYMBOL_GPL(kill_device); /** * device_del - delete device from system. * @dev: device. * * This is the first part of the device unregistration * sequence. This removes the device from the lists we control * from here, has it removed from the other driver model * subsystems it was added to in device_add(), and removes it * from the kobject hierarchy. * * NOTE: this should be called manually _iff_ device_add() was * also called manually. */ void device_del(struct device *dev) { struct subsys_private *sp; struct device *parent = dev->parent; struct kobject *glue_dir = NULL; struct class_interface *class_intf; unsigned int noio_flag; device_lock(dev); kill_device(dev); device_unlock(dev); if (dev->fwnode && dev->fwnode->dev == dev) dev->fwnode->dev = NULL; /* Notify clients of device removal. This call must come * before dpm_sysfs_remove(). */ noio_flag = memalloc_noio_save(); bus_notify(dev, BUS_NOTIFY_DEL_DEVICE); dpm_sysfs_remove(dev); if (parent) klist_del(&dev->p->knode_parent); if (MAJOR(dev->devt)) { devtmpfs_delete_node(dev); device_remove_sys_dev_entry(dev); device_remove_file(dev, &dev_attr_dev); } sp = class_to_subsys(dev->class); if (sp) { device_remove_class_symlinks(dev); mutex_lock(&sp->mutex); /* notify any interfaces that the device is now gone */ list_for_each_entry(class_intf, &sp->interfaces, node) if (class_intf->remove_dev) class_intf->remove_dev(dev); /* remove the device from the class list */ klist_del(&dev->p->knode_class); mutex_unlock(&sp->mutex); subsys_put(sp); } device_remove_file(dev, &dev_attr_uevent); device_remove_attrs(dev); bus_remove_device(dev); device_pm_remove(dev); driver_deferred_probe_del(dev); device_platform_notify_remove(dev); device_links_purge(dev); /* * If a device does not have a driver attached, we need to clean * up any managed resources. We do this in device_release(), but * it's never called (and we leak the device) if a managed * resource holds a reference to the device. So release all * managed resources here, like we do in driver_detach(). We * still need to do so again in device_release() in case someone * adds a new resource after this point, though. */ devres_release_all(dev); bus_notify(dev, BUS_NOTIFY_REMOVED_DEVICE); kobject_uevent(&dev->kobj, KOBJ_REMOVE); glue_dir = get_glue_dir(dev); kobject_del(&dev->kobj); cleanup_glue_dir(dev, glue_dir); memalloc_noio_restore(noio_flag); put_device(parent); } EXPORT_SYMBOL_GPL(device_del); /** * device_unregister - unregister device from system. * @dev: device going away. * * We do this in two parts, like we do device_register(). First, * we remove it from all the subsystems with device_del(), then * we decrement the reference count via put_device(). If that * is the final reference count, the device will be cleaned up * via device_release() above. Otherwise, the structure will * stick around until the final reference to the device is dropped. 
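 *
 * A short sketch (illustrative only; "foo" is a hypothetical driver
 * structure embedding a struct device) of the release callback that runs
 * once the final reference is gone::
 *
 *	static void foo_release(struct device *dev)
 *	{
 *		struct foo *foo = container_of(dev, struct foo, dev);
 *
 *		kfree(foo);
 *	}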
*/ void device_unregister(struct device *dev) { pr_debug("device: '%s': %s\n", dev_name(dev), __func__); device_del(dev); put_device(dev); } EXPORT_SYMBOL_GPL(device_unregister); static struct device *prev_device(struct klist_iter *i) { struct klist_node *n = klist_prev(i); struct device *dev = NULL; struct device_private *p; if (n) { p = to_device_private_parent(n); dev = p->device; } return dev; } static struct device *next_device(struct klist_iter *i) { struct klist_node *n = klist_next(i); struct device *dev = NULL; struct device_private *p; if (n) { p = to_device_private_parent(n); dev = p->device; } return dev; } /** * device_get_devnode - path of device node file * @dev: device * @mode: returned file access mode * @uid: returned file owner * @gid: returned file group * @tmp: possibly allocated string * * Return the relative path of a possible device node. * Non-default names may need to allocate a memory to compose * a name. This memory is returned in tmp and needs to be * freed by the caller. */ const char *device_get_devnode(const struct device *dev, umode_t *mode, kuid_t *uid, kgid_t *gid, const char **tmp) { char *s; *tmp = NULL; /* the device type may provide a specific name */ if (dev->type && dev->type->devnode) *tmp = dev->type->devnode(dev, mode, uid, gid); if (*tmp) return *tmp; /* the class may provide a specific name */ if (dev->class && dev->class->devnode) *tmp = dev->class->devnode(dev, mode); if (*tmp) return *tmp; /* return name without allocation, tmp == NULL */ if (strchr(dev_name(dev), '!') == NULL) return dev_name(dev); /* replace '!' in the name with '/' */ s = kstrdup_and_replace(dev_name(dev), '!', '/', GFP_KERNEL); if (!s) return NULL; return *tmp = s; } /** * device_for_each_child - device child iterator. * @parent: parent struct device. * @data: data for the callback. * @fn: function to be called for each device. * * Iterate over @parent's child devices, and call @fn for each, * passing it @data. * * We check the return of @fn each time. If it returns anything * other than 0, we break out and return that value. */ int device_for_each_child(struct device *parent, void *data, device_iter_t fn) { struct klist_iter i; struct device *child; int error = 0; if (!parent || !parent->p) return 0; klist_iter_init(&parent->p->klist_children, &i); while (!error && (child = next_device(&i))) error = fn(child, data); klist_iter_exit(&i); return error; } EXPORT_SYMBOL_GPL(device_for_each_child); /** * device_for_each_child_reverse - device child iterator in reversed order. * @parent: parent struct device. * @data: data for the callback. * @fn: function to be called for each device. * * Iterate over @parent's child devices, and call @fn for each, * passing it @data. * * We check the return of @fn each time. If it returns anything * other than 0, we break out and return that value. */ int device_for_each_child_reverse(struct device *parent, void *data, device_iter_t fn) { struct klist_iter i; struct device *child; int error = 0; if (!parent || !parent->p) return 0; klist_iter_init(&parent->p->klist_children, &i); while ((child = prev_device(&i)) && !error) error = fn(child, data); klist_iter_exit(&i); return error; } EXPORT_SYMBOL_GPL(device_for_each_child_reverse); /** * device_for_each_child_reverse_from - device child iterator in reversed order. * @parent: parent struct device. * @from: optional starting point in child list * @data: data for the callback. * @fn: function to be called for each device. 
* * Iterate over @parent's child devices, starting at @from, and call @fn * for each, passing it @data. This helper is identical to * device_for_each_child_reverse() when @from is NULL. * * @fn is checked each iteration. If it returns anything other than 0, * iteration stop and that value is returned to the caller of * device_for_each_child_reverse_from(); */ int device_for_each_child_reverse_from(struct device *parent, struct device *from, void *data, device_iter_t fn) { struct klist_iter i; struct device *child; int error = 0; if (!parent || !parent->p) return 0; klist_iter_init_node(&parent->p->klist_children, &i, (from ? &from->p->knode_parent : NULL)); while ((child = prev_device(&i)) && !error) error = fn(child, data); klist_iter_exit(&i); return error; } EXPORT_SYMBOL_GPL(device_for_each_child_reverse_from); /** * device_find_child - device iterator for locating a particular device. * @parent: parent struct device * @data: Data to pass to match function * @match: Callback function to check device * * This is similar to the device_for_each_child() function above, but it * returns a reference to a device that is 'found' for later use, as * determined by the @match callback. * * The callback should return 0 if the device doesn't match and non-zero * if it does. If the callback returns non-zero and a reference to the * current device can be obtained, this function will return to the caller * and not iterate over any more devices. * * NOTE: you will need to drop the reference with put_device() after use. */ struct device *device_find_child(struct device *parent, const void *data, device_match_t match) { struct klist_iter i; struct device *child; if (!parent || !parent->p) return NULL; klist_iter_init(&parent->p->klist_children, &i); while ((child = next_device(&i))) { if (match(child, data)) { get_device(child); break; } } klist_iter_exit(&i); return child; } EXPORT_SYMBOL_GPL(device_find_child); int __init devices_init(void) { devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL); if (!devices_kset) return -ENOMEM; dev_kobj = kobject_create_and_add("dev", NULL); if (!dev_kobj) goto dev_kobj_err; sysfs_dev_block_kobj = kobject_create_and_add("block", dev_kobj); if (!sysfs_dev_block_kobj) goto block_kobj_err; sysfs_dev_char_kobj = kobject_create_and_add("char", dev_kobj); if (!sysfs_dev_char_kobj) goto char_kobj_err; device_link_wq = alloc_workqueue("device_link_wq", WQ_PERCPU, 0); if (!device_link_wq) goto wq_err; return 0; wq_err: kobject_put(sysfs_dev_char_kobj); char_kobj_err: kobject_put(sysfs_dev_block_kobj); block_kobj_err: kobject_put(dev_kobj); dev_kobj_err: kset_unregister(devices_kset); return -ENOMEM; } static int device_check_offline(struct device *dev, void *not_used) { int ret; ret = device_for_each_child(dev, NULL, device_check_offline); if (ret) return ret; return device_supports_offline(dev) && !dev->offline ? -EBUSY : 0; } /** * device_offline - Prepare the device for hot-removal. * @dev: Device to be put offline. * * Execute the device bus type's .offline() callback, if present, to prepare * the device for a subsequent hot-removal. If that succeeds, the device must * not be used until either it is removed or its bus type's .online() callback * is executed. * * Call under device_hotplug_lock. 
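 *
 * An illustrative sketch of a typical caller (assumes the caller already
 * holds a reference on @dev)::
 *
 *	lock_device_hotplug();
 *	ret = device_offline(dev);
 *	unlock_device_hotplug();
 *
 * A positive return value (1) means the device was already offline; a
 * negative value reports an error (for instance -EBUSY when a child device
 * cannot be taken offline).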
*/ int device_offline(struct device *dev) { int ret; if (dev->offline_disabled) return -EPERM; ret = device_for_each_child(dev, NULL, device_check_offline); if (ret) return ret; device_lock(dev); if (device_supports_offline(dev)) { if (dev->offline) { ret = 1; } else { ret = dev->bus->offline(dev); if (!ret) { kobject_uevent(&dev->kobj, KOBJ_OFFLINE); dev->offline = true; } } } device_unlock(dev); return ret; } /** * device_online - Put the device back online after successful device_offline(). * @dev: Device to be put back online. * * If device_offline() has been successfully executed for @dev, but the device * has not been removed subsequently, execute its bus type's .online() callback * to indicate that the device can be used again. * * Call under device_hotplug_lock. */ int device_online(struct device *dev) { int ret = 0; device_lock(dev); if (device_supports_offline(dev)) { if (dev->offline) { ret = dev->bus->online(dev); if (!ret) { kobject_uevent(&dev->kobj, KOBJ_ONLINE); dev->offline = false; } } else { ret = 1; } } device_unlock(dev); return ret; } struct root_device { struct device dev; struct module *owner; }; static inline struct root_device *to_root_device(struct device *d) { return container_of(d, struct root_device, dev); } static void root_device_release(struct device *dev) { kfree(to_root_device(dev)); } /** * __root_device_register - allocate and register a root device * @name: root device name * @owner: owner module of the root device, usually THIS_MODULE * * This function allocates a root device and registers it * using device_register(). In order to free the returned * device, use root_device_unregister(). * * Root devices are dummy devices which allow other devices * to be grouped under /sys/devices. Use this function to * allocate a root device and then use it as the parent of * any device which should appear under /sys/devices/{name} * * The /sys/devices/{name} directory will also contain a * 'module' symlink which points to the @owner directory * in sysfs. * * Returns &struct device pointer on success, or ERR_PTR() on error. * * Note: You probably want to use root_device_register(). */ struct device *__root_device_register(const char *name, struct module *owner) { struct root_device *root; int err = -ENOMEM; root = kzalloc_obj(struct root_device); if (!root) return ERR_PTR(err); err = dev_set_name(&root->dev, "%s", name); if (err) { kfree(root); return ERR_PTR(err); } root->dev.release = root_device_release; err = device_register(&root->dev); if (err) { put_device(&root->dev); return ERR_PTR(err); } #ifdef CONFIG_MODULES /* gotta find a "cleaner" way to do this */ if (owner) { struct module_kobject *mk = &owner->mkobj; err = sysfs_create_link(&root->dev.kobj, &mk->kobj, "module"); if (err) { device_unregister(&root->dev); return ERR_PTR(err); } root->owner = owner; } #endif return &root->dev; } EXPORT_SYMBOL_GPL(__root_device_register); /** * root_device_unregister - unregister and free a root device * @dev: device going away * * This function unregisters and cleans up a device that was created by * root_device_register(). 
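 *
 * An illustrative pairing with root_device_register() (hypothetical "foo"
 * subsystem name)::
 *
 *	struct device *root = root_device_register("foo");
 *
 *	if (IS_ERR(root))
 *		return PTR_ERR(root);
 *	...
 *	root_device_unregister(root);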
*/ void root_device_unregister(struct device *dev) { struct root_device *root = to_root_device(dev); if (root->owner) sysfs_remove_link(&root->dev.kobj, "module"); device_unregister(dev); } EXPORT_SYMBOL_GPL(root_device_unregister); static void device_create_release(struct device *dev) { pr_debug("device: '%s': %s\n", dev_name(dev), __func__); kfree(dev); } static __printf(6, 0) struct device * device_create_groups_vargs(const struct class *class, struct device *parent, dev_t devt, void *drvdata, const struct attribute_group **groups, const char *fmt, va_list args) { struct device *dev = NULL; int retval = -ENODEV; if (IS_ERR_OR_NULL(class)) goto error; dev = kzalloc_obj(*dev); if (!dev) { retval = -ENOMEM; goto error; } device_initialize(dev); dev->devt = devt; dev->class = class; dev->parent = parent; dev->groups = groups; dev->release = device_create_release; dev_set_drvdata(dev, drvdata); retval = kobject_set_name_vargs(&dev->kobj, fmt, args); if (retval) goto error; retval = device_add(dev); if (retval) goto error; return dev; error: put_device(dev); return ERR_PTR(retval); } /** * device_create - creates a device and registers it with sysfs * @class: pointer to the struct class that this device should be registered to * @parent: pointer to the parent struct device of this new device, if any * @devt: the dev_t for the char device to be added * @drvdata: the data to be added to the device for callbacks * @fmt: string for the device's name * * This function can be used by char device classes. A struct device * will be created in sysfs, registered to the specified class. * * A "dev" file will be created, showing the dev_t for the device, if * the dev_t is not 0,0. * If a pointer to a parent struct device is passed in, the newly created * struct device will be a child of that device in sysfs. * The pointer to the struct device will be returned from the call. * Any further sysfs files that might be required can be created using this * pointer. * * Returns &struct device pointer on success, or ERR_PTR() on error. */ struct device *device_create(const struct class *class, struct device *parent, dev_t devt, void *drvdata, const char *fmt, ...) { va_list vargs; struct device *dev; va_start(vargs, fmt); dev = device_create_groups_vargs(class, parent, devt, drvdata, NULL, fmt, vargs); va_end(vargs); return dev; } EXPORT_SYMBOL_GPL(device_create); /** * device_create_with_groups - creates a device and registers it with sysfs * @class: pointer to the struct class that this device should be registered to * @parent: pointer to the parent struct device of this new device, if any * @devt: the dev_t for the char device to be added * @drvdata: the data to be added to the device for callbacks * @groups: NULL-terminated list of attribute groups to be created * @fmt: string for the device's name * * This function can be used by char device classes. A struct device * will be created in sysfs, registered to the specified class. * Additional attributes specified in the groups parameter will also * be created automatically. * * A "dev" file will be created, showing the dev_t for the device, if * the dev_t is not 0,0. * If a pointer to a parent struct device is passed in, the newly created * struct device will be a child of that device in sysfs. * The pointer to the struct device will be returned from the call. * Any further sysfs files that might be required can be created using this * pointer. * * Returns &struct device pointer on success, or ERR_PTR() on error. 
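 *
 * A minimal sketch (illustrative only; foo_class, foo_groups, devt and foo
 * are assumed to already exist in the caller)::
 *
 *	struct device *dev;
 *
 *	dev = device_create_with_groups(foo_class, NULL, devt, foo,
 *					foo_groups, "foo%d", foo->id);
 *	if (IS_ERR(dev))
 *		return PTR_ERR(dev);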
*/ struct device *device_create_with_groups(const struct class *class, struct device *parent, dev_t devt, void *drvdata, const struct attribute_group **groups, const char *fmt, ...) { va_list vargs; struct device *dev; va_start(vargs, fmt); dev = device_create_groups_vargs(class, parent, devt, drvdata, groups, fmt, vargs); va_end(vargs); return dev; } EXPORT_SYMBOL_GPL(device_create_with_groups); /** * device_destroy - removes a device that was created with device_create() * @class: pointer to the struct class that this device was registered with * @devt: the dev_t of the device that was previously registered * * This call unregisters and cleans up a device that was created with a * call to device_create(). */ void device_destroy(const struct class *class, dev_t devt) { struct device *dev; dev = class_find_device_by_devt(class, devt); if (dev) { put_device(dev); device_unregister(dev); } } EXPORT_SYMBOL_GPL(device_destroy); /** * device_rename - renames a device * @dev: the pointer to the struct device to be renamed * @new_name: the new name of the device * * It is the responsibility of the caller to provide mutual * exclusion between two different calls of device_rename * on the same device to ensure that new_name is valid and * won't conflict with other devices. * * Note: given that some subsystems (networking and infiniband) use this * function, with no immediate plans for this to change, we cannot assume or * require that this function not be called at all. * * However, if you're writing new code, do not call this function. The following * text from Kay Sievers offers some insight: * * Renaming devices is racy at many levels, symlinks and other stuff are not * replaced atomically, and you get a "move" uevent, but it's not easy to * connect the event to the old and new device. Device nodes are not renamed at * all, there isn't even support for that in the kernel now. * * In the meantime, during renaming, your target name might be taken by another * driver, creating conflicts. Or the old name is taken directly after you * renamed it -- then you get events for the same DEVPATH, before you even see * the "move" event. It's just a mess, and nothing new should ever rely on * kernel device renaming. Besides that, it's not even implemented now for * other things than (driver-core wise very simple) network devices. * * Make up a "real" name in the driver before you register anything, or add * some other attributes for userspace to find the device, or use udev to add * symlinks -- but never rename kernel devices later, it's a complete mess. We * don't even want to get into that and try to implement the missing pieces in * the core. We really have other pieces to fix in the driver core mess. 
:) */ int device_rename(struct device *dev, const char *new_name) { struct subsys_private *sp = NULL; struct kobject *kobj = &dev->kobj; char *old_device_name = NULL; int error; bool is_link_renamed = false; dev = get_device(dev); if (!dev) return -EINVAL; dev_dbg(dev, "renaming to %s\n", new_name); old_device_name = kstrdup(dev_name(dev), GFP_KERNEL); if (!old_device_name) { error = -ENOMEM; goto out; } if (dev->class) { sp = class_to_subsys(dev->class); if (!sp) { error = -EINVAL; goto out; } error = sysfs_rename_link_ns(&sp->subsys.kobj, kobj, old_device_name, new_name, kobject_namespace(kobj)); if (error) goto out; is_link_renamed = true; } error = kobject_rename(kobj, new_name); out: if (error && is_link_renamed) sysfs_rename_link_ns(&sp->subsys.kobj, kobj, new_name, old_device_name, kobject_namespace(kobj)); subsys_put(sp); put_device(dev); kfree(old_device_name); return error; } EXPORT_SYMBOL_GPL(device_rename); static int device_move_class_links(struct device *dev, struct device *old_parent, struct device *new_parent) { int error = 0; if (old_parent) sysfs_remove_link(&dev->kobj, "device"); if (new_parent) error = sysfs_create_link(&dev->kobj, &new_parent->kobj, "device"); return error; } /** * device_move - moves a device to a new parent * @dev: the pointer to the struct device to be moved * @new_parent: the new parent of the device (can be NULL) * @dpm_order: how to reorder the dpm_list */ int device_move(struct device *dev, struct device *new_parent, enum dpm_order dpm_order) { int error; struct device *old_parent; struct kobject *new_parent_kobj; dev = get_device(dev); if (!dev) return -EINVAL; device_pm_lock(); new_parent = get_device(new_parent); new_parent_kobj = get_device_parent(dev, new_parent); if (IS_ERR(new_parent_kobj)) { error = PTR_ERR(new_parent_kobj); put_device(new_parent); goto out; } pr_debug("device: '%s': %s: moving to '%s'\n", dev_name(dev), __func__, new_parent ? dev_name(new_parent) : "<NULL>"); error = kobject_move(&dev->kobj, new_parent_kobj); if (error) { cleanup_glue_dir(dev, new_parent_kobj); put_device(new_parent); goto out; } old_parent = dev->parent; dev->parent = new_parent; if (old_parent) klist_remove(&dev->p->knode_parent); if (new_parent) { klist_add_tail(&dev->p->knode_parent, &new_parent->p->klist_children); set_dev_node(dev, dev_to_node(new_parent)); } if (dev->class) { error = device_move_class_links(dev, old_parent, new_parent); if (error) { /* We ignore errors on cleanup since we're hosed anyway... 
*/ device_move_class_links(dev, new_parent, old_parent); if (!kobject_move(&dev->kobj, &old_parent->kobj)) { if (new_parent) klist_remove(&dev->p->knode_parent); dev->parent = old_parent; if (old_parent) { klist_add_tail(&dev->p->knode_parent, &old_parent->p->klist_children); set_dev_node(dev, dev_to_node(old_parent)); } } cleanup_glue_dir(dev, new_parent_kobj); put_device(new_parent); goto out; } } switch (dpm_order) { case DPM_ORDER_NONE: break; case DPM_ORDER_DEV_AFTER_PARENT: device_pm_move_after(dev, new_parent); devices_kset_move_after(dev, new_parent); break; case DPM_ORDER_PARENT_BEFORE_DEV: device_pm_move_before(new_parent, dev); devices_kset_move_before(new_parent, dev); break; case DPM_ORDER_DEV_LAST: device_pm_move_last(dev); devices_kset_move_last(dev); break; } put_device(old_parent); out: device_pm_unlock(); put_device(dev); return error; } EXPORT_SYMBOL_GPL(device_move); static int device_attrs_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid) { struct kobject *kobj = &dev->kobj; const struct class *class = dev->class; const struct device_type *type = dev->type; int error; if (class) { /* * Change the device groups of the device class for @dev to * @kuid/@kgid. */ error = sysfs_groups_change_owner(kobj, class->dev_groups, kuid, kgid); if (error) return error; } if (type) { /* * Change the device groups of the device type for @dev to * @kuid/@kgid. */ error = sysfs_groups_change_owner(kobj, type->groups, kuid, kgid); if (error) return error; } /* Change the device groups of @dev to @kuid/@kgid. */ error = sysfs_groups_change_owner(kobj, dev->groups, kuid, kgid); if (error) return error; if (device_supports_offline(dev) && !dev->offline_disabled) { /* Change online device attributes of @dev to @kuid/@kgid. */ error = sysfs_file_change_owner(kobj, dev_attr_online.attr.name, kuid, kgid); if (error) return error; } return 0; } /** * device_change_owner - change the owner of an existing device. * @dev: device. * @kuid: new owner's kuid * @kgid: new owner's kgid * * This changes the owner of @dev and its corresponding sysfs entries to * @kuid/@kgid. This function closely mirrors how @dev was added via driver * core. * * Returns 0 on success or error code on failure. */ int device_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid) { int error; struct kobject *kobj = &dev->kobj; struct subsys_private *sp; dev = get_device(dev); if (!dev) return -EINVAL; /* * Change the kobject and the default attributes and groups of the * ktype associated with it to @kuid/@kgid. */ error = sysfs_change_owner(kobj, kuid, kgid); if (error) goto out; /* * Change the uevent file for @dev to the new owner. The uevent file * was created in a separate step when @dev got added and we mirror * that step here. */ error = sysfs_file_change_owner(kobj, dev_attr_uevent.attr.name, kuid, kgid); if (error) goto out; /* * Change the device groups, the device groups associated with the * device class, and the groups associated with the device type of @dev * to @kuid/@kgid. */ error = device_attrs_change_owner(dev, kuid, kgid); if (error) goto out; error = dpm_sysfs_change_owner(dev, kuid, kgid); if (error) goto out; /* * Change the owner of the symlink located in the class directory of * the device class associated with @dev which points to the actual * directory entry for @dev to @kuid/@kgid. This ensures that the * symlink shows the same permissions as its target. 
*/ sp = class_to_subsys(dev->class); if (!sp) { error = -EINVAL; goto out; } error = sysfs_link_change_owner(&sp->subsys.kobj, &dev->kobj, dev_name(dev), kuid, kgid); subsys_put(sp); out: put_device(dev); return error; } /** * device_shutdown - call ->shutdown() on each device to shutdown. */ void device_shutdown(void) { struct device *dev, *parent; wait_for_device_probe(); device_block_probing(); cpufreq_suspend(); spin_lock(&devices_kset->list_lock); /* * Walk the devices list backward, shutting down each in turn. * Beware that device unplug events may also start pulling * devices offline, even as the system is shutting down. */ while (!list_empty(&devices_kset->list)) { dev = list_entry(devices_kset->list.prev, struct device, kobj.entry); /* * hold reference count of device's parent to * prevent it from being freed because parent's * lock is to be held */ parent = get_device(dev->parent); get_device(dev); /* * Make sure the device is off the kset list, in the * event that dev->*->shutdown() doesn't remove it. */ list_del_init(&dev->kobj.entry); spin_unlock(&devices_kset->list_lock); /* hold lock to avoid race with probe/release */ if (parent) device_lock(parent); device_lock(dev); /* Don't allow any more runtime suspends */ pm_runtime_get_noresume(dev); pm_runtime_barrier(dev); if (dev->class && dev->class->shutdown_pre) { if (initcall_debug) dev_info(dev, "shutdown_pre\n"); dev->class->shutdown_pre(dev); } if (dev->bus && dev->bus->shutdown) { if (initcall_debug) dev_info(dev, "shutdown\n"); dev->bus->shutdown(dev); } else if (dev->driver && dev->driver->shutdown) { if (initcall_debug) dev_info(dev, "shutdown\n"); dev->driver->shutdown(dev); } device_unlock(dev); if (parent) device_unlock(parent); put_device(dev); put_device(parent); spin_lock(&devices_kset->list_lock); } spin_unlock(&devices_kset->list_lock); } /* * Device logging functions */ #ifdef CONFIG_PRINTK static void set_dev_info(const struct device *dev, struct dev_printk_info *dev_info) { const char *subsys; memset(dev_info, 0, sizeof(*dev_info)); if (dev->class) subsys = dev->class->name; else if (dev->bus) subsys = dev->bus->name; else return; strscpy(dev_info->subsystem, subsys); /* * Add device identifier DEVICE=: * b12:8 block dev_t * c127:3 char dev_t * n8 netdev ifindex * +sound:card0 subsystem:devname */ if (MAJOR(dev->devt)) { char c; if (strcmp(subsys, "block") == 0) c = 'b'; else c = 'c'; snprintf(dev_info->device, sizeof(dev_info->device), "%c%u:%u", c, MAJOR(dev->devt), MINOR(dev->devt)); } else if (strcmp(subsys, "net") == 0) { struct net_device *net = to_net_dev(dev); snprintf(dev_info->device, sizeof(dev_info->device), "n%u", net->ifindex); } else { snprintf(dev_info->device, sizeof(dev_info->device), "+%s:%s", subsys, dev_name(dev)); } } int dev_vprintk_emit(int level, const struct device *dev, const char *fmt, va_list args) { struct dev_printk_info dev_info; set_dev_info(dev, &dev_info); return vprintk_emit(0, level, &dev_info, fmt, args); } EXPORT_SYMBOL(dev_vprintk_emit); int dev_printk_emit(int level, const struct device *dev, const char *fmt, ...) 
{ va_list args; int r; va_start(args, fmt); r = dev_vprintk_emit(level, dev, fmt, args); va_end(args); return r; } EXPORT_SYMBOL(dev_printk_emit); static void __dev_printk(const char *level, const struct device *dev, struct va_format *vaf) { if (dev) dev_printk_emit(level[1] - '0', dev, "%s %s: %pV", dev_driver_string(dev), dev_name(dev), vaf); else printk("%s(NULL device *): %pV", level, vaf); } void _dev_printk(const char *level, const struct device *dev, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; __dev_printk(level, dev, &vaf); va_end(args); } EXPORT_SYMBOL(_dev_printk); #define define_dev_printk_level(func, kern_level) \ void func(const struct device *dev, const char *fmt, ...) \ { \ struct va_format vaf; \ va_list args; \ \ va_start(args, fmt); \ \ vaf.fmt = fmt; \ vaf.va = &args; \ \ __dev_printk(kern_level, dev, &vaf); \ \ va_end(args); \ } \ EXPORT_SYMBOL(func); define_dev_printk_level(_dev_emerg, KERN_EMERG); define_dev_printk_level(_dev_alert, KERN_ALERT); define_dev_printk_level(_dev_crit, KERN_CRIT); define_dev_printk_level(_dev_err, KERN_ERR); define_dev_printk_level(_dev_warn, KERN_WARNING); define_dev_printk_level(_dev_notice, KERN_NOTICE); define_dev_printk_level(_dev_info, KERN_INFO); #endif static void __dev_probe_failed(const struct device *dev, int err, bool fatal, const char *fmt, va_list vargsp) { struct va_format vaf; va_list vargs; /* * On x86_64 and possibly on other architectures, va_list is actually a * size-1 array containing a structure. As a result, function parameter * vargsp decays from T[1] to T*, and &vargsp has type T** rather than * T(*)[1], which is expected by its assignment to vaf.va below. * * One standard way to solve this mess is by creating a copy in a local * variable of type va_list and then using a pointer to that local copy * instead, which is the approach employed here. */ va_copy(vargs, vargsp); vaf.fmt = fmt; vaf.va = &vargs; switch (err) { case -EPROBE_DEFER: device_set_deferred_probe_reason(dev, &vaf); dev_dbg(dev, "error %pe: %pV", ERR_PTR(err), &vaf); break; case -ENOMEM: /* Don't print anything on -ENOMEM, there's already enough output */ break; default: /* Log fatal final failures as errors, otherwise produce warnings */ if (fatal) dev_err(dev, "error %pe: %pV", ERR_PTR(err), &vaf); else dev_warn(dev, "error %pe: %pV", ERR_PTR(err), &vaf); break; } va_end(vargs); } /** * dev_err_probe - probe error check and log helper * @dev: the pointer to the struct device * @err: error value to test * @fmt: printf-style format string * @...: arguments as specified in the format string * * This helper implements common pattern present in probe functions for error * checking: print debug or error message depending if the error value is * -EPROBE_DEFER and propagate error upwards. * In case of -EPROBE_DEFER it sets also defer probe reason, which can be * checked later by reading devices_deferred debugfs attribute. * It replaces the following code sequence:: * * if (err != -EPROBE_DEFER) * dev_err(dev, ...); * else * dev_dbg(dev, ...); * return err; * * with:: * * return dev_err_probe(dev, err, ...); * * Using this helper in your probe function is totally fine even if @err * is known to never be -EPROBE_DEFER. * The benefit compared to a normal dev_err() is the standardized format * of the error code, which is emitted symbolically (i.e. you get "EAGAIN" * instead of "-35"), and having the error code returned allows more * compact error paths. * * Returns @err. 
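 *
 * Example (an illustrative probe-path sketch assuming a clock consumer,
 * not tied to any particular driver)::
 *
 *	clk = devm_clk_get(dev, NULL);
 *	if (IS_ERR(clk))
 *		return dev_err_probe(dev, PTR_ERR(clk),
 *				     "failed to get the clock\n");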
*/ int dev_err_probe(const struct device *dev, int err, const char *fmt, ...) { va_list vargs; va_start(vargs, fmt); /* Use dev_err() for logging when err doesn't equal -EPROBE_DEFER */ __dev_probe_failed(dev, err, true, fmt, vargs); va_end(vargs); return err; } EXPORT_SYMBOL_GPL(dev_err_probe); /** * dev_warn_probe - probe error check and log helper * @dev: the pointer to the struct device * @err: error value to test * @fmt: printf-style format string * @...: arguments as specified in the format string * * This helper implements common pattern present in probe functions for error * checking: print debug or warning message depending if the error value is * -EPROBE_DEFER and propagate error upwards. * In case of -EPROBE_DEFER it sets also defer probe reason, which can be * checked later by reading devices_deferred debugfs attribute. * It replaces the following code sequence:: * * if (err != -EPROBE_DEFER) * dev_warn(dev, ...); * else * dev_dbg(dev, ...); * return err; * * with:: * * return dev_warn_probe(dev, err, ...); * * Using this helper in your probe function is totally fine even if @err * is known to never be -EPROBE_DEFER. * The benefit compared to a normal dev_warn() is the standardized format * of the error code, which is emitted symbolically (i.e. you get "EAGAIN" * instead of "-35"), and having the error code returned allows more * compact error paths. * * Returns @err. */ int dev_warn_probe(const struct device *dev, int err, const char *fmt, ...) { va_list vargs; va_start(vargs, fmt); /* Use dev_warn() for logging when err doesn't equal -EPROBE_DEFER */ __dev_probe_failed(dev, err, false, fmt, vargs); va_end(vargs); return err; } EXPORT_SYMBOL_GPL(dev_warn_probe); static inline bool fwnode_is_primary(struct fwnode_handle *fwnode) { return fwnode && !IS_ERR(fwnode->secondary); } /** * set_primary_fwnode - Change the primary firmware node of a given device. * @dev: Device to handle. * @fwnode: New primary firmware node of the device. * * Set the device's firmware node pointer to @fwnode, but if a secondary * firmware node of the device is present, preserve it. * * Valid fwnode cases are: * - primary --> secondary --> -ENODEV * - primary --> NULL * - secondary --> -ENODEV * - NULL */ void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode) { struct device *parent = dev->parent; struct fwnode_handle *fn = dev->fwnode; if (fwnode) { if (fwnode_is_primary(fn)) fn = fn->secondary; if (fn) { WARN_ON(fwnode->secondary); fwnode->secondary = fn; } dev->fwnode = fwnode; } else { if (fwnode_is_primary(fn)) { dev->fwnode = fn->secondary; /* Skip nullifying fn->secondary if the primary is shared */ if (parent && fn == parent->fwnode) return; /* Set fn->secondary = NULL, so fn remains the primary fwnode */ fn->secondary = NULL; } else { dev->fwnode = NULL; } } } EXPORT_SYMBOL_GPL(set_primary_fwnode); /** * set_secondary_fwnode - Change the secondary firmware node of a given device. * @dev: Device to handle. * @fwnode: New secondary firmware node of the device. * * If a primary firmware node of the device is present, set its secondary * pointer to @fwnode. Otherwise, set the device's firmware node pointer to * @fwnode. 
*/ void set_secondary_fwnode(struct device *dev, struct fwnode_handle *fwnode) { if (fwnode) fwnode->secondary = ERR_PTR(-ENODEV); if (fwnode_is_primary(dev->fwnode)) dev->fwnode->secondary = fwnode; else dev->fwnode = fwnode; } EXPORT_SYMBOL_GPL(set_secondary_fwnode); /** * device_remove_of_node - Remove an of_node from a device * @dev: device whose device tree node is being removed */ void device_remove_of_node(struct device *dev) { dev = get_device(dev); if (!dev) return; if (!dev->of_node) goto end; if (dev->fwnode == of_fwnode_handle(dev->of_node)) dev->fwnode = NULL; of_node_put(dev->of_node); dev->of_node = NULL; end: put_device(dev); } EXPORT_SYMBOL_GPL(device_remove_of_node); /** * device_add_of_node - Add an of_node to an existing device * @dev: device whose device tree node is being added * @of_node: of_node to add * * Return: 0 on success or error code on failure. */ int device_add_of_node(struct device *dev, struct device_node *of_node) { int ret; if (!of_node) return -EINVAL; dev = get_device(dev); if (!dev) return -EINVAL; if (dev->of_node) { dev_err(dev, "Cannot replace node %pOF with %pOF\n", dev->of_node, of_node); ret = -EBUSY; goto end; } dev->of_node = of_node_get(of_node); if (!dev->fwnode) dev->fwnode = of_fwnode_handle(of_node); ret = 0; end: put_device(dev); return ret; } EXPORT_SYMBOL_GPL(device_add_of_node); /** * device_set_of_node_from_dev - reuse device-tree node of another device * @dev: device whose device-tree node is being set * @dev2: device whose device-tree node is being reused * * Takes another reference to the new device-tree node after first dropping * any reference held to the old node. */ void device_set_of_node_from_dev(struct device *dev, const struct device *dev2) { of_node_put(dev->of_node); dev->of_node = of_node_get(dev2->of_node); dev->of_node_reused = true; } EXPORT_SYMBOL_GPL(device_set_of_node_from_dev); void device_set_node(struct device *dev, struct fwnode_handle *fwnode) { dev->fwnode = fwnode; dev->of_node = to_of_node(fwnode); } EXPORT_SYMBOL_GPL(device_set_node); /** * get_dev_from_fwnode - Obtain a reference count of the struct device the * struct fwnode_handle is associated with. * @fwnode: The pointer to the struct fwnode_handle to obtain the struct device * reference count of. * * This function obtains a reference count of the device the device pointer * embedded in the struct fwnode_handle points to. * * Note that the struct device pointer embedded in struct fwnode_handle does * *not* have a reference count of the struct device itself. * * Hence, it is a UAF (and thus a bug) to call this function if the caller can't * guarantee that the last reference count of the corresponding struct device is * not dropped concurrently. * * This is possible since struct fwnode_handle has its own reference count and * hence can out-live the struct device it is associated with. 
*/ struct device *get_dev_from_fwnode(struct fwnode_handle *fwnode) { return get_device((fwnode)->dev); } EXPORT_SYMBOL_GPL(get_dev_from_fwnode); int device_match_name(struct device *dev, const void *name) { return sysfs_streq(dev_name(dev), name); } EXPORT_SYMBOL_GPL(device_match_name); int device_match_type(struct device *dev, const void *type) { return dev->type == type; } EXPORT_SYMBOL_GPL(device_match_type); int device_match_of_node(struct device *dev, const void *np) { return np && dev->of_node == np; } EXPORT_SYMBOL_GPL(device_match_of_node); int device_match_fwnode(struct device *dev, const void *fwnode) { return fwnode && dev_fwnode(dev) == fwnode; } EXPORT_SYMBOL_GPL(device_match_fwnode); int device_match_devt(struct device *dev, const void *pdevt) { return dev->devt == *(dev_t *)pdevt; } EXPORT_SYMBOL_GPL(device_match_devt); int device_match_acpi_dev(struct device *dev, const void *adev) { return adev && ACPI_COMPANION(dev) == adev; } EXPORT_SYMBOL(device_match_acpi_dev); int device_match_acpi_handle(struct device *dev, const void *handle) { return handle && ACPI_HANDLE(dev) == handle; } EXPORT_SYMBOL(device_match_acpi_handle); int device_match_any(struct device *dev, const void *unused) { return 1; } EXPORT_SYMBOL_GPL(device_match_any);
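/*
 * Illustrative note (not part of the file above): the device_match_*()
 * helpers are intended to be passed as the @match argument of lookup
 * helpers such as bus_find_device() or class_find_device(). A sketch,
 * assuming a device-tree node np is at hand:
 *
 *	dev = bus_find_device(&platform_bus_type, NULL, np,
 *			      device_match_of_node);
 *	if (dev) {
 *		...
 *		put_device(dev);
 *	}
 */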
// SPDX-License-Identifier: GPL-2.0-only /* * IPv6 packet mangling table, a port of the IPv4 mangle table to IPv6 * * Copyright (C) 2000-2001 by Harald Welte <laforge@gnumonks.org> * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/module.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/slab.h> #include <net/ipv6.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("ip6tables mangle table"); #define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \ (1 << NF_INET_LOCAL_IN) | \ (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT) | \ (1 << NF_INET_POST_ROUTING)) static const struct xt_table packet_mangler = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_MANGLE, }; static unsigned int ip6t_mangle_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct in6_addr saddr, daddr; unsigned int ret, verdict; u32 flowlabel, mark; u8 hop_limit; int err; /* save source/dest address, mark, hoplimit, flowlabel, priority */ memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr)); memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr)); mark = skb->mark; hop_limit = ipv6_hdr(skb)->hop_limit; /* flowlabel and prio (includes version, which shouldn't change either) */ flowlabel = *((u_int32_t *)ipv6_hdr(skb)); ret = ip6t_do_table(priv, skb, state); verdict = ret & NF_VERDICT_MASK; if (verdict != NF_DROP && verdict != NF_STOLEN && (!ipv6_addr_equal(&ipv6_hdr(skb)->saddr, &saddr) || !ipv6_addr_equal(&ipv6_hdr(skb)->daddr, &daddr) || skb->mark != mark || ipv6_hdr(skb)->hop_limit != hop_limit || flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) { err = ip6_route_me_harder(state->net, state->sk, skb); if (err < 0) ret = NF_DROP_ERR(err); } return ret; } /* The work comes in here from netfilter.c.
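 * Only the NF_INET_LOCAL_OUT hook is routed through ip6t_mangle_out() above,
 * since locally generated packets have already been routed by the time this
 * hook runs and may need re-routing if the rules changed the addresses,
 * mark, hop limit or flowlabel; the remaining hooks go straight to
 * ip6t_do_table().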
*/ static unsigned int ip6table_mangle_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { if (state->hook == NF_INET_LOCAL_OUT) return ip6t_mangle_out(priv, skb, state); return ip6t_do_table(priv, skb, state); } static struct nf_hook_ops *mangle_ops __read_mostly; static int ip6table_mangle_table_init(struct net *net) { struct ip6t_replace *repl; int ret; repl = ip6t_alloc_initial_table(&packet_mangler); if (repl == NULL) return -ENOMEM; ret = ip6t_register_table(net, &packet_mangler, repl, mangle_ops); kfree(repl); return ret; } static void __net_exit ip6table_mangle_net_pre_exit(struct net *net) { ip6t_unregister_table_pre_exit(net, "mangle"); } static void __net_exit ip6table_mangle_net_exit(struct net *net) { ip6t_unregister_table_exit(net, "mangle"); } static struct pernet_operations ip6table_mangle_net_ops = { .pre_exit = ip6table_mangle_net_pre_exit, .exit = ip6table_mangle_net_exit, }; static int __init ip6table_mangle_init(void) { int ret = xt_register_template(&packet_mangler, ip6table_mangle_table_init); if (ret < 0) return ret; mangle_ops = xt_hook_ops_alloc(&packet_mangler, ip6table_mangle_hook); if (IS_ERR(mangle_ops)) { xt_unregister_template(&packet_mangler); return PTR_ERR(mangle_ops); } ret = register_pernet_subsys(&ip6table_mangle_net_ops); if (ret < 0) { xt_unregister_template(&packet_mangler); kfree(mangle_ops); return ret; } return ret; } static void __exit ip6table_mangle_fini(void) { unregister_pernet_subsys(&ip6table_mangle_net_ops); xt_unregister_template(&packet_mangler); kfree(mangle_ops); } module_init(ip6table_mangle_init); module_exit(ip6table_mangle_fini);
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright(c) 2017 Intel Corporation. All rights reserved.
*/ #include <linux/pagemap.h> #include <linux/module.h> #include <linux/mount.h> #include <linux/pseudo_fs.h> #include <linux/magic.h> #include <linux/cdev.h> #include <linux/slab.h> #include <linux/uio.h> #include <linux/dax.h> #include <linux/fs.h> #include <linux/cacheinfo.h> #include "dax-private.h" /** * struct dax_device - anchor object for dax services * @inode: core vfs * @cdev: optional character interface for "device dax" * @private: dax driver private data * @flags: state and boolean properties * @ops: operations for this device * @holder_data: holder of a dax_device: could be filesystem or mapped device * @holder_ops: operations for the inner holder */ struct dax_device { struct inode inode; struct cdev cdev; void *private; unsigned long flags; const struct dax_operations *ops; void *holder_data; const struct dax_holder_operations *holder_ops; }; static dev_t dax_devt; DEFINE_STATIC_SRCU(dax_srcu); static struct vfsmount *dax_mnt; static DEFINE_IDA(dax_minor_ida); static struct kmem_cache *dax_cache __read_mostly; static struct super_block *dax_superblock __read_mostly; int dax_read_lock(void) { return srcu_read_lock(&dax_srcu); } EXPORT_SYMBOL_GPL(dax_read_lock); void dax_read_unlock(int id) { srcu_read_unlock(&dax_srcu, id); } EXPORT_SYMBOL_GPL(dax_read_unlock); #if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX) #include <linux/blkdev.h> static DEFINE_XARRAY(dax_hosts); int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk) { return xa_insert(&dax_hosts, (unsigned long)disk, dax_dev, GFP_KERNEL); } EXPORT_SYMBOL_GPL(dax_add_host); void dax_remove_host(struct gendisk *disk) { xa_erase(&dax_hosts, (unsigned long)disk); } EXPORT_SYMBOL_GPL(dax_remove_host); /** * fs_dax_get_by_bdev() - temporary lookup mechanism for filesystem-dax * @bdev: block device to find a dax_device for * @start_off: returns the byte offset into the dax_device that @bdev starts * @holder: filesystem or mapped device inside the dax_device * @ops: operations for the inner holder */ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off, void *holder, const struct dax_holder_operations *ops) { struct dax_device *dax_dev; u64 part_size; int id; if (!blk_queue_dax(bdev->bd_disk->queue)) return NULL; *start_off = get_start_sect(bdev) * SECTOR_SIZE; part_size = bdev_nr_sectors(bdev) * SECTOR_SIZE; if (*start_off % PAGE_SIZE || part_size % PAGE_SIZE) { pr_info("%pg: error: unaligned partition for dax\n", bdev); return NULL; } id = dax_read_lock(); dax_dev = xa_load(&dax_hosts, (unsigned long)bdev->bd_disk); if (!dax_dev || !dax_alive(dax_dev) || !igrab(&dax_dev->inode)) dax_dev = NULL; else if (holder) { if (!cmpxchg(&dax_dev->holder_data, NULL, holder)) dax_dev->holder_ops = ops; else dax_dev = NULL; } dax_read_unlock(id); return dax_dev; } EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); void fs_put_dax(struct dax_device *dax_dev, void *holder) { if (dax_dev && holder && cmpxchg(&dax_dev->holder_data, holder, NULL) == holder) dax_dev->holder_ops = NULL; put_dax(dax_dev); } EXPORT_SYMBOL_GPL(fs_put_dax); #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */ enum dax_device_flags { /* !alive + rcu grace period == no new operations / mappings */ DAXDEV_ALIVE, /* gate whether dax_flush() calls the low level flush routine */ DAXDEV_WRITE_CACHE, /* flag to check if device supports synchronous flush */ DAXDEV_SYNC, /* do not leave the caches dirty after writes */ DAXDEV_NOCACHE, /* handle CPU fetch exceptions during reads */ DAXDEV_NOMC, }; /** * dax_direct_access() - translate a device pgoff to an 
absolute pfn * @dax_dev: a dax_device instance representing the logical memory range * @pgoff: offset in pages from the start of the device to translate * @nr_pages: number of consecutive pages caller can handle relative to @pfn * @mode: indicator on normal access or recovery write * @kaddr: output parameter that returns a virtual address mapping of pfn * @pfn: output parameter that returns an absolute pfn translation of @pgoff * * Return: negative errno if an error occurs, otherwise the number of * pages accessible at the device relative @pgoff. */ long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, void **kaddr, unsigned long *pfn) { long avail; if (!dax_dev) return -EOPNOTSUPP; if (!dax_alive(dax_dev)) return -ENXIO; if (nr_pages < 0) return -EINVAL; avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages, mode, kaddr, pfn); if (!avail) return -ERANGE; return min(avail, nr_pages); } EXPORT_SYMBOL_GPL(dax_direct_access); size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) { if (!dax_alive(dax_dev)) return 0; /* * The userspace address for the memory copy has already been validated * via access_ok() in vfs_write, so use the 'no check' version to bypass * the HARDENED_USERCOPY overhead. */ if (test_bit(DAXDEV_NOCACHE, &dax_dev->flags)) return _copy_from_iter_flushcache(addr, bytes, i); return _copy_from_iter(addr, bytes, i); } size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) { if (!dax_alive(dax_dev)) return 0; /* * The userspace address for the memory copy has already been validated * via access_ok() in vfs_red, so use the 'no check' version to bypass * the HARDENED_USERCOPY overhead. */ if (test_bit(DAXDEV_NOMC, &dax_dev->flags)) return _copy_mc_to_iter(addr, bytes, i); return _copy_to_iter(addr, bytes, i); } int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, size_t nr_pages) { int ret; if (!dax_alive(dax_dev)) return -ENXIO; /* * There are no callers that want to zero more than one page as of now. * Once users are there, this check can be removed after the * device mapper code has been updated to split ranges across targets. 
*/ if (nr_pages != 1) return -EIO; ret = dax_dev->ops->zero_page_range(dax_dev, pgoff, nr_pages); return dax_mem2blk_err(ret); } EXPORT_SYMBOL_GPL(dax_zero_page_range); size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *iter) { if (!dax_dev->ops->recovery_write) return 0; return dax_dev->ops->recovery_write(dax_dev, pgoff, addr, bytes, iter); } EXPORT_SYMBOL_GPL(dax_recovery_write); int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, u64 len, int mf_flags) { int rc, id; id = dax_read_lock(); if (!dax_alive(dax_dev)) { rc = -ENXIO; goto out; } if (!dax_dev->holder_ops) { rc = -EOPNOTSUPP; goto out; } rc = dax_dev->holder_ops->notify_failure(dax_dev, off, len, mf_flags); out: dax_read_unlock(id); return rc; } EXPORT_SYMBOL_GPL(dax_holder_notify_failure); #ifdef CONFIG_ARCH_HAS_PMEM_API void arch_wb_cache_pmem(void *addr, size_t size); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) { if (unlikely(!dax_write_cache_enabled(dax_dev))) return; arch_wb_cache_pmem(addr, size); } #else void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) { } #endif EXPORT_SYMBOL_GPL(dax_flush); void dax_write_cache(struct dax_device *dax_dev, bool wc) { if (wc) set_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); else clear_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); } EXPORT_SYMBOL_GPL(dax_write_cache); bool dax_write_cache_enabled(struct dax_device *dax_dev) { return test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); } EXPORT_SYMBOL_GPL(dax_write_cache_enabled); bool dax_synchronous(struct dax_device *dax_dev) { return test_bit(DAXDEV_SYNC, &dax_dev->flags); } EXPORT_SYMBOL_GPL(dax_synchronous); void set_dax_synchronous(struct dax_device *dax_dev) { set_bit(DAXDEV_SYNC, &dax_dev->flags); } EXPORT_SYMBOL_GPL(set_dax_synchronous); void set_dax_nocache(struct dax_device *dax_dev) { set_bit(DAXDEV_NOCACHE, &dax_dev->flags); } EXPORT_SYMBOL_GPL(set_dax_nocache); void set_dax_nomc(struct dax_device *dax_dev) { set_bit(DAXDEV_NOMC, &dax_dev->flags); } EXPORT_SYMBOL_GPL(set_dax_nomc); bool dax_alive(struct dax_device *dax_dev) { lockdep_assert_held(&dax_srcu); return test_bit(DAXDEV_ALIVE, &dax_dev->flags); } EXPORT_SYMBOL_GPL(dax_alive); /* * Note, rcu is not protecting the liveness of dax_dev, rcu is ensuring * that any fault handlers or operations that might have seen * dax_alive(), have completed. Any operations that start after * synchronize_srcu() has run will abort upon seeing !dax_alive(). * * Note, because alloc_dax() returns an ERR_PTR() on error, callers * typically store its result into a local variable in order to check * the result. Therefore, care must be taken to populate the struct * device dax_dev field make sure the dax_dev is not leaked. 
*/ void kill_dax(struct dax_device *dax_dev) { if (!dax_dev) return; if (dax_dev->holder_data != NULL) dax_holder_notify_failure(dax_dev, 0, U64_MAX, MF_MEM_PRE_REMOVE); clear_bit(DAXDEV_ALIVE, &dax_dev->flags); synchronize_srcu(&dax_srcu); /* clear holder data */ dax_dev->holder_ops = NULL; dax_dev->holder_data = NULL; } EXPORT_SYMBOL_GPL(kill_dax); void run_dax(struct dax_device *dax_dev) { set_bit(DAXDEV_ALIVE, &dax_dev->flags); } EXPORT_SYMBOL_GPL(run_dax); static struct inode *dax_alloc_inode(struct super_block *sb) { struct dax_device *dax_dev; struct inode *inode; dax_dev = alloc_inode_sb(sb, dax_cache, GFP_KERNEL); if (!dax_dev) return NULL; inode = &dax_dev->inode; inode->i_rdev = 0; return inode; } static struct dax_device *to_dax_dev(struct inode *inode) { return container_of(inode, struct dax_device, inode); } static void dax_free_inode(struct inode *inode) { struct dax_device *dax_dev = to_dax_dev(inode); if (inode->i_rdev) ida_free(&dax_minor_ida, iminor(inode)); kmem_cache_free(dax_cache, dax_dev); } static void dax_destroy_inode(struct inode *inode) { struct dax_device *dax_dev = to_dax_dev(inode); WARN_ONCE(test_bit(DAXDEV_ALIVE, &dax_dev->flags), "kill_dax() must be called before final iput()\n"); } static const struct super_operations dax_sops = { .statfs = simple_statfs, .alloc_inode = dax_alloc_inode, .destroy_inode = dax_destroy_inode, .free_inode = dax_free_inode, .drop_inode = inode_just_drop, }; static int dax_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, DAXFS_MAGIC); if (!ctx) return -ENOMEM; ctx->ops = &dax_sops; return 0; } static struct file_system_type dax_fs_type = { .name = "dax", .init_fs_context = dax_init_fs_context, .kill_sb = kill_anon_super, }; static int dax_test(struct inode *inode, void *data) { dev_t devt = *(dev_t *) data; return inode->i_rdev == devt; } static int dax_set(struct inode *inode, void *data) { dev_t devt = *(dev_t *) data; inode->i_rdev = devt; return 0; } static struct dax_device *dax_dev_get(dev_t devt) { struct dax_device *dax_dev; struct inode *inode; inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31), dax_test, dax_set, &devt); if (!inode) return NULL; dax_dev = to_dax_dev(inode); if (inode_state_read_once(inode) & I_NEW) { set_bit(DAXDEV_ALIVE, &dax_dev->flags); inode->i_cdev = &dax_dev->cdev; inode->i_mode = S_IFCHR; inode->i_flags = S_DAX; mapping_set_gfp_mask(&inode->i_data, GFP_USER); unlock_new_inode(inode); } return dax_dev; } struct dax_device *alloc_dax(void *private, const struct dax_operations *ops) { struct dax_device *dax_dev; dev_t devt; int minor; /* * Unavailable on architectures with virtually aliased data caches, * except for device-dax (NULL operations pointer), which does * not use aliased mappings from the kernel. 
*/ if (ops && cpu_dcache_is_aliasing()) return ERR_PTR(-EOPNOTSUPP); if (WARN_ON_ONCE(ops && !ops->zero_page_range)) return ERR_PTR(-EINVAL); minor = ida_alloc_max(&dax_minor_ida, MINORMASK, GFP_KERNEL); if (minor < 0) return ERR_PTR(-ENOMEM); devt = MKDEV(MAJOR(dax_devt), minor); dax_dev = dax_dev_get(devt); if (!dax_dev) goto err_dev; dax_dev->ops = ops; dax_dev->private = private; return dax_dev; err_dev: ida_free(&dax_minor_ida, minor); return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL_GPL(alloc_dax); void put_dax(struct dax_device *dax_dev) { if (!dax_dev) return; iput(&dax_dev->inode); } EXPORT_SYMBOL_GPL(put_dax); /** * dax_holder() - obtain the holder of a dax device * @dax_dev: a dax_device instance * * Return: the holder's data which represents the holder if registered, * otherwize NULL. */ void *dax_holder(struct dax_device *dax_dev) { return dax_dev->holder_data; } EXPORT_SYMBOL_GPL(dax_holder); /** * inode_dax: convert a public inode into its dax_dev * @inode: An inode with i_cdev pointing to a dax_dev * * Note this is not equivalent to to_dax_dev() which is for private * internal use where we know the inode filesystem type == dax_fs_type. */ struct dax_device *inode_dax(struct inode *inode) { struct cdev *cdev = inode->i_cdev; return container_of(cdev, struct dax_device, cdev); } EXPORT_SYMBOL_GPL(inode_dax); struct inode *dax_inode(struct dax_device *dax_dev) { return &dax_dev->inode; } EXPORT_SYMBOL_GPL(dax_inode); void *dax_get_private(struct dax_device *dax_dev) { if (!test_bit(DAXDEV_ALIVE, &dax_dev->flags)) return NULL; return dax_dev->private; } EXPORT_SYMBOL_GPL(dax_get_private); static void init_once(void *_dax_dev) { struct dax_device *dax_dev = _dax_dev; struct inode *inode = &dax_dev->inode; memset(dax_dev, 0, sizeof(*dax_dev)); inode_init_once(inode); } static int dax_fs_init(void) { int rc; dax_cache = kmem_cache_create("dax_cache", sizeof(struct dax_device), 0, SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, init_once); if (!dax_cache) return -ENOMEM; dax_mnt = kern_mount(&dax_fs_type); if (IS_ERR(dax_mnt)) { rc = PTR_ERR(dax_mnt); goto err_mount; } dax_superblock = dax_mnt->mnt_sb; return 0; err_mount: kmem_cache_destroy(dax_cache); return rc; } static void dax_fs_exit(void) { kern_unmount(dax_mnt); rcu_barrier(); kmem_cache_destroy(dax_cache); } static int __init dax_core_init(void) { int rc; rc = dax_fs_init(); if (rc) return rc; rc = alloc_chrdev_region(&dax_devt, 0, MINORMASK+1, "dax"); if (rc) goto err_chrdev; rc = dax_bus_init(); if (rc) goto err_bus; return 0; err_bus: unregister_chrdev_region(dax_devt, MINORMASK+1); err_chrdev: dax_fs_exit(); return 0; } static void __exit dax_core_exit(void) { dax_bus_exit(); unregister_chrdev_region(dax_devt, MINORMASK+1); ida_destroy(&dax_minor_ida); dax_fs_exit(); } MODULE_AUTHOR("Intel Corporation"); MODULE_DESCRIPTION("DAX: direct access to differentiated memory"); MODULE_LICENSE("GPL v2"); subsys_initcall(dax_core_init); module_exit(dax_core_exit); |
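/*
 * A small userspace sketch of the DAXDEV_* flag gating used above by the
 * set_dax_*() helpers and dax_flush(): a per-device flags word is set or
 * cleared once, and fast paths simply test it. The enum, struct and
 * function names below are illustrative, not kernel API.
 */
#include <stdbool.h>
#include <stdio.h>

enum { DEMO_WRITE_CACHE = 1u << 0 };

struct demo_dax {
	unsigned long flags;
};

static void demo_write_cache(struct demo_dax *d, bool wc)
{
	if (wc)
		d->flags |= DEMO_WRITE_CACHE;
	else
		d->flags &= ~DEMO_WRITE_CACHE;
}

static bool demo_write_cache_enabled(const struct demo_dax *d)
{
	return d->flags & DEMO_WRITE_CACHE;
}

/* Mirrors dax_flush(): skip the low level flush when the flag is clear. */
static void demo_flush(const struct demo_dax *d, const char *what)
{
	if (!demo_write_cache_enabled(d))
		return;
	printf("flushing %s\n", what);
}

int main(void)
{
	struct demo_dax d = { 0 };

	demo_flush(&d, "cold range");	/* no-op: write cache disabled */
	demo_write_cache(&d, true);
	demo_flush(&d, "dirty range");	/* now performs the flush */
	return 0;
}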
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Skb ref helpers.
 */

#ifndef _LINUX_SKBUFF_REF_H
#define _LINUX_SKBUFF_REF_H

#include <linux/skbuff.h>

/**
 * __skb_frag_ref - take an additional reference on a paged fragment.
 * @frag: the paged fragment
 *
 * Takes an additional reference on the paged fragment @frag.
 */
static __always_inline void __skb_frag_ref(skb_frag_t *frag)
{
	get_netmem(skb_frag_netmem(frag));
}

/**
 * skb_frag_ref - take an additional reference on a paged fragment of an skb.
 * @skb: the buffer
 * @f: the fragment offset
 *
 * Takes an additional reference on the @f'th paged fragment of @skb.
 */
static __always_inline void skb_frag_ref(struct sk_buff *skb, int f)
{
	__skb_frag_ref(&skb_shinfo(skb)->frags[f]);
}

bool napi_pp_put_page(netmem_ref netmem);

static __always_inline void skb_page_unref(netmem_ref netmem, bool recycle)
{
#ifdef CONFIG_PAGE_POOL
	if (recycle && napi_pp_put_page(netmem))
		return;
#endif
	put_netmem(netmem);
}

/**
 * __skb_frag_unref - release a reference on a paged fragment.
 * @frag: the paged fragment
 * @recycle: recycle the page if allocated via page_pool
 *
 * Releases a reference on the paged fragment @frag
 * or recycles the page via the page_pool API.
 */
static __always_inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
{
	skb_page_unref(skb_frag_netmem(frag), recycle);
}

/**
 * skb_frag_unref - release a reference on a paged fragment of an skb.
 * @skb: the buffer
 * @f: the fragment offset
 *
 * Releases a reference on the @f'th paged fragment of @skb.
 */
static __always_inline void skb_frag_unref(struct sk_buff *skb, int f)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);

	if (!skb_zcopy_managed(skb))
		__skb_frag_unref(&shinfo->frags[f], skb->pp_recycle);
}

#endif	/* _LINUX_SKBUFF_REF_H */
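/*
 * A userspace sketch of the release-with-optional-recycle pattern that
 * skb_page_unref() implements above: pool-backed memory is handed back to
 * the pool when recycling is allowed, and only otherwise does the final
 * put happen. The names below are illustrative, not kernel API.
 */
#include <stdbool.h>
#include <stdio.h>

struct demo_frag {
	int refcount;
	bool from_pool;	/* analogous to a page_pool-backed netmem */
};

/* Try the recycle path first; fall back to a plain put otherwise. */
static void demo_frag_unref(struct demo_frag *f, bool recycle)
{
	if (recycle && f->from_pool) {
		printf("returned to pool\n");
		return;
	}
	if (--f->refcount == 0)
		printf("freed\n");
}

int main(void)
{
	struct demo_frag pooled = { .refcount = 1, .from_pool = true };
	struct demo_frag plain = { .refcount = 1, .from_pool = false };

	demo_frag_unref(&pooled, true);	/* recycled: ownership returns to the pool */
	demo_frag_unref(&plain, true);	/* not pool-backed: falls through to the final put */
	return 0;
}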
| 2 14 1 9 14 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef RQ_QOS_H #define RQ_QOS_H #include <linux/kernel.h> #include <linux/blkdev.h> #include <linux/blk_types.h> #include <linux/atomic.h> #include <linux/wait.h> #include <linux/blk-mq.h> #include "blk-mq-debugfs.h" struct blk_mq_debugfs_attr; enum rq_qos_id { RQ_QOS_WBT, RQ_QOS_LATENCY, RQ_QOS_COST, }; struct rq_wait { wait_queue_head_t wait; atomic_t inflight; }; struct rq_qos { const struct rq_qos_ops *ops; struct gendisk *disk; enum rq_qos_id id; struct rq_qos *next; #ifdef CONFIG_BLK_DEBUG_FS struct dentry *debugfs_dir; #endif }; struct rq_qos_ops { void (*throttle)(struct rq_qos *, struct bio *); void (*track)(struct rq_qos *, struct request *, struct bio *); void (*merge)(struct rq_qos *, struct request *, struct bio *); void (*issue)(struct rq_qos *, struct request *); void (*requeue)(struct rq_qos *, struct request *); void (*done)(struct rq_qos *, struct request *); void (*done_bio)(struct rq_qos *, struct bio *); void (*cleanup)(struct rq_qos *, struct bio *); void (*queue_depth_changed)(struct rq_qos *); void (*exit)(struct rq_qos *); const struct blk_mq_debugfs_attr *debugfs_attrs; }; struct rq_depth { unsigned int max_depth; int scale_step; bool scaled_max; unsigned int queue_depth; unsigned int default_depth; }; static inline struct rq_qos *rq_qos_id(struct request_queue *q, enum rq_qos_id id) { struct rq_qos *rqos; for (rqos = q->rq_qos; rqos; rqos = rqos->next) { if (rqos->id == id) break; } return rqos; } static inline struct rq_qos *wbt_rq_qos(struct request_queue *q) { return rq_qos_id(q, RQ_QOS_WBT); } static inline struct rq_qos *iolat_rq_qos(struct request_queue *q) { return rq_qos_id(q, RQ_QOS_LATENCY); } static inline void rq_wait_init(struct rq_wait *rq_wait) { atomic_set(&rq_wait->inflight, 0); init_waitqueue_head(&rq_wait->wait); } int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, const struct rq_qos_ops *ops); void rq_qos_del(struct rq_qos *rqos); typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data); typedef void (cleanup_cb_t)(struct rq_wait *rqw, void *private_data); void rq_qos_wait(struct rq_wait *rqw, void *private_data, acquire_inflight_cb_t *acquire_inflight_cb, cleanup_cb_t *cleanup_cb); bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit); bool rq_depth_scale_up(struct rq_depth *rqd); bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle); bool rq_depth_calc_max_depth(struct rq_depth *rqd); void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio); void __rq_qos_done(struct rq_qos *rqos, struct request *rq); void __rq_qos_issue(struct rq_qos *rqos, struct request *rq); void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq); void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio); void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio 
*bio); void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio); void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio); void __rq_qos_queue_depth_changed(struct rq_qos *rqos); static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio) { if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && q->rq_qos) __rq_qos_cleanup(q->rq_qos, bio); } static inline void rq_qos_done(struct request_queue *q, struct request *rq) { if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && q->rq_qos && !blk_rq_is_passthrough(rq)) __rq_qos_done(q->rq_qos, rq); } static inline void rq_qos_issue(struct request_queue *q, struct request *rq) { if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && q->rq_qos) __rq_qos_issue(q->rq_qos, rq); } static inline void rq_qos_requeue(struct request_queue *q, struct request *rq) { if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && q->rq_qos) __rq_qos_requeue(q->rq_qos, rq); } static inline void rq_qos_done_bio(struct bio *bio) { struct request_queue *q; if (!bio->bi_bdev || (!bio_flagged(bio, BIO_QOS_THROTTLED) && !bio_flagged(bio, BIO_QOS_MERGED))) return; q = bdev_get_queue(bio->bi_bdev); /* * A BIO may carry BIO_QOS_* flags even if the associated request_queue * does not have rq_qos enabled. This can happen with stacked block * devices — for example, NVMe multipath, where it's possible that the * bottom device has QoS enabled but the top device does not. Therefore, * always verify that q->rq_qos is present and QoS is enabled before * calling __rq_qos_done_bio(). */ if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && q->rq_qos) __rq_qos_done_bio(q->rq_qos, bio); } static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) { if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && q->rq_qos) { bio_set_flag(bio, BIO_QOS_THROTTLED); __rq_qos_throttle(q->rq_qos, bio); } } static inline void rq_qos_track(struct request_queue *q, struct request *rq, struct bio *bio) { if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && q->rq_qos) __rq_qos_track(q->rq_qos, rq, bio); } static inline void rq_qos_merge(struct request_queue *q, struct request *rq, struct bio *bio) { if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && q->rq_qos) { bio_set_flag(bio, BIO_QOS_MERGED); __rq_qos_merge(q->rq_qos, rq, bio); } } static inline void rq_qos_queue_depth_changed(struct request_queue *q) { if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && q->rq_qos) __rq_qos_queue_depth_changed(q->rq_qos); } void rq_qos_exit(struct request_queue *); #endif |
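/*
 * A userspace sketch of the chained-policy dispatch that the rq_qos
 * helpers above implement: each registered policy sits on a singly linked
 * list hanging off the queue, and the __rq_qos_*() walkers give every
 * policy a chance to act. The struct and function names below are
 * illustrative, not kernel API.
 */
#include <stdio.h>

struct demo_qos;

struct demo_qos_ops {
	void (*throttle)(struct demo_qos *q, int bio_size);
};

struct demo_qos {
	const char *name;
	const struct demo_qos_ops *ops;
	struct demo_qos *next;
};

/* Mirrors __rq_qos_throttle(): walk the whole chain for each bio. */
static void demo_throttle(struct demo_qos *chain, int bio_size)
{
	for (struct demo_qos *q = chain; q; q = q->next)
		if (q->ops->throttle)
			q->ops->throttle(q, bio_size);
}

static void demo_policy_throttle(struct demo_qos *q, int bio_size)
{
	printf("%s: throttling %d-byte bio\n", q->name, bio_size);
}

static const struct demo_qos_ops demo_ops = { .throttle = demo_policy_throttle };

int main(void)
{
	struct demo_qos iolat = { .name = "iolatency", .ops = &demo_ops, .next = NULL };
	struct demo_qos wbt = { .name = "wbt", .ops = &demo_ops, .next = &iolat };

	demo_throttle(&wbt, 4096);	/* both policies on the chain see the bio */
	return 0;
}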
| 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Released under the GPLv2 only. */ #include <linux/pm.h> #include <linux/acpi.h> struct usb_hub_descriptor; struct usb_dev_state; /* Functions local to drivers/usb/core/ */ extern int usb_create_sysfs_dev_files(struct usb_device *dev); extern void usb_remove_sysfs_dev_files(struct usb_device *dev); extern void usb_create_sysfs_intf_files(struct usb_interface *intf); extern void usb_remove_sysfs_intf_files(struct usb_interface *intf); extern int usb_update_wireless_status_attr(struct usb_interface *intf); extern int usb_create_ep_devs(struct device *parent, struct usb_host_endpoint *endpoint, struct usb_device *udev); extern void usb_remove_ep_devs(struct usb_host_endpoint *endpoint); extern void usb_enable_endpoint(struct usb_device *dev, struct usb_host_endpoint *ep, bool reset_toggle); extern void usb_enable_interface(struct usb_device *dev, struct usb_interface *intf, bool reset_toggles); extern void usb_disable_endpoint(struct usb_device *dev, unsigned int epaddr, bool reset_hardware); extern void usb_disable_interface(struct usb_device *dev, struct usb_interface *intf, bool reset_hardware); extern void usb_release_interface_cache(struct kref *ref); extern void usb_disable_device(struct usb_device *dev, int skip_ep0); extern int usb_deauthorize_device(struct usb_device *); extern int usb_authorize_device(struct usb_device *); extern void usb_deauthorize_interface(struct usb_interface *); extern void usb_authorize_interface(struct usb_interface *); extern void usb_detect_quirks(struct usb_device *udev); extern void usb_detect_interface_quirks(struct usb_device *udev); extern void usb_release_quirk_list(void); extern bool usb_endpoint_is_ignored(struct usb_device *udev, struct usb_host_interface *intf, struct usb_endpoint_descriptor *epd); extern int usb_remove_device(struct usb_device *udev); extern struct usb_device_descriptor *usb_get_device_descriptor( struct usb_device *udev); extern int usb_set_isoch_delay(struct usb_device *dev); extern int usb_get_bos_descriptor(struct usb_device *dev); extern void usb_release_bos_descriptor(struct usb_device *dev); extern int usb_set_configuration(struct usb_device *dev, int configuration); extern int usb_choose_configuration(struct usb_device *udev); extern int usb_generic_driver_probe(struct usb_device *udev); extern void usb_generic_driver_disconnect(struct usb_device *udev); extern int usb_generic_driver_suspend(struct usb_device *udev, pm_message_t msg); extern int usb_generic_driver_resume(struct usb_device *udev, pm_message_t msg); static inline unsigned usb_get_max_power(struct usb_device *udev, struct usb_host_config *c) { /* SuperSpeed power is in 8 mA units; others are in 2 mA units */ unsigned mul = (udev->speed >= USB_SPEED_SUPER ? 
8 : 2); return c->desc.bMaxPower * mul; } extern void usb_kick_hub_wq(struct usb_device *dev); extern int usb_match_one_id_intf(struct usb_device *dev, struct usb_host_interface *intf, const struct usb_device_id *id); extern int usb_match_device(struct usb_device *dev, const struct usb_device_id *id); extern const struct usb_device_id *usb_device_match_id(struct usb_device *udev, const struct usb_device_id *id); extern bool usb_driver_applicable(struct usb_device *udev, const struct usb_device_driver *udrv); extern void usb_forced_unbind_intf(struct usb_interface *intf); extern void usb_unbind_and_rebind_marked_interfaces(struct usb_device *udev); extern void usb_hub_release_all_ports(struct usb_device *hdev, struct usb_dev_state *owner); extern bool usb_device_is_owned(struct usb_device *udev); extern int usb_hub_init(void); extern void usb_hub_cleanup(void); extern int usb_major_init(void); extern void usb_major_cleanup(void); extern int usb_device_supports_lpm(struct usb_device *udev); extern int usb_port_disable(struct usb_device *udev); #ifdef CONFIG_PM extern int usb_suspend(struct device *dev, pm_message_t msg); extern int usb_resume(struct device *dev, pm_message_t msg); extern int usb_resume_complete(struct device *dev); extern int usb_port_suspend(struct usb_device *dev, pm_message_t msg); extern int usb_port_resume(struct usb_device *dev, pm_message_t msg); extern void usb_autosuspend_device(struct usb_device *udev); extern int usb_autoresume_device(struct usb_device *udev); extern int usb_remote_wakeup(struct usb_device *dev); extern int usb_runtime_suspend(struct device *dev); extern int usb_runtime_resume(struct device *dev); extern int usb_runtime_idle(struct device *dev); extern int usb_enable_usb2_hardware_lpm(struct usb_device *udev); extern int usb_disable_usb2_hardware_lpm(struct usb_device *udev); extern void usbfs_notify_suspend(struct usb_device *udev); extern void usbfs_notify_resume(struct usb_device *udev); #else static inline int usb_port_suspend(struct usb_device *udev, pm_message_t msg) { return 0; } static inline int usb_port_resume(struct usb_device *udev, pm_message_t msg) { return 0; } #define usb_autosuspend_device(udev) do {} while (0) static inline int usb_autoresume_device(struct usb_device *udev) { return 0; } static inline int usb_enable_usb2_hardware_lpm(struct usb_device *udev) { return 0; } static inline int usb_disable_usb2_hardware_lpm(struct usb_device *udev) { return 0; } #endif extern const struct class usbmisc_class; extern const struct bus_type usb_bus_type; extern struct mutex usb_port_peer_mutex; extern const struct device_type usb_device_type; extern const struct device_type usb_if_device_type; extern const struct device_type usb_ep_device_type; extern const struct device_type usb_port_device_type; extern struct usb_device_driver usb_generic_driver; static inline int is_usb_device(const struct device *dev) { return dev->type == &usb_device_type; } static inline int is_usb_interface(const struct device *dev) { return dev->type == &usb_if_device_type; } static inline int is_usb_endpoint(const struct device *dev) { return dev->type == &usb_ep_device_type; } static inline int is_usb_port(const struct device *dev) { return dev->type == &usb_port_device_type; } static inline int is_root_hub(struct usb_device *udev) { return (udev->parent == NULL); } extern bool is_usb_device_driver(const struct device_driver *drv); /* for labeling diagnostics */ extern const char *usbcore_name; /* sysfs stuff */ extern const struct attribute_group 
*usb_device_groups[]; extern const struct attribute_group *usb_interface_groups[]; /* usbfs stuff */ extern struct usb_driver usbfs_driver; extern const struct file_operations usbfs_devices_fops; extern const struct file_operations usbdev_file_operations; extern int usb_devio_init(void); extern void usb_devio_cleanup(void); /* * Firmware specific cookie identifying a port's location. '0' == no location * data available */ typedef u32 usb_port_location_t; /* internal notify stuff */ extern void usb_notify_add_device(struct usb_device *udev); extern void usb_notify_remove_device(struct usb_device *udev); extern void usb_notify_add_bus(struct usb_bus *ubus); extern void usb_notify_remove_bus(struct usb_bus *ubus); extern void usb_hub_adjust_deviceremovable(struct usb_device *hdev, struct usb_hub_descriptor *desc); #ifdef CONFIG_ACPI extern int usb_acpi_register(void); extern void usb_acpi_unregister(void); extern acpi_handle usb_get_hub_port_acpi_handle(struct usb_device *hdev, int port1); #else static inline int usb_acpi_register(void) { return 0; }; static inline void usb_acpi_unregister(void) { }; #endif |
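/*
 * A small userspace sketch of the unit conversion done by
 * usb_get_max_power() above: the bMaxPower descriptor field is expressed
 * in 2 mA units up to high speed and in 8 mA units from SuperSpeed on, so
 * the same raw value means a different current budget depending on link
 * speed. The enum and function names below are illustrative, not kernel API.
 */
#include <stdio.h>

enum demo_speed { DEMO_SPEED_HIGH, DEMO_SPEED_SUPER };

static unsigned int demo_max_power_mA(enum demo_speed speed, unsigned int bMaxPower)
{
	unsigned int unit = (speed >= DEMO_SPEED_SUPER) ? 8 : 2;

	return bMaxPower * unit;
}

int main(void)
{
	printf("bMaxPower=50 @ high speed:  %u mA\n",
	       demo_max_power_mA(DEMO_SPEED_HIGH, 50));	/* 100 mA */
	printf("bMaxPower=50 @ SuperSpeed: %u mA\n",
	       demo_max_power_mA(DEMO_SPEED_SUPER, 50));	/* 400 mA */
	return 0;
}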
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_JIFFIES_H
#define _LINUX_JIFFIES_H

#include <linux/cache.h>
#include <linux/limits.h>
#include <linux/math64.h>
#include <linux/minmax.h>
#include <linux/types.h>
#include <linux/time.h>
#include <linux/timex.h>
#include <vdso/jiffies.h>
#include <asm/param.h>			/* for HZ */
#include <generated/timeconst.h>

/*
 * The following defines establish the engineering parameters of the PLL
 * model. The HZ variable establishes the timer interrupt frequency, 100 Hz
 * for the SunOS kernel, 256 Hz for the Ultrix kernel and 1024 Hz for the
 * OSF/1 kernel. The SHIFT_HZ define expresses the same value as the
 * nearest power of two in order to avoid hardware multiply operations.
*/ #if HZ >= 12 && HZ < 24 # define SHIFT_HZ 4 #elif HZ >= 24 && HZ < 48 # define SHIFT_HZ 5 #elif HZ >= 48 && HZ < 96 # define SHIFT_HZ 6 #elif HZ >= 96 && HZ < 192 # define SHIFT_HZ 7 #elif HZ >= 192 && HZ < 384 # define SHIFT_HZ 8 #elif HZ >= 384 && HZ < 768 # define SHIFT_HZ 9 #elif HZ >= 768 && HZ < 1536 # define SHIFT_HZ 10 #elif HZ >= 1536 && HZ < 3072 # define SHIFT_HZ 11 #elif HZ >= 3072 && HZ < 6144 # define SHIFT_HZ 12 #elif HZ >= 6144 && HZ < 12288 # define SHIFT_HZ 13 #else # error Invalid value of HZ. #endif /* Suppose we want to divide two numbers NOM and DEN: NOM/DEN, then we can * improve accuracy by shifting LSH bits, hence calculating: * (NOM << LSH) / DEN * This however means trouble for large NOM, because (NOM << LSH) may no * longer fit in 32 bits. The following way of calculating this gives us * some slack, under the following conditions: * - (NOM / DEN) fits in (32 - LSH) bits. * - (NOM % DEN) fits in (32 - LSH) bits. */ #define SH_DIV(NOM,DEN,LSH) ( (((NOM) / (DEN)) << (LSH)) \ + ((((NOM) % (DEN)) << (LSH)) + (DEN) / 2) / (DEN)) /* LATCH is used in the interval timer and ftape setup. */ #define LATCH ((CLOCK_TICK_RATE + HZ/2) / HZ) /* For divider */ extern void register_refined_jiffies(long clock_tick_rate); /* TICK_USEC is the time between ticks in usec */ #define TICK_USEC ((USEC_PER_SEC + HZ/2) / HZ) /* USER_TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ #define USER_TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ) /* * The 64-bit value is not atomic on 32-bit systems - you MUST NOT read it * without sampling the sequence number in jiffies_lock. * get_jiffies_64() will do this for you as appropriate. * * jiffies and jiffies_64 are at the same address for little-endian systems * and for 64-bit big-endian systems. * On 32-bit big-endian systems, jiffies is the lower 32 bits of jiffies_64 * (i.e., at address @jiffies_64 + 4). * See arch/ARCH/kernel/vmlinux.lds.S */ extern u64 __cacheline_aligned_in_smp jiffies_64; extern unsigned long volatile __cacheline_aligned_in_smp jiffies; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void); #else /** * get_jiffies_64 - read the 64-bit non-atomic jiffies_64 value * * When BITS_PER_LONG < 64, this uses sequence number sampling using * jiffies_lock to protect the 64-bit read. * * Return: current 64-bit jiffies value */ static inline u64 get_jiffies_64(void) { return (u64)jiffies; } #endif /** * DOC: General information about time_* inlines * * These inlines deal with timer wrapping correctly. You are strongly encouraged * to use them: * * #. Because people otherwise forget * #. Because if the timer wrap changes in future you won't have to alter your * driver code. */ /** * time_after - returns true if the time a is after time b. * @a: first comparable as unsigned long * @b: second comparable as unsigned long * * Do this with "<0" and ">=0" to only test the sign of the result. A * good compiler would generate better code (and a really good compiler * wouldn't care). Gcc is currently neither. * * Return: %true is time a is after time b, otherwise %false. */ #define time_after(a,b) \ (typecheck(unsigned long, a) && \ typecheck(unsigned long, b) && \ ((long)((b) - (a)) < 0)) /** * time_before - returns true if the time a is before time b. * @a: first comparable as unsigned long * @b: second comparable as unsigned long * * Return: %true is time a is before time b, otherwise %false. */ #define time_before(a,b) time_after(b,a) /** * time_after_eq - returns true if the time a is after or the same as time b. 
* @a: first comparable as unsigned long * @b: second comparable as unsigned long * * Return: %true is time a is after or the same as time b, otherwise %false. */ #define time_after_eq(a,b) \ (typecheck(unsigned long, a) && \ typecheck(unsigned long, b) && \ ((long)((a) - (b)) >= 0)) /** * time_before_eq - returns true if the time a is before or the same as time b. * @a: first comparable as unsigned long * @b: second comparable as unsigned long * * Return: %true is time a is before or the same as time b, otherwise %false. */ #define time_before_eq(a,b) time_after_eq(b,a) /** * time_in_range - Calculate whether a is in the range of [b, c]. * @a: time to test * @b: beginning of the range * @c: end of the range * * Return: %true is time a is in the range [b, c], otherwise %false. */ #define time_in_range(a,b,c) \ (time_after_eq(a,b) && \ time_before_eq(a,c)) /** * time_in_range_open - Calculate whether a is in the range of [b, c). * @a: time to test * @b: beginning of the range * @c: end of the range * * Return: %true is time a is in the range [b, c), otherwise %false. */ #define time_in_range_open(a,b,c) \ (time_after_eq(a,b) && \ time_before(a,c)) /* Same as above, but does so with platform independent 64bit types. * These must be used when utilizing jiffies_64 (i.e. return value of * get_jiffies_64()). */ /** * time_after64 - returns true if the time a is after time b. * @a: first comparable as __u64 * @b: second comparable as __u64 * * This must be used when utilizing jiffies_64 (i.e. return value of * get_jiffies_64()). * * Return: %true is time a is after time b, otherwise %false. */ #define time_after64(a,b) \ (typecheck(__u64, a) && \ typecheck(__u64, b) && \ ((__s64)((b) - (a)) < 0)) /** * time_before64 - returns true if the time a is before time b. * @a: first comparable as __u64 * @b: second comparable as __u64 * * This must be used when utilizing jiffies_64 (i.e. return value of * get_jiffies_64()). * * Return: %true is time a is before time b, otherwise %false. */ #define time_before64(a,b) time_after64(b,a) /** * time_after_eq64 - returns true if the time a is after or the same as time b. * @a: first comparable as __u64 * @b: second comparable as __u64 * * This must be used when utilizing jiffies_64 (i.e. return value of * get_jiffies_64()). * * Return: %true is time a is after or the same as time b, otherwise %false. */ #define time_after_eq64(a,b) \ (typecheck(__u64, a) && \ typecheck(__u64, b) && \ ((__s64)((a) - (b)) >= 0)) /** * time_before_eq64 - returns true if the time a is before or the same as time b. * @a: first comparable as __u64 * @b: second comparable as __u64 * * This must be used when utilizing jiffies_64 (i.e. return value of * get_jiffies_64()). * * Return: %true is time a is before or the same as time b, otherwise %false. */ #define time_before_eq64(a,b) time_after_eq64(b,a) /** * time_in_range64 - Calculate whether a is in the range of [b, c]. * @a: time to test * @b: beginning of the range * @c: end of the range * * Return: %true is time a is in the range [b, c], otherwise %false. */ #define time_in_range64(a, b, c) \ (time_after_eq64(a, b) && \ time_before_eq64(a, c)) /* * These eight macros compare jiffies[_64] and 'a' for convenience. */ /** * time_is_before_jiffies - return true if a is before jiffies * @a: time (unsigned long) to compare to jiffies * * Return: %true is time a is before jiffies, otherwise %false. 
*/ #define time_is_before_jiffies(a) time_after(jiffies, a) /** * time_is_before_jiffies64 - return true if a is before jiffies_64 * @a: time (__u64) to compare to jiffies_64 * * Return: %true is time a is before jiffies_64, otherwise %false. */ #define time_is_before_jiffies64(a) time_after64(get_jiffies_64(), a) /** * time_is_after_jiffies - return true if a is after jiffies * @a: time (unsigned long) to compare to jiffies * * Return: %true is time a is after jiffies, otherwise %false. */ #define time_is_after_jiffies(a) time_before(jiffies, a) /** * time_is_after_jiffies64 - return true if a is after jiffies_64 * @a: time (__u64) to compare to jiffies_64 * * Return: %true is time a is after jiffies_64, otherwise %false. */ #define time_is_after_jiffies64(a) time_before64(get_jiffies_64(), a) /** * time_is_before_eq_jiffies - return true if a is before or equal to jiffies * @a: time (unsigned long) to compare to jiffies * * Return: %true is time a is before or the same as jiffies, otherwise %false. */ #define time_is_before_eq_jiffies(a) time_after_eq(jiffies, a) /** * time_is_before_eq_jiffies64 - return true if a is before or equal to jiffies_64 * @a: time (__u64) to compare to jiffies_64 * * Return: %true is time a is before or the same jiffies_64, otherwise %false. */ #define time_is_before_eq_jiffies64(a) time_after_eq64(get_jiffies_64(), a) /** * time_is_after_eq_jiffies - return true if a is after or equal to jiffies * @a: time (unsigned long) to compare to jiffies * * Return: %true is time a is after or the same as jiffies, otherwise %false. */ #define time_is_after_eq_jiffies(a) time_before_eq(jiffies, a) /** * time_is_after_eq_jiffies64 - return true if a is after or equal to jiffies_64 * @a: time (__u64) to compare to jiffies_64 * * Return: %true is time a is after or the same as jiffies_64, otherwise %false. */ #define time_is_after_eq_jiffies64(a) time_before_eq64(get_jiffies_64(), a) /* * Have the 32-bit jiffies value wrap 5 minutes after boot * so jiffies wrap bugs show up earlier. */ #define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) /* * Change timeval to jiffies, trying to avoid the * most obvious overflows.. * * And some not so obvious. * * Note that we don't want to return LONG_MAX, because * for various timeout reasons we often end up having * to wait "jiffies+1" in order to guarantee that we wait * at _least_ "jiffies" - so "jiffies+1" had better still * be positive. */ #define MAX_JIFFY_OFFSET ((LONG_MAX >> 1)-1) extern unsigned long preset_lpj; /* * We want to do realistic conversions of time so we need to use the same * values the update wall clock code uses as the jiffies size. This value * is: TICK_NSEC (which is defined in timex.h). This * is a constant and is in nanoseconds. We will use scaled math * with a set of scales defined here as SEC_JIFFIE_SC, USEC_JIFFIE_SC and * NSEC_JIFFIE_SC. Note that these defines contain nothing but * constants and so are computed at compile time. SHIFT_HZ (computed in * timex.h) adjusts the scaling for different HZ values. * Scaled math??? What is that? * * Scaled math is a way to do integer math on values that would, * otherwise, either overflow, underflow, or cause undesired div * instructions to appear in the execution path. In short, we "scale" * up the operands so they take more bits (more precision, less * underflow), do the desired operation and then "scale" the result back * by the same amount. If we do the scaling by shifting we avoid the * costly mpy and the dastardly div instructions. 
* Suppose, for example, we want to convert from seconds to jiffies * where jiffies is defined in nanoseconds as NSEC_PER_JIFFIE. The * simple math is: jiff = (sec * NSEC_PER_SEC) / NSEC_PER_JIFFIE; We * observe that (NSEC_PER_SEC / NSEC_PER_JIFFIE) is a constant which we * might calculate at compile time, however, the result will only have * about 3-4 bits of precision (less for smaller values of HZ). * * So, we scale as follows: * jiff = (sec) * (NSEC_PER_SEC / NSEC_PER_JIFFIE); * jiff = ((sec) * ((NSEC_PER_SEC * SCALE)/ NSEC_PER_JIFFIE)) / SCALE; * Then we make SCALE a power of two so: * jiff = ((sec) * ((NSEC_PER_SEC << SCALE)/ NSEC_PER_JIFFIE)) >> SCALE; * Now we define: * #define SEC_CONV = ((NSEC_PER_SEC << SCALE)/ NSEC_PER_JIFFIE)) * jiff = (sec * SEC_CONV) >> SCALE; * * Often the math we use will expand beyond 32-bits so we tell C how to * do this and pass the 64-bit result of the mpy through the ">> SCALE" * which should take the result back to 32-bits. We want this expansion * to capture as much precision as possible. At the same time we don't * want to overflow so we pick the SCALE to avoid this. In this file, * that means using a different scale for each range of HZ values (as * defined in timex.h). * * For those who want to know, gcc will give a 64-bit result from a "*" * operator if the result is a long long AND at least one of the * operands is cast to long long (usually just prior to the "*" so as * not to confuse it into thinking it really has a 64-bit operand, * which, buy the way, it can do, but it takes more code and at least 2 * mpys). * We also need to be aware that one second in nanoseconds is only a * couple of bits away from overflowing a 32-bit word, so we MUST use * 64-bits to get the full range time in nanoseconds. */ /* * Here are the scales we will use. One for seconds, nanoseconds and * microseconds. * * Within the limits of cpp we do a rough cut at the SEC_JIFFIE_SC and * check if the sign bit is set. If not, we bump the shift count by 1. * (Gets an extra bit of precision where we can use it.) * We know it is set for HZ = 1024 and HZ = 100 not for 1000. * Haven't tested others. * Limits of cpp (for #if expressions) only long (no long long), but * then we only need the most signicant bit. */ #define SEC_JIFFIE_SC (31 - SHIFT_HZ) #if !((((NSEC_PER_SEC << 2) / TICK_NSEC) << (SEC_JIFFIE_SC - 2)) & 0x80000000) #undef SEC_JIFFIE_SC #define SEC_JIFFIE_SC (32 - SHIFT_HZ) #endif #define NSEC_JIFFIE_SC (SEC_JIFFIE_SC + 29) #define SEC_CONVERSION ((unsigned long)((((u64)NSEC_PER_SEC << SEC_JIFFIE_SC) +\ TICK_NSEC -1) / (u64)TICK_NSEC)) #define NSEC_CONVERSION ((unsigned long)((((u64)1 << NSEC_JIFFIE_SC) +\ TICK_NSEC -1) / (u64)TICK_NSEC)) /* * The maximum jiffy value is (MAX_INT >> 1). Here we translate that * into seconds. The 64-bit case will overflow if we are not careful, * so use the messy SH_DIV macro to do it. Still all constants. */ #if BITS_PER_LONG < 64 # define MAX_SEC_IN_JIFFIES \ (long)((u64)((u64)MAX_JIFFY_OFFSET * TICK_NSEC) / NSEC_PER_SEC) #else /* take care of overflow on 64-bit machines */ # define MAX_SEC_IN_JIFFIES \ (SH_DIV((MAX_JIFFY_OFFSET >> SEC_JIFFIE_SC) * TICK_NSEC, NSEC_PER_SEC, 1) - 1) #endif /* * Convert various time units to each other: */ #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) /** * jiffies_to_msecs - Convert jiffies to milliseconds * @j: jiffies value * * This inline version takes care of HZ in {100,250,1000}. 
* * Return: milliseconds value */ static inline unsigned int jiffies_to_msecs(const unsigned long j) { return (MSEC_PER_SEC / HZ) * j; } #else unsigned int jiffies_to_msecs(const unsigned long j); #endif #if !(USEC_PER_SEC % HZ) /** * jiffies_to_usecs - Convert jiffies to microseconds * @j: jiffies value * * Return: microseconds value */ static inline unsigned int jiffies_to_usecs(const unsigned long j) { /* * Hz usually doesn't go much further MSEC_PER_SEC. * jiffies_to_usecs() and usecs_to_jiffies() depend on that. */ BUILD_BUG_ON(HZ > USEC_PER_SEC); return (USEC_PER_SEC / HZ) * j; } #else unsigned int jiffies_to_usecs(const unsigned long j); #endif /** * jiffies_to_nsecs - Convert jiffies to nanoseconds * @j: jiffies value * * Return: nanoseconds value */ static inline u64 jiffies_to_nsecs(const unsigned long j) { return (u64)jiffies_to_usecs(j) * NSEC_PER_USEC; } extern u64 jiffies64_to_nsecs(u64 j); extern u64 jiffies64_to_msecs(u64 j); extern unsigned long __msecs_to_jiffies(const unsigned int m); #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) /* * HZ is equal to or smaller than 1000, and 1000 is a nice round * multiple of HZ, divide with the factor between them, but round * upwards: */ static inline unsigned long _msecs_to_jiffies(const unsigned int m) { return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); } #elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) /* * HZ is larger than 1000, and HZ is a nice round multiple of 1000 - * simply multiply with the factor between them. * * But first make sure the multiplication result cannot overflow: */ static inline unsigned long _msecs_to_jiffies(const unsigned int m) { if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; return m * (HZ / MSEC_PER_SEC); } #else /* * Generic case - multiply, round and divide. But first check that if * we are doing a net multiplication, that we wouldn't overflow: */ static inline unsigned long _msecs_to_jiffies(const unsigned int m) { if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) >> MSEC_TO_HZ_SHR32; } #endif /** * msecs_to_jiffies: - convert milliseconds to jiffies * @m: time in milliseconds * * conversion is done as follows: * * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) * * - 'too large' values [that would result in larger than * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. * * - all other values are converted to jiffies by either multiplying * the input value by a factor or dividing it with a factor and * handling any 32-bit overflows. * for the details see _msecs_to_jiffies() * * msecs_to_jiffies() checks for the passed in value being a constant * via __builtin_constant_p() allowing gcc to eliminate most of the * code. __msecs_to_jiffies() is called if the value passed does not * allow constant folding and the actual conversion must be done at * runtime. * The HZ range specific helpers _msecs_to_jiffies() are called both * directly here and from __msecs_to_jiffies() in the case where * constant folding is not possible. 
* * Return: jiffies value */ static __always_inline unsigned long msecs_to_jiffies(const unsigned int m) { if (__builtin_constant_p(m)) { if ((int)m < 0) return MAX_JIFFY_OFFSET; return _msecs_to_jiffies(m); } else { return __msecs_to_jiffies(m); } } /** * secs_to_jiffies: - convert seconds to jiffies * @_secs: time in seconds * * Conversion is done by simple multiplication with HZ * * secs_to_jiffies() is defined as a macro rather than a static inline * function so it can be used in static initializers. * * Return: jiffies value */ #define secs_to_jiffies(_secs) (unsigned long)((_secs) * HZ) extern unsigned long __usecs_to_jiffies(const unsigned int u); #if !(USEC_PER_SEC % HZ) static inline unsigned long _usecs_to_jiffies(const unsigned int u) { return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); } #else static inline unsigned long _usecs_to_jiffies(const unsigned int u) { return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) >> USEC_TO_HZ_SHR32; } #endif /** * usecs_to_jiffies: - convert microseconds to jiffies * @u: time in microseconds * * conversion is done as follows: * * - 'too large' values [that would result in larger than * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. * * - all other values are converted to jiffies by either multiplying * the input value by a factor or dividing it with a factor and * handling any 32-bit overflows as for msecs_to_jiffies. * * usecs_to_jiffies() checks for the passed in value being a constant * via __builtin_constant_p() allowing gcc to eliminate most of the * code. __usecs_to_jiffies() is called if the value passed does not * allow constant folding and the actual conversion must be done at * runtime. * The HZ range specific helpers _usecs_to_jiffies() are called both * directly here and from __msecs_to_jiffies() in the case where * constant folding is not possible. * * Return: jiffies value */ static __always_inline unsigned long usecs_to_jiffies(const unsigned int u) { if (__builtin_constant_p(u)) { if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; return _usecs_to_jiffies(u); } else { return __usecs_to_jiffies(u); } } extern unsigned long timespec64_to_jiffies(const struct timespec64 *value); extern void jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value); extern clock_t jiffies_to_clock_t(unsigned long x); static inline clock_t jiffies_delta_to_clock_t(long delta) { return jiffies_to_clock_t(max(0L, delta)); } static inline unsigned int jiffies_delta_to_msecs(long delta) { return jiffies_to_msecs(max(0L, delta)); } extern unsigned long clock_t_to_jiffies(unsigned long x); extern u64 jiffies_64_to_clock_t(u64 x); extern u64 nsec_to_clock_t(u64 x); extern u64 nsecs_to_jiffies64(u64 n); extern unsigned long nsecs_to_jiffies(u64 n); #define TIMESTAMP_SIZE 30 struct ctl_table; int proc_dointvec_jiffies(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos); int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos); int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos); int proc_dointvec_ms_jiffies(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos); int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos); #endif |
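/*
 * A userspace sketch of the wrap-safe comparison behind the time_after()
 * family above: comparing via the sign of the subtraction keeps working
 * across a counter wrap, as long as the two values are less than half the
 * counter range apart. The function and variable names below are
 * illustrative, not kernel API.
 */
#include <stdio.h>

static int demo_time_after(unsigned long a, unsigned long b)
{
	return (long)(b - a) < 0;	/* "a is after b" */
}

int main(void)
{
	unsigned long before_wrap = (unsigned long)-10;	/* just before the counter wraps */
	unsigned long after_wrap = 5;			/* shortly after the wrap */

	/* A plain ">" says no; the signed delta correctly says yes. */
	printf("naive:     %d\n", after_wrap > before_wrap);
	printf("wrap-safe: %d\n", demo_time_after(after_wrap, before_wrap));
	return 0;
}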
2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 
// SPDX-License-Identifier: GPL-2.0
/*
 * Kernel timekeeping code and accessor functions. Based on code from
 * timer.c, moved in commit 8524070b7982.
 */
#include <linux/audit.h>
#include <linux/clocksource.h>
#include <linux/compiler.h>
#include <linux/jiffies.h>
#include <linux/kobject.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/pvclock_gtod.h>
#include <linux/random.h>
#include <linux/sched/clock.h>
#include <linux/sched/loadavg.h>
#include <linux/static_key.h>
#include <linux/stop_machine.h>
#include <linux/syscore_ops.h>
#include <linux/tick.h>
#include <linux/time.h>
#include <linux/timex.h>
#include <linux/timekeeper_internal.h>
#include <vdso/auxclock.h>

#include "tick-internal.h"
#include "timekeeping_internal.h"
#include "ntp_internal.h"

#define TK_CLEAR_NTP		(1 << 0)
#define TK_CLOCK_WAS_SET	(1 << 1)
#define TK_UPDATE_ALL		(TK_CLEAR_NTP | TK_CLOCK_WAS_SET)

enum timekeeping_adv_mode {
	/* Update timekeeper when a tick has passed */
	TK_ADV_TICK,

	/* Update timekeeper on a direct frequency change */
	TK_ADV_FREQ
};

/*
 * The most important data for readout fits into a single 64 byte
 * cache line.
 */
struct tk_data {
	seqcount_raw_spinlock_t	seq;
	struct timekeeper	timekeeper;
	struct timekeeper	shadow_timekeeper;
	raw_spinlock_t		lock;
} ____cacheline_aligned;

static struct tk_data timekeeper_data[TIMEKEEPERS_MAX];

/* The core timekeeper */
#define tk_core (timekeeper_data[TIMEKEEPER_CORE])

#ifdef CONFIG_POSIX_AUX_CLOCKS
static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts)
{
	return ktime_get_aux_ts64(CLOCK_AUX + tkid - TIMEKEEPER_AUX_FIRST, ts);
}

static inline bool tk_is_aux(const struct timekeeper *tk)
{
	return tk->id >= TIMEKEEPER_AUX_FIRST && tk->id <= TIMEKEEPER_AUX_LAST;
}
#else
static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts)
{
	return false;
}

static inline bool tk_is_aux(const struct timekeeper *tk)
{
	return false;
}
#endif

static inline void tk_update_aux_offs(struct timekeeper *tk, ktime_t offs)
{
	tk->offs_aux = offs;
	tk->monotonic_to_aux = ktime_to_timespec64(offs);
}

/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;

/**
 * struct tk_fast - NMI safe timekeeper
 * @seq:	Sequence counter for protecting updates. The lowest bit
 *		is the index for the tk_read_base array
 * @base:	tk_read_base array. Access is indexed by the lowest bit of
 *		@seq.
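 *
 * For illustration, a reader roughly does (retry loop omitted, see
 * __ktime_get_fast_ns() below):
 *
 *	seq = read_seqcount_latch(&tkf->seq);
 *	tkr = tkf->base + (seq & 0x01);
 *	now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);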
* * See @update_fast_timekeeper() below. */ struct tk_fast { seqcount_latch_t seq; struct tk_read_base base[2]; }; /* Suspend-time cycles value for halted fast timekeeper. */ static u64 cycles_at_suspend; static u64 dummy_clock_read(struct clocksource *cs) { if (timekeeping_suspended) return cycles_at_suspend; return local_clock(); } static struct clocksource dummy_clock = { .read = dummy_clock_read, }; /* * Boot time initialization which allows local_clock() to be utilized * during early boot when clocksources are not available. local_clock() * returns nanoseconds already so no conversion is required, hence mult=1 * and shift=0. When the first proper clocksource is installed then * the fast time keepers are updated with the correct values. */ #define FAST_TK_INIT \ { \ .clock = &dummy_clock, \ .mask = CLOCKSOURCE_MASK(64), \ .mult = 1, \ .shift = 0, \ } static struct tk_fast tk_fast_mono ____cacheline_aligned = { .seq = SEQCNT_LATCH_ZERO(tk_fast_mono.seq), .base[0] = FAST_TK_INIT, .base[1] = FAST_TK_INIT, }; static struct tk_fast tk_fast_raw ____cacheline_aligned = { .seq = SEQCNT_LATCH_ZERO(tk_fast_raw.seq), .base[0] = FAST_TK_INIT, .base[1] = FAST_TK_INIT, }; #ifdef CONFIG_POSIX_AUX_CLOCKS static __init void tk_aux_setup(void); static void tk_aux_update_clocksource(void); static void tk_aux_advance(void); #else static inline void tk_aux_setup(void) { } static inline void tk_aux_update_clocksource(void) { } static inline void tk_aux_advance(void) { } #endif unsigned long timekeeper_lock_irqsave(void) { unsigned long flags; raw_spin_lock_irqsave(&tk_core.lock, flags); return flags; } void timekeeper_unlock_irqrestore(unsigned long flags) { raw_spin_unlock_irqrestore(&tk_core.lock, flags); } /* * Multigrain timestamps require tracking the latest fine-grained timestamp * that has been issued, and never returning a coarse-grained timestamp that is * earlier than that value. * * mg_floor represents the latest fine-grained time that has been handed out as * a file timestamp on the system. This is tracked as a monotonic ktime_t, and * converted to a realtime clock value on an as-needed basis. * * Maintaining mg_floor ensures the multigrain interfaces never issue a * timestamp earlier than one that has been previously issued. * * The exception to this rule is when there is a backward realtime clock jump. If * such an event occurs, a timestamp can appear to be earlier than a previous one. */ static __cacheline_aligned_in_smp atomic64_t mg_floor; static inline void tk_normalize_xtime(struct timekeeper *tk) { while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift; tk->xtime_sec++; } while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) { tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift; tk->raw_sec++; } } static inline struct timespec64 tk_xtime(const struct timekeeper *tk) { struct timespec64 ts; ts.tv_sec = tk->xtime_sec; ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); return ts; } static inline struct timespec64 tk_xtime_coarse(const struct timekeeper *tk) { struct timespec64 ts; ts.tv_sec = tk->xtime_sec; ts.tv_nsec = tk->coarse_nsec; return ts; } /* * Update the nanoseconds part for the coarse time keepers. They can't rely * on xtime_nsec because xtime_nsec could be adjusted by a small negative * amount when the multiplication factor of the clock is adjusted, which * could cause the coarse clocks to go slightly backwards. 
See * timekeeping_apply_adjustment(). Thus we keep a separate copy for the coarse * clockids which only is updated when the clock has been set or we have * accumulated time. */ static inline void tk_update_coarse_nsecs(struct timekeeper *tk) { tk->coarse_nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; } static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec = ts->tv_sec; tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift; tk_update_coarse_nsecs(tk); } static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec += ts->tv_sec; tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift; tk_normalize_xtime(tk); tk_update_coarse_nsecs(tk); } static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) { struct timespec64 tmp; /* * Verify consistency of: offset_real = -wall_to_monotonic * before modifying anything */ set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec, -tk->wall_to_monotonic.tv_nsec); WARN_ON_ONCE(tk->offs_real != timespec64_to_ktime(tmp)); tk->wall_to_monotonic = wtm; set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec); /* Paired with READ_ONCE() in ktime_mono_to_any() */ WRITE_ONCE(tk->offs_real, timespec64_to_ktime(tmp)); WRITE_ONCE(tk->offs_tai, ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0))); } static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) { /* Paired with READ_ONCE() in ktime_mono_to_any() */ WRITE_ONCE(tk->offs_boot, ktime_add(tk->offs_boot, delta)); /* * Timespec representation for VDSO update to avoid 64bit division * on every update. */ tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot); } #ifdef CONFIG_ARCH_WANTS_CLOCKSOURCE_READ_INLINE #include <asm/clock_inlined.h> static DEFINE_STATIC_KEY_FALSE(clocksource_read_inlined); /* * tk_clock_read - atomic clocksource read() helper * * This helper is necessary to use in the read paths because, while the * seqcount ensures we don't return a bad value while structures are updated, * it doesn't protect from potential crashes. There is the possibility that * the tkr's clocksource may change between the read reference, and the * clock reference passed to the read function. This can cause crashes if * the wrong clocksource is passed to the wrong read function. * This isn't necessary to use when holding the tk_core.lock or doing * a read of the fast-timekeeper tkrs (which is protected by its own locking * and update logic). */ static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr) { struct clocksource *clock = READ_ONCE(tkr->clock); if (static_branch_likely(&clocksource_read_inlined)) return arch_inlined_clocksource_read(clock); return clock->read(clock); } static inline void clocksource_disable_inline_read(void) { static_branch_disable(&clocksource_read_inlined); } static inline void clocksource_enable_inline_read(void) { static_branch_enable(&clocksource_read_inlined); } #else static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr) { struct clocksource *clock = READ_ONCE(tkr->clock); return clock->read(clock); } static inline void clocksource_disable_inline_read(void) { } static inline void clocksource_enable_inline_read(void) { } #endif /** * tk_setup_internals - Set up internals to use clocksource clock. * * @tk: The target timekeeper to setup. * @clock: Pointer to clocksource. * * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment * pair and interval request. 
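 *
 * For illustration, the computed tick interval is roughly
 *
 *	cycle_interval ~= (NTP_INTERVAL_LENGTH << clock->shift) / clock->mult
 *
 * rounded to the nearest cycle and forced to be at least 1.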
* * Unless you're the timekeeping code, you should not be using this! */ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) { u64 interval; u64 tmp, ntpinterval; struct clocksource *old_clock; ++tk->cs_was_changed_seq; old_clock = tk->tkr_mono.clock; tk->tkr_mono.clock = clock; tk->tkr_mono.mask = clock->mask; tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono); tk->tkr_raw.clock = clock; tk->tkr_raw.mask = clock->mask; tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last; /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; tmp <<= clock->shift; ntpinterval = tmp; tmp += clock->mult/2; do_div(tmp, clock->mult); if (tmp == 0) tmp = 1; interval = (u64) tmp; tk->cycle_interval = interval; /* Go back from cycles -> shifted ns */ tk->xtime_interval = interval * clock->mult; tk->xtime_remainder = ntpinterval - tk->xtime_interval; tk->raw_interval = interval * clock->mult; /* if changing clocks, convert xtime_nsec shift units */ if (old_clock) { int shift_change = clock->shift - old_clock->shift; if (shift_change < 0) { tk->tkr_mono.xtime_nsec >>= -shift_change; tk->tkr_raw.xtime_nsec >>= -shift_change; } else { tk->tkr_mono.xtime_nsec <<= shift_change; tk->tkr_raw.xtime_nsec <<= shift_change; } } tk->tkr_mono.shift = clock->shift; tk->tkr_raw.shift = clock->shift; tk->ntp_error = 0; tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; tk->ntp_tick = ntpinterval << tk->ntp_error_shift; /* * The timekeeper keeps its own mult values for the currently * active clocksource. These value will be adjusted via NTP * to counteract clock drifting. */ tk->tkr_mono.mult = clock->mult; tk->tkr_raw.mult = clock->mult; tk->ntp_err_mult = 0; tk->skip_second_overflow = 0; tk->cs_id = clock->id; /* Coupled clockevent data */ if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED) && clock->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT) { /* * Aim for an one hour maximum delta and use KHz to handle * clocksources with a frequency above 4GHz correctly as * the frequency argument of clocks_calc_mult_shift() is u32. */ clocks_calc_mult_shift(&tk->cs_ns_to_cyc_mult, &tk->cs_ns_to_cyc_shift, NSEC_PER_MSEC, clock->freq_khz, 3600 * 1000); /* * Initialize the conversion limit as the previous clocksource * might have the same shift/mult pair so the quick check in * tk_update_ns_to_cyc() fails to update it after a clocksource * change leaving it effectivly zero. */ tk->cs_ns_to_cyc_maxns = div_u64(clock->mask, tk->cs_ns_to_cyc_mult); } } /* Timekeeper helper functions. */ static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta) { return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift); } static __always_inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) { /* Calculate the delta since the last update_wall_time() */ u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask; /* * This detects both negative motion and the case where the delta * overflows the multiplication with tkr->mult. */ if (unlikely(delta > tkr->clock->max_cycles)) { /* * Handle clocksource inconsistency between CPUs to prevent * time from going backwards by checking for the MSB of the * mask being set in the delta. 
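		 * E.g. with a full 64-bit mask, ~(mask >> 1) keeps only the
		 * top bit, so a delta with that bit set is treated as
		 * negative motion and the nanoseconds of the last update
		 * are returned instead.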
*/ if (delta & ~(mask >> 1)) return tkr->xtime_nsec >> tkr->shift; return delta_to_ns_safe(tkr, delta); } return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift; } static __always_inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) { return timekeeping_cycles_to_ns(tkr, tk_clock_read(tkr)); } /** * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. * @tkr: Timekeeping readout base from which we take the update * @tkf: Pointer to NMI safe timekeeper * * We want to use this from any context including NMI and tracing / * instrumenting the timekeeping code itself. * * Employ the latch technique; see @write_seqcount_latch. * * So if a NMI hits the update of base[0] then it will use base[1] * which is still consistent. In the worst case this can result is a * slightly wrong timestamp (a few nanoseconds). See * @ktime_get_mono_fast_ns. */ static void update_fast_timekeeper(const struct tk_read_base *tkr, struct tk_fast *tkf) { struct tk_read_base *base = tkf->base; /* Force readers off to base[1] */ write_seqcount_latch_begin(&tkf->seq); /* Update base[0] */ memcpy(base, tkr, sizeof(*base)); /* Force readers back to base[0] */ write_seqcount_latch(&tkf->seq); /* Update base[1] */ memcpy(base + 1, base, sizeof(*base)); write_seqcount_latch_end(&tkf->seq); } static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) { struct tk_read_base *tkr; unsigned int seq; u64 now; do { seq = read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base); now += timekeeping_get_ns(tkr); } while (read_seqcount_latch_retry(&tkf->seq, seq)); return now; } /** * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic * * This timestamp is not guaranteed to be monotonic across an update. * The timestamp is calculated by: * * now = base_mono + clock_delta * slope * * So if the update lowers the slope, readers who are forced to the * not yet updated second array are still using the old steeper slope. * * tmono * ^ * | o n * | o n * | u * | o * |o * |12345678---> reader order * * o = old slope * u = update * n = new slope * * So reader 6 will observe time going backwards versus reader 5. * * While other CPUs are likely to be able to observe that, the only way * for a CPU local observation is when an NMI hits in the middle of * the update. Timestamps taken from that NMI context might be ahead * of the following timestamps. Callers need to be aware of that and * deal with it. */ u64 notrace ktime_get_mono_fast_ns(void) { return __ktime_get_fast_ns(&tk_fast_mono); } EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); /** * ktime_get_raw_fast_ns - Fast NMI safe access to clock monotonic raw * * Contrary to ktime_get_mono_fast_ns() this is always correct because the * conversion factor is not affected by NTP/PTP correction. */ u64 notrace ktime_get_raw_fast_ns(void) { return __ktime_get_fast_ns(&tk_fast_raw); } EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); /** * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock. * * To keep it NMI safe since we're accessing from tracing, we're not using a * separate timekeeper with updates to monotonic clock and boot offset * protected with seqcounts. This has the following minor side effects: * * (1) Its possible that a timestamp be taken after the boot offset is updated * but before the timekeeper is updated. 
If this happens, the new boot offset * is added to the old timekeeping making the clock appear to update slightly * earlier: * CPU 0 CPU 1 * timekeeping_inject_sleeptime64() * __timekeeping_inject_sleeptime(tk, delta); * timestamp(); * timekeeping_update_staged(tkd, TK_CLEAR_NTP...); * * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be * partially updated. Since the tk->offs_boot update is a rare event, this * should be a rare occurrence which postprocessing should be able to handle. * * The caveats vs. timestamp ordering as documented for ktime_get_mono_fast_ns() * apply as well. */ u64 notrace ktime_get_boot_fast_ns(void) { struct timekeeper *tk = &tk_core.timekeeper; return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_boot))); } EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); /** * ktime_get_tai_fast_ns - NMI safe and fast access to tai clock. * * The same limitations as described for ktime_get_boot_fast_ns() apply. The * mono time and the TAI offset are not read atomically which may yield wrong * readouts. However, an update of the TAI offset is an rare event e.g., caused * by settime or adjtimex with an offset. The user of this function has to deal * with the possibility of wrong timestamps in post processing. */ u64 notrace ktime_get_tai_fast_ns(void) { struct timekeeper *tk = &tk_core.timekeeper; return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_tai))); } EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns); /** * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. * * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering. */ u64 ktime_get_real_fast_ns(void) { struct tk_fast *tkf = &tk_fast_mono; struct tk_read_base *tkr; u64 baser, delta; unsigned int seq; do { seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); baser = ktime_to_ns(tkr->base_real); delta = timekeeping_get_ns(tkr); } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); return baser + delta; } EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns); /** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. * * It generally is unsafe to access the clocksource after timekeeping has been * suspended, so take a snapshot of the readout base of @tk and use it as the * fast timekeeper's readout base while suspended. It will return the same * number of cycles every time until timekeeping is resumed at which time the * proper readout base for the fast timekeeper will be restored automatically. 
*/ static void halt_fast_timekeeper(const struct timekeeper *tk) { static struct tk_read_base tkr_dummy; const struct tk_read_base *tkr = &tk->tkr_mono; memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); cycles_at_suspend = tk_clock_read(tkr); tkr_dummy.clock = &dummy_clock; tkr_dummy.base_real = tkr->base + tk->offs_real; update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); tkr = &tk->tkr_raw; memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); tkr_dummy.clock = &dummy_clock; update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); } static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) { raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk); } /** * pvclock_gtod_register_notifier - register a pvclock timedata update listener * @nb: Pointer to the notifier block to register */ int pvclock_gtod_register_notifier(struct notifier_block *nb) { struct timekeeper *tk = &tk_core.timekeeper; int ret; guard(raw_spinlock_irqsave)(&tk_core.lock); ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); update_pvclock_gtod(tk, true); return ret; } EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); /** * pvclock_gtod_unregister_notifier - unregister a pvclock * timedata update listener * @nb: Pointer to the notifier block to unregister */ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) { guard(raw_spinlock_irqsave)(&tk_core.lock); return raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); } EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); /* * tk_update_leap_state - helper to update the next_leap_ktime */ static inline void tk_update_leap_state(struct timekeeper *tk) { tk->next_leap_ktime = ntp_get_next_leap(tk->id); if (tk->next_leap_ktime != KTIME_MAX) /* Convert to monotonic time */ tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real); } /* * Leap state update for both shadow and the real timekeeper * Separate to spare a full memcpy() of the timekeeper. */ static void tk_update_leap_state_all(struct tk_data *tkd) { write_seqcount_begin(&tkd->seq); tk_update_leap_state(&tkd->shadow_timekeeper); tkd->timekeeper.next_leap_ktime = tkd->shadow_timekeeper.next_leap_ktime; write_seqcount_end(&tkd->seq); } /* * Update the ktime_t based scalar nsec members of the timekeeper */ static inline void tk_update_ktime_data(struct timekeeper *tk) { u64 seconds; u32 nsec; /* * The xtime based monotonic readout is: * nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now(); * The ktime based monotonic readout is: * nsec = base_mono + now(); * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec */ seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); nsec = (u32) tk->wall_to_monotonic.tv_nsec; tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); /* * The sum of the nanoseconds portions of xtime and * wall_to_monotonic can be greater/equal one second. Take * this into account before updating tk->ktime_sec. 
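	 * For example, an xtime nanoseconds part of 0.7s plus a
	 * wall_to_monotonic nanoseconds part of 0.6s crosses the next
	 * second, so ktime_sec ends up one higher than the plain sum of
	 * the seconds fields.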
*/ nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); if (nsec >= NSEC_PER_SEC) seconds++; tk->ktime_sec = seconds; /* Update the monotonic raw base */ tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC); } static inline void tk_update_ns_to_cyc(struct timekeeper *tks, struct timekeeper *tkc) { struct tk_read_base *tkrs = &tks->tkr_mono; struct tk_read_base *tkrc = &tkc->tkr_mono; unsigned int shift; if (!IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED) || !(tkrs->clock->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT)) return; if (tkrs->mult == tkrc->mult && tkrs->shift == tkrc->shift) return; /* * The conversion math is simple: * * CS::MULT (1 << NS_TO_CYC_SHIFT) * --------------- = ---------------------- * (1 << CS:SHIFT) NS_TO_CYC_MULT * * Ergo: * * NS_TO_CYC_MULT = (1 << (CS::SHIFT + NS_TO_CYC_SHIFT)) / CS::MULT * * NS_TO_CYC_SHIFT has been set up in tk_setup_internals() */ shift = tkrs->shift + tks->cs_ns_to_cyc_shift; tks->cs_ns_to_cyc_mult = (u32)div_u64(1ULL << shift, tkrs->mult); tks->cs_ns_to_cyc_maxns = div_u64(tkrs->clock->mask, tks->cs_ns_to_cyc_mult); } /* * Restore the shadow timekeeper from the real timekeeper. */ static void timekeeping_restore_shadow(struct tk_data *tkd) { lockdep_assert_held(&tkd->lock); memcpy(&tkd->shadow_timekeeper, &tkd->timekeeper, sizeof(tkd->timekeeper)); } static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action) { struct timekeeper *tk = &tkd->shadow_timekeeper; lockdep_assert_held(&tkd->lock); /* * Block out readers before running the updates below because that * updates VDSO and other time related infrastructure. Not blocking * the readers might let a reader see time going backwards when * reading from the VDSO after the VDSO update and then reading in * the kernel from the timekeeper before that got updated. */ write_seqcount_begin(&tkd->seq); if (action & TK_CLEAR_NTP) { tk->ntp_error = 0; ntp_clear(tk->id); } tk_update_leap_state(tk); tk_update_ktime_data(tk); tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; if (tk->id == TIMEKEEPER_CORE) { tk_update_ns_to_cyc(tk, &tkd->timekeeper); update_vsyscall(tk); update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); } else if (tk_is_aux(tk)) { vdso_time_update_aux(tk); } if (action & TK_CLOCK_WAS_SET) tk->clock_was_set_seq++; /* * Update the real timekeeper. * * We could avoid this memcpy() by switching pointers, but that has * the downside that the reader side does not longer benefit from * the cacheline optimized data layout of the timekeeper and requires * another indirection. */ memcpy(&tkd->timekeeper, tk, sizeof(*tk)); write_seqcount_end(&tkd->seq); } /** * timekeeping_forward_now - update clock to the current time * @tk: Pointer to the timekeeper to update * * Forward the current clock to update its state since the last call to * update_wall_time(). This is useful before significant clock changes, * as it avoids having to deal with this time offset explicitly. */ static void timekeeping_forward_now(struct timekeeper *tk) { u64 cycle_now, delta; cycle_now = tk_clock_read(&tk->tkr_mono); delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask, tk->tkr_mono.clock->max_raw_delta); tk->tkr_mono.cycle_last = cycle_now; tk->tkr_raw.cycle_last = cycle_now; while (delta > 0) { u64 max = tk->tkr_mono.clock->max_cycles; u64 incr = delta < max ? 
delta : max; tk->tkr_mono.xtime_nsec += incr * tk->tkr_mono.mult; tk->tkr_raw.xtime_nsec += incr * tk->tkr_raw.mult; tk_normalize_xtime(tk); delta -= incr; } tk_update_coarse_nsecs(tk); } /* * ktime_expiry_to_cycles - Convert a expiry time to clocksource cycles * @id: Clocksource ID which is required for validity * @expires_ns: Absolute CLOCK_MONOTONIC expiry time (nsecs) to be converted * @cycles: Pointer to storage for corresponding absolute cycles value * * Convert a CLOCK_MONOTONIC based absolute expiry time to a cycles value * based on the correlated clocksource of the clockevent device by using * the base nanoseconds and cycles values of the last timekeeper update and * converting the delta between @expires_ns and base nanoseconds to cycles. * * This only works for clockevent devices which are using a less than or * equal comparator against the clocksource. * * Utilizing this avoids two clocksource reads for such devices, the * ktime_get() in clockevents_program_event() to calculate the delta expiry * value and the readout in the device::set_next_event() callback to * convert the delta back to a absolute comparator value. * * Returns: True if @id matches the current clocksource ID, false otherwise */ bool ktime_expiry_to_cycles(enum clocksource_ids id, ktime_t expires_ns, u64 *cycles) { struct timekeeper *tk = &tk_core.timekeeper; struct tk_read_base *tkrm = &tk->tkr_mono; ktime_t base_ns, delta_ns, max_ns; u64 base_cycles, delta_cycles; unsigned int seq; u32 mult, shift; /* * Racy check to avoid the seqcount overhead when ID does not match. If * the relevant clocksource is installed concurrently, then this will * just delay the switch over to this mechanism until the next event is * programmed. If the ID is not matching the clock events code will use * the regular relative set_next_event() callback as before. */ if (data_race(tk->cs_id) != id) return false; do { seq = read_seqcount_begin(&tk_core.seq); if (tk->cs_id != id) return false; base_cycles = tkrm->cycle_last; base_ns = tkrm->base + (tkrm->xtime_nsec >> tkrm->shift); mult = tk->cs_ns_to_cyc_mult; shift = tk->cs_ns_to_cyc_shift; max_ns = tk->cs_ns_to_cyc_maxns; } while (read_seqcount_retry(&tk_core.seq, seq)); /* Prevent negative deltas and multiplication overflows */ delta_ns = min(expires_ns - base_ns, max_ns); delta_ns = max(delta_ns, 0); /* Convert to cycles */ delta_cycles = ((u64)delta_ns * mult) >> shift; *cycles = base_cycles + delta_cycles; return true; } /** * ktime_get_real_ts64 - Returns the time of day in a timespec64. * @ts: pointer to the timespec to be set * * Returns the time of day in a timespec64 (WARN if suspended). 
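 *
 * A typical caller might do, for example:
 *
 *	struct timespec64 ts;
 *
 *	ktime_get_real_ts64(&ts);
 *	pr_info("wall time: %lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);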
*/ void ktime_get_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); } EXPORT_SYMBOL(ktime_get_real_ts64); ktime_t ktime_get(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); base = tk->tkr_mono.base; nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get); u32 ktime_get_resolution_ns(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u32 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); nsecs = tk->tkr_mono.mult >> tk->tkr_mono.shift; } while (read_seqcount_retry(&tk_core.seq, seq)); return nsecs; } EXPORT_SYMBOL_GPL(ktime_get_resolution_ns); static const ktime_t *const offsets[TK_OFFS_MAX] = { [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai, }; ktime_t ktime_get_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; const ktime_t *offset = offsets[offs]; unsigned int seq; ktime_t base; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr_mono.base, *offset); nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get_with_offset); ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; const ktime_t *offset = offsets[offs]; unsigned int seq; ktime_t base; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr_mono.base, *offset); nsecs = tk->coarse_nsec; } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); /** * ktime_mono_to_any() - convert monotonic time to any other time * @tmono: time to convert. * @offs: which offset to use */ ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) { const ktime_t *offset = offsets[offs]; unsigned int seq; ktime_t tconv; if (IS_ENABLED(CONFIG_64BIT)) { /* * Paired with WRITE_ONCE()s in tk_set_wall_to_mono() and * tk_update_sleep_time(). 
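		 * On 64-bit a single READ_ONCE() of the offset is atomic,
		 * so only the 32-bit path below needs the seqcount loop.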
*/ return ktime_add(tmono, READ_ONCE(*offset)); } do { seq = read_seqcount_begin(&tk_core.seq); tconv = ktime_add(tmono, *offset); } while (read_seqcount_retry(&tk_core.seq, seq)); return tconv; } EXPORT_SYMBOL_GPL(ktime_mono_to_any); /** * ktime_get_raw - Returns the raw monotonic time in ktime_t format */ ktime_t ktime_get_raw(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); base = tk->tkr_raw.base; nsecs = timekeeping_get_ns(&tk->tkr_raw); } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get_raw); /** * ktime_get_ts64 - get the monotonic clock in timespec64 format * @ts: pointer to timespec variable * * The function calculates the monotonic clock from the realtime * clock and the wall_to_monotonic offset and stores the result * in normalized timespec64 format in the variable pointed to by @ts. */ void ktime_get_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 tomono; unsigned int seq; u64 nsec; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(&tk->tkr_mono); tomono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); ts->tv_sec += tomono.tv_sec; ts->tv_nsec = 0; timespec64_add_ns(ts, nsec + tomono.tv_nsec); } EXPORT_SYMBOL_GPL(ktime_get_ts64); /** * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC * * Returns the seconds portion of CLOCK_MONOTONIC with a single non * serialized read. tk->ktime_sec is of type 'unsigned long' so this * works on both 32 and 64 bit systems. On 32 bit systems the readout * covers ~136 years of uptime which should be enough to prevent * premature wrap arounds. */ time64_t ktime_get_seconds(void) { struct timekeeper *tk = &tk_core.timekeeper; WARN_ON(timekeeping_suspended); return tk->ktime_sec; } EXPORT_SYMBOL_GPL(ktime_get_seconds); /** * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME * * Returns the wall clock seconds since 1970. * * For 64bit systems the fast access to tk->xtime_sec is preserved. On * 32bit systems the access must be protected with the sequence * counter to provide "atomic" access to the 64bit tk->xtime_sec * value. */ time64_t ktime_get_real_seconds(void) { struct timekeeper *tk = &tk_core.timekeeper; time64_t seconds; unsigned int seq; if (IS_ENABLED(CONFIG_64BIT)) return tk->xtime_sec; do { seq = read_seqcount_begin(&tk_core.seq); seconds = tk->xtime_sec; } while (read_seqcount_retry(&tk_core.seq, seq)); return seconds; } EXPORT_SYMBOL_GPL(ktime_get_real_seconds); /** * __ktime_get_real_seconds - Unprotected access to CLOCK_REALTIME seconds * * The same as ktime_get_real_seconds() but without the sequence counter * protection. This function is used in restricted contexts like the x86 MCE * handler and in KGDB. It's unprotected on 32-bit vs. concurrent half * completed modification and only to be used for such critical contexts. 
* * Returns: Racy snapshot of the CLOCK_REALTIME seconds value */ noinstr time64_t __ktime_get_real_seconds(void) { struct timekeeper *tk = &tk_core.timekeeper; return tk->xtime_sec; } /** * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter * @systime_snapshot: pointer to struct receiving the system time snapshot */ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base_raw; ktime_t base_real; ktime_t base_boot; u64 nsec_raw; u64 nsec_real; u64 now; WARN_ON_ONCE(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); now = tk_clock_read(&tk->tkr_mono); systime_snapshot->cs_id = tk->tkr_mono.clock->id; systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); base_boot = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_boot); base_raw = tk->tkr_raw.base; nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now); nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now); } while (read_seqcount_retry(&tk_core.seq, seq)); systime_snapshot->cycles = now; systime_snapshot->real = ktime_add_ns(base_real, nsec_real); systime_snapshot->boot = ktime_add_ns(base_boot, nsec_real); systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw); } EXPORT_SYMBOL_GPL(ktime_get_snapshot); /* Scale base by mult/div checking for overflow */ static int scale64_check_overflow(u64 mult, u64 div, u64 *base) { u64 tmp, rem; tmp = div64_u64_rem(*base, div, &rem); if (((int)sizeof(u64)*8 - fls64(mult) < fls64(tmp)) || ((int)sizeof(u64)*8 - fls64(mult) < fls64(rem))) return -EOVERFLOW; tmp *= mult; rem = div64_u64(rem * mult, div); *base = tmp + rem; return 0; } /** * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval * @history: Snapshot representing start of history * @partial_history_cycles: Cycle offset into history (fractional part) * @total_history_cycles: Total history length in cycles * @discontinuity: True indicates clock was set on history period * @ts: Cross timestamp that should be adjusted using * partial/total ratio * * Helper function used by get_device_system_crosststamp() to correct the * crosstimestamp corresponding to the start of the current interval to the * system counter value (timestamp point) provided by the driver. The * total_history_* quantities are the total history starting at the provided * reference point and ending at the start of the current interval. The cycle * count between the driver timestamp point and the start of the current * interval is partial_history_cycles. */ static int adjust_historical_crosststamp(struct system_time_snapshot *history, u64 partial_history_cycles, u64 total_history_cycles, bool discontinuity, struct system_device_crosststamp *ts) { struct timekeeper *tk = &tk_core.timekeeper; u64 corr_raw, corr_real; bool interp_forward; int ret; if (total_history_cycles == 0 || partial_history_cycles == 0) return 0; /* Interpolate shortest distance from beginning or end of history */ interp_forward = partial_history_cycles > total_history_cycles / 2; partial_history_cycles = interp_forward ? 
total_history_cycles - partial_history_cycles : partial_history_cycles; /* * Scale the monotonic raw time delta by: * partial_history_cycles / total_history_cycles */ corr_raw = (u64)ktime_to_ns( ktime_sub(ts->sys_monoraw, history->raw)); ret = scale64_check_overflow(partial_history_cycles, total_history_cycles, &corr_raw); if (ret) return ret; /* * If there is a discontinuity in the history, scale monotonic raw * correction by: * mult(real)/mult(raw) yielding the realtime correction * Otherwise, calculate the realtime correction similar to monotonic * raw calculation */ if (discontinuity) { corr_real = mul_u64_u32_div (corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult); } else { corr_real = (u64)ktime_to_ns( ktime_sub(ts->sys_realtime, history->real)); ret = scale64_check_overflow(partial_history_cycles, total_history_cycles, &corr_real); if (ret) return ret; } /* Fixup monotonic raw and real time time values */ if (interp_forward) { ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw); ts->sys_realtime = ktime_add_ns(history->real, corr_real); } else { ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw); ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real); } return 0; } /* * timestamp_in_interval - true if ts is chronologically in [start, end] * * True if ts occurs chronologically at or after start, and before or at end. */ static bool timestamp_in_interval(u64 start, u64 end, u64 ts) { if (ts >= start && ts <= end) return true; if (start > end && (ts >= start || ts <= end)) return true; return false; } static bool convert_clock(u64 *val, u32 numerator, u32 denominator) { u64 rem, res; if (!numerator || !denominator) return false; res = div64_u64_rem(*val, denominator, &rem) * numerator; *val = res + div_u64(rem * numerator, denominator); return true; } static bool convert_base_to_cs(struct system_counterval_t *scv) { struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock; struct clocksource_base *base; u32 num, den; /* The timestamp was taken from the time keeper clock source */ if (cs->id == scv->cs_id) return true; /* * Check whether cs_id matches the base clock. Prevent the compiler from * re-evaluating @base as the clocksource might change concurrently. */ base = READ_ONCE(cs->base); if (!base || base->id != scv->cs_id) return false; num = scv->use_nsecs ? cs->freq_khz : base->numerator; den = scv->use_nsecs ? USEC_PER_SEC : base->denominator; if (!convert_clock(&scv->cycles, num, den)) return false; scv->cycles += base->offset; return true; } static bool convert_cs_to_base(u64 *cycles, enum clocksource_ids base_id) { struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock; struct clocksource_base *base; /* * Check whether base_id matches the base clock. Prevent the compiler from * re-evaluating @base as the clocksource might change concurrently. 
*/ base = READ_ONCE(cs->base); if (!base || base->id != base_id) return false; *cycles -= base->offset; if (!convert_clock(cycles, base->denominator, base->numerator)) return false; return true; } static bool convert_ns_to_cs(u64 *delta) { struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono; if (BITS_TO_BYTES(fls64(*delta) + tkr->shift) >= sizeof(*delta)) return false; *delta = div_u64((*delta << tkr->shift) - tkr->xtime_nsec, tkr->mult); return true; } /** * ktime_real_to_base_clock() - Convert CLOCK_REALTIME timestamp to a base clock timestamp * @treal: CLOCK_REALTIME timestamp to convert * @base_id: base clocksource id * @cycles: pointer to store the converted base clock timestamp * * Converts a supplied, future realtime clock value to the corresponding base clock value. * * Return: true if the conversion is successful, false otherwise. */ bool ktime_real_to_base_clock(ktime_t treal, enum clocksource_ids base_id, u64 *cycles) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u64 delta; do { seq = read_seqcount_begin(&tk_core.seq); if ((u64)treal < tk->tkr_mono.base_real) return false; delta = (u64)treal - tk->tkr_mono.base_real; if (!convert_ns_to_cs(&delta)) return false; *cycles = tk->tkr_mono.cycle_last + delta; if (!convert_cs_to_base(cycles, base_id)) return false; } while (read_seqcount_retry(&tk_core.seq, seq)); return true; } EXPORT_SYMBOL_GPL(ktime_real_to_base_clock); /** * get_device_system_crosststamp - Synchronously capture system/device timestamp * @get_time_fn: Callback to get simultaneous device time and * system counter from the device driver * @ctx: Context passed to get_time_fn() * @history_begin: Historical reference point used to interpolate system * time when counter provided by the driver is before the current interval * @xtstamp: Receives simultaneously captured system and device time * * Reads a timestamp from a device and correlates it to system time */ int get_device_system_crosststamp(int (*get_time_fn) (ktime_t *device_time, struct system_counterval_t *sys_counterval, void *ctx), void *ctx, struct system_time_snapshot *history_begin, struct system_device_crosststamp *xtstamp) { struct system_counterval_t system_counterval = {}; struct timekeeper *tk = &tk_core.timekeeper; u64 cycles, now, interval_start; unsigned int clock_was_set_seq = 0; ktime_t base_real, base_raw; u64 nsec_real, nsec_raw; u8 cs_was_changed_seq; unsigned int seq; bool do_interp; int ret; do { seq = read_seqcount_begin(&tk_core.seq); /* * Try to synchronously capture device time and a system * counter value calling back into the device driver */ ret = get_time_fn(&xtstamp->device, &system_counterval, ctx); if (ret) return ret; /* * Verify that the clocksource ID associated with the captured * system counter value is the same as for the currently * installed timekeeper clocksource */ if (system_counterval.cs_id == CSID_GENERIC || !convert_base_to_cs(&system_counterval)) return -ENODEV; cycles = system_counterval.cycles; /* * Check whether the system counter value provided by the * device driver is on the current timekeeping interval. 
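		 * If it is outside [cycle_last, now], fall back to
		 * interpolating against @history_begin further below.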
*/ now = tk_clock_read(&tk->tkr_mono); interval_start = tk->tkr_mono.cycle_last; if (!timestamp_in_interval(interval_start, now, cycles)) { clock_was_set_seq = tk->clock_was_set_seq; cs_was_changed_seq = tk->cs_was_changed_seq; cycles = interval_start; do_interp = true; } else { do_interp = false; } base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); base_raw = tk->tkr_raw.base; nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles); nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles); } while (read_seqcount_retry(&tk_core.seq, seq)); xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real); xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw); /* * Interpolate if necessary, adjusting back from the start of the * current interval */ if (do_interp) { u64 partial_history_cycles, total_history_cycles; bool discontinuity; /* * Check that the counter value is not before the provided * history reference and that the history doesn't cross a * clocksource change */ if (!history_begin || !timestamp_in_interval(history_begin->cycles, cycles, system_counterval.cycles) || history_begin->cs_was_changed_seq != cs_was_changed_seq) return -EINVAL; partial_history_cycles = cycles - system_counterval.cycles; total_history_cycles = cycles - history_begin->cycles; discontinuity = history_begin->clock_was_set_seq != clock_was_set_seq; ret = adjust_historical_crosststamp(history_begin, partial_history_cycles, total_history_cycles, discontinuity, xtstamp); if (ret) return ret; } return 0; } EXPORT_SYMBOL_GPL(get_device_system_crosststamp); /** * timekeeping_clocksource_has_base - Check whether the current clocksource * is based on given a base clock * @id: base clocksource ID * * Note: The return value is a snapshot which can become invalid right * after the function returns. * * Return: true if the timekeeper clocksource has a base clock with @id, * false otherwise */ bool timekeeping_clocksource_has_base(enum clocksource_ids id) { /* * This is a snapshot, so no point in using the sequence * count. Just prevent the compiler from re-evaluating @base as the * clocksource might change concurrently. */ struct clocksource_base *base = READ_ONCE(tk_core.timekeeper.tkr_mono.clock->base); return base ? base->id == id : false; } EXPORT_SYMBOL_GPL(timekeeping_clocksource_has_base); /** * do_settimeofday64 - Sets the time of day. * @ts: pointer to the timespec64 variable containing the new time * * Sets the time of day to the new time and update NTP and notify hrtimers */ int do_settimeofday64(const struct timespec64 *ts) { struct timespec64 ts_delta, xt; if (!timespec64_valid_settod(ts)) return -EINVAL; scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { struct timekeeper *tks = &tk_core.shadow_timekeeper; timekeeping_forward_now(tks); xt = tk_xtime(tks); ts_delta = timespec64_sub(*ts, xt); if (timespec64_compare(&tks->wall_to_monotonic, &ts_delta) > 0) { timekeeping_restore_shadow(&tk_core); return -EINVAL; } tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, ts_delta)); tk_set_xtime(tks, ts); timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL); audit_tk_injoffset(ts_delta); add_device_randomness(ts, sizeof(*ts)); return 0; } EXPORT_SYMBOL(do_settimeofday64); static inline bool timekeeper_is_core_tk(struct timekeeper *tk) { return !IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS) || tk->id == TIMEKEEPER_CORE; } /** * __timekeeping_inject_offset - Adds or subtracts from the current time. 
* @tkd: Pointer to the timekeeper to modify * @ts: Pointer to the timespec variable containing the offset * * Adds or subtracts an offset value from the current time. */ static int __timekeeping_inject_offset(struct tk_data *tkd, const struct timespec64 *ts) { struct timekeeper *tks = &tkd->shadow_timekeeper; struct timespec64 tmp; if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; timekeeping_forward_now(tks); if (timekeeper_is_core_tk(tks)) { /* Make sure the proposed value is valid */ tmp = timespec64_add(tk_xtime(tks), *ts); if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 || !timespec64_valid_settod(&tmp)) { timekeeping_restore_shadow(tkd); return -EINVAL; } tk_xtime_add(tks, ts); tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts)); } else { struct tk_read_base *tkr_mono = &tks->tkr_mono; ktime_t now, offs; /* Get the current time */ now = ktime_add_ns(tkr_mono->base, timekeeping_get_ns(tkr_mono)); /* Add the relative offset change */ offs = ktime_add(tks->offs_aux, timespec64_to_ktime(*ts)); /* Prevent that the resulting time becomes negative */ if (ktime_add(now, offs) < 0) { timekeeping_restore_shadow(tkd); return -EINVAL; } tk_update_aux_offs(tks, offs); } timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL); return 0; } static int timekeeping_inject_offset(const struct timespec64 *ts) { int ret; scoped_guard (raw_spinlock_irqsave, &tk_core.lock) ret = __timekeeping_inject_offset(&tk_core, ts); /* Signal hrtimers about time change */ if (!ret) clock_was_set(CLOCK_SET_WALL); return ret; } /* * Indicates if there is an offset between the system clock and the hardware * clock/persistent clock/rtc. */ int persistent_clock_is_local; /* * Adjust the time obtained from the CMOS to be UTC time instead of * local time. * * This is ugly, but preferable to the alternatives. Otherwise we * would either need to write a program to do it in /etc/rc (and risk * confusion if the program gets run more than once; it would also be * hard to make the program warp the clock precisely n hours) or * compile in the timezone information into the kernel. Bad, bad.... * * - TYT, 1992-01-01 * * The best thing to do is to keep the CMOS clock in universal time (UTC) * as real UNIX machines always do it. This avoids all headaches about * daylight saving times and warping kernel clocks. */ void timekeeping_warp_clock(void) { if (sys_tz.tz_minuteswest != 0) { struct timespec64 adjust; persistent_clock_is_local = 1; adjust.tv_sec = sys_tz.tz_minuteswest * 60; adjust.tv_nsec = 0; timekeeping_inject_offset(&adjust); } } /* * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic */ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) { tk->tai_offset = tai_offset; tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0)); } /* * change_clocksource - Swaps clocksources if a new one is available * * Accumulates current time interval and initializes new clocksource */ static int change_clocksource(void *data) { struct clocksource *new = data, *old = NULL; /* * If the clocksource is in a module, get a module reference. * Succeeds for built-in code (owner == NULL) as well. Abort if the * reference can't be acquired. 
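 * The module reference of the previous clocksource is dropped again further below once the switch has completed.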
*/ if (!try_module_get(new->owner)) return 0; /* Abort if the device can't be enabled */ if (new->enable && new->enable(new) != 0) { module_put(new->owner); return 0; } scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { struct timekeeper *tks = &tk_core.shadow_timekeeper; timekeeping_forward_now(tks); old = tks->tkr_mono.clock; tk_setup_internals(tks, new); timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } tk_aux_update_clocksource(); if (old) { if (old->disable) old->disable(old); module_put(old->owner); } return 0; } /** * timekeeping_notify - Install a new clock source * @clock: pointer to the clock source * * This function is called from clocksource.c after a new, better clock * source has been registered. The caller holds the clocksource_mutex. */ int timekeeping_notify(struct clocksource *clock) { struct timekeeper *tk = &tk_core.timekeeper; if (tk->tkr_mono.clock == clock) return 0; /* Disable inlined reads accross the clocksource switch */ clocksource_disable_inline_read(); stop_machine(change_clocksource, clock, NULL); /* * If the clocksource has been selected and supports inlined reads * enable the branch. */ if (tk->tkr_mono.clock == clock && clock->flags & CLOCK_SOURCE_CAN_INLINE_READ) clocksource_enable_inline_read(); tick_clock_notify(); return tk->tkr_mono.clock == clock ? 0 : -1; } /** * ktime_get_raw_ts64 - Returns the raw monotonic time in a timespec * @ts: pointer to the timespec64 to be set * * Returns the raw monotonic time (completely un-modified by ntp) */ void ktime_get_raw_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->raw_sec; nsecs = timekeeping_get_ns(&tk->tkr_raw); } while (read_seqcount_retry(&tk_core.seq, seq)); ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); } EXPORT_SYMBOL(ktime_get_raw_ts64); /** * ktime_get_clock_ts64 - Returns time of a clock in a timespec * @id: POSIX clock ID of the clock to read * @ts: Pointer to the timespec64 to be set * * The timestamp is invalidated (@ts->sec is set to -1) if the * clock @id is not available. */ void ktime_get_clock_ts64(clockid_t id, struct timespec64 *ts) { /* Invalidate time stamp */ ts->tv_sec = -1; ts->tv_nsec = 0; switch (id) { case CLOCK_REALTIME: ktime_get_real_ts64(ts); return; case CLOCK_MONOTONIC: ktime_get_ts64(ts); return; case CLOCK_MONOTONIC_RAW: ktime_get_raw_ts64(ts); return; case CLOCK_AUX ... CLOCK_AUX_LAST: if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) ktime_get_aux_ts64(id, ts); return; default: WARN_ON_ONCE(1); } } EXPORT_SYMBOL_GPL(ktime_get_clock_ts64); /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres */ int timekeeping_valid_for_hres(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; int ret; do { seq = read_seqcount_begin(&tk_core.seq); ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; } while (read_seqcount_retry(&tk_core.seq, seq)); return ret; } /** * timekeeping_max_deferment - Returns max time the clocksource can be deferred */ u64 timekeeping_max_deferment(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u64 ret; do { seq = read_seqcount_begin(&tk_core.seq); ret = tk->tkr_mono.clock->max_idle_ns; } while (read_seqcount_retry(&tk_core.seq, seq)); return ret; } /** * read_persistent_clock64 - Return time from the persistent clock. * @ts: Pointer to the storage for the readout value * * Weak dummy function for arches that do not yet support it. 
* Reads the time from the battery backed persistent clock. * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. * * XXX - Do be sure to remove it once all arches implement it. */ void __weak read_persistent_clock64(struct timespec64 *ts) { ts->tv_sec = 0; ts->tv_nsec = 0; } /** * read_persistent_wall_and_boot_offset - Read persistent clock, and also offset * from the boot. * @wall_time: current time as returned by persistent clock * @boot_offset: offset that is defined as wall_time - boot_time * * Weak dummy function for arches that do not yet support it. * * The default function calculates offset based on the current value of * local_clock(). This way architectures that support sched_clock() but don't * support a dedicated boot time clock will provide the best estimate of the * boot time. */ void __weak __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, struct timespec64 *boot_offset) { read_persistent_clock64(wall_time); *boot_offset = ns_to_timespec64(local_clock()); } static __init void tkd_basic_setup(struct tk_data *tkd, enum timekeeper_ids tk_id, bool valid) { raw_spin_lock_init(&tkd->lock); seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock); tkd->timekeeper.id = tkd->shadow_timekeeper.id = tk_id; tkd->timekeeper.clock_valid = tkd->shadow_timekeeper.clock_valid = valid; } /* * Flag reflecting whether timekeeping_resume() has injected sleeptime. * * The flag starts off false and is only set when a suspend reaches * timekeeping_suspend(). timekeeping_resume() sets it to false when the * timekeeper clocksource is not stopping across suspend and has been * used to update sleep time. If the timekeeper clocksource has stopped, * the flag stays true and is used by the RTC resume code to decide * whether sleeptime must be injected; if so, the flag is cleared afterwards. * * If a suspend fails before reaching timekeeping_resume() then the flag * stays false and prevents erroneous sleeptime injection.
*/ static bool suspend_timing_needed; /* Flag for if there is a persistent clock on this platform */ static bool persistent_clock_exists; /* * timekeeping_init - Initializes the clocksource and common timekeeping values */ void __init timekeeping_init(void) { struct timespec64 wall_time, boot_offset, wall_to_mono; struct timekeeper *tks = &tk_core.shadow_timekeeper; struct clocksource *clock; tkd_basic_setup(&tk_core, TIMEKEEPER_CORE, true); tk_aux_setup(); read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); if (timespec64_valid_settod(&wall_time) && timespec64_to_ns(&wall_time) > 0) { persistent_clock_exists = true; } else if (timespec64_to_ns(&wall_time) != 0) { pr_warn("Persistent clock returned invalid value"); wall_time = (struct timespec64){0}; } if (timespec64_compare(&wall_time, &boot_offset) < 0) boot_offset = (struct timespec64){0}; /* * We want set wall_to_mono, so the following is true: * wall time + wall_to_mono = boot time */ wall_to_mono = timespec64_sub(boot_offset, wall_time); guard(raw_spinlock_irqsave)(&tk_core.lock); ntp_init(); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); tk_setup_internals(tks, clock); tk_set_xtime(tks, &wall_time); tks->raw_sec = 0; tk_set_wall_to_mono(tks, wall_to_mono); timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); } /* time in seconds when suspend began for persistent clock */ static struct timespec64 timekeeping_suspend_time; /** * __timekeeping_inject_sleeptime - Internal function to add sleep interval * @tk: Pointer to the timekeeper to be updated * @delta: Pointer to the delta value in timespec64 format * * Takes a timespec offset measuring a suspend interval and properly * adds the sleep offset to the timekeeping variables. */ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, const struct timespec64 *delta) { if (!timespec64_valid_strict(delta)) { printk_deferred(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " "sleep delta value!\n"); return; } tk_xtime_add(tk, delta); tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta)); tk_update_sleep_time(tk, timespec64_to_ktime(*delta)); tk_debug_account_sleep_time(delta); } #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE) /* * We have three kinds of time sources to use for sleep time * injection, the preference order is: * 1) non-stop clocksource * 2) persistent clock (ie: RTC accessible when irqs are off) * 3) RTC * * 1) and 2) are used by timekeeping, 3) by RTC subsystem. * If system has neither 1) nor 2), 3) will be used finally. * * * If timekeeping has injected sleeptime via either 1) or 2), * 3) becomes needless, so in this case we don't need to call * rtc_resume(), and this is what timekeeping_rtc_skipresume() * means. */ bool timekeeping_rtc_skipresume(void) { return !suspend_timing_needed; } /* * 1) can be determined whether to use or not only when doing * timekeeping_resume() which is invoked after rtc_suspend(), * so we can't skip rtc_suspend() surely if system has 1). * * But if system has 2), 2) will definitely be used, so in this * case we don't need to call rtc_suspend(), and this is what * timekeeping_rtc_skipsuspend() means. 
*/ bool timekeeping_rtc_skipsuspend(void) { return persistent_clock_exists; } /** * timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values * @delta: pointer to a timespec64 delta value * * This hook is for architectures that cannot support read_persistent_clock64 * because their RTC/persistent clock is only accessible when irqs are enabled. * and also don't have an effective nonstop clocksource. * * This function should only be called by rtc_resume(), and allows * a suspend offset to be injected into the timekeeping values. */ void timekeeping_inject_sleeptime64(const struct timespec64 *delta) { scoped_guard(raw_spinlock_irqsave, &tk_core.lock) { struct timekeeper *tks = &tk_core.shadow_timekeeper; suspend_timing_needed = false; timekeeping_forward_now(tks); __timekeeping_inject_sleeptime(tks, delta); timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT); } #endif /** * timekeeping_resume - Resumes the generic timekeeping subsystem. */ void timekeeping_resume(void) { struct timekeeper *tks = &tk_core.shadow_timekeeper; struct clocksource *clock = tks->tkr_mono.clock; struct timespec64 ts_new, ts_delta; bool inject_sleeptime = false; u64 cycle_now, nsec; unsigned long flags; read_persistent_clock64(&ts_new); clockevents_resume(); clocksource_resume(); raw_spin_lock_irqsave(&tk_core.lock, flags); /* * After system resumes, we need to calculate the suspended time and * compensate it for the OS time. There are 3 sources that could be * used: Nonstop clocksource during suspend, persistent clock and rtc * device. * * One specific platform may have 1 or 2 or all of them, and the * preference will be: * suspend-nonstop clocksource -> persistent clock -> rtc * The less preferred source will only be tried if there is no better * usable source. The rtc part is handled separately in rtc core code. */ cycle_now = tk_clock_read(&tks->tkr_mono); nsec = clocksource_stop_suspend_timing(clock, cycle_now); if (nsec > 0) { ts_delta = ns_to_timespec64(nsec); inject_sleeptime = true; } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); inject_sleeptime = true; } if (inject_sleeptime) { suspend_timing_needed = false; __timekeeping_inject_sleeptime(tks, &ts_delta); } /* Re-base the last cycle value */ tks->tkr_mono.cycle_last = cycle_now; tks->tkr_raw.cycle_last = cycle_now; tks->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); raw_spin_unlock_irqrestore(&tk_core.lock, flags); touch_softlockup_watchdog(); /* Resume the clockevent device(s) and hrtimers */ tick_resume(); /* Notify timerfd as resume is equivalent to clock_was_set() */ timerfd_resume(); } static void timekeeping_syscore_resume(void *data) { timekeeping_resume(); } int timekeeping_suspend(void) { struct timekeeper *tks = &tk_core.shadow_timekeeper; struct timespec64 delta, delta_delta; static struct timespec64 old_delta; struct clocksource *curr_clock; unsigned long flags; u64 cycle_now; read_persistent_clock64(&timekeeping_suspend_time); /* * On some systems the persistent_clock can not be detected at * timekeeping_init by its return value, so if we see a valid * value returned, update the persistent_clock_exists flag. 
*/ if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) persistent_clock_exists = true; suspend_timing_needed = true; raw_spin_lock_irqsave(&tk_core.lock, flags); timekeeping_forward_now(tks); timekeeping_suspended = 1; /* * Since we've called forward_now, cycle_last stores the value * just read from the current clocksource. Save this to potentially * use in suspend timing. */ curr_clock = tks->tkr_mono.clock; cycle_now = tks->tkr_mono.cycle_last; clocksource_start_suspend_timing(curr_clock, cycle_now); if (persistent_clock_exists) { /* * To avoid drift caused by repeated suspend/resumes, * which each can add ~1 second drift error, * try to compensate so the difference in system time * and persistent_clock time stays close to constant. */ delta = timespec64_sub(tk_xtime(tks), timekeeping_suspend_time); delta_delta = timespec64_sub(delta, old_delta); if (abs(delta_delta.tv_sec) >= 2) { /* * if delta_delta is too large, assume time correction * has occurred and set old_delta to the current delta. */ old_delta = delta; } else { /* Otherwise try to adjust old_system to compensate */ timekeeping_suspend_time = timespec64_add(timekeeping_suspend_time, delta_delta); } } timekeeping_update_from_shadow(&tk_core, 0); halt_fast_timekeeper(tks); raw_spin_unlock_irqrestore(&tk_core.lock, flags); tick_suspend(); clocksource_suspend(); clockevents_suspend(); return 0; } static int timekeeping_syscore_suspend(void *data) { return timekeeping_suspend(); } /* sysfs resume/suspend bits for timekeeping */ static const struct syscore_ops timekeeping_syscore_ops = { .resume = timekeeping_syscore_resume, .suspend = timekeeping_syscore_suspend, }; static struct syscore timekeeping_syscore = { .ops = &timekeeping_syscore_ops, }; static int __init timekeeping_init_ops(void) { register_syscore(&timekeeping_syscore); return 0; } device_initcall(timekeeping_init_ops); /* * Apply a multiplier adjustment to the timekeeper */ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, s64 offset, s32 mult_adj) { s64 interval = tk->cycle_interval; if (mult_adj == 0) { return; } else if (mult_adj == -1) { interval = -interval; offset = -offset; } else if (mult_adj != 1) { interval *= mult_adj; offset *= mult_adj; } /* * So the following can be confusing. * * To keep things simple, lets assume mult_adj == 1 for now. * * When mult_adj != 1, remember that the interval and offset values * have been appropriately scaled so the math is the same. * * The basic idea here is that we're increasing the multiplier * by one, this causes the xtime_interval to be incremented by * one cycle_interval. This is because: * xtime_interval = cycle_interval * mult * So if mult is being incremented by one: * xtime_interval = cycle_interval * (mult + 1) * Its the same as: * xtime_interval = (cycle_interval * mult) + cycle_interval * Which can be shortened to: * xtime_interval += cycle_interval * * So offset stores the non-accumulated cycles. Thus the current * time (in shifted nanoseconds) is: * now = (offset * adj) + xtime_nsec * Now, even though we're adjusting the clock frequency, we have * to keep time consistent. In other words, we can't jump back * in time, and we also want to avoid jumping forward in time. * * So given the same offset value, we need the time to be the same * both before and after the freq adjustment. 
* now = (offset * adj_1) + xtime_nsec_1 * now = (offset * adj_2) + xtime_nsec_2 * So: * (offset * adj_1) + xtime_nsec_1 = * (offset * adj_2) + xtime_nsec_2 * And we know: * adj_2 = adj_1 + 1 * So: * (offset * adj_1) + xtime_nsec_1 = * (offset * (adj_1+1)) + xtime_nsec_2 * (offset * adj_1) + xtime_nsec_1 = * (offset * adj_1) + offset + xtime_nsec_2 * Canceling the sides: * xtime_nsec_1 = offset + xtime_nsec_2 * Which gives us: * xtime_nsec_2 = xtime_nsec_1 - offset * Which simplifies to: * xtime_nsec -= offset */ if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) { /* NTP adjustment caused clocksource mult overflow */ WARN_ON_ONCE(1); return; } tk->tkr_mono.mult += mult_adj; tk->xtime_interval += interval; tk->tkr_mono.xtime_nsec -= offset; } /* * Adjust the timekeeper's multiplier to the correct frequency * and also to reduce the accumulated error value. */ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) { u64 ntp_tl = ntp_tick_length(tk->id); u32 mult; /* * Determine the multiplier from the current NTP tick length. * Avoid expensive division when the tick length doesn't change. */ if (likely(tk->ntp_tick == ntp_tl)) { mult = tk->tkr_mono.mult - tk->ntp_err_mult; } else { tk->ntp_tick = ntp_tl; mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) - tk->xtime_remainder, tk->cycle_interval); } /* * If the clock is behind the NTP time, increase the multiplier by 1 * to catch up with it. If it's ahead and there was a remainder in the * tick division, the clock will slow down. Otherwise it will stay * ahead until the tick length changes to a non-divisible value. */ tk->ntp_err_mult = tk->ntp_error > 0 ? 1 : 0; mult += tk->ntp_err_mult; timekeeping_apply_adjustment(tk, offset, mult - tk->tkr_mono.mult); if (unlikely(tk->tkr_mono.clock->maxadj && (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult) > tk->tkr_mono.clock->maxadj))) { printk_once(KERN_WARNING "Adjusting %s more than 11%% (%ld vs %ld)\n", tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult, (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj); } /* * It may be possible that when we entered this function, xtime_nsec * was very small. Further, if we're slightly speeding the clocksource * in the code above, its possible the required corrective factor to * xtime_nsec could cause it to underflow. * * Now, since we have already accumulated the second and the NTP * subsystem has been notified via second_overflow(), we need to skip * the next update. */ if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) { tk->tkr_mono.xtime_nsec += (u64)NSEC_PER_SEC << tk->tkr_mono.shift; tk->xtime_sec--; tk->skip_second_overflow = 1; } } /* * accumulate_nsecs_to_secs - Accumulates nsecs into secs * * Helper function that accumulates the nsecs greater than a second * from the xtime_nsec field to the xtime_secs field. * It also calls into the NTP code to handle leapsecond processing. */ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) { u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift; unsigned int clock_set = 0; while (tk->tkr_mono.xtime_nsec >= nsecps) { int leap; tk->tkr_mono.xtime_nsec -= nsecps; tk->xtime_sec++; /* * Skip NTP update if this second was accumulated before, * i.e. 
xtime_nsec underflowed in timekeeping_adjust() */ if (unlikely(tk->skip_second_overflow)) { tk->skip_second_overflow = 0; continue; } /* Figure out if its a leap sec and apply if needed */ leap = second_overflow(tk->id, tk->xtime_sec); if (unlikely(leap)) { struct timespec64 ts; tk->xtime_sec += leap; ts.tv_sec = leap; ts.tv_nsec = 0; tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts)); __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); clock_set = TK_CLOCK_WAS_SET; } } return clock_set; } /* * logarithmic_accumulation - shifted accumulation of cycles * * This functions accumulates a shifted interval of cycles into * a shifted interval nanoseconds. Allows for O(log) accumulation * loop. * * Returns the unconsumed cycles. */ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, u32 shift, unsigned int *clock_set) { u64 interval = tk->cycle_interval << shift; u64 snsec_per_sec; /* If the offset is smaller than a shifted interval, do nothing */ if (offset < interval) return offset; /* Accumulate one shifted interval */ offset -= interval; tk->tkr_mono.cycle_last += interval; tk->tkr_raw.cycle_last += interval; tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift; *clock_set |= accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ tk->tkr_raw.xtime_nsec += tk->raw_interval << shift; snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift; while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) { tk->tkr_raw.xtime_nsec -= snsec_per_sec; tk->raw_sec++; } /* Accumulate error between NTP and clock interval */ tk->ntp_error += tk->ntp_tick << shift; tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << (tk->ntp_error_shift + shift); return offset; } /* * timekeeping_advance - Updates the timekeeper to the current time and * current NTP tick length */ static bool __timekeeping_advance(struct tk_data *tkd, enum timekeeping_adv_mode mode) { struct timekeeper *tk = &tkd->shadow_timekeeper; struct timekeeper *real_tk = &tkd->timekeeper; unsigned int clock_set = 0; int shift = 0, maxshift; u64 offset, orig_offset; /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) return false; offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), tk->tkr_mono.cycle_last, tk->tkr_mono.mask, tk->tkr_mono.clock->max_raw_delta); orig_offset = offset; /* Check if there's really nothing to do */ if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) return false; /* * With NO_HZ we may have to accumulate many cycle_intervals * (think "ticks") worth of time at once. To do this efficiently, * we calculate the largest doubling multiple of cycle_intervals * that is smaller than the offset. We then accumulate that * chunk in one go, and then try to consume the next smaller * doubled multiple. 
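 * For example (illustrative): an offset worth 70 cycle_intervals is consumed in chunks of 64, then 4, then 2 intervals rather than in 70 single-interval steps.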
*/ shift = ilog2(offset) - ilog2(tk->cycle_interval); shift = max(0, shift); /* Bound shift to one less than what overflows tick_length */ maxshift = (64 - (ilog2(ntp_tick_length(tk->id)) + 1)) - 1; shift = min(shift, maxshift); while (offset >= tk->cycle_interval) { offset = logarithmic_accumulation(tk, offset, shift, &clock_set); if (offset < tk->cycle_interval<<shift) shift--; } /* Adjust the multiplier to correct NTP error */ timekeeping_adjust(tk, offset); /* * Finally, make sure that after the rounding * xtime_nsec isn't larger than NSEC_PER_SEC */ clock_set |= accumulate_nsecs_to_secs(tk); /* * To avoid inconsistencies caused adjtimex TK_ADV_FREQ calls * making small negative adjustments to the base xtime_nsec * value, only update the coarse clocks if we accumulated time */ if (orig_offset != offset) tk_update_coarse_nsecs(tk); timekeeping_update_from_shadow(tkd, clock_set); return !!clock_set; } static bool timekeeping_advance(enum timekeeping_adv_mode mode) { guard(raw_spinlock_irqsave)(&tk_core.lock); return __timekeeping_advance(&tk_core, mode); } /** * update_wall_time - Uses the current clocksource to increment the wall time * * It also updates the enabled auxiliary clock timekeepers */ void update_wall_time(void) { if (timekeeping_advance(TK_ADV_TICK)) clock_was_set_delayed(); tk_aux_advance(); } /** * getboottime64 - Return the real time of system boot. * @ts: pointer to the timespec64 to be set * * Returns the wall-time of boot in a timespec64. * * This is based on the wall_to_monotonic offset and the total suspend * time. Calls to settimeofday will affect the value returned (which * basically means that however wrong your real time clock is at boot time, * you get the right time here). */ void getboottime64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); *ts = ktime_to_timespec64(t); } EXPORT_SYMBOL_GPL(getboottime64); void ktime_get_coarse_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); *ts = tk_xtime_coarse(tk); } while (read_seqcount_retry(&tk_core.seq, seq)); } EXPORT_SYMBOL(ktime_get_coarse_real_ts64); /** * ktime_get_coarse_real_ts64_mg - return latter of coarse grained time or floor * @ts: timespec64 to be filled * * Fetch the global mg_floor value, convert it to realtime and compare it * to the current coarse-grained time. Fill @ts with whichever is * latest. Note that this is a filesystem-specific interface and should be * avoided outside of that context. */ void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; u64 floor = atomic64_read(&mg_floor); ktime_t f_real, offset, coarse; unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); *ts = tk_xtime_coarse(tk); offset = tk_core.timekeeper.offs_real; } while (read_seqcount_retry(&tk_core.seq, seq)); coarse = timespec64_to_ktime(*ts); f_real = ktime_add(floor, offset); if (ktime_after(f_real, coarse)) *ts = ktime_to_timespec64(f_real); } /** * ktime_get_real_ts64_mg - attempt to update floor value and return result * @ts: pointer to the timespec to be set * * Get a monotonic fine-grained time value and attempt to swap it into * mg_floor. If that succeeds then accept the new floor value. If it fails * then another task raced in during the interim time and updated the * floor. Since any update to the floor must be later than the previous * floor, either outcome is acceptable. 
* * Typically this will be called after calling ktime_get_coarse_real_ts64_mg(), * and determining that the resulting coarse-grained timestamp did not effect * a change in ctime. Any more recent floor value would effect a change to * ctime, so there is no need to retry the atomic64_try_cmpxchg() on failure. * * @ts will be filled with the latest floor value, regardless of the outcome of * the cmpxchg. Note that this is a filesystem specific interface and should be * avoided outside of that context. */ void ktime_get_real_ts64_mg(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; ktime_t old = atomic64_read(&mg_floor); ktime_t offset, mono; unsigned int seq; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; mono = tk->tkr_mono.base; nsecs = timekeeping_get_ns(&tk->tkr_mono); offset = tk_core.timekeeper.offs_real; } while (read_seqcount_retry(&tk_core.seq, seq)); mono = ktime_add_ns(mono, nsecs); /* * Attempt to update the floor with the new time value. As any * update must be later then the existing floor, and would effect * a change to ctime from the perspective of the current task, * accept the resulting floor value regardless of the outcome of * the swap. */ if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) { ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); timekeeping_inc_mg_floor_swaps(); } else { /* * Another task changed mg_floor since "old" was fetched. * "old" has been updated with the latest value of "mg_floor". * That value is newer than the previous floor value, which * is enough to effect a change to ctime. Accept it. */ *ts = ktime_to_timespec64(ktime_add(old, offset)); } } void ktime_get_coarse_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 now, mono; unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); now = tk_xtime_coarse(tk); mono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); } EXPORT_SYMBOL(ktime_get_coarse_ts64); /* * Must hold jiffies_lock */ void do_timer(unsigned long ticks) { jiffies_64 += ticks; calc_global_load(); } /** * ktime_get_update_offsets_now - hrtimer helper * @cwsseq: pointer to check and store the clock was set sequence number * @offs_real: pointer to storage for monotonic -> realtime offset * @offs_boot: pointer to storage for monotonic -> boottime offset * @offs_tai: pointer to storage for monotonic -> clock tai offset * * Returns current monotonic time and updates the offsets if the * sequence number in @cwsseq and timekeeper.clock_was_set_seq are * different. 
* * Called from hrtimer_interrupt() or retrigger_next_event() */ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, ktime_t *offs_boot, ktime_t *offs_tai) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); base = tk->tkr_mono.base; nsecs = timekeeping_get_ns(&tk->tkr_mono); base = ktime_add_ns(base, nsecs); if (*cwsseq != tk->clock_was_set_seq) { *cwsseq = tk->clock_was_set_seq; *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; *offs_tai = tk->offs_tai; } /* Handle leapsecond insertion adjustments */ if (unlikely(base >= tk->next_leap_ktime)) *offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0)); } while (read_seqcount_retry(&tk_core.seq, seq)); return base; } /* * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex */ static int timekeeping_validate_timex(const struct __kernel_timex *txc, bool aux_clock) { if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) return -EINVAL; if (!(txc->modes & ADJ_OFFSET_READONLY) && !capable(CAP_SYS_TIME)) return -EPERM; } else { /* In order to modify anything, you gotta be super-user! */ if (txc->modes && !capable(CAP_SYS_TIME)) return -EPERM; /* * if the quartz is off by more than 10% then * something is VERY wrong! */ if (txc->modes & ADJ_TICK && (txc->tick < 900000/USER_HZ || txc->tick > 1100000/USER_HZ)) return -EINVAL; } if (txc->modes & ADJ_SETOFFSET) { /* In order to inject time, you gotta be super-user! */ if (!capable(CAP_SYS_TIME)) return -EPERM; /* * Validate if a timespec/timeval used to inject a time * offset is valid. Offsets can be positive or negative, so * we don't check tv_sec. The value of the timeval/timespec * is the sum of its fields,but *NOTE*: * The field tv_usec/tv_nsec must always be non-negative and * we can't have more nanoseconds/microseconds than a second. */ if (txc->time.tv_usec < 0) return -EINVAL; if (txc->modes & ADJ_NANO) { if (txc->time.tv_usec >= NSEC_PER_SEC) return -EINVAL; } else { if (txc->time.tv_usec >= USEC_PER_SEC) return -EINVAL; } } /* * Check for potential multiplication overflows that can * only happen on 64-bit systems: */ if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) { if (LLONG_MIN / PPM_SCALE > txc->freq) return -EINVAL; if (LLONG_MAX / PPM_SCALE < txc->freq) return -EINVAL; } if (aux_clock) { /* Auxiliary clocks are similar to TAI and do not have leap seconds */ if (txc->modes & ADJ_STATUS && txc->status & (STA_INS | STA_DEL)) return -EINVAL; /* No TAI offset setting */ if (txc->modes & ADJ_TAI) return -EINVAL; /* No PPS support either */ if (txc->modes & ADJ_STATUS && txc->status & (STA_PPSFREQ | STA_PPSTIME)) return -EINVAL; } return 0; } /** * random_get_entropy_fallback - Returns the raw clock source value, * used by random.c for platforms with no valid random_get_entropy(). 
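 * Returns 0 while timekeeping is suspended or before a clocksource has been installed.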
*/ unsigned long random_get_entropy_fallback(void) { struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono; struct clocksource *clock = READ_ONCE(tkr->clock); if (unlikely(timekeeping_suspended || !clock)) return 0; return clock->read(clock); } EXPORT_SYMBOL_GPL(random_get_entropy_fallback); struct adjtimex_result { struct audit_ntp_data ad; struct timespec64 delta; bool clock_set; }; static int __do_adjtimex(struct tk_data *tkd, struct __kernel_timex *txc, struct adjtimex_result *result) { struct timekeeper *tks = &tkd->shadow_timekeeper; bool aux_clock = !timekeeper_is_core_tk(tks); struct timespec64 ts; s32 orig_tai, tai; int ret; /* Validate the data before disabling interrupts */ ret = timekeeping_validate_timex(txc, aux_clock); if (ret) return ret; add_device_randomness(txc, sizeof(*txc)); if (!aux_clock) ktime_get_real_ts64(&ts); else tk_get_aux_ts64(tkd->timekeeper.id, &ts); add_device_randomness(&ts, sizeof(ts)); guard(raw_spinlock_irqsave)(&tkd->lock); if (!tks->clock_valid) return -ENODEV; if (txc->modes & ADJ_SETOFFSET) { result->delta.tv_sec = txc->time.tv_sec; result->delta.tv_nsec = txc->time.tv_usec; if (!(txc->modes & ADJ_NANO)) result->delta.tv_nsec *= 1000; ret = __timekeeping_inject_offset(tkd, &result->delta); if (ret) return ret; result->clock_set = true; } orig_tai = tai = tks->tai_offset; ret = ntp_adjtimex(tks->id, txc, &ts, &tai, &result->ad); if (tai != orig_tai) { __timekeeping_set_tai_offset(tks, tai); timekeeping_update_from_shadow(tkd, TK_CLOCK_WAS_SET); result->clock_set = true; } else { tk_update_leap_state_all(tkd); } /* Update the multiplier immediately if frequency was set directly */ if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) result->clock_set |= __timekeeping_advance(tkd, TK_ADV_FREQ); return ret; } /** * do_adjtimex() - Accessor function to NTP __do_adjtimex function * @txc: Pointer to kernel_timex structure containing NTP parameters */ int do_adjtimex(struct __kernel_timex *txc) { struct adjtimex_result result = { }; int ret; ret = __do_adjtimex(&tk_core, txc, &result); if (ret < 0) return ret; if (txc->modes & ADJ_SETOFFSET) audit_tk_injoffset(result.delta); audit_ntp_log(&result.ad); if (result.clock_set) clock_was_set(CLOCK_SET_WALL); ntp_notify_cmos_timer(result.delta.tv_sec != 0); return ret; } /* * Invoked from NTP with the time keeper lock held, so lockless access is * fine. */ long ktime_get_ntp_seconds(unsigned int id) { return timekeeper_data[id].timekeeper.xtime_sec; } #ifdef CONFIG_NTP_PPS /** * hardpps() - Accessor function to NTP __hardpps function * @phase_ts: Pointer to timespec64 structure representing phase timestamp * @raw_ts: Pointer to timespec64 structure representing raw timestamp */ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { guard(raw_spinlock_irqsave)(&tk_core.lock); __hardpps(phase_ts, raw_ts); } EXPORT_SYMBOL(hardpps); #endif /* CONFIG_NTP_PPS */ #ifdef CONFIG_POSIX_AUX_CLOCKS #include "posix-timers.h" /* * Bitmap for the activated auxiliary timekeepers to allow lockless quick * checks in the hot paths without touching extra cache lines. If set, then * the state of the corresponding timekeeper has to be re-checked under * timekeeper::lock. 
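 * Usage note (derived from tk_aux_sysfs_init() below): writing '1' to /sys/kernel/time/aux_clocks/<n>/aux_clock_enable sets the corresponding bit here and activates the clock, which is then accessible through the CLOCK_AUX clock ids.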
*/ static unsigned long aux_timekeepers; static inline unsigned int clockid_to_tkid(unsigned int id) { return TIMEKEEPER_AUX_FIRST + id - CLOCK_AUX; } static inline struct tk_data *aux_get_tk_data(clockid_t id) { if (!clockid_aux_valid(id)) return NULL; return &timekeeper_data[clockid_to_tkid(id)]; } /* Invoked from timekeeping after a clocksource change */ static void tk_aux_update_clocksource(void) { unsigned long active = READ_ONCE(aux_timekeepers); unsigned int id; for_each_set_bit(id, &active, BITS_PER_LONG) { struct tk_data *tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST]; struct timekeeper *tks = &tkd->shadow_timekeeper; guard(raw_spinlock_irqsave)(&tkd->lock); if (!tks->clock_valid) continue; timekeeping_forward_now(tks); tk_setup_internals(tks, tk_core.timekeeper.tkr_raw.clock); timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL); } } static void tk_aux_advance(void) { unsigned long active = READ_ONCE(aux_timekeepers); unsigned int id; /* Lockless quick check to avoid extra cache lines */ for_each_set_bit(id, &active, BITS_PER_LONG) { struct tk_data *aux_tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST]; guard(raw_spinlock)(&aux_tkd->lock); if (aux_tkd->shadow_timekeeper.clock_valid) __timekeeping_advance(aux_tkd, TK_ADV_TICK); } } /** * ktime_get_aux - Get time for a AUX clock * @id: ID of the clock to read (CLOCK_AUX...) * @kt: Pointer to ktime_t to store the time stamp * * Returns: True if the timestamp is valid, false otherwise */ bool ktime_get_aux(clockid_t id, ktime_t *kt) { struct tk_data *aux_tkd = aux_get_tk_data(id); struct timekeeper *aux_tk; unsigned int seq; ktime_t base; u64 nsecs; WARN_ON(timekeeping_suspended); if (!aux_tkd) return false; aux_tk = &aux_tkd->timekeeper; do { seq = read_seqcount_begin(&aux_tkd->seq); if (!aux_tk->clock_valid) return false; base = ktime_add(aux_tk->tkr_mono.base, aux_tk->offs_aux); nsecs = timekeeping_get_ns(&aux_tk->tkr_mono); } while (read_seqcount_retry(&aux_tkd->seq, seq)); *kt = ktime_add_ns(base, nsecs); return true; } EXPORT_SYMBOL_GPL(ktime_get_aux); /** * ktime_get_aux_ts64 - Get time for a AUX clock * @id: ID of the clock to read (CLOCK_AUX...) * @ts: Pointer to timespec64 to store the time stamp * * Returns: True if the timestamp is valid, false otherwise */ bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *ts) { ktime_t now; if (!ktime_get_aux(id, &now)) return false; *ts = ktime_to_timespec64(now); return true; } EXPORT_SYMBOL_GPL(ktime_get_aux_ts64); static int aux_get_res(clockid_t id, struct timespec64 *tp) { if (!clockid_aux_valid(id)) return -ENODEV; tp->tv_sec = aux_clock_resolution_ns() / NSEC_PER_SEC; tp->tv_nsec = aux_clock_resolution_ns() % NSEC_PER_SEC; return 0; } static int aux_get_timespec(clockid_t id, struct timespec64 *tp) { return ktime_get_aux_ts64(id, tp) ? 0 : -ENODEV; } static int aux_clock_set(const clockid_t id, const struct timespec64 *tnew) { struct tk_data *aux_tkd = aux_get_tk_data(id); struct timekeeper *aux_tks; ktime_t tnow, nsecs; if (!timespec64_valid_settod(tnew)) return -EINVAL; if (!aux_tkd) return -ENODEV; aux_tks = &aux_tkd->shadow_timekeeper; guard(raw_spinlock_irq)(&aux_tkd->lock); if (!aux_tks->clock_valid) return -ENODEV; /* Forward the timekeeper base time */ timekeeping_forward_now(aux_tks); /* * Get the updated base time. tkr_mono.base has not been * updated yet, so do that first. That makes the update * in timekeeping_update_from_shadow() redundant, but * that's harmless. 
After that @tnow can be calculated * by using tkr_mono::cycle_last, which has been set * by timekeeping_forward_now(). */ tk_update_ktime_data(aux_tks); nsecs = timekeeping_cycles_to_ns(&aux_tks->tkr_mono, aux_tks->tkr_mono.cycle_last); tnow = ktime_add(aux_tks->tkr_mono.base, nsecs); /* * Calculate the new AUX offset as delta to @tnow ("monotonic"). * That avoids all the tk::xtime back and forth conversions as * xtime ("realtime") is not applicable for auxiliary clocks and * kept in sync with "monotonic". */ tk_update_aux_offs(aux_tks, ktime_sub(timespec64_to_ktime(*tnew), tnow)); timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); return 0; } static int aux_clock_adj(const clockid_t id, struct __kernel_timex *txc) { struct tk_data *aux_tkd = aux_get_tk_data(id); struct adjtimex_result result = { }; if (!aux_tkd) return -ENODEV; /* * @result is ignored for now as there are neither hrtimers nor a * RTC related to auxiliary clocks for now. */ return __do_adjtimex(aux_tkd, txc, &result); } const struct k_clock clock_aux = { .clock_getres = aux_get_res, .clock_get_timespec = aux_get_timespec, .clock_set = aux_clock_set, .clock_adj = aux_clock_adj, }; static void aux_clock_enable(clockid_t id) { struct tk_read_base *tkr_raw = &tk_core.timekeeper.tkr_raw; struct tk_data *aux_tkd = aux_get_tk_data(id); struct timekeeper *aux_tks = &aux_tkd->shadow_timekeeper; /* Prevent the core timekeeper from changing. */ guard(raw_spinlock_irq)(&tk_core.lock); /* * Setup the auxiliary clock assuming that the raw core timekeeper * clock frequency conversion is close enough. Userspace has to * adjust for the deviation via clock_adjtime(2). */ guard(raw_spinlock_nested)(&aux_tkd->lock); /* Remove leftovers of a previous registration */ memset(aux_tks, 0, sizeof(*aux_tks)); /* Restore the timekeeper id */ aux_tks->id = aux_tkd->timekeeper.id; /* Setup the timekeeper based on the current system clocksource */ tk_setup_internals(aux_tks, tkr_raw->clock); /* Mark it valid and set it live */ aux_tks->clock_valid = true; timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); } static void aux_clock_disable(clockid_t id) { struct tk_data *aux_tkd = aux_get_tk_data(id); guard(raw_spinlock_irq)(&aux_tkd->lock); aux_tkd->shadow_timekeeper.clock_valid = false; timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); } static DEFINE_MUTEX(aux_clock_mutex); static ssize_t aux_clock_enable_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { /* Lazy atoi() as name is "0..7" */ int id = kobj->name[0] & 0x7; bool enable; if (!capable(CAP_SYS_TIME)) return -EPERM; if (kstrtobool(buf, &enable) < 0) return -EINVAL; guard(mutex)(&aux_clock_mutex); if (enable == test_bit(id, &aux_timekeepers)) return count; if (enable) { aux_clock_enable(CLOCK_AUX + id); set_bit(id, &aux_timekeepers); } else { aux_clock_disable(CLOCK_AUX + id); clear_bit(id, &aux_timekeepers); } return count; } static ssize_t aux_clock_enable_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { unsigned long active = READ_ONCE(aux_timekeepers); /* Lazy atoi() as name is "0..7" */ int id = kobj->name[0] & 0x7; return sysfs_emit(buf, "%d\n", test_bit(id, &active)); } static struct kobj_attribute aux_clock_enable_attr = __ATTR_RW(aux_clock_enable); static struct attribute *aux_clock_enable_attrs[] = { &aux_clock_enable_attr.attr, NULL }; static const struct attribute_group aux_clock_enable_attr_group = { .attrs = aux_clock_enable_attrs, }; static int __init tk_aux_sysfs_init(void) { struct kobject *auxo, *tko = 
kobject_create_and_add("time", kernel_kobj); int ret = -ENOMEM; if (!tko) return ret; auxo = kobject_create_and_add("aux_clocks", tko); if (!auxo) goto err_clean; for (int i = 0; i < MAX_AUX_CLOCKS; i++) { char id[2] = { [0] = '0' + i, }; struct kobject *clk = kobject_create_and_add(id, auxo); if (!clk) { ret = -ENOMEM; goto err_clean; } ret = sysfs_create_group(clk, &aux_clock_enable_attr_group); if (ret) goto err_clean; } return 0; err_clean: kobject_put(auxo); kobject_put(tko); return ret; } late_initcall(tk_aux_sysfs_init); static __init void tk_aux_setup(void) { for (int i = TIMEKEEPER_AUX_FIRST; i <= TIMEKEEPER_AUX_LAST; i++) tkd_basic_setup(&timekeeper_data[i], i, false); } #endif /* CONFIG_POSIX_AUX_CLOCKS */
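/*
 * Illustrative userspace sketch only, not part of this file: once one of the
 * auxiliary clocks has been enabled through the sysfs interface created by
 * tk_aux_sysfs_init() above, it can be read like any other POSIX clock.
 * This assumes CLOCK_AUX is exposed to userspace by the UAPI headers of a
 * kernel built with CONFIG_POSIX_AUX_CLOCKS; error handling is minimal.
 *
 *	#include <stdio.h>
 *	#include <time.h>
 *
 *	int main(void)
 *	{
 *		struct timespec ts;
 *
 *		// Read auxiliary clock 0; this fails (e.g. EINVAL/ENODEV)
 *		// if the clock has not been enabled via sysfs.
 *		if (clock_gettime(CLOCK_AUX, &ts))
 *			return 1;
 *		printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
 *		return 0;
 *	}
 */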
// SPDX-License-Identifier: GPL-2.0-only /* * Helpers for formatting and printing strings * * Copyright 31 August 2008 James Bottomley * Copyright (C) 2013, Intel Corporation */ #include <linux/bug.h> #include <linux/kernel.h> #include <linux/math64.h> #include <linux/export.h> #include <linux/ctype.h> #include <linux/device.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/hex.h> #include <linux/limits.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/string_helpers.h> #include <kunit/test.h> #include <kunit/test-bug.h> /** * string_get_size - get the size in the specified units * @size: The size to be converted in blocks * @blk_size: Size of the block (use 1 for size in bytes) * @units: Units to use (powers of 1000 or 1024), whether to include space separator * @buf: buffer to format to * @len: length of buffer * * This function returns a string formatted to 3 significant figures * giving the size in the required units. @buf should have room for * at least 9 bytes and will always be zero terminated. * * Return value: number of characters of output that would have been written * (which may be greater than len, if output was truncated). */ int string_get_size(u64 size, u64 blk_size, const enum string_size_units units, char *buf, int len) { enum string_size_units units_base = units & STRING_UNITS_MASK; static const char *const units_10[] = { "", "k", "M", "G", "T", "P", "E", "Z", "Y", }; static const char *const units_2[] = { "", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi", }; static const char *const *const units_str[] = { [STRING_UNITS_10] = units_10, [STRING_UNITS_2] = units_2, }; static const unsigned int divisor[] = { [STRING_UNITS_10] = 1000, [STRING_UNITS_2] = 1024, }; static const unsigned int rounding[] = { 500, 50, 5 }; int i = 0, j; u32 remainder = 0, sf_cap; char tmp[12]; const char *unit; tmp[0] = '\0'; if (blk_size == 0) size = 0; if (size == 0) goto out; /* This is Napier's algorithm. Reduce the original block size to * * coefficient * divisor[units_base]^i * * we do the reduction so both coefficients are just under 32 bits so * that multiplying them together won't overflow 64 bits and we keep * as much precision as possible in the numbers. * * Note: it's safe to throw away the remainders here because all the * precision is in the coefficients.
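 * For example (illustrative): for string_get_size(8192, 512, STRING_UNITS_2, buf, len) the product 8192 * 512 reduces to a coefficient of 4 with i == 2, which is printed below as "4.00 MiB".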
*/ while (blk_size >> 32) { do_div(blk_size, divisor[units_base]); i++; } while (size >> 32) { do_div(size, divisor[units_base]); i++; } /* now perform the actual multiplication keeping i as the sum of the * two logarithms */ size *= blk_size; /* and logarithmically reduce it until it's just under the divisor */ while (size >= divisor[units_base]) { remainder = do_div(size, divisor[units_base]); i++; } /* work out in j how many digits of precision we need from the * remainder */ sf_cap = size; for (j = 0; sf_cap*10 < 1000; j++) sf_cap *= 10; if (units_base == STRING_UNITS_2) { /* express the remainder as a decimal. It's currently the * numerator of a fraction whose denominator is * divisor[units_base], which is 1 << 10 for STRING_UNITS_2 */ remainder *= 1000; remainder >>= 10; } /* add a 5 to the digit below what will be printed to ensure * an arithmetical round up and carry it through to size */ remainder += rounding[j]; if (remainder >= 1000) { remainder -= 1000; size += 1; } if (j) { snprintf(tmp, sizeof(tmp), ".%03u", remainder); tmp[j+1] = '\0'; } out: if (i >= ARRAY_SIZE(units_2)) unit = "UNK"; else unit = units_str[units_base][i]; return snprintf(buf, len, "%u%s%s%s%s", (u32)size, tmp, (units & STRING_UNITS_NO_SPACE) ? "" : " ", unit, (units & STRING_UNITS_NO_BYTES) ? "" : "B"); } EXPORT_SYMBOL(string_get_size); int parse_int_array(const char *buf, size_t count, int **array) { int *ints, nints; get_options(buf, 0, &nints); if (!nints) return -ENOENT; ints = kzalloc_objs(*ints, nints + 1); if (!ints) return -ENOMEM; get_options(buf, nints + 1, ints); *array = ints; return 0; } EXPORT_SYMBOL(parse_int_array); /** * parse_int_array_user - Split string into a sequence of integers * @from: The user space buffer to read from * @count: The maximum number of bytes to read * @array: Returned pointer to sequence of integers * * On success @array is allocated and initialized with a sequence of * integers extracted from the @from plus an additional element that * begins the sequence and specifies the integers count. * * Caller takes responsibility for freeing @array when it is no longer * needed. 
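 * For example (illustrative): parsing the string "1,2,3" yields an array of { 3, 1, 2, 3 }, where the leading element is the count of integers that follow.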
*/ int parse_int_array_user(const char __user *from, size_t count, int **array) { char *buf; int ret; buf = memdup_user_nul(from, count); if (IS_ERR(buf)) return PTR_ERR(buf); ret = parse_int_array(buf, count, array); kfree(buf); return ret; } EXPORT_SYMBOL(parse_int_array_user); static bool unescape_space(char **src, char **dst) { char *p = *dst, *q = *src; switch (*q) { case 'n': *p = '\n'; break; case 'r': *p = '\r'; break; case 't': *p = '\t'; break; case 'v': *p = '\v'; break; case 'f': *p = '\f'; break; default: return false; } *dst += 1; *src += 1; return true; } static bool unescape_octal(char **src, char **dst) { char *p = *dst, *q = *src; u8 num; if (isodigit(*q) == 0) return false; num = (*q++) & 7; while (num < 32 && isodigit(*q) && (q - *src < 3)) { num <<= 3; num += (*q++) & 7; } *p = num; *dst += 1; *src = q; return true; } static bool unescape_hex(char **src, char **dst) { char *p = *dst, *q = *src; int digit; u8 num; if (*q++ != 'x') return false; num = digit = hex_to_bin(*q++); if (digit < 0) return false; digit = hex_to_bin(*q); if (digit >= 0) { q++; num = (num << 4) | digit; } *p = num; *dst += 1; *src = q; return true; } static bool unescape_special(char **src, char **dst) { char *p = *dst, *q = *src; switch (*q) { case '\"': *p = '\"'; break; case '\\': *p = '\\'; break; case 'a': *p = '\a'; break; case 'e': *p = '\e'; break; default: return false; } *dst += 1; *src += 1; return true; } /** * string_unescape - unquote characters in the given string * @src: source buffer (escaped) * @dst: destination buffer (unescaped) * @size: size of the destination buffer (0 to unlimit) * @flags: combination of the flags. * * Description: * The function unquotes characters in the given string. * * Because the size of the output will be the same as or less than the size of * the input, the transformation may be performed in place. * * Caller must provide valid source and destination pointers. Be aware that * destination buffer will always be NULL-terminated. Source string must be * NULL-terminated as well. The supported flags are:: * * UNESCAPE_SPACE: * '\f' - form feed * '\n' - new line * '\r' - carriage return * '\t' - horizontal tab * '\v' - vertical tab * UNESCAPE_OCTAL: * '\NNN' - byte with octal value NNN (1 to 3 digits) * UNESCAPE_HEX: * '\xHH' - byte with hexadecimal value HH (1 to 2 digits) * UNESCAPE_SPECIAL: * '\"' - double quote * '\\' - backslash * '\a' - alert (BEL) * '\e' - escape * UNESCAPE_ANY: * all previous together * * Return: * The amount of the characters processed to the destination buffer excluding * trailing '\0' is returned. 
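 * For example (illustrative): with UNESCAPE_SPACE, a source containing the four characters 'a', '\\', 't', 'b' is unescaped to 'a', tab, 'b' and 3 is returned.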
*/ int string_unescape(char *src, char *dst, size_t size, unsigned int flags) { char *out = dst; if (!size) size = SIZE_MAX; while (*src && --size) { if (src[0] == '\\' && src[1] != '\0' && size > 1) { src++; size--; if (flags & UNESCAPE_SPACE && unescape_space(&src, &out)) continue; if (flags & UNESCAPE_OCTAL && unescape_octal(&src, &out)) continue; if (flags & UNESCAPE_HEX && unescape_hex(&src, &out)) continue; if (flags & UNESCAPE_SPECIAL && unescape_special(&src, &out)) continue; *out++ = '\\'; } *out++ = *src++; } *out = '\0'; return out - dst; } EXPORT_SYMBOL(string_unescape); static bool escape_passthrough(unsigned char c, char **dst, char *end) { char *out = *dst; if (out < end) *out = c; *dst = out + 1; return true; } static bool escape_space(unsigned char c, char **dst, char *end) { char *out = *dst; unsigned char to; switch (c) { case '\n': to = 'n'; break; case '\r': to = 'r'; break; case '\t': to = 't'; break; case '\v': to = 'v'; break; case '\f': to = 'f'; break; default: return false; } if (out < end) *out = '\\'; ++out; if (out < end) *out = to; ++out; *dst = out; return true; } static bool escape_special(unsigned char c, char **dst, char *end) { char *out = *dst; unsigned char to; switch (c) { case '\\': to = '\\'; break; case '\a': to = 'a'; break; case '\e': to = 'e'; break; case '"': to = '"'; break; default: return false; } if (out < end) *out = '\\'; ++out; if (out < end) *out = to; ++out; *dst = out; return true; } static bool escape_null(unsigned char c, char **dst, char *end) { char *out = *dst; if (c) return false; if (out < end) *out = '\\'; ++out; if (out < end) *out = '0'; ++out; *dst = out; return true; } static bool escape_octal(unsigned char c, char **dst, char *end) { char *out = *dst; if (out < end) *out = '\\'; ++out; if (out < end) *out = ((c >> 6) & 0x07) + '0'; ++out; if (out < end) *out = ((c >> 3) & 0x07) + '0'; ++out; if (out < end) *out = ((c >> 0) & 0x07) + '0'; ++out; *dst = out; return true; } static bool escape_hex(unsigned char c, char **dst, char *end) { char *out = *dst; if (out < end) *out = '\\'; ++out; if (out < end) *out = 'x'; ++out; if (out < end) *out = hex_asc_hi(c); ++out; if (out < end) *out = hex_asc_lo(c); ++out; *dst = out; return true; } /** * string_escape_mem - quote characters in the given memory buffer * @src: source buffer (unescaped) * @isz: source buffer size * @dst: destination buffer (escaped) * @osz: destination buffer size * @flags: combination of the flags * @only: NULL-terminated string containing characters used to limit * the selected escape class. If characters are included in @only * that would not normally be escaped by the classes selected * in @flags, they will be copied to @dst unescaped. * * Description: * The process of escaping byte buffer includes several parts. They are applied * in the following sequence. * * 1. The character is not matched to the one from @only string and thus * must go as-is to the output. * 2. The character is matched to the printable and ASCII classes, if asked, * and in case of match it passes through to the output. * 3. The character is matched to the printable or ASCII class, if asked, * and in case of match it passes through to the output. * 4. The character is checked if it falls into the class given by @flags. * %ESCAPE_OCTAL and %ESCAPE_HEX are going last since they cover any * character. Note that they actually can't go together, otherwise * %ESCAPE_HEX will be ignored. * * Caller must provide valid source and destination pointers. 
Be aware that * destination buffer will not be NULL-terminated, thus the caller has to append * it if needed. The supported flags are:: * * %ESCAPE_SPACE: (special white space, not space itself) * '\f' - form feed * '\n' - new line * '\r' - carriage return * '\t' - horizontal tab * '\v' - vertical tab * %ESCAPE_SPECIAL: * '\"' - double quote * '\\' - backslash * '\a' - alert (BEL) * '\e' - escape * %ESCAPE_NULL: * '\0' - null * %ESCAPE_OCTAL: * '\NNN' - byte with octal value NNN (3 digits) * %ESCAPE_ANY: * all previous together * %ESCAPE_NP: * escape only non-printable characters, checked by isprint() * %ESCAPE_ANY_NP: * all previous together * %ESCAPE_HEX: * '\xHH' - byte with hexadecimal value HH (2 digits) * %ESCAPE_NA: * escape only non-ascii characters, checked by isascii() * %ESCAPE_NAP: * escape only non-printable or non-ascii characters * %ESCAPE_APPEND: * append characters from @only to be escaped by the given classes * * %ESCAPE_APPEND helps to pass additional characters to the set being escaped, when * one of %ESCAPE_NP, %ESCAPE_NA, or %ESCAPE_NAP is provided. * * One notable caveat: %ESCAPE_NAP, %ESCAPE_NP and %ESCAPE_NA have * higher priority than the rest of the flags (%ESCAPE_NAP is the highest). * It doesn't make much sense to use either of them without %ESCAPE_OCTAL * or %ESCAPE_HEX, because they cover most of the other character classes. * %ESCAPE_NAP can utilize %ESCAPE_SPACE or %ESCAPE_SPECIAL in addition to * the above. * * Return: * The total size of the escaped output that would be generated for * the given input and flags. To check whether the output was * truncated, compare the return value to osz. There is room left in * dst for a '\0' terminator if and only if ret < osz. */ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz, unsigned int flags, const char *only) { char *p = dst; char *end = p + osz; bool is_dict = only && *only; bool is_append = flags & ESCAPE_APPEND; while (isz--) { unsigned char c = *src++; bool in_dict = is_dict && strchr(only, c); /* * Apply rules in the following sequence: * - the @only string is supplied and does not contain a * character in question * - the character is printable and ASCII, when @flags has * %ESCAPE_NAP bit set * - the character is printable, when @flags has * %ESCAPE_NP bit set * - the character is ASCII, when @flags has * %ESCAPE_NA bit set * - the character doesn't fall into a class of symbols * defined by given @flags * In these cases we just pass through a character to the * output buffer. * * When %ESCAPE_APPEND is passed, the characters from @only * have been excluded from the %ESCAPE_NAP, %ESCAPE_NP, and * %ESCAPE_NA cases.
*/ if (!(is_append || in_dict) && is_dict && escape_passthrough(c, &p, end)) continue; if (!(is_append && in_dict) && isascii(c) && isprint(c) && flags & ESCAPE_NAP && escape_passthrough(c, &p, end)) continue; if (!(is_append && in_dict) && isprint(c) && flags & ESCAPE_NP && escape_passthrough(c, &p, end)) continue; if (!(is_append && in_dict) && isascii(c) && flags & ESCAPE_NA && escape_passthrough(c, &p, end)) continue; if (flags & ESCAPE_SPACE && escape_space(c, &p, end)) continue; if (flags & ESCAPE_SPECIAL && escape_special(c, &p, end)) continue; if (flags & ESCAPE_NULL && escape_null(c, &p, end)) continue; /* ESCAPE_OCTAL and ESCAPE_HEX always go last */ if (flags & ESCAPE_OCTAL && escape_octal(c, &p, end)) continue; if (flags & ESCAPE_HEX && escape_hex(c, &p, end)) continue; escape_passthrough(c, &p, end); } return p - dst; } EXPORT_SYMBOL(string_escape_mem); /* * Return an allocated string that has been escaped of special characters * and double quotes, making it safe to log in quotes. */ char *kstrdup_quotable(const char *src, gfp_t gfp) { size_t slen, dlen; char *dst; const int flags = ESCAPE_HEX; const char esc[] = "\f\n\r\t\v\a\e\\\""; if (!src) return NULL; slen = strlen(src); dlen = string_escape_mem(src, slen, NULL, 0, flags, esc); dst = kmalloc(dlen + 1, gfp); if (!dst) return NULL; WARN_ON(string_escape_mem(src, slen, dst, dlen, flags, esc) != dlen); dst[dlen] = '\0'; return dst; } EXPORT_SYMBOL_GPL(kstrdup_quotable); /* * Returns allocated NULL-terminated string containing process * command line, with inter-argument NULLs replaced with spaces, * and other special characters escaped. */ char *kstrdup_quotable_cmdline(struct task_struct *task, gfp_t gfp) { char *buffer, *quoted; int i, res; buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!buffer) return NULL; res = get_cmdline(task, buffer, PAGE_SIZE - 1); buffer[res] = '\0'; /* Collapse trailing NULLs, leave res pointing to last non-NULL. */ while (--res >= 0 && buffer[res] == '\0') ; /* Replace inter-argument NULLs. */ for (i = 0; i <= res; i++) if (buffer[i] == '\0') buffer[i] = ' '; /* Make sure result is printable. */ quoted = kstrdup_quotable(buffer, gfp); kfree(buffer); return quoted; } EXPORT_SYMBOL_GPL(kstrdup_quotable_cmdline); /* * Returns allocated NULL-terminated string containing pathname, * with special characters escaped, able to be safely logged. If * there is an error, the leading character will be "<". */ char *kstrdup_quotable_file(struct file *file, gfp_t gfp) { char *temp, *pathname; if (!file) return kstrdup("<unknown>", gfp); /* We add 11 spaces for ' (deleted)' to be appended */ temp = kmalloc(PATH_MAX + 11, GFP_KERNEL); if (!temp) return kstrdup("<no_memory>", gfp); pathname = file_path(file, temp, PATH_MAX + 11); if (IS_ERR(pathname)) pathname = kstrdup("<too_long>", gfp); else pathname = kstrdup_quotable(pathname, gfp); kfree(temp); return pathname; } EXPORT_SYMBOL_GPL(kstrdup_quotable_file); /* * Returns duplicate string in which the @old characters are replaced by @new. */ char *kstrdup_and_replace(const char *src, char old, char new, gfp_t gfp) { char *dst; dst = kstrdup(src, gfp); if (!dst) return NULL; return strreplace(dst, old, new); } EXPORT_SYMBOL_GPL(kstrdup_and_replace); /** * kasprintf_strarray - allocate and fill array of sequential strings * @gfp: flags for the slab allocator * @prefix: prefix to be used * @n: amount of lines to be allocated and filled * * Allocates and fills @n strings using pattern "%s-%zu", where prefix * is provided by caller. 
The caller is responsible to free them with * kfree_strarray() after use. * * Returns array of strings or NULL when memory can't be allocated. */ char **kasprintf_strarray(gfp_t gfp, const char *prefix, size_t n) { char **names; size_t i; names = kcalloc(n + 1, sizeof(char *), gfp); if (!names) return NULL; for (i = 0; i < n; i++) { names[i] = kasprintf(gfp, "%s-%zu", prefix, i); if (!names[i]) { kfree_strarray(names, i); return NULL; } } return names; } EXPORT_SYMBOL_GPL(kasprintf_strarray); /** * kfree_strarray - free a number of dynamically allocated strings contained * in an array and the array itself * * @array: Dynamically allocated array of strings to free. * @n: Number of strings (starting from the beginning of the array) to free. * * Passing a non-NULL @array and @n == 0 as well as NULL @array are valid * use-cases. If @array is NULL, the function does nothing. */ void kfree_strarray(char **array, size_t n) { unsigned int i; if (!array) return; for (i = 0; i < n; i++) kfree(array[i]); kfree(array); } EXPORT_SYMBOL_GPL(kfree_strarray); struct strarray { char **array; size_t n; }; static void devm_kfree_strarray(struct device *dev, void *res) { struct strarray *array = res; kfree_strarray(array->array, array->n); } char **devm_kasprintf_strarray(struct device *dev, const char *prefix, size_t n) { struct strarray *ptr; ptr = devres_alloc(devm_kfree_strarray, sizeof(*ptr), GFP_KERNEL); if (!ptr) return ERR_PTR(-ENOMEM); ptr->array = kasprintf_strarray(GFP_KERNEL, prefix, n); if (!ptr->array) { devres_free(ptr); return ERR_PTR(-ENOMEM); } ptr->n = n; devres_add(dev, ptr); return ptr->array; } EXPORT_SYMBOL_GPL(devm_kasprintf_strarray); /** * skip_spaces - Removes leading whitespace from @str. * @str: The string to be stripped. * * Returns a pointer to the first non-whitespace character in @str. */ char *skip_spaces(const char *str) { while (isspace(*str)) ++str; return (char *)str; } EXPORT_SYMBOL(skip_spaces); /** * strim - Removes leading and trailing whitespace from @s. * @s: The string to be stripped. * * Note that the first trailing whitespace is replaced with a %NUL-terminator * in the given string @s. Returns a pointer to the first non-whitespace * character in @s. */ char *strim(char *s) { size_t size; char *end; size = strlen(s); if (!size) return s; end = s + size - 1; while (end >= s && isspace(*end)) end--; *(end + 1) = '\0'; return skip_spaces(s); } EXPORT_SYMBOL(strim); /** * sysfs_streq - return true if strings are equal, modulo trailing newline * @s1: one string * @s2: another string * * This routine returns true iff two strings are equal, treating both * NUL and newline-then-NUL as equivalent string terminations. It's * geared for use with sysfs input strings, which generally terminate * with newlines but are compared against values without newlines. */ bool sysfs_streq(const char *s1, const char *s2) { while (*s1 && *s1 == *s2) { s1++; s2++; } if (*s1 == *s2) return true; if (!*s1 && *s2 == '\n' && !s2[1]) return true; if (*s1 == '\n' && !s1[1] && !*s2) return true; return false; } EXPORT_SYMBOL(sysfs_streq); /** * match_string - matches given string in an array * @array: array of strings * @n: number of strings in the array or -1 for NULL terminated arrays * @string: string to match with * * This routine will look for a string in an array of strings up to the * n-th element in the array or until the first NULL element. * * Historically the value of -1 for @n, was used to search in arrays that * are NULL terminated. 
However, the function does not make a distinction * when finishing the search: either @n elements have been compared OR * the first NULL element was found. * * Return: * index of a @string in the @array if it matches, or %-EINVAL otherwise. */ int match_string(const char * const *array, size_t n, const char *string) { int index; const char *item; for (index = 0; index < n; index++) { item = array[index]; if (!item) break; if (!strcmp(item, string)) return index; } return -EINVAL; } EXPORT_SYMBOL(match_string); /** * __sysfs_match_string - matches given string in an array * @array: array of strings * @n: number of strings in the array or -1 for NULL terminated arrays * @str: string to match with * * Returns index of @str in the @array or -EINVAL, just like match_string(). * Uses sysfs_streq instead of strcmp for matching. * * This routine will look for a string in an array of strings up to the * n-th element in the array or until the first NULL element. * * Historically the value of -1 for @n was used to search in arrays that * are NULL terminated. However, the function does not make a distinction * when finishing the search: either @n elements have been compared OR * the first NULL element was found. */ int __sysfs_match_string(const char * const *array, size_t n, const char *str) { const char *item; int index; for (index = 0; index < n; index++) { item = array[index]; if (!item) break; if (sysfs_streq(item, str)) return index; } return -EINVAL; } EXPORT_SYMBOL(__sysfs_match_string); /** * strreplace - Replace all occurrences of character in string. * @str: The string to operate on. * @old: The character being replaced. * @new: The character @old is replaced with. * * Replaces each @old character with a @new one in the given string @str. * * Return: pointer to the string @str itself. */ char *strreplace(char *str, char old, char new) { char *s = str; for (; *s; ++s) if (*s == old) *s = new; return str; } EXPORT_SYMBOL(strreplace); /** * memcpy_and_pad - Copy one buffer to another with padding * @dest: Where to copy to * @dest_len: The destination buffer size * @src: Where to copy from * @count: The number of bytes to copy * @pad: Character to use for padding if space is left in destination. */ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, int pad) { if (dest_len > count) { memcpy(dest, src, count); memset(dest + count, pad, dest_len - count); } else { memcpy(dest, src, dest_len); } } EXPORT_SYMBOL(memcpy_and_pad); #ifdef CONFIG_FORTIFY_SOURCE /* These are placeholders for fortify compile-time warnings.
*/ void __read_overflow2_field(size_t avail, size_t wanted) { } EXPORT_SYMBOL(__read_overflow2_field); void __write_overflow_field(size_t avail, size_t wanted) { } EXPORT_SYMBOL(__write_overflow_field); static const char * const fortify_func_name[] = { #define MAKE_FORTIFY_FUNC_NAME(func) [MAKE_FORTIFY_FUNC(func)] = #func EACH_FORTIFY_FUNC(MAKE_FORTIFY_FUNC_NAME) #undef MAKE_FORTIFY_FUNC_NAME }; void __fortify_report(const u8 reason, const size_t avail, const size_t size) { const u8 func = FORTIFY_REASON_FUNC(reason); const bool write = FORTIFY_REASON_DIR(reason); const char *name; name = fortify_func_name[umin(func, FORTIFY_FUNC_UNKNOWN)]; WARN(1, "%s: detected buffer overflow: %zu byte %s of buffer size %zu\n", name, size, str_read_write(!write), avail); } EXPORT_SYMBOL(__fortify_report); void __fortify_panic(const u8 reason, const size_t avail, const size_t size) { __fortify_report(reason, avail, size); BUG(); } EXPORT_SYMBOL(__fortify_panic); #endif /* CONFIG_FORTIFY_SOURCE */
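A minimal usage sketch for the escaping helpers above (illustrative only, not part of lib/string_helpers.c): it follows the same two-pass pattern kstrdup_quotable() uses, sizing the output with a zero-sized destination first and then escaping into a freshly allocated buffer. The helper name example_escape_for_log() and the choice of ESCAPE_ANY_NP are assumptions made for the example.

/*
 * Hypothetical helper, shown only to illustrate the API: size the escaped
 * output first, then allocate and escape, mirroring kstrdup_quotable().
 */
static char *example_escape_for_log(const char *src, size_t len, gfp_t gfp)
{
	size_t dlen;
	char *dst;

	/*
	 * With a zero-sized destination nothing is written; the return
	 * value is the size the escaped output would need.
	 */
	dlen = string_escape_mem(src, len, NULL, 0, ESCAPE_ANY_NP, NULL);

	dst = kmalloc(dlen + 1, gfp);
	if (!dst)
		return NULL;

	string_escape_mem(src, len, dst, dlen, ESCAPE_ANY_NP, NULL);
	dst[dlen] = '\0';	/* string_escape_mem() does not NUL-terminate */

	return dst;
}

/*
 * The reverse direction may run in place, since unescaping never grows
 * the string, e.g.:
 *
 *	len = string_unescape(buf, buf, 0, UNESCAPE_ANY);
 */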
// SPDX-License-Identifier: GPL-2.0 /* * drivers/base/power/common.c - Common device power management code. * * Copyright (C) 2011 Rafael J. Wysocki <rjw@sisk.pl>, Renesas Electronics Corp. */ #include <linux/kernel.h> #include <linux/device.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/pm_clock.h> #include <linux/acpi.h> #include <linux/pm_domain.h> #include <linux/pm_opp.h> #include "power.h" /** * dev_pm_get_subsys_data - Create or refcount power.subsys_data for device. * @dev: Device to handle. * * If power.subsys_data is NULL, point it to a new object, otherwise increment * its reference counter. Return 0 if new object has been created or refcount * increased, otherwise negative error code. */ int dev_pm_get_subsys_data(struct device *dev) { struct pm_subsys_data *psd; psd = kzalloc_obj(*psd); if (!psd) return -ENOMEM; spin_lock_irq(&dev->power.lock); if (dev->power.subsys_data) { dev->power.subsys_data->refcount++; } else { spin_lock_init(&psd->lock); psd->refcount = 1; dev->power.subsys_data = psd; pm_clk_init(dev); psd = NULL; } spin_unlock_irq(&dev->power.lock); /* kfree() verifies that its argument is nonzero. */ kfree(psd); return 0; } EXPORT_SYMBOL_GPL(dev_pm_get_subsys_data); /** * dev_pm_put_subsys_data - Drop reference to power.subsys_data. * @dev: Device to handle. * * If the reference counter of power.subsys_data is zero after dropping the * reference, power.subsys_data is removed.
*/ void dev_pm_put_subsys_data(struct device *dev) { struct pm_subsys_data *psd; spin_lock_irq(&dev->power.lock); psd = dev_to_psd(dev); if (!psd) goto out; if (--psd->refcount == 0) dev->power.subsys_data = NULL; else psd = NULL; out: spin_unlock_irq(&dev->power.lock); kfree(psd); } EXPORT_SYMBOL_GPL(dev_pm_put_subsys_data); /** * dev_pm_domain_attach - Attach a device to its PM domain. * @dev: Device to attach. * @flags: indicate whether we should power on/off the device on attach/detach * * The @dev may only be attached to a single PM domain. By iterating through * the available alternatives we try to find a valid PM domain for the device. * If attachment succeeds, the ->detach() callback in the struct dev_pm_domain * should be assigned by the corresponding attach function. * * This function should typically be invoked from subsystem level code during * the probe phase, especially for those that hold devices which require * power management through PM domains. * * Callers must ensure proper synchronization of this function with power * management callbacks. * * Returns 0 when the PM domain is successfully attached, or when it is found that the * device doesn't need a PM domain, else a negative error code. */ int dev_pm_domain_attach(struct device *dev, u32 flags) { int ret; if (dev->pm_domain) return 0; ret = acpi_dev_pm_attach(dev, !!(flags & PD_FLAG_ATTACH_POWER_ON)); if (!ret) ret = genpd_dev_pm_attach(dev); if (dev->pm_domain) dev->power.detach_power_off = !!(flags & PD_FLAG_DETACH_POWER_OFF); return ret < 0 ? ret : 0; } EXPORT_SYMBOL_GPL(dev_pm_domain_attach); /** * dev_pm_domain_attach_by_id - Associate a device with one of its PM domains. * @dev: The device used to lookup the PM domain. * @index: The index of the PM domain. * * As @dev may only be attached to a single PM domain, the backend PM domain * provider creates a virtual device to attach instead. If attachment succeeds, * the ->detach() callback in the struct dev_pm_domain is assigned by the * corresponding backend attach function, so as to deal with detaching of the * created virtual device. * * This function should typically be invoked by a driver during the probe phase, * in case its device requires power management through multiple PM domains. The * driver may benefit from using the received device, to configure device-links * towards its original device. Depending on the use-case and if needed, the * links may be dynamically changed by the driver, which allows it to control * the power to the PM domains independently from each other. * * Callers must ensure proper synchronization of this function with power * management callbacks. * * Returns the created virtual device when successfully attached to its PM * domain, NULL in case @dev doesn't need a PM domain, else an ERR_PTR(). * Note that, to detach the returned virtual device, the driver shall call * dev_pm_domain_detach() on it, typically during the remove phase. */ struct device *dev_pm_domain_attach_by_id(struct device *dev, unsigned int index) { if (dev->pm_domain) return ERR_PTR(-EEXIST); return genpd_dev_pm_attach_by_id(dev, index); } EXPORT_SYMBOL_GPL(dev_pm_domain_attach_by_id); /** * dev_pm_domain_attach_by_name - Associate a device with one of its PM domains. * @dev: The device used to lookup the PM domain. * @name: The name of the PM domain. * * For a detailed function description, see dev_pm_domain_attach_by_id().
*/ struct device *dev_pm_domain_attach_by_name(struct device *dev, const char *name) { if (dev->pm_domain) return ERR_PTR(-EEXIST); return genpd_dev_pm_attach_by_name(dev, name); } EXPORT_SYMBOL_GPL(dev_pm_domain_attach_by_name); /** * dev_pm_domain_attach_list - Associate a device with its PM domains. * @dev: The device used to lookup the PM domains for. * @data: The data used for attaching to the PM domains. * @list: An out-parameter with an allocated list of attached PM domains. * * This function helps to attach a device to its multiple PM domains. The * caller, which is typically a driver's probe function, may provide a list of * names for the PM domains that we should try to attach the device to, but it * may also provide an empty list, in case the attach should be done for all of * the available PM domains. * * Callers must ensure proper synchronization of this function with power * management callbacks. * * Returns the number of attached PM domains or a negative error code in case of * a failure. Note that, to detach the list of PM domains, the driver shall call * dev_pm_domain_detach_list(), typically during the remove phase. */ int dev_pm_domain_attach_list(struct device *dev, const struct dev_pm_domain_attach_data *data, struct dev_pm_domain_list **list) { struct device_node *np = dev->of_node; struct dev_pm_domain_list *pds; struct device *pd_dev = NULL; int ret, i, num_pds = 0; bool by_id = true; size_t size; u32 pd_flags = data ? data->pd_flags : 0; u32 link_flags = pd_flags & PD_FLAG_NO_DEV_LINK ? 0 : DL_FLAG_STATELESS | DL_FLAG_PM_RUNTIME; if (dev->pm_domain) return -EEXIST; /* For now this is limited to OF based platforms. */ if (!np) return 0; if (data && data->pd_names) { num_pds = data->num_pd_names; by_id = false; } else { num_pds = of_count_phandle_with_args(np, "power-domains", "#power-domain-cells"); } if (num_pds <= 0) return 0; pds = kzalloc_obj(*pds); if (!pds) return -ENOMEM; size = sizeof(*pds->pd_devs) + sizeof(*pds->pd_links) + sizeof(*pds->opp_tokens); pds->pd_devs = kcalloc(num_pds, size, GFP_KERNEL); if (!pds->pd_devs) { ret = -ENOMEM; goto free_pds; } pds->pd_links = (void *)(pds->pd_devs + num_pds); pds->opp_tokens = (void *)(pds->pd_links + num_pds); if (link_flags && pd_flags & PD_FLAG_DEV_LINK_ON) link_flags |= DL_FLAG_RPM_ACTIVE; for (i = 0; i < num_pds; i++) { if (by_id) pd_dev = dev_pm_domain_attach_by_id(dev, i); else pd_dev = dev_pm_domain_attach_by_name(dev, data->pd_names[i]); if (IS_ERR_OR_NULL(pd_dev)) { ret = pd_dev ? PTR_ERR(pd_dev) : -ENODEV; goto err_attach; } if (pd_flags & PD_FLAG_REQUIRED_OPP) { struct dev_pm_opp_config config = { .required_dev = pd_dev, .required_dev_index = i, }; ret = dev_pm_opp_set_config(dev, &config); if (ret < 0) goto err_link; pds->opp_tokens[i] = ret; } if (link_flags) { struct device_link *link; link = device_link_add(dev, pd_dev, link_flags); if (!link) { ret = -ENODEV; goto err_link; } pds->pd_links[i] = link; } pds->pd_devs[i] = pd_dev; } pds->num_pds = num_pds; *list = pds; return num_pds; err_link: dev_pm_opp_clear_config(pds->opp_tokens[i]); dev_pm_domain_detach(pd_dev, true); err_attach: while (--i >= 0) { dev_pm_opp_clear_config(pds->opp_tokens[i]); if (pds->pd_links[i]) device_link_del(pds->pd_links[i]); dev_pm_domain_detach(pds->pd_devs[i], true); } kfree(pds->pd_devs); free_pds: kfree(pds); return ret; } EXPORT_SYMBOL_GPL(dev_pm_domain_attach_list); /** * devm_pm_domain_detach_list - devres-enabled version of dev_pm_domain_detach_list. * @_list: The list of PM domains to detach. 
* * This function reverses the actions from devm_pm_domain_attach_list(). * It will be invoked implicitly during the remove phase if the driver * uses devm_pm_domain_attach_list() to attach the PM domains. */ static void devm_pm_domain_detach_list(void *_list) { struct dev_pm_domain_list *list = _list; dev_pm_domain_detach_list(list); } /** * devm_pm_domain_attach_list - devres-enabled version of dev_pm_domain_attach_list * @dev: The device used to lookup the PM domains for. * @data: The data used for attaching to the PM domains. * @list: An out-parameter with an allocated list of attached PM domains. * * NOTE: this will also handle calling devm_pm_domain_detach_list() for * you during the remove phase. * * Returns the number of attached PM domains or a negative error code in case of * a failure. */ int devm_pm_domain_attach_list(struct device *dev, const struct dev_pm_domain_attach_data *data, struct dev_pm_domain_list **list) { int ret, num_pds; num_pds = dev_pm_domain_attach_list(dev, data, list); if (num_pds <= 0) return num_pds; ret = devm_add_action_or_reset(dev, devm_pm_domain_detach_list, *list); if (ret) return ret; return num_pds; } EXPORT_SYMBOL_GPL(devm_pm_domain_attach_list); /** * dev_pm_domain_detach - Detach a device from its PM domain. * @dev: Device to detach. * @power_off: Used to indicate whether we should power off the device. * * This function will reverse the actions from dev_pm_domain_attach(), * dev_pm_domain_attach_by_id() and dev_pm_domain_attach_by_name(), thus it * detaches @dev from its PM domain. Typically it should be invoked during the * remove phase, either from subsystem level code or from drivers. * * Callers must ensure proper synchronization of this function with power * management callbacks. */ void dev_pm_domain_detach(struct device *dev, bool power_off) { if (dev->pm_domain && dev->pm_domain->detach) dev->pm_domain->detach(dev, power_off); } EXPORT_SYMBOL_GPL(dev_pm_domain_detach); /** * dev_pm_domain_detach_list - Detach a list of PM domains. * @list: The list of PM domains to detach. * * This function reverses the actions from dev_pm_domain_attach_list(). * Typically it should be invoked during the remove phase from drivers. * * Callers must ensure proper synchronization of this function with power * management callbacks. */ void dev_pm_domain_detach_list(struct dev_pm_domain_list *list) { int i; if (!list) return; for (i = 0; i < list->num_pds; i++) { dev_pm_opp_clear_config(list->opp_tokens[i]); if (list->pd_links[i]) device_link_del(list->pd_links[i]); dev_pm_domain_detach(list->pd_devs[i], true); } kfree(list->pd_devs); kfree(list); } EXPORT_SYMBOL_GPL(dev_pm_domain_detach_list); /** * dev_pm_domain_start - Start the device through its PM domain. * @dev: Device to start. * * This function should typically be called during probe by a subsystem/driver, * when it needs to start its device from the PM domain's perspective. Note * that it's assumed that the PM domain is already powered on when this * function is called. * * Returns 0 on success and negative error values on failures. */ int dev_pm_domain_start(struct device *dev) { if (dev->pm_domain && dev->pm_domain->start) return dev->pm_domain->start(dev); return 0; } EXPORT_SYMBOL_GPL(dev_pm_domain_start); /** * dev_pm_domain_set - Set PM domain of a device. * @dev: Device whose PM domain is to be set. * @pd: PM domain to be set, or NULL. * * Sets the PM domain the device belongs to. The PM domain of a device needs * to be set before its probe finishes (i.e. before it's bound to a driver).
* * This function must be called with the device lock held. */ void dev_pm_domain_set(struct device *dev, struct dev_pm_domain *pd) { if (dev->pm_domain == pd) return; WARN(pd && device_is_bound(dev), "PM domains can only be changed for unbound devices\n"); dev->pm_domain = pd; device_pm_check_callbacks(dev); } EXPORT_SYMBOL_GPL(dev_pm_domain_set); /** * dev_pm_domain_set_performance_state - Request a new performance state. * @dev: The device to make the request for. * @state: Target performance state for the device. * * This function should be called when a new performance state needs to be * requested for a device that is attached to a PM domain. Note that the * support for performance scaling for PM domains is optional. * * Returns 0 on success and when performance scaling isn't supported, negative * error code on failure. */ int dev_pm_domain_set_performance_state(struct device *dev, unsigned int state) { if (dev->pm_domain && dev->pm_domain->set_performance_state) return dev->pm_domain->set_performance_state(dev, state); return 0; } EXPORT_SYMBOL_GPL(dev_pm_domain_set_performance_state);
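An illustrative probe-time sketch (hypothetical driver code, not part of drivers/base/power/common.c) showing how the devres variant above is typically used to attach a device to several named PM domains. The domain names, the foo_probe() function and the use of PD_FLAG_DEV_LINK_ON are assumptions made for the example, and the usual dev_pm_domain_attach_data layout from <linux/pm_domain.h> is assumed.

static int foo_probe(struct platform_device *pdev)
{
	/* Hypothetical example: two named power domains for this device. */
	static const char * const foo_pd_names[] = { "perf", "mem" };
	struct dev_pm_domain_attach_data pd_data = {
		.pd_names = foo_pd_names,
		.num_pd_names = ARRAY_SIZE(foo_pd_names),
		.pd_flags = PD_FLAG_DEV_LINK_ON,
	};
	struct dev_pm_domain_list *pd_list;
	int ret;

	/*
	 * devm_pm_domain_attach_list() registers devm_pm_domain_detach_list()
	 * as a devres action, so no explicit detach is needed on remove.
	 */
	ret = devm_pm_domain_attach_list(&pdev->dev, &pd_data, &pd_list);
	if (ret < 0)
		return ret;

	/* ret is the number of attached PM domains (0 if none were needed). */
	return 0;
}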
// SPDX-License-Identifier: GPL-2.0 /* * Released under the GPLv2 only. */ #include <linux/usb.h> #include <linux/usb/ch9.h> #include <linux/usb/hcd.h> #include <linux/usb/quirks.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/string_choices.h> #include <linux/device.h> #include <asm/byteorder.h> #include "usb.h" #define USB_MAXALTSETTING 128 /* Hard limit */ #define USB_MAXCONFIG 8 /* Arbitrary limit */ static int find_next_descriptor(unsigned char *buffer, int size, int dt1, int dt2, int *num_skipped) { struct usb_descriptor_header *h; int n = 0; unsigned char *buffer0 = buffer; /* Find the next descriptor of type dt1 or dt2 */ while (size > 0) { h = (struct usb_descriptor_header *) buffer; if (h->bDescriptorType == dt1 || h->bDescriptorType == dt2) break; buffer += h->bLength; size -= h->bLength; ++n; } /* Store the number of descriptors skipped and return the * number of bytes skipped */ if (num_skipped) *num_skipped = n; return buffer - buffer0; } static void usb_parse_ssp_isoc_endpoint_companion(struct device *ddev, int cfgno, int inum, int asnum, struct usb_host_endpoint *ep, unsigned char *buffer, int size) { struct usb_ssp_isoc_ep_comp_descriptor *desc; /* * The SuperSpeedPlus Isoc endpoint companion descriptor immediately * follows the SuperSpeed Endpoint Companion descriptor */ desc = (struct usb_ssp_isoc_ep_comp_descriptor *) buffer; if (size < USB_DT_SSP_ISOC_EP_COMP_SIZE || desc->bDescriptorType != USB_DT_SSP_ISOC_ENDPOINT_COMP) { dev_notice(ddev, "Invalid SuperSpeedPlus isoc endpoint companion" "for config %d interface %d altsetting %d ep %d.\n", cfgno, inum, asnum, ep->desc.bEndpointAddress); return; } memcpy(&ep->ssp_isoc_ep_comp, desc, USB_DT_SSP_ISOC_EP_COMP_SIZE); } static void usb_parse_eusb2_isoc_endpoint_companion(struct device *ddev, int cfgno, int inum, int asnum, struct usb_host_endpoint *ep, unsigned char *buffer, int size) { struct usb_eusb2_isoc_ep_comp_descriptor *desc; struct usb_descriptor_header *h; /* * eUSB2 isochronous endpoint companion descriptor for this endpoint * shall be declared before the next endpoint or interface descriptor */ while (size >= USB_DT_EUSB2_ISOC_EP_COMP_SIZE) { h = (struct usb_descriptor_header *)buffer; if (h->bDescriptorType == USB_DT_EUSB2_ISOC_ENDPOINT_COMP) { desc = (struct
usb_eusb2_isoc_ep_comp_descriptor *)buffer; ep->eusb2_isoc_ep_comp = *desc; return; } if (h->bDescriptorType == USB_DT_ENDPOINT || h->bDescriptorType == USB_DT_INTERFACE) break; buffer += h->bLength; size -= h->bLength; } dev_notice(ddev, "No eUSB2 isoc ep %d companion for config %d interface %d altsetting %d\n", ep->desc.bEndpointAddress, cfgno, inum, asnum); } static void usb_parse_ss_endpoint_companion(struct device *ddev, int cfgno, int inum, int asnum, struct usb_host_endpoint *ep, unsigned char *buffer, int size) { struct usb_ss_ep_comp_descriptor *desc; int max_tx; /* The SuperSpeed endpoint companion descriptor is supposed to * be the first thing immediately following the endpoint descriptor. */ desc = (struct usb_ss_ep_comp_descriptor *) buffer; if (size < USB_DT_SS_EP_COMP_SIZE) { dev_notice(ddev, "invalid SuperSpeed endpoint companion descriptor " "of length %d, skipping\n", size); return; } if (desc->bDescriptorType != USB_DT_SS_ENDPOINT_COMP) { dev_notice(ddev, "No SuperSpeed endpoint companion for config %d " " interface %d altsetting %d ep %d: " "using minimum values\n", cfgno, inum, asnum, ep->desc.bEndpointAddress); /* Fill in some default values. * Leave bmAttributes as zero, which will mean no streams for * bulk, and isoc won't support multiple bursts of packets. * With bursts of only one packet, and a Mult of 1, the max * amount of data moved per endpoint service interval is one * packet. */ ep->ss_ep_comp.bLength = USB_DT_SS_EP_COMP_SIZE; ep->ss_ep_comp.bDescriptorType = USB_DT_SS_ENDPOINT_COMP; if (usb_endpoint_xfer_isoc(&ep->desc) || usb_endpoint_xfer_int(&ep->desc)) ep->ss_ep_comp.wBytesPerInterval = ep->desc.wMaxPacketSize; return; } buffer += desc->bLength; size -= desc->bLength; memcpy(&ep->ss_ep_comp, desc, USB_DT_SS_EP_COMP_SIZE); /* Check the various values */ if (usb_endpoint_xfer_control(&ep->desc) && desc->bMaxBurst != 0) { dev_notice(ddev, "Control endpoint with bMaxBurst = %d in " "config %d interface %d altsetting %d ep %d: " "setting to zero\n", desc->bMaxBurst, cfgno, inum, asnum, ep->desc.bEndpointAddress); ep->ss_ep_comp.bMaxBurst = 0; } else if (desc->bMaxBurst > 15) { dev_notice(ddev, "Endpoint with bMaxBurst = %d in " "config %d interface %d altsetting %d ep %d: " "setting to 15\n", desc->bMaxBurst, cfgno, inum, asnum, ep->desc.bEndpointAddress); ep->ss_ep_comp.bMaxBurst = 15; } if ((usb_endpoint_xfer_control(&ep->desc) || usb_endpoint_xfer_int(&ep->desc)) && desc->bmAttributes != 0) { dev_notice(ddev, "%s endpoint with bmAttributes = %d in " "config %d interface %d altsetting %d ep %d: " "setting to zero\n", usb_endpoint_xfer_control(&ep->desc) ? 
"Control" : "Bulk", desc->bmAttributes, cfgno, inum, asnum, ep->desc.bEndpointAddress); ep->ss_ep_comp.bmAttributes = 0; } else if (usb_endpoint_xfer_bulk(&ep->desc) && desc->bmAttributes > 16) { dev_notice(ddev, "Bulk endpoint with more than 65536 streams in " "config %d interface %d altsetting %d ep %d: " "setting to max\n", cfgno, inum, asnum, ep->desc.bEndpointAddress); ep->ss_ep_comp.bmAttributes = 16; } else if (usb_endpoint_xfer_isoc(&ep->desc) && !USB_SS_SSP_ISOC_COMP(desc->bmAttributes) && USB_SS_MULT(desc->bmAttributes) > 3) { dev_notice(ddev, "Isoc endpoint has Mult of %d in " "config %d interface %d altsetting %d ep %d: " "setting to 3\n", USB_SS_MULT(desc->bmAttributes), cfgno, inum, asnum, ep->desc.bEndpointAddress); ep->ss_ep_comp.bmAttributes = 2; } if (usb_endpoint_xfer_isoc(&ep->desc)) max_tx = (desc->bMaxBurst + 1) * (USB_SS_MULT(desc->bmAttributes)) * usb_endpoint_maxp(&ep->desc); else if (usb_endpoint_xfer_int(&ep->desc)) max_tx = usb_endpoint_maxp(&ep->desc) * (desc->bMaxBurst + 1); else max_tx = 999999; if (le16_to_cpu(desc->wBytesPerInterval) > max_tx) { dev_notice(ddev, "%s endpoint with wBytesPerInterval of %d in " "config %d interface %d altsetting %d ep %d: " "setting to %d\n", usb_endpoint_xfer_isoc(&ep->desc) ? "Isoc" : "Int", le16_to_cpu(desc->wBytesPerInterval), cfgno, inum, asnum, ep->desc.bEndpointAddress, max_tx); ep->ss_ep_comp.wBytesPerInterval = cpu_to_le16(max_tx); } /* Parse a possible SuperSpeedPlus isoc ep companion descriptor */ if (usb_endpoint_xfer_isoc(&ep->desc) && USB_SS_SSP_ISOC_COMP(desc->bmAttributes)) usb_parse_ssp_isoc_endpoint_companion(ddev, cfgno, inum, asnum, ep, buffer, size); } static const unsigned short low_speed_maxpacket_maxes[4] = { [USB_ENDPOINT_XFER_CONTROL] = 8, [USB_ENDPOINT_XFER_ISOC] = 0, [USB_ENDPOINT_XFER_BULK] = 0, [USB_ENDPOINT_XFER_INT] = 8, }; static const unsigned short full_speed_maxpacket_maxes[4] = { [USB_ENDPOINT_XFER_CONTROL] = 64, [USB_ENDPOINT_XFER_ISOC] = 1023, [USB_ENDPOINT_XFER_BULK] = 64, [USB_ENDPOINT_XFER_INT] = 64, }; static const unsigned short high_speed_maxpacket_maxes[4] = { [USB_ENDPOINT_XFER_CONTROL] = 64, [USB_ENDPOINT_XFER_ISOC] = 1024, /* Bulk should be 512, but some devices use 1024: we will warn below */ [USB_ENDPOINT_XFER_BULK] = 1024, [USB_ENDPOINT_XFER_INT] = 1024, }; static const unsigned short super_speed_maxpacket_maxes[4] = { [USB_ENDPOINT_XFER_CONTROL] = 512, [USB_ENDPOINT_XFER_ISOC] = 1024, [USB_ENDPOINT_XFER_BULK] = 1024, [USB_ENDPOINT_XFER_INT] = 1024, }; static bool endpoint_is_duplicate(struct usb_endpoint_descriptor *e1, struct usb_endpoint_descriptor *e2) { if (e1->bEndpointAddress == e2->bEndpointAddress) return true; if (usb_endpoint_xfer_control(e1) || usb_endpoint_xfer_control(e2)) { if (usb_endpoint_num(e1) == usb_endpoint_num(e2)) return true; } return false; } /* * Check for duplicate endpoint addresses in other interfaces and in the * altsetting currently being parsed. 
*/ static bool config_endpoint_is_duplicate(struct usb_host_config *config, int inum, int asnum, struct usb_endpoint_descriptor *d) { struct usb_endpoint_descriptor *epd; struct usb_interface_cache *intfc; struct usb_host_interface *alt; int i, j, k; for (i = 0; i < config->desc.bNumInterfaces; ++i) { intfc = config->intf_cache[i]; for (j = 0; j < intfc->num_altsetting; ++j) { alt = &intfc->altsetting[j]; if (alt->desc.bInterfaceNumber == inum && alt->desc.bAlternateSetting != asnum) continue; for (k = 0; k < alt->desc.bNumEndpoints; ++k) { epd = &alt->endpoint[k].desc; if (endpoint_is_duplicate(epd, d)) return true; } } } return false; } static int usb_parse_endpoint(struct device *ddev, int cfgno, struct usb_host_config *config, int inum, int asnum, struct usb_host_interface *ifp, int num_ep, unsigned char *buffer, int size) { struct usb_device *udev = to_usb_device(ddev); unsigned char *buffer0 = buffer; struct usb_endpoint_descriptor *d; struct usb_host_endpoint *endpoint; int n, i, j, retval; unsigned int maxp; const unsigned short *maxpacket_maxes; u16 bcdUSB; d = (struct usb_endpoint_descriptor *) buffer; bcdUSB = le16_to_cpu(udev->descriptor.bcdUSB); buffer += d->bLength; size -= d->bLength; if (d->bLength >= USB_DT_ENDPOINT_AUDIO_SIZE) n = USB_DT_ENDPOINT_AUDIO_SIZE; else if (d->bLength >= USB_DT_ENDPOINT_SIZE) n = USB_DT_ENDPOINT_SIZE; else { dev_notice(ddev, "config %d interface %d altsetting %d has an " "invalid endpoint descriptor of length %d, skipping\n", cfgno, inum, asnum, d->bLength); goto skip_to_next_endpoint_or_interface_descriptor; } i = usb_endpoint_num(d); if (i == 0) { dev_notice(ddev, "config %d interface %d altsetting %d has an " "invalid descriptor for endpoint zero, skipping\n", cfgno, inum, asnum); goto skip_to_next_endpoint_or_interface_descriptor; } /* Only store as many endpoints as we have room for */ if (ifp->desc.bNumEndpoints >= num_ep) goto skip_to_next_endpoint_or_interface_descriptor; /* Save a copy of the descriptor and use it instead of the original */ endpoint = &ifp->endpoint[ifp->desc.bNumEndpoints]; memcpy(&endpoint->desc, d, n); d = &endpoint->desc; /* Clear the reserved bits in bEndpointAddress */ i = d->bEndpointAddress & (USB_ENDPOINT_DIR_MASK | USB_ENDPOINT_NUMBER_MASK); if (i != d->bEndpointAddress) { dev_notice(ddev, "config %d interface %d altsetting %d has an endpoint descriptor with address 0x%X, changing to 0x%X\n", cfgno, inum, asnum, d->bEndpointAddress, i); endpoint->desc.bEndpointAddress = i; } /* Check for duplicate endpoint addresses */ if (config_endpoint_is_duplicate(config, inum, asnum, d)) { dev_notice(ddev, "config %d interface %d altsetting %d has a duplicate endpoint with address 0x%X, skipping\n", cfgno, inum, asnum, d->bEndpointAddress); goto skip_to_next_endpoint_or_interface_descriptor; } /* Ignore some endpoints */ if (udev->quirks & USB_QUIRK_ENDPOINT_IGNORE) { if (usb_endpoint_is_ignored(udev, ifp, d)) { dev_notice(ddev, "config %d interface %d altsetting %d has an ignored endpoint with address 0x%X, skipping\n", cfgno, inum, asnum, d->bEndpointAddress); goto skip_to_next_endpoint_or_interface_descriptor; } } /* Accept this endpoint */ ++ifp->desc.bNumEndpoints; INIT_LIST_HEAD(&endpoint->urb_list); /* * Fix up bInterval values outside the legal range. * Use 10 or 8 ms if no proper value can be guessed. 
*/ i = 0; /* i = min, j = max, n = default */ j = 255; if (usb_endpoint_xfer_int(d)) { i = 1; switch (udev->speed) { case USB_SPEED_SUPER_PLUS: case USB_SPEED_SUPER: case USB_SPEED_HIGH: /* * Many device manufacturers are using full-speed * bInterval values in high-speed interrupt endpoint * descriptors. Try to fix those and fall back to an * 8-ms default value otherwise. */ n = fls(d->bInterval*8); if (n == 0) n = 7; /* 8 ms = 2^(7-1) uframes */ j = 16; /* * Adjust bInterval for quirked devices. */ /* * This quirk fixes bIntervals reported in ms. */ if (udev->quirks & USB_QUIRK_LINEAR_FRAME_INTR_BINTERVAL) { n = clamp(fls(d->bInterval) + 3, i, j); i = j = n; } /* * This quirk fixes bIntervals reported in * linear microframes. */ if (udev->quirks & USB_QUIRK_LINEAR_UFRAME_INTR_BINTERVAL) { n = clamp(fls(d->bInterval), i, j); i = j = n; } break; default: /* USB_SPEED_FULL or _LOW */ /* * For low-speed, 10 ms is the official minimum. * But some "overclocked" devices might want faster * polling so we'll allow it. */ n = 10; break; } } else if (usb_endpoint_xfer_isoc(d)) { i = 1; j = 16; switch (udev->speed) { case USB_SPEED_HIGH: n = 7; /* 8 ms = 2^(7-1) uframes */ break; default: /* USB_SPEED_FULL */ n = 4; /* 8 ms = 2^(4-1) frames */ break; } } if (d->bInterval < i || d->bInterval > j) { dev_notice(ddev, "config %d interface %d altsetting %d " "endpoint 0x%X has an invalid bInterval %d, " "changing to %d\n", cfgno, inum, asnum, d->bEndpointAddress, d->bInterval, n); endpoint->desc.bInterval = n; } /* Some buggy low-speed devices have Bulk endpoints, which is * explicitly forbidden by the USB spec. In an attempt to make * them usable, we will try treating them as Interrupt endpoints. */ if (udev->speed == USB_SPEED_LOW && usb_endpoint_xfer_bulk(d)) { dev_notice(ddev, "config %d interface %d altsetting %d " "endpoint 0x%X is Bulk; changing to Interrupt\n", cfgno, inum, asnum, d->bEndpointAddress); endpoint->desc.bmAttributes = USB_ENDPOINT_XFER_INT; endpoint->desc.bInterval = 1; if (usb_endpoint_maxp(&endpoint->desc) > 8) endpoint->desc.wMaxPacketSize = cpu_to_le16(8); } /* * Validate the wMaxPacketSize field. * eUSB2 devices (see USB 2.0 Double Isochronous IN ECN 9.6.6 Endpoint) * and devices with isochronous endpoints in altsetting 0 (see USB 2.0 * end of section 5.6.3) have wMaxPacketSize = 0. * So don't warn about those. 
*/ maxp = le16_to_cpu(endpoint->desc.wMaxPacketSize); if (maxp == 0 && bcdUSB != 0x0220 && !(usb_endpoint_xfer_isoc(d) && asnum == 0)) dev_notice(ddev, "config %d interface %d altsetting %d endpoint 0x%X has invalid wMaxPacketSize 0\n", cfgno, inum, asnum, d->bEndpointAddress); /* Find the highest legal maxpacket size for this endpoint */ i = 0; /* additional transactions per microframe */ switch (udev->speed) { case USB_SPEED_LOW: maxpacket_maxes = low_speed_maxpacket_maxes; break; case USB_SPEED_FULL: maxpacket_maxes = full_speed_maxpacket_maxes; break; case USB_SPEED_HIGH: /* Multiple-transactions bits are allowed only for HS periodic endpoints */ if (usb_endpoint_xfer_int(d) || usb_endpoint_xfer_isoc(d)) { i = maxp & USB_EP_MAXP_MULT_MASK; maxp &= ~i; } fallthrough; default: maxpacket_maxes = high_speed_maxpacket_maxes; break; case USB_SPEED_SUPER: case USB_SPEED_SUPER_PLUS: maxpacket_maxes = super_speed_maxpacket_maxes; break; } j = maxpacket_maxes[usb_endpoint_type(&endpoint->desc)]; if (maxp > j) { dev_notice(ddev, "config %d interface %d altsetting %d endpoint 0x%X has invalid maxpacket %d, setting to %d\n", cfgno, inum, asnum, d->bEndpointAddress, maxp, j); maxp = j; endpoint->desc.wMaxPacketSize = cpu_to_le16(i | maxp); } /* * Some buggy high speed devices have bulk endpoints using * maxpacket sizes other than 512. High speed HCDs may not * be able to handle that particular bug, so let's warn... */ if (udev->speed == USB_SPEED_HIGH && usb_endpoint_xfer_bulk(d)) { if (maxp != 512) dev_notice(ddev, "config %d interface %d altsetting %d " "bulk endpoint 0x%X has invalid maxpacket %d\n", cfgno, inum, asnum, d->bEndpointAddress, maxp); } /* Parse a possible eUSB2 periodic endpoint companion descriptor */ if (udev->speed == USB_SPEED_HIGH && bcdUSB == 0x0220 && !le16_to_cpu(d->wMaxPacketSize) && usb_endpoint_is_isoc_in(d)) usb_parse_eusb2_isoc_endpoint_companion(ddev, cfgno, inum, asnum, endpoint, buffer, size); /* Parse a possible SuperSpeed endpoint companion descriptor */ if (udev->speed >= USB_SPEED_SUPER) usb_parse_ss_endpoint_companion(ddev, cfgno, inum, asnum, endpoint, buffer, size); /* Skip over any Class Specific or Vendor Specific descriptors; * find the next endpoint or interface descriptor */ endpoint->extra = buffer; i = find_next_descriptor(buffer, size, USB_DT_ENDPOINT, USB_DT_INTERFACE, &n); endpoint->extralen = i; retval = buffer - buffer0 + i; if (n > 0) dev_dbg(ddev, "skipped %d descriptor%s after %s\n", n, str_plural(n), "endpoint"); return retval; skip_to_next_endpoint_or_interface_descriptor: i = find_next_descriptor(buffer, size, USB_DT_ENDPOINT, USB_DT_INTERFACE, NULL); return buffer - buffer0 + i; } void usb_release_interface_cache(struct kref *ref) { struct usb_interface_cache *intfc = ref_to_usb_interface_cache(ref); int j; for (j = 0; j < intfc->num_altsetting; j++) { struct usb_host_interface *alt = &intfc->altsetting[j]; kfree(alt->endpoint); kfree(alt->string); } kfree(intfc); } static int usb_parse_interface(struct device *ddev, int cfgno, struct usb_host_config *config, unsigned char *buffer, int size, u8 inums[], u8 nalts[]) { unsigned char *buffer0 = buffer; struct usb_interface_descriptor *d; int inum, asnum; struct usb_interface_cache *intfc; struct usb_host_interface *alt; int i, n; int len, retval; int num_ep, num_ep_orig; d = (struct usb_interface_descriptor *) buffer; buffer += d->bLength; size -= d->bLength; if (d->bLength < USB_DT_INTERFACE_SIZE) goto skip_to_next_interface_descriptor; /* Which interface entry is this? 
*/ intfc = NULL; inum = d->bInterfaceNumber; for (i = 0; i < config->desc.bNumInterfaces; ++i) { if (inums[i] == inum) { intfc = config->intf_cache[i]; break; } } if (!intfc || intfc->num_altsetting >= nalts[i]) goto skip_to_next_interface_descriptor; /* Check for duplicate altsetting entries */ asnum = d->bAlternateSetting; for ((i = 0, alt = &intfc->altsetting[0]); i < intfc->num_altsetting; (++i, ++alt)) { if (alt->desc.bAlternateSetting == asnum) { dev_notice(ddev, "Duplicate descriptor for config %d " "interface %d altsetting %d, skipping\n", cfgno, inum, asnum); goto skip_to_next_interface_descriptor; } } ++intfc->num_altsetting; memcpy(&alt->desc, d, USB_DT_INTERFACE_SIZE); /* Skip over any Class Specific or Vendor Specific descriptors; * find the first endpoint or interface descriptor */ alt->extra = buffer; i = find_next_descriptor(buffer, size, USB_DT_ENDPOINT, USB_DT_INTERFACE, &n); alt->extralen = i; if (n > 0) dev_dbg(ddev, "skipped %d descriptor%s after %s\n", n, str_plural(n), "interface"); buffer += i; size -= i; /* Allocate space for the right(?) number of endpoints */ num_ep = num_ep_orig = alt->desc.bNumEndpoints; alt->desc.bNumEndpoints = 0; /* Use as a counter */ if (num_ep > USB_MAXENDPOINTS) { dev_notice(ddev, "too many endpoints for config %d interface %d " "altsetting %d: %d, using maximum allowed: %d\n", cfgno, inum, asnum, num_ep, USB_MAXENDPOINTS); num_ep = USB_MAXENDPOINTS; } if (num_ep > 0) { /* Can't allocate 0 bytes */ len = sizeof(struct usb_host_endpoint) * num_ep; alt->endpoint = kzalloc(len, GFP_KERNEL); if (!alt->endpoint) return -ENOMEM; } /* Parse all the endpoint descriptors */ n = 0; while (size > 0) { if (((struct usb_descriptor_header *) buffer)->bDescriptorType == USB_DT_INTERFACE) break; retval = usb_parse_endpoint(ddev, cfgno, config, inum, asnum, alt, num_ep, buffer, size); if (retval < 0) return retval; ++n; buffer += retval; size -= retval; } if (n != num_ep_orig) dev_notice(ddev, "config %d interface %d altsetting %d has %d " "endpoint descriptor%s, different from the interface " "descriptor's value: %d\n", cfgno, inum, asnum, n, str_plural(n), num_ep_orig); return buffer - buffer0; skip_to_next_interface_descriptor: i = find_next_descriptor(buffer, size, USB_DT_INTERFACE, USB_DT_INTERFACE, NULL); return buffer - buffer0 + i; } static int usb_parse_configuration(struct usb_device *dev, int cfgidx, struct usb_host_config *config, unsigned char *buffer, int size) { struct device *ddev = &dev->dev; unsigned char *buffer0 = buffer; int cfgno; int nintf, nintf_orig; int i, j, n; struct usb_interface_cache *intfc; unsigned char *buffer2; int size2; struct usb_descriptor_header *header; int retval; u8 inums[USB_MAXINTERFACES], nalts[USB_MAXINTERFACES]; unsigned iad_num = 0; memcpy(&config->desc, buffer, USB_DT_CONFIG_SIZE); nintf = nintf_orig = config->desc.bNumInterfaces; config->desc.bNumInterfaces = 0; // Adjusted later if (config->desc.bDescriptorType != USB_DT_CONFIG || config->desc.bLength < USB_DT_CONFIG_SIZE || config->desc.bLength > size) { dev_notice(ddev, "invalid descriptor for config index %d: " "type = 0x%X, length = %d\n", cfgidx, config->desc.bDescriptorType, config->desc.bLength); return -EINVAL; } cfgno = config->desc.bConfigurationValue; buffer += config->desc.bLength; size -= config->desc.bLength; if (nintf > USB_MAXINTERFACES) { dev_notice(ddev, "config %d has too many interfaces: %d, " "using maximum allowed: %d\n", cfgno, nintf, USB_MAXINTERFACES); nintf = USB_MAXINTERFACES; } /* Go through the descriptors, checking their 
length and counting the * number of altsettings for each interface */ n = 0; for ((buffer2 = buffer, size2 = size); size2 > 0; (buffer2 += header->bLength, size2 -= header->bLength)) { if (size2 < sizeof(struct usb_descriptor_header)) { dev_notice(ddev, "config %d descriptor has %d excess " "byte%s, ignoring\n", cfgno, size2, str_plural(size2)); break; } header = (struct usb_descriptor_header *) buffer2; if ((header->bLength > size2) || (header->bLength < 2)) { dev_notice(ddev, "config %d has an invalid descriptor " "of length %d, skipping remainder of the config\n", cfgno, header->bLength); break; } if (header->bDescriptorType == USB_DT_INTERFACE) { struct usb_interface_descriptor *d; int inum; d = (struct usb_interface_descriptor *) header; if (d->bLength < USB_DT_INTERFACE_SIZE) { dev_notice(ddev, "config %d has an invalid " "interface descriptor of length %d, " "skipping\n", cfgno, d->bLength); continue; } inum = d->bInterfaceNumber; if ((dev->quirks & USB_QUIRK_HONOR_BNUMINTERFACES) && n >= nintf_orig) { dev_notice(ddev, "config %d has more interface " "descriptors, than it declares in " "bNumInterfaces, ignoring interface " "number: %d\n", cfgno, inum); continue; } if (inum >= nintf_orig) dev_notice(ddev, "config %d has an invalid " "interface number: %d but max is %d\n", cfgno, inum, nintf_orig - 1); /* Have we already encountered this interface? * Count its altsettings */ for (i = 0; i < n; ++i) { if (inums[i] == inum) break; } if (i < n) { if (nalts[i] < 255) ++nalts[i]; } else if (n < USB_MAXINTERFACES) { inums[n] = inum; nalts[n] = 1; ++n; } } else if (header->bDescriptorType == USB_DT_INTERFACE_ASSOCIATION) { struct usb_interface_assoc_descriptor *d; d = (struct usb_interface_assoc_descriptor *)header; if (d->bLength < USB_DT_INTERFACE_ASSOCIATION_SIZE) { dev_notice(ddev, "config %d has an invalid interface association descriptor of length %d, skipping\n", cfgno, d->bLength); continue; } if (iad_num == USB_MAXIADS) { dev_notice(ddev, "found more Interface " "Association Descriptors " "than allocated for in " "configuration %d\n", cfgno); } else { config->intf_assoc[iad_num] = d; iad_num++; } } else if (header->bDescriptorType == USB_DT_DEVICE || header->bDescriptorType == USB_DT_CONFIG) dev_notice(ddev, "config %d contains an unexpected " "descriptor of type 0x%X, skipping\n", cfgno, header->bDescriptorType); } /* for ((buffer2 = buffer, size2 = size); ...) 
*/ size = buffer2 - buffer; config->desc.wTotalLength = cpu_to_le16(buffer2 - buffer0); if (n != nintf) dev_notice(ddev, "config %d has %d interface%s, different from " "the descriptor's value: %d\n", cfgno, n, str_plural(n), nintf_orig); else if (n == 0) dev_notice(ddev, "config %d has no interfaces?\n", cfgno); config->desc.bNumInterfaces = nintf = n; /* Check for missing interface numbers */ for (i = 0; i < nintf; ++i) { for (j = 0; j < nintf; ++j) { if (inums[j] == i) break; } if (j >= nintf) dev_notice(ddev, "config %d has no interface number " "%d\n", cfgno, i); } /* Allocate the usb_interface_caches and altsetting arrays */ for (i = 0; i < nintf; ++i) { j = nalts[i]; if (j > USB_MAXALTSETTING) { dev_notice(ddev, "too many alternate settings for " "config %d interface %d: %d, " "using maximum allowed: %d\n", cfgno, inums[i], j, USB_MAXALTSETTING); nalts[i] = j = USB_MAXALTSETTING; } intfc = kzalloc_flex(*intfc, altsetting, j); config->intf_cache[i] = intfc; if (!intfc) return -ENOMEM; kref_init(&intfc->ref); } /* FIXME: parse the BOS descriptor */ /* Skip over any Class Specific or Vendor Specific descriptors; * find the first interface descriptor */ config->extra = buffer; i = find_next_descriptor(buffer, size, USB_DT_INTERFACE, USB_DT_INTERFACE, &n); config->extralen = i; if (n > 0) dev_dbg(ddev, "skipped %d descriptor%s after %s\n", n, str_plural(n), "configuration"); buffer += i; size -= i; /* Parse all the interface/altsetting descriptors */ while (size > 0) { retval = usb_parse_interface(ddev, cfgno, config, buffer, size, inums, nalts); if (retval < 0) return retval; buffer += retval; size -= retval; } /* Check for missing altsettings */ for (i = 0; i < nintf; ++i) { intfc = config->intf_cache[i]; for (j = 0; j < intfc->num_altsetting; ++j) { for (n = 0; n < intfc->num_altsetting; ++n) { if (intfc->altsetting[n].desc. bAlternateSetting == j) break; } if (n >= intfc->num_altsetting) dev_notice(ddev, "config %d interface %d has no " "altsetting %d\n", cfgno, inums[i], j); } } return 0; } /* hub-only!! ... and only exported for reset/reinit path. * otherwise used internally on disconnect/destroy path */ void usb_destroy_configuration(struct usb_device *dev) { int c, i; if (!dev->config) return; if (dev->rawdescriptors) { for (i = 0; i < dev->descriptor.bNumConfigurations; i++) kfree(dev->rawdescriptors[i]); kfree(dev->rawdescriptors); dev->rawdescriptors = NULL; } for (c = 0; c < dev->descriptor.bNumConfigurations; c++) { struct usb_host_config *cf = &dev->config[c]; kfree(cf->string); for (i = 0; i < cf->desc.bNumInterfaces; i++) { if (cf->intf_cache[i]) kref_put(&cf->intf_cache[i]->ref, usb_release_interface_cache); } } kfree(dev->config); dev->config = NULL; } /* * Get the USB config descriptors, cache and parse'em * * hub-only!! ... 
and only in reset path, or usb_new_device() * (used by real hubs and virtual root hubs) */ int usb_get_configuration(struct usb_device *dev) { struct device *ddev = &dev->dev; int ncfg = dev->descriptor.bNumConfigurations; unsigned int cfgno, length; unsigned char *bigbuffer; struct usb_config_descriptor *desc; int result; if (ncfg > USB_MAXCONFIG) { dev_notice(ddev, "too many configurations: %d, " "using maximum allowed: %d\n", ncfg, USB_MAXCONFIG); dev->descriptor.bNumConfigurations = ncfg = USB_MAXCONFIG; } if (ncfg < 1 && dev->quirks & USB_QUIRK_FORCE_ONE_CONFIG) { dev_info(ddev, "Device claims zero configurations, forcing to 1\n"); dev->descriptor.bNumConfigurations = 1; ncfg = 1; } else if (ncfg < 1) { dev_err(ddev, "no configurations\n"); return -EINVAL; } length = ncfg * sizeof(struct usb_host_config); dev->config = kzalloc(length, GFP_KERNEL); if (!dev->config) return -ENOMEM; length = ncfg * sizeof(char *); dev->rawdescriptors = kzalloc(length, GFP_KERNEL); if (!dev->rawdescriptors) return -ENOMEM; desc = kmalloc(USB_DT_CONFIG_SIZE, GFP_KERNEL); if (!desc) return -ENOMEM; for (cfgno = 0; cfgno < ncfg; cfgno++) { /* We grab just the first descriptor so we know how long * the whole configuration is */ result = usb_get_descriptor(dev, USB_DT_CONFIG, cfgno, desc, USB_DT_CONFIG_SIZE); if (result < 0) { dev_err(ddev, "unable to read config index %d " "descriptor/%s: %d\n", cfgno, "start", result); if (result != -EPIPE) goto err; dev_notice(ddev, "chopping to %d config(s)\n", cfgno); dev->descriptor.bNumConfigurations = cfgno; break; } else if (result < 4) { dev_err(ddev, "config index %d descriptor too short " "(expected %i, got %i)\n", cfgno, USB_DT_CONFIG_SIZE, result); result = -EINVAL; goto err; } length = max_t(int, le16_to_cpu(desc->wTotalLength), USB_DT_CONFIG_SIZE); /* Now that we know the length, get the whole thing */ bigbuffer = kmalloc(length, GFP_KERNEL); if (!bigbuffer) { result = -ENOMEM; goto err; } if (dev->quirks & USB_QUIRK_DELAY_INIT) msleep(200); result = usb_get_descriptor(dev, USB_DT_CONFIG, cfgno, bigbuffer, length); if (result < 0) { dev_err(ddev, "unable to read config index %d " "descriptor/%s\n", cfgno, "all"); kfree(bigbuffer); goto err; } if (result < length) { dev_notice(ddev, "config index %d descriptor too short " "(expected %i, got %i)\n", cfgno, length, result); length = result; } dev->rawdescriptors[cfgno] = bigbuffer; result = usb_parse_configuration(dev, cfgno, &dev->config[cfgno], bigbuffer, length); if (result < 0) { ++cfgno; goto err; } } err: kfree(desc); dev->descriptor.bNumConfigurations = cfgno; return result; } void usb_release_bos_descriptor(struct usb_device *dev) { if (dev->bos) { kfree(dev->bos->desc); kfree(dev->bos); dev->bos = NULL; } } static const __u8 bos_desc_len[256] = { [USB_CAP_TYPE_WIRELESS_USB] = USB_DT_USB_WIRELESS_CAP_SIZE, [USB_CAP_TYPE_EXT] = USB_DT_USB_EXT_CAP_SIZE, [USB_SS_CAP_TYPE] = USB_DT_USB_SS_CAP_SIZE, [USB_SSP_CAP_TYPE] = USB_DT_USB_SSP_CAP_SIZE(1), [CONTAINER_ID_TYPE] = USB_DT_USB_SS_CONTN_ID_SIZE, [USB_PTM_CAP_TYPE] = USB_DT_USB_PTM_ID_SIZE, }; /* Get BOS descriptor set */ int usb_get_bos_descriptor(struct usb_device *dev) { struct device *ddev = &dev->dev; struct usb_bos_descriptor *bos; struct usb_dev_cap_header *cap; struct usb_ssp_cap_descriptor *ssp_cap; unsigned char *buffer, *buffer0; int length, total_len, num, i, ssac; __u8 cap_type; int ret; if (dev->quirks & USB_QUIRK_NO_BOS) { dev_dbg(ddev, "skipping BOS descriptor\n"); return -ENOMSG; } bos = kzalloc_obj(*bos); if (!bos) return -ENOMEM; /* Get BOS 
descriptor */ ret = usb_get_descriptor(dev, USB_DT_BOS, 0, bos, USB_DT_BOS_SIZE); if (ret < USB_DT_BOS_SIZE || bos->bLength < USB_DT_BOS_SIZE) { dev_notice(ddev, "unable to get BOS descriptor or descriptor too short\n"); if (ret >= 0) ret = -ENOMSG; kfree(bos); return ret; } length = bos->bLength; total_len = le16_to_cpu(bos->wTotalLength); num = bos->bNumDeviceCaps; kfree(bos); if (total_len < length) return -EINVAL; dev->bos = kzalloc_obj(*dev->bos); if (!dev->bos) return -ENOMEM; /* Now let's get the whole BOS descriptor set */ buffer = kzalloc(total_len, GFP_KERNEL); if (!buffer) { ret = -ENOMEM; goto err; } dev->bos->desc = (struct usb_bos_descriptor *)buffer; ret = usb_get_descriptor(dev, USB_DT_BOS, 0, buffer, total_len); if (ret < total_len) { dev_notice(ddev, "unable to get BOS descriptor set\n"); if (ret >= 0) ret = -ENOMSG; goto err; } buffer0 = buffer; total_len -= length; buffer += length; for (i = 0; i < num; i++) { cap = (struct usb_dev_cap_header *)buffer; if (total_len < sizeof(*cap) || total_len < cap->bLength) { dev->bos->desc->bNumDeviceCaps = i; break; } cap_type = cap->bDevCapabilityType; length = cap->bLength; if (bos_desc_len[cap_type] && length < bos_desc_len[cap_type]) { dev->bos->desc->bNumDeviceCaps = i; break; } if (cap->bDescriptorType != USB_DT_DEVICE_CAPABILITY) { dev_notice(ddev, "descriptor type invalid, skip\n"); goto skip_to_next_descriptor; } switch (cap_type) { case USB_CAP_TYPE_EXT: dev->bos->ext_cap = (struct usb_ext_cap_descriptor *)buffer; break; case USB_SS_CAP_TYPE: dev->bos->ss_cap = (struct usb_ss_cap_descriptor *)buffer; break; case USB_SSP_CAP_TYPE: ssp_cap = (struct usb_ssp_cap_descriptor *)buffer; ssac = (le32_to_cpu(ssp_cap->bmAttributes) & USB_SSP_SUBLINK_SPEED_ATTRIBS); if (length >= USB_DT_USB_SSP_CAP_SIZE(ssac)) dev->bos->ssp_cap = ssp_cap; break; case CONTAINER_ID_TYPE: dev->bos->ss_id = (struct usb_ss_container_id_descriptor *)buffer; break; case USB_PTM_CAP_TYPE: dev->bos->ptm_cap = (struct usb_ptm_cap_descriptor *)buffer; break; default: break; } skip_to_next_descriptor: total_len -= length; buffer += length; } dev->bos->desc->wTotalLength = cpu_to_le16(buffer - buffer0); return 0; err: usb_release_bos_descriptor(dev); return ret; } |
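/*
 * Illustrative sketch, not part of the kernel source above: the parsing
 * loops in usb_parse_configuration() and usb_get_bos_descriptor() both walk
 * a length/type-prefixed descriptor blob while rejecting bLength values that
 * are shorter than the header or longer than the bytes remaining. The
 * stand-alone user-space program below (hypothetical names walk_descriptors
 * and desc_header) shows the same bounds-checking pattern in isolation.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct desc_header {		/* mirrors struct usb_descriptor_header */
	uint8_t bLength;
	uint8_t bDescriptorType;
};

static void walk_descriptors(const uint8_t *buf, size_t size)
{
	while (size >= sizeof(struct desc_header)) {
		const struct desc_header *h = (const void *)buf;

		/* A descriptor may never be shorter than its own header or
		 * longer than the bytes that remain in the buffer. */
		if (h->bLength < sizeof(*h) || h->bLength > size) {
			fprintf(stderr, "invalid bLength %u, stopping\n",
				h->bLength);
			return;
		}
		printf("type 0x%02x, length %u\n",
		       h->bDescriptorType, h->bLength);
		buf += h->bLength;
		size -= h->bLength;
	}
	if (size)
		fprintf(stderr, "%zu trailing byte(s) ignored\n", size);
}

int main(void)
{
	/* A 9-byte configuration header, a 9-byte interface descriptor,
	 * and one excess byte that the walker reports and ignores. */
	static const uint8_t blob[] = {
		9, 0x02, 25, 0, 1, 1, 0, 0x80, 50,
		9, 0x04, 0, 0, 1, 0xff, 0, 0, 0,
		7,
	};

	walk_descriptors(blob, sizeof(blob));
	return 0;
}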
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PAGEMAP_H
#define _LINUX_PAGEMAP_H

/*
 * Copyright 1995 Linus Torvalds
 */
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/compiler.h>
#include <linux/uaccess.h> #include <linux/gfp.h> #include <linux/bitops.h> #include <linux/hardirq.h> /* for in_interrupt() */ #include <linux/hugetlb_inline.h> struct folio_batch; unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end); static inline void invalidate_remote_inode(struct inode *inode) { if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) invalidate_mapping_pages(inode->i_mapping, 0, -1); } int invalidate_inode_pages2(struct address_space *mapping); int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end); int kiocb_invalidate_pages(struct kiocb *iocb, size_t count); void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count); int filemap_invalidate_pages(struct address_space *mapping, loff_t pos, loff_t end, bool nowait); int write_inode_now(struct inode *, int sync); int filemap_fdatawrite(struct address_space *); int filemap_flush(struct address_space *); int filemap_flush_nr(struct address_space *mapping, long *nr_to_write); int filemap_fdatawait_keep_errors(struct address_space *mapping); int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend); int filemap_fdatawait_range_keep_errors(struct address_space *mapping, loff_t start_byte, loff_t end_byte); int filemap_invalidate_inode(struct inode *inode, bool flush, loff_t start, loff_t end); static inline int filemap_fdatawait(struct address_space *mapping) { return filemap_fdatawait_range(mapping, 0, LLONG_MAX); } bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend); int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end); int filemap_check_errors(struct address_space *mapping); void __filemap_set_wb_err(struct address_space *mapping, int err); int kiocb_write_and_wait(struct kiocb *iocb, size_t count); static inline int filemap_write_and_wait(struct address_space *mapping) { return filemap_write_and_wait_range(mapping, 0, LLONG_MAX); } /** * filemap_set_wb_err - set a writeback error on an address_space * @mapping: mapping in which to set writeback error * @err: error to be set in mapping * * When writeback fails in some way, we must record that error so that * userspace can be informed when fsync and the like are called. We endeavor * to report errors on any file that was open at the time of the error. Some * internal callers also need to know when writeback errors have occurred. * * When a writeback error occurs, most filesystems will want to call * filemap_set_wb_err to record the error in the mapping so that it will be * automatically reported whenever fsync is called on the file. */ static inline void filemap_set_wb_err(struct address_space *mapping, int err) { /* Fastpath for common case of no error */ if (unlikely(err)) __filemap_set_wb_err(mapping, err); } /** * filemap_check_wb_err - has an error occurred since the mark was sampled? * @mapping: mapping to check for writeback errors * @since: previously-sampled errseq_t * * Grab the errseq_t value from the mapping, and see if it has changed "since" * the given value was sampled. * * If it has then report the latest error set, otherwise return 0. 
 */
static inline int filemap_check_wb_err(struct address_space *mapping,
					errseq_t since)
{
	return errseq_check(&mapping->wb_err, since);
}

/**
 * filemap_sample_wb_err - sample the current errseq_t to test for later errors
 * @mapping: mapping to be sampled
 *
 * Writeback errors are always reported relative to a particular sample point
 * in the past. This function provides those sample points.
 */
static inline errseq_t filemap_sample_wb_err(struct address_space *mapping)
{
	return errseq_sample(&mapping->wb_err);
}

/**
 * file_sample_sb_err - sample the current errseq_t to test for later errors
 * @file: file pointer to be sampled
 *
 * Grab the most current superblock-level errseq_t value for the given
 * struct file.
 */
static inline errseq_t file_sample_sb_err(struct file *file)
{
	return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err);
}

/*
 * Flush file data before changing attributes. Caller must hold any locks
 * required to prevent further writes to this file until we're done setting
 * flags.
 */
static inline int inode_drain_writes(struct inode *inode)
{
	inode_dio_wait(inode);
	return filemap_write_and_wait(inode->i_mapping);
}

static inline bool mapping_empty(const struct address_space *mapping)
{
	return xa_empty(&mapping->i_pages);
}

/*
 * mapping_shrinkable - test if page cache state allows inode reclaim
 * @mapping: the page cache mapping
 *
 * This checks the mapping's cache state for the purpose of inode
 * reclaim and LRU management.
 *
 * The caller is expected to hold the i_lock, but is not required to
 * hold the i_pages lock, which usually protects cache state. That's
 * because the i_lock and the list_lru lock that protect the inode and
 * its LRU state don't nest inside the irq-safe i_pages lock.
 *
 * Cache deletions are performed under the i_lock, which ensures that
 * when an inode goes empty, it will reliably get queued on the LRU.
 *
 * Cache additions do not acquire the i_lock and may race with this
 * check, in which case we'll report the inode as shrinkable when it
 * has cache pages. This is okay: the shrinker also checks the
 * refcount and the referenced bit, which will be elevated or set in
 * the process of adding new cache pages to an inode.
 */
static inline bool mapping_shrinkable(const struct address_space *mapping)
{
	void *head;

	/*
	 * On highmem systems, there could be lowmem pressure from the
	 * inodes before there is highmem pressure from the page
	 * cache. Make inodes shrinkable regardless of cache state.
	 */
	if (IS_ENABLED(CONFIG_HIGHMEM))
		return true;

	/* Cache completely empty? Shrink away. */
	head = rcu_access_pointer(mapping->i_pages.xa_head);
	if (!head)
		return true;

	/*
	 * The xarray stores single offset-0 entries directly in the
	 * head pointer, which allows non-resident page cache entries
	 * to escape the shadow shrinker's list of xarray nodes. The
	 * inode shrinker needs to pick them up under memory pressure.
	 */
	if (!xa_is_node(head) && xa_is_value(head))
		return true;

	return false;
}

/*
 * Bits in mapping->flags.
*/ enum mapping_flags { AS_EIO = 0, /* IO error on async write */ AS_ENOSPC = 1, /* ENOSPC on async write */ AS_MM_ALL_LOCKS = 2, /* under mm_take_all_locks() */ AS_UNEVICTABLE = 3, /* e.g., ramdisk, SHM_LOCK */ AS_EXITING = 4, /* final truncate in progress */ /* writeback related tags are not used */ AS_NO_WRITEBACK_TAGS = 5, AS_RELEASE_ALWAYS = 6, /* Call ->release_folio(), even if no private data */ AS_STABLE_WRITES = 7, /* must wait for writeback before modifying folio contents */ AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */ AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9, AS_KERNEL_FILE = 10, /* mapping for a fake kernel file that shouldn't account usage to user cgroups */ /* Bits 16-25 are used for FOLIO_ORDER */ AS_FOLIO_ORDER_BITS = 5, AS_FOLIO_ORDER_MIN = 16, AS_FOLIO_ORDER_MAX = AS_FOLIO_ORDER_MIN + AS_FOLIO_ORDER_BITS, }; #define AS_FOLIO_ORDER_BITS_MASK ((1u << AS_FOLIO_ORDER_BITS) - 1) #define AS_FOLIO_ORDER_MIN_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MIN) #define AS_FOLIO_ORDER_MAX_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MAX) #define AS_FOLIO_ORDER_MASK (AS_FOLIO_ORDER_MIN_MASK | AS_FOLIO_ORDER_MAX_MASK) /** * mapping_set_error - record a writeback error in the address_space * @mapping: the mapping in which an error should be set * @error: the error to set in the mapping * * When writeback fails in some way, we must record that error so that * userspace can be informed when fsync and the like are called. We endeavor * to report errors on any file that was open at the time of the error. Some * internal callers also need to know when writeback errors have occurred. * * When a writeback error occurs, most filesystems will want to call * mapping_set_error to record the error in the mapping so that it can be * reported when the application calls fsync(2). 
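 *
 * An illustrative sketch (hypothetical completion handler; bio and
 * blk_status_to_errno() come from the block layer, not from this header):
 * a bio-based filesystem's writeback completion path might record a
 * failure roughly like this:
 *
 *	if (bio->bi_status)
 *		mapping_set_error(folio->mapping,
 *				  blk_status_to_errno(bio->bi_status));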
*/ static inline void mapping_set_error(struct address_space *mapping, int error) { if (likely(!error)) return; /* Record in wb_err for checkers using errseq_t based tracking */ __filemap_set_wb_err(mapping, error); /* Record it in superblock */ if (mapping->host) errseq_set(&mapping->host->i_sb->s_wb_err, error); /* Record it in flags for now, for legacy callers */ if (error == -ENOSPC) set_bit(AS_ENOSPC, &mapping->flags); else set_bit(AS_EIO, &mapping->flags); } static inline void mapping_set_unevictable(struct address_space *mapping) { set_bit(AS_UNEVICTABLE, &mapping->flags); } static inline void mapping_clear_unevictable(struct address_space *mapping) { clear_bit(AS_UNEVICTABLE, &mapping->flags); } static inline bool mapping_unevictable(const struct address_space *mapping) { return mapping && test_bit(AS_UNEVICTABLE, &mapping->flags); } static inline void mapping_set_exiting(struct address_space *mapping) { set_bit(AS_EXITING, &mapping->flags); } static inline int mapping_exiting(const struct address_space *mapping) { return test_bit(AS_EXITING, &mapping->flags); } static inline void mapping_set_no_writeback_tags(struct address_space *mapping) { set_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags); } static inline int mapping_use_writeback_tags(const struct address_space *mapping) { return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags); } static inline bool mapping_release_always(const struct address_space *mapping) { return test_bit(AS_RELEASE_ALWAYS, &mapping->flags); } static inline void mapping_set_release_always(struct address_space *mapping) { set_bit(AS_RELEASE_ALWAYS, &mapping->flags); } static inline void mapping_clear_release_always(struct address_space *mapping) { clear_bit(AS_RELEASE_ALWAYS, &mapping->flags); } static inline bool mapping_stable_writes(const struct address_space *mapping) { return test_bit(AS_STABLE_WRITES, &mapping->flags); } static inline void mapping_set_stable_writes(struct address_space *mapping) { set_bit(AS_STABLE_WRITES, &mapping->flags); } static inline void mapping_clear_stable_writes(struct address_space *mapping) { clear_bit(AS_STABLE_WRITES, &mapping->flags); } static inline void mapping_set_inaccessible(struct address_space *mapping) { /* * It's expected inaccessible mappings are also unevictable. Compaction * migrate scanner (isolate_migratepages_block()) relies on this to * reduce page locking. */ set_bit(AS_UNEVICTABLE, &mapping->flags); set_bit(AS_INACCESSIBLE, &mapping->flags); } static inline bool mapping_inaccessible(const struct address_space *mapping) { return test_bit(AS_INACCESSIBLE, &mapping->flags); } static inline void mapping_set_writeback_may_deadlock_on_reclaim(struct address_space *mapping) { set_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags); } static inline bool mapping_writeback_may_deadlock_on_reclaim(const struct address_space *mapping) { return test_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags); } static inline gfp_t mapping_gfp_mask(const struct address_space *mapping) { return mapping->gfp_mask; } /* Restricts the given gfp_mask to what the mapping allows. */ static inline gfp_t mapping_gfp_constraint(const struct address_space *mapping, gfp_t gfp_mask) { return mapping_gfp_mask(mapping) & gfp_mask; } /* * This is non-atomic. Only to be used before the mapping is activated. * Probably needs a barrier... 
*/ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) { m->gfp_mask = mask; } /* * There are some parts of the kernel which assume that PMD entries * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then, * limit the maximum allocation order to PMD size. I'm not aware of any * assumptions about maximum order if THP are disabled, but 8 seems like * a good order (that's 1MB if you're using 4kB pages) */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define PREFERRED_MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER #else #define PREFERRED_MAX_PAGECACHE_ORDER 8 #endif /* * xas_split_alloc() does not support arbitrary orders. This implies no * 512MB THP on ARM64 with 64KB base page size. */ #define MAX_XAS_ORDER (XA_CHUNK_SHIFT * 2 - 1) #define MAX_PAGECACHE_ORDER min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER) /* * mapping_max_folio_size_supported() - Check the max folio size supported * * The filesystem should call this function at mount time if there is a * requirement on the folio mapping size in the page cache. */ static inline size_t mapping_max_folio_size_supported(void) { if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return 1U << (PAGE_SHIFT + MAX_PAGECACHE_ORDER); return PAGE_SIZE; } /* * mapping_set_folio_order_range() - Set the orders supported by a file. * @mapping: The address space of the file. * @min: Minimum folio order (between 0-MAX_PAGECACHE_ORDER inclusive). * @max: Maximum folio order (between @min-MAX_PAGECACHE_ORDER inclusive). * * The filesystem should call this function in its inode constructor to * indicate which base size (min) and maximum size (max) of folio the VFS * can use to cache the contents of the file. This should only be used * if the filesystem needs special handling of folio sizes (ie there is * something the core cannot know). * Do not tune it based on, eg, i_size. * * Context: This should not be called while the inode is active as it * is non-atomic. */ static inline void mapping_set_folio_order_range(struct address_space *mapping, unsigned int min, unsigned int max) { if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return; if (min > MAX_PAGECACHE_ORDER) min = MAX_PAGECACHE_ORDER; if (max > MAX_PAGECACHE_ORDER) max = MAX_PAGECACHE_ORDER; if (max < min) max = min; mapping->flags = (mapping->flags & ~AS_FOLIO_ORDER_MASK) | (min << AS_FOLIO_ORDER_MIN) | (max << AS_FOLIO_ORDER_MAX); } static inline void mapping_set_folio_min_order(struct address_space *mapping, unsigned int min) { mapping_set_folio_order_range(mapping, min, MAX_PAGECACHE_ORDER); } /** * mapping_set_large_folios() - Indicate the file supports large folios. * @mapping: The address space of the file. * * The filesystem should call this function in its inode constructor to * indicate that the VFS can use large folios to cache the contents of * the file. * * Context: This should not be called while the inode is active as it * is non-atomic. 
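 *
 * As an illustrative sketch (a hypothetical call site, not mandated by this
 * header), a filesystem that supports large folios typically makes this
 * call once while setting up a new inode:
 *
 *	mapping_set_large_folios(inode->i_mapping);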
*/ static inline void mapping_set_large_folios(struct address_space *mapping) { mapping_set_folio_order_range(mapping, 0, MAX_PAGECACHE_ORDER); } static inline unsigned int mapping_max_folio_order(const struct address_space *mapping) { if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return 0; return (mapping->flags & AS_FOLIO_ORDER_MAX_MASK) >> AS_FOLIO_ORDER_MAX; } static inline unsigned int mapping_min_folio_order(const struct address_space *mapping) { if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return 0; return (mapping->flags & AS_FOLIO_ORDER_MIN_MASK) >> AS_FOLIO_ORDER_MIN; } static inline unsigned long mapping_min_folio_nrpages(const struct address_space *mapping) { return 1UL << mapping_min_folio_order(mapping); } static inline unsigned long mapping_min_folio_nrbytes(const struct address_space *mapping) { return mapping_min_folio_nrpages(mapping) << PAGE_SHIFT; } /** * mapping_align_index() - Align index for this mapping. * @mapping: The address_space. * @index: The page index. * * The index of a folio must be naturally aligned. If you are adding a * new folio to the page cache and need to know what index to give it, * call this function. */ static inline pgoff_t mapping_align_index(const struct address_space *mapping, pgoff_t index) { return round_down(index, mapping_min_folio_nrpages(mapping)); } /* * Large folio support currently depends on THP. These dependencies are * being worked on but are not yet fixed. */ static inline bool mapping_large_folio_support(const struct address_space *mapping) { /* AS_FOLIO_ORDER is only reasonable for pagecache folios */ VM_WARN_ONCE((unsigned long)mapping & FOLIO_MAPPING_ANON, "Anonymous mapping always supports large folio"); return mapping_max_folio_order(mapping) > 0; } /* Return the maximum folio size for this pagecache mapping, in bytes. */ static inline size_t mapping_max_folio_size(const struct address_space *mapping) { return PAGE_SIZE << mapping_max_folio_order(mapping); } static inline int filemap_nr_thps(const struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS return atomic_read(&mapping->nr_thps); #else return 0; #endif } static inline void filemap_nr_thps_inc(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS if (!mapping_large_folio_support(mapping)) atomic_inc(&mapping->nr_thps); #else WARN_ON_ONCE(mapping_large_folio_support(mapping) == 0); #endif } static inline void filemap_nr_thps_dec(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS if (!mapping_large_folio_support(mapping)) atomic_dec(&mapping->nr_thps); #else WARN_ON_ONCE(mapping_large_folio_support(mapping) == 0); #endif } struct address_space *folio_mapping(const struct folio *folio); /** * folio_flush_mapping - Find the file mapping this folio belongs to. * @folio: The folio. * * For folios which are in the page cache, return the mapping that this * page belongs to. Anonymous folios return NULL, even if they're in * the swap cache. Other kinds of folio also return NULL. * * This is ONLY used by architecture cache flushing code. If you aren't * writing cache flushing code, you want either folio_mapping() or * folio_file_mapping(). */ static inline struct address_space *folio_flush_mapping(struct folio *folio) { if (unlikely(folio_test_swapcache(folio))) return NULL; return folio_mapping(folio); } /** * folio_inode - Get the host inode for this folio. * @folio: The folio. * * For folios which are in the page cache, return the inode that this folio * belongs to. 
* * Do not call this for folios which aren't in the page cache. */ static inline struct inode *folio_inode(struct folio *folio) { return folio->mapping->host; } /** * folio_attach_private - Attach private data to a folio. * @folio: Folio to attach data to. * @data: Data to attach to folio. * * Attaching private data to a folio increments the page's reference count. * The data must be detached before the folio will be freed. */ static inline void folio_attach_private(struct folio *folio, void *data) { folio_get(folio); folio->private = data; folio_set_private(folio); } /** * folio_change_private - Change private data on a folio. * @folio: Folio to change the data on. * @data: Data to set on the folio. * * Change the private data attached to a folio and return the old * data. The page must previously have had data attached and the data * must be detached before the folio will be freed. * * Return: Data that was previously attached to the folio. */ static inline void *folio_change_private(struct folio *folio, void *data) { void *old = folio_get_private(folio); folio->private = data; return old; } /** * folio_detach_private - Detach private data from a folio. * @folio: Folio to detach data from. * * Removes the data that was previously attached to the folio and decrements * the refcount on the page. * * Return: Data that was attached to the folio. */ static inline void *folio_detach_private(struct folio *folio) { void *data = folio_get_private(folio); if (!folio_test_private(folio)) return NULL; folio_clear_private(folio); folio->private = NULL; folio_put(folio); return data; } static inline void attach_page_private(struct page *page, void *data) { folio_attach_private(page_folio(page), data); } static inline void *detach_page_private(struct page *page) { return folio_detach_private(page_folio(page)); } #ifdef CONFIG_NUMA struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order, struct mempolicy *policy); #else static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order, struct mempolicy *policy) { return folio_alloc_noprof(gfp, order); } #endif #define filemap_alloc_folio(...) \ alloc_hooks(filemap_alloc_folio_noprof(__VA_ARGS__)) static inline struct page *__page_cache_alloc(gfp_t gfp) { return &filemap_alloc_folio(gfp, 0, NULL)->page; } static inline gfp_t readahead_gfp_mask(struct address_space *x) { return mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN; } typedef int filler_t(struct file *, struct folio *); pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan); pgoff_t page_cache_prev_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan); /** * typedef fgf_t - Flags for getting folios from the page cache. * * Most users of the page cache will not need to use these flags; * there are convenience functions such as filemap_get_folio() and * filemap_lock_folio(). For users which need more control over exactly * what is done with the folios, these flags to __filemap_get_folio() * are available. * * * %FGP_ACCESSED - The folio will be marked accessed. * * %FGP_LOCK - The folio is returned locked. * * %FGP_CREAT - If no folio is present then a new folio is allocated, * added to the page cache and the VM's LRU list. The folio is * returned locked. * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the * folio is already in cache. If the folio was allocated, unlock it * before returning so the caller can do the same dance. 
* * %FGP_WRITE - The folio will be written to by the caller. * * %FGP_NOFS - __GFP_FS will get cleared in gfp. * * %FGP_NOWAIT - Don't block on the folio lock. * * %FGP_STABLE - Wait for the folio to be stable (finished writeback) * * %FGP_DONTCACHE - Uncached buffered IO * * %FGP_WRITEBEGIN - The flags to use in a filesystem write_begin() * implementation. */ typedef unsigned int __bitwise fgf_t; #define FGP_ACCESSED ((__force fgf_t)0x00000001) #define FGP_LOCK ((__force fgf_t)0x00000002) #define FGP_CREAT ((__force fgf_t)0x00000004) #define FGP_WRITE ((__force fgf_t)0x00000008) #define FGP_NOFS ((__force fgf_t)0x00000010) #define FGP_NOWAIT ((__force fgf_t)0x00000020) #define FGP_FOR_MMAP ((__force fgf_t)0x00000040) #define FGP_STABLE ((__force fgf_t)0x00000080) #define FGP_DONTCACHE ((__force fgf_t)0x00000100) #define FGF_GET_ORDER(fgf) (((__force unsigned)fgf) >> 26) /* top 6 bits */ #define FGP_WRITEBEGIN (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE) static inline unsigned int filemap_get_order(size_t size) { unsigned int shift = ilog2(size); if (shift <= PAGE_SHIFT) return 0; return shift - PAGE_SHIFT; } /** * fgf_set_order - Encode a length in the fgf_t flags. * @size: The suggested size of the folio to create. * * The caller of __filemap_get_folio() can use this to suggest a preferred * size for the folio that is created. If there is already a folio at * the index, it will be returned, no matter what its size. If a folio * is freshly created, it may be of a different size than requested * due to alignment constraints, memory pressure, or the presence of * other folios at nearby indices. */ static inline fgf_t fgf_set_order(size_t size) { unsigned int order = filemap_get_order(size); if (!order) return 0; return (__force fgf_t)(order << 26); } void *filemap_get_entry(struct address_space *mapping, pgoff_t index); struct folio *__filemap_get_folio_mpol(struct address_space *mapping, pgoff_t index, fgf_t fgf_flags, gfp_t gfp, struct mempolicy *policy); struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, fgf_t fgp_flags, gfp_t gfp); static inline struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, fgf_t fgf_flags, gfp_t gfp) { return __filemap_get_folio_mpol(mapping, index, fgf_flags, gfp, NULL); } /** * write_begin_get_folio - Get folio for write_begin with flags. * @iocb: The kiocb passed from write_begin (may be NULL). * @mapping: The address space to search. * @index: The page cache index. * @len: Length of data being written. * * This is a helper for filesystem write_begin() implementations. * It wraps __filemap_get_folio(), setting appropriate flags in * the write begin context. * * Return: A folio or an ERR_PTR. */ static inline struct folio *write_begin_get_folio(const struct kiocb *iocb, struct address_space *mapping, pgoff_t index, size_t len) { fgf_t fgp_flags = FGP_WRITEBEGIN; fgp_flags |= fgf_set_order(len); if (iocb && iocb->ki_flags & IOCB_DONTCACHE) fgp_flags |= FGP_DONTCACHE; return __filemap_get_folio(mapping, index, fgp_flags, mapping_gfp_mask(mapping)); } /** * filemap_get_folio - Find and get a folio. * @mapping: The address_space to search. * @index: The page index. * * Looks up the page cache entry at @mapping & @index. If a folio is * present, it is returned with an increased refcount. * * Return: A folio or ERR_PTR(-ENOENT) if there is no folio in the cache for * this index. Will not return a shadow, swap or DAX entry. 
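 *
 * A minimal usage sketch (the surrounding variables are assumed; only the
 * error handling shown here is implied by the return convention):
 *
 *	folio = filemap_get_folio(mapping, index);
 *	if (IS_ERR(folio))
 *		return PTR_ERR(folio);
 *	... use the folio ...
 *	folio_put(folio);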
*/ static inline struct folio *filemap_get_folio(struct address_space *mapping, pgoff_t index) { return __filemap_get_folio(mapping, index, 0, 0); } /** * filemap_lock_folio - Find and lock a folio. * @mapping: The address_space to search. * @index: The page index. * * Looks up the page cache entry at @mapping & @index. If a folio is * present, it is returned locked with an increased refcount. * * Context: May sleep. * Return: A folio or ERR_PTR(-ENOENT) if there is no folio in the cache for * this index. Will not return a shadow, swap or DAX entry. */ static inline struct folio *filemap_lock_folio(struct address_space *mapping, pgoff_t index) { return __filemap_get_folio(mapping, index, FGP_LOCK, 0); } /** * filemap_grab_folio - grab a folio from the page cache * @mapping: The address space to search * @index: The page index * * Looks up the page cache entry at @mapping & @index. If no folio is found, * a new folio is created. The folio is locked, marked as accessed, and * returned. * * Return: A found or created folio. ERR_PTR(-ENOMEM) if no folio is found * and failed to create a folio. */ static inline struct folio *filemap_grab_folio(struct address_space *mapping, pgoff_t index) { return __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mapping_gfp_mask(mapping)); } /** * find_get_page - find and get a page reference * @mapping: the address_space to search * @offset: the page index * * Looks up the page cache slot at @mapping & @offset. If there is a * page cache page, it is returned with an increased refcount. * * Otherwise, %NULL is returned. */ static inline struct page *find_get_page(struct address_space *mapping, pgoff_t offset) { return pagecache_get_page(mapping, offset, 0, 0); } static inline struct page *find_get_page_flags(struct address_space *mapping, pgoff_t offset, fgf_t fgp_flags) { return pagecache_get_page(mapping, offset, fgp_flags, 0); } /** * find_lock_page - locate, pin and lock a pagecache page * @mapping: the address_space to search * @index: the page index * * Looks up the page cache entry at @mapping & @index. If there is a * page cache page, it is returned locked and with an increased * refcount. * * Context: May sleep. * Return: A struct page or %NULL if there is no page in the cache for this * index. */ static inline struct page *find_lock_page(struct address_space *mapping, pgoff_t index) { return pagecache_get_page(mapping, index, FGP_LOCK, 0); } /** * find_or_create_page - locate or add a pagecache page * @mapping: the page's address_space * @index: the page's index into the mapping * @gfp_mask: page allocation mode * * Looks up the page cache slot at @mapping & @offset. If there is a * page cache page, it is returned locked and with an increased * refcount. * * If the page is not present, a new page is allocated using @gfp_mask * and added to the page cache and the VM's LRU list. The page is * returned locked and with an increased refcount. * * On memory exhaustion, %NULL is returned. * * find_or_create_page() may sleep, even if @gfp_flags specifies an * atomic allocation! 
*/ static inline struct page *find_or_create_page(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask) { return pagecache_get_page(mapping, index, FGP_LOCK|FGP_ACCESSED|FGP_CREAT, gfp_mask); } /** * grab_cache_page_nowait - returns locked page at given index in given cache * @mapping: target address_space * @index: the page index * * Returns locked page at given index in given cache, creating it if * needed, but do not wait if the page is locked or to reclaim memory. * This is intended for speculative data generators, where the data can * be regenerated if the page couldn't be grabbed. This routine should * be safe to call while holding the lock for another page. * * Clear __GFP_FS when allocating the page to avoid recursion into the fs * and deadlock against the caller's locked page. */ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) { return pagecache_get_page(mapping, index, FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT, mapping_gfp_mask(mapping)); } /** * folio_next_index - Get the index of the next folio. * @folio: The current folio. * * Return: The index of the folio which follows this folio in the file. */ static inline pgoff_t folio_next_index(const struct folio *folio) { return folio->index + folio_nr_pages(folio); } /** * folio_next_pos - Get the file position of the next folio. * @folio: The current folio. * * Return: The position of the folio which follows this folio in the file. */ static inline loff_t folio_next_pos(const struct folio *folio) { return (loff_t)folio_next_index(folio) << PAGE_SHIFT; } /** * folio_file_page - The page for a particular index. * @folio: The folio which contains this index. * @index: The index we want to look up. * * Sometimes after looking up a folio in the page cache, we need to * obtain the specific page for an index (eg a page fault). * * Return: The page containing the file data for this index. */ static inline struct page *folio_file_page(struct folio *folio, pgoff_t index) { return folio_page(folio, index & (folio_nr_pages(folio) - 1)); } /** * folio_contains - Does this folio contain this index? * @folio: The folio. * @index: The page index within the file. * * Context: The caller should have the folio locked and ensure * e.g., shmem did not move this folio to the swap cache. * Return: true or false. 
*/ static inline bool folio_contains(const struct folio *folio, pgoff_t index) { VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); return index - folio->index < folio_nr_pages(folio); } unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); unsigned filemap_get_folios_contig(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch); unsigned filemap_get_folios_dirty(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); struct folio *read_cache_folio(struct address_space *, pgoff_t index, filler_t *filler, struct file *file); struct folio *mapping_read_folio_gfp(struct address_space *, pgoff_t index, gfp_t flags); struct page *read_cache_page(struct address_space *, pgoff_t index, filler_t *filler, struct file *file); extern struct page * read_cache_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); static inline struct page *read_mapping_page(struct address_space *mapping, pgoff_t index, struct file *file) { return read_cache_page(mapping, index, NULL, file); } static inline struct folio *read_mapping_folio(struct address_space *mapping, pgoff_t index, struct file *file) { return read_cache_folio(mapping, index, NULL, file); } /** * page_pgoff - Calculate the logical page offset of this page. * @folio: The folio containing this page. * @page: The page which we need the offset of. * * For file pages, this is the offset from the beginning of the file * in units of PAGE_SIZE. For anonymous pages, this is the offset from * the beginning of the anon_vma in units of PAGE_SIZE. This will * return nonsense for KSM pages. * * Context: Caller must have a reference on the folio or otherwise * prevent it from being split or freed. * * Return: The offset in units of PAGE_SIZE. */ static inline pgoff_t page_pgoff(const struct folio *folio, const struct page *page) { return folio->index + folio_page_idx(folio, page); } /** * folio_pos - Returns the byte position of this folio in its file. * @folio: The folio. */ static inline loff_t folio_pos(const struct folio *folio) { return ((loff_t)folio->index) * PAGE_SIZE; } /* * Return byte-offset into filesystem object for page. */ static inline loff_t page_offset(struct page *page) { struct folio *folio = page_folio(page); return folio_pos(folio) + folio_page_idx(folio, page) * PAGE_SIZE; } /* * Get the offset in PAGE_SIZE (even for hugetlb folios). 
*/ static inline pgoff_t folio_pgoff(const struct folio *folio) { return folio->index; } static inline pgoff_t linear_page_index(const struct vm_area_struct *vma, const unsigned long address) { pgoff_t pgoff; pgoff = (address - vma->vm_start) >> PAGE_SHIFT; pgoff += vma->vm_pgoff; return pgoff; } struct wait_page_key { struct folio *folio; int bit_nr; int page_match; }; struct wait_page_queue { struct folio *folio; int bit_nr; wait_queue_entry_t wait; }; static inline bool wake_page_match(struct wait_page_queue *wait_page, struct wait_page_key *key) { if (wait_page->folio != key->folio) return false; key->page_match = 1; if (wait_page->bit_nr != key->bit_nr) return false; return true; } void __folio_lock(struct folio *folio); int __folio_lock_killable(struct folio *folio); vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf); void unlock_page(struct page *page); void folio_unlock(struct folio *folio); /** * folio_trylock() - Attempt to lock a folio. * @folio: The folio to attempt to lock. * * Sometimes it is undesirable to wait for a folio to be unlocked (eg * when the locks are being taken in the wrong order, or if making * progress through a batch of folios is more important than processing * them in order). Usually folio_lock() is the correct function to call. * * Context: Any context. * Return: Whether the lock was successfully acquired. */ static inline bool folio_trylock(struct folio *folio) { return likely(!test_and_set_bit_lock(PG_locked, folio_flags(folio, 0))); } /* * Return true if the page was successfully locked */ static inline bool trylock_page(struct page *page) { return folio_trylock(page_folio(page)); } /** * folio_lock() - Lock this folio. * @folio: The folio to lock. * * The folio lock protects against many things, probably more than it * should. It is primarily held while a folio is being brought uptodate, * either from its backing file or from swap. It is also held while a * folio is being truncated from its address_space, so holding the lock * is sufficient to keep folio->mapping stable. * * The folio lock is also held while write() is modifying the page to * provide POSIX atomicity guarantees (as long as the write does not * cross a page boundary). Other modifications to the data in the folio * do not hold the folio lock and can race with writes, eg DMA and stores * to mapped pages. * * Context: May sleep. If you need to acquire the locks of two or * more folios, they must be in order of ascending index, if they are * in the same address_space. If they are in different address_spaces, * acquire the lock of the folio which belongs to the address_space which * has the lowest address in memory first. */ static inline void folio_lock(struct folio *folio) { might_sleep(); if (!folio_trylock(folio)) __folio_lock(folio); } /** * lock_page() - Lock the folio containing this page. * @page: The page to lock. * * See folio_lock() for a description of what the lock protects. * This is a legacy function and new code should probably use folio_lock() * instead. * * Context: May sleep. Pages in the same folio share a lock, so do not * attempt to lock two pages which share a folio. */ static inline void lock_page(struct page *page) { struct folio *folio; might_sleep(); folio = page_folio(page); if (!folio_trylock(folio)) __folio_lock(folio); } /** * folio_lock_killable() - Lock this folio, interruptible by a fatal signal. * @folio: The folio to lock. 
* * Attempts to lock the folio, like folio_lock(), except that the sleep * to acquire the lock is interruptible by a fatal signal. * * Context: May sleep; see folio_lock(). * Return: 0 if the lock was acquired; -EINTR if a fatal signal was received. */ static inline int folio_lock_killable(struct folio *folio) { might_sleep(); if (!folio_trylock(folio)) return __folio_lock_killable(folio); return 0; } /* * folio_lock_or_retry - Lock the folio, unless this would block and the * caller indicated that it can handle a retry. * * Return value and mmap_lock implications depend on flags; see * __folio_lock_or_retry(). */ static inline vm_fault_t folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf) { might_sleep(); if (!folio_trylock(folio)) return __folio_lock_or_retry(folio, vmf); return 0; } /* * This is exported only for folio_wait_locked/folio_wait_writeback, etc., * and should not be used directly. */ void folio_wait_bit(struct folio *folio, int bit_nr); int folio_wait_bit_killable(struct folio *folio, int bit_nr); /* * Wait for a folio to be unlocked. * * This must be called with the caller "holding" the folio, * ie with increased folio reference count so that the folio won't * go away during the wait. */ static inline void folio_wait_locked(struct folio *folio) { if (folio_test_locked(folio)) folio_wait_bit(folio, PG_locked); } static inline int folio_wait_locked_killable(struct folio *folio) { if (!folio_test_locked(folio)) return 0; return folio_wait_bit_killable(folio, PG_locked); } void folio_end_read(struct folio *folio, bool success); void wait_on_page_writeback(struct page *page); void folio_wait_writeback(struct folio *folio); int folio_wait_writeback_killable(struct folio *folio); void end_page_writeback(struct page *page); void folio_end_writeback(struct folio *folio); void folio_end_writeback_no_dropbehind(struct folio *folio); void folio_end_dropbehind(struct folio *folio); void folio_wait_stable(struct folio *folio); void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn); void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb); void __folio_cancel_dirty(struct folio *folio); static inline void folio_cancel_dirty(struct folio *folio) { /* Avoid atomic ops, locking, etc. when not actually needed. */ if (folio_test_dirty(folio)) __folio_cancel_dirty(folio); } bool folio_clear_dirty_for_io(struct folio *folio); bool clear_page_dirty_for_io(struct page *page); void folio_invalidate(struct folio *folio, size_t offset, size_t length); bool noop_dirty_folio(struct address_space *mapping, struct folio *folio); #ifdef CONFIG_MIGRATION int filemap_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode); #else #define filemap_migrate_folio NULL #endif void folio_end_private_2(struct folio *folio); void folio_wait_private_2(struct folio *folio); int folio_wait_private_2_killable(struct folio *folio); /* * Fault in userspace address range. 
*/ size_t fault_in_writeable(char __user *uaddr, size_t size); size_t fault_in_subpage_writeable(char __user *uaddr, size_t size); size_t fault_in_safe_writeable(const char __user *uaddr, size_t size); size_t fault_in_readable(const char __user *uaddr, size_t size); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp); int filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp); void filemap_remove_folio(struct folio *folio); void __filemap_remove_folio(struct folio *folio, void *shadow); void replace_page_cache_folio(struct folio *old, struct folio *new); void delete_from_page_cache_batch(struct address_space *mapping, struct folio_batch *fbatch); bool filemap_release_folio(struct folio *folio, gfp_t gfp); loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end, int whence); /* Must be non-static for BPF error injection */ int __filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp); bool filemap_range_has_writeback(struct address_space *mapping, loff_t start_byte, loff_t end_byte); /** * filemap_range_needs_writeback - check if range potentially needs writeback * @mapping: address space within which to check * @start_byte: offset in bytes where the range starts * @end_byte: offset in bytes where the range ends (inclusive) * * Find at least one page in the range supplied, usually used to check if * direct writing in this range will trigger a writeback. Used by O_DIRECT * read/write with IOCB_NOWAIT, to see if the caller needs to do * filemap_write_and_wait_range() before proceeding. * * Return: %true if the caller should do filemap_write_and_wait_range() before * doing O_DIRECT to a page in this range, %false otherwise. */ static inline bool filemap_range_needs_writeback(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { if (!mapping->nrpages) return false; if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) return false; return filemap_range_has_writeback(mapping, start_byte, end_byte); } /** * struct readahead_control - Describes a readahead request. * * A readahead request is for consecutive pages. Filesystems which * implement the ->readahead method should call readahead_folio() or * __readahead_batch() in a loop and attempt to start reads into each * folio in the request. * * Most of the fields in this struct are private and should be accessed * by the functions below. * * @file: The file, used primarily by network filesystems for authentication. * May be NULL if invoked internally by the filesystem. * @mapping: Readahead this filesystem object. * @ra: File readahead state. May be NULL. 
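 *
 * As a sketch (my_fs_read_folio_async() is a hypothetical helper), a
 * minimal ->readahead implementation drains the request with:
 *
 *	while ((folio = readahead_folio(ractl)))
 *		my_fs_read_folio_async(folio);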
*/ struct readahead_control { struct file *file; struct address_space *mapping; struct file_ra_state *ra; /* private: use the readahead_* accessors instead */ pgoff_t _index; unsigned int _nr_pages; unsigned int _batch_count; bool dropbehind; bool _workingset; unsigned long _pflags; }; #define DEFINE_READAHEAD(ractl, f, r, m, i) \ struct readahead_control ractl = { \ .file = f, \ .mapping = m, \ .ra = r, \ ._index = i, \ } #define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) void page_cache_ra_unbounded(struct readahead_control *, unsigned long nr_to_read, unsigned long lookahead_count); void page_cache_sync_ra(struct readahead_control *, unsigned long req_count); void page_cache_async_ra(struct readahead_control *, struct folio *, unsigned long req_count); void readahead_expand(struct readahead_control *ractl, loff_t new_start, size_t new_len); /** * page_cache_sync_readahead - generic file readahead * @mapping: address_space which holds the pagecache and I/O vectors * @ra: file_ra_state which holds the readahead state * @file: Used by the filesystem for authentication. * @index: Index of first page to be read. * @req_count: Total number of pages being read by the caller. * * page_cache_sync_readahead() should be called when a cache miss happened: * it will submit the read. The readahead logic may decide to piggyback more * pages onto the read request if access patterns suggest it will improve * performance. */ static inline void page_cache_sync_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *file, pgoff_t index, unsigned long req_count) { DEFINE_READAHEAD(ractl, file, ra, mapping, index); page_cache_sync_ra(&ractl, req_count); } /** * page_cache_async_readahead - file readahead for marked pages * @mapping: address_space which holds the pagecache and I/O vectors * @ra: file_ra_state which holds the readahead state * @file: Used by the filesystem for authentication. * @folio: The folio which triggered the readahead call. * @req_count: Total number of pages being read by the caller. * * page_cache_async_readahead() should be called when a page is used which * is marked as PageReadahead; this is a marker to suggest that the application * has used up enough of the readahead window that we should start pulling in * more pages. */ static inline void page_cache_async_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *file, struct folio *folio, unsigned long req_count) { DEFINE_READAHEAD(ractl, file, ra, mapping, folio->index); page_cache_async_ra(&ractl, folio, req_count); } static inline struct folio *__readahead_folio(struct readahead_control *ractl) { struct folio *folio; BUG_ON(ractl->_batch_count > ractl->_nr_pages); ractl->_nr_pages -= ractl->_batch_count; ractl->_index += ractl->_batch_count; if (!ractl->_nr_pages) { ractl->_batch_count = 0; return NULL; } folio = xa_load(&ractl->mapping->i_pages, ractl->_index); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); ractl->_batch_count = folio_nr_pages(folio); return folio; } /** * readahead_folio - Get the next folio to read. * @ractl: The current readahead request. * * Context: The folio is locked. The caller should unlock the folio once * all I/O to that folio has completed. * Return: A pointer to the next folio, or %NULL if we are done. 
*/ static inline struct folio *readahead_folio(struct readahead_control *ractl) { struct folio *folio = __readahead_folio(ractl); if (folio) folio_put(folio); return folio; } static inline unsigned int __readahead_batch(struct readahead_control *rac, struct page **array, unsigned int array_sz) { unsigned int i = 0; XA_STATE(xas, &rac->mapping->i_pages, 0); struct folio *folio; BUG_ON(rac->_batch_count > rac->_nr_pages); rac->_nr_pages -= rac->_batch_count; rac->_index += rac->_batch_count; rac->_batch_count = 0; xas_set(&xas, rac->_index); rcu_read_lock(); xas_for_each(&xas, folio, rac->_index + rac->_nr_pages - 1) { if (xas_retry(&xas, folio)) continue; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); array[i++] = folio_page(folio, 0); rac->_batch_count += folio_nr_pages(folio); if (i == array_sz) break; } rcu_read_unlock(); return i; } /** * readahead_pos - The byte offset into the file of this readahead request. * @rac: The readahead request. */ static inline loff_t readahead_pos(const struct readahead_control *rac) { return (loff_t)rac->_index * PAGE_SIZE; } /** * readahead_length - The number of bytes in this readahead request. * @rac: The readahead request. */ static inline size_t readahead_length(const struct readahead_control *rac) { return rac->_nr_pages * PAGE_SIZE; } /** * readahead_index - The index of the first page in this readahead request. * @rac: The readahead request. */ static inline pgoff_t readahead_index(const struct readahead_control *rac) { return rac->_index; } /** * readahead_count - The number of pages in this readahead request. * @rac: The readahead request. */ static inline unsigned int readahead_count(const struct readahead_control *rac) { return rac->_nr_pages; } /** * readahead_batch_length - The number of bytes in the current batch. * @rac: The readahead request. */ static inline size_t readahead_batch_length(const struct readahead_control *rac) { return rac->_batch_count * PAGE_SIZE; } static inline unsigned long dir_pages(const struct inode *inode) { return (unsigned long)(inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; } /** * folio_mkwrite_check_truncate - check if folio was truncated * @folio: the folio to check * @inode: the inode to check the folio against * * Return: the number of bytes in the folio up to EOF, * or -EFAULT if the folio was truncated. */ static inline ssize_t folio_mkwrite_check_truncate(const struct folio *folio, const struct inode *inode) { loff_t size = i_size_read(inode); pgoff_t index = size >> PAGE_SHIFT; size_t offset = offset_in_folio(folio, size); if (!folio->mapping) return -EFAULT; /* folio is wholly inside EOF */ if (folio_next_index(folio) - 1 < index) return folio_size(folio); /* folio is wholly past EOF */ if (folio->index > index || !offset) return -EFAULT; /* folio is partially inside EOF */ return offset; } /** * i_blocks_per_folio - How many blocks fit in this folio. * @inode: The inode which contains the blocks. * @folio: The folio. * * If the block size is larger than the size of this folio, return zero. * * Context: The caller should hold a refcount on the folio to prevent it * from being split. * Return: The number of filesystem blocks covered by this folio. */ static inline unsigned int i_blocks_per_folio(const struct inode *inode, const struct folio *folio) { return folio_size(folio) >> inode->i_blkbits; } #endif /* _LINUX_PAGEMAP_H */
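For illustration, the readahead interfaces above are normally consumed from a filesystem's ->readahead address_space operation. The sketch below is not part of pagemap.h: "myfs" and myfs_read_folio_async() are hypothetical names, and a real filesystem would typically batch folios into larger I/O requests (or use helpers such as iomap or netfs) instead of submitting one read per folio.

#include <linux/fs.h>
#include <linux/pagemap.h>

/* Hypothetical helper: starts asynchronous I/O and unlocks the folio when done. */
static void myfs_read_folio_async(struct inode *inode, struct folio *folio,
				  loff_t pos);

static void myfs_readahead(struct readahead_control *ractl)
{
	struct inode *inode = ractl->mapping->host;
	loff_t pos = readahead_pos(ractl);
	struct folio *folio;

	/* readahead_folio() hands back each folio of the request, locked. */
	while ((folio = readahead_folio(ractl)) != NULL) {
		myfs_read_folio_async(inode, folio, pos);
		pos += folio_size(folio);
	}
}

static const struct address_space_operations myfs_aops = {
	.readahead	= myfs_readahead,
	/* .read_folio, .writepages, ... omitted from this sketch */
};

Any folios the method does not consume are cleaned up by the core readahead code, so the implementation only has to deal with the folios it actually received from readahead_folio().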
// SPDX-License-Identifier: GPL-2.0 /* * udc.c - Core UDC Framework * * Copyright (C) 2010 Texas Instruments * Author: Felipe Balbi <balbi@ti.com> */ #define pr_fmt(fmt) "UDC core: " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/device.h> #include <linux/list.h> #include <linux/idr.h> #include <linux/err.h> #include <linux/dma-mapping.h> #include <linux/sched/task_stack.h> #include <linux/workqueue.h> #include <linux/usb/ch9.h> #include <linux/usb/gadget.h> #include <linux/usb.h> #include "trace.h" static DEFINE_IDA(gadget_id_numbers); static const struct bus_type gadget_bus_type; /** * struct usb_udc - describes one usb device controller * @driver: the gadget driver pointer. For use by the class code * @dev: the child device to the actual controller * @gadget: the gadget. For use by the class code * @list: for use by the udc class driver * @vbus: for udcs who care about vbus status, this value is real vbus status; * for udcs who do not care about vbus status, this value is always true * @started: the UDC's started state. True if the UDC had started. * @allow_connect: Indicates whether UDC is allowed to be pulled up. * Set/cleared by gadget_(un)bind_driver() after gadget driver is bound or * unbound. * @vbus_work: work routine to handle VBUS status change notifications. * @connect_lock: protects udc->started, gadget->connect, * gadget->allow_connect and gadget->deactivate. The routines * usb_gadget_connect_locked(), usb_gadget_disconnect_locked(), * usb_udc_connect_control_locked(), usb_gadget_udc_start_locked() and * usb_gadget_udc_stop_locked() are called with this lock held. * * This represents the internal data structure which is used by the UDC-class * to hold information about udc driver and gadget together.
*/ struct usb_udc { struct usb_gadget_driver *driver; struct usb_gadget *gadget; struct device dev; struct list_head list; bool vbus; bool started; bool allow_connect; struct work_struct vbus_work; struct mutex connect_lock; }; static const struct class udc_class; static LIST_HEAD(udc_list); /* Protects udc_list, udc->driver, driver->is_bound, and related calls */ static DEFINE_MUTEX(udc_lock); /* ------------------------------------------------------------------------- */ /** * usb_ep_set_maxpacket_limit - set maximum packet size limit for endpoint * @ep:the endpoint being configured * @maxpacket_limit:value of maximum packet size limit * * This function should be used only in UDC drivers to initialize endpoint * (usually in probe function). */ void usb_ep_set_maxpacket_limit(struct usb_ep *ep, unsigned maxpacket_limit) { ep->maxpacket_limit = maxpacket_limit; ep->maxpacket = maxpacket_limit; trace_usb_ep_set_maxpacket_limit(ep, 0); } EXPORT_SYMBOL_GPL(usb_ep_set_maxpacket_limit); /** * usb_ep_enable - configure endpoint, making it usable * @ep:the endpoint being configured. may not be the endpoint named "ep0". * drivers discover endpoints through the ep_list of a usb_gadget. * * When configurations are set, or when interface settings change, the driver * will enable or disable the relevant endpoints. while it is enabled, an * endpoint may be used for i/o until the driver receives a disconnect() from * the host or until the endpoint is disabled. * * the ep0 implementation (which calls this routine) must ensure that the * hardware capabilities of each endpoint match the descriptor provided * for it. for example, an endpoint named "ep2in-bulk" would be usable * for interrupt transfers as well as bulk, but it likely couldn't be used * for iso transfers or for endpoint 14. some endpoints are fully * configurable, with more generic names like "ep-a". (remember that for * USB, "in" means "towards the USB host".) * * This routine may be called in an atomic (interrupt) context. * * returns zero, or a negative error code. */ int usb_ep_enable(struct usb_ep *ep) { int ret = 0; if (ep->enabled) goto out; /* UDC drivers can't handle endpoints with maxpacket size 0 */ if (!ep->desc || usb_endpoint_maxp(ep->desc) == 0) { WARN_ONCE(1, "%s: ep%d (%s) has %s\n", __func__, ep->address, ep->name, (!ep->desc) ? "NULL descriptor" : "maxpacket 0"); ret = -EINVAL; goto out; } ret = ep->ops->enable(ep, ep->desc); if (ret) goto out; ep->enabled = true; out: trace_usb_ep_enable(ep, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_enable); /** * usb_ep_disable - endpoint is no longer usable * @ep:the endpoint being unconfigured. may not be the endpoint named "ep0". * * no other task may be using this endpoint when this is called. * any pending and uncompleted requests will complete with status * indicating disconnect (-ESHUTDOWN) before this call returns. * gadget drivers must call usb_ep_enable() again before queueing * requests to the endpoint. * * This routine may be called in an atomic (interrupt) context. * * returns zero, or a negative error code. 
*/ int usb_ep_disable(struct usb_ep *ep) { int ret = 0; if (!ep->enabled) goto out; ret = ep->ops->disable(ep); if (ret) goto out; ep->enabled = false; out: trace_usb_ep_disable(ep, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_disable); /** * usb_ep_alloc_request - allocate a request object to use with this endpoint * @ep:the endpoint to be used with with the request * @gfp_flags:GFP_* flags to use * * Request objects must be allocated with this call, since they normally * need controller-specific setup and may even need endpoint-specific * resources such as allocation of DMA descriptors. * Requests may be submitted with usb_ep_queue(), and receive a single * completion callback. Free requests with usb_ep_free_request(), when * they are no longer needed. * * Returns the request, or null if one could not be allocated. */ struct usb_request *usb_ep_alloc_request(struct usb_ep *ep, gfp_t gfp_flags) { struct usb_request *req = NULL; req = ep->ops->alloc_request(ep, gfp_flags); if (req) req->ep = ep; trace_usb_ep_alloc_request(ep, req, req ? 0 : -ENOMEM); return req; } EXPORT_SYMBOL_GPL(usb_ep_alloc_request); /** * usb_ep_free_request - frees a request object * @ep:the endpoint associated with the request * @req:the request being freed * * Reverses the effect of usb_ep_alloc_request(). * Caller guarantees the request is not queued, and that it will * no longer be requeued (or otherwise used). */ void usb_ep_free_request(struct usb_ep *ep, struct usb_request *req) { trace_usb_ep_free_request(ep, req, 0); ep->ops->free_request(ep, req); } EXPORT_SYMBOL_GPL(usb_ep_free_request); /** * usb_ep_queue - queues (submits) an I/O request to an endpoint. * @ep:the endpoint associated with the request * @req:the request being submitted * @gfp_flags: GFP_* flags to use in case the lower level driver couldn't * pre-allocate all necessary memory with the request. * * This tells the device controller to perform the specified request through * that endpoint (reading or writing a buffer). When the request completes, * including being canceled by usb_ep_dequeue(), the request's completion * routine is called to return the request to the driver. Any endpoint * (except control endpoints like ep0) may have more than one transfer * request queued; they complete in FIFO order. Once a gadget driver * submits a request, that request may not be examined or modified until it * is given back to that driver through the completion callback. * * Each request is turned into one or more packets. The controller driver * never merges adjacent requests into the same packet. OUT transfers * will sometimes use data that's already buffered in the hardware. * Drivers can rely on the fact that the first byte of the request's buffer * always corresponds to the first byte of some USB packet, for both * IN and OUT transfers. * * Bulk endpoints can queue any amount of data; the transfer is packetized * automatically. The last packet will be short if the request doesn't fill it * out completely. Zero length packets (ZLPs) should be avoided in portable * protocols since not all usb hardware can successfully handle zero length * packets. (ZLPs may be explicitly written, and may be implicitly written if * the request 'zero' flag is set.) Bulk endpoints may also be used * for interrupt transfers; but the reverse is not true, and some endpoints * won't support every interrupt transfer. (Such as 768 byte packets.) 
* * Interrupt-only endpoints are less functional than bulk endpoints, for * example by not supporting queueing or not handling buffers that are * larger than the endpoint's maxpacket size. They may also treat data * toggle differently. * * Control endpoints ... after getting a setup() callback, the driver queues * one response (even if it would be zero length). That enables the * status ack, after transferring data as specified in the response. Setup * functions may return negative error codes to generate protocol stalls. * (Note that some USB device controllers disallow protocol stall responses * in some cases.) When control responses are deferred (the response is * written after the setup callback returns), then usb_ep_set_halt() may be * used on ep0 to trigger protocol stalls. Depending on the controller, * it may not be possible to trigger a status-stage protocol stall when the * data stage is over, that is, from within the response's completion * routine. * * For periodic endpoints, like interrupt or isochronous ones, the usb host * arranges to poll once per interval, and the gadget driver usually will * have queued some data to transfer at that time. * * Note that @req's ->complete() callback must never be called from * within usb_ep_queue() as that can create deadlock situations. * * This routine may be called in interrupt context. * * Returns zero, or a negative error code. Endpoints that are not enabled * report errors; errors will also be * reported when the usb peripheral is disconnected. * * If and only if @req is successfully queued (the return value is zero), * @req->complete() will be called exactly once, when the Gadget core and * UDC are finished with the request. When the completion function is called, * control of the request is returned to the device driver which submitted it. * The completion handler may then immediately free or reuse @req. */ int usb_ep_queue(struct usb_ep *ep, struct usb_request *req, gfp_t gfp_flags) { int ret = 0; if (!ep->enabled && ep->address) { pr_debug("USB gadget: queue request to disabled ep 0x%x (%s)\n", ep->address, ep->name); ret = -ESHUTDOWN; goto out; } ret = ep->ops->queue(ep, req, gfp_flags); out: trace_usb_ep_queue(ep, req, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_queue); /** * usb_ep_dequeue - dequeues (cancels, unlinks) an I/O request from an endpoint * @ep:the endpoint associated with the request * @req:the request being canceled * * If the request is still active on the endpoint, it is dequeued and * eventually its completion routine is called (with status -ECONNRESET); * else a negative error code is returned. This routine is asynchronous, * that is, it may return before the completion routine runs. * * Note that some hardware can't clear out write fifos (to unlink the request * at the head of the queue) except as part of disconnecting from usb. Such * restrictions prevent drivers from supporting configuration changes, * even to configuration zero (a "chapter 9" requirement). * * This routine may be called in interrupt context. */ int usb_ep_dequeue(struct usb_ep *ep, struct usb_request *req) { int ret; ret = ep->ops->dequeue(ep, req); trace_usb_ep_dequeue(ep, req, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_dequeue); /** * usb_ep_set_halt - sets the endpoint halt feature. * @ep: the non-isochronous endpoint being stalled * * Use this to stall an endpoint, perhaps as an error report. 
* Except for control endpoints, * the endpoint stays halted (will not stream any data) until the host * clears this feature; drivers may need to empty the endpoint's request * queue first, to make sure no inappropriate transfers happen. * * Note that while an endpoint CLEAR_FEATURE will be invisible to the * gadget driver, a SET_INTERFACE will not be. To reset endpoints for the * current altsetting, see usb_ep_clear_halt(). When switching altsettings, * it's simplest to use usb_ep_enable() or usb_ep_disable() for the endpoints. * * This routine may be called in interrupt context. * * Returns zero, or a negative error code. On success, this call sets * underlying hardware state that blocks data transfers. * Attempts to halt IN endpoints will fail (returning -EAGAIN) if any * transfer requests are still queued, or if the controller hardware * (usually a FIFO) still holds bytes that the host hasn't collected. */ int usb_ep_set_halt(struct usb_ep *ep) { int ret; ret = ep->ops->set_halt(ep, 1); trace_usb_ep_set_halt(ep, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_set_halt); /** * usb_ep_clear_halt - clears endpoint halt, and resets toggle * @ep:the bulk or interrupt endpoint being reset * * Use this when responding to the standard usb "set interface" request, * for endpoints that aren't reconfigured, after clearing any other state * in the endpoint's i/o queue. * * This routine may be called in interrupt context. * * Returns zero, or a negative error code. On success, this call clears * the underlying hardware state reflecting endpoint halt and data toggle. * Note that some hardware can't support this request (like pxa2xx_udc), * and accordingly can't correctly implement interface altsettings. */ int usb_ep_clear_halt(struct usb_ep *ep) { int ret; ret = ep->ops->set_halt(ep, 0); trace_usb_ep_clear_halt(ep, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_clear_halt); /** * usb_ep_set_wedge - sets the halt feature and ignores clear requests * @ep: the endpoint being wedged * * Use this to stall an endpoint and ignore CLEAR_FEATURE(HALT_ENDPOINT) * requests. If the gadget driver clears the halt status, it will * automatically unwedge the endpoint. * * This routine may be called in interrupt context. * * Returns zero on success, else negative errno. */ int usb_ep_set_wedge(struct usb_ep *ep) { int ret; if (ep->ops->set_wedge) ret = ep->ops->set_wedge(ep); else ret = ep->ops->set_halt(ep, 1); trace_usb_ep_set_wedge(ep, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_set_wedge); /** * usb_ep_fifo_status - returns number of bytes in fifo, or error * @ep: the endpoint whose fifo status is being checked. * * FIFO endpoints may have "unclaimed data" in them in certain cases, * such as after aborted transfers. Hosts may not have collected all * the IN data written by the gadget driver (and reported by a request * completion). The gadget driver may not have collected all the data * written OUT to it by the host. Drivers that need precise handling for * fault reporting or recovery may need to use this call. * * This routine may be called in interrupt context. * * This returns the number of such bytes in the fifo, or a negative * errno if the endpoint doesn't use a FIFO or doesn't support such * precise handling. 
*/ int usb_ep_fifo_status(struct usb_ep *ep) { int ret; if (ep->ops->fifo_status) ret = ep->ops->fifo_status(ep); else ret = -EOPNOTSUPP; trace_usb_ep_fifo_status(ep, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_fifo_status); /** * usb_ep_fifo_flush - flushes contents of a fifo * @ep: the endpoint whose fifo is being flushed. * * This call may be used to flush the "unclaimed data" that may exist in * an endpoint fifo after abnormal transaction terminations. The call * must never be used except when endpoint is not being used for any * protocol translation. * * This routine may be called in interrupt context. */ void usb_ep_fifo_flush(struct usb_ep *ep) { if (ep->ops->fifo_flush) ep->ops->fifo_flush(ep); trace_usb_ep_fifo_flush(ep, 0); } EXPORT_SYMBOL_GPL(usb_ep_fifo_flush); /* ------------------------------------------------------------------------- */ /** * usb_gadget_frame_number - returns the current frame number * @gadget: controller that reports the frame number * * Returns the usb frame number, normally eleven bits from a SOF packet, * or negative errno if this device doesn't support this capability. */ int usb_gadget_frame_number(struct usb_gadget *gadget) { int ret; ret = gadget->ops->get_frame(gadget); trace_usb_gadget_frame_number(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_frame_number); /** * usb_gadget_wakeup - tries to wake up the host connected to this gadget * @gadget: controller used to wake up the host * * Returns zero on success, else negative error code if the hardware * doesn't support such attempts, or its support has not been enabled * by the usb host. Drivers must return device descriptors that report * their ability to support this, or hosts won't enable it. * * This may also try to use SRP to wake the host and start enumeration, * even if OTG isn't otherwise in use. OTG devices may also start * remote wakeup even when hosts don't explicitly enable it. */ int usb_gadget_wakeup(struct usb_gadget *gadget) { int ret = 0; if (!gadget->ops->wakeup) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->wakeup(gadget); out: trace_usb_gadget_wakeup(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_wakeup); /** * usb_gadget_set_remote_wakeup - configures the device remote wakeup feature. * @gadget:the device being configured for remote wakeup * @set:value to be configured. * * set to one to enable remote wakeup feature and zero to disable it. * * returns zero on success, else negative errno. */ int usb_gadget_set_remote_wakeup(struct usb_gadget *gadget, int set) { int ret = 0; if (!gadget->ops->set_remote_wakeup) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->set_remote_wakeup(gadget, set); out: trace_usb_gadget_set_remote_wakeup(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_set_remote_wakeup); /** * usb_gadget_set_selfpowered - sets the device selfpowered feature. * @gadget:the device being declared as self-powered * * this affects the device status reported by the hardware driver * to reflect that it now has a local power supply. * * returns zero on success, else negative errno. */ int usb_gadget_set_selfpowered(struct usb_gadget *gadget) { int ret = 0; if (!gadget->ops->set_selfpowered) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->set_selfpowered(gadget, 1); out: trace_usb_gadget_set_selfpowered(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_set_selfpowered); /** * usb_gadget_clear_selfpowered - clear the device selfpowered feature. 
* @gadget:the device being declared as bus-powered * * this affects the device status reported by the hardware driver. * some hardware may not support bus-powered operation, in which * case this feature's value can never change. * * returns zero on success, else negative errno. */ int usb_gadget_clear_selfpowered(struct usb_gadget *gadget) { int ret = 0; if (!gadget->ops->set_selfpowered) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->set_selfpowered(gadget, 0); out: trace_usb_gadget_clear_selfpowered(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_clear_selfpowered); /** * usb_gadget_vbus_connect - Notify controller that VBUS is powered * @gadget:The device which now has VBUS power. * Context: can sleep * * This call is used by a driver for an external transceiver (or GPIO) * that detects a VBUS power session starting. Common responses include * resuming the controller, activating the D+ (or D-) pullup to let the * host detect that a USB device is attached, and starting to draw power * (8mA or possibly more, especially after SET_CONFIGURATION). * * Returns zero on success, else negative errno. */ int usb_gadget_vbus_connect(struct usb_gadget *gadget) { int ret = 0; if (!gadget->ops->vbus_session) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->vbus_session(gadget, 1); out: trace_usb_gadget_vbus_connect(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_vbus_connect); /** * usb_gadget_vbus_draw - constrain controller's VBUS power usage * @gadget:The device whose VBUS usage is being described * @mA:How much current to draw, in milliAmperes. This should be twice * the value listed in the configuration descriptor bMaxPower field. * * This call is used by gadget drivers during SET_CONFIGURATION calls, * reporting how much power the device may consume. For example, this * could affect how quickly batteries are recharged. * * Returns zero on success, else negative errno. */ int usb_gadget_vbus_draw(struct usb_gadget *gadget, unsigned mA) { int ret = 0; if (!gadget->ops->vbus_draw) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->vbus_draw(gadget, mA); if (!ret) gadget->mA = mA; out: trace_usb_gadget_vbus_draw(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_vbus_draw); /** * usb_gadget_vbus_disconnect - notify controller about VBUS session end * @gadget:the device whose VBUS supply is being described * Context: can sleep * * This call is used by a driver for an external transceiver (or GPIO) * that detects a VBUS power session ending. Common responses include * reversing everything done in usb_gadget_vbus_connect(). * * Returns zero on success, else negative errno. */ int usb_gadget_vbus_disconnect(struct usb_gadget *gadget) { int ret = 0; if (!gadget->ops->vbus_session) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->vbus_session(gadget, 0); out: trace_usb_gadget_vbus_disconnect(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_vbus_disconnect); static int usb_gadget_connect_locked(struct usb_gadget *gadget) __must_hold(&gadget->udc->connect_lock) { int ret = 0; if (!gadget->ops->pullup) { ret = -EOPNOTSUPP; goto out; } if (gadget->deactivated || !gadget->udc->allow_connect || !gadget->udc->started) { /* * If the gadget isn't usable (because it is deactivated, * unbound, or not yet started), we only save the new state. * The gadget will be connected automatically when it is * activated/bound/started. 
*/ gadget->connected = true; goto out; } ret = gadget->ops->pullup(gadget, 1); if (!ret) gadget->connected = 1; out: trace_usb_gadget_connect(gadget, ret); return ret; } /** * usb_gadget_connect - software-controlled connect to USB host * @gadget:the peripheral being connected * * Enables the D+ (or potentially D-) pullup. The host will start * enumerating this gadget when the pullup is active and a VBUS session * is active (the link is powered). * * Returns zero on success, else negative errno. */ int usb_gadget_connect(struct usb_gadget *gadget) { int ret; mutex_lock(&gadget->udc->connect_lock); ret = usb_gadget_connect_locked(gadget); mutex_unlock(&gadget->udc->connect_lock); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_connect); static int usb_gadget_disconnect_locked(struct usb_gadget *gadget) __must_hold(&gadget->udc->connect_lock) { int ret = 0; if (!gadget->ops->pullup) { ret = -EOPNOTSUPP; goto out; } if (!gadget->connected) goto out; if (gadget->deactivated || !gadget->udc->started) { /* * If gadget is deactivated we only save new state. * Gadget will stay disconnected after activation. */ gadget->connected = false; goto out; } ret = gadget->ops->pullup(gadget, 0); if (!ret) gadget->connected = 0; mutex_lock(&udc_lock); if (gadget->udc->driver) gadget->udc->driver->disconnect(gadget); mutex_unlock(&udc_lock); out: trace_usb_gadget_disconnect(gadget, ret); return ret; } /** * usb_gadget_disconnect - software-controlled disconnect from USB host * @gadget:the peripheral being disconnected * * Disables the D+ (or potentially D-) pullup, which the host may see * as a disconnect (when a VBUS session is active). Not all systems * support software pullup controls. * * Following a successful disconnect, invoke the ->disconnect() callback * for the current gadget driver so that UDC drivers don't need to. * * Returns zero on success, else negative errno. */ int usb_gadget_disconnect(struct usb_gadget *gadget) { int ret; mutex_lock(&gadget->udc->connect_lock); ret = usb_gadget_disconnect_locked(gadget); mutex_unlock(&gadget->udc->connect_lock); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_disconnect); /** * usb_gadget_deactivate - deactivate function which is not ready to work * @gadget: the peripheral being deactivated * * This routine may be used during the gadget driver bind() call to prevent * the peripheral from ever being visible to the USB host, unless later * usb_gadget_activate() is called. For example, user mode components may * need to be activated before the system can talk to hosts. * * This routine may sleep; it must not be called in interrupt context * (such as from within a gadget driver's disconnect() callback). * * Returns zero on success, else negative errno. */ int usb_gadget_deactivate(struct usb_gadget *gadget) { int ret = 0; mutex_lock(&gadget->udc->connect_lock); if (gadget->deactivated) goto unlock; if (gadget->connected) { ret = usb_gadget_disconnect_locked(gadget); if (ret) goto unlock; /* * If gadget was being connected before deactivation, we want * to reconnect it in usb_gadget_activate(). */ gadget->connected = true; } gadget->deactivated = true; unlock: mutex_unlock(&gadget->udc->connect_lock); trace_usb_gadget_deactivate(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_deactivate); /** * usb_gadget_activate - activate function which is not ready to work * @gadget: the peripheral being activated * * This routine activates gadget which was previously deactivated with * usb_gadget_deactivate() call. It calls usb_gadget_connect() if needed. 
* * This routine may sleep; it must not be called in interrupt context. * * Returns zero on success, else negative errno. */ int usb_gadget_activate(struct usb_gadget *gadget) { int ret = 0; mutex_lock(&gadget->udc->connect_lock); if (!gadget->deactivated) goto unlock; gadget->deactivated = false; /* * If gadget has been connected before deactivation, or became connected * while it was being deactivated, we call usb_gadget_connect(). */ if (gadget->connected) ret = usb_gadget_connect_locked(gadget); unlock: mutex_unlock(&gadget->udc->connect_lock); trace_usb_gadget_activate(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_activate); /* ------------------------------------------------------------------------- */ #ifdef CONFIG_HAS_DMA int usb_gadget_map_request_by_dev(struct device *dev, struct usb_request *req, int is_in) { if (req->length == 0) return 0; if (req->sg_was_mapped) { req->num_mapped_sgs = req->num_sgs; return 0; } if (req->num_sgs) { int mapped; mapped = dma_map_sg(dev, req->sg, req->num_sgs, is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE); if (mapped == 0) { dev_err(dev, "failed to map SGs\n"); return -EFAULT; } req->num_mapped_sgs = mapped; } else { if (is_vmalloc_addr(req->buf)) { dev_err(dev, "buffer is not dma capable\n"); return -EFAULT; } else if (object_is_on_stack(req->buf)) { dev_err(dev, "buffer is on stack\n"); return -EFAULT; } req->dma = dma_map_single(dev, req->buf, req->length, is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE); if (dma_mapping_error(dev, req->dma)) { dev_err(dev, "failed to map buffer\n"); return -EFAULT; } req->dma_mapped = 1; } return 0; } EXPORT_SYMBOL_GPL(usb_gadget_map_request_by_dev); int usb_gadget_map_request(struct usb_gadget *gadget, struct usb_request *req, int is_in) { return usb_gadget_map_request_by_dev(gadget->dev.parent, req, is_in); } EXPORT_SYMBOL_GPL(usb_gadget_map_request); void usb_gadget_unmap_request_by_dev(struct device *dev, struct usb_request *req, int is_in) { if (req->length == 0 || req->sg_was_mapped) return; if (req->num_mapped_sgs) { dma_unmap_sg(dev, req->sg, req->num_sgs, is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE); req->num_mapped_sgs = 0; } else if (req->dma_mapped) { dma_unmap_single(dev, req->dma, req->length, is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE); req->dma_mapped = 0; } } EXPORT_SYMBOL_GPL(usb_gadget_unmap_request_by_dev); void usb_gadget_unmap_request(struct usb_gadget *gadget, struct usb_request *req, int is_in) { usb_gadget_unmap_request_by_dev(gadget->dev.parent, req, is_in); } EXPORT_SYMBOL_GPL(usb_gadget_unmap_request); #endif /* CONFIG_HAS_DMA */ /* ------------------------------------------------------------------------- */ /** * usb_gadget_giveback_request - give the request back to the gadget layer * @ep: the endpoint to be used with with the request * @req: the request being given back * * This is called by device controller drivers in order to return the * completed request back to the gadget layer. 
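 *
 * As a hedged illustration (not code from this file), a UDC driver's
 * transfer-completion path typically unmaps the request and fills in the
 * results before handing it back, roughly:
 *
 *	usb_gadget_unmap_request(&myudc->gadget, req, ep_is_in);
 *	req->status = 0;
 *	req->actual = bytes_done;
 *	usb_gadget_giveback_request(ep, req);
 *
 * where "myudc", "ep_is_in" and "bytes_done" stand for hypothetical
 * driver-local state.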
*/ void usb_gadget_giveback_request(struct usb_ep *ep, struct usb_request *req) { if (likely(req->status == 0)) usb_led_activity(USB_LED_EVENT_GADGET); trace_usb_gadget_giveback_request(ep, req, 0); req->complete(ep, req); } EXPORT_SYMBOL_GPL(usb_gadget_giveback_request); /* ------------------------------------------------------------------------- */ /** * gadget_find_ep_by_name - returns the ep whose name matches the string passed * in the second parameter, or NULL if no such endpoint is found * @g: controller to check for quirk * @name: name of searched endpoint */ struct usb_ep *gadget_find_ep_by_name(struct usb_gadget *g, const char *name) { struct usb_ep *ep; gadget_for_each_ep(ep, g) { if (!strcmp(ep->name, name)) return ep; } return NULL; } EXPORT_SYMBOL_GPL(gadget_find_ep_by_name); /* ------------------------------------------------------------------------- */ int usb_gadget_ep_match_desc(struct usb_gadget *gadget, struct usb_ep *ep, struct usb_endpoint_descriptor *desc, struct usb_ss_ep_comp_descriptor *ep_comp) { u8 type; u16 max; int num_req_streams = 0; /* endpoint already claimed? */ if (ep->claimed) return 0; type = usb_endpoint_type(desc); max = usb_endpoint_maxp(desc); if (usb_endpoint_dir_in(desc) && !ep->caps.dir_in) return 0; if (usb_endpoint_dir_out(desc) && !ep->caps.dir_out) return 0; if (max > ep->maxpacket_limit) return 0; /* "high bandwidth" works only at high speed */ if (!gadget_is_dualspeed(gadget) && usb_endpoint_maxp_mult(desc) > 1) return 0; switch (type) { case USB_ENDPOINT_XFER_CONTROL: /* only support ep0 for portable CONTROL traffic */ return 0; case USB_ENDPOINT_XFER_ISOC: if (!ep->caps.type_iso) return 0; /* ISO: limit 1023 bytes full speed, 1024 high/super speed */ if (!gadget_is_dualspeed(gadget) && max > 1023) return 0; break; case USB_ENDPOINT_XFER_BULK: if (!ep->caps.type_bulk) return 0; if (ep_comp && gadget_is_superspeed(gadget)) { /* Get the number of required streams from the * EP companion descriptor and see if the EP * matches it */ num_req_streams = ep_comp->bmAttributes & 0x1f; if (num_req_streams > ep->max_streams) return 0; } break; case USB_ENDPOINT_XFER_INT: /* Bulk endpoints handle interrupt transfers, * except the toggle-quirky iso-synch kind */ if (!ep->caps.type_int && !ep->caps.type_bulk) return 0; /* INT: limit 64 bytes full speed, 1024 high/super speed */ if (!gadget_is_dualspeed(gadget) && max > 64) return 0; break; } return 1; } EXPORT_SYMBOL_GPL(usb_gadget_ep_match_desc); /** * usb_gadget_check_config - checks if the UDC can support the bound * configuration * @gadget: controller to check the USB configuration * * Ensure that a UDC is able to support the requested resources by a * configuration, and that there are no resource limitations, such as * internal memory allocated to all requested endpoints. * * Returns zero on success, else a negative errno.
*/ int usb_gadget_check_config(struct usb_gadget *gadget) { if (gadget->ops->check_config) return gadget->ops->check_config(gadget); return 0; } EXPORT_SYMBOL_GPL(usb_gadget_check_config); /* ------------------------------------------------------------------------- */ static void usb_gadget_state_work(struct work_struct *work) { struct usb_gadget *gadget = work_to_gadget(work); struct usb_udc *udc = gadget->udc; if (udc) sysfs_notify(&udc->dev.kobj, NULL, "state"); } void usb_gadget_set_state(struct usb_gadget *gadget, enum usb_device_state state) { unsigned long flags; spin_lock_irqsave(&gadget->state_lock, flags); gadget->state = state; if (!gadget->teardown) schedule_work(&gadget->work); spin_unlock_irqrestore(&gadget->state_lock, flags); trace_usb_gadget_set_state(gadget, 0); } EXPORT_SYMBOL_GPL(usb_gadget_set_state); /* ------------------------------------------------------------------------- */ /* Acquire connect_lock before calling this function. */ static int usb_udc_connect_control_locked(struct usb_udc *udc) __must_hold(&udc->connect_lock) { if (udc->vbus) return usb_gadget_connect_locked(udc->gadget); else return usb_gadget_disconnect_locked(udc->gadget); } static void vbus_event_work(struct work_struct *work) { struct usb_udc *udc = container_of(work, struct usb_udc, vbus_work); mutex_lock(&udc->connect_lock); usb_udc_connect_control_locked(udc); mutex_unlock(&udc->connect_lock); } /** * usb_udc_vbus_handler - updates the udc core vbus status, and try to * connect or disconnect gadget * @gadget: The gadget which vbus change occurs * @status: The vbus status * * The udc driver calls it when it wants to connect or disconnect gadget * according to vbus status. * * This function can be invoked from interrupt context by irq handlers of * the gadget drivers, however, usb_udc_connect_control() has to run in * non-atomic context due to the following: * a. Some of the gadget driver implementations expect the ->pullup * callback to be invoked in non-atomic context. * b. usb_gadget_disconnect() acquires udc_lock which is a mutex. * Hence offload invocation of usb_udc_connect_control() to workqueue. */ void usb_udc_vbus_handler(struct usb_gadget *gadget, bool status) { struct usb_udc *udc = gadget->udc; if (udc) { udc->vbus = status; schedule_work(&udc->vbus_work); } } EXPORT_SYMBOL_GPL(usb_udc_vbus_handler); /** * usb_gadget_udc_reset - notifies the udc core that bus reset occurs * @gadget: The gadget which bus reset occurs * @driver: The gadget driver we want to notify * * If the udc driver has bus reset handler, it needs to call this when the bus * reset occurs, it notifies the gadget driver that the bus reset occurs as * well as updates gadget state. */ void usb_gadget_udc_reset(struct usb_gadget *gadget, struct usb_gadget_driver *driver) { driver->reset(gadget); usb_gadget_set_state(gadget, USB_STATE_DEFAULT); } EXPORT_SYMBOL_GPL(usb_gadget_udc_reset); /** * usb_gadget_udc_start_locked - tells usb device controller to start up * @udc: The UDC to be started * * This call is issued by the UDC Class driver when it's about * to register a gadget driver to the device controller, before * calling gadget driver's bind() method. * * It allows the controller to be powered off until strictly * necessary to have it powered on. * * Returns zero on success, else negative errno. * * Caller should acquire connect_lock before invoking this function. 
*/ static inline int usb_gadget_udc_start_locked(struct usb_udc *udc) __must_hold(&udc->connect_lock) { int ret; if (udc->started) { dev_err(&udc->dev, "UDC had already started\n"); return -EBUSY; } ret = udc->gadget->ops->udc_start(udc->gadget, udc->driver); if (!ret) udc->started = true; return ret; } /** * usb_gadget_udc_stop_locked - tells usb device controller we don't need it anymore * @udc: The UDC to be stopped * * This call is issued by the UDC Class driver after calling * gadget driver's unbind() method. * * The details are implementation specific, but it can go as * far as powering off UDC completely and disable its data * line pullups. * * Caller should acquire connect lock before invoking this function. */ static inline void usb_gadget_udc_stop_locked(struct usb_udc *udc) __must_hold(&udc->connect_lock) { if (!udc->started) { dev_err(&udc->dev, "UDC had already stopped\n"); return; } udc->gadget->ops->udc_stop(udc->gadget); udc->started = false; } /** * usb_gadget_udc_set_speed - tells usb device controller speed supported by * current driver * @udc: The device we want to set maximum speed * @speed: The maximum speed to allowed to run * * This call is issued by the UDC Class driver before calling * usb_gadget_udc_start_locked() in order to make sure that * we don't try to connect on speeds the gadget driver * doesn't support. */ static inline void usb_gadget_udc_set_speed(struct usb_udc *udc, enum usb_device_speed speed) { struct usb_gadget *gadget = udc->gadget; enum usb_device_speed s; if (speed == USB_SPEED_UNKNOWN) s = gadget->max_speed; else s = min(speed, gadget->max_speed); if (s == USB_SPEED_SUPER_PLUS && gadget->ops->udc_set_ssp_rate) gadget->ops->udc_set_ssp_rate(gadget, gadget->max_ssp_rate); else if (gadget->ops->udc_set_speed) gadget->ops->udc_set_speed(gadget, s); } /** * usb_gadget_enable_async_callbacks - tell usb device controller to enable asynchronous callbacks * @udc: The UDC which should enable async callbacks * * This routine is used when binding gadget drivers. It undoes the effect * of usb_gadget_disable_async_callbacks(); the UDC driver should enable IRQs * (if necessary) and resume issuing callbacks. * * This routine will always be called in process context. */ static inline void usb_gadget_enable_async_callbacks(struct usb_udc *udc) { struct usb_gadget *gadget = udc->gadget; if (gadget->ops->udc_async_callbacks) gadget->ops->udc_async_callbacks(gadget, true); } /** * usb_gadget_disable_async_callbacks - tell usb device controller to disable asynchronous callbacks * @udc: The UDC which should disable async callbacks * * This routine is used when unbinding gadget drivers. It prevents a race: * The UDC driver doesn't know when the gadget driver's ->unbind callback * runs, so unless it is told to disable asynchronous callbacks, it might * issue a callback (such as ->disconnect) after the unbind has completed. * * After this function runs, the UDC driver must suppress all ->suspend, * ->resume, ->disconnect, ->reset, and ->setup callbacks to the gadget driver * until async callbacks are again enabled. A simple-minded but effective * way to accomplish this is to tell the UDC hardware not to generate any * more IRQs. * * Request completion callbacks must still be issued. However, it's okay * to defer them until the request is cancelled, since the pull-up will be * turned off during the time period when async callbacks are disabled. * * This routine will always be called in process context. 
*/ static inline void usb_gadget_disable_async_callbacks(struct usb_udc *udc) { struct usb_gadget *gadget = udc->gadget; if (gadget->ops->udc_async_callbacks) gadget->ops->udc_async_callbacks(gadget, false); } /** * usb_udc_release - release the usb_udc struct * @dev: the dev member within usb_udc * * This is called by driver's core in order to free memory once the last * reference is released. */ static void usb_udc_release(struct device *dev) { struct usb_udc *udc; udc = container_of(dev, struct usb_udc, dev); dev_dbg(dev, "releasing '%s'\n", dev_name(dev)); kfree(udc); } static const struct attribute_group *usb_udc_attr_groups[]; static void usb_udc_nop_release(struct device *dev) { dev_vdbg(dev, "%s\n", __func__); } /** * usb_initialize_gadget - initialize a gadget and its embedded struct device * @parent: the parent device to this udc. Usually the controller driver's * device. * @gadget: the gadget to be initialized. * @release: a gadget release function. */ void usb_initialize_gadget(struct device *parent, struct usb_gadget *gadget, void (*release)(struct device *dev)) { spin_lock_init(&gadget->state_lock); gadget->teardown = false; INIT_WORK(&gadget->work, usb_gadget_state_work); gadget->dev.parent = parent; if (release) gadget->dev.release = release; else gadget->dev.release = usb_udc_nop_release; device_initialize(&gadget->dev); gadget->dev.bus = &gadget_bus_type; } EXPORT_SYMBOL_GPL(usb_initialize_gadget); /** * usb_add_gadget - adds a new gadget to the udc class driver list * @gadget: the gadget to be added to the list. * * Returns zero on success, negative errno otherwise. * Does not do a final usb_put_gadget() if an error occurs. */ int usb_add_gadget(struct usb_gadget *gadget) { struct usb_udc *udc; int ret = -ENOMEM; udc = kzalloc(sizeof(*udc), GFP_KERNEL); if (!udc) goto error; device_initialize(&udc->dev); udc->dev.release = usb_udc_release; udc->dev.class = &udc_class; udc->dev.groups = usb_udc_attr_groups; udc->dev.parent = gadget->dev.parent; ret = dev_set_name(&udc->dev, "%s", kobject_name(&gadget->dev.parent->kobj)); if (ret) goto err_put_udc; udc->gadget = gadget; gadget->udc = udc; mutex_init(&udc->connect_lock); udc->started = false; mutex_lock(&udc_lock); list_add_tail(&udc->list, &udc_list); mutex_unlock(&udc_lock); INIT_WORK(&udc->vbus_work, vbus_event_work); ret = device_add(&udc->dev); if (ret) goto err_unlist_udc; usb_gadget_set_state(gadget, USB_STATE_NOTATTACHED); udc->vbus = true; ret = ida_alloc(&gadget_id_numbers, GFP_KERNEL); if (ret < 0) goto err_del_udc; gadget->id_number = ret; dev_set_name(&gadget->dev, "gadget.%d", ret); ret = device_add(&gadget->dev); if (ret) goto err_free_id; ret = sysfs_create_link(&udc->dev.kobj, &gadget->dev.kobj, "gadget"); if (ret) goto err_del_gadget; return 0; err_del_gadget: device_del(&gadget->dev); err_free_id: ida_free(&gadget_id_numbers, gadget->id_number); err_del_udc: flush_work(&gadget->work); device_del(&udc->dev); err_unlist_udc: mutex_lock(&udc_lock); list_del(&udc->list); mutex_unlock(&udc_lock); err_put_udc: put_device(&udc->dev); error: return ret; } EXPORT_SYMBOL_GPL(usb_add_gadget); /** * usb_add_gadget_udc_release - adds a new gadget to the udc class driver list * @parent: the parent device to this udc. Usually the controller driver's * device. * @gadget: the gadget to be added to the list. * @release: a gadget release function. * * Returns zero on success, negative errno otherwise. * Calls the gadget release function in the latter case.
*/ int usb_add_gadget_udc_release(struct device *parent, struct usb_gadget *gadget, void (*release)(struct device *dev)) { int ret; usb_initialize_gadget(parent, gadget, release); ret = usb_add_gadget(gadget); if (ret) usb_put_gadget(gadget); return ret; } EXPORT_SYMBOL_GPL(usb_add_gadget_udc_release); /** * usb_get_gadget_udc_name - get the name of the first UDC controller * This function returns the name of the first UDC controller in the system. * Please note that this interface is useful only for legacy drivers which * assume that there is only one UDC controller in the system and they need to * get its name before initialization. There is no guarantee that the UDC * of the returned name will still be available when the gadget driver registers * itself. * * Returns pointer to string with UDC controller name on success, NULL * otherwise. Caller should kfree() returned string. */ char *usb_get_gadget_udc_name(void) { struct usb_udc *udc; char *name = NULL; /* For now we take the first available UDC */ mutex_lock(&udc_lock); list_for_each_entry(udc, &udc_list, list) { if (!udc->driver) { name = kstrdup(udc->gadget->name, GFP_KERNEL); break; } } mutex_unlock(&udc_lock); return name; } EXPORT_SYMBOL_GPL(usb_get_gadget_udc_name); /** * usb_add_gadget_udc - adds a new gadget to the udc class driver list * @parent: the parent device to this udc. Usually the controller * driver's device. * @gadget: the gadget to be added to the list * * Returns zero on success, negative errno otherwise. */ int usb_add_gadget_udc(struct device *parent, struct usb_gadget *gadget) { return usb_add_gadget_udc_release(parent, gadget, NULL); } EXPORT_SYMBOL_GPL(usb_add_gadget_udc); /** * usb_del_gadget - deletes a gadget and unregisters its udc * @gadget: the gadget to be deleted. * * This will unbind @gadget, if it is bound. * It will not do a final usb_put_gadget(). */ void usb_del_gadget(struct usb_gadget *gadget) { struct usb_udc *udc = gadget->udc; unsigned long flags; if (!udc) return; dev_vdbg(gadget->dev.parent, "unregistering gadget\n"); mutex_lock(&udc_lock); list_del(&udc->list); mutex_unlock(&udc_lock); kobject_uevent(&udc->dev.kobj, KOBJ_REMOVE); sysfs_remove_link(&udc->dev.kobj, "gadget"); device_del(&gadget->dev); /* * Set the teardown flag before flushing the work to prevent new work * from being scheduled while we are cleaning up. */ spin_lock_irqsave(&gadget->state_lock, flags); gadget->teardown = true; spin_unlock_irqrestore(&gadget->state_lock, flags); flush_work(&gadget->work); ida_free(&gadget_id_numbers, gadget->id_number); cancel_work_sync(&udc->vbus_work); device_unregister(&udc->dev); } EXPORT_SYMBOL_GPL(usb_del_gadget); /** * usb_del_gadget_udc - unregisters a gadget * @gadget: the gadget to be unregistered. * * Calls usb_del_gadget() and does a final usb_put_gadget().
*/ void usb_del_gadget_udc(struct usb_gadget *gadget) { usb_del_gadget(gadget); usb_put_gadget(gadget); } EXPORT_SYMBOL_GPL(usb_del_gadget_udc); /* ------------------------------------------------------------------------- */ static int gadget_match_driver(struct device *dev, const struct device_driver *drv) { struct usb_gadget *gadget = dev_to_usb_gadget(dev); struct usb_udc *udc = gadget->udc; const struct usb_gadget_driver *driver = container_of(drv, struct usb_gadget_driver, driver); /* If the driver specifies a udc_name, it must match the UDC's name */ if (driver->udc_name && strcmp(driver->udc_name, dev_name(&udc->dev)) != 0) return 0; /* If the driver is already bound to a gadget, it doesn't match */ if (driver->is_bound) return 0; /* Otherwise any gadget driver matches any UDC */ return 1; } static int gadget_bind_driver(struct device *dev) { struct usb_gadget *gadget = dev_to_usb_gadget(dev); struct usb_udc *udc = gadget->udc; struct usb_gadget_driver *driver = container_of(dev->driver, struct usb_gadget_driver, driver); int ret = 0; mutex_lock(&udc_lock); if (driver->is_bound) { mutex_unlock(&udc_lock); return -ENXIO; /* Driver binds to only one gadget */ } driver->is_bound = true; udc->driver = driver; mutex_unlock(&udc_lock); dev_dbg(&udc->dev, "binding gadget driver [%s]\n", driver->function); usb_gadget_udc_set_speed(udc, driver->max_speed); ret = driver->bind(udc->gadget, driver); if (ret) goto err_bind; mutex_lock(&udc->connect_lock); ret = usb_gadget_udc_start_locked(udc); if (ret) { mutex_unlock(&udc->connect_lock); goto err_start; } usb_gadget_enable_async_callbacks(udc); udc->allow_connect = true; ret = usb_udc_connect_control_locked(udc); if (ret) goto err_connect_control; mutex_unlock(&udc->connect_lock); kobject_uevent(&udc->dev.kobj, KOBJ_CHANGE); return 0; err_connect_control: udc->allow_connect = false; usb_gadget_disable_async_callbacks(udc); if (gadget->irq) synchronize_irq(gadget->irq); usb_gadget_udc_stop_locked(udc); mutex_unlock(&udc->connect_lock); err_start: driver->unbind(udc->gadget); err_bind: if (ret != -EISNAM) dev_err(&udc->dev, "failed to start %s: %d\n", driver->function, ret); mutex_lock(&udc_lock); udc->driver = NULL; driver->is_bound = false; mutex_unlock(&udc_lock); return ret; } static void gadget_unbind_driver(struct device *dev) { struct usb_gadget *gadget = dev_to_usb_gadget(dev); struct usb_udc *udc = gadget->udc; struct usb_gadget_driver *driver = udc->driver; dev_dbg(&udc->dev, "unbinding gadget driver [%s]\n", driver->function); udc->allow_connect = false; cancel_work_sync(&udc->vbus_work); mutex_lock(&udc->connect_lock); usb_gadget_disconnect_locked(gadget); usb_gadget_disable_async_callbacks(udc); if (gadget->irq) synchronize_irq(gadget->irq); mutex_unlock(&udc->connect_lock); udc->driver->unbind(gadget); mutex_lock(&udc->connect_lock); usb_gadget_udc_stop_locked(udc); mutex_unlock(&udc->connect_lock); mutex_lock(&udc_lock); driver->is_bound = false; udc->driver = NULL; mutex_unlock(&udc_lock); kobject_uevent(&udc->dev.kobj, KOBJ_CHANGE); } /* ------------------------------------------------------------------------- */ int usb_gadget_register_driver_owner(struct usb_gadget_driver *driver, struct module *owner, const char *mod_name) { int ret; if (!driver || !driver->bind || !driver->setup) return -EINVAL; driver->driver.bus = &gadget_bus_type; driver->driver.owner = owner; driver->driver.mod_name = mod_name; driver->driver.probe_type = PROBE_FORCE_SYNCHRONOUS; ret = driver_register(&driver->driver); if (ret) { pr_warn("%s: driver 
registration failed: %d\n", driver->function, ret); return ret; } mutex_lock(&udc_lock); if (!driver->is_bound) { if (driver->match_existing_only) { pr_warn("%s: couldn't find an available UDC or it's busy\n", driver->function); ret = -EBUSY; } else { pr_info("%s: couldn't find an available UDC\n", driver->function); ret = 0; } } mutex_unlock(&udc_lock); if (ret) driver_unregister(&driver->driver); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_register_driver_owner); int usb_gadget_unregister_driver(struct usb_gadget_driver *driver) { if (!driver || !driver->unbind) return -EINVAL; driver_unregister(&driver->driver); return 0; } EXPORT_SYMBOL_GPL(usb_gadget_unregister_driver); /* ------------------------------------------------------------------------- */ static ssize_t srp_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t n) { struct usb_udc *udc = container_of(dev, struct usb_udc, dev); if (sysfs_streq(buf, "1")) usb_gadget_wakeup(udc->gadget); return n; } static DEVICE_ATTR_WO(srp); static ssize_t soft_connect_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t n) { struct usb_udc *udc = container_of(dev, struct usb_udc, dev); ssize_t ret; device_lock(&udc->gadget->dev); if (!udc->driver) { dev_err(dev, "soft-connect without a gadget driver\n"); ret = -EOPNOTSUPP; goto out; } if (sysfs_streq(buf, "connect")) { mutex_lock(&udc->connect_lock); usb_gadget_udc_start_locked(udc); usb_gadget_connect_locked(udc->gadget); mutex_unlock(&udc->connect_lock); } else if (sysfs_streq(buf, "disconnect")) { mutex_lock(&udc->connect_lock); usb_gadget_disconnect_locked(udc->gadget); usb_gadget_udc_stop_locked(udc); mutex_unlock(&udc->connect_lock); } else { dev_err(dev, "unsupported command '%s'\n", buf); ret = -EINVAL; goto out; } ret = n; out: device_unlock(&udc->gadget->dev); return ret; } static DEVICE_ATTR_WO(soft_connect); static ssize_t state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_udc *udc = container_of(dev, struct usb_udc, dev); struct usb_gadget *gadget = udc->gadget; return sprintf(buf, "%s\n", usb_state_string(gadget->state)); } static DEVICE_ATTR_RO(state); static ssize_t function_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_udc *udc = container_of(dev, struct usb_udc, dev); struct usb_gadget_driver *drv; int rc = 0; mutex_lock(&udc_lock); drv = udc->driver; if (drv && drv->function) rc = scnprintf(buf, PAGE_SIZE, "%s\n", drv->function); mutex_unlock(&udc_lock); return rc; } static DEVICE_ATTR_RO(function); #define USB_UDC_SPEED_ATTR(name, param) \ ssize_t name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct usb_udc *udc = container_of(dev, struct usb_udc, dev); \ return scnprintf(buf, PAGE_SIZE, "%s\n", \ usb_speed_string(udc->gadget->param)); \ } \ static DEVICE_ATTR_RO(name) static USB_UDC_SPEED_ATTR(current_speed, speed); static USB_UDC_SPEED_ATTR(maximum_speed, max_speed); #define USB_UDC_ATTR(name) \ ssize_t name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct usb_udc *udc = container_of(dev, struct usb_udc, dev); \ struct usb_gadget *gadget = udc->gadget; \ \ return scnprintf(buf, PAGE_SIZE, "%d\n", gadget->name); \ } \ static DEVICE_ATTR_RO(name) static USB_UDC_ATTR(is_otg); static USB_UDC_ATTR(is_a_peripheral); static USB_UDC_ATTR(b_hnp_enable); static USB_UDC_ATTR(a_hnp_support); static USB_UDC_ATTR(a_alt_hnp_support); static USB_UDC_ATTR(is_selfpowered); static struct attribute 
*usb_udc_attrs[] = { &dev_attr_srp.attr, &dev_attr_soft_connect.attr, &dev_attr_state.attr, &dev_attr_function.attr, &dev_attr_current_speed.attr, &dev_attr_maximum_speed.attr, &dev_attr_is_otg.attr, &dev_attr_is_a_peripheral.attr, &dev_attr_b_hnp_enable.attr, &dev_attr_a_hnp_support.attr, &dev_attr_a_alt_hnp_support.attr, &dev_attr_is_selfpowered.attr, NULL, }; static const struct attribute_group usb_udc_attr_group = { .attrs = usb_udc_attrs, }; static const struct attribute_group *usb_udc_attr_groups[] = { &usb_udc_attr_group, NULL, }; static int usb_udc_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct usb_udc *udc = container_of(dev, struct usb_udc, dev); int ret; ret = add_uevent_var(env, "USB_UDC_NAME=%s", udc->gadget->name); if (ret) { dev_err(dev, "failed to add uevent USB_UDC_NAME\n"); return ret; } mutex_lock(&udc_lock); if (udc->driver) ret = add_uevent_var(env, "USB_UDC_DRIVER=%s", udc->driver->function); mutex_unlock(&udc_lock); if (ret) { dev_err(dev, "failed to add uevent USB_UDC_DRIVER\n"); return ret; } return 0; } static const struct class udc_class = { .name = "udc", .dev_uevent = usb_udc_uevent, }; static const struct bus_type gadget_bus_type = { .name = "gadget", .probe = gadget_bind_driver, .remove = gadget_unbind_driver, .match = gadget_match_driver, }; static int __init usb_udc_init(void) { int rc; rc = class_register(&udc_class); if (rc) return rc; rc = bus_register(&gadget_bus_type); if (rc) class_unregister(&udc_class); return rc; } subsys_initcall(usb_udc_init); static void __exit usb_udc_exit(void) { bus_unregister(&gadget_bus_type); class_unregister(&udc_class); } module_exit(usb_udc_exit); MODULE_DESCRIPTION("UDC Framework"); MODULE_AUTHOR("Felipe Balbi <balbi@ti.com>"); MODULE_LICENSE("GPL v2");
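/*
 * Illustrative sketch, not part of this file: a minimal controller driver
 * showing how the registration interfaces above are typically used. The
 * names "example-udc", struct example_udc, example_udc_ops,
 * example_udc_probe() and example_udc_remove() are hypothetical; only
 * usb_add_gadget_udc() and usb_del_gadget_udc() are the real entry points
 * documented above. Assumes <linux/platform_device.h> and
 * <linux/usb/gadget.h>.
 */
struct example_udc {
	struct usb_gadget gadget;	/* embedded gadget handed to the framework */
};

static const struct usb_gadget_ops example_udc_ops;	/* would carry real pullup/wakeup/... callbacks */

static int example_udc_probe(struct platform_device *pdev)
{
	struct example_udc *eudc;

	eudc = devm_kzalloc(&pdev->dev, sizeof(*eudc), GFP_KERNEL);
	if (!eudc)
		return -ENOMEM;

	eudc->gadget.name = "example-udc";
	eudc->gadget.ops = &example_udc_ops;
	eudc->gadget.max_speed = USB_SPEED_HIGH;
	platform_set_drvdata(pdev, eudc);

	/*
	 * Creates the "udc" class device, attaches the gadget to the gadget
	 * bus and makes it visible to gadget drivers and to sysfs.
	 */
	return usb_add_gadget_udc(&pdev->dev, &eudc->gadget);
}

static void example_udc_remove(struct platform_device *pdev)
{
	struct example_udc *eudc = platform_get_drvdata(pdev);

	/* Unbinds any bound gadget driver and drops the final reference. */
	usb_del_gadget_udc(&eudc->gadget);
}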
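/*
 * Illustrative sketch, not part of this file: the gadget-driver side of the
 * "gadget" bus defined above. gadget_match_driver() pairs a driver like this
 * with a free UDC, then gadget_bind_driver() calls ->bind(), starts the
 * controller and connects. The example_* names below are hypothetical;
 * usb_gadget_register_driver_owner() and usb_gadget_unregister_driver() are
 * the real interfaces exported above. Assumes <linux/usb/gadget.h> and
 * <linux/usb/ch9.h>.
 */
static int example_bind(struct usb_gadget *gadget, struct usb_gadget_driver *driver)
{
	/* A real driver would claim endpoints and build configurations here. */
	return 0;
}

static void example_unbind(struct usb_gadget *gadget)
{
	/* Undo whatever example_bind() set up. */
}

static int example_setup(struct usb_gadget *gadget, const struct usb_ctrlrequest *ctrl)
{
	/* A real driver decodes the control request in ctrl here. */
	return -EOPNOTSUPP;
}

static struct usb_gadget_driver example_gadget_driver = {
	.function	= "example",
	.max_speed	= USB_SPEED_HIGH,
	.bind		= example_bind,
	.unbind		= example_unbind,
	.setup		= example_setup,
	.driver		= {
		.name	= "example_gadget",
	},
};

static int example_gadget_enable(void)
{
	/* Registers on the gadget bus; binding happens via gadget_match_driver(). */
	return usb_gadget_register_driver_owner(&example_gadget_driver,
						THIS_MODULE, KBUILD_MODNAME);
}

static void example_gadget_disable(void)
{
	usb_gadget_unregister_driver(&example_gadget_driver);
}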
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/file.c * * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes * * Manage the dynamic fd arrays in the process files_struct.
*/ #include <linux/syscalls.h> #include <linux/export.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/close_range.h> #include <linux/file_ref.h> #include <net/sock.h> #include <linux/init_task.h> #include "internal.h" static noinline bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt) { /* * If the reference count was already in the dead zone, then this * put() operation is imbalanced. Warn, put the reference count back to * DEAD and tell the caller to not deconstruct the object. */ if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) { atomic_long_set(&ref->refcnt, FILE_REF_DEAD); return false; } /* * This is a put() operation on a saturated refcount. Restore the * mean saturation value and tell the caller to not deconstruct the * object. */ if (cnt > FILE_REF_MAXREF) atomic_long_set(&ref->refcnt, FILE_REF_SATURATED); return false; } /** * __file_ref_put - Slowpath of file_ref_put() * @ref: Pointer to the reference count * @cnt: Current reference count * * Invoked when the reference count is outside of the valid zone. * * Return: * True if this was the last reference with no future references * possible. This signals the caller that it can safely schedule the * object, which is protected by the reference counter, for * deconstruction. * * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * deconstruct the protected object. */ bool __file_ref_put(file_ref_t *ref, unsigned long cnt) { /* Did this drop the last reference? */ if (likely(cnt == FILE_REF_NOREF)) { /* * Carefully try to set the reference count to FILE_REF_DEAD. * * This can fail if a concurrent get() operation has * elevated it again or the corresponding put() even marked * it dead already. Both are valid situations and do not * require a retry. If this fails the caller is not * allowed to deconstruct the object. */ if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD)) return false; /* * The caller can safely schedule the object for * deconstruction. Provide acquire ordering. */ smp_acquire__after_ctrl_dep(); return true; } return __file_ref_put_badval(ref, cnt); } EXPORT_SYMBOL_GPL(__file_ref_put); unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; /* our min() is unusable in constant expressions ;-/ */ #define __const_min(x, y) ((x) < (y) ? (x) : (y)) unsigned int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; static void __free_fdtable(struct fdtable *fdt) { kvfree(fdt->fd); kvfree(fdt->open_fds); kfree(fdt); } static void free_fdtable_rcu(struct rcu_head *rcu) { __free_fdtable(container_of(rcu, struct fdtable, rcu)); } #define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr)) #define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long)) #define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds /* * Copy 'count' fd bits from the old table to the new table and clear the extra * space if any. This does not copy the file pointers. Called with the files * spinlock held for write. 
*/ static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, unsigned int copy_words) { unsigned int nwords = fdt_words(nfdt); bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds, copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec, copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits, copy_words, nwords); } /* * Copy all file descriptors from the old table to the new, expanded table and * clear the extra space. Called with the files spinlock held for write. */ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) { size_t cpy, set; BUG_ON(nfdt->max_fds < ofdt->max_fds); cpy = ofdt->max_fds * sizeof(struct file *); set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); memcpy(nfdt->fd, ofdt->fd, cpy); memset((char *)nfdt->fd + cpy, 0, set); copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt)); } /* * Note how the fdtable bitmap allocations very much have to be a multiple of * BITS_PER_LONG. This is not only because we walk those things in chunks of * 'unsigned long' in some places, but simply because that is how the Linux * kernel bitmaps are defined to work: they are not "bits in an array of bytes", * they are very much "bits in an array of unsigned long". */ static struct fdtable *alloc_fdtable(unsigned int slots_wanted) { struct fdtable *fdt; unsigned int nr; void *data; /* * Figure out how many fds we actually want to support in this fdtable. * Allocation steps are keyed to the size of the fdarray, since it * grows far faster than any of the other dynamic data. We try to fit * the fdarray into comfortable page-tuned chunks: starting at 1024B * and growing in powers of two from there on. Since we called only * with slots_wanted > BITS_PER_LONG (embedded instance in files->fdtab * already gives BITS_PER_LONG slots), the above boils down to * 1. use the smallest power of two large enough to give us that many * slots. * 2. on 32bit skip 64 and 128 - the minimal capacity we want there is * 256 slots (i.e. 1Kb fd array). * 3. on 64bit don't skip anything, 1Kb fd array means 128 slots there * and we are never going to be asked for 64 or less. */ if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256) nr = 256; else nr = roundup_pow_of_two(slots_wanted); /* * Note that this can drive nr *below* what we had passed if sysctl_nr_open * had been set lower between the check in expand_files() and here. * * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise * bitmaps handling below becomes unpleasant, to put it mildly... */ if (unlikely(nr > sysctl_nr_open)) { nr = round_down(sysctl_nr_open, BITS_PER_LONG); if (nr < slots_wanted) return ERR_PTR(-EMFILE); } /* * Check if the allocation size would exceed INT_MAX. kvmalloc_array() * and kvmalloc() will warn if the allocation size is greater than * INT_MAX, as filp_cache objects are not __GFP_NOWARN. * * This can happen when sysctl_nr_open is set to a very high value and * a process tries to use a file descriptor near that limit. For example, * if sysctl_nr_open is set to 1073741816 (0x3ffffff8) - which is what * systemd typically sets it to - then trying to use a file descriptor * close to that value will require allocating a file descriptor table * that exceeds 8GB in size. 
*/ if (unlikely(nr > INT_MAX / sizeof(struct file *))) return ERR_PTR(-EMFILE); fdt = kmalloc_obj(struct fdtable, GFP_KERNEL_ACCOUNT); if (!fdt) goto out; fdt->max_fds = nr; data = kvmalloc_objs(struct file *, nr, GFP_KERNEL_ACCOUNT); if (!data) goto out_fdt; fdt->fd = data; data = kvmalloc(max_t(size_t, 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES), GFP_KERNEL_ACCOUNT); if (!data) goto out_arr; fdt->open_fds = data; data += nr / BITS_PER_BYTE; fdt->close_on_exec = data; data += nr / BITS_PER_BYTE; fdt->full_fds_bits = data; return fdt; out_arr: kvfree(fdt->fd); out_fdt: kfree(fdt); out: return ERR_PTR(-ENOMEM); } /* * Expand the file descriptor table. * This function will allocate a new fdtable and both fd array and fdset, of * the given size. * Return <0 error code on error; 0 on successful completion. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_fdtable(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { struct fdtable *new_fdt, *cur_fdt; spin_unlock(&files->file_lock); new_fdt = alloc_fdtable(nr + 1); /* make sure all fd_install() have seen resize_in_progress * or have finished their rcu_read_lock_sched() section. */ if (atomic_read(&files->count) > 1) synchronize_rcu(); spin_lock(&files->file_lock); if (IS_ERR(new_fdt)) return PTR_ERR(new_fdt); cur_fdt = files_fdtable(files); BUG_ON(nr < cur_fdt->max_fds); copy_fdtable(new_fdt, cur_fdt); rcu_assign_pointer(files->fdt, new_fdt); if (cur_fdt != &files->fdtab) call_rcu(&cur_fdt->rcu, free_fdtable_rcu); /* coupled with smp_rmb() in fd_install() */ smp_wmb(); return 0; } /* * Expand files. * This function will expand the file structures, if the requested size exceeds * the current capacity and there is room for expansion. * Return <0 error code on error; 0 on success. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_files(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { struct fdtable *fdt; int error; repeat: fdt = files_fdtable(files); /* Do we need to expand? */ if (nr < fdt->max_fds) return 0; if (unlikely(files->resize_in_progress)) { spin_unlock(&files->file_lock); wait_event(files->resize_wait, !files->resize_in_progress); spin_lock(&files->file_lock); goto repeat; } /* Can we expand? */ if (unlikely(nr >= sysctl_nr_open)) return -EMFILE; /* All good, so we try */ files->resize_in_progress = true; error = expand_fdtable(files, nr); files->resize_in_progress = false; wake_up_all(&files->resize_wait); return error; } static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt, bool set) { if (set) { __set_bit(fd, fdt->close_on_exec); } else { if (test_bit(fd, fdt->close_on_exec)) __clear_bit(fd, fdt->close_on_exec); } } static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set) { __set_bit(fd, fdt->open_fds); __set_close_on_exec(fd, fdt, set); fd /= BITS_PER_LONG; if (!~fdt->open_fds[fd]) __set_bit(fd, fdt->full_fds_bits); } static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) { __clear_bit(fd, fdt->open_fds); fd /= BITS_PER_LONG; if (test_bit(fd, fdt->full_fds_bits)) __clear_bit(fd, fdt->full_fds_bits); } static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt) { return test_bit(fd, fdt->open_fds); } /* * Note that a sane fdtable size always has to be a multiple of * BITS_PER_LONG, since we have bitmaps that are sized by this. 
* * punch_hole is optional - when close_range() is asked to unshare * and close, we don't need to copy descriptors in that range, so * a smaller cloned descriptor table might suffice if the last * currently opened descriptor falls into that range. */ static unsigned int sane_fdtable_size(struct fdtable *fdt, struct fd_range *punch_hole) { unsigned int last = find_last_bit(fdt->open_fds, fdt->max_fds); if (last == fdt->max_fds) return NR_OPEN_DEFAULT; if (punch_hole && punch_hole->to >= last && punch_hole->from <= last) { last = find_last_bit(fdt->open_fds, punch_hole->from); if (last == punch_hole->from) return NR_OPEN_DEFAULT; } return ALIGN(last + 1, BITS_PER_LONG); } /* * Allocate a new descriptor table and copy contents from the passed in * instance. Returns a pointer to cloned table on success, ERR_PTR() * on failure. For 'punch_hole' see sane_fdtable_size(). */ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_hole) { struct files_struct *newf; struct file **old_fds, **new_fds; unsigned int open_files, i; struct fdtable *old_fdt, *new_fdt; newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); if (!newf) return ERR_PTR(-ENOMEM); atomic_set(&newf->count, 1); spin_lock_init(&newf->file_lock); newf->resize_in_progress = false; init_waitqueue_head(&newf->resize_wait); newf->next_fd = 0; new_fdt = &newf->fdtab; new_fdt->max_fds = NR_OPEN_DEFAULT; new_fdt->close_on_exec = newf->close_on_exec_init; new_fdt->open_fds = newf->open_fds_init; new_fdt->full_fds_bits = newf->full_fds_bits_init; new_fdt->fd = &newf->fd_array[0]; spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); open_files = sane_fdtable_size(old_fdt, punch_hole); /* * Check whether we need to allocate a larger fd array and fd set. */ while (unlikely(open_files > new_fdt->max_fds)) { spin_unlock(&oldf->file_lock); if (new_fdt != &newf->fdtab) __free_fdtable(new_fdt); new_fdt = alloc_fdtable(open_files); if (IS_ERR(new_fdt)) { kmem_cache_free(files_cachep, newf); return ERR_CAST(new_fdt); } /* * Reacquire the oldf lock and a pointer to its fd table * who knows it may have a new bigger fd table. We need * the latest pointer. */ spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); open_files = sane_fdtable_size(old_fdt, punch_hole); } copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG); old_fds = old_fdt->fd; new_fds = new_fdt->fd; /* * We may be racing against fd allocation from other threads using this * files_struct, despite holding ->file_lock. * * alloc_fd() might have already claimed a slot, while fd_install() * did not populate it yet. Note the latter operates locklessly, so * the file can show up as we are walking the array below. * * At the same time we know no files will disappear as all other * operations take the lock. * * Instead of trying to placate userspace racing with itself, we * ref the file if we see it and mark the fd slot as unused otherwise. */ for (i = open_files; i != 0; i--) { struct file *f = rcu_dereference_raw(*old_fds++); if (f) { get_file(f); } else { __clear_open_fd(open_files - i, new_fdt); } rcu_assign_pointer(*new_fds++, f); } spin_unlock(&oldf->file_lock); /* clear the remainder */ memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *)); rcu_assign_pointer(newf->fdt, new_fdt); return newf; } static struct fdtable *close_files(struct files_struct * files) { /* * It is safe to dereference the fd table without RCU or * ->file_lock because this is the last reference to the * files structure. 
*/ struct fdtable *fdt = rcu_dereference_raw(files->fdt); unsigned int i, j = 0; for (;;) { unsigned long set; i = j * BITS_PER_LONG; if (i >= fdt->max_fds) break; set = fdt->open_fds[j++]; while (set) { if (set & 1) { struct file *file = fdt->fd[i]; if (file) { filp_close(file, files); cond_resched(); } } i++; set >>= 1; } } return fdt; } void put_files_struct(struct files_struct *files) { if (atomic_dec_and_test(&files->count)) { struct fdtable *fdt = close_files(files); /* free the arrays if they are not embedded */ if (fdt != &files->fdtab) __free_fdtable(fdt); kmem_cache_free(files_cachep, files); } } void exit_files(struct task_struct *tsk) { struct files_struct * files = tsk->files; if (files) { task_lock(tsk); tsk->files = NULL; task_unlock(tsk); put_files_struct(files); } } struct files_struct init_files = { .count = ATOMIC_INIT(1), .fdt = &init_files.fdtab, .fdtab = { .max_fds = NR_OPEN_DEFAULT, .fd = &init_files.fd_array[0], .close_on_exec = init_files.close_on_exec_init, .open_fds = init_files.open_fds_init, .full_fds_bits = init_files.full_fds_bits_init, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait), }; static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) { unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */ unsigned int maxbit = maxfd / BITS_PER_LONG; unsigned int bitbit = start / BITS_PER_LONG; unsigned int bit; /* * Try to avoid looking at the second level bitmap */ bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG, start & (BITS_PER_LONG - 1)); if (bit < BITS_PER_LONG) return bit + bitbit * BITS_PER_LONG; bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; if (bitbit >= maxfd) return maxfd; if (bitbit > start) start = bitbit; return find_next_zero_bit(fdt->open_fds, maxfd, start); } /* * allocate a file descriptor, mark it busy. */ static int alloc_fd(unsigned start, unsigned end, unsigned flags) { struct files_struct *files = current->files; unsigned int fd; int error; struct fdtable *fdt; spin_lock(&files->file_lock); repeat: fdt = files_fdtable(files); fd = start; if (fd < files->next_fd) fd = files->next_fd; if (likely(fd < fdt->max_fds)) fd = find_next_fd(fdt, fd); /* * N.B. For clone tasks sharing a files structure, this test * will limit the total number of files that can be opened. */ error = -EMFILE; if (unlikely(fd >= end)) goto out; if (unlikely(fd >= fdt->max_fds)) { error = expand_files(files, fd); if (error < 0) goto out; goto repeat; } if (start <= files->next_fd) files->next_fd = fd + 1; __set_open_fd(fd, fdt, flags & O_CLOEXEC); error = fd; VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); out: spin_unlock(&files->file_lock); return error; } int __get_unused_fd_flags(unsigned flags, unsigned long nofile) { return alloc_fd(0, nofile, flags); } int get_unused_fd_flags(unsigned flags) { return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE)); } EXPORT_SYMBOL(get_unused_fd_flags); static void __put_unused_fd(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = files_fdtable(files); __clear_open_fd(fd, fdt); if (fd < files->next_fd) files->next_fd = fd; } void put_unused_fd(unsigned int fd) { struct files_struct *files = current->files; spin_lock(&files->file_lock); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); } EXPORT_SYMBOL(put_unused_fd); /* * Install a file pointer in the fd array while it is being resized. 
* * We need to make sure our update to the array does not get lost as the resizing * thread can be copying the content as we modify it. * * We have two ways to do it: * - go off CPU waiting for resize_in_progress to clear * - take the spin lock * * The latter is trivial to implement and saves us from having to might_sleep() * for debugging purposes. * * This is moved out of line from fd_install() to convince gcc to optimize that * routine better. */ static void noinline fd_install_slowpath(unsigned int fd, struct file *file) { struct files_struct *files = current->files; struct fdtable *fdt; spin_lock(&files->file_lock); fdt = files_fdtable(files); VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); rcu_assign_pointer(fdt->fd[fd], file); spin_unlock(&files->file_lock); } /** * fd_install - install a file pointer in the fd array * @fd: file descriptor to install the file in * @file: the file to install * * This consumes the "file" refcount, so callers should treat it * as if they had called fput(file). */ void fd_install(unsigned int fd, struct file *file) { struct files_struct *files = current->files; struct fdtable *fdt; if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING))) return; rcu_read_lock_sched(); if (unlikely(files->resize_in_progress)) { rcu_read_unlock_sched(); fd_install_slowpath(fd, file); return; } /* coupled with smp_wmb() in expand_fdtable() */ smp_rmb(); fdt = rcu_dereference_sched(files->fdt); VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); rcu_assign_pointer(fdt->fd[fd], file); rcu_read_unlock_sched(); } EXPORT_SYMBOL(fd_install); /** * file_close_fd_locked - return file associated with fd * @files: file struct to retrieve file from * @fd: file descriptor to retrieve file for * * Doesn't take a separate reference count. * * Context: files_lock must be held. * * Returns: The file associated with @fd (NULL if @fd is not open) */ struct file *file_close_fd_locked(struct files_struct *files, unsigned fd) { struct fdtable *fdt = files_fdtable(files); struct file *file; lockdep_assert_held(&files->file_lock); if (fd >= fdt->max_fds) return NULL; fd = array_index_nospec(fd, fdt->max_fds); file = rcu_dereference_raw(fdt->fd[fd]); if (file) { rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); } return file; } int close_fd(unsigned fd) { struct files_struct *files = current->files; struct file *file; spin_lock(&files->file_lock); file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); if (!file) return -EBADF; return filp_close(file, files); } EXPORT_SYMBOL(close_fd); /** * last_fd - return last valid index into fd table * @fdt: File descriptor table. * * Context: Either rcu read lock or files_lock must be held. * * Returns: Last valid index into fdtable. 
*/ static inline unsigned last_fd(struct fdtable *fdt) { return fdt->max_fds - 1; } static inline void __range_cloexec(struct files_struct *cur_fds, unsigned int fd, unsigned int max_fd) { struct fdtable *fdt; /* make sure we're using the correct maximum value */ spin_lock(&cur_fds->file_lock); fdt = files_fdtable(cur_fds); max_fd = min(last_fd(fdt), max_fd); if (fd <= max_fd) bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1); spin_unlock(&cur_fds->file_lock); } static inline void __range_close(struct files_struct *files, unsigned int fd, unsigned int max_fd) { struct file *file; struct fdtable *fdt; unsigned n; spin_lock(&files->file_lock); fdt = files_fdtable(files); n = last_fd(fdt); max_fd = min(max_fd, n); for (fd = find_next_bit(fdt->open_fds, max_fd + 1, fd); fd <= max_fd; fd = find_next_bit(fdt->open_fds, max_fd + 1, fd + 1)) { file = file_close_fd_locked(files, fd); if (file) { spin_unlock(&files->file_lock); filp_close(file, files); cond_resched(); spin_lock(&files->file_lock); fdt = files_fdtable(files); } else if (need_resched()) { spin_unlock(&files->file_lock); cond_resched(); spin_lock(&files->file_lock); fdt = files_fdtable(files); } } spin_unlock(&files->file_lock); } /** * sys_close_range() - Close all file descriptors in a given range. * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close * @flags: CLOSE_RANGE flags. * * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. * Currently, errors from closing a given file descriptor are ignored. */ SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, unsigned int, flags) { struct task_struct *me = current; struct files_struct *cur_fds = me->files, *fds = NULL; if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC)) return -EINVAL; if (fd > max_fd) return -EINVAL; if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) { struct fd_range range = {fd, max_fd}, *punch_hole = &range; /* * If the caller requested all fds to be made cloexec we always * copy all of the file descriptors since they still want to * use them. */ if (flags & CLOSE_RANGE_CLOEXEC) punch_hole = NULL; fds = dup_fd(cur_fds, punch_hole); if (IS_ERR(fds)) return PTR_ERR(fds); /* * We used to share our file descriptor table, and have now * created a private one, make sure we're using it below. */ swap(cur_fds, fds); } if (flags & CLOSE_RANGE_CLOEXEC) __range_cloexec(cur_fds, fd, max_fd); else __range_close(cur_fds, fd, max_fd); if (fds) { /* * We're done closing the files we were supposed to. Time to install * the new file descriptor table and drop the old one. */ task_lock(me); me->files = cur_fds; task_unlock(me); put_files_struct(fds); } return 0; } /** * file_close_fd - return file associated with fd * @fd: file descriptor to retrieve file for * * Doesn't take a separate reference count.
* * Returns: The file associated with @fd (NULL if @fd is not open) */ struct file *file_close_fd(unsigned int fd) { struct files_struct *files = current->files; struct file *file; spin_lock(&files->file_lock); file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); return file; } void do_close_on_exec(struct files_struct *files) { unsigned i; struct fdtable *fdt; /* exec unshares first */ spin_lock(&files->file_lock); for (i = 0; ; i++) { unsigned long set; unsigned fd = i * BITS_PER_LONG; fdt = files_fdtable(files); if (fd >= fdt->max_fds) break; set = fdt->close_on_exec[i]; if (!set) continue; fdt->close_on_exec[i] = 0; for ( ; set ; fd++, set >>= 1) { struct file *file; if (!(set & 1)) continue; file = fdt->fd[fd]; if (!file) continue; rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); filp_close(file, files); cond_resched(); spin_lock(&files->file_lock); } } spin_unlock(&files->file_lock); } static struct file *__get_file_rcu(struct file __rcu **f) { struct file __rcu *file; struct file __rcu *file_reloaded; struct file __rcu *file_reloaded_cmp; file = rcu_dereference_raw(*f); if (!file) return NULL; if (unlikely(!file_ref_get(&file->f_ref))) return ERR_PTR(-EAGAIN); file_reloaded = rcu_dereference_raw(*f); /* * Ensure that all accesses have a dependency on the load from * rcu_dereference_raw() above so we get correct ordering * between reuse/allocation and the pointer check below. */ file_reloaded_cmp = file_reloaded; OPTIMIZER_HIDE_VAR(file_reloaded_cmp); /* * file_ref_get() above provided a full memory barrier when we * acquired a reference. * * This is paired with the write barrier from assigning to the * __rcu protected file pointer so that if that pointer still * matches the current file, we know we have successfully * acquired a reference to the right file. * * If the pointers don't match the file has been reallocated by * SLAB_TYPESAFE_BY_RCU. */ if (file == file_reloaded_cmp) return file_reloaded; fput(file); return ERR_PTR(-EAGAIN); } /** * get_file_rcu - try go get a reference to a file under rcu * @f: the file to get a reference on * * This function tries to get a reference on @f carefully verifying that * @f hasn't been reused. * * This function should rarely have to be used and only by users who * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. * * Return: Returns @f with the reference count increased or NULL. */ struct file *get_file_rcu(struct file __rcu **f) { for (;;) { struct file __rcu *file; file = __get_file_rcu(f); if (!IS_ERR(file)) return file; } } EXPORT_SYMBOL_GPL(get_file_rcu); /** * get_file_active - try go get a reference to a file * @f: the file to get a reference on * * In contast to get_file_rcu() the pointer itself isn't part of the * reference counting. * * This function should rarely have to be used and only by users who * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. * * Return: Returns @f with the reference count increased or NULL. 
*/ struct file *get_file_active(struct file **f) { struct file __rcu *file; rcu_read_lock(); file = __get_file_rcu(f); rcu_read_unlock(); if (IS_ERR(file)) file = NULL; return file; } EXPORT_SYMBOL_GPL(get_file_active); static inline struct file *__fget_files_rcu(struct files_struct *files, unsigned int fd, fmode_t mask) { for (;;) { struct file *file; struct fdtable *fdt = rcu_dereference_raw(files->fdt); struct file __rcu **fdentry; unsigned long nospec_mask; /* Mask is a 0 for invalid fd's, ~0 for valid ones */ nospec_mask = array_index_mask_nospec(fd, fdt->max_fds); /* * fdentry points to the 'fd' offset, or fdt->fd[0]. * Loading from fdt->fd[0] is always safe, because the * array always exists. */ fdentry = fdt->fd + (fd & nospec_mask); /* Do the load, then mask any invalid result */ file = rcu_dereference_raw(*fdentry); file = (void *)(nospec_mask & (unsigned long)file); if (unlikely(!file)) return NULL; /* * Ok, we have a file pointer that was valid at * some point, but it might have become stale since. * * We need to confirm it by incrementing the refcount * and then check the lookup again. * * file_ref_get() gives us a full memory barrier. We * only really need an 'acquire' one to protect the * loads below, but we don't have that. */ if (unlikely(!file_ref_get(&file->f_ref))) continue; /* * Such a race can take two forms: * * (a) the file ref already went down to zero and the * file hasn't been reused yet or the file count * isn't zero but the file has already been reused. * * (b) the file table entry has changed under us. * Note that we don't need to re-check the 'fdt->fd' * pointer having changed, because it always goes * hand-in-hand with 'fdt'. * * If so, we need to put our ref and try again. */ if (unlikely(file != rcu_dereference_raw(*fdentry)) || unlikely(rcu_dereference_raw(files->fdt) != fdt)) { fput(file); continue; } /* * This isn't the file we're looking for or we're not * allowed to get a reference to it. */ if (unlikely(file->f_mode & mask)) { fput(file); return NULL; } /* * Ok, we have a ref to the file, and checked that it * still exists. */ return file; } } static struct file *__fget_files(struct files_struct *files, unsigned int fd, fmode_t mask) { struct file *file; rcu_read_lock(); file = __fget_files_rcu(files, fd, mask); rcu_read_unlock(); return file; } static inline struct file *__fget(unsigned int fd, fmode_t mask) { return __fget_files(current->files, fd, mask); } struct file *fget(unsigned int fd) { return __fget(fd, FMODE_PATH); } EXPORT_SYMBOL(fget); struct file *fget_raw(unsigned int fd) { return __fget(fd, 0); } EXPORT_SYMBOL(fget_raw); struct file *fget_task(struct task_struct *task, unsigned int fd) { struct file *file = NULL; task_lock(task); if (task->files) file = __fget_files(task->files, fd, 0); task_unlock(task); return file; } struct file *fget_task_next(struct task_struct *task, unsigned int *ret_fd) { /* Must be called with rcu_read_lock held */ struct files_struct *files; unsigned int fd = *ret_fd; struct file *file = NULL; task_lock(task); files = task->files; if (files) { rcu_read_lock(); for (; fd < files_fdtable(files)->max_fds; fd++) { file = __fget_files_rcu(files, fd, 0); if (file) break; } rcu_read_unlock(); } task_unlock(task); *ret_fd = fd; return file; } EXPORT_SYMBOL(fget_task_next); /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. 
* * You can use this instead of fget if you satisfy all of the following * conditions: * 1) You must call fput_light before exiting the syscall and returning control * to userspace (i.e. you cannot remember the returned struct file * after * returning to userspace). * 2) You must not call filp_close on the returned struct file * in between * calls to fget_light and fput_light. * 3) You must not clone the current task in between the calls to fget_light * and fput_light. * * The fput_needed flag returned by fget_light should be passed to the * corresponding fput_light. * * (As an exception to rule 2, you can call filp_close between fget_light and * fput_light provided that you capture a real refcount with get_file before * the call to filp_close, and ensure that this real refcount is fput *after* * the fput_light call.) * * See also the documentation in rust/kernel/file.rs. */ static inline struct fd __fget_light(unsigned int fd, fmode_t mask) { struct files_struct *files = current->files; struct file *file; /* * If another thread is concurrently calling close_fd() followed * by put_files_struct(), we must not observe the old table * entry combined with the new refcount - otherwise we could * return a file that is concurrently being freed. * * atomic_read_acquire() pairs with atomic_dec_and_test() in * put_files_struct(). */ if (likely(atomic_read_acquire(&files->count) == 1)) { file = files_lookup_fd_raw(files, fd); if (!file || unlikely(file->f_mode & mask)) return EMPTY_FD; return BORROWED_FD(file); } else { file = __fget_files(files, fd, mask); if (!file) return EMPTY_FD; return CLONED_FD(file); } } struct fd fdget(unsigned int fd) { return __fget_light(fd, FMODE_PATH); } EXPORT_SYMBOL(fdget); struct fd fdget_raw(unsigned int fd) { return __fget_light(fd, 0); } /* * Try to avoid f_pos locking. We only need it if the * file is marked for FMODE_ATOMIC_POS, and it can be * accessed multiple ways. * * Always do it for directories, because pidfd_getfd() * can make a file accessible even if it otherwise would * not be, and for directories this is a correctness * issue, not a "POSIX requirement". */ static inline bool file_needs_f_pos_lock(struct file *file) { if (!(file->f_mode & FMODE_ATOMIC_POS)) return false; if (__file_ref_read_raw(&file->f_ref) != FILE_REF_ONEREF) return true; if (file->f_op->iterate_shared) return true; return false; } bool file_seek_cur_needs_f_lock(struct file *file) { if (!(file->f_mode & FMODE_ATOMIC_POS) && !file->f_op->iterate_shared) return false; /* * Note that we are not guaranteed to be called after fdget_pos() on * this file obj, in which case the caller is expected to provide the * appropriate locking. */ return true; } struct fd fdget_pos(unsigned int fd) { struct fd f = fdget(fd); struct file *file = fd_file(f); if (likely(file) && file_needs_f_pos_lock(file)) { f.word |= FDPUT_POS_UNLOCK; mutex_lock(&file->f_pos_lock); } return f; } void __f_unlock_pos(struct file *f) { mutex_unlock(&f->f_pos_lock); } /* * We only lock f_pos if we have threads or if the file might be * shared with another process. In both cases we'll have an elevated * file count (done either by fdget() or by fork()). 
*/ void set_close_on_exec(unsigned int fd, int flag) { struct files_struct *files = current->files; spin_lock(&files->file_lock); __set_close_on_exec(fd, files_fdtable(files), flag); spin_unlock(&files->file_lock); } bool get_close_on_exec(unsigned int fd) { bool res; rcu_read_lock(); res = close_on_exec(fd, current->files); rcu_read_unlock(); return res; } static int do_dup2(struct files_struct *files, struct file *file, unsigned fd, unsigned flags) __releases(&files->file_lock) { struct file *tofree; struct fdtable *fdt; /* * dup2() is expected to close the file installed in the target fd slot * (if any). However, userspace hand-picking a fd may be racing against * its own threads which happened to allocate it in open() et al but did * not populate it yet. * * Broadly speaking we may be racing against the following: * fd = get_unused_fd_flags(); // fd slot reserved, ->fd[fd] == NULL * file = hard_work_goes_here(); * fd_install(fd, file); // only now ->fd[fd] == file * * It is an invariant that a successfully allocated fd has a NULL entry * in the array until the matching fd_install(). * * If we fit the window, we have the fd to populate, yet no target file * to close. Trying to ignore it and install our new file would violate * the invariant and make fd_install() overwrite our file. * * Things can be done(tm) to handle this. However, the issue does not * concern legitimate programs and we only need to make sure the kernel * does not trip over it. * * The simplest way out is to return an error if we find ourselves here. * * POSIX is silent on the issue; we return -EBUSY. */ fdt = files_fdtable(files); fd = array_index_nospec(fd, fdt->max_fds); tofree = rcu_dereference_raw(fdt->fd[fd]); if (!tofree && fd_is_open(fd, fdt)) goto Ebusy; get_file(file); rcu_assign_pointer(fdt->fd[fd], file); __set_open_fd(fd, fdt, flags & O_CLOEXEC); spin_unlock(&files->file_lock); if (tofree) filp_close(tofree, files); return fd; Ebusy: spin_unlock(&files->file_lock); return -EBUSY; } int replace_fd(unsigned fd, struct file *file, unsigned flags) { int err; struct files_struct *files = current->files; if (!file) return close_fd(fd); if (fd >= rlimit(RLIMIT_NOFILE)) return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, fd); if (unlikely(err < 0)) goto out_unlock; err = do_dup2(files, file, fd, flags); if (err < 0) return err; return 0; out_unlock: spin_unlock(&files->file_lock); return err; } /** * receive_fd() - Install received file into file descriptor table * @file: struct file that was received from another process * @ufd: __user pointer to write new fd number to * @o_flags: the O_* flags to apply to the new fd entry * * Installs a received file into the file descriptor table, with appropriate * checks and count updates. Optionally writes the fd number to userspace, if * @ufd is non-NULL. * * This helper handles its own reference counting of the incoming * struct file. * * Returns newly installed fd or -ve on error.
*/ int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) { int error; error = security_file_receive(file); if (error) return error; FD_PREPARE(fdf, o_flags, file); if (fdf.err) return fdf.err; get_file(file); if (ufd) { error = put_user(fd_prepare_fd(fdf), ufd); if (error) return error; } __receive_sock(fd_prepare_file(fdf)); return fd_publish(fdf); } EXPORT_SYMBOL_GPL(receive_fd); int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) { int error; error = security_file_receive(file); if (error) return error; error = replace_fd(new_fd, file, o_flags); if (error) return error; __receive_sock(file); return new_fd; } static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; struct file *file; struct files_struct *files = current->files; if ((flags & ~O_CLOEXEC) != 0) return -EINVAL; if (unlikely(oldfd == newfd)) return -EINVAL; if (newfd >= rlimit(RLIMIT_NOFILE)) return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, newfd); file = files_lookup_fd_locked(files, oldfd); if (unlikely(!file)) goto Ebadf; if (unlikely(err < 0)) { if (err == -EMFILE) goto Ebadf; goto out_unlock; } return do_dup2(files, file, newfd, flags); Ebadf: err = -EBADF; out_unlock: spin_unlock(&files->file_lock); return err; } SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) { return ksys_dup3(oldfd, newfd, flags); } SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) { if (unlikely(newfd == oldfd)) { /* corner case */ struct files_struct *files = current->files; struct file *f; int retval = oldfd; rcu_read_lock(); f = __fget_files_rcu(files, oldfd, 0); if (!f) retval = -EBADF; rcu_read_unlock(); if (f) fput(f); return retval; } return ksys_dup3(oldfd, newfd, 0); } SYSCALL_DEFINE1(dup, unsigned int, fildes) { int ret = -EBADF; struct file *file = fget_raw(fildes); if (file) { ret = get_unused_fd_flags(0); if (ret >= 0) fd_install(ret, file); else fput(file); } return ret; } int f_dupfd(unsigned int from, struct file *file, unsigned flags) { unsigned long nofile = rlimit(RLIMIT_NOFILE); int err; if (from >= nofile) return -EINVAL; err = alloc_fd(from, nofile, flags); if (err >= 0) { get_file(file); fd_install(err, file); } return err; } int iterate_fd(struct files_struct *files, unsigned n, int (*f)(const void *, struct file *, unsigned), const void *p) { struct fdtable *fdt; int res = 0; if (!files) return 0; spin_lock(&files->file_lock); for (fdt = files_fdtable(files); n < fdt->max_fds; n++) { struct file *file; file = rcu_dereference_check_fdtable(files, fdt->fd[n]); if (!file) continue; res = f(p, file, n); if (res) break; } spin_unlock(&files->file_lock); return res; } EXPORT_SYMBOL(iterate_fd);
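/*
 * Illustrative sketch, not part of this file: the reserve-then-install
 * pattern that the comments in alloc_fd(), fd_install() and do_dup2()
 * above rely on. example_create_fd() and example_fops are hypothetical;
 * get_unused_fd_flags(), anon_inode_getfile(), put_unused_fd() and
 * fd_install() are the real interfaces. Assumes <linux/anon_inodes.h>
 * and <linux/file.h>.
 */
static const struct file_operations example_fops;	/* would carry real ->read/->write/... */

static int example_create_fd(void *priv)
{
	struct file *file;
	int fd;

	/* Reserve a slot; fdt->fd[fd] stays NULL until fd_install() below. */
	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		return fd;

	file = anon_inode_getfile("[example]", &example_fops, priv, O_RDWR);
	if (IS_ERR(file)) {
		/* Back out: release the still-unpopulated descriptor. */
		put_unused_fd(fd);
		return PTR_ERR(file);
	}

	/* Publish the file; this consumes the file reference. */
	fd_install(fd, file);
	return fd;
}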
// SPDX-License-Identifier: GPL-2.0-or-later /* Provide a way to create a superblock configuration context within the kernel * that allows a superblock to be set up prior to mounting. * * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/magic.h> #include <linux/security.h> #include <linux/mnt_namespace.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <net/net_namespace.h> #include <asm/sections.h> #include "mount.h" #include "internal.h" static const struct constant_table common_set_sb_flag[] = { { "dirsync", SB_DIRSYNC }, { "lazytime", SB_LAZYTIME }, { "mand", SB_MANDLOCK }, { "ro", SB_RDONLY }, { "sync", SB_SYNCHRONOUS }, { }, }; static const struct constant_table common_clear_sb_flag[] = { { "async", SB_SYNCHRONOUS }, { "nolazytime", SB_LAZYTIME }, { "nomand", SB_MANDLOCK }, { "rw", SB_RDONLY }, { }, }; /* * Check for a common mount option that manipulates s_flags.
*/ static int vfs_parse_sb_flag(struct fs_context *fc, const char *key) { unsigned int token; token = lookup_constant(common_set_sb_flag, key, 0); if (token) { fc->sb_flags |= token; fc->sb_flags_mask |= token; return 0; } token = lookup_constant(common_clear_sb_flag, key, 0); if (token) { fc->sb_flags &= ~token; fc->sb_flags_mask |= token; return 0; } return -ENOPARAM; } /** * vfs_parse_fs_param_source - Handle setting "source" via parameter * @fc: The filesystem context to modify * @param: The parameter * * This is a simple helper for filesystems to verify that the "source" they * accept is sane. * * Returns 0 on success, -ENOPARAM if this is not the "source" parameter, and * -EINVAL otherwise. In the event of failure, supplementary error information * is logged. */ int vfs_parse_fs_param_source(struct fs_context *fc, struct fs_parameter *param) { if (strcmp(param->key, "source") != 0) return -ENOPARAM; if (param->type != fs_value_is_string) return invalf(fc, "Non-string source"); if (fc->source) return invalf(fc, "Multiple sources"); fc->source = param->string; param->string = NULL; return 0; } EXPORT_SYMBOL(vfs_parse_fs_param_source); /** * vfs_parse_fs_param - Add a single parameter to a superblock config * @fc: The filesystem context to modify * @param: The parameter * * A single mount option in string form is applied to the filesystem context * being set up. Certain standard options (for example "ro") are translated * into flag bits without going to the filesystem. The active security module * is allowed to observe and poach options. Any other options are passed over * to the filesystem to parse. * * This may be called multiple times for a context. * * Returns 0 on success and a negative error code on failure. In the event of * failure, supplementary error information may have been set. */ int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param) { int ret; if (!param->key) return invalf(fc, "Unnamed parameter\n"); ret = vfs_parse_sb_flag(fc, param->key); if (ret != -ENOPARAM) return ret; ret = security_fs_context_parse_param(fc, param); if (ret != -ENOPARAM) /* Param belongs to the LSM or is disallowed by the LSM; so * don't pass to the FS. */ return ret; if (fc->ops->parse_param) { ret = fc->ops->parse_param(fc, param); if (ret != -ENOPARAM) return ret; } /* If the filesystem doesn't take any arguments, give it the * default handling of source. */ ret = vfs_parse_fs_param_source(fc, param); if (ret != -ENOPARAM) return ret; return invalf(fc, "%s: Unknown parameter '%s'", fc->fs_type->name, param->key); } EXPORT_SYMBOL(vfs_parse_fs_param); /** * vfs_parse_fs_qstr - Convenience function to just parse a string. * @fc: Filesystem context. * @key: Parameter name. * @value: Default value. */ int vfs_parse_fs_qstr(struct fs_context *fc, const char *key, const struct qstr *value) { int ret; struct fs_parameter param = { .key = key, .type = fs_value_is_flag, .size = value ? value->len : 0, }; if (value) { param.string = kmemdup_nul(value->name, value->len, GFP_KERNEL); if (!param.string) return -ENOMEM; param.type = fs_value_is_string; } ret = vfs_parse_fs_param(fc, &param); kfree(param.string); return ret; } EXPORT_SYMBOL(vfs_parse_fs_qstr); /** * vfs_parse_monolithic_sep - Parse key[=val][,key[=val]]* mount data * @fc: The superblock configuration to fill in. * @data: The data to parse * @sep: callback for separating next option * * Parse a blob of data that's in key[=val][,key[=val]]* form with a custom * option separator callback.
* * Returns 0 on success or the error returned by the ->parse_param() fs_context * operation on failure. */ int vfs_parse_monolithic_sep(struct fs_context *fc, void *data, char *(*sep)(char **)) { char *options = data, *key; int ret = 0; if (!options) return 0; ret = security_sb_eat_lsm_opts(options, &fc->security); if (ret) return ret; while ((key = sep(&options)) != NULL) { if (*key) { char *value = strchr(key, '='); if (value) { if (unlikely(value == key)) continue; *value++ = 0; } ret = vfs_parse_fs_string(fc, key, value); if (ret < 0) break; } } return ret; } EXPORT_SYMBOL(vfs_parse_monolithic_sep); static char *vfs_parse_comma_sep(char **s) { return strsep(s, ","); } /** * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data * @fc: The superblock configuration to fill in. * @data: The data to parse * * Parse a blob of data that's in key[=val][,key[=val]]* form. This can be * called from the ->parse_monolithic() fs_context operation. * * Returns 0 on success or the error returned by the ->parse_param() fs_context * operation on failure. */ int generic_parse_monolithic(struct fs_context *fc, void *data) { return vfs_parse_monolithic_sep(fc, data, vfs_parse_comma_sep); } EXPORT_SYMBOL(generic_parse_monolithic); /** * alloc_fs_context - Create a filesystem context. * @fs_type: The filesystem type. * @reference: The dentry from which this one derives (or NULL) * @sb_flags: Filesystem/superblock flags (SB_*) * @sb_flags_mask: Applicable members of @sb_flags * @purpose: The purpose that this configuration shall be used for. * * Open a filesystem and create a mount context. The mount context is * initialised with the supplied flags and, if a submount/automount from * another superblock (referred to by @reference) is supplied, may have * parameters such as namespaces copied across from that superblock.
*/ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type, struct dentry *reference, unsigned int sb_flags, unsigned int sb_flags_mask, enum fs_context_purpose purpose) { struct fs_context *fc; int ret = -ENOMEM; fc = kzalloc_obj(struct fs_context, GFP_KERNEL_ACCOUNT); if (!fc) return ERR_PTR(-ENOMEM); fc->purpose = purpose; fc->sb_flags = sb_flags; fc->sb_flags_mask = sb_flags_mask; fc->fs_type = get_filesystem(fs_type); fc->cred = get_current_cred(); fc->net_ns = get_net(current->nsproxy->net_ns); fc->log.prefix = fs_type->name; mutex_init(&fc->uapi_mutex); switch (purpose) { case FS_CONTEXT_FOR_MOUNT: fc->user_ns = get_user_ns(fc->cred->user_ns); break; case FS_CONTEXT_FOR_SUBMOUNT: fc->user_ns = get_user_ns(reference->d_sb->s_user_ns); break; case FS_CONTEXT_FOR_RECONFIGURE: atomic_inc(&reference->d_sb->s_active); fc->user_ns = get_user_ns(reference->d_sb->s_user_ns); fc->root = dget(reference); break; } ret = fc->fs_type->init_fs_context(fc); if (ret < 0) goto err_fc; fc->need_free = true; return fc; err_fc: put_fs_context(fc); return ERR_PTR(ret); } struct fs_context *fs_context_for_mount(struct file_system_type *fs_type, unsigned int sb_flags) { return alloc_fs_context(fs_type, NULL, sb_flags, 0, FS_CONTEXT_FOR_MOUNT); } EXPORT_SYMBOL(fs_context_for_mount); struct fs_context *fs_context_for_reconfigure(struct dentry *dentry, unsigned int sb_flags, unsigned int sb_flags_mask) { return alloc_fs_context(dentry->d_sb->s_type, dentry, sb_flags, sb_flags_mask, FS_CONTEXT_FOR_RECONFIGURE); } /** * fs_context_for_submount: allocate a new fs_context for a submount * @type: file_system_type of the new context * @reference: reference dentry from which to copy relevant info * * Allocate a new fs_context suitable for a submount. This also ensures that * the fc->security object is inherited from @reference (if needed). */ struct fs_context *fs_context_for_submount(struct file_system_type *type, struct dentry *reference) { struct fs_context *fc; int ret; fc = alloc_fs_context(type, reference, 0, 0, FS_CONTEXT_FOR_SUBMOUNT); if (IS_ERR(fc)) return fc; ret = security_fs_context_submount(fc, reference->d_sb); if (ret) { put_fs_context(fc); return ERR_PTR(ret); } return fc; } EXPORT_SYMBOL(fs_context_for_submount); void fc_drop_locked(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; dput(fc->root); fc->root = NULL; deactivate_locked_super(sb); } /** * vfs_dup_fs_context - Duplicate a filesystem context. * @src_fc: The context to copy. */ struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc) { struct fs_context *fc; int ret; if (!src_fc->ops->dup) return ERR_PTR(-EOPNOTSUPP); fc = kmemdup(src_fc, sizeof(struct fs_context), GFP_KERNEL); if (!fc) return ERR_PTR(-ENOMEM); mutex_init(&fc->uapi_mutex); fc->fs_private = NULL; fc->s_fs_info = NULL; fc->source = NULL; fc->security = NULL; get_filesystem(fc->fs_type); get_net(fc->net_ns); get_user_ns(fc->user_ns); get_cred(fc->cred); if (fc->log.log) refcount_inc(&fc->log.log->usage); /* Can't call put until we've called ->dup */ ret = fc->ops->dup(fc, src_fc); if (ret < 0) goto err_fc; ret = security_fs_context_dup(fc, src_fc); if (ret < 0) goto err_fc; return fc; err_fc: put_fs_context(fc); return ERR_PTR(ret); } EXPORT_SYMBOL(vfs_dup_fs_context); /** * logfc - Log a message to a filesystem context * @log: The filesystem context to log to, or NULL to use printk. * @prefix: A string to prefix the output with, or NULL. * @level: 'w' for a warning, 'e' for an error. Anything else is a notice. 
* @fmt: The format of the buffer. */ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, ...) { va_list va; struct va_format vaf = {.fmt = fmt, .va = &va}; va_start(va, fmt); if (!log) { switch (level) { case 'w': printk(KERN_WARNING "%s%s%pV\n", prefix ? prefix : "", prefix ? ": " : "", &vaf); break; case 'e': printk(KERN_ERR "%s%s%pV\n", prefix ? prefix : "", prefix ? ": " : "", &vaf); break; case 'i': printk(KERN_INFO "%s%s%pV\n", prefix ? prefix : "", prefix ? ": " : "", &vaf); break; default: printk(KERN_NOTICE "%s%s%pV\n", prefix ? prefix : "", prefix ? ": " : "", &vaf); break; } } else { unsigned int logsize = ARRAY_SIZE(log->buffer); u8 index; char *q = kasprintf(GFP_KERNEL, "%c %s%s%pV\n", level, prefix ? prefix : "", prefix ? ": " : "", &vaf); index = log->head & (logsize - 1); BUILD_BUG_ON(sizeof(log->head) != sizeof(u8) || sizeof(log->tail) != sizeof(u8)); if ((u8)(log->head - log->tail) == logsize) { /* The buffer is full, discard the oldest message */ if (log->need_free & (1 << index)) kfree(log->buffer[index]); log->tail++; } log->buffer[index] = q ? q : "OOM: Can't store error string"; if (q) log->need_free |= 1 << index; else log->need_free &= ~(1 << index); log->head++; } va_end(va); } EXPORT_SYMBOL(logfc); /* * Free a logging structure. */ static void put_fc_log(struct fs_context *fc) { struct fc_log *log = fc->log.log; int i; if (log) { if (refcount_dec_and_test(&log->usage)) { fc->log.log = NULL; for (i = 0; i < ARRAY_SIZE(log->buffer) ; i++) if (log->need_free & (1 << i)) kfree(log->buffer[i]); kfree(log); } } } /** * put_fs_context - Dispose of a superblock configuration context. * @fc: The context to dispose of. */ void put_fs_context(struct fs_context *fc) { struct super_block *sb; if (fc->root) { sb = fc->root->d_sb; dput(fc->root); fc->root = NULL; deactivate_super(sb); } if (fc->need_free && fc->ops && fc->ops->free) fc->ops->free(fc); security_free_mnt_opts(&fc->security); put_net(fc->net_ns); put_user_ns(fc->user_ns); put_cred(fc->cred); put_fc_log(fc); put_filesystem(fc->fs_type); kfree(fc->source); kfree(fc); } EXPORT_SYMBOL(put_fs_context); int parse_monolithic_mount_data(struct fs_context *fc, void *data) { int (*monolithic_mount_data)(struct fs_context *, void *); monolithic_mount_data = fc->ops->parse_monolithic; if (!monolithic_mount_data) monolithic_mount_data = generic_parse_monolithic; return monolithic_mount_data(fc, data); } /* * Clean up a context after performing an action on it and put it into a state * from where it can be used to reconfigure a superblock. * * Note that here we do only the parts that can't fail; the rest is in * finish_clean_context() below and in between those fs_context is marked * FS_CONTEXT_AWAITING_RECONF. The reason for splitup is that after * successful mount or remount we need to report success to userland. * Trying to do full reinit (for the sake of possible subsequent remount) * and failing to allocate memory would've put us into a nasty situation. * So here we only discard the old state and reinitialization is left * until we actually try to reconfigure. 
*/ void vfs_clean_context(struct fs_context *fc) { if (fc->need_free && fc->ops && fc->ops->free) fc->ops->free(fc); fc->need_free = false; fc->fs_private = NULL; fc->s_fs_info = NULL; fc->sb_flags = 0; security_free_mnt_opts(&fc->security); kfree(fc->source); fc->source = NULL; fc->exclusive = false; fc->purpose = FS_CONTEXT_FOR_RECONFIGURE; fc->phase = FS_CONTEXT_AWAITING_RECONF; } int finish_clean_context(struct fs_context *fc) { int error; if (fc->phase != FS_CONTEXT_AWAITING_RECONF) return 0; error = fc->fs_type->init_fs_context(fc); if (unlikely(error)) { fc->phase = FS_CONTEXT_FAILED; return error; } fc->need_free = true; fc->phase = FS_CONTEXT_RECONF_PARAMS; return 0; }
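/*
 * Illustrative sketch (editor's addition, not part of fs_context.c): how
 * in-kernel code typically drives the API above to mount a filesystem.
 * fc_mount() and get_fs_type() are declared elsewhere (<linux/fs_context.h>,
 * <linux/fs.h>); the three-argument vfs_parse_fs_string() form matches its
 * use in vfs_parse_monolithic_sep() above.  Error handling is abbreviated.
 */
#if 0	/* example only */
static struct vfsmount *example_mount_tmpfs(void)
{
	struct file_system_type *type = get_fs_type("tmpfs");
	struct fs_context *fc;
	struct vfsmount *mnt;
	int err;

	if (!type)
		return ERR_PTR(-ENODEV);

	/* Allocate a configuration context for a fresh mount of @type. */
	fc = fs_context_for_mount(type, 0);
	put_filesystem(type);
	if (IS_ERR(fc))
		return ERR_CAST(fc);

	/* "source" is handled by vfs_parse_fs_param_source() if the fs
	 * doesn't claim it; "mode" is passed through to tmpfs itself. */
	err = vfs_parse_fs_string(fc, "source", "tmpfs");
	if (!err)
		err = vfs_parse_fs_string(fc, "mode", "0755");

	/* Create (or reuse) the superblock and wrap it in a vfsmount. */
	mnt = err ? ERR_PTR(err) : fc_mount(fc);
	put_fs_context(fc);
	return mnt;
}
#endif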
// SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/tomoyo.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include <linux/lsm_hooks.h> #include <uapi/linux/lsm.h> #include "common.h" /** * tomoyo_domain - Get "struct tomoyo_domain_info" for current thread. * * Returns pointer to "struct tomoyo_domain_info" for current thread. */ struct tomoyo_domain_info *tomoyo_domain(void) { struct tomoyo_task *s = tomoyo_task(current); if (s->old_domain_info && !current->in_execve) { atomic_dec(&s->old_domain_info->users); s->old_domain_info = NULL; } return s->domain_info; } /** * tomoyo_cred_prepare - Target for security_prepare_creds(). * * @new: Pointer to "struct cred". * @old: Pointer to "struct cred". * @gfp: Memory allocation flags. * * Returns 0. */ static int tomoyo_cred_prepare(struct cred *new, const struct cred *old, gfp_t gfp) { /* Restore old_domain_info saved by previous execve() request.
*/ struct tomoyo_task *s = tomoyo_task(current); if (s->old_domain_info && !current->in_execve) { atomic_dec(&s->domain_info->users); s->domain_info = s->old_domain_info; s->old_domain_info = NULL; } return 0; } /** * tomoyo_bprm_committed_creds - Target for security_bprm_committed_creds(). * * @bprm: Pointer to "struct linux_binprm". */ static void tomoyo_bprm_committed_creds(const struct linux_binprm *bprm) { /* Clear old_domain_info saved by execve() request. */ struct tomoyo_task *s = tomoyo_task(current); atomic_dec(&s->old_domain_info->users); s->old_domain_info = NULL; } #ifndef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER /** * tomoyo_bprm_creds_for_exec - Target for security_bprm_creds_for_exec(). * * @bprm: Pointer to "struct linux_binprm". * * Returns 0. */ static int tomoyo_bprm_creds_for_exec(struct linux_binprm *bprm) { /* * Load policy if /sbin/tomoyo-init exists and /sbin/init is requested * for the first time. */ if (!tomoyo_policy_loaded) tomoyo_load_policy(bprm->filename); return 0; } #endif /** * tomoyo_bprm_check_security - Target for security_bprm_check(). * * @bprm: Pointer to "struct linux_binprm". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_bprm_check_security(struct linux_binprm *bprm) { struct tomoyo_task *s = tomoyo_task(current); /* * Execute permission is checked against pathname passed to execve() * using current domain. */ if (!s->old_domain_info) { const int idx = tomoyo_read_lock(); const int err = tomoyo_find_next_domain(bprm); tomoyo_read_unlock(idx); return err; } /* * Read permission is checked against interpreters using next domain. */ return tomoyo_check_open_permission(s->domain_info, &bprm->file->f_path, O_RDONLY); } /** * tomoyo_inode_getattr - Target for security_inode_getattr(). * * @path: Pointer to "struct path". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_inode_getattr(const struct path *path) { return tomoyo_path_perm(TOMOYO_TYPE_GETATTR, path, NULL); } /** * tomoyo_path_truncate - Target for security_path_truncate(). * * @path: Pointer to "struct path". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_truncate(const struct path *path) { return tomoyo_path_perm(TOMOYO_TYPE_TRUNCATE, path, NULL); } /** * tomoyo_file_truncate - Target for security_file_truncate(). * * @file: Pointer to "struct file". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_file_truncate(struct file *file) { return tomoyo_path_truncate(&file->f_path); } /** * tomoyo_path_unlink - Target for security_path_unlink(). * * @parent: Pointer to "struct path". * @dentry: Pointer to "struct dentry". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_unlink(const struct path *parent, struct dentry *dentry) { struct path path = { .mnt = parent->mnt, .dentry = dentry }; return tomoyo_path_perm(TOMOYO_TYPE_UNLINK, &path, NULL); } /** * tomoyo_path_mkdir - Target for security_path_mkdir(). * * @parent: Pointer to "struct path". * @dentry: Pointer to "struct dentry". * @mode: DAC permission mode. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_mkdir(const struct path *parent, struct dentry *dentry, umode_t mode) { struct path path = { .mnt = parent->mnt, .dentry = dentry }; return tomoyo_path_number_perm(TOMOYO_TYPE_MKDIR, &path, mode & S_IALLUGO); } /** * tomoyo_path_rmdir - Target for security_path_rmdir(). * * @parent: Pointer to "struct path". * @dentry: Pointer to "struct dentry". 
* * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_rmdir(const struct path *parent, struct dentry *dentry) { struct path path = { .mnt = parent->mnt, .dentry = dentry }; return tomoyo_path_perm(TOMOYO_TYPE_RMDIR, &path, NULL); } /** * tomoyo_path_symlink - Target for security_path_symlink(). * * @parent: Pointer to "struct path". * @dentry: Pointer to "struct dentry". * @old_name: Symlink's content. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_symlink(const struct path *parent, struct dentry *dentry, const char *old_name) { struct path path = { .mnt = parent->mnt, .dentry = dentry }; return tomoyo_path_perm(TOMOYO_TYPE_SYMLINK, &path, old_name); } /** * tomoyo_path_mknod - Target for security_path_mknod(). * * @parent: Pointer to "struct path". * @dentry: Pointer to "struct dentry". * @mode: DAC permission mode. * @dev: Device attributes. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_mknod(const struct path *parent, struct dentry *dentry, umode_t mode, unsigned int dev) { struct path path = { .mnt = parent->mnt, .dentry = dentry }; int type = TOMOYO_TYPE_CREATE; const unsigned int perm = mode & S_IALLUGO; switch (mode & S_IFMT) { case S_IFCHR: type = TOMOYO_TYPE_MKCHAR; break; case S_IFBLK: type = TOMOYO_TYPE_MKBLOCK; break; default: goto no_dev; } return tomoyo_mkdev_perm(type, &path, perm, dev); no_dev: switch (mode & S_IFMT) { case S_IFIFO: type = TOMOYO_TYPE_MKFIFO; break; case S_IFSOCK: type = TOMOYO_TYPE_MKSOCK; break; } return tomoyo_path_number_perm(type, &path, perm); } /** * tomoyo_path_link - Target for security_path_link(). * * @old_dentry: Pointer to "struct dentry". * @new_dir: Pointer to "struct path". * @new_dentry: Pointer to "struct dentry". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_link(struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) { struct path path1 = { .mnt = new_dir->mnt, .dentry = old_dentry }; struct path path2 = { .mnt = new_dir->mnt, .dentry = new_dentry }; return tomoyo_path2_perm(TOMOYO_TYPE_LINK, &path1, &path2); } /** * tomoyo_path_rename - Target for security_path_rename(). * * @old_parent: Pointer to "struct path". * @old_dentry: Pointer to "struct dentry". * @new_parent: Pointer to "struct path". * @new_dentry: Pointer to "struct dentry". * @flags: Rename options. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_rename(const struct path *old_parent, struct dentry *old_dentry, const struct path *new_parent, struct dentry *new_dentry, const unsigned int flags) { struct path path1 = { .mnt = old_parent->mnt, .dentry = old_dentry }; struct path path2 = { .mnt = new_parent->mnt, .dentry = new_dentry }; if (flags & RENAME_EXCHANGE) { const int err = tomoyo_path2_perm(TOMOYO_TYPE_RENAME, &path2, &path1); if (err) return err; } return tomoyo_path2_perm(TOMOYO_TYPE_RENAME, &path1, &path2); } /** * tomoyo_file_fcntl - Target for security_file_fcntl(). * * @file: Pointer to "struct file". * @cmd: Command for fcntl(). * @arg: Argument for @cmd. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_file_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { if (!(cmd == F_SETFL && ((arg ^ file->f_flags) & O_APPEND))) return 0; return tomoyo_check_open_permission(tomoyo_domain(), &file->f_path, O_WRONLY | (arg & O_APPEND)); } /** * tomoyo_file_open - Target for security_file_open(). * * @f: Pointer to "struct file". 
* * Returns 0 on success, negative value otherwise. */ static int tomoyo_file_open(struct file *f) { /* Don't check read permission here if called from execve(). */ /* Illogically, FMODE_EXEC is in f_flags, not f_mode. */ if (f->f_flags & __FMODE_EXEC) return 0; return tomoyo_check_open_permission(tomoyo_domain(), &f->f_path, f->f_flags); } /** * tomoyo_file_ioctl - Target for security_file_ioctl(). * * @file: Pointer to "struct file". * @cmd: Command for ioctl(). * @arg: Argument for @cmd. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return tomoyo_path_number_perm(TOMOYO_TYPE_IOCTL, &file->f_path, cmd); } /** * tomoyo_path_chmod - Target for security_path_chmod(). * * @path: Pointer to "struct path". * @mode: DAC permission mode. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_chmod(const struct path *path, umode_t mode) { return tomoyo_path_number_perm(TOMOYO_TYPE_CHMOD, path, mode & S_IALLUGO); } /** * tomoyo_path_chown - Target for security_path_chown(). * * @path: Pointer to "struct path". * @uid: Owner ID. * @gid: Group ID. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_chown(const struct path *path, kuid_t uid, kgid_t gid) { int error = 0; if (uid_valid(uid)) error = tomoyo_path_number_perm(TOMOYO_TYPE_CHOWN, path, from_kuid(&init_user_ns, uid)); if (!error && gid_valid(gid)) error = tomoyo_path_number_perm(TOMOYO_TYPE_CHGRP, path, from_kgid(&init_user_ns, gid)); return error; } /** * tomoyo_path_chroot - Target for security_path_chroot(). * * @path: Pointer to "struct path". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_chroot(const struct path *path) { return tomoyo_path_perm(TOMOYO_TYPE_CHROOT, path, NULL); } /** * tomoyo_sb_mount - Target for security_sb_mount(). * * @dev_name: Name of device file. Maybe NULL. * @path: Pointer to "struct path". * @type: Name of filesystem type. Maybe NULL. * @flags: Mount options. * @data: Optional data. Maybe NULL. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_sb_mount(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data) { return tomoyo_mount_permission(dev_name, path, type, flags, data); } /** * tomoyo_sb_umount - Target for security_sb_umount(). * * @mnt: Pointer to "struct vfsmount". * @flags: Unmount options. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_sb_umount(struct vfsmount *mnt, int flags) { struct path path = { .mnt = mnt, .dentry = mnt->mnt_root }; return tomoyo_path_perm(TOMOYO_TYPE_UMOUNT, &path, NULL); } /** * tomoyo_sb_pivotroot - Target for security_sb_pivotroot(). * * @old_path: Pointer to "struct path". * @new_path: Pointer to "struct path". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_sb_pivotroot(const struct path *old_path, const struct path *new_path) { return tomoyo_path2_perm(TOMOYO_TYPE_PIVOT_ROOT, new_path, old_path); } /** * tomoyo_socket_listen - Check permission for listen(). * * @sock: Pointer to "struct socket". * @backlog: Backlog parameter. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_socket_listen(struct socket *sock, int backlog) { return tomoyo_socket_listen_permission(sock); } /** * tomoyo_socket_connect - Check permission for connect(). * * @sock: Pointer to "struct socket". * @addr: Pointer to "struct sockaddr". * @addr_len: Size of @addr. 
* * Returns 0 on success, negative value otherwise. */ static int tomoyo_socket_connect(struct socket *sock, struct sockaddr *addr, int addr_len) { return tomoyo_socket_connect_permission(sock, addr, addr_len); } /** * tomoyo_socket_bind - Check permission for bind(). * * @sock: Pointer to "struct socket". * @addr: Pointer to "struct sockaddr". * @addr_len: Size of @addr. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_socket_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { return tomoyo_socket_bind_permission(sock, addr, addr_len); } /** * tomoyo_socket_sendmsg - Check permission for sendmsg(). * * @sock: Pointer to "struct socket". * @msg: Pointer to "struct msghdr". * @size: Size of message. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size) { return tomoyo_socket_sendmsg_permission(sock, msg, size); } struct lsm_blob_sizes tomoyo_blob_sizes __ro_after_init = { .lbs_task = sizeof(struct tomoyo_task), }; /** * tomoyo_task_alloc - Target for security_task_alloc(). * * @task: Pointer to "struct task_struct". * @clone_flags: clone() flags. * * Returns 0. */ static int tomoyo_task_alloc(struct task_struct *task, u64 clone_flags) { struct tomoyo_task *old = tomoyo_task(current); struct tomoyo_task *new = tomoyo_task(task); new->domain_info = old->domain_info; atomic_inc(&new->domain_info->users); new->old_domain_info = NULL; return 0; } /** * tomoyo_task_free - Target for security_task_free(). * * @task: Pointer to "struct task_struct". */ static void tomoyo_task_free(struct task_struct *task) { struct tomoyo_task *s = tomoyo_task(task); if (s->domain_info) { atomic_dec(&s->domain_info->users); s->domain_info = NULL; } if (s->old_domain_info) { atomic_dec(&s->old_domain_info->users); s->old_domain_info = NULL; } } static const struct lsm_id tomoyo_lsmid = { .name = "tomoyo", .id = LSM_ID_TOMOYO, }; /* tomoyo_hooks is used for registering TOMOYO. 
*/ static struct security_hook_list tomoyo_hooks[] __ro_after_init = { LSM_HOOK_INIT(cred_prepare, tomoyo_cred_prepare), LSM_HOOK_INIT(bprm_committed_creds, tomoyo_bprm_committed_creds), LSM_HOOK_INIT(task_alloc, tomoyo_task_alloc), LSM_HOOK_INIT(task_free, tomoyo_task_free), #ifndef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER LSM_HOOK_INIT(bprm_creds_for_exec, tomoyo_bprm_creds_for_exec), #endif LSM_HOOK_INIT(bprm_check_security, tomoyo_bprm_check_security), LSM_HOOK_INIT(file_fcntl, tomoyo_file_fcntl), LSM_HOOK_INIT(file_open, tomoyo_file_open), LSM_HOOK_INIT(file_truncate, tomoyo_file_truncate), LSM_HOOK_INIT(path_truncate, tomoyo_path_truncate), LSM_HOOK_INIT(path_unlink, tomoyo_path_unlink), LSM_HOOK_INIT(path_mkdir, tomoyo_path_mkdir), LSM_HOOK_INIT(path_rmdir, tomoyo_path_rmdir), LSM_HOOK_INIT(path_symlink, tomoyo_path_symlink), LSM_HOOK_INIT(path_mknod, tomoyo_path_mknod), LSM_HOOK_INIT(path_link, tomoyo_path_link), LSM_HOOK_INIT(path_rename, tomoyo_path_rename), LSM_HOOK_INIT(inode_getattr, tomoyo_inode_getattr), LSM_HOOK_INIT(file_ioctl, tomoyo_file_ioctl), LSM_HOOK_INIT(file_ioctl_compat, tomoyo_file_ioctl), LSM_HOOK_INIT(path_chmod, tomoyo_path_chmod), LSM_HOOK_INIT(path_chown, tomoyo_path_chown), LSM_HOOK_INIT(path_chroot, tomoyo_path_chroot), LSM_HOOK_INIT(sb_mount, tomoyo_sb_mount), LSM_HOOK_INIT(sb_umount, tomoyo_sb_umount), LSM_HOOK_INIT(sb_pivotroot, tomoyo_sb_pivotroot), LSM_HOOK_INIT(socket_bind, tomoyo_socket_bind), LSM_HOOK_INIT(socket_connect, tomoyo_socket_connect), LSM_HOOK_INIT(socket_listen, tomoyo_socket_listen), LSM_HOOK_INIT(socket_sendmsg, tomoyo_socket_sendmsg), }; /* Lock for GC. */ DEFINE_SRCU(tomoyo_ss); int tomoyo_enabled __ro_after_init = 1; /** * tomoyo_init - Register TOMOYO Linux as an LSM module. * * Returns 0. */ static int __init tomoyo_init(void) { struct tomoyo_task *s = tomoyo_task(current); /* register ourselves with the security framework */ security_add_hooks(tomoyo_hooks, ARRAY_SIZE(tomoyo_hooks), &tomoyo_lsmid); pr_info("TOMOYO Linux initialized\n"); s->domain_info = &tomoyo_kernel_domain; atomic_inc(&tomoyo_kernel_domain.users); s->old_domain_info = NULL; tomoyo_mm_init(); return 0; } DEFINE_LSM(tomoyo) = { .id = &tomoyo_lsmid, .enabled = &tomoyo_enabled, .flags = LSM_FLAG_LEGACY_MAJOR, .blobs = &tomoyo_blob_sizes, .init = tomoyo_init, .initcall_fs = tomoyo_interface_init, };
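/*
 * Illustrative sketch (editor's addition): the tomoyo_task() accessor used
 * throughout this file is defined in security/tomoyo/common.h; it resolves
 * the per-task blob that the LSM framework reserves according to
 * tomoyo_blob_sizes.lbs_task above, roughly like this:
 */
#if 0	/* example only */
static inline struct tomoyo_task *tomoyo_task(struct task_struct *task)
{
	/* task->security points at the shared LSM blob; TOMOYO's slice
	 * starts at the offset assigned for lbs_task. */
	return task->security + tomoyo_blob_sizes.lbs_task;
}
#endif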
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk> */ #include <linux/mm.h> #include <linux/swap.h> #include <linux/bio-integrity.h> #include <linux/blkdev.h> #include <linux/uio.h> #include <linux/iocontext.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/mempool.h> #include <linux/workqueue.h> #include <linux/cgroup.h> #include <linux/highmem.h> #include <linux/blk-crypto.h> #include <linux/xarray.h> #include <linux/kmemleak.h> #include <trace/events/block.h> #include "blk.h" #include "blk-rq-qos.h" #include "blk-cgroup.h" #define ALLOC_CACHE_THRESHOLD 16 #define ALLOC_CACHE_MAX 256 struct bio_alloc_cache { struct bio *free_list; struct bio *free_list_irq; unsigned int nr; unsigned int nr_irq; }; #define BIO_INLINE_VECS 4 static struct biovec_slab { int nr_vecs; char *name; struct kmem_cache *slab; } bvec_slabs[] __read_mostly = { { .nr_vecs = 16, .name = "biovec-16" }, { .nr_vecs = 64, .name = "biovec-64" }, { .nr_vecs = 128, .name = "biovec-128" }, { .nr_vecs = BIO_MAX_VECS, .name = "biovec-max" }, }; static struct biovec_slab *biovec_slab(unsigned short nr_vecs) { switch (nr_vecs) { /* smaller bios use inline vecs */ case 5 ... 16: return &bvec_slabs[0]; case 17 ... 64: return &bvec_slabs[1]; case 65 ... 128: return &bvec_slabs[2]; case 129 ... BIO_MAX_VECS: return &bvec_slabs[3]; default: BUG(); return NULL; } } /* * fs_bio_set is the bio_set containing bio and iovec memory pools used by * IO code that does not need private memory pools.
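 * Drivers and filesystems that need their own pools (for instance to keep the forward-progress guarantees described at punt_bios_to_rescuer() below) set one up with bioset_init() instead.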
*/ struct bio_set fs_bio_set; EXPORT_SYMBOL(fs_bio_set); /* * Our slab pool management */ struct bio_slab { struct kmem_cache *slab; unsigned int slab_ref; unsigned int slab_size; char name[12]; }; static DEFINE_MUTEX(bio_slab_lock); static DEFINE_XARRAY(bio_slabs); static struct bio_slab *create_bio_slab(unsigned int size) { struct bio_slab *bslab = kzalloc_obj(*bslab); if (!bslab) return NULL; snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size); bslab->slab = kmem_cache_create(bslab->name, size, ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN | SLAB_TYPESAFE_BY_RCU, NULL); if (!bslab->slab) goto fail_alloc_slab; bslab->slab_ref = 1; bslab->slab_size = size; if (!xa_err(xa_store(&bio_slabs, size, bslab, GFP_KERNEL))) return bslab; kmem_cache_destroy(bslab->slab); fail_alloc_slab: kfree(bslab); return NULL; } static inline unsigned int bs_bio_slab_size(struct bio_set *bs) { return bs->front_pad + sizeof(struct bio) + bs->back_pad; } static inline void *bio_slab_addr(struct bio *bio) { return (void *)bio - bio->bi_pool->front_pad; } static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs) { unsigned int size = bs_bio_slab_size(bs); struct bio_slab *bslab; mutex_lock(&bio_slab_lock); bslab = xa_load(&bio_slabs, size); if (bslab) bslab->slab_ref++; else bslab = create_bio_slab(size); mutex_unlock(&bio_slab_lock); if (bslab) return bslab->slab; return NULL; } static void bio_put_slab(struct bio_set *bs) { struct bio_slab *bslab = NULL; unsigned int slab_size = bs_bio_slab_size(bs); mutex_lock(&bio_slab_lock); bslab = xa_load(&bio_slabs, slab_size); if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n")) goto out; WARN_ON_ONCE(bslab->slab != bs->bio_slab); WARN_ON(!bslab->slab_ref); if (--bslab->slab_ref) goto out; xa_erase(&bio_slabs, slab_size); kmem_cache_destroy(bslab->slab); kfree(bslab); out: mutex_unlock(&bio_slab_lock); } /* * Make the first allocation restricted and don't dump info on allocation * failures, since we'll fall back to the mempool in case of failure. */ static inline gfp_t try_alloc_gfp(gfp_t gfp) { return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; } void bio_uninit(struct bio *bio) { #ifdef CONFIG_BLK_CGROUP if (bio->bi_blkg) { blkg_put(bio->bi_blkg); bio->bi_blkg = NULL; } #endif if (bio_integrity(bio)) bio_integrity_free(bio); bio_crypt_free_ctx(bio); } EXPORT_SYMBOL(bio_uninit); static void bio_free(struct bio *bio) { struct bio_set *bs = bio->bi_pool; void *p = bio; WARN_ON_ONCE(!bs); WARN_ON_ONCE(bio->bi_max_vecs > BIO_MAX_VECS); bio_uninit(bio); if (bio->bi_max_vecs == BIO_MAX_VECS) mempool_free(bio->bi_io_vec, &bs->bvec_pool); else if (bio->bi_max_vecs > BIO_INLINE_VECS) kmem_cache_free(biovec_slab(bio->bi_max_vecs)->slab, bio->bi_io_vec); mempool_free(p - bs->front_pad, &bs->bio_pool); } /* * Users of this function have their own bio allocation. Subsequently, * they must remember to pair any call to bio_init() with bio_uninit() * when IO has completed, or when the bio is released. 
*/ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, unsigned short max_vecs, blk_opf_t opf) { bio->bi_next = NULL; bio->bi_bdev = bdev; bio->bi_opf = opf; bio->bi_flags = 0; bio->bi_ioprio = 0; bio->bi_write_hint = 0; bio->bi_write_stream = 0; bio->bi_status = 0; bio->bi_bvec_gap_bit = 0; bio->bi_iter.bi_sector = 0; bio->bi_iter.bi_size = 0; bio->bi_iter.bi_idx = 0; bio->bi_iter.bi_bvec_done = 0; bio->bi_end_io = NULL; bio->bi_private = NULL; #ifdef CONFIG_BLK_CGROUP bio->bi_blkg = NULL; bio->issue_time_ns = 0; if (bdev) bio_associate_blkg(bio); #ifdef CONFIG_BLK_CGROUP_IOCOST bio->bi_iocost_cost = 0; #endif #endif #ifdef CONFIG_BLK_INLINE_ENCRYPTION bio->bi_crypt_context = NULL; #endif #ifdef CONFIG_BLK_DEV_INTEGRITY bio->bi_integrity = NULL; #endif bio->bi_vcnt = 0; atomic_set(&bio->__bi_remaining, 1); atomic_set(&bio->__bi_cnt, 1); bio->bi_cookie = BLK_QC_T_NONE; bio->bi_max_vecs = max_vecs; bio->bi_io_vec = table; bio->bi_pool = NULL; } EXPORT_SYMBOL(bio_init); /** * bio_reset - reinitialize a bio * @bio: bio to reset * @bdev: block device to use the bio for * @opf: operation and flags for bio * * Description: * After calling bio_reset(), @bio will be in the same state as a freshly * allocated bio returned by bio_alloc_bioset() - the only fields that are * preserved are the ones that are initialized by bio_alloc_bioset(). See * comment in struct bio. */ void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf) { struct bio_vec *bv = bio->bi_io_vec; bio_uninit(bio); memset(bio, 0, BIO_RESET_BYTES); atomic_set(&bio->__bi_remaining, 1); bio->bi_io_vec = bv; bio->bi_bdev = bdev; if (bio->bi_bdev) bio_associate_blkg(bio); bio->bi_opf = opf; } EXPORT_SYMBOL(bio_reset); /** * bio_reuse - reuse a bio with the payload left intact * @bio: bio to reuse * @opf: operation and flags for the next I/O * * Allow reusing an existing bio for another operation with all previously set * up fields, including the payload, device and end_io handler, left intact. * * Typically used when @bio is first used to read data which is then written * to another location without modification. @bio must not be in-flight and * must be owned by the caller. Can't be used for cloned bios. * * Note: Can't be used when @bio has integrity or blk-crypto contexts for now. * Feel free to add that support when you need it, though. */ void bio_reuse(struct bio *bio, blk_opf_t opf) { unsigned short vcnt = bio->bi_vcnt, i; bio_end_io_t *end_io = bio->bi_end_io; void *private = bio->bi_private; WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); WARN_ON_ONCE(bio_integrity(bio)); WARN_ON_ONCE(bio_has_crypt_ctx(bio)); bio_reset(bio, bio->bi_bdev, opf); for (i = 0; i < vcnt; i++) bio->bi_iter.bi_size += bio->bi_io_vec[i].bv_len; bio->bi_vcnt = vcnt; bio->bi_private = private; bio->bi_end_io = end_io; } EXPORT_SYMBOL_GPL(bio_reuse); static struct bio *__bio_chain_endio(struct bio *bio) { struct bio *parent = bio->bi_private; if (bio->bi_status && !parent->bi_status) parent->bi_status = bio->bi_status; bio_put(bio); return parent; } /* * This function should only be used as a flag and must never be called. * If execution reaches here, it indicates a serious programming error.
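 * (bio_chain() installs it as ->bi_end_io purely so that bio_endio() can recognise a chained bio and hand it to __bio_chain_endio() directly.)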
*/ static void bio_chain_endio(struct bio *bio) { BUG(); } /** * bio_chain - chain bio completions * @bio: the target bio * @parent: the parent bio of @bio * * The caller won't have a bi_end_io called when @bio completes - instead, * @parent's bi_end_io won't be called until both @parent and @bio have * completed; the chained bio will also be freed when it completes. * * The caller must not set bi_private or bi_end_io in @bio. */ void bio_chain(struct bio *bio, struct bio *parent) { BUG_ON(bio->bi_private || bio->bi_end_io); bio->bi_private = parent; bio->bi_end_io = bio_chain_endio; bio_inc_remaining(parent); } EXPORT_SYMBOL(bio_chain); /** * bio_chain_and_submit - submit a bio after chaining it to another one * @prev: bio to chain and submit * @new: bio to chain to * * If @prev is non-NULL, chain it to @new and submit it. * * Return: @new. */ struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new) { if (prev) { bio_chain(prev, new); submit_bio(prev); } return new; } struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, unsigned int nr_pages, blk_opf_t opf, gfp_t gfp) { return bio_chain_and_submit(bio, bio_alloc(bdev, nr_pages, opf, gfp)); } EXPORT_SYMBOL_GPL(blk_next_bio); static void bio_alloc_rescue(struct work_struct *work) { struct bio_set *bs = container_of(work, struct bio_set, rescue_work); struct bio *bio; while (1) { spin_lock(&bs->rescue_lock); bio = bio_list_pop(&bs->rescue_list); spin_unlock(&bs->rescue_lock); if (!bio) break; submit_bio_noacct(bio); } } /* * submit_bio_noacct() converts recursion to iteration; this means if we're * running beneath it, any bios we allocate and submit will not be submitted * (and thus freed) until after we return. * * This exposes us to a potential deadlock if we allocate multiple bios from the * same bio_set while running underneath submit_bio_noacct(). If we were to * allocate multiple bios (say a stacking block driver that was splitting bios), * we would deadlock if we exhausted the mempool's reserve. * * We solve this, and guarantee forward progress by punting the bios on * current->bio_list to a per bio_set rescuer workqueue before blocking to wait * for elements being returned to the mempool. */ static void punt_bios_to_rescuer(struct bio_set *bs) { struct bio_list punt, nopunt; struct bio *bio; if (!current->bio_list || !bs->rescue_workqueue) return; if (bio_list_empty(&current->bio_list[0]) && bio_list_empty(&current->bio_list[1])) return; /* * In order to guarantee forward progress we must punt only bios that * were allocated from this bio_set; otherwise, if there was a bio on * there for a stacking driver higher up in the stack, processing it * could require allocating bios from this bio_set, and doing that from * our own rescuer would be bad. * * Since bio lists are singly linked, pop them all instead of trying to * remove from the middle of the list: */ bio_list_init(&punt); bio_list_init(&nopunt); while ((bio = bio_list_pop(&current->bio_list[0]))) bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); current->bio_list[0] = nopunt; bio_list_init(&nopunt); while ((bio = bio_list_pop(&current->bio_list[1]))) bio_list_add(bio->bi_pool == bs ?
&punt : &nopunt, bio); current->bio_list[1] = nopunt; spin_lock(&bs->rescue_lock); bio_list_merge(&bs->rescue_list, &punt); spin_unlock(&bs->rescue_lock); queue_work(bs->rescue_workqueue, &bs->rescue_work); } static void bio_alloc_irq_cache_splice(struct bio_alloc_cache *cache) { unsigned long flags; /* cache->free_list must be empty */ if (WARN_ON_ONCE(cache->free_list)) return; local_irq_save(flags); cache->free_list = cache->free_list_irq; cache->free_list_irq = NULL; cache->nr += cache->nr_irq; cache->nr_irq = 0; local_irq_restore(flags); } static struct bio *bio_alloc_percpu_cache(struct bio_set *bs) { struct bio_alloc_cache *cache; struct bio *bio; cache = per_cpu_ptr(bs->cache, get_cpu()); if (!cache->free_list) { if (READ_ONCE(cache->nr_irq) >= ALLOC_CACHE_THRESHOLD) bio_alloc_irq_cache_splice(cache); if (!cache->free_list) { put_cpu(); return NULL; } } bio = cache->free_list; cache->free_list = bio->bi_next; cache->nr--; put_cpu(); bio->bi_pool = bs; kmemleak_alloc(bio_slab_addr(bio), kmem_cache_size(bs->bio_slab), 1, GFP_NOIO); return bio; } /** * bio_alloc_bioset - allocate a bio for I/O * @bdev: block device to allocate the bio for (can be %NULL) * @nr_vecs: number of bvecs to pre-allocate * @opf: operation and flags for bio * @gfp: the GFP_* mask given to the slab allocator * @bs: the bio_set to allocate from. * * Allocate a bio from the mempools in @bs. * * If %__GFP_DIRECT_RECLAIM is set then bio_alloc will always be able to * allocate a bio. This is due to the mempool guarantees. To make this work, * callers must never allocate more than 1 bio at a time from the general pool. * Callers that need to allocate more than 1 bio must always submit the * previously allocated bio for IO before attempting to allocate a new one. * Failure to do so can cause deadlocks under memory pressure. * * Note that when running under submit_bio_noacct() (i.e. any block driver), * bios are not submitted until after you return - see the code in * submit_bio_noacct() that converts recursion into iteration, to prevent * stack overflows. * * This would normally mean allocating multiple bios under submit_bio_noacct() * would be susceptible to deadlocks, but we have * deadlock avoidance code that resubmits any blocked bios from a rescuer * thread. * * However, we do not guarantee forward progress for allocations from other * mempools. Doing multiple allocations from the same mempool under * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad * for per bio allocations. * * Returns: Pointer to new bio on success, NULL on failure. */ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp, struct bio_set *bs) { struct bio_vec *bvecs = NULL; struct bio *bio = NULL; gfp_t saved_gfp = gfp; void *p; /* should not use nobvec bioset for nr_vecs > 0 */ if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_vecs > 0)) return NULL; gfp = try_alloc_gfp(gfp); if (bs->cache && nr_vecs <= BIO_INLINE_VECS) { /* * Set REQ_ALLOC_CACHE even if no cached bio is available to * return the allocated bio to the percpu cache when done. */ opf |= REQ_ALLOC_CACHE; bio = bio_alloc_percpu_cache(bs); } else { opf &= ~REQ_ALLOC_CACHE; p = kmem_cache_alloc(bs->bio_slab, gfp); if (p) bio = p + bs->front_pad; } if (bio && nr_vecs > BIO_INLINE_VECS) { struct biovec_slab *bvs = biovec_slab(nr_vecs); /* * Upgrade nr_vecs to take full advantage of the allocation. * We also rely on this in bio_free(). 
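 * For example, a request for 20 vectors is served from the "biovec-64" slab, so nr_vecs (and hence bio->bi_max_vecs) becomes 64, which bio_free() later uses to find the matching slab again.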
*/ nr_vecs = bvs->nr_vecs; bvecs = kmem_cache_alloc(bvs->slab, gfp); if (unlikely(!bvecs)) { kmem_cache_free(bs->bio_slab, p); bio = NULL; } } if (unlikely(!bio)) { /* * Give up if we are not allowed to sleep, as non-blocking mempool * allocations just fall back to the slab allocation that already failed. */ if (!(saved_gfp & __GFP_DIRECT_RECLAIM)) return NULL; punt_bios_to_rescuer(bs); /* * Don't rob the mempools by returning to the per-CPU cache if * we're tight on memory. */ opf &= ~REQ_ALLOC_CACHE; p = mempool_alloc(&bs->bio_pool, saved_gfp); bio = p + bs->front_pad; if (nr_vecs > BIO_INLINE_VECS) { nr_vecs = BIO_MAX_VECS; bvecs = mempool_alloc(&bs->bvec_pool, saved_gfp); } } if (nr_vecs && nr_vecs <= BIO_INLINE_VECS) bio_init_inline(bio, bdev, nr_vecs, opf); else bio_init(bio, bdev, bvecs, nr_vecs, opf); bio->bi_pool = bs; return bio; } EXPORT_SYMBOL(bio_alloc_bioset); /** * bio_kmalloc - kmalloc a bio * @nr_vecs: number of bio_vecs to allocate * @gfp_mask: the GFP_* mask given to the slab allocator * * Use kmalloc to allocate a bio (including bvecs). The bio must be initialized * using bio_init() before use. To free a bio returned from this function use * kfree() after calling bio_uninit(). A bio returned from this function can * be reused by calling bio_uninit() before calling bio_init() again. * * Note that unlike bio_alloc() or bio_alloc_bioset(), allocations from this * function are not backed by a mempool and can fail. Do not use this function * for allocations in the file system I/O path. * * Returns: Pointer to new bio on success, NULL on failure. */ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) { struct bio *bio; if (nr_vecs > BIO_MAX_INLINE_VECS) return NULL; return kmalloc(sizeof(*bio) + nr_vecs * sizeof(struct bio_vec), gfp_mask); } EXPORT_SYMBOL(bio_kmalloc); void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) { struct bio_vec bv; struct bvec_iter iter; __bio_for_each_segment(bv, bio, iter, start) memzero_bvec(&bv); } EXPORT_SYMBOL(zero_fill_bio_iter); /** * bio_truncate - truncate the bio to the smaller size @new_size * @bio: the bio to be truncated * @new_size: new size for truncating the bio * * Description: * Truncate the bio to the new size @new_size. If bio_op(bio) is * REQ_OP_READ, zero the truncated part. This function should only * be used for handling corner cases, such as bio eod. */ static void bio_truncate(struct bio *bio, unsigned new_size) { struct bio_vec bv; struct bvec_iter iter; unsigned int done = 0; bool truncated = false; if (new_size >= bio->bi_iter.bi_size) return; if (bio_op(bio) != REQ_OP_READ) goto exit; bio_for_each_segment(bv, bio, iter) { if (done + bv.bv_len > new_size) { size_t offset; if (!truncated) offset = new_size - done; else offset = 0; memzero_page(bv.bv_page, bv.bv_offset + offset, bv.bv_len - offset); truncated = true; } done += bv.bv_len; } exit: /* * Don't touch bvec table here and make it really immutable, since * fs bio user has to retrieve all pages via bio_for_each_segment_all * in its .bi_end_io() callback. * * It is enough to truncate bio by updating .bi_size since we can make * correct bvec with the updated .bi_size for drivers. */ bio->bi_iter.bi_size = new_size; } /** * guard_bio_eod - truncate a BIO to fit the block device * @bio: bio to truncate * * This allows us to do IO even on the odd last sectors of a device, even if the * block size is some multiple of the physical sector size. * * We'll just truncate the bio to the size of the device, and clear the end of * the buffer head manually.
Truly out-of-range accesses will turn into actual * I/O errors, this only handles the "we need to be able to do I/O at the final * sector" case. */ void guard_bio_eod(struct bio *bio) { sector_t maxsector = bdev_nr_sectors(bio->bi_bdev); if (!maxsector) return; /* * If the *whole* IO is past the end of the device, * let it through, and the IO layer will turn it into * an EIO. */ if (unlikely(bio->bi_iter.bi_sector >= maxsector)) return; maxsector -= bio->bi_iter.bi_sector; if (likely((bio->bi_iter.bi_size >> 9) <= maxsector)) return; bio_truncate(bio, maxsector << 9); } static int __bio_alloc_cache_prune(struct bio_alloc_cache *cache, unsigned int nr) { unsigned int i = 0; struct bio *bio; while ((bio = cache->free_list) != NULL) { cache->free_list = bio->bi_next; cache->nr--; kmemleak_alloc(bio_slab_addr(bio), kmem_cache_size(bio->bi_pool->bio_slab), 1, GFP_KERNEL); bio_free(bio); if (++i == nr) break; } return i; } static void bio_alloc_cache_prune(struct bio_alloc_cache *cache, unsigned int nr) { nr -= __bio_alloc_cache_prune(cache, nr); if (!READ_ONCE(cache->free_list)) { bio_alloc_irq_cache_splice(cache); __bio_alloc_cache_prune(cache, nr); } } static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node) { struct bio_set *bs; bs = hlist_entry_safe(node, struct bio_set, cpuhp_dead); if (bs->cache) { struct bio_alloc_cache *cache = per_cpu_ptr(bs->cache, cpu); bio_alloc_cache_prune(cache, -1U); } return 0; } static void bio_alloc_cache_destroy(struct bio_set *bs) { int cpu; if (!bs->cache) return; cpuhp_state_remove_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead); for_each_possible_cpu(cpu) { struct bio_alloc_cache *cache; cache = per_cpu_ptr(bs->cache, cpu); bio_alloc_cache_prune(cache, -1U); } free_percpu(bs->cache); bs->cache = NULL; } static inline void bio_put_percpu_cache(struct bio *bio) { struct bio_alloc_cache *cache; cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu()); if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX) goto out_free; if (in_task()) { bio_uninit(bio); bio->bi_next = cache->free_list; /* Not necessary but helps not to iopoll already freed bios */ bio->bi_bdev = NULL; cache->free_list = bio; cache->nr++; kmemleak_free(bio_slab_addr(bio)); } else if (in_hardirq()) { lockdep_assert_irqs_disabled(); bio_uninit(bio); bio->bi_next = cache->free_list_irq; cache->free_list_irq = bio; cache->nr_irq++; kmemleak_free(bio_slab_addr(bio)); } else { goto out_free; } put_cpu(); return; out_free: put_cpu(); bio_free(bio); } /** * bio_put - release a reference to a bio * @bio: bio to release reference to * * Description: * Put a reference to a &struct bio, either one you have gotten with * bio_alloc, bio_get or bio_clone_*. The last put of a bio will free it. 
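 * Bios are not reference counted by default; the __bi_cnt decrement below only applies once bio_get() has marked the bio BIO_REFFED.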
**/ void bio_put(struct bio *bio) { if (unlikely(bio_flagged(bio, BIO_REFFED))) { BUG_ON(!atomic_read(&bio->__bi_cnt)); if (!atomic_dec_and_test(&bio->__bi_cnt)) return; } if (bio->bi_opf & REQ_ALLOC_CACHE) bio_put_percpu_cache(bio); else bio_free(bio); } EXPORT_SYMBOL(bio_put); static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) { bio_set_flag(bio, BIO_CLONED); bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_write_stream = bio_src->bi_write_stream; bio->bi_iter = bio_src->bi_iter; if (bio->bi_bdev) { if (bio->bi_bdev == bio_src->bi_bdev && bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); bio_clone_blkg_association(bio, bio_src); } if (bio_crypt_clone(bio, bio_src, gfp) < 0) return -ENOMEM; if (bio_integrity(bio_src) && bio_integrity_clone(bio, bio_src, gfp) < 0) return -ENOMEM; return 0; } /** * bio_alloc_clone - clone a bio that shares the original bio's biovec * @bdev: block_device to clone onto * @bio_src: bio to clone from * @gfp: allocation priority * @bs: bio_set to allocate from * * Allocate a new bio that is a clone of @bio_src. This reuses the bio_vecs * pointed to by @bio_src->bi_io_vec, and clones the iterator pointing to * the current position in it. The caller owns the returned bio, but not * the bio_vecs, and must ensure the bio is freed before the memory * pointed to by @bio_Src->bi_io_vecs. */ struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src, gfp_t gfp, struct bio_set *bs) { struct bio *bio; bio = bio_alloc_bioset(bdev, 0, bio_src->bi_opf, gfp, bs); if (!bio) return NULL; if (__bio_clone(bio, bio_src, gfp) < 0) { bio_put(bio); return NULL; } bio->bi_io_vec = bio_src->bi_io_vec; return bio; } EXPORT_SYMBOL(bio_alloc_clone); /** * bio_init_clone - clone a bio that shares the original bio's biovec * @bdev: block_device to clone onto * @bio: bio to clone into * @bio_src: bio to clone from * @gfp: allocation priority * * Initialize a new bio in caller provided memory that is a clone of @bio_src. * The same bio_vecs reuse and bio lifetime rules as bio_alloc_clone() apply. */ int bio_init_clone(struct block_device *bdev, struct bio *bio, struct bio *bio_src, gfp_t gfp) { int ret; bio_init(bio, bdev, bio_src->bi_io_vec, 0, bio_src->bi_opf); ret = __bio_clone(bio, bio_src, gfp); if (ret) bio_uninit(bio); return ret; } EXPORT_SYMBOL(bio_init_clone); /** * bio_full - check if the bio is full * @bio: bio to check * @len: length of one segment to be added * * Return true if @bio is full and one segment with @len bytes can't be * added to the bio, otherwise return false */ static inline bool bio_full(struct bio *bio, unsigned len) { if (bio->bi_vcnt >= bio->bi_max_vecs) return true; if (bio->bi_iter.bi_size > BIO_MAX_SIZE - len) return true; return false; } static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, unsigned int len, unsigned int off) { size_t bv_end = bv->bv_offset + bv->bv_len; phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1; phys_addr_t page_addr = page_to_phys(page); if (vec_end_addr + 1 != page_addr + off) return false; if (xen_domain() && !xen_biovec_phys_mergeable(bv, page)) return false; if ((vec_end_addr & PAGE_MASK) != ((page_addr + off) & PAGE_MASK)) { if (IS_ENABLED(CONFIG_KMSAN)) return false; if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE) return false; } bv->bv_len += len; return true; } /* * Try to merge a page into a segment, while obeying the hardware segment * size limit. 
* * This is kept around for the integrity metadata, which is still tries * to build the initial bio to the hardware limit and doesn't have proper * helpers to split. Hopefully this will go away soon. */ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, struct page *page, unsigned len, unsigned offset) { unsigned long mask = queue_segment_boundary(q); phys_addr_t addr1 = bvec_phys(bv); phys_addr_t addr2 = page_to_phys(page) + offset + len - 1; if ((addr1 | mask) != (addr2 | mask)) return false; if (len > queue_max_segment_size(q) - bv->bv_len) return false; return bvec_try_merge_page(bv, page, len, offset); } /** * __bio_add_page - add page(s) to a bio in a new segment * @bio: destination bio * @page: start page to add * @len: length of the data to add, may cross pages * @off: offset of the data relative to @page, may cross pages * * Add the data at @page + @off to @bio as a new bvec. The caller must ensure * that @bio has space for another bvec. */ void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off) { WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); WARN_ON_ONCE(bio_full(bio, len)); if (is_pci_p2pdma_page(page)) bio->bi_opf |= REQ_NOMERGE; bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off); bio->bi_iter.bi_size += len; bio->bi_vcnt++; } EXPORT_SYMBOL_GPL(__bio_add_page); /** * bio_add_virt_nofail - add data in the direct kernel mapping to a bio * @bio: destination bio * @vaddr: data to add * @len: length of the data to add, may cross pages * * Add the data at @vaddr to @bio. The caller must have ensure a segment * is available for the added data. No merging into an existing segment * will be performed. */ void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len) { __bio_add_page(bio, virt_to_page(vaddr), len, offset_in_page(vaddr)); } EXPORT_SYMBOL_GPL(bio_add_virt_nofail); /** * bio_add_page - attempt to add page(s) to bio * @bio: destination bio * @page: start page to add * @len: vec entry length, may cross pages * @offset: vec entry offset relative to @page, may cross pages * * Attempt to add page(s) to the bio_vec maplist. This will only fail * if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio. */ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; if (WARN_ON_ONCE(len == 0)) return 0; if (bio->bi_iter.bi_size > BIO_MAX_SIZE - len) return 0; if (bio->bi_vcnt > 0) { struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; if (!zone_device_pages_have_same_pgmap(bv->bv_page, page)) return 0; if (bvec_try_merge_page(bv, page, len, offset)) { bio->bi_iter.bi_size += len; return len; } } if (bio->bi_vcnt >= bio->bi_max_vecs) return 0; __bio_add_page(bio, page, len, offset); return len; } EXPORT_SYMBOL(bio_add_page); void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, size_t off) { unsigned long nr = off / PAGE_SIZE; WARN_ON_ONCE(len > BIO_MAX_SIZE); __bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE); } EXPORT_SYMBOL_GPL(bio_add_folio_nofail); /** * bio_add_folio - Attempt to add part of a folio to a bio. * @bio: BIO to add to. * @folio: Folio to add. * @len: How many bytes from the folio to add. * @off: First byte in this folio to add. * * Filesystems that use folios can call this function instead of calling * bio_add_page() for each page in the folio. If @off is bigger than * PAGE_SIZE, this function can create a bio_vec that starts in a page * after the bv_page. 
BIOs do not support folios that are 4GiB or larger. * * Return: Whether the addition was successful. */ bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len, size_t off) { unsigned long nr = off / PAGE_SIZE; if (len > BIO_MAX_SIZE) return false; return bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE) > 0; } EXPORT_SYMBOL(bio_add_folio); /** * bio_add_vmalloc_chunk - add a vmalloc chunk to a bio * @bio: destination bio * @vaddr: vmalloc address to add * @len: total length in bytes of the data to add * * Add data starting at @vaddr to @bio and return how many bytes were added. * This may be less than the amount originally asked. Returns 0 if no data * could be added to @bio. * * This helper calls flush_kernel_vmap_range() for the range added. For reads * the caller still needs to manually call invalidate_kernel_vmap_range() in * the completion handler. */ unsigned int bio_add_vmalloc_chunk(struct bio *bio, void *vaddr, unsigned len) { unsigned int offset = offset_in_page(vaddr); len = min(len, PAGE_SIZE - offset); if (bio_add_page(bio, vmalloc_to_page(vaddr), len, offset) < len) return 0; if (op_is_write(bio_op(bio))) flush_kernel_vmap_range(vaddr, len); return len; } EXPORT_SYMBOL_GPL(bio_add_vmalloc_chunk); /** * bio_add_vmalloc - add a vmalloc region to a bio * @bio: destination bio * @vaddr: vmalloc address to add * @len: total length in bytes of the data to add * * Add data starting at @vaddr to @bio. Return %true on success or %false if * @bio does not have enough space for the payload. * * This helper calls flush_kernel_vmap_range() for the range added. For reads * the caller still needs to manually call invalidate_kernel_vmap_range() in * the completion handler. */ bool bio_add_vmalloc(struct bio *bio, void *vaddr, unsigned int len) { do { unsigned int added = bio_add_vmalloc_chunk(bio, vaddr, len); if (!added) return false; vaddr += added; len -= added; } while (len); return true; } EXPORT_SYMBOL_GPL(bio_add_vmalloc); void __bio_release_pages(struct bio *bio, bool mark_dirty) { struct folio_iter fi; bio_for_each_folio_all(fi, bio) { size_t nr_pages; if (mark_dirty) { folio_lock(fi.folio); folio_mark_dirty(fi.folio); folio_unlock(fi.folio); } nr_pages = (fi.offset + fi.length - 1) / PAGE_SIZE - fi.offset / PAGE_SIZE + 1; unpin_user_folio(fi.folio, nr_pages); } } EXPORT_SYMBOL_GPL(__bio_release_pages); void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter) { WARN_ON_ONCE(bio->bi_max_vecs); bio->bi_io_vec = (struct bio_vec *)iter->bvec; bio->bi_iter.bi_idx = 0; bio->bi_iter.bi_bvec_done = iter->iov_offset; bio->bi_iter.bi_size = iov_iter_count(iter); bio_set_flag(bio, BIO_CLONED); } /* * Aligns the bio size to the len_align_mask, releasing excessive bio vecs that * __bio_iov_iter_get_pages may have inserted, and reverts the trimmed length * for the next iteration. 
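 *
 * Worked example (illustrative): with len_align_mask = 511 (a 512-byte
 * logical block size) and bi_size = 1600 bytes, nbytes = 1600 & 511 = 64,
 * so the iter is reverted by 64 bytes, bi_size drops to 1536, and the last
 * bvec is shortened (or dropped and unpinned) to match.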
*/ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter, unsigned len_align_mask) { size_t nbytes = bio->bi_iter.bi_size & len_align_mask; if (!nbytes) return 0; iov_iter_revert(iter, nbytes); bio->bi_iter.bi_size -= nbytes; do { struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; if (nbytes < bv->bv_len) { bv->bv_len -= nbytes; break; } if (bio_flagged(bio, BIO_PAGE_PINNED)) unpin_user_page(bv->bv_page); bio->bi_vcnt--; nbytes -= bv->bv_len; } while (nbytes); if (!bio->bi_vcnt) return -EFAULT; return 0; } /** * bio_iov_iter_get_pages - add user or kernel pages to a bio * @bio: bio to add pages to * @iter: iov iterator describing the region to be added * @len_align_mask: the mask to align the total size to, 0 for any length * * This takes either an iterator pointing to user memory, or one pointing to * kernel pages (BVEC iterator). If we're adding user pages, we pin them and * map them into the kernel. On IO completion, the caller should put those * pages. For bvec based iterators bio_iov_iter_get_pages() uses the provided * bvecs rather than copying them. Hence anyone issuing kiocb based IO needs * to ensure the bvecs and pages stay referenced until the submitted I/O is * completed by a call to ->ki_complete() or returns with an error other than * -EIOCBQUEUED. The caller needs to check if the bio is flagged BIO_NO_PAGE_REF * on IO completion. If it isn't, then pages should be released. * * The function tries, but does not guarantee, to pin as many pages as * fit into the bio, or are requested in @iter, whatever is smaller. If * MM encounters an error pinning the requested pages, it stops. Error * is returned only if 0 pages could be pinned. */ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, unsigned len_align_mask) { iov_iter_extraction_t flags = 0; if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return -EIO; if (iov_iter_is_bvec(iter)) { bio_iov_bvec_set(bio, iter); iov_iter_advance(iter, bio->bi_iter.bi_size); return 0; } if (iov_iter_extract_will_pin(iter)) bio_set_flag(bio, BIO_PAGE_PINNED); if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue)) flags |= ITER_ALLOW_P2PDMA; do { ssize_t ret; ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec, BIO_MAX_SIZE - bio->bi_iter.bi_size, &bio->bi_vcnt, bio->bi_max_vecs, flags); if (ret <= 0) { if (!bio->bi_vcnt) return ret; break; } bio->bi_iter.bi_size += ret; } while (iov_iter_count(iter) && !bio_full(bio, 0)); if (is_pci_p2pdma_page(bio->bi_io_vec->bv_page)) bio->bi_opf |= REQ_NOMERGE; return bio_iov_iter_align_down(bio, iter, len_align_mask); } static struct folio *folio_alloc_greedy(gfp_t gfp, size_t *size) { struct folio *folio; while (*size > PAGE_SIZE) { folio = folio_alloc(gfp | __GFP_NORETRY, get_order(*size)); if (folio) return folio; *size = rounddown_pow_of_two(*size - 1); } return folio_alloc(gfp, get_order(*size)); } static void bio_free_folios(struct bio *bio) { struct bio_vec *bv; int i; bio_for_each_bvec_all(bv, bio, i) { struct folio *folio = page_folio(bv->bv_page); if (!is_zero_folio(folio)) folio_put(folio); } } static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter, size_t maxlen) { size_t total_len = min(maxlen, iov_iter_count(iter)); if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return -EINVAL; if (WARN_ON_ONCE(bio->bi_iter.bi_size)) return -EINVAL; if (WARN_ON_ONCE(bio->bi_vcnt >= bio->bi_max_vecs)) return -EINVAL; do { size_t this_len = min(total_len, SZ_1M); struct folio *folio; if (this_len > PAGE_SIZE * 2) this_len = 
rounddown_pow_of_two(this_len); if (bio->bi_iter.bi_size > BIO_MAX_SIZE - this_len) break; folio = folio_alloc_greedy(GFP_KERNEL, &this_len); if (!folio) break; bio_add_folio_nofail(bio, folio, this_len, 0); if (copy_from_iter(folio_address(folio), this_len, iter) != this_len) { bio_free_folios(bio); return -EFAULT; } total_len -= this_len; } while (total_len && bio->bi_vcnt < bio->bi_max_vecs); if (!bio->bi_iter.bi_size) return -ENOMEM; return 0; } static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter, size_t maxlen) { size_t len = min3(iov_iter_count(iter), maxlen, SZ_1M); struct folio *folio; folio = folio_alloc_greedy(GFP_KERNEL, &len); if (!folio) return -ENOMEM; do { ssize_t ret; ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec + 1, len, &bio->bi_vcnt, bio->bi_max_vecs - 1, 0); if (ret <= 0) { if (!bio->bi_vcnt) { folio_put(folio); return ret; } break; } len -= ret; bio->bi_iter.bi_size += ret; } while (len && bio->bi_vcnt < bio->bi_max_vecs - 1); /* * Set the folio directly here. The above loop has already calculated * the correct bi_size, and we use bi_vcnt for the user buffers. That * is safe as bi_vcnt is only used by the submitter and not the actual * I/O path. */ bvec_set_folio(&bio->bi_io_vec[0], folio, bio->bi_iter.bi_size, 0); if (iov_iter_extract_will_pin(iter)) bio_set_flag(bio, BIO_PAGE_PINNED); return 0; } /** * bio_iov_iter_bounce - bounce buffer data from an iter into a bio * @bio: bio to send * @iter: iter to read from / write into * @maxlen: maximum size to bounce * * Helper for direct I/O implementations that need to bounce buffer because * we need to checksum the data or perform other operations that require * consistency. Allocates folios to back the bounce buffer, and for writes * copies the data into it. Needs to be paired with bio_iov_iter_unbounce() * called on completion. */ int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen) { if (op_is_write(bio_op(bio))) return bio_iov_iter_bounce_write(bio, iter, maxlen); return bio_iov_iter_bounce_read(bio, iter, maxlen); } static void bvec_unpin(struct bio_vec *bv, bool mark_dirty) { struct folio *folio = page_folio(bv->bv_page); size_t nr_pages = (bv->bv_offset + bv->bv_len - 1) / PAGE_SIZE - bv->bv_offset / PAGE_SIZE + 1; if (mark_dirty) folio_mark_dirty_lock(folio); unpin_user_folio(folio, nr_pages); } static void bio_iov_iter_unbounce_read(struct bio *bio, bool is_error, bool mark_dirty) { unsigned int len = bio->bi_io_vec[0].bv_len; if (likely(!is_error)) { void *buf = bvec_virt(&bio->bi_io_vec[0]); struct iov_iter to; iov_iter_bvec(&to, ITER_DEST, bio->bi_io_vec + 1, bio->bi_vcnt, len); /* copying to pinned pages should always work */ WARN_ON_ONCE(copy_to_iter(buf, len, &to) != len); } else { /* No need to mark folios dirty if never copied to them */ mark_dirty = false; } if (bio_flagged(bio, BIO_PAGE_PINNED)) { int i; for (i = 0; i < bio->bi_vcnt; i++) bvec_unpin(&bio->bi_io_vec[1 + i], mark_dirty); } folio_put(page_folio(bio->bi_io_vec[0].bv_page)); } /** * bio_iov_iter_unbounce - finish a bounce buffer operation * @bio: completed bio * @is_error: %true if an I/O error occurred and data should not be copied * @mark_dirty: If %true, folios will be marked dirty. * * Helper for direct I/O implementations that need to bounce buffer because * we need to checksum the data or perform other operations that require * consistency. Called to complete a bio set up by bio_iov_iter_bounce(). 
* Copies data back for reads, and marks the original folios dirty if * requested and then frees the bounce buffer. */ void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty) { if (op_is_write(bio_op(bio))) bio_free_folios(bio); else bio_iov_iter_unbounce_read(bio, is_error, mark_dirty); } static void bio_wait_end_io(struct bio *bio) { complete(bio->bi_private); } /** * bio_await - call a function on a bio, and wait until it completes * @bio: the bio which describes the I/O * @submit: function called to submit the bio * @priv: private data passed to @submit * * Wait for the bio as well as any bio chained off it after executing the * passed in callback @submit. The wait for the bio is set up before calling * @submit to ensure that the completion is captured. If @submit is %NULL, * submit_bio() is used instead to submit the bio. * * Note: this overrides the bi_private and bi_end_io fields in the bio. */ void bio_await(struct bio *bio, void *priv, void (*submit)(struct bio *bio, void *priv)) { DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_bdev->bd_disk->lockdep_map); bio->bi_private = &done; bio->bi_end_io = bio_wait_end_io; bio->bi_opf |= REQ_SYNC; if (submit) submit(bio, priv); else submit_bio(bio); blk_wait_io(&done); } EXPORT_SYMBOL_GPL(bio_await); /** * submit_bio_wait - submit a bio, and wait until it completes * @bio: The &struct bio which describes the I/O * * Simple wrapper around submit_bio(). Returns 0 on success, or the error from * bio_endio() on failure. * * WARNING: Unlike to how submit_bio() is usually used, this function does not * result in bio reference to be consumed. The caller must drop the reference * on his own. */ int submit_bio_wait(struct bio *bio) { bio_await(bio, NULL, NULL); return blk_status_to_errno(bio->bi_status); } EXPORT_SYMBOL(submit_bio_wait); static void bio_endio_cb(struct bio *bio, void *priv) { bio_endio(bio); } /* * Submit @bio synchronously, or call bio_endio on it if the current process * is being killed. */ int bio_submit_or_kill(struct bio *bio, unsigned int flags) { if ((flags & BLKDEV_ZERO_KILLABLE) && fatal_signal_pending(current)) { bio_await(bio, NULL, bio_endio_cb); return -EINTR; } return submit_bio_wait(bio); } /** * bdev_rw_virt - synchronously read into / write from kernel mapping * @bdev: block device to access * @sector: sector to access * @data: data to read/write * @len: length in byte to read/write * @op: operation (e.g. REQ_OP_READ/REQ_OP_WRITE) * * Performs synchronous I/O to @bdev for @data/@len. @data must be in * the kernel direct mapping and not a vmalloc address. 
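 *
 * Usage sketch (illustrative; "sb_buf" is an assumed kmalloc'ed buffer):
 *
 *	err = bdev_rw_virt(bdev, 0, sb_buf, SZ_4K, REQ_OP_READ);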
*/ int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, size_t len, enum req_op op) { struct bio_vec bv; struct bio bio; int error; if (WARN_ON_ONCE(is_vmalloc_addr(data))) return -EIO; bio_init(&bio, bdev, &bv, 1, op); bio.bi_iter.bi_sector = sector; bio_add_virt_nofail(&bio, data, len); error = submit_bio_wait(&bio); bio_uninit(&bio); return error; } EXPORT_SYMBOL_GPL(bdev_rw_virt); void __bio_advance(struct bio *bio, unsigned bytes) { if (bio_integrity(bio)) bio_integrity_advance(bio, bytes); bio_crypt_advance(bio, bytes); bio_advance_iter(bio, &bio->bi_iter, bytes); } EXPORT_SYMBOL(__bio_advance); void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter) { while (src_iter->bi_size && dst_iter->bi_size) { struct bio_vec src_bv = bio_iter_iovec(src, *src_iter); struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter); unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len); void *src_buf = bvec_kmap_local(&src_bv); void *dst_buf = bvec_kmap_local(&dst_bv); memcpy(dst_buf, src_buf, bytes); kunmap_local(dst_buf); kunmap_local(src_buf); bio_advance_iter_single(src, src_iter, bytes); bio_advance_iter_single(dst, dst_iter, bytes); } } EXPORT_SYMBOL(bio_copy_data_iter); /** * bio_copy_data - copy contents of data buffers from one bio to another * @src: source bio * @dst: destination bio * * Stops when it reaches the end of either @src or @dst - that is, copies * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). */ void bio_copy_data(struct bio *dst, struct bio *src) { struct bvec_iter src_iter = src->bi_iter; struct bvec_iter dst_iter = dst->bi_iter; bio_copy_data_iter(dst, &dst_iter, src, &src_iter); } EXPORT_SYMBOL(bio_copy_data); void bio_free_pages(struct bio *bio) { struct bio_vec *bvec; struct bvec_iter_all iter_all; bio_for_each_segment_all(bvec, bio, iter_all) __free_page(bvec->bv_page); } EXPORT_SYMBOL(bio_free_pages); /* * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions * for performing direct-IO in BIOs. * * The problem is that we cannot run folio_mark_dirty() from interrupt context * because the required locks are not interrupt-safe. So what we can do is to * mark the pages dirty _before_ performing IO. And in interrupt context, * check that the pages are still dirty. If so, fine. If not, redirty them * in process context. * * Note that this code is very hard to test under normal circumstances because * direct-io pins the pages with get_user_pages(). This makes * is_page_cache_freeable return false, and the VM will not clean the pages. * But other code (eg, flusher threads) could clean the pages if they are mapped * pagecache. * * Simply disabling the call to bio_set_pages_dirty() is a good way to test the * deferred bio dirtying paths. */ /* * bio_set_pages_dirty() will mark all the bio's pages as dirty. */ void bio_set_pages_dirty(struct bio *bio) { struct folio_iter fi; bio_for_each_folio_all(fi, bio) { folio_lock(fi.folio); folio_mark_dirty(fi.folio); folio_unlock(fi.folio); } } EXPORT_SYMBOL_GPL(bio_set_pages_dirty); /* * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. * If they are, then fine. If, however, some pages are clean then they must * have been written out during the direct-IO read. So we take another ref on * the BIO and re-dirty the pages in process context. * * It is expected that bio_check_pages_dirty() will wholly own the BIO from * here on. It will unpin each page and will run one bio_put() against the * BIO. 
*/ static void bio_dirty_fn(struct work_struct *work); static DECLARE_WORK(bio_dirty_work, bio_dirty_fn); static DEFINE_SPINLOCK(bio_dirty_lock); static struct bio *bio_dirty_list; /* * This runs in process context */ static void bio_dirty_fn(struct work_struct *work) { struct bio *bio, *next; spin_lock_irq(&bio_dirty_lock); next = bio_dirty_list; bio_dirty_list = NULL; spin_unlock_irq(&bio_dirty_lock); while ((bio = next) != NULL) { next = bio->bi_private; bio_release_pages(bio, true); bio_put(bio); } } void bio_check_pages_dirty(struct bio *bio) { struct folio_iter fi; unsigned long flags; bio_for_each_folio_all(fi, bio) { if (!folio_test_dirty(fi.folio)) goto defer; } bio_release_pages(bio, false); bio_put(bio); return; defer: spin_lock_irqsave(&bio_dirty_lock, flags); bio->bi_private = bio_dirty_list; bio_dirty_list = bio; spin_unlock_irqrestore(&bio_dirty_lock, flags); schedule_work(&bio_dirty_work); } EXPORT_SYMBOL_GPL(bio_check_pages_dirty); static inline bool bio_remaining_done(struct bio *bio) { /* * If we're not chaining, then ->__bi_remaining is always 1 and * we always end io on the first invocation. */ if (!bio_flagged(bio, BIO_CHAIN)) return true; BUG_ON(atomic_read(&bio->__bi_remaining) <= 0); if (atomic_dec_and_test(&bio->__bi_remaining)) { bio_clear_flag(bio, BIO_CHAIN); return true; } return false; } /** * bio_endio - end I/O on a bio * @bio: bio * * Description: * bio_endio() will end I/O on the whole bio. bio_endio() is the preferred * way to end I/O on a bio. No one should call bi_end_io() directly on a * bio unless they own it and thus know that it has an end_io function. * * bio_endio() can be called several times on a bio that has been chained * using bio_chain(). The ->bi_end_io() function will only be called the * last time. **/ void bio_endio(struct bio *bio) { again: if (!bio_remaining_done(bio)) return; if (!bio_integrity_endio(bio)) return; blk_zone_bio_endio(bio); rq_qos_done_bio(bio); if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio); bio_clear_flag(bio, BIO_TRACE_COMPLETION); } /* * Need to have a real endio function for chained bios, otherwise * various corner cases will break (like stacking block devices that * save/restore bi_end_io) - however, we want to avoid unbounded * recursion and blowing the stack. Tail call optimization would * handle this, but compiling with frame pointers also disables * gcc's sibling call optimization. */ if (bio->bi_end_io == bio_chain_endio) { bio = __bio_chain_endio(bio); goto again; } #ifdef CONFIG_BLK_CGROUP /* * Release cgroup info. We shouldn't have to do this here, but quite * a few callers of bio_init fail to call bio_uninit, so we cover up * for that here at least for now. */ if (bio->bi_blkg) { blkg_put(bio->bi_blkg); bio->bi_blkg = NULL; } #endif if (bio->bi_end_io) bio->bi_end_io(bio); } EXPORT_SYMBOL(bio_endio); /** * bio_split - split a bio * @bio: bio to split * @sectors: number of sectors to split from the front of @bio * @gfp: gfp mask * @bs: bio set to allocate from * * Allocates and returns a new bio which represents @sectors from the start of * @bio, and updates @bio to represent the remaining sectors. * * Unless this is a discard request the newly allocated bio will point * to @bio's bi_io_vec. It is the caller's responsibility to ensure that * neither @bio nor @bs are freed before the split bio. 
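 *
 * Example (illustrative, with "bs" being the caller's bio_set): to carve the
 * first 32 sectors off a 64-sector bio:
 *
 *	split = bio_split(bio, 32, GFP_NOIO, bs);
 *
 * On success @split describes the first 32 sectors and @bio has been advanced
 * to cover the remaining 32; check the result with IS_ERR() before use.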
*/ struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs) { struct bio *split; if (WARN_ON_ONCE(sectors <= 0)) return ERR_PTR(-EINVAL); if (WARN_ON_ONCE(sectors >= bio_sectors(bio))) return ERR_PTR(-EINVAL); /* Zone append commands cannot be split */ if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND)) return ERR_PTR(-EINVAL); /* atomic writes cannot be split */ if (bio->bi_opf & REQ_ATOMIC) return ERR_PTR(-EINVAL); split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs); if (!split) return ERR_PTR(-ENOMEM); split->bi_iter.bi_size = sectors << 9; if (bio_integrity(split)) bio_integrity_trim(split); bio_advance(bio, split->bi_iter.bi_size); if (bio_flagged(bio, BIO_TRACE_COMPLETION)) bio_set_flag(split, BIO_TRACE_COMPLETION); return split; } EXPORT_SYMBOL(bio_split); /** * bio_trim - trim a bio * @bio: bio to trim * @offset: number of sectors to trim from the front of @bio * @size: size we want to trim @bio to, in sectors * * This function is typically used for bios that are cloned and submitted * to the underlying device in parts. */ void bio_trim(struct bio *bio, sector_t offset, sector_t size) { /* We should never trim an atomic write */ if (WARN_ON_ONCE(bio->bi_opf & REQ_ATOMIC && size)) return; if (WARN_ON_ONCE(offset > BIO_MAX_SECTORS || size > BIO_MAX_SECTORS || offset + size > bio_sectors(bio))) return; size <<= 9; if (offset == 0 && size == bio->bi_iter.bi_size) return; bio_advance(bio, offset << 9); bio->bi_iter.bi_size = size; if (bio_integrity(bio)) bio_integrity_trim(bio); } EXPORT_SYMBOL_GPL(bio_trim); /* * create memory pools for biovec's in a bio_set. * use the global biovec slabs created for general use. */ int biovec_init_pool(mempool_t *pool, int pool_entries) { struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1; return mempool_init_slab_pool(pool, pool_entries, bp->slab); } /* * bioset_exit - exit a bioset initialized with bioset_init() * * May be called on a zeroed but uninitialized bioset (i.e. allocated with * kzalloc()). */ void bioset_exit(struct bio_set *bs) { bio_alloc_cache_destroy(bs); if (bs->rescue_workqueue) destroy_workqueue(bs->rescue_workqueue); bs->rescue_workqueue = NULL; mempool_exit(&bs->bio_pool); mempool_exit(&bs->bvec_pool); if (bs->bio_slab) bio_put_slab(bs); bs->bio_slab = NULL; } EXPORT_SYMBOL(bioset_exit); /** * bioset_init - Initialize a bio_set * @bs: pool to initialize * @pool_size: Number of bio and bio_vecs to cache in the mempool * @front_pad: Number of bytes to allocate in front of the returned bio * @flags: Flags to modify behavior, currently %BIOSET_NEED_BVECS * and %BIOSET_NEED_RESCUER * * Description: * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller * to ask for a number of bytes to be allocated in front of the bio. * Front pad allocation is useful for embedding the bio inside * another structure, to avoid allocating extra data to go with the bio. * Note that the bio must be embedded at the END of that structure always, * or things will break badly. * If %BIOSET_NEED_BVECS is set in @flags, a separate pool will be allocated * for allocating iovecs. This pool is not needed e.g. for bio_init_clone(). * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used * to dispatch queued requests when the mempool runs out of space. 
* */ int bioset_init(struct bio_set *bs, unsigned int pool_size, unsigned int front_pad, int flags) { bs->front_pad = front_pad; if (flags & BIOSET_NEED_BVECS) bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); else bs->back_pad = 0; spin_lock_init(&bs->rescue_lock); bio_list_init(&bs->rescue_list); INIT_WORK(&bs->rescue_work, bio_alloc_rescue); bs->bio_slab = bio_find_or_create_slab(bs); if (!bs->bio_slab) return -ENOMEM; if (mempool_init_slab_pool(&bs->bio_pool, pool_size, bs->bio_slab)) goto bad; if ((flags & BIOSET_NEED_BVECS) && biovec_init_pool(&bs->bvec_pool, pool_size)) goto bad; if (flags & BIOSET_NEED_RESCUER) { bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); if (!bs->rescue_workqueue) goto bad; } if (flags & BIOSET_PERCPU_CACHE) { bs->cache = alloc_percpu(struct bio_alloc_cache); if (!bs->cache) goto bad; cpuhp_state_add_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead); } return 0; bad: bioset_exit(bs); return -ENOMEM; } EXPORT_SYMBOL(bioset_init); static int __init init_bio(void) { int i; BUILD_BUG_ON(BIO_FLAG_LAST > 8 * sizeof_field(struct bio, bi_flags)); for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) { struct biovec_slab *bvs = bvec_slabs + i; bvs->slab = kmem_cache_create(bvs->name, bvs->nr_vecs * sizeof(struct bio_vec), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); } cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL, bio_cpu_dead); if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE)) panic("bio: can't allocate bios\n"); return 0; } subsys_initcall(init_bio); |
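/*
 * Illustrative usage sketch, not part of the kernel source above: how a
 * driver might combine bioset_init(), bio_alloc_bioset(),
 * bio_add_virt_nofail() and submit_bio_wait() from this file for a
 * synchronous read into a buffer in the kernel direct mapping. The function
 * name and the short-lived on-stack bio_set are assumptions for the example;
 * a real driver would keep its bio_set for its whole lifetime, and for this
 * simple case bdev_rw_virt() above is the preferred helper.
 */
#if 0
static int example_sync_read(struct block_device *bdev, sector_t sector,
			     void *buf, unsigned int len)
{
	struct bio_set bs;
	struct bio *bio;
	int ret;

	/* small pool of bios with inline bvecs, no front pad */
	ret = bioset_init(&bs, 4, 0, BIOSET_NEED_BVECS);
	if (ret)
		return ret;

	/* GFP_KERNEL allocations from a bio_set should not fail, but be explicit */
	bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_KERNEL, &bs);
	if (!bio) {
		ret = -ENOMEM;
		goto out_exit;
	}
	bio->bi_iter.bi_sector = sector;
	bio_add_virt_nofail(bio, buf, len);

	ret = submit_bio_wait(bio);
	bio_put(bio);
out_exit:
	bioset_exit(&bs);
	return ret;
}
#endif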
// SPDX-License-Identifier: GPL-2.0 /* * Written for linux by Johan Myreen as a translation from * the assembly version by Linus (with diacriticals added) * * Some additional features added by Christoph Niemann (ChN), March 1993 * * Loadable keymaps by Risto Kankkunen, May 1993 * * Diacriticals redone & other small changes, aeb@cwi.nl, June 1993
* Added decr/incr_console, dynamic keymaps, Unicode support, * dynamic function/string keys, led setting, Sept 1994 * `Sticky' modifier keys, 951006. * * 11-11-96: SAK should now work in the raw mode (Martin Mares) * * Modified to provide 'generic' keyboard support by Hamish Macdonald * Merge with the m68k keyboard driver and split-off of the PC low-level * parts by Geert Uytterhoeven, May 1997 * * 27-05-97: Added support for the Magic SysRq Key (Martin Mares) * 30-07-98: Dead keys redone, aeb@cwi.nl. * 21-08-02: Converted to input API, major cleanup. (Vojtech Pavlik) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/consolemap.h> #include <linux/init.h> #include <linux/input.h> #include <linux/jiffies.h> #include <linux/kbd_diacr.h> #include <linux/kbd_kern.h> #include <linux/leds.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/nospec.h> #include <linux/notifier.h> #include <linux/reboot.h> #include <linux/sched/debug.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/tty_flip.h> #include <linux/tty.h> #include <linux/uaccess.h> #include <linux/vt_kern.h> #include <asm/irq_regs.h> /* * Exported functions/variables */ #define KBD_DEFMODE (BIT(VC_REPEAT) | BIT(VC_META)) #if defined(CONFIG_X86) || defined(CONFIG_PARISC) #include <asm/kbdleds.h> #else static inline int kbd_defleds(void) { return 0; } #endif #define KBD_DEFLOCK 0 /* * Handler Tables. */ #define K_HANDLERS\ k_self, k_fn, k_spec, k_pad,\ k_dead, k_cons, k_cur, k_shift,\ k_meta, k_ascii, k_lock, k_lowercase,\ k_slock, k_dead2, k_brl, k_csi typedef void (k_handler_fn)(struct vc_data *vc, unsigned char value, char up_flag); static k_handler_fn K_HANDLERS; static k_handler_fn *k_handler[16] = { K_HANDLERS }; #define FN_HANDLERS\ fn_null, fn_enter, fn_show_ptregs, fn_show_mem,\ fn_show_state, fn_send_intr, fn_lastcons, fn_caps_toggle,\ fn_num, fn_hold, fn_scroll_forw, fn_scroll_back,\ fn_boot_it, fn_caps_on, fn_compose, fn_SAK,\ fn_dec_console, fn_inc_console, fn_spawn_con, fn_bare_num typedef void (fn_handler_fn)(struct vc_data *vc); static fn_handler_fn FN_HANDLERS; static fn_handler_fn *fn_handler[] = { FN_HANDLERS }; /* * Variables exported for vt_ioctl.c */ struct vt_spawn_console vt_spawn_con = { .lock = __SPIN_LOCK_UNLOCKED(vt_spawn_con.lock), .pid = NULL, .sig = 0, }; /* * Internal Data. */ static struct kbd_struct kbd_table[MAX_NR_CONSOLES]; static struct kbd_struct *kbd = kbd_table; /* maximum values each key_handler can handle */ static const unsigned char max_vals[] = { [ KT_LATIN ] = 255, [ KT_FN ] = ARRAY_SIZE(func_table) - 1, [ KT_SPEC ] = ARRAY_SIZE(fn_handler) - 1, [ KT_PAD ] = NR_PAD - 1, [ KT_DEAD ] = NR_DEAD - 1, [ KT_CONS ] = 255, [ KT_CUR ] = 3, [ KT_SHIFT ] = NR_SHIFT - 1, [ KT_META ] = 255, [ KT_ASCII ] = NR_ASCII - 1, [ KT_LOCK ] = NR_LOCK - 1, [ KT_LETTER ] = 255, [ KT_SLOCK ] = NR_LOCK - 1, [ KT_DEAD2 ] = 255, [ KT_BRL ] = NR_BRL - 1, [ KT_CSI ] = 99, }; static const int NR_TYPES = ARRAY_SIZE(max_vals); static void kbd_bh(struct tasklet_struct *unused); static DECLARE_TASKLET_DISABLED(keyboard_tasklet, kbd_bh); static struct input_handler kbd_handler; static DEFINE_SPINLOCK(kbd_event_lock); static DEFINE_SPINLOCK(led_lock); static DEFINE_SPINLOCK(func_buf_lock); /* guard 'func_buf' and friends */ static DECLARE_BITMAP(key_down, KEY_CNT); /* keyboard key bitmap */ static unsigned char shift_down[NR_SHIFT]; /* shift state counters.. 
*/ static bool dead_key_next; /* Handles a number being assembled on the number pad */ static bool npadch_active; static unsigned int npadch_value; static unsigned int diacr; static bool rep; /* flag telling character repeat */ static int shift_state = 0; static unsigned int ledstate = -1U; /* undefined */ static unsigned char ledioctl; static bool vt_switch; /* * Notifier list for console keyboard events */ static ATOMIC_NOTIFIER_HEAD(keyboard_notifier_list); int register_keyboard_notifier(struct notifier_block *nb) { return atomic_notifier_chain_register(&keyboard_notifier_list, nb); } EXPORT_SYMBOL_GPL(register_keyboard_notifier); int unregister_keyboard_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(&keyboard_notifier_list, nb); } EXPORT_SYMBOL_GPL(unregister_keyboard_notifier); /* * Translation of scancodes to keycodes. We set them on only the first * keyboard in the list that accepts the scancode and keycode. * Explanation for not choosing the first attached keyboard anymore: * USB keyboards for example have two event devices: one for all "normal" * keys and one for extra function keys (like "volume up", "make coffee", * etc.). So this means that scancodes for the extra function keys won't * be valid for the first event device, but will be for the second. */ struct getset_keycode_data { struct input_keymap_entry ke; int error; }; static int getkeycode_helper(struct input_handle *handle, void *data) { struct getset_keycode_data *d = data; d->error = input_get_keycode(handle->dev, &d->ke); return d->error == 0; /* stop as soon as we successfully get one */ } static int getkeycode(unsigned int scancode) { struct getset_keycode_data d = { .ke = { .flags = 0, .len = sizeof(scancode), .keycode = 0, }, .error = -ENODEV, }; memcpy(d.ke.scancode, &scancode, sizeof(scancode)); input_handler_for_each_handle(&kbd_handler, &d, getkeycode_helper); return d.error ?: d.ke.keycode; } static int setkeycode_helper(struct input_handle *handle, void *data) { struct getset_keycode_data *d = data; d->error = input_set_keycode(handle->dev, &d->ke); return d->error == 0; /* stop as soon as we successfully set one */ } static int setkeycode(unsigned int scancode, unsigned int keycode) { struct getset_keycode_data d = { .ke = { .flags = 0, .len = sizeof(scancode), .keycode = keycode, }, .error = -ENODEV, }; memcpy(d.ke.scancode, &scancode, sizeof(scancode)); input_handler_for_each_handle(&kbd_handler, &d, setkeycode_helper); return d.error; } /* * Making beeps and bells. Note that we prefer beeps to bells, but when * shutting the sound off we do both. */ static int kd_sound_helper(struct input_handle *handle, void *data) { unsigned int *hz = data; struct input_dev *dev = handle->dev; if (test_bit(EV_SND, dev->evbit)) { if (test_bit(SND_TONE, dev->sndbit)) { input_inject_event(handle, EV_SND, SND_TONE, *hz); if (*hz) return 0; } if (test_bit(SND_BELL, dev->sndbit)) input_inject_event(handle, EV_SND, SND_BELL, *hz ? 1 : 0); } return 0; } static void kd_nosound(struct timer_list *unused) { static unsigned int zero; input_handler_for_each_handle(&kbd_handler, &zero, kd_sound_helper); } static DEFINE_TIMER(kd_mksound_timer, kd_nosound); void kd_mksound(unsigned int hz, unsigned int ticks) { timer_delete_sync(&kd_mksound_timer); input_handler_for_each_handle(&kbd_handler, &hz, kd_sound_helper); if (hz && ticks) mod_timer(&kd_mksound_timer, jiffies + ticks); } EXPORT_SYMBOL(kd_mksound); /* * Setting the keyboard rate. 
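 *
 * Callers pass a struct kbd_repeat with the desired values (a field of zero
 * is left unchanged) and get the currently effective settings written back,
 * e.g. (illustrative):
 *
 *	struct kbd_repeat rpt = { .delay = 250, .period = 33 };
 *	kbd_rate(&rpt);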
*/ static int kbd_rate_helper(struct input_handle *handle, void *data) { struct input_dev *dev = handle->dev; struct kbd_repeat *rpt = data; if (test_bit(EV_REP, dev->evbit)) { if (rpt[0].delay > 0) input_inject_event(handle, EV_REP, REP_DELAY, rpt[0].delay); if (rpt[0].period > 0) input_inject_event(handle, EV_REP, REP_PERIOD, rpt[0].period); rpt[1].delay = dev->rep[REP_DELAY]; rpt[1].period = dev->rep[REP_PERIOD]; } return 0; } int kbd_rate(struct kbd_repeat *rpt) { struct kbd_repeat data[2] = { *rpt }; input_handler_for_each_handle(&kbd_handler, data, kbd_rate_helper); *rpt = data[1]; /* Copy currently used settings */ return 0; } /* * Helper Functions. */ static void put_queue(struct vc_data *vc, int ch) { tty_insert_flip_char(&vc->port, ch, 0); tty_flip_buffer_push(&vc->port); } static void puts_queue(struct vc_data *vc, const char *cp) { tty_insert_flip_string(&vc->port, cp, strlen(cp)); tty_flip_buffer_push(&vc->port); } static void applkey(struct vc_data *vc, int key, char mode) { static char buf[] = { 0x1b, 'O', 0x00, 0x00 }; buf[1] = (mode ? 'O' : '['); buf[2] = key; puts_queue(vc, buf); } /* * Many other routines do put_queue, but I think either * they produce ASCII, or they produce some user-assigned * string, and in both cases we might assume that it is * in utf-8 already. */ static void to_utf8(struct vc_data *vc, uint c) { if (c < 0x80) /* 0******* */ put_queue(vc, c); else if (c < 0x800) { /* 110***** 10****** */ put_queue(vc, 0xc0 | (c >> 6)); put_queue(vc, 0x80 | (c & 0x3f)); } else if (c < 0x10000) { if (c >= 0xD800 && c < 0xE000) return; if (c == 0xFFFF) return; /* 1110**** 10****** 10****** */ put_queue(vc, 0xe0 | (c >> 12)); put_queue(vc, 0x80 | ((c >> 6) & 0x3f)); put_queue(vc, 0x80 | (c & 0x3f)); } else if (c < 0x110000) { /* 11110*** 10****** 10****** 10****** */ put_queue(vc, 0xf0 | (c >> 18)); put_queue(vc, 0x80 | ((c >> 12) & 0x3f)); put_queue(vc, 0x80 | ((c >> 6) & 0x3f)); put_queue(vc, 0x80 | (c & 0x3f)); } } static void put_queue_utf8(struct vc_data *vc, u32 value) { if (kbd->kbdmode == VC_UNICODE) to_utf8(vc, value); else { int c = conv_uni_to_8bit(value); if (c != -1) put_queue(vc, c); } } /* FIXME: review locking for vt.c callers */ static void set_leds(void) { tasklet_schedule(&keyboard_tasklet); } /* * Called after returning from RAW mode or when changing consoles - recompute * shift_down[] and shift_state from key_down[] maybe called when keymap is * undefined, so that shiftkey release is seen. The caller must hold the * kbd_event_lock. */ static void do_compute_shiftstate(void) { unsigned int k, sym, val; shift_state = 0; memset(shift_down, 0, sizeof(shift_down)); for_each_set_bit(k, key_down, min(NR_KEYS, KEY_CNT)) { sym = U(key_maps[0][k]); if (KTYP(sym) != KT_SHIFT && KTYP(sym) != KT_SLOCK) continue; val = KVAL(sym); if (val == KVAL(K_CAPSSHIFT)) val = KVAL(K_SHIFT); shift_down[val]++; shift_state |= BIT(val); } } /* We still have to export this method to vt.c */ void vt_set_leds_compute_shiftstate(void) { /* * When VT is switched, the keyboard led needs to be set once. * Ensure that after the switch is completed, the state of the * keyboard LED is consistent with the state of the keyboard lock. */ vt_switch = true; set_leds(); guard(spinlock_irqsave)(&kbd_event_lock); do_compute_shiftstate(); } /* * We have a combining character DIACR here, followed by the character CH. * If the combination occurs in the table, return the corresponding value. * Otherwise, if CH is a space or equals DIACR, return DIACR. 
* Otherwise, conclude that DIACR was not combining after all, * queue it and return CH. */ static unsigned int handle_diacr(struct vc_data *vc, unsigned int ch) { unsigned int d = diacr; unsigned int i; diacr = 0; if ((d & ~0xff) == BRL_UC_ROW) { if ((ch & ~0xff) == BRL_UC_ROW) return d | ch; } else { for (i = 0; i < accent_table_size; i++) if (accent_table[i].diacr == d && accent_table[i].base == ch) return accent_table[i].result; } if (ch == ' ' || ch == (BRL_UC_ROW|0) || ch == d) return d; put_queue_utf8(vc, d); return ch; } /* * Special function handlers */ static void fn_enter(struct vc_data *vc) { if (diacr) { put_queue_utf8(vc, diacr); diacr = 0; } put_queue(vc, '\r'); if (vc_kbd_mode(kbd, VC_CRLF)) put_queue(vc, '\n'); } static void fn_caps_toggle(struct vc_data *vc) { if (rep) return; chg_vc_kbd_led(kbd, VC_CAPSLOCK); } static void fn_caps_on(struct vc_data *vc) { if (rep) return; set_vc_kbd_led(kbd, VC_CAPSLOCK); } static void fn_show_ptregs(struct vc_data *vc) { struct pt_regs *regs = get_irq_regs(); if (regs) show_regs(regs); } static void fn_hold(struct vc_data *vc) { struct tty_struct *tty = vc->port.tty; if (rep || !tty) return; /* * Note: SCROLLOCK will be set (cleared) by stop_tty (start_tty); * these routines are also activated by ^S/^Q. * (And SCROLLOCK can also be set by the ioctl KDSKBLED.) */ if (tty->flow.stopped) start_tty(tty); else stop_tty(tty); } static void fn_num(struct vc_data *vc) { if (vc_kbd_mode(kbd, VC_APPLIC)) applkey(vc, 'P', 1); else fn_bare_num(vc); } /* * Bind this to Shift-NumLock if you work in application keypad mode * but want to be able to change the NumLock flag. * Bind this to NumLock if you prefer that the NumLock key always * changes the NumLock flag. */ static void fn_bare_num(struct vc_data *vc) { if (!rep) chg_vc_kbd_led(kbd, VC_NUMLOCK); } static void fn_lastcons(struct vc_data *vc) { /* switch to the last used console, ChN */ set_console(last_console); } static void fn_dec_console(struct vc_data *vc) { int i, cur = fg_console; /* Currently switching? Queue this next switch relative to that. */ if (want_console != -1) cur = want_console; for (i = cur - 1; i != cur; i--) { if (i == -1) i = MAX_NR_CONSOLES - 1; if (vc_cons_allocated(i)) break; } set_console(i); } static void fn_inc_console(struct vc_data *vc) { int i, cur = fg_console; /* Currently switching? Queue this next switch relative to that. 
*/ if (want_console != -1) cur = want_console; for (i = cur+1; i != cur; i++) { if (i == MAX_NR_CONSOLES) i = 0; if (vc_cons_allocated(i)) break; } set_console(i); } static void fn_send_intr(struct vc_data *vc) { tty_insert_flip_char(&vc->port, 0, TTY_BREAK); tty_flip_buffer_push(&vc->port); } static void fn_scroll_forw(struct vc_data *vc) { scrollfront(vc, 0); } static void fn_scroll_back(struct vc_data *vc) { scrollback(vc); } static void fn_show_mem(struct vc_data *vc) { show_mem(); } static void fn_show_state(struct vc_data *vc) { show_state(); } static void fn_boot_it(struct vc_data *vc) { ctrl_alt_del(); } static void fn_compose(struct vc_data *vc) { dead_key_next = true; } static void fn_spawn_con(struct vc_data *vc) { guard(spinlock)(&vt_spawn_con.lock); if (vt_spawn_con.pid) if (kill_pid(vt_spawn_con.pid, vt_spawn_con.sig, 1)) { put_pid(vt_spawn_con.pid); vt_spawn_con.pid = NULL; } } static void fn_SAK(struct vc_data *vc) { struct work_struct *SAK_work = &vc_cons[fg_console].SAK_work; schedule_work(SAK_work); } static void fn_null(struct vc_data *vc) { do_compute_shiftstate(); } /* * Special key handlers */ static void k_spec(struct vc_data *vc, unsigned char value, char up_flag) { if (up_flag) return; if (value >= ARRAY_SIZE(fn_handler)) return; if ((kbd->kbdmode == VC_RAW || kbd->kbdmode == VC_MEDIUMRAW || kbd->kbdmode == VC_OFF) && value != KVAL(K_SAK)) return; /* SAK is allowed even in raw mode */ fn_handler[value](vc); } static void k_lowercase(struct vc_data *vc, unsigned char value, char up_flag) { pr_err("k_lowercase was called - impossible\n"); } static void k_unicode(struct vc_data *vc, unsigned int value, char up_flag) { if (up_flag) return; /* no action, if this is a key release */ if (diacr) value = handle_diacr(vc, value); if (dead_key_next) { dead_key_next = false; diacr = value; return; } put_queue_utf8(vc, value); } /* * Handle dead key. Note that we now may have several * dead keys modifying the same character. Very useful * for Vietnamese. */ static void k_deadunicode(struct vc_data *vc, unsigned int value, char up_flag) { if (up_flag) return; diacr = (diacr ? 
handle_diacr(vc, value) : value); } static void k_self(struct vc_data *vc, unsigned char value, char up_flag) { k_unicode(vc, conv_8bit_to_uni(value), up_flag); } static void k_dead2(struct vc_data *vc, unsigned char value, char up_flag) { k_deadunicode(vc, value, up_flag); } /* * Obsolete - for backwards compatibility only */ static void k_dead(struct vc_data *vc, unsigned char value, char up_flag) { static const unsigned char ret_diacr[NR_DEAD] = { '`', /* dead_grave */ '\'', /* dead_acute */ '^', /* dead_circumflex */ '~', /* dead_tilda */ '"', /* dead_diaeresis */ ',', /* dead_cedilla */ '_', /* dead_macron */ 'U', /* dead_breve */ '.', /* dead_abovedot */ '*', /* dead_abovering */ '=', /* dead_doubleacute */ 'c', /* dead_caron */ 'k', /* dead_ogonek */ 'i', /* dead_iota */ '#', /* dead_voiced_sound */ 'o', /* dead_semivoiced_sound */ '!', /* dead_belowdot */ '?', /* dead_hook */ '+', /* dead_horn */ '-', /* dead_stroke */ ')', /* dead_abovecomma */ '(', /* dead_abovereversedcomma */ ':', /* dead_doublegrave */ 'n', /* dead_invertedbreve */ ';', /* dead_belowcomma */ '$', /* dead_currency */ '@', /* dead_greek */ }; k_deadunicode(vc, ret_diacr[value], up_flag); } static void k_cons(struct vc_data *vc, unsigned char value, char up_flag) { if (up_flag) return; set_console(value); } static void k_fn(struct vc_data *vc, unsigned char value, char up_flag) { if (up_flag) return; if ((unsigned)value < ARRAY_SIZE(func_table)) { guard(spinlock_irqsave)(&func_buf_lock); if (func_table[value]) puts_queue(vc, func_table[value]); } else pr_err("k_fn called with value=%d\n", value); } /* * Compute xterm-style modifier parameter for CSI sequences. * Returns 1 + (shift ? 1 : 0) + (alt ? 2 : 0) + (ctrl ? 4 : 0) */ static int csi_modifier_param(void) { int mod = 1; if (shift_state & (BIT(KG_SHIFT) | BIT(KG_SHIFTL) | BIT(KG_SHIFTR))) mod += 1; if (shift_state & (BIT(KG_ALT) | BIT(KG_ALTGR))) mod += 2; if (shift_state & (BIT(KG_CTRL) | BIT(KG_CTRLL) | BIT(KG_CTRLR))) mod += 4; return mod; } static void k_cur(struct vc_data *vc, unsigned char value, char up_flag) { static const char cur_chars[] = "BDCA"; int mod; if (up_flag) return; mod = csi_modifier_param(); if (mod > 1) { char buf[] = { 0x1b, '[', '1', ';', '0' + mod, cur_chars[value], 0x00 }; puts_queue(vc, buf); } else { applkey(vc, cur_chars[value], vc_kbd_mode(kbd, VC_CKMODE)); } } static void k_pad(struct vc_data *vc, unsigned char value, char up_flag) { static const char pad_chars[] = "0123456789+-*/\015,.?()#"; static const char app_map[] = "pqrstuvwxylSRQMnnmPQS"; if (up_flag) return; /* no action, if this is a key release */ /* kludge... 
shift forces cursor/number keys */ if (vc_kbd_mode(kbd, VC_APPLIC) && !shift_down[KG_SHIFT]) { applkey(vc, app_map[value], 1); return; } if (!vc_kbd_led(kbd, VC_NUMLOCK)) { switch (value) { case KVAL(K_PCOMMA): case KVAL(K_PDOT): k_fn(vc, KVAL(K_REMOVE), 0); return; case KVAL(K_P0): k_fn(vc, KVAL(K_INSERT), 0); return; case KVAL(K_P1): k_fn(vc, KVAL(K_SELECT), 0); return; case KVAL(K_P2): k_cur(vc, KVAL(K_DOWN), 0); return; case KVAL(K_P3): k_fn(vc, KVAL(K_PGDN), 0); return; case KVAL(K_P4): k_cur(vc, KVAL(K_LEFT), 0); return; case KVAL(K_P6): k_cur(vc, KVAL(K_RIGHT), 0); return; case KVAL(K_P7): k_fn(vc, KVAL(K_FIND), 0); return; case KVAL(K_P8): k_cur(vc, KVAL(K_UP), 0); return; case KVAL(K_P9): k_fn(vc, KVAL(K_PGUP), 0); return; case KVAL(K_P5): applkey(vc, 'G', vc_kbd_mode(kbd, VC_APPLIC)); return; } } put_queue(vc, pad_chars[value]); if (value == KVAL(K_PENTER) && vc_kbd_mode(kbd, VC_CRLF)) put_queue(vc, '\n'); } static void k_shift(struct vc_data *vc, unsigned char value, char up_flag) { int old_state = shift_state; if (rep) return; /* * Mimic typewriter: * a CapsShift key acts like Shift but undoes CapsLock */ if (value == KVAL(K_CAPSSHIFT)) { value = KVAL(K_SHIFT); if (!up_flag) clr_vc_kbd_led(kbd, VC_CAPSLOCK); } if (up_flag) { /* * handle the case that two shift or control * keys are depressed simultaneously */ if (shift_down[value]) shift_down[value]--; } else shift_down[value]++; if (shift_down[value]) shift_state |= BIT(value); else shift_state &= ~BIT(value); /* kludge */ if (up_flag && shift_state != old_state && npadch_active) { if (kbd->kbdmode == VC_UNICODE) to_utf8(vc, npadch_value); else put_queue(vc, npadch_value & 0xff); npadch_active = false; } } static void k_meta(struct vc_data *vc, unsigned char value, char up_flag) { if (up_flag) return; if (vc_kbd_mode(kbd, VC_META)) { put_queue(vc, '\033'); put_queue(vc, value); } else put_queue(vc, value | BIT(7)); } static void k_ascii(struct vc_data *vc, unsigned char value, char up_flag) { unsigned int base; if (up_flag) return; if (value < 10) { /* decimal input of code, while Alt depressed */ base = 10; } else { /* hexadecimal input of code, while AltGr depressed */ value -= 10; base = 16; } if (!npadch_active) { npadch_value = 0; npadch_active = true; } npadch_value = npadch_value * base + value; } static void k_lock(struct vc_data *vc, unsigned char value, char up_flag) { if (up_flag || rep) return; chg_vc_kbd_lock(kbd, value); } static void k_slock(struct vc_data *vc, unsigned char value, char up_flag) { k_shift(vc, value, up_flag); if (up_flag || rep) return; chg_vc_kbd_slock(kbd, value); /* try to make Alt, oops, AltGr and such work */ if (!key_maps[kbd->lockstate ^ kbd->slockstate]) { kbd->slockstate = 0; chg_vc_kbd_slock(kbd, value); } } /* by default, 300ms interval for combination release */ static unsigned brl_timeout = 300; MODULE_PARM_DESC(brl_timeout, "Braille keys release delay in ms (0 for commit on first key release)"); module_param(brl_timeout, uint, 0644); static unsigned brl_nbchords = 1; MODULE_PARM_DESC(brl_nbchords, "Number of chords that produce a braille pattern (0 for dead chords)"); module_param(brl_nbchords, uint, 0644); static void k_brlcommit(struct vc_data *vc, unsigned int pattern, char up_flag) { static unsigned long chords; static unsigned committed; if (!brl_nbchords) k_deadunicode(vc, BRL_UC_ROW | pattern, up_flag); else { committed |= pattern; chords++; if (chords == brl_nbchords) { k_unicode(vc, BRL_UC_ROW | committed, up_flag); chords = 0; committed = 0; } } } static void k_brl(struct 
vc_data *vc, unsigned char value, char up_flag) { static unsigned pressed, committing; static unsigned long releasestart; if (kbd->kbdmode != VC_UNICODE) { if (!up_flag) pr_warn("keyboard mode must be unicode for braille patterns\n"); return; } if (!value) { k_unicode(vc, BRL_UC_ROW, up_flag); return; } if (value > 8) return; if (!up_flag) { pressed |= BIT(value - 1); if (!brl_timeout) committing = pressed; } else if (brl_timeout) { if (!committing || time_after(jiffies, releasestart + msecs_to_jiffies(brl_timeout))) { committing = pressed; releasestart = jiffies; } pressed &= ~BIT(value - 1); if (!pressed && committing) { k_brlcommit(vc, committing, 0); committing = 0; } } else { if (committing) { k_brlcommit(vc, committing, 0); committing = 0; } pressed &= ~BIT(value - 1); } } /* * Handle KT_CSI keysym type: generate CSI tilde sequences with modifier * support. The value encodes the CSI parameter number, producing sequences * like ESC [ <value> ~ or ESC [ <value> ; <mod> ~ when modifiers are held. */ static void k_csi(struct vc_data *vc, unsigned char value, char up_flag) { char buf[10]; int i = 0; int mod; if (up_flag) return; mod = csi_modifier_param(); buf[i++] = 0x1b; buf[i++] = '['; if (value >= 10) buf[i++] = '0' + value / 10; buf[i++] = '0' + value % 10; if (mod > 1) { buf[i++] = ';'; buf[i++] = '0' + mod; } buf[i++] = '~'; buf[i] = 0x00; puts_queue(vc, buf); } #if IS_ENABLED(CONFIG_INPUT_LEDS) && IS_ENABLED(CONFIG_LEDS_TRIGGERS) struct kbd_led_trigger { struct led_trigger trigger; unsigned int mask; }; static int kbd_led_trigger_activate(struct led_classdev *cdev) { struct kbd_led_trigger *trigger = container_of(cdev->trigger, struct kbd_led_trigger, trigger); tasklet_disable(&keyboard_tasklet); if (ledstate != -1U) led_set_brightness(cdev, ledstate & trigger->mask ? LED_FULL : LED_OFF); tasklet_enable(&keyboard_tasklet); return 0; } #define KBD_LED_TRIGGER(_led_bit, _name) { \ .trigger = { \ .name = _name, \ .activate = kbd_led_trigger_activate, \ }, \ .mask = BIT(_led_bit), \ } #define KBD_LOCKSTATE_TRIGGER(_led_bit, _name) \ KBD_LED_TRIGGER((_led_bit) + 8, _name) static struct kbd_led_trigger kbd_led_triggers[] = { KBD_LED_TRIGGER(VC_SCROLLOCK, "kbd-scrolllock"), KBD_LED_TRIGGER(VC_NUMLOCK, "kbd-numlock"), KBD_LED_TRIGGER(VC_CAPSLOCK, "kbd-capslock"), KBD_LED_TRIGGER(VC_KANALOCK, "kbd-kanalock"), KBD_LOCKSTATE_TRIGGER(VC_SHIFTLOCK, "kbd-shiftlock"), KBD_LOCKSTATE_TRIGGER(VC_ALTGRLOCK, "kbd-altgrlock"), KBD_LOCKSTATE_TRIGGER(VC_CTRLLOCK, "kbd-ctrllock"), KBD_LOCKSTATE_TRIGGER(VC_ALTLOCK, "kbd-altlock"), KBD_LOCKSTATE_TRIGGER(VC_SHIFTLLOCK, "kbd-shiftllock"), KBD_LOCKSTATE_TRIGGER(VC_SHIFTRLOCK, "kbd-shiftrlock"), KBD_LOCKSTATE_TRIGGER(VC_CTRLLLOCK, "kbd-ctrlllock"), KBD_LOCKSTATE_TRIGGER(VC_CTRLRLOCK, "kbd-ctrlrlock"), }; static void kbd_propagate_led_state(unsigned int old_state, unsigned int new_state) { struct kbd_led_trigger *trigger; unsigned int changed = old_state ^ new_state; int i; for (i = 0; i < ARRAY_SIZE(kbd_led_triggers); i++) { trigger = &kbd_led_triggers[i]; if (changed & trigger->mask) led_trigger_event(&trigger->trigger, new_state & trigger->mask ? 
LED_FULL : LED_OFF); } } static int kbd_update_leds_helper(struct input_handle *handle, void *data) { unsigned int led_state = *(unsigned int *)data; if (test_bit(EV_LED, handle->dev->evbit)) kbd_propagate_led_state(~led_state, led_state); return 0; } static void kbd_init_leds(void) { int error; int i; for (i = 0; i < ARRAY_SIZE(kbd_led_triggers); i++) { error = led_trigger_register(&kbd_led_triggers[i].trigger); if (error) pr_err("error %d while registering trigger %s\n", error, kbd_led_triggers[i].trigger.name); } } #else static int kbd_update_leds_helper(struct input_handle *handle, void *data) { unsigned int leds = *(unsigned int *)data; if (test_bit(EV_LED, handle->dev->evbit)) { input_inject_event(handle, EV_LED, LED_SCROLLL, !!(leds & BIT(0))); input_inject_event(handle, EV_LED, LED_NUML, !!(leds & BIT(1))); input_inject_event(handle, EV_LED, LED_CAPSL, !!(leds & BIT(2))); input_inject_event(handle, EV_SYN, SYN_REPORT, 0); } return 0; } static void kbd_propagate_led_state(unsigned int old_state, unsigned int new_state) { input_handler_for_each_handle(&kbd_handler, &new_state, kbd_update_leds_helper); } static void kbd_init_leds(void) { } #endif /* * The leds display either (i) the status of NumLock, CapsLock, ScrollLock, * or (ii) whatever pattern of lights people want to show using KDSETLED, * or (iii) specified bits of specified words in kernel memory. */ static unsigned char getledstate(void) { return ledstate & 0xff; } void setledstate(struct kbd_struct *kb, unsigned int led) { guard(spinlock_irqsave)(&led_lock); if (!(led & ~7)) { ledioctl = led; kb->ledmode = LED_SHOW_IOCTL; } else kb->ledmode = LED_SHOW_FLAGS; set_leds(); } static inline unsigned char getleds(void) { struct kbd_struct *kb = kbd_table + fg_console; if (kb->ledmode == LED_SHOW_IOCTL) return ledioctl; return kb->ledflagstate; } /** * vt_get_leds - helper for braille console * @console: console to read * @flag: flag we want to check * * Check the status of a keyboard led flag and report it back */ int vt_get_leds(unsigned int console, int flag) { struct kbd_struct *kb = &kbd_table[console]; guard(spinlock_irqsave)(&led_lock); return vc_kbd_led(kb, flag); } EXPORT_SYMBOL_GPL(vt_get_leds); /** * vt_set_led_state - set LED state of a console * @console: console to set * @leds: LED bits * * Set the LEDs on a console. This is a wrapper for the VT layer * so that we can keep kbd knowledge internal */ void vt_set_led_state(unsigned int console, int leds) { struct kbd_struct *kb = &kbd_table[console]; setledstate(kb, leds); } /** * vt_kbd_con_start - Keyboard side of console start * @console: console * * Handle console start. This is a wrapper for the VT layer * so that we can keep kbd knowledge internal * * FIXME: We eventually need to hold the kbd lock here to protect * the LED updating. We can't do it yet because fn_hold calls stop_tty * and start_tty under the kbd_event_lock, while normal tty paths * don't hold the lock. We probably need to split out an LED lock * but not during an -rc release! */ void vt_kbd_con_start(unsigned int console) { struct kbd_struct *kb = &kbd_table[console]; guard(spinlock_irqsave)(&led_lock); clr_vc_kbd_led(kb, VC_SCROLLOCK); set_leds(); } /** * vt_kbd_con_stop - Keyboard side of console stop * @console: console * * Handle console stop. 
This is a wrapper for the VT layer * so that we can keep kbd knowledge internal */ void vt_kbd_con_stop(unsigned int console) { struct kbd_struct *kb = &kbd_table[console]; guard(spinlock_irqsave)(&led_lock); set_vc_kbd_led(kb, VC_SCROLLOCK); set_leds(); } /* * This is the tasklet that updates LED state of LEDs using standard * keyboard triggers. The reason we use tasklet is that we need to * handle the scenario when keyboard handler is not registered yet * but we already getting updates from the VT to update led state. */ static void kbd_bh(struct tasklet_struct *unused) { unsigned int leds; scoped_guard(spinlock_irqsave, &led_lock) { leds = getleds(); leds |= (unsigned int)kbd->lockstate << 8; } if (vt_switch) { ledstate = ~leds; vt_switch = false; } if (leds != ledstate) { kbd_propagate_led_state(ledstate, leds); ledstate = leds; } } #if defined(CONFIG_X86) || defined(CONFIG_ALPHA) ||\ defined(CONFIG_MIPS) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) ||\ defined(CONFIG_PARISC) || defined(CONFIG_SUPERH) ||\ (defined(CONFIG_ARM) && defined(CONFIG_KEYBOARD_ATKBD) && !defined(CONFIG_ARCH_RPC)) static inline bool kbd_is_hw_raw(const struct input_dev *dev) { if (!test_bit(EV_MSC, dev->evbit) || !test_bit(MSC_RAW, dev->mscbit)) return false; return dev->id.bustype == BUS_I8042 && dev->id.vendor == 0x0001 && dev->id.product == 0x0001; } static const unsigned short x86_keycodes[256] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,118, 86, 87, 88,115,120,119,121,112,123, 92, 284,285,309, 0,312, 91,327,328,329,331,333,335,336,337,338,339, 367,288,302,304,350, 89,334,326,267,126,268,269,125,347,348,349, 360,261,262,263,268,376,100,101,321,316,373,286,289,102,351,355, 103,104,105,275,287,279,258,106,274,107,294,364,358,363,362,361, 291,108,381,281,290,272,292,305,280, 99,112,257,306,359,113,114, 264,117,271,374,379,265,266, 93, 94, 95, 85,259,375,260, 90,116, 377,109,111,277,278,282,283,295,296,297,299,300,301,293,303,307, 308,310,313,314,315,317,318,319,320,357,322,323,324,325,276,330, 332,340,365,342,343,344,345,346,356,270,341,368,369,370,371,372 }; #ifdef CONFIG_SPARC static int sparc_l1_a_state; extern void sun_do_break(void); #endif static int emulate_raw(struct vc_data *vc, unsigned int keycode, unsigned char up_flag) { int code; switch (keycode) { case KEY_PAUSE: put_queue(vc, 0xe1); put_queue(vc, 0x1d | up_flag); put_queue(vc, 0x45 | up_flag); break; case KEY_HANGEUL: if (!up_flag) put_queue(vc, 0xf2); break; case KEY_HANJA: if (!up_flag) put_queue(vc, 0xf1); break; case KEY_SYSRQ: /* * Real AT keyboards (that's what we're trying * to emulate here) emit 0xe0 0x2a 0xe0 0x37 when * pressing PrtSc/SysRq alone, but simply 0x54 * when pressing Alt+PrtSc/SysRq. 
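* In the code below this means queueing e0 2a e0 37 on press and e0 aa e0 b7 on release (the 0x80 up bit is ORed into each make code), or 0x54/0xd4 when an Alt key is held.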
*/ if (test_bit(KEY_LEFTALT, key_down) || test_bit(KEY_RIGHTALT, key_down)) { put_queue(vc, 0x54 | up_flag); } else { put_queue(vc, 0xe0); put_queue(vc, 0x2a | up_flag); put_queue(vc, 0xe0); put_queue(vc, 0x37 | up_flag); } break; default: if (keycode > 255) return -1; code = x86_keycodes[keycode]; if (!code) return -1; if (code & 0x100) put_queue(vc, 0xe0); put_queue(vc, (code & 0x7f) | up_flag); break; } return 0; } #else static inline bool kbd_is_hw_raw(const struct input_dev *dev) { return false; } static int emulate_raw(struct vc_data *vc, unsigned int keycode, unsigned char up_flag) { if (keycode > 127) return -1; put_queue(vc, keycode | up_flag); return 0; } #endif static void kbd_rawcode(unsigned char data) { struct vc_data *vc = vc_cons[fg_console].d; kbd = &kbd_table[vc->vc_num]; if (kbd->kbdmode == VC_RAW) put_queue(vc, data); } static void kbd_keycode(unsigned int keycode, int down, bool hw_raw) { struct vc_data *vc = vc_cons[fg_console].d; unsigned short keysym, *key_map; unsigned char type; bool raw_mode; struct tty_struct *tty; int shift_final; struct keyboard_notifier_param param = { .vc = vc, .value = keycode, .down = down }; int rc; tty = vc->port.tty; if (tty && (!tty->driver_data)) { /* No driver data? Strange. Okay we fix it then. */ tty->driver_data = vc; } kbd = &kbd_table[vc->vc_num]; #ifdef CONFIG_SPARC if (keycode == KEY_STOP) sparc_l1_a_state = down; #endif rep = (down == 2); raw_mode = (kbd->kbdmode == VC_RAW); if (raw_mode && !hw_raw) if (emulate_raw(vc, keycode, !down << 7)) if (keycode < BTN_MISC && printk_ratelimit()) pr_warn("can't emulate rawmode for keycode %d\n", keycode); #ifdef CONFIG_SPARC if (keycode == KEY_A && sparc_l1_a_state) { sparc_l1_a_state = false; sun_do_break(); } #endif if (kbd->kbdmode == VC_MEDIUMRAW) { /* * This is extended medium raw mode, with keys above 127 * encoded as 0, high 7 bits, low 7 bits, with the 0 bearing * the 'up' flag if needed. 0 is reserved, so this shouldn't * interfere with anything else. The two bytes after 0 will * always have the up flag set not to interfere with older * applications. This allows for 16384 different keycodes, * which should be enough. */ if (keycode < 128) { put_queue(vc, keycode | (!down << 7)); } else { put_queue(vc, !down << 7); put_queue(vc, (keycode >> 7) | BIT(7)); put_queue(vc, keycode | BIT(7)); } raw_mode = true; } assign_bit(keycode, key_down, down); if (rep && (!vc_kbd_mode(kbd, VC_REPEAT) || (tty && !L_ECHO(tty) && tty_chars_in_buffer(tty)))) { /* * Don't repeat a key if the input buffers are not empty and the * characters get aren't echoed locally. This makes key repeat * usable with slow applications and under heavy loads. */ return; } param.shift = shift_final = (shift_state | kbd->slockstate) ^ kbd->lockstate; param.ledstate = kbd->ledflagstate; key_map = key_maps[shift_final]; /* * Fall back to the plain map if modifiers are active, the modifier- * specific map is missing or has no entry, and the plain map has a * modifier-aware key type (KT_CUR or KT_CSI). These handlers encode * the modifier state into the emitted escape sequence. 
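* For example, if no altgr keymap is loaded, AltGr+Down still resolves to the plain map's KT_CUR entry and k_cur() emits ESC [ 1 ; 3 B instead of the key being dropped.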
*/ if (shift_final && keycode < NR_KEYS && (!key_map || key_map[keycode] == K_HOLE) && key_maps[0]) { unsigned short plain = key_maps[0][keycode]; unsigned char type = KTYP(plain); if (type >= 0xf0 && (type - 0xf0 == KT_CUR || type - 0xf0 == KT_CSI)) key_map = key_maps[0]; } rc = atomic_notifier_call_chain(&keyboard_notifier_list, KBD_KEYCODE, ¶m); if (rc == NOTIFY_STOP || !key_map) { atomic_notifier_call_chain(&keyboard_notifier_list, KBD_UNBOUND_KEYCODE, ¶m); do_compute_shiftstate(); kbd->slockstate = 0; return; } if (keycode < NR_KEYS) keysym = key_map[keycode]; else if (keycode >= KEY_BRL_DOT1 && keycode <= KEY_BRL_DOT8) keysym = U(K(KT_BRL, keycode - KEY_BRL_DOT1 + 1)); else return; type = KTYP(keysym); if (type < 0xf0) { param.value = keysym; rc = atomic_notifier_call_chain(&keyboard_notifier_list, KBD_UNICODE, ¶m); if (rc != NOTIFY_STOP) if (down && !(raw_mode || kbd->kbdmode == VC_OFF)) k_unicode(vc, keysym, !down); return; } type -= 0xf0; if (type == KT_LETTER) { type = KT_LATIN; if (vc_kbd_led(kbd, VC_CAPSLOCK)) { key_map = key_maps[shift_final ^ BIT(KG_SHIFT)]; if (key_map) keysym = key_map[keycode]; } } param.value = keysym; rc = atomic_notifier_call_chain(&keyboard_notifier_list, KBD_KEYSYM, ¶m); if (rc == NOTIFY_STOP) return; if ((raw_mode || kbd->kbdmode == VC_OFF) && type != KT_SPEC && type != KT_SHIFT) return; (*k_handler[type])(vc, KVAL(keysym), !down); param.ledstate = kbd->ledflagstate; atomic_notifier_call_chain(&keyboard_notifier_list, KBD_POST_KEYSYM, ¶m); if (type != KT_SLOCK) kbd->slockstate = 0; } static void kbd_event(struct input_handle *handle, unsigned int event_type, unsigned int event_code, int value) { /* We are called with interrupts disabled, just take the lock */ scoped_guard(spinlock, &kbd_event_lock) { if (event_type == EV_MSC && event_code == MSC_RAW && kbd_is_hw_raw(handle->dev)) kbd_rawcode(value); if (event_type == EV_KEY && event_code <= KEY_MAX) kbd_keycode(event_code, value, kbd_is_hw_raw(handle->dev)); } tasklet_schedule(&keyboard_tasklet); do_poke_blanked_console = 1; schedule_console_callback(); } static bool kbd_match(struct input_handler *handler, struct input_dev *dev) { if (test_bit(EV_SND, dev->evbit)) return true; if (test_bit(EV_KEY, dev->evbit)) { if (find_next_bit(dev->keybit, BTN_MISC, KEY_RESERVED) < BTN_MISC) return true; if (find_next_bit(dev->keybit, KEY_BRL_DOT10 + 1, KEY_BRL_DOT1) <= KEY_BRL_DOT10) return true; } return false; } /* * When a keyboard (or other input device) is found, the kbd_connect * function is called. The function then looks at the device, and if it * likes it, it can open it and get events from it. In this (kbd_connect) * function, we should decide which VT to bind that keyboard to initially. */ static int kbd_connect(struct input_handler *handler, struct input_dev *dev, const struct input_device_id *id) { int error; struct input_handle __free(kfree) *handle = kzalloc_obj(*handle); if (!handle) return -ENOMEM; handle->dev = dev; handle->handler = handler; handle->name = "kbd"; error = input_register_handle(handle); if (error) return error; error = input_open_device(handle); if (error) goto err_unregister_handle; retain_and_null_ptr(handle); return 0; err_unregister_handle: input_unregister_handle(handle); return error; } static void kbd_disconnect(struct input_handle *handle) { input_close_device(handle); input_unregister_handle(handle); kfree(handle); } /* * Start keyboard handler on the new keyboard by refreshing LED state to * match the rest of the system. 
*/ static void kbd_start(struct input_handle *handle) { tasklet_disable(&keyboard_tasklet); if (ledstate != -1U) kbd_update_leds_helper(handle, &ledstate); tasklet_enable(&keyboard_tasklet); } static const struct input_device_id kbd_ids[] = { { .flags = INPUT_DEVICE_ID_MATCH_EVBIT, .evbit = { BIT_MASK(EV_KEY) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT, .evbit = { BIT_MASK(EV_SND) }, }, { }, /* Terminating entry */ }; MODULE_DEVICE_TABLE(input, kbd_ids); static struct input_handler kbd_handler = { .event = kbd_event, .match = kbd_match, .connect = kbd_connect, .disconnect = kbd_disconnect, .start = kbd_start, .name = "kbd", .id_table = kbd_ids, }; int __init kbd_init(void) { int i; int error; for (i = 0; i < MAX_NR_CONSOLES; i++) { kbd_table[i].ledflagstate = kbd_defleds(); kbd_table[i].default_ledflagstate = kbd_defleds(); kbd_table[i].ledmode = LED_SHOW_FLAGS; kbd_table[i].lockstate = KBD_DEFLOCK; kbd_table[i].slockstate = 0; kbd_table[i].modeflags = KBD_DEFMODE; kbd_table[i].kbdmode = default_utf8 ? VC_UNICODE : VC_XLATE; } kbd_init_leds(); error = input_register_handler(&kbd_handler); if (error) return error; tasklet_enable(&keyboard_tasklet); tasklet_schedule(&keyboard_tasklet); return 0; } /* Ioctl support code */ static int vt_do_kdgkbdiacr(void __user *udp) { struct kbdiacrs __user *a = udp; int i, asize; struct kbdiacr __free(kfree) *dia = kmalloc_array(MAX_DIACR, sizeof(struct kbdiacr), GFP_KERNEL); if (!dia) return -ENOMEM; /* Lock the diacriticals table, make a copy and then copy it after we unlock */ scoped_guard(spinlock_irqsave, &kbd_event_lock) { asize = accent_table_size; for (i = 0; i < asize; i++) { dia[i].diacr = conv_uni_to_8bit(accent_table[i].diacr); dia[i].base = conv_uni_to_8bit(accent_table[i].base); dia[i].result = conv_uni_to_8bit(accent_table[i].result); } } if (put_user(asize, &a->kb_cnt)) return -EFAULT; if (copy_to_user(a->kbdiacr, dia, asize * sizeof(struct kbdiacr))) return -EFAULT; return 0; } static int vt_do_kdgkbdiacruc(void __user *udp) { struct kbdiacrsuc __user *a = udp; int asize; void __free(kfree) *buf = kmalloc_array(MAX_DIACR, sizeof(struct kbdiacruc), GFP_KERNEL); if (buf == NULL) return -ENOMEM; /* Lock the diacriticals table, make a copy and then copy it after we unlock */ scoped_guard(spinlock_irqsave, &kbd_event_lock) { asize = accent_table_size; memcpy(buf, accent_table, asize * sizeof(struct kbdiacruc)); } if (put_user(asize, &a->kb_cnt)) return -EFAULT; if (copy_to_user(a->kbdiacruc, buf, asize * sizeof(struct kbdiacruc))) return -EFAULT; return 0; } static int vt_do_kdskbdiacr(void __user *udp, int perm) { struct kbdiacrs __user *a = udp; struct kbdiacr __free(kfree) *dia = NULL; unsigned int ct; int i; if (!perm) return -EPERM; if (get_user(ct, &a->kb_cnt)) return -EFAULT; if (ct >= MAX_DIACR) return -EINVAL; if (ct) { dia = memdup_array_user(a->kbdiacr, ct, sizeof(struct kbdiacr)); if (IS_ERR(dia)) return PTR_ERR(dia); } guard(spinlock_irqsave)(&kbd_event_lock); accent_table_size = ct; for (i = 0; i < ct; i++) { accent_table[i].diacr = conv_8bit_to_uni(dia[i].diacr); accent_table[i].base = conv_8bit_to_uni(dia[i].base); accent_table[i].result = conv_8bit_to_uni(dia[i].result); } return 0; } static int vt_do_kdskbdiacruc(void __user *udp, int perm) { struct kbdiacrsuc __user *a = udp; unsigned int ct; void __free(kfree) *buf = NULL; if (!perm) return -EPERM; if (get_user(ct, &a->kb_cnt)) return -EFAULT; if (ct >= MAX_DIACR) return -EINVAL; if (ct) { buf = memdup_array_user(a->kbdiacruc, ct, sizeof(struct kbdiacruc)); if (IS_ERR(buf)) 
return PTR_ERR(buf); } guard(spinlock_irqsave)(&kbd_event_lock); if (ct) memcpy(accent_table, buf, ct * sizeof(struct kbdiacruc)); accent_table_size = ct; return 0; } /** * vt_do_diacrit - diacritical table updates * @cmd: ioctl request * @udp: pointer to user data for ioctl * @perm: permissions check computed by caller * * Update the diacritical tables atomically and safely. Lock them * against simultaneous keypresses */ int vt_do_diacrit(unsigned int cmd, void __user *udp, int perm) { switch (cmd) { case KDGKBDIACR: return vt_do_kdgkbdiacr(udp); case KDGKBDIACRUC: return vt_do_kdgkbdiacruc(udp); case KDSKBDIACR: return vt_do_kdskbdiacr(udp, perm); case KDSKBDIACRUC: return vt_do_kdskbdiacruc(udp, perm); } return 0; } /** * vt_do_kdskbmode - set keyboard mode ioctl * @console: the console to use * @arg: the requested mode * * Update the keyboard mode bits while holding the correct locks. * Return 0 for success or an error code. */ int vt_do_kdskbmode(unsigned int console, unsigned int arg) { struct kbd_struct *kb = &kbd_table[console]; guard(spinlock_irqsave)(&kbd_event_lock); switch(arg) { case K_RAW: kb->kbdmode = VC_RAW; return 0; case K_MEDIUMRAW: kb->kbdmode = VC_MEDIUMRAW; return 0; case K_XLATE: kb->kbdmode = VC_XLATE; do_compute_shiftstate(); return 0; case K_UNICODE: kb->kbdmode = VC_UNICODE; do_compute_shiftstate(); return 0; case K_OFF: kb->kbdmode = VC_OFF; return 0; default: return -EINVAL; } } /** * vt_do_kdskbmeta - set keyboard meta state * @console: the console to use * @arg: the requested meta state * * Update the keyboard meta bits while holding the correct locks. * Return 0 for success or an error code. */ int vt_do_kdskbmeta(unsigned int console, unsigned int arg) { struct kbd_struct *kb = &kbd_table[console]; guard(spinlock_irqsave)(&kbd_event_lock); switch(arg) { case K_METABIT: clr_vc_kbd_mode(kb, VC_META); return 0; case K_ESCPREFIX: set_vc_kbd_mode(kb, VC_META); return 0; default: return -EINVAL; } } int vt_do_kbkeycode_ioctl(int cmd, struct kbkeycode __user *user_kbkc, int perm) { struct kbkeycode tmp; int kc; if (copy_from_user(&tmp, user_kbkc, sizeof(struct kbkeycode))) return -EFAULT; switch (cmd) { case KDGETKEYCODE: kc = getkeycode(tmp.scancode); if (kc < 0) return kc; return put_user(kc, &user_kbkc->keycode); case KDSETKEYCODE: if (!perm) return -EPERM; return setkeycode(tmp.scancode, tmp.keycode); } return 0; } static unsigned short vt_kdgkbent(unsigned char kbdmode, unsigned char idx, unsigned char map) { unsigned short *key_map; /* Ensure another thread doesn't free it under us */ guard(spinlock_irqsave)(&kbd_event_lock); key_map = key_maps[map]; if (key_map) { unsigned short val = U(key_map[idx]); if (kbdmode != VC_UNICODE && KTYP(val) >= NR_TYPES) return K_HOLE; return val; } return idx ? 
K_HOLE : K_NOSUCHMAP; } static int vt_kdskbent(unsigned char kbdmode, unsigned char idx, unsigned char map, unsigned short val) { unsigned short *key_map, oldval; if (!idx && val == K_NOSUCHMAP) { guard(spinlock_irqsave)(&kbd_event_lock); /* deallocate map */ key_map = key_maps[map]; if (map && key_map) { key_maps[map] = NULL; if (key_map[0] == U(K_ALLOCATED)) { kfree(key_map); keymap_count--; } } return 0; } if (KTYP(val) < NR_TYPES) { if (KVAL(val) > max_vals[KTYP(val)]) return -EINVAL; } else if (kbdmode != VC_UNICODE) return -EINVAL; /* ++Geert: non-PC keyboards may generate keycode zero */ #if !defined(__mc68000__) && !defined(__powerpc__) /* assignment to entry 0 only tests validity of args */ if (!idx) return 0; #endif unsigned short __free(kfree) *new_map = kmalloc(sizeof(plain_map), GFP_KERNEL); if (!new_map) return -ENOMEM; guard(spinlock_irqsave)(&kbd_event_lock); key_map = key_maps[map]; if (key_map == NULL) { int j; if (keymap_count >= MAX_NR_OF_USER_KEYMAPS && !capable(CAP_SYS_RESOURCE)) return -EPERM; key_map = key_maps[map] = no_free_ptr(new_map); key_map[0] = U(K_ALLOCATED); for (j = 1; j < NR_KEYS; j++) key_map[j] = U(K_HOLE); keymap_count++; } oldval = U(key_map[idx]); if (val == oldval) return 0; /* Attention Key */ if ((oldval == K_SAK || val == K_SAK) && !capable(CAP_SYS_ADMIN)) return -EPERM; key_map[idx] = U(val); if (!map && (KTYP(oldval) == KT_SHIFT || KTYP(val) == KT_SHIFT)) do_compute_shiftstate(); return 0; } int vt_do_kdsk_ioctl(int cmd, struct kbentry __user *user_kbe, int perm, unsigned int console) { struct kbd_struct *kb = &kbd_table[console]; struct kbentry kbe; if (copy_from_user(&kbe, user_kbe, sizeof(struct kbentry))) return -EFAULT; switch (cmd) { case KDGKBENT: return put_user(vt_kdgkbent(kb->kbdmode, kbe.kb_index, kbe.kb_table), &user_kbe->kb_value); case KDSKBENT: if (!perm || !capable(CAP_SYS_TTY_CONFIG)) return -EPERM; return vt_kdskbent(kb->kbdmode, kbe.kb_index, kbe.kb_table, kbe.kb_value); } return 0; } static char *vt_kdskbsent(char *kbs, unsigned char cur) { static DECLARE_BITMAP(is_kmalloc, MAX_NR_FUNC); char *cur_f = func_table[cur]; if (cur_f && strlen(cur_f) >= strlen(kbs)) { strcpy(cur_f, kbs); return kbs; } func_table[cur] = kbs; return __test_and_set_bit(cur, is_kmalloc) ? cur_f : NULL; } int vt_do_kdgkb_ioctl(int cmd, struct kbsentry __user *user_kdgkb, int perm) { unsigned char kb_func; if (get_user(kb_func, &user_kdgkb->kb_func)) return -EFAULT; kb_func = array_index_nospec(kb_func, MAX_NR_FUNC); switch (cmd) { case KDGKBSENT: { /* size should have been a struct member */ ssize_t len = sizeof(user_kdgkb->kb_string); char __free(kfree) *kbs = kmalloc(len, GFP_KERNEL); if (!kbs) return -ENOMEM; scoped_guard(spinlock_irqsave, &func_buf_lock) len = strscpy(kbs, func_table[kb_func] ? 
: "", len); if (len < 0) return -ENOSPC; if (copy_to_user(user_kdgkb->kb_string, kbs, len + 1)) return -EFAULT; return 0; } case KDSKBSENT: if (!perm || !capable(CAP_SYS_TTY_CONFIG)) return -EPERM; char __free(kfree) *kbs = strndup_user(user_kdgkb->kb_string, sizeof(user_kdgkb->kb_string)); if (IS_ERR(kbs)) return PTR_ERR(kbs); guard(spinlock_irqsave)(&func_buf_lock); kbs = vt_kdskbsent(kbs, kb_func); return 0; } return 0; } int vt_do_kdskled(unsigned int console, int cmd, unsigned long arg, int perm) { struct kbd_struct *kb = &kbd_table[console]; unsigned char ucval; switch(cmd) { /* the ioctls below read/set the flags usually shown in the leds */ /* don't use them - they will go away without warning */ case KDGKBLED: scoped_guard(spinlock_irqsave, &kbd_event_lock) ucval = kb->ledflagstate | (kb->default_ledflagstate << 4); return put_user(ucval, (char __user *)arg); case KDSKBLED: if (!perm) return -EPERM; if (arg & ~0x77) return -EINVAL; scoped_guard(spinlock_irqsave, &led_lock) { kb->ledflagstate = (arg & 7); kb->default_ledflagstate = ((arg >> 4) & 7); set_leds(); } return 0; /* the ioctls below only set the lights, not the functions */ /* for those, see KDGKBLED and KDSKBLED above */ case KDGETLED: ucval = getledstate(); return put_user(ucval, (char __user *)arg); case KDSETLED: if (!perm) return -EPERM; setledstate(kb, arg); return 0; } return -ENOIOCTLCMD; } int vt_do_kdgkbmode(unsigned int console) { struct kbd_struct *kb = &kbd_table[console]; /* This is a spot read so needs no locking */ switch (kb->kbdmode) { case VC_RAW: return K_RAW; case VC_MEDIUMRAW: return K_MEDIUMRAW; case VC_UNICODE: return K_UNICODE; case VC_OFF: return K_OFF; default: return K_XLATE; } } /** * vt_do_kdgkbmeta - report meta status * @console: console to report * * Report the meta flag status of this console */ int vt_do_kdgkbmeta(unsigned int console) { struct kbd_struct *kb = &kbd_table[console]; /* Again a spot read so no locking */ return vc_kbd_mode(kb, VC_META) ? K_ESCPREFIX : K_METABIT; } /** * vt_reset_unicode - reset the unicode status * @console: console being reset * * Restore the unicode console state to its default */ void vt_reset_unicode(unsigned int console) { guard(spinlock_irqsave)(&kbd_event_lock); kbd_table[console].kbdmode = default_utf8 ? VC_UNICODE : VC_XLATE; } /** * vt_get_shift_state - shift bit state * * Report the shift bits from the keyboard state. We have to export * this to support some oddities in the vt layer. */ int vt_get_shift_state(void) { /* Don't lock as this is a transient report */ return shift_state; } /** * vt_reset_keyboard - reset keyboard state * @console: console to reset * * Reset the keyboard bits for a console as part of a general console * reset event */ void vt_reset_keyboard(unsigned int console) { struct kbd_struct *kb = &kbd_table[console]; guard(spinlock_irqsave)(&kbd_event_lock); set_vc_kbd_mode(kb, VC_REPEAT); clr_vc_kbd_mode(kb, VC_CKMODE); clr_vc_kbd_mode(kb, VC_APPLIC); clr_vc_kbd_mode(kb, VC_CRLF); kb->lockstate = 0; kb->slockstate = 0; guard(spinlock)(&led_lock); kb->ledmode = LED_SHOW_FLAGS; kb->ledflagstate = kb->default_ledflagstate; /* do not do set_leds here because this causes an endless tasklet loop when the keyboard hasn't been initialized yet */ } /** * vt_get_kbd_mode_bit - read keyboard status bits * @console: console to read from * @bit: mode bit to read * * Report back a vt mode bit. 
We do this without locking so the * caller must be sure that there are no synchronization needs */ int vt_get_kbd_mode_bit(unsigned int console, int bit) { struct kbd_struct *kb = &kbd_table[console]; return vc_kbd_mode(kb, bit); } /** * vt_set_kbd_mode_bit - set a keyboard mode bit * @console: console to modify * @bit: mode bit to set * * Set a vt mode bit. The keyboard event lock is taken internally while the * bit is updated. */ void vt_set_kbd_mode_bit(unsigned int console, int bit) { struct kbd_struct *kb = &kbd_table[console]; guard(spinlock_irqsave)(&kbd_event_lock); set_vc_kbd_mode(kb, bit); } /** * vt_clr_kbd_mode_bit - clear a keyboard mode bit * @console: console to modify * @bit: mode bit to clear * * Clear a vt mode bit. The keyboard event lock is taken internally while the * bit is updated. */ void vt_clr_kbd_mode_bit(unsigned int console, int bit) { struct kbd_struct *kb = &kbd_table[console]; guard(spinlock_irqsave)(&kbd_event_lock); clr_vc_kbd_mode(kb, bit); }
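/*
 * Illustrative user-space sketch (not part of this driver): a minimal program
 * exercising the KDGKBLED/KDSKBLED ioctls that vt_do_kdskled() above
 * implements.  Hypothetical example code; /dev/tty0 and the bare-bones error
 * handling are assumptions made for brevity.
 */
#if 0	/* example only, never compiled with the kernel */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/kd.h>

int main(void)
{
	char flags = 0;
	int fd = open("/dev/tty0", O_RDWR);

	if (fd < 0)
		return 1;

	/* Low nibble: current lock flags; high nibble: default flags. */
	if (ioctl(fd, KDGKBLED, &flags) == 0)
		printf("numlock %s, capslock %s\n",
		       (flags & LED_NUM) ? "on" : "off",
		       (flags & LED_CAP) ? "on" : "off");

	/* Set the NumLock flag, preserving the other flags and the defaults. */
	ioctl(fd, KDSKBLED, (flags & 0x77) | LED_NUM);

	close(fd);
	return 0;
}
#endif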
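/*
 * Illustrative sketch (hypothetical, not part of the file that follows): a
 * minimal client of the associative array implemented below, keyed by a
 * single unsigned long.  The "example_*" names are invented for illustration;
 * only assoc_array_find() and the ops members it relies on (get_key_chunk,
 * get_object_key_chunk, compare_object, free_object) come from the
 * implementation below.
 */
#if 0	/* example only */
struct example_object {
	unsigned long	key;
	const char	*name;
};

/* The whole key fits in one machine-word chunk, so every level sees it. */
static unsigned long example_get_key_chunk(const void *index_key, int level)
{
	return *(const unsigned long *)index_key;
}

static unsigned long example_get_object_key_chunk(const void *object, int level)
{
	return ((const struct example_object *)object)->key;
}

static bool example_compare_object(const void *object, const void *index_key)
{
	return ((const struct example_object *)object)->key ==
		*(const unsigned long *)index_key;
}

static void example_free_object(void *object)
{
	kfree(object);
}

static const struct assoc_array_ops example_ops = {
	.get_key_chunk		= example_get_key_chunk,
	.get_object_key_chunk	= example_get_object_key_chunk,
	.compare_object		= example_compare_object,
	.free_object		= example_free_object,
};

/* Look an object up under the RCU read lock. */
static struct example_object *example_lookup(struct assoc_array *array,
					     unsigned long key)
{
	struct example_object *obj;

	rcu_read_lock();
	obj = assoc_array_find(array, &example_ops, &key);
	rcu_read_unlock();
	return obj;
}
#endif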
// SPDX-License-Identifier: GPL-2.0-or-later /* Generic associative array implementation. * * See Documentation/core-api/assoc_array.rst for information. * * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ //#define DEBUG #include <linux/rcupdate.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/assoc_array_priv.h> /* * Iterate over an associative array. The caller must hold the RCU read lock * or better. */ static int assoc_array_subtree_iterate(const struct assoc_array_ptr *root, const struct assoc_array_ptr *stop, int (*iterator)(const void *leaf, void *iterator_data), void *iterator_data) { const struct assoc_array_shortcut *shortcut; const struct assoc_array_node *node; const struct assoc_array_ptr *cursor, *ptr, *parent; unsigned long has_meta; int slot, ret; cursor = root; begin_node: if (assoc_array_ptr_is_shortcut(cursor)) { /* Descend through a shortcut */ shortcut = assoc_array_ptr_to_shortcut(cursor); cursor = READ_ONCE(shortcut->next_node); /* Address dependency. */ } node = assoc_array_ptr_to_node(cursor); slot = 0; /* We perform two passes of each node. * * The first pass does all the leaves in this node. This means we * don't miss any leaves if the node is split up by insertion whilst * we're iterating over the branches rooted here (we may, however, see * some leaves twice). */ has_meta = 0; for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = READ_ONCE(node->slots[slot]); /* Address dependency. */ has_meta |= (unsigned long)ptr; if (ptr && assoc_array_ptr_is_leaf(ptr)) { /* We need a barrier between the read of the pointer, * which is supplied by the above READ_ONCE(). */ /* Invoke the callback */ ret = iterator(assoc_array_ptr_to_leaf(ptr), iterator_data); if (ret) return ret; } } /* The second pass attends to all the metadata pointers. If we follow * one of these we may find that we don't come back here, but rather go * back to a replacement node with the leaves in a different layout. * * We are guaranteed to make progress, however, as the slot number for * a particular portion of the key space cannot change - and we * continue at the back pointer + 1. */ if (!(has_meta & ASSOC_ARRAY_PTR_META_TYPE)) goto finished_node; slot = 0; continue_node: node = assoc_array_ptr_to_node(cursor); for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = READ_ONCE(node->slots[slot]); /* Address dependency. */ if (assoc_array_ptr_is_meta(ptr)) { cursor = ptr; goto begin_node; } } finished_node: /* Move up to the parent (may need to skip back over a shortcut) */ parent = READ_ONCE(node->back_pointer); /* Address dependency. */ slot = node->parent_slot; if (parent == stop) return 0; if (assoc_array_ptr_is_shortcut(parent)) { shortcut = assoc_array_ptr_to_shortcut(parent); cursor = parent; parent = READ_ONCE(shortcut->back_pointer); /* Address dependency.
*/ slot = shortcut->parent_slot; if (parent == stop) return 0; } /* Ascend to next slot in parent node */ cursor = parent; slot++; goto continue_node; } /** * assoc_array_iterate - Pass all objects in the array to a callback * @array: The array to iterate over. * @iterator: The callback function. * @iterator_data: Private data for the callback function. * * Iterate over all the objects in an associative array. Each one will be * presented to the iterator function. * * If the array is being modified concurrently with the iteration then it is * possible that some objects in the array will be passed to the iterator * callback more than once - though every object should be passed at least * once. If this is undesirable then the caller must lock against modification * for the duration of this function. * * The function will return 0 if no objects were in the array or else it will * return the result of the last iterator function called. Iteration stops * immediately if any call to the iteration function results in a non-zero * return. * * The caller should hold the RCU read lock or better if concurrent * modification is possible. */ int assoc_array_iterate(const struct assoc_array *array, int (*iterator)(const void *object, void *iterator_data), void *iterator_data) { struct assoc_array_ptr *root = READ_ONCE(array->root); /* Address dependency. */ if (!root) return 0; return assoc_array_subtree_iterate(root, NULL, iterator, iterator_data); } enum assoc_array_walk_status { assoc_array_walk_tree_empty, assoc_array_walk_found_terminal_node, assoc_array_walk_found_wrong_shortcut, }; struct assoc_array_walk_result { struct { struct assoc_array_node *node; /* Node in which leaf might be found */ int level; int slot; } terminal_node; struct { struct assoc_array_shortcut *shortcut; int level; int sc_level; unsigned long sc_segments; unsigned long dissimilarity; } wrong_shortcut; }; /* * Navigate through the internal tree looking for the closest node to the key. */ static enum assoc_array_walk_status assoc_array_walk(const struct assoc_array *array, const struct assoc_array_ops *ops, const void *index_key, struct assoc_array_walk_result *result) { struct assoc_array_shortcut *shortcut; struct assoc_array_node *node; struct assoc_array_ptr *cursor, *ptr; unsigned long sc_segments, dissimilarity; unsigned long segments; int level, sc_level, next_sc_level; int slot; pr_devel("-->%s()\n", __func__); cursor = READ_ONCE(array->root); /* Address dependency. */ if (!cursor) return assoc_array_walk_tree_empty; level = 0; /* Use segments from the key for the new leaf to navigate through the * internal tree, skipping through nodes and shortcuts that are on * route to the destination. Eventually we'll come to a slot that is * either empty or contains a leaf at which point we've found a node in * which the leaf we're looking for might be found or into which it * should be inserted. */ jumped: segments = ops->get_key_chunk(index_key, level); pr_devel("segments[%d]: %lx\n", level, segments); if (assoc_array_ptr_is_shortcut(cursor)) goto follow_shortcut; consider_node: node = assoc_array_ptr_to_node(cursor); slot = segments >> (level & ASSOC_ARRAY_KEY_CHUNK_MASK); slot &= ASSOC_ARRAY_FAN_MASK; ptr = READ_ONCE(node->slots[slot]); /* Address dependency. */ pr_devel("consider slot %x [ix=%d type=%lu]\n", slot, level, (unsigned long)ptr & 3); if (!assoc_array_ptr_is_meta(ptr)) { /* The node doesn't have a node/shortcut pointer in the slot * corresponding to the index key that we have to follow. 
*/ result->terminal_node.node = node; result->terminal_node.level = level; result->terminal_node.slot = slot; pr_devel("<--%s() = terminal_node\n", __func__); return assoc_array_walk_found_terminal_node; } if (assoc_array_ptr_is_node(ptr)) { /* There is a pointer to a node in the slot corresponding to * this index key segment, so we need to follow it. */ cursor = ptr; level += ASSOC_ARRAY_LEVEL_STEP; if ((level & ASSOC_ARRAY_KEY_CHUNK_MASK) != 0) goto consider_node; goto jumped; } /* There is a shortcut in the slot corresponding to the index key * segment. We follow the shortcut if its partial index key matches * this leaf's. Otherwise we need to split the shortcut. */ cursor = ptr; follow_shortcut: shortcut = assoc_array_ptr_to_shortcut(cursor); pr_devel("shortcut to %d\n", shortcut->skip_to_level); sc_level = level + ASSOC_ARRAY_LEVEL_STEP; BUG_ON(sc_level > shortcut->skip_to_level); do { /* Check the leaf against the shortcut's index key a word at a * time, trimming the final word (the shortcut stores the index * key completely from the root to the shortcut's target). */ if ((sc_level & ASSOC_ARRAY_KEY_CHUNK_MASK) == 0) segments = ops->get_key_chunk(index_key, sc_level); sc_segments = shortcut->index_key[sc_level >> ASSOC_ARRAY_KEY_CHUNK_SHIFT]; dissimilarity = segments ^ sc_segments; if (round_up(sc_level, ASSOC_ARRAY_KEY_CHUNK_SIZE) > shortcut->skip_to_level) { /* Trim segments that are beyond the shortcut */ int shift = shortcut->skip_to_level & ASSOC_ARRAY_KEY_CHUNK_MASK; dissimilarity &= ~(ULONG_MAX << shift); next_sc_level = shortcut->skip_to_level; } else { next_sc_level = sc_level + ASSOC_ARRAY_KEY_CHUNK_SIZE; next_sc_level = round_down(next_sc_level, ASSOC_ARRAY_KEY_CHUNK_SIZE); } if (dissimilarity != 0) { /* This shortcut points elsewhere */ result->wrong_shortcut.shortcut = shortcut; result->wrong_shortcut.level = level; result->wrong_shortcut.sc_level = sc_level; result->wrong_shortcut.sc_segments = sc_segments; result->wrong_shortcut.dissimilarity = dissimilarity; return assoc_array_walk_found_wrong_shortcut; } sc_level = next_sc_level; } while (sc_level < shortcut->skip_to_level); /* The shortcut matches the leaf's index to this point. */ cursor = READ_ONCE(shortcut->next_node); /* Address dependency. */ if (((level ^ sc_level) & ~ASSOC_ARRAY_KEY_CHUNK_MASK) != 0) { level = sc_level; goto jumped; } else { level = sc_level; goto consider_node; } } /** * assoc_array_find - Find an object by index key * @array: The associative array to search. * @ops: The operations to use. * @index_key: The key to the object. * * Find an object in an associative array by walking through the internal tree * to the node that should contain the object and then searching the leaves * there. NULL is returned if the requested object was not found in the array. * * The caller must hold the RCU read lock or better. */ void *assoc_array_find(const struct assoc_array *array, const struct assoc_array_ops *ops, const void *index_key) { struct assoc_array_walk_result result; const struct assoc_array_node *node; const struct assoc_array_ptr *ptr; const void *leaf; int slot; if (assoc_array_walk(array, ops, index_key, &result) != assoc_array_walk_found_terminal_node) return NULL; node = result.terminal_node.node; /* If the target key is available to us, it's has to be pointed to by * the terminal node. */ for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = READ_ONCE(node->slots[slot]); /* Address dependency. 
*/ if (ptr && assoc_array_ptr_is_leaf(ptr)) { /* We need a barrier between the read of the pointer * and dereferencing the pointer - but only if we are * actually going to dereference it. */ leaf = assoc_array_ptr_to_leaf(ptr); if (ops->compare_object(leaf, index_key)) return (void *)leaf; } } return NULL; } /* * Destructively iterate over an associative array. The caller must prevent * other simultaneous accesses. */ static void assoc_array_destroy_subtree(struct assoc_array_ptr *root, const struct assoc_array_ops *ops) { struct assoc_array_shortcut *shortcut; struct assoc_array_node *node; struct assoc_array_ptr *cursor, *parent = NULL; int slot = -1; pr_devel("-->%s()\n", __func__); cursor = root; if (!cursor) { pr_devel("empty\n"); return; } move_to_meta: if (assoc_array_ptr_is_shortcut(cursor)) { /* Descend through a shortcut */ pr_devel("[%d] shortcut\n", slot); BUG_ON(!assoc_array_ptr_is_shortcut(cursor)); shortcut = assoc_array_ptr_to_shortcut(cursor); BUG_ON(shortcut->back_pointer != parent); BUG_ON(slot != -1 && shortcut->parent_slot != slot); parent = cursor; cursor = shortcut->next_node; slot = -1; BUG_ON(!assoc_array_ptr_is_node(cursor)); } pr_devel("[%d] node\n", slot); node = assoc_array_ptr_to_node(cursor); BUG_ON(node->back_pointer != parent); BUG_ON(slot != -1 && node->parent_slot != slot); slot = 0; continue_node: pr_devel("Node %p [back=%p]\n", node, node->back_pointer); for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { struct assoc_array_ptr *ptr = node->slots[slot]; if (!ptr) continue; if (assoc_array_ptr_is_meta(ptr)) { parent = cursor; cursor = ptr; goto move_to_meta; } if (ops) { pr_devel("[%d] free leaf\n", slot); ops->free_object(assoc_array_ptr_to_leaf(ptr)); } } parent = node->back_pointer; slot = node->parent_slot; pr_devel("free node\n"); kfree(node); if (!parent) return; /* Done */ /* Move back up to the parent (may need to free a shortcut on * the way up) */ if (assoc_array_ptr_is_shortcut(parent)) { shortcut = assoc_array_ptr_to_shortcut(parent); BUG_ON(shortcut->next_node != cursor); cursor = parent; parent = shortcut->back_pointer; slot = shortcut->parent_slot; pr_devel("free shortcut\n"); kfree(shortcut); if (!parent) return; BUG_ON(!assoc_array_ptr_is_node(parent)); } /* Ascend to next slot in parent node */ pr_devel("ascend to %p[%d]\n", parent, slot); cursor = parent; node = assoc_array_ptr_to_node(cursor); slot++; goto continue_node; } /** * assoc_array_destroy - Destroy an associative array * @array: The array to destroy. * @ops: The operations to use. * * Discard all metadata and free all objects in an associative array. The * array will be empty and ready to use again upon completion. This function * cannot fail. * * The caller must prevent all other accesses whilst this takes place as no * attempt is made to adjust pointers gracefully to permit RCU readlock-holding * accesses to continue. On the other hand, no memory allocation is required. */ void assoc_array_destroy(struct assoc_array *array, const struct assoc_array_ops *ops) { assoc_array_destroy_subtree(array->root, ops); array->root = NULL; } /* * Handle insertion into an empty tree. 
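* A single node is allocated, the edit script points the array root at it, and the new leaf will land in slot 0 of that node.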
*/ static bool assoc_array_insert_in_empty_tree(struct assoc_array_edit *edit) { struct assoc_array_node *new_n0; pr_devel("-->%s()\n", __func__); new_n0 = kzalloc_obj(struct assoc_array_node); if (!new_n0) return false; edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); edit->leaf_p = &new_n0->slots[0]; edit->adjust_count_on = new_n0; edit->set[0].ptr = &edit->array->root; edit->set[0].to = assoc_array_node_to_ptr(new_n0); pr_devel("<--%s() = ok [no root]\n", __func__); return true; } /* * Handle insertion into a terminal node. */ static bool assoc_array_insert_into_terminal_node(struct assoc_array_edit *edit, const struct assoc_array_ops *ops, const void *index_key, struct assoc_array_walk_result *result) { struct assoc_array_shortcut *shortcut, *new_s0; struct assoc_array_node *node, *new_n0, *new_n1, *side; struct assoc_array_ptr *ptr; unsigned long dissimilarity, base_seg, blank; size_t keylen; bool have_meta; int level, diff; int slot, next_slot, free_slot, i, j; node = result->terminal_node.node; level = result->terminal_node.level; edit->segment_cache[ASSOC_ARRAY_FAN_OUT] = result->terminal_node.slot; pr_devel("-->%s()\n", __func__); /* We arrived at a node which doesn't have an onward node or shortcut * pointer that we have to follow. This means that (a) the leaf we * want must go here (either by insertion or replacement) or (b) we * need to split this node and insert in one of the fragments. */ free_slot = -1; /* Firstly, we have to check the leaves in this node to see if there's * a matching one we should replace in place. */ for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { ptr = node->slots[i]; if (!ptr) { free_slot = i; continue; } if (assoc_array_ptr_is_leaf(ptr) && ops->compare_object(assoc_array_ptr_to_leaf(ptr), index_key)) { pr_devel("replace in slot %d\n", i); edit->leaf_p = &node->slots[i]; edit->dead_leaf = node->slots[i]; pr_devel("<--%s() = ok [replace]\n", __func__); return true; } } /* If there is a free slot in this node then we can just insert the * leaf here. */ if (free_slot >= 0) { pr_devel("insert in free slot %d\n", free_slot); edit->leaf_p = &node->slots[free_slot]; edit->adjust_count_on = node; pr_devel("<--%s() = ok [insert]\n", __func__); return true; } /* The node has no spare slots - so we're either going to have to split * it or insert another node before it. * * Whatever, we're going to need at least two new nodes - so allocate * those now. We may also need a new shortcut, but we deal with that * when we need it. */ new_n0 = kzalloc_obj(struct assoc_array_node); if (!new_n0) return false; edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); new_n1 = kzalloc_obj(struct assoc_array_node); if (!new_n1) return false; edit->new_meta[1] = assoc_array_node_to_ptr(new_n1); /* We need to find out how similar the leaves are. 
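* Each leaf's key segment at this level is cached in edit->segment_cache[]; slots holding metadata pointers are marked 0xff so the comparisons below skip them.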
*/ pr_devel("no spare slots\n"); have_meta = false; for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { ptr = node->slots[i]; if (assoc_array_ptr_is_meta(ptr)) { edit->segment_cache[i] = 0xff; have_meta = true; continue; } base_seg = ops->get_object_key_chunk( assoc_array_ptr_to_leaf(ptr), level); base_seg >>= level & ASSOC_ARRAY_KEY_CHUNK_MASK; edit->segment_cache[i] = base_seg & ASSOC_ARRAY_FAN_MASK; } if (have_meta) { pr_devel("have meta\n"); goto split_node; } /* The node contains only leaves */ dissimilarity = 0; base_seg = edit->segment_cache[0]; for (i = 1; i < ASSOC_ARRAY_FAN_OUT; i++) dissimilarity |= edit->segment_cache[i] ^ base_seg; pr_devel("only leaves; dissimilarity=%lx\n", dissimilarity); if ((dissimilarity & ASSOC_ARRAY_FAN_MASK) == 0) { /* The old leaves all cluster in the same slot. We will need * to insert a shortcut if the new node wants to cluster with them. */ if ((edit->segment_cache[ASSOC_ARRAY_FAN_OUT] ^ base_seg) == 0) goto all_leaves_cluster_together; /* Otherwise all the old leaves cluster in the same slot, but * the new leaf wants to go into a different slot - so we * create a new node (n0) to hold the new leaf and a pointer to * a new node (n1) holding all the old leaves. * * This can be done by falling through to the node splitting * path. */ pr_devel("present leaves cluster but not new leaf\n"); } split_node: pr_devel("split node\n"); /* We need to split the current node. The node must contain anything * from a single leaf (in the one leaf case, this leaf will cluster * with the new leaf) and the rest meta-pointers, to all leaves, some * of which may cluster. * * It won't contain the case in which all the current leaves plus the * new leaves want to cluster in the same slot. * * We need to expel at least two leaves out of a set consisting of the * leaves in the node and the new leaf. The current meta pointers can * just be copied as they shouldn't cluster with any of the leaves. * * We need a new node (n0) to replace the current one and a new node to * take the expelled nodes (n1). */ edit->set[0].to = assoc_array_node_to_ptr(new_n0); new_n0->back_pointer = node->back_pointer; new_n0->parent_slot = node->parent_slot; new_n1->back_pointer = assoc_array_node_to_ptr(new_n0); new_n1->parent_slot = -1; /* Need to calculate this */ do_split_node: pr_devel("do_split_node\n"); new_n0->nr_leaves_on_branch = node->nr_leaves_on_branch; new_n1->nr_leaves_on_branch = 0; /* Begin by finding two matching leaves. There have to be at least two * that match - even if there are meta pointers - because any leaf that * would match a slot with a meta pointer in it must be somewhere * behind that meta pointer and cannot be here. Further, given N * remaining leaf slots, we now have N+1 leaves to go in them. 
*/ for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { slot = edit->segment_cache[i]; if (slot != 0xff) for (j = i + 1; j < ASSOC_ARRAY_FAN_OUT + 1; j++) if (edit->segment_cache[j] == slot) goto found_slot_for_multiple_occupancy; } found_slot_for_multiple_occupancy: pr_devel("same slot: %x %x [%02x]\n", i, j, slot); BUG_ON(i >= ASSOC_ARRAY_FAN_OUT); BUG_ON(j >= ASSOC_ARRAY_FAN_OUT + 1); BUG_ON(slot >= ASSOC_ARRAY_FAN_OUT); new_n1->parent_slot = slot; /* Metadata pointers cannot change slot */ for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) if (assoc_array_ptr_is_meta(node->slots[i])) new_n0->slots[i] = node->slots[i]; else new_n0->slots[i] = NULL; BUG_ON(new_n0->slots[slot] != NULL); new_n0->slots[slot] = assoc_array_node_to_ptr(new_n1); /* Filter the leaf pointers between the new nodes */ free_slot = -1; next_slot = 0; for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { if (assoc_array_ptr_is_meta(node->slots[i])) continue; if (edit->segment_cache[i] == slot) { new_n1->slots[next_slot++] = node->slots[i]; new_n1->nr_leaves_on_branch++; } else { do { free_slot++; } while (new_n0->slots[free_slot] != NULL); new_n0->slots[free_slot] = node->slots[i]; } } pr_devel("filtered: f=%x n=%x\n", free_slot, next_slot); if (edit->segment_cache[ASSOC_ARRAY_FAN_OUT] != slot) { do { free_slot++; } while (new_n0->slots[free_slot] != NULL); edit->leaf_p = &new_n0->slots[free_slot]; edit->adjust_count_on = new_n0; } else { edit->leaf_p = &new_n1->slots[next_slot++]; edit->adjust_count_on = new_n1; } BUG_ON(next_slot <= 1); edit->set_backpointers_to = assoc_array_node_to_ptr(new_n0); for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { if (edit->segment_cache[i] == 0xff) { ptr = node->slots[i]; BUG_ON(assoc_array_ptr_is_leaf(ptr)); if (assoc_array_ptr_is_node(ptr)) { side = assoc_array_ptr_to_node(ptr); edit->set_backpointers[i] = &side->back_pointer; } else { shortcut = assoc_array_ptr_to_shortcut(ptr); edit->set_backpointers[i] = &shortcut->back_pointer; } } } ptr = node->back_pointer; if (!ptr) edit->set[0].ptr = &edit->array->root; else if (assoc_array_ptr_is_node(ptr)) edit->set[0].ptr = &assoc_array_ptr_to_node(ptr)->slots[node->parent_slot]; else edit->set[0].ptr = &assoc_array_ptr_to_shortcut(ptr)->next_node; edit->excised_meta[0] = assoc_array_node_to_ptr(node); pr_devel("<--%s() = ok [split node]\n", __func__); return true; all_leaves_cluster_together: /* All the leaves, new and old, want to cluster together in this node * in the same slot, so we have to replace this node with a shortcut to * skip over the identical parts of the key and then place a pair of * nodes, one inside the other, at the end of the shortcut and * distribute the keys between them. * * Firstly we need to work out where the leaves start diverging as a * bit position into their keys so that we know how big the shortcut * needs to be. * * We only need to make a single pass of N of the N+1 leaves because if * any keys differ between themselves at bit X then at least one of * them must also differ with the base key at bit X or before. 
*/ pr_devel("all leaves cluster together\n"); diff = INT_MAX; for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { int x = ops->diff_objects(assoc_array_ptr_to_leaf(node->slots[i]), index_key); if (x < diff) { BUG_ON(x < 0); diff = x; } } BUG_ON(diff == INT_MAX); BUG_ON(diff < level + ASSOC_ARRAY_LEVEL_STEP); keylen = round_up(diff, ASSOC_ARRAY_KEY_CHUNK_SIZE); keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; new_s0 = kzalloc_flex(*new_s0, index_key, keylen); if (!new_s0) return false; edit->new_meta[2] = assoc_array_shortcut_to_ptr(new_s0); edit->set[0].to = assoc_array_shortcut_to_ptr(new_s0); new_s0->back_pointer = node->back_pointer; new_s0->parent_slot = node->parent_slot; new_s0->next_node = assoc_array_node_to_ptr(new_n0); new_n0->back_pointer = assoc_array_shortcut_to_ptr(new_s0); new_n0->parent_slot = 0; new_n1->back_pointer = assoc_array_node_to_ptr(new_n0); new_n1->parent_slot = -1; /* Need to calculate this */ new_s0->skip_to_level = level = diff & ~ASSOC_ARRAY_LEVEL_STEP_MASK; pr_devel("skip_to_level = %d [diff %d]\n", level, diff); BUG_ON(level <= 0); for (i = 0; i < keylen; i++) new_s0->index_key[i] = ops->get_key_chunk(index_key, i * ASSOC_ARRAY_KEY_CHUNK_SIZE); if (level & ASSOC_ARRAY_KEY_CHUNK_MASK) { blank = ULONG_MAX << (level & ASSOC_ARRAY_KEY_CHUNK_MASK); pr_devel("blank off [%zu] %d: %lx\n", keylen - 1, level, blank); new_s0->index_key[keylen - 1] &= ~blank; } /* This now reduces to a node splitting exercise for which we'll need * to regenerate the disparity table. */ for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { ptr = node->slots[i]; base_seg = ops->get_object_key_chunk(assoc_array_ptr_to_leaf(ptr), level); base_seg >>= level & ASSOC_ARRAY_KEY_CHUNK_MASK; edit->segment_cache[i] = base_seg & ASSOC_ARRAY_FAN_MASK; } base_seg = ops->get_key_chunk(index_key, level); base_seg >>= level & ASSOC_ARRAY_KEY_CHUNK_MASK; edit->segment_cache[ASSOC_ARRAY_FAN_OUT] = base_seg & ASSOC_ARRAY_FAN_MASK; goto do_split_node; } /* * Handle insertion into the middle of a shortcut. */ static bool assoc_array_insert_mid_shortcut(struct assoc_array_edit *edit, const struct assoc_array_ops *ops, struct assoc_array_walk_result *result) { struct assoc_array_shortcut *shortcut, *new_s0, *new_s1; struct assoc_array_node *node, *new_n0, *side; unsigned long sc_segments, dissimilarity, blank; size_t keylen; int level, sc_level, diff; int sc_slot; shortcut = result->wrong_shortcut.shortcut; level = result->wrong_shortcut.level; sc_level = result->wrong_shortcut.sc_level; sc_segments = result->wrong_shortcut.sc_segments; dissimilarity = result->wrong_shortcut.dissimilarity; pr_devel("-->%s(ix=%d dis=%lx scix=%d)\n", __func__, level, dissimilarity, sc_level); /* We need to split a shortcut and insert a node between the two * pieces. Zero-length pieces will be dispensed with entirely. * * First of all, we need to find out in which level the first * difference was. 
*/ diff = __ffs(dissimilarity); diff &= ~ASSOC_ARRAY_LEVEL_STEP_MASK; diff += sc_level & ~ASSOC_ARRAY_KEY_CHUNK_MASK; pr_devel("diff=%d\n", diff); if (!shortcut->back_pointer) { edit->set[0].ptr = &edit->array->root; } else if (assoc_array_ptr_is_node(shortcut->back_pointer)) { node = assoc_array_ptr_to_node(shortcut->back_pointer); edit->set[0].ptr = &node->slots[shortcut->parent_slot]; } else { BUG(); } edit->excised_meta[0] = assoc_array_shortcut_to_ptr(shortcut); /* Create a new node now since we're going to need it anyway */ new_n0 = kzalloc_obj(struct assoc_array_node); if (!new_n0) return false; edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); edit->adjust_count_on = new_n0; /* Insert a new shortcut before the new node if this segment isn't of * zero length - otherwise we just connect the new node directly to the * parent. */ level += ASSOC_ARRAY_LEVEL_STEP; if (diff > level) { pr_devel("pre-shortcut %d...%d\n", level, diff); keylen = round_up(diff, ASSOC_ARRAY_KEY_CHUNK_SIZE); keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; new_s0 = kzalloc_flex(*new_s0, index_key, keylen); if (!new_s0) return false; edit->new_meta[1] = assoc_array_shortcut_to_ptr(new_s0); edit->set[0].to = assoc_array_shortcut_to_ptr(new_s0); new_s0->back_pointer = shortcut->back_pointer; new_s0->parent_slot = shortcut->parent_slot; new_s0->next_node = assoc_array_node_to_ptr(new_n0); new_s0->skip_to_level = diff; new_n0->back_pointer = assoc_array_shortcut_to_ptr(new_s0); new_n0->parent_slot = 0; memcpy(new_s0->index_key, shortcut->index_key, flex_array_size(new_s0, index_key, keylen)); blank = ULONG_MAX << (diff & ASSOC_ARRAY_KEY_CHUNK_MASK); pr_devel("blank off [%zu] %d: %lx\n", keylen - 1, diff, blank); new_s0->index_key[keylen - 1] &= ~blank; } else { pr_devel("no pre-shortcut\n"); edit->set[0].to = assoc_array_node_to_ptr(new_n0); new_n0->back_pointer = shortcut->back_pointer; new_n0->parent_slot = shortcut->parent_slot; } side = assoc_array_ptr_to_node(shortcut->next_node); new_n0->nr_leaves_on_branch = side->nr_leaves_on_branch; /* We need to know which slot in the new node is going to take a * metadata pointer. */ sc_slot = sc_segments >> (diff & ASSOC_ARRAY_KEY_CHUNK_MASK); sc_slot &= ASSOC_ARRAY_FAN_MASK; pr_devel("new slot %lx >> %d -> %d\n", sc_segments, diff & ASSOC_ARRAY_KEY_CHUNK_MASK, sc_slot); /* Determine whether we need to follow the new node with a replacement * for the current shortcut. We could in theory reuse the current * shortcut if its parent slot number doesn't change - but that's a * 1-in-16 chance so not worth expending the code upon. 
*/ level = diff + ASSOC_ARRAY_LEVEL_STEP; if (level < shortcut->skip_to_level) { pr_devel("post-shortcut %d...%d\n", level, shortcut->skip_to_level); keylen = round_up(shortcut->skip_to_level, ASSOC_ARRAY_KEY_CHUNK_SIZE); keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; new_s1 = kzalloc_flex(*new_s1, index_key, keylen); if (!new_s1) return false; edit->new_meta[2] = assoc_array_shortcut_to_ptr(new_s1); new_s1->back_pointer = assoc_array_node_to_ptr(new_n0); new_s1->parent_slot = sc_slot; new_s1->next_node = shortcut->next_node; new_s1->skip_to_level = shortcut->skip_to_level; new_n0->slots[sc_slot] = assoc_array_shortcut_to_ptr(new_s1); memcpy(new_s1->index_key, shortcut->index_key, flex_array_size(new_s1, index_key, keylen)); edit->set[1].ptr = &side->back_pointer; edit->set[1].to = assoc_array_shortcut_to_ptr(new_s1); } else { pr_devel("no post-shortcut\n"); /* We don't have to replace the pointed-to node as long as we * use memory barriers to make sure the parent slot number is * changed before the back pointer (the parent slot number is * irrelevant to the old parent shortcut). */ new_n0->slots[sc_slot] = shortcut->next_node; edit->set_parent_slot[0].p = &side->parent_slot; edit->set_parent_slot[0].to = sc_slot; edit->set[1].ptr = &side->back_pointer; edit->set[1].to = assoc_array_node_to_ptr(new_n0); } /* Install the new leaf in a spare slot in the new node. */ if (sc_slot == 0) edit->leaf_p = &new_n0->slots[1]; else edit->leaf_p = &new_n0->slots[0]; pr_devel("<--%s() = ok [split shortcut]\n", __func__); return true; } /** * assoc_array_insert - Script insertion of an object into an associative array * @array: The array to insert into. * @ops: The operations to use. * @index_key: The key to insert at. * @object: The object to insert. * * Precalculate and preallocate a script for the insertion or replacement of an * object in an associative array. This results in an edit script that can * either be applied or cancelled. * * The function returns a pointer to an edit script or -ENOMEM. * * The caller should lock against other modifications and must continue to hold * the lock until assoc_array_apply_edit() has been called. * * Accesses to the tree may take place concurrently with this function, * provided they hold the RCU read lock. */ struct assoc_array_edit *assoc_array_insert(struct assoc_array *array, const struct assoc_array_ops *ops, const void *index_key, void *object) { struct assoc_array_walk_result result; struct assoc_array_edit *edit; pr_devel("-->%s()\n", __func__); /* The leaf pointer we're given must not have the bottom bit set as we * use those for type-marking the pointer. NULL pointers are also not * allowed as they indicate an empty slot but we have to allow them * here as they can be updated later. */ BUG_ON(assoc_array_ptr_is_meta(object)); edit = kzalloc_obj(struct assoc_array_edit); if (!edit) return ERR_PTR(-ENOMEM); edit->array = array; edit->ops = ops; edit->leaf = assoc_array_leaf_to_ptr(object); edit->adjust_count_by = 1; switch (assoc_array_walk(array, ops, index_key, &result)) { case assoc_array_walk_tree_empty: /* Allocate a root node if there isn't one yet */ if (!assoc_array_insert_in_empty_tree(edit)) goto enomem; return edit; case assoc_array_walk_found_terminal_node: /* We found a node that doesn't have a node/shortcut pointer in * the slot corresponding to the index key that we have to * follow. 
*/ if (!assoc_array_insert_into_terminal_node(edit, ops, index_key, &result)) goto enomem; return edit; case assoc_array_walk_found_wrong_shortcut: /* We found a shortcut that didn't match our key in a slot we * needed to follow. */ if (!assoc_array_insert_mid_shortcut(edit, ops, &result)) goto enomem; return edit; } enomem: /* Clean up after an out of memory error */ pr_devel("enomem\n"); assoc_array_cancel_edit(edit); return ERR_PTR(-ENOMEM); } /** * assoc_array_insert_set_object - Set the new object pointer in an edit script * @edit: The edit script to modify. * @object: The object pointer to set. * * Change the object to be inserted in an edit script. The object pointed to * by the old object is not freed. This must be done prior to applying the * script. */ void assoc_array_insert_set_object(struct assoc_array_edit *edit, void *object) { BUG_ON(!object); edit->leaf = assoc_array_leaf_to_ptr(object); } struct assoc_array_delete_collapse_context { struct assoc_array_node *node; const void *skip_leaf; int slot; }; /* * Subtree collapse to node iterator. */ static int assoc_array_delete_collapse_iterator(const void *leaf, void *iterator_data) { struct assoc_array_delete_collapse_context *collapse = iterator_data; if (leaf == collapse->skip_leaf) return 0; BUG_ON(collapse->slot >= ASSOC_ARRAY_FAN_OUT); collapse->node->slots[collapse->slot++] = assoc_array_leaf_to_ptr(leaf); return 0; } /** * assoc_array_delete - Script deletion of an object from an associative array * @array: The array to search. * @ops: The operations to use. * @index_key: The key to the object. * * Precalculate and preallocate a script for the deletion of an object from an * associative array. This results in an edit script that can either be * applied or cancelled. * * The function returns a pointer to an edit script if the object was found, * NULL if the object was not found or -ENOMEM. * * The caller should lock against other modifications and must continue to hold * the lock until assoc_array_apply_edit() has been called. * * Accesses to the tree may take place concurrently with this function, * provided they hold the RCU read lock. */ struct assoc_array_edit *assoc_array_delete(struct assoc_array *array, const struct assoc_array_ops *ops, const void *index_key) { struct assoc_array_delete_collapse_context collapse; struct assoc_array_walk_result result; struct assoc_array_node *node, *new_n0; struct assoc_array_edit *edit; struct assoc_array_ptr *ptr; bool has_meta; int slot, i; pr_devel("-->%s()\n", __func__); edit = kzalloc_obj(struct assoc_array_edit); if (!edit) return ERR_PTR(-ENOMEM); edit->array = array; edit->ops = ops; edit->adjust_count_by = -1; switch (assoc_array_walk(array, ops, index_key, &result)) { case assoc_array_walk_found_terminal_node: /* We found a node that should contain the leaf we've been * asked to remove - *if* it's in the tree. */ pr_devel("terminal_node\n"); node = result.terminal_node.node; for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = node->slots[slot]; if (ptr && assoc_array_ptr_is_leaf(ptr) && ops->compare_object(assoc_array_ptr_to_leaf(ptr), index_key)) goto found_leaf; } fallthrough; case assoc_array_walk_tree_empty: case assoc_array_walk_found_wrong_shortcut: default: assoc_array_cancel_edit(edit); pr_devel("not found\n"); return NULL; } found_leaf: BUG_ON(array->nr_leaves_on_tree <= 0); /* In the simplest form of deletion we just clear the slot and release * the leaf after a suitable interval. 
*/ edit->dead_leaf = node->slots[slot]; edit->set[0].ptr = &node->slots[slot]; edit->set[0].to = NULL; edit->adjust_count_on = node; /* If that concludes erasure of the last leaf, then delete the entire * internal array. */ if (array->nr_leaves_on_tree == 1) { edit->set[1].ptr = &array->root; edit->set[1].to = NULL; edit->adjust_count_on = NULL; edit->excised_subtree = array->root; pr_devel("all gone\n"); return edit; } /* However, we'd also like to clear up some metadata blocks if we * possibly can. * * We go for a simple algorithm of: if this node has FAN_OUT or fewer * leaves in it, then attempt to collapse it - and attempt to * recursively collapse up the tree. * * We could also try and collapse in partially filled subtrees to take * up space in this node. */ if (node->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT + 1) { struct assoc_array_node *parent, *grandparent; struct assoc_array_ptr *ptr; /* First of all, we need to know if this node has metadata so * that we don't try collapsing if all the leaves are already * here. */ has_meta = false; for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { ptr = node->slots[i]; if (assoc_array_ptr_is_meta(ptr)) { has_meta = true; break; } } pr_devel("leaves: %ld [m=%d]\n", node->nr_leaves_on_branch - 1, has_meta); /* Look further up the tree to see if we can collapse this node * into a more proximal node too. */ parent = node; collapse_up: pr_devel("collapse subtree: %ld\n", parent->nr_leaves_on_branch); ptr = parent->back_pointer; if (!ptr) goto do_collapse; if (assoc_array_ptr_is_shortcut(ptr)) { struct assoc_array_shortcut *s = assoc_array_ptr_to_shortcut(ptr); ptr = s->back_pointer; if (!ptr) goto do_collapse; } grandparent = assoc_array_ptr_to_node(ptr); if (grandparent->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT + 1) { parent = grandparent; goto collapse_up; } do_collapse: /* There's no point collapsing if the original node has no meta * pointers to discard and if we didn't merge into one of that * node's ancestry. 
*/ if (has_meta || parent != node) { node = parent; /* Create a new node to collapse into */ new_n0 = kzalloc_obj(struct assoc_array_node); if (!new_n0) goto enomem; edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); new_n0->back_pointer = node->back_pointer; new_n0->parent_slot = node->parent_slot; new_n0->nr_leaves_on_branch = node->nr_leaves_on_branch; edit->adjust_count_on = new_n0; collapse.node = new_n0; collapse.skip_leaf = assoc_array_ptr_to_leaf(edit->dead_leaf); collapse.slot = 0; assoc_array_subtree_iterate(assoc_array_node_to_ptr(node), node->back_pointer, assoc_array_delete_collapse_iterator, &collapse); pr_devel("collapsed %d,%lu\n", collapse.slot, new_n0->nr_leaves_on_branch); BUG_ON(collapse.slot != new_n0->nr_leaves_on_branch - 1); if (!node->back_pointer) { edit->set[1].ptr = &array->root; } else if (assoc_array_ptr_is_leaf(node->back_pointer)) { BUG(); } else if (assoc_array_ptr_is_node(node->back_pointer)) { struct assoc_array_node *p = assoc_array_ptr_to_node(node->back_pointer); edit->set[1].ptr = &p->slots[node->parent_slot]; } else if (assoc_array_ptr_is_shortcut(node->back_pointer)) { struct assoc_array_shortcut *s = assoc_array_ptr_to_shortcut(node->back_pointer); edit->set[1].ptr = &s->next_node; } edit->set[1].to = assoc_array_node_to_ptr(new_n0); edit->excised_subtree = assoc_array_node_to_ptr(node); } } return edit; enomem: /* Clean up after an out of memory error */ pr_devel("enomem\n"); assoc_array_cancel_edit(edit); return ERR_PTR(-ENOMEM); } /** * assoc_array_clear - Script deletion of all objects from an associative array * @array: The array to clear. * @ops: The operations to use. * * Precalculate and preallocate a script for the deletion of all the objects * from an associative array. This results in an edit script that can either * be applied or cancelled. * * The function returns a pointer to an edit script if there are objects to be * deleted, NULL if there are no objects in the array or -ENOMEM. * * The caller should lock against other modifications and must continue to hold * the lock until assoc_array_apply_edit() has been called. * * Accesses to the tree may take place concurrently with this function, * provided they hold the RCU read lock. */ struct assoc_array_edit *assoc_array_clear(struct assoc_array *array, const struct assoc_array_ops *ops) { struct assoc_array_edit *edit; pr_devel("-->%s()\n", __func__); if (!array->root) return NULL; edit = kzalloc_obj(struct assoc_array_edit); if (!edit) return ERR_PTR(-ENOMEM); edit->array = array; edit->ops = ops; edit->set[1].ptr = &array->root; edit->set[1].to = NULL; edit->excised_subtree = array->root; edit->ops_for_excised_subtree = ops; pr_devel("all gone\n"); return edit; } /* * Handle the deferred destruction after an applied edit. 
*/ static void assoc_array_rcu_cleanup(struct rcu_head *head) { struct assoc_array_edit *edit = container_of(head, struct assoc_array_edit, rcu); int i; pr_devel("-->%s()\n", __func__); if (edit->dead_leaf) edit->ops->free_object(assoc_array_ptr_to_leaf(edit->dead_leaf)); for (i = 0; i < ARRAY_SIZE(edit->excised_meta); i++) if (edit->excised_meta[i]) kfree(assoc_array_ptr_to_node(edit->excised_meta[i])); if (edit->excised_subtree) { BUG_ON(assoc_array_ptr_is_leaf(edit->excised_subtree)); if (assoc_array_ptr_is_node(edit->excised_subtree)) { struct assoc_array_node *n = assoc_array_ptr_to_node(edit->excised_subtree); n->back_pointer = NULL; } else { struct assoc_array_shortcut *s = assoc_array_ptr_to_shortcut(edit->excised_subtree); s->back_pointer = NULL; } assoc_array_destroy_subtree(edit->excised_subtree, edit->ops_for_excised_subtree); } kfree(edit); } /** * assoc_array_apply_edit - Apply an edit script to an associative array * @edit: The script to apply. * * Apply an edit script to an associative array to effect an insertion, * deletion or clearance. As the edit script includes preallocated memory, * this is guaranteed not to fail. * * The edit script, dead objects and dead metadata will be scheduled for * destruction after an RCU grace period to permit those doing read-only * accesses on the array to continue to do so under the RCU read lock whilst * the edit is taking place. */ void assoc_array_apply_edit(struct assoc_array_edit *edit) { struct assoc_array_shortcut *shortcut; struct assoc_array_node *node; struct assoc_array_ptr *ptr; int i; pr_devel("-->%s()\n", __func__); smp_wmb(); if (edit->leaf_p) *edit->leaf_p = edit->leaf; smp_wmb(); for (i = 0; i < ARRAY_SIZE(edit->set_parent_slot); i++) if (edit->set_parent_slot[i].p) *edit->set_parent_slot[i].p = edit->set_parent_slot[i].to; smp_wmb(); for (i = 0; i < ARRAY_SIZE(edit->set_backpointers); i++) if (edit->set_backpointers[i]) *edit->set_backpointers[i] = edit->set_backpointers_to; smp_wmb(); for (i = 0; i < ARRAY_SIZE(edit->set); i++) if (edit->set[i].ptr) *edit->set[i].ptr = edit->set[i].to; if (edit->array->root == NULL) { edit->array->nr_leaves_on_tree = 0; } else if (edit->adjust_count_on) { node = edit->adjust_count_on; for (;;) { node->nr_leaves_on_branch += edit->adjust_count_by; ptr = node->back_pointer; if (!ptr) break; if (assoc_array_ptr_is_shortcut(ptr)) { shortcut = assoc_array_ptr_to_shortcut(ptr); ptr = shortcut->back_pointer; if (!ptr) break; } BUG_ON(!assoc_array_ptr_is_node(ptr)); node = assoc_array_ptr_to_node(ptr); } edit->array->nr_leaves_on_tree += edit->adjust_count_by; } call_rcu(&edit->rcu, assoc_array_rcu_cleanup); } /** * assoc_array_cancel_edit - Discard an edit script. * @edit: The script to discard. * * Free an edit script and all the preallocated data it holds without making * any changes to the associative array it was intended for. * * NOTE! In the case of an insertion script, this does _not_ release the leaf * that was to be inserted. That is left to the caller. */ void assoc_array_cancel_edit(struct assoc_array_edit *edit) { struct assoc_array_ptr *ptr; int i; pr_devel("-->%s()\n", __func__); /* Clean up after an out of memory error */ for (i = 0; i < ARRAY_SIZE(edit->new_meta); i++) { ptr = edit->new_meta[i]; if (ptr) { if (assoc_array_ptr_is_node(ptr)) kfree(assoc_array_ptr_to_node(ptr)); else kfree(assoc_array_ptr_to_shortcut(ptr)); } } kfree(edit); } /** * assoc_array_gc - Garbage collect an associative array. * @array: The array to clean. * @ops: The operations to use. 
* @iterator: A callback function to pass judgement on each object. * @iterator_data: Private data for the callback function. * * Collect garbage from an associative array and pack down the internal tree to * save memory. * * The iterator function is asked to pass judgement upon each object in the * array. If it returns false, the object is discard and if it returns true, * the object is kept. If it returns true, it must increment the object's * usage count (or whatever it needs to do to retain it) before returning. * * This function returns 0 if successful or -ENOMEM if out of memory. In the * latter case, the array is not changed. * * The caller should lock against other modifications and must continue to hold * the lock until assoc_array_apply_edit() has been called. * * Accesses to the tree may take place concurrently with this function, * provided they hold the RCU read lock. */ int assoc_array_gc(struct assoc_array *array, const struct assoc_array_ops *ops, bool (*iterator)(void *object, void *iterator_data), void *iterator_data) { struct assoc_array_shortcut *shortcut, *new_s; struct assoc_array_node *node, *new_n; struct assoc_array_edit *edit; struct assoc_array_ptr *cursor, *ptr; struct assoc_array_ptr *new_root, *new_parent, **new_ptr_pp; unsigned long nr_leaves_on_tree; bool retained; int keylen, slot, nr_free, next_slot, i; pr_devel("-->%s()\n", __func__); if (!array->root) return 0; edit = kzalloc_obj(struct assoc_array_edit); if (!edit) return -ENOMEM; edit->array = array; edit->ops = ops; edit->ops_for_excised_subtree = ops; edit->set[0].ptr = &array->root; edit->excised_subtree = array->root; new_root = new_parent = NULL; new_ptr_pp = &new_root; cursor = array->root; descend: /* If this point is a shortcut, then we need to duplicate it and * advance the target cursor. */ if (assoc_array_ptr_is_shortcut(cursor)) { shortcut = assoc_array_ptr_to_shortcut(cursor); keylen = round_up(shortcut->skip_to_level, ASSOC_ARRAY_KEY_CHUNK_SIZE); keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; new_s = kmalloc_flex(*new_s, index_key, keylen); if (!new_s) goto enomem; pr_devel("dup shortcut %p -> %p\n", shortcut, new_s); memcpy(new_s, shortcut, struct_size(new_s, index_key, keylen)); new_s->back_pointer = new_parent; new_s->parent_slot = shortcut->parent_slot; *new_ptr_pp = new_parent = assoc_array_shortcut_to_ptr(new_s); new_ptr_pp = &new_s->next_node; cursor = shortcut->next_node; } /* Duplicate the node at this position */ node = assoc_array_ptr_to_node(cursor); new_n = kzalloc_obj(struct assoc_array_node); if (!new_n) goto enomem; pr_devel("dup node %p -> %p\n", node, new_n); new_n->back_pointer = new_parent; new_n->parent_slot = node->parent_slot; *new_ptr_pp = new_parent = assoc_array_node_to_ptr(new_n); new_ptr_pp = NULL; slot = 0; continue_node: /* Filter across any leaves and gc any subtrees */ for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = node->slots[slot]; if (!ptr) continue; if (assoc_array_ptr_is_leaf(ptr)) { if (iterator(assoc_array_ptr_to_leaf(ptr), iterator_data)) /* The iterator will have done any reference * counting on the object for us. */ new_n->slots[slot] = ptr; continue; } new_ptr_pp = &new_n->slots[slot]; cursor = ptr; goto descend; } retry_compress: pr_devel("-- compress node %p --\n", new_n); /* Count up the number of empty slots in this node and work out the * subtree leaf count. 
*/ new_n->nr_leaves_on_branch = 0; nr_free = 0; for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = new_n->slots[slot]; if (!ptr) nr_free++; else if (assoc_array_ptr_is_leaf(ptr)) new_n->nr_leaves_on_branch++; } pr_devel("free=%d, leaves=%lu\n", nr_free, new_n->nr_leaves_on_branch); /* See what we can fold in */ retained = false; next_slot = 0; for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { struct assoc_array_shortcut *s; struct assoc_array_node *child; ptr = new_n->slots[slot]; if (!ptr || assoc_array_ptr_is_leaf(ptr)) continue; s = NULL; if (assoc_array_ptr_is_shortcut(ptr)) { s = assoc_array_ptr_to_shortcut(ptr); ptr = s->next_node; } child = assoc_array_ptr_to_node(ptr); new_n->nr_leaves_on_branch += child->nr_leaves_on_branch; if (child->nr_leaves_on_branch <= nr_free + 1) { /* Fold the child node into this one */ pr_devel("[%d] fold node %lu/%d [nx %d]\n", slot, child->nr_leaves_on_branch, nr_free + 1, next_slot); /* We would already have reaped an intervening shortcut * on the way back up the tree. */ BUG_ON(s); new_n->slots[slot] = NULL; nr_free++; if (slot < next_slot) next_slot = slot; for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { struct assoc_array_ptr *p = child->slots[i]; if (!p) continue; BUG_ON(assoc_array_ptr_is_meta(p)); while (new_n->slots[next_slot]) next_slot++; BUG_ON(next_slot >= ASSOC_ARRAY_FAN_OUT); new_n->slots[next_slot++] = p; nr_free--; } kfree(child); } else { pr_devel("[%d] retain node %lu/%d [nx %d]\n", slot, child->nr_leaves_on_branch, nr_free + 1, next_slot); retained = true; } } if (retained && new_n->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT) { pr_devel("internal nodes remain despite enough space, retrying\n"); goto retry_compress; } pr_devel("after: %lu\n", new_n->nr_leaves_on_branch); nr_leaves_on_tree = new_n->nr_leaves_on_branch; /* Excise this node if it is singly occupied by a shortcut */ if (nr_free == ASSOC_ARRAY_FAN_OUT - 1) { for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) if ((ptr = new_n->slots[slot])) break; if (assoc_array_ptr_is_meta(ptr) && assoc_array_ptr_is_shortcut(ptr)) { pr_devel("excise node %p with 1 shortcut\n", new_n); new_s = assoc_array_ptr_to_shortcut(ptr); new_parent = new_n->back_pointer; slot = new_n->parent_slot; kfree(new_n); if (!new_parent) { new_s->back_pointer = NULL; new_s->parent_slot = 0; new_root = ptr; goto gc_complete; } if (assoc_array_ptr_is_shortcut(new_parent)) { /* We can discard any preceding shortcut also */ struct assoc_array_shortcut *s = assoc_array_ptr_to_shortcut(new_parent); pr_devel("excise preceding shortcut\n"); new_parent = new_s->back_pointer = s->back_pointer; slot = new_s->parent_slot = s->parent_slot; kfree(s); if (!new_parent) { new_s->back_pointer = NULL; new_s->parent_slot = 0; new_root = ptr; goto gc_complete; } } new_s->back_pointer = new_parent; new_s->parent_slot = slot; new_n = assoc_array_ptr_to_node(new_parent); new_n->slots[slot] = ptr; goto ascend_old_tree; } } /* Excise any shortcuts we might encounter that point to nodes that * only contain leaves. 
*/ ptr = new_n->back_pointer; if (!ptr) goto gc_complete; if (assoc_array_ptr_is_shortcut(ptr)) { new_s = assoc_array_ptr_to_shortcut(ptr); new_parent = new_s->back_pointer; slot = new_s->parent_slot; if (new_n->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT) { struct assoc_array_node *n; pr_devel("excise shortcut\n"); new_n->back_pointer = new_parent; new_n->parent_slot = slot; kfree(new_s); if (!new_parent) { new_root = assoc_array_node_to_ptr(new_n); goto gc_complete; } n = assoc_array_ptr_to_node(new_parent); n->slots[slot] = assoc_array_node_to_ptr(new_n); } } else { new_parent = ptr; } new_n = assoc_array_ptr_to_node(new_parent); ascend_old_tree: ptr = node->back_pointer; if (assoc_array_ptr_is_shortcut(ptr)) { shortcut = assoc_array_ptr_to_shortcut(ptr); slot = shortcut->parent_slot; cursor = shortcut->back_pointer; if (!cursor) goto gc_complete; } else { slot = node->parent_slot; cursor = ptr; } BUG_ON(!cursor); node = assoc_array_ptr_to_node(cursor); slot++; goto continue_node; gc_complete: edit->set[0].to = new_root; assoc_array_apply_edit(edit); array->nr_leaves_on_tree = nr_leaves_on_tree; return 0; enomem: pr_devel("enomem\n"); assoc_array_destroy_subtree(new_root, edit->ops); kfree(edit); return -ENOMEM; } |
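/*
 * Editor's illustrative sketch (not part of the original file): how a writer
 * might drive the edit-script API above.  The lock, ops table and key are
 * assumptions for the example; the contract itself (preallocate the edit under
 * the writer lock, then apply it while RCU readers may still be traversing the
 * tree) is taken from the assoc_array_insert()/assoc_array_apply_edit()
 * kerneldoc above.  Guarded by #if 0 so it does not affect the build.
 */
#if 0	/* example only */
static DEFINE_MUTEX(demo_lock);			/* assumed writer-side lock */

static int demo_add(struct assoc_array *array,
		    const struct assoc_array_ops *demo_ops,	/* assumed ops table */
		    const void *index_key, void *object)
{
	struct assoc_array_edit *edit;

	mutex_lock(&demo_lock);
	/* Preallocates everything the change needs; may return ERR_PTR(-ENOMEM). */
	edit = assoc_array_insert(array, demo_ops, index_key, object);
	if (IS_ERR(edit)) {
		mutex_unlock(&demo_lock);
		return PTR_ERR(edit);
	}
	/* Cannot fail; dead metadata is freed after an RCU grace period. */
	assoc_array_apply_edit(edit);
	mutex_unlock(&demo_lock);
	return 0;
}
#endif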
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/parser.h>
#include <linux/errno.h>
#include <linux/stringhash.h>

#include "utf8n.h"

int utf8_validate(const struct unicode_map *um, const struct qstr *str)
{
	if (utf8nlen(um, UTF8_NFDI, str->name, str->len) < 0)
		return -1;
	return 0;
}
EXPORT_SYMBOL(utf8_validate);

int utf8_strncmp(const struct unicode_map *um,
		 const struct qstr *s1, const struct qstr *s2)
{
	struct utf8cursor cur1, cur2;
	int c1, c2;

	if (utf8ncursor(&cur1, um, UTF8_NFDI, s1->name, s1->len) < 0)
		return -EINVAL;

	if (utf8ncursor(&cur2, um, UTF8_NFDI, s2->name, s2->len) < 0)
		return -EINVAL;

	do {
		c1 = utf8byte(&cur1);
		c2 = utf8byte(&cur2);

		if (c1 < 0 || c2 < 0)
			return -EINVAL;
		if (c1 != c2)
			return 1;
	} while (c1);

	return 0;
}
EXPORT_SYMBOL(utf8_strncmp);

int utf8_strncasecmp(const struct unicode_map *um,
		     const struct qstr *s1, const struct qstr *s2)
{
	struct utf8cursor cur1, cur2;
	int c1, c2;

	if (utf8ncursor(&cur1, um, UTF8_NFDICF, s1->name, s1->len) < 0)
		return -EINVAL;

	if (utf8ncursor(&cur2, um, UTF8_NFDICF, s2->name, s2->len) < 0)
		return -EINVAL;

	do {
		c1 = utf8byte(&cur1);
		c2 = utf8byte(&cur2);

		if (c1 < 0 || c2 < 0)
			return -EINVAL;
		if (c1 != c2)
			return 1;
	} while (c1);

	return 0;
}
EXPORT_SYMBOL(utf8_strncasecmp);

/* String cf is expected to be a valid UTF-8 casefolded
 * string.
 */
int utf8_strncasecmp_folded(const struct unicode_map *um,
			    const struct qstr *cf, const struct qstr *s1)
{
	struct utf8cursor cur1;
	int c1, c2;
	int i = 0;

	if (utf8ncursor(&cur1, um, UTF8_NFDICF, s1->name, s1->len) < 0)
		return -EINVAL;

	do {
		c1 = utf8byte(&cur1);
		c2 = cf->name[i++];
		if (c1 < 0)
			return -EINVAL;
		if (c1 != c2)
			return 1;
	} while (c1);

	return 0;
}
EXPORT_SYMBOL(utf8_strncasecmp_folded);

int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
		  unsigned char *dest, size_t dlen)
{
	struct utf8cursor cur;
	size_t nlen = 0;

	if (utf8ncursor(&cur, um, UTF8_NFDICF, str->name, str->len) < 0)
		return -EINVAL;

	for (nlen = 0; nlen < dlen; nlen++) {
		int c = utf8byte(&cur);

		dest[nlen] = c;
		if (!c)
			return nlen;
		if (c == -1)
			break;
	}
	return -EINVAL;
}
EXPORT_SYMBOL(utf8_casefold);

int utf8_casefold_hash(const struct unicode_map *um, const void *salt,
		       struct qstr *str)
{
	struct utf8cursor cur;
	int c;
	unsigned long hash = init_name_hash(salt);

	if (utf8ncursor(&cur, um, UTF8_NFDICF, str->name, str->len) < 0)
		return -EINVAL;

	while ((c = utf8byte(&cur))) {
		if (c < 0)
			return -EINVAL;
		hash = partial_name_hash((unsigned char)c, hash);
	}
	str->hash = end_name_hash(hash);
	return 0;
}
EXPORT_SYMBOL(utf8_casefold_hash);

int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
		   unsigned char *dest, size_t dlen)
{
	struct utf8cursor cur;
	ssize_t nlen = 0;

	if (utf8ncursor(&cur, um, UTF8_NFDI, str->name, str->len) < 0)
		return -EINVAL;

	for (nlen = 0; nlen < dlen; nlen++) {
		int c = utf8byte(&cur);

		dest[nlen] = c;
		if (!c)
			return nlen;
		if (c == -1)
			break;
	}
	return -EINVAL;
}
EXPORT_SYMBOL(utf8_normalize);

static const struct utf8data *find_table_version(const struct utf8data *table,
		size_t nr_entries, unsigned int version)
{
	size_t i = nr_entries - 1;

	while (version < table[i].maxage)
		i--;
	if (version > table[i].maxage)
		return NULL;
	return &table[i];
}

struct unicode_map *utf8_load(unsigned int version)
{
	struct unicode_map *um;

	um = kzalloc_obj(struct unicode_map);
	if (!um)
		return ERR_PTR(-ENOMEM);

	um->version = version;

	um->tables = symbol_request(utf8_data_table);
	if (!um->tables)
		goto out_free_um;

	if (!utf8version_is_supported(um, version))
		goto out_symbol_put;
	um->ntab[UTF8_NFDI] = find_table_version(um->tables->utf8nfdidata,
			um->tables->utf8nfdidata_size, um->version);
	if (!um->ntab[UTF8_NFDI])
		goto out_symbol_put;
	um->ntab[UTF8_NFDICF] = find_table_version(um->tables->utf8nfdicfdata,
			um->tables->utf8nfdicfdata_size, um->version);
	if (!um->ntab[UTF8_NFDICF])
		goto out_symbol_put;
	return um;

out_symbol_put:
	symbol_put(utf8_data_table);
out_free_um:
	kfree(um);
	return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL(utf8_load);

void utf8_unload(struct unicode_map *um)
{
	if (um) {
		symbol_put(utf8_data_table);
		kfree(um);
	}
}
EXPORT_SYMBOL(utf8_unload);

/**
 * utf8_parse_version - Parse a UTF-8 version number from a string
 *
 * @version: input string
 *
 * Returns the parsed version on success, negative code on error
 */
int utf8_parse_version(char *version)
{
	substring_t args[3];
	unsigned int maj, min, rev;
	static const struct match_token token[] = {
		{1, "%d.%d.%d"},
		{0, NULL}
	};

	if (match_token(version, token, args) != 1)
		return -EINVAL;

	if (match_int(&args[0], &maj) || match_int(&args[1], &min) ||
	    match_int(&args[2], &rev))
		return -EINVAL;

	return UNICODE_AGE(maj, min, rev);
}
EXPORT_SYMBOL(utf8_parse_version);
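/*
 * Editor's illustrative sketch (not part of the original file): typical use of
 * the API above by a casefolding filesystem.  The Unicode version and sample
 * names are assumptions; error handling follows the return conventions of
 * utf8_load() and utf8_strncasecmp() as implemented above.  Guarded by #if 0
 * so it does not affect the build.
 */
#if 0	/* example only */
static int demo_names_match(void)
{
	struct unicode_map *um;
	const struct qstr a = QSTR_INIT("Makefile", 8);	/* assumed sample names */
	const struct qstr b = QSTR_INIT("makefile", 8);
	int ret;

	um = utf8_load(UNICODE_AGE(12, 1, 0));		/* assumed table version */
	if (IS_ERR(um))
		return PTR_ERR(um);

	/* 0 = equivalent, 1 = different, -EINVAL = invalid UTF-8 */
	ret = utf8_strncasecmp(um, &a, &b);

	utf8_unload(um);
	return ret;
}
#endif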
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/user.h>
#include <linux/regset.h>
#include <linux/syscalls.h>
#include <linux/nospec.h>

#include <linux/uaccess.h>
#include <asm/desc.h>
#include <asm/ldt.h>
#include <asm/processor.h>
#include <asm/proto.h>
#include <asm/gsseg.h>

#include "tls.h"

/*
 * sys_alloc_thread_area: get a yet unused TLS descriptor index.
 */
static int get_free_idx(void)
{
	struct thread_struct *t = &current->thread;
	int idx;

	for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
		if (desc_empty(&t->tls_array[idx]))
			return idx + GDT_ENTRY_TLS_MIN;
	return -ESRCH;
}

static bool tls_desc_okay(const struct user_desc *info)
{
	/*
	 * For historical reasons (i.e. no one ever documented how any
	 * of the segmentation APIs work), user programs can and do
	 * assume that a struct user_desc that's all zeros except for
	 * entry_number means "no segment at all".  This never actually
	 * worked.  In fact, up to Linux 3.19, a struct user_desc like
	 * this would create a 16-bit read-write segment with base and
	 * limit both equal to zero.
	 *
	 * That was close enough to "no segment at all" until we
	 * hardened this function to disallow 16-bit TLS segments.  Fix
	 * it up by interpreting these zeroed segments the way that they
	 * were almost certainly intended to be interpreted.
	 *
	 * The correct way to ask for "no segment at all" is to specify
	 * a user_desc that satisfies LDT_empty.  To keep everything
	 * working, we accept both.
	 *
	 * Note that there's a similar kludge in modify_ldt -- look at
	 * the distinction between modes 1 and 0x11.
	 */
	if (LDT_empty(info) || LDT_zero(info))
		return true;

	/*
	 * espfix is required for 16-bit data segments, but espfix
	 * only works for LDT segments.
	 */
	if (!info->seg_32bit)
		return false;

	/* Only allow data segments in the TLS array. */
	if (info->contents > 1)
		return false;

	/*
	 * Non-present segments with DPL 3 present an interesting attack
	 * surface.  The kernel should handle such segments correctly,
	 * but TLS is very difficult to protect in a sandbox, so prevent
	 * such segments from being created.
	 *
	 * If userspace needs to remove a TLS entry, it can still delete
	 * it outright.
	 */
	if (info->seg_not_present)
		return false;

	return true;
}

static void set_tls_desc(struct task_struct *p, int idx,
			 const struct user_desc *info, int n)
{
	struct thread_struct *t = &p->thread;
	struct desc_struct *desc = &t->tls_array[idx - GDT_ENTRY_TLS_MIN];
	int cpu;

	/*
	 * We must not get preempted while modifying the TLS.
	 */
	cpu = get_cpu();

	while (n-- > 0) {
		if (LDT_empty(info) || LDT_zero(info))
			memset(desc, 0, sizeof(*desc));
		else
			fill_ldt(desc, info);
		++info;
		++desc;
	}

	if (t == &current->thread)
		load_TLS(t, cpu);

	put_cpu();
}

/*
 * Set a given TLS descriptor:
 */
int do_set_thread_area(struct task_struct *p, int idx,
		       struct user_desc __user *u_info,
		       int can_allocate)
{
	struct user_desc info;
	unsigned short modified_sel;

	if (copy_from_user(&info, u_info, sizeof(info)))
		return -EFAULT;

	if (!tls_desc_okay(&info))
		return -EINVAL;

	if (idx == -1)
		idx = info.entry_number;

	/*
	 * index -1 means the kernel should try to find and
	 * allocate an empty descriptor:
	 */
	if (idx == -1 && can_allocate) {
		idx = get_free_idx();
		if (idx < 0)
			return idx;
		if (put_user(idx, &u_info->entry_number))
			return -EFAULT;
	}

	if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
		return -EINVAL;

	set_tls_desc(p, idx, &info, 1);

	/*
	 * If DS, ES, FS, or GS points to the modified segment, forcibly
	 * refresh it. Only needed on x86_64 because x86_32 reloads them
	 * on return to user mode.
	 */
	modified_sel = (idx << 3) | 3;

	if (p == current) {
		unsigned short sel;

#ifdef CONFIG_X86_64
		savesegment(ds, sel);
		if (sel == modified_sel)
			loadsegment(ds, sel);

		savesegment(es, sel);
		if (sel == modified_sel)
			loadsegment(es, sel);

		savesegment(fs, sel);
		if (sel == modified_sel)
			loadsegment(fs, sel);
#endif

		savesegment(gs, sel);
		if (sel == modified_sel)
			load_gs_index(sel);
	} else {
#ifdef CONFIG_X86_64
		if (p->thread.fsindex == modified_sel)
			p->thread.fsbase = info.base_addr;

		if (p->thread.gsindex == modified_sel)
			p->thread.gsbase = info.base_addr;
#endif
	}

	return 0;
}

SYSCALL_DEFINE1(set_thread_area, struct user_desc __user *, u_info)
{
	return do_set_thread_area(current, -1, u_info, 1);
}

/*
 * Get the current Thread-Local Storage area:
 */
static void fill_user_desc(struct user_desc *info, int idx,
			   const struct desc_struct *desc)
{
	memset(info, 0, sizeof(*info));

	info->entry_number = idx;

	info->base_addr = get_desc_base(desc);
	info->limit = get_desc_limit(desc);
	info->seg_32bit = desc->d;
	info->contents = desc->type >> 2;
	info->read_exec_only = !(desc->type & 2);
	info->limit_in_pages = desc->g;
	info->seg_not_present = !desc->p;
	info->useable = desc->avl;
#ifdef CONFIG_X86_64
	info->lm = desc->l;
#endif
}

int do_get_thread_area(struct task_struct *p, int idx,
		       struct user_desc __user *u_info)
{
	struct user_desc info;
	int index;

	if (idx == -1 && get_user(idx, &u_info->entry_number))
		return -EFAULT;

	if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
		return -EINVAL;

	index = idx - GDT_ENTRY_TLS_MIN;
	index = array_index_nospec(index,
			GDT_ENTRY_TLS_MAX - GDT_ENTRY_TLS_MIN + 1);

	fill_user_desc(&info, idx, &p->thread.tls_array[index]);

	if (copy_to_user(u_info, &info, sizeof(info)))
		return -EFAULT;
	return 0;
}

SYSCALL_DEFINE1(get_thread_area, struct user_desc __user *, u_info)
{
	return do_get_thread_area(current, -1, u_info);
}

int regset_tls_active(struct task_struct *target,
		      const struct user_regset *regset)
{
	struct thread_struct *t = &target->thread;
	int n = GDT_ENTRY_TLS_ENTRIES;

	while (n > 0 && desc_empty(&t->tls_array[n - 1]))
		--n;
	return n;
}

int regset_tls_get(struct task_struct *target,
		   const struct user_regset *regset,
		   struct membuf to)
{
	const struct desc_struct *tls;
	struct user_desc v;
	int pos;

	for (pos = 0, tls = target->thread.tls_array; to.left; pos++, tls++) {
		fill_user_desc(&v, GDT_ENTRY_TLS_MIN + pos, tls);
		membuf_write(&to, &v, sizeof(v));
	}

	return 0;
}

int regset_tls_set(struct task_struct *target, const struct user_regset *regset,
		   unsigned int pos, unsigned int count,
		   const void *kbuf, const void __user *ubuf)
{
	struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES];
	const struct user_desc *info;
	int i;

	if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
	    (pos % sizeof(struct user_desc)) != 0 ||
	    (count % sizeof(struct user_desc)) != 0)
		return -EINVAL;

	if (kbuf)
		info = kbuf;
	else if (__copy_from_user(infobuf, ubuf, count))
		return -EFAULT;
	else
		info = infobuf;

	for (i = 0; i < count / sizeof(struct user_desc); i++)
		if (!tls_desc_okay(info + i))
			return -EINVAL;

	set_tls_desc(target,
		     GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)),
		     info, count / sizeof(struct user_desc));

	return 0;
}
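/*
 * Editor's illustrative sketch (not part of the original file): how a 32-bit
 * user program might call the set_thread_area syscall handled above, letting
 * get_free_idx() pick the slot by passing entry_number == -1.  The function
 * name and field values are assumptions for the example.  Guarded by #if 0 so
 * it does not affect the build; it is userspace code.
 */
#if 0	/* example only (userspace) */
#include <asm/ldt.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

static int demo_install_tls(void *base, unsigned int limit)
{
	struct user_desc desc;

	memset(&desc, 0, sizeof(desc));
	desc.entry_number = -1;		/* ask the kernel to pick a free GDT TLS slot */
	desc.base_addr = (unsigned long)base;
	desc.limit = limit;
	desc.seg_32bit = 1;		/* required by tls_desc_okay() */

	if (syscall(SYS_set_thread_area, &desc) != 0)
		return -1;
	return desc.entry_number;	/* kernel wrote back the chosen index */
}
#endif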
/* SPDX-License-Identifier: GPL-2.0 */

/*
 * This file provides wrappers with sanitizer instrumentation for atomic bit
 * operations.
 *
 * To use this functionality, an arch's bitops.h file needs to define each of
 * the below bit operations with an arch_ prefix (e.g. arch_set_bit(),
 * arch___set_bit(), etc.).
 */
#ifndef _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H
#define _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H

#include <linux/instrumented.h>

/**
 * set_bit - Atomically set a bit in memory
 * @nr: the bit to set
 * @addr: the address to start counting from
 *
 * This is a relaxed atomic operation (no implied memory barriers).
 *
 * Note that @nr may be almost arbitrarily large; this function is not
 * restricted to acting on a single-word quantity.
 */
static __always_inline void set_bit(long nr, volatile unsigned long *addr)
{
	instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long));
	arch_set_bit(nr, addr);
}

/**
 * clear_bit - Clears a bit in memory
 * @nr: Bit to clear
 * @addr: Address to start counting from
 *
 * This is a relaxed atomic operation (no implied memory barriers).
 */
static __always_inline void clear_bit(long nr, volatile unsigned long *addr)
{
	instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long));
	arch_clear_bit(nr, addr);
}

/**
 * change_bit - Toggle a bit in memory
 * @nr: Bit to change
 * @addr: Address to start counting from
 *
 * This is a relaxed atomic operation (no implied memory barriers).
 *
 * Note that @nr may be almost arbitrarily large; this function is not
 * restricted to acting on a single-word quantity.
 */
static __always_inline void change_bit(long nr, volatile unsigned long *addr)
{
	instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long));
	arch_change_bit(nr, addr);
}

/**
 * test_and_set_bit - Set a bit and return its old value
 * @nr: Bit to set
 * @addr: Address to count from
 *
 * This is an atomic fully-ordered operation (implied full memory barrier).
 */
static __always_inline bool test_and_set_bit(long nr, volatile unsigned long *addr)
{
	kcsan_mb();
	instrument_atomic_read_write(addr + BIT_WORD(nr), sizeof(long));
	return arch_test_and_set_bit(nr, addr);
}

/**
 * test_and_clear_bit - Clear a bit and return its old value
 * @nr: Bit to clear
 * @addr: Address to count from
 *
 * This is an atomic fully-ordered operation (implied full memory barrier).
 */
static __always_inline bool test_and_clear_bit(long nr, volatile unsigned long *addr)
{
	kcsan_mb();
	instrument_atomic_read_write(addr + BIT_WORD(nr), sizeof(long));
	return arch_test_and_clear_bit(nr, addr);
}

/**
 * test_and_change_bit - Change a bit and return its old value
 * @nr: Bit to change
 * @addr: Address to count from
 *
 * This is an atomic fully-ordered operation (implied full memory barrier).
 */
static __always_inline bool test_and_change_bit(long nr, volatile unsigned long *addr)
{
	kcsan_mb();
	instrument_atomic_read_write(addr + BIT_WORD(nr), sizeof(long));
	return arch_test_and_change_bit(nr, addr);
}

#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H */
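/*
 * Editor's illustrative sketch (not part of this header): typical use of the
 * instrumented wrappers above on a multi-word bitmap.  The bitmap and helper
 * names are invented for the example.  Guarded by #if 0 so it does not affect
 * the build.
 */
#if 0	/* example only */
#include <linux/bitmap.h>

static DECLARE_BITMAP(demo_ids, 128);	/* 128 bits spread over several words */

static bool demo_claim_id(unsigned int id)
{
	/* Fully ordered read-modify-write; returns true if we won the bit. */
	return !test_and_set_bit(id, demo_ids);
}

static void demo_release_id(unsigned int id)
{
	/* Relaxed atomic clear; add barriers separately if ordering matters. */
	clear_bit(id, demo_ids);
}
#endif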
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_NS_HASH_H__
#define __NET_NS_HASH_H__

#include <net/net_namespace.h>

static inline u32 net_hash_mix(const struct net *net)
{
	return net->hash_mix;
}
#endif
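/*
 * Editor's illustrative sketch (not part of this header): net_hash_mix() is
 * normally folded into a flow or socket hash so that the same addresses in
 * different network namespaces land in different buckets.  The helper name and
 * field choice are assumptions for the example.  Guarded by #if 0 so it does
 * not affect the build.
 */
#if 0	/* example only */
#include <linux/jhash.h>

static inline u32 demo_ns_flow_hash(const struct net *net,
				    __be32 saddr, __be32 daddr, __be16 dport)
{
	return jhash_3words((__force u32)saddr, (__force u32)daddr,
			    (__force u32)dport, net_hash_mix(net));
}
#endif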
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_STRINGHASH_H
#define __LINUX_STRINGHASH_H

#include <linux/compiler.h>	/* For __pure */
#include <linux/types.h>	/* For u32, u64 */
#include <linux/hash.h>

/*
 * Routines for hashing strings of bytes to a 32-bit hash value.
 *
 * These hash functions are NOT GUARANTEED STABLE between kernel
 * versions, architectures, or even repeated boots of the same kernel.
 * (E.g. they may depend on boot-time hardware detection or be
 * deliberately randomized.)
 *
 * They are also not intended to be secure against collisions caused by
 * malicious inputs; much slower hash functions are required for that.
 *
 * They are optimized for pathname components, meaning short strings.
 * Even if a majority of files have longer names, the dynamic profile of
 * pathname components skews short due to short directory names.
 * (E.g. /usr/lib/libsesquipedalianism.so.3.141.)
 */

/*
 * Version 1: one byte at a time.  Example of use:
 *
 * unsigned long hash = init_name_hash;
 * while (*p)
 *	hash = partial_name_hash(tolower(*p++), hash);
 * hash = end_name_hash(hash);
 *
 * Although this is designed for bytes, fs/hfsplus/unicode.c
 * abuses it to hash 16-bit values.
 */

/* Hash courtesy of the R5 hash in reiserfs modulo sign bits */
#define init_name_hash(salt)		(unsigned long)(salt)

/* partial hash update function. Assume roughly 4 bits per character */
static inline unsigned long
partial_name_hash(unsigned long c, unsigned long prevhash)
{
	return (prevhash + (c << 4) + (c >> 4)) * 11;
}

/*
 * Finally: cut down the number of bits to a int value (and try to avoid
 * losing bits).  This also has the property (wanted by the dcache)
 * that the msbits make a good hash table index.
 */
static inline unsigned int end_name_hash(unsigned long hash)
{
	return hash_long(hash, 32);
}

/*
 * Version 2: One word (32 or 64 bits) at a time.
 * If CONFIG_DCACHE_WORD_ACCESS is defined (meaning <asm/word-at-a-time.h>
 * exists, which describes major Linux platforms like x86 and ARM), then
 * this computes a different hash function much faster.
 *
 * If not set, this falls back to a wrapper around the preceding.
 */
extern unsigned int __pure full_name_hash(const void *salt, const char *, unsigned int);

/*
 * A hash_len is a u64 with the hash of a string in the low
 * half and the length in the high half.
 */
#define hashlen_hash(hashlen) ((u32)(hashlen))
#define hashlen_len(hashlen)  ((u32)((hashlen) >> 32))
#define hashlen_create(hash, len) ((u64)(len)<<32 | (u32)(hash))

/* Return the "hash_len" (hash and length) of a null-terminated string */
extern u64 __pure hashlen_string(const void *salt, const char *name);

#endif	/* __LINUX_STRINGHASH_H */
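/*
 * Editor's illustrative sketch (not part of this header): the byte-at-a-time
 * "version 1" pattern described in the comment above, applied to a
 * NUL-terminated pathname component.  The function name is invented; the loop
 * mirrors the usage shown in the header's own comment.  Guarded by #if 0 so it
 * does not affect the build.
 */
#if 0	/* example only */
static unsigned int demo_hash_component(const void *salt, const char *p)
{
	unsigned long hash = init_name_hash(salt);

	while (*p)
		hash = partial_name_hash((unsigned char)*p++, hash);
	return end_name_hash(hash);
}
#endif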
1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 
2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 
3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 
3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 
4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 
5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 
5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 
6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 
7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com * Written by Alex Tomas <alex@clusterfs.com> */ /* * mballoc.c contains the multiblocks allocation routines */ #include "ext4_jbd2.h" #include "mballoc.h" #include <linux/log2.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/nospec.h> #include <linux/backing-dev.h> #include <linux/freezer.h> #include <trace/events/ext4.h> #include <kunit/static_stub.h> /* * MUSTDO: * - test ext4_ext_search_left() and ext4_ext_search_right() * - search for metadata in few groups * * TODO v4: * - normalization should take into account whether file is still open * - discard preallocations if no free space left (policy?) * - don't normalize tails * - quota * - reservation for superuser * * TODO v3: * - bitmap read-ahead (proposed by Oleg Drokin aka green) * - track min/max extents in each group for better group selection * - mb_mark_used() may allocate chunk right after splitting buddy * - tree of groups sorted by number of free blocks * - error handling */ /* * The allocation request involve request for multiple number of blocks * near to the goal(block) value specified. * * During initialization phase of the allocator we decide to use the * group preallocation or inode preallocation depending on the size of * the file. The size of the file could be the resulting file size we * would have after allocation, or the current file size, which ever * is larger. If the size is less than sbi->s_mb_stream_request we * select to use the group preallocation. The default value of * s_mb_stream_request is 16 blocks. This can also be tuned via * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in * terms of number of blocks. * * The main motivation for having small file use group preallocation is to * ensure that we have small files closer together on the disk. * * First stage the allocator looks at the inode prealloc list, * ext4_inode_info->i_prealloc_list, which contains list of prealloc * spaces for this particular inode. The inode prealloc space is * represented as: * * pa_lstart -> the logical start block for this prealloc space * pa_pstart -> the physical start block for this prealloc space * pa_len -> length for this prealloc space (in clusters) * pa_free -> free space available in this prealloc space (in clusters) * * The inode preallocation space is used looking at the _logical_ start * block. If only the logical file block falls within the range of prealloc * space we will consume the particular prealloc space. This makes sure that * we have contiguous physical blocks representing the file blocks * * The important thing to be noted in case of inode prealloc space is that * we don't modify the values associated to inode prealloc space except * pa_free. * * If we are not able to find blocks in the inode prealloc space and if we * have the group allocation flag set then we look at the locality group * prealloc space. These are per CPU prealloc list represented as * * ext4_sb_info.s_locality_groups[smp_processor_id()] * * The reason for having a per cpu locality group is to reduce the contention * between CPUs. It is possible to get scheduled at this point. * * The locality group prealloc space is used looking at whether we have * enough free space (pa_free) within the prealloc space. 
* * If we can't allocate blocks via inode prealloc or/and locality group * prealloc then we look at the buddy cache. The buddy cache is represented * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets * mapped to the buddy and bitmap information regarding different * groups. The buddy information is attached to buddy cache inode so that * we can access them through the page cache. The information regarding * each group is loaded via ext4_mb_load_buddy. The information involve * block bitmap and buddy information. The information are stored in the * inode as: * * { folio } * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. So for each group we * take up 2 blocks. A folio can contain blocks_per_folio (folio_size / * blocksize) blocks. So it can have information regarding groups_per_folio * which is blocks_per_folio/2 * * The buddy cache inode is not stored on disk. The inode is thrown * away when the filesystem is unmounted. * * We look for count number of blocks in the buddy cache. If we were able * to locate that many free blocks we return with additional information * regarding rest of the contiguous physical block available * * Before allocating blocks via buddy cache we normalize the request * blocks. This ensure we ask for more blocks that we needed. The extra * blocks that we get after allocation is added to the respective prealloc * list. In case of inode preallocation we follow a list of heuristics * based on file size. This can be found in ext4_mb_normalize_request. If * we are doing a group prealloc we try to normalize the request to * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is * dependent on the cluster size; for non-bigalloc file systems, it is * 512 blocks. This can be tuned via * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in * terms of number of blocks. If we have mounted the file system with -O * stripe=<value> option the group prealloc request is normalized to the * smallest multiple of the stripe value (sbi->s_stripe) which is * greater than the default mb_group_prealloc. * * If "mb_optimize_scan" mount option is set, we maintain in memory group info * structures in two data structures: * * 1) Array of largest free order xarrays (sbi->s_mb_largest_free_orders) * * Locking: Writers use xa_lock, readers use rcu_read_lock. * * This is an array of xarrays where the index in the array represents the * largest free order in the buddy bitmap of the participating group infos of * that xarray. So, there are exactly MB_NUM_ORDERS(sb) (which means total * number of buddy bitmap orders possible) number of xarrays. Group-infos are * placed in appropriate xarrays. * * 2) Average fragment size xarrays (sbi->s_mb_avg_fragment_size) * * Locking: Writers use xa_lock, readers use rcu_read_lock. * * This is an array of xarrays where in the i-th xarray there are groups with * average fragment size >= 2^i and < 2^(i+1). The average fragment size * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments. * Note that we don't bother with a special xarray for completely empty * groups so we only have MB_NUM_ORDERS(sb) xarrays. Group-infos are placed * in appropriate xarrays. * * In xarray, the index is the block group number, the value is the block group * information, and a non-empty value indicates the block group is present in * the current xarray. 
* * When "mb_optimize_scan" mount option is set, mballoc consults the above data * structures to decide the order in which groups are to be traversed for * fulfilling an allocation request. * * At CR_POWER2_ALIGNED , we look for groups which have the largest_free_order * >= the order of the request. We directly look at the largest free order list * in the data structure (1) above where largest_free_order = order of the * request. If that list is empty, we look at remaining list in the increasing * order of largest_free_order. This allows us to perform CR_POWER2_ALIGNED * lookup in O(1) time. * * At CR_GOAL_LEN_FAST, we only consider groups where * average fragment size > request size. So, we lookup a group which has average * fragment size just above or equal to request size using our average fragment * size group lists (data structure 2) in O(1) time. * * At CR_BEST_AVAIL_LEN, we aim to optimize allocations which can't be satisfied * in CR_GOAL_LEN_FAST. The fact that we couldn't find a group in * CR_GOAL_LEN_FAST suggests that there is no BG that has avg * fragment size > goal length. So before falling to the slower * CR_GOAL_LEN_SLOW, in CR_BEST_AVAIL_LEN we proactively trim goal length and * then use the same fragment lists as CR_GOAL_LEN_FAST to find a BG with a big * enough average fragment size. This increases the chances of finding a * suitable block group in O(1) time and results in faster allocation at the * cost of reduced size of allocation. * * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in * linear order which requires O(N) search time for each CR_POWER2_ALIGNED and * CR_GOAL_LEN_FAST phase. * * The regular allocator (using the buddy cache) supports a few tunables. * * /sys/fs/ext4/<partition>/mb_min_to_scan * /sys/fs/ext4/<partition>/mb_max_to_scan * /sys/fs/ext4/<partition>/mb_order2_req * /sys/fs/ext4/<partition>/mb_max_linear_groups * * The regular allocator uses buddy scan only if the request len is power of * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The * value of s_mb_order2_reqs can be tuned via * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to * stripe size (sbi->s_stripe), we try to search for contiguous block in * stripe size. This should result in better allocation on RAID setups. If * not, we search in the specific group using bitmap for best extents. The * tunable min_to_scan and max_to_scan control the behaviour here. * min_to_scan indicate how long the mballoc __must__ look for a best * extent and max_to_scan indicates how long the mballoc __can__ look for a * best extent in the found extents. Searching for the blocks starts with * the group specified as the goal value in allocation context via * ac_g_ex. Each group is first checked based on the criteria whether it * can be used for allocation. ext4_mb_good_group explains how the groups are * checked. * * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not * get traversed linearly. That may result in subsequent allocations being not * close to each other. And so, the underlying device may get filled up in a * non-linear fashion. While that may not matter on non-rotational devices, for * rotational devices that may result in higher seek times. "mb_max_linear_groups" * tells mballoc how many groups mballoc should search linearly before * performing consulting above data structures for more efficient lookups. 
For * non rotational devices, this value defaults to 0 and for rotational devices * this is set to MB_DEFAULT_LINEAR_LIMIT. * * Both the prealloc space are getting populated as above. So for the first * request we will hit the buddy cache which will result in this prealloc * space getting filled. The prealloc space is then later used for the * subsequent request. */ /* * mballoc operates on the following data: * - on-disk bitmap * - in-core buddy (actually includes buddy and bitmap) * - preallocation descriptors (PAs) * * there are two types of preallocations: * - inode * assiged to specific inode and can be used for this inode only. * it describes part of inode's space preallocated to specific * physical blocks. any block from that preallocated can be used * independent. the descriptor just tracks number of blocks left * unused. so, before taking some block from descriptor, one must * make sure corresponded logical block isn't allocated yet. this * also means that freeing any block within descriptor's range * must discard all preallocated blocks. * - locality group * assigned to specific locality group which does not translate to * permanent set of inodes: inode can join and leave group. space * from this type of preallocation can be used for any inode. thus * it's consumed from the beginning to the end. * * relation between them can be expressed as: * in-core buddy = on-disk bitmap + preallocation descriptors * * this mean blocks mballoc considers used are: * - allocated blocks (persistent) * - preallocated blocks (non-persistent) * * consistency in mballoc world means that at any time a block is either * free or used in ALL structures. notice: "any time" should not be read * literally -- time is discrete and delimited by locks. * * to keep it simple, we don't use block numbers, instead we count number of * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA. * * all operations can be expressed as: * - init buddy: buddy = on-disk + PAs * - new PA: buddy += N; PA = N * - use inode PA: on-disk += N; PA -= N * - discard inode PA buddy -= on-disk - PA; PA = 0 * - use locality group PA on-disk += N; PA -= N * - discard locality group PA buddy -= PA; PA = 0 * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap * is used in real operation because we can't know actual used * bits from PA, only from on-disk bitmap * * if we follow this strict logic, then all operations above should be atomic. * given some of them can block, we'd have to use something like semaphores * killing performance on high-end SMP hardware. let's try to relax it using * the following knowledge: * 1) if buddy is referenced, it's already initialized * 2) while block is used in buddy and the buddy is referenced, * nobody can re-allocate that block * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has * bit set and PA claims same block, it's OK. IOW, one can set bit in * on-disk bitmap if buddy has same bit set or/and PA covers corresponded * block * * so, now we're building a concurrency table: * - init buddy vs. 
* - new PA * blocks for PA are allocated in the buddy, buddy must be referenced * until PA is linked to allocation group to avoid concurrent buddy init * - use inode PA * we need to make sure that either on-disk bitmap or PA has uptodate data * given (3) we care that PA-=N operation doesn't interfere with init * - discard inode PA * the simplest way would be to have buddy initialized by the discard * - use locality group PA * again PA-=N must be serialized with init * - discard locality group PA * the simplest way would be to have buddy initialized by the discard * - new PA vs. * - use inode PA * i_data_sem serializes them * - discard inode PA * discard process must wait until PA isn't used by another process * - use locality group PA * some mutex should serialize them * - discard locality group PA * discard process must wait until PA isn't used by another process * - use inode PA * - use inode PA * i_data_sem or another mutex should serializes them * - discard inode PA * discard process must wait until PA isn't used by another process * - use locality group PA * nothing wrong here -- they're different PAs covering different blocks * - discard locality group PA * discard process must wait until PA isn't used by another process * * now we're ready to make few consequences: * - PA is referenced and while it is no discard is possible * - PA is referenced until block isn't marked in on-disk bitmap * - PA changes only after on-disk bitmap * - discard must not compete with init. either init is done before * any discard or they're serialized somehow * - buddy init as sum of on-disk bitmap and PAs is done atomically * * a special case when we've used PA to emptiness. no need to modify buddy * in this case, but we should care about concurrent init * */ /* * Logic in few words: * * - allocation: * load group * find blocks * mark bits in on-disk bitmap * release group * * - use preallocation: * find proper PA (per-inode or group) * load group * mark bits in on-disk bitmap * release group * release PA * * - free: * load group * mark bits in on-disk bitmap * release group * * - discard preallocations in group: * mark PAs deleted * move them onto local list * load on-disk bitmap * load group * remove PA from object (inode or locality group) * mark free blocks in-core * * - discard inode's preallocations: */ /* * Locking rules * * Locks: * - bitlock on a group (group) * - object (inode/locality) (object) * - per-pa lock (pa) * - cr_power2_aligned lists lock (cr_power2_aligned) * - cr_goal_len_fast lists lock (cr_goal_len_fast) * * Paths: * - new pa * object * group * * - find and use pa: * pa * * - release consumed pa: * pa * group * object * * - generate in-core bitmap: * group * pa * * - discard all for given object (inode, locality group): * object * pa * group * * - discard all for given group: * group * pa * group * object * * - allocation path (ext4_mb_regular_allocator) * group * cr_power2_aligned/cr_goal_len_fast */ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; static struct kmem_cache *ext4_free_data_cachep; /* We create slab caches for groupinfo data structures based on the * superblock block size. 
There will be one per mounted filesystem for * each unique s_blocksize_bits */ #define NR_GRPINFO_CACHES 8 static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k", "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k", "ext4_groupinfo_64k", "ext4_groupinfo_128k" }; static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); static int ext4_mb_scan_group(struct ext4_allocation_context *ac, ext4_group_t group); static int ext4_try_to_trim_range(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks); /* * The algorithm using this percpu seq counter goes below: * 1. We sample the percpu discard_pa_seq counter before trying for block * allocation in ext4_mb_new_blocks(). * 2. We increment this percpu discard_pa_seq counter when we either allocate * or free these blocks i.e. while marking those blocks as used/free in * mb_mark_used()/mb_free_blocks(). * 3. We also increment this percpu seq counter when we successfully identify * that the bb_prealloc_list is not empty and hence proceed for discarding * of those PAs inside ext4_mb_discard_group_preallocations(). * * Now to make sure that the regular fast path of block allocation is not * affected, as a small optimization we only sample the percpu seq counter * on that cpu. Only when the block allocation fails and when freed blocks * found were 0, that is when we sample percpu seq counter for all cpus using * below function ext4_get_discard_pa_seq_sum(). This happens after making * sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty. */ static DEFINE_PER_CPU(u64, discard_pa_seq); static inline u64 ext4_get_discard_pa_seq_sum(void) { int __cpu; u64 __seq = 0; for_each_possible_cpu(__cpu) __seq += per_cpu(discard_pa_seq, __cpu); return __seq; } static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { #if BITS_PER_LONG == 64 *bit += ((unsigned long) addr & 7UL) << 3; addr = (void *) ((unsigned long) addr & ~7UL); #elif BITS_PER_LONG == 32 *bit += ((unsigned long) addr & 3UL) << 3; addr = (void *) ((unsigned long) addr & ~3UL); #else #error "how many bits you are?!" 
#endif return addr; } static inline int mb_test_bit(int bit, void *addr) { /* * ext4_test_bit on architecture like powerpc * needs unsigned long aligned address */ addr = mb_correct_addr_and_bit(&bit, addr); return ext4_test_bit(bit, addr); } static inline void mb_set_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); ext4_set_bit(bit, addr); } static inline void mb_clear_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); ext4_clear_bit(bit, addr); } static inline int mb_test_and_clear_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); return ext4_test_and_clear_bit(bit, addr); } static inline int mb_find_next_zero_bit(void *addr, int max, int start) { int fix = 0, ret, tmpmax; addr = mb_correct_addr_and_bit(&fix, addr); tmpmax = max + fix; start += fix; ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; if (ret > max) return max; return ret; } static inline int mb_find_next_bit(void *addr, int max, int start) { int fix = 0, ret, tmpmax; addr = mb_correct_addr_and_bit(&fix, addr); tmpmax = max + fix; start += fix; ret = ext4_find_next_bit(addr, tmpmax, start) - fix; if (ret > max) return max; return ret; } static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) { char *bb; BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(max == NULL); if (order > e4b->bd_blkbits + 1) { *max = 0; return NULL; } /* at order 0 we see each particular block */ if (order == 0) { *max = 1 << (e4b->bd_blkbits + 3); return e4b->bd_bitmap; } bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; return bb; } #ifdef DOUBLE_CHECK static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { int i; struct super_block *sb = e4b->bd_sb; if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); for (i = 0; i < count; i++) { if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { ext4_fsblk_t blocknr; blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(EXT4_SB(sb), first + i); ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, inode ? 
inode->i_ino : 0, blocknr, "freeing block already freed " "(bit %u)", first + i); } mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); } } static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) { int i; if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); for (i = 0; i < count; i++) { BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); mb_set_bit(first + i, e4b->bd_info->bb_bitmap); } } static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) { if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { unsigned char *b1, *b2; int i; b1 = (unsigned char *) e4b->bd_info->bb_bitmap; b2 = (unsigned char *) bitmap; for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { if (b1[i] != b2[i]) { ext4_msg(e4b->bd_sb, KERN_ERR, "corruption in group %u " "at byte %u(%u): %x in copy != %x " "on disk/prealloc", e4b->bd_group, i, i * 8, b1[i], b2[i]); BUG(); } } } } static void mb_group_bb_bitmap_alloc(struct super_block *sb, struct ext4_group_info *grp, ext4_group_t group) { struct buffer_head *bh; grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS); if (!grp->bb_bitmap) return; bh = ext4_read_block_bitmap(sb, group); if (IS_ERR_OR_NULL(bh)) { kfree(grp->bb_bitmap); grp->bb_bitmap = NULL; return; } memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize); put_bh(bh); } static void mb_group_bb_bitmap_free(struct ext4_group_info *grp) { kfree(grp->bb_bitmap); } #else static inline void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { return; } static inline void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) { return; } static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) { return; } static inline void mb_group_bb_bitmap_alloc(struct super_block *sb, struct ext4_group_info *grp, ext4_group_t group) { return; } static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp) { return; } #endif #ifdef AGGRESSIVE_CHECK #define MB_CHECK_ASSERT(assert) \ do { \ if (!(assert)) { \ printk(KERN_EMERG \ "Assertion failure in %s() at %s:%d: \"%s\"\n", \ function, file, line, # assert); \ BUG(); \ } \ } while (0) /* * Perform buddy integrity check with the following steps: * * 1. Top-down validation (from highest order down to order 1, excluding order-0 bitmap): * For each pair of adjacent orders, if a higher-order bit is set (indicating a free block), * at most one of the two corresponding lower-order bits may be clear (free). * * 2. Order-0 (bitmap) validation, performed on bit pairs: * - If either bit in a pair is set (1, allocated), then all corresponding higher-order bits * must not be free (0). * - If both bits in a pair are clear (0, free), then exactly one of the corresponding * higher-order bits must be free (0). * * 3. Preallocation (pa) list validation: * For each preallocated block (pa) in the group: * - Verify that pa_pstart falls within the bounds of this block group. * - Ensure the corresponding bit(s) in the order-0 bitmap are marked as allocated (1). 
*/ static void __mb_check_buddy(struct ext4_buddy *e4b, char *file, const char *function, int line) { struct super_block *sb = e4b->bd_sb; int order = e4b->bd_blkbits + 1; int max; int max2; int i; int j; int k; int count; struct ext4_group_info *grp; int fragments = 0; int fstart; struct list_head *cur; void *buddy; void *buddy2; if (e4b->bd_info->bb_check_counter++ % 10) return; while (order > 1) { buddy = mb_find_buddy(e4b, order, &max); MB_CHECK_ASSERT(buddy); buddy2 = mb_find_buddy(e4b, order - 1, &max2); MB_CHECK_ASSERT(buddy2); MB_CHECK_ASSERT(buddy != buddy2); MB_CHECK_ASSERT(max * 2 == max2); count = 0; for (i = 0; i < max; i++) { if (mb_test_bit(i, buddy)) { /* only single bit in buddy2 may be 0 */ if (!mb_test_bit(i << 1, buddy2)) { MB_CHECK_ASSERT( mb_test_bit((i<<1)+1, buddy2)); } continue; } count++; } MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); order--; } fstart = -1; buddy = mb_find_buddy(e4b, 0, &max); for (i = 0; i < max; i++) { if (!mb_test_bit(i, buddy)) { MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free); if (fstart == -1) { fragments++; fstart = i; } } else { fstart = -1; } if (!(i & 1)) { int in_use, zero_bit_count = 0; in_use = mb_test_bit(i, buddy) || mb_test_bit(i + 1, buddy); for (j = 1; j < e4b->bd_blkbits + 2; j++) { buddy2 = mb_find_buddy(e4b, j, &max2); k = i >> j; MB_CHECK_ASSERT(k < max2); if (!mb_test_bit(k, buddy2)) zero_bit_count++; } MB_CHECK_ASSERT(zero_bit_count == !in_use); } } MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); grp = ext4_get_group_info(sb, e4b->bd_group); if (!grp) return; list_for_each(cur, &grp->bb_prealloc_list) { ext4_group_t groupnr; struct ext4_prealloc_space *pa; pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); if (!pa->pa_len) continue; ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); MB_CHECK_ASSERT(groupnr == e4b->bd_group); for (i = 0; i < pa->pa_len; i++) MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); } } #undef MB_CHECK_ASSERT #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ __FILE__, __func__, __LINE__) #else #define mb_check_buddy(e4b) #endif /* * Divide blocks started from @first with length @len into * smaller chunks with power of 2 blocks. * Clear the bits in bitmap which the blocks of the chunk(s) covered, * then increase bb_counters[] for corresponded chunk size. */ static void ext4_mb_mark_free_simple(struct super_block *sb, void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t min; ext4_grpblk_t max; ext4_grpblk_t chunk; unsigned int border; BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); border = 2 << sb->s_blocksize_bits; while (len > 0) { /* find how many blocks can be covered since this position */ max = ffs(first | border) - 1; /* find how many blocks of power 2 we need to mark */ min = fls(len) - 1; if (max < min) min = max; chunk = 1 << min; /* mark multiblock chunks only */ grp->bb_counters[min]++; if (min > 0) mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]); len -= chunk; first += chunk; } } static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len) { int order; /* * We don't bother with a special lists groups with only 1 block free * extents and for completely empty groups. 
*/ order = fls(len) - 2; if (order < 0) return 0; if (order == MB_NUM_ORDERS(sb)) order--; if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb))) order = MB_NUM_ORDERS(sb) - 1; return order; } /* Move group to appropriate avg_fragment_size list */ static void mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); int new, old; if (!test_opt2(sb, MB_OPTIMIZE_SCAN)) return; old = grp->bb_avg_fragment_size_order; new = grp->bb_fragments == 0 ? -1 : mb_avg_fragment_size_order(sb, grp->bb_free / grp->bb_fragments); if (new == old) return; if (old >= 0) xa_erase(&sbi->s_mb_avg_fragment_size[old], grp->bb_group); grp->bb_avg_fragment_size_order = new; if (new >= 0) { /* * Cannot use __GFP_NOFAIL because we hold the group lock. * Although allocation for insertion may fails, it's not fatal * as we have linear traversal to fall back on. */ int err = xa_insert(&sbi->s_mb_avg_fragment_size[new], grp->bb_group, grp, GFP_ATOMIC); if (err) mb_debug(sb, "insert group: %u to s_mb_avg_fragment_size[%d] failed, err %d", grp->bb_group, new, err); } } static ext4_group_t ext4_get_allocation_groups_count( struct ext4_allocation_context *ac) { ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb); /* non-extent files are limited to low blocks/groups */ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) ngroups = EXT4_SB(ac->ac_sb)->s_blockfile_groups; /* Pairs with smp_wmb() in ext4_update_super() */ smp_rmb(); return ngroups; } static int ext4_mb_scan_groups_xa_range(struct ext4_allocation_context *ac, struct xarray *xa, ext4_group_t start, ext4_group_t end) { struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); enum criteria cr = ac->ac_criteria; ext4_group_t ngroups = ext4_get_allocation_groups_count(ac); unsigned long group = start; struct ext4_group_info *grp; if (WARN_ON_ONCE(end > ngroups || start >= end)) return 0; xa_for_each_range(xa, group, grp, start, end - 1) { int err; if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]); err = ext4_mb_scan_group(ac, grp->bb_group); if (err || ac->ac_status != AC_STATUS_CONTINUE) return err; cond_resched(); } return 0; } /* * Find a suitable group of given order from the largest free orders xarray. */ static inline int ext4_mb_scan_groups_largest_free_order_range(struct ext4_allocation_context *ac, int order, ext4_group_t start, ext4_group_t end) { struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_largest_free_orders[order]; if (xa_empty(xa)) return 0; return ext4_mb_scan_groups_xa_range(ac, xa, start, end); } /* * Choose next group by traversing largest_free_order lists. Updates *new_cr if * cr level needs an update. */ static int ext4_mb_scan_groups_p2_aligned(struct ext4_allocation_context *ac, ext4_group_t group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int i; int ret = 0; ext4_group_t start, end; start = group; end = ext4_get_allocation_groups_count(ac); wrap_around: for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { ret = ext4_mb_scan_groups_largest_free_order_range(ac, i, start, end); if (ret || ac->ac_status != AC_STATUS_CONTINUE) return ret; } if (start) { end = start; start = 0; goto wrap_around; } if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); /* Increment cr and search again if no group is found */ ac->ac_criteria = CR_GOAL_LEN_FAST; return ret; } /* * Find a suitable group of given order from the average fragments xarray. 
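 *
 * Callers typically scan [start, end) first and then wrap around to
 * [0, start); e.g. with 128 groups and a goal group of 100, the xarray
 * is walked for groups 100..127 and then 0..99.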
*/ static int ext4_mb_scan_groups_avg_frag_order_range(struct ext4_allocation_context *ac, int order, ext4_group_t start, ext4_group_t end) { struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_avg_fragment_size[order]; if (xa_empty(xa)) return 0; return ext4_mb_scan_groups_xa_range(ac, xa, start, end); } /* * Choose next group by traversing average fragment size list of suitable * order. Updates *new_cr if cr level needs an update. */ static int ext4_mb_scan_groups_goal_fast(struct ext4_allocation_context *ac, ext4_group_t group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int i, ret = 0; ext4_group_t start, end; start = group; end = ext4_get_allocation_groups_count(ac); wrap_around: i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); for (; i < MB_NUM_ORDERS(ac->ac_sb); i++) { ret = ext4_mb_scan_groups_avg_frag_order_range(ac, i, start, end); if (ret || ac->ac_status != AC_STATUS_CONTINUE) return ret; } if (start) { end = start; start = 0; goto wrap_around; } if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); /* * CR_BEST_AVAIL_LEN works based on the concept that we have * a larger normalized goal len request which can be trimmed to * a smaller goal len such that it can still satisfy original * request len. However, allocation request for non-regular * files never gets normalized. * See function ext4_mb_normalize_request() (EXT4_MB_HINT_DATA). */ if (ac->ac_flags & EXT4_MB_HINT_DATA) ac->ac_criteria = CR_BEST_AVAIL_LEN; else ac->ac_criteria = CR_GOAL_LEN_SLOW; return ret; } /* * We couldn't find a group in CR_GOAL_LEN_FAST so try to find the highest free fragment * order we have and proactively trim the goal request length to that order to * find a suitable group faster. * * This optimizes allocation speed at the cost of slightly reduced * preallocations. However, we make sure that we don't trim the request too * much and fall to CR_GOAL_LEN_SLOW in that case. */ static int ext4_mb_scan_groups_best_avail(struct ext4_allocation_context *ac, ext4_group_t group) { int ret = 0; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int i, order, min_order; unsigned long num_stripe_clusters = 0; ext4_group_t start, end; /* * mb_avg_fragment_size_order() returns order in a way that makes * retrieving back the length using (1 << order) inaccurate. Hence, use * fls() instead since we need to know the actual length while modifying * goal length. */ order = fls(ac->ac_g_ex.fe_len) - 1; if (WARN_ON_ONCE(order - 1 > MB_NUM_ORDERS(ac->ac_sb))) order = MB_NUM_ORDERS(ac->ac_sb); min_order = order - sbi->s_mb_best_avail_max_trim_order; if (min_order < 0) min_order = 0; if (sbi->s_stripe > 0) { /* * We are assuming that stripe size is always a multiple of * cluster ratio otherwise __ext4_fill_super exists early. */ num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe); if (1 << min_order < num_stripe_clusters) /* * We consider 1 order less because later we round * up the goal len to num_stripe_clusters */ min_order = fls(num_stripe_clusters) - 1; } if (1 << min_order < ac->ac_o_ex.fe_len) min_order = fls(ac->ac_o_ex.fe_len); start = group; end = ext4_get_allocation_groups_count(ac); wrap_around: for (i = order; i >= min_order; i--) { int frag_order; /* * Scale down goal len to make sure we find something * in the free fragments list. Basically, reduce * preallocations. */ ac->ac_g_ex.fe_len = 1 << i; if (num_stripe_clusters > 0) { /* * Try to round up the adjusted goal length to * stripe size (in cluster units) multiple for * efficiency. 
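 *
 * For example, a trimmed goal of 1 << i = 32 clusters with
 * num_stripe_clusters = 12 becomes roundup(32, 12) = 36 clusters.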
*/ ac->ac_g_ex.fe_len = roundup(ac->ac_g_ex.fe_len, num_stripe_clusters); } frag_order = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); ret = ext4_mb_scan_groups_avg_frag_order_range(ac, frag_order, start, end); if (ret || ac->ac_status != AC_STATUS_CONTINUE) return ret; } if (start) { end = start; start = 0; goto wrap_around; } /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */ ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); ac->ac_criteria = CR_GOAL_LEN_SLOW; return ret; } static inline int should_optimize_scan(struct ext4_allocation_context *ac) { if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN))) return 0; if (ac->ac_criteria >= CR_GOAL_LEN_SLOW) return 0; return 1; } /* * next linear group for allocation. */ static void next_linear_group(ext4_group_t *group, ext4_group_t ngroups) { /* * Artificially restricted ngroups for non-extent * files makes group > ngroups possible on first loop. */ *group = *group + 1 >= ngroups ? 0 : *group + 1; } static int ext4_mb_scan_groups_linear(struct ext4_allocation_context *ac, ext4_group_t ngroups, ext4_group_t *start, ext4_group_t count) { int ret, i; enum criteria cr = ac->ac_criteria; struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group = *start; for (i = 0; i < count; i++, next_linear_group(&group, ngroups)) { ret = ext4_mb_scan_group(ac, group); if (ret || ac->ac_status != AC_STATUS_CONTINUE) return ret; cond_resched(); } *start = group; if (count == ngroups) ac->ac_criteria++; /* Processed all groups and haven't found blocks */ if (sbi->s_mb_stats && i == ngroups) atomic64_inc(&sbi->s_bal_cX_failed[cr]); return 0; } static int ext4_mb_scan_groups(struct ext4_allocation_context *ac) { int ret = 0; ext4_group_t start; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ext4_group_t ngroups = ext4_get_allocation_groups_count(ac); /* searching for the right group start from the goal value specified */ start = ac->ac_g_ex.fe_group; if (start >= ngroups) start = 0; ac->ac_prefetch_grp = start; ac->ac_prefetch_nr = 0; if (!should_optimize_scan(ac)) return ext4_mb_scan_groups_linear(ac, ngroups, &start, ngroups); /* * Optimized scanning can return non adjacent groups which can cause * seek overhead for rotational disks. So try few linear groups before * trying optimized scan. */ if (sbi->s_mb_max_linear_groups) ret = ext4_mb_scan_groups_linear(ac, ngroups, &start, sbi->s_mb_max_linear_groups); if (ret || ac->ac_status != AC_STATUS_CONTINUE) return ret; switch (ac->ac_criteria) { case CR_POWER2_ALIGNED: return ext4_mb_scan_groups_p2_aligned(ac, start); case CR_GOAL_LEN_FAST: return ext4_mb_scan_groups_goal_fast(ac, start); case CR_BEST_AVAIL_LEN: return ext4_mb_scan_groups_best_avail(ac, start); default: /* * TODO: For CR_GOAL_LEN_SLOW, we can arrange groups in an * rb tree sorted by bb_free. But until that happens, we should * never come here. */ WARN_ON(1); } return 0; } /* * Cache the order of the largest free extent we have available in this block * group. */ static void mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); int new, old = grp->bb_largest_free_order; for (new = MB_NUM_ORDERS(sb) - 1; new >= 0; new--) if (grp->bb_counters[new] > 0) break; /* No need to move between order lists? 
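 * (i.e. the recomputed largest free order equals the cached
 * bb_largest_free_order, so the group can stay on its current list)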
*/ if (new == old) return; if (old >= 0) { struct xarray *xa = &sbi->s_mb_largest_free_orders[old]; if (!xa_empty(xa) && xa_load(xa, grp->bb_group)) xa_erase(xa, grp->bb_group); } grp->bb_largest_free_order = new; if (test_opt2(sb, MB_OPTIMIZE_SCAN) && new >= 0 && grp->bb_free) { /* * Cannot use __GFP_NOFAIL because we hold the group lock. * Although allocation for insertion may fails, it's not fatal * as we have linear traversal to fall back on. */ int err = xa_insert(&sbi->s_mb_largest_free_orders[new], grp->bb_group, grp, GFP_ATOMIC); if (err) mb_debug(sb, "insert group: %u to s_mb_largest_free_orders[%d] failed, err %d", grp->bb_group, new, err); } } static noinline_for_stack void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_grpblk_t first; ext4_grpblk_t len; unsigned free = 0; unsigned fragments = 0; unsigned long long period = get_cycles(); /* initialize buddy from bitmap which is aggregation * of on-disk bitmap and preallocations */ i = mb_find_next_zero_bit(bitmap, max, 0); grp->bb_first_free = i; while (i < max) { fragments++; first = i; i = mb_find_next_bit(bitmap, max, i); len = i - first; free += len; if (len > 1) ext4_mb_mark_free_simple(sb, buddy, first, len, grp); else grp->bb_counters[0]++; if (i < max) i = mb_find_next_zero_bit(bitmap, max, i); } grp->bb_fragments = fragments; if (free != grp->bb_free) { ext4_grp_locked_error(sb, group, 0, 0, "block bitmap and bg descriptor " "inconsistent: %u vs %u free clusters", free, grp->bb_free); /* * If we intend to continue, we consider group descriptor * corrupt and update bb_free using bitmap value */ grp->bb_free = free; ext4_mark_group_bitmap_corrupted(sb, group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); } mb_set_largest_free_order(sb, grp); mb_update_avg_fragment_size(sb, grp); clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); period = get_cycles() - period; atomic_inc(&sbi->s_mb_buddies_generated); atomic64_add(period, &sbi->s_mb_generation_time); } static void mb_regenerate_buddy(struct ext4_buddy *e4b) { int count; int order = 1; void *buddy; while ((buddy = mb_find_buddy(e4b, order++, &count))) mb_set_bits(buddy, 0, count); e4b->bd_info->bb_fragments = 0; memset(e4b->bd_info->bb_counters, 0, sizeof(*e4b->bd_info->bb_counters) * (e4b->bd_sb->s_blocksize_bits + 2)); ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, e4b->bd_bitmap, e4b->bd_group, e4b->bd_info); } /* The buddy information is attached the buddy cache inode * for convenience. The information regarding each group * is loaded via ext4_mb_load_buddy. The information involve * block bitmap and buddy information. The information are * stored in the inode as * * { folio } * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. * So for each group we take up 2 blocks. A folio can * contain blocks_per_folio (folio_size / blocksize) blocks. * So it can have information regarding groups_per_folio which * is blocks_per_folio/2 * * Locking note: This routine takes the block group lock of all groups * for this folio; do not hold this lock when calling this routine! 
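 *
 * For illustration, with a 4k block size and a 16k folio each folio
 * holds blocks_per_folio = 4 blocks, i.e. groups_per_folio = 2 groups:
 * folio 0 carries [group 0 bitmap][group 0 buddy][group 1 bitmap]
 * [group 1 buddy], folio 1 starts with group 2, and in general group
 * g's bitmap lives at buddy-cache block 2*g and its buddy at block
 * 2*g + 1.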
*/ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) { ext4_group_t ngroups; unsigned int blocksize; int blocks_per_folio; int groups_per_folio; int err = 0; int i; ext4_group_t first_group, group; int first_block; struct super_block *sb; struct buffer_head *bhs; struct buffer_head **bh = NULL; struct inode *inode; char *data; char *bitmap; struct ext4_group_info *grinfo; inode = folio->mapping->host; sb = inode->i_sb; ngroups = ext4_get_groups_count(sb); blocksize = i_blocksize(inode); blocks_per_folio = folio_size(folio) / blocksize; WARN_ON_ONCE(!blocks_per_folio); groups_per_folio = DIV_ROUND_UP(blocks_per_folio, 2); mb_debug(sb, "init folio %lu\n", folio->index); /* allocate buffer_heads to read bitmaps */ if (groups_per_folio > 1) { i = sizeof(struct buffer_head *) * groups_per_folio; bh = kzalloc(i, gfp); if (bh == NULL) return -ENOMEM; } else bh = &bhs; /* read all groups the folio covers into the cache */ first_group = EXT4_PG_TO_LBLK(inode, folio->index) / 2; for (i = 0, group = first_group; i < groups_per_folio; i++, group++) { if (group >= ngroups) break; grinfo = ext4_get_group_info(sb, group); if (!grinfo) continue; /* * If folio is uptodate then we came here after online resize * which added some new uninitialized group info structs, so * we must skip all initialized uptodate buddies on the folio, * which may be currently in use by an allocating task. */ if (folio_test_uptodate(folio) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { bh[i] = NULL; continue; } bh[i] = ext4_read_block_bitmap_nowait(sb, group, false); if (IS_ERR(bh[i])) { err = PTR_ERR(bh[i]); bh[i] = NULL; goto out; } mb_debug(sb, "read bitmap for group %u\n", group); } /* wait for I/O completion */ for (i = 0, group = first_group; i < groups_per_folio; i++, group++) { int err2; if (!bh[i]) continue; err2 = ext4_wait_block_bitmap(sb, group, bh[i]); if (!err) err = err2; } first_block = EXT4_PG_TO_LBLK(inode, folio->index); for (i = 0; i < blocks_per_folio; i++) { group = (first_block + i) >> 1; if (group >= ngroups) break; if (!bh[group - first_group]) /* skip initialized uptodate buddy */ continue; if (!buffer_verified(bh[group - first_group])) /* Skip faulty bitmaps */ continue; err = 0; /* * data carry information regarding this * particular group in the format specified * above * */ data = folio_address(folio) + (i * blocksize); bitmap = bh[group - first_group]->b_data; /* * We place the buddy block and bitmap block * close together */ grinfo = ext4_get_group_info(sb, group); if (!grinfo) { err = -EFSCORRUPTED; goto out; } if ((first_block + i) & 1) { /* this is block of buddy */ BUG_ON(incore == NULL); mb_debug(sb, "put buddy for group %u in folio %lu/%x\n", group, folio->index, i * blocksize); trace_ext4_mb_buddy_bitmap_load(sb, group); grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, sizeof(*grinfo->bb_counters) * (MB_NUM_ORDERS(sb))); /* * incore got set to the group block bitmap below */ ext4_lock_group(sb, group); /* init the buddy */ memset(data, 0xff, blocksize); ext4_mb_generate_buddy(sb, data, incore, group, grinfo); ext4_unlock_group(sb, group); incore = NULL; } else { /* this is block of bitmap */ BUG_ON(incore != NULL); mb_debug(sb, "put bitmap for group %u in folio %lu/%x\n", group, folio->index, i * blocksize); trace_ext4_mb_bitmap_load(sb, group); /* see comments in ext4_mb_put_pa() */ ext4_lock_group(sb, group); memcpy(data, bitmap, blocksize); /* mark all preallocated blks used in in-core bitmap */ ext4_mb_generate_from_pa(sb, data, group); 
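			/*
			 * "data" now holds a copy of the on-disk bitmap with
			 * every preallocated range additionally marked in use,
			 * so the buddy generated from it below never hands out
			 * clusters owned by a preallocation.
			 */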
WARN_ON_ONCE(!RB_EMPTY_ROOT(&grinfo->bb_free_root)); ext4_unlock_group(sb, group); /* set incore so that the buddy information can be * generated using this */ incore = data; } } folio_mark_uptodate(folio); out: if (bh) { for (i = 0; i < groups_per_folio; i++) brelse(bh[i]); if (bh != &bhs) kfree(bh); } return err; } /* * Lock the buddy and bitmap folios. This makes sure other parallel init_group * on the same buddy folio doesn't happen while holding the buddy folio lock. * Return locked buddy and bitmap folios on e4b struct. If buddy and bitmap * are on the same folio e4b->bd_buddy_folio is NULL and return value is 0. */ static int ext4_mb_get_buddy_folio_lock(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) { struct inode *inode = EXT4_SB(sb)->s_buddy_cache; int block, pnum; struct folio *folio; e4b->bd_buddy_folio = NULL; e4b->bd_bitmap_folio = NULL; /* * the buddy cache inode stores the block bitmap * and buddy information in consecutive blocks. * So for each group we need two blocks. */ block = group * 2; pnum = EXT4_LBLK_TO_PG(inode, block); folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (IS_ERR(folio)) return PTR_ERR(folio); BUG_ON(folio->mapping != inode->i_mapping); WARN_ON_ONCE(folio_size(folio) < sb->s_blocksize); e4b->bd_bitmap_folio = folio; e4b->bd_bitmap = folio_address(folio) + offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block)); block++; pnum = EXT4_LBLK_TO_PG(inode, block); if (folio_contains(folio, pnum)) { /* buddy and bitmap are on the same folio */ return 0; } /* we need another folio for the buddy */ folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (IS_ERR(folio)) return PTR_ERR(folio); BUG_ON(folio->mapping != inode->i_mapping); WARN_ON_ONCE(folio_size(folio) < sb->s_blocksize); e4b->bd_buddy_folio = folio; return 0; } static void ext4_mb_put_buddy_folio_lock(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_folio) { folio_unlock(e4b->bd_bitmap_folio); folio_put(e4b->bd_bitmap_folio); } if (e4b->bd_buddy_folio) { folio_unlock(e4b->bd_buddy_folio); folio_put(e4b->bd_buddy_folio); } } /* * Locking note: This routine calls ext4_mb_init_cache(), which takes the * block group lock of all groups for this folio; do not hold the BG lock when * calling this routine! */ static noinline_for_stack int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) { struct ext4_group_info *this_grp; struct ext4_buddy e4b; struct folio *folio; int ret = 0; might_sleep(); mb_debug(sb, "init group %u\n", group); this_grp = ext4_get_group_info(sb, group); if (!this_grp) return -EFSCORRUPTED; /* * This ensures that we don't reinit the buddy cache * folio which map to the group from which we are already * allocating. If we are looking at the buddy cache we would * have taken a reference using ext4_mb_load_buddy and that * would have pinned buddy folio to page cache. * The call to ext4_mb_get_buddy_folio_lock will mark the * folio accessed. 
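 *
 * In short the sequence below is: lock the folio(s), re-check
 * EXT4_MB_GRP_NEED_INIT under that lock, initialize the bitmap folio,
 * then (only if it lives on a separate folio) initialize the buddy
 * folio, and finally unlock and release both.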
*/ ret = ext4_mb_get_buddy_folio_lock(sb, group, &e4b, gfp); if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { /* * somebody initialized the group * return without doing anything */ goto err; } folio = e4b.bd_bitmap_folio; ret = ext4_mb_init_cache(folio, NULL, gfp); if (ret) goto err; if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } if (e4b.bd_buddy_folio == NULL) { /* * If both the bitmap and buddy are in * the same folio we don't need to force * init the buddy */ ret = 0; goto err; } /* init buddy cache */ folio = e4b.bd_buddy_folio; ret = ext4_mb_init_cache(folio, e4b.bd_bitmap, gfp); if (ret) goto err; if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } err: ext4_mb_put_buddy_folio_lock(&e4b); return ret; } /* * Locking note: This routine calls ext4_mb_init_cache(), which takes the * block group lock of all groups for this folio; do not hold the BG lock when * calling this routine! */ static noinline_for_stack int ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) { int block; int pnum; struct folio *folio; int ret; struct ext4_group_info *grp; struct ext4_sb_info *sbi = EXT4_SB(sb); struct inode *inode = sbi->s_buddy_cache; might_sleep(); mb_debug(sb, "load group %u\n", group); grp = ext4_get_group_info(sb, group); if (!grp) return -EFSCORRUPTED; e4b->bd_blkbits = sb->s_blocksize_bits; e4b->bd_info = grp; e4b->bd_sb = sb; e4b->bd_group = group; e4b->bd_buddy_folio = NULL; e4b->bd_bitmap_folio = NULL; if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { /* * we need full data about the group * to make a good selection */ ret = ext4_mb_init_group(sb, group, gfp); if (ret) return ret; } /* * the buddy cache inode stores the block bitmap * and buddy information in consecutive blocks. * So for each group we need two blocks. */ block = group * 2; pnum = EXT4_LBLK_TO_PG(inode, block); /* Avoid locking the folio in the fast path ... */ folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); if (IS_ERR(folio) || !folio_test_uptodate(folio) || folio_test_locked(folio)) { /* * folio_test_locked is employed to detect ongoing folio * migrations, since concurrent migrations can lead to * bitmap inconsistency. And if we are not uptodate that * implies somebody just created the folio but is yet to * initialize it. We can drop the folio reference and * try to get the folio with lock in both cases to avoid * concurrency. */ if (!IS_ERR(folio)) folio_put(folio); folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (!IS_ERR(folio)) { if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, "ext4: bitmap's mapping != inode->i_mapping\n")) { /* should never happen */ folio_unlock(folio); ret = -EINVAL; goto err; } if (!folio_test_uptodate(folio)) { ret = ext4_mb_init_cache(folio, NULL, gfp); if (ret) { folio_unlock(folio); goto err; } mb_cmp_bitmaps(e4b, folio_address(folio) + offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block))); } folio_unlock(folio); } } if (IS_ERR(folio)) { ret = PTR_ERR(folio); goto err; } if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } /* Folios marked accessed already */ e4b->bd_bitmap_folio = folio; e4b->bd_bitmap = folio_address(folio) + offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block)); block++; pnum = EXT4_LBLK_TO_PG(inode, block); /* buddy and bitmap are on the same folio? 
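 * (For instance with 4k blocks and 4k folios they never are, while with
 * a 16k folio buddy-cache blocks 2*g and 2*g + 1 always land in the
 * same folio.)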
*/ if (folio_contains(folio, pnum)) { folio_get(folio); goto update_buddy; } /* we need another folio for the buddy */ folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); if (IS_ERR(folio) || !folio_test_uptodate(folio) || folio_test_locked(folio)) { if (!IS_ERR(folio)) folio_put(folio); folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (!IS_ERR(folio)) { if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, "ext4: buddy bitmap's mapping != inode->i_mapping\n")) { /* should never happen */ folio_unlock(folio); ret = -EINVAL; goto err; } if (!folio_test_uptodate(folio)) { ret = ext4_mb_init_cache(folio, e4b->bd_bitmap, gfp); if (ret) { folio_unlock(folio); goto err; } } folio_unlock(folio); } } if (IS_ERR(folio)) { ret = PTR_ERR(folio); goto err; } if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } update_buddy: /* Folios marked accessed already */ e4b->bd_buddy_folio = folio; e4b->bd_buddy = folio_address(folio) + offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block)); return 0; err: if (!IS_ERR_OR_NULL(folio)) folio_put(folio); if (e4b->bd_bitmap_folio) folio_put(e4b->bd_bitmap_folio); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; return ret; } static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b) { return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS); } static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_folio) folio_put(e4b->bd_bitmap_folio); if (e4b->bd_buddy_folio) folio_put(e4b->bd_buddy_folio); } static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) { int order = 1, max; void *bb; BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); while (order <= e4b->bd_blkbits + 1) { bb = mb_find_buddy(e4b, order, &max); if (!mb_test_bit(block >> order, bb)) { /* this block is part of buddy of order 'order' */ return order; } order++; } return 0; } static void mb_clear_bits(void *bm, int cur, int len) { __u32 *addr; len = cur + len; while (cur < len) { if ((cur & 31) == 0 && (len - cur) >= 32) { /* fast path: clear whole word at once */ addr = bm + (cur >> 3); *addr = 0; cur += 32; continue; } mb_clear_bit(cur, bm); cur++; } } /* clear bits in given range * will return first found zero bit if any, -1 otherwise */ static int mb_test_and_clear_bits(void *bm, int cur, int len) { __u32 *addr; int zero_bit = -1; len = cur + len; while (cur < len) { if ((cur & 31) == 0 && (len - cur) >= 32) { /* fast path: clear whole word at once */ addr = bm + (cur >> 3); if (*addr != (__u32)(-1) && zero_bit == -1) zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0); *addr = 0; cur += 32; continue; } if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1) zero_bit = cur; cur++; } return zero_bit; } void mb_set_bits(void *bm, int cur, int len) { __u32 *addr; len = cur + len; while (cur < len) { if ((cur & 31) == 0 && (len - cur) >= 32) { /* fast path: set whole word at once */ addr = bm + (cur >> 3); *addr = 0xffffffff; cur += 32; continue; } mb_set_bit(cur, bm); cur++; } } static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side) { if (mb_test_bit(*bit + side, bitmap)) { mb_clear_bit(*bit, bitmap); (*bit) -= side; return 1; } else { (*bit) += side; mb_set_bit(*bit, bitmap); return -1; } } static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last) { int max; int order = 1; void *buddy = mb_find_buddy(e4b, order, &max); while (buddy) { void *buddy2; /* Bits in range [first; last] are known to 
be set since * corresponding blocks were allocated. Bits in range * (first; last) will stay set because they form buddies on * upper layer. We just deal with borders if they don't * align with upper layer and then go up. * Releasing entire group is all about clearing * single bit of highest order buddy. */ /* Example: * --------------------------------- * | 1 | 1 | 1 | 1 | * --------------------------------- * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | * --------------------------------- * 0 1 2 3 4 5 6 7 * \_____________________/ * * Neither [1] nor [6] is aligned to above layer. * Left neighbour [0] is free, so mark it busy, * decrease bb_counters and extend range to * [0; 6] * Right neighbour [7] is busy. It can't be coaleasced with [6], so * mark [6] free, increase bb_counters and shrink range to * [0; 5]. * Then shift range to [0; 2], go up and do the same. */ if (first & 1) e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1); if (!(last & 1)) e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1); if (first > last) break; order++; buddy2 = mb_find_buddy(e4b, order, &max); if (!buddy2) { mb_clear_bits(buddy, first, last - first + 1); e4b->bd_info->bb_counters[order - 1] += last - first + 1; break; } first >>= 1; last >>= 1; buddy = buddy2; } } static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { int left_is_free = 0; int right_is_free = 0; int block; int last = first + count - 1; struct super_block *sb = e4b->bd_sb; if (WARN_ON(count == 0)) return; BUG_ON(last >= (sb->s_blocksize << 3)); assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); /* Don't bother if the block group is corrupt. */ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) return; mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); /* access memory sequentially: check left neighbour, * clear range and then check right neighbour */ if (first != 0) left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap); block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count); if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0]) right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); if (unlikely(block != -1)) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t blocknr; /* * Fastcommit replay can free already freed blocks which * corrupts allocation info. Regenerate it. */ if (sbi->s_mount_state & EXT4_FC_REPLAY) { mb_regenerate_buddy(e4b); goto check; } blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(sbi, block); ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, inode ? inode->i_ino : 0, blocknr, "freeing already freed block (bit %u); block bitmap corrupt.", block); return; } this_cpu_inc(discard_pa_seq); e4b->bd_info->bb_free += count; if (first < e4b->bd_info->bb_first_free) e4b->bd_info->bb_first_free = first; /* let's maintain fragments counter */ if (left_is_free && right_is_free) e4b->bd_info->bb_fragments--; else if (!left_is_free && !right_is_free) e4b->bd_info->bb_fragments++; /* buddy[0] == bd_bitmap is a special case, so handle * it right away and let mb_buddy_mark_free stay free of * zero order checks. * Check if neighbours are to be coaleasced, * adjust bitmap bb_counters and borders appropriately. */ if (first & 1) { first += !left_is_free; e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1; } if (!(last & 1)) { last -= !right_is_free; e4b->bd_info->bb_counters[0] += right_is_free ? 
-1 : 1; } if (first <= last) mb_buddy_mark_free(e4b, first >> 1, last >> 1); mb_set_largest_free_order(sb, e4b->bd_info); mb_update_avg_fragment_size(sb, e4b->bd_info); check: mb_check_buddy(e4b); } static int mb_find_extent(struct ext4_buddy *e4b, int block, int needed, struct ext4_free_extent *ex) { int max, order, next; void *buddy; assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); BUG_ON(ex == NULL); buddy = mb_find_buddy(e4b, 0, &max); BUG_ON(buddy == NULL); BUG_ON(block >= max); if (mb_test_bit(block, buddy)) { ex->fe_len = 0; ex->fe_start = 0; ex->fe_group = 0; return 0; } /* find actual order */ order = mb_find_order_for_block(e4b, block); ex->fe_len = (1 << order) - (block & ((1 << order) - 1)); ex->fe_start = block; ex->fe_group = e4b->bd_group; block = block >> order; while (needed > ex->fe_len && mb_find_buddy(e4b, order, &max)) { if (block + 1 >= max) break; next = (block + 1) * (1 << order); if (mb_test_bit(next, e4b->bd_bitmap)) break; order = mb_find_order_for_block(e4b, next); block = next >> order; ex->fe_len += 1 << order; } if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) { /* Should never happen! (but apparently sometimes does?!?) */ WARN_ON(1); ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0, "corruption or bug in mb_find_extent " "block=%d, order=%d needed=%d ex=%u/%d/%d@%u", block, order, needed, ex->fe_group, ex->fe_start, ex->fe_len, ex->fe_logical); ex->fe_len = 0; ex->fe_start = 0; ex->fe_group = 0; } return ex->fe_len; } static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) { int ord; int mlen = 0; int max = 0; int start = ex->fe_start; int len = ex->fe_len; unsigned ret = 0; int len0 = len; void *buddy; int ord_start, ord_end; BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); BUG_ON(e4b->bd_group != ex->fe_group); assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); mb_check_buddy(e4b); mb_mark_used_double(e4b, start, len); this_cpu_inc(discard_pa_seq); e4b->bd_info->bb_free -= len; if (e4b->bd_info->bb_first_free == start) e4b->bd_info->bb_first_free += len; /* let's maintain fragments counter */ if (start != 0) mlen = !mb_test_bit(start - 1, e4b->bd_bitmap); if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) max = !mb_test_bit(start + len, e4b->bd_bitmap); if (mlen && max) e4b->bd_info->bb_fragments++; else if (!mlen && !max) e4b->bd_info->bb_fragments--; /* let's maintain buddy itself */ while (len) { ord = mb_find_order_for_block(e4b, start); if (((start >> ord) << ord) == start && len >= (1 << ord)) { /* the whole chunk may be allocated at once! 
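 * For example, if start = 16 sits in a free order-4 buddy and len = 20,
 * the whole [16, 32) chunk is claimed by setting a single order-4 bit,
 * and the loop continues with start = 32, len = 4.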
*/ mlen = 1 << ord; buddy = mb_find_buddy(e4b, ord, &max); BUG_ON((start >> ord) >= max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; start += mlen; len -= mlen; BUG_ON(len < 0); continue; } /* store for history */ if (ret == 0) ret = len | (ord << 16); BUG_ON(ord <= 0); buddy = mb_find_buddy(e4b, ord, &max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; ord_start = (start >> ord) << ord; ord_end = ord_start + (1 << ord); /* first chunk */ if (start > ord_start) ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, ord_start, start - ord_start, e4b->bd_info); /* last chunk */ if (start + len < ord_end) { ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, start + len, ord_end - (start + len), e4b->bd_info); break; } len = start + len - ord_end; start = ord_end; } mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info); mb_set_bits(e4b->bd_bitmap, ex->fe_start, len0); mb_check_buddy(e4b); return ret; } /* * Must be called under group lock! */ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int ret; BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group); BUG_ON(ac->ac_status == AC_STATUS_FOUND); ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical; ret = mb_mark_used(e4b, &ac->ac_b_ex); /* preallocation can change ac_b_ex, thus we store actually * allocated blocks for history */ ac->ac_f_ex = ac->ac_b_ex; ac->ac_status = AC_STATUS_FOUND; ac->ac_tail = ret & 0xffff; ac->ac_buddy = ret >> 16; /* * take the folio reference. We want the folio to be pinned * so that we don't get a ext4_mb_init_cache_call for this * group until we update the bitmap. That would mean we * double allocate blocks. The reference is dropped * in ext4_mb_release_context */ ac->ac_bitmap_folio = e4b->bd_bitmap_folio; folio_get(ac->ac_bitmap_folio); ac->ac_buddy_folio = e4b->bd_buddy_folio; folio_get(ac->ac_buddy_folio); /* store last allocated for subsequent stream allocation */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { int hash = (unsigned int)ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals; WRITE_ONCE(sbi->s_mb_last_groups[hash], ac->ac_f_ex.fe_group); } /* * As we've just preallocated more space than * user requested originally, we store allocated * space in a special descriptor. */ if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) ext4_mb_new_preallocation(ac); } static void ext4_mb_check_limits(struct ext4_allocation_context *ac, struct ext4_buddy *e4b, int finish_group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_free_extent *bex = &ac->ac_b_ex; struct ext4_free_extent *gex = &ac->ac_g_ex; if (ac->ac_status == AC_STATUS_FOUND) return; /* * We don't want to scan for a whole year */ if (ac->ac_found > sbi->s_mb_max_to_scan && !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { ac->ac_status = AC_STATUS_BREAK; return; } /* * Haven't found good chunk so far, let's continue */ if (bex->fe_len < gex->fe_len) return; if (finish_group || ac->ac_found > sbi->s_mb_min_to_scan) ext4_mb_use_best_found(ac, e4b); } /* * The routine checks whether found extent is good enough. If it is, * then the extent gets marked used and flag is set to the context * to stop scanning. Otherwise, the extent is compared with the * previous found extent and if new one is better, then it's stored * in the context. Later, the best found extent will be used, if * mballoc can't find good enough extent. 
* * The algorithm used is roughly as follows: * * * If free extent found is exactly as big as goal, then * stop the scan and use it immediately * * * If free extent found is smaller than goal, then keep retrying * upto a max of sbi->s_mb_max_to_scan times (default 200). After * that stop scanning and use whatever we have. * * * If free extent found is bigger than goal, then keep retrying * upto a max of sbi->s_mb_min_to_scan times (default 10) before * stopping the scan and using the extent. * * * FIXME: real allocation policy is to be designed yet! */ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, struct ext4_free_extent *ex, struct ext4_buddy *e4b) { struct ext4_free_extent *bex = &ac->ac_b_ex; struct ext4_free_extent *gex = &ac->ac_g_ex; BUG_ON(ex->fe_len <= 0); BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); ac->ac_found++; ac->ac_cX_found[ac->ac_criteria]++; /* * The special case - take what you catch first */ if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) { *bex = *ex; ext4_mb_use_best_found(ac, e4b); return; } /* * Let's check whether the chuck is good enough */ if (ex->fe_len == gex->fe_len) { *bex = *ex; ext4_mb_use_best_found(ac, e4b); return; } /* * If this is first found extent, just store it in the context */ if (bex->fe_len == 0) { *bex = *ex; return; } /* * If new found extent is better, store it in the context */ if (bex->fe_len < gex->fe_len) { /* if the request isn't satisfied, any found extent * larger than previous best one is better */ if (ex->fe_len > bex->fe_len) *bex = *ex; } else if (ex->fe_len > gex->fe_len) { /* if the request is satisfied, then we try to find * an extent that still satisfy the request, but is * smaller than previous one */ if (ex->fe_len < bex->fe_len) *bex = *ex; } ext4_mb_check_limits(ac, e4b, 0); } static noinline_for_stack void ext4_mb_try_best_found(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct ext4_free_extent ex = ac->ac_b_ex; ext4_group_t group = ex.fe_group; int max; int err; BUG_ON(ex.fe_len <= 0); err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); if (err) return; ext4_lock_group(ac->ac_sb, group); if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) goto out; max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); if (max > 0) { ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } out: ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); } static noinline_for_stack int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { ext4_group_t group = ac->ac_g_ex.fe_group; int max; int err; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct ext4_free_extent ex; if (!grp) return -EFSCORRUPTED; if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY))) return 0; if (grp->bb_free == 0) return 0; err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); if (err) { if (EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info) && !(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) return 0; return err; } ext4_lock_group(ac->ac_sb, group); if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) goto out; max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, ac->ac_g_ex.fe_len, &ex); ex.fe_logical = 0xDEADFA11; /* debug value */ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == EXT4_NUM_B2C(sbi, sbi->s_stripe)) { ext4_fsblk_t start; start = ext4_grp_offs_to_block(ac->ac_sb, &ex); /* use do_div to get 
remainder (would be 64-bit modulo) */ if (do_div(start, sbi->s_stripe) == 0) { ac->ac_found++; ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } } else if (max >= ac->ac_g_ex.fe_len) { BUG_ON(ex.fe_len <= 0); BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); ac->ac_found++; ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) { /* Sometimes, caller may want to merge even small * number of blocks to an existing extent */ BUG_ON(ex.fe_len <= 0); BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); ac->ac_found++; ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } out: ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); return 0; } /* * The routine scans buddy structures (not bitmap!) from given order * to max order and tries to find big enough chunk to satisfy the req */ static noinline_for_stack void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; struct ext4_group_info *grp = e4b->bd_info; void *buddy; int i; int k; int max; BUG_ON(ac->ac_2order <= 0); for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) { if (grp->bb_counters[i] == 0) continue; buddy = mb_find_buddy(e4b, i, &max); if (WARN_RATELIMIT(buddy == NULL, "ext4: mb_simple_scan_group: mb_find_buddy failed, (%d)\n", i)) continue; k = mb_find_next_zero_bit(buddy, max, 0); if (k >= max) { ext4_mark_group_bitmap_corrupted(ac->ac_sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0, "%d free clusters of order %d. But found 0", grp->bb_counters[i], i); break; } ac->ac_found++; ac->ac_cX_found[ac->ac_criteria]++; ac->ac_b_ex.fe_len = 1 << i; ac->ac_b_ex.fe_start = k << i; ac->ac_b_ex.fe_group = e4b->bd_group; ext4_mb_use_best_found(ac, e4b); BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len); if (EXT4_SB(sb)->s_mb_stats) atomic_inc(&EXT4_SB(sb)->s_bal_2orders); break; } } /* * The routine scans the group and measures all found extents. * In order to optimize scanning, caller must pass number of * free blocks in the group, so the routine can know upper limit. */ static noinline_for_stack void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; int i, j, freelen; int free; free = e4b->bd_info->bb_free; if (WARN_ON(free <= 0)) return; i = e4b->bd_info->bb_first_free; while (free && ac->ac_status == AC_STATUS_CONTINUE) { i = mb_find_next_zero_bit(bitmap, EXT4_CLUSTERS_PER_GROUP(sb), i); if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) { /* * IF we have corrupt bitmap, we won't find any * free blocks even though group info says we * have free blocks */ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. 
But bitmap says 0", free); break; } if (!ext4_mb_cr_expensive(ac->ac_criteria)) { /* * In CR_GOAL_LEN_FAST and CR_BEST_AVAIL_LEN, we are * sure that this group will have a large enough * continuous free extent, so skip over the smaller free * extents */ j = mb_find_next_bit(bitmap, EXT4_CLUSTERS_PER_GROUP(sb), i); freelen = j - i; if (freelen < ac->ac_g_ex.fe_len) { i = j; free -= freelen; continue; } } mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex); if (WARN_ON(ex.fe_len <= 0)) break; if (free < ex.fe_len) { ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. But got %d blocks", free, ex.fe_len); /* * The number of free blocks differs. This mostly * indicate that the bitmap is corrupt. So exit * without claiming the space. */ break; } ex.fe_logical = 0xDEADC0DE; /* debug value */ ext4_mb_measure_extent(ac, &ex, e4b); i += ex.fe_len; free -= ex.fe_len; } ext4_mb_check_limits(ac, e4b, 1); } /* * This is a special case for storages like raid5 * we try to find stripe-aligned chunks for stripe-size-multiple requests */ static noinline_for_stack void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; ext4_fsblk_t first_group_block; ext4_fsblk_t a; ext4_grpblk_t i, stripe; int max; BUG_ON(sbi->s_stripe == 0); /* find first stripe-aligned block in group */ first_group_block = ext4_group_first_block_no(sb, e4b->bd_group); a = first_group_block + sbi->s_stripe - 1; do_div(a, sbi->s_stripe); i = (a * sbi->s_stripe) - first_group_block; stripe = EXT4_NUM_B2C(sbi, sbi->s_stripe); i = EXT4_B2C(sbi, i); while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { if (!mb_test_bit(i, bitmap)) { max = mb_find_extent(e4b, i, stripe, &ex); if (max >= stripe) { ac->ac_found++; ac->ac_cX_found[ac->ac_criteria]++; ex.fe_logical = 0xDEADF00D; /* debug value */ ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); break; } } i += stripe; } } static void __ext4_mb_scan_group(struct ext4_allocation_context *ac) { bool is_stripe_aligned; struct ext4_sb_info *sbi; enum criteria cr = ac->ac_criteria; ac->ac_groups_scanned++; if (cr == CR_POWER2_ALIGNED) return ext4_mb_simple_scan_group(ac, ac->ac_e4b); sbi = EXT4_SB(ac->ac_sb); is_stripe_aligned = false; if ((sbi->s_stripe >= sbi->s_cluster_ratio) && !(ac->ac_g_ex.fe_len % EXT4_NUM_B2C(sbi, sbi->s_stripe))) is_stripe_aligned = true; if ((cr == CR_GOAL_LEN_FAST || cr == CR_BEST_AVAIL_LEN) && is_stripe_aligned) ext4_mb_scan_aligned(ac, ac->ac_e4b); if (ac->ac_status == AC_STATUS_CONTINUE) ext4_mb_complex_scan_group(ac, ac->ac_e4b); } /* * This is also called BEFORE we load the buddy bitmap. * Returns either 1 or 0 indicating that the group is either suitable * for the allocation or not. 
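 *
 * As a rough example of the cheaper criteria: under CR_GOAL_LEN_FAST a
 * group with bb_free = 512 clusters spread over bb_fragments = 8 extents
 * has an average free extent of 64 clusters, so a 64-cluster goal passes
 * the (free / fragments) >= goal check while a 128-cluster goal does not.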
*/ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, ext4_group_t group, enum criteria cr) { ext4_grpblk_t free, fragments; int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); BUG_ON(cr < CR_POWER2_ALIGNED || cr >= EXT4_MB_NUM_CRS); if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) return false; free = grp->bb_free; if (free == 0) return false; fragments = grp->bb_fragments; if (fragments == 0) return false; switch (cr) { case CR_POWER2_ALIGNED: BUG_ON(ac->ac_2order == 0); /* Avoid using the first bg of a flexgroup for data files */ if ((ac->ac_flags & EXT4_MB_HINT_DATA) && (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && ((group % flex_size) == 0)) return false; if (free < ac->ac_g_ex.fe_len) return false; if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb)) return true; if (grp->bb_largest_free_order < ac->ac_2order) return false; return true; case CR_GOAL_LEN_FAST: case CR_BEST_AVAIL_LEN: if ((free / fragments) >= ac->ac_g_ex.fe_len) return true; break; case CR_GOAL_LEN_SLOW: if (free >= ac->ac_g_ex.fe_len) return true; break; case CR_ANY_FREE: return true; default: BUG(); } return false; } /* * This could return negative error code if something goes wrong * during ext4_mb_init_group(). This should not be called with * ext4_lock_group() held. * * Note: because we are conditionally operating with the group lock in * the EXT4_MB_STRICT_CHECK case, we need to fake out sparse in this * function using __acquire and __release. This means we need to be * super careful before messing with the error path handling via "goto * out"! */ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, ext4_group_t group, enum criteria cr) { struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK; ext4_grpblk_t free; int ret = 0; if (!grp) return -EFSCORRUPTED; if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]); if (should_lock) { ext4_lock_group(sb, group); __release(ext4_group_lock_ptr(sb, group)); } free = grp->bb_free; if (free == 0) goto out; /* * In all criterias except CR_ANY_FREE we try to avoid groups that * can't possibly satisfy the full goal request due to insufficient * free blocks. */ if (cr < CR_ANY_FREE && free < ac->ac_g_ex.fe_len) goto out; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) goto out; if (should_lock) { __acquire(ext4_group_lock_ptr(sb, group)); ext4_unlock_group(sb, group); } /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); int ret; /* * CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic * search to find large good chunks almost for free. If buddy * data is not ready, then this optimization makes no sense. But * we never skip the first block group in a flex_bg, since this * gets used for metadata block allocation, and we want to make * sure we locate metadata blocks in the first block group in * the flex_bg if possible. 
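 *
 * For instance, with s_log_groups_per_flex = 4, group 16 (the first of
 * its flex_bg) is always initialized here, while an uninitialized
 * group 17 may simply be reported as unsuitable under the cheap
 * criteria when reading its on-disk bitmap would cost an extra I/O.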
*/ if (!ext4_mb_cr_expensive(cr) && (!sbi->s_log_groups_per_flex || ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) && !(ext4_has_group_desc_csum(sb) && (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) return 0; ret = ext4_mb_init_group(sb, group, GFP_NOFS); if (ret) return ret; } if (should_lock) { ext4_lock_group(sb, group); __release(ext4_group_lock_ptr(sb, group)); } ret = ext4_mb_good_group(ac, group, cr); out: if (should_lock) { __acquire(ext4_group_lock_ptr(sb, group)); ext4_unlock_group(sb, group); } return ret; } /* * Start prefetching @nr block bitmaps starting at @group. * Return the next group which needs to be prefetched. */ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, unsigned int nr, int *cnt) { ext4_group_t ngroups = ext4_get_groups_count(sb); struct buffer_head *bh; struct blk_plug plug; blk_start_plug(&plug); while (nr-- > 0) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); struct ext4_group_info *grp = ext4_get_group_info(sb, group); /* * Prefetch block groups with free blocks; but don't * bother if it is marked uninitialized on disk, since * it won't require I/O to read. Also only try to * prefetch once, so we avoid getblk() call, which can * be expensive. */ if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) && EXT4_MB_GRP_NEED_INIT(grp) && ext4_free_group_clusters(sb, gdp) > 0 ) { bh = ext4_read_block_bitmap_nowait(sb, group, true); if (!IS_ERR_OR_NULL(bh)) { if (!buffer_uptodate(bh) && cnt) (*cnt)++; brelse(bh); } } if (++group >= ngroups) group = 0; } blk_finish_plug(&plug); return group; } /* * Batch reads of the block allocation bitmaps to get * multiple READs in flight; limit prefetching at inexpensive * CR, otherwise mballoc can spend a lot of time loading * imperfect groups */ static void ext4_mb_might_prefetch(struct ext4_allocation_context *ac, ext4_group_t group) { struct ext4_sb_info *sbi; if (ac->ac_prefetch_grp != group) return; sbi = EXT4_SB(ac->ac_sb); if (ext4_mb_cr_expensive(ac->ac_criteria) || ac->ac_prefetch_ios < sbi->s_mb_prefetch_limit) { unsigned int nr = sbi->s_mb_prefetch; if (ext4_has_feature_flex_bg(ac->ac_sb)) { nr = 1 << sbi->s_log_groups_per_flex; nr -= group & (nr - 1); nr = umin(nr, sbi->s_mb_prefetch); } ac->ac_prefetch_nr = nr; ac->ac_prefetch_grp = ext4_mb_prefetch(ac->ac_sb, group, nr, &ac->ac_prefetch_ios); } } /* * Prefetching reads the block bitmap into the buffer cache; but we * need to make sure that the buddy bitmap in the page cache has been * initialized. Note that ext4_mb_init_group() will block if the I/O * is not yet completed, or indeed if it was not initiated by * ext4_mb_prefetch did not start the I/O. * * TODO: We should actually kick off the buddy bitmap setup in a work * queue when the buffer I/O is completed, so that we don't block * waiting for the block allocation bitmap read to finish when * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator(). 
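 *
 * For example, after ext4_mb_might_prefetch() has issued reads for a
 * flex_bg-sized window (say nr = 16 - (group & 15) groups when
 * s_log_groups_per_flex is 4), ext4_mb_prefetch_fini() later walks those
 * nr groups backwards, starting just before the returned cursor, and
 * initializes each buddy that still needs it.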
*/ void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, unsigned int nr) { struct ext4_group_desc *gdp; struct ext4_group_info *grp; while (nr-- > 0) { if (!group) group = ext4_get_groups_count(sb); group--; gdp = ext4_get_group_desc(sb, group, NULL); grp = ext4_get_group_info(sb, group); if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) && ext4_free_group_clusters(sb, gdp) > 0) { if (ext4_mb_init_group(sb, group, GFP_NOFS)) break; } } } static int ext4_mb_scan_group(struct ext4_allocation_context *ac, ext4_group_t group) { int ret; struct super_block *sb = ac->ac_sb; enum criteria cr = ac->ac_criteria; ext4_mb_might_prefetch(ac, group); /* prevent unnecessary buddy loading. */ if (cr < CR_ANY_FREE && spin_is_locked(ext4_group_lock_ptr(sb, group))) return 0; /* This now checks without needing the buddy folio */ ret = ext4_mb_good_group_nolock(ac, group, cr); if (ret <= 0) { if (!ac->ac_first_err) ac->ac_first_err = ret; return 0; } ret = ext4_mb_load_buddy(sb, group, ac->ac_e4b); if (ret) return ret; /* skip busy group */ if (cr >= CR_ANY_FREE) ext4_lock_group(sb, group); else if (!ext4_try_lock_group(sb, group)) goto out_unload; /* We need to check again after locking the block group. */ if (unlikely(!ext4_mb_good_group(ac, group, cr))) goto out_unlock; __ext4_mb_scan_group(ac); out_unlock: ext4_unlock_group(sb, group); out_unload: ext4_mb_unload_buddy(ac->ac_e4b); return ret; } static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { ext4_group_t i; int err = 0; struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_buddy e4b; BUG_ON(ac->ac_status == AC_STATUS_FOUND); /* first, try the goal */ err = ext4_mb_find_by_goal(ac, &e4b); if (err || ac->ac_status == AC_STATUS_FOUND) goto out; if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) goto out; /* * ac->ac_2order is set only if the fe_len is a power of 2 * if ac->ac_2order is set we also set criteria to CR_POWER2_ALIGNED * so that we try exact allocation using buddy. */ i = fls(ac->ac_g_ex.fe_len); ac->ac_2order = 0; /* * We search using buddy data only if the order of the request * is greater than equal to the sbi_s_mb_order2_reqs * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req * We also support searching for power-of-two requests only for * requests upto maximum buddy size we have constructed. */ if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) { if (is_power_of_2(ac->ac_g_ex.fe_len)) ac->ac_2order = array_index_nospec(i - 1, MB_NUM_ORDERS(sb)); } /* if stream allocation is enabled, use global goal */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { int hash = (unsigned int)ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals; ac->ac_g_ex.fe_group = READ_ONCE(sbi->s_mb_last_groups[hash]); ac->ac_g_ex.fe_start = -1; ac->ac_flags &= ~EXT4_MB_HINT_TRY_GOAL; } /* * Let's just scan groups to find more-less suitable blocks We * start with CR_GOAL_LEN_FAST, unless it is power of 2 * aligned, in which case let's do that faster approach first. */ ac->ac_criteria = CR_GOAL_LEN_FAST; if (ac->ac_2order) ac->ac_criteria = CR_POWER2_ALIGNED; ac->ac_e4b = &e4b; ac->ac_prefetch_ios = 0; ac->ac_first_err = 0; repeat: while (ac->ac_criteria < EXT4_MB_NUM_CRS) { err = ext4_mb_scan_groups(ac); if (err) goto out; if (ac->ac_status != AC_STATUS_CONTINUE) break; } if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { /* * We've been searching too long. 
Let's try to allocate * the best chunk we've found so far */ ext4_mb_try_best_found(ac, &e4b); if (ac->ac_status != AC_STATUS_FOUND) { int lost; /* * Someone more lucky has already allocated it. * The only thing we can do is just take first * found block(s) */ lost = atomic_inc_return(&sbi->s_mb_lost_chunks); mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n", ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len, lost); ac->ac_b_ex.fe_group = 0; ac->ac_b_ex.fe_start = 0; ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; ac->ac_flags |= EXT4_MB_HINT_FIRST; ac->ac_criteria = CR_ANY_FREE; goto repeat; } } if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) { atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]); if (ac->ac_flags & EXT4_MB_STREAM_ALLOC && ac->ac_b_ex.fe_group == ac->ac_g_ex.fe_group) atomic_inc(&sbi->s_bal_stream_goals); } out: if (!err && ac->ac_status != AC_STATUS_FOUND && ac->ac_first_err) err = ac->ac_first_err; mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n", ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status, ac->ac_flags, ac->ac_criteria, err); if (ac->ac_prefetch_nr) ext4_mb_prefetch_fini(sb, ac->ac_prefetch_grp, ac->ac_prefetch_nr); return err; } static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) { struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group; if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) return NULL; group = *pos + 1; return (void *) ((unsigned long) group); } static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) { struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group; ++*pos; if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) return NULL; group = *pos + 1; return (void *) ((unsigned long) group); } static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) { struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group = (ext4_group_t) ((unsigned long) v); int i, err; char nbuf[16]; struct ext4_buddy e4b; struct ext4_group_info *grinfo; unsigned char blocksize_bits = min_t(unsigned char, sb->s_blocksize_bits, EXT4_MAX_BLOCK_LOG_SIZE); DEFINE_RAW_FLEX(struct ext4_group_info, sg, bb_counters, EXT4_MAX_BLOCK_LOG_SIZE + 2); group--; if (group == 0) seq_puts(seq, "#group: free frags first [" " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); i = (blocksize_bits + 2) * sizeof(sg->bb_counters[0]) + sizeof(struct ext4_group_info); grinfo = ext4_get_group_info(sb, group); if (!grinfo) return 0; /* Load the group info in memory only if not already loaded. */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { seq_printf(seq, "#%-5u: %s\n", group, ext4_decode_error(NULL, err, nbuf)); return 0; } ext4_mb_unload_buddy(&e4b); } /* * We care only about free space counters in the group info and * these are safe to access even after the buddy has been unloaded */ memcpy(sg, grinfo, i); seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg->bb_free, sg->bb_fragments, sg->bb_first_free); for (i = 0; i <= 13; i++) seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? 
sg->bb_counters[i] : 0); seq_puts(seq, " ]"); if (EXT4_MB_GRP_BBITMAP_CORRUPT(sg)) seq_puts(seq, " Block bitmap corrupted!"); seq_putc(seq, '\n'); return 0; } static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) { } const struct seq_operations ext4_mb_seq_groups_ops = { .start = ext4_mb_seq_groups_start, .next = ext4_mb_seq_groups_next, .stop = ext4_mb_seq_groups_stop, .show = ext4_mb_seq_groups_show, }; int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; struct ext4_sb_info *sbi = EXT4_SB(sb); seq_puts(seq, "mballoc:\n"); if (!sbi->s_mb_stats) { seq_puts(seq, "\tmb stats collection turned off.\n"); seq_puts( seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n"); return 0; } seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs)); seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success)); seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned)); /* CR_POWER2_ALIGNED stats */ seq_puts(seq, "\tcr_p2_aligned_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR_POWER2_ALIGNED])); seq_printf( seq, "\t\tgroups_considered: %llu\n", atomic64_read( &sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED])); seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR_POWER2_ALIGNED])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_POWER2_ALIGNED])); /* CR_GOAL_LEN_FAST stats */ seq_puts(seq, "\tcr_goal_fast_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\tgroups_considered: %llu\n", atomic64_read( &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_FAST])); /* CR_BEST_AVAIL_LEN stats */ seq_puts(seq, "\tcr_best_avail_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR_BEST_AVAIL_LEN])); seq_printf( seq, "\t\tgroups_considered: %llu\n", atomic64_read( &sbi->s_bal_cX_groups_considered[CR_BEST_AVAIL_LEN])); seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR_BEST_AVAIL_LEN])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_BEST_AVAIL_LEN])); /* CR_GOAL_LEN_SLOW stats */ seq_puts(seq, "\tcr_goal_slow_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_SLOW])); seq_printf(seq, "\t\tgroups_considered: %llu\n", atomic64_read( &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_SLOW])); seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_SLOW])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_SLOW])); /* CR_ANY_FREE stats */ seq_puts(seq, "\tcr_any_free_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR_ANY_FREE])); seq_printf( seq, "\t\tgroups_considered: %llu\n", atomic64_read(&sbi->s_bal_cX_groups_considered[CR_ANY_FREE])); seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR_ANY_FREE])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_ANY_FREE])); /* Aggregates */ seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); seq_printf(seq, "\t\tgoal_hits: %u\n", 
atomic_read(&sbi->s_bal_goals)); seq_printf(seq, "\t\tstream_goal_hits: %u\n", atomic_read(&sbi->s_bal_stream_goals)); seq_printf(seq, "\t\tlen_goal_hits: %u\n", atomic_read(&sbi->s_bal_len_goals)); seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); seq_printf(seq, "\tbuddies_generated: %u/%u\n", atomic_read(&sbi->s_mb_buddies_generated), ext4_get_groups_count(sb)); seq_printf(seq, "\tbuddies_time_used: %llu\n", atomic64_read(&sbi->s_mb_generation_time)); seq_printf(seq, "\tpreallocated: %u\n", atomic_read(&sbi->s_mb_preallocated)); seq_printf(seq, "\tdiscarded: %u\n", atomic_read(&sbi->s_mb_discarded)); return 0; } static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) { struct super_block *sb = pde_data(file_inode(seq->file)); unsigned long position; if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) return NULL; position = *pos + 1; return (void *) ((unsigned long) position); } static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos) { struct super_block *sb = pde_data(file_inode(seq->file)); unsigned long position; ++*pos; if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) return NULL; position = *pos + 1; return (void *) ((unsigned long) position); } static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) { struct super_block *sb = pde_data(file_inode(seq->file)); struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned long position = ((unsigned long) v); struct ext4_group_info *grp; unsigned int count; unsigned long idx; position--; if (position >= MB_NUM_ORDERS(sb)) { position -= MB_NUM_ORDERS(sb); if (position == 0) seq_puts(seq, "avg_fragment_size_lists:\n"); count = 0; xa_for_each(&sbi->s_mb_avg_fragment_size[position], idx, grp) count++; seq_printf(seq, "\tlist_order_%u_groups: %u\n", (unsigned int)position, count); return 0; } if (position == 0) { seq_printf(seq, "optimize_scan: %d\n", test_opt2(sb, MB_OPTIMIZE_SCAN) ? 
1 : 0); seq_puts(seq, "max_free_order_lists:\n"); } count = 0; xa_for_each(&sbi->s_mb_largest_free_orders[position], idx, grp) count++; seq_printf(seq, "\tlist_order_%u_groups: %u\n", (unsigned int)position, count); return 0; } static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) { } const struct seq_operations ext4_mb_seq_structs_summary_ops = { .start = ext4_mb_seq_structs_summary_start, .next = ext4_mb_seq_structs_summary_next, .stop = ext4_mb_seq_structs_summary_stop, .show = ext4_mb_seq_structs_summary_show, }; static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) { int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index]; BUG_ON(!cachep); return cachep; } /* * Allocate the top-level s_group_info array for the specified number * of groups */ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned size; struct ext4_group_info ***old_groupinfo, ***new_groupinfo; size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); if (size <= sbi->s_group_info_size) return 0; size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size); new_groupinfo = kvzalloc(size, GFP_KERNEL); if (!new_groupinfo) { ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); return -ENOMEM; } rcu_read_lock(); old_groupinfo = rcu_dereference(sbi->s_group_info); if (old_groupinfo) memcpy(new_groupinfo, old_groupinfo, sbi->s_group_info_size * sizeof(*sbi->s_group_info)); rcu_read_unlock(); rcu_assign_pointer(sbi->s_group_info, new_groupinfo); sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); if (old_groupinfo) ext4_kvfree_array_rcu(old_groupinfo); ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", sbi->s_group_info_size); return 0; } /* Create and initialize ext4_group_info data for the given group. */ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, struct ext4_group_desc *desc) { int i; int metalen = 0; int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb); struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_info **meta_group_info; struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); /* * First check if this group is the first of a reserved block. 
* If it's true, we have to allocate a new table of pointers * to ext4_group_info structures */ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb); meta_group_info = kmalloc(metalen, GFP_NOFS); if (meta_group_info == NULL) { ext4_msg(sb, KERN_ERR, "can't allocate mem " "for a buddy group"); return -ENOMEM; } rcu_read_lock(); rcu_dereference(sbi->s_group_info)[idx] = meta_group_info; rcu_read_unlock(); } meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx); i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS); if (meta_group_info[i] == NULL) { ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); goto exit_group_info; } set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(meta_group_info[i]->bb_state)); /* * initialize bb_free to be able to skip * empty groups without initialization */ if (ext4_has_group_desc_csum(sb) && (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { meta_group_info[i]->bb_free = ext4_free_clusters_after_init(sb, group, desc); } else { meta_group_info[i]->bb_free = ext4_free_group_clusters(sb, desc); } INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); init_rwsem(&meta_group_info[i]->alloc_sem); meta_group_info[i]->bb_free_root = RB_ROOT; meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */ meta_group_info[i]->bb_group = group; mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); return 0; exit_group_info: /* If a meta_group_info table has been allocated, release it now */ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { struct ext4_group_info ***group_info; rcu_read_lock(); group_info = rcu_dereference(sbi->s_group_info); kfree(group_info[idx]); group_info[idx] = NULL; rcu_read_unlock(); } return -ENOMEM; } /* ext4_mb_add_groupinfo */ static int ext4_mb_init_backend(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; struct ext4_sb_info *sbi = EXT4_SB(sb); int err; struct ext4_group_desc *desc; struct ext4_group_info ***group_info; struct kmem_cache *cachep; err = ext4_mb_alloc_groupinfo(sb, ngroups); if (err) return err; sbi->s_buddy_cache = new_inode(sb); if (sbi->s_buddy_cache == NULL) { ext4_msg(sb, KERN_ERR, "can't get new inode"); goto err_freesgi; } /* To avoid potentially colliding with an valid on-disk inode number, * use EXT4_BAD_INO for the buddy cache inode number. This inode is * not in the inode hash, so it should never be found by iget(), but * this will avoid confusion if it ever shows up during debugging. */ sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; ext4_set_inode_mapping_order(sbi->s_buddy_cache); for (i = 0; i < ngroups; i++) { cond_resched(); desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i); goto err_freebuddy; } if (ext4_mb_add_groupinfo(sb, i, desc) != 0) goto err_freebuddy; } if (ext4_has_feature_flex_bg(sb)) { /* a single flex group is supposed to be read by a single IO. * 2 ^ s_log_groups_per_flex != UINT_MAX as s_mb_prefetch is * unsigned integer, so the maximum shift is 32. 
*/ if (sbi->s_es->s_log_groups_per_flex >= 32) { ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group"); goto err_freebuddy; } sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex, BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9)); sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */ } else { sbi->s_mb_prefetch = 32; } if (sbi->s_mb_prefetch > ext4_get_groups_count(sb)) sbi->s_mb_prefetch = ext4_get_groups_count(sb); /* * now many real IOs to prefetch within a single allocation at * CR_POWER2_ALIGNED. Given CR_POWER2_ALIGNED is an CPU-related * optimization we shouldn't try to load too many groups, at some point * we should start to use what we've got in memory. * with an average random access time 5ms, it'd take a second to get * 200 groups (* N with flex_bg), so let's make this limit 4 */ sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4; if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb)) sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb); return 0; err_freebuddy: cachep = get_groupinfo_cache(sb->s_blocksize_bits); while (i-- > 0) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); if (grp) kmem_cache_free(cachep, grp); } i = sbi->s_group_info_size; rcu_read_lock(); group_info = rcu_dereference(sbi->s_group_info); while (i-- > 0) kfree(group_info[i]); rcu_read_unlock(); iput(sbi->s_buddy_cache); err_freesgi: kvfree(rcu_access_pointer(sbi->s_group_info)); return -ENOMEM; } static void ext4_groupinfo_destroy_slabs(void) { int i; for (i = 0; i < NR_GRPINFO_CACHES; i++) { kmem_cache_destroy(ext4_groupinfo_caches[i]); ext4_groupinfo_caches[i] = NULL; } } static int ext4_groupinfo_create_slab(size_t size) { static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex); int slab_size; int blocksize_bits = order_base_2(size); int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; struct kmem_cache *cachep; if (cache_index >= NR_GRPINFO_CACHES) return -EINVAL; if (unlikely(cache_index < 0)) cache_index = 0; mutex_lock(&ext4_grpinfo_slab_create_mutex); if (ext4_groupinfo_caches[cache_index]) { mutex_unlock(&ext4_grpinfo_slab_create_mutex); return 0; /* Already created */ } slab_size = offsetof(struct ext4_group_info, bb_counters[blocksize_bits + 2]); cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index], slab_size, 0, SLAB_RECLAIM_ACCOUNT, NULL); ext4_groupinfo_caches[cache_index] = cachep; mutex_unlock(&ext4_grpinfo_slab_create_mutex); if (!cachep) { printk(KERN_EMERG "EXT4-fs: no memory for groupinfo slab cache\n"); return -ENOMEM; } return 0; } static void ext4_discard_work(struct work_struct *work) { struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info, s_discard_work); struct super_block *sb = sbi->s_sb; struct ext4_free_data *fd, *nfd; struct ext4_buddy e4b; LIST_HEAD(discard_list); ext4_group_t grp, load_grp; int err = 0; spin_lock(&sbi->s_md_lock); list_splice_init(&sbi->s_discard_list, &discard_list); spin_unlock(&sbi->s_md_lock); load_grp = UINT_MAX; list_for_each_entry_safe(fd, nfd, &discard_list, efd_list) { /* * If filesystem is umounting or no memory or suffering * from no space, give up the discard */ if ((sb->s_flags & SB_ACTIVE) && !err && !atomic_read(&sbi->s_retry_alloc_pending)) { grp = fd->efd_group; if (grp != load_grp) { if (load_grp != UINT_MAX) ext4_mb_unload_buddy(&e4b); err = ext4_mb_load_buddy(sb, grp, &e4b); if (err) { kmem_cache_free(ext4_free_data_cachep, fd); load_grp = UINT_MAX; continue; } else { load_grp = grp; } } ext4_lock_group(sb, grp); ext4_try_to_trim_range(sb, &e4b, 
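/* trim exactly the inclusive cluster range this ext4_free_data covers */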
fd->efd_start_cluster, fd->efd_start_cluster + fd->efd_count - 1, 1); ext4_unlock_group(sb, grp); } kmem_cache_free(ext4_free_data_cachep, fd); } if (load_grp != UINT_MAX) ext4_mb_unload_buddy(&e4b); } static inline void ext4_mb_avg_fragment_size_destroy(struct ext4_sb_info *sbi) { if (!sbi->s_mb_avg_fragment_size) return; for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++) xa_destroy(&sbi->s_mb_avg_fragment_size[i]); kfree(sbi->s_mb_avg_fragment_size); sbi->s_mb_avg_fragment_size = NULL; } static inline void ext4_mb_largest_free_orders_destroy(struct ext4_sb_info *sbi) { if (!sbi->s_mb_largest_free_orders) return; for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++) xa_destroy(&sbi->s_mb_largest_free_orders[i]); kfree(sbi->s_mb_largest_free_orders); sbi->s_mb_largest_free_orders = NULL; } int ext4_mb_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned i, j; unsigned offset, offset_incr; unsigned max; int ret; i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets); sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_offsets == NULL) { ret = -ENOMEM; goto out; } i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs); sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_maxs == NULL) { ret = -ENOMEM; goto out; } ret = ext4_groupinfo_create_slab(sb->s_blocksize); if (ret < 0) goto out; /* order 0 is regular bitmap */ sbi->s_mb_maxs[0] = sb->s_blocksize << 3; sbi->s_mb_offsets[0] = 0; i = 1; offset = 0; offset_incr = 1 << (sb->s_blocksize_bits - 1); max = sb->s_blocksize << 2; do { sbi->s_mb_offsets[i] = offset; sbi->s_mb_maxs[i] = max; offset += offset_incr; offset_incr = offset_incr >> 1; max = max >> 1; i++; } while (i < MB_NUM_ORDERS(sb)); sbi->s_mb_avg_fragment_size = kmalloc_objs(struct xarray, MB_NUM_ORDERS(sb)); if (!sbi->s_mb_avg_fragment_size) { ret = -ENOMEM; goto out; } for (i = 0; i < MB_NUM_ORDERS(sb); i++) xa_init(&sbi->s_mb_avg_fragment_size[i]); sbi->s_mb_largest_free_orders = kmalloc_objs(struct xarray, MB_NUM_ORDERS(sb)); if (!sbi->s_mb_largest_free_orders) { ret = -ENOMEM; goto out; } for (i = 0; i < MB_NUM_ORDERS(sb); i++) xa_init(&sbi->s_mb_largest_free_orders[i]); spin_lock_init(&sbi->s_md_lock); atomic_set(&sbi->s_mb_free_pending, 0); INIT_LIST_HEAD(&sbi->s_freed_data_list[0]); INIT_LIST_HEAD(&sbi->s_freed_data_list[1]); INIT_LIST_HEAD(&sbi->s_discard_list); INIT_WORK(&sbi->s_discard_work, ext4_discard_work); atomic_set(&sbi->s_retry_alloc_pending, 0); sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; sbi->s_mb_best_avail_max_trim_order = MB_DEFAULT_BEST_AVAIL_TRIM_ORDER; /* * The default group preallocation is 512, which for 4k block * sizes translates to 2 megabytes. However for bigalloc file * systems, this is probably too big (i.e, if the cluster size * is 1 megabyte, then group preallocation size becomes half a * gigabyte!). As a default, we will keep a two megabyte * group pralloc size for cluster sizes up to 64k, and after * that, we will force a minimum group preallocation size of * 32 clusters. This translates to 8 megs when the cluster * size is 256k, and 32 megs when the cluster size is 1 meg, * which seems reasonable as a default. 
*/ sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >> sbi->s_cluster_bits, 32); /* * If there is a s_stripe > 1, then we set the s_mb_group_prealloc * to the lowest multiple of s_stripe which is bigger than * the s_mb_group_prealloc as determined above. We want * the preallocation size to be an exact multiple of the * RAID stripe size so that preallocations don't fragment * the stripes. */ if (sbi->s_stripe > 1) { sbi->s_mb_group_prealloc = roundup( sbi->s_mb_group_prealloc, EXT4_NUM_B2C(sbi, sbi->s_stripe)); } sbi->s_mb_nr_global_goals = umin(num_possible_cpus(), DIV_ROUND_UP(sbi->s_groups_count, 4)); sbi->s_mb_last_groups = kzalloc_objs(ext4_group_t, sbi->s_mb_nr_global_goals); if (sbi->s_mb_last_groups == NULL) { ret = -ENOMEM; goto out; } sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); if (sbi->s_locality_groups == NULL) { ret = -ENOMEM; goto out_free_last_groups; } for_each_possible_cpu(i) { struct ext4_locality_group *lg; lg = per_cpu_ptr(sbi->s_locality_groups, i); mutex_init(&lg->lg_mutex); for (j = 0; j < PREALLOC_TB_SIZE; j++) INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); spin_lock_init(&lg->lg_prealloc_lock); } if (!bdev_rot(sb->s_bdev)) sbi->s_mb_max_linear_groups = 0; else sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT; /* init file for buddy data */ ret = ext4_mb_init_backend(sb); if (ret != 0) goto out_free_locality_groups; return 0; out_free_locality_groups: free_percpu(sbi->s_locality_groups); sbi->s_locality_groups = NULL; out_free_last_groups: kfree(sbi->s_mb_last_groups); sbi->s_mb_last_groups = NULL; out: ext4_mb_avg_fragment_size_destroy(sbi); ext4_mb_largest_free_orders_destroy(sbi); kfree(sbi->s_mb_offsets); sbi->s_mb_offsets = NULL; kfree(sbi->s_mb_maxs); sbi->s_mb_maxs = NULL; return ret; } /* need to called with the ext4 group lock held */ static int ext4_mb_cleanup_pa(struct ext4_group_info *grp) { struct ext4_prealloc_space *pa; struct list_head *cur, *tmp; int count = 0; list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) { pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); list_del(&pa->pa_group_list); count++; kmem_cache_free(ext4_pspace_cachep, pa); } return count; } void ext4_mb_release(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; int num_meta_group_infos; struct ext4_group_info *grinfo, ***group_info; struct ext4_sb_info *sbi = EXT4_SB(sb); struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); int count; /* * wait the discard work to drain all of ext4_free_data */ flush_work(&sbi->s_discard_work); WARN_ON_ONCE(!list_empty(&sbi->s_discard_list)); group_info = rcu_access_pointer(sbi->s_group_info); if (group_info) { for (i = 0; i < ngroups; i++) { cond_resched(); grinfo = ext4_get_group_info(sb, i); if (!grinfo) continue; mb_group_bb_bitmap_free(grinfo); ext4_lock_group(sb, i); count = ext4_mb_cleanup_pa(grinfo); if (count) mb_debug(sb, "mballoc: %d PAs left\n", count); ext4_unlock_group(sb, i); kmem_cache_free(cachep, grinfo); } num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); for (i = 0; i < num_meta_group_infos; i++) kfree(group_info[i]); kvfree(group_info); } ext4_mb_avg_fragment_size_destroy(sbi); ext4_mb_largest_free_orders_destroy(sbi); kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); iput(sbi->s_buddy_cache); if (sbi->s_mb_stats) { ext4_msg(sb, KERN_INFO, "mballoc: %u blocks %u reqs (%u success)", atomic_read(&sbi->s_bal_allocated), atomic_read(&sbi->s_bal_reqs), 
atomic_read(&sbi->s_bal_success)); ext4_msg(sb, KERN_INFO, "mballoc: %u extents scanned, %u groups scanned, %u goal hits, " "%u 2^N hits, %u breaks, %u lost", atomic_read(&sbi->s_bal_ex_scanned), atomic_read(&sbi->s_bal_groups_scanned), atomic_read(&sbi->s_bal_goals), atomic_read(&sbi->s_bal_2orders), atomic_read(&sbi->s_bal_breaks), atomic_read(&sbi->s_mb_lost_chunks)); ext4_msg(sb, KERN_INFO, "mballoc: %u generated and it took %llu", atomic_read(&sbi->s_mb_buddies_generated), atomic64_read(&sbi->s_mb_generation_time)); ext4_msg(sb, KERN_INFO, "mballoc: %u preallocated, %u discarded", atomic_read(&sbi->s_mb_preallocated), atomic_read(&sbi->s_mb_discarded)); } free_percpu(sbi->s_locality_groups); kfree(sbi->s_mb_last_groups); } static inline int ext4_issue_discard(struct super_block *sb, ext4_group_t block_group, ext4_grpblk_t cluster, int count) { ext4_fsblk_t discard_block; discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) + ext4_group_first_block_no(sb, block_group)); count = EXT4_C2B(EXT4_SB(sb), count); trace_ext4_discard_blocks(sb, (unsigned long long) discard_block, count); return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); } static void ext4_free_data_in_buddy(struct super_block *sb, struct ext4_free_data *entry) { struct ext4_buddy e4b; struct ext4_group_info *db; int err, count = 0; mb_debug(sb, "gonna free %u blocks in group %u (0x%p):", entry->efd_count, entry->efd_group, entry); err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); /* we expect to find existing buddy because it's pinned */ BUG_ON(err != 0); atomic_sub(entry->efd_count, &EXT4_SB(sb)->s_mb_free_pending); db = e4b.bd_info; /* there are blocks to put in buddy to make them really free */ count += entry->efd_count; ext4_lock_group(sb, entry->efd_group); /* Take it out of per group rb tree */ rb_erase(&entry->efd_node, &(db->bb_free_root)); mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count); /* * Clear the trimmed flag for the group so that the next * ext4_trim_fs can trim it. */ EXT4_MB_GRP_CLEAR_TRIMMED(db); if (!db->bb_free_root.rb_node) { /* No more items in the per group rb tree * balance refcounts from ext4_mb_free_metadata() */ folio_put(e4b.bd_buddy_folio); folio_put(e4b.bd_bitmap_folio); } ext4_unlock_group(sb, entry->efd_group); ext4_mb_unload_buddy(&e4b); mb_debug(sb, "freed %d blocks in 1 structures\n", count); } /* * This function is called by the jbd2 layer once the commit has finished, * so we know we can free the blocks that were released with that commit. 
*/ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_free_data *entry, *tmp; LIST_HEAD(freed_data_list); struct list_head *s_freed_head = &sbi->s_freed_data_list[commit_tid & 1]; bool wake; list_replace_init(s_freed_head, &freed_data_list); list_for_each_entry(entry, &freed_data_list, efd_list) ext4_free_data_in_buddy(sb, entry); if (test_opt(sb, DISCARD)) { spin_lock(&sbi->s_md_lock); wake = list_empty(&sbi->s_discard_list); list_splice_tail(&freed_data_list, &sbi->s_discard_list); spin_unlock(&sbi->s_md_lock); if (wake) queue_work(system_dfl_wq, &sbi->s_discard_work); } else { list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) kmem_cache_free(ext4_free_data_cachep, entry); } } int __init ext4_init_mballoc(void) { ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, SLAB_RECLAIM_ACCOUNT); if (ext4_pspace_cachep == NULL) goto out; ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, SLAB_RECLAIM_ACCOUNT); if (ext4_ac_cachep == NULL) goto out_pa_free; ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, SLAB_RECLAIM_ACCOUNT); if (ext4_free_data_cachep == NULL) goto out_ac_free; return 0; out_ac_free: kmem_cache_destroy(ext4_ac_cachep); out_pa_free: kmem_cache_destroy(ext4_pspace_cachep); out: return -ENOMEM; } void ext4_exit_mballoc(void) { /* * Wait for completion of call_rcu()'s on ext4_pspace_cachep * before destroying the slab cache. */ rcu_barrier(); kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); kmem_cache_destroy(ext4_free_data_cachep); ext4_groupinfo_destroy_slabs(); } #define EXT4_MB_BITMAP_MARKED_CHECK 0x0001 #define EXT4_MB_SYNC_UPDATE 0x0002 int ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state, ext4_group_t group, ext4_grpblk_t blkoff, ext4_grpblk_t len, int flags, ext4_grpblk_t *ret_changed) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bitmap_bh = NULL; struct ext4_group_desc *gdp; struct buffer_head *gdp_bh; int err; unsigned int i, already, changed = len; KUNIT_STATIC_STUB_REDIRECT(ext4_mb_mark_context, handle, sb, state, group, blkoff, len, flags, ret_changed); if (ret_changed) *ret_changed = 0; bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) return PTR_ERR(bitmap_bh); if (handle) { BUFFER_TRACE(bitmap_bh, "getting write access"); err = ext4_journal_get_write_access(handle, sb, bitmap_bh, EXT4_JTR_NONE); if (err) goto out_err; } err = -EIO; gdp = ext4_get_group_desc(sb, group, &gdp_bh); if (!gdp) goto out_err; if (handle) { BUFFER_TRACE(gdp_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE); if (err) goto out_err; } ext4_lock_group(sb, group); if (ext4_has_group_desc_csum(sb) && (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, group, gdp)); } if (flags & EXT4_MB_BITMAP_MARKED_CHECK) { already = 0; for (i = 0; i < len; i++) if (mb_test_bit(blkoff + i, bitmap_bh->b_data) == state) already++; changed = len - already; } if (state) { mb_set_bits(bitmap_bh->b_data, blkoff, len); ext4_free_group_clusters_set(sb, gdp, ext4_free_group_clusters(sb, gdp) - changed); } else { mb_clear_bits(bitmap_bh->b_data, blkoff, len); ext4_free_group_clusters_set(sb, gdp, ext4_free_group_clusters(sb, gdp) + changed); } ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); ext4_unlock_group(sb, group); if 
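/*
 * Note: "changed" is the number of clusters whose bitmap bit actually
 * flipped.  Without EXT4_MB_BITMAP_MARKED_CHECK it is simply "len";
 * with the flag set, bits already in the requested state are subtracted
 * (e.g. marking 8 clusters of which 3 were already set gives
 * changed == 5).  That value is reported back to the caller and used to
 * adjust the per-group and flex-group free cluster counters.
 */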
(ret_changed) *ret_changed = changed; if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, group); struct flex_groups *fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group); if (state) atomic64_sub(changed, &fg->free_clusters); else atomic64_add(changed, &fg->free_clusters); } err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (err) goto out_err; err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); if (err) goto out_err; if (flags & EXT4_MB_SYNC_UPDATE) { sync_dirty_buffer(bitmap_bh); sync_dirty_buffer(gdp_bh); } out_err: brelse(bitmap_bh); return err; } /* * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps * Returns 0 if success or error code */ static noinline_for_stack int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, handle_t *handle) { struct ext4_group_desc *gdp; struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block; int err, len; int flags = 0; ext4_grpblk_t changed; BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(ac->ac_b_ex.fe_len <= 0); sb = ac->ac_sb; sbi = EXT4_SB(sb); gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, NULL); if (!gdp) return -EIO; ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, ext4_free_group_clusters(sb, gdp)); block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (!ext4_inode_block_valid(ac->ac_inode, block, len)) { ext4_error(sb, "Allocating blocks %llu-%llu which overlap " "fs metadata", block, block+len); /* File system mounted not to panic on error * Fix the bitmap and return EFSCORRUPTED * We leak some of the blocks here. */ err = ext4_mb_mark_context(handle, sb, true, ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len, 0, NULL); if (!err) err = -EFSCORRUPTED; return err; } #ifdef AGGRESSIVE_CHECK flags |= EXT4_MB_BITMAP_MARKED_CHECK; #endif err = ext4_mb_mark_context(handle, sb, true, ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len, flags, &changed); if (err && changed == 0) return err; #ifdef AGGRESSIVE_CHECK BUG_ON(changed != ac->ac_b_ex.fe_len); #endif percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); return err; } /* * Idempotent helper for Ext4 fast commit replay path to set the state of * blocks in bitmaps and update counters. */ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, int len, bool state) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; ext4_grpblk_t blkoff; int err = 0; unsigned int clen, thisgrp_len; while (len > 0) { ext4_get_group_no_and_offset(sb, block, &group, &blkoff); /* * Check to see if we are freeing blocks across a group * boundary. * In case of flex_bg, this can happen that (block, len) may * span across more than one group. In that case we need to * get the corresponding group metadata to work with. * For this we have goto again loop. */ thisgrp_len = min(len, EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff)); clen = EXT4_NUM_B2C(sbi, thisgrp_len); if (!ext4_sb_block_valid(sb, NULL, block, thisgrp_len)) { ext4_error(sb, "Marking blocks in system zone - " "Block = %llu, len = %u", block, thisgrp_len); break; } err = ext4_mb_mark_context(NULL, sb, state, group, blkoff, clen, EXT4_MB_BITMAP_MARKED_CHECK | EXT4_MB_SYNC_UPDATE, NULL); if (err) break; block += thisgrp_len; len -= thisgrp_len; BUG_ON(len < 0); } } /* * here we normalize request for locality group * Group request are normalized to s_mb_group_prealloc, which goes to * s_strip if we set the same via mount option. 
* s_mb_group_prealloc can be configured via * /sys/fs/ext4/<partition>/mb_group_prealloc * * XXX: should we try to preallocate more than the group has now? */ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; struct ext4_locality_group *lg = ac->ac_lg; BUG_ON(lg == NULL); ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len); } /* * This function returns the next element to look at during inode * PA rbtree walk. We assume that we have held the inode PA rbtree lock * (ei->i_prealloc_lock) * * new_start The start of the range we want to compare * cur_start The existing start that we are comparing against * node The node of the rb_tree */ static inline struct rb_node* ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_node *node) { if (new_start < cur_start) return node->rb_left; else return node->rb_right; } static inline void ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, ext4_lblk_t start, loff_t end) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_prealloc_space *tmp_pa; ext4_lblk_t tmp_pa_start; loff_t tmp_pa_end; struct rb_node *iter; read_lock(&ei->i_prealloc_lock); for (iter = ei->i_prealloc_node.rb_node; iter; iter = ext4_mb_pa_rb_next_iter(start, tmp_pa_start, iter)) { tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); tmp_pa_start = tmp_pa->pa_lstart; tmp_pa_end = pa_logical_end(sbi, tmp_pa); spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) BUG_ON(!(start >= tmp_pa_end || end <= tmp_pa_start)); spin_unlock(&tmp_pa->pa_lock); } read_unlock(&ei->i_prealloc_lock); } /* * Given an allocation context "ac" and a range "start", "end", check * and adjust boundaries if the range overlaps with any of the existing * preallocatoins stored in the corresponding inode of the allocation context. * * Parameters: * ac allocation context * start start of the new range * end end of the new range */ static inline void ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, ext4_lblk_t *start, loff_t *end) { struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL; struct rb_node *iter; ext4_lblk_t new_start, tmp_pa_start, right_pa_start = -1; loff_t new_end, tmp_pa_end, left_pa_end = -1; new_start = *start; new_end = *end; /* * Adjust the normalized range so that it doesn't overlap with any * existing preallocated blocks(PAs). Make sure to hold the rbtree lock * so it doesn't change underneath us. 
*/ read_lock(&ei->i_prealloc_lock); /* Step 1: find any one immediate neighboring PA of the normalized range */ for (iter = ei->i_prealloc_node.rb_node; iter; iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, tmp_pa_start, iter)) { tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); tmp_pa_start = tmp_pa->pa_lstart; tmp_pa_end = pa_logical_end(sbi, tmp_pa); /* PA must not overlap original request */ spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end || ac->ac_o_ex.fe_logical < tmp_pa_start)); spin_unlock(&tmp_pa->pa_lock); } /* * Step 2: check if the found PA is left or right neighbor and * get the other neighbor */ if (tmp_pa) { if (tmp_pa->pa_lstart < ac->ac_o_ex.fe_logical) { struct rb_node *tmp; left_pa = tmp_pa; tmp = rb_next(&left_pa->pa_node.inode_node); if (tmp) { right_pa = rb_entry(tmp, struct ext4_prealloc_space, pa_node.inode_node); } } else { struct rb_node *tmp; right_pa = tmp_pa; tmp = rb_prev(&right_pa->pa_node.inode_node); if (tmp) { left_pa = rb_entry(tmp, struct ext4_prealloc_space, pa_node.inode_node); } } } /* Step 3: get the non deleted neighbors */ if (left_pa) { for (iter = &left_pa->pa_node.inode_node;; iter = rb_prev(iter)) { if (!iter) { left_pa = NULL; break; } tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); left_pa = tmp_pa; spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) { spin_unlock(&tmp_pa->pa_lock); break; } spin_unlock(&tmp_pa->pa_lock); } } if (right_pa) { for (iter = &right_pa->pa_node.inode_node;; iter = rb_next(iter)) { if (!iter) { right_pa = NULL; break; } tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); right_pa = tmp_pa; spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) { spin_unlock(&tmp_pa->pa_lock); break; } spin_unlock(&tmp_pa->pa_lock); } } if (left_pa) { left_pa_end = pa_logical_end(sbi, left_pa); BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical); } if (right_pa) { right_pa_start = right_pa->pa_lstart; BUG_ON(right_pa_start <= ac->ac_o_ex.fe_logical); } /* Step 4: trim our normalized range to not overlap with the neighbors */ if (left_pa) { if (left_pa_end > new_start) new_start = left_pa_end; } if (right_pa) { if (right_pa_start < new_end) new_end = right_pa_start; } read_unlock(&ei->i_prealloc_lock); /* XXX: extra loop to check we really don't overlap preallocations */ ext4_mb_pa_assert_overlap(ac, new_start, new_end); *start = new_start; *end = new_end; } /* * Normalization means making request better in terms of * size and alignment */ static noinline_for_stack void ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_super_block *es = sbi->s_es; int bsbits, max; loff_t size, start_off, end; loff_t orig_size __maybe_unused; ext4_lblk_t start; /* do normalize only data requests, metadata requests do not need preallocation */ if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return; /* sometime caller may want exact blocks */ if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) return; /* caller may indicate that preallocation isn't * required (it's a tail, for example) */ if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC) return; if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) { ext4_mb_normalize_group_request(ac); return ; } bsbits = ac->ac_sb->s_blocksize_bits; /* first, let's learn actual file size * given current request is allocated */ size = extent_logical_end(sbi, &ac->ac_o_ex); size = size << bsbits; if 
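/*
 * Example (hypothetical numbers): with 4k blocks (bsbits == 12) and a
 * request whose logical end is block 150, size is 150 << 12 == 600KiB
 * here.  It is raised to i_size below if that is larger, and the
 * size <= SZ_1M case then rounds the preallocation window up to the
 * next power of two (1MiB in this example), with a 16KiB floor for
 * very small files.
 */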
(size < i_size_read(ac->ac_inode)) size = i_size_read(ac->ac_inode); orig_size = size; /* max size of free chunks */ max = 2 << bsbits; #define NRL_CHECK_SIZE(req, size, max, chunk_size) \ (req <= (size) || max <= (chunk_size)) /* first, try to predict filesize */ start_off = 0; if (size <= SZ_1M) { /* * For files up to 1MB, round up the preallocation size to * the next power of two, with a minimum of 16KB. */ if (size <= (unsigned long)SZ_16K) size = SZ_16K; else size = roundup_pow_of_two(size); } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (21 - bsbits)) << 21; size = 2 * 1024 * 1024; } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (22 - bsbits)) << 22; size = 4 * 1024 * 1024; } else if (NRL_CHECK_SIZE(EXT4_C2B(sbi, ac->ac_o_ex.fe_len), (8<<20)>>bsbits, max, 8 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (23 - bsbits)) << 23; size = 8 * 1024 * 1024; } else { start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits; size = (loff_t) EXT4_C2B(sbi, ac->ac_o_ex.fe_len) << bsbits; } size = size >> bsbits; start = start_off >> bsbits; /* * For tiny groups (smaller than 8MB) the chosen allocation * alignment may be larger than group size. Make sure the * alignment does not move allocation to a different group which * makes mballoc fail assertions later. */ start = max(start, rounddown(ac->ac_o_ex.fe_logical, (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb))); /* avoid unnecessary preallocation that may trigger assertions */ if (start + size > EXT_MAX_BLOCKS) size = EXT_MAX_BLOCKS - start; /* don't cover already allocated blocks in selected range */ if (ar->pleft && start <= ar->lleft) { size -= ar->lleft + 1 - start; start = ar->lleft + 1; } if (ar->pright && start + size - 1 >= ar->lright) size -= start + size - ar->lright; /* * Trim allocation request for filesystems with artificially small * groups. */ if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb); end = start + size; ext4_mb_pa_adjust_overlap(ac, &start, &end); size = end - start; /* * In this function "start" and "size" are normalized for better * alignment and length such that we could preallocate more blocks. * This normalization is done such that original request of * ac->ac_o_ex.fe_logical & fe_len should always lie within "start" and * "size" boundaries. * (Note fe_len can be relaxed since FS block allocation API does not * provide gurantee on number of contiguous blocks allocation since that * depends upon free space left, etc). * In case of inode pa, later we use the allocated blocks * [pa_pstart + fe_logical - pa_lstart, fe_len/size] from the preallocated * range of goal/best blocks [start, size] to put it at the * ac_o_ex.fe_logical extent of this inode. 
* (See ext4_mb_use_inode_pa() for more details) */ if (start + size <= ac->ac_o_ex.fe_logical || start > ac->ac_o_ex.fe_logical) { ext4_msg(ac->ac_sb, KERN_ERR, "start %lu, size %lu, fe_logical %lu", (unsigned long) start, (unsigned long) size, (unsigned long) ac->ac_o_ex.fe_logical); BUG(); } BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); /* now prepare goal request */ /* XXX: is it better to align blocks WRT to logical * placement or satisfy big request as is */ ac->ac_g_ex.fe_logical = start; ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); ac->ac_orig_goal_len = ac->ac_g_ex.fe_len; /* define goal start in order to merge */ if (ar->pright && (ar->lright == (start + size)) && ar->pright >= size && ar->pright - size >= le32_to_cpu(es->s_first_data_block)) { /* merge to the right */ ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size, &ac->ac_g_ex.fe_group, &ac->ac_g_ex.fe_start); ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; } if (ar->pleft && (ar->lleft + 1 == start) && ar->pleft + 1 < ext4_blocks_count(es)) { /* merge to the left */ ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, &ac->ac_g_ex.fe_group, &ac->ac_g_ex.fe_start); ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; } mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size, orig_size, start); } static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) { atomic_inc(&sbi->s_bal_reqs); atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) atomic_inc(&sbi->s_bal_success); atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); for (int i=0; i<EXT4_MB_NUM_CRS; i++) { atomic_add(ac->ac_cX_found[i], &sbi->s_bal_cX_ex_scanned[i]); } atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned); if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) atomic_inc(&sbi->s_bal_goals); /* did we allocate as much as normalizer originally wanted? */ if (ac->ac_f_ex.fe_len == ac->ac_orig_goal_len) atomic_inc(&sbi->s_bal_len_goals); if (ac->ac_found > sbi->s_mb_max_to_scan) atomic_inc(&sbi->s_bal_breaks); } if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) trace_ext4_mballoc_alloc(ac); else trace_ext4_mballoc_prealloc(ac); } /* * Called on failure; free up any blocks from the inode PA for this * context. We don't need this for MB_GROUP_PA because we only change * pa_free in ext4_mb_release_context(), but on failure, we've already * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. */ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa = ac->ac_pa; struct ext4_buddy e4b; int err; if (pa == NULL) { if (ac->ac_f_ex.fe_len == 0) return; err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b); if (WARN_RATELIMIT(err, "ext4: mb_load_buddy failed (%d)", err)) /* * This should never happen since we pin the * folios in the ext4_allocation_context so * ext4_mb_load_buddy() should never fail. 
*/ return; ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group); mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start, ac->ac_f_ex.fe_len); ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group); ext4_mb_unload_buddy(&e4b); return; } if (pa->pa_type == MB_INODE_PA) { spin_lock(&pa->pa_lock); pa->pa_free += ac->ac_b_ex.fe_len; spin_unlock(&pa->pa_lock); } } /* * use blocks preallocated to inode */ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ext4_fsblk_t start; ext4_fsblk_t end; int len; /* found preallocated blocks, use them */ start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len), start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len)); len = EXT4_NUM_B2C(sbi, end - start); ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, &ac->ac_b_ex.fe_start); ac->ac_b_ex.fe_len = len; ac->ac_status = AC_STATUS_FOUND; ac->ac_pa = pa; BUG_ON(start < pa->pa_pstart); BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len)); BUG_ON(pa->pa_free < len); BUG_ON(ac->ac_b_ex.fe_len <= 0); pa->pa_free -= len; mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa); } /* * use blocks preallocated to locality group */ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { unsigned int len = ac->ac_o_ex.fe_len; ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, &ac->ac_b_ex.fe_group, &ac->ac_b_ex.fe_start); ac->ac_b_ex.fe_len = len; ac->ac_status = AC_STATUS_FOUND; ac->ac_pa = pa; /* we don't correct pa_pstart or pa_len here to avoid * possible race when the group is being loaded concurrently * instead we correct pa later, after blocks are marked * in on-disk bitmap -- see ext4_mb_release_context() * Other CPUs are prevented from allocating from this pa by lg_mutex */ mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n", pa->pa_lstart, len, pa); } /* * Return the prealloc space that have minimal distance * from the goal block. @cpa is the prealloc * space that is having currently known minimal distance * from the goal block. */ static struct ext4_prealloc_space * ext4_mb_check_group_pa(ext4_fsblk_t goal_block, struct ext4_prealloc_space *pa, struct ext4_prealloc_space *cpa) { ext4_fsblk_t cur_distance, new_distance; if (cpa == NULL) { atomic_inc(&pa->pa_count); return pa; } cur_distance = abs(goal_block - cpa->pa_pstart); new_distance = abs(goal_block - pa->pa_pstart); if (cur_distance <= new_distance) return cpa; /* drop the previous reference */ atomic_dec(&cpa->pa_count); atomic_inc(&pa->pa_count); return pa; } /* * check if found pa meets EXT4_MB_HINT_GOAL_ONLY */ static bool ext4_mb_pa_goal_check(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ext4_fsblk_t start; if (likely(!(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))) return true; /* * If EXT4_MB_HINT_GOAL_ONLY is set, ac_g_ex will not be adjusted * in ext4_mb_normalize_request and will keep same with ac_o_ex * from ext4_mb_initialize_context. Choose ac_g_ex here to keep * consistent with ext4_mb_find_by_goal. 
*/ start = pa->pa_pstart + (ac->ac_g_ex.fe_logical - pa->pa_lstart); if (ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex) != start) return false; if (ac->ac_g_ex.fe_len > pa->pa_len - EXT4_B2C(sbi, ac->ac_g_ex.fe_logical - pa->pa_lstart)) return false; return true; } /* * search goal blocks in preallocated space */ static noinline_for_stack bool ext4_mb_use_preallocated(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int order, i; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; struct ext4_prealloc_space *tmp_pa = NULL, *cpa = NULL; struct rb_node *iter; ext4_fsblk_t goal_block; /* only data can be preallocated */ if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return false; /* * first, try per-file preallocation by searching the inode pa rbtree. * * Here, we can't do a direct traversal of the tree because * ext4_mb_discard_group_preallocation() can paralelly mark the pa * deleted and that can cause direct traversal to skip some entries. */ read_lock(&ei->i_prealloc_lock); if (RB_EMPTY_ROOT(&ei->i_prealloc_node)) { goto try_group_pa; } /* * Step 1: Find a pa with logical start immediately adjacent to the * original logical start. This could be on the left or right. * * (tmp_pa->pa_lstart never changes so we can skip locking for it). */ for (iter = ei->i_prealloc_node.rb_node; iter; iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, tmp_pa->pa_lstart, iter)) { tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); } /* * Step 2: The adjacent pa might be to the right of logical start, find * the left adjacent pa. After this step we'd have a valid tmp_pa whose * logical start is towards the left of original request's logical start */ if (tmp_pa->pa_lstart > ac->ac_o_ex.fe_logical) { struct rb_node *tmp; tmp = rb_prev(&tmp_pa->pa_node.inode_node); if (tmp) { tmp_pa = rb_entry(tmp, struct ext4_prealloc_space, pa_node.inode_node); } else { /* * If there is no adjacent pa to the left then finding * an overlapping pa is not possible hence stop searching * inode pa tree */ goto try_group_pa; } } BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical)); /* * Step 3: If the left adjacent pa is deleted, keep moving left to find * the first non deleted adjacent pa. After this step we should have a * valid tmp_pa which is guaranteed to be non deleted. */ for (iter = &tmp_pa->pa_node.inode_node;; iter = rb_prev(iter)) { if (!iter) { /* * no non deleted left adjacent pa, so stop searching * inode pa tree */ goto try_group_pa; } tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) { /* * We will keep holding the pa_lock from * this point on because we don't want group discard * to delete this pa underneath us. Since group * discard is anyways an ENOSPC operation it * should be okay for it to wait a few more cycles. */ break; } else { spin_unlock(&tmp_pa->pa_lock); } } BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical)); BUG_ON(tmp_pa->pa_deleted == 1); /* * Step 4: We now have the non deleted left adjacent pa. Only this * pa can possibly satisfy the request hence check if it overlaps * original logical start and stop searching if it doesn't. 
*/ if (ac->ac_o_ex.fe_logical >= pa_logical_end(sbi, tmp_pa)) { spin_unlock(&tmp_pa->pa_lock); goto try_group_pa; } /* non-extent files can't have physical blocks past 2^32 */ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) > EXT4_MAX_BLOCK_FILE_PHYS)) { /* * Since PAs don't overlap, we won't find any other PA to * satisfy this. */ spin_unlock(&tmp_pa->pa_lock); goto try_group_pa; } if (tmp_pa->pa_free && likely(ext4_mb_pa_goal_check(ac, tmp_pa))) { atomic_inc(&tmp_pa->pa_count); ext4_mb_use_inode_pa(ac, tmp_pa); spin_unlock(&tmp_pa->pa_lock); read_unlock(&ei->i_prealloc_lock); return true; } else { /* * We found a valid overlapping pa but couldn't use it because * it had no free blocks. This should ideally never happen * because: * * 1. When a new inode pa is added to rbtree it must have * pa_free > 0 since otherwise we won't actually need * preallocation. * * 2. An inode pa that is in the rbtree can only have it's * pa_free become zero when another thread calls: * ext4_mb_new_blocks * ext4_mb_use_preallocated * ext4_mb_use_inode_pa * * 3. Further, after the above calls make pa_free == 0, we will * immediately remove it from the rbtree in: * ext4_mb_new_blocks * ext4_mb_release_context * ext4_mb_put_pa * * 4. Since the pa_free becoming 0 and pa_free getting removed * from tree both happen in ext4_mb_new_blocks, which is always * called with i_data_sem held for data allocations, we can be * sure that another process will never see a pa in rbtree with * pa_free == 0. */ WARN_ON_ONCE(tmp_pa->pa_free == 0); } spin_unlock(&tmp_pa->pa_lock); try_group_pa: read_unlock(&ei->i_prealloc_lock); /* can we use group allocation? */ if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) return false; /* inode may have no locality group for some reason */ lg = ac->ac_lg; if (lg == NULL) return false; order = fls(ac->ac_o_ex.fe_len) - 1; if (order > PREALLOC_TB_SIZE - 1) /* The max size of hash table is PREALLOC_TB_SIZE */ order = PREALLOC_TB_SIZE - 1; goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex); /* * search for the prealloc space that is having * minimal distance from the goal block. */ for (i = order; i < PREALLOC_TB_SIZE; i++) { rcu_read_lock(); list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[i], pa_node.lg_list) { spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free >= ac->ac_o_ex.fe_len) { cpa = ext4_mb_check_group_pa(goal_block, tmp_pa, cpa); } spin_unlock(&tmp_pa->pa_lock); } rcu_read_unlock(); } if (cpa) { ext4_mb_use_group_pa(ac, cpa); return true; } return false; } /* * the function goes through all preallocation in this group and marks them * used in in-core bitmap. buddy must be generated from this bitmap * Need to be called with ext4 group lock held */ static noinline_for_stack void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct ext4_prealloc_space *pa; struct list_head *cur; ext4_group_t groupnr; ext4_grpblk_t start; int preallocated = 0; int len; if (!grp) return; /* all form of preallocation discards first load group, * so the only competing code is preallocation use. 
* we don't need any locking here * notice we do NOT ignore preallocations with pa_deleted * otherwise we could leave used blocks available for * allocation in buddy when concurrent ext4_mb_put_pa() * is dropping preallocation */ list_for_each(cur, &grp->bb_prealloc_list) { pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); spin_lock(&pa->pa_lock); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &start); len = pa->pa_len; spin_unlock(&pa->pa_lock); if (unlikely(len == 0)) continue; BUG_ON(groupnr != group); mb_set_bits(bitmap, start, len); preallocated += len; } mb_debug(sb, "preallocated %d for group %u\n", preallocated, group); } static void ext4_mb_mark_pa_deleted(struct super_block *sb, struct ext4_prealloc_space *pa) { struct ext4_inode_info *ei; if (pa->pa_deleted) { ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n", pa->pa_type, pa->pa_pstart, pa->pa_lstart, pa->pa_len); return; } pa->pa_deleted = 1; if (pa->pa_type == MB_INODE_PA) { ei = EXT4_I(pa->pa_inode); atomic_dec(&ei->i_prealloc_active); } } static inline void ext4_mb_pa_free(struct ext4_prealloc_space *pa) { BUG_ON(!pa); BUG_ON(atomic_read(&pa->pa_count)); BUG_ON(pa->pa_deleted == 0); kmem_cache_free(ext4_pspace_cachep, pa); } static void ext4_mb_pa_callback(struct rcu_head *head) { struct ext4_prealloc_space *pa; pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); ext4_mb_pa_free(pa); } /* * drops a reference to preallocated space descriptor * if this was the last reference and the space is consumed */ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, struct super_block *sb, struct ext4_prealloc_space *pa) { ext4_group_t grp; ext4_fsblk_t grp_blk; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); /* in this short window concurrent discard can set pa_deleted */ spin_lock(&pa->pa_lock); if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) { spin_unlock(&pa->pa_lock); return; } if (pa->pa_deleted == 1) { spin_unlock(&pa->pa_lock); return; } ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); grp_blk = pa->pa_pstart; /* * If doing group-based preallocation, pa_pstart may be in the * next group when pa is used up */ if (pa->pa_type == MB_GROUP_PA) grp_blk--; grp = ext4_get_group_number(sb, grp_blk); /* * possible race: * * P1 (buddy init) P2 (regular allocation) * find block B in PA * copy on-disk bitmap to buddy * mark B in on-disk bitmap * drop PA from group * mark all PAs in buddy * * thus, P1 initializes buddy with B available. 
to prevent this * we make "copy" and "mark all PAs" atomic and serialize "drop PA" * against that pair */ ext4_lock_group(sb, grp); list_del(&pa->pa_group_list); ext4_unlock_group(sb, grp); if (pa->pa_type == MB_INODE_PA) { write_lock(pa->pa_node_lock.inode_lock); rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); write_unlock(pa->pa_node_lock.inode_lock); ext4_mb_pa_free(pa); } else { spin_lock(pa->pa_node_lock.lg_lock); list_del_rcu(&pa->pa_node.lg_list); spin_unlock(pa->pa_node_lock.lg_lock); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } } static void ext4_mb_pa_rb_insert(struct rb_root *root, struct rb_node *new) { struct rb_node **iter = &root->rb_node, *parent = NULL; struct ext4_prealloc_space *iter_pa, *new_pa; ext4_lblk_t iter_start, new_start; while (*iter) { iter_pa = rb_entry(*iter, struct ext4_prealloc_space, pa_node.inode_node); new_pa = rb_entry(new, struct ext4_prealloc_space, pa_node.inode_node); iter_start = iter_pa->pa_lstart; new_start = new_pa->pa_lstart; parent = *iter; if (new_start < iter_start) iter = &((*iter)->rb_left); else iter = &((*iter)->rb_right); } rb_link_node(new, parent, iter); rb_insert_color(new, root); } /* * creates new preallocated space for given inode */ static noinline_for_stack void ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_prealloc_space *pa; struct ext4_group_info *grp; struct ext4_inode_info *ei; /* preallocate only when found space is larger then requested */ BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); BUG_ON(ac->ac_pa == NULL); pa = ac->ac_pa; if (ac->ac_b_ex.fe_len < ac->ac_orig_goal_len) { struct ext4_free_extent ex = { .fe_logical = ac->ac_g_ex.fe_logical, .fe_len = ac->ac_orig_goal_len, }; loff_t orig_goal_end = extent_logical_end(sbi, &ex); loff_t o_ex_end = extent_logical_end(sbi, &ac->ac_o_ex); /* * We can't allocate as much as normalizer wants, so we try * to get proper lstart to cover the original request, except * when the goal doesn't cover the original request as below: * * orig_ex:2045/2055(10), isize:8417280 -> normalized:0/2048 * best_ex:0/200(200) -> adjusted: 1848/2048(200) */ BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); /* * Use the below logic for adjusting best extent as it keeps * fragmentation in check while ensuring logical range of best * extent doesn't overflow out of goal extent: * * 1. Check if best ex can be kept at end of goal (before * cr_best_avail trimmed it) and still cover original start * 2. Else, check if best ex can be kept at start of goal and * still cover original end * 3. Else, keep the best ex at start of original request. 
*/ ex.fe_len = ac->ac_b_ex.fe_len; ex.fe_logical = orig_goal_end - EXT4_C2B(sbi, ex.fe_len); if (ac->ac_o_ex.fe_logical >= ex.fe_logical) goto adjust_bex; ex.fe_logical = ac->ac_g_ex.fe_logical; if (o_ex_end <= extent_logical_end(sbi, &ex)) goto adjust_bex; ex.fe_logical = ac->ac_o_ex.fe_logical; adjust_bex: ac->ac_b_ex.fe_logical = ex.fe_logical; BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); BUG_ON(extent_logical_end(sbi, &ex) > orig_goal_end); } pa->pa_lstart = ac->ac_b_ex.fe_logical; pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; spin_lock_init(&pa->pa_lock); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_type = MB_INODE_PA; mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_inode_pa(ac, pa); atomic_add(pa->pa_free, &sbi->s_mb_preallocated); ext4_mb_use_inode_pa(ac, pa); ei = EXT4_I(ac->ac_inode); grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); if (!grp) return; pa->pa_node_lock.inode_lock = &ei->i_prealloc_lock; pa->pa_inode = ac->ac_inode; list_add(&pa->pa_group_list, &grp->bb_prealloc_list); write_lock(pa->pa_node_lock.inode_lock); ext4_mb_pa_rb_insert(&ei->i_prealloc_node, &pa->pa_node.inode_node); write_unlock(pa->pa_node_lock.inode_lock); atomic_inc(&ei->i_prealloc_active); } /* * creates new preallocated space for locality group inodes belongs to */ static noinline_for_stack void ext4_mb_new_group_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; struct ext4_locality_group *lg; struct ext4_prealloc_space *pa; struct ext4_group_info *grp; /* preallocate only when found space is larger then requested */ BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); BUG_ON(ac->ac_pa == NULL); pa = ac->ac_pa; pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); pa->pa_lstart = pa->pa_pstart; pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; spin_lock_init(&pa->pa_lock); INIT_LIST_HEAD(&pa->pa_node.lg_list); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_type = MB_GROUP_PA; mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_group_pa(ac, pa); ext4_mb_use_group_pa(ac, pa); atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); if (!grp) return; lg = ac->ac_lg; BUG_ON(lg == NULL); pa->pa_node_lock.lg_lock = &lg->lg_prealloc_lock; pa->pa_inode = NULL; list_add(&pa->pa_group_list, &grp->bb_prealloc_list); /* * We will later add the new pa to the right bucket * after updating the pa_free in ext4_mb_release_context */ } static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac) { if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) ext4_mb_new_group_pa(ac); else ext4_mb_new_inode_pa(ac); } /* * finds all unused blocks in on-disk bitmap, frees them in * in-core bitmap and buddy. * @pa must be unlinked from inode and group lists, so that * nobody else can find/use it. * the caller MUST hold group/inode locks. 
* TODO: optimize the case when there are no in-core structures yet */ static noinline_for_stack void ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, struct ext4_prealloc_space *pa) { struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned int end; unsigned int next; ext4_group_t group; ext4_grpblk_t bit; unsigned long long grp_blk_start; int free = 0; BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); end = bit + pa->pa_len; while (bit < end) { bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); if (bit >= end) break; next = mb_find_next_bit(bitmap_bh->b_data, end, bit); mb_debug(sb, "free preallocated %u/%u in group %u\n", (unsigned) ext4_group_first_block_no(sb, group) + bit, (unsigned) next - bit, (unsigned) group); free += next - bit; trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); trace_ext4_mb_release_inode_pa(pa, (grp_blk_start + EXT4_C2B(sbi, bit)), next - bit); mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; } if (free != pa->pa_free) { ext4_msg(e4b->bd_sb, KERN_CRIT, "pa %p: logic %lu, phys. %lu, len %d", pa, (unsigned long) pa->pa_lstart, (unsigned long) pa->pa_pstart, pa->pa_len); ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", free, pa->pa_free); /* * pa is already deleted so we use the value obtained * from the bitmap and continue. */ } atomic_add(free, &sbi->s_mb_discarded); } static noinline_for_stack void ext4_mb_release_group_pa(struct ext4_buddy *e4b, struct ext4_prealloc_space *pa) { struct super_block *sb = e4b->bd_sb; ext4_group_t group; ext4_grpblk_t bit; trace_ext4_mb_release_group_pa(sb, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) { ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu", e4b->bd_group, group, pa->pa_pstart); return; } mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); } /* * releases all preallocations in given group * * first, we need to decide discard policy: * - when do we discard * 1) ENOSPC * - how many do we discard * 1) how many requested */ static noinline_for_stack int ext4_mb_discard_group_preallocations(struct super_block *sb, ext4_group_t group, int *busy) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; LIST_HEAD(list); struct ext4_buddy e4b; struct ext4_inode_info *ei; int err; int free = 0; if (!grp) return 0; mb_debug(sb, "discard preallocation for group %u\n", group); if (list_empty(&grp->bb_prealloc_list)) goto out_dbg; bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); ext4_error_err(sb, -err, "Error %d reading block bitmap for %u", err, group); goto out_dbg; } err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { ext4_warning(sb, "Error %d loading buddy information for %u", err, group); put_bh(bitmap_bh); goto out_dbg; } ext4_lock_group(sb, group); list_for_each_entry_safe(pa, tmp, &grp->bb_prealloc_list, pa_group_list) { spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { spin_unlock(&pa->pa_lock); *busy = 1; continue; } if (pa->pa_deleted) { spin_unlock(&pa->pa_lock); continue; } /* seems this 
one can be freed ... */ ext4_mb_mark_pa_deleted(sb, pa); if (!free) this_cpu_inc(discard_pa_seq); /* we can trust pa_free ... */ free += pa->pa_free; spin_unlock(&pa->pa_lock); list_del(&pa->pa_group_list); list_add(&pa->u.pa_tmp_list, &list); } /* now free all selected PAs */ list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { /* remove from object (inode or locality group) */ if (pa->pa_type == MB_GROUP_PA) { spin_lock(pa->pa_node_lock.lg_lock); list_del_rcu(&pa->pa_node.lg_list); spin_unlock(pa->pa_node_lock.lg_lock); } else { write_lock(pa->pa_node_lock.inode_lock); ei = EXT4_I(pa->pa_inode); rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); write_unlock(pa->pa_node_lock.inode_lock); } list_del(&pa->u.pa_tmp_list); if (pa->pa_type == MB_GROUP_PA) { ext4_mb_release_group_pa(&e4b, pa); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } else { ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); ext4_mb_pa_free(pa); } } ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); out_dbg: mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n", free, group, grp->bb_free); return free; } /* * releases all non-used preallocated blocks for given inode * * It's important to discard preallocations under i_data_sem * We don't want another block to be served from the prealloc * space when we are discarding the inode prealloc space. * * FIXME!! Make sure it is valid at all the call sites */ void ext4_discard_preallocations(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct super_block *sb = inode->i_sb; struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; ext4_group_t group = 0; LIST_HEAD(list); struct ext4_buddy e4b; struct rb_node *iter; int err; if (!S_ISREG(inode->i_mode)) return; if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) return; mb_debug(sb, "discard preallocation for inode %llu\n", inode->i_ino); trace_ext4_discard_preallocations(inode, atomic_read(&ei->i_prealloc_active)); repeat: /* first, collect all pa's in the inode */ write_lock(&ei->i_prealloc_lock); for (iter = rb_first(&ei->i_prealloc_node); iter; iter = rb_next(iter)) { pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); BUG_ON(pa->pa_node_lock.inode_lock != &ei->i_prealloc_lock); spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { /* this shouldn't happen often - nobody should * use preallocation while we're discarding it */ spin_unlock(&pa->pa_lock); write_unlock(&ei->i_prealloc_lock); ext4_msg(sb, KERN_ERR, "uh-oh! used pa while discarding"); WARN_ON(1); schedule_timeout_uninterruptible(HZ); goto repeat; } if (pa->pa_deleted == 0) { ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); list_add(&pa->u.pa_tmp_list, &list); continue; } /* someone is deleting pa right now */ spin_unlock(&pa->pa_lock); write_unlock(&ei->i_prealloc_lock); /* we have to wait here because pa_deleted * doesn't mean pa is already unlinked from * the list. 
as we might be called from * ->clear_inode() the inode will get freed * and concurrent thread which is unlinking * pa from inode's list may access already * freed memory, bad-bad-bad */ /* XXX: if this happens too often, we can * add a flag to force wait only in case * of ->clear_inode(), but not in case of * regular truncate */ schedule_timeout_uninterruptible(HZ); goto repeat; } write_unlock(&ei->i_prealloc_lock); list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { BUG_ON(pa->pa_type != MB_INODE_PA); group = ext4_get_group_number(sb, pa->pa_pstart); err = ext4_mb_load_buddy_gfp(sb, group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) { ext4_error_err(sb, -err, "Error %d loading buddy information for %u", err, group); continue; } bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); ext4_error_err(sb, -err, "Error %d reading block bitmap for %u", err, group); ext4_mb_unload_buddy(&e4b); continue; } ext4_lock_group(sb, group); list_del(&pa->pa_group_list); ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); list_del(&pa->u.pa_tmp_list); ext4_mb_pa_free(pa); } } static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa; BUG_ON(ext4_pspace_cachep == NULL); pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS); if (!pa) return -ENOMEM; atomic_set(&pa->pa_count, 1); ac->ac_pa = pa; return 0; } static void ext4_mb_pa_put_free(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa = ac->ac_pa; BUG_ON(!pa); ac->ac_pa = NULL; WARN_ON(!atomic_dec_and_test(&pa->pa_count)); /* * current function is only called due to an error or due to * len of found blocks < len of requested blocks hence the PA has not * been added to grp->bb_prealloc_list. 
So we don't need to lock it */ pa->pa_deleted = 1; ext4_mb_pa_free(pa); } #ifdef CONFIG_EXT4_DEBUG static inline void ext4_mb_show_pa(struct super_block *sb) { ext4_group_t i, ngroups; if (ext4_emergency_state(sb)) return; ngroups = ext4_get_groups_count(sb); mb_debug(sb, "groups: "); for (i = 0; i < ngroups; i++) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); struct ext4_prealloc_space *pa; ext4_grpblk_t start; struct list_head *cur; if (!grp) continue; ext4_lock_group(sb, i); list_for_each(cur, &grp->bb_prealloc_list) { pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); spin_lock(&pa->pa_lock); ext4_get_group_no_and_offset(sb, pa->pa_pstart, NULL, &start); spin_unlock(&pa->pa_lock); mb_debug(sb, "PA:%u:%d:%d\n", i, start, pa->pa_len); } ext4_unlock_group(sb, i); mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free, grp->bb_fragments); } } static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; if (ext4_emergency_state(sb)) return; mb_debug(sb, "Can't allocate:" " Allocation context details:"); mb_debug(sb, "status %u flags 0x%x", ac->ac_status, ac->ac_flags); mb_debug(sb, "orig %lu/%lu/%lu@%lu, " "goal %lu/%lu/%lu@%lu, " "best %lu/%lu/%lu@%lu cr %d", (unsigned long)ac->ac_o_ex.fe_group, (unsigned long)ac->ac_o_ex.fe_start, (unsigned long)ac->ac_o_ex.fe_len, (unsigned long)ac->ac_o_ex.fe_logical, (unsigned long)ac->ac_g_ex.fe_group, (unsigned long)ac->ac_g_ex.fe_start, (unsigned long)ac->ac_g_ex.fe_len, (unsigned long)ac->ac_g_ex.fe_logical, (unsigned long)ac->ac_b_ex.fe_group, (unsigned long)ac->ac_b_ex.fe_start, (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); mb_debug(sb, "%u found", ac->ac_found); mb_debug(sb, "used pa: %s, ", str_yes_no(ac->ac_pa)); if (ac->ac_pa) mb_debug(sb, "pa_type %s\n", ac->ac_pa->pa_type == MB_GROUP_PA ? "group pa" : "inode pa"); ext4_mb_show_pa(sb); } #else static inline void ext4_mb_show_pa(struct super_block *sb) { } static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) { ext4_mb_show_pa(ac->ac_sb); } #endif /* * We use locality group preallocation for small size file. The size of the * file is determined by the current size or the resulting size after * allocation which ever is larger * * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req */ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int bsbits = ac->ac_sb->s_blocksize_bits; loff_t size, isize; bool inode_pa_eligible, group_pa_eligible; if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return; if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) return; group_pa_eligible = sbi->s_mb_group_prealloc > 0; inode_pa_eligible = true; size = extent_logical_end(sbi, &ac->ac_o_ex); isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) >> bsbits; /* No point in using inode preallocation for closed files */ if ((size == isize) && !ext4_fs_is_busy(sbi) && !inode_is_open_for_write(ac->ac_inode)) inode_pa_eligible = false; size = max(size, isize); /* Don't use group allocation for large files */ if (size > sbi->s_mb_stream_request) group_pa_eligible = false; if (!group_pa_eligible) { if (inode_pa_eligible) ac->ac_flags |= EXT4_MB_STREAM_ALLOC; else ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; return; } BUG_ON(ac->ac_lg != NULL); /* * locality group prealloc space are per cpu. The reason for having * per cpu locality group is to reduce the contention between block * request from multiple CPUs. 
*/ ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups); /* we're going to use group allocation */ ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; /* serialize all allocations in the group */ mutex_lock(&ac->ac_lg->lg_mutex); } static noinline_for_stack void ext4_mb_initialize_context(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { struct super_block *sb = ar->inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; ext4_group_t group; unsigned int len; ext4_fsblk_t goal; ext4_grpblk_t block; /* we can't allocate > group size */ len = ar->len; /* just a dirty hack to filter too big requests */ if (len >= EXT4_CLUSTERS_PER_GROUP(sb)) len = EXT4_CLUSTERS_PER_GROUP(sb); /* start searching from the goal */ goal = ar->goal; if (goal < le32_to_cpu(es->s_first_data_block) || goal >= ext4_blocks_count(es)) goal = le32_to_cpu(es->s_first_data_block); ext4_get_group_no_and_offset(sb, goal, &group, &block); /* set up allocation goals */ ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical); ac->ac_status = AC_STATUS_CONTINUE; ac->ac_sb = sb; ac->ac_inode = ar->inode; ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical; ac->ac_o_ex.fe_group = group; ac->ac_o_ex.fe_start = block; ac->ac_o_ex.fe_len = len; ac->ac_g_ex = ac->ac_o_ex; ac->ac_orig_goal_len = ac->ac_g_ex.fe_len; ac->ac_flags = ar->flags; /* we have to define context: we'll work with a file or * locality group. this is a policy, actually */ ext4_mb_group_or_file(ac); mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, " "left: %u/%u, right %u/%u to %swritable\n", (unsigned) ar->len, (unsigned) ar->logical, (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, (unsigned) ar->lleft, (unsigned) ar->pleft, (unsigned) ar->lright, (unsigned) ar->pright, inode_is_open_for_write(ar->inode) ? "" : "non-"); } static noinline_for_stack void ext4_mb_discard_lg_preallocations(struct super_block *sb, struct ext4_locality_group *lg, int order, int total_entries) { ext4_group_t group = 0; struct ext4_buddy e4b; LIST_HEAD(discard_list); struct ext4_prealloc_space *pa, *tmp; mb_debug(sb, "discard locality group preallocation\n"); spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], pa_node.lg_list, lockdep_is_held(&lg->lg_prealloc_lock)) { spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { /* * This is the pa that we just used * for block allocation. So don't * free that */ spin_unlock(&pa->pa_lock); continue; } if (pa->pa_deleted) { spin_unlock(&pa->pa_lock); continue; } /* only lg prealloc space */ BUG_ON(pa->pa_type != MB_GROUP_PA); /* seems this one can be freed ... */ ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); list_del_rcu(&pa->pa_node.lg_list); list_add(&pa->u.pa_tmp_list, &discard_list); total_entries--; if (total_entries <= 5) { /* * we want to keep only 5 entries * allowing it to grow to 8. This * mak sure we don't call discard * soon for this list. 
*/ break; } } spin_unlock(&lg->lg_prealloc_lock); list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { int err; group = ext4_get_group_number(sb, pa->pa_pstart); err = ext4_mb_load_buddy_gfp(sb, group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) { ext4_error_err(sb, -err, "Error %d loading buddy information for %u", err, group); continue; } ext4_lock_group(sb, group); list_del(&pa->pa_group_list); ext4_mb_release_group_pa(&e4b, pa); ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } } /* * We have incremented pa_count. So it cannot be freed at this * point. Also we hold lg_mutex. So no parallel allocation is * possible from this lg. That means pa_free cannot be updated. * * A parallel ext4_mb_discard_group_preallocations is possible. * which can cause the lg_prealloc_list to be updated. */ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) { int order, added = 0, lg_prealloc_count = 1; struct super_block *sb = ac->ac_sb; struct ext4_locality_group *lg = ac->ac_lg; struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa; order = fls(pa->pa_free) - 1; if (order > PREALLOC_TB_SIZE - 1) /* The max size of hash table is PREALLOC_TB_SIZE */ order = PREALLOC_TB_SIZE - 1; /* Add the prealloc space to lg */ spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], pa_node.lg_list, lockdep_is_held(&lg->lg_prealloc_lock)) { spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted) { spin_unlock(&tmp_pa->pa_lock); continue; } if (!added && pa->pa_free < tmp_pa->pa_free) { /* Add to the tail of the previous entry */ list_add_tail_rcu(&pa->pa_node.lg_list, &tmp_pa->pa_node.lg_list); added = 1; /* * we want to count the total * number of entries in the list */ } spin_unlock(&tmp_pa->pa_lock); lg_prealloc_count++; } if (!added) list_add_tail_rcu(&pa->pa_node.lg_list, &lg->lg_prealloc_list[order]); spin_unlock(&lg->lg_prealloc_lock); /* Now trim the list to be not more than 8 elements */ if (lg_prealloc_count > 8) ext4_mb_discard_lg_preallocations(sb, lg, order, lg_prealloc_count); } /* * release all resource we used in allocation */ static void ext4_mb_release_context(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *pa = ac->ac_pa; if (pa) { if (pa->pa_type == MB_GROUP_PA) { /* see comment in ext4_mb_use_group_pa() */ spin_lock(&pa->pa_lock); pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); pa->pa_free -= ac->ac_b_ex.fe_len; pa->pa_len -= ac->ac_b_ex.fe_len; spin_unlock(&pa->pa_lock); /* * We want to add the pa to the right bucket. * Remove it from the list and while adding * make sure the list to which we are adding * doesn't grow big. 
*/ if (likely(pa->pa_free)) { spin_lock(pa->pa_node_lock.lg_lock); list_del_rcu(&pa->pa_node.lg_list); spin_unlock(pa->pa_node_lock.lg_lock); ext4_mb_add_n_trim(ac); } } ext4_mb_put_pa(ac, ac->ac_sb, pa); } if (ac->ac_bitmap_folio) folio_put(ac->ac_bitmap_folio); if (ac->ac_buddy_folio) folio_put(ac->ac_buddy_folio); if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) mutex_unlock(&ac->ac_lg->lg_mutex); ext4_mb_collect_stats(ac); } static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) { ext4_group_t i, ngroups = ext4_get_groups_count(sb); int ret; int freed = 0, busy = 0; int retry = 0; trace_ext4_mb_discard_preallocations(sb, needed); if (needed == 0) needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; repeat: for (i = 0; i < ngroups && needed > 0; i++) { ret = ext4_mb_discard_group_preallocations(sb, i, &busy); freed += ret; needed -= ret; cond_resched(); } if (needed > 0 && busy && ++retry < 3) { busy = 0; goto repeat; } return freed; } static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb, struct ext4_allocation_context *ac, u64 *seq) { int freed; u64 seq_retry = 0; bool ret = false; freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); if (freed) { ret = true; goto out_dbg; } seq_retry = ext4_get_discard_pa_seq_sum(); if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) { ac->ac_flags |= EXT4_MB_STRICT_CHECK; *seq = seq_retry; ret = true; } out_dbg: mb_debug(sb, "freed %d, retry ? %s\n", freed, str_yes_no(ret)); return ret; } /* * Simple allocator for Ext4 fast commit replay path. It searches for blocks * linearly starting at the goal block and also excludes the blocks which * are going to be in use after fast commit replay. */ static ext4_fsblk_t ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp) { struct buffer_head *bitmap_bh; struct super_block *sb = ar->inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group, nr; ext4_grpblk_t blkoff; ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_fsblk_t goal, block; struct ext4_super_block *es = sbi->s_es; goal = ar->goal; if (goal < le32_to_cpu(es->s_first_data_block) || goal >= ext4_blocks_count(es)) goal = le32_to_cpu(es->s_first_data_block); ar->len = 0; ext4_get_group_no_and_offset(sb, goal, &group, &blkoff); for (nr = ext4_get_groups_count(sb); nr > 0; nr--) { bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { *errp = PTR_ERR(bitmap_bh); pr_warn("Failed to read block bitmap\n"); return 0; } while (1) { i = mb_find_next_zero_bit(bitmap_bh->b_data, max, blkoff); if (i >= max) break; if (ext4_fc_replay_check_excluded(sb, ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i))) { blkoff = i + 1; } else break; } brelse(bitmap_bh); if (i < max) break; if (++group >= ext4_get_groups_count(sb)) group = 0; blkoff = 0; } if (i >= max) { *errp = -ENOSPC; return 0; } block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i); ext4_mb_mark_bb(sb, block, 1, true); ar->len = 1; *errp = 0; return block; } /* * Main entry point into mballoc to allocate blocks * it tries to use preallocation first, then falls back * to usual allocation */ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, struct ext4_allocation_request *ar, int *errp) { struct ext4_allocation_context *ac = NULL; struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block = 0; unsigned int inquota = 0; unsigned int reserv_clstrs = 0; int retries = 0; u64 seq; might_sleep(); sb = ar->inode->i_sb; sbi = EXT4_SB(sb); 
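	/*
	 * In fast-commit replay the simple linear allocator below is used
	 * instead of the regular mballoc path; the cluster and quota
	 * reservations that follow are only taken for non-delalloc requests.
	 */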
trace_ext4_request_blocks(ar); if (sbi->s_mount_state & EXT4_FC_REPLAY) return ext4_mb_new_blocks_simple(ar, errp); /* Allow to use superuser reservation for quota file */ if (ext4_is_quota_file(ar->inode)) ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) { /* Without delayed allocation we need to verify * there is enough free blocks to do block allocation * and verify allocation doesn't exceed the quota limits. */ while (ar->len && ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { /* let others to free the space */ cond_resched(); ar->len = ar->len >> 1; } if (!ar->len) { ext4_mb_show_pa(sb); *errp = -ENOSPC; return 0; } reserv_clstrs = ar->len; if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { dquot_alloc_block_nofail(ar->inode, EXT4_C2B(sbi, ar->len)); } else { while (ar->len && dquot_alloc_block(ar->inode, EXT4_C2B(sbi, ar->len))) { ar->flags |= EXT4_MB_HINT_NOPREALLOC; ar->len--; } } inquota = ar->len; if (ar->len == 0) { *errp = -EDQUOT; goto out; } } ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS); if (!ac) { ar->len = 0; *errp = -ENOMEM; goto out; } ext4_mb_initialize_context(ac, ar); ac->ac_op = EXT4_MB_HISTORY_PREALLOC; seq = this_cpu_read(discard_pa_seq); if (!ext4_mb_use_preallocated(ac)) { ac->ac_op = EXT4_MB_HISTORY_ALLOC; ext4_mb_normalize_request(ac, ar); *errp = ext4_mb_pa_alloc(ac); if (*errp) goto errout; repeat: /* allocate space in core */ *errp = ext4_mb_regular_allocator(ac); /* * pa allocated above is added to grp->bb_prealloc_list only * when we were able to allocate some block i.e. when * ac->ac_status == AC_STATUS_FOUND. * And error from above mean ac->ac_status != AC_STATUS_FOUND * So we have to free this pa here itself. */ if (*errp) { ext4_mb_pa_put_free(ac); ext4_discard_allocated_blocks(ac); goto errout; } if (ac->ac_status == AC_STATUS_FOUND && ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len) ext4_mb_pa_put_free(ac); } if (likely(ac->ac_status == AC_STATUS_FOUND)) { *errp = ext4_mb_mark_diskspace_used(ac, handle); if (*errp) { ext4_discard_allocated_blocks(ac); goto errout; } else { block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); ar->len = ac->ac_b_ex.fe_len; } } else { if (++retries < 3 && ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) goto repeat; /* * If block allocation fails then the pa allocated above * needs to be freed here itself. */ ext4_mb_pa_put_free(ac); *errp = -ENOSPC; } if (*errp) { errout: ac->ac_b_ex.fe_len = 0; ar->len = 0; ext4_mb_show_ac(ac); } ext4_mb_release_context(ac); kmem_cache_free(ext4_ac_cachep, ac); out: if (inquota && ar->len < inquota) dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); /* release any reserved blocks */ if (reserv_clstrs) percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs); trace_ext4_allocate_blocks(ar, (unsigned long long)block); return block; } /* * We can merge two free data extents only if the physical blocks * are contiguous, AND the extents were freed by the same transaction, * AND the blocks are associated with the same group. 
*/ static inline bool ext4_freed_extents_can_be_merged(struct ext4_free_data *entry1, struct ext4_free_data *entry2) { if (entry1->efd_tid != entry2->efd_tid) return false; if (entry1->efd_start_cluster + entry1->efd_count != entry2->efd_start_cluster) return false; if (WARN_ON_ONCE(entry1->efd_group != entry2->efd_group)) return false; return true; } static inline void ext4_merge_freed_extents(struct ext4_sb_info *sbi, struct rb_root *root, struct ext4_free_data *entry1, struct ext4_free_data *entry2) { entry1->efd_count += entry2->efd_count; spin_lock(&sbi->s_md_lock); list_del(&entry2->efd_list); spin_unlock(&sbi->s_md_lock); rb_erase(&entry2->efd_node, root); kmem_cache_free(ext4_free_data_cachep, entry2); } static inline void ext4_try_merge_freed_extent_prev(struct ext4_sb_info *sbi, struct rb_root *root, struct ext4_free_data *entry) { struct ext4_free_data *prev; struct rb_node *node; node = rb_prev(&entry->efd_node); if (!node) return; prev = rb_entry(node, struct ext4_free_data, efd_node); if (ext4_freed_extents_can_be_merged(prev, entry)) ext4_merge_freed_extents(sbi, root, prev, entry); } static inline void ext4_try_merge_freed_extent_next(struct ext4_sb_info *sbi, struct rb_root *root, struct ext4_free_data *entry) { struct ext4_free_data *next; struct rb_node *node; node = rb_next(&entry->efd_node); if (!node) return; next = rb_entry(node, struct ext4_free_data, efd_node); if (ext4_freed_extents_can_be_merged(entry, next)) ext4_merge_freed_extents(sbi, root, entry, next); } static noinline_for_stack void ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct ext4_free_data *new_entry) { ext4_group_t group = e4b->bd_group; ext4_grpblk_t cluster; ext4_grpblk_t clusters = new_entry->efd_count; struct ext4_free_data *entry = NULL; struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct rb_root *root = &db->bb_free_root; struct rb_node **n = &root->rb_node; struct rb_node *parent = NULL, *new_node; BUG_ON(!ext4_handle_valid(handle)); BUG_ON(e4b->bd_bitmap_folio == NULL); BUG_ON(e4b->bd_buddy_folio == NULL); new_node = &new_entry->efd_node; cluster = new_entry->efd_start_cluster; if (!*n) { /* first free block exent. 
We need to protect buddy cache from being freed, * otherwise we'll refresh it from * on-disk bitmap and lose not-yet-available * blocks */ folio_get(e4b->bd_buddy_folio); folio_get(e4b->bd_bitmap_folio); } while (*n) { parent = *n; entry = rb_entry(parent, struct ext4_free_data, efd_node); if (cluster < entry->efd_start_cluster) n = &(*n)->rb_left; else if (cluster >= (entry->efd_start_cluster + entry->efd_count)) n = &(*n)->rb_right; else { ext4_grp_locked_error(sb, group, 0, ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, cluster), "Block already on to-be-freed list"); kmem_cache_free(ext4_free_data_cachep, new_entry); return; } } atomic_add(clusters, &sbi->s_mb_free_pending); if (!entry) goto insert; /* Now try to see the extent can be merged to prev and next */ if (ext4_freed_extents_can_be_merged(new_entry, entry)) { entry->efd_start_cluster = cluster; entry->efd_count += new_entry->efd_count; kmem_cache_free(ext4_free_data_cachep, new_entry); ext4_try_merge_freed_extent_prev(sbi, root, entry); return; } if (ext4_freed_extents_can_be_merged(entry, new_entry)) { entry->efd_count += new_entry->efd_count; kmem_cache_free(ext4_free_data_cachep, new_entry); ext4_try_merge_freed_extent_next(sbi, root, entry); return; } insert: rb_link_node(new_node, parent, n); rb_insert_color(new_node, root); spin_lock(&sbi->s_md_lock); list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list[new_entry->efd_tid & 1]); spin_unlock(&sbi->s_md_lock); } static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block, unsigned long count) { struct super_block *sb = inode->i_sb; ext4_group_t group; ext4_grpblk_t blkoff; ext4_get_group_no_and_offset(sb, block, &group, &blkoff); ext4_mb_mark_context(NULL, sb, false, group, blkoff, count, EXT4_MB_BITMAP_MARKED_CHECK | EXT4_MB_SYNC_UPDATE, NULL); } /** * ext4_mb_clear_bb() -- helper function for freeing blocks. * Used by ext4_free_blocks() * @handle: handle for this transaction * @inode: inode * @block: starting physical block to be freed * @count: number of blocks to be freed * @flags: flags used by ext4_free_blocks */ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode, ext4_fsblk_t block, unsigned long count, int flags) { struct super_block *sb = inode->i_sb; struct ext4_group_info *grp; unsigned int overflow; ext4_grpblk_t bit; ext4_group_t block_group; struct ext4_sb_info *sbi; struct ext4_buddy e4b; unsigned int count_clusters; int err = 0; int mark_flags = 0; ext4_grpblk_t changed; sbi = EXT4_SB(sb); if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); /* err = 0. ext4_std_error should be a no op */ goto error_out; } flags |= EXT4_FREE_BLOCKS_VALIDATED; do_more: overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); grp = ext4_get_group_info(sb, block_group); if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) return; /* * Check to see if we are freeing blocks across a group * boundary. */ if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) { overflow = EXT4_C2B(sbi, bit) + count - EXT4_BLOCKS_PER_GROUP(sb); count -= overflow; /* The range changed so it's no longer validated */ flags &= ~EXT4_FREE_BLOCKS_VALIDATED; } count_clusters = EXT4_NUM_B2C(sbi, count); trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. 
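	 * ext4_free_blocks() is not allowed to fail, so loading the buddy
	 * here must not fail due to memory pressure either.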
*/ err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) goto error_out; if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); /* err = 0. ext4_std_error should be a no op */ goto error_clean; } #ifdef AGGRESSIVE_CHECK mark_flags |= EXT4_MB_BITMAP_MARKED_CHECK; #endif err = ext4_mb_mark_context(handle, sb, false, block_group, bit, count_clusters, mark_flags, &changed); if (err && changed == 0) goto error_clean; #ifdef AGGRESSIVE_CHECK BUG_ON(changed != count_clusters); #endif /* * We need to make sure we don't reuse the freed block until after the * transaction is committed. We make an exception if the inode is to be * written in writeback mode since writeback mode has weak data * consistency guarantees. */ if (ext4_handle_valid(handle) && ((flags & EXT4_FREE_BLOCKS_METADATA) || !ext4_should_writeback_data(inode))) { struct ext4_free_data *new_entry; /* * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed * to fail. */ new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS|__GFP_NOFAIL); new_entry->efd_start_cluster = bit; new_entry->efd_group = block_group; new_entry->efd_count = count_clusters; new_entry->efd_tid = handle->h_transaction->t_tid; ext4_lock_group(sb, block_group); ext4_mb_free_metadata(handle, &e4b, new_entry); } else { if (test_opt(sb, DISCARD)) { err = ext4_issue_discard(sb, block_group, bit, count_clusters); /* * Ignore EOPNOTSUPP error. This is consistent with * what happens when using journal. */ if (err == -EOPNOTSUPP) err = 0; if (err) ext4_msg(sb, KERN_WARNING, "discard request in" " group:%u block:%d count:%lu failed" " with %d", block_group, bit, count, err); } EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); ext4_lock_group(sb, block_group); mb_free_blocks(inode, &e4b, bit, count_clusters); } ext4_unlock_group(sb, block_group); /* * on a bigalloc file system, defer the s_freeclusters_counter * update to the caller (ext4_remove_space and friends) so they * can determine if a cluster freed here should be rereserved */ if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) { if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); } if (overflow && !err) { block += count; count = overflow; ext4_mb_unload_buddy(&e4b); /* The range changed so it's no longer validated */ flags &= ~EXT4_FREE_BLOCKS_VALIDATED; goto do_more; } error_clean: ext4_mb_unload_buddy(&e4b); error_out: ext4_std_error(sb, err); } /** * ext4_free_blocks() -- Free given blocks and update quota * @handle: handle for this transaction * @inode: inode * @bh: optional buffer of the block to be freed * @block: starting physical block to be freed * @count: number of blocks to be freed * @flags: flags used by ext4_free_blocks */ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block, unsigned long count, int flags) { struct super_block *sb = inode->i_sb; unsigned int overflow; struct ext4_sb_info *sbi; sbi = EXT4_SB(sb); if (bh) { if (block) BUG_ON(block != bh->b_blocknr); else block = bh->b_blocknr; } if (sbi->s_mount_state & EXT4_FC_REPLAY) { ext4_free_blocks_simple(inode, block, EXT4_NUM_B2C(sbi, count)); return; } might_sleep(); if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks not in datazone - " "block = 
%llu, count = %lu", block, count); return; } flags |= EXT4_FREE_BLOCKS_VALIDATED; ext4_debug("freeing block %llu\n", block); trace_ext4_free_blocks(inode, block, count, flags); if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { BUG_ON(count > 1); ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, inode, bh, block); } /* * If the extent to be freed does not begin on a cluster * boundary, we need to deal with partial clusters at the * beginning and end of the extent. Normally we will free * blocks at the beginning or the end unless we are explicitly * requested to avoid doing so. */ overflow = EXT4_PBLK_COFF(sbi, block); if (overflow) { if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { overflow = sbi->s_cluster_ratio - overflow; block += overflow; if (count > overflow) count -= overflow; else return; } else { block -= overflow; count += overflow; } /* The range changed so it's no longer validated */ flags &= ~EXT4_FREE_BLOCKS_VALIDATED; } overflow = EXT4_LBLK_COFF(sbi, count); if (overflow) { if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { if (count > overflow) count -= overflow; else return; } else count += sbi->s_cluster_ratio - overflow; /* The range changed so it's no longer validated */ flags &= ~EXT4_FREE_BLOCKS_VALIDATED; } if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { int i; int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA; for (i = 0; i < count; i++) { cond_resched(); if (is_metadata) bh = sb_find_get_block_nonatomic(inode->i_sb, block + i); ext4_forget(handle, is_metadata, inode, bh, block + i); } } ext4_mb_clear_bb(handle, inode, block, count, flags); } /** * ext4_group_add_blocks() -- Add given blocks to an existing group * @handle: handle to this transaction * @sb: super block * @block: start physical block to add to the block group * @count: number of blocks to free * * This marks the blocks as free in the bitmap and buddy. */ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count) { ext4_group_t block_group; ext4_grpblk_t bit; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_buddy e4b; int err = 0; ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block); ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1); unsigned long cluster_count = last_cluster - first_cluster + 1; ext4_grpblk_t changed; ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); if (cluster_count == 0) return 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); /* * Check to see if we are freeing blocks across a group * boundary. 
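	 * Unlike ext4_mb_clear_bb(), such a range is not split here; it is
	 * rejected with -EINVAL.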
*/ if (bit + cluster_count > EXT4_CLUSTERS_PER_GROUP(sb)) { ext4_warning(sb, "too many blocks added to group %u", block_group); err = -EINVAL; goto error_out; } err = ext4_mb_load_buddy(sb, block_group, &e4b); if (err) goto error_out; if (!ext4_sb_block_valid(sb, NULL, block, count)) { ext4_error(sb, "Adding blocks in system zones - " "Block = %llu, count = %lu", block, count); err = -EINVAL; goto error_clean; } err = ext4_mb_mark_context(handle, sb, false, block_group, bit, cluster_count, EXT4_MB_BITMAP_MARKED_CHECK, &changed); if (err && changed == 0) goto error_clean; if (changed != cluster_count) ext4_error(sb, "bit already cleared in group %u", block_group); ext4_lock_group(sb, block_group); mb_free_blocks(NULL, &e4b, bit, cluster_count); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, changed); error_clean: ext4_mb_unload_buddy(&e4b); error_out: ext4_std_error(sb, err); return err; } /** * ext4_trim_extent -- function to TRIM one single free extent in the group * @sb: super block for the file system * @start: starting block of the free extent in the alloc. group * @count: number of blocks to TRIM * @e4b: ext4 buddy for the group * * Trim "count" blocks starting at "start" in the "group". To assure that no * one will allocate those blocks, mark it as used in buddy bitmap. This must * be called under the group lock. */ static int ext4_trim_extent(struct super_block *sb, int start, int count, struct ext4_buddy *e4b) __releases(bitlock) __acquires(bitlock) { struct ext4_free_extent ex; ext4_group_t group = e4b->bd_group; int ret = 0; trace_ext4_trim_extent(sb, group, start, count); assert_spin_locked(ext4_group_lock_ptr(sb, group)); ex.fe_start = start; ex.fe_group = group; ex.fe_len = count; /* * Mark blocks used, so no one can reuse them while * being trimmed.
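	 * The group lock is dropped while the discard is issued, and the
	 * extent is freed back into the buddy afterwards.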
*/ mb_mark_used(e4b, &ex); ext4_unlock_group(sb, group); ret = ext4_issue_discard(sb, group, start, count); ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); return ret; } static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb, ext4_group_t grp) { unsigned long nr_clusters_in_group; if (grp < (ext4_get_groups_count(sb) - 1)) nr_clusters_in_group = EXT4_CLUSTERS_PER_GROUP(sb); else nr_clusters_in_group = (ext4_blocks_count(EXT4_SB(sb)->s_es) - ext4_group_first_block_no(sb, grp)) >> EXT4_CLUSTER_BITS(sb); return nr_clusters_in_group - 1; } static bool ext4_trim_interrupted(void) { return fatal_signal_pending(current) || freezing(current); } static int ext4_try_to_trim_range(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) __acquires(ext4_group_lock_ptr(sb, e4b->bd_group)) __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) { ext4_grpblk_t next, count, free_count, last, origin_start; bool set_trimmed = false; void *bitmap; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) return 0; last = ext4_last_grp_cluster(sb, e4b->bd_group); bitmap = e4b->bd_bitmap; if (start == 0 && max >= last) set_trimmed = true; origin_start = start; start = max(e4b->bd_info->bb_first_free, start); count = 0; free_count = 0; while (start <= max) { start = mb_find_next_zero_bit(bitmap, max + 1, start); if (start > max) break; next = mb_find_next_bit(bitmap, last + 1, start); if (origin_start == 0 && next >= last) set_trimmed = true; if ((next - start) >= minblocks) { int ret = ext4_trim_extent(sb, start, next - start, e4b); if (ret && ret != -EOPNOTSUPP) return count; count += next - start; } free_count += next - start; start = next + 1; if (ext4_trim_interrupted()) return count; if (need_resched()) { ext4_unlock_group(sb, e4b->bd_group); cond_resched(); ext4_lock_group(sb, e4b->bd_group); } if ((e4b->bd_info->bb_free - free_count) < minblocks) break; } if (set_trimmed) EXT4_MB_GRP_SET_TRIMMED(e4b->bd_info); return count; } /** * ext4_trim_all_free -- function to trim all free space in alloc. group * @sb: super block for file system * @group: group to be trimmed * @start: first group block to examine * @max: last group block to examine * @minblocks: minimum extent block count * * ext4_trim_all_free walks through group's block bitmap searching for free * extents. When the free extent is found, mark it as used in group buddy * bitmap. Then issue a TRIM command on this extent and free the extent in * the group buddy bitmap. 
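 * Returns the number of clusters trimmed, or a negative error code.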
*/ static ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) { struct ext4_buddy e4b; int ret; trace_ext4_trim_all_free(sb, group, start, max); ret = ext4_mb_load_buddy(sb, group, &e4b); if (ret) { ext4_warning(sb, "Error %d loading buddy information for %u", ret, group); return ret; } ext4_lock_group(sb, group); if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) || minblocks < EXT4_SB(sb)->s_last_trim_minblks) ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks); else ret = 0; ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); ext4_debug("trimmed %d blocks in the group %d\n", ret, group); return ret; } /** * ext4_trim_fs() -- trim ioctl handle function * @sb: superblock for filesystem * @range: fstrim_range structure * * start: First Byte to trim * len: number of Bytes to trim from start * minlen: minimum extent length in Bytes * ext4_trim_fs goes through all allocation groups containing Bytes from * start to start+len. For each such a group ext4_trim_all_free function * is invoked to trim all free space. */ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) { unsigned int discard_granularity = bdev_discard_granularity(sb->s_bdev); struct ext4_group_info *grp; ext4_group_t group, first_group, last_group; ext4_grpblk_t cnt = 0, first_cluster, last_cluster; uint64_t start, end, minlen, trimmed = 0; ext4_fsblk_t first_data_blk = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); int ret = 0; start = range->start >> sb->s_blocksize_bits; end = start + (range->len >> sb->s_blocksize_bits) - 1; minlen = EXT4_NUM_B2C(EXT4_SB(sb), range->minlen >> sb->s_blocksize_bits); if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) || start >= max_blks || range->len < sb->s_blocksize) return -EINVAL; /* No point to try to trim less than discard granularity */ if (range->minlen < discard_granularity) { minlen = EXT4_NUM_B2C(EXT4_SB(sb), discard_granularity >> sb->s_blocksize_bits); if (minlen > EXT4_CLUSTERS_PER_GROUP(sb)) goto out; } if (end >= max_blks - 1) end = max_blks - 1; if (end <= first_data_blk) goto out; if (start < first_data_blk) start = first_data_blk; /* Determine first and last group to examine based on start and end */ ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, &first_group, &first_cluster); ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end, &last_group, &last_cluster); /* end now represents the last cluster to discard in this group */ end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; for (group = first_group; group <= last_group; group++) { if (ext4_trim_interrupted()) break; grp = ext4_get_group_info(sb, group); if (!grp) continue; /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { ret = ext4_mb_init_group(sb, group, GFP_NOFS); if (ret) break; } /* * For all the groups except the last one, last cluster will * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to * change it for the last group, note that last_cluster is * already computed earlier by ext4_get_group_no_and_offset() */ if (group == last_group) end = last_cluster; if (grp->bb_free >= minlen) { cnt = ext4_trim_all_free(sb, group, first_cluster, end, minlen); if (cnt < 0) { ret = cnt; break; } trimmed += cnt; } /* * For every group except the first one, we are sure * that the first cluster to discard will be cluster #0. 
*/ first_cluster = 0; } if (!ret) EXT4_SB(sb)->s_last_trim_minblks = minlen; out: range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits; return ret; } /* Iterate all the free extents in the group. */ int ext4_mballoc_query_range( struct super_block *sb, ext4_group_t group, ext4_grpblk_t first, ext4_grpblk_t end, ext4_mballoc_query_range_fn meta_formatter, ext4_mballoc_query_range_fn formatter, void *priv) { void *bitmap; ext4_grpblk_t start, next; struct ext4_buddy e4b; int error; error = ext4_mb_load_buddy(sb, group, &e4b); if (error) return error; bitmap = e4b.bd_bitmap; ext4_lock_group(sb, group); start = max(e4b.bd_info->bb_first_free, first); if (end >= EXT4_CLUSTERS_PER_GROUP(sb)) end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; if (meta_formatter && start != first) { if (start > end) start = end; ext4_unlock_group(sb, group); error = meta_formatter(sb, group, first, start - first, priv); if (error) goto out_unload; ext4_lock_group(sb, group); } while (start <= end) { start = mb_find_next_zero_bit(bitmap, end + 1, start); if (start > end) break; next = mb_find_next_bit(bitmap, end + 1, start); ext4_unlock_group(sb, group); error = formatter(sb, group, start, next - start, priv); if (error) goto out_unload; ext4_lock_group(sb, group); start = next + 1; } ext4_unlock_group(sb, group); out_unload: ext4_mb_unload_buddy(&e4b); return error; } #if IS_ENABLED(CONFIG_EXT4_KUNIT_TESTS) void mb_clear_bits_test(void *bm, int cur, int len) { mb_clear_bits(bm, cur, len); } EXPORT_SYMBOL_FOR_EXT4_TEST(mb_clear_bits_test); ext4_fsblk_t ext4_mb_new_blocks_simple_test(struct ext4_allocation_request *ar, int *errp) { return ext4_mb_new_blocks_simple(ar, errp); } EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_new_blocks_simple_test); int mb_find_next_zero_bit_test(void *addr, int max, int start) { return mb_find_next_zero_bit(addr, max, start); } EXPORT_SYMBOL_FOR_EXT4_TEST(mb_find_next_zero_bit_test); int mb_find_next_bit_test(void *addr, int max, int start) { return mb_find_next_bit(addr, max, start); } EXPORT_SYMBOL_FOR_EXT4_TEST(mb_find_next_bit_test); void mb_clear_bit_test(int bit, void *addr) { mb_clear_bit(bit, addr); } EXPORT_SYMBOL_FOR_EXT4_TEST(mb_clear_bit_test); int mb_test_bit_test(int bit, void *addr) { return mb_test_bit(bit, addr); } EXPORT_SYMBOL_FOR_EXT4_TEST(mb_test_bit_test); int ext4_mb_mark_diskspace_used_test(struct ext4_allocation_context *ac, handle_t *handle) { return ext4_mb_mark_diskspace_used(ac, handle); } EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_mark_diskspace_used_test); int mb_mark_used_test(struct ext4_buddy *e4b, struct ext4_free_extent *ex) { return mb_mark_used(e4b, ex); } EXPORT_SYMBOL_FOR_EXT4_TEST(mb_mark_used_test); void ext4_mb_generate_buddy_test(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group, struct ext4_group_info *grp) { ext4_mb_generate_buddy(sb, buddy, bitmap, group, grp); } EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_generate_buddy_test); int ext4_mb_load_buddy_test(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b) { return ext4_mb_load_buddy(sb, group, e4b); } EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_load_buddy_test); void ext4_mb_unload_buddy_test(struct ext4_buddy *e4b) { ext4_mb_unload_buddy(e4b); } EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_unload_buddy_test); void mb_free_blocks_test(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { mb_free_blocks(inode, e4b, first, count); } EXPORT_SYMBOL_FOR_EXT4_TEST(mb_free_blocks_test); void ext4_free_blocks_simple_test(struct inode *inode, ext4_fsblk_t block, unsigned long count) 
{
	return ext4_free_blocks_simple(inode, block, count);
}
EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_free_blocks_simple_test);
EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_wait_block_bitmap);
EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_init);
EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_get_group_desc);
EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_count_free_clusters);
EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_get_group_info);
EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_free_group_clusters_set);
EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_release);
EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_read_block_bitmap_nowait);
EXPORT_SYMBOL_FOR_EXT4_TEST(mb_set_bits);
EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_fc_init_inode);
EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_mark_context);
#endif
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/hfs/bnode.c
 *
 * Copyright (C) 2001
 * Brad Boyer (flar@allandria.com)
 * (C) 2003 Ardis Technologies <roman@ardistech.com>
 *
 * Handle basic btree node operations
 */

#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/swap.h>

#include "btree.h"

static inline bool is_bnode_offset_valid(struct hfs_bnode *node, u32 off)
{
	bool is_valid = off < node->tree->node_size;

	if (!is_valid) {
		pr_err("requested invalid offset: "
		       "NODE: id %u, type %#x, height %u, "
		       "node_size %u, offset %u\n",
		       node->this, node->type, node->height,
		       node->tree->node_size, off);
	}

	return is_valid;
}

static inline u32 check_and_correct_requested_length(struct hfs_bnode *node, u32 off, u32 len)
{
	unsigned int node_size;

	if (!is_bnode_offset_valid(node, off))
		return 0;

	node_size = node->tree->node_size;

	if ((off + len) > node_size) {
		u32 new_len = node_size - off;

		pr_err("requested length has been corrected: "
		       "NODE: id %u, type %#x, height %u, "
		       "node_size %u, offset %u, "
		       "requested_len %u, corrected_len %u\n",
		       node->this, node->type, node->height,
		       node->tree->node_size, off, len, new_len);

		return new_len;
	}

	return len;
}

void hfs_bnode_read(struct hfs_bnode *node, void *buf, u32 off, u32 len)
{
	struct page *page;
	u32 pagenum;
	u32
bytes_read; u32 bytes_to_read; if (!is_bnode_offset_valid(node, off)) return; if (len == 0) { pr_err("requested zero length: " "NODE: id %u, type %#x, height %u, " "node_size %u, offset %u, len %u\n", node->this, node->type, node->height, node->tree->node_size, off, len); return; } len = check_and_correct_requested_length(node, off, len); off += node->page_offset; pagenum = off >> PAGE_SHIFT; off &= ~PAGE_MASK; /* compute page offset for the first page */ for (bytes_read = 0; bytes_read < len; bytes_read += bytes_to_read) { if (pagenum >= node->tree->pages_per_bnode) break; page = node->page[pagenum]; bytes_to_read = min_t(u32, len - bytes_read, PAGE_SIZE - off); memcpy_from_page(buf + bytes_read, page, off, bytes_to_read); pagenum++; off = 0; /* page offset only applies to the first page */ } } u16 hfs_bnode_read_u16(struct hfs_bnode *node, u32 off) { __be16 data; // optimize later... hfs_bnode_read(node, &data, off, 2); return be16_to_cpu(data); } u8 hfs_bnode_read_u8(struct hfs_bnode *node, u32 off) { u8 data; // optimize later... hfs_bnode_read(node, &data, off, 1); return data; } void hfs_bnode_read_key(struct hfs_bnode *node, void *key, u32 off) { struct hfs_btree *tree; u32 key_len; tree = node->tree; if (node->type == HFS_NODE_LEAF || tree->attributes & HFS_TREE_VARIDXKEYS) key_len = hfs_bnode_read_u8(node, off) + 1; else key_len = tree->max_key_len + 1; if (key_len > sizeof(hfs_btree_key) || key_len < 1) { memset(key, 0, sizeof(hfs_btree_key)); pr_err("hfs: Invalid key length: %u\n", key_len); return; } hfs_bnode_read(node, key, off, key_len); } void hfs_bnode_write(struct hfs_bnode *node, void *buf, u32 off, u32 len) { struct page *page; if (!is_bnode_offset_valid(node, off)) return; if (len == 0) { pr_err("requested zero length: " "NODE: id %u, type %#x, height %u, " "node_size %u, offset %u, len %u\n", node->this, node->type, node->height, node->tree->node_size, off, len); return; } len = check_and_correct_requested_length(node, off, len); off += node->page_offset; page = node->page[0]; memcpy_to_page(page, off, buf, len); set_page_dirty(page); } void hfs_bnode_write_u16(struct hfs_bnode *node, u32 off, u16 data) { __be16 v = cpu_to_be16(data); // optimize later... hfs_bnode_write(node, &v, off, 2); } void hfs_bnode_write_u8(struct hfs_bnode *node, u32 off, u8 data) { // optimize later... 
hfs_bnode_write(node, &data, off, 1); } void hfs_bnode_clear(struct hfs_bnode *node, u32 off, u32 len) { struct page *page; if (!is_bnode_offset_valid(node, off)) return; if (len == 0) { pr_err("requested zero length: " "NODE: id %u, type %#x, height %u, " "node_size %u, offset %u, len %u\n", node->this, node->type, node->height, node->tree->node_size, off, len); return; } len = check_and_correct_requested_length(node, off, len); off += node->page_offset; page = node->page[0]; memzero_page(page, off, len); set_page_dirty(page); } void hfs_bnode_copy(struct hfs_bnode *dst_node, u32 dst, struct hfs_bnode *src_node, u32 src, u32 len) { struct page *src_page, *dst_page; hfs_dbg("dst %u, src %u, len %u\n", dst, src, len); if (!len) return; len = check_and_correct_requested_length(src_node, src, len); len = check_and_correct_requested_length(dst_node, dst, len); src += src_node->page_offset; dst += dst_node->page_offset; src_page = src_node->page[0]; dst_page = dst_node->page[0]; memcpy_page(dst_page, dst, src_page, src, len); set_page_dirty(dst_page); } void hfs_bnode_move(struct hfs_bnode *node, u32 dst, u32 src, u32 len) { struct page *page; void *ptr; hfs_dbg("dst %u, src %u, len %u\n", dst, src, len); if (!len) return; len = check_and_correct_requested_length(node, src, len); len = check_and_correct_requested_length(node, dst, len); src += node->page_offset; dst += node->page_offset; page = node->page[0]; ptr = kmap_local_page(page); memmove(ptr + dst, ptr + src, len); kunmap_local(ptr); set_page_dirty(page); } void hfs_bnode_dump(struct hfs_bnode *node) { struct hfs_bnode_desc desc; __be32 cnid; int i, off, key_off; hfs_dbg("node %d\n", node->this); hfs_bnode_read(node, &desc, 0, sizeof(desc)); hfs_dbg("next %d, prev %d, type %d, height %d, num_recs %d\n", be32_to_cpu(desc.next), be32_to_cpu(desc.prev), desc.type, desc.height, be16_to_cpu(desc.num_recs)); off = node->tree->node_size - 2; for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) { key_off = hfs_bnode_read_u16(node, off); hfs_dbg(" key_off %d", key_off); if (i && node->type == HFS_NODE_INDEX) { int tmp; if (node->tree->attributes & HFS_TREE_VARIDXKEYS) tmp = (hfs_bnode_read_u8(node, key_off) | 1) + 1; else tmp = node->tree->max_key_len + 1; hfs_dbg(" (%d,%d", tmp, hfs_bnode_read_u8(node, key_off)); hfs_bnode_read(node, &cnid, key_off + tmp, 4); hfs_dbg(", cnid %d)", be32_to_cpu(cnid)); } else if (i && node->type == HFS_NODE_LEAF) { int tmp; tmp = hfs_bnode_read_u8(node, key_off); hfs_dbg(" (%d)", tmp); } } hfs_dbg("\n"); } void hfs_bnode_unlink(struct hfs_bnode *node) { struct hfs_btree *tree; struct hfs_bnode *tmp; __be32 cnid; tree = node->tree; if (node->prev) { tmp = hfs_bnode_find(tree, node->prev); if (IS_ERR(tmp)) return; tmp->next = node->next; cnid = cpu_to_be32(tmp->next); hfs_bnode_write(tmp, &cnid, offsetof(struct hfs_bnode_desc, next), 4); hfs_bnode_put(tmp); } else if (node->type == HFS_NODE_LEAF) tree->leaf_head = node->next; if (node->next) { tmp = hfs_bnode_find(tree, node->next); if (IS_ERR(tmp)) return; tmp->prev = node->prev; cnid = cpu_to_be32(tmp->prev); hfs_bnode_write(tmp, &cnid, offsetof(struct hfs_bnode_desc, prev), 4); hfs_bnode_put(tmp); } else if (node->type == HFS_NODE_LEAF) tree->leaf_tail = node->prev; // move down? 
if (!node->prev && !node->next) { printk(KERN_DEBUG "hfs_btree_del_level\n"); } if (!node->parent) { tree->root = 0; tree->depth = 0; } set_bit(HFS_BNODE_DELETED, &node->flags); } static inline int hfs_bnode_hash(u32 num) { num = (num >> 16) + num; num += num >> 8; return num & (NODE_HASH_SIZE - 1); } struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) { struct hfs_bnode *node; if (cnid >= tree->node_count) { pr_err("request for non-existent node %d in B*Tree\n", cnid); return NULL; } for (node = tree->node_hash[hfs_bnode_hash(cnid)]; node; node = node->next_hash) { if (node->this == cnid) { return node; } } return NULL; } static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) { struct hfs_bnode *node, *node2; struct address_space *mapping; struct page *page; int size, block, i, hash; loff_t off; if (cnid >= tree->node_count) { pr_err("request for non-existent node %d in B*Tree\n", cnid); return NULL; } size = sizeof(struct hfs_bnode) + tree->pages_per_bnode * sizeof(struct page *); node = kzalloc(size, GFP_KERNEL); if (!node) return NULL; node->tree = tree; node->this = cnid; set_bit(HFS_BNODE_NEW, &node->flags); atomic_set(&node->refcnt, 1); hfs_dbg("cnid %d, node %d, refcnt 1\n", node->tree->cnid, node->this); init_waitqueue_head(&node->lock_wq); spin_lock(&tree->hash_lock); node2 = hfs_bnode_findhash(tree, cnid); if (!node2) { hash = hfs_bnode_hash(cnid); node->next_hash = tree->node_hash[hash]; tree->node_hash[hash] = node; tree->node_hash_cnt++; } else { hfs_bnode_get(node2); spin_unlock(&tree->hash_lock); kfree(node); wait_event(node2->lock_wq, !test_bit(HFS_BNODE_NEW, &node2->flags)); return node2; } spin_unlock(&tree->hash_lock); mapping = tree->inode->i_mapping; off = (loff_t)cnid * tree->node_size; block = off >> PAGE_SHIFT; node->page_offset = off & ~PAGE_MASK; for (i = 0; i < tree->pages_per_bnode; i++) { page = read_mapping_page(mapping, block++, NULL); if (IS_ERR(page)) goto fail; node->page[i] = page; } return node; fail: set_bit(HFS_BNODE_ERROR, &node->flags); return node; } void hfs_bnode_unhash(struct hfs_bnode *node) { struct hfs_bnode **p; hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; *p && *p != node; p = &(*p)->next_hash) ; BUG_ON(!*p); *p = node->next_hash; node->tree->node_hash_cnt--; } /* Load a particular node out of a tree */ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num) { struct hfs_bnode *node; struct hfs_bnode_desc *desc; int i, rec_off, off, next_off; int entry_size, key_size; spin_lock(&tree->hash_lock); node = hfs_bnode_findhash(tree, num); if (node) { hfs_bnode_get(node); spin_unlock(&tree->hash_lock); wait_event(node->lock_wq, !test_bit(HFS_BNODE_NEW, &node->flags)); if (test_bit(HFS_BNODE_ERROR, &node->flags)) goto node_error; return node; } spin_unlock(&tree->hash_lock); node = __hfs_bnode_create(tree, num); if (!node) return ERR_PTR(-ENOMEM); if (test_bit(HFS_BNODE_ERROR, &node->flags)) goto node_error; if (!test_bit(HFS_BNODE_NEW, &node->flags)) return node; desc = (struct hfs_bnode_desc *)(kmap_local_page(node->page[0]) + node->page_offset); node->prev = be32_to_cpu(desc->prev); node->next = be32_to_cpu(desc->next); node->num_recs = be16_to_cpu(desc->num_recs); node->type = desc->type; node->height = desc->height; kunmap_local(desc); switch (node->type) { case HFS_NODE_HEADER: case HFS_NODE_MAP: if (node->height != 0) goto node_error; break; case HFS_NODE_LEAF: if 
(node->height != 1) goto node_error; break; case HFS_NODE_INDEX: if (node->height <= 1 || node->height > tree->depth) goto node_error; break; default: goto node_error; } rec_off = tree->node_size - 2; off = hfs_bnode_read_u16(node, rec_off); if (off != sizeof(struct hfs_bnode_desc)) goto node_error; for (i = 1; i <= node->num_recs; off = next_off, i++) { rec_off -= 2; next_off = hfs_bnode_read_u16(node, rec_off); if (next_off <= off || next_off > tree->node_size || next_off & 1) goto node_error; entry_size = next_off - off; if (node->type != HFS_NODE_INDEX && node->type != HFS_NODE_LEAF) continue; key_size = hfs_bnode_read_u8(node, off) + 1; if (key_size >= entry_size /*|| key_size & 1*/) goto node_error; } clear_bit(HFS_BNODE_NEW, &node->flags); wake_up(&node->lock_wq); return node; node_error: set_bit(HFS_BNODE_ERROR, &node->flags); clear_bit(HFS_BNODE_NEW, &node->flags); wake_up(&node->lock_wq); hfs_bnode_put(node); return ERR_PTR(-EIO); } void hfs_bnode_free(struct hfs_bnode *node) { int i; for (i = 0; i < node->tree->pages_per_bnode; i++) if (node->page[i]) put_page(node->page[i]); kfree(node); } struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) { struct hfs_bnode *node; struct page **pagep; int i; spin_lock(&tree->hash_lock); node = hfs_bnode_findhash(tree, num); spin_unlock(&tree->hash_lock); if (node) { pr_crit("new node %u already hashed?\n", num); WARN_ON(1); return node; } node = __hfs_bnode_create(tree, num); if (!node) return ERR_PTR(-ENOMEM); if (test_bit(HFS_BNODE_ERROR, &node->flags)) { hfs_bnode_put(node); return ERR_PTR(-EIO); } pagep = node->page; memzero_page(*pagep, node->page_offset, min((int)PAGE_SIZE, (int)tree->node_size)); set_page_dirty(*pagep); for (i = 1; i < tree->pages_per_bnode; i++) { memzero_page(*++pagep, 0, PAGE_SIZE); set_page_dirty(*pagep); } clear_bit(HFS_BNODE_NEW, &node->flags); wake_up(&node->lock_wq); return node; } void hfs_bnode_get(struct hfs_bnode *node) { if (node) { atomic_inc(&node->refcnt); hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); } } /* Dispose of resources used by a node */ void hfs_bnode_put(struct hfs_bnode *node) { if (node) { struct hfs_btree *tree = node->tree; int i; hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); BUG_ON(!atomic_read(&node->refcnt)); if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock)) return; for (i = 0; i < tree->pages_per_bnode; i++) { if (!node->page[i]) continue; mark_page_accessed(node->page[i]); } if (test_bit(HFS_BNODE_DELETED, &node->flags)) { hfs_bnode_unhash(node); spin_unlock(&tree->hash_lock); hfs_bnode_clear(node, 0, tree->node_size); hfs_bmap_free(node); hfs_bnode_free(node); return; } spin_unlock(&tree->hash_lock); } } |
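hfs_bnode_read() above first clamps the requested (offset, length) pair against the node size and then copies page-sized chunks, where the byte offset within a page applies only to the first page touched. Below is a minimal userspace sketch of that clamp-and-chunk pattern; PAGE_SIZE, the pages[][] array and the helper names are illustrative stand-ins, not the kernel API.

/*
 * Sketch only: userspace stand-in for the clamp-and-chunk pattern used by
 * hfs_bnode_read().  PAGE_SIZE, pages[][] and the helper names are illustrative.
 */
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE	4096u
#define PAGE_SHIFT	12

/* Clamp (off, len) to the node, like check_and_correct_requested_length(). */
static unsigned int clamp_len(unsigned int node_size, unsigned int off,
			      unsigned int len)
{
	if (off >= node_size)
		return 0;
	if (off + len > node_size)
		return node_size - off;
	return len;
}

/* Copy len bytes starting at byte offset off, crossing pages as needed. */
static void node_read(unsigned char pages[][PAGE_SIZE], unsigned int npages,
		      void *buf, unsigned int off, unsigned int len)
{
	unsigned int pagenum = off >> PAGE_SHIFT;
	unsigned int done, chunk;

	off &= PAGE_SIZE - 1;		/* offset applies to the first page only */
	for (done = 0; done < len && pagenum < npages; done += chunk) {
		chunk = len - done;
		if (chunk > PAGE_SIZE - off)
			chunk = PAGE_SIZE - off;
		memcpy((unsigned char *)buf + done, &pages[pagenum][off], chunk);
		pagenum++;
		off = 0;
	}
}

int main(void)
{
	static unsigned char pages[2][PAGE_SIZE];
	unsigned char out[16];
	unsigned int len;

	memcpy(&pages[0][PAGE_SIZE - 4], "abcd", 4);
	memcpy(&pages[1][0], "efgh", 4);

	len = clamp_len(2 * PAGE_SIZE, PAGE_SIZE - 4, 8);
	node_read(pages, 2, out, PAGE_SIZE - 4, len);
	out[len] = '\0';
	printf("%s\n", (char *)out);	/* prints "abcdefgh" */
	return 0;
}

After the first iteration the in-page offset is reset to zero, which is what lets the loop cross page boundaries without any further special-casing.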
| 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 | /* inffast.c -- fast decoding * Copyright (C) 1995-2004 Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ #include <linux/zutil.h> #include "inftrees.h" #include "inflate.h" #include "inffast.h" #ifndef ASMINF union uu { unsigned short us; unsigned char b[2]; }; /* Endian independent version */ static inline unsigned short get_unaligned16(const unsigned short *p) { union uu mm; unsigned char *b = (unsigned char *)p; mm.b[0] = b[0]; mm.b[1] = b[1]; return mm.us; } /* Decode literal, length, and distance codes and write out the resulting literal and match bytes until either not enough input or output is available, an end-of-block is encountered, or a data error is encountered. When large enough input and output buffers are supplied to inflate(), for example, a 16K input buffer and a 64K output buffer, more than 95% of the inflate execution time is spent in this routine. Entry assumptions: state->mode == LEN strm->avail_in >= 6 strm->avail_out >= 258 start >= strm->avail_out state->bits < 8 On return, state->mode is one of: LEN -- ran out of enough output space or enough available input TYPE -- reached end of block code, inflate() to interpret next block BAD -- error in block data Notes: - The maximum input bits used by a length/distance pair is 15 bits for the length code, 5 bits for the length extra, 15 bits for the distance code, and 13 bits for the distance extra. This totals 48 bits, or six bytes. Therefore if strm->avail_in >= 6, then there is enough input to avoid checking for available input while decoding. - The maximum bytes that a single length/distance pair can output is 258 bytes, which is the maximum length that can be coded. inflate_fast() requires strm->avail_out >= 258 for each loop to avoid checking for output space. 
- @start: inflate()'s starting value for strm->avail_out */ void inflate_fast(z_streamp strm, unsigned start) { struct inflate_state *state; const unsigned char *in; /* local strm->next_in */ const unsigned char *last; /* while in < last, enough input available */ unsigned char *out; /* local strm->next_out */ unsigned char *beg; /* inflate()'s initial strm->next_out */ unsigned char *end; /* while out < end, enough space available */ #ifdef INFLATE_STRICT unsigned dmax; /* maximum distance from zlib header */ #endif unsigned wsize; /* window size or zero if not using window */ unsigned whave; /* valid bytes in the window */ unsigned write; /* window write index */ unsigned char *window; /* allocated sliding window, if wsize != 0 */ unsigned long hold; /* local strm->hold */ unsigned bits; /* local strm->bits */ code const *lcode; /* local strm->lencode */ code const *dcode; /* local strm->distcode */ unsigned lmask; /* mask for first level of length codes */ unsigned dmask; /* mask for first level of distance codes */ code this; /* retrieved table entry */ unsigned op; /* code bits, operation, extra bits, or */ /* window position, window bytes to copy */ unsigned len; /* match length, unused bytes */ unsigned dist; /* match distance */ unsigned char *from; /* where to copy match from */ /* copy state to local variables */ state = (struct inflate_state *)strm->state; in = strm->next_in; last = in + (strm->avail_in - 5); out = strm->next_out; beg = out - (start - strm->avail_out); end = out + (strm->avail_out - 257); #ifdef INFLATE_STRICT dmax = state->dmax; #endif wsize = state->wsize; whave = state->whave; write = state->write; window = state->window; hold = state->hold; bits = state->bits; lcode = state->lencode; dcode = state->distcode; lmask = (1U << state->lenbits) - 1; dmask = (1U << state->distbits) - 1; /* decode literals and length/distances until end-of-block or not enough input data or output space */ do { if (bits < 15) { hold += (unsigned long)(*in++) << bits; bits += 8; hold += (unsigned long)(*in++) << bits; bits += 8; } this = lcode[hold & lmask]; dolen: op = (unsigned)(this.bits); hold >>= op; bits -= op; op = (unsigned)(this.op); if (op == 0) { /* literal */ *out++ = (unsigned char)(this.val); } else if (op & 16) { /* length base */ len = (unsigned)(this.val); op &= 15; /* number of extra bits */ if (op) { if (bits < op) { hold += (unsigned long)(*in++) << bits; bits += 8; } len += (unsigned)hold & ((1U << op) - 1); hold >>= op; bits -= op; } if (bits < 15) { hold += (unsigned long)(*in++) << bits; bits += 8; hold += (unsigned long)(*in++) << bits; bits += 8; } this = dcode[hold & dmask]; dodist: op = (unsigned)(this.bits); hold >>= op; bits -= op; op = (unsigned)(this.op); if (op & 16) { /* distance base */ dist = (unsigned)(this.val); op &= 15; /* number of extra bits */ if (bits < op) { hold += (unsigned long)(*in++) << bits; bits += 8; if (bits < op) { hold += (unsigned long)(*in++) << bits; bits += 8; } } dist += (unsigned)hold & ((1U << op) - 1); #ifdef INFLATE_STRICT if (dist > dmax) { strm->msg = (char *)"invalid distance too far back"; state->mode = BAD; break; } #endif hold >>= op; bits -= op; op = (unsigned)(out - beg); /* max distance in output */ if (dist > op) { /* see if copy from window */ op = dist - op; /* distance back in window */ if (op > whave) { strm->msg = (char *)"invalid distance too far back"; state->mode = BAD; break; } from = window; if (write == 0) { /* very common case */ from += wsize - op; if (op < len) { /* some from window */ len -= op; 
do { *out++ = *from++; } while (--op); from = out - dist; /* rest from output */ } } else if (write < op) { /* wrap around window */ from += wsize + write - op; op -= write; if (op < len) { /* some from end of window */ len -= op; do { *out++ = *from++; } while (--op); from = window; if (write < len) { /* some from start of window */ op = write; len -= op; do { *out++ = *from++; } while (--op); from = out - dist; /* rest from output */ } } } else { /* contiguous in window */ from += write - op; if (op < len) { /* some from window */ len -= op; do { *out++ = *from++; } while (--op); from = out - dist; /* rest from output */ } } while (len > 2) { *out++ = *from++; *out++ = *from++; *out++ = *from++; len -= 3; } if (len) { *out++ = *from++; if (len > 1) *out++ = *from++; } } else { unsigned short *sout; unsigned long loops; from = out - dist; /* copy direct from output */ /* minimum length is three */ /* Align out addr */ if (!((long)(out - 1) & 1)) { *out++ = *from++; len--; } sout = (unsigned short *)(out); if (dist > 2) { unsigned short *sfrom; sfrom = (unsigned short *)(from); loops = len >> 1; do { if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) *sout++ = *sfrom++; else *sout++ = get_unaligned16(sfrom++); } while (--loops); out = (unsigned char *)sout; from = (unsigned char *)sfrom; } else { /* dist == 1 or dist == 2 */ unsigned short pat16; pat16 = *(sout-1); if (dist == 1) { union uu mm; /* copy one char pattern to both bytes */ mm.us = pat16; mm.b[0] = mm.b[1]; pat16 = mm.us; } loops = len >> 1; do *sout++ = pat16; while (--loops); out = (unsigned char *)sout; } if (len & 1) *out++ = *from++; } } else if ((op & 64) == 0) { /* 2nd level distance code */ this = dcode[this.val + (hold & ((1U << op) - 1))]; goto dodist; } else { strm->msg = (char *)"invalid distance code"; state->mode = BAD; break; } } else if ((op & 64) == 0) { /* 2nd level length code */ this = lcode[this.val + (hold & ((1U << op) - 1))]; goto dolen; } else if (op & 32) { /* end-of-block */ state->mode = TYPE; break; } else { strm->msg = (char *)"invalid literal/length code"; state->mode = BAD; break; } } while (in < last && out < end); /* return unused bytes (on entry, bits < 8, so in won't go too far back) */ len = bits >> 3; in -= len; bits -= len << 3; hold &= (1U << bits) - 1; /* update state and return */ strm->next_in = in; strm->next_out = out; strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last)); strm->avail_out = (unsigned)(out < end ? 257 + (end - out) : 257 - (out - end)); state->hold = hold; state->bits = bits; return; } /* inflate_fast() speedups that turned out slower (on a PowerPC G3 750CXe): - Using bit fields for code structure - Different op definition to avoid & for extra bits (do & for table bits) - Three separate decoding do-loops for direct, window, and write == 0 - Special case for distance > 1 copies to do overlapped load and store copy - Explicit branch predictions (based on measured branch probabilities) - Deferring match copy and interspersed it with decoding subsequent codes - Swapping literal/length else - Swapping window/direct else - Larger unrolled copy loops (three is about right) - Moving len -= 3 statement into middle of loop */ #endif /* !ASMINF */ |
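Two low-level tricks in inflate_fast() are easy to miss in the flattened listing: get_unaligned16() assembles a 16-bit value one byte at a time so it is safe on strict-alignment machines, and the dist == 1 case doubles the pattern byte into both halves of a 16-bit word so the copy loop can emit two bytes per store. Here is a small standalone sketch of both; buffer sizes and the pattern source are chosen purely for illustration.

/*
 * Sketch only: standalone demonstration of the union-based unaligned 16-bit
 * load and of the dist == 1 pattern-doubling store used by inflate_fast().
 */
#include <stdio.h>
#include <string.h>

union uu {
	unsigned short us;
	unsigned char b[2];
};

/* Endian-independent, alignment-safe 16-bit read, one byte at a time. */
static unsigned short get_unaligned16(const unsigned char *p)
{
	union uu mm;

	mm.b[0] = p[0];
	mm.b[1] = p[1];
	return mm.us;
}

int main(void)
{
	unsigned char buf[16] = "zlib";
	unsigned char out[9];
	union uu mm;
	unsigned short pat16;
	int i;

	/* Misaligned read at buf + 1; the printed value depends on host endianness. */
	printf("%#x\n", (unsigned int)get_unaligned16(buf + 1));

	/* dist == 1: replicate the last written byte via a doubled 16-bit pattern. */
	memset(out, 0, sizeof(out));
	out[0] = 'x';
	mm.us = get_unaligned16(out);	/* one half is 'x', the other is 0 */
	mm.b[1] = mm.b[0];		/* both halves now hold the pattern byte */
	pat16 = mm.us;
	for (i = 1; i < 8; i += 2)
		memcpy(&out[i], &pat16, 2);	/* two identical bytes per store */
	out[8] = '\0';
	printf("%s\n", (char *)out);	/* prints "xxxxxxxx" */
	return 0;
}

In the kernel loop the pattern word is taken from *(sout - 1), i.e. the bytes just written to the output; the sketch reads the already-written out[0] instead, but the doubling step is the same technique.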
| 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 | // SPDX-License-Identifier: GPL-2.0 #include <linux/slab.h> #include <linux/kernel.h> #include <linux/bitops.h> #include <linux/cpumask.h> #include <linux/export.h> #include <linux/memblock.h> #include <linux/numa.h> /* These are not inline because of header tangles. */ #ifdef CONFIG_CPUMASK_OFFSTACK /** * alloc_cpumask_var_node - allocate a struct cpumask on a given node * @mask: pointer to cpumask_var_t where the cpumask is returned * @flags: GFP_ flags * @node: memory node from which to allocate or %NUMA_NO_NODE * * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is * a nop returning a constant 1 (in <linux/cpumask.h>). * * Return: TRUE if memory allocation succeeded, FALSE otherwise. * * In addition, mask will be NULL if this fails. Note that gcc is * usually smart enough to know that mask can never be NULL if * CONFIG_CPUMASK_OFFSTACK=n, so does code elimination in that case * too. */ bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { *mask = kmalloc_node(cpumask_size(), flags, node); #ifdef CONFIG_DEBUG_PER_CPU_MAPS if (!*mask) { printk(KERN_ERR "=> alloc_cpumask_var: failed!\n"); dump_stack(); } #endif return *mask != NULL; } EXPORT_SYMBOL(alloc_cpumask_var_node); /** * alloc_bootmem_cpumask_var - allocate a struct cpumask from the bootmem arena. * @mask: pointer to cpumask_var_t where the cpumask is returned * * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is * a nop (in <linux/cpumask.h>). * Either returns an allocated (zero-filled) cpumask, or causes the * system to panic. */ void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask) { *mask = memblock_alloc_or_panic(cpumask_size(), SMP_CACHE_BYTES); } /** * free_cpumask_var - frees memory allocated for a struct cpumask. * @mask: cpumask to free * * This is safe on a NULL mask. */ void free_cpumask_var(cpumask_var_t mask) { kfree(mask); } EXPORT_SYMBOL(free_cpumask_var); /** * free_bootmem_cpumask_var - frees result of alloc_bootmem_cpumask_var * @mask: cpumask to free */ void __init free_bootmem_cpumask_var(cpumask_var_t mask) { memblock_free(mask, cpumask_size()); } #endif /** * cpumask_local_spread - select the i'th cpu based on NUMA distances * @i: index number * @node: local numa_node * * Return: online CPU according to a numa aware policy; local cpus are returned * first, followed by non-local ones, then it wraps around. * * For those who wants to enumerate all CPUs based on their NUMA distances, * i.e. call this function in a loop, like: * * for (i = 0; i < num_online_cpus(); i++) { * cpu = cpumask_local_spread(i, node); * do_something(cpu); * } * * There's a better alternative based on for_each()-like iterators: * * for_each_numa_hop_mask(mask, node) { * for_each_cpu_andnot(cpu, mask, prev) * do_something(cpu); * prev = mask; * } * * It's simpler and more verbose than above. 
Complexity of iterator-based * enumeration is O(sched_domains_numa_levels * nr_cpu_ids), while * cpumask_local_spread() when called for each cpu is * O(sched_domains_numa_levels * nr_cpu_ids * log(nr_cpu_ids)). */ unsigned int cpumask_local_spread(unsigned int i, int node) { unsigned int cpu; /* Wrap: we always want a cpu. */ i %= num_online_cpus(); cpu = sched_numa_find_nth_cpu(cpu_online_mask, i, node); WARN_ON(cpu >= nr_cpu_ids); return cpu; } EXPORT_SYMBOL(cpumask_local_spread); static DEFINE_PER_CPU(int, distribute_cpu_mask_prev); /** * cpumask_any_and_distribute - Return an arbitrary cpu within src1p & src2p. * @src1p: first &cpumask for intersection * @src2p: second &cpumask for intersection * * Iterated calls using the same srcp1 and srcp2 will be distributed within * their intersection. * * Return: >= nr_cpu_ids if the intersection is empty. */ unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p) { unsigned int next, prev; /* NOTE: our first selection will skip 0. */ prev = __this_cpu_read(distribute_cpu_mask_prev); next = cpumask_next_and_wrap(prev, src1p, src2p); if (next < nr_cpu_ids) __this_cpu_write(distribute_cpu_mask_prev, next); return next; } EXPORT_SYMBOL(cpumask_any_and_distribute); /** * cpumask_any_distribute - Return an arbitrary cpu from srcp * @srcp: &cpumask for selection * * Return: >= nr_cpu_ids if the intersection is empty. */ unsigned int cpumask_any_distribute(const struct cpumask *srcp) { unsigned int next, prev; /* NOTE: our first selection will skip 0. */ prev = __this_cpu_read(distribute_cpu_mask_prev); next = cpumask_next_wrap(prev, srcp); if (next < nr_cpu_ids) __this_cpu_write(distribute_cpu_mask_prev, next); return next; } EXPORT_SYMBOL(cpumask_any_distribute); |
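cpumask_any_and_distribute() and cpumask_any_distribute() spread successive picks across a mask by remembering the previously returned CPU in distribute_cpu_mask_prev and then searching for the next set bit strictly after it, wrapping around. A userspace sketch of that round-robin pattern over a plain 32-bit mask follows; the mask width and the helper name are illustrative only.

/*
 * Sketch only: "remember the last pick, continue strictly after it with
 * wrap-around", as done by cpumask_any_and_distribute(), on a 32-bit mask.
 */
#include <stdio.h>

#define NBITS 32u

/* Next set bit strictly after prev, wrapping; returns NBITS if mask is empty. */
static unsigned int next_bit_wrap(unsigned int mask, unsigned int prev)
{
	unsigned int i, bit;

	for (i = 1; i <= NBITS; i++) {
		bit = (prev + i) % NBITS;
		if (mask & (1u << bit))
			return bit;
	}
	return NBITS;
}

int main(void)
{
	unsigned int online = (1u << 1) | (1u << 3) | (1u << 6);	/* "CPUs" 1, 3, 6 */
	unsigned int prev = 0, next;
	int i;

	for (i = 0; i < 6; i++) {
		next = next_bit_wrap(online, prev);
		if (next < NBITS)
			prev = next;	/* remember it, like distribute_cpu_mask_prev */
		printf("pick %u\n", next);	/* 1 3 6 1 3 6 */
	}
	return 0;
}

As the NOTE in the kernel code says, the very first pick skips bit 0, because prev starts at 0 and the search begins strictly after it.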

914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Helpers for initial module or kernel cmdline parsing * Copyright (C) 2001 Rusty Russell. */ #include <linux/ctype.h> #include <linux/device.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/kstrtox.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/overflow.h> #include <linux/security.h> #include <linux/slab.h> #include <linux/string.h> #ifdef CONFIG_SYSFS /* Protects all built-in parameters, modules use their own param_lock */ static DEFINE_MUTEX(param_lock); /* Use the module's mutex, or if built-in use the built-in mutex */ #ifdef CONFIG_MODULES #define KPARAM_MUTEX(mod) ((mod) ? &(mod)->param_lock : ¶m_lock) #else #define KPARAM_MUTEX(mod) (¶m_lock) #endif static inline void check_kparam_locked(struct module *mod) { BUG_ON(!mutex_is_locked(KPARAM_MUTEX(mod))); } #else static inline void check_kparam_locked(struct module *mod) { } #endif /* !CONFIG_SYSFS */ /* This just allows us to keep track of which parameters are kmalloced. */ struct kmalloced_param { struct list_head list; char val[]; }; static LIST_HEAD(kmalloced_params); static DEFINE_SPINLOCK(kmalloced_params_lock); static void *kmalloc_parameter(unsigned int size) { struct kmalloced_param *p; p = kmalloc(size_add(sizeof(*p), size), GFP_KERNEL); if (!p) return NULL; spin_lock(&kmalloced_params_lock); list_add(&p->list, &kmalloced_params); spin_unlock(&kmalloced_params_lock); return p->val; } /* Does nothing if parameter wasn't kmalloced above. */ static void maybe_kfree_parameter(void *param) { struct kmalloced_param *p; spin_lock(&kmalloced_params_lock); list_for_each_entry(p, &kmalloced_params, list) { if (p->val == param) { list_del(&p->list); kfree(p); break; } } spin_unlock(&kmalloced_params_lock); } static char dash2underscore(char c) { if (c == '-') return '_'; return c; } bool parameqn(const char *a, const char *b, size_t n) { size_t i; for (i = 0; i < n; i++) { if (dash2underscore(a[i]) != dash2underscore(b[i])) return false; } return true; } bool parameq(const char *a, const char *b) { return parameqn(a, b, strlen(a)+1); } static bool param_check_unsafe(const struct kernel_param *kp) { if (kp->flags & KERNEL_PARAM_FL_HWPARAM && security_locked_down(LOCKDOWN_MODULE_PARAMETERS)) return false; if (kp->flags & KERNEL_PARAM_FL_UNSAFE) { pr_notice("Setting dangerous option %s - tainting kernel\n", kp->name); add_taint(TAINT_USER, LOCKDEP_STILL_OK); } return true; } static int parse_one(char *param, char *val, const char *doing, const struct kernel_param *params, unsigned num_params, s16 min_level, s16 max_level, void *arg, parse_unknown_fn handle_unknown) { unsigned int i; int err; /* Find parameter */ for (i = 0; i < num_params; i++) { if (parameq(param, params[i].name)) { if (params[i].level < min_level || params[i].level > max_level) return 0; /* No one handled NULL, so do it here. 
*/ if (!val && !(params[i].ops->flags & KERNEL_PARAM_OPS_FL_NOARG)) return -EINVAL; pr_debug("handling %s with %p\n", param, params[i].ops->set); kernel_param_lock(params[i].mod); if (param_check_unsafe(¶ms[i])) err = params[i].ops->set(val, ¶ms[i]); else err = -EPERM; kernel_param_unlock(params[i].mod); return err; } } if (handle_unknown) { pr_debug("doing %s: %s='%s'\n", doing, param, val); return handle_unknown(param, val, doing, arg); } pr_debug("Unknown argument '%s'\n", param); return -ENOENT; } /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ char *parse_args(const char *doing, char *args, const struct kernel_param *params, unsigned int num, s16 min_level, s16 max_level, void *arg, parse_unknown_fn unknown) { char *param, *val, *err = NULL; /* Chew leading spaces */ args = skip_spaces(args); if (*args) pr_debug("doing %s, parsing ARGS: '%s'\n", doing, args); while (*args) { int ret; int irq_was_disabled; args = next_arg(args, ¶m, &val); /* Stop at -- */ if (!val && strcmp(param, "--") == 0) return err ?: args; irq_was_disabled = irqs_disabled(); ret = parse_one(param, val, doing, params, num, min_level, max_level, arg, unknown); if (irq_was_disabled && !irqs_disabled()) pr_warn("%s: option '%s' enabled irq's!\n", doing, param); switch (ret) { case 0: continue; case -ENOENT: pr_err("%s: Unknown parameter `%s'\n", doing, param); break; case -ENOSPC: pr_err("%s: `%s' too large for parameter `%s'\n", doing, val ?: "", param); break; default: pr_err("%s: `%s' invalid for parameter `%s'\n", doing, val ?: "", param); break; } err = ERR_PTR(ret); } return err; } /* Lazy bastard, eh? */ #define STANDARD_PARAM_DEF(name, type, format, strtolfn) \ int param_set_##name(const char *val, const struct kernel_param *kp) \ { \ return strtolfn(val, 0, (type *)kp->arg); \ } \ int param_get_##name(char *buffer, const struct kernel_param *kp) \ { \ return scnprintf(buffer, PAGE_SIZE, format "\n", \ *((type *)kp->arg)); \ } \ const struct kernel_param_ops param_ops_##name = { \ .set = param_set_##name, \ .get = param_get_##name, \ }; \ EXPORT_SYMBOL(param_set_##name); \ EXPORT_SYMBOL(param_get_##name); \ EXPORT_SYMBOL(param_ops_##name) STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); STANDARD_PARAM_DEF(long, long, "%li", kstrtol); STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); STANDARD_PARAM_DEF(hexint, unsigned int, "%#08x", kstrtouint); int param_set_uint_minmax(const char *val, const struct kernel_param *kp, unsigned int min, unsigned int max) { unsigned int num; int ret; if (!val) return -EINVAL; ret = kstrtouint(val, 0, &num); if (ret) return ret; if (num < min || num > max) return -EINVAL; *((unsigned int *)kp->arg) = num; return 0; } EXPORT_SYMBOL_GPL(param_set_uint_minmax); int param_set_charp(const char *val, const struct kernel_param *kp) { size_t len, maxlen = 1024; len = strnlen(val, maxlen + 1); if (len == maxlen + 1) { pr_err("%s: string parameter too long\n", kp->name); return -ENOSPC; } maybe_kfree_parameter(*(char **)kp->arg); /* * This is a hack. We can't kmalloc() in early boot, and we * don't need to; this mangled commandline is preserved. 
*/ if (slab_is_available()) { *(char **)kp->arg = kmalloc_parameter(len + 1); if (!*(char **)kp->arg) return -ENOMEM; strcpy(*(char **)kp->arg, val); } else *(const char **)kp->arg = val; return 0; } EXPORT_SYMBOL(param_set_charp); int param_get_charp(char *buffer, const struct kernel_param *kp) { return scnprintf(buffer, PAGE_SIZE, "%s\n", *((char **)kp->arg)); } EXPORT_SYMBOL(param_get_charp); void param_free_charp(void *arg) { maybe_kfree_parameter(*((char **)arg)); } EXPORT_SYMBOL(param_free_charp); const struct kernel_param_ops param_ops_charp = { .set = param_set_charp, .get = param_get_charp, .free = param_free_charp, }; EXPORT_SYMBOL(param_ops_charp); /* Actually could be a bool or an int, for historical reasons. */ int param_set_bool(const char *val, const struct kernel_param *kp) { /* No equals means "set"... */ if (!val) val = "1"; /* One of =[yYnN01] */ return kstrtobool(val, kp->arg); } EXPORT_SYMBOL(param_set_bool); int param_get_bool(char *buffer, const struct kernel_param *kp) { /* Y and N chosen as being relatively non-coder friendly */ return sprintf(buffer, "%c\n", *(bool *)kp->arg ? 'Y' : 'N'); } EXPORT_SYMBOL(param_get_bool); const struct kernel_param_ops param_ops_bool = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bool, .get = param_get_bool, }; EXPORT_SYMBOL(param_ops_bool); int param_set_bool_enable_only(const char *val, const struct kernel_param *kp) { int err; bool new_value; bool orig_value = *(bool *)kp->arg; struct kernel_param dummy_kp = *kp; dummy_kp.arg = &new_value; err = param_set_bool(val, &dummy_kp); if (err) return err; /* Don't let them unset it once it's set! */ if (!new_value && orig_value) return -EROFS; if (new_value) err = param_set_bool(val, kp); return err; } EXPORT_SYMBOL_GPL(param_set_bool_enable_only); const struct kernel_param_ops param_ops_bool_enable_only = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bool_enable_only, .get = param_get_bool, }; EXPORT_SYMBOL_GPL(param_ops_bool_enable_only); /* This one must be bool. */ int param_set_invbool(const char *val, const struct kernel_param *kp) { int ret; bool boolval; struct kernel_param dummy; dummy.arg = &boolval; ret = param_set_bool(val, &dummy); if (ret == 0) *(bool *)kp->arg = !boolval; return ret; } EXPORT_SYMBOL(param_set_invbool); int param_get_invbool(char *buffer, const struct kernel_param *kp) { return sprintf(buffer, "%c\n", (*(bool *)kp->arg) ? 'N' : 'Y'); } EXPORT_SYMBOL(param_get_invbool); const struct kernel_param_ops param_ops_invbool = { .set = param_set_invbool, .get = param_get_invbool, }; EXPORT_SYMBOL(param_ops_invbool); int param_set_bint(const char *val, const struct kernel_param *kp) { /* Match bool exactly, by re-using it. */ struct kernel_param boolkp = *kp; bool v; int ret; boolkp.arg = &v; ret = param_set_bool(val, &boolkp); if (ret == 0) *(int *)kp->arg = v; return ret; } EXPORT_SYMBOL(param_set_bint); const struct kernel_param_ops param_ops_bint = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bint, .get = param_get_int, }; EXPORT_SYMBOL(param_ops_bint); /* We break the rule and mangle the string. */ static int param_array(struct module *mod, const char *name, const char *val, unsigned int min, unsigned int max, void *elem, int elemsize, int (*set)(const char *, const struct kernel_param *kp), s16 level, unsigned int *num) { int ret; struct kernel_param kp; char save; /* Get the name right for errors. */ kp.name = name; kp.arg = elem; kp.level = level; *num = 0; /* We expect a comma-separated list of values. 
*/ do { int len; if (*num == max) { pr_err("%s: can only take %i arguments\n", name, max); return -EINVAL; } len = strcspn(val, ","); /* nul-terminate and parse */ save = val[len]; ((char *)val)[len] = '\0'; check_kparam_locked(mod); ret = set(val, &kp); if (ret != 0) return ret; kp.arg += elemsize; val += len+1; (*num)++; } while (save == ','); if (*num < min) { pr_err("%s: needs at least %i arguments\n", name, min); return -EINVAL; } return 0; } static int param_array_set(const char *val, const struct kernel_param *kp) { const struct kparam_array *arr = kp->arr; unsigned int temp_num; return param_array(kp->mod, kp->name, val, 1, arr->max, arr->elem, arr->elemsize, arr->ops->set, kp->level, arr->num ?: &temp_num); } static int param_array_get(char *buffer, const struct kernel_param *kp) { int i, off, ret; const struct kparam_array *arr = kp->arr; struct kernel_param p = *kp; for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) { /* Replace \n with comma */ if (i) buffer[off - 1] = ','; p.arg = arr->elem + arr->elemsize * i; check_kparam_locked(p.mod); ret = arr->ops->get(buffer + off, &p); if (ret < 0) return ret; off += ret; } buffer[off] = '\0'; return off; } static void param_array_free(void *arg) { unsigned int i; const struct kparam_array *arr = arg; if (arr->ops->free) for (i = 0; i < (arr->num ? *arr->num : arr->max); i++) arr->ops->free(arr->elem + arr->elemsize * i); } const struct kernel_param_ops param_array_ops = { .set = param_array_set, .get = param_array_get, .free = param_array_free, }; EXPORT_SYMBOL(param_array_ops); int param_set_copystring(const char *val, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; const size_t len = strnlen(val, kps->maxlen); if (len == kps->maxlen) { pr_err("%s: string doesn't fit in %u chars.\n", kp->name, kps->maxlen-1); return -ENOSPC; } memcpy(kps->string, val, len + 1); return 0; } EXPORT_SYMBOL(param_set_copystring); int param_get_string(char *buffer, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; return scnprintf(buffer, PAGE_SIZE, "%s\n", kps->string); } EXPORT_SYMBOL(param_get_string); const struct kernel_param_ops param_ops_string = { .set = param_set_copystring, .get = param_get_string, }; EXPORT_SYMBOL(param_ops_string); /* sysfs output in /sys/modules/XYZ/parameters/ */ #define to_module_attr(n) container_of_const(n, struct module_attribute, attr) #define to_module_kobject(n) container_of(n, struct module_kobject, kobj) struct param_attribute { struct module_attribute mattr; const struct kernel_param *param; }; struct module_param_attrs { unsigned int num; struct attribute_group grp; struct param_attribute attrs[] __counted_by(num); }; #ifdef CONFIG_SYSFS #define to_param_attr(n) container_of_const(n, struct param_attribute, mattr) static ssize_t param_attr_show(const struct module_attribute *mattr, struct module_kobject *mk, char *buf) { int count; const struct param_attribute *attribute = to_param_attr(mattr); if (!attribute->param->ops->get) return -EPERM; kernel_param_lock(mk->mod); count = attribute->param->ops->get(buf, attribute->param); kernel_param_unlock(mk->mod); return count; } /* sysfs always hands a nul-terminated string in buf. We rely on that. 
*/ static ssize_t param_attr_store(const struct module_attribute *mattr, struct module_kobject *mk, const char *buf, size_t len) { int err; const struct param_attribute *attribute = to_param_attr(mattr); if (!attribute->param->ops->set) return -EPERM; kernel_param_lock(mk->mod); if (param_check_unsafe(attribute->param)) err = attribute->param->ops->set(buf, attribute->param); else err = -EPERM; kernel_param_unlock(mk->mod); if (!err) return len; return err; } #endif #ifdef CONFIG_SYSFS void kernel_param_lock(struct module *mod) { mutex_lock(KPARAM_MUTEX(mod)); } void kernel_param_unlock(struct module *mod) { mutex_unlock(KPARAM_MUTEX(mod)); } EXPORT_SYMBOL(kernel_param_lock); EXPORT_SYMBOL(kernel_param_unlock); /* * add_sysfs_param - add a parameter to sysfs * @mk: struct module_kobject * @kp: the actual parameter definition to add to sysfs * @name: name of parameter * * Create a kobject if for a (per-module) parameter if mp NULL, and * create file in sysfs. Returns an error on out of memory. Always cleans up * if there's an error. */ static __init_or_module int add_sysfs_param(struct module_kobject *mk, const struct kernel_param *kp, const char *name) { struct module_param_attrs *new_mp; struct attribute **new_attrs; unsigned int i; /* We don't bother calling this with invisible parameters. */ BUG_ON(!kp->perm); if (!mk->mp) { /* First allocation. */ mk->mp = kzalloc_obj(*mk->mp); if (!mk->mp) return -ENOMEM; mk->mp->grp.name = "parameters"; /* NULL-terminated attribute array. */ mk->mp->grp.attrs = kzalloc_obj(mk->mp->grp.attrs[0]); /* Caller will cleanup via free_module_param_attrs */ if (!mk->mp->grp.attrs) return -ENOMEM; } /* Enlarge allocations. */ new_mp = krealloc(mk->mp, struct_size(mk->mp, attrs, mk->mp->num + 1), GFP_KERNEL); if (!new_mp) return -ENOMEM; mk->mp = new_mp; mk->mp->num++; /* Extra pointer for NULL terminator */ new_attrs = krealloc_array(mk->mp->grp.attrs, mk->mp->num + 1, sizeof(mk->mp->grp.attrs[0]), GFP_KERNEL); if (!new_attrs) return -ENOMEM; mk->mp->grp.attrs = new_attrs; /* Tack new one on the end. */ memset(&mk->mp->attrs[mk->mp->num - 1], 0, sizeof(mk->mp->attrs[0])); sysfs_attr_init(&mk->mp->attrs[mk->mp->num - 1].mattr.attr); mk->mp->attrs[mk->mp->num - 1].param = kp; mk->mp->attrs[mk->mp->num - 1].mattr.show = param_attr_show; /* Do not allow runtime DAC changes to make param writable. 
*/ if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0) mk->mp->attrs[mk->mp->num - 1].mattr.store = param_attr_store; else mk->mp->attrs[mk->mp->num - 1].mattr.store = NULL; mk->mp->attrs[mk->mp->num - 1].mattr.attr.name = (char *)name; mk->mp->attrs[mk->mp->num - 1].mattr.attr.mode = kp->perm; /* Fix up all the pointers, since krealloc can move us */ for (i = 0; i < mk->mp->num; i++) mk->mp->grp.attrs[i] = &mk->mp->attrs[i].mattr.attr; mk->mp->grp.attrs[mk->mp->num] = NULL; return 0; } #ifdef CONFIG_MODULES static void free_module_param_attrs(struct module_kobject *mk) { if (mk->mp) kfree(mk->mp->grp.attrs); kfree(mk->mp); mk->mp = NULL; } /* * module_param_sysfs_setup - setup sysfs support for one module * @mod: module * @kparam: module parameters (array) * @num_params: number of module parameters * * Adds sysfs entries for module parameters under * /sys/module/[mod->name]/parameters/ */ int module_param_sysfs_setup(struct module *mod, const struct kernel_param *kparam, unsigned int num_params) { int i, err; bool params = false; for (i = 0; i < num_params; i++) { if (kparam[i].perm == 0) continue; err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name); if (err) { free_module_param_attrs(&mod->mkobj); return err; } params = true; } if (!params) return 0; /* Create the param group. */ err = sysfs_create_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp); if (err) free_module_param_attrs(&mod->mkobj); return err; } /* * module_param_sysfs_remove - remove sysfs support for one module * @mod: module * * Remove sysfs entries for module parameters and the corresponding * kobject. */ void module_param_sysfs_remove(struct module *mod) { if (mod->mkobj.mp) { sysfs_remove_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp); /* * We are positive that no one is using any param * attrs at this point. Deallocate immediately. */ free_module_param_attrs(&mod->mkobj); } } #endif struct module_kobject * __init_or_module lookup_or_create_module_kobject(const char *name) { struct module_kobject *mk; struct kobject *kobj; int err; kobj = kset_find_obj(module_kset, name); if (kobj) return to_module_kobject(kobj); mk = kzalloc_obj(struct module_kobject); if (!mk) return NULL; mk->mod = THIS_MODULE; mk->kobj.kset = module_kset; err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name); if (IS_ENABLED(CONFIG_MODULES) && !err) err = sysfs_create_file(&mk->kobj, &module_uevent.attr); if (err) { kobject_put(&mk->kobj); pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n", name, err); return NULL; } /* So that we hold reference in both cases. */ kobject_get(&mk->kobj); return mk; } static void __init kernel_add_sysfs_param(const char *name, const struct kernel_param *kparam, unsigned int name_skip) { struct module_kobject *mk; int err; mk = lookup_or_create_module_kobject(name); if (!mk) return; /* We need to remove old parameters before adding more. */ if (mk->mp) sysfs_remove_group(&mk->kobj, &mk->mp->grp); /* These should not fail at boot. */ err = add_sysfs_param(mk, kparam, kparam->name + name_skip); BUG_ON(err); err = sysfs_create_group(&mk->kobj, &mk->mp->grp); BUG_ON(err); kobject_uevent(&mk->kobj, KOBJ_ADD); kobject_put(&mk->kobj); } /* * param_sysfs_builtin - add sysfs parameters for built-in modules * * Add module_parameters to sysfs for "modules" built into the kernel. * * The "module" name (KBUILD_MODNAME) is stored before a dot, the * "parameter" name is stored behind a dot in kernel_param->name. 
So, * extract the "module" name for all built-in kernel_param-eters, * and for all who have the same, call kernel_add_sysfs_param. */ static void __init param_sysfs_builtin(void) { const struct kernel_param *kp; unsigned int name_len; char modname[MODULE_NAME_LEN]; for (kp = __start___param; kp < __stop___param; kp++) { char *dot; if (kp->perm == 0) continue; dot = strchr(kp->name, '.'); if (!dot) { /* This happens for core_param() */ strscpy(modname, "kernel"); name_len = 0; } else { name_len = dot - kp->name + 1; strscpy(modname, kp->name, name_len); } kernel_add_sysfs_param(modname, kp, name_len); } } ssize_t __modver_version_show(const struct module_attribute *mattr, struct module_kobject *mk, char *buf) { const struct module_version_attribute *vattr = container_of_const(mattr, struct module_version_attribute, mattr); return scnprintf(buf, PAGE_SIZE, "%s\n", vattr->version); } extern const struct module_version_attribute __start___modver[]; extern const struct module_version_attribute __stop___modver[]; static void __init version_sysfs_builtin(void) { const struct module_version_attribute *vattr; struct module_kobject *mk; int err; for (vattr = __start___modver; vattr < __stop___modver; vattr++) { mk = lookup_or_create_module_kobject(vattr->module_name); if (mk) { err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); WARN_ON_ONCE(err); kobject_uevent(&mk->kobj, KOBJ_ADD); kobject_put(&mk->kobj); } } } /* module-related sysfs stuff */ static ssize_t module_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { const struct module_attribute *attribute; struct module_kobject *mk; int ret; attribute = to_module_attr(attr); mk = to_module_kobject(kobj); if (!attribute->show) return -EIO; ret = attribute->show(attribute, mk, buf); return ret; } static ssize_t module_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t len) { const struct module_attribute *attribute; struct module_kobject *mk; int ret; attribute = to_module_attr(attr); mk = to_module_kobject(kobj); if (!attribute->store) return -EIO; ret = attribute->store(attribute, mk, buf, len); return ret; } static const struct sysfs_ops module_sysfs_ops = { .show = module_attr_show, .store = module_attr_store, }; static int uevent_filter(const struct kobject *kobj) { const struct kobj_type *ktype = get_ktype(kobj); if (ktype == &module_ktype) return 1; return 0; } static const struct kset_uevent_ops module_uevent_ops = { .filter = uevent_filter, }; struct kset *module_kset; static void module_kobj_release(struct kobject *kobj) { struct module_kobject *mk = to_module_kobject(kobj); if (mk->kobj_completion) complete(mk->kobj_completion); } const struct kobj_type module_ktype = { .release = module_kobj_release, .sysfs_ops = &module_sysfs_ops, }; /* * param_sysfs_init - create "module" kset * * This must be done before the initramfs is unpacked and * request_module() thus becomes possible, because otherwise the * module load would fail in mod_sysfs_init. 
*/ static int __init param_sysfs_init(void) { module_kset = kset_create_and_add("module", &module_uevent_ops, NULL); if (!module_kset) { printk(KERN_WARNING "%s (%d): error creating kset\n", __FILE__, __LINE__); return -ENOMEM; } return 0; } subsys_initcall(param_sysfs_init); /* * param_sysfs_builtin_init - add sysfs version and parameter * attributes for built-in modules */ static int __init param_sysfs_builtin_init(void) { if (!module_kset) return -ENOMEM; version_sysfs_builtin(); param_sysfs_builtin(); return 0; } late_initcall(param_sysfs_builtin_init); #endif /* CONFIG_SYSFS */ #ifdef CONFIG_MODULES /* * module_destroy_params - free all parameters for one module * @params: module parameters (array) * @num: number of module parameters */ void module_destroy_params(const struct kernel_param *params, unsigned int num) { unsigned int i; for (i = 0; i < num; i++) if (params[i].ops->free) params[i].ops->free(params[i].arg); } #endif /* CONFIG_MODULES */ |
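parse_one() matches parameter names with parameq(), which treats '-' and '_' as interchangeable, so either spelling of a name on the command line reaches the same handler. A standalone sketch of that comparison follows; it is reimplemented here for illustration and is not the exported kernel symbols.

/*
 * Sketch only: dash/underscore-insensitive name match in the style of
 * parameq()/parameqn().
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static char dash2underscore(char c)
{
	return c == '-' ? '_' : c;
}

static bool parameqn(const char *a, const char *b, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (dash2underscore(a[i]) != dash2underscore(b[i]))
			return false;
	return true;
}

static bool parameq(const char *a, const char *b)
{
	/* compare through the NUL so the names must be the same length */
	return parameqn(a, b, strlen(a) + 1);
}

int main(void)
{
	printf("%d\n", parameq("no-hlt", "no_hlt"));	/* 1: '-' and '_' are interchangeable */
	printf("%d\n", parameq("no_hlt", "nohlt"));	/* 0: different names */
	return 0;
}

Because the comparison runs through the terminating NUL of the first argument, names of different lengths cannot match by accident.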
| 3 3 2 2 1 1 1 1 1 1 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_NEIGHBOUR_H #define _NET_NEIGHBOUR_H #include <linux/neighbour.h> /* * Generic neighbour manipulation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * * Changes: * * Harald Welte: <laforge@gnumonks.org> * - Add neighbour cache statistics like rtstat */ #include <linux/atomic.h> #include <linux/refcount.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rcupdate.h> #include <linux/seq_file.h> #include <linux/bitmap.h> #include <linux/err.h> #include <linux/sysctl.h> #include <linux/workqueue.h> #include <net/rtnetlink.h> #include <net/neighbour_tables.h> /* * NUD stands for "neighbor unreachability detection" */ #define NUD_IN_TIMER (NUD_INCOMPLETE|NUD_REACHABLE|NUD_DELAY|NUD_PROBE) #define NUD_VALID (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE|NUD_PROBE|NUD_STALE|NUD_DELAY) #define NUD_CONNECTED (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE) struct neighbour; enum { NEIGH_VAR_MCAST_PROBES, NEIGH_VAR_UCAST_PROBES, NEIGH_VAR_APP_PROBES, NEIGH_VAR_MCAST_REPROBES, NEIGH_VAR_RETRANS_TIME, NEIGH_VAR_BASE_REACHABLE_TIME, NEIGH_VAR_DELAY_PROBE_TIME, 
NEIGH_VAR_INTERVAL_PROBE_TIME_MS, NEIGH_VAR_GC_STALETIME, NEIGH_VAR_QUEUE_LEN_BYTES, NEIGH_VAR_PROXY_QLEN, NEIGH_VAR_ANYCAST_DELAY, NEIGH_VAR_PROXY_DELAY, NEIGH_VAR_LOCKTIME, #define NEIGH_VAR_DATA_MAX (NEIGH_VAR_LOCKTIME + 1) /* Following are used as a second way to access one of the above */ NEIGH_VAR_QUEUE_LEN, /* same data as NEIGH_VAR_QUEUE_LEN_BYTES */ NEIGH_VAR_RETRANS_TIME_MS, /* same data as NEIGH_VAR_RETRANS_TIME */ NEIGH_VAR_BASE_REACHABLE_TIME_MS, /* same data as NEIGH_VAR_BASE_REACHABLE_TIME */ /* Following are used by "default" only */ NEIGH_VAR_GC_INTERVAL, NEIGH_VAR_GC_THRESH1, NEIGH_VAR_GC_THRESH2, NEIGH_VAR_GC_THRESH3, NEIGH_VAR_MAX }; struct neigh_parms { possible_net_t net; struct net_device *dev; netdevice_tracker dev_tracker; struct list_head list; int (*neigh_setup)(struct neighbour *); struct neigh_table *tbl; void *sysctl_table; int dead; refcount_t refcnt; struct rcu_head rcu_head; int reachable_time; u32 qlen; int data[NEIGH_VAR_DATA_MAX]; DECLARE_BITMAP(data_state, NEIGH_VAR_DATA_MAX); }; static inline void neigh_var_set(struct neigh_parms *p, int index, int val) { set_bit(index, p->data_state); WRITE_ONCE(p->data[index], val); } #define __NEIGH_VAR(p, attr) ((p)->data[NEIGH_VAR_ ## attr]) #define NEIGH_VAR(p, attr) READ_ONCE(__NEIGH_VAR(p, attr)) #define NEIGH_VAR_PTR(p, attr) (&(__NEIGH_VAR(p, attr))) /* In ndo_neigh_setup, NEIGH_VAR_INIT should be used. * In other cases, NEIGH_VAR_SET should be used. */ #define NEIGH_VAR_INIT(p, attr, val) (__NEIGH_VAR(p, attr) = val) #define NEIGH_VAR_SET(p, attr, val) neigh_var_set(p, NEIGH_VAR_ ## attr, val) static inline void neigh_parms_data_state_setall(struct neigh_parms *p) { bitmap_fill(p->data_state, NEIGH_VAR_DATA_MAX); } static inline void neigh_parms_data_state_cleanall(struct neigh_parms *p) { bitmap_zero(p->data_state, NEIGH_VAR_DATA_MAX); } struct neigh_statistics { unsigned long allocs; /* number of allocated neighs */ unsigned long destroys; /* number of destroyed neighs */ unsigned long hash_grows; /* number of hash resizes */ unsigned long res_failed; /* number of failed resolutions */ unsigned long lookups; /* number of lookups */ unsigned long hits; /* number of hits (among lookups) */ unsigned long rcv_probes_mcast; /* number of received mcast ipv6 */ unsigned long rcv_probes_ucast; /* number of received ucast ipv6 */ unsigned long periodic_gc_runs; /* number of periodic GC runs */ unsigned long forced_gc_runs; /* number of forced GC runs */ unsigned long unres_discards; /* number of unresolved drops */ unsigned long table_fulls; /* times even gc couldn't help */ }; #define NEIGH_CACHE_STAT_INC(tbl, field) this_cpu_inc((tbl)->stats->field) struct neighbour { struct hlist_node hash; struct hlist_node dev_list; struct neigh_table *tbl; struct neigh_parms *parms; unsigned long confirmed; unsigned long updated; rwlock_t lock; refcount_t refcnt; unsigned int arp_queue_len_bytes; struct sk_buff_head arp_queue; struct timer_list timer; unsigned long used; atomic_t probes; u8 nud_state; u8 type; u8 dead; u8 protocol; u32 flags; seqlock_t ha_lock; unsigned char ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))] __aligned(8); struct hh_cache hh; int (*output)(struct neighbour *, struct sk_buff *); const struct neigh_ops *ops; struct list_head gc_list; struct list_head managed_list; struct rcu_head rcu; struct net_device *dev; netdevice_tracker dev_tracker; u8 primary_key[]; } __randomize_layout; struct neigh_ops { int family; void (*solicit)(struct neighbour *, struct sk_buff *); void (*error_report)(struct neighbour 
*, struct sk_buff *); int (*output)(struct neighbour *, struct sk_buff *); int (*connected_output)(struct neighbour *, struct sk_buff *); }; struct pneigh_entry { struct pneigh_entry __rcu *next; possible_net_t net; struct net_device *dev; netdevice_tracker dev_tracker; union { struct list_head free_node; struct rcu_head rcu; }; u32 flags; u8 protocol; bool permanent; u32 key[]; }; /* * neighbour table manipulation */ #define NEIGH_NUM_HASH_RND 4 struct neigh_hash_table { struct hlist_head *hash_heads; unsigned int hash_shift; __u32 hash_rnd[NEIGH_NUM_HASH_RND]; struct rcu_head rcu; }; struct neigh_table { int family; unsigned int entry_size; unsigned int key_len; __be16 protocol; __u32 (*hash)(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); bool (*key_eq)(const struct neighbour *, const void *pkey); int (*constructor)(struct neighbour *); int (*pconstructor)(struct pneigh_entry *); void (*pdestructor)(struct pneigh_entry *); void (*proxy_redo)(struct sk_buff *skb); int (*is_multicast)(const void *pkey); bool (*allow_add)(const struct net_device *dev, struct netlink_ext_ack *extack); char *id; struct neigh_parms parms; struct list_head parms_list; int gc_interval; int gc_thresh1; int gc_thresh2; int gc_thresh3; unsigned long last_flush; struct delayed_work gc_work; struct delayed_work managed_work; struct timer_list proxy_timer; struct sk_buff_head proxy_queue; atomic_t entries; atomic_t gc_entries; struct list_head gc_list; struct list_head managed_list; spinlock_t lock; unsigned long last_rand; struct neigh_statistics __percpu *stats; struct neigh_hash_table __rcu *nht; struct mutex phash_lock; struct pneigh_entry __rcu **phash_buckets; }; static inline int neigh_parms_family(struct neigh_parms *p) { return p->tbl->family; } #define NEIGH_PRIV_ALIGN sizeof(long long) #define NEIGH_ENTRY_SIZE(size) ALIGN((size), NEIGH_PRIV_ALIGN) static inline void *neighbour_priv(const struct neighbour *n) { return (char *)n + n->tbl->entry_size; } /* flags for neigh_update() */ #define NEIGH_UPDATE_F_OVERRIDE BIT(0) #define NEIGH_UPDATE_F_WEAK_OVERRIDE BIT(1) #define NEIGH_UPDATE_F_OVERRIDE_ISROUTER BIT(2) #define NEIGH_UPDATE_F_USE BIT(3) #define NEIGH_UPDATE_F_MANAGED BIT(4) #define NEIGH_UPDATE_F_EXT_LEARNED BIT(5) #define NEIGH_UPDATE_F_ISROUTER BIT(6) #define NEIGH_UPDATE_F_ADMIN BIT(7) #define NEIGH_UPDATE_F_EXT_VALIDATED BIT(8) /* In-kernel representation for NDA_FLAGS_EXT flags: */ #define NTF_OLD_MASK 0xff #define NTF_EXT_SHIFT 8 #define NTF_EXT_MASK (NTF_EXT_MANAGED | NTF_EXT_EXT_VALIDATED) #define NTF_MANAGED (NTF_EXT_MANAGED << NTF_EXT_SHIFT) #define NTF_EXT_VALIDATED (NTF_EXT_EXT_VALIDATED << NTF_EXT_SHIFT) extern const struct nla_policy nda_policy[]; #define neigh_for_each_in_bucket(pos, head) hlist_for_each_entry(pos, head, hash) #define neigh_for_each_in_bucket_rcu(pos, head) \ hlist_for_each_entry_rcu(pos, head, hash) #define neigh_for_each_in_bucket_safe(pos, tmp, head) \ hlist_for_each_entry_safe(pos, tmp, head, hash) static inline bool neigh_key_eq32(const struct neighbour *n, const void *pkey) { return *(const u32 *)n->primary_key == *(const u32 *)pkey; } static inline bool neigh_key_eq128(const struct neighbour *n, const void *pkey) { const u32 *n32 = (const u32 *)n->primary_key; const u32 *p32 = pkey; return ((n32[0] ^ p32[0]) | (n32[1] ^ p32[1]) | (n32[2] ^ p32[2]) | (n32[3] ^ p32[3])) == 0; } static inline struct neighbour *___neigh_lookup_noref( struct neigh_table *tbl, bool (*key_eq)(const struct neighbour *n, const void *pkey), __u32 (*hash)(const void 
*pkey, const struct net_device *dev, __u32 *hash_rnd), const void *pkey, struct net_device *dev) { struct neigh_hash_table *nht = rcu_dereference(tbl->nht); struct neighbour *n; u32 hash_val; hash_val = hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[hash_val]) if (n->dev == dev && key_eq(n, pkey)) return n; return NULL; } static inline struct neighbour *__neigh_lookup_noref(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { return ___neigh_lookup_noref(tbl, tbl->key_eq, tbl->hash, pkey, dev); } static inline void neigh_confirm(struct neighbour *n) { if (n) { unsigned long now = jiffies; /* avoid dirtying neighbour */ if (READ_ONCE(n->confirmed) != now) WRITE_ONCE(n->confirmed, now); } } void neigh_table_init(int index, struct neigh_table *tbl); int neigh_table_clear(int index, struct neigh_table *tbl); struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev); struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref); static inline struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { return __neigh_create(tbl, pkey, dev, true); } void neigh_destroy(struct neighbour *neigh); int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, const bool immediate_ok); int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid); void __neigh_set_probe_once(struct neighbour *neigh); bool neigh_remove_one(struct neighbour *ndel); void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev); int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev); int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev); int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb); int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb); int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb); struct neighbour *neigh_event_ns(struct neigh_table *tbl, u8 *lladdr, void *saddr, struct net_device *dev); struct neigh_parms *neigh_parms_alloc(struct net_device *dev, struct neigh_table *tbl); void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms); static inline struct net *neigh_parms_net(const struct neigh_parms *parms) { return read_pnet(&parms->net); } unsigned long neigh_rand_reach_time(unsigned long base); static inline void neigh_set_reach_time(struct neigh_parms *p) { unsigned long base = NEIGH_VAR(p, BASE_REACHABLE_TIME); WRITE_ONCE(p->reachable_time, neigh_rand_reach_time(base)); } void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb); struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); int pneigh_create(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev, u32 flags, u8 protocol, bool permanent); int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); static inline struct net *pneigh_net(const struct pneigh_entry *pneigh) { return read_pnet(&pneigh->net); } void neigh_app_ns(struct neighbour *n); void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie); void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)); int neigh_xmit(int fam, struct net_device *, const void *, struct sk_buff *); struct neigh_seq_state { struct 
seq_net_private p; struct neigh_table *tbl; struct neigh_hash_table *nht; void *(*neigh_sub_iter)(struct neigh_seq_state *state, struct neighbour *n, loff_t *pos); unsigned int bucket; unsigned int flags; #define NEIGH_SEQ_NEIGH_ONLY 0x00000001 #define NEIGH_SEQ_IS_PNEIGH 0x00000002 #define NEIGH_SEQ_SKIP_NOARP 0x00000004 }; void *neigh_seq_start(struct seq_file *, loff_t *, struct neigh_table *, unsigned int); void *neigh_seq_next(struct seq_file *, void *, loff_t *); void neigh_seq_stop(struct seq_file *, void *); int neigh_proc_dointvec(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int neigh_proc_dointvec_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int neigh_proc_dointvec_ms_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, proc_handler *proc_handler); void neigh_sysctl_unregister(struct neigh_parms *p); static inline void __neigh_parms_put(struct neigh_parms *parms) { refcount_dec(&parms->refcnt); } static inline struct neigh_parms *neigh_parms_clone(struct neigh_parms *parms) { refcount_inc(&parms->refcnt); return parms; } /* * Neighbour references */ static inline void neigh_release(struct neighbour *neigh) { if (refcount_dec_and_test(&neigh->refcnt)) neigh_destroy(neigh); } static inline struct neighbour * neigh_clone(struct neighbour *neigh) { if (neigh) refcount_inc(&neigh->refcnt); return neigh; } #define neigh_hold(n) refcount_inc(&(n)->refcnt) static __always_inline int neigh_event_send_probe(struct neighbour *neigh, struct sk_buff *skb, const bool immediate_ok) { unsigned long now = jiffies; if (READ_ONCE(neigh->used) != now) WRITE_ONCE(neigh->used, now); if (!(READ_ONCE(neigh->nud_state) & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))) return __neigh_event_send(neigh, skb, immediate_ok); return 0; } static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) { return neigh_event_send_probe(neigh, skb, true); } #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) { unsigned int seq, hh_alen; do { seq = read_seqbegin(&hh->hh_lock); hh_alen = HH_DATA_ALIGN(ETH_HLEN); memcpy(skb->data - hh_alen, hh->hh_data, ETH_ALEN + hh_alen - ETH_HLEN); } while (read_seqretry(&hh->hh_lock, seq)); return 0; } #endif static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb) { unsigned int hh_alen = 0; unsigned int seq; unsigned int hh_len; do { seq = read_seqbegin(&hh->hh_lock); hh_len = READ_ONCE(hh->hh_len); if (likely(hh_len <= HH_DATA_MOD)) { hh_alen = HH_DATA_MOD; /* skb_push() would proceed silently if we have room for * the unaligned size but not for the aligned size: * check headroom explicitly. */ if (likely(skb_headroom(skb) >= HH_DATA_MOD)) { /* this is inlined by gcc */ memcpy(skb->data - HH_DATA_MOD, hh->hh_data, HH_DATA_MOD); } } else { hh_alen = HH_DATA_ALIGN(hh_len); if (likely(skb_headroom(skb) >= hh_alen)) { memcpy(skb->data - hh_alen, hh->hh_data, hh_alen); } } } while (read_seqretry(&hh->hh_lock, seq)); if (WARN_ON_ONCE(skb_headroom(skb) < hh_alen)) { kfree_skb(skb); return NET_XMIT_DROP; } __skb_push(skb, hh_len); return dev_queue_xmit(skb); } static inline int neigh_output(struct neighbour *n, struct sk_buff *skb, bool skip_cache) { const struct hh_cache *hh = &n->hh; /* n->nud_state and hh->hh_len could be changed under us. * neigh_hh_output() is taking care of the race later. 
*/ if (!skip_cache && (READ_ONCE(n->nud_state) & NUD_CONNECTED) && READ_ONCE(hh->hh_len)) return neigh_hh_output(hh, skb); return READ_ONCE(n->output)(n, skb); } static inline struct neighbour * __neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev, int creat) { struct neighbour *n = neigh_lookup(tbl, pkey, dev); if (n || !creat) return n; n = neigh_create(tbl, pkey, dev); return IS_ERR(n) ? NULL : n; } static inline struct neighbour * __neigh_lookup_errno(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { struct neighbour *n = neigh_lookup(tbl, pkey, dev); if (n) return n; return neigh_create(tbl, pkey, dev); } struct neighbour_cb { unsigned long sched_next; unsigned int flags; }; #define LOCALLY_ENQUEUED 0x1 #define NEIGH_CB(skb) ((struct neighbour_cb *)(skb)->cb) static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n, const struct net_device *dev) { unsigned int seq; do { seq = read_seqbegin(&n->ha_lock); memcpy(dst, n->ha, dev->addr_len); } while (read_seqretry(&n->ha_lock, seq)); } static inline void neigh_update_is_router(struct neighbour *neigh, u32 flags, int *notify) { u8 ndm_flags = 0; ndm_flags |= (flags & NEIGH_UPDATE_F_ISROUTER) ? NTF_ROUTER : 0; if ((neigh->flags ^ ndm_flags) & NTF_ROUTER) { if (ndm_flags & NTF_ROUTER) neigh->flags |= NTF_ROUTER; else neigh->flags &= ~NTF_ROUTER; *notify = 1; } } #endif
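/*
 * Illustrative sketch (not part of this header): tuning a per-device
 * neighbour parameter from a driver's ndo_neigh_setup() callback, where the
 * comment above says NEIGH_VAR_INIT() is the accessor to use.
 * example_ndo_neigh_setup() and the chosen value are hypothetical.
 */
static int example_ndo_neigh_setup(struct net_device *dev,
                                   struct neigh_parms *parms)
{
        /* Give slow links more time between unicast probes (jiffies). */
        NEIGH_VAR_INIT(parms, RETRANS_TIME, 2 * HZ);
        return 0;
}

/*
 * Illustrative sketch: transmitting an skb to an IPv4 next hop through the
 * lookup and output helpers above.  It assumes arp_tbl from <net/arp.h> and
 * kfree_skb() from <linux/skbuff.h>; example_neigh_xmit() is hypothetical,
 * and a real fast path would normally stay under RCU and use the *_noref
 * lookup helpers instead of taking a reference.
 */
static int example_neigh_xmit(struct net_device *dev, __be32 nexthop,
                              struct sk_buff *skb)
{
        struct neighbour *n;
        int ret;

        /* Look the entry up, creating it if it does not exist yet. */
        n = __neigh_lookup(&arp_tbl, &nexthop, dev, 1);
        if (!n) {
                kfree_skb(skb);
                return -ENOMEM;
        }

        /*
         * neigh_output() uses the cached hardware header when the entry is
         * NUD_CONNECTED and otherwise falls back to n->output(), e.g.
         * neigh_resolve_output(), which queues the skb and probes the peer.
         */
        ret = neigh_output(n, skb, false);

        neigh_release(n);
        return ret;
}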
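/*
 * A note on the sketches above: they are usage illustrations under the
 * stated assumptions, not code taken from the kernel; the neighbour API
 * itself is exactly as declared in this header.
 */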
/*
SPDX-License-Identifier: GPL-2.0 */ /* * workqueue.h --- work queue handling for Linux. */ #ifndef _LINUX_WORKQUEUE_H #define _LINUX_WORKQUEUE_H #include <linux/alloc_tag.h> #include <linux/timer.h> #include <linux/linkage.h> #include <linux/bitops.h> #include <linux/lockdep.h> #include <linux/threads.h> #include <linux/atomic.h> #include <linux/cpumask_types.h> #include <linux/rcupdate.h> #include <linux/workqueue_types.h> /* * The first word is the work queue pointer and the flags rolled into * one */ #define work_data_bits(work) ((unsigned long *)(&(work)->data)) enum work_bits { WORK_STRUCT_PENDING_BIT = 0, /* work item is pending execution */ WORK_STRUCT_INACTIVE_BIT, /* work item is inactive */ WORK_STRUCT_PWQ_BIT, /* data points to pwq */ WORK_STRUCT_LINKED_BIT, /* next work is linked to this one */ #ifdef CONFIG_DEBUG_OBJECTS_WORK WORK_STRUCT_STATIC_BIT, /* static initializer (debugobjects) */ #endif WORK_STRUCT_FLAG_BITS, /* color for workqueue flushing */ WORK_STRUCT_COLOR_SHIFT = WORK_STRUCT_FLAG_BITS, WORK_STRUCT_COLOR_BITS = 4, /* * When WORK_STRUCT_PWQ is set, reserve 8 bits off of pwq pointer w/ * debugobjects turned off. This makes pwqs aligned to 256 bytes (512 * bytes w/ DEBUG_OBJECTS_WORK) and allows 16 workqueue flush colors. * * MSB * [ pwq pointer ] [ flush color ] [ STRUCT flags ] * 4 bits 4 or 5 bits */ WORK_STRUCT_PWQ_SHIFT = WORK_STRUCT_COLOR_SHIFT + WORK_STRUCT_COLOR_BITS, /* * data contains off-queue information when !WORK_STRUCT_PWQ. * * MSB * [ pool ID ] [ disable depth ] [ OFFQ flags ] [ STRUCT flags ] * 16 bits 1 bit 4 or 5 bits */ WORK_OFFQ_FLAG_SHIFT = WORK_STRUCT_FLAG_BITS, WORK_OFFQ_BH_BIT = WORK_OFFQ_FLAG_SHIFT, WORK_OFFQ_FLAG_END, WORK_OFFQ_FLAG_BITS = WORK_OFFQ_FLAG_END - WORK_OFFQ_FLAG_SHIFT, WORK_OFFQ_DISABLE_SHIFT = WORK_OFFQ_FLAG_SHIFT + WORK_OFFQ_FLAG_BITS, WORK_OFFQ_DISABLE_BITS = 16, /* * When a work item is off queue, the high bits encode off-queue flags * and the last pool it was on. Cap pool ID to 31 bits and use the * highest number to indicate that no pool is associated. */ WORK_OFFQ_POOL_SHIFT = WORK_OFFQ_DISABLE_SHIFT + WORK_OFFQ_DISABLE_BITS, WORK_OFFQ_LEFT = BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT, WORK_OFFQ_POOL_BITS = WORK_OFFQ_LEFT <= 31 ? WORK_OFFQ_LEFT : 31, }; enum work_flags { WORK_STRUCT_PENDING = 1 << WORK_STRUCT_PENDING_BIT, WORK_STRUCT_INACTIVE = 1 << WORK_STRUCT_INACTIVE_BIT, WORK_STRUCT_PWQ = 1 << WORK_STRUCT_PWQ_BIT, WORK_STRUCT_LINKED = 1 << WORK_STRUCT_LINKED_BIT, #ifdef CONFIG_DEBUG_OBJECTS_WORK WORK_STRUCT_STATIC = 1 << WORK_STRUCT_STATIC_BIT, #else WORK_STRUCT_STATIC = 0, #endif }; enum wq_misc_consts { WORK_NR_COLORS = (1 << WORK_STRUCT_COLOR_BITS), /* not bound to any CPU, prefer the local CPU */ WORK_CPU_UNBOUND = NR_CPUS, /* bit mask for work_busy() return values */ WORK_BUSY_PENDING = 1 << 0, WORK_BUSY_RUNNING = 1 << 1, /* maximum string length for set_worker_desc() */ WORKER_DESC_LEN = 32, }; /* Convenience constants - of type 'unsigned long', not 'enum'! 
*/ #define WORK_OFFQ_BH (1ul << WORK_OFFQ_BH_BIT) #define WORK_OFFQ_FLAG_MASK (((1ul << WORK_OFFQ_FLAG_BITS) - 1) << WORK_OFFQ_FLAG_SHIFT) #define WORK_OFFQ_DISABLE_MASK (((1ul << WORK_OFFQ_DISABLE_BITS) - 1) << WORK_OFFQ_DISABLE_SHIFT) #define WORK_OFFQ_POOL_NONE ((1ul << WORK_OFFQ_POOL_BITS) - 1) #define WORK_STRUCT_NO_POOL (WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT) #define WORK_STRUCT_PWQ_MASK (~((1ul << WORK_STRUCT_PWQ_SHIFT) - 1)) #define WORK_DATA_INIT() ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL) #define WORK_DATA_STATIC_INIT() \ ATOMIC_LONG_INIT((unsigned long)(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC)) struct delayed_work { struct work_struct work; struct timer_list timer; /* target workqueue and CPU ->timer uses to queue ->work */ struct workqueue_struct *wq; int cpu; }; struct rcu_work { struct work_struct work; struct rcu_head rcu; /* target workqueue ->rcu uses to queue ->work */ struct workqueue_struct *wq; }; enum wq_affn_scope { WQ_AFFN_DFL, /* use system default */ WQ_AFFN_CPU, /* one pod per CPU */ WQ_AFFN_SMT, /* one pod per SMT */ WQ_AFFN_CACHE, /* one pod per LLC */ WQ_AFFN_CACHE_SHARD, /* synthetic sub-LLC shards */ WQ_AFFN_NUMA, /* one pod per NUMA node */ WQ_AFFN_SYSTEM, /* one pod across the whole system */ WQ_AFFN_NR_TYPES, }; /** * struct workqueue_attrs - A struct for workqueue attributes. * * This can be used to change attributes of an unbound workqueue. */ struct workqueue_attrs { /** * @nice: nice level */ int nice; /** * @cpumask: allowed CPUs * * Work items in this workqueue are affine to these CPUs and not allowed * to execute on other CPUs. A pool serving a workqueue must have the * same @cpumask. */ cpumask_var_t cpumask; /** * @__pod_cpumask: internal attribute used to create per-pod pools * * Internal use only. * * Per-pod unbound worker pools are used to improve locality. Always a * subset of ->cpumask. A workqueue can be associated with multiple * worker pools with disjoint @__pod_cpumask's. Whether the enforcement * of a pool's @__pod_cpumask is strict depends on @affn_strict. */ cpumask_var_t __pod_cpumask; /** * @affn_strict: affinity scope is strict * * If clear, workqueue will make a best-effort attempt at starting the * worker inside @__pod_cpumask but the scheduler is free to migrate it * outside. * * If set, workers are only allowed to run inside @__pod_cpumask. */ bool affn_strict; /* * Below fields aren't properties of a worker_pool. They only modify how * :c:func:`apply_workqueue_attrs` select pools and thus don't * participate in pool hash calculations or equality comparisons. * * If @affn_strict is set, @cpumask isn't a property of a worker_pool * either. */ /** * @affn_scope: unbound CPU affinity scope * * CPU pods are used to improve execution locality of unbound work * items. There are multiple pod types, one for each wq_affn_scope, and * every CPU in the system belongs to one pod in every pod type. CPUs * that belong to the same pod share the worker pool. For example, * selecting %WQ_AFFN_NUMA makes the workqueue use a separate worker * pool for each NUMA node. 
*/ enum wq_affn_scope affn_scope; /** * @ordered: work items must be executed one by one in queueing order */ bool ordered; }; static inline struct delayed_work *to_delayed_work(struct work_struct *work) { return container_of(work, struct delayed_work, work); } static inline struct rcu_work *to_rcu_work(struct work_struct *work) { return container_of(work, struct rcu_work, work); } struct execute_work { struct work_struct work; }; #ifdef CONFIG_LOCKDEP /* * NB: because we have to copy the lockdep_map, setting _key * here is required, otherwise it could get initialised to the * copy of the lockdep_map! */ #define __WORK_INIT_LOCKDEP_MAP(n, k) \ .lockdep_map = STATIC_LOCKDEP_MAP_INIT(n, k), #else #define __WORK_INIT_LOCKDEP_MAP(n, k) #endif #define __WORK_INITIALIZER(n, f) { \ .data = WORK_DATA_STATIC_INIT(), \ .entry = { &(n).entry, &(n).entry }, \ .func = (f), \ __WORK_INIT_LOCKDEP_MAP(#n, &(n)) \ } #define __DELAYED_WORK_INITIALIZER(n, f, tflags) { \ .work = __WORK_INITIALIZER((n).work, (f)), \ .timer = __TIMER_INITIALIZER(delayed_work_timer_fn,\ (tflags) | TIMER_IRQSAFE), \ } #define DECLARE_WORK(n, f) \ struct work_struct n = __WORK_INITIALIZER(n, f) #define DECLARE_DELAYED_WORK(n, f) \ struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, 0) #define DECLARE_DEFERRABLE_WORK(n, f) \ struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, TIMER_DEFERRABLE) #ifdef CONFIG_DEBUG_OBJECTS_WORK extern void __init_work(struct work_struct *work, int onstack); extern void destroy_work_on_stack(struct work_struct *work); extern void destroy_delayed_work_on_stack(struct delayed_work *work); static inline unsigned int work_static(struct work_struct *work) { return *work_data_bits(work) & WORK_STRUCT_STATIC; } #else static inline void __init_work(struct work_struct *work, int onstack) { } static inline void destroy_work_on_stack(struct work_struct *work) { } static inline void destroy_delayed_work_on_stack(struct delayed_work *work) { } static inline unsigned int work_static(struct work_struct *work) { return 0; } #endif /* * initialize all of a work item in one go * * NOTE! No point in using "atomic_long_set()": using a direct * assignment of the work data initializer allows the compiler * to generate better code. 
*/ #ifdef CONFIG_LOCKDEP #define __INIT_WORK_KEY(_work, _func, _onstack, _key) \ do { \ __init_work((_work), _onstack); \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, (_key), 0); \ INIT_LIST_HEAD(&(_work)->entry); \ (_work)->func = (_func); \ } while (0) #else #define __INIT_WORK_KEY(_work, _func, _onstack, _key) \ do { \ __init_work((_work), _onstack); \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ INIT_LIST_HEAD(&(_work)->entry); \ (_work)->func = (_func); \ } while (0) #endif #define __INIT_WORK(_work, _func, _onstack) \ do { \ static __maybe_unused struct lock_class_key __key; \ \ __INIT_WORK_KEY(_work, _func, _onstack, &__key); \ } while (0) #define INIT_WORK(_work, _func) \ __INIT_WORK((_work), (_func), 0) #define INIT_WORK_ONSTACK(_work, _func) \ __INIT_WORK((_work), (_func), 1) #define INIT_WORK_ONSTACK_KEY(_work, _func, _key) \ __INIT_WORK_KEY((_work), (_func), 1, _key) #define __INIT_DELAYED_WORK(_work, _func, _tflags) \ do { \ INIT_WORK(&(_work)->work, (_func)); \ __timer_init(&(_work)->timer, \ delayed_work_timer_fn, \ (_tflags) | TIMER_IRQSAFE); \ } while (0) #define __INIT_DELAYED_WORK_ONSTACK(_work, _func, _tflags) \ do { \ INIT_WORK_ONSTACK(&(_work)->work, (_func)); \ __timer_init_on_stack(&(_work)->timer, \ delayed_work_timer_fn, \ (_tflags) | TIMER_IRQSAFE); \ } while (0) #define INIT_DELAYED_WORK(_work, _func) \ __INIT_DELAYED_WORK(_work, _func, 0) #define INIT_DELAYED_WORK_ONSTACK(_work, _func) \ __INIT_DELAYED_WORK_ONSTACK(_work, _func, 0) #define INIT_DEFERRABLE_WORK(_work, _func) \ __INIT_DELAYED_WORK(_work, _func, TIMER_DEFERRABLE) #define INIT_DEFERRABLE_WORK_ONSTACK(_work, _func) \ __INIT_DELAYED_WORK_ONSTACK(_work, _func, TIMER_DEFERRABLE) #define INIT_RCU_WORK(_work, _func) \ INIT_WORK(&(_work)->work, (_func)) #define INIT_RCU_WORK_ONSTACK(_work, _func) \ INIT_WORK_ONSTACK(&(_work)->work, (_func)) /** * work_pending - Find out whether a work item is currently pending * @work: The work item in question */ #define work_pending(work) \ test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) /** * delayed_work_pending - Find out whether a delayable work item is currently * pending * @w: The work item in question */ #define delayed_work_pending(w) \ work_pending(&(w)->work) /* * Workqueue flags and constants. For details, please refer to * Documentation/core-api/workqueue.rst. */ enum wq_flags { WQ_BH = 1 << 0, /* execute in bottom half (softirq) context */ WQ_UNBOUND = 1 << 1, /* not bound to any cpu */ WQ_FREEZABLE = 1 << 2, /* freeze during suspend */ WQ_MEM_RECLAIM = 1 << 3, /* may be used for memory reclaim */ WQ_HIGHPRI = 1 << 4, /* high priority */ WQ_CPU_INTENSIVE = 1 << 5, /* cpu intensive workqueue */ WQ_SYSFS = 1 << 6, /* visible in sysfs, see workqueue_sysfs_register() */ /* * Per-cpu workqueues are generally preferred because they tend to * show better performance thanks to cache locality. Per-cpu * workqueues exclude the scheduler from choosing the CPU to * execute the worker threads, which has an unfortunate side effect * of increasing power consumption. * * The scheduler considers a CPU idle if it doesn't have any task * to execute and tries to keep idle cores idle to conserve power; * however, for example, a per-cpu work item scheduled from an * interrupt handler on an idle CPU will force the scheduler to * execute the work item on that CPU breaking the idleness, which in * turn may lead to more scheduling choices which are sub-optimal * in terms of power consumption. 
* * Workqueues marked with WQ_POWER_EFFICIENT are per-cpu by default * but become unbound if workqueue.power_efficient kernel param is * specified. Per-cpu workqueues which are identified to * contribute significantly to power-consumption are identified and * marked with this flag and enabling the power_efficient mode * leads to noticeable power saving at the cost of small * performance disadvantage. * * http://thread.gmane.org/gmane.linux.kernel/1480396 */ WQ_POWER_EFFICIENT = 1 << 7, WQ_PERCPU = 1 << 8, /* bound to a specific cpu */ __WQ_DESTROYING = 1 << 15, /* internal: workqueue is destroying */ __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ /* BH wq only allows the following flags */ __WQ_BH_ALLOWS = WQ_BH | WQ_HIGHPRI | WQ_PERCPU, }; enum wq_consts { WQ_MAX_ACTIVE = 2048, /* I like 2048, better ideas? */ WQ_UNBOUND_MAX_ACTIVE = WQ_MAX_ACTIVE, WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2, /* * Per-node default cap on min_active. Unless explicitly set, min_active * is set to min(max_active, WQ_DFL_MIN_ACTIVE). For more details, see * workqueue_struct->min_active definition. */ WQ_DFL_MIN_ACTIVE = 8, }; /* * System-wide workqueues which are always present. * * system_percpu_wq is the one used by schedule[_delayed]_work[_on](). * Multi-CPU multi-threaded. There are users which expect relatively * short queue flush time. Don't queue works which can run for too * long. * * system_highpri_wq is similar to system_percpu_wq but for work items which * require WQ_HIGHPRI. * * system_long_wq is similar to system_percpu_wq but may host long running * works. Queue flushing might take relatively long. * * system_dfl_long_wq is similar to system_dfl_wq but it may host long running * works. * * system_dfl_wq is unbound workqueue. Workers are not bound to * any specific CPU, not concurrency managed, and all queued works are * executed immediately as long as max_active limit is not reached and * resources are available. * * system_freezable_wq is equivalent to system_percpu_wq except that it's * freezable. * * *_power_efficient_wq are inclined towards saving power and converted * into WQ_UNBOUND variants if 'wq_power_efficient' is enabled; otherwise, * they are same as their non-power-efficient counterparts - e.g. * system_power_efficient_wq is identical to system_percpu_wq if * 'wq_power_efficient' is disabled. See WQ_POWER_EFFICIENT for more info. * * system_bh[_highpri]_wq are convenience interface to softirq. BH work items * are executed in the queueing CPU's BH context in the queueing order. 
*/ extern struct workqueue_struct *system_wq; /* use system_percpu_wq, this will be removed */ extern struct workqueue_struct *system_percpu_wq; extern struct workqueue_struct *system_highpri_wq; extern struct workqueue_struct *system_long_wq; extern struct workqueue_struct *system_unbound_wq; extern struct workqueue_struct *system_dfl_wq; extern struct workqueue_struct *system_freezable_wq; extern struct workqueue_struct *system_power_efficient_wq; extern struct workqueue_struct *system_freezable_power_efficient_wq; extern struct workqueue_struct *system_bh_wq; extern struct workqueue_struct *system_bh_highpri_wq; extern struct workqueue_struct *system_dfl_long_wq; void workqueue_softirq_action(bool highpri); void workqueue_softirq_dead(unsigned int cpu); /** * alloc_workqueue - allocate a workqueue * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags * @max_active: max in-flight work items, 0 for default * @...: args for @fmt * * For a per-cpu workqueue, @max_active limits the number of in-flight work * items for each CPU. e.g. @max_active of 1 indicates that each CPU can be * executing at most one work item for the workqueue. * * For unbound workqueues, @max_active limits the number of in-flight work items * for the whole system. e.g. @max_active of 16 indicates that there can be * at most 16 work items executing for the workqueue in the whole system. * * As sharing the same active counter for an unbound workqueue across multiple * NUMA nodes can be expensive, @max_active is distributed to each NUMA node * according to the proportion of the number of online CPUs and enforced * independently. * * Depending on online CPU distribution, a node may end up with per-node * max_active which is significantly lower than @max_active, which can lead to * deadlocks if the per-node concurrency limit is lower than the maximum number * of interdependent work items for the workqueue. * * To guarantee forward progress regardless of online CPU distribution, the * concurrency limit on every node is guaranteed to be equal to or greater than * min_active which is set to min(@max_active, %WQ_DFL_MIN_ACTIVE). This means * that the sum of per-node max_active's may be larger than @max_active. * * For detailed information on %WQ_\* flags, please refer to * Documentation/core-api/workqueue.rst. * * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ __printf(1, 4) struct workqueue_struct * alloc_workqueue_noprof(const char *fmt, unsigned int flags, int max_active, ...); #define alloc_workqueue(...) alloc_hooks(alloc_workqueue_noprof(__VA_ARGS__)) /** * devm_alloc_workqueue - Resource-managed allocate a workqueue * @dev: Device to allocate workqueue for * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags * @max_active: max in-flight work items, 0 for default * @...: args for @fmt * * Resource managed workqueue, see alloc_workqueue() for details. * * The workqueue will be automatically destroyed on driver detach. Typically * this should be used in drivers already relying on devm interafaces. * * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. 
*/ __printf(2, 5) struct workqueue_struct * devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags, int max_active, ...); #ifdef CONFIG_LOCKDEP /** * alloc_workqueue_lockdep_map - allocate a workqueue with user-defined lockdep_map * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags * @max_active: max in-flight work items, 0 for default * @lockdep_map: user-defined lockdep_map * @...: args for @fmt * * Same as alloc_workqueue but with the a user-define lockdep_map. Useful for * workqueues created with the same purpose and to avoid leaking a lockdep_map * on each workqueue creation. * * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ __printf(1, 5) struct workqueue_struct * alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active, struct lockdep_map *lockdep_map, ...); /** * alloc_ordered_workqueue_lockdep_map - allocate an ordered workqueue with * user-defined lockdep_map * * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful) * @lockdep_map: user-defined lockdep_map * @args: args for @fmt * * Same as alloc_ordered_workqueue but with the a user-define lockdep_map. * Useful for workqueues created with the same purpose and to avoid leaking a * lockdep_map on each workqueue creation. * * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ #define alloc_ordered_workqueue_lockdep_map(fmt, flags, lockdep_map, args...) \ alloc_hooks(alloc_workqueue_lockdep_map(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags),\ 1, lockdep_map, ##args)) #endif /** * alloc_ordered_workqueue - allocate an ordered workqueue * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful) * @args: args for @fmt * * Allocate an ordered workqueue. An ordered workqueue executes at * most one work item at any given time in the queued order. They are * implemented as unbound workqueues with @max_active of one. * * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ #define alloc_ordered_workqueue(fmt, flags, args...) \ alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) #define devm_alloc_ordered_workqueue(dev, fmt, flags, args...) \ devm_alloc_workqueue(dev, fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) #define create_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM | WQ_PERCPU, 1, (name)) #define create_freezable_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND | \ WQ_MEM_RECLAIM, 1, (name)) #define create_singlethread_workqueue(name) \ alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name) #define from_work(var, callback_work, work_fieldname) \ container_of(callback_work, typeof(*var), work_fieldname) extern void destroy_workqueue(struct workqueue_struct *wq); struct workqueue_attrs *alloc_workqueue_attrs_noprof(void); #define alloc_workqueue_attrs(...) 
alloc_hooks(alloc_workqueue_attrs_noprof(__VA_ARGS__)) void free_workqueue_attrs(struct workqueue_attrs *attrs); int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs); extern int workqueue_unbound_housekeeping_update(const struct cpumask *hk); extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); extern bool queue_work_node(int node, struct workqueue_struct *wq, struct work_struct *work); extern bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *work, unsigned long delay); extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay); extern bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork); extern void __flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern int schedule_on_each_cpu(work_func_t func); int execute_in_process_context(work_func_t fn, struct execute_work *); extern bool flush_work(struct work_struct *work); extern bool cancel_work(struct work_struct *work); extern bool cancel_work_sync(struct work_struct *work); extern bool flush_delayed_work(struct delayed_work *dwork); extern bool cancel_delayed_work(struct delayed_work *dwork); extern bool cancel_delayed_work_sync(struct delayed_work *dwork); extern bool disable_work(struct work_struct *work); extern bool disable_work_sync(struct work_struct *work); extern bool enable_work(struct work_struct *work); extern bool disable_delayed_work(struct delayed_work *dwork); extern bool disable_delayed_work_sync(struct delayed_work *dwork); extern bool enable_delayed_work(struct delayed_work *dwork); extern bool flush_rcu_work(struct rcu_work *rwork); extern void workqueue_set_max_active(struct workqueue_struct *wq, int max_active); extern void workqueue_set_min_active(struct workqueue_struct *wq, int min_active); extern struct work_struct *current_work(void); extern bool current_is_workqueue_rescuer(void); extern bool workqueue_congested(int cpu, struct workqueue_struct *wq); extern unsigned int work_busy(struct work_struct *work); extern __printf(1, 2) void set_worker_desc(const char *fmt, ...); extern void print_worker_info(const char *log_lvl, struct task_struct *task); extern void show_all_workqueues(void); extern void show_freezable_workqueues(void); extern void show_one_workqueue(struct workqueue_struct *wq); extern void wq_worker_comm(char *buf, size_t size, struct task_struct *task); /** * queue_work - queue work on a workqueue * @wq: workqueue to use * @work: work to queue * * Returns %false if @work was already on a queue, %true otherwise. * * We queue the work to the CPU on which it was submitted, but if the CPU dies * it can be processed by another CPU. 
* * Memory-ordering properties: If it returns %true, guarantees that all stores * preceding the call to queue_work() in the program order will be visible from * the CPU which will execute @work by the time such work executes, e.g., * * { x is initially 0 } * * CPU0 CPU1 * * WRITE_ONCE(x, 1); [ @work is being executed ] * r0 = queue_work(wq, work); r1 = READ_ONCE(x); * * Forbids: r0 == true && r1 == 0 */ static inline bool queue_work(struct workqueue_struct *wq, struct work_struct *work) { return queue_work_on(WORK_CPU_UNBOUND, wq, work); } /** * queue_delayed_work - queue work on a workqueue after delay * @wq: workqueue to use * @dwork: delayable work to queue * @delay: number of jiffies to wait before queueing * * Equivalent to queue_delayed_work_on() but tries to use the local CPU. */ static inline bool queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); } /** * mod_delayed_work - modify delay of or queue a delayed work * @wq: workqueue to use * @dwork: work to queue * @delay: number of jiffies to wait before queueing * * mod_delayed_work_on() on local CPU. */ static inline bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); } /** * schedule_work_on - put work task on a specific cpu * @cpu: cpu to put the work task on * @work: job to be done * * This puts a job on a specific cpu */ static inline bool schedule_work_on(int cpu, struct work_struct *work) { return queue_work_on(cpu, system_percpu_wq, work); } /** * schedule_work - put work task in per-CPU workqueue * @work: job to be done * * Returns %false if @work was already on the system per-CPU workqueue and * %true otherwise. * * This puts a job in the system per-CPU workqueue if it was not already * queued and leaves it in the same position on the system per-CPU * workqueue otherwise. * * Shares the same memory-ordering properties of queue_work(), cf. the * DocBook header of queue_work(). */ static inline bool schedule_work(struct work_struct *work) { return queue_work(system_percpu_wq, work); } /** * enable_and_queue_work - Enable and queue a work item on a specific workqueue * @wq: The target workqueue * @work: The work item to be enabled and queued * * This function combines the operations of enable_work() and queue_work(), * providing a convenient way to enable and queue a work item in a single call. * It invokes enable_work() on @work and then queues it if the disable depth * reached 0. Returns %true if the disable depth reached 0 and @work is queued, * and %false otherwise. * * Note that @work is always queued when disable depth reaches zero. If the * desired behavior is queueing only if certain events took place while @work is * disabled, the user should implement the necessary state tracking and perform * explicit conditional queueing after enable_work(). */ static inline bool enable_and_queue_work(struct workqueue_struct *wq, struct work_struct *work) { if (enable_work(work)) { queue_work(wq, work); return true; } return false; } /* * Detect attempt to flush system-wide workqueues at compile time when possible. * Warn attempt to flush system-wide workqueues at runtime. * * See https://lkml.kernel.org/r/49925af7-78a8-a3dd-bce6-cfc02e1a9236@I-love.SAKURA.ne.jp * for reasons and steps for converting system-wide workqueues into local workqueues. 
extern void __warn_flushing_systemwide_wq(void) __compiletime_warning("Please avoid flushing system-wide workqueues."); /* Please stop using this function; it will be removed in the near future. */ #define flush_scheduled_work() \ ({ \ __warn_flushing_systemwide_wq(); \ __flush_workqueue(system_percpu_wq); \ }) #define flush_workqueue(wq) \ ({ \ struct workqueue_struct *_wq = (wq); \ \ if ((__builtin_constant_p(_wq == system_percpu_wq) && \ _wq == system_percpu_wq) || \ (__builtin_constant_p(_wq == system_highpri_wq) && \ _wq == system_highpri_wq) || \ (__builtin_constant_p(_wq == system_long_wq) && \ _wq == system_long_wq) || \ (__builtin_constant_p(_wq == system_dfl_long_wq) && \ _wq == system_dfl_long_wq) || \ (__builtin_constant_p(_wq == system_dfl_wq) && \ _wq == system_dfl_wq) || \ (__builtin_constant_p(_wq == system_freezable_wq) && \ _wq == system_freezable_wq) || \ (__builtin_constant_p(_wq == system_power_efficient_wq) && \ _wq == system_power_efficient_wq) || \ (__builtin_constant_p(_wq == system_freezable_power_efficient_wq) && \ _wq == system_freezable_power_efficient_wq)) \ __warn_flushing_systemwide_wq(); \ __flush_workqueue(_wq); \ }) /** * schedule_delayed_work_on - queue work in per-CPU workqueue on CPU after delay * @cpu: cpu to use * @dwork: job to be done * @delay: number of jiffies to wait * * After waiting for a given time this puts a job in the system per-CPU * workqueue on the specified CPU. */ static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { return queue_delayed_work_on(cpu, system_percpu_wq, dwork, delay); } /** * schedule_delayed_work - put work task in per-CPU workqueue after delay * @dwork: job to be done * @delay: number of jiffies to wait or 0 for immediate execution * * After waiting for a given time this puts a job in the system per-CPU * workqueue. */ static inline bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { return queue_delayed_work(system_percpu_wq, dwork, delay); } #ifndef CONFIG_SMP static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { return fn(arg); } static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) { return fn(arg); } #else long work_on_cpu_key(int cpu, long (*fn)(void *), void *arg, struct lock_class_key *key); /* * A new key is defined for each caller to make sure the work * associated with the function doesn't share its locking class. */ #define work_on_cpu(_cpu, _fn, _arg) \ ({ \ static struct lock_class_key __key; \ \ work_on_cpu_key(_cpu, _fn, _arg, &__key); \ }) #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER extern void freeze_workqueues_begin(void); extern bool freeze_workqueues_busy(void); extern void thaw_workqueues(void); #endif /* CONFIG_FREEZER */ #ifdef CONFIG_SYSFS int workqueue_sysfs_register(struct workqueue_struct *wq); #else /* CONFIG_SYSFS */ static inline int workqueue_sysfs_register(struct workqueue_struct *wq) { return 0; } #endif /* CONFIG_SYSFS */ #ifdef CONFIG_WQ_WATCHDOG void wq_watchdog_touch(int cpu); #else /* CONFIG_WQ_WATCHDOG */ static inline void wq_watchdog_touch(int cpu) { } #endif /* CONFIG_WQ_WATCHDOG */ #ifdef CONFIG_SMP int workqueue_prepare_cpu(unsigned int cpu); int workqueue_online_cpu(unsigned int cpu); int workqueue_offline_cpu(unsigned int cpu); #endif void __init workqueue_init_early(void); void __init workqueue_init(void); void __init workqueue_init_topology(void); #endif
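/*
 * Illustrative sketch (not part of this header): the typical lifetime of a
 * work item on a driver-private workqueue.  struct example_dev,
 * example_refresh_fn() and friends are hypothetical; only the workqueue and
 * work-item calls come from the API declared above.
 */
struct example_dev {
        struct workqueue_struct *wq;
        struct work_struct refresh_work;
};

static void example_refresh_fn(struct work_struct *work)
{
        struct example_dev *edev = from_work(edev, work, refresh_work);

        /* ... deferred processing for edev runs here, in process context ... */
}

static int example_dev_init(struct example_dev *edev)
{
        /* Unbound and freezable; a max_active of 0 selects the default. */
        edev->wq = alloc_workqueue("example_wq", WQ_UNBOUND | WQ_FREEZABLE, 0);
        if (!edev->wq)
                return -ENOMEM;

        INIT_WORK(&edev->refresh_work, example_refresh_fn);
        queue_work(edev->wq, &edev->refresh_work);
        return 0;
}

static void example_dev_exit(struct example_dev *edev)
{
        /* Make sure the callback is neither pending nor running ... */
        cancel_work_sync(&edev->refresh_work);
        /* ... before the queue itself is torn down. */
        destroy_workqueue(edev->wq);
}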
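/*
 * Illustrative sketch: a self-rearming poll loop built on delayed work.  The
 * names (example_poll_*, EXAMPLE_POLL_DELAY) are hypothetical; the queueing
 * and cancellation helpers are the ones declared above.
 */
#define EXAMPLE_POLL_DELAY      (HZ / 2)        /* hypothetical poll interval */

static void example_poll_fn(struct work_struct *work);

static DECLARE_DELAYED_WORK(example_poll_work, example_poll_fn);

static void example_poll_fn(struct work_struct *work)
{
        /* ... sample the hardware state ... */

        /* Re-arm on the system per-CPU workqueue. */
        schedule_delayed_work(&example_poll_work, EXAMPLE_POLL_DELAY);
}

static void example_poll_start(void)
{
        schedule_delayed_work(&example_poll_work, EXAMPLE_POLL_DELAY);
}

static void example_poll_stop(void)
{
        /* Cancels a pending instance and waits for a running callback. */
        cancel_delayed_work_sync(&example_poll_work);
}

/*
 * mod_delayed_work(system_percpu_wq, &example_poll_work, 0) could be used in
 * the same setup to pull the next poll forward to "now".
 */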
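/*
 * Illustrative sketch: pausing and resuming a work item with the disable /
 * enable helpers documented above.  example_sync_fn() and the pause/resume
 * wrappers are hypothetical.
 */
static void example_sync_fn(struct work_struct *work)
{
        /* ... flush cached state to the device ... */
}

static DECLARE_WORK(example_sync_work, example_sync_fn);

static void example_sync_pause(void)
{
        /* Raise the disable depth and wait for a running callback to finish. */
        disable_work_sync(&example_sync_work);
}

static void example_sync_resume(void)
{
        /*
         * Drop the disable depth; once it reaches zero the work is queued
         * unconditionally (see enable_and_queue_work() above).
         */
        enable_and_queue_work(system_percpu_wq, &example_sync_work);
}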
3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 
4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 
// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/kernel/printk.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 *
 * Modified to make sys_syslog() more flexible: added commands to
 * return the last 4k of kernel messages, regardless of whether
 * they've been read or not. Added option to suppress kernel printk's
 * to the console. Added hook for sending the console messages
 * elsewhere, in preparation for a serial line console (someday).
 * Ted Ts'o, 2/11/93.
 * Modified for sysctl support, 1/8/97, Chris Horn.
 * Fixed SMP synchronization, 08/08/99, Manfred Spraul
 *     manfred@colorfullife.com
 * Rewrote bits to get rid of console_lock
 *	01Mar01 Andrew Morton
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/tty.h>
#include <linux/tty_driver.h>
#include <linux/console.h>
#include <linux/init.h>
#include <linux/jiffies.h>
#include <linux/nmi.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/delay.h>
#include <linux/smp.h>
#include <linux/security.h>
#include <linux/memblock.h>
#include <linux/syscalls.h>
#include <linux/syscore_ops.h>
#include <linux/vmcore_info.h>
#include <linux/ratelimit.h>
#include <linux/kmsg_dump.h>
#include <linux/syslog.h>
#include <linux/cpu.h>
#include <linux/rculist.h>
#include <linux/poll.h>
#include <linux/irq_work.h>
#include <linux/ctype.h>
#include <linux/uio.h>
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/sched/task_stack.h>
#include <linux/panic.h>

#include <linux/uaccess.h>
#include <asm/sections.h>

#include <trace/events/initcall.h>
#define CREATE_TRACE_POINTS
#include <trace/events/printk.h>

#include "printk_ringbuffer.h"
#include "console_cmdline.h"
#include "braille.h"
#include "internal.h"

int console_printk[4] = {
	CONSOLE_LOGLEVEL_DEFAULT,	/* console_loglevel */
	MESSAGE_LOGLEVEL_DEFAULT,	/* default_message_loglevel */
	CONSOLE_LOGLEVEL_MIN,		/* minimum_console_loglevel */
	CONSOLE_LOGLEVEL_DEFAULT,	/* default_console_loglevel */
};
EXPORT_SYMBOL_GPL(console_printk);

atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0);
EXPORT_SYMBOL(ignore_console_lock_warning);

EXPORT_TRACEPOINT_SYMBOL_GPL(console);

/*
 * Low level drivers may need that to know if they can schedule in
 * their unblank() callback or not. So let's export it.
 */
int oops_in_progress;
EXPORT_SYMBOL(oops_in_progress);

/*
 * console_mutex protects console_list updates and console->flags updates.
 * The flags are synchronized only for consoles that are registered, i.e.
 * accessible via the console list.
 */
static DEFINE_MUTEX(console_mutex);

/*
 * console_sem protects updates to console->seq
 * and also provides serialization for console printing.
 */
static DEFINE_SEMAPHORE(console_sem, 1);
HLIST_HEAD(console_list);
EXPORT_SYMBOL_GPL(console_list);

DEFINE_STATIC_SRCU(console_srcu);

/*
 * System may need to suppress printk message under certain
 * circumstances, like after kernel panic happens.
*/ int __read_mostly suppress_printk; #ifdef CONFIG_LOCKDEP static struct lockdep_map console_lock_dep_map = { .name = "console_lock" }; void lockdep_assert_console_list_lock_held(void) { lockdep_assert_held(&console_mutex); } EXPORT_SYMBOL(lockdep_assert_console_list_lock_held); #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC bool console_srcu_read_lock_is_held(void) { return srcu_read_lock_held(&console_srcu); } EXPORT_SYMBOL(console_srcu_read_lock_is_held); #endif enum devkmsg_log_bits { __DEVKMSG_LOG_BIT_ON = 0, __DEVKMSG_LOG_BIT_OFF, __DEVKMSG_LOG_BIT_LOCK, }; enum devkmsg_log_masks { DEVKMSG_LOG_MASK_ON = BIT(__DEVKMSG_LOG_BIT_ON), DEVKMSG_LOG_MASK_OFF = BIT(__DEVKMSG_LOG_BIT_OFF), DEVKMSG_LOG_MASK_LOCK = BIT(__DEVKMSG_LOG_BIT_LOCK), }; /* Keep both the 'on' and 'off' bits clear, i.e. ratelimit by default: */ #define DEVKMSG_LOG_MASK_DEFAULT 0 static unsigned int __read_mostly devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT; static int __control_devkmsg(char *str) { size_t len; if (!str) return -EINVAL; len = str_has_prefix(str, "on"); if (len) { devkmsg_log = DEVKMSG_LOG_MASK_ON; return len; } len = str_has_prefix(str, "off"); if (len) { devkmsg_log = DEVKMSG_LOG_MASK_OFF; return len; } len = str_has_prefix(str, "ratelimit"); if (len) { devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT; return len; } return -EINVAL; } static int __init control_devkmsg(char *str) { if (__control_devkmsg(str) < 0) { pr_warn("printk.devkmsg: bad option string '%s'\n", str); return 1; } /* * Set sysctl string accordingly: */ if (devkmsg_log == DEVKMSG_LOG_MASK_ON) strscpy(devkmsg_log_str, "on"); else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) strscpy(devkmsg_log_str, "off"); /* else "ratelimit" which is set by default. */ /* * Sysctl cannot change it anymore. The kernel command line setting of * this parameter is to force the setting to be permanent throughout the * runtime of the system. This is a precation measure against userspace * trying to be a smarta** and attempting to change it up on us. */ devkmsg_log |= DEVKMSG_LOG_MASK_LOCK; return 1; } __setup("printk.devkmsg=", control_devkmsg); char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { char old_str[DEVKMSG_STR_MAX_SIZE]; unsigned int old; int err; if (write) { if (devkmsg_log & DEVKMSG_LOG_MASK_LOCK) return -EINVAL; old = devkmsg_log; strscpy(old_str, devkmsg_log_str); } err = proc_dostring(table, write, buffer, lenp, ppos); if (err) return err; if (write) { err = __control_devkmsg(devkmsg_log_str); /* * Do not accept an unknown string OR a known string with * trailing crap... */ if (err < 0 || (err + 1 != *lenp)) { /* ... and restore old setting. */ devkmsg_log = old; strscpy(devkmsg_log_str, old_str); return -EINVAL; } } return 0; } #endif /* CONFIG_PRINTK && CONFIG_SYSCTL */ /** * console_list_lock - Lock the console list * * For console list or console->flags updates */ void console_list_lock(void) __acquires(&console_mutex) { /* * In unregister_console() and console_force_preferred_locked(), * synchronize_srcu() is called with the console_list_lock held. * Therefore it is not allowed that the console_list_lock is taken * with the srcu_lock held. * * Detecting if this context is really in the read-side critical * section is only possible if the appropriate debug options are * enabled. 
*/ WARN_ON_ONCE(debug_lockdep_rcu_enabled() && srcu_read_lock_held(&console_srcu)); mutex_lock(&console_mutex); } EXPORT_SYMBOL(console_list_lock); /** * console_list_unlock - Unlock the console list * * Counterpart to console_list_lock() */ void console_list_unlock(void) __releases(&console_mutex) { mutex_unlock(&console_mutex); } EXPORT_SYMBOL(console_list_unlock); /** * console_srcu_read_lock - Register a new reader for the * SRCU-protected console list * * Use for_each_console_srcu() to iterate the console list * * Context: Any context. * Return: A cookie to pass to console_srcu_read_unlock(). */ int console_srcu_read_lock(void) __acquires(&console_srcu) { return srcu_read_lock_nmisafe(&console_srcu); } EXPORT_SYMBOL(console_srcu_read_lock); /** * console_srcu_read_unlock - Unregister an old reader from * the SRCU-protected console list * @cookie: cookie returned from console_srcu_read_lock() * * Counterpart to console_srcu_read_lock() */ void console_srcu_read_unlock(int cookie) __releases(&console_srcu) { srcu_read_unlock_nmisafe(&console_srcu, cookie); } EXPORT_SYMBOL(console_srcu_read_unlock); /* * Helper macros to handle lockdep when locking/unlocking console_sem. We use * macros instead of functions so that _RET_IP_ contains useful information. */ #define down_console_sem() do { \ down(&console_sem);\ mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\ } while (0) static int __down_trylock_console_sem(unsigned long ip) { int lock_failed; unsigned long flags; /* * Here and in __up_console_sem() we need to be in safe mode, * because spindump/WARN/etc from under console ->lock will * deadlock in printk()->down_trylock_console_sem() otherwise. */ printk_safe_enter_irqsave(flags); lock_failed = down_trylock(&console_sem); printk_safe_exit_irqrestore(flags); if (lock_failed) return 1; mutex_acquire(&console_lock_dep_map, 0, 1, ip); return 0; } #define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_) static void __up_console_sem(unsigned long ip) { unsigned long flags; mutex_release(&console_lock_dep_map, ip); printk_safe_enter_irqsave(flags); up(&console_sem); printk_safe_exit_irqrestore(flags); } #define up_console_sem() __up_console_sem(_RET_IP_) /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's * definitely not the perfect debug tool (we don't know if _WE_ * hold it and are racing, but it helps tracking those weird code * paths in the console code where we end up in places I want * locked without the console semaphore held). */ static int console_locked; /* * Array of consoles built from command line options (console=) */ #define MAX_CMDLINECONSOLES 8 static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int preferred_console = -1; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); /* Flag: console code may call schedule() */ static int console_may_schedule; enum con_msg_format_flags { MSG_FORMAT_DEFAULT = 0, MSG_FORMAT_SYSLOG = (1 << 0), }; static int console_msg_format = MSG_FORMAT_DEFAULT; /* * The printk log buffer consists of a sequenced collection of records, each * containing variable length message text. Every record also contains its * own meta-data (@info). * * Every record meta-data carries the timestamp in microseconds, as well as * the standard userspace syslog level and syslog facility. The usual kernel * messages use LOG_KERN; userspace-injected messages always carry a matching * syslog facility, by default LOG_USER. 
The origin of every message can be * reliably determined that way. * * The human readable log message of a record is available in @text, the * length of the message text in @text_len. The stored message is not * terminated. * * Optionally, a record can carry a dictionary of properties (key/value * pairs), to provide userspace with a machine-readable message context. * * Examples for well-defined, commonly used property names are: * DEVICE=b12:8 device identifier * b12:8 block dev_t * c127:3 char dev_t * n8 netdev ifindex * +sound:card0 subsystem:devname * SUBSYSTEM=pci driver-core subsystem name * * Valid characters in property names are [a-zA-Z0-9.-_]. Property names * and values are terminated by a '\0' character. * * Example of record values: * record.text_buf = "it's a line" (unterminated) * record.info.seq = 56 * record.info.ts_nsec = 36863 * record.info.text_len = 11 * record.info.facility = 0 (LOG_KERN) * record.info.flags = 0 * record.info.level = 3 (LOG_ERR) * record.info.caller_id = 299 (task 299) * record.info.dev_info.subsystem = "pci" (terminated) * record.info.dev_info.device = "+pci:0000:00:01.0" (terminated) * * The 'struct printk_info' buffer must never be directly exported to * userspace, it is a kernel-private implementation detail that might * need to be changed in the future, when the requirements change. * * /dev/kmsg exports the structured data in the following line format: * "<level>,<sequnum>,<timestamp>,<contflag>[,additional_values, ... ];<message text>\n" * * Users of the export format should ignore possible additional values * separated by ',', and find the message after the ';' character. * * The optional key/value pairs are attached as continuation lines starting * with a space character and terminated by a newline. All possible * non-prinatable characters are escaped in the "\xff" notation. */ /* syslog_lock protects syslog_* variables and write access to clear_seq. */ static DEFINE_MUTEX(syslog_lock); /* * Specifies if a legacy console is registered. If legacy consoles are * present, it is necessary to perform the console lock/unlock dance * whenever console flushing should occur. */ bool have_legacy_console; /* * Specifies if an nbcon console is registered. If nbcon consoles are present, * synchronous printing of legacy consoles will not occur during panic until * the backtrace has been stored to the ringbuffer. */ bool have_nbcon_console; /* * Specifies if a boot console is registered. If boot consoles are present, * nbcon consoles cannot print simultaneously and must be synchronized by * the console lock. This is because boot consoles and nbcon consoles may * have mapped the same hardware. */ bool have_boot_console; /* See printk_legacy_allow_panic_sync() for details. */ bool legacy_allow_panic_sync; /* Avoid using irq_work when suspending. */ bool console_irqwork_blocked; #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); static DECLARE_WAIT_QUEUE_HEAD(legacy_wait); /* All 3 protected by @syslog_lock. */ /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static size_t syslog_partial; static bool syslog_time; /* True when _all_ printer threads are available for printing. */ bool printk_kthreads_running; struct latched_seq { seqcount_latch_t latch; u64 val[2]; }; /* * The next printk record to read after the last 'clear' command. There are * two copies (updated with seqcount_latch) so that reads can locklessly * access a valid value. Writers are synchronized by @syslog_lock. 
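 *
 * In short (summarizing the helpers defined just below): a lockless reader
 * samples the latch count, reads val[count & 0x1] and retries if the count
 * changed in the meantime; see latched_seq_read_nolock().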
*/ static struct latched_seq clear_seq = { .latch = SEQCNT_LATCH_ZERO(clear_seq.latch), .val[0] = 0, .val[1] = 0, }; #define LOG_LEVEL(v) ((v) & 0x07) #define LOG_FACILITY(v) ((v) >> 3 & 0xff) /* record buffer */ #define LOG_ALIGN __alignof__(unsigned long) #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) #define LOG_BUF_LEN_MAX ((u32)1 << 31) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; /* * Define the average message size. This only affects the number of * descriptors that will be available. Underestimating is better than * overestimating (too many available descriptors is better than not enough). */ #define PRB_AVGBITS 5 /* 32 character average length */ #if CONFIG_LOG_BUF_SHIFT <= PRB_AVGBITS #error CONFIG_LOG_BUF_SHIFT value too small. #endif _DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS, PRB_AVGBITS, &__log_buf[0]); static struct printk_ringbuffer printk_rb_dynamic; struct printk_ringbuffer *prb = &printk_rb_static; /* * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before * per_cpu_areas are initialised. This variable is set to true when * it's safe to access per-CPU data. */ static bool __printk_percpu_data_ready __ro_after_init; bool printk_percpu_data_ready(void) { return __printk_percpu_data_ready; } /* Must be called under syslog_lock. */ static void latched_seq_write(struct latched_seq *ls, u64 val) { write_seqcount_latch_begin(&ls->latch); ls->val[0] = val; write_seqcount_latch(&ls->latch); ls->val[1] = val; write_seqcount_latch_end(&ls->latch); } /* Can be called from any context. */ static u64 latched_seq_read_nolock(struct latched_seq *ls) { unsigned int seq; unsigned int idx; u64 val; do { seq = read_seqcount_latch(&ls->latch); idx = seq & 0x1; val = ls->val[idx]; } while (read_seqcount_latch_retry(&ls->latch, seq)); return val; } /* Return log buffer address */ char *log_buf_addr_get(void) { return log_buf; } /* Return log buffer size */ u32 log_buf_len_get(void) { return log_buf_len; } /* * Define how much of the log buffer we could take at maximum. The value * must be greater than two. Note that only half of the buffer is available * when the index points to the middle. */ #define MAX_LOG_TAKE_PART 4 static const char trunc_msg[] = "<truncated>"; static void truncate_msg(u16 *text_len, u16 *trunc_msg_len) { /* * The message should not take the whole buffer. Otherwise, it might * get removed too soon. */ u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART; if (*text_len > max_text_len) *text_len = max_text_len; /* enable the warning message (if there is room) */ *trunc_msg_len = strlen(trunc_msg); if (*text_len >= *trunc_msg_len) *text_len -= *trunc_msg_len; else *trunc_msg_len = 0; } int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); static int syslog_action_restricted(int type) { if (dmesg_restrict) return 1; /* * Unless restricted, we allow "read all" and "get buffer size" * for everybody. */ return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER; } static int check_syslog_permissions(int type, int source) { /* * If this is from /proc/kmsg and we've already opened it, then we've * already done the capabilities checks at open time. 
*/ if (source == SYSLOG_FROM_PROC && type != SYSLOG_ACTION_OPEN) goto ok; if (syslog_action_restricted(type)) { if (capable(CAP_SYSLOG)) goto ok; return -EPERM; } ok: return security_syslog(type); } static void append_char(char **pp, char *e, char c) { if (*pp < e) *(*pp)++ = c; } static ssize_t info_print_ext_header(char *buf, size_t size, struct printk_info *info) { u64 ts_usec = info->ts_nsec; char caller[20]; #ifdef CONFIG_PRINTK_CALLER u32 id = info->caller_id; snprintf(caller, sizeof(caller), ",caller=%c%u", id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); #else caller[0] = '\0'; #endif do_div(ts_usec, 1000); return scnprintf(buf, size, "%u,%llu,%llu,%c%s;", (info->facility << 3) | info->level, info->seq, ts_usec, info->flags & LOG_CONT ? 'c' : '-', caller); } static ssize_t msg_add_ext_text(char *buf, size_t size, const char *text, size_t text_len, unsigned char endc) { char *p = buf, *e = buf + size; size_t i; /* escape non-printable characters */ for (i = 0; i < text_len; i++) { unsigned char c = text[i]; if (c < ' ' || c >= 127 || c == '\\') p += scnprintf(p, e - p, "\\x%02x", c); else append_char(&p, e, c); } append_char(&p, e, endc); return p - buf; } static ssize_t msg_add_dict_text(char *buf, size_t size, const char *key, const char *val) { size_t val_len = strlen(val); ssize_t len; if (!val_len) return 0; len = msg_add_ext_text(buf, size, "", 0, ' '); /* dict prefix */ len += msg_add_ext_text(buf + len, size - len, key, strlen(key), '='); len += msg_add_ext_text(buf + len, size - len, val, val_len, '\n'); return len; } static ssize_t msg_print_ext_body(char *buf, size_t size, char *text, size_t text_len, struct dev_printk_info *dev_info) { ssize_t len; len = msg_add_ext_text(buf, size, text, text_len, '\n'); if (!dev_info) goto out; len += msg_add_dict_text(buf + len, size - len, "SUBSYSTEM", dev_info->subsystem); len += msg_add_dict_text(buf + len, size - len, "DEVICE", dev_info->device); out: return len; } /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { atomic64_t seq; struct ratelimit_state rs; struct mutex lock; struct printk_buffers pbufs; }; static __printf(3, 4) __cold int devkmsg_emit(int facility, int level, const char *fmt, ...) { va_list args; int r; va_start(args, fmt); r = vprintk_emit(facility, level, NULL, fmt, args); va_end(args); return r; } static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) { char *buf, *line; int level = default_message_loglevel; int facility = 1; /* LOG_USER */ struct file *file = iocb->ki_filp; struct devkmsg_user *user = file->private_data; size_t len = iov_iter_count(from); ssize_t ret = len; if (len > PRINTKRB_RECORD_MAX) return -EINVAL; /* Ignore when user logging is disabled. */ if (devkmsg_log & DEVKMSG_LOG_MASK_OFF) return len; /* Ratelimit when not explicitly enabled. */ if (!(devkmsg_log & DEVKMSG_LOG_MASK_ON)) { if (!___ratelimit(&user->rs, current->comm)) return ret; } buf = kmalloc(len+1, GFP_KERNEL); if (buf == NULL) return -ENOMEM; buf[len] = '\0'; if (!copy_from_iter_full(buf, len, from)) { kfree(buf); return -EFAULT; } /* * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace * the decimal value represents 32bit, the lower 3 bit are the log * level, the rest are the log facility. * * If no prefix or no userspace facility is specified, we * enforce LOG_USER, to be able to reliably distinguish * kernel-generated messages from userspace-injected ones. 
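 *
 * For example (illustrative value, not taken from this file): a write of
 * "<14>hello" is parsed as level 6 with facility 1 (LOG_USER), because
 * 14 = (1 << 3) | 6; see LOG_LEVEL() and LOG_FACILITY() above.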
*/ line = buf; if (line[0] == '<') { char *endp = NULL; unsigned int u; u = simple_strtoul(line + 1, &endp, 10); if (endp && endp[0] == '>') { level = LOG_LEVEL(u); if (LOG_FACILITY(u) != 0) facility = LOG_FACILITY(u); endp++; line = endp; } } devkmsg_emit(facility, level, "%s", line); kfree(buf); return ret; } static ssize_t devkmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct devkmsg_user *user = file->private_data; char *outbuf = &user->pbufs.outbuf[0]; struct printk_message pmsg = { .pbufs = &user->pbufs, }; ssize_t ret; ret = mutex_lock_interruptible(&user->lock); if (ret) return ret; if (!printk_get_next_message(&pmsg, atomic64_read(&user->seq), true, false)) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; goto out; } /* * Guarantee this task is visible on the waitqueue before * checking the wake condition. * * The full memory barrier within set_current_state() of * prepare_to_wait_event() pairs with the full memory barrier * within wq_has_sleeper(). * * This pairs with __wake_up_klogd:A. */ ret = wait_event_interruptible(log_wait, printk_get_next_message(&pmsg, atomic64_read(&user->seq), true, false)); /* LMM(devkmsg_read:A) */ if (ret) goto out; } if (pmsg.dropped) { /* our last seen message is gone, return error and reset */ atomic64_set(&user->seq, pmsg.seq); ret = -EPIPE; goto out; } atomic64_set(&user->seq, pmsg.seq + 1); if (pmsg.outbuf_len > count) { ret = -EINVAL; goto out; } if (copy_to_user(buf, outbuf, pmsg.outbuf_len)) { ret = -EFAULT; goto out; } ret = pmsg.outbuf_len; out: mutex_unlock(&user->lock); return ret; } /* * Be careful when modifying this function!!! * * Only few operations are supported because the device works only with the * entire variable length messages (records). Non-standard values are * returned in the other cases and has been this way for quite some time. * User space applications might depend on this behavior. */ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) { struct devkmsg_user *user = file->private_data; loff_t ret = 0; if (offset) return -ESPIPE; switch (whence) { case SEEK_SET: /* the first record */ atomic64_set(&user->seq, prb_first_valid_seq(prb)); break; case SEEK_DATA: /* * The first record after the last SYSLOG_ACTION_CLEAR, * like issued by 'dmesg -c'. Reading /dev/kmsg itself * changes no global state, and does not clear anything. 
*/ atomic64_set(&user->seq, latched_seq_read_nolock(&clear_seq)); break; case SEEK_END: /* after the last record */ atomic64_set(&user->seq, prb_next_seq(prb)); break; default: ret = -EINVAL; } return ret; } static __poll_t devkmsg_poll(struct file *file, poll_table *wait) { struct devkmsg_user *user = file->private_data; struct printk_info info; __poll_t ret = 0; poll_wait(file, &log_wait, wait); if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) { /* return error when data has vanished underneath us */ if (info.seq != atomic64_read(&user->seq)) ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; else ret = EPOLLIN|EPOLLRDNORM; } return ret; } static int devkmsg_open(struct inode *inode, struct file *file) { struct devkmsg_user *user; int err; if (devkmsg_log & DEVKMSG_LOG_MASK_OFF) return -EPERM; /* write-only does not need any file context */ if ((file->f_flags & O_ACCMODE) != O_WRONLY) { err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL, SYSLOG_FROM_READER); if (err) return err; } user = kvmalloc_obj(struct devkmsg_user); if (!user) return -ENOMEM; ratelimit_default_init(&user->rs); ratelimit_set_flags(&user->rs, RATELIMIT_MSG_ON_RELEASE); mutex_init(&user->lock); atomic64_set(&user->seq, prb_first_valid_seq(prb)); file->private_data = user; return 0; } static int devkmsg_release(struct inode *inode, struct file *file) { struct devkmsg_user *user = file->private_data; ratelimit_state_exit(&user->rs); mutex_destroy(&user->lock); kvfree(user); return 0; } const struct file_operations kmsg_fops = { .open = devkmsg_open, .read = devkmsg_read, .write_iter = devkmsg_write, .llseek = devkmsg_llseek, .poll = devkmsg_poll, .release = devkmsg_release, }; #ifdef CONFIG_VMCORE_INFO /* * This appends the listed symbols to /proc/vmcore * * /proc/vmcore is used by various utilities, like crash and makedumpfile to * obtain access to symbols that are otherwise very difficult to locate. These * symbols are specifically used so that utilities can access and extract the * dmesg log from a vmcore file after a crash. */ void log_buf_vmcoreinfo_setup(void) { struct dev_printk_info *dev_info = NULL; VMCOREINFO_SYMBOL(prb); VMCOREINFO_SYMBOL(printk_rb_static); VMCOREINFO_SYMBOL(clear_seq); /* * Export struct size and field offsets. User space tools can * parse it and detect any changes to structure down the line. 
*/ VMCOREINFO_STRUCT_SIZE(printk_ringbuffer); VMCOREINFO_OFFSET(printk_ringbuffer, desc_ring); VMCOREINFO_OFFSET(printk_ringbuffer, text_data_ring); VMCOREINFO_OFFSET(printk_ringbuffer, fail); VMCOREINFO_STRUCT_SIZE(prb_desc_ring); VMCOREINFO_OFFSET(prb_desc_ring, count_bits); VMCOREINFO_OFFSET(prb_desc_ring, descs); VMCOREINFO_OFFSET(prb_desc_ring, infos); VMCOREINFO_OFFSET(prb_desc_ring, head_id); VMCOREINFO_OFFSET(prb_desc_ring, tail_id); VMCOREINFO_STRUCT_SIZE(prb_desc); VMCOREINFO_OFFSET(prb_desc, state_var); VMCOREINFO_OFFSET(prb_desc, text_blk_lpos); VMCOREINFO_STRUCT_SIZE(prb_data_blk_lpos); VMCOREINFO_OFFSET(prb_data_blk_lpos, begin); VMCOREINFO_OFFSET(prb_data_blk_lpos, next); VMCOREINFO_STRUCT_SIZE(printk_info); VMCOREINFO_OFFSET(printk_info, seq); VMCOREINFO_OFFSET(printk_info, ts_nsec); VMCOREINFO_OFFSET(printk_info, text_len); VMCOREINFO_OFFSET(printk_info, caller_id); VMCOREINFO_OFFSET(printk_info, dev_info); VMCOREINFO_STRUCT_SIZE(dev_printk_info); VMCOREINFO_OFFSET(dev_printk_info, subsystem); VMCOREINFO_LENGTH(printk_info_subsystem, sizeof(dev_info->subsystem)); VMCOREINFO_OFFSET(dev_printk_info, device); VMCOREINFO_LENGTH(printk_info_device, sizeof(dev_info->device)); VMCOREINFO_STRUCT_SIZE(prb_data_ring); VMCOREINFO_OFFSET(prb_data_ring, size_bits); VMCOREINFO_OFFSET(prb_data_ring, data); VMCOREINFO_OFFSET(prb_data_ring, head_lpos); VMCOREINFO_OFFSET(prb_data_ring, tail_lpos); VMCOREINFO_SIZE(atomic_long_t); VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter); VMCOREINFO_STRUCT_SIZE(latched_seq); VMCOREINFO_OFFSET(latched_seq, val); } #endif /* requested log_buf_len from kernel cmdline */ static unsigned long __initdata new_log_buf_len; /* we practice scaling the ring buffer by powers of 2 */ static void __init log_buf_len_update(u64 size) { if (size > (u64)LOG_BUF_LEN_MAX) { size = (u64)LOG_BUF_LEN_MAX; pr_err("log_buf over 2G is not supported.\n"); } if (size) size = roundup_pow_of_two(size); if (size > log_buf_len) new_log_buf_len = (unsigned long)size; } /* save requested log_buf_len since it's too early to process it */ static int __init log_buf_len_setup(char *str) { u64 size; if (!str) return -EINVAL; size = memparse(str, &str); log_buf_len_update(size); return 0; } early_param("log_buf_len", log_buf_len_setup); #ifdef CONFIG_SMP #define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT) static void __init log_buf_add_cpu(void) { unsigned int cpu_extra; /* * archs should set up cpu_possible_bits properly with * set_cpu_possible() after setup_arch() but just in * case lets ensure this is valid. 
*/ if (num_possible_cpus() == 1) return; cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN; /* by default this will only continue through for large > 64 CPUs */ if (cpu_extra <= __LOG_BUF_LEN / 2) return; pr_info("log_buf_len individual max cpu contribution: %d bytes\n", __LOG_CPU_MAX_BUF_LEN); pr_info("log_buf_len total cpu_extra contributions: %d bytes\n", cpu_extra); pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN); log_buf_len_update(cpu_extra + __LOG_BUF_LEN); } #else /* !CONFIG_SMP */ static inline void log_buf_add_cpu(void) {} #endif /* CONFIG_SMP */ static void __init set_percpu_data_ready(void) { __printk_percpu_data_ready = true; } static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, struct printk_record *r) { struct prb_reserved_entry e; struct printk_record dest_r; prb_rec_init_wr(&dest_r, r->info->text_len); if (!prb_reserve(&e, rb, &dest_r)) return 0; memcpy(&dest_r.text_buf[0], &r->text_buf[0], r->info->text_len); dest_r.info->text_len = r->info->text_len; dest_r.info->facility = r->info->facility; dest_r.info->level = r->info->level; dest_r.info->flags = r->info->flags; dest_r.info->ts_nsec = r->info->ts_nsec; dest_r.info->caller_id = r->info->caller_id; memcpy(&dest_r.info->dev_info, &r->info->dev_info, sizeof(dest_r.info->dev_info)); prb_final_commit(&e); return prb_record_text_space(&e); } static char setup_text_buf[PRINTKRB_RECORD_MAX] __initdata; static void print_log_buf_usage_stats(void) { unsigned int descs_count = log_buf_len >> PRB_AVGBITS; size_t meta_data_size; meta_data_size = descs_count * (sizeof(struct prb_desc) + sizeof(struct printk_info)); pr_info("log buffer data + meta data: %u + %zu = %zu bytes\n", log_buf_len, meta_data_size, log_buf_len + meta_data_size); } void __init setup_log_buf(int early) { struct printk_info *new_infos; unsigned int new_descs_count; struct prb_desc *new_descs; struct printk_info info; struct printk_record r; unsigned int text_size; size_t new_descs_size; size_t new_infos_size; unsigned long flags; char *new_log_buf; unsigned int free; u64 seq; /* * Some archs call setup_log_buf() multiple times - first is very * early, e.g. from setup_arch(), and second - when percpu_areas * are initialised. */ if (!early) set_percpu_data_ready(); if (log_buf != __log_buf) return; if (!early && !new_log_buf_len) log_buf_add_cpu(); if (!new_log_buf_len) { /* Show the memory stats only once. 
*/ if (!early) goto out; return; } new_descs_count = new_log_buf_len >> PRB_AVGBITS; if (new_descs_count == 0) { pr_err("new_log_buf_len: %lu too small\n", new_log_buf_len); goto out; } new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN); if (unlikely(!new_log_buf)) { pr_err("log_buf_len: %lu text bytes not available\n", new_log_buf_len); goto out; } new_descs_size = new_descs_count * sizeof(struct prb_desc); new_descs = memblock_alloc(new_descs_size, LOG_ALIGN); if (unlikely(!new_descs)) { pr_err("log_buf_len: %zu desc bytes not available\n", new_descs_size); goto err_free_log_buf; } new_infos_size = new_descs_count * sizeof(struct printk_info); new_infos = memblock_alloc(new_infos_size, LOG_ALIGN); if (unlikely(!new_infos)) { pr_err("log_buf_len: %zu info bytes not available\n", new_infos_size); goto err_free_descs; } prb_rec_init_rd(&r, &info, &setup_text_buf[0], sizeof(setup_text_buf)); prb_init(&printk_rb_dynamic, new_log_buf, ilog2(new_log_buf_len), new_descs, ilog2(new_descs_count), new_infos); local_irq_save(flags); log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; free = __LOG_BUF_LEN; prb_for_each_record(0, &printk_rb_static, seq, &r) { text_size = add_to_rb(&printk_rb_dynamic, &r); if (text_size > free) free = 0; else free -= text_size; } prb = &printk_rb_dynamic; local_irq_restore(flags); /* * Copy any remaining messages that might have appeared from * NMI context after copying but before switching to the * dynamic buffer. */ prb_for_each_record(seq, &printk_rb_static, seq, &r) { text_size = add_to_rb(&printk_rb_dynamic, &r); if (text_size > free) free = 0; else free -= text_size; } if (seq != prb_next_seq(&printk_rb_static)) { pr_err("dropped %llu messages\n", prb_next_seq(&printk_rb_static) - seq); } print_log_buf_usage_stats(); pr_info("early log buf free: %u(%u%%)\n", free, (free * 100) / __LOG_BUF_LEN); return; err_free_descs: memblock_free(new_descs, new_descs_size); err_free_log_buf: memblock_free(new_log_buf, new_log_buf_len); out: print_log_buf_usage_stats(); } static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) { ignore_loglevel = true; pr_info("debug: ignoring loglevel setting.\n"); return 0; } early_param("ignore_loglevel", ignore_loglevel_setup); module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting (prints all kernel messages to the console)"); static bool suppress_message_printing(int level) { return (level >= console_loglevel && !ignore_loglevel); } #ifdef CONFIG_BOOT_PRINTK_DELAY static int boot_delay; /* msecs delay after each printk during bootup */ static unsigned long long loops_per_msec; /* based on boot_delay */ static int __init boot_delay_setup(char *str) { unsigned long lpj; lpj = preset_lpj ? 
preset_lpj : 1000000; /* some guess */ loops_per_msec = (unsigned long long)lpj / 1000 * HZ; get_option(&str, &boot_delay); if (boot_delay > 10 * 1000) boot_delay = 0; pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, " "HZ: %d, loops_per_msec: %llu\n", boot_delay, preset_lpj, lpj, HZ, loops_per_msec); return 0; } early_param("boot_delay", boot_delay_setup); static void boot_delay_msec(int level) { unsigned long long k; unsigned long timeout; bool suppress = !is_printk_force_console() && suppress_message_printing(level); if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING) || suppress) return; k = (unsigned long long)loops_per_msec * boot_delay; timeout = jiffies + msecs_to_jiffies(boot_delay); while (k) { k--; cpu_relax(); /* * use (volatile) jiffies to prevent * compiler reduction; loop termination via jiffies * is secondary and may or may not happen. */ if (time_after(jiffies, timeout)) break; touch_nmi_watchdog(); } } #else static inline void boot_delay_msec(int level) { } #endif static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME); module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); static size_t print_syslog(unsigned int level, char *buf) { return sprintf(buf, "<%u>", level); } static size_t print_time(u64 ts, char *buf) { unsigned long rem_nsec = do_div(ts, 1000000000); return sprintf(buf, "[%5lu.%06lu]", (unsigned long)ts, rem_nsec / 1000); } #ifdef CONFIG_PRINTK_CALLER static size_t print_caller(u32 id, char *buf) { char caller[12]; snprintf(caller, sizeof(caller), "%c%u", id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); return sprintf(buf, "[%6s]", caller); } #else #define print_caller(id, buf) 0 #endif static size_t info_print_prefix(const struct printk_info *info, bool syslog, bool time, char *buf) { size_t len = 0; if (syslog) len = print_syslog((info->facility << 3) | info->level, buf); if (time) len += print_time(info->ts_nsec, buf + len); len += print_caller(info->caller_id, buf + len); if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) { buf[len++] = ' '; buf[len] = '\0'; } return len; } /* * Prepare the record for printing. The text is shifted within the given * buffer to avoid a need for another one. The following operations are * done: * * - Add prefix for each line. * - Drop truncated lines that no longer fit into the buffer. * - Add the trailing newline that has been removed in vprintk_store(). * - Add a string terminator. * * Since the produced string is always terminated, the maximum possible * return value is @r->text_buf_size - 1; * * Return: The length of the updated/prepared text, including the added * prefixes and the newline. The terminator is not counted. The dropped * line(s) are not counted. */ static size_t record_print_text(struct printk_record *r, bool syslog, bool time) { size_t text_len = r->info->text_len; size_t buf_size = r->text_buf_size; char *text = r->text_buf; char prefix[PRINTK_PREFIX_MAX]; bool truncated = false; size_t prefix_len; size_t line_len; size_t len = 0; char *next; /* * If the message was truncated because the buffer was not large * enough, treat the available text as if it were the full text. */ if (text_len > buf_size) text_len = buf_size; prefix_len = info_print_prefix(r->info, syslog, time, prefix); /* * @text_len: bytes of unprocessed text * @line_len: bytes of current line _without_ newline * @text: pointer to beginning of current line * @len: number of bytes prepared in r->text_buf */ for (;;) { next = memchr(text, '\n', text_len); if (next) { line_len = next - text; } else { /* Drop truncated line(s). 
*/ if (truncated) break; line_len = text_len; } /* * Truncate the text if there is not enough space to add the * prefix and a trailing newline and a terminator. */ if (len + prefix_len + text_len + 1 + 1 > buf_size) { /* Drop even the current line if no space. */ if (len + prefix_len + line_len + 1 + 1 > buf_size) break; text_len = buf_size - len - prefix_len - 1 - 1; truncated = true; } memmove(text + prefix_len, text, text_len); memcpy(text, prefix, prefix_len); /* * Increment the prepared length to include the text and * prefix that were just moved+copied. Also increment for the * newline at the end of this line. If this is the last line, * there is no newline, but it will be added immediately below. */ len += prefix_len + line_len + 1; if (text_len == line_len) { /* * This is the last line. Add the trailing newline * removed in vprintk_store(). */ text[prefix_len + line_len] = '\n'; break; } /* * Advance beyond the added prefix and the related line with * its newline. */ text += prefix_len + line_len + 1; /* * The remaining text has only decreased by the line with its * newline. * * Note that @text_len can become zero. It happens when @text * ended with a newline (either due to truncation or the * original string ending with "\n\n"). The loop is correctly * repeated and (if not truncated) an empty line with a prefix * will be prepared. */ text_len -= line_len + 1; } /* * If a buffer was provided, it will be terminated. Space for the * string terminator is guaranteed to be available. The terminator is * not counted in the return value. */ if (buf_size > 0) r->text_buf[len] = 0; return len; } static size_t get_record_print_text_size(struct printk_info *info, unsigned int line_count, bool syslog, bool time) { char prefix[PRINTK_PREFIX_MAX]; size_t prefix_len; prefix_len = info_print_prefix(info, syslog, time, prefix); /* * Each line will be preceded with a prefix. The intermediate * newlines are already within the text, but a final trailing * newline will be added. */ return ((prefix_len * line_count) + info->text_len + 1); } /* * Beginning with @start_seq, find the first record where it and all following * records up to (but not including) @max_seq fit into @size. * * @max_seq is simply an upper bound and does not need to exist. If the caller * does not require an upper bound, -1 can be used for @max_seq. */ static u64 find_first_fitting_seq(u64 start_seq, u64 max_seq, size_t size, bool syslog, bool time) { struct printk_info info; unsigned int line_count; size_t len = 0; u64 seq; /* Determine the size of the records up to @max_seq. */ prb_for_each_info(start_seq, prb, seq, &info, &line_count) { if (info.seq >= max_seq) break; len += get_record_print_text_size(&info, line_count, syslog, time); } /* * Adjust the upper bound for the next loop to avoid subtracting * lengths that were never added. */ if (seq < max_seq) max_seq = seq; /* * Move first record forward until length fits into the buffer. Ignore * newest messages that were not counted in the above cycle. Messages * might appear and get lost in the meantime. This is a best effort * that prevents an infinite loop that could occur with a retry. */ prb_for_each_info(start_seq, prb, seq, &info, &line_count) { if (len <= size || info.seq >= max_seq) break; len -= get_record_print_text_size(&info, line_count, syslog, time); } return seq; } /* The caller is responsible for making sure @size is greater than 0. 
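 *
 * (do_syslog() already returns 0 for a zero-length read before calling
 * this; see SYSLOG_ACTION_READ below.)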
*/ static int syslog_print(char __user *buf, int size) { struct printk_info info; struct printk_record r; char *text; int len = 0; u64 seq; text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL); if (!text) return -ENOMEM; prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX); mutex_lock(&syslog_lock); /* * Wait for the @syslog_seq record to be available. @syslog_seq may * change while waiting. */ do { seq = syslog_seq; mutex_unlock(&syslog_lock); /* * Guarantee this task is visible on the waitqueue before * checking the wake condition. * * The full memory barrier within set_current_state() of * prepare_to_wait_event() pairs with the full memory barrier * within wq_has_sleeper(). * * This pairs with __wake_up_klogd:A. */ len = wait_event_interruptible(log_wait, prb_read_valid(prb, seq, NULL)); /* LMM(syslog_print:A) */ mutex_lock(&syslog_lock); if (len) goto out; } while (syslog_seq != seq); /* * Copy records that fit into the buffer. The above cycle makes sure * that the first record is always available. */ do { size_t n; size_t skip; int err; if (!prb_read_valid(prb, syslog_seq, &r)) break; if (r.info->seq != syslog_seq) { /* message is gone, move to next valid one */ syslog_seq = r.info->seq; syslog_partial = 0; } /* * To keep reading/counting partial line consistent, * use printk_time value as of the beginning of a line. */ if (!syslog_partial) syslog_time = printk_time; skip = syslog_partial; n = record_print_text(&r, true, syslog_time); if (n - syslog_partial <= size) { /* message fits into buffer, move forward */ syslog_seq = r.info->seq + 1; n -= syslog_partial; syslog_partial = 0; } else if (!len){ /* partial read(), remember position */ n = size; syslog_partial += n; } else n = 0; if (!n) break; mutex_unlock(&syslog_lock); err = copy_to_user(buf, text + skip, n); mutex_lock(&syslog_lock); if (err) { if (!len) len = -EFAULT; break; } len += n; size -= n; buf += n; } while (size); out: mutex_unlock(&syslog_lock); kfree(text); return len; } static int syslog_print_all(char __user *buf, int size, bool clear) { struct printk_info info; struct printk_record r; char *text; int len = 0; u64 seq; bool time; text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL); if (!text) return -ENOMEM; time = printk_time; /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. 
*/ seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1, size, true, time); prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX); prb_for_each_record(seq, prb, seq, &r) { int textlen; textlen = record_print_text(&r, true, time); if (len + textlen > size) { seq--; break; } if (copy_to_user(buf + len, text, textlen)) len = -EFAULT; else len += textlen; if (len < 0) break; } if (clear) { mutex_lock(&syslog_lock); latched_seq_write(&clear_seq, seq); mutex_unlock(&syslog_lock); } kfree(text); return len; } static void syslog_clear(void) { mutex_lock(&syslog_lock); latched_seq_write(&clear_seq, prb_next_seq(prb)); mutex_unlock(&syslog_lock); } int do_syslog(int type, char __user *buf, int len, int source) { struct printk_info info; bool clear = false; static int saved_console_loglevel = LOGLEVEL_DEFAULT; int error; error = check_syslog_permissions(type, source); if (error) return error; switch (type) { case SYSLOG_ACTION_CLOSE: /* Close log */ break; case SYSLOG_ACTION_OPEN: /* Open log */ break; case SYSLOG_ACTION_READ: /* Read from log */ if (!buf || len < 0) return -EINVAL; if (!len) return 0; if (!access_ok(buf, len)) return -EFAULT; error = syslog_print(buf, len); break; /* Read/clear last kernel messages */ case SYSLOG_ACTION_READ_CLEAR: clear = true; fallthrough; /* Read last kernel messages */ case SYSLOG_ACTION_READ_ALL: if (!buf || len < 0) return -EINVAL; if (!len) return 0; if (!access_ok(buf, len)) return -EFAULT; error = syslog_print_all(buf, len, clear); break; /* Clear ring buffer */ case SYSLOG_ACTION_CLEAR: syslog_clear(); break; /* Disable logging to console */ case SYSLOG_ACTION_CONSOLE_OFF: if (saved_console_loglevel == LOGLEVEL_DEFAULT) saved_console_loglevel = console_loglevel; console_loglevel = minimum_console_loglevel; break; /* Enable logging to console */ case SYSLOG_ACTION_CONSOLE_ON: if (saved_console_loglevel != LOGLEVEL_DEFAULT) { console_loglevel = saved_console_loglevel; saved_console_loglevel = LOGLEVEL_DEFAULT; } break; /* Set level of messages printed to console */ case SYSLOG_ACTION_CONSOLE_LEVEL: if (len < 1 || len > 8) return -EINVAL; if (len < minimum_console_loglevel) len = minimum_console_loglevel; console_loglevel = len; /* Implicitly re-enable logging to console */ saved_console_loglevel = LOGLEVEL_DEFAULT; break; /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: mutex_lock(&syslog_lock); if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) { /* No unread messages. */ mutex_unlock(&syslog_lock); return 0; } if (info.seq != syslog_seq) { /* messages are gone, move to first one */ syslog_seq = info.seq; syslog_partial = 0; } if (source == SYSLOG_FROM_PROC) { /* * Short-cut for poll(/"proc/kmsg") which simply checks * for pending data, not the size; return the count of * records, not the length. */ error = prb_next_seq(prb) - syslog_seq; } else { bool time = syslog_partial ? syslog_time : printk_time; unsigned int line_count; u64 seq; prb_for_each_info(syslog_seq, prb, seq, &info, &line_count) { error += get_record_print_text_size(&info, line_count, true, time); time = printk_time; } error -= syslog_partial; } mutex_unlock(&syslog_lock); break; /* Size of the log buffer */ case SYSLOG_ACTION_SIZE_BUFFER: error = log_buf_len; break; default: error = -EINVAL; break; } return error; } SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) { return do_syslog(type, buf, len, SYSLOG_FROM_READER); } /* * Special console_lock variants that help to reduce the risk of soft-lockups. 
* They allow to pass console_lock to another printk() call using a busy wait. */ #ifdef CONFIG_LOCKDEP static struct lockdep_map console_owner_dep_map = { .name = "console_owner" }; #endif static DEFINE_RAW_SPINLOCK(console_owner_lock); static struct task_struct *console_owner; static bool console_waiter; /** * console_lock_spinning_enable - mark beginning of code where another * thread might safely busy wait * * This basically converts console_lock into a spinlock. This marks * the section where the console_lock owner can not sleep, because * there may be a waiter spinning (like a spinlock). Also it must be * ready to hand over the lock at the end of the section. */ void console_lock_spinning_enable(void) { /* * Do not use spinning in panic(). The panic CPU wants to keep the lock. * Non-panic CPUs abandon the flush anyway. * * Just keep the lockdep annotation. The panic-CPU should avoid * taking console_owner_lock because it might cause a deadlock. * This looks like the easiest way how to prevent false lockdep * reports without handling races a lockless way. */ if (panic_in_progress()) goto lockdep; raw_spin_lock(&console_owner_lock); console_owner = current; raw_spin_unlock(&console_owner_lock); lockdep: /* The waiter may spin on us after setting console_owner */ spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); } /** * console_lock_spinning_disable_and_check - mark end of code where another * thread was able to busy wait and check if there is a waiter * @cookie: cookie returned from console_srcu_read_lock() * * This is called at the end of the section where spinning is allowed. * It has two functions. First, it is a signal that it is no longer * safe to start busy waiting for the lock. Second, it checks if * there is a busy waiter and passes the lock rights to her. * * Important: Callers lose both the console_lock and the SRCU read lock if * there was a busy waiter. They must not touch items synchronized by * console_lock or SRCU read lock in this case. * * Return: 1 if the lock rights were passed, 0 otherwise. */ int console_lock_spinning_disable_and_check(int cookie) { int waiter; /* * Ignore spinning waiters during panic() because they might get stopped * or blocked at any time, * * It is safe because nobody is allowed to start spinning during panic * in the first place. If there has been a waiter then non panic CPUs * might stay spinning. They would get stopped anyway. The panic context * will never start spinning and an interrupted spin on panic CPU will * never continue. */ if (panic_in_progress()) { /* Keep lockdep happy. */ spin_release(&console_owner_dep_map, _THIS_IP_); return 0; } raw_spin_lock(&console_owner_lock); waiter = READ_ONCE(console_waiter); console_owner = NULL; raw_spin_unlock(&console_owner_lock); if (!waiter) { spin_release(&console_owner_dep_map, _THIS_IP_); return 0; } /* The waiter is now free to continue */ WRITE_ONCE(console_waiter, false); spin_release(&console_owner_dep_map, _THIS_IP_); /* * Preserve lockdep lock ordering. Release the SRCU read lock before * releasing the console_lock. */ console_srcu_read_unlock(cookie); /* * Hand off console_lock to waiter. The waiter will perform * the up(). After this, the waiter is the console_lock owner. */ mutex_release(&console_lock_dep_map, _THIS_IP_); return 1; } /** * console_trylock_spinning - try to get console_lock by busy waiting * * This allows to busy wait for the console_lock when the current * owner is running in specially marked sections. 
It means that
 * the current owner is running and cannot reschedule until it
 * is ready to lose the lock.
 *
 * Return: 1 if we got the lock, 0 otherwise
 */
static int console_trylock_spinning(void)
{
	struct task_struct *owner = NULL;
	bool waiter;
	bool spin = false;
	unsigned long flags;

	if (console_trylock())
		return 1;

	/*
	 * It's unsafe to spin once a panic has begun. If we are the
	 * panic CPU, we may have already halted the owner of the
	 * console_sem. If we are not the panic CPU, then we should
	 * avoid taking console_sem, so the panic CPU has a better
	 * chance of cleanly acquiring it later.
	 */
	if (panic_in_progress())
		return 0;

	printk_safe_enter_irqsave(flags);

	raw_spin_lock(&console_owner_lock);
	owner = READ_ONCE(console_owner);
	waiter = READ_ONCE(console_waiter);
	if (!waiter && owner && owner != current) {
		WRITE_ONCE(console_waiter, true);
		spin = true;
	}
	raw_spin_unlock(&console_owner_lock);

	/*
	 * If there is an active printk() writing to the
	 * consoles, instead of having it write our data too,
	 * see if we can offload that load from the active
	 * printer, and do some printing ourselves.
	 * Go into a spin only if there isn't already a waiter
	 * spinning, and there is an active printer, and
	 * that active printer isn't us (recursive printk?).
	 */
	if (!spin) {
		printk_safe_exit_irqrestore(flags);
		return 0;
	}

	/* We spin waiting for the owner to release us */
	spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
	/* Owner will clear console_waiter on hand off */
	while (READ_ONCE(console_waiter))
		cpu_relax();
	spin_release(&console_owner_dep_map, _THIS_IP_);

	printk_safe_exit_irqrestore(flags);
	/*
	 * The owner passed the console lock to us.
	 * Since we did not spin on console lock, annotate
	 * this as a trylock. Otherwise lockdep will
	 * complain.
	 */
	mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_);

	/*
	 * Update @console_may_schedule for trylock because the previous
	 * owner may have been schedulable.
	 */
	console_may_schedule = 0;

	return 1;
}

/*
 * Recursion is tracked separately on each CPU. If NMIs are supported, an
 * additional NMI context per CPU is also separately tracked. Until per-CPU
 * is available, a separate "early tracking" is performed.
 */
static DEFINE_PER_CPU(u8, printk_count);
static u8 printk_count_early;

#ifdef CONFIG_HAVE_NMI
static DEFINE_PER_CPU(u8, printk_count_nmi);
static u8 printk_count_nmi_early;
#endif

/*
 * Recursion is limited to keep the output sane. printk() should not require
 * more than 1 level of recursion (allowing, for example, printk() to trigger
 * a WARN), but a higher value is used in case some printk-internal errors
 * exist, such as the ringbuffer validation checks failing.
 */
#define PRINTK_MAX_RECURSION 3

/*
 * Return a pointer to the dedicated counter for the CPU+context of the
 * caller.
 */
static u8 *__printk_recursion_counter(void)
{
#ifdef CONFIG_HAVE_NMI
	if (in_nmi()) {
		if (printk_percpu_data_ready())
			return this_cpu_ptr(&printk_count_nmi);
		return &printk_count_nmi_early;
	}
#endif
	if (printk_percpu_data_ready())
		return this_cpu_ptr(&printk_count);
	return &printk_count_early;
}

/*
 * Enter recursion tracking. Interrupts are disabled to simplify tracking.
 * The caller must check the boolean return value to see if the recursion is
 * allowed. On failure, interrupts are not disabled.
 *
 * @recursion_ptr must be a variable of type (u8 *) and is the same variable
 * that is passed to printk_exit_irqrestore().
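 *
 * A minimal usage sketch (mirroring vprintk_store() below; the variable
 * names are only for illustration):
 *
 *	u8 *rptr;
 *	unsigned long flags;
 *
 *	if (!printk_enter_irqsave(rptr, flags))
 *		return 0;	// recursion limit hit, drop the message
 *	... do the printk work ...
 *	printk_exit_irqrestore(rptr, flags);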
*/ #define printk_enter_irqsave(recursion_ptr, flags) \ ({ \ bool success = true; \ \ typecheck(u8 *, recursion_ptr); \ local_irq_save(flags); \ (recursion_ptr) = __printk_recursion_counter(); \ if (*(recursion_ptr) > PRINTK_MAX_RECURSION) { \ local_irq_restore(flags); \ success = false; \ } else { \ (*(recursion_ptr))++; \ } \ success; \ }) /* Exit recursion tracking, restoring interrupts. */ #define printk_exit_irqrestore(recursion_ptr, flags) \ do { \ typecheck(u8 *, recursion_ptr); \ (*(recursion_ptr))--; \ local_irq_restore(flags); \ } while (0) int printk_delay_msec __read_mostly; static inline void printk_delay(int level) { boot_delay_msec(level); if (unlikely(printk_delay_msec)) { int m = printk_delay_msec; while (m--) { mdelay(1); touch_nmi_watchdog(); } } } #define CALLER_ID_MASK 0x80000000 static inline u32 printk_caller_id(void) { return in_task() ? task_pid_nr(current) : CALLER_ID_MASK + smp_processor_id(); } #ifdef CONFIG_PRINTK_EXECUTION_CTX /* Store the opposite info than caller_id. */ static u32 printk_caller_id2(void) { return !in_task() ? task_pid_nr(current) : CALLER_ID_MASK + smp_processor_id(); } static pid_t printk_info_get_pid(const struct printk_info *info) { u32 caller_id = info->caller_id; u32 caller_id2 = info->caller_id2; return caller_id & CALLER_ID_MASK ? caller_id2 : caller_id; } static int printk_info_get_cpu(const struct printk_info *info) { u32 caller_id = info->caller_id; u32 caller_id2 = info->caller_id2; return ((caller_id & CALLER_ID_MASK ? caller_id : caller_id2) & ~CALLER_ID_MASK); } #endif /** * printk_parse_prefix - Parse level and control flags. * * @text: The terminated text message. * @level: A pointer to the current level value, will be updated. * @flags: A pointer to the current printk_info flags, will be updated. * * @level may be NULL if the caller is not interested in the parsed value. * Otherwise the variable pointed to by @level must be set to * LOGLEVEL_DEFAULT in order to be updated with the parsed value. * * @flags may be NULL if the caller is not interested in the parsed value. * Otherwise the variable pointed to by @flags will be OR'd with the parsed * value. * * Return: The length of the parsed level and control flags. */ u16 printk_parse_prefix(const char *text, int *level, enum printk_info_flags *flags) { u16 prefix_len = 0; int kern_level; while (*text) { kern_level = printk_get_level(text); if (!kern_level) break; switch (kern_level) { case '0' ... '7': if (level && *level == LOGLEVEL_DEFAULT) *level = kern_level - '0'; break; case 'c': /* KERN_CONT */ if (flags) *flags |= LOG_CONT; } prefix_len += 2; text += 2; } return prefix_len; } __printf(5, 0) static u16 printk_sprint(char *text, u16 size, int facility, enum printk_info_flags *flags, const char *fmt, va_list args) { u16 text_len; text_len = vscnprintf(text, size, fmt, args); /* Mark and strip a trailing newline. */ if (text_len && text[text_len - 1] == '\n') { text_len--; *flags |= LOG_NEWLINE; } /* Strip log level and control flags. 
*/ if (facility == 0) { u16 prefix_len; prefix_len = printk_parse_prefix(text, NULL, NULL); if (prefix_len) { text_len -= prefix_len; memmove(text, text + prefix_len, text_len); } } trace_console(text, text_len); return text_len; } #ifdef CONFIG_PRINTK_EXECUTION_CTX static void printk_store_execution_ctx(struct printk_info *info) { info->caller_id2 = printk_caller_id2(); get_task_comm(info->comm, current); } static void pmsg_load_execution_ctx(struct printk_message *pmsg, const struct printk_info *info) { pmsg->cpu = printk_info_get_cpu(info); pmsg->pid = printk_info_get_pid(info); memcpy(pmsg->comm, info->comm, sizeof(pmsg->comm)); static_assert(sizeof(pmsg->comm) == sizeof(info->comm)); } #else static void printk_store_execution_ctx(struct printk_info *info) {} static void pmsg_load_execution_ctx(struct printk_message *pmsg, const struct printk_info *info) {} #endif __printf(4, 0) int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) { struct prb_reserved_entry e; enum printk_info_flags flags = 0; struct printk_record r; unsigned long irqflags; u16 trunc_msg_len = 0; char prefix_buf[8]; u8 *recursion_ptr; u16 reserve_size; va_list args2; u32 caller_id; u16 text_len; int ret = 0; u64 ts_nsec; if (!printk_enter_irqsave(recursion_ptr, irqflags)) return 0; /* * Since the duration of printk() can vary depending on the message * and state of the ringbuffer, grab the timestamp now so that it is * close to the call of printk(). This provides a more deterministic * timestamp with respect to the caller. */ ts_nsec = local_clock(); caller_id = printk_caller_id(); /* * The sprintf needs to come first since the syslog prefix might be * passed in as a parameter. An extra byte must be reserved so that * later the vscnprintf() into the reserved buffer has room for the * terminating '\0', which is not counted by vsnprintf(). */ va_copy(args2, args); reserve_size = vsnprintf(&prefix_buf[0], sizeof(prefix_buf), fmt, args2) + 1; va_end(args2); if (reserve_size > PRINTKRB_RECORD_MAX) reserve_size = PRINTKRB_RECORD_MAX; /* Extract log level or control flags. */ if (facility == 0) printk_parse_prefix(&prefix_buf[0], &level, &flags); if (level == LOGLEVEL_DEFAULT) level = default_message_loglevel; if (dev_info) flags |= LOG_NEWLINE; if (is_printk_force_console()) flags |= LOG_FORCE_CON; if (flags & LOG_CONT) { prb_rec_init_wr(&r, reserve_size); if (prb_reserve_in_last(&e, prb, &r, caller_id, PRINTKRB_RECORD_MAX)) { text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size, facility, &flags, fmt, args); r.info->text_len += text_len; if (flags & LOG_FORCE_CON) r.info->flags |= LOG_FORCE_CON; if (flags & LOG_NEWLINE) { r.info->flags |= LOG_NEWLINE; prb_final_commit(&e); } else { prb_commit(&e); } ret = text_len; goto out; } } /* * Explicitly initialize the record before every prb_reserve() call. * prb_reserve_in_last() and prb_reserve() purposely invalidate the * structure when they fail. 
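 * (This is why prb_rec_init_wr() is called again below before the
 * second, truncated prb_reserve() attempt.)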
*/ prb_rec_init_wr(&r, reserve_size); if (!prb_reserve(&e, prb, &r)) { /* truncate the message if it is too long for empty buffer */ truncate_msg(&reserve_size, &trunc_msg_len); prb_rec_init_wr(&r, reserve_size + trunc_msg_len); if (!prb_reserve(&e, prb, &r)) goto out; } /* fill message */ text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &flags, fmt, args); if (trunc_msg_len) memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); r.info->text_len = text_len + trunc_msg_len; r.info->facility = facility; r.info->level = level & 7; r.info->flags = flags & 0x1f; r.info->ts_nsec = ts_nsec; r.info->caller_id = caller_id; if (dev_info) memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); printk_store_execution_ctx(r.info); /* A message without a trailing newline can be continued. */ if (!(flags & LOG_NEWLINE)) prb_commit(&e); else prb_final_commit(&e); ret = text_len + trunc_msg_len; out: printk_exit_irqrestore(recursion_ptr, irqflags); return ret; } /* * This acts as a one-way switch to allow legacy consoles to print from * the printk() caller context on a panic CPU. It also attempts to flush * the legacy consoles in this context. */ void printk_legacy_allow_panic_sync(void) { struct console_flush_type ft; legacy_allow_panic_sync = true; printk_get_console_flush_type(&ft); if (ft.legacy_direct) { if (console_trylock()) console_unlock(); } } bool __read_mostly debug_non_panic_cpus; #ifdef CONFIG_PRINTK_CALLER static int __init debug_non_panic_cpus_setup(char *str) { debug_non_panic_cpus = true; pr_info("allow messages from non-panic CPUs in panic()\n"); return 0; } early_param("debug_non_panic_cpus", debug_non_panic_cpus_setup); module_param(debug_non_panic_cpus, bool, 0644); MODULE_PARM_DESC(debug_non_panic_cpus, "allow messages from non-panic CPUs in panic()"); #endif asmlinkage int vprintk_emit(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) { struct console_flush_type ft; int printed_len; /* Suppress unimportant messages after panic happens */ if (unlikely(suppress_printk)) return 0; /* * The messages on the panic CPU are the most important. If * non-panic CPUs are generating any messages, they will be * silently dropped. */ if (panic_on_other_cpu() && !debug_non_panic_cpus && !panic_triggering_all_cpu_backtrace) return 0; printk_get_console_flush_type(&ft); /* If called from the scheduler, we can not call up(). */ if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; ft.legacy_offload |= ft.legacy_direct && !console_irqwork_blocked; ft.legacy_direct = false; } printk_delay(level); printed_len = vprintk_store(facility, level, dev_info, fmt, args); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.nbcon_offload) nbcon_kthreads_wake(); if (ft.legacy_direct) { /* * The caller may be holding system-critical or * timing-sensitive locks. Disable preemption during * printing of all remaining records to all consoles so that * this context can return as soon as possible. Hopefully * another printk() caller will take over the printing. */ preempt_disable(); /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers. With the * spinning variant, this context tries to take over the * printing from another printing context. 
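 * (console_unlock() performs the actual flushing: with legacy_direct
 * set it ends up in __console_flush_and_unlock() further below.)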
*/ if (console_trylock_spinning()) console_unlock(); preempt_enable(); } if (ft.legacy_offload) defer_console_output(); else if (!console_irqwork_blocked) wake_up_klogd(); return printed_len; } EXPORT_SYMBOL(vprintk_emit); int vprintk_default(const char *fmt, va_list args) { return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args); } EXPORT_SYMBOL_GPL(vprintk_default); asmlinkage __visible int _printk(const char *fmt, ...) { va_list args; int r; va_start(args, fmt); r = vprintk(fmt, args); va_end(args); return r; } EXPORT_SYMBOL(_printk); static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress); #else /* CONFIG_PRINTK */ #define printk_time false #define prb_read_valid(rb, seq, r) false #define prb_first_valid_seq(rb) 0 #define prb_next_seq(rb) 0 static u64 syslog_seq; static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } #endif /* CONFIG_PRINTK */ #ifdef CONFIG_EARLY_PRINTK struct console *early_console; asmlinkage __visible void early_printk(const char *fmt, ...) { va_list ap; char buf[512]; int n; if (!early_console) return; va_start(ap, fmt); n = vscnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); early_console->write(early_console, buf, n); } #endif static void set_user_specified(struct console_cmdline *c, bool user_specified) { if (!user_specified) return; /* * @c console was defined by the user on the command line. * Do not clear when added twice also by SPCR or the device tree. */ c->user_specified = true; /* At least one console defined by the user on the command line. */ console_set_on_cmdline = 1; } static int __add_preferred_console(const char *name, const short idx, const char *devname, char *options, char *brl_options, bool user_specified) { struct console_cmdline *c; int i; if (!name && !devname) return -EINVAL; /* * We use a signed short index for struct console for device drivers to * indicate a not yet assigned index or port. However, a negative index * value is not valid when the console name and index are defined on * the command line. */ if (name && idx < 0) return -EINVAL; /* * See if this tty is not yet registered, and * if we have a slot free. */ for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]); i++, c++) { if ((name && strcmp(c->name, name) == 0 && c->index == idx) || (devname && strcmp(c->devname, devname) == 0)) { if (!brl_options) preferred_console = i; set_user_specified(c, user_specified); return 0; } } if (i == MAX_CMDLINECONSOLES) return -E2BIG; if (!brl_options) preferred_console = i; if (name) strscpy(c->name, name); if (devname) strscpy(c->devname, devname); c->options = options; set_user_specified(c, user_specified); braille_set_options(c, brl_options); c->index = idx; return 0; } static int __init console_msg_format_setup(char *str) { if (!strcmp(str, "syslog")) console_msg_format = MSG_FORMAT_SYSLOG; if (!strcmp(str, "default")) console_msg_format = MSG_FORMAT_DEFAULT; return 1; } __setup("console_msg_format=", console_msg_format_setup); /* * Set up a console. Called via do_early_param() in init/main.c * for each "console=" parameter in the boot command line. */ static int __init console_setup(char *str) { static_assert(sizeof(console_cmdline[0].devname) >= sizeof(console_cmdline[0].name) + 4); char buf[sizeof(console_cmdline[0].devname)]; char *brl_options = NULL; char *ttyname = NULL; char *devname = NULL; char *options; char *s; int idx; /* * console="" or console=null have been suggested as a way to * disable console output. 
Use ttynull that has been created * for exactly this purpose. */ if (str[0] == 0 || strcmp(str, "null") == 0) { __add_preferred_console("ttynull", 0, NULL, NULL, NULL, true); return 1; } if (_braille_console_setup(&str, &brl_options)) return 1; /* For a DEVNAME:0.0 style console the character device is unknown early */ if (strchr(str, ':')) devname = buf; else ttyname = buf; /* * Decode str into name, index, options. */ if (ttyname && isdigit(str[0])) scnprintf(buf, sizeof(buf), "ttyS%s", str); else strscpy(buf, str); options = strchr(str, ','); if (options) *(options++) = 0; #ifdef __sparc__ if (!strcmp(str, "ttya")) strscpy(buf, "ttyS0"); if (!strcmp(str, "ttyb")) strscpy(buf, "ttyS1"); #endif for (s = buf; *s; s++) if ((ttyname && isdigit(*s)) || *s == ',') break; /* @idx will get defined when devname matches. */ if (devname) idx = -1; else idx = simple_strtoul(s, NULL, 10); *s = 0; __add_preferred_console(ttyname, idx, devname, options, brl_options, true); return 1; } __setup("console=", console_setup); /** * add_preferred_console - add a device to the list of preferred consoles. * @name: device name * @idx: device index * @options: options for this console * * The last preferred console added will be used for kernel messages * and stdin/out/err for init. Normally this is used by console_setup * above to handle user-supplied console arguments; however it can also * be used by arch-specific code either to override the user or more * commonly to provide a default console (ie from PROM variables) when * the user has not supplied one. */ int add_preferred_console(const char *name, const short idx, char *options) { return __add_preferred_console(name, idx, NULL, options, NULL, false); } /** * match_devname_and_update_preferred_console - Update a preferred console * when matching devname is found. * @devname: DEVNAME:0.0 style device name * @name: Name of the corresponding console driver, e.g. "ttyS" * @idx: Console index, e.g. port number. * * The function checks whether a device with the given @devname is * preferred via the console=DEVNAME:0.0 command line option. * It fills the missing console driver name and console index * so that a later register_console() call could find (match) * and enable this device. * * It might be used when a driver subsystem initializes particular * devices with already known DEVNAME:0.0 style names. And it * could predict which console driver name and index this device * would later get associated with. * * Return: 0 on success, negative error code on failure. 
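 *
 * A hypothetical example (the DEVNAME string is made up): a driver that
 * created the device "serial8250.0:0.0" and knows it will register as
 * ttyS0 could call
 *
 *	match_devname_and_update_preferred_console("serial8250.0:0.0",
 *						    "ttyS", 0);
 *
 * so that an earlier console=DEVNAME:0.0 option later matches ttyS0.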
*/ int match_devname_and_update_preferred_console(const char *devname, const char *name, const short idx) { struct console_cmdline *c = console_cmdline; int i; if (!devname || !strlen(devname) || !name || !strlen(name) || idx < 0) return -EINVAL; for (i = 0; i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]); i++, c++) { if (!strcmp(devname, c->devname)) { pr_info("associate the preferred console \"%s\" with \"%s%d\"\n", devname, name, idx); strscpy(c->name, name); c->index = idx; return 0; } } return -ENOENT; } EXPORT_SYMBOL_GPL(match_devname_and_update_preferred_console); bool console_suspend_enabled = true; EXPORT_SYMBOL(console_suspend_enabled); static int __init console_suspend_disable(char *str) { console_suspend_enabled = false; return 1; } __setup("no_console_suspend", console_suspend_disable); module_param_named(console_suspend, console_suspend_enabled, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(console_suspend, "suspend console during suspend" " and hibernate operations"); static bool printk_console_no_auto_verbose; void console_verbose(void) { if (console_loglevel && !printk_console_no_auto_verbose) console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; } EXPORT_SYMBOL_GPL(console_verbose); module_param_named(console_no_auto_verbose, printk_console_no_auto_verbose, bool, 0644); MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to highest on oops/panic/etc"); /** * console_suspend_all - suspend the console subsystem * * This disables printk() while we go into suspend states */ void console_suspend_all(void) { struct console *con; if (console_suspend_enabled) pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); /* * Flush any console backlog and then avoid queueing irq_work until * console_resume_all(). Until then deferred printing is no longer * triggered, NBCON consoles transition to atomic flushing, and * any klogd waiters are not triggered. */ pr_flush(1000, true); console_irqwork_blocked = true; if (!console_suspend_enabled) return; console_list_lock(); for_each_console(con) console_srcu_write_flags(con, con->flags | CON_SUSPENDED); console_list_unlock(); /* * Ensure that all SRCU list walks have completed. All printing * contexts must be able to see that they are suspended so that it * is guaranteed that all printing has stopped when this function * completes. */ synchronize_srcu(&console_srcu); } void console_resume_all(void) { struct console_flush_type ft; struct console *con; /* * Allow queueing irq_work. After restoring console state, deferred * printing and any klogd waiters need to be triggered in case there * is now a console backlog. */ console_irqwork_blocked = false; if (console_suspend_enabled) { console_list_lock(); for_each_console(con) console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED); console_list_unlock(); /* * Ensure that all SRCU list walks have completed. All printing * contexts must be able to see they are no longer suspended so * that they are guaranteed to wake up and resume printing. */ synchronize_srcu(&console_srcu); } printk_get_console_flush_type(&ft); if (ft.nbcon_offload) nbcon_kthreads_wake(); if (ft.legacy_offload) defer_console_output(); else wake_up_klogd(); pr_flush(1000, true); } /** * console_cpu_notify - print deferred console messages after CPU hotplug * @cpu: unused * * If printk() is called from a CPU that is not online yet, the messages * will be printed on the console only if there are CON_ANYTIME consoles. 
* This function is called when a new CPU comes online (or fails to come * up) or goes offline. */ static int console_cpu_notify(unsigned int cpu) { struct console_flush_type ft; if (!cpuhp_tasks_frozen) { printk_get_console_flush_type(&ft); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.legacy_direct) { if (console_trylock()) console_unlock(); } } return 0; } /** * console_lock - block the console subsystem from printing * * Acquires a lock which guarantees that no consoles will * be in or enter their write() callback. * * Can sleep, returns nothing. */ void console_lock(void) { might_sleep(); /* On panic, the console_lock must be left to the panic cpu. */ while (panic_on_other_cpu()) msleep(1000); down_console_sem(); console_locked = 1; console_may_schedule = 1; } EXPORT_SYMBOL(console_lock); /** * console_trylock - try to block the console subsystem from printing * * Try to acquire a lock which guarantees that no consoles will * be in or enter their write() callback. * * returns 1 on success, and 0 on failure to acquire the lock. */ int console_trylock(void) { /* On panic, the console_lock must be left to the panic cpu. */ if (panic_on_other_cpu()) return 0; if (down_trylock_console_sem()) return 0; console_locked = 1; console_may_schedule = 0; return 1; } EXPORT_SYMBOL(console_trylock); int is_console_locked(void) { return console_locked; } EXPORT_SYMBOL(is_console_locked); static void __console_unlock(void) { console_locked = 0; up_console_sem(); } #ifdef CONFIG_PRINTK /* * Prepend the message in @pmsg->pbufs->outbuf. This is achieved by shifting * the existing message over and inserting the scratchbuf message. * * @pmsg is the original printk message. * @fmt is the printf format of the message which will prepend the existing one. * * If there is not enough space in @pmsg->pbufs->outbuf, the existing * message text will be sufficiently truncated. * * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated. */ __printf(2, 3) static void console_prepend_message(struct printk_message *pmsg, const char *fmt, ...) { struct printk_buffers *pbufs = pmsg->pbufs; const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); const size_t outbuf_sz = sizeof(pbufs->outbuf); char *scratchbuf = &pbufs->scratchbuf[0]; char *outbuf = &pbufs->outbuf[0]; va_list args; size_t len; va_start(args, fmt); len = vscnprintf(scratchbuf, scratchbuf_sz, fmt, args); va_end(args); /* * Make sure outbuf is sufficiently large before prepending. * Keep at least the prefix when the message must be truncated. * It is a rather theoretical problem when someone tries to * use a minimalist buffer. */ if (WARN_ON_ONCE(len + PRINTK_PREFIX_MAX >= outbuf_sz)) return; if (pmsg->outbuf_len + len >= outbuf_sz) { /* Truncate the message, but keep it terminated. */ pmsg->outbuf_len = outbuf_sz - (len + 1); outbuf[pmsg->outbuf_len] = 0; } memmove(outbuf + len, outbuf, pmsg->outbuf_len + 1); memcpy(outbuf, scratchbuf, len); pmsg->outbuf_len += len; } /* * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". * @pmsg->outbuf_len is updated appropriately. * * @pmsg is the printk message to prepend. * * @dropped is the dropped count to report in the dropped message. */ void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) { console_prepend_message(pmsg, "** %lu printk messages dropped **\n", dropped); } /* * Prepend the message in @pmsg->pbufs->outbuf with a "replay message". * @pmsg->outbuf_len is updated appropriately. * * @pmsg is the printk message to prepend. 
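 * (Typically used when a console lost ownership of the device while in
 * the middle of a record, e.g. during an nbcon takeover, and therefore
 * prints the record again.)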
*/ void console_prepend_replay(struct printk_message *pmsg) { console_prepend_message(pmsg, "** replaying previous printk message **\n"); } /* * Read and format the specified record (or a later record if the specified * record is not available). * * @pmsg will contain the formatted result. @pmsg->pbufs must point to a * struct printk_buffers. * * @seq is the record to read and format. If it is not available, the next * valid record is read. * * @is_extended specifies if the message should be formatted for extended * console output. * * @may_supress specifies if records may be skipped based on loglevel. * * Returns false if no record is available. Otherwise true and all fields * of @pmsg are valid. (See the documentation of struct printk_message * for information about the @pmsg fields.) */ bool printk_get_next_message(struct printk_message *pmsg, u64 seq, bool is_extended, bool may_suppress) { struct printk_buffers *pbufs = pmsg->pbufs; const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); const size_t outbuf_sz = sizeof(pbufs->outbuf); char *scratchbuf = &pbufs->scratchbuf[0]; char *outbuf = &pbufs->outbuf[0]; struct printk_info info; struct printk_record r; size_t len = 0; bool force_con; /* * Formatting extended messages requires a separate buffer, so use the * scratch buffer to read in the ringbuffer text. * * Formatting normal messages is done in-place, so read the ringbuffer * text directly into the output buffer. */ if (is_extended) prb_rec_init_rd(&r, &info, scratchbuf, scratchbuf_sz); else prb_rec_init_rd(&r, &info, outbuf, outbuf_sz); if (!prb_read_valid(prb, seq, &r)) return false; pmsg->seq = r.info->seq; pmsg->dropped = r.info->seq - seq; force_con = r.info->flags & LOG_FORCE_CON; pmsg_load_execution_ctx(pmsg, r.info); /* * Skip records that are not forced to be printed on consoles and that * has level above the console loglevel. */ if (!force_con && may_suppress && suppress_message_printing(r.info->level)) goto out; if (is_extended) { len = info_print_ext_header(outbuf, outbuf_sz, r.info); len += msg_print_ext_body(outbuf + len, outbuf_sz - len, &r.text_buf[0], r.info->text_len, &r.info->dev_info); } else { len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); } out: pmsg->outbuf_len = len; return true; } /* * The legacy console always acquires a spinlock_t from its printing * callback. This violates lock nesting if the caller acquired an always * spinning lock (raw_spinlock_t) while invoking printk(). This is not a * problem on PREEMPT_RT because legacy consoles print always from a * dedicated thread and never from within printk(). Therefore we tell * lockdep that a sleeping spin lock (spinlock_t) is valid here. */ #ifdef CONFIG_PREEMPT_RT static inline void printk_legacy_allow_spinlock_enter(void) { } static inline void printk_legacy_allow_spinlock_exit(void) { } #else static DEFINE_WAIT_OVERRIDE_MAP(printk_legacy_map, LD_WAIT_CONFIG); static inline void printk_legacy_allow_spinlock_enter(void) { lock_map_acquire_try(&printk_legacy_map); } static inline void printk_legacy_allow_spinlock_exit(void) { lock_map_release(&printk_legacy_map); } #endif /* CONFIG_PREEMPT_RT */ /* * Used as the printk buffers for non-panic, serialized console printing. * This is for legacy (!CON_NBCON) as well as all boot (CON_BOOT) consoles. * Its usage requires the console_lock held. */ struct printk_buffers printk_shared_pbufs; /* * Print one record for the given console. The record printed is whatever * record is the next available record for the given console. 
* * @handover will be set to true if a printk waiter has taken over the * console_lock, in which case the caller is no longer holding both the * console_lock and the SRCU read lock. Otherwise it is set to false. * * @cookie is the cookie from the SRCU read lock. * * Returns false if the given console has no next record to print, otherwise * true. * * Requires the console_lock and the SRCU read lock. */ static bool console_emit_next_record(struct console *con, bool *handover, int cookie) { bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED; char *outbuf = &printk_shared_pbufs.outbuf[0]; struct printk_message pmsg = { .pbufs = &printk_shared_pbufs, }; unsigned long flags; *handover = false; if (!printk_get_next_message(&pmsg, con->seq, is_extended, true)) return false; con->dropped += pmsg.dropped; /* Skip messages of formatted length 0. */ if (pmsg.outbuf_len == 0) { con->seq = pmsg.seq + 1; goto skip; } if (con->dropped && !is_extended) { console_prepend_dropped(&pmsg, con->dropped); con->dropped = 0; } /* Write everything out to the hardware. */ if (force_legacy_kthread() && !panic_in_progress()) { /* * With forced threading this function is in a task context * (either legacy kthread or get_init_console_seq()). There * is no need for concern about printk reentrance, handovers, * or lockdep complaints. */ con->write(con, outbuf, pmsg.outbuf_len); con->seq = pmsg.seq + 1; } else { /* * While actively printing out messages, if another printk() * were to occur on another CPU, it may wait for this one to * finish. This task can not be preempted if there is a * waiter waiting to take over. * * Interrupts are disabled because the hand over to a waiter * must not be interrupted until the hand over is completed * (@console_waiter is cleared). */ printk_safe_enter_irqsave(flags); console_lock_spinning_enable(); /* Do not trace print latency. */ stop_critical_timings(); printk_legacy_allow_spinlock_enter(); con->write(con, outbuf, pmsg.outbuf_len); printk_legacy_allow_spinlock_exit(); start_critical_timings(); con->seq = pmsg.seq + 1; *handover = console_lock_spinning_disable_and_check(cookie); printk_safe_exit_irqrestore(flags); } skip: return true; } #else static bool console_emit_next_record(struct console *con, bool *handover, int cookie) { *handover = false; return false; } static inline void printk_kthreads_check_locked(void) { } #endif /* CONFIG_PRINTK */ /* * Print out one record for each console. * * @do_cond_resched is set by the caller. It can be true only in schedulable * context. * * @next_seq is set to the sequence number after the last available record. * The value is valid only when all usable consoles were flushed. It is * when the function returns true (can do the job) and @try_again parameter * is set to false, see below. * * @handover will be set to true if a printk waiter has taken over the * console_lock, in which case the caller is no longer holding the * console_lock. Otherwise it is set to false. * * @try_again will be set to true when it still makes sense to call this * function again. The function could do the job, see the return value. * And some consoles still make progress. * * Returns true when the function could do the job. Some consoles are usable, * and there was no takeover and no panic_on_other_cpu(). * * Requires the console_lock. 
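 *
 * (Callers such as console_flush_all() and the legacy printer thread
 * simply invoke this in a loop for as long as @try_again stays true.)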
*/ static bool console_flush_one_record(bool do_cond_resched, u64 *next_seq, bool *handover, bool *try_again) { struct console_flush_type ft; bool any_usable = false; struct console *con; int cookie; *try_again = false; printk_get_console_flush_type(&ft); cookie = console_srcu_read_lock(); for_each_console_srcu(con) { short flags = console_srcu_read_flags(con); u64 printk_seq; bool progress; /* * console_flush_one_record() is only responsible for * nbcon consoles when the nbcon consoles cannot print via * their atomic or threaded flushing. */ if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload)) continue; if (!console_is_usable(con, flags, !do_cond_resched)) continue; any_usable = true; if (flags & CON_NBCON) { progress = nbcon_legacy_emit_next_record(con, handover, cookie, !do_cond_resched); printk_seq = nbcon_seq_read(con); } else { progress = console_emit_next_record(con, handover, cookie); printk_seq = con->seq; } /* * If a handover has occurred, the SRCU read lock * is already released. */ if (*handover) goto fail; /* Track the next of the highest seq flushed. */ if (printk_seq > *next_seq) *next_seq = printk_seq; if (!progress) continue; /* * An usable console made a progress. There might still be * pending messages. */ *try_again = true; /* Allow panic_cpu to take over the consoles safely. */ if (panic_on_other_cpu()) goto fail_srcu; if (do_cond_resched) cond_resched(); } console_srcu_read_unlock(cookie); return any_usable; fail_srcu: console_srcu_read_unlock(cookie); fail: *try_again = false; return false; } /* * Print out all remaining records to all consoles. * * @do_cond_resched is set by the caller. It can be true only in schedulable * context. * * @next_seq is set to the sequence number after the last available record. * The value is valid only when this function returns true. It means that all * usable consoles are completely flushed. * * @handover will be set to true if a printk waiter has taken over the * console_lock, in which case the caller is no longer holding the * console_lock. Otherwise it is set to false. * * Returns true when there was at least one usable console and all messages * were flushed to all usable consoles. A returned false informs the caller * that everything was not flushed (either there were no usable consoles or * another context has taken over printing or it is a panic situation and this * is not the panic CPU). Regardless the reason, the caller should assume it * is not useful to immediately try again. * * Requires the console_lock. */ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover) { bool try_again; bool ret; *next_seq = 0; *handover = false; do { ret = console_flush_one_record(do_cond_resched, next_seq, handover, &try_again); } while (try_again); return ret; } static void __console_flush_and_unlock(void) { bool do_cond_resched; bool handover; bool flushed; u64 next_seq; /* * Console drivers are called with interrupts disabled, so * @console_may_schedule should be cleared before; however, we may * end up dumping a lot of lines, for example, if called from * console registration path, and should invoke cond_resched() * between lines if allowable. Not doing so can cause a very long * scheduling stall on a slow console leading to RCU stall and * softlockup warnings which exacerbate the issue with more * messages practically incapacitating the system. Therefore, create * a local to use for the printing loop. 
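 * (A local copy is also needed because @console_may_schedule is reset
 * on every loop iteration below and may be overwritten whenever the
 * lock is re-taken via console_trylock().)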
*/ do_cond_resched = console_may_schedule; do { console_may_schedule = 0; flushed = console_flush_all(do_cond_resched, &next_seq, &handover); if (!handover) __console_unlock(); /* * Abort if there was a failure to flush all messages to all * usable consoles. Either it is not possible to flush (in * which case it would be an infinite loop of retrying) or * another context has taken over printing. */ if (!flushed) break; /* * Some context may have added new records after * console_flush_all() but before unlocking the console. * Re-check if there is a new record to flush. If the trylock * fails, another context is already handling the printing. */ } while (prb_read_valid(prb, next_seq, NULL) && console_trylock()); } /** * console_unlock - unblock the legacy console subsystem from printing * * Releases the console_lock which the caller holds to block printing of * the legacy console subsystem. * * While the console_lock was held, console output may have been buffered * by printk(). If this is the case, console_unlock() emits the output on * legacy consoles prior to releasing the lock. * * console_unlock(); may be called from any context. */ void console_unlock(void) { struct console_flush_type ft; printk_get_console_flush_type(&ft); if (ft.legacy_direct) __console_flush_and_unlock(); else __console_unlock(); } EXPORT_SYMBOL(console_unlock); void console_unblank(void) { bool found_unblank = false; struct console *c; int cookie; /* * First check if there are any consoles implementing the unblank() * callback. If not, there is no reason to continue and take the * console lock, which in particular can be dangerous if * @oops_in_progress is set. */ cookie = console_srcu_read_lock(); for_each_console_srcu(c) { if (!console_is_usable(c, console_srcu_read_flags(c), true)) continue; if (c->unblank) { found_unblank = true; break; } } console_srcu_read_unlock(cookie); if (!found_unblank) return; /* * Stop console printing because the unblank() callback may * assume the console is not within its write() callback. * * If @oops_in_progress is set, this may be an atomic context. * In that case, attempt a trylock as best-effort. */ if (oops_in_progress) { /* Semaphores are not NMI-safe. */ if (in_nmi()) return; /* * Attempting to trylock the console lock can deadlock * if another CPU was stopped while modifying the * semaphore. "Hope and pray" that this is not the * current situation. */ if (down_trylock_console_sem() != 0) return; } else console_lock(); console_locked = 1; console_may_schedule = 0; cookie = console_srcu_read_lock(); for_each_console_srcu(c) { if (!console_is_usable(c, console_srcu_read_flags(c), true)) continue; if (c->unblank) c->unblank(); } console_srcu_read_unlock(cookie); console_unlock(); if (!oops_in_progress) pr_flush(1000, true); } /* * Rewind all consoles to the oldest available record. * * IMPORTANT: The function is safe only when called under * console_lock(). It is not enforced because * it is used as a best effort in panic(). */ static void __console_rewind_all(void) { struct console *c; short flags; int cookie; u64 seq; seq = prb_first_valid_seq(prb); cookie = console_srcu_read_lock(); for_each_console_srcu(c) { flags = console_srcu_read_flags(c); if (flags & CON_NBCON) { nbcon_seq_force(c, seq); } else { /* * This assignment is safe only when called under * console_lock(). On panic, legacy consoles are * only best effort. 
*/ c->seq = seq; } } console_srcu_read_unlock(cookie); } /** * console_flush_on_panic - flush console content on panic * @mode: flush all messages in buffer or just the pending ones * * Immediately output all pending messages no matter what. */ void console_flush_on_panic(enum con_flush_mode mode) { struct console_flush_type ft; bool handover; u64 next_seq; /* * Ignore the console lock and flush out the messages. Attempting a * trylock would not be useful because: * * - if it is contended, it must be ignored anyway * - console_lock() and console_trylock() block and fail * respectively in panic for non-panic CPUs * - semaphores are not NMI-safe */ /* * If another context is holding the console lock, * @console_may_schedule might be set. Clear it so that * this context does not call cond_resched() while flushing. */ console_may_schedule = 0; if (mode == CONSOLE_REPLAY_ALL) __console_rewind_all(); printk_get_console_flush_type(&ft); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); /* Flush legacy consoles once allowed, even when dangerous. */ if (legacy_allow_panic_sync) console_flush_all(false, &next_seq, &handover); } /* * Return the console tty driver structure and its associated index */ struct tty_driver *console_device(int *index) { struct console *c; struct tty_driver *driver = NULL; int cookie; /* * Take console_lock to serialize device() callback with * other console operations. For example, fg_console is * modified under console_lock when switching vt. */ console_lock(); cookie = console_srcu_read_lock(); for_each_console_srcu(c) { if (!c->device) continue; driver = c->device(c, index); if (driver) break; } console_srcu_read_unlock(cookie); console_unlock(); return driver; } /* * Prevent further output on the passed console device so that (for example) * serial drivers can suspend console output before suspending a port, and can * re-enable output afterwards. */ void console_suspend(struct console *console) { __pr_flush(console, 1000, true); console_list_lock(); console_srcu_write_flags(console, console->flags & ~CON_ENABLED); console_list_unlock(); /* * Ensure that all SRCU list walks have completed. All contexts must * be able to see that this console is disabled so that (for example) * the caller can suspend the port without risk of another context * using the port. */ synchronize_srcu(&console_srcu); } EXPORT_SYMBOL(console_suspend); void console_resume(struct console *console) { struct console_flush_type ft; bool is_nbcon; console_list_lock(); console_srcu_write_flags(console, console->flags | CON_ENABLED); is_nbcon = console->flags & CON_NBCON; console_list_unlock(); /* * Ensure that all SRCU list walks have completed. The related * printing context must be able to see it is enabled so that * it is guaranteed to wake up and resume printing. */ synchronize_srcu(&console_srcu); printk_get_console_flush_type(&ft); if (is_nbcon && ft.nbcon_offload) nbcon_kthread_wake(console); else if (ft.legacy_offload) defer_console_output(); __pr_flush(console, 1000, true); } EXPORT_SYMBOL(console_resume); #ifdef CONFIG_PRINTK static int unregister_console_locked(struct console *console); /* True when system boot is far enough to create printer threads. 
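 * Set once by printk_set_kthreads_ready() via early_initcall() further
 * below and never cleared (__ro_after_init).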
*/ bool printk_kthreads_ready __ro_after_init; static struct task_struct *printk_legacy_kthread; static bool legacy_kthread_should_wakeup(void) { struct console_flush_type ft; struct console *con; bool ret = false; int cookie; if (kthread_should_stop()) return true; printk_get_console_flush_type(&ft); cookie = console_srcu_read_lock(); for_each_console_srcu(con) { short flags = console_srcu_read_flags(con); u64 printk_seq; /* * The legacy printer thread is only responsible for nbcon * consoles when the nbcon consoles cannot print via their * atomic or threaded flushing. */ if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload)) continue; if (!console_is_usable(con, flags, false)) continue; if (flags & CON_NBCON) { printk_seq = nbcon_seq_read(con); } else { /* * It is safe to read @seq because only this * thread context updates @seq. */ printk_seq = con->seq; } if (prb_read_valid(prb, printk_seq, NULL)) { ret = true; break; } } console_srcu_read_unlock(cookie); return ret; } static int legacy_kthread_func(void *unused) { bool try_again; wait_for_event: wait_event_interruptible(legacy_wait, legacy_kthread_should_wakeup()); do { bool handover = false; u64 next_seq = 0; if (kthread_should_stop()) return 0; console_lock(); console_flush_one_record(true, &next_seq, &handover, &try_again); if (!handover) __console_unlock(); } while (try_again); goto wait_for_event; } static bool legacy_kthread_create(void) { struct task_struct *kt; lockdep_assert_console_list_lock_held(); kt = kthread_run(legacy_kthread_func, NULL, "pr/legacy"); if (WARN_ON(IS_ERR(kt))) { pr_err("failed to start legacy printing thread\n"); return false; } printk_legacy_kthread = kt; /* * It is important that console printing threads are scheduled * shortly after a printk call and with generous runtime budgets. */ sched_set_normal(printk_legacy_kthread, -20); return true; } /** * printk_kthreads_shutdown - shutdown all threaded printers * @data: syscore context * * On system shutdown all threaded printers are stopped. This allows printk * to transition back to atomic printing, thus providing a robust mechanism * for the final shutdown/reboot messages to be output. */ static void printk_kthreads_shutdown(void *data) { struct console *con; console_list_lock(); if (printk_kthreads_running) { printk_kthreads_running = false; for_each_console(con) { if (con->flags & CON_NBCON) nbcon_kthread_stop(con); } /* * The threads may have been stopped while printing a * backlog. Flush any records left over. */ nbcon_atomic_flush_pending(); } console_list_unlock(); } static const struct syscore_ops printk_syscore_ops = { .shutdown = printk_kthreads_shutdown, }; static struct syscore printk_syscore = { .ops = &printk_syscore_ops, }; /* * If appropriate, start nbcon kthreads and set @printk_kthreads_running. * If any kthreads fail to start, those consoles are unregistered. * * Must be called under console_list_lock(). */ static void printk_kthreads_check_locked(void) { struct hlist_node *tmp; struct console *con; lockdep_assert_console_list_lock_held(); if (!printk_kthreads_ready) return; /* Start or stop the legacy kthread when needed. */ if (have_legacy_console || have_boot_console) { if (!printk_legacy_kthread && force_legacy_kthread() && !legacy_kthread_create()) { /* * All legacy consoles must be unregistered. If there * are any nbcon consoles, they will set up their own * kthread. 
*/ hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (con->flags & CON_NBCON) continue; unregister_console_locked(con); } } } else if (printk_legacy_kthread) { kthread_stop(printk_legacy_kthread); printk_legacy_kthread = NULL; } /* * Printer threads cannot be started as long as any boot console is * registered because there is no way to synchronize the hardware * registers between boot console code and regular console code. * It can only be known that there will be no new boot consoles when * an nbcon console is registered. */ if (have_boot_console || !have_nbcon_console) { /* Clear flag in case all nbcon consoles unregistered. */ printk_kthreads_running = false; return; } if (printk_kthreads_running) return; hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (!(con->flags & CON_NBCON)) continue; if (!nbcon_kthread_create(con)) unregister_console_locked(con); } printk_kthreads_running = true; } static int __init printk_set_kthreads_ready(void) { register_syscore(&printk_syscore); console_list_lock(); printk_kthreads_ready = true; printk_kthreads_check_locked(); console_list_unlock(); return 0; } early_initcall(printk_set_kthreads_ready); #endif /* CONFIG_PRINTK */ static int __read_mostly keep_bootcon; static int __init keep_bootcon_setup(char *str) { keep_bootcon = 1; pr_info("debug: skip boot console de-registration.\n"); return 0; } early_param("keep_bootcon", keep_bootcon_setup); static int console_call_setup(struct console *newcon, char *options) { int err; if (!newcon->setup) return 0; /* Synchronize with possible boot console. */ console_lock(); err = newcon->setup(newcon, options); console_unlock(); return err; } /* * This is called by register_console() to try to match * the newly registered console with any of the ones selected * by either the command line or add_preferred_console() and * setup/enable it. * * Care need to be taken with consoles that are statically * enabled such as netconsole */ static int try_enable_preferred_console(struct console *newcon, bool user_specified) { struct console_cmdline *c; int i, err; for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]); i++, c++) { /* Console not yet initialized? */ if (!c->name[0]) continue; if (c->user_specified != user_specified) continue; if (!newcon->match || newcon->match(newcon, c->name, c->index, c->options) != 0) { /* default matching */ BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); if (strcmp(c->name, newcon->name) != 0) continue; if (newcon->index >= 0 && newcon->index != c->index) continue; if (newcon->index < 0) newcon->index = c->index; if (_braille_register_console(newcon, c)) return 0; err = console_call_setup(newcon, c->options); if (err) return err; } newcon->flags |= CON_ENABLED; if (i == preferred_console) newcon->flags |= CON_CONSDEV; return 0; } /* * Some consoles, such as pstore and netconsole, can be enabled even * without matching. Accept the pre-enabled consoles only when match() * and setup() had a chance to be called. */ if (newcon->flags & CON_ENABLED && c->user_specified == user_specified) return 0; return -ENOENT; } /* Try to enable the console unconditionally */ static void try_enable_default_console(struct console *newcon) { if (newcon->index < 0) newcon->index = 0; if (console_call_setup(newcon, NULL) != 0) return; newcon->flags |= CON_ENABLED; if (newcon->device) newcon->flags |= CON_CONSDEV; } /* Return the starting sequence number for a newly registered console. 
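 * Consoles with CON_PRINTBUFFER or CON_BOOT start at @syslog_seq so they
 * replay the existing backlog; all others start at the next record that
 * will be added to the ringbuffer.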
*/ static u64 get_init_console_seq(struct console *newcon, bool bootcon_registered) { struct console *con; bool handover; u64 init_seq; if (newcon->flags & (CON_PRINTBUFFER | CON_BOOT)) { /* Get a consistent copy of @syslog_seq. */ mutex_lock(&syslog_lock); init_seq = syslog_seq; mutex_unlock(&syslog_lock); } else { /* Begin with next message added to ringbuffer. */ init_seq = prb_next_seq(prb); /* * If any enabled boot consoles are due to be unregistered * shortly, some may not be caught up and may be the same * device as @newcon. Since it is not known which boot console * is the same device, flush all consoles and, if necessary, * start with the message of the enabled boot console that is * the furthest behind. */ if (bootcon_registered && !keep_bootcon) { /* * Hold the console_lock to stop console printing and * guarantee safe access to console->seq. */ console_lock(); /* * Flush all consoles and set the console to start at * the next unprinted sequence number. */ if (!console_flush_all(true, &init_seq, &handover)) { /* * Flushing failed. Just choose the lowest * sequence of the enabled boot consoles. */ /* * If there was a handover, this context no * longer holds the console_lock. */ if (handover) console_lock(); init_seq = prb_next_seq(prb); for_each_console(con) { u64 seq; if (!(con->flags & CON_BOOT) || !(con->flags & CON_ENABLED)) { continue; } if (con->flags & CON_NBCON) seq = nbcon_seq_read(con); else seq = con->seq; if (seq < init_seq) init_seq = seq; } } console_unlock(); } } return init_seq; } #define console_first() \ hlist_entry(console_list.first, struct console, node) static int unregister_console_locked(struct console *console); /* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to * print any messages that were printed by the kernel before the * console driver was initialized. * * This can happen pretty early during the boot process (because of * early_printk) - sometimes before setup_arch() completes - be careful * of what kernel features are used - they may not be initialised yet. * * There are two types of consoles - bootconsoles (early_printk) and * "real" consoles (everything which is not a bootconsole) which are * handled differently. * - Any number of bootconsoles can be registered at any time. * - As soon as a "real" console is registered, all bootconsoles * will be unregistered automatically. * - Once a "real" console is registered, any attempt to register a * bootconsoles will be rejected */ void register_console(struct console *newcon) { bool use_device_lock = (newcon->flags & CON_NBCON) && newcon->write_atomic; bool bootcon_registered = false; bool realcon_registered = false; struct console *con; unsigned long flags; u64 init_seq; int err; console_list_lock(); for_each_console(con) { if (WARN(con == newcon, "console '%s%d' already registered\n", con->name, con->index)) { goto unlock; } if (con->flags & CON_BOOT) bootcon_registered = true; else realcon_registered = true; } /* Do not register boot consoles when there already is a real one. */ if ((newcon->flags & CON_BOOT) && realcon_registered) { pr_info("Too late to register bootconsole %s%d\n", newcon->name, newcon->index); goto unlock; } if (newcon->flags & CON_NBCON) { /* * Ensure the nbcon console buffers can be allocated * before modifying any global data. */ if (!nbcon_alloc(newcon)) goto unlock; } /* * See if we want to enable this console driver by default. 
* * Nope when a console is preferred by the command line, device * tree, or SPCR. * * The first real console with tty binding (driver) wins. More * consoles might get enabled before the right one is found. * * Note that a console with tty binding will have CON_CONSDEV * flag set and will be first in the list. */ if (preferred_console < 0) { if (hlist_empty(&console_list) || !console_first()->device || console_first()->flags & CON_BOOT) { try_enable_default_console(newcon); } } /* See if this console matches one we selected on the command line */ err = try_enable_preferred_console(newcon, true); /* If not, try to match against the platform default(s) */ if (err == -ENOENT) err = try_enable_preferred_console(newcon, false); /* printk() messages are not printed to the Braille console. */ if (err || newcon->flags & CON_BRL) { if (newcon->flags & CON_NBCON) nbcon_free(newcon); goto unlock; } /* * If we have a bootconsole, and are switching to a real console, * don't print everything out again, since when the boot console, and * the real console are the same physical device, it's annoying to * see the beginning boot messages twice */ if (bootcon_registered && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { newcon->flags &= ~CON_PRINTBUFFER; } newcon->dropped = 0; init_seq = get_init_console_seq(newcon, bootcon_registered); if (newcon->flags & CON_NBCON) { have_nbcon_console = true; nbcon_seq_force(newcon, init_seq); } else { have_legacy_console = true; newcon->seq = init_seq; } if (newcon->flags & CON_BOOT) have_boot_console = true; /* * If another context is actively using the hardware of this new * console, it will not be aware of the nbcon synchronization. This * is a risk that two contexts could access the hardware * simultaneously if this new console is used for atomic printing * and the other context is still using the hardware. * * Use the driver synchronization to ensure that the hardware is not * in use while this new console transitions to being registered. */ if (use_device_lock) newcon->device_lock(newcon, &flags); /* * Put this console in the list - keep the * preferred driver at the head of the list. */ if (hlist_empty(&console_list)) { /* Ensure CON_CONSDEV is always set for the head. */ newcon->flags |= CON_CONSDEV; hlist_add_head_rcu(&newcon->node, &console_list); } else if (newcon->flags & CON_CONSDEV) { /* Only the new head can have CON_CONSDEV set. */ console_srcu_write_flags(console_first(), console_first()->flags & ~CON_CONSDEV); hlist_add_head_rcu(&newcon->node, &console_list); } else { hlist_add_behind_rcu(&newcon->node, console_list.first); } /* * No need to synchronize SRCU here! The caller does not rely * on all contexts being able to see the new console before * register_console() completes. */ /* This new console is now registered. 
*/ if (use_device_lock) newcon->device_unlock(newcon, flags); console_sysfs_notify(); /* * By unregistering the bootconsoles after we enable the real console * we get the "console xxx enabled" message on all the consoles - * boot consoles, real consoles, etc - this is to ensure that end * users know there might be something in the kernel's log buffer that * went to the bootconsole (that they do not see on the real console) */ con_printk(KERN_INFO, newcon, "enabled\n"); if (bootcon_registered && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && !keep_bootcon) { struct hlist_node *tmp; hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (con->flags & CON_BOOT) unregister_console_locked(con); } } /* Changed console list, may require printer threads to start/stop. */ printk_kthreads_check_locked(); unlock: console_list_unlock(); } EXPORT_SYMBOL(register_console); /* Must be called under console_list_lock(). */ static int unregister_console_locked(struct console *console) { bool use_device_lock = (console->flags & CON_NBCON) && console->write_atomic; bool found_legacy_con = false; bool found_nbcon_con = false; bool found_boot_con = false; unsigned long flags; struct console *c; int res; lockdep_assert_console_list_lock_held(); con_printk(KERN_INFO, console, "disabled\n"); res = _braille_unregister_console(console); if (res < 0) return res; if (res > 0) return 0; if (!console_is_registered_locked(console)) res = -ENODEV; else if (console_is_usable(console, console->flags, true)) __pr_flush(console, 1000, true); /* Disable it unconditionally */ console_srcu_write_flags(console, console->flags & ~CON_ENABLED); if (res < 0) return res; /* * Use the driver synchronization to ensure that the hardware is not * in use while this console transitions to being unregistered. */ if (use_device_lock) console->device_lock(console, &flags); hlist_del_init_rcu(&console->node); if (use_device_lock) console->device_unlock(console, flags); /* * <HISTORICAL> * If this isn't the last console and it has CON_CONSDEV set, we * need to set it on the next preferred console. * </HISTORICAL> * * The above makes no sense as there is no guarantee that the next * console has any device attached. Oh well.... */ if (!hlist_empty(&console_list) && console->flags & CON_CONSDEV) console_srcu_write_flags(console_first(), console_first()->flags | CON_CONSDEV); /* * Ensure that all SRCU list walks have completed. All contexts * must not be able to see this console in the list so that any * exit/cleanup routines can be performed safely. */ synchronize_srcu(&console_srcu); /* * With this console gone, the global flags tracking registered * console types may have changed. Update them. */ for_each_console(c) { if (c->flags & CON_BOOT) found_boot_con = true; if (c->flags & CON_NBCON) found_nbcon_con = true; else found_legacy_con = true; } if (!found_boot_con) have_boot_console = found_boot_con; if (!found_legacy_con) have_legacy_console = found_legacy_con; if (!found_nbcon_con) have_nbcon_console = found_nbcon_con; /* @have_nbcon_console must be updated before calling nbcon_free(). */ if (console->flags & CON_NBCON) nbcon_free(console); console_sysfs_notify(); if (console->exit) res = console->exit(console); /* Changed console list, may require printer threads to start/stop. 
*/ printk_kthreads_check_locked(); return res; } int unregister_console(struct console *console) { int res; console_list_lock(); res = unregister_console_locked(console); console_list_unlock(); return res; } EXPORT_SYMBOL(unregister_console); /** * console_force_preferred_locked - force a registered console preferred * @con: The registered console to force preferred. * * Must be called under console_list_lock(). */ void console_force_preferred_locked(struct console *con) { struct console *cur_pref_con; if (!console_is_registered_locked(con)) return; cur_pref_con = console_first(); /* Already preferred? */ if (cur_pref_con == con) return; /* * Delete, but do not re-initialize the entry. This allows the console * to continue to appear registered (via any hlist_unhashed_lockless() * checks), even though it was briefly removed from the console list. */ hlist_del_rcu(&con->node); /* * Ensure that all SRCU list walks have completed so that the console * can be added to the beginning of the console list and its forward * list pointer can be re-initialized. */ synchronize_srcu(&console_srcu); con->flags |= CON_CONSDEV; WARN_ON(!con->device); /* Only the new head can have CON_CONSDEV set. */ console_srcu_write_flags(cur_pref_con, cur_pref_con->flags & ~CON_CONSDEV); hlist_add_head_rcu(&con->node, &console_list); } EXPORT_SYMBOL(console_force_preferred_locked); /* * Initialize the console device. This is called *early*, so * we can't necessarily depend on lots of kernel help here. * Just do some early initializations, and do the complex setup * later. */ void __init console_init(void) { int ret; initcall_t call; initcall_entry_t *ce; #ifdef CONFIG_NULL_TTY_DEFAULT_CONSOLE if (!console_set_on_cmdline) add_preferred_console("ttynull", 0, NULL); #endif /* Setup the default TTY line discipline. */ n_tty_init(); /* * set up the console device so that later boot sequences can * inform about problems etc.. */ ce = __con_initcall_start; trace_initcall_level("console"); while (ce < __con_initcall_end) { call = initcall_from_entry(ce); trace_initcall_start(call); ret = call(); trace_initcall_finish(call, ret); ce++; } } /* * Some boot consoles access data that is in the init section and which will * be discarded after the initcalls have been run. To make sure that no code * will access this data, unregister the boot consoles in a late initcall. * * If for some reason, such as deferred probe or the driver being a loadable * module, the real console hasn't registered yet at this point, there will * be a brief interval in which no messages are logged to the console, which * makes it difficult to diagnose problems that occur during this time. * * To mitigate this problem somewhat, only unregister consoles whose memory * intersects with the init section. Note that all other boot consoles will * get unregistered when the real preferred console is registered. */ static int __init printk_late_init(void) { struct hlist_node *tmp; struct console *con; int ret; console_list_lock(); hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (!(con->flags & CON_BOOT)) continue; /* Check addresses that might be used for enabled consoles. */ if (init_section_intersects(con, sizeof(*con)) || init_section_contains(con->write, 0) || init_section_contains(con->read, 0) || init_section_contains(con->device, 0) || init_section_contains(con->unblank, 0) || init_section_contains(con->data, 0)) { /* * Please, consider moving the reported consoles out * of the init section. 
*/ pr_warn("bootconsole [%s%d] uses init memory and must be disabled even before the real one is ready\n", con->name, con->index); unregister_console_locked(con); } } console_list_unlock(); ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL, console_cpu_notify); WARN_ON(ret < 0); ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online", console_cpu_notify, NULL); WARN_ON(ret < 0); printk_sysctl_init(); return 0; } late_initcall(printk_late_init); #if defined CONFIG_PRINTK /* If @con is specified, only wait for that console. Otherwise wait for all. */ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { unsigned long timeout_jiffies = msecs_to_jiffies(timeout_ms); unsigned long remaining_jiffies = timeout_jiffies; struct console_flush_type ft; struct console *c; u64 last_diff = 0; u64 printk_seq; short flags; int cookie; u64 diff; u64 seq; /* Sorry, pr_flush() will not work this early. */ if (system_state < SYSTEM_SCHEDULING) return false; might_sleep(); seq = prb_next_reserve_seq(prb); /* Flush the consoles so that records up to @seq are printed. */ printk_get_console_flush_type(&ft); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.legacy_direct) { console_lock(); console_unlock(); } for (;;) { unsigned long begin_jiffies; unsigned long slept_jiffies; diff = 0; /* * Hold the console_lock to guarantee safe access to * console->seq. Releasing console_lock flushes more * records in case @seq is still not printed on all * usable consoles. * * Holding the console_lock is not necessary if there * are no legacy or boot consoles. However, such a * console could register at any time. Always hold the * console_lock as a precaution rather than * synchronizing against register_console(). */ console_lock(); cookie = console_srcu_read_lock(); for_each_console_srcu(c) { if (con && con != c) continue; flags = console_srcu_read_flags(c); /* * If consoles are not usable, it cannot be expected * that they make forward progress, so only increment * @diff for usable consoles. */ if (!console_is_usable(c, flags, true) && !console_is_usable(c, flags, false)) { continue; } if (flags & CON_NBCON) { printk_seq = nbcon_seq_read(c); } else { printk_seq = c->seq; } if (printk_seq < seq) diff += seq - printk_seq; } console_srcu_read_unlock(cookie); if (diff != last_diff && reset_on_progress) remaining_jiffies = timeout_jiffies; console_unlock(); /* Note: @diff is 0 if there are no usable consoles. */ if (diff == 0 || remaining_jiffies == 0) break; /* msleep(1) might sleep much longer. Check time by jiffies. */ begin_jiffies = jiffies; msleep(1); slept_jiffies = jiffies - begin_jiffies; remaining_jiffies -= min(slept_jiffies, remaining_jiffies); last_diff = diff; } return (diff == 0); } /** * pr_flush() - Wait for printing threads to catch up. * * @timeout_ms: The maximum time (in ms) to wait. * @reset_on_progress: Reset the timeout if forward progress is seen. * * A value of 0 for @timeout_ms means no waiting will occur. A value of -1 * represents infinite waiting. * * If @reset_on_progress is true, the timeout will be reset whenever any * printer has been seen to make some forward progress. * * Context: Process context. May sleep while acquiring console lock. * Return: true if all usable printers are caught up. 
*/ bool pr_flush(int timeout_ms, bool reset_on_progress) { return __pr_flush(NULL, timeout_ms, reset_on_progress); } /* * Delayed printk version, for scheduler-internal messages: */ #define PRINTK_PENDING_WAKEUP 0x01 #define PRINTK_PENDING_OUTPUT 0x02 static DEFINE_PER_CPU(int, printk_pending); static void wake_up_klogd_work_func(struct irq_work *irq_work) { int pending = this_cpu_xchg(printk_pending, 0); if (pending & PRINTK_PENDING_OUTPUT) { if (force_legacy_kthread()) { if (printk_legacy_kthread) wake_up_interruptible(&legacy_wait); } else { if (console_trylock()) console_unlock(); } } if (pending & PRINTK_PENDING_WAKEUP) wake_up_interruptible(&log_wait); } static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func); static void __wake_up_klogd(int val) { if (!printk_percpu_data_ready()) return; /* * It is not allowed to call this function when console irq_work * is blocked. */ if (WARN_ON_ONCE(console_irqwork_blocked)) return; preempt_disable(); /* * Guarantee any new records can be seen by tasks preparing to wait * before this context checks if the wait queue is empty. * * The full memory barrier within wq_has_sleeper() pairs with the full * memory barrier within set_current_state() of * prepare_to_wait_event(), which is called after ___wait_event() adds * the waiter but before it has checked the wait condition. * * This pairs with devkmsg_read:A and syslog_print:A. */ if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */ (val & PRINTK_PENDING_OUTPUT)) { this_cpu_or(printk_pending, val); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); } preempt_enable(); } /** * wake_up_klogd - Wake kernel logging daemon * * Use this function when new records have been added to the ringbuffer * and the console printing of those records has already occurred or is * known to be handled by some other context. This function will only * wake the logging daemon. * * Context: Any context. */ void wake_up_klogd(void) { __wake_up_klogd(PRINTK_PENDING_WAKEUP); } /** * defer_console_output - Wake kernel logging daemon and trigger * console printing in a deferred context * * Use this function when new records have been added to the ringbuffer, * this context is responsible for console printing those records, but * the current context is not allowed to perform the console printing. * Trigger an irq_work context to perform the console printing. This * function also wakes the logging daemon. * * Context: Any context. */ void defer_console_output(void) { /* * New messages may have been added directly to the ringbuffer * using vprintk_store(), so wake any waiters as well. */ __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT); } /** * printk_trigger_flush - Attempt to flush printk buffer to consoles. * * If possible, flush the printk buffer to all consoles in the caller's * context. If offloading is available, trigger deferred printing. * * This is best effort. Depending on the system state, console states, * and caller context, no actual flushing may result from this call. */ void printk_trigger_flush(void) { struct console_flush_type ft; printk_get_console_flush_type(&ft); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.nbcon_offload) nbcon_kthreads_wake(); if (ft.legacy_direct) { if (console_trylock()) console_unlock(); } if (ft.legacy_offload) defer_console_output(); } int vprintk_deferred(const char *fmt, va_list args) { return vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args); } int _printk_deferred(const char *fmt, ...) 
{ va_list args; int r; va_start(args, fmt); r = vprintk_deferred(fmt, args); va_end(args); return r; } /* * printk rate limiting, lifted from the networking subsystem. * * This enforces a rate limit: not more than 10 kernel messages * every 5s to make a denial-of-service attack impossible. */ DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); int __printk_ratelimit(const char *func) { return ___ratelimit(&printk_ratelimit_state, func); } EXPORT_SYMBOL(__printk_ratelimit); /** * printk_timed_ratelimit - caller-controlled printk ratelimiting * @caller_jiffies: pointer to caller's state * @interval_msecs: minimum interval between prints * * printk_timed_ratelimit() returns true if more than @interval_msecs * milliseconds have elapsed since the last time printk_timed_ratelimit() * returned true. */ bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msecs) { unsigned long elapsed = jiffies - *caller_jiffies; if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs)) return false; *caller_jiffies = jiffies; return true; } EXPORT_SYMBOL(printk_timed_ratelimit); static DEFINE_SPINLOCK(dump_list_lock); static LIST_HEAD(dump_list); /** * kmsg_dump_register - register a kernel log dumper. * @dumper: pointer to the kmsg_dumper structure * * Adds a kernel log dumper to the system. The dump callback in the * structure will be called when the kernel oopses or panics and must be * set. Returns zero on success and %-EINVAL or %-EBUSY otherwise. */ int kmsg_dump_register(struct kmsg_dumper *dumper) { unsigned long flags; int err = -EBUSY; /* The dump callback needs to be set */ if (!dumper->dump) return -EINVAL; spin_lock_irqsave(&dump_list_lock, flags); /* Don't allow registering multiple times */ if (!dumper->registered) { dumper->registered = 1; list_add_tail_rcu(&dumper->list, &dump_list); err = 0; } spin_unlock_irqrestore(&dump_list_lock, flags); return err; } EXPORT_SYMBOL_GPL(kmsg_dump_register); /** * kmsg_dump_unregister - unregister a kmsg dumper. * @dumper: pointer to the kmsg_dumper structure * * Removes a dump device from the system. Returns zero on success and * %-EINVAL otherwise. */ int kmsg_dump_unregister(struct kmsg_dumper *dumper) { unsigned long flags; int err = -EINVAL; spin_lock_irqsave(&dump_list_lock, flags); if (dumper->registered) { dumper->registered = 0; list_del_rcu(&dumper->list); err = 0; } spin_unlock_irqrestore(&dump_list_lock, flags); synchronize_rcu(); return err; } EXPORT_SYMBOL_GPL(kmsg_dump_unregister); static bool always_kmsg_dump; module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason) { switch (reason) { case KMSG_DUMP_PANIC: return "Panic"; case KMSG_DUMP_OOPS: return "Oops"; case KMSG_DUMP_EMERG: return "Emergency"; case KMSG_DUMP_SHUTDOWN: return "Shutdown"; default: return "Unknown"; } } EXPORT_SYMBOL_GPL(kmsg_dump_reason_str); /** * kmsg_dump_desc - dump kernel log to kernel message dumpers. * @reason: the reason (oops, panic etc) for dumping * @desc: a short string to describe what caused the panic or oops. Can be NULL * if no additional description is available. * * Call each of the registered dumper's dump() callback, which can * retrieve the kmsg records with kmsg_dump_get_line() or * kmsg_dump_get_buffer(). 
*/ void kmsg_dump_desc(enum kmsg_dump_reason reason, const char *desc) { struct kmsg_dumper *dumper; struct kmsg_dump_detail detail = { .reason = reason, .description = desc}; rcu_read_lock(); list_for_each_entry_rcu(dumper, &dump_list, list) { enum kmsg_dump_reason max_reason = dumper->max_reason; /* * If client has not provided a specific max_reason, default * to KMSG_DUMP_OOPS, unless always_kmsg_dump was set. */ if (max_reason == KMSG_DUMP_UNDEF) { max_reason = always_kmsg_dump ? KMSG_DUMP_MAX : KMSG_DUMP_OOPS; } if (reason > max_reason) continue; /* invoke dumper which will iterate over records */ dumper->dump(dumper, &detail); } rcu_read_unlock(); } /** * kmsg_dump_get_line - retrieve one kmsg log line * @iter: kmsg dump iterator * @syslog: include the "<4>" prefixes * @line: buffer to copy the line to * @size: maximum size of the buffer * @len: length of line placed into buffer * * Start at the beginning of the kmsg buffer, with the oldest kmsg * record, and copy one record into the provided buffer. * * Consecutive calls will return the next available record moving * towards the end of the buffer with the youngest messages. * * A return value of FALSE indicates that there are no more records to * read. */ bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog, char *line, size_t size, size_t *len) { u64 min_seq = latched_seq_read_nolock(&clear_seq); struct printk_info info; unsigned int line_count; struct printk_record r; size_t l = 0; bool ret = false; if (iter->cur_seq < min_seq) iter->cur_seq = min_seq; prb_rec_init_rd(&r, &info, line, size); /* Read text or count text lines? */ if (line) { if (!prb_read_valid(prb, iter->cur_seq, &r)) goto out; l = record_print_text(&r, syslog, printk_time); } else { if (!prb_read_valid_info(prb, iter->cur_seq, &info, &line_count)) { goto out; } l = get_record_print_text_size(&info, line_count, syslog, printk_time); } iter->cur_seq = r.info->seq + 1; ret = true; out: if (len) *len = l; return ret; } EXPORT_SYMBOL_GPL(kmsg_dump_get_line); /** * kmsg_dump_get_buffer - copy kmsg log lines * @iter: kmsg dump iterator * @syslog: include the "<4>" prefixes * @buf: buffer to copy the line to * @size: maximum size of the buffer * @len_out: length of line placed into buffer * * Start at the end of the kmsg buffer and fill the provided buffer * with as many of the *youngest* kmsg records that fit into it. * If the buffer is large enough, all available kmsg records will be * copied with a single call. * * Consecutive calls will fill the buffer with the next block of * available older records, not including the earlier retrieved ones. * * A return value of FALSE indicates that there are no more records to * read. */ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog, char *buf, size_t size, size_t *len_out) { u64 min_seq = latched_seq_read_nolock(&clear_seq); struct printk_info info; struct printk_record r; u64 seq; u64 next_seq; size_t len = 0; bool ret = false; bool time = printk_time; if (!buf || !size) goto out; if (iter->cur_seq < min_seq) iter->cur_seq = min_seq; if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) { if (info.seq != iter->cur_seq) { /* messages are gone, move to first available one */ iter->cur_seq = info.seq; } } /* last entry */ if (iter->cur_seq >= iter->next_seq) goto out; /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. 
Pass in size-1 * because this function (by way of record_print_text()) will * not write more than size-1 bytes of text into @buf. */ seq = find_first_fitting_seq(iter->cur_seq, iter->next_seq, size - 1, syslog, time); /* * Next kmsg_dump_get_buffer() invocation will dump block of * older records stored right before this one. */ next_seq = seq; prb_rec_init_rd(&r, &info, buf, size); prb_for_each_record(seq, prb, seq, &r) { if (r.info->seq >= iter->next_seq) break; len += record_print_text(&r, syslog, time); /* Adjust record to store to remaining buffer space. */ prb_rec_init_rd(&r, &info, buf + len, size - len); } iter->next_seq = next_seq; ret = true; out: if (len_out) *len_out = len; return ret; } EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); /** * kmsg_dump_rewind - reset the iterator * @iter: kmsg dump iterator * * Reset the dumper's iterator so that kmsg_dump_get_line() and * kmsg_dump_get_buffer() can be called again and used multiple * times within the same dumper.dump() callback. */ void kmsg_dump_rewind(struct kmsg_dump_iter *iter) { iter->cur_seq = latched_seq_read_nolock(&clear_seq); iter->next_seq = prb_next_seq(prb); } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); /** * console_try_replay_all - try to replay kernel log on consoles * * Try to obtain lock on console subsystem and replay all * available records in printk buffer on the consoles. * Does nothing if lock is not obtained. * * Context: Any, except for NMI. */ void console_try_replay_all(void) { struct console_flush_type ft; printk_get_console_flush_type(&ft); if (console_trylock()) { __console_rewind_all(); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.nbcon_offload) nbcon_kthreads_wake(); if (ft.legacy_offload) defer_console_output(); /* Consoles are flushed as part of console_unlock(). */ console_unlock(); } } #endif #ifdef CONFIG_SMP static atomic_t printk_cpu_sync_owner = ATOMIC_INIT(-1); static atomic_t printk_cpu_sync_nested = ATOMIC_INIT(0); bool is_printk_cpu_sync_owner(void) { return (atomic_read(&printk_cpu_sync_owner) == raw_smp_processor_id()); } /** * __printk_cpu_sync_wait() - Busy wait until the printk cpu-reentrant * spinning lock is not owned by any CPU. * * Context: Any context. */ void __printk_cpu_sync_wait(void) { do { cpu_relax(); } while (atomic_read(&printk_cpu_sync_owner) != -1); } EXPORT_SYMBOL(__printk_cpu_sync_wait); /** * __printk_cpu_sync_try_get() - Try to acquire the printk cpu-reentrant * spinning lock. * * If no processor has the lock, the calling processor takes the lock and * becomes the owner. If the calling processor is already the owner of the * lock, this function succeeds immediately. * * Context: Any context. Expects interrupts to be disabled. * Return: 1 on success, otherwise 0. */ int __printk_cpu_sync_try_get(void) { int cpu; int old; cpu = smp_processor_id(); /* * Guarantee loads and stores from this CPU when it is the lock owner * are _not_ visible to the previous lock owner. This pairs with * __printk_cpu_sync_put:B. * * Memory barrier involvement: * * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B, * then __printk_cpu_sync_put:A can never read from * __printk_cpu_sync_try_get:B. 
* * Relies on: * * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B * of the previous CPU * matching * ACQUIRE from __printk_cpu_sync_try_get:A to * __printk_cpu_sync_try_get:B of this CPU */ old = atomic_cmpxchg_acquire(&printk_cpu_sync_owner, -1, cpu); /* LMM(__printk_cpu_sync_try_get:A) */ if (old == -1) { /* * This CPU is now the owner and begins loading/storing * data: LMM(__printk_cpu_sync_try_get:B) */ return 1; } else if (old == cpu) { /* This CPU is already the owner. */ atomic_inc(&printk_cpu_sync_nested); return 1; } return 0; } EXPORT_SYMBOL(__printk_cpu_sync_try_get); /** * __printk_cpu_sync_put() - Release the printk cpu-reentrant spinning lock. * * The calling processor must be the owner of the lock. * * Context: Any context. Expects interrupts to be disabled. */ void __printk_cpu_sync_put(void) { if (atomic_read(&printk_cpu_sync_nested)) { atomic_dec(&printk_cpu_sync_nested); return; } /* * This CPU is finished loading/storing data: * LMM(__printk_cpu_sync_put:A) */ /* * Guarantee loads and stores from this CPU when it was the * lock owner are visible to the next lock owner. This pairs * with __printk_cpu_sync_try_get:A. * * Memory barrier involvement: * * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B, * then __printk_cpu_sync_try_get:B reads from __printk_cpu_sync_put:A. * * Relies on: * * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B * of this CPU * matching * ACQUIRE from __printk_cpu_sync_try_get:A to * __printk_cpu_sync_try_get:B of the next CPU */ atomic_set_release(&printk_cpu_sync_owner, -1); /* LMM(__printk_cpu_sync_put:B) */ } EXPORT_SYMBOL(__printk_cpu_sync_put); #endif /* CONFIG_SMP */ |
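/*
 * Hedged sketch: how the kmsg_dump_register()/kmsg_dump_get_line() interface
 * above is typically consumed by a crash-log driver (pstore does something
 * similar). Only APIs documented in this file are used; the dumper name,
 * the buffer size and what is done with each line are made up for
 * illustration and are not part of any real driver.
 */
#include <linux/kmsg_dump.h>
#include <linux/module.h>

static void example_dump(struct kmsg_dumper *dumper,
			 struct kmsg_dump_detail *detail)
{
	struct kmsg_dump_iter iter;
	char line[200];
	size_t len;

	/* Start at the oldest record and walk towards the newest one. */
	kmsg_dump_rewind(&iter);
	while (kmsg_dump_get_line(&iter, false, line, sizeof(line), &len)) {
		/* A real dumper would persist @line (len bytes) somewhere. */
	}
}

static struct kmsg_dumper example_dumper = {
	.dump	    = example_dump,
	.max_reason = KMSG_DUMP_PANIC,	/* only invoked on panic */
};

static int __init example_dumper_init(void)
{
	/* -EINVAL without a dump callback, -EBUSY if already registered. */
	return kmsg_dump_register(&example_dumper);
}

static void __exit example_dumper_exit(void)
{
	kmsg_dump_unregister(&example_dumper);
}

module_init(example_dumper_init);
module_exit(example_dumper_exit);
MODULE_LICENSE("GPL");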
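/*
 * Hedged sketch: intended use of the cpu-reentrant spinning lock implemented
 * by __printk_cpu_sync_try_get()/__printk_cpu_sync_put() above. The
 * printk_cpu_sync_get_irqsave()/printk_cpu_sync_put_irqrestore() wrappers
 * come from <linux/printk.h> and loop over try_get()/wait() with interrupts
 * disabled; dump_stack() uses the same pattern so that a multi-line report
 * is not interleaved with output from other CPUs. The report contents here
 * are illustrative only.
 */
#include <linux/printk.h>

static void example_dump_report(void)
{
	unsigned long flags;

	printk_cpu_sync_get_irqsave(flags);	/* spin until this CPU owns it */

	pr_warn("example report begin\n");
	pr_warn("... related lines, emitted without interleaving ...\n");
	pr_warn("example report end\n");

	printk_cpu_sync_put_irqrestore(flags);	/* release, restore irqs */
}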
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_GENERIC_BITOPS_LE_H_
#define _ASM_GENERIC_BITOPS_LE_H_

#include <asm/types.h>
#include <asm/byteorder.h>

#if defined(__LITTLE_ENDIAN)
#define BITOP_LE_SWIZZLE	0
#elif defined(__BIG_ENDIAN)
#define BITOP_LE_SWIZZLE	((BITS_PER_LONG-1) & ~0x7)
#endif

static inline int test_bit_le(int nr, const void *addr)
{
	return test_bit(nr ^ BITOP_LE_SWIZZLE, addr);
}

static inline void set_bit_le(int nr, void *addr)
{
	set_bit(nr ^ BITOP_LE_SWIZZLE, addr);
}

static inline void clear_bit_le(int nr, void *addr)
{
	clear_bit(nr ^ BITOP_LE_SWIZZLE, addr);
}

static inline void __set_bit_le(int nr, void *addr)
{
	__set_bit(nr ^ BITOP_LE_SWIZZLE, addr);
}

static inline void __clear_bit_le(int nr, void *addr)
{
	__clear_bit(nr ^ BITOP_LE_SWIZZLE, addr);
}

static inline int test_and_set_bit_le(int nr, void *addr)
{
	return test_and_set_bit(nr ^ BITOP_LE_SWIZZLE, addr);
}

static inline int test_and_clear_bit_le(int nr, void *addr)
{
	return test_and_clear_bit(nr ^ BITOP_LE_SWIZZLE, addr);
}

static inline int __test_and_set_bit_le(int nr, void *addr)
{
	return __test_and_set_bit(nr ^ BITOP_LE_SWIZZLE, addr);
}

static inline int __test_and_clear_bit_le(int nr, void *addr)
{
	return __test_and_clear_bit(nr ^ BITOP_LE_SWIZZLE, addr);
}

#endif /* _ASM_GENERIC_BITOPS_LE_H_ */
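/*
 * Illustration only: shows how the BITOP_LE_SWIZZLE XOR from the header
 * above remaps bit numbers on a 64-bit big-endian machine. This is plain
 * user-space C, not kernel code, and BITS_PER_LONG is assumed to be 64.
 */
#include <stdio.h>

#define BITS_PER_LONG		64
#define BITOP_LE_SWIZZLE	((BITS_PER_LONG - 1) & ~0x7)	/* == 56 */

int main(void)
{
	/*
	 * Little-endian bit 0 lives in the first byte of the bitmap. On a
	 * 64-bit big-endian long that byte holds native bits 56..63, so
	 * 0 ^ 56 == 56, 7 ^ 56 == 63, 8 ^ 56 == 48, and so on: the XOR
	 * reverses the byte index while keeping the bit order inside each
	 * byte, which is exactly what the *_le helpers rely on.
	 */
	for (int nr = 0; nr < 16; nr++)
		printf("le bit %2d -> native bit %2d\n",
		       nr, nr ^ BITOP_LE_SWIZZLE);
	return 0;
}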
| 3 3 2 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/l3mdev/l3mdev.c - L3 master device implementation * Copyright (c) 2015 Cumulus Networks * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com> */ #include <linux/netdevice.h> #include <net/fib_rules.h> #include <net/l3mdev.h> static DEFINE_SPINLOCK(l3mdev_lock); struct l3mdev_handler { lookup_by_table_id_t dev_lookup; }; static struct l3mdev_handler l3mdev_handlers[L3MDEV_TYPE_MAX + 1]; static int l3mdev_check_type(enum l3mdev_type l3type) { if (l3type <= L3MDEV_TYPE_UNSPEC || l3type > L3MDEV_TYPE_MAX) return -EINVAL; return 0; } int l3mdev_table_lookup_register(enum l3mdev_type l3type, lookup_by_table_id_t fn) { struct l3mdev_handler *hdlr; int res; res = l3mdev_check_type(l3type); if (res) return res; hdlr = &l3mdev_handlers[l3type]; spin_lock(&l3mdev_lock); if (hdlr->dev_lookup) { res = -EBUSY; goto unlock; } hdlr->dev_lookup = fn; res = 0; unlock: spin_unlock(&l3mdev_lock); return res; } EXPORT_SYMBOL_GPL(l3mdev_table_lookup_register); void l3mdev_table_lookup_unregister(enum l3mdev_type l3type, lookup_by_table_id_t fn) { struct l3mdev_handler *hdlr; if (l3mdev_check_type(l3type)) return; hdlr = &l3mdev_handlers[l3type]; spin_lock(&l3mdev_lock); if (hdlr->dev_lookup == fn) hdlr->dev_lookup = NULL; spin_unlock(&l3mdev_lock); } EXPORT_SYMBOL_GPL(l3mdev_table_lookup_unregister); int l3mdev_ifindex_lookup_by_table_id(enum l3mdev_type l3type, struct net *net, u32 table_id) { lookup_by_table_id_t lookup; struct l3mdev_handler *hdlr; int ifindex = -EINVAL; int res; res = l3mdev_check_type(l3type); if (res) return res; hdlr = &l3mdev_handlers[l3type]; spin_lock(&l3mdev_lock); lookup = hdlr->dev_lookup; if (!lookup) goto unlock; ifindex = lookup(net, table_id); unlock: spin_unlock(&l3mdev_lock); return ifindex; } EXPORT_SYMBOL_GPL(l3mdev_ifindex_lookup_by_table_id); /** * l3mdev_master_ifindex_rcu - get index of L3 master device * @dev: targeted interface */ int l3mdev_master_ifindex_rcu(const struct net_device *dev) { int ifindex = 0; if (!dev) return 0; if (netif_is_l3_master(dev)) { ifindex = dev->ifindex; } else if (netif_is_l3_slave(dev)) { struct net_device *master; struct net_device *_dev = (struct net_device *)dev; /* netdev_master_upper_dev_get_rcu calls * list_first_or_null_rcu to walk the upper dev list. * list_first_or_null_rcu does not handle a const arg. 
We aren't * making changes, just want the master device from that list so * typecast to remove the const */ master = netdev_master_upper_dev_get_rcu(_dev); if (master) ifindex = master->ifindex; } return ifindex; } EXPORT_SYMBOL_GPL(l3mdev_master_ifindex_rcu); /** * l3mdev_master_upper_ifindex_by_index_rcu - get index of upper l3 master * device * @net: network namespace for device index lookup * @ifindex: targeted interface */ int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex) { struct net_device *dev; dev = dev_get_by_index_rcu(net, ifindex); while (dev && !netif_is_l3_master(dev)) dev = netdev_master_upper_dev_get_rcu(dev); return dev ? dev->ifindex : 0; } EXPORT_SYMBOL_GPL(l3mdev_master_upper_ifindex_by_index_rcu); /** * l3mdev_fib_table_rcu - get FIB table id associated with an L3 * master interface * @dev: targeted interface */ u32 l3mdev_fib_table_rcu(const struct net_device *dev) { u32 tb_id = 0; if (!dev) return 0; if (netif_is_l3_master(dev)) { if (dev->l3mdev_ops->l3mdev_fib_table) tb_id = dev->l3mdev_ops->l3mdev_fib_table(dev); } else if (netif_is_l3_slave(dev)) { /* Users of netdev_master_upper_dev_get_rcu need non-const, * but current inet_*type functions take a const */ struct net_device *_dev = (struct net_device *) dev; const struct net_device *master; master = netdev_master_upper_dev_get_rcu(_dev); if (master && master->l3mdev_ops->l3mdev_fib_table) tb_id = master->l3mdev_ops->l3mdev_fib_table(master); } return tb_id; } EXPORT_SYMBOL_GPL(l3mdev_fib_table_rcu); u32 l3mdev_fib_table_by_index(struct net *net, int ifindex) { struct net_device *dev; u32 tb_id = 0; if (!ifindex) return 0; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (dev) tb_id = l3mdev_fib_table_rcu(dev); rcu_read_unlock(); return tb_id; } EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index); /** * l3mdev_link_scope_lookup - IPv6 route lookup based on flow for link * local and multicast addresses * @net: network namespace for device index lookup * @fl6: IPv6 flow struct for lookup * This function does not hold refcnt on the returned dst. * Caller must hold rcu_read_lock(). 
*/ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6) { struct dst_entry *dst = NULL; struct net_device *dev; WARN_ON_ONCE(!rcu_read_lock_held()); if (fl6->flowi6_oif) { dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); if (dev && netif_is_l3_slave(dev)) dev = netdev_master_upper_dev_get_rcu(dev); if (dev && netif_is_l3_master(dev) && dev->l3mdev_ops->l3mdev_link_scope_lookup) dst = dev->l3mdev_ops->l3mdev_link_scope_lookup(dev, fl6); } return dst; } EXPORT_SYMBOL_GPL(l3mdev_link_scope_lookup); /** * l3mdev_fib_rule_match - Determine if flowi references an * L3 master device * @net: network namespace for device index lookup * @fl: flow struct * @arg: store the table the rule matched with here */ int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, struct fib_lookup_arg *arg) { struct net_device *dev; int rc = 0; /* update flow ensures flowi_l3mdev is set when relevant */ if (!fl->flowi_l3mdev) return 0; rcu_read_lock(); dev = dev_get_by_index_rcu(net, fl->flowi_l3mdev); if (dev && netif_is_l3_master(dev) && dev->l3mdev_ops->l3mdev_fib_table) { arg->table = dev->l3mdev_ops->l3mdev_fib_table(dev); rc = 1; } rcu_read_unlock(); return rc; } void l3mdev_update_flow(struct net *net, struct flowi *fl) { struct net_device *dev; rcu_read_lock(); if (fl->flowi_oif) { dev = dev_get_by_index_rcu(net, fl->flowi_oif); if (dev) { if (!fl->flowi_l3mdev) { fl->flowi_l3mdev = l3mdev_master_ifindex_rcu(dev); fl->flowi_flags |= FLOWI_FLAG_L3MDEV_OIF; } /* oif set to L3mdev directs lookup to its table; * reset to avoid oif match in fib_lookup */ if (netif_is_l3_master(dev)) fl->flowi_oif = 0; goto out; } } if (fl->flowi_iif > LOOPBACK_IFINDEX && !fl->flowi_l3mdev) { dev = dev_get_by_index_rcu(net, fl->flowi_iif); if (dev) fl->flowi_l3mdev = l3mdev_master_ifindex_rcu(dev); } out: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(l3mdev_update_flow); |
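/*
 * Hedged sketch: how an L3 master device driver plugs into the
 * l3mdev_table_lookup_register() interface above. L3MDEV_TYPE_VRF matches
 * how the in-tree VRF driver uses this API; example_dev_lookup() and its
 * (empty) table-to-ifindex mapping are made up for illustration.
 */
#include <linux/errno.h>
#include <linux/init.h>
#include <net/l3mdev.h>

static int example_dev_lookup(struct net *net, u32 table_id)
{
	/* Return the ifindex of the device owning @table_id, or -error. */
	return -ENODEV;
}

static int __init example_l3mdev_init(void)
{
	/* Fails with -EBUSY if another handler already claimed this type. */
	return l3mdev_table_lookup_register(L3MDEV_TYPE_VRF,
					    example_dev_lookup);
}

static void __exit example_l3mdev_exit(void)
{
	l3mdev_table_lookup_unregister(L3MDEV_TYPE_VRF, example_dev_lookup);
}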
| 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* include/asm-generic/tlb.h * * Generic TLB shootdown code * * Copyright 2001 Red Hat, Inc. * Based on code from mm/memory.c Copyright Linus Torvalds and others. 
* * Copyright 2011 Red Hat, Inc., Peter Zijlstra */ #ifndef _ASM_GENERIC__TLB_H #define _ASM_GENERIC__TLB_H #include <linux/mmu_notifier.h> #include <linux/swap.h> #include <linux/hugetlb_inline.h> #include <asm/tlbflush.h> #include <asm/cacheflush.h> /* * Blindly accessing user memory from NMI context can be dangerous * if we're in the middle of switching the current user task or switching * the loaded mm. */ #ifndef nmi_uaccess_okay # define nmi_uaccess_okay() true #endif #ifdef CONFIG_MMU /* * Generic MMU-gather implementation. * * The mmu_gather data structure is used by the mm code to implement the * correct and efficient ordering of freeing pages and TLB invalidations. * * This correct ordering is: * * 1) unhook page * 2) TLB invalidate page * 3) free page * * That is, we must never free a page before we have ensured there are no live * translations left to it. Otherwise it might be possible to observe (or * worse, change) the page content after it has been reused. * * The mmu_gather API consists of: * * - tlb_gather_mmu() / tlb_gather_mmu_fullmm() / tlb_gather_mmu_vma() / * tlb_finish_mmu() * * start and finish a mmu_gather * * Finish in particular will issue a (final) TLB invalidate and free * all (remaining) queued pages. * * - tlb_start_vma() / tlb_end_vma(); marks the start / end of a VMA * * Defaults to flushing at tlb_end_vma() to reset the range; helps when * there's large holes between the VMAs. * * - tlb_free_vmas() * * tlb_free_vmas() marks the start of unlinking of one or more vmas * and freeing page-tables. * * - tlb_remove_table() * * tlb_remove_table() is the basic primitive to free page-table directories * (__p*_free_tlb()). In it's most primitive form it is an alias for * tlb_remove_page() below, for when page directories are pages and have no * additional constraints. * * See also MMU_GATHER_TABLE_FREE and MMU_GATHER_RCU_TABLE_FREE. * * - tlb_remove_page() / tlb_remove_page_size() * - __tlb_remove_folio_pages() / __tlb_remove_page_size() * - __tlb_remove_folio_pages_size() * * __tlb_remove_folio_pages_size() is the basic primitive that queues pages * for freeing. It will return a boolean indicating if the queue is (now) * full and a call to tlb_flush_mmu() is required. * * tlb_remove_page() and tlb_remove_page_size() imply the call to * tlb_flush_mmu() when required and has no return value. * * __tlb_remove_folio_pages() is similar to __tlb_remove_page_size(), * however, instead of removing a single page, assume PAGE_SIZE and remove * the given number of consecutive pages that are all part of the * same (large) folio. * * - tlb_change_page_size() * * call before __tlb_remove_page*() to set the current page-size; implies a * possible tlb_flush_mmu() call. * * - tlb_flush_mmu() / tlb_flush_mmu_tlbonly() * * tlb_flush_mmu_tlbonly() - does the TLB invalidate (and resets * related state, like the range) * * tlb_flush_mmu() - in addition to the above TLB invalidate, also frees * whatever pages are still batched. * * - mmu_gather::fullmm * * A flag set by tlb_gather_mmu_fullmm() to indicate we're going to free * the entire mm; this allows a number of optimizations. * * - We can ignore tlb_{start,end}_vma(); because we don't * care about ranges. Everything will be shot down. * * - (RISC) architectures that use ASIDs can cycle to a new ASID * and delay the invalidation until ASID space runs out. * * - mmu_gather::need_flush_all * * A flag that can be set by the arch code if it wants to force * flush the entire TLB irrespective of the range. 
For instance * x86-PAE needs this when changing top-level entries. * * And allows the architecture to provide and implement tlb_flush(): * * tlb_flush() may, in addition to the above mentioned mmu_gather fields, make * use of: * * - mmu_gather::start / mmu_gather::end * * which provides the range that needs to be flushed to cover the pages to * be freed. * * - mmu_gather::freed_tables * * set when we freed page table pages * * - tlb_get_unmap_shift() / tlb_get_unmap_size() * * returns the smallest TLB entry size unmapped in this range. * * If an architecture does not provide tlb_flush() a default implementation * based on flush_tlb_range() will be used, unless MMU_GATHER_NO_RANGE is * specified, in which case we'll default to flush_tlb_mm(). * * Additionally there are a few opt-in features: * * MMU_GATHER_PAGE_SIZE * * This ensures we call tlb_flush() every time tlb_change_page_size() actually * changes the size and provides mmu_gather::page_size to tlb_flush(). * * This might be useful if your architecture has size specific TLB * invalidation instructions. * * MMU_GATHER_TABLE_FREE * * This provides tlb_remove_table(), to be used instead of tlb_remove_page() * for page directores (__p*_free_tlb()). * * Useful if your architecture has non-page page directories. * * When used, an architecture is expected to provide __tlb_remove_table() or * use the generic __tlb_remove_table(), which does the actual freeing of these * pages. * * MMU_GATHER_RCU_TABLE_FREE * * Like MMU_GATHER_TABLE_FREE, and adds semi-RCU semantics to the free (see * comment below). * * Useful if your architecture doesn't use IPIs for remote TLB invalidates * and therefore doesn't naturally serialize with software page-table walkers. * * MMU_GATHER_NO_FLUSH_CACHE * * Indicates the architecture has flush_cache_range() but it needs *NOT* be called * before unmapping a VMA. * * NOTE: strictly speaking we shouldn't have this knob and instead rely on * flush_cache_range() being a NOP, except Sparc64 seems to be * different here. * * MMU_GATHER_MERGE_VMAS * * Indicates the architecture wants to merge ranges over VMAs; typical when * multiple range invalidates are more expensive than a full invalidate. * * MMU_GATHER_NO_RANGE * * Use this if your architecture lacks an efficient flush_tlb_range(). This * option implies MMU_GATHER_MERGE_VMAS above. * * MMU_GATHER_NO_GATHER * * If the option is set the mmu_gather will not track individual pages for * delayed page free anymore. A platform that enables the option needs to * provide its own implementation of the __tlb_remove_page_size() function to * free pages. * * This is useful if your architecture already flushes TLB entries in the * various ptep_get_and_clear() functions. */ #ifdef CONFIG_MMU_GATHER_TABLE_FREE struct mmu_table_batch { #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE struct rcu_head rcu; #endif unsigned int nr; void *tables[]; }; #define MAX_TABLE_BATCH \ ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *)) #ifndef CONFIG_HAVE_ARCH_TLB_REMOVE_TABLE static inline void __tlb_remove_table(void *table) { struct ptdesc *ptdesc = (struct ptdesc *)table; pagetable_dtor_free(ptdesc); } #endif extern void tlb_remove_table(struct mmu_gather *tlb, void *table); #else /* !CONFIG_MMU_GATHER_TABLE_FREE */ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page); /* * Without MMU_GATHER_TABLE_FREE the architecture is assumed to have page based * page directories and we can use the normal page batching to free them. 
*/ static inline void tlb_remove_table(struct mmu_gather *tlb, void *table) { struct ptdesc *ptdesc = (struct ptdesc *)table; pagetable_dtor(ptdesc); tlb_remove_page(tlb, ptdesc_page(ptdesc)); } #endif /* CONFIG_MMU_GATHER_TABLE_FREE */ #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE /* * This allows an architecture that does not use the linux page-tables for * hardware to skip the TLBI when freeing page tables. */ #ifndef tlb_needs_table_invalidate #define tlb_needs_table_invalidate() (true) #endif void tlb_remove_table_sync_one(void); void tlb_remove_table_sync_rcu(void); #else #ifdef tlb_needs_table_invalidate #error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE #endif static inline void tlb_remove_table_sync_one(void) { } static inline void tlb_remove_table_sync_rcu(void) { } #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */ #ifndef CONFIG_MMU_GATHER_NO_GATHER /* * If we can't allocate a page to make a big batch of page pointers * to work on, then just handle a few from the on-stack structure. */ #define MMU_GATHER_BUNDLE 8 struct mmu_gather_batch { struct mmu_gather_batch *next; unsigned int nr; unsigned int max; struct encoded_page *encoded_pages[]; }; #define MAX_GATHER_BATCH \ ((PAGE_SIZE - sizeof(struct mmu_gather_batch)) / sizeof(void *)) /* * Limit the maximum number of mmu_gather batches to reduce a risk of soft * lockups for non-preemptible kernels on huge machines when a lot of memory * is zapped during unmapping. * 10K pages freed at once should be safe even without a preemption point. */ #define MAX_GATHER_BATCH_COUNT (10000UL/MAX_GATHER_BATCH) extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size); bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, unsigned int nr_pages, bool delay_rmap); #ifdef CONFIG_SMP /* * This both sets 'delayed_rmap', and returns true. It would be an inline * function, except we define it before the 'struct mmu_gather'. */ #define tlb_delay_rmap(tlb) (((tlb)->delayed_rmap = 1), true) extern void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma); #endif #endif /* * We have a no-op version of the rmap removal that doesn't * delay anything. That is used on S390, which flushes remote * TLBs synchronously, and on UP, which doesn't have any * remote TLBs to flush and is not preemptible due to this * all happening under the page table lock. */ #ifndef tlb_delay_rmap #define tlb_delay_rmap(tlb) (false) static inline void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) { } #endif /* * struct mmu_gather is an opaque type used by the mm code for passing around * any data needed by arch specific code for tlb_remove_page. */ struct mmu_gather { struct mm_struct *mm; #ifdef CONFIG_MMU_GATHER_TABLE_FREE struct mmu_table_batch *batch; #endif unsigned long start; unsigned long end; /* * we are in the middle of an operation to clear * a full mm and can make some optimizations */ unsigned int fullmm : 1; /* * we have performed an operation which * requires a complete flush of the tlb */ unsigned int need_flush_all : 1; /* * we have removed page directories */ unsigned int freed_tables : 1; /* * Do we have pending delayed rmap removals? */ unsigned int delayed_rmap : 1; /* * at which levels have we cleared entries? 
*/ unsigned int cleared_ptes : 1; unsigned int cleared_pmds : 1; unsigned int cleared_puds : 1; unsigned int cleared_p4ds : 1; /* * tracks VM_EXEC | VM_HUGETLB in tlb_start_vma */ unsigned int vma_exec : 1; unsigned int vma_huge : 1; unsigned int vma_pfn : 1; /* * Did we unshare (unmap) any shared page tables? For now only * used for hugetlb PMD table sharing. */ unsigned int unshared_tables : 1; /* * Did we unshare any page tables such that they are now exclusive * and could get reused+modified by the new owner? When setting this * flag, "unshared_tables" will be set as well. For now only used * for hugetlb PMD table sharing. */ unsigned int fully_unshared_tables : 1; unsigned int batch_count; #ifndef CONFIG_MMU_GATHER_NO_GATHER struct mmu_gather_batch *active; struct mmu_gather_batch local; struct page *__pages[MMU_GATHER_BUNDLE]; #ifdef CONFIG_MMU_GATHER_PAGE_SIZE unsigned int page_size; #endif #endif }; void tlb_flush_mmu(struct mmu_gather *tlb); static inline void __tlb_adjust_range(struct mmu_gather *tlb, unsigned long address, unsigned int range_size) { tlb->start = min(tlb->start, address); tlb->end = max(tlb->end, address + range_size); } static inline void __tlb_reset_range(struct mmu_gather *tlb) { if (tlb->fullmm) { tlb->start = tlb->end = ~0; } else { tlb->start = TASK_SIZE; tlb->end = 0; } tlb->freed_tables = 0; tlb->cleared_ptes = 0; tlb->cleared_pmds = 0; tlb->cleared_puds = 0; tlb->cleared_p4ds = 0; tlb->unshared_tables = 0; /* * Do not reset mmu_gather::vma_* fields here, we do not * call into tlb_start_vma() again to set them if there is an * intermediate flush. */ } #ifdef CONFIG_MMU_GATHER_NO_RANGE #if defined(tlb_flush) #error MMU_GATHER_NO_RANGE relies on default tlb_flush() #endif /* * When an architecture does not have efficient means of range flushing TLBs * there is no point in doing intermediate flushes on tlb_end_vma() to keep the * range small. We equally don't have to worry about page granularity or other * things. * * All we need to do is issue a full flush for any !0 range. */ static inline void tlb_flush(struct mmu_gather *tlb) { if (tlb->end) flush_tlb_mm(tlb->mm); } #else /* CONFIG_MMU_GATHER_NO_RANGE */ #ifndef tlb_flush /* * When an architecture does not provide its own tlb_flush() implementation * but does have a reasonably efficient flush_vma_range() implementation * use that. */ static inline void tlb_flush(struct mmu_gather *tlb) { if (tlb->fullmm || tlb->need_flush_all) { flush_tlb_mm(tlb->mm); } else if (tlb->end) { struct vm_area_struct vma = { .vm_mm = tlb->mm, .vm_flags = (tlb->vma_exec ? VM_EXEC : 0) | (tlb->vma_huge ? VM_HUGETLB : 0), }; flush_tlb_range(&vma, tlb->start, tlb->end); } } #endif #endif /* CONFIG_MMU_GATHER_NO_RANGE */ static inline void tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { /* * flush_tlb_range() implementations that look at VM_HUGETLB (tile, * mips-4k) flush only large pages. * * flush_tlb_range() implementations that flush I-TLB also flush D-TLB * (tile, xtensa, arm), so it's ok to just add VM_EXEC to an existing * range. * * We rely on tlb_end_vma() to issue a flush, such that when we reset * these values the batch is empty. */ tlb->vma_huge = is_vm_hugetlb_page(vma); tlb->vma_exec = !!(vma->vm_flags & VM_EXEC); /* * Track if there's at least one VM_PFNMAP/VM_MIXEDMAP vma * in the tracked range, see tlb_free_vmas(). 
*/ tlb->vma_pfn |= !!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)); } static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) { /* * Anything calling __tlb_adjust_range() also sets at least one of * these bits. */ if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds || tlb->cleared_puds || tlb->cleared_p4ds || tlb->unshared_tables)) return; tlb_flush(tlb); __tlb_reset_range(tlb); } static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { if (__tlb_remove_page_size(tlb, page, page_size)) tlb_flush_mmu(tlb); } static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) { return tlb_remove_page_size(tlb, page, PAGE_SIZE); } static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt) { tlb_remove_table(tlb, pt); } static inline void tlb_change_page_size(struct mmu_gather *tlb, unsigned int page_size) { #ifdef CONFIG_MMU_GATHER_PAGE_SIZE if (tlb->page_size && tlb->page_size != page_size) { if (!tlb->fullmm && !tlb->need_flush_all) tlb_flush_mmu(tlb); } tlb->page_size = page_size; #endif } static inline unsigned long tlb_get_unmap_shift(struct mmu_gather *tlb) { if (tlb->cleared_ptes) return PAGE_SHIFT; if (tlb->cleared_pmds) return PMD_SHIFT; if (tlb->cleared_puds) return PUD_SHIFT; if (tlb->cleared_p4ds) return P4D_SHIFT; return PAGE_SHIFT; } static inline unsigned long tlb_get_unmap_size(struct mmu_gather *tlb) { return 1UL << tlb_get_unmap_shift(tlb); } /* * In the case of tlb vma handling, we can optimise these away in the * case where we're doing a full MM flush. When we're doing a munmap, * the vmas are adjusted to only cover the region to be torn down. */ static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { if (tlb->fullmm) return; tlb_update_vma_flags(tlb, vma); #ifndef CONFIG_MMU_GATHER_NO_FLUSH_CACHE flush_cache_range(vma, vma->vm_start, vma->vm_end); #endif } static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS)) return; /* * Do a TLB flush and reset the range at VMA boundaries; this avoids * the ranges growing with the unused space between consecutive VMAs, * but also the mmu_gather::vma_* flags from tlb_start_vma() rely on * this. */ tlb_flush_mmu_tlbonly(tlb); } static inline void tlb_free_vmas(struct mmu_gather *tlb) { if (tlb->fullmm) return; /* * VM_PFNMAP is more fragile because the core mm will not track the * page mapcount -- there might not be page-frames for these PFNs * after all. * * Specifically() there is a race between munmap() and * unmap_mapping_range(), where munmap() will unlink the VMA, such * that unmap_mapping_range() will no longer observe the VMA and * no-op, without observing the TLBI, returning prematurely. * * So if we're about to unlink such a VMA, and we have pending * TLBI for such a vma, flush things now. */ if (tlb->vma_pfn) tlb_flush_mmu_tlbonly(tlb); } /* * tlb_flush_{pte|pmd|pud|p4d}_range() adjust the tlb->start and tlb->end, * and set corresponding cleared_*. 
*/ static inline void tlb_flush_pte_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_ptes = 1; } static inline void tlb_flush_pmd_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_pmds = 1; } static inline void tlb_flush_pud_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_puds = 1; } static inline void tlb_flush_p4d_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_p4ds = 1; } #ifndef __tlb_remove_tlb_entry static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long address) { } #endif /** * tlb_remove_tlb_entry - remember a pte unmapping for later tlb invalidation. * * Record the fact that pte's were really unmapped by updating the range, * so we can later optimise away the tlb invalidate. This helps when * userspace is unmapping already-unmapped pages, which happens quite a lot. */ #define tlb_remove_tlb_entry(tlb, ptep, address) \ do { \ tlb_flush_pte_range(tlb, address, PAGE_SIZE); \ __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) /** * tlb_remove_tlb_entries - remember unmapping of multiple consecutive ptes for * later tlb invalidation. * * Similar to tlb_remove_tlb_entry(), but remember unmapping of multiple * consecutive ptes instead of only a single one. */ static inline void tlb_remove_tlb_entries(struct mmu_gather *tlb, pte_t *ptep, unsigned int nr, unsigned long address) { tlb_flush_pte_range(tlb, address, PAGE_SIZE * nr); for (;;) { __tlb_remove_tlb_entry(tlb, ptep, address); if (--nr == 0) break; ptep++; address += PAGE_SIZE; } } #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ do { \ unsigned long _sz = huge_page_size(h); \ if (_sz >= P4D_SIZE) \ tlb_flush_p4d_range(tlb, address, _sz); \ else if (_sz >= PUD_SIZE) \ tlb_flush_pud_range(tlb, address, _sz); \ else if (_sz >= PMD_SIZE) \ tlb_flush_pmd_range(tlb, address, _sz); \ else \ tlb_flush_pte_range(tlb, address, _sz); \ __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) /** * tlb_remove_pmd_tlb_entry - remember a pmd mapping for later tlb invalidation * This is a nop so far, because only x86 needs it. */ #ifndef __tlb_remove_pmd_tlb_entry #define __tlb_remove_pmd_tlb_entry(tlb, pmdp, address) do {} while (0) #endif #define tlb_remove_pmd_tlb_entry(tlb, pmdp, address) \ do { \ tlb_flush_pmd_range(tlb, address, HPAGE_PMD_SIZE); \ __tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \ } while (0) /** * tlb_remove_pud_tlb_entry - remember a pud mapping for later tlb * invalidation. This is a nop so far, because only x86 needs it. */ #ifndef __tlb_remove_pud_tlb_entry #define __tlb_remove_pud_tlb_entry(tlb, pudp, address) do {} while (0) #endif #define tlb_remove_pud_tlb_entry(tlb, pudp, address) \ do { \ tlb_flush_pud_range(tlb, address, HPAGE_PUD_SIZE); \ __tlb_remove_pud_tlb_entry(tlb, pudp, address); \ } while (0) /* * For things like page tables caches (ie caching addresses "inside" the * page tables, like x86 does), for legacy reasons, flushing an * individual page had better flush the page table caches behind it. This * is definitely how x86 works, for example. 
And if you have an * architected non-legacy page table cache (which I'm not aware of * anybody actually doing), you're going to have some architecturally * explicit flushing for that, likely *separate* from a regular TLB entry * flush, and thus you'd need more than just some range expansion.. * * So if we ever find an architecture * that would want something that odd, I think it is up to that * architecture to do its own odd thing, not cause pain for others * http://lkml.kernel.org/r/CA+55aFzBggoXtNXQeng5d_mRoDnaMBE5Y+URs+PHR67nUpMtaw@mail.gmail.com * * For now w.r.t page table cache, mark the range_size as PAGE_SIZE */ #ifndef pte_free_tlb #define pte_free_tlb(tlb, ptep, address) \ do { \ tlb_flush_pmd_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __pte_free_tlb(tlb, ptep, address); \ } while (0) #endif #ifndef pmd_free_tlb #define pmd_free_tlb(tlb, pmdp, address) \ do { \ tlb_flush_pud_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __pmd_free_tlb(tlb, pmdp, address); \ } while (0) #endif #ifndef pud_free_tlb #define pud_free_tlb(tlb, pudp, address) \ do { \ tlb_flush_p4d_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __pud_free_tlb(tlb, pudp, address); \ } while (0) #endif #ifndef p4d_free_tlb #define p4d_free_tlb(tlb, pudp, address) \ do { \ __tlb_adjust_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __p4d_free_tlb(tlb, pudp, address); \ } while (0) #endif #ifndef pte_needs_flush static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte) { return true; } #endif #ifndef huge_pmd_needs_flush static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd) { return true; } #endif #ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING static inline void tlb_unshare_pmd_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt, unsigned long addr) { /* * The caller must make sure that concurrent unsharing + exclusive * reuse is impossible until tlb_flush_unshared_tables() was called. */ VM_WARN_ON_ONCE(!ptdesc_pmd_is_shared(pt)); ptdesc_pmd_pts_dec(pt); /* Clearing a PUD pointing at a PMD table with PMD leaves. */ tlb_flush_pmd_range(tlb, addr & PUD_MASK, PUD_SIZE); /* * If the page table is now exclusively owned, we fully unshared * a page table. */ if (!ptdesc_pmd_is_shared(pt)) tlb->fully_unshared_tables = true; tlb->unshared_tables = true; } static inline void tlb_flush_unshared_tables(struct mmu_gather *tlb) { /* * As soon as the caller drops locks to allow for reuse of * previously-shared tables, these tables could get modified and * even reused outside of hugetlb context, so we have to make sure that * any page table walkers (incl. TLB, GUP-fast) are aware of that * change. * * Even if we are not fully unsharing a PMD table, we must * flush the TLB for the unsharer now. */ if (tlb->unshared_tables) tlb_flush_mmu_tlbonly(tlb); /* * Similarly, we must make sure that concurrent GUP-fast will not * walk previously-shared page tables that are getting modified+reused * elsewhere. So broadcast an IPI to wait for any concurrent GUP-fast. * * We only perform this when we are the last sharer of a page table, * as the IPI will reach all CPUs: any GUP-fast. * * Note that on configs where tlb_remove_table_sync_one() is a NOP, * the expectation is that the tlb_flush_mmu_tlbonly() would have issued * required IPIs already for us. */ if (tlb->fully_unshared_tables) { tlb_remove_table_sync_one(); tlb->fully_unshared_tables = false; } } #endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ #endif /* CONFIG_MMU */ #endif /* _ASM_GENERIC__TLB_H */ |
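/*
 * Schematic sketch of the mmu_gather call order that the comment block at
 * the top of this header prescribes: gather -> tlb_start_vma() -> queue
 * pages and remember TLB entries -> tlb_end_vma() -> tlb_finish_mmu().
 * The tlb_gather_mmu()/tlb_finish_mmu() prototypes shown are the current
 * in-tree ones but have changed between kernel versions, and the actual
 * page-table walk is elided; this is not a drop-in implementation.
 */
#include <linux/mm.h>
#include <asm/tlb.h>

static void example_zap_range(struct mm_struct *mm, struct vm_area_struct *vma,
			      unsigned long start, unsigned long end)
{
	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, mm);	/* set up the gather for @mm          */
	tlb_start_vma(&tlb, vma);	/* flush caches, record VMA flags     */

	/*
	 * For each pte actually cleared in [start, end) the walk would do:
	 *
	 *	tlb_remove_tlb_entry(&tlb, pte, addr);	// invalidate later
	 *	tlb_remove_page(&tlb, page);		// free page later
	 *
	 * tlb_remove_page() calls tlb_flush_mmu() itself when the batch
	 * fills up, preserving the unhook -> invalidate -> free ordering.
	 */

	tlb_end_vma(&tlb, vma);		/* flush and reset the range          */
	tlb_finish_mmu(&tlb);		/* final flush, free remaining pages  */
}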
| 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Because linux/module.h has tracepoints in the header, and ftrace.h * used to include this file, define_trace.h includes linux/module.h * But we do not want the module.h to override the TRACE_SYSTEM macro * variable that define_trace.h is processing, so we only set it * when module events are being processed, which would happen when * CREATE_TRACE_POINTS is defined. */ #ifdef CREATE_TRACE_POINTS #undef TRACE_SYSTEM #define TRACE_SYSTEM module #endif #if !defined(_TRACE_MODULE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_MODULE_H #include <linux/tracepoint.h> #ifdef CONFIG_MODULES struct module; #define show_module_flags(flags) __print_flags(flags, "", \ { (1UL << TAINT_PROPRIETARY_MODULE), "P" }, \ { (1UL << TAINT_OOT_MODULE), "O" }, \ { (1UL << TAINT_FORCED_MODULE), "F" }, \ { (1UL << TAINT_CRAP), "C" }, \ { (1UL << TAINT_UNSIGNED_MODULE), "E" }) TRACE_EVENT(module_load, TP_PROTO(struct module *mod), TP_ARGS(mod), TP_STRUCT__entry( __field( unsigned int, taints ) __string( name, mod->name ) ), TP_fast_assign( __entry->taints = mod->taints; __assign_str(name); ), TP_printk("%s %s", __get_str(name), show_module_flags(__entry->taints)) ); TRACE_EVENT(module_free, TP_PROTO(struct module *mod), TP_ARGS(mod), TP_STRUCT__entry( __string( name, mod->name ) ), TP_fast_assign( __assign_str(name); ), TP_printk("%s", __get_str(name)) ); #ifdef CONFIG_MODULE_UNLOAD /* trace_module_get/put are only used if CONFIG_MODULE_UNLOAD is defined */ DECLARE_EVENT_CLASS(module_refcnt, TP_PROTO(struct module *mod, unsigned long ip), TP_ARGS(mod, ip), TP_STRUCT__entry( __field( unsigned long, ip ) __field( int, refcnt ) __string( name, mod->name ) ), TP_fast_assign( __entry->ip = ip; __entry->refcnt = atomic_read(&mod->refcnt); __assign_str(name); ), TP_printk("%s call_site=%ps refcnt=%d", __get_str(name), (void *)__entry->ip, __entry->refcnt) ); DEFINE_EVENT(module_refcnt, module_get, TP_PROTO(struct module *mod, unsigned long ip), TP_ARGS(mod, ip) ); DEFINE_EVENT(module_refcnt, module_put, TP_PROTO(struct module *mod, unsigned long ip), TP_ARGS(mod, ip) ); #endif /* CONFIG_MODULE_UNLOAD */ TRACE_EVENT(module_request, TP_PROTO(char *name, bool wait, unsigned long ip), TP_ARGS(name, wait, ip), TP_STRUCT__entry( __field( unsigned long, ip ) __field( bool, wait ) __string( name, name ) ), TP_fast_assign( __entry->ip = ip; __entry->wait = wait; __assign_str(name); ), TP_printk("%s wait=%d call_site=%ps", __get_str(name), (int)__entry->wait, (void *)__entry->ip) ); #endif /* CONFIG_MODULES */ #endif /* _TRACE_MODULE_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
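/*
 * Illustrative sketch, not part of the header above: how a .c file emits
 * these events.  Exactly one translation unit defines CREATE_TRACE_POINTS
 * before including the header so that define_trace.h generates the
 * tracepoint bodies; every other user includes the header without it.  The
 * real call sites are in kernel/module/, and example_emit_module_events()
 * is a made-up helper.
 */
#include <linux/module.h>
#include <linux/kernel.h>

#define CREATE_TRACE_POINTS
#include <trace/events/module.h>

static void example_emit_module_events(struct module *mod)
{
	trace_module_load(mod);			/* after a successful load */
	trace_module_get(mod, _RET_IP_);	/* reference taken (CONFIG_MODULE_UNLOAD only) */
	trace_module_put(mod, _RET_IP_);	/* reference dropped (CONFIG_MODULE_UNLOAD only) */
	trace_module_free(mod);			/* just before the module is freed */
}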
916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * acpi_bus.h - ACPI Bus Driver ($Revision: 22 $) * * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com> * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> */ #ifndef __ACPI_BUS_H__ #define __ACPI_BUS_H__ #include <linux/completion.h> #include <linux/container_of.h> #include <linux/device.h> #include <linux/kobject.h> #include <linux/mutex.h> #include <linux/property.h> #include <linux/types.h> struct acpi_handle_list { u32 count; acpi_handle *handles; }; /* acpi_utils.h */ acpi_status acpi_extract_package(union acpi_object *package, struct acpi_buffer *format, struct acpi_buffer *buffer); acpi_status acpi_evaluate_integer(acpi_handle handle, acpi_string pathname, struct acpi_object_list *arguments, unsigned long long *data); bool acpi_evaluate_reference(acpi_handle handle, acpi_string pathname, struct acpi_object_list *arguments, struct acpi_handle_list *list); bool acpi_handle_list_equal(struct acpi_handle_list *list1, struct acpi_handle_list *list2); void acpi_handle_list_replace(struct acpi_handle_list *dst, struct acpi_handle_list *src); void acpi_handle_list_free(struct acpi_handle_list *list); bool acpi_device_dep(acpi_handle target, acpi_handle match); acpi_status acpi_evaluate_ost(acpi_handle handle, u32 source_event, u32 status_code, struct acpi_buffer *status_buf); bool acpi_has_method(acpi_handle handle, char *name); acpi_status acpi_execute_simple_method(acpi_handle handle, char *method, u64 arg); acpi_status acpi_evaluate_ej0(acpi_handle handle); acpi_status acpi_evaluate_lck(acpi_handle handle, int lock); acpi_status acpi_evaluate_reg(acpi_handle handle, u8 space_id, u32 function); bool acpi_ata_match(acpi_handle handle); bool acpi_bay_match(acpi_handle handle); bool acpi_dock_match(acpi_handle handle); bool acpi_check_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 funcs); union acpi_object *acpi_evaluate_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 func, union acpi_object *argv4); #ifdef CONFIG_ACPI bool acpi_get_physical_device_location(acpi_handle handle, struct acpi_pld_info **pld); static inline union acpi_object * acpi_evaluate_dsm_typed(acpi_handle handle, const guid_t *guid, u64 rev, u64 func, union acpi_object *argv4, acpi_object_type type) { union acpi_object *obj; obj = acpi_evaluate_dsm(handle, guid, rev, func, argv4); if (obj && obj->type != type) { ACPI_FREE(obj); obj = NULL; } return obj; } #endif #define ACPI_INIT_DSM_ARGV4(cnt, eles) \ { \ .package.type = ACPI_TYPE_PACKAGE, \ .package.count = (cnt), \ .package.elements = (eles) \ } bool acpi_dev_found(const char *hid); bool acpi_dev_present(const char *hid, const char *uid, s64 hrv); bool acpi_reduced_hardware(void); #ifdef CONFIG_ACPI struct proc_dir_entry; #define ACPI_BUS_FILE_ROOT "acpi" extern struct proc_dir_entry *acpi_root_dir; enum acpi_bus_device_type { ACPI_BUS_TYPE_DEVICE = 0, ACPI_BUS_TYPE_POWER, ACPI_BUS_TYPE_PROCESSOR, ACPI_BUS_TYPE_THERMAL, ACPI_BUS_TYPE_POWER_BUTTON, ACPI_BUS_TYPE_SLEEP_BUTTON, ACPI_BUS_TYPE_ECDT_EC, ACPI_BUS_DEVICE_TYPE_COUNT }; struct 
acpi_driver; struct acpi_device; /* * ACPI Scan Handler * ----------------- */ struct acpi_hotplug_profile { struct kobject kobj; int (*scan_dependent)(struct acpi_device *adev); void (*notify_online)(struct acpi_device *adev); bool enabled:1; bool demand_offline:1; }; static inline struct acpi_hotplug_profile *to_acpi_hotplug_profile( struct kobject *kobj) { return container_of(kobj, struct acpi_hotplug_profile, kobj); } struct acpi_scan_handler { struct list_head list_node; const struct acpi_device_id *ids; bool (*match)(const char *idstr, const struct acpi_device_id **matchid); int (*attach)(struct acpi_device *dev, const struct acpi_device_id *id); void (*detach)(struct acpi_device *dev); void (*post_eject)(struct acpi_device *dev); void (*bind)(struct device *phys_dev); void (*unbind)(struct device *phys_dev); struct acpi_hotplug_profile hotplug; }; /* * ACPI Hotplug Context * -------------------- */ typedef int (*acpi_hp_notify) (struct acpi_device *, u32); typedef void (*acpi_hp_uevent) (struct acpi_device *, u32); typedef void (*acpi_hp_fixup) (struct acpi_device *); struct acpi_hotplug_context { struct acpi_device *self; acpi_hp_notify notify; acpi_hp_uevent uevent; acpi_hp_fixup fixup; }; /* * ACPI Driver * ----------- */ typedef int (*acpi_op_add) (struct acpi_device * device); typedef void (*acpi_op_remove) (struct acpi_device *device); typedef void (*acpi_op_notify) (struct acpi_device * device, u32 event); struct acpi_device_ops { acpi_op_add add; acpi_op_remove remove; acpi_op_notify notify; }; #define ACPI_DRIVER_ALL_NOTIFY_EVENTS 0x1 /* system AND device events */ struct acpi_driver { char name[80]; char class[80]; const struct acpi_device_id *ids; /* Supported Hardware IDs */ unsigned int flags; struct acpi_device_ops ops; struct device_driver drv; }; /* * ACPI Device * ----------- */ /* Status (_STA) */ struct acpi_device_status { u32 present:1; u32 enabled:1; u32 show_in_ui:1; u32 functional:1; u32 battery_present:1; u32 reserved:27; }; /* Flags */ struct acpi_device_flags { u32 dynamic_status:1; u32 removable:1; u32 ejectable:1; u32 power_manageable:1; u32 match_driver:1; u32 initialized:1; u32 visited:1; u32 hotplug_notify:1; u32 is_dock_station:1; u32 of_compatible_ok:1; u32 coherent_dma:1; u32 cca_seen:1; u32 enumeration_by_parent:1; u32 honor_deps:1; u32 reserved:18; }; /* File System */ struct acpi_device_dir { struct proc_dir_entry *entry; }; #define acpi_device_dir(d) ((d)->dir.entry) /* Plug and Play */ #define MAX_ACPI_DEVICE_NAME_LEN 40 #define MAX_ACPI_CLASS_NAME_LEN 20 typedef char acpi_bus_id[8]; typedef u64 acpi_bus_address; typedef char acpi_device_name[MAX_ACPI_DEVICE_NAME_LEN]; typedef char acpi_device_class[MAX_ACPI_CLASS_NAME_LEN]; struct acpi_hardware_id { struct list_head list; const char *id; }; struct acpi_pnp_type { u32 hardware_id:1; u32 bus_address:1; u32 platform_id:1; u32 backlight:1; u32 reserved:28; }; struct acpi_device_pnp { acpi_bus_id bus_id; /* Object name */ int instance_no; /* Instance number of this object */ struct acpi_pnp_type type; /* ID type */ acpi_bus_address bus_address; /* _ADR */ char *unique_id; /* _UID */ struct list_head ids; /* _HID and _CIDs */ acpi_device_name device_name; /* Driver-determined */ acpi_device_class device_class; /* " */ }; #define acpi_device_bid(d) ((d)->pnp.bus_id) #define acpi_device_adr(d) ((d)->pnp.bus_address) const char *acpi_device_hid(struct acpi_device *device); #define acpi_device_uid(d) ((d)->pnp.unique_id) #define acpi_device_name(d) ((d)->pnp.device_name) #define acpi_device_class(d) 
((d)->pnp.device_class) /* Power Management */ struct acpi_device_power_flags { u32 explicit_get:1; /* _PSC present? */ u32 power_resources:1; /* Power resources */ u32 inrush_current:1; /* Serialize Dx->D0 */ u32 power_removed:1; /* Optimize Dx->D0 */ u32 ignore_parent:1; /* Power is independent of parent power state */ u32 dsw_present:1; /* _DSW present? */ u32 reserved:26; }; struct acpi_device_power_state { struct list_head resources; /* Power resources referenced */ struct { u8 valid:1; u8 explicit_set:1; /* _PSx present? */ u8 reserved:6; } flags; int power; /* % Power (compared to D0) */ int latency; /* Dx->D0 time (microseconds) */ }; struct acpi_device_power { int state; /* Current state */ struct acpi_device_power_flags flags; struct acpi_device_power_state states[ACPI_D_STATE_COUNT]; /* Power states (D0-D3Cold) */ u8 state_for_enumeration; /* Deepest power state for enumeration */ }; struct acpi_dep_data { struct list_head node; acpi_handle supplier; acpi_handle consumer; bool honor_dep; bool met; bool free_when_met; }; /* Performance Management */ struct acpi_device_perf_flags { u8 reserved:8; }; struct acpi_device_perf_state { struct { u8 valid:1; u8 reserved:7; } flags; u8 power; /* % Power (compared to P0) */ u8 performance; /* % Performance ( " ) */ int latency; /* Px->P0 time (microseconds) */ }; struct acpi_device_perf { int state; struct acpi_device_perf_flags flags; int state_count; struct acpi_device_perf_state *states; }; /* Wakeup Management */ struct acpi_device_wakeup_flags { u8 valid:1; /* Can successfully enable wakeup? */ u8 notifier_present:1; /* Wake-up notify handler has been installed */ }; struct acpi_device_wakeup_context { void (*func)(struct acpi_device_wakeup_context *context); struct device *dev; }; struct acpi_device_wakeup { acpi_handle gpe_device; u64 gpe_number; u64 sleep_state; struct list_head resources; struct acpi_device_wakeup_flags flags; struct acpi_device_wakeup_context context; struct wakeup_source *ws; int prepare_count; int enable_count; }; struct acpi_device_physical_node { struct list_head node; struct device *dev; unsigned int node_id; bool put_online:1; }; struct acpi_device_properties { struct list_head list; const guid_t *guid; union acpi_object *properties; void **bufs; }; /* ACPI Device Specific Data (_DSD) */ struct acpi_device_data { const union acpi_object *pointer; struct list_head properties; const union acpi_object *of_compatible; struct list_head subnodes; }; struct acpi_gpio_mapping; #define ACPI_DEVICE_SWNODE_ROOT 0 /* * The maximum expected number of CSI-2 data lanes. * * This number is not expected to ever have to be equal to or greater than the * number of bits in an unsigned long variable, but if it needs to be increased * above that limit, code will need to be adjusted accordingly. 
*/ #define ACPI_DEVICE_CSI2_DATA_LANES 8 #define ACPI_DEVICE_SWNODE_PORT_NAME_LENGTH 8 enum acpi_device_swnode_dev_props { ACPI_DEVICE_SWNODE_DEV_ROTATION, ACPI_DEVICE_SWNODE_DEV_CLOCK_FREQUENCY, ACPI_DEVICE_SWNODE_DEV_LED_MAX_MICROAMP, ACPI_DEVICE_SWNODE_DEV_FLASH_MAX_MICROAMP, ACPI_DEVICE_SWNODE_DEV_FLASH_MAX_TIMEOUT_US, ACPI_DEVICE_SWNODE_DEV_NUM_OF, ACPI_DEVICE_SWNODE_DEV_NUM_ENTRIES }; enum acpi_device_swnode_port_props { ACPI_DEVICE_SWNODE_PORT_REG, ACPI_DEVICE_SWNODE_PORT_NUM_OF, ACPI_DEVICE_SWNODE_PORT_NUM_ENTRIES }; enum acpi_device_swnode_ep_props { ACPI_DEVICE_SWNODE_EP_REMOTE_EP, ACPI_DEVICE_SWNODE_EP_BUS_TYPE, ACPI_DEVICE_SWNODE_EP_REG, ACPI_DEVICE_SWNODE_EP_CLOCK_LANES, ACPI_DEVICE_SWNODE_EP_DATA_LANES, ACPI_DEVICE_SWNODE_EP_LANE_POLARITIES, /* TX only */ ACPI_DEVICE_SWNODE_EP_LINK_FREQUENCIES, ACPI_DEVICE_SWNODE_EP_NUM_OF, ACPI_DEVICE_SWNODE_EP_NUM_ENTRIES }; /* * Each device has a root software node plus two times as many nodes as the * number of CSI-2 ports. */ #define ACPI_DEVICE_SWNODE_PORT(port) (2 * (port) + 1) #define ACPI_DEVICE_SWNODE_EP(endpoint) \ (ACPI_DEVICE_SWNODE_PORT(endpoint) + 1) /** * struct acpi_device_software_node_port - MIPI DisCo for Imaging CSI-2 port * @port_name: Port name. * @data_lanes: "data-lanes" property values. * @lane_polarities: "lane-polarities" property values. * @link_frequencies: "link-frequencies" property values. * @port_nr: Port number. * @crs_csi2_local: _CRS CSI2 record present (i.e. this is a transmitter one). * @port_props: Port properties. * @ep_props: Endpoint properties. * @remote_ep: Reference to the remote endpoint. */ struct acpi_device_software_node_port { char port_name[ACPI_DEVICE_SWNODE_PORT_NAME_LENGTH + 1]; u32 data_lanes[ACPI_DEVICE_CSI2_DATA_LANES]; u32 lane_polarities[ACPI_DEVICE_CSI2_DATA_LANES + 1 /* clock lane */]; u64 link_frequencies[ACPI_DEVICE_CSI2_DATA_LANES]; unsigned int port_nr; bool crs_csi2_local; struct property_entry port_props[ACPI_DEVICE_SWNODE_PORT_NUM_ENTRIES]; struct property_entry ep_props[ACPI_DEVICE_SWNODE_EP_NUM_ENTRIES]; struct software_node_ref_args remote_ep[1]; }; /** * struct acpi_device_software_nodes - Software nodes for an ACPI device * @dev_props: Device properties. * @nodes: Software nodes for root as well as ports and endpoints. * @nodeptrs: Array of software node pointers, for (un)registering them. * @ports: Information related to each port and endpoint within a port. * @num_ports: The number of ports.
*/ struct acpi_device_software_nodes { struct property_entry dev_props[ACPI_DEVICE_SWNODE_DEV_NUM_ENTRIES]; struct software_node *nodes; const struct software_node **nodeptrs; struct acpi_device_software_node_port *ports; unsigned int num_ports; }; /* Device */ struct acpi_device { u32 pld_crc; int device_type; acpi_handle handle; /* no handle for fixed hardware */ struct fwnode_handle fwnode; struct list_head wakeup_list; struct list_head del_list; struct acpi_device_status status; struct acpi_device_flags flags; struct acpi_device_pnp pnp; struct acpi_device_power power; struct acpi_device_wakeup wakeup; struct acpi_device_perf performance; struct acpi_device_dir dir; struct acpi_device_data data; struct acpi_scan_handler *handler; struct acpi_hotplug_context *hp; struct acpi_device_software_nodes *swnodes; const struct acpi_gpio_mapping *driver_gpios; void *driver_data; struct device dev; unsigned int physical_node_count; unsigned int dep_unmet; struct list_head physical_node_list; struct mutex physical_node_lock; void (*remove)(struct acpi_device *); }; /* Non-device subnode */ struct acpi_data_node { struct list_head sibling; const char *name; acpi_handle handle; struct fwnode_handle fwnode; struct fwnode_handle *parent; struct acpi_device_data data; struct kobject kobj; struct completion kobj_done; }; extern const struct fwnode_operations acpi_device_fwnode_ops; extern const struct fwnode_operations acpi_data_fwnode_ops; extern const struct fwnode_operations acpi_static_fwnode_ops; bool is_acpi_device_node(const struct fwnode_handle *fwnode); bool is_acpi_data_node(const struct fwnode_handle *fwnode); static inline bool is_acpi_node(const struct fwnode_handle *fwnode) { return (is_acpi_device_node(fwnode) || is_acpi_data_node(fwnode)); } #define to_acpi_device_node(__fwnode) \ ({ \ typeof(__fwnode) __to_acpi_device_node_fwnode = __fwnode; \ \ is_acpi_device_node(__to_acpi_device_node_fwnode) ? \ container_of(__to_acpi_device_node_fwnode, \ struct acpi_device, fwnode) : \ NULL; \ }) #define to_acpi_data_node(__fwnode) \ ({ \ typeof(__fwnode) __to_acpi_data_node_fwnode = __fwnode; \ \ is_acpi_data_node(__to_acpi_data_node_fwnode) ? \ container_of(__to_acpi_data_node_fwnode, \ struct acpi_data_node, fwnode) : \ NULL; \ }) static inline bool is_acpi_static_node(const struct fwnode_handle *fwnode) { return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &acpi_static_fwnode_ops; } static inline bool acpi_data_node_match(const struct fwnode_handle *fwnode, const char *name) { return is_acpi_data_node(fwnode) ? 
(!strcmp(to_acpi_data_node(fwnode)->name, name)) : false; } static inline struct fwnode_handle *acpi_fwnode_handle(struct acpi_device *adev) { return &adev->fwnode; } static inline void *acpi_driver_data(struct acpi_device *d) { return d->driver_data; } #define to_acpi_device(d) container_of(d, struct acpi_device, dev) #define to_acpi_driver(d) container_of_const(d, struct acpi_driver, drv) static inline struct acpi_device *acpi_dev_parent(struct acpi_device *adev) { if (adev->dev.parent) return to_acpi_device(adev->dev.parent); return NULL; } static inline void acpi_set_device_status(struct acpi_device *adev, u32 sta) { *((u32 *)&adev->status) = sta; } static inline void acpi_set_hp_context(struct acpi_device *adev, struct acpi_hotplug_context *hp) { hp->self = adev; adev->hp = hp; } void acpi_initialize_hp_context(struct acpi_device *adev, struct acpi_hotplug_context *hp, acpi_hp_notify notify, acpi_hp_uevent uevent); /* acpi_device.dev.bus == &acpi_bus_type */ extern const struct bus_type acpi_bus_type; int acpi_bus_for_each_dev(int (*fn)(struct device *, void *), void *data); int acpi_dev_for_each_child(struct acpi_device *adev, int (*fn)(struct acpi_device *, void *), void *data); int acpi_dev_for_each_child_reverse(struct acpi_device *adev, int (*fn)(struct acpi_device *, void *), void *data); /* * Events * ------ */ struct acpi_bus_event { struct list_head node; acpi_device_class device_class; acpi_bus_id bus_id; u32 type; u32 data; }; #define ACPI_AC_CLASS "ac_adapter" extern struct kobject *acpi_kobj; extern int acpi_bus_generate_netlink_event(const char*, const char*, u8, int); void acpi_bus_private_data_handler(acpi_handle, void *); int acpi_bus_get_private_data(acpi_handle, void **); int acpi_bus_attach_private_data(acpi_handle, void *); void acpi_bus_detach_private_data(acpi_handle); int acpi_dev_install_notify_handler(struct acpi_device *adev, u32 handler_type, acpi_notify_handler handler, void *context); void acpi_dev_remove_notify_handler(struct acpi_device *adev, u32 handler_type, acpi_notify_handler handler); extern int acpi_notifier_call_chain(const char *device_class, const char *bus_id, u32 type, u32 data); extern int register_acpi_notifier(struct notifier_block *); extern int unregister_acpi_notifier(struct notifier_block *); /* * External Functions */ acpi_status acpi_bus_get_status_handle(acpi_handle handle, unsigned long long *sta); int acpi_bus_get_status(struct acpi_device *device); int acpi_bus_set_power(acpi_handle handle, int state); const char *acpi_power_state_string(int state); int acpi_device_set_power(struct acpi_device *device, int state); int acpi_bus_init_power(struct acpi_device *device); int acpi_device_fix_up_power(struct acpi_device *device); void acpi_device_fix_up_power_extended(struct acpi_device *adev); void acpi_device_fix_up_power_children(struct acpi_device *adev); int acpi_bus_update_power(acpi_handle handle, int *state_p); int acpi_device_update_power(struct acpi_device *device, int *state_p); bool acpi_bus_power_manageable(acpi_handle handle); void acpi_dev_power_up_children_with_adr(struct acpi_device *adev); u8 acpi_dev_power_state_for_wake(struct acpi_device *adev); int acpi_device_power_add_dependent(struct acpi_device *adev, struct device *dev); void acpi_device_power_remove_dependent(struct acpi_device *adev, struct device *dev); #ifdef CONFIG_PM bool acpi_bus_can_wakeup(acpi_handle handle); #else static inline bool acpi_bus_can_wakeup(acpi_handle handle) { return false; } #endif void acpi_scan_lock_acquire(void); void 
acpi_scan_lock_release(void); void acpi_lock_hp_context(void); void acpi_unlock_hp_context(void); int acpi_scan_add_handler(struct acpi_scan_handler *handler); /* * use a macro to avoid include chaining to get THIS_MODULE */ #define acpi_bus_register_driver(drv) \ __acpi_bus_register_driver(drv, THIS_MODULE) int __acpi_bus_register_driver(struct acpi_driver *driver, struct module *owner); void acpi_bus_unregister_driver(struct acpi_driver *driver); int acpi_bus_scan(acpi_handle handle); void acpi_bus_trim(struct acpi_device *start); acpi_status acpi_bus_get_ejd(acpi_handle handle, acpi_handle * ejd); int acpi_match_device_ids(struct acpi_device *device, const struct acpi_device_id *ids); void acpi_set_modalias(struct acpi_device *adev, const char *default_id, char *modalias, size_t len); static inline bool acpi_device_enumerated(struct acpi_device *adev) { return adev && adev->flags.initialized && adev->flags.visited; } /** * module_acpi_driver(acpi_driver) - Helper macro for registering an ACPI driver * @__acpi_driver: acpi_driver struct * * Helper macro for ACPI drivers which do not do anything special in module * init/exit. This eliminates a lot of boilerplate. Each module may only * use this macro once, and calling it replaces module_init() and module_exit() */ #define module_acpi_driver(__acpi_driver) \ module_driver(__acpi_driver, acpi_bus_register_driver, \ acpi_bus_unregister_driver) /* * Bind physical devices with ACPI devices */ struct acpi_bus_type { struct list_head list; const char *name; bool (*match)(struct device *dev); struct acpi_device * (*find_companion)(struct device *); void (*setup)(struct device *); }; int register_acpi_bus_type(struct acpi_bus_type *); int unregister_acpi_bus_type(struct acpi_bus_type *); int acpi_bind_one(struct device *dev, struct acpi_device *adev); int acpi_unbind_one(struct device *dev); enum acpi_bridge_type { ACPI_BRIDGE_TYPE_PCIE = 1, ACPI_BRIDGE_TYPE_CXL, }; struct acpi_pci_root { struct acpi_device * device; struct pci_bus *bus; u16 segment; int bridge_type; struct resource secondary; /* downstream bus range */ u32 osc_support_set; /* _OSC state of support bits */ u32 osc_control_set; /* _OSC state of control bits */ u32 osc_ext_support_set; /* _OSC state of extended support bits */ u32 osc_ext_control_set; /* _OSC state of extended control bits */ phys_addr_t mcfg_addr; }; /* helper */ struct iommu_ops; bool acpi_dma_supported(const struct acpi_device *adev); enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev); int acpi_iommu_fwspec_init(struct device *dev, u32 id, struct fwnode_handle *fwnode); int acpi_dma_get_range(struct device *dev, const struct bus_dma_region **map); int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr, const u32 *input_id); static inline int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr) { return acpi_dma_configure_id(dev, attr, NULL); } struct acpi_device *acpi_find_child_device(struct acpi_device *parent, u64 address, bool check_children); struct acpi_device *acpi_find_child_by_adr(struct acpi_device *adev, acpi_bus_address adr); int acpi_is_root_bridge(acpi_handle); struct acpi_pci_root *acpi_pci_find_root(acpi_handle handle); int acpi_enable_wakeup_device_power(struct acpi_device *dev, int state); int acpi_disable_wakeup_device_power(struct acpi_device *dev); #ifdef CONFIG_X86 bool acpi_device_override_status(struct acpi_device *adev, unsigned long long *status); bool acpi_quirk_skip_acpi_ac_and_battery(void); int acpi_quirk_skip_serdev_enumeration(struct device 
*controller_parent, bool *skip); #else static inline bool acpi_device_override_status(struct acpi_device *adev, unsigned long long *status) { return false; } static inline bool acpi_quirk_skip_acpi_ac_and_battery(void) { return false; } static inline int acpi_quirk_skip_serdev_enumeration(struct device *controller_parent, bool *skip) { *skip = false; return 0; } #endif #if IS_ENABLED(CONFIG_X86_ANDROID_TABLETS) bool acpi_quirk_skip_i2c_client_enumeration(struct acpi_device *adev); bool acpi_quirk_skip_gpio_event_handlers(void); #else static inline bool acpi_quirk_skip_i2c_client_enumeration(struct acpi_device *adev) { return false; } static inline bool acpi_quirk_skip_gpio_event_handlers(void) { return false; } #endif #ifdef CONFIG_PM void acpi_pm_wakeup_event(struct device *dev); acpi_status acpi_add_pm_notifier(struct acpi_device *adev, struct device *dev, void (*func)(struct acpi_device_wakeup_context *context)); acpi_status acpi_remove_pm_notifier(struct acpi_device *adev); bool acpi_pm_device_can_wakeup(struct device *dev); int acpi_pm_device_sleep_state(struct device *, int *, int); int acpi_pm_set_device_wakeup(struct device *dev, bool enable); #else static inline void acpi_pm_wakeup_event(struct device *dev) { } static inline acpi_status acpi_add_pm_notifier(struct acpi_device *adev, struct device *dev, void (*func)(struct acpi_device_wakeup_context *context)) { return AE_SUPPORT; } static inline acpi_status acpi_remove_pm_notifier(struct acpi_device *adev) { return AE_SUPPORT; } static inline bool acpi_pm_device_can_wakeup(struct device *dev) { return false; } static inline int acpi_pm_device_sleep_state(struct device *d, int *p, int m) { if (p) *p = ACPI_STATE_D0; return (m >= ACPI_STATE_D0 && m <= ACPI_STATE_D3_COLD) ? m : ACPI_STATE_D0; } static inline int acpi_pm_set_device_wakeup(struct device *dev, bool enable) { return -ENODEV; } #endif #ifdef CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT bool acpi_sleep_state_supported(u8 sleep_state); #else static inline bool acpi_sleep_state_supported(u8 sleep_state) { return false; } #endif #ifdef CONFIG_ACPI_SLEEP u32 acpi_target_system_state(void); #else static inline u32 acpi_target_system_state(void) { return ACPI_STATE_S0; } #endif static inline bool acpi_device_power_manageable(struct acpi_device *adev) { return adev->flags.power_manageable; } static inline bool acpi_device_can_wakeup(struct acpi_device *adev) { return adev->wakeup.flags.valid; } static inline bool acpi_device_can_poweroff(struct acpi_device *adev) { return adev->power.states[ACPI_STATE_D3_COLD].flags.valid || ((acpi_gbl_FADT.header.revision < 6) && adev->power.states[ACPI_STATE_D3_HOT].flags.explicit_set); } int acpi_dev_uid_to_integer(struct acpi_device *adev, u64 *integer); static inline bool acpi_dev_hid_match(struct acpi_device *adev, const char *hid2) { const char *hid1 = acpi_device_hid(adev); return hid1 && hid2 && !strcmp(hid1, hid2); } static inline bool acpi_str_uid_match(struct acpi_device *adev, const char *uid2) { const char *uid1 = acpi_device_uid(adev); return uid1 && uid2 && !strcmp(uid1, uid2); } static inline bool acpi_int_uid_match(struct acpi_device *adev, u64 uid2) { u64 uid1; return !acpi_dev_uid_to_integer(adev, &uid1) && uid1 == uid2; } #define TYPE_ENTRY(type, x) \ const type: x, \ type: x #define ACPI_STR_TYPES(match) \ TYPE_ENTRY(unsigned char *, match), \ TYPE_ENTRY(signed char *, match), \ TYPE_ENTRY(char *, match), \ TYPE_ENTRY(void *, match) /** * acpi_dev_uid_match - Match device by supplied UID * @adev: ACPI device to match. 
* @uid2: Unique ID of the device. * * Matches UID in @adev with given @uid2. * * Returns: %true if matches, %false otherwise. */ #define acpi_dev_uid_match(adev, uid2) \ _Generic(uid2, \ /* Treat @uid2 as a string for acpi string types */ \ ACPI_STR_TYPES(acpi_str_uid_match), \ /* Treat as an integer otherwise */ \ default: acpi_int_uid_match)(adev, uid2) /** * acpi_dev_hid_uid_match - Match device by supplied HID and UID * @adev: ACPI device to match. * @hid2: Hardware ID of the device. * @uid2: Unique ID of the device, pass NULL to not check _UID. * * Matches HID and UID in @adev with given @hid2 and @uid2. Absence of @uid2 * will be treated as a match. If user wants to validate @uid2, it should be * done before calling this function. * * Returns: %true if matches or @uid2 is NULL, %false otherwise. */ #define acpi_dev_hid_uid_match(adev, hid2, uid2) \ (acpi_dev_hid_match(adev, hid2) && \ /* Distinguish integer 0 from NULL @uid2 */ \ (_Generic(uid2, ACPI_STR_TYPES(!(uid2)), default: 0) || \ acpi_dev_uid_match(adev, uid2))) void acpi_dev_clear_dependencies(struct acpi_device *supplier); bool acpi_dev_ready_for_enumeration(const struct acpi_device *device); struct acpi_device *acpi_dev_get_next_consumer_dev(struct acpi_device *supplier, struct acpi_device *start); /** * for_each_acpi_consumer_dev - iterate over the consumer ACPI devices for a * given supplier * @supplier: Pointer to the supplier's ACPI device * @consumer: Pointer to &struct acpi_device to hold the consumer, initially NULL */ #define for_each_acpi_consumer_dev(supplier, consumer) \ for (consumer = acpi_dev_get_next_consumer_dev(supplier, NULL); \ consumer; \ consumer = acpi_dev_get_next_consumer_dev(supplier, consumer)) struct acpi_device * acpi_dev_get_next_match_dev(struct acpi_device *adev, const char *hid, const char *uid, s64 hrv); struct acpi_device * acpi_dev_get_first_match_dev(const char *hid, const char *uid, s64 hrv); /** * for_each_acpi_dev_match - iterate over ACPI devices that matching the criteria * @adev: pointer to the matching ACPI device, NULL at the end of the loop * @hid: Hardware ID of the device. * @uid: Unique ID of the device, pass NULL to not check _UID * @hrv: Hardware Revision of the device, pass -1 to not check _HRV * * The caller is responsible for invoking acpi_dev_put() on the returned device. */ #define for_each_acpi_dev_match(adev, hid, uid, hrv) \ for (adev = acpi_dev_get_first_match_dev(hid, uid, hrv); \ adev; \ adev = acpi_dev_get_next_match_dev(adev, hid, uid, hrv)) static inline struct acpi_device *acpi_dev_get(struct acpi_device *adev) { return adev ? 
to_acpi_device(get_device(&adev->dev)) : NULL; } static inline void acpi_dev_put(struct acpi_device *adev) { if (adev) put_device(&adev->dev); } struct acpi_device *acpi_fetch_acpi_dev(acpi_handle handle); struct acpi_device *acpi_get_acpi_dev(acpi_handle handle); static inline void acpi_put_acpi_dev(struct acpi_device *adev) { acpi_dev_put(adev); } int acpi_wait_for_acpi_ipmi(void); int acpi_scan_add_dep(acpi_handle handle, struct acpi_handle_list *dep_devices); u32 arch_acpi_add_auto_dep(acpi_handle handle); #else /* CONFIG_ACPI */ static inline int register_acpi_bus_type(void *bus) { return 0; } static inline int unregister_acpi_bus_type(void *bus) { return 0; } static inline int acpi_wait_for_acpi_ipmi(void) { return 0; } static inline const char *acpi_device_hid(struct acpi_device *device) { return ""; } static inline bool acpi_get_physical_device_location(acpi_handle handle, struct acpi_pld_info **pld) { return false; } #define for_each_acpi_consumer_dev(supplier, consumer) \ for (consumer = NULL; false && (supplier);) #define for_each_acpi_dev_match(adev, hid, uid, hrv) \ for (adev = NULL; false && (hid) && (uid) && (hrv); ) #endif /* CONFIG_ACPI */ #endif /*__ACPI_BUS_H__*/ |
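/*
 * Illustrative sketch, not part of the header above: a minimal ACPI driver
 * built on struct acpi_driver and module_acpi_driver().  The "EXMP0001" ID
 * and all example_* names are made up.
 */
#include <linux/acpi.h>
#include <linux/module.h>

static int example_add(struct acpi_device *adev)
{
	dev_info(&adev->dev, "bound to %s\n", acpi_device_hid(adev));
	return 0;
}

static void example_remove(struct acpi_device *adev)
{
	/* Undo whatever example_add() set up. */
}

static const struct acpi_device_id example_ids[] = {
	{ "EXMP0001", 0 },
	{ }
};
MODULE_DEVICE_TABLE(acpi, example_ids);

static struct acpi_driver example_driver = {
	.name	= "example",
	.class	= "example",
	.ids	= example_ids,
	.ops	= {
		.add	= example_add,
		.remove	= example_remove,
	},
};
module_acpi_driver(example_driver);

MODULE_DESCRIPTION("Example ACPI driver skeleton");
MODULE_LICENSE("GPL");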
| 3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 | // SPDX-License-Identifier: GPL-2.0 /* * drivers/usb/core/endpoint.c * * (C) Copyright 2002,2004,2006 Greg Kroah-Hartman * (C) Copyright 2002,2004 IBM Corp. * (C) Copyright 2006 Novell Inc. * * Released under the GPLv2 only. * * Endpoint sysfs stuff */ #include <linux/kernel.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/sysfs.h> #include <linux/usb.h> #include "usb.h" struct ep_device { struct usb_endpoint_descriptor *desc; struct usb_device *udev; struct device dev; }; #define to_ep_device(_dev) \ container_of(_dev, struct ep_device, dev) #define usb_ep_attr(field, format_string) \ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, \ char *buf) \ { \ struct ep_device *ep = to_ep_device(dev); \ return sysfs_emit(buf, format_string, ep->desc->field); \ } \ static DEVICE_ATTR_RO(field) usb_ep_attr(bLength, "%02x\n"); usb_ep_attr(bEndpointAddress, "%02x\n"); usb_ep_attr(bmAttributes, "%02x\n"); usb_ep_attr(bInterval, "%02x\n"); static ssize_t wMaxPacketSize_show(struct device *dev, struct device_attribute *attr, char *buf) { struct ep_device *ep = to_ep_device(dev); return sysfs_emit(buf, "%04x\n", usb_endpoint_maxp(ep->desc)); } static DEVICE_ATTR_RO(wMaxPacketSize); static ssize_t type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct ep_device *ep = to_ep_device(dev); char *type = "unknown"; switch (usb_endpoint_type(ep->desc)) { case USB_ENDPOINT_XFER_CONTROL: type = "Control"; break; case USB_ENDPOINT_XFER_ISOC: type = "Isoc"; break; case USB_ENDPOINT_XFER_BULK: type = "Bulk"; break; case USB_ENDPOINT_XFER_INT: type = "Interrupt"; break; } return sysfs_emit(buf, "%s\n", type); } static DEVICE_ATTR_RO(type); static ssize_t interval_show(struct device *dev, struct device_attribute *attr, char *buf) { struct ep_device *ep = to_ep_device(dev); unsigned int interval; char unit; interval = usb_decode_interval(ep->desc, ep->udev->speed); if (interval % 1000) { unit = 'u'; } else { unit = 'm'; interval /= 1000; } return sysfs_emit(buf, "%d%cs\n", interval, unit); } static DEVICE_ATTR_RO(interval); static ssize_t direction_show(struct device *dev, struct device_attribute *attr, char *buf) { struct ep_device *ep = to_ep_device(dev); char *direction; if (usb_endpoint_xfer_control(ep->desc)) direction = "both"; else if (usb_endpoint_dir_in(ep->desc)) direction = "in"; else direction = "out"; return sysfs_emit(buf, "%s\n", direction); } static DEVICE_ATTR_RO(direction); static struct attribute *ep_dev_attrs[] = { &dev_attr_bLength.attr, &dev_attr_bEndpointAddress.attr, &dev_attr_bmAttributes.attr, &dev_attr_bInterval.attr, &dev_attr_wMaxPacketSize.attr, &dev_attr_interval.attr, &dev_attr_type.attr, &dev_attr_direction.attr, NULL, }; static const struct attribute_group ep_dev_attr_grp = { .attrs = ep_dev_attrs, }; static const struct attribute_group *ep_dev_groups[] = { &ep_dev_attr_grp, 
NULL }; static void ep_device_release(struct device *dev) { struct ep_device *ep_dev = to_ep_device(dev); kfree(ep_dev); } const struct device_type usb_ep_device_type = { .name = "usb_endpoint", .release = ep_device_release, }; int usb_create_ep_devs(struct device *parent, struct usb_host_endpoint *endpoint, struct usb_device *udev) { struct ep_device *ep_dev; int retval; ep_dev = kzalloc_obj(*ep_dev); if (!ep_dev) { retval = -ENOMEM; goto exit; } ep_dev->desc = &endpoint->desc; ep_dev->udev = udev; ep_dev->dev.groups = ep_dev_groups; ep_dev->dev.type = &usb_ep_device_type; ep_dev->dev.parent = parent; dev_set_name(&ep_dev->dev, "ep_%02x", endpoint->desc.bEndpointAddress); retval = device_register(&ep_dev->dev); if (retval) goto error_register; device_enable_async_suspend(&ep_dev->dev); endpoint->ep_dev = ep_dev; return retval; error_register: put_device(&ep_dev->dev); exit: return retval; } void usb_remove_ep_devs(struct usb_host_endpoint *endpoint) { struct ep_device *ep_dev = endpoint->ep_dev; if (ep_dev) { device_unregister(&ep_dev->dev); endpoint->ep_dev = NULL; } } |
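/*
 * Illustrative sketch, not part of the file above: how the USB core drives
 * usb_create_ep_devs()/usb_remove_ep_devs() for every endpoint of an
 * interface's active altsetting.  It mirrors, but is not, the real call
 * sites in drivers/usb/core/; the example_* names are made up.
 */
static void example_create_intf_ep_devs(struct usb_interface *intf)
{
	struct usb_device *udev = interface_to_usbdev(intf);
	struct usb_host_interface *alt = intf->cur_altsetting;
	int i;

	for (i = 0; i < alt->desc.bNumEndpoints; ++i)
		usb_create_ep_devs(&intf->dev, &alt->endpoint[i], udev);
}

static void example_remove_intf_ep_devs(struct usb_interface *intf)
{
	struct usb_host_interface *alt = intf->cur_altsetting;
	int i;

	for (i = 0; i < alt->desc.bNumEndpoints; ++i)
		usb_remove_ep_devs(&alt->endpoint[i]);
}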
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _DELAYED_CALL_H
#define _DELAYED_CALL_H

/*
 * Poor man's closures; I wish we could've done them sanely polymorphic,
 * but...
 */

struct delayed_call {
	void (*fn)(void *);
	void *arg;
};

#define DEFINE_DELAYED_CALL(name) struct delayed_call name = {NULL, NULL}

/* I really wish we had closures with sane typechecking... */
static inline void set_delayed_call(struct delayed_call *call,
		void (*fn)(void *), void *arg)
{
	call->fn = fn;
	call->arg = arg;
}

static inline void do_delayed_call(struct delayed_call *call)
{
	if (call->fn)
		call->fn(call->arg);
}

static inline void clear_delayed_call(struct delayed_call *call)
{
	call->fn = NULL;
}

#endif
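/*
 * Illustrative sketch, not part of the header above: the producer/consumer
 * pattern delayed_call is meant for, modelled on how ->get_link() defers
 * freeing a buffer until the caller is done with it.  The example_* names
 * are made up.
 */
#include <linux/delayed_call.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/printk.h>

static void example_free(void *arg)
{
	kfree(arg);
}

/* Producer: hands out a buffer and arranges for it to be freed later. */
static char *example_produce(struct delayed_call *done)
{
	char *buf = kstrdup("payload", GFP_KERNEL);

	if (buf)
		set_delayed_call(done, example_free, buf);
	return buf;
}

/* Consumer: uses the buffer, then runs the delayed call if one was set. */
static void example_consume(void)
{
	DEFINE_DELAYED_CALL(done);
	char *buf = example_produce(&done);

	if (buf)
		pr_info("%s\n", buf);
	do_delayed_call(&done);
}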
| 2 1 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 * Phillip Lougher <phillip@squashfs.org.uk> * * decompressor.c */ #include <linux/types.h> #include <linux/mutex.h> #include <linux/slab.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "decompressor.h" #include "squashfs.h" #include "page_actor.h" /* * This file (and decompressor.h) implements a decompressor framework for * Squashfs, allowing multiple decompressors to be easily supported */ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = { NULL, NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0 }; #ifndef CONFIG_SQUASHFS_LZ4 static const struct squashfs_decompressor squashfs_lz4_comp_ops = { NULL, NULL, NULL, NULL, LZ4_COMPRESSION, "lz4", 0 }; #endif #ifndef CONFIG_SQUASHFS_LZO static const struct squashfs_decompressor squashfs_lzo_comp_ops = { NULL, NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 }; #endif #ifndef CONFIG_SQUASHFS_XZ static const struct squashfs_decompressor squashfs_xz_comp_ops = { NULL, NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0 }; #endif #ifndef CONFIG_SQUASHFS_ZLIB static const struct squashfs_decompressor squashfs_zlib_comp_ops = { NULL, NULL, NULL, NULL, ZLIB_COMPRESSION, "zlib", 0 }; #endif #ifndef CONFIG_SQUASHFS_ZSTD static const struct squashfs_decompressor squashfs_zstd_comp_ops = { NULL, NULL, NULL, NULL, ZSTD_COMPRESSION, "zstd", 0 }; #endif static const struct squashfs_decompressor squashfs_unknown_comp_ops = { NULL, NULL, NULL, NULL, 0, "unknown", 0 }; static const struct squashfs_decompressor *decompressor[] = { &squashfs_zlib_comp_ops, &squashfs_lz4_comp_ops, &squashfs_lzo_comp_ops, &squashfs_xz_comp_ops, &squashfs_lzma_unsupported_comp_ops, &squashfs_zstd_comp_ops, &squashfs_unknown_comp_ops }; const struct squashfs_decompressor *squashfs_lookup_decompressor(int id) { int i; for (i = 0; decompressor[i]->id; i++) if (id == decompressor[i]->id) break; return decompressor[i]; } static void *get_comp_opts(struct super_block *sb, unsigned short flags) { struct squashfs_sb_info *msblk = sb->s_fs_info; void *buffer = NULL, *comp_opts; struct squashfs_page_actor *actor = NULL; int length = 0; /* * Read decompressor specific options from file system if present */ if (SQUASHFS_COMP_OPTS(flags)) { buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); if (buffer == NULL) { comp_opts = ERR_PTR(-ENOMEM); goto out; } actor = squashfs_page_actor_init(&buffer, 1, 0); if (actor == NULL) { comp_opts = ERR_PTR(-ENOMEM); goto out; } length = squashfs_read_data(sb, sizeof(struct squashfs_super_block), 0, NULL, actor); if (length < 0) { comp_opts = ERR_PTR(length); goto out; } } comp_opts = squashfs_comp_opts(msblk, buffer, length); out: kfree(actor); kfree(buffer); return comp_opts; } void *squashfs_decompressor_setup(struct super_block *sb, unsigned short flags) { struct squashfs_sb_info *msblk = sb->s_fs_info; void *stream, *comp_opts = get_comp_opts(sb, flags); if (IS_ERR(comp_opts)) return comp_opts; stream = 
msblk->thread_ops->create(msblk, comp_opts); if (IS_ERR(stream)) kfree(comp_opts); return stream; } |
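/*
 * Illustrative sketch, not part of the file above: a mount-time check built
 * on squashfs_lookup_decompressor().  It loosely mirrors the supported-
 * compressor check in fs/squashfs/super.c and assumes the "supported" flag
 * from decompressor.h (the stub entries above leave it zero);
 * example_check_compression() is a made-up name.
 */
static const struct squashfs_decompressor *
example_check_compression(unsigned short id)
{
	const struct squashfs_decompressor *decomp;

	/* Ids not present in the table fall through to the "unknown" entry. */
	decomp = squashfs_lookup_decompressor(id);
	if (!decomp->supported) {
		pr_err("SQUASHFS: \"%s\" compression is not supported\n",
		       decomp->name);
		return NULL;
	}
	return decomp;
}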
902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 | // SPDX-License-Identifier: GPL-2.0 /* * buffered writeback throttling. loosely based on CoDel. We can't drop * packets for IO scheduling, so the logic is something like this: * * - Monitor latencies in a defined window of time. * - If the minimum latency in the above window exceeds some target, increment * scaling step and scale down queue depth by a factor of 2x. The monitoring * window is then shrunk to 100 / sqrt(scaling step + 1). * - For any window where we don't have solid data on what the latencies * look like, retain status quo. * - If latencies look good, decrement scaling step. * - If we're only doing writes, allow the scaling step to go negative. This * will temporarily boost write performance, snapping back to a stable * scaling step of 0 if reads show up or the heavy writers finish. Unlike * positive scaling steps where we shrink the monitoring window, a negative * scaling step retains the default step==0 window size. * * Copyright (C) 2016 Jens Axboe * */ #include <linux/kernel.h> #include <linux/blk_types.h> #include <linux/slab.h> #include <linux/backing-dev.h> #include <linux/swap.h> #include "blk-stat.h" #include "blk-wbt.h" #include "blk-rq-qos.h" #include "elevator.h" #include "blk.h" #define CREATE_TRACE_POINTS #include <trace/events/wbt.h> enum wbt_flags { WBT_TRACKED = 1, /* write, tracked for throttling */ WBT_READ = 2, /* read */ WBT_SWAP = 4, /* write, from swap_writeout() */ WBT_DISCARD = 8, /* discard */ WBT_NR_BITS = 4, /* number of bits */ }; enum { WBT_RWQ_BG = 0, WBT_RWQ_SWAP, WBT_RWQ_DISCARD, WBT_NUM_RWQ, }; /* * If current state is WBT_STATE_ON/OFF_DEFAULT, it can be covered to any other * state, if current state is WBT_STATE_ON/OFF_MANUAL, it can only be covered * to WBT_STATE_OFF/ON_MANUAL. */ enum { WBT_STATE_ON_DEFAULT = 1, /* on by default */ WBT_STATE_ON_MANUAL = 2, /* on manually by sysfs */ WBT_STATE_OFF_DEFAULT = 3, /* off by default */ WBT_STATE_OFF_MANUAL = 4, /* off manually by sysfs */ }; struct rq_wb { /* * Settings that govern how we throttle */ unsigned int wb_background; /* background writeback */ unsigned int wb_normal; /* normal writeback */ short enable_state; /* WBT_STATE_* */ /* * Number of consecutive periods where we don't have enough * information to make a firm scale up/down decision. 
*/ unsigned int unknown_cnt; u64 win_nsec; /* default window size */ u64 cur_win_nsec; /* current window size */ struct blk_stat_callback *cb; u64 sync_issue; void *sync_cookie; unsigned long last_issue; /* issue time of last read rq */ unsigned long last_comp; /* completion time of last read rq */ unsigned long min_lat_nsec; struct rq_qos rqos; struct rq_wait rq_wait[WBT_NUM_RWQ]; struct rq_depth rq_depth; }; static int wbt_init(struct gendisk *disk, struct rq_wb *rwb); static inline struct rq_wb *RQWB(struct rq_qos *rqos) { return container_of(rqos, struct rq_wb, rqos); } static inline void wbt_clear_state(struct request *rq) { rq->wbt_flags = 0; } static inline enum wbt_flags wbt_flags(struct request *rq) { return rq->wbt_flags; } static inline bool wbt_is_tracked(struct request *rq) { return rq->wbt_flags & WBT_TRACKED; } static inline bool wbt_is_read(struct request *rq) { return rq->wbt_flags & WBT_READ; } enum { /* * Default setting, we'll scale up (to 75% of QD max) or down (min 1) * from here depending on device stats */ RWB_DEF_DEPTH = 16, /* * 100msec window */ RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL, /* * Disregard stats, if we don't meet this minimum */ RWB_MIN_WRITE_SAMPLES = 3, /* * If we have this number of consecutive windows without enough * information to scale up or down, slowly return to center state * (step == 0). */ RWB_UNKNOWN_BUMP = 5, }; static inline bool rwb_enabled(struct rq_wb *rwb) { return rwb && rwb->enable_state != WBT_STATE_OFF_DEFAULT && rwb->enable_state != WBT_STATE_OFF_MANUAL; } static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) { if (rwb_enabled(rwb)) { const unsigned long cur = jiffies; if (cur != *var) *var = cur; } } /* * If a task was rate throttled in balance_dirty_pages() within the last * second or so, use that to indicate a higher cleaning rate. */ static bool wb_recent_wait(struct rq_wb *rwb) { struct backing_dev_info *bdi = rwb->rqos.disk->bdi; return time_before(jiffies, bdi->last_bdp_sleep + HZ); } static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, enum wbt_flags wb_acct) { if (wb_acct & WBT_SWAP) return &rwb->rq_wait[WBT_RWQ_SWAP]; else if (wb_acct & WBT_DISCARD) return &rwb->rq_wait[WBT_RWQ_DISCARD]; return &rwb->rq_wait[WBT_RWQ_BG]; } static void rwb_wake_all(struct rq_wb *rwb) { int i; for (i = 0; i < WBT_NUM_RWQ; i++) { struct rq_wait *rqw = &rwb->rq_wait[i]; if (wq_has_sleeper(&rqw->wait)) wake_up_all(&rqw->wait); } } static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw, enum wbt_flags wb_acct) { int inflight, limit; inflight = atomic_dec_return(&rqw->inflight); /* * For discards, our limit is always the background. For writes, if * the device does write back caching, drop further down before we * wake people up. */ if (wb_acct & WBT_DISCARD) limit = rwb->wb_background; else if (blk_queue_write_cache(rwb->rqos.disk->queue) && !wb_recent_wait(rwb)) limit = 0; else limit = rwb->wb_normal; /* * Don't wake anyone up if we are above the normal limit. */ if (inflight && inflight >= limit) return; if (wq_has_sleeper(&rqw->wait)) { int diff = limit - inflight; if (!inflight || diff >= rwb->wb_background / 2) wake_up_all(&rqw->wait); } } static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct) { struct rq_wb *rwb = RQWB(rqos); struct rq_wait *rqw; if (!(wb_acct & WBT_TRACKED)) return; rqw = get_rq_wait(rwb, wb_acct); wbt_rqw_done(rwb, rqw, wb_acct); } /* * Called on completion of a request. Note that it's also called when * a request is merged, when the request gets freed. 
*/ static void wbt_done(struct rq_qos *rqos, struct request *rq) { struct rq_wb *rwb = RQWB(rqos); if (!wbt_is_tracked(rq)) { if (wbt_is_read(rq)) { if (rwb->sync_cookie == rq) { rwb->sync_issue = 0; rwb->sync_cookie = NULL; } wb_timestamp(rwb, &rwb->last_comp); } } else { WARN_ON_ONCE(rq == rwb->sync_cookie); __wbt_done(rqos, wbt_flags(rq)); } wbt_clear_state(rq); } static inline bool stat_sample_valid(struct blk_rq_stat *stat) { /* * We need at least one read sample, and a minimum of * RWB_MIN_WRITE_SAMPLES. We require some write samples to know * that it's writes impacting us, and not just some sole read on * a device that is in a lower power state. */ return (stat[READ].nr_samples >= 1 && stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES); } static u64 rwb_sync_issue_lat(struct rq_wb *rwb) { u64 issue = READ_ONCE(rwb->sync_issue); if (!issue || !rwb->sync_cookie) return 0; return blk_time_get_ns() - issue; } static inline unsigned int wbt_inflight(struct rq_wb *rwb) { unsigned int i, ret = 0; for (i = 0; i < WBT_NUM_RWQ; i++) ret += atomic_read(&rwb->rq_wait[i].inflight); return ret; } enum { LAT_OK = 1, LAT_UNKNOWN, LAT_UNKNOWN_WRITES, LAT_EXCEEDED, }; static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) { struct backing_dev_info *bdi = rwb->rqos.disk->bdi; struct rq_depth *rqd = &rwb->rq_depth; u64 thislat; /* * If our stored sync issue exceeds the window size, or it * exceeds our min target AND we haven't logged any entries, * flag the latency as exceeded. wbt works off completion latencies, * but for a flooded device, a single sync IO can take a long time * to complete after being issued. If this time exceeds our * monitoring window AND we didn't see any other completions in that * window, then count that sync IO as a violation of the latency. */ thislat = rwb_sync_issue_lat(rwb); if (thislat > rwb->cur_win_nsec || (thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) { trace_wbt_lat(bdi, thislat); return LAT_EXCEEDED; } /* * No read/write mix, if stat isn't valid */ if (!stat_sample_valid(stat)) { /* * If we had writes in this stat window and the window is * current, we're only doing writes. If a task recently * waited or still has writes in flights, consider us doing * just writes as well. */ if (stat[WRITE].nr_samples || wb_recent_wait(rwb) || wbt_inflight(rwb)) return LAT_UNKNOWN_WRITES; return LAT_UNKNOWN; } /* * If the 'min' latency exceeds our target, step down. 
*/ if (stat[READ].min > rwb->min_lat_nsec) { trace_wbt_lat(bdi, stat[READ].min); trace_wbt_stat(bdi, stat); return LAT_EXCEEDED; } if (rqd->scale_step) trace_wbt_stat(bdi, stat); return LAT_OK; } static void rwb_trace_step(struct rq_wb *rwb, const char *msg) { struct backing_dev_info *bdi = rwb->rqos.disk->bdi; struct rq_depth *rqd = &rwb->rq_depth; trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec, rwb->wb_background, rwb->wb_normal, rqd->max_depth); } static void calc_wb_limits(struct rq_wb *rwb) { if (rwb->min_lat_nsec == 0) { rwb->wb_normal = rwb->wb_background = 0; } else if (rwb->rq_depth.max_depth <= 2) { rwb->wb_normal = rwb->rq_depth.max_depth; rwb->wb_background = 1; } else { rwb->wb_normal = (rwb->rq_depth.max_depth + 1) / 2; rwb->wb_background = (rwb->rq_depth.max_depth + 3) / 4; } } static void scale_up(struct rq_wb *rwb) { if (!rq_depth_scale_up(&rwb->rq_depth)) return; calc_wb_limits(rwb); rwb->unknown_cnt = 0; rwb_wake_all(rwb); rwb_trace_step(rwb, tracepoint_string("scale up")); } static void scale_down(struct rq_wb *rwb, bool hard_throttle) { if (!rq_depth_scale_down(&rwb->rq_depth, hard_throttle)) return; calc_wb_limits(rwb); rwb->unknown_cnt = 0; rwb_trace_step(rwb, tracepoint_string("scale down")); } static void rwb_arm_timer(struct rq_wb *rwb) { struct rq_depth *rqd = &rwb->rq_depth; if (rqd->scale_step > 0) { /* * We should speed this up, using some variant of a fast * integer inverse square root calculation. Since we only do * this for every window expiration, it's not a huge deal, * though. */ rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4, int_sqrt((rqd->scale_step + 1) << 8)); } else { /* * For step < 0, we don't want to increase/decrease the * window size. */ rwb->cur_win_nsec = rwb->win_nsec; } blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec); } static void wb_timer_fn(struct blk_stat_callback *cb) { struct rq_wb *rwb = cb->data; struct rq_depth *rqd = &rwb->rq_depth; unsigned int inflight = wbt_inflight(rwb); int status; if (!rwb->rqos.disk) return; status = latency_exceeded(rwb, cb->stat); trace_wbt_timer(rwb->rqos.disk->bdi, status, rqd->scale_step, inflight); /* * If we exceeded the latency target, step down. If we did not, * step one level up. If we don't know enough to say either exceeded * or ok, then don't do anything. */ switch (status) { case LAT_EXCEEDED: scale_down(rwb, true); break; case LAT_OK: scale_up(rwb); break; case LAT_UNKNOWN_WRITES: /* * We don't have a valid read/write sample, but we do have * writes going on. Allow step to go negative, to increase * write performance. */ scale_up(rwb); break; case LAT_UNKNOWN: if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP) break; /* * We get here when previously scaled reduced depth, and we * currently don't have a valid read/write sample. For that * case, slowly return to center state (step == 0). 
*/ if (rqd->scale_step > 0) scale_up(rwb); else if (rqd->scale_step < 0) scale_down(rwb, false); break; default: break; } /* * Re-arm timer, if we have IO in flight */ if (rqd->scale_step || inflight) rwb_arm_timer(rwb); } static void wbt_update_limits(struct rq_wb *rwb) { struct rq_depth *rqd = &rwb->rq_depth; rqd->scale_step = 0; rqd->scaled_max = false; rq_depth_calc_max_depth(rqd); calc_wb_limits(rwb); rwb_wake_all(rwb); } bool wbt_disabled(struct request_queue *q) { struct rq_qos *rqos = wbt_rq_qos(q); return !rqos || !rwb_enabled(RQWB(rqos)); } u64 wbt_get_min_lat(struct request_queue *q) { struct rq_qos *rqos = wbt_rq_qos(q); if (!rqos) return 0; return RQWB(rqos)->min_lat_nsec; } static void wbt_set_min_lat(struct request_queue *q, u64 val) { struct rq_qos *rqos = wbt_rq_qos(q); if (!rqos) return; RQWB(rqos)->min_lat_nsec = val; if (val) RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL; else RQWB(rqos)->enable_state = WBT_STATE_OFF_MANUAL; wbt_update_limits(RQWB(rqos)); } static bool close_io(struct rq_wb *rwb) { const unsigned long now = jiffies; return time_before(now, rwb->last_issue + HZ / 10) || time_before(now, rwb->last_comp + HZ / 10); } #define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO | REQ_SWAP) static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf) { unsigned int limit; if ((opf & REQ_OP_MASK) == REQ_OP_DISCARD) return rwb->wb_background; /* * At this point we know it's a buffered write. If this is * swap trying to free memory, or REQ_SYNC is set, then * it's WB_SYNC_ALL writeback, and we'll use the max limit for * that. If the write is marked as a background write, then use * the idle limit, or go to normal if we haven't had competing * IO for a bit. */ if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb)) limit = rwb->rq_depth.max_depth; else if ((opf & REQ_BACKGROUND) || close_io(rwb)) { /* * If less than 100ms since we completed unrelated IO, * limit us to half the depth for background writeback. */ limit = rwb->wb_background; } else limit = rwb->wb_normal; return limit; } struct wbt_wait_data { struct rq_wb *rwb; enum wbt_flags wb_acct; blk_opf_t opf; }; static bool wbt_inflight_cb(struct rq_wait *rqw, void *private_data) { struct wbt_wait_data *data = private_data; return rq_wait_inc_below(rqw, get_limit(data->rwb, data->opf)); } static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data) { struct wbt_wait_data *data = private_data; wbt_rqw_done(data->rwb, rqw, data->wb_acct); } /* * Block if we will exceed our limit, or if we are currently waiting for * the timer to kick off queuing again. 
*/ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, blk_opf_t opf) { struct rq_wait *rqw = get_rq_wait(rwb, wb_acct); struct wbt_wait_data data = { .rwb = rwb, .wb_acct = wb_acct, .opf = opf, }; rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb); } static inline bool wbt_should_throttle(struct bio *bio) { switch (bio_op(bio)) { case REQ_OP_WRITE: /* * Don't throttle WRITE_ODIRECT */ if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) == (REQ_SYNC | REQ_IDLE)) return false; fallthrough; case REQ_OP_DISCARD: return true; default: return false; } } static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio) { enum wbt_flags flags = 0; if (!rwb_enabled(rwb)) return 0; if (bio_op(bio) == REQ_OP_READ) { flags = WBT_READ; } else if (wbt_should_throttle(bio)) { if (bio->bi_opf & REQ_SWAP) flags |= WBT_SWAP; if (bio_op(bio) == REQ_OP_DISCARD) flags |= WBT_DISCARD; flags |= WBT_TRACKED; } return flags; } static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio) { struct rq_wb *rwb = RQWB(rqos); enum wbt_flags flags = bio_to_wbt_flags(rwb, bio); __wbt_done(rqos, flags); } /* May sleep, if we have exceeded the writeback limits. */ static void wbt_wait(struct rq_qos *rqos, struct bio *bio) { struct rq_wb *rwb = RQWB(rqos); enum wbt_flags flags; flags = bio_to_wbt_flags(rwb, bio); if (!(flags & WBT_TRACKED)) { if (flags & WBT_READ) wb_timestamp(rwb, &rwb->last_issue); return; } __wbt_wait(rwb, flags, bio->bi_opf); if (!blk_stat_is_active(rwb->cb)) rwb_arm_timer(rwb); } static void wbt_track(struct rq_qos *rqos, struct request *rq, struct bio *bio) { struct rq_wb *rwb = RQWB(rqos); rq->wbt_flags |= bio_to_wbt_flags(rwb, bio); } static void wbt_issue(struct rq_qos *rqos, struct request *rq) { struct rq_wb *rwb = RQWB(rqos); if (!rwb_enabled(rwb)) return; /* * Track sync issue, in case it takes a long time to complete. Allows us * to react quicker, if a sync IO takes a long time to complete. Note * that this is just a hint. The request can go away when it completes, * so it's important we never dereference it. We only use the address to * compare with, which is why we store the sync_issue time locally. */ if (wbt_is_read(rq) && !rwb->sync_issue) { rwb->sync_cookie = rq; rwb->sync_issue = rq->io_start_time_ns; } } static void wbt_requeue(struct rq_qos *rqos, struct request *rq) { struct rq_wb *rwb = RQWB(rqos); if (!rwb_enabled(rwb)) return; if (rq == rwb->sync_cookie) { rwb->sync_issue = 0; rwb->sync_cookie = NULL; } } static int wbt_data_dir(const struct request *rq) { const enum req_op op = req_op(rq); if (op == REQ_OP_READ) return READ; else if (op_is_write(op)) return WRITE; /* don't account */ return -1; } static struct rq_wb *wbt_alloc(void) { struct rq_wb *rwb = kzalloc_obj(*rwb); if (!rwb) return NULL; rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb); if (!rwb->cb) { kfree(rwb); return NULL; } return rwb; } static void wbt_free(struct rq_wb *rwb) { blk_stat_free_callback(rwb->cb); kfree(rwb); } /* * Enable wbt if defaults are configured that way */ static bool __wbt_enable_default(struct gendisk *disk) { struct request_queue *q = disk->queue; struct rq_qos *rqos; bool enable = IS_ENABLED(CONFIG_BLK_WBT_MQ); mutex_lock(&disk->rqos_state_mutex); if (blk_queue_disable_wbt(q)) enable = false; /* Throttling already enabled? 
*/ rqos = wbt_rq_qos(q); if (rqos) { if (enable && RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT) RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT; mutex_unlock(&disk->rqos_state_mutex); return false; } mutex_unlock(&disk->rqos_state_mutex); /* Queue not registered? Maybe shutting down... */ if (!blk_queue_registered(q)) return false; if (queue_is_mq(q) && enable) return true; return false; } void wbt_enable_default(struct gendisk *disk) { __wbt_enable_default(disk); } EXPORT_SYMBOL_GPL(wbt_enable_default); void wbt_init_enable_default(struct gendisk *disk) { struct request_queue *q = disk->queue; struct rq_wb *rwb; unsigned int memflags; if (!__wbt_enable_default(disk)) return; rwb = wbt_alloc(); if (!rwb) return; if (wbt_init(disk, rwb)) { pr_warn("%s: failed to enable wbt\n", disk->disk_name); wbt_free(rwb); return; } memflags = blk_debugfs_lock(q); blk_mq_debugfs_register_rq_qos(q); blk_debugfs_unlock(q, memflags); } static u64 wbt_default_latency_nsec(struct request_queue *q) { /* * We default to 2msec for non-rotational storage, and 75msec * for rotational storage. */ if (blk_queue_rot(q)) return 75000000ULL; return 2000000ULL; } static void wbt_queue_depth_changed(struct rq_qos *rqos) { RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->disk->queue); wbt_update_limits(RQWB(rqos)); } static void wbt_exit(struct rq_qos *rqos) { struct rq_wb *rwb = RQWB(rqos); blk_stat_remove_callback(rqos->disk->queue, rwb->cb); wbt_free(rwb); } /* * Disable wbt, if enabled by default. */ void wbt_disable_default(struct gendisk *disk) { struct rq_qos *rqos = wbt_rq_qos(disk->queue); struct rq_wb *rwb; if (!rqos) return; mutex_lock(&disk->rqos_state_mutex); rwb = RQWB(rqos); if (rwb->enable_state == WBT_STATE_ON_DEFAULT) { blk_stat_deactivate(rwb->cb); rwb->enable_state = WBT_STATE_OFF_DEFAULT; } mutex_unlock(&disk->rqos_state_mutex); } EXPORT_SYMBOL_GPL(wbt_disable_default); #ifdef CONFIG_BLK_DEBUG_FS static int wbt_curr_win_nsec_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); seq_printf(m, "%llu\n", rwb->cur_win_nsec); return 0; } static int wbt_enabled_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); seq_printf(m, "%d\n", rwb->enable_state); return 0; } static int wbt_id_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; seq_printf(m, "%u\n", rqos->id); return 0; } static int wbt_inflight_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); int i; for (i = 0; i < WBT_NUM_RWQ; i++) seq_printf(m, "%d: inflight %d\n", i, atomic_read(&rwb->rq_wait[i].inflight)); return 0; } static int wbt_min_lat_nsec_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); seq_printf(m, "%lu\n", rwb->min_lat_nsec); return 0; } static int wbt_unknown_cnt_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); seq_printf(m, "%u\n", rwb->unknown_cnt); return 0; } static int wbt_normal_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); seq_printf(m, "%u\n", rwb->wb_normal); return 0; } static int wbt_background_show(void *data, struct seq_file *m) { struct rq_qos *rqos = data; struct rq_wb *rwb = RQWB(rqos); seq_printf(m, "%u\n", rwb->wb_background); return 0; } static const struct blk_mq_debugfs_attr wbt_debugfs_attrs[] = { {"curr_win_nsec", 0400, wbt_curr_win_nsec_show}, {"enabled", 0400, wbt_enabled_show}, {"id", 0400, 
wbt_id_show}, {"inflight", 0400, wbt_inflight_show}, {"min_lat_nsec", 0400, wbt_min_lat_nsec_show}, {"unknown_cnt", 0400, wbt_unknown_cnt_show}, {"wb_normal", 0400, wbt_normal_show}, {"wb_background", 0400, wbt_background_show}, {}, }; #endif static const struct rq_qos_ops wbt_rqos_ops = { .throttle = wbt_wait, .issue = wbt_issue, .track = wbt_track, .requeue = wbt_requeue, .done = wbt_done, .cleanup = wbt_cleanup, .queue_depth_changed = wbt_queue_depth_changed, .exit = wbt_exit, #ifdef CONFIG_BLK_DEBUG_FS .debugfs_attrs = wbt_debugfs_attrs, #endif }; static int wbt_init(struct gendisk *disk, struct rq_wb *rwb) { struct request_queue *q = disk->queue; int ret; int i; for (i = 0; i < WBT_NUM_RWQ; i++) rq_wait_init(&rwb->rq_wait[i]); rwb->last_comp = rwb->last_issue = jiffies; rwb->win_nsec = RWB_WINDOW_NSEC; rwb->enable_state = WBT_STATE_ON_DEFAULT; rwb->rq_depth.default_depth = RWB_DEF_DEPTH; rwb->min_lat_nsec = wbt_default_latency_nsec(q); rwb->rq_depth.queue_depth = blk_queue_depth(q); wbt_update_limits(rwb); /* * Assign rwb and add the stats callback. */ mutex_lock(&q->rq_qos_mutex); ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops); mutex_unlock(&q->rq_qos_mutex); if (ret) return ret; blk_stat_add_callback(q, rwb->cb); return 0; } int wbt_set_lat(struct gendisk *disk, s64 val) { struct request_queue *q = disk->queue; struct rq_qos *rqos = wbt_rq_qos(q); struct rq_wb *rwb = NULL; unsigned int memflags; int ret = 0; if (!rqos) { rwb = wbt_alloc(); if (!rwb) return -ENOMEM; } /* * Ensure that the queue is idled, in case the latency update * ends up either enabling or disabling wbt completely. We can't * have IO inflight if that happens. */ memflags = blk_mq_freeze_queue(q); if (!rqos) { ret = wbt_init(disk, rwb); if (ret) { wbt_free(rwb); goto out; } } if (val == -1) val = wbt_default_latency_nsec(q); else if (val >= 0) val *= 1000ULL; if (wbt_get_min_lat(q) == val) goto out; blk_mq_quiesce_queue(q); mutex_lock(&disk->rqos_state_mutex); wbt_set_min_lat(q, val); mutex_unlock(&disk->rqos_state_mutex); blk_mq_unquiesce_queue(q); out: blk_mq_unfreeze_queue(q, memflags); memflags = blk_debugfs_lock(q); blk_mq_debugfs_register_rq_qos(q); blk_debugfs_unlock(q, memflags); return ret; } |
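To make the depth and window scaling above concrete, the following is a minimal userspace sketch, not kernel code: it only reproduces the arithmetic of calc_wb_limits() and rwb_arm_timer(), with isqrt() standing in for the kernel's int_sqrt().

#include <stdio.h>
#include <stdint.h>

/* stand-in for the kernel's int_sqrt() */
static uint64_t isqrt(uint64_t x)
{
	uint64_t res = 0, bit = 1ULL << 62;

	while (bit > x)
		bit >>= 2;
	while (bit) {
		if (x >= res + bit) {
			x -= res + bit;
			res = (res >> 1) + bit;
		} else {
			res >>= 1;
		}
		bit >>= 2;
	}
	return res;
}

int main(void)
{
	const uint64_t win_nsec = 100ULL * 1000 * 1000;	/* RWB_WINDOW_NSEC */
	unsigned int max_depth, wb_normal, wb_background;
	int step;

	/* calc_wb_limits(): normal is ~half of max_depth, background ~a quarter */
	for (max_depth = 1; max_depth <= 16; max_depth <<= 1) {
		if (max_depth <= 2) {
			wb_normal = max_depth;
			wb_background = 1;
		} else {
			wb_normal = (max_depth + 1) / 2;
			wb_background = (max_depth + 3) / 4;
		}
		printf("max_depth=%2u -> wb_normal=%2u wb_background=%2u\n",
		       max_depth, wb_normal, wb_background);
	}

	/* rwb_arm_timer(): the stat window shrinks as scale_step grows */
	for (step = 1; step <= 4; step++) {
		uint64_t win = (win_nsec << 4) / isqrt(((uint64_t)step + 1) << 8);

		printf("scale_step=%d -> cur_win_nsec=%llu\n",
		       step, (unsigned long long)win);
	}
	return 0;
}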
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2025 Isovalent */ #include <linux/bpf.h> struct bpf_insn_array { struct bpf_map map; atomic_t used; long *ips; DECLARE_FLEX_ARRAY(struct bpf_insn_array_value, values); }; #define cast_insn_array(MAP_PTR) \ container_of((MAP_PTR), struct bpf_insn_array, map) #define INSN_DELETED ((u32)-1) static inline u64 insn_array_alloc_size(u32 max_entries) { const u64 base_size = sizeof(struct bpf_insn_array); const u64 entry_size = sizeof(struct bpf_insn_array_value); return base_size + max_entries * (entry_size + sizeof(long)); } static int insn_array_alloc_check(union bpf_attr *attr) { u32 value_size = sizeof(struct bpf_insn_array_value); if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size != value_size || attr->map_flags != 0) return -EINVAL; return 0; } static void insn_array_free(struct bpf_map *map) { struct bpf_insn_array *insn_array = cast_insn_array(map); bpf_map_area_free(insn_array); } static struct bpf_map *insn_array_alloc(union bpf_attr *attr) { u64 size = insn_array_alloc_size(attr->max_entries); struct bpf_insn_array *insn_array; insn_array = bpf_map_area_alloc(size, NUMA_NO_NODE); if (!insn_array) return ERR_PTR(-ENOMEM); /* ips are allocated right after the insn_array->values[] array */ insn_array->ips = (void *)&insn_array->values[attr->max_entries]; bpf_map_init_from_attr(&insn_array->map, attr); /* BPF programs aren't allowed to write to the map */ insn_array->map.map_flags |= BPF_F_RDONLY_PROG; return &insn_array->map; } static void *insn_array_lookup_elem(struct bpf_map *map, void *key) { struct bpf_insn_array *insn_array = cast_insn_array(map); u32 index = *(u32 *)key; if (unlikely(index >= insn_array->map.max_entries)) return NULL; return &insn_array->values[index]; } static long insn_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_insn_array *insn_array = cast_insn_array(map); u32 index = *(u32 *)key; struct bpf_insn_array_value val = {}; if (unlikely(index >= insn_array->map.max_entries)) return -E2BIG; if (unlikely(map_flags & BPF_NOEXIST)) return -EEXIST; copy_map_value(map, &val, value); if (val.jitted_off || val.xlated_off) return -EINVAL; insn_array->values[index].orig_off = val.orig_off; return 0; } static long insn_array_delete_elem(struct bpf_map *map, void *key) { return -EINVAL; } static int insn_array_check_btf(struct bpf_map *map, const struct btf 
*btf, const struct btf_type *key_type, const struct btf_type *value_type) { if (!btf_type_is_i32(key_type)) return -EINVAL; if (!btf_type_is_i64(value_type)) return -EINVAL; return 0; } static u64 insn_array_mem_usage(const struct bpf_map *map) { return insn_array_alloc_size(map->max_entries); } static int insn_array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off) { struct bpf_insn_array *insn_array = cast_insn_array(map); if ((off % sizeof(long)) != 0 || (off / sizeof(long)) >= map->max_entries) return -EACCES; /* from BPF's point of view, this map is a jump table */ *imm = (unsigned long)insn_array->ips; return 0; } BTF_ID_LIST_SINGLE(insn_array_btf_ids, struct, bpf_insn_array) const struct bpf_map_ops insn_array_map_ops = { .map_alloc_check = insn_array_alloc_check, .map_alloc = insn_array_alloc, .map_free = insn_array_free, .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = insn_array_lookup_elem, .map_update_elem = insn_array_update_elem, .map_delete_elem = insn_array_delete_elem, .map_check_btf = insn_array_check_btf, .map_mem_usage = insn_array_mem_usage, .map_direct_value_addr = insn_array_map_direct_value_addr, .map_btf_id = &insn_array_btf_ids[0], }; static inline bool is_frozen(struct bpf_map *map) { guard(mutex)(&map->freeze_mutex); return map->frozen; } static bool is_insn_array(const struct bpf_map *map) { return map->map_type == BPF_MAP_TYPE_INSN_ARRAY; } static inline bool valid_offsets(const struct bpf_insn_array *insn_array, const struct bpf_prog *prog) { u32 off; int i; for (i = 0; i < insn_array->map.max_entries; i++) { off = insn_array->values[i].orig_off; if (off >= prog->len) return false; if (off > 0) { if (prog->insnsi[off-1].code == (BPF_LD | BPF_DW | BPF_IMM)) return false; } } return true; } int bpf_insn_array_init(struct bpf_map *map, const struct bpf_prog *prog) { struct bpf_insn_array *insn_array = cast_insn_array(map); struct bpf_insn_array_value *values = insn_array->values; int i; if (!is_frozen(map)) return -EINVAL; if (!valid_offsets(insn_array, prog)) return -EINVAL; /* * There can be only one program using the map */ if (atomic_xchg(&insn_array->used, 1)) return -EBUSY; /* * Reset all the map indexes to the original values. This is needed, * e.g., when a replay of verification with different log level should * be performed. 
*/ for (i = 0; i < map->max_entries; i++) values[i].xlated_off = values[i].orig_off; return 0; } int bpf_insn_array_ready(struct bpf_map *map) { struct bpf_insn_array *insn_array = cast_insn_array(map); int i; for (i = 0; i < map->max_entries; i++) { if (insn_array->values[i].xlated_off == INSN_DELETED) continue; if (!insn_array->ips[i]) return -EFAULT; } return 0; } void bpf_insn_array_release(struct bpf_map *map) { struct bpf_insn_array *insn_array = cast_insn_array(map); atomic_set(&insn_array->used, 0); } void bpf_insn_array_adjust(struct bpf_map *map, u32 off, u32 len) { struct bpf_insn_array *insn_array = cast_insn_array(map); int i; if (len <= 1) return; for (i = 0; i < map->max_entries; i++) { if (insn_array->values[i].xlated_off <= off) continue; if (insn_array->values[i].xlated_off == INSN_DELETED) continue; insn_array->values[i].xlated_off += len - 1; } } void bpf_insn_array_adjust_after_remove(struct bpf_map *map, u32 off, u32 len) { struct bpf_insn_array *insn_array = cast_insn_array(map); int i; for (i = 0; i < map->max_entries; i++) { if (insn_array->values[i].xlated_off < off) continue; if (insn_array->values[i].xlated_off == INSN_DELETED) continue; if (insn_array->values[i].xlated_off < off + len) insn_array->values[i].xlated_off = INSN_DELETED; else insn_array->values[i].xlated_off -= len; } } /* * This function is called by JITs. The image is the real program * image, the offsets array set up the xlated -> jitted mapping. * The offsets[xlated] offset should point to the beginning of * the jitted instruction. */ void bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image) { struct bpf_insn_array *insn_array; struct bpf_map *map; u32 xlated_off; int i, j; if (!offsets || !image) return; for (i = 0; i < prog->aux->used_map_cnt; i++) { map = prog->aux->used_maps[i]; if (!is_insn_array(map)) continue; insn_array = cast_insn_array(map); for (j = 0; j < map->max_entries; j++) { xlated_off = insn_array->values[j].xlated_off; if (xlated_off == INSN_DELETED) continue; if (xlated_off < prog->aux->subprog_start) continue; xlated_off -= prog->aux->subprog_start; if (xlated_off >= prog->len) continue; insn_array->values[j].jitted_off = offsets[xlated_off]; insn_array->ips[j] = (long)(image + offsets[xlated_off]); } } } |
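The offset bookkeeping in bpf_insn_array_adjust() is easiest to follow with concrete numbers. Below is a standalone userspace sketch; the adjust() helper is hypothetical and only mirrors the kernel loop above, showing how tracked xlated offsets shift when the verifier expands one instruction at offset off into len instructions.

#include <stdio.h>

#define INSN_DELETED ((unsigned int)-1)

/* mirrors the loop in bpf_insn_array_adjust() above */
static void adjust(unsigned int *xlated, int n, unsigned int off, unsigned int len)
{
	int i;

	if (len <= 1)
		return;
	for (i = 0; i < n; i++) {
		if (xlated[i] == INSN_DELETED || xlated[i] <= off)
			continue;
		/* one instruction at 'off' became 'len' instructions */
		xlated[i] += len - 1;
	}
}

int main(void)
{
	unsigned int xlated[] = { 0, 3, 7, 12 };
	int i;

	/* the instruction at xlated offset 3 is rewritten into 4 instructions */
	adjust(xlated, 4, 3, 4);
	for (i = 0; i < 4; i++)
		printf("entry %d -> xlated_off %u\n", i, xlated[i]);
	return 0;
}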
| 10 2 2 2 2 3 3 3 3 3 3 9 14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2001 Jens Axboe <axboe@suse.de> */ #ifndef __LINUX_BIO_H #define __LINUX_BIO_H #include <linux/mempool.h> /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */ #include <linux/blk_types.h> #include <linux/uio.h> #define BIO_MAX_VECS 256U #define BIO_MAX_INLINE_VECS UIO_MAXIOV struct queue_limits; static inline unsigned int bio_max_segs(unsigned int nr_segs) { return min(nr_segs, BIO_MAX_VECS); } #define bio_iter_iovec(bio, iter) \ bvec_iter_bvec((bio)->bi_io_vec, (iter)) #define bio_iter_page(bio, iter) \ bvec_iter_page((bio)->bi_io_vec, (iter)) #define 
bio_iter_len(bio, iter) \ bvec_iter_len((bio)->bi_io_vec, (iter)) #define bio_iter_offset(bio, iter) \ bvec_iter_offset((bio)->bi_io_vec, (iter)) #define bio_page(bio) bio_iter_page((bio), (bio)->bi_iter) #define bio_offset(bio) bio_iter_offset((bio), (bio)->bi_iter) #define bio_iovec(bio) bio_iter_iovec((bio), (bio)->bi_iter) #define bvec_iter_sectors(iter) ((iter).bi_size >> 9) #define bvec_iter_end_sector(iter) ((iter).bi_sector + bvec_iter_sectors((iter))) #define bio_sectors(bio) bvec_iter_sectors((bio)->bi_iter) #define bio_end_sector(bio) bvec_iter_end_sector((bio)->bi_iter) /* * Return the data direction, READ or WRITE. */ #define bio_data_dir(bio) \ (op_is_write(bio_op(bio)) ? WRITE : READ) static inline bool bio_flagged(const struct bio *bio, unsigned int bit) { return bio->bi_flags & (1U << bit); } static inline void bio_set_flag(struct bio *bio, unsigned int bit) { bio->bi_flags |= (1U << bit); } static inline void bio_clear_flag(struct bio *bio, unsigned int bit) { bio->bi_flags &= ~(1U << bit); } /* * Check whether this bio carries any data or not. A NULL bio is allowed. */ static inline bool bio_has_data(struct bio *bio) { if (bio && bio->bi_iter.bi_size && bio_op(bio) != REQ_OP_DISCARD && bio_op(bio) != REQ_OP_SECURE_ERASE && bio_op(bio) != REQ_OP_WRITE_ZEROES) return true; return false; } static inline bool bio_no_advance_iter(const struct bio *bio) { return bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE || bio_op(bio) == REQ_OP_WRITE_ZEROES; } static inline void *bio_data(struct bio *bio) { if (bio_has_data(bio)) return page_address(bio_page(bio)) + bio_offset(bio); return NULL; } static inline bool bio_next_segment(const struct bio *bio, struct bvec_iter_all *iter) { if (iter->idx >= bio->bi_vcnt) return false; bvec_advance(&bio->bi_io_vec[iter->idx], iter); return true; } /* * drivers should _never_ use the all version - the bio may have been split * before it got to the driver and the driver won't own all of it */ #define bio_for_each_segment_all(bvl, bio, iter) \ for (bvl = bvec_init_iter_all(&iter); bio_next_segment((bio), &iter); ) static inline void bio_advance_iter(const struct bio *bio, struct bvec_iter *iter, unsigned int bytes) { iter->bi_sector += bytes >> 9; if (bio_no_advance_iter(bio)) iter->bi_size -= bytes; else bvec_iter_advance(bio->bi_io_vec, iter, bytes); /* TODO: It is reasonable to complete bio with error here. */ } /* @bytes should be less or equal to bvec[i->bi_idx].bv_len */ static inline void bio_advance_iter_single(const struct bio *bio, struct bvec_iter *iter, unsigned int bytes) { iter->bi_sector += bytes >> 9; if (bio_no_advance_iter(bio)) iter->bi_size -= bytes; else bvec_iter_advance_single(bio->bi_io_vec, iter, bytes); } void __bio_advance(struct bio *, unsigned bytes); /** * bio_advance - increment/complete a bio by some number of bytes * @bio: bio to advance * @nbytes: number of bytes to complete * * This updates bi_sector, bi_size and bi_idx; if the number of bytes to * complete doesn't align with a bvec boundary, then bv_len and bv_offset will * be updated on the last bvec as well. * * @bio will then represent the remaining, uncompleted portion of the io. 
*/ static inline void bio_advance(struct bio *bio, unsigned int nbytes) { if (nbytes == bio->bi_iter.bi_size) { bio->bi_iter.bi_size = 0; return; } __bio_advance(bio, nbytes); } #define __bio_for_each_segment(bvl, bio, iter, start) \ for (iter = (start); \ (iter).bi_size && \ ((bvl = bio_iter_iovec((bio), (iter))), 1); \ bio_advance_iter_single((bio), &(iter), (bvl).bv_len)) #define bio_for_each_segment(bvl, bio, iter) \ __bio_for_each_segment(bvl, bio, iter, (bio)->bi_iter) #define __bio_for_each_bvec(bvl, bio, iter, start) \ for (iter = (start); \ (iter).bi_size && \ ((bvl = mp_bvec_iter_bvec((bio)->bi_io_vec, (iter))), 1); \ bio_advance_iter_single((bio), &(iter), (bvl).bv_len)) /* iterate over multi-page bvec */ #define bio_for_each_bvec(bvl, bio, iter) \ __bio_for_each_bvec(bvl, bio, iter, (bio)->bi_iter) /* * Iterate over all multi-page bvecs. Drivers shouldn't use this version for the * same reasons as bio_for_each_segment_all(). */ #define bio_for_each_bvec_all(bvl, bio, i) \ for (i = 0, bvl = bio_first_bvec_all(bio); \ i < (bio)->bi_vcnt; i++, bvl++) #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len) static inline unsigned bio_segments(struct bio *bio) { unsigned segs = 0; struct bio_vec bv; struct bvec_iter iter; /* * We special case discard/write same/write zeroes, because they * interpret bi_size differently: */ switch (bio_op(bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: case REQ_OP_WRITE_ZEROES: return 0; default: break; } bio_for_each_segment(bv, bio, iter) segs++; return segs; } /* * get a reference to a bio, so it won't disappear. the intended use is * something like: * * bio_get(bio); * submit_bio(rw, bio); * if (bio->bi_flags ...) * do_something * bio_put(bio); * * without the bio_get(), it could potentially complete I/O before submit_bio * returns. and then bio would be freed memory when if (bio->bi_flags ...) * runs */ static inline void bio_get(struct bio *bio) { bio->bi_flags |= (1 << BIO_REFFED); smp_mb__before_atomic(); atomic_inc(&bio->__bi_cnt); } static inline void bio_cnt_set(struct bio *bio, unsigned int count) { if (count != 1) { bio->bi_flags |= (1 << BIO_REFFED); smp_mb(); } atomic_set(&bio->__bi_cnt, count); } static inline struct bio_vec *bio_first_bvec_all(struct bio *bio) { WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); return bio->bi_io_vec; } static inline struct page *bio_first_page_all(struct bio *bio) { return bio_first_bvec_all(bio)->bv_page; } static inline struct folio *bio_first_folio_all(struct bio *bio) { return page_folio(bio_first_page_all(bio)); } /** * struct folio_iter - State for iterating all folios in a bio. * @folio: The current folio we're iterating. NULL after the last folio. * @offset: The byte offset within the current folio. * @length: The number of bytes in this iteration (will not cross folio * boundary). 
*/ struct folio_iter { struct folio *folio; size_t offset; size_t length; /* private: for use by the iterator */ struct folio *_next; size_t _seg_count; int _i; }; static inline void bio_first_folio(struct folio_iter *fi, struct bio *bio, int i) { struct bio_vec *bvec = bio_first_bvec_all(bio) + i; if (unlikely(i >= bio->bi_vcnt)) { fi->folio = NULL; return; } fi->folio = page_folio(bvec->bv_page); fi->offset = bvec->bv_offset + PAGE_SIZE * folio_page_idx(fi->folio, bvec->bv_page); fi->_seg_count = bvec->bv_len; fi->length = min(folio_size(fi->folio) - fi->offset, fi->_seg_count); fi->_next = folio_next(fi->folio); fi->_i = i; } static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio) { fi->_seg_count -= fi->length; if (fi->_seg_count) { fi->folio = fi->_next; fi->offset = 0; fi->length = min(folio_size(fi->folio), fi->_seg_count); fi->_next = folio_next(fi->folio); } else { bio_first_folio(fi, bio, fi->_i + 1); } } /** * bio_for_each_folio_all - Iterate over each folio in a bio. * @fi: struct folio_iter which is updated for each folio. * @bio: struct bio to iterate over. */ #define bio_for_each_folio_all(fi, bio) \ for (bio_first_folio(&fi, bio, 0); fi.folio; bio_next_folio(&fi, bio)) void bio_trim(struct bio *bio, sector_t offset, sector_t size); extern struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs); int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, unsigned *segs, unsigned max_bytes, unsigned len_align); u8 bio_seg_gap(struct request_queue *q, struct bio *prev, struct bio *next, u8 gaps_bit); /** * bio_next_split - get next @sectors from a bio, splitting if necessary * @bio: bio to split * @sectors: number of sectors to split from the front of @bio * @gfp: gfp mask * @bs: bio set to allocate from * * Return: a bio representing the next @sectors of @bio - if the bio is smaller * than @sectors, returns the original bio unchanged. */ static inline struct bio *bio_next_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs) { if (sectors >= bio_sectors(bio)) return bio; return bio_split(bio, sectors, gfp, bs); } enum { BIOSET_NEED_BVECS = BIT(0), BIOSET_NEED_RESCUER = BIT(1), BIOSET_PERCPU_CACHE = BIT(2), }; extern int bioset_init(struct bio_set *, unsigned int, unsigned int, int flags); extern void bioset_exit(struct bio_set *); extern int biovec_init_pool(mempool_t *pool, int pool_entries); struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp, struct bio_set *bs); struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask); extern void bio_put(struct bio *); struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src, gfp_t gfp, struct bio_set *bs); int bio_init_clone(struct block_device *bdev, struct bio *bio, struct bio *bio_src, gfp_t gfp); extern struct bio_set fs_bio_set; static inline struct bio *bio_alloc(struct block_device *bdev, unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp_mask) { return bio_alloc_bioset(bdev, nr_vecs, opf, gfp_mask, &fs_bio_set); } void submit_bio(struct bio *bio); extern void bio_endio(struct bio *); static inline void bio_io_error(struct bio *bio) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); } static inline void bio_wouldblock_error(struct bio *bio) { bio_set_flag(bio, BIO_QUIET); bio->bi_status = BLK_STS_AGAIN; bio_endio(bio); } /* * Calculate number of bvec segments that should be allocated to fit data * pointed by @iter. 
If @iter is backed by bvec it's going to be reused * instead of allocating a new one. */ static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs) { if (iov_iter_is_bvec(iter)) return 0; return iov_iter_npages(iter, max_segs); } /** * bio_iov_bounce_nr_vecs - calculate number of bvecs for a bounce bio * @iter: iter to bounce from * @op: REQ_OP_* for the bio * * Calculates how many bvecs are needed for the next bio to bounce from/to * @iter. */ static inline unsigned short bio_iov_bounce_nr_vecs(struct iov_iter *iter, blk_opf_t op) { /* * We still need to bounce bvec iters, so don't special case them * here unlike in bio_iov_vecs_to_alloc. * * For reads we need to use a vector for the bounce buffer, account * for that here. */ if (op_is_write(op)) return iov_iter_npages(iter, BIO_MAX_VECS); return iov_iter_npages(iter, BIO_MAX_VECS - 1) + 1; } struct request_queue; void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, unsigned short max_vecs, blk_opf_t opf); static inline void bio_init_inline(struct bio *bio, struct block_device *bdev, unsigned short max_vecs, blk_opf_t opf) { bio_init(bio, bdev, bio_inline_vecs(bio), max_vecs, opf); } extern void bio_uninit(struct bio *); void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf); void bio_reuse(struct bio *bio, blk_opf_t opf); void bio_chain(struct bio *, struct bio *); void bio_await(struct bio *bio, void *priv, void (*submit)(struct bio *bio, void *priv)); int __must_check bio_add_page(struct bio *bio, struct page *page, unsigned len, unsigned off); bool __must_check bio_add_folio(struct bio *bio, struct folio *folio, size_t len, size_t off); void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off); void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, size_t off); void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len); /** * bio_add_max_vecs - number of bio_vecs needed to add data to a bio * @kaddr: kernel virtual address to add * @len: length in bytes to add * * Calculate how many bio_vecs need to be allocated to add the kernel virtual * address range in [@kaddr:@len] in the worse case. 
*/ static inline unsigned int bio_add_max_vecs(void *kaddr, unsigned int len) { if (is_vmalloc_addr(kaddr)) return DIV_ROUND_UP(offset_in_page(kaddr) + len, PAGE_SIZE); return 1; } unsigned int bio_add_vmalloc_chunk(struct bio *bio, void *vaddr, unsigned len); bool bio_add_vmalloc(struct bio *bio, void *vaddr, unsigned int len); int submit_bio_wait(struct bio *bio); int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, size_t len, enum req_op op); int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, unsigned len_align_mask); void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter); void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen); void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty); extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter); extern void bio_copy_data(struct bio *dst, struct bio *src); extern void bio_free_pages(struct bio *bio); void guard_bio_eod(struct bio *bio); void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter); static inline void zero_fill_bio(struct bio *bio) { zero_fill_bio_iter(bio, bio->bi_iter); } static inline void bio_release_pages(struct bio *bio, bool mark_dirty) { if (bio_flagged(bio, BIO_PAGE_PINNED)) __bio_release_pages(bio, mark_dirty); } #define bio_dev(bio) \ disk_devt((bio)->bi_bdev->bd_disk) #ifdef CONFIG_BLK_CGROUP void bio_associate_blkg(struct bio *bio); void bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css); void bio_clone_blkg_association(struct bio *dst, struct bio *src); void blkcg_punt_bio_submit(struct bio *bio); #else /* CONFIG_BLK_CGROUP */ static inline void bio_associate_blkg(struct bio *bio) { } static inline void bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css) { } static inline void bio_clone_blkg_association(struct bio *dst, struct bio *src) { } static inline void blkcg_punt_bio_submit(struct bio *bio) { submit_bio(bio); } #endif /* CONFIG_BLK_CGROUP */ static inline void bio_set_dev(struct bio *bio, struct block_device *bdev) { bio_clear_flag(bio, BIO_REMAPPED); if (bio->bi_bdev != bdev) bio_clear_flag(bio, BIO_BPS_THROTTLED); bio->bi_bdev = bdev; bio_associate_blkg(bio); } /* * BIO list management for use by remapping drivers (e.g. DM or MD) and loop. * * A bio_list anchors a singly-linked list of bios chained through the bi_next * member of the bio. The bio_list also caches the last list member to allow * fast access to the tail. 
*/ struct bio_list { struct bio *head; struct bio *tail; }; static inline int bio_list_empty(const struct bio_list *bl) { return bl->head == NULL; } static inline void bio_list_init(struct bio_list *bl) { bl->head = bl->tail = NULL; } #define BIO_EMPTY_LIST { NULL, NULL } #define bio_list_for_each(bio, bl) \ for (bio = (bl)->head; bio; bio = bio->bi_next) static inline unsigned bio_list_size(const struct bio_list *bl) { unsigned sz = 0; struct bio *bio; bio_list_for_each(bio, bl) sz++; return sz; } static inline void bio_list_add(struct bio_list *bl, struct bio *bio) { bio->bi_next = NULL; if (bl->tail) bl->tail->bi_next = bio; else bl->head = bio; bl->tail = bio; } static inline void bio_list_add_head(struct bio_list *bl, struct bio *bio) { bio->bi_next = bl->head; bl->head = bio; if (!bl->tail) bl->tail = bio; } static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2) { if (!bl2->head) return; if (bl->tail) bl->tail->bi_next = bl2->head; else bl->head = bl2->head; bl->tail = bl2->tail; } static inline void bio_list_merge_init(struct bio_list *bl, struct bio_list *bl2) { bio_list_merge(bl, bl2); bio_list_init(bl2); } static inline void bio_list_merge_head(struct bio_list *bl, struct bio_list *bl2) { if (!bl2->head) return; if (bl->head) bl2->tail->bi_next = bl->head; else bl->tail = bl2->tail; bl->head = bl2->head; } static inline struct bio *bio_list_peek(struct bio_list *bl) { return bl->head; } static inline struct bio *bio_list_pop(struct bio_list *bl) { struct bio *bio = bl->head; if (bio) { bl->head = bl->head->bi_next; if (!bl->head) bl->tail = NULL; bio->bi_next = NULL; } return bio; } static inline struct bio *bio_list_get(struct bio_list *bl) { struct bio *bio = bl->head; bl->head = bl->tail = NULL; return bio; } /* * Increment chain count for the bio. Make sure the CHAIN flag update * is visible before the raised count. */ static inline void bio_inc_remaining(struct bio *bio) { bio_set_flag(bio, BIO_CHAIN); smp_mb__before_atomic(); atomic_inc(&bio->__bi_remaining); } /* * bio_set is used to allow other portions of the IO system to * allocate their own private memory pools for bio and iovec structures. * These memory pools in turn all allocate from the bio_slab * and the bvec_slabs[]. */ #define BIO_POOL_SIZE 2 struct bio_set { struct kmem_cache *bio_slab; unsigned int front_pad; /* * per-cpu bio alloc cache */ struct bio_alloc_cache __percpu *cache; mempool_t bio_pool; mempool_t bvec_pool; unsigned int back_pad; /* * Deadlock avoidance for stacking block drivers: see comments in * bio_alloc_bioset() for details */ spinlock_t rescue_lock; struct bio_list rescue_list; struct work_struct rescue_work; struct workqueue_struct *rescue_workqueue; /* * Hot un-plug notifier for the per-cpu cache, if used */ struct hlist_node cpuhp_dead; }; static inline bool bioset_initialized(struct bio_set *bs) { return bs->bio_slab != NULL; } /* * Mark a bio as polled. Note that for async polled IO, the caller must * expect -EWOULDBLOCK if we cannot allocate a request (or other resources). * We cannot block waiting for requests on polled IO, as those completions * must be found by the caller. This is different than IRQ driven IO, where * it's safe to wait for IO to complete. */ static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb) { bio->bi_opf |= REQ_POLLED; if (kiocb->ki_flags & IOCB_NOWAIT) bio->bi_opf |= REQ_NOWAIT; } static inline void bio_clear_polled(struct bio *bio) { bio->bi_opf &= ~REQ_POLLED; } /** * bio_is_zone_append - is this a zone append bio? 
* @bio: bio to check * * Check if @bio is a zone append operation. Core block layer code and end_io * handlers must use this instead of an open coded REQ_OP_ZONE_APPEND check * because the block layer can rewrite REQ_OP_ZONE_APPEND to REQ_OP_WRITE if * it is not natively supported. */ static inline bool bio_is_zone_append(struct bio *bio) { if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) return false; return bio_op(bio) == REQ_OP_ZONE_APPEND || bio_flagged(bio, BIO_EMULATES_ZONE_APPEND); } struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, unsigned int nr_pages, blk_opf_t opf, gfp_t gfp); struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new); struct bio *blk_alloc_discard_bio(struct block_device *bdev, sector_t *sector, sector_t *nr_sects, gfp_t gfp_mask); #endif /* __LINUX_BIO_H */ |
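As a usage illustration of the allocation and submission helpers declared in this header, here is a hedged sketch of a synchronous single-page read as a driver or filesystem might issue it; kernel context is assumed, and the bdev, page, and error handling are placeholders supplied by the caller.

#include <linux/bio.h>
#include <linux/blkdev.h>

/*
 * Synchronously read one page at 'sector' from 'bdev'. The bdev and page are
 * assumed to come from the caller; error handling is abbreviated.
 */
static int read_one_page(struct block_device *bdev, struct page *page,
			 sector_t sector)
{
	struct bio *bio;
	int ret;

	/* one segment, allocated from the shared fs_bio_set */
	bio = bio_alloc(bdev, 1, REQ_OP_READ, GFP_KERNEL);
	if (!bio)
		return -ENOMEM;

	bio->bi_iter.bi_sector = sector;
	__bio_add_page(bio, page, PAGE_SIZE, 0);

	/* submit_bio_wait() returns once bio_endio() has run */
	ret = submit_bio_wait(bio);
	bio_put(bio);
	return ret;
}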
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 | /* SPDX-License-Identifier: GPL-2.0 */ /* * descriptor table internals; you almost certainly want file.h instead. */ #ifndef __LINUX_FDTABLE_H #define __LINUX_FDTABLE_H #include <linux/posix_types.h> #include <linux/compiler.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/nospec.h> #include <linux/types.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/atomic.h> /* * The default fd array needs to be at least BITS_PER_LONG, * as this is the granularity returned by copy_fdset(). */ #define NR_OPEN_DEFAULT BITS_PER_LONG struct fdtable { unsigned int max_fds; struct file __rcu **fd; /* current fd array */ unsigned long *close_on_exec; unsigned long *open_fds; unsigned long *full_fds_bits; struct rcu_head rcu; }; /* * Open file table structure */ struct files_struct { /* * read mostly part */ atomic_t count; bool resize_in_progress; wait_queue_head_t resize_wait; struct fdtable __rcu *fdt; struct fdtable fdtab; /* * written part on a separate cache line in SMP */ spinlock_t file_lock ____cacheline_aligned_in_smp; unsigned int next_fd; unsigned long close_on_exec_init[1]; unsigned long open_fds_init[1]; unsigned long full_fds_bits_init[1]; struct file __rcu * fd_array[NR_OPEN_DEFAULT]; }; struct file_operations; struct vfsmount; struct dentry; #define rcu_dereference_check_fdtable(files, fdtfd) \ rcu_dereference_check((fdtfd), lockdep_is_held(&(files)->file_lock)) #define files_fdtable(files) \ rcu_dereference_check_fdtable((files), (files)->fdt) /* * The caller must ensure that fd table isn't shared or hold rcu or file lock */ static inline struct file *files_lookup_fd_raw(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = rcu_dereference_raw(files->fdt); unsigned long mask = array_index_mask_nospec(fd, fdt->max_fds); struct file *needs_masking; /* * 'mask' is zero for an out-of-bounds fd, all ones for ok. * 'fd&mask' is 'fd' for ok, or 0 for out of bounds. * * Accessing fdt->fd[0] is ok, but needs masking of the result. */ needs_masking = rcu_dereference_raw(fdt->fd[fd&mask]); return (struct file *)(mask & (unsigned long)needs_masking); } static inline struct file *files_lookup_fd_locked(struct files_struct *files, unsigned int fd) { RCU_LOCKDEP_WARN(!lockdep_is_held(&files->file_lock), "suspicious rcu_dereference_check() usage"); return files_lookup_fd_raw(files, fd); } static inline bool close_on_exec(unsigned int fd, const struct files_struct *files) { return test_bit(fd, files_fdtable(files)->close_on_exec); } struct task_struct; void put_files_struct(struct files_struct *fs); int unshare_files(void); struct fd_range { unsigned int from, to; }; struct files_struct *dup_fd(struct files_struct *, struct fd_range *) __latent_entropy; void do_close_on_exec(struct files_struct *); int iterate_fd(struct files_struct *, unsigned, int (*)(const void *, struct file *, unsigned), const void *); extern int close_fd(unsigned int fd); extern struct file *file_close_fd(unsigned int fd); extern struct kmem_cache *files_cachep; #endif /* __LINUX_FDTABLE_H */ |
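A short sketch of how a caller might use the locked lookup helper above; the get_file()/fput() pairing is an assumption about what the caller wants to do with the result, not something this header mandates.

#include <linux/fdtable.h>
#include <linux/file.h>

/*
 * Look up 'fd' in 'files' and take a reference on the result. Holding
 * file_lock satisfies the lockdep assertion in files_lookup_fd_locked().
 */
static struct file *grab_fd(struct files_struct *files, unsigned int fd)
{
	struct file *file;

	spin_lock(&files->file_lock);
	file = files_lookup_fd_locked(files, fd);
	if (file)
		get_file(file);		/* our own reference, outlives the lock */
	spin_unlock(&files->file_lock);

	return file;			/* caller must fput() when done */
}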
| 1 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 * Phillip Lougher <phillip@squashfs.org.uk> * * id.c */ /* * This file implements code to handle uids and gids. * * For space efficiency regular files store uid and gid indexes, which are * converted to 32-bit uids/gids using an id look up table. This table is * stored compressed into metadata blocks. A second index table is used to * locate these. This second index table for speed of access (and because it * is small) is read at mount time and cached in memory. */ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs.h" /* * Map uid/gid index into real 32-bit uid/gid using the id look up table */ int squashfs_get_id(struct super_block *sb, unsigned int index, unsigned int *id) { struct squashfs_sb_info *msblk = sb->s_fs_info; int block = SQUASHFS_ID_BLOCK(index); int offset = SQUASHFS_ID_BLOCK_OFFSET(index); u64 start_block; __le32 disk_id; int err; if (index >= msblk->ids) return -EINVAL; start_block = le64_to_cpu(msblk->id_table[block]); err = squashfs_read_metadata(sb, &disk_id, &start_block, &offset, sizeof(disk_id)); if (err < 0) return err; *id = le32_to_cpu(disk_id); return 0; } /* * Read uncompressed id lookup table indexes from disk into memory */ __le64 *squashfs_read_id_index_table(struct super_block *sb, u64 id_table_start, u64 next_table, unsigned short no_ids) { unsigned int length = SQUASHFS_ID_BLOCK_BYTES(no_ids); unsigned int indexes = SQUASHFS_ID_BLOCKS(no_ids); int n; __le64 *table; u64 start, end; TRACE("In read_id_index_table, length %d\n", length); /* Sanity check values */ /* there should always be at least one id */ if (no_ids == 0) return ERR_PTR(-EINVAL); /* * The computed size of the index table (length bytes) should exactly * match the table start and end points */ if (length != (next_table - id_table_start)) return ERR_PTR(-EINVAL); table = squashfs_read_table(sb, id_table_start, length); if (IS_ERR(table)) return table; /* * table[0], table[1], ... table[indexes - 1] store the locations * of the compressed id blocks. Each entry should be less than * the next (i.e. table[0] < table[1]), and the difference between them * should be SQUASHFS_METADATA_SIZE or less. table[indexes - 1] * should be less than id_table_start, and again the difference * should be SQUASHFS_METADATA_SIZE or less */ for (n = 0; n < (indexes - 1); n++) { start = le64_to_cpu(table[n]); end = le64_to_cpu(table[n + 1]); if (start >= end || (end - start) > (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) { kfree(table); return ERR_PTR(-EINVAL); } } start = le64_to_cpu(table[indexes - 1]); if (start >= id_table_start || (id_table_start - start) > (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) { kfree(table); return ERR_PTR(-EINVAL); } return table; } |
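For orientation, a worked example of the two-level id lookup described in the header comment. This is a userspace sketch: the 8 KiB metadata-block size and 4-byte on-disk id are the usual squashfs values, assumed here for illustration; the kernel code uses the SQUASHFS_ID_* macros rather than this open-coded arithmetic.

#include <stdio.h>

#define METADATA_SIZE	8192	/* assumed SQUASHFS_METADATA_SIZE */
#define ID_SIZE		4	/* a __le32 id on disk */

int main(void)
{
	unsigned int index = 3000;	/* id index as stored in an inode */
	unsigned int block = (index * ID_SIZE) / METADATA_SIZE;
	unsigned int offset = (index * ID_SIZE) % METADATA_SIZE;

	/*
	 * block selects the entry in msblk->id_table; offset locates the id
	 * inside the (decompressed) metadata block that entry points to.
	 */
	printf("index %u -> id_table[%u], byte offset %u\n",
	       index, block, offset);
	return 0;
}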
2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 
2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 
3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 
4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 
5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 
// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/fs/namespace.c
 *
 * (C) Copyright Al
Viro 2000, 2001 * * Based on code from fs/super.c, copyright Linus Torvalds and others. * Heavily rewritten. */ #include <linux/syscalls.h> #include <linux/export.h> #include <linux/capability.h> #include <linux/mnt_namespace.h> #include <linux/user_namespace.h> #include <linux/namei.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/idr.h> #include <linux/init.h> /* init_rootfs */ #include <linux/fs_struct.h> /* get_fs_root et.al. */ #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ #include <linux/file.h> #include <linux/uaccess.h> #include <linux/proc_ns.h> #include <linux/magic.h> #include <linux/memblock.h> #include <linux/proc_fs.h> #include <linux/task_work.h> #include <linux/sched/task.h> #include <uapi/linux/mount.h> #include <linux/fs_context.h> #include <linux/shmem_fs.h> #include <linux/mnt_idmapping.h> #include <linux/pidfs.h> #include <linux/nstree.h> #include "pnode.h" #include "internal.h" /* Maximum number of mounts in a mount namespace */ static unsigned int sysctl_mount_max __read_mostly = 100000; static unsigned int m_hash_mask __ro_after_init; static unsigned int m_hash_shift __ro_after_init; static unsigned int mp_hash_mask __ro_after_init; static unsigned int mp_hash_shift __ro_after_init; static __initdata unsigned long mhash_entries; static int __init set_mhash_entries(char *str) { return kstrtoul(str, 0, &mhash_entries) == 0; } __setup("mhash_entries=", set_mhash_entries); static __initdata unsigned long mphash_entries; static int __init set_mphash_entries(char *str) { return kstrtoul(str, 0, &mphash_entries) == 0; } __setup("mphash_entries=", set_mphash_entries); static char * __initdata initramfs_options; static int __init initramfs_options_setup(char *str) { initramfs_options = str; return 1; } __setup("initramfs_options=", initramfs_options_setup); static u64 event; static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC); static DEFINE_IDA(mnt_group_ida); /* Don't allow confusion with old 32bit mount ID */ #define MNT_UNIQUE_ID_OFFSET (1ULL << 31) static u64 mnt_id_ctr = MNT_UNIQUE_ID_OFFSET; static struct hlist_head *mount_hashtable __ro_after_init; static struct hlist_head *mountpoint_hashtable __ro_after_init; static struct kmem_cache *mnt_cache __ro_after_init; static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */ static inline void namespace_lock(void); static void namespace_unlock(void); DEFINE_LOCK_GUARD_0(namespace_excl, namespace_lock(), namespace_unlock()) DEFINE_LOCK_GUARD_0(namespace_shared, down_read(&namespace_sem), up_read(&namespace_sem)) DEFINE_FREE(mntput, struct vfsmount *, if (!IS_ERR(_T)) mntput(_T)) #ifdef CONFIG_FSNOTIFY LIST_HEAD(notify_list); /* protected by namespace_sem */ #endif enum mount_kattr_flags_t { MOUNT_KATTR_RECURSE = (1 << 0), MOUNT_KATTR_IDMAP_REPLACE = (1 << 1), }; struct mount_kattr { unsigned int attr_set; unsigned int attr_clr; unsigned int propagation; unsigned int lookup_flags; enum mount_kattr_flags_t kflags; struct user_namespace *mnt_userns; struct mnt_idmap *mnt_idmap; }; /* /sys/fs */ struct kobject *fs_kobj __ro_after_init; EXPORT_SYMBOL_GPL(fs_kobj); /* * vfsmount lock may be taken for read to prevent changes to the * vfsmount hash, ie. during mountpoint lookups or walking back * up the tree. 
* * It should be taken for write in all cases where the vfsmount * tree or hash is modified or when a vfsmount structure is modified. */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); static void mnt_ns_release(struct mnt_namespace *ns) { /* keep alive for {list,stat}mount() */ if (ns && refcount_dec_and_test(&ns->passive)) { fsnotify_mntns_delete(ns); put_user_ns(ns->user_ns); kfree(ns); } } DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (!IS_ERR(_T)) mnt_ns_release(_T)) static void mnt_ns_release_rcu(struct rcu_head *rcu) { mnt_ns_release(container_of(rcu, struct mnt_namespace, ns.ns_rcu)); } static void mnt_ns_tree_remove(struct mnt_namespace *ns) { /* remove from global mount namespace list */ if (ns_tree_active(ns)) ns_tree_remove(ns); call_rcu(&ns->ns.ns_rcu, mnt_ns_release_rcu); } /* * Lookup a mount namespace by id and take a passive reference count. Taking a * passive reference means the mount namespace can be emptied if e.g., the last * task holding an active reference exits. To access the mounts of the * namespace the @namespace_sem must first be acquired. If the namespace has * already shut down before acquiring @namespace_sem, {list,stat}mount() will * see that the mount rbtree of the namespace is empty. * * Note the lookup is lockless protected by a sequence counter. We only * need to guard against false negatives as false positives aren't * possible. So if we didn't find a mount namespace and the sequence * counter has changed we need to retry. If the sequence counter is * still the same we know the search actually failed. */ static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id) { struct mnt_namespace *mnt_ns; struct ns_common *ns; guard(rcu)(); ns = ns_tree_lookup_rcu(mnt_ns_id, CLONE_NEWNS); if (!ns) return NULL; /* * The last reference count is put with RCU delay so we can * unconditonally acquire a reference here. 
*/ mnt_ns = container_of(ns, struct mnt_namespace, ns); refcount_inc(&mnt_ns->passive); return mnt_ns; } static inline void lock_mount_hash(void) { write_seqlock(&mount_lock); } static inline void unlock_mount_hash(void) { write_sequnlock(&mount_lock); } static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry) { unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); tmp += ((unsigned long)dentry / L1_CACHE_BYTES); tmp = tmp + (tmp >> m_hash_shift); return &mount_hashtable[tmp & m_hash_mask]; } static inline struct hlist_head *mp_hash(struct dentry *dentry) { unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES); tmp = tmp + (tmp >> mp_hash_shift); return &mountpoint_hashtable[tmp & mp_hash_mask]; } static int mnt_alloc_id(struct mount *mnt) { int res; xa_lock(&mnt_id_xa); res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, xa_limit_31b, GFP_KERNEL); if (!res) mnt->mnt_id_unique = ++mnt_id_ctr; xa_unlock(&mnt_id_xa); return res; } static void mnt_free_id(struct mount *mnt) { xa_erase(&mnt_id_xa, mnt->mnt_id); } /* * Allocate a new peer group ID */ static int mnt_alloc_group_id(struct mount *mnt) { int res = ida_alloc_min(&mnt_group_ida, 1, GFP_KERNEL); if (res < 0) return res; mnt->mnt_group_id = res; return 0; } /* * Release a peer group ID */ void mnt_release_group_id(struct mount *mnt) { ida_free(&mnt_group_ida, mnt->mnt_group_id); mnt->mnt_group_id = 0; } /* * vfsmount lock must be held for read */ static inline void mnt_add_count(struct mount *mnt, int n) { #ifdef CONFIG_SMP this_cpu_add(mnt->mnt_pcp->mnt_count, n); #else preempt_disable(); mnt->mnt_count += n; preempt_enable(); #endif } /* * vfsmount lock must be held for write */ int mnt_get_count(struct mount *mnt) { #ifdef CONFIG_SMP int count = 0; int cpu; for_each_possible_cpu(cpu) { count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count; } return count; #else return mnt->mnt_count; #endif } static struct mount *alloc_vfsmnt(const char *name) { struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); if (mnt) { int err; err = mnt_alloc_id(mnt); if (err) goto out_free_cache; if (name) mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL_ACCOUNT); else mnt->mnt_devname = "none"; if (!mnt->mnt_devname) goto out_free_id; #ifdef CONFIG_SMP mnt->mnt_pcp = alloc_percpu(struct mnt_pcp); if (!mnt->mnt_pcp) goto out_free_devname; this_cpu_add(mnt->mnt_pcp->mnt_count, 1); #else mnt->mnt_count = 1; mnt->mnt_writers = 0; #endif INIT_HLIST_NODE(&mnt->mnt_hash); INIT_LIST_HEAD(&mnt->mnt_child); INIT_LIST_HEAD(&mnt->mnt_mounts); INIT_LIST_HEAD(&mnt->mnt_list); INIT_LIST_HEAD(&mnt->mnt_expire); INIT_LIST_HEAD(&mnt->mnt_share); INIT_HLIST_HEAD(&mnt->mnt_slave_list); INIT_HLIST_NODE(&mnt->mnt_slave); INIT_HLIST_NODE(&mnt->mnt_mp_list); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); RB_CLEAR_NODE(&mnt->mnt_node); mnt->mnt.mnt_idmap = &nop_mnt_idmap; } return mnt; #ifdef CONFIG_SMP out_free_devname: kfree_const(mnt->mnt_devname); #endif out_free_id: mnt_free_id(mnt); out_free_cache: kmem_cache_free(mnt_cache, mnt); return NULL; } /* * Most r/o checks on a fs are for operations that take * discrete amounts of time, like a write() or unlink(). * We must keep track of when those operations start * (for permission checks) and when they end, so that * we can determine when writes are able to occur to * a filesystem. */ /* * __mnt_is_readonly: check whether a mount is read-only * @mnt: the mount to check for its write status * * This shouldn't be used directly ouside of the VFS. 
* It does not guarantee that the filesystem will stay * r/w, just that it is right *now*. This can not and * should not be used in place of IS_RDONLY(inode). * mnt_want/drop_write() will _keep_ the filesystem * r/w. */ bool __mnt_is_readonly(const struct vfsmount *mnt) { return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb); } EXPORT_SYMBOL_GPL(__mnt_is_readonly); static inline void mnt_inc_writers(struct mount *mnt) { #ifdef CONFIG_SMP this_cpu_inc(mnt->mnt_pcp->mnt_writers); #else mnt->mnt_writers++; #endif } static inline void mnt_dec_writers(struct mount *mnt) { #ifdef CONFIG_SMP this_cpu_dec(mnt->mnt_pcp->mnt_writers); #else mnt->mnt_writers--; #endif } static unsigned int mnt_get_writers(struct mount *mnt) { #ifdef CONFIG_SMP unsigned int count = 0; int cpu; for_each_possible_cpu(cpu) { count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers; } return count; #else return mnt->mnt_writers; #endif } static int mnt_is_readonly(const struct vfsmount *mnt) { if (READ_ONCE(mnt->mnt_sb->s_readonly_remount)) return 1; /* * The barrier pairs with the barrier in sb_start_ro_state_change() * making sure if we don't see s_readonly_remount set yet, we also will * not see any superblock / mount flag changes done by remount. * It also pairs with the barrier in sb_end_ro_state_change() * assuring that if we see s_readonly_remount already cleared, we will * see the values of superblock / mount flags updated by remount. */ smp_rmb(); return __mnt_is_readonly(mnt); } /* * Most r/o & frozen checks on a fs are for operations that take discrete * amounts of time, like a write() or unlink(). We must keep track of when * those operations start (for permission checks) and when they end, so that we * can determine when writes are able to occur to a filesystem. */ /** * mnt_get_write_access - get write access to a mount without freeze protection * @m: the mount on which to take a write * * This tells the low-level filesystem that a write is about to be performed to * it, and makes sure that writes are allowed (mnt it read-write) before * returning success. This operation does not protect against filesystem being * frozen. When the write operation is finished, mnt_put_write_access() must be * called. This is effectively a refcount. */ int mnt_get_write_access(struct vfsmount *m) { struct mount *mnt = real_mount(m); int ret = 0; preempt_disable(); mnt_inc_writers(mnt); /* * The store to mnt_inc_writers must be visible before we pass * WRITE_HOLD loop below, so that the slowpath can see our * incremented count after it has set WRITE_HOLD. */ smp_mb(); might_lock(&mount_lock.lock); while (__test_write_hold(READ_ONCE(mnt->mnt_pprev_for_sb))) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { cpu_relax(); } else { /* * This prevents priority inversion, if the task * setting WRITE_HOLD got preempted on a remote * CPU, and it prevents life lock if the task setting * WRITE_HOLD has a lower priority and is bound to * the same CPU as the task that is spinning here. */ preempt_enable(); read_seqlock_excl(&mount_lock); read_sequnlock_excl(&mount_lock); preempt_disable(); } } /* * The barrier pairs with the barrier sb_start_ro_state_change() making * sure that if we see WRITE_HOLD cleared, we will also see * s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in * mnt_is_readonly() and bail in case we are racing with remount * read-only. 
*/ smp_rmb(); if (mnt_is_readonly(m)) { mnt_dec_writers(mnt); ret = -EROFS; } preempt_enable(); return ret; } EXPORT_SYMBOL_GPL(mnt_get_write_access); /** * mnt_want_write - get write access to a mount * @m: the mount on which to take a write * * This tells the low-level filesystem that a write is about to be performed to * it, and makes sure that writes are allowed (mount is read-write, filesystem * is not frozen) before returning success. When the write operation is * finished, mnt_drop_write() must be called. This is effectively a refcount. */ int mnt_want_write(struct vfsmount *m) { int ret; sb_start_write(m->mnt_sb); ret = mnt_get_write_access(m); if (ret) sb_end_write(m->mnt_sb); return ret; } EXPORT_SYMBOL_GPL(mnt_want_write); /** * mnt_get_write_access_file - get write access to a file's mount * @file: the file who's mount on which to take a write * * This is like mnt_get_write_access, but if @file is already open for write it * skips incrementing mnt_writers (since the open file already has a reference) * and instead only does the check for emergency r/o remounts. This must be * paired with mnt_put_write_access_file. */ int mnt_get_write_access_file(struct file *file) { if (file->f_mode & FMODE_WRITER) { /* * Superblock may have become readonly while there are still * writable fd's, e.g. due to a fs error with errors=remount-ro */ if (__mnt_is_readonly(file->f_path.mnt)) return -EROFS; return 0; } return mnt_get_write_access(file->f_path.mnt); } /** * mnt_want_write_file - get write access to a file's mount * @file: the file who's mount on which to take a write * * This is like mnt_want_write, but if the file is already open for writing it * skips incrementing mnt_writers (since the open file already has a reference) * and instead only does the freeze protection and the check for emergency r/o * remounts. This must be paired with mnt_drop_write_file. */ int mnt_want_write_file(struct file *file) { int ret; sb_start_write(file_inode(file)->i_sb); ret = mnt_get_write_access_file(file); if (ret) sb_end_write(file_inode(file)->i_sb); return ret; } EXPORT_SYMBOL_GPL(mnt_want_write_file); /** * mnt_put_write_access - give up write access to a mount * @mnt: the mount on which to give up write access * * Tells the low-level filesystem that we are done * performing writes to it. Must be matched with * mnt_get_write_access() call above. */ void mnt_put_write_access(struct vfsmount *mnt) { preempt_disable(); mnt_dec_writers(real_mount(mnt)); preempt_enable(); } EXPORT_SYMBOL_GPL(mnt_put_write_access); /** * mnt_drop_write - give up write access to a mount * @mnt: the mount on which to give up write access * * Tells the low-level filesystem that we are done performing writes to it and * also allows filesystem to be frozen again. Must be matched with * mnt_want_write() call above. */ void mnt_drop_write(struct vfsmount *mnt) { mnt_put_write_access(mnt); sb_end_write(mnt->mnt_sb); } EXPORT_SYMBOL_GPL(mnt_drop_write); void mnt_put_write_access_file(struct file *file) { if (!(file->f_mode & FMODE_WRITER)) mnt_put_write_access(file->f_path.mnt); } void mnt_drop_write_file(struct file *file) { mnt_put_write_access_file(file); sb_end_write(file_inode(file)->i_sb); } EXPORT_SYMBOL(mnt_drop_write_file); /** * mnt_hold_writers - prevent write access to the given mount * @mnt: mnt to prevent write access to * * Prevents write access to @mnt if there are no active writers for @mnt. 
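 *
 * The writers being counted here are holders of mnt_want_write() /
 * mnt_get_write_access().  Purely as an illustration (not a caller in
 * this file; "path" stands for whatever struct path the caller is
 * operating on), such a holder brackets its modification as:
 *
 *	if (!mnt_want_write(path->mnt)) {
 *		... perform the write ...
 *		mnt_drop_write(path->mnt);
 *	}
 *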
* This function needs to be called and return successfully before changing * properties of @mnt that need to remain stable for callers with write access * to @mnt. * * After this functions has been called successfully callers must pair it with * a call to mnt_unhold_writers() in order to stop preventing write access to * @mnt. * * Context: This function expects to be in mount_locked_reader scope serializing * setting WRITE_HOLD. * Return: On success 0 is returned. * On error, -EBUSY is returned. */ static inline int mnt_hold_writers(struct mount *mnt) { set_write_hold(mnt); /* * After storing WRITE_HOLD, we'll read the counters. This store * should be visible before we do. */ smp_mb(); /* * With writers on hold, if this value is zero, then there are * definitely no active writers (although held writers may subsequently * increment the count, they'll have to wait, and decrement it after * seeing MNT_READONLY). * * It is OK to have counter incremented on one CPU and decremented on * another: the sum will add up correctly. The danger would be when we * sum up each counter, if we read a counter before it is incremented, * but then read another CPU's count which it has been subsequently * decremented from -- we would see more decrements than we should. * WRITE_HOLD protects against this scenario, because * mnt_want_write first increments count, then smp_mb, then spins on * WRITE_HOLD, so it can't be decremented by another CPU while * we're counting up here. */ if (mnt_get_writers(mnt) > 0) return -EBUSY; return 0; } /** * mnt_unhold_writers - stop preventing write access to the given mount * @mnt: mnt to stop preventing write access to * * Stop preventing write access to @mnt allowing callers to gain write access * to @mnt again. * * This function can only be called after a call to mnt_hold_writers(). * * Context: This function expects to be in the same mount_locked_reader scope * as the matching mnt_hold_writers(). */ static inline void mnt_unhold_writers(struct mount *mnt) { if (!test_write_hold(mnt)) return; /* * MNT_READONLY must become visible before ~WRITE_HOLD, so writers * that become unheld will see MNT_READONLY. */ smp_wmb(); clear_write_hold(mnt); } static inline void mnt_del_instance(struct mount *m) { struct mount **p = m->mnt_pprev_for_sb; struct mount *next = m->mnt_next_for_sb; if (next) next->mnt_pprev_for_sb = p; *p = next; } static inline void mnt_add_instance(struct mount *m, struct super_block *s) { struct mount *first = s->s_mounts; if (first) first->mnt_pprev_for_sb = &m->mnt_next_for_sb; m->mnt_next_for_sb = first; m->mnt_pprev_for_sb = &s->s_mounts; s->s_mounts = m; } static int mnt_make_readonly(struct mount *mnt) { int ret; ret = mnt_hold_writers(mnt); if (!ret) mnt->mnt.mnt_flags |= MNT_READONLY; mnt_unhold_writers(mnt); return ret; } int sb_prepare_remount_readonly(struct super_block *sb) { int err = 0; /* Racy optimization. 
Recheck the counter under WRITE_HOLD */ if (atomic_long_read(&sb->s_remove_count)) return -EBUSY; guard(mount_locked_reader)(); for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) { if (!(m->mnt.mnt_flags & MNT_READONLY)) { err = mnt_hold_writers(m); if (err) break; } } if (!err && atomic_long_read(&sb->s_remove_count)) err = -EBUSY; if (!err) sb_start_ro_state_change(sb); for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) { if (test_write_hold(m)) clear_write_hold(m); } return err; } static void free_vfsmnt(struct mount *mnt) { mnt_idmap_put(mnt_idmap(&mnt->mnt)); kfree_const(mnt->mnt_devname); #ifdef CONFIG_SMP free_percpu(mnt->mnt_pcp); #endif kmem_cache_free(mnt_cache, mnt); } static void delayed_free_vfsmnt(struct rcu_head *head) { free_vfsmnt(container_of(head, struct mount, mnt_rcu)); } /* call under rcu_read_lock */ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq) { struct mount *mnt; if (read_seqretry(&mount_lock, seq)) return 1; if (bastard == NULL) return 0; mnt = real_mount(bastard); mnt_add_count(mnt, 1); smp_mb(); // see mntput_no_expire() and do_umount() if (likely(!read_seqretry(&mount_lock, seq))) return 0; lock_mount_hash(); if (unlikely(bastard->mnt_flags & (MNT_SYNC_UMOUNT | MNT_DOOMED))) { mnt_add_count(mnt, -1); unlock_mount_hash(); return 1; } unlock_mount_hash(); /* caller will mntput() */ return -1; } /* call under rcu_read_lock */ static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) { int res = __legitimize_mnt(bastard, seq); if (likely(!res)) return true; if (unlikely(res < 0)) { rcu_read_unlock(); mntput(bastard); rcu_read_lock(); } return false; } /** * __lookup_mnt - mount hash lookup * @mnt: parent mount * @dentry: dentry of mountpoint * * If @mnt has a child mount @c mounted on @dentry find and return it. * Caller must either hold the spinlock component of @mount_lock or * hold rcu_read_lock(), sample the seqcount component before the call * and recheck it afterwards. * * Return: The child of @mnt mounted on @dentry or %NULL. */ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) { struct hlist_head *head = m_hash(mnt, dentry); struct mount *p; hlist_for_each_entry_rcu(p, head, mnt_hash) if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) return p; return NULL; } /** * lookup_mnt - Return the child mount mounted at given location * @path: location in the namespace * * Acquires and returns a new reference to mount at given location * or %NULL if nothing is mounted there. */ struct vfsmount *lookup_mnt(const struct path *path) { struct mount *child_mnt; struct vfsmount *m; unsigned seq; rcu_read_lock(); do { seq = read_seqbegin(&mount_lock); child_mnt = __lookup_mnt(path->mnt, path->dentry); m = child_mnt ? &child_mnt->mnt : NULL; } while (!legitimize_mnt(m, seq)); rcu_read_unlock(); return m; } /* * __is_local_mountpoint - Test to see if dentry is a mountpoint in the * current mount namespace. * * The common case is dentries are not mountpoints at all and that * test is handled inline. For the slow case when we are actually * dealing with a mountpoint of some kind, walk through all of the * mounts in the current mount namespace and test to see if the dentry * is a mountpoint. * * The mount_hashtable is not usable in the context because we * need to identify all mounts that may be in the current mount * namespace not just a mount that happens to have some specified * parent mount. 
*/ bool __is_local_mountpoint(const struct dentry *dentry) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct mount *mnt, *n; guard(namespace_shared)(); rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) if (mnt->mnt_mountpoint == dentry) return true; return false; } struct pinned_mountpoint { struct hlist_node node; struct mountpoint *mp; struct mount *parent; }; static bool lookup_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m) { struct hlist_head *chain = mp_hash(dentry); struct mountpoint *mp; hlist_for_each_entry(mp, chain, m_hash) { if (mp->m_dentry == dentry) { hlist_add_head(&m->node, &mp->m_list); m->mp = mp; return true; } } return false; } static int get_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m) { struct mountpoint *mp __free(kfree) = NULL; bool found; int ret; if (d_mountpoint(dentry)) { /* might be worth a WARN_ON() */ if (d_unlinked(dentry)) return -ENOENT; mountpoint: read_seqlock_excl(&mount_lock); found = lookup_mountpoint(dentry, m); read_sequnlock_excl(&mount_lock); if (found) return 0; } if (!mp) mp = kmalloc_obj(struct mountpoint); if (!mp) return -ENOMEM; /* Exactly one processes may set d_mounted */ ret = d_set_mounted(dentry); /* Someone else set d_mounted? */ if (ret == -EBUSY) goto mountpoint; /* The dentry is not available as a mountpoint? */ if (ret) return ret; /* Add the new mountpoint to the hash table */ read_seqlock_excl(&mount_lock); mp->m_dentry = dget(dentry); hlist_add_head(&mp->m_hash, mp_hash(dentry)); INIT_HLIST_HEAD(&mp->m_list); hlist_add_head(&m->node, &mp->m_list); m->mp = no_free_ptr(mp); read_sequnlock_excl(&mount_lock); return 0; } /* * vfsmount lock must be held. Additionally, the caller is responsible * for serializing calls for given disposal list. 
*/ static void maybe_free_mountpoint(struct mountpoint *mp, struct list_head *list) { if (hlist_empty(&mp->m_list)) { struct dentry *dentry = mp->m_dentry; spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_MOUNTED; spin_unlock(&dentry->d_lock); dput_to_list(dentry, list); hlist_del(&mp->m_hash); kfree(mp); } } /* * locks: mount_lock [read_seqlock_excl], namespace_sem [excl] */ static void unpin_mountpoint(struct pinned_mountpoint *m) { if (m->mp) { hlist_del(&m->node); maybe_free_mountpoint(m->mp, &ex_mountpoints); } } static inline int check_mnt(const struct mount *mnt) { return mnt->mnt_ns == current->nsproxy->mnt_ns; } static inline bool check_anonymous_mnt(struct mount *mnt) { u64 seq; if (!is_anon_ns(mnt->mnt_ns)) return false; seq = mnt->mnt_ns->seq_origin; return !seq || (seq == current->nsproxy->mnt_ns->ns.ns_id); } /* * vfsmount lock must be held for write */ static void touch_mnt_namespace(struct mnt_namespace *ns) { if (ns) { ns->event = ++event; wake_up_interruptible(&ns->poll); } } /* * vfsmount lock must be held for write */ static void __touch_mnt_namespace(struct mnt_namespace *ns) { if (ns && ns->event != event) { ns->event = event; wake_up_interruptible(&ns->poll); } } /* * locks: mount_lock[write_seqlock] */ static void __umount_mnt(struct mount *mnt, struct list_head *shrink_list) { struct mountpoint *mp; struct mount *parent = mnt->mnt_parent; if (unlikely(parent->overmount == mnt)) parent->overmount = NULL; mnt->mnt_parent = mnt; mnt->mnt_mountpoint = mnt->mnt.mnt_root; list_del_init(&mnt->mnt_child); hlist_del_init_rcu(&mnt->mnt_hash); hlist_del_init(&mnt->mnt_mp_list); mp = mnt->mnt_mp; mnt->mnt_mp = NULL; maybe_free_mountpoint(mp, shrink_list); } /* * locks: mount_lock[write_seqlock], namespace_sem[excl] (for ex_mountpoints) */ static void umount_mnt(struct mount *mnt) { __umount_mnt(mnt, &ex_mountpoints); } /* * vfsmount lock must be held for write */ void mnt_set_mountpoint(struct mount *mnt, struct mountpoint *mp, struct mount *child_mnt) { child_mnt->mnt_mountpoint = mp->m_dentry; child_mnt->mnt_parent = mnt; child_mnt->mnt_mp = mp; hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); } static void make_visible(struct mount *mnt) { struct mount *parent = mnt->mnt_parent; if (unlikely(mnt->mnt_mountpoint == parent->mnt.mnt_root)) parent->overmount = mnt; hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mnt->mnt_mountpoint)); list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); } /** * attach_mnt - mount a mount, attach to @mount_hashtable and parent's * list of child mounts * @parent: the parent * @mnt: the new mount * @mp: the new mountpoint * * Mount @mnt at @mp on @parent. Then attach @mnt * to @parent's child mount list and to @mount_hashtable. * * Note, when make_visible() is called @mnt->mnt_parent already points * to the correct parent. * * Context: This function expects namespace_lock() and lock_mount_hash() * to have been acquired in that order. */ static void attach_mnt(struct mount *mnt, struct mount *parent, struct mountpoint *mp) { mnt_set_mountpoint(parent, mp, mnt); make_visible(mnt); } void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt) { struct mountpoint *old_mp = mnt->mnt_mp; list_del_init(&mnt->mnt_child); hlist_del_init(&mnt->mnt_mp_list); hlist_del_init_rcu(&mnt->mnt_hash); attach_mnt(mnt, parent, mp); maybe_free_mountpoint(old_mp, &ex_mountpoints); } static inline struct mount *node_to_mount(struct rb_node *node) { return node ? 
rb_entry(node, struct mount, mnt_node) : NULL; } static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) { struct rb_node **link = &ns->mounts.rb_node; struct rb_node *parent = NULL; bool mnt_first_node = true, mnt_last_node = true; WARN_ON(mnt_ns_attached(mnt)); mnt->mnt_ns = ns; while (*link) { parent = *link; if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) { link = &parent->rb_left; mnt_last_node = false; } else { link = &parent->rb_right; mnt_first_node = false; } } if (mnt_last_node) ns->mnt_last_node = &mnt->mnt_node; if (mnt_first_node) ns->mnt_first_node = &mnt->mnt_node; rb_link_node(&mnt->mnt_node, parent, link); rb_insert_color(&mnt->mnt_node, &ns->mounts); mnt_notify_add(mnt); } static struct mount *next_mnt(struct mount *p, struct mount *root) { struct list_head *next = p->mnt_mounts.next; if (next == &p->mnt_mounts) { while (1) { if (p == root) return NULL; next = p->mnt_child.next; if (next != &p->mnt_parent->mnt_mounts) break; p = p->mnt_parent; } } return list_entry(next, struct mount, mnt_child); } static struct mount *skip_mnt_tree(struct mount *p) { struct list_head *prev = p->mnt_mounts.prev; while (prev != &p->mnt_mounts) { p = list_entry(prev, struct mount, mnt_child); prev = p->mnt_mounts.prev; } return p; } /* * vfsmount lock must be held for write */ static void commit_tree(struct mount *mnt) { struct mnt_namespace *n = mnt->mnt_parent->mnt_ns; if (!mnt_ns_attached(mnt)) { for (struct mount *m = mnt; m; m = next_mnt(m, mnt)) mnt_add_to_ns(n, m); n->nr_mounts += n->pending_mounts; n->pending_mounts = 0; } make_visible(mnt); touch_mnt_namespace(n); } static void setup_mnt(struct mount *m, struct dentry *root) { struct super_block *s = root->d_sb; atomic_inc(&s->s_active); m->mnt.mnt_sb = s; m->mnt.mnt_root = dget(root); m->mnt_mountpoint = m->mnt.mnt_root; m->mnt_parent = m; guard(mount_locked_reader)(); mnt_add_instance(m, s); } /** * vfs_create_mount - Create a mount for a configured superblock * @fc: The configuration context with the superblock attached * * Create a mount to an already configured superblock. If necessary, the * caller should invoke vfs_get_tree() before calling this. * * Note that this does not attach the mount to anything. 
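 *
 * For illustration, the usual sequence a caller drives is roughly what
 * fc_mount() and vfs_kern_mount() below do; a simplified sketch, with
 * option parsing and the s_umount unlock omitted:
 *
 *	fc = fs_context_for_mount(type, flags);
 *	err = vfs_get_tree(fc);
 *	if (!err)
 *		mnt = vfs_create_mount(fc);
 *	put_fs_context(fc);
 *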
*/ struct vfsmount *vfs_create_mount(struct fs_context *fc) { struct mount *mnt; if (!fc->root) return ERR_PTR(-EINVAL); mnt = alloc_vfsmnt(fc->source); if (!mnt) return ERR_PTR(-ENOMEM); if (fc->sb_flags & SB_KERNMOUNT) mnt->mnt.mnt_flags = MNT_INTERNAL; setup_mnt(mnt, fc->root); return &mnt->mnt; } EXPORT_SYMBOL(vfs_create_mount); struct vfsmount *fc_mount(struct fs_context *fc) { int err = vfs_get_tree(fc); if (!err) { up_write(&fc->root->d_sb->s_umount); return vfs_create_mount(fc); } return ERR_PTR(err); } EXPORT_SYMBOL(fc_mount); struct vfsmount *fc_mount_longterm(struct fs_context *fc) { struct vfsmount *mnt = fc_mount(fc); if (!IS_ERR(mnt)) real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL; return mnt; } EXPORT_SYMBOL(fc_mount_longterm); struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) { struct fs_context *fc; struct vfsmount *mnt; int ret = 0; if (!type) return ERR_PTR(-EINVAL); fc = fs_context_for_mount(type, flags); if (IS_ERR(fc)) return ERR_CAST(fc); if (name) ret = vfs_parse_fs_string(fc, "source", name); if (!ret) ret = parse_monolithic_mount_data(fc, data); if (!ret) mnt = fc_mount(fc); else mnt = ERR_PTR(ret); put_fs_context(fc); return mnt; } EXPORT_SYMBOL_GPL(vfs_kern_mount); static struct mount *clone_mnt(struct mount *old, struct dentry *root, int flag) { struct mount *mnt; int err; mnt = alloc_vfsmnt(old->mnt_devname); if (!mnt) return ERR_PTR(-ENOMEM); mnt->mnt.mnt_flags = READ_ONCE(old->mnt.mnt_flags) & ~MNT_INTERNAL_FLAGS; if (flag & (CL_SLAVE | CL_PRIVATE)) mnt->mnt_group_id = 0; /* not a peer of original */ else mnt->mnt_group_id = old->mnt_group_id; if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { err = mnt_alloc_group_id(mnt); if (err) goto out_free; } if (mnt->mnt_group_id) set_mnt_shared(mnt); mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); setup_mnt(mnt, root); if (flag & CL_PRIVATE) // we are done with it return mnt; if (peers(mnt, old)) list_add(&mnt->mnt_share, &old->mnt_share); if ((flag & CL_SLAVE) && old->mnt_group_id) { hlist_add_head(&mnt->mnt_slave, &old->mnt_slave_list); mnt->mnt_master = old; } else if (IS_MNT_SLAVE(old)) { hlist_add_behind(&mnt->mnt_slave, &old->mnt_slave); mnt->mnt_master = old->mnt_master; } return mnt; out_free: mnt_free_id(mnt); free_vfsmnt(mnt); return ERR_PTR(err); } static void cleanup_mnt(struct mount *mnt) { struct hlist_node *p; struct mount *m; /* * The warning here probably indicates that somebody messed * up a mnt_want/drop_write() pair. If this happens, the * filesystem was probably unable to make r/w->r/o transitions. * The locking used to deal with mnt_count decrement provides barriers, * so mnt_get_writers() below is safe. 
*/ WARN_ON(mnt_get_writers(mnt)); if (unlikely(mnt->mnt_pins.first)) mnt_pin_kill(mnt); hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) { hlist_del(&m->mnt_umount); mntput(&m->mnt); } fsnotify_vfsmount_delete(&mnt->mnt); dput(mnt->mnt.mnt_root); deactivate_super(mnt->mnt.mnt_sb); mnt_free_id(mnt); call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); } static void __cleanup_mnt(struct rcu_head *head) { cleanup_mnt(container_of(head, struct mount, mnt_rcu)); } static LLIST_HEAD(delayed_mntput_list); static void delayed_mntput(struct work_struct *unused) { struct llist_node *node = llist_del_all(&delayed_mntput_list); struct mount *m, *t; llist_for_each_entry_safe(m, t, node, mnt_llist) cleanup_mnt(m); } static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); static void noinline mntput_no_expire_slowpath(struct mount *mnt) { LIST_HEAD(list); int count; VFS_BUG_ON(mnt->mnt_ns); lock_mount_hash(); /* * make sure that if __legitimize_mnt() has not seen us grab * mount_lock, we'll see their refcount increment here. */ smp_mb(); mnt_add_count(mnt, -1); count = mnt_get_count(mnt); if (count != 0) { WARN_ON(count < 0); rcu_read_unlock(); unlock_mount_hash(); return; } if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { rcu_read_unlock(); unlock_mount_hash(); return; } mnt->mnt.mnt_flags |= MNT_DOOMED; rcu_read_unlock(); mnt_del_instance(mnt); if (unlikely(!list_empty(&mnt->mnt_expire))) list_del(&mnt->mnt_expire); if (unlikely(!list_empty(&mnt->mnt_mounts))) { struct mount *p, *tmp; list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) { __umount_mnt(p, &list); hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children); } } unlock_mount_hash(); shrink_dentry_list(&list); if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) { struct task_struct *task = current; if (likely(!(task->flags & PF_KTHREAD))) { init_task_work(&mnt->mnt_rcu, __cleanup_mnt); if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME)) return; } if (llist_add(&mnt->mnt_llist, &delayed_mntput_list)) schedule_delayed_work(&delayed_mntput_work, 1); return; } cleanup_mnt(mnt); } static void mntput_no_expire(struct mount *mnt) { rcu_read_lock(); if (likely(READ_ONCE(mnt->mnt_ns))) { /* * Since we don't do lock_mount_hash() here, * ->mnt_ns can change under us. However, if it's * non-NULL, then there's a reference that won't * be dropped until after an RCU delay done after * turning ->mnt_ns NULL. So if we observe it * non-NULL under rcu_read_lock(), the reference * we are dropping is not the final one. */ mnt_add_count(mnt, -1); rcu_read_unlock(); return; } mntput_no_expire_slowpath(mnt); } void mntput(struct vfsmount *mnt) { if (mnt) { struct mount *m = real_mount(mnt); /* avoid cacheline pingpong */ if (unlikely(m->mnt_expiry_mark)) WRITE_ONCE(m->mnt_expiry_mark, 0); mntput_no_expire(m); } } EXPORT_SYMBOL(mntput); struct vfsmount *mntget(struct vfsmount *mnt) { if (mnt) mnt_add_count(real_mount(mnt), 1); return mnt; } EXPORT_SYMBOL(mntget); /* * Make a mount point inaccessible to new lookups. * Because there may still be current users, the caller MUST WAIT * for an RCU grace period before destroying the mount point. */ void mnt_make_shortterm(struct vfsmount *mnt) { if (mnt) real_mount(mnt)->mnt_ns = NULL; } /** * path_is_mountpoint() - Check if path is a mount in the current namespace. * @path: path to check * * d_mountpoint() can only be used reliably to establish if a dentry is * not mounted in any namespace and that common case is handled inline. 
* d_mountpoint() isn't aware of the possibility there may be multiple * mounts using a given dentry in a different namespace. This function * checks if the passed in path is a mountpoint rather than the dentry * alone. */ bool path_is_mountpoint(const struct path *path) { unsigned seq; bool res; if (!d_mountpoint(path->dentry)) return false; rcu_read_lock(); do { seq = read_seqbegin(&mount_lock); res = __path_is_mountpoint(path); } while (read_seqretry(&mount_lock, seq)); rcu_read_unlock(); return res; } EXPORT_SYMBOL(path_is_mountpoint); struct vfsmount *mnt_clone_internal(const struct path *path) { struct mount *p; p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE); if (IS_ERR(p)) return ERR_CAST(p); p->mnt.mnt_flags |= MNT_INTERNAL; return &p->mnt; } /* * Returns the mount which either has the specified mnt_id, or has the next * smallest id afer the specified one. */ static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id) { struct rb_node *node = ns->mounts.rb_node; struct mount *ret = NULL; while (node) { struct mount *m = node_to_mount(node); if (mnt_id <= m->mnt_id_unique) { ret = node_to_mount(node); if (mnt_id == m->mnt_id_unique) break; node = node->rb_left; } else { node = node->rb_right; } } return ret; } /* * Returns the mount which either has the specified mnt_id, or has the next * greater id before the specified one. */ static struct mount *mnt_find_id_at_reverse(struct mnt_namespace *ns, u64 mnt_id) { struct rb_node *node = ns->mounts.rb_node; struct mount *ret = NULL; while (node) { struct mount *m = node_to_mount(node); if (mnt_id >= m->mnt_id_unique) { ret = node_to_mount(node); if (mnt_id == m->mnt_id_unique) break; node = node->rb_right; } else { node = node->rb_left; } } return ret; } #ifdef CONFIG_PROC_FS /* iterator; we want it to have access to namespace_sem, thus here... */ static void *m_start(struct seq_file *m, loff_t *pos) { struct proc_mounts *p = m->private; struct mount *mnt; down_read(&namespace_sem); mnt = mnt_find_id_at(p->ns, *pos); if (mnt) *pos = mnt->mnt_id_unique; return mnt; } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { struct mount *mnt = v; struct rb_node *node = rb_next(&mnt->mnt_node); if (node) { struct mount *next = node_to_mount(node); *pos = next->mnt_id_unique; return next; } /* * No more mounts. Set pos past current mount's ID so that if * iteration restarts, mnt_find_id_at() returns NULL. */ *pos = mnt->mnt_id_unique + 1; return NULL; } static void m_stop(struct seq_file *m, void *v) { up_read(&namespace_sem); } static int m_show(struct seq_file *m, void *v) { struct proc_mounts *p = m->private; struct mount *r = v; return p->show(m, &r->mnt); } const struct seq_operations mounts_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = m_show, }; #endif /* CONFIG_PROC_FS */ /** * may_umount_tree - check if a mount tree is busy * @m: root of mount tree * * This is called to check if a tree of mounts has any * open files, pwds, chroots or sub mounts that are * busy. */ int may_umount_tree(struct vfsmount *m) { struct mount *mnt = real_mount(m); bool busy = false; /* write lock needed for mnt_get_count */ lock_mount_hash(); for (struct mount *p = mnt; p; p = next_mnt(p, mnt)) { if (mnt_get_count(p) > (p == mnt ? 2 : 1)) { busy = true; break; } } unlock_mount_hash(); return !busy; } EXPORT_SYMBOL(may_umount_tree); /** * may_umount - check if a mount point is busy * @mnt: root of mount * * This is called to check if a mount point has any * open files, pwds, chroots or sub mounts. 
If the * mount has sub mounts this will return busy * regardless of whether the sub mounts are busy. * * Doesn't take quota and stuff into account. IOW, in some cases it will * give false negatives. The main reason why it's here is that we need * a non-destructive way to look for easily umountable filesystems. */ int may_umount(struct vfsmount *mnt) { int ret = 1; down_read(&namespace_sem); lock_mount_hash(); if (propagate_mount_busy(real_mount(mnt), 2)) ret = 0; unlock_mount_hash(); up_read(&namespace_sem); return ret; } EXPORT_SYMBOL(may_umount); #ifdef CONFIG_FSNOTIFY static void mnt_notify(struct mount *p) { if (!p->prev_ns && p->mnt_ns) { fsnotify_mnt_attach(p->mnt_ns, &p->mnt); } else if (p->prev_ns && !p->mnt_ns) { fsnotify_mnt_detach(p->prev_ns, &p->mnt); } else if (p->prev_ns == p->mnt_ns) { fsnotify_mnt_move(p->mnt_ns, &p->mnt); } else { fsnotify_mnt_detach(p->prev_ns, &p->mnt); fsnotify_mnt_attach(p->mnt_ns, &p->mnt); } p->prev_ns = p->mnt_ns; } static void notify_mnt_list(void) { struct mount *m, *tmp; /* * Notify about mounts that were added/reparented/detached/remain * connected after unmount. */ list_for_each_entry_safe(m, tmp, ¬ify_list, to_notify) { mnt_notify(m); list_del_init(&m->to_notify); } } static bool need_notify_mnt_list(void) { return !list_empty(¬ify_list); } #else static void notify_mnt_list(void) { } static bool need_notify_mnt_list(void) { return false; } #endif static void free_mnt_ns(struct mnt_namespace *); static void namespace_unlock(void) { struct hlist_head head; struct hlist_node *p; struct mount *m; struct mnt_namespace *ns = emptied_ns; LIST_HEAD(list); hlist_move_list(&unmounted, &head); list_splice_init(&ex_mountpoints, &list); emptied_ns = NULL; if (need_notify_mnt_list()) { /* * No point blocking out concurrent readers while notifications * are sent. This will also allow statmount()/listmount() to run * concurrently. */ downgrade_write(&namespace_sem); notify_mnt_list(); up_read(&namespace_sem); } else { up_write(&namespace_sem); } if (unlikely(ns)) { /* Make sure we notice when we leak mounts. */ VFS_WARN_ON_ONCE(!mnt_ns_empty(ns)); free_mnt_ns(ns); } shrink_dentry_list(&list); if (likely(hlist_empty(&head))) return; synchronize_rcu_expedited(); hlist_for_each_entry_safe(m, p, &head, mnt_umount) { hlist_del(&m->mnt_umount); mntput(&m->mnt); } } static inline void namespace_lock(void) { down_write(&namespace_sem); } enum umount_tree_flags { UMOUNT_SYNC = 1, UMOUNT_PROPAGATE = 2, UMOUNT_CONNECTED = 4, }; static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how) { /* Leaving mounts connected is only valid for lazy umounts */ if (how & UMOUNT_SYNC) return true; /* A mount without a parent has nothing to be connected to */ if (!mnt_has_parent(mnt)) return true; /* Because the reference counting rules change when mounts are * unmounted and connected, umounted mounts may not be * connected to mounted mounts. */ if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT)) return true; /* Has it been requested that the mount remain connected? */ if (how & UMOUNT_CONNECTED) return false; /* Is the mount locked such that it needs to remain connected? 
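 * Locked mounts (see lock_mnt_tree() further down) exist to keep an
 * unprivileged mount namespace from revealing what is underneath them,
 * so they have to stay attached to their parent when it is unmounted.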
*/ if (IS_MNT_LOCKED(mnt)) return false; /* By default disconnect the mount */ return true; } /* * mount_lock must be held * namespace_sem must be held for write */ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) { LIST_HEAD(tmp_list); struct mount *p; if (how & UMOUNT_PROPAGATE) propagate_mount_unlock(mnt); /* Gather the mounts to umount */ for (p = mnt; p; p = next_mnt(p, mnt)) { p->mnt.mnt_flags |= MNT_UMOUNT; if (mnt_ns_attached(p)) move_from_ns(p); list_add_tail(&p->mnt_list, &tmp_list); } /* Hide the mounts from mnt_mounts */ list_for_each_entry(p, &tmp_list, mnt_list) { list_del_init(&p->mnt_child); } /* Add propagated mounts to the tmp_list */ if (how & UMOUNT_PROPAGATE) propagate_umount(&tmp_list); bulk_make_private(&tmp_list); while (!list_empty(&tmp_list)) { struct mnt_namespace *ns; bool disconnect; p = list_first_entry(&tmp_list, struct mount, mnt_list); list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); ns = p->mnt_ns; if (ns) { ns->nr_mounts--; __touch_mnt_namespace(ns); } p->mnt_ns = NULL; if (how & UMOUNT_SYNC) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; disconnect = disconnect_mount(p, how); if (mnt_has_parent(p)) { if (!disconnect) { /* Don't forget about p */ list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts); } else { umount_mnt(p); } } if (disconnect) hlist_add_head(&p->mnt_umount, &unmounted); /* * At this point p->mnt_ns is NULL, notification will be queued * only if * * - p->prev_ns is non-NULL *and* * - p->prev_ns->n_fsnotify_marks is non-NULL * * This will preclude queuing the mount if this is a cleanup * after a failed copy_tree() or destruction of an anonymous * namespace, etc. */ mnt_notify_add(p); } } static void shrink_submounts(struct mount *mnt); static int do_umount_root(struct super_block *sb) { int ret = 0; down_write(&sb->s_umount); if (!sb_rdonly(sb)) { struct fs_context *fc; fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY, SB_RDONLY); if (IS_ERR(fc)) { ret = PTR_ERR(fc); } else { ret = parse_monolithic_mount_data(fc, NULL); if (!ret) ret = reconfigure_super(fc); put_fs_context(fc); } } up_write(&sb->s_umount); return ret; } static int do_umount(struct mount *mnt, int flags) { struct super_block *sb = mnt->mnt.mnt_sb; int retval; retval = security_sb_umount(&mnt->mnt, flags); if (retval) return retval; /* * Allow userspace to request a mountpoint be expired rather than * unmounting unconditionally. Unmount only happens if: * (1) the mark is already set (the mark is cleared by mntput()) * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount] */ if (flags & MNT_EXPIRE) { if (&mnt->mnt == current->fs->root.mnt || flags & (MNT_FORCE | MNT_DETACH)) return -EINVAL; /* * probably don't strictly need the lock here if we examined * all race cases, but it's a slowpath. */ lock_mount_hash(); if (!list_empty(&mnt->mnt_mounts) || mnt_get_count(mnt) != 2) { unlock_mount_hash(); return -EBUSY; } unlock_mount_hash(); if (!xchg(&mnt->mnt_expiry_mark, 1)) return -EAGAIN; } /* * If we may have to abort operations to get out of this * mount, and they will themselves hold resources we must * allow the fs to do things. In the Unix tradition of * 'Gee thats tricky lets do it in userspace' the umount_begin * might fail to complete on the first run through as other tasks * must return, and the like. Thats for the mount program to worry * about for the moment. */ if (flags & MNT_FORCE && sb->s_op->umount_begin) { sb->s_op->umount_begin(sb); } /* * No sense to grab the lock for this test, but test itself looks * somewhat bogus. 
Suggestions for better replacement? * Ho-hum... In principle, we might treat that as umount + switch * to rootfs. GC would eventually take care of the old vfsmount. * Actually it makes sense, especially if rootfs would contain a * /reboot - static binary that would close all descriptors and * call reboot(9). Then init(8) could umount root and exec /reboot. */ if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) { /* * Special case for "unmounting" root ... * we just try to remount it readonly. */ if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; return do_umount_root(sb); } namespace_lock(); lock_mount_hash(); /* Repeat the earlier racy checks, now that we are holding the locks */ retval = -EINVAL; if (!check_mnt(mnt)) goto out; if (mnt->mnt.mnt_flags & MNT_LOCKED) goto out; if (!mnt_has_parent(mnt)) /* not the absolute root */ goto out; event++; if (flags & MNT_DETACH) { umount_tree(mnt, UMOUNT_PROPAGATE); retval = 0; } else { smp_mb(); // paired with __legitimize_mnt() shrink_submounts(mnt); retval = -EBUSY; if (!propagate_mount_busy(mnt, 2)) { umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); retval = 0; } } out: unlock_mount_hash(); namespace_unlock(); return retval; } /* * __detach_mounts - lazily unmount all mounts on the specified dentry * * During unlink, rmdir, and d_drop it is possible to loose the path * to an existing mountpoint, and wind up leaking the mount. * detach_mounts allows lazily unmounting those mounts instead of * leaking them. * * The caller may hold dentry->d_inode->i_rwsem. */ void __detach_mounts(struct dentry *dentry) { struct pinned_mountpoint mp = {}; struct mount *mnt; guard(namespace_excl)(); guard(mount_writer)(); if (!lookup_mountpoint(dentry, &mp)) return; event++; while (mp.node.next) { mnt = hlist_entry(mp.node.next, struct mount, mnt_mp_list); if (mnt->mnt.mnt_flags & MNT_UMOUNT) { umount_mnt(mnt); hlist_add_head(&mnt->mnt_umount, &unmounted); } else umount_tree(mnt, UMOUNT_CONNECTED); } unpin_mountpoint(&mp); } /* * Is the caller allowed to modify his namespace? */ bool may_mount(void) { return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); } static void warn_mandlock(void) { pr_warn_once("=======================================================\n" "WARNING: The mand mount option has been deprecated and\n" " and is ignored by this kernel. 
Remove the mand\n" " option from the mount to silence this warning.\n" "=======================================================\n"); } static int can_umount(const struct path *path, int flags) { struct mount *mnt = real_mount(path->mnt); struct super_block *sb = path->dentry->d_sb; if (!may_mount()) return -EPERM; if (!path_mounted(path)) return -EINVAL; if (!check_mnt(mnt)) return -EINVAL; if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */ return -EINVAL; if (flags & MNT_FORCE && !ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; return 0; } // caller is responsible for flags being sane int path_umount(const struct path *path, int flags) { struct mount *mnt = real_mount(path->mnt); int ret; ret = can_umount(path, flags); if (!ret) ret = do_umount(mnt, flags); /* we mustn't call path_put() as that would clear mnt_expiry_mark */ dput(path->dentry); mntput_no_expire(mnt); return ret; } static int ksys_umount(char __user *name, int flags) { int lookup_flags = LOOKUP_MOUNTPOINT; struct path path; int ret; // basic validity checks done first if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) return -EINVAL; if (!(flags & UMOUNT_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; ret = user_path_at(AT_FDCWD, name, lookup_flags, &path); if (ret) return ret; return path_umount(&path, flags); } SYSCALL_DEFINE2(umount, char __user *, name, int, flags) { return ksys_umount(name, flags); } #ifdef __ARCH_WANT_SYS_OLDUMOUNT /* * The 2.0 compatible umount. No flags. */ SYSCALL_DEFINE1(oldumount, char __user *, name) { return ksys_umount(name, 0); } #endif static bool is_mnt_ns_file(struct dentry *dentry) { struct ns_common *ns; /* Is this a proxy for a mount namespace? */ if (dentry->d_op != &ns_dentry_operations) return false; ns = d_inode(dentry)->i_private; return ns->ops == &mntns_operations; } struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) { return &mnt->ns; } struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous) { struct ns_common *ns; guard(rcu)(); for (;;) { ns = ns_tree_adjoined_rcu(mntns, previous); if (IS_ERR(ns)) return ERR_CAST(ns); mntns = to_mnt_ns(ns); /* * The last passive reference count is put with RCU * delay so accessing the mount namespace is not just * safe but all relevant members are still valid. */ if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN)) continue; /* * We need an active reference count as we're persisting * the mount namespace and it might already be on its * deathbed. */ if (!ns_ref_get(mntns)) continue; return mntns; } } struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry) { if (!is_mnt_ns_file(dentry)) return NULL; return to_mnt_ns(get_proc_ns(dentry->d_inode)); } static bool mnt_ns_loop(struct dentry *dentry) { /* Could bind mounting the mount namespace inode cause a * mount namespace loop? 
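 * A loop is only possible if the nsfs file refers to this namespace or
 * to an older one; the id comparison below therefore only permits bind
 * mounting references to strictly newer mount namespaces.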
*/ struct mnt_namespace *mnt_ns = mnt_ns_from_dentry(dentry); if (!mnt_ns) return false; return current->nsproxy->mnt_ns->ns.ns_id >= mnt_ns->ns.ns_id; } struct mount *copy_tree(struct mount *src_root, struct dentry *dentry, int flag) { struct mount *res, *src_parent, *src_root_child, *src_mnt, *dst_parent, *dst_mnt; if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(src_root)) return ERR_PTR(-EINVAL); if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry)) return ERR_PTR(-EINVAL); res = dst_mnt = clone_mnt(src_root, dentry, flag); if (IS_ERR(dst_mnt)) return dst_mnt; src_parent = src_root; list_for_each_entry(src_root_child, &src_root->mnt_mounts, mnt_child) { if (!is_subdir(src_root_child->mnt_mountpoint, dentry)) continue; for (src_mnt = src_root_child; src_mnt; src_mnt = next_mnt(src_mnt, src_root_child)) { if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(src_mnt)) { if (src_mnt->mnt.mnt_flags & MNT_LOCKED) { /* Both unbindable and locked. */ dst_mnt = ERR_PTR(-EPERM); goto out; } else { src_mnt = skip_mnt_tree(src_mnt); continue; } } if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(src_mnt->mnt.mnt_root)) { src_mnt = skip_mnt_tree(src_mnt); continue; } while (src_parent != src_mnt->mnt_parent) { src_parent = src_parent->mnt_parent; dst_mnt = dst_mnt->mnt_parent; } src_parent = src_mnt; dst_parent = dst_mnt; dst_mnt = clone_mnt(src_mnt, src_mnt->mnt.mnt_root, flag); if (IS_ERR(dst_mnt)) goto out; lock_mount_hash(); if (src_mnt->mnt.mnt_flags & MNT_LOCKED) dst_mnt->mnt.mnt_flags |= MNT_LOCKED; if (unlikely(flag & CL_EXPIRE)) { /* stick the duplicate mount on the same expiry * list as the original if that was on one */ if (!list_empty(&src_mnt->mnt_expire)) list_add(&dst_mnt->mnt_expire, &src_mnt->mnt_expire); } attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp); unlock_mount_hash(); } } return res; out: if (res) { lock_mount_hash(); umount_tree(res, UMOUNT_SYNC); unlock_mount_hash(); } return dst_mnt; } static inline bool extend_array(struct path **res, struct path **to_free, unsigned n, unsigned *count, unsigned new_count) { struct path *p; if (likely(n < *count)) return true; p = kmalloc_objs(struct path, new_count); if (p && *count) memcpy(p, *res, *count * sizeof(struct path)); *count = new_count; kfree(*to_free); *to_free = *res = p; return p; } const struct path *collect_paths(const struct path *path, struct path *prealloc, unsigned count) { struct mount *root = real_mount(path->mnt); struct mount *child; struct path *res = prealloc, *to_free = NULL; unsigned n = 0; guard(namespace_shared)(); if (!check_mnt(root)) return ERR_PTR(-EINVAL); if (!extend_array(&res, &to_free, 0, &count, 32)) return ERR_PTR(-ENOMEM); res[n++] = *path; list_for_each_entry(child, &root->mnt_mounts, mnt_child) { if (!is_subdir(child->mnt_mountpoint, path->dentry)) continue; for (struct mount *m = child; m; m = next_mnt(m, child)) { if (!extend_array(&res, &to_free, n, &count, 2 * count)) return ERR_PTR(-ENOMEM); res[n].mnt = &m->mnt; res[n].dentry = m->mnt.mnt_root; n++; } } if (!extend_array(&res, &to_free, n, &count, count + 1)) return ERR_PTR(-ENOMEM); memset(res + n, 0, (count - n) * sizeof(struct path)); for (struct path *p = res; p->mnt; p++) path_get(p); return res; } void drop_collected_paths(const struct path *paths, const struct path *prealloc) { for (const struct path *p = paths; p->mnt; p++) path_put(p); if (paths != prealloc) kfree(paths); } static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool); void dissolve_on_fput(struct vfsmount *mnt) { struct mount *m 
= real_mount(mnt); /* * m used to be the root of anon namespace; if it still is one, * we need to dissolve the mount tree and free that namespace. * Let's try to avoid taking namespace_sem if we can determine * that there's nothing to do without it - rcu_read_lock() is * enough to make anon_ns_root() memory-safe and once m has * left its namespace, it's no longer our concern, since it will * never become a root of anon ns again. */ scoped_guard(rcu) { if (!anon_ns_root(m)) return; } scoped_guard(namespace_excl) { if (!anon_ns_root(m)) return; emptied_ns = m->mnt_ns; lock_mount_hash(); umount_tree(m, UMOUNT_CONNECTED); unlock_mount_hash(); } } /* locks: namespace_shared && pinned(mnt) || mount_locked_reader */ static bool __has_locked_children(struct mount *mnt, struct dentry *dentry) { struct mount *child; list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { if (!is_subdir(child->mnt_mountpoint, dentry)) continue; if (child->mnt.mnt_flags & MNT_LOCKED) return true; } return false; } bool has_locked_children(struct mount *mnt, struct dentry *dentry) { guard(mount_locked_reader)(); return __has_locked_children(mnt, dentry); } /* * Check that there aren't references to earlier/same mount namespaces in the * specified subtree. Such references can act as pins for mount namespaces * that aren't checked by the mount-cycle checking code, thereby allowing * cycles to be made. * * locks: mount_locked_reader || namespace_shared && pinned(subtree) */ static bool check_for_nsfs_mounts(struct mount *subtree) { for (struct mount *p = subtree; p; p = next_mnt(p, subtree)) if (mnt_ns_loop(p->mnt.mnt_root)) return false; return true; } /** * clone_private_mount - create a private clone of a path * @path: path to clone * * This creates a new vfsmount, which will be the clone of @path. The new mount * will not be attached anywhere in the namespace and will be private (i.e. * changes to the originating mount won't be propagated into this). * * This assumes caller has called or done the equivalent of may_mount(). * * Release with mntput(). */ struct vfsmount *clone_private_mount(const struct path *path) { struct mount *old_mnt = real_mount(path->mnt); struct mount *new_mnt; guard(namespace_shared)(); if (IS_MNT_UNBINDABLE(old_mnt)) return ERR_PTR(-EINVAL); /* * Make sure the source mount is acceptable. * Anything mounted in our mount namespace is allowed. * Otherwise, it must be the root of an anonymous mount * namespace, and we need to make sure no namespace * loops get created. 
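 * (check_for_nsfs_mounts() additionally rejects trees containing nsfs
 * mounts that pin this or an earlier mount namespace.)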
*/ if (!check_mnt(old_mnt)) { if (!anon_ns_root(old_mnt)) return ERR_PTR(-EINVAL); if (!check_for_nsfs_mounts(old_mnt)) return ERR_PTR(-EINVAL); } if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); if (__has_locked_children(old_mnt, path->dentry)) return ERR_PTR(-EINVAL); new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); if (IS_ERR(new_mnt)) return ERR_PTR(-EINVAL); /* Longterm mount to be removed by kern_unmount*() */ new_mnt->mnt_ns = MNT_NS_INTERNAL; return &new_mnt->mnt; } EXPORT_SYMBOL_GPL(clone_private_mount); static void lock_mnt_tree(struct mount *mnt) { struct mount *p; for (p = mnt; p; p = next_mnt(p, mnt)) { int flags = p->mnt.mnt_flags; /* Don't allow unprivileged users to change mount flags */ flags |= MNT_LOCK_ATIME; if (flags & MNT_READONLY) flags |= MNT_LOCK_READONLY; if (flags & MNT_NODEV) flags |= MNT_LOCK_NODEV; if (flags & MNT_NOSUID) flags |= MNT_LOCK_NOSUID; if (flags & MNT_NOEXEC) flags |= MNT_LOCK_NOEXEC; /* Don't allow unprivileged users to reveal what is under a mount */ if (list_empty(&p->mnt_expire) && p != mnt) flags |= MNT_LOCKED; p->mnt.mnt_flags = flags; } } static void cleanup_group_ids(struct mount *mnt, struct mount *end) { struct mount *p; for (p = mnt; p != end; p = next_mnt(p, mnt)) { if (p->mnt_group_id && !IS_MNT_SHARED(p)) mnt_release_group_id(p); } } static int invent_group_ids(struct mount *mnt, bool recurse) { struct mount *p; for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { if (!p->mnt_group_id) { int err = mnt_alloc_group_id(p); if (err) { cleanup_group_ids(mnt, p); return err; } } } return 0; } int count_mounts(struct mnt_namespace *ns, struct mount *mnt) { unsigned int max = READ_ONCE(sysctl_mount_max); unsigned int mounts = 0; struct mount *p; if (ns->nr_mounts >= max) return -ENOSPC; max -= ns->nr_mounts; if (ns->pending_mounts >= max) return -ENOSPC; max -= ns->pending_mounts; for (p = mnt; p; p = next_mnt(p, mnt)) mounts++; if (mounts > max) return -ENOSPC; ns->pending_mounts += mounts; return 0; } enum mnt_tree_flags_t { MNT_TREE_BENEATH = BIT(0), MNT_TREE_PROPAGATION = BIT(1), }; /** * attach_recursive_mnt - attach a source mount tree * @source_mnt: mount tree to be attached * @dest: the context for mounting at the place where the tree should go * * NOTE: in the table below explains the semantics when a source mount * of a given type is attached to a destination mount of a given type. * --------------------------------------------------------------------------- * | BIND MOUNT OPERATION | * |************************************************************************** * | source-->| shared | private | slave | unbindable | * | dest | | | | | * | | | | | | | * | v | | | | | * |************************************************************************** * | shared | shared (++) | shared (+) | shared(+++)| invalid | * | | | | | | * |non-shared| shared (+) | private | slave (*) | invalid | * *************************************************************************** * A bind operation clones the source mount and mounts the clone on the * destination mount. * * (++) the cloned mount is propagated to all the mounts in the propagation * tree of the destination mount and the cloned mount is added to * the peer group of the source mount. * (+) the cloned mount is created under the destination mount and is marked * as shared. The cloned mount is added to the peer group of the source * mount. 
* (+++) the mount is propagated to all the mounts in the propagation tree * of the destination mount and the cloned mount is made slave * of the same master as that of the source mount. The cloned mount * is marked as 'shared and slave'. * (*) the cloned mount is made a slave of the same master as that of the * source mount. * * --------------------------------------------------------------------------- * | MOVE MOUNT OPERATION | * |************************************************************************** * | source-->| shared | private | slave | unbindable | * | dest | | | | | * | | | | | | | * | v | | | | | * |************************************************************************** * | shared | shared (+) | shared (+) | shared(+++) | invalid | * | | | | | | * |non-shared| shared (+*) | private | slave (*) | unbindable | * *************************************************************************** * * (+) the mount is moved to the destination. And is then propagated to * all the mounts in the propagation tree of the destination mount. * (+*) the mount is moved to the destination. * (+++) the mount is moved to the destination and is then propagated to * all the mounts belonging to the destination mount's propagation tree. * the mount is marked as 'shared and slave'. * (*) the mount continues to be a slave at the new location. * * if the source mount is a tree, the operations explained above is * applied to each mount in the tree. * Must be called without spinlocks held, since this function can sleep * in allocations. * * Context: The function expects namespace_lock() to be held. * Return: If @source_mnt was successfully attached 0 is returned. * Otherwise a negative error code is returned. */ static int attach_recursive_mnt(struct mount *source_mnt, const struct pinned_mountpoint *dest) { struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; struct mount *dest_mnt = dest->parent; struct mountpoint *dest_mp = dest->mp; HLIST_HEAD(tree_list); struct mnt_namespace *ns = dest_mnt->mnt_ns; struct pinned_mountpoint root = {}; struct mountpoint *shorter = NULL; struct mount *child, *p; struct mount *top; struct hlist_node *n; int err = 0; bool moving = mnt_has_parent(source_mnt); /* * Preallocate a mountpoint in case the new mounts need to be * mounted beneath mounts on the same mountpoint. */ for (top = source_mnt; unlikely(top->overmount); top = top->overmount) { if (!shorter && is_mnt_ns_file(top->mnt.mnt_root)) shorter = top->mnt_mp; } err = get_mountpoint(top->mnt.mnt_root, &root); if (err) return err; /* Is there space to add these mounts to the mount namespace? 
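 * count_mounts() enforces the sysctl_mount_max limit and records the
 * prospective mounts in ns->pending_mounts; commit_tree() folds them
 * into ns->nr_mounts once the attach actually happens.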
*/ if (!moving) { err = count_mounts(ns, source_mnt); if (err) goto out; } if (IS_MNT_SHARED(dest_mnt)) { err = invent_group_ids(source_mnt, true); if (err) goto out; err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); } lock_mount_hash(); if (err) goto out_cleanup_ids; if (IS_MNT_SHARED(dest_mnt)) { for (p = source_mnt; p; p = next_mnt(p, source_mnt)) set_mnt_shared(p); } if (moving) { umount_mnt(source_mnt); mnt_notify_add(source_mnt); /* if the mount is moved, it should no longer be expired * automatically */ list_del_init(&source_mnt->mnt_expire); } else { if (source_mnt->mnt_ns) { /* move from anon - the caller will destroy */ emptied_ns = source_mnt->mnt_ns; for (p = source_mnt; p; p = next_mnt(p, source_mnt)) move_from_ns(p); } } mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); /* * Now the original copy is in the same state as the secondaries - * its root attached to mountpoint, but not hashed and all mounts * in it are either in our namespace or in no namespace at all. * Add the original to the list of copies and deal with the * rest of work for all of them uniformly. */ hlist_add_head(&source_mnt->mnt_hash, &tree_list); hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { struct mount *q; hlist_del_init(&child->mnt_hash); /* Notice when we are propagating across user namespaces */ if (child->mnt_parent->mnt_ns->user_ns != user_ns) lock_mnt_tree(child); q = __lookup_mnt(&child->mnt_parent->mnt, child->mnt_mountpoint); commit_tree(child); if (q) { struct mount *r = topmost_overmount(child); struct mountpoint *mp = root.mp; if (unlikely(shorter) && child != source_mnt) mp = shorter; /* * If @q was locked it was meant to hide * whatever was under it. Let @child take over * that job and lock it, then we can unlock @q. * That'll allow another namespace to shed @q * and reveal @child. Clearly, that mounter * consented to this by not severing the mount * relationship. Otherwise, what's the point. 
*/ if (IS_MNT_LOCKED(q)) { child->mnt.mnt_flags |= MNT_LOCKED; q->mnt.mnt_flags &= ~MNT_LOCKED; } mnt_change_mountpoint(r, mp, q); } } unpin_mountpoint(&root); unlock_mount_hash(); return 0; out_cleanup_ids: while (!hlist_empty(&tree_list)) { child = hlist_entry(tree_list.first, struct mount, mnt_hash); child->mnt_parent->mnt_ns->pending_mounts = 0; umount_tree(child, UMOUNT_SYNC); } unlock_mount_hash(); cleanup_group_ids(source_mnt, NULL); out: ns->pending_mounts = 0; read_seqlock_excl(&mount_lock); unpin_mountpoint(&root); read_sequnlock_excl(&mount_lock); return err; } static inline struct mount *where_to_mount(const struct path *path, struct dentry **dentry, bool beneath) { struct mount *m; if (unlikely(beneath)) { m = topmost_overmount(real_mount(path->mnt)); *dentry = m->mnt_mountpoint; return m->mnt_parent; } m = __lookup_mnt(path->mnt, path->dentry); if (unlikely(m)) { m = topmost_overmount(m); *dentry = m->mnt.mnt_root; return m; } *dentry = path->dentry; return real_mount(path->mnt); } /** * do_lock_mount - acquire environment for mounting * @path: target path * @res: context to set up * @beneath: whether the intention is to mount beneath @path * * To mount something at given location, we need * namespace_sem locked exclusive * inode of dentry we are mounting on locked exclusive * struct mountpoint for that dentry * struct mount we are mounting on * * Results are stored in caller-supplied context (pinned_mountpoint); * on success we have res->parent and res->mp pointing to parent and * mountpoint respectively and res->node inserted into the ->m_list * of the mountpoint, making sure the mountpoint won't disappear. * On failure we have res->parent set to ERR_PTR(-E...), res->mp * left NULL, res->node - empty. * In case of success do_lock_mount returns with locks acquired (in * proper order - inode lock nests outside of namespace_sem). * * Request to mount on overmounted location is treated as "mount on * top of whatever's overmounting it"; request to mount beneath * a location - "mount immediately beneath the topmost mount at that * place". * * In all cases the location must not have been unmounted and the * chosen mountpoint must be allowed to be mounted on. For "beneath" * case we also require the location to be at the root of a mount * that has something mounted on top of it (i.e. has an overmount). */ static void do_lock_mount(const struct path *path, struct pinned_mountpoint *res, bool beneath) { int err; if (unlikely(beneath) && !path_mounted(path)) { res->parent = ERR_PTR(-EINVAL); return; } do { struct dentry *dentry, *d; struct mount *m, *n; scoped_guard(mount_locked_reader) { m = where_to_mount(path, &dentry, beneath); if (&m->mnt != path->mnt) { mntget(&m->mnt); dget(dentry); } } inode_lock(dentry->d_inode); namespace_lock(); // check if the chain of mounts (if any) has changed. scoped_guard(mount_locked_reader) n = where_to_mount(path, &d, beneath); if (unlikely(n != m || dentry != d)) err = -EAGAIN; // something moved, retry else if (unlikely(cant_mount(dentry) || !is_mounted(path->mnt))) err = -ENOENT; // not to be mounted on else if (beneath && &m->mnt == path->mnt && !m->overmount) err = -EINVAL; else err = get_mountpoint(dentry, res); if (unlikely(err)) { res->parent = ERR_PTR(err); namespace_unlock(); inode_unlock(dentry->d_inode); } else { res->parent = m; } /* * Drop the temporary references. This is subtle - on success * we are doing that under namespace_sem, which would normally * be forbidden. 
However, in that case we are guaranteed that * refcounts won't reach zero, since we know that path->mnt * is mounted and thus all mounts reachable from it are pinned * and stable, along with their mountpoints and roots. */ if (&m->mnt != path->mnt) { dput(dentry); mntput(&m->mnt); } } while (err == -EAGAIN); } static void __unlock_mount(struct pinned_mountpoint *m) { inode_unlock(m->mp->m_dentry->d_inode); read_seqlock_excl(&mount_lock); unpin_mountpoint(m); read_sequnlock_excl(&mount_lock); namespace_unlock(); } static inline void unlock_mount(struct pinned_mountpoint *m) { if (!IS_ERR(m->parent)) __unlock_mount(m); } static void lock_mount_exact(const struct path *path, struct pinned_mountpoint *mp, bool copy_mount, unsigned int copy_flags); #define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \ struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ do_lock_mount((path), &mp, (beneath)) #define LOCK_MOUNT(mp, path) LOCK_MOUNT_MAYBE_BENEATH(mp, (path), false) #define LOCK_MOUNT_EXACT(mp, path) \ struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ lock_mount_exact((path), &mp, false, 0) #define LOCK_MOUNT_EXACT_COPY(mp, path, copy_flags) \ struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ lock_mount_exact((path), &mp, true, (copy_flags)) static int graft_tree(struct mount *mnt, const struct pinned_mountpoint *mp) { if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER) return -EINVAL; if (d_is_dir(mp->mp->m_dentry) != d_is_dir(mnt->mnt.mnt_root)) return -ENOTDIR; return attach_recursive_mnt(mnt, mp); } static int may_change_propagation(const struct mount *m) { struct mnt_namespace *ns = m->mnt_ns; // it must be mounted in some namespace if (IS_ERR_OR_NULL(ns)) // is_mounted() return -EINVAL; // and the caller must be admin in userns of that namespace if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; return 0; } /* * Sanity check the flags to change_mnt_propagation. */ static int flags_to_propagation_type(int ms_flags) { int type = ms_flags & ~(MS_REC | MS_SILENT); /* Fail if any non-propagation flags are set */ if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) return 0; /* Only one propagation flag should be set */ if (!is_power_of_2(type)) return 0; return type; } /* * recursively change the type of the mountpoint. */ static int do_change_type(const struct path *path, int ms_flags) { struct mount *m; struct mount *mnt = real_mount(path->mnt); int recurse = ms_flags & MS_REC; int type; int err; if (!path_mounted(path)) return -EINVAL; type = flags_to_propagation_type(ms_flags); if (!type) return -EINVAL; guard(namespace_excl)(); err = may_change_propagation(mnt); if (err) return err; if (type == MS_SHARED) { err = invent_group_ids(mnt, recurse); if (err) return err; } for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) change_mnt_propagation(m, type); return 0; } /* may_copy_tree() - check if a mount tree can be copied * @path: path to the mount tree to be copied * * This helper checks if the caller may copy the mount tree starting * from @path->mnt. The caller may copy the mount tree under the * following circumstances: * * (1) The caller is located in the mount namespace of the mount tree. * This also implies that the mount does not belong to an anonymous * mount namespace. * (2) The caller tries to copy an nfs mount referring to a mount * namespace, i.e., the caller is trying to copy a mount namespace * entry from nsfs. * (3) The caller tries to copy a pidfs mount referring to a pidfd. 
* (4) The caller is trying to copy a mount tree that belongs to an * anonymous mount namespace. * * For that to be safe, this helper enforces that the origin mount * namespace the anonymous mount namespace was created from is the * same as the caller's mount namespace by comparing the sequence * numbers. * * This is not strictly necessary. The current semantics of the new * mount api enforce that the caller must be located in the same * mount namespace as the mount tree it interacts with. Using the * origin sequence number preserves these semantics even for * anonymous mount namespaces. However, one could envision extending * the api to directly operate across mount namespace if needed. * * The ownership of a non-anonymous mount namespace such as the * caller's cannot change. * => We know that the caller's mount namespace is stable. * * If the origin sequence number of the anonymous mount namespace is * the same as the sequence number of the caller's mount namespace. * => The owning namespaces are the same. * * ==> The earlier capability check on the owning namespace of the * caller's mount namespace ensures that the caller has the * ability to copy the mount tree. * * Returns true if the mount tree can be copied, false otherwise. */ static inline bool may_copy_tree(const struct path *path) { struct mount *mnt = real_mount(path->mnt); const struct dentry_operations *d_op; if (check_mnt(mnt)) return true; d_op = path->dentry->d_op; if (d_op == &ns_dentry_operations) return true; if (d_op == &pidfs_dentry_operations) return true; if (!is_mounted(path->mnt)) return false; return check_anonymous_mnt(mnt); } static struct mount *__do_loopback(const struct path *old_path, bool recurse, unsigned int copy_flags) { struct mount *old = real_mount(old_path->mnt); if (IS_MNT_UNBINDABLE(old)) return ERR_PTR(-EINVAL); if (!may_copy_tree(old_path)) return ERR_PTR(-EINVAL); if (!recurse && __has_locked_children(old, old_path->dentry)) return ERR_PTR(-EINVAL); if (recurse) return copy_tree(old, old_path->dentry, copy_flags); return clone_mnt(old, old_path->dentry, copy_flags); } /* * do loopback mount. */ static int do_loopback(const struct path *path, const char *old_name, int recurse) { struct path old_path __free(path_put) = {}; struct mount *mnt = NULL; int err; if (!old_name || !*old_name) return -EINVAL; err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path); if (err) return err; if (mnt_ns_loop(old_path.dentry)) return -EINVAL; LOCK_MOUNT(mp, path); if (IS_ERR(mp.parent)) return PTR_ERR(mp.parent); if (!check_mnt(mp.parent)) return -EINVAL; mnt = __do_loopback(&old_path, recurse, CL_COPY_MNT_NS_FILE); if (IS_ERR(mnt)) return PTR_ERR(mnt); err = graft_tree(mnt, &mp); if (err) { lock_mount_hash(); umount_tree(mnt, UMOUNT_SYNC); unlock_mount_hash(); } return err; } static struct mnt_namespace *get_detached_copy(const struct path *path, unsigned int flags) { struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns; struct user_namespace *user_ns = mnt_ns->user_ns; struct mount *mnt, *p; ns = alloc_mnt_ns(user_ns, true); if (IS_ERR(ns)) return ns; guard(namespace_excl)(); /* * Record the sequence number of the source mount namespace. * This needs to hold namespace_sem to ensure that the mount * doesn't get attached. 
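 * The recorded seq_origin is what check_anonymous_mnt() later compares
 * against the caller's mount namespace (see may_copy_tree() above).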
*/ if (is_mounted(path->mnt)) { src_mnt_ns = real_mount(path->mnt)->mnt_ns; if (is_anon_ns(src_mnt_ns)) ns->seq_origin = src_mnt_ns->seq_origin; else ns->seq_origin = src_mnt_ns->ns.ns_id; } mnt = __do_loopback(path, (flags & AT_RECURSIVE), CL_COPY_MNT_NS_FILE); if (IS_ERR(mnt)) { emptied_ns = ns; return ERR_CAST(mnt); } for (p = mnt; p; p = next_mnt(p, mnt)) { mnt_add_to_ns(ns, p); ns->nr_mounts++; } ns->root = mnt; return ns; } static struct file *open_detached_copy(struct path *path, unsigned int flags) { struct mnt_namespace *ns = get_detached_copy(path, flags); struct file *file; if (IS_ERR(ns)) return ERR_CAST(ns); mntput(path->mnt); path->mnt = mntget(&ns->root->mnt); file = dentry_open(path, O_PATH, current_cred()); if (IS_ERR(file)) dissolve_on_fput(path->mnt); else file->f_mode |= FMODE_NEED_UNMOUNT; return file; } enum mount_copy_flags_t { MOUNT_COPY_RECURSIVE = (1 << 0), MOUNT_COPY_NEW = (1 << 1), }; static struct mnt_namespace *create_new_namespace(struct path *path, enum mount_copy_flags_t flags) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct user_namespace *user_ns = current_user_ns(); struct mnt_namespace *new_ns; struct mount *new_ns_root, *old_ns_root; struct path to_path; struct mount *mnt; unsigned int copy_flags = 0; bool locked = false, recurse = flags & MOUNT_COPY_RECURSIVE; if (user_ns != ns->user_ns) copy_flags |= CL_SLAVE; new_ns = alloc_mnt_ns(user_ns, false); if (IS_ERR(new_ns)) return ERR_CAST(new_ns); old_ns_root = ns->root; to_path.mnt = &old_ns_root->mnt; to_path.dentry = old_ns_root->mnt.mnt_root; VFS_WARN_ON_ONCE(old_ns_root->mnt.mnt_sb->s_type != &nullfs_fs_type); LOCK_MOUNT_EXACT_COPY(mp, &to_path, copy_flags); if (IS_ERR(mp.parent)) { free_mnt_ns(new_ns); return ERR_CAST(mp.parent); } new_ns_root = mp.parent; /* * If the real rootfs had a locked mount on top of it somewhere * in the stack, lock the new mount tree as well so it can't be * exposed. */ mnt = old_ns_root; while (mnt->overmount) { mnt = mnt->overmount; if (mnt->mnt.mnt_flags & MNT_LOCKED) locked = true; } /* * We don't emulate unshare()ing a mount namespace. We stick to * the restrictions of creating detached bind-mounts. It has a * lot saner and simpler semantics. */ if (flags & MOUNT_COPY_NEW) mnt = clone_mnt(real_mount(path->mnt), path->dentry, copy_flags); else mnt = __do_loopback(path, recurse, copy_flags); scoped_guard(mount_writer) { if (IS_ERR(mnt)) { emptied_ns = new_ns; umount_tree(new_ns_root, 0); return ERR_CAST(mnt); } if (locked) mnt->mnt.mnt_flags |= MNT_LOCKED; /* * now mount the detached tree on top of the copy * of the real rootfs we created. 
*/ attach_mnt(mnt, new_ns_root, mp.mp); if (user_ns != ns->user_ns) lock_mnt_tree(new_ns_root); } for (mnt = new_ns_root; mnt; mnt = next_mnt(mnt, new_ns_root)) { mnt_add_to_ns(new_ns, mnt); new_ns->nr_mounts++; } new_ns->root = new_ns_root; ns_tree_add_raw(new_ns); return new_ns; } static struct file *open_new_namespace(struct path *path, enum mount_copy_flags_t flags) { struct mnt_namespace *new_ns; new_ns = create_new_namespace(path, flags); if (IS_ERR(new_ns)) return ERR_CAST(new_ns); return open_namespace_file(to_ns_common(new_ns)); } static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags) { int ret; struct path path __free(path_put) = {}; int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC); if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | OPEN_TREE_NAMESPACE)) return ERR_PTR(-EINVAL); if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) == AT_RECURSIVE) return ERR_PTR(-EINVAL); if (hweight32(flags & (OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) > 1) return ERR_PTR(-EINVAL); if (flags & AT_NO_AUTOMOUNT) lookup_flags &= ~LOOKUP_AUTOMOUNT; if (flags & AT_SYMLINK_NOFOLLOW) lookup_flags &= ~LOOKUP_FOLLOW; /* * If we create a new mount namespace with the cloned mount tree we * just care about being privileged over our current user namespace. * The new mount namespace will be owned by it. */ if ((flags & OPEN_TREE_NAMESPACE) && !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); if ((flags & OPEN_TREE_CLONE) && !may_mount()) return ERR_PTR(-EPERM); CLASS(filename_uflags, name)(filename, flags); ret = filename_lookup(dfd, name, lookup_flags, &path, NULL); if (unlikely(ret)) return ERR_PTR(ret); if (flags & OPEN_TREE_NAMESPACE) return open_new_namespace(&path, (flags & AT_RECURSIVE) ? MOUNT_COPY_RECURSIVE : 0); if (flags & OPEN_TREE_CLONE) return open_detached_copy(&path, flags); return dentry_open(&path, O_PATH, current_cred()); } SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags) { return FD_ADD(flags, vfs_open_tree(dfd, filename, flags)); } /* * Don't allow locked mount flags to be cleared. * * No locks need to be held here while testing the various MNT_LOCK * flags because those flags can never be cleared once they are set. 
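 *
 * Illustrative example: if a mount entered this namespace with
 * MNT_LOCK_NODEV set, a later MS_REMOUNT|MS_BIND request that tries to
 * drop MNT_NODEV is refused with -EPERM by the callers of this helper.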
*/ static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags) { unsigned int fl = mnt->mnt.mnt_flags; if ((fl & MNT_LOCK_READONLY) && !(mnt_flags & MNT_READONLY)) return false; if ((fl & MNT_LOCK_NODEV) && !(mnt_flags & MNT_NODEV)) return false; if ((fl & MNT_LOCK_NOSUID) && !(mnt_flags & MNT_NOSUID)) return false; if ((fl & MNT_LOCK_NOEXEC) && !(mnt_flags & MNT_NOEXEC)) return false; if ((fl & MNT_LOCK_ATIME) && ((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) return false; return true; } static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags) { bool readonly_request = (mnt_flags & MNT_READONLY); if (readonly_request == __mnt_is_readonly(&mnt->mnt)) return 0; if (readonly_request) return mnt_make_readonly(mnt); mnt->mnt.mnt_flags &= ~MNT_READONLY; return 0; } static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags) { mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; mnt->mnt.mnt_flags = mnt_flags; touch_mnt_namespace(mnt->mnt_ns); } static void mnt_warn_timestamp_expiry(const struct path *mountpoint, struct vfsmount *mnt) { struct super_block *sb = mnt->mnt_sb; if (!__mnt_is_readonly(mnt) && (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) && (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) { char *buf, *mntpath; buf = (char *)__get_free_page(GFP_KERNEL); if (buf) mntpath = d_path(mountpoint, buf, PAGE_SIZE); else mntpath = ERR_PTR(-ENOMEM); if (IS_ERR(mntpath)) mntpath = "(unknown)"; pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n", sb->s_type->name, is_mounted(mnt) ? "remounted" : "mounted", mntpath, &sb->s_time_max, (unsigned long long)sb->s_time_max); sb->s_iflags |= SB_I_TS_EXPIRY_WARNED; if (buf) free_page((unsigned long)buf); } } /* * Handle reconfiguration of the mountpoint only without alteration of the * superblock it refers to. This is triggered by specifying MS_REMOUNT|MS_BIND * to mount(2). */ static int do_reconfigure_mnt(const struct path *path, unsigned int mnt_flags) { struct super_block *sb = path->mnt->mnt_sb; struct mount *mnt = real_mount(path->mnt); int ret; if (!check_mnt(mnt)) return -EINVAL; if (!path_mounted(path)) return -EINVAL; if (!can_change_locked_flags(mnt, mnt_flags)) return -EPERM; /* * We're only checking whether the superblock is read-only not * changing it, so only take down_read(&sb->s_umount). */ down_read(&sb->s_umount); lock_mount_hash(); ret = change_mount_ro_state(mnt, mnt_flags); if (ret == 0) set_mount_attributes(mnt, mnt_flags); unlock_mount_hash(); up_read(&sb->s_umount); mnt_warn_timestamp_expiry(path, &mnt->mnt); return ret; } /* * change filesystem flags. dir should be a physical root of filesystem. * If you've mounted a non-root directory somewhere and want to do remount * on it - tough luck. */ static int do_remount(const struct path *path, int sb_flags, int mnt_flags, void *data) { int err; struct super_block *sb = path->mnt->mnt_sb; struct mount *mnt = real_mount(path->mnt); struct fs_context *fc; if (!check_mnt(mnt)) return -EINVAL; if (!path_mounted(path)) return -EINVAL; if (!can_change_locked_flags(mnt, mnt_flags)) return -EPERM; fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK); if (IS_ERR(fc)) return PTR_ERR(fc); /* * Indicate to the filesystem that the remount request is coming * from the legacy mount system call. 
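 * (as opposed to a reconfiguration driven via the new fsconfig(2) API);
 * that is what setting fc->oldapi below conveys.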
*/ fc->oldapi = true; err = parse_monolithic_mount_data(fc, data); if (!err) { down_write(&sb->s_umount); err = -EPERM; if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { err = reconfigure_super(fc); if (!err) { lock_mount_hash(); set_mount_attributes(mnt, mnt_flags); unlock_mount_hash(); } } up_write(&sb->s_umount); } mnt_warn_timestamp_expiry(path, &mnt->mnt); put_fs_context(fc); return err; } static inline int tree_contains_unbindable(struct mount *mnt) { struct mount *p; for (p = mnt; p; p = next_mnt(p, mnt)) { if (IS_MNT_UNBINDABLE(p)) return 1; } return 0; } static int do_set_group(const struct path *from_path, const struct path *to_path) { struct mount *from = real_mount(from_path->mnt); struct mount *to = real_mount(to_path->mnt); int err; guard(namespace_excl)(); err = may_change_propagation(from); if (err) return err; err = may_change_propagation(to); if (err) return err; /* To and From paths should be mount roots */ if (!path_mounted(from_path)) return -EINVAL; if (!path_mounted(to_path)) return -EINVAL; /* Setting sharing groups is only allowed across same superblock */ if (from->mnt.mnt_sb != to->mnt.mnt_sb) return -EINVAL; /* From mount root should be wider than To mount root */ if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root)) return -EINVAL; /* From mount should not have locked children in place of To's root */ if (__has_locked_children(from, to->mnt.mnt_root)) return -EINVAL; /* Setting sharing groups is only allowed on private mounts */ if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to)) return -EINVAL; /* From should not be private */ if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from)) return -EINVAL; if (IS_MNT_SLAVE(from)) { hlist_add_behind(&to->mnt_slave, &from->mnt_slave); to->mnt_master = from->mnt_master; } if (IS_MNT_SHARED(from)) { to->mnt_group_id = from->mnt_group_id; list_add(&to->mnt_share, &from->mnt_share); set_mnt_shared(to); } return 0; } /** * path_overmounted - check if path is overmounted * @path: path to check * * Check if path is overmounted, i.e., if there's a mount on top of * @path->mnt with @path->dentry as mountpoint. * * Context: namespace_sem must be held at least shared. * MUST NOT be called under lock_mount_hash() (there one should just * call __lookup_mnt() and check if it returns NULL). * Return: If path is overmounted true is returned, false if not. */ static inline bool path_overmounted(const struct path *path) { unsigned seq = read_seqbegin(&mount_lock); bool no_child; rcu_read_lock(); no_child = !__lookup_mnt(path->mnt, path->dentry); rcu_read_unlock(); if (need_seqretry(&mount_lock, seq)) { read_seqlock_excl(&mount_lock); no_child = !__lookup_mnt(path->mnt, path->dentry); read_sequnlock_excl(&mount_lock); } return unlikely(!no_child); } /* * Check if there is a possibly empty chain of descent from p1 to p2. * Locks: namespace_sem (shared) or mount_lock (read_seqlock_excl). */ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2) { while (p2 != p1 && mnt_has_parent(p2)) p2 = p2->mnt_parent; return p2 == p1; } /** * can_move_mount_beneath - check that we can mount beneath the top mount * @mnt_from: mount we are trying to move * @mnt_to: mount under which to mount * @mp: mountpoint of @mnt_to * * - Make sure that the caller can unmount the topmost mount ensuring * that the caller could reveal the underlying mountpoint. * - Ensure that nothing has been mounted on top of @mnt_from before we * grabbed @namespace_sem to avoid creating pointless shadow mounts. 
* - Prevent mounting beneath a mount if the propagation relationship * between the source mount, parent mount, and top mount would lead to * nonsensical mount trees. * * Context: This function expects namespace_lock() to be held. * Return: On success 0, and on error a negative error code is returned. */ static int can_move_mount_beneath(const struct mount *mnt_from, const struct mount *mnt_to, struct pinned_mountpoint *mp) { struct mount *parent_mnt_to = mnt_to->mnt_parent; /* Avoid creating shadow mounts during mount propagation. */ if (mnt_from->overmount) return -EINVAL; if (mount_is_ancestor(mnt_to, mnt_from)) return -EINVAL; /* * If the parent mount propagates to the child mount this would * mean mounting @mnt_from on @mnt_to->mnt_parent and then * propagating a copy @c of @mnt_from on top of @mnt_to. This * defeats the whole purpose of mounting beneath another mount. */ if (propagation_would_overmount(parent_mnt_to, mnt_to, mp->mp)) return -EINVAL; /* * If @mnt_to->mnt_parent propagates to @mnt_from this would * mean propagating a copy @c of @mnt_from on top of @mnt_from. * Afterwards @mnt_from would be mounted on top of * @mnt_to->mnt_parent and @mnt_to would be unmounted from * @mnt->mnt_parent and remounted on @mnt_from. But since @c is * already mounted on @mnt_from, @mnt_to would ultimately be * remounted on top of @c. Afterwards, @mnt_from would be * covered by a copy @c of @mnt_from and @c would be covered by * @mnt_from itself. This defeats the whole purpose of mounting * @mnt_from beneath @mnt_to. */ if (check_mnt(mnt_from) && propagation_would_overmount(parent_mnt_to, mnt_from, mp->mp)) return -EINVAL; return 0; } /* may_use_mount() - check if a mount tree can be used * @mnt: vfsmount to be used * * This helper checks if the caller may use the mount tree starting * from @path->mnt. The caller may use the mount tree under the * following circumstances: * * (1) The caller is located in the mount namespace of the mount tree. * This also implies that the mount does not belong to an anonymous * mount namespace. * (2) The caller is trying to use a mount tree that belongs to an * anonymous mount namespace. * * For that to be safe, this helper enforces that the origin mount * namespace the anonymous mount namespace was created from is the * same as the caller's mount namespace by comparing the sequence * numbers. * * The ownership of a non-anonymous mount namespace such as the * caller's cannot change. * => We know that the caller's mount namespace is stable. * * If the origin sequence number of the anonymous mount namespace is * the same as the sequence number of the caller's mount namespace. * => The owning namespaces are the same. * * ==> The earlier capability check on the owning namespace of the * caller's mount namespace ensures that the caller has the * ability to use the mount tree. * * Returns true if the mount tree can be used, false otherwise. */ static inline bool may_use_mount(struct mount *mnt) { if (check_mnt(mnt)) return true; /* * Make sure that noone unmounted the target path or somehow * managed to get their hands on something purely kernel * internal. 
*/ if (!is_mounted(&mnt->mnt)) return false; return check_anonymous_mnt(mnt); } static int do_move_mount(const struct path *old_path, const struct path *new_path, enum mnt_tree_flags_t flags) { struct mount *old = real_mount(old_path->mnt); int err; bool beneath = flags & MNT_TREE_BENEATH; if (!path_mounted(old_path)) return -EINVAL; if (d_is_dir(new_path->dentry) != d_is_dir(old_path->dentry)) return -EINVAL; LOCK_MOUNT_MAYBE_BENEATH(mp, new_path, beneath); if (IS_ERR(mp.parent)) return PTR_ERR(mp.parent); if (check_mnt(old)) { /* if the source is in our namespace... */ /* ... it should be detachable from parent */ if (!mnt_has_parent(old) || IS_MNT_LOCKED(old)) return -EINVAL; /* ... which should not be shared */ if (IS_MNT_SHARED(old->mnt_parent)) return -EINVAL; /* ... and the target should be in our namespace */ if (!check_mnt(mp.parent)) return -EINVAL; } else { /* * otherwise the source must be the root of some anon namespace. */ if (!anon_ns_root(old)) return -EINVAL; /* * Bail out early if the target is within the same namespace - * subsequent checks would've rejected that, but they lose * some corner cases if we check it early. */ if (old->mnt_ns == mp.parent->mnt_ns) return -EINVAL; /* * Target should be either in our namespace or in an acceptable * anon namespace, sensu check_anonymous_mnt(). */ if (!may_use_mount(mp.parent)) return -EINVAL; } if (beneath) { struct mount *over = real_mount(new_path->mnt); if (mp.parent != over->mnt_parent) over = mp.parent->overmount; err = can_move_mount_beneath(old, over, &mp); if (err) return err; } /* * Don't move a mount tree containing unbindable mounts to a destination * mount which is shared. */ if (IS_MNT_SHARED(mp.parent) && tree_contains_unbindable(old)) return -EINVAL; if (!check_for_nsfs_mounts(old)) return -ELOOP; if (mount_is_ancestor(old, mp.parent)) return -ELOOP; return attach_recursive_mnt(old, &mp); } static int do_move_mount_old(const struct path *path, const char *old_name) { struct path old_path __free(path_put) = {}; int err; if (!old_name || !*old_name) return -EINVAL; err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); if (err) return err; return do_move_mount(&old_path, path, 0); } /* * add a mount into a namespace's mount tree */ static int do_add_mount(struct mount *newmnt, const struct pinned_mountpoint *mp, int mnt_flags) { struct mount *parent = mp->parent; if (IS_ERR(parent)) return PTR_ERR(parent); mnt_flags &= ~MNT_INTERNAL_FLAGS; if (unlikely(!check_mnt(parent))) { /* that's acceptable only for automounts done in private ns */ if (!(mnt_flags & MNT_SHRINKABLE)) return -EINVAL; /* ... and for those we'd better have mountpoint still alive */ if (!parent->mnt_ns) return -EINVAL; } /* Refuse the same filesystem on the same mount point */ if (parent->mnt.mnt_sb == newmnt->mnt.mnt_sb && parent->mnt.mnt_root == mp->mp->m_dentry) return -EBUSY; if (d_is_symlink(newmnt->mnt.mnt_root)) return -EINVAL; newmnt->mnt.mnt_flags = mnt_flags; return graft_tree(newmnt, mp); } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags); /* * Create a new mount using a superblock configuration and request it * be added to the namespace tree. 
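 *
 * For reference, the userspace-visible analogue of this step via the new
 * mount API is roughly as follows (an illustrative sketch only; the
 * filesystem type, source device and target path are assumptions):
 *
 *	fsfd = fsopen("ext4", FSOPEN_CLOEXEC);
 *	fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "/dev/sda1", 0);
 *	fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
 *	mfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0);
 *	move_mount(mfd, "", AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);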
*/ static int do_new_mount_fc(struct fs_context *fc, const struct path *mountpoint, unsigned int mnt_flags) { struct super_block *sb; struct vfsmount *mnt __free(mntput) = fc_mount(fc); int error; if (IS_ERR(mnt)) return PTR_ERR(mnt); sb = fc->root->d_sb; error = security_sb_kern_mount(sb); if (unlikely(error)) return error; if (unlikely(mount_too_revealing(sb, &mnt_flags))) { errorfcp(fc, "VFS", "Mount too revealing"); return -EPERM; } mnt_warn_timestamp_expiry(mountpoint, mnt); LOCK_MOUNT(mp, mountpoint); error = do_add_mount(real_mount(mnt), &mp, mnt_flags); if (!error) retain_and_null_ptr(mnt); // consumed on success return error; } /* * create a new mount for userspace and request it to be added into the * namespace's tree */ static int do_new_mount(const struct path *path, const char *fstype, int sb_flags, int mnt_flags, const char *name, void *data) { struct file_system_type *type; struct fs_context *fc; const char *subtype = NULL; int err = 0; if (!fstype) return -EINVAL; type = get_fs_type(fstype); if (!type) return -ENODEV; if (type->fs_flags & FS_HAS_SUBTYPE) { subtype = strchr(fstype, '.'); if (subtype) { subtype++; if (!*subtype) { put_filesystem(type); return -EINVAL; } } } fc = fs_context_for_mount(type, sb_flags); put_filesystem(type); if (IS_ERR(fc)) return PTR_ERR(fc); /* * Indicate to the filesystem that the mount request is coming * from the legacy mount system call. */ fc->oldapi = true; if (subtype) err = vfs_parse_fs_string(fc, "subtype", subtype); if (!err && name) err = vfs_parse_fs_string(fc, "source", name); if (!err) err = parse_monolithic_mount_data(fc, data); if (!err && !mount_capable(fc)) err = -EPERM; if (!err) err = do_new_mount_fc(fc, path, mnt_flags); put_fs_context(fc); return err; } static void lock_mount_exact(const struct path *path, struct pinned_mountpoint *mp, bool copy_mount, unsigned int copy_flags) { struct dentry *dentry = path->dentry; int err; /* Assert that inode_lock() locked the correct inode. */ VFS_WARN_ON_ONCE(copy_mount && !path_mounted(path)); inode_lock(dentry->d_inode); namespace_lock(); if (unlikely(cant_mount(dentry))) err = -ENOENT; else if (!copy_mount && path_overmounted(path)) err = -EBUSY; else err = get_mountpoint(dentry, mp); if (unlikely(err)) { namespace_unlock(); inode_unlock(dentry->d_inode); mp->parent = ERR_PTR(err); return; } if (copy_mount) mp->parent = clone_mnt(real_mount(path->mnt), dentry, copy_flags); else mp->parent = real_mount(path->mnt); if (unlikely(IS_ERR(mp->parent))) __unlock_mount(mp); } int finish_automount(struct vfsmount *__m, const struct path *path) { struct vfsmount *m __free(mntput) = __m; struct mount *mnt; int err; if (!m) return 0; if (IS_ERR(m)) return PTR_ERR(m); mnt = real_mount(m); if (m->mnt_root == path->dentry) return -ELOOP; /* * we don't want to use LOCK_MOUNT() - in this case finding something * that overmounts our mountpoint to be means "quitely drop what we've * got", not "try to mount it on top". */ LOCK_MOUNT_EXACT(mp, path); if (mp.parent == ERR_PTR(-EBUSY)) return 0; err = do_add_mount(mnt, &mp, path->mnt->mnt_flags | MNT_SHRINKABLE); if (likely(!err)) retain_and_null_ptr(m); return err; } /** * mnt_set_expiry - Put a mount on an expiration list * @mnt: The mount to list. * @expiry_list: The list to add the mount to. 
*/ void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) { guard(mount_locked_reader)(); list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); } EXPORT_SYMBOL(mnt_set_expiry); /* * process a list of expirable mountpoints with the intent of discarding any * mountpoints that aren't in use and haven't been touched since last we came * here */ void mark_mounts_for_expiry(struct list_head *mounts) { struct mount *mnt, *next; LIST_HEAD(graveyard); if (list_empty(mounts)) return; guard(namespace_excl)(); guard(mount_writer)(); /* extract from the expiration list every vfsmount that matches the * following criteria: * - already mounted * - only referenced by its parent vfsmount * - still marked for expiry (marked on the last call here; marks are * cleared by mntput()) */ list_for_each_entry_safe(mnt, next, mounts, mnt_expire) { if (!is_mounted(&mnt->mnt)) continue; if (!xchg(&mnt->mnt_expiry_mark, 1) || propagate_mount_busy(mnt, 1)) continue; list_move(&mnt->mnt_expire, &graveyard); } while (!list_empty(&graveyard)) { mnt = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(mnt->mnt_ns); umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); } } EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); /* * Ripoff of 'select_parent()' * * search the list of submounts for a given mountpoint, and move any * shrinkable submounts to the 'graveyard' list. */ static int select_submounts(struct mount *parent, struct list_head *graveyard) { struct mount *this_parent = parent; struct list_head *next; int found = 0; repeat: next = this_parent->mnt_mounts.next; resume: while (next != &this_parent->mnt_mounts) { struct list_head *tmp = next; struct mount *mnt = list_entry(tmp, struct mount, mnt_child); next = tmp->next; if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE)) continue; /* * Descend a level if the d_mounts list is non-empty. */ if (!list_empty(&mnt->mnt_mounts)) { this_parent = mnt; goto repeat; } if (!propagate_mount_busy(mnt, 1)) { list_move_tail(&mnt->mnt_expire, graveyard); found++; } } /* * All done at this level ... ascend and resume the search */ if (this_parent != parent) { next = this_parent->mnt_child.next; this_parent = this_parent->mnt_parent; goto resume; } return found; } /* * process a list of expirable mountpoints with the intent of discarding any * submounts of a specific parent mountpoint * * mount_lock must be held for write */ static void shrink_submounts(struct mount *mnt) { LIST_HEAD(graveyard); struct mount *m; /* extract submounts of 'mountpoint' from the expiration list */ while (select_submounts(mnt, &graveyard)) { while (!list_empty(&graveyard)) { m = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(m->mnt_ns); umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC); } } } static void *copy_mount_options(const void __user * data) { char *copy; unsigned left, offset; if (!data) return NULL; copy = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!copy) return ERR_PTR(-ENOMEM); left = copy_from_user(copy, data, PAGE_SIZE); /* * Not all architectures have an exact copy_from_user(). Resort to * byte at a time. */ offset = PAGE_SIZE - left; while (left) { char c; if (get_user(c, (const char __user *)data + offset)) break; copy[offset] = c; left--; offset++; } if (left == PAGE_SIZE) { kfree(copy); return ERR_PTR(-EFAULT); } return copy; } static char *copy_mount_string(const void __user *data) { return data ? 
strndup_user(data, PATH_MAX) : NULL; } /* * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to * be given to the mount() call (ie: read-only, no-dev, no-suid etc). * * data is a (void *) that can point to any structure up to * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent * information (or be NULL). * * Pre-0.97 versions of mount() didn't have a flags word. * When the flags word was introduced its top half was required * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9. * Therefore, if this magic number is present, it carries no information * and must be discarded. */ int path_mount(const char *dev_name, const struct path *path, const char *type_page, unsigned long flags, void *data_page) { unsigned int mnt_flags = 0, sb_flags; int ret; /* Discard magic */ if ((flags & MS_MGC_MSK) == MS_MGC_VAL) flags &= ~MS_MGC_MSK; /* Basic sanity checks */ if (data_page) ((char *)data_page)[PAGE_SIZE - 1] = 0; if (flags & MS_NOUSER) return -EINVAL; ret = security_sb_mount(dev_name, path, type_page, flags, data_page); if (ret) return ret; if (!may_mount()) return -EPERM; if (flags & SB_MANDLOCK) warn_mandlock(); /* Default to relatime unless overriden */ if (!(flags & MS_NOATIME)) mnt_flags |= MNT_RELATIME; /* Separate the per-mountpoint flags */ if (flags & MS_NOSUID) mnt_flags |= MNT_NOSUID; if (flags & MS_NODEV) mnt_flags |= MNT_NODEV; if (flags & MS_NOEXEC) mnt_flags |= MNT_NOEXEC; if (flags & MS_NOATIME) mnt_flags |= MNT_NOATIME; if (flags & MS_NODIRATIME) mnt_flags |= MNT_NODIRATIME; if (flags & MS_STRICTATIME) mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; if (flags & MS_NOSYMFOLLOW) mnt_flags |= MNT_NOSYMFOLLOW; /* The default atime for remount is preservation */ if ((flags & MS_REMOUNT) && ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME | MS_STRICTATIME)) == 0)) { mnt_flags &= ~MNT_ATIME_MASK; mnt_flags |= path->mnt->mnt_flags & MNT_ATIME_MASK; } sb_flags = flags & (SB_RDONLY | SB_SYNCHRONOUS | SB_MANDLOCK | SB_DIRSYNC | SB_SILENT | SB_POSIXACL | SB_LAZYTIME | SB_I_VERSION); if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND)) return do_reconfigure_mnt(path, mnt_flags); if (flags & MS_REMOUNT) return do_remount(path, sb_flags, mnt_flags, data_page); if (flags & MS_BIND) return do_loopback(path, dev_name, flags & MS_REC); if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) return do_change_type(path, flags); if (flags & MS_MOVE) return do_move_mount_old(path, dev_name); return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name, data_page); } int do_mount(const char *dev_name, const char __user *dir_name, const char *type_page, unsigned long flags, void *data_page) { struct path path __free(path_put) = {}; int ret; ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path); if (ret) return ret; return path_mount(dev_name, &path, type_page, flags, data_page); } static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES); } static void dec_mnt_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES); } static void free_mnt_ns(struct mnt_namespace *ns) { if (!is_anon_ns(ns)) ns_common_free(ns); dec_mnt_namespaces(ns->ucounts); mnt_ns_tree_remove(ns); } static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon) { struct mnt_namespace *new_ns; struct ucounts *ucounts; int ret; ucounts = inc_mnt_namespaces(user_ns); if (!ucounts) return 
ERR_PTR(-ENOSPC); new_ns = kzalloc_obj(struct mnt_namespace, GFP_KERNEL_ACCOUNT); if (!new_ns) { dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); } if (anon) ret = ns_common_init_inum(new_ns, MNT_NS_ANON_INO); else ret = ns_common_init(new_ns); if (ret) { kfree(new_ns); dec_mnt_namespaces(ucounts); return ERR_PTR(ret); } ns_tree_gen_id(new_ns); new_ns->is_anon = anon; refcount_set(&new_ns->passive, 1); new_ns->mounts = RB_ROOT; init_waitqueue_head(&new_ns->poll); new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; return new_ns; } __latent_entropy struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { struct mnt_namespace *new_ns; struct path old_root __free(path_put) = {}; struct path old_pwd __free(path_put) = {}; struct mount *p, *q; struct mount *old; struct mount *new; int copy_flags; BUG_ON(!ns); if (likely(!(flags & CLONE_NEWNS))) { get_mnt_ns(ns); return ns; } old = ns->root; new_ns = alloc_mnt_ns(user_ns, false); if (IS_ERR(new_ns)) return new_ns; guard(namespace_excl)(); if (flags & CLONE_EMPTY_MNTNS) copy_flags = 0; else copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; if (user_ns != ns->user_ns) copy_flags |= CL_SLAVE; if (flags & CLONE_EMPTY_MNTNS) new = clone_mnt(old, old->mnt.mnt_root, copy_flags); else new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { emptied_ns = new_ns; return ERR_CAST(new); } if (user_ns != ns->user_ns) { guard(mount_writer)(); lock_mnt_tree(new); } new_ns->root = new; if (flags & CLONE_EMPTY_MNTNS) { /* * Empty mount namespace: only the root mount exists. * Reset root and pwd to the cloned mount's root dentry. */ if (new_fs) { old_root = new_fs->root; old_pwd = new_fs->pwd; new_fs->root.mnt = mntget(&new->mnt); new_fs->root.dentry = dget(new->mnt.mnt_root); new_fs->pwd.mnt = mntget(&new->mnt); new_fs->pwd.dentry = dget(new->mnt.mnt_root); } mnt_add_to_ns(new_ns, new); new_ns->nr_mounts++; } else { /* * Full copy: walk old and new trees in parallel, switching * the tsk->fs->* elements and marking new vfsmounts as * belonging to new namespace. We have already acquired a * private fs_struct, so tsk->fs->lock is not needed. */ p = old; q = new; while (p) { mnt_add_to_ns(new_ns, q); new_ns->nr_mounts++; if (new_fs) { if (&p->mnt == new_fs->root.mnt) { old_root.mnt = new_fs->root.mnt; new_fs->root.mnt = mntget(&q->mnt); } if (&p->mnt == new_fs->pwd.mnt) { old_pwd.mnt = new_fs->pwd.mnt; new_fs->pwd.mnt = mntget(&q->mnt); } } p = next_mnt(p, old); q = next_mnt(q, new); if (!q) break; // an mntns binding we'd skipped? while (p->mnt.mnt_root != q->mnt.mnt_root) p = next_mnt(skip_mnt_tree(p), old); } } ns_tree_add_raw(new_ns); return new_ns; } struct dentry *mount_subtree(struct vfsmount *m, const char *name) { struct mount *mnt = real_mount(m); struct mnt_namespace *ns; struct super_block *s; struct path path; int err; ns = alloc_mnt_ns(&init_user_ns, true); if (IS_ERR(ns)) { mntput(m); return ERR_CAST(ns); } ns->root = mnt; ns->nr_mounts++; mnt_add_to_ns(ns, mnt); err = vfs_path_lookup(m->mnt_root, m, name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); put_mnt_ns(ns); if (err) return ERR_PTR(err); /* trade a vfsmount reference for active sb one */ s = path.mnt->mnt_sb; atomic_inc(&s->s_active); mntput(path.mnt); /* lock the sucker */ down_write(&s->s_umount); /* ... 
and return the root of (sub)tree on it */ return path.dentry; } EXPORT_SYMBOL(mount_subtree); SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, char __user *, type, unsigned long, flags, void __user *, data) { int ret; char *kernel_type; char *kernel_dev; void *options; kernel_type = copy_mount_string(type); ret = PTR_ERR(kernel_type); if (IS_ERR(kernel_type)) goto out_type; kernel_dev = copy_mount_string(dev_name); ret = PTR_ERR(kernel_dev); if (IS_ERR(kernel_dev)) goto out_dev; options = copy_mount_options(data); ret = PTR_ERR(options); if (IS_ERR(options)) goto out_data; ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options); kfree(options); out_data: kfree(kernel_dev); out_dev: kfree(kernel_type); out_type: return ret; } #define FSMOUNT_VALID_FLAGS \ (MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \ MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME | \ MOUNT_ATTR_NOSYMFOLLOW) #define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP) #define MOUNT_SETATTR_PROPAGATION_FLAGS \ (MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED) static unsigned int attr_flags_to_mnt_flags(u64 attr_flags) { unsigned int mnt_flags = 0; if (attr_flags & MOUNT_ATTR_RDONLY) mnt_flags |= MNT_READONLY; if (attr_flags & MOUNT_ATTR_NOSUID) mnt_flags |= MNT_NOSUID; if (attr_flags & MOUNT_ATTR_NODEV) mnt_flags |= MNT_NODEV; if (attr_flags & MOUNT_ATTR_NOEXEC) mnt_flags |= MNT_NOEXEC; if (attr_flags & MOUNT_ATTR_NODIRATIME) mnt_flags |= MNT_NODIRATIME; if (attr_flags & MOUNT_ATTR_NOSYMFOLLOW) mnt_flags |= MNT_NOSYMFOLLOW; return mnt_flags; } /* * Create a kernel mount representation for a new, prepared superblock * (specified by fs_fd) and attach to an open_tree-like file descriptor. */ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, unsigned int, attr_flags) { struct path new_path __free(path_put) = {}; struct mnt_namespace *ns; struct fs_context *fc; struct vfsmount *new_mnt; struct mount *mnt; unsigned int mnt_flags = 0; long ret; if ((flags & ~(FSMOUNT_CLOEXEC | FSMOUNT_NAMESPACE)) != 0) return -EINVAL; if ((flags & FSMOUNT_NAMESPACE) && !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; if (!(flags & FSMOUNT_NAMESPACE) && !may_mount()) return -EPERM; if (attr_flags & ~FSMOUNT_VALID_FLAGS) return -EINVAL; mnt_flags = attr_flags_to_mnt_flags(attr_flags); switch (attr_flags & MOUNT_ATTR__ATIME) { case MOUNT_ATTR_STRICTATIME: break; case MOUNT_ATTR_NOATIME: mnt_flags |= MNT_NOATIME; break; case MOUNT_ATTR_RELATIME: mnt_flags |= MNT_RELATIME; break; default: return -EINVAL; } CLASS(fd, f)(fs_fd); if (fd_empty(f)) return -EBADF; if (fd_file(f)->f_op != &fscontext_fops) return -EINVAL; fc = fd_file(f)->private_data; ACQUIRE(mutex_intr, uapi_mutex)(&fc->uapi_mutex); ret = ACQUIRE_ERR(mutex_intr, &uapi_mutex); if (ret) return ret; /* There must be a valid superblock or we can't mount it */ ret = -EINVAL; if (!fc->root) return ret; ret = -EPERM; if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) { errorfcp(fc, "VFS", "Mount too revealing"); return ret; } ret = -EBUSY; if (fc->phase != FS_CONTEXT_AWAITING_MOUNT) return ret; if (fc->sb_flags & SB_MANDLOCK) warn_mandlock(); new_mnt = vfs_create_mount(fc); if (IS_ERR(new_mnt)) return PTR_ERR(new_mnt); new_mnt->mnt_flags = mnt_flags; new_path.dentry = dget(fc->root); new_path.mnt = new_mnt; /* We've done the mount bit - now move the file context into more or * less the same state as if we'd done an fspick(). 
We don't want to * do any memory allocation or anything like that at this point as we * don't want to have to handle any errors incurred. */ vfs_clean_context(fc); if (flags & FSMOUNT_NAMESPACE) return FD_ADD((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0, open_new_namespace(&new_path, MOUNT_COPY_NEW)); ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true); if (IS_ERR(ns)) return PTR_ERR(ns); mnt = real_mount(new_path.mnt); ns->root = mnt; ns->nr_mounts = 1; mnt_add_to_ns(ns, mnt); mntget(new_path.mnt); FD_PREPARE(fdf, (flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0, dentry_open(&new_path, O_PATH, fc->cred)); if (fdf.err) { dissolve_on_fput(new_path.mnt); return fdf.err; } /* * Attach to an apparent O_PATH fd with a note that we * need to unmount it, not just simply put it. */ fd_prepare_file(fdf)->f_mode |= FMODE_NEED_UNMOUNT; return fd_publish(fdf); } static inline int vfs_move_mount(const struct path *from_path, const struct path *to_path, enum mnt_tree_flags_t mflags) { int ret; ret = security_move_mount(from_path, to_path); if (ret) return ret; if (mflags & MNT_TREE_PROPAGATION) return do_set_group(from_path, to_path); return do_move_mount(from_path, to_path, mflags); } /* * Move a mount from one place to another. In combination with * fsopen()/fsmount() this is used to install a new mount and in combination * with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy * a mount subtree. * * Note the flags value is a combination of MOVE_MOUNT_* flags. */ SYSCALL_DEFINE5(move_mount, int, from_dfd, const char __user *, from_pathname, int, to_dfd, const char __user *, to_pathname, unsigned int, flags) { struct path to_path __free(path_put) = {}; struct path from_path __free(path_put) = {}; unsigned int lflags, uflags; enum mnt_tree_flags_t mflags = 0; int ret = 0; if (!may_mount()) return -EPERM; if (flags & ~MOVE_MOUNT__MASK) return -EINVAL; if ((flags & (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) == (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) return -EINVAL; if (flags & MOVE_MOUNT_SET_GROUP) mflags |= MNT_TREE_PROPAGATION; if (flags & MOVE_MOUNT_BENEATH) mflags |= MNT_TREE_BENEATH; uflags = 0; if (flags & MOVE_MOUNT_T_EMPTY_PATH) uflags = AT_EMPTY_PATH; CLASS(filename_maybe_null,to_name)(to_pathname, uflags); if (!to_name && to_dfd >= 0) { CLASS(fd_raw, f_to)(to_dfd); if (fd_empty(f_to)) return -EBADF; to_path = fd_file(f_to)->f_path; path_get(&to_path); } else { lflags = 0; if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW; if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; ret = filename_lookup(to_dfd, to_name, lflags, &to_path, NULL); if (ret) return ret; } uflags = 0; if (flags & MOVE_MOUNT_F_EMPTY_PATH) uflags = AT_EMPTY_PATH; CLASS(filename_maybe_null,from_name)(from_pathname, uflags); if (!from_name && from_dfd >= 0) { CLASS(fd_raw, f_from)(from_dfd); if (fd_empty(f_from)) return -EBADF; return vfs_move_mount(&fd_file(f_from)->f_path, &to_path, mflags); } lflags = 0; if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW; if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; ret = filename_lookup(from_dfd, from_name, lflags, &from_path, NULL); if (ret) return ret; return vfs_move_mount(&from_path, &to_path, mflags); } /* * Return true if path is reachable from root * * locks: mount_locked_reader || namespace_shared && is_mounted(mnt) */ bool is_path_reachable(struct mount *mnt, struct dentry *dentry, const struct path *root) { while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) { dentry = mnt->mnt_mountpoint; mnt = mnt->mnt_parent; 
} return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry); } bool path_is_under(const struct path *path1, const struct path *path2) { guard(mount_locked_reader)(); return is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); } EXPORT_SYMBOL(path_is_under); int path_pivot_root(struct path *new, struct path *old) { struct path root __free(path_put) = {}; struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent; int error; if (!may_mount()) return -EPERM; error = security_sb_pivotroot(old, new); if (error) return error; get_fs_root(current->fs, &root); LOCK_MOUNT(old_mp, old); old_mnt = old_mp.parent; if (IS_ERR(old_mnt)) return PTR_ERR(old_mnt); new_mnt = real_mount(new->mnt); root_mnt = real_mount(root.mnt); ex_parent = new_mnt->mnt_parent; root_parent = root_mnt->mnt_parent; if (IS_MNT_SHARED(old_mnt) || IS_MNT_SHARED(ex_parent) || IS_MNT_SHARED(root_parent)) return -EINVAL; if (!check_mnt(root_mnt) || !check_mnt(new_mnt)) return -EINVAL; if (new_mnt->mnt.mnt_flags & MNT_LOCKED) return -EINVAL; if (d_unlinked(new->dentry)) return -ENOENT; if (new_mnt == root_mnt || old_mnt == root_mnt) return -EBUSY; /* loop, on the same file system */ if (!path_mounted(&root)) return -EINVAL; /* not a mountpoint */ if (!mnt_has_parent(root_mnt)) return -EINVAL; /* absolute root */ if (!path_mounted(new)) return -EINVAL; /* not a mountpoint */ if (!mnt_has_parent(new_mnt)) return -EINVAL; /* absolute root */ /* make sure we can reach put_old from new_root */ if (!is_path_reachable(old_mnt, old_mp.mp->m_dentry, new)) return -EINVAL; /* make certain new is below the root */ if (!is_path_reachable(new_mnt, new->dentry, &root)) return -EINVAL; lock_mount_hash(); umount_mnt(new_mnt); if (root_mnt->mnt.mnt_flags & MNT_LOCKED) { new_mnt->mnt.mnt_flags |= MNT_LOCKED; root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; } /* mount new_root on / */ attach_mnt(new_mnt, root_parent, root_mnt->mnt_mp); umount_mnt(root_mnt); /* mount old root on put_old */ attach_mnt(root_mnt, old_mnt, old_mp.mp); touch_mnt_namespace(current->nsproxy->mnt_ns); /* A moved mount should not expire automatically */ list_del_init(&new_mnt->mnt_expire); unlock_mount_hash(); mnt_notify_add(root_mnt); mnt_notify_add(new_mnt); chroot_fs_refs(&root, new); return 0; } /* * pivot_root Semantics: * Moves the root file system of the current process to the directory put_old, * makes new_root as the new root file system of the current process, and sets * root/cwd of all processes which had them on the current root to new_root. * * Restrictions: * The new_root and put_old must be directories, and must not be on the * same file system as the current process root. The put_old must be * underneath new_root, i.e. adding a non-zero number of /.. to the string * pointed to by put_old must yield the same directory as new_root. No other * file system may be mounted on put_old. After all, new_root is a mountpoint. * * The immutable nullfs filesystem is mounted as the true root of the VFS * hierarchy. The mutable rootfs (tmpfs/ramfs) is layered on top of this, * allowing pivot_root() to work normally from initramfs. * * Notes: * - we don't move root/cwd if they are not at the root (reason: if something * cared enough to change them, it's probably wrong to force them elsewhere) * - it's okay to pick a root that isn't the root of a file system, e.g. * /nfs/my_root where /nfs is the mount point. It must be a mountpoint, * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root * first. 
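 *
 * For illustration only (device and directory names below are made up, and
 * pivot_root() may have to be invoked via syscall(SYS_pivot_root, ...) if
 * libc provides no wrapper), a typical initramfs hand-over built on these
 * semantics looks roughly like:
 *
 *	mount("/dev/sda1", "/newroot", "ext4", 0, NULL);
 *	mkdir("/newroot/put_old", 0700);
 *	chdir("/newroot");
 *	pivot_root(".", "put_old");
 *	chroot(".");
 *	umount2("/put_old", MNT_DETACH);
 *
 * which makes /newroot the new root, leaves the old root visible under
 * /put_old, and then detaches it.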
*/ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, const char __user *, put_old) { struct path new __free(path_put) = {}; struct path old __free(path_put) = {}; int error; error = user_path_at(AT_FDCWD, new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new); if (error) return error; error = user_path_at(AT_FDCWD, put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old); if (error) return error; return path_pivot_root(&new, &old); } static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt) { unsigned int flags = mnt->mnt.mnt_flags; /* flags to clear */ flags &= ~kattr->attr_clr; /* flags to raise */ flags |= kattr->attr_set; return flags; } static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { struct vfsmount *m = &mnt->mnt; struct user_namespace *fs_userns = m->mnt_sb->s_user_ns; if (!kattr->mnt_idmap) return 0; /* * Creating an idmapped mount with the filesystem wide idmapping * doesn't make sense so block that. We don't allow mushy semantics. */ if (kattr->mnt_userns == m->mnt_sb->s_user_ns) return -EINVAL; /* * We only allow an mount to change it's idmapping if it has * never been accessible to userspace. */ if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE) && is_idmapped_mnt(m)) return -EPERM; /* The underlying filesystem doesn't support idmapped mounts yet. */ if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP)) return -EINVAL; /* The filesystem has turned off idmapped mounts. */ if (m->mnt_sb->s_iflags & SB_I_NOIDMAP) return -EINVAL; /* We're not controlling the superblock. */ if (!ns_capable(fs_userns, CAP_SYS_ADMIN)) return -EPERM; /* Mount has already been visible in the filesystem hierarchy. */ if (!is_anon_ns(mnt->mnt_ns)) return -EINVAL; return 0; } /** * mnt_allow_writers() - check whether the attribute change allows writers * @kattr: the new mount attributes * @mnt: the mount to which @kattr will be applied * * Check whether thew new mount attributes in @kattr allow concurrent writers. * * Return: true if writers need to be held, false if not */ static inline bool mnt_allow_writers(const struct mount_kattr *kattr, const struct mount *mnt) { return (!(kattr->attr_set & MNT_READONLY) || (mnt->mnt.mnt_flags & MNT_READONLY)) && !kattr->mnt_idmap; } static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) { struct mount *m; int err; for (m = mnt; m; m = next_mnt(m, mnt)) { if (!can_change_locked_flags(m, recalc_flags(kattr, m))) { err = -EPERM; break; } err = can_idmap_mount(kattr, m); if (err) break; if (!mnt_allow_writers(kattr, m)) { err = mnt_hold_writers(m); if (err) { m = next_mnt(m, mnt); break; } } if (!(kattr->kflags & MOUNT_KATTR_RECURSE)) return 0; } if (err) { /* undo all mnt_hold_writers() we'd done */ for (struct mount *p = mnt; p != m; p = next_mnt(p, mnt)) mnt_unhold_writers(p); } return err; } static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { struct mnt_idmap *old_idmap; if (!kattr->mnt_idmap) return; old_idmap = mnt_idmap(&mnt->mnt); /* Pairs with smp_load_acquire() in mnt_idmap(). */ smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap)); mnt_idmap_put(old_idmap); } static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) { struct mount *m; for (m = mnt; m; m = next_mnt(m, mnt)) { unsigned int flags; do_idmap_mount(kattr, m); flags = recalc_flags(kattr, m); WRITE_ONCE(m->mnt.mnt_flags, flags); /* If we had to hold writers unblock them. 
*/ mnt_unhold_writers(m); if (kattr->propagation) change_mnt_propagation(m, kattr->propagation); if (!(kattr->kflags & MOUNT_KATTR_RECURSE)) break; } touch_mnt_namespace(mnt->mnt_ns); } static int do_mount_setattr(const struct path *path, struct mount_kattr *kattr) { struct mount *mnt = real_mount(path->mnt); int err = 0; if (!path_mounted(path)) return -EINVAL; if (kattr->mnt_userns) { struct mnt_idmap *mnt_idmap; mnt_idmap = alloc_mnt_idmap(kattr->mnt_userns); if (IS_ERR(mnt_idmap)) return PTR_ERR(mnt_idmap); kattr->mnt_idmap = mnt_idmap; } if (kattr->propagation) { /* * Only take namespace_lock() if we're actually changing * propagation. */ namespace_lock(); if (kattr->propagation == MS_SHARED) { err = invent_group_ids(mnt, kattr->kflags & MOUNT_KATTR_RECURSE); if (err) { namespace_unlock(); return err; } } } err = -EINVAL; lock_mount_hash(); if (!anon_ns_root(mnt) && !check_mnt(mnt)) goto out; /* * First, we get the mount tree in a shape where we can change mount * properties without failure. If we succeeded to do so we commit all * changes and if we failed we clean up. */ err = mount_setattr_prepare(kattr, mnt); if (!err) mount_setattr_commit(kattr, mnt); out: unlock_mount_hash(); if (kattr->propagation) { if (err) cleanup_group_ids(mnt, NULL); namespace_unlock(); } return err; } static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, struct mount_kattr *kattr) { struct ns_common *ns; struct user_namespace *mnt_userns; if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP)) return 0; if (attr->attr_clr & MOUNT_ATTR_IDMAP) { /* * We can only remove an idmapping if it's never been * exposed to userspace. */ if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE)) return -EINVAL; /* * Removal of idmappings is equivalent to setting * nop_mnt_idmap. */ if (!(attr->attr_set & MOUNT_ATTR_IDMAP)) { kattr->mnt_idmap = &nop_mnt_idmap; return 0; } } if (attr->userns_fd > INT_MAX) return -EINVAL; CLASS(fd, f)(attr->userns_fd); if (fd_empty(f)) return -EBADF; if (!proc_ns_file(fd_file(f))) return -EINVAL; ns = get_proc_ns(file_inode(fd_file(f))); if (ns->ns_type != CLONE_NEWUSER) return -EINVAL; /* * The initial idmapping cannot be used to create an idmapped * mount. We use the initial idmapping as an indicator of a mount * that is not idmapped. It can simply be passed into helpers that * are aware of idmapped mounts as a convenient shortcut. A user * can just create a dedicated identity mapping to achieve the same * result. */ mnt_userns = container_of(ns, struct user_namespace, ns); if (mnt_userns == &init_user_ns) return -EPERM; /* We're not controlling the target namespace. */ if (!ns_capable(mnt_userns, CAP_SYS_ADMIN)) return -EPERM; kattr->mnt_userns = get_user_ns(mnt_userns); return 0; } static int build_mount_kattr(const struct mount_attr *attr, size_t usize, struct mount_kattr *kattr) { if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS) return -EINVAL; if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1) return -EINVAL; kattr->propagation = attr->propagation; if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS) return -EINVAL; kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set); kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr); /* * Since the MOUNT_ATTR_<atime> values are an enum, not a bitmap, * users wanting to transition to a different atime setting cannot * simply specify the atime setting in @attr_set, but must also * specify MOUNT_ATTR__ATIME in the @attr_clr field. 
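 *
 * For example (an illustrative userspace sketch, not part of this file;
 * mnt_fd is assumed to be a file descriptor referring to the mount),
 * switching an existing mount to relatime would be requested as:
 *
 *	struct mount_attr attr = {
 *		.attr_set = MOUNT_ATTR_RELATIME,
 *		.attr_clr = MOUNT_ATTR__ATIME,
 *	};
 *	mount_setattr(mnt_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr));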
* So ensure that MOUNT_ATTR__ATIME can't be partially set in * @attr_clr and that @attr_set can't have any atime bits set if * MOUNT_ATTR__ATIME isn't set in @attr_clr. */ if (attr->attr_clr & MOUNT_ATTR__ATIME) { if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME) return -EINVAL; /* * Clear all previous time settings as they are mutually * exclusive. */ kattr->attr_clr |= MNT_RELATIME | MNT_NOATIME; switch (attr->attr_set & MOUNT_ATTR__ATIME) { case MOUNT_ATTR_RELATIME: kattr->attr_set |= MNT_RELATIME; break; case MOUNT_ATTR_NOATIME: kattr->attr_set |= MNT_NOATIME; break; case MOUNT_ATTR_STRICTATIME: break; default: return -EINVAL; } } else { if (attr->attr_set & MOUNT_ATTR__ATIME) return -EINVAL; } return build_mount_idmapped(attr, usize, kattr); } static void finish_mount_kattr(struct mount_kattr *kattr) { if (kattr->mnt_userns) { put_user_ns(kattr->mnt_userns); kattr->mnt_userns = NULL; } if (kattr->mnt_idmap) mnt_idmap_put(kattr->mnt_idmap); } static int wants_mount_setattr(struct mount_attr __user *uattr, size_t usize, struct mount_kattr *kattr) { int ret; struct mount_attr attr; BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0); if (unlikely(usize > PAGE_SIZE)) return -E2BIG; if (unlikely(usize < MOUNT_ATTR_SIZE_VER0)) return -EINVAL; if (!may_mount()) return -EPERM; ret = copy_struct_from_user(&attr, sizeof(attr), uattr, usize); if (ret) return ret; /* Don't bother walking through the mounts if this is a nop. */ if (attr.attr_set == 0 && attr.attr_clr == 0 && attr.propagation == 0) return 0; /* Tell caller to not bother. */ ret = build_mount_kattr(&attr, usize, kattr); if (ret < 0) return ret; return 1; } SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, unsigned int, flags, struct mount_attr __user *, uattr, size_t, usize) { int err; struct path target; struct mount_kattr kattr; unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; if (flags & ~(AT_EMPTY_PATH | AT_RECURSIVE | AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) return -EINVAL; if (flags & AT_NO_AUTOMOUNT) lookup_flags &= ~LOOKUP_AUTOMOUNT; if (flags & AT_SYMLINK_NOFOLLOW) lookup_flags &= ~LOOKUP_FOLLOW; kattr = (struct mount_kattr) { .lookup_flags = lookup_flags, }; if (flags & AT_RECURSIVE) kattr.kflags |= MOUNT_KATTR_RECURSE; err = wants_mount_setattr(uattr, usize, &kattr); if (err <= 0) return err; CLASS(filename_uflags, name)(path, flags); err = filename_lookup(dfd, name, kattr.lookup_flags, &target, NULL); if (!err) { err = do_mount_setattr(&target, &kattr); path_put(&target); } finish_mount_kattr(&kattr); return err; } SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename, unsigned, flags, struct mount_attr __user *, uattr, size_t, usize) { if (!uattr && usize) return -EINVAL; FD_PREPARE(fdf, flags, vfs_open_tree(dfd, filename, flags)); if (fdf.err) return fdf.err; if (uattr) { struct mount_kattr kattr = {}; struct file *file = fd_prepare_file(fdf); int ret; if (flags & OPEN_TREE_CLONE) kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE; if (flags & AT_RECURSIVE) kattr.kflags |= MOUNT_KATTR_RECURSE; ret = wants_mount_setattr(uattr, usize, &kattr); if (ret > 0) { ret = do_mount_setattr(&file->f_path, &kattr); finish_mount_kattr(&kattr); } if (ret) return ret; } return fd_publish(fdf); } int show_path(struct seq_file *m, struct dentry *root) { if (root->d_sb->s_op->show_path) return root->d_sb->s_op->show_path(m, root); seq_dentry(m, root, " \t\n\\"); return 0; } static struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns) { struct mount *mnt = 
mnt_find_id_at(ns, id); if (!mnt || mnt->mnt_id_unique != id) return NULL; return &mnt->mnt; } struct kstatmount { struct statmount __user *buf; size_t bufsize; struct vfsmount *mnt; struct mnt_idmap *idmap; u64 mask; struct path root; struct seq_file seq; /* Must be last --ends in a flexible-array member. */ struct statmount sm; }; static u64 mnt_to_attr_flags(struct vfsmount *mnt) { unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags); u64 attr_flags = 0; if (mnt_flags & MNT_READONLY) attr_flags |= MOUNT_ATTR_RDONLY; if (mnt_flags & MNT_NOSUID) attr_flags |= MOUNT_ATTR_NOSUID; if (mnt_flags & MNT_NODEV) attr_flags |= MOUNT_ATTR_NODEV; if (mnt_flags & MNT_NOEXEC) attr_flags |= MOUNT_ATTR_NOEXEC; if (mnt_flags & MNT_NODIRATIME) attr_flags |= MOUNT_ATTR_NODIRATIME; if (mnt_flags & MNT_NOSYMFOLLOW) attr_flags |= MOUNT_ATTR_NOSYMFOLLOW; if (mnt_flags & MNT_NOATIME) attr_flags |= MOUNT_ATTR_NOATIME; else if (mnt_flags & MNT_RELATIME) attr_flags |= MOUNT_ATTR_RELATIME; else attr_flags |= MOUNT_ATTR_STRICTATIME; if (is_idmapped_mnt(mnt)) attr_flags |= MOUNT_ATTR_IDMAP; return attr_flags; } static u64 mnt_to_propagation_flags(struct mount *m) { u64 propagation = 0; if (IS_MNT_SHARED(m)) propagation |= MS_SHARED; if (IS_MNT_SLAVE(m)) propagation |= MS_SLAVE; if (IS_MNT_UNBINDABLE(m)) propagation |= MS_UNBINDABLE; if (!propagation) propagation |= MS_PRIVATE; return propagation; } u64 vfsmount_to_propagation_flags(struct vfsmount *mnt) { return mnt_to_propagation_flags(real_mount(mnt)); } EXPORT_SYMBOL_GPL(vfsmount_to_propagation_flags); static void statmount_sb_basic(struct kstatmount *s) { struct super_block *sb = s->mnt->mnt_sb; s->sm.mask |= STATMOUNT_SB_BASIC; s->sm.sb_dev_major = MAJOR(sb->s_dev); s->sm.sb_dev_minor = MINOR(sb->s_dev); s->sm.sb_magic = sb->s_magic; s->sm.sb_flags = sb->s_flags & (SB_RDONLY|SB_SYNCHRONOUS|SB_DIRSYNC|SB_LAZYTIME); } static void statmount_mnt_basic(struct kstatmount *s) { struct mount *m = real_mount(s->mnt); s->sm.mask |= STATMOUNT_MNT_BASIC; s->sm.mnt_id = m->mnt_id_unique; s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique; s->sm.mnt_id_old = m->mnt_id; s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id; s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt); s->sm.mnt_propagation = mnt_to_propagation_flags(m); s->sm.mnt_peer_group = m->mnt_group_id; s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0; } static void statmount_propagate_from(struct kstatmount *s) { struct mount *m = real_mount(s->mnt); s->sm.mask |= STATMOUNT_PROPAGATE_FROM; if (IS_MNT_SLAVE(m)) s->sm.propagate_from = get_dominating_id(m, ¤t->fs->root); } static int statmount_mnt_root(struct kstatmount *s, struct seq_file *seq) { int ret; size_t start = seq->count; ret = show_path(seq, s->mnt->mnt_root); if (ret) return ret; if (unlikely(seq_has_overflowed(seq))) return -EAGAIN; /* * Unescape the result. It would be better if supplied string was not * escaped in the first place, but that's a pretty invasive change. */ seq->buf[seq->count] = '\0'; seq->count = start; seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL)); return 0; } static int statmount_mnt_point(struct kstatmount *s, struct seq_file *seq) { struct vfsmount *mnt = s->mnt; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; int err; err = seq_path_root(seq, &mnt_path, &s->root, ""); return err == SEQ_SKIP ? 
0 : err; } static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq) { struct super_block *sb = s->mnt->mnt_sb; seq_puts(seq, sb->s_type->name); return 0; } static void statmount_fs_subtype(struct kstatmount *s, struct seq_file *seq) { struct super_block *sb = s->mnt->mnt_sb; if (sb->s_subtype) seq_puts(seq, sb->s_subtype); } static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq) { struct super_block *sb = s->mnt->mnt_sb; struct mount *r = real_mount(s->mnt); if (sb->s_op->show_devname) { size_t start = seq->count; int ret; ret = sb->s_op->show_devname(seq, s->mnt->mnt_root); if (ret) return ret; if (unlikely(seq_has_overflowed(seq))) return -EAGAIN; /* Unescape the result */ seq->buf[seq->count] = '\0'; seq->count = start; seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL)); } else { seq_puts(seq, r->mnt_devname); } return 0; } static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns) { s->sm.mask |= STATMOUNT_MNT_NS_ID; s->sm.mnt_ns_id = ns->ns.ns_id; } static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) { struct vfsmount *mnt = s->mnt; struct super_block *sb = mnt->mnt_sb; size_t start = seq->count; int err; err = security_sb_show_options(seq, sb); if (err) return err; if (sb->s_op->show_options) { err = sb->s_op->show_options(seq, mnt->mnt_root); if (err) return err; } if (unlikely(seq_has_overflowed(seq))) return -EAGAIN; if (seq->count == start) return 0; /* skip leading comma */ memmove(seq->buf + start, seq->buf + start + 1, seq->count - start - 1); seq->count--; return 0; } static inline int statmount_opt_process(struct seq_file *seq, size_t start) { char *buf_end, *opt_end, *src, *dst; int count = 0; if (unlikely(seq_has_overflowed(seq))) return -EAGAIN; buf_end = seq->buf + seq->count; dst = seq->buf + start; src = dst + 1; /* skip initial comma */ if (src >= buf_end) { seq->count = start; return 0; } *buf_end = '\0'; for (; src < buf_end; src = opt_end + 1) { opt_end = strchrnul(src, ','); *opt_end = '\0'; dst += string_unescape(src, dst, 0, UNESCAPE_OCTAL) + 1; if (WARN_ON_ONCE(++count == INT_MAX)) return -EOVERFLOW; } seq->count = dst - 1 - seq->buf; return count; } static int statmount_opt_array(struct kstatmount *s, struct seq_file *seq) { struct vfsmount *mnt = s->mnt; struct super_block *sb = mnt->mnt_sb; size_t start = seq->count; int err; if (!sb->s_op->show_options) return 0; err = sb->s_op->show_options(seq, mnt->mnt_root); if (err) return err; err = statmount_opt_process(seq, start); if (err < 0) return err; s->sm.opt_num = err; return 0; } static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq) { struct vfsmount *mnt = s->mnt; struct super_block *sb = mnt->mnt_sb; size_t start = seq->count; int err; err = security_sb_show_options(seq, sb); if (err) return err; err = statmount_opt_process(seq, start); if (err < 0) return err; s->sm.opt_sec_num = err; return 0; } static inline int statmount_mnt_uidmap(struct kstatmount *s, struct seq_file *seq) { int ret; ret = statmount_mnt_idmap(s->idmap, seq, true); if (ret < 0) return ret; s->sm.mnt_uidmap_num = ret; /* * Always raise STATMOUNT_MNT_UIDMAP even if there are no valid * mappings. This allows userspace to distinguish between a * non-idmapped mount and an idmapped mount where none of the * individual mappings are valid in the caller's idmapping. 
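 *
 * In other words, a statmount() caller that sees STATMOUNT_MNT_UIDMAP
 * set in sm.mask together with sm.mnt_uidmap_num == 0 is looking at an
 * idmapped mount whose mappings don't resolve in its own idmapping,
 * while an absent STATMOUNT_MNT_UIDMAP bit means the mount is not
 * idmapped at all.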
*/ if (is_valid_mnt_idmap(s->idmap)) s->sm.mask |= STATMOUNT_MNT_UIDMAP; return 0; } static inline int statmount_mnt_gidmap(struct kstatmount *s, struct seq_file *seq) { int ret; ret = statmount_mnt_idmap(s->idmap, seq, false); if (ret < 0) return ret; s->sm.mnt_gidmap_num = ret; /* * Always raise STATMOUNT_MNT_GIDMAP even if there are no valid * mappings. This allows userspace to distinguish between a * non-idmapped mount and an idmapped mount where none of the * individual mappings are valid in the caller's idmapping. */ if (is_valid_mnt_idmap(s->idmap)) s->sm.mask |= STATMOUNT_MNT_GIDMAP; return 0; } static int statmount_string(struct kstatmount *s, u64 flag) { int ret = 0; size_t kbufsize; struct seq_file *seq = &s->seq; struct statmount *sm = &s->sm; u32 start, *offp; /* Reserve an empty string at the beginning for any unset offsets */ if (!seq->count) seq_putc(seq, 0); start = seq->count; switch (flag) { case STATMOUNT_FS_TYPE: offp = &sm->fs_type; ret = statmount_fs_type(s, seq); break; case STATMOUNT_MNT_ROOT: offp = &sm->mnt_root; ret = statmount_mnt_root(s, seq); break; case STATMOUNT_MNT_POINT: offp = &sm->mnt_point; ret = statmount_mnt_point(s, seq); break; case STATMOUNT_MNT_OPTS: offp = &sm->mnt_opts; ret = statmount_mnt_opts(s, seq); break; case STATMOUNT_OPT_ARRAY: offp = &sm->opt_array; ret = statmount_opt_array(s, seq); break; case STATMOUNT_OPT_SEC_ARRAY: offp = &sm->opt_sec_array; ret = statmount_opt_sec_array(s, seq); break; case STATMOUNT_FS_SUBTYPE: offp = &sm->fs_subtype; statmount_fs_subtype(s, seq); break; case STATMOUNT_SB_SOURCE: offp = &sm->sb_source; ret = statmount_sb_source(s, seq); break; case STATMOUNT_MNT_UIDMAP: offp = &sm->mnt_uidmap; ret = statmount_mnt_uidmap(s, seq); break; case STATMOUNT_MNT_GIDMAP: offp = &sm->mnt_gidmap; ret = statmount_mnt_gidmap(s, seq); break; default: WARN_ON_ONCE(true); return -EINVAL; } /* * If nothing was emitted, return to avoid setting the flag * and terminating the buffer. */ if (seq->count == start) return ret; if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize))) return -EOVERFLOW; if (kbufsize >= s->bufsize) return -EOVERFLOW; /* signal a retry */ if (unlikely(seq_has_overflowed(seq))) return -EAGAIN; if (ret) return ret; seq->buf[seq->count++] = '\0'; sm->mask |= flag; *offp = start; return 0; } static int copy_statmount_to_user(struct kstatmount *s) { struct statmount *sm = &s->sm; struct seq_file *seq = &s->seq; char __user *str = ((char __user *)s->buf) + sizeof(*sm); size_t copysize = min_t(size_t, s->bufsize, sizeof(*sm)); if (seq->count && copy_to_user(str, seq->buf, seq->count)) return -EFAULT; /* Return the number of bytes copied to the buffer */ sm->size = copysize + seq->count; if (copy_to_user(s->buf, sm, copysize)) return -EFAULT; return 0; } static struct mount *listmnt_next(struct mount *curr, bool reverse) { struct rb_node *node; if (reverse) node = rb_prev(&curr->mnt_node); else node = rb_next(&curr->mnt_node); return node_to_mount(node); } static int grab_requested_root(struct mnt_namespace *ns, struct path *root) { struct mount *first, *child; rwsem_assert_held(&namespace_sem); /* We're looking at our own ns, just use get_fs_root. */ if (ns == current->nsproxy->mnt_ns) { get_fs_root(current->fs, root); return 0; } /* * We have to find the first mount in our ns and use that, however it * may not exist, so handle that properly. 
	 */
	if (mnt_ns_empty(ns))
		return -ENOENT;

	first = ns->root;
	for (child = node_to_mount(ns->mnt_first_node); child;
	     child = listmnt_next(child, false)) {
		if (child != first && child->mnt_parent == first)
			break;
	}
	if (!child)
		return -ENOENT;

	root->mnt = mntget(&child->mnt);
	root->dentry = dget(root->mnt->mnt_root);
	return 0;
}

/* This must be updated whenever a new flag is added */
#define STATMOUNT_SUPPORTED (STATMOUNT_SB_BASIC | \
			     STATMOUNT_MNT_BASIC | \
			     STATMOUNT_PROPAGATE_FROM | \
			     STATMOUNT_MNT_ROOT | \
			     STATMOUNT_MNT_POINT | \
			     STATMOUNT_FS_TYPE | \
			     STATMOUNT_MNT_NS_ID | \
			     STATMOUNT_MNT_OPTS | \
			     STATMOUNT_FS_SUBTYPE | \
			     STATMOUNT_SB_SOURCE | \
			     STATMOUNT_OPT_ARRAY | \
			     STATMOUNT_OPT_SEC_ARRAY | \
			     STATMOUNT_SUPPORTED_MASK | \
			     STATMOUNT_MNT_UIDMAP | \
			     STATMOUNT_MNT_GIDMAP)

/* locks: namespace_shared */
static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
			struct file *mnt_file, struct mnt_namespace *ns)
{
	int err;

	if (mnt_file) {
		WARN_ON_ONCE(ns != NULL);
		s->mnt = mnt_file->f_path.mnt;
		ns = real_mount(s->mnt)->mnt_ns;
		if (IS_ERR(ns))
			return PTR_ERR(ns);
		if (!ns)
			/*
			 * We can't set mount point and mnt_ns_id since we
			 * don't have a ns for the mount. This can happen if
			 * the mount is unmounted with MNT_DETACH.
			 */
			s->mask &= ~(STATMOUNT_MNT_POINT | STATMOUNT_MNT_NS_ID);
	} else {
		/* Has the namespace already been emptied? */
		if (mnt_ns_id && mnt_ns_empty(ns))
			return -ENOENT;

		s->mnt = lookup_mnt_in_ns(mnt_id, ns);
		if (!s->mnt)
			return -ENOENT;
	}

	if (ns) {
		err = grab_requested_root(ns, &s->root);
		if (err)
			return err;

		if (!mnt_file) {
			struct mount *m;

			/*
			 * Don't trigger audit denials. We just want to
			 * determine what mounts to show users.
			 */
			m = real_mount(s->mnt);
			if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) &&
			    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
				return -EPERM;
		}
	}

	err = security_sb_statfs(s->mnt->mnt_root);
	if (err)
		return err;

	/*
	 * Note that mount properties in mnt->mnt_flags, mnt->mnt_idmap
	 * can change concurrently as we only hold the read-side of the
	 * namespace semaphore and mount properties may change with only
	 * the mount lock held.
	 *
	 * We could sample the mount lock sequence counter to detect
	 * those changes and retry. But it's not worth it. Worst that
	 * happens is that the mnt->mnt_idmap pointer is already changed
	 * while mnt->mnt_flags isn't, or vice versa. So what.
	 *
	 * Both mnt->mnt_flags and mnt->mnt_idmap are set and retrieved
	 * via READ_ONCE()/WRITE_ONCE() and guard against theoretical
	 * torn read/write. That's all we care about right now.
*/ s->idmap = mnt_idmap(s->mnt); if (s->mask & STATMOUNT_MNT_BASIC) statmount_mnt_basic(s); if (s->mask & STATMOUNT_SB_BASIC) statmount_sb_basic(s); if (s->mask & STATMOUNT_PROPAGATE_FROM) statmount_propagate_from(s); if (s->mask & STATMOUNT_FS_TYPE) err = statmount_string(s, STATMOUNT_FS_TYPE); if (!err && s->mask & STATMOUNT_MNT_ROOT) err = statmount_string(s, STATMOUNT_MNT_ROOT); if (!err && s->mask & STATMOUNT_MNT_POINT) err = statmount_string(s, STATMOUNT_MNT_POINT); if (!err && s->mask & STATMOUNT_MNT_OPTS) err = statmount_string(s, STATMOUNT_MNT_OPTS); if (!err && s->mask & STATMOUNT_OPT_ARRAY) err = statmount_string(s, STATMOUNT_OPT_ARRAY); if (!err && s->mask & STATMOUNT_OPT_SEC_ARRAY) err = statmount_string(s, STATMOUNT_OPT_SEC_ARRAY); if (!err && s->mask & STATMOUNT_FS_SUBTYPE) err = statmount_string(s, STATMOUNT_FS_SUBTYPE); if (!err && s->mask & STATMOUNT_SB_SOURCE) err = statmount_string(s, STATMOUNT_SB_SOURCE); if (!err && s->mask & STATMOUNT_MNT_UIDMAP) err = statmount_string(s, STATMOUNT_MNT_UIDMAP); if (!err && s->mask & STATMOUNT_MNT_GIDMAP) err = statmount_string(s, STATMOUNT_MNT_GIDMAP); if (!err && s->mask & STATMOUNT_MNT_NS_ID) statmount_mnt_ns_id(s, ns); if (!err && s->mask & STATMOUNT_SUPPORTED_MASK) { s->sm.mask |= STATMOUNT_SUPPORTED_MASK; s->sm.supported_mask = STATMOUNT_SUPPORTED; } if (err) return err; /* Are there bits in the return mask not present in STATMOUNT_SUPPORTED? */ WARN_ON_ONCE(~STATMOUNT_SUPPORTED & s->sm.mask); return 0; } static inline bool retry_statmount(const long ret, size_t *seq_size) { if (likely(ret != -EAGAIN)) return false; if (unlikely(check_mul_overflow(*seq_size, 2, seq_size))) return false; if (unlikely(*seq_size > MAX_RW_COUNT)) return false; return true; } #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \ STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \ STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \ STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY | \ STATMOUNT_MNT_UIDMAP | STATMOUNT_MNT_GIDMAP) static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, struct statmount __user *buf, size_t bufsize, size_t seq_size) { if (!access_ok(buf, bufsize)) return -EFAULT; memset(ks, 0, sizeof(*ks)); ks->mask = kreq->param; ks->buf = buf; ks->bufsize = bufsize; if (ks->mask & STATMOUNT_STRING_REQ) { if (bufsize == sizeof(ks->sm)) return -EOVERFLOW; ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT); if (!ks->seq.buf) return -ENOMEM; ks->seq.size = seq_size; } return 0; } static int copy_mnt_id_req(const struct mnt_id_req __user *req, struct mnt_id_req *kreq, unsigned int flags) { int ret; size_t usize; BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER1); ret = get_user(usize, &req->size); if (ret) return -EFAULT; if (unlikely(usize > PAGE_SIZE)) return -E2BIG; if (unlikely(usize < MNT_ID_REQ_SIZE_VER0)) return -EINVAL; memset(kreq, 0, sizeof(*kreq)); ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize); if (ret) return ret; if (flags & STATMOUNT_BY_FD) { if (kreq->mnt_id || kreq->mnt_ns_id) return -EINVAL; } else { if (kreq->mnt_ns_fd != 0 && kreq->mnt_ns_id) return -EINVAL; /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET) return -EINVAL; } return 0; } /* * If the user requested a specific mount namespace id, look that up and return * that, or if not simply grab a passive reference on our mount namespace and * return that. 
*/ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq) { struct mnt_namespace *mnt_ns; if (kreq->mnt_ns_id) { mnt_ns = lookup_mnt_ns(kreq->mnt_ns_id); if (!mnt_ns) return ERR_PTR(-ENOENT); } else if (kreq->mnt_ns_fd) { struct ns_common *ns; CLASS(fd, f)(kreq->mnt_ns_fd); if (fd_empty(f)) return ERR_PTR(-EBADF); if (!proc_ns_file(fd_file(f))) return ERR_PTR(-EINVAL); ns = get_proc_ns(file_inode(fd_file(f))); if (ns->ns_type != CLONE_NEWNS) return ERR_PTR(-EINVAL); mnt_ns = to_mnt_ns(ns); refcount_inc(&mnt_ns->passive); } else { mnt_ns = current->nsproxy->mnt_ns; refcount_inc(&mnt_ns->passive); } return mnt_ns; } SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, struct statmount __user *, buf, size_t, bufsize, unsigned int, flags) { struct mnt_namespace *ns __free(mnt_ns_release) = NULL; struct kstatmount *ks __free(kfree) = NULL; struct file *mnt_file __free(fput) = NULL; struct mnt_id_req kreq; /* We currently support retrieval of 3 strings. */ size_t seq_size = 3 * PATH_MAX; int ret; if (flags & ~STATMOUNT_BY_FD) return -EINVAL; ret = copy_mnt_id_req(req, &kreq, flags); if (ret) return ret; if (flags & STATMOUNT_BY_FD) { mnt_file = fget_raw(kreq.mnt_fd); if (!mnt_file) return -EBADF; /* do_statmount sets ns in case of STATMOUNT_BY_FD */ } else { ns = grab_requested_mnt_ns(&kreq); if (IS_ERR(ns)) return PTR_ERR(ns); if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) && !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; } ks = kmalloc(sizeof(*ks), GFP_KERNEL_ACCOUNT); if (!ks) return -ENOMEM; retry: ret = prepare_kstatmount(ks, &kreq, buf, bufsize, seq_size); if (ret) return ret; scoped_guard(namespace_shared) ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, mnt_file, ns); if (!ret) ret = copy_statmount_to_user(ks); kvfree(ks->seq.buf); path_put(&ks->root); if (retry_statmount(ret, &seq_size)) goto retry; return ret; } struct klistmount { u64 last_mnt_id; u64 mnt_parent_id; u64 *kmnt_ids; u32 nr_mnt_ids; struct mnt_namespace *ns; struct path root; }; /* locks: namespace_shared */ static ssize_t do_listmount(struct klistmount *kls, bool reverse) { struct mnt_namespace *ns = kls->ns; u64 mnt_parent_id = kls->mnt_parent_id; u64 last_mnt_id = kls->last_mnt_id; u64 *mnt_ids = kls->kmnt_ids; size_t nr_mnt_ids = kls->nr_mnt_ids; struct path orig; struct mount *r, *first; ssize_t ret; rwsem_assert_held(&namespace_sem); ret = grab_requested_root(ns, &kls->root); if (ret) return ret; if (mnt_parent_id == LSMT_ROOT) { orig = kls->root; } else { orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns); if (!orig.mnt) return -ENOENT; orig.dentry = orig.mnt->mnt_root; } /* * Don't trigger audit denials. We just want to determine what * mounts to show users. 
*/ if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &kls->root) && !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; ret = security_sb_statfs(orig.dentry); if (ret) return ret; if (!last_mnt_id) { if (reverse) first = node_to_mount(ns->mnt_last_node); else first = node_to_mount(ns->mnt_first_node); } else { if (reverse) first = mnt_find_id_at_reverse(ns, last_mnt_id - 1); else first = mnt_find_id_at(ns, last_mnt_id + 1); } for (ret = 0, r = first; r && nr_mnt_ids; r = listmnt_next(r, reverse)) { if (r->mnt_id_unique == mnt_parent_id) continue; if (!is_path_reachable(r, r->mnt.mnt_root, &orig)) continue; *mnt_ids = r->mnt_id_unique; mnt_ids++; nr_mnt_ids--; ret++; } return ret; } static void __free_klistmount_free(const struct klistmount *kls) { path_put(&kls->root); kvfree(kls->kmnt_ids); mnt_ns_release(kls->ns); } static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req *kreq, size_t nr_mnt_ids) { u64 last_mnt_id = kreq->param; struct mnt_namespace *ns; /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET) return -EINVAL; kls->last_mnt_id = last_mnt_id; kls->nr_mnt_ids = nr_mnt_ids; kls->kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kls->kmnt_ids), GFP_KERNEL_ACCOUNT); if (!kls->kmnt_ids) return -ENOMEM; ns = grab_requested_mnt_ns(kreq); if (IS_ERR(ns)) return PTR_ERR(ns); kls->ns = ns; kls->mnt_parent_id = kreq->mnt_id; return 0; } SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, u64 __user *, mnt_ids, size_t, nr_mnt_ids, unsigned int, flags) { struct klistmount kls __free(klistmount_free) = {}; const size_t maxcount = 1000000; struct mnt_id_req kreq; ssize_t ret; if (flags & ~LISTMOUNT_REVERSE) return -EINVAL; /* * If the mount namespace really has more than 1 million mounts the * caller must iterate over the mount namespace (and reconsider their * system design...). */ if (unlikely(nr_mnt_ids > maxcount)) return -EOVERFLOW; if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids))) return -EFAULT; ret = copy_mnt_id_req(req, &kreq, 0); if (ret) return ret; ret = prepare_klistmount(&kls, &kreq, nr_mnt_ids); if (ret) return ret; if (kreq.mnt_ns_id && (kls.ns != current->nsproxy->mnt_ns) && !ns_capable_noaudit(kls.ns->user_ns, CAP_SYS_ADMIN)) return -ENOENT; /* * We only need to guard against mount topology changes as * listmount() doesn't care about any mount properties. */ scoped_guard(namespace_shared) ret = do_listmount(&kls, (flags & LISTMOUNT_REVERSE)); if (ret <= 0) return ret; if (copy_to_user(mnt_ids, kls.kmnt_ids, ret * sizeof(*mnt_ids))) return -EFAULT; return ret; } struct mnt_namespace init_mnt_ns = { .ns = NS_COMMON_INIT(init_mnt_ns), .user_ns = &init_user_ns, .passive = REFCOUNT_INIT(1), .mounts = RB_ROOT, .poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll), }; static void __init init_mount_tree(void) { struct vfsmount *mnt, *nullfs_mnt; struct mount *mnt_root; struct path root; /* * We create two mounts: * * (1) nullfs with mount id 1 * (2) mutable rootfs with mount id 2 * * with (2) mounted on top of (1). */ nullfs_mnt = vfs_kern_mount(&nullfs_fs_type, 0, "nullfs", NULL); if (IS_ERR(nullfs_mnt)) panic("VFS: Failed to create nullfs"); mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options); if (IS_ERR(mnt)) panic("Can't create rootfs"); VFS_WARN_ON_ONCE(real_mount(nullfs_mnt)->mnt_id != 1); VFS_WARN_ON_ONCE(real_mount(mnt)->mnt_id != 2); /* The namespace root is the nullfs mnt. 
*/ mnt_root = real_mount(nullfs_mnt); init_mnt_ns.root = mnt_root; /* Mount mutable rootfs on top of nullfs. */ root.mnt = nullfs_mnt; root.dentry = nullfs_mnt->mnt_root; LOCK_MOUNT_EXACT(mp, &root); if (unlikely(IS_ERR(mp.parent))) panic("VFS: Failed to mount rootfs on nullfs"); scoped_guard(mount_writer) attach_mnt(real_mount(mnt), mp.parent, mp.mp); pr_info("VFS: Finished mounting rootfs on nullfs\n"); /* * We've dropped all locks here but that's fine. Not just are we * the only task that's running, there's no other mount * namespace in existence and the initial mount namespace is * completely empty until we add the mounts we just created. */ for (struct mount *p = mnt_root; p; p = next_mnt(p, mnt_root)) { mnt_add_to_ns(&init_mnt_ns, p); init_mnt_ns.nr_mounts++; } init_task.nsproxy->mnt_ns = &init_mnt_ns; get_mnt_ns(&init_mnt_ns); /* The root and pwd always point to the mutable rootfs. */ root.mnt = mnt; root.dentry = mnt->mnt_root; set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); ns_tree_add(&init_mnt_ns); } void __init mnt_init(void) { int err; mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); mount_hashtable = alloc_large_system_hash("Mount-cache", sizeof(struct hlist_head), mhash_entries, 19, HASH_ZERO, &m_hash_shift, &m_hash_mask, 0, 0); mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache", sizeof(struct hlist_head), mphash_entries, 19, HASH_ZERO, &mp_hash_shift, &mp_hash_mask, 0, 0); if (!mount_hashtable || !mountpoint_hashtable) panic("Failed to allocate mount hash table\n"); kernfs_init(); err = sysfs_init(); if (err) printk(KERN_WARNING "%s: sysfs_init error: %d\n", __func__, err); fs_kobj = kobject_create_and_add("fs", NULL); if (!fs_kobj) printk(KERN_WARNING "%s: kobj create error\n", __func__); shmem_init(); init_rootfs(); init_mount_tree(); } void put_mnt_ns(struct mnt_namespace *ns) { if (!ns_ref_put(ns)) return; guard(namespace_excl)(); emptied_ns = ns; guard(mount_writer)(); umount_tree(ns->root, 0); } struct vfsmount *kern_mount(struct file_system_type *type) { struct vfsmount *mnt; mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL); if (!IS_ERR(mnt)) { /* * it is a longterm mount, don't release mnt until * we unmount before file sys is unregistered */ real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL; } return mnt; } EXPORT_SYMBOL_GPL(kern_mount); void kern_unmount(struct vfsmount *mnt) { /* release long term mount so mount point can be released */ if (!IS_ERR(mnt)) { mnt_make_shortterm(mnt); synchronize_rcu(); /* yecchhh... 
*/ mntput(mnt); } } EXPORT_SYMBOL(kern_unmount); void kern_unmount_array(struct vfsmount *mnt[], unsigned int num) { unsigned int i; for (i = 0; i < num; i++) mnt_make_shortterm(mnt[i]); synchronize_rcu_expedited(); for (i = 0; i < num; i++) mntput(mnt[i]); } EXPORT_SYMBOL(kern_unmount_array); bool our_mnt(struct vfsmount *mnt) { return check_mnt(real_mount(mnt)); } bool current_chrooted(void) { /* Does the current process have a non-standard root */ struct path fs_root __free(path_put) = {}; struct mount *root; get_fs_root(current->fs, &fs_root); /* Find the namespace root */ guard(mount_locked_reader)(); root = topmost_overmount(current->nsproxy->mnt_ns->root); return fs_root.mnt != &root->mnt || !path_mounted(&fs_root); } static bool mnt_already_visible(struct mnt_namespace *ns, const struct super_block *sb, int *new_mnt_flags) { int new_flags = *new_mnt_flags; struct mount *mnt, *n; guard(namespace_shared)(); rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) { struct mount *child; int mnt_flags; if (mnt->mnt.mnt_sb->s_type != sb->s_type) continue; /* This mount is not fully visible if it's root directory * is not the root directory of the filesystem. */ if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root) continue; /* A local view of the mount flags */ mnt_flags = mnt->mnt.mnt_flags; /* Don't miss readonly hidden in the superblock flags */ if (sb_rdonly(mnt->mnt.mnt_sb)) mnt_flags |= MNT_LOCK_READONLY; /* Verify the mount flags are equal to or more permissive * than the proposed new mount. */ if ((mnt_flags & MNT_LOCK_READONLY) && !(new_flags & MNT_READONLY)) continue; if ((mnt_flags & MNT_LOCK_ATIME) && ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK))) continue; /* This mount is not fully visible if there are any * locked child mounts that cover anything except for * empty directories. */ list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { struct inode *inode = child->mnt_mountpoint->d_inode; /* Only worry about locked mounts */ if (!(child->mnt.mnt_flags & MNT_LOCKED)) continue; /* Is the directory permanently empty? */ if (!is_empty_dir_inode(inode)) goto next; } /* Preserve the locked attributes */ *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \ MNT_LOCK_ATIME); return true; next: ; } return false; } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags) { const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV; struct mnt_namespace *ns = current->nsproxy->mnt_ns; unsigned long s_iflags; if (ns->user_ns == &init_user_ns) return false; /* Can this filesystem be too revealing? */ s_iflags = sb->s_iflags; if (!(s_iflags & SB_I_USERNS_VISIBLE)) return false; if ((s_iflags & required_iflags) != required_iflags) { WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n", required_iflags); return true; } return !mnt_already_visible(ns, sb, new_mnt_flags); } bool mnt_may_suid(struct vfsmount *mnt) { /* * Foreign mounts (accessed via fchdir or through /proc * symlinks) are always treated as if they are nosuid. This * prevents namespaces from trusting potentially unsafe * suid/sgid bits, file caps, or security labels that originate * in other namespaces. 
*/ return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) && current_in_userns(mnt->mnt_sb->s_user_ns); } static struct ns_common *mntns_get(struct task_struct *task) { struct ns_common *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) { ns = &nsproxy->mnt_ns->ns; get_mnt_ns(to_mnt_ns(ns)); } task_unlock(task); return ns; } static void mntns_put(struct ns_common *ns) { put_mnt_ns(to_mnt_ns(ns)); } static int mntns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct fs_struct *fs = nsset->fs; struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns; struct user_namespace *user_ns = nsset->cred->user_ns; struct path root; int err; if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || !ns_capable(user_ns, CAP_SYS_CHROOT) || !ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; if (is_anon_ns(mnt_ns)) return -EINVAL; if (fs->users != 1) return -EINVAL; get_mnt_ns(mnt_ns); old_mnt_ns = nsproxy->mnt_ns; nsproxy->mnt_ns = mnt_ns; /* Find the root */ err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt, "/", LOOKUP_DOWN, &root); if (err) { /* revert to old namespace */ nsproxy->mnt_ns = old_mnt_ns; put_mnt_ns(mnt_ns); return err; } put_mnt_ns(old_mnt_ns); /* Update the pwd and root */ set_fs_pwd(fs, &root); set_fs_root(fs, &root); path_put(&root); return 0; } static struct user_namespace *mntns_owner(struct ns_common *ns) { return to_mnt_ns(ns)->user_ns; } const struct proc_ns_operations mntns_operations = { .name = "mnt", .get = mntns_get, .put = mntns_put, .install = mntns_install, .owner = mntns_owner, }; #ifdef CONFIG_SYSCTL static const struct ctl_table fs_namespace_sysctls[] = { { .procname = "mount-max", .data = &sysctl_mount_max, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, }; static int __init init_fs_namespace_sysctls(void) { register_sysctl_init("fs", fs_namespace_sysctls); return 0; } fs_initcall(init_fs_namespace_sysctls); #endif /* CONFIG_SYSCTL */ |
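The statmount()/listmount() pair implemented above is easiest to understand from the caller's side. Below is a minimal userspace sketch (not part of the kernel sources): it walks the caller's own mount namespace with listmount() starting at LSMT_ROOT, then asks statmount() for the basic info and mount-point string of each returned ID. It assumes a uapi/libc that exposes struct mnt_id_req, struct statmount, LSMT_ROOT, the STATMOUNT_* mask bits and the __NR_listmount/__NR_statmount syscall numbers; the 256-entry ID array and the 4 KiB string buffer are arbitrary sizes chosen for the example.

#include <linux/mount.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static void print_one_mount(uint64_t mnt_id)
{
	struct mnt_id_req req = {
		.size	= MNT_ID_REQ_SIZE_VER0,
		.mnt_id	= mnt_id,
		/* For statmount(), .param carries the STATMOUNT_* request mask. */
		.param	= STATMOUNT_MNT_BASIC | STATMOUNT_MNT_POINT,
	};
	/* Fixed header plus some room for the requested strings. */
	char buf[sizeof(struct statmount) + 4096];
	struct statmount *sm = (struct statmount *)buf;

	if (syscall(__NR_statmount, &req, sm, sizeof(buf), 0) < 0) {
		perror("statmount");
		return;
	}
	printf("mount id %llu at %s\n", (unsigned long long)sm->mnt_id,
	       sm->str + sm->mnt_point);
}

int main(void)
{
	struct mnt_id_req req = {
		.size	= MNT_ID_REQ_SIZE_VER0,
		.mnt_id	= LSMT_ROOT,	/* list below the namespace root */
		/* For listmount(), .param is the last mount ID already seen (0 = start). */
		.param	= 0,
	};
	uint64_t ids[256];
	long n;

	n = syscall(__NR_listmount, &req, ids, 256, 0);
	if (n < 0) {
		perror("listmount");
		return 1;
	}
	for (long i = 0; i < n; i++)
		print_one_mount(ids[i]);
	return 0;
}

For namespaces with more mounts than fit in one call, the last returned ID would be fed back via .param to continue the walk, mirroring the last_mnt_id handling in do_listmount() above.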
/* * linux/include/linux/console.h * * Copyright (C) 1993 Hamish Macdonald * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive * for more details.
* * Changed: * 10-Mar-94: Arno Griffioen: Conversion for vt100 emulator port from PC LINUX */ #ifndef _LINUX_CONSOLE_H_ #define _LINUX_CONSOLE_H_ 1 #include <linux/atomic.h> #include <linux/bits.h> #include <linux/irq_work.h> #include <linux/rculist.h> #include <linux/rcuwait.h> #include <linux/smp.h> #include <linux/types.h> #include <linux/vesa.h> struct vc_data; struct console_font_op; struct console_font; struct module; struct tty_struct; struct notifier_block; enum con_scroll { SM_UP, SM_DOWN, }; enum vc_intensity; /** * struct consw - callbacks for consoles * * @owner: the module to get references of when this console is used * @con_startup: set up the console and return its name (like VGA, EGA, ...) * @con_init: initialize the console on @vc. @init is true for the very first * call on this @vc. * @con_deinit: deinitialize the console from @vc. * @con_clear: erase @count characters at [@x, @y] on @vc. @count >= 1. * @con_putc: emit one character with attributes @ca to [@x, @y] on @vc. * (optional -- @con_putcs would be called instead) * @con_putcs: emit @count characters with attributes @s to [@x, @y] on @vc. * @con_cursor: enable/disable cursor depending on @enable * @con_scroll: move lines from @top to @bottom in direction @dir by @lines. * Return true if no generic handling should be done. * Invoked by csi_M and printing to the console. * @con_switch: notifier about the console switch; it is supposed to return * true if a redraw is needed. * @con_blank: blank/unblank the console. The target mode is passed in @blank. * @mode_switch is set if changing from/to text/graphics. The hook * is supposed to return true if a redraw is needed. * @con_font_set: set console @vc font to @font with height @vpitch. @flags can * be %KD_FONT_FLAG_DONT_RECALC. (optional) * @con_font_get: fetch the current font on @vc of height @vpitch into @font. * (optional) * @con_font_default: set default font on @vc. @name can be %NULL or font name * to search for. @font can be filled back. (optional) * @con_resize: resize the @vc console to @width x @height. @from_user is true * when this change comes from the user space. * @con_set_palette: sets the palette of the console @vc to @table (optional) * @con_scrolldelta: the contents of the console should be scrolled by @lines. * Invoked by user. (optional) * @con_set_origin: set origin (see &vc_data::vc_origin) of the @vc. If not * provided or returns false, the origin is set to * @vc->vc_screenbuf. (optional) * @con_save_screen: save screen content into @vc->vc_screenbuf. Called e.g. * upon entering graphics. (optional) * @con_build_attr: build attributes based on @color, @intensity and other * parameters. The result is used for both normal and erase * characters. (optional) * @con_invert_region: invert a region of length @count on @vc starting at @p. 
* (optional) */ struct consw { struct module *owner; const char *(*con_startup)(void); void (*con_init)(struct vc_data *vc, bool init); void (*con_deinit)(struct vc_data *vc); void (*con_clear)(struct vc_data *vc, unsigned int y, unsigned int x, unsigned int count); void (*con_putc)(struct vc_data *vc, u16 ca, unsigned int y, unsigned int x); void (*con_putcs)(struct vc_data *vc, const u16 *s, unsigned int count, unsigned int ypos, unsigned int xpos); void (*con_cursor)(struct vc_data *vc, bool enable); bool (*con_scroll)(struct vc_data *vc, unsigned int top, unsigned int bottom, enum con_scroll dir, unsigned int lines); bool (*con_switch)(struct vc_data *vc); bool (*con_blank)(struct vc_data *vc, enum vesa_blank_mode blank, bool mode_switch); int (*con_font_set)(struct vc_data *vc, const struct console_font *font, unsigned int vpitch, unsigned int flags); int (*con_font_get)(struct vc_data *vc, struct console_font *font, unsigned int vpitch); int (*con_font_default)(struct vc_data *vc, struct console_font *font, const char *name); int (*con_resize)(struct vc_data *vc, unsigned int width, unsigned int height, bool from_user); void (*con_set_palette)(struct vc_data *vc, const unsigned char *table); void (*con_scrolldelta)(struct vc_data *vc, int lines); bool (*con_set_origin)(struct vc_data *vc); void (*con_save_screen)(struct vc_data *vc); u8 (*con_build_attr)(struct vc_data *vc, u8 color, enum vc_intensity intensity, bool blink, bool underline, bool reverse, bool italic); void (*con_invert_region)(struct vc_data *vc, u16 *p, int count); }; extern const struct consw *conswitchp; extern const struct consw dummy_con; /* dummy console buffer */ extern const struct consw vga_con; /* VGA text console */ extern const struct consw newport_con; /* SGI Newport console */ struct screen_info; #ifdef CONFIG_VGA_CONSOLE void vgacon_register_screen(struct screen_info *si); #else static inline void vgacon_register_screen(struct screen_info *si) { } #endif int con_is_bound(const struct consw *csw); int do_unregister_con_driver(const struct consw *csw); int do_take_over_console(const struct consw *sw, int first, int last, int deflt); void give_up_console(const struct consw *sw); #ifdef CONFIG_VT void con_debug_enter(struct vc_data *vc); void con_debug_leave(void); #else static inline void con_debug_enter(struct vc_data *vc) { } static inline void con_debug_leave(void) { } #endif /* * The interface for a console, or any other device that wants to capture * console messages (printer driver?) */ /** * enum cons_flags - General console flags * @CON_PRINTBUFFER: Used by newly registered consoles to avoid duplicate * output of messages that were already shown by boot * consoles or read by userspace via syslog() syscall. * @CON_CONSDEV: Indicates that the console driver is backing * /dev/console. * @CON_ENABLED: Indicates if a console is allowed to print records. If * false, the console also will not advance to later * records. * @CON_BOOT: Marks the console driver as early console driver which * is used during boot before the real driver becomes * available. It will be automatically unregistered * when the real console driver is registered unless * "keep_bootcon" parameter is used. * @CON_ANYTIME: A misnomed historical flag which tells the core code * that the legacy @console::write callback can be invoked * on a CPU which is marked OFFLINE. That is misleading as * it suggests that there is no contextual limit for * invoking the callback. The original motivation was * readiness of the per-CPU areas. 
* @CON_BRL: Indicates a braille device which is exempt from * receiving the printk spam for obvious reasons. * @CON_EXTENDED: The console supports the extended output format of * /dev/kmesg which requires a larger output buffer. * @CON_SUSPENDED: Indicates if a console is suspended. If true, the * printing callbacks must not be called. * @CON_NBCON: Console can operate outside of the legacy style console_lock * constraints. * @CON_NBCON_ATOMIC_UNSAFE: The write_atomic() callback is not safe and is * therefore only used by nbcon_atomic_flush_unsafe(). */ enum cons_flags { CON_PRINTBUFFER = BIT(0), CON_CONSDEV = BIT(1), CON_ENABLED = BIT(2), CON_BOOT = BIT(3), CON_ANYTIME = BIT(4), CON_BRL = BIT(5), CON_EXTENDED = BIT(6), CON_SUSPENDED = BIT(7), CON_NBCON = BIT(8), CON_NBCON_ATOMIC_UNSAFE = BIT(9), }; /** * struct nbcon_state - console state for nbcon consoles * @atom: Compound of the state fields for atomic operations * * @req_prio: The priority of a handover request * @prio: The priority of the current owner * @unsafe: Console is busy in a non takeover region * @unsafe_takeover: A hostile takeover in an unsafe state happened in the * past. The console cannot be safe until re-initialized. * @cpu: The CPU on which the owner runs * * To be used for reading and preparing of the value stored in the nbcon * state variable @console::nbcon_state. * * The @prio and @req_prio fields are particularly important to allow * spin-waiting to timeout and give up without the risk of a waiter being * assigned the lock after giving up. */ struct nbcon_state { union { unsigned int atom; struct { unsigned int prio : 2; unsigned int req_prio : 2; unsigned int unsafe : 1; unsigned int unsafe_takeover : 1; unsigned int cpu : 24; }; }; }; /* * The nbcon_state struct is used to easily create and interpret values that * are stored in the @console::nbcon_state variable. Ensure this struct stays * within the size boundaries of the atomic variable's underlying type in * order to avoid any accidental truncation. */ static_assert(sizeof(struct nbcon_state) <= sizeof(int)); /** * enum nbcon_prio - console owner priority for nbcon consoles * @NBCON_PRIO_NONE: Unused * @NBCON_PRIO_NORMAL: Normal (non-emergency) usage * @NBCON_PRIO_EMERGENCY: Emergency output (WARN/OOPS...) * @NBCON_PRIO_PANIC: Panic output * @NBCON_PRIO_MAX: The number of priority levels * * A higher priority context can takeover the console when it is * in the safe state. The final attempt to flush consoles in panic() * can be allowed to do so even in an unsafe state (Hope and pray). */ enum nbcon_prio { NBCON_PRIO_NONE = 0, NBCON_PRIO_NORMAL, NBCON_PRIO_EMERGENCY, NBCON_PRIO_PANIC, NBCON_PRIO_MAX, }; struct console; struct printk_buffers; /** * struct nbcon_context - Context for console acquire/release * @console: The associated console * @spinwait_max_us: Limit for spin-wait acquire * @prio: Priority of the context * @allow_unsafe_takeover: Allow performing takeover even if unsafe. Can * be used only with NBCON_PRIO_PANIC @prio. It * might cause a system freeze when the console * is used later. 
* @backlog: Ringbuffer has pending records * @pbufs: Pointer to the text buffer for this context * @seq: The sequence number to print for this context */ struct nbcon_context { /* members set by caller */ struct console *console; unsigned int spinwait_max_us; enum nbcon_prio prio; unsigned int allow_unsafe_takeover : 1; /* members set by emit */ unsigned int backlog : 1; /* members set by acquire */ struct printk_buffers *pbufs; u64 seq; }; /** * struct nbcon_write_context - Context handed to the nbcon write callbacks * @ctxt: The core console context * @outbuf: Pointer to the text buffer for output * @len: Length to write * @unsafe_takeover: If a hostile takeover in an unsafe state has occurred * @cpu: CPU on which the message was generated * @pid: PID of the task that generated the message * @comm: Name of the task that generated the message */ struct nbcon_write_context { struct nbcon_context __private ctxt; char *outbuf; unsigned int len; bool unsafe_takeover; #ifdef CONFIG_PRINTK_EXECUTION_CTX int cpu; pid_t pid; char comm[TASK_COMM_LEN]; #endif }; /** * struct console - The console descriptor structure * @name: The name of the console driver * @write: Legacy write callback to output messages (Optional) * @read: Read callback for console input (Optional) * @device: The underlying TTY device driver (Optional) * @unblank: Callback to unblank the console (Optional) * @setup: Callback for initializing the console (Optional) * @exit: Callback for teardown of the console (Optional) * @match: Callback for matching a console (Optional) * @flags: Console flags. See enum cons_flags * @index: Console index, e.g. port number * @cflag: TTY control mode flags * @ispeed: TTY input speed * @ospeed: TTY output speed * @seq: Sequence number of the next ringbuffer record to print * @dropped: Number of unreported dropped ringbuffer records * @data: Driver private data * @node: hlist node for the console list * * @nbcon_state: State for nbcon consoles * @nbcon_seq: Sequence number of the next record for nbcon to print * @nbcon_device_ctxt: Context available for non-printing operations * @nbcon_prev_seq: Seq num the previous nbcon owner was assigned to print * @pbufs: Pointer to nbcon private buffer * @kthread: Printer kthread for this console * @rcuwait: RCU-safe wait object for @kthread waking * @irq_work: Defer @kthread waking to IRQ work context */ struct console { char name[16]; void (*write)(struct console *co, const char *s, unsigned int count); int (*read)(struct console *co, char *s, unsigned int count); struct tty_driver *(*device)(struct console *co, int *index); void (*unblank)(void); int (*setup)(struct console *co, char *options); int (*exit)(struct console *co); int (*match)(struct console *co, char *name, int idx, char *options); short flags; short index; int cflag; uint ispeed; uint ospeed; u64 seq; unsigned long dropped; void *data; struct hlist_node node; /* nbcon console specific members */ /** * @write_atomic: * * NBCON callback to write out text in any context. (Optional) * * This callback is called with the console already acquired. However, * a higher priority context is allowed to take it over by default. * * The callback must call nbcon_enter_unsafe() and nbcon_exit_unsafe() * around any code where the takeover is not safe, for example, when * manipulating the serial port registers. * * nbcon_enter_unsafe() will fail if the context has lost the console * ownership in the meantime. In this case, the callback is no longer * allowed to go forward. 
It must back out immediately and carefully. * The buffer content is also no longer trusted since it no longer * belongs to the context. * * The callback should allow the takeover whenever it is safe. It * increases the chance to see messages when the system is in trouble. * If the driver must reacquire ownership in order to finalize or * revert hardware changes, nbcon_reacquire_nobuf() can be used. * However, on reacquire the buffer content is no longer available. A * reacquire cannot be used to resume printing. * * The callback can be called from any context (including NMI). * Therefore it must avoid usage of any locking and instead rely * on the console ownership for synchronization. */ void (*write_atomic)(struct console *con, struct nbcon_write_context *wctxt); /** * @write_thread: * * NBCON callback to write out text in task context. * * This callback must be called only in task context with both * device_lock() and the nbcon console acquired with * NBCON_PRIO_NORMAL. * * The same rules for console ownership verification and unsafe * sections handling applies as with write_atomic(). * * The console ownership handling is necessary for synchronization * against write_atomic() which is synchronized only via the context. * * The device_lock() provides the primary serialization for operations * on the device. It might be as relaxed (mutex)[*] or as tight * (disabled preemption and interrupts) as needed. It allows * the kthread to operate in the least restrictive mode[**]. * * [*] Standalone nbcon_context_try_acquire() is not safe with * the preemption enabled, see nbcon_owner_matches(). But it * can be safe when always called in the preemptive context * under the device_lock(). * * [**] The device_lock() makes sure that nbcon_context_try_acquire() * would never need to spin which is important especially with * PREEMPT_RT. */ void (*write_thread)(struct console *con, struct nbcon_write_context *wctxt); /** * @device_lock: * * NBCON callback to begin synchronization with driver code. * * Console drivers typically must deal with access to the hardware * via user input/output (such as an interactive login shell) and * output of kernel messages via printk() calls. This callback is * called by the printk-subsystem whenever it needs to synchronize * with hardware access by the driver. It should be implemented to * use whatever synchronization mechanism the driver is using for * itself (for example, the port lock for uart serial consoles). * * The callback is always called from task context. It may use any * synchronization method required by the driver. * * IMPORTANT: The callback MUST disable migration. The console driver * may be using a synchronization mechanism that already takes * care of this (such as spinlocks). Otherwise this function must * explicitly call migrate_disable(). * * The flags argument is provided as a convenience to the driver. It * will be passed again to device_unlock(). It can be ignored if the * driver does not need it. */ void (*device_lock)(struct console *con, unsigned long *flags); /** * @device_unlock: * * NBCON callback to finish synchronization with driver code. * * It is the counterpart to device_lock(). * * This callback is always called from task context. It must * appropriately re-enable migration (depending on how device_lock() * disabled migration). * * The flags argument is the value of the same variable that was * passed to device_lock(). 
*/ void (*device_unlock)(struct console *con, unsigned long flags); atomic_t __private nbcon_state; atomic_long_t __private nbcon_seq; struct nbcon_context __private nbcon_device_ctxt; atomic_long_t __private nbcon_prev_seq; struct printk_buffers *pbufs; struct task_struct *kthread; struct rcuwait rcuwait; struct irq_work irq_work; }; #ifdef CONFIG_LOCKDEP extern void lockdep_assert_console_list_lock_held(void); #else static inline void lockdep_assert_console_list_lock_held(void) { } #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC extern bool console_srcu_read_lock_is_held(void); #else static inline bool console_srcu_read_lock_is_held(void) { return 1; } #endif extern int console_srcu_read_lock(void); extern void console_srcu_read_unlock(int cookie); extern void console_list_lock(void); extern void console_list_unlock(void); extern struct hlist_head console_list; /** * console_srcu_read_flags - Locklessly read flags of a possibly registered * console * @con: struct console pointer of console to read flags from * * Locklessly reading @con->flags provides a consistent read value because * there is at most one CPU modifying @con->flags and that CPU is using only * read-modify-write operations to do so. * * Requires console_srcu_read_lock to be held, which implies that @con might * be a registered console. The purpose of holding console_srcu_read_lock is * to guarantee that the console state is valid (CON_SUSPENDED/CON_ENABLED) * and that no exit/cleanup routines will run if the console is currently * undergoing unregistration. * * If the caller is holding the console_list_lock or it is _certain_ that * @con is not and will not become registered, the caller may read * @con->flags directly instead. * * Context: Any context. * Return: The current value of the @con->flags field. */ static inline short console_srcu_read_flags(const struct console *con) { WARN_ON_ONCE(!console_srcu_read_lock_is_held()); /* * The READ_ONCE() matches the WRITE_ONCE() when @flags are modified * for registered consoles with console_srcu_write_flags(). */ return data_race(READ_ONCE(con->flags)); } /** * console_srcu_write_flags - Write flags for a registered console * @con: struct console pointer of console to write flags to * @flags: new flags value to write * * Only use this function to write flags for registered consoles. It * requires holding the console_list_lock. * * Context: Any context. */ static inline void console_srcu_write_flags(struct console *con, short flags) { lockdep_assert_console_list_lock_held(); /* This matches the READ_ONCE() in console_srcu_read_flags(). */ WRITE_ONCE(con->flags, flags); } /* Variant of console_is_registered() when the console_list_lock is held. */ static inline bool console_is_registered_locked(const struct console *con) { lockdep_assert_console_list_lock_held(); return !hlist_unhashed(&con->node); } /* * console_is_registered - Check if the console is registered * @con: struct console pointer of console to check * * Context: Process context. May sleep while acquiring console list lock. * Return: true if the console is in the console list, otherwise false. * * If false is returned for a console that was previously registered, it * can be assumed that the console's unregistration is fully completed, * including the exit() callback after console list removal. 
*/ static inline bool console_is_registered(const struct console *con) { bool ret; console_list_lock(); ret = console_is_registered_locked(con); console_list_unlock(); return ret; } /** * for_each_console_srcu() - Iterator over registered consoles * @con: struct console pointer used as loop cursor * * Although SRCU guarantees the console list will be consistent, the * struct console fields may be updated by other CPUs while iterating. * * Requires console_srcu_read_lock to be held. Can be invoked from * any context. */ #define for_each_console_srcu(con) \ hlist_for_each_entry_srcu(con, &console_list, node, \ console_srcu_read_lock_is_held()) /** * for_each_console() - Iterator over registered consoles * @con: struct console pointer used as loop cursor * * The console list and the &console.flags are immutable while iterating. * * Requires console_list_lock to be held. */ #define for_each_console(con) \ lockdep_assert_console_list_lock_held(); \ hlist_for_each_entry(con, &console_list, node) #ifdef CONFIG_PRINTK extern void nbcon_cpu_emergency_enter(void); extern void nbcon_cpu_emergency_exit(void); extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt); extern void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt, char *buf, unsigned int len); extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt); extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt); extern void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt); extern bool nbcon_allow_unsafe_takeover(void); extern bool nbcon_kdb_try_acquire(struct console *con, struct nbcon_write_context *wctxt); extern void nbcon_kdb_release(struct nbcon_write_context *wctxt); /* * Check if the given console is currently capable and allowed to print * records. Note that this function does not consider the current context, * which can also play a role in deciding if @con can be used to print * records. */ static inline bool console_is_usable(struct console *con, short flags, bool use_atomic) { if (!(flags & CON_ENABLED)) return false; if ((flags & CON_SUSPENDED)) return false; if (flags & CON_NBCON) { if (use_atomic) { /* The write_atomic() callback is optional. */ if (!con->write_atomic) return false; /* * An unsafe write_atomic() callback is only usable * when unsafe takeovers are allowed. */ if ((flags & CON_NBCON_ATOMIC_UNSAFE) && !nbcon_allow_unsafe_takeover()) return false; } /* * For the !use_atomic case, @printk_kthreads_running is not * checked because the write_thread() callback is also used * via the legacy loop when the printer threads are not * available. */ } else { if (!con->write) return false; } /* * Console drivers may assume that per-cpu resources have been * allocated. So unless they're explicitly marked as being able to * cope (CON_ANYTIME) don't call them until this CPU is officially up. 
*/ if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME)) return false; return true; } #else static inline void nbcon_cpu_emergency_enter(void) { } static inline void nbcon_cpu_emergency_exit(void) { } static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return false; } static inline void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt, char *buf, unsigned int len) { } static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { } static inline bool nbcon_kdb_try_acquire(struct console *con, struct nbcon_write_context *wctxt) { return false; } static inline void nbcon_kdb_release(struct nbcon_write_context *wctxt) { } static inline bool console_is_usable(struct console *con, short flags, bool use_atomic) { return false; } #endif extern int console_set_on_cmdline; extern struct console *early_console; enum con_flush_mode { CONSOLE_FLUSH_PENDING, CONSOLE_REPLAY_ALL, }; extern int add_preferred_console(const char *name, const short idx, char *options); extern void console_force_preferred_locked(struct console *con); extern void register_console(struct console *); extern int unregister_console(struct console *); extern void console_lock(void); extern int console_trylock(void); extern void console_unlock(void); extern void console_unblank(void); extern void console_flush_on_panic(enum con_flush_mode mode); extern struct tty_driver *console_device(int *); extern void console_suspend(struct console *); extern void console_resume(struct console *); extern int is_console_locked(void); extern int braille_register_console(struct console *, int index, char *console_options, char *braille_options); extern int braille_unregister_console(struct console *); #ifdef CONFIG_TTY extern void console_sysfs_notify(void); #else static inline void console_sysfs_notify(void) { } #endif extern bool console_suspend_enabled; /* Suspend and resume console messages over PM events */ extern void console_suspend_all(void); extern void console_resume_all(void); int mda_console_init(void); void vcs_make_sysfs(int index); void vcs_remove_sysfs(int index); /* Some debug stub to catch some of the obvious races in the VT code */ #define WARN_CONSOLE_UNLOCKED() \ WARN_ON(!atomic_read(&ignore_console_lock_warning) && \ !is_console_locked() && !oops_in_progress) /* * Increment ignore_console_lock_warning if you need to quiet * WARN_CONSOLE_UNLOCKED() for debugging purposes. */ extern atomic_t ignore_console_lock_warning; DEFINE_LOCK_GUARD_0(console_lock, console_lock(), console_unlock()); extern void console_init(void); /* For deferred console takeover */ void dummycon_register_output_notifier(struct notifier_block *nb); void dummycon_unregister_output_notifier(struct notifier_block *nb); #endif /* _LINUX_CONSOLE_H */ |
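As a usage illustration for the registration API declared above, here is a minimal sketch of a legacy (non-NBCON) console driver: it fills in only the @write callback, relies on CON_PRINTBUFFER to replay earlier records once registered, and leaves setup(), device() and all nbcon callbacks unimplemented. The demo_* names and the character-output hook are hypothetical; a real driver would write to actual hardware and usually pair the console with a tty driver.

#include <linux/console.h>
#include <linux/init.h>

/* Hypothetical hardware hook: emit a single byte (e.g. poll a UART FIFO). */
static void demo_putchar(char c)
{
	(void)c;
}

/* Legacy @write callback: copy the formatted record to the device. */
static void demo_console_write(struct console *con, const char *s,
			       unsigned int count)
{
	while (count--) {
		if (*s == '\n')
			demo_putchar('\r');
		demo_putchar(*s++);
	}
}

static struct console demo_console = {
	.name	= "demo",
	.write	= demo_console_write,
	.flags	= CON_PRINTBUFFER,	/* replay earlier records once registered */
	.index	= -1,			/* match any "console=demo" index */
};

static int __init demo_console_init(void)
{
	register_console(&demo_console);
	return 0;
}
console_initcall(demo_console_init);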
// SPDX-License-Identifier: GPL-2.0-or-later /* * The "hash function" used as the core of the ChaCha stream cipher (RFC7539) * * Copyright (C) 2015 Martin Willi */ #include <crypto/chacha.h> #include <linux/bitops.h> #include <linux/bug.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/unaligned.h> static void chacha_permute(struct chacha_state *state, int nrounds) { u32 *x = state->x; int i; /* whitelist the allowed round counts */ WARN_ON_ONCE(nrounds != 20 && nrounds != 12); for (i = 0; i < nrounds; i += 2) { x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 16); x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 16); x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 16); x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 16); x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 12); x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 12); x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 12); x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 12); x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 8); x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 8); x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 8); x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 8); x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 7); x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 7); x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 7); x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 7); x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 16); x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 16); x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 16); x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 16); x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 12); x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 12); x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 12); x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 12); x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 8); x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 8); x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 8); x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 8); x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 7); x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 7); x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 7); x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 7); } } /** * chacha_block_generic - generate one keystream block and increment block counter * @state: input state matrix * @out: output keystream block * @nrounds: number of rounds (20 or 12; 20 is recommended) * * This is the ChaCha core, a function from 64-byte strings to 64-byte strings. * The caller has already converted the endianness of the input. This function * also handles incrementing the block counter in the input matrix.
*/ void chacha_block_generic(struct chacha_state *state, u8 out[CHACHA_BLOCK_SIZE], int nrounds) { struct chacha_state permuted_state = *state; int i; chacha_permute(&permuted_state, nrounds); for (i = 0; i < ARRAY_SIZE(state->x); i++) put_unaligned_le32(permuted_state.x[i] + state->x[i], &out[i * sizeof(u32)]); state->x[12]++; chacha_zeroize_state(&permuted_state); } EXPORT_SYMBOL(chacha_block_generic); /** * hchacha_block_generic - abbreviated ChaCha core, for XChaCha * @state: input state matrix * @out: the output words * @nrounds: number of rounds (20 or 12; 20 is recommended) * * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf). HChaCha * skips the final addition of the initial state, and outputs only certain words * of the state. It should not be used for streaming directly. */ void hchacha_block_generic(const struct chacha_state *state, u32 out[HCHACHA_OUT_WORDS], int nrounds) { struct chacha_state permuted_state = *state; chacha_permute(&permuted_state, nrounds); memcpy(&out[0], &permuted_state.x[0], 16); memcpy(&out[4], &permuted_state.x[12], 16); chacha_zeroize_state(&permuted_state); } EXPORT_SYMBOL(hchacha_block_generic);
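To make the state layout concrete, here is a small sketch (not part of this file) of how a caller could drive chacha_block_generic() for a single ChaCha20 block: the first four words are the "expand 32-byte k" constants, words 4-11 hold the 256-bit key, word 12 the block counter and words 13-15 the 96-bit nonce (RFC 7539 layout). The demo_* helper is hypothetical, the key and nonce are assumed to have already been converted to little-endian u32 words as the comment above requires, and real users would normally go through the higher-level ChaCha library helpers rather than open-coding this.

#include <crypto/chacha.h>
#include <linux/string.h>

/* Produce the first 64-byte keystream block for a 256-bit key and 96-bit nonce. */
static void demo_chacha20_first_block(const u32 key[8], const u32 nonce[3],
				      u8 stream[CHACHA_BLOCK_SIZE])
{
	struct chacha_state state;

	state.x[0] = 0x61707865;	/* "expa" */
	state.x[1] = 0x3320646e;	/* "nd 3" */
	state.x[2] = 0x79622d32;	/* "2-by" */
	state.x[3] = 0x6b206574;	/* "te k" */
	memcpy(&state.x[4], key, 8 * sizeof(u32));
	state.x[12] = 0;		/* block counter starts at 0 */
	memcpy(&state.x[13], nonce, 3 * sizeof(u32));

	/* Emits 64 bytes of keystream and advances state.x[12] to 1. */
	chacha_block_generic(&state, stream, 20);
	chacha_zeroize_state(&state);
}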
// SPDX-License-Identifier: GPL-2.0-or-later /* * Fast Userspace Mutexes (which I call "Futexes!"). * (C) Rusty Russell, IBM 2002 * * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar * (C) Copyright 2003 Red Hat Inc, All Rights Reserved * * Removed page pinning, fix privately mapped COW pages and other cleanups * (C) Copyright 2003, 2004 Jamie Lokier * * Robust futex support started by Ingo Molnar * (C) Copyright 2006 Red Hat Inc, All Rights Reserved * Thanks to Thomas Gleixner for suggestions, analysis and fixes. * * PI-futex support started by Ingo Molnar and Thomas Gleixner * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> * * PRIVATE futexes by Eric Dumazet * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> * * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> * Copyright (C) IBM Corporation, 2009 * Thanks to Thomas Gleixner for conceptual design and careful reviews. * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. * * "The futexes are also cursed." * "But they come in a choice of three flavours!"
*/ #include <linux/compat.h> #include <linux/jhash.h> #include <linux/pagemap.h> #include <linux/debugfs.h> #include <linux/plist.h> #include <linux/gfp.h> #include <linux/vmalloc.h> #include <linux/memblock.h> #include <linux/fault-inject.h> #include <linux/slab.h> #include <linux/prctl.h> #include <linux/mempolicy.h> #include <linux/mmap_lock.h> #include "futex.h" #include "../locking/rtmutex_common.h" /* * The base of the bucket array and its size are always used together * (after initialization only in futex_hash()), so ensure that they * reside in the same cacheline. */ static struct { unsigned long hashmask; unsigned int hashshift; struct futex_hash_bucket *queues[MAX_NUMNODES]; } __futex_data __read_mostly __aligned(2*sizeof(long)); #define futex_hashmask (__futex_data.hashmask) #define futex_hashshift (__futex_data.hashshift) #define futex_queues (__futex_data.queues) struct futex_private_hash { int state; unsigned int hash_mask; struct rcu_head rcu; void *mm; bool custom; struct futex_hash_bucket queues[]; }; /* * Fault injections for futexes. */ #ifdef CONFIG_FAIL_FUTEX static struct { struct fault_attr attr; bool ignore_private; } fail_futex = { .attr = FAULT_ATTR_INITIALIZER, .ignore_private = false, }; static int __init setup_fail_futex(char *str) { return setup_fault_attr(&fail_futex.attr, str); } __setup("fail_futex=", setup_fail_futex); bool should_fail_futex(bool fshared) { if (fail_futex.ignore_private && !fshared) return false; return should_fail(&fail_futex.attr, 1); } #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS static int __init fail_futex_debugfs(void) { umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; struct dentry *dir; dir = fault_create_debugfs_attr("fail_futex", NULL, &fail_futex.attr); if (IS_ERR(dir)) return PTR_ERR(dir); debugfs_create_bool("ignore-private", mode, dir, &fail_futex.ignore_private); return 0; } late_initcall(fail_futex_debugfs); #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ #endif /* CONFIG_FAIL_FUTEX */ static struct futex_hash_bucket * __futex_hash(union futex_key *key, struct futex_private_hash *fph); #ifdef CONFIG_FUTEX_PRIVATE_HASH static bool futex_ref_get(struct futex_private_hash *fph); static bool futex_ref_put(struct futex_private_hash *fph); static bool futex_ref_is_dead(struct futex_private_hash *fph); enum { FR_PERCPU = 0, FR_ATOMIC }; static inline bool futex_key_is_private(union futex_key *key) { /* * Relies on get_futex_key() to set either bit for shared * futexes -- see comment with union futex_key. */ return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED)); } static bool futex_private_hash_get(struct futex_private_hash *fph) { return futex_ref_get(fph); } void futex_private_hash_put(struct futex_private_hash *fph) { if (futex_ref_put(fph)) wake_up_var(fph->mm); } /** * futex_hash_get - Get an additional reference for the local hash. * @hb: ptr to the private local hash. * * Obtain an additional reference for the already obtained hash bucket. The * caller must already own an reference. 
*/ void futex_hash_get(struct futex_hash_bucket *hb) { struct futex_private_hash *fph = hb->priv; if (!fph) return; WARN_ON_ONCE(!futex_private_hash_get(fph)); } void futex_hash_put(struct futex_hash_bucket *hb) { struct futex_private_hash *fph = hb->priv; if (!fph) return; futex_private_hash_put(fph); } static struct futex_hash_bucket * __futex_hash_private(union futex_key *key, struct futex_private_hash *fph) { u32 hash; if (!futex_key_is_private(key)) return NULL; if (!fph) fph = rcu_dereference(key->private.mm->futex_phash); if (!fph || !fph->hash_mask) return NULL; hash = jhash2((void *)&key->private.address, sizeof(key->private.address) / 4, key->both.offset); return &fph->queues[hash & fph->hash_mask]; } static void futex_rehash_private(struct futex_private_hash *old, struct futex_private_hash *new) { struct futex_hash_bucket *hb_old, *hb_new; unsigned int slots = old->hash_mask + 1; unsigned int i; for (i = 0; i < slots; i++) { struct futex_q *this, *tmp; hb_old = &old->queues[i]; spin_lock(&hb_old->lock); plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) { plist_del(&this->list, &hb_old->chain); futex_hb_waiters_dec(hb_old); WARN_ON_ONCE(this->lock_ptr != &hb_old->lock); hb_new = __futex_hash(&this->key, new); futex_hb_waiters_inc(hb_new); /* * The new pointer isn't published yet but an already * moved user can be unqueued due to timeout or signal. */ spin_lock_nested(&hb_new->lock, SINGLE_DEPTH_NESTING); plist_add(&this->list, &hb_new->chain); this->lock_ptr = &hb_new->lock; spin_unlock(&hb_new->lock); } spin_unlock(&hb_old->lock); } } static bool __futex_pivot_hash(struct mm_struct *mm, struct futex_private_hash *new) { struct futex_private_hash *fph; WARN_ON_ONCE(mm->futex_phash_new); fph = rcu_dereference_protected(mm->futex_phash, lockdep_is_held(&mm->futex_hash_lock)); if (fph) { if (!futex_ref_is_dead(fph)) { mm->futex_phash_new = new; return false; } futex_rehash_private(fph, new); } new->state = FR_PERCPU; scoped_guard(rcu) { mm->futex_batches = get_state_synchronize_rcu(); rcu_assign_pointer(mm->futex_phash, new); } kvfree_rcu(fph, rcu); return true; } static void futex_pivot_hash(struct mm_struct *mm) { scoped_guard(mutex, &mm->futex_hash_lock) { struct futex_private_hash *fph; fph = mm->futex_phash_new; if (fph) { mm->futex_phash_new = NULL; __futex_pivot_hash(mm, fph); } } } struct futex_private_hash *futex_private_hash(void) { struct mm_struct *mm = current->mm; /* * Ideally we don't loop. If there is a replacement in progress * then a new private hash is already prepared and a reference can't be * obtained once the last user dropped it's. * In that case we block on mm_struct::futex_hash_lock and either have * to perform the replacement or wait while someone else is doing the * job. Eitherway, on the second iteration we acquire a reference on the * new private hash or loop again because a new replacement has been * requested. 
*/ again: scoped_guard(rcu) { struct futex_private_hash *fph; fph = rcu_dereference(mm->futex_phash); if (!fph) return NULL; if (futex_private_hash_get(fph)) return fph; } futex_pivot_hash(mm); goto again; } struct futex_hash_bucket *futex_hash(union futex_key *key) { struct futex_private_hash *fph; struct futex_hash_bucket *hb; again: scoped_guard(rcu) { hb = __futex_hash(key, NULL); fph = hb->priv; if (!fph || futex_private_hash_get(fph)) return hb; } futex_pivot_hash(key->private.mm); goto again; } #else /* !CONFIG_FUTEX_PRIVATE_HASH */ static struct futex_hash_bucket * __futex_hash_private(union futex_key *key, struct futex_private_hash *fph) { return NULL; } struct futex_hash_bucket *futex_hash(union futex_key *key) { return __futex_hash(key, NULL); } #endif /* CONFIG_FUTEX_PRIVATE_HASH */ #ifdef CONFIG_FUTEX_MPOL static int __futex_key_to_node(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma = vma_lookup(mm, addr); struct mempolicy *mpol; int node = FUTEX_NO_NODE; if (!vma) return FUTEX_NO_NODE; mpol = READ_ONCE(vma->vm_policy); if (!mpol) return FUTEX_NO_NODE; switch (mpol->mode) { case MPOL_PREFERRED: node = first_node(mpol->nodes); break; case MPOL_PREFERRED_MANY: case MPOL_BIND: if (mpol->home_node != NUMA_NO_NODE) node = mpol->home_node; break; default: break; } return node; } static int futex_key_to_node_opt(struct mm_struct *mm, unsigned long addr) { int seq, node; guard(rcu)(); if (!mmap_lock_speculate_try_begin(mm, &seq)) return -EBUSY; node = __futex_key_to_node(mm, addr); if (mmap_lock_speculate_retry(mm, seq)) return -EAGAIN; return node; } static int futex_mpol(struct mm_struct *mm, unsigned long addr) { int node; node = futex_key_to_node_opt(mm, addr); if (node >= FUTEX_NO_NODE) return node; guard(mmap_read_lock)(mm); return __futex_key_to_node(mm, addr); } #else /* !CONFIG_FUTEX_MPOL */ static int futex_mpol(struct mm_struct *mm, unsigned long addr) { return FUTEX_NO_NODE; } #endif /* CONFIG_FUTEX_MPOL */ /** * __futex_hash - Return the hash bucket * @key: Pointer to the futex key for which the hash is calculated * @fph: Pointer to private hash if known * * We hash on the keys returned from get_futex_key (see below) and return the * corresponding hash bucket. * If the FUTEX is PROCESS_PRIVATE then a per-process hash bucket (from the * private hash) is returned if existing. Otherwise a hash bucket from the * global hash is returned. */ static struct futex_hash_bucket * __futex_hash(union futex_key *key, struct futex_private_hash *fph) { int node = key->both.node; u32 hash; if (node == FUTEX_NO_NODE) { struct futex_hash_bucket *hb; hb = __futex_hash_private(key, fph); if (hb) return hb; } hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / sizeof(u32), key->both.offset); if (node == FUTEX_NO_NODE) { /* * In case of !FLAGS_NUMA, use some unused hash bits to pick a * node -- this ensures regular futexes are interleaved across * the nodes and avoids having to allocate multiple * hash-tables. * * NOTE: this isn't perfectly uniform, but it is fast and * handles sparse node masks. */ node = (hash >> futex_hashshift) % nr_node_ids; if (!node_possible(node)) { node = find_next_bit_wrap(node_possible_map.bits, nr_node_ids, node); } } return &futex_queues[node][hash & futex_hashmask]; } /** * futex_setup_timer - set up the sleeping hrtimer. 
* @time: ptr to the given timeout value * @timeout: the hrtimer_sleeper structure to be set up * @flags: futex flags * @range_ns: optional range in ns * * Return: Initialized hrtimer_sleeper structure or NULL if no timeout * value given */ struct hrtimer_sleeper * futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, int flags, u64 range_ns) { if (!time) return NULL; hrtimer_setup_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ? CLOCK_REALTIME : CLOCK_MONOTONIC, HRTIMER_MODE_ABS); /* * If range_ns is 0, calling hrtimer_set_expires_range_ns() is * effectively the same as calling hrtimer_set_expires(). */ hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns); return timeout; } /* * Generate a machine wide unique identifier for this inode. * * This relies on u64 not wrapping in the life-time of the machine; which with * 1ns resolution means almost 585 years. * * This further relies on the fact that a well formed program will not unmap * the file while it has a (shared) futex waiting on it. This mapping will have * a file reference which pins the mount and inode. * * If for some reason an inode gets evicted and read back in again, it will get * a new sequence number and will _NOT_ match, even though it is the exact same * file. * * It is important that futex_match() will never have a false-positive, esp. * for PI futexes that can mess up the state. The above argues that false-negatives * are only possible for malformed programs. */ static u64 get_inode_sequence_number(struct inode *inode) { static atomic64_t i_seq; u64 old; /* Does the inode already have a sequence number? */ old = atomic64_read(&inode->i_sequence); if (likely(old)) return old; for (;;) { u64 new = atomic64_inc_return(&i_seq); if (WARN_ON_ONCE(!new)) continue; old = 0; if (!atomic64_try_cmpxchg_relaxed(&inode->i_sequence, &old, new)) return old; return new; } } /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex * @flags: FLAGS_* * @key: address where result is stored. * @rw: mapping needs to be read/write (values: FUTEX_READ, * FUTEX_WRITE) * * Return: a negative error code or 0 * * The key words are stored in @key on success. * * For shared mappings (when @fshared), the key is: * * ( inode->i_sequence, page offset within mapping, offset_within_page ) * * [ also see get_inode_sequence_number() ] * * For private mappings (or when !@fshared), the key is: * * ( current->mm, address, 0 ) * * This allows (cross process, where applicable) identification of the futex * without keeping the page pinned for the duration of the FUTEX_WAIT. * * lock_page() might sleep, the caller should not hold a spinlock. */ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, enum futex_access rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; struct page *page; struct folio *folio; struct address_space *mapping; int node, err, size, ro = 0; bool node_updated = false; bool fshared; fshared = flags & FLAGS_SHARED; size = futex_size(flags); if (flags & FLAGS_NUMA) size *= 2; /* * The futex address must be "naturally" aligned. 
*/ key->both.offset = address % PAGE_SIZE; if (unlikely((address % size) != 0)) return -EINVAL; address -= key->both.offset; if (unlikely(!access_ok(uaddr, size))) return -EFAULT; if (unlikely(should_fail_futex(fshared))) return -EFAULT; node = FUTEX_NO_NODE; if (flags & FLAGS_NUMA) { u32 __user *naddr = (void *)uaddr + size / 2; if (get_user_inline(node, naddr)) return -EFAULT; if ((node != FUTEX_NO_NODE) && ((unsigned int)node >= MAX_NUMNODES || !node_possible(node))) return -EINVAL; } if (node == FUTEX_NO_NODE && (flags & FLAGS_MPOL)) { node = futex_mpol(mm, address); node_updated = true; } if (flags & FLAGS_NUMA) { u32 __user *naddr = (void *)uaddr + size / 2; if (node == FUTEX_NO_NODE) { node = numa_node_id(); node_updated = true; } if (node_updated && put_user_inline(node, naddr)) return -EFAULT; } key->both.node = node; /* * PROCESS_PRIVATE futexes are fast. * As the mm cannot disappear under us and the 'key' only needs * virtual address, we dont even have to find the underlying vma. * Note : We do have to check 'uaddr' is a valid user address, * but access_ok() should be faster than find_vma() */ if (!fshared) { /* * On no-MMU, shared futexes are treated as private, therefore * we must not include the current process in the key. Since * there is only one address space, the address is a unique key * on its own. */ if (IS_ENABLED(CONFIG_MMU)) key->private.mm = mm; else key->private.mm = NULL; key->private.address = address; return 0; } again: /* Ignore any VERIFY_READ mapping (futex common case) */ if (unlikely(should_fail_futex(true))) return -EFAULT; err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); /* * If write access is not required (eg. FUTEX_WAIT), try * and get read-only access. */ if (err == -EFAULT && rw == FUTEX_READ) { err = get_user_pages_fast(address, 1, 0, &page); ro = 1; } if (err < 0) return err; else err = 0; /* * The treatment of mapping from this point on is critical. The folio * lock protects many things but in this context the folio lock * stabilizes mapping, prevents inode freeing in the shared * file-backed region case and guards against movement to swap cache. * * Strictly speaking the folio lock is not needed in all cases being * considered here and folio lock forces unnecessarily serialization. * From this point on, mapping will be re-verified if necessary and * folio lock will be acquired only if it is unavoidable * * Mapping checks require the folio so it is looked up now. For * anonymous pages, it does not matter if the folio is split * in the future as the key is based on the address. For * filesystem-backed pages, the precise page is required as the * index of the page determines the key. */ folio = page_folio(page); mapping = READ_ONCE(folio->mapping); /* * If folio->mapping is NULL, then it cannot be an anonymous * page; but it might be the ZERO_PAGE or in the gate area or * in a special mapping (all cases which we are happy to fail); * or it may have been a good file page when get_user_pages_fast * found it, but truncated or holepunched or subjected to * invalidate_complete_page2 before we got the folio lock (also * cases which we are happy to fail). And we hold a reference, * so refcount care in invalidate_inode_page's remove_mapping * prevents drop_caches from setting mapping to NULL beneath us. * * The case we do have to guard against is when memory pressure made * shmem_writepage move it from filecache to swapcache beneath us: * an unlikely race, but we do need to retry for folio->mapping. 
*/ if (unlikely(!mapping)) { int shmem_swizzled; /* * Folio lock is required to identify which special case above * applies. If this is really a shmem page then the folio lock * will prevent unexpected transitions. */ folio_lock(folio); shmem_swizzled = folio_test_swapcache(folio) || folio->mapping; folio_unlock(folio); folio_put(folio); if (shmem_swizzled) goto again; return -EFAULT; } /* * Private mappings are handled in a simple way. * * If the futex key is stored in anonymous memory, then the associated * object is the mm which is implicitly pinned by the calling process. * * NOTE: When userspace waits on a MAP_SHARED mapping, even if * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. */ if (folio_test_anon(folio)) { /* * A RO anonymous page will never change and thus doesn't make * sense for futex operations. */ if (unlikely(should_fail_futex(true)) || ro) { err = -EFAULT; goto out; } key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; } else { struct inode *inode; /* * The associated futex object in this case is the inode and * the folio->mapping must be traversed. Ordinarily this should * be stabilised under folio lock but it's not strictly * necessary in this case as we just want to pin the inode, not * update i_pages or anything like that. * * The RCU read lock is taken as the inode is finally freed * under RCU. If the mapping still matches expectations then the * mapping->host can be safely accessed as being a valid inode. */ rcu_read_lock(); if (READ_ONCE(folio->mapping) != mapping) { rcu_read_unlock(); folio_put(folio); goto again; } inode = READ_ONCE(mapping->host); if (!inode) { rcu_read_unlock(); folio_put(folio); goto again; } key->both.offset |= FUT_OFF_INODE; /* inode-based key */ key->shared.i_seq = get_inode_sequence_number(inode); key->shared.pgoff = page_pgoff(folio, page); rcu_read_unlock(); } out: folio_put(folio); return err; } /** * fault_in_user_writeable() - Fault in user address and verify RW access * @uaddr: pointer to faulting user space address * * Slow path to fixup the fault we just took in the atomic write * access to @uaddr. * * We have no generic implementation of a non-destructive write to the * user address. We know that we faulted in the atomic pagefault * disabled section so we can as well avoid the #PF overhead by * calling get_user_pages() right away. */ int fault_in_user_writeable(u32 __user *uaddr) { struct mm_struct *mm = current->mm; int ret; mmap_read_lock(mm); ret = fixup_user_fault(mm, (unsigned long)uaddr, FAULT_FLAG_WRITE, NULL); mmap_read_unlock(mm); return ret < 0 ? ret : 0; } /** * futex_top_waiter() - Return the highest priority waiter on a futex * @hb: the hash bucket the futex_q's reside in * @key: the futex key (to distinguish it from other futex futex_q's) * * Must be called with the hb lock held. */ struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key) { struct futex_q *this; plist_for_each_entry(this, &hb->chain, list) { if (futex_match(&this->key, key)) return this; } return NULL; } /** * wait_for_owner_exiting - Block until the owner has exited * @ret: owner's current futex lock status * @exiting: Pointer to the exiting task * * Caller must hold a refcount on @exiting. 
*/ void wait_for_owner_exiting(int ret, struct task_struct *exiting) { if (ret != -EBUSY) { WARN_ON_ONCE(exiting); return; } if (WARN_ON_ONCE(ret == -EBUSY && !exiting)) return; mutex_lock(&exiting->futex_exit_mutex); /* * No point in doing state checking here. If the waiter got here * while the task was in exec()->exec_futex_release() then it can * have any FUTEX_STATE_* value when the waiter has acquired the * mutex. OK, if running, EXITING or DEAD if it reached exit() * already. Highly unlikely and not a problem. Just one more round * through the futex maze. */ mutex_unlock(&exiting->futex_exit_mutex); put_task_struct(exiting); } /** * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket * @q: The futex_q to unqueue * * The q->lock_ptr must not be NULL and must be held by the caller. */ void __futex_unqueue(struct futex_q *q) { struct futex_hash_bucket *hb; if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list))) return; lockdep_assert_held(q->lock_ptr); hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); plist_del(&q->list, &hb->chain); futex_hb_waiters_dec(hb); } /* The key must be already stored in q->key. */ void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb) { /* * Increment the counter before taking the lock so that * a potential waker won't miss a to-be-slept task that is * waiting for the spinlock. This is safe as all futex_q_lock() * users end up calling futex_queue(). Similarly, for housekeeping, * decrement the counter at futex_q_unlock() when some error has * occurred and we don't end up adding the task to the list. */ futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */ q->lock_ptr = &hb->lock; spin_lock(&hb->lock); __acquire(q->lock_ptr); } void futex_q_unlock(struct futex_hash_bucket *hb) { futex_hb_waiters_dec(hb); spin_unlock(&hb->lock); } void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, struct task_struct *task) { int prio; /* * The priority used to register this element is * - either the real thread-priority for the real-time threads * (i.e. threads with a priority lower than MAX_RT_PRIO) * - or MAX_RT_PRIO for non-RT threads. * Thus, all RT-threads are woken first in priority order, and * the others are woken last, in FIFO order. */ prio = min(current->normal_prio, MAX_RT_PRIO); plist_node_init(&q->list, prio); plist_add(&q->list, &hb->chain); q->task = task; } /** * futex_unqueue() - Remove the futex_q from its futex_hash_bucket * @q: The futex_q to unqueue * * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must * be paired with exactly one earlier call to futex_queue(). * * Return: * - 1 - if the futex_q was still queued (and we removed unqueued it); * - 0 - if the futex_q was already removed by the waking thread */ int futex_unqueue(struct futex_q *q) { spinlock_t *lock_ptr; int ret = 0; /* RCU so lock_ptr is not going away during locking. */ guard(rcu)(); /* In the common case we don't take the spinlock, which is nice. */ retry: /* * q->lock_ptr can change between this read and the following spin_lock. * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and * optimizing lock_ptr out of the logic below. */ lock_ptr = READ_ONCE(q->lock_ptr); if (lock_ptr != NULL) { spin_lock(lock_ptr); /* * q->lock_ptr can change between reading it and * spin_lock(), causing us to take the wrong lock. This * corrects the race condition. 
* * Reasoning goes like this: if we have the wrong lock, * q->lock_ptr must have changed (maybe several times) * between reading it and the spin_lock(). It can * change again after the spin_lock() but only if it was * already changed before the spin_lock(). It cannot, * however, change back to the original value. Therefore * we can detect whether we acquired the correct lock. */ if (unlikely(lock_ptr != q->lock_ptr)) { spin_unlock(lock_ptr); goto retry; } __futex_unqueue(q); BUG_ON(q->pi_state); spin_unlock(lock_ptr); ret = 1; } return ret; } void futex_q_lockptr_lock(struct futex_q *q) { spinlock_t *lock_ptr; /* * See futex_unqueue() why lock_ptr can change. */ guard(rcu)(); retry: lock_ptr = READ_ONCE(q->lock_ptr); spin_lock(lock_ptr); if (unlikely(lock_ptr != q->lock_ptr)) { spin_unlock(lock_ptr); goto retry; } } /* * PI futexes can not be requeued and must remove themselves from the hash * bucket. The hash bucket lock (i.e. lock_ptr) is held. */ void futex_unqueue_pi(struct futex_q *q) { /* * If the lock was not acquired (due to timeout or signal) then the * rt_waiter is removed before futex_q is. If this is observed by * an unlocker after dropping the rtmutex wait lock and before * acquiring the hash bucket lock, then the unlocker dequeues the * futex_q from the hash bucket list to guarantee consistent state * vs. userspace. Therefore the dequeue here must be conditional. */ if (!plist_node_empty(&q->list)) __futex_unqueue(q); BUG_ON(!q->pi_state); put_pi_state(q->pi_state); q->pi_state = NULL; } /* Constants for the pending_op argument of handle_futex_death */ #define HANDLE_DEATH_PENDING true #define HANDLE_DEATH_LIST false /* * Process a futex-list entry, check whether it's owned by the * dying task, and do notification if so: */ static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, bool pi, bool pending_op) { u32 uval, nval, mval; pid_t owner; int err; /* Futex address must be 32bit aligned */ if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) return -1; retry: if (get_user(uval, uaddr)) return -1; /* * Special case for regular (non PI) futexes. The unlock path in * user space has two race scenarios: * * 1. The unlock path releases the user space futex value and * before it can execute the futex() syscall to wake up * waiters it is killed. * * 2. A woken up waiter is killed before it can acquire the * futex in user space. * * In the second case, the wake up notification could be generated * by the unlock path in user space after setting the futex value * to zero or by the kernel after setting the OWNER_DIED bit below. * * In both cases the TID validation below prevents a wakeup of * potential waiters which can cause these waiters to block * forever. * * In both cases the following conditions are met: * * 1) task->robust_list->list_op_pending != NULL * @pending_op == true * 2) The owner part of user space futex value == 0 * 3) Regular futex: @pi == false * * If these conditions are met, it is safe to attempt waking up a * potential waiter without touching the user space futex value and * trying to set the OWNER_DIED bit. If the futex value is zero, * the rest of the user space mutex state is consistent, so a woken * waiter will just take over the uncontended futex. Setting the * OWNER_DIED bit would create inconsistent state and malfunction * of the user space owner died handling. Otherwise, the OWNER_DIED * bit is already set, and the woken waiter is expected to deal with * this. 
*/ owner = uval & FUTEX_TID_MASK; if (pending_op && !pi && !owner) { futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1, FUTEX_BITSET_MATCH_ANY); return 0; } if (owner != task_pid_vnr(curr)) return 0; /* * Ok, this dying thread is truly holding a futex * of interest. Set the OWNER_DIED bit atomically * via cmpxchg, and if the value had FUTEX_WAITERS * set, wake up a waiter (if any). (We have to do a * futex_wake() even if OWNER_DIED is already set - * to handle the rare but possible case of recursive * thread-death.) The rest of the cleanup is done in * userspace. */ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; /* * We are not holding a lock here, but we want to have * the pagefault_disable/enable() protection because * we want to handle the fault gracefully. If the * access fails we try to fault in the futex with R/W * verification via get_user_pages. get_user() above * does not guarantee R/W access. If that fails we * give up and leave the futex locked. */ if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) { switch (err) { case -EFAULT: if (fault_in_user_writeable(uaddr)) return -1; goto retry; case -EAGAIN: cond_resched(); goto retry; default: WARN_ON_ONCE(1); return err; } } if (nval != uval) goto retry; /* * Wake robust non-PI futexes here. The wakeup of * PI futexes happens in exit_pi_state(): */ if (!pi && (uval & FUTEX_WAITERS)) { futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1, FUTEX_BITSET_MATCH_ANY); } return 0; } /* * Fetch a robust-list pointer. Bit 0 signals PI futexes: */ static inline int fetch_robust_entry(struct robust_list __user **entry, struct robust_list __user * __user *head, unsigned int *pi) { unsigned long uentry; if (get_user(uentry, (unsigned long __user *)head)) return -EFAULT; *entry = (void __user *)(uentry & ~1UL); *pi = uentry & 1; return 0; } /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. * * We silently return on any sign of list-walking problem. 
*/ static void exit_robust_list(struct task_struct *curr) { struct robust_list_head __user *head = curr->robust_list; struct robust_list __user *entry, *next_entry, *pending; unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; unsigned int next_pi; unsigned long futex_offset; int rc; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ if (fetch_robust_entry(&entry, &head->list.next, &pi)) return; /* * Fetch the relative futex offset: */ if (get_user(futex_offset, &head->futex_offset)) return; /* * Fetch any possibly pending lock-add first, and handle it * if it exists: */ if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) return; next_entry = NULL; /* avoid warning with gcc */ while (entry != &head->list) { /* * Fetch the next entry in the list before calling * handle_futex_death: */ rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); /* * A pending lock might already be on the list, so * don't process it twice: */ if (entry != pending) { if (handle_futex_death((void __user *)entry + futex_offset, curr, pi, HANDLE_DEATH_LIST)) return; } if (rc) return; entry = next_entry; pi = next_pi; /* * Avoid excessively long or circular lists: */ if (!--limit) break; cond_resched(); } if (pending) { handle_futex_death((void __user *)pending + futex_offset, curr, pip, HANDLE_DEATH_PENDING); } } #ifdef CONFIG_COMPAT static void __user *futex_uaddr(struct robust_list __user *entry, compat_long_t futex_offset) { compat_uptr_t base = ptr_to_compat(entry); void __user *uaddr = compat_ptr(base + futex_offset); return uaddr; } /* * Fetch a robust-list pointer. Bit 0 signals PI futexes: */ static inline int compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, compat_uptr_t __user *head, unsigned int *pi) { if (get_user(*uentry, head)) return -EFAULT; *entry = compat_ptr((*uentry) & ~1); *pi = (unsigned int)(*uentry) & 1; return 0; } /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. * * We silently return on any sign of list-walking problem. 
*/ static void compat_exit_robust_list(struct task_struct *curr) { struct compat_robust_list_head __user *head = curr->compat_robust_list; struct robust_list __user *entry, *next_entry, *pending; unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; unsigned int next_pi; compat_uptr_t uentry, next_uentry, upending; compat_long_t futex_offset; int rc; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) return; /* * Fetch the relative futex offset: */ if (get_user(futex_offset, &head->futex_offset)) return; /* * Fetch any possibly pending lock-add first, and handle it * if it exists: */ if (compat_fetch_robust_entry(&upending, &pending, &head->list_op_pending, &pip)) return; next_entry = NULL; /* avoid warning with gcc */ while (entry != (struct robust_list __user *) &head->list) { /* * Fetch the next entry in the list before calling * handle_futex_death: */ rc = compat_fetch_robust_entry(&next_uentry, &next_entry, (compat_uptr_t __user *)&entry->next, &next_pi); /* * A pending lock might already be on the list, so * dont process it twice: */ if (entry != pending) { void __user *uaddr = futex_uaddr(entry, futex_offset); if (handle_futex_death(uaddr, curr, pi, HANDLE_DEATH_LIST)) return; } if (rc) return; uentry = next_uentry; entry = next_entry; pi = next_pi; /* * Avoid excessively long or circular lists: */ if (!--limit) break; cond_resched(); } if (pending) { void __user *uaddr = futex_uaddr(pending, futex_offset); handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING); } } #endif #ifdef CONFIG_FUTEX_PI /* * This task is holding PI mutexes at exit time => bad. * Kernel cleans up PI-state, but userspace is likely hosed. * (Robust-futex cleanup is separate and might save the day for userspace.) */ static void exit_pi_state_list(struct task_struct *curr) { struct list_head *next, *head = &curr->pi_state_list; struct futex_pi_state *pi_state; union futex_key key = FUTEX_KEY_INIT; /* * The mutex mm_struct::futex_hash_lock might be acquired. */ might_sleep(); /* * Ensure the hash remains stable (no resize) during the while loop * below. The hb pointer is acquired under the pi_lock so we can't block * on the mutex. */ WARN_ON(curr != current); guard(private_hash)(); /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful * versus waiters unqueueing themselves: */ raw_spin_lock_irq(&curr->pi_lock); while (!list_empty(head)) { next = head->next; pi_state = list_entry(next, struct futex_pi_state, list); key = pi_state->key; if (1) { CLASS(hb, hb)(&key); /* * We can race against put_pi_state() removing itself from the * list (a waiter going away). put_pi_state() will first * decrement the reference count and then modify the list, so * its possible to see the list entry but fail this reference * acquire. * * In that case; drop the locks to let put_pi_state() make * progress and retry the loop. 
*/ if (!refcount_inc_not_zero(&pi_state->refcount)) { raw_spin_unlock_irq(&curr->pi_lock); cpu_relax(); raw_spin_lock_irq(&curr->pi_lock); continue; } raw_spin_unlock_irq(&curr->pi_lock); spin_lock(&hb->lock); raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); raw_spin_lock(&curr->pi_lock); /* * We dropped the pi-lock, so re-check whether this * task still owns the PI-state: */ if (head->next != next) { /* retain curr->pi_lock for the loop invariant */ raw_spin_unlock(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); put_pi_state(pi_state); continue; } WARN_ON(pi_state->owner != curr); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); pi_state->owner = NULL; raw_spin_unlock(&curr->pi_lock); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); } rt_mutex_futex_unlock(&pi_state->pi_mutex); put_pi_state(pi_state); raw_spin_lock_irq(&curr->pi_lock); } raw_spin_unlock_irq(&curr->pi_lock); } #else static inline void exit_pi_state_list(struct task_struct *curr) { } #endif static void futex_cleanup(struct task_struct *tsk) { if (unlikely(tsk->robust_list)) { exit_robust_list(tsk); tsk->robust_list = NULL; } #ifdef CONFIG_COMPAT if (unlikely(tsk->compat_robust_list)) { compat_exit_robust_list(tsk); tsk->compat_robust_list = NULL; } #endif if (unlikely(!list_empty(&tsk->pi_state_list))) exit_pi_state_list(tsk); } /** * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD * @tsk: task to set the state on * * Set the futex exit state of the task lockless. The futex waiter code * observes that state when a task is exiting and loops until the task has * actually finished the futex cleanup. The worst case for this is that the * waiter runs through the wait loop until the state becomes visible. * * This is called from the recursive fault handling path in make_task_dead(). * * This is best effort. Either the futex exit code has run already or * not. If the OWNER_DIED bit has been set on the futex then the waiter can * take it over. If not, the problem is pushed back to user space. If the * futex exit code did not run yet, then an already queued waiter might * block forever, but there is nothing which can be done about that. */ void futex_exit_recursive(struct task_struct *tsk) { /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */ if (tsk->futex_state == FUTEX_STATE_EXITING) { __assume_ctx_lock(&tsk->futex_exit_mutex); mutex_unlock(&tsk->futex_exit_mutex); } tsk->futex_state = FUTEX_STATE_DEAD; } static void futex_cleanup_begin(struct task_struct *tsk) __acquires(&tsk->futex_exit_mutex) { /* * Prevent various race issues against a concurrent incoming waiter * including live locks by forcing the waiter to block on * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in * attach_to_pi_owner(). */ mutex_lock(&tsk->futex_exit_mutex); /* * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. * * This ensures that all subsequent checks of tsk->futex_state in * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with * tsk->pi_lock held. * * It guarantees also that a pi_state which was queued right before * the state change under tsk->pi_lock by a concurrent waiter must * be observed in exit_pi_state_list(). */ raw_spin_lock_irq(&tsk->pi_lock); tsk->futex_state = FUTEX_STATE_EXITING; raw_spin_unlock_irq(&tsk->pi_lock); } static void futex_cleanup_end(struct task_struct *tsk, int state) __releases(&tsk->futex_exit_mutex) { /* * Lockless store. 
The only side effect is that an observer might * take another loop until it becomes visible. */ tsk->futex_state = state; /* * Drop the exit protection. This unblocks waiters which observed * FUTEX_STATE_EXITING to reevaluate the state. */ mutex_unlock(&tsk->futex_exit_mutex); } void futex_exec_release(struct task_struct *tsk) { /* * The state handling is done for consistency, but in the case of * exec() there is no way to prevent further damage as the PID stays * the same. But for the unlikely and arguably buggy case that a * futex is held on exec(), this provides at least as much state * consistency protection which is possible. */ futex_cleanup_begin(tsk); futex_cleanup(tsk); /* * Reset the state to FUTEX_STATE_OK. The task is alive and about * exec a new binary. */ futex_cleanup_end(tsk, FUTEX_STATE_OK); } void futex_exit_release(struct task_struct *tsk) { futex_cleanup_begin(tsk); futex_cleanup(tsk); futex_cleanup_end(tsk, FUTEX_STATE_DEAD); } static void futex_hash_bucket_init(struct futex_hash_bucket *fhb, struct futex_private_hash *fph) { #ifdef CONFIG_FUTEX_PRIVATE_HASH fhb->priv = fph; #endif atomic_set(&fhb->waiters, 0); plist_head_init(&fhb->chain); spin_lock_init(&fhb->lock); } #define FH_CUSTOM 0x01 #ifdef CONFIG_FUTEX_PRIVATE_HASH /* * futex-ref * * Heavily inspired by percpu-rwsem/percpu-refcount; not reusing any of that * code because it just doesn't fit right. * * Dual counter, per-cpu / atomic approach like percpu-refcount, except it * re-initializes the state automatically, such that the fph swizzle is also a * transition back to per-cpu. */ static void futex_ref_rcu(struct rcu_head *head); static void __futex_ref_atomic_begin(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; /* * The counter we're about to switch to must have fully switched; * otherwise it would be impossible for it to have reported success * from futex_ref_is_dead(). */ WARN_ON_ONCE(atomic_long_read(&mm->futex_atomic) != 0); /* * Set the atomic to the bias value such that futex_ref_{get,put}() * will never observe 0. Will be fixed up in __futex_ref_atomic_end() * when folding in the percpu count. */ atomic_long_set(&mm->futex_atomic, LONG_MAX); smp_store_release(&fph->state, FR_ATOMIC); call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu); } static void __futex_ref_atomic_end(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; unsigned int count = 0; long ret; int cpu; /* * Per __futex_ref_atomic_begin() the state of the fph must be ATOMIC * and per this RCU callback, everybody must now observe this state and * use the atomic variable. */ WARN_ON_ONCE(fph->state != FR_ATOMIC); /* * Therefore the per-cpu counter is now stable, sum and reset. */ for_each_possible_cpu(cpu) { unsigned int *ptr = per_cpu_ptr(mm->futex_ref, cpu); count += *ptr; *ptr = 0; } /* * Re-init for the next cycle. */ this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */ /* * Add actual count, subtract bias and initial refcount. * * The moment this atomic operation happens, futex_ref_is_dead() can * become true. */ ret = atomic_long_add_return(count - LONG_MAX - 1, &mm->futex_atomic); if (!ret) wake_up_var(mm); WARN_ON_ONCE(ret < 0); mmput_async(mm); } static void futex_ref_rcu(struct rcu_head *head) { struct mm_struct *mm = container_of(head, struct mm_struct, futex_rcu); struct futex_private_hash *fph = rcu_dereference_raw(mm->futex_phash); if (fph->state == FR_PERCPU) { /* * Per this extra grace-period, everybody must now observe * fph as the current fph and no previously observed fph's * are in-flight. 
* * Notably, nobody will now rely on the atomic * futex_ref_is_dead() state anymore so we can begin the * migration of the per-cpu counter into the atomic. */ __futex_ref_atomic_begin(fph); return; } __futex_ref_atomic_end(fph); } /* * Drop the initial refcount and transition to atomics. */ static void futex_ref_drop(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; /* * Can only transition the current fph; */ WARN_ON_ONCE(rcu_dereference_raw(mm->futex_phash) != fph); /* * We enqueue at least one RCU callback. Ensure mm stays if the task * exits before the transition is completed. */ mmget(mm); /* * In order to avoid the following scenario: * * futex_hash() __futex_pivot_hash() * guard(rcu); guard(mm->futex_hash_lock); * fph = mm->futex_phash; * rcu_assign_pointer(&mm->futex_phash, new); * futex_hash_allocate() * futex_ref_drop() * fph->state = FR_ATOMIC; * atomic_set(, BIAS); * * futex_private_hash_get(fph); // OOPS * * Where an old fph (which is FR_ATOMIC) and should fail on * inc_not_zero, will succeed because a new transition is started and * the atomic is bias'ed away from 0. * * There must be at least one full grace-period between publishing a * new fph and trying to replace it. */ if (poll_state_synchronize_rcu(mm->futex_batches)) { /* * There was a grace-period, we can begin now. */ __futex_ref_atomic_begin(fph); return; } call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu); } static bool futex_ref_get(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; guard(preempt)(); if (READ_ONCE(fph->state) == FR_PERCPU) { __this_cpu_inc(*mm->futex_ref); return true; } return atomic_long_inc_not_zero(&mm->futex_atomic); } static bool futex_ref_put(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; guard(preempt)(); if (READ_ONCE(fph->state) == FR_PERCPU) { __this_cpu_dec(*mm->futex_ref); return false; } return atomic_long_dec_and_test(&mm->futex_atomic); } static bool futex_ref_is_dead(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; guard(rcu)(); if (smp_load_acquire(&fph->state) == FR_PERCPU) return false; return atomic_long_read(&mm->futex_atomic) == 0; } int futex_mm_init(struct mm_struct *mm) { mutex_init(&mm->futex_hash_lock); RCU_INIT_POINTER(mm->futex_phash, NULL); mm->futex_phash_new = NULL; /* futex-ref */ mm->futex_ref = NULL; atomic_long_set(&mm->futex_atomic, 0); mm->futex_batches = get_state_synchronize_rcu(); return 0; } void futex_hash_free(struct mm_struct *mm) { struct futex_private_hash *fph; free_percpu(mm->futex_ref); kvfree(mm->futex_phash_new); fph = rcu_dereference_raw(mm->futex_phash); if (fph) kvfree(fph); } static bool futex_pivot_pending(struct mm_struct *mm) { struct futex_private_hash *fph; guard(rcu)(); if (!mm->futex_phash_new) return true; fph = rcu_dereference(mm->futex_phash); return futex_ref_is_dead(fph); } static bool futex_hash_less(struct futex_private_hash *a, struct futex_private_hash *b) { /* user provided always wins */ if (!a->custom && b->custom) return true; if (a->custom && !b->custom) return false; /* zero-sized hash wins */ if (!b->hash_mask) return true; if (!a->hash_mask) return false; /* keep the biggest */ if (a->hash_mask < b->hash_mask) return true; if (a->hash_mask > b->hash_mask) return false; return false; /* equal */ } static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) { struct mm_struct *mm = current->mm; struct futex_private_hash *fph; bool custom = flags & FH_CUSTOM; int i; if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots))) return 
-EINVAL; /* * Once we've disabled the global hash there is no way back. */ scoped_guard(rcu) { fph = rcu_dereference(mm->futex_phash); if (fph && !fph->hash_mask) { if (custom) return -EBUSY; return 0; } } if (!mm->futex_ref) { /* * This will always be allocated by the first thread and * therefore requires no locking. */ mm->futex_ref = alloc_percpu(unsigned int); if (!mm->futex_ref) return -ENOMEM; this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */ } fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!fph) return -ENOMEM; fph->hash_mask = hash_slots ? hash_slots - 1 : 0; fph->custom = custom; fph->mm = mm; for (i = 0; i < hash_slots; i++) futex_hash_bucket_init(&fph->queues[i], fph); if (custom) { /* * Only let prctl() wait / retry; don't unduly delay clone(). */ again: wait_var_event(mm, futex_pivot_pending(mm)); } scoped_guard(mutex, &mm->futex_hash_lock) { struct futex_private_hash *free __free(kvfree) = NULL; struct futex_private_hash *cur, *new; cur = rcu_dereference_protected(mm->futex_phash, lockdep_is_held(&mm->futex_hash_lock)); new = mm->futex_phash_new; mm->futex_phash_new = NULL; if (fph) { if (cur && !cur->hash_mask) { /* * If two threads simultaneously request the global * hash then the first one performs the switch, * the second one returns here. */ free = fph; mm->futex_phash_new = new; return -EBUSY; } if (cur && !new) { /* * If we have an existing hash, but do not yet have * allocated a replacement hash, drop the initial * reference on the existing hash. */ futex_ref_drop(cur); } if (new) { /* * Two updates raced; throw out the lesser one. */ if (futex_hash_less(new, fph)) { free = new; new = fph; } else { free = fph; } } else { new = fph; } fph = NULL; } if (new) { /* * Will set mm->futex_phash_new on failure; * futex_private_hash_get() will try again. 
			 */
			if (!__futex_pivot_hash(mm, new) && custom)
				goto again;
		}
	}

	return 0;
}

int futex_hash_allocate_default(void)
{
	unsigned int threads, buckets, current_buckets = 0;
	struct futex_private_hash *fph;

	if (!current->mm)
		return 0;

	scoped_guard(rcu) {
		threads = min_t(unsigned int,
				get_nr_threads(current),
				num_online_cpus());

		fph = rcu_dereference(current->mm->futex_phash);
		if (fph) {
			if (fph->custom)
				return 0;

			current_buckets = fph->hash_mask + 1;
		}
	}

	/*
	 * The default allocation will remain within
	 *   16 <= threads * 4 <= global hash size
	 */
	buckets = roundup_pow_of_two(4 * threads);
	buckets = clamp(buckets, 16, futex_hashmask + 1);

	if (current_buckets >= buckets)
		return 0;

	return futex_hash_allocate(buckets, 0);
}

static int futex_hash_get_slots(void)
{
	struct futex_private_hash *fph;

	guard(rcu)();
	fph = rcu_dereference(current->mm->futex_phash);
	if (fph && fph->hash_mask)
		return fph->hash_mask + 1;
	return 0;
}

#else

static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
{
	return -EINVAL;
}

static int futex_hash_get_slots(void)
{
	return 0;
}
#endif

int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
{
	unsigned int flags = FH_CUSTOM;
	int ret;

	switch (arg2) {
	case PR_FUTEX_HASH_SET_SLOTS:
		if (arg4)
			return -EINVAL;
		ret = futex_hash_allocate(arg3, flags);
		break;

	case PR_FUTEX_HASH_GET_SLOTS:
		ret = futex_hash_get_slots();
		break;

	default:
		ret = -EINVAL;
		break;
	}
	return ret;
}

static int __init futex_init(void)
{
	unsigned long hashsize, i;
	unsigned int order, n;
	unsigned long size;

#ifdef CONFIG_BASE_SMALL
	hashsize = 16;
#else
	hashsize = 256 * num_possible_cpus();
	hashsize /= num_possible_nodes();
	hashsize = max(4, hashsize);
	hashsize = roundup_pow_of_two(hashsize);
#endif
	futex_hashshift = ilog2(hashsize);
	size = sizeof(struct futex_hash_bucket) * hashsize;
	order = get_order(size);

	for_each_node(n) {
		struct futex_hash_bucket *table;

		if (order > MAX_PAGE_ORDER)
			table = vmalloc_huge_node(size, GFP_KERNEL, n);
		else
			table = alloc_pages_exact_nid(n, size, GFP_KERNEL);

		BUG_ON(!table);

		for (i = 0; i < hashsize; i++)
			futex_hash_bucket_init(&table[i], NULL);

		futex_queues[n] = table;
	}

	futex_hashmask = hashsize - 1;
	pr_info("futex hash table entries: %lu (%lu bytes on %d NUMA nodes, total %lu KiB, %s).\n",
		hashsize, size, num_possible_nodes(),
		size * num_possible_nodes() / 1024,
		order > MAX_PAGE_ORDER ? "vmalloc" : "linear");
	return 0;
}
core_initcall(futex_init);
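
/*
 * Illustrative userspace sketch, not part of this file: resizing the
 * per-process private futex hash through prctl(), which ends up in
 * futex_hash_prctl() above. The top-level PR_FUTEX_HASH option and the
 * PR_FUTEX_HASH_* sub-commands are assumed to come from <linux/prctl.h>
 * on kernels that provide this interface; the slot count must be 0 or a
 * power of two greater than 1, per futex_hash_allocate().
 */
#include <stdio.h>
#include <sys/prctl.h>
#include <linux/prctl.h>

int main(void)
{
	/* Ask for a 64-bucket private hash for this process (arg4 must be 0). */
	if (prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, 64, 0, 0) != 0)
		perror("PR_FUTEX_HASH_SET_SLOTS");

	/* Read back the current bucket count; 0 means no private hash. */
	printf("private futex hash slots: %d\n",
	       (int)prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS, 0, 0, 0));
	return 0;
}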
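
/*
 * Illustrative sketch, not from the kernel headers: the memory layout that
 * get_futex_key() above expects for a 32-bit futex used with FLAGS_NUMA.
 * The operand size is doubled, the node id sits in the second word
 * (uaddr + size / 2) and the pair must be naturally aligned. The struct
 * name below is made up purely for illustration.
 */
#include <linux/types.h>

struct futex_numa_pair32 {
	__u32 value;			/* the futex word itself */
	__u32 node;			/* FUTEX_NO_NODE or a possible node id */
} __attribute__((__aligned__(8)));	/* address % (2 * sizeof(__u32)) must be 0 */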
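
/*
 * Illustrative userspace sketch, not part of this file: registering the
 * robust list head that exit_robust_list() above walks at thread exit.
 * glibc normally does this for every thread, so calling this yourself
 * would replace the C library's registration; it is shown here only to
 * make the structure the kernel walks concrete.
 */
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

static struct robust_list_head robust_head = {
	.list		 = { .next = &robust_head.list },	/* empty, circular list */
	.futex_offset	 = 0,		/* lock word lives at the entry address */
	.list_op_pending = NULL,	/* no lock acquisition in flight */
};

static long register_robust_list(void)
{
	/* There is no glibc wrapper; the C library itself uses the raw syscall. */
	return syscall(SYS_set_robust_list, &robust_head, sizeof(robust_head));
}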
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Security plug functions
 *
 * Copyright (C) 2001 WireX Communications, Inc <chris@wirex.com>
 * Copyright (C) 2001-2002 Greg Kroah-Hartman <greg@kroah.com>
 * Copyright (C) 2001 Networks Associates Technology, Inc <ssmalley@nai.com>
 * Copyright (C) 2016 Mellanox Technologies
 * Copyright (C) 2023 Microsoft Corporation <paul@paul-moore.com>
 */

#define pr_fmt(fmt) "LSM: " fmt

#include <linux/bpf.h>
#include <linux/capability.h>
#include <linux/dcache.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kernel_read_file.h>
#include <linux/lsm_hooks.h>
#include <linux/mman.h>
#include <linux/mount.h> #include <linux/personality.h> #include <linux/backing-dev.h> #include <linux/string.h> #include <linux/xattr.h> #include <linux/msg.h> #include <linux/overflow.h> #include <linux/perf_event.h> #include <linux/fs.h> #include <net/flow.h> #include <net/sock.h> #include "lsm.h" /* * These are descriptions of the reasons that can be passed to the * security_locked_down() LSM hook. Placing this array here allows * all security modules to use the same descriptions for auditing * purposes. */ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX + 1] = { [LOCKDOWN_NONE] = "none", [LOCKDOWN_MODULE_SIGNATURE] = "unsigned module loading", [LOCKDOWN_DEV_MEM] = "/dev/mem,kmem,port", [LOCKDOWN_EFI_TEST] = "/dev/efi_test access", [LOCKDOWN_KEXEC] = "kexec of unsigned images", [LOCKDOWN_HIBERNATION] = "hibernation", [LOCKDOWN_PCI_ACCESS] = "direct PCI access", [LOCKDOWN_IOPORT] = "raw io port access", [LOCKDOWN_MSR] = "raw MSR access", [LOCKDOWN_ACPI_TABLES] = "modifying ACPI tables", [LOCKDOWN_DEVICE_TREE] = "modifying device tree contents", [LOCKDOWN_PCMCIA_CIS] = "direct PCMCIA CIS storage", [LOCKDOWN_TIOCSSERIAL] = "reconfiguration of serial port IO", [LOCKDOWN_MODULE_PARAMETERS] = "unsafe module parameters", [LOCKDOWN_MMIOTRACE] = "unsafe mmio", [LOCKDOWN_DEBUGFS] = "debugfs access", [LOCKDOWN_XMON_WR] = "xmon write access", [LOCKDOWN_BPF_WRITE_USER] = "use of bpf to write user RAM", [LOCKDOWN_DBG_WRITE_KERNEL] = "use of kgdb/kdb to write kernel RAM", [LOCKDOWN_RTAS_ERROR_INJECTION] = "RTAS error injection", [LOCKDOWN_XEN_USER_ACTIONS] = "Xen guest user action", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_KCORE] = "/proc/kcore access", [LOCKDOWN_KPROBES] = "use of kprobes", [LOCKDOWN_BPF_READ_KERNEL] = "use of bpf to read kernel RAM", [LOCKDOWN_DBG_READ_KERNEL] = "use of kgdb/kdb to read kernel RAM", [LOCKDOWN_PERF] = "unsafe use of perf", [LOCKDOWN_TRACEFS] = "use of tracefs", [LOCKDOWN_XMON_RW] = "xmon read and write access", [LOCKDOWN_XFRM_SECRET] = "xfrm SA secret", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; bool lsm_debug __ro_after_init; unsigned int lsm_active_cnt __ro_after_init; const struct lsm_id *lsm_idlist[MAX_LSM_COUNT]; struct lsm_blob_sizes blob_sizes; struct kmem_cache *lsm_file_cache; struct kmem_cache *lsm_backing_file_cache; struct kmem_cache *lsm_inode_cache; #define SECURITY_HOOK_ACTIVE_KEY(HOOK, IDX) security_hook_active_##HOOK##_##IDX /* * Identifier for the LSM static calls. * HOOK is an LSM hook as defined in linux/lsm_hookdefs.h * IDX is the index of the static call. 0 <= NUM < MAX_LSM_COUNT */ #define LSM_STATIC_CALL(HOOK, IDX) lsm_static_call_##HOOK##_##IDX /* * Call the macro M for each LSM hook MAX_LSM_COUNT times. */ #define LSM_LOOP_UNROLL(M, ...) \ do { \ UNROLL(MAX_LSM_COUNT, M, __VA_ARGS__) \ } while (0) #define LSM_DEFINE_UNROLL(M, ...) UNROLL(MAX_LSM_COUNT, M, __VA_ARGS__) #ifdef CONFIG_HAVE_STATIC_CALL #define LSM_HOOK_TRAMP(NAME, NUM) \ &STATIC_CALL_TRAMP(LSM_STATIC_CALL(NAME, NUM)) #else #define LSM_HOOK_TRAMP(NAME, NUM) NULL #endif /* * Define static calls and static keys for each LSM hook. */ #define DEFINE_LSM_STATIC_CALL(NUM, NAME, RET, ...) \ DEFINE_STATIC_CALL_NULL(LSM_STATIC_CALL(NAME, NUM), \ *((RET(*)(__VA_ARGS__))NULL)); \ static DEFINE_STATIC_KEY_FALSE(SECURITY_HOOK_ACTIVE_KEY(NAME, NUM)); #define LSM_HOOK(RET, DEFAULT, NAME, ...) 
\ LSM_DEFINE_UNROLL(DEFINE_LSM_STATIC_CALL, NAME, RET, __VA_ARGS__) #include <linux/lsm_hook_defs.h> #undef LSM_HOOK #undef DEFINE_LSM_STATIC_CALL /* * Initialise a table of static calls for each LSM hook. * DEFINE_STATIC_CALL_NULL invocation above generates a key (STATIC_CALL_KEY) * and a trampoline (STATIC_CALL_TRAMP) which are used to call * __static_call_update when updating the static call. * * The static calls table is used by early LSMs, some architectures can fault on * unaligned accesses and the fault handling code may not be ready by then. * Thus, the static calls table should be aligned to avoid any unhandled faults * in early init. */ struct lsm_static_calls_table static_calls_table __ro_after_init __aligned(sizeof(u64)) = { #define INIT_LSM_STATIC_CALL(NUM, NAME) \ (struct lsm_static_call) { \ .key = &STATIC_CALL_KEY(LSM_STATIC_CALL(NAME, NUM)), \ .trampoline = LSM_HOOK_TRAMP(NAME, NUM), \ .active = &SECURITY_HOOK_ACTIVE_KEY(NAME, NUM), \ }, #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ .NAME = { \ LSM_DEFINE_UNROLL(INIT_LSM_STATIC_CALL, NAME) \ }, #include <linux/lsm_hook_defs.h> #undef LSM_HOOK #undef INIT_LSM_STATIC_CALL }; /** * lsm_file_alloc - allocate a composite file blob * @file: the file that needs a blob * * Allocate the file blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_file_alloc(struct file *file) { if (!lsm_file_cache) { file->f_security = NULL; return 0; } file->f_security = kmem_cache_zalloc(lsm_file_cache, GFP_KERNEL); if (file->f_security == NULL) return -ENOMEM; return 0; } /** * lsm_backing_file_alloc - allocate a composite backing file blob * @backing_file: the backing file * * Allocate the backing file blob for all the modules. * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_backing_file_alloc(struct file *backing_file) { void *blob; if (!lsm_backing_file_cache) { backing_file_set_security(backing_file, NULL); return 0; } blob = kmem_cache_zalloc(lsm_backing_file_cache, GFP_KERNEL); backing_file_set_security(backing_file, blob); if (!blob) return -ENOMEM; return 0; } /** * lsm_blob_alloc - allocate a composite blob * @dest: the destination for the blob * @size: the size of the blob * @gfp: allocation type * * Allocate a blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_blob_alloc(void **dest, size_t size, gfp_t gfp) { if (size == 0) { *dest = NULL; return 0; } *dest = kzalloc(size, gfp); if (*dest == NULL) return -ENOMEM; return 0; } /** * lsm_cred_alloc - allocate a composite cred blob * @cred: the cred that needs a blob * @gfp: allocation type * * Allocate the cred blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ int lsm_cred_alloc(struct cred *cred, gfp_t gfp) { return lsm_blob_alloc(&cred->security, blob_sizes.lbs_cred, gfp); } /** * lsm_inode_alloc - allocate a composite inode blob * @inode: the inode that needs a blob * @gfp: allocation flags * * Allocate the inode blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_inode_alloc(struct inode *inode, gfp_t gfp) { if (!lsm_inode_cache) { inode->i_security = NULL; return 0; } inode->i_security = kmem_cache_zalloc(lsm_inode_cache, gfp); if (inode->i_security == NULL) return -ENOMEM; return 0; } /** * lsm_task_alloc - allocate a composite task blob * @task: the task that needs a blob * * Allocate the task blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. 
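 *
 * Illustrative sketch (hypothetical names, not part of this file): a module
 * that reserves task blob space through its struct lsm_blob_sizes finds its
 * slice of the composite blob at the offset the infrastructure assigns to
 * that field once all modules are registered, along the lines of:
 *
 *   struct example_task_ctx {
 *           u32 flags;
 *   };
 *
 *   static struct lsm_blob_sizes example_blob_sizes __ro_after_init = {
 *           .lbs_task = sizeof(struct example_task_ctx),
 *   };
 *
 *   static inline struct example_task_ctx *example_task(
 *                                   const struct task_struct *task)
 *   {
 *           return task->security + example_blob_sizes.lbs_task;
 *   }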
*/ int lsm_task_alloc(struct task_struct *task) { return lsm_blob_alloc(&task->security, blob_sizes.lbs_task, GFP_KERNEL); } /** * lsm_ipc_alloc - allocate a composite ipc blob * @kip: the ipc that needs a blob * * Allocate the ipc blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_ipc_alloc(struct kern_ipc_perm *kip) { return lsm_blob_alloc(&kip->security, blob_sizes.lbs_ipc, GFP_KERNEL); } #ifdef CONFIG_KEYS /** * lsm_key_alloc - allocate a composite key blob * @key: the key that needs a blob * * Allocate the key blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_key_alloc(struct key *key) { return lsm_blob_alloc(&key->security, blob_sizes.lbs_key, GFP_KERNEL); } #endif /* CONFIG_KEYS */ /** * lsm_msg_msg_alloc - allocate a composite msg_msg blob * @mp: the msg_msg that needs a blob * * Allocate the ipc blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_msg_msg_alloc(struct msg_msg *mp) { return lsm_blob_alloc(&mp->security, blob_sizes.lbs_msg_msg, GFP_KERNEL); } /** * lsm_bdev_alloc - allocate a composite block_device blob * @bdev: the block_device that needs a blob * * Allocate the block_device blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_bdev_alloc(struct block_device *bdev) { return lsm_blob_alloc(&bdev->bd_security, blob_sizes.lbs_bdev, GFP_KERNEL); } #ifdef CONFIG_BPF_SYSCALL /** * lsm_bpf_map_alloc - allocate a composite bpf_map blob * @map: the bpf_map that needs a blob * * Allocate the bpf_map blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_bpf_map_alloc(struct bpf_map *map) { return lsm_blob_alloc(&map->security, blob_sizes.lbs_bpf_map, GFP_KERNEL); } /** * lsm_bpf_prog_alloc - allocate a composite bpf_prog blob * @prog: the bpf_prog that needs a blob * * Allocate the bpf_prog blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_bpf_prog_alloc(struct bpf_prog *prog) { return lsm_blob_alloc(&prog->aux->security, blob_sizes.lbs_bpf_prog, GFP_KERNEL); } /** * lsm_bpf_token_alloc - allocate a composite bpf_token blob * @token: the bpf_token that needs a blob * * Allocate the bpf_token blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_bpf_token_alloc(struct bpf_token *token) { return lsm_blob_alloc(&token->security, blob_sizes.lbs_bpf_token, GFP_KERNEL); } #endif /* CONFIG_BPF_SYSCALL */ /** * lsm_superblock_alloc - allocate a composite superblock blob * @sb: the superblock that needs a blob * * Allocate the superblock blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_superblock_alloc(struct super_block *sb) { return lsm_blob_alloc(&sb->s_security, blob_sizes.lbs_superblock, GFP_KERNEL); } /** * lsm_fill_user_ctx - Fill a user space lsm_ctx structure * @uctx: a userspace LSM context to be filled * @uctx_len: available uctx size (input), used uctx size (output) * @val: the new LSM context value * @val_len: the size of the new LSM context value * @id: LSM id * @flags: LSM defined flags * * Fill all of the fields in a userspace lsm_ctx structure. If @uctx is NULL * simply calculate the required size to output via @utc_len and return * success. * * Returns 0 on success, -E2BIG if userspace buffer is not large enough, * -EFAULT on a copyout error, -ENOMEM if memory can't be allocated. 
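 *
 * Illustrative sketch (hypothetical module, not part of this file, assuming
 * the getselfattr hook signature and the LSM_ATTR_CURRENT / LSM_ID_UNDEF
 * constants from the UAPI headers): an LSM reporting a NUL-terminated context
 * string from an attribute hook can hand the value to this helper and let it
 * handle sizing, alignment and copyout:
 *
 *   static int example_getselfattr(unsigned int attr,
 *                                  struct lsm_ctx __user *ctx,
 *                                  u32 *size, u32 flags)
 *   {
 *           const char *val = "example: unconfined";
 *
 *           if (attr != LSM_ATTR_CURRENT)
 *                   return -EOPNOTSUPP;
 *           return lsm_fill_user_ctx(ctx, size, (void *)val,
 *                                    strlen(val) + 1, LSM_ID_UNDEF, 0);
 *   }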
*/ int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, u32 *uctx_len, void *val, size_t val_len, u64 id, u64 flags) { struct lsm_ctx *nctx = NULL; size_t nctx_len; int rc = 0; nctx_len = ALIGN(struct_size(nctx, ctx, val_len), sizeof(void *)); if (nctx_len > *uctx_len) { rc = -E2BIG; goto out; } /* no buffer - return success/0 and set @uctx_len to the req size */ if (!uctx) goto out; nctx = kzalloc(nctx_len, GFP_KERNEL); if (nctx == NULL) { rc = -ENOMEM; goto out; } nctx->id = id; nctx->flags = flags; nctx->len = nctx_len; nctx->ctx_len = val_len; memcpy(nctx->ctx, val, val_len); if (copy_to_user(uctx, nctx, nctx_len)) rc = -EFAULT; out: kfree(nctx); *uctx_len = nctx_len; return rc; } /* * The default value of the LSM hook is defined in linux/lsm_hook_defs.h and * can be accessed with: * * LSM_RET_DEFAULT(<hook_name>) * * The macros below define static constants for the default value of each * LSM hook. */ #define LSM_RET_DEFAULT(NAME) (NAME##_default) #define DECLARE_LSM_RET_DEFAULT_void(DEFAULT, NAME) #define DECLARE_LSM_RET_DEFAULT_int(DEFAULT, NAME) \ static const int __maybe_unused LSM_RET_DEFAULT(NAME) = (DEFAULT); #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ DECLARE_LSM_RET_DEFAULT_##RET(DEFAULT, NAME) #include <linux/lsm_hook_defs.h> #undef LSM_HOOK /* * Hook list operation macros. * * call_void_hook: * This is a hook that does not return a value. * * call_int_hook: * This is a hook that returns a value. */ #define __CALL_STATIC_VOID(NUM, HOOK, ...) \ do { \ if (static_branch_unlikely(&SECURITY_HOOK_ACTIVE_KEY(HOOK, NUM))) { \ static_call(LSM_STATIC_CALL(HOOK, NUM))(__VA_ARGS__); \ } \ } while (0); #define call_void_hook(HOOK, ...) \ do { \ LSM_LOOP_UNROLL(__CALL_STATIC_VOID, HOOK, __VA_ARGS__); \ } while (0) #define __CALL_STATIC_INT(NUM, R, HOOK, LABEL, ...) \ do { \ if (static_branch_unlikely(&SECURITY_HOOK_ACTIVE_KEY(HOOK, NUM))) { \ R = static_call(LSM_STATIC_CALL(HOOK, NUM))(__VA_ARGS__); \ if (R != LSM_RET_DEFAULT(HOOK)) \ goto LABEL; \ } \ } while (0); #define call_int_hook(HOOK, ...) \ ({ \ __label__ OUT; \ int RC = LSM_RET_DEFAULT(HOOK); \ \ LSM_LOOP_UNROLL(__CALL_STATIC_INT, RC, HOOK, OUT, __VA_ARGS__); \ OUT: \ RC; \ }) #define lsm_for_each_hook(scall, NAME) \ for (scall = static_calls_table.NAME; \ scall - static_calls_table.NAME < MAX_LSM_COUNT; scall++) \ if (static_key_enabled(&scall->active->key)) /* Security operations */ /** * security_binder_set_context_mgr() - Check if becoming binder ctx mgr is ok * @mgr: task credentials of current binder process * * Check whether @mgr is allowed to be the binder context manager. * * Return: Return 0 if permission is granted. */ int security_binder_set_context_mgr(const struct cred *mgr) { return call_int_hook(binder_set_context_mgr, mgr); } /** * security_binder_transaction() - Check if a binder transaction is allowed * @from: sending process * @to: receiving process * * Check whether @from is allowed to invoke a binder transaction call to @to. * * Return: Returns 0 if permission is granted. */ int security_binder_transaction(const struct cred *from, const struct cred *to) { return call_int_hook(binder_transaction, from, to); } /** * security_binder_transfer_binder() - Check if a binder transfer is allowed * @from: sending process * @to: receiving process * * Check whether @from is allowed to transfer a binder reference to @to. * * Return: Returns 0 if permission is granted. 
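 *
 * Illustrative sketch (hypothetical module and policy, not part of this
 * file): an implementation of this hook is supplied through the usual
 * LSM_HOOK_INIT()/security_add_hooks() registration, for example a policy
 * that refuses transfers between different effective UIDs:
 *
 *   static int example_binder_transfer_binder(const struct cred *from,
 *                                             const struct cred *to)
 *   {
 *           if (!uid_eq(from->euid, to->euid))
 *                   return -EPERM;
 *           return 0;
 *   }
 *
 *   static struct security_hook_list example_hooks[] __ro_after_init = {
 *           LSM_HOOK_INIT(binder_transfer_binder,
 *                         example_binder_transfer_binder),
 *   };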
 */
int security_binder_transfer_binder(const struct cred *from,
				    const struct cred *to)
{
	return call_int_hook(binder_transfer_binder, from, to);
}

/**
 * security_binder_transfer_file() - Check if a binder file xfer is allowed
 * @from: sending process
 * @to: receiving process
 * @file: file being transferred
 *
 * Check whether @from is allowed to transfer @file to @to.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_binder_transfer_file(const struct cred *from,
				  const struct cred *to,
				  const struct file *file)
{
	return call_int_hook(binder_transfer_file, from, to, file);
}

/**
 * security_ptrace_access_check() - Check if tracing is allowed
 * @child: target process
 * @mode: PTRACE_MODE flags
 *
 * Check permission before allowing the current process to trace the @child
 * process. Security modules may also want to perform a process tracing check
 * during an execve, in the credential-setting hooks (e.g.
 * bprm_creds_for_exec), if the process is being traced and its security
 * attributes would be changed by the execve.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_ptrace_access_check(struct task_struct *child, unsigned int mode)
{
	return call_int_hook(ptrace_access_check, child, mode);
}

/**
 * security_ptrace_traceme() - Check if tracing is allowed
 * @parent: tracing process
 *
 * Check that the @parent process has sufficient permission to trace the
 * current process before allowing the current process to present itself to
 * the @parent process for tracing.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_ptrace_traceme(struct task_struct *parent)
{
	return call_int_hook(ptrace_traceme, parent);
}

/**
 * security_capget() - Get the capability sets for a process
 * @target: target process
 * @effective: effective capability set
 * @inheritable: inheritable capability set
 * @permitted: permitted capability set
 *
 * Get the @effective, @inheritable, and @permitted capability sets for the
 * @target process. The hook may also perform permission checking to
 * determine if the current process is allowed to see the capability sets of
 * the @target process.
 *
 * Return: Returns 0 if the capability sets were successfully obtained.
 */
int security_capget(const struct task_struct *target,
		    kernel_cap_t *effective,
		    kernel_cap_t *inheritable,
		    kernel_cap_t *permitted)
{
	return call_int_hook(capget, target, effective, inheritable, permitted);
}

/**
 * security_capset() - Set the capability sets for a process
 * @new: new credentials for the target process
 * @old: current credentials of the target process
 * @effective: effective capability set
 * @inheritable: inheritable capability set
 * @permitted: permitted capability set
 *
 * Set the @effective, @inheritable, and @permitted capability sets for the
 * current process.
 *
 * Return: Returns 0 and updates @new if permission is granted.
 */
int security_capset(struct cred *new, const struct cred *old,
		    const kernel_cap_t *effective,
		    const kernel_cap_t *inheritable,
		    const kernel_cap_t *permitted)
{
	return call_int_hook(capset, new, old, effective, inheritable,
			     permitted);
}

/**
 * security_capable() - Check if a process has the necessary capability
 * @cred: credentials to examine
 * @ns: user namespace
 * @cap: capability requested
 * @opts: capability check options
 *
 * Check whether @cred has the @cap capability in the user namespace @ns.
 * @cap contains the capability <include/linux/capability.h>.
* @opts contains options for the capable check <include/linux/security.h>. * * Return: Returns 0 if the capability is granted. */ int security_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) { return call_int_hook(capable, cred, ns, cap, opts); } /** * security_quotactl() - Check if a quotactl() syscall is allowed for this fs * @cmds: commands * @type: type * @id: id * @sb: filesystem * * Check whether the quotactl syscall is allowed for this @sb. * * Return: Returns 0 if permission is granted. */ int security_quotactl(int cmds, int type, int id, const struct super_block *sb) { return call_int_hook(quotactl, cmds, type, id, sb); } /** * security_quota_on() - Check if QUOTAON is allowed for a dentry * @dentry: dentry * * Check whether QUOTAON is allowed for @dentry. * * Return: Returns 0 if permission is granted. */ int security_quota_on(struct dentry *dentry) { return call_int_hook(quota_on, dentry); } /** * security_syslog() - Check if accessing the kernel message ring is allowed * @type: SYSLOG_ACTION_* type * * Check permission before accessing the kernel message ring or changing * logging to the console. See the syslog(2) manual page for an explanation of * the @type values. * * Return: Return 0 if permission is granted. */ int security_syslog(int type) { return call_int_hook(syslog, type); } /** * security_settime64() - Check if changing the system time is allowed * @ts: new time * @tz: timezone * * Check permission to change the system time, struct timespec64 is defined in * <include/linux/time64.h> and timezone is defined in <include/linux/time.h>. * * Return: Returns 0 if permission is granted. */ int security_settime64(const struct timespec64 *ts, const struct timezone *tz) { return call_int_hook(settime, ts, tz); } /** * security_vm_enough_memory_mm() - Check if allocating a new mem map is allowed * @mm: mm struct * @pages: number of pages * * Check permissions for allocating a new virtual mapping. If all LSMs return * a positive value, __vm_enough_memory() will be called with cap_sys_admin * set. If at least one LSM returns 0 or negative, __vm_enough_memory() will be * called with cap_sys_admin cleared. * * Return: Returns 0 if permission is granted by the LSM infrastructure to the * caller. */ int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) { struct lsm_static_call *scall; int cap_sys_admin = 1; int rc; /* * The module will respond with 0 if it thinks the __vm_enough_memory() * call should be made with the cap_sys_admin set. If all of the modules * agree that it should be set it will. If any module thinks it should * not be set it won't. */ lsm_for_each_hook(scall, vm_enough_memory) { rc = scall->hl->hook.vm_enough_memory(mm, pages); if (rc < 0) { cap_sys_admin = 0; break; } } return __vm_enough_memory(mm, pages, cap_sys_admin); } /** * security_bprm_creds_for_exec() - Prepare the credentials for exec() * @bprm: binary program information * * If the setup in prepare_exec_creds did not setup @bprm->cred->security * properly for executing @bprm->file, update the LSM's portion of * @bprm->cred->security to be what commit_creds needs to install for the new * program. This hook may also optionally check permissions (e.g. for * transitions between security domains). The hook must set @bprm->secureexec * to 1 if AT_SECURE should be set to request libc enable secure mode. @bprm * contains the linux_binprm structure. * * If execveat(2) is called with the AT_EXECVE_CHECK flag, bprm->is_check is * set. 
The result must be the same as without this flag even if the execution * will never really happen and @bprm will always be dropped. * * This hook must not change current->cred, only @bprm->cred. * * Return: Returns 0 if the hook is successful and permission is granted. */ int security_bprm_creds_for_exec(struct linux_binprm *bprm) { return call_int_hook(bprm_creds_for_exec, bprm); } /** * security_bprm_creds_from_file() - Update linux_binprm creds based on file * @bprm: binary program information * @file: associated file * * If @file is setpcap, suid, sgid or otherwise marked to change privilege upon * exec, update @bprm->cred to reflect that change. This is called after * finding the binary that will be executed without an interpreter. This * ensures that the credentials will not be derived from a script that the * binary will need to reopen, which when reopend may end up being a completely * different file. This hook may also optionally check permissions (e.g. for * transitions between security domains). The hook must set @bprm->secureexec * to 1 if AT_SECURE should be set to request libc enable secure mode. The * hook must add to @bprm->per_clear any personality flags that should be * cleared from current->personality. @bprm contains the linux_binprm * structure. * * Return: Returns 0 if the hook is successful and permission is granted. */ int security_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file) { return call_int_hook(bprm_creds_from_file, bprm, file); } /** * security_bprm_check() - Mediate binary handler search * @bprm: binary program information * * This hook mediates the point when a search for a binary handler will begin. * It allows a check against the @bprm->cred->security value which was set in * the preceding creds_for_exec call. The argv list and envp list are reliably * available in @bprm. This hook may be called multiple times during a single * execve. @bprm contains the linux_binprm structure. * * Return: Returns 0 if the hook is successful and permission is granted. */ int security_bprm_check(struct linux_binprm *bprm) { return call_int_hook(bprm_check_security, bprm); } /** * security_bprm_committing_creds() - Install creds for a process during exec() * @bprm: binary program information * * Prepare to install the new security attributes of a process being * transformed by an execve operation, based on the old credentials pointed to * by @current->cred and the information set in @bprm->cred by the * bprm_creds_for_exec hook. @bprm points to the linux_binprm structure. This * hook is a good place to perform state changes on the process such as closing * open file descriptors to which access will no longer be granted when the * attributes are changed. This is called immediately before commit_creds(). */ void security_bprm_committing_creds(const struct linux_binprm *bprm) { call_void_hook(bprm_committing_creds, bprm); } /** * security_bprm_committed_creds() - Tidy up after cred install during exec() * @bprm: binary program information * * Tidy up after the installation of the new security attributes of a process * being transformed by an execve operation. The new credentials have, by this * point, been set to @current->cred. @bprm points to the linux_binprm * structure. This hook is a good place to perform state changes on the * process such as clearing out non-inheritable signal state. This is called * immediately after commit_creds(). 
*/ void security_bprm_committed_creds(const struct linux_binprm *bprm) { call_void_hook(bprm_committed_creds, bprm); } /** * security_fs_context_submount() - Initialise fc->security * @fc: new filesystem context * @reference: dentry reference for submount/remount * * Fill out the ->security field for a new fs_context. * * Return: Returns 0 on success or negative error code on failure. */ int security_fs_context_submount(struct fs_context *fc, struct super_block *reference) { return call_int_hook(fs_context_submount, fc, reference); } /** * security_fs_context_dup() - Duplicate a fs_context LSM blob * @fc: destination filesystem context * @src_fc: source filesystem context * * Allocate and attach a security structure to sc->security. This pointer is * initialised to NULL by the caller. @fc indicates the new filesystem context. * @src_fc indicates the original filesystem context. * * Return: Returns 0 on success or a negative error code on failure. */ int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) { return call_int_hook(fs_context_dup, fc, src_fc); } /** * security_fs_context_parse_param() - Configure a filesystem context * @fc: filesystem context * @param: filesystem parameter * * Userspace provided a parameter to configure a superblock. The LSM can * consume the parameter or return it to the caller for use elsewhere. * * Return: If the parameter is used by the LSM it should return 0, if it is * returned to the caller -ENOPARAM is returned, otherwise a negative * error code is returned. */ int security_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct lsm_static_call *scall; int trc; int rc = -ENOPARAM; lsm_for_each_hook(scall, fs_context_parse_param) { trc = scall->hl->hook.fs_context_parse_param(fc, param); if (trc == 0) rc = 0; else if (trc != -ENOPARAM) return trc; } return rc; } /** * security_sb_alloc() - Allocate a super_block LSM blob * @sb: filesystem superblock * * Allocate and attach a security structure to the sb->s_security field. The * s_security field is initialized to NULL when the structure is allocated. * @sb contains the super_block structure to be modified. * * Return: Returns 0 if operation was successful. */ int security_sb_alloc(struct super_block *sb) { int rc = lsm_superblock_alloc(sb); if (unlikely(rc)) return rc; rc = call_int_hook(sb_alloc_security, sb); if (unlikely(rc)) security_sb_free(sb); return rc; } /** * security_sb_delete() - Release super_block LSM associated objects * @sb: filesystem superblock * * Release objects tied to a superblock (e.g. inodes). @sb contains the * super_block structure being released. */ void security_sb_delete(struct super_block *sb) { call_void_hook(sb_delete, sb); } /** * security_sb_free() - Free a super_block LSM blob * @sb: filesystem superblock * * Deallocate and clear the sb->s_security field. @sb contains the super_block * structure to be modified. */ void security_sb_free(struct super_block *sb) { call_void_hook(sb_free_security, sb); kfree(sb->s_security); sb->s_security = NULL; } /** * security_free_mnt_opts() - Free memory associated with mount options * @mnt_opts: LSM processed mount options * * Free memory associated with @mnt_ops. 
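 *
 * Illustrative usage sketch (not taken from this file): a caller that scanned
 * mount options with security_sb_eat_lsm_opts() would typically release the
 * resulting blob with this helper once it is no longer needed, e.g.:
 *
 *   static int example_scan_and_drop(char *options)
 *   {
 *           void *mnt_opts = NULL;
 *           int rc;
 *
 *           rc = security_sb_eat_lsm_opts(options, &mnt_opts);
 *           security_free_mnt_opts(&mnt_opts);
 *           return rc;
 *   }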
*/ void security_free_mnt_opts(void **mnt_opts) { if (!*mnt_opts) return; call_void_hook(sb_free_mnt_opts, *mnt_opts); *mnt_opts = NULL; } EXPORT_SYMBOL(security_free_mnt_opts); /** * security_sb_eat_lsm_opts() - Consume LSM mount options * @options: mount options * @mnt_opts: LSM processed mount options * * Eat (scan @options) and save them in @mnt_opts. * * Return: Returns 0 on success, negative values on failure. */ int security_sb_eat_lsm_opts(char *options, void **mnt_opts) { return call_int_hook(sb_eat_lsm_opts, options, mnt_opts); } EXPORT_SYMBOL(security_sb_eat_lsm_opts); /** * security_sb_mnt_opts_compat() - Check if new mount options are allowed * @sb: filesystem superblock * @mnt_opts: new mount options * * Determine if the new mount options in @mnt_opts are allowed given the * existing mounted filesystem at @sb. @sb superblock being compared. * * Return: Returns 0 if options are compatible. */ int security_sb_mnt_opts_compat(struct super_block *sb, void *mnt_opts) { return call_int_hook(sb_mnt_opts_compat, sb, mnt_opts); } EXPORT_SYMBOL(security_sb_mnt_opts_compat); /** * security_sb_remount() - Verify no incompatible mount changes during remount * @sb: filesystem superblock * @mnt_opts: (re)mount options * * Extracts security system specific mount options and verifies no changes are * being made to those options. * * Return: Returns 0 if permission is granted. */ int security_sb_remount(struct super_block *sb, void *mnt_opts) { return call_int_hook(sb_remount, sb, mnt_opts); } EXPORT_SYMBOL(security_sb_remount); /** * security_sb_kern_mount() - Check if a kernel mount is allowed * @sb: filesystem superblock * * Mount this @sb if allowed by permissions. * * Return: Returns 0 if permission is granted. */ int security_sb_kern_mount(const struct super_block *sb) { return call_int_hook(sb_kern_mount, sb); } /** * security_sb_show_options() - Output the mount options for a superblock * @m: output file * @sb: filesystem superblock * * Show (print on @m) mount options for this @sb. * * Return: Returns 0 on success, negative values on failure. */ int security_sb_show_options(struct seq_file *m, struct super_block *sb) { return call_int_hook(sb_show_options, m, sb); } /** * security_sb_statfs() - Check if accessing fs stats is allowed * @dentry: superblock handle * * Check permission before obtaining filesystem statistics for the @mnt * mountpoint. @dentry is a handle on the superblock for the filesystem. * * Return: Returns 0 if permission is granted. */ int security_sb_statfs(struct dentry *dentry) { return call_int_hook(sb_statfs, dentry); } /** * security_sb_mount() - Check permission for mounting a filesystem * @dev_name: filesystem backing device * @path: mount point * @type: filesystem type * @flags: mount flags * @data: filesystem specific data * * Check permission before an object specified by @dev_name is mounted on the * mount point named by @nd. For an ordinary mount, @dev_name identifies a * device if the file system type requires a device. For a remount * (@flags & MS_REMOUNT), @dev_name is irrelevant. For a loopback/bind mount * (@flags & MS_BIND), @dev_name identifies the pathname of the object being * mounted. * * Return: Returns 0 if permission is granted. 
*/ int security_sb_mount(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data) { return call_int_hook(sb_mount, dev_name, path, type, flags, data); } /** * security_sb_umount() - Check permission for unmounting a filesystem * @mnt: mounted filesystem * @flags: unmount flags * * Check permission before the @mnt file system is unmounted. * * Return: Returns 0 if permission is granted. */ int security_sb_umount(struct vfsmount *mnt, int flags) { return call_int_hook(sb_umount, mnt, flags); } /** * security_sb_pivotroot() - Check permissions for pivoting the rootfs * @old_path: new location for current rootfs * @new_path: location of the new rootfs * * Check permission before pivoting the root filesystem. * * Return: Returns 0 if permission is granted. */ int security_sb_pivotroot(const struct path *old_path, const struct path *new_path) { return call_int_hook(sb_pivotroot, old_path, new_path); } /** * security_sb_set_mnt_opts() - Set the mount options for a filesystem * @sb: filesystem superblock * @mnt_opts: binary mount options * @kern_flags: kernel flags (in) * @set_kern_flags: kernel flags (out) * * Set the security relevant mount options used for a superblock. * * Return: Returns 0 on success, error on failure. */ int security_sb_set_mnt_opts(struct super_block *sb, void *mnt_opts, unsigned long kern_flags, unsigned long *set_kern_flags) { struct lsm_static_call *scall; int rc = mnt_opts ? -EOPNOTSUPP : LSM_RET_DEFAULT(sb_set_mnt_opts); lsm_for_each_hook(scall, sb_set_mnt_opts) { rc = scall->hl->hook.sb_set_mnt_opts(sb, mnt_opts, kern_flags, set_kern_flags); if (rc != LSM_RET_DEFAULT(sb_set_mnt_opts)) break; } return rc; } EXPORT_SYMBOL(security_sb_set_mnt_opts); /** * security_sb_clone_mnt_opts() - Duplicate superblock mount options * @oldsb: source superblock * @newsb: destination superblock * @kern_flags: kernel flags (in) * @set_kern_flags: kernel flags (out) * * Copy all security options from a given superblock to another. * * Return: Returns 0 on success, error on failure. */ int security_sb_clone_mnt_opts(const struct super_block *oldsb, struct super_block *newsb, unsigned long kern_flags, unsigned long *set_kern_flags) { return call_int_hook(sb_clone_mnt_opts, oldsb, newsb, kern_flags, set_kern_flags); } EXPORT_SYMBOL(security_sb_clone_mnt_opts); /** * security_move_mount() - Check permissions for moving a mount * @from_path: source mount point * @to_path: destination mount point * * Check permission before a mount is moved. * * Return: Returns 0 if permission is granted. */ int security_move_mount(const struct path *from_path, const struct path *to_path) { return call_int_hook(move_mount, from_path, to_path); } /** * security_path_notify() - Check if setting a watch is allowed * @path: file path * @mask: event mask * @obj_type: file path type * * Check permissions before setting a watch on events as defined by @mask, on * an object at @path, whose type is defined by @obj_type. * * Return: Returns 0 if permission is granted. */ int security_path_notify(const struct path *path, u64 mask, unsigned int obj_type) { return call_int_hook(path_notify, path, mask, obj_type); } /** * security_inode_alloc() - Allocate an inode LSM blob * @inode: the inode * @gfp: allocation flags * * Allocate and attach a security structure to @inode->i_security. The * i_security field is initialized to NULL when the inode structure is * allocated. * * Return: Return 0 if operation was successful. 
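 *
 * Illustrative sketch (hypothetical names, not part of this file, reusing an
 * example_blob_sizes declaration like the one sketched near
 * lsm_task_alloc()): a module that reserved inode blob space via lbs_inode
 * typically locates and initializes its slice from its inode_alloc_security
 * hook, for instance:
 *
 *   struct example_inode_ctx {
 *           bool initialized;
 *   };
 *
 *   static inline struct example_inode_ctx *example_inode(
 *                                   const struct inode *inode)
 *   {
 *           return inode->i_security + example_blob_sizes.lbs_inode;
 *   }
 *
 *   static int example_inode_alloc_security(struct inode *inode)
 *   {
 *           example_inode(inode)->initialized = false;
 *           return 0;
 *   }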
*/ int security_inode_alloc(struct inode *inode, gfp_t gfp) { int rc = lsm_inode_alloc(inode, gfp); if (unlikely(rc)) return rc; rc = call_int_hook(inode_alloc_security, inode); if (unlikely(rc)) security_inode_free(inode); return rc; } static void inode_free_by_rcu(struct rcu_head *head) { /* The rcu head is at the start of the inode blob */ call_void_hook(inode_free_security_rcu, head); kmem_cache_free(lsm_inode_cache, head); } /** * security_inode_free() - Free an inode's LSM blob * @inode: the inode * * Release any LSM resources associated with @inode, although due to the * inode's RCU protections it is possible that the resources will not be * fully released until after the current RCU grace period has elapsed. * * It is important for LSMs to note that despite being present in a call to * security_inode_free(), @inode may still be referenced in a VFS path walk * and calls to security_inode_permission() may be made during, or after, * a call to security_inode_free(). For this reason the inode->i_security * field is released via a call_rcu() callback and any LSMs which need to * retain inode state for use in security_inode_permission() should only * release that state in the inode_free_security_rcu() LSM hook callback. */ void security_inode_free(struct inode *inode) { call_void_hook(inode_free_security, inode); if (!inode->i_security) return; call_rcu((struct rcu_head *)inode->i_security, inode_free_by_rcu); } /** * security_dentry_init_security() - Perform dentry initialization * @dentry: the dentry to initialize * @mode: mode used to determine resource type * @name: name of the last path component * @xattr_name: name of the security/LSM xattr * @lsmctx: pointer to the resulting LSM context * * Compute a context for a dentry as the inode is not yet available since NFSv4 * has no label backed by an EA anyway. It is important to note that * @xattr_name does not need to be free'd by the caller, it is a static string. * * Return: Returns 0 on success, negative values on failure. */ int security_dentry_init_security(struct dentry *dentry, int mode, const struct qstr *name, const char **xattr_name, struct lsm_context *lsmctx) { return call_int_hook(dentry_init_security, dentry, mode, name, xattr_name, lsmctx); } EXPORT_SYMBOL(security_dentry_init_security); /** * security_dentry_create_files_as() - Perform dentry initialization * @dentry: the dentry to initialize * @mode: mode used to determine resource type * @name: name of the last path component * @old: creds to use for LSM context calculations * @new: creds to modify * * Compute a context for a dentry as the inode is not yet available and set * that context in passed in creds so that new files are created using that * context. Context is calculated using the passed in creds and not the creds * of the caller. * * Return: Returns 0 on success, error on failure. */ int security_dentry_create_files_as(struct dentry *dentry, int mode, const struct qstr *name, const struct cred *old, struct cred *new) { return call_int_hook(dentry_create_files_as, dentry, mode, name, old, new); } EXPORT_SYMBOL(security_dentry_create_files_as); /** * security_inode_init_security() - Initialize an inode's LSM context * @inode: the inode * @dir: parent directory * @qstr: last component of the pathname * @initxattrs: callback function to write xattrs * @fs_data: filesystem specific data * * Obtain the security attribute name suffix and value to set on a newly * created inode and set up the incore security field for the new inode. 
This * hook is called by the fs code as part of the inode creation transaction and * provides for atomic labeling of the inode, unlike the post_create/mkdir/... * hooks called by the VFS. * * The hook function is expected to populate the xattrs array, by calling * lsm_get_xattr_slot() to retrieve the slots reserved by the security module * with the lbs_xattr_count field of the lsm_blob_sizes structure. For each * slot, the hook function should set ->name to the attribute name suffix * (e.g. selinux), to allocate ->value (will be freed by the caller) and set it * to the attribute value, to set ->value_len to the length of the value. If * the security module does not use security attributes or does not wish to put * a security attribute on this particular inode, then it should return * -EOPNOTSUPP to skip this processing. * * Return: Returns 0 if the LSM successfully initialized all of the inode * security attributes that are required, negative values otherwise. */ int security_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, const initxattrs initxattrs, void *fs_data) { struct lsm_static_call *scall; struct xattr *new_xattrs = NULL; int ret = -EOPNOTSUPP, xattr_count = 0; if (unlikely(IS_PRIVATE(inode))) return 0; if (!blob_sizes.lbs_xattr_count) return 0; if (initxattrs) { /* Allocate +1 as terminator. */ new_xattrs = kcalloc(blob_sizes.lbs_xattr_count + 1, sizeof(*new_xattrs), GFP_NOFS); if (!new_xattrs) return -ENOMEM; } lsm_for_each_hook(scall, inode_init_security) { ret = scall->hl->hook.inode_init_security(inode, dir, qstr, new_xattrs, &xattr_count); if (ret && ret != -EOPNOTSUPP) goto out; /* * As documented in lsm_hooks.h, -EOPNOTSUPP in this context * means that the LSM is not willing to provide an xattr, not * that it wants to signal an error. Thus, continue to invoke * the remaining LSMs. */ } /* If initxattrs() is NULL, xattr_count is zero, skip the call. */ if (!xattr_count) goto out; ret = initxattrs(inode, new_xattrs, fs_data); out: for (; xattr_count > 0; xattr_count--) kfree(new_xattrs[xattr_count - 1].value); kfree(new_xattrs); return (ret == -EOPNOTSUPP) ? 0 : ret; } EXPORT_SYMBOL(security_inode_init_security); /** * security_inode_init_security_anon() - Initialize an anonymous inode * @inode: the inode * @name: the anonymous inode class * @context_inode: an optional related inode * * Set up the incore security field for the new anonymous inode and return * whether the inode creation is permitted by the security module or not. * * Return: Returns 0 on success, -EACCES if the security module denies the * creation of this inode, or another -errno upon other errors. */ int security_inode_init_security_anon(struct inode *inode, const struct qstr *name, const struct inode *context_inode) { return call_int_hook(inode_init_security_anon, inode, name, context_inode); } #ifdef CONFIG_SECURITY_PATH /** * security_path_mknod() - Check if creating a special file is allowed * @dir: parent directory * @dentry: new file * @mode: new file mode * @dev: device number * * Check permissions when creating a file. Note that this hook is called even * if mknod operation is being done for a regular file. * * Return: Returns 0 if permission is granted. 
*/ int security_path_mknod(const struct path *dir, struct dentry *dentry, umode_t mode, unsigned int dev) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_mknod, dir, dentry, mode, dev); } EXPORT_SYMBOL(security_path_mknod); /** * security_path_post_mknod() - Update inode security after reg file creation * @idmap: idmap of the mount * @dentry: new file * * Update inode security field after a regular file has been created. */ void security_path_post_mknod(struct mnt_idmap *idmap, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return; call_void_hook(path_post_mknod, idmap, dentry); } /** * security_path_mkdir() - Check if creating a new directory is allowed * @dir: parent directory * @dentry: new directory * @mode: new directory mode * * Check permissions to create a new directory in the existing directory. * * Return: Returns 0 if permission is granted. */ int security_path_mkdir(const struct path *dir, struct dentry *dentry, umode_t mode) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_mkdir, dir, dentry, mode); } EXPORT_SYMBOL(security_path_mkdir); /** * security_path_rmdir() - Check if removing a directory is allowed * @dir: parent directory * @dentry: directory to remove * * Check the permission to remove a directory. * * Return: Returns 0 if permission is granted. */ int security_path_rmdir(const struct path *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_rmdir, dir, dentry); } /** * security_path_unlink() - Check if removing a hard link is allowed * @dir: parent directory * @dentry: file * * Check the permission to remove a hard link to a file. * * Return: Returns 0 if permission is granted. */ int security_path_unlink(const struct path *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_unlink, dir, dentry); } EXPORT_SYMBOL(security_path_unlink); /** * security_path_symlink() - Check if creating a symbolic link is allowed * @dir: parent directory * @dentry: symbolic link * @old_name: file pathname * * Check the permission to create a symbolic link to a file. * * Return: Returns 0 if permission is granted. */ int security_path_symlink(const struct path *dir, struct dentry *dentry, const char *old_name) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_symlink, dir, dentry, old_name); } /** * security_path_link - Check if creating a hard link is allowed * @old_dentry: existing file * @new_dir: new parent directory * @new_dentry: new link * * Check permission before creating a new hard link to a file. * * Return: Returns 0 if permission is granted. */ int security_path_link(struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)))) return 0; return call_int_hook(path_link, old_dentry, new_dir, new_dentry); } /** * security_path_rename() - Check if renaming a file is allowed * @old_dir: parent directory of the old file * @old_dentry: the old file * @new_dir: parent directory of the new file * @new_dentry: the new file * @flags: flags * * Check for permission to rename a file or directory. * * Return: Returns 0 if permission is granted. 
*/ int security_path_rename(const struct path *old_dir, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry, unsigned int flags) { if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)) || (d_is_positive(new_dentry) && IS_PRIVATE(d_backing_inode(new_dentry))))) return 0; return call_int_hook(path_rename, old_dir, old_dentry, new_dir, new_dentry, flags); } EXPORT_SYMBOL(security_path_rename); /** * security_path_truncate() - Check if truncating a file is allowed * @path: file * * Check permission before truncating the file indicated by path. Note that * truncation permissions may also be checked based on already opened files, * using the security_file_truncate() hook. * * Return: Returns 0 if permission is granted. */ int security_path_truncate(const struct path *path) { if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) return 0; return call_int_hook(path_truncate, path); } /** * security_path_chmod() - Check if changing the file's mode is allowed * @path: file * @mode: new mode * * Check for permission to change a mode of the file @path. The new mode is * specified in @mode which is a bitmask of constants from * <include/uapi/linux/stat.h>. * * Return: Returns 0 if permission is granted. */ int security_path_chmod(const struct path *path, umode_t mode) { if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) return 0; return call_int_hook(path_chmod, path, mode); } /** * security_path_chown() - Check if changing the file's owner/group is allowed * @path: file * @uid: file owner * @gid: file group * * Check for permission to change owner/group of a file or directory. * * Return: Returns 0 if permission is granted. */ int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid) { if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) return 0; return call_int_hook(path_chown, path, uid, gid); } /** * security_path_chroot() - Check if changing the root directory is allowed * @path: directory * * Check for permission to change root directory. * * Return: Returns 0 if permission is granted. */ int security_path_chroot(const struct path *path) { return call_int_hook(path_chroot, path); } #endif /* CONFIG_SECURITY_PATH */ /** * security_inode_create() - Check if creating a file is allowed * @dir: the parent directory * @dentry: the file being created * @mode: requested file mode * * Check permission to create a regular file. * * Return: Returns 0 if permission is granted. */ int security_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode) { if (unlikely(IS_PRIVATE(dir))) return 0; return call_int_hook(inode_create, dir, dentry, mode); } EXPORT_SYMBOL_GPL(security_inode_create); /** * security_inode_post_create_tmpfile() - Update inode security of new tmpfile * @idmap: idmap of the mount * @inode: inode of the new tmpfile * * Update inode security data after a tmpfile has been created. */ void security_inode_post_create_tmpfile(struct mnt_idmap *idmap, struct inode *inode) { if (unlikely(IS_PRIVATE(inode))) return; call_void_hook(inode_post_create_tmpfile, idmap, inode); } /** * security_inode_link() - Check if creating a hard link is allowed * @old_dentry: existing file * @dir: new parent directory * @new_dentry: new link * * Check permission before creating a new hard link to a file. * * Return: Returns 0 if permission is granted. 
*/ int security_inode_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)))) return 0; return call_int_hook(inode_link, old_dentry, dir, new_dentry); } /** * security_inode_unlink() - Check if removing a hard link is allowed * @dir: parent directory * @dentry: file * * Check the permission to remove a hard link to a file. * * Return: Returns 0 if permission is granted. */ int security_inode_unlink(struct inode *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_unlink, dir, dentry); } /** * security_inode_symlink() - Check if creating a symbolic link is allowed * @dir: parent directory * @dentry: symbolic link * @old_name: existing filename * * Check the permission to create a symbolic link to a file. * * Return: Returns 0 if permission is granted. */ int security_inode_symlink(struct inode *dir, struct dentry *dentry, const char *old_name) { if (unlikely(IS_PRIVATE(dir))) return 0; return call_int_hook(inode_symlink, dir, dentry, old_name); } /** * security_inode_mkdir() - Check if creating a new directory is allowed * @dir: parent directory * @dentry: new directory * @mode: new directory mode * * Check permissions to create a new directory in the existing directory * associated with inode structure @dir. * * Return: Returns 0 if permission is granted. */ int security_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { if (unlikely(IS_PRIVATE(dir))) return 0; return call_int_hook(inode_mkdir, dir, dentry, mode); } EXPORT_SYMBOL_GPL(security_inode_mkdir); /** * security_inode_rmdir() - Check if removing a directory is allowed * @dir: parent directory * @dentry: directory to be removed * * Check the permission to remove a directory. * * Return: Returns 0 if permission is granted. */ int security_inode_rmdir(struct inode *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_rmdir, dir, dentry); } /** * security_inode_mknod() - Check if creating a special file is allowed * @dir: parent directory * @dentry: new file * @mode: new file mode * @dev: device number * * Check permissions when creating a special file (or a socket or a fifo file * created via the mknod system call). Note that if mknod operation is being * done for a regular file, then the create hook will be called and not this * hook. * * Return: Returns 0 if permission is granted. */ int security_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { if (unlikely(IS_PRIVATE(dir))) return 0; return call_int_hook(inode_mknod, dir, dentry, mode, dev); } /** * security_inode_rename() - Check if renaming a file is allowed * @old_dir: parent directory of the old file * @old_dentry: the old file * @new_dir: parent directory of the new file * @new_dentry: the new file * @flags: flags * * Check for permission to rename a file or directory. * * Return: Returns 0 if permission is granted. 
*/ int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)) || (d_is_positive(new_dentry) && IS_PRIVATE(d_backing_inode(new_dentry))))) return 0; if (flags & RENAME_EXCHANGE) { int err = call_int_hook(inode_rename, new_dir, new_dentry, old_dir, old_dentry); if (err) return err; } return call_int_hook(inode_rename, old_dir, old_dentry, new_dir, new_dentry); } /** * security_inode_readlink() - Check if reading a symbolic link is allowed * @dentry: link * * Check the permission to read the symbolic link. * * Return: Returns 0 if permission is granted. */ int security_inode_readlink(struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_readlink, dentry); } /** * security_inode_follow_link() - Check if following a symbolic link is allowed * @dentry: link dentry * @inode: link inode * @rcu: true if in RCU-walk mode * * Check permission to follow a symbolic link when looking up a pathname. If * @rcu is true, @inode is not stable. * * Return: Returns 0 if permission is granted. */ int security_inode_follow_link(struct dentry *dentry, struct inode *inode, bool rcu) { if (unlikely(IS_PRIVATE(inode))) return 0; return call_int_hook(inode_follow_link, dentry, inode, rcu); } /** * security_inode_permission() - Check if accessing an inode is allowed * @inode: inode * @mask: access mask * * Check permission before accessing an inode. This hook is called by the * existing Linux permission function, so a security module can use it to * provide additional checking for existing Linux permission checks. Notice * that this hook is called when a file is opened (as well as many other * operations), whereas the file_security_ops permission hook is called when * the actual read/write operations are performed. * * Return: Returns 0 if permission is granted. */ int security_inode_permission(struct inode *inode, int mask) { if (unlikely(IS_PRIVATE(inode))) return 0; return call_int_hook(inode_permission, inode, mask); } /** * security_inode_setattr() - Check if setting file attributes is allowed * @idmap: idmap of the mount * @dentry: file * @attr: new attributes * * Check permission before setting file attributes. Note that the kernel call * to notify_change is performed from several locations, whenever file * attributes change (such as when a file is truncated, chown/chmod operations, * transferring disk quotas, etc). * * Return: Returns 0 if permission is granted. */ int security_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_setattr, idmap, dentry, attr); } EXPORT_SYMBOL_GPL(security_inode_setattr); /** * security_inode_post_setattr() - Update the inode after a setattr operation * @idmap: idmap of the mount * @dentry: file * @ia_valid: file attributes set * * Update inode security field after successful setting file attributes. */ void security_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int ia_valid) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return; call_void_hook(inode_post_setattr, idmap, dentry, ia_valid); } /** * security_inode_getattr() - Check if getting file attributes is allowed * @path: file * * Check permission before obtaining file attributes. * * Return: Returns 0 if permission is granted. 
 */
int security_inode_getattr(const struct path *path)
{
	if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry))))
		return 0;
	return call_int_hook(inode_getattr, path);
}

/**
 * security_inode_setxattr() - Check if setting file xattrs is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @name: xattr name
 * @value: xattr value
 * @size: size of xattr value
 * @flags: flags
 *
 * This hook performs the desired permission checks before setting the extended
 * attributes (xattrs) on @dentry. It is important to note that we have some
 * additional logic before the main LSM implementation calls to detect if we
 * need to perform an additional capability check at the LSM layer.
 *
 * Normally we enforce a capability check prior to executing the various LSM
 * hook implementations, but if an LSM wants to avoid this capability check,
 * it can register an 'inode_xattr_skipcap' hook and return a value of 1 for
 * xattrs that it wants to avoid the capability check, leaving the LSM fully
 * responsible for enforcing the access control for the specific xattr. If all
 * of the enabled LSMs refrain from registering an 'inode_xattr_skipcap' hook,
 * or return 0 (the default return value), the capability check is still
 * performed.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_setxattr(struct mnt_idmap *idmap,
			    struct dentry *dentry, const char *name,
			    const void *value, size_t size, int flags)
{
	int rc;

	if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
		return 0;

	/* enforce the capability checks at the lsm layer, if needed */
	if (!call_int_hook(inode_xattr_skipcap, name)) {
		rc = cap_inode_setxattr(dentry, name, value, size, flags);
		if (rc)
			return rc;
	}

	return call_int_hook(inode_setxattr, idmap, dentry, name, value, size,
			     flags);
}

/**
 * security_inode_set_acl() - Check if setting posix acls is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @acl_name: acl name
 * @kacl: acl struct
 *
 * Check permission before setting posix acls, the posix acls in @kacl are
 * identified by @acl_name.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
			   const char *acl_name, struct posix_acl *kacl)
{
	if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
		return 0;
	return call_int_hook(inode_set_acl, idmap, dentry, acl_name, kacl);
}

/**
 * security_inode_post_set_acl() - Update inode security from posix acls set
 * @dentry: file
 * @acl_name: acl name
 * @kacl: acl struct
 *
 * Update inode security data after successfully setting posix acls on @dentry.
 * The posix acls in @kacl are identified by @acl_name.
 */
void security_inode_post_set_acl(struct dentry *dentry, const char *acl_name,
				 struct posix_acl *kacl)
{
	if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
		return;
	call_void_hook(inode_post_set_acl, dentry, acl_name, kacl);
}

/**
 * security_inode_get_acl() - Check if reading posix acls is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @acl_name: acl name
 *
 * Check permission before getting posix acls, the posix acls are identified by
 * @acl_name.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_get_acl(struct mnt_idmap *idmap,
			   struct dentry *dentry, const char *acl_name)
{
	if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
		return 0;
	return call_int_hook(inode_get_acl, idmap, dentry, acl_name);
}

/**
 * security_inode_remove_acl() - Check if removing a posix acl is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @acl_name: acl name
 *
 * Check permission before removing posix acls, the posix acls are identified
 * by @acl_name.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_remove_acl(struct mnt_idmap *idmap,
			      struct dentry *dentry, const char *acl_name)
{
	if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
		return 0;
	return call_int_hook(inode_remove_acl, idmap, dentry, acl_name);
}

/**
 * security_inode_post_remove_acl() - Update inode security after rm posix acls
 * @idmap: idmap of the mount
 * @dentry: file
 * @acl_name: acl name
 *
 * Update inode security data after successfully removing posix acls on
 * @dentry in @idmap. The posix acls are identified by @acl_name.
 */
void security_inode_post_remove_acl(struct mnt_idmap *idmap,
				    struct dentry *dentry,
				    const char *acl_name)
{
	if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
		return;
	call_void_hook(inode_post_remove_acl, idmap, dentry, acl_name);
}

/**
 * security_inode_post_setxattr() - Update the inode after a setxattr operation
 * @dentry: file
 * @name: xattr name
 * @value: xattr value
 * @size: xattr value size
 * @flags: flags
 *
 * Update inode security field after successful setxattr operation.
 */
void security_inode_post_setxattr(struct dentry *dentry, const char *name,
				  const void *value, size_t size, int flags)
{
	if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
		return;
	call_void_hook(inode_post_setxattr, dentry, name, value, size, flags);
}

/**
 * security_inode_getxattr() - Check if xattr access is allowed
 * @dentry: file
 * @name: xattr name
 *
 * Check permission before obtaining the extended attributes identified by
 * @name for @dentry.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_getxattr(struct dentry *dentry, const char *name)
{
	if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
		return 0;
	return call_int_hook(inode_getxattr, dentry, name);
}

/**
 * security_inode_listxattr() - Check if listing xattrs is allowed
 * @dentry: file
 *
 * Check permission before obtaining the list of extended attribute names for
 * @dentry.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_listxattr(struct dentry *dentry)
{
	if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
		return 0;
	return call_int_hook(inode_listxattr, dentry);
}

/**
 * security_inode_removexattr() - Check if removing an xattr is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @name: xattr name
 *
 * This hook performs the desired permission checks before removing the
 * extended attribute (xattr) identified by @name from @dentry. It is
 * important to note that we have some additional logic before the main LSM
 * implementation calls to detect if we need to perform an additional
 * capability check at the LSM layer.
 *
 * Normally we enforce a capability check prior to executing the various LSM
 * hook implementations, but if an LSM wants to avoid this capability check,
 * it can register an 'inode_xattr_skipcap' hook and return a value of 1 for
 * xattrs that it wants to avoid the capability check, leaving the LSM fully
 * responsible for enforcing the access control for the specific xattr.
If all * of the enabled LSMs refrain from registering a 'inode_xattr_skipcap' hook, * or return a 0 (the default return value), the capability check is still * performed. If no 'inode_xattr_skipcap' hooks are registered the capability * check is performed. * * Return: Returns 0 if permission is granted. */ int security_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { int rc; if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; /* enforce the capability checks at the lsm layer, if needed */ if (!call_int_hook(inode_xattr_skipcap, name)) { rc = cap_inode_removexattr(idmap, dentry, name); if (rc) return rc; } return call_int_hook(inode_removexattr, idmap, dentry, name); } /** * security_inode_post_removexattr() - Update the inode after a removexattr op * @dentry: file * @name: xattr name * * Update the inode after a successful removexattr operation. */ void security_inode_post_removexattr(struct dentry *dentry, const char *name) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return; call_void_hook(inode_post_removexattr, dentry, name); } /** * security_inode_file_setattr() - check if setting fsxattr is allowed * @dentry: file to set filesystem extended attributes on * @fa: extended attributes to set on the inode * * Called when file_setattr() syscall or FS_IOC_FSSETXATTR ioctl() is called on * inode * * Return: Returns 0 if permission is granted. */ int security_inode_file_setattr(struct dentry *dentry, struct file_kattr *fa) { return call_int_hook(inode_file_setattr, dentry, fa); } /** * security_inode_file_getattr() - check if retrieving fsxattr is allowed * @dentry: file to retrieve filesystem extended attributes from * @fa: extended attributes to get * * Called when file_getattr() syscall or FS_IOC_FSGETXATTR ioctl() is called on * inode * * Return: Returns 0 if permission is granted. */ int security_inode_file_getattr(struct dentry *dentry, struct file_kattr *fa) { return call_int_hook(inode_file_getattr, dentry, fa); } /** * security_inode_need_killpriv() - Check if security_inode_killpriv() required * @dentry: associated dentry * * Called when an inode has been changed to determine if * security_inode_killpriv() should be called. * * Return: Return <0 on error to abort the inode change operation, return 0 if * security_inode_killpriv() does not need to be called, return >0 if * security_inode_killpriv() does need to be called. */ int security_inode_need_killpriv(struct dentry *dentry) { return call_int_hook(inode_need_killpriv, dentry); } /** * security_inode_killpriv() - The setuid bit is removed, update LSM state * @idmap: idmap of the mount * @dentry: associated dentry * * The @dentry's setuid bit is being removed. Remove similar security labels. * Called with the dentry->d_inode->i_mutex held. * * Return: Return 0 on success. If error is returned, then the operation * causing setuid bit removal is failed. */ int security_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry) { return call_int_hook(inode_killpriv, idmap, dentry); } /** * security_inode_getsecurity() - Get the xattr security label of an inode * @idmap: idmap of the mount * @inode: inode * @name: xattr name * @buffer: security label buffer * @alloc: allocation flag * * Retrieve a copy of the extended attribute representation of the security * label associated with @name for @inode via @buffer. Note that @name is the * remainder of the attribute name after the security prefix has been removed. 
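 * As an illustrative example (not a requirement of this interface), a label
 * kept in the "security.selinux" xattr would be requested here with @name
 * set to "selinux".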
* @alloc is used to specify if the call should return a value via the buffer * or just the value length. * * Return: Returns size of buffer on success. */ int security_inode_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc) { if (unlikely(IS_PRIVATE(inode))) return LSM_RET_DEFAULT(inode_getsecurity); return call_int_hook(inode_getsecurity, idmap, inode, name, buffer, alloc); } /** * security_inode_setsecurity() - Set the xattr security label of an inode * @inode: inode * @name: xattr name * @value: security label * @size: length of security label * @flags: flags * * Set the security label associated with @name for @inode from the extended * attribute value @value. @size indicates the size of the @value in bytes. * @flags may be XATTR_CREATE, XATTR_REPLACE, or 0. Note that @name is the * remainder of the attribute name after the security. prefix has been removed. * * Return: Returns 0 on success. */ int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags) { if (unlikely(IS_PRIVATE(inode))) return LSM_RET_DEFAULT(inode_setsecurity); return call_int_hook(inode_setsecurity, inode, name, value, size, flags); } /** * security_inode_listsecurity() - List the xattr security label names * @inode: inode * @buffer: buffer * @buffer_size: size of buffer * * Copy the extended attribute names for the security labels associated with * @inode into @buffer. The maximum size of @buffer is specified by * @buffer_size. @buffer may be NULL to request the size of the buffer * required. * * Return: Returns number of bytes used/required on success. */ int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size) { if (unlikely(IS_PRIVATE(inode))) return 0; return call_int_hook(inode_listsecurity, inode, buffer, buffer_size); } EXPORT_SYMBOL(security_inode_listsecurity); /** * security_inode_getlsmprop() - Get an inode's LSM data * @inode: inode * @prop: lsm specific information to return * * Get the lsm specific information associated with the node. */ void security_inode_getlsmprop(struct inode *inode, struct lsm_prop *prop) { call_void_hook(inode_getlsmprop, inode, prop); } /** * security_inode_copy_up() - Create new creds for an overlayfs copy-up op * @src: union dentry of copy-up file * @new: newly created creds * * A file is about to be copied up from lower layer to upper layer of overlay * filesystem. Security module can prepare a set of new creds and modify as * need be and return new creds. Caller will switch to new creds temporarily to * create new file and release newly allocated creds. * * Return: Returns 0 on success or a negative error code on error. */ int security_inode_copy_up(struct dentry *src, struct cred **new) { return call_int_hook(inode_copy_up, src, new); } EXPORT_SYMBOL(security_inode_copy_up); /** * security_inode_copy_up_xattr() - Filter xattrs in an overlayfs copy-up op * @src: union dentry of copy-up file * @name: xattr name * * Filter the xattrs being copied up when a unioned file is copied up from a * lower layer to the union/overlay layer. The caller is responsible for * reading and writing the xattrs, this hook is merely a filter. * * Return: Returns 0 to accept the xattr, -ECANCELED to discard the xattr, * -EOPNOTSUPP if the security module does not know about attribute, * or a negative error code to abort the copy up. 
*/ int security_inode_copy_up_xattr(struct dentry *src, const char *name) { int rc; rc = call_int_hook(inode_copy_up_xattr, src, name); if (rc != LSM_RET_DEFAULT(inode_copy_up_xattr)) return rc; return LSM_RET_DEFAULT(inode_copy_up_xattr); } EXPORT_SYMBOL(security_inode_copy_up_xattr); /** * security_inode_setintegrity() - Set the inode's integrity data * @inode: inode * @type: type of integrity, e.g. hash digest, signature, etc * @value: the integrity value * @size: size of the integrity value * * Register a verified integrity measurement of a inode with LSMs. * LSMs should free the previously saved data if @value is NULL. * * Return: Returns 0 on success, negative values on failure. */ int security_inode_setintegrity(const struct inode *inode, enum lsm_integrity_type type, const void *value, size_t size) { return call_int_hook(inode_setintegrity, inode, type, value, size); } EXPORT_SYMBOL(security_inode_setintegrity); /** * security_kernfs_init_security() - Init LSM context for a kernfs node * @kn_dir: parent kernfs node * @kn: the kernfs node to initialize * * Initialize the security context of a newly created kernfs node based on its * own and its parent's attributes. * * Return: Returns 0 if permission is granted. */ int security_kernfs_init_security(struct kernfs_node *kn_dir, struct kernfs_node *kn) { return call_int_hook(kernfs_init_security, kn_dir, kn); } /** * security_file_permission() - Check file permissions * @file: file * @mask: requested permissions * * Check file permissions before accessing an open file. This hook is called * by various operations that read or write files. A security module can use * this hook to perform additional checking on these operations, e.g. to * revalidate permissions on use to support privilege bracketing or policy * changes. Notice that this hook is used when the actual read/write * operations are performed, whereas the inode_security_ops hook is called when * a file is opened (as well as many other operations). Although this hook can * be used to revalidate permissions for various system call operations that * read or write files, it does not address the revalidation of permissions for * memory-mapped files. Security modules must handle this separately if they * need such revalidation. * * Return: Returns 0 if permission is granted. */ int security_file_permission(struct file *file, int mask) { return call_int_hook(file_permission, file, mask); } /** * security_file_alloc() - Allocate and init a file's LSM blob * @file: the file * * Allocate and attach a security structure to the file->f_security field. The * security field is initialized to NULL when the structure is first created. * * Return: Return 0 if the hook is successful and permission is granted. */ int security_file_alloc(struct file *file) { int rc = lsm_file_alloc(file); if (rc) return rc; rc = call_int_hook(file_alloc_security, file); if (unlikely(rc)) security_file_free(file); return rc; } /** * security_file_release() - Perform actions before releasing the file ref * @file: the file * * Perform actions before releasing the last reference to a file. */ void security_file_release(struct file *file) { call_void_hook(file_release, file); } /** * security_file_free() - Free a file's LSM blob * @file: the file * * Deallocate and free any security structures stored in file->f_security. 
 */
void security_file_free(struct file *file)
{
	void *blob;

	call_void_hook(file_free_security, file);

	blob = file->f_security;
	if (blob) {
		file->f_security = NULL;
		kmem_cache_free(lsm_file_cache, blob);
	}
}

/**
 * security_backing_file_alloc() - Allocate and setup a backing file blob
 * @backing_file: the backing file
 * @user_file: the associated user visible file
 *
 * Allocate a backing file LSM blob and perform any necessary initialization of
 * the LSM blob. There will be some operations where the LSM will not have
 * access to @user_file after this point, so any important state associated
 * with @user_file that is important to the LSM should be captured in the
 * backing file's LSM blob.
 *
 * LSMs should avoid taking a reference to @user_file in this hook as it will
 * result in problems later when the system attempts to drop/put the file
 * references due to a circular dependency.
 *
 * Return: Return 0 if the hook is successful, negative values otherwise.
 */
int security_backing_file_alloc(struct file *backing_file,
				const struct file *user_file)
{
	int rc;

	rc = lsm_backing_file_alloc(backing_file);
	if (rc)
		return rc;

	rc = call_int_hook(backing_file_alloc, backing_file, user_file);
	if (unlikely(rc))
		security_backing_file_free(backing_file);
	return rc;
}

/**
 * security_backing_file_free() - Free a backing file blob
 * @backing_file: the backing file
 *
 * Free any LSM state associated with a backing file's LSM blob, including the
 * blob itself.
 */
void security_backing_file_free(struct file *backing_file)
{
	void *blob = backing_file_security(backing_file);

	call_void_hook(backing_file_free, backing_file);

	if (blob) {
		backing_file_set_security(backing_file, NULL);
		kmem_cache_free(lsm_backing_file_cache, blob);
	}
}

/**
 * security_file_ioctl() - Check if an ioctl is allowed
 * @file: associated file
 * @cmd: ioctl cmd
 * @arg: ioctl arguments
 *
 * Check permission for an ioctl operation on @file. Note that @arg sometimes
 * represents a user space pointer; in other cases, it may be a simple integer
 * value. When @arg represents a user space pointer, it should never be used
 * by the security module.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	return call_int_hook(file_ioctl, file, cmd, arg);
}
EXPORT_SYMBOL_GPL(security_file_ioctl);

/**
 * security_file_ioctl_compat() - Check if an ioctl is allowed in compat mode
 * @file: associated file
 * @cmd: ioctl cmd
 * @arg: ioctl arguments
 *
 * Compat version of security_file_ioctl() that correctly handles 32-bit
 * processes running on 64-bit kernels.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_ioctl_compat(struct file *file, unsigned int cmd,
			       unsigned long arg)
{
	return call_int_hook(file_ioctl_compat, file, cmd, arg);
}
EXPORT_SYMBOL_GPL(security_file_ioctl_compat);

static inline unsigned long mmap_prot(struct file *file, unsigned long prot)
{
	/*
	 * Do we have PROT_READ and does the application expect
	 * it to imply PROT_EXEC? If not, nothing to talk about...
	 */
	if ((prot & (PROT_READ | PROT_EXEC)) != PROT_READ)
		return prot;
	if (!(current->personality & READ_IMPLIES_EXEC))
		return prot;
	/*
	 * if that's an anonymous mapping, let it.
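	 * With no file object there is no mount-based noexec policy to
	 * consult, so READ_IMPLIES_EXEC simply adds PROT_EXEC here.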
*/ if (!file) return prot | PROT_EXEC; /* * ditto if it's not on noexec mount, except that on !MMU we need * NOMMU_MAP_EXEC (== VM_MAYEXEC) in this case */ if (!path_noexec(&file->f_path)) { #ifndef CONFIG_MMU if (file->f_op->mmap_capabilities) { unsigned caps = file->f_op->mmap_capabilities(file); if (!(caps & NOMMU_MAP_EXEC)) return prot; } #endif return prot | PROT_EXEC; } /* anything on noexec mount won't get PROT_EXEC */ return prot; } /** * security_mmap_file() - Check if mmap'ing a file is allowed * @file: file * @prot: protection applied by the kernel * @flags: flags * * Check permissions for a mmap operation. The @file may be NULL, e.g. if * mapping anonymous memory. * * Return: Returns 0 if permission is granted. */ int security_mmap_file(struct file *file, unsigned long prot, unsigned long flags) { return call_int_hook(mmap_file, file, prot, mmap_prot(file, prot), flags); } /** * security_mmap_backing_file - Check if mmap'ing a backing file is allowed * @vma: the vm_area_struct for the mmap'd region * @backing_file: the backing file being mmap'd * @user_file: the user file being mmap'd * * Check permissions for a mmap operation on a stacked filesystem. This hook * is called after the security_mmap_file() and is responsible for authorizing * the mmap on @backing_file. It is important to note that the mmap operation * on @user_file has already been authorized and the @vma->vm_file has been * set to @backing_file. * * Return: Returns 0 if permission is granted. */ int security_mmap_backing_file(struct vm_area_struct *vma, struct file *backing_file, struct file *user_file) { /* recommended by the stackable filesystem devs */ if (WARN_ON_ONCE(!(backing_file->f_mode & FMODE_BACKING))) return -EIO; return call_int_hook(mmap_backing_file, vma, backing_file, user_file); } EXPORT_SYMBOL_GPL(security_mmap_backing_file); /** * security_mmap_addr() - Check if mmap'ing an address is allowed * @addr: address * * Check permissions for a mmap operation at @addr. * * Return: Returns 0 if permission is granted. */ int security_mmap_addr(unsigned long addr) { return call_int_hook(mmap_addr, addr); } /** * security_file_mprotect() - Check if changing memory protections is allowed * @vma: memory region * @reqprot: application requested protection * @prot: protection applied by the kernel * * Check permissions before changing memory access permissions. * * Return: Returns 0 if permission is granted. */ int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot) { return call_int_hook(file_mprotect, vma, reqprot, prot); } /** * security_file_lock() - Check if a file lock is allowed * @file: file * @cmd: lock operation (e.g. F_RDLCK, F_WRLCK) * * Check permission before performing file locking operations. Note the hook * mediates both flock and fcntl style locks. * * Return: Returns 0 if permission is granted. */ int security_file_lock(struct file *file, unsigned int cmd) { return call_int_hook(file_lock, file, cmd); } /** * security_file_fcntl() - Check if fcntl() op is allowed * @file: file * @cmd: fcntl command * @arg: command argument * * Check permission before allowing the file operation specified by @cmd from * being performed on the file @file. Note that @arg sometimes represents a * user space pointer; in other cases, it may be a simple integer value. When * @arg represents a user space pointer, it should never be used by the * security module. * * Return: Returns 0 if permission is granted. 
 */
int security_file_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
	return call_int_hook(file_fcntl, file, cmd, arg);
}

/**
 * security_file_set_fowner() - Set the file owner info in the LSM blob
 * @file: the file
 *
 * Save owner security information (typically from current->security) in
 * file->f_security for later use by the send_sigiotask hook.
 *
 * This hook is called with file->f_owner.lock held.
 */
void security_file_set_fowner(struct file *file)
{
	call_void_hook(file_set_fowner, file);
}

/**
 * security_file_send_sigiotask() - Check if sending SIGIO/SIGURG is allowed
 * @tsk: target task
 * @fown: signal sender
 * @sig: signal to be sent, SIGIO is sent if 0
 *
 * Check permission for the file owner @fown to send SIGIO or SIGURG to the
 * process @tsk. Note that this hook is sometimes called from interrupt. Note
 * that the fown_struct, @fown, is never outside the context of a struct file,
 * so the file structure (and associated security information) can always be
 * obtained: container_of(fown, struct file, f_owner).
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_send_sigiotask(struct task_struct *tsk,
				 struct fown_struct *fown, int sig)
{
	return call_int_hook(file_send_sigiotask, tsk, fown, sig);
}

/**
 * security_file_receive() - Check if receiving a file via IPC is allowed
 * @file: file being received
 *
 * This hook allows security modules to control the ability of a process to
 * receive an open file descriptor via socket IPC.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_receive(struct file *file)
{
	return call_int_hook(file_receive, file);
}

/**
 * security_file_open() - Save open() time state for later use by the LSM
 * @file: the file
 *
 * Save open-time permission checking state for later use upon file_permission,
 * and recheck access if anything has changed since inode_permission.
 *
 * We can check if a file is opened for execution (e.g. execve(2) call), either
 * directly or indirectly (e.g. ELF's ld.so) by checking
 * file->f_flags & __FMODE_EXEC.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_open(struct file *file)
{
	return call_int_hook(file_open, file);
}

/**
 * security_file_post_open() - Evaluate a file after it has been opened
 * @file: the file
 * @mask: access mask
 *
 * Evaluate an opened file and the access mask requested with open(). The hook
 * is useful for LSMs that require the file content to be available in order to
 * make decisions.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_post_open(struct file *file, int mask)
{
	return call_int_hook(file_post_open, file, mask);
}
EXPORT_SYMBOL_GPL(security_file_post_open);

/**
 * security_file_truncate() - Check if truncating a file is allowed
 * @file: file
 *
 * Check permission before truncating a file, i.e. using ftruncate. Note that
 * truncation permission may also be checked based on the path, using the
 * @path_truncate hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_truncate(struct file *file)
{
	return call_int_hook(file_truncate, file);
}

/**
 * security_task_alloc() - Allocate a task's LSM blob
 * @task: the task
 * @clone_flags: flags indicating what is being shared
 *
 * Handle allocation of task-related resources.
 *
 * Return: Returns 0 on success, negative values on failure.
*/ int security_task_alloc(struct task_struct *task, u64 clone_flags) { int rc = lsm_task_alloc(task); if (rc) return rc; rc = call_int_hook(task_alloc, task, clone_flags); if (unlikely(rc)) security_task_free(task); return rc; } /** * security_task_free() - Free a task's LSM blob and related resources * @task: task * * Handle release of task-related resources. Note that this can be called from * interrupt context. */ void security_task_free(struct task_struct *task) { call_void_hook(task_free, task); kfree(task->security); task->security = NULL; } /** * security_cred_alloc_blank() - Allocate the min memory to allow cred_transfer * @cred: credentials * @gfp: gfp flags * * Only allocate sufficient memory and attach to @cred such that * cred_transfer() will not get ENOMEM. * * Return: Returns 0 on success, negative values on failure. */ int security_cred_alloc_blank(struct cred *cred, gfp_t gfp) { int rc = lsm_cred_alloc(cred, gfp); if (rc) return rc; rc = call_int_hook(cred_alloc_blank, cred, gfp); if (unlikely(rc)) security_cred_free(cred); return rc; } /** * security_cred_free() - Free the cred's LSM blob and associated resources * @cred: credentials * * Deallocate and clear the cred->security field in a set of credentials. */ void security_cred_free(struct cred *cred) { /* * There is a failure case in prepare_creds() that * may result in a call here with ->security being NULL. */ if (unlikely(cred->security == NULL)) return; call_void_hook(cred_free, cred); kfree(cred->security); cred->security = NULL; } /** * security_prepare_creds() - Prepare a new set of credentials * @new: new credentials * @old: original credentials * @gfp: gfp flags * * Prepare a new set of credentials by copying the data from the old set. * * Return: Returns 0 on success, negative values on failure. */ int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp) { int rc = lsm_cred_alloc(new, gfp); if (rc) return rc; rc = call_int_hook(cred_prepare, new, old, gfp); if (unlikely(rc)) security_cred_free(new); return rc; } /** * security_transfer_creds() - Transfer creds * @new: target credentials * @old: original credentials * * Transfer data from original creds to new creds. */ void security_transfer_creds(struct cred *new, const struct cred *old) { call_void_hook(cred_transfer, new, old); } /** * security_cred_getsecid() - Get the secid from a set of credentials * @c: credentials * @secid: secid value * * Retrieve the security identifier of the cred structure @c. In case of * failure, @secid will be set to zero. */ void security_cred_getsecid(const struct cred *c, u32 *secid) { *secid = 0; call_void_hook(cred_getsecid, c, secid); } EXPORT_SYMBOL(security_cred_getsecid); /** * security_cred_getlsmprop() - Get the LSM data from a set of credentials * @c: credentials * @prop: destination for the LSM data * * Retrieve the security data of the cred structure @c. In case of * failure, @prop will be cleared. */ void security_cred_getlsmprop(const struct cred *c, struct lsm_prop *prop) { lsmprop_init(prop); call_void_hook(cred_getlsmprop, c, prop); } EXPORT_SYMBOL(security_cred_getlsmprop); /** * security_kernel_act_as() - Set the kernel credentials to act as secid * @new: credentials * @secid: secid * * Set the credentials for a kernel service to act as (subjective context). * The current task must be the one that nominated @secid. * * Return: Returns 0 if successful. 
*/ int security_kernel_act_as(struct cred *new, u32 secid) { return call_int_hook(kernel_act_as, new, secid); } /** * security_kernel_create_files_as() - Set file creation context using an inode * @new: target credentials * @inode: reference inode * * Set the file creation context in a set of credentials to be the same as the * objective context of the specified inode. The current task must be the one * that nominated @inode. * * Return: Returns 0 if successful. */ int security_kernel_create_files_as(struct cred *new, struct inode *inode) { return call_int_hook(kernel_create_files_as, new, inode); } /** * security_kernel_module_request() - Check if loading a module is allowed * @kmod_name: module name * * Ability to trigger the kernel to automatically upcall to userspace for * userspace to load a kernel module with the given name. * * Return: Returns 0 if successful. */ int security_kernel_module_request(char *kmod_name) { return call_int_hook(kernel_module_request, kmod_name); } /** * security_kernel_read_file() - Read a file specified by userspace * @file: file * @id: file identifier * @contents: trust if security_kernel_post_read_file() will be called * * Read a file specified by userspace. * * Return: Returns 0 if permission is granted. */ int security_kernel_read_file(struct file *file, enum kernel_read_file_id id, bool contents) { return call_int_hook(kernel_read_file, file, id, contents); } EXPORT_SYMBOL_GPL(security_kernel_read_file); /** * security_kernel_post_read_file() - Read a file specified by userspace * @file: file * @buf: file contents * @size: size of file contents * @id: file identifier * * Read a file specified by userspace. This must be paired with a prior call * to security_kernel_read_file() call that indicated this hook would also be * called, see security_kernel_read_file() for more information. * * Return: Returns 0 if permission is granted. */ int security_kernel_post_read_file(struct file *file, char *buf, loff_t size, enum kernel_read_file_id id) { return call_int_hook(kernel_post_read_file, file, buf, size, id); } EXPORT_SYMBOL_GPL(security_kernel_post_read_file); /** * security_kernel_load_data() - Load data provided by userspace * @id: data identifier * @contents: true if security_kernel_post_load_data() will be called * * Load data provided by userspace. * * Return: Returns 0 if permission is granted. */ int security_kernel_load_data(enum kernel_load_data_id id, bool contents) { return call_int_hook(kernel_load_data, id, contents); } EXPORT_SYMBOL_GPL(security_kernel_load_data); /** * security_kernel_post_load_data() - Load userspace data from a non-file source * @buf: data * @size: size of data * @id: data identifier * @description: text description of data, specific to the id value * * Load data provided by a non-file source (usually userspace buffer). This * must be paired with a prior security_kernel_load_data() call that indicated * this hook would also be called, see security_kernel_load_data() for more * information. * * Return: Returns 0 if permission is granted. 
*/ int security_kernel_post_load_data(char *buf, loff_t size, enum kernel_load_data_id id, char *description) { return call_int_hook(kernel_post_load_data, buf, size, id, description); } EXPORT_SYMBOL_GPL(security_kernel_post_load_data); /** * security_task_fix_setuid() - Update LSM with new user id attributes * @new: updated credentials * @old: credentials being replaced * @flags: LSM_SETID_* flag values * * Update the module's state after setting one or more of the user identity * attributes of the current process. The @flags parameter indicates which of * the set*uid system calls invoked this hook. If @new is the set of * credentials that will be installed. Modifications should be made to this * rather than to @current->cred. * * Return: Returns 0 on success. */ int security_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { return call_int_hook(task_fix_setuid, new, old, flags); } /** * security_task_fix_setgid() - Update LSM with new group id attributes * @new: updated credentials * @old: credentials being replaced * @flags: LSM_SETID_* flag value * * Update the module's state after setting one or more of the group identity * attributes of the current process. The @flags parameter indicates which of * the set*gid system calls invoked this hook. @new is the set of credentials * that will be installed. Modifications should be made to this rather than to * @current->cred. * * Return: Returns 0 on success. */ int security_task_fix_setgid(struct cred *new, const struct cred *old, int flags) { return call_int_hook(task_fix_setgid, new, old, flags); } /** * security_task_fix_setgroups() - Update LSM with new supplementary groups * @new: updated credentials * @old: credentials being replaced * * Update the module's state after setting the supplementary group identity * attributes of the current process. @new is the set of credentials that will * be installed. Modifications should be made to this rather than to * @current->cred. * * Return: Returns 0 on success. */ int security_task_fix_setgroups(struct cred *new, const struct cred *old) { return call_int_hook(task_fix_setgroups, new, old); } /** * security_task_setpgid() - Check if setting the pgid is allowed * @p: task being modified * @pgid: new pgid * * Check permission before setting the process group identifier of the process * @p to @pgid. * * Return: Returns 0 if permission is granted. */ int security_task_setpgid(struct task_struct *p, pid_t pgid) { return call_int_hook(task_setpgid, p, pgid); } /** * security_task_getpgid() - Check if getting the pgid is allowed * @p: task * * Check permission before getting the process group identifier of the process * @p. * * Return: Returns 0 if permission is granted. */ int security_task_getpgid(struct task_struct *p) { return call_int_hook(task_getpgid, p); } /** * security_task_getsid() - Check if getting the session id is allowed * @p: task * * Check permission before getting the session identifier of the process @p. * * Return: Returns 0 if permission is granted. */ int security_task_getsid(struct task_struct *p) { return call_int_hook(task_getsid, p); } /** * security_current_getlsmprop_subj() - Current task's subjective LSM data * @prop: lsm specific information * * Retrieve the subjective security identifier of the current task and return * it in @prop. 
*/ void security_current_getlsmprop_subj(struct lsm_prop *prop) { lsmprop_init(prop); call_void_hook(current_getlsmprop_subj, prop); } EXPORT_SYMBOL(security_current_getlsmprop_subj); /** * security_task_getlsmprop_obj() - Get a task's objective LSM data * @p: target task * @prop: lsm specific information * * Retrieve the objective security identifier of the task_struct in @p and * return it in @prop. */ void security_task_getlsmprop_obj(struct task_struct *p, struct lsm_prop *prop) { lsmprop_init(prop); call_void_hook(task_getlsmprop_obj, p, prop); } EXPORT_SYMBOL(security_task_getlsmprop_obj); /** * security_task_setnice() - Check if setting a task's nice value is allowed * @p: target task * @nice: nice value * * Check permission before setting the nice value of @p to @nice. * * Return: Returns 0 if permission is granted. */ int security_task_setnice(struct task_struct *p, int nice) { return call_int_hook(task_setnice, p, nice); } /** * security_task_setioprio() - Check if setting a task's ioprio is allowed * @p: target task * @ioprio: ioprio value * * Check permission before setting the ioprio value of @p to @ioprio. * * Return: Returns 0 if permission is granted. */ int security_task_setioprio(struct task_struct *p, int ioprio) { return call_int_hook(task_setioprio, p, ioprio); } /** * security_task_getioprio() - Check if getting a task's ioprio is allowed * @p: task * * Check permission before getting the ioprio value of @p. * * Return: Returns 0 if permission is granted. */ int security_task_getioprio(struct task_struct *p) { return call_int_hook(task_getioprio, p); } /** * security_task_prlimit() - Check if get/setting resources limits is allowed * @cred: current task credentials * @tcred: target task credentials * @flags: LSM_PRLIMIT_* flag bits indicating a get/set/both * * Check permission before getting and/or setting the resource limits of * another task. * * Return: Returns 0 if permission is granted. */ int security_task_prlimit(const struct cred *cred, const struct cred *tcred, unsigned int flags) { return call_int_hook(task_prlimit, cred, tcred, flags); } /** * security_task_setrlimit() - Check if setting a new rlimit value is allowed * @p: target task's group leader * @resource: resource whose limit is being set * @new_rlim: new resource limit * * Check permission before setting the resource limits of process @p for * @resource to @new_rlim. The old resource limit values can be examined by * dereferencing (p->signal->rlim + resource). * * Return: Returns 0 if permission is granted. */ int security_task_setrlimit(struct task_struct *p, unsigned int resource, struct rlimit *new_rlim) { return call_int_hook(task_setrlimit, p, resource, new_rlim); } /** * security_task_setscheduler() - Check if setting sched policy/param is allowed * @p: target task * * Check permission before setting scheduling policy and/or parameters of * process @p. * * Return: Returns 0 if permission is granted. */ int security_task_setscheduler(struct task_struct *p) { return call_int_hook(task_setscheduler, p); } /** * security_task_getscheduler() - Check if getting scheduling info is allowed * @p: target task * * Check permission before obtaining scheduling information for process @p. * * Return: Returns 0 if permission is granted. */ int security_task_getscheduler(struct task_struct *p) { return call_int_hook(task_getscheduler, p); } /** * security_task_movememory() - Check if moving memory is allowed * @p: task * * Check permission before moving memory owned by process @p. 
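 * (for example, on behalf of the migrate_pages(2) and move_pages(2) system
 * calls)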
* * Return: Returns 0 if permission is granted. */ int security_task_movememory(struct task_struct *p) { return call_int_hook(task_movememory, p); } /** * security_task_kill() - Check if sending a signal is allowed * @p: target process * @info: signal information * @sig: signal value * @cred: credentials of the signal sender, NULL if @current * * Check permission before sending signal @sig to @p. @info can be NULL, the * constant 1, or a pointer to a kernel_siginfo structure. If @info is 1 or * SI_FROMKERNEL(info) is true, then the signal should be viewed as coming from * the kernel and should typically be permitted. SIGIO signals are handled * separately by the send_sigiotask hook in file_security_ops. * * Return: Returns 0 if permission is granted. */ int security_task_kill(struct task_struct *p, struct kernel_siginfo *info, int sig, const struct cred *cred) { return call_int_hook(task_kill, p, info, sig, cred); } /** * security_task_prctl() - Check if a prctl op is allowed * @option: operation * @arg2: argument * @arg3: argument * @arg4: argument * @arg5: argument * * Check permission before performing a process control operation on the * current process. * * Return: Return -ENOSYS if no-one wanted to handle this op, any other value * to cause prctl() to return immediately with that value. */ int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { int thisrc; int rc = LSM_RET_DEFAULT(task_prctl); struct lsm_static_call *scall; lsm_for_each_hook(scall, task_prctl) { thisrc = scall->hl->hook.task_prctl(option, arg2, arg3, arg4, arg5); if (thisrc != LSM_RET_DEFAULT(task_prctl)) { rc = thisrc; if (thisrc != 0) break; } } return rc; } /** * security_task_to_inode() - Set the security attributes of a task's inode * @p: task * @inode: inode * * Set the security attributes for an inode based on an associated task's * security attributes, e.g. for /proc/pid inodes. */ void security_task_to_inode(struct task_struct *p, struct inode *inode) { call_void_hook(task_to_inode, p, inode); } /** * security_create_user_ns() - Check if creating a new userns is allowed * @cred: prepared creds * * Check permission prior to creating a new user namespace. * * Return: Returns 0 if successful, otherwise < 0 error code. */ int security_create_user_ns(const struct cred *cred) { return call_int_hook(userns_create, cred); } /** * security_ipc_permission() - Check if sysv ipc access is allowed * @ipcp: ipc permission structure * @flag: requested permissions * * Check permissions for access to IPC. * * Return: Returns 0 if permission is granted. */ int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag) { return call_int_hook(ipc_permission, ipcp, flag); } /** * security_ipc_getlsmprop() - Get the sysv ipc object LSM data * @ipcp: ipc permission structure * @prop: pointer to lsm information * * Get the lsm information associated with the ipc object. */ void security_ipc_getlsmprop(struct kern_ipc_perm *ipcp, struct lsm_prop *prop) { lsmprop_init(prop); call_void_hook(ipc_getlsmprop, ipcp, prop); } /** * security_msg_msg_alloc() - Allocate a sysv ipc message LSM blob * @msg: message structure * * Allocate and attach a security structure to the msg->security field. The * security field is initialized to NULL when the structure is first created. * * Return: Return 0 if operation was successful and permission is granted. 
*/ int security_msg_msg_alloc(struct msg_msg *msg) { int rc = lsm_msg_msg_alloc(msg); if (unlikely(rc)) return rc; rc = call_int_hook(msg_msg_alloc_security, msg); if (unlikely(rc)) security_msg_msg_free(msg); return rc; } /** * security_msg_msg_free() - Free a sysv ipc message LSM blob * @msg: message structure * * Deallocate the security structure for this message. */ void security_msg_msg_free(struct msg_msg *msg) { call_void_hook(msg_msg_free_security, msg); kfree(msg->security); msg->security = NULL; } /** * security_msg_queue_alloc() - Allocate a sysv ipc msg queue LSM blob * @msq: sysv ipc permission structure * * Allocate and attach a security structure to @msg. The security field is * initialized to NULL when the structure is first created. * * Return: Returns 0 if operation was successful and permission is granted. */ int security_msg_queue_alloc(struct kern_ipc_perm *msq) { int rc = lsm_ipc_alloc(msq); if (unlikely(rc)) return rc; rc = call_int_hook(msg_queue_alloc_security, msq); if (unlikely(rc)) security_msg_queue_free(msq); return rc; } /** * security_msg_queue_free() - Free a sysv ipc msg queue LSM blob * @msq: sysv ipc permission structure * * Deallocate security field @perm->security for the message queue. */ void security_msg_queue_free(struct kern_ipc_perm *msq) { call_void_hook(msg_queue_free_security, msq); kfree(msq->security); msq->security = NULL; } /** * security_msg_queue_associate() - Check if a msg queue operation is allowed * @msq: sysv ipc permission structure * @msqflg: operation flags * * Check permission when a message queue is requested through the msgget system * call. This hook is only called when returning the message queue identifier * for an existing message queue, not when a new message queue is created. * * Return: Return 0 if permission is granted. */ int security_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg) { return call_int_hook(msg_queue_associate, msq, msqflg); } /** * security_msg_queue_msgctl() - Check if a msg queue operation is allowed * @msq: sysv ipc permission structure * @cmd: operation * * Check permission when a message control operation specified by @cmd is to be * performed on the message queue with permissions. * * Return: Returns 0 if permission is granted. */ int security_msg_queue_msgctl(struct kern_ipc_perm *msq, int cmd) { return call_int_hook(msg_queue_msgctl, msq, cmd); } /** * security_msg_queue_msgsnd() - Check if sending a sysv ipc message is allowed * @msq: sysv ipc permission structure * @msg: message * @msqflg: operation flags * * Check permission before a message, @msg, is enqueued on the message queue * with permissions specified in @msq. * * Return: Returns 0 if permission is granted. */ int security_msg_queue_msgsnd(struct kern_ipc_perm *msq, struct msg_msg *msg, int msqflg) { return call_int_hook(msg_queue_msgsnd, msq, msg, msqflg); } /** * security_msg_queue_msgrcv() - Check if receiving a sysv ipc msg is allowed * @msq: sysv ipc permission structure * @msg: message * @target: target task * @type: type of message requested * @mode: operation flags * * Check permission before a message, @msg, is removed from the message queue. * The @target task structure contains a pointer to the process that will be * receiving the message (not equal to the current process when inline receives * are being performed). * * Return: Returns 0 if permission is granted. 
*/ int security_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *msg, struct task_struct *target, long type, int mode) { return call_int_hook(msg_queue_msgrcv, msq, msg, target, type, mode); } /** * security_shm_alloc() - Allocate a sysv shm LSM blob * @shp: sysv ipc permission structure * * Allocate and attach a security structure to the @shp security field. The * security field is initialized to NULL when the structure is first created. * * Return: Returns 0 if operation was successful and permission is granted. */ int security_shm_alloc(struct kern_ipc_perm *shp) { int rc = lsm_ipc_alloc(shp); if (unlikely(rc)) return rc; rc = call_int_hook(shm_alloc_security, shp); if (unlikely(rc)) security_shm_free(shp); return rc; } /** * security_shm_free() - Free a sysv shm LSM blob * @shp: sysv ipc permission structure * * Deallocate the security structure @perm->security for the memory segment. */ void security_shm_free(struct kern_ipc_perm *shp) { call_void_hook(shm_free_security, shp); kfree(shp->security); shp->security = NULL; } /** * security_shm_associate() - Check if a sysv shm operation is allowed * @shp: sysv ipc permission structure * @shmflg: operation flags * * Check permission when a shared memory region is requested through the shmget * system call. This hook is only called when returning the shared memory * region identifier for an existing region, not when a new shared memory * region is created. * * Return: Returns 0 if permission is granted. */ int security_shm_associate(struct kern_ipc_perm *shp, int shmflg) { return call_int_hook(shm_associate, shp, shmflg); } /** * security_shm_shmctl() - Check if a sysv shm operation is allowed * @shp: sysv ipc permission structure * @cmd: operation * * Check permission when a shared memory control operation specified by @cmd is * to be performed on the shared memory region with permissions in @shp. * * Return: Return 0 if permission is granted. */ int security_shm_shmctl(struct kern_ipc_perm *shp, int cmd) { return call_int_hook(shm_shmctl, shp, cmd); } /** * security_shm_shmat() - Check if a sysv shm attach operation is allowed * @shp: sysv ipc permission structure * @shmaddr: address of memory region to attach * @shmflg: operation flags * * Check permissions prior to allowing the shmat system call to attach the * shared memory segment with permissions @shp to the data segment of the * calling process. The attaching address is specified by @shmaddr. * * Return: Returns 0 if permission is granted. */ int security_shm_shmat(struct kern_ipc_perm *shp, char __user *shmaddr, int shmflg) { return call_int_hook(shm_shmat, shp, shmaddr, shmflg); } /** * security_sem_alloc() - Allocate a sysv semaphore LSM blob * @sma: sysv ipc permission structure * * Allocate and attach a security structure to the @sma security field. The * security field is initialized to NULL when the structure is first created. * * Return: Returns 0 if operation was successful and permission is granted. */ int security_sem_alloc(struct kern_ipc_perm *sma) { int rc = lsm_ipc_alloc(sma); if (unlikely(rc)) return rc; rc = call_int_hook(sem_alloc_security, sma); if (unlikely(rc)) security_sem_free(sma); return rc; } /** * security_sem_free() - Free a sysv semaphore LSM blob * @sma: sysv ipc permission structure * * Deallocate security structure @sma->security for the semaphore. 
*/ void security_sem_free(struct kern_ipc_perm *sma) { call_void_hook(sem_free_security, sma); kfree(sma->security); sma->security = NULL; } /** * security_sem_associate() - Check if a sysv semaphore operation is allowed * @sma: sysv ipc permission structure * @semflg: operation flags * * Check permission when a semaphore is requested through the semget system * call. This hook is only called when returning the semaphore identifier for * an existing semaphore, not when a new one must be created. * * Return: Returns 0 if permission is granted. */ int security_sem_associate(struct kern_ipc_perm *sma, int semflg) { return call_int_hook(sem_associate, sma, semflg); } /** * security_sem_semctl() - Check if a sysv semaphore operation is allowed * @sma: sysv ipc permission structure * @cmd: operation * * Check permission when a semaphore operation specified by @cmd is to be * performed on the semaphore. * * Return: Returns 0 if permission is granted. */ int security_sem_semctl(struct kern_ipc_perm *sma, int cmd) { return call_int_hook(sem_semctl, sma, cmd); } /** * security_sem_semop() - Check if a sysv semaphore operation is allowed * @sma: sysv ipc permission structure * @sops: operations to perform * @nsops: number of operations * @alter: flag indicating changes will be made * * Check permissions before performing operations on members of the semaphore * set. If the @alter flag is nonzero, the semaphore set may be modified. * * Return: Returns 0 if permission is granted. */ int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops, unsigned nsops, int alter) { return call_int_hook(sem_semop, sma, sops, nsops, alter); } /** * security_d_instantiate() - Populate an inode's LSM state based on a dentry * @dentry: dentry * @inode: inode * * Fill in @inode security information for a @dentry if allowed. */ void security_d_instantiate(struct dentry *dentry, struct inode *inode) { if (unlikely(inode && IS_PRIVATE(inode))) return; call_void_hook(d_instantiate, dentry, inode); } EXPORT_SYMBOL(security_d_instantiate); /* * Please keep this in sync with it's counterpart in security/lsm_syscalls.c */ /** * security_getselfattr - Read an LSM attribute of the current process. * @attr: which attribute to return * @uctx: the user-space destination for the information, or NULL * @size: pointer to the size of space available to receive the data * @flags: special handling options. LSM_FLAG_SINGLE indicates that only * attributes associated with the LSM identified in the passed @ctx be * reported. * * A NULL value for @uctx can be used to get both the number of attributes * and the size of the data. * * Returns the number of attributes found on success, negative value * on error. @size is reset to the total size of the data. * If @size is insufficient to contain the data -E2BIG is returned. */ int security_getselfattr(unsigned int attr, struct lsm_ctx __user *uctx, u32 __user *size, u32 flags) { struct lsm_static_call *scall; struct lsm_ctx lctx = { .id = LSM_ID_UNDEF, }; u8 __user *base = (u8 __user *)uctx; u32 entrysize; u32 total = 0; u32 left; bool toobig = false; bool single = false; int count = 0; int rc; if (attr == LSM_ATTR_UNDEF) return -EINVAL; if (size == NULL) return -EINVAL; if (get_user(left, size)) return -EFAULT; if (flags) { /* * Only flag supported is LSM_FLAG_SINGLE */ if (flags != LSM_FLAG_SINGLE || !uctx) return -EINVAL; if (copy_from_user(&lctx, uctx, sizeof(lctx))) return -EFAULT; /* * If the LSM ID isn't specified it is an error. 
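		 * With LSM_FLAG_SINGLE the caller must also pre-fill lsm_ctx.id
		 * with the LSM whose attribute should be reported; every other
		 * LSM is skipped in the loop below.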
*/ if (lctx.id == LSM_ID_UNDEF) return -EINVAL; single = true; } /* * In the usual case gather all the data from the LSMs. * In the single case only get the data from the LSM specified. */ lsm_for_each_hook(scall, getselfattr) { if (single && lctx.id != scall->hl->lsmid->id) continue; entrysize = left; if (base) uctx = (struct lsm_ctx __user *)(base + total); rc = scall->hl->hook.getselfattr(attr, uctx, &entrysize, flags); if (rc == -EOPNOTSUPP) continue; if (rc == -E2BIG) { rc = 0; left = 0; toobig = true; } else if (rc < 0) return rc; else left -= entrysize; total += entrysize; count += rc; if (single) break; } if (put_user(total, size)) return -EFAULT; if (toobig) return -E2BIG; if (count == 0) return LSM_RET_DEFAULT(getselfattr); return count; } /* * Please keep this in sync with its counterpart in security/lsm_syscalls.c */ /** * security_setselfattr - Set an LSM attribute on the current process. * @attr: which attribute to set * @uctx: the user-space source for the information * @size: the size of the data * @flags: reserved for future use, must be 0 * * Set an LSM attribute for the current process. The LSM, attribute * and new value are included in @uctx. * * Returns 0 on success, -EINVAL if the input is inconsistent, -EFAULT * if the user buffer is inaccessible, -E2BIG if size is too big, or an * LSM specific failure. */ int security_setselfattr(unsigned int attr, struct lsm_ctx __user *uctx, u32 size, u32 flags) { struct lsm_static_call *scall; struct lsm_ctx *lctx; int rc = LSM_RET_DEFAULT(setselfattr); u64 required_len; if (flags) return -EINVAL; if (size < sizeof(*lctx)) return -EINVAL; if (size > PAGE_SIZE) return -E2BIG; lctx = memdup_user(uctx, size); if (IS_ERR(lctx)) return PTR_ERR(lctx); if (size < lctx->len || check_add_overflow(sizeof(*lctx), lctx->ctx_len, &required_len) || lctx->len < required_len) { rc = -EINVAL; goto free_out; } lsm_for_each_hook(scall, setselfattr) if ((scall->hl->lsmid->id) == lctx->id) { rc = scall->hl->hook.setselfattr(attr, lctx, size, flags); break; } free_out: kfree(lctx); return rc; } /** * security_getprocattr() - Read an attribute for a task * @p: the task * @lsmid: LSM identification * @name: attribute name * @value: attribute value * * Read attribute @name for task @p and store it into @value if allowed. * * Return: Returns the length of @value on success, a negative value otherwise. */ int security_getprocattr(struct task_struct *p, int lsmid, const char *name, char **value) { struct lsm_static_call *scall; lsm_for_each_hook(scall, getprocattr) { if (lsmid != 0 && lsmid != scall->hl->lsmid->id) continue; return scall->hl->hook.getprocattr(p, name, value); } return LSM_RET_DEFAULT(getprocattr); } /** * security_setprocattr() - Set an attribute for a task * @lsmid: LSM identification * @name: attribute name * @value: attribute value * @size: attribute value size * * Write (set) the current task's attribute @name to @value, size @size if * allowed. * * Return: Returns bytes written on success, a negative value otherwise. */ int security_setprocattr(int lsmid, const char *name, void *value, size_t size) { struct lsm_static_call *scall; lsm_for_each_hook(scall, setprocattr) { if (lsmid != 0 && lsmid != scall->hl->lsmid->id) continue; return scall->hl->hook.setprocattr(name, value, size); } return LSM_RET_DEFAULT(setprocattr); } /** * security_ismaclabel() - Check if the named attribute is a MAC label * @name: full extended attribute name * * Check if the extended attribute specified by @name represents a MAC label.
* * Return: Returns 1 if name is a MAC attribute otherwise returns 0. */ int security_ismaclabel(const char *name) { return call_int_hook(ismaclabel, name); } EXPORT_SYMBOL(security_ismaclabel); /** * security_secid_to_secctx() - Convert a secid to a secctx * @secid: secid * @cp: the LSM context * * Convert secid to security context. If @cp is NULL the length of the * result will be returned, but no data will be returned. This * does mean that the length could change between calls to check the length and * the next call which actually allocates and returns the data. * * Return: Return length of data on success, error on failure. */ int security_secid_to_secctx(u32 secid, struct lsm_context *cp) { return call_int_hook(secid_to_secctx, secid, cp); } EXPORT_SYMBOL(security_secid_to_secctx); /** * security_lsmprop_to_secctx() - Convert a lsm_prop to a secctx * @prop: lsm specific information * @cp: the LSM context * @lsmid: which security module to report * * Convert a @prop entry to security context. If @cp is NULL the * length of the result will be returned. This does mean that the * length could change between calls to check the length and the * next call which actually allocates and returns the @cp. * * @lsmid identifies which LSM should supply the context. * A value of LSM_ID_UNDEF indicates that the first LSM supplying * the hook should be used. This is used in cases where the * ID of the supplying LSM is unambiguous. * * Return: Return length of data on success, error on failure. */ int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp, int lsmid) { struct lsm_static_call *scall; lsm_for_each_hook(scall, lsmprop_to_secctx) { if (lsmid != LSM_ID_UNDEF && lsmid != scall->hl->lsmid->id) continue; return scall->hl->hook.lsmprop_to_secctx(prop, cp); } return LSM_RET_DEFAULT(lsmprop_to_secctx); } EXPORT_SYMBOL(security_lsmprop_to_secctx); /** * security_secctx_to_secid() - Convert a secctx to a secid * @secdata: secctx * @seclen: length of secctx * @secid: secid * * Convert security context to secid. * * Return: Returns 0 on success, error on failure. */ int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid) { *secid = 0; return call_int_hook(secctx_to_secid, secdata, seclen, secid); } EXPORT_SYMBOL(security_secctx_to_secid); /** * security_release_secctx() - Free a secctx buffer * @cp: the security context * * Release the security context. */ void security_release_secctx(struct lsm_context *cp) { call_void_hook(release_secctx, cp); memset(cp, 0, sizeof(*cp)); } EXPORT_SYMBOL(security_release_secctx); /** * security_inode_invalidate_secctx() - Invalidate an inode's security label * @inode: inode * * Notify the security module that it must revalidate the security context of * an inode. */ void security_inode_invalidate_secctx(struct inode *inode) { call_void_hook(inode_invalidate_secctx, inode); } EXPORT_SYMBOL(security_inode_invalidate_secctx); /** * security_inode_notifysecctx() - Notify the LSM of an inode's security label * @inode: inode * @ctx: secctx * @ctxlen: length of secctx * * Notify the security module of what the security context of an inode should * be. Initializes the incore security context managed by the security module * for this inode. Example usage: NFS client invokes this hook to initialize * the security context in its incore inode to the value provided by the server * for the file when the server returned the file's attributes to the client. * Must be called with inode->i_mutex locked.
* * Return: Returns 0 on success, error on failure. */ int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen) { return call_int_hook(inode_notifysecctx, inode, ctx, ctxlen); } EXPORT_SYMBOL(security_inode_notifysecctx); /** * security_inode_setsecctx() - Change the security label of an inode * @dentry: inode * @ctx: secctx * @ctxlen: length of secctx * * Change the security context of an inode. Updates the incore security * context managed by the security module and invokes the fs code as needed * (via __vfs_setxattr_noperm) to update any backing xattrs that represent the * context. Example usage: NFS server invokes this hook to change the security * context in its incore inode and on the backing filesystem to a value * provided by the client on a SETATTR operation. Must be called with * inode->i_mutex locked. * * Return: Returns 0 on success, error on failure. */ int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen) { return call_int_hook(inode_setsecctx, dentry, ctx, ctxlen); } EXPORT_SYMBOL(security_inode_setsecctx); /** * security_inode_getsecctx() - Get the security label of an inode * @inode: inode * @cp: security context * * On success, returns 0 and fills out @cp with the security context * for the given @inode. * * Return: Returns 0 on success, error on failure. */ int security_inode_getsecctx(struct inode *inode, struct lsm_context *cp) { memset(cp, 0, sizeof(*cp)); return call_int_hook(inode_getsecctx, inode, cp); } EXPORT_SYMBOL(security_inode_getsecctx); #ifdef CONFIG_WATCH_QUEUE /** * security_post_notification() - Check if a watch notification can be posted * @w_cred: credentials of the task that set the watch * @cred: credentials of the task which triggered the watch * @n: the notification * * Check to see if a watch notification can be posted to a particular queue. * * Return: Returns 0 if permission is granted. */ int security_post_notification(const struct cred *w_cred, const struct cred *cred, struct watch_notification *n) { return call_int_hook(post_notification, w_cred, cred, n); } #endif /* CONFIG_WATCH_QUEUE */ #ifdef CONFIG_KEY_NOTIFICATIONS /** * security_watch_key() - Check if a task is allowed to watch for key events * @key: the key to watch * * Check to see if a process is allowed to watch for event notifications from * a key or keyring. * * Return: Returns 0 if permission is granted. */ int security_watch_key(struct key *key) { return call_int_hook(watch_key, key); } #endif /* CONFIG_KEY_NOTIFICATIONS */ #ifdef CONFIG_SECURITY_NETWORK /** * security_netlink_send() - Save info and check if netlink sending is allowed * @sk: sending socket * @skb: netlink message * * Save security information for a netlink message so that permission checking * can be performed when the message is processed. The security information * can be saved using the eff_cap field of the netlink_skb_parms structure. * Also may be used to provide fine grained control over message transmission. * * Return: Returns 0 if the information was successfully saved and message is * allowed to be transmitted. */ int security_netlink_send(struct sock *sk, struct sk_buff *skb) { return call_int_hook(netlink_send, sk, skb); } /** * security_unix_stream_connect() - Check if a AF_UNIX stream is allowed * @sock: originating sock * @other: peer sock * @newsk: new sock * * Check permissions before establishing a Unix domain stream connection * between @sock and @other. 
* * The @unix_stream_connect and @unix_may_send hooks were necessary because * Linux provides an alternative to the conventional file name space for Unix * domain sockets. Whereas binding and connecting to sockets in the file name * space is mediated by the typical file permissions (and caught by the mknod * and permission hooks in inode_security_ops), binding and connecting to * sockets in the abstract name space is completely unmediated. Sufficient * control of Unix domain sockets in the abstract name space isn't possible * using only the socket layer hooks, since we need to know the actual target * socket, which is not looked up until we are inside the af_unix code. * * Return: Returns 0 if permission is granted. */ int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk) { return call_int_hook(unix_stream_connect, sock, other, newsk); } EXPORT_SYMBOL(security_unix_stream_connect); /** * security_unix_may_send() - Check if AF_UNIX socket can send datagrams * @sock: originating sock * @other: peer sock * * Check permissions before connecting or sending datagrams from @sock to * @other. * * The @unix_stream_connect and @unix_may_send hooks were necessary because * Linux provides an alternative to the conventional file name space for Unix * domain sockets. Whereas binding and connecting to sockets in the file name * space is mediated by the typical file permissions (and caught by the mknod * and permission hooks in inode_security_ops), binding and connecting to * sockets in the abstract name space is completely unmediated. Sufficient * control of Unix domain sockets in the abstract name space isn't possible * using only the socket layer hooks, since we need to know the actual target * socket, which is not looked up until we are inside the af_unix code. * * Return: Returns 0 if permission is granted. */ int security_unix_may_send(struct socket *sock, struct socket *other) { return call_int_hook(unix_may_send, sock, other); } EXPORT_SYMBOL(security_unix_may_send); /** * security_socket_create() - Check if creating a new socket is allowed * @family: protocol family * @type: communications type * @protocol: requested protocol * @kern: set to 1 if a kernel socket is requested * * Check permissions prior to creating a new socket. * * Return: Returns 0 if permission is granted. */ int security_socket_create(int family, int type, int protocol, int kern) { return call_int_hook(socket_create, family, type, protocol, kern); } /** * security_socket_post_create() - Initialize a newly created socket * @sock: socket * @family: protocol family * @type: communications type * @protocol: requested protocol * @kern: set to 1 if a kernel socket is requested * * This hook allows a module to update or allocate a per-socket security * structure. Note that the security field was not added directly to the socket * structure, but rather, the socket security information is stored in the * associated inode. Typically, the inode alloc_security hook will allocate * and attach security information to SOCK_INODE(sock)->i_security. This hook * may be used to update the SOCK_INODE(sock)->i_security field with additional * information that wasn't available when the inode was allocated. * * Return: Returns 0 if permission is granted. 
*/ int security_socket_post_create(struct socket *sock, int family, int type, int protocol, int kern) { return call_int_hook(socket_post_create, sock, family, type, protocol, kern); } /** * security_socket_socketpair() - Check if creating a socketpair is allowed * @socka: first socket * @sockb: second socket * * Check permissions before creating a fresh pair of sockets. * * Return: Returns 0 if permission is granted and the connection was * established. */ int security_socket_socketpair(struct socket *socka, struct socket *sockb) { return call_int_hook(socket_socketpair, socka, sockb); } EXPORT_SYMBOL(security_socket_socketpair); /** * security_socket_bind() - Check if a socket bind operation is allowed * @sock: socket * @address: requested bind address * @addrlen: length of address * * Check permission before socket protocol layer bind operation is performed * and the socket @sock is bound to the address specified in the @address * parameter. * * Return: Returns 0 if permission is granted. */ int security_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen) { return call_int_hook(socket_bind, sock, address, addrlen); } /** * security_socket_connect() - Check if a socket connect operation is allowed * @sock: socket * @address: address of remote connection point * @addrlen: length of address * * Check permission before socket protocol layer connect operation attempts to * connect socket @sock to a remote address, @address. * * Return: Returns 0 if permission is granted. */ int security_socket_connect(struct socket *sock, struct sockaddr *address, int addrlen) { return call_int_hook(socket_connect, sock, address, addrlen); } /** * security_socket_listen() - Check if a socket is allowed to listen * @sock: socket * @backlog: connection queue size * * Check permission before socket protocol layer listen operation. * * Return: Returns 0 if permission is granted. */ int security_socket_listen(struct socket *sock, int backlog) { return call_int_hook(socket_listen, sock, backlog); } /** * security_socket_accept() - Check if a socket is allowed to accept connections * @sock: listening socket * @newsock: newly created connection socket * * Check permission before accepting a new connection. Note that the new * socket, @newsock, has been created and some information copied to it, but * the accept operation has not actually been performed. * * Return: Returns 0 if permission is granted. */ int security_socket_accept(struct socket *sock, struct socket *newsock) { return call_int_hook(socket_accept, sock, newsock); } /** * security_socket_sendmsg() - Check if sending a message is allowed * @sock: sending socket * @msg: message to send * @size: size of message * * Check permission before transmitting a message to another socket. * * Return: Returns 0 if permission is granted. */ int security_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size) { return call_int_hook(socket_sendmsg, sock, msg, size); } /** * security_socket_recvmsg() - Check if receiving a message is allowed * @sock: receiving socket * @msg: message to receive * @size: size of message * @flags: operational flags * * Check permission before receiving a message from a socket. * * Return: Returns 0 if permission is granted.
*/ int security_socket_recvmsg(struct socket *sock, struct msghdr *msg, int size, int flags) { return call_int_hook(socket_recvmsg, sock, msg, size, flags); } /** * security_socket_getsockname() - Check if reading the socket addr is allowed * @sock: socket * * Check permission before reading the local address (name) of the socket * object. * * Return: Returns 0 if permission is granted. */ int security_socket_getsockname(struct socket *sock) { return call_int_hook(socket_getsockname, sock); } /** * security_socket_getpeername() - Check if reading the peer's addr is allowed * @sock: socket * * Check permission before reading the remote address (name) of a socket object. * * Return: Returns 0 if permission is granted. */ int security_socket_getpeername(struct socket *sock) { return call_int_hook(socket_getpeername, sock); } /** * security_socket_getsockopt() - Check if reading a socket option is allowed * @sock: socket * @level: option's protocol level * @optname: option name * * Check permissions before retrieving the options associated with socket * @sock. * * Return: Returns 0 if permission is granted. */ int security_socket_getsockopt(struct socket *sock, int level, int optname) { return call_int_hook(socket_getsockopt, sock, level, optname); } /** * security_socket_setsockopt() - Check if setting a socket option is allowed * @sock: socket * @level: option's protocol level * @optname: option name * * Check permissions before setting the options associated with socket @sock. * * Return: Returns 0 if permission is granted. */ int security_socket_setsockopt(struct socket *sock, int level, int optname) { return call_int_hook(socket_setsockopt, sock, level, optname); } /** * security_socket_shutdown() - Checks if shutting down the socket is allowed * @sock: socket * @how: flag indicating how sends and receives are handled * * Checks permission before all or part of a connection on the socket @sock is * shut down. * * Return: Returns 0 if permission is granted. */ int security_socket_shutdown(struct socket *sock, int how) { return call_int_hook(socket_shutdown, sock, how); } /** * security_sock_rcv_skb() - Check if an incoming network packet is allowed * @sk: destination sock * @skb: incoming packet * * Check permissions on incoming network packets. This hook is distinct from * Netfilter's IP input hooks since it is the first time that the incoming * sk_buff @skb has been associated with a particular socket, @sk. Must not * sleep inside this hook because some callers hold spinlocks. * * Return: Returns 0 if permission is granted. */ int security_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) { return call_int_hook(socket_sock_rcv_skb, sk, skb); } EXPORT_SYMBOL(security_sock_rcv_skb); /** * security_socket_getpeersec_stream() - Get the remote peer label * @sock: socket * @optval: destination buffer * @optlen: size of peer label copied into the buffer * @len: maximum size of the destination buffer * * This hook allows the security module to provide peer socket security state * for unix or connected tcp sockets to userspace via getsockopt SO_GETPEERSEC. * For tcp sockets this can be meaningful if the socket is associated with an * ipsec SA. * * Return: Returns 0 if all is well, otherwise, typical getsockopt return * values.
*/ int security_socket_getpeersec_stream(struct socket *sock, sockptr_t optval, sockptr_t optlen, unsigned int len) { return call_int_hook(socket_getpeersec_stream, sock, optval, optlen, len); } /** * security_socket_getpeersec_dgram() - Get the remote peer label * @sock: socket * @skb: datagram packet * @secid: remote peer label secid * * This hook allows the security module to provide peer socket security state * for udp sockets on a per-packet basis to userspace via getsockopt * SO_GETPEERSEC. The application must first have indicated the IP_PASSSEC * option via getsockopt. It can then retrieve the security state returned by * this hook for a packet via the SCM_SECURITY ancillary message type. * * Return: Returns 0 on success, error on failure. */ int security_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) { return call_int_hook(socket_getpeersec_dgram, sock, skb, secid); } EXPORT_SYMBOL(security_socket_getpeersec_dgram); /** * lsm_sock_alloc - allocate a composite sock blob * @sock: the sock that needs a blob * @gfp: allocation mode * * Allocate the sock blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_sock_alloc(struct sock *sock, gfp_t gfp) { return lsm_blob_alloc(&sock->sk_security, blob_sizes.lbs_sock, gfp); } /** * security_sk_alloc() - Allocate and initialize a sock's LSM blob * @sk: sock * @family: protocol family * @priority: gfp flags * * Allocate and attach a security structure to the sk->sk_security field, which * is used to copy security attributes between local stream sockets. * * Return: Returns 0 on success, error on failure. */ int security_sk_alloc(struct sock *sk, int family, gfp_t priority) { int rc = lsm_sock_alloc(sk, priority); if (unlikely(rc)) return rc; rc = call_int_hook(sk_alloc_security, sk, family, priority); if (unlikely(rc)) security_sk_free(sk); return rc; } /** * security_sk_free() - Free the sock's LSM blob * @sk: sock * * Deallocate security structure. */ void security_sk_free(struct sock *sk) { call_void_hook(sk_free_security, sk); kfree(sk->sk_security); sk->sk_security = NULL; } /** * security_sk_clone() - Clone a sock's LSM state * @sk: original sock * @newsk: target sock * * Clone/copy security structure. */ void security_sk_clone(const struct sock *sk, struct sock *newsk) { call_void_hook(sk_clone_security, sk, newsk); } EXPORT_SYMBOL(security_sk_clone); /** * security_sk_classify_flow() - Set a flow's secid based on socket * @sk: original socket * @flic: target flow * * Set the target flow's secid to socket's secid. */ void security_sk_classify_flow(const struct sock *sk, struct flowi_common *flic) { call_void_hook(sk_getsecid, sk, &flic->flowic_secid); } EXPORT_SYMBOL(security_sk_classify_flow); /** * security_req_classify_flow() - Set a flow's secid based on request_sock * @req: request_sock * @flic: target flow * * Sets @flic's secid to @req's secid. */ void security_req_classify_flow(const struct request_sock *req, struct flowi_common *flic) { call_void_hook(req_classify_flow, req, flic); } EXPORT_SYMBOL(security_req_classify_flow); /** * security_sock_graft() - Reconcile LSM state when grafting a sock on a socket * @sk: sock being grafted * @parent: target parent socket * * Sets @parent's inode secid to @sk's secid and update @sk with any necessary * LSM state from @parent. 
*/ void security_sock_graft(struct sock *sk, struct socket *parent) { call_void_hook(sock_graft, sk, parent); } EXPORT_SYMBOL(security_sock_graft); /** * security_inet_conn_request() - Set request_sock state using incoming connect * @sk: parent listening sock * @skb: incoming connection * @req: new request_sock * * Initialize the @req LSM state based on @sk and the incoming connect in @skb. * * Return: Returns 0 if permission is granted. */ int security_inet_conn_request(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { return call_int_hook(inet_conn_request, sk, skb, req); } EXPORT_SYMBOL(security_inet_conn_request); /** * security_inet_csk_clone() - Set new sock LSM state based on request_sock * @newsk: new sock * @req: connection request_sock * * Set that LSM state of @sock using the LSM state from @req. */ void security_inet_csk_clone(struct sock *newsk, const struct request_sock *req) { call_void_hook(inet_csk_clone, newsk, req); } /** * security_inet_conn_established() - Update sock's LSM state with connection * @sk: sock * @skb: connection packet * * Update @sock's LSM state to represent a new connection from @skb. */ void security_inet_conn_established(struct sock *sk, struct sk_buff *skb) { call_void_hook(inet_conn_established, sk, skb); } EXPORT_SYMBOL(security_inet_conn_established); /** * security_secmark_relabel_packet() - Check if setting a secmark is allowed * @secid: new secmark value * * Check if the process should be allowed to relabel packets to @secid. * * Return: Returns 0 if permission is granted. */ int security_secmark_relabel_packet(u32 secid) { return call_int_hook(secmark_relabel_packet, secid); } EXPORT_SYMBOL(security_secmark_relabel_packet); /** * security_secmark_refcount_inc() - Increment the secmark labeling rule count * * Tells the LSM to increment the number of secmark labeling rules loaded. */ void security_secmark_refcount_inc(void) { call_void_hook(secmark_refcount_inc); } EXPORT_SYMBOL(security_secmark_refcount_inc); /** * security_secmark_refcount_dec() - Decrement the secmark labeling rule count * * Tells the LSM to decrement the number of secmark labeling rules loaded. */ void security_secmark_refcount_dec(void) { call_void_hook(secmark_refcount_dec); } EXPORT_SYMBOL(security_secmark_refcount_dec); /** * security_tun_dev_alloc_security() - Allocate a LSM blob for a TUN device * @security: pointer to the LSM blob * * This hook allows a module to allocate a security structure for a TUN device, * returning the pointer in @security. * * Return: Returns a zero on success, negative values on failure. */ int security_tun_dev_alloc_security(void **security) { int rc; rc = lsm_blob_alloc(security, blob_sizes.lbs_tun_dev, GFP_KERNEL); if (rc) return rc; rc = call_int_hook(tun_dev_alloc_security, *security); if (rc) { kfree(*security); *security = NULL; } return rc; } EXPORT_SYMBOL(security_tun_dev_alloc_security); /** * security_tun_dev_free_security() - Free a TUN device LSM blob * @security: LSM blob * * This hook allows a module to free the security structure for a TUN device. */ void security_tun_dev_free_security(void *security) { kfree(security); } EXPORT_SYMBOL(security_tun_dev_free_security); /** * security_tun_dev_create() - Check if creating a TUN device is allowed * * Check permissions prior to creating a new TUN device. * * Return: Returns 0 if permission is granted. 
*/ int security_tun_dev_create(void) { return call_int_hook(tun_dev_create); } EXPORT_SYMBOL(security_tun_dev_create); /** * security_tun_dev_attach_queue() - Check if attaching a TUN queue is allowed * @security: TUN device LSM blob * * Check permissions prior to attaching to a TUN device queue. * * Return: Returns 0 if permission is granted. */ int security_tun_dev_attach_queue(void *security) { return call_int_hook(tun_dev_attach_queue, security); } EXPORT_SYMBOL(security_tun_dev_attach_queue); /** * security_tun_dev_attach() - Update TUN device LSM state on attach * @sk: associated sock * @security: TUN device LSM blob * * This hook can be used by the module to update any security state associated * with the TUN device's sock structure. * * Return: Returns 0 if permission is granted. */ int security_tun_dev_attach(struct sock *sk, void *security) { return call_int_hook(tun_dev_attach, sk, security); } EXPORT_SYMBOL(security_tun_dev_attach); /** * security_tun_dev_open() - Update TUN device LSM state on open * @security: TUN device LSM blob * * This hook can be used by the module to update any security state associated * with the TUN device's security structure. * * Return: Returns 0 if permission is granted. */ int security_tun_dev_open(void *security) { return call_int_hook(tun_dev_open, security); } EXPORT_SYMBOL(security_tun_dev_open); /** * security_sctp_assoc_request() - Update the LSM on a SCTP association req * @asoc: SCTP association * @skb: packet requesting the association * * Passes the @asoc and @chunk->skb of the association INIT packet to the LSM. * * Return: Returns 0 on success, error on failure. */ int security_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb) { return call_int_hook(sctp_assoc_request, asoc, skb); } EXPORT_SYMBOL(security_sctp_assoc_request); /** * security_sctp_bind_connect() - Validate a list of addrs for a SCTP option * @sk: socket * @optname: SCTP option to validate * @address: list of IP addresses to validate * @addrlen: length of the address list * * Validate permissions required for each address associated with sock @sk. * Depending on @optname, the addresses will be treated as either a connect or * bind service. The @addrlen is calculated on each IPv4 and IPv6 address using * sizeof(struct sockaddr_in) or sizeof(struct sockaddr_in6). * * Return: Returns 0 on success, error on failure. */ int security_sctp_bind_connect(struct sock *sk, int optname, struct sockaddr *address, int addrlen) { return call_int_hook(sctp_bind_connect, sk, optname, address, addrlen); } EXPORT_SYMBOL(security_sctp_bind_connect); /** * security_sctp_sk_clone() - Clone a SCTP sock's LSM state * @asoc: SCTP association * @sk: original sock * @newsk: target sock * * Called whenever a new socket is created by accept(2) (i.e. a TCP style * socket) or when a socket is 'peeled off', e.g. userspace calls * sctp_peeloff(3). */ void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk) { call_void_hook(sctp_sk_clone, asoc, sk, newsk); } EXPORT_SYMBOL(security_sctp_sk_clone); /** * security_sctp_assoc_established() - Update LSM state when assoc established * @asoc: SCTP association * @skb: packet establishing the association * * Passes the @asoc and @chunk->skb of the association COOKIE_ACK packet to the * security module. * * Return: Returns 0 if permission is granted.
*/ int security_sctp_assoc_established(struct sctp_association *asoc, struct sk_buff *skb) { return call_int_hook(sctp_assoc_established, asoc, skb); } EXPORT_SYMBOL(security_sctp_assoc_established); /** * security_mptcp_add_subflow() - Inherit the LSM label from the MPTCP socket * @sk: the owning MPTCP socket * @ssk: the new subflow * * Update the labeling for the given MPTCP subflow, to match that of the * owning MPTCP socket. This hook has to be called after the socket creation and * initialization via the security_socket_create() and * security_socket_post_create() LSM hooks. * * Return: Returns 0 on success or a negative error code on failure. */ int security_mptcp_add_subflow(struct sock *sk, struct sock *ssk) { return call_int_hook(mptcp_add_subflow, sk, ssk); } #endif /* CONFIG_SECURITY_NETWORK */ #if defined(CONFIG_SECURITY_NETWORK) && defined(CONFIG_SECURITY_PATH) /** * security_unix_find() - Check if a named AF_UNIX socket can connect * @path: path of the socket being connected to * @other: peer sock * @flags: flags associated with the socket * * This hook is called to check permissions before connecting to a named * AF_UNIX socket. The caller does not hold any locks on @other. * * Return: Returns 0 if permission is granted. */ int security_unix_find(const struct path *path, struct sock *other, int flags) { return call_int_hook(unix_find, path, other, flags); } EXPORT_SYMBOL(security_unix_find); #endif /* CONFIG_SECURITY_NETWORK && CONFIG_SECURITY_PATH */ #ifdef CONFIG_SECURITY_INFINIBAND /** * security_ib_pkey_access() - Check if access to an IB pkey is allowed * @sec: LSM blob * @subnet_prefix: subnet prefix of the port * @pkey: IB pkey * * Check permission to access a pkey when modifying a QP. * * Return: Returns 0 if permission is granted. */ int security_ib_pkey_access(void *sec, u64 subnet_prefix, u16 pkey) { return call_int_hook(ib_pkey_access, sec, subnet_prefix, pkey); } EXPORT_SYMBOL(security_ib_pkey_access); /** * security_ib_endport_manage_subnet() - Check if SMPs traffic is allowed * @sec: LSM blob * @dev_name: IB device name * @port_num: port number * * Check permissions to send and receive SMPs on an end port. * * Return: Returns 0 if permission is granted. */ int security_ib_endport_manage_subnet(void *sec, const char *dev_name, u8 port_num) { return call_int_hook(ib_endport_manage_subnet, sec, dev_name, port_num); } EXPORT_SYMBOL(security_ib_endport_manage_subnet); /** * security_ib_alloc_security() - Allocate an Infiniband LSM blob * @sec: LSM blob * * Allocate a security structure for Infiniband objects. * * Return: Returns 0 on success, non-zero on failure. */ int security_ib_alloc_security(void **sec) { int rc; rc = lsm_blob_alloc(sec, blob_sizes.lbs_ib, GFP_KERNEL); if (rc) return rc; rc = call_int_hook(ib_alloc_security, *sec); if (rc) { kfree(*sec); *sec = NULL; } return rc; } EXPORT_SYMBOL(security_ib_alloc_security); /** * security_ib_free_security() - Free an Infiniband LSM blob * @sec: LSM blob * * Deallocate an Infiniband security structure. */ void security_ib_free_security(void *sec) { kfree(sec); } EXPORT_SYMBOL(security_ib_free_security); #endif /* CONFIG_SECURITY_INFINIBAND */ #ifdef CONFIG_SECURITY_NETWORK_XFRM /** * security_xfrm_policy_alloc() - Allocate a xfrm policy LSM blob * @ctxp: xfrm security context being added to the SPD * @sec_ctx: security label provided by userspace * @gfp: gfp flags * * Allocate a security structure to the xp->security field; the security field * is initialized to NULL when the xfrm_policy is allocated.
* * Return: Return 0 if operation was successful. */ int security_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp, struct xfrm_user_sec_ctx *sec_ctx, gfp_t gfp) { return call_int_hook(xfrm_policy_alloc_security, ctxp, sec_ctx, gfp); } EXPORT_SYMBOL(security_xfrm_policy_alloc); /** * security_xfrm_policy_clone() - Clone xfrm policy LSM state * @old_ctx: xfrm security context * @new_ctxp: target xfrm security context * * Allocate a security structure in new_ctxp that contains the information from * the old_ctx structure. * * Return: Return 0 if operation was successful. */ int security_xfrm_policy_clone(struct xfrm_sec_ctx *old_ctx, struct xfrm_sec_ctx **new_ctxp) { return call_int_hook(xfrm_policy_clone_security, old_ctx, new_ctxp); } /** * security_xfrm_policy_free() - Free a xfrm security context * @ctx: xfrm security context * * Free LSM resources associated with @ctx. */ void security_xfrm_policy_free(struct xfrm_sec_ctx *ctx) { call_void_hook(xfrm_policy_free_security, ctx); } EXPORT_SYMBOL(security_xfrm_policy_free); /** * security_xfrm_policy_delete() - Check if deleting a xfrm policy is allowed * @ctx: xfrm security context * * Authorize deletion of a SPD entry. * * Return: Returns 0 if permission is granted. */ int security_xfrm_policy_delete(struct xfrm_sec_ctx *ctx) { return call_int_hook(xfrm_policy_delete_security, ctx); } /** * security_xfrm_state_alloc() - Allocate a xfrm state LSM blob * @x: xfrm state being added to the SAD * @sec_ctx: security label provided by userspace * * Allocate a security structure to the @x->security field; the security field * is initialized to NULL when the xfrm_state is allocated. Set the context to * correspond to @sec_ctx. * * Return: Return 0 if operation was successful. */ int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx) { return call_int_hook(xfrm_state_alloc, x, sec_ctx); } EXPORT_SYMBOL(security_xfrm_state_alloc); /** * security_xfrm_state_alloc_acquire() - Allocate a xfrm state LSM blob * @x: xfrm state being added to the SAD * @polsec: associated policy's security context * @secid: secid from the flow * * Allocate a security structure to the x->security field; the security field * is initialized to NULL when the xfrm_state is allocated. Set the context to * correspond to secid. * * Return: Returns 0 if operation was successful. */ int security_xfrm_state_alloc_acquire(struct xfrm_state *x, struct xfrm_sec_ctx *polsec, u32 secid) { return call_int_hook(xfrm_state_alloc_acquire, x, polsec, secid); } /** * security_xfrm_state_delete() - Check if deleting a xfrm state is allowed * @x: xfrm state * * Authorize deletion of x->security. * * Return: Returns 0 if permission is granted. */ int security_xfrm_state_delete(struct xfrm_state *x) { return call_int_hook(xfrm_state_delete_security, x); } EXPORT_SYMBOL(security_xfrm_state_delete); /** * security_xfrm_state_free() - Free a xfrm state * @x: xfrm state * * Deallocate x->security. */ void security_xfrm_state_free(struct xfrm_state *x) { call_void_hook(xfrm_state_free_security, x); } /** * security_xfrm_policy_lookup() - Check if using a xfrm policy is allowed * @ctx: target xfrm security context * @fl_secid: flow secid used to authorize access * * Check permission when a flow selects a xfrm_policy for processing XFRMs on a * packet. The hook is called when selecting either a per-socket policy or a * generic xfrm policy. * * Return: Return 0 if permission is granted, -ESRCH otherwise, or -errno on * other errors. 
*/ int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid) { return call_int_hook(xfrm_policy_lookup, ctx, fl_secid); } /** * security_xfrm_state_pol_flow_match() - Check for a xfrm match * @x: xfrm state to match * @xp: xfrm policy to check for a match * @flic: flow to check for a match. * * Check @xp and @flic for a match with @x. * * Return: Returns 1 if there is a match. */ int security_xfrm_state_pol_flow_match(struct xfrm_state *x, struct xfrm_policy *xp, const struct flowi_common *flic) { struct lsm_static_call *scall; int rc = LSM_RET_DEFAULT(xfrm_state_pol_flow_match); /* * Since this function is expected to return 0 or 1, the judgment * becomes difficult if multiple LSMs supply this call. Fortunately, * we can use the first LSM's judgment because currently only SELinux * supplies this call. * * For speed optimization, we explicitly break the loop rather than * using the macro */ lsm_for_each_hook(scall, xfrm_state_pol_flow_match) { rc = scall->hl->hook.xfrm_state_pol_flow_match(x, xp, flic); break; } return rc; } /** * security_xfrm_decode_session() - Determine the xfrm secid for a packet * @skb: xfrm packet * @secid: secid * * Decode the packet in @skb and return the security label in @secid. * * Return: Return 0 if all xfrms used have the same secid. */ int security_xfrm_decode_session(struct sk_buff *skb, u32 *secid) { return call_int_hook(xfrm_decode_session, skb, secid, 1); } void security_skb_classify_flow(struct sk_buff *skb, struct flowi_common *flic) { int rc = call_int_hook(xfrm_decode_session, skb, &flic->flowic_secid, 0); BUG_ON(rc); } EXPORT_SYMBOL(security_skb_classify_flow); #endif /* CONFIG_SECURITY_NETWORK_XFRM */ #ifdef CONFIG_KEYS /** * security_key_alloc() - Allocate and initialize a kernel key LSM blob * @key: key * @cred: credentials * @flags: allocation flags * * Permit allocation of a key and assign security data. Note that key does not * have a serial number assigned at this point. * * Return: Return 0 if permission is granted, -ve error otherwise. */ int security_key_alloc(struct key *key, const struct cred *cred, unsigned long flags) { int rc = lsm_key_alloc(key); if (unlikely(rc)) return rc; rc = call_int_hook(key_alloc, key, cred, flags); if (unlikely(rc)) security_key_free(key); return rc; } /** * security_key_free() - Free a kernel key LSM blob * @key: key * * Notification of destruction; free security data. */ void security_key_free(struct key *key) { kfree(key->security); key->security = NULL; } /** * security_key_permission() - Check if a kernel key operation is allowed * @key_ref: key reference * @cred: credentials of actor requesting access * @need_perm: requested permissions * * See whether a specific operational right is granted to a process on a key. * * Return: Return 0 if permission is granted, -ve error otherwise. */ int security_key_permission(key_ref_t key_ref, const struct cred *cred, enum key_need_perm need_perm) { return call_int_hook(key_permission, key_ref, cred, need_perm); } /** * security_key_getsecurity() - Get the key's security label * @key: key * @buffer: security label buffer * * Get a textual representation of the security context attached to a key for * the purposes of honouring KEYCTL_GETSECURITY. This function allocates the * storage for the NUL-terminated string and the caller should free it. * * Return: Returns the length of @buffer (including terminating NUL) or -ve if * an error occurs. May also return 0 (and a NULL buffer pointer) if * there is no security label assigned to the key. 
*/ int security_key_getsecurity(struct key *key, char **buffer) { *buffer = NULL; return call_int_hook(key_getsecurity, key, buffer); } /** * security_key_post_create_or_update() - Notification of key create or update * @keyring: keyring to which the key is linked * @key: created or updated key * @payload: data used to instantiate or update the key * @payload_len: length of payload * @flags: key flags * @create: flag indicating whether the key was created or updated * * Notify the caller of a key creation or update. */ void security_key_post_create_or_update(struct key *keyring, struct key *key, const void *payload, size_t payload_len, unsigned long flags, bool create) { call_void_hook(key_post_create_or_update, keyring, key, payload, payload_len, flags, create); } #endif /* CONFIG_KEYS */ #ifdef CONFIG_AUDIT /** * security_audit_rule_init() - Allocate and init an LSM audit rule struct * @field: audit action * @op: rule operator * @rulestr: rule context * @lsmrule: receive buffer for audit rule struct * @gfp: GFP flag used for kmalloc * * Allocate and initialize an LSM audit rule structure. * * Return: Return 0 if @lsmrule has been successfully set, -EINVAL in case of * an invalid rule. */ int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule, gfp_t gfp) { return call_int_hook(audit_rule_init, field, op, rulestr, lsmrule, gfp); } /** * security_audit_rule_known() - Check if an audit rule contains LSM fields * @krule: audit rule * * Specifies whether given @krule contains any fields related to the current * LSM. * * Return: Returns 1 in case of relation found, 0 otherwise. */ int security_audit_rule_known(struct audit_krule *krule) { return call_int_hook(audit_rule_known, krule); } /** * security_audit_rule_free() - Free an LSM audit rule struct * @lsmrule: audit rule struct * * Deallocate the LSM audit rule structure previously allocated by * audit_rule_init(). */ void security_audit_rule_free(void *lsmrule) { call_void_hook(audit_rule_free, lsmrule); } /** * security_audit_rule_match() - Check if a label matches an audit rule * @prop: security label * @field: LSM audit field * @op: matching operator * @lsmrule: audit rule * * Determine if given @prop matches a rule previously approved by * security_audit_rule_known(). * * Return: Returns 1 if secid matches the rule, 0 if it does not, -ERRNO on * failure. */ int security_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op, void *lsmrule) { return call_int_hook(audit_rule_match, prop, field, op, lsmrule); } #endif /* CONFIG_AUDIT */ #ifdef CONFIG_BPF_SYSCALL /** * security_bpf() - Check if the bpf syscall operation is allowed * @cmd: command * @attr: bpf attribute * @size: size * @kernel: whether or not call originated from kernel * * Do an initial check for all bpf syscalls after the attribute is copied into * the kernel. The actual security module can implement its own rules to * check the specific cmd it needs. * * Return: Returns 0 if permission is granted. */ int security_bpf(int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { return call_int_hook(bpf, cmd, attr, size, kernel); } /** * security_bpf_map() - Check if access to a bpf map is allowed * @map: bpf map * @fmode: mode * * Do a check when the kernel generates and returns a file descriptor for eBPF * maps. * * Return: Returns 0 if permission is granted.
*/ int security_bpf_map(struct bpf_map *map, fmode_t fmode) { return call_int_hook(bpf_map, map, fmode); } /** * security_bpf_prog() - Check if access to a bpf program is allowed * @prog: bpf program * * Do a check when the kernel generates and returns a file descriptor for eBPF * programs. * * Return: Returns 0 if permission is granted. */ int security_bpf_prog(struct bpf_prog *prog) { return call_int_hook(bpf_prog, prog); } /** * security_bpf_map_create() - Check if BPF map creation is allowed * @map: BPF map object * @attr: BPF syscall attributes used to create BPF map * @token: BPF token used to grant user access * @kernel: whether or not call originated from kernel * * Do a check when the kernel creates a new BPF map. This is also the * point where LSM blob is allocated for LSMs that need them. * * Return: Returns 0 on success, error on failure. */ int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, struct bpf_token *token, bool kernel) { int rc; rc = lsm_bpf_map_alloc(map); if (unlikely(rc)) return rc; rc = call_int_hook(bpf_map_create, map, attr, token, kernel); if (unlikely(rc)) security_bpf_map_free(map); return rc; } /** * security_bpf_prog_load() - Check if loading of BPF program is allowed * @prog: BPF program object * @attr: BPF syscall attributes used to create BPF program * @token: BPF token used to grant user access to BPF subsystem * @kernel: whether or not call originated from kernel * * Perform an access control check when the kernel loads a BPF program and * allocates associated BPF program object. This hook is also responsible for * allocating any required LSM state for the BPF program. * * Return: Returns 0 on success, error on failure. */ int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token, bool kernel) { int rc; rc = lsm_bpf_prog_alloc(prog); if (unlikely(rc)) return rc; rc = call_int_hook(bpf_prog_load, prog, attr, token, kernel); if (unlikely(rc)) security_bpf_prog_free(prog); return rc; } /** * security_bpf_token_create() - Check if creating of BPF token is allowed * @token: BPF token object * @attr: BPF syscall attributes used to create BPF token * @path: path pointing to BPF FS mount point from which BPF token is created * * Do a check when the kernel instantiates a new BPF token object from BPF FS * instance. This is also the point where LSM blob can be allocated for LSMs. * * Return: Returns 0 on success, error on failure. */ int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, const struct path *path) { int rc; rc = lsm_bpf_token_alloc(token); if (unlikely(rc)) return rc; rc = call_int_hook(bpf_token_create, token, attr, path); if (unlikely(rc)) security_bpf_token_free(token); return rc; } /** * security_bpf_token_cmd() - Check if BPF token is allowed to delegate * requested BPF syscall command * @token: BPF token object * @cmd: BPF syscall command requested to be delegated by BPF token * * Do a check when the kernel decides whether provided BPF token should allow * delegation of requested BPF syscall command. * * Return: Returns 0 on success, error on failure. 
*/ int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd) { return call_int_hook(bpf_token_cmd, token, cmd); } /** * security_bpf_token_capable() - Check if BPF token is allowed to delegate * requested BPF-related capability * @token: BPF token object * @cap: capabilities requested to be delegated by BPF token * * Do a check when the kernel decides whether provided BPF token should allow * delegation of requested BPF-related capabilities. * * Return: Returns 0 on success, error on failure. */ int security_bpf_token_capable(const struct bpf_token *token, int cap) { return call_int_hook(bpf_token_capable, token, cap); } /** * security_bpf_map_free() - Free a bpf map's LSM blob * @map: bpf map * * Clean up the security information stored inside bpf map. */ void security_bpf_map_free(struct bpf_map *map) { call_void_hook(bpf_map_free, map); kfree(map->security); map->security = NULL; } /** * security_bpf_prog_free() - Free a BPF program's LSM blob * @prog: BPF program struct * * Clean up the security information stored inside BPF program. */ void security_bpf_prog_free(struct bpf_prog *prog) { call_void_hook(bpf_prog_free, prog); kfree(prog->aux->security); prog->aux->security = NULL; } /** * security_bpf_token_free() - Free a BPF token's LSM blob * @token: BPF token struct * * Clean up the security information stored inside BPF token. */ void security_bpf_token_free(struct bpf_token *token) { call_void_hook(bpf_token_free, token); kfree(token->security); token->security = NULL; } #endif /* CONFIG_BPF_SYSCALL */ /** * security_locked_down() - Check if a kernel feature is allowed * @what: requested kernel feature * * Determine whether a kernel feature that potentially enables arbitrary code * execution in kernel space should be permitted. * * Return: Returns 0 if permission is granted. */ int security_locked_down(enum lockdown_reason what) { return call_int_hook(locked_down, what); } EXPORT_SYMBOL(security_locked_down); /** * security_bdev_alloc() - Allocate a block device LSM blob * @bdev: block device * * Allocate and attach a security structure to @bdev->bd_security. The * security field is initialized to NULL when the bdev structure is * allocated. * * Return: Return 0 if operation was successful. */ int security_bdev_alloc(struct block_device *bdev) { int rc = 0; rc = lsm_bdev_alloc(bdev); if (unlikely(rc)) return rc; rc = call_int_hook(bdev_alloc_security, bdev); if (unlikely(rc)) security_bdev_free(bdev); return rc; } EXPORT_SYMBOL(security_bdev_alloc); /** * security_bdev_free() - Free a block device's LSM blob * @bdev: block device * * Deallocate the bdev security structure and set @bdev->bd_security to NULL. */ void security_bdev_free(struct block_device *bdev) { if (!bdev->bd_security) return; call_void_hook(bdev_free_security, bdev); kfree(bdev->bd_security); bdev->bd_security = NULL; } EXPORT_SYMBOL(security_bdev_free); /** * security_bdev_setintegrity() - Set the device's integrity data * @bdev: block device * @type: type of integrity, e.g. hash digest, signature, etc * @value: the integrity value * @size: size of the integrity value * * Register a verified integrity measurement of a bdev with LSMs. * LSMs should free the previously saved data if @value is NULL. * Please note that the new hook should be invoked every time the security * information is updated to keep these data current. 
For example, in dm-verity, * if the mapping table is reloaded and configured to use a different dm-verity * target with a new roothash and signing information, the previously stored * data in the LSM blob will become obsolete. It is crucial to re-invoke the * hook to refresh these data and ensure they are up to date. This necessity * arises from the design of device-mapper, where a device-mapper device is * first created, and then targets are subsequently loaded into it. These * targets can be modified multiple times during the device's lifetime. * Therefore, while the LSM blob is allocated during the creation of the block * device, its actual contents are not initialized at this stage and can change * substantially over time. This includes alterations from data that the LSMs * 'trust' to those they do not, making it essential to handle these changes * correctly. Failure to address this dynamic aspect could potentially allow * for bypassing LSM checks. * * Return: Returns 0 on success, negative values on failure. */ int security_bdev_setintegrity(struct block_device *bdev, enum lsm_integrity_type type, const void *value, size_t size) { return call_int_hook(bdev_setintegrity, bdev, type, value, size); } EXPORT_SYMBOL(security_bdev_setintegrity); #ifdef CONFIG_PERF_EVENTS /** * security_perf_event_open() - Check if a perf event open is allowed * @type: type of event * * Check whether the @type of perf_event_open syscall is allowed. * * Return: Returns 0 if permission is granted. */ int security_perf_event_open(int type) { return call_int_hook(perf_event_open, type); } /** * security_perf_event_alloc() - Allocate a perf event LSM blob * @event: perf event * * Allocate and save perf_event security info. * * Return: Returns 0 on success, error on failure. */ int security_perf_event_alloc(struct perf_event *event) { int rc; rc = lsm_blob_alloc(&event->security, blob_sizes.lbs_perf_event, GFP_KERNEL); if (rc) return rc; rc = call_int_hook(perf_event_alloc, event); if (rc) { kfree(event->security); event->security = NULL; } return rc; } /** * security_perf_event_free() - Free a perf event LSM blob * @event: perf event * * Release (free) perf_event security info. */ void security_perf_event_free(struct perf_event *event) { kfree(event->security); event->security = NULL; } /** * security_perf_event_read() - Check if reading a perf event label is allowed * @event: perf event * * Read perf_event security info if allowed. * * Return: Returns 0 if permission is granted. */ int security_perf_event_read(struct perf_event *event) { return call_int_hook(perf_event_read, event); } /** * security_perf_event_write() - Check if writing a perf event label is allowed * @event: perf event * * Write perf_event security info if allowed. * * Return: Returns 0 if permission is granted. */ int security_perf_event_write(struct perf_event *event) { return call_int_hook(perf_event_write, event); } #endif /* CONFIG_PERF_EVENTS */ #ifdef CONFIG_IO_URING /** * security_uring_override_creds() - Check if overriding creds is allowed * @new: new credentials * * Check if the current task, executing an io_uring operation, is allowed to * override its credentials with @new. * * Return: Returns 0 if permission is granted. */ int security_uring_override_creds(const struct cred *new) { return call_int_hook(uring_override_creds, new); } /** * security_uring_sqpoll() - Check if IORING_SETUP_SQPOLL is allowed * * Check whether the current task is allowed to spawn an io_uring polling thread * (IORING_SETUP_SQPOLL).
* * Return: Returns 0 if permission is granted. */ int security_uring_sqpoll(void) { return call_int_hook(uring_sqpoll); } /** * security_uring_cmd() - Check if an io_uring passthrough command is allowed * @ioucmd: command * * Check whether the file_operations uring_cmd is allowed to run. * * Return: Returns 0 if permission is granted. */ int security_uring_cmd(struct io_uring_cmd *ioucmd) { return call_int_hook(uring_cmd, ioucmd); } /** * security_uring_allowed() - Check if io_uring_setup() is allowed * * Check whether the current task is allowed to call io_uring_setup(). * * Return: Returns 0 if permission is granted. */ int security_uring_allowed(void) { return call_int_hook(uring_allowed); } #endif /* CONFIG_IO_URING */ /** * security_initramfs_populated() - Notify LSMs that initramfs has been loaded * * Tells the LSMs the initramfs has been unpacked into the rootfs. */ void security_initramfs_populated(void) { call_void_hook(initramfs_populated); }
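/*
 * Editorial note: the short sketch below is illustrative only and is not part
 * of security.c. It shows the typical way a caller uses the
 * security_locked_down() hook documented above: ask the lockdown LSM whether a
 * feature may be exposed and bail out on refusal. The function name
 * example_open_raw_window() is hypothetical; LOCKDOWN_DEV_MEM is one of the
 * enum lockdown_reason values a real caller might pass.
 */
#if 0	/* illustrative sketch, not compiled */
static int example_open_raw_window(void)
{
	int ret;

	/* refuse raw device-memory style access while the kernel is locked down */
	ret = security_locked_down(LOCKDOWN_DEV_MEM);
	if (ret)
		return ret;

	/* ... otherwise it is safe to go ahead and expose the resource ... */
	return 0;
}
#endif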
// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/workqueue.h>
#include <linux/rtnetlink.h>
#include <linux/cache.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/delay.h>
#include <linux/sched.h>
#include <linux/idr.h>
#include <linux/rculist.h>
#include <linux/nsproxy.h>
#include <linux/fs.h>
#include <linux/proc_ns.h>
#include <linux/file.h>
#include
<linux/export.h> #include <linux/user_namespace.h> #include <linux/net_namespace.h> #include <linux/sched/task.h> #include <linux/uidgid.h> #include <linux/proc_fs.h> #include <linux/nstree.h> #include <net/aligned_data.h> #include <net/sock.h> #include <net/netlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> /* * Our network namespace constructor/destructor lists */ static LIST_HEAD(pernet_list); static struct list_head *first_device = &pernet_list; LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); /* Protects net_namespace_list. Nests iside rtnl_lock() */ DECLARE_RWSEM(net_rwsem); EXPORT_SYMBOL_GPL(net_rwsem); #ifdef CONFIG_KEYS static struct key_tag init_net_key_domain = { .usage = REFCOUNT_INIT(1) }; #endif struct net init_net; EXPORT_SYMBOL(init_net); static bool init_net_initialized; /* * pernet_ops_rwsem: protects: pernet_list, net_generic_ids, * init_net_initialized and first_device pointer. * This is internal net namespace object. Please, don't use it * outside. */ DECLARE_RWSEM(pernet_ops_rwsem); #define MIN_PERNET_OPS_ID \ ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *)) #define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; static struct net_generic *net_alloc_generic(void) { unsigned int gen_ptrs = READ_ONCE(max_gen_ptrs); unsigned int generic_size; struct net_generic *ng; generic_size = offsetof(struct net_generic, ptr[gen_ptrs]); ng = kzalloc(generic_size, GFP_KERNEL); if (ng) ng->s.len = gen_ptrs; return ng; } static int net_assign_generic(struct net *net, unsigned int id, void *data) { struct net_generic *ng, *old_ng; BUG_ON(id < MIN_PERNET_OPS_ID); old_ng = rcu_dereference_protected(net->gen, lockdep_is_held(&pernet_ops_rwsem)); if (old_ng->s.len > id) { old_ng->ptr[id] = data; return 0; } ng = net_alloc_generic(); if (!ng) return -ENOMEM; /* * Some synchronisation notes: * * The net_generic explores the net->gen array inside rcu * read section. Besides once set the net->gen->ptr[x] * pointer never changes (see rules in netns/generic.h). * * That said, we simply duplicate this array and schedule * the old copy for kfree after a grace period. 
*/ memcpy(&ng->ptr[MIN_PERNET_OPS_ID], &old_ng->ptr[MIN_PERNET_OPS_ID], (old_ng->s.len - MIN_PERNET_OPS_ID) * sizeof(void *)); ng->ptr[id] = data; rcu_assign_pointer(net->gen, ng); kfree_rcu(old_ng, s.rcu); return 0; } static int ops_init(const struct pernet_operations *ops, struct net *net) { struct net_generic *ng; int err = -ENOMEM; void *data = NULL; if (ops->id) { data = kzalloc(ops->size, GFP_KERNEL); if (!data) goto out; err = net_assign_generic(net, *ops->id, data); if (err) goto cleanup; } err = 0; if (ops->init) err = ops->init(net); if (!err) return 0; if (ops->id) { ng = rcu_dereference_protected(net->gen, lockdep_is_held(&pernet_ops_rwsem)); ng->ptr[*ops->id] = NULL; } cleanup: kfree(data); out: return err; } static void ops_pre_exit_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { struct net *net; if (ops->pre_exit) { list_for_each_entry(net, net_exit_list, exit_list) ops->pre_exit(net); } } static void ops_exit_rtnl_list(const struct list_head *ops_list, const struct pernet_operations *ops, struct list_head *net_exit_list) { const struct pernet_operations *saved_ops = ops; LIST_HEAD(dev_kill_list); struct net *net; rtnl_lock(); list_for_each_entry(net, net_exit_list, exit_list) { __rtnl_net_lock(net); ops = saved_ops; list_for_each_entry_continue_reverse(ops, ops_list, list) { if (ops->exit_rtnl) ops->exit_rtnl(net, &dev_kill_list); } __rtnl_net_unlock(net); } unregister_netdevice_many(&dev_kill_list); rtnl_unlock(); } static void ops_exit_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { if (ops->exit) { struct net *net; list_for_each_entry(net, net_exit_list, exit_list) { ops->exit(net); cond_resched(); } } if (ops->exit_batch) ops->exit_batch(net_exit_list); } static void ops_free_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { struct net *net; if (ops->id) { list_for_each_entry(net, net_exit_list, exit_list) kfree(net_generic(net, *ops->id)); } } static void ops_undo_list(const struct list_head *ops_list, const struct pernet_operations *ops, struct list_head *net_exit_list, bool expedite_rcu) { const struct pernet_operations *saved_ops; bool hold_rtnl = false; if (!ops) ops = list_entry(ops_list, typeof(*ops), list); saved_ops = ops; list_for_each_entry_continue_reverse(ops, ops_list, list) { hold_rtnl |= !!ops->exit_rtnl; ops_pre_exit_list(ops, net_exit_list); } /* Another CPU might be rcu-iterating the list, wait for it. * This needs to be before calling the exit() notifiers, so the * rcu_barrier() after ops_undo_list() isn't sufficient alone. * Also the pre_exit() and exit() methods need this barrier. */ if (expedite_rcu) synchronize_rcu_expedited(); else synchronize_rcu(); if (hold_rtnl) ops_exit_rtnl_list(ops_list, saved_ops, net_exit_list); ops = saved_ops; list_for_each_entry_continue_reverse(ops, ops_list, list) ops_exit_list(ops, net_exit_list); ops = saved_ops; list_for_each_entry_continue_reverse(ops, ops_list, list) ops_free_list(ops, net_exit_list); } static void ops_undo_single(struct pernet_operations *ops, struct list_head *net_exit_list) { LIST_HEAD(ops_list); list_add(&ops->list, &ops_list); ops_undo_list(&ops_list, NULL, net_exit_list, false); list_del(&ops->list); } /* should be called with nsid_lock held */ static int alloc_netid(struct net *net, struct net *peer, int reqid) { int min = 0, max = 0; if (reqid >= 0) { min = reqid; max = reqid + 1; } return idr_alloc(&net->netns_ids, peer, min, max, GFP_ATOMIC); } /* This function is used by idr_for_each(). 
If net is equal to peer, the * function returns the id so that idr_for_each() stops. Because we cannot * returns the id 0 (idr_for_each() will not stop), we return the magic value * NET_ID_ZERO (-1) for it. */ #define NET_ID_ZERO -1 static int net_eq_idr(int id, void *net, void *peer) { if (net_eq(net, peer)) return id ? : NET_ID_ZERO; return 0; } /* Must be called from RCU-critical section or with nsid_lock held */ static int __peernet2id(const struct net *net, struct net *peer) { int id = idr_for_each(&net->netns_ids, net_eq_idr, peer); /* Magic value for id 0. */ if (id == NET_ID_ZERO) return 0; if (id > 0) return id; return NETNSA_NSID_NOT_ASSIGNED; } static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, struct nlmsghdr *nlh, gfp_t gfp); /* This function returns the id of a peer netns. If no id is assigned, one will * be allocated and returned. */ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp) { int id; if (!check_net(net)) return NETNSA_NSID_NOT_ASSIGNED; spin_lock(&net->nsid_lock); id = __peernet2id(net, peer); if (id >= 0) { spin_unlock(&net->nsid_lock); return id; } /* When peer is obtained from RCU lists, we may race with * its cleanup. Check whether it's alive, and this guarantees * we never hash a peer back to net->netns_ids, after it has * just been idr_remove()'d from there in cleanup_net(). */ if (!maybe_get_net(peer)) { spin_unlock(&net->nsid_lock); return NETNSA_NSID_NOT_ASSIGNED; } id = alloc_netid(net, peer, -1); spin_unlock(&net->nsid_lock); put_net(peer); if (id < 0) return NETNSA_NSID_NOT_ASSIGNED; rtnl_net_notifyid(net, RTM_NEWNSID, id, 0, NULL, gfp); return id; } EXPORT_SYMBOL_GPL(peernet2id_alloc); /* This function returns, if assigned, the id of a peer netns. */ int peernet2id(const struct net *net, struct net *peer) { int id; rcu_read_lock(); id = __peernet2id(net, peer); rcu_read_unlock(); return id; } EXPORT_SYMBOL(peernet2id); /* This function returns true is the peer netns has an id assigned into the * current netns. */ bool peernet_has_id(const struct net *net, struct net *peer) { return peernet2id(net, peer) >= 0; } struct net *get_net_ns_by_id(const struct net *net, int id) { struct net *peer; if (id < 0) return NULL; rcu_read_lock(); peer = idr_find(&net->netns_ids, id); if (peer) peer = maybe_get_net(peer); rcu_read_unlock(); return peer; } EXPORT_SYMBOL_GPL(get_net_ns_by_id); static __net_init void preinit_net_sysctl(struct net *net) { net->core.sysctl_somaxconn = SOMAXCONN; /* Limits per socket sk_omem_alloc usage. * TCP zerocopy regular usage needs 128 KB. */ net->core.sysctl_optmem_max = 128 * 1024; net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; net->core.sysctl_tstamp_allow_data = 1; net->core.sysctl_txq_reselection = msecs_to_jiffies(1000); } /* init code that must occur even if setup_net() is not called. 
*/ static __net_init int preinit_net(struct net *net, struct user_namespace *user_ns) { int ret; ret = ns_common_init(net); if (ret) return ret; refcount_set(&net->passive, 1); ref_tracker_dir_init(&net->refcnt_tracker, 128, "net_refcnt"); ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net_notrefcnt"); net->hash_mix = get_random_u32(); net->dev_base_seq = 1; net->user_ns = user_ns; idr_init(&net->netns_ids); spin_lock_init(&net->nsid_lock); mutex_init(&net->ipv4.ra_mutex); #ifdef CONFIG_DEBUG_NET_SMALL_RTNL mutex_init(&net->rtnl_mutex); lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL); #endif INIT_LIST_HEAD(&net->ptype_all); INIT_LIST_HEAD(&net->ptype_specific); preinit_net_sysctl(net); return 0; } /* * setup_net runs the initializers for the network namespace object. */ static __net_init int setup_net(struct net *net) { /* Must be called with pernet_ops_rwsem held */ const struct pernet_operations *ops; LIST_HEAD(net_exit_list); int error = 0; net->net_cookie = ns_tree_gen_id(net); list_for_each_entry(ops, &pernet_list, list) { error = ops_init(ops, net); if (error < 0) goto out_undo; } down_write(&net_rwsem); list_add_tail_rcu(&net->list, &net_namespace_list); up_write(&net_rwsem); ns_tree_add_raw(net); out: return error; out_undo: /* Walk through the list backwards calling the exit functions * for the pernet modules whose init functions did not fail. */ list_add(&net->exit_list, &net_exit_list); ops_undo_list(&pernet_list, ops, &net_exit_list, false); rcu_barrier(); goto out; } #ifdef CONFIG_NET_NS static struct ucounts *inc_net_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_NET_NAMESPACES); } static void dec_net_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_NET_NAMESPACES); } static struct kmem_cache *net_cachep __ro_after_init; static struct workqueue_struct *netns_wq; static struct net *net_alloc(void) { struct net *net = NULL; struct net_generic *ng; ng = net_alloc_generic(); if (!ng) goto out; net = kmem_cache_zalloc(net_cachep, GFP_KERNEL); if (!net) goto out_free; #ifdef CONFIG_KEYS net->key_domain = kzalloc_obj(struct key_tag); if (!net->key_domain) goto out_free_2; refcount_set(&net->key_domain->usage, 1); #endif rcu_assign_pointer(net->gen, ng); out: return net; #ifdef CONFIG_KEYS out_free_2: kmem_cache_free(net_cachep, net); net = NULL; #endif out_free: kfree(ng); goto out; } static LLIST_HEAD(defer_free_list); static void net_complete_free(void) { struct llist_node *kill_list; struct net *net, *next; /* Get the list of namespaces to free from last round. */ kill_list = llist_del_all(&defer_free_list); llist_for_each_entry_safe(net, next, kill_list, defer_free_list) kmem_cache_free(net_cachep, net); } void net_passive_dec(struct net *net) { if (refcount_dec_and_test(&net->passive)) { kfree(rcu_access_pointer(net->gen)); /* There should not be any trackers left there. */ ref_tracker_dir_exit(&net->notrefcnt_tracker); /* Wait for an extra rcu_barrier() before final free. 
*/ llist_add(&net->defer_free_list, &defer_free_list); } } void net_drop_ns(struct ns_common *ns) { if (ns) net_passive_dec(to_net_ns(ns)); } struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net) { struct ucounts *ucounts; struct net *net; int rv; if (!(flags & CLONE_NEWNET)) return get_net(old_net); ucounts = inc_net_namespaces(user_ns); if (!ucounts) return ERR_PTR(-ENOSPC); net = net_alloc(); if (!net) { rv = -ENOMEM; goto dec_ucounts; } rv = preinit_net(net, user_ns); if (rv < 0) goto dec_ucounts; net->ucounts = ucounts; get_user_ns(user_ns); rv = down_read_killable(&pernet_ops_rwsem); if (rv < 0) goto put_userns; rv = setup_net(net); up_read(&pernet_ops_rwsem); if (rv < 0) { put_userns: ns_common_free(net); #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); #endif put_user_ns(user_ns); net_passive_dec(net); dec_ucounts: dec_net_namespaces(ucounts); return ERR_PTR(rv); } return net; } /** * net_ns_get_ownership - get sysfs ownership data for @net * @net: network namespace in question (can be NULL) * @uid: kernel user ID for sysfs objects * @gid: kernel group ID for sysfs objects * * Returns the uid/gid pair of root in the user namespace associated with the * given network namespace. */ void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid) { if (net) { kuid_t ns_root_uid = make_kuid(net->user_ns, 0); kgid_t ns_root_gid = make_kgid(net->user_ns, 0); if (uid_valid(ns_root_uid)) *uid = ns_root_uid; if (gid_valid(ns_root_gid)) *gid = ns_root_gid; } else { *uid = GLOBAL_ROOT_UID; *gid = GLOBAL_ROOT_GID; } } EXPORT_SYMBOL_GPL(net_ns_get_ownership); static void unhash_nsid(struct net *last) { struct net *tmp, *peer; /* This function is only called from cleanup_net() work, * and this work is the only process, that may delete * a net from net_namespace_list. So, when the below * is executing, the list may only grow. Thus, we do not * use for_each_net_rcu() or net_rwsem. */ for_each_net(tmp) { int id = 0; spin_lock(&tmp->nsid_lock); while ((peer = idr_get_next(&tmp->netns_ids, &id))) { int curr_id = id; id++; if (!peer->is_dying) continue; idr_remove(&tmp->netns_ids, curr_id); spin_unlock(&tmp->nsid_lock); rtnl_net_notifyid(tmp, RTM_DELNSID, curr_id, 0, NULL, GFP_KERNEL); spin_lock(&tmp->nsid_lock); } spin_unlock(&tmp->nsid_lock); if (tmp == last) break; } } static LLIST_HEAD(cleanup_list); struct task_struct *cleanup_net_task; static void cleanup_net(struct work_struct *work) { struct llist_node *net_kill_list; struct net *net, *tmp, *last; LIST_HEAD(net_exit_list); WRITE_ONCE(cleanup_net_task, current); /* Atomically snapshot the list of namespaces to cleanup */ net_kill_list = llist_del_all(&cleanup_list); down_read(&pernet_ops_rwsem); /* Don't let anyone else find us. */ down_write(&net_rwsem); llist_for_each_entry(net, net_kill_list, cleanup_list) { ns_tree_remove(net); list_del_rcu(&net->list); net->is_dying = true; } /* Cache last net. After we unlock rtnl, no one new net * added to net_namespace_list can assign nsid pointer * to a net from net_kill_list (see peernet2id_alloc()). * So, we skip them in unhash_nsid(). * * Note, that unhash_nsid() does not delete nsid links * between net_kill_list's nets, as they've already * deleted from net_namespace_list. But, this would be * useless anyway, as netns_ids are destroyed there. 
*/ last = list_last_entry(&net_namespace_list, struct net, list); up_write(&net_rwsem); unhash_nsid(last); llist_for_each_entry(net, net_kill_list, cleanup_list) { idr_destroy(&net->netns_ids); list_add_tail(&net->exit_list, &net_exit_list); } ops_undo_list(&pernet_list, NULL, &net_exit_list, true); up_read(&pernet_ops_rwsem); /* Ensure there are no outstanding rcu callbacks using this * network namespace. */ rcu_barrier(); net_complete_free(); /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { list_del_init(&net->exit_list); ns_common_free(net); dec_net_namespaces(net->ucounts); #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); #endif put_user_ns(net->user_ns); net_passive_dec(net); } WRITE_ONCE(cleanup_net_task, NULL); } /** * net_ns_barrier - wait until concurrent net_cleanup_work is done * * cleanup_net runs from work queue and will first remove namespaces * from the global list, then run net exit functions. * * Call this in module exit path to make sure that all netns * ->exit ops have been invoked before the function is removed. */ void net_ns_barrier(void) { down_write(&pernet_ops_rwsem); up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL(net_ns_barrier); static DECLARE_WORK(net_cleanup_work, cleanup_net); void __put_net(struct net *net) { ref_tracker_dir_exit(&net->refcnt_tracker); /* Cleanup the network namespace in process context */ if (llist_add(&net->cleanup_list, &cleanup_list)) queue_work(netns_wq, &net_cleanup_work); } EXPORT_SYMBOL_GPL(__put_net); /** * get_net_ns - increment the refcount of the network namespace * @ns: common namespace (net) * * Returns the net's common namespace or ERR_PTR() if ref is zero. */ struct ns_common *get_net_ns(struct ns_common *ns) { struct net *net; net = maybe_get_net(container_of(ns, struct net, ns)); if (net) return &net->ns; return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(get_net_ns); struct net *get_net_ns_by_fd(int fd) { CLASS(fd, f)(fd); if (fd_empty(f)) return ERR_PTR(-EBADF); if (proc_ns_file(fd_file(f))) { struct ns_common *ns = get_proc_ns(file_inode(fd_file(f))); if (ns->ops == &netns_operations) return get_net(container_of(ns, struct net, ns)); } return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(get_net_ns_by_fd); #endif struct net *get_net_ns_by_pid(pid_t pid) { struct task_struct *tsk; struct net *net; /* Lookup the network namespace */ net = ERR_PTR(-ESRCH); rcu_read_lock(); tsk = find_task_by_vpid(pid); if (tsk) { struct nsproxy *nsproxy; task_lock(tsk); nsproxy = tsk->nsproxy; if (nsproxy) net = get_net(nsproxy->net_ns); task_unlock(tsk); } rcu_read_unlock(); return net; } EXPORT_SYMBOL_GPL(get_net_ns_by_pid); #ifdef CONFIG_NET_NS_REFCNT_TRACKER static void net_ns_net_debugfs(struct net *net) { ref_tracker_dir_symlink(&net->refcnt_tracker, "netns-%llx-%u-refcnt", net->net_cookie, net->ns.inum); ref_tracker_dir_symlink(&net->notrefcnt_tracker, "netns-%llx-%u-notrefcnt", net->net_cookie, net->ns.inum); } static int __init init_net_debugfs(void) { ref_tracker_dir_debugfs(&init_net.refcnt_tracker); ref_tracker_dir_debugfs(&init_net.notrefcnt_tracker); net_ns_net_debugfs(&init_net); return 0; } late_initcall(init_net_debugfs); #else static void net_ns_net_debugfs(struct net *net) { } #endif static __net_init int net_ns_net_init(struct net *net) { net_ns_net_debugfs(net); return 0; } static struct pernet_operations __net_initdata net_ns_ops = { .init = net_ns_net_init, }; static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { [NETNSA_NONE] = { .type = 
NLA_UNSPEC }, [NETNSA_NSID] = { .type = NLA_S32 }, [NETNSA_PID] = { .type = NLA_U32 }, [NETNSA_FD] = { .type = NLA_U32 }, [NETNSA_TARGET_NSID] = { .type = NLA_S32 }, }; static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[NETNSA_MAX + 1]; struct nlattr *nla; struct net *peer; int nsid, err; err = nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy, extack); if (err < 0) return err; if (!tb[NETNSA_NSID]) { NL_SET_ERR_MSG(extack, "nsid is missing"); return -EINVAL; } nsid = nla_get_s32(tb[NETNSA_NSID]); if (tb[NETNSA_PID]) { peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID])); nla = tb[NETNSA_PID]; } else if (tb[NETNSA_FD]) { peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD])); nla = tb[NETNSA_FD]; } else { NL_SET_ERR_MSG(extack, "Peer netns reference is missing"); return -EINVAL; } if (IS_ERR(peer)) { NL_SET_BAD_ATTR(extack, nla); NL_SET_ERR_MSG(extack, "Peer netns reference is invalid"); return PTR_ERR(peer); } spin_lock(&net->nsid_lock); if (__peernet2id(net, peer) >= 0) { spin_unlock(&net->nsid_lock); err = -EEXIST; NL_SET_BAD_ATTR(extack, nla); NL_SET_ERR_MSG(extack, "Peer netns already has a nsid assigned"); goto out; } err = alloc_netid(net, peer, nsid); spin_unlock(&net->nsid_lock); if (err >= 0) { rtnl_net_notifyid(net, RTM_NEWNSID, err, NETLINK_CB(skb).portid, nlh, GFP_KERNEL); err = 0; } else if (err == -ENOSPC && nsid >= 0) { err = -EEXIST; NL_SET_BAD_ATTR(extack, tb[NETNSA_NSID]); NL_SET_ERR_MSG(extack, "The specified nsid is already used"); } out: put_net(peer); return err; } static int rtnl_net_get_size(void) { return NLMSG_ALIGN(sizeof(struct rtgenmsg)) + nla_total_size(sizeof(s32)) /* NETNSA_NSID */ + nla_total_size(sizeof(s32)) /* NETNSA_CURRENT_NSID */ ; } struct net_fill_args { u32 portid; u32 seq; int flags; int cmd; int nsid; bool add_ref; int ref_nsid; }; static int rtnl_net_fill(struct sk_buff *skb, struct net_fill_args *args) { struct nlmsghdr *nlh; struct rtgenmsg *rth; nlh = nlmsg_put(skb, args->portid, args->seq, args->cmd, sizeof(*rth), args->flags); if (!nlh) return -EMSGSIZE; rth = nlmsg_data(nlh); rth->rtgen_family = AF_UNSPEC; if (nla_put_s32(skb, NETNSA_NSID, args->nsid)) goto nla_put_failure; if (args->add_ref && nla_put_s32(skb, NETNSA_CURRENT_NSID, args->ref_nsid)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int rtnl_net_valid_getid_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { int i, err; if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy, extack); err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy, extack); if (err) return err; for (i = 0; i <= NETNSA_MAX; i++) { if (!tb[i]) continue; switch (i) { case NETNSA_PID: case NETNSA_FD: case NETNSA_NSID: case NETNSA_TARGET_NSID: break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in peer netns getid request"); return -EINVAL; } } return 0; } static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[NETNSA_MAX + 1]; struct net_fill_args fillargs = { .portid = NETLINK_CB(skb).portid, .seq = nlh->nlmsg_seq, .cmd = RTM_NEWNSID, }; struct net *peer, *target = net; struct nlattr *nla; struct sk_buff *msg; int err; 
err = rtnl_net_valid_getid_req(skb, nlh, tb, extack); if (err < 0) return err; if (tb[NETNSA_PID]) { peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID])); nla = tb[NETNSA_PID]; } else if (tb[NETNSA_FD]) { peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD])); nla = tb[NETNSA_FD]; } else if (tb[NETNSA_NSID]) { peer = get_net_ns_by_id(net, nla_get_s32(tb[NETNSA_NSID])); if (!peer) peer = ERR_PTR(-ENOENT); nla = tb[NETNSA_NSID]; } else { NL_SET_ERR_MSG(extack, "Peer netns reference is missing"); return -EINVAL; } if (IS_ERR(peer)) { NL_SET_BAD_ATTR(extack, nla); NL_SET_ERR_MSG(extack, "Peer netns reference is invalid"); return PTR_ERR(peer); } if (tb[NETNSA_TARGET_NSID]) { int id = nla_get_s32(tb[NETNSA_TARGET_NSID]); target = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, id); if (IS_ERR(target)) { NL_SET_BAD_ATTR(extack, tb[NETNSA_TARGET_NSID]); NL_SET_ERR_MSG(extack, "Target netns reference is invalid"); err = PTR_ERR(target); goto out; } fillargs.add_ref = true; fillargs.ref_nsid = peernet2id(net, peer); } msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL); if (!msg) { err = -ENOMEM; goto out; } fillargs.nsid = peernet2id(target, peer); err = rtnl_net_fill(msg, &fillargs); if (err < 0) goto err_out; err = rtnl_unicast(msg, net, NETLINK_CB(skb).portid); goto out; err_out: nlmsg_free(msg); out: if (fillargs.add_ref) put_net(target); put_net(peer); return err; } struct rtnl_net_dump_cb { struct net *tgt_net; struct net *ref_net; struct sk_buff *skb; struct net_fill_args fillargs; int idx; int s_idx; }; /* Runs in RCU-critical section. */ static int rtnl_net_dumpid_one(int id, void *peer, void *data) { struct rtnl_net_dump_cb *net_cb = (struct rtnl_net_dump_cb *)data; int ret; if (net_cb->idx < net_cb->s_idx) goto cont; net_cb->fillargs.nsid = id; if (net_cb->fillargs.add_ref) net_cb->fillargs.ref_nsid = __peernet2id(net_cb->ref_net, peer); ret = rtnl_net_fill(net_cb->skb, &net_cb->fillargs); if (ret < 0) return ret; cont: net_cb->idx++; return 0; } static int rtnl_valid_dump_net_req(const struct nlmsghdr *nlh, struct sock *sk, struct rtnl_net_dump_cb *net_cb, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[NETNSA_MAX + 1]; int err, i; err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy, extack); if (err < 0) return err; for (i = 0; i <= NETNSA_MAX; i++) { if (!tb[i]) continue; if (i == NETNSA_TARGET_NSID) { struct net *net; net = rtnl_get_net_ns_capable(sk, nla_get_s32(tb[i])); if (IS_ERR(net)) { NL_SET_BAD_ATTR(extack, tb[i]); NL_SET_ERR_MSG(extack, "Invalid target network namespace id"); return PTR_ERR(net); } net_cb->fillargs.add_ref = true; net_cb->ref_net = net_cb->tgt_net; net_cb->tgt_net = net; } else { NL_SET_BAD_ATTR(extack, tb[i]); NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request"); return -EINVAL; } } return 0; } static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) { struct rtnl_net_dump_cb net_cb = { .tgt_net = sock_net(skb->sk), .skb = skb, .fillargs = { .portid = NETLINK_CB(cb->skb).portid, .seq = cb->nlh->nlmsg_seq, .flags = NLM_F_MULTI, .cmd = RTM_NEWNSID, }, .idx = 0, .s_idx = cb->args[0], }; int err = 0; if (cb->strict_check) { err = rtnl_valid_dump_net_req(cb->nlh, skb->sk, &net_cb, cb); if (err < 0) goto end; } rcu_read_lock(); idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb); rcu_read_unlock(); cb->args[0] = net_cb.idx; end: if (net_cb.fillargs.add_ref) put_net(net_cb.tgt_net); return err; } static void rtnl_net_notifyid(struct 
net *net, int cmd, int id, u32 portid, struct nlmsghdr *nlh, gfp_t gfp) { struct net_fill_args fillargs = { .portid = portid, .seq = nlh ? nlh->nlmsg_seq : 0, .cmd = cmd, .nsid = id, }; struct sk_buff *msg; int err = -ENOMEM; msg = nlmsg_new(rtnl_net_get_size(), gfp); if (!msg) goto out; err = rtnl_net_fill(msg, &fillargs); if (err < 0) goto err_out; rtnl_notify(msg, net, portid, RTNLGRP_NSID, nlh, gfp); return; err_out: nlmsg_free(msg); out: rtnl_set_sk_err(net, RTNLGRP_NSID, err); } #ifdef CONFIG_NET_NS static void __init netns_ipv4_struct_check(void) { /* TX readonly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_tcp_early_retrans); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_tcp_tso_win_divisor); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_tcp_tso_rtt_log); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_tcp_autocorking); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_tcp_min_snd_mss); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_tcp_notsent_lowat); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_tcp_limit_output_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_tcp_min_rtt_wlen); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_tcp_wmem); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_ip_fwd_use_pmtu); /* RX readonly hotpath cache line */ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_moderate_rcvbuf); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_rcvbuf_low_rtt); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_ip_early_demux); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_early_demux); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_l3mdev_accept); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_reordering); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_rmem); } #endif static const struct rtnl_msg_handler net_ns_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWNSID, .doit = rtnl_net_newid, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.msgtype = RTM_GETNSID, .doit = rtnl_net_getid, .dumpit = rtnl_net_dumpid, .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, }; void __init net_ns_init(void) { struct net_generic *ng; #ifdef CONFIG_NET_NS netns_ipv4_struct_check(); net_cachep = kmem_cache_create("net_namespace", sizeof(struct net), SMP_CACHE_BYTES, SLAB_PANIC|SLAB_ACCOUNT, NULL); /* Create workqueue for cleanup */ netns_wq = create_singlethread_workqueue("netns"); if (!netns_wq) panic("Could not create netns workq"); #endif ng = net_alloc_generic(); if (!ng) panic("Could not allocate generic netns"); rcu_assign_pointer(init_net.gen, ng); #ifdef CONFIG_KEYS init_net.key_domain = &init_net_key_domain; #endif /* * This currently cannot fail as the initial network namespace * has a static inode number. 
*/ if (preinit_net(&init_net, &init_user_ns)) panic("Could not preinitialize the initial network namespace"); down_write(&pernet_ops_rwsem); if (setup_net(&init_net)) panic("Could not setup the initial network namespace"); init_net_initialized = true; up_write(&pernet_ops_rwsem); if (register_pernet_subsys(&net_ns_ops)) panic("Could not register network namespace subsystems"); rtnl_register_many(net_ns_rtnl_msg_handlers); } #ifdef CONFIG_NET_NS static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { LIST_HEAD(net_exit_list); struct net *net; int error; list_add_tail(&ops->list, list); if (ops->init || ops->id) { /* We held write locked pernet_ops_rwsem, and parallel * setup_net() and cleanup_net() are not possible. */ for_each_net(net) { error = ops_init(ops, net); if (error) goto out_undo; list_add_tail(&net->exit_list, &net_exit_list); } } return 0; out_undo: /* If I have an error cleanup all namespaces I initialized */ list_del(&ops->list); ops_undo_single(ops, &net_exit_list); return error; } static void __unregister_pernet_operations(struct pernet_operations *ops) { LIST_HEAD(net_exit_list); struct net *net; /* See comment in __register_pernet_operations() */ for_each_net(net) list_add_tail(&net->exit_list, &net_exit_list); list_del(&ops->list); ops_undo_single(ops, &net_exit_list); } #else static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { if (!init_net_initialized) { list_add_tail(&ops->list, list); return 0; } return ops_init(ops, &init_net); } static void __unregister_pernet_operations(struct pernet_operations *ops) { if (!init_net_initialized) { list_del(&ops->list); } else { LIST_HEAD(net_exit_list); list_add(&init_net.exit_list, &net_exit_list); ops_undo_single(ops, &net_exit_list); } } #endif /* CONFIG_NET_NS */ static DEFINE_IDA(net_generic_ids); static int register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { int error; if (WARN_ON(!!ops->id ^ !!ops->size)) return -EINVAL; if (ops->id) { error = ida_alloc_min(&net_generic_ids, MIN_PERNET_OPS_ID, GFP_KERNEL); if (error < 0) return error; *ops->id = error; /* This does not require READ_ONCE as writers already hold * pernet_ops_rwsem. But WRITE_ONCE is needed to protect * net_alloc_generic. */ WRITE_ONCE(max_gen_ptrs, max(max_gen_ptrs, *ops->id + 1)); } error = __register_pernet_operations(list, ops); if (error) { rcu_barrier(); if (ops->id) ida_free(&net_generic_ids, *ops->id); } return error; } static void unregister_pernet_operations(struct pernet_operations *ops) { __unregister_pernet_operations(ops); rcu_barrier(); if (ops->id) ida_free(&net_generic_ids, *ops->id); } /** * register_pernet_subsys - register a network namespace subsystem * @ops: pernet operations structure for the subsystem * * Register a subsystem which has init and exit functions * that are called when network namespaces are created and * destroyed respectively. * * When registered all network namespace init functions are * called for every existing network namespace. Allowing kernel * modules to have a race free view of the set of network namespaces. * * When a new network namespace is created all of the init * methods are called in the order in which they were registered. * * When a network namespace is destroyed all of the exit methods * are called in the reverse of the order with which they were * registered. 
*/ int register_pernet_subsys(struct pernet_operations *ops) { int error; down_write(&pernet_ops_rwsem); error = register_pernet_operations(first_device, ops); up_write(&pernet_ops_rwsem); return error; } EXPORT_SYMBOL_GPL(register_pernet_subsys); /** * unregister_pernet_subsys - unregister a network namespace subsystem * @ops: pernet operations structure to manipulate * * Remove the pernet operations structure from the list to be * used when network namespaces are created or destroyed. In * addition run the exit method for all existing network * namespaces. */ void unregister_pernet_subsys(struct pernet_operations *ops) { down_write(&pernet_ops_rwsem); unregister_pernet_operations(ops); up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL_GPL(unregister_pernet_subsys); /** * register_pernet_device - register a network namespace device * @ops: pernet operations structure for the subsystem * * Register a device which has init and exit functions * that are called when network namespaces are created and * destroyed respectively. * * When registered all network namespace init functions are * called for every existing network namespace. Allowing kernel * modules to have a race free view of the set of network namespaces. * * When a new network namespace is created all of the init * methods are called in the order in which they were registered. * * When a network namespace is destroyed all of the exit methods * are called in the reverse of the order with which they were * registered. */ int register_pernet_device(struct pernet_operations *ops) { int error; down_write(&pernet_ops_rwsem); error = register_pernet_operations(&pernet_list, ops); if (!error && (first_device == &pernet_list)) first_device = &ops->list; up_write(&pernet_ops_rwsem); return error; } EXPORT_SYMBOL_GPL(register_pernet_device); /** * unregister_pernet_device - unregister a network namespace netdevice * @ops: pernet operations structure to manipulate * * Remove the pernet operations structure from the list to be * used when network namespaces are created or destroyed. In * addition run the exit method for all existing network * namespaces. */ void unregister_pernet_device(struct pernet_operations *ops) { down_write(&pernet_ops_rwsem); if (&ops->list == first_device) first_device = first_device->next; unregister_pernet_operations(ops); up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL_GPL(unregister_pernet_device); #ifdef CONFIG_NET_NS static struct ns_common *netns_get(struct task_struct *task) { struct net *net = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) net = get_net(nsproxy->net_ns); task_unlock(task); return net ? &net->ns : NULL; } static void netns_put(struct ns_common *ns) { put_net(to_net_ns(ns)); } static int netns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct net *net = to_net_ns(ns); if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) || !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; put_net(nsproxy->net_ns); nsproxy->net_ns = get_net(net); return 0; } static struct user_namespace *netns_owner(struct ns_common *ns) { return to_net_ns(ns)->user_ns; } const struct proc_ns_operations netns_operations = { .name = "net", .get = netns_get, .put = netns_put, .install = netns_install, .owner = netns_owner, }; #endif |
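/*
 * Illustrative sketch (not part of net_namespace.c): how a typical consumer
 * of the pernet API above registers per-namespace state. Supplying an
 * .id/.size pair makes ops_init() allocate zeroed per-net storage that is
 * later retrieved with net_generic(). The names foo_net_id, struct foo_net
 * and foo_net_ops are hypothetical.
 */
#include <net/net_namespace.h>
#include <net/netns/generic.h>

static unsigned int foo_net_id;

struct foo_net {
	int counter;		/* example per-namespace state */
};

static int __net_init foo_net_init(struct net *net)
{
	struct foo_net *fn = net_generic(net, foo_net_id);

	/* ops_init() already zeroed this block; only defaults go here. */
	fn->counter = 0;
	return 0;
}

static void __net_exit foo_net_exit(struct net *net)
{
	/* The per-net block itself is freed by ops_free_list(). */
}

static struct pernet_operations foo_net_ops = {
	.init = foo_net_init,
	.exit = foo_net_exit,
	.id   = &foo_net_id,
	.size = sizeof(struct foo_net),
};

/* Module init would call register_pernet_subsys(&foo_net_ops);
 * module exit would call unregister_pernet_subsys(&foo_net_ops).
 */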
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PAGE_H
#define _ASM_X86_PAGE_H

#include <linux/types.h>

#ifdef __KERNEL__

#include <asm/page_types.h>

#ifdef CONFIG_X86_64
#include <asm/page_64.h>
#else
#include <asm/page_32.h>
#endif	/* CONFIG_X86_64 */

#ifndef __ASSEMBLER__

struct page;

#include <linux/range.h>
extern struct range pfn_mapped[];
extern int nr_pfn_mapped;

static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
				  struct page *topage)
{
	copy_page(to, from);
}

#define vma_alloc_zeroed_movable_folio(vma, vaddr) \
	vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr)

#ifndef __pa
#define __pa(x)		__phys_addr((unsigned long)(x))
#endif

#define __pa_nodebug(x)	__phys_addr_nodebug((unsigned long)(x))

/* __pa_symbol should be used for C visible symbols.
   This seems to be the official gcc blessed way to do such arithmetic. */
/*
 * We need __phys_reloc_hide() here because gcc may assume that there is no
 * overflow during __pa() calculation and can optimize it unexpectedly.
 * Newer versions of gcc provide -fno-strict-overflow switch to handle this
 * case properly. Once all supported versions of gcc understand it, we can
 * remove this Voodoo magic stuff. (i.e. once gcc3.x is deprecated)
 */
#define __pa_symbol(x) \
	__phys_addr_symbol(__phys_reloc_hide((unsigned long)(x)))

#ifndef __va
#define __va(x)			((void *)((unsigned long)(x)+PAGE_OFFSET))
#endif

#define __boot_va(x)		__va(x)
#define __boot_pa(x)		__pa(x)

/*
 * virt_to_page(kaddr) returns a valid pointer if and only if
 * virt_addr_valid(kaddr) returns true.
 */
#define virt_to_page(kaddr)	pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
extern bool __virt_addr_valid(unsigned long kaddr);
#define virt_addr_valid(kaddr)	__virt_addr_valid((unsigned long) (kaddr))

static __always_inline void *pfn_to_kaddr(unsigned long pfn)
{
	return __va(pfn << PAGE_SHIFT);
}

static __always_inline u64 __canonical_address(u64 vaddr, u8 vaddr_bits)
{
	return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits);
}

static __always_inline u64 __is_canonical_address(u64 vaddr, u8 vaddr_bits)
{
	return __canonical_address(vaddr, vaddr_bits) == vaddr;
}

#endif	/* __ASSEMBLER__ */

#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>

#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA

#endif	/* __KERNEL__ */

#endif /* _ASM_X86_PAGE_H */
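/*
 * Illustrative sketch (not part of page.h): how the conversion helpers above
 * combine in practice. The function name is hypothetical; the round-trip
 * relationships between __pa(), __va(), virt_to_page() and pfn_to_kaddr()
 * are the point.
 */
#include <linux/mm.h>
#include <linux/gfp.h>

static void example_page_conversions(void)
{
	unsigned long addr = __get_free_page(GFP_KERNEL);
	struct page *page;
	phys_addr_t phys;

	if (!addr)
		return;

	phys = __pa(addr);			/* kernel virtual -> physical */
	page = virt_to_page(addr);		/* kernel virtual -> struct page */

	/* pfn_to_kaddr(pfn) is __va(pfn << PAGE_SHIFT), so both round-trip. */
	WARN_ON(pfn_to_kaddr(page_to_pfn(page)) != (void *)addr);
	WARN_ON(__va(phys) != (void *)addr);

	free_page(addr);
}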
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Linux NET3:	Internet Group Management Protocol  [IGMP]
 *
 *	This code implements the IGMP protocol as defined in RFC1112. There has
 *	been a further revision of this protocol since which is now supported.
 *
 *	If you have trouble with this module be careful what gcc you have used,
 *	the older version didn't come out right using gcc 2.5.8, the newer one
 *	seems to fall out with gcc 2.6.2.
 *
 *	Authors:
 *		Alan Cox <alan@lxorguk.ukuu.org.uk>
 *
 *	Fixes:
 *
 *		Alan Cox	:	Added lots of __inline__ to optimise
 *					the memory usage of all the tiny little
 *					functions.
 *		Alan Cox	:	Dumped the header building experiment.
 *		Alan Cox	:	Minor tweaks ready for multicast routing
 *					and extended IGMP protocol.
 *		Alan Cox	:	Removed a load of inline directives. Gcc 2.5.8
 *					writes utterly bogus code otherwise (sigh)
 *					fixed IGMP loopback to behave in the manner
 *					desired by mrouted, fixed the fact it has been
 *					broken since 1.3.6 and cleaned up a few minor
 *					points.
 *
 *		Chih-Jen Chang	:	Tried to revise IGMP to Version 2
 *		Tsu-Sheng Tsao		E-mail: chihjenc@scf.usc.edu and tsusheng@scf.usc.edu
 *					The enhancements are mainly based on Steve Deering's
 *					ipmulti-3.5 source code.
 *		Chih-Jen Chang	:	Added the igmp_get_mrouter_info and
 *		Tsu-Sheng Tsao		igmp_set_mrouter_info to keep track of
 *					the mrouted version on that device.
 *		Chih-Jen Chang	:	Added the max_resp_time parameter to
 *		Tsu-Sheng Tsao		igmp_heard_query(). Using this parameter
 *					to identify the multicast router version
 *					and do what the IGMP version 2 specified.
 *		Chih-Jen Chang	:	Added a timer to revert to IGMP V2 router
 *		Tsu-Sheng Tsao		if the specified time expired.
 *		Alan Cox	:	Stop IGMP from 0.0.0.0 being accepted.
 *		Alan Cox	:	Use GFP_ATOMIC in the right places.
 *		Christian Daudt :	igmp timer wasn't set for local group
 *					memberships but was being deleted,
 *					which caused a "del_timer() called
 *					from %p with timer not initialized\n"
 *					message (960131).
 *		Christian Daudt :	removed del_timer from
 *					igmp_timer_expire function (960205).
 *		Christian Daudt :	igmp_heard_report now only calls
 *					igmp_timer_expire if tm->running is
 *					true (960216).
 *		Malcolm Beattie :	ttl comparison wrong in igmp_rcv made
 *					igmp_heard_query never trigger. Expiry
 *					miscalculation fixed in igmp_heard_query
 *					and random() made to return unsigned to
 *					prevent negative expiry times.
 *		Alexey Kuznetsov:	Wrong group leaving behaviour, backport
 *					fix from pending 2.1.x patches.
 *		Alan Cox:		Forget to enable FDDI support earlier.
 *		Alexey Kuznetsov:	Fixed leaving groups on device down.
 *		Alexey Kuznetsov:	Accordance to igmp-v2-06 draft.
* David L Stevens: IGMPv3 support, with help from * Vinay Kulkarni */ #include <linux/module.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include "igmp_internal.h" #include <linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/times.h> #include <linux/pkt_sched.h> #include <linux/byteorder/generic.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <net/addrconf.h> #include <net/arp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/sock.h> #include <net/checksum.h> #include <net/inet_common.h> #include <linux/netfilter_ipv4.h> #ifdef CONFIG_IP_MROUTE #include <linux/mroute.h> #endif #ifdef CONFIG_PROC_FS #include <linux/proc_fs.h> #include <linux/seq_file.h> #endif #ifdef CONFIG_IP_MULTICAST /* Parameter names and values are taken from igmp-v2-06 draft */ #define IGMP_QUERY_INTERVAL (125*HZ) #define IGMP_QUERY_RESPONSE_INTERVAL (10*HZ) #define IGMP_INITIAL_REPORT_DELAY (1) /* IGMP_INITIAL_REPORT_DELAY is not from IGMP specs! * IGMP specs require to report membership immediately after * joining a group, but we delay the first report by a * small interval. It seems more natural and still does not * contradict to specs provided this delay is small enough. */ #define IGMP_V1_SEEN(in_dev) \ (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \ IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \ ((in_dev)->mr_v1_seen && \ time_before(jiffies, (in_dev)->mr_v1_seen))) #define IGMP_V2_SEEN(in_dev) \ (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2 || \ IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \ ((in_dev)->mr_v2_seen && \ time_before(jiffies, (in_dev)->mr_v2_seen))) static int unsolicited_report_interval(struct in_device *in_dev) { int interval_ms, interval_jiffies; if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) interval_ms = IN_DEV_CONF_GET( in_dev, IGMPV2_UNSOLICITED_REPORT_INTERVAL); else /* v3 */ interval_ms = IN_DEV_CONF_GET( in_dev, IGMPV3_UNSOLICITED_REPORT_INTERVAL); interval_jiffies = msecs_to_jiffies(interval_ms); /* _timer functions can't handle a delay of 0 jiffies so ensure * we always return a positive value. 
*/ if (interval_jiffies <= 0) interval_jiffies = 1; return interval_jiffies; } static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im, gfp_t gfp); static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im); static void igmpv3_clear_delrec(struct in_device *in_dev); static int sf_setstate(struct ip_mc_list *pmc); static void sf_markstate(struct ip_mc_list *pmc); #endif static void ip_mc_clear_src(struct ip_mc_list *pmc); static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, int sfcount, __be32 *psfsrc, int delta); static void ip_ma_put(struct ip_mc_list *im) { if (refcount_dec_and_test(&im->refcnt)) { in_dev_put(im->interface); kfree_rcu(im, rcu); } } #define for_each_pmc_rcu(in_dev, pmc) \ for (pmc = rcu_dereference(in_dev->mc_list); \ pmc != NULL; \ pmc = rcu_dereference(pmc->next_rcu)) #define for_each_pmc_rtnl(in_dev, pmc) \ for (pmc = rtnl_dereference(in_dev->mc_list); \ pmc != NULL; \ pmc = rtnl_dereference(pmc->next_rcu)) static void ip_sf_list_clear_all(struct ip_sf_list *psf) { struct ip_sf_list *next; while (psf) { next = psf->sf_next; kfree(psf); psf = next; } } #ifdef CONFIG_IP_MULTICAST /* * Timer management */ static void igmp_stop_timer(struct ip_mc_list *im) { spin_lock_bh(&im->lock); if (timer_delete(&im->timer)) refcount_dec(&im->refcnt); im->tm_running = 0; im->reporter = 0; im->unsolicit_count = 0; spin_unlock_bh(&im->lock); } /* It must be called with locked im->lock */ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) { int tv = get_random_u32_below(max_delay); im->tm_running = 1; if (refcount_inc_not_zero(&im->refcnt)) { if (mod_timer(&im->timer, jiffies + tv + 2)) ip_ma_put(im); } } static void igmp_gq_start_timer(struct in_device *in_dev) { int tv = get_random_u32_below(READ_ONCE(in_dev->mr_maxdelay)); unsigned long exp = jiffies + tv + 2; if (in_dev->mr_gq_running && time_after_eq(exp, (in_dev->mr_gq_timer).expires)) return; in_dev->mr_gq_running = 1; if (!mod_timer(&in_dev->mr_gq_timer, exp)) in_dev_hold(in_dev); } static void igmp_ifc_start_timer(struct in_device *in_dev, int delay) { int tv = get_random_u32_below(delay); if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2)) in_dev_hold(in_dev); } static void igmp_mod_timer(struct ip_mc_list *im, int max_delay) { spin_lock_bh(&im->lock); im->unsolicit_count = 0; if (timer_delete(&im->timer)) { if ((long)(im->timer.expires-jiffies) < max_delay) { add_timer(&im->timer); im->tm_running = 1; spin_unlock_bh(&im->lock); return; } refcount_dec(&im->refcnt); } igmp_start_timer(im, max_delay); spin_unlock_bh(&im->lock); } /* * Send an IGMP report. 
*/ #define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4) static int is_in(struct ip_mc_list *pmc, struct ip_sf_list *psf, int type, int gdeleted, int sdeleted) { switch (type) { case IGMPV3_MODE_IS_INCLUDE: case IGMPV3_MODE_IS_EXCLUDE: if (gdeleted || sdeleted) return 0; if (!(pmc->gsquery && !psf->sf_gsresp)) { if (pmc->sfmode == MCAST_INCLUDE) return 1; /* don't include if this source is excluded * in all filters */ if (psf->sf_count[MCAST_INCLUDE]) return type == IGMPV3_MODE_IS_INCLUDE; return pmc->sfcount[MCAST_EXCLUDE] == psf->sf_count[MCAST_EXCLUDE]; } return 0; case IGMPV3_CHANGE_TO_INCLUDE: if (gdeleted || sdeleted) return 0; return psf->sf_count[MCAST_INCLUDE] != 0; case IGMPV3_CHANGE_TO_EXCLUDE: if (gdeleted || sdeleted) return 0; if (pmc->sfcount[MCAST_EXCLUDE] == 0 || psf->sf_count[MCAST_INCLUDE]) return 0; return pmc->sfcount[MCAST_EXCLUDE] == psf->sf_count[MCAST_EXCLUDE]; case IGMPV3_ALLOW_NEW_SOURCES: if (gdeleted || !psf->sf_crcount) return 0; return (pmc->sfmode == MCAST_INCLUDE) ^ sdeleted; case IGMPV3_BLOCK_OLD_SOURCES: if (pmc->sfmode == MCAST_INCLUDE) return gdeleted || (psf->sf_crcount && sdeleted); return psf->sf_crcount && !gdeleted && !sdeleted; } return 0; } static int igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) { struct ip_sf_list *psf; int scount = 0; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (!is_in(pmc, psf, type, gdeleted, sdeleted)) continue; scount++; } return scount; } /* source address selection per RFC 3376 section 4.2.13 */ static __be32 igmpv3_get_srcaddr(struct net_device *dev, const struct flowi4 *fl4) { struct in_device *in_dev = __in_dev_get_rcu(dev); const struct in_ifaddr *ifa; if (!in_dev) return htonl(INADDR_ANY); in_dev_for_each_ifa_rcu(ifa, in_dev) { if (fl4->saddr == ifa->ifa_local) return fl4->saddr; } return htonl(INADDR_ANY); } static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) { struct sk_buff *skb; struct rtable *rt; struct iphdr *pip; struct igmpv3_report *pig; struct net *net = dev_net(dev); struct flowi4 fl4; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; unsigned int size; size = min(mtu, IP_MAX_MTU); while (1) { skb = alloc_skb(size + hlen + tlen, GFP_ATOMIC | __GFP_NOWARN); if (skb) break; size >>= 1; if (size < 256) return NULL; } skb->priority = TC_PRIO_CONTROL; rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0, 0, 0, IPPROTO_IGMP, 0, dev->ifindex); if (IS_ERR(rt)) { kfree_skb(skb); return NULL; } skb_dst_set(skb, &rt->dst); skb->dev = dev; skb_reserve(skb, hlen); skb_tailroom_reserve(skb, mtu, tlen); skb_reset_network_header(skb); pip = ip_hdr(skb); skb_put(skb, sizeof(struct iphdr) + 4); pip->version = 4; pip->ihl = (sizeof(struct iphdr)+4)>>2; pip->tos = 0xc0; pip->frag_off = htons(IP_DF); pip->ttl = 1; pip->daddr = fl4.daddr; rcu_read_lock(); pip->saddr = igmpv3_get_srcaddr(dev, &fl4); rcu_read_unlock(); pip->protocol = IPPROTO_IGMP; pip->tot_len = 0; /* filled in later */ ip_select_ident(net, skb, NULL); ((u8 *)&pip[1])[0] = IPOPT_RA; ((u8 *)&pip[1])[1] = 4; ((u8 *)&pip[1])[2] = 0; ((u8 *)&pip[1])[3] = 0; skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4; skb_put(skb, sizeof(*pig)); pig = igmpv3_report_hdr(skb); pig->type = IGMPV3_HOST_MEMBERSHIP_REPORT; pig->resv1 = 0; pig->csum = 0; pig->resv2 = 0; pig->ngrec = 0; return skb; } static int igmpv3_sendpack(struct sk_buff *skb) { struct igmphdr *pig = igmp_hdr(skb); const int igmplen = skb_tail_pointer(skb) - skb_transport_header(skb); 
pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen); return ip_local_out(skb_dst_dev_net(skb), skb->sk, skb); } static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel) { return sizeof(struct igmpv3_grec) + 4*igmp_scount(pmc, type, gdel, sdel); } static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc, int type, struct igmpv3_grec **ppgr, unsigned int mtu) { struct net_device *dev = pmc->interface->dev; struct igmpv3_report *pih; struct igmpv3_grec *pgr; if (!skb) { skb = igmpv3_newpack(dev, mtu); if (!skb) return NULL; } pgr = skb_put(skb, sizeof(struct igmpv3_grec)); pgr->grec_type = type; pgr->grec_auxwords = 0; pgr->grec_nsrcs = 0; pgr->grec_mca = pmc->multiaddr; pih = igmpv3_report_hdr(skb); pih->ngrec = htons(ntohs(pih->ngrec)+1); *ppgr = pgr; return skb; } #define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0) static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) { struct net_device *dev = pmc->interface->dev; struct net *net = dev_net(dev); struct igmpv3_report *pih; struct igmpv3_grec *pgr = NULL; struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list; int scount, stotal, first, isquery, truncate; unsigned int mtu; if (pmc->multiaddr == IGMP_ALL_HOSTS) return skb; if (ipv4_is_local_multicast(pmc->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return skb; mtu = READ_ONCE(dev->mtu); if (mtu < IPV4_MIN_MTU) return skb; isquery = type == IGMPV3_MODE_IS_INCLUDE || type == IGMPV3_MODE_IS_EXCLUDE; truncate = type == IGMPV3_MODE_IS_EXCLUDE || type == IGMPV3_CHANGE_TO_EXCLUDE; stotal = scount = 0; psf_list = sdeleted ? &pmc->tomb : &pmc->sources; if (!*psf_list) goto empty_source; pih = skb ? igmpv3_report_hdr(skb) : NULL; /* EX and TO_EX get a fresh packet, if needed */ if (truncate) { if (pih && pih->ngrec && AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { if (skb) igmpv3_sendpack(skb); skb = igmpv3_newpack(dev, mtu); } } first = 1; psf_prev = NULL; for (psf = *psf_list; psf; psf = psf_next) { __be32 *psrc; psf_next = psf->sf_next; if (!is_in(pmc, psf, type, gdeleted, sdeleted)) { psf_prev = psf; continue; } /* Based on RFC3376 5.1. Should not send source-list change * records when there is a filter mode change. 
*/ if (((gdeleted && pmc->sfmode == MCAST_EXCLUDE) || (!gdeleted && pmc->crcount)) && (type == IGMPV3_ALLOW_NEW_SOURCES || type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) goto decrease_sf_crcount; /* clear marks on query responses */ if (isquery) psf->sf_gsresp = 0; if (AVAILABLE(skb) < sizeof(__be32) + first*sizeof(struct igmpv3_grec)) { if (truncate && !first) break; /* truncate these */ if (pgr) pgr->grec_nsrcs = htons(scount); if (skb) igmpv3_sendpack(skb); skb = igmpv3_newpack(dev, mtu); first = 1; scount = 0; } if (first) { skb = add_grhead(skb, pmc, type, &pgr, mtu); first = 0; } if (!skb) return NULL; psrc = skb_put(skb, sizeof(__be32)); *psrc = psf->sf_inaddr; scount++; stotal++; if ((type == IGMPV3_ALLOW_NEW_SOURCES || type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) { decrease_sf_crcount: psf->sf_crcount--; if ((sdeleted || gdeleted) && psf->sf_crcount == 0) { if (psf_prev) psf_prev->sf_next = psf->sf_next; else *psf_list = psf->sf_next; kfree(psf); continue; } } psf_prev = psf; } empty_source: if (!stotal) { if (type == IGMPV3_ALLOW_NEW_SOURCES || type == IGMPV3_BLOCK_OLD_SOURCES) return skb; if (pmc->crcount || isquery) { /* make sure we have room for group header */ if (skb && AVAILABLE(skb) < sizeof(struct igmpv3_grec)) { igmpv3_sendpack(skb); skb = NULL; /* add_grhead will get a new one */ } skb = add_grhead(skb, pmc, type, &pgr, mtu); } } if (pgr) pgr->grec_nsrcs = htons(scount); if (isquery) pmc->gsquery = 0; /* clear query state on report */ return skb; } static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc) { struct sk_buff *skb = NULL; struct net *net = dev_net(in_dev->dev); int type; if (!pmc) { rcu_read_lock(); for_each_pmc_rcu(in_dev, pmc) { if (pmc->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(pmc->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) continue; spin_lock_bh(&pmc->lock); if (pmc->sfcount[MCAST_EXCLUDE]) type = IGMPV3_MODE_IS_EXCLUDE; else type = IGMPV3_MODE_IS_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0); spin_unlock_bh(&pmc->lock); } rcu_read_unlock(); } else { spin_lock_bh(&pmc->lock); if (pmc->sfcount[MCAST_EXCLUDE]) type = IGMPV3_MODE_IS_EXCLUDE; else type = IGMPV3_MODE_IS_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0); spin_unlock_bh(&pmc->lock); } if (!skb) return 0; return igmpv3_sendpack(skb); } /* * remove zero-count source records from a source filter list */ static void igmpv3_clear_zeros(struct ip_sf_list **ppsf) { struct ip_sf_list *psf_prev, *psf_next, *psf; psf_prev = NULL; for (psf = *ppsf; psf; psf = psf_next) { psf_next = psf->sf_next; if (psf->sf_crcount == 0) { if (psf_prev) psf_prev->sf_next = psf->sf_next; else *ppsf = psf->sf_next; kfree(psf); } else psf_prev = psf; } } static void kfree_pmc(struct ip_mc_list *pmc) { ip_sf_list_clear_all(pmc->sources); ip_sf_list_clear_all(pmc->tomb); kfree(pmc); } static void igmpv3_send_cr(struct in_device *in_dev) { struct ip_mc_list *pmc, *pmc_prev, *pmc_next; struct sk_buff *skb = NULL; int type, dtype; rcu_read_lock(); spin_lock_bh(&in_dev->mc_tomb_lock); /* deleted MCA's */ pmc_prev = NULL; for (pmc = in_dev->mc_tomb; pmc; pmc = pmc_next) { pmc_next = pmc->next; if (pmc->sfmode == MCAST_INCLUDE) { type = IGMPV3_BLOCK_OLD_SOURCES; dtype = IGMPV3_BLOCK_OLD_SOURCES; skb = add_grec(skb, pmc, type, 1, 0); skb = add_grec(skb, pmc, dtype, 1, 1); } if (pmc->crcount) { if (pmc->sfmode == MCAST_EXCLUDE) { type = IGMPV3_CHANGE_TO_INCLUDE; skb = add_grec(skb, pmc, type, 1, 0); } pmc->crcount--; if (pmc->crcount == 0) { 
igmpv3_clear_zeros(&pmc->tomb); igmpv3_clear_zeros(&pmc->sources); } } if (pmc->crcount == 0 && !pmc->tomb && !pmc->sources) { if (pmc_prev) pmc_prev->next = pmc_next; else in_dev->mc_tomb = pmc_next; in_dev_put(pmc->interface); kfree_pmc(pmc); } else pmc_prev = pmc; } spin_unlock_bh(&in_dev->mc_tomb_lock); /* change recs */ for_each_pmc_rcu(in_dev, pmc) { spin_lock_bh(&pmc->lock); if (pmc->sfcount[MCAST_EXCLUDE]) { type = IGMPV3_BLOCK_OLD_SOURCES; dtype = IGMPV3_ALLOW_NEW_SOURCES; } else { type = IGMPV3_ALLOW_NEW_SOURCES; dtype = IGMPV3_BLOCK_OLD_SOURCES; } skb = add_grec(skb, pmc, type, 0, 0); skb = add_grec(skb, pmc, dtype, 0, 1); /* deleted sources */ /* filter mode changes */ if (pmc->crcount) { if (pmc->sfmode == MCAST_EXCLUDE) type = IGMPV3_CHANGE_TO_EXCLUDE; else type = IGMPV3_CHANGE_TO_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0); pmc->crcount--; } spin_unlock_bh(&pmc->lock); } rcu_read_unlock(); if (!skb) return; (void) igmpv3_sendpack(skb); } static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, int type) { struct sk_buff *skb; struct iphdr *iph; struct igmphdr *ih; struct rtable *rt; struct net_device *dev = in_dev->dev; struct net *net = dev_net(dev); __be32 group = pmc ? pmc->multiaddr : 0; struct flowi4 fl4; __be32 dst; int hlen, tlen; if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) return igmpv3_send_report(in_dev, pmc); if (ipv4_is_local_multicast(group) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return 0; if (type == IGMP_HOST_LEAVE_MESSAGE) dst = IGMP_ALL_ROUTER; else dst = group; rt = ip_route_output_ports(net, &fl4, NULL, dst, 0, 0, 0, IPPROTO_IGMP, 0, dev->ifindex); if (IS_ERR(rt)) return -1; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; skb = alloc_skb(IGMP_SIZE + hlen + tlen, GFP_ATOMIC); if (!skb) { ip_rt_put(rt); return -1; } skb->priority = TC_PRIO_CONTROL; skb_dst_set(skb, &rt->dst); skb_reserve(skb, hlen); skb_reset_network_header(skb); iph = ip_hdr(skb); skb_put(skb, sizeof(struct iphdr) + 4); iph->version = 4; iph->ihl = (sizeof(struct iphdr)+4)>>2; iph->tos = 0xc0; iph->frag_off = htons(IP_DF); iph->ttl = 1; iph->daddr = dst; iph->saddr = fl4.saddr; iph->protocol = IPPROTO_IGMP; ip_select_ident(net, skb, NULL); ((u8 *)&iph[1])[0] = IPOPT_RA; ((u8 *)&iph[1])[1] = 4; ((u8 *)&iph[1])[2] = 0; ((u8 *)&iph[1])[3] = 0; ih = skb_put(skb, sizeof(struct igmphdr)); ih->type = type; ih->code = 0; ih->csum = 0; ih->group = group; ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr)); return ip_local_out(net, skb->sk, skb); } static void igmp_gq_timer_expire(struct timer_list *t) { struct in_device *in_dev = timer_container_of(in_dev, t, mr_gq_timer); in_dev->mr_gq_running = 0; igmpv3_send_report(in_dev, NULL); in_dev_put(in_dev); } static void igmp_ifc_timer_expire(struct timer_list *t) { struct in_device *in_dev = timer_container_of(in_dev, t, mr_ifc_timer); u32 mr_ifc_count; igmpv3_send_cr(in_dev); restart: mr_ifc_count = READ_ONCE(in_dev->mr_ifc_count); if (mr_ifc_count) { if (cmpxchg(&in_dev->mr_ifc_count, mr_ifc_count, mr_ifc_count - 1) != mr_ifc_count) goto restart; igmp_ifc_start_timer(in_dev, unsolicited_report_interval(in_dev)); } in_dev_put(in_dev); } static void igmp_ifc_event(struct in_device *in_dev) { struct net *net = dev_net(in_dev->dev); if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) return; WRITE_ONCE(in_dev->mr_ifc_count, in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv)); igmp_ifc_start_timer(in_dev, 1); } static void igmp_timer_expire(struct timer_list *t) { struct ip_mc_list *im = 
timer_container_of(im, t, timer); struct in_device *in_dev = im->interface; spin_lock(&im->lock); im->tm_running = 0; if (im->unsolicit_count && --im->unsolicit_count) igmp_start_timer(im, unsolicited_report_interval(in_dev)); im->reporter = 1; spin_unlock(&im->lock); if (IGMP_V1_SEEN(in_dev)) igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT); else if (IGMP_V2_SEEN(in_dev)) igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT); else igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT); ip_ma_put(im); } /* mark EXCLUDE-mode sources */ static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) { struct ip_sf_list *psf; int i, scount; scount = 0; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (scount == nsrcs) break; for (i = 0; i < nsrcs; i++) { /* skip inactive filters */ if (psf->sf_count[MCAST_INCLUDE] || pmc->sfcount[MCAST_EXCLUDE] != psf->sf_count[MCAST_EXCLUDE]) break; if (srcs[i] == psf->sf_inaddr) { scount++; break; } } } pmc->gsquery = 0; if (scount == nsrcs) /* all sources excluded */ return 0; return 1; } static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) { struct ip_sf_list *psf; int i, scount; if (pmc->sfmode == MCAST_EXCLUDE) return igmp_xmarksources(pmc, nsrcs, srcs); /* mark INCLUDE-mode sources */ scount = 0; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (scount == nsrcs) break; for (i = 0; i < nsrcs; i++) if (srcs[i] == psf->sf_inaddr) { psf->sf_gsresp = 1; scount++; break; } } if (!scount) { pmc->gsquery = 0; return 0; } pmc->gsquery = 1; return 1; } /* return true if packet was dropped */ static bool igmp_heard_report(struct in_device *in_dev, __be32 group) { struct ip_mc_list *im; struct net *net = dev_net(in_dev->dev); /* Timers are only set for non-local groups */ if (group == IGMP_ALL_HOSTS) return false; if (ipv4_is_local_multicast(group) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return false; rcu_read_lock(); for_each_pmc_rcu(in_dev, im) { if (im->multiaddr == group) { igmp_stop_timer(im); break; } } rcu_read_unlock(); return false; } /* return true if packet was dropped */ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, int len) { struct igmphdr *ih = igmp_hdr(skb); struct igmpv3_query *ih3 = igmpv3_query_hdr(skb); struct ip_mc_list *im; __be32 group = ih->group; int max_delay; int mark = 0; struct net *net = dev_net(in_dev->dev); if (len == 8) { if (ih->code == 0) { /* Alas, old v1 router presents here. */ max_delay = IGMP_QUERY_RESPONSE_INTERVAL; in_dev->mr_v1_seen = jiffies + (in_dev->mr_qrv * in_dev->mr_qi) + in_dev->mr_qri; group = 0; } else { /* v2 router present */ max_delay = ih->code*(HZ/IGMP_TIMER_SCALE); in_dev->mr_v2_seen = jiffies + (in_dev->mr_qrv * in_dev->mr_qi) + in_dev->mr_qri; } /* cancel the interface change timer */ WRITE_ONCE(in_dev->mr_ifc_count, 0); if (timer_delete(&in_dev->mr_ifc_timer)) __in_dev_put(in_dev); /* clear deleted report items */ igmpv3_clear_delrec(in_dev); } else if (len < 12) { return true; /* ignore bogus packet; freed by caller */ } else if (IGMP_V1_SEEN(in_dev)) { /* This is a v3 query with v1 queriers present */ max_delay = IGMP_QUERY_RESPONSE_INTERVAL; group = 0; } else if (IGMP_V2_SEEN(in_dev)) { /* this is a v3 query with v2 queriers present; * Interpretation of the max_delay code is problematic here. * A real v2 host would use ih_code directly, while v3 has a * different encoding. We use the v3 encoding as more likely * to be intended in a v3 query. 
*/ max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE); if (!max_delay) max_delay = 1; /* can't mod w/ 0 */ } else { /* v3 */ if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) return true; ih3 = igmpv3_query_hdr(skb); if (ih3->nsrcs) { if (!pskb_may_pull(skb, sizeof(struct igmpv3_query) + ntohs(ih3->nsrcs)*sizeof(__be32))) return true; ih3 = igmpv3_query_hdr(skb); } max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE); if (!max_delay) max_delay = 1; /* can't mod w/ 0 */ WRITE_ONCE(in_dev->mr_maxdelay, max_delay); /* RFC3376, 4.1.6. QRV and 4.1.7. QQIC, when the most recently * received value was zero, use the default or statically * configured value. */ in_dev->mr_qrv = ih3->qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); in_dev->mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL; /* RFC3376, 8.3. Query Response Interval: * The number of seconds represented by the [Query Response * Interval] must be less than the [Query Interval]. */ if (in_dev->mr_qri >= in_dev->mr_qi) in_dev->mr_qri = (in_dev->mr_qi/HZ - 1)*HZ; if (!group) { /* general query */ if (ih3->nsrcs) return true; /* no sources allowed */ igmp_gq_start_timer(in_dev); return false; } /* mark sources to include, if group & source-specific */ mark = ih3->nsrcs != 0; } /* * - Start the timers in all of our membership records * that the query applies to for the interface on * which the query arrived excl. those that belong * to a "local" group (224.0.0.X) * - For timers already running check if they need to * be reset. * - Use the igmp->igmp_code field as the maximum * delay possible */ rcu_read_lock(); for_each_pmc_rcu(in_dev, im) { int changed; if (group && group != im->multiaddr) continue; if (im->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(im->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) continue; spin_lock_bh(&im->lock); if (im->tm_running) im->gsquery = im->gsquery && mark; else im->gsquery = mark; changed = !im->gsquery || igmp_marksources(im, ntohs(ih3->nsrcs), ih3->srcs); spin_unlock_bh(&im->lock); if (changed) igmp_mod_timer(im, max_delay); } rcu_read_unlock(); return false; } /* called in rcu_read_lock() section */ int igmp_rcv(struct sk_buff *skb) { /* This basically follows the spec line by line -- see RFC1112 */ struct igmphdr *ih; struct net_device *dev = skb->dev; struct in_device *in_dev; int len = skb->len; bool dropped = true; if (netif_is_l3_master(dev)) { dev = dev_get_by_index_rcu(dev_net(dev), IPCB(skb)->iif); if (!dev) goto drop; } in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto drop; if (!pskb_may_pull(skb, sizeof(struct igmphdr))) goto drop; if (skb_checksum_simple_validate(skb)) goto drop; ih = igmp_hdr(skb); switch (ih->type) { case IGMP_HOST_MEMBERSHIP_QUERY: dropped = igmp_heard_query(in_dev, skb, len); break; case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: /* Is it our report looped back? 
*/ if (rt_is_output_route(skb_rtable(skb))) break; /* don't rely on MC router hearing unicast reports */ if (skb->pkt_type == PACKET_MULTICAST || skb->pkt_type == PACKET_BROADCAST) dropped = igmp_heard_report(in_dev, ih->group); break; case IGMP_PIM: #ifdef CONFIG_IP_PIMSM_V1 return pim_rcv_v1(skb); #endif case IGMPV3_HOST_MEMBERSHIP_REPORT: case IGMP_DVMRP: case IGMP_TRACE: case IGMP_HOST_LEAVE_MESSAGE: case IGMP_MTRACE: case IGMP_MTRACE_RESP: break; default: break; } drop: if (dropped) kfree_skb(skb); else consume_skb(skb); return 0; } #endif /* * Add a filter to a device */ static void ip_mc_filter_add(struct in_device *in_dev, __be32 addr) { char buf[MAX_ADDR_LEN]; struct net_device *dev = in_dev->dev; /* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG. We will get multicast token leakage, when IFF_MULTICAST is changed. This check should be done in ndo_set_rx_mode routine. Something sort of: if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; } --ANK */ if (arp_mc_map(addr, buf, dev, 0) == 0) dev_mc_add(dev, buf); } /* * Remove a filter from a device */ static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr) { char buf[MAX_ADDR_LEN]; struct net_device *dev = in_dev->dev; if (arp_mc_map(addr, buf, dev, 0) == 0) dev_mc_del(dev, buf); } #ifdef CONFIG_IP_MULTICAST /* * deleted ip_mc_list manipulation */ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im, gfp_t gfp) { struct ip_mc_list *pmc; struct net *net = dev_net(in_dev->dev); /* this is an "ip_mc_list" for convenience; only the fields below * are actually used. In particular, the refcnt and users are not * used for management of the delete list. Using the same structure * for deleted items allows change reports to use common code with * non-deleted or query-response MCA's. 
*/ pmc = kzalloc_obj(*pmc, gfp); if (!pmc) return; spin_lock_init(&pmc->lock); spin_lock_bh(&im->lock); pmc->interface = im->interface; in_dev_hold(in_dev); pmc->multiaddr = im->multiaddr; pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); pmc->sfmode = im->sfmode; if (pmc->sfmode == MCAST_INCLUDE) { struct ip_sf_list *psf; pmc->tomb = im->tomb; pmc->sources = im->sources; im->tomb = im->sources = NULL; for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = pmc->crcount; } spin_unlock_bh(&im->lock); spin_lock_bh(&in_dev->mc_tomb_lock); pmc->next = in_dev->mc_tomb; in_dev->mc_tomb = pmc; spin_unlock_bh(&in_dev->mc_tomb_lock); } /* * restore ip_mc_list deleted records */ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im) { struct ip_mc_list *pmc, *pmc_prev; struct ip_sf_list *psf; struct net *net = dev_net(in_dev->dev); __be32 multiaddr = im->multiaddr; spin_lock_bh(&in_dev->mc_tomb_lock); pmc_prev = NULL; for (pmc = in_dev->mc_tomb; pmc; pmc = pmc->next) { if (pmc->multiaddr == multiaddr) break; pmc_prev = pmc; } if (pmc) { if (pmc_prev) pmc_prev->next = pmc->next; else in_dev->mc_tomb = pmc->next; } spin_unlock_bh(&in_dev->mc_tomb_lock); spin_lock_bh(&im->lock); if (pmc) { im->interface = pmc->interface; if (im->sfmode == MCAST_INCLUDE) { swap(im->tomb, pmc->tomb); swap(im->sources, pmc->sources); for (psf = im->sources; psf; psf = psf->sf_next) psf->sf_crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); } else { im->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); } in_dev_put(pmc->interface); kfree_pmc(pmc); } spin_unlock_bh(&im->lock); } /* * flush ip_mc_list deleted records */ static void igmpv3_clear_delrec(struct in_device *in_dev) { struct ip_mc_list *pmc, *nextpmc; spin_lock_bh(&in_dev->mc_tomb_lock); pmc = in_dev->mc_tomb; in_dev->mc_tomb = NULL; spin_unlock_bh(&in_dev->mc_tomb_lock); for (; pmc; pmc = nextpmc) { nextpmc = pmc->next; ip_mc_clear_src(pmc); in_dev_put(pmc->interface); kfree_pmc(pmc); } /* clear dead sources, too */ rcu_read_lock(); for_each_pmc_rcu(in_dev, pmc) { struct ip_sf_list *psf; spin_lock_bh(&pmc->lock); psf = pmc->tomb; pmc->tomb = NULL; spin_unlock_bh(&pmc->lock); ip_sf_list_clear_all(psf); } rcu_read_unlock(); } #endif static void __igmp_group_dropped(struct ip_mc_list *im, gfp_t gfp) { struct in_device *in_dev = im->interface; #ifdef CONFIG_IP_MULTICAST struct net *net = dev_net(in_dev->dev); int reporter; #endif if (im->loaded) { im->loaded = 0; ip_mc_filter_del(in_dev, im->multiaddr); } #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; if (ipv4_is_local_multicast(im->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return; reporter = im->reporter; igmp_stop_timer(im); if (!in_dev->dead) { if (IGMP_V1_SEEN(in_dev)) return; if (IGMP_V2_SEEN(in_dev)) { if (reporter) igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE); return; } /* IGMPv3 */ igmpv3_add_delrec(in_dev, im, gfp); igmp_ifc_event(in_dev); } #endif } static void igmp_group_dropped(struct ip_mc_list *im) { __igmp_group_dropped(im, GFP_KERNEL); } static void igmp_group_added(struct ip_mc_list *im) { struct in_device *in_dev = im->interface; #ifdef CONFIG_IP_MULTICAST struct net *net = dev_net(in_dev->dev); #endif if (im->loaded == 0) { im->loaded = 1; ip_mc_filter_add(in_dev, im->multiaddr); } #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; if (ipv4_is_local_multicast(im->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return; if 
(in_dev->dead) return; im->unsolicit_count = READ_ONCE(net->ipv4.sysctl_igmp_qrv); if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) { spin_lock_bh(&im->lock); igmp_start_timer(im, IGMP_INITIAL_REPORT_DELAY); spin_unlock_bh(&im->lock); return; } /* else, v3 */ /* Based on RFC3376 5.1, for newly added INCLUDE SSM, we should * not send filter-mode change record as the mode should be from * IN() to IN(A). */ if (im->sfmode == MCAST_EXCLUDE) im->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); igmp_ifc_event(in_dev); #endif } /* * Multicast list managers */ static u32 ip_mc_hash(const struct ip_mc_list *im) { return hash_32((__force u32)im->multiaddr, MC_HASH_SZ_LOG); } static void ip_mc_hash_add(struct in_device *in_dev, struct ip_mc_list *im) { struct ip_mc_list __rcu **mc_hash; u32 hash; mc_hash = rtnl_dereference(in_dev->mc_hash); if (mc_hash) { hash = ip_mc_hash(im); im->next_hash = mc_hash[hash]; rcu_assign_pointer(mc_hash[hash], im); return; } /* do not use a hash table for small number of items */ if (in_dev->mc_count < 4) return; mc_hash = kzalloc(sizeof(struct ip_mc_list *) << MC_HASH_SZ_LOG, GFP_KERNEL); if (!mc_hash) return; for_each_pmc_rtnl(in_dev, im) { hash = ip_mc_hash(im); im->next_hash = mc_hash[hash]; RCU_INIT_POINTER(mc_hash[hash], im); } rcu_assign_pointer(in_dev->mc_hash, mc_hash); } static void ip_mc_hash_remove(struct in_device *in_dev, struct ip_mc_list *im) { struct ip_mc_list __rcu **mc_hash = rtnl_dereference(in_dev->mc_hash); struct ip_mc_list *aux; if (!mc_hash) return; mc_hash += ip_mc_hash(im); while ((aux = rtnl_dereference(*mc_hash)) != im) mc_hash = &aux->next_hash; *mc_hash = im->next_hash; } int inet_fill_ifmcaddr(struct sk_buff *skb, struct net_device *dev, const struct ip_mc_list *im, struct inet_fill_args *args) { struct ifa_cacheinfo ci; struct ifaddrmsg *ifm; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, args->portid, args->seq, args->event, sizeof(struct ifaddrmsg), args->flags); if (!nlh) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifa_family = AF_INET; ifm->ifa_prefixlen = 32; ifm->ifa_flags = IFA_F_PERMANENT; ifm->ifa_scope = RT_SCOPE_UNIVERSE; ifm->ifa_index = dev->ifindex; ci.cstamp = (READ_ONCE(im->mca_cstamp) - INITIAL_JIFFIES) * 100UL / HZ; ci.tstamp = ci.cstamp; ci.ifa_prefered = INFINITY_LIFE_TIME; ci.ifa_valid = INFINITY_LIFE_TIME; if (nla_put_in_addr(skb, IFA_MULTICAST, im->multiaddr) < 0 || nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci) < 0) { nlmsg_cancel(skb, nlh); return -EMSGSIZE; } nlmsg_end(skb, nlh); return 0; } static void inet_ifmcaddr_notify(struct net_device *dev, const struct ip_mc_list *im, int event) { struct inet_fill_args fillargs = { .event = event, }; struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOMEM; skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + nla_total_size(sizeof(__be32)) + nla_total_size(sizeof(struct ifa_cacheinfo)), GFP_KERNEL); if (!skb) goto error; err = inet_fill_ifmcaddr(skb, dev, im, &fillargs); if (err < 0) { WARN_ON_ONCE(err == -EMSGSIZE); nlmsg_free(skb); goto error; } rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MCADDR, NULL, GFP_KERNEL); return; error: rtnl_set_sk_err(net, RTNLGRP_IPV4_MCADDR, err); } /* * A socket has joined a multicast group on device dev. 
*/ static void ____ip_mc_inc_group(struct in_device *in_dev, __be32 addr, unsigned int mode, gfp_t gfp) { struct ip_mc_list __rcu **mc_hash; struct ip_mc_list *im; ASSERT_RTNL(); mc_hash = rtnl_dereference(in_dev->mc_hash); if (mc_hash) { u32 hash = hash_32((__force u32)addr, MC_HASH_SZ_LOG); for (im = rtnl_dereference(mc_hash[hash]); im; im = rtnl_dereference(im->next_hash)) { if (im->multiaddr == addr) break; } } else { for_each_pmc_rtnl(in_dev, im) { if (im->multiaddr == addr) break; } } if (im) { im->users++; ip_mc_add_src(in_dev, &addr, mode, 0, NULL, 0); goto out; } im = kzalloc_obj(*im, gfp); if (!im) goto out; im->users = 1; im->interface = in_dev; in_dev_hold(in_dev); im->multiaddr = addr; im->mca_cstamp = jiffies; im->mca_tstamp = im->mca_cstamp; /* initial mode is (EX, empty) */ im->sfmode = mode; im->sfcount[mode] = 1; refcount_set(&im->refcnt, 1); spin_lock_init(&im->lock); #ifdef CONFIG_IP_MULTICAST timer_setup(&im->timer, igmp_timer_expire, 0); #endif im->next_rcu = in_dev->mc_list; in_dev->mc_count++; rcu_assign_pointer(in_dev->mc_list, im); ip_mc_hash_add(in_dev, im); #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, im); #endif igmp_group_added(im); inet_ifmcaddr_notify(in_dev->dev, im, RTM_NEWMULTICAST); if (!in_dev->dead) ip_rt_multicast_event(in_dev); out: return; } void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, gfp_t gfp) { ____ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE, gfp); } EXPORT_SYMBOL(__ip_mc_inc_group); void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) { __ip_mc_inc_group(in_dev, addr, GFP_KERNEL); } EXPORT_SYMBOL(ip_mc_inc_group); static int ip_mc_check_iphdr(struct sk_buff *skb) { const struct iphdr *iph; unsigned int len; unsigned int offset = skb_network_offset(skb) + sizeof(*iph); if (!pskb_may_pull(skb, offset)) return -EINVAL; iph = ip_hdr(skb); if (iph->version != 4 || ip_hdrlen(skb) < sizeof(*iph)) return -EINVAL; offset += ip_hdrlen(skb) - sizeof(*iph); if (!pskb_may_pull(skb, offset)) return -EINVAL; iph = ip_hdr(skb); if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) return -EINVAL; len = skb_network_offset(skb) + ntohs(iph->tot_len); if (skb->len < len || len < offset) return -EINVAL; skb_set_transport_header(skb, offset); return 0; } static int ip_mc_check_igmp_reportv3(struct sk_buff *skb) { unsigned int len = skb_transport_offset(skb); len += sizeof(struct igmpv3_report); return ip_mc_may_pull(skb, len) ? 0 : -EINVAL; } static int ip_mc_check_igmp_query(struct sk_buff *skb) { unsigned int transport_len = ip_transport_len(skb); unsigned int len; /* IGMPv{1,2}? */ if (transport_len != sizeof(struct igmphdr)) { /* or IGMPv3? 
*/ if (transport_len < sizeof(struct igmpv3_query)) return -EINVAL; len = skb_transport_offset(skb) + sizeof(struct igmpv3_query); if (!ip_mc_may_pull(skb, len)) return -EINVAL; } /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer * all-systems destination addresses (224.0.0.1) for general queries */ if (!igmp_hdr(skb)->group && ip_hdr(skb)->daddr != htonl(INADDR_ALLHOSTS_GROUP)) return -EINVAL; return 0; } static int ip_mc_check_igmp_msg(struct sk_buff *skb) { switch (igmp_hdr(skb)->type) { case IGMP_HOST_LEAVE_MESSAGE: case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: return 0; case IGMPV3_HOST_MEMBERSHIP_REPORT: return ip_mc_check_igmp_reportv3(skb); case IGMP_HOST_MEMBERSHIP_QUERY: return ip_mc_check_igmp_query(skb); default: return -ENOMSG; } } static __sum16 ip_mc_validate_checksum(struct sk_buff *skb) { return skb_checksum_simple_validate(skb); } static int ip_mc_check_igmp_csum(struct sk_buff *skb) { unsigned int len = skb_transport_offset(skb) + sizeof(struct igmphdr); unsigned int transport_len = ip_transport_len(skb); struct sk_buff *skb_chk; if (!ip_mc_may_pull(skb, len)) return -EINVAL; skb_chk = skb_checksum_trimmed(skb, transport_len, ip_mc_validate_checksum); if (!skb_chk) return -EINVAL; if (skb_chk != skb) kfree_skb(skb_chk); return 0; } /** * ip_mc_check_igmp - checks whether this is a sane IGMP packet * @skb: the skb to validate * * Checks whether an IPv4 packet is a valid IGMP packet. If so sets * skb transport header accordingly and returns zero. * * -EINVAL: A broken packet was detected, i.e. it violates some internet * standard * -ENOMSG: IP header validation succeeded but it is not an IGMP packet. * -ENOMEM: A memory allocation failure happened. * * Caller needs to set the skb network header and free any returned skb if it * differs from the provided skb. */ int ip_mc_check_igmp(struct sk_buff *skb) { int ret = ip_mc_check_iphdr(skb); if (ret < 0) return ret; if (ip_hdr(skb)->protocol != IPPROTO_IGMP) return -ENOMSG; ret = ip_mc_check_igmp_csum(skb); if (ret < 0) return ret; return ip_mc_check_igmp_msg(skb); } EXPORT_SYMBOL(ip_mc_check_igmp); /* * Resend IGMP JOIN report; used by netdev notifier. 
*/ static void ip_mc_rejoin_groups(struct in_device *in_dev) { #ifdef CONFIG_IP_MULTICAST struct ip_mc_list *im; int type; struct net *net = dev_net(in_dev->dev); ASSERT_RTNL(); for_each_pmc_rtnl(in_dev, im) { if (im->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(im->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) continue; /* a failover is happening and switches * must be notified immediately */ if (IGMP_V1_SEEN(in_dev)) type = IGMP_HOST_MEMBERSHIP_REPORT; else if (IGMP_V2_SEEN(in_dev)) type = IGMPV2_HOST_MEMBERSHIP_REPORT; else type = IGMPV3_HOST_MEMBERSHIP_REPORT; igmp_send_report(in_dev, im, type); } #endif } /* * A socket has left a multicast group on device dev */ void __ip_mc_dec_group(struct in_device *in_dev, __be32 addr, gfp_t gfp) { struct ip_mc_list *i; struct ip_mc_list __rcu **ip; ASSERT_RTNL(); for (ip = &in_dev->mc_list; (i = rtnl_dereference(*ip)) != NULL; ip = &i->next_rcu) { if (i->multiaddr == addr) { if (--i->users == 0) { ip_mc_hash_remove(in_dev, i); *ip = i->next_rcu; in_dev->mc_count--; __igmp_group_dropped(i, gfp); inet_ifmcaddr_notify(in_dev->dev, i, RTM_DELMULTICAST); ip_mc_clear_src(i); if (!in_dev->dead) ip_rt_multicast_event(in_dev); ip_ma_put(i); return; } break; } } } EXPORT_SYMBOL(__ip_mc_dec_group); /* Device changing type */ void ip_mc_unmap(struct in_device *in_dev) { struct ip_mc_list *pmc; ASSERT_RTNL(); for_each_pmc_rtnl(in_dev, pmc) igmp_group_dropped(pmc); } void ip_mc_remap(struct in_device *in_dev) { struct ip_mc_list *pmc; ASSERT_RTNL(); for_each_pmc_rtnl(in_dev, pmc) { #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, pmc); #endif igmp_group_added(pmc); } } /* Device going down */ void ip_mc_down(struct in_device *in_dev) { struct ip_mc_list *pmc; ASSERT_RTNL(); for_each_pmc_rtnl(in_dev, pmc) igmp_group_dropped(pmc); #ifdef CONFIG_IP_MULTICAST WRITE_ONCE(in_dev->mr_ifc_count, 0); if (timer_delete(&in_dev->mr_ifc_timer)) __in_dev_put(in_dev); in_dev->mr_gq_running = 0; if (timer_delete(&in_dev->mr_gq_timer)) __in_dev_put(in_dev); #endif ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS); } #ifdef CONFIG_IP_MULTICAST static void ip_mc_reset(struct in_device *in_dev) { struct net *net = dev_net(in_dev->dev); in_dev->mr_qi = IGMP_QUERY_INTERVAL; in_dev->mr_qri = IGMP_QUERY_RESPONSE_INTERVAL; in_dev->mr_qrv = READ_ONCE(net->ipv4.sysctl_igmp_qrv); } #else static void ip_mc_reset(struct in_device *in_dev) { } #endif void ip_mc_init_dev(struct in_device *in_dev) { ASSERT_RTNL(); #ifdef CONFIG_IP_MULTICAST timer_setup(&in_dev->mr_gq_timer, igmp_gq_timer_expire, 0); timer_setup(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, 0); #endif ip_mc_reset(in_dev); spin_lock_init(&in_dev->mc_tomb_lock); } /* Device going up */ void ip_mc_up(struct in_device *in_dev) { struct ip_mc_list *pmc; ASSERT_RTNL(); ip_mc_reset(in_dev); ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); for_each_pmc_rtnl(in_dev, pmc) { #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, pmc); #endif igmp_group_added(pmc); } } /* * Device is about to be destroyed: clean up. 
*/ void ip_mc_destroy_dev(struct in_device *in_dev) { struct ip_mc_list *i; ASSERT_RTNL(); /* Deactivate timers */ ip_mc_down(in_dev); #ifdef CONFIG_IP_MULTICAST igmpv3_clear_delrec(in_dev); #endif while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) { in_dev->mc_list = i->next_rcu; in_dev->mc_count--; ip_mc_clear_src(i); ip_ma_put(i); } } /* RTNL is locked */ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) { struct net_device *dev = NULL; struct in_device *idev = NULL; if (imr->imr_ifindex) { idev = inetdev_by_index(net, imr->imr_ifindex); return idev; } if (imr->imr_address.s_addr) { dev = __ip_dev_find(net, imr->imr_address.s_addr, false); if (!dev) return NULL; } if (!dev) { struct rtable *rt = ip_route_output(net, imr->imr_multiaddr.s_addr, 0, 0, 0, RT_SCOPE_UNIVERSE); if (!IS_ERR(rt)) { dev = rt->dst.dev; ip_rt_put(rt); } } if (dev) { imr->imr_ifindex = dev->ifindex; idev = __in_dev_get_rtnl(dev); } return idev; } /* * Join a socket to a group */ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, __be32 *psfsrc) { struct ip_sf_list *psf, *psf_prev; int rv = 0; psf_prev = NULL; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == *psfsrc) break; psf_prev = psf; } if (!psf || psf->sf_count[sfmode] == 0) { /* source filter not found, or count wrong => bug */ return -ESRCH; } psf->sf_count[sfmode]--; if (psf->sf_count[sfmode] == 0) { ip_rt_multicast_event(pmc->interface); } if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) { #ifdef CONFIG_IP_MULTICAST struct in_device *in_dev = pmc->interface; struct net *net = dev_net(in_dev->dev); #endif /* no more filters for this source */ if (psf_prev) psf_prev->sf_next = psf->sf_next; else pmc->sources = psf->sf_next; #ifdef CONFIG_IP_MULTICAST if (psf->sf_oldin && !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) { psf->sf_crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); psf->sf_next = pmc->tomb; pmc->tomb = psf; rv = 1; } else #endif kfree(psf); } return rv; } #ifndef CONFIG_IP_MULTICAST #define igmp_ifc_event(x) do { } while (0) #endif static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, int sfcount, __be32 *psfsrc, int delta) { struct ip_mc_list *pmc; int changerec = 0; int i, err; if (!in_dev) return -ENODEV; rcu_read_lock(); for_each_pmc_rcu(in_dev, pmc) { if (*pmca == pmc->multiaddr) break; } if (!pmc) { /* MCA not found?? 
bug */ rcu_read_unlock(); return -ESRCH; } spin_lock_bh(&pmc->lock); rcu_read_unlock(); #ifdef CONFIG_IP_MULTICAST sf_markstate(pmc); #endif if (!delta) { err = -EINVAL; if (!pmc->sfcount[sfmode]) goto out_unlock; pmc->sfcount[sfmode]--; } err = 0; for (i = 0; i < sfcount; i++) { int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]); changerec |= rv > 0; if (!err && rv < 0) err = rv; } if (pmc->sfmode == MCAST_EXCLUDE && pmc->sfcount[MCAST_EXCLUDE] == 0 && pmc->sfcount[MCAST_INCLUDE]) { #ifdef CONFIG_IP_MULTICAST struct ip_sf_list *psf; struct net *net = dev_net(in_dev->dev); #endif /* filter mode change */ pmc->sfmode = MCAST_INCLUDE; #ifdef CONFIG_IP_MULTICAST pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount); for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; igmp_ifc_event(pmc->interface); } else if (sf_setstate(pmc) || changerec) { igmp_ifc_event(pmc->interface); #endif } out_unlock: spin_unlock_bh(&pmc->lock); return err; } /* * Add multicast single-source filter to the interface list */ static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode, __be32 *psfsrc) { struct ip_sf_list *psf, *psf_prev; psf_prev = NULL; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == *psfsrc) break; psf_prev = psf; } if (!psf) { psf = kzalloc_obj(*psf, GFP_ATOMIC); if (!psf) return -ENOBUFS; psf->sf_inaddr = *psfsrc; if (psf_prev) { psf_prev->sf_next = psf; } else pmc->sources = psf; } psf->sf_count[sfmode]++; if (psf->sf_count[sfmode] == 1) { ip_rt_multicast_event(pmc->interface); } return 0; } #ifdef CONFIG_IP_MULTICAST static void sf_markstate(struct ip_mc_list *pmc) { struct ip_sf_list *psf; int mca_xcount = pmc->sfcount[MCAST_EXCLUDE]; for (psf = pmc->sources; psf; psf = psf->sf_next) if (pmc->sfcount[MCAST_EXCLUDE]) { psf->sf_oldin = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; } else psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0; } static int sf_setstate(struct ip_mc_list *pmc) { struct ip_sf_list *psf, *dpsf; int mca_xcount = pmc->sfcount[MCAST_EXCLUDE]; int qrv = pmc->interface->mr_qrv; int new_in, rv; rv = 0; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (pmc->sfcount[MCAST_EXCLUDE]) { new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; } else new_in = psf->sf_count[MCAST_INCLUDE] != 0; if (new_in) { if (!psf->sf_oldin) { struct ip_sf_list *prev = NULL; for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) { if (dpsf->sf_inaddr == psf->sf_inaddr) break; prev = dpsf; } if (dpsf) { if (prev) prev->sf_next = dpsf->sf_next; else pmc->tomb = dpsf->sf_next; kfree(dpsf); } psf->sf_crcount = qrv; rv++; } } else if (psf->sf_oldin) { psf->sf_crcount = 0; /* * add or update "delete" records if an active filter * is now inactive */ for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) if (dpsf->sf_inaddr == psf->sf_inaddr) break; if (!dpsf) { dpsf = kmalloc_obj(*dpsf, GFP_ATOMIC); if (!dpsf) continue; *dpsf = *psf; /* pmc->lock held by callers */ dpsf->sf_next = pmc->tomb; pmc->tomb = dpsf; } dpsf->sf_crcount = qrv; rv++; } } return rv; } #endif /* * Add multicast source filter list to the interface list */ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, int sfcount, __be32 *psfsrc, int delta) { struct ip_mc_list *pmc; int isexclude; int i, err; if (!in_dev) return -ENODEV; rcu_read_lock(); for_each_pmc_rcu(in_dev, pmc) { if (*pmca == pmc->multiaddr) break; } if (!pmc) { /* MCA not found?? 
bug */ rcu_read_unlock(); return -ESRCH; } spin_lock_bh(&pmc->lock); rcu_read_unlock(); #ifdef CONFIG_IP_MULTICAST sf_markstate(pmc); #endif isexclude = pmc->sfmode == MCAST_EXCLUDE; if (!delta) pmc->sfcount[sfmode]++; err = 0; for (i = 0; i < sfcount; i++) { err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i]); if (err) break; } if (err) { int j; if (!delta) pmc->sfcount[sfmode]--; for (j = 0; j < i; j++) (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]); } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) { #ifdef CONFIG_IP_MULTICAST struct ip_sf_list *psf; struct net *net = dev_net(pmc->interface->dev); in_dev = pmc->interface; #endif /* filter mode change */ if (pmc->sfcount[MCAST_EXCLUDE]) pmc->sfmode = MCAST_EXCLUDE; else if (pmc->sfcount[MCAST_INCLUDE]) pmc->sfmode = MCAST_INCLUDE; #ifdef CONFIG_IP_MULTICAST /* else no filters; keep old mode for reports */ pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount); for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; igmp_ifc_event(in_dev); } else if (sf_setstate(pmc)) { igmp_ifc_event(in_dev); #endif } spin_unlock_bh(&pmc->lock); return err; } static void ip_mc_clear_src(struct ip_mc_list *pmc) { struct ip_sf_list *tomb, *sources; spin_lock_bh(&pmc->lock); tomb = pmc->tomb; pmc->tomb = NULL; sources = pmc->sources; pmc->sources = NULL; pmc->sfmode = MCAST_EXCLUDE; pmc->sfcount[MCAST_INCLUDE] = 0; pmc->sfcount[MCAST_EXCLUDE] = 1; spin_unlock_bh(&pmc->lock); ip_sf_list_clear_all(tomb); ip_sf_list_clear_all(sources); } /* Join a multicast group */ static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr, unsigned int mode) { __be32 addr = imr->imr_multiaddr.s_addr; struct ip_mc_socklist *iml, *i; struct in_device *in_dev; struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); int ifindex; int count = 0; int err; ASSERT_RTNL(); if (!ipv4_is_multicast(addr)) return -EINVAL; in_dev = ip_mc_find_dev(net, imr); if (!in_dev) { err = -ENODEV; goto done; } err = -EADDRINUSE; ifindex = imr->imr_ifindex; for_each_pmc_rtnl(inet, i) { if (i->multi.imr_multiaddr.s_addr == addr && i->multi.imr_ifindex == ifindex) goto done; count++; } err = -ENOBUFS; if (count >= READ_ONCE(net->ipv4.sysctl_igmp_max_memberships)) goto done; iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL); if (!iml) goto done; memcpy(&iml->multi, imr, sizeof(*imr)); iml->next_rcu = inet->mc_list; iml->sflist = NULL; iml->sfmode = mode; rcu_assign_pointer(inet->mc_list, iml); ____ip_mc_inc_group(in_dev, addr, mode, GFP_KERNEL); err = 0; done: return err; } /* Join ASM (Any-Source Multicast) group */ int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) { return __ip_mc_join_group(sk, imr, MCAST_EXCLUDE); } EXPORT_SYMBOL(ip_mc_join_group); /* Join SSM (Source-Specific Multicast) group */ int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr, unsigned int mode) { return __ip_mc_join_group(sk, imr, mode); } static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, struct in_device *in_dev) { struct ip_sf_socklist *psf = rtnl_dereference(iml->sflist); int err; if (!psf) { /* any-source empty exclude case */ return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, iml->sfmode, 0, NULL, 0); } err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, iml->sfmode, psf->sl_count, psf->sl_addr, 0); RCU_INIT_POINTER(iml->sflist, NULL); /* decrease mem now to avoid the memleak warning */ atomic_sub(struct_size(psf, sl_addr, psf->sl_max), &sk->sk_omem_alloc); 
kfree_rcu(psf, rcu); return err; } int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) { struct inet_sock *inet = inet_sk(sk); struct ip_mc_socklist *iml; struct ip_mc_socklist __rcu **imlp; struct in_device *in_dev; struct net *net = sock_net(sk); __be32 group = imr->imr_multiaddr.s_addr; u32 ifindex; int ret = -EADDRNOTAVAIL; ASSERT_RTNL(); in_dev = ip_mc_find_dev(net, imr); if (!imr->imr_ifindex && !imr->imr_address.s_addr && !in_dev) { ret = -ENODEV; goto out; } ifindex = imr->imr_ifindex; for (imlp = &inet->mc_list; (iml = rtnl_dereference(*imlp)) != NULL; imlp = &iml->next_rcu) { if (iml->multi.imr_multiaddr.s_addr != group) continue; if (ifindex) { if (iml->multi.imr_ifindex != ifindex) continue; } else if (imr->imr_address.s_addr && imr->imr_address.s_addr != iml->multi.imr_address.s_addr) continue; (void) ip_mc_leave_src(sk, iml, in_dev); *imlp = iml->next_rcu; if (in_dev) ip_mc_dec_group(in_dev, group); /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); kfree_rcu(iml, rcu); return 0; } out: return ret; } EXPORT_SYMBOL(ip_mc_leave_group); int ip_mc_source(int add, int omode, struct sock *sk, struct ip_mreq_source *mreqs, int ifindex) { int err; struct ip_mreqn imr; __be32 addr = mreqs->imr_multiaddr; struct ip_mc_socklist *pmc; struct in_device *in_dev = NULL; struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *psl; struct net *net = sock_net(sk); int leavegroup = 0; int i, j, rv; if (!ipv4_is_multicast(addr)) return -EINVAL; ASSERT_RTNL(); imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr; imr.imr_address.s_addr = mreqs->imr_interface; imr.imr_ifindex = ifindex; in_dev = ip_mc_find_dev(net, &imr); if (!in_dev) { err = -ENODEV; goto done; } err = -EADDRNOTAVAIL; for_each_pmc_rtnl(inet, pmc) { if ((pmc->multi.imr_multiaddr.s_addr == imr.imr_multiaddr.s_addr) && (pmc->multi.imr_ifindex == imr.imr_ifindex)) break; } if (!pmc) { /* must have a prior join */ err = -EINVAL; goto done; } /* if a source filter was set, must be the same mode as before */ if (pmc->sflist) { if (pmc->sfmode != omode) { err = -EINVAL; goto done; } } else if (pmc->sfmode != omode) { /* allow mode switches for empty-set filters */ ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0); ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, pmc->sfmode, 0, NULL, 0); pmc->sfmode = omode; } psl = rtnl_dereference(pmc->sflist); if (!add) { if (!psl) goto done; /* err = -EADDRNOTAVAIL */ rv = !0; for (i = 0; i < psl->sl_count; i++) { rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, sizeof(__be32)); if (rv == 0) break; } if (rv) /* source not found */ goto done; /* err = -EADDRNOTAVAIL */ /* special case - (INCLUDE, empty) == LEAVE_GROUP */ if (psl->sl_count == 1 && omode == MCAST_INCLUDE) { leavegroup = 1; goto done; } /* update the interface filter */ ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1, &mreqs->imr_sourceaddr, 1); for (j = i+1; j < psl->sl_count; j++) psl->sl_addr[j-1] = psl->sl_addr[j]; psl->sl_count--; err = 0; goto done; } /* else, add a new source to the filter */ if (psl && psl->sl_count >= READ_ONCE(net->ipv4.sysctl_igmp_max_msf)) { err = -ENOBUFS; goto done; } if (!psl || psl->sl_count == psl->sl_max) { struct ip_sf_socklist *newpsl; int count = IP_SFBLOCK; if (psl) count += psl->sl_max; newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, count), GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; } newpsl->sl_max = count; newpsl->sl_count = count - IP_SFBLOCK; if (psl) { for (i = 0; i < psl->sl_count; i++) 
newpsl->sl_addr[i] = psl->sl_addr[i]; /* decrease mem now to avoid the memleak warning */ atomic_sub(struct_size(psl, sl_addr, psl->sl_max), &sk->sk_omem_alloc); } rcu_assign_pointer(pmc->sflist, newpsl); if (psl) kfree_rcu(psl, rcu); psl = newpsl; } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ for (i = 0; i < psl->sl_count; i++) { rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, sizeof(__be32)); if (rv == 0) break; } if (rv == 0) /* address already there is an error */ goto done; for (j = psl->sl_count-1; j >= i; j--) psl->sl_addr[j+1] = psl->sl_addr[j]; psl->sl_addr[i] = mreqs->imr_sourceaddr; psl->sl_count++; err = 0; /* update the interface list */ ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1, &mreqs->imr_sourceaddr, 1); done: if (leavegroup) err = ip_mc_leave_group(sk, &imr); return err; } int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) { int err = 0; struct ip_mreqn imr; __be32 addr = msf->imsf_multiaddr; struct ip_mc_socklist *pmc; struct in_device *in_dev; struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *newpsl, *psl; struct net *net = sock_net(sk); int leavegroup = 0; if (!ipv4_is_multicast(addr)) return -EINVAL; if (msf->imsf_fmode != MCAST_INCLUDE && msf->imsf_fmode != MCAST_EXCLUDE) return -EINVAL; ASSERT_RTNL(); imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; imr.imr_address.s_addr = msf->imsf_interface; imr.imr_ifindex = ifindex; in_dev = ip_mc_find_dev(net, &imr); if (!in_dev) { err = -ENODEV; goto done; } /* special case - (INCLUDE, empty) == LEAVE_GROUP */ if (msf->imsf_fmode == MCAST_INCLUDE && msf->imsf_numsrc == 0) { leavegroup = 1; goto done; } for_each_pmc_rtnl(inet, pmc) { if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && pmc->multi.imr_ifindex == imr.imr_ifindex) break; } if (!pmc) { /* must have a prior join */ err = -EINVAL; goto done; } if (msf->imsf_numsrc) { newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, msf->imsf_numsrc), GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; } newpsl->sl_max = newpsl->sl_count = msf->imsf_numsrc; memcpy(newpsl->sl_addr, msf->imsf_slist_flex, flex_array_size(msf, imsf_slist_flex, msf->imsf_numsrc)); err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr, msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0); if (err) { sock_kfree_s(sk, newpsl, struct_size(newpsl, sl_addr, newpsl->sl_max)); goto done; } } else { newpsl = NULL; (void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr, msf->imsf_fmode, 0, NULL, 0); } psl = rtnl_dereference(pmc->sflist); if (psl) { (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, psl->sl_count, psl->sl_addr, 0); /* decrease mem now to avoid the memleak warning */ atomic_sub(struct_size(psl, sl_addr, psl->sl_max), &sk->sk_omem_alloc); } else { (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, 0, NULL, 0); } rcu_assign_pointer(pmc->sflist, newpsl); if (psl) kfree_rcu(psl, rcu); pmc->sfmode = msf->imsf_fmode; err = 0; done: if (leavegroup) err = ip_mc_leave_group(sk, &imr); return err; } int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, sockptr_t optval, sockptr_t optlen) { int err, len, count, copycount, msf_size; struct ip_mreqn imr; __be32 addr = msf->imsf_multiaddr; struct ip_mc_socklist *pmc; struct in_device *in_dev; struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *psl; struct net *net = sock_net(sk); ASSERT_RTNL(); if (!ipv4_is_multicast(addr)) return -EINVAL; imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; imr.imr_address.s_addr = msf->imsf_interface; imr.imr_ifindex = 0; 
in_dev = ip_mc_find_dev(net, &imr); if (!in_dev) { err = -ENODEV; goto done; } err = -EADDRNOTAVAIL; for_each_pmc_rtnl(inet, pmc) { if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && pmc->multi.imr_ifindex == imr.imr_ifindex) break; } if (!pmc) /* must have a prior join */ goto done; msf->imsf_fmode = pmc->sfmode; psl = rtnl_dereference(pmc->sflist); if (!psl) { count = 0; } else { count = psl->sl_count; } copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc; len = flex_array_size(psl, sl_addr, copycount); msf->imsf_numsrc = count; msf_size = IP_MSFILTER_SIZE(copycount); if (copy_to_sockptr(optlen, &msf_size, sizeof(int)) || copy_to_sockptr(optval, msf, IP_MSFILTER_SIZE(0))) { return -EFAULT; } if (len && copy_to_sockptr_offset(optval, offsetof(struct ip_msfilter, imsf_slist_flex), psl->sl_addr, len)) return -EFAULT; return 0; done: return err; } int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, sockptr_t optval, size_t ss_offset) { int i, count, copycount; struct sockaddr_in *psin; __be32 addr; struct ip_mc_socklist *pmc; struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *psl; ASSERT_RTNL(); psin = (struct sockaddr_in *)&gsf->gf_group; if (psin->sin_family != AF_INET) return -EINVAL; addr = psin->sin_addr.s_addr; if (!ipv4_is_multicast(addr)) return -EINVAL; for_each_pmc_rtnl(inet, pmc) { if (pmc->multi.imr_multiaddr.s_addr == addr && pmc->multi.imr_ifindex == gsf->gf_interface) break; } if (!pmc) /* must have a prior join */ return -EADDRNOTAVAIL; gsf->gf_fmode = pmc->sfmode; psl = rtnl_dereference(pmc->sflist); count = psl ? psl->sl_count : 0; copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; gsf->gf_numsrc = count; for (i = 0; i < copycount; i++) { struct sockaddr_storage ss; psin = (struct sockaddr_in *)&ss; memset(&ss, 0, sizeof(ss)); psin->sin_family = AF_INET; psin->sin_addr.s_addr = psl->sl_addr[i]; if (copy_to_sockptr_offset(optval, ss_offset, &ss, sizeof(ss))) return -EFAULT; ss_offset += sizeof(ss); } return 0; } /* * check if a multicast source filter allows delivery for a given <src,dst,intf> */ int ip_mc_sf_allow(const struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif, int sdif) { const struct inet_sock *inet = inet_sk(sk); struct ip_mc_socklist *pmc; struct ip_sf_socklist *psl; int i; int ret; ret = 1; if (!ipv4_is_multicast(loc_addr)) goto out; rcu_read_lock(); for_each_pmc_rcu(inet, pmc) { if (pmc->multi.imr_multiaddr.s_addr == loc_addr && (pmc->multi.imr_ifindex == dif || (sdif && pmc->multi.imr_ifindex == sdif))) break; } ret = inet_test_bit(MC_ALL, sk); if (!pmc) goto unlock; psl = rcu_dereference(pmc->sflist); ret = (pmc->sfmode == MCAST_EXCLUDE); if (!psl) goto unlock; for (i = 0; i < psl->sl_count; i++) { if (psl->sl_addr[i] == rmt_addr) break; } ret = 0; if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count) goto unlock; if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) goto unlock; ret = 1; unlock: rcu_read_unlock(); out: return ret; } /* * A socket is closing. 
*/ void ip_mc_drop_socket(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); struct ip_mc_socklist *iml; struct net *net = sock_net(sk); if (!inet->mc_list) return; rtnl_lock(); while ((iml = rtnl_dereference(inet->mc_list)) != NULL) { struct in_device *in_dev; inet->mc_list = iml->next_rcu; in_dev = inetdev_by_index(net, iml->multi.imr_ifindex); (void) ip_mc_leave_src(sk, iml, in_dev); if (in_dev) ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); kfree_rcu(iml, rcu); } rtnl_unlock(); } /* called with rcu_read_lock() */ int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u8 proto) { struct ip_mc_list *im; struct ip_mc_list __rcu **mc_hash; struct ip_sf_list *psf; int rv = 0; mc_hash = rcu_dereference(in_dev->mc_hash); if (mc_hash) { u32 hash = hash_32((__force u32)mc_addr, MC_HASH_SZ_LOG); for (im = rcu_dereference(mc_hash[hash]); im != NULL; im = rcu_dereference(im->next_hash)) { if (im->multiaddr == mc_addr) break; } } else { for_each_pmc_rcu(in_dev, im) { if (im->multiaddr == mc_addr) break; } } if (im && proto == IPPROTO_IGMP) { rv = 1; } else if (im) { if (src_addr) { spin_lock_bh(&im->lock); for (psf = im->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == src_addr) break; } if (psf) rv = psf->sf_count[MCAST_INCLUDE] || psf->sf_count[MCAST_EXCLUDE] != im->sfcount[MCAST_EXCLUDE]; else rv = im->sfcount[MCAST_EXCLUDE] != 0; spin_unlock_bh(&im->lock); } else rv = 1; /* unspecified source; tentatively allow */ } return rv; } #if defined(CONFIG_PROC_FS) struct igmp_mc_iter_state { struct seq_net_private p; struct net_device *dev; struct in_device *in_dev; }; #define igmp_mc_seq_private(seq) ((struct igmp_mc_iter_state *)(seq)->private) static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq) { struct net *net = seq_file_net(seq); struct ip_mc_list *im = NULL; struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); state->in_dev = NULL; for_each_netdev_rcu(net, state->dev) { struct in_device *in_dev; in_dev = __in_dev_get_rcu(state->dev); if (!in_dev) continue; im = rcu_dereference(in_dev->mc_list); if (im) { state->in_dev = in_dev; break; } } return im; } static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im) { struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); im = rcu_dereference(im->next_rcu); while (!im) { state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->in_dev = NULL; break; } state->in_dev = __in_dev_get_rcu(state->dev); if (!state->in_dev) continue; im = rcu_dereference(state->in_dev->mc_list); } return im; } static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos) { struct ip_mc_list *im = igmp_mc_get_first(seq); if (im) while (pos && (im = igmp_mc_get_next(seq, im)) != NULL) --pos; return pos ? NULL : im; } static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { rcu_read_lock(); return *pos ? 
igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip_mc_list *im; if (v == SEQ_START_TOKEN) im = igmp_mc_get_first(seq); else im = igmp_mc_get_next(seq, v); ++*pos; return im; } static void igmp_mc_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); state->in_dev = NULL; state->dev = NULL; rcu_read_unlock(); } static int igmp_mc_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_puts(seq, "Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n"); else { struct ip_mc_list *im = v; struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); char *querier; long delta; #ifdef CONFIG_IP_MULTICAST querier = IGMP_V1_SEEN(state->in_dev) ? "V1" : IGMP_V2_SEEN(state->in_dev) ? "V2" : "V3"; #else querier = "NONE"; #endif if (rcu_access_pointer(state->in_dev->mc_list) == im) { seq_printf(seq, "%d\t%-10s: %5d %7s\n", state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier); } delta = im->timer.expires - jiffies; seq_printf(seq, "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n", im->multiaddr, im->users, im->tm_running, im->tm_running ? jiffies_delta_to_clock_t(delta) : 0, im->reporter); } return 0; } static const struct seq_operations igmp_mc_seq_ops = { .start = igmp_mc_seq_start, .next = igmp_mc_seq_next, .stop = igmp_mc_seq_stop, .show = igmp_mc_seq_show, }; struct igmp_mcf_iter_state { struct seq_net_private p; struct net_device *dev; struct in_device *idev; struct ip_mc_list *im; }; #define igmp_mcf_seq_private(seq) ((struct igmp_mcf_iter_state *)(seq)->private) static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq) { struct net *net = seq_file_net(seq); struct ip_sf_list *psf = NULL; struct ip_mc_list *im = NULL; struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); state->idev = NULL; state->im = NULL; for_each_netdev_rcu(net, state->dev) { struct in_device *idev; idev = __in_dev_get_rcu(state->dev); if (unlikely(!idev)) continue; im = rcu_dereference(idev->mc_list); if (likely(im)) { spin_lock_bh(&im->lock); psf = im->sources; if (likely(psf)) { state->im = im; state->idev = idev; break; } spin_unlock_bh(&im->lock); } } return psf; } static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_list *psf) { struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); psf = psf->sf_next; while (!psf) { spin_unlock_bh(&state->im->lock); state->im = state->im->next; while (!state->im) { state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->idev = NULL; goto out; } state->idev = __in_dev_get_rcu(state->dev); if (!state->idev) continue; state->im = rcu_dereference(state->idev->mc_list); } spin_lock_bh(&state->im->lock); psf = state->im->sources; } out: return psf; } static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos) { struct ip_sf_list *psf = igmp_mcf_get_first(seq); if (psf) while (pos && (psf = igmp_mcf_get_next(seq, psf)) != NULL) --pos; return pos ? NULL : psf; } static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { rcu_read_lock(); return *pos ? 
igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip_sf_list *psf; if (v == SEQ_START_TOKEN) psf = igmp_mcf_get_first(seq); else psf = igmp_mcf_get_next(seq, v); ++*pos; return psf; } static void igmp_mcf_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); if (likely(state->im)) { spin_unlock_bh(&state->im->lock); state->im = NULL; } state->idev = NULL; state->dev = NULL; rcu_read_unlock(); } static int igmp_mcf_seq_show(struct seq_file *seq, void *v) { struct ip_sf_list *psf = v; struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); if (v == SEQ_START_TOKEN) { seq_puts(seq, "Idx Device MCA SRC INC EXC\n"); } else { seq_printf(seq, "%3d %6.6s 0x%08x " "0x%08x %6lu %6lu\n", state->dev->ifindex, state->dev->name, ntohl(state->im->multiaddr), ntohl(psf->sf_inaddr), psf->sf_count[MCAST_INCLUDE], psf->sf_count[MCAST_EXCLUDE]); } return 0; } static const struct seq_operations igmp_mcf_seq_ops = { .start = igmp_mcf_seq_start, .next = igmp_mcf_seq_next, .stop = igmp_mcf_seq_stop, .show = igmp_mcf_seq_show, }; static int __net_init igmp_net_init(struct net *net) { struct proc_dir_entry *pde; int err; pde = proc_create_net("igmp", 0444, net->proc_net, &igmp_mc_seq_ops, sizeof(struct igmp_mc_iter_state)); if (!pde) goto out_igmp; pde = proc_create_net("mcfilter", 0444, net->proc_net, &igmp_mcf_seq_ops, sizeof(struct igmp_mcf_iter_state)); if (!pde) goto out_mcfilter; err = inet_ctl_sock_create(&net->ipv4.mc_autojoin_sk, AF_INET, SOCK_DGRAM, 0, net); if (err < 0) { pr_err("Failed to initialize the IGMP autojoin socket (err %d)\n", err); goto out_sock; } return 0; out_sock: remove_proc_entry("mcfilter", net->proc_net); out_mcfilter: remove_proc_entry("igmp", net->proc_net); out_igmp: return -ENOMEM; } static void __net_exit igmp_net_exit(struct net *net) { remove_proc_entry("mcfilter", net->proc_net); remove_proc_entry("igmp", net->proc_net); inet_ctl_sock_destroy(net->ipv4.mc_autojoin_sk); } static struct pernet_operations igmp_net_ops = { .init = igmp_net_init, .exit = igmp_net_exit, }; #endif static int igmp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct in_device *in_dev; switch (event) { case NETDEV_RESEND_IGMP: in_dev = __in_dev_get_rtnl(dev); if (in_dev) ip_mc_rejoin_groups(in_dev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block igmp_notifier = { .notifier_call = igmp_netdev_event, }; int __init igmp_mc_init(void) { #if defined(CONFIG_PROC_FS) int err; err = register_pernet_subsys(&igmp_net_ops); if (err) return err; err = register_netdevice_notifier(&igmp_notifier); if (err) goto reg_notif_fail; return 0; reg_notif_fail: unregister_pernet_subsys(&igmp_net_ops); return err; #else return register_netdevice_notifier(&igmp_notifier); #endif } |
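/*
 * Illustrative sketch only (not part of igmp.c): the source-filter paths
 * above (ip_mc_source(), ip_mc_msfilter(), ...) are normally reached from
 * user space through IPPROTO_IP socket options.  A minimal, hedged example
 * of joining an IPv4 source-specific multicast group; the group, source
 * address and interface choice are placeholders.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static int join_ssm_group(int udp_fd)
{
	struct ip_mreq_source mr;

	memset(&mr, 0, sizeof(mr));
	/* 232.0.0.0/8 is the IPv4 source-specific multicast range */
	inet_pton(AF_INET, "232.1.1.1", &mr.imr_multiaddr);
	inet_pton(AF_INET, "192.0.2.10", &mr.imr_sourceaddr);
	mr.imr_interface.s_addr = htonl(INADDR_ANY);

	/* handled in the kernel by ip_mc_source() via the setsockopt() path */
	return setsockopt(udp_fd, IPPROTO_IP, IP_ADD_SOURCE_MEMBERSHIP,
			  &mr, sizeof(mr));
}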
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_GRE_H
#define __LINUX_GRE_H

#include <linux/skbuff.h>
#include <net/ip_tunnels.h>

struct gre_base_hdr {
	__be16 flags;
	__be16 protocol;
} __packed;

struct gre_full_hdr {
	struct gre_base_hdr fixed_header;
	__be16 csum;
	__be16 reserved1;
	__be32 key;
	__be32 seq;
} __packed;
#define GRE_HEADER_SECTION 4

#define GREPROTO_CISCO		0
#define GREPROTO_PPTP		1
#define GREPROTO_MAX		2
#define GRE_IP_PROTO_MAX	2

struct gre_protocol {
	int  (*handler)(struct sk_buff *skb);
	void (*err_handler)(struct sk_buff *skb, u32 info);
};

int gre_add_protocol(const struct gre_protocol *proto, u8 version);
int gre_del_protocol(const struct gre_protocol *proto, u8 version);

struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
					u8 name_assign_type);
int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
		     bool *csum_err, __be16 proto, int nhs);

static inline bool netif_is_gretap(const struct net_device *dev)
{
	return dev->rtnl_link_ops &&
	       !strcmp(dev->rtnl_link_ops->kind, "gretap");
}

static inline bool netif_is_ip6gretap(const struct net_device *dev)
{
	return dev->rtnl_link_ops &&
	       !strcmp(dev->rtnl_link_ops->kind, "ip6gretap");
}

static inline int gre_calc_hlen(const unsigned long *o_flags)
{
	int addend = 4;

	if (test_bit(IP_TUNNEL_CSUM_BIT, o_flags))
		addend += 4;
	if (test_bit(IP_TUNNEL_KEY_BIT, o_flags))
		addend += 4;
	if (test_bit(IP_TUNNEL_SEQ_BIT, o_flags))
		addend += 4;
	return addend;
}

static inline void gre_flags_to_tnl_flags(unsigned long *dst, __be16 flags)
{
	IP_TUNNEL_DECLARE_FLAGS(res) = { };

	__assign_bit(IP_TUNNEL_CSUM_BIT, res, flags & GRE_CSUM);
	__assign_bit(IP_TUNNEL_ROUTING_BIT, res, flags & GRE_ROUTING);
	__assign_bit(IP_TUNNEL_KEY_BIT, res, flags & GRE_KEY);
	__assign_bit(IP_TUNNEL_SEQ_BIT, res, flags & GRE_SEQ);
	__assign_bit(IP_TUNNEL_STRICT_BIT, res, flags & GRE_STRICT);
	__assign_bit(IP_TUNNEL_REC_BIT, res, flags & GRE_REC);
	__assign_bit(IP_TUNNEL_VERSION_BIT, res, flags & GRE_VERSION);

	ip_tunnel_flags_copy(dst, res);
}

static inline __be16 gre_tnl_flags_to_gre_flags(const unsigned long *tflags)
{
	__be16 flags = 0;

	if (test_bit(IP_TUNNEL_CSUM_BIT, tflags))
		flags |= GRE_CSUM;
	if (test_bit(IP_TUNNEL_ROUTING_BIT, tflags))
		flags |= GRE_ROUTING;
	if (test_bit(IP_TUNNEL_KEY_BIT, tflags))
		flags |= GRE_KEY;
	if (test_bit(IP_TUNNEL_SEQ_BIT, tflags))
		flags |= GRE_SEQ;
	if (test_bit(IP_TUNNEL_STRICT_BIT, tflags))
		flags |= GRE_STRICT;
	if (test_bit(IP_TUNNEL_REC_BIT, tflags))
		flags |= GRE_REC;
	if (test_bit(IP_TUNNEL_VERSION_BIT, tflags))
		flags |= GRE_VERSION;

	return flags;
}

static inline void gre_build_header(struct sk_buff *skb, int hdr_len,
				    const unsigned long *flags, __be16 proto,
				    __be32 key, __be32 seq)
{
	IP_TUNNEL_DECLARE_FLAGS(cond) = { };
	struct gre_base_hdr *greh;

	skb_push(skb, hdr_len);

	skb_set_inner_protocol(skb, proto);
	skb_reset_transport_header(skb);
	greh = (struct gre_base_hdr *)skb->data;
	greh->flags = gre_tnl_flags_to_gre_flags(flags);
	greh->protocol = proto;

	__set_bit(IP_TUNNEL_KEY_BIT, cond);
	__set_bit(IP_TUNNEL_CSUM_BIT, cond);
	__set_bit(IP_TUNNEL_SEQ_BIT, cond);

	if (ip_tunnel_flags_intersect(flags, cond)) {
		__be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);

		if (test_bit(IP_TUNNEL_SEQ_BIT, flags)) {
			*ptr = seq;
			ptr--;
		}
		if (test_bit(IP_TUNNEL_KEY_BIT, flags)) {
			*ptr = key;
			ptr--;
		}
		if (test_bit(IP_TUNNEL_CSUM_BIT, flags) &&
		    !(skb_shinfo(skb)->gso_type &
		      (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) {
			*ptr = 0;
			if (skb->ip_summed == CHECKSUM_PARTIAL) {
				*(__sum16 *)ptr = csum_fold(lco_csum(skb));
			} else {
				skb->ip_summed = CHECKSUM_PARTIAL;
				skb->csum_start = skb_transport_header(skb) - skb->head;
				skb->csum_offset = sizeof(*greh);
			}
		}
	}
}

#endif
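/*
 * Minimal usage sketch (illustrative only, not part of this header): how a
 * caller might build the tunnel-flag bitmap and size the GRE header with
 * gre_calc_hlen() before reserving room and calling gre_build_header().
 * The flag choice here is a placeholder.
 */
#include <net/gre.h>
#include <net/ip_tunnels.h>

static int example_gre_header_len(void)
{
	IP_TUNNEL_DECLARE_FLAGS(o_flags) = { };

	/* key and checksum present, no sequence numbers */
	__set_bit(IP_TUNNEL_KEY_BIT, o_flags);
	__set_bit(IP_TUNNEL_CSUM_BIT, o_flags);

	/* 4-byte base header + 4 for csum/reserved + 4 for key = 12 */
	return gre_calc_hlen(o_flags);
}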
// SPDX-License-Identifier: GPL-2.0-only /* * xfrm_policy.c * * Changes: * Mitsuru KANDA @USAGI * Kazunori MIYAZAWA @USAGI * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * IPv6 support * Kazunori MIYAZAWA @USAGI * YOSHIFUJI Hideaki * Split up af-specific portion * Derek Atkins <derek@ihtfp.com> Add the post_input processor * */ #include <linux/err.h> #include <linux/slab.h> #include <linux/kmod.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/workqueue.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/netfilter.h> #include <linux/module.h> #include <linux/cache.h> #include <linux/cpu.h> #include <linux/audit.h> #include <linux/rhashtable.h> #include <linux/if_tunnel.h> #include <linux/icmp.h> #include <net/dst.h> #include <net/flow.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/ip.h> #include <net/gre.h> #if IS_ENABLED(CONFIG_IPV6_MIP6) #include <net/mip6.h> #endif #ifdef CONFIG_XFRM_STATISTICS #include <net/snmp.h> #endif #ifdef CONFIG_XFRM_ESPINTCP #include <net/espintcp.h> #endif #include <net/inet_dscp.h> #include "xfrm_hash.h" #define XFRM_QUEUE_TMO_MIN ((unsigned)(HZ/10)) #define XFRM_QUEUE_TMO_MAX ((unsigned)(60*HZ)) #define XFRM_MAX_QUEUE_LEN 100 struct xfrm_flo { struct dst_entry *dst_orig; u8 flags; }; /* prefixes smaller than this are stored in lists, not trees. */ #define INEXACT_PREFIXLEN_IPV4 16 #define INEXACT_PREFIXLEN_IPV6 48 struct xfrm_pol_inexact_node { struct rb_node node; union { xfrm_address_t addr; struct rcu_head rcu; }; u8 prefixlen; struct rb_root root; /* the policies matching this node, can be empty list */ struct hlist_head hhead; }; /* xfrm inexact policy search tree: * xfrm_pol_inexact_bin = hash(dir,type,family,if_id); * | * +---- root_d: sorted by daddr:prefix * | | * | xfrm_pol_inexact_node * | | * | +- root: sorted by saddr/prefix * | | | * | | xfrm_pol_inexact_node * | | | * | | + root: unused * | | | * | | + hhead: saddr:daddr policies * | | * | +- coarse policies and all any:daddr policies * | * +---- root_s: sorted by saddr:prefix * | | * | xfrm_pol_inexact_node * | | * | + root: unused * | | * | + hhead: saddr:any policies * | * +---- coarse policies and all any:any policies * * Lookups return four candidate lists: * 1. any:any list from top-level xfrm_pol_inexact_bin * 2.
any:daddr list from daddr tree * 3. saddr:daddr list from 2nd level daddr tree * 4. saddr:any list from saddr tree * * This result set then needs to be searched for the policy with * the lowest priority. If two candidates have the same priority, the * struct xfrm_policy pos member with the lower number is used. * * This replicates previous single-list-search algorithm which would * return first matching policy in the (ordered-by-priority) list. */ struct xfrm_pol_inexact_key { possible_net_t net; u32 if_id; u16 family; u8 dir, type; }; struct xfrm_pol_inexact_bin { struct xfrm_pol_inexact_key k; struct rhash_head head; /* list containing '*:*' policies */ struct hlist_head hhead; seqcount_spinlock_t count; /* tree sorted by daddr/prefix */ struct rb_root root_d; /* tree sorted by saddr/prefix */ struct rb_root root_s; /* slow path below */ struct list_head inexact_bins; struct rcu_head rcu; }; enum xfrm_pol_inexact_candidate_type { XFRM_POL_CAND_BOTH, XFRM_POL_CAND_SADDR, XFRM_POL_CAND_DADDR, XFRM_POL_CAND_ANY, XFRM_POL_CAND_MAX, }; struct xfrm_pol_inexact_candidates { struct hlist_head *res[XFRM_POL_CAND_MAX]; }; struct xfrm_flow_keys { struct flow_dissector_key_basic basic; struct flow_dissector_key_control control; union { struct flow_dissector_key_ipv4_addrs ipv4; struct flow_dissector_key_ipv6_addrs ipv6; } addrs; struct flow_dissector_key_ip ip; struct flow_dissector_key_icmp icmp; struct flow_dissector_key_ports ports; struct flow_dissector_key_keyid gre; }; static struct flow_dissector xfrm_session_dissector __ro_after_init; static DEFINE_SPINLOCK(xfrm_if_cb_lock); static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly; static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock); static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1] __read_mostly; static struct kmem_cache *xfrm_dst_cache __ro_after_init; static struct rhashtable xfrm_policy_inexact_table; static const struct rhashtable_params xfrm_pol_inexact_params; static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr); static int stale_bundle(struct dst_entry *dst); static int xfrm_bundle_ok(struct xfrm_dst *xdst); static void xfrm_policy_queue_process(struct timer_list *t); static void __xfrm_policy_link(struct xfrm_policy *pol, int dir); static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int dir); static struct xfrm_pol_inexact_bin * xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir, u32 if_id); static struct xfrm_pol_inexact_bin * xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family, u8 dir, u32 if_id); static struct xfrm_policy * xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy, bool excl); static bool xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand, struct xfrm_pol_inexact_bin *b, const xfrm_address_t *saddr, const xfrm_address_t *daddr); static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy) { return refcount_inc_not_zero(&policy->refcnt); } static inline bool __xfrm4_selector_match(const struct xfrm_selector *sel, const struct flowi *fl) { const struct flowi4 *fl4 = &fl->u.ip4; return addr4_match(fl4->daddr, sel->daddr.a4, sel->prefixlen_d) && addr4_match(fl4->saddr, sel->saddr.a4, sel->prefixlen_s) && !((xfrm_flowi_dport(fl, &fl4->uli) ^ sel->dport) & sel->dport_mask) && !((xfrm_flowi_sport(fl, &fl4->uli) ^ sel->sport) & sel->sport_mask) && (fl4->flowi4_proto == sel->proto || !sel->proto) && (fl4->flowi4_oif == sel->ifindex || !sel->ifindex); } static inline bool 
__xfrm6_selector_match(const struct xfrm_selector *sel, const struct flowi *fl) { const struct flowi6 *fl6 = &fl->u.ip6; return addr_match(&fl6->daddr, &sel->daddr, sel->prefixlen_d) && addr_match(&fl6->saddr, &sel->saddr, sel->prefixlen_s) && !((xfrm_flowi_dport(fl, &fl6->uli) ^ sel->dport) & sel->dport_mask) && !((xfrm_flowi_sport(fl, &fl6->uli) ^ sel->sport) & sel->sport_mask) && (fl6->flowi6_proto == sel->proto || !sel->proto) && (fl6->flowi6_oif == sel->ifindex || !sel->ifindex); } bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl, unsigned short family) { switch (family) { case AF_INET: return __xfrm4_selector_match(sel, fl); case AF_INET6: return __xfrm6_selector_match(sel, fl); } return false; } static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family) { const struct xfrm_policy_afinfo *afinfo; if (unlikely(family >= ARRAY_SIZE(xfrm_policy_afinfo))) return NULL; rcu_read_lock(); afinfo = rcu_dereference(xfrm_policy_afinfo[family]); if (unlikely(!afinfo)) rcu_read_unlock(); return afinfo; } /* Called with rcu_read_lock(). */ static const struct xfrm_if_cb *xfrm_if_get_cb(void) { return rcu_dereference(xfrm_if_cb); } struct dst_entry *__xfrm_dst_lookup(int family, const struct xfrm_dst_lookup_params *params) { const struct xfrm_policy_afinfo *afinfo; struct dst_entry *dst; afinfo = xfrm_policy_get_afinfo(family); if (unlikely(afinfo == NULL)) return ERR_PTR(-EAFNOSUPPORT); dst = afinfo->dst_lookup(params); rcu_read_unlock(); return dst; } EXPORT_SYMBOL(__xfrm_dst_lookup); static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, dscp_t dscp, int oif, xfrm_address_t *prev_saddr, xfrm_address_t *prev_daddr, int family, u32 mark) { struct xfrm_dst_lookup_params params; struct net *net = xs_net(x); xfrm_address_t *saddr = &x->props.saddr; xfrm_address_t *daddr = &x->id.daddr; struct dst_entry *dst; if (x->type->flags & XFRM_TYPE_LOCAL_COADDR) { saddr = x->coaddr; daddr = prev_daddr; } if (x->type->flags & XFRM_TYPE_REMOTE_COADDR) { saddr = prev_saddr; daddr = x->coaddr; } params.net = net; params.saddr = saddr; params.daddr = daddr; params.dscp = dscp; params.oif = oif; params.mark = mark; params.ipproto = x->id.proto; if (x->encap) { switch (x->encap->encap_type) { case UDP_ENCAP_ESPINUDP: params.ipproto = IPPROTO_UDP; params.uli.ports.sport = x->encap->encap_sport; params.uli.ports.dport = x->encap->encap_dport; break; case TCP_ENCAP_ESPINTCP: params.ipproto = IPPROTO_TCP; params.uli.ports.sport = x->encap->encap_sport; params.uli.ports.dport = x->encap->encap_dport; break; } } dst = __xfrm_dst_lookup(family, ¶ms); if (!IS_ERR(dst)) { if (prev_saddr != saddr) memcpy(prev_saddr, saddr, sizeof(*prev_saddr)); if (prev_daddr != daddr) memcpy(prev_daddr, daddr, sizeof(*prev_daddr)); } return dst; } static inline unsigned long make_jiffies(long secs) { if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ) return MAX_SCHEDULE_TIMEOUT-1; else return secs*HZ; } static void xfrm_policy_timer(struct timer_list *t) { struct xfrm_policy *xp = timer_container_of(xp, t, timer); time64_t now = ktime_get_real_seconds(); time64_t next = TIME64_MAX; int warn = 0; int dir; read_lock(&xp->lock); if (unlikely(xp->walk.dead)) goto out; dir = xfrm_policy_id2dir(xp->index); if (xp->lft.hard_add_expires_seconds) { time64_t tmo = xp->lft.hard_add_expires_seconds + xp->curlft.add_time - now; if (tmo <= 0) goto expired; if (tmo < next) next = tmo; } if (xp->lft.hard_use_expires_seconds) { time64_t tmo = xp->lft.hard_use_expires_seconds + 
(READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now; if (tmo <= 0) goto expired; if (tmo < next) next = tmo; } if (xp->lft.soft_add_expires_seconds) { time64_t tmo = xp->lft.soft_add_expires_seconds + xp->curlft.add_time - now; if (tmo <= 0) { warn = 1; tmo = XFRM_KM_TIMEOUT; } if (tmo < next) next = tmo; } if (xp->lft.soft_use_expires_seconds) { time64_t tmo = xp->lft.soft_use_expires_seconds + (READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now; if (tmo <= 0) { warn = 1; tmo = XFRM_KM_TIMEOUT; } if (tmo < next) next = tmo; } if (warn) km_policy_expired(xp, dir, 0, 0); if (next != TIME64_MAX && !mod_timer(&xp->timer, jiffies + make_jiffies(next))) xfrm_pol_hold(xp); out: read_unlock(&xp->lock); xfrm_pol_put(xp); return; expired: read_unlock(&xp->lock); if (!xfrm_policy_delete(xp, dir)) km_policy_expired(xp, dir, 1, 0); xfrm_pol_put(xp); } /* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2 * SPD calls. */ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp) { struct xfrm_policy *policy; policy = kzalloc_obj(struct xfrm_policy, gfp); if (policy) { write_pnet(&policy->xp_net, net); INIT_LIST_HEAD(&policy->walk.all); INIT_HLIST_HEAD(&policy->state_cache_list); INIT_HLIST_NODE(&policy->bydst); INIT_HLIST_NODE(&policy->byidx); rwlock_init(&policy->lock); refcount_set(&policy->refcnt, 1); skb_queue_head_init(&policy->polq.hold_queue); timer_setup(&policy->timer, xfrm_policy_timer, 0); timer_setup(&policy->polq.hold_timer, xfrm_policy_queue_process, 0); } return policy; } EXPORT_SYMBOL(xfrm_policy_alloc); static void xfrm_policy_destroy_rcu(struct rcu_head *head) { struct xfrm_policy *policy = container_of(head, struct xfrm_policy, rcu); security_xfrm_policy_free(policy->security); kfree(policy); } /* Destroy xfrm_policy: descendant resources must be released to this moment. */ void xfrm_policy_destroy(struct xfrm_policy *policy) { BUG_ON(!policy->walk.dead); if (timer_delete(&policy->timer) || timer_delete(&policy->polq.hold_timer)) BUG(); xfrm_dev_policy_free(policy); call_rcu(&policy->rcu, xfrm_policy_destroy_rcu); } EXPORT_SYMBOL(xfrm_policy_destroy); /* Rule must be locked. Release descendant resources, announce * entry dead. The rule must be unlinked from lists to the moment. 
*/ static void xfrm_policy_kill(struct xfrm_policy *policy) { struct net *net = xp_net(policy); struct xfrm_state *x; xfrm_dev_policy_delete(policy); write_lock_bh(&policy->lock); policy->walk.dead = 1; write_unlock_bh(&policy->lock); atomic_inc(&policy->genid); if (timer_delete(&policy->polq.hold_timer)) xfrm_pol_put(policy); skb_queue_purge(&policy->polq.hold_queue); if (timer_delete(&policy->timer)) xfrm_pol_put(policy); /* XXX: Flush state cache */ spin_lock_bh(&net->xfrm.xfrm_state_lock); hlist_for_each_entry_rcu(x, &policy->state_cache_list, state_cache) { hlist_del_init_rcu(&x->state_cache); } spin_unlock_bh(&net->xfrm.xfrm_state_lock); xfrm_pol_put(policy); } static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024; static inline unsigned int idx_hash(struct net *net, u32 index) { return __idx_hash(index, net->xfrm.policy_idx_hmask); } /* calculate policy hash thresholds */ static void __get_hash_thresh(struct net *net, unsigned short family, int dir, u8 *dbits, u8 *sbits) { switch (family) { case AF_INET: *dbits = net->xfrm.policy_bydst[dir].dbits4; *sbits = net->xfrm.policy_bydst[dir].sbits4; break; case AF_INET6: *dbits = net->xfrm.policy_bydst[dir].dbits6; *sbits = net->xfrm.policy_bydst[dir].sbits6; break; default: *dbits = 0; *sbits = 0; } } static struct hlist_head *policy_hash_bysel(struct net *net, const struct xfrm_selector *sel, unsigned short family, int dir) { unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; unsigned int hash; u8 dbits; u8 sbits; __get_hash_thresh(net, family, dir, &dbits, &sbits); hash = __sel_hash(sel, family, hmask, dbits, sbits); if (hash == hmask + 1) return NULL; return rcu_dereference_check(net->xfrm.policy_bydst[dir].table, lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash; } static struct hlist_head *policy_hash_direct(struct net *net, const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family, int dir) { unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; unsigned int hash; u8 dbits; u8 sbits; __get_hash_thresh(net, family, dir, &dbits, &sbits); hash = __addr_hash(daddr, saddr, family, hmask, dbits, sbits); return rcu_dereference_check(net->xfrm.policy_bydst[dir].table, lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash; } static void xfrm_dst_hash_transfer(struct net *net, struct hlist_head *list, struct hlist_head *ndsttable, unsigned int nhashmask, int dir) { struct hlist_node *tmp, *entry0 = NULL; struct xfrm_policy *pol; unsigned int h0 = 0; u8 dbits; u8 sbits; redo: hlist_for_each_entry_safe(pol, tmp, list, bydst) { unsigned int h; __get_hash_thresh(net, pol->family, dir, &dbits, &sbits); h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr, pol->family, nhashmask, dbits, sbits); if (!entry0 || pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { hlist_del_rcu(&pol->bydst); hlist_add_head_rcu(&pol->bydst, ndsttable + h); h0 = h; } else { if (h != h0) continue; hlist_del_rcu(&pol->bydst); hlist_add_behind_rcu(&pol->bydst, entry0); } entry0 = &pol->bydst; } if (!hlist_empty(list)) { entry0 = NULL; goto redo; } } static void xfrm_idx_hash_transfer(struct hlist_head *list, struct hlist_head *nidxtable, unsigned int nhashmask) { struct hlist_node *tmp; struct xfrm_policy *pol; hlist_for_each_entry_safe(pol, tmp, list, byidx) { unsigned int h; h = __idx_hash(pol->index, nhashmask); hlist_add_head(&pol->byidx, nidxtable+h); } } static unsigned long xfrm_new_hash_mask(unsigned int old_hmask) { return ((old_hmask + 1) << 1) - 1; } static void xfrm_bydst_resize(struct net *net, int dir) { unsigned 
int hmask = net->xfrm.policy_bydst[dir].hmask; unsigned int nhashmask = xfrm_new_hash_mask(hmask); unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head); struct hlist_head *ndst = xfrm_hash_alloc(nsize); struct hlist_head *odst; int i; if (!ndst) return; spin_lock_bh(&net->xfrm.xfrm_policy_lock); write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation); odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table, lockdep_is_held(&net->xfrm.xfrm_policy_lock)); for (i = hmask; i >= 0; i--) xfrm_dst_hash_transfer(net, odst + i, ndst, nhashmask, dir); rcu_assign_pointer(net->xfrm.policy_bydst[dir].table, ndst); net->xfrm.policy_bydst[dir].hmask = nhashmask; write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); synchronize_rcu(); xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head)); } static void xfrm_byidx_resize(struct net *net) { unsigned int hmask = net->xfrm.policy_idx_hmask; unsigned int nhashmask = xfrm_new_hash_mask(hmask); unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head); struct hlist_head *oidx = net->xfrm.policy_byidx; struct hlist_head *nidx = xfrm_hash_alloc(nsize); int i; if (!nidx) return; spin_lock_bh(&net->xfrm.xfrm_policy_lock); for (i = hmask; i >= 0; i--) xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask); net->xfrm.policy_byidx = nidx; net->xfrm.policy_idx_hmask = nhashmask; spin_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head)); } static inline int xfrm_bydst_should_resize(struct net *net, int dir, int *total) { unsigned int cnt = net->xfrm.policy_count[dir]; unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; if (total) *total += cnt; if ((hmask + 1) < xfrm_policy_hashmax && cnt > hmask) return 1; return 0; } static inline int xfrm_byidx_should_resize(struct net *net, int total) { unsigned int hmask = net->xfrm.policy_idx_hmask; if ((hmask + 1) < xfrm_policy_hashmax && total > hmask) return 1; return 0; } void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si) { si->incnt = net->xfrm.policy_count[XFRM_POLICY_IN]; si->outcnt = net->xfrm.policy_count[XFRM_POLICY_OUT]; si->fwdcnt = net->xfrm.policy_count[XFRM_POLICY_FWD]; si->inscnt = net->xfrm.policy_count[XFRM_POLICY_IN+XFRM_POLICY_MAX]; si->outscnt = net->xfrm.policy_count[XFRM_POLICY_OUT+XFRM_POLICY_MAX]; si->fwdscnt = net->xfrm.policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX]; si->spdhcnt = net->xfrm.policy_idx_hmask; si->spdhmcnt = xfrm_policy_hashmax; } EXPORT_SYMBOL(xfrm_spd_getinfo); static DEFINE_MUTEX(hash_resize_mutex); static void xfrm_hash_resize(struct work_struct *work) { struct net *net = container_of(work, struct net, xfrm.policy_hash_work); int dir, total; mutex_lock(&hash_resize_mutex); total = 0; for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { if (xfrm_bydst_should_resize(net, dir, &total)) xfrm_bydst_resize(net, dir); } if (xfrm_byidx_should_resize(net, total)) xfrm_byidx_resize(net); mutex_unlock(&hash_resize_mutex); } /* Make sure *pol can be inserted into fastbin. * Useful to check that later insert requests will be successful * (provided xfrm_policy_lock is held throughout). 
*/ static struct xfrm_pol_inexact_bin * xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir) { struct xfrm_pol_inexact_bin *bin, *prev; struct xfrm_pol_inexact_key k = { .family = pol->family, .type = pol->type, .dir = dir, .if_id = pol->if_id, }; struct net *net = xp_net(pol); lockdep_assert_held(&net->xfrm.xfrm_policy_lock); write_pnet(&k.net, net); bin = rhashtable_lookup_fast(&xfrm_policy_inexact_table, &k, xfrm_pol_inexact_params); if (bin) return bin; bin = kzalloc_obj(*bin, GFP_ATOMIC); if (!bin) return NULL; bin->k = k; INIT_HLIST_HEAD(&bin->hhead); bin->root_d = RB_ROOT; bin->root_s = RB_ROOT; seqcount_spinlock_init(&bin->count, &net->xfrm.xfrm_policy_lock); prev = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table, &bin->k, &bin->head, xfrm_pol_inexact_params); if (!prev) { list_add(&bin->inexact_bins, &net->xfrm.inexact_bins); return bin; } kfree(bin); return IS_ERR(prev) ? NULL : prev; } static bool xfrm_pol_inexact_addr_use_any_list(const xfrm_address_t *addr, int family, u8 prefixlen) { if (xfrm_addr_any(addr, family)) return true; if (family == AF_INET6 && prefixlen < INEXACT_PREFIXLEN_IPV6) return true; if (family == AF_INET && prefixlen < INEXACT_PREFIXLEN_IPV4) return true; return false; } static bool xfrm_policy_inexact_insert_use_any_list(const struct xfrm_policy *policy) { const xfrm_address_t *addr; bool saddr_any, daddr_any; u8 prefixlen; addr = &policy->selector.saddr; prefixlen = policy->selector.prefixlen_s; saddr_any = xfrm_pol_inexact_addr_use_any_list(addr, policy->family, prefixlen); addr = &policy->selector.daddr; prefixlen = policy->selector.prefixlen_d; daddr_any = xfrm_pol_inexact_addr_use_any_list(addr, policy->family, prefixlen); return saddr_any && daddr_any; } static void xfrm_pol_inexact_node_init(struct xfrm_pol_inexact_node *node, const xfrm_address_t *addr, u8 prefixlen) { node->addr = *addr; node->prefixlen = prefixlen; } static struct xfrm_pol_inexact_node * xfrm_pol_inexact_node_alloc(const xfrm_address_t *addr, u8 prefixlen) { struct xfrm_pol_inexact_node *node; node = kzalloc_obj(*node, GFP_ATOMIC); if (node) xfrm_pol_inexact_node_init(node, addr, prefixlen); return node; } static int xfrm_policy_addr_delta(const xfrm_address_t *a, const xfrm_address_t *b, u8 prefixlen, u16 family) { u32 ma, mb, mask; unsigned int pdw, pbi; int delta = 0; switch (family) { case AF_INET: if (prefixlen == 0) return 0; mask = ~0U << (32 - prefixlen); ma = ntohl(a->a4) & mask; mb = ntohl(b->a4) & mask; if (ma < mb) delta = -1; else if (ma > mb) delta = 1; break; case AF_INET6: pdw = prefixlen >> 5; pbi = prefixlen & 0x1f; if (pdw) { delta = memcmp(a->a6, b->a6, pdw << 2); if (delta) return delta; } if (pbi) { mask = ~0U << (32 - pbi); ma = ntohl(a->a6[pdw]) & mask; mb = ntohl(b->a6[pdw]) & mask; if (ma < mb) delta = -1; else if (ma > mb) delta = 1; } break; default: break; } return delta; } static void xfrm_policy_inexact_list_reinsert(struct net *net, struct xfrm_pol_inexact_node *n, u16 family) { unsigned int matched_s, matched_d; struct xfrm_policy *policy, *p; matched_s = 0; matched_d = 0; list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { struct hlist_node *newpos = NULL; bool matches_s, matches_d; if (policy->walk.dead || !policy->bydst_reinsert) continue; WARN_ON_ONCE(policy->family != family); policy->bydst_reinsert = false; hlist_for_each_entry(p, &n->hhead, bydst) { if (policy->priority > p->priority) newpos = &p->bydst; else if (policy->priority == p->priority && policy->pos > p->pos) newpos = &p->bydst; else 
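/*
 * Keep the node's policy chain ordered by ascending priority and, for
 * equal priorities, by insertion order (pos); the scan stops at the
 * first entry that must stay behind the policy being reinserted.
 * Packet-offload policies are exempt and are re-added at the head.
 */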
break; } if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET) hlist_add_behind_rcu(&policy->bydst, newpos); else hlist_add_head_rcu(&policy->bydst, &n->hhead); /* paranoia checks follow. * Check that the reinserted policy matches at least * saddr or daddr for current node prefix. * * Matching both is fine, matching saddr in one policy * (but not daddr) and then matching only daddr in another * is a bug. */ matches_s = xfrm_policy_addr_delta(&policy->selector.saddr, &n->addr, n->prefixlen, family) == 0; matches_d = xfrm_policy_addr_delta(&policy->selector.daddr, &n->addr, n->prefixlen, family) == 0; if (matches_s && matches_d) continue; WARN_ON_ONCE(!matches_s && !matches_d); if (matches_s) matched_s++; if (matches_d) matched_d++; WARN_ON_ONCE(matched_s && matched_d); } } static void xfrm_policy_inexact_node_reinsert(struct net *net, struct xfrm_pol_inexact_node *n, struct rb_root *new, u16 family) { struct xfrm_pol_inexact_node *node; struct rb_node **p, *parent; /* we should not have another subtree here */ WARN_ON_ONCE(!RB_EMPTY_ROOT(&n->root)); restart: parent = NULL; p = &new->rb_node; while (*p) { u8 prefixlen; int delta; parent = *p; node = rb_entry(*p, struct xfrm_pol_inexact_node, node); prefixlen = min(node->prefixlen, n->prefixlen); delta = xfrm_policy_addr_delta(&n->addr, &node->addr, prefixlen, family); if (delta < 0) { p = &parent->rb_left; } else if (delta > 0) { p = &parent->rb_right; } else { bool same_prefixlen = node->prefixlen == n->prefixlen; struct xfrm_policy *tmp; hlist_for_each_entry(tmp, &n->hhead, bydst) { tmp->bydst_reinsert = true; hlist_del_rcu(&tmp->bydst); } node->prefixlen = prefixlen; xfrm_policy_inexact_list_reinsert(net, node, family); if (same_prefixlen) { kfree_rcu(n, rcu); return; } rb_erase(*p, new); kfree_rcu(n, rcu); n = node; goto restart; } } rb_link_node_rcu(&n->node, parent, p); rb_insert_color(&n->node, new); } /* merge nodes v and n */ static void xfrm_policy_inexact_node_merge(struct net *net, struct xfrm_pol_inexact_node *v, struct xfrm_pol_inexact_node *n, u16 family) { struct xfrm_pol_inexact_node *node; struct xfrm_policy *tmp; struct rb_node *rnode; /* To-be-merged node v has a subtree. * * Dismantle it and insert its nodes to n->root. */ while ((rnode = rb_first(&v->root)) != NULL) { node = rb_entry(rnode, struct xfrm_pol_inexact_node, node); rb_erase(&node->node, &v->root); xfrm_policy_inexact_node_reinsert(net, node, &n->root, family); } hlist_for_each_entry(tmp, &v->hhead, bydst) { tmp->bydst_reinsert = true; hlist_del_rcu(&tmp->bydst); } xfrm_policy_inexact_list_reinsert(net, n, family); } static struct xfrm_pol_inexact_node * xfrm_policy_inexact_insert_node(struct net *net, struct rb_root *root, xfrm_address_t *addr, u16 family, u8 prefixlen, u8 dir) { struct xfrm_pol_inexact_node *cached = NULL; struct rb_node **p, *parent = NULL; struct xfrm_pol_inexact_node *node; p = &root->rb_node; while (*p) { int delta; parent = *p; node = rb_entry(*p, struct xfrm_pol_inexact_node, node); delta = xfrm_policy_addr_delta(addr, &node->addr, node->prefixlen, family); if (delta == 0 && prefixlen >= node->prefixlen) { WARN_ON_ONCE(cached); /* ipsec policies got lost */ return node; } if (delta < 0) p = &parent->rb_left; else p = &parent->rb_right; if (prefixlen < node->prefixlen) { delta = xfrm_policy_addr_delta(addr, &node->addr, prefixlen, family); if (delta) continue; /* This node is a subnet of the new prefix. 
It needs * to be removed and re-inserted with the smaller * prefix and all nodes that are now also covered * by the reduced prefixlen. */ rb_erase(&node->node, root); if (!cached) { xfrm_pol_inexact_node_init(node, addr, prefixlen); cached = node; } else { /* This node also falls within the new * prefixlen. Merge the to-be-reinserted * node and this one. */ xfrm_policy_inexact_node_merge(net, node, cached, family); kfree_rcu(node, rcu); } /* restart */ p = &root->rb_node; parent = NULL; } } node = cached; if (!node) { node = xfrm_pol_inexact_node_alloc(addr, prefixlen); if (!node) return NULL; } rb_link_node_rcu(&node->node, parent, p); rb_insert_color(&node->node, root); return node; } static void xfrm_policy_inexact_gc_tree(struct rb_root *r, bool rm) { struct xfrm_pol_inexact_node *node; struct rb_node *rn = rb_first(r); while (rn) { node = rb_entry(rn, struct xfrm_pol_inexact_node, node); xfrm_policy_inexact_gc_tree(&node->root, rm); rn = rb_next(rn); if (!hlist_empty(&node->hhead) || !RB_EMPTY_ROOT(&node->root)) { WARN_ON_ONCE(rm); continue; } rb_erase(&node->node, r); kfree_rcu(node, rcu); } } static void __xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b, bool net_exit) { write_seqcount_begin(&b->count); xfrm_policy_inexact_gc_tree(&b->root_d, net_exit); xfrm_policy_inexact_gc_tree(&b->root_s, net_exit); write_seqcount_end(&b->count); if (!RB_EMPTY_ROOT(&b->root_d) || !RB_EMPTY_ROOT(&b->root_s) || !hlist_empty(&b->hhead)) { WARN_ON_ONCE(net_exit); return; } if (rhashtable_remove_fast(&xfrm_policy_inexact_table, &b->head, xfrm_pol_inexact_params) == 0) { list_del(&b->inexact_bins); kfree_rcu(b, rcu); } } static void xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b) { struct net *net = read_pnet(&b->k.net); spin_lock_bh(&net->xfrm.xfrm_policy_lock); __xfrm_policy_inexact_prune_bin(b, false); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); } static void __xfrm_policy_inexact_flush(struct net *net) { struct xfrm_pol_inexact_bin *bin, *t; lockdep_assert_held(&net->xfrm.xfrm_policy_lock); list_for_each_entry_safe(bin, t, &net->xfrm.inexact_bins, inexact_bins) __xfrm_policy_inexact_prune_bin(bin, false); } static struct hlist_head * xfrm_policy_inexact_alloc_chain(struct xfrm_pol_inexact_bin *bin, struct xfrm_policy *policy, u8 dir) { struct xfrm_pol_inexact_node *n; struct net *net; net = xp_net(policy); lockdep_assert_held(&net->xfrm.xfrm_policy_lock); if (xfrm_policy_inexact_insert_use_any_list(policy)) return &bin->hhead; if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.daddr, policy->family, policy->selector.prefixlen_d)) { write_seqcount_begin(&bin->count); n = xfrm_policy_inexact_insert_node(net, &bin->root_s, &policy->selector.saddr, policy->family, policy->selector.prefixlen_s, dir); write_seqcount_end(&bin->count); if (!n) return NULL; return &n->hhead; } /* daddr is fixed */ write_seqcount_begin(&bin->count); n = xfrm_policy_inexact_insert_node(net, &bin->root_d, &policy->selector.daddr, policy->family, policy->selector.prefixlen_d, dir); write_seqcount_end(&bin->count); if (!n) return NULL; /* saddr is wildcard */ if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.saddr, policy->family, policy->selector.prefixlen_s)) return &n->hhead; write_seqcount_begin(&bin->count); n = xfrm_policy_inexact_insert_node(net, &n->root, &policy->selector.saddr, policy->family, policy->selector.prefixlen_s, dir); write_seqcount_end(&bin->count); if (!n) return NULL; return &n->hhead; } static struct xfrm_policy * xfrm_policy_inexact_insert(struct xfrm_policy 
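/*
 * Chain selection for an inexact policy: if both selector addresses are
 * wildcards (or their prefixes are shorter than the inexact cut-off)
 * the policy stays on the bin's plain hlist.  A usable daddr gets a
 * node in the bin's root_d tree, and a usable saddr on top of that gets
 * a second-level node hanging off the daddr node; saddr-only policies
 * use root_s.  All tree changes are wrapped in the bin's seqcount so
 * lockless lookups retry rather than observe a half-rebuilt tree.
 */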
*policy, u8 dir, int excl) { struct xfrm_pol_inexact_bin *bin; struct xfrm_policy *delpol; struct hlist_head *chain; struct net *net; bin = xfrm_policy_inexact_alloc_bin(policy, dir); if (!bin) return ERR_PTR(-ENOMEM); net = xp_net(policy); lockdep_assert_held(&net->xfrm.xfrm_policy_lock); chain = xfrm_policy_inexact_alloc_chain(bin, policy, dir); if (!chain) { __xfrm_policy_inexact_prune_bin(bin, false); return ERR_PTR(-ENOMEM); } delpol = xfrm_policy_insert_list(chain, policy, excl); if (delpol && excl) { __xfrm_policy_inexact_prune_bin(bin, false); return ERR_PTR(-EEXIST); } if (delpol) __xfrm_policy_inexact_prune_bin(bin, false); return delpol; } static bool xfrm_policy_is_dead_or_sk(const struct xfrm_policy *policy) { int dir; if (policy->walk.dead) return true; dir = xfrm_policy_id2dir(policy->index); return dir >= XFRM_POLICY_MAX; } static void xfrm_hash_rebuild(struct work_struct *work) { struct net *net = container_of(work, struct net, xfrm.policy_hthresh.work); struct xfrm_policy *pol; struct xfrm_policy *policy; struct hlist_head *chain; struct hlist_node *newpos; int dir; unsigned seq; u8 lbits4, rbits4, lbits6, rbits6; mutex_lock(&hash_resize_mutex); /* read selector prefixlen thresholds */ do { seq = read_seqbegin(&net->xfrm.policy_hthresh.lock); lbits4 = net->xfrm.policy_hthresh.lbits4; rbits4 = net->xfrm.policy_hthresh.rbits4; lbits6 = net->xfrm.policy_hthresh.lbits6; rbits6 = net->xfrm.policy_hthresh.rbits6; } while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq)); spin_lock_bh(&net->xfrm.xfrm_policy_lock); write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation); /* make sure that we can insert the indirect policies again before * we start with destructive action. */ list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) { struct xfrm_pol_inexact_bin *bin; u8 dbits, sbits; if (xfrm_policy_is_dead_or_sk(policy)) continue; dir = xfrm_policy_id2dir(policy->index); if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) { if (policy->family == AF_INET) { dbits = rbits4; sbits = lbits4; } else { dbits = rbits6; sbits = lbits6; } } else { if (policy->family == AF_INET) { dbits = lbits4; sbits = rbits4; } else { dbits = lbits6; sbits = rbits6; } } if (policy->selector.prefixlen_d < dbits || policy->selector.prefixlen_s < sbits) continue; bin = xfrm_policy_inexact_alloc_bin(policy, dir); if (!bin) goto out_unlock; if (!xfrm_policy_inexact_alloc_chain(bin, policy, dir)) goto out_unlock; } for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) { /* dir out => dst = remote, src = local */ net->xfrm.policy_bydst[dir].dbits4 = rbits4; net->xfrm.policy_bydst[dir].sbits4 = lbits4; net->xfrm.policy_bydst[dir].dbits6 = rbits6; net->xfrm.policy_bydst[dir].sbits6 = lbits6; } else { /* dir in/fwd => dst = local, src = remote */ net->xfrm.policy_bydst[dir].dbits4 = lbits4; net->xfrm.policy_bydst[dir].sbits4 = rbits4; net->xfrm.policy_bydst[dir].dbits6 = lbits6; net->xfrm.policy_bydst[dir].sbits6 = rbits6; } } /* re-insert all policies by order of creation */ list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { if (xfrm_policy_is_dead_or_sk(policy)) continue; hlist_del_rcu(&policy->bydst); newpos = NULL; dir = xfrm_policy_id2dir(policy->index); chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); if (!chain) { void *p = xfrm_policy_inexact_insert(policy, dir, 0); WARN_ONCE(IS_ERR(p), "reinsert: %ld\n", PTR_ERR(p)); continue; } hlist_for_each_entry(pol, chain, bydst) { if (policy->priority >= pol->priority) 
newpos = &pol->bydst; else break; } if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET) hlist_add_behind_rcu(&policy->bydst, newpos); else hlist_add_head_rcu(&policy->bydst, chain); } out_unlock: __xfrm_policy_inexact_flush(net); write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); mutex_unlock(&hash_resize_mutex); } void xfrm_policy_hash_rebuild(struct net *net) { schedule_work(&net->xfrm.policy_hthresh.work); } EXPORT_SYMBOL(xfrm_policy_hash_rebuild); /* Generate new index... KAME seems to generate them ordered by cost * of an absolute inpredictability of ordering of rules. This will not pass. */ static u32 xfrm_gen_index(struct net *net, int dir, u32 index) { for (;;) { struct hlist_head *list; struct xfrm_policy *p; u32 idx; int found; if (!index) { idx = (net->xfrm.idx_generator | dir); net->xfrm.idx_generator += 8; } else { idx = index; index = 0; } if (idx == 0) idx = 8; list = net->xfrm.policy_byidx + idx_hash(net, idx); found = 0; hlist_for_each_entry(p, list, byidx) { if (p->index == idx) { found = 1; break; } } if (!found) return idx; } } static inline int selector_cmp(struct xfrm_selector *s1, struct xfrm_selector *s2) { u32 *p1 = (u32 *) s1; u32 *p2 = (u32 *) s2; int len = sizeof(struct xfrm_selector) / sizeof(u32); int i; for (i = 0; i < len; i++) { if (p1[i] != p2[i]) return 1; } return 0; } static void xfrm_policy_requeue(struct xfrm_policy *old, struct xfrm_policy *new) { struct xfrm_policy_queue *pq = &old->polq; struct sk_buff_head list; if (skb_queue_empty(&pq->hold_queue)) return; __skb_queue_head_init(&list); spin_lock_bh(&pq->hold_queue.lock); skb_queue_splice_init(&pq->hold_queue, &list); if (timer_delete(&pq->hold_timer)) xfrm_pol_put(old); spin_unlock_bh(&pq->hold_queue.lock); pq = &new->polq; spin_lock_bh(&pq->hold_queue.lock); skb_queue_splice(&list, &pq->hold_queue); pq->timeout = XFRM_QUEUE_TMO_MIN; if (!mod_timer(&pq->hold_timer, jiffies)) xfrm_pol_hold(new); spin_unlock_bh(&pq->hold_queue.lock); } static inline bool xfrm_policy_mark_match(const struct xfrm_mark *mark, struct xfrm_policy *pol) { return mark->v == pol->mark.v && mark->m == pol->mark.m; } static u32 xfrm_pol_bin_key(const void *data, u32 len, u32 seed) { const struct xfrm_pol_inexact_key *k = data; u32 a = k->type << 24 | k->dir << 16 | k->family; return jhash_3words(a, k->if_id, net_hash_mix(read_pnet(&k->net)), seed); } static u32 xfrm_pol_bin_obj(const void *data, u32 len, u32 seed) { const struct xfrm_pol_inexact_bin *b = data; return xfrm_pol_bin_key(&b->k, 0, seed); } static int xfrm_pol_bin_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct xfrm_pol_inexact_key *key = arg->key; const struct xfrm_pol_inexact_bin *b = ptr; int ret; if (!net_eq(read_pnet(&b->k.net), read_pnet(&key->net))) return -1; ret = b->k.dir ^ key->dir; if (ret) return ret; ret = b->k.type ^ key->type; if (ret) return ret; ret = b->k.family ^ key->family; if (ret) return ret; return b->k.if_id ^ key->if_id; } static const struct rhashtable_params xfrm_pol_inexact_params = { .head_offset = offsetof(struct xfrm_pol_inexact_bin, head), .hashfn = xfrm_pol_bin_key, .obj_hashfn = xfrm_pol_bin_obj, .obj_cmpfn = xfrm_pol_bin_cmp, .automatic_shrinking = true, }; static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy, bool excl) { struct xfrm_policy *pol, *newpos = NULL, *delpol = NULL; hlist_for_each_entry(pol, chain, bydst) { if (pol->type == policy->type && pol->if_id == policy->if_id && 
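/*
 * A chain entry counts as the same policy only when type, if_id,
 * selector, mark and security context all match; it is then replaced,
 * or the insert fails with -EEXIST if exclusivity was requested.
 * Everything else merely determines the insertion point, keeping the
 * chain ordered by priority, except that packet-offload policies are
 * always placed at the head.
 */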
!selector_cmp(&pol->selector, &policy->selector) && xfrm_policy_mark_match(&policy->mark, pol) && xfrm_sec_ctx_match(pol->security, policy->security) && !WARN_ON(delpol)) { if (excl) return ERR_PTR(-EEXIST); delpol = pol; if (policy->priority > pol->priority) continue; } else if (policy->priority >= pol->priority) { newpos = pol; continue; } if (delpol) break; } if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET) hlist_add_behind_rcu(&policy->bydst, &newpos->bydst); else /* Packet offload policies enter to the head * to speed-up lookups. */ hlist_add_head_rcu(&policy->bydst, chain); return delpol; } int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) { struct net *net = xp_net(policy); struct xfrm_policy *delpol; struct hlist_head *chain; /* Sanitize mark before store */ policy->mark.v &= policy->mark.m; spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); if (chain) delpol = xfrm_policy_insert_list(chain, policy, excl); else delpol = xfrm_policy_inexact_insert(policy, dir, excl); if (IS_ERR(delpol)) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return PTR_ERR(delpol); } __xfrm_policy_link(policy, dir); /* After previous checking, family can either be AF_INET or AF_INET6 */ if (policy->family == AF_INET) rt_genid_bump_ipv4(net); else rt_genid_bump_ipv6(net); if (delpol) { xfrm_policy_requeue(delpol, policy); __xfrm_policy_unlink(delpol, dir); } policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir, policy->index); hlist_add_head(&policy->byidx, net->xfrm.policy_byidx+idx_hash(net, policy->index)); policy->curlft.add_time = ktime_get_real_seconds(); policy->curlft.use_time = 0; if (!mod_timer(&policy->timer, jiffies + HZ)) xfrm_pol_hold(policy); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (delpol) xfrm_policy_kill(delpol); else if (xfrm_bydst_should_resize(net, dir, NULL)) schedule_work(&net->xfrm.policy_hash_work); return 0; } EXPORT_SYMBOL(xfrm_policy_insert); static struct xfrm_policy * __xfrm_policy_bysel_ctx(struct hlist_head *chain, const struct xfrm_mark *mark, u32 if_id, u8 type, int dir, struct xfrm_selector *sel, struct xfrm_sec_ctx *ctx) { struct xfrm_policy *pol; if (!chain) return NULL; hlist_for_each_entry(pol, chain, bydst) { if (pol->type == type && pol->if_id == if_id && xfrm_policy_mark_match(mark, pol) && !selector_cmp(sel, &pol->selector) && xfrm_sec_ctx_match(ctx, pol->security)) return pol; } return NULL; } struct xfrm_policy * xfrm_policy_bysel_ctx(struct net *net, const struct xfrm_mark *mark, u32 if_id, u8 type, int dir, struct xfrm_selector *sel, struct xfrm_sec_ctx *ctx, int delete, int *err) { struct xfrm_pol_inexact_bin *bin = NULL; struct xfrm_policy *pol, *ret = NULL; struct hlist_head *chain; *err = 0; spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, sel, sel->family, dir); if (!chain) { struct xfrm_pol_inexact_candidates cand; int i; bin = xfrm_policy_inexact_lookup(net, type, sel->family, dir, if_id); if (!bin) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return NULL; } if (!xfrm_policy_find_inexact_candidates(&cand, bin, &sel->saddr, &sel->daddr)) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return NULL; } pol = NULL; for (i = 0; i < ARRAY_SIZE(cand.res); i++) { struct xfrm_policy *tmp; tmp = __xfrm_policy_bysel_ctx(cand.res[i], mark, if_id, type, dir, sel, ctx); if (!tmp) continue; if (!pol || tmp->pos < pol->pos) pol = tmp; } } else { pol = __xfrm_policy_bysel_ctx(chain, mark, if_id, type, dir, sel, ctx); } if (pol) { 
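/*
 * Found a match: take a reference while xfrm_policy_lock is still held.
 * On delete the LSM may veto first; only then is the policy unlinked.
 * The heavyweight teardown (xfrm_policy_kill()) and the pruning of a
 * possibly empty inexact bin happen after the lock is dropped.
 */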
xfrm_pol_hold(pol); if (delete) { *err = security_xfrm_policy_delete(pol->security); if (*err) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return pol; } __xfrm_policy_unlink(pol, dir); } ret = pol; } spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (ret && delete) xfrm_policy_kill(ret); if (bin && delete) xfrm_policy_inexact_prune_bin(bin); return ret; } EXPORT_SYMBOL(xfrm_policy_bysel_ctx); struct xfrm_policy * xfrm_policy_byid(struct net *net, const struct xfrm_mark *mark, u32 if_id, u8 type, int dir, u32 id, int delete, int *err) { struct xfrm_policy *pol, *ret; struct hlist_head *chain; *err = -ENOENT; if (xfrm_policy_id2dir(id) != dir) return NULL; *err = 0; spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = net->xfrm.policy_byidx + idx_hash(net, id); ret = NULL; hlist_for_each_entry(pol, chain, byidx) { if (pol->type == type && pol->index == id && pol->if_id == if_id && xfrm_policy_mark_match(mark, pol)) { xfrm_pol_hold(pol); if (delete) { *err = security_xfrm_policy_delete( pol->security); if (*err) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return pol; } __xfrm_policy_unlink(pol, dir); } ret = pol; break; } } spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (ret && delete) xfrm_policy_kill(ret); return ret; } EXPORT_SYMBOL(xfrm_policy_byid); #ifdef CONFIG_SECURITY_NETWORK_XFRM static inline int xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) { struct xfrm_policy *pol; int err = 0; list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { if (pol->walk.dead || xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX || pol->type != type) continue; err = security_xfrm_policy_delete(pol->security); if (err) { xfrm_audit_policy_delete(pol, 0, task_valid); return err; } } return err; } static inline int xfrm_dev_policy_flush_secctx_check(struct net *net, struct net_device *dev, bool task_valid) { struct xfrm_policy *pol; int err = 0; list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { if (pol->walk.dead || xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX || pol->xdo.dev != dev) continue; err = security_xfrm_policy_delete(pol->security); if (err) { xfrm_audit_policy_delete(pol, 0, task_valid); return err; } } return err; } #else static inline int xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) { return 0; } static inline int xfrm_dev_policy_flush_secctx_check(struct net *net, struct net_device *dev, bool task_valid) { return 0; } #endif int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) { int dir, err = 0, cnt = 0; struct xfrm_policy *pol; spin_lock_bh(&net->xfrm.xfrm_policy_lock); err = xfrm_policy_flush_secctx_check(net, type, task_valid); if (err) goto out; again: list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { if (pol->walk.dead) continue; dir = xfrm_policy_id2dir(pol->index); if (dir >= XFRM_POLICY_MAX || pol->type != type) continue; __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); cnt++; xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol); spin_lock_bh(&net->xfrm.xfrm_policy_lock); goto again; } if (cnt) __xfrm_policy_inexact_flush(net); else err = -ESRCH; out: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return err; } EXPORT_SYMBOL(xfrm_policy_flush); int xfrm_dev_policy_flush(struct net *net, struct net_device *dev, bool task_valid) { int dir, err = 0, cnt = 0; struct xfrm_policy *pol; spin_lock_bh(&net->xfrm.xfrm_policy_lock); err = xfrm_dev_policy_flush_secctx_check(net, dev, task_valid); if (err) goto out; again: list_for_each_entry(pol, 
&net->xfrm.policy_all, walk.all) { if (pol->walk.dead) continue; dir = xfrm_policy_id2dir(pol->index); if (dir >= XFRM_POLICY_MAX || pol->xdo.dev != dev) continue; __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); cnt++; xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol); spin_lock_bh(&net->xfrm.xfrm_policy_lock); goto again; } if (cnt) __xfrm_policy_inexact_flush(net); else err = -ESRCH; out: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return err; } EXPORT_SYMBOL(xfrm_dev_policy_flush); int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, int (*func)(struct xfrm_policy *, int, int, void*), void *data) { struct xfrm_policy *pol; struct xfrm_policy_walk_entry *x; int error = 0; if (walk->type >= XFRM_POLICY_TYPE_MAX && walk->type != XFRM_POLICY_TYPE_ANY) return -EINVAL; if (list_empty(&walk->walk.all) && walk->seq != 0) return 0; spin_lock_bh(&net->xfrm.xfrm_policy_lock); if (list_empty(&walk->walk.all)) x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all); else x = list_first_entry(&walk->walk.all, struct xfrm_policy_walk_entry, all); list_for_each_entry_from(x, &net->xfrm.policy_all, all) { if (x->dead) continue; pol = container_of(x, struct xfrm_policy, walk); if (walk->type != XFRM_POLICY_TYPE_ANY && walk->type != pol->type) continue; error = func(pol, xfrm_policy_id2dir(pol->index), walk->seq, data); if (error) { list_move_tail(&walk->walk.all, &x->all); goto out; } walk->seq++; } if (walk->seq == 0) { error = -ENOENT; goto out; } list_del_init(&walk->walk.all); out: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return error; } EXPORT_SYMBOL(xfrm_policy_walk); void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type) { INIT_LIST_HEAD(&walk->walk.all); walk->walk.dead = 1; walk->type = type; walk->seq = 0; } EXPORT_SYMBOL(xfrm_policy_walk_init); void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net) { if (list_empty(&walk->walk.all)) return; spin_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME where is net? */ list_del(&walk->walk.all); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); } EXPORT_SYMBOL(xfrm_policy_walk_done); /* * Find policy to apply to this flow. * * Returns 0 if policy found, else an -errno. 
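 *
 * The exact lookup scans the per-direction bydst hash chain computed
 * from the flow's addresses.  Unless that yields a packet-offload
 * policy, the inexact bin for (family, type, dir, if_id) is consulted
 * as well: its candidates come from up to four lists (any/any,
 * daddr-only, saddr-only and daddr+saddr nodes of the bin's rbtrees),
 * and the winner is the policy with the smallest priority value, ties
 * broken by insertion order (pos).  The whole walk is retried if the
 * hash tables were resized underneath it.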
*/ static int xfrm_policy_match(const struct xfrm_policy *pol, const struct flowi *fl, u8 type, u16 family, u32 if_id) { const struct xfrm_selector *sel = &pol->selector; int ret = -ESRCH; bool match; if (pol->family != family || pol->if_id != if_id || (fl->flowi_mark & pol->mark.m) != pol->mark.v || pol->type != type) return ret; match = xfrm_selector_match(sel, fl, family); if (match) ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid); return ret; } static struct xfrm_pol_inexact_node * xfrm_policy_lookup_inexact_addr(const struct rb_root *r, seqcount_spinlock_t *count, const xfrm_address_t *addr, u16 family) { const struct rb_node *parent; int seq; again: seq = read_seqcount_begin(count); parent = rcu_dereference_raw(r->rb_node); while (parent) { struct xfrm_pol_inexact_node *node; int delta; node = rb_entry(parent, struct xfrm_pol_inexact_node, node); delta = xfrm_policy_addr_delta(addr, &node->addr, node->prefixlen, family); if (delta < 0) { parent = rcu_dereference_raw(parent->rb_left); continue; } else if (delta > 0) { parent = rcu_dereference_raw(parent->rb_right); continue; } return node; } if (read_seqcount_retry(count, seq)) goto again; return NULL; } static bool xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand, struct xfrm_pol_inexact_bin *b, const xfrm_address_t *saddr, const xfrm_address_t *daddr) { struct xfrm_pol_inexact_node *n; u16 family; if (!b) return false; family = b->k.family; memset(cand, 0, sizeof(*cand)); cand->res[XFRM_POL_CAND_ANY] = &b->hhead; n = xfrm_policy_lookup_inexact_addr(&b->root_d, &b->count, daddr, family); if (n) { cand->res[XFRM_POL_CAND_DADDR] = &n->hhead; n = xfrm_policy_lookup_inexact_addr(&n->root, &b->count, saddr, family); if (n) cand->res[XFRM_POL_CAND_BOTH] = &n->hhead; } n = xfrm_policy_lookup_inexact_addr(&b->root_s, &b->count, saddr, family); if (n) cand->res[XFRM_POL_CAND_SADDR] = &n->hhead; return true; } static struct xfrm_pol_inexact_bin * xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family, u8 dir, u32 if_id) { struct xfrm_pol_inexact_key k = { .family = family, .type = type, .dir = dir, .if_id = if_id, }; write_pnet(&k.net, net); return rhashtable_lookup(&xfrm_policy_inexact_table, &k, xfrm_pol_inexact_params); } static struct xfrm_pol_inexact_bin * xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir, u32 if_id) { struct xfrm_pol_inexact_bin *bin; lockdep_assert_held(&net->xfrm.xfrm_policy_lock); rcu_read_lock(); bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id); rcu_read_unlock(); return bin; } static struct xfrm_policy * __xfrm_policy_eval_candidates(struct hlist_head *chain, struct xfrm_policy *prefer, const struct flowi *fl, u8 type, u16 family, u32 if_id) { u32 priority = prefer ? prefer->priority : ~0u; struct xfrm_policy *pol; if (!chain) return NULL; hlist_for_each_entry_rcu(pol, chain, bydst) { int err; if (pol->priority > priority) break; err = xfrm_policy_match(pol, fl, type, family, if_id); if (err) { if (err != -ESRCH) return ERR_PTR(err); continue; } if (prefer) { /* matches. Is it older than *prefer? 
*/ if (pol->priority == priority && prefer->pos < pol->pos) return prefer; } return pol; } return NULL; } static struct xfrm_policy * xfrm_policy_eval_candidates(struct xfrm_pol_inexact_candidates *cand, struct xfrm_policy *prefer, const struct flowi *fl, u8 type, u16 family, u32 if_id) { struct xfrm_policy *tmp; int i; for (i = 0; i < ARRAY_SIZE(cand->res); i++) { tmp = __xfrm_policy_eval_candidates(cand->res[i], prefer, fl, type, family, if_id); if (!tmp) continue; if (IS_ERR(tmp)) return tmp; prefer = tmp; } return prefer; } static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, const struct flowi *fl, u16 family, u8 dir, u32 if_id) { struct xfrm_pol_inexact_candidates cand; const xfrm_address_t *daddr, *saddr; struct xfrm_pol_inexact_bin *bin; struct xfrm_policy *pol, *ret; struct hlist_head *chain; unsigned int sequence; int err; daddr = xfrm_flowi_daddr(fl, family); saddr = xfrm_flowi_saddr(fl, family); if (unlikely(!daddr || !saddr)) return NULL; rcu_read_lock(); retry: do { sequence = read_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation); chain = policy_hash_direct(net, daddr, saddr, family, dir); } while (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence)); ret = NULL; hlist_for_each_entry_rcu(pol, chain, bydst) { err = xfrm_policy_match(pol, fl, type, family, if_id); if (err) { if (err == -ESRCH) continue; else { ret = ERR_PTR(err); goto fail; } } else { ret = pol; break; } } if (ret && ret->xdo.type == XFRM_DEV_OFFLOAD_PACKET) goto skip_inexact; bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id); if (!bin || !xfrm_policy_find_inexact_candidates(&cand, bin, saddr, daddr)) goto skip_inexact; pol = xfrm_policy_eval_candidates(&cand, ret, fl, type, family, if_id); if (pol) { ret = pol; if (IS_ERR(pol)) goto fail; } skip_inexact: if (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence)) goto retry; if (ret && !xfrm_pol_hold_rcu(ret)) goto retry; fail: rcu_read_unlock(); return ret; } static struct xfrm_policy *xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, u32 if_id) { #ifdef CONFIG_XFRM_SUB_POLICY struct xfrm_policy *pol; pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir, if_id); if (pol != NULL) return pol; #endif return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir, if_id); } static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, const struct flowi *fl, u16 family, u32 if_id) { struct xfrm_policy *pol; rcu_read_lock(); again: pol = rcu_dereference(sk->sk_policy[dir]); if (pol != NULL) { bool match; int err = 0; if (pol->family != family) { pol = NULL; goto out; } match = xfrm_selector_match(&pol->selector, fl, family); if (match) { if ((READ_ONCE(sk->sk_mark) & pol->mark.m) != pol->mark.v || pol->if_id != if_id) { pol = NULL; goto out; } err = security_xfrm_policy_lookup(pol->security, fl->flowi_secid); if (!err) { if (!xfrm_pol_hold_rcu(pol)) goto again; } else if (err == -ESRCH) { pol = NULL; } else { pol = ERR_PTR(err); } } else pol = NULL; } out: rcu_read_unlock(); return pol; } static u32 xfrm_gen_pos_slow(struct net *net) { struct xfrm_policy *policy; u32 i = 0; /* oldest entry is last in list */ list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { if (!xfrm_policy_is_dead_or_sk(policy)) policy->pos = ++i; } return i; } static u32 xfrm_gen_pos(struct net *net) { const struct xfrm_policy *policy; u32 i = 0; /* most recently added policy is at the head of the list */ 
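/*
 * Each policy carries a position stamp (pol->pos) recording insertion
 * order; inexact lookups use it to break ties between policies of equal
 * priority.  New stamps derive from the newest live entry, and once
 * that stamp has hit UINT_MAX, xfrm_gen_pos_slow() renumbers every
 * live policy starting from the oldest one at the tail of the list.
 */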
list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) { if (xfrm_policy_is_dead_or_sk(policy)) continue; if (policy->pos == UINT_MAX) return xfrm_gen_pos_slow(net); i = policy->pos + 1; break; } return i; } static void __xfrm_policy_link(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); switch (dir) { case XFRM_POLICY_IN: case XFRM_POLICY_FWD: case XFRM_POLICY_OUT: pol->pos = xfrm_gen_pos(net); break; } list_add(&pol->walk.all, &net->xfrm.policy_all); net->xfrm.policy_count[dir]++; xfrm_pol_hold(pol); } static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); if (list_empty(&pol->walk.all)) return NULL; /* Socket policies are not hashed. */ if (!hlist_unhashed(&pol->bydst)) { hlist_del_rcu(&pol->bydst); hlist_del(&pol->byidx); } list_del_init(&pol->walk.all); net->xfrm.policy_count[dir]--; return pol; } static void xfrm_sk_policy_link(struct xfrm_policy *pol, int dir) { __xfrm_policy_link(pol, XFRM_POLICY_MAX + dir); } static void xfrm_sk_policy_unlink(struct xfrm_policy *pol, int dir) { __xfrm_policy_unlink(pol, XFRM_POLICY_MAX + dir); } int xfrm_policy_delete(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); spin_lock_bh(&net->xfrm.xfrm_policy_lock); pol = __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (pol) { xfrm_policy_kill(pol); return 0; } return -ENOENT; } EXPORT_SYMBOL(xfrm_policy_delete); int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) { struct net *net = sock_net(sk); struct xfrm_policy *old_pol; #ifdef CONFIG_XFRM_SUB_POLICY if (pol && pol->type != XFRM_POLICY_TYPE_MAIN) return -EINVAL; #endif spin_lock_bh(&net->xfrm.xfrm_policy_lock); old_pol = rcu_dereference_protected(sk->sk_policy[dir], lockdep_is_held(&net->xfrm.xfrm_policy_lock)); if (pol) { pol->curlft.add_time = ktime_get_real_seconds(); pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0); xfrm_sk_policy_link(pol, dir); } rcu_assign_pointer(sk->sk_policy[dir], pol); if (old_pol) { if (pol) xfrm_policy_requeue(old_pol, pol); /* Unlinking succeeds always. This is the only function * allowed to delete or replace socket policy. 
*/ xfrm_sk_policy_unlink(old_pol, dir); } spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (old_pol) { xfrm_policy_kill(old_pol); } return 0; } static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir) { struct xfrm_policy *newp = xfrm_policy_alloc(xp_net(old), GFP_ATOMIC); struct net *net = xp_net(old); if (newp) { newp->selector = old->selector; if (security_xfrm_policy_clone(old->security, &newp->security)) { kfree(newp); return NULL; /* ENOMEM */ } newp->lft = old->lft; newp->curlft = old->curlft; newp->mark = old->mark; newp->if_id = old->if_id; newp->action = old->action; newp->flags = old->flags; newp->xfrm_nr = old->xfrm_nr; newp->index = old->index; newp->type = old->type; newp->family = old->family; memcpy(newp->xfrm_vec, old->xfrm_vec, newp->xfrm_nr*sizeof(struct xfrm_tmpl)); spin_lock_bh(&net->xfrm.xfrm_policy_lock); xfrm_sk_policy_link(newp, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_pol_put(newp); } return newp; } int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { const struct xfrm_policy *p; struct xfrm_policy *np; int i, ret = 0; rcu_read_lock(); for (i = 0; i < 2; i++) { p = rcu_dereference(osk->sk_policy[i]); if (p) { np = clone_policy(p, i); if (unlikely(!np)) { ret = -ENOMEM; break; } rcu_assign_pointer(sk->sk_policy[i], np); } } rcu_read_unlock(); return ret; } static int xfrm_get_saddr(unsigned short family, xfrm_address_t *saddr, const struct xfrm_dst_lookup_params *params) { int err; const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); if (unlikely(afinfo == NULL)) return -EINVAL; err = afinfo->get_saddr(saddr, params); rcu_read_unlock(); return err; } /* Resolve list of templates for the flow, given policy. */ static int xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, struct xfrm_state **xfrm, unsigned short family) { struct net *net = xp_net(policy); int nx; int i, error; xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family); xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family); xfrm_address_t tmp; for (nx = 0, i = 0; i < policy->xfrm_nr; i++) { struct xfrm_state *x; xfrm_address_t *remote = daddr; xfrm_address_t *local = saddr; struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i]; if (tmpl->mode == XFRM_MODE_TUNNEL || tmpl->mode == XFRM_MODE_IPTFS || tmpl->mode == XFRM_MODE_BEET) { remote = &tmpl->id.daddr; local = &tmpl->saddr; if (xfrm_addr_any(local, tmpl->encap_family)) { struct xfrm_dst_lookup_params params; memset(&params, 0, sizeof(params)); params.net = net; params.oif = fl->flowi_oif; params.daddr = remote; error = xfrm_get_saddr(tmpl->encap_family, &tmp, &params); if (error) goto fail; local = &tmp; } } x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family, policy->if_id); if (x && x->dir && x->dir != XFRM_SA_DIR_OUT) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEDIRERROR); xfrm_state_put(x); error = -EINVAL; goto fail; } if (x && x->km.state == XFRM_STATE_VALID) { xfrm[nx++] = x; daddr = remote; saddr = local; continue; } if (x) { error = (x->km.state == XFRM_STATE_ERROR ? -EINVAL : -EAGAIN); xfrm_state_put(x); } else if (error == -ESRCH) { error = -EAGAIN; } if (!tmpl->optional) goto fail; } return nx; fail: for (nx--; nx >= 0; nx--) xfrm_state_put(xfrm[nx]); return error; } static int xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl, struct xfrm_state **xfrm, unsigned short family) { struct xfrm_state *tp[XFRM_MAX_DEPTH]; struct xfrm_state **tpp = (npols > 1) ?
tp : xfrm; int cnx = 0; int error; int ret; int i; for (i = 0; i < npols; i++) { if (cnx + pols[i]->xfrm_nr >= XFRM_MAX_DEPTH) { error = -ENOBUFS; goto fail; } ret = xfrm_tmpl_resolve_one(pols[i], fl, &tpp[cnx], family); if (ret < 0) { error = ret; goto fail; } else cnx += ret; } /* found states are sorted for outbound processing */ if (npols > 1) xfrm_state_sort(xfrm, tpp, cnx, family); return cnx; fail: for (cnx--; cnx >= 0; cnx--) xfrm_state_put(tpp[cnx]); return error; } static dscp_t xfrm_get_dscp(const struct flowi *fl, int family) { if (family == AF_INET) return fl->u.ip4.flowi4_dscp; return 0; } static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); struct dst_ops *dst_ops; struct xfrm_dst *xdst; if (!afinfo) return ERR_PTR(-EINVAL); switch (family) { case AF_INET: dst_ops = &net->xfrm.xfrm4_dst_ops; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: dst_ops = &net->xfrm.xfrm6_dst_ops; break; #endif default: BUG(); } xdst = dst_alloc(dst_ops, NULL, DST_OBSOLETE_NONE, 0); if (likely(xdst)) { memset_after(xdst, 0, u.dst); } else xdst = ERR_PTR(-ENOBUFS); rcu_read_unlock(); return xdst; } static void xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst, int nfheader_len) { if (dst->ops->family == AF_INET6) { path->path_cookie = rt6_get_cookie(dst_rt6_info(dst)); path->u.rt6.rt6i_nfheader_len = nfheader_len; } } static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(xdst->u.dst.ops->family); int err; if (!afinfo) return -EINVAL; err = afinfo->fill_dst(xdst, dev, fl); rcu_read_unlock(); return err; } /* Allocate chain of dst_entry's, attach known xfrm's, calculate * all the metrics... Shortly, bundle a bundle. */ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, struct xfrm_dst **bundle, int nx, const struct flowi *fl, struct dst_entry *dst) { const struct xfrm_state_afinfo *afinfo; const struct xfrm_mode *inner_mode; struct net *net = xp_net(policy); unsigned long now = jiffies; struct net_device *dev; struct xfrm_dst *xdst_prev = NULL; struct xfrm_dst *xdst0 = NULL; int i = 0; int err; int header_len = 0; int nfheader_len = 0; int trailer_len = 0; int family = policy->selector.family; xfrm_address_t saddr, daddr; dscp_t dscp; xfrm_flowi_addr_get(fl, &saddr, &daddr, family); dscp = xfrm_get_dscp(fl, family); dst_hold(dst); for (; i < nx; i++) { struct xfrm_dst *xdst = xfrm_alloc_dst(net, family); struct dst_entry *dst1 = &xdst->u.dst; err = PTR_ERR(xdst); if (IS_ERR(xdst)) { dst_release(dst); goto put_states; } bundle[i] = xdst; if (!xdst_prev) xdst0 = xdst; else /* Ref count is taken during xfrm_alloc_dst() * No need to do dst_clone() on dst1 */ xfrm_dst_set_child(xdst_prev, &xdst->u.dst); if (xfrm[i]->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(xfrm[i], xfrm_af2proto(family)); if (!inner_mode) { err = -EAFNOSUPPORT; dst_release(dst); goto put_states; } } else inner_mode = &xfrm[i]->inner_mode; xdst->route = dst; dst_copy_metrics(dst1, dst); if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) { __u32 mark = 0; int oif; if (xfrm[i]->props.smark.v || xfrm[i]->props.smark.m) mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]); if (xfrm[i]->xso.type != XFRM_DEV_OFFLOAD_PACKET) family = xfrm[i]->props.family; oif = fl->flowi_oif ? 
: fl->flowi_l3mdev; dst = xfrm_dst_lookup(xfrm[i], dscp, oif, &saddr, &daddr, family, mark); err = PTR_ERR(dst); if (IS_ERR(dst)) goto put_states; } else dst_hold(dst); dst1->xfrm = xfrm[i]; xdst->xfrm_genid = xfrm[i]->genid; dst1->obsolete = DST_OBSOLETE_FORCE_CHK; dst1->lastuse = now; dst1->input = dst_discard; if (xfrm[i]->mode_cbs && xfrm[i]->mode_cbs->output) { dst1->output = xfrm[i]->mode_cbs->output; } else { rcu_read_lock(); afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family); if (likely(afinfo)) dst1->output = afinfo->output; else dst1->output = dst_discard_out; rcu_read_unlock(); } xdst_prev = xdst; header_len += xfrm[i]->props.header_len; if (xfrm[i]->type->flags & XFRM_TYPE_NON_FRAGMENT) nfheader_len += xfrm[i]->props.header_len; trailer_len += xfrm[i]->props.trailer_len; } xfrm_dst_set_child(xdst_prev, dst); xdst0->path = dst; err = -ENODEV; dev = dst->dev; if (!dev) goto free_dst; xfrm_init_path(xdst0, dst, nfheader_len); xfrm_init_pmtu(bundle, nx); for (xdst_prev = xdst0; xdst_prev != (struct xfrm_dst *)dst; xdst_prev = (struct xfrm_dst *) xfrm_dst_child(&xdst_prev->u.dst)) { err = xfrm_fill_dst(xdst_prev, dev, fl); if (err) goto free_dst; xdst_prev->u.dst.header_len = header_len; xdst_prev->u.dst.trailer_len = trailer_len; header_len -= xdst_prev->u.dst.xfrm->props.header_len; trailer_len -= xdst_prev->u.dst.xfrm->props.trailer_len; } return &xdst0->u.dst; put_states: for (; i < nx; i++) xfrm_state_put(xfrm[i]); free_dst: if (xdst0) dst_release_immediate(&xdst0->u.dst); return ERR_PTR(err); } static int xfrm_expand_policies(const struct flowi *fl, u16 family, struct xfrm_policy **pols, int *num_pols, int *num_xfrms) { int i; if (*num_pols == 0 || !pols[0]) { *num_pols = 0; *num_xfrms = 0; return 0; } if (IS_ERR(pols[0])) { *num_pols = 0; return PTR_ERR(pols[0]); } *num_xfrms = pols[0]->xfrm_nr; #ifdef CONFIG_XFRM_SUB_POLICY if (pols[0]->action == XFRM_POLICY_ALLOW && pols[0]->type != XFRM_POLICY_TYPE_MAIN) { pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]), XFRM_POLICY_TYPE_MAIN, fl, family, XFRM_POLICY_OUT, pols[0]->if_id); if (pols[1]) { if (IS_ERR(pols[1])) { xfrm_pols_put(pols, *num_pols); *num_pols = 0; return PTR_ERR(pols[1]); } (*num_pols)++; (*num_xfrms) += pols[1]->xfrm_nr; } } #endif for (i = 0; i < *num_pols; i++) { if (pols[i]->action != XFRM_POLICY_ALLOW) { *num_xfrms = -1; break; } } return 0; } static struct xfrm_dst * xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, const struct flowi *fl, u16 family, struct dst_entry *dst_orig) { struct net *net = xp_net(pols[0]); struct xfrm_state *xfrm[XFRM_MAX_DEPTH]; struct xfrm_dst *bundle[XFRM_MAX_DEPTH]; struct xfrm_dst *xdst; struct dst_entry *dst; int err; /* Try to instantiate a bundle */ err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family); if (err <= 0) { if (err == 0) return NULL; if (err != -EAGAIN) XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); return ERR_PTR(err); } dst = xfrm_bundle_create(pols[0], xfrm, bundle, err, fl, dst_orig); if (IS_ERR(dst)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR); return ERR_CAST(dst); } xdst = (struct xfrm_dst *)dst; xdst->num_xfrms = err; xdst->num_pols = num_pols; memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols); xdst->policy_genid = atomic_read(&pols[0]->genid); return xdst; } static void xfrm_policy_queue_process(struct timer_list *t) { struct sk_buff *skb; struct sock *sk; struct dst_entry *dst; struct xfrm_policy *pol = timer_container_of(pol, t, polq.hold_timer); struct net *net = xp_net(pol); struct 
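/*
 * Hold-queue processing: while the needed states are still being
 * negotiated, packets are parked on the policy's polq.  Each timer run
 * peeks the first skb, redoes the lookup and either re-arms the timer
 * with a doubled timeout (bounded by XFRM_QUEUE_TMO_MAX) if it still
 * gets a DST_XFRM_QUEUE dummy bundle, or drains the whole queue through
 * dst_output() once a real bundle is available.
 */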
xfrm_policy_queue *pq = &pol->polq; struct flowi fl; struct sk_buff_head list; __u32 skb_mark; spin_lock(&pq->hold_queue.lock); skb = skb_peek(&pq->hold_queue); if (!skb) { spin_unlock(&pq->hold_queue.lock); goto out; } dst = skb_dst(skb); sk = skb->sk; /* Fixup the mark to support VTI. */ skb_mark = skb->mark; skb->mark = pol->mark.v; xfrm_decode_session(net, skb, &fl, dst->ops->family); skb->mark = skb_mark; spin_unlock(&pq->hold_queue.lock); dst_hold(xfrm_dst_path(dst)); dst = xfrm_lookup(net, xfrm_dst_path(dst), &fl, sk, XFRM_LOOKUP_QUEUE); if (IS_ERR(dst)) goto purge_queue; if (dst->flags & DST_XFRM_QUEUE) { dst_release(dst); if (pq->timeout >= XFRM_QUEUE_TMO_MAX) goto purge_queue; pq->timeout = pq->timeout << 1; if (!mod_timer(&pq->hold_timer, jiffies + pq->timeout)) xfrm_pol_hold(pol); goto out; } dst_release(dst); __skb_queue_head_init(&list); spin_lock(&pq->hold_queue.lock); pq->timeout = 0; skb_queue_splice_init(&pq->hold_queue, &list); spin_unlock(&pq->hold_queue.lock); while (!skb_queue_empty(&list)) { skb = __skb_dequeue(&list); /* Fixup the mark to support VTI. */ skb_mark = skb->mark; skb->mark = pol->mark.v; xfrm_decode_session(net, skb, &fl, skb_dst(skb)->ops->family); skb->mark = skb_mark; dst_hold(xfrm_dst_path(skb_dst(skb))); dst = xfrm_lookup(net, xfrm_dst_path(skb_dst(skb)), &fl, skb->sk, 0); if (IS_ERR(dst)) { kfree_skb(skb); continue; } nf_reset_ct(skb); skb_dst_drop(skb); skb_dst_set(skb, dst); dst_output(net, skb_to_full_sk(skb), skb); } out: xfrm_pol_put(pol); return; purge_queue: pq->timeout = 0; skb_queue_purge(&pq->hold_queue); xfrm_pol_put(pol); } static int xdst_queue_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned long sched_next; struct dst_entry *dst = skb_dst(skb); struct xfrm_dst *xdst = (struct xfrm_dst *) dst; struct xfrm_policy *pol = xdst->pols[0]; struct xfrm_policy_queue *pq = &pol->polq; if (unlikely(skb_fclone_busy(sk, skb))) { kfree_skb(skb); return 0; } if (pq->hold_queue.qlen > XFRM_MAX_QUEUE_LEN) { kfree_skb(skb); return -EAGAIN; } skb_dst_force(skb); spin_lock_bh(&pq->hold_queue.lock); if (!pq->timeout) pq->timeout = XFRM_QUEUE_TMO_MIN; sched_next = jiffies + pq->timeout; if (timer_delete(&pq->hold_timer)) { if (time_before(pq->hold_timer.expires, sched_next)) sched_next = pq->hold_timer.expires; xfrm_pol_put(pol); } __skb_queue_tail(&pq->hold_queue, skb); if (!mod_timer(&pq->hold_timer, sched_next)) xfrm_pol_hold(pol); spin_unlock_bh(&pq->hold_queue.lock); return 0; } static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net, struct xfrm_flo *xflo, const struct flowi *fl, int num_xfrms, u16 family) { int err; struct net_device *dev; struct dst_entry *dst; struct dst_entry *dst1; struct xfrm_dst *xdst; xdst = xfrm_alloc_dst(net, family); if (IS_ERR(xdst)) return xdst; if (!(xflo->flags & XFRM_LOOKUP_QUEUE) || net->xfrm.sysctl_larval_drop || num_xfrms <= 0) return xdst; dst = xflo->dst_orig; dst1 = &xdst->u.dst; dst_hold(dst); xdst->route = dst; dst_copy_metrics(dst1, dst); dst1->obsolete = DST_OBSOLETE_FORCE_CHK; dst1->flags |= DST_XFRM_QUEUE; dst1->lastuse = jiffies; dst1->input = dst_discard; dst1->output = xdst_queue_output; dst_hold(dst); xfrm_dst_set_child(xdst, dst); xdst->path = dst; xfrm_init_path((struct xfrm_dst *)dst1, dst, 0); err = -ENODEV; dev = dst->dev; if (!dev) goto free_dst; err = xfrm_fill_dst(xdst, dev, fl); if (err) goto free_dst; out: return xdst; free_dst: dst_release(dst1); xdst = ERR_PTR(err); goto out; } static struct xfrm_dst *xfrm_bundle_lookup(struct net *net, const struct flowi 
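/*
 * xfrm_bundle_lookup() resolves the policies that apply to the flow and
 * tries to build a real bundle.  If policies exist but no bundle can be
 * instantiated yet (blocking policy, nothing to transform, or missing
 * states), it falls back to a dummy bundle from
 * xfrm_create_dummy_bundle() above, flagged DST_XFRM_QUEUE, whose
 * output hook queues packets on the policy's hold queue instead of
 * transmitting them.
 */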
*fl, u16 family, u8 dir, struct xfrm_flo *xflo, u32 if_id) { struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; int num_pols = 0, num_xfrms = 0, err; struct xfrm_dst *xdst; /* Resolve policies to use if we couldn't get them from * previous cache entry */ num_pols = 1; pols[0] = xfrm_policy_lookup(net, fl, family, dir, if_id); err = xfrm_expand_policies(fl, family, pols, &num_pols, &num_xfrms); if (err < 0) goto inc_error; if (num_pols == 0) return NULL; if (num_xfrms <= 0) goto make_dummy_bundle; xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, xflo->dst_orig); if (IS_ERR(xdst)) { err = PTR_ERR(xdst); if (err == -EREMOTE) { xfrm_pols_put(pols, num_pols); return NULL; } if (err != -EAGAIN) goto error; goto make_dummy_bundle; } else if (xdst == NULL) { num_xfrms = 0; goto make_dummy_bundle; } return xdst; make_dummy_bundle: /* We found policies, but there's no bundles to instantiate: * either because the policy blocks, has no transformations or * we could not build template (no xfrm_states).*/ xdst = xfrm_create_dummy_bundle(net, xflo, fl, num_xfrms, family); if (IS_ERR(xdst)) { xfrm_pols_put(pols, num_pols); return ERR_CAST(xdst); } xdst->num_pols = num_pols; xdst->num_xfrms = num_xfrms; memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols); return xdst; inc_error: XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); error: xfrm_pols_put(pols, num_pols); return ERR_PTR(err); } static struct dst_entry *make_blackhole(struct net *net, u16 family, struct dst_entry *dst_orig) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); struct dst_entry *ret; if (!afinfo) { dst_release(dst_orig); return ERR_PTR(-EINVAL); } else { ret = afinfo->blackhole_route(net, dst_orig); } rcu_read_unlock(); return ret; } /* Finds/creates a bundle for given flow and if_id * * At the moment we eat a raw IP route. Mostly to speed up lookups * on interfaces with disabled IPsec. * * xfrm_lookup uses an if_id of 0 by default, and is provided for * compatibility */ struct dst_entry *xfrm_lookup_with_ifid(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags, u32 if_id) { struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; struct xfrm_dst *xdst; struct dst_entry *dst, *route; u16 family = dst_orig->ops->family; u8 dir = XFRM_POLICY_OUT; int i, err, num_pols, num_xfrms = 0, drop_pols = 0; dst = NULL; xdst = NULL; route = NULL; sk = sk_const_to_full_sk(sk); if (sk && sk->sk_policy[XFRM_POLICY_OUT]) { num_pols = 1; pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family, if_id); err = xfrm_expand_policies(fl, family, pols, &num_pols, &num_xfrms); if (err < 0) goto dropdst; if (num_pols) { if (num_xfrms <= 0) { drop_pols = num_pols; goto no_transform; } xdst = xfrm_resolve_and_create_bundle( pols, num_pols, fl, family, dst_orig); if (IS_ERR(xdst)) { xfrm_pols_put(pols, num_pols); err = PTR_ERR(xdst); if (err == -EREMOTE) goto nopol; goto dropdst; } else if (xdst == NULL) { num_xfrms = 0; drop_pols = num_pols; goto no_transform; } route = xdst->route; } } if (xdst == NULL) { struct xfrm_flo xflo; xflo.dst_orig = dst_orig; xflo.flags = flags; /* To accelerate a bit... 
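 * Only when no if_id was given: a route already marked DST_NOXFRM, or a
 * namespace with no output policies installed at all, skips the lookup
 * entirely and jumps straight to the nopol path, which still honours
 * the default output policy (XFRM_USERPOLICY_BLOCK).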
*/ if (!if_id && ((dst_orig->flags & DST_NOXFRM) || !net->xfrm.policy_count[XFRM_POLICY_OUT])) goto nopol; xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo, if_id); if (xdst == NULL) goto nopol; if (IS_ERR(xdst)) { err = PTR_ERR(xdst); goto dropdst; } num_pols = xdst->num_pols; num_xfrms = xdst->num_xfrms; memcpy(pols, xdst->pols, sizeof(struct xfrm_policy *) * num_pols); route = xdst->route; } dst = &xdst->u.dst; if (route == NULL && num_xfrms > 0) { /* The only case when xfrm_bundle_lookup() returns a * bundle with null route, is when the template could * not be resolved. It means policies are there, but * bundle could not be created, since we don't yet * have the xfrm_state's. We need to wait for KM to * negotiate new SA's or bail out with error.*/ if (net->xfrm.sysctl_larval_drop) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); err = -EREMOTE; goto error; } err = -EAGAIN; XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); goto error; } no_transform: if (num_pols == 0) goto nopol; if ((flags & XFRM_LOOKUP_ICMP) && !(pols[0]->flags & XFRM_POLICY_ICMP)) { err = -ENOENT; goto error; } for (i = 0; i < num_pols; i++) WRITE_ONCE(pols[i]->curlft.use_time, ktime_get_real_seconds()); if (num_xfrms < 0) { /* Prohibit the flow */ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK); err = -EPERM; goto error; } else if (num_xfrms > 0) { /* Flow transformed */ dst_release(dst_orig); } else { /* Flow passes untransformed */ dst_release(dst); dst = dst_orig; } ok: xfrm_pols_put(pols, drop_pols); if (dst->xfrm && (dst->xfrm->props.mode == XFRM_MODE_TUNNEL || dst->xfrm->props.mode == XFRM_MODE_IPTFS)) dst->flags |= DST_XFRM_TUNNEL; return dst; nopol: if ((!dst_orig->dev || !(dst_orig->dev->flags & IFF_LOOPBACK)) && net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK) { err = -EPERM; goto error; } if (!(flags & XFRM_LOOKUP_ICMP)) { dst = dst_orig; goto ok; } err = -ENOENT; error: dst_release(dst); dropdst: if (!(flags & XFRM_LOOKUP_KEEP_DST_REF)) dst_release(dst_orig); xfrm_pols_put(pols, drop_pols); return ERR_PTR(err); } EXPORT_SYMBOL(xfrm_lookup_with_ifid); /* Main function: finds/creates a bundle for given flow. * * At the moment we eat a raw IP route. Mostly to speed up lookups * on interfaces with disabled IPsec. */ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags) { return xfrm_lookup_with_ifid(net, dst_orig, fl, sk, flags, 0); } EXPORT_SYMBOL(xfrm_lookup); /* Callers of xfrm_lookup_route() must ensure a call to dst_output(). * Otherwise we may send out blackholed packets. */ struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags) { struct dst_entry *dst = xfrm_lookup(net, dst_orig, fl, sk, flags | XFRM_LOOKUP_QUEUE | XFRM_LOOKUP_KEEP_DST_REF); if (PTR_ERR(dst) == -EREMOTE) return make_blackhole(net, dst_orig->ops->family, dst_orig); if (IS_ERR(dst)) dst_release(dst_orig); return dst; } EXPORT_SYMBOL(xfrm_lookup_route); static inline int xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl) { struct sec_path *sp = skb_sec_path(skb); struct xfrm_state *x; if (!sp || idx < 0 || idx >= sp->len) return 0; x = sp->xvec[idx]; if (!x->type->reject) return 0; return x->type->reject(x, skb, fl); } /* When skb is transformed back to its "native" form, we have to * check policy restrictions. At the moment we make this in maximally * stupid way. Shame on me. :-) Of course, connected sockets must * have policy cached at them. 
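 *
 * xfrm_state_ok() below checks one secpath state against a policy
 * template: protocol, optional SPI and reqid, mode, allowed algorithms,
 * the endpoint addresses for non-transport modes and the interface id.
 * xfrm_policy_ok() scans the secpath from a start index for the next
 * state satisfying the template, treating optional transport-mode
 * templates as skippable.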
*/ static inline int xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, unsigned short family, u32 if_id) { if (xfrm_state_kern(x)) return tmpl->optional && !xfrm_state_addr_cmp(tmpl, x, tmpl->encap_family); return x->id.proto == tmpl->id.proto && (x->id.spi == tmpl->id.spi || !tmpl->id.spi) && (x->props.reqid == tmpl->reqid || !tmpl->reqid) && x->props.mode == tmpl->mode && (tmpl->allalgs || (tmpl->aalgos & (1<<x->props.aalgo)) || !(xfrm_id_proto_match(tmpl->id.proto, IPSEC_PROTO_ANY))) && !(x->props.mode != XFRM_MODE_TRANSPORT && xfrm_state_addr_cmp(tmpl, x, family)) && (if_id == 0 || if_id == x->if_id); } /* * 0 or more than 0 is returned when validation is succeeded (either bypass * because of optional transport mode, or next index of the matched secpath * state with the template. * -1 is returned when no matching template is found. * Otherwise "-2 - errored_index" is returned. */ static inline int xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int start, unsigned short family, u32 if_id) { int idx = start; if (tmpl->optional) { if (tmpl->mode == XFRM_MODE_TRANSPORT) return start; } else start = -1; for (; idx < sp->len; idx++) { if (xfrm_state_ok(tmpl, sp->xvec[idx], family, if_id)) return ++idx; if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT) { if (idx < sp->verified_cnt) { /* Secpath entry previously verified, consider optional and * continue searching */ continue; } if (start == -1) start = -2-idx; break; } } return start; } static void decode_session4(const struct xfrm_flow_keys *flkeys, struct flowi *fl, bool reverse) { struct flowi4 *fl4 = &fl->u.ip4; memset(fl4, 0, sizeof(struct flowi4)); if (reverse) { fl4->saddr = flkeys->addrs.ipv4.dst; fl4->daddr = flkeys->addrs.ipv4.src; fl4->fl4_sport = flkeys->ports.dst; fl4->fl4_dport = flkeys->ports.src; } else { fl4->saddr = flkeys->addrs.ipv4.src; fl4->daddr = flkeys->addrs.ipv4.dst; fl4->fl4_sport = flkeys->ports.src; fl4->fl4_dport = flkeys->ports.dst; } switch (flkeys->basic.ip_proto) { case IPPROTO_GRE: fl4->fl4_gre_key = flkeys->gre.keyid; break; case IPPROTO_ICMP: fl4->fl4_icmp_type = flkeys->icmp.type; fl4->fl4_icmp_code = flkeys->icmp.code; break; } fl4->flowi4_proto = flkeys->basic.ip_proto; fl4->flowi4_dscp = inet_dsfield_to_dscp(flkeys->ip.tos); } #if IS_ENABLED(CONFIG_IPV6) static void decode_session6(const struct xfrm_flow_keys *flkeys, struct flowi *fl, bool reverse) { struct flowi6 *fl6 = &fl->u.ip6; memset(fl6, 0, sizeof(struct flowi6)); if (reverse) { fl6->saddr = flkeys->addrs.ipv6.dst; fl6->daddr = flkeys->addrs.ipv6.src; fl6->fl6_sport = flkeys->ports.dst; fl6->fl6_dport = flkeys->ports.src; } else { fl6->saddr = flkeys->addrs.ipv6.src; fl6->daddr = flkeys->addrs.ipv6.dst; fl6->fl6_sport = flkeys->ports.src; fl6->fl6_dport = flkeys->ports.dst; } switch (flkeys->basic.ip_proto) { case IPPROTO_GRE: fl6->fl6_gre_key = flkeys->gre.keyid; break; case IPPROTO_ICMPV6: fl6->fl6_icmp_type = flkeys->icmp.type; fl6->fl6_icmp_code = flkeys->icmp.code; break; } fl6->flowi6_proto = flkeys->basic.ip_proto; } #endif int __xfrm_decode_session(struct net *net, struct sk_buff *skb, struct flowi *fl, unsigned int family, int reverse) { struct xfrm_flow_keys flkeys; memset(&flkeys, 0, sizeof(flkeys)); __skb_flow_dissect(net, skb, &xfrm_session_dissector, &flkeys, NULL, 0, 0, 0, FLOW_DISSECTOR_F_STOP_AT_ENCAP); switch (family) { case AF_INET: decode_session4(&flkeys, fl, reverse); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: decode_session6(&flkeys, fl, reverse); break; #endif 
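/*
 * The flow keys were dissected once above (stopping at the first
 * encapsulation) and mapped into a flowi4/flowi6; "reverse" swaps
 * source and destination so reply-direction lookups (inbound checks,
 * ICMP errors) can reuse the same dissection.  Families without a
 * decoder fall through to -EAFNOSUPPORT below.
 */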
default: return -EAFNOSUPPORT; } fl->flowi_mark = skb->mark; if (reverse) { fl->flowi_oif = skb->skb_iif; } else { int oif = 0; if (skb_dst(skb) && skb_dst(skb)->dev) oif = skb_dst(skb)->dev->ifindex; fl->flowi_oif = oif; } return security_xfrm_decode_session(skb, &fl->flowi_secid); } EXPORT_SYMBOL(__xfrm_decode_session); static inline int secpath_has_nontransport(const struct sec_path *sp, int k, int *idxp) { for (; k < sp->len; k++) { if (sp->xvec[k]->props.mode != XFRM_MODE_TRANSPORT) { *idxp = k; return 1; } } return 0; } static bool icmp_err_packet(const struct flowi *fl, unsigned short family) { const struct flowi4 *fl4 = &fl->u.ip4; if (family == AF_INET && fl4->flowi4_proto == IPPROTO_ICMP && (fl4->fl4_icmp_type == ICMP_DEST_UNREACH || fl4->fl4_icmp_type == ICMP_TIME_EXCEEDED)) return true; #if IS_ENABLED(CONFIG_IPV6) if (family == AF_INET6) { const struct flowi6 *fl6 = &fl->u.ip6; if (fl6->flowi6_proto == IPPROTO_ICMPV6 && (fl6->fl6_icmp_type == ICMPV6_DEST_UNREACH || fl6->fl6_icmp_type == ICMPV6_PKT_TOOBIG || fl6->fl6_icmp_type == ICMPV6_TIME_EXCEED)) return true; } #endif return false; } static bool xfrm_icmp_flow_decode(struct sk_buff *skb, unsigned short family, const struct flowi *fl, struct flowi *fl1) { bool ret = true; struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); int hl = family == AF_INET ? (sizeof(struct iphdr) + sizeof(struct icmphdr)) : (sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr)); if (!newskb) return true; if (!pskb_pull(newskb, hl)) goto out; skb_reset_network_header(newskb); if (xfrm_decode_session_reverse(dev_net(skb->dev), newskb, fl1, family) < 0) goto out; fl1->flowi_oif = fl->flowi_oif; fl1->flowi_mark = fl->flowi_mark; fl1->flowi_dscp = fl->flowi_dscp; nf_nat_decode_session(newskb, fl1, family); ret = false; out: consume_skb(newskb); return ret; } static bool xfrm_selector_inner_icmp_match(struct sk_buff *skb, unsigned short family, const struct xfrm_selector *sel, const struct flowi *fl) { bool ret = false; if (icmp_err_packet(fl, family)) { struct flowi fl1; if (xfrm_icmp_flow_decode(skb, family, fl, &fl1)) return ret; ret = xfrm_selector_match(sel, &fl1, family); } return ret; } static inline struct xfrm_policy *xfrm_in_fwd_icmp(struct sk_buff *skb, const struct flowi *fl, unsigned short family, u32 if_id) { struct xfrm_policy *pol = NULL; if (icmp_err_packet(fl, family)) { struct flowi fl1; struct net *net = dev_net(skb->dev); if (xfrm_icmp_flow_decode(skb, family, fl, &fl1)) return pol; pol = xfrm_policy_lookup(net, &fl1, family, XFRM_POLICY_FWD, if_id); if (IS_ERR(pol)) pol = NULL; } return pol; } static inline struct dst_entry *xfrm_out_fwd_icmp(struct sk_buff *skb, struct flowi *fl, unsigned short family, struct dst_entry *dst) { if (icmp_err_packet(fl, family)) { struct net *net = dev_net(skb->dev); struct dst_entry *dst2; struct flowi fl1; if (xfrm_icmp_flow_decode(skb, family, fl, &fl1)) return dst; dst_hold(dst); dst2 = xfrm_lookup(net, dst, &fl1, NULL, (XFRM_LOOKUP_QUEUE | XFRM_LOOKUP_ICMP)); if (IS_ERR(dst2)) return dst; if (dst2->xfrm) { dst_release(dst); dst = dst2; } else { dst_release(dst2); } } return dst; } int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family) { struct net *net = dev_net(skb->dev); struct xfrm_policy *pol; struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; int npols = 0; int xfrm_nr; int pi; int reverse; struct flowi fl; int xerr_idx = -1; const struct xfrm_if_cb *ifcb; struct sec_path *sp; u32 if_id = 0; rcu_read_lock(); ifcb = xfrm_if_get_cb(); if (ifcb) { struct 
xfrm_if_decode_session_result r; if (ifcb->decode_session(skb, family, &r)) { if_id = r.if_id; net = r.net; } } rcu_read_unlock(); reverse = dir & ~XFRM_POLICY_MASK; dir &= XFRM_POLICY_MASK; if (__xfrm_decode_session(net, skb, &fl, family, reverse) < 0) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); return 0; } nf_nat_decode_session(skb, &fl, family); /* First, check used SA against their selectors. */ sp = skb_sec_path(skb); if (sp) { int i; for (i = sp->len - 1; i >= 0; i--) { struct xfrm_state *x = sp->xvec[i]; int ret = 0; if (!xfrm_selector_match(&x->sel, &fl, family)) { ret = 1; if (x->props.flags & XFRM_STATE_ICMP && xfrm_selector_inner_icmp_match(skb, family, &x->sel, &fl)) ret = 0; if (ret) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH); return 0; } } } } pol = NULL; sk = sk_to_full_sk(sk); if (sk && sk->sk_policy[dir]) { pol = xfrm_sk_policy_lookup(sk, dir, &fl, family, if_id); if (IS_ERR(pol)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); return 0; } } if (!pol) pol = xfrm_policy_lookup(net, &fl, family, dir, if_id); if (IS_ERR(pol)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); return 0; } if (!pol && dir == XFRM_POLICY_FWD) pol = xfrm_in_fwd_icmp(skb, &fl, family, if_id); if (!pol) { const bool is_crypto_offload = sp && (xfrm_input_state(skb)->xso.type == XFRM_DEV_OFFLOAD_CRYPTO); if (net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS); return 0; } if (sp && secpath_has_nontransport(sp, 0, &xerr_idx) && !is_crypto_offload) { xfrm_secpath_reject(xerr_idx, skb, &fl); XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS); return 0; } return 1; } /* This lockless write can happen from different cpus. */ WRITE_ONCE(pol->curlft.use_time, ktime_get_real_seconds()); pols[0] = pol; npols++; #ifdef CONFIG_XFRM_SUB_POLICY if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) { pols[1] = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, &fl, family, XFRM_POLICY_IN, if_id); if (pols[1]) { if (IS_ERR(pols[1])) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); xfrm_pol_put(pols[0]); return 0; } /* This write can happen from different cpus. */ WRITE_ONCE(pols[1]->curlft.use_time, ktime_get_real_seconds()); npols++; } } #endif if (pol->action == XFRM_POLICY_ALLOW) { static struct sec_path dummy; struct xfrm_tmpl *tp[XFRM_MAX_DEPTH]; struct xfrm_tmpl *stp[XFRM_MAX_DEPTH]; struct xfrm_tmpl **tpp = tp; int i, k = 0; int ti = 0; sp = skb_sec_path(skb); if (!sp) sp = &dummy; for (pi = 0; pi < npols; pi++) { if (pols[pi] != pol && pols[pi]->action != XFRM_POLICY_ALLOW) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK); goto reject; } if (ti + pols[pi]->xfrm_nr >= XFRM_MAX_DEPTH) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); goto reject_error; } for (i = 0; i < pols[pi]->xfrm_nr; i++) tpp[ti++] = &pols[pi]->xfrm_vec[i]; } xfrm_nr = ti; if (npols > 1) { xfrm_tmpl_sort(stp, tpp, xfrm_nr, family); tpp = stp; } if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET && sp == &dummy) /* This policy template was already checked by HW * and secpath was removed in __xfrm_policy_check2. */ goto out; /* For each tunnel xfrm, find the first matching tmpl. * For each tmpl before that, find corresponding xfrm. * Order is _important_. Later we will implement * some barriers, but at the moment barriers * are implied between each two transformations. * Upon success, marks secpath entries as having been * verified to allow them to be skipped in future policy * checks (e.g. nested tunnels). 
*/ for (i = xfrm_nr - 1; i >= 0; i--) { k = xfrm_policy_ok(tpp[i], sp, k, family, if_id); if (k < 0) { if (k < -1) /* "-2 - errored_index" returned */ xerr_idx = -(2+k); XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH); goto reject; } } if (secpath_has_nontransport(sp, k, &xerr_idx)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH); goto reject; } out: xfrm_pols_put(pols, npols); sp->verified_cnt = k; return 1; } XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK); reject: xfrm_secpath_reject(xerr_idx, skb, &fl); reject_error: xfrm_pols_put(pols, npols); return 0; } EXPORT_SYMBOL(__xfrm_policy_check); int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) { struct net *net = dev_net(skb->dev); struct flowi fl; struct dst_entry *dst; int res = 1; if (xfrm_decode_session(net, skb, &fl, family) < 0) { XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR); return 0; } skb_dst_force(skb); dst = skb_dst(skb); if (!dst) { XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR); return 0; } /* ignore return value from skb_dstref_steal, xfrm_lookup takes * care of dropping the refcnt if needed. */ skb_dstref_steal(skb); dst = xfrm_lookup(net, dst, &fl, NULL, XFRM_LOOKUP_QUEUE); if (IS_ERR(dst)) { res = 0; dst = NULL; } if (dst && !dst->xfrm) dst = xfrm_out_fwd_icmp(skb, &fl, family, dst); skb_dst_set(skb, dst); return res; } EXPORT_SYMBOL(__xfrm_route_forward); /* Optimize later using cookies and generation ids. */ static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie) { /* Code (such as xfrm_bundle_create()) sets dst->obsolete * to DST_OBSOLETE_FORCE_CHK to force all XFRM destinations to * get validated by dst_ops->check on every use. We do this * because when a normal route referenced by an XFRM dst is * obsoleted we do not go looking around for all parent * referencing XFRM dsts so that we can invalidate them. It * is just too much work. Instead we make the checks here on * every use. For example: * * XFRM dst A --> IPv4 dst X * * X is the "xdst->route" of A (X is also the "dst->path" of A * in this example). If X is marked obsolete, "A" will not * notice. That's what we are validating here via the * stale_bundle() check. * * When a dst is removed from the fib tree, DST_OBSOLETE_DEAD will * be marked on it. * This will force stale_bundle() to fail on any xdst bundle with * this dst linked in it. */ if (READ_ONCE(dst->obsolete) < 0 && !stale_bundle(dst)) return dst; return NULL; } static int stale_bundle(struct dst_entry *dst) { return !xfrm_bundle_ok((struct xfrm_dst *)dst); } void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { while ((dst = xfrm_dst_child(dst)) && dst->xfrm && dst->dev == dev) { dst->dev = blackhole_netdev; dev_hold(dst->dev); dev_put(dev); } } EXPORT_SYMBOL(xfrm_dst_ifdown); static void xfrm_link_failure(struct sk_buff *skb) { /* Impossible. Such dst must be popped before reaches point of failure. 
*/ } static void xfrm_negative_advice(struct sock *sk, struct dst_entry *dst) { if (READ_ONCE(dst->obsolete)) sk_dst_reset(sk); } static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr) { while (nr--) { struct xfrm_dst *xdst = bundle[nr]; u32 pmtu, route_mtu_cached; struct dst_entry *dst; dst = &xdst->u.dst; pmtu = dst_mtu(xfrm_dst_child(dst)); xdst->child_mtu_cached = pmtu; pmtu = xfrm_state_mtu(dst->xfrm, pmtu); route_mtu_cached = dst_mtu(xdst->route); xdst->route_mtu_cached = route_mtu_cached; if (pmtu > route_mtu_cached) pmtu = route_mtu_cached; dst_metric_set(dst, RTAX_MTU, pmtu); } } /* Check that the bundle accepts the flow and its components are * still valid. */ static int xfrm_bundle_ok(struct xfrm_dst *first) { struct xfrm_dst *bundle[XFRM_MAX_DEPTH]; struct dst_entry *dst = &first->u.dst; struct xfrm_dst *xdst; int start_from, nr; u32 mtu; if (!dst_check(xfrm_dst_path(dst), ((struct xfrm_dst *)dst)->path_cookie) || (dst->dev && !netif_running(dst->dev))) return 0; if (dst->flags & DST_XFRM_QUEUE) return 1; start_from = nr = 0; do { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; if (dst->xfrm->km.state != XFRM_STATE_VALID) return 0; if (xdst->xfrm_genid != dst->xfrm->genid) return 0; if (xdst->num_pols > 0 && xdst->policy_genid != atomic_read(&xdst->pols[0]->genid)) return 0; bundle[nr++] = xdst; mtu = dst_mtu(xfrm_dst_child(dst)); if (xdst->child_mtu_cached != mtu) { start_from = nr; xdst->child_mtu_cached = mtu; } if (!dst_check(xdst->route, xdst->route_cookie)) return 0; mtu = dst_mtu(xdst->route); if (xdst->route_mtu_cached != mtu) { start_from = nr; xdst->route_mtu_cached = mtu; } dst = xfrm_dst_child(dst); } while (dst->xfrm); if (likely(!start_from)) return 1; xdst = bundle[start_from - 1]; mtu = xdst->child_mtu_cached; while (start_from--) { dst = &xdst->u.dst; mtu = xfrm_state_mtu(dst->xfrm, mtu); if (mtu > xdst->route_mtu_cached) mtu = xdst->route_mtu_cached; dst_metric_set(dst, RTAX_MTU, mtu); if (!start_from) break; xdst = bundle[start_from - 1]; xdst->child_mtu_cached = mtu; } return 1; } static unsigned int xfrm_default_advmss(const struct dst_entry *dst) { return dst_metric_advmss(xfrm_dst_path(dst)); } static unsigned int xfrm_mtu(const struct dst_entry *dst) { unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); return mtu ? 
: dst_mtu(xfrm_dst_path(dst)); } static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst, const void *daddr) { while (dst->xfrm) { const struct xfrm_state *xfrm = dst->xfrm; dst = xfrm_dst_child(dst); if (xfrm->props.mode == XFRM_MODE_TRANSPORT) continue; if (xfrm->type->flags & XFRM_TYPE_REMOTE_COADDR) daddr = xfrm->coaddr; else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR)) daddr = &xfrm->id.daddr; } return daddr; } static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { const struct dst_entry *path = xfrm_dst_path(dst); if (!skb) daddr = xfrm_get_dst_nexthop(dst, daddr); return path->ops->neigh_lookup(path, skb, daddr); } static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr) { const struct dst_entry *path = xfrm_dst_path(dst); daddr = xfrm_get_dst_nexthop(dst, daddr); path->ops->confirm_neigh(path, daddr); } int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int family) { int err = 0; if (WARN_ON(family >= ARRAY_SIZE(xfrm_policy_afinfo))) return -EAFNOSUPPORT; spin_lock(&xfrm_policy_afinfo_lock); if (unlikely(xfrm_policy_afinfo[family] != NULL)) err = -EEXIST; else { struct dst_ops *dst_ops = afinfo->dst_ops; if (likely(dst_ops->kmem_cachep == NULL)) dst_ops->kmem_cachep = xfrm_dst_cache; if (likely(dst_ops->check == NULL)) dst_ops->check = xfrm_dst_check; if (likely(dst_ops->default_advmss == NULL)) dst_ops->default_advmss = xfrm_default_advmss; if (likely(dst_ops->mtu == NULL)) dst_ops->mtu = xfrm_mtu; if (likely(dst_ops->negative_advice == NULL)) dst_ops->negative_advice = xfrm_negative_advice; if (likely(dst_ops->link_failure == NULL)) dst_ops->link_failure = xfrm_link_failure; if (likely(dst_ops->neigh_lookup == NULL)) dst_ops->neigh_lookup = xfrm_neigh_lookup; if (likely(!dst_ops->confirm_neigh)) dst_ops->confirm_neigh = xfrm_confirm_neigh; rcu_assign_pointer(xfrm_policy_afinfo[family], afinfo); } spin_unlock(&xfrm_policy_afinfo_lock); return err; } EXPORT_SYMBOL(xfrm_policy_register_afinfo); void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo) { struct dst_ops *dst_ops = afinfo->dst_ops; int i; for (i = 0; i < ARRAY_SIZE(xfrm_policy_afinfo); i++) { if (rcu_access_pointer(xfrm_policy_afinfo[i]) != afinfo) continue; RCU_INIT_POINTER(xfrm_policy_afinfo[i], NULL); break; } synchronize_rcu(); dst_ops->kmem_cachep = NULL; dst_ops->check = NULL; dst_ops->negative_advice = NULL; dst_ops->link_failure = NULL; } EXPORT_SYMBOL(xfrm_policy_unregister_afinfo); void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb) { spin_lock(&xfrm_if_cb_lock); rcu_assign_pointer(xfrm_if_cb, ifcb); spin_unlock(&xfrm_if_cb_lock); } EXPORT_SYMBOL(xfrm_if_register_cb); void xfrm_if_unregister_cb(void) { RCU_INIT_POINTER(xfrm_if_cb, NULL); synchronize_rcu(); } EXPORT_SYMBOL(xfrm_if_unregister_cb); #ifdef CONFIG_XFRM_STATISTICS static int __net_init xfrm_statistics_init(struct net *net) { int rv; net->mib.xfrm_statistics = alloc_percpu(struct linux_xfrm_mib); if (!net->mib.xfrm_statistics) return -ENOMEM; rv = xfrm_proc_init(net); if (rv < 0) free_percpu(net->mib.xfrm_statistics); return rv; } static void xfrm_statistics_fini(struct net *net) { xfrm_proc_fini(net); free_percpu(net->mib.xfrm_statistics); } #else static int __net_init xfrm_statistics_init(struct net *net) { return 0; } static void xfrm_statistics_fini(struct net *net) { } #endif static int __net_init xfrm_policy_init(struct net *net) { unsigned int hmask, sz; int dir, err; if (net_eq(net, &init_net)) { 
xfrm_dst_cache = KMEM_CACHE(xfrm_dst, SLAB_HWCACHE_ALIGN | SLAB_PANIC); err = rhashtable_init(&xfrm_policy_inexact_table, &xfrm_pol_inexact_params); BUG_ON(err); } hmask = 8 - 1; sz = (hmask+1) * sizeof(struct hlist_head); net->xfrm.policy_byidx = xfrm_hash_alloc(sz); if (!net->xfrm.policy_byidx) goto out_byidx; net->xfrm.policy_idx_hmask = hmask; for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { struct xfrm_policy_hash *htab; net->xfrm.policy_count[dir] = 0; net->xfrm.policy_count[XFRM_POLICY_MAX + dir] = 0; htab = &net->xfrm.policy_bydst[dir]; rcu_assign_pointer(htab->table, xfrm_hash_alloc(sz)); if (!htab->table) goto out_bydst; htab->hmask = hmask; htab->dbits4 = 32; htab->sbits4 = 32; htab->dbits6 = 128; htab->sbits6 = 128; } net->xfrm.policy_hthresh.lbits4 = 32; net->xfrm.policy_hthresh.rbits4 = 32; net->xfrm.policy_hthresh.lbits6 = 128; net->xfrm.policy_hthresh.rbits6 = 128; seqlock_init(&net->xfrm.policy_hthresh.lock); INIT_LIST_HEAD(&net->xfrm.policy_all); INIT_LIST_HEAD(&net->xfrm.inexact_bins); INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize); INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild); return 0; out_bydst: for (dir--; dir >= 0; dir--) { struct xfrm_policy_hash *htab; htab = &net->xfrm.policy_bydst[dir]; xfrm_hash_free(rcu_dereference_protected(htab->table, true), sz); } xfrm_hash_free(net->xfrm.policy_byidx, sz); out_byidx: return -ENOMEM; } static void xfrm_policy_fini(struct net *net) { struct xfrm_pol_inexact_bin *b, *t; unsigned int sz; int dir; disable_work_sync(&net->xfrm.policy_hthresh.work); flush_work(&net->xfrm.policy_hash_work); #ifdef CONFIG_XFRM_SUB_POLICY xfrm_policy_flush(net, XFRM_POLICY_TYPE_SUB, false); #endif xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, false); synchronize_rcu(); WARN_ON(!list_empty(&net->xfrm.policy_all)); for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { struct xfrm_policy_hash *htab; htab = &net->xfrm.policy_bydst[dir]; sz = (htab->hmask + 1) * sizeof(struct hlist_head); WARN_ON(!hlist_empty(rcu_dereference_protected(htab->table, true))); xfrm_hash_free(rcu_dereference_protected(htab->table, true), sz); } sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head); WARN_ON(!hlist_empty(net->xfrm.policy_byidx)); xfrm_hash_free(net->xfrm.policy_byidx, sz); spin_lock_bh(&net->xfrm.xfrm_policy_lock); list_for_each_entry_safe(b, t, &net->xfrm.inexact_bins, inexact_bins) __xfrm_policy_inexact_prune_bin(b, true); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); } static int __net_init xfrm_net_init(struct net *net) { int rv; /* Initialize the per-net locks here */ spin_lock_init(&net->xfrm.xfrm_state_lock); spin_lock_init(&net->xfrm.xfrm_policy_lock); seqcount_spinlock_init(&net->xfrm.xfrm_policy_hash_generation, &net->xfrm.xfrm_policy_lock); mutex_init(&net->xfrm.xfrm_cfg_mutex); net->xfrm.policy_default[XFRM_POLICY_IN] = XFRM_USERPOLICY_ACCEPT; net->xfrm.policy_default[XFRM_POLICY_FWD] = XFRM_USERPOLICY_ACCEPT; net->xfrm.policy_default[XFRM_POLICY_OUT] = XFRM_USERPOLICY_ACCEPT; rv = xfrm_statistics_init(net); if (rv < 0) goto out_statistics; rv = xfrm_state_init(net); if (rv < 0) goto out_state; rv = xfrm_policy_init(net); if (rv < 0) goto out_policy; rv = xfrm_sysctl_init(net); if (rv < 0) goto out_sysctl; rv = xfrm_nat_keepalive_net_init(net); if (rv < 0) goto out_nat_keepalive; return 0; out_nat_keepalive: xfrm_sysctl_fini(net); out_sysctl: xfrm_policy_fini(net); out_policy: xfrm_state_fini(net); out_state: xfrm_statistics_fini(net); out_statistics: return rv; } static void __net_exit xfrm_net_exit(struct net *net) { 
xfrm_nat_keepalive_net_fini(net); xfrm_sysctl_fini(net); xfrm_policy_fini(net); xfrm_state_fini(net); xfrm_statistics_fini(net); } static struct pernet_operations __net_initdata xfrm_net_ops = { .init = xfrm_net_init, .exit = xfrm_net_exit, }; static const struct flow_dissector_key xfrm_flow_dissector_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct xfrm_flow_keys, control), }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct xfrm_flow_keys, basic), }, { .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, .offset = offsetof(struct xfrm_flow_keys, addrs.ipv4), }, { .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, .offset = offsetof(struct xfrm_flow_keys, addrs.ipv6), }, { .key_id = FLOW_DISSECTOR_KEY_PORTS, .offset = offsetof(struct xfrm_flow_keys, ports), }, { .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID, .offset = offsetof(struct xfrm_flow_keys, gre), }, { .key_id = FLOW_DISSECTOR_KEY_IP, .offset = offsetof(struct xfrm_flow_keys, ip), }, { .key_id = FLOW_DISSECTOR_KEY_ICMP, .offset = offsetof(struct xfrm_flow_keys, icmp), }, }; void __init xfrm_init(void) { skb_flow_dissector_init(&xfrm_session_dissector, xfrm_flow_dissector_keys, ARRAY_SIZE(xfrm_flow_dissector_keys)); register_pernet_subsys(&xfrm_net_ops); xfrm_dev_init(); xfrm_input_init(); #ifdef CONFIG_XFRM_ESPINTCP espintcp_init(); #endif register_xfrm_state_bpf(); xfrm_nat_keepalive_init(AF_INET); } #ifdef CONFIG_AUDITSYSCALL static void xfrm_audit_common_policyinfo(struct xfrm_policy *xp, struct audit_buffer *audit_buf) { struct xfrm_sec_ctx *ctx = xp->security; struct xfrm_selector *sel = &xp->selector; if (ctx) audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s", ctx->ctx_alg, ctx->ctx_doi, ctx->ctx_str); switch (sel->family) { case AF_INET: audit_log_format(audit_buf, " src=%pI4", &sel->saddr.a4); if (sel->prefixlen_s != 32) audit_log_format(audit_buf, " src_prefixlen=%d", sel->prefixlen_s); audit_log_format(audit_buf, " dst=%pI4", &sel->daddr.a4); if (sel->prefixlen_d != 32) audit_log_format(audit_buf, " dst_prefixlen=%d", sel->prefixlen_d); break; case AF_INET6: audit_log_format(audit_buf, " src=%pI6", sel->saddr.a6); if (sel->prefixlen_s != 128) audit_log_format(audit_buf, " src_prefixlen=%d", sel->prefixlen_s); audit_log_format(audit_buf, " dst=%pI6", sel->daddr.a6); if (sel->prefixlen_d != 128) audit_log_format(audit_buf, " dst_prefixlen=%d", sel->prefixlen_d); break; } } void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, bool task_valid) { struct audit_buffer *audit_buf; audit_buf = xfrm_audit_start("SPD-add"); if (audit_buf == NULL) return; xfrm_audit_helper_usrinfo(task_valid, audit_buf); audit_log_format(audit_buf, " res=%u", result); xfrm_audit_common_policyinfo(xp, audit_buf); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_policy_add); void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result, bool task_valid) { struct audit_buffer *audit_buf; audit_buf = xfrm_audit_start("SPD-delete"); if (audit_buf == NULL) return; xfrm_audit_helper_usrinfo(task_valid, audit_buf); audit_log_format(audit_buf, " res=%u", result); xfrm_audit_common_policyinfo(xp, audit_buf); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_policy_delete); #endif #ifdef CONFIG_XFRM_MIGRATE static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *sel, u8 dir, u8 type, struct net *net, u32 if_id) { struct xfrm_policy *pol; struct flowi fl; memset(&fl, 0, sizeof(fl)); fl.flowi_proto = sel->proto; switch (sel->family) { case AF_INET: fl.u.ip4.saddr = sel->saddr.a4; 
fl.u.ip4.daddr = sel->daddr.a4; if (sel->proto == IPSEC_ULPROTO_ANY) break; fl.u.flowi4_oif = sel->ifindex; fl.u.ip4.fl4_sport = sel->sport; fl.u.ip4.fl4_dport = sel->dport; break; case AF_INET6: fl.u.ip6.saddr = sel->saddr.in6; fl.u.ip6.daddr = sel->daddr.in6; if (sel->proto == IPSEC_ULPROTO_ANY) break; fl.u.flowi6_oif = sel->ifindex; fl.u.ip6.fl4_sport = sel->sport; fl.u.ip6.fl4_dport = sel->dport; break; default: return ERR_PTR(-EAFNOSUPPORT); } rcu_read_lock(); pol = xfrm_policy_lookup_bytype(net, type, &fl, sel->family, dir, if_id); if (IS_ERR_OR_NULL(pol)) goto out_unlock; out_unlock: rcu_read_unlock(); return pol; } static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tmpl *t) { int match = 0; if (t->mode == m->mode && t->id.proto == m->proto && (m->reqid == 0 || t->reqid == m->reqid)) { switch (t->mode) { case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: case XFRM_MODE_IPTFS: if (xfrm_addr_equal(&t->id.daddr, &m->old_daddr, m->old_family) && xfrm_addr_equal(&t->saddr, &m->old_saddr, m->old_family)) { match = 1; } break; case XFRM_MODE_TRANSPORT: /* in case of transport mode, template does not store any IP addresses, hence we just compare mode and protocol */ match = 1; break; default: break; } } return match; } /* update endpoint address(es) of template(s) */ static int xfrm_policy_migrate(struct xfrm_policy *pol, struct xfrm_migrate *m, int num_migrate, struct netlink_ext_ack *extack) { struct xfrm_migrate *mp; int i, j, n = 0; write_lock_bh(&pol->lock); if (unlikely(pol->walk.dead)) { /* target policy has been deleted */ NL_SET_ERR_MSG(extack, "Target policy not found"); write_unlock_bh(&pol->lock); return -ENOENT; } for (i = 0; i < pol->xfrm_nr; i++) { for (j = 0, mp = m; j < num_migrate; j++, mp++) { if (!migrate_tmpl_match(mp, &pol->xfrm_vec[i])) continue; n++; if (pol->xfrm_vec[i].mode != XFRM_MODE_TUNNEL && pol->xfrm_vec[i].mode != XFRM_MODE_BEET && pol->xfrm_vec[i].mode != XFRM_MODE_IPTFS) continue; /* update endpoints */ memcpy(&pol->xfrm_vec[i].id.daddr, &mp->new_daddr, sizeof(pol->xfrm_vec[i].id.daddr)); memcpy(&pol->xfrm_vec[i].saddr, &mp->new_saddr, sizeof(pol->xfrm_vec[i].saddr)); pol->xfrm_vec[i].encap_family = mp->new_family; /* flush bundles */ atomic_inc(&pol->genid); } } write_unlock_bh(&pol->lock); if (!n) return -ENODATA; return 0; } static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate, struct netlink_ext_ack *extack) { int i, j; if (num_migrate < 1 || num_migrate > XFRM_MAX_DEPTH) { NL_SET_ERR_MSG(extack, "Invalid number of SAs to migrate, must be 0 < num <= XFRM_MAX_DEPTH (6)"); return -EINVAL; } for (i = 0; i < num_migrate; i++) { if (xfrm_addr_any(&m[i].new_daddr, m[i].new_family) || xfrm_addr_any(&m[i].new_saddr, m[i].new_family)) { NL_SET_ERR_MSG(extack, "Addresses in the MIGRATE attribute's list cannot be null"); return -EINVAL; } /* check if there is any duplicated entry */ for (j = i + 1; j < num_migrate; j++) { if (!memcmp(&m[i].old_daddr, &m[j].old_daddr, sizeof(m[i].old_daddr)) && !memcmp(&m[i].old_saddr, &m[j].old_saddr, sizeof(m[i].old_saddr)) && m[i].proto == m[j].proto && m[i].mode == m[j].mode && m[i].reqid == m[j].reqid && m[i].old_family == m[j].old_family) { NL_SET_ERR_MSG(extack, "Entries in the MIGRATE attribute's list must be unique"); return -EINVAL; } } } return 0; } int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_migrate, struct xfrm_kmaddress *k, struct net *net, struct xfrm_encap_tmpl *encap, u32 if_id, struct netlink_ext_ack *extack, struct 
	       xfrm_user_offload *xuo)
{
	int i, err, nx_cur = 0, nx_new = 0;
	struct xfrm_policy *pol = NULL;
	struct xfrm_state *x, *xc;
	struct xfrm_state *x_cur[XFRM_MAX_DEPTH];
	struct xfrm_state *x_new[XFRM_MAX_DEPTH];
	struct xfrm_migrate *mp;

	/* Stage 0 - sanity checks */
	err = xfrm_migrate_check(m, num_migrate, extack);
	if (err < 0)
		goto out;

	if (dir >= XFRM_POLICY_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid policy direction");
		err = -EINVAL;
		goto out;
	}

	/* Stage 1 - find policy */
	pol = xfrm_migrate_policy_find(sel, dir, type, net, if_id);
	if (IS_ERR_OR_NULL(pol)) {
		NL_SET_ERR_MSG(extack, "Target policy not found");
		err = IS_ERR(pol) ? PTR_ERR(pol) : -ENOENT;
		goto out;
	}

	/* Stage 2 - find and update state(s) */
	for (i = 0, mp = m; i < num_migrate; i++, mp++) {
		if ((x = xfrm_migrate_state_find(mp, net, if_id))) {
			x_cur[nx_cur] = x;
			nx_cur++;
			xc = xfrm_state_migrate(x, mp, encap, net, xuo, extack);
			if (xc) {
				x_new[nx_new] = xc;
				nx_new++;
			} else {
				err = -ENODATA;
				goto restore_state;
			}
		}
	}

	/* Stage 3 - update policy */
	err = xfrm_policy_migrate(pol, m, num_migrate, extack);
	if (err < 0)
		goto restore_state;

	/* Stage 4 - delete old state(s) */
	if (nx_cur) {
		xfrm_states_put(x_cur, nx_cur);
		xfrm_states_delete(x_cur, nx_cur);
	}

	/* Stage 5 - announce */
	km_migrate(sel, dir, type, m, num_migrate, k, encap);

	xfrm_pol_put(pol);

	return 0;
out:
	return err;
restore_state:
	if (pol)
		xfrm_pol_put(pol);
	if (nx_cur)
		xfrm_states_put(x_cur, nx_cur);
	if (nx_new)
		xfrm_states_delete(x_new, nx_new);

	return err;
}
EXPORT_SYMBOL(xfrm_migrate);
#endif
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * security/tomoyo/common.h
 *
 * Header file for TOMOYO.
 *
 * Copyright (C) 2005-2011 NTT DATA CORPORATION
 */

#ifndef _SECURITY_TOMOYO_COMMON_H
#define _SECURITY_TOMOYO_COMMON_H

#define pr_fmt(fmt) fmt

#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/kmod.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/list.h>
#include <linux/cred.h>
#include <linux/poll.h>
#include <linux/binfmts.h>
#include <linux/highmem.h>
#include <linux/net.h>
#include <linux/inet.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/un.h>
#include <linux/lsm_hooks.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/udp.h>

/********** Constants definitions. **********/

/*
 * TOMOYO uses this hash only when appending a string into the string
 * table. Frequency of appending strings is very low. So we don't need
 * large (e.g. 64k) hash size. 256 will be sufficient.
 */
#define TOMOYO_HASH_BITS 8
#define TOMOYO_MAX_HASH (1u<<TOMOYO_HASH_BITS)

/*
 * TOMOYO checks only SOCK_STREAM, SOCK_DGRAM, SOCK_RAW, SOCK_SEQPACKET.
 * Therefore, we don't need SOCK_MAX.
 */
#define TOMOYO_SOCK_MAX 6

#define TOMOYO_EXEC_TMPSIZE 4096

/* Garbage collector is trying to kfree() this element. */
#define TOMOYO_GC_IN_PROGRESS -1

/* Profile number is an integer between 0 and 255. */
#define TOMOYO_MAX_PROFILES 256

/* Group number is an integer between 0 and 255. */
#define TOMOYO_MAX_ACL_GROUPS 256

/* Index numbers for "struct tomoyo_condition".
*/ enum tomoyo_conditions_index { TOMOYO_TASK_UID, /* current_uid() */ TOMOYO_TASK_EUID, /* current_euid() */ TOMOYO_TASK_SUID, /* current_suid() */ TOMOYO_TASK_FSUID, /* current_fsuid() */ TOMOYO_TASK_GID, /* current_gid() */ TOMOYO_TASK_EGID, /* current_egid() */ TOMOYO_TASK_SGID, /* current_sgid() */ TOMOYO_TASK_FSGID, /* current_fsgid() */ TOMOYO_TASK_PID, /* sys_getpid() */ TOMOYO_TASK_PPID, /* sys_getppid() */ TOMOYO_EXEC_ARGC, /* "struct linux_binprm *"->argc */ TOMOYO_EXEC_ENVC, /* "struct linux_binprm *"->envc */ TOMOYO_TYPE_IS_SOCKET, /* S_IFSOCK */ TOMOYO_TYPE_IS_SYMLINK, /* S_IFLNK */ TOMOYO_TYPE_IS_FILE, /* S_IFREG */ TOMOYO_TYPE_IS_BLOCK_DEV, /* S_IFBLK */ TOMOYO_TYPE_IS_DIRECTORY, /* S_IFDIR */ TOMOYO_TYPE_IS_CHAR_DEV, /* S_IFCHR */ TOMOYO_TYPE_IS_FIFO, /* S_IFIFO */ TOMOYO_MODE_SETUID, /* S_ISUID */ TOMOYO_MODE_SETGID, /* S_ISGID */ TOMOYO_MODE_STICKY, /* S_ISVTX */ TOMOYO_MODE_OWNER_READ, /* S_IRUSR */ TOMOYO_MODE_OWNER_WRITE, /* S_IWUSR */ TOMOYO_MODE_OWNER_EXECUTE, /* S_IXUSR */ TOMOYO_MODE_GROUP_READ, /* S_IRGRP */ TOMOYO_MODE_GROUP_WRITE, /* S_IWGRP */ TOMOYO_MODE_GROUP_EXECUTE, /* S_IXGRP */ TOMOYO_MODE_OTHERS_READ, /* S_IROTH */ TOMOYO_MODE_OTHERS_WRITE, /* S_IWOTH */ TOMOYO_MODE_OTHERS_EXECUTE, /* S_IXOTH */ TOMOYO_EXEC_REALPATH, TOMOYO_SYMLINK_TARGET, TOMOYO_PATH1_UID, TOMOYO_PATH1_GID, TOMOYO_PATH1_INO, TOMOYO_PATH1_MAJOR, TOMOYO_PATH1_MINOR, TOMOYO_PATH1_PERM, TOMOYO_PATH1_TYPE, TOMOYO_PATH1_DEV_MAJOR, TOMOYO_PATH1_DEV_MINOR, TOMOYO_PATH2_UID, TOMOYO_PATH2_GID, TOMOYO_PATH2_INO, TOMOYO_PATH2_MAJOR, TOMOYO_PATH2_MINOR, TOMOYO_PATH2_PERM, TOMOYO_PATH2_TYPE, TOMOYO_PATH2_DEV_MAJOR, TOMOYO_PATH2_DEV_MINOR, TOMOYO_PATH1_PARENT_UID, TOMOYO_PATH1_PARENT_GID, TOMOYO_PATH1_PARENT_INO, TOMOYO_PATH1_PARENT_PERM, TOMOYO_PATH2_PARENT_UID, TOMOYO_PATH2_PARENT_GID, TOMOYO_PATH2_PARENT_INO, TOMOYO_PATH2_PARENT_PERM, TOMOYO_MAX_CONDITION_KEYWORD, TOMOYO_NUMBER_UNION, TOMOYO_NAME_UNION, TOMOYO_ARGV_ENTRY, TOMOYO_ENVP_ENTRY, }; /* Index numbers for stat(). */ enum tomoyo_path_stat_index { /* Do not change this order. */ TOMOYO_PATH1, TOMOYO_PATH1_PARENT, TOMOYO_PATH2, TOMOYO_PATH2_PARENT, TOMOYO_MAX_PATH_STAT }; /* Index numbers for operation mode. */ enum tomoyo_mode_index { TOMOYO_CONFIG_DISABLED, TOMOYO_CONFIG_LEARNING, TOMOYO_CONFIG_PERMISSIVE, TOMOYO_CONFIG_ENFORCING, TOMOYO_CONFIG_MAX_MODE, TOMOYO_CONFIG_WANT_REJECT_LOG = 64, TOMOYO_CONFIG_WANT_GRANT_LOG = 128, TOMOYO_CONFIG_USE_DEFAULT = 255, }; /* Index numbers for entry type. */ enum tomoyo_policy_id { TOMOYO_ID_GROUP, TOMOYO_ID_ADDRESS_GROUP, TOMOYO_ID_PATH_GROUP, TOMOYO_ID_NUMBER_GROUP, TOMOYO_ID_TRANSITION_CONTROL, TOMOYO_ID_AGGREGATOR, TOMOYO_ID_MANAGER, TOMOYO_ID_CONDITION, TOMOYO_ID_NAME, TOMOYO_ID_ACL, TOMOYO_ID_DOMAIN, TOMOYO_MAX_POLICY }; /* Index numbers for domain's attributes. */ enum tomoyo_domain_info_flags_index { /* Quota warnning flag. */ TOMOYO_DIF_QUOTA_WARNED, /* * This domain was unable to create a new domain at * tomoyo_find_next_domain() because the name of the domain to be * created was too long or it could not allocate memory. * More than one process continued execve() without domain transition. */ TOMOYO_DIF_TRANSITION_FAILED, TOMOYO_MAX_DOMAIN_INFO_FLAGS }; /* Index numbers for audit type. */ enum tomoyo_grant_log { /* Follow profile's configuration. */ TOMOYO_GRANTLOG_AUTO, /* Do not generate grant log. */ TOMOYO_GRANTLOG_NO, /* Generate grant_log. */ TOMOYO_GRANTLOG_YES, }; /* Index numbers for group entries. 
*/ enum tomoyo_group_id { TOMOYO_PATH_GROUP, TOMOYO_NUMBER_GROUP, TOMOYO_ADDRESS_GROUP, TOMOYO_MAX_GROUP }; /* Index numbers for type of numeric values. */ enum tomoyo_value_type { TOMOYO_VALUE_TYPE_INVALID, TOMOYO_VALUE_TYPE_DECIMAL, TOMOYO_VALUE_TYPE_OCTAL, TOMOYO_VALUE_TYPE_HEXADECIMAL, }; /* Index numbers for domain transition control keywords. */ enum tomoyo_transition_type { /* Do not change this order, */ TOMOYO_TRANSITION_CONTROL_NO_RESET, TOMOYO_TRANSITION_CONTROL_RESET, TOMOYO_TRANSITION_CONTROL_NO_INITIALIZE, TOMOYO_TRANSITION_CONTROL_INITIALIZE, TOMOYO_TRANSITION_CONTROL_NO_KEEP, TOMOYO_TRANSITION_CONTROL_KEEP, TOMOYO_MAX_TRANSITION_TYPE }; /* Index numbers for Access Controls. */ enum tomoyo_acl_entry_type_index { TOMOYO_TYPE_PATH_ACL, TOMOYO_TYPE_PATH2_ACL, TOMOYO_TYPE_PATH_NUMBER_ACL, TOMOYO_TYPE_MKDEV_ACL, TOMOYO_TYPE_MOUNT_ACL, TOMOYO_TYPE_INET_ACL, TOMOYO_TYPE_UNIX_ACL, TOMOYO_TYPE_ENV_ACL, TOMOYO_TYPE_MANUAL_TASK_ACL, }; /* Index numbers for access controls with one pathname. */ enum tomoyo_path_acl_index { TOMOYO_TYPE_EXECUTE, TOMOYO_TYPE_READ, TOMOYO_TYPE_WRITE, TOMOYO_TYPE_APPEND, TOMOYO_TYPE_UNLINK, TOMOYO_TYPE_GETATTR, TOMOYO_TYPE_RMDIR, TOMOYO_TYPE_TRUNCATE, TOMOYO_TYPE_SYMLINK, TOMOYO_TYPE_CHROOT, TOMOYO_TYPE_UMOUNT, TOMOYO_MAX_PATH_OPERATION }; /* Index numbers for /sys/kernel/security/tomoyo/stat interface. */ enum tomoyo_memory_stat_type { TOMOYO_MEMORY_POLICY, TOMOYO_MEMORY_AUDIT, TOMOYO_MEMORY_QUERY, TOMOYO_MAX_MEMORY_STAT }; enum tomoyo_mkdev_acl_index { TOMOYO_TYPE_MKBLOCK, TOMOYO_TYPE_MKCHAR, TOMOYO_MAX_MKDEV_OPERATION }; /* Index numbers for socket operations. */ enum tomoyo_network_acl_index { TOMOYO_NETWORK_BIND, /* bind() operation. */ TOMOYO_NETWORK_LISTEN, /* listen() operation. */ TOMOYO_NETWORK_CONNECT, /* connect() operation. */ TOMOYO_NETWORK_SEND, /* send() operation. */ TOMOYO_MAX_NETWORK_OPERATION }; /* Index numbers for access controls with two pathnames. */ enum tomoyo_path2_acl_index { TOMOYO_TYPE_LINK, TOMOYO_TYPE_RENAME, TOMOYO_TYPE_PIVOT_ROOT, TOMOYO_MAX_PATH2_OPERATION }; /* Index numbers for access controls with one pathname and one number. */ enum tomoyo_path_number_acl_index { TOMOYO_TYPE_CREATE, TOMOYO_TYPE_MKDIR, TOMOYO_TYPE_MKFIFO, TOMOYO_TYPE_MKSOCK, TOMOYO_TYPE_IOCTL, TOMOYO_TYPE_CHMOD, TOMOYO_TYPE_CHOWN, TOMOYO_TYPE_CHGRP, TOMOYO_MAX_PATH_NUMBER_OPERATION }; /* Index numbers for /sys/kernel/security/tomoyo/ interfaces. */ enum tomoyo_securityfs_interface_index { TOMOYO_DOMAINPOLICY, TOMOYO_EXCEPTIONPOLICY, TOMOYO_PROCESS_STATUS, TOMOYO_STAT, TOMOYO_AUDIT, TOMOYO_VERSION, TOMOYO_PROFILE, TOMOYO_QUERY, TOMOYO_MANAGER }; /* Index numbers for special mount operations. */ enum tomoyo_special_mount { TOMOYO_MOUNT_BIND, /* mount --bind /source /dest */ TOMOYO_MOUNT_MOVE, /* mount --move /old /new */ TOMOYO_MOUNT_REMOUNT, /* mount -o remount /dir */ TOMOYO_MOUNT_MAKE_UNBINDABLE, /* mount --make-unbindable /dir */ TOMOYO_MOUNT_MAKE_PRIVATE, /* mount --make-private /dir */ TOMOYO_MOUNT_MAKE_SLAVE, /* mount --make-slave /dir */ TOMOYO_MOUNT_MAKE_SHARED, /* mount --make-shared /dir */ TOMOYO_MAX_SPECIAL_MOUNT }; /* Index numbers for functionality. 
*/ enum tomoyo_mac_index { TOMOYO_MAC_FILE_EXECUTE, TOMOYO_MAC_FILE_OPEN, TOMOYO_MAC_FILE_CREATE, TOMOYO_MAC_FILE_UNLINK, TOMOYO_MAC_FILE_GETATTR, TOMOYO_MAC_FILE_MKDIR, TOMOYO_MAC_FILE_RMDIR, TOMOYO_MAC_FILE_MKFIFO, TOMOYO_MAC_FILE_MKSOCK, TOMOYO_MAC_FILE_TRUNCATE, TOMOYO_MAC_FILE_SYMLINK, TOMOYO_MAC_FILE_MKBLOCK, TOMOYO_MAC_FILE_MKCHAR, TOMOYO_MAC_FILE_LINK, TOMOYO_MAC_FILE_RENAME, TOMOYO_MAC_FILE_CHMOD, TOMOYO_MAC_FILE_CHOWN, TOMOYO_MAC_FILE_CHGRP, TOMOYO_MAC_FILE_IOCTL, TOMOYO_MAC_FILE_CHROOT, TOMOYO_MAC_FILE_MOUNT, TOMOYO_MAC_FILE_UMOUNT, TOMOYO_MAC_FILE_PIVOT_ROOT, TOMOYO_MAC_NETWORK_INET_STREAM_BIND, TOMOYO_MAC_NETWORK_INET_STREAM_LISTEN, TOMOYO_MAC_NETWORK_INET_STREAM_CONNECT, TOMOYO_MAC_NETWORK_INET_DGRAM_BIND, TOMOYO_MAC_NETWORK_INET_DGRAM_SEND, TOMOYO_MAC_NETWORK_INET_RAW_BIND, TOMOYO_MAC_NETWORK_INET_RAW_SEND, TOMOYO_MAC_NETWORK_UNIX_STREAM_BIND, TOMOYO_MAC_NETWORK_UNIX_STREAM_LISTEN, TOMOYO_MAC_NETWORK_UNIX_STREAM_CONNECT, TOMOYO_MAC_NETWORK_UNIX_DGRAM_BIND, TOMOYO_MAC_NETWORK_UNIX_DGRAM_SEND, TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_BIND, TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_LISTEN, TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_CONNECT, TOMOYO_MAC_ENVIRON, TOMOYO_MAX_MAC_INDEX }; /* Index numbers for category of functionality. */ enum tomoyo_mac_category_index { TOMOYO_MAC_CATEGORY_FILE, TOMOYO_MAC_CATEGORY_NETWORK, TOMOYO_MAC_CATEGORY_MISC, TOMOYO_MAX_MAC_CATEGORY_INDEX }; /* * Retry this request. Returned by tomoyo_supervisor() if policy violation has * occurred in enforcing mode and the userspace daemon decided to retry. * * We must choose a positive value in order to distinguish "granted" (which is * 0) and "rejected" (which is a negative value) and "retry". */ #define TOMOYO_RETRY_REQUEST 1 /* Index numbers for /sys/kernel/security/tomoyo/stat interface. */ enum tomoyo_policy_stat_type { /* Do not change this order. */ TOMOYO_STAT_POLICY_UPDATES, TOMOYO_STAT_POLICY_LEARNING, /* == TOMOYO_CONFIG_LEARNING */ TOMOYO_STAT_POLICY_PERMISSIVE, /* == TOMOYO_CONFIG_PERMISSIVE */ TOMOYO_STAT_POLICY_ENFORCING, /* == TOMOYO_CONFIG_ENFORCING */ TOMOYO_MAX_POLICY_STAT }; /* Index numbers for profile's PREFERENCE values. */ enum tomoyo_pref_index { TOMOYO_PREF_MAX_AUDIT_LOG, TOMOYO_PREF_MAX_LEARNING_ENTRY, TOMOYO_MAX_PREF }; /********** Structure definitions. **********/ /* Common header for holding ACL entries. */ struct tomoyo_acl_head { struct list_head list; s8 is_deleted; /* true or false or TOMOYO_GC_IN_PROGRESS */ } __packed; /* Common header for shared entries. */ struct tomoyo_shared_acl_head { struct list_head list; atomic_t users; } __packed; struct tomoyo_policy_namespace; /* Structure for request info. */ struct tomoyo_request_info { /* * For holding parameters specific to operations which deal files. * NULL if not dealing files. */ struct tomoyo_obj_info *obj; /* * For holding parameters specific to execve() request. * NULL if not dealing execve(). */ struct tomoyo_execve *ee; struct tomoyo_domain_info *domain; /* For holding parameters. */ union { struct { const struct tomoyo_path_info *filename; /* For using wildcards at tomoyo_find_next_domain(). */ const struct tomoyo_path_info *matched_path; /* One of values in "enum tomoyo_path_acl_index". */ u8 operation; } path; struct { const struct tomoyo_path_info *filename1; const struct tomoyo_path_info *filename2; /* One of values in "enum tomoyo_path2_acl_index". */ u8 operation; } path2; struct { const struct tomoyo_path_info *filename; unsigned int mode; unsigned int major; unsigned int minor; /* One of values in "enum tomoyo_mkdev_acl_index". 
*/ u8 operation; } mkdev; struct { const struct tomoyo_path_info *filename; unsigned long number; /* * One of values in * "enum tomoyo_path_number_acl_index". */ u8 operation; } path_number; struct { const struct tomoyo_path_info *name; } environ; struct { const __be32 *address; u16 port; /* One of values smaller than TOMOYO_SOCK_MAX. */ u8 protocol; /* One of values in "enum tomoyo_network_acl_index". */ u8 operation; bool is_ipv6; } inet_network; struct { const struct tomoyo_path_info *address; /* One of values smaller than TOMOYO_SOCK_MAX. */ u8 protocol; /* One of values in "enum tomoyo_network_acl_index". */ u8 operation; } unix_network; struct { const struct tomoyo_path_info *type; const struct tomoyo_path_info *dir; const struct tomoyo_path_info *dev; unsigned long flags; int need_dev; } mount; struct { const struct tomoyo_path_info *domainname; } task; } param; struct tomoyo_acl_info *matched_acl; u8 param_type; bool granted; u8 retry; u8 profile; u8 mode; /* One of tomoyo_mode_index . */ u8 type; }; /* Structure for holding a token. */ struct tomoyo_path_info { const char *name; u32 hash; /* = full_name_hash(name, strlen(name)) */ u16 const_len; /* = tomoyo_const_part_length(name) */ bool is_dir; /* = tomoyo_strendswith(name, "/") */ bool is_patterned; /* = tomoyo_path_contains_pattern(name) */ }; /* Structure for holding string data. */ struct tomoyo_name { struct tomoyo_shared_acl_head head; struct tomoyo_path_info entry; }; /* Structure for holding a word. */ struct tomoyo_name_union { /* Either @filename or @group is NULL. */ const struct tomoyo_path_info *filename; struct tomoyo_group *group; }; /* Structure for holding a number. */ struct tomoyo_number_union { unsigned long values[2]; struct tomoyo_group *group; /* Maybe NULL. */ /* One of values in "enum tomoyo_value_type". */ u8 value_type[2]; }; /* Structure for holding an IP address. */ struct tomoyo_ipaddr_union { struct in6_addr ip[2]; /* Big endian. */ struct tomoyo_group *group; /* Pointer to address group. */ bool is_ipv6; /* Valid only if @group == NULL. */ }; /* Structure for "path_group"/"number_group"/"address_group" directive. */ struct tomoyo_group { struct tomoyo_shared_acl_head head; const struct tomoyo_path_info *group_name; struct list_head member_list; }; /* Structure for "path_group" directive. */ struct tomoyo_path_group { struct tomoyo_acl_head head; const struct tomoyo_path_info *member_name; }; /* Structure for "number_group" directive. */ struct tomoyo_number_group { struct tomoyo_acl_head head; struct tomoyo_number_union number; }; /* Structure for "address_group" directive. */ struct tomoyo_address_group { struct tomoyo_acl_head head; /* Structure for holding an IP address. */ struct tomoyo_ipaddr_union address; }; /* Subset of "struct stat". Used by conditional ACL and audit logs. */ struct tomoyo_mini_stat { kuid_t uid; kgid_t gid; ino_t ino; umode_t mode; dev_t dev; dev_t rdev; }; /* Structure for dumping argv[] and envp[] of "struct linux_binprm". */ struct tomoyo_page_dump { struct page *page; /* Previously dumped page. */ char *data; /* Contents of "page". Size is PAGE_SIZE. */ }; /* Structure for attribute checks in addition to pathname checks. */ struct tomoyo_obj_info { /* * True if tomoyo_get_attributes() was already called, false otherwise. */ bool validate_done; /* True if @stat[] is valid. */ bool stat_valid[TOMOYO_MAX_PATH_STAT]; /* First pathname. Initialized with { NULL, NULL } if no path. */ struct path path1; /* Second pathname. Initialized with { NULL, NULL } if no path. 
*/ struct path path2; /* * Information on @path1, @path1's parent directory, @path2, @path2's * parent directory. */ struct tomoyo_mini_stat stat[TOMOYO_MAX_PATH_STAT]; /* * Content of symbolic link to be created. NULL for operations other * than symlink(). */ struct tomoyo_path_info *symlink_target; }; /* Structure for argv[]. */ struct tomoyo_argv { unsigned long index; const struct tomoyo_path_info *value; bool is_not; }; /* Structure for envp[]. */ struct tomoyo_envp { const struct tomoyo_path_info *name; const struct tomoyo_path_info *value; bool is_not; }; /* Structure for execve() operation. */ struct tomoyo_execve { struct tomoyo_request_info r; struct tomoyo_obj_info obj; struct linux_binprm *bprm; const struct tomoyo_path_info *transition; /* For dumping argv[] and envp[]. */ struct tomoyo_page_dump dump; /* For temporary use. */ char *tmp; /* Size is TOMOYO_EXEC_TMPSIZE bytes */ }; /* Structure for entries which follows "struct tomoyo_condition". */ struct tomoyo_condition_element { /* * Left hand operand. A "struct tomoyo_argv" for TOMOYO_ARGV_ENTRY, a * "struct tomoyo_envp" for TOMOYO_ENVP_ENTRY is attached to the tail * of the array of this struct. */ u8 left; /* * Right hand operand. A "struct tomoyo_number_union" for * TOMOYO_NUMBER_UNION, a "struct tomoyo_name_union" for * TOMOYO_NAME_UNION is attached to the tail of the array of this * struct. */ u8 right; /* Equation operator. True if equals or overlaps, false otherwise. */ bool equals; }; /* Structure for optional arguments. */ struct tomoyo_condition { struct tomoyo_shared_acl_head head; u32 size; /* Memory size allocated for this entry. */ u16 condc; /* Number of conditions in this struct. */ u16 numbers_count; /* Number of "struct tomoyo_number_union values". */ u16 names_count; /* Number of "struct tomoyo_name_union names". */ u16 argc; /* Number of "struct tomoyo_argv". */ u16 envc; /* Number of "struct tomoyo_envp". */ u8 grant_log; /* One of values in "enum tomoyo_grant_log". */ const struct tomoyo_path_info *transit; /* Maybe NULL. */ /* * struct tomoyo_condition_element condition[condc]; * struct tomoyo_number_union values[numbers_count]; * struct tomoyo_name_union names[names_count]; * struct tomoyo_argv argv[argc]; * struct tomoyo_envp envp[envc]; */ }; /* Common header for individual entries. */ struct tomoyo_acl_info { struct list_head list; struct tomoyo_condition *cond; /* Maybe NULL. */ s8 is_deleted; /* true or false or TOMOYO_GC_IN_PROGRESS */ u8 type; /* One of values in "enum tomoyo_acl_entry_type_index". */ } __packed; /* Structure for domain information. */ struct tomoyo_domain_info { struct list_head list; struct list_head acl_info_list; /* Name of this domain. Never NULL. */ const struct tomoyo_path_info *domainname; /* Namespace for this domain. Never NULL. */ struct tomoyo_policy_namespace *ns; /* Group numbers to use. */ unsigned long group[TOMOYO_MAX_ACL_GROUPS / BITS_PER_LONG]; u8 profile; /* Profile number to use. */ bool is_deleted; /* Delete flag. */ bool flags[TOMOYO_MAX_DOMAIN_INFO_FLAGS]; atomic_t users; /* Number of referring tasks. */ }; /* * Structure for "task manual_domain_transition" directive. */ struct tomoyo_task_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_MANUAL_TASK_ACL */ /* Pointer to domainname. */ const struct tomoyo_path_info *domainname; }; /* * Structure for "file execute", "file read", "file write", "file append", * "file unlink", "file getattr", "file rmdir", "file truncate", * "file symlink", "file chroot" and "file unmount" directive. 
*/ struct tomoyo_path_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_PATH_ACL */ u16 perm; /* Bitmask of values in "enum tomoyo_path_acl_index". */ struct tomoyo_name_union name; }; /* * Structure for "file create", "file mkdir", "file mkfifo", "file mksock", * "file ioctl", "file chmod", "file chown" and "file chgrp" directive. */ struct tomoyo_path_number_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_PATH_NUMBER_ACL */ /* Bitmask of values in "enum tomoyo_path_number_acl_index". */ u8 perm; struct tomoyo_name_union name; struct tomoyo_number_union number; }; /* Structure for "file mkblock" and "file mkchar" directive. */ struct tomoyo_mkdev_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_MKDEV_ACL */ u8 perm; /* Bitmask of values in "enum tomoyo_mkdev_acl_index". */ struct tomoyo_name_union name; struct tomoyo_number_union mode; struct tomoyo_number_union major; struct tomoyo_number_union minor; }; /* * Structure for "file rename", "file link" and "file pivot_root" directive. */ struct tomoyo_path2_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_PATH2_ACL */ u8 perm; /* Bitmask of values in "enum tomoyo_path2_acl_index". */ struct tomoyo_name_union name1; struct tomoyo_name_union name2; }; /* Structure for "file mount" directive. */ struct tomoyo_mount_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_MOUNT_ACL */ struct tomoyo_name_union dev_name; struct tomoyo_name_union dir_name; struct tomoyo_name_union fs_type; struct tomoyo_number_union flags; }; /* Structure for "misc env" directive in domain policy. */ struct tomoyo_env_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_ENV_ACL */ const struct tomoyo_path_info *env; /* environment variable */ }; /* Structure for "network inet" directive. */ struct tomoyo_inet_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_INET_ACL */ u8 protocol; u8 perm; /* Bitmask of values in "enum tomoyo_network_acl_index" */ struct tomoyo_ipaddr_union address; struct tomoyo_number_union port; }; /* Structure for "network unix" directive. */ struct tomoyo_unix_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_UNIX_ACL */ u8 protocol; u8 perm; /* Bitmask of values in "enum tomoyo_network_acl_index" */ struct tomoyo_name_union name; }; /* Structure for holding a line from /sys/kernel/security/tomoyo/ interface. */ struct tomoyo_acl_param { char *data; struct list_head *list; struct tomoyo_policy_namespace *ns; bool is_delete; }; #define TOMOYO_MAX_IO_READ_QUEUE 64 /* * Structure for reading/writing policy via /sys/kernel/security/tomoyo * interfaces. */ struct tomoyo_io_buffer { void (*read)(struct tomoyo_io_buffer *head); int (*write)(struct tomoyo_io_buffer *head); __poll_t (*poll)(struct file *file, poll_table *wait); /* Exclusive lock for this structure. */ struct mutex io_sem; char __user *read_user_buf; size_t read_user_buf_avail; struct { struct list_head *ns; struct list_head *domain; struct list_head *group; struct list_head *acl; size_t avail; unsigned int step; unsigned int query_index; u16 index; u16 cond_index; u8 acl_group_index; u8 cond_step; u8 bit; u8 w_pos; bool eof; bool print_this_domain_only; bool print_transition_related_only; bool print_cond_part; const char *w[TOMOYO_MAX_IO_READ_QUEUE]; } r; struct { struct tomoyo_policy_namespace *ns; /* The position currently writing to. */ struct tomoyo_domain_info *domain; /* Bytes available for writing. */ size_t avail; bool is_delete; } w; /* Buffer for reading. */ char *read_buf __guarded_by(&io_sem); /* Size of read buffer. 
*/ size_t readbuf_size __guarded_by(&io_sem); /* Buffer for writing. */ char *write_buf __guarded_by(&io_sem); /* Size of write buffer. */ size_t writebuf_size __guarded_by(&io_sem); /* Type of this interface. */ enum tomoyo_securityfs_interface_index type; /* Users counter protected by tomoyo_io_buffer_list_lock. */ u8 users; /* List for telling GC not to kfree() elements. */ struct list_head list; }; /* * Structure for "initialize_domain"/"no_initialize_domain"/"keep_domain"/ * "no_keep_domain" keyword. */ struct tomoyo_transition_control { struct tomoyo_acl_head head; u8 type; /* One of values in "enum tomoyo_transition_type". */ /* True if the domainname is tomoyo_get_last_name(). */ bool is_last_name; const struct tomoyo_path_info *domainname; /* Maybe NULL */ const struct tomoyo_path_info *program; /* Maybe NULL */ }; /* Structure for "aggregator" keyword. */ struct tomoyo_aggregator { struct tomoyo_acl_head head; const struct tomoyo_path_info *original_name; const struct tomoyo_path_info *aggregated_name; }; /* Structure for policy manager. */ struct tomoyo_manager { struct tomoyo_acl_head head; /* A path to program or a domainname. */ const struct tomoyo_path_info *manager; }; struct tomoyo_preference { unsigned int learning_max_entry; bool enforcing_verbose; bool learning_verbose; bool permissive_verbose; }; /* Structure for /sys/kernel/security/tomnoyo/profile interface. */ struct tomoyo_profile { const struct tomoyo_path_info *comment; struct tomoyo_preference *learning; struct tomoyo_preference *permissive; struct tomoyo_preference *enforcing; struct tomoyo_preference preference; u8 default_config; u8 config[TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX]; unsigned int pref[TOMOYO_MAX_PREF]; }; /* Structure for representing YYYY/MM/DD hh/mm/ss. */ struct tomoyo_time { u16 year; u8 month; u8 day; u8 hour; u8 min; u8 sec; }; /* Structure for policy namespace. */ struct tomoyo_policy_namespace { /* Profile table. Memory is allocated as needed. */ struct tomoyo_profile *profile_ptr[TOMOYO_MAX_PROFILES]; /* List of "struct tomoyo_group". */ struct list_head group_list[TOMOYO_MAX_GROUP]; /* List of policy. */ struct list_head policy_list[TOMOYO_MAX_POLICY]; /* The global ACL referred by "use_group" keyword. */ struct list_head acl_group[TOMOYO_MAX_ACL_GROUPS]; /* List for connecting to tomoyo_namespace_list list. */ struct list_head namespace_list; /* Profile version. Currently only 20150505 is defined. */ unsigned int profile_version; /* Name of this namespace (e.g. "<kernel>", "</usr/sbin/httpd>" ). */ const char *name; }; /* Structure for "struct task_struct"->security. */ struct tomoyo_task { struct tomoyo_domain_info *domain_info; struct tomoyo_domain_info *old_domain_info; }; /********** External variable definitions. 
**********/ extern bool tomoyo_policy_loaded; extern int tomoyo_enabled; extern const char * const tomoyo_condition_keyword [TOMOYO_MAX_CONDITION_KEYWORD]; extern const char * const tomoyo_dif[TOMOYO_MAX_DOMAIN_INFO_FLAGS]; extern const char * const tomoyo_mac_keywords[TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX]; extern const char * const tomoyo_mode[TOMOYO_CONFIG_MAX_MODE]; extern const char * const tomoyo_path_keyword[TOMOYO_MAX_PATH_OPERATION]; extern const char * const tomoyo_proto_keyword[TOMOYO_SOCK_MAX]; extern const char * const tomoyo_socket_keyword[TOMOYO_MAX_NETWORK_OPERATION]; extern const u8 tomoyo_index2category[TOMOYO_MAX_MAC_INDEX]; extern const u8 tomoyo_pn2mac[TOMOYO_MAX_PATH_NUMBER_OPERATION]; extern const u8 tomoyo_pnnn2mac[TOMOYO_MAX_MKDEV_OPERATION]; extern const u8 tomoyo_pp2mac[TOMOYO_MAX_PATH2_OPERATION]; extern struct list_head tomoyo_condition_list; extern struct list_head tomoyo_domain_list; extern struct list_head tomoyo_name_list[TOMOYO_MAX_HASH]; extern struct list_head tomoyo_namespace_list; extern struct mutex tomoyo_policy_lock; extern struct srcu_struct tomoyo_ss; extern struct tomoyo_domain_info tomoyo_kernel_domain; extern struct tomoyo_policy_namespace tomoyo_kernel_namespace; extern unsigned int tomoyo_memory_quota[TOMOYO_MAX_MEMORY_STAT]; extern unsigned int tomoyo_memory_used[TOMOYO_MAX_MEMORY_STAT]; extern struct lsm_blob_sizes tomoyo_blob_sizes; /********** Function prototypes. **********/ int tomoyo_interface_init(void); bool tomoyo_address_matches_group(const bool is_ipv6, const __be32 *address, const struct tomoyo_group *group); bool tomoyo_compare_number_union(const unsigned long value, const struct tomoyo_number_union *ptr); bool tomoyo_condition(struct tomoyo_request_info *r, const struct tomoyo_condition *cond); bool tomoyo_correct_domain(const unsigned char *domainname); bool tomoyo_correct_path(const char *filename); bool tomoyo_correct_word(const char *string); bool tomoyo_domain_def(const unsigned char *buffer); bool tomoyo_domain_quota_is_ok(struct tomoyo_request_info *r); bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, struct tomoyo_page_dump *dump); bool tomoyo_memory_ok(void *ptr); bool tomoyo_number_matches_group(const unsigned long min, const unsigned long max, const struct tomoyo_group *group); bool tomoyo_parse_ipaddr_union(struct tomoyo_acl_param *param, struct tomoyo_ipaddr_union *ptr); bool tomoyo_parse_name_union(struct tomoyo_acl_param *param, struct tomoyo_name_union *ptr); bool tomoyo_parse_number_union(struct tomoyo_acl_param *param, struct tomoyo_number_union *ptr); bool tomoyo_path_matches_pattern(const struct tomoyo_path_info *filename, const struct tomoyo_path_info *pattern); bool tomoyo_permstr(const char *string, const char *keyword); bool tomoyo_str_starts(char **src, const char *find); char *tomoyo_encode(const char *str); char *tomoyo_encode2(const char *str, int str_len); char *tomoyo_init_log(struct tomoyo_request_info *r, int len, const char *fmt, va_list args) __printf(3, 0); char *tomoyo_read_token(struct tomoyo_acl_param *param); char *tomoyo_realpath_from_path(const struct path *path); char *tomoyo_realpath_nofollow(const char *pathname); const char *tomoyo_get_exe(void); const struct tomoyo_path_info *tomoyo_compare_name_union (const struct tomoyo_path_info *name, const struct tomoyo_name_union *ptr); const struct tomoyo_path_info *tomoyo_get_domainname (struct tomoyo_acl_param *param); const struct tomoyo_path_info *tomoyo_get_name(const char *name); const struct 
tomoyo_path_info *tomoyo_path_matches_group (const struct tomoyo_path_info *pathname, const struct tomoyo_group *group); int tomoyo_check_open_permission(struct tomoyo_domain_info *domain, const struct path *path, const int flag); void tomoyo_close_control(struct tomoyo_io_buffer *head); int tomoyo_env_perm(struct tomoyo_request_info *r, const char *env) __must_hold_shared(&tomoyo_ss); int tomoyo_execute_permission(struct tomoyo_request_info *r, const struct tomoyo_path_info *filename) __must_hold_shared(&tomoyo_ss); int tomoyo_find_next_domain(struct linux_binprm *bprm) __must_hold_shared(&tomoyo_ss); int tomoyo_get_mode(const struct tomoyo_policy_namespace *ns, const u8 profile, const u8 index); int tomoyo_init_request_info(struct tomoyo_request_info *r, struct tomoyo_domain_info *domain, const u8 index); int tomoyo_mkdev_perm(const u8 operation, const struct path *path, const unsigned int mode, unsigned int dev); int tomoyo_mount_permission(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data_page); int tomoyo_open_control(const u8 type, struct file *file); int tomoyo_path2_perm(const u8 operation, const struct path *path1, const struct path *path2); int tomoyo_path_number_perm(const u8 operation, const struct path *path, unsigned long number); int tomoyo_path_perm(const u8 operation, const struct path *path, const char *target); __poll_t tomoyo_poll_control(struct file *file, poll_table *wait); __poll_t tomoyo_poll_log(struct file *file, poll_table *wait); int tomoyo_socket_bind_permission(struct socket *sock, struct sockaddr *addr, int addr_len); int tomoyo_socket_connect_permission(struct socket *sock, struct sockaddr *addr, int addr_len); int tomoyo_socket_listen_permission(struct socket *sock); int tomoyo_socket_sendmsg_permission(struct socket *sock, struct msghdr *msg, int size); int tomoyo_supervisor(struct tomoyo_request_info *r, const char *fmt, ...) 
__must_hold_shared(&tomoyo_ss) __printf(2, 3); int tomoyo_update_domain(struct tomoyo_acl_info *new_entry, const int size, struct tomoyo_acl_param *param, bool (*check_duplicate) (const struct tomoyo_acl_info *, const struct tomoyo_acl_info *), bool (*merge_duplicate) (struct tomoyo_acl_info *, struct tomoyo_acl_info *, const bool)); int tomoyo_update_policy(struct tomoyo_acl_head *new_entry, const int size, struct tomoyo_acl_param *param, bool (*check_duplicate) (const struct tomoyo_acl_head *, const struct tomoyo_acl_head *)); int tomoyo_write_aggregator(struct tomoyo_acl_param *param); int tomoyo_write_file(struct tomoyo_acl_param *param); int tomoyo_write_group(struct tomoyo_acl_param *param, const u8 type); int tomoyo_write_misc(struct tomoyo_acl_param *param); int tomoyo_write_inet_network(struct tomoyo_acl_param *param); int tomoyo_write_transition_control(struct tomoyo_acl_param *param, const u8 type); int tomoyo_write_unix_network(struct tomoyo_acl_param *param); ssize_t tomoyo_read_control(struct tomoyo_io_buffer *head, char __user *buffer, const int buffer_len); ssize_t tomoyo_write_control(struct tomoyo_io_buffer *head, const char __user *buffer, const int buffer_len); struct tomoyo_condition *tomoyo_get_condition(struct tomoyo_acl_param *param); struct tomoyo_domain_info *tomoyo_assign_domain(const char *domainname, const bool transit); struct tomoyo_domain_info *tomoyo_domain(void); struct tomoyo_domain_info *tomoyo_find_domain(const char *domainname); struct tomoyo_group *tomoyo_get_group(struct tomoyo_acl_param *param, const u8 idx); struct tomoyo_policy_namespace *tomoyo_assign_namespace (const char *domainname); struct tomoyo_profile *tomoyo_profile(const struct tomoyo_policy_namespace *ns, const u8 profile); u8 tomoyo_parse_ulong(unsigned long *result, char **str); void *tomoyo_commit_ok(void *data, const unsigned int size); void __init tomoyo_load_builtin_policy(void); void __init tomoyo_mm_init(void); void tomoyo_check_acl(struct tomoyo_request_info *r, bool (*check_entry)(struct tomoyo_request_info *, const struct tomoyo_acl_info *)); void tomoyo_check_profile(void); void tomoyo_convert_time(time64_t time, struct tomoyo_time *stamp); void tomoyo_del_condition(struct list_head *element); void tomoyo_fill_path_info(struct tomoyo_path_info *ptr); void tomoyo_get_attributes(struct tomoyo_obj_info *obj); void tomoyo_init_policy_namespace(struct tomoyo_policy_namespace *ns); void tomoyo_load_policy(const char *filename); void tomoyo_normalize_line(unsigned char *buffer); void tomoyo_notify_gc(struct tomoyo_io_buffer *head, const bool is_register); void tomoyo_print_ip(char *buf, const unsigned int size, const struct tomoyo_ipaddr_union *ptr); void tomoyo_print_ulong(char *buffer, const int buffer_len, const unsigned long value, const u8 type); void tomoyo_put_name_union(struct tomoyo_name_union *ptr); void tomoyo_put_number_union(struct tomoyo_number_union *ptr); void tomoyo_read_log(struct tomoyo_io_buffer *head) __must_hold(&head->io_sem); void tomoyo_update_stat(const u8 index); void tomoyo_warn_oom(const char *function); void tomoyo_write_log(struct tomoyo_request_info *r, const char *fmt, ...) __printf(2, 3); void tomoyo_write_log2(struct tomoyo_request_info *r, int len, const char *fmt, va_list args) __printf(3, 0); /********** Inlined functions. **********/ /** * tomoyo_read_lock - Take lock for protecting policy. * * Returns index number for tomoyo_read_unlock(). 
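 *
 * Typical usage (illustrative sketch; the locals shown are hypothetical and
 * not taken from an actual caller). The returned index must be handed back
 * to tomoyo_read_unlock(), and SRCU-protected lists such as
 * tomoyo_domain_list can be walked in between, e.g. with the
 * list_for_each_cookie() iterator defined at the end of this header:
 *
 *	struct list_head *pos = NULL;
 *	const int idx = tomoyo_read_lock();
 *
 *	list_for_each_cookie(pos, &tomoyo_domain_list) {
 *		struct tomoyo_domain_info *domain =
 *			list_entry(pos, typeof(*domain), list);
 *		...
 *	}
 *	tomoyo_read_unlock(idx);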
*/ static inline int tomoyo_read_lock(void) __acquires_shared(&tomoyo_ss) { return srcu_read_lock(&tomoyo_ss); } /** * tomoyo_read_unlock - Release lock for protecting policy. * * @idx: Index number returned by tomoyo_read_lock(). * * Returns nothing. */ static inline void tomoyo_read_unlock(int idx) __releases_shared(&tomoyo_ss) { srcu_read_unlock(&tomoyo_ss, idx); } /** * tomoyo_sys_getppid - Copy of getppid(). * * Returns parent process's PID. * * Alpha does not have getppid() defined. To be able to build this module on * Alpha, I have to copy getppid() from kernel/timer.c. */ static inline pid_t tomoyo_sys_getppid(void) { pid_t pid; rcu_read_lock(); pid = task_tgid_vnr(rcu_dereference(current->real_parent)); rcu_read_unlock(); return pid; } /** * tomoyo_sys_getpid - Copy of getpid(). * * Returns current thread's PID. * * Alpha does not have getpid() defined. To be able to build this module on * Alpha, I have to copy getpid() from kernel/timer.c. */ static inline pid_t tomoyo_sys_getpid(void) { return task_tgid_vnr(current); } /** * tomoyo_pathcmp - strcmp() for "struct tomoyo_path_info" structure. * * @a: Pointer to "struct tomoyo_path_info". * @b: Pointer to "struct tomoyo_path_info". * * Returns true if @a == @b, false otherwise. */ static inline bool tomoyo_pathcmp(const struct tomoyo_path_info *a, const struct tomoyo_path_info *b) { return a->hash != b->hash || strcmp(a->name, b->name); } /** * tomoyo_put_name - Drop reference on "struct tomoyo_name". * * @name: Pointer to "struct tomoyo_path_info". Maybe NULL. * * Returns nothing. */ static inline void tomoyo_put_name(const struct tomoyo_path_info *name) { if (name) { struct tomoyo_name *ptr = container_of(name, typeof(*ptr), entry); atomic_dec(&ptr->head.users); } } /** * tomoyo_put_condition - Drop reference on "struct tomoyo_condition". * * @cond: Pointer to "struct tomoyo_condition". Maybe NULL. * * Returns nothing. */ static inline void tomoyo_put_condition(struct tomoyo_condition *cond) { if (cond) atomic_dec(&cond->head.users); } /** * tomoyo_put_group - Drop reference on "struct tomoyo_group". * * @group: Pointer to "struct tomoyo_group". Maybe NULL. * * Returns nothing. */ static inline void tomoyo_put_group(struct tomoyo_group *group) { if (group) atomic_dec(&group->head.users); } /** * tomoyo_task - Get "struct tomoyo_task" for specified thread. * * @task - Pointer to "struct task_struct". * * Returns pointer to "struct tomoyo_task" for specified thread. */ static inline struct tomoyo_task *tomoyo_task(struct task_struct *task) { return task->security + tomoyo_blob_sizes.lbs_task; } /** * tomoyo_same_name_union - Check for duplicated "struct tomoyo_name_union" entry. * * @a: Pointer to "struct tomoyo_name_union". * @b: Pointer to "struct tomoyo_name_union". * * Returns true if @a == @b, false otherwise. */ static inline bool tomoyo_same_name_union (const struct tomoyo_name_union *a, const struct tomoyo_name_union *b) { return a->filename == b->filename && a->group == b->group; } /** * tomoyo_same_number_union - Check for duplicated "struct tomoyo_number_union" entry. * * @a: Pointer to "struct tomoyo_number_union". * @b: Pointer to "struct tomoyo_number_union". * * Returns true if @a == @b, false otherwise. 
*/ static inline bool tomoyo_same_number_union (const struct tomoyo_number_union *a, const struct tomoyo_number_union *b) { return a->values[0] == b->values[0] && a->values[1] == b->values[1] && a->group == b->group && a->value_type[0] == b->value_type[0] && a->value_type[1] == b->value_type[1]; } /** * tomoyo_same_ipaddr_union - Check for duplicated "struct tomoyo_ipaddr_union" entry. * * @a: Pointer to "struct tomoyo_ipaddr_union". * @b: Pointer to "struct tomoyo_ipaddr_union". * * Returns true if @a == @b, false otherwise. */ static inline bool tomoyo_same_ipaddr_union (const struct tomoyo_ipaddr_union *a, const struct tomoyo_ipaddr_union *b) { return !memcmp(a->ip, b->ip, sizeof(a->ip)) && a->group == b->group && a->is_ipv6 == b->is_ipv6; } /** * tomoyo_current_namespace - Get "struct tomoyo_policy_namespace" for current thread. * * Returns pointer to "struct tomoyo_policy_namespace" for current thread. */ static inline struct tomoyo_policy_namespace *tomoyo_current_namespace(void) { return tomoyo_domain()->ns; } /** * list_for_each_cookie - iterate over a list with cookie. * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each_cookie(pos, head) \ if (!pos) \ pos = srcu_dereference((head)->next, &tomoyo_ss); \ for ( ; pos != (head); pos = srcu_dereference(pos->next, &tomoyo_ss)) #endif /* !defined(_SECURITY_TOMOYO_COMMON_H) */
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs_platform.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_log.h" #include "xfs_rmap_btree.h" #include "xfs_refcount_btree.h" #include "xfs_da_format.h" #include "xfs_health.h" #include "xfs_ag.h" #include "xfs_rtbitmap.h" #include "xfs_exchrange.h" #include "xfs_rtgroup.h" #include "xfs_rtrmap_btree.h" #include "xfs_rtrefcount_btree.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. */ /* * Check that all the V4 feature bits that the V5 filesystem format requires are * correctly set. */ static bool xfs_sb_validate_v5_features( struct xfs_sb *sbp) { /* We must not have any unknown V4 feature bits set */ if (sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) return false; /* * The CRC bit is considered an invalid V4 flag, so we have to add it * manually to the OKBITS mask. */ if (sbp->sb_features2 & ~(XFS_SB_VERSION2_OKBITS | XFS_SB_VERSION2_CRCBIT)) return false; /* Now check all the required V4 feature flags are set. */ #define V5_VERS_FLAGS (XFS_SB_VERSION_NLINKBIT | \ XFS_SB_VERSION_ALIGNBIT | \ XFS_SB_VERSION_LOGV2BIT | \ XFS_SB_VERSION_EXTFLGBIT | \ XFS_SB_VERSION_DIRV2BIT | \ XFS_SB_VERSION_MOREBITSBIT) #define V5_FEAT_FLAGS (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ XFS_SB_VERSION2_ATTR2BIT | \ XFS_SB_VERSION2_PROJID32BIT | \ XFS_SB_VERSION2_CRCBIT) if ((sbp->sb_versionnum & V5_VERS_FLAGS) != V5_VERS_FLAGS) return false; if ((sbp->sb_features2 & V5_FEAT_FLAGS) != V5_FEAT_FLAGS) return false; return true; } /* * We currently support XFS v5 formats with known features and v4 superblocks with * at least V2 directories. */ bool xfs_sb_good_version( struct xfs_sb *sbp) { /* * All v5 filesystems are supported, but we must check that all the * required v4 feature flags are enabled correctly as the code checks * those flags and not for v5 support.
*/ if (xfs_sb_is_v5(sbp)) return xfs_sb_validate_v5_features(sbp); /* versions prior to v4 are not supported */ if (XFS_SB_VERSION_NUM(sbp) != XFS_SB_VERSION_4) return false; /* We must not have any unknown v4 feature bits set */ if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) return false; /* V4 filesystems need v2 directories and unwritten extents */ if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)) return false; if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT)) return false; /* It's a supported v4 filesystem */ return true; } uint64_t xfs_sb_version_to_features( struct xfs_sb *sbp) { uint64_t features = 0; /* optional V4 features */ if (sbp->sb_rblocks > 0) features |= XFS_FEAT_REALTIME; if (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT) features |= XFS_FEAT_NLINK; if (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT) features |= XFS_FEAT_ATTR; if (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT) features |= XFS_FEAT_QUOTA; if (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT) features |= XFS_FEAT_ALIGN; if (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT) features |= XFS_FEAT_LOGV2; if (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT) features |= XFS_FEAT_DALIGN; if (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT) features |= XFS_FEAT_EXTFLG; if (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT) features |= XFS_FEAT_SECTOR; if (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT) features |= XFS_FEAT_ASCIICI; if (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) { if (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT) features |= XFS_FEAT_LAZYSBCOUNT; if (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT) features |= XFS_FEAT_PROJID32; if (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE) features |= XFS_FEAT_FTYPE; } if (!xfs_sb_is_v5(sbp)) return features; /* Always on V5 features */ features |= XFS_FEAT_ALIGN | XFS_FEAT_LOGV2 | XFS_FEAT_EXTFLG | XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_PROJID32 | XFS_FEAT_V3INODES | XFS_FEAT_CRC | XFS_FEAT_PQUOTINO; /* Optional V5 features */ if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT) features |= XFS_FEAT_FINOBT; if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT) features |= XFS_FEAT_RMAPBT; if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK) features |= XFS_FEAT_REFLINK; if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT) features |= XFS_FEAT_INOBTCNT; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE) features |= XFS_FEAT_FTYPE; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES) features |= XFS_FEAT_SPINODES; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID) features |= XFS_FEAT_META_UUID; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_BIGTIME) features |= XFS_FEAT_BIGTIME; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR) features |= XFS_FEAT_NEEDSREPAIR; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NREXT64) features |= XFS_FEAT_NREXT64; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_EXCHRANGE) features |= XFS_FEAT_EXCHANGE_RANGE; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_PARENT) features |= XFS_FEAT_PARENT; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) features |= XFS_FEAT_METADIR; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) features |= XFS_FEAT_ZONED; return features; } /* Check all the superblock fields we care about when reading one in. 
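 *
 * For V5 superblocks, unknown compat features only produce a warning,
 * unknown ro-compat features are tolerated only on read-only mounts, and
 * unknown incompat features fail the mount outright.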
*/ STATIC int xfs_validate_sb_read( struct xfs_mount *mp, struct xfs_sb *sbp) { if (!xfs_sb_is_v5(sbp)) return 0; /* * Version 5 superblock feature mask validation. Reject combinations * the kernel cannot support up front before checking anything else. */ if (xfs_sb_has_compat_feature(sbp, XFS_SB_FEAT_COMPAT_UNKNOWN)) { xfs_warn(mp, "Superblock has unknown compatible features (0x%x) enabled.", (sbp->sb_features_compat & XFS_SB_FEAT_COMPAT_UNKNOWN)); xfs_warn(mp, "Using a more recent kernel is recommended."); } if (xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { xfs_alert(mp, "Superblock has unknown read-only compatible features (0x%x) enabled.", (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); if (!xfs_is_readonly(mp)) { xfs_warn(mp, "Attempted to mount read-only compatible filesystem read-write."); xfs_warn(mp, "Filesystem can only be safely mounted read only."); return -EINVAL; } } if (xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_UNKNOWN)) { xfs_warn(mp, "Superblock has unknown incompatible features (0x%x) enabled.", (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_UNKNOWN)); xfs_warn(mp, "Filesystem cannot be safely mounted by this kernel."); return -EINVAL; } return 0; } /* Return the number of extents covered by a single rt bitmap file */ static xfs_rtbxlen_t xfs_extents_per_rbm( struct xfs_sb *sbp) { if (xfs_sb_is_v5(sbp) && (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) return sbp->sb_rgextents; return sbp->sb_rextents; } /* * Return the payload size of a single rt bitmap block (without the metadata * header if any). */ static inline unsigned int xfs_rtbmblock_size( struct xfs_sb *sbp) { if (xfs_sb_is_v5(sbp) && (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) return sbp->sb_blocksize - sizeof(struct xfs_rtbuf_blkinfo); return sbp->sb_blocksize; } static uint64_t xfs_expected_rbmblocks( struct xfs_sb *sbp) { if (xfs_sb_is_v5(sbp) && (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) return 0; return howmany_64(xfs_extents_per_rbm(sbp), NBBY * xfs_rtbmblock_size(sbp)); } /* Validate the realtime geometry */ bool xfs_validate_rt_geometry( struct xfs_sb *sbp) { if (xfs_sb_is_v5(sbp) && (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) { if (sbp->sb_rextsize != 1) return false; } else { if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) return false; } if (sbp->sb_rblocks == 0) { if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 || sbp->sb_rextslog != 0 || sbp->sb_frextents != 0) return false; return true; } if (sbp->sb_rextents == 0 || sbp->sb_rextents != div_u64(sbp->sb_rblocks, sbp->sb_rextsize) || sbp->sb_rextslog != xfs_compute_rextslog(sbp->sb_rextents) || sbp->sb_rbmblocks != xfs_expected_rbmblocks(sbp)) return false; if (xfs_sb_is_v5(sbp) && (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) { uint32_t mod; /* * Zoned RT devices must be aligned to the RT group size, * because garbage collection assumes that all zones have the * same size to avoid insane complexity if that weren't the * case. */ div_u64_rem(sbp->sb_rextents, sbp->sb_rgextents, &mod); if (mod) return false; } return true; } /* Check all the superblock fields we care about when writing one out. */ STATIC int xfs_validate_sb_write( struct xfs_mount *mp, struct xfs_buf *bp, struct xfs_sb *sbp) { /* * Carry out additional sb summary counter sanity checks when we write * the superblock. 
We skip this in the read validator because there * could be newer superblocks in the log and if the values are garbage * even after replay we'll recalculate them at the end of log mount. * * mkfs has traditionally written zeroed counters to inprogress and * secondary superblocks, so allow this usage to continue because * we never read counters from such superblocks. */ if (xfs_buf_daddr(bp) == XFS_SB_DADDR && !sbp->sb_inprogress && (sbp->sb_fdblocks > sbp->sb_dblocks || !xfs_verify_icount(mp, sbp->sb_icount) || sbp->sb_ifree > sbp->sb_icount)) { xfs_warn(mp, "SB summary counter sanity check failed"); return -EFSCORRUPTED; } if (!xfs_sb_is_v5(sbp)) return 0; /* * Version 5 superblock feature mask validation. Reject combinations * the kernel cannot support since we checked for unsupported bits in * the read verifier, which means that memory is corrupt. */ if (!xfs_is_readonly(mp) && xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { xfs_alert(mp, "Corruption detected in superblock read-only compatible features (0x%x)!", (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); return -EFSCORRUPTED; } if (xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_UNKNOWN)) { xfs_warn(mp, "Corruption detected in superblock incompatible features (0x%x)!", (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_UNKNOWN)); return -EFSCORRUPTED; } if (xfs_sb_has_incompat_log_feature(sbp, XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) { xfs_warn(mp, "Corruption detected in superblock incompatible log features (0x%x)!", (sbp->sb_features_log_incompat & XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)); return -EFSCORRUPTED; } /* * We can't read verify the sb LSN because the read verifier is called * before the log is allocated and processed. We know the log is set up * before write verifier calls, so check it here. 
*/ if (!xfs_log_check_lsn(mp, sbp->sb_lsn)) return -EFSCORRUPTED; return 0; } int xfs_compute_rgblklog( xfs_rtxlen_t rgextents, xfs_rgblock_t rextsize) { uint64_t rgblocks = (uint64_t)rgextents * rextsize; return xfs_highbit64(rgblocks - 1) + 1; } static int xfs_validate_sb_rtgroups( struct xfs_mount *mp, struct xfs_sb *sbp) { uint64_t groups; int rgblklog; if (sbp->sb_rextsize == 0) { xfs_warn(mp, "Realtime extent size must not be zero."); return -EINVAL; } if (sbp->sb_rgextents > XFS_MAX_RGBLOCKS / sbp->sb_rextsize) { xfs_warn(mp, "Realtime group size (%u) must be less than %u rt extents.", sbp->sb_rgextents, XFS_MAX_RGBLOCKS / sbp->sb_rextsize); return -EINVAL; } if (sbp->sb_rgextents < XFS_MIN_RGEXTENTS) { xfs_warn(mp, "Realtime group size (%u) must be at least %u rt extents.", sbp->sb_rgextents, XFS_MIN_RGEXTENTS); return -EINVAL; } if (sbp->sb_rgcount > XFS_MAX_RGNUMBER) { xfs_warn(mp, "Realtime groups (%u) must be less than %u.", sbp->sb_rgcount, XFS_MAX_RGNUMBER); return -EINVAL; } groups = howmany_64(sbp->sb_rextents, sbp->sb_rgextents); if (groups != sbp->sb_rgcount) { xfs_warn(mp, "Realtime groups (%u) do not cover the entire rt section; need (%llu) groups.", sbp->sb_rgcount, groups); return -EINVAL; } /* Exchange-range is required for fsr to work on realtime files */ if (!(sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_EXCHRANGE)) { xfs_warn(mp, "Realtime groups feature requires exchange-range support."); return -EINVAL; } rgblklog = xfs_compute_rgblklog(sbp->sb_rgextents, sbp->sb_rextsize); if (sbp->sb_rgblklog != rgblklog) { xfs_warn(mp, "Realtime group log (%d) does not match expected value (%d).", sbp->sb_rgblklog, rgblklog); return -EINVAL; } return 0; } static int xfs_validate_sb_zoned( struct xfs_mount *mp, struct xfs_sb *sbp) { if (sbp->sb_frextents != 0) { xfs_warn(mp, "sb_frextents must be zero for zoned file systems."); return -EINVAL; } if (sbp->sb_rtstart && sbp->sb_rtstart < sbp->sb_dblocks) { xfs_warn(mp, "sb_rtstart (%lld) overlaps sb_dblocks (%lld).", sbp->sb_rtstart, sbp->sb_dblocks); return -EINVAL; } if (sbp->sb_rtreserved && sbp->sb_rtreserved >= sbp->sb_rblocks) { xfs_warn(mp, "sb_rtreserved (%lld) larger than sb_rblocks (%lld).", sbp->sb_rtreserved, sbp->sb_rblocks); return -EINVAL; } return 0; } /* Check the validity of the SB. */ STATIC int xfs_validate_sb_common( struct xfs_mount *mp, struct xfs_buf *bp, struct xfs_sb *sbp) { struct xfs_dsb *dsb = bp->b_addr; uint32_t agcount = 0; uint32_t rem; bool has_dalign; int error; if (!xfs_verify_magic(bp, dsb->sb_magicnum)) { xfs_warn(mp, "Superblock has bad magic number 0x%x. Not an XFS filesystem?", be32_to_cpu(dsb->sb_magicnum)); return -EWRONGFS; } if (!xfs_sb_good_version(sbp)) { xfs_warn(mp, "Superblock has unknown features enabled or corrupted feature masks."); return -EWRONGFS; } /* * Validate feature flags and state */ if (xfs_sb_is_v5(sbp)) { if (sbp->sb_blocksize < XFS_MIN_CRC_BLOCKSIZE) { xfs_notice(mp, "Block size (%u bytes) too small for Version 5 superblock (minimum %d bytes)", sbp->sb_blocksize, XFS_MIN_CRC_BLOCKSIZE); return -EFSCORRUPTED; } /* V5 has a separate project quota inode */ if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) { xfs_notice(mp, "Version 5 of Super block has XFS_OQUOTA bits."); return -EFSCORRUPTED; } /* * Full inode chunks must be aligned to inode chunk size when * sparse inodes are enabled to support the sparse chunk * allocation algorithm and prevent overlapping inode records. 
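 *
 * Worked example: with 512-byte inodes, 4096-byte blocks (sb_blocklog = 12)
 * and XFS_INODES_PER_CHUNK = 64, a full inode chunk spans
 * 64 * 512 >> 12 = 8 filesystem blocks, so sb_inoalignmt must be 8 and
 * sb_spino_align, if set, must be a divisor of 8.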
*/ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES) { uint32_t align; align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize >> sbp->sb_blocklog; if (sbp->sb_inoalignmt != align) { xfs_warn(mp, "Inode block alignment (%u) must match chunk size (%u) for sparse inodes.", sbp->sb_inoalignmt, align); return -EINVAL; } if (sbp->sb_spino_align && (sbp->sb_spino_align > sbp->sb_inoalignmt || (sbp->sb_inoalignmt % sbp->sb_spino_align) != 0)) { xfs_warn(mp, "Sparse inode alignment (%u) is invalid, must be integer factor of (%u).", sbp->sb_spino_align, sbp->sb_inoalignmt); return -EINVAL; } } else if (sbp->sb_spino_align) { xfs_warn(mp, "Sparse inode alignment (%u) should be zero.", sbp->sb_spino_align); return -EINVAL; } if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) { if (memchr_inv(sbp->sb_pad, 0, sizeof(sbp->sb_pad))) { xfs_warn(mp, "Metadir superblock padding fields must be zero."); return -EINVAL; } error = xfs_validate_sb_rtgroups(mp, sbp); if (error) return error; } if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { error = xfs_validate_sb_zoned(mp, sbp); if (error) return error; } } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) { xfs_notice(mp, "Superblock earlier than Version 5 has XFS_{P|G}QUOTA_{ENFD|CHKD} bits."); return -EFSCORRUPTED; } if (unlikely( sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { xfs_warn(mp, "filesystem is marked as having an external log; " "specify logdev on the mount command line."); return -EINVAL; } if (unlikely( sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { xfs_warn(mp, "filesystem is marked as having an internal log; " "do not specify logdev on the mount command line."); return -EINVAL; } /* Compute agcount for this number of dblocks and agblocks */ if (sbp->sb_agblocks) { agcount = div_u64_rem(sbp->sb_dblocks, sbp->sb_agblocks, &rem); if (rem) agcount++; } /* * More sanity checking. Most of these were stolen directly from * xfs_repair. 
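 *
 * Every size/log2 pair must agree (e.g. sb_blocksize == 1 << sb_blocklog)
 * and the AG count computed above must match sb_agcount exactly.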
*/ if (unlikely( sbp->sb_agcount <= 0 || sbp->sb_sectsize < XFS_MIN_SECTORSIZE || sbp->sb_sectsize > XFS_MAX_SECTORSIZE || sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG || sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG || sbp->sb_sectsize != (1 << sbp->sb_sectlog) || sbp->sb_blocksize < XFS_MIN_BLOCKSIZE || sbp->sb_blocksize > XFS_MAX_BLOCKSIZE || sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG || sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG || sbp->sb_blocksize != (1 << sbp->sb_blocklog) || sbp->sb_dirblklog + sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG || sbp->sb_inodesize < XFS_DINODE_MIN_SIZE || sbp->sb_inodesize > XFS_DINODE_MAX_SIZE || sbp->sb_inodelog < XFS_DINODE_MIN_LOG || sbp->sb_inodelog > XFS_DINODE_MAX_LOG || sbp->sb_inodesize != (1 << sbp->sb_inodelog) || sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) || XFS_FSB_TO_B(mp, sbp->sb_agblocks) < XFS_MIN_AG_BYTES || XFS_FSB_TO_B(mp, sbp->sb_agblocks) > XFS_MAX_AG_BYTES || sbp->sb_agblklog != xfs_highbit32(sbp->sb_agblocks - 1) + 1 || agcount == 0 || agcount != sbp->sb_agcount || (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */) || sbp->sb_dblocks == 0 || sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) || sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp) || sbp->sb_shared_vn != 0)) { xfs_notice(mp, "SB sanity check failed"); return -EFSCORRUPTED; } /* * Logs that are too large are not supported at all. Reject them * outright. Logs that are too small are tolerated on v4 filesystems, * but we can only check that when mounting the log. Hence we skip * those checks here. */ if (sbp->sb_logblocks > XFS_MAX_LOG_BLOCKS) { xfs_notice(mp, "Log size 0x%x blocks too large, maximum size is 0x%llx blocks", sbp->sb_logblocks, XFS_MAX_LOG_BLOCKS); return -EFSCORRUPTED; } if (XFS_FSB_TO_B(mp, sbp->sb_logblocks) > XFS_MAX_LOG_BYTES) { xfs_warn(mp, "log size 0x%llx bytes too large, maximum size is 0x%llx bytes", XFS_FSB_TO_B(mp, sbp->sb_logblocks), XFS_MAX_LOG_BYTES); return -EFSCORRUPTED; } /* * Do not allow filesystems with corrupted log sector or stripe units to * be mounted. We cannot safely size the iclogs or write to the log if * the log stripe unit is not valid. */ if (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT) { if (sbp->sb_logsectsize != (1U << sbp->sb_logsectlog)) { xfs_notice(mp, "log sector size in bytes/log2 (0x%x/0x%x) must match", sbp->sb_logsectsize, 1U << sbp->sb_logsectlog); return -EFSCORRUPTED; } } else if (sbp->sb_logsectsize || sbp->sb_logsectlog) { xfs_notice(mp, "log sector size in bytes/log2 (0x%x/0x%x) are not zero", sbp->sb_logsectsize, sbp->sb_logsectlog); return -EFSCORRUPTED; } if (sbp->sb_logsunit > 1) { if (sbp->sb_logsunit % sbp->sb_blocksize) { xfs_notice(mp, "log stripe unit 0x%x bytes must be a multiple of block size", sbp->sb_logsunit); return -EFSCORRUPTED; } if (sbp->sb_logsunit > XLOG_MAX_RECORD_BSIZE) { xfs_notice(mp, "log stripe unit 0x%x bytes over maximum size (0x%x bytes)", sbp->sb_logsunit, XLOG_MAX_RECORD_BSIZE); return -EFSCORRUPTED; } } if (!xfs_validate_rt_geometry(sbp)) { xfs_notice(mp, "realtime %sgeometry check failed", sbp->sb_rblocks ? "" : "zeroed "); return -EFSCORRUPTED; } /* * Either (sb_unit and !hasdalign) or (!sb_unit and hasdalign) * would imply the image is corrupted. 
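 *
 * The XOR below is true exactly when a non-zero sb_unit and the DALIGN
 * version bit disagree with each other.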
*/ has_dalign = sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT; if (!!sbp->sb_unit ^ has_dalign) { xfs_notice(mp, "SB stripe alignment sanity check failed"); return -EFSCORRUPTED; } if (!xfs_validate_stripe_geometry(mp, XFS_FSB_TO_B(mp, sbp->sb_unit), XFS_FSB_TO_B(mp, sbp->sb_width), 0, xfs_buf_daddr(bp) == XFS_SB_DADDR, false)) return -EFSCORRUPTED; /* * Currently only very few inode sizes are supported. */ switch (sbp->sb_inodesize) { case 256: case 512: case 1024: case 2048: break; default: xfs_warn(mp, "inode size of %d bytes not supported", sbp->sb_inodesize); return -ENOSYS; } return 0; } void xfs_sb_quota_from_disk(struct xfs_sb *sbp) { if (xfs_sb_is_v5(sbp) && (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) { sbp->sb_uquotino = NULLFSINO; sbp->sb_gquotino = NULLFSINO; sbp->sb_pquotino = NULLFSINO; return; } /* * Older mkfs doesn't initialize quota inodes to NULLFSINO. This * leads to two different in-core values marking a quota inode as * invalid: 0 and NULLFSINO. Change it to the single value * NULLFSINO. * * Note that this change affects only the in-core values. These * values are not written back to disk unless any quota information * is written to the disk. Even in that case, the sb_pquotino field is * not written to disk unless the superblock supports pquotino. */ if (sbp->sb_uquotino == 0) sbp->sb_uquotino = NULLFSINO; if (sbp->sb_gquotino == 0) sbp->sb_gquotino = NULLFSINO; if (sbp->sb_pquotino == 0) sbp->sb_pquotino = NULLFSINO; /* * We need to do these manipulations only if we are working * with an older version of the on-disk superblock. */ if (xfs_sb_is_v5(sbp)) return; if (sbp->sb_qflags & XFS_OQUOTA_ENFD) sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ? XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD; if (sbp->sb_qflags & XFS_OQUOTA_CHKD) sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ? XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD; sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD); if (sbp->sb_qflags & XFS_PQUOTA_ACCT && sbp->sb_gquotino != NULLFSINO) { /* * In older versions of the superblock, the on-disk superblock only * has sb_gquotino, while the in-core superblock has both sb_gquotino * and sb_pquotino. Only one of them is supported at any * point in time, so if PQUOTA is set in the disk superblock, * copy sb_gquotino over to sb_pquotino. The NULLFSINO test * above is to make sure we don't do this twice and wipe them * both out!
*/ sbp->sb_pquotino = sbp->sb_gquotino; sbp->sb_gquotino = NULLFSINO; } } static void __xfs_sb_from_disk( struct xfs_sb *to, struct xfs_dsb *from, bool convert_xquota) { to->sb_magicnum = be32_to_cpu(from->sb_magicnum); to->sb_blocksize = be32_to_cpu(from->sb_blocksize); to->sb_dblocks = be64_to_cpu(from->sb_dblocks); to->sb_rblocks = be64_to_cpu(from->sb_rblocks); to->sb_rextents = be64_to_cpu(from->sb_rextents); memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid)); to->sb_logstart = be64_to_cpu(from->sb_logstart); to->sb_rootino = be64_to_cpu(from->sb_rootino); to->sb_rbmino = be64_to_cpu(from->sb_rbmino); to->sb_rsumino = be64_to_cpu(from->sb_rsumino); to->sb_rextsize = be32_to_cpu(from->sb_rextsize); to->sb_agblocks = be32_to_cpu(from->sb_agblocks); to->sb_agcount = be32_to_cpu(from->sb_agcount); to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks); to->sb_logblocks = be32_to_cpu(from->sb_logblocks); to->sb_versionnum = be16_to_cpu(from->sb_versionnum); to->sb_sectsize = be16_to_cpu(from->sb_sectsize); to->sb_inodesize = be16_to_cpu(from->sb_inodesize); to->sb_inopblock = be16_to_cpu(from->sb_inopblock); memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname)); to->sb_blocklog = from->sb_blocklog; to->sb_sectlog = from->sb_sectlog; to->sb_inodelog = from->sb_inodelog; to->sb_inopblog = from->sb_inopblog; to->sb_agblklog = from->sb_agblklog; to->sb_rextslog = from->sb_rextslog; to->sb_inprogress = from->sb_inprogress; to->sb_imax_pct = from->sb_imax_pct; to->sb_icount = be64_to_cpu(from->sb_icount); to->sb_ifree = be64_to_cpu(from->sb_ifree); to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks); to->sb_frextents = be64_to_cpu(from->sb_frextents); to->sb_uquotino = be64_to_cpu(from->sb_uquotino); to->sb_gquotino = be64_to_cpu(from->sb_gquotino); to->sb_qflags = be16_to_cpu(from->sb_qflags); to->sb_flags = from->sb_flags; to->sb_shared_vn = from->sb_shared_vn; to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt); to->sb_unit = be32_to_cpu(from->sb_unit); to->sb_width = be32_to_cpu(from->sb_width); to->sb_dirblklog = from->sb_dirblklog; to->sb_logsectlog = from->sb_logsectlog; to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize); to->sb_logsunit = be32_to_cpu(from->sb_logsunit); to->sb_features2 = be32_to_cpu(from->sb_features2); to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2); to->sb_features_compat = be32_to_cpu(from->sb_features_compat); to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat); to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat); to->sb_features_log_incompat = be32_to_cpu(from->sb_features_log_incompat); /* crc is only used on disk, not in memory; just init to 0 here. */ to->sb_crc = 0; to->sb_spino_align = be32_to_cpu(from->sb_spino_align); to->sb_pquotino = be64_to_cpu(from->sb_pquotino); to->sb_lsn = be64_to_cpu(from->sb_lsn); /* * sb_meta_uuid is only on disk if it differs from sb_uuid and the * feature flag is set; if not set we keep it only in memory. */ if (xfs_sb_is_v5(to) && (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID)) uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid); else uuid_copy(&to->sb_meta_uuid, &from->sb_uuid); /* Convert on-disk flags to in-memory flags? 
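 * The read and write verifiers pass convert_xquota == false so that
 * xfs_validate_sb_common() can check the raw on-disk quota flags.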
*/ if (convert_xquota) xfs_sb_quota_from_disk(to); if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) { to->sb_metadirino = be64_to_cpu(from->sb_metadirino); to->sb_rgblklog = from->sb_rgblklog; memcpy(to->sb_pad, from->sb_pad, sizeof(to->sb_pad)); to->sb_rgcount = be32_to_cpu(from->sb_rgcount); to->sb_rgextents = be32_to_cpu(from->sb_rgextents); to->sb_rbmino = NULLFSINO; to->sb_rsumino = NULLFSINO; } else { to->sb_metadirino = NULLFSINO; to->sb_rgcount = 1; to->sb_rgextents = 0; } if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { to->sb_rtstart = be64_to_cpu(from->sb_rtstart); to->sb_rtreserved = be64_to_cpu(from->sb_rtreserved); } else { to->sb_rtstart = 0; to->sb_rtreserved = 0; } } void xfs_sb_from_disk( struct xfs_sb *to, struct xfs_dsb *from) { __xfs_sb_from_disk(to, from, true); } static void xfs_sb_quota_to_disk( struct xfs_dsb *to, struct xfs_sb *from) { uint16_t qflags = from->sb_qflags; if (xfs_sb_is_v5(from) && (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) { to->sb_qflags = cpu_to_be16(from->sb_qflags); to->sb_uquotino = cpu_to_be64(0); to->sb_gquotino = cpu_to_be64(0); to->sb_pquotino = cpu_to_be64(0); return; } to->sb_uquotino = cpu_to_be64(from->sb_uquotino); /* * The in-memory superblock quota state matches the v5 on-disk format so * just write them out and return */ if (xfs_sb_is_v5(from)) { to->sb_qflags = cpu_to_be16(from->sb_qflags); to->sb_gquotino = cpu_to_be64(from->sb_gquotino); to->sb_pquotino = cpu_to_be64(from->sb_pquotino); return; } /* * For older superblocks (v4), the in-core version of sb_qflags do not * have XFS_OQUOTA_* flags, whereas the on-disk version does. So, * convert incore XFS_{PG}QUOTA_* flags to on-disk XFS_OQUOTA_* flags. */ qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD | XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD); if (from->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD)) qflags |= XFS_OQUOTA_ENFD; if (from->sb_qflags & (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) qflags |= XFS_OQUOTA_CHKD; to->sb_qflags = cpu_to_be16(qflags); /* * GQUOTINO and PQUOTINO cannot be used together in versions * of superblock that do not have pquotino. from->sb_flags * tells us which quota is active and should be copied to * disk. If neither are active, we should NULL the inode. * * In all cases, the separate pquotino must remain 0 because it * is beyond the "end" of the valid non-pquotino superblock. */ if (from->sb_qflags & XFS_GQUOTA_ACCT) to->sb_gquotino = cpu_to_be64(from->sb_gquotino); else if (from->sb_qflags & XFS_PQUOTA_ACCT) to->sb_gquotino = cpu_to_be64(from->sb_pquotino); else { /* * We can't rely on just the fields being logged to tell us * that it is safe to write NULLFSINO - we should only do that * if quotas are not actually enabled. Hence only write * NULLFSINO if both in-core quota inodes are NULL. 
*/ if (from->sb_gquotino == NULLFSINO && from->sb_pquotino == NULLFSINO) to->sb_gquotino = cpu_to_be64(NULLFSINO); } to->sb_pquotino = 0; } void xfs_sb_to_disk( struct xfs_dsb *to, struct xfs_sb *from) { xfs_sb_quota_to_disk(to, from); to->sb_magicnum = cpu_to_be32(from->sb_magicnum); to->sb_blocksize = cpu_to_be32(from->sb_blocksize); to->sb_dblocks = cpu_to_be64(from->sb_dblocks); to->sb_rblocks = cpu_to_be64(from->sb_rblocks); to->sb_rextents = cpu_to_be64(from->sb_rextents); memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid)); to->sb_logstart = cpu_to_be64(from->sb_logstart); to->sb_rootino = cpu_to_be64(from->sb_rootino); to->sb_rbmino = cpu_to_be64(from->sb_rbmino); to->sb_rsumino = cpu_to_be64(from->sb_rsumino); to->sb_rextsize = cpu_to_be32(from->sb_rextsize); to->sb_agblocks = cpu_to_be32(from->sb_agblocks); to->sb_agcount = cpu_to_be32(from->sb_agcount); to->sb_rbmblocks = cpu_to_be32(from->sb_rbmblocks); to->sb_logblocks = cpu_to_be32(from->sb_logblocks); to->sb_versionnum = cpu_to_be16(from->sb_versionnum); to->sb_sectsize = cpu_to_be16(from->sb_sectsize); to->sb_inodesize = cpu_to_be16(from->sb_inodesize); to->sb_inopblock = cpu_to_be16(from->sb_inopblock); memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname)); to->sb_blocklog = from->sb_blocklog; to->sb_sectlog = from->sb_sectlog; to->sb_inodelog = from->sb_inodelog; to->sb_inopblog = from->sb_inopblog; to->sb_agblklog = from->sb_agblklog; to->sb_rextslog = from->sb_rextslog; to->sb_inprogress = from->sb_inprogress; to->sb_imax_pct = from->sb_imax_pct; to->sb_icount = cpu_to_be64(from->sb_icount); to->sb_ifree = cpu_to_be64(from->sb_ifree); to->sb_fdblocks = cpu_to_be64(from->sb_fdblocks); to->sb_frextents = cpu_to_be64(from->sb_frextents); to->sb_flags = from->sb_flags; to->sb_shared_vn = from->sb_shared_vn; to->sb_inoalignmt = cpu_to_be32(from->sb_inoalignmt); to->sb_unit = cpu_to_be32(from->sb_unit); to->sb_width = cpu_to_be32(from->sb_width); to->sb_dirblklog = from->sb_dirblklog; to->sb_logsectlog = from->sb_logsectlog; to->sb_logsectsize = cpu_to_be16(from->sb_logsectsize); to->sb_logsunit = cpu_to_be32(from->sb_logsunit); /* * We need to ensure that bad_features2 always matches features2. * Hence we enforce that here rather than having to remember to do it * everywhere else that updates features2. 
*/ from->sb_bad_features2 = from->sb_features2; to->sb_features2 = cpu_to_be32(from->sb_features2); to->sb_bad_features2 = cpu_to_be32(from->sb_bad_features2); if (!xfs_sb_is_v5(from)) return; to->sb_features_compat = cpu_to_be32(from->sb_features_compat); to->sb_features_ro_compat = cpu_to_be32(from->sb_features_ro_compat); to->sb_features_incompat = cpu_to_be32(from->sb_features_incompat); to->sb_features_log_incompat = cpu_to_be32(from->sb_features_log_incompat); to->sb_spino_align = cpu_to_be32(from->sb_spino_align); to->sb_lsn = cpu_to_be64(from->sb_lsn); if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID) uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid); if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) { to->sb_metadirino = cpu_to_be64(from->sb_metadirino); to->sb_rgblklog = from->sb_rgblklog; memset(to->sb_pad, 0, sizeof(to->sb_pad)); to->sb_rgcount = cpu_to_be32(from->sb_rgcount); to->sb_rgextents = cpu_to_be32(from->sb_rgextents); to->sb_rbmino = cpu_to_be64(0); to->sb_rsumino = cpu_to_be64(0); } if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { to->sb_rtstart = cpu_to_be64(from->sb_rtstart); to->sb_rtreserved = cpu_to_be64(from->sb_rtreserved); } } /* * If the superblock has the CRC feature bit set or the CRC field is non-null, * check that the CRC is valid. We check the CRC field is non-null because a * single bit error could clear the feature bit and unused parts of the * superblock are supposed to be zero. Hence a non-null crc field indicates that * we've potentially lost a feature bit and we should check it anyway. * * However, past bugs (i.e. in growfs) left non-zeroed regions beyond the * last field in V4 secondary superblocks. So for secondary superblocks, * we are more forgiving, and ignore CRC failures if the primary doesn't * indicate that the fs version is V5. */ static void xfs_sb_read_verify( struct xfs_buf *bp) { struct xfs_sb sb; struct xfs_mount *mp = bp->b_mount; struct xfs_dsb *dsb = bp->b_addr; int error; /* * open code the version check to avoid needing to convert the entire * superblock from disk order just to check the version number */ if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) && (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) == XFS_SB_VERSION_5) || dsb->sb_crc != 0)) { if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) { /* Only fail bad secondaries on a known V5 filesystem */ if (xfs_buf_daddr(bp) == XFS_SB_DADDR || xfs_has_crc(mp)) { error = -EFSBADCRC; goto out_error; } } } /* * Check all the superblock fields. Don't byteswap the xquota flags * because _verify_common checks the on-disk values. */ __xfs_sb_from_disk(&sb, dsb, false); error = xfs_validate_sb_common(mp, bp, &sb); if (error) goto out_error; error = xfs_validate_sb_read(mp, &sb); out_error: if (error == -EFSCORRUPTED || error == -EFSBADCRC) xfs_verifier_error(bp, error, __this_address); else if (error) xfs_buf_ioerror(bp, error); } /* * We may be probed for a filesystem match, so we may not want to emit * messages when the superblock buffer is not actually an XFS superblock. * If we find an XFS superblock, then run a normal, noisy mount because we are * really going to mount it and want to know about errors. */ static void xfs_sb_quiet_read_verify( struct xfs_buf *bp) { struct xfs_dsb *dsb = bp->b_addr; if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) { /* XFS filesystem, verify noisily! 
*/ xfs_sb_read_verify(bp); return; } /* quietly fail */ xfs_buf_ioerror(bp, -EWRONGFS); } static void xfs_sb_write_verify( struct xfs_buf *bp) { struct xfs_sb sb; struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_dsb *dsb = bp->b_addr; int error; /* * Check all the superblock fields. Don't byteswap the xquota flags * because _verify_common checks the on-disk values. */ __xfs_sb_from_disk(&sb, dsb, false); error = xfs_validate_sb_common(mp, bp, &sb); if (error) goto out_error; error = xfs_validate_sb_write(mp, bp, &sb); if (error) goto out_error; if (!xfs_sb_is_v5(&sb)) return; if (bip) dsb->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn); xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF); return; out_error: xfs_verifier_error(bp, error, __this_address); } const struct xfs_buf_ops xfs_sb_buf_ops = { .name = "xfs_sb", .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) }, .verify_read = xfs_sb_read_verify, .verify_write = xfs_sb_write_verify, }; const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { .name = "xfs_sb_quiet", .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) }, .verify_read = xfs_sb_quiet_read_verify, .verify_write = xfs_sb_write_verify, }; /* Compute cached rt geometry from the incore sb. */ void xfs_sb_mount_rextsize( struct xfs_mount *mp, struct xfs_sb *sbp) { struct xfs_groups *rgs = &mp->m_groups[XG_TYPE_RTG]; mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize); mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize); if (xfs_sb_is_v5(sbp) && (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) { rgs->blocks = sbp->sb_rgextents * sbp->sb_rextsize; rgs->blklog = mp->m_sb.sb_rgblklog; rgs->blkmask = xfs_mask32lo(mp->m_sb.sb_rgblklog); rgs->start_fsb = mp->m_sb.sb_rtstart; if (xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_ZONE_GAPS)) rgs->has_daddr_gaps = true; } else { rgs->blocks = 0; rgs->blklog = 0; rgs->blkmask = (uint64_t)-1; } } /* Update incore sb rt extent size, then recompute the cached rt geometry. */ void xfs_mount_sb_set_rextsize( struct xfs_mount *mp, struct xfs_sb *sbp, xfs_agblock_t rextsize) { sbp->sb_rextsize = rextsize; if (xfs_sb_is_v5(sbp) && (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) sbp->sb_rgblklog = xfs_compute_rgblklog(sbp->sb_rgextents, rextsize); xfs_sb_mount_rextsize(mp, sbp); } /* * xfs_mount_common * * Mount initialization code establishing various mount * fields from the superblock associated with the given * mount structure. * * Inode geometry are calculated in xfs_ialloc_setup_geometry. 
*/ void xfs_sb_mount_common( struct xfs_mount *mp, struct xfs_sb *sbp) { struct xfs_groups *ags = &mp->m_groups[XG_TYPE_AG]; mp->m_agfrotor = 0; atomic_set(&mp->m_agirotor, 0); mp->m_maxagi = mp->m_sb.sb_agcount; mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG; mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT; mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; mp->m_blockmask = sbp->sb_blocksize - 1; mp->m_blockwsize = xfs_rtbmblock_size(sbp) >> XFS_WORDLOG; mp->m_rtx_per_rbmblock = mp->m_blockwsize << XFS_NBWORDLOG; ags->blocks = mp->m_sb.sb_agblocks; ags->blklog = mp->m_sb.sb_agblklog; ags->blkmask = xfs_mask32lo(mp->m_sb.sb_agblklog); xfs_sb_mount_rextsize(mp, sbp); mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, true); mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, false); mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2; mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2; mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, true); mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, false); mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2; mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2; mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, true); mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, false); mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2; mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2; mp->m_rtrmap_mxr[0] = xfs_rtrmapbt_maxrecs(mp, sbp->sb_blocksize, true); mp->m_rtrmap_mxr[1] = xfs_rtrmapbt_maxrecs(mp, sbp->sb_blocksize, false); mp->m_rtrmap_mnr[0] = mp->m_rtrmap_mxr[0] / 2; mp->m_rtrmap_mnr[1] = mp->m_rtrmap_mxr[1] / 2; mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, true); mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, false); mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2; mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2; mp->m_rtrefc_mxr[0] = xfs_rtrefcountbt_maxrecs(mp, sbp->sb_blocksize, true); mp->m_rtrefc_mxr[1] = xfs_rtrefcountbt_maxrecs(mp, sbp->sb_blocksize, false); mp->m_rtrefc_mnr[0] = mp->m_rtrefc_mxr[0] / 2; mp->m_rtrefc_mnr[1] = mp->m_rtrefc_mxr[1] / 2; mp->m_bsize = XFS_FSB_TO_BB(mp, 1); mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); mp->m_ag_max_usable = xfs_alloc_ag_max_usable(mp); } /* * xfs_log_sb() can be used to copy arbitrary changes to the in-core superblock * into the superblock buffer to be logged. It does not provide the higher * level of locking that is needed to protect the in-core superblock from * concurrent access. */ void xfs_log_sb( struct xfs_trans *tp) { struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *bp = xfs_trans_getsb(tp); /* * Lazy sb counters don't update the in-core superblock so do that now. * If this is at unmount, the counters will be exactly correct, but at * any other time they will only be ballpark correct because of * reservations that have been taken out percpu counters. If we have an * unclean shutdown, this will be corrected by log recovery rebuilding * the counters from the AGF block counts. */ if (xfs_has_lazysbcount(mp)) { mp->m_sb.sb_icount = percpu_counter_sum_positive(&mp->m_icount); mp->m_sb.sb_ifree = min_t(uint64_t, percpu_counter_sum_positive(&mp->m_ifree), mp->m_sb.sb_icount); mp->m_sb.sb_fdblocks = xfs_sum_freecounter(mp, XC_FREE_BLOCKS); } /* * sb_frextents was added to the lazy sb counters when the rt groups * feature was introduced. 
This counter can go negative due to the way * we handle nearly-lockless reservations, so we must use the _positive * variant here to avoid writing out nonsense frextents. * * RT groups are only supported on v5 file systems, which always * have lazy SB counters. */ if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp)) { mp->m_sb.sb_frextents = xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS); } xfs_sb_to_disk(bp->b_addr, &mp->m_sb); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1); } /* * xfs_sync_sb * * Sync the superblock to disk. * * Note that the caller is responsible for checking the frozen state of the * filesystem. This procedure uses the non-blocking transaction allocator and * thus will allow modifications to a frozen fs. This is required because this * code can be called during the process of freezing where use of the high-level * allocator would deadlock. */ int xfs_sync_sb( struct xfs_mount *mp, bool wait) { struct xfs_trans *tp; int error; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, XFS_TRANS_NO_WRITECOUNT, &tp); if (error) return error; xfs_log_sb(tp); if (wait) xfs_trans_set_sync(tp); return xfs_trans_commit(tp); } /* * Update all the secondary superblocks to match the new state of the primary. * Because we are completely overwriting all the existing fields in the * secondary superblock buffers, there is no need to read them in from disk. * Just get a new buffer, stamp it and write it. * * The sb buffers need to be cached here so that we serialise against other * operations that access the secondary superblocks, but we don't want to keep * them in memory once it is written so we mark it as a one-shot buffer. */ int xfs_update_secondary_sbs( struct xfs_mount *mp) { struct xfs_perag *pag = NULL; int saved_error = 0; int error = 0; LIST_HEAD (buffer_list); /* update secondary superblocks. */ while ((pag = xfs_perag_next_from(mp, pag, 1))) { struct xfs_buf *bp; error = xfs_buf_get(mp->m_ddev_targp, XFS_AG_DADDR(mp, pag_agno(pag), XFS_SB_DADDR), XFS_FSS_TO_BB(mp, 1), &bp); /* * If we get an error reading or writing alternate superblocks, * continue. xfs_repair chooses the "best" superblock based * on most matches; if we break early, we'll leave more * superblocks un-updated than updated, and xfs_repair may * pick them over the properly-updated primary. */ if (error) { xfs_warn(mp, "error allocating secondary superblock for ag %d", pag_agno(pag)); if (!saved_error) saved_error = error; continue; } bp->b_ops = &xfs_sb_buf_ops; xfs_buf_oneshot(bp); xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); xfs_sb_to_disk(bp->b_addr, &mp->m_sb); xfs_buf_delwri_queue(bp, &buffer_list); xfs_buf_relse(bp); /* don't hold too many buffers at once */ if (pag_agno(pag) % 16) continue; error = xfs_buf_delwri_submit(&buffer_list); if (error) { xfs_warn(mp, "write error %d updating a secondary superblock near ag %d", error, pag_agno(pag)); if (!saved_error) saved_error = error; continue; } } error = xfs_buf_delwri_submit(&buffer_list); if (error) xfs_warn(mp, "error %d writing secondary superblocks", error); return saved_error ? saved_error : error; } /* * Same behavior as xfs_sync_sb, except that it is always synchronous and it * also writes the superblock buffer to disk sector 0 immediately. 
*/ int xfs_sync_sb_buf( struct xfs_mount *mp, bool update_rtsb) { struct xfs_trans *tp; struct xfs_buf *bp; struct xfs_buf *rtsb_bp = NULL; int error; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, 0, &tp); if (error) return error; bp = xfs_trans_getsb(tp); xfs_log_sb(tp); xfs_trans_bhold(tp, bp); if (update_rtsb) { rtsb_bp = xfs_log_rtsb(tp, bp); if (rtsb_bp) xfs_trans_bhold(tp, rtsb_bp); } xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); if (error) goto out; /* * write out the sb buffer to get the changes to disk */ error = xfs_bwrite(bp); if (!error && rtsb_bp) error = xfs_bwrite(rtsb_bp); out: if (rtsb_bp) xfs_buf_relse(rtsb_bp); xfs_buf_relse(bp); return error; } void xfs_fs_geometry( struct xfs_mount *mp, struct xfs_fsop_geom *geo, int struct_version) { struct xfs_sb *sbp = &mp->m_sb; memset(geo, 0, sizeof(struct xfs_fsop_geom)); geo->blocksize = sbp->sb_blocksize; geo->rtextsize = sbp->sb_rextsize; geo->agblocks = sbp->sb_agblocks; geo->agcount = sbp->sb_agcount; geo->logblocks = sbp->sb_logblocks; geo->sectsize = sbp->sb_sectsize; geo->inodesize = sbp->sb_inodesize; geo->imaxpct = sbp->sb_imax_pct; geo->datablocks = sbp->sb_dblocks; geo->rtblocks = sbp->sb_rblocks; geo->rtextents = sbp->sb_rextents; geo->logstart = sbp->sb_logstart; BUILD_BUG_ON(sizeof(geo->uuid) != sizeof(sbp->sb_uuid)); memcpy(geo->uuid, &sbp->sb_uuid, sizeof(sbp->sb_uuid)); if (struct_version < 2) return; geo->sunit = sbp->sb_unit; geo->swidth = sbp->sb_width; if (struct_version < 3) return; geo->version = XFS_FSOP_GEOM_VERSION; geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK | XFS_FSOP_GEOM_FLAGS_DIRV2 | XFS_FSOP_GEOM_FLAGS_EXTFLG | XFS_FSOP_GEOM_FLAGS_ATTR2; if (xfs_has_attr(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR; if (xfs_has_quota(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_QUOTA; if (xfs_has_align(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_IALIGN; if (xfs_has_dalign(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_DALIGN; if (xfs_has_asciici(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_DIRV2CI; if (xfs_has_lazysbcount(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_LAZYSB; if (xfs_has_projid32(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_PROJID32; if (xfs_has_crc(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_V5SB; if (xfs_has_ftype(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_FTYPE; if (xfs_has_finobt(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_FINOBT; if (xfs_has_sparseinodes(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_SPINODES; if (xfs_has_rmapbt(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_RMAPBT; if (xfs_has_reflink(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_REFLINK; if (xfs_has_bigtime(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_BIGTIME; if (xfs_has_inobtcounts(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_INOBTCNT; if (xfs_has_parent(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_PARENT; if (xfs_has_sector(mp)) { geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR; geo->logsectsize = sbp->sb_logsectsize; } else { geo->logsectsize = BBSIZE; } if (xfs_has_large_extent_counts(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64; if (xfs_has_exchange_range(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE; if (xfs_has_metadir(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_METADIR; if (xfs_has_zoned(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_ZONED; geo->rtsectsize = sbp->sb_blocksize; geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp); if (struct_version < 4) return; if (xfs_has_logv2(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_LOGV2; geo->logsunit = sbp->sb_logsunit; if (struct_version < 5) return; geo->version = XFS_FSOP_GEOM_VERSION_V5; if (xfs_has_rtgroups(mp)) { geo->rgcount = sbp->sb_rgcount; geo->rgextents = 
sbp->sb_rgextents; } if (xfs_has_zoned(mp)) { geo->rtstart = sbp->sb_rtstart; geo->rtreserved = sbp->sb_rtreserved; } } /* Read a secondary superblock. */ int xfs_sb_read_secondary( struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp) { struct xfs_buf *bp; int error; ASSERT(agno != 0 && agno != NULLAGNUMBER); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops); if (xfs_metadata_is_sick(error)) xfs_agno_mark_sick(mp, agno, XFS_SICK_AG_SB); if (error) return error; xfs_buf_set_ref(bp, XFS_SSB_REF); *bpp = bp; return 0; } /* Get an uninitialised secondary superblock buffer. */ int xfs_sb_get_secondary( struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp) { struct xfs_buf *bp; int error; ASSERT(agno != 0 && agno != NULLAGNUMBER); error = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp); if (error) return error; bp->b_ops = &xfs_sb_buf_ops; xfs_buf_oneshot(bp); *bpp = bp; return 0; } /* * sunit, swidth, sectorsize(optional with 0) should be all in bytes, so users * won't be confused by values in error messages. This function returns false * if the stripe geometry is invalid and the caller is unable to repair the * stripe configuration later in the mount process. */ bool xfs_validate_stripe_geometry( struct xfs_mount *mp, __s64 sunit, __s64 swidth, int sectorsize, bool may_repair, bool silent) { if (swidth > INT_MAX) { if (!silent) xfs_notice(mp, "stripe width (%lld) is too large", swidth); goto check_override; } if (sunit > swidth) { if (!silent) xfs_notice(mp, "stripe unit (%lld) is larger than the stripe width (%lld)", sunit, swidth); goto check_override; } if (sectorsize && (int)sunit % sectorsize) { if (!silent) xfs_notice(mp, "stripe unit (%lld) must be a multiple of the sector size (%d)", sunit, sectorsize); goto check_override; } if (sunit && !swidth) { if (!silent) xfs_notice(mp, "invalid stripe unit (%lld) and stripe width of 0", sunit); goto check_override; } if (!sunit && swidth) { if (!silent) xfs_notice(mp, "invalid stripe width (%lld) and stripe unit of 0", swidth); goto check_override; } if (sunit && (int)swidth % (int)sunit) { if (!silent) xfs_notice(mp, "stripe width (%lld) must be a multiple of the stripe unit (%lld)", swidth, sunit); goto check_override; } return true; check_override: if (!may_repair) return false; /* * During mount, mp->m_dalign will not be set unless the sunit mount * option was set. If it was set, ignore the bad stripe alignment values * and allow the validation and overwrite later in the mount process to * attempt to overwrite the bad stripe alignment values with the values * supplied by mount options. */ if (!mp->m_dalign) return false; if (!silent) xfs_notice(mp, "Will try to correct with specified mount options sunit (%d) and swidth (%d)", BBTOB(mp->m_dalign), BBTOB(mp->m_swidth)); return true; } /* * Compute the maximum level number of the realtime summary file, as defined by * mkfs. The historic use of highbit32 on a 64-bit quantity prohibited correct * use of rt volumes with more than 2^32 extents. */ uint8_t xfs_compute_rextslog( xfs_rtbxlen_t rtextents) { if (!rtextents) return 0; return xfs_highbit64(rtextents); } |
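The rextslog computation above is just "index of the highest set bit" on a 64-bit extent count, which is why the comment calls out the historic highbit32 limitation. As a minimal, hedged illustration (a standalone userspace sketch, not kernel code; highbit64() here is a hypothetical stand-in for xfs_highbit64()):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for xfs_highbit64(): index of the highest set bit. */
static int highbit64(uint64_t v)
{
	return 63 - __builtin_clzll(v);
}

static uint8_t compute_rextslog(uint64_t rtextents)
{
	if (!rtextents)
		return 0;
	return highbit64(rtextents);
}

int main(void)
{
	/*
	 * An rt volume with 2^36 extents needs the full 64-bit highbit;
	 * a 32-bit highbit would only ever see the low word.
	 */
	printf("%u\n", (unsigned int)compute_rextslog(1ULL << 36)); /* prints 36 */
	return 0;
}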
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _SOCK_REUSEPORT_H
#define _SOCK_REUSEPORT_H

#include <linux/filter.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <net/sock.h>

extern spinlock_t reuseport_lock;

struct sock_reuseport {
	struct rcu_head		rcu;

	u16			max_socks;		/* length of socks */
	u16			num_socks;		/* elements in socks */
	u16			num_closed_socks;	/* closed elements in socks */
	u16			incoming_cpu;
	/* The last synq overflow event timestamp of this
	 * reuse->socks[] group.
	 */
	unsigned int		synq_overflow_ts;
	/* ID stays the same even after the size of socks[] grows. */
	unsigned int		reuseport_id;
	unsigned int		bind_inany:1;
	unsigned int		has_conns:1;
	struct bpf_prog __rcu	*prog;		/* optional BPF sock selector */
	struct sock		*socks[] __counted_by(max_socks);
};

extern int reuseport_alloc(struct sock *sk, bool bind_inany);
extern int reuseport_add_sock(struct sock *sk, struct sock *sk2,
			      bool bind_inany);
extern void reuseport_detach_sock(struct sock *sk);
void reuseport_stop_listen_sock(struct sock *sk);
extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash,
					  struct sk_buff *skb, int hdr_len);
struct sock *reuseport_migrate_sock(struct sock *sk,
				    struct sock *migrating_sk,
				    struct sk_buff *skb);
extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog);
extern int reuseport_detach_prog(struct sock *sk);

static inline bool reuseport_has_conns(struct sock *sk)
{
	struct sock_reuseport *reuse;
	bool ret = false;

	rcu_read_lock();
	reuse = rcu_dereference(sk->sk_reuseport_cb);
	if (reuse && reuse->has_conns)
		ret = true;
	rcu_read_unlock();

	return ret;
}

void reuseport_has_conns_set(struct sock *sk);
void reuseport_update_incoming_cpu(struct sock *sk, int val);

#endif  /* _SOCK_REUSEPORT_H */
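Because socks[] is a flexible array member sized by max_socks (note the __counted_by annotation), the whole reuseport group is carved out of a single allocation. A minimal userspace analogue of that sizing pattern, assuming plain calloc() rather than the kernel's allocators and treating struct sock as opaque:

#include <stdint.h>
#include <stdlib.h>

struct sock;				/* opaque for the purposes of this sketch */

struct reuseport_like {
	uint16_t	max_socks;	/* capacity of socks[] */
	uint16_t	num_socks;	/* entries currently in use */
	struct sock	*socks[];	/* flexible array member */
};

static struct reuseport_like *reuseport_like_alloc(uint16_t max_socks)
{
	struct reuseport_like *r;

	/* One allocation covers the header plus max_socks pointers. */
	r = calloc(1, sizeof(*r) + max_socks * sizeof(r->socks[0]));
	if (r)
		r->max_socks = max_socks;
	return r;
}

Growing such a group means allocating a new block with a larger max_socks and copying the live pointers across, which matches the header comment that reuseport_id stays the same even after the size of socks[] grows.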
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MM_H #define _LINUX_MM_H #include <linux/args.h> #include <linux/errno.h> #include <linux/mmdebug.h> #include <linux/gfp.h> #include <linux/pgalloc_tag.h> #include <linux/bug.h> #include <linux/list.h> #include <linux/mmzone.h> #include <linux/rbtree.h> #include <linux/atomic.h> #include <linux/debug_locks.h> #include <linux/compiler.h> #include <linux/mm_types.h> #include <linux/mmap_lock.h> #include <linux/range.h> #include <linux/pfn.h> #include <linux/percpu-refcount.h> #include <linux/bit_spinlock.h> #include <linux/shrinker.h> #include <linux/resource.h> #include <linux/page_ext.h> #include <linux/err.h> #include <linux/page-flags.h> #include <linux/page_ref.h> #include <linux/overflow.h> #include <linux/sched.h> #include <linux/pgtable.h> #include <linux/kasan.h> #include <linux/memremap.h> #include <linux/slab.h> #include <linux/cacheinfo.h> #include <linux/rcuwait.h> #include <linux/bitmap.h> #include <linux/bitops.h> #include <linux/iommu-debug-pagealloc.h> struct mempolicy; struct anon_vma; struct anon_vma_chain; struct user_struct; struct pt_regs; struct folio_batch; void arch_mm_preinit(void); void mm_core_init_early(void); void mm_core_init(void); void init_mm_internals(void); extern atomic_long_t _totalram_pages; static inline unsigned long totalram_pages(void) { return (unsigned long)atomic_long_read(&_totalram_pages); } static inline void totalram_pages_inc(void) { atomic_long_inc(&_totalram_pages); } static inline void totalram_pages_dec(void) { atomic_long_dec(&_totalram_pages); } static inline void totalram_pages_add(long count) { atomic_long_add(count, &_totalram_pages); } extern void * high_memory; /* * Convert between pages and MB * 20 is the shift for 1MB (2^20 = 1MB) * PAGE_SHIFT is the shift for page size (e.g., 12 for 4KB pages) * So (20 - PAGE_SHIFT) converts between pages and MB */ #define PAGES_TO_MB(pages) ((pages) >> (20 - PAGE_SHIFT)) #define MB_TO_PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) #ifdef CONFIG_SYSCTL extern int sysctl_legacy_va_layout; #else #define sysctl_legacy_va_layout 0 #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS extern const int mmap_rnd_bits_min; extern int mmap_rnd_bits_max __ro_after_init; extern int mmap_rnd_bits __read_mostly; #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS extern const int mmap_rnd_compat_bits_min; extern const int mmap_rnd_compat_bits_max; extern int mmap_rnd_compat_bits __read_mostly; #endif #ifndef DIRECT_MAP_PHYSMEM_END # ifdef MAX_PHYSMEM_BITS # define DIRECT_MAP_PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1) # else # define DIRECT_MAP_PHYSMEM_END (((phys_addr_t)-1)&~(1ULL<<63)) # endif #endif #define INVALID_PHYS_ADDR (~(phys_addr_t)0) #include <asm/page.h> #include <asm/processor.h> #ifndef __pa_symbol #define __pa_symbol(x) __pa(RELOC_HIDE((unsigned long)(x), 0)) #endif #ifndef page_to_virt #define page_to_virt(x) __va(PFN_PHYS(page_to_pfn(x))) #endif #ifndef lm_alias #define lm_alias(x) __va(__pa_symbol(x)) #endif /* * To prevent common memory management code establishing * a zero page mapping on a read fault. * This macro should be defined within <asm/pgtable.h>.
* s390 does this to prevent multiplexing of hardware bits * related to the physical page in case of virtualization. */ #ifndef mm_forbids_zeropage #define mm_forbids_zeropage(X) (0) #endif /* * On some architectures it is expensive to call memset() for small sizes. * If an architecture decides to implement their own version of * mm_zero_struct_page they should wrap the defines below in a #ifndef and * define their own version of this macro in <asm/pgtable.h> */ #if BITS_PER_LONG == 64 /* This function must be updated when the size of struct page grows above 96 * or reduces below 56. The idea that compiler optimizes out switch() * statement, and only leaves move/store instructions. Also the compiler can * combine write statements if they are both assignments and can be reordered, * this can result in several of the writes here being dropped. */ #define mm_zero_struct_page(pp) __mm_zero_struct_page(pp) static inline void __mm_zero_struct_page(struct page *page) { unsigned long *_pp = (void *)page; /* Check that struct page is either 56, 64, 72, 80, 88 or 96 bytes */ BUILD_BUG_ON(sizeof(struct page) & 7); BUILD_BUG_ON(sizeof(struct page) < 56); BUILD_BUG_ON(sizeof(struct page) > 96); switch (sizeof(struct page)) { case 96: _pp[11] = 0; fallthrough; case 88: _pp[10] = 0; fallthrough; case 80: _pp[9] = 0; fallthrough; case 72: _pp[8] = 0; fallthrough; case 64: _pp[7] = 0; fallthrough; case 56: _pp[6] = 0; _pp[5] = 0; _pp[4] = 0; _pp[3] = 0; _pp[2] = 0; _pp[1] = 0; _pp[0] = 0; } } #else #define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page))) #endif /* * Default maximum number of active map areas, this limits the number of vmas * per mm struct. Users can overwrite this number by sysctl but there is a * problem. * * When a program's coredump is generated as ELF format, a section is created * per a vma. In ELF, the number of sections is represented in unsigned short. * This means the number of sections should be smaller than 65535 at coredump. * Because the kernel adds some informative sections to a image of program at * generating coredump, we need some margin. The number of extra sections is * 1-3 now and depends on arch. We use "5" as safe margin, here. * * ELF extended numbering allows more than 65535 sections, so 16-bit bound is * not a hard limit any more. Although some userspace tools can be surprised by * that. */ #define MAPCOUNT_ELF_CORE_MARGIN (5) #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) bool page_range_contiguous(const struct page *page, unsigned long nr_pages); #else static inline bool page_range_contiguous(const struct page *page, unsigned long nr_pages) { return true; } #endif /* to align the pointer to the (next) page boundary */ #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE) /* to align the pointer to the (prev) page boundary */ #define PAGE_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PAGE_SIZE) /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) /** * folio_page_idx - Return the number of a page in a folio. * @folio: The folio. * @page: The folio page. * * This function expects that the page is actually part of the folio. * The returned number is relative to the start of the folio. 
*/ static inline unsigned long folio_page_idx(const struct folio *folio, const struct page *page) { return page - &folio->page; } static inline struct folio *lru_to_folio(struct list_head *head) { return list_entry((head)->prev, struct folio, lru); } void setup_initial_init_mm(void *start_code, void *end_code, void *end_data, void *brk); /* * Linux kernel virtual memory manager primitives. * The idea being to have a "virtual" mm in the same way * we have a virtual fs - giving a cleaner interface to the * mm details, and allowing different kinds of memory mappings * (from shared memory to executable loading to arbitrary * mmap() functions). */ struct vm_area_struct *vm_area_alloc(struct mm_struct *); struct vm_area_struct *vm_area_dup(struct vm_area_struct *); void vm_area_free(struct vm_area_struct *); #ifndef CONFIG_MMU extern struct rb_root nommu_region_tree; extern struct rw_semaphore nommu_region_sem; extern unsigned int kobjsize(const void *objp); #endif /* * vm_flags in vm_area_struct, see mm_types.h. * When changing, update also include/trace/events/mmflags.h */ #define VM_NONE 0x00000000 /** * typedef vma_flag_t - specifies an individual VMA flag by bit number. * * This value is made type safe by sparse to avoid passing invalid flag values * around. */ typedef int __bitwise vma_flag_t; #define DECLARE_VMA_BIT(name, bitnum) \ VMA_ ## name ## _BIT = ((__force vma_flag_t)bitnum) #define DECLARE_VMA_BIT_ALIAS(name, aliased) \ VMA_ ## name ## _BIT = (VMA_ ## aliased ## _BIT) enum { DECLARE_VMA_BIT(READ, 0), DECLARE_VMA_BIT(WRITE, 1), DECLARE_VMA_BIT(EXEC, 2), DECLARE_VMA_BIT(SHARED, 3), /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ DECLARE_VMA_BIT(MAYREAD, 4), /* limits for mprotect() etc. */ DECLARE_VMA_BIT(MAYWRITE, 5), DECLARE_VMA_BIT(MAYEXEC, 6), DECLARE_VMA_BIT(MAYSHARE, 7), DECLARE_VMA_BIT(GROWSDOWN, 8), /* general info on the segment */ #ifdef CONFIG_MMU DECLARE_VMA_BIT(UFFD_MISSING, 9),/* missing pages tracking */ #else /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ DECLARE_VMA_BIT(MAYOVERLAY, 9), #endif /* CONFIG_MMU */ /* Page-ranges managed without "struct page", just pure PFN */ DECLARE_VMA_BIT(PFNMAP, 10), DECLARE_VMA_BIT(MAYBE_GUARD, 11), DECLARE_VMA_BIT(UFFD_WP, 12), /* wrprotect pages tracking */ DECLARE_VMA_BIT(LOCKED, 13), DECLARE_VMA_BIT(IO, 14), /* Memory mapped I/O or similar */ DECLARE_VMA_BIT(SEQ_READ, 15), /* App will access data sequentially */ DECLARE_VMA_BIT(RAND_READ, 16), /* App will not benefit from clustered reads */ DECLARE_VMA_BIT(DONTCOPY, 17), /* Do not copy this vma on fork */ DECLARE_VMA_BIT(DONTEXPAND, 18),/* Cannot expand with mremap() */ DECLARE_VMA_BIT(LOCKONFAULT, 19),/* Lock pages covered when faulted in */ DECLARE_VMA_BIT(ACCOUNT, 20), /* Is a VM accounted object */ DECLARE_VMA_BIT(NORESERVE, 21), /* should the VM suppress accounting */ DECLARE_VMA_BIT(HUGETLB, 22), /* Huge TLB Page VM */ DECLARE_VMA_BIT(SYNC, 23), /* Synchronous page faults */ DECLARE_VMA_BIT(ARCH_1, 24), /* Architecture-specific flag */ DECLARE_VMA_BIT(WIPEONFORK, 25),/* Wipe VMA contents in child. 
*/ DECLARE_VMA_BIT(DONTDUMP, 26), /* Do not include in the core dump */ DECLARE_VMA_BIT(SOFTDIRTY, 27), /* NOT soft dirty clean area */ DECLARE_VMA_BIT(MIXEDMAP, 28), /* Can contain struct page and pure PFN pages */ DECLARE_VMA_BIT(HUGEPAGE, 29), /* MADV_HUGEPAGE marked this vma */ DECLARE_VMA_BIT(NOHUGEPAGE, 30),/* MADV_NOHUGEPAGE marked this vma */ DECLARE_VMA_BIT(MERGEABLE, 31), /* KSM may merge identical pages */ /* These bits are reused, we define specific uses below. */ DECLARE_VMA_BIT(HIGH_ARCH_0, 32), DECLARE_VMA_BIT(HIGH_ARCH_1, 33), DECLARE_VMA_BIT(HIGH_ARCH_2, 34), DECLARE_VMA_BIT(HIGH_ARCH_3, 35), DECLARE_VMA_BIT(HIGH_ARCH_4, 36), DECLARE_VMA_BIT(HIGH_ARCH_5, 37), DECLARE_VMA_BIT(HIGH_ARCH_6, 38), /* * This flag is used to connect VFIO to arch specific KVM code. It * indicates that the memory under this VMA is safe for use with any * non-cachable memory type inside KVM. Some VFIO devices, on some * platforms, are thought to be unsafe and can cause machine crashes * if KVM does not lock down the memory type. */ DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39), #if defined(CONFIG_PPC32) DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1), #elif defined(CONFIG_64BIT) DECLARE_VMA_BIT(DROPPABLE, 40), #endif DECLARE_VMA_BIT(UFFD_MINOR, 41), DECLARE_VMA_BIT(SEALED, 42), /* Flags that reuse flags above. */ DECLARE_VMA_BIT_ALIAS(PKEY_BIT0, HIGH_ARCH_0), DECLARE_VMA_BIT_ALIAS(PKEY_BIT1, HIGH_ARCH_1), DECLARE_VMA_BIT_ALIAS(PKEY_BIT2, HIGH_ARCH_2), DECLARE_VMA_BIT_ALIAS(PKEY_BIT3, HIGH_ARCH_3), DECLARE_VMA_BIT_ALIAS(PKEY_BIT4, HIGH_ARCH_4), #if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_RISCV_USER_CFI) /* * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of * support core mm. * * These VMAs will get a single end guard page. This helps userspace * protect itself from attacks. A single page is enough for current * shadow stack archs (x86). See the comments near alloc_shstk() in * arch/x86/kernel/shstk.c for more details on the guard size. */ DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_5), #elif defined(CONFIG_ARM64_GCS) /* * arm64's Guarded Control Stack implements similar functionality and * has similar constraints to shadow stacks. 
*/ DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_6), #endif DECLARE_VMA_BIT_ALIAS(SAO, ARCH_1), /* Strong Access Ordering (powerpc) */ DECLARE_VMA_BIT_ALIAS(GROWSUP, ARCH_1), /* parisc */ DECLARE_VMA_BIT_ALIAS(SPARC_ADI, ARCH_1), /* sparc64 */ DECLARE_VMA_BIT_ALIAS(ARM64_BTI, ARCH_1), /* arm64 */ DECLARE_VMA_BIT_ALIAS(ARCH_CLEAR, ARCH_1), /* sparc64, arm64 */ DECLARE_VMA_BIT_ALIAS(MAPPED_COPY, ARCH_1), /* !CONFIG_MMU */ DECLARE_VMA_BIT_ALIAS(MTE, HIGH_ARCH_4), /* arm64 */ DECLARE_VMA_BIT_ALIAS(MTE_ALLOWED, HIGH_ARCH_5),/* arm64 */ #ifdef CONFIG_STACK_GROWSUP DECLARE_VMA_BIT_ALIAS(STACK, GROWSUP), DECLARE_VMA_BIT_ALIAS(STACK_EARLY, GROWSDOWN), #else DECLARE_VMA_BIT_ALIAS(STACK, GROWSDOWN), #endif }; #undef DECLARE_VMA_BIT #undef DECLARE_VMA_BIT_ALIAS #define INIT_VM_FLAG(name) BIT((__force int) VMA_ ## name ## _BIT) #define VM_READ INIT_VM_FLAG(READ) #define VM_WRITE INIT_VM_FLAG(WRITE) #define VM_EXEC INIT_VM_FLAG(EXEC) #define VM_SHARED INIT_VM_FLAG(SHARED) #define VM_MAYREAD INIT_VM_FLAG(MAYREAD) #define VM_MAYWRITE INIT_VM_FLAG(MAYWRITE) #define VM_MAYEXEC INIT_VM_FLAG(MAYEXEC) #define VM_MAYSHARE INIT_VM_FLAG(MAYSHARE) #define VM_GROWSDOWN INIT_VM_FLAG(GROWSDOWN) #ifdef CONFIG_MMU #define VM_UFFD_MISSING INIT_VM_FLAG(UFFD_MISSING) #else #define VM_UFFD_MISSING VM_NONE #define VM_MAYOVERLAY INIT_VM_FLAG(MAYOVERLAY) #endif #define VM_PFNMAP INIT_VM_FLAG(PFNMAP) #define VM_MAYBE_GUARD INIT_VM_FLAG(MAYBE_GUARD) #define VM_UFFD_WP INIT_VM_FLAG(UFFD_WP) #define VM_LOCKED INIT_VM_FLAG(LOCKED) #define VM_IO INIT_VM_FLAG(IO) #define VM_SEQ_READ INIT_VM_FLAG(SEQ_READ) #define VM_RAND_READ INIT_VM_FLAG(RAND_READ) #define VM_DONTCOPY INIT_VM_FLAG(DONTCOPY) #define VM_DONTEXPAND INIT_VM_FLAG(DONTEXPAND) #define VM_LOCKONFAULT INIT_VM_FLAG(LOCKONFAULT) #define VM_ACCOUNT INIT_VM_FLAG(ACCOUNT) #define VM_NORESERVE INIT_VM_FLAG(NORESERVE) #define VM_HUGETLB INIT_VM_FLAG(HUGETLB) #define VM_SYNC INIT_VM_FLAG(SYNC) #define VM_ARCH_1 INIT_VM_FLAG(ARCH_1) #define VM_WIPEONFORK INIT_VM_FLAG(WIPEONFORK) #define VM_DONTDUMP INIT_VM_FLAG(DONTDUMP) #ifdef CONFIG_MEM_SOFT_DIRTY #define VM_SOFTDIRTY INIT_VM_FLAG(SOFTDIRTY) #else #define VM_SOFTDIRTY VM_NONE #endif #define VM_MIXEDMAP INIT_VM_FLAG(MIXEDMAP) #define VM_HUGEPAGE INIT_VM_FLAG(HUGEPAGE) #define VM_NOHUGEPAGE INIT_VM_FLAG(NOHUGEPAGE) #define VM_MERGEABLE INIT_VM_FLAG(MERGEABLE) #define VM_STACK INIT_VM_FLAG(STACK) #ifdef CONFIG_STACK_GROWSUP #define VM_STACK_EARLY INIT_VM_FLAG(STACK_EARLY) #else #define VM_STACK_EARLY VM_NONE #endif #ifdef CONFIG_ARCH_HAS_PKEYS #define VM_PKEY_SHIFT ((__force int)VMA_HIGH_ARCH_0_BIT) /* Despite the naming, these are FLAGS not bits. 
*/ #define VM_PKEY_BIT0 INIT_VM_FLAG(PKEY_BIT0) #define VM_PKEY_BIT1 INIT_VM_FLAG(PKEY_BIT1) #define VM_PKEY_BIT2 INIT_VM_FLAG(PKEY_BIT2) #if CONFIG_ARCH_PKEY_BITS > 3 #define VM_PKEY_BIT3 INIT_VM_FLAG(PKEY_BIT3) #else #define VM_PKEY_BIT3 VM_NONE #endif /* CONFIG_ARCH_PKEY_BITS > 3 */ #if CONFIG_ARCH_PKEY_BITS > 4 #define VM_PKEY_BIT4 INIT_VM_FLAG(PKEY_BIT4) #else #define VM_PKEY_BIT4 VM_NONE #endif /* CONFIG_ARCH_PKEY_BITS > 4 */ #endif /* CONFIG_ARCH_HAS_PKEYS */ #if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS) || \ defined(CONFIG_RISCV_USER_CFI) #define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK) #define VMA_STARTGAP_FLAGS mk_vma_flags(VMA_GROWSDOWN_BIT, VMA_SHADOW_STACK_BIT) #else #define VM_SHADOW_STACK VM_NONE #define VMA_STARTGAP_FLAGS mk_vma_flags(VMA_GROWSDOWN_BIT) #endif #if defined(CONFIG_PPC64) #define VM_SAO INIT_VM_FLAG(SAO) #elif defined(CONFIG_PARISC) #define VM_GROWSUP INIT_VM_FLAG(GROWSUP) #elif defined(CONFIG_SPARC64) #define VM_SPARC_ADI INIT_VM_FLAG(SPARC_ADI) #define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) #elif defined(CONFIG_ARM64) #define VM_ARM64_BTI INIT_VM_FLAG(ARM64_BTI) #define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) #elif !defined(CONFIG_MMU) #define VM_MAPPED_COPY INIT_VM_FLAG(MAPPED_COPY) #endif #ifndef VM_GROWSUP #define VM_GROWSUP VM_NONE #endif #ifdef CONFIG_ARM64_MTE #define VM_MTE INIT_VM_FLAG(MTE) #define VM_MTE_ALLOWED INIT_VM_FLAG(MTE_ALLOWED) #else #define VM_MTE VM_NONE #define VM_MTE_ALLOWED VM_NONE #endif #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR #define VM_UFFD_MINOR INIT_VM_FLAG(UFFD_MINOR) #else #define VM_UFFD_MINOR VM_NONE #endif #ifdef CONFIG_64BIT #define VM_ALLOW_ANY_UNCACHED INIT_VM_FLAG(ALLOW_ANY_UNCACHED) #define VM_SEALED INIT_VM_FLAG(SEALED) #else #define VM_ALLOW_ANY_UNCACHED VM_NONE #define VM_SEALED VM_NONE #endif #if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) #define VM_DROPPABLE INIT_VM_FLAG(DROPPABLE) #define VMA_DROPPABLE mk_vma_flags(VMA_DROPPABLE_BIT) #else #define VM_DROPPABLE VM_NONE #define VMA_DROPPABLE EMPTY_VMA_FLAGS #endif /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) #define TASK_EXEC_BIT ((current->personality & READ_IMPLIES_EXEC) ? \ VMA_EXEC_BIT : VMA_READ_BIT) /* Common data flag combinations */ #define VMA_DATA_FLAGS_TSK_EXEC mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \ TASK_EXEC_BIT, VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, \ VMA_MAYEXEC_BIT) #define VMA_DATA_FLAGS_NON_EXEC mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \ VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, VMA_MAYEXEC_BIT) #define VMA_DATA_FLAGS_EXEC mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \ VMA_EXEC_BIT, VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, \ VMA_MAYEXEC_BIT) #ifndef VMA_DATA_DEFAULT_FLAGS /* arch can override this */ #define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_EXEC #endif #ifndef VMA_STACK_DEFAULT_FLAGS /* arch can override this */ #define VMA_STACK_DEFAULT_FLAGS VMA_DATA_DEFAULT_FLAGS #endif #define VMA_STACK_FLAGS append_vma_flags(VMA_STACK_DEFAULT_FLAGS, \ VMA_STACK_BIT, VMA_ACCOUNT_BIT) /* Temporary until VMA flags conversion complete. */ #define VM_STACK_FLAGS vma_flags_to_legacy(VMA_STACK_FLAGS) #ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS #define VM_SEALED_SYSMAP VM_SEALED #else #define VM_SEALED_SYSMAP VM_NONE #endif /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) #define VMA_ACCESS_FLAGS mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT) /* * Special vmas that are non-mergable, non-mlock()able. 
*/ #define VMA_SPECIAL_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_DONTEXPAND_BIT, \ VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT) #define VM_SPECIAL vma_flags_to_legacy(VMA_SPECIAL_FLAGS) /* * Physically remapped pages are special. Tell the * rest of the world about it: * IO tells people not to look at these pages * (accesses can have side effects). * PFNMAP tells the core MM that the base pages are just * raw PFN mappings, and do not have a "struct page" associated * with them. * DONTEXPAND * Disable vma merging and expanding with mremap(). * DONTDUMP * Omit vma from core dump, even when VM_IO is turned off. */ #define VMA_REMAP_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_PFNMAP_BIT, \ VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT) /* This mask prevents a VMA from being scanned by khugepaged */ #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) /* This mask defines which mm->def_flags a process can inherit from its parent */ #define VM_INIT_DEF_MASK VM_NOHUGEPAGE /* This mask represents all the VMA flag bits used by mlock */ #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) #define VMA_LOCKED_MASK mk_vma_flags(VMA_LOCKED_BIT, VMA_LOCKONFAULT_BIT) /* These flags can be updated atomically via VMA/mmap read lock. */ #define VM_ATOMIC_SET_ALLOWED VM_MAYBE_GUARD /* Arch-specific flags to clear when updating VM flags on protection change */ #ifndef VM_ARCH_CLEAR #define VM_ARCH_CLEAR VM_NONE #endif #define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR) /* * Flags which should be 'sticky' on merge - that is, flags which, when one VMA * possesses them but the other does not, the merged VMA should nonetheless have * applied to it: * * VMA_SOFTDIRTY_BIT - if a VMA is marked soft-dirty, that is, has not had its * references cleared via /proc/$pid/clear_refs, any * merged VMA should be considered soft-dirty also as it * operates at a VMA granularity. * * VMA_MAYBE_GUARD_BIT - If a VMA may have guard regions in place it implies * that mapped page tables may contain metadata not * described by the VMA and thus any merged VMA may also * contain this metadata, and thus we must make this flag * sticky. */ #ifdef CONFIG_MEM_SOFT_DIRTY #define VMA_STICKY_FLAGS mk_vma_flags(VMA_SOFTDIRTY_BIT, VMA_MAYBE_GUARD_BIT) #else #define VMA_STICKY_FLAGS mk_vma_flags(VMA_MAYBE_GUARD_BIT) #endif /* * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one * of these flags and the other not does not preclude a merge. * * VMA_STICKY_FLAGS - When merging VMAs, VMA flags must match, unless they * are 'sticky'. If any sticky flags exist in either VMA, * we simply set all of them on the merged VMA. */ #define VMA_IGNORE_MERGE_FLAGS VMA_STICKY_FLAGS /* * Flags which should result in page tables being copied on fork. These are * flags which indicate that the VMA maps page tables which cannot be * reconstituted upon page fault, and so necessitate page table copying upon fork. * * Note that these flags should be compared with the DESTINATION VMA not the * source, as VM_UFFD_WP may not be propagated to destination, while all other * flags will be. * * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be * reasonably reconstructed on page fault. * * VM_UFFD_WP - Encodes metadata about an installed uffd * write protect handler, which cannot be * reconstructed on page fault. * * We always copy pgtables when dst_vma has uffd-wp * enabled even if it's file-backed * (e.g. shmem).
Because when uffd-wp is enabled, * pgtable contains uffd-wp protection information, * that's something we can't retrieve from page cache, * and skip copying will lose those info. * * VM_MAYBE_GUARD - Could contain page guard region markers which * by design are a property of the page tables * only and thus cannot be reconstructed on page * fault. */ #define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD) /* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. */ /* * The default fault flags that should be used by most of the * arch-specific page fault handlers. */ #define FAULT_FLAG_DEFAULT (FAULT_FLAG_ALLOW_RETRY | \ FAULT_FLAG_KILLABLE | \ FAULT_FLAG_INTERRUPTIBLE) /** * fault_flag_allow_retry_first - check ALLOW_RETRY the first time * @flags: Fault flags. * * This is mostly used for places where we want to try to avoid taking * the mmap_lock for too long a time when waiting for another condition * to change, in which case we can try to be polite to release the * mmap_lock in the first round to avoid potential starvation of other * processes that would also want the mmap_lock. * * Return: true if the page fault allows retry and this is the first * attempt of the fault handling; false otherwise. */ static inline bool fault_flag_allow_retry_first(enum fault_flag flags) { return (flags & FAULT_FLAG_ALLOW_RETRY) && (!(flags & FAULT_FLAG_TRIED)); } #define FAULT_FLAG_TRACE \ { FAULT_FLAG_WRITE, "WRITE" }, \ { FAULT_FLAG_MKWRITE, "MKWRITE" }, \ { FAULT_FLAG_ALLOW_RETRY, "ALLOW_RETRY" }, \ { FAULT_FLAG_RETRY_NOWAIT, "RETRY_NOWAIT" }, \ { FAULT_FLAG_KILLABLE, "KILLABLE" }, \ { FAULT_FLAG_TRIED, "TRIED" }, \ { FAULT_FLAG_USER, "USER" }, \ { FAULT_FLAG_REMOTE, "REMOTE" }, \ { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \ { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \ { FAULT_FLAG_VMA_LOCK, "VMA_LOCK" } /* * vm_fault is filled by the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask * of VM_FAULT_xxx flags that give details about how the fault was handled. * * MM layer fills up gfp_mask for page allocations but fault handler might * alter it if its implementation requires a different allocation context. * * pgoff should be used in favour of virtual_address, if possible. */ struct vm_fault { const struct { struct vm_area_struct *vma; /* Target VMA */ gfp_t gfp_mask; /* gfp mask to be used for allocations */ pgoff_t pgoff; /* Logical page offset based on vma */ unsigned long address; /* Faulting virtual address - masked */ unsigned long real_address; /* Faulting virtual address - unmasked */ }; enum fault_flag flags; /* FAULT_FLAG_xxx flags * XXX: should really be 'const' */ pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ pud_t *pud; /* Pointer to pud entry matching * the 'address' */ union { pte_t orig_pte; /* Value of PTE at the time of fault */ pmd_t orig_pmd; /* Value of PMD at the time of fault, * used by PMD fault only. */ }; struct page *cow_page; /* Page handler may use for COW fault */ struct page *page; /* ->fault handlers should return a * page here, unless VM_FAULT_NOPAGE * is set (which is also implied by * VM_FAULT_ERROR). */ /* These three entries are valid only while holding ptl lock */ pte_t *pte; /* Pointer to pte entry matching * the 'address'. NULL if the page * table hasn't been allocated. */ spinlock_t *ptl; /* Page table lock. * Protects pte page table if 'pte' * is not NULL, otherwise pmd. 
*/ pgtable_t prealloc_pte; /* Pre-allocated pte page table. * vm_ops->map_pages() sets up a page * table from atomic context. * do_fault_around() pre-allocates * page table to avoid allocation from * atomic context. */ }; struct vm_uffd_ops; /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer * to the functions called when a no-page or a wp-page exception occurs. */ struct vm_operations_struct { /** * @open: Called when a VMA is remapped, split or forked. Not called * upon first mapping a VMA. * Context: User context. May sleep. Caller holds mmap_lock. */ void (*open)(struct vm_area_struct *vma); /** * @close: Called when the VMA is being removed from the MM. * Context: User context. May sleep. Caller holds mmap_lock. */ void (*close)(struct vm_area_struct *vma); /** * @mapped: Called when the VMA is first mapped in the MM. Not called if * the new VMA is merged with an adjacent VMA. * * The @vm_private_data field is an output field allowing the user to * modify vma->vm_private_data as necessary. * * ONLY valid if set from f_op->mmap_prepare. Will result in an error if * set from f_op->mmap. * * Returns %0 on success, or an error otherwise. On error, the VMA will * be unmapped. * * Context: User context. May sleep. Caller holds mmap_lock. */ int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff, const struct file *file, void **vm_private_data); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *vma, unsigned long addr); int (*mremap)(struct vm_area_struct *vma); /* * Called by mprotect() to make driver-specific permission * checks before mprotect() is finalised. The VMA must not * be modified. Returns 0 if mprotect() can proceed. */ int (*mprotect)(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long newflags); vm_fault_t (*fault)(struct vm_fault *vmf); vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); vm_fault_t (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); unsigned long (*pagesize)(struct vm_area_struct *vma); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ vm_fault_t (*page_mkwrite)(struct vm_fault *vmf); /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf); /* called by access_process_vm when get_user_pages() fails, typically * for use by special VMAs. See also generic_access_phys() for a generic * implementation useful for any iomem mapping. */ int (*access)(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); /* Called by the /proc/PID/maps code to ask the vma whether it * has a special name. Returning non-NULL will also cause this * vma to be dumped unconditionally. */ const char *(*name)(struct vm_area_struct *vma); #ifdef CONFIG_NUMA /* * set_policy() op must add a reference to any non-NULL @new mempolicy * to hold the policy upon return. Caller should pass NULL @new to * remove a policy and fall back to surrounding context--i.e. do not * install a MPOL_DEFAULT policy, nor the task or system default * mempolicy. */ int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); /* * get_policy() op must add reference [mpol_get()] to any policy at * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure * in mm/mempolicy.c will do this automatically. 
* get_policy() must NOT add a ref if the policy at (vma,addr) is not * marked as MPOL_SHARED. vma policies are protected by the mmap_lock. * If no [shared/vma] mempolicy exists at the addr, get_policy() op * must return NULL--i.e., do not "fallback" to task or system default * policy. */ struct mempolicy *(*get_policy)(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx); #endif #ifdef CONFIG_FIND_NORMAL_PAGE /* * Called by vm_normal_page() for special PTEs in @vma at @addr. This * allows for returning a "normal" page from vm_normal_page() even * though the PTE indicates that the "struct page" either does not exist * or should not be touched: "special". * * Do not add new users: this really only works when a "normal" page * was mapped, but then the PTE got changed to something weird (+ * marked special) that would not make pte_pfn() identify the originally * inserted page. */ struct page *(*find_normal_page)(struct vm_area_struct *vma, unsigned long addr); #endif /* CONFIG_FIND_NORMAL_PAGE */ #ifdef CONFIG_USERFAULTFD const struct vm_uffd_ops *uffd_ops; #endif }; #ifdef CONFIG_NUMA_BALANCING static inline void vma_numab_state_init(struct vm_area_struct *vma) { vma->numab_state = NULL; } static inline void vma_numab_state_free(struct vm_area_struct *vma) { kfree(vma->numab_state); } #else static inline void vma_numab_state_init(struct vm_area_struct *vma) {} static inline void vma_numab_state_free(struct vm_area_struct *vma) {} #endif /* CONFIG_NUMA_BALANCING */ /* * These must be here rather than mmap_lock.h as dependent on vm_fault type, * declared in this header. */ #ifdef CONFIG_PER_VMA_LOCK static inline void release_fault_lock(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) vma_end_read(vmf->vma); else mmap_read_unlock(vmf->vma->vm_mm); } static inline void assert_fault_locked(const struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) vma_assert_locked(vmf->vma); else mmap_assert_locked(vmf->vma->vm_mm); } #else static inline void release_fault_lock(struct vm_fault *vmf) { mmap_read_unlock(vmf->vma->vm_mm); } static inline void assert_fault_locked(const struct vm_fault *vmf) { mmap_assert_locked(vmf->vma->vm_mm); } #endif /* CONFIG_PER_VMA_LOCK */ static inline bool mm_flags_test(int flag, const struct mm_struct *mm) { return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } static inline bool mm_flags_test_and_set(int flag, struct mm_struct *mm) { return test_and_set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } static inline bool mm_flags_test_and_clear(int flag, struct mm_struct *mm) { return test_and_clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } static inline void mm_flags_set(int flag, struct mm_struct *mm) { set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } static inline void mm_flags_clear(int flag, struct mm_struct *mm) { clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } static inline void mm_flags_clear_all(struct mm_struct *mm) { bitmap_zero(ACCESS_PRIVATE(&mm->flags, __mm_flags), NUM_MM_FLAG_BITS); } extern const struct vm_operations_struct vma_dummy_vm_ops; static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); vma_lock_init(vma, false); } /* Use when VMA is not part of the VMA tree and needs no locking */ static inline void vm_flags_init(struct vm_area_struct *vma, vm_flags_t flags) { VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & 
VM_SOFTDIRTY)); vma_flags_clear_all(&vma->flags); vma_flags_overwrite_word(&vma->flags, flags); } /* * Use when VMA is part of the VMA tree and modifications need coordination * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and * it should be locked explicitly beforehand. */ static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_t flags) { VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); vma_assert_write_locked(vma); vm_flags_init(vma, flags); } static inline void vma_flags_reset_once(struct vm_area_struct *vma, vma_flags_t *flags) { const unsigned long word = flags->__vma_flags[0]; /* It is assumed only the first system word must be written once. */ vma_flags_overwrite_word_once(&vma->flags, word); /* The remainder can be copied normally. */ if (NUM_VMA_FLAG_BITS > BITS_PER_LONG) { unsigned long *dst = &vma->flags.__vma_flags[1]; const unsigned long *src = &flags->__vma_flags[1]; bitmap_copy(dst, src, NUM_VMA_FLAG_BITS - BITS_PER_LONG); } } static inline void vm_flags_set(struct vm_area_struct *vma, vm_flags_t flags) { vma_start_write(vma); vma_flags_set_word(&vma->flags, flags); } static inline void vm_flags_clear(struct vm_area_struct *vma, vm_flags_t flags) { VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); vma_start_write(vma); vma_flags_clear_word(&vma->flags, flags); } /* * Use only if VMA is not part of the VMA tree or has no other users and * therefore needs no locking. */ static inline void __vm_flags_mod(struct vm_area_struct *vma, vm_flags_t set, vm_flags_t clear) { vm_flags_init(vma, (vma->vm_flags | set) & ~clear); } /* * Use only when the order of set/clear operations is unimportant, otherwise * use vm_flags_{set|clear} explicitly. */ static inline void vm_flags_mod(struct vm_area_struct *vma, vm_flags_t set, vm_flags_t clear) { vma_start_write(vma); __vm_flags_mod(vma, set, clear); } static __always_inline bool __vma_atomic_valid_flag(struct vm_area_struct *vma, vma_flag_t bit) { const vm_flags_t mask = BIT((__force int)bit); /* Only specific flags are permitted */ if (WARN_ON_ONCE(!(mask & VM_ATOMIC_SET_ALLOWED))) return false; return true; } /* * Set VMA flag atomically. Requires only VMA/mmap read lock. Only specific * valid flags are allowed to do this. */ static __always_inline void vma_set_atomic_flag(struct vm_area_struct *vma, vma_flag_t bit) { unsigned long *bitmap = vma->flags.__vma_flags; vma_assert_stabilised(vma); if (__vma_atomic_valid_flag(vma, bit)) set_bit((__force int)bit, bitmap); } /* * Test for VMA flag atomically. Requires no locks. Only specific valid flags * are allowed to do this. * * This is necessarily racey, so callers must ensure that serialisation is * achieved through some other means, or that races are permissible. */ static __always_inline bool vma_test_atomic_flag(struct vm_area_struct *vma, vma_flag_t bit) { if (__vma_atomic_valid_flag(vma, bit)) return test_bit((__force int)bit, &vma->vm_flags); return false; } /* Set an individual VMA flag in flags, non-atomically. */ static __always_inline void vma_flags_set_flag(vma_flags_t *flags, vma_flag_t bit) { unsigned long *bitmap = flags->__vma_flags; __set_bit((__force int)bit, bitmap); } static __always_inline vma_flags_t __mk_vma_flags(vma_flags_t flags, size_t count, const vma_flag_t *bits) { int i; for (i = 0; i < count; i++) vma_flags_set_flag(&flags, bits[i]); return flags; } /* * Helper macro which bitwise-or combines the specified input flags into a * vma_flags_t bitmap value. 
E.g.: * * vma_flags_t flags = mk_vma_flags(VMA_IO_BIT, VMA_PFNMAP_BIT, * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT); * * The compiler cleverly optimises away all of the work and this ends up being * equivalent to aggregating the values manually. */ #define mk_vma_flags(...) __mk_vma_flags(EMPTY_VMA_FLAGS, \ COUNT_ARGS(__VA_ARGS__), (const vma_flag_t []){__VA_ARGS__}) /* * Helper macro which acts like mk_vma_flags, only appending to a copy of the * specified flags rather than establishing new flags. E.g.: * * vma_flags_t flags = append_vma_flags(VMA_STACK_DEFAULT_FLAGS, VMA_STACK_BIT, * VMA_ACCOUNT_BIT); */ #define append_vma_flags(flags, ...) __mk_vma_flags(flags, \ COUNT_ARGS(__VA_ARGS__), (const vma_flag_t []){__VA_ARGS__}) /* Calculates the number of set bits in the specified VMA flags. */ static __always_inline int vma_flags_count(const vma_flags_t *flags) { const unsigned long *bitmap = flags->__vma_flags; return bitmap_weight(bitmap, NUM_VMA_FLAG_BITS); } /* * Test whether a specific VMA flag is set, e.g.: * * if (vma_flags_test(flags, VMA_READ_BIT)) { ... } */ static __always_inline bool vma_flags_test(const vma_flags_t *flags, vma_flag_t bit) { const unsigned long *bitmap = flags->__vma_flags; return test_bit((__force int)bit, bitmap); } /* * Obtain a set of VMA flags which contain the overlapping flags contained * within flags and to_and. */ static __always_inline vma_flags_t vma_flags_and_mask(const vma_flags_t *flags, vma_flags_t to_and) { vma_flags_t dst; unsigned long *bitmap_dst = dst.__vma_flags; const unsigned long *bitmap = flags->__vma_flags; const unsigned long *bitmap_to_and = to_and.__vma_flags; bitmap_and(bitmap_dst, bitmap, bitmap_to_and, NUM_VMA_FLAG_BITS); return dst; } /* * Obtain a set of VMA flags which contains the specified overlapping flags, * e.g.: * * vma_flags_t read_flags = vma_flags_and(&flags, VMA_READ_BIT, * VMA_MAY_READ_BIT); */ #define vma_flags_and(flags, ...) \ vma_flags_and_mask(flags, mk_vma_flags(__VA_ARGS__)) /* Test each of to_test flags in flags, non-atomically. */ static __always_inline bool vma_flags_test_any_mask(const vma_flags_t *flags, vma_flags_t to_test) { const unsigned long *bitmap = flags->__vma_flags; const unsigned long *bitmap_to_test = to_test.__vma_flags; return bitmap_intersects(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS); } /* * Test whether any specified VMA flag is set, e.g.: * * if (vma_flags_test_any(flags, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... } */ #define vma_flags_test_any(flags, ...) \ vma_flags_test_any_mask(flags, mk_vma_flags(__VA_ARGS__)) /* Test that ALL of the to_test flags are set, non-atomically. */ static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags, vma_flags_t to_test) { const unsigned long *bitmap = flags->__vma_flags; const unsigned long *bitmap_to_test = to_test.__vma_flags; return bitmap_subset(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS); } /* * Test whether ALL specified VMA flags are set, e.g.: * * if (vma_flags_test_all(flags, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... } */ #define vma_flags_test_all(flags, ...) \ vma_flags_test_all_mask(flags, mk_vma_flags(__VA_ARGS__)) /* * Helper to test that a flag mask of type vma_flags_t has a SINGLE flag set * (returning false if flagmask has no flags set). * * This is defined to make the semantics clearer when testing an optionally * defined VMA flags mask, e.g.: * * if (vma_flags_test_single_mask(&flags, VMA_DROPPABLE)) { ... } * * When VMA_DROPPABLE is defined if available, or set to EMPTY_VMA_FLAGS * otherwise. 
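 *
 * For illustration only (CONFIG_FOO and VMA_FOO_BIT are hypothetical, used
 * here just to show the config-dependent pattern this helper is meant for):
 *
 *	#ifdef CONFIG_FOO
 *	#define VMA_FOO mk_vma_flags(VMA_FOO_BIT)
 *	#else
 *	#define VMA_FOO EMPTY_VMA_FLAGS
 *	#endif
 *
 *	if (vma_flags_test_single_mask(&flags, VMA_FOO)) { ... }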
*/ static __always_inline bool vma_flags_test_single_mask(const vma_flags_t *flags, vma_flags_t flagmask) { VM_WARN_ON_ONCE(vma_flags_count(&flagmask) > 1); return vma_flags_test_any_mask(flags, flagmask); } /* Set each of the to_set flags in flags, non-atomically. */ static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t to_set) { unsigned long *bitmap = flags->__vma_flags; const unsigned long *bitmap_to_set = to_set.__vma_flags; bitmap_or(bitmap, bitmap, bitmap_to_set, NUM_VMA_FLAG_BITS); } /* * Set all specified VMA flags, e.g.: * * vma_flags_set(&flags, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT); */ #define vma_flags_set(flags, ...) \ vma_flags_set_mask(flags, mk_vma_flags(__VA_ARGS__)) /* Clear all of the to-clear flags in flags, non-atomically. */ static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t to_clear) { unsigned long *bitmap = flags->__vma_flags; const unsigned long *bitmap_to_clear = to_clear.__vma_flags; bitmap_andnot(bitmap, bitmap, bitmap_to_clear, NUM_VMA_FLAG_BITS); } /* * Clear all specified individual flags, e.g.: * * vma_flags_clear(&flags, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT); */ #define vma_flags_clear(flags, ...) \ vma_flags_clear_mask(flags, mk_vma_flags(__VA_ARGS__)) /* * Obtain a VMA flags value containing those flags that are present in flags or * flags_other but not in both. */ static __always_inline vma_flags_t vma_flags_diff_pair(const vma_flags_t *flags, const vma_flags_t *flags_other) { vma_flags_t dst; const unsigned long *bitmap_other = flags_other->__vma_flags; const unsigned long *bitmap = flags->__vma_flags; unsigned long *bitmap_dst = dst.__vma_flags; bitmap_xor(bitmap_dst, bitmap, bitmap_other, NUM_VMA_FLAG_BITS); return dst; } /* Determine if flags and flags_other have precisely the same flags set. */ static __always_inline bool vma_flags_same_pair(const vma_flags_t *flags, const vma_flags_t *flags_other) { const unsigned long *bitmap = flags->__vma_flags; const unsigned long *bitmap_other = flags_other->__vma_flags; return bitmap_equal(bitmap, bitmap_other, NUM_VMA_FLAG_BITS); } /* Determine if flags and flags_other have precisely the same flags set. */ static __always_inline bool vma_flags_same_mask(const vma_flags_t *flags, vma_flags_t flags_other) { const unsigned long *bitmap = flags->__vma_flags; const unsigned long *bitmap_other = flags_other.__vma_flags; return bitmap_equal(bitmap, bitmap_other, NUM_VMA_FLAG_BITS); } /* * Helper macro to determine if only the specific flags are set, e.g.: * * if (vma_flags_same(&flags, VMA_WRITE_BIT) { ... } */ #define vma_flags_same(flags, ...) \ vma_flags_same_mask(flags, mk_vma_flags(__VA_ARGS__)) /* * Test whether a specific flag in the VMA is set, e.g.: * * if (vma_test(vma, VMA_READ_BIT)) { ... } */ static __always_inline bool vma_test(const struct vm_area_struct *vma, vma_flag_t bit) { return vma_flags_test(&vma->flags, bit); } /* Helper to test any VMA flags in a VMA . */ static __always_inline bool vma_test_any_mask(const struct vm_area_struct *vma, vma_flags_t flags) { return vma_flags_test_any_mask(&vma->flags, flags); } /* * Helper macro for testing whether any VMA flags are set in a VMA, * e.g.: * * if (vma_test_any(vma, VMA_IO_BIT, VMA_PFNMAP_BIT, * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT)) { ... } */ #define vma_test_any(vma, ...) \ vma_test_any_mask(vma, mk_vma_flags(__VA_ARGS__)) /* * Helper to test that ALL specified flags are set in a VMA. * * Note: appropriate locks must be held, this function does not acquire them for * you. 
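 *
 * E.g. (illustrative), using one of the pre-built masks defined above rather
 * than individual bits:
 *
 *	if (vma_test_all_mask(vma, VMA_ACCESS_FLAGS)) { ... }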
*/ static __always_inline bool vma_test_all_mask(const struct vm_area_struct *vma, vma_flags_t flags) { return vma_flags_test_all_mask(&vma->flags, flags); } /* * Helper macro for checking that ALL specified flags are set in a VMA, e.g.: * * if (vma_test_all(vma, VMA_READ_BIT, VMA_MAYREAD_BIT) { ... } */ #define vma_test_all(vma, ...) \ vma_test_all_mask(vma, mk_vma_flags(__VA_ARGS__)) /* * Helper to test that a flag mask of type vma_flags_t has a SINGLE flag set * (returning false if flagmask has no flags set). * * This is useful when a flag needs to be either defined or not depending upon * kernel configuration, e.g.: * * if (vma_test_single_mask(vma, VMA_DROPPABLE)) { ... } * * When VMA_DROPPABLE is defined if available, or set to EMPTY_VMA_FLAGS * otherwise. */ static __always_inline bool vma_test_single_mask(const struct vm_area_struct *vma, vma_flags_t flagmask) { return vma_flags_test_single_mask(&vma->flags, flagmask); } /* * Helper to set all VMA flags in a VMA. * * Note: appropriate locks must be held, this function does not acquire them for * you. */ static __always_inline void vma_set_flags_mask(struct vm_area_struct *vma, vma_flags_t flags) { vma_flags_set_mask(&vma->flags, flags); } /* * Helper macro for specifying VMA flags in a VMA, e.g.: * * vma_set_flags(vma, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT, * VMA_DONTDUMP_BIT); * * Note: appropriate locks must be held, this function does not acquire them for * you. */ #define vma_set_flags(vma, ...) \ vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) /* Helper to clear all VMA flags in a VMA. */ static __always_inline void vma_clear_flags_mask(struct vm_area_struct *vma, vma_flags_t flags) { vma_flags_clear_mask(&vma->flags, flags); } /* * Helper macro for clearing VMA flags, e.g.: * * vma_clear_flags(vma, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT, * VMA_DONTDUMP_BIT); */ #define vma_clear_flags(vma, ...) \ vma_clear_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) /* * Test whether a specific VMA flag is set in a VMA descriptor, e.g.: * * if (vma_desc_test(desc, VMA_READ_BIT)) { ... } */ static __always_inline bool vma_desc_test(const struct vm_area_desc *desc, vma_flag_t bit) { return vma_flags_test(&desc->vma_flags, bit); } /* Helper to test any VMA flags in a VMA descriptor. */ static __always_inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, vma_flags_t flags) { return vma_flags_test_any_mask(&desc->vma_flags, flags); } /* * Helper macro for testing whether any VMA flags are set in a VMA descriptor, * e.g.: * * if (vma_desc_test_any(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT)) { ... } */ #define vma_desc_test_any(desc, ...) \ vma_desc_test_any_mask(desc, mk_vma_flags(__VA_ARGS__)) /* Helper to test all VMA flags in a VMA descriptor. */ static __always_inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, vma_flags_t flags) { return vma_flags_test_all_mask(&desc->vma_flags, flags); } /* * Helper macro for testing whether ALL VMA flags are set in a VMA descriptor, * e.g.: * * if (vma_desc_test_all(desc, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... } */ #define vma_desc_test_all(desc, ...) \ vma_desc_test_all_mask(desc, mk_vma_flags(__VA_ARGS__)) /* Helper to set all VMA flags in a VMA descriptor. 
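 *
 * E.g. (illustrative), an f_op->mmap_prepare() hook that wants to apply a
 * pre-built mask in one go might do:
 *
 *	vma_desc_set_flags_mask(desc, VMA_REMAP_FLAGS);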
*/ static __always_inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, vma_flags_t flags) { vma_flags_set_mask(&desc->vma_flags, flags); } /* * Helper macro for specifying VMA flags for an input pointer to a struct * vm_area_desc object describing a proposed VMA, e.g.: * * vma_desc_set_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT, * VMA_DONTDUMP_BIT); */ #define vma_desc_set_flags(desc, ...) \ vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) /* Helper to clear all VMA flags in a VMA descriptor. */ static __always_inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, vma_flags_t flags) { vma_flags_clear_mask(&desc->vma_flags, flags); } /* * Helper macro for clearing VMA flags for an input pointer to a struct * vm_area_desc object describing a proposed VMA, e.g.: * * vma_desc_clear_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT, * VMA_DONTDUMP_BIT); */ #define vma_desc_clear_flags(desc, ...) \ vma_desc_clear_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) static inline void vma_set_anonymous(struct vm_area_struct *vma) { vma->vm_ops = NULL; } static inline bool vma_is_anonymous(struct vm_area_struct *vma) { return !vma->vm_ops; } /* * Indicate if the VMA is a heap for the given task; for * /proc/PID/maps that is the heap of the main task. */ static inline bool vma_is_initial_heap(const struct vm_area_struct *vma) { return vma->vm_start < vma->vm_mm->brk && vma->vm_end > vma->vm_mm->start_brk; } /* * Indicate if the VMA is a stack for the given task; for * /proc/PID/maps that is the stack of the main task. */ static inline bool vma_is_initial_stack(const struct vm_area_struct *vma) { /* * We make no effort to guess what a given thread considers to be * its "stack". It's not even well-defined for programs written * languages like Go. */ return vma->vm_start <= vma->vm_mm->start_stack && vma->vm_end >= vma->vm_mm->start_stack; } static inline bool vma_is_temporary_stack(const struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); if (!maybe_stack) return false; if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == VM_STACK_INCOMPLETE_SETUP) return true; return false; } static inline bool vma_is_foreign(const struct vm_area_struct *vma) { if (!current->mm) return true; if (current->mm != vma->vm_mm) return true; return false; } static inline bool vma_is_accessible(const struct vm_area_struct *vma) { return vma->vm_flags & VM_ACCESS_FLAGS; } static inline bool is_shared_maywrite(const vma_flags_t *flags) { return vma_flags_test_all(flags, VMA_SHARED_BIT, VMA_MAYWRITE_BIT); } static inline bool vma_is_shared_maywrite(const struct vm_area_struct *vma) { return is_shared_maywrite(&vma->flags); } /** * vma_kernel_pagesize - Default page size granularity for this VMA. * @vma: The user mapping. * * The kernel page size specifies in which granularity VMA modifications * can be performed. Folios in this VMA will be aligned to, and at least * the size of the number of bytes returned by this function. * * The default kernel page size is not affected by Transparent Huge Pages * being in effect. * * Return: The default page size granularity for this VMA. 
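 *
 * For example, a hugetlbfs mapping backed by 2 MiB pages reports 2 MiB here
 * (via vm_ops->pagesize()), whereas most other mappings report PAGE_SIZE.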
*/ static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) { if (unlikely(vma->vm_ops && vma->vm_ops->pagesize)) return vma->vm_ops->pagesize(vma); return PAGE_SIZE; } unsigned long vma_mmu_pagesize(struct vm_area_struct *vma); static inline struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) { return mas_find(&vmi->mas, max - 1); } static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) { /* * Uses mas_find() to get the first VMA when the iterator starts. * Calling mas_next() could skip the first entry. */ return mas_find(&vmi->mas, ULONG_MAX); } static inline struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi) { return mas_next_range(&vmi->mas, ULONG_MAX); } static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) { return mas_prev(&vmi->mas, 0); } static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, unsigned long start, unsigned long end, gfp_t gfp) { __mas_set_range(&vmi->mas, start, end - 1); mas_store_gfp(&vmi->mas, NULL, gfp); if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; return 0; } /* Free any unused preallocations */ static inline void vma_iter_free(struct vma_iterator *vmi) { mas_destroy(&vmi->mas); } static inline int vma_iter_bulk_store(struct vma_iterator *vmi, struct vm_area_struct *vma) { vmi->mas.index = vma->vm_start; vmi->mas.last = vma->vm_end - 1; mas_store(&vmi->mas, vma); if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; vma_mark_attached(vma); return 0; } static inline void vma_iter_invalidate(struct vma_iterator *vmi) { mas_pause(&vmi->mas); } static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr) { mas_set(&vmi->mas, addr); } #define for_each_vma(__vmi, __vma) \ while (((__vma) = vma_next(&(__vmi))) != NULL) /* The MM code likes to work with exclusive end addresses */ #define for_each_vma_range(__vmi, __vma, __end) \ while (((__vma) = vma_find(&(__vmi), (__end))) != NULL) #ifdef CONFIG_SHMEM /* * The vma_is_shmem is not inline because it is used only by slow * paths in userfault. */ bool vma_is_shmem(const struct vm_area_struct *vma); bool vma_is_anon_shmem(const struct vm_area_struct *vma); #else static inline bool vma_is_shmem(const struct vm_area_struct *vma) { return false; } static inline bool vma_is_anon_shmem(const struct vm_area_struct *vma) { return false; } #endif int vma_is_stack_for_current(const struct vm_area_struct *vma); /* flush_tlb_range() takes a vma, not a mm, and can care about flags */ #define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) } struct mmu_gather; struct inode; extern void prep_compound_page(struct page *page, unsigned int order); static inline unsigned int folio_large_order(const struct folio *folio) { return folio->_flags_1 & 0xff; } #ifdef NR_PAGES_IN_LARGE_FOLIO static inline unsigned long folio_large_nr_pages(const struct folio *folio) { return folio->_nr_pages; } #else static inline unsigned long folio_large_nr_pages(const struct folio *folio) { return 1L << folio_large_order(folio); } #endif /* * compound_order() can be called without holding a reference, which means * that niceties like page_folio() don't work. These callers should be * prepared to handle wild return values. For example, PG_head may be * set before the order is initialised, or this may be a tail page. * See compaction.c for some good examples. 
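 *
 * For example, compound_order() called on a tail page returns 0, since
 * PG_head is not set on tail pages, even though the folio the page belongs
 * to may be larger; treat the result as a hint when no reference is held.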
*/ static inline unsigned int compound_order(const struct page *page) { const struct folio *folio = (struct folio *)page; if (!test_bit(PG_head, &folio->flags.f)) return 0; return folio_large_order(folio); } /** * folio_order - The allocation order of a folio. * @folio: The folio. * * A folio is composed of 2^order pages. See get_order() for the definition * of order. * * Return: The order of the folio. */ static inline unsigned int folio_order(const struct folio *folio) { if (!folio_test_large(folio)) return 0; return folio_large_order(folio); } /** * folio_reset_order - Reset the folio order and derived _nr_pages * @folio: The folio. * * Reset the order and derived _nr_pages to 0. Must only be used in the * process of splitting large folios. */ static inline void folio_reset_order(struct folio *folio) { if (WARN_ON_ONCE(!folio_test_large(folio))) return; folio->_flags_1 &= ~0xffUL; #ifdef NR_PAGES_IN_LARGE_FOLIO folio->_nr_pages = 0; #endif } #include <linux/huge_mm.h> /* * Methods to modify the page usage count. * * What counts for a page usage: * - cache mapping (page->mapping) * - private data (page->private) * - page mapped in a task's page tables, each mapping * is counted separately * * Also, many kernel routines increase the page count before a critical * routine so they can be sure the page doesn't go away from under them. */ /* * Drop a ref, return true if the refcount fell to zero (the page has no users) */ static inline int put_page_testzero(struct page *page) { VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); return page_ref_dec_and_test(page); } static inline int folio_put_testzero(struct folio *folio) { return put_page_testzero(&folio->page); } /* * Try to grab a ref unless the page has a refcount of zero, return false if * that is the case. * This can be called when MMU is off so it must not access * any of the virtual mappings. */ static inline bool get_page_unless_zero(struct page *page) { return page_ref_add_unless_zero(page, 1); } static inline struct folio *folio_get_nontail_page(struct page *page) { if (unlikely(!get_page_unless_zero(page))) return NULL; return (struct folio *)page; } extern int page_is_ram(unsigned long pfn); enum { REGION_INTERSECTS, REGION_DISJOINT, REGION_MIXED, }; int region_intersects(resource_size_t offset, size_t size, unsigned long flags, unsigned long desc); /* Support for virtually mapped pages */ struct page *vmalloc_to_page(const void *addr); unsigned long vmalloc_to_pfn(const void *addr); /* * Determine if an address is within the vmalloc range * * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there * is no special casing required. */ #ifdef CONFIG_MMU extern bool is_vmalloc_addr(const void *x); extern int is_vmalloc_or_module_addr(const void *x); #else static inline bool is_vmalloc_addr(const void *x) { return false; } static inline int is_vmalloc_or_module_addr(const void *x) { return 0; } #endif /* * How many times the entire folio is mapped as a single unit (eg by a * PMD or PUD entry). This is probably not what you want, except for * debugging purposes or implementation of other core folio_*() primitives. 
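 *
 * E.g. (illustrative): a PMD-mapped THP mapped into one process has an
 * entire mapcount of 1, while the same folio mapped only via individual
 * PTEs has an entire mapcount of 0.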
*/ static inline int folio_entire_mapcount(const struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio_large_order(folio) == 1)) return 0; return atomic_read(&folio->_entire_mapcount) + 1; } static inline int folio_large_mapcount(const struct folio *folio) { VM_WARN_ON_FOLIO(!folio_test_large(folio), folio); return atomic_read(&folio->_large_mapcount) + 1; } /** * folio_mapcount() - Number of mappings of this folio. * @folio: The folio. * * The folio mapcount corresponds to the number of present user page table * entries that reference any part of a folio. Each such present user page * table entry must be paired with exactly one folio reference. * * For ordinary folios, each user page table entry (PTE/PMD/PUD/...) counts * exactly once. * * For hugetlb folios, each abstracted "hugetlb" user page table entry that * references the entire folio counts exactly once, even when such special * page table entries are comprised of multiple ordinary page table entries. * * Will report 0 for pages which cannot be mapped into userspace, such as * slab, page tables and similar. * * Return: The number of times this folio is mapped. */ static inline int folio_mapcount(const struct folio *folio) { int mapcount; if (likely(!folio_test_large(folio))) { mapcount = atomic_read(&folio->_mapcount) + 1; if (page_mapcount_is_type(mapcount)) mapcount = 0; return mapcount; } return folio_large_mapcount(folio); } /** * folio_mapped - Is this folio mapped into userspace? * @folio: The folio. * * Return: True if any page in this folio is referenced by user page tables. */ static inline bool folio_mapped(const struct folio *folio) { return folio_mapcount(folio) >= 1; } /* * Return true if this page is mapped into pagetables. * For a compound page it returns true if any sub-page of the compound page is mapped, * even if this particular sub-page is not itself mapped by any PTE or PMD. */ static inline bool page_mapped(const struct page *page) { return folio_mapped(page_folio(page)); } static inline struct page *virt_to_head_page(const void *x) { struct page *page = virt_to_page(x); return compound_head(page); } static inline struct folio *virt_to_folio(const void *x) { struct page *page = virt_to_page(x); return page_folio(page); } void __folio_put(struct folio *folio); void split_page(struct page *page, unsigned int order); void folio_copy(struct folio *dst, struct folio *src); int folio_mc_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); /* Returns the number of bytes in this potentially compound page. */ static inline unsigned long page_size(const struct page *page) { return PAGE_SIZE << compound_order(page); } /* Returns the number of bits needed for the number of bytes in a page */ static inline unsigned int page_shift(struct page *page) { return PAGE_SHIFT + compound_order(page); } /** * thp_order - Order of a transparent huge page. * @page: Head page of a transparent huge page. */ static inline unsigned int thp_order(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); return compound_order(page); } /** * thp_size - Size of a transparent huge page. * @page: Head page of a transparent huge page. * * Return: Number of bytes in this page. */ static inline unsigned long thp_size(struct page *page) { return PAGE_SIZE << thp_order(page); } #ifdef CONFIG_MMU /* * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when * servicing faults for write access. In the normal case, we always want * pte_mkwrite.
But get_user_pages can cause write faults for mappings * that do not have writing enabled, when used by access_process_vm. */ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) pte = pte_mkwrite(pte, vma); return pte; } vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page); void set_pte_range(struct vm_fault *vmf, struct folio *folio, struct page *page, unsigned int nr, unsigned long addr); vm_fault_t finish_fault(struct vm_fault *vmf); #endif /* * Multiple processes may "see" the same page. E.g. for untouched * mappings of /dev/null, all processes see the same page full of * zeroes, and text pages of executables and shared libraries have * only one copy in memory, at most, normally. * * For the non-reserved pages, page_count(page) denotes a reference count. * page_count() == 0 means the page is free. page->lru is then used for * freelist management in the buddy allocator. * page_count() > 0 means the page has been allocated. * * Pages are allocated by the slab allocator in order to provide memory * to kmalloc and kmem_cache_alloc. In this case, the management of the * page, and the fields in 'struct page' are the responsibility of mm/slab.c * unless a particular usage is carefully commented. (the responsibility of * freeing the kmalloc memory is the caller's, of course). * * A page may be used by anyone else who does a __get_free_page(). * In this case, page_count still tracks the references, and should only * be used through the normal accessor functions. The top bits of page->flags * and page->virtual store page management information, but all other fields * are unused and could be used privately, carefully. The management of this * page is the responsibility of the one who allocated it, and those who have * subsequently been given references to it. * * The other pages (we may call them "pagecache pages") are completely * managed by the Linux memory manager: I/O, buffers, swapping etc. * The following discussion applies only to them. * * A pagecache page contains an opaque `private' member, which belongs to the * page's address_space. Usually, this is the address of a circular list of * the page's disk buffers. PG_private must be set to tell the VM to call * into the filesystem to release these pages. * * A folio may belong to an inode's memory mapping. In this case, * folio->mapping points to the inode, and folio->index is the file * offset of the folio, in units of PAGE_SIZE. * * If pagecache pages are not associated with an inode, they are said to be * anonymous pages. These may become associated with the swapcache, and in that * case PG_swapcache is set, and page->private is an offset into the swapcache. * * In either case (swapcache or inode backed), the pagecache itself holds one * reference to the page. Setting PG_private should also increment the * refcount. The each user mapping also has a reference to the page. * * The pagecache pages are stored in a per-mapping radix tree, which is * rooted at mapping->i_pages, and indexed by offset. * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space * lists, we instead now tag pages as dirty/writeback in the radix tree. 
* * All pagecache pages may be subject to I/O: * - inode pages may need to be read from disk, * - inode pages which have been modified and are MAP_SHARED may need * to be written back to the inode on disk, * - anonymous pages (including MAP_PRIVATE file mappings) which have been * modified may need to be swapped out to swap space and (later) to be read * back into memory. */ /* 127: arbitrary random number, small enough to assemble well */ #define folio_ref_zero_or_close_to_overflow(folio) \ ((unsigned int) folio_ref_count(folio) + 127u <= 127u) /** * folio_get - Increment the reference count on a folio. * @folio: The folio. * * Context: May be called in any context, as long as you know that * you have a refcount on the folio. If you do not already have one, * folio_try_get() may be the right interface for you to use. */ static inline void folio_get(struct folio *folio) { VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio); folio_ref_inc(folio); } static inline void get_page(struct page *page) { struct folio *folio = page_folio(page); if (WARN_ON_ONCE(folio_test_slab(folio))) return; if (WARN_ON_ONCE(folio_test_large_kmalloc(folio))) return; folio_get(folio); } static inline __must_check bool try_get_page(struct page *page) { page = compound_head(page); if (WARN_ON_ONCE(page_ref_count(page) <= 0)) return false; page_ref_inc(page); return true; } /** * folio_put - Decrement the reference count on a folio. * @folio: The folio. * * If the folio's reference count reaches zero, the memory will be * released back to the page allocator and may be used by another * allocation immediately. Do not access the memory or the struct folio * after calling folio_put() unless you can be sure that it wasn't the * last reference. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ static inline void folio_put(struct folio *folio) { if (folio_put_testzero(folio)) __folio_put(folio); } /** * folio_put_refs - Reduce the reference count on a folio. * @folio: The folio. * @refs: The amount to subtract from the folio's reference count. * * If the folio's reference count reaches zero, the memory will be * released back to the page allocator and may be used by another * allocation immediately. Do not access the memory or the struct folio * after calling folio_put_refs() unless you can be sure that these weren't * the last references. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ static inline void folio_put_refs(struct folio *folio, int refs) { if (folio_ref_sub_and_test(folio, refs)) __folio_put(folio); } void folios_put_refs(struct folio_batch *folios, unsigned int *refs); /* * union release_pages_arg - an array of pages or folios * * release_pages() releases a simple array of multiple pages, and * accepts various different forms of said page array: either * a regular old boring array of pages, an array of folios, or * an array of encoded page pointers. * * The transparent union syntax for this kind of "any of these * argument types" is all kinds of ugly, so look away. */ typedef union { struct page **pages; struct folio **folios; struct encoded_page **encoded_pages; } release_pages_arg __attribute__ ((__transparent_union__)); void release_pages(release_pages_arg, int nr); /** * folios_put - Decrement the reference count on an array of folios. * @folios: The folios. * * Like folio_put(), but for a batch of folios. 
This is more efficient * than writing the loop yourself as it will optimise the locks which need * to be taken if the folios are freed. The folios batch is returned * empty and ready to be reused for another batch; there is no need to * reinitialise it. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ static inline void folios_put(struct folio_batch *folios) { folios_put_refs(folios, NULL); } static inline void put_page(struct page *page) { struct folio *folio = page_folio(page); if (folio_test_slab(folio) || folio_test_large_kmalloc(folio)) return; folio_put(folio); } /* * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload * the page's refcount so that two separate items are tracked: the original page * reference count, and also a new count of how many pin_user_pages() calls were * made against the page. ("gup-pinned" is another term for the latter). * * With this scheme, pin_user_pages() becomes special: such pages are marked as * distinct from normal pages. As such, the unpin_user_page() call (and its * variants) must be used in order to release gup-pinned pages. * * Choice of value: * * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference * counts with respect to pin_user_pages() and unpin_user_page() becomes * simpler, due to the fact that adding an even power of two to the page * refcount has the effect of using only the upper N bits, for the code that * counts up using the bias value. This means that the lower bits are left for * the exclusive use of the original code that increments and decrements by one * (or at least, by much smaller values than the bias value). * * Of course, once the lower bits overflow into the upper bits (and this is * OK, because subtraction recovers the original values), then visual inspection * no longer suffices to directly view the separate counts. However, for normal * applications that don't have huge page reference counts, this won't be an * issue. * * Locking: the lockless algorithm described in folio_try_get_rcu() * provides safe operation for get_user_pages(), folio_mkclean() and * other calls that race to set up page table entries. */ #define GUP_PIN_COUNTING_BIAS (1U << 10) void unpin_user_page(struct page *page); void unpin_folio(struct folio *folio); void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty); void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, bool make_dirty); void unpin_user_pages(struct page **pages, unsigned long npages); void unpin_user_folio(struct folio *folio, unsigned long npages); void unpin_folios(struct folio **folios, unsigned long nfolios); static inline bool is_cow_mapping(vm_flags_t flags) { return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } static inline bool vma_desc_is_cow_mapping(struct vm_area_desc *desc) { const vma_flags_t *flags = &desc->vma_flags; return vma_flags_test(flags, VMA_MAYWRITE_BIT) && !vma_flags_test(flags, VMA_SHARED_BIT); } #ifndef CONFIG_MMU static inline bool is_nommu_shared_mapping(vm_flags_t flags) { /* * NOMMU shared mappings are ordinary MAP_SHARED mappings and selected * R/O MAP_PRIVATE file mappings that are an effective R/O overlay of * a file mapping. R/O MAP_PRIVATE mappings might still modify * underlying memory if ptrace is active, so this is only possible if * ptrace does not apply. Note that there is no mprotect() to upgrade * write permissions later. 
*/ return flags & (VM_MAYSHARE | VM_MAYOVERLAY); } static inline bool is_nommu_shared_vma_flags(const vma_flags_t *flags) { return vma_flags_test_any(flags, VMA_MAYSHARE_BIT, VMA_MAYOVERLAY_BIT); } #endif #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS #endif /* * The identification function is mainly used by the buddy allocator for * determining if two pages could be buddies. We are not really identifying * the zone since we could be using the section number id if we do not have * node id available in page flags. * We only guarantee that it will return the same value for two combinable * pages in a zone. */ static inline int page_zone_id(struct page *page) { return (page->flags.f >> ZONEID_PGSHIFT) & ZONEID_MASK; } #ifdef NODE_NOT_IN_PAGE_FLAGS int memdesc_nid(memdesc_flags_t mdf); #else static inline int memdesc_nid(memdesc_flags_t mdf) { return (mdf.f >> NODES_PGSHIFT) & NODES_MASK; } #endif static inline int page_to_nid(const struct page *page) { return memdesc_nid(PF_POISONED_CHECK(page)->flags); } static inline int folio_nid(const struct folio *folio) { return memdesc_nid(folio->flags); } #ifdef CONFIG_NUMA_BALANCING /* page access time bits needs to hold at least 4 seconds */ #define PAGE_ACCESS_TIME_MIN_BITS 12 #if LAST_CPUPID_SHIFT < PAGE_ACCESS_TIME_MIN_BITS #define PAGE_ACCESS_TIME_BUCKETS \ (PAGE_ACCESS_TIME_MIN_BITS - LAST_CPUPID_SHIFT) #else #define PAGE_ACCESS_TIME_BUCKETS 0 #endif #define PAGE_ACCESS_TIME_MASK \ (LAST_CPUPID_MASK << PAGE_ACCESS_TIME_BUCKETS) static inline int cpu_pid_to_cpupid(int cpu, int pid) { return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); } static inline int cpupid_to_pid(int cpupid) { return cpupid & LAST__PID_MASK; } static inline int cpupid_to_cpu(int cpupid) { return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK; } static inline int cpupid_to_nid(int cpupid) { return cpu_to_node(cpupid_to_cpu(cpupid)); } static inline bool cpupid_pid_unset(int cpupid) { return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK); } static inline bool cpupid_cpu_unset(int cpupid) { return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK); } static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid) { return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid); } #define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid) #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid) { return xchg(&folio->_last_cpupid, cpupid & LAST_CPUPID_MASK); } static inline int folio_last_cpupid(struct folio *folio) { return folio->_last_cpupid; } static inline void page_cpupid_reset_last(struct page *page) { page->_last_cpupid = -1 & LAST_CPUPID_MASK; } #else static inline int folio_last_cpupid(struct folio *folio) { return (folio->flags.f >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; } int folio_xchg_last_cpupid(struct folio *folio, int cpupid); static inline void page_cpupid_reset_last(struct page *page) { page->flags.f |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT; } #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ static inline int folio_xchg_access_time(struct folio *folio, int time) { int last_time; last_time = folio_xchg_last_cpupid(folio, time >> PAGE_ACCESS_TIME_BUCKETS); return last_time << PAGE_ACCESS_TIME_BUCKETS; } static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) { unsigned int pid_bit; pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG)); if (vma->numab_state && !test_bit(pid_bit, 
&vma->numab_state->pids_active[1])) { __set_bit(pid_bit, &vma->numab_state->pids_active[1]); } } bool folio_use_access_time(struct folio *folio); #else /* !CONFIG_NUMA_BALANCING */ static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid) { return folio_nid(folio); /* XXX */ } static inline int folio_xchg_access_time(struct folio *folio, int time) { return 0; } static inline int folio_last_cpupid(struct folio *folio) { return folio_nid(folio); /* XXX */ } static inline int cpupid_to_nid(int cpupid) { return -1; } static inline int cpupid_to_pid(int cpupid) { return -1; } static inline int cpupid_to_cpu(int cpupid) { return -1; } static inline int cpu_pid_to_cpupid(int nid, int pid) { return -1; } static inline bool cpupid_pid_unset(int cpupid) { return true; } static inline void page_cpupid_reset_last(struct page *page) { } static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) { return false; } static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) { } static inline bool folio_use_access_time(struct folio *folio) { return false; } #endif /* CONFIG_NUMA_BALANCING */ #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) /* * KASAN per-page tags are stored xor'ed with 0xff. This allows to avoid * setting tags for all pages to native kernel tag value 0xff, as the default * value 0x00 maps to 0xff. */ static inline u8 page_kasan_tag(const struct page *page) { u8 tag = KASAN_TAG_KERNEL; if (kasan_enabled()) { tag = (page->flags.f >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; tag ^= 0xff; } return tag; } static inline void page_kasan_tag_set(struct page *page, u8 tag) { unsigned long old_flags, flags; if (!kasan_enabled()) return; tag ^= 0xff; old_flags = READ_ONCE(page->flags.f); do { flags = old_flags; flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; } while (unlikely(!try_cmpxchg(&page->flags.f, &old_flags, flags))); } static inline void page_kasan_tag_reset(struct page *page) { if (kasan_enabled()) page_kasan_tag_set(page, KASAN_TAG_KERNEL); } #else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ static inline u8 page_kasan_tag(const struct page *page) { return 0xff; } static inline void page_kasan_tag_set(struct page *page, u8 tag) { } static inline void page_kasan_tag_reset(struct page *page) { } #endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ static inline struct zone *page_zone(const struct page *page) { return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; } static inline pg_data_t *page_pgdat(const struct page *page) { return NODE_DATA(page_to_nid(page)); } static inline pg_data_t *folio_pgdat(const struct folio *folio) { return NODE_DATA(folio_nid(folio)); } static inline struct zone *folio_zone(const struct folio *folio) { return &folio_pgdat(folio)->node_zones[folio_zonenum(folio)]; } #ifdef SECTION_IN_PAGE_FLAGS static inline void set_page_section(struct page *page, unsigned long section) { page->flags.f &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); page->flags.f |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; } static inline unsigned long memdesc_section(memdesc_flags_t mdf) { return (mdf.f >> SECTIONS_PGSHIFT) & SECTIONS_MASK; } #else /* !SECTION_IN_PAGE_FLAGS */ static inline unsigned long memdesc_section(memdesc_flags_t mdf) { return 0; } #endif /* SECTION_IN_PAGE_FLAGS */ /** * folio_pfn - Return the Page Frame Number of a folio. * @folio: The folio. * * A folio may contain multiple pages. The pages have consecutive * Page Frame Numbers. 
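 *
 * E.g. (illustrative), the PFN of the i-th page within a folio is
 * folio_pfn(folio) + i, for any i < folio_nr_pages(folio).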
* * Return: The Page Frame Number of the first page in the folio. */ static inline unsigned long folio_pfn(const struct folio *folio) { return page_to_pfn(&folio->page); } static inline struct folio *pfn_folio(unsigned long pfn) { return page_folio(pfn_to_page(pfn)); } #ifdef CONFIG_MMU static inline pte_t mk_pte(const struct page *page, pgprot_t pgprot) { return pfn_pte(page_to_pfn(page), pgprot); } /** * folio_mk_pte - Create a PTE for this folio * @folio: The folio to create a PTE for * @pgprot: The page protection bits to use * * Create a page table entry for the first page of this folio. * This is suitable for passing to set_ptes(). * * Return: A page table entry suitable for mapping this folio. */ static inline pte_t folio_mk_pte(const struct folio *folio, pgprot_t pgprot) { return pfn_pte(folio_pfn(folio), pgprot); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE /** * folio_mk_pmd - Create a PMD for this folio * @folio: The folio to create a PMD for * @pgprot: The page protection bits to use * * Create a page table entry for the first page of this folio. * This is suitable for passing to set_pmd_at(). * * Return: A page table entry suitable for mapping this folio. */ static inline pmd_t folio_mk_pmd(const struct folio *folio, pgprot_t pgprot) { return pmd_mkhuge(pfn_pmd(folio_pfn(folio), pgprot)); } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD /** * folio_mk_pud - Create a PUD for this folio * @folio: The folio to create a PUD for * @pgprot: The page protection bits to use * * Create a page table entry for the first page of this folio. * This is suitable for passing to set_pud_at(). * * Return: A page table entry suitable for mapping this folio. */ static inline pud_t folio_mk_pud(const struct folio *folio, pgprot_t pgprot) { return pud_mkhuge(pfn_pud(folio_pfn(folio), pgprot)); } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* CONFIG_MMU */ static inline bool folio_has_pincount(const struct folio *folio) { if (IS_ENABLED(CONFIG_64BIT)) return folio_test_large(folio); return folio_order(folio) > 1; } /** * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA. * @folio: The folio. * * This function checks if a folio has been pinned via a call to * a function in the pin_user_pages() family. * * For small folios, the return value is partially fuzzy: false is not fuzzy, * because it means "definitely not pinned for DMA", but true means "probably * pinned for DMA, but possibly a false positive due to having at least * GUP_PIN_COUNTING_BIAS worth of normal folio references". * * False positives are OK, because: a) it's unlikely for a folio to * get that many refcounts, and b) all the callers of this routine are * expected to be able to deal gracefully with a false positive. * * For most large folios, the result will be exactly correct. That's because * we have more tracking data available: the _pincount field is used * instead of the GUP_PIN_COUNTING_BIAS scheme. * * For more information, please see Documentation/core-api/pin_user_pages.rst. * * Return: True, if it is likely that the folio has been "dma-pinned". * False, if the folio is definitely not dma-pinned. */ static inline bool folio_maybe_dma_pinned(struct folio *folio) { if (folio_has_pincount(folio)) return atomic_read(&folio->_pincount) > 0; /* * folio_ref_count() is signed. If that refcount overflows, then * folio_ref_count() returns a negative value, and callers will avoid * further incrementing the refcount. 
* * Here, for that overflow case, use the sign bit to count a little * bit higher via unsigned math, and thus still get an accurate result. */ return ((unsigned int)folio_ref_count(folio)) >= GUP_PIN_COUNTING_BIAS; } /* * This should most likely only be called during fork() to see whether we * should break the cow immediately for an anon page on the src mm. * * The caller has to hold the PT lock and the vma->vm_mm->write_protect_seq. */ static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma, struct folio *folio) { VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1)); if (!mm_flags_test(MMF_HAS_PINNED, vma->vm_mm)) return false; return folio_maybe_dma_pinned(folio); } /** * is_zero_page - Query if a page is a zero page * @page: The page to query * * This returns true if @page is one of the permanent zero pages. */ static inline bool is_zero_page(const struct page *page) { return is_zero_pfn(page_to_pfn(page)); } /** * is_zero_folio - Query if a folio is a zero page * @folio: The folio to query * * This returns true if @folio is one of the permanent zero pages. */ static inline bool is_zero_folio(const struct folio *folio) { return is_zero_page(&folio->page); } /* MIGRATE_CMA and ZONE_MOVABLE do not allow pinning folios */ #ifdef CONFIG_MIGRATION static inline bool folio_is_longterm_pinnable(struct folio *folio) { #ifdef CONFIG_CMA int mt = folio_migratetype(folio); if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE) return false; #endif /* The zero page can be "pinned" but gets special handling. */ if (is_zero_folio(folio)) return true; /* Coherent device memory must always allow eviction. */ if (folio_is_device_coherent(folio)) return false; /* * Filesystems can only tolerate transient delays to truncate and * hole-punch operations */ if (folio_is_fsdax(folio)) return false; /* Otherwise, non-movable zone folios can be pinned. */ return !folio_is_zone_movable(folio); } #else static inline bool folio_is_longterm_pinnable(struct folio *folio) { return true; } #endif static inline void set_page_zone(struct page *page, enum zone_type zone) { page->flags.f &= ~(ZONES_MASK << ZONES_PGSHIFT); page->flags.f |= (zone & ZONES_MASK) << ZONES_PGSHIFT; } static inline void set_page_node(struct page *page, unsigned long node) { page->flags.f &= ~(NODES_MASK << NODES_PGSHIFT); page->flags.f |= (node & NODES_MASK) << NODES_PGSHIFT; } static inline void set_page_links(struct page *page, enum zone_type zone, unsigned long node, unsigned long pfn) { set_page_zone(page, zone); set_page_node(page, node); #ifdef SECTION_IN_PAGE_FLAGS set_page_section(page, pfn_to_section_nr(pfn)); #endif } /** * folio_nr_pages - The number of pages in the folio. * @folio: The folio. * * Return: A positive power of two. */ static inline unsigned long folio_nr_pages(const struct folio *folio) { if (!folio_test_large(folio)) return 1; return folio_large_nr_pages(folio); } /* * compound_nr() returns the number of pages in this potentially compound * page. compound_nr() can be called on a tail page, and is defined to * return 1 in that case. */ static inline unsigned long compound_nr(const struct page *page) { const struct folio *folio = (struct folio *)page; if (!test_bit(PG_head, &folio->flags.f)) return 1; return folio_large_nr_pages(folio); } /** * folio_next - Move to the next physical folio. * @folio: The folio we're currently operating on. * * If you have physically contiguous memory which may span more than * one folio (eg a &struct bio_vec), use this function to move from one * folio to the next.
Do not use it if the memory is only virtually * contiguous as the folios are almost certainly not adjacent to each * other. This is the folio equivalent to writing ``page++``. * * Context: We assume that the folios are refcounted and/or locked at a * higher level and do not adjust the reference counts. * Return: The next struct folio. */ static inline struct folio *folio_next(struct folio *folio) { return (struct folio *)folio_page(folio, folio_nr_pages(folio)); } /** * folio_shift - The size of the memory described by this folio. * @folio: The folio. * * A folio represents a number of bytes which is a power-of-two in size. * This function tells you which power-of-two the folio is. See also * folio_size() and folio_order(). * * Context: The caller should have a reference on the folio to prevent * it from being split. It is not necessary for the folio to be locked. * Return: The base-2 logarithm of the size of this folio. */ static inline unsigned int folio_shift(const struct folio *folio) { return PAGE_SHIFT + folio_order(folio); } /** * folio_size - The number of bytes in a folio. * @folio: The folio. * * Context: The caller should have a reference on the folio to prevent * it from being split. It is not necessary for the folio to be locked. * Return: The number of bytes in this folio. */ static inline size_t folio_size(const struct folio *folio) { return PAGE_SIZE << folio_order(folio); } /** * folio_maybe_mapped_shared - Whether the folio is mapped into the page * tables of more than one MM * @folio: The folio. * * This function checks if the folio maybe currently mapped into more than one * MM ("maybe mapped shared"), or if the folio is certainly mapped into a single * MM ("mapped exclusively"). * * For KSM folios, this function also returns "mapped shared" when a folio is * mapped multiple times into the same MM, because the individual page mappings * are independent. * * For small anonymous folios and anonymous hugetlb folios, the return * value will be exactly correct: non-KSM folios can only be mapped at most once * into an MM, and they cannot be partially mapped. KSM folios are * considered shared even if mapped multiple times into the same MM. * * For other folios, the result can be fuzzy: * #. For partially-mappable large folios (THP), the return value can wrongly * indicate "mapped shared" (false positive) if a folio was mapped by * more than two MMs at one point in time. * #. For pagecache folios (including hugetlb), the return value can wrongly * indicate "mapped shared" (false positive) when two VMAs in the same MM * cover the same file range. * * Further, this function only considers current page table mappings that * are tracked using the folio mapcount(s). * * This function does not consider: * #. If the folio might get mapped in the (near) future (e.g., swapcache, * pagecache, temporary unmapping for migration). * #. If the folio is mapped differently (VM_PFNMAP). * #. If hugetlb page table sharing applies. Callers might want to check * hugetlb_pmd_shared(). * * Return: Whether the folio is estimated to be mapped into more than one MM. */ static inline bool folio_maybe_mapped_shared(struct folio *folio) { int mapcount = folio_mapcount(folio); /* Only partially-mappable folios require more care. */ if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio))) return mapcount > 1; /* * vm_insert_page() without CONFIG_TRANSPARENT_HUGEPAGE ... * simply assume "mapped shared", nobody should really care * about this for arbitrary kernel allocations. 
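	 *
	 * Returning true below is therefore the conservative answer: as the
	 * kernel-doc above notes, "maybe mapped shared" may be a false
	 * positive, and callers must already tolerate that.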
*/ if (!IS_ENABLED(CONFIG_MM_ID)) return true; /* * A single mapping implies "mapped exclusively", even if the * folio flag says something different: it's easier to handle this * case here instead of on the RMAP hot path. */ if (mapcount <= 1) return false; return test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids); } /** * folio_expected_ref_count - calculate the expected folio refcount * @folio: the folio * * Calculate the expected folio refcount, taking references from the pagecache, * swapcache, PG_private and page table mappings into account. Useful in * combination with folio_ref_count() to detect unexpected references (e.g., * GUP or other temporary references). * * Does currently not consider references from the LRU cache. If the folio * was isolated from the LRU (which is the case during migration or split), * the LRU cache does not apply. * * Calling this function on an unmapped folio -- !folio_mapped() -- that is * locked will return a stable result. * * Calling this function on a mapped folio will not result in a stable result, * because nothing stops additional page table mappings from coming (e.g., * fork()) or going (e.g., munmap()). * * Calling this function without the folio lock will also not result in a * stable result: for example, the folio might get dropped from the swapcache * concurrently. * * However, even when called without the folio lock or on a mapped folio, * this function can be used to detect unexpected references early (for example, * if it makes sense to even lock the folio and unmap it). * * The caller must add any reference (e.g., from folio_try_get()) it might be * holding itself to the result. * * Returns: the expected folio refcount. */ static inline int folio_expected_ref_count(const struct folio *folio) { const int order = folio_order(folio); int ref_count = 0; if (WARN_ON_ONCE(page_has_type(&folio->page) && !folio_test_hugetlb(folio))) return 0; /* One reference per page from the swapcache. */ ref_count += folio_test_swapcache(folio) << order; if (!folio_test_anon(folio)) { /* One reference per page from the pagecache. */ ref_count += !!folio->mapping << order; /* One reference from PG_private. */ ref_count += folio_test_private(folio); } /* One reference per page table mapping. 
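	 *
	 * Worked example (illustrative only): an unmapped order-2 pagecache
	 * folio with PG_private set is expected to have (1 << 2) + 1 = 5
	 * references; a caller holding its own folio_try_get() reference
	 * must add 1 on top of the value returned here.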
*/ return ref_count + folio_mapcount(folio); } #ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE static inline int arch_make_folio_accessible(struct folio *folio) { return 0; } #endif /* * Some inline functions in vmstat.h depend on page_zone() */ #include <linux/vmstat.h> #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) #define HASHED_PAGE_VIRTUAL #endif #if defined(WANT_PAGE_VIRTUAL) static inline void *page_address(const struct page *page) { return page->virtual; } static inline void set_page_address(struct page *page, void *address) { page->virtual = address; } #define page_address_init() do { } while(0) #endif #if defined(HASHED_PAGE_VIRTUAL) void *page_address(const struct page *page); void set_page_address(struct page *page, void *virtual); void page_address_init(void); #endif static __always_inline void *lowmem_page_address(const struct page *page) { return page_to_virt(page); } #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) #define page_address(page) lowmem_page_address(page) #define set_page_address(page, address) do { } while(0) #define page_address_init() do { } while(0) #endif static inline void *folio_address(const struct folio *folio) { return page_address(&folio->page); } /* * Return true only if the page has been allocated with * ALLOC_NO_WATERMARKS and the low watermark was not * met implying that the system is under some pressure. */ static inline bool page_is_pfmemalloc(const struct page *page) { /* * lru.next has bit 1 set if the page is allocated from the * pfmemalloc reserves. Callers may simply overwrite it if * they do not need to preserve that information. */ return (uintptr_t)page->lru.next & BIT(1); } /* * Return true only if the folio has been allocated with * ALLOC_NO_WATERMARKS and the low watermark was not * met implying that the system is under some pressure. */ static inline bool folio_is_pfmemalloc(const struct folio *folio) { /* * lru.next has bit 1 set if the page is allocated from the * pfmemalloc reserves. Callers may simply overwrite it if * they do not need to preserve that information. */ return (uintptr_t)folio->lru.next & BIT(1); } /* * Only to be called by the page allocator on a freshly allocated * page. */ static inline void set_page_pfmemalloc(struct page *page) { page->lru.next = (void *)BIT(1); } static inline void clear_page_pfmemalloc(struct page *page) { page->lru.next = NULL; } /* * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. */ extern void pagefault_out_of_memory(void); #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) #define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1)) /* * Parameter block passed down to zap_pte_range in exceptional cases. */ struct zap_details { struct folio *single_folio; /* Locked folio to be unmapped */ bool skip_cows; /* Do not zap COWed private pages */ bool reclaim_pt; /* Need reclaim page tables? */ bool reaping; /* Reaping, do not block. */ zap_flags_t zap_flags; /* Extra flags for zapping */ }; /* * Whether to drop the pte markers, for example, the uffd-wp information for * file-backed memory. This should only be specified when we will completely * drop the page in the mm, either by truncation or unmapping of the vma. By * default, the flag is not set. */ #define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0)) /* Set in unmap_vmas() to indicate a final unmap call. 
Only used by hugetlb */ #define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1)) #ifdef CONFIG_MMU extern bool can_do_mlock(void); #else static inline bool can_do_mlock(void) { return false; } #endif extern int user_shm_lock(size_t, struct ucounts *); extern void user_shm_unlock(size_t, struct ucounts *); struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, pte_t pte); struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte); struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); struct page *vm_normal_page_pud(struct vm_area_struct *vma, unsigned long addr, pud_t pud); void zap_special_vma_range(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_vma_range(struct vm_area_struct *vma, unsigned long address, unsigned long size); /** * zap_vma - zap all page table entries in a vma * @vma: The vma to zap. */ static inline void zap_vma(struct vm_area_struct *vma) { zap_vma_range(vma, vma->vm_start, vma->vm_end - vma->vm_start); } struct mmu_notifier_range; void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); struct follow_pfnmap_args { /** * Inputs: * @vma: Pointer to @vm_area_struct struct * @address: the virtual address to walk */ struct vm_area_struct *vma; unsigned long address; /** * Internals: * * The caller shouldn't touch any of these. */ spinlock_t *lock; pte_t *ptep; /** * Outputs: * * @pfn: the PFN of the address * @addr_mask: address mask covering pfn * @pgprot: the pgprot_t of the mapping * @writable: whether the mapping is writable * @special: whether the mapping is a special mapping (real PFN maps) */ unsigned long pfn; unsigned long addr_mask; pgprot_t pgprot; bool writable; bool special; }; int follow_pfnmap_start(struct follow_pfnmap_args *args); void follow_pfnmap_end(struct follow_pfnmap_args *args); extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); int generic_error_remove_folio(struct address_space *mapping, struct folio *folio); struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, unsigned long address, struct pt_regs *regs); #ifdef CONFIG_MMU extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct pt_regs *regs); extern int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); #else static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct pt_regs *regs) { /* should never happen if there's no MMU */ BUG(); return VM_FAULT_SIGBUS; } static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked) { /* should never happen if 
there's no MMU */ BUG(); return -EFAULT; } static inline void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows) { } static inline void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { } #endif static inline void unmap_shared_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen) { unmap_mapping_range(mapping, holebegin, holelen, 0); } static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); #ifdef CONFIG_BPF_SYSCALL extern int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); #endif long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); /* * Retrieves a single page alongside its VMA. Does not support FOLL_NOWAIT. */ static inline struct page *get_user_page_vma_remote(struct mm_struct *mm, unsigned long addr, int gup_flags, struct vm_area_struct **vmap) { struct page *page; struct vm_area_struct *vma; int got; if (WARN_ON_ONCE(unlikely(gup_flags & FOLL_NOWAIT))) return ERR_PTR(-EINVAL); got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL); if (got < 0) return ERR_PTR(got); vma = vma_lookup(mm, addr); if (WARN_ON_ONCE(!vma)) { put_page(page); return ERR_PTR(-EINVAL); } *vmap = vma; return page; } long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages); long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, struct folio **folios, unsigned int max_folios, pgoff_t *offset); int folio_add_pins(struct folio *folio, unsigned int pins); int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); void folio_add_pin(struct folio *folio); int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc); int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, const struct task_struct *task, bool bypass_rlim); struct kvec; struct page *get_dump_page(unsigned long addr, int *locked); bool folio_mark_dirty(struct folio *folio); bool folio_mark_dirty_lock(struct folio *folio); bool set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); int get_cmdline(struct task_struct *task, char *buffer, int buflen); /* * Flags used by change_protection(). For now we make it a bitmap so * that we can pass in multiple flags just like parameters. However * for now all the callers are only use one of the flags at the same * time. 
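 *
 * For instance (illustrative): NUMA hinting protection updates use
 * MM_CP_PROT_NUMA, while userfaultfd write-protection uses MM_CP_UFFD_WP
 * or MM_CP_UFFD_WP_RESOLVE.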
*/ /* * Whether we should manually check if we can map individual PTEs writable, * because something (e.g., COW, uffd-wp) blocks that from happening for all * PTEs automatically in a writable mapping. */ #define MM_CP_TRY_CHANGE_WRITABLE (1UL << 0) /* Whether this protection change is for NUMA hints */ #define MM_CP_PROT_NUMA (1UL << 1) /* Whether this change is for write protecting */ #define MM_CP_UFFD_WP (1UL << 2) /* do wp */ #define MM_CP_UFFD_WP_RESOLVE (1UL << 3) /* Resolve wp */ #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ MM_CP_UFFD_WP_RESOLVE) bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, pte_t pte); extern long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long cp_flags); extern int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, vm_flags_t newflags); /* * doesn't attempt to fault and will return short. */ int get_user_pages_fast_only(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); static inline bool get_user_page_fast_only(unsigned long addr, unsigned int gup_flags, struct page **pagep) { return get_user_pages_fast_only(addr, 1, gup_flags, pagep) == 1; } /* * per-process(per-mm_struct) statistics. */ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) { return percpu_counter_read_positive(&mm->rss_stat[member]); } static inline unsigned long get_mm_counter_sum(struct mm_struct *mm, int member) { return percpu_counter_sum_positive(&mm->rss_stat[member]); } void mm_trace_rss_stat(struct mm_struct *mm, int member); static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { percpu_counter_add(&mm->rss_stat[member], value); mm_trace_rss_stat(mm, member); } static inline void inc_mm_counter(struct mm_struct *mm, int member) { percpu_counter_inc(&mm->rss_stat[member]); mm_trace_rss_stat(mm, member); } static inline void dec_mm_counter(struct mm_struct *mm, int member) { percpu_counter_dec(&mm->rss_stat[member]); mm_trace_rss_stat(mm, member); } /* Optimized variant when folio is already known not to be anon */ static inline int mm_counter_file(struct folio *folio) { if (folio_test_swapbacked(folio)) return MM_SHMEMPAGES; return MM_FILEPAGES; } static inline int mm_counter(struct folio *folio) { if (folio_test_anon(folio)) return MM_ANONPAGES; return mm_counter_file(folio); } static inline unsigned long get_mm_rss(struct mm_struct *mm) { return get_mm_counter(mm, MM_FILEPAGES) + get_mm_counter(mm, MM_ANONPAGES) + get_mm_counter(mm, MM_SHMEMPAGES); } static inline unsigned long get_mm_rss_sum(struct mm_struct *mm) { return get_mm_counter_sum(mm, MM_FILEPAGES) + get_mm_counter_sum(mm, MM_ANONPAGES) + get_mm_counter_sum(mm, MM_SHMEMPAGES); } static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) { return max(mm->hiwater_rss, get_mm_rss(mm)); } static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm) { return max(mm->hiwater_vm, mm->total_vm); } static inline void update_hiwater_rss(struct mm_struct *mm) { unsigned long _rss = get_mm_rss(mm); if (data_race(mm->hiwater_rss) < _rss) data_race(mm->hiwater_rss = _rss); } static inline void update_hiwater_vm(struct mm_struct *mm) { if (mm->hiwater_vm < mm->total_vm) mm->hiwater_vm = mm->total_vm; } static inline void reset_mm_hiwater_rss(struct mm_struct *mm) { mm->hiwater_rss = get_mm_rss(mm); } static inline void 
setmax_mm_hiwater_rss(unsigned long *maxrss, struct mm_struct *mm) { unsigned long hiwater_rss = get_mm_hiwater_rss(mm); if (*maxrss < hiwater_rss) *maxrss = hiwater_rss; } #ifndef CONFIG_ARCH_HAS_PTE_SPECIAL static inline int pte_special(pte_t pte) { return 0; } static inline pte_t pte_mkspecial(pte_t pte) { return pte; } #endif #ifndef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP static inline bool pmd_special(pmd_t pmd) { return false; } static inline pmd_t pmd_mkspecial(pmd_t pmd) { return pmd; } #endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */ #ifndef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP static inline bool pud_special(pud_t pud) { return false; } static inline pud_t pud_mkspecial(pud_t pud) { return pud; } #endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */ extern pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl); #ifdef __PAGETABLE_P4D_FOLDED static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { return 0; } #else int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); #endif #if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU) static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { return 0; } static inline void mm_inc_nr_puds(struct mm_struct *mm) {} static inline void mm_dec_nr_puds(struct mm_struct *mm) {} #else int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); static inline void mm_inc_nr_puds(struct mm_struct *mm) { if (mm_pud_folded(mm)) return; atomic_long_add(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_puds(struct mm_struct *mm) { if (mm_pud_folded(mm)) return; atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); } #endif #if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU) static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { return 0; } static inline void mm_inc_nr_pmds(struct mm_struct *mm) {} static inline void mm_dec_nr_pmds(struct mm_struct *mm) {} #else int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); static inline void mm_inc_nr_pmds(struct mm_struct *mm) { if (mm_pmd_folded(mm)) return; atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_pmds(struct mm_struct *mm) { if (mm_pmd_folded(mm)) return; atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); } #endif #ifdef CONFIG_MMU static inline void mm_pgtables_bytes_init(struct mm_struct *mm) { atomic_long_set(&mm->pgtables_bytes, 0); } static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) { return atomic_long_read(&mm->pgtables_bytes); } static inline void mm_inc_nr_ptes(struct mm_struct *mm) { atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_ptes(struct mm_struct *mm) { atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); } #else static inline void mm_pgtables_bytes_init(struct mm_struct *mm) {} static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) { return 0; } static inline void mm_inc_nr_ptes(struct mm_struct *mm) {} static inline void mm_dec_nr_ptes(struct mm_struct *mm) {} #endif int __pte_alloc(struct mm_struct *mm, pmd_t *pmd); int __pte_alloc_kernel(pmd_t *pmd); #if defined(CONFIG_MMU) static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { return (unlikely(pgd_none(*pgd)) && __p4d_alloc(mm, pgd, address)) ? 
NULL : p4d_offset(pgd, address); } static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { return (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address)) ? NULL : pud_offset(p4d, address); } static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))? NULL: pmd_offset(pud, address); } #endif /* CONFIG_MMU */ enum pt_flags { PT_kernel = PG_referenced, PT_reserved = PG_reserved, /* High bits are used for zone/node/section */ }; static inline struct ptdesc *virt_to_ptdesc(const void *x) { return page_ptdesc(virt_to_page(x)); } /** * ptdesc_address - Virtual address of page table. * @pt: Page table descriptor. * * Return: The first byte of the page table described by @pt. */ static inline void *ptdesc_address(const struct ptdesc *pt) { return folio_address(ptdesc_folio(pt)); } static inline bool pagetable_is_reserved(struct ptdesc *pt) { return test_bit(PT_reserved, &pt->pt_flags.f); } /** * ptdesc_set_kernel - Mark a ptdesc used to map the kernel * @ptdesc: The ptdesc to be marked * * Kernel page tables often need special handling. Set a flag so that * the handling code knows this ptdesc will not be used for userspace. */ static inline void ptdesc_set_kernel(struct ptdesc *ptdesc) { set_bit(PT_kernel, &ptdesc->pt_flags.f); } /** * ptdesc_clear_kernel - Mark a ptdesc as no longer used to map the kernel * @ptdesc: The ptdesc to be unmarked * * Use when the ptdesc is no longer used to map the kernel and no longer * needs special handling. */ static inline void ptdesc_clear_kernel(struct ptdesc *ptdesc) { /* * Note: the 'PG_referenced' bit does not strictly need to be * cleared before freeing the page. But this is nice for * symmetry. */ clear_bit(PT_kernel, &ptdesc->pt_flags.f); } /** * ptdesc_test_kernel - Check if a ptdesc is used to map the kernel * @ptdesc: The ptdesc being tested * * Call to tell if the ptdesc used to map the kernel. */ static inline bool ptdesc_test_kernel(const struct ptdesc *ptdesc) { return test_bit(PT_kernel, &ptdesc->pt_flags.f); } /** * pagetable_alloc - Allocate pagetables * @gfp: GFP flags * @order: desired pagetable order * * pagetable_alloc allocates memory for page tables as well as a page table * descriptor to describe that memory. * * Return: The ptdesc describing the allocated page tables. */ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order) { struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order); return page_ptdesc(page); } #define pagetable_alloc(...) alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__)) static inline void __pagetable_free(struct ptdesc *pt) { struct page *page = ptdesc_page(pt); __free_pages(page, compound_order(page)); } #ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE void pagetable_free_kernel(struct ptdesc *pt); #else static inline void pagetable_free_kernel(struct ptdesc *pt) { __pagetable_free(pt); } #endif /** * pagetable_free - Free pagetables * @pt: The page table descriptor * * pagetable_free frees the memory of all page tables described by a page * table descriptor and the memory for the descriptor itself. 
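 *
 * Minimal pairing sketch (illustrative only; real users also run the
 * appropriate pagetable_*_ctor()/pagetable_dtor() helpers around the
 * lifetime of the page table):
 *
 *	struct ptdesc *pt = pagetable_alloc(GFP_KERNEL | __GFP_ZERO, 0);
 *
 *	if (pt)
 *		pagetable_free(pt);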
*/ static inline void pagetable_free(struct ptdesc *pt) { if (ptdesc_test_kernel(pt)) { ptdesc_clear_kernel(pt); pagetable_free_kernel(pt); } else { __pagetable_free(pt); } } #if defined(CONFIG_SPLIT_PTE_PTLOCKS) #if ALLOC_SPLIT_PTLOCKS void __init ptlock_cache_init(void); bool ptlock_alloc(struct ptdesc *ptdesc); void ptlock_free(struct ptdesc *ptdesc); static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) { return ptdesc->ptl; } #else /* ALLOC_SPLIT_PTLOCKS */ static inline void ptlock_cache_init(void) { } static inline bool ptlock_alloc(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) { } static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) { return &ptdesc->ptl; } #endif /* ALLOC_SPLIT_PTLOCKS */ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) { return ptlock_ptr(page_ptdesc(pmd_page(*pmd))); } static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) { BUILD_BUG_ON(IS_ENABLED(CONFIG_HIGHPTE)); BUILD_BUG_ON(MAX_PTRS_PER_PTE * sizeof(pte_t) > PAGE_SIZE); return ptlock_ptr(virt_to_ptdesc(pte)); } static inline bool ptlock_init(struct ptdesc *ptdesc) { /* * prep_new_page() initialize page->private (and therefore page->ptl) * with 0. Make sure nobody took it in use in between. * * It can happen if arch try to use slab for page table allocation: * slab code uses page->slab_cache, which share storage with page->ptl. */ VM_BUG_ON_PAGE(*(unsigned long *)&ptdesc->ptl, ptdesc_page(ptdesc)); if (!ptlock_alloc(ptdesc)) return false; spin_lock_init(ptlock_ptr(ptdesc)); return true; } #else /* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */ /* * We use mm->page_table_lock to guard all pagetable pages of the mm. */ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) { return &mm->page_table_lock; } static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) { return &mm->page_table_lock; } static inline void ptlock_cache_init(void) {} static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ static inline void __pagetable_ctor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); __folio_set_pgtable(folio); lruvec_stat_add_folio(folio, NR_PAGETABLE); } static inline void pagetable_dtor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); ptlock_free(ptdesc); __folio_clear_pgtable(folio); lruvec_stat_sub_folio(folio, NR_PAGETABLE); } static inline void pagetable_dtor_free(struct ptdesc *ptdesc) { pagetable_dtor(ptdesc); pagetable_free(ptdesc); } static inline bool pagetable_pte_ctor(struct mm_struct *mm, struct ptdesc *ptdesc) { if (mm != &init_mm && !ptlock_init(ptdesc)) return false; __pagetable_ctor(ptdesc); return true; } pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr) { return __pte_offset_map(pmd, addr, NULL); } pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp); pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp); pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp, spinlock_t **ptlp); #define pte_unmap_unlock(pte, ptl) do { \ spin_unlock(ptl); \ pte_unmap(pte); \ } while (0) #define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd)) #define pte_alloc_map(mm, pmd, 
address) \ (pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address)) #define pte_alloc_map_lock(mm, pmd, address, ptlp) \ (pte_alloc(mm, pmd) ? \ NULL : pte_offset_map_lock(mm, pmd, address, ptlp)) #define pte_alloc_kernel(pmd, address) \ ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \ NULL: pte_offset_kernel(pmd, address)) #if defined(CONFIG_SPLIT_PMD_PTLOCKS) static inline struct page *pmd_pgtable_page(pmd_t *pmd) { unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); return virt_to_page((void *)((unsigned long) pmd & mask)); } static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd) { return page_ptdesc(pmd_pgtable_page(pmd)); } static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { return ptlock_ptr(pmd_ptdesc(pmd)); } static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE ptdesc->pmd_huge_pte = NULL; #endif return ptlock_init(ptdesc); } #define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte) #else static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { return &mm->page_table_lock; } static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; } #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) #endif static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) { spinlock_t *ptl = pmd_lockptr(mm, pmd); spin_lock(ptl); return ptl; } static inline bool pagetable_pmd_ctor(struct mm_struct *mm, struct ptdesc *ptdesc) { if (mm != &init_mm && !pmd_ptlock_init(ptdesc)) return false; ptdesc_pmd_pts_init(ptdesc); __pagetable_ctor(ptdesc); return true; } /* * No scalability reason to split PUD locks yet, but follow the same pattern * as the PMD locks to make it easier if we decide to. The VM should not be * considered ready to switch to split PUD locks yet; there may be places * which need to be converted from page_table_lock. */ static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud) { return &mm->page_table_lock; } static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) { spinlock_t *ptl = pud_lockptr(mm, pud); spin_lock(ptl); return ptl; } static inline void pagetable_pud_ctor(struct ptdesc *ptdesc) { __pagetable_ctor(ptdesc); } static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc) { __pagetable_ctor(ptdesc); } static inline void pagetable_pgd_ctor(struct ptdesc *ptdesc) { __pagetable_ctor(ptdesc); } extern void __init pagecache_init(void); extern void free_initmem(void); /* * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK) * into the buddy system. The freed pages will be poisoned with pattern * "poison" if it's within range [0, UCHAR_MAX]. * Return pages freed into the buddy system. */ extern unsigned long free_reserved_area(void *start, void *end, int poison, const char *s); extern void adjust_managed_page_count(struct page *page, long count); /* Free the reserved page into the buddy system, so it gets managed. */ void free_reserved_page(struct page *page); static inline void mark_page_reserved(struct page *page) { SetPageReserved(page); adjust_managed_page_count(page, -1); } static inline void free_reserved_ptdesc(struct ptdesc *pt) { free_reserved_page(ptdesc_page(pt)); } /* * Default method to free all the __init memory into the buddy system. * The freed pages will be poisoned with pattern "poison" if it's within * range [0, UCHAR_MAX]. * Return pages freed into the buddy system. 
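 *
 * Typically called from an architecture's free_initmem(), for example as
 * free_initmem_default(POISON_FREE_INITMEM); passing a negative poison
 * value skips the poisoning step. (Illustrative note.)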
*/ static inline unsigned long free_initmem_default(int poison) { extern char __init_begin[], __init_end[]; return free_reserved_area(&__init_begin, &__init_end, poison, "unused kernel image (initmem)"); } static inline unsigned long get_num_physpages(void) { int nid; unsigned long phys_pages = 0; for_each_online_node(nid) phys_pages += node_present_pages(nid); return phys_pages; } /* * FIXME: Using memblock node mappings, an architecture may initialise its * zones, allocate the backing mem_map and account for memory holes in an * architecture independent manner. * * An architecture is expected to register range of page frames backed by * physical memory with memblock_add[_node]() before calling * free_area_init() passing in the PFN each zone ends at. At a basic * usage, an architecture is expected to do something like * * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn, * max_highmem_pfn}; * for_each_valid_physical_page_range() * memblock_add_node(base, size, nid, MEMBLOCK_NONE) * free_area_init(max_zone_pfns); */ void arch_zone_limits_init(unsigned long *max_zone_pfn); unsigned long node_map_pfn_alignment(void); extern unsigned long absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn); extern void get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn); #ifndef CONFIG_NUMA static inline int early_pfn_to_nid(unsigned long pfn) { return 0; } #else /* please see mm/page_alloc.c */ extern int __meminit early_pfn_to_nid(unsigned long pfn); #endif extern void mem_init(void); extern void __init mmap_init(void); extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx); static inline void show_mem(void) { __show_mem(0, NULL, MAX_NR_ZONES - 1); } extern long si_mem_available(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); extern __printf(3, 4) void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...); extern void setup_per_cpu_pageset(void); /* nommu.c */ extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); /* interval_tree.c */ void vma_interval_tree_insert(struct vm_area_struct *node, struct rb_root_cached *root); void vma_interval_tree_insert_after(struct vm_area_struct *node, struct vm_area_struct *prev, struct rb_root_cached *root); void vma_interval_tree_remove(struct vm_area_struct *node, struct rb_root_cached *root); struct vm_area_struct *vma_interval_tree_subtree_search(struct vm_area_struct *node, unsigned long start, unsigned long last); struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root, unsigned long start, unsigned long last); struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, unsigned long start, unsigned long last); #define vma_interval_tree_foreach(vma, root, start, last) \ for (vma = vma_interval_tree_iter_first(root, start, last); \ vma; vma = vma_interval_tree_iter_next(vma, start, last)) void anon_vma_interval_tree_insert(struct anon_vma_chain *node, struct rb_root_cached *root); void anon_vma_interval_tree_remove(struct anon_vma_chain *node, struct rb_root_cached *root); struct anon_vma_chain * anon_vma_interval_tree_iter_first(struct rb_root_cached *root, unsigned long start, unsigned long last); struct anon_vma_chain *anon_vma_interval_tree_iter_next( struct anon_vma_chain *node, unsigned long start, unsigned long last); #ifdef CONFIG_DEBUG_VM_RB void 
anon_vma_interval_tree_verify(struct anon_vma_chain *node); #endif #define anon_vma_interval_tree_foreach(avc, root, start, last) \ for (avc = anon_vma_interval_tree_iter_first(root, start, last); \ avc; avc = anon_vma_interval_tree_iter_next(avc, start, last)) /* mmap.c */ extern int __vm_enough_memory(const struct mm_struct *mm, long pages, int cap_sys_admin); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void exit_mmap(struct mm_struct *); bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, bool write); static inline int check_data_rlimit(unsigned long rlim, unsigned long new, unsigned long start, unsigned long end_data, unsigned long start_data) { if (rlim < RLIM_INFINITY) { if (((new - start) + (end_data - start_data)) > rlim) return -ENOSPC; } return 0; } extern int mm_take_all_locks(struct mm_struct *mm); extern void mm_drop_all_locks(struct mm_struct *mm); extern int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); extern struct file *get_task_exe_file(struct task_struct *task); extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages); extern bool vma_is_special_mapping(const struct vm_area_struct *vma, const struct vm_special_mapping *sm); struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, vm_flags_t vm_flags, const struct vm_special_mapping *spec); unsigned long randomize_stack_top(unsigned long stack_top); unsigned long randomize_page(unsigned long start, unsigned long range); unsigned long __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); static inline unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { return __get_unmapped_area(file, addr, len, pgoff, flags, 0); } extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf); extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock); int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf, bool unlock); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); #ifdef CONFIG_MMU extern int __mm_populate(unsigned long addr, unsigned long len, int ignore_errors); static inline void mm_populate(unsigned long addr, unsigned long len) { /* Ignore errors */ (void) __mm_populate(addr, len, 1); } #else static inline void mm_populate(unsigned long addr, unsigned long len) {} #endif /* This takes the mm semaphore itself */ int __must_check vm_brk_flags(unsigned long addr, unsigned long request, bool is_exec); int vm_munmap(unsigned long start, size_t len); unsigned long __must_check vm_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long offset); unsigned long __must_check vm_mmap_shadow_stack(unsigned long addr, unsigned long len, 
unsigned long flags); struct vm_unmapped_area_info { #define VM_UNMAPPED_AREA_TOPDOWN 1 unsigned long flags; unsigned long length; unsigned long low_limit; unsigned long high_limit; unsigned long align_mask; unsigned long align_offset; unsigned long start_gap; }; extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info); /* truncate.c */ void truncate_inode_pages(struct address_space *mapping, loff_t lstart); void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart, loff_t lend); void truncate_inode_pages_final(struct address_space *mapping); /* generic vm_area_ops exported for stackable file systems */ extern vm_fault_t filemap_fault(struct vm_fault *vmf); extern vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); extern unsigned long stack_guard_gap; /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ int expand_stack_locked(struct vm_area_struct *vma, unsigned long address); struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, struct vm_area_struct **pprev); /* * Look up the first VMA which intersects the interval [start_addr, end_addr) * NULL if none. Assume start_addr < end_addr. */ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, unsigned long start_addr, unsigned long end_addr); /** * vma_lookup() - Find a VMA at a specific address * @mm: The process address space. * @addr: The user address. * * Return: The vm_area_struct at the given address, %NULL otherwise. */ static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) { return mtree_load(&mm->mm_mt, addr); } static inline unsigned long stack_guard_start_gap(const struct vm_area_struct *vma) { if (vma->vm_flags & VM_GROWSDOWN) return stack_guard_gap; /* See reasoning around the VM_SHADOW_STACK definition */ if (vma->vm_flags & VM_SHADOW_STACK) return PAGE_SIZE; return 0; } static inline unsigned long vm_start_gap(const struct vm_area_struct *vma) { unsigned long gap = stack_guard_start_gap(vma); unsigned long vm_start = vma->vm_start; vm_start -= gap; if (vm_start > vma->vm_start) vm_start = 0; return vm_start; } static inline unsigned long vm_end_gap(const struct vm_area_struct *vma) { unsigned long vm_end = vma->vm_end; if (vma->vm_flags & VM_GROWSUP) { vm_end += stack_guard_gap; if (vm_end < vma->vm_end) vm_end = -PAGE_SIZE; } return vm_end; } static inline unsigned long vma_pages(const struct vm_area_struct *vma) { return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } static inline unsigned long vma_last_pgoff(struct vm_area_struct *vma) { return vma->vm_pgoff + vma_pages(vma) - 1; } static inline unsigned long vma_desc_size(const struct vm_area_desc *desc) { return desc->end - desc->start; } static inline unsigned long vma_desc_pages(const struct vm_area_desc *desc) { return vma_desc_size(desc) >> PAGE_SHIFT; } /** * mmap_action_remap - helper for mmap_prepare hook to specify that a pure PFN * remap is required. * @desc: The VMA descriptor for the VMA requiring remap. * @start: The virtual address to start the remap from, must be within the VMA. * @start_pfn: The first PFN in the range to remap.
* @size: The size of the range to remap, in bytes, at most spanning to the end * of the VMA. */ static inline void mmap_action_remap(struct vm_area_desc *desc, unsigned long start, unsigned long start_pfn, unsigned long size) { struct mmap_action *action = &desc->action; /* [start, start + size) must be within the VMA. */ WARN_ON_ONCE(start < desc->start || start >= desc->end); WARN_ON_ONCE(start + size > desc->end); action->type = MMAP_REMAP_PFN; action->remap.start = start; action->remap.start_pfn = start_pfn; action->remap.size = size; action->remap.pgprot = desc->page_prot; } /** * mmap_action_remap_full - helper for mmap_prepare hook to specify that the * entirety of a VMA should be PFN remapped. * @desc: The VMA descriptor for the VMA requiring remap. * @start_pfn: The first PFN in the range to remap. */ static inline void mmap_action_remap_full(struct vm_area_desc *desc, unsigned long start_pfn) { mmap_action_remap(desc, desc->start, start_pfn, vma_desc_size(desc)); } /** * mmap_action_ioremap - helper for mmap_prepare hook to specify that a pure PFN * I/O remap is required. * @desc: The VMA descriptor for the VMA requiring remap. * @start: The virtual address to start the remap from, must be within the VMA. * @start_pfn: The first PFN in the range to remap. * @size: The size of the range to remap, in bytes, at most spanning to the end * of the VMA. */ static inline void mmap_action_ioremap(struct vm_area_desc *desc, unsigned long start, unsigned long start_pfn, unsigned long size) { mmap_action_remap(desc, start, start_pfn, size); desc->action.type = MMAP_IO_REMAP_PFN; } /** * mmap_action_ioremap_full - helper for mmap_prepare hook to specify that the * entirety of a VMA should be PFN I/O remapped. * @desc: The VMA descriptor for the VMA requiring remap. * @start_pfn: The first PFN in the range to remap. */ static inline void mmap_action_ioremap_full(struct vm_area_desc *desc, unsigned long start_pfn) { mmap_action_ioremap(desc, desc->start, start_pfn, vma_desc_size(desc)); } /** * mmap_action_simple_ioremap - helper for mmap_prepare hook to specify that the * physical range in [start_phys_addr, start_phys_addr + size) should be I/O * remapped. * @desc: The VMA descriptor for the VMA requiring remap. * @start_phys_addr: Start of the physical memory to be mapped. * @size: Size of the area to map. * * NOTE: Some drivers might want to tweak desc->page_prot for purposes of * write-combine or similar. */ static inline void mmap_action_simple_ioremap(struct vm_area_desc *desc, phys_addr_t start_phys_addr, unsigned long size) { struct mmap_action *action = &desc->action; action->simple_ioremap.start_phys_addr = start_phys_addr; action->simple_ioremap.size = size; action->type = MMAP_SIMPLE_IO_REMAP; } /** * mmap_action_map_kernel_pages - helper for mmap_prepare hook to specify that * @nr_pages kernel pages contained in the @pages array should be mapped to userland * starting at virtual address @start. * @desc: The VMA descriptor for the VMA requiring kernel pages to be mapped. * @start: The virtual address from which to map them. * @pages: An array of struct page pointers describing the memory to map. * @nr_pages: The number of entries in the @pages array.
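 *
 * Hedged usage sketch for a driver's ->mmap_prepare() hook; struct foo_buf,
 * its fields and the use of desc->file below are illustrative assumptions,
 * not part of this header:
 *
 *	static int foo_mmap_prepare(struct vm_area_desc *desc)
 *	{
 *		struct foo_buf *buf = desc->file->private_data;
 *
 *		if (vma_desc_pages(desc) > buf->nr_pages)
 *			return -EINVAL;
 *		mmap_action_map_kernel_pages(desc, desc->start, buf->pages,
 *					     vma_desc_pages(desc));
 *		return 0;
 *	}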
*/ static inline void mmap_action_map_kernel_pages(struct vm_area_desc *desc, unsigned long start, struct page **pages, unsigned long nr_pages) { struct mmap_action *action = &desc->action; action->type = MMAP_MAP_KERNEL_PAGES; action->map_kernel.start = start; action->map_kernel.pages = pages; action->map_kernel.nr_pages = nr_pages; action->map_kernel.pgoff = desc->pgoff; } /** * mmap_action_map_kernel_pages_full - helper for mmap_prepare hook to specify that * kernel pages contained in the @pages array should be mapped to userland * from @desc->start to @desc->end. * @desc: The VMA descriptor for the VMA requiring kernel pages to be mapped. * @pages: An array of struct page pointers describing the memory to map. * * The caller must ensure that @pages contains sufficient entries to cover the * entire range described by @desc. */ static inline void mmap_action_map_kernel_pages_full(struct vm_area_desc *desc, struct page **pages) { mmap_action_map_kernel_pages(desc, desc->start, pages, vma_desc_pages(desc)); } int mmap_action_prepare(struct vm_area_desc *desc); int mmap_action_complete(struct vm_area_struct *vma, struct mmap_action *action); /* Look up the first VMA which exactly matches the interval vm_start ... vm_end */ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end) { struct vm_area_struct *vma = vma_lookup(mm, vm_start); if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end)) vma = NULL; return vma; } /** * range_is_subset - Is the specified inner range a subset of the outer range? * @outer_start: The start of the outer range. * @outer_end: The exclusive end of the outer range. * @inner_start: The start of the inner range. * @inner_end: The exclusive end of the inner range. * * Returns: %true if [inner_start, inner_end) is a subset of [outer_start, * outer_end), otherwise %false. */ static inline bool range_is_subset(unsigned long outer_start, unsigned long outer_end, unsigned long inner_start, unsigned long inner_end) { return outer_start <= inner_start && inner_end <= outer_end; } /** * range_in_vma - is the specified [@start, @end) range a subset of the VMA? * @vma: The VMA against which we want to check [@start, @end). * @start: The start of the range we wish to check. * @end: The exclusive end of the range we wish to check. * * Returns: %true if [@start, @end) is a subset of [@vma->vm_start, * @vma->vm_end), %false otherwise. */ static inline bool range_in_vma(const struct vm_area_struct *vma, unsigned long start, unsigned long end) { if (!vma) return false; return range_is_subset(vma->vm_start, vma->vm_end, start, end); } /** * range_in_vma_desc - is the specified [@start, @end) range a subset of the VMA * described by @desc, a VMA descriptor? * @desc: The VMA descriptor against which we want to check [@start, @end). * @start: The start of the range we wish to check. * @end: The exclusive end of the range we wish to check. * * Returns: %true if [@start, @end) is a subset of [@desc->start, @desc->end), * %false otherwise.
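 *
 * (Illustrative use: an ->mmap_prepare() hook can call this to validate a
 * user-requested sub-range before describing it via mmap_action_remap()
 * and friends.)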
*/ static inline bool range_in_vma_desc(const struct vm_area_desc *desc, unsigned long start, unsigned long end) { if (!desc) return false; return range_is_subset(desc->start, desc->end, start, end); } #ifdef CONFIG_MMU pgprot_t vm_get_page_prot(vm_flags_t vm_flags); static inline pgprot_t vma_get_page_prot(vma_flags_t vma_flags) { const vm_flags_t vm_flags = vma_flags_to_legacy(vma_flags); return vm_get_page_prot(vm_flags); } void vma_set_page_prot(struct vm_area_struct *vma); #else static inline pgprot_t vm_get_page_prot(vm_flags_t vm_flags) { return __pgprot(0); } static inline pgprot_t vma_get_page_prot(vma_flags_t vma_flags) { return __pgprot(0); } static inline void vma_set_page_prot(struct vm_area_struct *vma) { vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); } #endif void vma_set_file(struct vm_area_struct *vma, struct file *file); #ifdef CONFIG_NUMA_BALANCING unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long start, unsigned long end); #endif struct vm_area_struct *find_extend_vma_locked(struct mm_struct *, unsigned long addr); int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t pgprot); int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num); int map_kernel_pages_prepare(struct vm_area_desc *desc); int map_kernel_pages_complete(struct vm_area_struct *vma, struct mmap_action *action); int vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num); int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, unsigned long num); vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page, bool write); vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot); vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) { int err = vm_insert_page(vma, addr, page); if (err == -ENOMEM) return VM_FAULT_OOM; if (err < 0 && err != -EBUSY) return VM_FAULT_SIGBUS; return VM_FAULT_NOPAGE; } #ifndef io_remap_pfn_range_pfn static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn, unsigned long size) { return pfn; } #endif static inline int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long orig_pfn, unsigned long size, pgprot_t orig_prot) { const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size); const pgprot_t prot = pgprot_decrypted(orig_prot); return remap_pfn_range(vma, addr, pfn, size, prot); } static inline vm_fault_t vmf_error(int err) { if (err == -ENOMEM) return VM_FAULT_OOM; else if (err == -EHWPOISON) return VM_FAULT_HWPOISON; return VM_FAULT_SIGBUS; } /* * Convert errno to return value for ->page_mkwrite() calls. * * This should eventually be merged with vmf_error() above, but will need a * careful audit of all vmf_error() callers. 
*/ static inline vm_fault_t vmf_fs_error(int err) { if (err == 0) return VM_FAULT_LOCKED; if (err == -EFAULT || err == -EAGAIN) return VM_FAULT_NOPAGE; if (err == -ENOMEM) return VM_FAULT_OOM; /* -ENOSPC, -EDQUOT, -EIO ... */ return VM_FAULT_SIGBUS; } static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) { if (vm_fault & VM_FAULT_OOM) return -ENOMEM; if (vm_fault & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) return (foll_flags & FOLL_HWPOISON) ? -EHWPOISON : -EFAULT; if (vm_fault & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) return -EFAULT; return 0; } /* * Indicates whether GUP can follow a PROT_NONE mapped page, or whether * a (NUMA hinting) fault is required. */ static inline bool gup_can_follow_protnone(const struct vm_area_struct *vma, unsigned int flags) { /* * If callers don't want to honor NUMA hinting faults, no need to * determine if we would actually have to trigger a NUMA hinting fault. */ if (!(flags & FOLL_HONOR_NUMA_FAULT)) return true; /* * NUMA hinting faults don't apply in inaccessible (PROT_NONE) VMAs. * * Requiring a fault here even for inaccessible VMAs would mean that * FOLL_FORCE cannot make any progress, because handle_mm_fault() * refuses to process NUMA hinting faults in inaccessible VMAs. */ return !vma_is_accessible(vma); } typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); extern int apply_to_existing_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); #ifdef CONFIG_PAGE_POISONING extern void __kernel_poison_pages(struct page *page, int numpages); extern void __kernel_unpoison_pages(struct page *page, int numpages); extern bool _page_poisoning_enabled_early; DECLARE_STATIC_KEY_FALSE(_page_poisoning_enabled); static inline bool page_poisoning_enabled(void) { return _page_poisoning_enabled_early; } /* * For use in fast paths after init_mem_debugging() has run, or when a * false negative result is not harmful when called too early. 
*/ static inline bool page_poisoning_enabled_static(void) { return static_branch_unlikely(&_page_poisoning_enabled); } static inline void kernel_poison_pages(struct page *page, int numpages) { if (page_poisoning_enabled_static()) __kernel_poison_pages(page, numpages); } static inline void kernel_unpoison_pages(struct page *page, int numpages) { if (page_poisoning_enabled_static()) __kernel_unpoison_pages(page, numpages); } #else static inline bool page_poisoning_enabled(void) { return false; } static inline bool page_poisoning_enabled_static(void) { return false; } static inline void __kernel_poison_pages(struct page *page, int nunmpages) { } static inline void kernel_poison_pages(struct page *page, int numpages) { } static inline void kernel_unpoison_pages(struct page *page, int numpages) { } #endif DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); static inline bool want_init_on_alloc(gfp_t flags) { if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, &init_on_alloc)) return true; return flags & __GFP_ZERO; } DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); static inline bool want_init_on_free(void) { return static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON, &init_on_free); } extern bool _debug_pagealloc_enabled_early; DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); static inline bool debug_pagealloc_enabled(void) { return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && _debug_pagealloc_enabled_early; } /* * For use in fast paths after mem_debugging_and_hardening_init() has run, * or when a false negative result is not harmful when called too early. */ static inline bool debug_pagealloc_enabled_static(void) { if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) return false; return static_branch_unlikely(&_debug_pagealloc_enabled); } /* * To support DEBUG_PAGEALLOC architecture must ensure that * __kernel_map_pages() never fails */ extern void __kernel_map_pages(struct page *page, int numpages, int enable); #ifdef CONFIG_DEBUG_PAGEALLOC static inline void debug_pagealloc_map_pages(struct page *page, int numpages) { iommu_debug_check_unmapped(page, numpages); if (debug_pagealloc_enabled_static()) __kernel_map_pages(page, numpages, 1); } static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) { iommu_debug_check_unmapped(page, numpages); if (debug_pagealloc_enabled_static()) __kernel_map_pages(page, numpages, 0); } extern unsigned int _debug_guardpage_minorder; DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled); static inline unsigned int debug_guardpage_minorder(void) { return _debug_guardpage_minorder; } static inline bool debug_guardpage_enabled(void) { return static_branch_unlikely(&_debug_guardpage_enabled); } static inline bool page_is_guard(const struct page *page) { if (!debug_guardpage_enabled()) return false; return PageGuard(page); } bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order); static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order) { if (!debug_guardpage_enabled()) return false; return __set_page_guard(zone, page, order); } void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order); static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order) { if (!debug_guardpage_enabled()) return; __clear_page_guard(zone, page, order); } #else /* CONFIG_DEBUG_PAGEALLOC */ static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} static inline void debug_pagealloc_unmap_pages(struct page 
*page, int numpages) {} static inline unsigned int debug_guardpage_minorder(void) { return 0; } static inline bool debug_guardpage_enabled(void) { return false; } static inline bool page_is_guard(const struct page *page) { return false; } static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order) { return false; } static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order) {} #endif /* CONFIG_DEBUG_PAGEALLOC */ #ifndef clear_pages /** * clear_pages() - clear a page range for kernel-internal use. * @addr: start address * @npages: number of pages * * Use clear_user_pages() instead when clearing a page range to be * mapped to user space. * * Does absolutely no exception handling. * * Note that even though the clearing operation is preemptible, clear_pages() * does not (and on architectures where it reduces to a few long-running * instructions, might not be able to) call cond_resched() to check if * rescheduling is required. * * When running under preemptible models this is not a problem. Under * cooperatively scheduled models, however, the caller is expected to * limit @npages to no more than PROCESS_PAGES_NON_PREEMPT_BATCH. */ static inline void clear_pages(void *addr, unsigned int npages) { do { clear_page(addr); addr += PAGE_SIZE; } while (--npages); } #endif #ifndef PROCESS_PAGES_NON_PREEMPT_BATCH #ifdef clear_pages /* * The architecture defines clear_pages(), and we assume that it is * generally "fast". So choose a batch size large enough to allow the processor * headroom for optimizing the operation and yet small enough that we see * reasonable preemption latency for when this optimization is not possible * (ex. slow microarchitectures, memory bandwidth saturation.) * * With a value of 32MB and assuming a memory bandwidth of ~10GBps, this should * result in worst case preemption latency of around 3ms when clearing pages. * * (See comment above clear_pages() for why preemption latency is a concern * here.) */ #define PROCESS_PAGES_NON_PREEMPT_BATCH (SZ_32M >> PAGE_SHIFT) #else /* !clear_pages */ /* * The architecture does not provide a clear_pages() implementation. Assume * that clear_page() -- which clear_pages() will fallback to -- is relatively * slow and choose a small value for PROCESS_PAGES_NON_PREEMPT_BATCH. 
*/ #define PROCESS_PAGES_NON_PREEMPT_BATCH 1 #endif #endif #ifdef __HAVE_ARCH_GATE_AREA extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); extern int in_gate_area_no_mm(unsigned long addr); extern int in_gate_area(struct mm_struct *mm, unsigned long addr); #else static inline struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { return NULL; } static inline int in_gate_area_no_mm(unsigned long addr) { return 0; } static inline int in_gate_area(struct mm_struct *mm, unsigned long addr) { return 0; } #endif /* __HAVE_ARCH_GATE_AREA */ bool process_shares_mm(const struct task_struct *p, const struct mm_struct *mm); void drop_slab(void); #ifndef CONFIG_MMU #define randomize_va_space 0 #else extern int randomize_va_space; #endif const char * arch_vma_name(struct vm_area_struct *vma); #ifdef CONFIG_MMU void print_vma_addr(char *prefix, unsigned long rip); #else static inline void print_vma_addr(char *prefix, unsigned long rip) { } #endif void *sparse_buffer_alloc(unsigned long size); unsigned long section_map_size(void); struct page * __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap); pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, struct vmem_altmap *altmap, unsigned long ptpfn, unsigned long flags); void *vmemmap_alloc_block(unsigned long size, int node); struct vmem_altmap; void *vmemmap_alloc_block_buf(unsigned long size, int node, struct vmem_altmap *altmap); void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); void vmemmap_set_pmd(pmd_t *pmd, void *p, int node, unsigned long addr, unsigned long next); int vmemmap_check_pmd(pmd_t *pmd, int node, unsigned long addr, unsigned long next); int vmemmap_populate_basepages(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); int vmemmap_populate_hugepages(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); int vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); int vmemmap_populate_hvo(unsigned long start, unsigned long end, unsigned int order, struct zone *zone, unsigned long headsize); void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node, unsigned long headsize); void vmemmap_populate_print_last(void); #ifdef CONFIG_MEMORY_HOTPLUG void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap); #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP static inline unsigned long vmem_altmap_offset(const struct vmem_altmap *altmap) { /* number of pfns from base where pfn_to_page() is valid */ if (altmap) return altmap->reserve + altmap->free; return 0; } static inline void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) { altmap->alloc -= nr_pfns; } #else static inline unsigned long vmem_altmap_offset(const struct vmem_altmap *altmap) { return 0; } static inline void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) { } #endif #define VMEMMAP_RESERVE_NR 2 #ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { unsigned long nr_pages; unsigned long nr_vmemmap_pages; if (!pgmap || 
!is_power_of_2(sizeof(struct page))) return false; nr_pages = pgmap_vmemmap_nr(pgmap); nr_vmemmap_pages = ((nr_pages * sizeof(struct page)) >> PAGE_SHIFT); /* * For vmemmap optimization with DAX we need minimum 2 vmemmap * pages. See layout diagram in Documentation/mm/vmemmap_dedup.rst */ return !altmap && (nr_vmemmap_pages > VMEMMAP_RESERVE_NR); } /* * If we don't have an architecture override, use the generic rule */ #ifndef vmemmap_can_optimize #define vmemmap_can_optimize __vmemmap_can_optimize #endif #else static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { return false; } #endif enum mf_flags { MF_COUNT_INCREASED = 1 << 0, MF_ACTION_REQUIRED = 1 << 1, MF_MUST_KILL = 1 << 2, MF_SOFT_OFFLINE = 1 << 3, MF_UNPOISON = 1 << 4, MF_SW_SIMULATED = 1 << 5, MF_NO_RETRY = 1 << 6, MF_MEM_PRE_REMOVE = 1 << 7, }; int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, unsigned long count, int mf_flags); extern int memory_failure(unsigned long pfn, int flags); extern int unpoison_memory(unsigned long pfn); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE /* * Sysfs entries for memory failure handling statistics. */ extern const struct attribute_group memory_failure_attr_group; extern void memory_failure_queue(unsigned long pfn, int flags); extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void num_poisoned_pages_inc(unsigned long pfn); void num_poisoned_pages_sub(unsigned long pfn, long i); #else static inline void memory_failure_queue(unsigned long pfn, int flags) { } static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { return 0; } static inline void num_poisoned_pages_inc(unsigned long pfn) { } static inline void num_poisoned_pages_sub(unsigned long pfn, long i) { } #endif #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) extern void memblk_nr_poison_inc(unsigned long pfn); extern void memblk_nr_poison_sub(unsigned long pfn, long i); #else static inline void memblk_nr_poison_inc(unsigned long pfn) { } static inline void memblk_nr_poison_sub(unsigned long pfn, long i) { } #endif #ifndef arch_memory_failure static inline int arch_memory_failure(unsigned long pfn, int flags) { return -ENXIO; } #endif #ifndef arch_is_platform_page static inline bool arch_is_platform_page(u64 paddr) { return false; } #endif /* * Error handlers for various types of pages. 
*/ enum mf_result { MF_IGNORED, /* Error: cannot be handled */ MF_FAILED, /* Error: handling failed */ MF_DELAYED, /* Will be handled later */ MF_RECOVERED, /* Successfully recovered */ }; enum mf_action_page_type { MF_MSG_KERNEL, MF_MSG_KERNEL_HIGH_ORDER, MF_MSG_DIFFERENT_COMPOUND, MF_MSG_HUGE, MF_MSG_FREE_HUGE, MF_MSG_GET_HWPOISON, MF_MSG_UNMAP_FAILED, MF_MSG_DIRTY_SWAPCACHE, MF_MSG_CLEAN_SWAPCACHE, MF_MSG_DIRTY_MLOCKED_LRU, MF_MSG_CLEAN_MLOCKED_LRU, MF_MSG_DIRTY_UNEVICTABLE_LRU, MF_MSG_CLEAN_UNEVICTABLE_LRU, MF_MSG_DIRTY_LRU, MF_MSG_CLEAN_LRU, MF_MSG_TRUNCATED_LRU, MF_MSG_BUDDY, MF_MSG_DAX, MF_MSG_UNSPLIT_THP, MF_MSG_ALREADY_POISONED, MF_MSG_PFN_MAP, MF_MSG_UNKNOWN, }; #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) void folio_zero_user(struct folio *folio, unsigned long addr_hint); int copy_user_large_folio(struct folio *dst, struct folio *src, unsigned long addr_hint, struct vm_area_struct *vma); long copy_folio_from_user(struct folio *dst_folio, const void __user *usr_src, bool allow_pagefault); #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if MAX_NUMNODES > 1 void __init setup_nr_node_ids(void); #else static inline void setup_nr_node_ids(void) {} #endif extern int memcmp_pages(struct page *page1, struct page *page2); static inline int pages_identical(struct page *page1, struct page *page2) { return !memcmp_pages(page1, page2); } #ifdef CONFIG_MAPPING_DIRTY_HELPERS unsigned long clean_record_shared_mapping_range(struct address_space *mapping, pgoff_t first_index, pgoff_t nr, pgoff_t bitmap_pgoff, unsigned long *bitmap, pgoff_t *start, pgoff_t *end); unsigned long wp_shared_mapping_range(struct address_space *mapping, pgoff_t first_index, pgoff_t nr); #endif #ifdef CONFIG_ANON_VMA_NAME int set_anon_vma_name(unsigned long addr, unsigned long size, const char __user *uname); #else static inline int set_anon_vma_name(unsigned long addr, unsigned long size, const char __user *uname) { return -EINVAL; } #endif #ifdef CONFIG_UNACCEPTED_MEMORY bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size); void accept_memory(phys_addr_t start, unsigned long size); #else static inline bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size) { return false; } static inline void accept_memory(phys_addr_t start, unsigned long size) { } #endif static inline bool pfn_is_unaccepted_memory(unsigned long pfn) { return range_contains_unaccepted_memory(pfn << PAGE_SHIFT, PAGE_SIZE); } void vma_pgtable_walk_begin(struct vm_area_struct *vma); void vma_pgtable_walk_end(struct vm_area_struct *vma); int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size); int reserve_mem_release_by_name(const char *name); #ifdef CONFIG_64BIT int do_mseal(unsigned long start, size_t len_in, unsigned long flags); #else static inline int do_mseal(unsigned long start, size_t len_in, unsigned long flags) { /* noop on 32 bit */ return 0; } #endif /* * user_alloc_needs_zeroing checks if a user folio from page allocator needs to * be zeroed or not. */ static inline bool user_alloc_needs_zeroing(void) { /* * for user folios, arch with cache aliasing requires cache flush and * arc changes folio->flags to make icache coherent with dcache, so * always return false to make caller use * clear_user_page()/clear_user_highpage(). 
*/ return cpu_dcache_is_aliasing() || cpu_icache_is_aliasing() || !static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, &init_on_alloc); } int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status); int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); /* * DMA mapping IDs for page_pool * * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and * stashes it in the upper bits of page->pp_magic. Non-PP pages can have * arbitrary kernel pointers stored in the same field as pp_magic (since * it overlaps with page->lru.next), so we must ensure that we cannot * mistake a valid kernel pointer with any of the values we write into this * field. * * On architectures that set POISON_POINTER_DELTA, this is already ensured, * since this value becomes part of PP_SIGNATURE; meaning we can just use the * space between the PP_SIGNATURE value (without POISON_POINTER_DELTA), and the * lowest bits of POISON_POINTER_DELTA. On arches where POISON_POINTER_DELTA is * 0, we use the lowest bit of PAGE_OFFSET as the boundary if that value is * known at compile-time. * * If the value of PAGE_OFFSET is not known at compile time, or if it is too * small to leave at least 8 bits available above PP_SIGNATURE, we define the * number of bits to be 0, which turns off the DMA index tracking altogether * (see page_pool_register_dma_index()). */ #define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA)) #if POISON_POINTER_DELTA > 0 /* PP_SIGNATURE includes POISON_POINTER_DELTA, so limit the size of the DMA * index to not overlap with that if set */ #define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT) #else /* Use the lowest bit of PAGE_OFFSET if there's at least 8 bits available; see above */ #define PP_DMA_INDEX_MIN_OFFSET (1 << (PP_DMA_INDEX_SHIFT + 8)) #define PP_DMA_INDEX_BITS ((__builtin_constant_p(PAGE_OFFSET) && \ PAGE_OFFSET >= PP_DMA_INDEX_MIN_OFFSET && \ !(PAGE_OFFSET & (PP_DMA_INDEX_MIN_OFFSET - 1))) ? \ MIN(32, __ffs(PAGE_OFFSET) - PP_DMA_INDEX_SHIFT) : 0) #endif #define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \ PP_DMA_INDEX_SHIFT) #define PAGE_SNAPSHOT_FAITHFUL (1 << 0) #define PAGE_SNAPSHOT_PG_BUDDY (1 << 1) #define PAGE_SNAPSHOT_PG_IDLE (1 << 2) struct page_snapshot { struct folio folio_snapshot; struct page page_snapshot; unsigned long pfn; unsigned long idx; unsigned long flags; }; static inline bool snapshot_page_is_faithful(const struct page_snapshot *ps) { return ps->flags & PAGE_SNAPSHOT_FAITHFUL; } void snapshot_page(struct page_snapshot *ps, const struct page *page); void map_anon_folio_pte_nopf(struct folio *folio, pte_t *pte, struct vm_area_struct *vma, unsigned long addr, bool uffd_wp); #endif /* _LINUX_MM_H */
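/*
 * Illustrative sketch (editor's addition, not part of the header above):
 * how a driver's ->mmap_prepare() hook might use the mmap_action helpers
 * documented earlier to request that the whole VMA be I/O remapped to a
 * device aperture.  The my_dev structure, the my_device instance and
 * my_dev_mmap_prepare() are hypothetical names used only for this example;
 * only vm_area_desc, vma_desc_size() and mmap_action_ioremap_full() come
 * from the header itself.
 */
struct my_dev {
	phys_addr_t	phys_base;	/* start of the region to expose */
	resource_size_t	size;		/* size of that region */
};

static struct my_dev my_device;		/* assumed to be filled in at probe time */

static int my_dev_mmap_prepare(struct vm_area_desc *desc)
{
	/* Refuse mappings larger than the device aperture. */
	if (vma_desc_size(desc) > my_device.size)
		return -EINVAL;

	/* Drivers may tweak caching attributes before recording the action. */
	desc->page_prot = pgprot_noncached(desc->page_prot);

	/*
	 * Record the request; the mmap core performs the actual I/O remap
	 * later, once the VMA has been created.
	 */
	mmap_action_ioremap_full(desc, PHYS_PFN(my_device.phys_base));
	return 0;
}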
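/*
 * Illustrative sketch (editor's addition): using vmf_fs_error() from the
 * header above to convert an errno into the return value expected from a
 * ->page_mkwrite() handler.  my_fs_prepare_write() is a hypothetical
 * filesystem helper, not a function from the quoted sources.
 */
static vm_fault_t my_fs_page_mkwrite(struct vm_fault *vmf)
{
	int err = my_fs_prepare_write(vmf);	/* hypothetical: locks the folio on success */

	/* 0 -> VM_FAULT_LOCKED, -EFAULT/-EAGAIN -> VM_FAULT_NOPAGE, -ENOMEM -> VM_FAULT_OOM, else SIGBUS */
	return vmf_fs_error(err);
}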
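/*
 * Illustrative sketch (editor's addition): clearing a large kernel range
 * while respecting the PROCESS_PAGES_NON_PREEMPT_BATCH limit that the
 * clear_pages() comment above asks cooperatively scheduled callers to
 * honour.  clear_large_range() is a hypothetical helper.
 */
static void clear_large_range(void *addr, unsigned long npages)
{
	while (npages) {
		unsigned int batch = min_t(unsigned long, npages,
					   PROCESS_PAGES_NON_PREEMPT_BATCH);

		clear_pages(addr, batch);
		addr += (unsigned long)batch * PAGE_SIZE;
		npages -= batch;
		cond_resched();		/* bound worst-case preemption latency */
	}
}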
// SPDX-License-Identifier: GPL-2.0-only #include <linux/export.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/fs.h> #include <linux/path.h> #include <linux/slab.h> #include <linux/fs_struct.h> #include <linux/init_task.h> #include "internal.h" /* * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. * It can block. */ void set_fs_root(struct fs_struct *fs, const struct path *path) { struct path old_root; path_get(path); write_seqlock(&fs->seq); old_root = fs->root; fs->root = *path; write_sequnlock(&fs->seq); if (old_root.dentry) path_put(&old_root); } /* * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values. * It can block. */ void set_fs_pwd(struct fs_struct *fs, const struct path *path) { struct path old_pwd; path_get(path); write_seqlock(&fs->seq); old_pwd = fs->pwd; fs->pwd = *path; write_sequnlock(&fs->seq); if (old_pwd.dentry) path_put(&old_pwd); } static inline int replace_path(struct path *p, const struct path *old, const struct path *new) { if (likely(p->dentry != old->dentry || p->mnt != old->mnt)) return 0; *p = *new; return 1; } void chroot_fs_refs(const struct path *old_root, const struct path *new_root) { struct task_struct *g, *p; struct fs_struct *fs; int count = 0; read_lock(&tasklist_lock); for_each_process_thread(g, p) { task_lock(p); fs = p->fs; if (fs) { int hits = 0; write_seqlock(&fs->seq); hits += replace_path(&fs->root, old_root, new_root); hits += replace_path(&fs->pwd, old_root, new_root); while (hits--) { count++; path_get(new_root); } write_sequnlock(&fs->seq); } task_unlock(p); } read_unlock(&tasklist_lock); while (count--) path_put(old_root); } void free_fs_struct(struct fs_struct *fs) { path_put(&fs->root); path_put(&fs->pwd); kmem_cache_free(fs_cachep, fs); } void exit_fs(struct task_struct *tsk) { struct fs_struct *fs = tsk->fs; if (fs) { int kill; task_lock(tsk); read_seqlock_excl(&fs->seq); tsk->fs = NULL; kill = !--fs->users; read_sequnlock_excl(&fs->seq); task_unlock(tsk); if (kill) free_fs_struct(fs); } } struct fs_struct *copy_fs_struct(struct fs_struct *old) { struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); /* We don't need to lock fs - think why ;-) */ if (fs) { fs->users = 1; fs->in_exec = 0; seqlock_init(&fs->seq); fs->umask = old->umask; read_seqlock_excl(&old->seq); fs->root = old->root; path_get(&fs->root); fs->pwd = old->pwd; path_get(&fs->pwd); read_sequnlock_excl(&old->seq); } return fs; } int unshare_fs_struct(void) { struct fs_struct *fs = current->fs; struct fs_struct *new_fs = copy_fs_struct(fs); int kill; if (!new_fs) return -ENOMEM; task_lock(current); read_seqlock_excl(&fs->seq); kill = !--fs->users; current->fs = new_fs; read_sequnlock_excl(&fs->seq); task_unlock(current); if (kill) free_fs_struct(fs); return 0; } EXPORT_SYMBOL_GPL(unshare_fs_struct); /* to be mentioned only in INIT_TASK */ struct fs_struct init_fs = { .users = 1, .seq = __SEQLOCK_UNLOCKED(init_fs.seq), .umask = 0022, };
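/*
 * Illustrative sketch (editor's addition, not part of fs_struct.c above):
 * the usual clone-time choice between sharing an fs_struct and taking a
 * private copy via copy_fs_struct().  This is a simplified rendering of
 * the familiar CLONE_FS semantics; the function name and the bare
 * fs->users increment are assumptions for illustration (the real code
 * also serialises against an in-progress exec).
 */
static int example_copy_fs(unsigned long clone_flags, struct task_struct *tsk)
{
	struct fs_struct *fs = current->fs;

	if (clone_flags & CLONE_FS) {
		/* Share: chdir()/chroot()/umask() in either task affects both. */
		fs->users++;
		tsk->fs = fs;
		return 0;
	}

	/* Private copy: the child gets its own root, pwd and umask. */
	tsk->fs = copy_fs_struct(fs);
	return tsk->fs ? 0 : -ENOMEM;
}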
// SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/sys.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/export.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/utsname.h> #include <linux/mman.h> #include <linux/reboot.h> #include <linux/prctl.h> #include <linux/highuid.h> #include <linux/fs.h> #include <linux/kmod.h> #include <linux/ksm.h> #include <linux/perf_event.h> #include <linux/resource.h> #include <linux/kernel.h> #include <linux/workqueue.h> #include <linux/capability.h> #include <linux/device.h> #include <linux/key.h> #include <linux/times.h> #include <linux/posix-timers.h> #include <linux/security.h> #include <linux/random.h> #include <linux/suspend.h> #include <linux/tty.h> #include <linux/signal.h> #include <linux/cn_proc.h> #include <linux/task_io_accounting_ops.h> #include <linux/seccomp.h> #include <linux/cpu.h> #include <linux/personality.h> #include <linux/ptrace.h> #include <linux/fs_struct.h> #include <linux/file.h> #include <linux/mount.h> #include <linux/gfp.h> #include <linux/syscore_ops.h> #include <linux/version.h> #include <linux/ctype.h> #include <linux/syscall_user_dispatch.h> #include <linux/compat.h> #include <linux/syscalls.h> #include <linux/kprobes.h> #include <linux/user_namespace.h> #include <linux/time_namespace.h> #include <linux/binfmts.h> #include <linux/futex.h> #include <linux/rseq.h> #include <linux/sched.h> #include <linux/sched/autogroup.h> #include <linux/sched/loadavg.h> #include <linux/sched/stat.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/task.h> #include <linux/sched/cputime.h> #include <linux/rcupdate.h> #include <linux/uidgid.h> #include <linux/cred.h> #include <linux/nospec.h> #include <linux/kmsg_dump.h> /* Move somewhere else to avoid recompiling? 
*/ #include <generated/utsrelease.h> #include <linux/uaccess.h> #include <asm/io.h> #include <asm/unistd.h> #include <trace/events/task.h> #include "uid16.h" #ifndef SET_UNALIGN_CTL # define SET_UNALIGN_CTL(a, b) (-EINVAL) #endif #ifndef GET_UNALIGN_CTL # define GET_UNALIGN_CTL(a, b) (-EINVAL) #endif #ifndef SET_FPEMU_CTL # define SET_FPEMU_CTL(a, b) (-EINVAL) #endif #ifndef GET_FPEMU_CTL # define GET_FPEMU_CTL(a, b) (-EINVAL) #endif #ifndef SET_FPEXC_CTL # define SET_FPEXC_CTL(a, b) (-EINVAL) #endif #ifndef GET_FPEXC_CTL # define GET_FPEXC_CTL(a, b) (-EINVAL) #endif #ifndef GET_ENDIAN # define GET_ENDIAN(a, b) (-EINVAL) #endif #ifndef SET_ENDIAN # define SET_ENDIAN(a, b) (-EINVAL) #endif #ifndef GET_TSC_CTL # define GET_TSC_CTL(a) (-EINVAL) #endif #ifndef SET_TSC_CTL # define SET_TSC_CTL(a) (-EINVAL) #endif #ifndef GET_FP_MODE # define GET_FP_MODE(a) (-EINVAL) #endif #ifndef SET_FP_MODE # define SET_FP_MODE(a,b) (-EINVAL) #endif #ifndef SVE_SET_VL # define SVE_SET_VL(a) (-EINVAL) #endif #ifndef SVE_GET_VL # define SVE_GET_VL() (-EINVAL) #endif #ifndef SME_SET_VL # define SME_SET_VL(a) (-EINVAL) #endif #ifndef SME_GET_VL # define SME_GET_VL() (-EINVAL) #endif #ifndef PAC_RESET_KEYS # define PAC_RESET_KEYS(a, b) (-EINVAL) #endif #ifndef PAC_SET_ENABLED_KEYS # define PAC_SET_ENABLED_KEYS(a, b, c) (-EINVAL) #endif #ifndef PAC_GET_ENABLED_KEYS # define PAC_GET_ENABLED_KEYS(a) (-EINVAL) #endif #ifndef SET_TAGGED_ADDR_CTRL # define SET_TAGGED_ADDR_CTRL(a) (-EINVAL) #endif #ifndef GET_TAGGED_ADDR_CTRL # define GET_TAGGED_ADDR_CTRL() (-EINVAL) #endif #ifndef RISCV_V_SET_CONTROL # define RISCV_V_SET_CONTROL(a) (-EINVAL) #endif #ifndef RISCV_V_GET_CONTROL # define RISCV_V_GET_CONTROL() (-EINVAL) #endif #ifndef RISCV_SET_ICACHE_FLUSH_CTX # define RISCV_SET_ICACHE_FLUSH_CTX(a, b) (-EINVAL) #endif #ifndef PPC_GET_DEXCR_ASPECT # define PPC_GET_DEXCR_ASPECT(a, b) (-EINVAL) #endif #ifndef PPC_SET_DEXCR_ASPECT # define PPC_SET_DEXCR_ASPECT(a, b, c) (-EINVAL) #endif /* * this is where the system-wide overflow UID and GID are defined, for * architectures that now have 32-bit UID/GID but didn't in the past */ int overflowuid = DEFAULT_OVERFLOWUID; int overflowgid = DEFAULT_OVERFLOWGID; EXPORT_SYMBOL(overflowuid); EXPORT_SYMBOL(overflowgid); /* * the same as above, but for filesystems which can only store a 16-bit * UID and GID. as such, this is needed on all architectures */ int fs_overflowuid = DEFAULT_FS_OVERFLOWUID; int fs_overflowgid = DEFAULT_FS_OVERFLOWGID; EXPORT_SYMBOL(fs_overflowuid); EXPORT_SYMBOL(fs_overflowgid); static const struct ctl_table overflow_sysctl_table[] = { { .procname = "overflowuid", .data = &overflowuid, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_MAXOLDUID, }, { .procname = "overflowgid", .data = &overflowgid, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_MAXOLDUID, }, }; static int __init init_overflow_sysctl(void) { register_sysctl_init("kernel", overflow_sysctl_table); return 0; } postcore_initcall(init_overflow_sysctl); /* * Returns true if current's euid is same as p's uid or euid, * or has CAP_SYS_NICE to p's user_ns. 
* * Called with rcu_read_lock, creds are safe */ static bool set_one_prio_perm(struct task_struct *p) { const struct cred *cred = current_cred(), *pcred = __task_cred(p); if (uid_eq(pcred->uid, cred->euid) || uid_eq(pcred->euid, cred->euid)) return true; if (ns_capable(pcred->user_ns, CAP_SYS_NICE)) return true; return false; } /* * set the priority of a task * - the caller must hold the RCU read lock */ static int set_one_prio(struct task_struct *p, int niceval, int error) { int no_nice; if (!set_one_prio_perm(p)) { error = -EPERM; goto out; } if (niceval < task_nice(p) && !can_nice(p, niceval)) { error = -EACCES; goto out; } no_nice = security_task_setnice(p, niceval); if (no_nice) { error = no_nice; goto out; } if (error == -ESRCH) error = 0; set_user_nice(p, niceval); out: return error; } SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) { struct task_struct *g, *p; struct user_struct *user; const struct cred *cred = current_cred(); int error = -EINVAL; struct pid *pgrp; kuid_t uid; if (which > PRIO_USER || which < PRIO_PROCESS) goto out; /* normalize: avoid signed division (rounding problems) */ error = -ESRCH; if (niceval < MIN_NICE) niceval = MIN_NICE; if (niceval > MAX_NICE) niceval = MAX_NICE; rcu_read_lock(); switch (which) { case PRIO_PROCESS: if (who) p = find_task_by_vpid(who); else p = current; if (p) error = set_one_prio(p, niceval, error); break; case PRIO_PGRP: if (who) pgrp = find_vpid(who); else pgrp = task_pgrp(current); read_lock(&tasklist_lock); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { error = set_one_prio(p, niceval, error); } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); read_unlock(&tasklist_lock); break; case PRIO_USER: uid = make_kuid(cred->user_ns, who); user = cred->user; if (!who) uid = cred->uid; else if (!uid_eq(uid, cred->uid)) { user = find_user(uid); if (!user) goto out_unlock; /* No processes for this user */ } for_each_process_thread(g, p) { if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) error = set_one_prio(p, niceval, error); } if (!uid_eq(uid, cred->uid)) free_uid(user); /* For find_user() */ break; } out_unlock: rcu_read_unlock(); out: return error; } /* * Ugh. To avoid negative return values, "getpriority()" will * not return the normal nice-value, but a negated value that * has been offset by 20 (ie it returns 40..1 instead of -20..19) * to stay compatible. 
*/ SYSCALL_DEFINE2(getpriority, int, which, int, who) { struct task_struct *g, *p; struct user_struct *user; const struct cred *cred = current_cred(); long niceval, retval = -ESRCH; struct pid *pgrp; kuid_t uid; if (which > PRIO_USER || which < PRIO_PROCESS) return -EINVAL; rcu_read_lock(); switch (which) { case PRIO_PROCESS: if (who) p = find_task_by_vpid(who); else p = current; if (p) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } break; case PRIO_PGRP: if (who) pgrp = find_vpid(who); else pgrp = task_pgrp(current); read_lock(&tasklist_lock); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); read_unlock(&tasklist_lock); break; case PRIO_USER: uid = make_kuid(cred->user_ns, who); user = cred->user; if (!who) uid = cred->uid; else if (!uid_eq(uid, cred->uid)) { user = find_user(uid); if (!user) goto out_unlock; /* No processes for this user */ } for_each_process_thread(g, p) { if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } } if (!uid_eq(uid, cred->uid)) free_uid(user); /* for find_user() */ break; } out_unlock: rcu_read_unlock(); return retval; } /* * Unprivileged users may change the real gid to the effective gid * or vice versa. (BSD-style) * * If you set the real gid at all, or set the effective gid to a value not * equal to the real gid, then the saved gid is set to the new effective gid. * * This makes it possible for a setgid program to completely drop its * privileges, which is often a useful assertion to make when you are doing * a security audit over a program. * * The general idea is that a program which uses just setregid() will be * 100% compatible with BSD. A program which uses just setgid() will be * 100% compatible with POSIX with saved IDs. * * SMP: There are not races, the GIDs are checked only by filesystem * operations (as far as semantic preservation is concerned). */ #ifdef CONFIG_MULTIUSER long __sys_setregid(gid_t rgid, gid_t egid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; kgid_t krgid, kegid; krgid = make_kgid(ns, rgid); kegid = make_kgid(ns, egid); if ((rgid != (gid_t) -1) && !gid_valid(krgid)) return -EINVAL; if ((egid != (gid_t) -1) && !gid_valid(kegid)) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); retval = -EPERM; if (rgid != (gid_t) -1) { if (gid_eq(old->gid, krgid) || gid_eq(old->egid, krgid) || ns_capable_setid(old->user_ns, CAP_SETGID)) new->gid = krgid; else goto error; } if (egid != (gid_t) -1) { if (gid_eq(old->gid, kegid) || gid_eq(old->egid, kegid) || gid_eq(old->sgid, kegid) || ns_capable_setid(old->user_ns, CAP_SETGID)) new->egid = kegid; else goto error; } if (rgid != (gid_t) -1 || (egid != (gid_t) -1 && !gid_eq(kegid, old->gid))) new->sgid = new->egid; new->fsgid = new->egid; retval = security_task_fix_setgid(new, old, LSM_SETID_RE); if (retval < 0) goto error; return commit_creds(new); error: abort_creds(new); return retval; } SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) { return __sys_setregid(rgid, egid); } /* * setgid() is implemented like SysV w/ SAVED_IDS * * SMP: Same implicit races as above. 
*/ long __sys_setgid(gid_t gid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; kgid_t kgid; kgid = make_kgid(ns, gid); if (!gid_valid(kgid)) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); retval = -EPERM; if (ns_capable_setid(old->user_ns, CAP_SETGID)) new->gid = new->egid = new->sgid = new->fsgid = kgid; else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) new->egid = new->fsgid = kgid; else goto error; retval = security_task_fix_setgid(new, old, LSM_SETID_ID); if (retval < 0) goto error; return commit_creds(new); error: abort_creds(new); return retval; } SYSCALL_DEFINE1(setgid, gid_t, gid) { return __sys_setgid(gid); } /* * change the user struct in a credentials set to match the new UID */ static int set_user(struct cred *new) { struct user_struct *new_user; new_user = alloc_uid(new->uid); if (!new_user) return -EAGAIN; free_uid(new->user); new->user = new_user; return 0; } static void flag_nproc_exceeded(struct cred *new) { if (new->ucounts == current_ucounts()) return; /* * We don't fail in case of NPROC limit excess here because too many * poorly written programs don't check set*uid() return code, assuming * it never fails if called by root. We may still enforce NPROC limit * for programs doing set*uid()+execve() by harmlessly deferring the * failure to the execve() stage. */ if (is_rlimit_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) && new->user != INIT_USER) current->flags |= PF_NPROC_EXCEEDED; else current->flags &= ~PF_NPROC_EXCEEDED; } /* * Unprivileged users may change the real uid to the effective uid * or vice versa. (BSD-style) * * If you set the real uid at all, or set the effective uid to a value not * equal to the real uid, then the saved uid is set to the new effective uid. * * This makes it possible for a setuid program to completely drop its * privileges, which is often a useful assertion to make when you are doing * a security audit over a program. * * The general idea is that a program which uses just setreuid() will be * 100% compatible with BSD. A program which uses just setuid() will be * 100% compatible with POSIX with saved IDs. 
*/ long __sys_setreuid(uid_t ruid, uid_t euid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; kuid_t kruid, keuid; kruid = make_kuid(ns, ruid); keuid = make_kuid(ns, euid); if ((ruid != (uid_t) -1) && !uid_valid(kruid)) return -EINVAL; if ((euid != (uid_t) -1) && !uid_valid(keuid)) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); retval = -EPERM; if (ruid != (uid_t) -1) { new->uid = kruid; if (!uid_eq(old->uid, kruid) && !uid_eq(old->euid, kruid) && !ns_capable_setid(old->user_ns, CAP_SETUID)) goto error; } if (euid != (uid_t) -1) { new->euid = keuid; if (!uid_eq(old->uid, keuid) && !uid_eq(old->euid, keuid) && !uid_eq(old->suid, keuid) && !ns_capable_setid(old->user_ns, CAP_SETUID)) goto error; } if (!uid_eq(new->uid, old->uid)) { retval = set_user(new); if (retval < 0) goto error; } if (ruid != (uid_t) -1 || (euid != (uid_t) -1 && !uid_eq(keuid, old->uid))) new->suid = new->euid; new->fsuid = new->euid; retval = security_task_fix_setuid(new, old, LSM_SETID_RE); if (retval < 0) goto error; retval = set_cred_ucounts(new); if (retval < 0) goto error; flag_nproc_exceeded(new); return commit_creds(new); error: abort_creds(new); return retval; } SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) { return __sys_setreuid(ruid, euid); } /* * setuid() is implemented like SysV with SAVED_IDS * * Note that SAVED_ID's is deficient in that a setuid root program * like sendmail, for example, cannot set its uid to be a normal * user and then switch back, because if you're root, setuid() sets * the saved uid too. If you don't like this, blame the bright people * in the POSIX committee and/or USG. Note that the BSD-style setreuid() * will allow a root program to temporarily drop privileges and be able to * regain them by swapping the real and effective uid. */ long __sys_setuid(uid_t uid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; kuid_t kuid; kuid = make_kuid(ns, uid); if (!uid_valid(kuid)) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); retval = -EPERM; if (ns_capable_setid(old->user_ns, CAP_SETUID)) { new->suid = new->uid = kuid; if (!uid_eq(kuid, old->uid)) { retval = set_user(new); if (retval < 0) goto error; } } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) { goto error; } new->fsuid = new->euid = kuid; retval = security_task_fix_setuid(new, old, LSM_SETID_ID); if (retval < 0) goto error; retval = set_cred_ucounts(new); if (retval < 0) goto error; flag_nproc_exceeded(new); return commit_creds(new); error: abort_creds(new); return retval; } SYSCALL_DEFINE1(setuid, uid_t, uid) { return __sys_setuid(uid); } /* * This function implements a generic ability to update ruid, euid, * and suid. This allows you to implement the 4.4 compatible seteuid(). 
*/ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; kuid_t kruid, keuid, ksuid; bool ruid_new, euid_new, suid_new; kruid = make_kuid(ns, ruid); keuid = make_kuid(ns, euid); ksuid = make_kuid(ns, suid); if ((ruid != (uid_t) -1) && !uid_valid(kruid)) return -EINVAL; if ((euid != (uid_t) -1) && !uid_valid(keuid)) return -EINVAL; if ((suid != (uid_t) -1) && !uid_valid(ksuid)) return -EINVAL; old = current_cred(); /* check for no-op */ if ((ruid == (uid_t) -1 || uid_eq(kruid, old->uid)) && (euid == (uid_t) -1 || (uid_eq(keuid, old->euid) && uid_eq(keuid, old->fsuid))) && (suid == (uid_t) -1 || uid_eq(ksuid, old->suid))) return 0; ruid_new = ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid); euid_new = euid != (uid_t) -1 && !uid_eq(keuid, old->uid) && !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid); suid_new = suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) && !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid); if ((ruid_new || euid_new || suid_new) && !ns_capable_setid(old->user_ns, CAP_SETUID)) return -EPERM; new = prepare_creds(); if (!new) return -ENOMEM; if (ruid != (uid_t) -1) { new->uid = kruid; if (!uid_eq(kruid, old->uid)) { retval = set_user(new); if (retval < 0) goto error; } } if (euid != (uid_t) -1) new->euid = keuid; if (suid != (uid_t) -1) new->suid = ksuid; new->fsuid = new->euid; retval = security_task_fix_setuid(new, old, LSM_SETID_RES); if (retval < 0) goto error; retval = set_cred_ucounts(new); if (retval < 0) goto error; flag_nproc_exceeded(new); return commit_creds(new); error: abort_creds(new); return retval; } SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) { return __sys_setresuid(ruid, euid, suid); } SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp) { const struct cred *cred = current_cred(); int retval; uid_t ruid, euid, suid; ruid = from_kuid_munged(cred->user_ns, cred->uid); euid = from_kuid_munged(cred->user_ns, cred->euid); suid = from_kuid_munged(cred->user_ns, cred->suid); retval = put_user(ruid, ruidp); if (!retval) { retval = put_user(euid, euidp); if (!retval) return put_user(suid, suidp); } return retval; } /* * Same as above, but for rgid, egid, sgid. 
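 *
 * A related userspace pattern (sketch, target_uid/target_gid hypothetical)
 * is dropping privileges permanently: clear the supplementary groups and
 * set all three gids before all three uids:
 *
 *	if (setgroups(0, NULL) == -1 ||
 *	    setresgid(target_gid, target_gid, target_gid) == -1 ||
 *	    setresuid(target_uid, target_uid, target_uid) == -1)
 *		_exit(1);
 *	// the gids must change first: once the uids are dropped,
 *	// setresgid() to an arbitrary gid would fail with -EPERM.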
*/ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; kgid_t krgid, kegid, ksgid; bool rgid_new, egid_new, sgid_new; krgid = make_kgid(ns, rgid); kegid = make_kgid(ns, egid); ksgid = make_kgid(ns, sgid); if ((rgid != (gid_t) -1) && !gid_valid(krgid)) return -EINVAL; if ((egid != (gid_t) -1) && !gid_valid(kegid)) return -EINVAL; if ((sgid != (gid_t) -1) && !gid_valid(ksgid)) return -EINVAL; old = current_cred(); /* check for no-op */ if ((rgid == (gid_t) -1 || gid_eq(krgid, old->gid)) && (egid == (gid_t) -1 || (gid_eq(kegid, old->egid) && gid_eq(kegid, old->fsgid))) && (sgid == (gid_t) -1 || gid_eq(ksgid, old->sgid))) return 0; rgid_new = rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid); egid_new = egid != (gid_t) -1 && !gid_eq(kegid, old->gid) && !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid); sgid_new = sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) && !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid); if ((rgid_new || egid_new || sgid_new) && !ns_capable_setid(old->user_ns, CAP_SETGID)) return -EPERM; new = prepare_creds(); if (!new) return -ENOMEM; if (rgid != (gid_t) -1) new->gid = krgid; if (egid != (gid_t) -1) new->egid = kegid; if (sgid != (gid_t) -1) new->sgid = ksgid; new->fsgid = new->egid; retval = security_task_fix_setgid(new, old, LSM_SETID_RES); if (retval < 0) goto error; return commit_creds(new); error: abort_creds(new); return retval; } SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) { return __sys_setresgid(rgid, egid, sgid); } SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp) { const struct cred *cred = current_cred(); int retval; gid_t rgid, egid, sgid; rgid = from_kgid_munged(cred->user_ns, cred->gid); egid = from_kgid_munged(cred->user_ns, cred->egid); sgid = from_kgid_munged(cred->user_ns, cred->sgid); retval = put_user(rgid, rgidp); if (!retval) { retval = put_user(egid, egidp); if (!retval) retval = put_user(sgid, sgidp); } return retval; } /* * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This * is used for "access()" and for the NFS daemon (letting nfsd stay at * whatever uid it wants to). It normally shadows "euid", except when * explicitly set by setfsuid() or for access.. */ long __sys_setfsuid(uid_t uid) { const struct cred *old; struct cred *new; uid_t old_fsuid; kuid_t kuid; old = current_cred(); old_fsuid = from_kuid_munged(old->user_ns, old->fsuid); kuid = make_kuid(old->user_ns, uid); if (!uid_valid(kuid)) return old_fsuid; new = prepare_creds(); if (!new) return old_fsuid; if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || ns_capable_setid(old->user_ns, CAP_SETUID)) { if (!uid_eq(kuid, old->fsuid)) { new->fsuid = kuid; if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) goto change_okay; } } abort_creds(new); return old_fsuid; change_okay: commit_creds(new); return old_fsuid; } SYSCALL_DEFINE1(setfsuid, uid_t, uid) { return __sys_setfsuid(uid); } /* * Samma på svenska.. 
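 * ("The same in Swedish": __sys_setfsgid() below mirrors __sys_setfsuid(),
 * only for the filesystem gid.)
 *
 * For illustration (userspace sketch, client_uid and path hypothetical), a
 * file server thread can perform a single access check with a client's
 * identity without changing its own uid/euid:
 *
 *	uid_t prev = setfsuid(client_uid);	// returns the previous fsuid
 *	int fd = open(path, O_RDONLY);		// permission-checked as client_uid
 *	setfsuid(prev);				// restore the server's fsuid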
*/ long __sys_setfsgid(gid_t gid) { const struct cred *old; struct cred *new; gid_t old_fsgid; kgid_t kgid; old = current_cred(); old_fsgid = from_kgid_munged(old->user_ns, old->fsgid); kgid = make_kgid(old->user_ns, gid); if (!gid_valid(kgid)) return old_fsgid; new = prepare_creds(); if (!new) return old_fsgid; if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || ns_capable_setid(old->user_ns, CAP_SETGID)) { if (!gid_eq(kgid, old->fsgid)) { new->fsgid = kgid; if (security_task_fix_setgid(new,old,LSM_SETID_FS) == 0) goto change_okay; } } abort_creds(new); return old_fsgid; change_okay: commit_creds(new); return old_fsgid; } SYSCALL_DEFINE1(setfsgid, gid_t, gid) { return __sys_setfsgid(gid); } #endif /* CONFIG_MULTIUSER */ /** * sys_getpid - return the thread group id of the current process * * Note, despite the name, this returns the tgid not the pid. The tgid and * the pid are identical unless CLONE_THREAD was specified on clone() in * which case the tgid is the same in all threads of the same group. * * This is SMP safe as current->tgid does not change. */ SYSCALL_DEFINE0(getpid) { return task_tgid_vnr(current); } /* Thread ID - the internal kernel "pid" */ SYSCALL_DEFINE0(gettid) { return task_pid_vnr(current); } /* * Accessing ->real_parent is not SMP-safe, it could * change from under us. However, we can use a stale * value of ->real_parent under rcu_read_lock(), see * release_task()->call_rcu(delayed_put_task_struct). */ SYSCALL_DEFINE0(getppid) { int pid; rcu_read_lock(); pid = task_tgid_vnr(rcu_dereference(current->real_parent)); rcu_read_unlock(); return pid; } SYSCALL_DEFINE0(getuid) { /* Only we change this so SMP safe */ return from_kuid_munged(current_user_ns(), current_uid()); } SYSCALL_DEFINE0(geteuid) { /* Only we change this so SMP safe */ return from_kuid_munged(current_user_ns(), current_euid()); } SYSCALL_DEFINE0(getgid) { /* Only we change this so SMP safe */ return from_kgid_munged(current_user_ns(), current_gid()); } SYSCALL_DEFINE0(getegid) { /* Only we change this so SMP safe */ return from_kgid_munged(current_user_ns(), current_egid()); } static void do_sys_times(struct tms *tms) { u64 tgutime, tgstime, cutime, cstime; thread_group_cputime_adjusted(current, &tgutime, &tgstime); cutime = current->signal->cutime; cstime = current->signal->cstime; tms->tms_utime = nsec_to_clock_t(tgutime); tms->tms_stime = nsec_to_clock_t(tgstime); tms->tms_cutime = nsec_to_clock_t(cutime); tms->tms_cstime = nsec_to_clock_t(cstime); } SYSCALL_DEFINE1(times, struct tms __user *, tbuf) { if (tbuf) { struct tms tmp; do_sys_times(&tmp); if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) return -EFAULT; } force_successful_syscall_return(); return (long) jiffies_64_to_clock_t(get_jiffies_64()); } #ifdef CONFIG_COMPAT static compat_clock_t clock_t_to_compat_clock_t(clock_t x) { return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); } COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf) { if (tbuf) { struct tms tms; struct compat_tms tmp; do_sys_times(&tms); /* Convert our struct tms to the compat version. 
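 *
 * (For reference, a userspace sketch of the native interface being
 * converted here:
 *
 *	struct tms t;
 *	clock_t elapsed = times(&t);		// elapsed real time, in clock ticks
 *	long hz = sysconf(_SC_CLK_TCK);		// ticks per second
 *	double user_sec = (double)t.tms_utime / hz;
 *
 * the compat path below converts these tick counts to the 32-bit
 * compat_clock_t.)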
*/ tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime); tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime); tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime); tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime); if (copy_to_user(tbuf, &tmp, sizeof(tmp))) return -EFAULT; } force_successful_syscall_return(); return compat_jiffies_to_clock_t(jiffies); } #endif /* * This needs some heavy checking ... * I just haven't the stomach for it. I also don't fully * understand sessions/pgrp etc. Let somebody who does explain it. * * OK, I think I have the protection semantics right.... this is really * only important on a multi-user system anyway, to make sure one user * can't send a signal to a process owned by another. -TYT, 12/12/91 * * !PF_FORKNOEXEC check to conform completely to POSIX. */ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) { struct task_struct *p; struct task_struct *group_leader = current->group_leader; struct pid *pids[PIDTYPE_MAX] = { 0 }; struct pid *pgrp; int err; if (!pid) pid = task_pid_vnr(group_leader); if (!pgid) pgid = pid; if (pgid < 0) return -EINVAL; rcu_read_lock(); /* From this point forward we keep holding onto the tasklist lock * so that our parent does not change from under us. -DaveM */ write_lock_irq(&tasklist_lock); err = -ESRCH; p = find_task_by_vpid(pid); if (!p) goto out; err = -EINVAL; if (!thread_group_leader(p)) goto out; if (same_thread_group(p->real_parent, group_leader)) { err = -EPERM; if (task_session(p) != task_session(group_leader)) goto out; err = -EACCES; if (!(p->flags & PF_FORKNOEXEC)) goto out; } else { err = -ESRCH; if (p != group_leader) goto out; } err = -EPERM; if (p->signal->leader) goto out; pgrp = task_pid(p); if (pgid != pid) { struct task_struct *g; pgrp = find_vpid(pgid); g = pid_task(pgrp, PIDTYPE_PGID); if (!g || task_session(g) != task_session(group_leader)) goto out; } err = security_task_setpgid(p, pgid); if (err) goto out; if (task_pgrp(p) != pgrp) change_pid(pids, p, PIDTYPE_PGID, pgrp); err = 0; out: /* All paths lead to here, thus we are safe. 
-DaveM */ write_unlock_irq(&tasklist_lock); rcu_read_unlock(); free_pids(pids); return err; } static int do_getpgid(pid_t pid) { struct task_struct *p; struct pid *grp; int retval; rcu_read_lock(); if (!pid) grp = task_pgrp(current); else { retval = -ESRCH; p = find_task_by_vpid(pid); if (!p) goto out; grp = task_pgrp(p); if (!grp) goto out; retval = security_task_getpgid(p); if (retval) goto out; } retval = pid_vnr(grp); out: rcu_read_unlock(); return retval; } SYSCALL_DEFINE1(getpgid, pid_t, pid) { return do_getpgid(pid); } #ifdef __ARCH_WANT_SYS_GETPGRP SYSCALL_DEFINE0(getpgrp) { return do_getpgid(0); } #endif SYSCALL_DEFINE1(getsid, pid_t, pid) { struct task_struct *p; struct pid *sid; int retval; rcu_read_lock(); if (!pid) sid = task_session(current); else { retval = -ESRCH; p = find_task_by_vpid(pid); if (!p) goto out; sid = task_session(p); if (!sid) goto out; retval = security_task_getsid(p); if (retval) goto out; } retval = pid_vnr(sid); out: rcu_read_unlock(); return retval; } static void set_special_pids(struct pid **pids, struct pid *pid) { struct task_struct *curr = current->group_leader; if (task_session(curr) != pid) change_pid(pids, curr, PIDTYPE_SID, pid); if (task_pgrp(curr) != pid) change_pid(pids, curr, PIDTYPE_PGID, pid); } int ksys_setsid(void) { struct task_struct *group_leader = current->group_leader; struct pid *sid = task_pid(group_leader); struct pid *pids[PIDTYPE_MAX] = { 0 }; pid_t session = pid_vnr(sid); int err = -EPERM; write_lock_irq(&tasklist_lock); /* Fail if I am already a session leader */ if (group_leader->signal->leader) goto out; /* Fail if a process group id already exists that equals the * proposed session id. */ if (pid_task(sid, PIDTYPE_PGID)) goto out; group_leader->signal->leader = 1; set_special_pids(pids, sid); proc_clear_tty(group_leader); err = session; out: write_unlock_irq(&tasklist_lock); free_pids(pids); if (err > 0) { proc_sid_connector(group_leader); sched_autogroup_create_attach(group_leader); } return err; } SYSCALL_DEFINE0(setsid) { return ksys_setsid(); } DECLARE_RWSEM(uts_sem); #ifdef COMPAT_UTS_MACHINE #define override_architecture(name) \ (personality(current->personality) == PER_LINUX32 && \ copy_to_user(name->machine, COMPAT_UTS_MACHINE, \ sizeof(COMPAT_UTS_MACHINE))) #else #define override_architecture(name) 0 #endif /* * Work around broken programs that cannot handle "Linux 3.0". * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 * And we map 4.x and later versions to 2.6.60+x, so 4.0/5.0/6.0/... would be * 2.6.60. */ static int override_release(char __user *release, size_t len) { int ret = 0; if (current->personality & UNAME26) { const char *rest = UTS_RELEASE; char buf[65] = { 0 }; int ndots = 0; unsigned v; size_t copy; while (*rest) { if (*rest == '.' 
&& ++ndots >= 3) break; if (!isdigit(*rest) && *rest != '.') break; rest++; } v = LINUX_VERSION_PATCHLEVEL + 60; copy = clamp_t(size_t, len, 1, sizeof(buf)); copy = scnprintf(buf, copy, "2.6.%u%s", v, rest); ret = copy_to_user(release, buf, copy + 1); } return ret; } SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) { struct new_utsname tmp; down_read(&uts_sem); memcpy(&tmp, utsname(), sizeof(tmp)); up_read(&uts_sem); if (copy_to_user(name, &tmp, sizeof(tmp))) return -EFAULT; if (override_release(name->release, sizeof(name->release))) return -EFAULT; if (override_architecture(name)) return -EFAULT; return 0; } #ifdef __ARCH_WANT_SYS_OLD_UNAME /* * Old cruft */ SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) { struct old_utsname tmp; if (!name) return -EFAULT; down_read(&uts_sem); memcpy(&tmp, utsname(), sizeof(tmp)); up_read(&uts_sem); if (copy_to_user(name, &tmp, sizeof(tmp))) return -EFAULT; if (override_release(name->release, sizeof(name->release))) return -EFAULT; if (override_architecture(name)) return -EFAULT; return 0; } SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) { struct oldold_utsname tmp; if (!name) return -EFAULT; memset(&tmp, 0, sizeof(tmp)); down_read(&uts_sem); memcpy(&tmp.sysname, &utsname()->sysname, __OLD_UTS_LEN); memcpy(&tmp.nodename, &utsname()->nodename, __OLD_UTS_LEN); memcpy(&tmp.release, &utsname()->release, __OLD_UTS_LEN); memcpy(&tmp.version, &utsname()->version, __OLD_UTS_LEN); memcpy(&tmp.machine, &utsname()->machine, __OLD_UTS_LEN); up_read(&uts_sem); if (copy_to_user(name, &tmp, sizeof(tmp))) return -EFAULT; if (override_architecture(name)) return -EFAULT; if (override_release(name->release, sizeof(name->release))) return -EFAULT; return 0; } #endif SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) { int errno; char tmp[__NEW_UTS_LEN]; if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { struct new_utsname *u; add_device_randomness(tmp, len); down_write(&uts_sem); u = utsname(); memcpy(u->nodename, tmp, len); memset(u->nodename + len, 0, sizeof(u->nodename) - len); errno = 0; uts_proc_notify(UTS_PROC_HOSTNAME); up_write(&uts_sem); } return errno; } #ifdef __ARCH_WANT_SYS_GETHOSTNAME SYSCALL_DEFINE2(gethostname, char __user *, name, int, len) { int i; struct new_utsname *u; char tmp[__NEW_UTS_LEN + 1]; if (len < 0) return -EINVAL; down_read(&uts_sem); u = utsname(); i = 1 + strlen(u->nodename); if (i > len) i = len; memcpy(tmp, u->nodename, i); up_read(&uts_sem); if (copy_to_user(name, tmp, i)) return -EFAULT; return 0; } #endif /* * Only setdomainname; getdomainname can be implemented by calling * uname() */ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) { int errno; char tmp[__NEW_UTS_LEN]; if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { struct new_utsname *u; add_device_randomness(tmp, len); down_write(&uts_sem); u = utsname(); memcpy(u->domainname, tmp, len); memset(u->domainname + len, 0, sizeof(u->domainname) - len); errno = 0; uts_proc_notify(UTS_PROC_DOMAINNAME); up_write(&uts_sem); } return errno; } /* make sure you are allowed to change @tsk limits before calling this */ static int do_prlimit(struct task_struct *tsk, unsigned int resource, struct rlimit *new_rlim, struct rlimit *old_rlim) { struct rlimit *rlim; 
int retval = 0; if (resource >= RLIM_NLIMITS) return -EINVAL; resource = array_index_nospec(resource, RLIM_NLIMITS); if (new_rlim) { if (new_rlim->rlim_cur > new_rlim->rlim_max) return -EINVAL; if (resource == RLIMIT_NOFILE && new_rlim->rlim_max > sysctl_nr_open) return -EPERM; } /* Holding a refcount on tsk protects tsk->signal from disappearing. */ rlim = tsk->signal->rlim + resource; task_lock(tsk->group_leader); if (new_rlim) { /* * Keep the capable check against init_user_ns until cgroups can * contain all limits. */ if (new_rlim->rlim_max > rlim->rlim_max && !capable(CAP_SYS_RESOURCE)) retval = -EPERM; if (!retval) retval = security_task_setrlimit(tsk, resource, new_rlim); } if (!retval) { if (old_rlim) *old_rlim = *rlim; if (new_rlim) *rlim = *new_rlim; } task_unlock(tsk->group_leader); /* * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not * infinite. In case of RLIM_INFINITY the posix CPU timer code * ignores the rlimit. */ if (!retval && new_rlim && resource == RLIMIT_CPU && new_rlim->rlim_cur != RLIM_INFINITY && IS_ENABLED(CONFIG_POSIX_TIMERS)) { /* * update_rlimit_cpu can fail if the task is exiting, but there * may be other tasks in the thread group that are not exiting, * and they need their cpu timers adjusted. * * The group_leader is the last task to be released, so if we * cannot update_rlimit_cpu on it, then the entire process is * exiting and we do not need to update at all. */ update_rlimit_cpu(tsk->group_leader, new_rlim->rlim_cur); } return retval; } SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) { struct rlimit value; int ret; ret = do_prlimit(current, resource, NULL, &value); if (!ret) ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; return ret; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct compat_rlimit __user *, rlim) { struct rlimit r; struct compat_rlimit r32; if (copy_from_user(&r32, rlim, sizeof(struct compat_rlimit))) return -EFAULT; if (r32.rlim_cur == COMPAT_RLIM_INFINITY) r.rlim_cur = RLIM_INFINITY; else r.rlim_cur = r32.rlim_cur; if (r32.rlim_max == COMPAT_RLIM_INFINITY) r.rlim_max = RLIM_INFINITY; else r.rlim_max = r32.rlim_max; return do_prlimit(current, resource, &r, NULL); } COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct compat_rlimit __user *, rlim) { struct rlimit r; int ret; ret = do_prlimit(current, resource, NULL, &r); if (!ret) { struct compat_rlimit r32; if (r.rlim_cur > COMPAT_RLIM_INFINITY) r32.rlim_cur = COMPAT_RLIM_INFINITY; else r32.rlim_cur = r.rlim_cur; if (r.rlim_max > COMPAT_RLIM_INFINITY) r32.rlim_max = COMPAT_RLIM_INFINITY; else r32.rlim_max = r.rlim_max; if (copy_to_user(rlim, &r32, sizeof(struct compat_rlimit))) return -EFAULT; } return ret; } #endif #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT /* * Back compatibility for getrlimit. Needed for some apps. */ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, struct rlimit __user *, rlim) { struct rlimit x; if (resource >= RLIM_NLIMITS) return -EINVAL; resource = array_index_nospec(resource, RLIM_NLIMITS); task_lock(current->group_leader); x = current->signal->rlim[resource]; task_unlock(current->group_leader); if (x.rlim_cur > 0x7FFFFFFF) x.rlim_cur = 0x7FFFFFFF; if (x.rlim_max > 0x7FFFFFFF) x.rlim_max = 0x7FFFFFFF; return copy_to_user(rlim, &x, sizeof(x)) ? 
-EFAULT : 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, struct compat_rlimit __user *, rlim) { struct rlimit r; if (resource >= RLIM_NLIMITS) return -EINVAL; resource = array_index_nospec(resource, RLIM_NLIMITS); task_lock(current->group_leader); r = current->signal->rlim[resource]; task_unlock(current->group_leader); if (r.rlim_cur > 0x7FFFFFFF) r.rlim_cur = 0x7FFFFFFF; if (r.rlim_max > 0x7FFFFFFF) r.rlim_max = 0x7FFFFFFF; if (put_user(r.rlim_cur, &rlim->rlim_cur) || put_user(r.rlim_max, &rlim->rlim_max)) return -EFAULT; return 0; } #endif #endif static inline bool rlim64_is_infinity(__u64 rlim64) { #if BITS_PER_LONG < 64 return rlim64 >= ULONG_MAX; #else return rlim64 == RLIM64_INFINITY; #endif } static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64) { if (rlim->rlim_cur == RLIM_INFINITY) rlim64->rlim_cur = RLIM64_INFINITY; else rlim64->rlim_cur = rlim->rlim_cur; if (rlim->rlim_max == RLIM_INFINITY) rlim64->rlim_max = RLIM64_INFINITY; else rlim64->rlim_max = rlim->rlim_max; } static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim) { if (rlim64_is_infinity(rlim64->rlim_cur)) rlim->rlim_cur = RLIM_INFINITY; else rlim->rlim_cur = (unsigned long)rlim64->rlim_cur; if (rlim64_is_infinity(rlim64->rlim_max)) rlim->rlim_max = RLIM_INFINITY; else rlim->rlim_max = (unsigned long)rlim64->rlim_max; } /* rcu lock must be held */ static int check_prlimit_permission(struct task_struct *task, unsigned int flags) { const struct cred *cred = current_cred(), *tcred; bool id_match; if (current == task) return 0; tcred = __task_cred(task); id_match = (uid_eq(cred->uid, tcred->euid) && uid_eq(cred->uid, tcred->suid) && uid_eq(cred->uid, tcred->uid) && gid_eq(cred->gid, tcred->egid) && gid_eq(cred->gid, tcred->sgid) && gid_eq(cred->gid, tcred->gid)); if (!id_match && !ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) return -EPERM; return security_task_prlimit(cred, tcred, flags); } SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, const struct rlimit64 __user *, new_rlim, struct rlimit64 __user *, old_rlim) { struct rlimit64 old64, new64; struct rlimit old, new; struct task_struct *tsk; unsigned int checkflags = 0; bool need_tasklist; int ret; if (old_rlim) checkflags |= LSM_PRLIMIT_READ; if (new_rlim) { if (copy_from_user(&new64, new_rlim, sizeof(new64))) return -EFAULT; rlim64_to_rlim(&new64, &new); checkflags |= LSM_PRLIMIT_WRITE; } rcu_read_lock(); tsk = pid ? find_task_by_vpid(pid) : current; if (!tsk) { rcu_read_unlock(); return -ESRCH; } ret = check_prlimit_permission(tsk, checkflags); if (ret) { rcu_read_unlock(); return ret; } get_task_struct(tsk); rcu_read_unlock(); need_tasklist = !same_thread_group(tsk, current); if (need_tasklist) { /* * Ensure we can't race with group exit or de_thread(), * so tsk->group_leader can't be freed or changed until * read_unlock(tasklist_lock) below. */ read_lock(&tasklist_lock); if (!pid_alive(tsk)) ret = -ESRCH; } if (!ret) { ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, old_rlim ? 
&old : NULL); } if (need_tasklist) read_unlock(&tasklist_lock); if (!ret && old_rlim) { rlim_to_rlim64(&old, &old64); if (copy_to_user(old_rlim, &old64, sizeof(old64))) ret = -EFAULT; } put_task_struct(tsk); return ret; } SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) { struct rlimit new_rlim; if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) return -EFAULT; return do_prlimit(current, resource, &new_rlim, NULL); } /* * It would make sense to put struct rusage in the task_struct, * except that would make the task_struct be *really big*. After * task_struct gets moved into malloc'ed memory, it would * make sense to do this. It will make moving the rest of the information * a lot simpler! (Which we're not doing right now because we're not * measuring them yet). * * When sampling multiple threads for RUSAGE_SELF, under SMP we might have * races with threads incrementing their own counters. But since word * reads are atomic, we either get new values or old values and we don't * care which for the sums. We always take the siglock to protect reading * the c* fields from p->signal from races with exit.c updating those * fields when reaping, so a sample either gets all the additions of a * given child after it's reaped, or none so this sample is before reaping. * * Locking: * We need to take the siglock for CHILDEREN, SELF and BOTH * for the cases current multithreaded, non-current single threaded * non-current multithreaded. Thread traversal is now safe with * the siglock held. * Strictly speaking, we donot need to take the siglock if we are current and * single threaded, as no one else can take our signal_struct away, no one * else can reap the children to update signal->c* counters, and no one else * can race with the signal-> fields. If we do not take any lock, the * signal-> fields could be read out of order while another thread was just * exiting. So we should place a read memory barrier when we avoid the lock. * On the writer side, write memory barrier is implied in __exit_signal * as __exit_signal releases the siglock spinlock after updating the signal-> * fields. But we don't do this yet to keep things simple. 
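 *
 * (For reference, the userspace view of this interface - a sketch, not
 * part of this file:
 *
 *	struct rusage ru;
 *	if (getrusage(RUSAGE_SELF, &ru) == 0)
 *		printf("user %lds, peak RSS %ld kB\n",
 *		       (long)ru.ru_utime.tv_sec, ru.ru_maxrss);
 *
 * RUSAGE_CHILDREN only covers children that have already been reaped,
 * i.e. the signal->c* fields accumulated at wait() time as described
 * above.)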
* */ static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r) { r->ru_nvcsw += t->nvcsw; r->ru_nivcsw += t->nivcsw; r->ru_minflt += t->min_flt; r->ru_majflt += t->maj_flt; r->ru_inblock += task_io_get_inblock(t); r->ru_oublock += task_io_get_oublock(t); } void getrusage(struct task_struct *p, int who, struct rusage *r) { struct task_struct *t; unsigned long flags; u64 tgutime, tgstime, utime, stime; unsigned long maxrss; struct mm_struct *mm; struct signal_struct *sig = p->signal; unsigned int seq = 0; retry: memset(r, 0, sizeof(*r)); utime = stime = 0; maxrss = 0; if (who == RUSAGE_THREAD) { task_cputime_adjusted(current, &utime, &stime); accumulate_thread_rusage(p, r); maxrss = sig->maxrss; goto out_thread; } flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); switch (who) { case RUSAGE_BOTH: case RUSAGE_CHILDREN: utime = sig->cutime; stime = sig->cstime; r->ru_nvcsw = sig->cnvcsw; r->ru_nivcsw = sig->cnivcsw; r->ru_minflt = sig->cmin_flt; r->ru_majflt = sig->cmaj_flt; r->ru_inblock = sig->cinblock; r->ru_oublock = sig->coublock; maxrss = sig->cmaxrss; if (who == RUSAGE_CHILDREN) break; fallthrough; case RUSAGE_SELF: r->ru_nvcsw += sig->nvcsw; r->ru_nivcsw += sig->nivcsw; r->ru_minflt += sig->min_flt; r->ru_majflt += sig->maj_flt; r->ru_inblock += sig->inblock; r->ru_oublock += sig->oublock; if (maxrss < sig->maxrss) maxrss = sig->maxrss; rcu_read_lock(); __for_each_thread(sig, t) accumulate_thread_rusage(t, r); rcu_read_unlock(); break; default: BUG(); } if (need_seqretry(&sig->stats_lock, seq)) { seq = 1; goto retry; } done_seqretry_irqrestore(&sig->stats_lock, seq, flags); if (who == RUSAGE_CHILDREN) goto out_children; thread_group_cputime_adjusted(p, &tgutime, &tgstime); utime += tgutime; stime += tgstime; out_thread: mm = get_task_mm(p); if (mm) { setmax_mm_hiwater_rss(&maxrss, mm); mmput(mm); } out_children: r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ r->ru_utime = ns_to_kernel_old_timeval(utime); r->ru_stime = ns_to_kernel_old_timeval(stime); } SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) { struct rusage r; if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && who != RUSAGE_THREAD) return -EINVAL; getrusage(current, who, &r); return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru) { struct rusage r; if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && who != RUSAGE_THREAD) return -EINVAL; getrusage(current, who, &r); return put_compat_rusage(&r, ru); } #endif SYSCALL_DEFINE1(umask, int, mask) { mask = xchg(¤t->fs->umask, mask & S_IRWXUGO); return mask; } static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { CLASS(fd, exe)(fd); struct inode *inode; int err; if (fd_empty(exe)) return -EBADF; inode = file_inode(fd_file(exe)); /* * Because the original mm->exe_file points to executable file, make * sure that this one is executable as well, to avoid breaking an * overall picture. */ if (!S_ISREG(inode->i_mode) || path_noexec(&fd_file(exe)->f_path)) return -EACCES; err = file_permission(fd_file(exe), MAY_EXEC); if (err) return err; return replace_mm_exe_file(mm, fd_file(exe)); } /* * Check arithmetic relations of passed addresses. * * WARNING: we don't require any capability here so be very careful * in what is allowed for modification from userspace. 
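 *
 * Concretely, a map whose members fall outside [mmap_min_addr, TASK_SIZE)
 * or whose pairs are out of order is rejected, e.g. (values hypothetical):
 *
 *	map.start_code = 0x402000;
 *	map.end_code   = 0x401000;	// violates start_code < end_code -> -EINVAL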
*/ static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map) { unsigned long mmap_max_addr = TASK_SIZE; int error = -EINVAL, i; static const unsigned char offsets[] = { offsetof(struct prctl_mm_map, start_code), offsetof(struct prctl_mm_map, end_code), offsetof(struct prctl_mm_map, start_data), offsetof(struct prctl_mm_map, end_data), offsetof(struct prctl_mm_map, start_brk), offsetof(struct prctl_mm_map, brk), offsetof(struct prctl_mm_map, start_stack), offsetof(struct prctl_mm_map, arg_start), offsetof(struct prctl_mm_map, arg_end), offsetof(struct prctl_mm_map, env_start), offsetof(struct prctl_mm_map, env_end), }; /* * Make sure the members are not somewhere outside * of allowed address space. */ for (i = 0; i < ARRAY_SIZE(offsets); i++) { u64 val = *(u64 *)((char *)prctl_map + offsets[i]); if ((unsigned long)val >= mmap_max_addr || (unsigned long)val < mmap_min_addr) goto out; } /* * Make sure the pairs are ordered. */ #define __prctl_check_order(__m1, __op, __m2) \ ((unsigned long)prctl_map->__m1 __op \ (unsigned long)prctl_map->__m2) ? 0 : -EINVAL error = __prctl_check_order(start_code, <, end_code); error |= __prctl_check_order(start_data,<=, end_data); error |= __prctl_check_order(start_brk, <=, brk); error |= __prctl_check_order(arg_start, <=, arg_end); error |= __prctl_check_order(env_start, <=, env_end); if (error) goto out; #undef __prctl_check_order error = -EINVAL; /* * Neither we should allow to override limits if they set. */ if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk, prctl_map->start_brk, prctl_map->end_data, prctl_map->start_data)) goto out; error = 0; out: return error; } #ifdef CONFIG_CHECKPOINT_RESTORE static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size) { struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, }; unsigned long user_auxv[AT_VECTOR_SIZE]; struct mm_struct *mm = current->mm; int error; BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256); if (opt == PR_SET_MM_MAP_SIZE) return put_user((unsigned int)sizeof(prctl_map), (unsigned int __user *)addr); if (data_size != sizeof(prctl_map)) return -EINVAL; if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) return -EFAULT; error = validate_prctl_map_addr(&prctl_map); if (error) return error; if (prctl_map.auxv_size) { /* * Someone is trying to cheat the auxv vector. */ if (!prctl_map.auxv || prctl_map.auxv_size > sizeof(mm->saved_auxv)) return -EINVAL; memset(user_auxv, 0, sizeof(user_auxv)); if (copy_from_user(user_auxv, (const void __user *)prctl_map.auxv, prctl_map.auxv_size)) return -EFAULT; /* Last entry must be AT_NULL as specification requires */ user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL; user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; } if (prctl_map.exe_fd != (u32)-1) { /* * Check if the current user is checkpoint/restore capable. * At the time of this writing, it checks for CAP_SYS_ADMIN * or CAP_CHECKPOINT_RESTORE. * Note that a user with access to ptrace can masquerade an * arbitrary program as any executable, even setuid ones. * This may have implications in the tomoyo subsystem. */ if (!checkpoint_restore_ns_capable(current_user_ns())) return -EPERM; error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); if (error) return error; } /* * arg_lock protects concurrent updates but we still need mmap_lock for * read to exclude races with sys_brk. 
*/ mmap_read_lock(mm); /* * We don't validate if these members are pointing to * real present VMAs because application may have correspond * VMAs already unmapped and kernel uses these members for statistics * output in procfs mostly, except * * - @start_brk/@brk which are used in do_brk_flags but kernel lookups * for VMAs when updating these members so anything wrong written * here cause kernel to swear at userspace program but won't lead * to any problem in kernel itself */ spin_lock(&mm->arg_lock); mm->start_code = prctl_map.start_code; mm->end_code = prctl_map.end_code; mm->start_data = prctl_map.start_data; mm->end_data = prctl_map.end_data; mm->start_brk = prctl_map.start_brk; mm->brk = prctl_map.brk; mm->start_stack = prctl_map.start_stack; mm->arg_start = prctl_map.arg_start; mm->arg_end = prctl_map.arg_end; mm->env_start = prctl_map.env_start; mm->env_end = prctl_map.env_end; spin_unlock(&mm->arg_lock); /* * Note this update of @saved_auxv is lockless thus * if someone reads this member in procfs while we're * updating -- it may get partly updated results. It's * known and acceptable trade off: we leave it as is to * not introduce additional locks here making the kernel * more complex. */ if (prctl_map.auxv_size) memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); mmap_read_unlock(mm); return 0; } #endif /* CONFIG_CHECKPOINT_RESTORE */ static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr, unsigned long len) { /* * This doesn't move the auxiliary vector itself since it's pinned to * mm_struct, but it permits filling the vector with new values. It's * up to the caller to provide sane values here, otherwise userspace * tools which use this vector might be unhappy. */ unsigned long user_auxv[AT_VECTOR_SIZE] = {}; if (len > sizeof(user_auxv)) return -EINVAL; if (copy_from_user(user_auxv, (const void __user *)addr, len)) return -EFAULT; /* Make sure the last entry is always AT_NULL */ user_auxv[AT_VECTOR_SIZE - 2] = 0; user_auxv[AT_VECTOR_SIZE - 1] = 0; BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); task_lock(current); memcpy(mm->saved_auxv, user_auxv, len); task_unlock(current); return 0; } static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { struct mm_struct *mm = current->mm; struct prctl_mm_map prctl_map = { .auxv = NULL, .auxv_size = 0, .exe_fd = -1, }; struct vm_area_struct *vma; int error; if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV && opt != PR_SET_MM_MAP && opt != PR_SET_MM_MAP_SIZE))) return -EINVAL; #ifdef CONFIG_CHECKPOINT_RESTORE if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE) return prctl_set_mm_map(opt, (const void __user *)addr, arg4); #endif if (!capable(CAP_SYS_RESOURCE)) return -EPERM; if (opt == PR_SET_MM_EXE_FILE) return prctl_set_mm_exe_file(mm, (unsigned int)addr); if (opt == PR_SET_MM_AUXV) return prctl_set_auxv(mm, addr, arg4); if (addr >= TASK_SIZE || addr < mmap_min_addr) return -EINVAL; error = -EINVAL; /* * arg_lock protects concurrent updates of arg boundaries, we need * mmap_lock for a) concurrent sys_brk, b) finding VMA for addr * validation. 
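 *
 * (Userspace sketch, buffer name hypothetical: a CAP_SYS_RESOURCE process
 * can repoint its /proc/<pid>/cmdline window at a buffer it owns, e.g. to
 * publish a descriptive process title:
 *
 *	static char title[256] = "mydaemon: idle";	// must lie in a mapped VMA
 *
 *	prctl(PR_SET_MM, PR_SET_MM_ARG_START, (unsigned long)title, 0, 0);
 *	prctl(PR_SET_MM, PR_SET_MM_ARG_END,
 *	      (unsigned long)title + sizeof(title), 0, 0);
 *
 * each call is validated against the current value of the other boundary,
 * so the order of the two calls can matter.)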
*/ mmap_read_lock(mm); vma = find_vma(mm, addr); spin_lock(&mm->arg_lock); prctl_map.start_code = mm->start_code; prctl_map.end_code = mm->end_code; prctl_map.start_data = mm->start_data; prctl_map.end_data = mm->end_data; prctl_map.start_brk = mm->start_brk; prctl_map.brk = mm->brk; prctl_map.start_stack = mm->start_stack; prctl_map.arg_start = mm->arg_start; prctl_map.arg_end = mm->arg_end; prctl_map.env_start = mm->env_start; prctl_map.env_end = mm->env_end; switch (opt) { case PR_SET_MM_START_CODE: prctl_map.start_code = addr; break; case PR_SET_MM_END_CODE: prctl_map.end_code = addr; break; case PR_SET_MM_START_DATA: prctl_map.start_data = addr; break; case PR_SET_MM_END_DATA: prctl_map.end_data = addr; break; case PR_SET_MM_START_STACK: prctl_map.start_stack = addr; break; case PR_SET_MM_START_BRK: prctl_map.start_brk = addr; break; case PR_SET_MM_BRK: prctl_map.brk = addr; break; case PR_SET_MM_ARG_START: prctl_map.arg_start = addr; break; case PR_SET_MM_ARG_END: prctl_map.arg_end = addr; break; case PR_SET_MM_ENV_START: prctl_map.env_start = addr; break; case PR_SET_MM_ENV_END: prctl_map.env_end = addr; break; default: goto out; } error = validate_prctl_map_addr(&prctl_map); if (error) goto out; switch (opt) { /* * If command line arguments and environment * are placed somewhere else on stack, we can * set them up here, ARG_START/END to setup * command line arguments and ENV_START/END * for environment. */ case PR_SET_MM_START_STACK: case PR_SET_MM_ARG_START: case PR_SET_MM_ARG_END: case PR_SET_MM_ENV_START: case PR_SET_MM_ENV_END: if (!vma) { error = -EFAULT; goto out; } } mm->start_code = prctl_map.start_code; mm->end_code = prctl_map.end_code; mm->start_data = prctl_map.start_data; mm->end_data = prctl_map.end_data; mm->start_brk = prctl_map.start_brk; mm->brk = prctl_map.brk; mm->start_stack = prctl_map.start_stack; mm->arg_start = prctl_map.arg_start; mm->arg_end = prctl_map.arg_end; mm->env_start = prctl_map.env_start; mm->env_end = prctl_map.env_end; error = 0; out: spin_unlock(&mm->arg_lock); mmap_read_unlock(mm); return error; } #ifdef CONFIG_CHECKPOINT_RESTORE static int prctl_get_tid_address(struct task_struct *me, int __user * __user *tid_addr) { return put_user(me->clear_child_tid, tid_addr); } #else static int prctl_get_tid_address(struct task_struct *me, int __user * __user *tid_addr) { return -EINVAL; } #endif static int propagate_has_child_subreaper(struct task_struct *p, void *data) { /* * If task has has_child_subreaper - all its descendants * already have these flag too and new descendants will * inherit it on fork, skip them. * * If we've found child_reaper - skip descendants in * it's subtree as they will never get out pidns. 
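 *
 * (Userspace context, sketch: a service manager typically marks itself as
 * a child subreaper so that orphaned descendants are reparented to it
 * rather than to init:
 *
 *	prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0);
 *	// double-forked descendants now become our children and can be
 *	// reaped here with waitpid()
 *
 * the tree walk below merely propagates the has_child_subreaper hint to
 * existing descendants.)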
*/ if (p->signal->has_child_subreaper || is_child_reaper(task_pid(p))) return 0; p->signal->has_child_subreaper = 1; return 1; } int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which) { return -EINVAL; } int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, unsigned long ctrl) { return -EINVAL; } int __weak arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status) { return -EINVAL; } int __weak arch_set_shadow_stack_status(struct task_struct *t, unsigned long status) { return -EINVAL; } int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status) { return -EINVAL; } int __weak arch_prctl_get_branch_landing_pad_state(struct task_struct *t, unsigned long __user *state) { return -EINVAL; } int __weak arch_prctl_set_branch_landing_pad_state(struct task_struct *t, unsigned long state) { return -EINVAL; } int __weak arch_prctl_lock_branch_landing_pad_state(struct task_struct *t) { return -EINVAL; } #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) static int prctl_set_vma(unsigned long opt, unsigned long addr, unsigned long size, unsigned long arg) { int error; switch (opt) { case PR_SET_VMA_ANON_NAME: error = set_anon_vma_name(addr, size, (const char __user *)arg); break; default: error = -EINVAL; } return error; } static inline unsigned long get_current_mdwe(void) { unsigned long ret = 0; if (mm_flags_test(MMF_HAS_MDWE, current->mm)) ret |= PR_MDWE_REFUSE_EXEC_GAIN; if (mm_flags_test(MMF_HAS_MDWE_NO_INHERIT, current->mm)) ret |= PR_MDWE_NO_INHERIT; return ret; } static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3, unsigned long arg4, unsigned long arg5) { unsigned long current_bits; if (arg3 || arg4 || arg5) return -EINVAL; if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT)) return -EINVAL; /* NO_INHERIT only makes sense with REFUSE_EXEC_GAIN */ if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN)) return -EINVAL; /* * EOPNOTSUPP might be more appropriate here in principle, but * existing userspace depends on EINVAL specifically. */ if (!arch_memory_deny_write_exec_supported()) return -EINVAL; current_bits = get_current_mdwe(); if (current_bits && current_bits != bits) return -EPERM; /* Cannot unset the flags */ if (bits & PR_MDWE_NO_INHERIT) mm_flags_set(MMF_HAS_MDWE_NO_INHERIT, current->mm); if (bits & PR_MDWE_REFUSE_EXEC_GAIN) mm_flags_set(MMF_HAS_MDWE, current->mm); return 0; } static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { if (arg2 || arg3 || arg4 || arg5) return -EINVAL; return get_current_mdwe(); } static int prctl_get_auxv(void __user *addr, unsigned long len) { struct mm_struct *mm = current->mm; unsigned long size = min_t(unsigned long, sizeof(mm->saved_auxv), len); if (size && copy_to_user(addr, mm->saved_auxv, size)) return -EFAULT; return sizeof(mm->saved_auxv); } static int prctl_get_thp_disable(unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { struct mm_struct *mm = current->mm; if (arg2 || arg3 || arg4 || arg5) return -EINVAL; /* If disabled, we return "1 | flags", otherwise 0. 
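 *
 * (Userspace sketch of the corresponding calls:
 *
 *	prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0);		// disable THP entirely
 *	int state = prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0);
 *	// state == 1 here; 0 means THP is not disabled, and
 *	// 1 | PR_THP_DISABLE_EXCEPT_ADVISED means disabled except where
 *	// userspace explicitly advised huge pages.)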
*/ if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm)) return 1; else if (mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, mm)) return 1 | PR_THP_DISABLE_EXCEPT_ADVISED; return 0; } static int prctl_set_thp_disable(bool thp_disable, unsigned long flags, unsigned long arg4, unsigned long arg5) { struct mm_struct *mm = current->mm; if (arg4 || arg5) return -EINVAL; /* Flags are only allowed when disabling. */ if ((!thp_disable && flags) || (flags & ~PR_THP_DISABLE_EXCEPT_ADVISED)) return -EINVAL; if (mmap_write_lock_killable(current->mm)) return -EINTR; if (thp_disable) { if (flags & PR_THP_DISABLE_EXCEPT_ADVISED) { mm_flags_clear(MMF_DISABLE_THP_COMPLETELY, mm); mm_flags_set(MMF_DISABLE_THP_EXCEPT_ADVISED, mm); } else { mm_flags_set(MMF_DISABLE_THP_COMPLETELY, mm); mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, mm); } } else { mm_flags_clear(MMF_DISABLE_THP_COMPLETELY, mm); mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, mm); } mmap_write_unlock(current->mm); return 0; } SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { struct task_struct *me = current; unsigned char comm[sizeof(me->comm)]; long error; error = security_task_prctl(option, arg2, arg3, arg4, arg5); if (error != -ENOSYS) return error; error = 0; switch (option) { case PR_SET_PDEATHSIG: if (!valid_signal(arg2)) { error = -EINVAL; break; } /* * Ensure that either: * * 1. Subsequent getppid() calls reflect the parent process having died. * 2. forget_original_parent() will send the new me->pdeath_signal. * * Also prevent the read of me->pdeath_signal from being a data race. */ read_lock(&tasklist_lock); me->pdeath_signal = arg2; read_unlock(&tasklist_lock); break; case PR_GET_PDEATHSIG: error = put_user(me->pdeath_signal, (int __user *)arg2); break; case PR_GET_DUMPABLE: error = get_dumpable(me->mm); break; case PR_SET_DUMPABLE: if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) { error = -EINVAL; break; } set_dumpable(me->mm, arg2); break; case PR_SET_UNALIGN: error = SET_UNALIGN_CTL(me, arg2); break; case PR_GET_UNALIGN: error = GET_UNALIGN_CTL(me, arg2); break; case PR_SET_FPEMU: error = SET_FPEMU_CTL(me, arg2); break; case PR_GET_FPEMU: error = GET_FPEMU_CTL(me, arg2); break; case PR_SET_FPEXC: error = SET_FPEXC_CTL(me, arg2); break; case PR_GET_FPEXC: error = GET_FPEXC_CTL(me, arg2); break; case PR_GET_TIMING: error = PR_TIMING_STATISTICAL; break; case PR_SET_TIMING: if (arg2 != PR_TIMING_STATISTICAL) error = -EINVAL; break; case PR_SET_NAME: comm[sizeof(me->comm) - 1] = 0; if (strncpy_from_user(comm, (char __user *)arg2, sizeof(me->comm) - 1) < 0) return -EFAULT; set_task_comm(me, comm); proc_comm_connector(me); break; case PR_GET_NAME: get_task_comm(comm, me); if (copy_to_user((char __user *)arg2, comm, sizeof(comm))) return -EFAULT; break; case PR_GET_ENDIAN: error = GET_ENDIAN(me, arg2); break; case PR_SET_ENDIAN: error = SET_ENDIAN(me, arg2); break; case PR_GET_SECCOMP: error = prctl_get_seccomp(); break; case PR_SET_SECCOMP: error = prctl_set_seccomp(arg2, (char __user *)arg3); break; case PR_GET_TSC: error = GET_TSC_CTL(arg2); break; case PR_SET_TSC: error = SET_TSC_CTL(arg2); break; case PR_TASK_PERF_EVENTS_DISABLE: error = perf_event_task_disable(); break; case PR_TASK_PERF_EVENTS_ENABLE: error = perf_event_task_enable(); break; case PR_GET_TIMERSLACK: if (current->timer_slack_ns > ULONG_MAX) error = ULONG_MAX; else error = current->timer_slack_ns; break; case PR_SET_TIMERSLACK: if (rt_or_dl_task_policy(current)) break; if (arg2 <= 0) 
current->timer_slack_ns = current->default_timer_slack_ns; else current->timer_slack_ns = arg2; break; case PR_MCE_KILL: if (arg4 | arg5) return -EINVAL; switch (arg2) { case PR_MCE_KILL_CLEAR: if (arg3 != 0) return -EINVAL; current->flags &= ~PF_MCE_PROCESS; break; case PR_MCE_KILL_SET: current->flags |= PF_MCE_PROCESS; if (arg3 == PR_MCE_KILL_EARLY) current->flags |= PF_MCE_EARLY; else if (arg3 == PR_MCE_KILL_LATE) current->flags &= ~PF_MCE_EARLY; else if (arg3 == PR_MCE_KILL_DEFAULT) current->flags &= ~(PF_MCE_EARLY|PF_MCE_PROCESS); else return -EINVAL; break; default: return -EINVAL; } break; case PR_MCE_KILL_GET: if (arg2 | arg3 | arg4 | arg5) return -EINVAL; if (current->flags & PF_MCE_PROCESS) error = (current->flags & PF_MCE_EARLY) ? PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; else error = PR_MCE_KILL_DEFAULT; break; case PR_SET_MM: error = prctl_set_mm(arg2, arg3, arg4, arg5); break; case PR_GET_TID_ADDRESS: error = prctl_get_tid_address(me, (int __user * __user *)arg2); break; case PR_SET_CHILD_SUBREAPER: me->signal->is_child_subreaper = !!arg2; if (!arg2) break; walk_process_tree(me, propagate_has_child_subreaper, NULL); break; case PR_GET_CHILD_SUBREAPER: error = put_user(me->signal->is_child_subreaper, (int __user *)arg2); break; case PR_SET_NO_NEW_PRIVS: if (arg2 != 1 || arg3 || arg4 || arg5) return -EINVAL; task_set_no_new_privs(current); break; case PR_GET_NO_NEW_PRIVS: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; return task_no_new_privs(current) ? 1 : 0; case PR_GET_THP_DISABLE: error = prctl_get_thp_disable(arg2, arg3, arg4, arg5); break; case PR_SET_THP_DISABLE: error = prctl_set_thp_disable(arg2, arg3, arg4, arg5); break; case PR_MPX_ENABLE_MANAGEMENT: case PR_MPX_DISABLE_MANAGEMENT: /* No longer implemented: */ return -EINVAL; case PR_SET_FP_MODE: error = SET_FP_MODE(me, arg2); break; case PR_GET_FP_MODE: error = GET_FP_MODE(me); break; case PR_SVE_SET_VL: error = SVE_SET_VL(arg2); break; case PR_SVE_GET_VL: error = SVE_GET_VL(); break; case PR_SME_SET_VL: error = SME_SET_VL(arg2); break; case PR_SME_GET_VL: error = SME_GET_VL(); break; case PR_GET_SPECULATION_CTRL: if (arg3 || arg4 || arg5) return -EINVAL; error = arch_prctl_spec_ctrl_get(me, arg2); break; case PR_SET_SPECULATION_CTRL: if (arg4 || arg5) return -EINVAL; error = arch_prctl_spec_ctrl_set(me, arg2, arg3); break; case PR_PAC_RESET_KEYS: if (arg3 || arg4 || arg5) return -EINVAL; error = PAC_RESET_KEYS(me, arg2); break; case PR_PAC_SET_ENABLED_KEYS: if (arg4 || arg5) return -EINVAL; error = PAC_SET_ENABLED_KEYS(me, arg2, arg3); break; case PR_PAC_GET_ENABLED_KEYS: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; error = PAC_GET_ENABLED_KEYS(me); break; case PR_SET_TAGGED_ADDR_CTRL: if (arg3 || arg4 || arg5) return -EINVAL; error = SET_TAGGED_ADDR_CTRL(arg2); break; case PR_GET_TAGGED_ADDR_CTRL: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; error = GET_TAGGED_ADDR_CTRL(); break; case PR_SET_IO_FLUSHER: if (!capable(CAP_SYS_RESOURCE)) return -EPERM; if (arg3 || arg4 || arg5) return -EINVAL; if (arg2 == 1) current->flags |= PR_IO_FLUSHER; else if (!arg2) current->flags &= ~PR_IO_FLUSHER; else return -EINVAL; break; case PR_GET_IO_FLUSHER: if (!capable(CAP_SYS_RESOURCE)) return -EPERM; if (arg2 || arg3 || arg4 || arg5) return -EINVAL; error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER; break; case PR_SET_SYSCALL_USER_DISPATCH: error = set_syscall_user_dispatch(arg2, arg3, arg4, (char __user *) arg5); break; #ifdef CONFIG_SCHED_CORE case PR_SCHED_CORE: error = sched_core_share_pid(arg2, arg3, arg4, 
arg5); break; #endif case PR_SET_MDWE: error = prctl_set_mdwe(arg2, arg3, arg4, arg5); break; case PR_GET_MDWE: error = prctl_get_mdwe(arg2, arg3, arg4, arg5); break; case PR_PPC_GET_DEXCR: if (arg3 || arg4 || arg5) return -EINVAL; error = PPC_GET_DEXCR_ASPECT(me, arg2); break; case PR_PPC_SET_DEXCR: if (arg4 || arg5) return -EINVAL; error = PPC_SET_DEXCR_ASPECT(me, arg2, arg3); break; case PR_SET_VMA: error = prctl_set_vma(arg2, arg3, arg4, arg5); break; case PR_GET_AUXV: if (arg4 || arg5) return -EINVAL; error = prctl_get_auxv((void __user *)arg2, arg3); break; #ifdef CONFIG_KSM case PR_SET_MEMORY_MERGE: if (arg3 || arg4 || arg5) return -EINVAL; if (mmap_write_lock_killable(me->mm)) return -EINTR; if (arg2) error = ksm_enable_merge_any(me->mm); else error = ksm_disable_merge_any(me->mm); mmap_write_unlock(me->mm); break; case PR_GET_MEMORY_MERGE: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; error = !!mm_flags_test(MMF_VM_MERGE_ANY, me->mm); break; #endif case PR_RISCV_V_SET_CONTROL: error = RISCV_V_SET_CONTROL(arg2); break; case PR_RISCV_V_GET_CONTROL: error = RISCV_V_GET_CONTROL(); break; case PR_RISCV_SET_ICACHE_FLUSH_CTX: error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3); break; case PR_GET_SHADOW_STACK_STATUS: if (arg3 || arg4 || arg5) return -EINVAL; error = arch_get_shadow_stack_status(me, (unsigned long __user *) arg2); break; case PR_SET_SHADOW_STACK_STATUS: if (arg3 || arg4 || arg5) return -EINVAL; error = arch_set_shadow_stack_status(me, arg2); break; case PR_LOCK_SHADOW_STACK_STATUS: if (arg3 || arg4 || arg5) return -EINVAL; error = arch_lock_shadow_stack_status(me, arg2); break; case PR_TIMER_CREATE_RESTORE_IDS: if (arg3 || arg4 || arg5) return -EINVAL; error = posixtimer_create_prctl(arg2); break; case PR_FUTEX_HASH: error = futex_hash_prctl(arg2, arg3, arg4); break; case PR_RSEQ_SLICE_EXTENSION: if (arg4 || arg5) return -EINVAL; error = rseq_slice_extension_prctl(arg2, arg3); break; case PR_GET_CFI: if (arg2 != PR_CFI_BRANCH_LANDING_PADS) return -EINVAL; if (arg4 || arg5) return -EINVAL; error = arch_prctl_get_branch_landing_pad_state(me, (unsigned long __user *)arg3); break; case PR_SET_CFI: if (arg2 != PR_CFI_BRANCH_LANDING_PADS) return -EINVAL; if (arg4 || arg5) return -EINVAL; error = arch_prctl_set_branch_landing_pad_state(me, arg3); if (error) break; if (arg3 & PR_CFI_LOCK && !(arg3 & PR_CFI_DISABLE)) error = arch_prctl_lock_branch_landing_pad_state(me); break; default: trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5); error = -EINVAL; break; } return error; } SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, void __user *, unused) { int err = 0; int cpu = raw_smp_processor_id(); if (cpup) err |= put_user(cpu, cpup); if (nodep) err |= put_user(cpu_to_node(cpu), nodep); return err ? -EFAULT : 0; } /** * do_sysinfo - fill in sysinfo struct * @info: pointer to buffer to fill */ static int do_sysinfo(struct sysinfo *info) { unsigned long mem_total, sav_total; unsigned int mem_unit, bitcount; struct timespec64 tp; memset(info, 0, sizeof(struct sysinfo)); ktime_get_boottime_ts64(&tp); timens_add_boottime(&tp); info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); info->procs = nr_threads; si_meminfo(info); si_swapinfo(info); /* * If the sum of all the available memory (i.e. ram + swap) * is less than can be stored in a 32 bit unsigned long then * we can be binary compatible with 2.2.x kernels. If not, * well, in that case 2.2.x was broken anyways... 
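 *
 * (Userspace sketch: because of the scaling below, callers should always
 * multiply the reported values by mem_unit rather than assume bytes:
 *
 *	struct sysinfo si;
 *	if (sysinfo(&si) == 0) {
 *		unsigned long long total_bytes =
 *			(unsigned long long)si.totalram * si.mem_unit;
 *	}
 *
 * after the scaling, mem_unit is 1 whenever ram + swap, in bytes, fits in
 * an unsigned long.)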
* * -Erik Andersen <andersee@debian.org> */ mem_total = info->totalram + info->totalswap; if (mem_total < info->totalram || mem_total < info->totalswap) goto out; bitcount = 0; mem_unit = info->mem_unit; while (mem_unit > 1) { bitcount++; mem_unit >>= 1; sav_total = mem_total; mem_total <<= 1; if (mem_total < sav_total) goto out; } /* * If mem_total did not overflow, multiply all memory values by * info->mem_unit and set it to 1. This leaves things compatible * with 2.2.x, and also retains compatibility with earlier 2.4.x * kernels... */ info->mem_unit = 1; info->totalram <<= bitcount; info->freeram <<= bitcount; info->sharedram <<= bitcount; info->bufferram <<= bitcount; info->totalswap <<= bitcount; info->freeswap <<= bitcount; info->totalhigh <<= bitcount; info->freehigh <<= bitcount; out: return 0; } SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info) { struct sysinfo val; do_sysinfo(&val); if (copy_to_user(info, &val, sizeof(struct sysinfo))) return -EFAULT; return 0; } #ifdef CONFIG_COMPAT struct compat_sysinfo { s32 uptime; u32 loads[3]; u32 totalram; u32 freeram; u32 sharedram; u32 bufferram; u32 totalswap; u32 freeswap; u16 procs; u16 pad; u32 totalhigh; u32 freehigh; u32 mem_unit; char _f[20-2*sizeof(u32)-sizeof(int)]; }; COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) { struct sysinfo s; struct compat_sysinfo s_32; do_sysinfo(&s); /* Check to see if any memory value is too large for 32-bit and scale * down if needed */ if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) { int bitcount = 0; while (s.mem_unit < PAGE_SIZE) { s.mem_unit <<= 1; bitcount++; } s.totalram >>= bitcount; s.freeram >>= bitcount; s.sharedram >>= bitcount; s.bufferram >>= bitcount; s.totalswap >>= bitcount; s.freeswap >>= bitcount; s.totalhigh >>= bitcount; s.freehigh >>= bitcount; } memset(&s_32, 0, sizeof(s_32)); s_32.uptime = s.uptime; s_32.loads[0] = s.loads[0]; s_32.loads[1] = s.loads[1]; s_32.loads[2] = s.loads[2]; s_32.totalram = s.totalram; s_32.freeram = s.freeram; s_32.sharedram = s.sharedram; s_32.bufferram = s.bufferram; s_32.totalswap = s.totalswap; s_32.freeswap = s.freeswap; s_32.procs = s.procs; s_32.totalhigh = s.totalhigh; s_32.freehigh = s.freehigh; s_32.mem_unit = s.mem_unit; if (copy_to_user(info, &s_32, sizeof(s_32))) return -EFAULT; return 0; } #endif /* CONFIG_COMPAT */ |
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMU_NOTIFIER_H
#define _LINUX_MMU_NOTIFIER_H

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/mm_types.h>
#include <linux/mmap_lock.h>
#include <linux/srcu.h>
#include <linux/interval_tree.h>

struct mmu_notifier_subscriptions;
struct mmu_notifier;
struct mmu_notifier_range;
struct mmu_interval_notifier;

/**
 * enum mmu_notifier_event - reason for the mmu notifier callback
 * @MMU_NOTIFY_UNMAP: either munmap() that unmaps the range or a mremap() that
 * moves the range
 *
 * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like
 * madvise() or replacing a page by another one, ...).
* * @MMU_NOTIFY_PROTECTION_VMA: update is due to protection change for the range * ie using the vma access permission (vm_page_prot) to update the whole range * is enough no need to inspect changes to the CPU page table (mprotect() * syscall) * * @MMU_NOTIFY_PROTECTION_PAGE: update is due to change in read/write flag for * pages in the range so to mirror those changes the user must inspect the CPU * page table (from the end callback). * * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same * access flags). User should soft dirty the page in the end callback to make * sure that anyone relying on soft dirtiness catch pages that might be written * through non CPU mappings. * * @MMU_NOTIFY_RELEASE: used during mmu_interval_notifier invalidate to signal * that the mm refcount is zero and the range is no longer accessible. * * @MMU_NOTIFY_MIGRATE: used during migrate_vma_collect() invalidate to signal * a device driver to possibly ignore the invalidation if the * owner field matches the driver's device private pgmap owner. * * @MMU_NOTIFY_EXCLUSIVE: conversion of a page table entry to device-exclusive. * The owner is initialized to the value provided by the caller of * make_device_exclusive(), such that this caller can filter out these * events. */ enum mmu_notifier_event { MMU_NOTIFY_UNMAP = 0, MMU_NOTIFY_CLEAR, MMU_NOTIFY_PROTECTION_VMA, MMU_NOTIFY_PROTECTION_PAGE, MMU_NOTIFY_SOFT_DIRTY, MMU_NOTIFY_RELEASE, MMU_NOTIFY_MIGRATE, MMU_NOTIFY_EXCLUSIVE, }; #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) struct mmu_notifier_ops { /* * Called either by mmu_notifier_unregister or when the mm is * being destroyed by exit_mmap, always before all pages are * freed. This can run concurrently with other mmu notifier * methods (the ones invoked outside the mm context) and it * should tear down all secondary mmu mappings and freeze the * secondary mmu. If this method isn't implemented you've to * be sure that nothing could possibly write to the pages * through the secondary mmu by the time the last thread with * tsk->mm == mm exits. * * As side note: the pages freed after ->release returns could * be immediately reallocated by the gart at an alias physical * address with a different cache model, so if ->release isn't * implemented because all _software_ driven memory accesses * through the secondary mmu are terminated by the time the * last thread of this mm quits, you've also to be sure that * speculative _hardware_ operations can't allocate dirty * cachelines in the cpu that could not be snooped and made * coherent with the other read and write operations happening * through the gart alias address, so leading to memory * corruption. */ void (*release)(struct mmu_notifier *subscription, struct mm_struct *mm); /* * clear_flush_young is called after the VM is * test-and-clearing the young/accessed bitflag in the * pte. This way the VM will provide proper aging to the * accesses to the page through the secondary MMUs and not * only to the ones through the Linux pte. * Start-end is necessary in case the secondary MMU is mapping the page * at a smaller granularity than the primary MMU. */ bool (*clear_flush_young)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long start, unsigned long end); /* * clear_young is a lightweight version of clear_flush_young. Like the * latter, it is supposed to test-and-clear the young/accessed bitflag * in the secondary pte, but it may omit flushing the secondary tlb. 
*/ bool (*clear_young)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long start, unsigned long end); /* * test_young is called to check the young/accessed bitflag in * the secondary pte. This is used to know if the page is * frequently used without actually clearing the flag or tearing * down the secondary mapping on the page. */ bool (*test_young)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long address); /* * invalidate_range_start() and invalidate_range_end() must be * paired and are called only when the mmap_lock and/or the * locks protecting the reverse maps are held. If the subsystem * can't guarantee that no additional references are taken to * the pages in the range, it has to implement the * invalidate_range() notifier to remove any references taken * after invalidate_range_start(). * * Invalidation of multiple concurrent ranges may be * optionally permitted by the driver. Either way the * establishment of sptes is forbidden in the range passed to * invalidate_range_begin/end for the whole duration of the * invalidate_range_begin/end critical section. * * invalidate_range_start() is called when all pages in the * range are still mapped and have at least a refcount of one. * * invalidate_range_end() is called when all pages in the * range have been unmapped and the pages have been freed by * the VM. * * The VM will remove the page table entries and potentially * the page between invalidate_range_start() and * invalidate_range_end(). If the page must not be freed * because of pending I/O or other circumstances then the * invalidate_range_start() callback (or the initial mapping * by the driver) must make sure that the refcount is kept * elevated. * * If the driver increases the refcount when the pages are * initially mapped into an address space then either * invalidate_range_start() or invalidate_range_end() may * decrease the refcount. If the refcount is decreased on * invalidate_range_start() then the VM can free pages as page * table entries are removed. If the refcount is only * dropped on invalidate_range_end() then the driver itself * will drop the last refcount but it must take care to flush * any secondary tlb before doing the final free on the * page. Pages will no longer be referenced by the linux * address space but may still be referenced by sptes until * the last refcount is dropped. * * If blockable argument is set to false then the callback cannot * sleep and has to return with -EAGAIN if sleeping would be required. * 0 should be returned otherwise. Please note that notifiers that can * fail invalidate_range_start are not allowed to implement * invalidate_range_end, as there is no mechanism for informing the * notifier that its start failed. */ int (*invalidate_range_start)(struct mmu_notifier *subscription, const struct mmu_notifier_range *range); void (*invalidate_range_end)(struct mmu_notifier *subscription, const struct mmu_notifier_range *range); /* * arch_invalidate_secondary_tlbs() is used to manage a non-CPU TLB * which shares page-tables with the CPU. The * invalidate_range_start()/end() callbacks should not be implemented as * invalidate_secondary_tlbs() already catches the points in time when * an external TLB needs to be flushed. * * This requires arch_invalidate_secondary_tlbs() to be called while * holding the ptl spin-lock and therefore this callback is not allowed * to sleep. * * This is called by architecture code whenever invalidating a TLB * entry. 
It is assumed that any secondary TLB has the same rules for * when invalidations are required. If this is not the case architecture * code will need to call this explicitly when required for secondary * TLB invalidation. */ void (*arch_invalidate_secondary_tlbs)( struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long start, unsigned long end); /* * These callbacks are used with the get/put interface to manage the * lifetime of the mmu_notifier memory. alloc_notifier() returns a new * notifier for use with the mm. * * free_notifier() is only called after the mmu_notifier has been * fully put, calls to any ops callback are prevented and no ops * callbacks are currently running. It is called from a SRCU callback * and cannot sleep. */ struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm); void (*free_notifier)(struct mmu_notifier *subscription); }; /* * The notifier chains are protected by mmap_lock and/or the reverse map * semaphores. Notifier chains are only changed when all reverse maps and * the mmap_lock locks are taken. * * Therefore notifier chains can only be traversed when either * * 1. mmap_lock is held. * 2. One of the reverse map locks is held (i_mmap_rwsem or anon_vma->rwsem). * 3. No other concurrent thread can access the list (release) */ struct mmu_notifier { struct hlist_node hlist; const struct mmu_notifier_ops *ops; struct mm_struct *mm; struct rcu_head rcu; unsigned int users; }; /** * struct mmu_interval_notifier_finish - mmu_interval_notifier two-pass abstraction * @link: Lockless list link for the notifiers pending pass list * @notifier: The mmu_interval_notifier for which the finish pass is called. * * Allocate, typically using GFP_NOWAIT in the interval notifier's start pass. * Note that with a large number of notifiers implementing two passes, * allocation with GFP_NOWAIT will become increasingly likely to fail, so consider * implementing a small pool instead of using kmalloc() allocations. * * If the implementation needs to pass data between the start and the finish passes, * the recommended way is to embed struct mmu_interval_notifier_finish into a larger * structure that also contains the data needed to be shared. Keep in mind that * a notifier callback can be invoked in parallel, and each invocation needs its * own struct mmu_interval_notifier_finish. * * If allocation fails, then the &mmu_interval_notifier_ops->invalidate_start op * needs to implements the full notifier functionality. Please refer to its * documentation. */ struct mmu_interval_notifier_finish { struct llist_node link; struct mmu_interval_notifier *notifier; }; /** * struct mmu_interval_notifier_ops - callback for range notification * @invalidate: Upon return the caller must stop using any SPTEs within this * range. This function can sleep. Return false only if sleeping * was required but mmu_notifier_range_blockable(range) is false. * @invalidate_start: Similar to @invalidate, but intended for two-pass notifier * callbacks where the call to @invalidate_start is the first * pass and any struct mmu_interval_notifier_finish pointer * returned in the @finish parameter describes the finish pass. * If *@finish is %NULL on return, then no final pass will be * called, and @invalidate_start needs to implement the full * notifier, behaving like @invalidate. The value of *@finish * is guaranteed to be %NULL at function entry. * @invalidate_finish: Called as the second pass for any notifier that returned * a non-NULL *@finish from @invalidate_start. 
The @finish * pointer passed here is the same one returned by * @invalidate_start. */ struct mmu_interval_notifier_ops { bool (*invalidate)(struct mmu_interval_notifier *interval_sub, const struct mmu_notifier_range *range, unsigned long cur_seq); bool (*invalidate_start)(struct mmu_interval_notifier *interval_sub, const struct mmu_notifier_range *range, unsigned long cur_seq, struct mmu_interval_notifier_finish **finish); void (*invalidate_finish)(struct mmu_interval_notifier_finish *finish); }; struct mmu_interval_notifier { struct interval_tree_node interval_tree; const struct mmu_interval_notifier_ops *ops; struct mm_struct *mm; struct hlist_node deferred_item; unsigned long invalidate_seq; }; #ifdef CONFIG_MMU_NOTIFIER #ifdef CONFIG_LOCKDEP extern struct lockdep_map __mmu_notifier_invalidate_range_start_map; #endif struct mmu_notifier_range { struct mm_struct *mm; unsigned long start; unsigned long end; unsigned flags; enum mmu_notifier_event event; void *owner; }; static inline int mm_has_notifiers(struct mm_struct *mm) { return unlikely(mm->notifier_subscriptions); } struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops, struct mm_struct *mm); static inline struct mmu_notifier * mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm) { struct mmu_notifier *ret; mmap_write_lock(mm); ret = mmu_notifier_get_locked(ops, mm); mmap_write_unlock(mm); return ret; } void mmu_notifier_put(struct mmu_notifier *subscription); void mmu_notifier_synchronize(void); extern int mmu_notifier_register(struct mmu_notifier *subscription, struct mm_struct *mm); extern int __mmu_notifier_register(struct mmu_notifier *subscription, struct mm_struct *mm); extern void mmu_notifier_unregister(struct mmu_notifier *subscription, struct mm_struct *mm); unsigned long mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub); int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub, struct mm_struct *mm, unsigned long start, unsigned long length, const struct mmu_interval_notifier_ops *ops); int mmu_interval_notifier_insert_locked( struct mmu_interval_notifier *interval_sub, struct mm_struct *mm, unsigned long start, unsigned long length, const struct mmu_interval_notifier_ops *ops); void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub); /** * mmu_interval_set_seq - Save the invalidation sequence * @interval_sub: The subscription passed to invalidate * @cur_seq: The cur_seq passed to the invalidate() callback * * This must be called unconditionally from the invalidate callback of a * struct mmu_interval_notifier_ops under the same lock that is used to call * mmu_interval_read_retry(). It updates the sequence number for later use by * mmu_interval_read_retry(). The provided cur_seq will always be odd. * * If the caller does not call mmu_interval_read_begin() or * mmu_interval_read_retry() then this call is not required. */ static inline void mmu_interval_set_seq(struct mmu_interval_notifier *interval_sub, unsigned long cur_seq) { WRITE_ONCE(interval_sub->invalidate_seq, cur_seq); } /** * mmu_interval_read_retry - End a read side critical section against a VA range * @interval_sub: The subscription * @seq: The return of the paired mmu_interval_read_begin() * * This MUST be called under a user provided lock that is also held * unconditionally by op->invalidate() when it calls mmu_interval_set_seq(). * * Each call should be paired with a single mmu_interval_read_begin() and * should be used to conclude the read side. 
* * Returns: true if an invalidation collided with this critical section, and * the caller should retry. */ static inline bool mmu_interval_read_retry(struct mmu_interval_notifier *interval_sub, unsigned long seq) { return interval_sub->invalidate_seq != seq; } /** * mmu_interval_check_retry - Test if a collision has occurred * @interval_sub: The subscription * @seq: The return of the matching mmu_interval_read_begin() * * This can be used in the critical section between mmu_interval_read_begin() * and mmu_interval_read_retry(). * * This call can be used as part of loops and other expensive operations to * expedite a retry. * It can be called many times and does not have to hold the user * provided lock. * * Returns: true indicates an invalidation has collided with this critical * region and a future mmu_interval_read_retry() will return true. * False is not reliable and only suggests a collision may not have * occurred. */ static inline bool mmu_interval_check_retry(struct mmu_interval_notifier *interval_sub, unsigned long seq) { /* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */ return READ_ONCE(interval_sub->invalidate_seq) != seq; } extern void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm); extern void __mmu_notifier_release(struct mm_struct *mm); bool __mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long start, unsigned long end); bool __mmu_notifier_clear_young(struct mm_struct *mm, unsigned long start, unsigned long end); bool __mmu_notifier_test_young(struct mm_struct *mm, unsigned long address); extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r); extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r); extern void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, unsigned long start, unsigned long end); extern bool mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range); static inline bool mmu_notifier_range_blockable(const struct mmu_notifier_range *range) { return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE); } static inline void mmu_notifier_release(struct mm_struct *mm) { if (mm_has_notifiers(mm)) __mmu_notifier_release(mm); } static inline bool mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) return __mmu_notifier_clear_flush_young(mm, start, end); return false; } static inline bool mmu_notifier_clear_young(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) return __mmu_notifier_clear_young(mm, start, end); return false; } static inline bool mmu_notifier_test_young(struct mm_struct *mm, unsigned long address) { if (mm_has_notifiers(mm)) return __mmu_notifier_test_young(mm, address); return false; } static inline void mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { might_sleep(); lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); if (mm_has_notifiers(range->mm)) { range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE; __mmu_notifier_invalidate_range_start(range); } lock_map_release(&__mmu_notifier_invalidate_range_start_map); } /* * This version of mmu_notifier_invalidate_range_start() avoids blocking, but it * can return an error if a notifier can't proceed without blocking, in which * case you're not allowed to modify PTEs in the specified range. * * This is mainly intended for OOM handling. 
*/ static inline int __must_check mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) { int ret = 0; lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); if (mm_has_notifiers(range->mm)) { range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE; ret = __mmu_notifier_invalidate_range_start(range); } lock_map_release(&__mmu_notifier_invalidate_range_start_map); return ret; } static inline void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) { if (mmu_notifier_range_blockable(range)) might_sleep(); if (mm_has_notifiers(range->mm)) __mmu_notifier_invalidate_range_end(range); } static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) __mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm) { mm->notifier_subscriptions = NULL; } static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) { if (mm_has_notifiers(mm)) __mmu_notifier_subscriptions_destroy(mm); } static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, enum mmu_notifier_event event, unsigned flags, struct mm_struct *mm, unsigned long start, unsigned long end) { range->event = event; range->mm = mm; range->start = start; range->end = end; range->flags = flags; } static inline void mmu_notifier_range_init_owner( struct mmu_notifier_range *range, enum mmu_notifier_event event, unsigned int flags, struct mm_struct *mm, unsigned long start, unsigned long end, void *owner) { mmu_notifier_range_init(range, event, flags, mm, start, end); range->owner = owner; } #else /* CONFIG_MMU_NOTIFIER */ struct mmu_notifier_range { unsigned long start; unsigned long end; }; static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range, unsigned long start, unsigned long end) { range->start = start; range->end = end; } #define mmu_notifier_range_init(range,event,flags,mm,start,end) \ _mmu_notifier_range_init(range, start, end) #define mmu_notifier_range_init_owner(range, event, flags, mm, start, \ end, owner) \ _mmu_notifier_range_init(range, start, end) static inline bool mmu_notifier_range_blockable(const struct mmu_notifier_range *range) { return true; } static inline int mm_has_notifiers(struct mm_struct *mm) { return 0; } static inline void mmu_notifier_release(struct mm_struct *mm) { } static inline bool mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long start, unsigned long end) { return false; } static inline bool mmu_notifier_clear_young(struct mm_struct *mm, unsigned long start, unsigned long end) { return false; } static inline bool mmu_notifier_test_young(struct mm_struct *mm, unsigned long address) { return false; } static inline void mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { } static inline int mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) { return 0; } static inline void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) { } static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, unsigned long start, unsigned long end) { } static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm) { } static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) { } #define mmu_notifier_range_update_to_read_only(r) false static inline void mmu_notifier_synchronize(void) { } #endif /* CONFIG_MMU_NOTIFIER */ #endif /* _LINUX_MMU_NOTIFIER_H 
*/
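The mmu_interval_* helpers declared above are designed to be used from a driver-side retry loop: begin a read section, (re)build the device mappings while possibly sleeping, then take the driver lock and check for a collision before committing. The sketch below illustrates only that pattern; struct my_range and the my_range_zap()/my_range_populate()/my_range_program() hooks are hypothetical stand-ins, not kernel APIs, and the single mutex plays the role of the "user provided lock" the kernel-doc refers to.

#include <linux/mmu_notifier.h>
#include <linux/mutex.h>

struct my_range {
	struct mmu_interval_notifier notifier;
	struct mutex lock;	/* the "user provided lock" from the kernel-doc */
};

/* Hypothetical driver hooks, declared only to keep the sketch self-contained. */
static void my_range_zap(struct my_range *mr, unsigned long start, unsigned long end);
static int my_range_populate(struct my_range *mr);
static void my_range_program(struct my_range *mr);

static bool my_range_invalidate(struct mmu_interval_notifier *sub,
				const struct mmu_notifier_range *range,
				unsigned long cur_seq)
{
	struct my_range *mr = container_of(sub, struct my_range, notifier);

	if (!mmu_notifier_range_blockable(range))
		return false;
	mutex_lock(&mr->lock);
	mmu_interval_set_seq(sub, cur_seq);	/* unconditional, as required */
	my_range_zap(mr, range->start, range->end);
	mutex_unlock(&mr->lock);
	return true;
}

static const struct mmu_interval_notifier_ops my_range_ops = {
	.invalidate = my_range_invalidate,
};

/* Fault/mapping path: retry until no invalidation raced with this section. */
static int my_range_fault(struct my_range *mr)
{
	unsigned long seq;
	int ret;

	for (;;) {
		seq = mmu_interval_read_begin(&mr->notifier);
		ret = my_range_populate(mr);	/* may sleep, e.g. faulting pages */
		if (ret)
			return ret;
		mutex_lock(&mr->lock);
		if (mmu_interval_read_retry(&mr->notifier, seq)) {
			mutex_unlock(&mr->lock);
			continue;		/* collided with an invalidation */
		}
		my_range_program(mr);		/* install the device mappings */
		mutex_unlock(&mr->lock);
		return 0;
	}
}

The notifier itself would be registered beforehand with mmu_interval_notifier_insert(&mr->notifier, mm, start, length, &my_range_ops) and torn down with mmu_interval_notifier_remove().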
// SPDX-License-Identifier: GPL-2.0+ /* * NILFS recovery logic * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * * Written by Ryusuke Konishi. */ #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/swap.h> #include <linux/slab.h> #include <linux/crc32.h> #include "nilfs.h" #include "segment.h" #include "sufile.h" #include "page.h" #include "segbuf.h" /* * Segment check result */ enum { NILFS_SEG_VALID, NILFS_SEG_NO_SUPER_ROOT, NILFS_SEG_FAIL_IO, NILFS_SEG_FAIL_MAGIC, NILFS_SEG_FAIL_SEQ, NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT, NILFS_SEG_FAIL_CHECKSUM_FULL, NILFS_SEG_FAIL_CONSISTENCY, }; /* work structure for recovery */ struct nilfs_recovery_block { ino_t ino; /* * Inode number of the file that this block * belongs to */ sector_t blocknr; /* block number */ __u64 vblocknr; /* virtual block number */ unsigned long blkoff; /* File offset of the data block (per block) */ struct list_head list; }; static int nilfs_warn_segment_error(struct super_block *sb, int err) { const char *msg = NULL; switch (err) { case NILFS_SEG_FAIL_IO: nilfs_err(sb, "I/O error reading segment"); return -EIO; case NILFS_SEG_FAIL_MAGIC: msg = "Magic number mismatch"; break; case NILFS_SEG_FAIL_SEQ: msg = "Sequence number mismatch"; break; case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT: msg = "Checksum error in super root"; break; case NILFS_SEG_FAIL_CHECKSUM_FULL: msg = "Checksum error in segment payload"; break; case NILFS_SEG_FAIL_CONSISTENCY: msg = "Inconsistency found"; break; case NILFS_SEG_NO_SUPER_ROOT: msg = "No super root in the last segment"; break; default: nilfs_err(sb, "unrecognized segment error %d", err); return -EINVAL; } nilfs_warn(sb, "invalid segment: %s", msg); return -EINVAL; } /** * nilfs_compute_checksum - compute checksum of blocks continuously * @nilfs: nilfs object * @bhs: buffer head of start block * @sum: place to store result * @offset: offset bytes in the first block * @check_bytes: number of bytes to be checked * @start: DBN of start block * @nblock: number of blocks to be checked * * Return: 0 on success, or %-EIO if an I/O error occurs.
*/ static int nilfs_compute_checksum(struct the_nilfs *nilfs, struct buffer_head *bhs, u32 *sum, unsigned long offset, u64 check_bytes, sector_t start, unsigned long nblock) { unsigned int blocksize = nilfs->ns_blocksize; unsigned long size; u32 crc; BUG_ON(offset >= blocksize); check_bytes -= offset; size = min_t(u64, check_bytes, blocksize - offset); crc = crc32_le(nilfs->ns_crc_seed, (unsigned char *)bhs->b_data + offset, size); if (--nblock > 0) { do { struct buffer_head *bh; bh = __bread(nilfs->ns_bdev, ++start, blocksize); if (!bh) return -EIO; check_bytes -= size; size = min_t(u64, check_bytes, blocksize); crc = crc32_le(crc, bh->b_data, size); brelse(bh); } while (--nblock > 0); } *sum = crc; return 0; } /** * nilfs_read_super_root_block - read super root block * @nilfs: nilfs object * @sr_block: disk block number of the super root block * @pbh: address of a buffer_head pointer to return super root buffer * @check: CRC check flag * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - Super root block corrupted. * * %-EIO - I/O error. */ int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block, struct buffer_head **pbh, int check) { struct buffer_head *bh_sr; struct nilfs_super_root *sr; u32 crc; int ret; *pbh = NULL; bh_sr = __bread(nilfs->ns_bdev, sr_block, nilfs->ns_blocksize); if (unlikely(!bh_sr)) { ret = NILFS_SEG_FAIL_IO; goto failed; } sr = (struct nilfs_super_root *)bh_sr->b_data; if (check) { unsigned int bytes = le16_to_cpu(sr->sr_bytes); if (bytes == 0 || bytes > nilfs->ns_blocksize) { ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; goto failed_bh; } if (nilfs_compute_checksum( nilfs, bh_sr, &crc, sizeof(sr->sr_sum), bytes, sr_block, 1)) { ret = NILFS_SEG_FAIL_IO; goto failed_bh; } if (crc != le32_to_cpu(sr->sr_sum)) { ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; goto failed_bh; } } *pbh = bh_sr; return 0; failed_bh: brelse(bh_sr); failed: return nilfs_warn_segment_error(nilfs->ns_sb, ret); } /** * nilfs_read_log_header - read summary header of the specified log * @nilfs: nilfs object * @start_blocknr: start block number of the log * @sum: pointer to return segment summary structure * * Return: Buffer head pointer, or NULL if an I/O error occurs. */ static struct buffer_head * nilfs_read_log_header(struct the_nilfs *nilfs, sector_t start_blocknr, struct nilfs_segment_summary **sum) { struct buffer_head *bh_sum; bh_sum = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize); if (bh_sum) *sum = (struct nilfs_segment_summary *)bh_sum->b_data; return bh_sum; } /** * nilfs_validate_log - verify consistency of log * @nilfs: nilfs object * @seg_seq: sequence number of segment * @bh_sum: buffer head of summary block * @sum: segment summary struct * * Return: 0 on success, or one of the following internal codes on failure: * * %NILFS_SEG_FAIL_MAGIC - Magic number mismatch. * * %NILFS_SEG_FAIL_SEQ - Sequence number mismatch. * * %NIFLS_SEG_FAIL_CONSISTENCY - Block count out of range. * * %NILFS_SEG_FAIL_IO - I/O error. * * %NILFS_SEG_FAIL_CHECKSUM_FULL - Full log checksum verification failed. 
*/ static int nilfs_validate_log(struct the_nilfs *nilfs, u64 seg_seq, struct buffer_head *bh_sum, struct nilfs_segment_summary *sum) { unsigned long nblock; u32 crc; int ret; ret = NILFS_SEG_FAIL_MAGIC; if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) goto out; ret = NILFS_SEG_FAIL_SEQ; if (le64_to_cpu(sum->ss_seq) != seg_seq) goto out; nblock = le32_to_cpu(sum->ss_nblocks); ret = NILFS_SEG_FAIL_CONSISTENCY; if (unlikely(nblock == 0 || nblock > nilfs->ns_blocks_per_segment)) /* This limits the number of blocks read in the CRC check */ goto out; ret = NILFS_SEG_FAIL_IO; if (nilfs_compute_checksum(nilfs, bh_sum, &crc, sizeof(sum->ss_datasum), ((u64)nblock << nilfs->ns_blocksize_bits), bh_sum->b_blocknr, nblock)) goto out; ret = NILFS_SEG_FAIL_CHECKSUM_FULL; if (crc != le32_to_cpu(sum->ss_datasum)) goto out; ret = 0; out: return ret; } /** * nilfs_read_summary_info - read an item on summary blocks of a log * @nilfs: nilfs object * @pbh: the current buffer head on summary blocks [in, out] * @offset: the current byte offset on summary blocks [in, out] * @bytes: byte size of the item to be read * * Return: Kernel space address of current segment summary entry, or * NULL if an I/O error occurs. */ static void *nilfs_read_summary_info(struct the_nilfs *nilfs, struct buffer_head **pbh, unsigned int *offset, unsigned int bytes) { void *ptr; sector_t blocknr; BUG_ON((*pbh)->b_size < *offset); if (bytes > (*pbh)->b_size - *offset) { blocknr = (*pbh)->b_blocknr; brelse(*pbh); *pbh = __bread(nilfs->ns_bdev, blocknr + 1, nilfs->ns_blocksize); if (unlikely(!*pbh)) return NULL; *offset = 0; } ptr = (*pbh)->b_data + *offset; *offset += bytes; return ptr; } /** * nilfs_skip_summary_info - skip items on summary blocks of a log * @nilfs: nilfs object * @pbh: the current buffer head on summary blocks [in, out] * @offset: the current byte offset on summary blocks [in, out] * @bytes: byte size of the item to be skipped * @count: number of items to be skipped */ static void nilfs_skip_summary_info(struct the_nilfs *nilfs, struct buffer_head **pbh, unsigned int *offset, unsigned int bytes, unsigned long count) { unsigned int rest_item_in_current_block = ((*pbh)->b_size - *offset) / bytes; if (count <= rest_item_in_current_block) { *offset += bytes * count; } else { sector_t blocknr = (*pbh)->b_blocknr; unsigned int nitem_per_block = (*pbh)->b_size / bytes; unsigned int bcnt; count -= rest_item_in_current_block; bcnt = DIV_ROUND_UP(count, nitem_per_block); *offset = bytes * (count - (bcnt - 1) * nitem_per_block); brelse(*pbh); *pbh = __bread(nilfs->ns_bdev, blocknr + bcnt, nilfs->ns_blocksize); } } /** * nilfs_scan_dsync_log - get block information of a log written for data sync * @nilfs: nilfs object * @start_blocknr: start block number of the log * @sum: log summary information * @head: list head to add nilfs_recovery_block struct * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error. * * %-ENOMEM - Insufficient memory available. 
*/ static int nilfs_scan_dsync_log(struct the_nilfs *nilfs, sector_t start_blocknr, struct nilfs_segment_summary *sum, struct list_head *head) { struct buffer_head *bh; unsigned int offset; u32 nfinfo, sumbytes; sector_t blocknr; ino_t ino; int err = -EIO; nfinfo = le32_to_cpu(sum->ss_nfinfo); if (!nfinfo) return 0; sumbytes = le32_to_cpu(sum->ss_sumbytes); blocknr = start_blocknr + DIV_ROUND_UP(sumbytes, nilfs->ns_blocksize); bh = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize); if (unlikely(!bh)) goto out; offset = le16_to_cpu(sum->ss_bytes); for (;;) { unsigned long nblocks, ndatablk, nnodeblk; struct nilfs_finfo *finfo; finfo = nilfs_read_summary_info(nilfs, &bh, &offset, sizeof(*finfo)); if (unlikely(!finfo)) goto out; ino = le64_to_cpu(finfo->fi_ino); nblocks = le32_to_cpu(finfo->fi_nblocks); ndatablk = le32_to_cpu(finfo->fi_ndatablk); nnodeblk = nblocks - ndatablk; while (ndatablk-- > 0) { struct nilfs_recovery_block *rb; struct nilfs_binfo_v *binfo; binfo = nilfs_read_summary_info(nilfs, &bh, &offset, sizeof(*binfo)); if (unlikely(!binfo)) goto out; rb = kmalloc_obj(*rb, GFP_NOFS); if (unlikely(!rb)) { err = -ENOMEM; goto out; } rb->ino = ino; rb->blocknr = blocknr++; rb->vblocknr = le64_to_cpu(binfo->bi_vblocknr); rb->blkoff = le64_to_cpu(binfo->bi_blkoff); /* INIT_LIST_HEAD(&rb->list); */ list_add_tail(&rb->list, head); } if (--nfinfo == 0) break; blocknr += nnodeblk; /* always 0 for data sync logs */ nilfs_skip_summary_info(nilfs, &bh, &offset, sizeof(__le64), nnodeblk); if (unlikely(!bh)) goto out; } err = 0; out: brelse(bh); /* brelse(NULL) is just ignored */ return err; } static void dispose_recovery_list(struct list_head *head) { while (!list_empty(head)) { struct nilfs_recovery_block *rb; rb = list_first_entry(head, struct nilfs_recovery_block, list); list_del(&rb->list); kfree(rb); } } struct nilfs_segment_entry { struct list_head list; __u64 segnum; }; static int nilfs_segment_list_add(struct list_head *head, __u64 segnum) { struct nilfs_segment_entry *ent = kmalloc_obj(*ent, GFP_NOFS); if (unlikely(!ent)) return -ENOMEM; ent->segnum = segnum; INIT_LIST_HEAD(&ent->list); list_add_tail(&ent->list, head); return 0; } void nilfs_dispose_segment_list(struct list_head *head) { while (!list_empty(head)) { struct nilfs_segment_entry *ent; ent = list_first_entry(head, struct nilfs_segment_entry, list); list_del(&ent->list); kfree(ent); } } static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, struct super_block *sb, struct nilfs_recovery_info *ri) { struct list_head *head = &ri->ri_used_segments; struct nilfs_segment_entry *ent, *n; struct inode *sufile = nilfs->ns_sufile; __u64 segnum[4]; int err; int i; segnum[0] = nilfs->ns_segnum; segnum[1] = nilfs->ns_nextnum; segnum[2] = ri->ri_segnum; segnum[3] = ri->ri_nextnum; /* * Releasing the next segment of the latest super root. * The next segment is invalidated by this recovery. */ err = nilfs_sufile_free(sufile, segnum[1]); if (unlikely(err)) { if (err == -ENOENT) { nilfs_err(sb, "checkpoint log inconsistency at block %llu (segment %llu): next segment %llu is unallocated", (unsigned long long)nilfs->ns_last_pseg, (unsigned long long)nilfs->ns_segnum, (unsigned long long)segnum[1]); err = -EINVAL; } goto failed; } for (i = 1; i < 4; i++) { err = nilfs_segment_list_add(head, segnum[i]); if (unlikely(err)) goto failed; } /* * Collecting segments written after the latest super root. * These are marked dirty to avoid being reallocated in the next write. 
*/ list_for_each_entry_safe(ent, n, head, list) { if (ent->segnum != segnum[0]) { err = nilfs_sufile_scrap(sufile, ent->segnum); if (unlikely(err)) goto failed; } list_del(&ent->list); kfree(ent); } /* Allocate new segments for recovery */ err = nilfs_sufile_alloc(sufile, &segnum[0]); if (unlikely(err)) goto failed; nilfs->ns_pseg_offset = 0; nilfs->ns_seg_seq = ri->ri_seq + 2; nilfs->ns_nextnum = nilfs->ns_segnum = segnum[0]; failed: /* No need to recover sufile because it will be destroyed on error */ return err; } static int nilfs_recovery_copy_block(struct the_nilfs *nilfs, struct nilfs_recovery_block *rb, loff_t pos, struct folio *folio) { struct buffer_head *bh_org; size_t from = offset_in_folio(folio, pos); bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize); if (unlikely(!bh_org)) return -EIO; memcpy_to_folio(folio, from, bh_org->b_data, bh_org->b_size); brelse(bh_org); return 0; } static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, struct super_block *sb, struct nilfs_root *root, struct list_head *head, unsigned long *nr_salvaged_blocks) { struct inode *inode; struct nilfs_recovery_block *rb, *n; unsigned int blocksize = nilfs->ns_blocksize; struct folio *folio; loff_t pos; int err = 0, err2 = 0; list_for_each_entry_safe(rb, n, head, list) { inode = nilfs_iget(sb, root, rb->ino); if (IS_ERR(inode)) { err = PTR_ERR(inode); inode = NULL; goto failed_inode; } pos = rb->blkoff << inode->i_blkbits; err = block_write_begin(inode->i_mapping, pos, blocksize, &folio, nilfs_get_block); if (unlikely(err)) { loff_t isize = inode->i_size; if (pos + blocksize > isize) nilfs_write_failed(inode->i_mapping, pos + blocksize); goto failed_inode; } err = nilfs_recovery_copy_block(nilfs, rb, pos, folio); if (unlikely(err)) goto failed_folio; err = nilfs_set_file_dirty(inode, 1); if (unlikely(err)) goto failed_folio; block_write_end(pos, blocksize, blocksize, folio); folio_unlock(folio); folio_put(folio); (*nr_salvaged_blocks)++; goto next; failed_folio: folio_unlock(folio); folio_put(folio); failed_inode: nilfs_warn(sb, "error %d recovering data block (ino=%lu, block-offset=%llu)", err, (unsigned long)rb->ino, (unsigned long long)rb->blkoff); if (!err2) err2 = err; next: iput(inode); /* iput(NULL) is just ignored */ list_del_init(&rb->list); kfree(rb); } return err2; } /** * nilfs_do_roll_forward - salvage logical segments newer than the latest * checkpoint * @nilfs: nilfs object * @sb: super block instance * @root: NILFS root instance * @ri: pointer to a nilfs_recovery_info * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - Log format error. * * %-EIO - I/O error. * * %-ENOMEM - Insufficient memory available. 
*/ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, struct super_block *sb, struct nilfs_root *root, struct nilfs_recovery_info *ri) { struct buffer_head *bh_sum = NULL; struct nilfs_segment_summary *sum = NULL; sector_t pseg_start; sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */ unsigned long nsalvaged_blocks = 0; unsigned int flags; u64 seg_seq; __u64 segnum, nextnum = 0; int empty_seg = 0; int err = 0, ret; LIST_HEAD(dsync_blocks); /* list of data blocks to be recovered */ enum { RF_INIT_ST, RF_DSYNC_ST, /* scanning data-sync segments */ }; int state = RF_INIT_ST; pseg_start = ri->ri_lsegs_start; seg_seq = ri->ri_lsegs_start_seq; segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) { brelse(bh_sum); bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum); if (!bh_sum) { err = -EIO; goto failed; } ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum); if (ret) { if (ret == NILFS_SEG_FAIL_IO) { err = -EIO; goto failed; } goto strayed; } flags = le16_to_cpu(sum->ss_flags); if (flags & NILFS_SS_SR) goto confused; /* Found a valid partial segment; do recovery actions */ nextnum = nilfs_get_segnum_of_block(nilfs, le64_to_cpu(sum->ss_next)); empty_seg = 0; nilfs->ns_ctime = le64_to_cpu(sum->ss_create); if (!(flags & NILFS_SS_GC)) nilfs->ns_nongc_ctime = nilfs->ns_ctime; switch (state) { case RF_INIT_ST: if (!(flags & NILFS_SS_LOGBGN) || !(flags & NILFS_SS_SYNDT)) goto try_next_pseg; state = RF_DSYNC_ST; fallthrough; case RF_DSYNC_ST: if (!(flags & NILFS_SS_SYNDT)) goto confused; err = nilfs_scan_dsync_log(nilfs, pseg_start, sum, &dsync_blocks); if (unlikely(err)) goto failed; if (flags & NILFS_SS_LOGEND) { err = nilfs_recover_dsync_blocks( nilfs, sb, root, &dsync_blocks, &nsalvaged_blocks); if (unlikely(err)) goto failed; state = RF_INIT_ST; } break; /* Fall through to try_next_pseg */ } try_next_pseg: if (pseg_start == ri->ri_lsegs_end) break; pseg_start += le32_to_cpu(sum->ss_nblocks); if (pseg_start < seg_end) continue; goto feed_segment; strayed: if (pseg_start == ri->ri_lsegs_end) break; feed_segment: /* Looking to the next full segment */ if (empty_seg++) break; seg_seq++; segnum = nextnum; nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); pseg_start = seg_start; } if (nsalvaged_blocks) { nilfs_info(sb, "salvaged %lu blocks", nsalvaged_blocks); ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE; } out: brelse(bh_sum); dispose_recovery_list(&dsync_blocks); return err; confused: err = -EINVAL; failed: nilfs_err(sb, "error %d roll-forwarding partial segment at blocknr = %llu", err, (unsigned long long)pseg_start); goto out; } static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, struct nilfs_recovery_info *ri) { struct buffer_head *bh; int err; if (nilfs_get_segnum_of_block(nilfs, ri->ri_lsegs_start) != nilfs_get_segnum_of_block(nilfs, ri->ri_super_root)) return; bh = __getblk(nilfs->ns_bdev, ri->ri_lsegs_start, nilfs->ns_blocksize); if (WARN_ON(!bh)) return; /* should never happen */ lock_buffer(bh); memset(bh->b_data, 0, bh->b_size); set_buffer_uptodate(bh); set_buffer_dirty(bh); unlock_buffer(bh); err = sync_dirty_buffer(bh); if (unlikely(err)) nilfs_warn(nilfs->ns_sb, "buffer sync write failed during post-cleaning of recovery."); brelse(bh); } /** * nilfs_abort_roll_forward - cleaning up after a failed rollforward recovery * @nilfs: nilfs object */ static void nilfs_abort_roll_forward(struct the_nilfs 
*nilfs) { struct nilfs_inode_info *ii, *n; LIST_HEAD(head); /* Abandon inodes that have read recovery data */ spin_lock(&nilfs->ns_inode_lock); list_splice_init(&nilfs->ns_dirty_files, &head); spin_unlock(&nilfs->ns_inode_lock); if (list_empty(&head)) return; set_nilfs_purging(nilfs); list_for_each_entry_safe(ii, n, &head, i_dirty) { spin_lock(&nilfs->ns_inode_lock); list_del_init(&ii->i_dirty); spin_unlock(&nilfs->ns_inode_lock); iput(&ii->vfs_inode); } clear_nilfs_purging(nilfs); } /** * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint * @nilfs: nilfs object * @sb: super block instance * @ri: pointer to a nilfs_recovery_info struct to store search results. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - Inconsistent filesystem state. * * %-EIO - I/O error. * * %-ENOMEM - Insufficient memory available. * * %-ENOSPC - No space left on device (only in a panic state). * * %-ERESTARTSYS - Interrupted. */ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, struct super_block *sb, struct nilfs_recovery_info *ri) { struct nilfs_root *root; int err; if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0) return 0; err = nilfs_attach_checkpoint(sb, ri->ri_cno, true, &root); if (unlikely(err)) { nilfs_err(sb, "error %d loading the latest checkpoint", err); return err; } err = nilfs_do_roll_forward(nilfs, sb, root, ri); if (unlikely(err)) goto failed; if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) { err = nilfs_prepare_segment_for_recovery(nilfs, sb, ri); if (unlikely(err)) { nilfs_err(sb, "error %d preparing segment for recovery", err); goto failed; } err = nilfs_attach_log_writer(sb, root); if (unlikely(err)) goto failed; set_nilfs_discontinued(nilfs); err = nilfs_construct_segment(sb); nilfs_detach_log_writer(sb); if (unlikely(err)) { nilfs_err(sb, "error %d writing segment for recovery", err); goto put_root; } nilfs_finish_roll_forward(nilfs, ri); } put_root: nilfs_put_root(root); return err; failed: nilfs_abort_roll_forward(nilfs); goto put_root; } /** * nilfs_search_super_root - search the latest valid super root * @nilfs: the_nilfs * @ri: pointer to a nilfs_recovery_info struct to store search results. * * nilfs_search_super_root() looks for the latest super-root from a partial * segment pointed by the superblock. It sets up struct the_nilfs through * this search. It fills nilfs_recovery_info (ri) required for recovery. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - No valid segment found. * * %-EIO - I/O error. * * %-ENOMEM - Insufficient memory available. 
*/ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_recovery_info *ri) { struct buffer_head *bh_sum = NULL; struct nilfs_segment_summary *sum = NULL; sector_t pseg_start, pseg_end, sr_pseg_start = 0; sector_t seg_start, seg_end; /* range of full segment (block number) */ sector_t b, end; unsigned long nblocks; unsigned int flags; u64 seg_seq; __u64 segnum, nextnum = 0; __u64 cno; LIST_HEAD(segments); int empty_seg = 0, scan_newer = 0; int ret; pseg_start = nilfs->ns_last_pseg; seg_seq = nilfs->ns_last_seq; cno = nilfs->ns_last_cno; segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); /* Calculate range of segment */ nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); /* Read ahead segment */ b = seg_start; while (b <= seg_end) __breadahead(nilfs->ns_bdev, b++, nilfs->ns_blocksize); for (;;) { brelse(bh_sum); ret = NILFS_SEG_FAIL_IO; bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum); if (!bh_sum) goto failed; ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum); if (ret) { if (ret == NILFS_SEG_FAIL_IO) goto failed; goto strayed; } nblocks = le32_to_cpu(sum->ss_nblocks); pseg_end = pseg_start + nblocks - 1; if (unlikely(pseg_end > seg_end)) { ret = NILFS_SEG_FAIL_CONSISTENCY; goto strayed; } /* A valid partial segment */ ri->ri_pseg_start = pseg_start; ri->ri_seq = seg_seq; ri->ri_segnum = segnum; nextnum = nilfs_get_segnum_of_block(nilfs, le64_to_cpu(sum->ss_next)); ri->ri_nextnum = nextnum; empty_seg = 0; flags = le16_to_cpu(sum->ss_flags); if (!(flags & NILFS_SS_SR) && !scan_newer) { /* * This will never happen because a superblock * (last_segment) always points to a pseg with * a super root. */ ret = NILFS_SEG_FAIL_CONSISTENCY; goto failed; } if (pseg_start == seg_start) { nilfs_get_segment_range(nilfs, nextnum, &b, &end); while (b <= end) __breadahead(nilfs->ns_bdev, b++, nilfs->ns_blocksize); } if (!(flags & NILFS_SS_SR)) { if (!ri->ri_lsegs_start && (flags & NILFS_SS_LOGBGN)) { ri->ri_lsegs_start = pseg_start; ri->ri_lsegs_start_seq = seg_seq; } if (flags & NILFS_SS_LOGEND) ri->ri_lsegs_end = pseg_start; goto try_next_pseg; } /* A valid super root was found. */ ri->ri_cno = cno++; ri->ri_super_root = pseg_end; ri->ri_lsegs_start = ri->ri_lsegs_end = 0; nilfs_dispose_segment_list(&segments); sr_pseg_start = pseg_start; nilfs->ns_pseg_offset = pseg_start + nblocks - seg_start; nilfs->ns_seg_seq = seg_seq; nilfs->ns_segnum = segnum; nilfs->ns_cno = cno; /* nilfs->ns_cno = ri->ri_cno + 1 */ nilfs->ns_ctime = le64_to_cpu(sum->ss_create); nilfs->ns_nextnum = nextnum; if (scan_newer) ri->ri_need_recovery = NILFS_RECOVERY_SR_UPDATED; else { if (nilfs->ns_mount_state & NILFS_VALID_FS) goto super_root_found; scan_newer = 1; } try_next_pseg: /* Standing on a course, or met an inconsistent state */ pseg_start += nblocks; if (pseg_start < seg_end) continue; goto feed_segment; strayed: /* Off the trail */ if (!scan_newer) /* * This can happen if a checkpoint was written without * barriers, or as a result of an I/O failure. 
*/ goto failed; feed_segment: /* Looking to the next full segment */ if (empty_seg++) goto super_root_found; /* found a valid super root */ ret = nilfs_segment_list_add(&segments, segnum); if (unlikely(ret)) goto failed; seg_seq++; segnum = nextnum; nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); pseg_start = seg_start; } super_root_found: /* Updating pointers relating to the latest checkpoint */ brelse(bh_sum); list_splice_tail(&segments, &ri->ri_used_segments); nilfs->ns_last_pseg = sr_pseg_start; nilfs->ns_last_seq = nilfs->ns_seg_seq; nilfs->ns_last_cno = ri->ri_cno; return 0; failed: brelse(bh_sum); nilfs_dispose_segment_list(&segments); return ret < 0 ? ret : nilfs_warn_segment_error(nilfs->ns_sb, ret); } |
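nilfs_compute_checksum() and nilfs_validate_log() above verify a log by CRC-ing everything after the stored checksum field and chaining the CRC across consecutive disk blocks. The userspace sketch below illustrates only that chaining structure under assumed parameters; zlib's crc32() is merely a stand-in and is not bit-identical to the kernel's crc32_le(), and the `skip` argument mirrors the sizeof(sum->ss_datasum) / sizeof(sr->sr_sum) offsets used above.

#include <stddef.h>
#include <stdint.h>
#include <zlib.h>

#define BLOCK_SIZE 4096u

/*
 * Chained CRC over check_bytes bytes of a log held in blocks[], skipping
 * the first `skip` bytes of block 0 (the on-disk checksum field itself),
 * mirroring the block-by-block loop in nilfs_compute_checksum().
 */
static uint32_t log_checksum(const unsigned char blocks[][BLOCK_SIZE],
			     unsigned long nblock, uint64_t check_bytes,
			     unsigned int skip, uint32_t seed)
{
	uint64_t remaining = check_bytes - skip;
	size_t size = remaining < BLOCK_SIZE - skip ? remaining : BLOCK_SIZE - skip;
	uint32_t crc = crc32(seed, blocks[0] + skip, size);
	unsigned long i;

	for (i = 1; i < nblock; i++) {
		remaining -= size;
		size = remaining < BLOCK_SIZE ? remaining : BLOCK_SIZE;
		crc = crc32(crc, blocks[i], size);
	}
	return crc;
}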
// SPDX-License-Identifier: GPL-2.0-or-later /* Filesystem parameter parser. * * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/namei.h> #include "internal.h" static const struct constant_table bool_names[] = { { "0", false }, { "1", true }, { "false", false }, { "no", false }, { "true", true }, { "yes", true }, { }, }; static const struct constant_table * __lookup_constant(const struct constant_table *tbl, const char *name) { for ( ; tbl->name; tbl++) if (strcmp(name, tbl->name) == 0) return tbl; return NULL; } /** * lookup_constant - Look up a constant by name in an ordered table * @tbl: The table of constants to search. * @name: The name to look up. * @not_found: The value to return if the name is not found. */ int lookup_constant(const struct constant_table *tbl, const char *name, int not_found) { const struct constant_table *p = __lookup_constant(tbl, name); return p ? p->value : not_found; } EXPORT_SYMBOL(lookup_constant); static inline bool is_flag(const struct fs_parameter_spec *p) { return p->type == NULL; } static const struct fs_parameter_spec *fs_lookup_key( const struct fs_parameter_spec *desc, struct fs_parameter *param, bool *negated) { const struct fs_parameter_spec *p, *other = NULL; const char *name = param->key; bool want_flag = param->type == fs_value_is_flag; *negated = false; for (p = desc; p->name; p++) { if (strcmp(p->name, name) != 0) continue; if (likely(is_flag(p) == want_flag)) return p; other = p; } if (want_flag) { if (name[0] == 'n' && name[1] == 'o' && name[2]) { for (p = desc; p->name; p++) { if (strcmp(p->name, name + 2) != 0) continue; if (!(p->flags & fs_param_neg_with_no)) continue; *negated = true; return p; } } } return other; } /* * __fs_parse - Parse a filesystem configuration parameter * @log: The filesystem context to log errors through.
* @desc: The parameter description to use. * @param: The parameter. * @result: Where to place the result of the parse * * Parse a filesystem configuration parameter and attempt a conversion for a * simple parameter for which this is requested. If successful, the determined * parameter ID is placed into @result->key, the desired type is indicated in * @result->t and any converted value is placed into an appropriate member of * the union in @result. * * The function returns the parameter number if the parameter was matched, * -ENOPARAM if it wasn't matched and @desc->ignore_unknown indicated that * unknown parameters are okay and -EINVAL if there was a conversion issue or * the parameter wasn't recognised and unknowns aren't okay. */ int __fs_parse(struct p_log *log, const struct fs_parameter_spec *desc, struct fs_parameter *param, struct fs_parse_result *result) { const struct fs_parameter_spec *p; result->uint_64 = 0; p = fs_lookup_key(desc, param, &result->negated); if (!p) return -ENOPARAM; if (p->flags & fs_param_deprecated) warn_plog(log, "Deprecated parameter '%s'", param->key); /* Try to turn the type we were given into the type desired by the * parameter and give an error if we can't. */ if (is_flag(p)) { if (param->type != fs_value_is_flag) return inval_plog(log, "Unexpected value for '%s'", param->key); result->boolean = !result->negated; } else { int ret = p->type(log, p, param, result); if (ret) return ret; } return p->opt; } EXPORT_SYMBOL(__fs_parse); /** * fs_lookup_param - Look up a path referred to by a parameter * @fc: The filesystem context to log errors through. * @param: The parameter. * @want_bdev: T if want a blockdev * @flags: Pathwalk flags passed to filename_lookup() * @_path: The result of the lookup */ int fs_lookup_param(struct fs_context *fc, struct fs_parameter *param, bool want_bdev, unsigned int flags, struct path *_path) { struct filename *f; bool put_f; int ret; switch (param->type) { case fs_value_is_string: f = getname_kernel(param->string); if (IS_ERR(f)) return PTR_ERR(f); param->dirfd = AT_FDCWD; put_f = true; break; case fs_value_is_filename: f = param->name; put_f = false; break; default: return invalf(fc, "%s: not usable as path", param->key); } ret = filename_lookup(param->dirfd, f, flags, _path, NULL); if (ret < 0) { errorf(fc, "%s: Lookup failure for '%s'", param->key, f->name); goto out; } if (want_bdev && !S_ISBLK(d_backing_inode(_path->dentry)->i_mode)) { path_put(_path); _path->dentry = NULL; _path->mnt = NULL; errorf(fc, "%s: Non-blockdev passed as '%s'", param->key, f->name); ret = -ENOTBLK; } out: if (put_f) putname(f); return ret; } EXPORT_SYMBOL(fs_lookup_param); static int fs_param_bad_value(struct p_log *log, struct fs_parameter *param) { return inval_plog(log, "Bad value for '%s'", param->key); } int fs_param_is_bool(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { int b; if (param->type != fs_value_is_string) return fs_param_bad_value(log, param); if (!*param->string && (p->flags & fs_param_can_be_empty)) return 0; b = lookup_constant(bool_names, param->string, -1); if (b == -1) return fs_param_bad_value(log, param); result->boolean = b; return 0; } EXPORT_SYMBOL(fs_param_is_bool); int fs_param_is_u32(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { int base = (unsigned long)p->data; if (param->type != fs_value_is_string) return fs_param_bad_value(log, param); if (!*param->string && (p->flags & 
fs_param_can_be_empty)) return 0; if (kstrtouint(param->string, base, &result->uint_32) < 0) return fs_param_bad_value(log, param); return 0; } EXPORT_SYMBOL(fs_param_is_u32); int fs_param_is_s32(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { if (param->type != fs_value_is_string) return fs_param_bad_value(log, param); if (!*param->string && (p->flags & fs_param_can_be_empty)) return 0; if (kstrtoint(param->string, 0, &result->int_32) < 0) return fs_param_bad_value(log, param); return 0; } EXPORT_SYMBOL(fs_param_is_s32); int fs_param_is_u64(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { if (param->type != fs_value_is_string) return fs_param_bad_value(log, param); if (!*param->string && (p->flags & fs_param_can_be_empty)) return 0; if (kstrtoull(param->string, 0, &result->uint_64) < 0) return fs_param_bad_value(log, param); return 0; } EXPORT_SYMBOL(fs_param_is_u64); int fs_param_is_enum(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { const struct constant_table *c; if (param->type != fs_value_is_string) return fs_param_bad_value(log, param); if (!*param->string && (p->flags & fs_param_can_be_empty)) return 0; c = __lookup_constant(p->data, param->string); if (!c) return fs_param_bad_value(log, param); result->uint_32 = c->value; return 0; } EXPORT_SYMBOL(fs_param_is_enum); int fs_param_is_string(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { if (param->type != fs_value_is_string || (!*param->string && !(p->flags & fs_param_can_be_empty))) return fs_param_bad_value(log, param); return 0; } EXPORT_SYMBOL(fs_param_is_string); int fs_param_is_fd(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { switch (param->type) { case fs_value_is_string: if ((!*param->string && !(p->flags & fs_param_can_be_empty)) || kstrtouint(param->string, 0, &result->uint_32) < 0) break; if (result->uint_32 <= INT_MAX) return 0; break; case fs_value_is_file: result->uint_32 = param->dirfd; if (result->uint_32 <= INT_MAX) return 0; break; default: break; } return fs_param_bad_value(log, param); } EXPORT_SYMBOL(fs_param_is_fd); int fs_param_is_file_or_string(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { switch (param->type) { case fs_value_is_string: return fs_param_is_string(log, p, param, result); case fs_value_is_file: result->uint_32 = param->dirfd; if (result->uint_32 <= INT_MAX) return 0; break; default: break; } return fs_param_bad_value(log, param); } EXPORT_SYMBOL(fs_param_is_file_or_string); int fs_param_is_uid(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { kuid_t uid; if (fs_param_is_u32(log, p, param, result) != 0) return fs_param_bad_value(log, param); uid = make_kuid(current_user_ns(), result->uint_32); if (!uid_valid(uid)) return inval_plog(log, "Invalid uid '%s'", param->string); result->uid = uid; return 0; } EXPORT_SYMBOL(fs_param_is_uid); int fs_param_is_gid(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { kgid_t gid; if (fs_param_is_u32(log, p, param, result) != 0) return fs_param_bad_value(log, param); gid = make_kgid(current_user_ns(), result->uint_32); if 
(!gid_valid(gid)) return inval_plog(log, "Invalid gid '%s'", param->string); result->gid = gid; return 0; } EXPORT_SYMBOL(fs_param_is_gid); int fs_param_is_blockdev(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { return 0; } EXPORT_SYMBOL(fs_param_is_blockdev); #ifdef CONFIG_VALIDATE_FS_PARSER /** * fs_validate_description - Validate a parameter specification array * @name: Owner name of the parameter specification array * @desc: The parameter specification array to validate. */ bool fs_validate_description(const char *name, const struct fs_parameter_spec *desc) { const struct fs_parameter_spec *param, *p2; bool good = true; for (param = desc; param->name; param++) { /* Check for duplicate parameter names */ for (p2 = desc; p2 < param; p2++) { if (strcmp(param->name, p2->name) == 0) { if (is_flag(param) != is_flag(p2)) continue; pr_err("VALIDATE %s: PARAM[%s]: Duplicate\n", name, param->name); good = false; } } } return good; } #endif /* CONFIG_VALIDATE_FS_PARSER */ |
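For orientation, here is a minimal sketch of how a filesystem typically consumes this parser from its ->parse_param() hook. The myfs_* names, the option set and the context layout are invented for illustration; fs_parse(), the fsparam_*() table macros and struct fs_parse_result are the interfaces declared in <linux/fs_parser.h> that the code above implements.

#include <linux/fs_context.h>
#include <linux/fs_parser.h>

enum { Opt_mode, Opt_size, Opt_nodots };

/* Table consumed by fs_lookup_key()/__fs_parse(); terminated by an empty slot. */
static const struct fs_parameter_spec myfs_fs_parameters[] = {
	fsparam_u32oct("mode",	Opt_mode),
	fsparam_u32("size",	Opt_size),
	fsparam_flag("nodots",	Opt_nodots),
	{}
};

struct myfs_fs_context {
	umode_t		mode;
	unsigned int	size;
	bool		nodots;
};

static int myfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
	struct myfs_fs_context *ctx = fc->fs_private;
	struct fs_parse_result result;
	int opt;

	/*
	 * fs_parse() wraps __fs_parse() with fc->log: it returns the Opt_*
	 * value on a match, -ENOPARAM for an unknown key and -EINVAL when
	 * the value cannot be converted to the type requested in the table.
	 */
	opt = fs_parse(fc, myfs_fs_parameters, param, &result);
	if (opt < 0)
		return opt;

	switch (opt) {
	case Opt_mode:
		ctx->mode = result.uint_32 & 07777;
		break;
	case Opt_size:
		ctx->size = result.uint_32;
		break;
	case Opt_nodots:
		ctx->nodots = true;
		break;
	}
	return 0;
}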
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) /* Copyright (C) 2016-2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. * * SipHash: a fast short-input PRF * https://131002.net/siphash/ * * This implementation is specifically for SipHash2-4 for a secure PRF * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for * hashtables.
*/ #include <linux/siphash.h> #include <linux/unaligned.h> #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 #include <linux/dcache.h> #include <asm/word-at-a-time.h> #endif #define SIPROUND SIPHASH_PERMUTATION(v0, v1, v2, v3) #define PREAMBLE(len) \ u64 v0 = SIPHASH_CONST_0; \ u64 v1 = SIPHASH_CONST_1; \ u64 v2 = SIPHASH_CONST_2; \ u64 v3 = SIPHASH_CONST_3; \ u64 b = ((u64)(len)) << 56; \ v3 ^= key->key[1]; \ v2 ^= key->key[0]; \ v1 ^= key->key[1]; \ v0 ^= key->key[0]; #define POSTAMBLE \ v3 ^= b; \ SIPROUND; \ SIPROUND; \ v0 ^= b; \ v2 ^= 0xff; \ SIPROUND; \ SIPROUND; \ SIPROUND; \ SIPROUND; \ return (v0 ^ v1) ^ (v2 ^ v3); #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; PREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = le64_to_cpup(data); v3 ^= m; SIPROUND; SIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= le32_to_cpup(data); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= le16_to_cpup(data); break; case 1: b |= end[0]; } #endif POSTAMBLE } EXPORT_SYMBOL(__siphash_aligned); #endif u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; PREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = get_unaligned_le64(data); v3 ^= m; SIPROUND; SIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= get_unaligned_le32(end); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= get_unaligned_le16(end); break; case 1: b |= end[0]; } #endif POSTAMBLE } EXPORT_SYMBOL(__siphash_unaligned); /** * siphash_1u64 - compute 64-bit siphash PRF value of a u64 * @first: first u64 * @key: the siphash key */ u64 siphash_1u64(const u64 first, const siphash_key_t *key) { PREAMBLE(8) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; POSTAMBLE } EXPORT_SYMBOL(siphash_1u64); /** * siphash_2u64 - compute 64-bit siphash PRF value of 2 u64 * @first: first u64 * @second: second u64 * @key: the siphash key */ u64 siphash_2u64(const u64 first, const u64 second, const siphash_key_t *key) { PREAMBLE(16) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; v3 ^= second; SIPROUND; SIPROUND; v0 ^= second; POSTAMBLE } EXPORT_SYMBOL(siphash_2u64); /** * siphash_3u64 - compute 64-bit siphash PRF value of 3 u64 * @first: first u64 * @second: second u64 * @third: third u64 * @key: the siphash key */ u64 siphash_3u64(const u64 first, const u64 second, const u64 third, const siphash_key_t *key) { PREAMBLE(24) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; v3 ^= second; SIPROUND; SIPROUND; v0 ^= second; v3 ^= third; SIPROUND; SIPROUND; v0 ^= third; POSTAMBLE } EXPORT_SYMBOL(siphash_3u64); /** * siphash_4u64 - compute 64-bit siphash PRF value of 4 u64 * @first: first u64 * @second: 
second u64 * @third: third u64 * @forth: forth u64 * @key: the siphash key */ u64 siphash_4u64(const u64 first, const u64 second, const u64 third, const u64 forth, const siphash_key_t *key) { PREAMBLE(32) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; v3 ^= second; SIPROUND; SIPROUND; v0 ^= second; v3 ^= third; SIPROUND; SIPROUND; v0 ^= third; v3 ^= forth; SIPROUND; SIPROUND; v0 ^= forth; POSTAMBLE } EXPORT_SYMBOL(siphash_4u64); u64 siphash_1u32(const u32 first, const siphash_key_t *key) { PREAMBLE(4) b |= first; POSTAMBLE } EXPORT_SYMBOL(siphash_1u32); u64 siphash_3u32(const u32 first, const u32 second, const u32 third, const siphash_key_t *key) { u64 combined = (u64)second << 32 | first; PREAMBLE(12) v3 ^= combined; SIPROUND; SIPROUND; v0 ^= combined; b |= third; POSTAMBLE } EXPORT_SYMBOL(siphash_3u32); #if BITS_PER_LONG == 64 /* Note that on 64-bit, we make HalfSipHash1-3 actually be SipHash1-3, for * performance reasons. On 32-bit, below, we actually implement HalfSipHash1-3. */ #define HSIPROUND SIPROUND #define HPREAMBLE(len) PREAMBLE(len) #define HPOSTAMBLE \ v3 ^= b; \ HSIPROUND; \ v0 ^= b; \ v2 ^= 0xff; \ HSIPROUND; \ HSIPROUND; \ HSIPROUND; \ return (v0 ^ v1) ^ (v2 ^ v3); #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; HPREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = le64_to_cpup(data); v3 ^= m; HSIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= le32_to_cpup(data); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= le16_to_cpup(data); break; case 1: b |= end[0]; } #endif HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_aligned); #endif u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; HPREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = get_unaligned_le64(data); v3 ^= m; HSIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= get_unaligned_le32(end); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= get_unaligned_le16(end); break; case 1: b |= end[0]; } #endif HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_unaligned); /** * hsiphash_1u32 - compute 64-bit hsiphash PRF value of a u32 * @first: first u32 * @key: the hsiphash key */ u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key) { HPREAMBLE(4) b |= first; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_1u32); /** * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32 * @first: first u32 * @second: second u32 * @key: the hsiphash key */ u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key) { u64 combined = (u64)second << 32 | first; HPREAMBLE(8) v3 ^= combined; HSIPROUND; v0 ^= combined; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_2u32); /** * hsiphash_3u32 - 
compute 32-bit hsiphash PRF value of 3 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @key: the hsiphash key */ u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third, const hsiphash_key_t *key) { u64 combined = (u64)second << 32 | first; HPREAMBLE(12) v3 ^= combined; HSIPROUND; v0 ^= combined; b |= third; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_3u32); /** * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @forth: forth u32 * @key: the hsiphash key */ u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third, const u32 forth, const hsiphash_key_t *key) { u64 combined = (u64)second << 32 | first; HPREAMBLE(16) v3 ^= combined; HSIPROUND; v0 ^= combined; combined = (u64)forth << 32 | third; v3 ^= combined; HSIPROUND; v0 ^= combined; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_4u32); #else #define HSIPROUND HSIPHASH_PERMUTATION(v0, v1, v2, v3) #define HPREAMBLE(len) \ u32 v0 = HSIPHASH_CONST_0; \ u32 v1 = HSIPHASH_CONST_1; \ u32 v2 = HSIPHASH_CONST_2; \ u32 v3 = HSIPHASH_CONST_3; \ u32 b = ((u32)(len)) << 24; \ v3 ^= key->key[1]; \ v2 ^= key->key[0]; \ v1 ^= key->key[1]; \ v0 ^= key->key[0]; #define HPOSTAMBLE \ v3 ^= b; \ HSIPROUND; \ v0 ^= b; \ v2 ^= 0xff; \ HSIPROUND; \ HSIPROUND; \ HSIPROUND; \ return v1 ^ v3; #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u32)); const u8 left = len & (sizeof(u32) - 1); u32 m; HPREAMBLE(len) for (; data != end; data += sizeof(u32)) { m = le32_to_cpup(data); v3 ^= m; HSIPROUND; v0 ^= m; } switch (left) { case 3: b |= ((u32)end[2]) << 16; fallthrough; case 2: b |= le16_to_cpup(data); break; case 1: b |= end[0]; } HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_aligned); #endif u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u32)); const u8 left = len & (sizeof(u32) - 1); u32 m; HPREAMBLE(len) for (; data != end; data += sizeof(u32)) { m = get_unaligned_le32(data); v3 ^= m; HSIPROUND; v0 ^= m; } switch (left) { case 3: b |= ((u32)end[2]) << 16; fallthrough; case 2: b |= get_unaligned_le16(end); break; case 1: b |= end[0]; } HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_unaligned); /** * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32 * @first: first u32 * @key: the hsiphash key */ u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key) { HPREAMBLE(4) v3 ^= first; HSIPROUND; v0 ^= first; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_1u32); /** * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32 * @first: first u32 * @second: second u32 * @key: the hsiphash key */ u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key) { HPREAMBLE(8) v3 ^= first; HSIPROUND; v0 ^= first; v3 ^= second; HSIPROUND; v0 ^= second; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_2u32); /** * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @key: the hsiphash key */ u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third, const hsiphash_key_t *key) { HPREAMBLE(12) v3 ^= first; HSIPROUND; v0 ^= first; v3 ^= second; HSIPROUND; v0 ^= second; v3 ^= third; HSIPROUND; v0 ^= third; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_3u32); /** * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @forth: forth u32 * @key: the hsiphash 
key */ u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third, const u32 forth, const hsiphash_key_t *key) { HPREAMBLE(16) v3 ^= first; HSIPROUND; v0 ^= first; v3 ^= second; HSIPROUND; v0 ^= second; v3 ^= third; HSIPROUND; v0 ^= third; v3 ^= forth; HSIPROUND; v0 ^= forth; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_4u32); #endif |
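A short usage sketch, not part of lib/siphash.c: the my_* names are made up, while siphash_key_t, siphash_1u64(), siphash() and get_random_bytes() are the interfaces from <linux/siphash.h> and <linux/random.h>. The key must be random and secret for the PRF guarantee to mean anything; the fixed-width helpers avoid the length and tail handling of the generic entry point.

#include <linux/random.h>
#include <linux/siphash.h>
#include <linux/string.h>

static siphash_key_t my_hash_key;

static void my_hash_init(void)
{
	/* A per-boot random key; without a secret key SipHash is just an
	 * ordinary (and fairly slow) hash function. */
	get_random_bytes(&my_hash_key, sizeof(my_hash_key));
}

static u32 my_hash_id(u64 id, u32 nbuckets)
{
	/* Fixed-width input: the dedicated helper skips the generic
	 * length/tail handling done by __siphash_aligned/_unaligned. */
	return (u32)siphash_1u64(id, &my_hash_key) % nbuckets;
}

static u32 my_hash_name(const char *name, u32 nbuckets)
{
	/* Variable-length input goes through the generic entry point,
	 * which dispatches to the aligned or unaligned variant above. */
	return (u32)siphash(name, strlen(name), &my_hash_key) % nbuckets;
}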
/* SPDX-License-Identifier: GPL-2.0-only */ /* * kref.h - library routines for handling generic reference counted objects * * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Corp. * * based on kobject.h which was: * Copyright (C) 2002-2003 Patrick Mochel <mochel@osdl.org> * Copyright (C) 2002-2003 Open Source Development Labs */ #ifndef _KREF_H_ #define _KREF_H_ #include <linux/spinlock.h> #include <linux/refcount.h> struct kref { refcount_t refcount; }; #define KREF_INIT(n) { .refcount = REFCOUNT_INIT(n), } /** * kref_init - initialize object. * @kref: object in question. */ static inline void kref_init(struct kref *kref) { refcount_set(&kref->refcount, 1); } static inline unsigned int kref_read(const struct kref *kref) { return refcount_read(&kref->refcount); } /** * kref_get - increment refcount for object. * @kref: object. */ static inline void kref_get(struct kref *kref) { refcount_inc(&kref->refcount); } /** * kref_put - Decrement refcount for object * @kref: Object * @release: Pointer to the function that will clean up the object when the * last reference to the object is released. * * Decrement the refcount, and if 0, call @release. The caller may not * pass NULL or kfree() as the release function. * * Return: 1 if this call removed the object, otherwise return 0. Beware, * if this function returns 0, another caller may have removed the object * by the time this function returns. The return value only tells you * whether this particular call released the object. */ static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref)) { if (refcount_dec_and_test(&kref->refcount)) { release(kref); return 1; } return 0; } /** * kref_put_mutex - Decrement refcount for object * @kref: Object * @release: Pointer to the function that will clean up the object when the * last reference to the object is released. * @mutex: Mutex which protects the release function. * * This variant of kref_put() calls the @release function with the @mutex * held. The @release function will release the mutex. */ static inline int kref_put_mutex(struct kref *kref, void (*release)(struct kref *kref), struct mutex *mutex) __cond_acquires(true, mutex) { if (refcount_dec_and_mutex_lock(&kref->refcount, mutex)) { release(kref); return 1; } return 0; } /** * kref_put_lock - Decrement refcount for object * @kref: Object * @release: Pointer to the function that will clean up the object when the * last reference to the object is released. * @lock: Spinlock which protects the release function. * * This variant of kref_put() calls the @release function with the @lock * held. The @release function will release the lock. */ static inline int kref_put_lock(struct kref *kref, void (*release)(struct kref *kref), spinlock_t *lock) __cond_acquires(true, lock) { if (refcount_dec_and_lock(&kref->refcount, lock)) { release(kref); return 1; } return 0; } /** * kref_get_unless_zero - Increment refcount for object unless it is zero. * @kref: object.
* * This function is intended to simplify locking around refcounting for * objects that can be looked up from a lookup structure, and which are * removed from that lookup structure in the object destructor. * Operations on such objects require at least a read lock around * lookup + kref_get, and a write lock around kref_put + remove from lookup * structure. Furthermore, RCU implementations become extremely tricky. * With a lookup followed by a kref_get_unless_zero *with return value check* * locking in the kref_put path can be deferred to the actual removal from * the lookup structure and RCU lookups become trivial. * * Return: non-zero if the increment succeeded. Otherwise return 0. */ static inline int __must_check kref_get_unless_zero(struct kref *kref) { return refcount_inc_not_zero(&kref->refcount); } #endif /* _KREF_H_ */ |
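A minimal sketch of the intended usage pattern; struct my_object and its helpers are hypothetical, and the kref_*() calls are the API defined above. The creator starts at a refcount of 1, every additional user takes a reference with kref_get(), and the last kref_put() invokes the release callback, which is the only place the object may be freed.

#include <linux/container_of.h>
#include <linux/kref.h>
#include <linux/slab.h>

struct my_object {
	struct kref	refcount;
	int		payload;
};

static void my_object_release(struct kref *kref)
{
	struct my_object *obj = container_of(kref, struct my_object, refcount);

	kfree(obj);
}

static struct my_object *my_object_create(void)
{
	struct my_object *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

	if (!obj)
		return NULL;
	kref_init(&obj->refcount);	/* the creator holds the first reference */
	return obj;
}

static void my_object_get(struct my_object *obj)
{
	kref_get(&obj->refcount);
}

static void my_object_put(struct my_object *obj)
{
	/* The final put calls my_object_release(); never kfree() directly. */
	kref_put(&obj->refcount, my_object_release);
}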
// SPDX-License-Identifier: GPL-2.0 /* * Devices PM QoS constraints management * * Copyright (C) 2011 Texas Instruments, Inc. * * This module exposes the interface to kernel space for specifying * per-device PM QoS dependencies. It provides infrastructure for registration * of: * * Dependents on a QoS value : register requests * Watchers of QoS value : get notified when target QoS value changes * * This QoS design is best effort based. Dependents register their QoS needs. * Watchers register to keep track of the current QoS needs of the system. * Watchers can register a per-device notification callback using the * dev_pm_qos_*_notifier API. The notification chain data is stored in the * per-device constraint data struct. * * Note about the per-device constraint data struct allocation: * . The per-device constraints data struct ptr is stored into the device * dev_pm_info. * . To minimize the data usage by the per-device constraints, the data struct * is only allocated at the first call to dev_pm_qos_add_request. * . The data is later free'd when the device is removed from the system. * . A global mutex protects the constraints users from the data being * allocated and free'd. */ #include <linux/pm_qos.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/device.h> #include <linux/mutex.h> #include <linux/export.h> #include <linux/pm_runtime.h> #include <linux/err.h> #include <trace/events/power.h> #include "power.h" static DEFINE_MUTEX(dev_pm_qos_mtx); static DEFINE_MUTEX(dev_pm_qos_sysfs_mtx); /** * __dev_pm_qos_flags - Check PM QoS flags for a given device. * @dev: Device to check the PM QoS flags for. * @mask: Flags to check against. * * This routine must be called with dev->power.lock held. */ enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask) { struct dev_pm_qos *qos = dev->power.qos; struct pm_qos_flags *pqf; s32 val; lockdep_assert_held(&dev->power.lock); if (IS_ERR_OR_NULL(qos)) return PM_QOS_FLAGS_UNDEFINED; pqf = &qos->flags; if (list_empty(&pqf->list)) return PM_QOS_FLAGS_UNDEFINED; val = pqf->effective_flags & mask; if (val) return (val == mask) ? PM_QOS_FLAGS_ALL : PM_QOS_FLAGS_SOME; return PM_QOS_FLAGS_NONE; } /** * dev_pm_qos_flags - Check PM QoS flags for a given device (locked). * @dev: Device to check the PM QoS flags for. * @mask: Flags to check against. */ enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask) { unsigned long irqflags; enum pm_qos_flags_status ret; spin_lock_irqsave(&dev->power.lock, irqflags); ret = __dev_pm_qos_flags(dev, mask); spin_unlock_irqrestore(&dev->power.lock, irqflags); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_flags); /** * __dev_pm_qos_resume_latency - Get resume latency constraint for a given device. * @dev: Device to get the PM QoS constraint value for. * * This routine must be called with dev->power.lock held. */ s32 __dev_pm_qos_resume_latency(struct device *dev) { lockdep_assert_held(&dev->power.lock); return dev_pm_qos_raw_resume_latency(dev); } /** * dev_pm_qos_read_value - Get PM QoS constraint for a given device (locked). * @dev: Device to get the PM QoS constraint value for. * @type: QoS request type.
*/ s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type) { struct dev_pm_qos *qos = dev->power.qos; unsigned long flags; s32 ret; spin_lock_irqsave(&dev->power.lock, flags); switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_RESUME_LATENCY_NO_CONSTRAINT : pm_qos_read_value(&qos->resume_latency); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE : freq_qos_read_value(&qos->freq, FREQ_QOS_MIN); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE : freq_qos_read_value(&qos->freq, FREQ_QOS_MAX); break; default: WARN_ON(1); ret = 0; } spin_unlock_irqrestore(&dev->power.lock, flags); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_read_value); /** * apply_constraint - Add/modify/remove device PM QoS request. * @req: Constraint request to apply * @action: Action to perform (add/update/remove). * @value: Value to assign to the QoS request. * * Internal function to update the constraints list using the PM QoS core * code and if needed call the per-device callbacks. */ static int apply_constraint(struct dev_pm_qos_request *req, enum pm_qos_req_action action, s32 value) { struct dev_pm_qos *qos = req->dev->power.qos; int ret; switch(req->type) { case DEV_PM_QOS_RESUME_LATENCY: if (WARN_ON(action != PM_QOS_REMOVE_REQ && value < 0)) value = 0; ret = pm_qos_update_target(&qos->resume_latency, &req->data.pnode, action, value); break; case DEV_PM_QOS_LATENCY_TOLERANCE: ret = pm_qos_update_target(&qos->latency_tolerance, &req->data.pnode, action, value); if (ret) { value = pm_qos_read_value(&qos->latency_tolerance); req->dev->power.set_latency_tolerance(req->dev, value); } break; case DEV_PM_QOS_MIN_FREQUENCY: case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_apply(&req->data.freq, action, value); break; case DEV_PM_QOS_FLAGS: ret = pm_qos_update_flags(&qos->flags, &req->data.flr, action, value); break; default: ret = -EINVAL; } return ret; } /* * dev_pm_qos_constraints_allocate * @dev: device to allocate data for * * Called at the first call to add_request, for constraint data allocation * Must be called with the dev_pm_qos_mtx mutex held */ static int dev_pm_qos_constraints_allocate(struct device *dev) { struct dev_pm_qos *qos; struct pm_qos_constraints *c; struct blocking_notifier_head *n; qos = kzalloc_obj(*qos); if (!qos) return -ENOMEM; n = kzalloc_objs(*n, 3); if (!n) { kfree(qos); return -ENOMEM; } c = &qos->resume_latency; plist_head_init(&c->list); c->target_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; c->default_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; c->no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; c->type = PM_QOS_MIN; c->notifiers = n; BLOCKING_INIT_NOTIFIER_HEAD(n); c = &qos->latency_tolerance; plist_head_init(&c->list); c->target_value = PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE; c->default_value = PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE; c->no_constraint_value = PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT; c->type = PM_QOS_MIN; freq_constraints_init(&qos->freq); INIT_LIST_HEAD(&qos->flags.list); spin_lock_irq(&dev->power.lock); dev->power.qos = qos; spin_unlock_irq(&dev->power.lock); return 0; } static void __dev_pm_qos_hide_latency_limit(struct device *dev); static void __dev_pm_qos_hide_flags(struct device *dev); /** * dev_pm_qos_constraints_destroy * @dev: target device * * Called from the device PM subsystem on device removal under device_pm_lock(). 
*/ void dev_pm_qos_constraints_destroy(struct device *dev) { struct dev_pm_qos *qos; struct dev_pm_qos_request *req, *tmp; struct pm_qos_constraints *c; struct pm_qos_flags *f; mutex_lock(&dev_pm_qos_sysfs_mtx); /* * If the device's PM QoS resume latency limit or PM QoS flags have been * exposed to user space, they have to be hidden at this point. */ pm_qos_sysfs_remove_resume_latency(dev); pm_qos_sysfs_remove_flags(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_latency_limit(dev); __dev_pm_qos_hide_flags(dev); qos = dev->power.qos; if (!qos) goto out; /* Flush the constraints lists for the device. */ c = &qos->resume_latency; plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) { /* * Update constraints list and call the notification * callbacks if needed */ apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->latency_tolerance; plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->freq.min_freq; plist_for_each_entry_safe(req, tmp, &c->list, data.freq.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->freq.max_freq; plist_for_each_entry_safe(req, tmp, &c->list, data.freq.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } f = &qos->flags; list_for_each_entry_safe(req, tmp, &f->list, data.flr.node) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } spin_lock_irq(&dev->power.lock); dev->power.qos = ERR_PTR(-ENODEV); spin_unlock_irq(&dev->power.lock); kfree(qos->resume_latency.notifiers); kfree(qos); out: mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); } static bool dev_pm_qos_invalid_req_type(struct device *dev, enum dev_pm_qos_req_type type) { return type == DEV_PM_QOS_LATENCY_TOLERANCE && !dev->power.set_latency_tolerance; } static int __dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { int ret = 0; if (!dev || !req || dev_pm_qos_invalid_req_type(dev, type)) return -EINVAL; if (WARN(dev_pm_qos_request_active(req), "%s() called for already added request\n", __func__)) return -EINVAL; if (IS_ERR(dev->power.qos)) ret = -ENODEV; else if (!dev->power.qos) ret = dev_pm_qos_constraints_allocate(dev); trace_dev_pm_qos_add_request(dev_name(dev), type, value); if (ret) return ret; req->dev = dev; req->type = type; if (req->type == DEV_PM_QOS_MIN_FREQUENCY) ret = freq_qos_add_request(&dev->power.qos->freq, &req->data.freq, FREQ_QOS_MIN, value); else if (req->type == DEV_PM_QOS_MAX_FREQUENCY) ret = freq_qos_add_request(&dev->power.qos->freq, &req->data.freq, FREQ_QOS_MAX, value); else ret = apply_constraint(req, PM_QOS_ADD_REQ, value); return ret; } /** * dev_pm_qos_add_request - inserts new qos request into the list * @dev: target device for the constraint * @req: pointer to a preallocated handle * @type: type of the request * @value: defines the qos request * * This function inserts a new entry in the device constraints list of * requested qos performance characteristics. It recomputes the aggregate * QoS expectations of parameters and initializes the dev_pm_qos_request * handle. Caller needs to save this handle for later use in updates and * removal. 
* * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENOMEM if there's not enough memory * to allocate for data structures, -ENODEV if the device has just been removed * from the system. * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_add_request(dev, req, type, value); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_request); /** * __dev_pm_qos_update_request - Modify an existing device PM QoS request. * @req : PM QoS request to modify. * @new_value: New value to request. */ static int __dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value) { s32 curr_value; int ret = 0; if (!req) /*guard against callers passing in null */ return -EINVAL; if (WARN(!dev_pm_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; if (IS_ERR_OR_NULL(req->dev->power.qos)) return -ENODEV; switch(req->type) { case DEV_PM_QOS_RESUME_LATENCY: case DEV_PM_QOS_LATENCY_TOLERANCE: curr_value = req->data.pnode.prio; break; case DEV_PM_QOS_MIN_FREQUENCY: case DEV_PM_QOS_MAX_FREQUENCY: curr_value = req->data.freq.pnode.prio; break; case DEV_PM_QOS_FLAGS: curr_value = req->data.flr.flags; break; default: return -EINVAL; } trace_dev_pm_qos_update_request(dev_name(req->dev), req->type, new_value); if (curr_value != new_value) ret = apply_constraint(req, PM_QOS_UPDATE_REQ, new_value); return ret; } /** * dev_pm_qos_update_request - modifies an existing qos request * @req : handle to list element holding a dev_pm_qos request to use * @new_value: defines the qos request * * Updates an existing dev PM qos request along with updating the * target value. * * Attempts are made to make this code callable on hot code paths. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENODEV if the device has been * removed from the system * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_update_request(req, new_value); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_update_request); static int __dev_pm_qos_remove_request(struct dev_pm_qos_request *req) { int ret; if (!req) /*guard against callers passing in null */ return -EINVAL; if (WARN(!dev_pm_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; if (IS_ERR_OR_NULL(req->dev->power.qos)) return -ENODEV; trace_dev_pm_qos_remove_request(dev_name(req->dev), req->type, PM_QOS_DEFAULT_VALUE); ret = apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); return ret; } /** * dev_pm_qos_remove_request - modifies an existing qos request * @req: handle to request list element * * Will remove pm qos request from the list of constraints and * recompute the current target value. Call this on slow code paths. 
* * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENODEV if the device has been * removed from the system * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_remove_request(struct dev_pm_qos_request *req) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_remove_request(req); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_remove_request); /** * dev_pm_qos_add_notifier - sets notification entry for changes to target value * of per-device PM QoS constraints * * @dev: target device for the constraint * @notifier: notifier block managed by caller. * @type: request type. * * Will register the notifier into a notification chain that gets called * upon changes to the target value for the device. * * If the device's constraints object doesn't exist when this routine is called, * it will be created (or error code will be returned if that fails). */ int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { int ret = 0; mutex_lock(&dev_pm_qos_mtx); if (IS_ERR(dev->power.qos)) ret = -ENODEV; else if (!dev->power.qos) ret = dev_pm_qos_constraints_allocate(dev); if (ret) goto unlock; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = blocking_notifier_chain_register(dev->power.qos->resume_latency.notifiers, notifier); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = freq_qos_add_notifier(&dev->power.qos->freq, FREQ_QOS_MIN, notifier); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_add_notifier(&dev->power.qos->freq, FREQ_QOS_MAX, notifier); break; default: WARN_ON(1); ret = -EINVAL; } unlock: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_notifier); /** * dev_pm_qos_remove_notifier - deletes notification for changes to target value * of per-device PM QoS constraints * * @dev: target device for the constraint * @notifier: notifier block to be removed. * @type: request type. * * Will remove the notifier from the notification chain that gets called * upon changes to the target value. */ int dev_pm_qos_remove_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { int ret = 0; mutex_lock(&dev_pm_qos_mtx); /* Silently return if the constraints object is not present. */ if (IS_ERR_OR_NULL(dev->power.qos)) goto unlock; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = blocking_notifier_chain_unregister(dev->power.qos->resume_latency.notifiers, notifier); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = freq_qos_remove_notifier(&dev->power.qos->freq, FREQ_QOS_MIN, notifier); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_remove_notifier(&dev->power.qos->freq, FREQ_QOS_MAX, notifier); break; default: WARN_ON(1); ret = -EINVAL; } unlock: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_remove_notifier); /** * dev_pm_qos_add_ancestor_request - Add PM QoS request for device's ancestor. * @dev: Device whose ancestor to add the request for. * @req: Pointer to the preallocated handle. * @type: Type of the request. * @value: Constraint latency value. 
*/ int dev_pm_qos_add_ancestor_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { struct device *ancestor = dev->parent; int ret = -ENODEV; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: while (ancestor && !ancestor->power.ignore_children) ancestor = ancestor->parent; break; case DEV_PM_QOS_LATENCY_TOLERANCE: while (ancestor && !ancestor->power.set_latency_tolerance) ancestor = ancestor->parent; break; default: ancestor = NULL; } if (ancestor) ret = dev_pm_qos_add_request(ancestor, req, type, value); if (ret < 0) req->dev = NULL; return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_ancestor_request); static void __dev_pm_qos_drop_user_request(struct device *dev, enum dev_pm_qos_req_type type) { struct dev_pm_qos_request *req = NULL; switch(type) { case DEV_PM_QOS_RESUME_LATENCY: req = dev->power.qos->resume_latency_req; dev->power.qos->resume_latency_req = NULL; break; case DEV_PM_QOS_LATENCY_TOLERANCE: req = dev->power.qos->latency_tolerance_req; dev->power.qos->latency_tolerance_req = NULL; break; case DEV_PM_QOS_FLAGS: req = dev->power.qos->flags_req; dev->power.qos->flags_req = NULL; break; default: WARN_ON(1); return; } __dev_pm_qos_remove_request(req); kfree(req); } static void dev_pm_qos_drop_user_request(struct device *dev, enum dev_pm_qos_req_type type) { mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_drop_user_request(dev, type); mutex_unlock(&dev_pm_qos_mtx); } /** * dev_pm_qos_expose_latency_limit - Expose PM QoS latency limit to user space. * @dev: Device whose PM QoS latency limit is to be exposed to user space. * @value: Initial value of the latency limit. */ int dev_pm_qos_expose_latency_limit(struct device *dev, s32 value) { struct dev_pm_qos_request *req; int ret; if (!device_is_registered(dev) || value < 0) return -EINVAL; req = kzalloc_obj(*req); if (!req) return -ENOMEM; ret = dev_pm_qos_add_request(dev, req, DEV_PM_QOS_RESUME_LATENCY, value); if (ret < 0) { kfree(req); return ret; } mutex_lock(&dev_pm_qos_sysfs_mtx); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos)) ret = -ENODEV; else if (dev->power.qos->resume_latency_req) ret = -EEXIST; if (ret < 0) { __dev_pm_qos_remove_request(req); kfree(req); mutex_unlock(&dev_pm_qos_mtx); goto out; } dev->power.qos->resume_latency_req = req; mutex_unlock(&dev_pm_qos_mtx); ret = pm_qos_sysfs_add_resume_latency(dev); if (ret) dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_RESUME_LATENCY); out: mutex_unlock(&dev_pm_qos_sysfs_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_latency_limit); static void __dev_pm_qos_hide_latency_limit(struct device *dev) { if (!IS_ERR_OR_NULL(dev->power.qos) && dev->power.qos->resume_latency_req) __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_RESUME_LATENCY); } /** * dev_pm_qos_hide_latency_limit - Hide PM QoS latency limit from user space. * @dev: Device whose PM QoS latency limit is to be hidden from user space. */ void dev_pm_qos_hide_latency_limit(struct device *dev) { mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_resume_latency(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_latency_limit(dev); mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_latency_limit); /** * dev_pm_qos_expose_flags - Expose PM QoS flags of a device to user space. * @dev: Device whose PM QoS flags are to be exposed to user space. * @val: Initial values of the flags. 
*/ int dev_pm_qos_expose_flags(struct device *dev, s32 val) { struct dev_pm_qos_request *req; int ret; if (!device_is_registered(dev)) return -EINVAL; req = kzalloc_obj(*req); if (!req) return -ENOMEM; ret = dev_pm_qos_add_request(dev, req, DEV_PM_QOS_FLAGS, val); if (ret < 0) { kfree(req); return ret; } pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_sysfs_mtx); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos)) ret = -ENODEV; else if (dev->power.qos->flags_req) ret = -EEXIST; if (ret < 0) { __dev_pm_qos_remove_request(req); kfree(req); mutex_unlock(&dev_pm_qos_mtx); goto out; } dev->power.qos->flags_req = req; mutex_unlock(&dev_pm_qos_mtx); ret = pm_qos_sysfs_add_flags(dev); if (ret) dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_FLAGS); out: mutex_unlock(&dev_pm_qos_sysfs_mtx); pm_runtime_put(dev); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_flags); static void __dev_pm_qos_hide_flags(struct device *dev) { if (!IS_ERR_OR_NULL(dev->power.qos) && dev->power.qos->flags_req) __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_FLAGS); } /** * dev_pm_qos_hide_flags - Hide PM QoS flags of a device from user space. * @dev: Device whose PM QoS flags are to be hidden from user space. */ void dev_pm_qos_hide_flags(struct device *dev) { pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_flags(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_flags(dev); mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); pm_runtime_put(dev); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_flags); /** * dev_pm_qos_update_flags - Update PM QoS flags request owned by user space. * @dev: Device to update the PM QoS flags request for. * @mask: Flags to set/clear. * @set: Whether to set or clear the flags (true means set). */ int dev_pm_qos_update_flags(struct device *dev, s32 mask, bool set) { s32 value; int ret; pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->flags_req) { ret = -EINVAL; goto out; } value = dev_pm_qos_requested_flags(dev); if (set) value |= mask; else value &= ~mask; ret = __dev_pm_qos_update_request(dev->power.qos->flags_req, value); out: mutex_unlock(&dev_pm_qos_mtx); pm_runtime_put(dev); return ret; } /** * dev_pm_qos_get_user_latency_tolerance - Get user space latency tolerance. * @dev: Device to obtain the user space latency tolerance for. */ s32 dev_pm_qos_get_user_latency_tolerance(struct device *dev) { s32 ret; mutex_lock(&dev_pm_qos_mtx); ret = IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->latency_tolerance_req ? PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT : dev->power.qos->latency_tolerance_req->data.pnode.prio; mutex_unlock(&dev_pm_qos_mtx); return ret; } /** * dev_pm_qos_update_user_latency_tolerance - Update user space latency tolerance. * @dev: Device to update the user space latency tolerance for. * @val: New user space latency tolerance for @dev (negative values disable). 
*/ int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val) { int ret; mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->latency_tolerance_req) { struct dev_pm_qos_request *req; if (val < 0) { if (val == PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT) ret = 0; else ret = -EINVAL; goto out; } req = kzalloc_obj(*req); if (!req) { ret = -ENOMEM; goto out; } ret = __dev_pm_qos_add_request(dev, req, DEV_PM_QOS_LATENCY_TOLERANCE, val); if (ret < 0) { kfree(req); goto out; } dev->power.qos->latency_tolerance_req = req; } else { if (val < 0) { __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_LATENCY_TOLERANCE); ret = 0; } else { ret = __dev_pm_qos_update_request(dev->power.qos->latency_tolerance_req, val); } } out: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_update_user_latency_tolerance); /** * dev_pm_qos_expose_latency_tolerance - Expose latency tolerance to userspace * @dev: Device whose latency tolerance to expose */ int dev_pm_qos_expose_latency_tolerance(struct device *dev) { int ret; if (!dev->power.set_latency_tolerance) return -EINVAL; mutex_lock(&dev_pm_qos_sysfs_mtx); ret = pm_qos_sysfs_add_latency_tolerance(dev); mutex_unlock(&dev_pm_qos_sysfs_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_latency_tolerance); /** * dev_pm_qos_hide_latency_tolerance - Hide latency tolerance from userspace * @dev: Device whose latency tolerance to hide */ void dev_pm_qos_hide_latency_tolerance(struct device *dev) { mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_latency_tolerance(dev); mutex_unlock(&dev_pm_qos_sysfs_mtx); /* Remove the request from user space now */ pm_runtime_get_sync(dev); dev_pm_qos_update_user_latency_tolerance(dev, PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT); pm_runtime_put(dev); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_latency_tolerance); |
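A sketch of a typical consumer, assuming a made-up driver ("mydrv") and an arbitrary 100 us budget; dev_pm_qos_add_request(), dev_pm_qos_update_request(), dev_pm_qos_remove_request() and the DEV_PM_QOS_RESUME_LATENCY request type are the interfaces implemented above. The request struct must stay allocated for as long as the request is registered.

#include <linux/device.h>
#include <linux/pm_qos.h>

struct mydrv_data {
	struct dev_pm_qos_request latency_req;
};

static int mydrv_start_streaming(struct device *dev, struct mydrv_data *data)
{
	int ret;

	/* Keep the device's resume latency under 100 us while streaming.
	 * Returns 1 or 0 on success (aggregate constraint changed or not),
	 * or a negative errno. */
	ret = dev_pm_qos_add_request(dev, &data->latency_req,
				     DEV_PM_QOS_RESUME_LATENCY, 100);
	return ret < 0 ? ret : 0;
}

static void mydrv_relax(struct mydrv_data *data)
{
	/* Loosen the constraint without dropping the request. */
	dev_pm_qos_update_request(&data->latency_req, 1000);
}

static void mydrv_stop_streaming(struct mydrv_data *data)
{
	dev_pm_qos_remove_request(&data->latency_req);
}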
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_VMSTAT_H #define _LINUX_VMSTAT_H #include <linux/types.h> #include <linux/percpu.h> #include <linux/mmzone.h> #include <linux/vm_event_item.h> #include <linux/atomic.h> #include <linux/static_key.h> #include <linux/mmdebug.h> #ifdef CONFIG_NUMA DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key); #endif struct reclaim_stat { unsigned nr_dirty; unsigned nr_unqueued_dirty; unsigned nr_congested; unsigned nr_writeback; unsigned nr_immediate; unsigned nr_pageout; unsigned nr_activate[ANON_AND_FILE]; unsigned nr_ref_keep; unsigned nr_unmap_fail; unsigned nr_lazyfree_fail; unsigned nr_demoted; }; /* Stat data for system wide items */ enum vm_stat_item { NR_DIRTY_THRESHOLD, NR_DIRTY_BG_THRESHOLD, NR_MEMMAP_PAGES, /* page metadata allocated through buddy allocator */ NR_MEMMAP_BOOT_PAGES, /* page metadata allocated through boot allocator */ NR_VM_STAT_ITEMS, }; #ifdef CONFIG_VM_EVENT_COUNTERS /* * Light weight per cpu counter implementation. * * Counters should only be incremented and no critical kernel component * should rely on the counter values. * * Counters are handled completely inline. On many platforms the code * generated will simply be the increment of a global address.
*/ struct vm_event_state { unsigned long event[NR_VM_EVENT_ITEMS]; }; DECLARE_PER_CPU(struct vm_event_state, vm_event_states); /* * vm counters are allowed to be racy. Use raw_cpu_ops to avoid the * local_irq_disable overhead. */ static inline void __count_vm_event(enum vm_event_item item) { raw_cpu_inc(vm_event_states.event[item]); } static inline void count_vm_event(enum vm_event_item item) { this_cpu_inc(vm_event_states.event[item]); } static inline void __count_vm_events(enum vm_event_item item, long delta) { raw_cpu_add(vm_event_states.event[item], delta); } static inline void count_vm_events(enum vm_event_item item, long delta) { this_cpu_add(vm_event_states.event[item], delta); } extern void all_vm_events(unsigned long *); extern void vm_events_fold_cpu(int cpu); #else /* Disable counters */ static inline void count_vm_event(enum vm_event_item item) { } static inline void count_vm_events(enum vm_event_item item, long delta) { } static inline void __count_vm_event(enum vm_event_item item) { } static inline void __count_vm_events(enum vm_event_item item, long delta) { } static inline void all_vm_events(unsigned long *ret) { } static inline void vm_events_fold_cpu(int cpu) { } #endif /* CONFIG_VM_EVENT_COUNTERS */ #ifdef CONFIG_NUMA_BALANCING #define count_vm_numa_event(x) count_vm_event(x) #define count_vm_numa_events(x, y) count_vm_events(x, y) #else #define count_vm_numa_event(x) do {} while (0) #define count_vm_numa_events(x, y) do { (void)(y); } while (0) #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_DEBUG_TLBFLUSH #define count_vm_tlb_event(x) count_vm_event(x) #define count_vm_tlb_events(x, y) count_vm_events(x, y) #else #define count_vm_tlb_event(x) do {} while (0) #define count_vm_tlb_events(x, y) do { (void)(y); } while (0) #endif #ifdef CONFIG_PER_VMA_LOCK_STATS #define count_vm_vma_lock_event(x) count_vm_event(x) #else #define count_vm_vma_lock_event(x) do {} while (0) #endif #define __count_zid_vm_events(item, zid, delta) \ __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta) /* * Zone and node-based page accounting with per cpu differentials. 
*/ extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS]; extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS]; extern atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; #ifdef CONFIG_NUMA static inline void zone_numa_event_add(long x, struct zone *zone, enum numa_stat_item item) { atomic_long_add(x, &zone->vm_numa_event[item]); atomic_long_add(x, &vm_numa_event[item]); } static inline unsigned long zone_numa_event_state(struct zone *zone, enum numa_stat_item item) { return atomic_long_read(&zone->vm_numa_event[item]); } static inline unsigned long global_numa_event_state(enum numa_stat_item item) { return atomic_long_read(&vm_numa_event[item]); } #endif /* CONFIG_NUMA */ static inline void zone_page_state_add(long x, struct zone *zone, enum zone_stat_item item) { atomic_long_add(x, &zone->vm_stat[item]); atomic_long_add(x, &vm_zone_stat[item]); } static inline void node_page_state_add(long x, struct pglist_data *pgdat, enum node_stat_item item) { atomic_long_add(x, &pgdat->vm_stat[item]); atomic_long_add(x, &vm_node_stat[item]); } static inline unsigned long global_zone_page_state(enum zone_stat_item item) { long x = atomic_long_read(&vm_zone_stat[item]); #ifdef CONFIG_SMP if (x < 0) x = 0; #endif return x; } static inline unsigned long global_node_page_state_pages(enum node_stat_item item) { long x = atomic_long_read(&vm_node_stat[item]); #ifdef CONFIG_SMP if (x < 0) x = 0; #endif return x; } static inline unsigned long global_node_page_state(enum node_stat_item item) { VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); return global_node_page_state_pages(item); } static inline unsigned long zone_page_state(struct zone *zone, enum zone_stat_item item) { long x = atomic_long_read(&zone->vm_stat[item]); #ifdef CONFIG_SMP if (x < 0) x = 0; #endif return x; } /* * More accurate version that also considers the currently pending * deltas. For that we need to loop over all cpus to find the current * deltas. There is no synchronization so the result cannot be * exactly accurate either. */ static inline unsigned long zone_page_state_snapshot(struct zone *zone, enum zone_stat_item item) { long x = atomic_long_read(&zone->vm_stat[item]); #ifdef CONFIG_SMP int cpu; for_each_online_cpu(cpu) x += per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_stat_diff[item]; if (x < 0) x = 0; #endif return x; } #ifdef CONFIG_NUMA /* See __count_vm_event comment on why raw_cpu_inc is used. 
*/ static inline void __count_numa_event(struct zone *zone, enum numa_stat_item item) { struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; raw_cpu_inc(pzstats->vm_numa_event[item]); } static inline void __count_numa_events(struct zone *zone, enum numa_stat_item item, long delta) { struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; raw_cpu_add(pzstats->vm_numa_event[item], delta); } extern unsigned long sum_zone_node_page_state(int node, enum zone_stat_item item); extern unsigned long sum_zone_numa_event_state(int node, enum numa_stat_item item); extern unsigned long node_page_state(struct pglist_data *pgdat, enum node_stat_item item); extern unsigned long node_page_state_pages(struct pglist_data *pgdat, enum node_stat_item item); extern void fold_vm_numa_events(void); #else #define sum_zone_node_page_state(node, item) global_zone_page_state(item) #define node_page_state(node, item) global_node_page_state(item) #define node_page_state_pages(node, item) global_node_page_state_pages(item) static inline void fold_vm_numa_events(void) { } #endif /* CONFIG_NUMA */ #ifdef CONFIG_SMP void __mod_zone_page_state(struct zone *, enum zone_stat_item item, long); void __inc_zone_page_state(struct page *, enum zone_stat_item); void __dec_zone_page_state(struct page *, enum zone_stat_item); void __mod_node_page_state(struct pglist_data *, enum node_stat_item item, long); void __inc_node_page_state(struct page *, enum node_stat_item); void __dec_node_page_state(struct page *, enum node_stat_item); void mod_zone_page_state(struct zone *, enum zone_stat_item, long); void inc_zone_page_state(struct page *, enum zone_stat_item); void dec_zone_page_state(struct page *, enum zone_stat_item); void mod_node_page_state(struct pglist_data *, enum node_stat_item, long); void inc_node_page_state(struct page *, enum node_stat_item); void dec_node_page_state(struct page *, enum node_stat_item); extern void __inc_zone_state(struct zone *, enum zone_stat_item); extern void __inc_node_state(struct pglist_data *, enum node_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_node_state(struct pglist_data *, enum node_stat_item); void quiet_vmstat(void); void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *); int calculate_pressure_threshold(struct zone *zone); int calculate_normal_threshold(struct zone *zone); void set_pgdat_percpu_threshold(pg_data_t *pgdat, int (*calculate_pressure)(struct zone *)); void vmstat_flush_workqueue(void); #else /* CONFIG_SMP */ /* * We do not maintain differentials in a single processor configuration. * The functions directly modify the zone and global counters. */ static inline void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, long delta) { zone_page_state_add(delta, zone, item); } static inline void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, int delta) { if (vmstat_item_in_bytes(item)) { /* * Only cgroups use subpage accounting right now; at * the global level, these items still change in * multiples of whole pages. Store them as pages * internally to keep the per-cpu counters compact. 
*/ VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); delta >>= PAGE_SHIFT; } node_page_state_add(delta, pgdat, item); } static inline void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { atomic_long_inc(&zone->vm_stat[item]); atomic_long_inc(&vm_zone_stat[item]); } static inline void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) { atomic_long_inc(&pgdat->vm_stat[item]); atomic_long_inc(&vm_node_stat[item]); } static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item item) { atomic_long_dec(&zone->vm_stat[item]); atomic_long_dec(&vm_zone_stat[item]); } static inline void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) { atomic_long_dec(&pgdat->vm_stat[item]); atomic_long_dec(&vm_node_stat[item]); } static inline void __inc_zone_page_state(struct page *page, enum zone_stat_item item) { __inc_zone_state(page_zone(page), item); } static inline void __inc_node_page_state(struct page *page, enum node_stat_item item) { __inc_node_state(page_pgdat(page), item); } static inline void __dec_zone_page_state(struct page *page, enum zone_stat_item item) { __dec_zone_state(page_zone(page), item); } static inline void __dec_node_page_state(struct page *page, enum node_stat_item item) { __dec_node_state(page_pgdat(page), item); } /* * We only use atomic operations to update counters. So there is no need to * disable interrupts. */ #define inc_zone_page_state __inc_zone_page_state #define dec_zone_page_state __dec_zone_page_state #define mod_zone_page_state __mod_zone_page_state #define inc_node_page_state __inc_node_page_state #define dec_node_page_state __dec_node_page_state #define mod_node_page_state __mod_node_page_state #define set_pgdat_percpu_threshold(pgdat, callback) { } static inline void refresh_zone_stat_thresholds(void) { } static inline void cpu_vm_stats_fold(int cpu) { } static inline void quiet_vmstat(void) { } static inline void vmstat_flush_workqueue(void) { } static inline void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats) { } #endif /* CONFIG_SMP */ static inline void __zone_stat_mod_folio(struct folio *folio, enum zone_stat_item item, long nr) { __mod_zone_page_state(folio_zone(folio), item, nr); } static inline void __zone_stat_add_folio(struct folio *folio, enum zone_stat_item item) { __mod_zone_page_state(folio_zone(folio), item, folio_nr_pages(folio)); } static inline void __zone_stat_sub_folio(struct folio *folio, enum zone_stat_item item) { __mod_zone_page_state(folio_zone(folio), item, -folio_nr_pages(folio)); } static inline void zone_stat_mod_folio(struct folio *folio, enum zone_stat_item item, long nr) { mod_zone_page_state(folio_zone(folio), item, nr); } static inline void zone_stat_add_folio(struct folio *folio, enum zone_stat_item item) { mod_zone_page_state(folio_zone(folio), item, folio_nr_pages(folio)); } static inline void zone_stat_sub_folio(struct folio *folio, enum zone_stat_item item) { mod_zone_page_state(folio_zone(folio), item, -folio_nr_pages(folio)); } static inline void __node_stat_mod_folio(struct folio *folio, enum node_stat_item item, long nr) { __mod_node_page_state(folio_pgdat(folio), item, nr); } static inline void __node_stat_add_folio(struct folio *folio, enum node_stat_item item) { __mod_node_page_state(folio_pgdat(folio), item, folio_nr_pages(folio)); } static inline void __node_stat_sub_folio(struct folio *folio, enum node_stat_item item) { __mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio)); } static inline void 
node_stat_mod_folio(struct folio *folio, enum node_stat_item item, long nr) { mod_node_page_state(folio_pgdat(folio), item, nr); } static inline void node_stat_add_folio(struct folio *folio, enum node_stat_item item) { mod_node_page_state(folio_pgdat(folio), item, folio_nr_pages(folio)); } static inline void node_stat_sub_folio(struct folio *folio, enum node_stat_item item) { mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio)); } extern const char * const vmstat_text[]; static inline const char *zone_stat_name(enum zone_stat_item item) { return vmstat_text[item]; } #ifdef CONFIG_NUMA static inline const char *numa_stat_name(enum numa_stat_item item) { return vmstat_text[NR_VM_ZONE_STAT_ITEMS + item]; } #endif /* CONFIG_NUMA */ static inline const char *node_stat_name(enum node_stat_item item) { return vmstat_text[NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + item]; } static inline const char *lru_list_name(enum lru_list lru) { return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_" } #if defined(CONFIG_VM_EVENT_COUNTERS) static inline const char *vm_event_name(enum vm_event_item item) { return vmstat_text[NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + NR_VM_NODE_STAT_ITEMS + NR_VM_STAT_ITEMS + item]; } #endif /* CONFIG_VM_EVENT_COUNTERS */ #ifdef CONFIG_MEMCG void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val); static inline void mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { lruvec_stat_mod_folio(page_folio(page), idx, val); } #else static inline void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { mod_node_page_state(lruvec_pgdat(lruvec), idx, val); } static inline void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { mod_node_page_state(folio_pgdat(folio), idx, val); } static inline void mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { mod_node_page_state(page_pgdat(page), idx, val); } #endif /* CONFIG_MEMCG */ static inline void lruvec_stat_add_folio(struct folio *folio, enum node_stat_item idx) { lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio)); } static inline void lruvec_stat_sub_folio(struct folio *folio, enum node_stat_item idx) { lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); } void memmap_boot_pages_add(long delta); void memmap_pages_add(long delta); #endif /* _LINUX_VMSTAT_H */ |
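The header above only declares the counters and their accessors. As a hedged illustration (the function name and the choice of NR_FILE_PAGES are invented for this example, not taken from the header), a caller doing per-node accounting for a folio might combine the folio helpers with a global read-back roughly like this:

/*
 * Illustrative sketch only: account a folio against its node, read the
 * (approximate) global total, then undo the accounting.  The helpers
 * without the "__" prefix are safe from any context on SMP; the "__"
 * variants assume the caller already prevents preemption/interrupts.
 */
#include <linux/vmstat.h>
#include <linux/mm.h>
#include <linux/printk.h>

static void example_account_file_folio(struct folio *folio)
{
	/* Adds folio_nr_pages(folio) to the node and global NR_FILE_PAGES. */
	node_stat_add_folio(folio, NR_FILE_PAGES);

	/* Global value may lag per-CPU deltas that have not been folded yet. */
	pr_debug("nr_file_pages=%lu\n", global_node_page_state(NR_FILE_PAGES));

	/* Drop the accounting again when the folio goes away. */
	node_stat_sub_folio(folio, NR_FILE_PAGES);
}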
// SPDX-License-Identifier: GPL-2.0-only /* * Power Management Quality of Service (PM QoS) support base. * * Copyright (C) 2020 Intel Corporation * * Authors: * Mark Gross <mgross@linux.intel.com> * Rafael J. Wysocki <rafael.j.wysocki@intel.com> * * Provided here is an interface for specifying PM QoS dependencies.
It allows * entities depending on QoS constraints to register their requests which are * aggregated as appropriate to produce effective constraints (target values) * that can be monitored by entities needing to respect them, either by polling * or through a built-in notification mechanism. * * In addition to the basic functionality, more specific interfaces for managing * global CPU latency QoS requests and frequency QoS requests are provided. */ /*#define DEBUG*/ #include <linux/pm_qos.h> #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/fs.h> #include <linux/device.h> #include <linux/miscdevice.h> #include <linux/string.h> #include <linux/platform_device.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/uaccess.h> #include <linux/export.h> #include <trace/events/power.h> /* * locking rule: all changes to constraints or notifiers lists * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock * held, taken with _irqsave. One lock to rule them all */ static DEFINE_SPINLOCK(pm_qos_lock); /** * pm_qos_read_value - Return the current effective constraint value. * @c: List of PM QoS constraint requests. */ s32 pm_qos_read_value(struct pm_qos_constraints *c) { return READ_ONCE(c->target_value); } static int pm_qos_get_value(struct pm_qos_constraints *c) { if (plist_head_empty(&c->list)) return c->no_constraint_value; switch (c->type) { case PM_QOS_MIN: return plist_first(&c->list)->prio; case PM_QOS_MAX: return plist_last(&c->list)->prio; default: WARN(1, "Unknown PM QoS type in %s\n", __func__); return PM_QOS_DEFAULT_VALUE; } } static void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) { WRITE_ONCE(c->target_value, value); } /** * pm_qos_update_target - Update a list of PM QoS constraint requests. * @c: List of PM QoS requests. * @node: Target list entry. * @action: Action to carry out (add, update or remove). * @value: New request value for the target list entry. * * Update the given list of PM QoS constraint requests, @c, by carrying an * @action involving the @node list entry and @value on it. * * The recognized values of @action are PM_QOS_ADD_REQ (store @value in @node * and add it to the list), PM_QOS_UPDATE_REQ (remove @node from the list, store * @value in it and add it to the list again), and PM_QOS_REMOVE_REQ (remove * @node from the list, ignore @value). * * Return: 1 if the aggregate constraint value has changed, 0 otherwise. */ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, enum pm_qos_req_action action, int value) { int prev_value, curr_value, new_value; unsigned long flags; spin_lock_irqsave(&pm_qos_lock, flags); prev_value = pm_qos_get_value(c); if (value == PM_QOS_DEFAULT_VALUE) new_value = c->default_value; else new_value = value; switch (action) { case PM_QOS_REMOVE_REQ: plist_del(node, &c->list); break; case PM_QOS_UPDATE_REQ: /* * To change the list, atomically remove, reinit with new value * and add, then see if the aggregate has changed. 
*/ plist_del(node, &c->list); fallthrough; case PM_QOS_ADD_REQ: plist_node_init(node, new_value); plist_add(node, &c->list); break; default: /* no action */ ; } curr_value = pm_qos_get_value(c); pm_qos_set_value(c, curr_value); spin_unlock_irqrestore(&pm_qos_lock, flags); trace_pm_qos_update_target(action, prev_value, curr_value); if (prev_value == curr_value) return 0; if (c->notifiers) blocking_notifier_call_chain(c->notifiers, curr_value, NULL); return 1; } /** * pm_qos_flags_remove_req - Remove device PM QoS flags request. * @pqf: Device PM QoS flags set to remove the request from. * @req: Request to remove from the set. */ static void pm_qos_flags_remove_req(struct pm_qos_flags *pqf, struct pm_qos_flags_request *req) { s32 val = 0; list_del(&req->node); list_for_each_entry(req, &pqf->list, node) val |= req->flags; pqf->effective_flags = val; } /** * pm_qos_update_flags - Update a set of PM QoS flags. * @pqf: Set of PM QoS flags to update. * @req: Request to add to the set, to modify, or to remove from the set. * @action: Action to take on the set. * @val: Value of the request to add or modify. * * Return: 1 if the aggregate constraint value has changed, 0 otherwise. */ bool pm_qos_update_flags(struct pm_qos_flags *pqf, struct pm_qos_flags_request *req, enum pm_qos_req_action action, s32 val) { unsigned long irqflags; s32 prev_value, curr_value; spin_lock_irqsave(&pm_qos_lock, irqflags); prev_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags; switch (action) { case PM_QOS_REMOVE_REQ: pm_qos_flags_remove_req(pqf, req); break; case PM_QOS_UPDATE_REQ: pm_qos_flags_remove_req(pqf, req); fallthrough; case PM_QOS_ADD_REQ: req->flags = val; INIT_LIST_HEAD(&req->node); list_add_tail(&req->node, &pqf->list); pqf->effective_flags |= val; break; default: /* no action */ ; } curr_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags; spin_unlock_irqrestore(&pm_qos_lock, irqflags); trace_pm_qos_update_flags(action, prev_value, curr_value); return prev_value != curr_value; } #ifdef CONFIG_CPU_IDLE /* Definitions related to the CPU latency QoS. */ static struct pm_qos_constraints cpu_latency_constraints = { .list = PLIST_HEAD_INIT(cpu_latency_constraints.list), .target_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE, .default_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE, .no_constraint_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE, .type = PM_QOS_MIN, }; static inline bool cpu_latency_qos_value_invalid(s32 value) { return value < 0 && value != PM_QOS_DEFAULT_VALUE; } /** * cpu_latency_qos_limit - Return current system-wide CPU latency QoS limit. */ s32 cpu_latency_qos_limit(void) { return pm_qos_read_value(&cpu_latency_constraints); } /** * cpu_latency_qos_request_active - Check the given PM QoS request. * @req: PM QoS request to check. * * Return: 'true' if @req has been added to the CPU latency QoS list, 'false' * otherwise. */ bool cpu_latency_qos_request_active(struct pm_qos_request *req) { return req->qos == &cpu_latency_constraints; } EXPORT_SYMBOL_GPL(cpu_latency_qos_request_active); static void cpu_latency_qos_apply(struct pm_qos_request *req, enum pm_qos_req_action action, s32 value) { int ret = pm_qos_update_target(req->qos, &req->node, action, value); if (ret > 0) wake_up_all_idle_cpus(); } /** * cpu_latency_qos_add_request - Add new CPU latency QoS request. * @req: Pointer to a preallocated handle. * @value: Requested constraint value. 
* * Use @value to initialize the request handle pointed to by @req, insert it as * a new entry to the CPU latency QoS list and recompute the effective QoS * constraint for that list. * * Callers need to save the handle for later use in updates and removal of the * QoS request represented by it. */ void cpu_latency_qos_add_request(struct pm_qos_request *req, s32 value) { if (!req || cpu_latency_qos_value_invalid(value)) return; if (cpu_latency_qos_request_active(req)) { WARN(1, KERN_ERR "%s called for already added request\n", __func__); return; } trace_pm_qos_add_request(value); req->qos = &cpu_latency_constraints; cpu_latency_qos_apply(req, PM_QOS_ADD_REQ, value); } EXPORT_SYMBOL_GPL(cpu_latency_qos_add_request); /** * cpu_latency_qos_update_request - Modify existing CPU latency QoS request. * @req : QoS request to update. * @new_value: New requested constraint value. * * Use @new_value to update the QoS request represented by @req in the CPU * latency QoS list along with updating the effective constraint value for that * list. */ void cpu_latency_qos_update_request(struct pm_qos_request *req, s32 new_value) { if (!req || cpu_latency_qos_value_invalid(new_value)) return; if (!cpu_latency_qos_request_active(req)) { WARN(1, KERN_ERR "%s called for unknown object\n", __func__); return; } trace_pm_qos_update_request(new_value); if (new_value == req->node.prio) return; cpu_latency_qos_apply(req, PM_QOS_UPDATE_REQ, new_value); } EXPORT_SYMBOL_GPL(cpu_latency_qos_update_request); /** * cpu_latency_qos_remove_request - Remove existing CPU latency QoS request. * @req: QoS request to remove. * * Remove the CPU latency QoS request represented by @req from the CPU latency * QoS list along with updating the effective constraint value for that list. */ void cpu_latency_qos_remove_request(struct pm_qos_request *req) { if (!req) return; if (!cpu_latency_qos_request_active(req)) { WARN(1, KERN_ERR "%s called for unknown object\n", __func__); return; } trace_pm_qos_remove_request(PM_QOS_DEFAULT_VALUE); cpu_latency_qos_apply(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } EXPORT_SYMBOL_GPL(cpu_latency_qos_remove_request); /* User space interface to the CPU latency QoS via misc device. 
*/ static int cpu_latency_qos_open(struct inode *inode, struct file *filp) { struct pm_qos_request *req; req = kzalloc_obj(*req); if (!req) return -ENOMEM; cpu_latency_qos_add_request(req, PM_QOS_DEFAULT_VALUE); filp->private_data = req; return 0; } static int cpu_latency_qos_release(struct inode *inode, struct file *filp) { struct pm_qos_request *req = filp->private_data; filp->private_data = NULL; cpu_latency_qos_remove_request(req); kfree(req); return 0; } static ssize_t cpu_latency_qos_read(struct file *filp, char __user *buf, size_t count, loff_t *f_pos) { struct pm_qos_request *req = filp->private_data; unsigned long flags; s32 value; if (!req || !cpu_latency_qos_request_active(req)) return -EINVAL; spin_lock_irqsave(&pm_qos_lock, flags); value = pm_qos_get_value(&cpu_latency_constraints); spin_unlock_irqrestore(&pm_qos_lock, flags); return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); } static ssize_t cpu_latency_qos_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos) { s32 value; if (count == sizeof(s32)) { if (copy_from_user(&value, buf, sizeof(s32))) return -EFAULT; } else { int ret; ret = kstrtos32_from_user(buf, count, 16, &value); if (ret) return ret; } cpu_latency_qos_update_request(filp->private_data, value); return count; } static const struct file_operations cpu_latency_qos_fops = { .write = cpu_latency_qos_write, .read = cpu_latency_qos_read, .open = cpu_latency_qos_open, .release = cpu_latency_qos_release, .llseek = noop_llseek, }; static struct miscdevice cpu_latency_qos_miscdev = { .minor = MISC_DYNAMIC_MINOR, .name = "cpu_dma_latency", .fops = &cpu_latency_qos_fops, }; #ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP /* The CPU system wakeup latency QoS. */ static struct pm_qos_constraints cpu_wakeup_latency_constraints = { .list = PLIST_HEAD_INIT(cpu_wakeup_latency_constraints.list), .target_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, .default_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, .no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, .type = PM_QOS_MIN, }; /** * cpu_wakeup_latency_qos_limit - Current CPU system wakeup latency QoS limit. * * Returns the current CPU system wakeup latency QoS limit that may have been * requested by user space. 
*/ s32 cpu_wakeup_latency_qos_limit(void) { return pm_qos_read_value(&cpu_wakeup_latency_constraints); } static int cpu_wakeup_latency_qos_open(struct inode *inode, struct file *filp) { struct pm_qos_request *req; req = kzalloc_obj(*req); if (!req) return -ENOMEM; req->qos = &cpu_wakeup_latency_constraints; pm_qos_update_target(req->qos, &req->node, PM_QOS_ADD_REQ, PM_QOS_RESUME_LATENCY_NO_CONSTRAINT); filp->private_data = req; return 0; } static int cpu_wakeup_latency_qos_release(struct inode *inode, struct file *filp) { struct pm_qos_request *req = filp->private_data; filp->private_data = NULL; pm_qos_update_target(req->qos, &req->node, PM_QOS_REMOVE_REQ, PM_QOS_RESUME_LATENCY_NO_CONSTRAINT); kfree(req); return 0; } static ssize_t cpu_wakeup_latency_qos_read(struct file *filp, char __user *buf, size_t count, loff_t *f_pos) { s32 value = pm_qos_read_value(&cpu_wakeup_latency_constraints); return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); } static ssize_t cpu_wakeup_latency_qos_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos) { struct pm_qos_request *req = filp->private_data; s32 value; if (count == sizeof(s32)) { if (copy_from_user(&value, buf, sizeof(s32))) return -EFAULT; } else { int ret; ret = kstrtos32_from_user(buf, count, 16, &value); if (ret) return ret; } if (value < 0) return -EINVAL; pm_qos_update_target(req->qos, &req->node, PM_QOS_UPDATE_REQ, value); return count; } static const struct file_operations cpu_wakeup_latency_qos_fops = { .open = cpu_wakeup_latency_qos_open, .release = cpu_wakeup_latency_qos_release, .read = cpu_wakeup_latency_qos_read, .write = cpu_wakeup_latency_qos_write, .llseek = noop_llseek, }; static struct miscdevice cpu_wakeup_latency_qos_miscdev = { .minor = MISC_DYNAMIC_MINOR, .name = "cpu_wakeup_latency", .fops = &cpu_wakeup_latency_qos_fops, }; #endif /* CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP */ static int __init cpu_latency_qos_init(void) { int ret; ret = misc_register(&cpu_latency_qos_miscdev); if (ret < 0) pr_err("%s: %s setup failed\n", __func__, cpu_latency_qos_miscdev.name); #ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP ret = misc_register(&cpu_wakeup_latency_qos_miscdev); if (ret < 0) pr_err("%s: %s setup failed\n", __func__, cpu_wakeup_latency_qos_miscdev.name); #endif return ret; } late_initcall(cpu_latency_qos_init); #endif /* CONFIG_CPU_IDLE */ /* Definitions related to the frequency QoS below. */ static inline bool freq_qos_value_invalid(s32 value) { return value < 0 && value != PM_QOS_DEFAULT_VALUE; } /** * freq_constraints_init - Initialize frequency QoS constraints. * @qos: Frequency QoS constraints to initialize. */ void freq_constraints_init(struct freq_constraints *qos) { struct pm_qos_constraints *c; c = &qos->min_freq; plist_head_init(&c->list); c->target_value = FREQ_QOS_MIN_DEFAULT_VALUE; c->default_value = FREQ_QOS_MIN_DEFAULT_VALUE; c->no_constraint_value = FREQ_QOS_MIN_DEFAULT_VALUE; c->type = PM_QOS_MAX; c->notifiers = &qos->min_freq_notifiers; BLOCKING_INIT_NOTIFIER_HEAD(c->notifiers); c = &qos->max_freq; plist_head_init(&c->list); c->target_value = FREQ_QOS_MAX_DEFAULT_VALUE; c->default_value = FREQ_QOS_MAX_DEFAULT_VALUE; c->no_constraint_value = FREQ_QOS_MAX_DEFAULT_VALUE; c->type = PM_QOS_MIN; c->notifiers = &qos->max_freq_notifiers; BLOCKING_INIT_NOTIFIER_HEAD(c->notifiers); } /** * freq_qos_read_value - Get frequency QoS constraint for a given list. * @qos: Constraints to evaluate. * @type: QoS request type. 
*/ s32 freq_qos_read_value(struct freq_constraints *qos, enum freq_qos_req_type type) { s32 ret; switch (type) { case FREQ_QOS_MIN: ret = IS_ERR_OR_NULL(qos) ? FREQ_QOS_MIN_DEFAULT_VALUE : pm_qos_read_value(&qos->min_freq); break; case FREQ_QOS_MAX: ret = IS_ERR_OR_NULL(qos) ? FREQ_QOS_MAX_DEFAULT_VALUE : pm_qos_read_value(&qos->max_freq); break; default: WARN_ON(1); ret = 0; } return ret; } /** * freq_qos_apply - Add/modify/remove frequency QoS request. * @req: Constraint request to apply. * @action: Action to perform (add/update/remove). * @value: Value to assign to the QoS request. * * This is only meant to be called from inside pm_qos, not drivers. */ int freq_qos_apply(struct freq_qos_request *req, enum pm_qos_req_action action, s32 value) { int ret; switch(req->type) { case FREQ_QOS_MIN: ret = pm_qos_update_target(&req->qos->min_freq, &req->pnode, action, value); break; case FREQ_QOS_MAX: ret = pm_qos_update_target(&req->qos->max_freq, &req->pnode, action, value); break; default: ret = -EINVAL; } return ret; } /** * freq_qos_add_request - Insert new frequency QoS request into a given list. * @qos: Constraints to update. * @req: Preallocated request object. * @type: Request type. * @value: Request value. * * Insert a new entry into the @qos list of requests, recompute the effective * QoS constraint value for that list and initialize the @req object. The * caller needs to save that object for later use in updates and removal. * * Return 1 if the effective constraint value has changed, 0 if the effective * constraint value has not changed, or a negative error code on failures. */ int freq_qos_add_request(struct freq_constraints *qos, struct freq_qos_request *req, enum freq_qos_req_type type, s32 value) { int ret; if (IS_ERR_OR_NULL(qos) || !req || freq_qos_value_invalid(value)) return -EINVAL; if (WARN(freq_qos_request_active(req), "%s() called for active request\n", __func__)) return -EINVAL; req->qos = qos; req->type = type; ret = freq_qos_apply(req, PM_QOS_ADD_REQ, value); if (ret < 0) { req->qos = NULL; req->type = 0; } return ret; } EXPORT_SYMBOL_GPL(freq_qos_add_request); /** * freq_qos_update_request - Modify existing frequency QoS request. * @req: Request to modify. * @new_value: New request value. * * Update an existing frequency QoS request along with the effective constraint * value for the list of requests it belongs to. * * Return 1 if the effective constraint value has changed, 0 if the effective * constraint value has not changed, or a negative error code on failures. */ int freq_qos_update_request(struct freq_qos_request *req, s32 new_value) { if (!req || freq_qos_value_invalid(new_value)) return -EINVAL; if (WARN(!freq_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; if (req->pnode.prio == new_value) return 0; return freq_qos_apply(req, PM_QOS_UPDATE_REQ, new_value); } EXPORT_SYMBOL_GPL(freq_qos_update_request); /** * freq_qos_remove_request - Remove frequency QoS request from its list. * @req: Request to remove. * * Remove the given frequency QoS request from the list of constraints it * belongs to and recompute the effective constraint value for that list. * * Return 1 if the effective constraint value has changed, 0 if the effective * constraint value has not changed, or a negative error code on failures. 
*/ int freq_qos_remove_request(struct freq_qos_request *req) { int ret; if (!req) return -EINVAL; if (WARN(!freq_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; ret = freq_qos_apply(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); req->qos = NULL; req->type = 0; return ret; } EXPORT_SYMBOL_GPL(freq_qos_remove_request); /** * freq_qos_add_notifier - Add frequency QoS change notifier. * @qos: List of requests to add the notifier to. * @type: Request type. * @notifier: Notifier block to add. */ int freq_qos_add_notifier(struct freq_constraints *qos, enum freq_qos_req_type type, struct notifier_block *notifier) { int ret; if (IS_ERR_OR_NULL(qos) || !notifier) return -EINVAL; switch (type) { case FREQ_QOS_MIN: ret = blocking_notifier_chain_register(qos->min_freq.notifiers, notifier); break; case FREQ_QOS_MAX: ret = blocking_notifier_chain_register(qos->max_freq.notifiers, notifier); break; default: WARN_ON(1); ret = -EINVAL; } return ret; } EXPORT_SYMBOL_GPL(freq_qos_add_notifier); /** * freq_qos_remove_notifier - Remove frequency QoS change notifier. * @qos: List of requests to remove the notifier from. * @type: Request type. * @notifier: Notifier block to remove. */ int freq_qos_remove_notifier(struct freq_constraints *qos, enum freq_qos_req_type type, struct notifier_block *notifier) { int ret; if (IS_ERR_OR_NULL(qos) || !notifier) return -EINVAL; switch (type) { case FREQ_QOS_MIN: ret = blocking_notifier_chain_unregister(qos->min_freq.notifiers, notifier); break; case FREQ_QOS_MAX: ret = blocking_notifier_chain_unregister(qos->max_freq.notifiers, notifier); break; default: WARN_ON(1); ret = -EINVAL; } return ret; } EXPORT_SYMBOL_GPL(freq_qos_remove_notifier); |
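The file above only implements the request plumbing, so here is a hedged, driver-side sketch of the CPU latency QoS interface it exports (the function and variable names are invented for illustration, and the 20 usec bound is an arbitrary example value):

/*
 * Illustrative sketch only: bound CPU exit latency to 20 usec while a
 * latency-sensitive burst is running, then drop the constraint.  The
 * request object must stay allocated for as long as it is registered.
 */
#include <linux/pm_qos.h>

static struct pm_qos_request example_latency_req;

static void example_burst_start(void)
{
	/* Inserts the request and recomputes the PM_QOS_MIN aggregate. */
	cpu_latency_qos_add_request(&example_latency_req, 20);
}

static void example_burst_end(void)
{
	/* Removes the request; deeper idle states become eligible again. */
	cpu_latency_qos_remove_request(&example_latency_req);
}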
// SPDX-License-Identifier: GPL-2.0-only /* * fs/fs-writeback.c * * Copyright (C) 2002, Linus Torvalds.
* * Contains all the functions related to writing back and waiting * upon dirty inodes against superblocks, and writing back dirty * pages against inodes. ie: data writeback. Writeout of the * inode itself is not handled here. * * 10Apr2002 Andrew Morton * Split out of fs/inode.c * Additions for address_space-based writeback */ #include <linux/sched/sysctl.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/kthread.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/tracepoint.h> #include <linux/device.h> #include <linux/memcontrol.h> #include "internal.h" /* * Passed into wb_writeback(), essentially a subset of writeback_control */ struct wb_writeback_work { long nr_pages; struct super_block *sb; enum writeback_sync_modes sync_mode; unsigned int tagged_writepages:1; unsigned int for_kupdate:1; unsigned int range_cyclic:1; unsigned int for_background:1; unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ unsigned int auto_free:1; /* free on completion */ enum wb_reason reason; /* why was writeback initiated? */ struct list_head list; /* pending work list */ struct wb_completion *done; /* set if the caller waits */ }; /* * If an inode is constantly having its pages dirtied, but then the * updates stop dirtytime_expire_interval seconds in the past, it's * possible for the worst case time between when an inode has its * timestamps updated and when they finally get written out to be two * dirtytime_expire_intervals. We set the default to 12 hours (in * seconds), which means most of the time inodes will have their * timestamps written to disk after 12 hours, but in the worst case a * few inodes might not their timestamps updated for 24 hours. */ static unsigned int dirtytime_expire_interval = 12 * 60 * 60; static inline struct inode *wb_inode(struct list_head *head) { return list_entry(head, struct inode, i_io_list); } /* * Include the creation of the trace points after defining the * wb_writeback_work structure and inline functions so that the definition * remains local to this file. */ #define CREATE_TRACE_POINTS #include <trace/events/writeback.h> EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage); static bool wb_io_lists_populated(struct bdi_writeback *wb) { if (wb_has_dirty_io(wb)) { return false; } else { set_bit(WB_has_dirty_io, &wb->state); WARN_ON_ONCE(!wb->avg_write_bandwidth); atomic_long_add(wb->avg_write_bandwidth, &wb->bdi->tot_write_bandwidth); return true; } } static void wb_io_lists_depopulated(struct bdi_writeback *wb) { if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) && list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) { clear_bit(WB_has_dirty_io, &wb->state); WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth, &wb->bdi->tot_write_bandwidth) < 0); } } /** * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list * @inode: inode to be moved * @wb: target bdi_writeback * @head: one of @wb->b_{dirty|io|more_io|dirty_time} * * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io. * Returns %true if @inode is the first occupant of the !dirty_time IO * lists; otherwise, %false. 
*/ static bool inode_io_list_move_locked(struct inode *inode, struct bdi_writeback *wb, struct list_head *head) { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); WARN_ON_ONCE(inode_state_read(inode) & I_FREEING); list_move(&inode->i_io_list, head); /* dirty_time doesn't count as dirty_io until expiration */ if (head != &wb->b_dirty_time) return wb_io_lists_populated(wb); wb_io_lists_depopulated(wb); return false; } static void wb_wakeup(struct bdi_writeback *wb) { spin_lock_irq(&wb->work_lock); if (test_bit(WB_registered, &wb->state)) mod_delayed_work(bdi_wq, &wb->dwork, 0); spin_unlock_irq(&wb->work_lock); } /* * This function is used when the first inode for this wb is marked dirty. It * wakes-up the corresponding bdi thread which should then take care of the * periodic background write-out of dirty inodes. Since the write-out would * starts only 'dirty_writeback_interval' centisecs from now anyway, we just * set up a timer which wakes the bdi thread up later. * * Note, we wouldn't bother setting up the timer, but this function is on the * fast-path (used by '__mark_inode_dirty()'), so we save few context switches * by delaying the wake-up. * * We have to be careful not to postpone flush work if it is scheduled for * earlier. Thus we use queue_delayed_work(). */ static void wb_wakeup_delayed(struct bdi_writeback *wb) { unsigned long timeout; timeout = msecs_to_jiffies(dirty_writeback_interval * 10); spin_lock_irq(&wb->work_lock); if (test_bit(WB_registered, &wb->state)) queue_delayed_work(bdi_wq, &wb->dwork, timeout); spin_unlock_irq(&wb->work_lock); } static void finish_writeback_work(struct wb_writeback_work *work) { struct wb_completion *done = work->done; if (work->auto_free) kfree(work); if (done) { wait_queue_head_t *waitq = done->waitq; /* @done can't be accessed after the following dec */ if (atomic_dec_and_test(&done->cnt)) wake_up_all(waitq); } } static void wb_queue_work(struct bdi_writeback *wb, struct wb_writeback_work *work) { trace_writeback_queue(wb, work); if (work->done) atomic_inc(&work->done->cnt); spin_lock_irq(&wb->work_lock); if (test_bit(WB_registered, &wb->state)) { list_add_tail(&work->list, &wb->work_list); mod_delayed_work(bdi_wq, &wb->dwork, 0); } else finish_writeback_work(work); spin_unlock_irq(&wb->work_lock); } static bool wb_wait_for_completion_cb(struct wb_completion *done) { unsigned long timeout = sysctl_hung_task_timeout_secs; unsigned long waited_secs = (jiffies - done->wait_start) / HZ; done->progress_stamp = jiffies; if (timeout && (waited_secs > timeout)) pr_info("INFO: The task %s:%d has been waiting for writeback " "completion for more than %lu seconds.", current->comm, current->pid, waited_secs); return !atomic_read(&done->cnt); } /** * wb_wait_for_completion - wait for completion of bdi_writeback_works * @done: target wb_completion * * Wait for one or more work items issued to @bdi with their ->done field * set to @done, which should have been initialized with * DEFINE_WB_COMPLETION(). This function returns after all such work items * are completed. Work items which are waited upon aren't freed * automatically on completion. */ void wb_wait_for_completion(struct wb_completion *done) { done->wait_start = jiffies; atomic_dec(&done->cnt); /* put down the initial count */ wait_event(*done->waitq, wb_wait_for_completion_cb(done)); } #ifdef CONFIG_CGROUP_WRITEBACK /* * Parameters for foreign inode detection, see wbc_detach_inode() to see * how they're used. 
* * These paramters are inherently heuristical as the detection target * itself is fuzzy. All we want to do is detaching an inode from the * current owner if it's being written to by some other cgroups too much. * * The current cgroup writeback is built on the assumption that multiple * cgroups writing to the same inode concurrently is very rare and a mode * of operation which isn't well supported. As such, the goal is not * taking too long when a different cgroup takes over an inode while * avoiding too aggressive flip-flops from occasional foreign writes. * * We record, very roughly, 2s worth of IO time history and if more than * half of that is foreign, trigger the switch. The recording is quantized * to 16 slots. To avoid tiny writes from swinging the decision too much, * writes smaller than 1/8 of avg size are ignored. */ #define WB_FRN_TIME_SHIFT 13 /* 1s = 2^13, upto 8 secs w/ 16bit */ #define WB_FRN_TIME_AVG_SHIFT 3 /* avg = avg * 7/8 + new * 1/8 */ #define WB_FRN_TIME_CUT_DIV 8 /* ignore rounds < avg / 8 */ #define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT)) /* 2s */ #define WB_FRN_HIST_SLOTS 16 /* inode->i_wb_frn_history is 16bit */ #define WB_FRN_HIST_UNIT (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS) /* each slot's duration is 2s / 16 */ #define WB_FRN_HIST_THR_SLOTS (WB_FRN_HIST_SLOTS / 2) /* if foreign slots >= 8, switch */ #define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1) /* one round can affect upto 5 slots */ #define WB_FRN_MAX_IN_FLIGHT 1024 /* don't queue too many concurrently */ /* * Maximum inodes per isw. A specific value has been chosen to make * struct inode_switch_wbs_context fit into 1024 bytes kmalloc. */ #define WB_MAX_INODES_PER_ISW ((1024UL - sizeof(struct inode_switch_wbs_context)) \ / sizeof(struct inode *)) static atomic_t isw_nr_in_flight = ATOMIC_INIT(0); static struct workqueue_struct *isw_wq; void __inode_attach_wb(struct inode *inode, struct folio *folio) { struct backing_dev_info *bdi = inode_to_bdi(inode); struct bdi_writeback *wb = NULL; if (inode_cgwb_enabled(inode)) { struct cgroup_subsys_state *memcg_css; /* must pin memcg_css, see wb_get_create() */ if (folio) memcg_css = get_mem_cgroup_css_from_folio(folio); else memcg_css = task_get_css(current, memory_cgrp_id); wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); css_put(memcg_css); } if (!wb) wb = &bdi->wb; /* * There may be multiple instances of this function racing to * update the same inode. Use cmpxchg() to tell the winner. */ if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) wb_put(wb); } /** * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list * @inode: inode of interest with i_lock held * @wb: target bdi_writeback * * Remove the inode from wb's io lists and if necessarily put onto b_attached * list. Only inodes attached to cgwb's are kept on this list. */ static void inode_cgwb_move_to_attached(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); WARN_ON_ONCE(inode_state_read(inode) & I_FREEING); inode_state_clear(inode, I_SYNC_QUEUED); if (wb != &wb->bdi->wb) list_move(&inode->i_io_list, &wb->b_attached); else list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); } /** * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it * @inode: inode of interest with i_lock held * * Returns @inode's wb with its list_lock held. @inode->i_lock must be * held on entry and is released on return. 
The returned wb is guaranteed * to stay @inode's associated wb until its list_lock is released. */ static struct bdi_writeback * locked_inode_to_wb_and_lock_list(struct inode *inode) __releases(&inode->i_lock) __acquires(&wb->list_lock) { while (true) { struct bdi_writeback *wb = inode_to_wb(inode); /* * inode_to_wb() association is protected by both * @inode->i_lock and @wb->list_lock but list_lock nests * outside i_lock. Drop i_lock and verify that the * association hasn't changed after acquiring list_lock. */ wb_get(wb); spin_unlock(&inode->i_lock); spin_lock(&wb->list_lock); /* i_wb may have changed inbetween, can't use inode_to_wb() */ if (likely(wb == inode->i_wb)) { wb_put(wb); /* @inode already has ref */ return wb; } spin_unlock(&wb->list_lock); wb_put(wb); cpu_relax(); spin_lock(&inode->i_lock); } } /** * inode_to_wb_and_lock_list - determine an inode's wb and lock it * @inode: inode of interest * * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held * on entry. */ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) __acquires(&wb->list_lock) { spin_lock(&inode->i_lock); return locked_inode_to_wb_and_lock_list(inode); } struct inode_switch_wbs_context { /* List of queued switching contexts for the wb */ struct llist_node list; /* * Multiple inodes can be switched at once. The switching procedure * consists of two parts, separated by a RCU grace period. To make * sure that the second part is executed for each inode gone through * the first part, all inode pointers are placed into a NULL-terminated * array embedded into struct inode_switch_wbs_context. Otherwise * an inode could be left in a non-consistent state. */ struct inode *inodes[]; }; static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { down_write(&bdi->wb_switch_rwsem); } static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { up_write(&bdi->wb_switch_rwsem); } static bool inode_do_switch_wbs(struct inode *inode, struct bdi_writeback *old_wb, struct bdi_writeback *new_wb) { struct address_space *mapping = inode->i_mapping; XA_STATE(xas, &mapping->i_pages, 0); struct folio *folio; bool switched = false; spin_lock(&inode->i_lock); xa_lock_irq(&mapping->i_pages); /* * Once I_FREEING or I_WILL_FREE are visible under i_lock, the eviction * path owns the inode and we shouldn't modify ->i_io_list. */ if (unlikely(inode_state_read(inode) & (I_FREEING | I_WILL_FREE))) goto skip_switch; trace_inode_switch_wbs(inode, old_wb, new_wb); /* * Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points * to possibly dirty folios while PAGECACHE_TAG_WRITEBACK points to * folios actually under writeback. */ xas_for_each_marked(&xas, folio, ULONG_MAX, PAGECACHE_TAG_DIRTY) { if (folio_test_dirty(folio)) { long nr = folio_nr_pages(folio); wb_stat_mod(old_wb, WB_RECLAIMABLE, -nr); wb_stat_mod(new_wb, WB_RECLAIMABLE, nr); } } xas_set(&xas, 0); xas_for_each_marked(&xas, folio, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) { long nr = folio_nr_pages(folio); WARN_ON_ONCE(!folio_test_writeback(folio)); wb_stat_mod(old_wb, WB_WRITEBACK, -nr); wb_stat_mod(new_wb, WB_WRITEBACK, nr); } if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { atomic_dec(&old_wb->writeback_inodes); atomic_inc(&new_wb->writeback_inodes); } wb_get(new_wb); /* * Transfer to @new_wb's IO list if necessary. If the @inode is dirty, * the specific list @inode was on is ignored and the @inode is put on * ->b_dirty which is always correct including from ->b_dirty_time. 
* If the @inode was clean, it means it was on the b_attached list, so * move it onto the b_attached list of @new_wb. */ if (!list_empty(&inode->i_io_list)) { inode->i_wb = new_wb; if (inode_state_read(inode) & I_DIRTY_ALL) { /* * We need to keep b_dirty list sorted by * dirtied_time_when. However properly sorting the * inode in the list gets too expensive when switching * many inodes. So just attach inode at the end of the * dirty list and clobber the dirtied_time_when. */ inode->dirtied_time_when = jiffies; inode_io_list_move_locked(inode, new_wb, &new_wb->b_dirty); } else { inode_cgwb_move_to_attached(inode, new_wb); } } else { inode->i_wb = new_wb; } /* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */ inode->i_wb_frn_winner = 0; inode->i_wb_frn_avg_time = 0; inode->i_wb_frn_history = 0; switched = true; skip_switch: /* * Paired with an acquire fence in unlocked_inode_to_wb_begin() and * ensures that the new wb is visible if they see !I_WB_SWITCH. */ smp_wmb(); inode_state_clear(inode, I_WB_SWITCH); xa_unlock_irq(&mapping->i_pages); spin_unlock(&inode->i_lock); return switched; } static void process_inode_switch_wbs(struct bdi_writeback *new_wb, struct inode_switch_wbs_context *isw) { struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]); struct bdi_writeback *old_wb = isw->inodes[0]->i_wb; unsigned long nr_switched = 0; struct inode **inodep; /* * If @inode switches cgwb membership while sync_inodes_sb() is * being issued, sync_inodes_sb() might miss it. Synchronize. */ down_read(&bdi->wb_switch_rwsem); inodep = isw->inodes; /* * By the time control reaches here, RCU grace period has passed * since I_WB_SWITCH assertion and all wb stat update transactions * between unlocked_inode_to_wb_begin/end() are guaranteed to be * synchronizing against the i_pages lock. * * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock * gives us exclusion against all wb related operations on @inode * including IO list manipulations and stat updates. */ relock: if (old_wb < new_wb) { spin_lock(&old_wb->list_lock); spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); } else { spin_lock(&new_wb->list_lock); spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); } while (*inodep) { WARN_ON_ONCE((*inodep)->i_wb != old_wb); if (inode_do_switch_wbs(*inodep, old_wb, new_wb)) nr_switched++; inodep++; if (*inodep && need_resched()) { spin_unlock(&new_wb->list_lock); spin_unlock(&old_wb->list_lock); cond_resched(); goto relock; } } spin_unlock(&new_wb->list_lock); spin_unlock(&old_wb->list_lock); up_read(&bdi->wb_switch_rwsem); if (nr_switched) { wb_wakeup(new_wb); wb_put_many(old_wb, nr_switched); } for (inodep = isw->inodes; *inodep; inodep++) iput(*inodep); wb_put(new_wb); kfree(isw); atomic_dec(&isw_nr_in_flight); } void inode_switch_wbs_work_fn(struct work_struct *work) { struct bdi_writeback *new_wb = container_of(work, struct bdi_writeback, switch_work); struct inode_switch_wbs_context *isw, *next_isw; struct llist_node *list; /* * Grab out reference to wb so that it cannot get freed under us * after we process all the isw items. */ wb_get(new_wb); while (1) { list = llist_del_all(&new_wb->switch_wbs_ctxs); /* Nothing to do? */ if (!list) break; /* * In addition to synchronizing among switchers, I_WB_SWITCH * tells the RCU protected stat update paths to grab the i_page * lock so that stat transfer can synchronize against them. * Let's continue after I_WB_SWITCH is guaranteed to be * visible. 
*/ synchronize_rcu(); llist_for_each_entry_safe(isw, next_isw, list, list) process_inode_switch_wbs(new_wb, isw); } wb_put(new_wb); } static bool inode_prepare_wbs_switch(struct inode *inode, struct bdi_writeback *new_wb) { /* * Paired with smp_mb() in cgroup_writeback_umount(). * isw_nr_in_flight must be increased before checking SB_ACTIVE and * grabbing an inode, otherwise isw_nr_in_flight can be observed as 0 * in cgroup_writeback_umount() and the isw_wq will be not flushed. */ smp_mb(); if (IS_DAX(inode)) return false; /* while holding I_WB_SWITCH, no one else can update the association */ spin_lock(&inode->i_lock); if (!(inode->i_sb->s_flags & SB_ACTIVE) || inode_state_read(inode) & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) || inode_to_wb(inode) == new_wb) { spin_unlock(&inode->i_lock); return false; } inode_state_set(inode, I_WB_SWITCH); __iget(inode); spin_unlock(&inode->i_lock); return true; } static void wb_queue_isw(struct bdi_writeback *wb, struct inode_switch_wbs_context *isw) { if (llist_add(&isw->list, &wb->switch_wbs_ctxs)) queue_work(isw_wq, &wb->switch_work); } /** * inode_switch_wbs - change the wb association of an inode * @inode: target inode * @new_wb_id: ID of the new wb * * Switch @inode's wb association to the wb identified by @new_wb_id. The * switching is performed asynchronously and may fail silently. */ static void inode_switch_wbs(struct inode *inode, int new_wb_id) { struct backing_dev_info *bdi = inode_to_bdi(inode); struct cgroup_subsys_state *memcg_css; struct inode_switch_wbs_context *isw; struct bdi_writeback *new_wb = NULL; /* noop if seems to be already in progress */ if (inode_state_read_once(inode) & I_WB_SWITCH) return; /* avoid queueing a new switch if too many are already in flight */ if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT) return; isw = kzalloc_flex(*isw, inodes, 2, GFP_ATOMIC); if (!isw) return; atomic_inc(&isw_nr_in_flight); /* find and pin the new wb */ rcu_read_lock(); memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys); if (memcg_css && !css_tryget(memcg_css)) memcg_css = NULL; rcu_read_unlock(); if (!memcg_css) goto out_free; new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); css_put(memcg_css); if (!new_wb) goto out_free; if (!inode_prepare_wbs_switch(inode, new_wb)) goto out_free; isw->inodes[0] = inode; trace_inode_switch_wbs_queue(inode->i_wb, new_wb, 1); wb_queue_isw(new_wb, isw); return; out_free: atomic_dec(&isw_nr_in_flight); if (new_wb) wb_put(new_wb); kfree(isw); } static bool isw_prepare_wbs_switch(struct bdi_writeback *new_wb, struct inode_switch_wbs_context *isw, struct list_head *list, int *nr) { struct inode *inode; list_for_each_entry(inode, list, i_io_list) { if (!inode_prepare_wbs_switch(inode, new_wb)) continue; isw->inodes[*nr] = inode; (*nr)++; if (*nr >= WB_MAX_INODES_PER_ISW - 1) return true; } return false; } /** * cleanup_offline_cgwb - detach associated inodes * @wb: target wb * * Switch all inodes attached to @wb to a nearest living ancestor's wb in order * to eventually release the dying @wb. Returns %true if not all inodes were * switched and the function has to be restarted. 
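 *
 * A minimal usage sketch (the actual caller lives outside this file and is
 * only assumed here): because each pass switches at most
 * WB_MAX_INODES_PER_ISW - 1 inodes, the release path can simply retry until
 * everything has been moved, e.g.
 *
 *	while (cleanup_offline_cgwb(wb))
 *		cond_resched();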
 */
bool cleanup_offline_cgwb(struct bdi_writeback *wb)
{
	struct cgroup_subsys_state *memcg_css;
	struct inode_switch_wbs_context *isw;
	struct bdi_writeback *new_wb;
	int nr;
	bool restart = false;

	isw = kzalloc_flex(*isw, inodes, WB_MAX_INODES_PER_ISW, GFP_KERNEL);
	if (!isw)
		return restart;

	atomic_inc(&isw_nr_in_flight);

	for (memcg_css = wb->memcg_css->parent; memcg_css;
	     memcg_css = memcg_css->parent) {
		new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
		if (new_wb)
			break;
	}
	if (unlikely(!new_wb))
		new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */

	nr = 0;
	spin_lock(&wb->list_lock);
	/*
	 * In addition to the inodes that have completed writeback, also switch
	 * cgwbs for those inodes only with dirty timestamps. Otherwise, those
	 * inodes won't be written back for a long time when lazytime is
	 * enabled, and thus pinning the dying cgwbs. It won't break the
	 * bandwidth restrictions, as writeback of inode metadata is not
	 * accounted for.
	 */
	restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_attached, &nr);
	if (!restart)
		restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_dirty_time, &nr);
	spin_unlock(&wb->list_lock);

	/* no attached inodes? bail out */
	if (nr == 0) {
		atomic_dec(&isw_nr_in_flight);
		wb_put(new_wb);
		kfree(isw);
		return restart;
	}

	trace_inode_switch_wbs_queue(wb, new_wb, nr);
	wb_queue_isw(new_wb, isw);

	return restart;
}

/**
 * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
 * @wbc: writeback_control of interest
 * @inode: target inode
 *
 * @inode is locked and about to be written back under the control of @wbc.
 * Record @inode's writeback context into @wbc and unlock the i_lock. On
 * writeback completion, wbc_detach_inode() should be called. This is used
 * to track the cgroup writeback context.
 */
static void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
					struct inode *inode)
	__releases(&inode->i_lock)
{
	if (!inode_cgwb_enabled(inode)) {
		spin_unlock(&inode->i_lock);
		return;
	}

	wbc->wb = inode_to_wb(inode);
	wbc->inode = inode;

	wbc->wb_id = wbc->wb->memcg_css->id;
	wbc->wb_lcand_id = inode->i_wb_frn_winner;
	wbc->wb_tcand_id = 0;
	wbc->wb_bytes = 0;
	wbc->wb_lcand_bytes = 0;
	wbc->wb_tcand_bytes = 0;

	wb_get(wbc->wb);
	spin_unlock(&inode->i_lock);

	/*
	 * A dying wb indicates that either the blkcg associated with the
	 * memcg changed or the associated memcg is dying. In the first
	 * case, a replacement wb should already be available and we should
	 * refresh the wb immediately. In the second case, trying to
	 * refresh will keep failing.
	 */
	if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css)))
		inode_switch_wbs(inode, wbc->wb_id);
}

/**
 * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite
 * @wbc: writeback_control of interest
 * @inode: target inode
 *
 * This function is to be used by filemap_writeback(), which is an alternative
 * entry point into writeback code, and first ensures @inode is associated with
 * a bdi_writeback and attaches it to @wbc.
 */
void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
				 struct inode *inode)
{
	spin_lock(&inode->i_lock);
	inode_attach_wb(inode, NULL);
	wbc_attach_and_unlock_inode(wbc, inode);
}
EXPORT_SYMBOL_GPL(wbc_attach_fdatawrite_inode);

/**
 * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
 * @wbc: writeback_control of the just finished writeback
 *
 * To be called after a writeback attempt of an inode finishes and undoes
 * wbc_attach_and_unlock_inode(). Can be called under any context.
* * As concurrent write sharing of an inode is expected to be very rare and * memcg only tracks page ownership on first-use basis severely confining * the usefulness of such sharing, cgroup writeback tracks ownership * per-inode. While the support for concurrent write sharing of an inode * is deemed unnecessary, an inode being written to by different cgroups at * different points in time is a lot more common, and, more importantly, * charging only by first-use can too readily lead to grossly incorrect * behaviors (single foreign page can lead to gigabytes of writeback to be * incorrectly attributed). * * To resolve this issue, cgroup writeback detects the majority dirtier of * an inode and transfers the ownership to it. To avoid unnecessary * oscillation, the detection mechanism keeps track of history and gives * out the switch verdict only if the foreign usage pattern is stable over * a certain amount of time and/or writeback attempts. * * On each writeback attempt, @wbc tries to detect the majority writer * using Boyer-Moore majority vote algorithm. In addition to the byte * count from the majority voting, it also counts the bytes written for the * current wb and the last round's winner wb (max of last round's current * wb, the winner from two rounds ago, and the last round's majority * candidate). Keeping track of the historical winner helps the algorithm * to semi-reliably detect the most active writer even when it's not the * absolute majority. * * Once the winner of the round is determined, whether the winner is * foreign or not and how much IO time the round consumed is recorded in * inode->i_wb_frn_history. If the amount of recorded foreign IO time is * over a certain threshold, the switch verdict is given. */ void wbc_detach_inode(struct writeback_control *wbc) { struct bdi_writeback *wb = wbc->wb; struct inode *inode = wbc->inode; unsigned long avg_time, max_bytes, max_time; u16 history; int max_id; if (!wb) return; history = inode->i_wb_frn_history; avg_time = inode->i_wb_frn_avg_time; /* pick the winner of this round */ if (wbc->wb_bytes >= wbc->wb_lcand_bytes && wbc->wb_bytes >= wbc->wb_tcand_bytes) { max_id = wbc->wb_id; max_bytes = wbc->wb_bytes; } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) { max_id = wbc->wb_lcand_id; max_bytes = wbc->wb_lcand_bytes; } else { max_id = wbc->wb_tcand_id; max_bytes = wbc->wb_tcand_bytes; } /* * Calculate the amount of IO time the winner consumed and fold it * into the running average kept per inode. If the consumed IO * time is lower than avag / WB_FRN_TIME_CUT_DIV, ignore it for * deciding whether to switch or not. This is to prevent one-off * small dirtiers from skewing the verdict. */ max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT, wb->avg_write_bandwidth); if (avg_time) avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) - (avg_time >> WB_FRN_TIME_AVG_SHIFT); else avg_time = max_time; /* immediate catch up on first run */ if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) { int slots; /* * The switch verdict is reached if foreign wb's consume * more than a certain proportion of IO time in a * WB_FRN_TIME_PERIOD. This is loosely tracked by 16 slot * history mask where each bit represents one sixteenth of * the period. Determine the number of slots to shift into * history from @max_time. 
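		 *
		 * Illustrative example (hypothetical numbers): if the
		 * winner's IO time amounts to roughly 3/16th of
		 * WB_FRN_TIME_PERIOD, slots evaluates to 3; the history is
		 * shifted left by three and, if a foreign wb won this round,
		 * the three vacated bits are set. Once more than
		 * WB_FRN_HIST_THR_SLOTS of the 16 slots are marked foreign,
		 * the switch below is triggered.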
		 */
		slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
			    (unsigned long)WB_FRN_HIST_MAX_SLOTS);
		history <<= slots;
		if (wbc->wb_id != max_id)
			history |= (1U << slots) - 1;

		if (history)
			trace_inode_foreign_history(inode, wbc, history);

		/*
		 * Switch if the current wb isn't the consistent winner.
		 * If there are multiple closely competing dirtiers, the
		 * inode may switch across them repeatedly over time, which
		 * is okay. The main goal is avoiding keeping an inode on
		 * the wrong wb for an extended period of time.
		 */
		if (hweight16(history) > WB_FRN_HIST_THR_SLOTS)
			inode_switch_wbs(inode, max_id);
	}

	/*
	 * Multiple instances of this function may race to update the
	 * following fields but we don't mind occasional inaccuracies.
	 */
	inode->i_wb_frn_winner = max_id;
	inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
	inode->i_wb_frn_history = history;

	wb_put(wbc->wb);
	wbc->wb = NULL;
}
EXPORT_SYMBOL_GPL(wbc_detach_inode);

/**
 * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
 * @wbc: writeback_control of the writeback in progress
 * @folio: folio being written out
 * @bytes: number of bytes being written out
 *
 * @bytes from @folio are about to be written out during the writeback
 * controlled by @wbc. Keep the books for foreign inode detection. See
 * wbc_detach_inode().
 */
void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio,
			      size_t bytes)
{
	struct cgroup_subsys_state *css;
	int id;

	/*
	 * pageout() path doesn't attach @wbc to the inode being written
	 * out. This is intentional as we don't want the function to block
	 * behind a slow cgroup. Ultimately, we want pageout() to kick off
	 * regular writeback instead of writing things out itself.
	 */
	if (!wbc->wb || wbc->no_cgroup_owner)
		return;

	css = get_mem_cgroup_css_from_folio(folio);
	/* dead cgroups shouldn't contribute to inode ownership arbitration */
	if (!css_is_online(css))
		goto out;

	id = css->id;

	if (id == wbc->wb_id) {
		wbc->wb_bytes += bytes;
		goto out;
	}

	if (id == wbc->wb_lcand_id)
		wbc->wb_lcand_bytes += bytes;

	/* Boyer-Moore majority vote algorithm */
	if (!wbc->wb_tcand_bytes)
		wbc->wb_tcand_id = id;
	if (id == wbc->wb_tcand_id)
		wbc->wb_tcand_bytes += bytes;
	else
		wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
out:
	css_put(css);
}
EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);

/**
 * wb_split_bdi_pages - split nr_pages to write according to bandwidth
 * @wb: target bdi_writeback to split @nr_pages to
 * @nr_pages: number of pages to write for the whole bdi
 *
 * Split @wb's portion of @nr_pages according to @wb's write bandwidth in
 * relation to the total write bandwidth of all wb's w/ dirty inodes on
 * @wb->bdi.
 */
static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
{
	unsigned long this_bw = wb->avg_write_bandwidth;
	unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);

	if (nr_pages == LONG_MAX)
		return LONG_MAX;

	/*
	 * This may be called on clean wb's and proportional distribution
	 * may not make sense, just use the original @nr_pages in those
	 * cases. In general, we want to err on the side of writing more.
	 */
	if (!tot_bw || this_bw >= tot_bw)
		return nr_pages;
	else
		return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
}

/**
 * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
 * @bdi: target backing_dev_info
 * @base_work: wb_writeback_work to issue
 * @skip_if_busy: skip wb's which already have writeback in progress
 *
 * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
 * have dirty inodes.
If @base_work->nr_page isn't %LONG_MAX, it's * distributed to the busy wbs according to each wb's proportion in the * total active write bandwidth of @bdi. */ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, struct wb_writeback_work *base_work, bool skip_if_busy) { struct bdi_writeback *last_wb = NULL; struct bdi_writeback *wb = list_entry(&bdi->wb_list, struct bdi_writeback, bdi_node); might_sleep(); restart: rcu_read_lock(); list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) { DEFINE_WB_COMPLETION(fallback_work_done, bdi); struct wb_writeback_work fallback_work; struct wb_writeback_work *work; long nr_pages; if (last_wb) { wb_put(last_wb); last_wb = NULL; } /* SYNC_ALL writes out I_DIRTY_TIME too */ if (!wb_has_dirty_io(wb) && (base_work->sync_mode == WB_SYNC_NONE || list_empty(&wb->b_dirty_time))) continue; if (skip_if_busy && writeback_in_progress(wb)) continue; nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages); work = kmalloc_obj(*work, GFP_ATOMIC); if (work) { *work = *base_work; work->nr_pages = nr_pages; work->auto_free = 1; wb_queue_work(wb, work); continue; } /* * If wb_tryget fails, the wb has been shutdown, skip it. * * Pin @wb so that it stays on @bdi->wb_list. This allows * continuing iteration from @wb after dropping and * regrabbing rcu read lock. */ if (!wb_tryget(wb)) continue; /* alloc failed, execute synchronously using on-stack fallback */ work = &fallback_work; *work = *base_work; work->nr_pages = nr_pages; work->auto_free = 0; work->done = &fallback_work_done; wb_queue_work(wb, work); last_wb = wb; rcu_read_unlock(); wb_wait_for_completion(&fallback_work_done); goto restart; } rcu_read_unlock(); if (last_wb) wb_put(last_wb); } /** * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs * @bdi_id: target bdi id * @memcg_id: target memcg css id * @reason: reason why some writeback work initiated * @done: target wb_completion * * Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id * with the specified parameters. */ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, enum wb_reason reason, struct wb_completion *done) { struct backing_dev_info *bdi; struct cgroup_subsys_state *memcg_css; struct bdi_writeback *wb; struct wb_writeback_work *work; unsigned long dirty; int ret; /* lookup bdi and memcg */ bdi = bdi_get_by_id(bdi_id); if (!bdi) return -ENOENT; rcu_read_lock(); memcg_css = css_from_id(memcg_id, &memory_cgrp_subsys); if (memcg_css && !css_tryget(memcg_css)) memcg_css = NULL; rcu_read_unlock(); if (!memcg_css) { ret = -ENOENT; goto out_bdi_put; } /* * And find the associated wb. If the wb isn't there already * there's nothing to flush, don't create one. */ wb = wb_get_lookup(bdi, memcg_css); if (!wb) { ret = -ENOENT; goto out_css_put; } /* * The caller is attempting to write out most of * the currently dirty pages. Let's take the current dirty page * count and inflate it by 25% which should be large enough to * flush out most dirty pages while avoiding getting livelocked by * concurrent dirtiers. * * BTW the memcg stats are flushed periodically and this is best-effort * estimation, so some potential error is ok. 
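	 *
	 * For example (numbers purely illustrative): if the memcg currently
	 * reports 8,000 dirty file pages, the work below is sized to
	 * 8000 * 10 / 8 = 10,000 pages, i.e. the 25% head room described
	 * above.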
*/ dirty = memcg_page_state(mem_cgroup_from_css(memcg_css), NR_FILE_DIRTY); dirty = dirty * 10 / 8; /* issue the writeback work */ work = kzalloc_obj(*work, GFP_NOWAIT); if (work) { work->nr_pages = dirty; work->sync_mode = WB_SYNC_NONE; work->range_cyclic = 1; work->reason = reason; work->done = done; work->auto_free = 1; wb_queue_work(wb, work); ret = 0; } else { ret = -ENOMEM; } wb_put(wb); out_css_put: css_put(memcg_css); out_bdi_put: bdi_put(bdi); return ret; } /** * cgroup_writeback_umount - flush inode wb switches for umount * @sb: target super_block * * This function is called when a super_block is about to be destroyed and * flushes in-flight inode wb switches. An inode wb switch goes through * RCU and then workqueue, so the two need to be flushed in order to ensure * that all previously scheduled switches are finished. As wb switches are * rare occurrences and synchronize_rcu() can take a while, perform * flushing iff wb switches are in flight. */ void cgroup_writeback_umount(struct super_block *sb) { if (!(sb->s_bdi->capabilities & BDI_CAP_WRITEBACK)) return; /* * SB_ACTIVE should be reliably cleared before checking * isw_nr_in_flight, see generic_shutdown_super(). */ smp_mb(); if (atomic_read(&isw_nr_in_flight)) { /* * Use rcu_barrier() to wait for all pending callbacks to * ensure that all in-flight wb switches are in the workqueue. */ rcu_barrier(); flush_workqueue(isw_wq); } } static int __init cgroup_writeback_init(void) { isw_wq = alloc_workqueue("inode_switch_wbs", WQ_PERCPU, 0); if (!isw_wq) return -ENOMEM; return 0; } fs_initcall(cgroup_writeback_init); #else /* CONFIG_CGROUP_WRITEBACK */ static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } static void inode_cgwb_move_to_attached(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); WARN_ON_ONCE(inode_state_read(inode) & I_FREEING); inode_state_clear(inode, I_SYNC_QUEUED); list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); } static struct bdi_writeback * locked_inode_to_wb_and_lock_list(struct inode *inode) __releases(&inode->i_lock) __acquires(&wb->list_lock) { struct bdi_writeback *wb = inode_to_wb(inode); spin_unlock(&inode->i_lock); spin_lock(&wb->list_lock); return wb; } static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) __acquires(&wb->list_lock) { struct bdi_writeback *wb = inode_to_wb(inode); spin_lock(&wb->list_lock); return wb; } static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) { return nr_pages; } static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, struct wb_writeback_work *base_work, bool skip_if_busy) { might_sleep(); if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) { base_work->auto_free = 0; wb_queue_work(&bdi->wb, base_work); } } static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc, struct inode *inode) __releases(&inode->i_lock) { spin_unlock(&inode->i_lock); } #endif /* CONFIG_CGROUP_WRITEBACK */ /* * Add in the number of potentially dirty inodes, because each inode * write can dirty pagecache in the underlying blockdev. */ static unsigned long get_nr_dirty_pages(void) { return global_node_page_state(NR_FILE_DIRTY) + get_nr_dirty_inodes(); } static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason) { if (!wb_has_dirty_io(wb)) return; /* * All callers of this function want to start writeback of all * dirty pages. 
Places like vmscan can call this at a very * high frequency, causing pointless allocations of tons of * work items and keeping the flusher threads busy retrieving * that work. Ensure that we only allow one of them pending and * inflight at the time. */ if (test_bit(WB_start_all, &wb->state) || test_and_set_bit(WB_start_all, &wb->state)) return; wb->start_all_reason = reason; wb_wakeup(wb); } /** * wb_start_background_writeback - start background writeback * @wb: bdi_writback to write from * * Description: * This makes sure WB_SYNC_NONE background writeback happens. When * this function returns, it is only guaranteed that for given wb * some IO is happening if we are over background dirty threshold. * Caller need not hold sb s_umount semaphore. */ void wb_start_background_writeback(struct bdi_writeback *wb) { /* * We just wake up the flusher thread. It will perform background * writeback as soon as there is no other work to do. */ trace_writeback_wake_background(wb); wb_wakeup(wb); } /* * Remove the inode from the writeback list it is on. */ void inode_io_list_del(struct inode *inode) { struct bdi_writeback *wb; /* * FIXME: ext4 can call here from ext4_evict_inode() after evict() already * unlinked the inode. */ if (list_empty_careful(&inode->i_io_list)) return; wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); inode_state_clear(inode, I_SYNC_QUEUED); list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); spin_unlock(&inode->i_lock); spin_unlock(&wb->list_lock); } EXPORT_SYMBOL(inode_io_list_del); /* * mark an inode as under writeback on the sb */ void sb_mark_inode_writeback(struct inode *inode) { struct super_block *sb = inode->i_sb; unsigned long flags; if (list_empty(&inode->i_wb_list)) { spin_lock_irqsave(&sb->s_inode_wblist_lock, flags); if (list_empty(&inode->i_wb_list)) { list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb); trace_sb_mark_inode_writeback(inode); } spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags); } } /* * clear an inode as under writeback on the sb */ void sb_clear_inode_writeback(struct inode *inode) { struct super_block *sb = inode->i_sb; unsigned long flags; if (!list_empty(&inode->i_wb_list)) { spin_lock_irqsave(&sb->s_inode_wblist_lock, flags); if (!list_empty(&inode->i_wb_list)) { list_del_init(&inode->i_wb_list); trace_sb_clear_inode_writeback(inode); } spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags); } } /* * Redirty an inode: set its when-it-was dirtied timestamp and move it to the * furthest end of its superblock's dirty-inode list. * * Before stamping the inode's ->dirtied_when, we check to see whether it is * already the most-recently-dirtied inode on the b_dirty list. If that is * the case then the inode must have been redirtied while it was being written * out and we don't reset its dirtied_when. */ static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&inode->i_lock); inode_state_clear(inode, I_SYNC_QUEUED); /* * When the inode is being freed just don't bother with dirty list * tracking. Flush worker will ignore this inode anyway and it will * trigger assertions in inode_io_list_move_locked(). 
*/ if (inode_state_read(inode) & I_FREEING) { list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); return; } if (!list_empty(&wb->b_dirty)) { struct inode *tail; tail = wb_inode(wb->b_dirty.next); if (time_before(inode->dirtied_when, tail->dirtied_when)) inode->dirtied_when = jiffies; } inode_io_list_move_locked(inode, wb, &wb->b_dirty); } static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) { spin_lock(&inode->i_lock); redirty_tail_locked(inode, wb); spin_unlock(&inode->i_lock); } /* * requeue inode for re-scanning after bdi->b_io list is exhausted. */ static void requeue_io(struct inode *inode, struct bdi_writeback *wb) { inode_io_list_move_locked(inode, wb, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) { assert_spin_locked(&inode->i_lock); inode_state_clear(inode, I_SYNC); /* If inode is clean an unused, put it into LRU now... */ inode_lru_list_add(inode); /* Called with inode->i_lock which ensures memory ordering. */ inode_wake_up_bit(inode, __I_SYNC); } static bool inode_dirtied_after(struct inode *inode, unsigned long t) { bool ret = time_after(inode->dirtied_when, t); #ifndef CONFIG_64BIT /* * For inodes being constantly redirtied, dirtied_when can get stuck. * It _appears_ to be in the future, but is actually in distant past. * This test is necessary to prevent such wrapped-around relative times * from permanently stopping the whole bdi writeback. */ ret = ret && time_before_eq(inode->dirtied_when, jiffies); #endif return ret; } /* * Move expired (dirtied before dirtied_before) dirty inodes from * @delaying_queue to @dispatch_queue. */ static int move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, unsigned long dirtied_before) { LIST_HEAD(tmp); struct list_head *pos, *node; struct super_block *sb = NULL; struct inode *inode; int do_sb_sort = 0; int moved = 0; while (!list_empty(delaying_queue)) { inode = wb_inode(delaying_queue->prev); if (inode_dirtied_after(inode, dirtied_before)) break; spin_lock(&inode->i_lock); list_move(&inode->i_io_list, &tmp); moved++; inode_state_set(inode, I_SYNC_QUEUED); spin_unlock(&inode->i_lock); if (sb_is_blkdev_sb(inode->i_sb)) continue; if (sb && sb != inode->i_sb) do_sb_sort = 1; sb = inode->i_sb; } /* just one sb in list, splice to dispatch_queue and we're done */ if (!do_sb_sort) { list_splice(&tmp, dispatch_queue); goto out; } /* * Although inode's i_io_list is moved from 'tmp' to 'dispatch_queue', * we don't take inode->i_lock here because it is just a pointless overhead. * Inode is already marked as I_SYNC_QUEUED so writeback list handling is * fully under our control. */ while (!list_empty(&tmp)) { sb = wb_inode(tmp.prev)->i_sb; list_for_each_prev_safe(pos, node, &tmp) { inode = wb_inode(pos); if (inode->i_sb == sb) list_move(&inode->i_io_list, dispatch_queue); } } out: return moved; } /* * Queue all expired dirty inodes for io, eldest first. 
* Before * newly dirtied b_dirty b_io b_more_io * =============> gf edc BA * After * newly dirtied b_dirty b_io b_more_io * =============> g fBAedc * | * +--> dequeue for IO */ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work, unsigned long dirtied_before) { int moved; unsigned long time_expire_jif = dirtied_before; assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, dirtied_before); if (!work->for_sync) time_expire_jif = jiffies - dirtytime_expire_interval * HZ; moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io, time_expire_jif); if (moved) wb_io_lists_populated(wb); trace_writeback_queue_io(wb, work, dirtied_before, moved); } static int write_inode(struct inode *inode, struct writeback_control *wbc) { int ret; if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) { trace_writeback_write_inode_start(inode, wbc); ret = inode->i_sb->s_op->write_inode(inode, wbc); trace_writeback_write_inode(inode, wbc); return ret; } return 0; } /* * Wait for writeback on an inode to complete. Called with i_lock held. * Caller must make sure inode cannot go away when we drop i_lock. */ void inode_wait_for_writeback(struct inode *inode) { struct wait_bit_queue_entry wqe; struct wait_queue_head *wq_head; assert_spin_locked(&inode->i_lock); if (!(inode_state_read(inode) & I_SYNC)) return; wq_head = inode_bit_waitqueue(&wqe, inode, __I_SYNC); for (;;) { prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); /* Checking I_SYNC with inode->i_lock guarantees memory ordering. */ if (!(inode_state_read(inode) & I_SYNC)) break; spin_unlock(&inode->i_lock); schedule(); spin_lock(&inode->i_lock); } finish_wait(wq_head, &wqe.wq_entry); } /* * Sleep until I_SYNC is cleared. This function must be called with i_lock * held and drops it. It is aimed for callers not holding any inode reference * so once i_lock is dropped, inode can go away. */ static void inode_sleep_on_writeback(struct inode *inode) __releases(inode->i_lock) { struct wait_bit_queue_entry wqe; struct wait_queue_head *wq_head; bool sleep; assert_spin_locked(&inode->i_lock); wq_head = inode_bit_waitqueue(&wqe, inode, __I_SYNC); prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); /* Checking I_SYNC with inode->i_lock guarantees memory ordering. */ sleep = !!(inode_state_read(inode) & I_SYNC); spin_unlock(&inode->i_lock); if (sleep) schedule(); finish_wait(wq_head, &wqe.wq_entry); } /* * Find proper writeback list for the inode depending on its current state and * possibly also change of its state while we were doing writeback. Here we * handle things such as livelock prevention or fairness of writeback among * inodes. This function can be called only by flusher thread - noone else * processes all inodes in writeback lists and requeueing inodes behind flusher * thread's back can have unexpected consequences. */ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, struct writeback_control *wbc, unsigned long dirtied_before) { if (inode_state_read(inode) & I_FREEING) return; /* * Sync livelock prevention. Each inode is tagged and synced in one * shot. If still dirty, it will be redirty_tail()'ed below. Update * the dirty time to prevent enqueue and sync it again. */ if ((inode_state_read(inode) & I_DIRTY) && (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) inode->dirtied_when = jiffies; if (wbc->pages_skipped) { /* * Writeback is not making progress due to locked buffers. 
		 * Skip this inode for now. Although having skipped pages
		 * is odd for clean inodes, it can happen for some
		 * filesystems so handle that gracefully.
		 */
		if (inode_state_read(inode) & I_DIRTY_ALL)
			redirty_tail_locked(inode, wb);
		else
			inode_cgwb_move_to_attached(inode, wb);
		return;
	}

	if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
		/*
		 * We didn't write back all the pages. nfs_writepages()
		 * sometimes bails out without doing anything.
		 */
		if (wbc->nr_to_write <= 0 &&
		    !inode_dirtied_after(inode, dirtied_before)) {
			/* Slice used up. Queue for next turn. */
			requeue_io(inode, wb);
		} else {
			/*
			 * Writeback blocked by something other than
			 * congestion. Delay the inode for some time to
			 * avoid spinning on the CPU (100% iowait)
			 * retrying writeback of the dirty page/inode
			 * that cannot be performed immediately.
			 */
			redirty_tail_locked(inode, wb);
		}
	} else if (inode_state_read(inode) & I_DIRTY) {
		/*
		 * Filesystems can dirty the inode during writeback operations,
		 * such as delayed allocation during submission or metadata
		 * updates after data IO completion.
		 */
		redirty_tail_locked(inode, wb);
	} else if (inode_state_read(inode) & I_DIRTY_TIME) {
		inode->dirtied_when = jiffies;
		inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
		inode_state_clear(inode, I_SYNC_QUEUED);
	} else {
		/* The inode is clean. Remove from writeback lists. */
		inode_cgwb_move_to_attached(inode, wb);
	}
}

static bool __sync_lazytime(struct inode *inode)
{
	spin_lock(&inode->i_lock);
	if (!(inode_state_read(inode) & I_DIRTY_TIME)) {
		spin_unlock(&inode->i_lock);
		return false;
	}
	inode_state_clear(inode, I_DIRTY_TIME);
	spin_unlock(&inode->i_lock);

	inode->i_op->sync_lazytime(inode);
	return true;
}

bool sync_lazytime(struct inode *inode)
{
	if (!(inode_state_read_once(inode) & I_DIRTY_TIME))
		return false;

	trace_writeback_lazytime(inode);

	if (inode->i_op->sync_lazytime)
		return __sync_lazytime(inode);

	mark_inode_dirty_sync(inode);
	return true;
}

/*
 * Write out an inode and its dirty pages (or some of its dirty pages, depending
 * on @wbc->nr_to_write), and clear the relevant dirty flags from i_state.
 *
 * This doesn't remove the inode from the writeback list it is on, except
 * potentially to move it from b_dirty_time to b_dirty due to timestamp
 * expiration. The caller is otherwise responsible for writeback list handling.
 *
 * The caller is also responsible for setting the I_SYNC flag beforehand and
 * calling inode_sync_complete() to clear it afterwards.
 */
static int __writeback_single_inode(struct inode *inode,
				    struct writeback_control *wbc)
{
	struct address_space *mapping = inode->i_mapping;
	long nr_to_write = wbc->nr_to_write;
	unsigned dirty;
	int ret;

	WARN_ON(!(inode_state_read_once(inode) & I_SYNC));

	trace_writeback_single_inode_start(inode, wbc, nr_to_write);

	ret = do_writepages(mapping, wbc);

	/*
	 * Make sure to wait on the data before writing out the metadata.
	 * This is important for filesystems that modify metadata on data
	 * I/O completion. We don't do it for sync(2) writeback because it has a
	 * separate, external IO completion path and ->sync_fs for guaranteeing
	 * inode metadata is written back correctly.
	 */
	if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
		int err = filemap_fdatawait(mapping);

		if (ret == 0)
			ret = err;
	}

	/*
	 * For data integrity writeback, or when the dirty interval expired,
	 * ask the file system to propagate lazy timestamp updates into real
	 * dirty state.
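	 *
	 * (Illustrative note: with the default dirtytime_expire_interval of
	 * 12 hours, a lazytime inode carrying only I_DIRTY_TIME has its
	 * timestamps pushed into real dirty state by the first writeback
	 * pass that runs more than 12 hours after dirtied_time_when, even
	 * for plain WB_SYNC_NONE writeback.)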
*/ if ((inode_state_read_once(inode) & I_DIRTY_TIME) && (wbc->sync_mode == WB_SYNC_ALL || time_after(jiffies, inode->dirtied_time_when + dirtytime_expire_interval * HZ))) sync_lazytime(inode); /* * Get and clear the dirty flags from i_state. This needs to be done * after calling writepages because some filesystems may redirty the * inode during writepages due to delalloc. It also needs to be done * after handling timestamp expiration, as that may dirty the inode too. */ spin_lock(&inode->i_lock); dirty = inode_state_read(inode) & I_DIRTY; inode_state_clear(inode, dirty); /* * Paired with smp_mb() in __mark_inode_dirty(). This allows * __mark_inode_dirty() to test i_state without grabbing i_lock - * either they see the I_DIRTY bits cleared or we see the dirtied * inode. * * I_DIRTY_PAGES is always cleared together above even if @mapping * still has dirty pages. The flag is reinstated after smp_mb() if * necessary. This guarantees that either __mark_inode_dirty() * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY. */ smp_mb(); if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) inode_state_set(inode, I_DIRTY_PAGES); else if (unlikely(inode_state_read(inode) & I_PINNING_NETFS_WB)) { if (!(inode_state_read(inode) & I_DIRTY_PAGES)) { inode_state_clear(inode, I_PINNING_NETFS_WB); wbc->unpinned_netfs_wb = true; dirty |= I_PINNING_NETFS_WB; /* Cause write_inode */ } } spin_unlock(&inode->i_lock); /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & ~I_DIRTY_PAGES) { int err = write_inode(inode, wbc); if (ret == 0) ret = err; } wbc->unpinned_netfs_wb = false; trace_writeback_single_inode(inode, wbc, nr_to_write); return ret; } /* * Write out an inode's dirty data and metadata on-demand, i.e. separately from * the regular batched writeback done by the flusher threads in * writeback_sb_inodes(). @wbc controls various aspects of the write, such as * whether it is a data-integrity sync (%WB_SYNC_ALL) or not (%WB_SYNC_NONE). * * To prevent the inode from going away, either the caller must have a reference * to the inode, or the inode must have I_WILL_FREE or I_FREEING set. */ static int writeback_single_inode(struct inode *inode, struct writeback_control *wbc) { struct bdi_writeback *wb; int ret = 0; spin_lock(&inode->i_lock); if (!icount_read(inode)) WARN_ON(!(inode_state_read(inode) & (I_WILL_FREE | I_FREEING))); else WARN_ON(inode_state_read(inode) & I_WILL_FREE); if (inode_state_read(inode) & I_SYNC) { /* * Writeback is already running on the inode. For WB_SYNC_NONE, * that's enough and we can just return. For WB_SYNC_ALL, we * must wait for the existing writeback to complete, then do * writeback again if there's anything left. */ if (wbc->sync_mode != WB_SYNC_ALL) goto out; inode_wait_for_writeback(inode); } WARN_ON(inode_state_read(inode) & I_SYNC); /* * If the inode is already fully clean, then there's nothing to do. * * For data-integrity syncs we also need to check whether any pages are * still under writeback, e.g. due to prior WB_SYNC_NONE writeback. If * there are any such pages, we'll need to wait for them. 
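	 *
	 * As an illustrative case: an fsync() issued right after a
	 * WB_SYNC_NONE pass can find I_DIRTY already cleared while folios
	 * submitted by that earlier pass are still tagged
	 * PAGECACHE_TAG_WRITEBACK, so a WB_SYNC_ALL caller must not bail
	 * out early here.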
*/ if (!(inode_state_read(inode) & I_DIRTY_ALL) && (wbc->sync_mode != WB_SYNC_ALL || !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) goto out; inode_state_set(inode, I_SYNC); wbc_attach_and_unlock_inode(wbc, inode); ret = __writeback_single_inode(inode, wbc); wbc_detach_inode(wbc); wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); /* * If the inode is freeing, its i_io_list shoudn't be updated * as it can be finally deleted at this moment. */ if (!(inode_state_read(inode) & I_FREEING)) { /* * If the inode is now fully clean, then it can be safely * removed from its writeback list (if any). Otherwise the * flusher threads are responsible for the writeback lists. */ if (!(inode_state_read(inode) & I_DIRTY_ALL)) inode_cgwb_move_to_attached(inode, wb); else if (!(inode_state_read(inode) & I_SYNC_QUEUED)) { if ((inode_state_read(inode) & I_DIRTY)) redirty_tail_locked(inode, wb); else if (inode_state_read(inode) & I_DIRTY_TIME) { inode->dirtied_when = jiffies; inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); } } } spin_unlock(&wb->list_lock); inode_sync_complete(inode); out: spin_unlock(&inode->i_lock); return ret; } static long writeback_chunk_size(struct super_block *sb, struct bdi_writeback *wb, struct wb_writeback_work *work) { long pages; /* * WB_SYNC_ALL mode does livelock avoidance by syncing dirty * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX * here avoids calling into writeback_inodes_wb() more than once. * * The intended call sequence for WB_SYNC_ALL writeback is: * * wb_writeback() * writeback_sb_inodes() <== called only once * write_cache_pages() <== called once for each inode * (quickly) tag currently dirty pages * (maybe slowly) sync all tagged pages */ if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) return LONG_MAX; pages = min(wb->avg_write_bandwidth / 2, global_wb_domain.dirty_limit / DIRTY_SCOPE); pages = min(pages, work->nr_pages); return round_down(pages + sb->s_min_writeback_pages, sb->s_min_writeback_pages); } /* * Write a portion of b_io inodes which belong to @sb. * * Return the number of pages and/or inodes written. * * NOTE! This is called with wb->list_lock held, and will * unlock and relock that for each inode it ends up doing * IO for. */ static long writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, struct wb_writeback_work *work) { struct writeback_control wbc = { .sync_mode = work->sync_mode, .tagged_writepages = work->tagged_writepages, .for_kupdate = work->for_kupdate, .for_background = work->for_background, .for_sync = work->for_sync, .range_cyclic = work->range_cyclic, .range_start = 0, .range_end = LLONG_MAX, }; unsigned long start_time = jiffies; unsigned long timeout = sysctl_hung_task_timeout_secs; long write_chunk; long total_wrote = 0; /* count both pages and inodes */ unsigned long dirtied_before = jiffies; if (work->for_kupdate) dirtied_before = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); struct bdi_writeback *tmp_wb; long wrote; if (inode->i_sb != sb) { if (work->sb) { /* * We only want to write back data for this * superblock, move all inodes not belonging * to it back onto the dirty list. */ redirty_tail(inode, wb); continue; } /* * The inode belongs to a different superblock. * Bounce back to the caller to unpin this and * pin the next superblock. 
*/ break; } /* * Don't bother with new inodes or inodes being freed, first * kind does not need periodic writeout yet, and for the latter * kind writeout is handled by the freer. */ spin_lock(&inode->i_lock); if (inode_state_read(inode) & (I_NEW | I_FREEING | I_WILL_FREE)) { redirty_tail_locked(inode, wb); spin_unlock(&inode->i_lock); continue; } if ((inode_state_read(inode) & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) { /* * If this inode is locked for writeback and we are not * doing writeback-for-data-integrity, move it to * b_more_io so that writeback can proceed with the * other inodes on s_io. * * We'll have another go at writing back this inode * when we completed a full scan of b_io. */ requeue_io(inode, wb); spin_unlock(&inode->i_lock); trace_writeback_sb_inodes_requeue(inode); continue; } spin_unlock(&wb->list_lock); /* * We already requeued the inode if it had I_SYNC set and we * are doing WB_SYNC_NONE writeback. So this catches only the * WB_SYNC_ALL case. */ if (inode_state_read(inode) & I_SYNC) { /* Wait for I_SYNC. This function drops i_lock... */ inode_sleep_on_writeback(inode); /* Inode may be gone, start again */ spin_lock(&wb->list_lock); continue; } inode_state_set(inode, I_SYNC); wbc_attach_and_unlock_inode(&wbc, inode); write_chunk = writeback_chunk_size(inode->i_sb, wb, work); wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; /* * We use I_SYNC to pin the inode in memory. While it is set * evict_inode() will wait so the inode cannot be freed. */ __writeback_single_inode(inode, &wbc); /* Report progress to inform the hung task detector of the progress. */ if (work->done && work->done->progress_stamp && timeout && (jiffies - work->done->progress_stamp) > HZ * timeout / 2) wake_up_all(work->done->waitq); wbc_detach_inode(&wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped; wrote = wrote < 0 ? 0 : wrote; total_wrote += wrote; if (need_resched()) { /* * We're trying to balance between building up a nice * long list of IOs to improve our merge rate, and * getting those IOs out quickly for anyone throttling * in balance_dirty_pages(). cond_resched() doesn't * unplug, so get our IOs out the door before we * give up the CPU. */ blk_flush_plug(current->plug, false); cond_resched(); } /* * Requeue @inode if still dirty. Be careful as @inode may * have been switched to another wb in the meantime. */ tmp_wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); if (!(inode_state_read(inode) & I_DIRTY_ALL)) total_wrote++; requeue_inode(inode, tmp_wb, &wbc, dirtied_before); inode_sync_complete(inode); spin_unlock(&inode->i_lock); if (unlikely(tmp_wb != wb)) { spin_unlock(&tmp_wb->list_lock); spin_lock(&wb->list_lock); } /* * bail out to wb_writeback() often enough to check * background threshold and other termination conditions. */ if (total_wrote) { if (time_is_before_jiffies(start_time + HZ / 10UL)) break; if (work->nr_pages <= 0) break; } } return total_wrote; } static long __writeback_inodes_wb(struct bdi_writeback *wb, struct wb_writeback_work *work) { unsigned long start_time = jiffies; long wrote = 0; while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); struct super_block *sb = inode->i_sb; if (!super_trylock_shared(sb)) { /* * super_trylock_shared() may fail consistently due to * s_umount being grabbed by someone else. Don't use * requeue_io() to avoid busy retrying the inode/sb. 
*/ redirty_tail(inode, wb); continue; } wrote += writeback_sb_inodes(sb, wb, work); up_read(&sb->s_umount); /* refer to the same tests at the end of writeback_sb_inodes */ if (wrote) { if (time_is_before_jiffies(start_time + HZ / 10UL)) break; if (work->nr_pages <= 0) break; } } /* Leave any unwritten inodes on b_io */ return wrote; } static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, enum wb_reason reason) { struct wb_writeback_work work = { .nr_pages = nr_pages, .sync_mode = WB_SYNC_NONE, .range_cyclic = 1, .reason = reason, }; struct blk_plug plug; blk_start_plug(&plug); spin_lock(&wb->list_lock); if (list_empty(&wb->b_io)) queue_io(wb, &work, jiffies); __writeback_inodes_wb(wb, &work); spin_unlock(&wb->list_lock); blk_finish_plug(&plug); return nr_pages - work.nr_pages; } /* * Explicit flushing or periodic writeback of "old" data. * * Define "old": the first time one of an inode's pages is dirtied, we mark the * dirtying-time in the inode's address_space. So this periodic writeback code * just walks the superblock inode list, writing back any inodes which are * older than a specific point in time. * * Try to run once per dirty_writeback_interval. But if a writeback event * takes longer than a dirty_writeback_interval interval, then leave a * one-second gap. * * dirtied_before takes precedence over nr_to_write. So we'll only write back * all dirty pages if they are all attached to "old" mappings. */ static long wb_writeback(struct bdi_writeback *wb, struct wb_writeback_work *work) { long nr_pages = work->nr_pages; unsigned long dirtied_before = jiffies; struct inode *inode; long progress; struct blk_plug plug; bool queued = false; blk_start_plug(&plug); for (;;) { /* * Stop writeback when nr_pages has been consumed */ if (work->nr_pages <= 0) break; /* * Background writeout and kupdate-style writeback may * run forever. Stop them if there is other work to do * so that e.g. sync can proceed. They'll be restarted * after the other works are all done. */ if ((work->for_background || work->for_kupdate) && !list_empty(&wb->work_list)) break; /* * For background writeout, stop when we are below the * background dirty threshold */ if (work->for_background && !wb_over_bg_thresh(wb)) break; spin_lock(&wb->list_lock); trace_writeback_start(wb, work); if (list_empty(&wb->b_io)) { /* * Kupdate and background works are special and we want * to include all inodes that need writing. Livelock * avoidance is handled by these works yielding to any * other work so we are safe. */ if (work->for_kupdate) { dirtied_before = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); } else if (work->for_background) dirtied_before = jiffies; queue_io(wb, work, dirtied_before); queued = true; } if (work->sb) progress = writeback_sb_inodes(work->sb, wb, work); else progress = __writeback_inodes_wb(wb, work); trace_writeback_written(wb, work); /* * Did we write something? Try for more * * Dirty inodes are moved to b_io for writeback in batches. * The completion of the current batch does not necessarily * mean the overall work is done. So we keep looping as long * as made some progress on cleaning pages or inodes. */ if (progress || !queued) { spin_unlock(&wb->list_lock); continue; } /* * No more inodes for IO, bail */ if (list_empty(&wb->b_more_io)) { spin_unlock(&wb->list_lock); break; } /* * Nothing written. Wait for some inode to * become available for writeback. Otherwise * we'll just busyloop. 
*/ trace_writeback_wait(wb, work); inode = wb_inode(wb->b_more_io.prev); spin_lock(&inode->i_lock); spin_unlock(&wb->list_lock); /* This function drops i_lock... */ inode_sleep_on_writeback(inode); } blk_finish_plug(&plug); return nr_pages - work->nr_pages; } /* * Return the next wb_writeback_work struct that hasn't been processed yet. */ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) { struct wb_writeback_work *work = NULL; spin_lock_irq(&wb->work_lock); if (!list_empty(&wb->work_list)) { work = list_entry(wb->work_list.next, struct wb_writeback_work, list); list_del_init(&work->list); } spin_unlock_irq(&wb->work_lock); return work; } static long wb_check_background_flush(struct bdi_writeback *wb) { if (wb_over_bg_thresh(wb)) { struct wb_writeback_work work = { .nr_pages = LONG_MAX, .sync_mode = WB_SYNC_NONE, .for_background = 1, .range_cyclic = 1, .reason = WB_REASON_BACKGROUND, }; return wb_writeback(wb, &work); } return 0; } static long wb_check_old_data_flush(struct bdi_writeback *wb) { unsigned long expired; long nr_pages; /* * When set to zero, disable periodic writeback */ if (!dirty_writeback_interval) return 0; expired = wb->last_old_flush + msecs_to_jiffies(dirty_writeback_interval * 10); if (time_before(jiffies, expired)) return 0; wb->last_old_flush = jiffies; nr_pages = get_nr_dirty_pages(); if (nr_pages) { struct wb_writeback_work work = { .nr_pages = nr_pages, .sync_mode = WB_SYNC_NONE, .for_kupdate = 1, .range_cyclic = 1, .reason = WB_REASON_PERIODIC, }; return wb_writeback(wb, &work); } return 0; } static long wb_check_start_all(struct bdi_writeback *wb) { long nr_pages; if (!test_bit(WB_start_all, &wb->state)) return 0; nr_pages = get_nr_dirty_pages(); if (nr_pages) { struct wb_writeback_work work = { .nr_pages = wb_split_bdi_pages(wb, nr_pages), .sync_mode = WB_SYNC_NONE, .range_cyclic = 1, .reason = wb->start_all_reason, }; nr_pages = wb_writeback(wb, &work); } clear_bit(WB_start_all, &wb->state); return nr_pages; } /* * Retrieve work items and do the writeback they describe */ static long wb_do_writeback(struct bdi_writeback *wb) { struct wb_writeback_work *work; long wrote = 0; set_bit(WB_writeback_running, &wb->state); while ((work = get_next_work_item(wb)) != NULL) { trace_writeback_exec(wb, work); wrote += wb_writeback(wb, work); finish_writeback_work(work); } /* * Check for a flush-everything request */ wrote += wb_check_start_all(wb); /* * Check for periodic writeback, kupdated() style */ wrote += wb_check_old_data_flush(wb); wrote += wb_check_background_flush(wb); clear_bit(WB_writeback_running, &wb->state); return wrote; } /* * Handle writeback of dirty data for the device backed by this bdi. Also * reschedules periodically and does kupdated style flushing. */ void wb_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(to_delayed_work(work), struct bdi_writeback, dwork); long pages_written; set_worker_desc("flush-%s", bdi_dev_name(wb->bdi)); if (likely(!current_is_workqueue_rescuer() || !test_bit(WB_registered, &wb->state))) { /* * The normal path. Keep writing back @wb until its * work_list is empty. Note that this path is also taken * if @wb is shutting down even when we're running off the * rescuer as work_list needs to be drained. */ do { pages_written = wb_do_writeback(wb); trace_writeback_pages_written(pages_written); } while (!list_empty(&wb->work_list)); } else { /* * bdi_wq can't get enough workers and we're running off * the emergency worker. Don't hog it. 
Hopefully, 1024 is * enough for efficient IO. */ pages_written = writeback_inodes_wb(wb, 1024, WB_REASON_FORKER_THREAD); trace_writeback_pages_written(pages_written); } if (!list_empty(&wb->work_list)) wb_wakeup(wb); else if (wb_has_dirty_io(wb) && dirty_writeback_interval) wb_wakeup_delayed(wb); } /* * Start writeback of all dirty pages on this bdi. */ static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, enum wb_reason reason) { struct bdi_writeback *wb; if (!bdi_has_dirty_io(bdi)) return; list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) wb_start_writeback(wb, reason); } void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, enum wb_reason reason) { rcu_read_lock(); __wakeup_flusher_threads_bdi(bdi, reason); rcu_read_unlock(); } /* * Wakeup the flusher threads to start writeback of all currently dirty pages */ void wakeup_flusher_threads(enum wb_reason reason) { struct backing_dev_info *bdi; /* * If we are expecting writeback progress we must submit plugged IO. */ blk_flush_plug(current->plug, true); rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) __wakeup_flusher_threads_bdi(bdi, reason); rcu_read_unlock(); } /* * Wake up bdi's periodically to make sure dirtytime inodes gets * written back periodically. We deliberately do *not* check the * b_dirtytime list in wb_has_dirty_io(), since this would cause the * kernel to be constantly waking up once there are any dirtytime * inodes on the system. So instead we define a separate delayed work * function which gets called much more rarely. (By default, only * once every 12 hours.) * * If there is any other write activity going on in the file system, * this function won't be necessary. But if the only thing that has * happened on the file system is a dirtytime inode caused by an atime * update, we need this infrastructure below to make sure that inode * eventually gets pushed out to disk. */ static void wakeup_dirtytime_writeback(struct work_struct *w); static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback); static void wakeup_dirtytime_writeback(struct work_struct *w) { struct backing_dev_info *bdi; rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { struct bdi_writeback *wb; list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) if (!list_empty(&wb->b_dirty_time)) wb_wakeup(wb); } rcu_read_unlock(); if (dirtytime_expire_interval) schedule_delayed_work(&dirtytime_work, round_jiffies_relative(dirtytime_expire_interval * HZ)); } static int dirtytime_interval_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) { if (dirtytime_expire_interval) mod_delayed_work(system_percpu_wq, &dirtytime_work, 0); else cancel_delayed_work_sync(&dirtytime_work); } return ret; } static const struct ctl_table vm_fs_writeback_table[] = { { .procname = "dirtytime_expire_seconds", .data = &dirtytime_expire_interval, .maxlen = sizeof(dirtytime_expire_interval), .mode = 0644, .proc_handler = dirtytime_interval_handler, .extra1 = SYSCTL_ZERO, }, }; static int __init start_dirtytime_writeback(void) { if (dirtytime_expire_interval) schedule_delayed_work(&dirtytime_work, round_jiffies_relative(dirtytime_expire_interval * HZ)); register_sysctl_init("vm", vm_fs_writeback_table); return 0; } __initcall(start_dirtytime_writeback); /** * __mark_inode_dirty - internal function to mark an inode dirty * * @inode: inode to mark * @flags: what kind of dirty, e.g. 
I_DIRTY_SYNC. This can be a combination of * multiple I_DIRTY_* flags, except that I_DIRTY_TIME can't be combined * with I_DIRTY_PAGES. * * Mark an inode as dirty. We notify the filesystem, then update the inode's * dirty flags. Then, if needed we add the inode to the appropriate dirty list. * * Most callers should use mark_inode_dirty() or mark_inode_dirty_sync() * instead of calling this directly. * * CAREFUL! We only add the inode to the dirty list if it is hashed or if it * refers to a blockdev. Unhashed inodes will never be added to the dirty list * even if they are later hashed, as they will have been marked dirty already. * * In short, ensure you hash any inodes _before_ you start marking them dirty. * * Note that for blockdevs, inode->dirtied_when represents the dirtying time of * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of * the kernel-internal blockdev inode represents the dirtying time of the * blockdev's pages. This is why for I_DIRTY_PAGES we always use * page->mapping->host, so the page-dirtying time is recorded in the internal * blockdev inode. */ void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; int dirtytime = 0; struct bdi_writeback *wb = NULL; trace_writeback_mark_inode_dirty(inode, flags); if (flags & I_DIRTY_INODE) { bool was_dirty_time = false; /* * Inode timestamp update will piggback on this dirtying. * We tell ->dirty_inode callback that timestamps need to * be updated by setting I_DIRTY_TIME in flags. */ if (inode_state_read_once(inode) & I_DIRTY_TIME) { spin_lock(&inode->i_lock); if (inode_state_read(inode) & I_DIRTY_TIME) { inode_state_clear(inode, I_DIRTY_TIME); flags |= I_DIRTY_TIME; was_dirty_time = true; } spin_unlock(&inode->i_lock); } /* * Notify the filesystem about the inode being dirtied, so that * (if needed) it can update on-disk fields and journal the * inode. This is only needed when the inode itself is being * dirtied now. I.e. it's only needed for I_DIRTY_INODE, not * for just I_DIRTY_PAGES or I_DIRTY_TIME. */ trace_writeback_dirty_inode_start(inode, flags); if (sb->s_op->dirty_inode) { sb->s_op->dirty_inode(inode, flags & (I_DIRTY_INODE | I_DIRTY_TIME)); } else if (was_dirty_time && inode->i_op->sync_lazytime) { inode->i_op->sync_lazytime(inode); } trace_writeback_dirty_inode(inode, flags); /* I_DIRTY_INODE supersedes I_DIRTY_TIME. */ flags &= ~I_DIRTY_TIME; } else { /* * Else it's either I_DIRTY_PAGES, I_DIRTY_TIME, or nothing. * (We don't support setting both I_DIRTY_PAGES and I_DIRTY_TIME * in one call to __mark_inode_dirty().) */ dirtytime = flags & I_DIRTY_TIME; WARN_ON_ONCE(dirtytime && flags != I_DIRTY_TIME); } /* * Paired with smp_mb() in __writeback_single_inode() for the * following lockless i_state test. See there for details. */ smp_mb(); if ((inode_state_read_once(inode) & flags) == flags) return; spin_lock(&inode->i_lock); if ((inode_state_read(inode) & flags) != flags) { const int was_dirty = inode_state_read(inode) & I_DIRTY; inode_attach_wb(inode, NULL); inode_state_set(inode, flags); /* * Grab inode's wb early because it requires dropping i_lock and we * need to make sure following checks happen atomically with dirty * list handling so that we don't move inodes under flush worker's * hands. */ if (!was_dirty) { wb = locked_inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); } /* * If the inode is queued for writeback by flush worker, just * update its dirty state. 
Once the flush worker is done with * the inode it will place it on the appropriate superblock * list, based upon its state. */ if (inode_state_read(inode) & I_SYNC_QUEUED) goto out_unlock; /* * Only add valid (hashed) inodes to the superblock's * dirty list. Add blockdev inodes as well. */ if (!S_ISBLK(inode->i_mode)) { if (inode_unhashed(inode)) goto out_unlock; } if (inode_state_read(inode) & I_FREEING) goto out_unlock; /* * If the inode was already on b_dirty/b_io/b_more_io, don't * reposition it (that would break b_dirty time-ordering). */ if (!was_dirty) { struct list_head *dirty_list; bool wakeup_bdi = false; inode->dirtied_when = jiffies; if (dirtytime) inode->dirtied_time_when = jiffies; if (inode_state_read(inode) & I_DIRTY) dirty_list = &wb->b_dirty; else dirty_list = &wb->b_dirty_time; wakeup_bdi = inode_io_list_move_locked(inode, wb, dirty_list); /* * If this is the first dirty inode for this bdi, * we have to wake-up the corresponding bdi thread * to make sure background write-back happens * later. */ if (wakeup_bdi && (wb->bdi->capabilities & BDI_CAP_WRITEBACK)) wb_wakeup_delayed(wb); spin_unlock(&wb->list_lock); spin_unlock(&inode->i_lock); trace_writeback_dirty_inode_enqueue(inode); return; } } out_unlock: if (wb) spin_unlock(&wb->list_lock); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(__mark_inode_dirty); /* * The @s_sync_lock is used to serialise concurrent sync operations * to avoid lock contention problems with concurrent wait_sb_inodes() calls. * Concurrent callers will block on the s_sync_lock rather than doing contending * walks. The queueing maintains sync(2) required behaviour as all the IO that * has been issued up to the time this function is enter is guaranteed to be * completed by the time we have gained the lock and waited for all IO that is * in progress regardless of the order callers are granted the lock. */ static void wait_sb_inodes(struct super_block *sb) { LIST_HEAD(sync_list); /* * We need to be protected against the filesystem going from * r/o to r/w or vice versa. */ WARN_ON(!rwsem_is_locked(&sb->s_umount)); mutex_lock(&sb->s_sync_lock); /* * Splice the writeback list onto a temporary list to avoid waiting on * inodes that have started writeback after this point. * * Use rcu_read_lock() to keep the inodes around until we have a * reference. s_inode_wblist_lock protects sb->s_inodes_wb as well as * the local list because inodes can be dropped from either by writeback * completion. */ rcu_read_lock(); spin_lock_irq(&sb->s_inode_wblist_lock); list_splice_init(&sb->s_inodes_wb, &sync_list); /* * Data integrity sync. Must wait for all pages under writeback, because * there may have been pages dirtied before our sync call, but which had * writeout started before we write it out. In which case, the inode * may not be on the dirty list, but we still have to wait for that * writeout. */ while (!list_empty(&sync_list)) { struct inode *inode = list_first_entry(&sync_list, struct inode, i_wb_list); struct address_space *mapping = inode->i_mapping; /* * Move each inode back to the wb list before we drop the lock * to preserve consistency between i_wb_list and the mapping * writeback tag. Writeback completion is responsible to remove * the inode from either list once the writeback tag is cleared. */ list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb); /* * The mapping can appear untagged while still on-list since we * do not have the mapping lock. Skip it here, wb completion * will remove it. 
*/ if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) continue; spin_unlock_irq(&sb->s_inode_wblist_lock); spin_lock(&inode->i_lock); if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) { spin_unlock(&inode->i_lock); spin_lock_irq(&sb->s_inode_wblist_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); rcu_read_unlock(); /* * We keep the error status of individual mapping so that * applications can catch the writeback error using fsync(2). * See filemap_fdatawait_keep_errors() for details. */ filemap_fdatawait_keep_errors(mapping); cond_resched(); iput(inode); rcu_read_lock(); spin_lock_irq(&sb->s_inode_wblist_lock); } spin_unlock_irq(&sb->s_inode_wblist_lock); rcu_read_unlock(); mutex_unlock(&sb->s_sync_lock); } static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, enum wb_reason reason, bool skip_if_busy) { struct backing_dev_info *bdi = sb->s_bdi; DEFINE_WB_COMPLETION(done, bdi); struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_NONE, .tagged_writepages = 1, .done = &done, .nr_pages = nr, .reason = reason, }; if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy); wb_wait_for_completion(&done); } /** * writeback_inodes_sb_nr - writeback dirty inodes from given super_block * @sb: the superblock * @nr: the number of pages to write * @reason: reason why some writeback work initiated * * Start writeback on some inodes on this super_block. No guarantees are made * on how many (if any) will be written, and this function does not wait * for IO completion of submitted IO. */ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, enum wb_reason reason) { __writeback_inodes_sb_nr(sb, nr, reason, false); } EXPORT_SYMBOL(writeback_inodes_sb_nr); /** * writeback_inodes_sb - writeback dirty inodes from given super_block * @sb: the superblock * @reason: reason why some writeback work was initiated * * Start writeback on some inodes on this super_block. No guarantees are made * on how many (if any) will be written, and this function does not wait * for IO completion of submitted IO. */ void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) { writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); } EXPORT_SYMBOL(writeback_inodes_sb); /** * try_to_writeback_inodes_sb - try to start writeback if none underway * @sb: the superblock * @reason: reason why some writeback work was initiated * * Invoke __writeback_inodes_sb_nr if no writeback is currently underway. */ void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) { if (!down_read_trylock(&sb->s_umount)) return; __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true); up_read(&sb->s_umount); } EXPORT_SYMBOL(try_to_writeback_inodes_sb); /** * sync_inodes_sb - sync sb inode pages * @sb: the superblock * * This function writes and waits on any dirty inode belonging to this * super_block. */ void sync_inodes_sb(struct super_block *sb) { struct backing_dev_info *bdi = sb->s_bdi; DEFINE_WB_COMPLETION(done, bdi); struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_ALL, .nr_pages = LONG_MAX, .range_cyclic = 0, .done = &done, .reason = WB_REASON_SYNC, .for_sync = 1, }; /* * Can't skip on !bdi_has_dirty() because we should wait for !dirty * inodes under writeback and I_DIRTY_TIME inodes ignored by * bdi_has_dirty() need to be written out too. 
*/ if (bdi == &noop_backing_dev_info) return; /* * If the superblock has SB_I_NO_DATA_INTEGRITY set, there's no need to * wait for the writeout to complete, as the filesystem cannot guarantee * data persistence on sync. Just kick off writeback and return. */ if (sb->s_iflags & SB_I_NO_DATA_INTEGRITY) { wakeup_flusher_threads_bdi(bdi, WB_REASON_SYNC); return; } WARN_ON(!rwsem_is_locked(&sb->s_umount)); /* protect against inode wb switch, see inode_switch_wbs_work_fn() */ bdi_down_write_wb_switch_rwsem(bdi); bdi_split_work_to_wbs(bdi, &work, false); wb_wait_for_completion(&done); bdi_up_write_wb_switch_rwsem(bdi); wait_sb_inodes(sb); } EXPORT_SYMBOL(sync_inodes_sb); /** * write_inode_now - write an inode to disk * @inode: inode to write to disk * @sync: whether the write should be synchronous or not * * This function commits an inode to disk immediately if it is dirty. This is * primarily needed by knfsd. * * The caller must either have a ref on the inode or must have set I_WILL_FREE. */ int write_inode_now(struct inode *inode, int sync) { struct writeback_control wbc = { .nr_to_write = LONG_MAX, .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE, .range_start = 0, .range_end = LLONG_MAX, }; if (!mapping_can_writeback(inode->i_mapping)) wbc.nr_to_write = 0; might_sleep(); return writeback_single_inode(inode, &wbc); } EXPORT_SYMBOL(write_inode_now); /** * sync_inode_metadata - write an inode to disk * @inode: the inode to sync * @wait: wait for I/O to complete. * * Write an inode to disk and adjust its dirty state after completion. * * Note: only writes the actual inode, no associated data or other metadata. */ int sync_inode_metadata(struct inode *inode, int wait) { struct writeback_control wbc = { .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, .nr_to_write = 0, /* metadata-only */ }; return writeback_single_inode(inode, &wbc); } EXPORT_SYMBOL(sync_inode_metadata); |
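/*
 * Illustrative sketch (not from the file above, untested): how a filesystem
 * typically drives the dirty-inode machinery documented in
 * __mark_inode_dirty().  The function name example_fs_update_size() is
 * hypothetical; mark_inode_dirty() and i_size_write() are real helpers from
 * <linux/fs.h>.
 */
#include <linux/fs.h>

static void example_fs_update_size(struct inode *inode, loff_t new_size)
{
	/* Update the in-core inode first ... */
	i_size_write(inode, new_size);

	/*
	 * ... then mark it dirty.  mark_inode_dirty() expands to
	 * __mark_inode_dirty(inode, I_DIRTY), so the inode lands on the
	 * owning wb->b_dirty list and, if it is the first dirty inode on
	 * the bdi, the flusher worker is kicked via wb_wakeup_delayed().
	 */
	mark_inode_dirty(inode);
}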
// SPDX-License-Identifier: GPL-2.0
/*
 * Tty port functions
 */

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/tty.h>
#include <linux/tty_driver.h>
#include <linux/tty_flip.h>
#include <linux/serial.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/wait.h>
#include <linux/bitops.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/serdev.h>
#include "tty.h"

static size_t tty_port_default_receive_buf(struct tty_port *port,
const u8 *p, const u8 *f, size_t count) { struct tty_struct *tty; struct tty_ldisc *ld; tty = READ_ONCE(port->itty); if (!tty) return 0; ld = tty_ldisc_ref(tty); if (!ld) return 0; count = tty_ldisc_receive_buf(ld, p, f, count); tty_ldisc_deref(ld); return count; } static void tty_port_default_lookahead_buf(struct tty_port *port, const u8 *p, const u8 *f, size_t count) { struct tty_struct *tty; struct tty_ldisc *ld; tty = READ_ONCE(port->itty); if (!tty) return; ld = tty_ldisc_ref(tty); if (!ld) return; if (ld->ops->lookahead_buf) ld->ops->lookahead_buf(ld->tty, p, f, count); tty_ldisc_deref(ld); } static void tty_port_default_wakeup(struct tty_port *port) { scoped_guard(tty_port_tty, port) tty_wakeup(scoped_tty()); } const struct tty_port_client_operations tty_port_default_client_ops = { .receive_buf = tty_port_default_receive_buf, .lookahead_buf = tty_port_default_lookahead_buf, .write_wakeup = tty_port_default_wakeup, }; EXPORT_SYMBOL_GPL(tty_port_default_client_ops); /** * tty_port_init - initialize tty_port * @port: tty_port to initialize * * Initializes the state of struct tty_port. When a port was initialized using * this function, one has to destroy the port by tty_port_destroy(). Either * indirectly by using &tty_port refcounting (tty_port_put()) or directly if * refcounting is not used. */ void tty_port_init(struct tty_port *port) { memset(port, 0, sizeof(*port)); tty_buffer_init(port); init_waitqueue_head(&port->open_wait); init_waitqueue_head(&port->delta_msr_wait); mutex_init(&port->mutex); mutex_init(&port->buf_mutex); spin_lock_init(&port->lock); port->close_delay = (50 * HZ) / 100; port->closing_wait = (3000 * HZ) / 100; port->client_ops = &tty_port_default_client_ops; kref_init(&port->kref); } EXPORT_SYMBOL(tty_port_init); /** * tty_port_link_wq - link tty_port and flip workqueue * @port: tty_port of the device * @flip_wq: workqueue to queue flip buffer work on * * Whenever %TTY_DRIVER_NO_WORKQUEUE is used, every tty_port can be linked to * a workqueue manually by this function. * tty_port will use system_dfl_wq when buf.flip_wq is NULL. * * Note that tty_port API will NOT destroy the workqueue. */ void tty_port_link_wq(struct tty_port *port, struct workqueue_struct *flip_wq) { port->buf.flip_wq = flip_wq; } EXPORT_SYMBOL_GPL(tty_port_link_wq); /** * tty_port_link_device - link tty and tty_port * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * * Provide the tty layer with a link from a tty (specified by @index) to a * tty_port (@port). Use this only if neither tty_port_register_device() nor * tty_port_install() is used in the driver. If used, this has to be called * before tty_register_driver(). */ void tty_port_link_device(struct tty_port *port, struct tty_driver *driver, unsigned index) { if (WARN_ON(index >= driver->num)) return; driver->ports[index] = port; } EXPORT_SYMBOL_GPL(tty_port_link_device); /** * tty_port_register_device - register tty device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * @device: parent if exists, otherwise NULL * * It is the same as tty_register_device() except the provided @port is linked * to a concrete tty specified by @index. Use this or tty_port_install() (or * both). Call tty_port_link_device() as a last resort. 
*/ struct device *tty_port_register_device(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device) { return tty_port_register_device_attr(port, driver, index, device, NULL, NULL); } EXPORT_SYMBOL_GPL(tty_port_register_device); /** * tty_port_register_device_attr - register tty device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * @device: parent if exists, otherwise NULL * @drvdata: Driver data to be set to device. * @attr_grp: Attribute group to be set on device. * * It is the same as tty_register_device_attr() except the provided @port is * linked to a concrete tty specified by @index. Use this or tty_port_install() * (or both). Call tty_port_link_device() as a last resort. */ struct device *tty_port_register_device_attr(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp) { tty_port_link_device(port, driver, index); tty_port_link_driver_wq(port, driver); return tty_register_device_attr(driver, index, device, drvdata, attr_grp); } EXPORT_SYMBOL_GPL(tty_port_register_device_attr); /** * tty_port_register_device_attr_serdev - register tty or serdev device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * @host: serial port hardware device * @parent: parent if exists, otherwise NULL * @drvdata: driver data for the device * @attr_grp: attribute group for the device * * Register a serdev or tty device depending on if the parent device has any * defined serdev clients or not. */ struct device *tty_port_register_device_attr_serdev(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *host, struct device *parent, void *drvdata, const struct attribute_group **attr_grp) { struct device *dev; tty_port_link_device(port, driver, index); tty_port_link_driver_wq(port, driver); dev = serdev_tty_port_register(port, host, parent, driver, index); if (PTR_ERR(dev) != -ENODEV) { /* Skip creating cdev if we registered a serdev device */ return dev; } return tty_register_device_attr(driver, index, parent, drvdata, attr_grp); } EXPORT_SYMBOL_GPL(tty_port_register_device_attr_serdev); /** * tty_port_unregister_device - deregister a tty or serdev device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * * If a tty or serdev device is registered with a call to * tty_port_register_device_serdev() then this function must be called when * the device is gone. 
*/ void tty_port_unregister_device(struct tty_port *port, struct tty_driver *driver, unsigned index) { int ret; WRITE_ONCE(port->buf.flip_wq, NULL); ret = serdev_tty_port_unregister(port); if (ret == 0) return; tty_unregister_device(driver, index); } EXPORT_SYMBOL_GPL(tty_port_unregister_device); int tty_port_alloc_xmit_buf(struct tty_port *port) { /* We may sleep in get_zeroed_page() */ guard(mutex)(&port->buf_mutex); if (port->xmit_buf) return 0; port->xmit_buf = (u8 *)get_zeroed_page(GFP_KERNEL); if (port->xmit_buf == NULL) return -ENOMEM; kfifo_init(&port->xmit_fifo, port->xmit_buf, PAGE_SIZE); return 0; } EXPORT_SYMBOL(tty_port_alloc_xmit_buf); void tty_port_free_xmit_buf(struct tty_port *port) { guard(mutex)(&port->buf_mutex); free_page((unsigned long)port->xmit_buf); port->xmit_buf = NULL; INIT_KFIFO(port->xmit_fifo); } EXPORT_SYMBOL(tty_port_free_xmit_buf); /** * tty_port_destroy - destroy inited port * @port: tty port to be destroyed * * When a port was initialized using tty_port_init(), one has to destroy the * port by this function. Either indirectly by using &tty_port refcounting * (tty_port_put()) or directly if refcounting is not used. */ void tty_port_destroy(struct tty_port *port) { tty_buffer_cancel_work(port); tty_buffer_free_all(port); WRITE_ONCE(port->buf.flip_wq, NULL); } EXPORT_SYMBOL(tty_port_destroy); static void tty_port_destructor(struct kref *kref) { struct tty_port *port = container_of(kref, struct tty_port, kref); /* check if last port ref was dropped before tty release */ if (WARN_ON(port->itty)) return; free_page((unsigned long)port->xmit_buf); tty_port_destroy(port); if (port->ops && port->ops->destruct) port->ops->destruct(port); else kfree(port); } /** * tty_port_put - drop a reference to tty_port * @port: port to drop a reference of (can be NULL) * * The final put will destroy and free up the @port using * @port->ops->destruct() hook, or using kfree() if not provided. */ void tty_port_put(struct tty_port *port) { if (port) kref_put(&port->kref, tty_port_destructor); } EXPORT_SYMBOL(tty_port_put); /** * tty_port_tty_get - get a tty reference * @port: tty port * * Return a refcount protected tty instance or %NULL if the port is not * associated with a tty (eg due to close or hangup). */ struct tty_struct *tty_port_tty_get(struct tty_port *port) { guard(spinlock_irqsave)(&port->lock); return tty_kref_get(port->tty); } EXPORT_SYMBOL(tty_port_tty_get); /** * tty_port_tty_set - set the tty of a port * @port: tty port * @tty: the tty * * Associate the port and tty pair. Manages any internal refcounts. Pass %NULL * to deassociate a port. */ void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty) { guard(spinlock_irqsave)(&port->lock); tty_kref_put(port->tty); port->tty = tty_kref_get(tty); } EXPORT_SYMBOL(tty_port_tty_set); /** * tty_port_shutdown - internal helper to shutdown the device * @port: tty port to be shut down * @tty: the associated tty * * It is used by tty_port_hangup() and tty_port_close(). Its task is to * shutdown the device if it was initialized (note consoles remain * functioning). It lowers DTR/RTS (if @tty has HUPCL set) and invokes * @port->ops->shutdown(). */ static void tty_port_shutdown(struct tty_port *port, struct tty_struct *tty) { guard(mutex)(&port->mutex); if (port->console) return; if (!tty_port_initialized(port)) return; tty_port_set_initialized(port, false); /* * Drop DTR/RTS if HUPCL is set. This causes any attached * modem to hang up the line. 
*/ if (tty && C_HUPCL(tty)) tty_port_lower_dtr_rts(port); if (port->ops->shutdown) port->ops->shutdown(port); } /** * tty_port_hangup - hangup helper * @port: tty port * * Perform port level tty hangup flag and count changes. Drop the tty * reference. * * Caller holds tty lock. */ void tty_port_hangup(struct tty_port *port) { struct tty_struct *tty; scoped_guard(spinlock_irqsave, &port->lock) { port->count = 0; tty = port->tty; if (tty) set_bit(TTY_IO_ERROR, &tty->flags); port->tty = NULL; } tty_port_set_active(port, false); tty_port_shutdown(port, tty); tty_kref_put(tty); wake_up_interruptible(&port->open_wait); wake_up_interruptible(&port->delta_msr_wait); } EXPORT_SYMBOL(tty_port_hangup); void __tty_port_tty_hangup(struct tty_port *port, bool check_clocal, bool async) { scoped_guard(tty_port_tty, port) { struct tty_struct *tty = scoped_tty(); if (!check_clocal || !C_CLOCAL(tty)) { if (async) tty_hangup(tty); else tty_vhangup(tty); } } } EXPORT_SYMBOL_GPL(__tty_port_tty_hangup); /** * tty_port_tty_wakeup - helper to wake up a tty * @port: tty port */ void tty_port_tty_wakeup(struct tty_port *port) { port->client_ops->write_wakeup(port); } EXPORT_SYMBOL_GPL(tty_port_tty_wakeup); /** * tty_port_carrier_raised - carrier raised check * @port: tty port * * Wrapper for the carrier detect logic. For the moment this is used * to hide some internal details. This will eventually become entirely * internal to the tty port. */ bool tty_port_carrier_raised(struct tty_port *port) { if (port->ops->carrier_raised == NULL) return true; return port->ops->carrier_raised(port); } EXPORT_SYMBOL(tty_port_carrier_raised); /** * tty_port_raise_dtr_rts - Raise DTR/RTS * @port: tty port * * Wrapper for the DTR/RTS raise logic. For the moment this is used to hide * some internal details. This will eventually become entirely internal to the * tty port. */ void tty_port_raise_dtr_rts(struct tty_port *port) { if (port->ops->dtr_rts) port->ops->dtr_rts(port, true); } EXPORT_SYMBOL(tty_port_raise_dtr_rts); /** * tty_port_lower_dtr_rts - Lower DTR/RTS * @port: tty port * * Wrapper for the DTR/RTS raise logic. For the moment this is used to hide * some internal details. This will eventually become entirely internal to the * tty port. */ void tty_port_lower_dtr_rts(struct tty_port *port) { if (port->ops->dtr_rts) port->ops->dtr_rts(port, false); } EXPORT_SYMBOL(tty_port_lower_dtr_rts); /** * tty_port_block_til_ready - Waiting logic for tty open * @port: the tty port being opened * @tty: the tty device being bound * @filp: the file pointer of the opener or %NULL * * Implement the core POSIX/SuS tty behaviour when opening a tty device. * Handles: * * - hangup (both before and during) * - non blocking open * - rts/dtr/dcd * - signals * - port flags and counts * * The passed @port must implement the @port->ops->carrier_raised method if it * can do carrier detect and the @port->ops->dtr_rts method if it supports * software management of these lines. Note that the dtr/rts raise is done each * iteration as a hangup may have previously dropped them while we wait. * * Caller holds tty lock. * * Note: May drop and reacquire tty lock when blocking, so @tty and @port may * have changed state (eg., may have been hung up). */ int tty_port_block_til_ready(struct tty_port *port, struct tty_struct *tty, struct file *filp) { int do_clocal = 0, retval; DEFINE_WAIT(wait); /* if non-blocking mode is set we can pass directly to open unless * the port has just hung up or is in another error state. 
*/ if (tty_io_error(tty)) { tty_port_set_active(port, true); return 0; } if (filp == NULL || (filp->f_flags & O_NONBLOCK)) { /* Indicate we are open */ if (C_BAUD(tty)) tty_port_raise_dtr_rts(port); tty_port_set_active(port, true); return 0; } if (C_CLOCAL(tty)) do_clocal = 1; /* Block waiting until we can proceed. We may need to wait for the * carrier, but we must also wait for any close that is in progress * before the next open may complete. */ retval = 0; /* The port lock protects the port counts */ scoped_guard(spinlock_irqsave, &port->lock) { port->count--; port->blocked_open++; } while (1) { /* Indicate we are open */ if (C_BAUD(tty) && tty_port_initialized(port)) tty_port_raise_dtr_rts(port); prepare_to_wait(&port->open_wait, &wait, TASK_INTERRUPTIBLE); /* Check for a hangup or uninitialised port. * Return accordingly. */ if (tty_hung_up_p(filp) || !tty_port_initialized(port)) { if (port->flags & ASYNC_HUP_NOTIFY) retval = -EAGAIN; else retval = -ERESTARTSYS; break; } /* * Probe the carrier. For devices with no carrier detect * tty_port_carrier_raised will always return true. * Never ask drivers if CLOCAL is set, this causes troubles * on some hardware. */ if (do_clocal || tty_port_carrier_raised(port)) break; if (signal_pending(current)) { retval = -ERESTARTSYS; break; } tty_unlock(tty); schedule(); tty_lock(tty); } finish_wait(&port->open_wait, &wait); /* Update counts. A parallel hangup will have set count to zero and * we must not mess that up further. */ scoped_guard(spinlock_irqsave, &port->lock) { if (!tty_hung_up_p(filp)) port->count++; port->blocked_open--; } if (retval == 0) tty_port_set_active(port, true); return retval; } EXPORT_SYMBOL(tty_port_block_til_ready); static void tty_port_drain_delay(struct tty_port *port, struct tty_struct *tty) { unsigned int bps = tty_get_baud_rate(tty); long timeout; if (bps > 1200) { timeout = (HZ * 10 * port->drain_delay) / bps; timeout = max_t(long, timeout, HZ / 10); } else { timeout = 2 * HZ; } schedule_timeout_interruptible(timeout); } /** * tty_port_close_start - helper for tty->ops->close, part 1/2 * @port: tty_port of the device * @tty: tty being closed * @filp: passed file pointer * * Decrements and checks open count. Flushes the port if this is the last * close. That means, dropping the data from the outpu buffer on the device and * waiting for sending logic to finish. The rest of close handling is performed * in tty_port_close_end(). * * Locking: Caller holds tty lock. 
* * Return: 1 if this is the last close, otherwise 0 */ int tty_port_close_start(struct tty_port *port, struct tty_struct *tty, struct file *filp) { if (tty_hung_up_p(filp)) return 0; scoped_guard(spinlock_irqsave, &port->lock) { if (tty->count == 1 && port->count != 1) { tty_warn(tty, "%s: tty->count = 1 port count = %d\n", __func__, port->count); port->count = 1; } if (--port->count < 0) { tty_warn(tty, "%s: bad port count (%d)\n", __func__, port->count); port->count = 0; } if (port->count) return 0; } tty->closing = 1; if (tty_port_initialized(port)) { /* Don't block on a stalled port, just pull the chain */ if (tty->flow.tco_stopped) tty_driver_flush_buffer(tty); if (port->closing_wait != ASYNC_CLOSING_WAIT_NONE) tty_wait_until_sent(tty, port->closing_wait); if (port->drain_delay) tty_port_drain_delay(port, tty); } /* Flush the ldisc buffering */ tty_ldisc_flush(tty); /* Report to caller this is the last port reference */ return 1; } EXPORT_SYMBOL(tty_port_close_start); /** * tty_port_close_end - helper for tty->ops->close, part 2/2 * @port: tty_port of the device * @tty: tty being closed * * This is a continuation of the first part: tty_port_close_start(). This * should be called after turning off the device. It flushes the data from the * line discipline and delays the close by @port->close_delay. * * Locking: Caller holds tty lock. */ void tty_port_close_end(struct tty_port *port, struct tty_struct *tty) { unsigned long flags; tty_ldisc_flush(tty); tty->closing = 0; spin_lock_irqsave(&port->lock, flags); if (port->blocked_open) { spin_unlock_irqrestore(&port->lock, flags); if (port->close_delay) msleep_interruptible(jiffies_to_msecs(port->close_delay)); spin_lock_irqsave(&port->lock, flags); wake_up_interruptible(&port->open_wait); } spin_unlock_irqrestore(&port->lock, flags); tty_port_set_active(port, false); } EXPORT_SYMBOL(tty_port_close_end); /** * tty_port_close - generic tty->ops->close handler * @port: tty_port of the device * @tty: tty being closed * @filp: passed file pointer * * It is a generic helper to be used in driver's @tty->ops->close. It wraps a * sequence of tty_port_close_start(), tty_port_shutdown(), and * tty_port_close_end(). The latter two are called only if this is the last * close. See the respective functions for the details. * * Locking: Caller holds tty lock */ void tty_port_close(struct tty_port *port, struct tty_struct *tty, struct file *filp) { if (tty_port_close_start(port, tty, filp) == 0) return; tty_port_shutdown(port, tty); if (!port->console) set_bit(TTY_IO_ERROR, &tty->flags); tty_port_close_end(port, tty); tty_port_tty_set(port, NULL); } EXPORT_SYMBOL(tty_port_close); /** * tty_port_install - generic tty->ops->install handler * @port: tty_port of the device * @driver: tty_driver for this device * @tty: tty to be installed * * It is the same as tty_standard_install() except the provided @port is linked * to a concrete tty specified by @tty. Use this or tty_port_register_device() * (or both). Call tty_port_link_device() as a last resort. */ int tty_port_install(struct tty_port *port, struct tty_driver *driver, struct tty_struct *tty) { tty->port = port; tty_port_link_driver_wq(port, driver); return tty_standard_install(driver, tty); } EXPORT_SYMBOL_GPL(tty_port_install); /** * tty_port_open - generic tty->ops->open handler * @port: tty_port of the device * @tty: tty to be opened * @filp: passed file pointer * * It is a generic helper to be used in driver's @tty->ops->open. 
It activates * the devices using @port->ops->activate if not active already. And waits for * the device to be ready using tty_port_block_til_ready() (e.g. raises * DTR/CTS and waits for carrier). * * Note that @port->ops->shutdown is not called when @port->ops->activate * returns an error (on the contrary, @tty->ops->close is). * * Locking: Caller holds tty lock. * * Note: may drop and reacquire tty lock (in tty_port_block_til_ready()) so * @tty and @port may have changed state (eg., may be hung up now). */ int tty_port_open(struct tty_port *port, struct tty_struct *tty, struct file *filp) { scoped_guard(spinlock_irq, &port->lock) ++port->count; tty_port_tty_set(port, tty); /* * Do the device-specific open only if the hardware isn't * already initialized. Serialize open and shutdown using the * port mutex. */ scoped_guard(mutex, &port->mutex) { if (tty_port_initialized(port)) break; clear_bit(TTY_IO_ERROR, &tty->flags); if (port->ops->activate) { int retval = port->ops->activate(port, tty); if (retval) return retval; } tty_port_set_initialized(port, true); } return tty_port_block_til_ready(port, tty, filp); } EXPORT_SYMBOL(tty_port_open); |
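/*
 * Illustrative sketch (not from the file above, untested): the usual way a
 * serial driver wires the helpers above into its own tty_operations and
 * tty_port_operations.  All "example_*" names are hypothetical; the
 * tty_port_*() calls are the real helpers defined above.
 */
#include <linux/tty.h>

static int example_activate(struct tty_port *port, struct tty_struct *tty)
{
	/* Power up / enable the hardware here; called from tty_port_open(). */
	return 0;
}

static void example_shutdown(struct tty_port *port)
{
	/* Disable the hardware here; called from tty_port_shutdown(). */
}

static const struct tty_port_operations example_port_ops = {
	.activate = example_activate,
	.shutdown = example_shutdown,
};

static int example_open(struct tty_struct *tty, struct file *filp)
{
	return tty_port_open(tty->port, tty, filp);
}

static void example_close(struct tty_struct *tty, struct file *filp)
{
	tty_port_close(tty->port, tty, filp);
}

static void example_hangup(struct tty_struct *tty)
{
	tty_port_hangup(tty->port);
}

static const struct tty_operations example_tty_ops = {
	.open   = example_open,
	.close  = example_close,
	.hangup = example_hangup,
};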
// SPDX-License-Identifier: GPL-2.0
#include <linux/compat.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/syscalls.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/smp.h>
#include <linux/sem.h>
#include <linux/msg.h>
#include <linux/shm.h>
#include <linux/stat.h>
#include <linux/mman.h>
#include <linux/file.h>
#include <linux/utsname.h>
#include <linux/personality.h>
#include <linux/random.h>
#include <linux/uaccess.h>
#include <linux/elf.h>
#include <linux/hugetlb.h>

#include <asm/elf.h>
#include <asm/ia32.h>

/*
 * Align a virtual address to avoid aliasing in the I$ on AMD F15h.
 */
static unsigned long get_align_mask(struct file *filp)
{
	if (filp && is_file_hugepages(filp))
		return huge_page_mask_align(filp);
	/* handle 32- and 64-bit case with a single conditional */
	if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32())))
		return 0;

	if (!(current->flags & PF_RANDOMIZE))
		return 0;

	return va_align.mask;
}

/*
 * To avoid aliasing in the I$ on AMD F15h, the bits defined by the
 * va_align.bits, [12:upper_bit), are set to a random value instead of
 * zeroing them. This random value is computed once per boot. This form
 * of ASLR is known as "per-boot ASLR".
 *
 * To achieve this, the random value is added to the info.align_offset
 * value before calling vm_unmapped_area() or ORed directly to the
 * address.
 */
static unsigned long get_align_bits(void)
{
	return va_align.bits & get_align_mask(NULL);
}

static int __init control_va_addr_alignment(char *str)
{
	/* guard against enabling this on other CPU families */
	if (va_align.flags < 0)
		return 1;

	if (*str == 0)
		return 1;

	if (!strcmp(str, "32"))
		va_align.flags = ALIGN_VA_32;
	else if (!strcmp(str, "64"))
		va_align.flags = ALIGN_VA_64;
	else if (!strcmp(str, "off"))
		va_align.flags = 0;
	else if (!strcmp(str, "on"))
		va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
	else
		pr_warn("invalid option value: 'align_va_addr=%s'\n", str);

	return 1;
}
__setup("align_va_addr=", control_va_addr_alignment);

SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
		unsigned long, prot, unsigned long, flags,
		unsigned long, fd, unsigned long, off)
{
	if (off & ~PAGE_MASK)
		return -EINVAL;

	return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
}

static void find_start_end(unsigned long addr, unsigned long flags,
		unsigned long *begin, unsigned long *end)
{
	if (!in_32bit_syscall() && (flags & MAP_32BIT)) {
		/* This is usually needed to map code in the small code model,
		   so it needs to be in the first 31 bits. Limit it to that.
		   This means we need to move the unmapped base down for this case.
This can give conflicts with the heap, but we assume that glibc malloc knows how to fall back to mmap. Give it 1GB of playground for now. -AK */ *begin = 0x40000000; *end = 0x80000000; if (current->flags & PF_RANDOMIZE) { *begin = randomize_page(*begin, 0x02000000); } return; } *begin = get_mmap_base(1); if (in_32bit_syscall()) *end = task_size_32bit(); else *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW); } static inline unsigned long stack_guard_placement(vm_flags_t vm_flags) { if (vm_flags & VM_SHADOW_STACK) return PAGE_SIZE; return 0; } unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct vm_unmapped_area_info info = {}; unsigned long begin, end; if (flags & MAP_FIXED) return addr; find_start_end(addr, flags, &begin, &end); if (len > end) return -ENOMEM; if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); if (end - len >= addr && (!vma || addr + len <= vm_start_gap(vma))) return addr; } info.length = len; info.low_limit = begin; info.high_limit = end; if (!(filp && is_file_hugepages(filp))) { info.align_offset = pgoff << PAGE_SHIFT; info.start_gap = stack_guard_placement(vm_flags); } if (filp) { info.align_mask = get_align_mask(filp); info.align_offset += get_align_bits(); } return vm_unmapped_area(&info); } unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr0, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; unsigned long addr = addr0; struct vm_unmapped_area_info info = {}; /* requested length too big for entire address space */ if (len > TASK_SIZE) return -ENOMEM; /* No address checking. See comment at mmap_address_hint_valid() */ if (flags & MAP_FIXED) return addr; /* for MAP_32BIT mappings we force the legacy mmap base */ if (!in_32bit_syscall() && (flags & MAP_32BIT)) goto bottomup; /* requesting a specific address */ if (addr) { addr &= PAGE_MASK; if (!mmap_address_hint_valid(addr, len)) goto get_unmapped_area; vma = find_vma(mm, addr); if (!vma || addr + len <= vm_start_gap(vma)) return addr; } get_unmapped_area: info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; if (!in_32bit_syscall() && (flags & MAP_ABOVE4G)) info.low_limit = SZ_4G; else info.low_limit = PAGE_SIZE; info.high_limit = get_mmap_base(0); if (!(filp && is_file_hugepages(filp))) { info.start_gap = stack_guard_placement(vm_flags); info.align_offset = pgoff << PAGE_SHIFT; } /* * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area * in the full address space. * * !in_32bit_syscall() check to avoid high addresses for x32 * (and make it no op on native i386). */ if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall()) info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; if (filp) { info.align_mask = get_align_mask(filp); info.align_offset += get_align_bits(); } addr = vm_unmapped_area(&info); if (!(addr & ~PAGE_MASK)) return addr; VM_BUG_ON(addr != -ENOMEM); bottomup: /* * A failed mmap() very likely causes application failure, * so fall back to the bottom-up function here. This scenario * can happen with large stack limits and large mmap() * allocations. */ return arch_get_unmapped_area(filp, addr0, len, pgoff, flags, 0); } |
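/*
 * Illustrative user-space sketch (not from the file above): exercising the
 * MAP_32BIT path that find_start_end() handles.  On x86-64 the kernel
 * confines such mappings to the 1 GiB..2 GiB window chosen above
 * (randomized within that window when PF_RANDOMIZE is set).
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	/* Expect an address in [0x40000000, 0x80000000) on x86-64. */
	printf("MAP_32BIT mapping at %p\n", p);
	munmap(p, 4096);
	return 0;
}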
// SPDX-License-Identifier: GPL-2.0
/*
 * USB Serial Console driver
 *
 * Copyright (C) 2001 - 2002 Greg Kroah-Hartman (greg@kroah.com)
 *
 * Thanks to Randy Dunlap for the original version of this code.
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/tty.h>
#include <linux/console.h>
#include <linux/serial.h>
#include <linux/usb.h>
#include <linux/usb/serial.h>

struct usbcons_info {
	int magic;
	int break_flag;
	struct usb_serial_port *port;
};

static struct usbcons_info usbcons_info;
static struct console usbcons;

/*
 * ------------------------------------------------------------
 * USB Serial console driver
 *
 * Much of the code here is copied from drivers/char/serial.c
 * and implements a phony serial console in the same way that
 * serial.c does so that in case some software queries it,
 * it will get the same results.
 *
 * Things that are different from the way the serial port code
 * does things, is that we call the lower level usb-serial
 * driver code to initialize the device, and we set the initial
 * console speeds based on the command line arguments.
 * ------------------------------------------------------------
 */

static const struct tty_operations usb_console_fake_tty_ops = {
};

/*
 * The parsing of the command line works exactly like the
 * serial.c code, except that the specifier is "ttyUSB" instead
 * of "ttyS".
*/ static int usb_console_setup(struct console *co, char *options) { struct usbcons_info *info = &usbcons_info; int baud = 9600; int bits = 8; int parity = 'n'; int doflow = 0; int cflag = CREAD | HUPCL | CLOCAL; char *s; struct usb_serial *serial; struct usb_serial_port *port; int retval; struct tty_struct *tty = NULL; struct ktermios dummy; if (options) { baud = simple_strtoul(options, NULL, 10); s = options; while (*s >= '0' && *s <= '9') s++; if (*s) parity = *s++; if (*s) bits = *s++ - '0'; if (*s) doflow = (*s++ == 'r'); } /* Sane default */ if (baud == 0) baud = 9600; switch (bits) { case 7: cflag |= CS7; break; default: case 8: cflag |= CS8; break; } switch (parity) { case 'o': case 'O': cflag |= PARODD; break; case 'e': case 'E': cflag |= PARENB; break; } if (doflow) cflag |= CRTSCTS; /* * no need to check the index here: if the index is wrong, console * code won't call us */ port = usb_serial_port_get_by_minor(co->index); if (port == NULL) { /* no device is connected yet, sorry :( */ pr_err("No USB device connected to ttyUSB%i\n", co->index); return -ENODEV; } serial = port->serial; retval = usb_autopm_get_interface(serial->interface); if (retval) goto error_get_interface; tty_port_tty_set(&port->port, NULL); info->port = port; ++port->port.count; if (!tty_port_initialized(&port->port)) { if (serial->type->set_termios) { /* * allocate a fake tty so the driver can initialize * the termios structure, then later call set_termios to * configure according to command line arguments */ tty = kzalloc_obj(*tty); if (!tty) { retval = -ENOMEM; goto reset_open_count; } kref_init(&tty->kref); tty->driver = usb_serial_tty_driver; tty->index = co->index; init_ldsem(&tty->ldisc_sem); spin_lock_init(&tty->files_lock); INIT_LIST_HEAD(&tty->tty_files); kref_get(&tty->driver->kref); __module_get(tty->driver->owner); tty->ops = &usb_console_fake_tty_ops; tty_init_termios(tty); tty_port_tty_set(&port->port, tty); } /* only call the device specific open if this * is the first time the port is opened */ retval = serial->type->open(NULL, port); if (retval) { dev_err(&port->dev, "could not open USB console port\n"); goto fail; } if (serial->type->set_termios) { tty->termios.c_cflag = cflag; tty_termios_encode_baud_rate(&tty->termios, baud, baud); memset(&dummy, 0, sizeof(struct ktermios)); serial->type->set_termios(tty, port, &dummy); tty_port_tty_set(&port->port, NULL); tty_save_termios(tty); tty_kref_put(tty); } tty_port_set_initialized(&port->port, true); } /* Now that any required fake tty operations are completed restore * the tty port count */ --port->port.count; /* The console is special in terms of closing the device so * indicate this port is now acting as a system console. 
*/ port->port.console = 1; mutex_unlock(&serial->disc_mutex); return retval; fail: tty_port_tty_set(&port->port, NULL); tty_kref_put(tty); reset_open_count: port->port.count = 0; info->port = NULL; usb_autopm_put_interface(serial->interface); error_get_interface: mutex_unlock(&serial->disc_mutex); usb_serial_put(serial); return retval; } static void usb_console_write(struct console *co, const char *buf, unsigned count) { static struct usbcons_info *info = &usbcons_info; struct usb_serial_port *port = info->port; struct usb_serial *serial; int retval = -ENODEV; if (!port || port->serial->dev->state == USB_STATE_NOTATTACHED) return; serial = port->serial; if (count == 0) return; dev_dbg(&port->dev, "%s - %d byte(s)\n", __func__, count); if (!port->port.console) { dev_dbg(&port->dev, "%s - port not opened\n", __func__); return; } while (count) { unsigned int i; unsigned int lf; /* search for LF so we can insert CR if necessary */ for (i = 0, lf = 0 ; i < count ; i++) { if (*(buf + i) == 10) { lf = 1; i++; break; } } /* pass on to the driver specific version of this function if it is available */ retval = serial->type->write(NULL, port, buf, i); dev_dbg(&port->dev, "%s - write: %d\n", __func__, retval); if (lf) { /* append CR after LF */ unsigned char cr = 13; retval = serial->type->write(NULL, port, &cr, 1); dev_dbg(&port->dev, "%s - write cr: %d\n", __func__, retval); } buf += i; count -= i; } } static struct tty_driver *usb_console_device(struct console *co, int *index) { struct tty_driver **p = (struct tty_driver **)co->data; if (!*p) return NULL; *index = co->index; return *p; } static struct console usbcons = { .name = "ttyUSB", .write = usb_console_write, .device = usb_console_device, .setup = usb_console_setup, .flags = CON_PRINTBUFFER, .index = -1, .data = &usb_serial_tty_driver, }; void usb_serial_console_disconnect(struct usb_serial *serial) { if (serial->port[0] && serial->port[0] == usbcons_info.port) { usb_serial_console_exit(); usb_serial_put(serial); } } void usb_serial_console_init(int minor) { if (minor == 0) { /* * Call register_console() if this is the first device plugged * in. If we call it earlier, then the callback to * console_setup() will fail, as there is not a device seen by * the USB subsystem yet. */ /* * Register console. * NOTES: * console_setup() is called (back) immediately (from * register_console). console_write() is called immediately * from register_console iff CON_PRINTBUFFER is set in flags. */ pr_debug("registering the USB serial console.\n"); register_console(&usbcons); } } void usb_serial_console_exit(void) { if (usbcons_info.port) { unregister_console(&usbcons); usbcons_info.port->port.console = 0; usbcons_info.port = NULL; } } |
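/*
 * Illustrative usage note (not from the file above): the option string that
 * usb_console_setup() parses comes from the kernel command line, in the same
 * form as for ttyS consoles, e.g.
 *
 *     console=ttyUSB0,115200n8
 *
 * which selects minor 0 at 115200 baud, no parity, 8 data bits; a trailing
 * 'r' (e.g. "115200n8r") sets doflow and enables CRTSCTS.
 */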
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_PREEMPT_H
#define __ASM_PREEMPT_H

#include <asm/rmwcc.h>
#include <asm/percpu.h>
#include <linux/static_call_types.h>

DECLARE_PER_CPU_CACHE_HOT(int, __preempt_count);

/* We use the MSB mostly because it's available */
#define PREEMPT_NEED_RESCHED	0x80000000

/*
 * We use the PREEMPT_NEED_RESCHED bit as an inverted NEED_RESCHED such
 * that a decrement hitting 0 means we can and should reschedule.
 */
#define PREEMPT_ENABLED	(0 + PREEMPT_NEED_RESCHED)

/*
 * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users
 * that think a non-zero value indicates we cannot preempt.
 */
static __always_inline int preempt_count(void)
{
	return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED;
}

static __always_inline void preempt_count_set(int pc)
{
	int old, new;

	old = raw_cpu_read_4(__preempt_count);
	do {
		new = (old & PREEMPT_NEED_RESCHED) |
			(pc & ~PREEMPT_NEED_RESCHED);
	} while (!raw_cpu_try_cmpxchg_4(__preempt_count, &old, new));
}

/*
 * must be macros to avoid header recursion hell
 */
#define init_task_preempt_count(p) do { } while (0)

#define init_idle_preempt_count(p, cpu) do { \
	per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \
} while (0)

/*
 * We fold the NEED_RESCHED bit into the preempt count such that
 * preempt_enable() can decrement and test for needing to reschedule with a
 * single instruction.
 *
 * We invert the actual bit, so that when the decrement hits 0 we know we both
 * need to resched (the bit is cleared) and can resched (no preempt count).
 */
static __always_inline void set_preempt_need_resched(void)
{
	raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED);
}

static __always_inline void clear_preempt_need_resched(void)
{
	raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED);
}

static __always_inline bool test_preempt_need_resched(void)
{
	return !(raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED);
}

/*
 * The various preempt_count add/sub methods
 */
static __always_inline void __preempt_count_add(int val)
{
	raw_cpu_add_4(__preempt_count, val);
}

static __always_inline void __preempt_count_sub(int val)
{
	raw_cpu_add_4(__preempt_count, -val);
}

/*
 * Because we keep PREEMPT_NEED_RESCHED set when we do _not_ need to reschedule
 * a decrement which hits zero means we have no preempt_count and should
 * reschedule.
 */
static __always_inline bool __preempt_count_dec_and_test(void)
{
	return GEN_UNARY_RMWcc("decl", __my_cpu_var(__preempt_count), e,
			       __percpu_arg([var]));
}

/*
 * Returns true when we need to resched and can (barring IRQ state).
*/ static __always_inline bool should_resched(int preempt_offset) { return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset); } #ifdef CONFIG_PREEMPTION extern asmlinkage void preempt_schedule(void); extern asmlinkage void preempt_schedule_thunk(void); #define preempt_schedule_dynamic_enabled preempt_schedule_thunk #define preempt_schedule_dynamic_disabled NULL extern asmlinkage void preempt_schedule_notrace(void); extern asmlinkage void preempt_schedule_notrace_thunk(void); #define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace_thunk #define preempt_schedule_notrace_dynamic_disabled NULL #ifdef CONFIG_PREEMPT_DYNAMIC DECLARE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled); #define __preempt_schedule() \ do { \ __STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule); \ asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule) : ASM_CALL_CONSTRAINT); \ } while (0) DECLARE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled); #define __preempt_schedule_notrace() \ do { \ __STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule_notrace); \ asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule_notrace) : ASM_CALL_CONSTRAINT); \ } while (0) #else /* PREEMPT_DYNAMIC */ #define __preempt_schedule() \ asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT); #define __preempt_schedule_notrace() \ asm volatile ("call preempt_schedule_notrace_thunk" : ASM_CALL_CONSTRAINT); #endif /* PREEMPT_DYNAMIC */ #endif /* PREEMPTION */ #endif /* __ASM_PREEMPT_H */ |
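/*
 * Illustrative worked example (not from the header above): how the inverted
 * PREEMPT_NEED_RESCHED bit lets preempt_enable() test "count reached zero
 * AND resched needed" with the single decl in __preempt_count_dec_and_test().
 *
 *   __preempt_count            meaning
 *   0x80000000 (PREEMPT_ENABLED)  preemptible, no resched pending
 *   0x80000001                    preempt_disable() held, no resched pending
 *   0x00000001                    preempt_disable() held, resched pending
 *                                 (set_preempt_need_resched() cleared the MSB)
 *
 * preempt_enable() decrements: 0x80000001 -> 0x80000000 is non-zero, so no
 * scheduler call; 0x00000001 -> 0x00000000 hits zero, so
 * __preempt_schedule() is invoked.
 */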
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Internet Control Message Protocol (ICMPv6)
 * Linux INET6 implementation
 *
 * Authors:
 * Pedro Roque <roque@di.fc.ul.pt>
 *
 * Based on net/ipv4/icmp.c
 *
 * RFC 1885
 */

/*
 * Changes:
 *
 * Andi Kleen : exception handling
 * Andi Kleen   add rate limits. never reply to a icmp.
 *              add more length checks and other fixes.
 * yoshfuji   : ensure to sent parameter problem for
 *              fragments.
 * YOSHIFUJI Hideaki @USAGI: added sysctl for icmp rate limit.
* Randy Dunlap and * YOSHIFUJI Hideaki @USAGI: Per-interface statistics support * Kazunori MIYAZAWA @USAGI: change output process to use ip6_append_data */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/netfilter.h> #include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/icmpv6.h> #include <net/ip.h> #include <net/sock.h> #include <net/ipv6.h> #include <net/ip6_checksum.h> #include <net/ping.h> #include <net/protocol.h> #include <net/raw.h> #include <net/rawv6.h> #include <net/seg6.h> #include <net/transp_v6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/icmp.h> #include <net/xfrm.h> #include <net/inet_common.h> #include <net/dsfield.h> #include <net/l3mdev.h> #include <linux/uaccess.h> static DEFINE_PER_CPU(struct sock *, ipv6_icmp_sk); static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { /* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */ struct icmp6hdr *icmp6 = (struct icmp6hdr *) (skb->data + offset); struct net *net = dev_net_rcu(skb->dev); if (type == ICMPV6_PKT_TOOBIG) ip6_update_pmtu(skb, net, info, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); else if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); if (!(type & ICMPV6_INFOMSG_MASK)) if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST) ping_err(skb, offset, ntohl(info)); return 0; } static int icmpv6_rcv(struct sk_buff *skb); static const struct inet6_protocol icmpv6_protocol = { .handler = icmpv6_rcv, .err_handler = icmpv6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; /* Called with BH disabled */ static struct sock *icmpv6_xmit_lock(struct net *net) { struct sock *sk; sk = this_cpu_read(ipv6_icmp_sk); if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { /* This can happen if the output path (f.e. SIT or * ip6ip6 tunnel) signals dst_link_failure() for an * outgoing ICMP6 packet. */ return NULL; } sock_net_set(sk, net); return sk; } static void icmpv6_xmit_unlock(struct sock *sk) { sock_net_set(sk, &init_net); spin_unlock(&sk->sk_lock.slock); } /* * Figure out, may we reply to this packet with icmp error. * * We do not reply, if: * - it was icmp error message. * - it is truncated, so that it is known, that protocol is ICMPV6 * (i.e. in the middle of some exthdr) * * --ANK (980726) */ static bool is_ineligible(const struct sk_buff *skb) { int ptr = (u8 *)(ipv6_hdr(skb) + 1) - skb->data; int len = skb->len - ptr; __u8 nexthdr = ipv6_hdr(skb)->nexthdr; __be16 frag_off; if (len < 0) return true; ptr = ipv6_skip_exthdr(skb, ptr, &nexthdr, &frag_off); if (ptr < 0) return false; if (nexthdr == IPPROTO_ICMPV6) { u8 _type, *tp; tp = skb_header_pointer(skb, ptr+offsetof(struct icmp6hdr, icmp6_type), sizeof(_type), &_type); /* Based on RFC 8200, Section 4.5 Fragment Header, return * false if this is a fragment packet with no icmp header info. */ if (!tp && frag_off != 0) return false; else if (!tp || !(*tp & ICMPV6_INFOMSG_MASK)) return true; } return false; } static bool icmpv6_mask_allow(struct net *net, int type) { if (type > ICMPV6_MSG_MAX) return true; /* Limit if icmp type is set in ratemask. 
*/ if (!test_bit(type, net->ipv6.sysctl.icmpv6_ratemask)) return true; return false; } static bool icmpv6_global_allow(struct net *net, int type, bool *apply_ratelimit) { if (icmpv6_mask_allow(net, type)) return true; if (icmp_global_allow(net)) { *apply_ratelimit = true; return true; } __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL); return false; } /* * Check the ICMP output rate limit */ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, struct flowi6 *fl6, bool apply_ratelimit) { struct net *net = sock_net(sk); struct net_device *dev; struct dst_entry *dst; bool res = false; if (!apply_ratelimit) return true; /* * Look up the output route. * XXX: perhaps the expire for routing entries cloned by * this lookup should be more aggressive (not longer than timeout). */ dst = ip6_route_output(net, sk, fl6); rcu_read_lock(); dev = dst_dev_rcu(dst); if (dst->error) { IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); } else if (dev && (dev->flags & IFF_LOOPBACK)) { res = true; } else { int tmo = READ_ONCE(net->ipv6.sysctl.icmpv6_time); struct inet_peer *peer; if (!tmo) { res = true; } else { peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr); res = inet_peer_xrlim_allow(peer, tmo); } } rcu_read_unlock(); if (!res) __ICMP6_INC_STATS(net, NULL, ICMP6_MIB_RATELIMITHOST); else icmp_global_consume(net); dst_release(dst); return res; } static bool icmpv6_rt_has_prefsrc(struct sock *sk, u8 type, struct flowi6 *fl6) { struct net *net = sock_net(sk); struct dst_entry *dst; bool res = false; dst = ip6_route_output(net, sk, fl6); if (!dst->error) { struct rt6_info *rt = dst_rt6_info(dst); struct in6_addr prefsrc; rt6_get_prefsrc(rt, &prefsrc); res = !ipv6_addr_any(&prefsrc); } dst_release(dst); return res; } /* * an inline helper for the "simple" if statement below * checks if parameter problem report is caused by an * unrecognized IPv6 option that has the Option Type * highest-order two bits set to 10 */ static bool opt_unrec(struct sk_buff *skb, __u32 offset) { u8 _optval, *op; offset += skb_network_offset(skb); op = skb_header_pointer(skb, offset, sizeof(_optval), &_optval); if (!op) return true; return (*op & 0xC0) == 0x80; } void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, struct icmp6hdr *thdr, int len) { struct sk_buff *skb; struct icmp6hdr *icmp6h; skb = skb_peek(&sk->sk_write_queue); if (!skb) return; icmp6h = icmp6_hdr(skb); memcpy(icmp6h, thdr, sizeof(struct icmp6hdr)); icmp6h->icmp6_cksum = 0; if (skb_queue_len(&sk->sk_write_queue) == 1) { skb->csum = csum_partial(icmp6h, sizeof(struct icmp6hdr), skb->csum); icmp6h->icmp6_cksum = csum_ipv6_magic(&fl6->saddr, &fl6->daddr, len, fl6->flowi6_proto, skb->csum); } else { __wsum tmp_csum = 0; skb_queue_walk(&sk->sk_write_queue, skb) { tmp_csum = csum_add(tmp_csum, skb->csum); } tmp_csum = csum_partial(icmp6h, sizeof(struct icmp6hdr), tmp_csum); icmp6h->icmp6_cksum = csum_ipv6_magic(&fl6->saddr, &fl6->daddr, len, fl6->flowi6_proto, tmp_csum); } ip6_push_pending_frames(sk); } struct icmpv6_msg { struct sk_buff *skb; int offset; uint8_t type; }; static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { struct icmpv6_msg *msg = (struct icmpv6_msg *) from; struct sk_buff *org_skb = msg->skb; __wsum csum; csum = skb_copy_and_csum_bits(org_skb, msg->offset + offset, to, len); skb->csum = csum_block_add(skb->csum, csum, odd); if (!(msg->type & ICMPV6_INFOMSG_MASK)) nf_ct_attach(skb, org_skb); return 0; } #if IS_ENABLED(CONFIG_IPV6_MIP6) static void mip6_addr_swap(struct 
sk_buff *skb, const struct inet6_skb_parm *opt) { struct ipv6hdr *iph = ipv6_hdr(skb); struct ipv6_destopt_hao *hao; int off; if (opt->dsthao) { off = ipv6_find_tlv(skb, opt->dsthao, IPV6_TLV_HAO); if (likely(off >= 0)) { hao = (struct ipv6_destopt_hao *) (skb_network_header(skb) + off); swap(iph->saddr, hao->addr); } } } #else static inline void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt) {} #endif static struct dst_entry *icmpv6_route_lookup(struct net *net, struct sk_buff *skb, struct sock *sk, struct flowi6 *fl6) { struct dst_entry *dst, *dst2; struct flowi6 fl2; int err; err = ip6_dst_lookup(net, sk, &dst, fl6); if (err) return ERR_PTR(err); /* * We won't send icmp if the destination is known * anycast unless we need to treat anycast as unicast. */ if (!READ_ONCE(net->ipv6.sysctl.icmpv6_error_anycast_as_unicast) && ipv6_anycast_destination(dst, &fl6->daddr)) { net_dbg_ratelimited("icmp6_send: acast source\n"); dst_release(dst); return ERR_PTR(-EINVAL); } /* No need to clone since we're just using its address. */ dst2 = dst; dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), sk, 0); if (!IS_ERR(dst)) { if (dst != dst2) return dst; } else { if (PTR_ERR(dst) == -EPERM) dst = NULL; else return dst; } err = xfrm_decode_session_reverse(net, skb, flowi6_to_flowi(&fl2), AF_INET6); if (err) goto relookup_failed; err = ip6_dst_lookup(net, sk, &dst2, &fl2); if (err) goto relookup_failed; dst2 = xfrm_lookup(net, dst2, flowi6_to_flowi(&fl2), sk, XFRM_LOOKUP_ICMP); if (!IS_ERR(dst2)) { dst_release(dst); dst = dst2; } else { err = PTR_ERR(dst2); if (err == -EPERM) { dst_release(dst); return dst2; } else goto relookup_failed; } relookup_failed: if (dst) return dst; return ERR_PTR(err); } static struct net_device *icmp6_dev(const struct sk_buff *skb) { struct net_device *dev = skb->dev; /* for local traffic to local address, skb dev is the loopback * device. Check if there is a dst attached to the skb and if so * get the real device index. Same is needed for replies to a link * local address on a device enslaved to an L3 master device */ if (unlikely(dev->ifindex == LOOPBACK_IFINDEX || netif_is_l3_master(skb->dev))) { const struct rt6_info *rt6 = skb_rt6_info(skb); /* The destination could be an external IP in Ext Hdr (SRv6, RPL, etc.), * and ip6_null_entry could be set to skb if no route is found. */ if (rt6 && rt6->rt6i_idev) dev = rt6->rt6i_idev->dev; } return dev; } static int icmp6_iif(const struct sk_buff *skb) { return icmp6_dev(skb)->ifindex; } struct icmp6_ext_iio_addr6_subobj { __be16 afi; __be16 reserved; struct in6_addr addr6; }; static unsigned int icmp6_ext_iio_len(void) { return sizeof(struct icmp_extobj_hdr) + /* ifIndex */ sizeof(__be32) + /* Interface Address Sub-Object */ sizeof(struct icmp6_ext_iio_addr6_subobj) + /* Interface Name Sub-Object. Length must be a multiple of 4 * bytes. */ ALIGN(sizeof(struct icmp_ext_iio_name_subobj), 4) + /* MTU */ sizeof(__be32); } static unsigned int icmp6_ext_max_len(u8 ext_objs) { unsigned int ext_max_len; ext_max_len = sizeof(struct icmp_ext_hdr); if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF)) ext_max_len += icmp6_ext_iio_len(); return ext_max_len; } static struct in6_addr *icmp6_ext_iio_addr6_find(const struct net_device *dev) { struct inet6_dev *in6_dev; struct inet6_ifaddr *ifa; in6_dev = __in6_dev_get(dev); if (!in6_dev) return NULL; /* It is unclear from RFC 5837 which IP address should be chosen, but * it makes sense to choose a global unicast address. 
*/ list_for_each_entry_rcu(ifa, &in6_dev->addr_list, if_list) { if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DADFAILED)) continue; if (ipv6_addr_type(&ifa->addr) != IPV6_ADDR_UNICAST || ipv6_addr_src_scope(&ifa->addr) != IPV6_ADDR_SCOPE_GLOBAL) continue; return &ifa->addr; } return NULL; } static void icmp6_ext_iio_iif_append(struct net *net, struct sk_buff *skb, int iif) { struct icmp_ext_iio_name_subobj *name_subobj; struct icmp_extobj_hdr *objh; struct net_device *dev; struct in6_addr *addr6; __be32 data; if (!iif) return; /* Add the fields in the order specified by RFC 5837. */ objh = skb_put(skb, sizeof(*objh)); objh->class_num = ICMP_EXT_OBJ_CLASS_IIO; objh->class_type = ICMP_EXT_CTYPE_IIO_ROLE(ICMP_EXT_CTYPE_IIO_ROLE_IIF); data = htonl(iif); skb_put_data(skb, &data, sizeof(__be32)); objh->class_type |= ICMP_EXT_CTYPE_IIO_IFINDEX; rcu_read_lock(); dev = dev_get_by_index_rcu(net, iif); if (!dev) goto out; addr6 = icmp6_ext_iio_addr6_find(dev); if (addr6) { struct icmp6_ext_iio_addr6_subobj *addr6_subobj; addr6_subobj = skb_put_zero(skb, sizeof(*addr6_subobj)); addr6_subobj->afi = htons(ICMP_AFI_IP6); addr6_subobj->addr6 = *addr6; objh->class_type |= ICMP_EXT_CTYPE_IIO_IPADDR; } name_subobj = skb_put_zero(skb, ALIGN(sizeof(*name_subobj), 4)); name_subobj->len = ALIGN(sizeof(*name_subobj), 4); netdev_copy_name(dev, name_subobj->name); objh->class_type |= ICMP_EXT_CTYPE_IIO_NAME; data = htonl(READ_ONCE(dev->mtu)); skb_put_data(skb, &data, sizeof(__be32)); objh->class_type |= ICMP_EXT_CTYPE_IIO_MTU; out: rcu_read_unlock(); objh->length = htons(skb_tail_pointer(skb) - (unsigned char *)objh); } static void icmp6_ext_objs_append(struct net *net, struct sk_buff *skb, u8 ext_objs, int iif) { if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF)) icmp6_ext_iio_iif_append(net, skb, iif); } static struct sk_buff * icmp6_ext_append(struct net *net, struct sk_buff *skb_in, struct icmp6hdr *icmp6h, unsigned int room, int iif) { unsigned int payload_len, ext_max_len, ext_len; struct icmp_ext_hdr *ext_hdr; struct sk_buff *skb; u8 ext_objs; int nhoff; switch (icmp6h->icmp6_type) { case ICMPV6_DEST_UNREACH: case ICMPV6_TIME_EXCEED: break; default: return NULL; } /* Do not overwrite existing extensions. This can happen when we * receive an ICMPv4 message with extensions from a tunnel and * translate it to an ICMPv6 message towards an IPv6 host in the * overlay network. */ if (icmp6h->icmp6_datagram_len) return NULL; ext_objs = READ_ONCE(net->ipv6.sysctl.icmpv6_errors_extension_mask); if (!ext_objs) return NULL; ext_max_len = icmp6_ext_max_len(ext_objs); if (ICMP_EXT_ORIG_DGRAM_MIN_LEN + ext_max_len > room) return NULL; skb = skb_clone(skb_in, GFP_ATOMIC); if (!skb) return NULL; nhoff = skb_network_offset(skb); payload_len = min(skb->len - nhoff, ICMP_EXT_ORIG_DGRAM_MIN_LEN); if (!pskb_network_may_pull(skb, payload_len)) goto free_skb; if (pskb_trim(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN) || __skb_put_padto(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN, false)) goto free_skb; if (pskb_expand_head(skb, 0, ext_max_len, GFP_ATOMIC)) goto free_skb; ext_hdr = skb_put_zero(skb, sizeof(*ext_hdr)); ext_hdr->version = ICMP_EXT_VERSION_2; icmp6_ext_objs_append(net, skb, ext_objs, iif); /* Do not send an empty extension structure. */ ext_len = skb_tail_pointer(skb) - (unsigned char *)ext_hdr; if (ext_len == sizeof(*ext_hdr)) goto free_skb; ext_hdr->checksum = ip_compute_csum(ext_hdr, ext_len); /* The length of the original datagram in 64-bit words (RFC 4884). 
*/ icmp6h->icmp6_datagram_len = ICMP_EXT_ORIG_DGRAM_MIN_LEN / sizeof(u64); return skb; free_skb: consume_skb(skb); return NULL; } /* * Send an ICMP message in response to a packet in error */ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct in6_addr *force_saddr, const struct inet6_skb_parm *parm) { struct inet6_dev *idev = NULL; struct ipv6hdr *hdr = ipv6_hdr(skb); struct sock *sk; struct net *net; struct ipv6_pinfo *np; const struct in6_addr *saddr = NULL; bool apply_ratelimit = false; struct sk_buff *ext_skb; struct dst_entry *dst; unsigned int room; struct icmp6hdr tmp_hdr; struct flowi6 fl6; struct icmpv6_msg msg; struct ipcm6_cookie ipc6; int iif = 0; int addr_type = 0; int len; u32 mark; if ((u8 *)hdr < skb->head || (skb_network_header(skb) + sizeof(*hdr)) > skb_tail_pointer(skb)) return; if (!skb->dev) return; rcu_read_lock(); net = dev_net_rcu(skb->dev); mark = IP6_REPLY_MARK(net, skb->mark); /* * Make sure we respect the rules * i.e. RFC 1885 2.4(e) * Rule (e.1) is enforced by not using icmp6_send * in any code that processes icmp errors. */ addr_type = ipv6_addr_type(&hdr->daddr); if (ipv6_chk_addr(net, &hdr->daddr, skb->dev, 0) || ipv6_chk_acast_addr_src(net, skb->dev, &hdr->daddr)) saddr = &hdr->daddr; /* * Dest addr check */ if (addr_type & IPV6_ADDR_MULTICAST || skb->pkt_type != PACKET_HOST) { if (type != ICMPV6_PKT_TOOBIG && !(type == ICMPV6_PARAMPROB && code == ICMPV6_UNK_OPTION && (opt_unrec(skb, info)))) goto out; saddr = NULL; } addr_type = ipv6_addr_type(&hdr->saddr); /* * Source addr check */ if (__ipv6_addr_needs_scope_id(addr_type)) { iif = icmp6_iif(skb); } else { /* * The source device is used for looking up which routing table * to use for sending an ICMP error. */ iif = l3mdev_master_ifindex(skb->dev); } /* * Must not send error if the source does not uniquely * identify a single node (RFC2463 Section 2.4). * We check unspecified / multicast addresses here, * and anycast addresses will be checked later. */ if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { net_dbg_ratelimited("icmp6_send: addr_any/mcast source [%pI6c > %pI6c]\n", &hdr->saddr, &hdr->daddr); goto out; } /* * Never answer to a ICMP packet. 
*/ if (is_ineligible(skb)) { net_dbg_ratelimited("icmp6_send: no reply to icmp error [%pI6c > %pI6c]\n", &hdr->saddr, &hdr->daddr); goto out; } /* Needed by both icmpv6_global_allow and icmpv6_xmit_lock */ local_bh_disable(); /* Check global sysctl_icmp_msgs_per_sec ratelimit */ if (!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, type, &apply_ratelimit)) goto out_bh_enable; mip6_addr_swap(skb, parm); sk = icmpv6_xmit_lock(net); if (!sk) goto out_bh_enable; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_ICMPV6; fl6.daddr = hdr->saddr; if (force_saddr) saddr = force_saddr; if (saddr) { fl6.saddr = *saddr; } else if (!icmpv6_rt_has_prefsrc(sk, type, &fl6)) { /* select a more meaningful saddr from input if */ struct net_device *in_netdev; in_netdev = dev_get_by_index(net, parm->iif); if (in_netdev) { ipv6_dev_get_saddr(net, in_netdev, &fl6.daddr, inet6_sk(sk)->srcprefs, &fl6.saddr); dev_put(in_netdev); } } fl6.flowi6_mark = mark; fl6.flowi6_oif = iif; fl6.fl6_icmp_type = type; fl6.fl6_icmp_code = code; fl6.flowi6_uid = sock_net_uid(net, NULL); fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL); security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6)); np = inet6_sk(sk); if (!icmpv6_xrlim_allow(sk, type, &fl6, apply_ratelimit)) goto out_unlock; tmp_hdr.icmp6_type = type; tmp_hdr.icmp6_code = code; tmp_hdr.icmp6_cksum = 0; tmp_hdr.icmp6_pointer = htonl(info); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) fl6.flowi6_oif = READ_ONCE(np->mcast_oif); else if (!fl6.flowi6_oif) fl6.flowi6_oif = READ_ONCE(np->ucast_oif); ipcm6_init_sk(&ipc6, sk); ipc6.sockc.mark = mark; fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = icmpv6_route_lookup(net, skb, sk, &fl6); if (IS_ERR(dst)) goto out_unlock; ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); msg.skb = skb; msg.offset = skb_network_offset(skb); msg.type = type; room = IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr); ext_skb = icmp6_ext_append(net, skb, &tmp_hdr, room, parm->iif); if (ext_skb) msg.skb = ext_skb; len = msg.skb->len - msg.offset; len = min_t(unsigned int, len, room); if (len < 0) { net_dbg_ratelimited("icmp: len problem [%pI6c > %pI6c]\n", &hdr->saddr, &hdr->daddr); goto out_dst_release; } idev = __in6_dev_get(skb->dev); if (ip6_append_data(sk, icmpv6_getfrag, &msg, len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), &ipc6, &fl6, dst_rt6_info(dst), MSG_DONTWAIT)) { ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, len + sizeof(struct icmp6hdr)); } out_dst_release: if (ext_skb) consume_skb(ext_skb); dst_release(dst); out_unlock: icmpv6_xmit_unlock(sk); out_bh_enable: local_bh_enable(); out: rcu_read_unlock(); } EXPORT_SYMBOL(icmp6_send); /* Slightly more convenient version of icmp6_send with drop reasons. 
*/ void icmpv6_param_prob_reason(struct sk_buff *skb, u8 code, int pos, enum skb_drop_reason reason) { icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL, IP6CB(skb)); kfree_skb_reason(skb, reason); } /* Generate icmpv6 with type/code ICMPV6_DEST_UNREACH/ICMPV6_ADDR_UNREACH * if sufficient data bytes are available * @nhs is the size of the tunnel header(s) : * Either an IPv4 header for SIT encap * an IPv4 header + GRE header for GRE encap */ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, unsigned int data_len) { struct in6_addr temp_saddr; struct rt6_info *rt; struct sk_buff *skb2; u32 info = 0; if (!pskb_may_pull(skb, nhs + sizeof(struct ipv6hdr) + 8)) return 1; /* RFC 4884 (partial) support for ICMP extensions */ if (data_len < 128 || (data_len & 7) || skb->len < data_len) data_len = 0; skb2 = data_len ? skb_copy(skb, GFP_ATOMIC) : skb_clone(skb, GFP_ATOMIC); if (!skb2) return 1; /* Remove debris left by IPv4 stack. */ memset(IP6CB(skb2), 0, sizeof(*IP6CB(skb2))); skb_dst_drop(skb2); skb_pull(skb2, nhs); skb_reset_network_header(skb2); rt = rt6_lookup(dev_net_rcu(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, skb, 0); if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &temp_saddr); if (data_len) { /* RFC 4884 (partial) support : * insert 0 padding at the end, before the extensions */ __skb_push(skb2, nhs); skb_reset_network_header(skb2); memmove(skb2->data, skb2->data + nhs, data_len - nhs); memset(skb2->data + data_len - nhs, 0, nhs); /* RFC 4884 4.5 : Length is measured in 64-bit words, * and stored in reserved[0] */ info = (data_len/8) << 24; } if (type == ICMP_TIME_EXCEEDED) icmp6_send(skb2, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, info, &temp_saddr, IP6CB(skb2)); else icmp6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, info, &temp_saddr, IP6CB(skb2)); if (rt) ip6_rt_put(rt); kfree_skb(skb2); return 0; } EXPORT_SYMBOL(ip6_err_gen_icmpv6_unreach); static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb) { struct net *net = dev_net_rcu(skb->dev); struct sock *sk; struct inet6_dev *idev; struct ipv6_pinfo *np; const struct in6_addr *saddr = NULL; struct icmp6hdr *icmph = icmp6_hdr(skb); bool apply_ratelimit = false; struct icmp6hdr tmp_hdr; struct flowi6 fl6; struct icmpv6_msg msg; struct dst_entry *dst; struct ipcm6_cookie ipc6; u32 mark = IP6_REPLY_MARK(net, skb->mark); SKB_DR(reason); bool acast; u8 type; if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) && net->ipv6.sysctl.icmpv6_echo_ignore_multicast) return reason; saddr = &ipv6_hdr(skb)->daddr; acast = ipv6_anycast_destination(skb_dst(skb), saddr); if (acast && net->ipv6.sysctl.icmpv6_echo_ignore_anycast) return reason; if (!ipv6_unicast_destination(skb) && !(net->ipv6.sysctl.anycast_src_echo_reply && acast)) saddr = NULL; if (icmph->icmp6_type == ICMPV6_EXT_ECHO_REQUEST) type = ICMPV6_EXT_ECHO_REPLY; else type = ICMPV6_ECHO_REPLY; memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr)); tmp_hdr.icmp6_type = type; memset(&fl6, 0, sizeof(fl6)); if (READ_ONCE(net->ipv6.sysctl.flowlabel_reflect) & FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES) fl6.flowlabel = ip6_flowlabel(ipv6_hdr(skb)); fl6.flowi6_proto = IPPROTO_ICMPV6; fl6.daddr = ipv6_hdr(skb)->saddr; if (saddr) fl6.saddr = *saddr; fl6.flowi6_oif = ipv6_addr_loopback(&fl6.daddr) ? 
skb->dev->ifindex : icmp6_iif(skb); fl6.fl6_icmp_type = type; fl6.flowi6_mark = mark; fl6.flowi6_uid = sock_net_uid(net, NULL); security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6)); local_bh_disable(); sk = icmpv6_xmit_lock(net); if (!sk) goto out_bh_enable; np = inet6_sk(sk); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) fl6.flowi6_oif = READ_ONCE(np->mcast_oif); else if (!fl6.flowi6_oif) fl6.flowi6_oif = READ_ONCE(np->ucast_oif); if (ip6_dst_lookup(net, sk, &dst, &fl6)) goto out; dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0); if (IS_ERR(dst)) goto out; /* Check the ratelimit */ if ((!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, ICMPV6_ECHO_REPLY, &apply_ratelimit)) || !icmpv6_xrlim_allow(sk, ICMPV6_ECHO_REPLY, &fl6, apply_ratelimit)) goto out_dst_release; idev = __in6_dev_get(skb->dev); msg.skb = skb; msg.offset = 0; msg.type = type; ipcm6_init_sk(&ipc6, sk); ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); ipc6.tclass = ipv6_get_dsfield(ipv6_hdr(skb)); ipc6.sockc.mark = mark; if (icmph->icmp6_type == ICMPV6_EXT_ECHO_REQUEST) if (!icmp_build_probe(skb, (struct icmphdr *)&tmp_hdr)) goto out_dst_release; if (ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), &ipc6, &fl6, dst_rt6_info(dst), MSG_DONTWAIT)) { __ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, skb->len + sizeof(struct icmp6hdr)); reason = SKB_CONSUMED; } out_dst_release: dst_release(dst); out: icmpv6_xmit_unlock(sk); out_bh_enable: local_bh_enable(); return reason; } enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) { struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net_rcu(skb->dev); const struct inet6_protocol *ipprot; enum skb_drop_reason reason; int inner_offset; __be16 frag_off; u8 nexthdr; reason = pskb_may_pull_reason(skb, sizeof(struct ipv6hdr)); if (reason != SKB_NOT_DROPPED_YET) goto out; seg6_icmp_srh(skb, opt); nexthdr = ((struct ipv6hdr *)skb->data)->nexthdr; if (ipv6_ext_hdr(nexthdr)) { /* now skip over extension headers */ inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); if (inner_offset < 0) { SKB_DR_SET(reason, IPV6_BAD_EXTHDR); goto out; } } else { inner_offset = sizeof(struct ipv6hdr); } /* Checkin header including 8 bytes of inner protocol header. */ reason = pskb_may_pull_reason(skb, inner_offset + 8); if (reason != SKB_NOT_DROPPED_YET) goto out; if (nexthdr == IPPROTO_RAW) { /* Add a more specific reason later ? */ reason = SKB_DROP_REASON_NOT_SPECIFIED; goto out; } /* BUGGG_FUTURE: we should try to parse exthdrs in this packet. Without this we will not able f.e. to make source routed pmtu discovery. Corresponding argument (opt) to notifiers is already added. 
--ANK (980726) */ ipprot = rcu_dereference(inet6_protos[nexthdr]); if (ipprot && ipprot->err_handler) ipprot->err_handler(skb, opt, type, code, inner_offset, info); raw6_icmp_error(skb, nexthdr, type, code, inner_offset, info); return SKB_CONSUMED; out: __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); return reason; } /* * Handle icmp messages */ static int icmpv6_rcv(struct sk_buff *skb) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct net *net = dev_net_rcu(skb->dev); struct net_device *dev = icmp6_dev(skb); struct inet6_dev *idev = __in6_dev_get(dev); const struct in6_addr *saddr, *daddr; struct icmp6hdr *hdr; u8 type; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { struct sec_path *sp = skb_sec_path(skb); int nh; if (!(sp && sp->xvec[sp->len - 1]->props.flags & XFRM_STATE_ICMP)) { reason = SKB_DROP_REASON_XFRM_POLICY; goto drop_no_count; } if (!pskb_may_pull(skb, sizeof(*hdr) + sizeof(struct ipv6hdr))) goto drop_no_count; nh = skb_network_offset(skb); skb_set_network_header(skb, sizeof(*hdr)); if (!xfrm6_policy_check_reverse(NULL, XFRM_POLICY_IN, skb)) { reason = SKB_DROP_REASON_XFRM_POLICY; goto drop_no_count; } skb_set_network_header(skb, nh); } __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INMSGS); saddr = &ipv6_hdr(skb)->saddr; daddr = &ipv6_hdr(skb)->daddr; if (skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo)) { net_dbg_ratelimited("ICMPv6 checksum failed [%pI6c > %pI6c]\n", saddr, daddr); goto csum_error; } if (!pskb_pull(skb, sizeof(*hdr))) goto discard_it; hdr = icmp6_hdr(skb); type = hdr->icmp6_type; ICMP6MSGIN_INC_STATS(dev_net_rcu(dev), idev, type); switch (type) { case ICMPV6_ECHO_REQUEST: if (!net->ipv6.sysctl.icmpv6_echo_ignore_all) reason = icmpv6_echo_reply(skb); break; case ICMPV6_EXT_ECHO_REQUEST: if (!net->ipv6.sysctl.icmpv6_echo_ignore_all && READ_ONCE(net->ipv4.sysctl_icmp_echo_enable_probe)) reason = icmpv6_echo_reply(skb); break; case ICMPV6_ECHO_REPLY: case ICMPV6_EXT_ECHO_REPLY: ping_rcv(skb); return 0; case ICMPV6_PKT_TOOBIG: /* BUGGG_FUTURE: if packet contains rthdr, we cannot update standard destination cache. Seems, only "advanced" destination cache will allow to solve this problem --ANK (980726) */ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto discard_it; hdr = icmp6_hdr(skb); /* to notify */ fallthrough; case ICMPV6_DEST_UNREACH: case ICMPV6_TIME_EXCEED: case ICMPV6_PARAMPROB: reason = icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu); break; case NDISC_ROUTER_SOLICITATION: case NDISC_ROUTER_ADVERTISEMENT: case NDISC_NEIGHBOUR_SOLICITATION: case NDISC_NEIGHBOUR_ADVERTISEMENT: case NDISC_REDIRECT: reason = ndisc_rcv(skb); break; case ICMPV6_MGM_QUERY: igmp6_event_query(skb); return 0; case ICMPV6_MGM_REPORT: igmp6_event_report(skb); return 0; case ICMPV6_MGM_REDUCTION: case ICMPV6_NI_QUERY: case ICMPV6_NI_REPLY: case ICMPV6_MLD2_REPORT: case ICMPV6_DHAAD_REQUEST: case ICMPV6_DHAAD_REPLY: case ICMPV6_MOBILE_PREFIX_SOL: case ICMPV6_MOBILE_PREFIX_ADV: break; default: /* informational */ if (type & ICMPV6_INFOMSG_MASK) break; net_dbg_ratelimited("icmpv6: msg of unknown type [%pI6c > %pI6c]\n", saddr, daddr); /* * error of unknown type. 
* must pass to upper level */ reason = icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu); } /* until the v6 path can be better sorted assume failure and * preserve the status quo behaviour for the rest of the paths to here */ if (reason) kfree_skb_reason(skb, reason); else consume_skb(skb); return 0; csum_error: reason = SKB_DROP_REASON_ICMP_CSUM; __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_CSUMERRORS); discard_it: __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INERRORS); drop_no_count: kfree_skb_reason(skb, reason); return 0; } void icmpv6_flow_init(const struct sock *sk, struct flowi6 *fl6, u8 type, const struct in6_addr *saddr, const struct in6_addr *daddr, int oif) { memset(fl6, 0, sizeof(*fl6)); fl6->saddr = *saddr; fl6->daddr = *daddr; fl6->flowi6_proto = IPPROTO_ICMPV6; fl6->fl6_icmp_type = type; fl6->fl6_icmp_code = 0; fl6->flowi6_oif = oif; security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); } int __init icmpv6_init(void) { struct sock *sk; int err, i; for_each_possible_cpu(i) { err = inet_ctl_sock_create(&sk, PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, &init_net); if (err < 0) { pr_err("Failed to initialize the ICMP6 control socket (err %d)\n", err); return err; } per_cpu(ipv6_icmp_sk, i) = sk; /* Enough space for 2 64K ICMP packets, including * sk_buff struct overhead. */ sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024); } err = -EAGAIN; if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) < 0) goto fail; return 0; fail: pr_err("Failed to register ICMP6 protocol\n"); return err; } void icmpv6_cleanup(void) { inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6); } static const struct icmp6_err { int err; int fatal; } tab_unreach[] = { { /* NOROUTE */ .err = ENETUNREACH, .fatal = 0, }, { /* ADM_PROHIBITED */ .err = EACCES, .fatal = 1, }, { /* Was NOT_NEIGHBOUR, now reserved */ .err = EHOSTUNREACH, .fatal = 0, }, { /* ADDR_UNREACH */ .err = EHOSTUNREACH, .fatal = 0, }, { /* PORT_UNREACH */ .err = ECONNREFUSED, .fatal = 1, }, { /* POLICY_FAIL */ .err = EACCES, .fatal = 1, }, { /* REJECT_ROUTE */ .err = EACCES, .fatal = 1, }, }; int icmpv6_err_convert(u8 type, u8 code, int *err) { int fatal = 0; *err = EPROTO; switch (type) { case ICMPV6_DEST_UNREACH: fatal = 1; if (code < ARRAY_SIZE(tab_unreach)) { *err = tab_unreach[code].err; fatal = tab_unreach[code].fatal; } break; case ICMPV6_PKT_TOOBIG: *err = EMSGSIZE; break; case ICMPV6_PARAMPROB: *err = EPROTO; fatal = 1; break; case ICMPV6_TIME_EXCEED: *err = EHOSTUNREACH; break; } return fatal; } EXPORT_SYMBOL(icmpv6_err_convert); #ifdef CONFIG_SYSCTL static u32 icmpv6_errors_extension_mask_all = GENMASK_U8(ICMP_ERR_EXT_COUNT - 1, 0); static struct ctl_table ipv6_icmp_table_template[] = { { .procname = "ratelimit", .data = &init_net.ipv6.sysctl.icmpv6_time, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, { .procname = "echo_ignore_all", .data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_all, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "echo_ignore_multicast", .data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_multicast, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "echo_ignore_anycast", .data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_anycast, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "ratemask", .data = &init_net.ipv6.sysctl.icmpv6_ratemask_ptr, .maxlen = ICMPV6_MSG_MAX + 1, .mode = 0644, .proc_handler = proc_do_large_bitmap, }, { .procname = 
"error_anycast_as_unicast", .data = &init_net.ipv6.sysctl.icmpv6_error_anycast_as_unicast, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "errors_extension_mask", .data = &init_net.ipv6.sysctl.icmpv6_errors_extension_mask, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &icmpv6_errors_extension_mask_all, }, }; struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net) { struct ctl_table *table; table = kmemdup(ipv6_icmp_table_template, sizeof(ipv6_icmp_table_template), GFP_KERNEL); if (table) { table[0].data = &net->ipv6.sysctl.icmpv6_time; table[1].data = &net->ipv6.sysctl.icmpv6_echo_ignore_all; table[2].data = &net->ipv6.sysctl.icmpv6_echo_ignore_multicast; table[3].data = &net->ipv6.sysctl.icmpv6_echo_ignore_anycast; table[4].data = &net->ipv6.sysctl.icmpv6_ratemask_ptr; table[5].data = &net->ipv6.sysctl.icmpv6_error_anycast_as_unicast; table[6].data = &net->ipv6.sysctl.icmpv6_errors_extension_mask; } return table; } size_t ipv6_icmp_sysctl_table_size(void) { return ARRAY_SIZE(ipv6_icmp_table_template); } #endif |
/*
 * Copyright IBM Corporation, 2012
 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2.1 of the GNU Lesser General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 */

#ifndef _LINUX_HUGETLB_CGROUP_H
#define _LINUX_HUGETLB_CGROUP_H

#include <linux/mmdebug.h>

struct hugetlb_cgroup;
struct resv_map;
struct file_region;

#ifdef CONFIG_CGROUP_HUGETLB

enum hugetlb_memory_event {
	HUGETLB_MAX,
	HUGETLB_NR_MEMORY_EVENTS,
};

struct hugetlb_cgroup_per_node {
	/* hugetlb usage in pages over all hstates. */
	unsigned long usage[HUGE_MAX_HSTATE];
};

struct hugetlb_cgroup {
	struct cgroup_subsys_state css;

	/*
	 * the counter to account for hugepages from hugetlb.
	 */
	struct page_counter hugepage[HUGE_MAX_HSTATE];

	/*
	 * the counter to account for hugepage reservations from hugetlb.
*/ struct page_counter rsvd_hugepage[HUGE_MAX_HSTATE]; atomic_long_t events[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS]; atomic_long_t events_local[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS]; /* Handle for "hugetlb.events" */ struct cgroup_file events_file[HUGE_MAX_HSTATE]; /* Handle for "hugetlb.events.local" */ struct cgroup_file events_local_file[HUGE_MAX_HSTATE]; struct hugetlb_cgroup_per_node *nodeinfo[]; }; static inline struct hugetlb_cgroup * __hugetlb_cgroup_from_folio(struct folio *folio, bool rsvd) { VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); if (rsvd) return folio->_hugetlb_cgroup_rsvd; else return folio->_hugetlb_cgroup; } static inline struct hugetlb_cgroup *hugetlb_cgroup_from_folio(struct folio *folio) { return __hugetlb_cgroup_from_folio(folio, false); } static inline struct hugetlb_cgroup * hugetlb_cgroup_from_folio_rsvd(struct folio *folio) { return __hugetlb_cgroup_from_folio(folio, true); } static inline void __set_hugetlb_cgroup(struct folio *folio, struct hugetlb_cgroup *h_cg, bool rsvd) { VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); if (rsvd) folio->_hugetlb_cgroup_rsvd = h_cg; else folio->_hugetlb_cgroup = h_cg; } static inline void set_hugetlb_cgroup(struct folio *folio, struct hugetlb_cgroup *h_cg) { __set_hugetlb_cgroup(folio, h_cg, false); } static inline void set_hugetlb_cgroup_rsvd(struct folio *folio, struct hugetlb_cgroup *h_cg) { __set_hugetlb_cgroup(folio, h_cg, true); } static inline bool hugetlb_cgroup_disabled(void) { return !cgroup_subsys_enabled(hugetlb_cgrp_subsys); } static inline void hugetlb_cgroup_put_rsvd_cgroup(struct hugetlb_cgroup *h_cg) { css_put(&h_cg->css); } static inline void resv_map_dup_hugetlb_cgroup_uncharge_info( struct resv_map *resv_map) { if (resv_map->css) css_get(resv_map->css); } static inline void resv_map_put_hugetlb_cgroup_uncharge_info( struct resv_map *resv_map) { if (resv_map->css) css_put(resv_map->css); } extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr); extern int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr); extern void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct folio *folio); extern void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct folio *folio); extern void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, struct folio *folio); extern void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, struct folio *folio); extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg); extern void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg); extern void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start, unsigned long end); extern void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, struct file_region *rg, unsigned long nr_pages, bool region_del); extern void hugetlb_cgroup_file_init(void) __init; extern void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio); #else static inline void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, struct file_region *rg, unsigned long nr_pages, bool region_del) { } static inline struct hugetlb_cgroup *hugetlb_cgroup_from_folio(struct folio *folio) { return NULL; } static inline struct hugetlb_cgroup * hugetlb_cgroup_from_folio_rsvd(struct folio *folio) { return NULL; } 
static inline void set_hugetlb_cgroup(struct folio *folio, struct hugetlb_cgroup *h_cg) { } static inline void set_hugetlb_cgroup_rsvd(struct folio *folio, struct hugetlb_cgroup *h_cg) { } static inline bool hugetlb_cgroup_disabled(void) { return true; } static inline void hugetlb_cgroup_put_rsvd_cgroup(struct hugetlb_cgroup *h_cg) { } static inline void resv_map_dup_hugetlb_cgroup_uncharge_info( struct resv_map *resv_map) { } static inline void resv_map_put_hugetlb_cgroup_uncharge_info( struct resv_map *resv_map) { } static inline int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { return 0; } static inline int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { return 0; } static inline void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct folio *folio) { } static inline void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct folio *folio) { } static inline void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, struct folio *folio) { } static inline void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, struct folio *folio) { } static inline void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg) { } static inline void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg) { } static inline void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start, unsigned long end) { } static inline void hugetlb_cgroup_file_init(void) { } static inline void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio) { } #endif /* CONFIG_MEM_RES_CTLR_HUGETLB */ #endif |
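/*
 * Editor's note: the #else branch of this header shows the usual kernel idiom
 * for compiling a feature out: every helper that the real implementation
 * provides gets an empty or constant-returning static inline stub, so callers
 * need no #ifdef CONFIG_CGROUP_HUGETLB of their own and the compiler discards
 * the calls entirely. (The closing "#endif /* CONFIG_MEM_RES_CTLR_HUGETLB */"
 * comment above still names the old option; the guard it actually closes is
 * CONFIG_CGROUP_HUGETLB.) A minimal standalone sketch of the stub pattern
 * follows; CONFIG_FOO and foo_charge() are made-up names for the illustration,
 * not real kernel symbols.
 */
#include <stdio.h>

#ifdef CONFIG_FOO
/* Real implementation, provided elsewhere when the feature is built in. */
int foo_charge(unsigned long nr_pages);
#else
/* Feature compiled out: the stub always succeeds and costs nothing. */
static inline int foo_charge(unsigned long nr_pages)
{
        (void)nr_pages;
        return 0;
}
#endif

int main(void)
{
        /* The caller is written once and works whether or not CONFIG_FOO is set. */
        if (foo_charge(1))
                printf("charge failed\n");
        else
                printf("charge succeeded (or the feature is disabled)\n");
        return 0;
}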
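/*
 * Editor's note: the file that follows is the kernel's time/adjtimex syscall
 * glue (kernel/time/time.c). Among its helpers is mktime64(), which converts
 * a Gregorian calendar date to seconds since 1970-01-01 00:00:00 UTC with a
 * closed-form, Gauss-style day count (months are renumbered so February comes
 * last and carries the leap day). The standalone userspace sketch below
 * reproduces that exact arithmetic so it can be checked against known dates;
 * civil_to_unix() is a name invented for this example, but the formula is the
 * one used by mktime64().
 */
#include <stdio.h>
#include <stdint.h>

/* Same arithmetic as mktime64(): year/month/day/h/m/s -> seconds since epoch. */
static int64_t civil_to_unix(unsigned int year, unsigned int mon,
                             unsigned int day, unsigned int hour,
                             unsigned int min, unsigned int sec)
{
        /* Shift the year so it starts in March: 1..12 -> 11,12,1..10. */
        if ((int)(mon -= 2) <= 0) {
                mon += 12;      /* February becomes the last month */
                year -= 1;
        }

        return ((((int64_t)(year / 4 - year / 100 + year / 400 +
                            367 * mon / 12 + day) +
                  year * 365 - 719499) * 24 + hour) * 60 + min) * 60 + sec;
}

int main(void)
{
        /* 1970-01-01 00:00:00 UTC is second 0; 2000-03-01 00:00:00 UTC is 951868800. */
        printf("%lld\n", (long long)civil_to_unix(1970, 1, 1, 0, 0, 0));
        printf("%lld\n", (long long)civil_to_unix(2000, 3, 1, 0, 0, 0));
        return 0;
}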
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 *
 * This file contains the interface functions for the various time related
 * system calls: time, stime, gettimeofday, settimeofday, adjtime
 *
 * Modification history:
 *
 * 1993-09-02	Philip Gladstone
 *	Created file with time related functions from sched/core.c and adjtimex()
 * 1993-10-08	Torsten Duwe
 *	adjtime interface update and CMOS clock write code
 * 1995-08-13	Torsten Duwe
 *	kernel PLL updated to 1994-12-13 specs (rfc-1589)
 * 1999-01-16	Ulrich Windl
 *	Introduced error checking for many cases in adjtimex().
 *	Updated NTP code according to technical memorandum Jan '96
 *	"A Kernel Model for Precision Timekeeping" by Dave Mills
 *	Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10)
 *	(Even though the technical memorandum forbids it)
 * 2004-07-14	Christoph Lameter
 *	Added getnstimeofday to allow the posix timer functions to return
 *	with nanosecond accuracy
 */

#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/timex.h>
#include <linux/capability.h>
#include <linux/timekeeper_internal.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/security.h>
#include <linux/fs.h>
#include <linux/math64.h>
#include <linux/ptrace.h>

#include <linux/uaccess.h>
#include <linux/compat.h>
#include <asm/unistd.h>

#include <generated/timeconst.h>
#include "timekeeping.h"

/*
 * The timezone where the local system is located. Used as a default by some
 * programs who obtain this value by using gettimeofday.
 */
struct timezone sys_tz;

EXPORT_SYMBOL(sys_tz);

#ifdef __ARCH_WANT_SYS_TIME

/*
 * sys_time() can be implemented in user-level using
 * sys_gettimeofday(). Is this for backwards compatibility? If so,
 * why not move it into the appropriate arch directory (for those
 * architectures that need it).
 */
SYSCALL_DEFINE1(time, __kernel_old_time_t __user *, tloc)
{
	__kernel_old_time_t i = (__kernel_old_time_t)ktime_get_real_seconds();

	if (tloc) {
		if (put_user(i, tloc))
			return -EFAULT;
	}
	force_successful_syscall_return();
	return i;
}

/*
 * sys_stime() can be implemented in user-level using
 * sys_settimeofday(). Is this for backwards compatibility? If so,
 * why not move it into the appropriate arch directory (for those
 * architectures that need it).
 */
SYSCALL_DEFINE1(stime, __kernel_old_time_t __user *, tptr)
{
	struct timespec64 tv;
	int err;

	if (get_user(tv.tv_sec, tptr))
		return -EFAULT;

	tv.tv_nsec = 0;

	err = security_settime64(&tv, NULL);
	if (err)
		return err;

	do_settimeofday64(&tv);
	return 0;
}

#endif /* __ARCH_WANT_SYS_TIME */

#ifdef CONFIG_COMPAT_32BIT_TIME
#ifdef __ARCH_WANT_SYS_TIME32

/* old_time32_t is a 32 bit "long" and needs to get converted.
*/ SYSCALL_DEFINE1(time32, old_time32_t __user *, tloc) { old_time32_t i; i = (old_time32_t)ktime_get_real_seconds(); if (tloc) { if (put_user(i,tloc)) return -EFAULT; } force_successful_syscall_return(); return i; } SYSCALL_DEFINE1(stime32, old_time32_t __user *, tptr) { struct timespec64 tv; int err; if (get_user(tv.tv_sec, tptr)) return -EFAULT; tv.tv_nsec = 0; err = security_settime64(&tv, NULL); if (err) return err; do_settimeofday64(&tv); return 0; } #endif /* __ARCH_WANT_SYS_TIME32 */ #endif SYSCALL_DEFINE2(gettimeofday, struct __kernel_old_timeval __user *, tv, struct timezone __user *, tz) { if (likely(tv != NULL)) { struct timespec64 ts; ktime_get_real_ts64(&ts); if (put_user(ts.tv_sec, &tv->tv_sec) || put_user(ts.tv_nsec / 1000, &tv->tv_usec)) return -EFAULT; } if (unlikely(tz != NULL)) { if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) return -EFAULT; } return 0; } /* * In case for some reason the CMOS clock has not already been running * in UTC, but in some local time: The first time we set the timezone, * we will warp the clock so that it is ticking UTC time instead of * local time. Presumably, if someone is setting the timezone then we * are running in an environment where the programs understand about * timezones. This should be done at boot time in the /etc/rc script, * as soon as possible, so that the clock can be set right. Otherwise, * various programs will get confused when the clock gets warped. */ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz) { static int firsttime = 1; int error = 0; if (tv && !timespec64_valid_settod(tv)) return -EINVAL; error = security_settime64(tv, tz); if (error) return error; if (tz) { /* Verify we're within the +-15 hrs range */ if (tz->tz_minuteswest > 15*60 || tz->tz_minuteswest < -15*60) return -EINVAL; sys_tz = *tz; update_vsyscall_tz(); if (firsttime) { firsttime = 0; if (!tv) timekeeping_warp_clock(); } } if (tv) return do_settimeofday64(tv); return 0; } SYSCALL_DEFINE2(settimeofday, struct __kernel_old_timeval __user *, tv, struct timezone __user *, tz) { struct timespec64 new_ts; struct timezone new_tz; if (tv) { if (get_user(new_ts.tv_sec, &tv->tv_sec) || get_user(new_ts.tv_nsec, &tv->tv_usec)) return -EFAULT; if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0) return -EINVAL; new_ts.tv_nsec *= NSEC_PER_USEC; } if (tz) { if (copy_from_user(&new_tz, tz, sizeof(*tz))) return -EFAULT; } return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(gettimeofday, struct old_timeval32 __user *, tv, struct timezone __user *, tz) { if (tv) { struct timespec64 ts; ktime_get_real_ts64(&ts); if (put_user(ts.tv_sec, &tv->tv_sec) || put_user(ts.tv_nsec / 1000, &tv->tv_usec)) return -EFAULT; } if (tz) { if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) return -EFAULT; } return 0; } COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv, struct timezone __user *, tz) { struct timespec64 new_ts; struct timezone new_tz; if (tv) { if (get_user(new_ts.tv_sec, &tv->tv_sec) || get_user(new_ts.tv_nsec, &tv->tv_usec)) return -EFAULT; if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0) return -EINVAL; new_ts.tv_nsec *= NSEC_PER_USEC; } if (tz) { if (copy_from_user(&new_tz, tz, sizeof(*tz))) return -EFAULT; } return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? 
&new_tz : NULL); } #endif #ifdef CONFIG_64BIT SYSCALL_DEFINE1(adjtimex, struct __kernel_timex __user *, txc_p) { struct __kernel_timex txc; /* Local copy of parameter */ int ret; /* Copy the user data space into the kernel copy * structure. But bear in mind that the structures * may change */ if (copy_from_user(&txc, txc_p, sizeof(struct __kernel_timex))) return -EFAULT; ret = do_adjtimex(&txc); return copy_to_user(txc_p, &txc, sizeof(struct __kernel_timex)) ? -EFAULT : ret; } #endif #ifdef CONFIG_COMPAT_32BIT_TIME int get_old_timex32(struct __kernel_timex *txc, const struct old_timex32 __user *utp) { struct old_timex32 tx32; memset(txc, 0, sizeof(struct __kernel_timex)); if (copy_from_user(&tx32, utp, sizeof(struct old_timex32))) return -EFAULT; txc->modes = tx32.modes; txc->offset = tx32.offset; txc->freq = tx32.freq; txc->maxerror = tx32.maxerror; txc->esterror = tx32.esterror; txc->status = tx32.status; txc->constant = tx32.constant; txc->precision = tx32.precision; txc->tolerance = tx32.tolerance; txc->time.tv_sec = tx32.time.tv_sec; txc->time.tv_usec = tx32.time.tv_usec; txc->tick = tx32.tick; txc->ppsfreq = tx32.ppsfreq; txc->jitter = tx32.jitter; txc->shift = tx32.shift; txc->stabil = tx32.stabil; txc->jitcnt = tx32.jitcnt; txc->calcnt = tx32.calcnt; txc->errcnt = tx32.errcnt; txc->stbcnt = tx32.stbcnt; return 0; } int put_old_timex32(struct old_timex32 __user *utp, const struct __kernel_timex *txc) { struct old_timex32 tx32; memset(&tx32, 0, sizeof(struct old_timex32)); tx32.modes = txc->modes; tx32.offset = txc->offset; tx32.freq = txc->freq; tx32.maxerror = txc->maxerror; tx32.esterror = txc->esterror; tx32.status = txc->status; tx32.constant = txc->constant; tx32.precision = txc->precision; tx32.tolerance = txc->tolerance; tx32.time.tv_sec = txc->time.tv_sec; tx32.time.tv_usec = txc->time.tv_usec; tx32.tick = txc->tick; tx32.ppsfreq = txc->ppsfreq; tx32.jitter = txc->jitter; tx32.shift = txc->shift; tx32.stabil = txc->stabil; tx32.jitcnt = txc->jitcnt; tx32.calcnt = txc->calcnt; tx32.errcnt = txc->errcnt; tx32.stbcnt = txc->stbcnt; tx32.tai = txc->tai; if (copy_to_user(utp, &tx32, sizeof(struct old_timex32))) return -EFAULT; return 0; } SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp) { struct __kernel_timex txc; int err, ret; err = get_old_timex32(&txc, utp); if (err) return err; ret = do_adjtimex(&txc); err = put_old_timex32(utp, &txc); if (err) return err; return ret; } #endif #if HZ > MSEC_PER_SEC || (MSEC_PER_SEC % HZ) /** * jiffies_to_msecs - Convert jiffies to milliseconds * @j: jiffies value * * Return: milliseconds value */ unsigned int jiffies_to_msecs(const unsigned long j) { #if HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); #else # if BITS_PER_LONG == 32 return (HZ_TO_MSEC_MUL32 * j + (1ULL << HZ_TO_MSEC_SHR32) - 1) >> HZ_TO_MSEC_SHR32; # else return DIV_ROUND_UP(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN); # endif #endif } EXPORT_SYMBOL(jiffies_to_msecs); #endif #if (USEC_PER_SEC % HZ) /** * jiffies_to_usecs - Convert jiffies to microseconds * @j: jiffies value * * Return: microseconds value */ unsigned int jiffies_to_usecs(const unsigned long j) { /* * Hz usually doesn't go much further MSEC_PER_SEC. * jiffies_to_usecs() and usecs_to_jiffies() depend on that. 
*/ BUILD_BUG_ON(HZ > USEC_PER_SEC); #if BITS_PER_LONG == 32 return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; #else return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN; #endif } EXPORT_SYMBOL(jiffies_to_usecs); #endif /** * mktime64 - Converts date to seconds. * @year0: year to convert * @mon0: month to convert * @day: day to convert * @hour: hour to convert * @min: minute to convert * @sec: second to convert * * Converts Gregorian date to seconds since 1970-01-01 00:00:00. * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. * * [For the Julian calendar (which was used in Russia before 1917, * Britain & colonies before 1752, anywhere else before 1582, * and is still in use by some communities) leave out the * -year/100+year/400 terms, and add 10.] * * This algorithm was first published by Gauss (I think). * * A leap second can be indicated by calling this function with sec as * 60 (allowable under ISO 8601). The leap second is treated the same * as the following second since they don't exist in UNIX time. * * An encoding of midnight at the end of the day as 24:00:00 - ie. midnight * tomorrow - (allowable under ISO 8601) is supported. * * Return: seconds since the epoch time for the given input date */ time64_t mktime64(const unsigned int year0, const unsigned int mon0, const unsigned int day, const unsigned int hour, const unsigned int min, const unsigned int sec) { unsigned int mon = mon0, year = year0; /* 1..12 -> 11,12,1..10 */ if (0 >= (int) (mon -= 2)) { mon += 12; /* Puts Feb last since it has leap day */ year -= 1; } return ((((time64_t) (year/4 - year/100 + year/400 + 367*mon/12 + day) + year*365 - 719499 )*24 + hour /* now have hours - midnight tomorrow handled here */ )*60 + min /* now have minutes */ )*60 + sec; /* finally seconds */ } EXPORT_SYMBOL(mktime64); struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec) { struct timespec64 ts = ns_to_timespec64(nsec); struct __kernel_old_timeval tv; tv.tv_sec = ts.tv_sec; tv.tv_usec = (suseconds_t)ts.tv_nsec / 1000; return tv; } EXPORT_SYMBOL(ns_to_kernel_old_timeval); /** * set_normalized_timespec64 - set timespec sec and nsec parts and normalize * * @ts: pointer to timespec variable to be set * @sec: seconds to set * @nsec: nanoseconds to set * * Set seconds and nanoseconds field of a timespec variable and * normalize to the timespec storage format * * Note: The tv_nsec part is always in the range of 0 <= tv_nsec < NSEC_PER_SEC. * For negative values only the tv_sec field is negative ! */ void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec) { while (nsec >= NSEC_PER_SEC) { /* * The following asm() prevents the compiler from * optimising this loop into a modulo operation. See * also __iter_div_u64_rem() in include/linux/time.h */ asm("" : "+rm"(nsec)); nsec -= NSEC_PER_SEC; ++sec; } while (nsec < 0) { asm("" : "+rm"(nsec)); nsec += NSEC_PER_SEC; --sec; } ts->tv_sec = sec; ts->tv_nsec = nsec; } EXPORT_SYMBOL(set_normalized_timespec64); /** * ns_to_timespec64 - Convert nanoseconds to timespec64 * @nsec: the nanoseconds value to be converted * * Return: the timespec64 representation of the nsec parameter. 
*/ struct timespec64 ns_to_timespec64(s64 nsec) { struct timespec64 ts = { 0, 0 }; s32 rem; if (likely(nsec > 0)) { ts.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); ts.tv_nsec = rem; } else if (nsec < 0) { /* * With negative times, tv_sec points to the earlier * second, and tv_nsec counts the nanoseconds since * then, so tv_nsec is always a positive number. */ ts.tv_sec = -div_u64_rem(-nsec - 1, NSEC_PER_SEC, &rem) - 1; ts.tv_nsec = NSEC_PER_SEC - rem - 1; } return ts; } EXPORT_SYMBOL(ns_to_timespec64); /** * __msecs_to_jiffies: - convert milliseconds to jiffies * @m: time in milliseconds * * conversion is done as follows: * * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) * * - 'too large' values [that would result in larger than * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. * * - all other values are converted to jiffies by either multiplying * the input value by a factor or dividing it with a factor and * handling any 32-bit overflows. * for the details see _msecs_to_jiffies() * * msecs_to_jiffies() checks for the passed in value being a constant * via __builtin_constant_p() allowing gcc to eliminate most of the * code, __msecs_to_jiffies() is called if the value passed does not * allow constant folding and the actual conversion must be done at * runtime. * The _msecs_to_jiffies helpers are the HZ dependent conversion * routines found in include/linux/jiffies.h * * Return: jiffies value */ unsigned long __msecs_to_jiffies(const unsigned int m) { /* * Negative value, means infinite timeout: */ if ((int)m < 0) return MAX_JIFFY_OFFSET; return _msecs_to_jiffies(m); } EXPORT_SYMBOL(__msecs_to_jiffies); /** * __usecs_to_jiffies: - convert microseconds to jiffies * @u: time in milliseconds * * Return: jiffies value */ unsigned long __usecs_to_jiffies(const unsigned int u) { if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; return _usecs_to_jiffies(u); } EXPORT_SYMBOL(__usecs_to_jiffies); /** * timespec64_to_jiffies - convert a timespec64 value to jiffies * @value: pointer to &struct timespec64 * * The TICK_NSEC - 1 rounds up the value to the next resolution. Note * that a remainder subtract here would not do the right thing as the * resolution values don't fall on second boundaries. I.e. the line: * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. * Note that due to the small error in the multiplier here, this * rounding is incorrect for sufficiently large values of tv_nsec, but * well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're * OK. * * Rather, we just shift the bits off the right. * * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec * value to a scaled second value. * * Return: jiffies value */ unsigned long timespec64_to_jiffies(const struct timespec64 *value) { u64 sec = value->tv_sec; long nsec = value->tv_nsec + TICK_NSEC - 1; if (sec >= MAX_SEC_IN_JIFFIES){ sec = MAX_SEC_IN_JIFFIES; nsec = 0; } return ((sec * SEC_CONVERSION) + (((u64)nsec * NSEC_CONVERSION) >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; } EXPORT_SYMBOL(timespec64_to_jiffies); /** * jiffies_to_timespec64 - convert jiffies value to &struct timespec64 * @jiffies: jiffies value * @value: pointer to &struct timespec64 */ void jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value) { /* * Convert jiffies to nanoseconds and separate with * one divide. 
*/ u32 rem; value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, NSEC_PER_SEC, &rem); value->tv_nsec = rem; } EXPORT_SYMBOL(jiffies_to_timespec64); /* * Convert jiffies/jiffies_64 to clock_t and back. */ /** * jiffies_to_clock_t - Convert jiffies to clock_t * @x: jiffies value * * Return: jiffies converted to clock_t (CLOCKS_PER_SEC) */ clock_t jiffies_to_clock_t(unsigned long x) { #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 # if HZ < USER_HZ return x * (USER_HZ / HZ); # else return x / (HZ / USER_HZ); # endif #else return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ); #endif } EXPORT_SYMBOL(jiffies_to_clock_t); /** * clock_t_to_jiffies - Convert clock_t to jiffies * @x: clock_t value * * Return: clock_t value converted to jiffies */ unsigned long clock_t_to_jiffies(unsigned long x) { #if (HZ % USER_HZ)==0 if (x >= ~0UL / (HZ / USER_HZ)) return ~0UL; return x * (HZ / USER_HZ); #else /* Don't worry about loss of precision here .. */ if (x >= ~0UL / HZ * USER_HZ) return ~0UL; /* .. but do try to contain it here */ return div_u64((u64)x * HZ, USER_HZ); #endif } EXPORT_SYMBOL(clock_t_to_jiffies); /** * jiffies_64_to_clock_t - Convert jiffies_64 to clock_t * @x: jiffies_64 value * * Return: jiffies_64 value converted to 64-bit "clock_t" (CLOCKS_PER_SEC) */ notrace u64 jiffies_64_to_clock_t(u64 x) { #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 # if HZ < USER_HZ x = div_u64(x * USER_HZ, HZ); # elif HZ > USER_HZ x = div_u64(x, HZ / USER_HZ); # else /* Nothing to do */ # endif #else /* * There are better ways that don't overflow early, * but even this doesn't overflow in hundreds of years * in 64 bits, so.. */ x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ)); #endif return x; } EXPORT_SYMBOL(jiffies_64_to_clock_t); /** * nsec_to_clock_t - Convert nsec value to clock_t * @x: nsec value * * Return: nsec value converted to 64-bit "clock_t" (CLOCKS_PER_SEC) */ u64 nsec_to_clock_t(u64 x) { #if (NSEC_PER_SEC % USER_HZ) == 0 return div_u64(x, NSEC_PER_SEC / USER_HZ); #elif (USER_HZ % 512) == 0 return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512); #else /* * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, * overflow after 64.99 years. * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... */ return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ); #endif } /** * jiffies64_to_nsecs - Convert jiffies64 to nanoseconds * @j: jiffies64 value * * Return: nanoseconds value */ u64 jiffies64_to_nsecs(u64 j) { #if !(NSEC_PER_SEC % HZ) return (NSEC_PER_SEC / HZ) * j; # else return div_u64(j * HZ_TO_NSEC_NUM, HZ_TO_NSEC_DEN); #endif } EXPORT_SYMBOL(jiffies64_to_nsecs); /** * jiffies64_to_msecs - Convert jiffies64 to milliseconds * @j: jiffies64 value * * Return: milliseconds value */ u64 jiffies64_to_msecs(const u64 j) { #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) return (MSEC_PER_SEC / HZ) * j; #else return div_u64(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN); #endif } EXPORT_SYMBOL(jiffies64_to_msecs); /** * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 * * @n: nsecs in u64 * * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. * And this doesn't return MAX_JIFFY_OFFSET since this function is designed * for scheduler, not for use in device drivers to calculate timeout value. 
* * note: * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years * * Return: nsecs converted to jiffies64 value */ u64 nsecs_to_jiffies64(u64 n) { #if (NSEC_PER_SEC % HZ) == 0 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ return div_u64(n, NSEC_PER_SEC / HZ); #elif (HZ % 512) == 0 /* overflow after 292 years if HZ = 1024 */ return div_u64(n * HZ / 512, NSEC_PER_SEC / 512); #else /* * Generic case - optimized for cases where HZ is a multiple of 3. * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc. */ return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); #endif } EXPORT_SYMBOL(nsecs_to_jiffies64); /** * nsecs_to_jiffies - Convert nsecs in u64 to jiffies * * @n: nsecs in u64 * * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. * And this doesn't return MAX_JIFFY_OFFSET since this function is designed * for scheduler, not for use in device drivers to calculate timeout value. * * note: * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years * * Return: nsecs converted to jiffies value */ unsigned long nsecs_to_jiffies(u64 n) { return (unsigned long)nsecs_to_jiffies64(n); } EXPORT_SYMBOL_GPL(nsecs_to_jiffies); /** * timespec64_add_safe - Add two timespec64 values and do a safety check * for overflow. * @lhs: first (left) timespec64 to add * @rhs: second (right) timespec64 to add * * It's assumed that both values are valid (>= 0). * And, each timespec64 is in normalized form. * * Return: sum of @lhs + @rhs */ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, const struct timespec64 rhs) { struct timespec64 res; set_normalized_timespec64(&res, (timeu64_t) lhs.tv_sec + rhs.tv_sec, lhs.tv_nsec + rhs.tv_nsec); if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) { res.tv_sec = TIME64_MAX; res.tv_nsec = 0; } return res; } EXPORT_SYMBOL_GPL(timespec64_add_safe); /** * get_timespec64 - get user's time value into kernel space * @ts: destination &struct timespec64 * @uts: user's time value as &struct __kernel_timespec * * Handles compat or 32-bit modes. * * Return: 0 on success or negative errno on error */ int get_timespec64(struct timespec64 *ts, const struct __kernel_timespec __user *uts) { struct __kernel_timespec kts; int ret; ret = copy_from_user(&kts, uts, sizeof(kts)); if (ret) return -EFAULT; ts->tv_sec = kts.tv_sec; /* Zero out the padding in compat mode */ if (in_compat_syscall()) kts.tv_nsec &= 0xFFFFFFFFUL; /* In 32-bit mode, this drops the padding */ ts->tv_nsec = kts.tv_nsec; return 0; } EXPORT_SYMBOL_GPL(get_timespec64); /** * put_timespec64 - convert timespec64 value to __kernel_timespec format and * copy the latter to userspace * @ts: input &struct timespec64 * @uts: user's &struct __kernel_timespec * * Return: 0 on success or negative errno on error */ int put_timespec64(const struct timespec64 *ts, struct __kernel_timespec __user *uts) { struct __kernel_timespec kts = { .tv_sec = ts->tv_sec, .tv_nsec = ts->tv_nsec }; return copy_to_user(uts, &kts, sizeof(kts)) ? 
-EFAULT : 0; } EXPORT_SYMBOL_GPL(put_timespec64); static int __get_old_timespec32(struct timespec64 *ts64, const struct old_timespec32 __user *cts) { struct old_timespec32 ts; int ret; ret = copy_from_user(&ts, cts, sizeof(ts)); if (ret) return -EFAULT; ts64->tv_sec = ts.tv_sec; ts64->tv_nsec = ts.tv_nsec; return 0; } static int __put_old_timespec32(const struct timespec64 *ts64, struct old_timespec32 __user *cts) { struct old_timespec32 ts = { .tv_sec = ts64->tv_sec, .tv_nsec = ts64->tv_nsec }; return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0; } /** * get_old_timespec32 - get user's old-format time value into kernel space * @ts: destination &struct timespec64 * @uts: user's old-format time value (&struct old_timespec32) * * Handles X86_X32_ABI compatibility conversion. * * Return: 0 on success or negative errno on error */ int get_old_timespec32(struct timespec64 *ts, const void __user *uts) { if (COMPAT_USE_64BIT_TIME) return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; else return __get_old_timespec32(ts, uts); } EXPORT_SYMBOL_GPL(get_old_timespec32); /** * put_old_timespec32 - convert timespec64 value to &struct old_timespec32 and * copy the latter to userspace * @ts: input &struct timespec64 * @uts: user's &struct old_timespec32 * * Handles X86_X32_ABI compatibility conversion. * * Return: 0 on success or negative errno on error */ int put_old_timespec32(const struct timespec64 *ts, void __user *uts) { if (COMPAT_USE_64BIT_TIME) return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; else return __put_old_timespec32(ts, uts); } EXPORT_SYMBOL_GPL(put_old_timespec32); /** * get_itimerspec64 - get user's &struct __kernel_itimerspec into kernel space * @it: destination &struct itimerspec64 * @uit: user's &struct __kernel_itimerspec * * Return: 0 on success or negative errno on error */ int get_itimerspec64(struct itimerspec64 *it, const struct __kernel_itimerspec __user *uit) { int ret; ret = get_timespec64(&it->it_interval, &uit->it_interval); if (ret) return ret; ret = get_timespec64(&it->it_value, &uit->it_value); return ret; } EXPORT_SYMBOL_GPL(get_itimerspec64); /** * put_itimerspec64 - convert &struct itimerspec64 to __kernel_itimerspec format * and copy the latter to userspace * @it: input &struct itimerspec64 * @uit: user's &struct __kernel_itimerspec * * Return: 0 on success or negative errno on error */ int put_itimerspec64(const struct itimerspec64 *it, struct __kernel_itimerspec __user *uit) { int ret; ret = put_timespec64(&it->it_interval, &uit->it_interval); if (ret) return ret; ret = put_timespec64(&it->it_value, &uit->it_value); return ret; } EXPORT_SYMBOL_GPL(put_itimerspec64); /** * get_old_itimerspec32 - get user's &struct old_itimerspec32 into kernel space * @its: destination &struct itimerspec64 * @uits: user's &struct old_itimerspec32 * * Return: 0 on success or negative errno on error */ int get_old_itimerspec32(struct itimerspec64 *its, const struct old_itimerspec32 __user *uits) { if (__get_old_timespec32(&its->it_interval, &uits->it_interval) || __get_old_timespec32(&its->it_value, &uits->it_value)) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(get_old_itimerspec32); /** * put_old_itimerspec32 - convert &struct itimerspec64 to &struct * old_itimerspec32 and copy the latter to userspace * @its: input &struct itimerspec64 * @uits: user's &struct old_itimerspec32 * * Return: 0 on success or negative errno on error */ int put_old_itimerspec32(const struct itimerspec64 *its, struct old_itimerspec32 __user *uits) { if 
(__put_old_timespec32(&its->it_interval, &uits->it_interval) || __put_old_timespec32(&its->it_value, &uits->it_value)) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(put_old_itimerspec32);
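/*
 * Illustrative sketch (not part of the kernel sources above): the Gauss-style
 * arithmetic used by mktime64() can be checked in plain userspace C. The
 * helper name demo_mktime64 and the test dates below are assumptions made for
 * this example only; they mirror the formula in mktime64() above.
 */
#include <stdint.h>
#include <stdio.h>

static int64_t demo_mktime64(unsigned int year, unsigned int mon,
			     unsigned int day, unsigned int hour,
			     unsigned int min, unsigned int sec)
{
	/* Shift months so February (and its leap day) comes last: 1..12 -> 11,12,1..10 */
	if (0 >= (int)(mon -= 2)) {
		mon += 12;
		year -= 1;
	}
	return ((((int64_t)(year / 4 - year / 100 + year / 400 +
			    367 * mon / 12 + day) +
		  year * 365 - 719499) * 24 + hour) * 60 + min) * 60 + sec;
}

int main(void)
{
	/* 1970-01-01 00:00:00 UTC is the Unix epoch: expect 0. */
	printf("%lld\n", (long long)demo_mktime64(1970, 1, 1, 0, 0, 0));
	/* 2000-01-01 00:00:00 UTC: expect 946684800. */
	printf("%lld\n", (long long)demo_mktime64(2000, 1, 1, 0, 0, 0));
	return 0;
}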
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/filesystems.c * * Copyright (C) 1991, 1992 Linus Torvalds * * table of configured filesystems */ #include <linux/syscalls.h> #include <linux/fs.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kmod.h> #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fs_parser.h> /* * Handling of filesystem drivers list. * Rules: * Inclusion to/removals from/scanning of list are protected by spinlock. * During unload, the module must call unregister_filesystem(). * We can access the fields of a list element if: * 1) spinlock is held or * 2) we hold a reference to the module. * The latter can be guaranteed by a call to try_module_get(); if it * returned 0 we must skip the element, otherwise we got the reference. * Once the reference is obtained we can drop the spinlock. */ static struct file_system_type *file_systems; static DEFINE_RWLOCK(file_systems_lock); /* WARNING: This can be used only if we _already_ own a reference */ struct file_system_type *get_filesystem(struct file_system_type *fs) { __module_get(fs->owner); return fs; } void put_filesystem(struct file_system_type *fs) { module_put(fs->owner); } static struct file_system_type **find_filesystem(const char *name, unsigned len) { struct file_system_type **p; for (p = &file_systems; *p; p = &(*p)->next) if (strncmp((*p)->name, name, len) == 0 && !(*p)->name[len]) break; return p; } /** * register_filesystem - register a new filesystem * @fs: the file system structure * * Adds the file system passed to the list of file systems the kernel * is aware of for mount and other syscalls. Returns 0 on success, * or a negative errno code on an error. * * The &struct file_system_type that is passed is linked into the kernel * structures and must not be freed until the file system has been * unregistered.
*/ int register_filesystem(struct file_system_type * fs) { int res = 0; struct file_system_type ** p; if (fs->parameters && !fs_validate_description(fs->name, fs->parameters)) return -EINVAL; BUG_ON(strchr(fs->name, '.')); if (fs->next) return -EBUSY; write_lock(&file_systems_lock); p = find_filesystem(fs->name, strlen(fs->name)); if (*p) res = -EBUSY; else *p = fs; write_unlock(&file_systems_lock); return res; } EXPORT_SYMBOL(register_filesystem); /** * unregister_filesystem - unregister a file system * @fs: filesystem to unregister * * Remove a file system that was previously successfully registered * with the kernel. An error is returned if the file system is not found. * Zero is returned on a success. * * Once this function has returned the &struct file_system_type structure * may be freed or reused. */ int unregister_filesystem(struct file_system_type * fs) { struct file_system_type ** tmp; write_lock(&file_systems_lock); tmp = &file_systems; while (*tmp) { if (fs == *tmp) { *tmp = fs->next; fs->next = NULL; write_unlock(&file_systems_lock); synchronize_rcu(); return 0; } tmp = &(*tmp)->next; } write_unlock(&file_systems_lock); return -EINVAL; } EXPORT_SYMBOL(unregister_filesystem); #ifdef CONFIG_SYSFS_SYSCALL static int fs_index(const char __user * __name) { struct file_system_type * tmp; char *name __free(kfree) = strndup_user(__name, PATH_MAX); int err, index; if (IS_ERR(name)) return PTR_ERR(name); err = -EINVAL; read_lock(&file_systems_lock); for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) { if (strcmp(tmp->name, name) == 0) { err = index; break; } } read_unlock(&file_systems_lock); return err; } static int fs_name(unsigned int index, char __user * buf) { struct file_system_type * tmp; int len, res = -EINVAL; read_lock(&file_systems_lock); for (tmp = file_systems; tmp; tmp = tmp->next, index--) { if (index == 0) { if (try_module_get(tmp->owner)) res = 0; break; } } read_unlock(&file_systems_lock); if (res) return res; /* OK, we got the reference, so we can safely block */ len = strlen(tmp->name) + 1; res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0; put_filesystem(tmp); return res; } static int fs_maxindex(void) { struct file_system_type * tmp; int index; read_lock(&file_systems_lock); for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++) ; read_unlock(&file_systems_lock); return index; } /* * Whee.. Weird sysv syscall. */ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2) { int retval = -EINVAL; switch (option) { case 1: retval = fs_index((const char __user *) arg1); break; case 2: retval = fs_name(arg1, (char __user *) arg2); break; case 3: retval = fs_maxindex(); break; } return retval; } #endif int __init list_bdev_fs_names(char *buf, size_t size) { struct file_system_type *p; size_t len; int count = 0; read_lock(&file_systems_lock); for (p = file_systems; p; p = p->next) { if (!(p->fs_flags & FS_REQUIRES_DEV)) continue; len = strlen(p->name) + 1; if (len > size) { pr_warn("%s: truncating file system list\n", __func__); break; } memcpy(buf, p->name, len); buf += len; size -= len; count++; } read_unlock(&file_systems_lock); return count; } #ifdef CONFIG_PROC_FS static int filesystems_proc_show(struct seq_file *m, void *v) { struct file_system_type * tmp; read_lock(&file_systems_lock); tmp = file_systems; while (tmp) { seq_printf(m, "%s\t%s\n", (tmp->fs_flags & FS_REQUIRES_DEV) ? 
"" : "nodev", tmp->name); tmp = tmp->next; } read_unlock(&file_systems_lock); return 0; } static int __init proc_filesystems_init(void) { proc_create_single("filesystems", 0, NULL, filesystems_proc_show); return 0; } module_init(proc_filesystems_init); #endif static struct file_system_type *__get_fs_type(const char *name, int len) { struct file_system_type *fs; read_lock(&file_systems_lock); fs = *(find_filesystem(name, len)); if (fs && !try_module_get(fs->owner)) fs = NULL; read_unlock(&file_systems_lock); return fs; } struct file_system_type *get_fs_type(const char *name) { struct file_system_type *fs; const char *dot = strchr(name, '.'); int len = dot ? dot - name : strlen(name); fs = __get_fs_type(name, len); if (!fs && (request_module("fs-%.*s", len, name) == 0)) { fs = __get_fs_type(name, len); if (!fs) pr_warn_once("request_module fs-%.*s succeeded, but still no fs?\n", len, name); } if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) { put_filesystem(fs); fs = NULL; } return fs; } EXPORT_SYMBOL(get_fs_type); |
#include <linux/dcache.h> #include "internal.h" unsigned name_to_int(const struct qstr *qstr) { const char *name = qstr->name; int len = qstr->len; unsigned n = 0; if (len > 1 && *name == '0') goto out; do { unsigned c = *name++ - '0'; if (c > 9) goto out; if (n >= (~0U-9)/10) goto out; n *= 10; n += c; } while (--len > 0); return n; out: return ~0U; }
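/*
 * Illustrative sketch (not part of the file above): name_to_int() accepts
 * only canonical decimal strings, rejecting leading zeroes, non-digits and
 * values that would overflow an unsigned int (e.g. for /proc/<pid>-style
 * name lookups). The userspace copy below only demonstrates that behaviour;
 * the struct and helper names are local to this example.
 */
#include <assert.h>

struct demo_qstr {
	const char *name;
	int len;
};

static unsigned demo_name_to_int(const struct demo_qstr *qstr)
{
	const char *name = qstr->name;
	int len = qstr->len;
	unsigned n = 0;

	if (len > 1 && *name == '0')
		goto out;		/* reject leading zeroes ("0123") */
	do {
		unsigned c = *name++ - '0';
		if (c > 9)
			goto out;	/* reject non-digits ("12a") */
		if (n >= (~0U - 9) / 10)
			goto out;	/* reject values that would overflow */
		n *= 10;
		n += c;
	} while (--len > 0);
	return n;
out:
	return ~0U;			/* ~0U signals "not a canonical number" */
}

int main(void)
{
	struct demo_qstr ok   = { "123",  3 };
	struct demo_qstr zero = { "0",    1 };
	struct demo_qstr lead = { "0123", 4 };
	struct demo_qstr junk = { "12a",  3 };

	assert(demo_name_to_int(&ok)   == 123);
	assert(demo_name_to_int(&zero) == 0);
	assert(demo_name_to_int(&lead) == ~0U);
	assert(demo_name_to_int(&junk) == ~0U);
	return 0;
}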
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _X86_IRQFLAGS_H_ #define _X86_IRQFLAGS_H_ #include <asm/processor-flags.h> #ifndef __ASSEMBLER__ #include <asm/nospec-branch.h> /* * Interrupt control: */ /* Declaration required for gcc < 4.9 to prevent -Werror=missing-prototypes */ extern inline unsigned long native_save_fl(void); extern __always_inline unsigned long native_save_fl(void) { unsigned long flags; /* * "=rm" is safe here, because "pop" adjusts the stack before * it evaluates its effective address -- this is part of the * documented behavior of the "pop" instruction. */ asm volatile("# __raw_save_flags\n\t" "pushf ; pop %0" : ASM_OUTPUT_RM (flags) : /* no input */ : "memory"); return flags; } static __always_inline void native_irq_disable(void) { asm volatile("cli": : :"memory"); } static __always_inline void native_irq_enable(void) { asm volatile("sti": : :"memory"); } static __always_inline void native_safe_halt(void) { x86_idle_clear_cpu_buffers(); asm volatile("sti; hlt": : :"memory"); } static __always_inline void native_halt(void) { x86_idle_clear_cpu_buffers(); asm volatile("hlt": : :"memory"); } static __always_inline int native_irqs_disabled_flags(unsigned long flags) { return !(flags & X86_EFLAGS_IF); } static __always_inline unsigned long native_local_irq_save(void) { unsigned long flags = native_save_fl(); native_irq_disable(); return flags; } static __always_inline void native_local_irq_restore(unsigned long flags) { if (!native_irqs_disabled_flags(flags)) native_irq_enable(); } #endif #ifndef CONFIG_PARAVIRT #ifndef __ASSEMBLER__ /* * Used in the idle loop; sti takes one instruction cycle * to complete: */ static __always_inline void arch_safe_halt(void) { native_safe_halt(); } /* * Used when interrupts are already enabled or to * shutdown the processor: */ static __always_inline void halt(void) { native_halt(); } #endif /* __ASSEMBLER__ */ #else #include <asm/paravirt.h> #endif /* CONFIG_PARAVIRT */ #ifndef CONFIG_PARAVIRT_XXL #ifndef __ASSEMBLER__ #include <linux/types.h> static __always_inline unsigned long arch_local_save_flags(void) { return native_save_fl(); } static __always_inline void arch_local_irq_disable(void) { native_irq_disable(); } static __always_inline void arch_local_irq_enable(void) { native_irq_enable(); } /* * For spinlocks, etc: */ static __always_inline unsigned long arch_local_irq_save(void) { unsigned long flags = arch_local_save_flags(); arch_local_irq_disable(); return flags; } #else #ifdef CONFIG_X86_64 #ifdef CONFIG_DEBUG_ENTRY #define SAVE_FLAGS pushfq; popq %rax #endif #endif #endif /* __ASSEMBLER__ */ #endif /* CONFIG_PARAVIRT_XXL */ #ifndef __ASSEMBLER__ static __always_inline int arch_irqs_disabled_flags(unsigned long flags) { return !(flags & X86_EFLAGS_IF); } static __always_inline int arch_irqs_disabled(void) { unsigned long flags = arch_local_save_flags(); return arch_irqs_disabled_flags(flags); } static __always_inline void arch_local_irq_restore(unsigned long flags) { if
(!arch_irqs_disabled_flags(flags)) arch_local_irq_enable(); } #endif /* !__ASSEMBLER__ */ #endif
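/*
 * Illustrative sketch (not part of the header above): kernel code normally
 * does not call the arch_local_*() helpers directly. It uses the generic
 * wrappers from <linux/irqflags.h>, which expand to the primitives defined
 * above (plus optional IRQ tracing). The function and variable names below
 * are hypothetical and only show the save/restore pairing.
 */
#include <linux/irqflags.h>
#include <linux/types.h>

static u64 example_counter;

static u64 example_read_and_bump(void)
{
	unsigned long flags;
	u64 val;

	local_irq_save(flags);		/* ends up in arch_local_irq_save() */
	val = example_counter++;	/* short, IRQ-free critical section */
	local_irq_restore(flags);	/* re-enables IRQs only if they were on */

	return val;
}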
// SPDX-License-Identifier: GPL-2.0-only // Copyright (C) 2022 Linutronix GmbH, John Ogness // Copyright (C) 2022 Intel, Thomas Gleixner #include <linux/atomic.h> #include <linux/bug.h> #include <linux/console.h> #include <linux/delay.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/init.h> #include <linux/irqflags.h> #include <linux/kdb.h> #include <linux/kthread.h> #include <linux/minmax.h> #include <linux/panic.h> #include <linux/percpu.h> #include <linux/preempt.h> #include <linux/slab.h> #include <linux/smp.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include "internal.h" #include "printk_ringbuffer.h" /* * Printk console printing implementation for consoles that do not depend * on the legacy style console_lock mechanism. * * The state of the console is maintained in the "nbcon_state" atomic * variable. * * The console is locked when: * * - The 'prio' field contains the priority of the context that owns the * console. Only higher priority contexts are allowed to take over the * lock. A value of 0 (NBCON_PRIO_NONE) means the console is not locked. * * - The 'cpu' field denotes on which CPU the console is locked. It is used * to prevent busy waiting on the same CPU. Also it informs the lock owner * that it has lost the lock in a more complex scenario when the lock was * taken over by a higher priority context, released, and taken on another * CPU with the same priority as the interrupted owner. * * The acquire mechanism uses a few more fields: * * - The 'req_prio' field is used by the handover approach to make the * current owner aware that there is a context with a higher priority * waiting for the friendly handover.
* * - The 'unsafe' field allows to take over the console in a safe way in the * middle of emitting a message. The field is set only when accessing some * shared resources or when the console device is manipulated. It can be * cleared, for example, after emitting one character when the console * device is in a consistent state. * * - The 'unsafe_takeover' field is set when a hostile takeover took the * console in an unsafe state. The console will stay in the unsafe state * until re-initialized. * * The acquire mechanism uses three approaches: * * 1) Direct acquire when the console is not owned or is owned by a lower * priority context and is in a safe state. * * 2) Friendly handover mechanism uses a request/grant handshake. It is used * when the current owner has lower priority and the console is in an * unsafe state. * * The requesting context: * * a) Sets its priority into the 'req_prio' field. * * b) Waits (with a timeout) for the owning context to unlock the * console. * * c) Takes the lock and clears the 'req_prio' field. * * The owning context: * * a) Observes the 'req_prio' field set on exit from the unsafe * console state. * * b) Gives up console ownership by clearing the 'prio' field. * * 3) Unsafe hostile takeover allows to take over the lock even when the * console is an unsafe state. It is used only in panic() by the final * attempt to flush consoles in a try and hope mode. * * Note that separate record buffers are used in panic(). As a result, * the messages can be read and formatted without any risk even after * using the hostile takeover in unsafe state. * * The release function simply clears the 'prio' field. * * All operations on @console::nbcon_state are atomic cmpxchg based to * handle concurrency. * * The acquire/release functions implement only minimal policies: * * - Preference for higher priority contexts. * - Protection of the panic CPU. * * All other policy decisions must be made at the call sites: * * - What is marked as an unsafe section. * - Whether to spin-wait if there is already an owner and the console is * in an unsafe state. * - Whether to attempt an unsafe hostile takeover. * * The design allows to implement the well known: * * acquire() * output_one_printk_record() * release() * * The output of one printk record might be interrupted with a higher priority * context. The new owner is supposed to reprint the entire interrupted record * from scratch. */ /* Counter of active nbcon emergency contexts. */ static atomic_t nbcon_cpu_emergency_cnt = ATOMIC_INIT(0); /** * nbcon_state_set - Helper function to set the console state * @con: Console to update * @new: The new state to write * * Only to be used when the console is not yet or no longer visible in the * system. Otherwise use nbcon_state_try_cmpxchg(). */ static inline void nbcon_state_set(struct console *con, struct nbcon_state *new) { atomic_set(&ACCESS_PRIVATE(con, nbcon_state), new->atom); } /** * nbcon_state_read - Helper function to read the console state * @con: Console to read * @state: The state to store the result */ static inline void nbcon_state_read(struct console *con, struct nbcon_state *state) { state->atom = atomic_read(&ACCESS_PRIVATE(con, nbcon_state)); } /** * nbcon_state_try_cmpxchg() - Helper function for atomic_try_cmpxchg() on console state * @con: Console to update * @cur: Old/expected state * @new: New state * * Return: True on success. False on fail and @cur is updated. 
*/ static inline bool nbcon_state_try_cmpxchg(struct console *con, struct nbcon_state *cur, struct nbcon_state *new) { return atomic_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_state), &cur->atom, new->atom); } /** * nbcon_seq_read - Read the current console sequence * @con: Console to read the sequence of * * Return: Sequence number of the next record to print on @con. */ u64 nbcon_seq_read(struct console *con) { unsigned long nbcon_seq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_seq)); return __ulseq_to_u64seq(prb, nbcon_seq); } /** * nbcon_seq_force - Force console sequence to a specific value * @con: Console to work on * @seq: Sequence number value to set * * Only to be used during init (before registration) or in extreme situations * (such as panic with CONSOLE_REPLAY_ALL). */ void nbcon_seq_force(struct console *con, u64 seq) { /* * If the specified record no longer exists, the oldest available record * is chosen. This is especially important on 32bit systems because only * the lower 32 bits of the sequence number are stored. The upper 32 bits * are derived from the sequence numbers available in the ringbuffer. */ u64 valid_seq = max_t(u64, seq, prb_first_valid_seq(prb)); atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __u64seq_to_ulseq(valid_seq)); } /** * nbcon_seq_try_update - Try to update the console sequence number * @ctxt: Pointer to an acquire context that contains * all information about the acquire mode * @new_seq: The new sequence number to set * * @ctxt->seq is updated to the new value of @con::nbcon_seq (expanded to * the 64bit value). This could be a different value than @new_seq if * nbcon_seq_force() was used or the current context no longer owns the * console. In the later case, it will stop printing anyway. */ static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq) { unsigned long nbcon_seq = __u64seq_to_ulseq(ctxt->seq); struct console *con = ctxt->console; if (atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_seq), &nbcon_seq, __u64seq_to_ulseq(new_seq))) { ctxt->seq = new_seq; } else { ctxt->seq = nbcon_seq_read(con); } } /** * nbcon_context_try_acquire_direct - Try to acquire directly * @ctxt: The context of the caller * @cur: The current console state * @is_reacquire: This acquire is a reacquire * * Acquire the console when it is released. Also acquire the console when * the current owner has a lower priority and the console is in a safe state. * * Return: 0 on success. Otherwise, an error code on failure. Also @cur * is updated to the latest state when failed to modify it. * * Errors: * * -EPERM: A panic is in progress and this is neither the panic * CPU nor is this a reacquire. Or the current owner or * waiter has the same or higher priority. No acquire * method can be successful in these cases. * * -EBUSY: The current owner has a lower priority but the console * in an unsafe state. The caller should try using * the handover acquire method. */ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt, struct nbcon_state *cur, bool is_reacquire) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state new; do { /* * Panic does not imply that the console is owned. However, * since all non-panic CPUs are stopped during panic(), it * is safer to have them avoid gaining console ownership. * * One exception is when kdb has locked for printing on this CPU. 
* * Second exception is a reacquire (and an unsafe takeover * has not previously occurred) then it is allowed to attempt * a direct acquire in panic. This gives console drivers an * opportunity to perform any necessary cleanup if they were * interrupted by the panic CPU while printing. */ if (panic_on_other_cpu() && !kdb_printf_on_this_cpu() && (!is_reacquire || cur->unsafe_takeover)) { return -EPERM; } if (ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio) return -EPERM; if (cur->unsafe) return -EBUSY; /* * The console should never be safe for a direct acquire * if an unsafe hostile takeover has ever happened. */ WARN_ON_ONCE(cur->unsafe_takeover); new.atom = cur->atom; new.prio = ctxt->prio; new.req_prio = NBCON_PRIO_NONE; new.unsafe = cur->unsafe_takeover; new.cpu = cpu; } while (!nbcon_state_try_cmpxchg(con, cur, &new)); return 0; } static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio) { /* * The request context is well defined by the @req_prio because: * * - Only a context with a priority higher than the owner can become * a waiter. * - Only a context with a priority higher than the waiter can * directly take over the request. * - There are only three priorities. * - Only one CPU is allowed to request PANIC priority. * - Lower priorities are ignored during panic() until reboot. * * As a result, the following scenario is *not* possible: * * 1. This context is currently a waiter. * 2. Another context with a higher priority than this context * directly takes ownership. * 3. The higher priority context releases the ownership. * 4. Another lower priority context takes the ownership. * 5. Another context with the same priority as this context * creates a request and starts waiting. * * Event #1 implies this context is EMERGENCY. * Event #2 implies the new context is PANIC. * Event #3 occurs when panic() has flushed the console. * Event #4 occurs when a non-panic CPU reacquires. * Event #5 is not possible due to the panic_on_other_cpu() check * in nbcon_context_try_acquire_handover(). */ return (cur->req_prio == expected_prio); } /** * nbcon_context_try_acquire_requested - Try to acquire after having * requested a handover * @ctxt: The context of the caller * @cur: The current console state * * This is a helper function for nbcon_context_try_acquire_handover(). * It is called when the console is in an unsafe state. The current * owner will release the console on exit from the unsafe region. * * Return: 0 on success and @cur is updated to the new console state. * Otherwise an error code on failure. * * Errors: * * -EPERM: A panic is in progress and this is not the panic CPU * or this context is no longer the waiter. * * -EBUSY: The console is still locked. The caller should * continue waiting. * * Note: The caller must still remove the request when an error has occurred * except when this context is no longer the waiter. */ static int nbcon_context_try_acquire_requested(struct nbcon_context *ctxt, struct nbcon_state *cur) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state new; /* Note that the caller must still remove the request! */ if (panic_on_other_cpu()) return -EPERM; /* * Note that the waiter will also change if there was an unsafe * hostile takeover. */ if (!nbcon_waiter_matches(cur, ctxt->prio)) return -EPERM; /* If still locked, caller should continue waiting. */ if (cur->prio != NBCON_PRIO_NONE) return -EBUSY; /* * The previous owner should have never released ownership * in an unsafe region. 
*/ WARN_ON_ONCE(cur->unsafe); new.atom = cur->atom; new.prio = ctxt->prio; new.req_prio = NBCON_PRIO_NONE; new.unsafe = cur->unsafe_takeover; new.cpu = cpu; if (!nbcon_state_try_cmpxchg(con, cur, &new)) { /* * The acquire could fail only when it has been taken * over by a higher priority context. */ WARN_ON_ONCE(nbcon_waiter_matches(cur, ctxt->prio)); return -EPERM; } /* Handover success. This context now owns the console. */ return 0; } /** * nbcon_context_try_acquire_handover - Try to acquire via handover * @ctxt: The context of the caller * @cur: The current console state * * The function must be called only when the context has higher priority * than the current owner and the console is in an unsafe state. * It is the case when nbcon_context_try_acquire_direct() returns -EBUSY. * * The function sets "req_prio" field to make the current owner aware of * the request. Then it waits until the current owner releases the console, * or an even higher context takes over the request, or timeout expires. * * The current owner checks the "req_prio" field on exit from the unsafe * region and releases the console. It does not touch the "req_prio" field * so that the console stays reserved for the waiter. * * Return: 0 on success. Otherwise, an error code on failure. Also @cur * is updated to the latest state when failed to modify it. * * Errors: * * -EPERM: A panic is in progress and this is not the panic CPU. * Or a higher priority context has taken over the * console or the handover request. * * -EBUSY: The current owner is on the same CPU so that the hand * shake could not work. Or the current owner is not * willing to wait (zero timeout). Or the console does * not enter the safe state before timeout passed. The * caller might still use the unsafe hostile takeover * when allowed. * * -EAGAIN: @cur has changed when creating the handover request. * The caller should retry with direct acquire. */ static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt, struct nbcon_state *cur) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state new; int timeout; int request_err = -EBUSY; /* * Check that the handover is called when the direct acquire failed * with -EBUSY. */ WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio); WARN_ON_ONCE(!cur->unsafe); /* * Panic does not imply that the console is owned. However, it * is critical that non-panic CPUs during panic are unable to * wait for a handover in order to satisfy the assumptions of * nbcon_waiter_matches(). In particular, the assumption that * lower priorities are ignored during panic. */ if (panic_on_other_cpu()) return -EPERM; /* Handover is not possible on the same CPU. */ if (cur->cpu == cpu) return -EBUSY; /* * Console stays unsafe after an unsafe takeover until re-initialized. * Waiting is not going to help in this case. */ if (cur->unsafe_takeover) return -EBUSY; /* Is the caller willing to wait? */ if (ctxt->spinwait_max_us == 0) return -EBUSY; /* * Setup a request for the handover. The caller should try to acquire * the console directly when the current state has been modified. */ new.atom = cur->atom; new.req_prio = ctxt->prio; if (!nbcon_state_try_cmpxchg(con, cur, &new)) return -EAGAIN; cur->atom = new.atom; /* Wait until there is no owner and then acquire the console. */ for (timeout = ctxt->spinwait_max_us; timeout >= 0; timeout--) { /* On successful acquire, this request is cleared. 
*/ request_err = nbcon_context_try_acquire_requested(ctxt, cur); if (!request_err) return 0; /* * If the acquire should be aborted, it must be ensured * that the request is removed before returning to caller. */ if (request_err == -EPERM) break; udelay(1); /* Re-read the state because some time has passed. */ nbcon_state_read(con, cur); } /* Timed out or aborted. Carefully remove handover request. */ do { /* * No need to remove request if there is a new waiter. This * can only happen if a higher priority context has taken over * the console or the handover request. */ if (!nbcon_waiter_matches(cur, ctxt->prio)) return -EPERM; /* Unset request for handover. */ new.atom = cur->atom; new.req_prio = NBCON_PRIO_NONE; if (nbcon_state_try_cmpxchg(con, cur, &new)) { /* * Request successfully unset. Report failure of * acquiring via handover. */ cur->atom = new.atom; return request_err; } /* * Unable to remove request. Try to acquire in case * the owner has released the lock. */ } while (nbcon_context_try_acquire_requested(ctxt, cur)); /* Lucky timing. The acquire succeeded while removing the request. */ return 0; } /** * nbcon_context_try_acquire_hostile - Acquire via unsafe hostile takeover * @ctxt: The context of the caller * @cur: The current console state * * Acquire the console even in the unsafe state. * * It can be permitted by setting the 'allow_unsafe_takeover' field only * by the final attempt to flush messages in panic(). * * Return: 0 on success. -EPERM when not allowed by the context. */ static int nbcon_context_try_acquire_hostile(struct nbcon_context *ctxt, struct nbcon_state *cur) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state new; if (!ctxt->allow_unsafe_takeover) return -EPERM; /* Ensure caller is allowed to perform unsafe hostile takeovers. */ if (WARN_ON_ONCE(ctxt->prio != NBCON_PRIO_PANIC)) return -EPERM; /* * Check that try_acquire_direct() and try_acquire_handover() returned * -EBUSY in the right situation. */ WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio); WARN_ON_ONCE(cur->unsafe != true); do { new.atom = cur->atom; new.cpu = cpu; new.prio = ctxt->prio; new.unsafe |= cur->unsafe_takeover; new.unsafe_takeover |= cur->unsafe; } while (!nbcon_state_try_cmpxchg(con, cur, &new)); return 0; } static struct printk_buffers panic_nbcon_pbufs; /** * nbcon_context_try_acquire - Try to acquire nbcon console * @ctxt: The context of the caller * @is_reacquire: This acquire is a reacquire * * Context: Under @ctxt->con->device_lock() or local_irq_save(). * Return: True if the console was acquired. False otherwise. * * If the caller allowed an unsafe hostile takeover, on success the * caller should check the current console state to see if it is * in an unsafe state. Otherwise, on success the caller may assume * the console is not in an unsafe state. */ static bool nbcon_context_try_acquire(struct nbcon_context *ctxt, bool is_reacquire) { struct console *con = ctxt->console; struct nbcon_state cur; int err; nbcon_state_read(con, &cur); try_again: err = nbcon_context_try_acquire_direct(ctxt, &cur, is_reacquire); if (err != -EBUSY) goto out; err = nbcon_context_try_acquire_handover(ctxt, &cur); if (err == -EAGAIN) goto try_again; if (err != -EBUSY) goto out; err = nbcon_context_try_acquire_hostile(ctxt, &cur); out: if (err) return false; /* Acquire succeeded. */ /* Assign the appropriate buffer for this context. 
*/ if (panic_on_this_cpu()) ctxt->pbufs = &panic_nbcon_pbufs; else ctxt->pbufs = con->pbufs; /* Set the record sequence for this context to print. */ ctxt->seq = nbcon_seq_read(ctxt->console); return true; } static bool nbcon_owner_matches(struct nbcon_state *cur, int expected_cpu, int expected_prio) { /* * A similar function, nbcon_waiter_matches(), only deals with * EMERGENCY and PANIC priorities. However, this function must also * deal with the NORMAL priority, which requires additional checks * and constraints. * * For the case where preemption and interrupts are disabled, it is * enough to also verify that the owning CPU has not changed. * * For the case where preemption or interrupts are enabled, an * external synchronization method *must* be used. In particular, * the driver-specific locking mechanism used in device_lock() * (including disabling migration) should be used. It prevents * scenarios such as: * * 1. [Task A] owns a context with NBCON_PRIO_NORMAL on [CPU X] and * is scheduled out. * 2. Another context takes over the lock with NBCON_PRIO_EMERGENCY * and releases it. * 3. [Task B] acquires a context with NBCON_PRIO_NORMAL on [CPU X] * and is scheduled out. * 4. [Task A] gets running on [CPU X] and sees that the console is * still owned by a task on [CPU X] with NBON_PRIO_NORMAL. Thus * [Task A] thinks it is the owner when it is not. */ if (cur->prio != expected_prio) return false; if (cur->cpu != expected_cpu) return false; return true; } /** * nbcon_context_release - Release the console * @ctxt: The nbcon context from nbcon_context_try_acquire() */ static void nbcon_context_release(struct nbcon_context *ctxt) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state cur; struct nbcon_state new; nbcon_state_read(con, &cur); do { if (!nbcon_owner_matches(&cur, cpu, ctxt->prio)) break; new.atom = cur.atom; new.prio = NBCON_PRIO_NONE; /* * If @unsafe_takeover is set, it is kept set so that * the state remains permanently unsafe. */ new.unsafe |= cur.unsafe_takeover; } while (!nbcon_state_try_cmpxchg(con, &cur, &new)); ctxt->pbufs = NULL; } /** * nbcon_context_can_proceed - Check whether ownership can proceed * @ctxt: The nbcon context from nbcon_context_try_acquire() * @cur: The current console state * * Return: True if this context still owns the console. False if * ownership was handed over or taken. * * Must be invoked when entering the unsafe state to make sure that it still * owns the lock. Also must be invoked when exiting the unsafe context * to eventually free the lock for a higher priority context which asked * for the friendly handover. * * It can be called inside an unsafe section when the console is just * temporary in safe state instead of exiting and entering the unsafe * state. * * Also it can be called in the safe context before doing an expensive * safe operation. It does not make sense to do the operation when * a higher priority context took the lock. * * When this function returns false then the calling context no longer owns * the console and is no longer allowed to go forward. In this case it must * back out immediately and carefully. The buffer content is also no longer * trusted since it no longer belongs to the calling context. */ static bool nbcon_context_can_proceed(struct nbcon_context *ctxt, struct nbcon_state *cur) { unsigned int cpu = smp_processor_id(); /* Make sure this context still owns the console. 
*/ if (!nbcon_owner_matches(cur, cpu, ctxt->prio)) return false; /* The console owner can proceed if there is no waiter. */ if (cur->req_prio == NBCON_PRIO_NONE) return true; /* * A console owner within an unsafe region is always allowed to * proceed, even if there are waiters. It can perform a handover * when exiting the unsafe region. Otherwise the waiter will * need to perform an unsafe hostile takeover. */ if (cur->unsafe) return true; /* Waiters always have higher priorities than owners. */ WARN_ON_ONCE(cur->req_prio <= cur->prio); /* * Having a safe point for take over and eventually a few * duplicated characters or a full line is way better than a * hostile takeover. Post processing can take care of the garbage. * Release and hand over. */ nbcon_context_release(ctxt); /* * It is not clear whether the waiter really took over ownership. The * outermost callsite must make the final decision whether console * ownership is needed for it to proceed. If yes, it must reacquire * ownership (possibly hostile) before carefully proceeding. * * The calling context no longer owns the console so go back all the * way instead of trying to implement reacquire heuristics in tons of * places. */ return false; } /** * nbcon_can_proceed - Check whether ownership can proceed * @wctxt: The write context that was handed to the write function * * Return: True if this context still owns the console. False if * ownership was handed over or taken. * * It is used in nbcon_enter_unsafe() to make sure that it still owns the * lock. Also it is used in nbcon_exit_unsafe() to eventually free the lock * for a higher priority context which asked for the friendly handover. * * It can be called inside an unsafe section when the console is just * temporary in safe state instead of exiting and entering the unsafe state. * * Also it can be called in the safe context before doing an expensive safe * operation. It does not make sense to do the operation when a higher * priority context took the lock. * * When this function returns false then the calling context no longer owns * the console and is no longer allowed to go forward. In this case it must * back out immediately and carefully. The buffer content is also no longer * trusted since it no longer belongs to the calling context. */ bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); struct console *con = ctxt->console; struct nbcon_state cur; nbcon_state_read(con, &cur); return nbcon_context_can_proceed(ctxt, &cur); } EXPORT_SYMBOL_GPL(nbcon_can_proceed); #define nbcon_context_enter_unsafe(c) __nbcon_context_update_unsafe(c, true) #define nbcon_context_exit_unsafe(c) __nbcon_context_update_unsafe(c, false) /** * __nbcon_context_update_unsafe - Update the unsafe bit in @con->nbcon_state * @ctxt: The nbcon context from nbcon_context_try_acquire() * @unsafe: The new value for the unsafe bit * * Return: True if the unsafe state was updated and this context still * owns the console. Otherwise false if ownership was handed * over or taken. * * This function allows console owners to modify the unsafe status of the * console. * * When this function returns false then the calling context no longer owns * the console and is no longer allowed to go forward. In this case it must * back out immediately and carefully. The buffer content is also no longer * trusted since it no longer belongs to the calling context. * * Internal helper to avoid duplicated code. 
*/ static bool __nbcon_context_update_unsafe(struct nbcon_context *ctxt, bool unsafe) { struct console *con = ctxt->console; struct nbcon_state cur; struct nbcon_state new; nbcon_state_read(con, &cur); do { /* * The unsafe bit must not be cleared if an * unsafe hostile takeover has occurred. */ if (!unsafe && cur.unsafe_takeover) goto out; if (!nbcon_context_can_proceed(ctxt, &cur)) return false; new.atom = cur.atom; new.unsafe = unsafe; } while (!nbcon_state_try_cmpxchg(con, &cur, &new)); cur.atom = new.atom; out: return nbcon_context_can_proceed(ctxt, &cur); } void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt, char *buf, unsigned int len) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); struct console *con = ctxt->console; struct nbcon_state cur; wctxt->outbuf = buf; wctxt->len = len; nbcon_state_read(con, &cur); wctxt->unsafe_takeover = cur.unsafe_takeover; } /** * nbcon_enter_unsafe - Enter an unsafe region in the driver * @wctxt: The write context that was handed to the write function * * Return: True if this context still owns the console. False if * ownership was handed over or taken. * * When this function returns false then the calling context no longer owns * the console and is no longer allowed to go forward. In this case it must * back out immediately and carefully. The buffer content is also no longer * trusted since it no longer belongs to the calling context. */ bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); bool is_owner; is_owner = nbcon_context_enter_unsafe(ctxt); if (!is_owner) nbcon_write_context_set_buf(wctxt, NULL, 0); return is_owner; } EXPORT_SYMBOL_GPL(nbcon_enter_unsafe); /** * nbcon_exit_unsafe - Exit an unsafe region in the driver * @wctxt: The write context that was handed to the write function * * Return: True if this context still owns the console. False if * ownership was handed over or taken. * * When this function returns false then the calling context no longer owns * the console and is no longer allowed to go forward. In this case it must * back out immediately and carefully. The buffer content is also no longer * trusted since it no longer belongs to the calling context. */ bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); bool ret; ret = nbcon_context_exit_unsafe(ctxt); if (!ret) nbcon_write_context_set_buf(wctxt, NULL, 0); return ret; } EXPORT_SYMBOL_GPL(nbcon_exit_unsafe); /** * nbcon_reacquire_nobuf - Reacquire a console after losing ownership * while printing * @wctxt: The write context that was handed to the write callback * * Since ownership can be lost at any time due to handover or takeover, a * printing context _must_ be prepared to back out immediately and * carefully. However, there are scenarios where the printing context must * reacquire ownership in order to finalize or revert hardware changes. * * This function allows a printing context to reacquire ownership using the * same priority as its previous ownership. * * Note that after a successful reacquire the printing context will have no * output buffer because that has been lost. This function cannot be used to * resume printing. 
*/ void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); while (!nbcon_context_try_acquire(ctxt, true)) cpu_relax(); nbcon_write_context_set_buf(wctxt, NULL, 0); } EXPORT_SYMBOL_GPL(nbcon_reacquire_nobuf); #ifdef CONFIG_PRINTK_EXECUTION_CTX static void wctxt_load_execution_ctx(struct nbcon_write_context *wctxt, struct printk_message *pmsg) { wctxt->cpu = pmsg->cpu; wctxt->pid = pmsg->pid; memcpy(wctxt->comm, pmsg->comm, sizeof(wctxt->comm)); static_assert(sizeof(wctxt->comm) == sizeof(pmsg->comm)); } #else static void wctxt_load_execution_ctx(struct nbcon_write_context *wctxt, struct printk_message *pmsg) {} #endif /** * nbcon_emit_next_record - Emit a record in the acquired context * @wctxt: The write context that will be handed to the write function * @use_atomic: True if the write_atomic() callback is to be used * * Return: True if this context still owns the console. False if * ownership was handed over or taken. * * When this function returns false then the calling context no longer owns * the console and is no longer allowed to go forward. In this case it must * back out immediately and carefully. The buffer content is also no longer * trusted since it no longer belongs to the calling context. If the caller * wants to do more it must reacquire the console first. * * When true is returned, @wctxt->ctxt.backlog indicates whether there are * still records pending in the ringbuffer, */ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt, bool use_atomic) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); struct console *con = ctxt->console; bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED; struct printk_message pmsg = { .pbufs = ctxt->pbufs, }; unsigned long con_dropped; struct nbcon_state cur; unsigned long dropped; unsigned long ulseq; /* * This function should never be called for consoles that have not * implemented the necessary callback for writing: i.e. legacy * consoles and, when atomic, nbcon consoles with no write_atomic(). * Handle it as if ownership was lost and try to continue. * * Note that for nbcon consoles the write_thread() callback is * mandatory and was already checked in nbcon_alloc(). */ if (WARN_ON_ONCE((use_atomic && !con->write_atomic) || !(console_srcu_read_flags(con) & CON_NBCON))) { nbcon_context_release(ctxt); return false; } /* * The printk buffers are filled within an unsafe section. This * prevents NBCON_PRIO_NORMAL and NBCON_PRIO_EMERGENCY from * clobbering each other. */ if (!nbcon_context_enter_unsafe(ctxt)) return false; ctxt->backlog = printk_get_next_message(&pmsg, ctxt->seq, is_extended, true); if (!ctxt->backlog) return nbcon_context_exit_unsafe(ctxt); /* * @con->dropped is not protected in case of an unsafe hostile * takeover. In that situation the update can be racy so * annotate it accordingly. */ con_dropped = data_race(READ_ONCE(con->dropped)); dropped = con_dropped + pmsg.dropped; if (dropped && !is_extended) console_prepend_dropped(&pmsg, dropped); /* * If the previous owner was assigned the same record, this context * has taken over ownership and is replaying the record. Prepend a * message to let the user know the record is replayed. */ ulseq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_prev_seq)); if (__ulseq_to_u64seq(prb, ulseq) == pmsg.seq) { console_prepend_replay(&pmsg); } else { /* * Ensure this context is still the owner before trying to * update @nbcon_prev_seq. 
Otherwise the value in @ulseq may * not be from the previous owner and instead be some later * value from the context that took over ownership. */ nbcon_state_read(con, &cur); if (!nbcon_context_can_proceed(ctxt, &cur)) return false; atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_prev_seq), &ulseq, __u64seq_to_ulseq(pmsg.seq)); } if (!nbcon_context_exit_unsafe(ctxt)) return false; /* For skipped records just update seq/dropped in @con. */ if (pmsg.outbuf_len == 0) goto update_con; /* Initialize the write context for driver callbacks. */ nbcon_write_context_set_buf(wctxt, &pmsg.pbufs->outbuf[0], pmsg.outbuf_len); wctxt_load_execution_ctx(wctxt, &pmsg); if (use_atomic) con->write_atomic(con, wctxt); else con->write_thread(con, wctxt); if (!wctxt->outbuf) { /* * Ownership was lost and reacquired by the driver. Handle it * as if ownership was lost. */ nbcon_context_release(ctxt); return false; } /* * Ownership may have been lost but _not_ reacquired by the driver. * This case is detected and handled when entering unsafe to update * dropped/seq values. */ /* * Since any dropped message was successfully output, reset the * dropped count for the console. */ dropped = 0; update_con: /* * The dropped count and the sequence number are updated within an * unsafe section. This limits update races to the panic context and * allows the panic context to win. */ if (!nbcon_context_enter_unsafe(ctxt)) return false; if (dropped != con_dropped) { /* Counterpart to the READ_ONCE() above. */ WRITE_ONCE(con->dropped, dropped); } nbcon_seq_try_update(ctxt, pmsg.seq + 1); return nbcon_context_exit_unsafe(ctxt); } /* * nbcon_emit_one - Print one record for an nbcon console using the * specified callback * @wctxt: An initialized write context struct to use for this context * @use_atomic: True if the write_atomic() callback is to be used * * Return: True, when a record has been printed and there are still * pending records. The caller might want to continue flushing. * * False, when there is no pending record, or when the console * context cannot be acquired, or the ownership has been lost. * The caller should give up. Either the job is done, cannot be * done, or will be handled by the owning context. * * This is an internal helper to handle the locking of the console before * calling nbcon_emit_next_record(). */ static bool nbcon_emit_one(struct nbcon_write_context *wctxt, bool use_atomic) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); struct console *con = ctxt->console; unsigned long flags; bool ret = false; if (!use_atomic) { con->device_lock(con, &flags); /* * Ensure this stays on the CPU to make handover and * takeover possible. */ cant_migrate(); } if (!nbcon_context_try_acquire(ctxt, false)) goto out; /* * nbcon_emit_next_record() returns false when the console was * handed over or taken over. In both cases the context is no * longer valid. * * The higher priority printing context takes over responsibility * to print the pending records. */ if (!nbcon_emit_next_record(wctxt, use_atomic)) goto out; nbcon_context_release(ctxt); ret = ctxt->backlog; out: if (!use_atomic) con->device_unlock(con, flags); return ret; } /** * nbcon_kthread_should_wakeup - Check whether a printer thread should wakeup * @con: Console to operate on * @ctxt: The nbcon context from nbcon_context_try_acquire() * * Return: True if the thread should shutdown or if the console is * allowed to print and a record is available. False otherwise. 
* * After the thread wakes up, it must first check if it should shutdown before * attempting any printing. */ static bool nbcon_kthread_should_wakeup(struct console *con, struct nbcon_context *ctxt) { bool ret = false; short flags; int cookie; if (kthread_should_stop()) return true; /* * Block the kthread when the system is in an emergency or panic mode. * It increases the chance that these contexts would be able to show * the messages directly. And it reduces the risk of interrupted writes * where the context with a higher priority takes over the nbcon console * ownership in the middle of a message. */ if (unlikely(atomic_read(&nbcon_cpu_emergency_cnt)) || unlikely(panic_in_progress())) return false; cookie = console_srcu_read_lock(); flags = console_srcu_read_flags(con); if (console_is_usable(con, flags, false)) { /* Bring the sequence in @ctxt up to date */ ctxt->seq = nbcon_seq_read(con); ret = prb_read_valid(prb, ctxt->seq, NULL); } console_srcu_read_unlock(cookie); return ret; } /** * nbcon_kthread_func - The printer thread function * @__console: Console to operate on * * Return: 0 */ static int nbcon_kthread_func(void *__console) { struct console *con = __console; struct nbcon_write_context wctxt = { .ctxt.console = con, .ctxt.prio = NBCON_PRIO_NORMAL, }; struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); short con_flags; bool backlog; int cookie; wait_for_event: /* * Guarantee this task is visible on the rcuwait before * checking the wake condition. * * The full memory barrier within set_current_state() of * ___rcuwait_wait_event() pairs with the full memory * barrier within rcuwait_has_sleeper(). * * This pairs with rcuwait_has_sleeper:A and nbcon_kthread_wake:A. */ rcuwait_wait_event(&con->rcuwait, nbcon_kthread_should_wakeup(con, ctxt), TASK_INTERRUPTIBLE); /* LMM(nbcon_kthread_func:A) */ do { if (kthread_should_stop()) return 0; /* * Block the kthread when the system is in an emergency or panic * mode. See nbcon_kthread_should_wakeup() for more details. */ if (unlikely(atomic_read(&nbcon_cpu_emergency_cnt)) || unlikely(panic_in_progress())) goto wait_for_event; backlog = false; /* * Keep the srcu read lock around the entire operation so that * synchronize_srcu() can guarantee that the kthread stopped * or suspended printing. */ cookie = console_srcu_read_lock(); con_flags = console_srcu_read_flags(con); if (console_is_usable(con, con_flags, false)) backlog = nbcon_emit_one(&wctxt, false); console_srcu_read_unlock(cookie); cond_resched(); } while (backlog); goto wait_for_event; } /** * nbcon_irq_work - irq work to wake console printer thread * @irq_work: The irq work to operate on */ static void nbcon_irq_work(struct irq_work *irq_work) { struct console *con = container_of(irq_work, struct console, irq_work); nbcon_kthread_wake(con); } static inline bool rcuwait_has_sleeper(struct rcuwait *w) { /* * Guarantee any new records can be seen by tasks preparing to wait * before this context checks if the rcuwait is empty. * * This full memory barrier pairs with the full memory barrier within * set_current_state() of ___rcuwait_wait_event(), which is called * after prepare_to_rcuwait() adds the waiter but before it has * checked the wait condition. * * This pairs with nbcon_kthread_func:A. 
*/ smp_mb(); /* LMM(rcuwait_has_sleeper:A) */ return rcuwait_active(w); } /** * nbcon_kthreads_wake - Wake up printing threads using irq_work */ void nbcon_kthreads_wake(void) { struct console *con; int cookie; if (!printk_kthreads_running) return; /* * It is not allowed to call this function when console irq_work * is blocked. */ if (WARN_ON_ONCE(console_irqwork_blocked)) return; cookie = console_srcu_read_lock(); for_each_console_srcu(con) { if (!(console_srcu_read_flags(con) & CON_NBCON)) continue; /* * Only schedule irq_work if the printing thread is * actively waiting. If not waiting, the thread will * notice by itself that it has work to do. */ if (rcuwait_has_sleeper(&con->rcuwait)) irq_work_queue(&con->irq_work); } console_srcu_read_unlock(cookie); } /* * nbcon_kthread_stop - Stop a console printer thread * @con: Console to operate on */ void nbcon_kthread_stop(struct console *con) { lockdep_assert_console_list_lock_held(); if (!con->kthread) return; kthread_stop(con->kthread); con->kthread = NULL; } /** * nbcon_kthread_create - Create a console printer thread * @con: Console to operate on * * Return: True if the kthread was started or already exists. * Otherwise false and @con must not be registered. * * This function is called when it will be expected that nbcon consoles are * flushed using the kthread. The messages printed with NBCON_PRIO_NORMAL * will be no longer flushed by the legacy loop. This is why failure must * be fatal for console registration. * * If @con was already registered and this function fails, @con must be * unregistered before the global state variable @printk_kthreads_running * can be set. */ bool nbcon_kthread_create(struct console *con) { struct task_struct *kt; lockdep_assert_console_list_lock_held(); if (con->kthread) return true; kt = kthread_run(nbcon_kthread_func, con, "pr/%s%d", con->name, con->index); if (WARN_ON(IS_ERR(kt))) { con_printk(KERN_ERR, con, "failed to start printing thread\n"); return false; } con->kthread = kt; /* * It is important that console printing threads are scheduled * shortly after a printk call and with generous runtime budgets. */ sched_set_normal(con->kthread, -20); return true; } /* Track the nbcon emergency nesting per CPU. */ static DEFINE_PER_CPU(unsigned int, nbcon_pcpu_emergency_nesting); static unsigned int early_nbcon_pcpu_emergency_nesting __initdata; /** * nbcon_get_cpu_emergency_nesting - Get the per CPU emergency nesting pointer * * Context: For reading, any context. For writing, any context which could * not be migrated to another CPU. * Return: Either a pointer to the per CPU emergency nesting counter of * the current CPU or to the init data during early boot. * * The function is safe for reading per-CPU variables in any context because * preemption is disabled if the current CPU is in the emergency state. See * also nbcon_cpu_emergency_enter(). */ static __ref unsigned int *nbcon_get_cpu_emergency_nesting(void) { /* * The value of __printk_percpu_data_ready gets set in normal * context and before SMP initialization. As a result it could * never change while inside an nbcon emergency section. */ if (!printk_percpu_data_ready()) return &early_nbcon_pcpu_emergency_nesting; return raw_cpu_ptr(&nbcon_pcpu_emergency_nesting); } /** * nbcon_get_default_prio - The appropriate nbcon priority to use for nbcon * printing on the current CPU * * Context: Any context. * Return: The nbcon_prio to use for acquiring an nbcon console in this * context for printing. 
* * The function is safe for reading per-CPU data in any context because * preemption is disabled if the current CPU is in the emergency or panic * state. */ enum nbcon_prio nbcon_get_default_prio(void) { unsigned int *cpu_emergency_nesting; if (panic_on_this_cpu()) return NBCON_PRIO_PANIC; cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); if (*cpu_emergency_nesting) return NBCON_PRIO_EMERGENCY; return NBCON_PRIO_NORMAL; } /* * Track if it is allowed to perform unsafe hostile takeovers of console * ownership. When true, console drivers might perform unsafe actions while * printing. It is externally available via nbcon_allow_unsafe_takeover(). */ static bool panic_nbcon_allow_unsafe_takeover; /** * nbcon_allow_unsafe_takeover - Check if unsafe console takeovers are allowed * * Return: True, when it is permitted to perform unsafe console printing * * This is also used by console_is_usable() to determine if it is allowed to * call write_atomic() callbacks flagged as unsafe (CON_NBCON_ATOMIC_UNSAFE). */ bool nbcon_allow_unsafe_takeover(void) { return panic_on_this_cpu() && panic_nbcon_allow_unsafe_takeover; } /** * nbcon_legacy_emit_next_record - Print one record for an nbcon console * in legacy contexts * @con: The console to print on * @handover: Will be set to true if a printk waiter has taken over the * console_lock, in which case the caller is no longer holding * both the console_lock and the SRCU read lock. Otherwise it * is set to false. * @cookie: The cookie from the SRCU read lock. * @use_atomic: Set true when called in an atomic or unknown context. * It affects which nbcon callback will be used: write_atomic() * or write_thread(). * * When false, the write_thread() callback is used and would be * called in a preemtible context unless disabled by the * device_lock. The legacy handover is not allowed in this mode. * * Context: Any context except NMI. * Return: True, when a record has been printed and there are still * pending records. The caller might want to continue flushing. * * False, when there is no pending record, or when the console * context cannot be acquired, or the ownership has been lost. * The caller should give up. Either the job is done, cannot be * done, or will be handled by the owning context. * * This function is meant to be called by console_flush_all() to print records * on nbcon consoles from legacy context (printing via console unlocking). * Essentially it is the nbcon version of console_emit_next_record(). */ bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, int cookie, bool use_atomic) { struct nbcon_write_context wctxt = { }; struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); unsigned long flags; bool progress; ctxt->console = con; ctxt->prio = nbcon_get_default_prio(); if (use_atomic) { /* * In an atomic or unknown context, use the same procedure as * in console_emit_next_record(). It allows to handover. */ printk_safe_enter_irqsave(flags); console_lock_spinning_enable(); stop_critical_timings(); } progress = nbcon_emit_one(&wctxt, use_atomic); if (use_atomic) { start_critical_timings(); *handover = console_lock_spinning_disable_and_check(cookie); printk_safe_exit_irqrestore(flags); } else { /* Non-atomic does not perform legacy spinning handovers. 
*/ *handover = false; } return progress; } /** * __nbcon_atomic_flush_pending_con - Flush specified nbcon console using its * write_atomic() callback * @con: The nbcon console to flush * @stop_seq: Flush up until this record * * Return: 0 if @con was flushed up to @stop_seq Otherwise, error code on * failure. * * Errors: * * -EPERM: Unable to acquire console ownership. * * -EAGAIN: Another context took over ownership while printing. * * -ENOENT: A record before @stop_seq is not available. * * If flushing up to @stop_seq was not successful, it only makes sense for the * caller to try again when -EAGAIN was returned. When -EPERM is returned, * this context is not allowed to acquire the console. When -ENOENT is * returned, it cannot be expected that the unfinalized record will become * available. */ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq) { struct nbcon_write_context wctxt = { }; struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); int err = 0; ctxt->console = con; ctxt->spinwait_max_us = 2000; ctxt->prio = nbcon_get_default_prio(); ctxt->allow_unsafe_takeover = nbcon_allow_unsafe_takeover(); while (nbcon_seq_read(con) < stop_seq) { /* * Atomic flushing does not use console driver synchronization * (i.e. it does not hold the port lock for uart consoles). * Therefore IRQs must be disabled to avoid being interrupted * and then calling into a driver that will deadlock trying * to acquire console ownership. */ scoped_guard(irqsave) { if (!nbcon_context_try_acquire(ctxt, false)) return -EPERM; /* * nbcon_emit_next_record() returns false when * the console was handed over or taken over. * In both cases the context is no longer valid. */ if (!nbcon_emit_next_record(&wctxt, true)) return -EAGAIN; nbcon_context_release(ctxt); } if (!ctxt->backlog) { /* Are there reserved but not yet finalized records? */ if (nbcon_seq_read(con) < stop_seq) err = -ENOENT; break; } } return err; } /** * nbcon_atomic_flush_pending_con - Flush specified nbcon console using its * write_atomic() callback * @con: The nbcon console to flush * @stop_seq: Flush up until this record * * This will stop flushing before @stop_seq if another context has ownership. * That context is then responsible for the flushing. Likewise, if new records * are added while this context was flushing and there is no other context * to handle the printing, this context must also flush those records. */ static void nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq) { struct console_flush_type ft; int err; again: err = __nbcon_atomic_flush_pending_con(con, stop_seq); /* * If there was a new owner (-EPERM, -EAGAIN), that context is * responsible for completing. * * Do not wait for records not yet finalized (-ENOENT) to avoid a * possible deadlock. They will either get flushed by the writer or * eventually skipped on panic CPU. */ if (err) return; /* * If flushing was successful but more records are available, this * context must flush those remaining records if the printer thread * is not available do it. 
*/ printk_get_console_flush_type(&ft); if (!ft.nbcon_offload && prb_read_valid(prb, nbcon_seq_read(con), NULL)) { stop_seq = prb_next_reserve_seq(prb); goto again; } } /** * __nbcon_atomic_flush_pending - Flush all nbcon consoles using their * write_atomic() callback * @stop_seq: Flush up until this record */ static void __nbcon_atomic_flush_pending(u64 stop_seq) { struct console *con; int cookie; cookie = console_srcu_read_lock(); for_each_console_srcu(con) { short flags = console_srcu_read_flags(con); if (!(flags & CON_NBCON)) continue; if (!console_is_usable(con, flags, true)) continue; if (nbcon_seq_read(con) >= stop_seq) continue; nbcon_atomic_flush_pending_con(con, stop_seq); } console_srcu_read_unlock(cookie); } /** * nbcon_atomic_flush_pending - Flush all nbcon consoles using their * write_atomic() callback * * Flush the backlog up through the currently newest record. Any new * records added while flushing will not be flushed if there is another * context available to handle the flushing. This is to avoid one CPU * printing unbounded because other CPUs continue to add records. */ void nbcon_atomic_flush_pending(void) { __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb)); } /** * nbcon_atomic_flush_unsafe - Flush all nbcon consoles using their * write_atomic() callback and allowing unsafe hostile takeovers * * Flush the backlog up through the currently newest record. Unsafe hostile * takeovers will be performed, if necessary. */ void nbcon_atomic_flush_unsafe(void) { panic_nbcon_allow_unsafe_takeover = true; __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb)); panic_nbcon_allow_unsafe_takeover = false; } /** * nbcon_cpu_emergency_enter - Enter an emergency section where printk() * messages for that CPU are flushed directly * * Context: Any context. Disables preemption. * * When within an emergency section, printk() calls will attempt to flush any * pending messages in the ringbuffer. */ void nbcon_cpu_emergency_enter(void) { unsigned int *cpu_emergency_nesting; preempt_disable(); atomic_inc(&nbcon_cpu_emergency_cnt); cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); (*cpu_emergency_nesting)++; } /** * nbcon_cpu_emergency_exit - Exit an emergency section * * Context: Within an emergency section. Enables preemption. */ void nbcon_cpu_emergency_exit(void) { unsigned int *cpu_emergency_nesting; cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); if (!WARN_ON_ONCE(*cpu_emergency_nesting == 0)) (*cpu_emergency_nesting)--; /* * Wake up kthreads because there might be some pending messages * added by other CPUs with normal priority since the last flush * in the emergency context. */ if (!WARN_ON_ONCE(atomic_read(&nbcon_cpu_emergency_cnt) == 0)) { if (atomic_dec_return(&nbcon_cpu_emergency_cnt) == 0) { struct console_flush_type ft; printk_get_console_flush_type(&ft); if (ft.nbcon_offload) nbcon_kthreads_wake(); } } preempt_enable(); } /** * nbcon_alloc - Allocate and init the nbcon console specific data * @con: Console to initialize * * Return: True if the console was fully allocated and initialized. * Otherwise @con must not be registered. * * When allocation and init was successful, the console must be properly * freed using nbcon_free() once it is no longer needed. */ bool nbcon_alloc(struct console *con) { struct nbcon_state state = { }; /* Synchronize the kthread start. */ lockdep_assert_console_list_lock_held(); /* Check for mandatory nbcon callbacks. 
*/ if (WARN_ON(!con->write_thread || !con->device_lock || !con->device_unlock)) { return false; } rcuwait_init(&con->rcuwait); init_irq_work(&con->irq_work, nbcon_irq_work); atomic_long_set(&ACCESS_PRIVATE(con, nbcon_prev_seq), -1UL); nbcon_state_set(con, &state); /* * Initialize @nbcon_seq to the highest possible sequence number so * that practically speaking it will have nothing to print until a * desired initial sequence number has been set via nbcon_seq_force(). */ atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), ULSEQ_MAX(prb)); if (con->flags & CON_BOOT) { /* * Boot console printing is synchronized with legacy console * printing, so boot consoles can share the same global printk * buffers. */ con->pbufs = &printk_shared_pbufs; } else { con->pbufs = kmalloc_obj(*con->pbufs); if (!con->pbufs) { con_printk(KERN_ERR, con, "failed to allocate printing buffer\n"); return false; } if (printk_kthreads_ready && !have_boot_console) { if (!nbcon_kthread_create(con)) { kfree(con->pbufs); con->pbufs = NULL; return false; } /* Might be the first kthread. */ printk_kthreads_running = true; } } return true; } /** * nbcon_free - Free and cleanup the nbcon console specific data * @con: Console to free/cleanup nbcon data * * Important: @have_nbcon_console must be updated before calling * this function. In particular, it can be set only when there * is still another nbcon console registered. */ void nbcon_free(struct console *con) { struct nbcon_state state = { }; /* Synchronize the kthread stop. */ lockdep_assert_console_list_lock_held(); if (printk_kthreads_running) { nbcon_kthread_stop(con); /* Might be the last nbcon console. * * Do not rely on printk_kthreads_check_locked(). It is not * called in some code paths, see nbcon_free() callers. */ if (!have_nbcon_console) printk_kthreads_running = false; } nbcon_state_set(con, &state); /* Boot consoles share global printk buffers. */ if (!(con->flags & CON_BOOT)) kfree(con->pbufs); con->pbufs = NULL; } /** * nbcon_device_try_acquire - Try to acquire nbcon console and enter unsafe * section * @con: The nbcon console to acquire * * Context: Under the locking mechanism implemented in * @con->device_lock() including disabling migration. * Return: True if the console was acquired. False otherwise. * * Console drivers will usually use their own internal synchronization * mechasism to synchronize between console printing and non-printing * activities (such as setting baud rates). However, nbcon console drivers * supporting atomic consoles may also want to mark unsafe sections when * performing non-printing activities in order to synchronize against their * atomic_write() callback. * * This function acquires the nbcon console using priority NBCON_PRIO_NORMAL * and marks it unsafe for handover/takeover. 
 */
bool nbcon_device_try_acquire(struct console *con)
{
	struct nbcon_context *ctxt = &ACCESS_PRIVATE(con, nbcon_device_ctxt);

	cant_migrate();

	memset(ctxt, 0, sizeof(*ctxt));
	ctxt->console = con;
	ctxt->prio = NBCON_PRIO_NORMAL;

	if (!nbcon_context_try_acquire(ctxt, false))
		return false;

	if (!nbcon_context_enter_unsafe(ctxt))
		return false;

	return true;
}
EXPORT_SYMBOL_GPL(nbcon_device_try_acquire);

/**
 * nbcon_device_release - Exit unsafe section and release the nbcon console
 * @con:	The nbcon console acquired in nbcon_device_try_acquire()
 */
void nbcon_device_release(struct console *con)
{
	struct nbcon_context *ctxt = &ACCESS_PRIVATE(con, nbcon_device_ctxt);
	struct console_flush_type ft;
	int cookie;

	if (!nbcon_context_exit_unsafe(ctxt))
		return;

	nbcon_context_release(ctxt);

	/*
	 * This context must flush any new records added while the console
	 * was locked if the printer thread is not available to do it. The
	 * console_srcu_read_lock must be taken to ensure the console is
	 * usable throughout flushing.
	 */
	cookie = console_srcu_read_lock();
	printk_get_console_flush_type(&ft);
	if (console_is_usable(con, console_srcu_read_flags(con), true) &&
	    !ft.nbcon_offload &&
	    prb_read_valid(prb, nbcon_seq_read(con), NULL)) {
		/*
		 * If nbcon_atomic flushing is not available, fallback to
		 * using the legacy loop.
		 */
		if (ft.nbcon_atomic) {
			__nbcon_atomic_flush_pending_con(con, prb_next_reserve_seq(prb));
		} else if (ft.legacy_direct) {
			if (console_trylock())
				console_unlock();
		} else if (ft.legacy_offload) {
			defer_console_output();
		}
	}
	console_srcu_read_unlock(cookie);
}
EXPORT_SYMBOL_GPL(nbcon_device_release);

/**
 * nbcon_kdb_try_acquire - Try to acquire nbcon console and enter unsafe
 *				section
 * @con:	The nbcon console to acquire
 * @wctxt:	The nbcon write context to be used on success
 *
 * Context:	Under console_srcu_read_lock() for emitting a single kdb message
 *		using the given con->write_atomic() callback. Can be called
 *		only when the console is usable at the moment.
 *
 * Return:	True if the console was acquired. False otherwise.
 *
 * kdb emits messages on consoles registered for printk() without
 * storing them into the ring buffer. It has to acquire the console
 * ownership so that it can call the con->write_atomic() callback in a
 * safe way.
 *
 * This function acquires the nbcon console using priority NBCON_PRIO_EMERGENCY
 * and marks it unsafe for handover/takeover.
 */
bool nbcon_kdb_try_acquire(struct console *con,
			   struct nbcon_write_context *wctxt)
{
	struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);

	memset(ctxt, 0, sizeof(*ctxt));
	ctxt->console = con;
	ctxt->prio = NBCON_PRIO_EMERGENCY;

	if (!nbcon_context_try_acquire(ctxt, false))
		return false;

	if (!nbcon_context_enter_unsafe(ctxt))
		return false;

	return true;
}

/**
 * nbcon_kdb_release - Exit unsafe section and release the nbcon console
 *
 * @wctxt:	The nbcon write context initialized by a successful
 *		nbcon_kdb_try_acquire()
 */
void nbcon_kdb_release(struct nbcon_write_context *wctxt)
{
	struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);

	if (!nbcon_context_exit_unsafe(ctxt))
		return;

	nbcon_context_release(ctxt);

	/*
	 * Flush any new printk() messages added when the console was blocked.
	 * Only the console used by the given write context was blocked.
	 * The console was locked only when the write_atomic() callback
	 * was usable.
	 */
	__nbcon_atomic_flush_pending_con(ctxt->console, prb_next_reserve_seq(prb));
}
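To make the device-context API above concrete, here is a minimal sketch of how a console driver might use nbcon_device_try_acquire()/nbcon_device_release() around a non-printing hardware update, following the kernel-doc above. It is illustrative only: the example_* structure and functions, and the choice of a raw spinlock as the lock behind device_lock(), are assumptions made for this sketch and are not part of the file.

#include <linux/console.h>
#include <linux/spinlock.h>

/* Hypothetical per-port data; only the nbcon_device_*() calls are real APIs. */
struct example_con_port {
	spinlock_t lock;		/* the lock behind con->device_lock() */
	struct console *con;		/* the registered nbcon console */
};

/* Hypothetical hardware poke, e.g. writing a new baud divisor. */
static void example_program_divisor(struct example_con_port *port,
				    unsigned int baud)
{
}

static void example_set_baud(struct example_con_port *port, unsigned int baud)
{
	unsigned long flags;

	/*
	 * Taking the driver lock with IRQs off keeps this CPU from
	 * migrating, matching the context documented for
	 * nbcon_device_try_acquire().
	 */
	spin_lock_irqsave(&port->lock, flags);

	/* Mark the console unsafe so write_atomic() cannot race the update. */
	if (nbcon_device_try_acquire(port->con)) {
		example_program_divisor(port, baud);
		nbcon_device_release(port->con);
	}

	spin_unlock_irqrestore(&port->lock, flags);
}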
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_KDEV_T_H
#define _LINUX_KDEV_T_H

#include <uapi/linux/kdev_t.h>

#define MINORBITS	20
#define MINORMASK	((1U << MINORBITS) - 1)

#define MAJOR(dev)	((unsigned int) ((dev) >> MINORBITS))
#define MINOR(dev)	((unsigned int) ((dev) & MINORMASK))
#define MKDEV(ma,mi)	(((ma) << MINORBITS) | (mi))

#define print_dev_t(buffer, dev)					\
	sprintf((buffer), "%u:%u\n", MAJOR(dev), MINOR(dev))

#define format_dev_t(buffer, dev)					\
	({								\
		sprintf(buffer, "%u:%u", MAJOR(dev), MINOR(dev));	\
		buffer;							\
	})

/* acceptable for old filesystems */
static __always_inline bool old_valid_dev(dev_t dev)
{
	return MAJOR(dev) < 256 && MINOR(dev) < 256;
}

static __always_inline u16 old_encode_dev(dev_t dev)
{
	return (MAJOR(dev) << 8) | MINOR(dev);
}

static __always_inline dev_t old_decode_dev(u16 val)
{
	return MKDEV((val >> 8) & 255, val & 255);
}

static __always_inline u32 new_encode_dev(dev_t dev)
{
	unsigned major = MAJOR(dev);
	unsigned minor = MINOR(dev);

	return (minor & 0xff) | (major << 8) | ((minor & ~0xff) << 12);
}

static __always_inline dev_t new_decode_dev(u32 dev)
{
	unsigned major = (dev & 0xfff00) >> 8;
	unsigned minor = (dev & 0xff) | ((dev >> 12) & 0xfff00);

	return MKDEV(major, minor);
}

static __always_inline u64 huge_encode_dev(dev_t dev)
{
	return new_encode_dev(dev);
}

static __always_inline dev_t huge_decode_dev(u64 dev)
{
	return new_decode_dev(dev);
}

static __always_inline int sysv_valid_dev(dev_t dev)
{
	return MAJOR(dev) < (1<<14) && MINOR(dev) < (1<<18);
}

static __always_inline u32 sysv_encode_dev(dev_t dev)
{
	return MINOR(dev) | (MAJOR(dev) << 18);
}

static __always_inline unsigned sysv_major(u32 dev)
{
	return (dev >> 18) & 0x3fff;
}

static __always_inline unsigned sysv_minor(u32 dev)
{
	return dev & 0x3ffff;
}

#endif
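As a quick illustration of how the helpers in this header fit together, the sketch below round-trips a dev_t through new_encode_dev()/new_decode_dev() (the 32-bit encoding keeps the low 8 bits of the 20-bit minor in bits 0-7, the 12-bit major in bits 8-19, and the remaining minor bits above that) and checks old_valid_dev(). The example function, the chosen device numbers, and the pr_info() reporting are hypothetical and exist only for this illustration.

#include <linux/kdev_t.h>
#include <linux/printk.h>
#include <linux/types.h>

static void example_dev_t_roundtrip(void)
{
	dev_t dev = MKDEV(8, 0x12345);		/* major 8, 20-bit minor */
	u32 packed = new_encode_dev(dev);	/* 0x12300845 for these values */
	dev_t back = new_decode_dev(packed);

	/* The 32-bit encoding splits the minor around the 12-bit major. */
	pr_info("major=%u minor=%u packed=%#x roundtrip_ok=%d\n",
		MAJOR(back), MINOR(back), packed, back == dev);

	/* Only small major/minor numbers fit the legacy 16-bit encoding. */
	pr_info("old_valid_dev=%d\n", old_valid_dev(dev));
}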
#include <linux/gfp.h>
#include <linux/initrd.h>
#include <linux/ioport.h>
#include <linux/swap.h>
#include <linux/memblock.h>
#include <linux/swapfile.h>
#include <linux/swapops.h>
#include <linux/kmemleak.h>
#include <linux/sched/task.h>
#include <linux/execmem.h>

#include <asm/set_memory.h>
#include <asm/cpu_device_id.h>
#include <asm/e820/api.h>
#include <asm/init.h>
#include <asm/page.h>
#include <asm/page_types.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/proto.h>
#include <asm/dma.h>		/* for MAX_DMA_PFN */
#include <asm/kaslr.h>
#include <asm/hypervisor.h>
#include <asm/cpufeature.h>
#include <asm/pti.h>
#include <asm/text-patching.h>
#include <asm/memtype.h>
#include <asm/mmu_context.h>

/*
 * We need to define the tracepoints somewhere, and tlb.c
 * is only compiled when SMP=y.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/tlb.h>

#include "mm_internal.h"

/*
 * Tables translating between page_cache_type_t and pte encoding.
 *
 * The default values are defined statically as minimal supported mode;
 * WC and WT fall back to UC-. pat_init() updates these values to support
 * more cache modes, WC and WT, when it is safe to do so. See pat_init()
 * for the details. Note, __early_ioremap() used during early boot-time
 * takes pgprot_t (pte encoding) and does not use these tables.
 *
 * Index into __cachemode2pte_tbl[] is the cachemode.
 *
 * Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte
 * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
*/ static uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { [_PAGE_CACHE_MODE_WB ] = 0 | 0 , [_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC ] = _PAGE_PWT | _PAGE_PCD, [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD, }; unsigned long cachemode2protval(enum page_cache_mode pcm) { if (likely(pcm == 0)) return 0; return __cachemode2pte_tbl[pcm]; } EXPORT_SYMBOL(cachemode2protval); static uint8_t __pte2cachemode_tbl[8] = { [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB, [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC, [__pte2cm_idx( 0 | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB, [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, }; /* * Check that the write-protect PAT entry is set for write-protect. * To do this without making assumptions how PAT has been set up (Xen has * another layout than the kernel), translate the _PAGE_CACHE_MODE_WP cache * mode via the __cachemode2pte_tbl[] into protection bits (those protection * bits will select a cache mode of WP or better), and then translate the * protection bits back into the cache mode using __pte2cm_idx() and the * __pte2cachemode_tbl[] array. This will return the really used cache mode. */ bool x86_has_pat_wp(void) { uint16_t prot = __cachemode2pte_tbl[_PAGE_CACHE_MODE_WP]; return __pte2cachemode_tbl[__pte2cm_idx(prot)] == _PAGE_CACHE_MODE_WP; } enum page_cache_mode pgprot2cachemode(pgprot_t pgprot) { unsigned long masked; masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK; if (likely(masked == 0)) return 0; return __pte2cachemode_tbl[__pte2cm_idx(masked)]; } static unsigned long __initdata pgt_buf_start; static unsigned long __initdata pgt_buf_end; static unsigned long __initdata pgt_buf_top; static unsigned long min_pfn_mapped; static bool __initdata can_use_brk_pgt = true; /* * Pages returned are already directly mapped. * * Changing that is likely to break Xen, see commit: * * 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve * * for detailed information. */ __ref void *alloc_low_pages(unsigned int num) { unsigned long pfn; int i; if (after_bootmem) { unsigned int order; order = get_order((unsigned long)num << PAGE_SHIFT); return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order); } if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) { unsigned long ret = 0; if (min_pfn_mapped < max_pfn_mapped) { ret = memblock_phys_alloc_range( PAGE_SIZE * num, PAGE_SIZE, min_pfn_mapped << PAGE_SHIFT, max_pfn_mapped << PAGE_SHIFT); } if (!ret && can_use_brk_pgt) ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE)); if (!ret) panic("alloc_low_pages: can not alloc memory"); pfn = ret >> PAGE_SHIFT; } else { pfn = pgt_buf_end; pgt_buf_end += num; } for (i = 0; i < num; i++) { void *adr; adr = __va((pfn + i) << PAGE_SHIFT); clear_page(adr); } return __va(pfn << PAGE_SHIFT); } /* * By default need to be able to allocate page tables below PGD firstly for * the 0-ISA_END_ADDRESS range and secondly for the initial PMD_SIZE mapping. * With KASLR memory randomization, depending on the machine e820 memory and the * PUD alignment, twice that many pages may be needed when KASLR memory * randomization is enabled. 
*/ #define INIT_PGD_PAGE_TABLES 4 #ifndef CONFIG_RANDOMIZE_MEMORY #define INIT_PGD_PAGE_COUNT (2 * INIT_PGD_PAGE_TABLES) #else #define INIT_PGD_PAGE_COUNT (4 * INIT_PGD_PAGE_TABLES) #endif #define INIT_PGT_BUF_SIZE (INIT_PGD_PAGE_COUNT * PAGE_SIZE) RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); void __init early_alloc_pgt_buf(void) { unsigned long tables = INIT_PGT_BUF_SIZE; phys_addr_t base; base = __pa(extend_brk(tables, PAGE_SIZE)); pgt_buf_start = base >> PAGE_SHIFT; pgt_buf_end = pgt_buf_start; pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); } int after_bootmem; early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES); struct map_range { unsigned long start; unsigned long end; unsigned page_size_mask; }; static int page_size_mask; /* * Save some of cr4 feature set we're using (e.g. Pentium 4MB * enable and PPro Global page enable), so that any CPU's that boot * up after us can get the correct flags. Invoked on the boot CPU. */ static inline void cr4_set_bits_and_update_boot(unsigned long mask) { mmu_cr4_features |= mask; if (trampoline_cr4_features) *trampoline_cr4_features = mmu_cr4_features; cr4_set_bits(mask); } static void __init probe_page_size_mask(void) { /* * For pagealloc debugging, identity mapping will use small pages. * This will simplify cpa(), which otherwise needs to support splitting * large pages into small in interrupt context, etc. */ if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled()) page_size_mask |= 1 << PG_LEVEL_2M; else direct_gbpages = 0; /* Enable PSE if available */ if (boot_cpu_has(X86_FEATURE_PSE)) cr4_set_bits_and_update_boot(X86_CR4_PSE); /* Enable PGE if available */ __supported_pte_mask &= ~_PAGE_GLOBAL; if (boot_cpu_has(X86_FEATURE_PGE)) { cr4_set_bits_and_update_boot(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; } /* By the default is everything supported: */ __default_kernel_pte_mask = __supported_pte_mask; /* Except when with PTI where the kernel is mostly non-Global: */ if (cpu_feature_enabled(X86_FEATURE_PTI)) __default_kernel_pte_mask &= ~_PAGE_GLOBAL; /* Enable 1 GB linear kernel mappings if available: */ if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) { printk(KERN_INFO "Using GB pages for direct mapping\n"); page_size_mask |= 1 << PG_LEVEL_1G; } else { direct_gbpages = 0; } } /* * INVLPG may not properly flush Global entries on * these CPUs. New microcode fixes the issue. */ static const struct x86_cpu_id invlpg_miss_ids[] = { X86_MATCH_VFM(INTEL_ALDERLAKE, 0x2e), X86_MATCH_VFM(INTEL_ALDERLAKE_L, 0x42c), X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, 0x11), X86_MATCH_VFM(INTEL_RAPTORLAKE, 0x118), X86_MATCH_VFM(INTEL_RAPTORLAKE_P, 0x4117), X86_MATCH_VFM(INTEL_RAPTORLAKE_S, 0x2e), {} }; static void setup_pcid(void) { const struct x86_cpu_id *invlpg_miss_match; if (!IS_ENABLED(CONFIG_X86_64)) return; if (!boot_cpu_has(X86_FEATURE_PCID)) return; invlpg_miss_match = x86_match_cpu(invlpg_miss_ids); if (invlpg_miss_match && boot_cpu_data.microcode < invlpg_miss_match->driver_data) { pr_info("Incomplete global flushes, disabling PCID"); setup_clear_cpu_cap(X86_FEATURE_PCID); return; } if (boot_cpu_has(X86_FEATURE_PGE)) { /* * This can't be cr4_set_bits_and_update_boot() -- the * trampoline code can't handle CR4.PCIDE and it wouldn't * do any good anyway. Despite the name, * cr4_set_bits_and_update_boot() doesn't actually cause * the bits in question to remain set all the way through * the secondary boot asm. * * Instead, we brute-force it and set CR4.PCIDE manually in * start_secondary(). 
*/ cr4_set_bits(X86_CR4_PCIDE); } else { /* * flush_tlb_all(), as currently implemented, won't work if * PCID is on but PGE is not. Since that combination * doesn't exist on real hardware, there's no reason to try * to fully support it, but it's polite to avoid corrupting * data if we're on an improperly configured VM. */ setup_clear_cpu_cap(X86_FEATURE_PCID); } } #ifdef CONFIG_X86_32 #define NR_RANGE_MR 3 #else /* CONFIG_X86_64 */ #define NR_RANGE_MR 5 #endif static int __meminit save_mr(struct map_range *mr, int nr_range, unsigned long start_pfn, unsigned long end_pfn, unsigned long page_size_mask) { if (start_pfn < end_pfn) { if (nr_range >= NR_RANGE_MR) panic("run out of range for init_memory_mapping\n"); mr[nr_range].start = start_pfn<<PAGE_SHIFT; mr[nr_range].end = end_pfn<<PAGE_SHIFT; mr[nr_range].page_size_mask = page_size_mask; nr_range++; } return nr_range; } /* * adjust the page_size_mask for small range to go with * big page size instead small one if nearby are ram too. */ static void __ref adjust_range_page_size_mask(struct map_range *mr, int nr_range) { int i; for (i = 0; i < nr_range; i++) { if ((page_size_mask & (1<<PG_LEVEL_2M)) && !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) { unsigned long start = round_down(mr[i].start, PMD_SIZE); unsigned long end = round_up(mr[i].end, PMD_SIZE); #ifdef CONFIG_X86_32 if ((end >> PAGE_SHIFT) > max_low_pfn) continue; #endif if (memblock_is_region_memory(start, end - start)) mr[i].page_size_mask |= 1<<PG_LEVEL_2M; } if ((page_size_mask & (1<<PG_LEVEL_1G)) && !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) { unsigned long start = round_down(mr[i].start, PUD_SIZE); unsigned long end = round_up(mr[i].end, PUD_SIZE); if (memblock_is_region_memory(start, end - start)) mr[i].page_size_mask |= 1<<PG_LEVEL_1G; } } } static const char *page_size_string(struct map_range *mr) { static const char str_1g[] = "1G"; static const char str_2m[] = "2M"; static const char str_4m[] = "4M"; static const char str_4k[] = "4k"; if (mr->page_size_mask & (1<<PG_LEVEL_1G)) return str_1g; /* * 32-bit without PAE has a 4M large page size. * PG_LEVEL_2M is misnamed, but we can at least * print out the right size in the string. */ if (IS_ENABLED(CONFIG_X86_32) && !IS_ENABLED(CONFIG_X86_PAE) && mr->page_size_mask & (1<<PG_LEVEL_2M)) return str_4m; if (mr->page_size_mask & (1<<PG_LEVEL_2M)) return str_2m; return str_4k; } static int __meminit split_mem_range(struct map_range *mr, int nr_range, unsigned long start, unsigned long end) { unsigned long start_pfn, end_pfn, limit_pfn; unsigned long pfn; int i; limit_pfn = PFN_DOWN(end); /* head if not big page alignment ? */ pfn = start_pfn = PFN_DOWN(start); #ifdef CONFIG_X86_32 /* * Don't use a large page for the first 2/4MB of memory * because there are often fixed size MTRRs in there * and overlapping MTRRs into large pages can cause * slowdowns. 
*/ if (pfn == 0) end_pfn = PFN_DOWN(PMD_SIZE); else end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #else /* CONFIG_X86_64 */ end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #endif if (end_pfn > limit_pfn) end_pfn = limit_pfn; if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); pfn = end_pfn; } /* big page (2M) range */ start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #ifdef CONFIG_X86_32 end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); #else /* CONFIG_X86_64 */ end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE))) end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); #endif if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1<<PG_LEVEL_2M)); pfn = end_pfn; } #ifdef CONFIG_X86_64 /* big page (1G) range */ start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE)); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G))); pfn = end_pfn; } /* tail is not big page (1G) alignment */ start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1<<PG_LEVEL_2M)); pfn = end_pfn; } #endif /* tail is not big page (2M) alignment */ start_pfn = pfn; end_pfn = limit_pfn; nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); if (!after_bootmem) adjust_range_page_size_mask(mr, nr_range); /* try to merge same page size and continuous */ for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { unsigned long old_start; if (mr[i].end != mr[i+1].start || mr[i].page_size_mask != mr[i+1].page_size_mask) continue; /* move it */ old_start = mr[i].start; memmove(&mr[i], &mr[i+1], (nr_range - 1 - i) * sizeof(struct map_range)); mr[i--].start = old_start; nr_range--; } for (i = 0; i < nr_range; i++) pr_debug(" [mem %#010lx-%#010lx] page %s\n", mr[i].start, mr[i].end - 1, page_size_string(&mr[i])); return nr_range; } struct range pfn_mapped[E820_MAX_ENTRIES]; int nr_pfn_mapped; static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn) { nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_MAX_ENTRIES, nr_pfn_mapped, start_pfn, end_pfn); nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_MAX_ENTRIES); max_pfn_mapped = max(max_pfn_mapped, end_pfn); if (start_pfn < (1UL<<(32-PAGE_SHIFT))) max_low_pfn_mapped = max(max_low_pfn_mapped, min(end_pfn, 1UL<<(32-PAGE_SHIFT))); } bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn) { int i; for (i = 0; i < nr_pfn_mapped; i++) if ((start_pfn >= pfn_mapped[i].start) && (end_pfn <= pfn_mapped[i].end)) return true; return false; } /* * Setup the direct mapping of the physical memory at PAGE_OFFSET. * This runs before bootmem is initialized and gets pages directly from * the physical memory. To access them they are temporarily mapped. 
*/ unsigned long __ref init_memory_mapping(unsigned long start, unsigned long end, pgprot_t prot) { struct map_range mr[NR_RANGE_MR]; unsigned long ret = 0; int nr_range, i; pr_debug("init_memory_mapping: [mem %#010lx-%#010lx]\n", start, end - 1); memset(mr, 0, sizeof(mr)); nr_range = split_mem_range(mr, 0, start, end); for (i = 0; i < nr_range; i++) ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, mr[i].page_size_mask, prot); add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT); return ret >> PAGE_SHIFT; } /* * We need to iterate through the E820 memory map and create direct mappings * for only E820_TYPE_RAM and E820_KERN_RESERVED regions. We cannot simply * create direct mappings for all pfns from [0 to max_low_pfn) and * [4GB to max_pfn) because of possible memory holes in high addresses * that cannot be marked as UC by fixed/variable range MTRRs. * Depending on the alignment of E820 ranges, this may possibly result * in using smaller size (i.e. 4K instead of 2M or 1G) page tables. * * init_mem_mapping() calls init_range_memory_mapping() with big range. * That range would have hole in the middle or ends, and only ram parts * will be mapped in init_range_memory_mapping(). */ static unsigned long __init init_range_memory_mapping( unsigned long r_start, unsigned long r_end) { unsigned long start_pfn, end_pfn; unsigned long mapped_ram_size = 0; int i; for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end); u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end); if (start >= end) continue; /* * if it is overlapping with brk pgt, we need to * alloc pgt buf from memblock instead. */ can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >= min(end, (u64)pgt_buf_top<<PAGE_SHIFT); init_memory_mapping(start, end, PAGE_KERNEL); mapped_ram_size += end - start; can_use_brk_pgt = true; } return mapped_ram_size; } static unsigned long __init get_new_step_size(unsigned long step_size) { /* * Initial mapped size is PMD_SIZE (2M). * We can not set step_size to be PUD_SIZE (1G) yet. * In worse case, when we cross the 1G boundary, and * PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k) * to map 1G range with PTE. Hence we use one less than the * difference of page table level shifts. * * Don't need to worry about overflow in the top-down case, on 32bit, * when step_size is 0, round_down() returns 0 for start, and that * turns it into 0x100000000ULL. * In the bottom-up case, round_up(x, 0) returns 0 though too, which * needs to be taken into consideration by the code below. */ return step_size << (PMD_SHIFT - PAGE_SHIFT - 1); } /** * memory_map_top_down - Map [map_start, map_end) top down * @map_start: start address of the target memory range * @map_end: end address of the target memory range * * This function will setup direct mapping for memory range * [map_start, map_end) in top-down. That said, the page tables * will be allocated at the end of the memory, and we map the * memory in top-down. */ static void __init memory_map_top_down(unsigned long map_start, unsigned long map_end) { unsigned long real_end, last_start; unsigned long step_size; unsigned long addr; unsigned long mapped_ram_size = 0; /* * Systems that have many reserved areas near top of the memory, * e.g. QEMU with less than 1G RAM and EFI enabled, or Xen, will * require lots of 4K mappings which may exhaust pgt_buf. 
* Start with top-most PMD_SIZE range aligned at PMD_SIZE to ensure * there is enough mapped memory that can be allocated from * memblock. */ addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start, map_end); if (!addr) { pr_warn("Failed to release memory for alloc_low_pages()"); real_end = max(map_start, ALIGN_DOWN(map_end, PMD_SIZE)); } else { memblock_phys_free(addr, PMD_SIZE); real_end = addr + PMD_SIZE; } /* step_size need to be small so pgt_buf from BRK could cover it */ step_size = PMD_SIZE; max_pfn_mapped = 0; /* will get exact value next */ min_pfn_mapped = real_end >> PAGE_SHIFT; last_start = real_end; /* * We start from the top (end of memory) and go to the bottom. * The memblock_find_in_range() gets us a block of RAM from the * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages * for page table. */ while (last_start > map_start) { unsigned long start; if (last_start > step_size) { start = round_down(last_start - 1, step_size); if (start < map_start) start = map_start; } else start = map_start; mapped_ram_size += init_range_memory_mapping(start, last_start); last_start = start; min_pfn_mapped = last_start >> PAGE_SHIFT; if (mapped_ram_size >= step_size) step_size = get_new_step_size(step_size); } if (real_end < map_end) init_range_memory_mapping(real_end, map_end); } /** * memory_map_bottom_up - Map [map_start, map_end) bottom up * @map_start: start address of the target memory range * @map_end: end address of the target memory range * * This function will setup direct mapping for memory range * [map_start, map_end) in bottom-up. Since we have limited the * bottom-up allocation above the kernel, the page tables will * be allocated just above the kernel and we map the memory * in [map_start, map_end) in bottom-up. */ static void __init memory_map_bottom_up(unsigned long map_start, unsigned long map_end) { unsigned long next, start; unsigned long mapped_ram_size = 0; /* step_size need to be small so pgt_buf from BRK could cover it */ unsigned long step_size = PMD_SIZE; start = map_start; min_pfn_mapped = start >> PAGE_SHIFT; /* * We start from the bottom (@map_start) and go to the top (@map_end). * The memblock_find_in_range() gets us a block of RAM from the * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages * for page table. */ while (start < map_end) { if (step_size && map_end - start > step_size) { next = round_up(start + 1, step_size); if (next > map_end) next = map_end; } else { next = map_end; } mapped_ram_size += init_range_memory_mapping(start, next); start = next; if (mapped_ram_size >= step_size) step_size = get_new_step_size(step_size); } } /* * The real mode trampoline, which is required for bootstrapping CPUs * occupies only a small area under the low 1MB. See reserve_real_mode() * for details. * * If KASLR is disabled the first PGD entry of the direct mapping is copied * to map the real mode trampoline. * * If KASLR is enabled, copy only the PUD which covers the low 1MB * area. This limits the randomization granularity to 1GB for both 4-level * and 5-level paging. */ static void __init init_trampoline(void) { #ifdef CONFIG_X86_64 /* * The code below will alias kernel page-tables in the user-range of the * address space, including the Global bit. So global TLB entries will * be created when using the trampoline page-table. 
*/ if (!kaslr_memory_enabled()) trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)]; else init_trampoline_kaslr(); #endif } void __init init_mem_mapping(void) { unsigned long end; pti_check_boottime_disable(); probe_page_size_mask(); setup_pcid(); #ifdef CONFIG_X86_64 end = max_pfn << PAGE_SHIFT; #else end = max_low_pfn << PAGE_SHIFT; #endif /* the ISA range is always mapped regardless of memory holes */ init_memory_mapping(0, ISA_END_ADDRESS, PAGE_KERNEL); /* Init the trampoline, possibly with KASLR memory offset */ init_trampoline(); /* * If the allocation is in bottom-up direction, we setup direct mapping * in bottom-up, otherwise we setup direct mapping in top-down. */ if (memblock_bottom_up()) { unsigned long kernel_end = __pa_symbol(_end); /* * we need two separate calls here. This is because we want to * allocate page tables above the kernel. So we first map * [kernel_end, end) to make memory above the kernel be mapped * as soon as possible. And then use page tables allocated above * the kernel to map [ISA_END_ADDRESS, kernel_end). */ memory_map_bottom_up(kernel_end, end); memory_map_bottom_up(ISA_END_ADDRESS, kernel_end); } else { memory_map_top_down(ISA_END_ADDRESS, end); } #ifdef CONFIG_X86_64 if (max_pfn > max_low_pfn) { /* can we preserve max_low_pfn ?*/ max_low_pfn = max_pfn; } #else early_ioremap_page_table_range_init(); #endif load_cr3(swapper_pg_dir); __flush_tlb_all(); x86_init.hyper.init_mem_mapping(); early_memtest(0, max_pfn_mapped << PAGE_SHIFT); } /* * Initialize an mm_struct to be used during poking and a pointer to be used * during patching. */ void __init poking_init(void) { spinlock_t *ptl; pte_t *ptep; text_poke_mm = mm_alloc(); BUG_ON(!text_poke_mm); /* Xen PV guests need the PGD to be pinned. */ paravirt_enter_mmap(text_poke_mm); set_notrack_mm(text_poke_mm); /* * Randomize the poking address, but make sure that the following page * will be mapped at the same PMD. We need 2 pages, so find space for 3, * and adjust the address if the PMD ends after the first one. */ text_poke_mm_addr = TASK_UNMAPPED_BASE; if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) text_poke_mm_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) % (TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE); if (((text_poke_mm_addr + PAGE_SIZE) & ~PMD_MASK) == 0) text_poke_mm_addr += PAGE_SIZE; /* * We need to trigger the allocation of the page-tables that will be * needed for poking now. Later, poking may be performed in an atomic * section, which might cause allocation to fail. */ ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl); BUG_ON(!ptep); pte_unmap_unlock(ptep, ptl); } /* * devmem_is_allowed() checks to see if /dev/mem access to a certain address * is valid. The argument is a physical page number. * * On x86, access has to be given to the first megabyte of RAM because that * area traditionally contains BIOS code and data regions used by X, dosemu, * and similar apps. Since they map the entire memory range, the whole range * must be allowed (for mapping), but any areas that would otherwise be * disallowed are flagged as being "zero filled" instead of rejected. * Access has to be given to non-kernel-ram areas as well, these contain the * PCI mmio resources as well as potential bios/acpi data regions. */ int devmem_is_allowed(unsigned long pagenr) { if (region_intersects(PFN_PHYS(pagenr), PAGE_SIZE, IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE) != REGION_DISJOINT) { /* * For disallowed memory regions in the low 1MB range, * request that the page be shown as all zeros. 
*/ if (pagenr < 256) return 2; return 0; } /* * This must follow RAM test, since System RAM is considered a * restricted resource under CONFIG_STRICT_DEVMEM. */ if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) { /* Low 1MB bypasses iomem restrictions. */ if (pagenr < 256) return 1; return 0; } return 1; } void free_init_pages(const char *what, unsigned long begin, unsigned long end) { unsigned long begin_aligned, end_aligned; /* Make sure boundaries are page aligned */ begin_aligned = PAGE_ALIGN(begin); end_aligned = end & PAGE_MASK; if (WARN_ON(begin_aligned != begin || end_aligned != end)) { begin = begin_aligned; end = end_aligned; } if (begin >= end) return; /* * If debugging page accesses then do not free this memory but * mark them not present - any buggy init-section access will * create a kernel page fault: */ if (debug_pagealloc_enabled()) { pr_info("debug: unmapping init [mem %#010lx-%#010lx]\n", begin, end - 1); /* * Inform kmemleak about the hole in the memory since the * corresponding pages will be unmapped. */ kmemleak_free_part((void *)begin, end - begin); set_memory_np(begin, (end - begin) >> PAGE_SHIFT); } else { /* * We just marked the kernel text read only above, now that * we are going to free part of that, we need to make that * writeable and non-executable first. */ set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); free_reserved_area((void *)begin, (void *)end, POISON_FREE_INITMEM, what); } } /* * begin/end can be in the direct map or the "high kernel mapping" * used for the kernel image only. free_init_pages() will do the * right thing for either kind of address. */ void free_kernel_image_pages(const char *what, void *begin, void *end) { unsigned long begin_ul = (unsigned long)begin; unsigned long end_ul = (unsigned long)end; unsigned long len_pages = (end_ul - begin_ul) >> PAGE_SHIFT; free_init_pages(what, begin_ul, end_ul); /* * PTI maps some of the kernel into userspace. For performance, * this includes some kernel areas that do not contain secrets. * Those areas might be adjacent to the parts of the kernel image * being freed, which may contain secrets. Remove the "high kernel * image mapping" for these freed areas, ensuring they are not even * potentially vulnerable to Meltdown regardless of the specific * optimizations PTI is currently using. * * The "noalias" prevents unmapping the direct map alias which is * needed to access the freed pages. * * This is only valid for 64bit kernels. 32bit has only one mapping * which can't be treated in this way for obvious reasons. 
*/ if (IS_ENABLED(CONFIG_X86_64) && cpu_feature_enabled(X86_FEATURE_PTI)) set_memory_np_noalias(begin_ul, len_pages); } void __ref free_initmem(void) { e820__reallocate_tables(); mem_encrypt_free_decrypted_mem(); free_kernel_image_pages("unused kernel image (initmem)", &__init_begin, &__init_end); } #ifdef CONFIG_BLK_DEV_INITRD void __init free_initrd_mem(unsigned long start, unsigned long end) { /* * end could be not aligned, and We can not align that, * decompressor could be confused by aligned initrd_end * We already reserve the end partial page before in * - i386_start_kernel() * - x86_64_start_kernel() * - relocate_initrd() * So here We can do PAGE_ALIGN() safely to get partial page to be freed */ free_init_pages("initrd", start, PAGE_ALIGN(end)); } #endif void __init arch_zone_limits_init(unsigned long *max_zone_pfns) { #ifdef CONFIG_ZONE_DMA max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn); #endif #ifdef CONFIG_ZONE_DMA32 max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn); #endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM max_zone_pfns[ZONE_HIGHMEM] = max_pfn; #endif } __visible DEFINE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate) = { .loaded_mm = &init_mm, .next_asid = 1, .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ }; #ifdef CONFIG_ADDRESS_MASKING DEFINE_PER_CPU(u64, tlbstate_untag_mask); EXPORT_PER_CPU_SYMBOL(tlbstate_untag_mask); #endif void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) { /* entry 0 MUST be WB (hardwired to speed up translations) */ BUG_ON(!entry && cache != _PAGE_CACHE_MODE_WB); __cachemode2pte_tbl[cache] = __cm_idx2pte(entry); __pte2cachemode_tbl[entry] = cache; } #ifdef CONFIG_SWAP unsigned long arch_max_swapfile_size(void) { unsigned long pages; pages = generic_max_swapfile_size(); if (boot_cpu_has_bug(X86_BUG_L1TF) && l1tf_mitigation != L1TF_MITIGATION_OFF) { /* Limit the swap file size to MAX_PA/2 for L1TF workaround */ unsigned long long l1tf_limit = l1tf_pfn_limit(); /* * We encode swap offsets also with 3 bits below those for pfn * which makes the usable limit higher. 
*/ #if CONFIG_PGTABLE_LEVELS > 2 l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT; #endif pages = min_t(unsigned long long, l1tf_limit, pages); } return pages; } #endif #ifdef CONFIG_EXECMEM static struct execmem_info execmem_info __ro_after_init; #ifdef CONFIG_ARCH_HAS_EXECMEM_ROX void execmem_fill_trapping_insns(void *ptr, size_t size) { memset(ptr, INT3_INSN_OPCODE, size); } #endif struct execmem_info __init *execmem_arch_setup(void) { unsigned long start, offset = 0; enum execmem_range_flags flags; pgprot_t pgprot; if (kaslr_enabled()) offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE; start = MODULES_VADDR + offset; if (IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX) && cpu_feature_enabled(X86_FEATURE_PSE)) { pgprot = PAGE_KERNEL_ROX; flags = EXECMEM_KASAN_SHADOW | EXECMEM_ROX_CACHE; } else { pgprot = PAGE_KERNEL; flags = EXECMEM_KASAN_SHADOW; } execmem_info = (struct execmem_info){ .ranges = { [EXECMEM_MODULE_TEXT] = { .flags = flags, .start = start, .end = MODULES_END, .pgprot = pgprot, .alignment = MODULE_ALIGN, }, [EXECMEM_KPROBES] = { .flags = flags, .start = start, .end = MODULES_END, .pgprot = PAGE_KERNEL_ROX, .alignment = MODULE_ALIGN, }, [EXECMEM_FTRACE] = { .flags = flags, .start = start, .end = MODULES_END, .pgprot = pgprot, .alignment = MODULE_ALIGN, }, [EXECMEM_BPF] = { .flags = EXECMEM_KASAN_SHADOW, .start = start, .end = MODULES_END, .pgprot = PAGE_KERNEL, .alignment = MODULE_ALIGN, }, [EXECMEM_MODULE_DATA] = { .flags = EXECMEM_KASAN_SHADOW, .start = start, .end = MODULES_END, .pgprot = PAGE_KERNEL, .alignment = MODULE_ALIGN, }, }, }; return &execmem_info; } #endif /* CONFIG_EXECMEM */ |
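The head/tail splitting performed by split_mem_range() above is easiest to see with concrete numbers. The sketch below is a standalone illustration, not kernel code: the constants, helper names and example PFNs are assumptions, but it mirrors the same round_up/round_down arithmetic used on x86-64 and prints which part of a range would end up mapped with 4K, 2M and 1G pages.

#include <stdio.h>

#define PMD_PFNS (1UL << 9)   /* 4K PFNs per 2M mapping */
#define PUD_PFNS (1UL << 18)  /* 4K PFNs per 1G mapping */

static unsigned long rup(unsigned long x, unsigned long a) { return (x + a - 1) & ~(a - 1); }
static unsigned long rdn(unsigned long x, unsigned long a) { return x & ~(a - 1); }

static void emit(const char *what, unsigned long s, unsigned long e)
{
	if (s < e)
		printf("%-8s [pfn %#lx-%#lx)\n", what, s, e);
}

int main(void)
{
	unsigned long pfn = 0x7ff;       /* example start, not 2M aligned */
	unsigned long limit = 0x13f8ff;  /* example end, not 2M aligned */
	unsigned long end;

	/* 4K head up to the first 2M boundary */
	end = rup(pfn, PMD_PFNS);
	if (end > limit)
		end = limit;
	emit("4K head", pfn, end);
	if (pfn < end)
		pfn = end;

	/* 2M pages up to the first 1G boundary (capped at a 2M-aligned end) */
	end = rup(pfn, PUD_PFNS);
	if (end > rdn(limit, PMD_PFNS))
		end = rdn(limit, PMD_PFNS);
	emit("2M", pfn, end);
	if (pfn < end)
		pfn = end;

	/* 1G pages in the middle */
	end = rdn(limit, PUD_PFNS);
	emit("1G", pfn, end);
	if (pfn < end)
		pfn = end;

	/* 2M tail, then whatever remains as 4K */
	end = rdn(limit, PMD_PFNS);
	emit("2M tail", pfn, end);
	if (pfn < end)
		pfn = end;
	emit("4K tail", pfn, limit);
	return 0;
}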
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _MM_SWAP_H #define _MM_SWAP_H #include <linux/atomic.h> /* for atomic_long_t */ struct mempolicy; struct swap_iocb; extern int page_cluster; #ifdef CONFIG_THP_SWAP #define SWAPFILE_CLUSTER HPAGE_PMD_NR #define swap_entry_order(order) (order) #else #define SWAPFILE_CLUSTER 256 #define swap_entry_order(order) 0 #endif extern struct swap_info_struct *swap_info[]; /* * We use this to track usage of a cluster. A cluster is a block of swap disk * space SWAPFILE_CLUSTER pages long and naturally aligned on disk. All * free clusters are organized into a list. We fetch an entry from the list to * get a free cluster. * * The flags field determines if a cluster is free. This is * protected by the cluster lock. */ struct swap_cluster_info { spinlock_t lock; /* * Protect swap_cluster_info fields * other than list, and swap_info_struct->swap_map * elements corresponding to the swap cluster. */ u16 count; u8 flags; u8 order; atomic_long_t __rcu *table; /* Swap table entries, see mm/swap_table.h */ unsigned int *extend_table; /* For large swap count, protected by ci->lock */ struct list_head list; }; /* All on-list clusters must have a non-zero flag.
*/ enum swap_cluster_flags { CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */ CLUSTER_FLAG_FREE, CLUSTER_FLAG_NONFULL, CLUSTER_FLAG_FRAG, /* Clusters with flags above are allocatable */ CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG, CLUSTER_FLAG_FULL, CLUSTER_FLAG_DISCARD, CLUSTER_FLAG_MAX, }; #ifdef CONFIG_SWAP #include <linux/swapops.h> /* for swp_offset */ #include <linux/blk_types.h> /* for bio_end_io_t */ static inline unsigned int swp_cluster_offset(swp_entry_t entry) { return swp_offset(entry) % SWAPFILE_CLUSTER; } /* * Callers of all helpers below must ensure the entry, type, or offset is * valid, and protect the swap device with reference count or locks. */ static inline struct swap_info_struct *__swap_type_to_info(int type) { struct swap_info_struct *si; si = READ_ONCE(swap_info[type]); /* rcu_dereference() */ VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ return si; } static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry) { return __swap_type_to_info(swp_type(entry)); } static inline struct swap_cluster_info *__swap_offset_to_cluster( struct swap_info_struct *si, pgoff_t offset) { VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ VM_WARN_ON_ONCE(offset >= roundup(si->max, SWAPFILE_CLUSTER)); return &si->cluster_info[offset / SWAPFILE_CLUSTER]; } static inline struct swap_cluster_info *__swap_entry_to_cluster(swp_entry_t entry) { return __swap_offset_to_cluster(__swap_entry_to_info(entry), swp_offset(entry)); } static __always_inline struct swap_cluster_info *__swap_cluster_lock( struct swap_info_struct *si, unsigned long offset, bool irq) { struct swap_cluster_info *ci = __swap_offset_to_cluster(si, offset); /* * Nothing modifies swap cache in an IRQ context. All access to * swap cache is wrapped by swap_cache_* helpers, and swap cache * writeback is handled outside of IRQs. Swapin or swapout never * occurs in IRQ, and neither does in-place split or replace. * * Besides, modifying swap cache requires synchronization with * swap_map, which was never IRQ safe. */ VM_WARN_ON_ONCE(!in_task()); VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ if (irq) spin_lock_irq(&ci->lock); else spin_lock(&ci->lock); return ci; } /** * swap_cluster_lock - Lock and return the swap cluster of given offset. * @si: swap device the cluster belongs to. * @offset: the swap entry offset, pointing to a valid slot. * * Context: The caller must ensure the offset is in the valid range and * protect the swap device with reference count or locks. */ static inline struct swap_cluster_info *swap_cluster_lock( struct swap_info_struct *si, unsigned long offset) { return __swap_cluster_lock(si, offset, false); } static inline struct swap_cluster_info *__swap_cluster_get_and_lock( const struct folio *folio, bool irq) { VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); return __swap_cluster_lock(__swap_entry_to_info(folio->swap), swp_offset(folio->swap), irq); } /* * swap_cluster_get_and_lock - Locks the cluster that holds a folio's entries. * @folio: The folio. * * This locks and returns the swap cluster that contains a folio's swap * entries. The swap entries of a folio are always in one single cluster. * The folio has to be locked so its swap entries won't change and the * cluster won't be freed. * * Context: Caller must ensure the folio is locked and in the swap cache. * Return: Pointer to the swap cluster. 
*/ static inline struct swap_cluster_info *swap_cluster_get_and_lock( const struct folio *folio) { return __swap_cluster_get_and_lock(folio, false); } /* * swap_cluster_get_and_lock_irq - Locks the cluster that holds a folio's entries. * @folio: The folio. * * Same as swap_cluster_get_and_lock but also disable IRQ. * * Context: Caller must ensure the folio is locked and in the swap cache. * Return: Pointer to the swap cluster. */ static inline struct swap_cluster_info *swap_cluster_get_and_lock_irq( const struct folio *folio) { return __swap_cluster_get_and_lock(folio, true); } static inline void swap_cluster_unlock(struct swap_cluster_info *ci) { spin_unlock(&ci->lock); } static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci) { spin_unlock_irq(&ci->lock); } extern int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp); /* * Below are the core routines for doing swap for a folio. * All helpers requires the folio to be locked, and a locked folio * in the swap cache pins the swap entries / slots allocated to the * folio, swap relies heavily on the swap cache and folio lock for * synchronization. * * folio_alloc_swap(): the entry point for a folio to be swapped * out. It allocates swap slots and pins the slots with swap cache. * The slots start with a swap count of zero. The slots are pinned * by swap cache reference which doesn't contribute to swap count. * * folio_dup_swap(): increases the swap count of a folio, usually * during it gets unmapped and a swap entry is installed to replace * it (e.g., swap entry in page table). A swap slot with swap * count == 0 can only be increased by this helper. * * folio_put_swap(): does the opposite thing of folio_dup_swap(). */ int folio_alloc_swap(struct folio *folio); int folio_dup_swap(struct folio *folio, struct page *subpage); void folio_put_swap(struct folio *folio, struct page *subpage); /* For internal use */ extern void __swap_cluster_free_entries(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned int ci_off, unsigned int nr_pages); /* linux/mm/page_io.c */ int sio_pool_init(void); struct swap_iocb; void swap_read_folio(struct folio *folio, struct swap_iocb **plug); void __swap_read_unplug(struct swap_iocb *plug); static inline void swap_read_unplug(struct swap_iocb *plug) { if (unlikely(plug)) __swap_read_unplug(plug); } void swap_write_unplug(struct swap_iocb *sio); int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug); void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug); /* linux/mm/swap_state.c */ extern struct address_space swap_space __read_mostly; static inline struct address_space *swap_address_space(swp_entry_t entry) { return &swap_space; } /* * Return the swap device position of the swap entry. */ static inline loff_t swap_dev_pos(swp_entry_t entry) { return ((loff_t)swp_offset(entry)) << PAGE_SHIFT; } /** * folio_matches_swap_entry - Check if a folio matches a given swap entry. * @folio: The folio. * @entry: The swap entry to check against. * * Context: The caller should have the folio locked to ensure it's stable * and nothing will move it in or out of the swap cache. * Return: true or false. 
*/ static inline bool folio_matches_swap_entry(const struct folio *folio, swp_entry_t entry) { swp_entry_t folio_entry = folio->swap; long nr_pages = folio_nr_pages(folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); if (!folio_test_swapcache(folio)) return false; VM_WARN_ON_ONCE_FOLIO(!IS_ALIGNED(folio_entry.val, nr_pages), folio); return folio_entry.val == round_down(entry.val, nr_pages); } /* * All swap cache helpers below require the caller to ensure the swap entries * used are valid and stabilize the device by any of the following ways: * - Hold a reference by get_swap_device(): this ensures a single entry is * valid and increases the swap device's refcount. * - Locking a folio in the swap cache: this ensures the folio's swap entries * are valid and pinned, also implies reference to the device. * - Locking anything referencing the swap entry: e.g. PTL that protects * swap entries in the page table, similar to locking swap cache folio. * - See the comment of get_swap_device() for more complex usage. */ bool swap_cache_has_folio(swp_entry_t entry); struct folio *swap_cache_get_folio(swp_entry_t entry); void *swap_cache_get_shadow(swp_entry_t entry); void swap_cache_del_folio(struct folio *folio); struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_flags, struct mempolicy *mpol, pgoff_t ilx, bool *alloced); /* Below helpers require the caller to lock and pass in the swap cluster. */ void __swap_cache_add_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry); void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry, void *shadow); void __swap_cache_replace_folio(struct swap_cluster_info *ci, struct folio *old, struct folio *new); void show_swap_cache_info(void); void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr); struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, struct swap_iocb **plug); struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, struct mempolicy *mpol, pgoff_t ilx); struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag, struct vm_fault *vmf); struct folio *swapin_folio(swp_entry_t entry, struct folio *folio); void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, unsigned long addr); static inline unsigned int folio_swap_flags(struct folio *folio) { return __swap_entry_to_info(folio->swap)->flags; } /* * Return the count of contiguous swap entries that share the same * zeromap status as the starting entry. If is_zeromap is not NULL, * it will return the zeromap status of the starting entry. */ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, bool *is_zeromap) { struct swap_info_struct *sis = __swap_entry_to_info(entry); unsigned long start = swp_offset(entry); unsigned long end = start + max_nr; bool first_bit; first_bit = test_bit(start, sis->zeromap); if (is_zeromap) *is_zeromap = first_bit; if (max_nr <= 1) return max_nr; if (first_bit) return find_next_zero_bit(sis->zeromap, end, start) - start; else return find_next_bit(sis->zeromap, end, start) - start; } static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) { int i; /* * While allocating a large folio and doing mTHP swapin, we need to * ensure all entries are not cached, otherwise, the mTHP folio will * be in conflict with the folio in swap cache. 
*/ for (i = 0; i < max_nr; i++) { if (swap_cache_has_folio(entry)) return i; entry.val++; } return i; } #else /* CONFIG_SWAP */ struct swap_iocb; static inline struct swap_cluster_info *swap_cluster_lock( struct swap_info_struct *si, pgoff_t offset, bool irq) { return NULL; } static inline struct swap_cluster_info *swap_cluster_get_and_lock( struct folio *folio) { return NULL; } static inline struct swap_cluster_info *swap_cluster_get_and_lock_irq( struct folio *folio) { return NULL; } static inline void swap_cluster_unlock(struct swap_cluster_info *ci) { } static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci) { } static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry) { return NULL; } static inline int folio_alloc_swap(struct folio *folio) { return -EINVAL; } static inline int folio_dup_swap(struct folio *folio, struct page *page) { return -EINVAL; } static inline void folio_put_swap(struct folio *folio, struct page *page) { } static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) { } static inline void swap_write_unplug(struct swap_iocb *sio) { } static inline struct address_space *swap_address_space(swp_entry_t entry) { return NULL; } static inline bool folio_matches_swap_entry(const struct folio *folio, swp_entry_t entry) { return false; } static inline void show_swap_cache_info(void) { } static inline struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx) { return NULL; } static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, struct vm_fault *vmf) { return NULL; } static inline struct folio *swapin_folio(swp_entry_t entry, struct folio *folio) { return NULL; } static inline void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { } static inline int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug) { return 0; } static inline int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp) { return -EINVAL; } static inline bool swap_cache_has_folio(swp_entry_t entry) { return false; } static inline struct folio *swap_cache_get_folio(swp_entry_t entry) { return NULL; } static inline void *swap_cache_get_shadow(swp_entry_t entry) { return NULL; } static inline void swap_cache_del_folio(struct folio *folio) { } static inline void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry, void *shadow) { } static inline void __swap_cache_replace_folio(struct swap_cluster_info *ci, struct folio *old, struct folio *new) { } static inline unsigned int folio_swap_flags(struct folio *folio) { return 0; } static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, bool *has_zeromap) { return 0; } static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) { return 0; } #endif /* CONFIG_SWAP */ #endif /* _MM_SWAP_H */ |
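A minimal sketch of how a caller inside mm/ might honor the locking contract documented above: the folio lock keeps folio->swap stable, and swap_cluster_get_and_lock() pins the cluster while it is examined. The function name and its body are assumptions made up for this example; only the helpers and fields declared in this header are real.

#include <linux/mm.h>
#include <linux/swap.h>
#include "swap.h"

/* Return the order of the cluster backing a locked swap-cache folio. */
static int example_cluster_order(struct folio *folio)
{
	struct swap_cluster_info *ci;
	int order;

	/* Caller contract: folio must be locked and in the swap cache. */
	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
	if (!folio_test_swapcache(folio))
		return -EINVAL;

	ci = swap_cluster_get_and_lock(folio);
	order = ci->order;
	swap_cluster_unlock(ci);

	return order;
}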
/* SPDX-License-Identifier: GPL-2.0-only */ /* include/net/xdp.h * * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. */ #ifndef __LINUX_NET_XDP_H__ #define __LINUX_NET_XDP_H__ #include <linux/bitfield.h> #include <linux/filter.h> #include <linux/netdevice.h> #include <linux/skbuff.h> /* skb_shared_info */ #include <net/page_pool/types.h> /** * DOC: XDP RX-queue information * * The XDP RX-queue info (xdp_rxq_info) is associated with the driver * level RX-ring queues. It is information that is specific to how * the driver has configured a given RX-ring queue. * * Each xdp_buff frame received in the driver carries a (pointer) * reference to this xdp_rxq_info structure. This provides the XDP * data-path read-access to RX-info for both kernel and bpf-side * (limited subset).
* * For now, direct access is only safe while running in NAPI/softirq * context. Contents are read-mostly and must not be updated during * driver NAPI/softirq poll. * * The driver usage API is a register and unregister API. * * The struct is not directly tied to the XDP prog. A new XDP prog * can be attached as long as it doesn't change the underlying * RX-ring. If the RX-ring does change significantly, the NIC driver * naturally needs to stop the RX-ring before purging and reallocating * memory. In that process the driver MUST call unregister (which * also applies for driver shutdown and unload). The register API is * also mandatory during RX-ring setup. */ enum xdp_mem_type { MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ MEM_TYPE_PAGE_POOL, MEM_TYPE_XSK_BUFF_POOL, MEM_TYPE_MAX, }; /* XDP flags for ndo_xdp_xmit */ #define XDP_XMIT_FLUSH (1U << 0) /* doorbell signal consumer */ #define XDP_XMIT_FLAGS_MASK XDP_XMIT_FLUSH struct xdp_mem_info { u32 type; /* enum xdp_mem_type, but known size type */ u32 id; }; struct page_pool; struct xdp_rxq_info { struct net_device *dev; u32 queue_index; u32 reg_state; struct xdp_mem_info mem; u32 frag_size; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ struct xdp_txq_info { struct net_device *dev; }; enum xdp_buff_flags { XDP_FLAGS_HAS_FRAGS = BIT(0), /* non-linear xdp buff */ XDP_FLAGS_FRAGS_PF_MEMALLOC = BIT(1), /* xdp paged memory is under * pressure */ /* frags have unreadable mem, this can't be true for real XDP packets, * but drivers may use XDP helpers to construct Rx pkt state even when * XDP program is not attached. */ XDP_FLAGS_FRAGS_UNREADABLE = BIT(2), }; struct xdp_buff { void *data; void *data_end; void *data_meta; void *data_hard_start; struct xdp_rxq_info *rxq; struct xdp_txq_info *txq; union { struct { /* frame size to deduce data_hard_end/tailroom */ u32 frame_sz; /* supported values defined in xdp_buff_flags */ u32 flags; }; #ifdef __LITTLE_ENDIAN /* Used to micro-optimize xdp_init_buff(), don't use directly */ u64 frame_sz_flags_init; #endif }; }; static __always_inline bool xdp_buff_has_frags(const struct xdp_buff *xdp) { return !!(xdp->flags & XDP_FLAGS_HAS_FRAGS); } static __always_inline void xdp_buff_set_frags_flag(struct xdp_buff *xdp) { xdp->flags |= XDP_FLAGS_HAS_FRAGS; } static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp) { xdp->flags &= ~XDP_FLAGS_HAS_FRAGS; } static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) { xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; } static __always_inline void xdp_buff_set_frag_unreadable(struct xdp_buff *xdp) { xdp->flags |= XDP_FLAGS_FRAGS_UNREADABLE; } static __always_inline u32 xdp_buff_get_skb_flags(const struct xdp_buff *xdp) { return xdp->flags; } static __always_inline void xdp_buff_clear_frag_pfmemalloc(struct xdp_buff *xdp) { xdp->flags &= ~XDP_FLAGS_FRAGS_PF_MEMALLOC; } static __always_inline void xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) { xdp->rxq = rxq; #ifdef __LITTLE_ENDIAN /* * Force the compilers to initialize ::flags and assign ::frame_sz with * one write on 64-bit LE architectures as they're often unable to do * it themselves. 
*/ xdp->frame_sz_flags_init = frame_sz; #else xdp->frame_sz = frame_sz; xdp->flags = 0; #endif } static __always_inline void xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, int headroom, int data_len, const bool meta_valid) { unsigned char *data = hard_start + headroom; xdp->data_hard_start = hard_start; xdp->data = data; xdp->data_end = data + data_len; xdp->data_meta = meta_valid ? data : data + 1; } /* Reserve memory area at end-of data area. * * This macro reserves tailroom in the XDP buffer by limiting the * XDP/BPF data access to data_hard_end. Notice same area (and size) * is used for XDP_PASS, when constructing the SKB via build_skb(). */ #define xdp_data_hard_end(xdp) \ ((xdp)->data_hard_start + (xdp)->frame_sz - \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) static inline struct skb_shared_info * xdp_get_shared_info_from_buff(const struct xdp_buff *xdp) { return (struct skb_shared_info *)xdp_data_hard_end(xdp); } static __always_inline unsigned int xdp_get_buff_len(const struct xdp_buff *xdp) { unsigned int len = xdp->data_end - xdp->data; const struct skb_shared_info *sinfo; if (likely(!xdp_buff_has_frags(xdp))) goto out; sinfo = xdp_get_shared_info_from_buff(xdp); len += sinfo->xdp_frags_size; out: return len; } void xdp_return_frag(netmem_ref netmem, const struct xdp_buff *xdp); /** * __xdp_buff_add_frag - attach frag to &xdp_buff * @xdp: XDP buffer to attach the frag to * @netmem: network memory containing the frag * @offset: offset at which the frag starts * @size: size of the frag * @truesize: total memory size occupied by the frag * @try_coalesce: whether to try coalescing the frags (not valid for XSk) * * Attach frag to the XDP buffer. If it currently has no frags attached, * initialize the related fields, otherwise check that the frag number * didn't reach the limit of ``MAX_SKB_FRAGS``. If possible, try coalescing * the frag with the previous one. * The function doesn't check/update the pfmemalloc bit. Please use the * non-underscored wrapper in drivers. * * Return: true on success, false if there's no space for the frag in * the shared info struct. */ static inline bool __xdp_buff_add_frag(struct xdp_buff *xdp, netmem_ref netmem, u32 offset, u32 size, u32 truesize, bool try_coalesce) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); skb_frag_t *prev; u32 nr_frags; if (!xdp_buff_has_frags(xdp)) { xdp_buff_set_frags_flag(xdp); nr_frags = 0; sinfo->xdp_frags_size = 0; sinfo->xdp_frags_truesize = 0; goto fill; } nr_frags = sinfo->nr_frags; prev = &sinfo->frags[nr_frags - 1]; if (try_coalesce && netmem == skb_frag_netmem(prev) && offset == skb_frag_off(prev) + skb_frag_size(prev)) { skb_frag_size_add(prev, size); /* Guaranteed to only decrement the refcount */ xdp_return_frag(netmem, xdp); } else if (unlikely(nr_frags == MAX_SKB_FRAGS)) { return false; } else { fill: __skb_fill_netmem_desc_noacc(sinfo, nr_frags++, netmem, offset, size); } sinfo->nr_frags = nr_frags; sinfo->xdp_frags_size += size; sinfo->xdp_frags_truesize += truesize; return true; } /** * xdp_buff_add_frag - attach frag to &xdp_buff * @xdp: XDP buffer to attach the frag to * @netmem: network memory containing the frag * @offset: offset at which the frag starts * @size: size of the frag * @truesize: total memory size occupied by the frag * * Version of __xdp_buff_add_frag() which takes care of the pfmemalloc bit. * * Return: true on success, false if there's no space for the frag in * the shared info struct. 
*/ static inline bool xdp_buff_add_frag(struct xdp_buff *xdp, netmem_ref netmem, u32 offset, u32 size, u32 truesize) { if (!__xdp_buff_add_frag(xdp, netmem, offset, size, truesize, true)) return false; if (unlikely(netmem_is_pfmemalloc(netmem))) xdp_buff_set_frag_pfmemalloc(xdp); if (unlikely(netmem_is_net_iov(netmem))) xdp_buff_set_frag_unreadable(xdp); return true; } struct xdp_frame { void *data; u32 len; u32 headroom; u32 metasize; /* uses lower 8-bits */ /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, * while mem_type is valid on remote CPU. */ enum xdp_mem_type mem_type:32; struct net_device *dev_rx; /* used by cpumap */ u32 frame_sz; u32 flags; /* supported values defined in xdp_buff_flags */ }; static __always_inline bool xdp_frame_has_frags(const struct xdp_frame *frame) { return !!(frame->flags & XDP_FLAGS_HAS_FRAGS); } static __always_inline u32 xdp_frame_get_skb_flags(const struct xdp_frame *frame) { return frame->flags; } #define XDP_BULK_QUEUE_SIZE 16 struct xdp_frame_bulk { int count; netmem_ref q[XDP_BULK_QUEUE_SIZE]; }; static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq) { bq->count = 0; } static inline struct skb_shared_info * xdp_get_shared_info_from_frame(const struct xdp_frame *frame) { void *data_hard_start = frame->data - frame->headroom - sizeof(*frame); return (struct skb_shared_info *)(data_hard_start + frame->frame_sz - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); } struct xdp_cpumap_stats { unsigned int redirect; unsigned int pass; unsigned int drop; }; /* Clear kernel pointers in xdp_frame */ static inline void xdp_scrub_frame(struct xdp_frame *frame) { frame->data = NULL; frame->dev_rx = NULL; } static inline void xdp_update_skb_frags_info(struct sk_buff *skb, u8 nr_frags, unsigned int size, unsigned int truesize, u32 xdp_flags) { struct skb_shared_info *sinfo = skb_shinfo(skb); sinfo->nr_frags = nr_frags; /* * ``destructor_arg`` is unionized with ``xdp_frags_{,true}size``, * reset it after that these fields aren't used anymore. */ sinfo->destructor_arg = NULL; skb->len += size; skb->data_len += size; skb->truesize += truesize; skb->pfmemalloc |= !!(xdp_flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); skb->unreadable |= !!(xdp_flags & XDP_FLAGS_FRAGS_UNREADABLE); } /* Avoids inlining WARN macro in fast-path */ void xdp_warn(const char *msg, const char *func, const int line); #define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__) struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp); struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp); struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *skb, struct net_device *dev); struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct net_device *dev); struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf); static inline void xdp_convert_frame_to_buff(const struct xdp_frame *frame, struct xdp_buff *xdp) { xdp->data_hard_start = frame->data - frame->headroom - sizeof(*frame); xdp->data = frame->data; xdp->data_end = frame->data + frame->len; xdp->data_meta = frame->data - frame->metasize; xdp->frame_sz = frame->frame_sz; xdp->flags = frame->flags; } static inline int xdp_update_frame_from_buff(const struct xdp_buff *xdp, struct xdp_frame *xdp_frame) { int metasize, headroom; /* Assure headroom is available for storing info */ headroom = xdp->data - xdp->data_hard_start; metasize = xdp->data - xdp->data_meta; metasize = metasize > 0 ? 
metasize : 0; if (unlikely((headroom - metasize) < sizeof(*xdp_frame))) return -ENOSPC; /* Catch if driver didn't reserve tailroom for skb_shared_info */ if (unlikely(xdp->data_end > xdp_data_hard_end(xdp))) { XDP_WARN("Driver BUG: missing reserved tailroom"); return -ENOSPC; } xdp_frame->data = xdp->data; xdp_frame->len = xdp->data_end - xdp->data; xdp_frame->headroom = headroom - sizeof(*xdp_frame); xdp_frame->metasize = metasize; xdp_frame->frame_sz = xdp->frame_sz; xdp_frame->flags = xdp->flags; return 0; } /* Convert xdp_buff to xdp_frame */ static inline struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp) { struct xdp_frame *xdp_frame; if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) return xdp_convert_zc_to_xdp_frame(xdp); /* Store info in top of packet */ xdp_frame = xdp->data_hard_start; if (unlikely(xdp_update_frame_from_buff(xdp, xdp_frame) < 0)) return NULL; /* rxq only valid until napi_schedule ends, convert to xdp_mem_type */ xdp_frame->mem_type = xdp->rxq->mem.type; return xdp_frame; } void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type, bool napi_direct, struct xdp_buff *xdp); void xdp_return_frame(struct xdp_frame *xdpf); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); void xdp_return_frame_bulk(struct xdp_frame *xdpf, struct xdp_frame_bulk *bq); static inline void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq) { if (unlikely(!bq->count)) return; page_pool_put_netmem_bulk(bq->q, bq->count); bq->count = 0; } static __always_inline unsigned int xdp_get_frame_len(const struct xdp_frame *xdpf) { const struct skb_shared_info *sinfo; unsigned int len = xdpf->len; if (likely(!xdp_frame_has_frags(xdpf))) goto out; sinfo = xdp_get_shared_info_from_frame(xdpf); len += sinfo->xdp_frags_size; out: return len; } int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index, unsigned int napi_id, u32 frag_size); static inline int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index, unsigned int napi_id) { return __xdp_rxq_info_reg(xdp_rxq, dev, queue_index, napi_id, 0); } void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, enum xdp_mem_type type, void *allocator); void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq); int xdp_reg_mem_model(struct xdp_mem_info *mem, enum xdp_mem_type type, void *allocator); void xdp_unreg_mem_model(struct xdp_mem_info *mem); int xdp_reg_page_pool(struct page_pool *pool); void xdp_unreg_page_pool(const struct page_pool *pool); void xdp_rxq_info_attach_page_pool(struct xdp_rxq_info *xdp_rxq, const struct page_pool *pool); /** * xdp_rxq_info_attach_mem_model - attach registered mem info to RxQ info * @xdp_rxq: XDP RxQ info to attach the memory info to * @mem: already registered memory info * * If the driver registers its memory providers manually, it must use this * function instead of xdp_rxq_info_reg_mem_model(). 
*/ static inline void xdp_rxq_info_attach_mem_model(struct xdp_rxq_info *xdp_rxq, const struct xdp_mem_info *mem) { xdp_rxq->mem = *mem; } /** * xdp_rxq_info_detach_mem_model - detach registered mem info from RxQ info * @xdp_rxq: XDP RxQ info to detach the memory info from * * If the driver registers its memory providers manually and then attaches it * via xdp_rxq_info_attach_mem_model(), it must call this function before * xdp_rxq_info_unreg(). */ static inline void xdp_rxq_info_detach_mem_model(struct xdp_rxq_info *xdp_rxq) { xdp_rxq->mem = (struct xdp_mem_info){ }; } /* Drivers not supporting XDP metadata can use this helper, which * rejects any room expansion for metadata as a result. */ static __always_inline void xdp_set_data_meta_invalid(struct xdp_buff *xdp) { xdp->data_meta = xdp->data + 1; } static __always_inline bool xdp_data_meta_unsupported(const struct xdp_buff *xdp) { return unlikely(xdp->data_meta > xdp->data); } static inline bool xdp_metalen_invalid(unsigned long metalen) { unsigned long meta_max; meta_max = type_max(typeof_member(struct skb_shared_info, meta_len)); BUILD_BUG_ON(!__builtin_constant_p(meta_max)); return !IS_ALIGNED(metalen, sizeof(u32)) || metalen > meta_max; } struct xdp_attachment_info { struct bpf_prog *prog; u32 flags; }; struct netdev_bpf; void xdp_attachment_setup(struct xdp_attachment_info *info, struct netdev_bpf *bpf); #define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE /* Define the relationship between xdp-rx-metadata kfunc and * various other entities: * - xdp_rx_metadata enum * - netdev netlink enum (Documentation/netlink/specs/netdev.yaml) * - kfunc name * - xdp_metadata_ops field */ #define XDP_METADATA_KFUNC_xxx \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \ NETDEV_XDP_RX_METADATA_TIMESTAMP, \ bpf_xdp_metadata_rx_timestamp, \ xmo_rx_timestamp) \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_HASH, \ NETDEV_XDP_RX_METADATA_HASH, \ bpf_xdp_metadata_rx_hash, \ xmo_rx_hash) \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_VLAN_TAG, \ NETDEV_XDP_RX_METADATA_VLAN_TAG, \ bpf_xdp_metadata_rx_vlan_tag, \ xmo_rx_vlan_tag) \ enum xdp_rx_metadata { #define XDP_METADATA_KFUNC(name, _, __, ___) name, XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC MAX_XDP_METADATA_KFUNC, }; enum xdp_rss_hash_type { /* First part: Individual bits for L3/L4 types */ XDP_RSS_L3_IPV4 = BIT(0), XDP_RSS_L3_IPV6 = BIT(1), /* The fixed (L3) IPv4 and IPv6 headers can both be followed by * variable/dynamic headers, IPv4 called Options and IPv6 called * Extension Headers. HW RSS type can contain this info. */ XDP_RSS_L3_DYNHDR = BIT(2), /* When RSS hash covers L4 then drivers MUST set XDP_RSS_L4 bit in * addition to the protocol specific bit. This ease interaction with * SKBs and avoids reserving a fixed mask for future L4 protocol bits. 
*/ XDP_RSS_L4 = BIT(3), /* L4 based hash, proto can be unknown */ XDP_RSS_L4_TCP = BIT(4), XDP_RSS_L4_UDP = BIT(5), XDP_RSS_L4_SCTP = BIT(6), XDP_RSS_L4_IPSEC = BIT(7), /* L4 based hash include IPSEC SPI */ XDP_RSS_L4_ICMP = BIT(8), /* Second part: RSS hash type combinations used for driver HW mapping */ XDP_RSS_TYPE_NONE = 0, XDP_RSS_TYPE_L2 = XDP_RSS_TYPE_NONE, XDP_RSS_TYPE_L3_IPV4 = XDP_RSS_L3_IPV4, XDP_RSS_TYPE_L3_IPV6 = XDP_RSS_L3_IPV6, XDP_RSS_TYPE_L3_IPV4_OPT = XDP_RSS_L3_IPV4 | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L3_IPV6_EX = XDP_RSS_L3_IPV6 | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_ANY = XDP_RSS_L4, XDP_RSS_TYPE_L4_IPV4_TCP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_TCP, XDP_RSS_TYPE_L4_IPV4_UDP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV4_SCTP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, XDP_RSS_TYPE_L4_IPV4_IPSEC = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, XDP_RSS_TYPE_L4_IPV4_ICMP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_ICMP, XDP_RSS_TYPE_L4_IPV6_TCP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_TCP, XDP_RSS_TYPE_L4_IPV6_UDP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV6_SCTP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, XDP_RSS_TYPE_L4_IPV6_IPSEC = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, XDP_RSS_TYPE_L4_IPV6_ICMP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_ICMP, XDP_RSS_TYPE_L4_IPV6_TCP_EX = XDP_RSS_TYPE_L4_IPV6_TCP | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_IPV6_UDP_EX = XDP_RSS_TYPE_L4_IPV6_UDP | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_IPV6_SCTP_EX = XDP_RSS_TYPE_L4_IPV6_SCTP | XDP_RSS_L3_DYNHDR, }; struct xdp_metadata_ops { int (*xmo_rx_timestamp)(const struct xdp_md *ctx, u64 *timestamp); int (*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash, enum xdp_rss_hash_type *rss_type); int (*xmo_rx_vlan_tag)(const struct xdp_md *ctx, __be16 *vlan_proto, u16 *vlan_tci); }; #ifdef CONFIG_NET u32 bpf_xdp_metadata_kfunc_id(int id); bool bpf_dev_bound_kfunc_id(u32 btf_id); void xdp_set_features_flag(struct net_device *dev, xdp_features_t val); void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val); void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg); void xdp_features_set_redirect_target_locked(struct net_device *dev, bool support_sg); void xdp_features_clear_redirect_target(struct net_device *dev); void xdp_features_clear_redirect_target_locked(struct net_device *dev); #else static inline u32 bpf_xdp_metadata_kfunc_id(int id) { return 0; } static inline bool bpf_dev_bound_kfunc_id(u32 btf_id) { return false; } static inline void xdp_set_features_flag(struct net_device *dev, xdp_features_t val) { } static inline void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) { } static inline void xdp_features_clear_redirect_target(struct net_device *dev) { } #endif static inline void xdp_clear_features_flag(struct net_device *dev) { xdp_set_features_flag(dev, 0); } static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, struct xdp_buff *xdp) { /* Driver XDP hooks are invoked within a single NAPI poll cycle and thus * under local_bh_disable(), which provides the needed RCU protection * for accessing map entries. */ u32 act = __bpf_prog_run(prog, xdp, BPF_DISPATCHER_FUNC(xdp)); if (static_branch_unlikely(&bpf_master_redirect_enabled_key)) { if (act == XDP_TX && netif_is_bond_slave(xdp->rxq->dev)) act = xdp_master_redirect(xdp); } return act; } #endif /* __LINUX_NET_XDP_H__ */ |
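For orientation, here is the rough shape of a driver RX path built on the helpers above: initialize the xdp_buff once per descriptor, run the attached program, and convert the buffer into an xdp_frame when the verdict requires one. The function, buffer layout and sizes below are assumptions for the example; xdp_init_buff(), xdp_prepare_buff(), bpf_prog_run_xdp() and xdp_convert_buff_to_frame() are the API declared in this header.

#include <net/xdp.h>

static u32 example_run_xdp(struct bpf_prog *prog, struct xdp_rxq_info *rxq,
			   void *hard_start, unsigned int len)
{
	struct xdp_buff xdp;
	u32 act;

	/* frame_sz covers headroom + data + tailroom for skb_shared_info,
	 * see xdp_data_hard_end(). */
	xdp_init_buff(&xdp, PAGE_SIZE, rxq);
	xdp_prepare_buff(&xdp, hard_start, XDP_PACKET_HEADROOM, len, true);

	act = bpf_prog_run_xdp(prog, &xdp);
	if (act == XDP_TX) {
		/* Hand the frame over to the driver's own TX/queueing path. */
		struct xdp_frame *xdpf = xdp_convert_buff_to_frame(&xdp);

		if (!xdpf)
			act = XDP_DROP;
		/* ... enqueue xdpf ... */
	}
	return act;
}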
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI_LINUX_IOPRIO_H #define _UAPI_LINUX_IOPRIO_H #include <linux/stddef.h> #include <linux/types.h> /* * Gives us 8 prio classes with 13-bits of data for each class */ #define IOPRIO_CLASS_SHIFT 13 #define IOPRIO_NR_CLASSES 8 #define IOPRIO_CLASS_MASK (IOPRIO_NR_CLASSES - 1) #define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) #define IOPRIO_PRIO_CLASS(ioprio) \ (((ioprio) >> IOPRIO_CLASS_SHIFT) & IOPRIO_CLASS_MASK) #define IOPRIO_PRIO_DATA(ioprio) ((ioprio) & IOPRIO_PRIO_MASK) /* * These are the io priority classes as implemented by the BFQ and mq-deadline * schedulers. RT is the realtime class, it always gets premium service. For * ATA disks supporting NCQ IO priority, RT class IOs will be processed using * high priority NCQ commands. BE is the best-effort scheduling class, the * default for any process. IDLE is the idle scheduling class, it is only * served when no one else is using the disk. */ enum { IOPRIO_CLASS_NONE = 0, IOPRIO_CLASS_RT = 1, IOPRIO_CLASS_BE = 2, IOPRIO_CLASS_IDLE = 3, /* Special class to indicate an invalid ioprio value */ IOPRIO_CLASS_INVALID = 7, }; /* * The RT and BE priority classes both support up to 8 priority levels that * can be specified using the lower 3-bits of the priority data. */ #define IOPRIO_LEVEL_NR_BITS 3 #define IOPRIO_NR_LEVELS (1 << IOPRIO_LEVEL_NR_BITS) #define IOPRIO_LEVEL_MASK (IOPRIO_NR_LEVELS - 1) #define IOPRIO_PRIO_LEVEL(ioprio) ((ioprio) & IOPRIO_LEVEL_MASK) #define IOPRIO_BE_NR IOPRIO_NR_LEVELS /* * Possible values for the "which" argument of the ioprio_get() and * ioprio_set() system calls (see "man ioprio_set"). */ enum { IOPRIO_WHO_PROCESS = 1, IOPRIO_WHO_PGRP, IOPRIO_WHO_USER, }; /* * Fallback BE class priority level. */ #define IOPRIO_NORM 4 #define IOPRIO_BE_NORM IOPRIO_NORM /* * The 10 bits between the priority class and the priority level are used to * optionally define I/O hints for any combination of I/O priority class and * level. Depending on the kernel configuration, I/O scheduler being used and * the target I/O device being used, hints can influence how I/Os are processed * without affecting the I/O scheduling ordering defined by the I/O priority * class and level. */ #define IOPRIO_HINT_SHIFT IOPRIO_LEVEL_NR_BITS #define IOPRIO_HINT_NR_BITS 10 #define IOPRIO_NR_HINTS (1 << IOPRIO_HINT_NR_BITS) #define IOPRIO_HINT_MASK (IOPRIO_NR_HINTS - 1) #define IOPRIO_PRIO_HINT(ioprio) \ (((ioprio) >> IOPRIO_HINT_SHIFT) & IOPRIO_HINT_MASK) /* * I/O hints. */ enum { /* No hint */ IOPRIO_HINT_NONE = 0, /* * Device command duration limits: indicate to the device a desired * duration limit for the commands that will be used to process an I/O. * These will currently only be effective for SCSI and ATA devices that * support the command duration limits feature. If this feature is * enabled, then the commands issued to the device to process an I/O with * one of these hints set will have the duration limit index (dld field) * set to the value of the hint.
*/ IOPRIO_HINT_DEV_DURATION_LIMIT_1 = 1, IOPRIO_HINT_DEV_DURATION_LIMIT_2 = 2, IOPRIO_HINT_DEV_DURATION_LIMIT_3 = 3, IOPRIO_HINT_DEV_DURATION_LIMIT_4 = 4, IOPRIO_HINT_DEV_DURATION_LIMIT_5 = 5, IOPRIO_HINT_DEV_DURATION_LIMIT_6 = 6, IOPRIO_HINT_DEV_DURATION_LIMIT_7 = 7, }; #define IOPRIO_BAD_VALUE(val, max) ((val) < 0 || (val) >= (max)) /* * Return an I/O priority value based on a class, a level and a hint. */ static __always_inline __u16 ioprio_value(int prioclass, int priolevel, int priohint) { if (IOPRIO_BAD_VALUE(prioclass, IOPRIO_NR_CLASSES) || IOPRIO_BAD_VALUE(priolevel, IOPRIO_NR_LEVELS) || IOPRIO_BAD_VALUE(priohint, IOPRIO_NR_HINTS)) return IOPRIO_CLASS_INVALID << IOPRIO_CLASS_SHIFT; return (prioclass << IOPRIO_CLASS_SHIFT) | (priohint << IOPRIO_HINT_SHIFT) | priolevel; } #define IOPRIO_PRIO_VALUE(prioclass, priolevel) \ ioprio_value(prioclass, priolevel, IOPRIO_HINT_NONE) #define IOPRIO_PRIO_VALUE_HINT(prioclass, priolevel, priohint) \ ioprio_value(prioclass, priolevel, priohint) #endif /* _UAPI_LINUX_IOPRIO_H */
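As a quick usage note, the sketch below is a hedged userspace example (not part of the header): a value built with IOPRIO_PRIO_VALUE_HINT() is what a process would hand to ioprio_set(2), and the companion macros recover the class, level and hint from it.

#include <stdio.h>
#include <linux/ioprio.h>

int main(void)
{
	/* Best-effort class, level 2, device duration-limit hint 3. */
	int ioprio = IOPRIO_PRIO_VALUE_HINT(IOPRIO_CLASS_BE, 2,
					    IOPRIO_HINT_DEV_DURATION_LIMIT_3);

	printf("class=%d level=%d hint=%d\n",
	       IOPRIO_PRIO_CLASS(ioprio),
	       IOPRIO_PRIO_LEVEL(ioprio),
	       IOPRIO_PRIO_HINT(ioprio));
	return 0;
}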
| 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2020 Christoph Hellwig. * * Support for "universal" pointers that can point to either kernel or userspace * memory. */ #ifndef _LINUX_SOCKPTR_H #define _LINUX_SOCKPTR_H #include <linux/slab.h> #include <linux/uaccess.h> typedef struct { union { void *kernel; void __user *user; }; bool is_kernel : 1; } sockptr_t; static inline bool sockptr_is_kernel(sockptr_t sockptr) { return sockptr.is_kernel; } static inline sockptr_t KERNEL_SOCKPTR(void *p) { return (sockptr_t) { .kernel = p, .is_kernel = true }; } static inline sockptr_t USER_SOCKPTR(void __user *p) { return (sockptr_t) { .user = p }; } static inline bool sockptr_is_null(sockptr_t sockptr) { if (sockptr_is_kernel(sockptr)) return !sockptr.kernel; return !sockptr.user; } static inline int copy_from_sockptr_offset(void *dst, sockptr_t src, size_t offset, size_t size) { if (!sockptr_is_kernel(src)) return copy_from_user(dst, src.user + offset, size); memcpy(dst, src.kernel + offset, size); return 0; } /* Deprecated. * This is unsafe, unless caller checked user provided optlen. * Prefer copy_safe_from_sockptr() instead. * * Returns 0 for success, or number of bytes not copied on error. */ static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size) { return copy_from_sockptr_offset(dst, src, 0, size); } /** * copy_safe_from_sockptr: copy a struct from sockptr * @dst: Destination address, in kernel space. This buffer must be @ksize * bytes long. * @ksize: Size of @dst struct. * @optval: Source address. (in user or kernel space) * @optlen: Size of @optval data. * * Returns: * * -EINVAL: @optlen < @ksize * * -EFAULT: access to userspace failed. 
* * 0 : @ksize bytes were copied */ static inline int copy_safe_from_sockptr(void *dst, size_t ksize, sockptr_t optval, unsigned int optlen) { if (optlen < ksize) return -EINVAL; if (copy_from_sockptr(dst, optval, ksize)) return -EFAULT; return 0; } static inline int copy_struct_from_sockptr(void *dst, size_t ksize, sockptr_t src, size_t usize) { size_t size = min(ksize, usize); size_t rest = max(ksize, usize) - size; if (!sockptr_is_kernel(src)) return copy_struct_from_user(dst, ksize, src.user, size); if (usize < ksize) { memset(dst + size, 0, rest); } else if (usize > ksize) { char *p = src.kernel; while (rest--) { if (*p++) return -E2BIG; } } memcpy(dst, src.kernel, size); return 0; } static inline int copy_to_sockptr_offset(sockptr_t dst, size_t offset, const void *src, size_t size) { if (!sockptr_is_kernel(dst)) return copy_to_user(dst.user + offset, src, size); memcpy(dst.kernel + offset, src, size); return 0; } static inline int copy_to_sockptr(sockptr_t dst, const void *src, size_t size) { return copy_to_sockptr_offset(dst, 0, src, size); } static inline void *memdup_sockptr_noprof(sockptr_t src, size_t len) { void *p = kmalloc_track_caller_noprof(len, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_sockptr(p, src, len)) { kfree(p); return ERR_PTR(-EFAULT); } return p; } #define memdup_sockptr(...) alloc_hooks(memdup_sockptr_noprof(__VA_ARGS__)) static inline void *memdup_sockptr_nul_noprof(sockptr_t src, size_t len) { char *p = kmalloc_track_caller_noprof(len + 1, GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_sockptr(p, src, len)) { kfree(p); return ERR_PTR(-EFAULT); } p[len] = '\0'; return p; } #define memdup_sockptr_nul(...) alloc_hooks(memdup_sockptr_nul_noprof(__VA_ARGS__)) static inline long strncpy_from_sockptr(char *dst, sockptr_t src, size_t count) { if (sockptr_is_kernel(src)) { size_t len = min(strnlen(src.kernel, count - 1) + 1, count); memcpy(dst, src.kernel, len); return len; } return strncpy_from_user(dst, src.user, count); } static inline int check_zeroed_sockptr(sockptr_t src, size_t offset, size_t size) { if (!sockptr_is_kernel(src)) return check_zeroed_user(src.user + offset, size); return memchr_inv(src.kernel + offset, 0, size) == NULL; } #endif /* _LINUX_SOCKPTR_H */ |
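copy_safe_from_sockptr() is the helper a setsockopt() handler is expected to use when pulling a fixed-size option from either kernel or user memory. A hedged sketch follows; struct foo_opt and foo_setsockopt() are invented names for illustration and are not part of the header above.

#include <linux/sockptr.h>
#include <net/sock.h>

struct foo_opt {
	__u32 mode;
	__u32 timeout_ms;
};

static int foo_setsockopt(struct sock *sk, int optname,
			  sockptr_t optval, unsigned int optlen)
{
	struct foo_opt opt;
	int err;

	/* Rejects optlen < sizeof(opt); copies from user or kernel space. */
	err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
	if (err)
		return err;

	/* ... apply opt.mode / opt.timeout_ms under lock_sock(sk) ... */
	return 0;
}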
// SPDX-License-Identifier: GPL-2.0-only /* * * Author Karsten Keil <kkeil@novell.com> * * Copyright 2008 by Karsten Keil <kkeil@novell.com> */ #include <linux/mISDNif.h> #include <linux/slab.h> #include <linux/export.h> #include "core.h" static u_int *debug; static struct proto mISDN_proto = { .name = "misdn", .owner = THIS_MODULE, .obj_size =
sizeof(struct mISDN_sock) }; #define _pms(sk) ((struct mISDN_sock *)sk) static struct mISDN_sock_list data_sockets = { .lock = __RW_LOCK_UNLOCKED(data_sockets.lock) }; static struct mISDN_sock_list base_sockets = { .lock = __RW_LOCK_UNLOCKED(base_sockets.lock) }; #define L2_HEADER_LEN 4 static inline struct sk_buff * _l2_alloc_skb(unsigned int len, gfp_t gfp_mask) { struct sk_buff *skb; skb = alloc_skb(len + L2_HEADER_LEN, gfp_mask); if (likely(skb)) skb_reserve(skb, L2_HEADER_LEN); return skb; } static void mISDN_sock_link(struct mISDN_sock_list *l, struct sock *sk) { write_lock_bh(&l->lock); sk_add_node(sk, &l->head); write_unlock_bh(&l->lock); } static void mISDN_sock_unlink(struct mISDN_sock_list *l, struct sock *sk) { write_lock_bh(&l->lock); sk_del_node_init(sk); write_unlock_bh(&l->lock); } static int mISDN_send(struct mISDNchannel *ch, struct sk_buff *skb) { struct mISDN_sock *msk; int err; msk = container_of(ch, struct mISDN_sock, ch); if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s len %d %p\n", __func__, skb->len, skb); if (msk->sk.sk_state == MISDN_CLOSED) return -EUNATCH; __net_timestamp(skb); err = sock_queue_rcv_skb(&msk->sk, skb); if (err) printk(KERN_WARNING "%s: error %d\n", __func__, err); return err; } static int mISDN_ctrl(struct mISDNchannel *ch, u_int cmd, void *arg) { struct mISDN_sock *msk; msk = container_of(ch, struct mISDN_sock, ch); if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s(%p, %x, %p)\n", __func__, ch, cmd, arg); switch (cmd) { case CLOSE_CHANNEL: msk->sk.sk_state = MISDN_CLOSED; break; } return 0; } static inline void mISDN_sock_cmsg(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { struct __kernel_old_timeval tv; if (_pms(sk)->cmask & MISDN_TIME_STAMP) { skb_get_timestamp(skb, &tv); put_cmsg(msg, SOL_MISDN, MISDN_TIME_STAMP, sizeof(tv), &tv); } } static int mISDN_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sk_buff *skb; struct sock *sk = sock->sk; int copied, err; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s: len %d, flags %x ch.nr %d, proto %x\n", __func__, (int)len, flags, _pms(sk)->ch.nr, sk->sk_protocol); if (flags & (MSG_OOB)) return -EOPNOTSUPP; if (sk->sk_state == MISDN_CLOSED) return 0; skb = skb_recv_datagram(sk, flags, &err); if (!skb) return err; if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_mISDN *, maddr, msg->msg_name); maddr->family = AF_ISDN; maddr->dev = _pms(sk)->dev->id; if ((sk->sk_protocol == ISDN_P_LAPD_TE) || (sk->sk_protocol == ISDN_P_LAPD_NT)) { maddr->channel = (mISDN_HEAD_ID(skb) >> 16) & 0xff; maddr->tei = (mISDN_HEAD_ID(skb) >> 8) & 0xff; maddr->sapi = mISDN_HEAD_ID(skb) & 0xff; } else { maddr->channel = _pms(sk)->ch.nr; maddr->sapi = _pms(sk)->ch.addr & 0xFF; maddr->tei = (_pms(sk)->ch.addr >> 8) & 0xFF; } msg->msg_namelen = sizeof(*maddr); } copied = skb->len + MISDN_HEADER_LEN; if (len < copied) { if (flags & MSG_PEEK) refcount_dec(&skb->users); else skb_queue_head(&sk->sk_receive_queue, skb); return -ENOSPC; } memcpy(skb_push(skb, MISDN_HEADER_LEN), mISDN_HEAD_P(skb), MISDN_HEADER_LEN); err = skb_copy_datagram_msg(skb, 0, msg, copied); mISDN_sock_cmsg(sk, msg, skb); skb_free_datagram(sk, skb); return err ? 
: copied; } static int mISDN_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct sk_buff *skb; int err = -ENOMEM; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s: len %d flags %x ch %d proto %x\n", __func__, (int)len, msg->msg_flags, _pms(sk)->ch.nr, sk->sk_protocol); if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_NOSIGNAL | MSG_ERRQUEUE)) return -EINVAL; if (len < MISDN_HEADER_LEN) return -EINVAL; if (sk->sk_state != MISDN_BOUND) return -EBADFD; lock_sock(sk); skb = _l2_alloc_skb(len, GFP_KERNEL); if (!skb) goto done; if (memcpy_from_msg(skb_put(skb, len), msg, len)) { err = -EFAULT; goto done; } memcpy(mISDN_HEAD_P(skb), skb->data, MISDN_HEADER_LEN); skb_pull(skb, MISDN_HEADER_LEN); if (msg->msg_namelen >= sizeof(struct sockaddr_mISDN)) { /* if we have a address, we use it */ DECLARE_SOCKADDR(struct sockaddr_mISDN *, maddr, msg->msg_name); mISDN_HEAD_ID(skb) = maddr->channel; } else { /* use default for L2 messages */ if ((sk->sk_protocol == ISDN_P_LAPD_TE) || (sk->sk_protocol == ISDN_P_LAPD_NT)) mISDN_HEAD_ID(skb) = _pms(sk)->ch.nr; } if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s: ID:%x\n", __func__, mISDN_HEAD_ID(skb)); err = -ENODEV; if (!_pms(sk)->ch.peer) goto done; err = _pms(sk)->ch.recv(_pms(sk)->ch.peer, skb); if (err) goto done; else { skb = NULL; err = len; } done: kfree_skb(skb); release_sock(sk); return err; } static int data_sock_release(struct socket *sock) { struct sock *sk = sock->sk; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s(%p) sk=%p\n", __func__, sock, sk); if (!sk) return 0; switch (sk->sk_protocol) { case ISDN_P_TE_S0: case ISDN_P_NT_S0: case ISDN_P_TE_E1: case ISDN_P_NT_E1: if (sk->sk_state == MISDN_BOUND) delete_channel(&_pms(sk)->ch); else mISDN_sock_unlink(&data_sockets, sk); break; case ISDN_P_LAPD_TE: case ISDN_P_LAPD_NT: case ISDN_P_B_RAW: case ISDN_P_B_HDLC: case ISDN_P_B_X75SLP: case ISDN_P_B_L2DTMF: case ISDN_P_B_L2DSP: case ISDN_P_B_L2DSPHDLC: delete_channel(&_pms(sk)->ch); mISDN_sock_unlink(&data_sockets, sk); break; } lock_sock(sk); sock_orphan(sk); skb_queue_purge(&sk->sk_receive_queue); release_sock(sk); sock_put(sk); return 0; } static int data_sock_ioctl_bound(struct sock *sk, unsigned int cmd, void __user *p) { struct mISDN_ctrl_req cq; int err = -EINVAL, val[2]; struct mISDNchannel *bchan, *next; lock_sock(sk); if (!_pms(sk)->dev) { err = -ENODEV; goto done; } switch (cmd) { case IMCTRLREQ: if (copy_from_user(&cq, p, sizeof(cq))) { err = -EFAULT; break; } if ((sk->sk_protocol & ~ISDN_P_B_MASK) == ISDN_P_B_START) { list_for_each_entry_safe(bchan, next, &_pms(sk)->dev->bchannels, list) { if (bchan->nr == cq.channel) { err = bchan->ctrl(bchan, CONTROL_CHANNEL, &cq); break; } } } else err = _pms(sk)->dev->D.ctrl(&_pms(sk)->dev->D, CONTROL_CHANNEL, &cq); if (err) break; if (copy_to_user(p, &cq, sizeof(cq))) err = -EFAULT; break; case IMCLEAR_L2: if (sk->sk_protocol != ISDN_P_LAPD_NT) { err = -EINVAL; break; } val[0] = cmd; if (get_user(val[1], (int __user *)p)) { err = -EFAULT; break; } err = _pms(sk)->dev->teimgr->ctrl(_pms(sk)->dev->teimgr, CONTROL_CHANNEL, val); break; case IMHOLD_L1: if (sk->sk_protocol != ISDN_P_LAPD_NT && sk->sk_protocol != ISDN_P_LAPD_TE) { err = -EINVAL; break; } val[0] = cmd; if (get_user(val[1], (int __user *)p)) { err = -EFAULT; break; } err = _pms(sk)->dev->teimgr->ctrl(_pms(sk)->dev->teimgr, CONTROL_CHANNEL, val); break; default: err = -EINVAL; break; } done: release_sock(sk); return err; } static int 
data_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { int err = 0, id; struct sock *sk = sock->sk; struct mISDNdevice *dev; struct mISDNversion ver; switch (cmd) { case IMGETVERSION: ver.major = MISDN_MAJOR_VERSION; ver.minor = MISDN_MINOR_VERSION; ver.release = MISDN_RELEASE; if (copy_to_user((void __user *)arg, &ver, sizeof(ver))) err = -EFAULT; break; case IMGETCOUNT: id = get_mdevice_count(); if (put_user(id, (int __user *)arg)) err = -EFAULT; break; case IMGETDEVINFO: if (get_user(id, (int __user *)arg)) { err = -EFAULT; break; } dev = get_mdevice(id); if (dev) { struct mISDN_devinfo di; memset(&di, 0, sizeof(di)); di.id = dev->id; di.Dprotocols = dev->Dprotocols; di.Bprotocols = dev->Bprotocols | get_all_Bprotocols(); di.protocol = dev->D.protocol; memcpy(di.channelmap, dev->channelmap, sizeof(di.channelmap)); di.nrbchan = dev->nrbchan; strscpy(di.name, dev_name(&dev->dev), sizeof(di.name)); if (copy_to_user((void __user *)arg, &di, sizeof(di))) err = -EFAULT; } else err = -ENODEV; break; default: if (sk->sk_state == MISDN_BOUND) err = data_sock_ioctl_bound(sk, cmd, (void __user *)arg); else err = -ENOTCONN; } return err; } static int data_sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int err = 0, opt = 0; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s(%p, %d, %x, optval, %d)\n", __func__, sock, level, optname, optlen); lock_sock(sk); switch (optname) { case MISDN_TIME_STAMP: err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt) _pms(sk)->cmask |= MISDN_TIME_STAMP; else _pms(sk)->cmask &= ~MISDN_TIME_STAMP; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int data_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int len, opt; if (get_user(len, optlen)) return -EFAULT; if (len != sizeof(char)) return -EINVAL; switch (optname) { case MISDN_TIME_STAMP: if (_pms(sk)->cmask & MISDN_TIME_STAMP) opt = 1; else opt = 0; if (put_user(opt, optval)) return -EFAULT; break; default: return -ENOPROTOOPT; } return 0; } static int data_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sockaddr_mISDN *maddr = (struct sockaddr_mISDN *) addr; struct sock *sk = sock->sk; struct sock *csk; int err = 0; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s(%p) sk=%p\n", __func__, sock, sk); if (addr_len != sizeof(struct sockaddr_mISDN)) return -EINVAL; if (!maddr || maddr->family != AF_ISDN) return -EINVAL; lock_sock(sk); if (_pms(sk)->dev) { err = -EALREADY; goto done; } _pms(sk)->dev = get_mdevice(maddr->dev); if (!_pms(sk)->dev) { err = -ENODEV; goto done; } if (sk->sk_protocol < ISDN_P_B_START) { read_lock_bh(&data_sockets.lock); sk_for_each(csk, &data_sockets.head) { if (sk == csk) continue; if (_pms(csk)->dev != _pms(sk)->dev) continue; if (csk->sk_protocol >= ISDN_P_B_START) continue; if (IS_ISDN_P_TE(csk->sk_protocol) == IS_ISDN_P_TE(sk->sk_protocol)) continue; read_unlock_bh(&data_sockets.lock); err = -EBUSY; goto done; } read_unlock_bh(&data_sockets.lock); } _pms(sk)->ch.send = mISDN_send; _pms(sk)->ch.ctrl = mISDN_ctrl; switch (sk->sk_protocol) { case ISDN_P_TE_S0: case ISDN_P_NT_S0: case ISDN_P_TE_E1: case ISDN_P_NT_E1: mISDN_sock_unlink(&data_sockets, sk); err = connect_layer1(_pms(sk)->dev, &_pms(sk)->ch, sk->sk_protocol, maddr); if (err) mISDN_sock_link(&data_sockets, sk); break; case ISDN_P_LAPD_TE: case 
ISDN_P_LAPD_NT: err = create_l2entity(_pms(sk)->dev, &_pms(sk)->ch, sk->sk_protocol, maddr); break; case ISDN_P_B_RAW: case ISDN_P_B_HDLC: case ISDN_P_B_X75SLP: case ISDN_P_B_L2DTMF: case ISDN_P_B_L2DSP: case ISDN_P_B_L2DSPHDLC: err = connect_Bstack(_pms(sk)->dev, &_pms(sk)->ch, sk->sk_protocol, maddr); break; default: err = -EPROTONOSUPPORT; } if (err) goto done; sk->sk_state = MISDN_BOUND; _pms(sk)->ch.protocol = sk->sk_protocol; done: release_sock(sk); return err; } static int data_sock_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct sockaddr_mISDN *maddr = (struct sockaddr_mISDN *) addr; struct sock *sk = sock->sk; if (!_pms(sk)->dev) return -EBADFD; lock_sock(sk); maddr->family = AF_ISDN; maddr->dev = _pms(sk)->dev->id; maddr->channel = _pms(sk)->ch.nr; maddr->sapi = _pms(sk)->ch.addr & 0xff; maddr->tei = (_pms(sk)->ch.addr >> 8) & 0xff; release_sock(sk); return sizeof(*maddr); } static const struct proto_ops data_sock_ops = { .family = PF_ISDN, .owner = THIS_MODULE, .release = data_sock_release, .ioctl = data_sock_ioctl, .bind = data_sock_bind, .getname = data_sock_getname, .sendmsg = mISDN_sock_sendmsg, .recvmsg = mISDN_sock_recvmsg, .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = data_sock_setsockopt, .getsockopt = data_sock_getsockopt, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .mmap = sock_no_mmap }; static int data_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; if (sock->type != SOCK_DGRAM) return -ESOCKTNOSUPPORT; sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); sock->ops = &data_sock_ops; sock->state = SS_UNCONNECTED; sock_reset_flag(sk, SOCK_ZAPPED); sk->sk_protocol = protocol; sk->sk_state = MISDN_OPEN; mISDN_sock_link(&data_sockets, sk); return 0; } static int base_sock_release(struct socket *sock) { struct sock *sk = sock->sk; printk(KERN_DEBUG "%s(%p) sk=%p\n", __func__, sock, sk); if (!sk) return 0; mISDN_sock_unlink(&base_sockets, sk); sock_orphan(sk); sock_put(sk); return 0; } static int base_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { int err = 0, id; struct mISDNdevice *dev; struct mISDNversion ver; switch (cmd) { case IMGETVERSION: ver.major = MISDN_MAJOR_VERSION; ver.minor = MISDN_MINOR_VERSION; ver.release = MISDN_RELEASE; if (copy_to_user((void __user *)arg, &ver, sizeof(ver))) err = -EFAULT; break; case IMGETCOUNT: id = get_mdevice_count(); if (put_user(id, (int __user *)arg)) err = -EFAULT; break; case IMGETDEVINFO: if (get_user(id, (int __user *)arg)) { err = -EFAULT; break; } dev = get_mdevice(id); if (dev) { struct mISDN_devinfo di; memset(&di, 0, sizeof(di)); di.id = dev->id; di.Dprotocols = dev->Dprotocols; di.Bprotocols = dev->Bprotocols | get_all_Bprotocols(); di.protocol = dev->D.protocol; memcpy(di.channelmap, dev->channelmap, sizeof(di.channelmap)); di.nrbchan = dev->nrbchan; strscpy(di.name, dev_name(&dev->dev), sizeof(di.name)); if (copy_to_user((void __user *)arg, &di, sizeof(di))) err = -EFAULT; } else err = -ENODEV; break; case IMSETDEVNAME: { struct mISDN_devrename dn; if (copy_from_user(&dn, (void __user *)arg, sizeof(dn))) { err = -EFAULT; break; } dn.name[sizeof(dn.name) - 1] = '\0'; dev = get_mdevice(dn.id); if (dev) err = device_rename(&dev->dev, dn.name); else err = -ENODEV; } break; default: err = -EINVAL; } return err; } static int base_sock_bind(struct socket *sock, struct sockaddr_unsized 
*addr, int addr_len) { struct sockaddr_mISDN *maddr = (struct sockaddr_mISDN *) addr; struct sock *sk = sock->sk; int err = 0; if (addr_len < sizeof(struct sockaddr_mISDN)) return -EINVAL; if (!maddr || maddr->family != AF_ISDN) return -EINVAL; lock_sock(sk); if (_pms(sk)->dev) { err = -EALREADY; goto done; } _pms(sk)->dev = get_mdevice(maddr->dev); if (!_pms(sk)->dev) { err = -ENODEV; goto done; } sk->sk_state = MISDN_BOUND; done: release_sock(sk); return err; } static const struct proto_ops base_sock_ops = { .family = PF_ISDN, .owner = THIS_MODULE, .release = base_sock_release, .ioctl = base_sock_ioctl, .bind = base_sock_bind, .getname = sock_no_getname, .sendmsg = sock_no_sendmsg, .recvmsg = sock_no_recvmsg, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .mmap = sock_no_mmap }; static int base_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; if (!capable(CAP_NET_RAW)) return -EPERM; sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); sock->ops = &base_sock_ops; sock->state = SS_UNCONNECTED; sock_reset_flag(sk, SOCK_ZAPPED); sk->sk_protocol = protocol; sk->sk_state = MISDN_OPEN; mISDN_sock_link(&base_sockets, sk); return 0; } static int mISDN_sock_create(struct net *net, struct socket *sock, int proto, int kern) { int err = -EPROTONOSUPPORT; switch (proto) { case ISDN_P_BASE: err = base_sock_create(net, sock, proto, kern); break; case ISDN_P_TE_S0: case ISDN_P_NT_S0: case ISDN_P_TE_E1: case ISDN_P_NT_E1: case ISDN_P_LAPD_TE: case ISDN_P_LAPD_NT: case ISDN_P_B_RAW: case ISDN_P_B_HDLC: case ISDN_P_B_X75SLP: case ISDN_P_B_L2DTMF: case ISDN_P_B_L2DSP: case ISDN_P_B_L2DSPHDLC: err = data_sock_create(net, sock, proto, kern); break; default: return err; } return err; } static const struct net_proto_family mISDN_sock_family_ops = { .owner = THIS_MODULE, .family = PF_ISDN, .create = mISDN_sock_create, }; int misdn_sock_init(u_int *deb) { int err; debug = deb; err = sock_register(&mISDN_sock_family_ops); if (err) printk(KERN_ERR "%s: error(%d)\n", __func__, err); return err; } void misdn_sock_cleanup(void) { sock_unregister(PF_ISDN); } |
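The base socket exposes the informational ioctls dispatched by base_sock_ioctl() above. A small userspace sketch, assuming the UAPI header <linux/mISDNif.h> is available and the process has CAP_NET_RAW (required by base_sock_create()), queries the stack version via IMGETVERSION.

#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>
#include <linux/mISDNif.h>

int main(void)
{
	struct mISDNversion ver;
	int sk = socket(PF_ISDN, SOCK_RAW, ISDN_P_BASE);

	if (sk < 0 || ioctl(sk, IMGETVERSION, &ver) < 0) {
		perror("mISDN");
		return 1;
	}
	printf("mISDN %d.%d.%d\n", ver.major, ver.minor, ver.release);
	close(sk);
	return 0;
}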
| 22 21 8 21 20 17 21 22 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 | // SPDX-License-Identifier: GPL-2.0-only /* * Access kernel or user memory without faulting. */ #include <linux/export.h> #include <linux/mm.h> #include <linux/uaccess.h> #include <asm/tlb.h> bool __weak copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) { return true; } /* * The below only uses kmsan_check_memory() to ensure uninitialized kernel * memory isn't leaked. */ #define copy_from_kernel_nofault_loop(dst, src, len, type, err_label) \ while (len >= sizeof(type)) { \ __get_kernel_nofault(dst, src, type, err_label); \ kmsan_check_memory(src, sizeof(type)); \ dst += sizeof(type); \ src += sizeof(type); \ len -= sizeof(type); \ } long copy_from_kernel_nofault(void *dst, const void *src, size_t size) { unsigned long align = 0; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) align = (unsigned long)dst | (unsigned long)src; if (!copy_from_kernel_nofault_allowed(src, size)) return -ERANGE; pagefault_disable(); if (!(align & 7)) copy_from_kernel_nofault_loop(dst, src, size, u64, Efault); if (!(align & 3)) copy_from_kernel_nofault_loop(dst, src, size, u32, Efault); if (!(align & 1)) copy_from_kernel_nofault_loop(dst, src, size, u16, Efault); copy_from_kernel_nofault_loop(dst, src, size, u8, Efault); pagefault_enable(); return 0; Efault: pagefault_enable(); return -EFAULT; } EXPORT_SYMBOL_GPL(copy_from_kernel_nofault); #define copy_to_kernel_nofault_loop(dst, src, len, type, err_label) \ while (len >= sizeof(type)) { \ __put_kernel_nofault(dst, src, type, err_label); \ instrument_write(dst, sizeof(type)); \ dst += sizeof(type); \ src += sizeof(type); \ len -= sizeof(type); \ } long copy_to_kernel_nofault(void *dst, const void *src, size_t size) { unsigned long align = 0; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) align = (unsigned long)dst | (unsigned long)src; pagefault_disable(); if (!(align & 7)) copy_to_kernel_nofault_loop(dst, src, size, u64, Efault); if (!(align & 3)) copy_to_kernel_nofault_loop(dst, src, size, u32, Efault); if (!(align & 1)) copy_to_kernel_nofault_loop(dst, src, size, u16, Efault); copy_to_kernel_nofault_loop(dst, src, size, u8, Efault); pagefault_enable(); return 0; Efault: pagefault_enable(); return -EFAULT; } long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count) { const void *src = unsafe_addr; if (unlikely(count <= 0)) return 0; if (!copy_from_kernel_nofault_allowed(unsafe_addr, count)) return -ERANGE; pagefault_disable(); do { __get_kernel_nofault(dst, src, u8, Efault); dst++; src++; } while (dst[-1] && src - unsafe_addr < count); pagefault_enable(); dst[-1] = '\0'; return src - unsafe_addr; Efault: pagefault_enable(); dst[0] = '\0'; return 
-EFAULT; } /** * copy_from_user_nofault(): safely attempt to read from a user-space location * @dst: pointer to the buffer that shall take the data * @src: address to read from. This must be a user address. * @size: size of the data chunk * * Safely read from user address @src to the buffer at @dst. If a kernel fault * happens, handle that and return -EFAULT. */ long copy_from_user_nofault(void *dst, const void __user *src, size_t size) { long ret = -EFAULT; if (!__access_ok(src, size)) return ret; if (!nmi_uaccess_okay()) return ret; pagefault_disable(); ret = __copy_from_user_inatomic(dst, src, size); pagefault_enable(); if (ret) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(copy_from_user_nofault); /** * copy_to_user_nofault(): safely attempt to write to a user-space location * @dst: address to write to * @src: pointer to the data that shall be written * @size: size of the data chunk * * Safely write to address @dst from the buffer at @src. If a kernel fault * happens, handle that and return -EFAULT. */ long copy_to_user_nofault(void __user *dst, const void *src, size_t size) { long ret = -EFAULT; if (access_ok(dst, size)) { pagefault_disable(); ret = __copy_to_user_inatomic(dst, src, size); pagefault_enable(); } if (ret) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(copy_to_user_nofault); /** * strncpy_from_user_nofault: - Copy a NUL terminated string from unsafe user * address. * @dst: Destination address, in kernel space. This buffer must be at * least @count bytes long. * @unsafe_addr: Unsafe user address. * @count: Maximum number of bytes to copy, including the trailing NUL. * * Copies a NUL-terminated string from unsafe user address to kernel buffer. * * On success, returns the length of the string INCLUDING the trailing NUL. * * If access fails, returns -EFAULT (some data may have been copied * and the trailing NUL added). * * If @count is smaller than the length of the string, copies @count-1 bytes, * sets the last byte of @dst buffer to NUL and returns @count. */ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, long count) { long ret; if (unlikely(count <= 0)) return 0; pagefault_disable(); ret = strncpy_from_user(dst, unsafe_addr, count); pagefault_enable(); if (ret >= count) { ret = count; dst[ret - 1] = '\0'; } else if (ret >= 0) { ret++; } return ret; } /** * strnlen_user_nofault: - Get the size of a user string INCLUDING final NUL. * @unsafe_addr: The string to measure. * @count: Maximum count (including NUL) * * Get the size of a NUL-terminated string in user space without pagefault. * * Returns the size of the string INCLUDING the terminating NUL. * * If the string is too long, returns a number larger than @count. User * has to check the return value against "> count". * On exception (or invalid count), returns 0. * * Unlike strnlen_user, this can be used from IRQ handler etc. because * it disables pagefaults. */ long strnlen_user_nofault(const void __user *unsafe_addr, long count) { int ret; pagefault_disable(); ret = strnlen_user(unsafe_addr, count); pagefault_enable(); return ret; } void __copy_overflow(int size, unsigned long count) { WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); } EXPORT_SYMBOL(__copy_overflow); |
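copy_from_kernel_nofault() lets debugging and tracing code read a possibly invalid kernel address without risking a page fault. A hedged in-kernel sketch follows; peek_kernel_u64() is an invented helper name, not part of the file above.

#include <linux/uaccess.h>

/*
 * Safely read a u64 from an address that may be bogus (for example one
 * handed in through a debugfs interface). Returns 0 on success, or a
 * negative errno (-EFAULT/-ERANGE) without ever faulting.
 */
static int peek_kernel_u64(const void *addr, u64 *out)
{
	return copy_from_kernel_nofault(out, addr, sizeof(*out));
}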
// SPDX-License-Identifier: GPL-2.0 /* * Tag allocation using scalable bitmaps. Uses active queue tracking to support * fairer distribution of tags between multiple submitters when a shared tag map * is used. * * Copyright (C) 2013-2014 Jens Axboe */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/kmemleak.h> #include <linux/delay.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-sched.h" /* * Recalculate wakeup batch when tag is shared by hctx. */ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags, unsigned int users) { if (!users) return; sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags, users); sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags, users); } /* * If a previously inactive queue goes active, bump the active user count. * We need to do this before try to allocate driver tag, then even if fail * to get tag when first time, the other shared-tag users could reserve * budget for it.
*/ void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { unsigned int users; unsigned long flags; struct blk_mq_tags *tags = hctx->tags; /* * calling test_bit() prior to test_and_set_bit() is intentional, * it avoids dirtying the cacheline if the queue is already active. */ if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) || test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return; } else { if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) || test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return; } spin_lock_irqsave(&tags->lock, flags); users = tags->active_queues + 1; WRITE_ONCE(tags->active_queues, users); blk_mq_update_wake_batch(tags, users); spin_unlock_irqrestore(&tags->lock, flags); } /* * Wakeup all potentially sleeping on tags */ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) { sbitmap_queue_wake_all(&tags->bitmap_tags); if (include_reserve) sbitmap_queue_wake_all(&tags->breserved_tags); } /* * If a previously busy queue goes inactive, potential waiters could now * be allowed to queue. Wake them up and check. */ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { struct blk_mq_tags *tags = hctx->tags; unsigned int users; if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return; } else { if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return; } spin_lock_irq(&tags->lock); users = tags->active_queues - 1; WRITE_ONCE(tags->active_queues, users); blk_mq_update_wake_batch(tags, users); spin_unlock_irq(&tags->lock); blk_mq_tag_wakeup_all(tags, false); } static int __blk_mq_get_tag(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt) { if (!data->q->elevator && !(data->flags & BLK_MQ_REQ_RESERVED) && !hctx_may_queue(data->hctx, bt)) return BLK_MQ_NO_TAG; if (data->shallow_depth) return sbitmap_queue_get_shallow(bt, data->shallow_depth); else return __sbitmap_queue_get(bt); } unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, unsigned int *offset) { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct sbitmap_queue *bt = &tags->bitmap_tags; unsigned long ret; if (data->shallow_depth ||data->flags & BLK_MQ_REQ_RESERVED || data->hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) return 0; ret = __sbitmap_queue_get_batch(bt, nr_tags, offset); *offset += tags->nr_reserved_tags; return ret; } unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct sbitmap_queue *bt; struct sbq_wait_state *ws; DEFINE_SBQ_WAIT(wait); unsigned int tag_offset; int tag; if (data->flags & BLK_MQ_REQ_RESERVED) { if (unlikely(!tags->nr_reserved_tags)) { WARN_ON_ONCE(1); return BLK_MQ_NO_TAG; } bt = &tags->breserved_tags; tag_offset = 0; } else { bt = &tags->bitmap_tags; tag_offset = tags->nr_reserved_tags; } tag = __blk_mq_get_tag(data, bt); if (tag != BLK_MQ_NO_TAG) goto found_tag; if (data->flags & BLK_MQ_REQ_NOWAIT) return BLK_MQ_NO_TAG; ws = bt_wait_ptr(bt, data->hctx); do { struct sbitmap_queue *bt_prev; /* * We're out of tags on this hardware queue, kick any * pending IO submits before going to sleep waiting for * some to complete. */ blk_mq_run_hw_queue(data->hctx, false); /* * Retry tag allocation after running the hardware queue, * as running the queue may also have found completions. 
*/ tag = __blk_mq_get_tag(data, bt); if (tag != BLK_MQ_NO_TAG) break; sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE); tag = __blk_mq_get_tag(data, bt); if (tag != BLK_MQ_NO_TAG) break; bt_prev = bt; io_schedule(); sbitmap_finish_wait(bt, ws, &wait); data->ctx = blk_mq_get_ctx(data->q); data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx); tags = blk_mq_tags_from_data(data); if (data->flags & BLK_MQ_REQ_RESERVED) bt = &tags->breserved_tags; else bt = &tags->bitmap_tags; /* * If destination hw queue is changed, fake wake up on * previous queue for compensating the wake up miss, so * other allocations on previous queue won't be starved. */ if (bt != bt_prev) sbitmap_queue_wake_up(bt_prev, 1); ws = bt_wait_ptr(bt, data->hctx); } while (1); sbitmap_finish_wait(bt, ws, &wait); found_tag: /* * Give up this allocation if the hctx is inactive. The caller will * retry on an active hctx. */ if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) { blk_mq_put_tag(tags, data->ctx, tag + tag_offset); return BLK_MQ_NO_TAG; } return tag + tag_offset; } void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag) { if (!blk_mq_tag_is_reserved(tags, tag)) { const int real_tag = tag - tags->nr_reserved_tags; BUG_ON(real_tag >= tags->nr_tags); sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu); } else { sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu); } } void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags) { sbitmap_queue_clear_batch(&tags->bitmap_tags, tags->nr_reserved_tags, tag_array, nr_tags); } struct bt_iter_data { struct blk_mq_hw_ctx *hctx; struct request_queue *q; busy_tag_iter_fn *fn; void *data; bool reserved; }; static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, unsigned int bitnr) { struct request *rq; rq = tags->rqs[bitnr]; if (!rq || rq->tag != bitnr || !req_ref_inc_not_zero(rq)) rq = NULL; return rq; } static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { struct bt_iter_data *iter_data = data; struct blk_mq_hw_ctx *hctx = iter_data->hctx; struct request_queue *q = iter_data->q; struct blk_mq_tag_set *set = q->tag_set; struct blk_mq_tags *tags; struct request *rq; bool ret = true; if (blk_mq_is_shared_tags(set->flags)) tags = set->shared_tags; else tags = hctx->tags; if (!iter_data->reserved) bitnr += tags->nr_reserved_tags; /* * We can hit rq == NULL here, because the tagging functions * test and set the bit before assigning ->rqs[]. */ rq = blk_mq_find_and_get_req(tags, bitnr); if (!rq) return true; if (rq->q == q && (!hctx || rq->mq_hctx == hctx)) ret = iter_data->fn(rq, iter_data->data); blk_mq_put_rq_ref(rq); return ret; } /** * bt_for_each - iterate over the requests associated with a hardware queue * @hctx: Hardware queue to examine. * @q: Request queue @hctx is associated with (@hctx->queue). * @bt: sbitmap to examine. This is either the breserved_tags member * or the bitmap_tags member of struct blk_mq_tags. * @fn: Pointer to the function that will be called for each request * associated with @hctx that has been assigned a driver tag. * @fn will be called as follows: @fn(rq, @data) where rq is a * pointer to a request. Return %true to continue iterating tags; * %false to stop. * @data: Will be passed as second argument to @fn. * @reserved: Indicates whether @bt is the breserved_tags member or the * bitmap_tags member of struct blk_mq_tags. 
*/ static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct request_queue *q, struct sbitmap_queue *bt, busy_tag_iter_fn *fn, void *data, bool reserved) { struct bt_iter_data iter_data = { .hctx = hctx, .fn = fn, .data = data, .reserved = reserved, .q = q, }; sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data); } struct bt_tags_iter_data { struct blk_mq_tags *tags; busy_tag_iter_fn *fn; void *data; unsigned int flags; }; #define BT_TAG_ITER_RESERVED (1 << 0) #define BT_TAG_ITER_STARTED (1 << 1) #define BT_TAG_ITER_STATIC_RQS (1 << 2) static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { struct bt_tags_iter_data *iter_data = data; struct blk_mq_tags *tags = iter_data->tags; struct request *rq; bool ret = true; bool iter_static_rqs = !!(iter_data->flags & BT_TAG_ITER_STATIC_RQS); if (!(iter_data->flags & BT_TAG_ITER_RESERVED)) bitnr += tags->nr_reserved_tags; /* * We can hit rq == NULL here, because the tagging functions * test and set the bit before assigning ->rqs[]. */ if (iter_static_rqs) rq = tags->static_rqs[bitnr]; else rq = blk_mq_find_and_get_req(tags, bitnr); if (!rq) return true; if (!(iter_data->flags & BT_TAG_ITER_STARTED) || blk_mq_request_started(rq)) ret = iter_data->fn(rq, iter_data->data); if (!iter_static_rqs) blk_mq_put_rq_ref(rq); return ret; } /** * bt_tags_for_each - iterate over the requests in a tag map * @tags: Tag map to iterate over. * @bt: sbitmap to examine. This is either the breserved_tags member * or the bitmap_tags member of struct blk_mq_tags. * @fn: Pointer to the function that will be called for each started * request. @fn will be called as follows: @fn(rq, @data) where rq * is a pointer to a request. Return %true to continue iterating * tags; %false to stop. * @data: Will be passed as second argument to @fn. * @flags: BT_TAG_ITER_* */ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt, busy_tag_iter_fn *fn, void *data, unsigned int flags) { struct bt_tags_iter_data iter_data = { .tags = tags, .fn = fn, .data = data, .flags = flags, }; if (tags->rqs) sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data); } static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv, unsigned int flags) { WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED); if (tags->nr_reserved_tags) bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, flags | BT_TAG_ITER_RESERVED); bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, flags); } /** * blk_mq_all_tag_iter - iterate over all requests in a tag map * @tags: Tag map to iterate over. * @fn: Pointer to the function that will be called for each * request. @fn will be called as follows: @fn(rq, @priv) where rq * is a pointer to a request. Return %true to continue iterating * tags; %false to stop. * @priv: Will be passed as second argument to @fn. * * Caller has to pass the tag map from which requests are allocated. */ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv) { __blk_mq_all_tag_iter(tags, fn, priv, BT_TAG_ITER_STATIC_RQS); } /** * blk_mq_tagset_busy_iter - iterate over all started requests in a tag set * @tagset: Tag set to iterate over. * @fn: Pointer to the function that will be called for each started * request. @fn will be called as follows: @fn(rq, @priv) where * rq is a pointer to a request. Return true to continue iterating * tags, false to stop. * @priv: Will be passed as second argument to @fn. * * We grab one request reference before calling @fn and release it after * @fn returns. 
*/ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv) { unsigned int flags = tagset->flags; int i, nr_tags, srcu_idx; srcu_idx = srcu_read_lock(&tagset->tags_srcu); nr_tags = blk_mq_is_shared_tags(flags) ? 1 : tagset->nr_hw_queues; for (i = 0; i < nr_tags; i++) { if (tagset->tags && tagset->tags[i]) __blk_mq_all_tag_iter(tagset->tags[i], fn, priv, BT_TAG_ITER_STARTED); } srcu_read_unlock(&tagset->tags_srcu, srcu_idx); } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); static bool blk_mq_tagset_count_completed_rqs(struct request *rq, void *data) { unsigned *count = data; if (blk_mq_request_completed(rq)) (*count)++; return true; } /** * blk_mq_tagset_wait_completed_request - Wait until all scheduled request * completions have finished. * @tagset: Tag set to drain completed request * * Note: This function has to be run after all IO queues are shutdown */ void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset) { while (true) { unsigned count = 0; blk_mq_tagset_busy_iter(tagset, blk_mq_tagset_count_completed_rqs, &count); if (!count) break; msleep(5); } } EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request); /** * blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag * @q: Request queue to examine. * @fn: Pointer to the function that will be called for each request * on @q. @fn will be called as follows: @fn(rq, @priv) where rq * is a pointer to a request and hctx points to the hardware queue * associated with the request. * @priv: Will be passed as second argument to @fn. * * Note: if @q->tag_set is shared with other request queues then @fn will be * called for all requests on all queues that share that tag set and not only * for requests associated with @q. */ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv) { int srcu_idx; /* * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx * while the queue is frozen. So we can use q_usage_counter to avoid * racing with it. 
*/ if (!percpu_ref_tryget(&q->q_usage_counter)) return; srcu_idx = srcu_read_lock(&q->tag_set->tags_srcu); if (blk_mq_is_shared_tags(q->tag_set->flags)) { struct blk_mq_tags *tags = q->tag_set->shared_tags; struct sbitmap_queue *bresv = &tags->breserved_tags; struct sbitmap_queue *btags = &tags->bitmap_tags; if (tags->nr_reserved_tags) bt_for_each(NULL, q, bresv, fn, priv, true); bt_for_each(NULL, q, btags, fn, priv, false); } else { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { struct blk_mq_tags *tags = hctx->tags; struct sbitmap_queue *bresv = &tags->breserved_tags; struct sbitmap_queue *btags = &tags->bitmap_tags; /* * If no software queues are currently mapped to this * hardware queue, there's nothing to check */ if (!blk_mq_hw_queue_mapped(hctx)) continue; if (tags->nr_reserved_tags) bt_for_each(hctx, q, bresv, fn, priv, true); bt_for_each(hctx, q, btags, fn, priv, false); } } srcu_read_unlock(&q->tag_set->tags_srcu, srcu_idx); blk_queue_exit(q); } static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, bool round_robin, int node) { return sbitmap_queue_init_node(bt, depth, -1, round_robin, GFP_KERNEL, node); } struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, unsigned int reserved_tags, unsigned int flags, int node) { unsigned int depth = total_tags - reserved_tags; bool round_robin = flags & BLK_MQ_F_TAG_RR; struct blk_mq_tags *tags; if (total_tags > BLK_MQ_TAG_MAX) { pr_err("blk-mq: tag depth too large\n"); return NULL; } tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node); if (!tags) return NULL; tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; spin_lock_init(&tags->lock); INIT_LIST_HEAD(&tags->page_list); if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node)) goto out_free_tags; if (bt_alloc(&tags->breserved_tags, reserved_tags, round_robin, node)) goto out_free_bitmap_tags; return tags; out_free_bitmap_tags: sbitmap_queue_free(&tags->bitmap_tags); out_free_tags: kfree(tags); return NULL; } static void blk_mq_free_tags_callback(struct rcu_head *head) { struct blk_mq_tags *tags = container_of(head, struct blk_mq_tags, rcu_head); struct page *page; while (!list_empty(&tags->page_list)) { page = list_first_entry(&tags->page_list, struct page, lru); list_del_init(&page->lru); /* * Remove kmemleak object previously allocated in * blk_mq_alloc_rqs(). */ kmemleak_free(page_address(page)); __free_pages(page, page->private); } kfree(tags); } void blk_mq_free_tags(struct blk_mq_tag_set *set, struct blk_mq_tags *tags) { sbitmap_queue_free(&tags->bitmap_tags); sbitmap_queue_free(&tags->breserved_tags); /* if tags pages is not allocated yet, free tags directly */ if (list_empty(&tags->page_list)) { kfree(tags); return; } call_srcu(&set->tags_srcu, &tags->rcu_head, blk_mq_free_tags_callback); } void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size) { struct blk_mq_tags *tags = set->shared_tags; sbitmap_queue_resize(&tags->bitmap_tags, size - set->reserved_tags); } void blk_mq_tag_update_sched_shared_tags(struct request_queue *q, unsigned int nr) { sbitmap_queue_resize(&q->sched_shared_tags->bitmap_tags, nr - q->tag_set->reserved_tags); } /** * blk_mq_unique_tag() - return a tag that is unique queue-wide * @rq: request for which to compute a unique tag * * The tag field in struct request is unique per hardware queue but not over * all hardware queues. 
Hence this function that returns a tag with the * hardware context index in the upper bits and the per hardware queue tag in * the lower bits. * * Note: When called for a request that is queued on a non-multiqueue request * queue, the hardware context index is set to zero. */ u32 blk_mq_unique_tag(struct request *rq) { return (rq->mq_hctx->queue_num << BLK_MQ_UNIQUE_TAG_BITS) | (rq->tag & BLK_MQ_UNIQUE_TAG_MASK); } EXPORT_SYMBOL(blk_mq_unique_tag); |
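blk_mq_tagset_busy_iter() drives a driver-supplied busy_tag_iter_fn over every started request in a tag set, as the kerneldoc above describes. A hedged driver-style sketch follows, counting requests that have not yet completed (in the same spirit as blk_mq_tagset_count_completed_rqs()); foo_count_one() and foo_count_inflight() are invented names.

#include <linux/blk-mq.h>

/* Callback: bump the counter for every started, not-yet-completed request. */
static bool foo_count_one(struct request *rq, void *data)
{
	unsigned int *count = data;

	if (!blk_mq_request_completed(rq))
		(*count)++;
	return true;	/* keep iterating */
}

static unsigned int foo_count_inflight(struct blk_mq_tag_set *set)
{
	unsigned int count = 0;

	blk_mq_tagset_busy_iter(set, foo_count_one, &count);
	return count;
}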
| 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 | // SPDX-License-Identifier: GPL-2.0 #include <net/ip.h> #include <net/ip6_checksum.h> #include <net/udp.h> #include <asm/checksum.h> #ifndef _HAVE_ARCH_IPV6_CSUM __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, __u32 len, __u8 proto, __wsum csum) { int carry; __u32 ulen; __u32 uproto; __u32 sum = (__force u32)csum; sum += (__force u32)saddr->s6_addr32[0]; carry = (sum < (__force u32)saddr->s6_addr32[0]); sum += carry; sum += (__force u32)saddr->s6_addr32[1]; carry = (sum < (__force u32)saddr->s6_addr32[1]); sum += carry; sum += (__force u32)saddr->s6_addr32[2]; carry = (sum < (__force u32)saddr->s6_addr32[2]); sum += carry; sum += (__force u32)saddr->s6_addr32[3]; carry = (sum < (__force u32)saddr->s6_addr32[3]); sum += carry; sum += (__force u32)daddr->s6_addr32[0]; carry = (sum < (__force u32)daddr->s6_addr32[0]); sum += carry; sum += (__force u32)daddr->s6_addr32[1]; carry = (sum < (__force u32)daddr->s6_addr32[1]); sum += carry; sum += (__force u32)daddr->s6_addr32[2]; carry = (sum < (__force u32)daddr->s6_addr32[2]); sum += carry; sum += (__force u32)daddr->s6_addr32[3]; carry = (sum < (__force u32)daddr->s6_addr32[3]); sum += carry; ulen = (__force u32)htonl((__u32) len); sum += ulen; carry = (sum < ulen); sum += carry; uproto = (__force u32)htonl(proto); sum += uproto; carry = (sum < uproto); sum += carry; return csum_fold((__force __wsum)sum); } EXPORT_SYMBOL(csum_ipv6_magic); #endif /* Function to set UDP checksum for an IPv6 UDP packet. This is intended * for the simple case like when setting the checksum for a UDP tunnel. */ void udp6_set_csum(bool nocheck, struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr, int len) { struct udphdr *uh = udp_hdr(skb); if (nocheck) uh->check = 0; else if (skb_is_gso(skb)) uh->check = ~udp_v6_check(len, saddr, daddr, 0); else if (skb->ip_summed == CHECKSUM_PARTIAL) { uh->check = 0; uh->check = udp_v6_check(len, saddr, daddr, lco_csum(skb)); if (uh->check == 0) uh->check = CSUM_MANGLED_0; } else { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); uh->check = ~udp_v6_check(len, saddr, daddr, 0); } } EXPORT_SYMBOL(udp6_set_csum); |
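csum_ipv6_magic() above accumulates the IPv6 pseudo-header in 32-bit chunks with an explicit end-around carry and then folds the result with csum_fold(). The standalone illustration below (plain C, not kernel code) shows the equivalent fold-and-complement step on a 64-bit one's-complement accumulator.

#include <stdint.h>

/*
 * Fold a 64-bit one's-complement accumulator down to the final 16-bit
 * Internet checksum: add the carries back in, then complement.
 */
static uint16_t fold_csum(uint64_t sum)
{
	sum = (sum & 0xffffffff) + (sum >> 32);	/* 64 -> 32 bits */
	sum = (sum & 0xffffffff) + (sum >> 32);	/* absorb carry  */
	sum = (sum & 0xffff) + (sum >> 16);	/* 32 -> 16 bits */
	sum = (sum & 0xffff) + (sum >> 16);	/* absorb carry  */
	return (uint16_t)~sum;
}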
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __X86_KERNEL_FPU_XSTATE_H #define __X86_KERNEL_FPU_XSTATE_H #include <asm/cpufeature.h> #include <asm/fpu/xstate.h> #include <asm/fpu/xcr.h> #include <asm/msr.h> #ifdef CONFIG_X86_64 DECLARE_PER_CPU(u64, xfd_state); #endif static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) { /* * XRSTORS requires these bits set in xcomp_bv, or it will * trigger #GP: */ if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT; } static inline u64 xstate_get_group_perm(bool guest) { struct fpu *fpu = x86_task_fpu(current->group_leader); struct fpu_state_perm *perm; /* Pairs with WRITE_ONCE() in xstate_request_perm() */ perm = guest ?
&fpu->guest_perm : &fpu->perm; return READ_ONCE(perm->__state_perm); } static inline u64 xstate_get_host_group_perm(void) { return xstate_get_group_perm(false); } enum xstate_copy_mode { XSTATE_COPY_FP, XSTATE_COPY_FX, XSTATE_COPY_XSAVE, }; struct membuf; extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, u64 xfeatures, u32 pkru_val, enum xstate_copy_mode copy_mode); extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode mode); extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru); extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf); extern void fpu__init_cpu_xstate(void); extern void fpu__init_system_xstate(unsigned int legacy_size); extern void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr); static inline u64 xfeatures_mask_supervisor(void) { return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED; } static inline u64 xfeatures_mask_independent(void) { if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) return fpu_kernel_cfg.independent_features & ~XFEATURE_MASK_LBR; return fpu_kernel_cfg.independent_features; } static inline int set_xfeature_in_sigframe(struct xregs_state __user *xbuf, u64 mask) { u64 xfeatures; int err; /* Read the xfeatures value already saved in the user buffer */ err = __get_user(xfeatures, &xbuf->header.xfeatures); xfeatures |= mask; err |= __put_user(xfeatures, &xbuf->header.xfeatures); return err; } /* * Update the value of PKRU register that was already pushed onto the signal frame. */ static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u32 pkru) { int err; if (unlikely(!cpu_feature_enabled(X86_FEATURE_OSPKE))) return 0; /* Mark PKRU as in-use so that it is restored correctly. */ err = set_xfeature_in_sigframe(buf, XFEATURE_MASK_PKRU); if (err) return err; /* Update PKRU value in the userspace xsave buffer. */ return __put_user(pkru, (unsigned int __user *)get_xsave_addr_user(buf, XFEATURE_PKRU)); } /* XSAVE/XRSTOR wrapper functions */ #ifdef CONFIG_X86_64 #define REX_SUFFIX "64" #else #define REX_SUFFIX #endif #define XSAVE "xsave" REX_SUFFIX " %[xa]" #define XSAVEOPT "xsaveopt" REX_SUFFIX " %[xa]" #define XSAVEC "xsavec" REX_SUFFIX " %[xa]" #define XSAVES "xsaves" REX_SUFFIX " %[xa]" #define XRSTOR "xrstor" REX_SUFFIX " %[xa]" #define XRSTORS "xrstors" REX_SUFFIX " %[xa]" /* * After this @err contains 0 on success or the trap number when the * operation raises an exception. * * The [xa] input parameter below represents the struct xregs_state pointer * and the asm symbolic name for the argument used in the XSAVE/XRSTOR insns * above. */ #define XSTATE_OP(op, st, lmask, hmask, err) \ asm volatile("1:" op "\n\t" \ "xor %[err], %[err]\n" \ "2:\n" \ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ : [err] "=a" (err) \ : [xa] "m" (*(st)), "a" (lmask), "d" (hmask) \ : "memory") /* * If XSAVES is enabled, it replaces XSAVEC because it supports supervisor * states in addition to XSAVEC. * * Otherwise if XSAVEC is enabled, it replaces XSAVEOPT because it supports * compacted storage format in addition to XSAVEOPT. * * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT * supports modified optimization which is not supported by XSAVE. * * Use XSAVE as a fallback. 
*/ #define XSTATE_XSAVE(st, lmask, hmask, err) \ asm volatile("1: " ALTERNATIVE_3(XSAVE, \ XSAVEOPT, X86_FEATURE_XSAVEOPT, \ XSAVEC, X86_FEATURE_XSAVEC, \ XSAVES, X86_FEATURE_XSAVES) \ "\n\t" \ "xor %[err], %[err]\n" \ "3:\n" \ _ASM_EXTABLE_TYPE_REG(1b, 3b, EX_TYPE_EFAULT_REG, %[err]) \ : [err] "=r" (err) \ : [xa] "m" (*(st)), "a" (lmask), "d" (hmask) \ : "memory") /* * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact * XSAVE area format. */ #define XSTATE_XRESTORE(st, lmask, hmask) \ asm volatile("1: " ALTERNATIVE(XRSTOR, \ XRSTORS, X86_FEATURE_XSAVES) \ "\n" \ "3:\n" \ _ASM_EXTABLE_TYPE(1b, 3b, EX_TYPE_FPU_RESTORE) \ : \ : [xa] "m" (*(st)), "a" (lmask), "d" (hmask) \ : "memory") #if defined(CONFIG_X86_64) && defined(CONFIG_X86_DEBUG_FPU) extern void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor); #else static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { } #endif #ifdef CONFIG_X86_64 static inline void xfd_set_state(u64 xfd) { wrmsrq(MSR_IA32_XFD, xfd); __this_cpu_write(xfd_state, xfd); } static inline void xfd_update_state(struct fpstate *fpstate) { if (fpu_state_size_dynamic()) { u64 xfd = fpstate->xfd; if (__this_cpu_read(xfd_state) != xfd) xfd_set_state(xfd); } } extern int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu); #else static inline void xfd_set_state(u64 xfd) { } static inline void xfd_update_state(struct fpstate *fpstate) { } static inline int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu) { return -EPERM; } #endif /* * Save processor xstate to xsave area. * * Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features * and command line options. The choice is permanent until the next reboot. */ static inline void os_xsave(struct fpstate *fpstate) { u64 mask = fpstate->xfeatures; u32 lmask = mask; u32 hmask = mask >> 32; int err; WARN_ON_FPU(!alternatives_patched); xfd_validate_state(fpstate, mask, false); XSTATE_XSAVE(&fpstate->regs.xsave, lmask, hmask, err); /* We should never fault when copying to a kernel buffer: */ WARN_ON_FPU(err); } /* * Restore processor xstate from xsave area. * * Uses XRSTORS when XSAVES is used, XRSTOR otherwise. */ static inline void os_xrstor(struct fpstate *fpstate, u64 mask) { u32 lmask = mask; u32 hmask = mask >> 32; xfd_validate_state(fpstate, mask, true); XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask); } /* Restore of supervisor state. Does not require XFD */ static inline void os_xrstor_supervisor(struct fpstate *fpstate) { u64 mask = xfeatures_mask_supervisor(); u32 lmask = mask; u32 hmask = mask >> 32; XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask); } /* * XSAVE itself always writes all requested xfeatures. Removing features * from the request bitmap reduces the features which are written. * Generate a mask of features which must be written to a sigframe. The * unset features can be optimized away and not written. * * This optimization is user-visible. Only use for states where * uninitialized sigframe contents are tolerable, like dynamic features. * * Users of buffers produced with this optimization must check XSTATE_BV * to determine which features have been optimized out. 
*/ static inline u64 xfeatures_need_sigframe_write(void) { u64 xfeatures_to_write; /* In-use features must be written: */ xfeatures_to_write = xfeatures_in_use(); /* Also write all non-optimizable sigframe features: */ xfeatures_to_write |= XFEATURE_MASK_USER_SUPPORTED & ~XFEATURE_MASK_SIGFRAME_INITOPT; return xfeatures_to_write; } /* * Save xstate to user space xsave area. * * We don't use modified optimization because xrstor/xrstors might track * a different application. * * We don't use compacted format xsave area for backward compatibility for * old applications which don't understand the compacted format of the * xsave area. * * The caller has to zero buf::header before calling this because XSAVE* * does not touch the reserved fields in the header. */ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf, u32 pkru) { /* * Include the features which are not xsaved/rstored by the kernel * internally, e.g. PKRU. That's user space ABI and also required * to allow the signal handler to modify PKRU. */ struct fpstate *fpstate = x86_task_fpu(current)->fpstate; u64 mask = fpstate->user_xfeatures; u32 lmask; u32 hmask; int err; /* Optimize away writing unnecessary xfeatures: */ if (fpu_state_size_dynamic()) mask &= xfeatures_need_sigframe_write(); lmask = mask; hmask = mask >> 32; xfd_validate_state(fpstate, mask, false); stac(); XSTATE_OP(XSAVE, buf, lmask, hmask, err); clac(); if (!err) err = update_pkru_in_sigframe(buf, pkru); return err; } /* * Restore xstate from user space xsave area. */ static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask) { struct xregs_state *xstate = ((__force struct xregs_state *)buf); u32 lmask = mask; u32 hmask = mask >> 32; int err; xfd_validate_state(x86_task_fpu(current)->fpstate, mask, true); stac(); XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); clac(); return err; } /* * Restore xstate from kernel space xsave area, return an error code instead of * an exception. */ static inline int os_xrstor_safe(struct fpstate *fpstate, u64 mask) { struct xregs_state *xstate = &fpstate->regs.xsave; u32 lmask = mask; u32 hmask = mask >> 32; int err; /* Ensure that XFD is up to date */ xfd_update_state(fpstate); if (cpu_feature_enabled(X86_FEATURE_XSAVES)) XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); else XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); return err; } #endif |
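/*
 * Illustrative aside (not part of the header above): a minimal, standalone
 * userspace sketch of two conventions the header relies on. The 64-bit
 * xfeatures mask passed to XSAVE/XRSTOR is split into a low half ("lmask",
 * consumed via EAX) and a high half ("hmask", via EDX), and the compacted
 * xsave format is marked by setting bit 63 of xcomp_bv. The constant and
 * feature bits below are restated locally for this example only.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_XCOMP_BV_COMPACTED_FORMAT  ((uint64_t)1 << 63)

int main(void)
{
        /* FP (bit 0), SSE (bit 1) and PKRU (bit 9) as an example request. */
        uint64_t mask = (1ULL << 0) | (1ULL << 1) | (1ULL << 9);
        uint32_t lmask = (uint32_t)mask;                /* low 32 bits, as in os_xsave() */
        uint32_t hmask = (uint32_t)(mask >> 32);        /* high 32 bits */
        uint64_t xcomp_bv = mask | DEMO_XCOMP_BV_COMPACTED_FORMAT;

        printf("lmask=%#x hmask=%#x xcomp_bv=%#llx\n",
               lmask, hmask, (unsigned long long)xcomp_bv);
        return 0;
}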
// SPDX-License-Identifier: GPL-2.0
#include <linux/memcontrol.h>
#include <linux/rwsem.h>
#include <linux/shrinker.h>
#include <linux/rculist.h>
#include <trace/events/vmscan.h>

#include "internal.h"

LIST_HEAD(shrinker_list);
DEFINE_MUTEX(shrinker_mutex);

#ifdef CONFIG_MEMCG
static int shrinker_nr_max;

static inline int shrinker_unit_size(int nr_items)
{
return (DIV_ROUND_UP(nr_items, SHRINKER_UNIT_BITS) * sizeof(struct shrinker_info_unit *)); } static inline void shrinker_unit_free(struct shrinker_info *info, int start) { struct shrinker_info_unit **unit; int nr, i; if (!info) return; unit = info->unit; nr = DIV_ROUND_UP(info->map_nr_max, SHRINKER_UNIT_BITS); for (i = start; i < nr; i++) { if (!unit[i]) break; kfree(unit[i]); unit[i] = NULL; } } static inline int shrinker_unit_alloc(struct shrinker_info *new, struct shrinker_info *old, int nid) { struct shrinker_info_unit *unit; int nr = DIV_ROUND_UP(new->map_nr_max, SHRINKER_UNIT_BITS); int start = old ? DIV_ROUND_UP(old->map_nr_max, SHRINKER_UNIT_BITS) : 0; int i; for (i = start; i < nr; i++) { unit = kzalloc_node(sizeof(*unit), GFP_KERNEL, nid); if (!unit) { shrinker_unit_free(new, start); return -ENOMEM; } new->unit[i] = unit; } return 0; } void free_shrinker_info(struct mem_cgroup *memcg) { struct mem_cgroup_per_node *pn; struct shrinker_info *info; int nid; for_each_node(nid) { pn = memcg->nodeinfo[nid]; info = rcu_dereference_protected(pn->shrinker_info, true); shrinker_unit_free(info, 0); kvfree(info); rcu_assign_pointer(pn->shrinker_info, NULL); } } int alloc_shrinker_info(struct mem_cgroup *memcg) { int nid, ret = 0; int array_size = 0; mutex_lock(&shrinker_mutex); array_size = shrinker_unit_size(shrinker_nr_max); for_each_node(nid) { struct shrinker_info *info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid); if (!info) goto err; info->map_nr_max = shrinker_nr_max; if (shrinker_unit_alloc(info, NULL, nid)) { kvfree(info); goto err; } rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } mutex_unlock(&shrinker_mutex); return ret; err: mutex_unlock(&shrinker_mutex); free_shrinker_info(memcg); return -ENOMEM; } static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, int nid) { return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, lockdep_is_held(&shrinker_mutex)); } static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size, int old_size, int new_nr_max) { struct shrinker_info *new, *old; struct mem_cgroup_per_node *pn; int nid; for_each_node(nid) { pn = memcg->nodeinfo[nid]; old = shrinker_info_protected(memcg, nid); /* Not yet online memcg */ if (!old) return 0; /* Already expanded this shrinker_info */ if (new_nr_max <= old->map_nr_max) continue; new = kvzalloc_node(sizeof(*new) + new_size, GFP_KERNEL, nid); if (!new) return -ENOMEM; new->map_nr_max = new_nr_max; memcpy(new->unit, old->unit, old_size); if (shrinker_unit_alloc(new, old, nid)) { kvfree(new); return -ENOMEM; } rcu_assign_pointer(pn->shrinker_info, new); kvfree_rcu(old, rcu); } return 0; } static int expand_shrinker_info(int new_id) { int ret = 0; int new_nr_max = round_up(new_id + 1, SHRINKER_UNIT_BITS); int new_size, old_size = 0; struct mem_cgroup *memcg; if (!root_mem_cgroup) goto out; lockdep_assert_held(&shrinker_mutex); new_size = shrinker_unit_size(new_nr_max); old_size = shrinker_unit_size(shrinker_nr_max); memcg = mem_cgroup_iter(NULL, NULL, NULL); do { ret = expand_one_shrinker_info(memcg, new_size, old_size, new_nr_max); if (ret) { mem_cgroup_iter_break(NULL, memcg); goto out; } } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); out: if (!ret) shrinker_nr_max = new_nr_max; return ret; } static inline int shrinker_id_to_index(int shrinker_id) { return shrinker_id / SHRINKER_UNIT_BITS; } static inline int shrinker_id_to_offset(int shrinker_id) { return shrinker_id % SHRINKER_UNIT_BITS; } static inline int 
calc_shrinker_id(int index, int offset) { return index * SHRINKER_UNIT_BITS + offset; } void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { struct shrinker_info *info; struct shrinker_info_unit *unit; rcu_read_lock(); info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); unit = info->unit[shrinker_id_to_index(shrinker_id)]; if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) { /* Pairs with smp mb in shrink_slab() */ smp_mb__before_atomic(); set_bit(shrinker_id_to_offset(shrinker_id), unit->map); } rcu_read_unlock(); } } static DEFINE_IDR(shrinker_idr); static int shrinker_memcg_alloc(struct shrinker *shrinker) { int id, ret = -ENOMEM; if (mem_cgroup_disabled()) return -ENOSYS; if (mem_cgroup_kmem_disabled() && !(shrinker->flags & SHRINKER_NONSLAB)) return -ENOSYS; mutex_lock(&shrinker_mutex); id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock; if (id >= shrinker_nr_max) { if (expand_shrinker_info(id)) { idr_remove(&shrinker_idr, id); goto unlock; } } shrinker->id = id; ret = 0; unlock: mutex_unlock(&shrinker_mutex); return ret; } static void shrinker_memcg_remove(struct shrinker *shrinker) { int id = shrinker->id; BUG_ON(id < 0); lockdep_assert_held(&shrinker_mutex); idr_remove(&shrinker_idr, id); } static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, struct mem_cgroup *memcg) { struct shrinker_info *info; struct shrinker_info_unit *unit; long nr_deferred; rcu_read_lock(); info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); unit = info->unit[shrinker_id_to_index(shrinker->id)]; nr_deferred = atomic_long_xchg(&unit->nr_deferred[shrinker_id_to_offset(shrinker->id)], 0); rcu_read_unlock(); return nr_deferred; } static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, struct mem_cgroup *memcg) { struct shrinker_info *info; struct shrinker_info_unit *unit; long nr_deferred; rcu_read_lock(); info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); unit = info->unit[shrinker_id_to_index(shrinker->id)]; nr_deferred = atomic_long_add_return(nr, &unit->nr_deferred[shrinker_id_to_offset(shrinker->id)]); rcu_read_unlock(); return nr_deferred; } void reparent_shrinker_deferred(struct mem_cgroup *memcg) { int nid, index, offset; long nr; struct mem_cgroup *parent = parent_mem_cgroup(memcg); struct shrinker_info *child_info, *parent_info; struct shrinker_info_unit *child_unit, *parent_unit; /* Prevent from concurrent shrinker_info expand */ mutex_lock(&shrinker_mutex); for_each_node(nid) { child_info = shrinker_info_protected(memcg, nid); parent_info = shrinker_info_protected(parent, nid); for (index = 0; index < shrinker_id_to_index(child_info->map_nr_max); index++) { child_unit = child_info->unit[index]; parent_unit = parent_info->unit[index]; for (offset = 0; offset < SHRINKER_UNIT_BITS; offset++) { nr = atomic_long_read(&child_unit->nr_deferred[offset]); atomic_long_add(nr, &parent_unit->nr_deferred[offset]); } } } mutex_unlock(&shrinker_mutex); } #else static int shrinker_memcg_alloc(struct shrinker *shrinker) { return -ENOSYS; } static void shrinker_memcg_remove(struct shrinker *shrinker) { } static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, struct mem_cgroup *memcg) { return 0; } static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, struct mem_cgroup *memcg) { return 0; } #endif /* CONFIG_MEMCG */ static long xchg_nr_deferred(struct shrinker *shrinker, struct shrink_control 
*sc) { int nid = sc->nid; if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) nid = 0; if (sc->memcg && (shrinker->flags & SHRINKER_MEMCG_AWARE)) return xchg_nr_deferred_memcg(nid, shrinker, sc->memcg); return atomic_long_xchg(&shrinker->nr_deferred[nid], 0); } static long add_nr_deferred(long nr, struct shrinker *shrinker, struct shrink_control *sc) { int nid = sc->nid; if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) nid = 0; if (sc->memcg && (shrinker->flags & SHRINKER_MEMCG_AWARE)) return add_nr_deferred_memcg(nr, nid, shrinker, sc->memcg); return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); } #define SHRINK_BATCH 128 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, struct shrinker *shrinker, int priority) { unsigned long freed = 0; unsigned long long delta; long total_scan; long freeable; long nr; long new_nr; long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; long scanned = 0, next_deferred; freeable = shrinker->count_objects(shrinker, shrinkctl); if (freeable == 0 || freeable == SHRINK_EMPTY) return freeable; /* * copy the current shrinker scan count into a local variable * and zero it so that other concurrent shrinker invocations * don't also do this scanning work. */ nr = xchg_nr_deferred(shrinker, shrinkctl); if (shrinker->seeks) { delta = freeable >> priority; delta *= 4; do_div(delta, shrinker->seeks); } else { /* * These objects don't require any IO to create. Trim * them aggressively under memory pressure to keep * them from causing refetches in the IO caches. */ delta = freeable / 2; } total_scan = nr >> priority; total_scan += delta; total_scan = min(total_scan, (2 * freeable)); trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, freeable, delta, total_scan, priority, shrinkctl->memcg); /* * Normally, we should not scan less than batch_size objects in one * pass to avoid too frequent shrinker calls, but if the slab has less * than batch_size objects in total and we are really tight on memory, * we will try to reclaim all available objects, otherwise we can end * up failing allocations although there are plenty of reclaimable * objects spread over several slabs with usage less than the * batch_size. * * We detect the "tight on memory" situations by looking at the total * number of objects we want to scan (total_scan). If it is greater * than the total number of objects on slab (freeable), we must be * scanning at high prio and therefore should try to reclaim as much as * possible. */ while (total_scan >= batch_size || total_scan >= freeable) { unsigned long ret; unsigned long nr_to_scan = min(batch_size, total_scan); shrinkctl->nr_to_scan = nr_to_scan; shrinkctl->nr_scanned = nr_to_scan; ret = shrinker->scan_objects(shrinker, shrinkctl); if (ret == SHRINK_STOP) break; freed += ret; count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned); total_scan -= shrinkctl->nr_scanned; scanned += shrinkctl->nr_scanned; cond_resched(); } /* * The deferred work is increased by any new work (delta) that wasn't * done, decreased by old deferred work that was done now. * * And it is capped to two times of the freeable items. */ next_deferred = max_t(long, (nr + delta - scanned), 0); next_deferred = min(next_deferred, (2 * freeable)); /* * move the unused scan count back into the shrinker in a * manner that handles concurrent updates. 
*/ new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl); trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan, shrinkctl->memcg); return freed; } #ifdef CONFIG_MEMCG static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { struct shrinker_info *info; unsigned long ret, freed = 0; int offset, index = 0; if (!mem_cgroup_online(memcg)) return 0; /* * lockless algorithm of memcg shrink. * * The shrinker_info may be freed asynchronously via RCU in the * expand_one_shrinker_info(), so the rcu_read_lock() needs to be used * to ensure the existence of the shrinker_info. * * The shrinker_info_unit is never freed unless its corresponding memcg * is destroyed. Here we already hold the refcount of memcg, so the * memcg will not be destroyed, and of course shrinker_info_unit will * not be freed. * * So in the memcg shrink: * step 1: use rcu_read_lock() to guarantee existence of the * shrinker_info. * step 2: after getting shrinker_info_unit we can safely release the * RCU lock. * step 3: traverse the bitmap and calculate shrinker_id * step 4: use rcu_read_lock() to guarantee existence of the shrinker. * step 5: use shrinker_id to find the shrinker, then use * shrinker_try_get() to guarantee existence of the shrinker, * then we can release the RCU lock to do do_shrink_slab() that * may sleep. * step 6: do shrinker_put() paired with step 5 to put the refcount, * if the refcount reaches 0, then wake up the waiter in * shrinker_free() by calling complete(). * Note: here is different from the global shrink, we don't * need to acquire the RCU lock to guarantee existence of * the shrinker, because we don't need to use this * shrinker to traverse the next shrinker in the bitmap. * step 7: we have already exited the read-side of rcu critical section * before calling do_shrink_slab(), the shrinker_info may be * released in expand_one_shrinker_info(), so go back to step 1 * to reacquire the shrinker_info. */ again: rcu_read_lock(); info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); if (unlikely(!info)) goto unlock; if (index < shrinker_id_to_index(info->map_nr_max)) { struct shrinker_info_unit *unit; unit = info->unit[index]; rcu_read_unlock(); for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, .memcg = memcg, }; struct shrinker *shrinker; int shrinker_id = calc_shrinker_id(index, offset); rcu_read_lock(); shrinker = idr_find(&shrinker_idr, shrinker_id); if (unlikely(!shrinker || !shrinker_try_get(shrinker))) { clear_bit(offset, unit->map); rcu_read_unlock(); continue; } rcu_read_unlock(); /* Call non-slab shrinkers even though kmem is disabled */ if (!memcg_kmem_online() && !(shrinker->flags & SHRINKER_NONSLAB)) { clear_bit(offset, unit->map); shrinker_put(shrinker); continue; } ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) { clear_bit(offset, unit->map); /* * After the shrinker reported that it had no objects to * free, but before we cleared the corresponding bit in * the memcg shrinker map, a new object might have been * added. To make sure, we have the bit set in this * case, we invoke the shrinker one more time and reset * the bit if it reports that it is not empty anymore. 
* The memory barrier here pairs with the barrier in * set_shrinker_bit(): * * list_lru_add() shrink_slab_memcg() * list_add_tail() clear_bit() * <MB> <MB> * set_bit() do_shrink_slab() */ smp_mb__after_atomic(); ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) ret = 0; else set_shrinker_bit(memcg, nid, shrinker_id); } freed += ret; shrinker_put(shrinker); } index++; goto again; } unlock: rcu_read_unlock(); return freed; } #else /* !CONFIG_MEMCG */ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { return 0; } #endif /* CONFIG_MEMCG */ /** * shrink_slab - shrink slab caches * @gfp_mask: allocation context * @nid: node whose slab caches to target * @memcg: memory cgroup whose slab caches to target * @priority: the reclaim priority * * Call the shrink functions to age shrinkable caches. * * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, * unaware shrinkers will receive a node id of 0 instead. * * @memcg specifies the memory cgroup to target. Unaware shrinkers * are called only if it is the root cgroup. * * @priority is sc->priority, we take the number of objects and >> by priority * in order to get the scan target. * * Returns the number of reclaimed slab objects. */ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { unsigned long ret, freed = 0; struct shrinker *shrinker; /* * The root memcg might be allocated even though memcg is disabled * via "cgroup_disable=memory" boot parameter. This could make * mem_cgroup_is_root() return false, then just run memcg slab * shrink, but skip global shrink. This may result in premature * oom. */ if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) return shrink_slab_memcg(gfp_mask, nid, memcg, priority); /* * lockless algorithm of global shrink. * * In the unregistration setp, the shrinker will be freed asynchronously * via RCU after its refcount reaches 0. So both rcu_read_lock() and * shrinker_try_get() can be used to ensure the existence of the shrinker. * * So in the global shrink: * step 1: use rcu_read_lock() to guarantee existence of the shrinker * and the validity of the shrinker_list walk. * step 2: use shrinker_try_get() to try get the refcount, if successful, * then the existence of the shrinker can also be guaranteed, * so we can release the RCU lock to do do_shrink_slab() that * may sleep. * step 3: *MUST* to reacquire the RCU lock before calling shrinker_put(), * which ensures that neither this shrinker nor the next shrinker * will be freed in the next traversal operation. * step 4: do shrinker_put() paired with step 2 to put the refcount, * if the refcount reaches 0, then wake up the waiter in * shrinker_free() by calling complete(). */ rcu_read_lock(); list_for_each_entry_rcu(shrinker, &shrinker_list, list) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, .memcg = memcg, }; if (!shrinker_try_get(shrinker)) continue; rcu_read_unlock(); ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) ret = 0; freed += ret; rcu_read_lock(); shrinker_put(shrinker); } rcu_read_unlock(); cond_resched(); return freed; } struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...) 
{ struct shrinker *shrinker; unsigned int size; va_list ap; int err; shrinker = kzalloc_obj(struct shrinker); if (!shrinker) return NULL; va_start(ap, fmt); err = shrinker_debugfs_name_alloc(shrinker, fmt, ap); va_end(ap); if (err) goto err_name; shrinker->flags = flags | SHRINKER_ALLOCATED; shrinker->seeks = DEFAULT_SEEKS; if (flags & SHRINKER_MEMCG_AWARE) { err = shrinker_memcg_alloc(shrinker); if (err == -ENOSYS) { /* Memcg is not supported, fallback to non-memcg-aware shrinker. */ shrinker->flags &= ~SHRINKER_MEMCG_AWARE; goto non_memcg; } if (err) goto err_flags; return shrinker; } non_memcg: /* * The nr_deferred is available on per memcg level for memcg aware * shrinkers, so only allocate nr_deferred in the following cases: * - non-memcg-aware shrinkers * - !CONFIG_MEMCG * - memcg is disabled by kernel command line * - non-slab shrinkers: when memcg kmem is disabled */ size = sizeof(*shrinker->nr_deferred); if (flags & SHRINKER_NUMA_AWARE) size *= nr_node_ids; shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); if (!shrinker->nr_deferred) goto err_flags; return shrinker; err_flags: shrinker_debugfs_name_free(shrinker); err_name: kfree(shrinker); return NULL; } EXPORT_SYMBOL_GPL(shrinker_alloc); void shrinker_register(struct shrinker *shrinker) { if (unlikely(!(shrinker->flags & SHRINKER_ALLOCATED))) { pr_warn("Must use shrinker_alloc() to dynamically allocate the shrinker"); return; } mutex_lock(&shrinker_mutex); list_add_tail_rcu(&shrinker->list, &shrinker_list); shrinker->flags |= SHRINKER_REGISTERED; shrinker_debugfs_add(shrinker); mutex_unlock(&shrinker_mutex); init_completion(&shrinker->done); /* * Now the shrinker is fully set up, take the first reference to it to * indicate that lookup operations are now allowed to use it via * shrinker_try_get(). */ refcount_set(&shrinker->refcount, 1); } EXPORT_SYMBOL_GPL(shrinker_register); static void shrinker_free_rcu_cb(struct rcu_head *head) { struct shrinker *shrinker = container_of(head, struct shrinker, rcu); kfree(shrinker->nr_deferred); kfree(shrinker); } void shrinker_free(struct shrinker *shrinker) { struct dentry *debugfs_entry = NULL; int debugfs_id; if (!shrinker) return; if (shrinker->flags & SHRINKER_REGISTERED) { /* drop the initial refcount */ shrinker_put(shrinker); /* * Wait for all lookups of the shrinker to complete, after that, * no shrinker is running or will run again, then we can safely * free it asynchronously via RCU and safely free the structure * where the shrinker is located, such as super_block etc. */ wait_for_completion(&shrinker->done); } mutex_lock(&shrinker_mutex); if (shrinker->flags & SHRINKER_REGISTERED) { /* * Now we can safely remove it from the shrinker_list and then * free it. */ list_del_rcu(&shrinker->list); debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); shrinker->flags &= ~SHRINKER_REGISTERED; } shrinker_debugfs_name_free(shrinker); if (shrinker->flags & SHRINKER_MEMCG_AWARE) shrinker_memcg_remove(shrinker); mutex_unlock(&shrinker_mutex); if (debugfs_entry) shrinker_debugfs_remove(debugfs_entry, debugfs_id); call_rcu(&shrinker->rcu, shrinker_free_rcu_cb); } EXPORT_SYMBOL_GPL(shrinker_free); |
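/*
 * Illustrative aside (not part of the file above): a minimal sketch of how a
 * subsystem consumes the API implemented here. The "demo" cache, its atomic
 * object counter and the module wrappers are hypothetical; the calls to
 * shrinker_alloc(), shrinker_register() and shrinker_free() are the ones
 * defined above, and the callbacks follow the contract used by
 * do_shrink_slab(): count_objects() reports how many objects are freeable
 * (or SHRINK_EMPTY), scan_objects() frees up to sc->nr_to_scan of them and
 * returns the number actually freed.
 */
#include <linux/module.h>
#include <linux/shrinker.h>
#include <linux/atomic.h>

static atomic_long_t demo_nr_objects = ATOMIC_LONG_INIT(0);
static struct shrinker *demo_shrinker;

static unsigned long demo_count_objects(struct shrinker *shrink,
                                        struct shrink_control *sc)
{
        unsigned long n = atomic_long_read(&demo_nr_objects);

        return n ? n : SHRINK_EMPTY;
}

static unsigned long demo_scan_objects(struct shrinker *shrink,
                                       struct shrink_control *sc)
{
        unsigned long avail = atomic_long_read(&demo_nr_objects);
        unsigned long freed = sc->nr_to_scan < avail ? sc->nr_to_scan : avail;

        /* A real cache would walk its lists and free objects here. */
        atomic_long_sub(freed, &demo_nr_objects);
        return freed;
}

static int __init demo_shrinker_init(void)
{
        demo_shrinker = shrinker_alloc(0, "demo-cache");
        if (!demo_shrinker)
                return -ENOMEM;

        demo_shrinker->count_objects = demo_count_objects;
        demo_shrinker->scan_objects = demo_scan_objects;
        shrinker_register(demo_shrinker);
        return 0;
}

static void __exit demo_shrinker_exit(void)
{
        shrinker_free(demo_shrinker);
}

module_init(demo_shrinker_init);
module_exit(demo_shrinker_exit);
MODULE_LICENSE("GPL");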
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_KTHREAD_H
#define _LINUX_KTHREAD_H
/* Simple interface for creating and stopping kernel threads without mess. */
#include <linux/err.h>
#include <linux/sched.h>

struct mm_struct;

/* opaque kthread data */
struct kthread;

/*
 * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will
 * always remain a kthread. For kthreads p->worker_private always
 * points to a struct kthread. For tasks that are not kthreads
 * p->worker_private is used to point to other things.
 *
 * Return NULL for any task that is not a kthread.
 */
static inline struct kthread *tsk_is_kthread(struct task_struct *p)
{
        if (p->flags & PF_KTHREAD)
                return p->worker_private;
        return NULL;
}

__printf(4, 5)
struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
                                           void *data,
                                           int node,
                                           const char namefmt[], ...);

/**
 * kthread_create - create a kthread on the current node
 * @threadfn: the function to run in the thread
 * @data: data pointer for @threadfn()
 * @namefmt: printf-style format string for the thread name
 * @arg: arguments for @namefmt.
 *
 * This macro will create a kthread on the current node, leaving it in
 * the stopped state. This is just a helper for kthread_create_on_node();
 * see the documentation there for more details.
 */
#define kthread_create(threadfn, data, namefmt, arg...) \
        kthread_create_on_node(threadfn, data, NUMA_NO_NODE, namefmt, ##arg)

struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
                                          void *data, unsigned int cpu,
                                          const char *namefmt);

void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk);
bool set_kthread_struct(struct task_struct *p);

void kthread_set_per_cpu(struct task_struct *k, int cpu);
bool kthread_is_per_cpu(struct task_struct *k);

/**
 * kthread_run - create and wake a thread.
 * @threadfn: the function to run until signal_pending(current).
 * @data: data ptr for @threadfn.
 * @namefmt: printf-style name for the thread.
 *
 * Description: Convenient wrapper for kthread_create() followed by
 * wake_up_process(). Returns the kthread or ERR_PTR(-ENOMEM).
 */
#define kthread_run(threadfn, data, namefmt, ...)                          \
({                                                                         \
        struct task_struct *__k                                            \
                = kthread_create(threadfn, data, namefmt, ## __VA_ARGS__); \
        if (!IS_ERR(__k))                                                  \
                wake_up_process(__k);                                      \
        __k;                                                               \
})

/**
 * kthread_run_on_cpu - create and wake a cpu bound thread.
 * @threadfn: the function to run until signal_pending(current).
 * @data: data ptr for @threadfn.
* @cpu: The cpu on which the thread should be bound, * @namefmt: printf-style name for the thread. Format is restricted * to "name.*%u". Code fills in cpu number. * * Description: Convenient wrapper for kthread_create_on_cpu() * followed by wake_up_process(). Returns the kthread or * ERR_PTR(-ENOMEM). */ static inline struct task_struct * kthread_run_on_cpu(int (*threadfn)(void *data), void *data, unsigned int cpu, const char *namefmt) { struct task_struct *p; p = kthread_create_on_cpu(threadfn, data, cpu, namefmt); if (!IS_ERR(p)) wake_up_process(p); return p; } void free_kthread_struct(struct task_struct *k); void kthread_bind(struct task_struct *k, unsigned int cpu); void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask); int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask); int kthread_stop(struct task_struct *k); int kthread_stop_put(struct task_struct *k); bool kthread_should_stop(void); bool kthread_should_park(void); bool kthread_should_stop_or_park(void); bool kthread_freezable_should_stop(bool *was_frozen); void *kthread_func(struct task_struct *k); void *kthread_data(struct task_struct *k); void *kthread_probe_data(struct task_struct *k); int kthread_park(struct task_struct *k); void kthread_unpark(struct task_struct *k); void kthread_parkme(void); #define kthread_exit(result) do_exit(result) void kthread_complete_and_exit(struct completion *, long) __noreturn; int kthreads_update_housekeeping(void); void kthread_do_exit(struct kthread *, long); int kthreadd(void *unused); extern struct task_struct *kthreadd_task; extern int tsk_fork_get_node(struct task_struct *tsk); /* * Simple work processor based on kthread. * * This provides easier way to make use of kthreads. A kthread_work * can be queued and flushed using queue/kthread_flush_work() * respectively. Queued kthread_works are processed by a kthread * running kthread_worker_fn(). */ struct kthread_work; typedef void (*kthread_work_func_t)(struct kthread_work *work); void kthread_delayed_work_timer_fn(struct timer_list *t); enum { KTW_FREEZABLE = 1 << 0, /* freeze during suspend */ }; struct kthread_worker { unsigned int flags; raw_spinlock_t lock; struct list_head work_list; struct list_head delayed_work_list; struct task_struct *task; struct kthread_work *current_work; }; struct kthread_work { struct list_head node; kthread_work_func_t func; struct kthread_worker *worker; /* Number of canceling calls that are running at the moment. 
*/ int canceling; }; struct kthread_delayed_work { struct kthread_work work; struct timer_list timer; }; #define KTHREAD_WORK_INIT(work, fn) { \ .node = LIST_HEAD_INIT((work).node), \ .func = (fn), \ } #define KTHREAD_DELAYED_WORK_INIT(dwork, fn) { \ .work = KTHREAD_WORK_INIT((dwork).work, (fn)), \ .timer = __TIMER_INITIALIZER(kthread_delayed_work_timer_fn,\ TIMER_IRQSAFE), \ } #define DEFINE_KTHREAD_WORK(work, fn) \ struct kthread_work work = KTHREAD_WORK_INIT(work, fn) #define DEFINE_KTHREAD_DELAYED_WORK(dwork, fn) \ struct kthread_delayed_work dwork = \ KTHREAD_DELAYED_WORK_INIT(dwork, fn) extern void __kthread_init_worker(struct kthread_worker *worker, const char *name, struct lock_class_key *key); #define kthread_init_worker(worker) \ do { \ static struct lock_class_key __key; \ __kthread_init_worker((worker), "("#worker")->lock", &__key); \ } while (0) #define kthread_init_work(work, fn) \ do { \ memset((work), 0, sizeof(struct kthread_work)); \ INIT_LIST_HEAD(&(work)->node); \ (work)->func = (fn); \ } while (0) #define kthread_init_delayed_work(dwork, fn) \ do { \ kthread_init_work(&(dwork)->work, (fn)); \ timer_setup(&(dwork)->timer, \ kthread_delayed_work_timer_fn, \ TIMER_IRQSAFE); \ } while (0) int kthread_worker_fn(void *worker_ptr); __printf(3, 4) struct kthread_worker *kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], ...); #define kthread_create_worker(flags, namefmt, ...) \ kthread_create_worker_on_node(flags, NUMA_NO_NODE, namefmt, ## __VA_ARGS__); /** * kthread_run_worker - create and wake a kthread worker. * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the thread. * * Description: Convenient wrapper for kthread_create_worker() followed by * wake_up_process(). Returns the kthread_worker or ERR_PTR(-ENOMEM). */ #define kthread_run_worker(flags, namefmt, ...) \ ({ \ struct kthread_worker *__kw \ = kthread_create_worker(flags, namefmt, ## __VA_ARGS__); \ if (!IS_ERR(__kw)) \ wake_up_process(__kw->task); \ __kw; \ }) struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[]); /** * kthread_run_worker_on_cpu - create and wake a cpu bound kthread worker. * @cpu: CPU number * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the thread. Format is restricted * to "name.*%u". Code fills in cpu number. * * Description: Convenient wrapper for kthread_create_worker_on_cpu() * followed by wake_up_process(). Returns the kthread_worker or * ERR_PTR(-ENOMEM). 
*/ static inline struct kthread_worker * kthread_run_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[]) { struct kthread_worker *kw; kw = kthread_create_worker_on_cpu(cpu, flags, namefmt); if (!IS_ERR(kw)) wake_up_process(kw->task); return kw; } bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work); bool kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay); bool kthread_mod_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay); void kthread_flush_work(struct kthread_work *work); void kthread_flush_worker(struct kthread_worker *worker); bool kthread_cancel_work_sync(struct kthread_work *work); bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work); void kthread_destroy_worker(struct kthread_worker *worker); void kthread_use_mm(struct mm_struct *mm); void kthread_unuse_mm(struct mm_struct *mm); struct cgroup_subsys_state; #ifdef CONFIG_BLK_CGROUP void kthread_associate_blkcg(struct cgroup_subsys_state *css); struct cgroup_subsys_state *kthread_blkcg(void); #else static inline void kthread_associate_blkcg(struct cgroup_subsys_state *css) { } #endif #endif /* _LINUX_KTHREAD_H */ |
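/*
 * Illustrative aside (not part of the header above): a minimal sketch of the
 * kthread_worker interface declared above. It creates a worker thread,
 * queues a single kthread_work item on it, waits for that work to finish and
 * then destroys the worker. The work function and module wrappers are
 * hypothetical stand-ins.
 */
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/printk.h>

static struct kthread_worker *demo_worker;
static struct kthread_work demo_work;

static void demo_work_fn(struct kthread_work *work)
{
        pr_info("demo: running in worker context\n");
}

static int __init demo_kthread_init(void)
{
        demo_worker = kthread_run_worker(0, "demo_worker");
        if (IS_ERR(demo_worker))
                return PTR_ERR(demo_worker);

        kthread_init_work(&demo_work, demo_work_fn);
        kthread_queue_work(demo_worker, &demo_work);    /* run it once */
        kthread_flush_work(&demo_work);                 /* wait until done */
        return 0;
}

static void __exit demo_kthread_exit(void)
{
        kthread_destroy_worker(demo_worker);
}

module_init(demo_kthread_init);
module_exit(demo_kthread_exit);
MODULE_LICENSE("GPL");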
// SPDX-License-Identifier: GPL-2.0+
/*
 * NILFS segment usage file.
 *
 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
 *
 * Written by Koji Sato.
 * Revised by Ryusuke Konishi.
*/ #include <linux/kernel.h> #include <linux/fs.h> #include <linux/string.h> #include <linux/buffer_head.h> #include <linux/errno.h> #include "mdt.h" #include "sufile.h" #include <trace/events/nilfs2.h> /** * struct nilfs_sufile_info - on-memory private data of sufile * @mi: on-memory private data of metadata file * @ncleansegs: number of clean segments * @allocmin: lower limit of allocatable segment range * @allocmax: upper limit of allocatable segment range */ struct nilfs_sufile_info { struct nilfs_mdt_info mi; unsigned long ncleansegs;/* number of clean segments */ __u64 allocmin; /* lower limit of allocatable segment range */ __u64 allocmax; /* upper limit of allocatable segment range */ }; static inline struct nilfs_sufile_info *NILFS_SUI(struct inode *sufile) { return (struct nilfs_sufile_info *)NILFS_MDT(sufile); } static inline unsigned long nilfs_sufile_segment_usages_per_block(const struct inode *sufile) { return NILFS_MDT(sufile)->mi_entries_per_block; } static unsigned long nilfs_sufile_get_blkoff(const struct inode *sufile, __u64 segnum) { __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset; t = div64_ul(t, nilfs_sufile_segment_usages_per_block(sufile)); return (unsigned long)t; } static unsigned long nilfs_sufile_get_offset(const struct inode *sufile, __u64 segnum) { __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset; return do_div(t, nilfs_sufile_segment_usages_per_block(sufile)); } static unsigned long nilfs_sufile_segment_usages_in_block(const struct inode *sufile, __u64 curr, __u64 max) { return min_t(unsigned long, nilfs_sufile_segment_usages_per_block(sufile) - nilfs_sufile_get_offset(sufile, curr), max - curr + 1); } /** * nilfs_sufile_segment_usage_offset - calculate the byte offset of a segment * usage entry in the folio containing it * @sufile: segment usage file inode * @segnum: number of segment usage * @bh: buffer head of block containing segment usage indexed by @segnum * * Return: Byte offset in the folio of the segment usage entry. */ static size_t nilfs_sufile_segment_usage_offset(const struct inode *sufile, __u64 segnum, struct buffer_head *bh) { return offset_in_folio(bh->b_folio, bh->b_data) + nilfs_sufile_get_offset(sufile, segnum) * NILFS_MDT(sufile)->mi_entry_size; } static int nilfs_sufile_get_header_block(struct inode *sufile, struct buffer_head **bhp) { int err = nilfs_mdt_get_block(sufile, 0, 0, NULL, bhp); if (unlikely(err == -ENOENT)) { nilfs_error(sufile->i_sb, "missing header block in segment usage metadata"); err = -EIO; } return err; } static inline int nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum, int create, struct buffer_head **bhp) { return nilfs_mdt_get_block(sufile, nilfs_sufile_get_blkoff(sufile, segnum), create, NULL, bhp); } static int nilfs_sufile_delete_segment_usage_block(struct inode *sufile, __u64 segnum) { return nilfs_mdt_delete_block(sufile, nilfs_sufile_get_blkoff(sufile, segnum)); } static void nilfs_sufile_mod_counter(struct buffer_head *header_bh, u64 ncleanadd, u64 ndirtyadd) { struct nilfs_sufile_header *header; header = kmap_local_folio(header_bh->b_folio, 0); le64_add_cpu(&header->sh_ncleansegs, ncleanadd); le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd); kunmap_local(header); mark_buffer_dirty(header_bh); } /** * nilfs_sufile_get_ncleansegs - return the number of clean segments * @sufile: inode of segment usage file * * Return: Number of clean segments. 
*/ unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile) { return NILFS_SUI(sufile)->ncleansegs; } /** * nilfs_sufile_updatev - modify multiple segment usages at a time * @sufile: inode of segment usage file * @segnumv: array of segment numbers * @nsegs: size of @segnumv array * @create: creation flag * @ndone: place to store number of modified segments on @segnumv * @dofunc: primitive operation for the update * * Description: nilfs_sufile_updatev() repeatedly calls @dofunc * against the given array of segments. The @dofunc is called with * buffers of a header block and the sufile block in which the target * segment usage entry is contained. If @ndone is given, the number * of successfully modified segments from the head is stored in the * place @ndone points to. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - Invalid segment usage number * * %-EIO - I/O error (including metadata corruption). * * %-ENOENT - Given segment usage is in hole block (may be returned if * @create is zero) * * %-ENOMEM - Insufficient memory available. */ int nilfs_sufile_updatev(struct inode *sufile, __u64 *segnumv, size_t nsegs, int create, size_t *ndone, void (*dofunc)(struct inode *, __u64, struct buffer_head *, struct buffer_head *)) { struct buffer_head *header_bh, *bh; unsigned long blkoff, prev_blkoff; __u64 *seg; size_t nerr = 0, n = 0; int ret = 0; if (unlikely(nsegs == 0)) goto out; down_write(&NILFS_MDT(sufile)->mi_sem); for (seg = segnumv; seg < segnumv + nsegs; seg++) { if (unlikely(*seg >= nilfs_sufile_get_nsegments(sufile))) { nilfs_warn(sufile->i_sb, "%s: invalid segment number: %llu", __func__, (unsigned long long)*seg); nerr++; } } if (nerr > 0) { ret = -EINVAL; goto out_sem; } ret = nilfs_sufile_get_header_block(sufile, &header_bh); if (ret < 0) goto out_sem; seg = segnumv; blkoff = nilfs_sufile_get_blkoff(sufile, *seg); ret = nilfs_mdt_get_block(sufile, blkoff, create, NULL, &bh); if (ret < 0) goto out_header; for (;;) { dofunc(sufile, *seg, header_bh, bh); if (++seg >= segnumv + nsegs) break; prev_blkoff = blkoff; blkoff = nilfs_sufile_get_blkoff(sufile, *seg); if (blkoff == prev_blkoff) continue; /* get different block */ brelse(bh); ret = nilfs_mdt_get_block(sufile, blkoff, create, NULL, &bh); if (unlikely(ret < 0)) goto out_header; } brelse(bh); out_header: n = seg - segnumv; brelse(header_bh); out_sem: up_write(&NILFS_MDT(sufile)->mi_sem); out: if (ndone) *ndone = n; return ret; } int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create, void (*dofunc)(struct inode *, __u64, struct buffer_head *, struct buffer_head *)) { struct buffer_head *header_bh, *bh; int ret; if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) { nilfs_warn(sufile->i_sb, "%s: invalid segment number: %llu", __func__, (unsigned long long)segnum); return -EINVAL; } down_write(&NILFS_MDT(sufile)->mi_sem); ret = nilfs_sufile_get_header_block(sufile, &header_bh); if (ret < 0) goto out_sem; ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, create, &bh); if (!ret) { dofunc(sufile, segnum, header_bh, bh); brelse(bh); } brelse(header_bh); out_sem: up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } /** * nilfs_sufile_set_alloc_range - limit range of segment to be allocated * @sufile: inode of segment usage file * @start: minimum segment number of allocatable region (inclusive) * @end: maximum segment number of allocatable region (inclusive) * * Return: 0 on success, or %-ERANGE if segment range is invalid. 
*/ int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end) { struct nilfs_sufile_info *sui = NILFS_SUI(sufile); __u64 nsegs; int ret = -ERANGE; down_write(&NILFS_MDT(sufile)->mi_sem); nsegs = nilfs_sufile_get_nsegments(sufile); if (start <= end && end < nsegs) { sui->allocmin = start; sui->allocmax = end; ret = 0; } up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } /** * nilfs_sufile_alloc - allocate a segment * @sufile: inode of segment usage file * @segnump: pointer to segment number * * Description: nilfs_sufile_alloc() allocates a clean segment, and stores * its segment number in the place pointed to by @segnump. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. * * %-ENOSPC - No clean segment left. */ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) { struct buffer_head *header_bh, *su_bh; struct nilfs_sufile_header *header; struct nilfs_segment_usage *su; struct nilfs_sufile_info *sui = NILFS_SUI(sufile); size_t susz = NILFS_MDT(sufile)->mi_entry_size; __u64 segnum, maxsegnum, last_alloc; size_t offset; void *kaddr; unsigned long nsegments, nsus, cnt; int ret, j; down_write(&NILFS_MDT(sufile)->mi_sem); ret = nilfs_sufile_get_header_block(sufile, &header_bh); if (ret < 0) goto out_sem; header = kmap_local_folio(header_bh->b_folio, 0); last_alloc = le64_to_cpu(header->sh_last_alloc); kunmap_local(header); nsegments = nilfs_sufile_get_nsegments(sufile); maxsegnum = sui->allocmax; segnum = last_alloc + 1; if (segnum < sui->allocmin || segnum > sui->allocmax) segnum = sui->allocmin; for (cnt = 0; cnt < nsegments; cnt += nsus) { if (segnum > maxsegnum) { if (cnt < sui->allocmax - sui->allocmin + 1) { /* * wrap around in the limited region. * if allocation started from * sui->allocmin, this never happens. 
*/ segnum = sui->allocmin; maxsegnum = last_alloc; } else if (segnum > sui->allocmin && sui->allocmax + 1 < nsegments) { segnum = sui->allocmax + 1; maxsegnum = nsegments - 1; } else if (sui->allocmin > 0) { segnum = 0; maxsegnum = sui->allocmin - 1; } else { break; /* never happens */ } } trace_nilfs2_segment_usage_check(sufile, segnum, cnt); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, &su_bh); if (ret < 0) goto out_header; offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); su = kaddr = kmap_local_folio(su_bh->b_folio, offset); nsus = nilfs_sufile_segment_usages_in_block( sufile, segnum, maxsegnum); for (j = 0; j < nsus; j++, su = (void *)su + susz, segnum++) { if (!nilfs_segment_usage_clean(su)) continue; /* found a clean segment */ nilfs_segment_usage_set_dirty(su); kunmap_local(kaddr); header = kmap_local_folio(header_bh->b_folio, 0); le64_add_cpu(&header->sh_ncleansegs, -1); le64_add_cpu(&header->sh_ndirtysegs, 1); header->sh_last_alloc = cpu_to_le64(segnum); kunmap_local(header); sui->ncleansegs--; mark_buffer_dirty(header_bh); mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); brelse(su_bh); *segnump = segnum; trace_nilfs2_segment_usage_allocated(sufile, segnum); goto out_header; } kunmap_local(kaddr); brelse(su_bh); } /* no segments left */ ret = -ENOSPC; out_header: brelse(header_bh); out_sem: up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum, struct buffer_head *header_bh, struct buffer_head *su_bh) { struct nilfs_segment_usage *su; size_t offset; offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); su = kmap_local_folio(su_bh->b_folio, offset); if (unlikely(!nilfs_segment_usage_clean(su))) { nilfs_warn(sufile->i_sb, "%s: segment %llu must be clean", __func__, (unsigned long long)segnum); kunmap_local(su); return; } nilfs_segment_usage_set_dirty(su); kunmap_local(su); nilfs_sufile_mod_counter(header_bh, -1, 1); NILFS_SUI(sufile)->ncleansegs--; mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); } void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum, struct buffer_head *header_bh, struct buffer_head *su_bh) { struct nilfs_segment_usage *su; size_t offset; int clean, dirty; offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); su = kmap_local_folio(su_bh->b_folio, offset); if (su->su_flags == cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY)) && su->su_nblocks == cpu_to_le32(0)) { kunmap_local(su); return; } clean = nilfs_segment_usage_clean(su); dirty = nilfs_segment_usage_dirty(su); /* make the segment garbage */ su->su_lastmod = cpu_to_le64(0); su->su_nblocks = cpu_to_le32(0); su->su_flags = cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY)); kunmap_local(su); nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 
0 : 1); NILFS_SUI(sufile)->ncleansegs -= clean; mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); } void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, struct buffer_head *header_bh, struct buffer_head *su_bh) { struct nilfs_segment_usage *su; size_t offset; int sudirty; offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); su = kmap_local_folio(su_bh->b_folio, offset); if (nilfs_segment_usage_clean(su)) { nilfs_warn(sufile->i_sb, "%s: segment %llu is already clean", __func__, (unsigned long long)segnum); kunmap_local(su); return; } if (unlikely(nilfs_segment_usage_error(su))) nilfs_warn(sufile->i_sb, "free segment %llu marked in error", (unsigned long long)segnum); sudirty = nilfs_segment_usage_dirty(su); if (unlikely(!sudirty)) nilfs_warn(sufile->i_sb, "free unallocated segment %llu", (unsigned long long)segnum); nilfs_segment_usage_set_clean(su); kunmap_local(su); mark_buffer_dirty(su_bh); nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0); NILFS_SUI(sufile)->ncleansegs++; nilfs_mdt_mark_dirty(sufile); trace_nilfs2_segment_usage_freed(sufile, segnum); } /** * nilfs_sufile_mark_dirty - mark the buffer having a segment usage dirty * @sufile: inode of segment usage file * @segnum: segment number * * Return: 0 on success, or a negative error code on failure. */ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) { struct buffer_head *bh; size_t offset; struct nilfs_segment_usage *su; int ret; down_write(&NILFS_MDT(sufile)->mi_sem); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh); if (unlikely(ret)) { if (ret == -ENOENT) { nilfs_error(sufile->i_sb, "segment usage for segment %llu is unreadable due to a hole block", (unsigned long long)segnum); ret = -EIO; } goto out_sem; } offset = nilfs_sufile_segment_usage_offset(sufile, segnum, bh); su = kmap_local_folio(bh->b_folio, offset); if (unlikely(nilfs_segment_usage_error(su))) { struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; kunmap_local(su); brelse(bh); if (nilfs_segment_is_active(nilfs, segnum)) { nilfs_error(sufile->i_sb, "active segment %llu is erroneous", (unsigned long long)segnum); } else { /* * Segments marked erroneous are never allocated by * nilfs_sufile_alloc(); only active segments, ie, * the segments indexed by ns_segnum or ns_nextnum, * can be erroneous here. */ WARN_ON_ONCE(1); } ret = -EIO; } else { nilfs_segment_usage_set_dirty(su); kunmap_local(su); mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(sufile); brelse(bh); } out_sem: up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } /** * nilfs_sufile_set_segment_usage - set usage of a segment * @sufile: inode of segment usage file * @segnum: segment number * @nblocks: number of live blocks in the segment * @modtime: modification time (option) * * Return: 0 on success, or a negative error code on failure. */ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, unsigned long nblocks, time64_t modtime) { struct buffer_head *bh; struct nilfs_segment_usage *su; size_t offset; int ret; down_write(&NILFS_MDT(sufile)->mi_sem); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh); if (ret < 0) goto out_sem; offset = nilfs_sufile_segment_usage_offset(sufile, segnum, bh); su = kmap_local_folio(bh->b_folio, offset); if (modtime) { /* * Check segusage error and set su_lastmod only when updating * this entry with a valid timestamp, not for cancellation. 
*/ WARN_ON_ONCE(nilfs_segment_usage_error(su)); su->su_lastmod = cpu_to_le64(modtime); } su->su_nblocks = cpu_to_le32(nblocks); kunmap_local(su); mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(sufile); brelse(bh); out_sem: up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } /** * nilfs_sufile_get_stat - get segment usage statistics * @sufile: inode of segment usage file * @sustat: pointer to a structure of segment usage statistics * * Description: nilfs_sufile_get_stat() retrieves segment usage statistics * and stores them in the location pointed to by @sustat. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. */ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) { struct buffer_head *header_bh; struct nilfs_sufile_header *header; struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; int ret; down_read(&NILFS_MDT(sufile)->mi_sem); ret = nilfs_sufile_get_header_block(sufile, &header_bh); if (ret < 0) goto out_sem; header = kmap_local_folio(header_bh->b_folio, 0); sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile); sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs); sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs); sustat->ss_ctime = nilfs->ns_ctime; sustat->ss_nongc_ctime = nilfs->ns_nongc_ctime; spin_lock(&nilfs->ns_last_segment_lock); sustat->ss_prot_seq = nilfs->ns_prot_seq; spin_unlock(&nilfs->ns_last_segment_lock); kunmap_local(header); brelse(header_bh); out_sem: up_read(&NILFS_MDT(sufile)->mi_sem); return ret; } void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, struct buffer_head *header_bh, struct buffer_head *su_bh) { struct nilfs_segment_usage *su; size_t offset; int suclean; offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); su = kmap_local_folio(su_bh->b_folio, offset); if (nilfs_segment_usage_error(su)) { kunmap_local(su); return; } suclean = nilfs_segment_usage_clean(su); nilfs_segment_usage_set_error(su); kunmap_local(su); if (suclean) { nilfs_sufile_mod_counter(header_bh, -1, 0); NILFS_SUI(sufile)->ncleansegs--; } mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); } /** * nilfs_sufile_truncate_range - truncate range of segment array * @sufile: inode of segment usage file * @start: start segment number (inclusive) * @end: end segment number (inclusive) * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EBUSY - Dirty or active segments are present in the range. * * %-EINVAL - Invalid number of segments specified. * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. 
*/ static int nilfs_sufile_truncate_range(struct inode *sufile, __u64 start, __u64 end) { struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; struct buffer_head *header_bh; struct buffer_head *su_bh; struct nilfs_segment_usage *su, *su2; size_t susz = NILFS_MDT(sufile)->mi_entry_size; unsigned long segusages_per_block; unsigned long nsegs, ncleaned; __u64 segnum; size_t offset; ssize_t n, nc; int ret; int j; nsegs = nilfs_sufile_get_nsegments(sufile); ret = -EINVAL; if (start > end || start >= nsegs) goto out; ret = nilfs_sufile_get_header_block(sufile, &header_bh); if (ret < 0) goto out; segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile); ncleaned = 0; for (segnum = start; segnum <= end; segnum += n) { n = min_t(unsigned long, segusages_per_block - nilfs_sufile_get_offset(sufile, segnum), end - segnum + 1); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &su_bh); if (ret < 0) { if (ret != -ENOENT) goto out_header; /* hole */ continue; } offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); su = kmap_local_folio(su_bh->b_folio, offset); su2 = su; for (j = 0; j < n; j++, su = (void *)su + susz) { if ((le32_to_cpu(su->su_flags) & ~BIT(NILFS_SEGMENT_USAGE_ERROR)) || nilfs_segment_is_active(nilfs, segnum + j)) { ret = -EBUSY; kunmap_local(su2); brelse(su_bh); goto out_header; } } nc = 0; for (su = su2, j = 0; j < n; j++, su = (void *)su + susz) { if (nilfs_segment_usage_error(su)) { nilfs_segment_usage_set_clean(su); nc++; } } kunmap_local(su2); if (nc > 0) { mark_buffer_dirty(su_bh); ncleaned += nc; } brelse(su_bh); if (n == segusages_per_block) { /* make hole */ nilfs_sufile_delete_segment_usage_block(sufile, segnum); } } ret = 0; out_header: if (ncleaned > 0) { NILFS_SUI(sufile)->ncleansegs += ncleaned; nilfs_sufile_mod_counter(header_bh, ncleaned, 0); nilfs_mdt_mark_dirty(sufile); } brelse(header_bh); out: return ret; } /** * nilfs_sufile_resize - resize segment array * @sufile: inode of segment usage file * @newnsegs: new number of segments * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EBUSY - Dirty or active segments exist in the region to be truncated. * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. * * %-ENOSPC - Enough free space is not left for shrinking. */ int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs) { struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; struct buffer_head *header_bh; struct nilfs_sufile_header *header; struct nilfs_sufile_info *sui = NILFS_SUI(sufile); unsigned long nsegs, nrsvsegs; int ret = 0; down_write(&NILFS_MDT(sufile)->mi_sem); nsegs = nilfs_sufile_get_nsegments(sufile); if (nsegs == newnsegs) goto out; ret = -ENOSPC; nrsvsegs = nilfs_nrsvsegs(nilfs, newnsegs); if (newnsegs < nsegs && nsegs - newnsegs + nrsvsegs > sui->ncleansegs) goto out; ret = nilfs_sufile_get_header_block(sufile, &header_bh); if (ret < 0) goto out; if (newnsegs > nsegs) { sui->ncleansegs += newnsegs - nsegs; } else /* newnsegs < nsegs */ { ret = nilfs_sufile_truncate_range(sufile, newnsegs, nsegs - 1); if (ret < 0) goto out_header; sui->ncleansegs -= nsegs - newnsegs; /* * If the sufile is successfully truncated, immediately adjust * the segment allocation space while locking the semaphore * "mi_sem" so that nilfs_sufile_alloc() never allocates * segments in the truncated space. 
*/ sui->allocmax = newnsegs - 1; sui->allocmin = 0; } header = kmap_local_folio(header_bh->b_folio, 0); header->sh_ncleansegs = cpu_to_le64(sui->ncleansegs); kunmap_local(header); mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(sufile); nilfs_set_nsegments(nilfs, newnsegs); out_header: brelse(header_bh); out: up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } /** * nilfs_sufile_get_suinfo - get segment usage information * @sufile: inode of segment usage file * @segnum: segment number to start looking * @buf: array of suinfo * @sisz: byte size of suinfo * @nsi: size of suinfo array * * Return: Count of segment usage info items stored in the output buffer on * success, or one of the following negative error codes on failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. */ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, unsigned int sisz, size_t nsi) { struct buffer_head *su_bh; struct nilfs_segment_usage *su; struct nilfs_suinfo *si = buf; size_t susz = NILFS_MDT(sufile)->mi_entry_size; struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; size_t offset; void *kaddr; unsigned long nsegs, segusages_per_block; ssize_t n; int ret, i, j; down_read(&NILFS_MDT(sufile)->mi_sem); segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile); nsegs = min_t(unsigned long, nilfs_sufile_get_nsegments(sufile) - segnum, nsi); for (i = 0; i < nsegs; i += n, segnum += n) { n = min_t(unsigned long, segusages_per_block - nilfs_sufile_get_offset(sufile, segnum), nsegs - i); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &su_bh); if (ret < 0) { if (ret != -ENOENT) goto out; /* hole */ memset(si, 0, sisz * n); si = (void *)si + sisz * n; continue; } offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); su = kaddr = kmap_local_folio(su_bh->b_folio, offset); for (j = 0; j < n; j++, su = (void *)su + susz, si = (void *)si + sisz) { si->sui_lastmod = le64_to_cpu(su->su_lastmod); si->sui_nblocks = le32_to_cpu(su->su_nblocks); si->sui_flags = le32_to_cpu(su->su_flags) & ~BIT(NILFS_SEGMENT_USAGE_ACTIVE); if (nilfs_segment_is_active(nilfs, segnum + j)) si->sui_flags |= BIT(NILFS_SEGMENT_USAGE_ACTIVE); } kunmap_local(kaddr); brelse(su_bh); } ret = nsegs; out: up_read(&NILFS_MDT(sufile)->mi_sem); return ret; } /** * nilfs_sufile_set_suinfo - sets segment usage info * @sufile: inode of segment usage file * @buf: array of suinfo_update * @supsz: byte size of suinfo_update * @nsup: size of suinfo_update array * * Description: Takes an array of nilfs_suinfo_update structs and updates * segment usage accordingly. Only the fields indicated by the sup_flags * are updated. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - Invalid values in input (segment number, flags or nblocks). * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. 
 */
ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf,
                                unsigned int supsz, size_t nsup)
{
        struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
        struct buffer_head *header_bh, *bh;
        struct nilfs_suinfo_update *sup, *supend = buf + supsz * nsup;
        struct nilfs_segment_usage *su;
        size_t offset;
        unsigned long blkoff, prev_blkoff;
        int cleansi, cleansu, dirtysi, dirtysu;
        long ncleaned = 0, ndirtied = 0;
        int ret = 0;

        if (unlikely(nsup == 0))
                return ret;

        for (sup = buf; sup < supend; sup = (void *)sup + supsz) {
                if (sup->sup_segnum >= nilfs->ns_nsegments ||
                    (sup->sup_flags &
                     (~0UL << __NR_NILFS_SUINFO_UPDATE_FIELDS)) ||
                    (nilfs_suinfo_update_nblocks(sup) &&
                     sup->sup_sui.sui_nblocks > nilfs->ns_blocks_per_segment))
                        return -EINVAL;
        }

        down_write(&NILFS_MDT(sufile)->mi_sem);

        ret = nilfs_sufile_get_header_block(sufile, &header_bh);
        if (ret < 0)
                goto out_sem;

        sup = buf;
        blkoff = nilfs_sufile_get_blkoff(sufile, sup->sup_segnum);
        ret = nilfs_mdt_get_block(sufile, blkoff, 1, NULL, &bh);
        if (ret < 0)
                goto out_header;

        for (;;) {
                offset = nilfs_sufile_segment_usage_offset(
                        sufile, sup->sup_segnum, bh);
                su = kmap_local_folio(bh->b_folio, offset);

                if (nilfs_suinfo_update_lastmod(sup))
                        su->su_lastmod = cpu_to_le64(sup->sup_sui.sui_lastmod);

                if (nilfs_suinfo_update_nblocks(sup))
                        su->su_nblocks = cpu_to_le32(sup->sup_sui.sui_nblocks);

                if (nilfs_suinfo_update_flags(sup)) {
                        /*
                         * Active flag is a virtual flag projected by running
                         * nilfs kernel code - drop it not to write it to
                         * disk.
                         */
                        sup->sup_sui.sui_flags &=
                                        ~BIT(NILFS_SEGMENT_USAGE_ACTIVE);
                        cleansi = nilfs_suinfo_clean(&sup->sup_sui);
                        cleansu = nilfs_segment_usage_clean(su);
                        dirtysi = nilfs_suinfo_dirty(&sup->sup_sui);
                        dirtysu = nilfs_segment_usage_dirty(su);

                        if (cleansi && !cleansu)
                                ++ncleaned;
                        else if (!cleansi && cleansu)
                                --ncleaned;

                        if (dirtysi && !dirtysu)
                                ++ndirtied;
                        else if (!dirtysi && dirtysu)
                                --ndirtied;

                        su->su_flags = cpu_to_le32(sup->sup_sui.sui_flags);
                }

                kunmap_local(su);

                sup = (void *)sup + supsz;
                if (sup >= supend)
                        break;

                prev_blkoff = blkoff;
                blkoff = nilfs_sufile_get_blkoff(sufile, sup->sup_segnum);
                if (blkoff == prev_blkoff)
                        continue;

                /* get different block */
                mark_buffer_dirty(bh);
                put_bh(bh);
                ret = nilfs_mdt_get_block(sufile, blkoff, 1, NULL, &bh);
                if (unlikely(ret < 0))
                        goto out_mark;
        }
        mark_buffer_dirty(bh);
        put_bh(bh);

 out_mark:
        if (ncleaned || ndirtied) {
                nilfs_sufile_mod_counter(header_bh, (u64)ncleaned,
                                         (u64)ndirtied);
                NILFS_SUI(sufile)->ncleansegs += ncleaned;
        }
        nilfs_mdt_mark_dirty(sufile);
 out_header:
        put_bh(header_bh);
 out_sem:
        up_write(&NILFS_MDT(sufile)->mi_sem);
        return ret;
}

/**
 * nilfs_sufile_trim_fs() - trim ioctl handler function
 * @sufile: inode of segment usage file
 * @range: fstrim_range structure
 *
 * start: first byte to trim
 * len: number of bytes to trim from start
 * minlen: minimum extent length in bytes
 *
 * Description: nilfs_sufile_trim_fs goes through all segments containing bytes
 * from start to start+len. start is rounded up to the next block boundary
 * and start+len is rounded down. For each clean segment blkdev_issue_discard
 * function is invoked.
 *
 * Return: 0 on success, or a negative error code on failure.
*/ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) { struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; struct buffer_head *su_bh; struct nilfs_segment_usage *su; size_t offset; void *kaddr; size_t n, i, susz = NILFS_MDT(sufile)->mi_entry_size; sector_t seg_start, seg_end, start_block, end_block; sector_t start = 0, nblocks = 0; u64 segnum, segnum_end, minlen, len, max_blocks, ndiscarded = 0; int ret = 0; unsigned int sects_per_block; sects_per_block = (1 << nilfs->ns_blocksize_bits) / bdev_logical_block_size(nilfs->ns_bdev); len = range->len >> nilfs->ns_blocksize_bits; minlen = range->minlen >> nilfs->ns_blocksize_bits; max_blocks = ((u64)nilfs->ns_nsegments * nilfs->ns_blocks_per_segment); if (!len || range->start >= max_blocks << nilfs->ns_blocksize_bits) return -EINVAL; start_block = (range->start + nilfs->ns_blocksize - 1) >> nilfs->ns_blocksize_bits; /* * range->len can be very large (actually, it is set to * ULLONG_MAX by default) - truncate upper end of the range * carefully so as not to overflow. */ if (max_blocks - start_block < len) end_block = max_blocks - 1; else end_block = start_block + len - 1; if (end_block < nilfs->ns_first_data_block) goto out; segnum = nilfs_get_segnum_of_block(nilfs, start_block); segnum_end = nilfs_get_segnum_of_block(nilfs, end_block); down_read(&NILFS_MDT(sufile)->mi_sem); while (segnum <= segnum_end) { n = nilfs_sufile_segment_usages_in_block(sufile, segnum, segnum_end); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &su_bh); if (ret < 0) { if (ret != -ENOENT) goto out_sem; /* hole */ segnum += n; continue; } offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); su = kaddr = kmap_local_folio(su_bh->b_folio, offset); for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) { if (!nilfs_segment_usage_clean(su)) continue; nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); if (!nblocks) { /* start new extent */ start = seg_start; nblocks = seg_end - seg_start + 1; continue; } if (start + nblocks == seg_start) { /* add to previous extent */ nblocks += seg_end - seg_start + 1; continue; } /* discard previous extent */ if (start < start_block) { nblocks -= start_block - start; start = start_block; } if (nblocks >= minlen) { kunmap_local(kaddr); ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, GFP_NOFS); if (ret < 0) { put_bh(su_bh); goto out_sem; } ndiscarded += nblocks; offset = nilfs_sufile_segment_usage_offset( sufile, segnum, su_bh); su = kaddr = kmap_local_folio(su_bh->b_folio, offset); } /* start new extent */ start = seg_start; nblocks = seg_end - seg_start + 1; } kunmap_local(kaddr); put_bh(su_bh); } if (nblocks) { /* discard last extent */ if (start < start_block) { nblocks -= start_block - start; start = start_block; } if (start + nblocks > end_block + 1) nblocks = end_block - start + 1; if (nblocks >= minlen) { ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, GFP_NOFS); if (!ret) ndiscarded += nblocks; } } out_sem: up_read(&NILFS_MDT(sufile)->mi_sem); out: range->len = ndiscarded << nilfs->ns_blocksize_bits; return ret; } /** * nilfs_sufile_read - read or get sufile inode * @sb: super block instance * @susize: size of a segment usage entry * @raw_inode: on-disk sufile inode * @inodep: buffer to store the inode * * Return: 0 on success, or a negative error code on failure. 
 */
int nilfs_sufile_read(struct super_block *sb, size_t susize,
                      struct nilfs_inode *raw_inode, struct inode **inodep)
{
        struct inode *sufile;
        struct nilfs_sufile_info *sui;
        struct buffer_head *header_bh;
        struct nilfs_sufile_header *header;
        int err;

        if (susize > sb->s_blocksize) {
                nilfs_err(sb, "too large segment usage size: %zu bytes",
                          susize);
                return -EINVAL;
        } else if (susize < NILFS_MIN_SEGMENT_USAGE_SIZE) {
                nilfs_err(sb, "too small segment usage size: %zu bytes",
                          susize);
                return -EINVAL;
        }

        sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO);
        if (unlikely(!sufile))
                return -ENOMEM;
        if (!(inode_state_read_once(sufile) & I_NEW))
                goto out;

        err = nilfs_mdt_init(sufile, NILFS_MDT_GFP, sizeof(*sui));
        if (err)
                goto failed;

        nilfs_mdt_set_entry_size(sufile, susize,
                                 sizeof(struct nilfs_sufile_header));

        err = nilfs_read_inode_common(sufile, raw_inode);
        if (err)
                goto failed;

        err = nilfs_mdt_get_block(sufile, 0, 0, NULL, &header_bh);
        if (unlikely(err)) {
                if (err == -ENOENT) {
                        nilfs_err(sb,
                                  "missing header block in segment usage metadata");
                        err = -EINVAL;
                }
                goto failed;
        }

        sui = NILFS_SUI(sufile);
        header = kmap_local_folio(header_bh->b_folio, 0);
        sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
        kunmap_local(header);
        brelse(header_bh);

        sui->allocmax = nilfs_sufile_get_nsegments(sufile) - 1;
        sui->allocmin = 0;

        unlock_new_inode(sufile);
 out:
        *inodep = sufile;
        return 0;
 failed:
        iget_failed(sufile);
        return err;
}
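/*
 * Editor's illustrative sketch, not part of the sufile implementation above:
 * one way a hypothetical in-kernel caller might combine
 * nilfs_sufile_get_stat() and nilfs_sufile_get_suinfo() to count the clean
 * segments among the first @nsi segments.  The helper name, the small
 * on-stack batch buffer, and the pr_info() reporting are assumptions made
 * for this example only.
 */
static int __maybe_unused
nilfs_sufile_example_count_clean(struct inode *sufile, size_t nsi)
{
        struct nilfs_sustat sustat;
        struct nilfs_suinfo si[8];      /* small batch buffer, example only */
        __u64 segnum = 0;
        int nclean = 0;
        int ret;

        /* Filesystem-wide counters come from the sufile header block. */
        ret = nilfs_sufile_get_stat(sufile, &sustat);
        if (ret < 0)
                return ret;
        pr_info("nilfs2 example: %llu segments, %llu clean, %llu dirty\n",
                (unsigned long long)sustat.ss_nsegs,
                (unsigned long long)sustat.ss_ncleansegs,
                (unsigned long long)sustat.ss_ndirtysegs);

        /* Per-segment usage entries are read back in batches. */
        while (nsi > 0) {
                size_t count = min(nsi, ARRAY_SIZE(si));
                ssize_t n, j;

                n = nilfs_sufile_get_suinfo(sufile, segnum, si,
                                            sizeof(si[0]), count);
                if (n < 0)
                        return n;
                if (n == 0)
                        break;
                for (j = 0; j < n; j++)
                        if (nilfs_suinfo_clean(&si[j]))
                                nclean++;
                segnum += n;
                nsi -= n;
        }
        return nclean;
}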
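/*
 * Editor's illustrative sketch (an assumption, not part of the sufile API
 * proper): how a hypothetical caller such as an FITRIM ioctl handler might
 * fill in a struct fstrim_range and pass it to nilfs_sufile_trim_fs()
 * defined above.  Trimming the whole device with a 1 MiB minimum extent is
 * an arbitrary choice for the example.
 */
static int __maybe_unused nilfs_sufile_example_trim_all(struct inode *sufile)
{
        struct fstrim_range range = {
                .start = 0,             /* first byte to trim */
                .len = ULLONG_MAX,      /* trim up to the end of the device */
                .minlen = 1024 * 1024,  /* skip extents shorter than 1 MiB */
        };
        int ret;

        ret = nilfs_sufile_trim_fs(sufile, &range);
        if (ret < 0)
                return ret;

        /* On return, range.len holds the number of bytes actually discarded. */
        pr_info("nilfs2 example: discarded %llu bytes\n",
                (unsigned long long)range.len);
        return 0;
}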
// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/mm/memory.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit tricker. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
 * Found it. Everything seems to work now.
 * 20.12.91 - Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94 - Multi-page memory management added for v1.1.
* Idea by Alex Bligh (alex@cconcepts.co.uk) * * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG * (Gerhard.Wichert@pdb.siemens.de) * * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) */ #include <linux/kernel_stat.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/sched/mm.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> #include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/swap.h> #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/memremap.h> #include <linux/kmsan.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/export.h> #include <linux/delayacct.h> #include <linux/init.h> #include <linux/writeback.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> #include <linux/leafops.h> #include <linux/elf.h> #include <linux/gfp.h> #include <linux/migrate.h> #include <linux/string.h> #include <linux/shmem_fs.h> #include <linux/memory-tiers.h> #include <linux/debugfs.h> #include <linux/userfaultfd_k.h> #include <linux/dax.h> #include <linux/oom.h> #include <linux/numa.h> #include <linux/perf_event.h> #include <linux/ptrace.h> #include <linux/vmalloc.h> #include <linux/sched/sysctl.h> #include <linux/pgalloc.h> #include <linux/uaccess.h> #include <trace/events/kmem.h> #include <asm/io.h> #include <asm/mmu_context.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include "pgalloc-track.h" #include "internal.h" #include "swap.h" #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST) #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. #endif static vm_fault_t do_fault(struct vm_fault *vmf); static vm_fault_t do_anonymous_page(struct vm_fault *vmf); static bool vmf_pte_changed(struct vm_fault *vmf); /* * Return true if the original pte was a uffd-wp pte marker (so the pte was * wr-protected). */ static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf) { if (!userfaultfd_wp(vmf->vma)) return false; if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) return false; return pte_is_uffd_wp_marker(vmf->orig_pte); } /* * Randomize the address space (stacks, mmaps, brk, etc.). * * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, * as ancient (libc5 based) binaries can segfault. ) */ int randomize_va_space __read_mostly = #ifdef CONFIG_COMPAT_BRK 1; #else 2; #endif static const struct ctl_table mmu_sysctl_table[] = { { .procname = "randomize_va_space", .data = &randomize_va_space, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, }; static int __init init_mm_sysctl(void) { register_sysctl_init("kernel", mmu_sysctl_table); return 0; } subsys_initcall(init_mm_sysctl); #ifndef arch_wants_old_prefaulted_pte static inline bool arch_wants_old_prefaulted_pte(void) { /* * Transitioning a PTE from 'old' to 'young' can be expensive on * some architectures, even if it's performed in hardware. By * default, "false" means prefaulted entries will be 'young'. */ return false; } #endif static int __init disable_randmaps(char *s) { randomize_va_space = 0; return 1; } __setup("norandmaps", disable_randmaps); unsigned long highest_memmap_pfn __read_mostly; void mm_trace_rss_stat(struct mm_struct *mm, int member) { trace_rss_stat(mm, member); } /* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. 
*/ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, unsigned long addr) { pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free_tlb(tlb, token, addr); mm_dec_nr_ptes(tlb->mm); } static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pmd_t *pmd; unsigned long next; unsigned long start; start = addr; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; free_pte_range(tlb, pmd, addr); } while (pmd++, addr = next, addr != end); start &= PUD_MASK; if (start < floor) return; if (ceiling) { ceiling &= PUD_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) return; pmd = pmd_offset(pud, start); pud_clear(pud); pmd_free_tlb(tlb, pmd, start); mm_dec_nr_pmds(tlb->mm); } static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pud_t *pud; unsigned long next; unsigned long start; start = addr; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; free_pmd_range(tlb, pud, addr, next, floor, ceiling); } while (pud++, addr = next, addr != end); start &= P4D_MASK; if (start < floor) return; if (ceiling) { ceiling &= P4D_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) return; pud = pud_offset(p4d, start); p4d_clear(p4d); pud_free_tlb(tlb, pud, start); mm_dec_nr_puds(tlb->mm); } static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { p4d_t *p4d; unsigned long next; unsigned long start; start = addr; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; free_pud_range(tlb, p4d, addr, next, floor, ceiling); } while (p4d++, addr = next, addr != end); start &= PGDIR_MASK; if (start < floor) return; if (ceiling) { ceiling &= PGDIR_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) return; p4d = p4d_offset(pgd, start); pgd_clear(pgd); p4d_free_tlb(tlb, p4d, start); } /** * free_pgd_range - Unmap and free page tables in the range * @tlb: the mmu_gather containing pending TLB flush info * @addr: virtual address start * @end: virtual address end * @floor: lowest address boundary * @ceiling: highest address boundary * * This function tears down all user-level page tables in the * specified virtual address range [@addr..@end). It is part of * the memory unmap flow. */ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pgd_t *pgd; unsigned long next; /* * The next few lines have given us lots of grief... * * Why are we testing PMD* at this top level? Because often * there will be no work to do at all, and we'd prefer not to * go all the way down to the bottom just to discover that. * * Why all these "- 1"s? Because 0 represents both the bottom * of the address space and the top of it (using -1 for the * top wouldn't help much: the masks would do the wrong thing). * The rule is that addr 0 and floor 0 refer to the bottom of * the address space, but end 0 and ceiling 0 refer to the top * Comparisons need to use "end - 1" and "ceiling - 1" (though * that end 0 case should be mythical). * * Wherever addr is brought up or ceiling brought down, we must * be careful to reject "the opposite 0" before it confuses the * subsequent tests. 
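 *
 * (A worked example of the "- 1" comparisons, purely illustrative:
 *  with ceiling == 0 meaning "top of the address space",
 *
 *	end - 1 > ceiling - 1   becomes   end - 1 > ULONG_MAX,
 *
 *  which is never true, so end is correctly left alone; a naive
 *  "end > ceiling" would be true for any non-zero end and would
 *  wrongly pull end down by PMD_SIZE.)
 *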
But what about where end is brought down * by PMD_SIZE below? no, end can't go down to 0 there. * * Whereas we round start (addr) and ceiling down, by different * masks at different levels, in order to test whether a table * now has no other vmas using it, so can be freed, we don't * bother to round floor or end up - the tests don't need that. */ addr &= PMD_MASK; if (addr < floor) { addr += PMD_SIZE; if (!addr) return; } if (ceiling) { ceiling &= PMD_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) end -= PMD_SIZE; if (addr > end - 1) return; /* * We add page table cache pages with PAGE_SIZE, * (see pte_free_tlb()), flush the tlb if we need */ tlb_change_page_size(tlb, PAGE_SIZE); pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; free_p4d_range(tlb, pgd, addr, next, floor, ceiling); } while (pgd++, addr = next, addr != end); } /** * free_pgtables() - Free a range of page tables * @tlb: The mmu gather * @unmap: The unmap_desc * * Note: pg_start and pg_end are provided to indicate the absolute range of the * page tables that should be removed. This can differ from the vma mappings on * some archs that may have mappings that need to be removed outside the vmas. * Note that the prev->vm_end and next->vm_start are often used. * * The vma_end differs from the pg_end when a dup_mmap() failed and the tree has * unrelated data to the mm_struct being torn down. */ void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *unmap) { struct unlink_vma_file_batch vb; struct ma_state *mas = unmap->mas; struct vm_area_struct *vma = unmap->first; /* * Note: USER_PGTABLES_CEILING may be passed as the value of pg_end and * may be 0. Underflow is expected in this case. Otherwise the * pagetable end is exclusive. vma_end is exclusive. The last vma * address should never be larger than the pagetable end. */ WARN_ON_ONCE(unmap->vma_end - 1 > unmap->pg_end - 1); tlb_free_vmas(tlb); do { unsigned long addr = vma->vm_start; struct vm_area_struct *next; next = mas_find(mas, unmap->tree_end - 1); /* * Hide vma from rmap and truncate_pagecache before freeing * pgtables */ if (unmap->mm_wr_locked) vma_start_write(vma); unlink_anon_vmas(vma); unlink_file_vma_batch_init(&vb); unlink_file_vma_batch_add(&vb, vma); /* * Optimization: gather nearby vmas into one call down */ while (next && next->vm_start <= vma->vm_end + PMD_SIZE) { vma = next; next = mas_find(mas, unmap->tree_end - 1); if (unmap->mm_wr_locked) vma_start_write(vma); unlink_anon_vmas(vma); unlink_file_vma_batch_add(&vb, vma); } unlink_file_vma_batch_final(&vb); free_pgd_range(tlb, addr, vma->vm_end, unmap->pg_start, next ? next->vm_start : unmap->pg_end); vma = next; } while (vma); } void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte) { spinlock_t *ptl = pmd_lock(mm, pmd); if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ mm_inc_nr_ptes(mm); /* * Ensure all pte setup (eg. pte page lock and page clearing) are * visible before the pte is made visible to other CPUs by being * put into page tables. * * The other side of the story is the pointer chasing in the page * table walking code (when walking the page table without locking; * ie. most of the time). Fortunately, these data accesses consist * of a chain of data-dependent loads, meaning most CPUs (alpha * being the notable exception) will already guarantee loads are * seen in-order. See the alpha page table accessors for the * smp_rmb() barriers in page table walking code. 
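 *
 * An illustrative ordering sketch of the two sides (not code that is
 * built here):
 *
 *	CPU 0 (pmd_install)		CPU 1 (lockless walker)
 *	initialise new pte page
 *	smp_wmb()
 *	pmd_populate(mm, pmd, *pte)	pmd = pmdp_get(pmdp)
 *					pte = pte_offset_map(pmd, addr)
 *
 * The walker can only reach the new pte page through the pmd entry it
 * just read, so that address dependency (or smp_rmb() on alpha) pairs
 * with the write barrier on the install side.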
*/ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ pmd_populate(mm, pmd, *pte); *pte = NULL; } spin_unlock(ptl); } int __pte_alloc(struct mm_struct *mm, pmd_t *pmd) { pgtable_t new = pte_alloc_one(mm); if (!new) return -ENOMEM; pmd_install(mm, pmd, &new); if (new) pte_free(mm, new); return 0; } int __pte_alloc_kernel(pmd_t *pmd) { pte_t *new = pte_alloc_one_kernel(&init_mm); if (!new) return -ENOMEM; spin_lock(&init_mm.page_table_lock); if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ smp_wmb(); /* See comment in pmd_install() */ pmd_populate_kernel(&init_mm, pmd, new); new = NULL; } spin_unlock(&init_mm.page_table_lock); if (new) pte_free_kernel(&init_mm, new); return 0; } static inline void init_rss_vec(int *rss) { memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); } static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) { int i; for (i = 0; i < NR_MM_COUNTERS; i++) if (rss[i]) add_mm_counter(mm, i, rss[i]); } static bool is_bad_page_map_ratelimited(void) { static unsigned long resume; static unsigned long nr_shown; static unsigned long nr_unshown; /* * Allow a burst of 60 reports, then keep quiet for that minute; * or allow a steady drip of one report per second. */ if (nr_shown == 60) { if (time_before(jiffies, resume)) { nr_unshown++; return true; } if (nr_unshown) { pr_alert("BUG: Bad page map: %lu messages suppressed\n", nr_unshown); nr_unshown = 0; } nr_shown = 0; } if (nr_shown++ == 0) resume = jiffies + 60 * HZ; return false; } static void __print_bad_page_map_pgtable(struct mm_struct *mm, unsigned long addr) { unsigned long long pgdv, p4dv, pudv, pmdv; p4d_t p4d, *p4dp; pud_t pud, *pudp; pmd_t pmd, *pmdp; pgd_t *pgdp; /* * Although this looks like a fully lockless pgtable walk, it is not: * see locking requirements for print_bad_page_map(). */ pgdp = pgd_offset(mm, addr); pgdv = pgd_val(*pgdp); if (!pgd_present(*pgdp) || pgd_leaf(*pgdp)) { pr_alert("pgd:%08llx\n", pgdv); return; } p4dp = p4d_offset(pgdp, addr); p4d = p4dp_get(p4dp); p4dv = p4d_val(p4d); if (!p4d_present(p4d) || p4d_leaf(p4d)) { pr_alert("pgd:%08llx p4d:%08llx\n", pgdv, p4dv); return; } pudp = pud_offset(p4dp, addr); pud = pudp_get(pudp); pudv = pud_val(pud); if (!pud_present(pud) || pud_leaf(pud)) { pr_alert("pgd:%08llx p4d:%08llx pud:%08llx\n", pgdv, p4dv, pudv); return; } pmdp = pmd_offset(pudp, addr); pmd = pmdp_get(pmdp); pmdv = pmd_val(pmd); /* * Dumping the PTE would be nice, but it's tricky with CONFIG_HIGHPTE, * because the table should already be mapped by the caller and * doing another map would be bad. print_bad_page_map() should * already take care of printing the PTE. */ pr_alert("pgd:%08llx p4d:%08llx pud:%08llx pmd:%08llx\n", pgdv, p4dv, pudv, pmdv); } /* * This function is called to print an error when a bad page table entry (e.g., * corrupted page table entry) is found. For example, we might have a * PFN-mapped pte in a region that doesn't allow it. * * The calling function must still handle the error. * * This function must be called during a proper page table walk, as it will * re-walk the page table to dump information: the caller MUST prevent page * table teardown (by holding mmap, vma or rmap lock) and MUST hold the leaf * page table lock. */ static void print_bad_page_map(struct vm_area_struct *vma, unsigned long addr, unsigned long long entry, struct page *page, enum pgtable_level level) { struct address_space *mapping; pgoff_t index; if (is_bad_page_map_ratelimited()) return; mapping = vma->vm_file ? 
vma->vm_file->f_mapping : NULL; index = linear_page_index(vma, addr); pr_alert("BUG: Bad page map in process %s %s:%08llx", current->comm, pgtable_level_to_str(level), entry); __print_bad_page_map_pgtable(vma->vm_mm, addr); if (page) dump_page(page, "bad page map"); pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n", vma->vm_file, vma->vm_ops ? vma->vm_ops->fault : NULL, vma->vm_file ? vma->vm_file->f_op->mmap : NULL, vma->vm_file ? vma->vm_file->f_op->mmap_prepare : NULL, mapping ? mapping->a_ops->read_folio : NULL); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } #define print_bad_pte(vma, addr, pte, page) \ print_bad_page_map(vma, addr, pte_val(pte), page, PGTABLE_LEVEL_PTE) /** * __vm_normal_page() - Get the "struct page" associated with a page table entry. * @vma: The VMA mapping the page table entry. * @addr: The address where the page table entry is mapped. * @pfn: The PFN stored in the page table entry. * @special: Whether the page table entry is marked "special". * @level: The page table level for error reporting purposes only. * @entry: The page table entry value for error reporting purposes only. * * "Special" mappings do not wish to be associated with a "struct page" (either * it doesn't exist, or it exists but they don't want to touch it). In this * case, NULL is returned here. "Normal" mappings do have a struct page and * are ordinarily refcounted. * * Page mappings of the shared zero folios are always considered "special", as * they are not ordinarily refcounted: neither the refcount nor the mapcount * of these folios is adjusted when mapping them into user page tables. * Selected page table walkers (such as GUP) can still identify mappings of the * shared zero folios and work with the underlying "struct page". * * There are 2 broad cases. Firstly, an architecture may define a "special" * page table entry bit, such as pte_special(), in which case this function is * trivial. Secondly, an architecture may not have a spare page table * entry bit, which requires a more complicated scheme, described below. * * With CONFIG_FIND_NORMAL_PAGE, we might have the "special" bit set on * page table entries that actually map "normal" pages: however, that page * cannot be looked up through the PFN stored in the page table entry, but * instead will be looked up through vm_ops->find_normal_page(). So far, this * only applies to PTEs. * * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a * special mapping (even if there are underlying and valid "struct pages"). * COWed pages of a VM_PFNMAP are always normal. * * The way we recognize COWed pages within VM_PFNMAP mappings is through the * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit * set, and the vm_pgoff will point to the first PFN mapped: thus every special * mapping will always honor the rule * * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) * * And for normal mappings this is false. * * This restricts such mappings to be a linear translation from virtual address * to pfn. To get around this restriction, we allow arbitrary mappings so long * as the vma is not a COW mapping; in that case, we know that all ptes are * special (because none can have been COWed). * * * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP. 
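 *
 * (To recap the VM_PFNMAP rule above with a hypothetical helper, purely
 *  for illustration:
 *
 *	static bool pfnmap_pfn_is_linear(struct vm_area_struct *vma,
 *					 unsigned long addr, unsigned long pfn)
 *	{
 *		unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
 *
 *		return pfn == vma->vm_pgoff + off;
 *	}
 *
 *  A PFN that satisfies this in a VM_PFNMAP mapping is "special"; one that
 *  does not, in a COW mapping, must be a CoW'ed anonymous page and is
 *  therefore "normal".)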
* * VM_MIXEDMAP mappings can likewise contain memory with or without "struct * page" backing, however the difference is that _all_ pages with a struct * page (that is, those where pfn_valid is true, except the shared zero * folios) are refcounted and considered normal pages by the VM. * * The disadvantage is that pages are refcounted (which can be slower and * simply not an option for some PFNMAP users). The advantage is that we * don't have to follow the strict linearity rule of PFNMAP mappings in * order to support COWable mappings. * * Return: Returns the "struct page" if this is a "normal" mapping. Returns * NULL if this is a "special" mapping. */ static inline struct page *__vm_normal_page(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, bool special, unsigned long long entry, enum pgtable_level level) { if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) { if (unlikely(special)) { #ifdef CONFIG_FIND_NORMAL_PAGE if (vma->vm_ops && vma->vm_ops->find_normal_page) return vma->vm_ops->find_normal_page(vma, addr); #endif /* CONFIG_FIND_NORMAL_PAGE */ if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn)) return NULL; print_bad_page_map(vma, addr, entry, NULL, level); return NULL; } /* * With CONFIG_ARCH_HAS_PTE_SPECIAL, any special page table * mappings (incl. shared zero folios) are marked accordingly. */ } else { if (unlikely(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { /* If it has a "struct page", it's "normal". */ if (!pfn_valid(pfn)) return NULL; } else { unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; /* Only CoW'ed anon folios are "normal". */ if (pfn == vma->vm_pgoff + off) return NULL; if (!is_cow_mapping(vma->vm_flags)) return NULL; } } if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn)) return NULL; } if (unlikely(pfn > highest_memmap_pfn)) { /* Corrupted page table entry. */ print_bad_page_map(vma, addr, entry, NULL, level); return NULL; } /* * NOTE! We still have PageReserved() pages in the page tables. * For example, VDSO mappings can cause them to exist. */ VM_WARN_ON_ONCE(is_zero_pfn(pfn) || is_huge_zero_pfn(pfn)); return pfn_to_page(pfn); } /** * vm_normal_page() - Get the "struct page" associated with a PTE * @vma: The VMA mapping the @pte. * @addr: The address where the @pte is mapped. * @pte: The PTE. * * Get the "struct page" associated with a PTE. See __vm_normal_page() * for details on "normal" and "special" mappings. * * Return: Returns the "struct page" if this is a "normal" mapping. Returns * NULL if this is a "special" mapping. */ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { return __vm_normal_page(vma, addr, pte_pfn(pte), pte_special(pte), pte_val(pte), PGTABLE_LEVEL_PTE); } /** * vm_normal_folio() - Get the "struct folio" associated with a PTE * @vma: The VMA mapping the @pte. * @addr: The address where the @pte is mapped. * @pte: The PTE. * * Get the "struct folio" associated with a PTE. See __vm_normal_page() * for details on "normal" and "special" mappings. * * Return: Returns the "struct folio" if this is a "normal" mapping. Returns * NULL if this is a "special" mapping. */ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { struct page *page = vm_normal_page(vma, addr, pte); if (page) return page_folio(page); return NULL; } #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES /** * vm_normal_page_pmd() - Get the "struct page" associated with a PMD * @vma: The VMA mapping the @pmd. 
* @addr: The address where the @pmd is mapped. * @pmd: The PMD. * * Get the "struct page" associated with a PTE. See __vm_normal_page() * for details on "normal" and "special" mappings. * * Return: Returns the "struct page" if this is a "normal" mapping. Returns * NULL if this is a "special" mapping. */ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { return __vm_normal_page(vma, addr, pmd_pfn(pmd), pmd_special(pmd), pmd_val(pmd), PGTABLE_LEVEL_PMD); } /** * vm_normal_folio_pmd() - Get the "struct folio" associated with a PMD * @vma: The VMA mapping the @pmd. * @addr: The address where the @pmd is mapped. * @pmd: The PMD. * * Get the "struct folio" associated with a PTE. See __vm_normal_page() * for details on "normal" and "special" mappings. * * Return: Returns the "struct folio" if this is a "normal" mapping. Returns * NULL if this is a "special" mapping. */ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { struct page *page = vm_normal_page_pmd(vma, addr, pmd); if (page) return page_folio(page); return NULL; } /** * vm_normal_page_pud() - Get the "struct page" associated with a PUD * @vma: The VMA mapping the @pud. * @addr: The address where the @pud is mapped. * @pud: The PUD. * * Get the "struct page" associated with a PUD. See __vm_normal_page() * for details on "normal" and "special" mappings. * * Return: Returns the "struct page" if this is a "normal" mapping. Returns * NULL if this is a "special" mapping. */ struct page *vm_normal_page_pud(struct vm_area_struct *vma, unsigned long addr, pud_t pud) { return __vm_normal_page(vma, addr, pud_pfn(pud), pud_special(pud), pud_val(pud), PGTABLE_LEVEL_PUD); } #endif /** * restore_exclusive_pte - Restore a device-exclusive entry * @vma: VMA covering @address * @folio: the mapped folio * @page: the mapped folio page * @address: the virtual address * @ptep: pte pointer into the locked page table mapping the folio page * @orig_pte: pte value at @ptep * * Restore a device-exclusive non-swap entry to an ordinary present pte. * * The folio and the page table must be locked, and MMU notifiers must have * been called to invalidate any (exclusive) device mappings. * * Locking the folio makes sure that anybody who just converted the pte to * a device-exclusive entry can map it into the device to make forward * progress without others converting it back until the folio was unlocked. * * If the folio lock ever becomes an issue, we can stop relying on the folio * lock; it might make some scenarios with heavy thrashing less likely to * make forward progress, but these scenarios might not be valid use cases. * * Note that the folio lock does not protect against all cases of concurrent * page table modifications (e.g., MADV_DONTNEED, mprotect), so device drivers * must use MMU notifiers to sync against any concurrent changes. 
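 *
 * The expected call pattern (illustrative only) is the one used by
 * try_restore_exclusive_pte() below:
 *
 *	if (folio_trylock(folio)) {
 *		restore_exclusive_pte(vma, folio, page, addr, ptep, orig_pte);
 *		folio_unlock(folio);
 *	}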
*/ static void restore_exclusive_pte(struct vm_area_struct *vma, struct folio *folio, struct page *page, unsigned long address, pte_t *ptep, pte_t orig_pte) { pte_t pte; VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); if (pte_swp_soft_dirty(orig_pte)) pte = pte_mksoft_dirty(pte); if (pte_swp_uffd_wp(orig_pte)) pte = pte_mkuffd_wp(pte); if ((vma->vm_flags & VM_WRITE) && can_change_pte_writable(vma, address, pte)) { if (folio_test_dirty(folio)) pte = pte_mkdirty(pte); pte = pte_mkwrite(pte, vma); } set_pte_at(vma->vm_mm, address, ptep, pte); /* * No need to invalidate - it was non-present before. However * secondary CPUs may have mappings that need invalidating. */ update_mmu_cache(vma, address, ptep); } /* * Tries to restore an exclusive pte if the page lock can be acquired without * sleeping. */ static int try_restore_exclusive_pte(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t orig_pte) { const softleaf_t entry = softleaf_from_pte(orig_pte); struct page *page = softleaf_to_page(entry); struct folio *folio = page_folio(page); if (folio_trylock(folio)) { restore_exclusive_pte(vma, folio, page, addr, ptep, orig_pte); folio_unlock(folio); return 0; } return -EBUSY; } /* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range * covered by this vma. */ static unsigned long copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long addr, int *rss) { vm_flags_t vm_flags = dst_vma->vm_flags; pte_t orig_pte = ptep_get(src_pte); softleaf_t entry = softleaf_from_pte(orig_pte); pte_t pte = orig_pte; struct folio *folio; struct page *page; if (likely(softleaf_is_swap(entry))) { if (swap_dup_entry_direct(entry) < 0) return -EIO; /* make sure dst_mm is on swapoff's mmlist. */ if (unlikely(list_empty(&dst_mm->mmlist))) { spin_lock(&mmlist_lock); if (list_empty(&dst_mm->mmlist)) list_add(&dst_mm->mmlist, &src_mm->mmlist); spin_unlock(&mmlist_lock); } /* Mark the swap entry as shared. */ if (pte_swp_exclusive(orig_pte)) { pte = pte_swp_clear_exclusive(orig_pte); set_pte_at(src_mm, addr, src_pte, pte); } rss[MM_SWAPENTS]++; } else if (softleaf_is_migration(entry)) { folio = softleaf_to_folio(entry); rss[mm_counter(folio)]++; if (!softleaf_is_migration_read(entry) && is_cow_mapping(vm_flags)) { /* * COW mappings require pages in both parent and child * to be set to read. A previously exclusive entry is * now shared. */ entry = make_readable_migration_entry( swp_offset(entry)); pte = softleaf_to_pte(entry); if (pte_swp_soft_dirty(orig_pte)) pte = pte_swp_mksoft_dirty(pte); if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } } else if (softleaf_is_device_private(entry)) { page = softleaf_to_page(entry); folio = page_folio(page); /* * Update rss count even for unaddressable pages, as * they should treated just like normal pages in this * respect. * * We will likely want to have some new rss counters * for unaddressable pages, at some point. But for now * keep things as they are. */ folio_get(folio); rss[mm_counter(folio)]++; /* Cannot fail as these pages cannot get pinned. */ folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma); /* * We do not preserve soft-dirty information, because so * far, checkpoint/restore is the only feature that * requires that. 
And checkpoint/restore does not work * when a device driver is involved (you cannot easily * save and restore device driver state). */ if (softleaf_is_device_private_write(entry) && is_cow_mapping(vm_flags)) { entry = make_readable_device_private_entry( swp_offset(entry)); pte = swp_entry_to_pte(entry); if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } } else if (softleaf_is_device_exclusive(entry)) { /* * Make device exclusive entries present by restoring the * original entry then copying as for a present pte. Device * exclusive entries currently only support private writable * (ie. COW) mappings. */ VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags)); if (try_restore_exclusive_pte(src_vma, addr, src_pte, orig_pte)) return -EBUSY; return -ENOENT; } else if (softleaf_is_marker(entry)) { pte_marker marker = copy_pte_marker(entry, dst_vma); if (marker) set_pte_at(dst_mm, addr, dst_pte, make_pte_marker(marker)); return 0; } if (!userfaultfd_wp(dst_vma)) pte = pte_swp_clear_uffd_wp(pte); set_pte_at(dst_mm, addr, dst_pte, pte); return 0; } /* * Copy a present and normal page. * * NOTE! The usual case is that this isn't required; * instead, the caller can just increase the page refcount * and re-use the pte the traditional way. * * And if we need a pre-allocated page but don't yet have * one, return a negative error to let the preallocation * code know so that it can do so outside the page table * lock. */ static inline int copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, struct folio **prealloc, struct page *page) { struct folio *new_folio; pte_t pte; new_folio = *prealloc; if (!new_folio) return -EAGAIN; /* * We have a prealloc page, all good! Take it * over and copy the page & arm it. */ if (copy_mc_user_highpage(&new_folio->page, page, addr, src_vma)) return -EHWPOISON; *prealloc = NULL; __folio_mark_uptodate(new_folio); folio_add_new_anon_rmap(new_folio, dst_vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, dst_vma); rss[MM_ANONPAGES]++; /* All done, just insert the new page copy in the child */ pte = folio_mk_pte(new_folio, dst_vma->vm_page_prot); pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte))) /* Uffd-wp needs to be delivered to dest pte as well */ pte = pte_mkuffd_wp(pte); set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); return 0; } static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr, int nr) { struct mm_struct *src_mm = src_vma->vm_mm; /* If it's a COW mapping, write protect it both processes. */ if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) { wrprotect_ptes(src_mm, addr, src_pte, nr); pte = pte_wrprotect(pte); } /* If it's a shared mapping, mark it clean in the child. */ if (src_vma->vm_flags & VM_SHARED) pte = pte_mkclean(pte); pte = pte_mkold(pte); if (!userfaultfd_wp(dst_vma)) pte = pte_clear_uffd_wp(pte); set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr); } /* * Copy one present PTE, trying to batch-process subsequent PTEs that map * consecutive pages of the same folio by copying them as well. * * Returns -EAGAIN if one preallocated page is required to copy the next PTE. * Otherwise, returns the number of copied PTEs (at least 1). 
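 *
 * A caller is expected to handle -EAGAIN roughly the way copy_pte_range()
 * does (sketch only):
 *
 *	nr = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte,
 *			       addr, max_nr, rss, &prealloc);
 *	if (nr == -EAGAIN) {
 *		... drop both page table locks ...
 *		prealloc = folio_prealloc(src_mm, src_vma, addr, false);
 *		... retry the copy from the same address ...
 *	}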
*/ static inline int copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr, int max_nr, int *rss, struct folio **prealloc) { fpb_t flags = FPB_MERGE_WRITE; struct page *page; struct folio *folio; int err, nr; page = vm_normal_page(src_vma, addr, pte); if (unlikely(!page)) goto copy_pte; folio = page_folio(page); /* * If we likely have to copy, just don't bother with batching. Make * sure that the common "small folio" case is as fast as possible * by keeping the batching logic separate. */ if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) { if (!(src_vma->vm_flags & VM_SHARED)) flags |= FPB_RESPECT_DIRTY; if (vma_soft_dirty_enabled(src_vma)) flags |= FPB_RESPECT_SOFT_DIRTY; nr = folio_pte_batch_flags(folio, src_vma, src_pte, &pte, max_nr, flags); folio_ref_add(folio, nr); if (folio_test_anon(folio)) { if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, nr, dst_vma, src_vma))) { folio_ref_sub(folio, nr); return -EAGAIN; } rss[MM_ANONPAGES] += nr; VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); } else { folio_dup_file_rmap_ptes(folio, page, nr, dst_vma); rss[mm_counter_file(folio)] += nr; } __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, nr); return nr; } folio_get(folio); if (folio_test_anon(folio)) { /* * If this page may have been pinned by the parent process, * copy the page immediately for the child so that we'll always * guarantee the pinned page won't be randomly replaced in the * future. */ if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma))) { /* Page may be pinned, we have to copy. */ folio_put(folio); err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte, addr, rss, prealloc, page); return err ? err : 1; } rss[MM_ANONPAGES]++; VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); } else { folio_dup_file_rmap_pte(folio, page, dst_vma); rss[mm_counter_file(folio)]++; } copy_pte: __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, 1); return 1; } static inline struct folio *folio_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma, unsigned long addr, bool need_zero) { struct folio *new_folio; if (need_zero) new_folio = vma_alloc_zeroed_movable_folio(vma, addr); else new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr); if (!new_folio) return NULL; if (mem_cgroup_charge(new_folio, src_mm, GFP_KERNEL)) { folio_put(new_folio); return NULL; } folio_throttle_swaprate(new_folio, GFP_KERNEL); return new_folio; } static int copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, unsigned long end) { struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; pmd_t dummy_pmdval; pte_t ptent; spinlock_t *src_ptl, *dst_ptl; int progress, max_nr, ret = 0; int rss[NR_MM_COUNTERS]; softleaf_t entry = softleaf_mk_none(); struct folio *prealloc = NULL; int nr; again: progress = 0; init_rss_vec(rss); /* * copy_pmd_range()'s prior pmd_none_or_clear_bad(src_pmd), and the * error handling here, assume that exclusive mmap_lock on dst and src * protects anon from unexpected THP transitions; with shmem and file * protected by mmap_lock-less collapse skipping areas with anon_vma * (whereas vma_needs_copy() skips areas without anon_vma). A rework * can remove such assumptions later, but this is good enough for now. 
*/ dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) { ret = -ENOMEM; goto out; } /* * We already hold the exclusive mmap_lock, the copy_pte_range() and * retract_page_tables() are using vma->anon_vma to be exclusive, so * the PTE page is stable, and there is no need to get pmdval and do * pmd_same() check. */ src_pte = pte_offset_map_rw_nolock(src_mm, src_pmd, addr, &dummy_pmdval, &src_ptl); if (!src_pte) { pte_unmap_unlock(dst_pte, dst_ptl); /* ret == 0 */ goto out; } spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); orig_src_pte = src_pte; orig_dst_pte = dst_pte; lazy_mmu_mode_enable(); do { nr = 1; /* * We are holding two locks at this point - either of them * could generate latencies in another task on another CPU. */ if (progress >= 32) { progress = 0; if (need_resched() || spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) break; } ptent = ptep_get(src_pte); if (pte_none(ptent)) { progress++; continue; } if (unlikely(!pte_present(ptent))) { ret = copy_nonpresent_pte(dst_mm, src_mm, dst_pte, src_pte, dst_vma, src_vma, addr, rss); if (ret == -EIO) { entry = softleaf_from_pte(ptep_get(src_pte)); break; } else if (ret == -EBUSY) { break; } else if (!ret) { progress += 8; continue; } ptent = ptep_get(src_pte); VM_WARN_ON_ONCE(!pte_present(ptent)); /* * Device exclusive entry restored, continue by copying * the now present pte. */ WARN_ON_ONCE(ret != -ENOENT); } /* copy_present_ptes() will clear `*prealloc' if consumed */ max_nr = (end - addr) / PAGE_SIZE; ret = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, ptent, addr, max_nr, rss, &prealloc); /* * If we need a pre-allocated page for this pte, drop the * locks, allocate, and try again. * If copy failed due to hwpoison in source page, break out. */ if (unlikely(ret == -EAGAIN || ret == -EHWPOISON)) break; if (unlikely(prealloc)) { /* * pre-alloc page cannot be reused by next time so as * to strictly follow mempolicy (e.g., alloc_page_vma() * will allocate page according to address). This * could only happen if one pinned pte changed. */ folio_put(prealloc); prealloc = NULL; } nr = ret; progress += 8 * nr; } while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr, addr != end); lazy_mmu_mode_disable(); pte_unmap_unlock(orig_src_pte, src_ptl); add_mm_rss_vec(dst_mm, rss); pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); if (ret == -EIO) { VM_WARN_ON_ONCE(!entry.val); if (swap_retry_table_alloc(entry, GFP_KERNEL) < 0) { ret = -ENOMEM; goto out; } entry.val = 0; } else if (ret == -EBUSY || unlikely(ret == -EHWPOISON)) { goto out; } else if (ret == -EAGAIN) { prealloc = folio_prealloc(src_mm, src_vma, addr, false); if (!prealloc) return -ENOMEM; } else if (ret < 0) { VM_WARN_ON_ONCE(1); } /* We've captured and resolved the error. Reset, try again. 
*/ ret = 0; if (addr != end) goto again; out: if (unlikely(prealloc)) folio_put(prealloc); return ret; } static inline int copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pud_t *dst_pud, pud_t *src_pud, unsigned long addr, unsigned long end) { struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; pmd_t *src_pmd, *dst_pmd; unsigned long next; dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); if (!dst_pmd) return -ENOMEM; src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); if (pmd_is_huge(*src_pmd)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma); err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, dst_vma, src_vma); if (err == -ENOMEM) return -ENOMEM; if (!err) continue; /* fall through */ } if (pmd_none_or_clear_bad(src_pmd)) continue; if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd, addr, next)) return -ENOMEM; } while (dst_pmd++, src_pmd++, addr = next, addr != end); return 0; } static inline int copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr, unsigned long end) { struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; pud_t *src_pud, *dst_pud; unsigned long next; dst_pud = pud_alloc(dst_mm, dst_p4d, addr); if (!dst_pud) return -ENOMEM; src_pud = pud_offset(src_p4d, addr); do { next = pud_addr_end(addr, end); if (pud_trans_huge(*src_pud)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma); err = copy_huge_pud(dst_mm, src_mm, dst_pud, src_pud, addr, src_vma); if (err == -ENOMEM) return -ENOMEM; if (!err) continue; /* fall through */ } if (pud_none_or_clear_bad(src_pud)) continue; if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud, addr, next)) return -ENOMEM; } while (dst_pud++, src_pud++, addr = next, addr != end); return 0; } static inline int copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr, unsigned long end) { struct mm_struct *dst_mm = dst_vma->vm_mm; p4d_t *src_p4d, *dst_p4d; unsigned long next; dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr); if (!dst_p4d) return -ENOMEM; src_p4d = p4d_offset(src_pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(src_p4d)) continue; if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d, addr, next)) return -ENOMEM; } while (dst_p4d++, src_p4d++, addr = next, addr != end); return 0; } /* * Return true if the vma needs to copy the pgtable during this fork(). Return * false when we can speed up fork() by allowing lazy page faults later until * when the child accesses the memory range. */ static bool vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { /* * We check against dst_vma as while sane VMA flags will have been * copied, VM_UFFD_WP may be set only on dst_vma. */ if (dst_vma->vm_flags & VM_COPY_ON_FORK) return true; /* * The presence of an anon_vma indicates an anonymous VMA has page * tables which naturally cannot be reconstituted on page fault. */ if (src_vma->anon_vma) return true; /* * Don't copy ptes where a page fault will fill them correctly. Fork * becomes much lighter when there are big shared or private readonly * mappings. The tradeoff is that copy_page_range is more efficient * than faulting. 
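 * A shared-library text segment is the classic example: it is file-backed,
 * has no anon_vma and is not covered by VM_COPY_ON_FORK, so fork() leaves
 * its page tables empty in the child and lets minor faults rebuild them on
 * demand.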
*/ return false; } int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { pgd_t *src_pgd, *dst_pgd; unsigned long addr = src_vma->vm_start; unsigned long end = src_vma->vm_end; struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; struct mmu_notifier_range range; unsigned long next; bool is_cow; int ret; if (!vma_needs_copy(dst_vma, src_vma)) return 0; if (is_vm_hugetlb_page(src_vma)) return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma); /* * We need to invalidate the secondary MMU mappings only when * there could be a permission downgrade on the ptes of the * parent mm. And a permission downgrade will only happen if * is_cow_mapping() returns true. */ is_cow = is_cow_mapping(src_vma->vm_flags); if (is_cow) { mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, src_mm, addr, end); mmu_notifier_invalidate_range_start(&range); /* * Disabling preemption is not needed for the write side, as * the read side doesn't spin, but goes to the mmap_lock. * * Use the raw variant of the seqcount_t write API to avoid * lockdep complaining about preemptibility. */ vma_assert_write_locked(src_vma); raw_write_seqcount_begin(&src_mm->write_protect_seq); } ret = 0; dst_pgd = pgd_offset(dst_mm, addr); src_pgd = pgd_offset(src_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(src_pgd)) continue; if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd, addr, next))) { ret = -ENOMEM; break; } } while (dst_pgd++, src_pgd++, addr = next, addr != end); if (is_cow) { raw_write_seqcount_end(&src_mm->write_protect_seq); mmu_notifier_invalidate_range_end(&range); } return ret; } /* Whether we should zap all COWed (private) pages too */ static inline bool should_zap_cows(struct zap_details *details) { /* By default, zap all pages */ if (!details) return true; VM_WARN_ON_ONCE(details->skip_cows && details->reclaim_pt); /* Or, we zap COWed pages only if the caller wants to */ return !details->skip_cows; } /* Decides whether we should zap this folio with the folio pointer specified */ static inline bool should_zap_folio(struct zap_details *details, struct folio *folio) { /* If we can make a decision without *folio.. */ if (should_zap_cows(details)) return true; /* Otherwise we should only zap non-anon folios */ return !folio_test_anon(folio); } static inline bool zap_drop_markers(struct zap_details *details) { if (!details) return false; return details->zap_flags & ZAP_FLAG_DROP_MARKER; } /* * This function makes sure that we'll replace the none pte with an uffd-wp * swap special pte marker when necessary. Must be with the pgtable lock held. * * Returns true if uffd-wp ptes was installed, false otherwise. */ static inline bool zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, int nr, struct zap_details *details, pte_t pteval) { bool was_installed = false; if (!uffd_supports_wp_marker()) return false; /* Zap on anonymous always means dropping everything */ if (vma_is_anonymous(vma)) return false; if (zap_drop_markers(details)) return false; for (;;) { /* the PFN in the PTE is irrelevant. 
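 * For each of the @nr slots in the batch all that matters is whether a
 * uffd-wp marker has to be (re)installed there, so the same pteval is
 * passed down for every iteration.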
*/ if (pte_install_uffd_wp_if_needed(vma, addr, pte, pteval)) was_installed = true; if (--nr == 0) break; pte++; addr += PAGE_SIZE; } return was_installed; } static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, struct folio *folio, struct page *page, pte_t *pte, pte_t ptent, unsigned int nr, unsigned long addr, struct zap_details *details, int *rss, bool *force_flush, bool *force_break, bool *any_skipped) { struct mm_struct *mm = tlb->mm; bool delay_rmap = false; if (!folio_test_anon(folio)) { ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); if (pte_dirty(ptent)) { folio_mark_dirty(folio); if (tlb_delay_rmap(tlb)) { delay_rmap = true; *force_flush = true; } } if (pte_young(ptent) && likely(vma_has_recency(vma))) folio_mark_accessed(folio); rss[mm_counter(folio)] -= nr; } else { /* We don't need up-to-date accessed/dirty bits. */ clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); rss[MM_ANONPAGES] -= nr; } /* Checking a single PTE in a batch is sufficient. */ arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entries(tlb, pte, nr, addr); if (unlikely(userfaultfd_pte_wp(vma, ptent))) *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); if (!delay_rmap) { folio_remove_rmap_ptes(folio, page, nr, vma); if (unlikely(folio_mapcount(folio) < 0)) print_bad_pte(vma, addr, ptent, page); } if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) { *force_flush = true; *force_break = true; } } /* * Zap or skip at least one present PTE, trying to batch-process subsequent * PTEs that map consecutive pages of the same folio. * * Returns the number of processed (skipped or zapped) PTEs (at least 1). */ static inline int zap_present_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, pte_t *pte, pte_t ptent, unsigned int max_nr, unsigned long addr, struct zap_details *details, int *rss, bool *force_flush, bool *force_break, bool *any_skipped) { struct mm_struct *mm = tlb->mm; struct folio *folio; struct page *page; int nr; page = vm_normal_page(vma, addr, ptent); if (!page) { /* We don't need up-to-date accessed/dirty bits. */ ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entry(tlb, pte, addr); if (userfaultfd_pte_wp(vma, ptent)) *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent); ksm_might_unmap_zero_page(mm, ptent); return 1; } folio = page_folio(page); if (unlikely(!should_zap_folio(details, folio))) { *any_skipped = true; return 1; } /* * Make sure that the common "small folio" case is as fast as possible * by keeping the batching logic separate. 
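 *
 * Illustrative example: with 4KiB base pages, a 16-page large folio mapped
 * contiguously lets folio_pte_batch() return up to 16 here, so all 16 PTEs
 * are cleared, accounted and rmap-removed in one go, while an order-0
 * folio always takes the nr == 1 path.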
*/ if (unlikely(folio_test_large(folio) && max_nr != 1)) { nr = folio_pte_batch(folio, pte, ptent, max_nr); zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, addr, details, rss, force_flush, force_break, any_skipped); return nr; } zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr, details, rss, force_flush, force_break, any_skipped); return 1; } static inline int zap_nonpresent_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, pte_t *pte, pte_t ptent, unsigned int max_nr, unsigned long addr, struct zap_details *details, int *rss, bool *any_skipped) { softleaf_t entry; int nr = 1; *any_skipped = true; entry = softleaf_from_pte(ptent); if (softleaf_is_device_private(entry) || softleaf_is_device_exclusive(entry)) { struct page *page = softleaf_to_page(entry); struct folio *folio = page_folio(page); if (unlikely(!should_zap_folio(details, folio))) return 1; /* * Both device private/exclusive mappings should only * work with anonymous page so far, so we don't need to * consider uffd-wp bit when zap. For more information, * see zap_install_uffd_wp_if_needed(). */ WARN_ON_ONCE(!vma_is_anonymous(vma)); rss[mm_counter(folio)]--; folio_remove_rmap_pte(folio, page, vma); folio_put(folio); } else if (softleaf_is_swap(entry)) { /* Genuine swap entries, hence a private anon pages */ if (!should_zap_cows(details)) return 1; nr = swap_pte_batch(pte, max_nr, ptent); rss[MM_SWAPENTS] -= nr; swap_put_entries_direct(entry, nr); } else if (softleaf_is_migration(entry)) { struct folio *folio = softleaf_to_folio(entry); if (!should_zap_folio(details, folio)) return 1; rss[mm_counter(folio)]--; } else if (softleaf_is_uffd_wp_marker(entry)) { /* * For anon: always drop the marker; for file: only * drop the marker if explicitly requested. */ if (!vma_is_anonymous(vma) && !zap_drop_markers(details)) return 1; } else if (softleaf_is_guard_marker(entry)) { /* * Ordinary zapping should not remove guard PTE * markers. Only do so if we should remove PTE markers * in general. */ if (!zap_drop_markers(details)) return 1; } else if (softleaf_is_hwpoison(entry) || softleaf_is_poison_marker(entry)) { if (!should_zap_cows(details)) return 1; } else { /* We should have covered all the swap entry types */ pr_alert("unrecognized swap entry 0x%lx\n", entry.val); WARN_ON_ONCE(1); } clear_not_present_full_ptes(vma->vm_mm, addr, pte, nr, tlb->fullmm); *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); return nr; } static inline int do_zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pte_t *pte, unsigned long addr, unsigned long end, struct zap_details *details, int *rss, bool *force_flush, bool *force_break, bool *any_skipped) { pte_t ptent = ptep_get(pte); int max_nr = (end - addr) / PAGE_SIZE; int nr = 0; /* Skip all consecutive none ptes */ if (pte_none(ptent)) { for (nr = 1; nr < max_nr; nr++) { ptent = ptep_get(pte + nr); if (!pte_none(ptent)) break; } max_nr -= nr; if (!max_nr) return nr; pte += nr; addr += nr * PAGE_SIZE; } if (pte_present(ptent)) nr += zap_present_ptes(tlb, vma, pte, ptent, max_nr, addr, details, rss, force_flush, force_break, any_skipped); else nr += zap_nonpresent_ptes(tlb, vma, pte, ptent, max_nr, addr, details, rss, any_skipped); return nr; } static bool pte_table_reclaim_possible(unsigned long start, unsigned long end, struct zap_details *details) { if (!IS_ENABLED(CONFIG_PT_RECLAIM)) return false; /* Only zap if we are allowed to and cover the full page table. 
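 * (With 4KiB pages and 2MiB PMDs this means the range is at least as large
 *  as one full PTE table's worth of address space; the exact emptiness
 *  check happens later in zap_pte_range().)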
*/ return details && details->reclaim_pt && (end - start >= PMD_SIZE); } static bool zap_empty_pte_table(struct mm_struct *mm, pmd_t *pmd, spinlock_t *ptl, pmd_t *pmdval) { spinlock_t *pml = pmd_lockptr(mm, pmd); if (ptl != pml && !spin_trylock(pml)) return false; *pmdval = pmdp_get(pmd); pmd_clear(pmd); if (ptl != pml) spin_unlock(pml); return true; } static bool zap_pte_table_if_empty(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, pmd_t *pmdval) { spinlock_t *pml, *ptl = NULL; pte_t *start_pte, *pte; int i; pml = pmd_lock(mm, pmd); start_pte = pte_offset_map_rw_nolock(mm, pmd, addr, pmdval, &ptl); if (!start_pte) goto out_ptl; if (ptl != pml) spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); for (i = 0, pte = start_pte; i < PTRS_PER_PTE; i++, pte++) { if (!pte_none(ptep_get(pte))) goto out_ptl; } pte_unmap(start_pte); pmd_clear(pmd); if (ptl != pml) spin_unlock(ptl); spin_unlock(pml); return true; out_ptl: if (start_pte) pte_unmap_unlock(start_pte, ptl); if (ptl != pml) spin_unlock(pml); return false; } static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, struct zap_details *details) { bool can_reclaim_pt = pte_table_reclaim_possible(addr, end, details); bool force_flush = false, force_break = false; struct mm_struct *mm = tlb->mm; int rss[NR_MM_COUNTERS]; spinlock_t *ptl; pte_t *start_pte; pte_t *pte; pmd_t pmdval; unsigned long start = addr; bool direct_reclaim = true; int nr; retry: tlb_change_page_size(tlb, PAGE_SIZE); init_rss_vec(rss); start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!pte) return addr; flush_tlb_batched_pending(mm); lazy_mmu_mode_enable(); do { bool any_skipped = false; if (need_resched()) { direct_reclaim = false; break; } nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss, &force_flush, &force_break, &any_skipped); if (any_skipped) can_reclaim_pt = false; if (unlikely(force_break)) { addr += nr * PAGE_SIZE; direct_reclaim = false; break; } } while (pte += nr, addr += PAGE_SIZE * nr, addr != end); /* * Fast path: try to hold the pmd lock and unmap the PTE page. * * If the pte lock was released midway (retry case), or if the attempt * to hold the pmd lock failed, then we need to recheck all pte entries * to ensure they are still none, thereby preventing the pte entries * from being repopulated by another thread. */ if (can_reclaim_pt && direct_reclaim && addr == end) direct_reclaim = zap_empty_pte_table(mm, pmd, ptl, &pmdval); add_mm_rss_vec(mm, rss); lazy_mmu_mode_disable(); /* Do the actual TLB flush before dropping ptl */ if (force_flush) { tlb_flush_mmu_tlbonly(tlb); tlb_flush_rmaps(tlb, vma); } pte_unmap_unlock(start_pte, ptl); /* * If we forced a TLB flush (either due to running out of * batch buffers or because we needed to flush dirty TLB * entries before releasing the ptl), free the batched * memory too. Come back again if we didn't do everything. 
*/ if (force_flush) tlb_flush_mmu(tlb); if (addr != end) { cond_resched(); force_flush = false; force_break = false; goto retry; } if (can_reclaim_pt) { if (direct_reclaim || zap_pte_table_if_empty(mm, pmd, start, &pmdval)) { pte_free_tlb(tlb, pmd_pgtable(pmdval), addr); mm_dec_nr_ptes(mm); } } return addr; } static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, struct zap_details *details) { pmd_t *pmd; unsigned long next; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); if (pmd_is_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) __split_huge_pmd(vma, pmd, addr, false); else if (zap_huge_pmd(tlb, vma, pmd, addr)) { addr = next; continue; } /* fall through */ } else if (details && details->single_folio && folio_test_pmd_mappable(details->single_folio) && next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) { sync_with_folio_pmd_zap(tlb->mm, pmd); } if (pmd_none(*pmd)) { addr = next; continue; } addr = zap_pte_range(tlb, vma, pmd, addr, next, details); if (addr != next) pmd--; } while (pmd++, cond_resched(), addr != end); return addr; } static inline unsigned long zap_pud_range(struct mmu_gather *tlb, struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, struct zap_details *details) { pud_t *pud; unsigned long next; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_trans_huge(*pud)) { if (next - addr != HPAGE_PUD_SIZE) split_huge_pud(vma, pud, addr); else if (zap_huge_pud(tlb, vma, pud, addr)) goto next; /* fall through */ } if (pud_none_or_clear_bad(pud)) continue; next = zap_pmd_range(tlb, vma, pud, addr, next, details); next: cond_resched(); } while (pud++, addr = next, addr != end); return addr; } static inline unsigned long zap_p4d_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, struct zap_details *details) { p4d_t *p4d; unsigned long next; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; next = zap_pud_range(tlb, vma, p4d, addr, next, details); } while (p4d++, addr = next, addr != end); return addr; } static void __zap_vma_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct zap_details *details) { const bool reaping = details && details->reaping; VM_WARN_ON_ONCE(start >= end || !range_in_vma(vma, start, end)); /* uprobe_munmap() might sleep, so skip it when reaping. */ if (vma->vm_file && !reaping) uprobe_munmap(vma, start, end); if (unlikely(is_vm_hugetlb_page(vma))) { zap_flags_t zap_flags = details ? details->zap_flags : 0; VM_WARN_ON_ONCE(reaping); /* * vm_file will be NULL when we fail early while instantiating * a new mapping. In this case, no pages were mapped yet and * there is nothing to do. */ if (!vma->vm_file) return; __unmap_hugepage_range(tlb, vma, start, end, NULL, zap_flags); } else { unsigned long next, addr = start; pgd_t *pgd; tlb_start_vma(tlb, vma); pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; next = zap_p4d_range(tlb, vma, pgd, addr, next, details); } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); } } /** * zap_vma_for_reaping - zap all page table entries in the vma without blocking * @vma: The vma to zap. * * Zap all page table entries in the vma without blocking for use by the oom * killer. Hugetlb vmas are not supported. 
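 *
 * A sketch of the expected usage (illustrative only; the real reaper
 * carries more state than this):
 *
 *	for_each_vma(vmi, vma) {
 *		if (is_vm_hugetlb_page(vma))
 *			continue;
 *		if (zap_vma_for_reaping(vma) == -EBUSY)
 *			all_done = false;
 *	}
 *
 * where -EBUSY simply means the caller should come back to this mm later.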
* * Returns: 0 on success, -EBUSY if we would have to block. */ int zap_vma_for_reaping(struct vm_area_struct *vma) { struct zap_details details = { .reaping = true, }; struct mmu_notifier_range range; struct mmu_gather tlb; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, vma->vm_start, vma->vm_end); tlb_gather_mmu(&tlb, vma->vm_mm); if (mmu_notifier_invalidate_range_start_nonblock(&range)) { tlb_finish_mmu(&tlb); return -EBUSY; } __zap_vma_range(&tlb, vma, range.start, range.end, &details); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); return 0; } /** * unmap_vmas - unmap a range of memory covered by a list of vma's * @tlb: address of the caller's struct mmu_gather * @unmap: The unmap_desc * * Unmap all pages in the vma list. * * Only addresses between `start' and `end' will be unmapped. * * The VMA list must be sorted in ascending virtual address order. * * unmap_vmas() assumes that the caller will flush the whole unmapped address * range after unmap_vmas() returns. So the only responsibility here is to * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. */ void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap) { struct vm_area_struct *vma; struct mmu_notifier_range range; struct zap_details details = { .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP, }; vma = unmap->first; mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, unmap->vma_start, unmap->vma_end); mmu_notifier_invalidate_range_start(&range); do { unsigned long start = max(vma->vm_start, unmap->vma_start); unsigned long end = min(vma->vm_end, unmap->vma_end); hugetlb_zap_begin(vma, &start, &end); __zap_vma_range(tlb, vma, start, end, &details); hugetlb_zap_end(vma, &details); vma = mas_find(unmap->mas, unmap->tree_end - 1); } while (vma); mmu_notifier_invalidate_range_end(&range); } /** * zap_vma_range_batched - zap page table entries in a vma range * @tlb: pointer to the caller's struct mmu_gather * @vma: the vma covering the range to zap * @address: starting address of the range to zap * @size: number of bytes to zap * @details: details specifying zapping behavior * * @tlb must not be NULL. The provided address range must be fully * contained within @vma. If @vma is for hugetlb, @tlb is flushed and * re-initialized by this function. * * If @details is NULL, this function will zap all page table entries. */ void zap_vma_range_batched(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { const unsigned long end = address + size; struct mmu_notifier_range range; VM_WARN_ON_ONCE(!tlb || tlb->mm != vma->vm_mm); if (unlikely(!size)) return; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, end); hugetlb_zap_begin(vma, &range.start, &range.end); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); /* * unmap 'address-end' not 'range.start-range.end' as range * could have been expanded for hugetlb pmd sharing. */ __zap_vma_range(tlb, vma, address, end, details); mmu_notifier_invalidate_range_end(&range); if (is_vm_hugetlb_page(vma)) { /* * flush tlb and free resources before hugetlb_zap_end(), to * avoid concurrent page faults' allocation failure. 
*/ tlb_finish_mmu(tlb); hugetlb_zap_end(vma, details); tlb_gather_mmu(tlb, vma->vm_mm); } } /** * zap_vma_range - zap all page table entries in a vma range * @vma: the vma covering the range to zap * @address: starting address of the range to zap * @size: number of bytes to zap * * The provided address range must be fully contained within @vma. */ void zap_vma_range(struct vm_area_struct *vma, unsigned long address, unsigned long size) { struct mmu_gather tlb; tlb_gather_mmu(&tlb, vma->vm_mm); zap_vma_range_batched(&tlb, vma, address, size, NULL); tlb_finish_mmu(&tlb); } /** * zap_special_vma_range - zap all page table entries in a special vma range * @vma: the vma covering the range to zap * @address: starting address of the range to zap * @size: number of bytes to zap * * This function does nothing when the provided address range is not fully * contained in @vma, or when the @vma is not VM_PFNMAP or VM_MIXEDMAP. */ void zap_special_vma_range(struct vm_area_struct *vma, unsigned long address, unsigned long size) { if (!range_in_vma(vma, address, address + size) || !(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) return; zap_vma_range(vma, address, size); } EXPORT_SYMBOL_GPL(zap_special_vma_range); static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = pgd_offset(mm, addr); p4d = p4d_alloc(mm, pgd, addr); if (!p4d) return NULL; pud = pud_alloc(mm, p4d, addr); if (!pud) return NULL; pmd = pmd_alloc(mm, pud, addr); if (!pmd) return NULL; VM_BUG_ON(pmd_trans_huge(*pmd)); return pmd; } pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { pmd_t *pmd = walk_to_pmd(mm, addr); if (!pmd) return NULL; return pte_alloc_map_lock(mm, pmd, addr, ptl); } static bool vm_mixed_zeropage_allowed(struct vm_area_struct *vma) { VM_WARN_ON_ONCE(vma->vm_flags & VM_PFNMAP); /* * Whoever wants to forbid the zeropage after some zeropages * might already have been mapped has to scan the page tables and * bail out on any zeropages. Zeropages in COW mappings can * be unshared using FAULT_FLAG_UNSHARE faults. */ if (mm_forbids_zeropage(vma->vm_mm)) return false; /* zeropages in COW mappings are common and unproblematic. */ if (is_cow_mapping(vma->vm_flags)) return true; /* Mappings that do not allow for writable PTEs are unproblematic. */ if (!(vma->vm_flags & (VM_WRITE | VM_MAYWRITE))) return true; /* * Why not allow any VMA that has vm_ops->pfn_mkwrite? GUP could * find the shared zeropage and longterm-pin it, which would * be problematic as soon as the zeropage gets replaced by a different * page due to vma->vm_ops->pfn_mkwrite, because what's mapped would * now differ to what GUP looked up. FSDAX is incompatible to * FOLL_LONGTERM and VM_IO is incompatible to GUP completely (see * check_vma_flags). 
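 * Hence the check below only accepts pfn_mkwrite users for which such a
 * longterm pin cannot happen in the first place: FSDAX and VM_IO mappings.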
*/ return vma->vm_ops && vma->vm_ops->pfn_mkwrite && (vma_is_fsdax(vma) || vma->vm_flags & VM_IO); } static int validate_page_before_insert(struct vm_area_struct *vma, struct page *page) { struct folio *folio = page_folio(page); if (!folio_ref_count(folio)) return -EINVAL; if (unlikely(is_zero_folio(folio))) { if (!vm_mixed_zeropage_allowed(vma)) return -EINVAL; return 0; } if (folio_test_anon(folio) || page_has_type(page)) return -EINVAL; flush_dcache_folio(folio); return 0; } static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot, bool mkwrite) { struct folio *folio = page_folio(page); pte_t pteval = ptep_get(pte); if (!pte_none(pteval)) { if (!mkwrite) return -EBUSY; /* see insert_pfn(). */ if (pte_pfn(pteval) != page_to_pfn(page)) { WARN_ON_ONCE(!is_zero_pfn(pte_pfn(pteval))); return -EFAULT; } pteval = maybe_mkwrite(pteval, vma); pteval = pte_mkyoung(pteval); if (ptep_set_access_flags(vma, addr, pte, pteval, 1)) update_mmu_cache(vma, addr, pte); return 0; } /* Ok, finally just insert the thing.. */ pteval = mk_pte(page, prot); if (unlikely(is_zero_folio(folio))) { pteval = pte_mkspecial(pteval); } else { folio_get(folio); pteval = mk_pte(page, prot); if (mkwrite) { pteval = pte_mkyoung(pteval); pteval = maybe_mkwrite(pte_mkdirty(pteval), vma); } inc_mm_counter(vma->vm_mm, mm_counter_file(folio)); folio_add_file_rmap_pte(folio, page, vma); } set_pte_at(vma->vm_mm, addr, pte, pteval); return 0; } static int insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot, bool mkwrite) { int retval; pte_t *pte; spinlock_t *ptl; retval = validate_page_before_insert(vma, page); if (retval) goto out; retval = -ENOMEM; pte = get_locked_pte(vma->vm_mm, addr, &ptl); if (!pte) goto out; retval = insert_page_into_pte_locked(vma, pte, addr, page, prot, mkwrite); pte_unmap_unlock(pte, ptl); out: return retval; } static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { int err; err = validate_page_before_insert(vma, page); if (err) return err; return insert_page_into_pte_locked(vma, pte, addr, page, prot, false); } /* insert_pages() amortizes the cost of spinlock operations * when inserting pages in a loop. */ static int insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num, pgprot_t prot) { pmd_t *pmd = NULL; pte_t *start_pte, *pte; spinlock_t *pte_lock; struct mm_struct *const mm = vma->vm_mm; unsigned long curr_page_idx = 0; unsigned long remaining_pages_total = *num; unsigned long pages_to_write_in_pmd; int ret; more: ret = -EFAULT; pmd = walk_to_pmd(mm, addr); if (!pmd) goto out; pages_to_write_in_pmd = min_t(unsigned long, remaining_pages_total, PTRS_PER_PTE - pte_index(addr)); /* Allocate the PTE if necessary; takes PMD lock once only. 
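 * The PTE page itself is allocated once up front; the loop below then
 * takes and drops the PTE lock per batch of at most 8 entries.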
*/ ret = -ENOMEM; if (pte_alloc(mm, pmd)) goto out; while (pages_to_write_in_pmd) { int pte_idx = 0; const int batch_size = min_t(int, pages_to_write_in_pmd, 8); start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock); if (!start_pte) { ret = -EFAULT; goto out; } for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) { int err = insert_page_in_batch_locked(vma, pte, addr, pages[curr_page_idx], prot); if (unlikely(err)) { pte_unmap_unlock(start_pte, pte_lock); ret = err; remaining_pages_total -= pte_idx; goto out; } addr += PAGE_SIZE; ++curr_page_idx; } pte_unmap_unlock(start_pte, pte_lock); pages_to_write_in_pmd -= batch_size; remaining_pages_total -= batch_size; } if (remaining_pages_total) goto more; ret = 0; out: *num = remaining_pages_total; return ret; } /** * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock. * @vma: user vma to map to * @addr: target start user address of these pages * @pages: source kernel pages * @num: in: number of pages to map. out: number of pages that were *not* * mapped. (0 means all pages were successfully mapped). * * Preferred over vm_insert_page() when inserting multiple pages. * * In case of error, we may have mapped a subset of the provided * pages. It is the caller's responsibility to account for this case. * * The same restrictions apply as in vm_insert_page(). */ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num) { const unsigned long nr_pages = *num; const unsigned long end = addr + PAGE_SIZE * nr_pages; if (!range_in_vma(vma, addr, end)) return -EFAULT; if (!(vma->vm_flags & VM_MIXEDMAP)) { VM_WARN_ON_ONCE(mmap_read_trylock(vma->vm_mm)); VM_WARN_ON_ONCE(vma->vm_flags & VM_PFNMAP); vm_flags_set(vma, VM_MIXEDMAP); } /* Defer page refcount checking till we're about to map that page. */ return insert_pages(vma, addr, pages, num, vma->vm_page_prot); } EXPORT_SYMBOL(vm_insert_pages); int map_kernel_pages_prepare(struct vm_area_desc *desc) { const struct mmap_action *action = &desc->action; const unsigned long addr = action->map_kernel.start; unsigned long nr_pages, end; if (!vma_desc_test(desc, VMA_MIXEDMAP_BIT)) { VM_WARN_ON_ONCE(mmap_read_trylock(desc->mm)); VM_WARN_ON_ONCE(vma_desc_test(desc, VMA_PFNMAP_BIT)); vma_desc_set_flags(desc, VMA_MIXEDMAP_BIT); } nr_pages = action->map_kernel.nr_pages; end = addr + PAGE_SIZE * nr_pages; if (!range_in_vma_desc(desc, addr, end)) return -EFAULT; return 0; } EXPORT_SYMBOL(map_kernel_pages_prepare); int map_kernel_pages_complete(struct vm_area_struct *vma, struct mmap_action *action) { unsigned long nr_pages; nr_pages = action->map_kernel.nr_pages; return insert_pages(vma, action->map_kernel.start, action->map_kernel.pages, &nr_pages, vma->vm_page_prot); } EXPORT_SYMBOL(map_kernel_pages_complete); /** * vm_insert_page - insert single page into user vma * @vma: user vma to map to * @addr: target user address of this page * @page: source kernel page * * This allows drivers to insert individual pages they've allocated * into a user vma. The zeropage is supported in some VMAs, * see vm_mixed_zeropage_allowed(). * * The page has to be a nice clean _individual_ kernel allocation. * If you allocate a compound page, you need to have marked it as * such (__GFP_COMP), or manually just split the page up yourself * (see split_page()). * * NOTE! Traditionally this was done with "remap_pfn_range()" which * took an arbitrary page protection parameter. This doesn't allow * that. 
Your vma protection will have to be set up correctly, which * means that if you want a shared writable mapping, you'd better * ask for a shared writable mapping! * * The page does not need to be reserved. * * Usually this function is called from f_op->mmap() handler * under mm->mmap_lock write-lock, so it can change vma->vm_flags. * Caller must set VM_MIXEDMAP on vma if it wants to call this * function from other places, for example from page-fault handler. * * Return: %0 on success, negative error code otherwise. */ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) { if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; if (!(vma->vm_flags & VM_MIXEDMAP)) { BUG_ON(mmap_read_trylock(vma->vm_mm)); BUG_ON(vma->vm_flags & VM_PFNMAP); vm_flags_set(vma, VM_MIXEDMAP); } return insert_page(vma, addr, page, vma->vm_page_prot, false); } EXPORT_SYMBOL(vm_insert_page); /* * __vm_map_pages - maps range of kernel pages into user vma * @vma: user vma to map to * @pages: pointer to array of source kernel pages * @num: number of pages in page array * @offset: user's requested vm_pgoff * * This allows drivers to map range of kernel pages into a user vma. * The zeropage is supported in some VMAs, see * vm_mixed_zeropage_allowed(). * * Return: 0 on success and error code otherwise. */ static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num, unsigned long offset) { unsigned long count = vma_pages(vma); unsigned long uaddr = vma->vm_start; /* Fail if the user requested offset is beyond the end of the object */ if (offset >= num) return -ENXIO; /* Fail if the user requested size exceeds available object size */ if (count > num - offset) return -ENXIO; return vm_insert_pages(vma, uaddr, pages + offset, &count); } /** * vm_map_pages - maps range of kernel pages starts with non zero offset * @vma: user vma to map to * @pages: pointer to array of source kernel pages * @num: number of pages in page array * * Maps an object consisting of @num pages, catering for the user's * requested vm_pgoff * * If we fail to insert any page into the vma, the function will return * immediately leaving any previously inserted pages present. Callers * from the mmap handler may immediately return the error as their caller * will destroy the vma, removing any successfully inserted pages. Other * callers should make their own arrangements for calling unmap_region(). * * Context: Process context. Called by mmap handlers. * Return: 0 on success and error code otherwise. */ int vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num) { return __vm_map_pages(vma, pages, num, vma->vm_pgoff); } EXPORT_SYMBOL(vm_map_pages); /** * vm_map_pages_zero - map range of kernel pages starts with zero offset * @vma: user vma to map to * @pages: pointer to array of source kernel pages * @num: number of pages in page array * * Similar to vm_map_pages(), except that it explicitly sets the offset * to 0. This function is intended for the drivers that did not consider * vm_pgoff. * * Context: Process context. Called by mmap handlers. * Return: 0 on success and error code otherwise. 
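 *
 * A minimal illustrative sketch of a caller (my_mmap, my_pages and
 * my_nr_pages are made-up names, not part of this API):
 *
 *	static int my_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		return vm_map_pages_zero(vma, my_pages, my_nr_pages);
 *	}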
*/ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, unsigned long num) { return __vm_map_pages(vma, pages, num, 0); } EXPORT_SYMBOL(vm_map_pages_zero); static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t prot, bool mkwrite) { struct mm_struct *mm = vma->vm_mm; pte_t *pte, entry; spinlock_t *ptl; pte = get_locked_pte(mm, addr, &ptl); if (!pte) return VM_FAULT_OOM; entry = ptep_get(pte); if (!pte_none(entry)) { if (mkwrite) { /* * For read faults on private mappings the PFN passed * in may not match the PFN we have mapped if the * mapped PFN is a writeable COW page. In the mkwrite * case we are creating a writable PTE for a shared * mapping and we expect the PFNs to match. If they * don't match, we are likely racing with block * allocation and mapping invalidation so just skip the * update. */ if (pte_pfn(entry) != pfn) { WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry))); goto out_unlock; } entry = pte_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, addr, pte, entry, 1)) update_mmu_cache(vma, addr, pte); } goto out_unlock; } /* Ok, finally just insert the thing.. */ entry = pte_mkspecial(pfn_pte(pfn, prot)); if (mkwrite) { entry = pte_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); } set_pte_at(mm, addr, pte, entry); update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ out_unlock: pte_unmap_unlock(pte, ptl); return VM_FAULT_NOPAGE; } /** * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot * @vma: user vma to map to * @addr: target user address of this page * @pfn: source kernel pfn * @pgprot: pgprot flags for the inserted page * * This is exactly like vmf_insert_pfn(), except that it allows drivers * to override pgprot on a per-page basis. * * This only makes sense for IO mappings, and it makes no sense for * COW mappings. In general, using multiple vmas is preferable; * vmf_insert_pfn_prot should only be used if using multiple VMAs is * impractical. * * pgprot typically only differs from @vma->vm_page_prot when drivers set * caching- and encryption bits different than those of @vma->vm_page_prot, * because the caching- or encryption mode may not be known at mmap() time. * * This is ok as long as @vma->vm_page_prot is not used by the core vm * to set caching and encryption bits for those vmas (except for COW pages). * This is ensured by core vm only modifying these page table entries using * functions that don't touch caching- or encryption bits, using pte_modify() * if needed. (See for example mprotect()). * * Also when new page-table entries are created, this is only done using the * fault() callback, and never using the value of vma->vm_page_prot, * except for page-table entries that point to anonymous pages as the result * of COW. * * Context: Process context. May allocate using %GFP_KERNEL. * Return: vm_fault_t value. */ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot) { /* * Technically, architectures with pte_special can avoid all these * restrictions (same for remap_pfn_range). However we would like * consistency in testing and feature parity among all, so we should * try to keep these invariants in place for everybody. 
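 *
 * The BUG_ON()s below enforce exactly that: the vma must be VM_PFNMAP or
 * VM_MIXEDMAP but not both, a VM_PFNMAP vma must not be a COW mapping,
 * and the VM_MIXEDMAP case is only used here for pfns without a valid
 * struct page.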
*/ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; if (!pfn_modify_allowed(pfn, pgprot)) return VM_FAULT_SIGBUS; pfnmap_setup_cachemode_pfn(pfn, &pgprot); return insert_pfn(vma, addr, pfn, pgprot, false); } EXPORT_SYMBOL(vmf_insert_pfn_prot); /** * vmf_insert_pfn - insert single pfn into user vma * @vma: user vma to map to * @addr: target user address of this page * @pfn: source kernel pfn * * Similar to vm_insert_page, this allows drivers to insert individual pages * they've allocated into a user vma. Same comments apply. * * This function should only be called from a vm_ops->fault handler, and * in that case the handler should return the result of this function. * * vma cannot be a COW mapping. * * As this is called only for pages that do not currently exist, we * do not need to flush old virtual caches or the TLB. * * Context: Process context. May allocate using %GFP_KERNEL. * Return: vm_fault_t value. */ vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) { return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); } EXPORT_SYMBOL(vmf_insert_pfn); static bool vm_mixed_ok(struct vm_area_struct *vma, unsigned long pfn, bool mkwrite) { if (unlikely(is_zero_pfn(pfn)) && (mkwrite || !vm_mixed_zeropage_allowed(vma))) return false; /* these checks mirror the abort conditions in vm_normal_page */ if (vma->vm_flags & VM_MIXEDMAP) return true; if (is_zero_pfn(pfn)) return true; return false; } static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, bool mkwrite) { pgprot_t pgprot = vma->vm_page_prot; int err; if (!vm_mixed_ok(vma, pfn, mkwrite)) return VM_FAULT_SIGBUS; if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; pfnmap_setup_cachemode_pfn(pfn, &pgprot); if (!pfn_modify_allowed(pfn, pgprot)) return VM_FAULT_SIGBUS; /* * If we don't have pte special, then we have to use the pfn_valid() * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* * refcount the page if pfn_valid is true (hence insert_page rather * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP * without pte special, it would there be refcounted as a normal page. */ if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pfn_valid(pfn)) { struct page *page; /* * At this point we are committed to insert_page() * regardless of whether the caller specified flags that * result in pfn_t_has_page() == false. 
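 *
 * insert_page() takes a folio reference for anything that is not the
 * shared zeropage, so such mappings keep the normal refcounting that
 * VM_MIXEDMAP without pte_special relies on.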
*/ page = pfn_to_page(pfn); err = insert_page(vma, addr, page, pgprot, mkwrite); } else { return insert_pfn(vma, addr, pfn, pgprot, mkwrite); } if (err == -ENOMEM) return VM_FAULT_OOM; if (err < 0 && err != -EBUSY) return VM_FAULT_SIGBUS; return VM_FAULT_NOPAGE; } vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page, bool write) { pgprot_t pgprot = vmf->vma->vm_page_prot; unsigned long addr = vmf->address; int err; if (addr < vmf->vma->vm_start || addr >= vmf->vma->vm_end) return VM_FAULT_SIGBUS; err = insert_page(vmf->vma, addr, page, pgprot, write); if (err == -ENOMEM) return VM_FAULT_OOM; if (err < 0 && err != -EBUSY) return VM_FAULT_SIGBUS; return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(vmf_insert_page_mkwrite); vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) { return __vm_insert_mixed(vma, addr, pfn, false); } EXPORT_SYMBOL(vmf_insert_mixed); /* * If the insertion of PTE failed because someone else already added a * different entry in the mean time, we treat that as success as we assume * the same entry was actually inserted. */ vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) { return __vm_insert_mixed(vma, addr, pfn, true); } /* * maps a range of physical memory into the requested pages. the old * mappings are removed. any references to nonexistent pages results * in null mappings (currently treated as "copy-on-access") */ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { pte_t *pte, *mapped_pte; spinlock_t *ptl; int err = 0; mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; lazy_mmu_mode_enable(); do { BUG_ON(!pte_none(ptep_get(pte))); if (!pfn_modify_allowed(pfn, prot)) { err = -EACCES; break; } set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); lazy_mmu_mode_disable(); pte_unmap_unlock(mapped_pte, ptl); return err; } static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { pmd_t *pmd; unsigned long next; int err; pfn -= addr >> PAGE_SHIFT; pmd = pmd_alloc(mm, pud, addr); if (!pmd) return -ENOMEM; VM_BUG_ON(pmd_trans_huge(*pmd)); do { next = pmd_addr_end(addr, end); err = remap_pte_range(mm, pmd, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) return err; } while (pmd++, addr = next, addr != end); return 0; } static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { pud_t *pud; unsigned long next; int err; pfn -= addr >> PAGE_SHIFT; pud = pud_alloc(mm, p4d, addr); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); err = remap_pmd_range(mm, pud, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) return err; } while (pud++, addr = next, addr != end); return 0; } static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { p4d_t *p4d; unsigned long next; int err; pfn -= addr >> PAGE_SHIFT; p4d = p4d_alloc(mm, pgd, addr); if (!p4d) return -ENOMEM; do { next = p4d_addr_end(addr, end); err = remap_pud_range(mm, p4d, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) return err; } while (p4d++, addr = next, addr != end); return 0; } static int get_remap_pgoff(bool is_cow, unsigned long addr, unsigned long 
end, unsigned long vm_start, unsigned long vm_end, unsigned long pfn, pgoff_t *vm_pgoff_p) { /* * There's a horrible special case to handle copy-on-write * behaviour that some programs depend on. We mark the "original" * un-COW'ed pages by matching them up with "vma->vm_pgoff". * See vm_normal_page() for details. */ if (is_cow) { if (addr != vm_start || end != vm_end) return -EINVAL; *vm_pgoff_p = pfn; } return 0; } static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { pgd_t *pgd; unsigned long next; unsigned long end = addr + PAGE_ALIGN(size); struct mm_struct *mm = vma->vm_mm; int err; if (WARN_ON_ONCE(!PAGE_ALIGNED(addr))) return -EINVAL; VM_WARN_ON_ONCE(!vma_test_all_mask(vma, VMA_REMAP_FLAGS)); BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); do { next = pgd_addr_end(addr, end); err = remap_p4d_range(mm, pgd, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) return err; } while (pgd++, addr = next, addr != end); return 0; } /* * Variant of remap_pfn_range that does not call track_pfn_remap. The caller * must have pre-validated the caching bits of the pgprot_t. */ static int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { int error = remap_pfn_range_internal(vma, addr, pfn, size, prot); if (!error) return 0; /* * A partial pfn range mapping is dangerous: it does not * maintain page reference counts, and callers may free * pages due to the error. So zap it early. */ zap_vma_range(vma, addr, size); return error; } #ifdef __HAVE_PFNMAP_TRACKING static inline struct pfnmap_track_ctx *pfnmap_track_ctx_alloc(unsigned long pfn, unsigned long size, pgprot_t *prot) { struct pfnmap_track_ctx *ctx; if (pfnmap_track(pfn, size, prot)) return ERR_PTR(-EINVAL); ctx = kmalloc_obj(*ctx); if (unlikely(!ctx)) { pfnmap_untrack(pfn, size); return ERR_PTR(-ENOMEM); } ctx->pfn = pfn; ctx->size = size; kref_init(&ctx->kref); return ctx; } void pfnmap_track_ctx_release(struct kref *ref) { struct pfnmap_track_ctx *ctx = container_of(ref, struct pfnmap_track_ctx, kref); pfnmap_untrack(ctx->pfn, ctx->size); kfree(ctx); } static int remap_pfn_range_track(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { struct pfnmap_track_ctx *ctx = NULL; int err; size = PAGE_ALIGN(size); /* * If we cover the full VMA, we'll perform actual tracking, and * remember to untrack when the last reference to our tracking * context from a VMA goes away. We'll keep tracking the whole pfn * range even during VMA splits and partial unmapping. * * If we only cover parts of the VMA, we'll only setup the cachemode * in the pgprot for the pfn range. 
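 *
 * If the remap itself fails, the freshly allocated tracking context is
 * dropped again via kref_put(); it is only attached to the vma once
 * remap_pfn_range_notrack() has succeeded.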
*/ if (addr == vma->vm_start && addr + size == vma->vm_end) { if (vma->pfnmap_track_ctx) return -EINVAL; ctx = pfnmap_track_ctx_alloc(pfn, size, &prot); if (IS_ERR(ctx)) return PTR_ERR(ctx); } else if (pfnmap_setup_cachemode(pfn, size, &prot)) { return -EINVAL; } err = remap_pfn_range_notrack(vma, addr, pfn, size, prot); if (ctx) { if (err) kref_put(&ctx->kref, pfnmap_track_ctx_release); else vma->pfnmap_track_ctx = ctx; } return err; } static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { return remap_pfn_range_track(vma, addr, pfn, size, prot); } #else static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { return remap_pfn_range_notrack(vma, addr, pfn, size, prot); } #endif int remap_pfn_range_prepare(struct vm_area_desc *desc) { const struct mmap_action *action = &desc->action; const unsigned long start = action->remap.start; const unsigned long end = start + action->remap.size; const unsigned long pfn = action->remap.start_pfn; const bool is_cow = vma_desc_is_cow_mapping(desc); int err; if (!range_in_vma_desc(desc, start, end)) return -EFAULT; err = get_remap_pgoff(is_cow, start, end, desc->start, desc->end, pfn, &desc->pgoff); if (err) return err; vma_desc_set_flags_mask(desc, VMA_REMAP_FLAGS); return 0; } static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size) { const unsigned long end = addr + PAGE_ALIGN(size); const bool is_cow = is_cow_mapping(vma->vm_flags); int err; err = get_remap_pgoff(is_cow, addr, end, vma->vm_start, vma->vm_end, pfn, &vma->vm_pgoff); if (err) return err; vma_set_flags_mask(vma, VMA_REMAP_FLAGS); return 0; } /** * remap_pfn_range - remap kernel memory to userspace * @vma: user vma to map to * @addr: target page aligned user address to start at * @pfn: page frame number of kernel physical memory address * @size: size of mapping area * @prot: page protection flags for this mapping * * Note: this is only safe if the mm semaphore is held when called. * * Return: %0 on success, negative error code otherwise. */ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { int err; err = remap_pfn_range_prepare_vma(vma, addr, pfn, size); if (err) return err; return do_remap_pfn_range(vma, addr, pfn, size, prot); } EXPORT_SYMBOL(remap_pfn_range); int remap_pfn_range_complete(struct vm_area_struct *vma, struct mmap_action *action) { const unsigned long start = action->remap.start; const unsigned long pfn = action->remap.start_pfn; const unsigned long size = action->remap.size; const pgprot_t prot = action->remap.pgprot; return do_remap_pfn_range(vma, start, pfn, size, prot); } static int __simple_ioremap_prep(unsigned long vm_len, pgoff_t vm_pgoff, phys_addr_t start_phys, unsigned long size, unsigned long *pfnp) { unsigned long pfn, pages; /* Check that the physical memory area passed in looks valid */ if (start_phys + size < start_phys) return -EINVAL; /* * You *really* shouldn't map things that aren't page-aligned, * but we've historically allowed it because IO memory might * just have smaller alignment. 
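 *
 * The sub-page offset is folded into @size below and the start is rounded
 * down to a whole page frame, so the mapping still covers every requested
 * byte.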
*/ size += start_phys & ~PAGE_MASK; pfn = start_phys >> PAGE_SHIFT; pages = (size + ~PAGE_MASK) >> PAGE_SHIFT; if (pfn + pages < pfn) return -EINVAL; /* We start the mapping 'vm_pgoff' pages into the area */ if (vm_pgoff > pages) return -EINVAL; pfn += vm_pgoff; pages -= vm_pgoff; /* Can we fit all of the mapping? */ if ((vm_len >> PAGE_SHIFT) > pages) return -EINVAL; *pfnp = pfn; return 0; } int simple_ioremap_prepare(struct vm_area_desc *desc) { struct mmap_action *action = &desc->action; const phys_addr_t start = action->simple_ioremap.start_phys_addr; const unsigned long size = action->simple_ioremap.size; unsigned long pfn; int err; err = __simple_ioremap_prep(vma_desc_size(desc), desc->pgoff, start, size, &pfn); if (err) return err; /* The I/O remap logic does the heavy lifting. */ mmap_action_ioremap_full(desc, pfn); return io_remap_pfn_range_prepare(desc); } /** * vm_iomap_memory - remap memory to userspace * @vma: user vma to map to * @start: start of the physical memory to be mapped * @len: size of area * * This is a simplified io_remap_pfn_range() for common driver use. The * driver just needs to give us the physical memory range to be mapped, * we'll figure out the rest from the vma information. * * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get * whatever write-combining details or similar. * * Return: %0 on success, negative error code otherwise. */ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) { const unsigned long vm_start = vma->vm_start; const unsigned long vm_end = vma->vm_end; const unsigned long vm_len = vm_end - vm_start; unsigned long pfn; int err; err = __simple_ioremap_prep(vm_len, vma->vm_pgoff, start, len, &pfn); if (err) return err; /* Ok, let it rip */ return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); } EXPORT_SYMBOL(vm_iomap_memory); static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pte_fn_t fn, void *data, bool create, pgtbl_mod_mask *mask) { pte_t *pte, *mapped_pte; int err = 0; spinlock_t *ptl; if (create) { mapped_pte = pte = (mm == &init_mm) ? pte_alloc_kernel_track(pmd, addr, mask) : pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; } else { mapped_pte = pte = (mm == &init_mm) ? 
pte_offset_kernel(pmd, addr) : pte_offset_map_lock(mm, pmd, addr, &ptl); if (!pte) return -EINVAL; } lazy_mmu_mode_enable(); if (fn) { do { if (create || !pte_none(ptep_get(pte))) { err = fn(pte, addr, data); if (err) break; } } while (pte++, addr += PAGE_SIZE, addr != end); } *mask |= PGTBL_PTE_MODIFIED; lazy_mmu_mode_disable(); if (mm != &init_mm) pte_unmap_unlock(mapped_pte, ptl); return err; } static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, pte_fn_t fn, void *data, bool create, pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; int err = 0; BUG_ON(pud_leaf(*pud)); if (create) { pmd = pmd_alloc_track(mm, pud, addr, mask); if (!pmd) return -ENOMEM; } else { pmd = pmd_offset(pud, addr); } do { next = pmd_addr_end(addr, end); if (pmd_none(*pmd) && !create) continue; if (WARN_ON_ONCE(pmd_leaf(*pmd))) return -EINVAL; if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) { if (!create) continue; pmd_clear_bad(pmd); } err = apply_to_pte_range(mm, pmd, addr, next, fn, data, create, mask); if (err) break; } while (pmd++, addr = next, addr != end); return err; } static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d, unsigned long addr, unsigned long end, pte_fn_t fn, void *data, bool create, pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; int err = 0; if (create) { pud = pud_alloc_track(mm, p4d, addr, mask); if (!pud) return -ENOMEM; } else { pud = pud_offset(p4d, addr); } do { next = pud_addr_end(addr, end); if (pud_none(*pud) && !create) continue; if (WARN_ON_ONCE(pud_leaf(*pud))) return -EINVAL; if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) { if (!create) continue; pud_clear_bad(pud); } err = apply_to_pmd_range(mm, pud, addr, next, fn, data, create, mask); if (err) break; } while (pud++, addr = next, addr != end); return err; } static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, unsigned long end, pte_fn_t fn, void *data, bool create, pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; int err = 0; if (create) { p4d = p4d_alloc_track(mm, pgd, addr, mask); if (!p4d) return -ENOMEM; } else { p4d = p4d_offset(pgd, addr); } do { next = p4d_addr_end(addr, end); if (p4d_none(*p4d) && !create) continue; if (WARN_ON_ONCE(p4d_leaf(*p4d))) return -EINVAL; if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) { if (!create) continue; p4d_clear_bad(p4d); } err = apply_to_pud_range(mm, p4d, addr, next, fn, data, create, mask); if (err) break; } while (p4d++, addr = next, addr != end); return err; } static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr, unsigned long size, pte_fn_t fn, void *data, bool create) { pgd_t *pgd; unsigned long start = addr, next; unsigned long end = addr + size; pgtbl_mod_mask mask = 0; int err = 0; if (WARN_ON(addr >= end)) return -EINVAL; pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none(*pgd) && !create) continue; if (WARN_ON_ONCE(pgd_leaf(*pgd))) { err = -EINVAL; break; } if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) { if (!create) continue; pgd_clear_bad(pgd); } err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask); if (err) break; } while (pgd++, addr = next, addr != end); if (mask & ARCH_PAGE_TABLE_SYNC_MASK) arch_sync_kernel_mappings(start, start + size); return err; } /* * Scan a region of virtual memory, filling in page tables as necessary * and calling a provided function on each leaf page table. 
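 *
 * An illustrative pte_fn_t callback (my_count_present and the counter are
 * made-up for this sketch); apply_to_existing_page_range() below is the
 * variant to use when absent page tables should be skipped rather than
 * allocated:
 *
 *	static int my_count_present(pte_t *pte, unsigned long addr, void *data)
 *	{
 *		unsigned long *count = data;
 *
 *		if (pte_present(ptep_get(pte)))
 *			(*count)++;
 *		return 0;
 *	}
 *
 *	err = apply_to_existing_page_range(mm, start, len, my_count_present,
 *					   &count);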
*/ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, unsigned long size, pte_fn_t fn, void *data) { return __apply_to_page_range(mm, addr, size, fn, data, true); } EXPORT_SYMBOL_GPL(apply_to_page_range); /* * Scan a region of virtual memory, calling a provided function on * each leaf page table where it exists. * * Unlike apply_to_page_range, this does _not_ fill in page tables * where they are absent. */ int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr, unsigned long size, pte_fn_t fn, void *data) { return __apply_to_page_range(mm, addr, size, fn, data, false); } /* * handle_pte_fault chooses page fault handler according to an entry which was * read non-atomically. Before making any commitment, on those architectures * or configurations (e.g. i386 with PAE) which might give a mix of unmatched * parts, do_swap_page must check under lock before unmapping the pte and * proceeding (but do_wp_page is only called after already making such a check; * and do_anonymous_page can safely check later on). */ static inline int pte_unmap_same(struct vm_fault *vmf) { int same = 1; #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION) if (sizeof(pte_t) > sizeof(unsigned long)) { spin_lock(vmf->ptl); same = pte_same(ptep_get(vmf->pte), vmf->orig_pte); spin_unlock(vmf->ptl); } #endif pte_unmap(vmf->pte); vmf->pte = NULL; return same; } /* * Return: * 0: copied succeeded * -EHWPOISON: copy failed due to hwpoison in source page * -EAGAIN: copied failed (some other reason) */ static inline int __wp_page_copy_user(struct page *dst, struct page *src, struct vm_fault *vmf) { int ret; void *kaddr; void __user *uaddr; struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; unsigned long addr = vmf->address; if (likely(src)) { if (copy_mc_user_highpage(dst, src, addr, vma)) return -EHWPOISON; return 0; } /* * If the source page was a PFN mapping, we don't have * a "struct page" for it. We do a best-effort copy by * just copying from the original user address. If that * fails, we just zero-fill it. Live with it. */ kaddr = kmap_local_page(dst); pagefault_disable(); uaddr = (void __user *)(addr & PAGE_MASK); /* * On architectures with software "accessed" bits, we would * take a double page fault, so mark it accessed here. */ vmf->pte = NULL; if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) { pte_t entry; vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { /* * Other thread has already handled the fault * and update local tlb only */ if (vmf->pte) update_mmu_tlb(vma, addr, vmf->pte); ret = -EAGAIN; goto pte_unlock; } entry = pte_mkyoung(vmf->orig_pte); if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0)) update_mmu_cache_range(vmf, vma, addr, vmf->pte, 1); } /* * This really shouldn't fail, because the page is there * in the page tables. But it might just be unreadable, * in which case we just give up and fill the result with * zeroes. */ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { if (vmf->pte) goto warn; /* Re-validate under PTL if the page is still mapped */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { /* The PTE changed under us, update local tlb */ if (vmf->pte) update_mmu_tlb(vma, addr, vmf->pte); ret = -EAGAIN; goto pte_unlock; } /* * The same page can be mapped back since last copy attempt. * Try to copy again under PTL. 
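 *
 * If this second attempt also fails while the PTE is mapped and unchanged,
 * the source really is unreadable and we fall through to the zero-fill
 * warning path below.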
*/ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { /* * Give a warn in case there can be some obscure * use-case */ warn: WARN_ON_ONCE(1); clear_page(kaddr); } } ret = 0; pte_unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); pagefault_enable(); kunmap_local(kaddr); flush_dcache_page(dst); return ret; } static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) { struct file *vm_file = vma->vm_file; if (vm_file) return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO; /* * Special mappings (e.g. VDSO) do not have any file so fake * a default GFP_KERNEL for them. */ return GFP_KERNEL; } /* * Notify the address space that the page is about to become writable so that * it can prohibit this or wait for the page to get into an appropriate state. * * We do this without the lock held, so that it can sleep if it needs to. */ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf, struct folio *folio) { vm_fault_t ret; unsigned int old_flags = vmf->flags; vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; if (vmf->vma->vm_file && IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host)) return VM_FAULT_SIGBUS; ret = vmf->vma->vm_ops->page_mkwrite(vmf); /* Restore original flags so that caller is not surprised */ vmf->flags = old_flags; if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) return ret; if (unlikely(!(ret & VM_FAULT_LOCKED))) { folio_lock(folio); if (!folio->mapping) { folio_unlock(folio); return 0; /* retry */ } ret |= VM_FAULT_LOCKED; } else VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); return ret; } /* * Handle dirtying of a page in shared file mapping on a write fault. * * The function expects the page to be locked and unlocks it. */ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct address_space *mapping; struct folio *folio = page_folio(vmf->page); bool dirtied; bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite; dirtied = folio_mark_dirty(folio); VM_BUG_ON_FOLIO(folio_test_anon(folio), folio); /* * Take a local copy of the address_space - folio.mapping may be zeroed * by truncate after folio_unlock(). The address_space itself remains * pinned by vma->vm_file's reference. We rely on folio_unlock()'s * release semantics to prevent the compiler from undoing this copying. */ mapping = folio_raw_mapping(folio); folio_unlock(folio); if (!page_mkwrite) file_update_time(vma->vm_file); /* * Throttle page dirtying rate down to writeback speed. * * mapping may be NULL here because some device drivers do not * set page.mapping but still dirty their pages * * Drop the mmap_lock before waiting on IO, if we can. The file * is pinning the mapping, as per above. */ if ((dirtied || page_mkwrite) && mapping) { struct file *fpin; fpin = maybe_unlock_mmap_for_io(vmf, NULL); balance_dirty_pages_ratelimited(mapping); if (fpin) { fput(fpin); return VM_FAULT_COMPLETED; } } return 0; } /* * Handle write page faults for pages that can be reused in the current vma * * This can happen either due to the mapping being with the VM_SHARED flag, * or due to us being the last reference standing to the page. In either * case, all we need to do here is to mark the page as writable and update * any related book-keeping. 
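 *
 * The caller enters with the PTE lock held; wp_page_reuse() updates the
 * entry and drops that lock via pte_unmap_unlock() before returning.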
*/ static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; pte_t entry; VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE)); VM_WARN_ON(is_zero_pfn(pte_pfn(vmf->orig_pte))); if (folio) { VM_BUG_ON(folio_test_anon(folio) && !PageAnonExclusive(vmf->page)); /* * Clear the folio's cpupid information as the existing * information potentially belongs to a now completely * unrelated process. */ folio_xchg_last_cpupid(folio, (1 << LAST_CPUPID_SHIFT) - 1); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = pte_mkyoung(vmf->orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); pte_unmap_unlock(vmf->pte, vmf->ptl); count_vm_event(PGREUSE); } /* * We could add a bitflag somewhere, but for now, we know that all * vm_ops that have a ->map_pages have been audited and don't need * the mmap_lock to be held. */ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK)) return 0; vma_end_read(vma); return VM_FAULT_RETRY; } /** * __vmf_anon_prepare - Prepare to handle an anonymous fault. * @vmf: The vm_fault descriptor passed from the fault handler. * * When preparing to insert an anonymous page into a VMA from a * fault handler, call this function rather than anon_vma_prepare(). * If this vma does not already have an associated anon_vma and we are * only protected by the per-VMA lock, the caller must retry with the * mmap_lock held. __anon_vma_prepare() will look at adjacent VMAs to * determine if this VMA can share its anon_vma, and that's not safe to * do with only the per-VMA lock held for this VMA. * * Return: 0 if fault handling can proceed. Any other value should be * returned to the caller. */ vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = 0; if (likely(vma->anon_vma)) return 0; if (vmf->flags & FAULT_FLAG_VMA_LOCK) { if (!mmap_read_trylock(vma->vm_mm)) return VM_FAULT_RETRY; } if (__anon_vma_prepare(vma)) ret = VM_FAULT_OOM; if (vmf->flags & FAULT_FLAG_VMA_LOCK) mmap_read_unlock(vma->vm_mm); return ret; } /* * Handle the case of a page which we actually need to copy to a new page, * either due to COW or unsharing. * * Called with mmap_lock locked and the old page referenced, but * without the ptl held. * * High level logic flow: * * - Allocate a page, copy the content of the old page to the new one. * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc. * - Take the PTL. If the pte changed, bail out and release the allocated page * - If the pte is still the way we remember it, update the page table and all * relevant references. This includes dropping the reference the page-table * held to the old page, as well as updating the rmap. * - In any case, unlock the PTL and drop the reference we took to the old page. 
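 *
 * Return value: 0 on success (including the benign case where another
 * thread changed the PTE first), VM_FAULT_OOM if allocating the new folio
 * fails, VM_FAULT_HWPOISON if copying the old page hit poison, or whatever
 * vmf_anon_prepare() reported.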
*/ static vm_fault_t wp_page_copy(struct vm_fault *vmf) { const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; struct folio *old_folio = NULL; struct folio *new_folio = NULL; pte_t entry; int page_copied = 0; struct mmu_notifier_range range; vm_fault_t ret; bool pfn_is_zero; delayacct_wpcopy_start(); if (vmf->page) old_folio = page_folio(vmf->page); ret = vmf_anon_prepare(vmf); if (unlikely(ret)) goto out; pfn_is_zero = is_zero_pfn(pte_pfn(vmf->orig_pte)); new_folio = folio_prealloc(mm, vma, vmf->address, pfn_is_zero); if (!new_folio) goto oom; if (!pfn_is_zero) { int err; err = __wp_page_copy_user(&new_folio->page, vmf->page, vmf); if (err) { /* * COW failed, if the fault was solved by other, * it's fine. If not, userspace would re-fault on * the same address and we will handle the fault * from the second attempt. * The -EHWPOISON case will not be retried. */ folio_put(new_folio); if (old_folio) folio_put(old_folio); delayacct_wpcopy_end(); return err == -EHWPOISON ? VM_FAULT_HWPOISON : 0; } kmsan_copy_page_meta(&new_folio->page, vmf->page); } __folio_mark_uptodate(new_folio); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); /* * Re-check the pte - we dropped the lock */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { if (old_folio) { if (!folio_test_anon(old_folio)) { dec_mm_counter(mm, mm_counter_file(old_folio)); inc_mm_counter(mm, MM_ANONPAGES); } } else { ksm_might_unmap_zero_page(mm, vmf->orig_pte); inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = folio_mk_pte(new_folio, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (unlikely(unshare)) { if (pte_soft_dirty(vmf->orig_pte)) entry = pte_mksoft_dirty(entry); if (pte_uffd_wp(vmf->orig_pte)) entry = pte_mkuffd_wp(entry); } else { entry = maybe_mkwrite(pte_mkdirty(entry), vma); } /* * Clear the pte entry and flush it first, before updating the * pte with the new entry, to keep TLBs on different CPUs in * sync. This code used to set the new PTE then flush TLBs, but * that left a window where the new PTE could be loaded into * some TLBs while the old PTE remains in others. */ ptep_clear_flush(vma, vmf->address, vmf->pte); folio_add_new_anon_rmap(new_folio, vma, vmf->address, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, vma); BUG_ON(unshare && pte_write(entry)); set_pte_at(mm, vmf->address, vmf->pte, entry); update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); if (old_folio) { /* * Only after switching the pte to the new page may * we remove the mapcount here. Otherwise another * process may come and find the rmap count decremented * before the pte is switched to the new page, and * "reuse" the old page writing into it while our pte * here still points into it and can be read by other * threads. * * The critical issue is to order this * folio_remove_rmap_pte() with the ptp_clear_flush * above. Those stores are ordered by (if nothing else,) * the barrier present in the atomic_add_negative * in folio_remove_rmap_pte(); * * Then the TLB flush in ptep_clear_flush ensures that * no process can access the old page before the * decremented mapcount is visible. And the old page * cannot be reused until after the decremented * mapcount is visible. 
So transitively, TLBs to * old page will be flushed before it can be reused. */ folio_remove_rmap_pte(old_folio, vmf->page, vma); } /* Free the old page.. */ new_folio = old_folio; page_copied = 1; pte_unmap_unlock(vmf->pte, vmf->ptl); } else if (vmf->pte) { update_mmu_tlb(vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); } mmu_notifier_invalidate_range_end(&range); if (new_folio) folio_put(new_folio); if (old_folio) { if (page_copied) free_swap_cache(old_folio); folio_put(old_folio); } delayacct_wpcopy_end(); return 0; oom: ret = VM_FAULT_OOM; out: if (old_folio) folio_put(old_folio); delayacct_wpcopy_end(); return ret; } /** * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE * writeable once the page is prepared * * @vmf: structure describing the fault * @folio: the folio of vmf->page * * This function handles all that is needed to finish a write page fault in a * shared mapping due to PTE being read-only once the mapped page is prepared. * It handles locking of PTE and modifying it. * * The function expects the page to be locked or other protection against * concurrent faults / writeback (such as DAX radix tree locks). * * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before * we acquired PTE lock. */ static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf, struct folio *folio) { WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!vmf->pte) return VM_FAULT_NOPAGE; /* * We might have raced with another page fault while we released the * pte_offset_map_lock. */ if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) { update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return VM_FAULT_NOPAGE; } wp_page_reuse(vmf, folio); return 0; } /* * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED * mapping */ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { vm_fault_t ret; pte_unmap_unlock(vmf->pte, vmf->ptl); ret = vmf_can_call_fault(vmf); if (ret) return ret; vmf->flags |= FAULT_FLAG_MKWRITE; ret = vma->vm_ops->pfn_mkwrite(vmf); if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)) return ret; return finish_mkwrite_fault(vmf, NULL); } wp_page_reuse(vmf, NULL); return 0; } static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = 0; folio_get(folio); if (vma->vm_ops && vma->vm_ops->page_mkwrite) { vm_fault_t tmp; pte_unmap_unlock(vmf->pte, vmf->ptl); tmp = vmf_can_call_fault(vmf); if (tmp) { folio_put(folio); return tmp; } tmp = do_page_mkwrite(vmf, folio); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { folio_put(folio); return tmp; } tmp = finish_mkwrite_fault(vmf, folio); if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { folio_unlock(folio); folio_put(folio); return tmp; } } else { wp_page_reuse(vmf, folio); folio_lock(folio); } ret |= fault_dirty_shared_page(vmf); folio_put(folio); return ret; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static bool __wp_can_reuse_large_anon_folio(struct folio *folio, struct vm_area_struct *vma) { bool exclusive = false; /* Let's just free up a large folio if only a single page is mapped. */ if (folio_large_mapcount(folio) <= 1) return false; /* * The assumption for anonymous folios is that each page can only get * mapped once into each MM. 
The only exception are KSM folios, which * are always small. * * Each taken mapcount must be paired with exactly one taken reference, * whereby the refcount must be incremented before the mapcount when * mapping a page, and the refcount must be decremented after the * mapcount when unmapping a page. * * If all folio references are from mappings, and all mappings are in * the page tables of this MM, then this folio is exclusive to this MM. */ if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids)) return false; VM_WARN_ON_ONCE(folio_test_ksm(folio)); if (unlikely(folio_test_swapcache(folio))) { /* * Note: freeing up the swapcache will fail if some PTEs are * still swap entries. */ if (!folio_trylock(folio)) return false; folio_free_swap(folio); folio_unlock(folio); } if (folio_large_mapcount(folio) != folio_ref_count(folio)) return false; /* Stabilize the mapcount vs. refcount and recheck. */ folio_lock_large_mapcount(folio); VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_ref_count(folio), folio); if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids)) goto unlock; if (folio_large_mapcount(folio) != folio_ref_count(folio)) goto unlock; VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_nr_pages(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_entire_mapcount(folio), folio); VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != vma->vm_mm->mm_id && folio_mm_id(folio, 1) != vma->vm_mm->mm_id); /* * Do we need the folio lock? Likely not. If there would have been * references from page migration/swapout, we would have detected * an additional folio reference and never ended up here. */ exclusive = true; unlock: folio_unlock_large_mapcount(folio); return exclusive; } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ static bool __wp_can_reuse_large_anon_folio(struct folio *folio, struct vm_area_struct *vma) { BUILD_BUG(); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static bool wp_can_reuse_anon_folio(struct folio *folio, struct vm_area_struct *vma) { if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && folio_test_large(folio)) return __wp_can_reuse_large_anon_folio(folio, vma); /* * We have to verify under folio lock: these early checks are * just an optimization to avoid locking the folio and freeing * the swapcache if there is little hope that we can reuse. * * KSM doesn't necessarily raise the folio refcount. */ if (folio_test_ksm(folio) || folio_ref_count(folio) > 3) return false; if (!folio_test_lru(folio)) /* * We cannot easily detect+handle references from * remote LRU caches or references to LRU folios. */ lru_add_drain(); if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio)) return false; if (!folio_trylock(folio)) return false; if (folio_test_swapcache(folio)) folio_free_swap(folio); if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) { folio_unlock(folio); return false; } /* * Ok, we've got the only folio reference from our mapping * and the folio is locked, it's dark out, and we're wearing * sunglasses. Hit it. */ folio_move_anon_rmap(folio, vma); folio_unlock(folio); return true; } /* * This routine handles present pages, when * * users try to write to a shared page (FAULT_FLAG_WRITE) * * GUP wants to take a R/O pin on a possibly shared anonymous page * (FAULT_FLAG_UNSHARE) * * It is done by copying the page to a new address and decrementing the * shared-page counter for the old page. * * Note that this routine assumes that the protection checks have been * done by the caller (the low-level page fault routine in most cases). 
* Thus, with FAULT_FLAG_WRITE, we can safely just mark it writable once we've * done any necessary COW. * * In case of FAULT_FLAG_WRITE, we also mark the page dirty at this point even * though the page will change only once the write actually happens. This * avoids a few races, and potentially makes it more efficient. * * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), with pte both mapped and locked. * We return with mmap_lock still held, but pte unmapped and unlocked. */ static vm_fault_t do_wp_page(struct vm_fault *vmf) __releases(vmf->ptl) { const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; struct folio *folio = NULL; pte_t pte; if (likely(!unshare)) { if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) { if (!userfaultfd_wp_async(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_WP); } /* * Nothing needed (cache flush, TLB invalidations, * etc.) because we're only removing the uffd-wp bit, * which is completely invisible to the user. */ pte = pte_clear_uffd_wp(ptep_get(vmf->pte)); set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); /* * Update this to be prepared for following up CoW * handling */ vmf->orig_pte = pte; } /* * Userfaultfd write-protect can defer flushes. Ensure the TLB * is flushed in this case before copying. */ if (unlikely(userfaultfd_wp(vmf->vma) && mm_tlb_flush_pending(vmf->vma->vm_mm))) flush_tlb_page(vmf->vma, vmf->address); } vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); if (vmf->page) folio = page_folio(vmf->page); /* * Shared mapping: we are guaranteed to have VM_WRITE and * FAULT_FLAG_WRITE set at this point. */ if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { /* * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a * VM_PFNMAP VMA. FS DAX also wants ops->pfn_mkwrite called. * * We should not cow pages in a shared writeable mapping. * Just mark the pages writable and/or call ops->pfn_mkwrite. */ if (!vmf->page || is_fsdax_page(vmf->page)) { vmf->page = NULL; return wp_pfn_shared(vmf); } return wp_page_shared(vmf, folio); } /* * Private mapping: create an exclusive anonymous page copy if reuse * is impossible. We might miss VM_WRITE for FOLL_FORCE handling. * * If we encounter a page that is marked exclusive, we must reuse * the page without further checks. */ if (folio && folio_test_anon(folio) && (PageAnonExclusive(vmf->page) || wp_can_reuse_anon_folio(folio, vma))) { if (!PageAnonExclusive(vmf->page)) SetPageAnonExclusive(vmf->page); if (unlikely(unshare)) { pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } wp_page_reuse(vmf, folio); return 0; } /* * Ok, we need to copy. Oh, well.. 
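 *
 * Take a reference on the old folio (if there is one) so it cannot be
 * freed once we drop the PTE lock and hand over to wp_page_copy().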
*/ if (folio) folio_get(folio); pte_unmap_unlock(vmf->pte, vmf->ptl); #ifdef CONFIG_KSM if (folio && folio_test_ksm(folio)) count_vm_event(COW_KSM); #endif return wp_page_copy(vmf); } static inline void unmap_mapping_range_tree(struct rb_root_cached *root, pgoff_t first_index, pgoff_t last_index, struct zap_details *details) { struct vm_area_struct *vma; unsigned long start, size; struct mmu_gather tlb; vma_interval_tree_foreach(vma, root, first_index, last_index) { const pgoff_t start_idx = max(first_index, vma->vm_pgoff); const pgoff_t end_idx = min(last_index, vma_last_pgoff(vma)) + 1; start = vma->vm_start + ((start_idx - vma->vm_pgoff) << PAGE_SHIFT); size = (end_idx - start_idx) << PAGE_SHIFT; tlb_gather_mmu(&tlb, vma->vm_mm); zap_vma_range_batched(&tlb, vma, start, size, details); tlb_finish_mmu(&tlb); } } /** * unmap_mapping_folio() - Unmap single folio from processes. * @folio: The locked folio to be unmapped. * * Unmap this folio from any userspace process which still has it mmaped. * Typically, for efficiency, the range of nearby pages has already been * unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once * truncation or invalidation holds the lock on a folio, it may find that * the page has been remapped again: and then uses unmap_mapping_folio() * to unmap it finally. */ void unmap_mapping_folio(struct folio *folio) { struct address_space *mapping = folio->mapping; struct zap_details details = { }; pgoff_t first_index; pgoff_t last_index; VM_BUG_ON(!folio_test_locked(folio)); first_index = folio->index; last_index = folio_next_index(folio) - 1; details.skip_cows = true; details.single_folio = folio; details.zap_flags = ZAP_FLAG_DROP_MARKER; i_mmap_lock_read(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) unmap_mapping_range_tree(&mapping->i_mmap, first_index, last_index, &details); i_mmap_unlock_read(mapping); } /** * unmap_mapping_pages() - Unmap pages from processes. * @mapping: The address space containing pages to be unmapped. * @start: Index of first page to be unmapped. * @nr: Number of pages to be unmapped. 0 to unmap to end of file. * @even_cows: Whether to unmap even private COWed pages. * * Unmap the pages in this address space from any userspace process which * has them mmaped. Generally, you want to remove COWed pages as well when * a file is being truncated, but not when invalidating pages from the page * cache. */ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows) { struct zap_details details = { }; pgoff_t first_index = start; pgoff_t last_index = start + nr - 1; details.skip_cows = !even_cows; if (last_index < first_index) last_index = ULONG_MAX; i_mmap_lock_read(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) unmap_mapping_range_tree(&mapping->i_mmap, first_index, last_index, &details); i_mmap_unlock_read(mapping); } EXPORT_SYMBOL_GPL(unmap_mapping_pages); /** * unmap_mapping_range - unmap the portion of all mmaps in the specified * address_space corresponding to the specified byte range in the underlying * file. * * @mapping: the address space containing mmaps to be unmapped. * @holebegin: byte in first page to unmap, relative to the start of * the underlying file. This will be rounded down to a PAGE_SIZE * boundary. Note that this is different from truncate_pagecache(), which * must keep the partial page. In contrast, we must get rid of * partial pages. * @holelen: size of prospective hole in bytes. This will be rounded * up to a PAGE_SIZE boundary. 
A holelen of zero truncates to the * end of the file. * @even_cows: 1 when truncating a file, unmap even private COWed pages; * but 0 when invalidating pagecache, don't throw away private data. */ void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { pgoff_t hba = (pgoff_t)(holebegin) >> PAGE_SHIFT; pgoff_t hlen = ((pgoff_t)(holelen) + PAGE_SIZE - 1) >> PAGE_SHIFT; /* Check for overflow. */ if (sizeof(holelen) > sizeof(hlen)) { long long holeend = (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; if (holeend & ~(long long)ULONG_MAX) hlen = ULONG_MAX - hba + 1; } unmap_mapping_pages(mapping, hba, hlen, even_cows); } EXPORT_SYMBOL(unmap_mapping_range); /* * Restore a potential device exclusive pte to a working pte entry */ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) { struct folio *folio = page_folio(vmf->page); struct vm_area_struct *vma = vmf->vma; struct mmu_notifier_range range; vm_fault_t ret; /* * We need a reference to lock the folio because we don't hold * the PTL so a racing thread can remove the device-exclusive * entry and unmap it. If the folio is free the entry must * have been removed already. If it happens to have already * been re-allocated after being freed all we do is lock and * unlock it. */ if (!folio_try_get(folio)) return 0; ret = folio_lock_or_retry(folio, vmf); if (ret) { folio_put(folio); return ret; } mmu_notifier_range_init_owner(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL); mmu_notifier_invalidate_range_start(&range); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) restore_exclusive_pte(vma, folio, vmf->page, vmf->address, vmf->pte, vmf->orig_pte); if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); folio_unlock(folio); folio_put(folio); mmu_notifier_invalidate_range_end(&range); return 0; } /* * Check if we should call folio_free_swap to free the swap cache. * folio_free_swap only frees the swap cache to release the slot if swap * count is zero, so we don't need to check the swap count here. */ static inline bool should_try_to_free_swap(struct swap_info_struct *si, struct folio *folio, struct vm_area_struct *vma, unsigned int extra_refs, unsigned int fault_flags) { if (!folio_test_swapcache(folio)) return false; /* * Always try to free swap cache for SWP_SYNCHRONOUS_IO devices. Swap * cache can help save some IO or memory overhead, but these devices * are fast, and meanwhile, swap cache pinning the slot deferring the * release of metadata or fragmentation is a more critical issue. */ if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) return true; if (mem_cgroup_swap_full(folio) || (vma->vm_flags & VM_LOCKED) || folio_test_mlocked(folio)) return true; /* * If we want to map a page that's in the swapcache writable, we * have to detect via the refcount if we're really the exclusive * user. Try freeing the swapcache to get rid of the swapcache * reference only in case it's likely that we'll be the exclusive user. */ return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) && folio_ref_count(folio) == (extra_refs + folio_nr_pages(folio)); } static vm_fault_t pte_marker_clear(struct vm_fault *vmf) { vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!vmf->pte) return 0; /* * Be careful so that we will only recover a special uffd-wp pte into a * none pte. 
Otherwise it means the pte could have changed, so retry. * * This should also cover the case where e.g. the pte changed * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED. * So pte_is_marker() check is not enough to safely drop the pte. */ if (pte_same(vmf->orig_pte, ptep_get(vmf->pte))) pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } static vm_fault_t do_pte_missing(struct vm_fault *vmf) { if (vma_is_anonymous(vmf->vma)) return do_anonymous_page(vmf); else return do_fault(vmf); } /* * This is actually a page-missing access, but with uffd-wp special pte * installed. It means this pte was wr-protected before being unmapped. */ static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf) { /* * Just in case there're leftover special ptes even after the region * got unregistered - we can simply clear them. */ if (unlikely(!userfaultfd_wp(vmf->vma))) return pte_marker_clear(vmf); return do_pte_missing(vmf); } static vm_fault_t handle_pte_marker(struct vm_fault *vmf) { const softleaf_t entry = softleaf_from_pte(vmf->orig_pte); const pte_marker marker = softleaf_to_marker(entry); /* * PTE markers should never be empty. If anything weird happened, * the best thing to do is to kill the process along with its mm. */ if (WARN_ON_ONCE(!marker)) return VM_FAULT_SIGBUS; /* Higher priority than uffd-wp when data corrupted */ if (marker & PTE_MARKER_POISONED) return VM_FAULT_HWPOISON; /* Hitting a guard page is always a fatal condition. */ if (marker & PTE_MARKER_GUARD) return VM_FAULT_SIGSEGV; if (softleaf_is_uffd_wp_marker(entry)) return pte_marker_handle_uffd_wp(vmf); /* This is an unknown pte marker */ return VM_FAULT_SIGBUS; } static struct folio *__alloc_swap_folio(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio; softleaf_t entry; folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address); if (!folio) return NULL; entry = softleaf_from_pte(vmf->orig_pte); if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, GFP_KERNEL, entry)) { folio_put(folio); return NULL; } return folio; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * Check if the PTEs within a range are contiguous swap entries * and have consistent swapcache, zeromap. */ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) { unsigned long addr; softleaf_t entry; int idx; pte_t pte; addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE); idx = (vmf->address - addr) / PAGE_SIZE; pte = ptep_get(ptep); if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx))) return false; entry = softleaf_from_pte(pte); if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages) return false; /* * swap_read_folio() can't handle the case a large folio is hybridly * from different backends. And they are likely corner cases. Similar * things might be added once zswap support large folios. */ if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages)) return false; if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages)) return false; return true; } static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset, unsigned long addr, unsigned long orders) { int order, nr; order = highest_order(orders); /* * To swap in a THP with nr pages, we require that its first swap_offset * is aligned with that number, as it was when the THP was swapped out. * This helps filter out most invalid entries. 
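 * For example, to swap in an order-4 (16-page) folio, the faulting
 * page's index within its 16-page virtual block must match the swap
 * offset's index within its 16-page block, i.e.
 * (addr >> PAGE_SHIFT) % 16 == swp_offset % 16; orders for which this
 * does not hold are dropped from the mask below.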
*/ while (orders) { nr = 1 << order; if ((addr >> PAGE_SHIFT) % nr == swp_offset % nr) break; order = next_order(&orders, order); } return orders; } static struct folio *alloc_swap_folio(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; unsigned long orders; struct folio *folio; unsigned long addr; softleaf_t entry; spinlock_t *ptl; pte_t *pte; gfp_t gfp; int order; /* * If uffd is active for the vma we need per-page fault fidelity to * maintain the uffd semantics. */ if (unlikely(userfaultfd_armed(vma))) goto fallback; /* * A large swapped out folio could be partially or fully in zswap. We * lack handling for such cases, so fallback to swapping in order-0 * folio. */ if (!zswap_never_enabled()) goto fallback; entry = softleaf_from_pte(vmf->orig_pte); /* * Get a list of all the (large) orders below PMD_ORDER that are enabled * and suitable for swapping THP. */ orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT, BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); orders = thp_swap_suitable_orders(swp_offset(entry), vmf->address, orders); if (!orders) goto fallback; pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address & PMD_MASK, &ptl); if (unlikely(!pte)) goto fallback; /* * For do_swap_page, find the highest order where the aligned range is * completely swap entries with contiguous swap offsets. */ order = highest_order(orders); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); if (can_swapin_thp(vmf, pte + pte_index(addr), 1 << order)) break; order = next_order(&orders, order); } pte_unmap_unlock(pte, ptl); /* Try allocating the highest of the remaining orders. */ gfp = vma_thp_gfp_mask(vma); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); folio = vma_alloc_folio(gfp, order, vma, addr); if (folio) { if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) return folio; count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE); folio_put(folio); } count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK); order = next_order(&orders, order); } fallback: return __alloc_swap_folio(vmf); } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ static struct folio *alloc_swap_folio(struct vm_fault *vmf) { return __alloc_swap_folio(vmf); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* Sanity check that a folio is fully exclusive */ static void check_swap_exclusive(struct folio *folio, swp_entry_t entry, unsigned int nr_pages) { /* Called under PT locked and folio locked, the swap count is stable */ do { VM_WARN_ON_ONCE_FOLIO(__swap_count(entry) != 1, folio); entry.val++; } while (--nr_pages); } /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. * We return with pte unmapped and unlocked. * * We return with the mmap_lock locked or unlocked in the same cases * as does filemap_fault(). 
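 * Roughly: non-swap softleaf entries (migration, device private or
 * exclusive, hwpoison, pte markers) are dispatched first; otherwise the
 * folio is looked up in, or freshly read into, the swap cache, the PTE
 * is re-locked and re-checked with pte_same(), and the folio is mapped,
 * possibly as a batch of PTEs when a large folio sits entirely inside
 * the VMA and a single PMD's worth of page table.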
*/ vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *swapcache = NULL, *folio; struct page *page; struct swap_info_struct *si = NULL; rmap_t rmap_flags = RMAP_NONE; bool exclusive = false; softleaf_t entry; pte_t pte; vm_fault_t ret = 0; int nr_pages; unsigned long page_idx; unsigned long address; pte_t *ptep; if (!pte_unmap_same(vmf)) goto out; entry = softleaf_from_pte(vmf->orig_pte); if (unlikely(!softleaf_is_swap(entry))) { if (softleaf_is_migration(entry)) { migration_entry_wait(vma->vm_mm, vmf->pmd, vmf->address); } else if (softleaf_is_device_exclusive(entry)) { vmf->page = softleaf_to_page(entry); ret = remove_device_exclusive_entry(vmf); } else if (softleaf_is_device_private(entry)) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) { /* * migrate_to_ram is not yet ready to operate * under VMA lock. */ vma_end_read(vma); ret = VM_FAULT_RETRY; goto out; } vmf->page = softleaf_to_page(entry); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) goto unlock; /* * Get a page reference while we know the page can't be * freed. */ if (trylock_page(vmf->page)) { struct dev_pagemap *pgmap; get_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); pgmap = page_pgmap(vmf->page); ret = pgmap->ops->migrate_to_ram(vmf); unlock_page(vmf->page); put_page(vmf->page); } else { pte_unmap(vmf->pte); softleaf_entry_wait_on_locked(entry, vmf->ptl); } } else if (softleaf_is_hwpoison(entry)) { ret = VM_FAULT_HWPOISON; } else if (softleaf_is_marker(entry)) { ret = handle_pte_marker(vmf); } else { print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL); ret = VM_FAULT_SIGBUS; } goto out; } /* Prevent swapoff from happening to us. */ si = get_swap_device(entry); if (unlikely(!si)) goto out; folio = swap_cache_get_folio(entry); if (folio) swap_update_readahead(folio, vma, vmf->address); if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { folio = alloc_swap_folio(vmf); if (folio) { /* * folio is charged, so swapin can only fail due * to raced swapin and return NULL. */ swapcache = swapin_folio(entry, folio); if (swapcache != folio) folio_put(folio); folio = swapcache; } } else { folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf); } if (!folio) { /* * Back out if somebody else faulted in this pte * while we released the pte lock. */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) ret = VM_FAULT_OOM; goto unlock; } /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); } swapcache = folio; ret |= folio_lock_or_retry(folio, vmf); if (ret & VM_FAULT_RETRY) goto out_release; page = folio_file_page(folio, swp_offset(entry)); /* * Make sure folio_free_swap() or swapoff did not release the * swapcache from under us. The page pin, and pte_same test * below, are not enough to exclude that. Even if it is still * swapcache, we need to check that the page's swap has not * changed. 
*/ if (unlikely(!folio_matches_swap_entry(folio, entry))) goto out_page; if (unlikely(PageHWPoison(page))) { /* * hwpoisoned dirty swapcache pages are kept for killing * owner processes (which may be unknown at hwpoison time) */ ret = VM_FAULT_HWPOISON; goto out_page; } /* * KSM sometimes has to copy on read faults, for example, if * folio->index of non-ksm folios would be nonlinear inside the * anon VMA -- the ksm flag is lost on actual swapout. */ folio = ksm_might_need_to_copy(folio, vma, vmf->address); if (unlikely(!folio)) { ret = VM_FAULT_OOM; folio = swapcache; goto out_page; } else if (unlikely(folio == ERR_PTR(-EHWPOISON))) { ret = VM_FAULT_HWPOISON; folio = swapcache; goto out_page; } else if (folio != swapcache) page = folio_page(folio, 0); /* * If we want to map a page that's in the swapcache writable, we * have to detect via the refcount if we're really the exclusive * owner. Try removing the extra reference from the local LRU * caches if required. */ if ((vmf->flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) && !folio_test_lru(folio)) lru_add_drain(); folio_throttle_swaprate(folio, GFP_KERNEL); /* * Back out if somebody else already faulted in this pte. */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) goto out_nomap; if (unlikely(!folio_test_uptodate(folio))) { ret = VM_FAULT_SIGBUS; goto out_nomap; } nr_pages = 1; page_idx = 0; address = vmf->address; ptep = vmf->pte; if (folio_test_large(folio) && folio_test_swapcache(folio)) { int nr = folio_nr_pages(folio); unsigned long idx = folio_page_idx(folio, page); unsigned long folio_start = address - idx * PAGE_SIZE; unsigned long folio_end = folio_start + nr * PAGE_SIZE; pte_t *folio_ptep; pte_t folio_pte; if (unlikely(folio_start < max(address & PMD_MASK, vma->vm_start))) goto check_folio; if (unlikely(folio_end > pmd_addr_end(address, vma->vm_end))) goto check_folio; folio_ptep = vmf->pte - idx; folio_pte = ptep_get(folio_ptep); if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) || swap_pte_batch(folio_ptep, nr, folio_pte) != nr) goto check_folio; page_idx = idx; address = folio_start; ptep = folio_ptep; nr_pages = nr; entry = folio->swap; page = &folio->page; } check_folio: /* * PG_anon_exclusive reuses PG_mappedtodisk for anon pages. A swap pte * must never point at an anonymous page in the swapcache that is * PG_anon_exclusive. Sanity check that this holds and especially, that * no filesystem set PG_mappedtodisk on a page in the swapcache. Sanity * check after taking the PT lock and making sure that nobody * concurrently faulted in this page and set PG_anon_exclusive. */ BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio)); BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page)); /* * If a large folio already belongs to anon mapping, then we * can just go on and map it partially. * If not, with the large swapin check above failing, the page table * have changed, so sub pages might got charged to the wrong cgroup, * or even should be shmem. So we have to free it and fallback. * Nothing should have touched it, both anon and shmem checks if a * large folio is fully appliable before use. * * This will be removed once we unify folio allocation in the swap cache * layer, where allocation of a folio stabilizes the swap entries. 
*/ if (!folio_test_anon(folio) && folio_test_large(folio) && nr_pages != folio_nr_pages(folio)) { if (!WARN_ON_ONCE(folio_test_dirty(folio))) swap_cache_del_folio(folio); goto out_nomap; } /* * Check under PT lock (to protect against concurrent fork() sharing * the swap entry concurrently) for certainly exclusive pages. */ if (!folio_test_ksm(folio)) { /* * The can_swapin_thp check above ensures all PTE have * same exclusiveness. Checking just one PTE is fine. */ exclusive = pte_swp_exclusive(vmf->orig_pte); if (exclusive) check_swap_exclusive(folio, entry, nr_pages); if (folio != swapcache) { /* * We have a fresh page that is not exposed to the * swapcache -> certainly exclusive. */ exclusive = true; } else if (exclusive && folio_test_writeback(folio) && data_race(si->flags & SWP_STABLE_WRITES)) { /* * This is tricky: not all swap backends support * concurrent page modifications while under writeback. * * So if we stumble over such a page in the swapcache * we must not set the page exclusive, otherwise we can * map it writable without further checks and modify it * while still under writeback. * * For these problematic swap backends, simply drop the * exclusive marker: this is perfectly fine as we start * writeback only if we fully unmapped the page and * there are no unexpected references on the page after * unmapping succeeded. After fully unmapped, no * further GUP references (FOLL_GET and FOLL_PIN) can * appear, so dropping the exclusive marker and mapping * it only R/O is fine. */ exclusive = false; } } /* * Some architectures may have to restore extra metadata to the page * when reading from swap. This metadata may be indexed by swap entry * so this must be called before folio_put_swap(). */ arch_swap_restore(folio_swap(entry, folio), folio); add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages); pte = mk_pte(page, vma->vm_page_prot); if (pte_swp_soft_dirty(vmf->orig_pte)) pte = pte_mksoft_dirty(pte); if (pte_swp_uffd_wp(vmf->orig_pte)) pte = pte_mkuffd_wp(pte); /* * Same logic as in do_wp_page(); however, optimize for pages that are * certainly not shared either because we just allocated them without * exposing them to the swapcache or because the swap entry indicates * exclusivity. */ if (!folio_test_ksm(folio) && (exclusive || folio_ref_count(folio) == 1)) { if ((vma->vm_flags & VM_WRITE) && !userfaultfd_pte_wp(vma, pte) && !pte_needs_soft_dirty_wp(vma, pte)) { pte = pte_mkwrite(pte, vma); if (vmf->flags & FAULT_FLAG_WRITE) { pte = pte_mkdirty(pte); vmf->flags &= ~FAULT_FLAG_WRITE; } } rmap_flags |= RMAP_EXCLUSIVE; } folio_ref_add(folio, nr_pages - 1); flush_icache_pages(vma, page, nr_pages); vmf->orig_pte = pte_advance_pfn(pte, page_idx); /* ksm created a completely new copy */ if (unlikely(folio != swapcache)) { folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); folio_put_swap(swapcache, NULL); } else if (!folio_test_anon(folio)) { /* * We currently only expect !anon folios that are fully * mappable. See the comment after can_swapin_thp above. */ VM_WARN_ON_ONCE_FOLIO(folio_nr_pages(folio) != nr_pages, folio); VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio); folio_add_new_anon_rmap(folio, vma, address, rmap_flags); folio_put_swap(folio, NULL); } else { VM_WARN_ON_ONCE(nr_pages != 1 && nr_pages != folio_nr_pages(folio)); folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address, rmap_flags); folio_put_swap(folio, nr_pages == 1 ? 
page : NULL); } VM_BUG_ON(!folio_test_anon(folio) || (pte_write(pte) && !PageAnonExclusive(page))); set_ptes(vma->vm_mm, address, ptep, pte, nr_pages); arch_do_swap_page_nr(vma->vm_mm, vma, address, pte, pte, nr_pages); /* * Remove the swap entry and conditionally try to free up the swapcache. * Do it after mapping, so raced page faults will likely see the folio * in swap cache and wait on the folio lock. */ if (should_try_to_free_swap(si, folio, vma, nr_pages, vmf->flags)) folio_free_swap(folio); folio_unlock(folio); if (unlikely(folio != swapcache)) { /* * Hold the lock to avoid the swap entry to be reused * until we take the PT lock for the pte_same() check * (to avoid false positives from pte_same). For * further safety release the lock after the folio_put_swap * so that the swap count won't change under a * parallel locked swapcache. */ folio_unlock(swapcache); folio_put(swapcache); } if (vmf->flags & FAULT_FLAG_WRITE) { ret |= do_wp_page(vmf); if (ret & VM_FAULT_ERROR) ret &= VM_FAULT_ERROR; goto out; } /* No need to invalidate - it was non-present before */ update_mmu_cache_range(vmf, vma, address, ptep, nr_pages); unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); out: if (si) put_swap_device(si); return ret; out_nomap: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: if (folio_test_swapcache(folio)) folio_free_swap(folio); folio_unlock(folio); out_release: folio_put(folio); if (folio != swapcache) { folio_unlock(swapcache); folio_put(swapcache); } if (si) put_swap_device(si); return ret; } static bool pte_range_none(pte_t *pte, int nr_pages) { int i; for (i = 0; i < nr_pages; i++) { if (!pte_none(ptep_get_lockless(pte + i))) return false; } return true; } static struct folio *alloc_anon_folio(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; #ifdef CONFIG_TRANSPARENT_HUGEPAGE unsigned long orders; struct folio *folio; unsigned long addr; pte_t *pte; gfp_t gfp; int order; /* * If uffd is active for the vma we need per-page fault fidelity to * maintain the uffd semantics. */ if (unlikely(userfaultfd_armed(vma))) goto fallback; /* * Get a list of all the (large) orders below PMD_ORDER that are enabled * for this vma. Then filter out the orders that can't be allocated over * the faulting address and still be fully contained in the vma. */ orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT, BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); if (!orders) goto fallback; pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK); if (!pte) return ERR_PTR(-EAGAIN); /* * Find the highest order where the aligned range is completely * pte_none(). Note that all remaining orders will be completely * pte_none(). */ order = highest_order(orders); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); if (pte_range_none(pte + pte_index(addr), 1 << order)) break; order = next_order(&orders, order); } pte_unmap(pte); if (!orders) goto fallback; /* Try allocating the highest of the remaining orders. 
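 * Walk the remaining orders from highest to lowest: when the allocation
 * or the memcg charge fails, fall back to the next enabled order, and
 * ultimately to a single order-0 page via folio_prealloc() below.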
*/ gfp = vma_thp_gfp_mask(vma); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); folio = vma_alloc_folio(gfp, order, vma, addr); if (folio) { if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); folio_put(folio); goto next; } folio_throttle_swaprate(folio, gfp); /* * When a folio is not zeroed during allocation * (__GFP_ZERO not used) or user folios require special * handling, folio_zero_user() is used to make sure * that the page corresponding to the faulting address * will be hot in the cache after zeroing. */ if (user_alloc_needs_zeroing()) folio_zero_user(folio, vmf->address); return folio; } next: count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); order = next_order(&orders, order); } fallback: #endif return folio_prealloc(vma->vm_mm, vma, vmf->address, true); } void map_anon_folio_pte_nopf(struct folio *folio, pte_t *pte, struct vm_area_struct *vma, unsigned long addr, bool uffd_wp) { const unsigned int nr_pages = folio_nr_pages(folio); pte_t entry = folio_mk_pte(folio, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry), vma); if (uffd_wp) entry = pte_mkuffd_wp(entry); folio_ref_add(folio, nr_pages - 1); folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); set_ptes(vma->vm_mm, addr, pte, entry, nr_pages); update_mmu_cache_range(NULL, vma, addr, pte, nr_pages); } static void map_anon_folio_pte_pf(struct folio *folio, pte_t *pte, struct vm_area_struct *vma, unsigned long addr, bool uffd_wp) { const unsigned int order = folio_order(folio); map_anon_folio_pte_nopf(folio, pte, vma, addr, uffd_wp); add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1L << order); count_mthp_stat(order, MTHP_STAT_ANON_FAULT_ALLOC); } /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_lock still held, but pte unmapped and unlocked. */ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; unsigned long addr = vmf->address; struct folio *folio; vm_fault_t ret = 0; int nr_pages; pte_t entry; /* File mapping without ->vm_ops ? */ if (vma->vm_flags & VM_SHARED) return VM_FAULT_SIGBUS; /* * Use pte_alloc() instead of pte_alloc_map(), so that OOM can * be distinguished from a transient failure of pte_offset_map(). */ if (pte_alloc(vma->vm_mm, vmf->pmd)) return VM_FAULT_OOM; /* Use the zero-page for reads */ if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm)) { entry = pte_mkspecial(pfn_pte(zero_pfn(vmf->address), vma->vm_page_prot)); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!vmf->pte) goto unlock; if (vmf_pte_changed(vmf)) { update_mmu_tlb(vma, vmf->address, vmf->pte); goto unlock; } ret = check_stable_address_space(vma->vm_mm); if (ret) goto unlock; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_MISSING); } if (vmf_orig_pte_uffd_wp(vmf)) entry = pte_mkuffd_wp(entry); set_pte_at(vma->vm_mm, addr, vmf->pte, entry); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, vmf->pte); goto unlock; } /* Allocate our own private page. 
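 * vmf_anon_prepare() makes sure the VMA has an anon_vma; alloc_anon_folio()
 * may hand back a large (mTHP) folio when the naturally aligned range
 * around the faulting address is still entirely pte_none().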
*/ ret = vmf_anon_prepare(vmf); if (ret) return ret; /* Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault */ folio = alloc_anon_folio(vmf); if (IS_ERR(folio)) return 0; if (!folio) goto oom; nr_pages = folio_nr_pages(folio); addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE); /* * The memory barrier inside __folio_mark_uptodate makes sure that * preceding stores to the page contents become visible before * the set_pte_at() write. */ __folio_mark_uptodate(folio); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); if (!vmf->pte) goto release; if (nr_pages == 1 && vmf_pte_changed(vmf)) { update_mmu_tlb(vma, addr, vmf->pte); goto release; } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages); goto release; } ret = check_stable_address_space(vma->vm_mm); if (ret) goto release; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); folio_put(folio); return handle_userfault(vmf, VM_UFFD_MISSING); } map_anon_folio_pte_pf(folio, vmf->pte, vma, addr, vmf_orig_pte_uffd_wp(vmf)); unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; release: folio_put(folio); goto unlock; oom: return VM_FAULT_OOM; } /* * The mmap_lock must have been held on entry, and may have been * released depending on flags and vma->vm_ops->fault() return value. * See filemap_fault() and __lock_page_retry(). */ static vm_fault_t __do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio; vm_fault_t ret; /* * Preallocate pte before we take page_lock because this might lead to * deadlocks for memcg reclaim which waits for pages under writeback: * lock_page(A) * SetPageWriteback(A) * unlock_page(A) * lock_page(B) * lock_page(B) * pte_alloc_one * shrink_folio_list * wait_on_page_writeback(A) * SetPageWriteback(B) * unlock_page(B) * # flush A, B to clear the writeback */ if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) { vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; } ret = vma->vm_ops->fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | VM_FAULT_DONE_COW))) return ret; folio = page_folio(vmf->page); if (unlikely(PageHWPoison(vmf->page))) { vm_fault_t poisonret = VM_FAULT_HWPOISON; if (ret & VM_FAULT_LOCKED) { if (page_mapped(vmf->page)) unmap_mapping_folio(folio); /* Retry if a clean folio was removed from the cache. */ if (mapping_evict_folio(folio->mapping, folio)) poisonret = VM_FAULT_NOPAGE; folio_unlock(folio); } folio_put(folio); vmf->page = NULL; return poisonret; } if (unlikely(!(ret & VM_FAULT_LOCKED))) folio_lock(folio); else VM_BUG_ON_PAGE(!folio_test_locked(folio), vmf->page); return ret; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void deposit_prealloc_pte(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); /* * We are going to consume the prealloc table, * count that as nr_ptes. 
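 * The deposited table is withdrawn again when the huge PMD is later
 * split or zapped (not shown here), keeping the nr_ptes accounting
 * balanced.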
*/ mm_inc_nr_ptes(vma->vm_mm); vmf->prealloc_pte = NULL; } vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t entry; vm_fault_t ret = VM_FAULT_FALLBACK; /* * It is too late to allocate a small folio, we already have a large * folio in the pagecache: especially s390 KVM cannot tolerate any * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any * PMD mappings if THPs are disabled. As we already have a THP, * behave as if we are forcing a collapse. */ if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags, /* forced_collapse=*/ true)) return ret; if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return ret; if (!is_pmd_order(folio_order(folio))) return ret; page = &folio->page; /* * Just backoff if any subpage of a THP is corrupted otherwise * the corrupted page may mapped by PMD silently to escape the * check. This kind of THP just can be PTE mapped. Access to * the corrupted subpage should trigger SIGBUS as expected. */ if (unlikely(folio_test_has_hwpoisoned(folio))) return ret; /* * Archs like ppc64 need additional space to store information * related to pte entry. Use the preallocated table for that. */ if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) { vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; } vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_none(*vmf->pmd))) goto out; flush_icache_pages(vma, page, HPAGE_PMD_NR); entry = folio_mk_pmd(folio, vma->vm_page_prot); if (write) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); add_mm_counter(vma->vm_mm, mm_counter_file(folio), HPAGE_PMD_NR); folio_add_file_rmap_pmd(folio, page, vma); /* * deposit and withdraw with pmd lock held */ if (arch_needs_pgtable_deposit()) deposit_prealloc_pte(vmf); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); update_mmu_cache_pmd(vma, haddr, vmf->pmd); /* fault is handled */ ret = 0; count_vm_event(THP_FILE_MAPPED); out: spin_unlock(vmf->ptl); return ret; } #else vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page) { return VM_FAULT_FALLBACK; } #endif /** * set_pte_range - Set a range of PTEs to point to pages in a folio. * @vmf: Fault description. * @folio: The folio that contains @page. * @page: The first page to create a PTE for. * @nr: The number of PTEs to create. * @addr: The first address to create a PTE for. 
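 * Caller holds the PTE lock covering @addr; every one of the @nr pages
 * must belong to @folio, and they are mapped contiguously starting at
 * @addr.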
*/ void set_pte_range(struct vm_fault *vmf, struct folio *folio, struct page *page, unsigned int nr, unsigned long addr) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; bool prefault = !in_range(vmf->address, addr, nr * PAGE_SIZE); pte_t entry; flush_icache_pages(vma, page, nr); entry = mk_pte(page, vma->vm_page_prot); if (prefault && arch_wants_old_prefaulted_pte()) entry = pte_mkold(entry); else entry = pte_sw_mkyoung(entry); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); else if (pte_write(entry) && folio_test_dirty(folio)) entry = pte_mkdirty(entry); if (unlikely(vmf_orig_pte_uffd_wp(vmf))) entry = pte_mkuffd_wp(entry); /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { VM_BUG_ON_FOLIO(nr != 1, folio); folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); } else { folio_add_file_rmap_ptes(folio, page, nr, vma); } set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr); /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr); } static bool vmf_pte_changed(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID) return !pte_same(ptep_get(vmf->pte), vmf->orig_pte); return !pte_none(ptep_get(vmf->pte)); } /** * finish_fault - finish page fault once we have prepared the page to fault * * @vmf: structure describing the fault * * This function handles all that is needed to finish a page fault once the * page to fault in is prepared. It handles locking of PTEs, inserts PTE for * given page, adds reverse page mapping, handles memcg charges and LRU * addition. * * The function expects the page to be locked and on success it consumes a * reference of a page being mapped (for the PTE which maps it). * * Return: %0 on success, %VM_FAULT_ code in case of error. */ vm_fault_t finish_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page; struct folio *folio; vm_fault_t ret; bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED); int type, nr_pages; unsigned long addr; bool needs_fallback = false; fallback: addr = vmf->address; /* Did we COW the page? */ if (is_cow) page = vmf->cow_page; else page = vmf->page; folio = page_folio(page); /* * check even for read faults because we might have lost our CoWed * page */ if (!(vma->vm_flags & VM_SHARED)) { ret = check_stable_address_space(vma->vm_mm); if (ret) return ret; } if (!needs_fallback && vma->vm_file) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t file_end; file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); /* * Do not allow to map with PTEs beyond i_size and with PMD * across i_size to preserve SIGBUS semantics. * * Make an exception for shmem/tmpfs that for long time * intentionally mapped with PMDs across i_size. */ needs_fallback = !shmem_mapping(mapping) && file_end < folio_next_index(folio); } if (pmd_none(*vmf->pmd)) { if (!needs_fallback && folio_test_pmd_mappable(folio)) { ret = do_set_pmd(vmf, folio, page); if (ret != VM_FAULT_FALLBACK) return ret; } if (vmf->prealloc_pte) pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte); else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) return VM_FAULT_OOM; } nr_pages = folio_nr_pages(folio); /* Using per-page fault to maintain the uffd semantics */ if (unlikely(userfaultfd_armed(vma)) || unlikely(needs_fallback)) { nr_pages = 1; } else if (nr_pages > 1) { pgoff_t idx = folio_page_idx(folio, page); /* The page offset of vmf->address within the VMA. 
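 * (vmf->pgoff is the fault's page offset in the file; subtracting
 * vma->vm_pgoff converts it into a page offset from the start of the
 * VMA.)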
*/ pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff; /* The index of the entry in the pagetable for fault page. */ pgoff_t pte_off = pte_index(vmf->address); /* * Fallback to per-page fault in case the folio size in page * cache beyond the VMA limits and PMD pagetable limits. */ if (unlikely(vma_off < idx || vma_off + (nr_pages - idx) > vma_pages(vma) || pte_off < idx || pte_off + (nr_pages - idx) > PTRS_PER_PTE)) { nr_pages = 1; } else { /* Now we can set mappings for the whole large folio. */ addr = vmf->address - idx * PAGE_SIZE; page = &folio->page; } } vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); if (!vmf->pte) return VM_FAULT_NOPAGE; /* Re-check under ptl */ if (nr_pages == 1 && unlikely(vmf_pte_changed(vmf))) { update_mmu_tlb(vma, addr, vmf->pte); ret = VM_FAULT_NOPAGE; goto unlock; } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { needs_fallback = true; pte_unmap_unlock(vmf->pte, vmf->ptl); goto fallback; } folio_ref_add(folio, nr_pages - 1); set_pte_range(vmf, folio, page, nr_pages, addr); type = is_cow ? MM_ANONPAGES : mm_counter_file(folio); add_mm_counter(vma->vm_mm, type, nr_pages); ret = 0; unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; } static unsigned long fault_around_pages __read_mostly = 65536 >> PAGE_SHIFT; #ifdef CONFIG_DEBUG_FS static int fault_around_bytes_get(void *data, u64 *val) { *val = fault_around_pages << PAGE_SHIFT; return 0; } /* * fault_around_bytes must be rounded down to the nearest page order as it's * what do_fault_around() expects to see. */ static int fault_around_bytes_set(void *data, u64 val) { if (val / PAGE_SIZE > PTRS_PER_PTE) return -EINVAL; /* * The minimum value is 1 page, however this results in no fault-around * at all. See should_fault_around(). */ val = max(val, PAGE_SIZE); fault_around_pages = rounddown_pow_of_two(val) >> PAGE_SHIFT; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops, fault_around_bytes_get, fault_around_bytes_set, "%llu\n"); static int __init fault_around_debugfs(void) { debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL, &fault_around_bytes_fops); return 0; } late_initcall(fault_around_debugfs); #endif /* * do_fault_around() tries to map few pages around the fault address. The hope * is that the pages will be needed soon and this will lower the number of * faults to handle. * * It uses vm_ops->map_pages() to map the pages, which skips the page if it's * not ready to be mapped: not up-to-date, locked, etc. * * This function doesn't cross VMA or page table boundaries, in order to call * map_pages() and acquire a PTE lock only once. * * fault_around_pages defines how many pages we'll try to map. * do_fault_around() expects it to be set to a power of two less than or equal * to PTRS_PER_PTE. * * The virtual address of the area that we map is naturally aligned to * fault_around_pages * PAGE_SIZE rounded down to the machine page size * (and therefore to page order). This way it's easier to guarantee * that we don't cross page table boundaries. */ static vm_fault_t do_fault_around(struct vm_fault *vmf) { pgoff_t nr_pages = READ_ONCE(fault_around_pages); pgoff_t pte_off = pte_index(vmf->address); /* The page offset of vmf->address within the VMA. */ pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff; pgoff_t from_pte, to_pte; vm_fault_t ret; /* The PTE offset of the start address, clamped to the VMA. */ from_pte = max(ALIGN_DOWN(pte_off, nr_pages), pte_off - min(pte_off, vma_off)); /* The PTE offset of the end address, clamped to the VMA and PTE. 
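 * With the default fault_around_pages (65536 bytes, i.e. 16 pages for
 * 4K pages) and a fault well inside a large VMA, this simply maps the
 * naturally aligned 16-page block containing the faulting address; the
 * clamping only matters near the VMA edges or a page-table boundary.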
*/ to_pte = min3(from_pte + nr_pages, (pgoff_t)PTRS_PER_PTE, pte_off + vma_pages(vmf->vma) - vma_off) - 1; if (pmd_none(*vmf->pmd)) { vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; } rcu_read_lock(); ret = vmf->vma->vm_ops->map_pages(vmf, vmf->pgoff + from_pte - pte_off, vmf->pgoff + to_pte - pte_off); rcu_read_unlock(); return ret; } /* Return true if we should do read fault-around, false otherwise */ static inline bool should_fault_around(struct vm_fault *vmf) { /* No ->map_pages? No way to fault around... */ if (!vmf->vma->vm_ops->map_pages) return false; if (uffd_disable_fault_around(vmf->vma)) return false; /* A single page implies no faulting 'around' at all. */ return fault_around_pages > 1; } static vm_fault_t do_read_fault(struct vm_fault *vmf) { vm_fault_t ret = 0; struct folio *folio; /* * Let's call ->map_pages() first and use ->fault() as fallback * if page by the offset is not ready to be mapped (cold cache or * something). */ if (should_fault_around(vmf)) { ret = do_fault_around(vmf); if (ret) return ret; } ret = vmf_can_call_fault(vmf); if (ret) return ret; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; ret |= finish_fault(vmf); folio = page_folio(vmf->page); folio_unlock(folio); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) folio_put(folio); return ret; } static vm_fault_t do_cow_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio; vm_fault_t ret; ret = vmf_can_call_fault(vmf); if (!ret) ret = vmf_anon_prepare(vmf); if (ret) return ret; folio = folio_prealloc(vma->vm_mm, vma, vmf->address, false); if (!folio) return VM_FAULT_OOM; vmf->cow_page = &folio->page; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; if (ret & VM_FAULT_DONE_COW) return ret; if (copy_mc_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma)) { ret = VM_FAULT_HWPOISON; goto unlock; } __folio_mark_uptodate(folio); ret |= finish_fault(vmf); unlock: unlock_page(vmf->page); put_page(vmf->page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; return ret; uncharge_out: folio_put(folio); return ret; } static vm_fault_t do_shared_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret, tmp; struct folio *folio; ret = vmf_can_call_fault(vmf); if (ret) return ret; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; folio = page_folio(vmf->page); /* * Check if the backing address space wants to know that the page is * about to become writable */ if (vma->vm_ops->page_mkwrite) { folio_unlock(folio); tmp = do_page_mkwrite(vmf, folio); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { folio_put(folio); return tmp; } } ret |= finish_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) { folio_unlock(folio); folio_put(folio); return ret; } ret |= fault_dirty_shared_page(vmf); return ret; } /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults). * The mmap_lock may have been released depending on flags and our * return value. See filemap_fault() and __folio_lock_or_retry(). * If mmap_lock is released, vma may become invalid (for example * by other thread calling munmap()). 
*/ static vm_fault_t do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct mm_struct *vm_mm = vma->vm_mm; vm_fault_t ret; /* * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ if (!vma->vm_ops->fault) { vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte)) ret = VM_FAULT_SIGBUS; else { /* * Make sure this is not a temporary clearing of pte * by holding ptl and checking again. A R/M/W update * of pte involves: take ptl, clearing the pte so that * we don't have concurrent modification by hardware * followed by an update. */ if (unlikely(pte_none(ptep_get(vmf->pte)))) ret = VM_FAULT_SIGBUS; else ret = VM_FAULT_NOPAGE; pte_unmap_unlock(vmf->pte, vmf->ptl); } } else if (!(vmf->flags & FAULT_FLAG_WRITE)) ret = do_read_fault(vmf); else if (!(vma->vm_flags & VM_SHARED)) ret = do_cow_fault(vmf); else ret = do_shared_fault(vmf); /* preallocated pagetable is unused: free it */ if (vmf->prealloc_pte) { pte_free(vm_mm, vmf->prealloc_pte); vmf->prealloc_pte = NULL; } return ret; } int numa_migrate_check(struct folio *folio, struct vm_fault *vmf, unsigned long addr, int *flags, bool writable, int *last_cpupid) { struct vm_area_struct *vma = vmf->vma; /* * Avoid grouping on RO pages in general. RO pages shouldn't hurt as * much anyway since they can be in shared cache state. This misses * the case where a mapping is writable but the process never writes * to it but pte_write gets cleared during protection updates and * pte_dirty has unpredictable behaviour between PTE scan updates, * background writeback, dirty balancing and application behaviour. */ if (!writable) *flags |= TNF_NO_GROUP; /* * Flag if the folio is shared between multiple address spaces. This * is later used when determining whether to group tasks together */ if (folio_maybe_mapped_shared(folio) && (vma->vm_flags & VM_SHARED)) *flags |= TNF_SHARED; /* * For memory tiering mode, cpupid of slow memory page is used * to record page access time. So use default value. */ if (folio_use_access_time(folio)) *last_cpupid = (-1 & LAST_CPUPID_MASK); else *last_cpupid = folio_last_cpupid(folio); /* Record the current PID accessing VMA */ vma_set_access_pid_bit(vma); count_vm_numa_event(NUMA_HINT_FAULTS); #ifdef CONFIG_NUMA_BALANCING count_memcg_folio_events(folio, NUMA_HINT_FAULTS, 1); #endif if (folio_nid(folio) == numa_node_id()) { count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); *flags |= TNF_FAULT_LOCAL; } return mpol_misplaced(folio, vmf, addr); } static void numa_rebuild_single_mapping(struct vm_fault *vmf, struct vm_area_struct *vma, unsigned long fault_addr, pte_t *fault_pte, bool writable) { pte_t pte, old_pte; old_pte = ptep_modify_prot_start(vma, fault_addr, fault_pte); pte = pte_modify(old_pte, vma->vm_page_prot); pte = pte_mkyoung(pte); if (writable) pte = pte_mkwrite(pte, vma); ptep_modify_prot_commit(vma, fault_addr, fault_pte, old_pte, pte); update_mmu_cache_range(vmf, vma, fault_addr, fault_pte, 1); } static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_struct *vma, struct folio *folio, pte_t fault_pte, bool ignore_writable, bool pte_write_upgrade) { int nr = pte_pfn(fault_pte) - folio_pfn(folio); unsigned long start, end, addr = vmf->address; unsigned long addr_start = addr - (nr << PAGE_SHIFT); unsigned long pt_start = ALIGN_DOWN(addr, PMD_SIZE); pte_t *start_ptep; /* Stay within the VMA and within the page table. 
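 * That is, clamp [addr_start, addr_start + folio_size) to both the VMA
 * limits and the PMD-sized block of page table that vmf->pte lies in.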
*/ start = max3(addr_start, pt_start, vma->vm_start); end = min3(addr_start + folio_size(folio), pt_start + PMD_SIZE, vma->vm_end); start_ptep = vmf->pte - ((addr - start) >> PAGE_SHIFT); /* Restore all PTEs' mapping of the large folio */ for (addr = start; addr != end; start_ptep++, addr += PAGE_SIZE) { pte_t ptent = ptep_get(start_ptep); bool writable = false; if (!pte_present(ptent) || !pte_protnone(ptent)) continue; if (pfn_folio(pte_pfn(ptent)) != folio) continue; if (!ignore_writable) { ptent = pte_modify(ptent, vma->vm_page_prot); writable = pte_write(ptent); if (!writable && pte_write_upgrade && can_change_pte_writable(vma, addr, ptent)) writable = true; } numa_rebuild_single_mapping(vmf, vma, addr, start_ptep, writable); } } static vm_fault_t do_numa_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio = NULL; int nid = NUMA_NO_NODE; bool writable = false, ignore_writable = false; bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma); int last_cpupid; int target_nid; pte_t pte, old_pte; int flags = 0, nr_pages; /* * The pte cannot be used safely until we verify, while holding the page * table lock, that its contents have not changed during fault handling. */ spin_lock(vmf->ptl); /* Read the live PTE from the page tables: */ old_pte = ptep_get(vmf->pte); if (unlikely(!pte_same(old_pte, vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } pte = pte_modify(old_pte, vma->vm_page_prot); /* * Detect now whether the PTE could be writable; this information * is only valid while holding the PT lock. */ writable = pte_write(pte); if (!writable && pte_write_upgrade && can_change_pte_writable(vma, vmf->address, pte)) writable = true; folio = vm_normal_folio(vma, vmf->address, pte); if (!folio || folio_is_zone_device(folio)) goto out_map; nid = folio_nid(folio); nr_pages = folio_nr_pages(folio); target_nid = numa_migrate_check(folio, vmf, vmf->address, &flags, writable, &last_cpupid); if (target_nid == NUMA_NO_NODE) goto out_map; if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) { flags |= TNF_MIGRATE_FAIL; goto out_map; } /* The folio is isolated and isolation code holds a folio reference. */ pte_unmap_unlock(vmf->pte, vmf->ptl); writable = false; ignore_writable = true; /* Migrate to the requested node */ if (!migrate_misplaced_folio(folio, target_nid)) { nid = target_nid; flags |= TNF_MIGRATED; task_numa_fault(last_cpupid, nid, nr_pages, flags); return 0; } flags |= TNF_MIGRATE_FAIL; vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte)) return 0; if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } out_map: /* * Make it present again, depending on how arch implements * non-accessible ptes, some can allow access by kernel mode. 
*/ if (folio && folio_test_large(folio)) numa_rebuild_large_mapping(vmf, vma, folio, pte, ignore_writable, pte_write_upgrade); else numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte, writable); pte_unmap_unlock(vmf->pte, vmf->ptl); if (nid != NUMA_NO_NODE) task_numa_fault(last_cpupid, nid, nr_pages, flags); return 0; } static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; if (vma_is_anonymous(vma)) return do_huge_pmd_anonymous_page(vmf); if (vma->vm_ops->huge_fault) return vma->vm_ops->huge_fault(vmf, PMD_ORDER); return VM_FAULT_FALLBACK; } /* `inline' is required to avoid gcc 4.1.2 build error */ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; vm_fault_t ret; if (vma_is_anonymous(vma)) { if (likely(!unshare) && userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) { if (userfaultfd_wp_async(vmf->vma)) goto split; return handle_userfault(vmf, VM_UFFD_WP); } return do_huge_pmd_wp_page(vmf); } if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { if (vma->vm_ops->huge_fault) { ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER); if (!(ret & VM_FAULT_FALLBACK)) return ret; } } split: /* COW or write-notify handled on pte level: split pmd. */ __split_huge_pmd(vma, vmf->pmd, vmf->address, false); return VM_FAULT_FALLBACK; } static vm_fault_t create_huge_pud(struct vm_fault *vmf) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) struct vm_area_struct *vma = vmf->vma; /* No support for anonymous transparent PUD pages yet */ if (vma_is_anonymous(vma)) return VM_FAULT_FALLBACK; if (vma->vm_ops->huge_fault) return vma->vm_ops->huge_fault(vmf, PUD_ORDER); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ return VM_FAULT_FALLBACK; } static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; /* No support for anonymous transparent PUD pages yet */ if (vma_is_anonymous(vma)) goto split; if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { if (vma->vm_ops->huge_fault) { ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER); if (!(ret & VM_FAULT_FALLBACK)) return ret; } } split: /* COW or write-notify not handled on PUD level: split pud.*/ __split_huge_pud(vma, vmf->pud, vmf->address); #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ return VM_FAULT_FALLBACK; } /* * The page faults may be spurious because of the racy access to the * page table. For example, a non-populated virtual page is accessed * on 2 CPUs simultaneously, thus the page faults are triggered on * both CPUs. However, it's possible that one CPU (say CPU A) cannot * find the reason for the page fault if the other CPU (say CPU B) has * changed the page table before the PTE is checked on CPU A. Most of * the time, the spurious page faults can be ignored safely. However, * if the page fault is for the write access, it's possible that a * stale read-only TLB entry exists in the local CPU and needs to be * flushed on some architectures. This is called the spurious page * fault fixing. * * Note: flush_tlb_fix_spurious_fault() is defined as flush_tlb_page() * by default and used as such on most architectures, while * flush_tlb_fix_spurious_fault_pmd() is defined as NOP by default and * used as such on most architectures. 
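 * x86, for instance, leaves flush_tlb_fix_spurious_fault() as a no-op
 * since the hardware walker re-fetches the PTE on a fault, whereas some
 * other architectures must flush the stale local TLB entry here.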
*/ static void fix_spurious_fault(struct vm_fault *vmf, enum pgtable_level ptlevel) { /* Skip spurious TLB flush for retried page fault */ if (vmf->flags & FAULT_FLAG_TRIED) return; /* * This is needed only for protection faults but the arch code * is not yet telling us if this is a protection fault or not. * This still avoids useless tlb flushes for .text page faults * with threads. */ if (vmf->flags & FAULT_FLAG_WRITE) { if (ptlevel == PGTABLE_LEVEL_PTE) flush_tlb_fix_spurious_fault(vmf->vma, vmf->address, vmf->pte); else flush_tlb_fix_spurious_fault_pmd(vmf->vma, vmf->address, vmf->pmd); } } /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most * RISC architectures). The early dirtying is also good on the i386. * * There is also a hook called "update_mmu_cache()" that architectures * with external mmu caches can use to update those (ie the Sparc or * PowerPC hashed page tables that act as extended TLBs). * * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow * concurrent faults). * * The mmap_lock may have been released depending on flags and our return value. * See filemap_fault() and __folio_lock_or_retry(). */ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) { pte_t entry; if (unlikely(pmd_none(*vmf->pmd))) { /* * Leave __pte_alloc() until later: because vm_ops->fault may * want to allocate huge page, and if we expose page table * for an instant, it will be difficult to retract from * concurrent faults and from rmap lookups. */ vmf->pte = NULL; vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID; } else { pmd_t dummy_pmdval; /* * A regular pmd is established and it can't morph into a huge * pmd by anon khugepaged, since that takes mmap_lock in write * mode; but shmem or file collapse to THP could still morph * it into a huge pmd: just retry later if so. * * Use the maywrite version to indicate that vmf->pte may be * modified, but since we will use pte_same() to detect the * change of the !pte_none() entry, there is no need to recheck * the pmdval. Here we choose to pass a dummy variable instead * of NULL, which helps new user think about why this place is * special. */ vmf->pte = pte_offset_map_rw_nolock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &dummy_pmdval, &vmf->ptl); if (unlikely(!vmf->pte)) return 0; vmf->orig_pte = ptep_get_lockless(vmf->pte); vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID; if (pte_none(vmf->orig_pte)) { pte_unmap(vmf->pte); vmf->pte = NULL; } } if (!vmf->pte) return do_pte_missing(vmf); if (!pte_present(vmf->orig_pte)) return do_swap_page(vmf); if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) return do_numa_page(vmf); spin_lock(vmf->ptl); entry = vmf->orig_pte; if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) { update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); goto unlock; } if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { if (!pte_write(entry)) return do_wp_page(vmf); else if (likely(vmf->flags & FAULT_FLAG_WRITE)) entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, vmf->flags & FAULT_FLAG_WRITE)) update_mmu_cache_range(vmf, vmf->vma, vmf->address, vmf->pte, 1); else fix_spurious_fault(vmf, PGTABLE_LEVEL_PTE); unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } /* * On entry, we hold either the VMA lock or the mmap_lock * (FAULT_FLAG_VMA_LOCK tells you which). If VM_FAULT_RETRY is set in * the result, the mmap_lock is not held on exit. 
See filemap_fault() * and __folio_lock_or_retry(). */ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) { struct vm_fault vmf = { .vma = vma, .address = address & PAGE_MASK, .real_address = address, .flags = flags, .pgoff = linear_page_index(vma, address), .gfp_mask = __get_fault_gfp_mask(vma), }; struct mm_struct *mm = vma->vm_mm; vm_flags_t vm_flags = vma->vm_flags; pgd_t *pgd; p4d_t *p4d; vm_fault_t ret; pgd = pgd_offset(mm, address); p4d = p4d_alloc(mm, pgd, address); if (!p4d) return VM_FAULT_OOM; vmf.pud = pud_alloc(mm, p4d, address); if (!vmf.pud) return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PUD_ORDER)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { pud_t orig_pud = *vmf.pud; barrier(); if (pud_trans_huge(orig_pud)) { /* * TODO once we support anonymous PUDs: NUMA case and * FAULT_FLAG_UNSHARE handling. */ if ((flags & FAULT_FLAG_WRITE) && !pud_write(orig_pud)) { ret = wp_huge_pud(&vmf, orig_pud); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { huge_pud_set_accessed(&vmf, orig_pud); return 0; } } } vmf.pmd = pmd_alloc(mm, vmf.pud, address); if (!vmf.pmd) return VM_FAULT_OOM; /* Huge pud page fault raced with pmd_alloc? */ if (pud_trans_unstable(vmf.pud)) goto retry_pud; if (pmd_none(*vmf.pmd) && thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) { ret = create_huge_pmd(&vmf); if (ret & VM_FAULT_FALLBACK) goto fallback; else return ret; } vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); if (pmd_none(vmf.orig_pmd)) goto fallback; if (unlikely(!pmd_present(vmf.orig_pmd))) { if (pmd_is_device_private_entry(vmf.orig_pmd)) return do_huge_pmd_device_private(&vmf); if (pmd_is_migration_entry(vmf.orig_pmd)) pmd_migration_entry_wait(mm, vmf.pmd); return 0; } if (pmd_trans_huge(vmf.orig_pmd)) { if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) return do_huge_pmd_numa_page(&vmf); if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && !pmd_write(vmf.orig_pmd)) { ret = wp_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { vmf.ptl = pmd_lock(mm, vmf.pmd); if (!huge_pmd_set_accessed(&vmf)) fix_spurious_fault(&vmf, PGTABLE_LEVEL_PMD); spin_unlock(vmf.ptl); return 0; } } fallback: return handle_pte_fault(&vmf); } /** * mm_account_fault - Do page fault accounting * @mm: mm from which memcg should be extracted. It can be NULL. * @regs: the pt_regs struct pointer. When set to NULL, will skip accounting * of perf event counters, but we'll still do the per-task accounting to * the task who triggered this page fault. * @address: the faulted address. * @flags: the fault flags. * @ret: the fault retcode. * * This will take care of most of the page fault accounting. Meanwhile, it * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter * updates. However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should * still be in per-arch page fault handlers at the entry of page fault. */ static inline void mm_account_fault(struct mm_struct *mm, struct pt_regs *regs, unsigned long address, unsigned int flags, vm_fault_t ret) { bool major; /* Incomplete faults will be accounted upon completion. */ if (ret & VM_FAULT_RETRY) return; /* * To preserve the behavior of older kernels, PGFAULT counters record * both successful and failed faults, as opposed to perf counters, * which ignore failed cases. 
*/ count_vm_event(PGFAULT); count_memcg_event_mm(mm, PGFAULT); /* * Do not account for unsuccessful faults (e.g. when the address wasn't * valid). That includes arch_vma_access_permitted() failing before * reaching here. So this is not a "this many hardware page faults" * counter. We should use the hw profiling for that. */ if (ret & VM_FAULT_ERROR) return; /* * We define the fault as a major fault when the final successful fault * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't * handle it immediately previously). */ major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED); if (major) current->maj_flt++; else current->min_flt++; /* * If the fault is done for GUP, regs will be NULL. We only do the * accounting for the per thread fault counters who triggered the * fault, and we skip the perf event updates. */ if (!regs) return; if (major) perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); else perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } #ifdef CONFIG_LRU_GEN static void lru_gen_enter_fault(struct vm_area_struct *vma) { /* the LRU algorithm only applies to accesses with recency */ current->in_lru_fault = vma_has_recency(vma); } static void lru_gen_exit_fault(void) { current->in_lru_fault = false; } #else static void lru_gen_enter_fault(struct vm_area_struct *vma) { } static void lru_gen_exit_fault(void) { } #endif /* CONFIG_LRU_GEN */ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma, unsigned int *flags) { if (unlikely(*flags & FAULT_FLAG_UNSHARE)) { if (WARN_ON_ONCE(*flags & FAULT_FLAG_WRITE)) return VM_FAULT_SIGSEGV; /* * FAULT_FLAG_UNSHARE only applies to COW mappings. Let's * just treat it like an ordinary read-fault otherwise. */ if (!is_cow_mapping(vma->vm_flags)) *flags &= ~FAULT_FLAG_UNSHARE; } else if (*flags & FAULT_FLAG_WRITE) { /* Write faults on read-only mappings are impossible ... */ if (WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE))) return VM_FAULT_SIGSEGV; /* ... and FOLL_FORCE only applies to COW mappings. */ if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE) && !is_cow_mapping(vma->vm_flags))) return VM_FAULT_SIGSEGV; } #ifdef CONFIG_PER_VMA_LOCK /* * Per-VMA locks can't be used with FAULT_FLAG_RETRY_NOWAIT because of * the assumption that lock is dropped on VM_FAULT_RETRY. */ if (WARN_ON_ONCE((*flags & (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)) == (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT))) return VM_FAULT_SIGSEGV; #endif return 0; } /* * By the time we get here, we already hold either the VMA lock or the * mmap_lock (FAULT_FLAG_VMA_LOCK tells you which). * * The mmap_lock may have been released depending on flags and our * return value. See filemap_fault() and __folio_lock_or_retry(). */ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct pt_regs *regs) { /* If the fault handler drops the mmap_lock, vma may be freed */ struct mm_struct *mm = vma->vm_mm; vm_fault_t ret; bool is_droppable; __set_current_state(TASK_RUNNING); ret = sanitize_fault_flags(vma, &flags); if (ret) goto out; if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, flags & FAULT_FLAG_INSTRUCTION, flags & FAULT_FLAG_REMOTE)) { ret = VM_FAULT_SIGSEGV; goto out; } is_droppable = !!(vma->vm_flags & VM_DROPPABLE); /* * Enable the memcg OOM handling for faults triggered in user * space. Kernel faults are handled more gracefully. 
*/ if (flags & FAULT_FLAG_USER) mem_cgroup_enter_user_fault(); lru_gen_enter_fault(vma); if (unlikely(is_vm_hugetlb_page(vma))) ret = hugetlb_fault(vma->vm_mm, vma, address, flags); else ret = __handle_mm_fault(vma, address, flags); /* * Warning: It is no longer safe to dereference vma-> after this point, * because mmap_lock might have been dropped by __handle_mm_fault(), so * vma might be destroyed from underneath us. */ lru_gen_exit_fault(); /* If the mapping is droppable, then errors due to OOM aren't fatal. */ if (is_droppable) ret &= ~VM_FAULT_OOM; if (flags & FAULT_FLAG_USER) { mem_cgroup_exit_user_fault(); /* * The task may have entered a memcg OOM situation but * if the allocation error was handled gracefully (no * VM_FAULT_OOM), there is no need to kill anything. * Just clean up the OOM state peacefully. */ if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) mem_cgroup_oom_synchronize(false); } out: mm_account_fault(mm, regs, address, flags, ret); return ret; } EXPORT_SYMBOL_GPL(handle_mm_fault); #ifndef __PAGETABLE_P4D_FOLDED /* * Allocate p4d page table. * We've already handled the fast-path in-line. */ int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { p4d_t *new = p4d_alloc_one(mm, address); if (!new) return -ENOMEM; spin_lock(&mm->page_table_lock); if (pgd_present(*pgd)) { /* Another has populated it */ p4d_free(mm, new); } else { smp_wmb(); /* See comment in pmd_install() */ pgd_populate(mm, pgd, new); } spin_unlock(&mm->page_table_lock); return 0; } #endif /* __PAGETABLE_P4D_FOLDED */ #ifndef __PAGETABLE_PUD_FOLDED /* * Allocate page upper directory. * We've already handled the fast-path in-line. */ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { pud_t *new = pud_alloc_one(mm, address); if (!new) return -ENOMEM; spin_lock(&mm->page_table_lock); if (!p4d_present(*p4d)) { mm_inc_nr_puds(mm); smp_wmb(); /* See comment in pmd_install() */ p4d_populate(mm, p4d, new); } else /* Another has populated it */ pud_free(mm, new); spin_unlock(&mm->page_table_lock); return 0; } #endif /* __PAGETABLE_PUD_FOLDED */ #ifndef __PAGETABLE_PMD_FOLDED /* * Allocate page middle directory. * We've already handled the fast-path in-line. */ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { spinlock_t *ptl; pmd_t *new = pmd_alloc_one(mm, address); if (!new) return -ENOMEM; ptl = pud_lock(mm, pud); if (!pud_present(*pud)) { mm_inc_nr_pmds(mm); smp_wmb(); /* See comment in pmd_install() */ pud_populate(mm, pud, new); } else { /* Another has populated it */ pmd_free(mm, new); } spin_unlock(ptl); return 0; } #endif /* __PAGETABLE_PMD_FOLDED */ static inline void pfnmap_args_setup(struct follow_pfnmap_args *args, spinlock_t *lock, pte_t *ptep, pgprot_t pgprot, unsigned long pfn_base, unsigned long addr_mask, bool writable, bool special) { args->lock = lock; args->ptep = ptep; args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT); args->addr_mask = addr_mask; args->pgprot = pgprot; args->writable = writable; args->special = special; } static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma) { #ifdef CONFIG_LOCKDEP struct file *file = vma->vm_file; struct address_space *mapping = file ? 
file->f_mapping : NULL; if (mapping) lockdep_assert(lockdep_is_held(&mapping->i_mmap_rwsem) || lockdep_is_held(&vma->vm_mm->mmap_lock)); else lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock)); #endif } /** * follow_pfnmap_start() - Look up a pfn mapping at a user virtual address * @args: Pointer to struct @follow_pfnmap_args * * The caller needs to set up args->vma and args->address to point to the * virtual address as the target of such lookup. On a successful return, * the results will be put into other output fields. * * After the caller has finished using the fields, the caller must invoke * a matching follow_pfnmap_end() to properly release the locks and resources * of such a lookup request. * * During the start() and end() calls, the results in @args will be valid * as proper locks will be held. After the end() is called, all the fields * in @follow_pfnmap_args are no longer valid to access. Further * use of such information after end() may require proper synchronization * by the caller with page table updates, otherwise it can create a * security bug. * * If the PTE maps a refcounted page, callers are responsible for protecting * against invalidation with MMU notifiers; otherwise access to the PFN at * a later point in time can trigger use-after-free. * * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore * should be taken for read, and the mmap semaphore cannot be released * before the end() is invoked. * * This function must not be used to modify PTE content. * * Return: zero on success, negative otherwise. */ int follow_pfnmap_start(struct follow_pfnmap_args *args) { struct vm_area_struct *vma = args->vma; unsigned long address = args->address; struct mm_struct *mm = vma->vm_mm; spinlock_t *lock; pgd_t *pgdp; p4d_t *p4dp, p4d; pud_t *pudp, pud; pmd_t *pmdp, pmd; pte_t *ptep, pte; pfnmap_lockdep_assert(vma); if (unlikely(address < vma->vm_start || address >= vma->vm_end)) goto out; if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) goto out; retry: pgdp = pgd_offset(mm, address); if (pgd_none(*pgdp) || unlikely(pgd_bad(*pgdp))) goto out; p4dp = p4d_offset(pgdp, address); p4d = p4dp_get(p4dp); if (p4d_none(p4d) || unlikely(p4d_bad(p4d))) goto out; pudp = pud_offset(p4dp, address); pud = pudp_get(pudp); if (!pud_present(pud)) goto out; if (pud_leaf(pud)) { lock = pud_lock(mm, pudp); pud = pudp_get(pudp); if (unlikely(!pud_present(pud))) { spin_unlock(lock); goto out; } else if (unlikely(!pud_leaf(pud))) { spin_unlock(lock); goto retry; } pfnmap_args_setup(args, lock, NULL, pud_pgprot(pud), pud_pfn(pud), PUD_MASK, pud_write(pud), pud_special(pud)); return 0; } pmdp = pmd_offset(pudp, address); pmd = pmdp_get_lockless(pmdp); if (!pmd_present(pmd)) goto out; if (pmd_leaf(pmd)) { lock = pmd_lock(mm, pmdp); pmd = pmdp_get(pmdp); if (unlikely(!pmd_present(pmd))) { spin_unlock(lock); goto out; } else if (unlikely(!pmd_leaf(pmd))) { spin_unlock(lock); goto retry; } pfnmap_args_setup(args, lock, NULL, pmd_pgprot(pmd), pmd_pfn(pmd), PMD_MASK, pmd_write(pmd), pmd_special(pmd)); return 0; } ptep = pte_offset_map_lock(mm, pmdp, address, &lock); if (!ptep) goto out; pte = ptep_get(ptep); if (!pte_present(pte)) goto unlock; pfnmap_args_setup(args, lock, ptep, pte_pgprot(pte), pte_pfn(pte), PAGE_MASK, pte_write(pte), pte_special(pte)); return 0; unlock: pte_unmap_unlock(ptep, lock); out: return -EINVAL; } EXPORT_SYMBOL_GPL(follow_pfnmap_start); /** * follow_pfnmap_end(): End a follow_pfnmap_start() process * @args: Pointer to struct @follow_pfnmap_args * * Must be used in pairs with
follow_pfnmap_start(). See the start() function * above for more information. */ void follow_pfnmap_end(struct follow_pfnmap_args *args) { if (args->lock) spin_unlock(args->lock); if (args->ptep) pte_unmap(args->ptep); } EXPORT_SYMBOL_GPL(follow_pfnmap_end); #ifdef CONFIG_HAVE_IOREMAP_PROT /** * generic_access_phys - generic implementation for iomem mmap access * @vma: the vma to access * @addr: userspace address, not relative offset within @vma * @buf: buffer to read/write * @len: length of transfer * @write: set to FOLL_WRITE when writing, otherwise reading * * This is a generic implementation for &vm_operations_struct.access for an * iomem mapping. This callback is used by access_process_vm() when the @vma is * not page based. */ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write) { resource_size_t phys_addr; pgprot_t prot = __pgprot(0); void __iomem *maddr; int offset = offset_in_page(addr); int ret = -EINVAL; bool writable; struct follow_pfnmap_args args = { .vma = vma, .address = addr }; retry: if (follow_pfnmap_start(&args)) return -EINVAL; prot = args.pgprot; phys_addr = (resource_size_t)args.pfn << PAGE_SHIFT; writable = args.writable; follow_pfnmap_end(&args); if ((write & FOLL_WRITE) && !writable) return -EINVAL; maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot); if (!maddr) return -ENOMEM; if (follow_pfnmap_start(&args)) goto out_unmap; if ((pgprot_val(prot) != pgprot_val(args.pgprot)) || (phys_addr != (args.pfn << PAGE_SHIFT)) || (writable != args.writable)) { follow_pfnmap_end(&args); iounmap(maddr); goto retry; } if (write) memcpy_toio(maddr + offset, buf, len); else memcpy_fromio(buf, maddr + offset, len); ret = len; follow_pfnmap_end(&args); out_unmap: iounmap(maddr); return ret; } EXPORT_SYMBOL_GPL(generic_access_phys); #endif /* * Access another process' address space as given in mm. */ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { void *old_buf = buf; int write = gup_flags & FOLL_WRITE; if (mmap_read_lock_killable(mm)) return 0; /* Untag the address before looking up the VMA */ addr = untagged_addr_remote(mm, addr); /* Avoid triggering the temporary warning in __get_user_pages */ if (!vma_lookup(mm, addr) && !expand_stack(mm, addr)) return 0; /* ignore errors, just check how much was successfully transferred */ while (len) { int bytes, offset; void *maddr; struct folio *folio; struct vm_area_struct *vma = NULL; struct page *page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); if (IS_ERR(page)) { /* We might need to expand the stack to access it */ vma = vma_lookup(mm, addr); if (!vma) { vma = expand_stack(mm, addr); /* mmap_lock was dropped on failure */ if (!vma) return buf - old_buf; /* Try again if stack expansion worked */ continue; } /* * Check if this is a VM_IO | VM_PFNMAP VMA, which * we can access using slightly different code. 
*/ bytes = 0; #ifdef CONFIG_HAVE_IOREMAP_PROT if (vma->vm_ops && vma->vm_ops->access) bytes = vma->vm_ops->access(vma, addr, buf, len, write); #endif if (bytes <= 0) break; } else { folio = page_folio(page); bytes = len; offset = addr & (PAGE_SIZE-1); if (bytes > PAGE_SIZE-offset) bytes = PAGE_SIZE-offset; maddr = kmap_local_folio(folio, folio_page_idx(folio, page) * PAGE_SIZE); if (write) { copy_to_user_page(vma, page, addr, maddr + offset, buf, bytes); folio_mark_dirty_lock(folio); } else { copy_from_user_page(vma, page, addr, buf, maddr + offset, bytes); } folio_release_kmap(folio, maddr); } len -= bytes; buf += bytes; addr += bytes; } mmap_read_unlock(mm); return buf - old_buf; } /** * access_remote_vm - access another process' address space * @mm: the mm_struct of the target address space * @addr: start address to access * @buf: source or destination buffer * @len: number of bytes to transfer * @gup_flags: flags modifying lookup behaviour * * The caller must hold a reference on @mm. * * Return: number of bytes copied from source to destination. */ int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { return __access_remote_vm(mm, addr, buf, len, gup_flags); } /* * Access another process' address space. * Source/target buffer must be kernel space, * Do not walk the page table directly, use get_user_pages */ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags) { struct mm_struct *mm; int ret; mm = get_task_mm(tsk); if (!mm) return 0; ret = __access_remote_vm(mm, addr, buf, len, gup_flags); mmput(mm); return ret; } EXPORT_SYMBOL_GPL(access_process_vm); #ifdef CONFIG_BPF_SYSCALL /* * Copy a string from another process's address space as given in mm. * If there is any error return -EFAULT. */ static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { void *old_buf = buf; int err = 0; *(char *)buf = '\0'; if (mmap_read_lock_killable(mm)) return -EFAULT; addr = untagged_addr_remote(mm, addr); /* Avoid triggering the temporary warning in __get_user_pages */ if (!vma_lookup(mm, addr)) { err = -EFAULT; goto out; } while (len) { int bytes, offset, retval; void *maddr; struct folio *folio; struct page *page; struct vm_area_struct *vma = NULL; page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); if (IS_ERR(page)) { /* * Treat as a total failure for now until we decide how * to handle the CONFIG_HAVE_IOREMAP_PROT case and * stack expansion. */ *(char *)buf = '\0'; err = -EFAULT; goto out; } folio = page_folio(page); bytes = len; offset = addr & (PAGE_SIZE - 1); if (bytes > PAGE_SIZE - offset) bytes = PAGE_SIZE - offset; maddr = kmap_local_folio(folio, folio_page_idx(folio, page) * PAGE_SIZE); retval = strscpy(buf, maddr + offset, bytes); if (retval >= 0) { /* Found the end of the string */ buf += retval; folio_release_kmap(folio, maddr); break; } buf += bytes - 1; /* * Because strscpy always NUL terminates we need to * copy the last byte in the page if we are going to * load more pages */ if (bytes != len) { addr += bytes - 1; copy_from_user_page(vma, page, addr, buf, maddr + (PAGE_SIZE - 1), 1); buf += 1; addr += 1; } len -= bytes; folio_release_kmap(folio, maddr); } out: mmap_read_unlock(mm); if (err) return err; return buf - old_buf; } /** * copy_remote_vm_str - copy a string from another process's address space. 
* @tsk: the task of the target address space * @addr: start address to read from * @buf: destination buffer * @len: number of bytes to copy * @gup_flags: flags modifying lookup behaviour * * The caller must hold a reference on @mm. * * Return: number of bytes copied from @addr (source) to @buf (destination); * not including the trailing NUL. Always guaranteed to leave a NUL-terminated * buffer. On any error, return -EFAULT. */ int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags) { struct mm_struct *mm; int ret; if (unlikely(len == 0)) return 0; mm = get_task_mm(tsk); if (!mm) { *(char *)buf = '\0'; return -EFAULT; } ret = __copy_remote_vm_str(mm, addr, buf, len, gup_flags); mmput(mm); return ret; } EXPORT_SYMBOL_GPL(copy_remote_vm_str); #endif /* CONFIG_BPF_SYSCALL */ /* * Print the name of a VMA. */ void print_vma_addr(char *prefix, unsigned long ip) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; /* * we might be running from an atomic context so we cannot sleep */ if (!mmap_read_trylock(mm)) return; vma = vma_lookup(mm, ip); if (vma && vma->vm_file) { struct file *f = vma->vm_file; ip -= vma->vm_start; ip += vma->vm_pgoff << PAGE_SHIFT; printk("%s%pD[%lx,%lx+%lx]", prefix, f, ip, vma->vm_start, vma->vm_end - vma->vm_start); } mmap_read_unlock(mm); } #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP) void __might_fault(const char *file, int line) { if (pagefault_disabled()) return; __might_sleep(file, line); if (current->mm) might_lock_read(&current->mm->mmap_lock); } EXPORT_SYMBOL(__might_fault); #endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) /* * Process all subpages of the specified huge page with the specified * operation. The target subpage will be processed last to keep its * cache lines hot. */ static inline int process_huge_page( unsigned long addr_hint, unsigned int nr_pages, int (*process_subpage)(unsigned long addr, int idx, void *arg), void *arg) { int i, n, base, l, ret; unsigned long addr = addr_hint & ~(((unsigned long)nr_pages << PAGE_SHIFT) - 1); /* Process target subpage last to keep its cache lines hot */ might_sleep(); n = (addr_hint - addr) / PAGE_SIZE; if (2 * n <= nr_pages) { /* If target subpage in first half of huge page */ base = 0; l = n; /* Process subpages at the end of huge page */ for (i = nr_pages - 1; i >= 2 * n; i--) { cond_resched(); ret = process_subpage(addr + i * PAGE_SIZE, i, arg); if (ret) return ret; } } else { /* If target subpage in second half of huge page */ base = nr_pages - 2 * (nr_pages - n); l = nr_pages - n; /* Process subpages at the beginning of the huge page */ for (i = 0; i < base; i++) { cond_resched(); ret = process_subpage(addr + i * PAGE_SIZE, i, arg); if (ret) return ret; } } /* * Process remaining subpages in left-right-left-right pattern * towards the target subpage */ for (i = 0; i < l; i++) { int left_idx = base + i; int right_idx = base + 2 * l - 1 - i; cond_resched(); ret = process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg); if (ret) return ret; cond_resched(); ret = process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg); if (ret) return ret; } return 0; } static void clear_contig_highpages(struct page *page, unsigned long addr, unsigned int nr_pages) { unsigned int i, count; /* * When clearing we want to operate on the largest extent possible to * allow for architecture specific extent based optimizations.
* * However, since clear_user_highpages() (and primitives clear_user_pages(), * clear_pages()), do not call cond_resched(), limit the unit size when * running under non-preemptible scheduling models. */ const unsigned int unit = preempt_model_preemptible() ? nr_pages : PROCESS_PAGES_NON_PREEMPT_BATCH; might_sleep(); for (i = 0; i < nr_pages; i += count) { cond_resched(); count = min(unit, nr_pages - i); clear_user_highpages(page + i, addr + i * PAGE_SIZE, count); } } /* * When zeroing a folio, we want to differentiate between pages in the * vicinity of the faulting address where we have spatial and temporal * locality, and those far away where we don't. * * Use a radius of 2 for determining the local neighbourhood. */ #define FOLIO_ZERO_LOCALITY_RADIUS 2 /** * folio_zero_user - Zero a folio which will be mapped to userspace. * @folio: The folio to zero. * @addr_hint: The address accessed by the user or the base address. */ void folio_zero_user(struct folio *folio, unsigned long addr_hint) { const unsigned long base_addr = ALIGN_DOWN(addr_hint, folio_size(folio)); const long fault_idx = (addr_hint - base_addr) / PAGE_SIZE; const struct range pg = DEFINE_RANGE(0, folio_nr_pages(folio) - 1); const long radius = FOLIO_ZERO_LOCALITY_RADIUS; struct range r[3]; int i; /* * Faulting page and its immediate neighbourhood. Will be cleared at the * end to keep its cachelines hot. */ r[2] = DEFINE_RANGE(fault_idx - radius < (long)pg.start ? pg.start : fault_idx - radius, fault_idx + radius > (long)pg.end ? pg.end : fault_idx + radius); /* Region to the left of the fault */ r[1] = DEFINE_RANGE(pg.start, r[2].start - 1); /* Region to the right of the fault: always valid for the common fault_idx=0 case. */ r[0] = DEFINE_RANGE(r[2].end + 1, pg.end); for (i = 0; i < ARRAY_SIZE(r); i++) { const unsigned long addr = base_addr + r[i].start * PAGE_SIZE; const long nr_pages = (long)range_len(&r[i]); struct page *page = folio_page(folio, r[i].start); if (nr_pages > 0) clear_contig_highpages(page, addr, nr_pages); } } static int copy_user_gigantic_page(struct folio *dst, struct folio *src, unsigned long addr_hint, struct vm_area_struct *vma, unsigned int nr_pages) { unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(dst)); struct page *dst_page; struct page *src_page; int i; for (i = 0; i < nr_pages; i++) { dst_page = folio_page(dst, i); src_page = folio_page(src, i); cond_resched(); if (copy_mc_user_highpage(dst_page, src_page, addr + i*PAGE_SIZE, vma)) return -EHWPOISON; } return 0; } struct copy_subpage_arg { struct folio *dst; struct folio *src; struct vm_area_struct *vma; }; static int copy_subpage(unsigned long addr, int idx, void *arg) { struct copy_subpage_arg *copy_arg = arg; struct page *dst = folio_page(copy_arg->dst, idx); struct page *src = folio_page(copy_arg->src, idx); if (copy_mc_user_highpage(dst, src, addr, copy_arg->vma)) return -EHWPOISON; return 0; } int copy_user_large_folio(struct folio *dst, struct folio *src, unsigned long addr_hint, struct vm_area_struct *vma) { unsigned int nr_pages = folio_nr_pages(dst); struct copy_subpage_arg arg = { .dst = dst, .src = src, .vma = vma, }; if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) return copy_user_gigantic_page(dst, src, addr_hint, vma, nr_pages); return process_huge_page(addr_hint, nr_pages, copy_subpage, &arg); } long copy_folio_from_user(struct folio *dst_folio, const void __user *usr_src, bool allow_pagefault) { void *kaddr; unsigned long i, rc = 0; unsigned int nr_pages = folio_nr_pages(dst_folio); unsigned long ret_val = nr_pages * 
PAGE_SIZE; struct page *subpage; for (i = 0; i < nr_pages; i++) { subpage = folio_page(dst_folio, i); kaddr = kmap_local_page(subpage); if (!allow_pagefault) pagefault_disable(); rc = copy_from_user(kaddr, usr_src + i * PAGE_SIZE, PAGE_SIZE); if (!allow_pagefault) pagefault_enable(); kunmap_local(kaddr); ret_val -= (PAGE_SIZE - rc); if (rc) break; flush_dcache_page(subpage); cond_resched(); } return ret_val; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if defined(CONFIG_SPLIT_PTE_PTLOCKS) && ALLOC_SPLIT_PTLOCKS static struct kmem_cache *page_ptl_cachep; void __init ptlock_cache_init(void) { page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, SLAB_PANIC, NULL); } bool ptlock_alloc(struct ptdesc *ptdesc) { spinlock_t *ptl; ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); if (!ptl) return false; ptdesc->ptl = ptl; return true; } void ptlock_free(struct ptdesc *ptdesc) { if (ptdesc->ptl) kmem_cache_free(page_ptl_cachep, ptdesc->ptl); } #endif void vma_pgtable_walk_begin(struct vm_area_struct *vma) { if (is_vm_hugetlb_page(vma)) hugetlb_vma_lock_read(vma); } void vma_pgtable_walk_end(struct vm_area_struct *vma) { if (is_vm_hugetlb_page(vma)) hugetlb_vma_unlock_read(vma); }
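/*
 * A minimal, standalone user-space sketch (illustration only, not kernel
 * code) of the visiting order implemented by process_huge_page() above:
 * the part of the huge page far from the faulting subpage is processed
 * first, then left/right pairs converge on the target, and the target
 * subpage itself comes last so its cache lines stay hot.  The helper name
 * print_order() and the sample sizes are hypothetical.
 */
#include <stdio.h>

static void print_order(int nr_pages, int target)
{
	int i, base, l, n = target;

	printf("nr_pages=%d target=%d:", nr_pages, target);
	if (2 * n <= nr_pages) {
		/* Target in the first half: far (second) half goes first. */
		base = 0;
		l = n;
		for (i = nr_pages - 1; i >= 2 * n; i--)
			printf(" %d", i);
	} else {
		/* Target in the second half: leading subpages go first. */
		base = nr_pages - 2 * (nr_pages - n);
		l = nr_pages - n;
		for (i = 0; i < base; i++)
			printf(" %d", i);
	}
	/* Left-right pairs converging on the target; the target is last. */
	for (i = 0; i < l; i++)
		printf(" %d %d", base + i, base + 2 * l - 1 - i);
	printf("\n");
}

int main(void)
{
	print_order(8, 2);	/* 7 6 5 4 0 3 1 2 */
	print_order(8, 6);	/* 0 1 2 3 4 7 5 6 */
	return 0;
}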
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_USB_H #define __LINUX_USB_H #include <linux/mod_devicetable.h> #include <linux/usb/ch9.h> #define USB_MAJOR 180 #define USB_DEVICE_MAJOR 189 #ifdef __KERNEL__ #include <linux/errno.h> /* for -ENODEV */ #include <linux/delay.h> /* for mdelay() */ #include <linux/interrupt.h> /* for in_interrupt() */ #include <linux/list.h> /* for struct list_head */ #include <linux/kref.h> /* for struct kref */ #include <linux/device.h> /* for struct device */ #include <linux/fs.h> /* for struct file_operations */ #include <linux/completion.h> /* for struct completion */ #include <linux/sched.h> /* for current && schedule_timeout */ #include <linux/mutex.h> /* for struct mutex */ #include <linux/spinlock.h> /* for spinlock_t */ #include <linux/pm_runtime.h> /* for runtime PM */ struct usb_device; struct usb_driver; /*-------------------------------------------------------------------------*/ /* * Host-side wrappers for standard USB descriptors ... these are parsed * from the data provided by devices.
Parsing turns them from a flat * sequence of descriptors into a hierarchy: * * - devices have one (usually) or more configs; * - configs have one (often) or more interfaces; * - interfaces have one (usually) or more settings; * - each interface setting has zero or (usually) more endpoints. * - a SuperSpeed endpoint has a companion descriptor * * And there might be other descriptors mixed in with those. * * Devices may also have class-specific or vendor-specific descriptors. */ struct ep_device; /** * struct usb_host_endpoint - host-side endpoint descriptor and queue * @desc: descriptor for this endpoint, wMaxPacketSize in native byteorder * @ss_ep_comp: SuperSpeed companion descriptor for this endpoint * @ssp_isoc_ep_comp: SuperSpeedPlus isoc companion descriptor for this endpoint * @eusb2_isoc_ep_comp: eUSB2 isoc companion descriptor for this endpoint * @urb_list: urbs queued to this endpoint; maintained by usbcore * @hcpriv: for use by HCD; typically holds hardware dma queue head (QH) * with one or more transfer descriptors (TDs) per urb; must be preserved * by core while BW is allocated for the endpoint * @ep_dev: ep_device for sysfs info * @extra: descriptors following this endpoint in the configuration * @extralen: how many bytes of "extra" are valid * @enabled: URBs may be submitted to this endpoint * @streams: number of USB-3 streams allocated on the endpoint * * USB requests are always queued to a given endpoint, identified by a * descriptor within an active interface in a given USB configuration. */ struct usb_host_endpoint { struct usb_endpoint_descriptor desc; struct usb_ss_ep_comp_descriptor ss_ep_comp; struct usb_ssp_isoc_ep_comp_descriptor ssp_isoc_ep_comp; struct usb_eusb2_isoc_ep_comp_descriptor eusb2_isoc_ep_comp; struct list_head urb_list; void *hcpriv; struct ep_device *ep_dev; /* For sysfs info */ unsigned char *extra; /* Extra descriptors */ int extralen; int enabled; int streams; }; /* host-side wrapper for one interface setting's parsed descriptors */ struct usb_host_interface { struct usb_interface_descriptor desc; int extralen; unsigned char *extra; /* Extra descriptors */ /* array of desc.bNumEndpoints endpoints associated with this * interface setting. these will be in no particular order. 
*/ struct usb_host_endpoint *endpoint; char *string; /* iInterface string, if present */ }; enum usb_interface_condition { USB_INTERFACE_UNBOUND = 0, USB_INTERFACE_BINDING, USB_INTERFACE_BOUND, USB_INTERFACE_UNBINDING, }; int __must_check usb_find_common_endpoints(struct usb_host_interface *alt, struct usb_endpoint_descriptor **bulk_in, struct usb_endpoint_descriptor **bulk_out, struct usb_endpoint_descriptor **int_in, struct usb_endpoint_descriptor **int_out); int __must_check usb_find_common_endpoints_reverse(struct usb_host_interface *alt, struct usb_endpoint_descriptor **bulk_in, struct usb_endpoint_descriptor **bulk_out, struct usb_endpoint_descriptor **int_in, struct usb_endpoint_descriptor **int_out); static inline int __must_check usb_find_bulk_in_endpoint(struct usb_host_interface *alt, struct usb_endpoint_descriptor **bulk_in) { return usb_find_common_endpoints(alt, bulk_in, NULL, NULL, NULL); } static inline int __must_check usb_find_bulk_out_endpoint(struct usb_host_interface *alt, struct usb_endpoint_descriptor **bulk_out) { return usb_find_common_endpoints(alt, NULL, bulk_out, NULL, NULL); } static inline int __must_check usb_find_int_in_endpoint(struct usb_host_interface *alt, struct usb_endpoint_descriptor **int_in) { return usb_find_common_endpoints(alt, NULL, NULL, int_in, NULL); } static inline int __must_check usb_find_int_out_endpoint(struct usb_host_interface *alt, struct usb_endpoint_descriptor **int_out) { return usb_find_common_endpoints(alt, NULL, NULL, NULL, int_out); } static inline int __must_check usb_find_last_bulk_in_endpoint(struct usb_host_interface *alt, struct usb_endpoint_descriptor **bulk_in) { return usb_find_common_endpoints_reverse(alt, bulk_in, NULL, NULL, NULL); } static inline int __must_check usb_find_last_bulk_out_endpoint(struct usb_host_interface *alt, struct usb_endpoint_descriptor **bulk_out) { return usb_find_common_endpoints_reverse(alt, NULL, bulk_out, NULL, NULL); } static inline int __must_check usb_find_last_int_in_endpoint(struct usb_host_interface *alt, struct usb_endpoint_descriptor **int_in) { return usb_find_common_endpoints_reverse(alt, NULL, NULL, int_in, NULL); } static inline int __must_check usb_find_last_int_out_endpoint(struct usb_host_interface *alt, struct usb_endpoint_descriptor **int_out) { return usb_find_common_endpoints_reverse(alt, NULL, NULL, NULL, int_out); } enum usb_wireless_status { USB_WIRELESS_STATUS_NA = 0, USB_WIRELESS_STATUS_DISCONNECTED, USB_WIRELESS_STATUS_CONNECTED, }; /** * struct usb_interface - what usb device drivers talk to * @altsetting: array of interface structures, one for each alternate * setting that may be selected. Each one includes a set of * endpoint configurations. They will be in no particular order. * @cur_altsetting: the current altsetting. * @num_altsetting: number of altsettings defined. * @intf_assoc: interface association descriptor * @minor: the minor number assigned to this interface, if this * interface is bound to a driver that uses the USB major number. * If this interface does not use the USB major, this field should * be unused. The driver should set this value in the probe() * function of the driver, after it has been assigned a minor * number from the USB core by calling usb_register_dev(). 
* @condition: binding state of the interface: not bound, binding * (in probe()), bound to a driver, or unbinding (in disconnect()) * @sysfs_files_created: sysfs attributes exist * @ep_devs_created: endpoint child pseudo-devices exist * @unregistering: flag set when the interface is being unregistered * @needs_remote_wakeup: flag set when the driver requires remote-wakeup * capability during autosuspend. * @needs_altsetting0: flag set when a set-interface request for altsetting 0 * has been deferred. * @needs_binding: flag set when the driver should be re-probed or unbound * following a reset or suspend operation it doesn't support. * @authorized: This allows individual interfaces to be (de)authorized, in * contrast to authorization of the whole device. * @wireless_status: if the USB device uses a receiver/emitter combo, whether * the emitter is connected. * @wireless_status_work: Used for scheduling wireless status changes * from atomic context. * @dev: driver model's view of this device * @usb_dev: if an interface is bound to the USB major, this will point * to the sysfs representation for that device. * @reset_ws: Used for scheduling resets from atomic context. * @resetting_device: USB core reset the device, so use alt setting 0 as * current; needs bandwidth alloc after reset. * * USB device drivers attach to interfaces on a physical device. Each * interface encapsulates a single high level function, such as feeding * an audio stream to a speaker or reporting a change in a volume control. * Many USB devices only have one interface. The protocol used to talk to * an interface's endpoints can be defined in a usb "class" specification, * or by a product's vendor. The (default) control endpoint is part of * every interface, but is never listed among the interface's descriptors. * * The driver that is bound to the interface can use standard driver model * calls such as dev_get_drvdata() on the dev member of this structure. * * Each interface may have alternate settings. The initial configuration * of a device sets altsetting 0, but the device driver can change * that setting using usb_set_interface(). Alternate settings are often * used to control the use of periodic endpoints, such as by having * different endpoints use different amounts of reserved USB bandwidth. * All standards-conformant USB devices that use isochronous endpoints * will use them in non-default settings. * * The USB specification says that alternate setting numbers must run from * 0 to one less than the total number of alternate settings. But some * devices manage to mess this up, and the structures aren't necessarily * stored in numerical order anyhow. Use usb_altnum_to_altsetting() to * look up an alternate setting in the altsetting array based on its number.
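 *
 * A minimal illustrative sketch (hypothetical driver code, not part of this
 * header) of looking up an alternate setting by its number and asking the
 * device to switch to it:
 *
 *	static int select_altsetting(struct usb_interface *intf,
 *				     unsigned int altnum)
 *	{
 *		struct usb_host_interface *alt;
 *
 *		alt = usb_altnum_to_altsetting(intf, altnum);
 *		if (!alt)			/* no such alternate setting */
 *			return -ENOENT;
 *		return usb_set_interface(interface_to_usbdev(intf),
 *					 alt->desc.bInterfaceNumber, altnum);
 *	}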
*/ struct usb_interface { /* array of alternate settings for this interface, * stored in no particular order */ struct usb_host_interface *altsetting; struct usb_host_interface *cur_altsetting; /* the currently * active alternate setting */ unsigned num_altsetting; /* number of alternate settings */ /* If there is an interface association descriptor then it will list * the associated interfaces */ struct usb_interface_assoc_descriptor *intf_assoc; int minor; /* minor number this interface is * bound to */ enum usb_interface_condition condition; /* state of binding */ unsigned sysfs_files_created:1; /* the sysfs attributes exist */ unsigned ep_devs_created:1; /* endpoint "devices" exist */ unsigned unregistering:1; /* unregistration is in progress */ unsigned needs_remote_wakeup:1; /* driver requires remote wakeup */ unsigned needs_altsetting0:1; /* switch to altsetting 0 is pending */ unsigned needs_binding:1; /* needs delayed unbind/rebind */ unsigned resetting_device:1; /* true: bandwidth alloc after reset */ unsigned authorized:1; /* used for interface authorization */ enum usb_wireless_status wireless_status; struct work_struct wireless_status_work; struct device dev; /* interface specific device info */ struct device *usb_dev; struct work_struct reset_ws; /* for resets in atomic context */ }; #define to_usb_interface(__dev) container_of_const(__dev, struct usb_interface, dev) static inline void *usb_get_intfdata(struct usb_interface *intf) { return dev_get_drvdata(&intf->dev); } /** * usb_set_intfdata() - associate driver-specific data with an interface * @intf: USB interface * @data: driver data * * Drivers can use this function in their probe() callbacks to associate * driver-specific data with an interface. * * Note that there is generally no need to clear the driver-data pointer even * if some drivers do so for historical or implementation-specific reasons. */ static inline void usb_set_intfdata(struct usb_interface *intf, void *data) { dev_set_drvdata(&intf->dev, data); } struct usb_interface *usb_get_intf(struct usb_interface *intf); void usb_put_intf(struct usb_interface *intf); /* Hard limit */ #define USB_MAXENDPOINTS 30 /* this maximum is arbitrary */ #define USB_MAXINTERFACES 32 #define USB_MAXIADS (USB_MAXINTERFACES/2) bool usb_check_bulk_endpoints( const struct usb_interface *intf, const u8 *ep_addrs); bool usb_check_int_endpoints( const struct usb_interface *intf, const u8 *ep_addrs); /* * USB Resume Timer: Every Host controller driver should drive the resume * signalling on the bus for the amount of time defined by this macro. * * That way we will have a 'stable' behavior among all HCDs supported by Linux. * * Note that the USB Specification states we should drive resume for *at least* * 20 ms, but it doesn't give an upper bound. This creates two possible * situations which we want to avoid: * * (a) sometimes an msleep(20) might expire slightly before 20 ms, which causes * us to fail USB Electrical Tests, thus failing Certification * * (b) Some (many) devices actually need more than 20 ms of resume signalling, * and while we can argue that's against the USB Specification, we don't have * control over which devices a certification laboratory will be using for * certification. If CertLab uses a device which was tested against Windows and * that happens to have relaxed resume signalling rules, we might fall into * situations where we fail interoperability and electrical tests. 
* * In order to avoid both conditions, we're using a 40 ms resume timeout, which * should cope with both LPJ calibration errors and devices not following every * detail of the USB Specification. */ #define USB_RESUME_TIMEOUT 40 /* ms */ /** * struct usb_interface_cache - long-term representation of a device interface * @num_altsetting: number of altsettings defined. * @ref: reference counter. * @altsetting: variable-length array of interface structures, one for * each alternate setting that may be selected. Each one includes a * set of endpoint configurations. They will be in no particular order. * * These structures persist for the lifetime of a usb_device, unlike * struct usb_interface (which persists only as long as its configuration * is installed). The altsetting arrays can be accessed through these * structures at any time, permitting comparison of configurations and * providing support for the /sys/kernel/debug/usb/devices pseudo-file. */ struct usb_interface_cache { unsigned num_altsetting; /* number of alternate settings */ struct kref ref; /* reference counter */ /* variable-length array of alternate settings for this interface, * stored in no particular order */ struct usb_host_interface altsetting[]; }; #define ref_to_usb_interface_cache(r) \ container_of(r, struct usb_interface_cache, ref) #define altsetting_to_usb_interface_cache(a) \ container_of(a, struct usb_interface_cache, altsetting[0]) /** * struct usb_host_config - representation of a device's configuration * @desc: the device's configuration descriptor. * @string: pointer to the cached version of the iConfiguration string, if * present for this configuration. * @intf_assoc: list of any interface association descriptors in this config * @interface: array of pointers to usb_interface structures, one for each * interface in the configuration. The number of interfaces is stored * in desc.bNumInterfaces. These pointers are valid only while the * configuration is active. * @intf_cache: array of pointers to usb_interface_cache structures, one * for each interface in the configuration. These structures exist * for the entire life of the device. * @extra: pointer to buffer containing all extra descriptors associated * with this configuration (those preceding the first interface * descriptor). * @extralen: length of the extra descriptors buffer. * * USB devices may have multiple configurations, but only one can be active * at any time. Each encapsulates a different operational environment; * for example, a dual-speed device would have separate configurations for * full-speed and high-speed operation. The number of configurations * available is stored in the device descriptor as bNumConfigurations. * * A configuration can contain multiple interfaces. Each corresponds to * a different function of the USB device, and all are available whenever * the configuration is active. The USB standard says that interfaces * are supposed to be numbered from 0 to desc.bNumInterfaces-1, but a lot * of devices get this wrong. In addition, the interface array is not * guaranteed to be sorted in numerical order. Use usb_ifnum_to_if() to * look up an interface entry based on its number. * * Device drivers should not attempt to activate configurations. The choice * of which configuration to install is a policy decision based on such * considerations as available power, functionality provided, and the user's * desires (expressed through userspace tools). 
However, drivers can call * usb_reset_configuration() to reinitialize the current configuration and * all its interfaces. */ struct usb_host_config { struct usb_config_descriptor desc; char *string; /* iConfiguration string, if present */ /* List of any Interface Association Descriptors in this * configuration. */ struct usb_interface_assoc_descriptor *intf_assoc[USB_MAXIADS]; /* the interfaces associated with this configuration, * stored in no particular order */ struct usb_interface *interface[USB_MAXINTERFACES]; /* Interface information available even when this is not the * active configuration */ struct usb_interface_cache *intf_cache[USB_MAXINTERFACES]; unsigned char *extra; /* Extra descriptors */ int extralen; }; /* USB2.0 and USB3.0 device BOS descriptor set */ struct usb_host_bos { struct usb_bos_descriptor *desc; struct usb_ext_cap_descriptor *ext_cap; struct usb_ss_cap_descriptor *ss_cap; struct usb_ssp_cap_descriptor *ssp_cap; struct usb_ss_container_id_descriptor *ss_id; struct usb_ptm_cap_descriptor *ptm_cap; }; int __usb_get_extra_descriptor(char *buffer, unsigned size, unsigned char type, void **ptr, size_t min); #define usb_get_extra_descriptor(ifpoint, type, ptr) \ __usb_get_extra_descriptor((ifpoint)->extra, \ (ifpoint)->extralen, \ type, (void **)ptr, sizeof(**(ptr))) /* ----------------------------------------------------------------------- */ /* * Allocated per bus (tree of devices) we have: */ struct usb_bus { struct device *controller; /* host side hardware */ struct device *sysdev; /* as seen from firmware or bus */ int busnum; /* Bus number (in order of reg) */ const char *bus_name; /* stable id (PCI slot_name etc) */ u8 uses_pio_for_control; /* * Does the host controller use PIO * for control transfers? */ u8 otg_port; /* 0, or number of OTG/HNP port */ unsigned is_b_host:1; /* true during some HNP roleswitches */ unsigned b_hnp_enable:1; /* OTG: did A-Host enable HNP? */ unsigned no_stop_on_short:1; /* * Quirk: some controllers don't stop * the ep queue on a short transfer * with the URB_SHORT_NOT_OK flag set. */ unsigned no_sg_constraint:1; /* no sg constraint */ unsigned sg_tablesize; /* 0 or largest number of sg list entries */ int devnum_next; /* Next open device number in * round-robin allocation */ struct mutex devnum_next_mutex; /* devnum_next mutex */ DECLARE_BITMAP(devmap, 128); /* USB device number allocation bitmap */ struct usb_device *root_hub; /* Root hub */ struct usb_bus *hs_companion; /* Companion EHCI bus, if any */ int bandwidth_allocated; /* on this bus: how much of the time * reserved for periodic (intr/iso) * requests is used, on average? * Units: microseconds/frame. * Limits: Full/low speed reserve 90%, * while high speed reserves 80%. */ int bandwidth_int_reqs; /* number of Interrupt requests */ int bandwidth_isoc_reqs; /* number of Isoc. requests */ unsigned resuming_ports; /* bit array: resuming root-hub ports */ #if defined(CONFIG_USB_MON) || defined(CONFIG_USB_MON_MODULE) struct mon_bus *mon_bus; /* non-null when associated */ int monitored; /* non-zero when monitored */ #endif }; struct usb_dev_state; /* ----------------------------------------------------------------------- */ struct usb_tt; enum usb_link_tunnel_mode { USB_LINK_UNKNOWN = 0, USB_LINK_NATIVE, USB_LINK_TUNNELED, }; enum usb_port_connect_type { USB_PORT_CONNECT_TYPE_UNKNOWN = 0, USB_PORT_CONNECT_TYPE_HOT_PLUG, USB_PORT_CONNECT_TYPE_HARD_WIRED, USB_PORT_NOT_USED, }; /* * USB port quirks. */ /* For the given port, prefer the old (faster) enumeration scheme. 
*/ #define USB_PORT_QUIRK_OLD_SCHEME BIT(0) /* Decrease TRSTRCY to 10ms during device enumeration. */ #define USB_PORT_QUIRK_FAST_ENUM BIT(1) /* * USB 2.0 Link Power Management (LPM) parameters. */ struct usb2_lpm_parameters { /* Best effort service latency indicates how long the host will drive * resume on an exit from L1. */ unsigned int besl; /* Timeout value in microseconds for the L1 inactivity (LPM) timer. * When the timer counts to zero, the parent hub will initiate an LPM * transition to L1. */ int timeout; }; /* * USB 3.0 Link Power Management (LPM) parameters. * * PEL and SEL are USB 3.0 Link PM latencies for device-initiated LPM exit. * MEL is the USB 3.0 Link PM latency for host-initiated LPM exit. * All three are stored in nanoseconds. */ struct usb3_lpm_parameters { /* * Maximum exit latency (MEL) for the host to send a packet to the * device (either a Ping for isoc endpoints, or a data packet for * interrupt endpoints), the hubs to decode the packet, and for all hubs * in the path to transition the links to U0. */ unsigned int mel; /* * Maximum exit latency for a device-initiated LPM transition to bring * all links into U0. Abbreviated as "PEL" in section 9.4.12 of the USB * 3.0 spec, with no explanation of what "P" stands for. "Path"? */ unsigned int pel; /* * The System Exit Latency (SEL) includes PEL, and three other * latencies. After a device initiates a U0 transition, it will take * some time from when the device sends the ERDY to when it will finally * receive the data packet. Basically, SEL should be the worst-case * latency from when a device starts initiating a U0 transition to when * it will get data. */ unsigned int sel; /* * The idle timeout value that is currently programmed into the parent * hub for this device. When the timer counts to zero, the parent hub * will initiate an LPM transition to either U1 or U2. */ int timeout; }; /** * struct usb_device - kernel's representation of a USB device * @devnum: device number; address on a USB bus * @devpath: device ID string for use in messages (e.g., /port/...) * @route: tree topology hex string for use with xHCI * @state: device state: configured, not attached, etc. * @speed: device speed: high/full/low (or error) * @rx_lanes: number of rx lanes in use, USB 3.2 adds dual-lane support * @tx_lanes: number of tx lanes in use, USB 3.2 adds dual-lane support * @ssp_rate: SuperSpeed Plus phy signaling rate and lane count * @tt: Transaction Translator info; used with low/full speed dev, highspeed hub * @ttport: device port on that tt hub * @toggle: one bit for each endpoint, with ([0] = IN, [1] = OUT) endpoints * @parent: our hub, unless we're the root * @bus: bus we're part of * @ep0: endpoint 0 data (default control pipe) * @dev: generic device interface * @descriptor: USB device descriptor * @bos: USB device BOS descriptor set * @config: all of the device's configs * @actconfig: the active configuration * @ep_in: array of IN endpoints * @ep_out: array of OUT endpoints * @rawdescriptors: raw descriptors for each config * @bus_mA: Current available from the bus * @portnum: parent port number (origin 1) * @level: number of USB hub ancestors * @devaddr: device address, XHCI: assigned by HW, others: same as devnum * @can_submit: URBs may be submitted * @persist_enabled: USB_PERSIST enabled for this device * @reset_in_progress: the device is being reset * @have_langid: whether string_langid is valid * @authorized: policy has said we can use it; * (user space) policy determines if we authorize this device to be * used or not.
By default, wired USB devices are authorized. * WUSB devices are not, until we authorize them from user space. * FIXME -- complete doc * @authenticated: Crypto authentication passed * @tunnel_mode: Connection native or tunneled over USB4 * @usb4_link: device link to the USB4 host interface * @lpm_capable: device supports LPM * @lpm_devinit_allow: Allow USB3 device initiated LPM, exit latency is in range * @usb2_hw_lpm_capable: device can perform USB2 hardware LPM * @usb2_hw_lpm_besl_capable: device can perform USB2 hardware BESL LPM * @usb2_hw_lpm_enabled: USB2 hardware LPM is enabled * @usb2_hw_lpm_allowed: Userspace allows USB 2.0 LPM to be enabled * @usb3_lpm_u1_enabled: USB3 hardware U1 LPM enabled * @usb3_lpm_u2_enabled: USB3 hardware U2 LPM enabled * @string_langid: language ID for strings * @product: iProduct string, if present (static) * @manufacturer: iManufacturer string, if present (static) * @serial: iSerialNumber string, if present (static) * @filelist: usbfs files that are open to this device * @maxchild: number of ports if hub * @quirks: quirks of the whole device * @urbnum: number of URBs submitted for the whole device * @active_duration: total time device is not suspended * @connect_time: time device was first connected * @do_remote_wakeup: remote wakeup should be enabled * @reset_resume: needs reset instead of resume * @port_is_suspended: the upstream port is suspended (L2 or U3) * @offload_pm_locked: prevents offload_usage changes during PM transitions. * @offload_usage: number of offload activities happening on this usb device. * @offload_lock: protects offload_usage and offload_pm_locked * @slot_id: Slot ID assigned by xHCI * @l1_params: best effor service latency for USB2 L1 LPM state, and L1 timeout. * @u1_params: exit latencies for USB3 U1 LPM state, and hub-initiated timeout. * @u2_params: exit latencies for USB3 U2 LPM state, and hub-initiated timeout. * @lpm_disable_count: Ref count used by usb_disable_lpm() and usb_enable_lpm() * to keep track of the number of functions that require USB 3.0 Link Power * Management to be disabled for this usb_device. This count should only * be manipulated by those functions, with the bandwidth_mutex is held. * @hub_delay: cached value consisting of: * parent->hub_delay + wHubDelay + tTPTransmissionDelay (40ns) * Will be used as wValue for SetIsochDelay requests. * @use_generic_driver: ask driver core to reprobe using the generic driver. * * Notes: * Usbcore drivers should not set usbdev->state directly. Instead use * usb_set_device_state(). 
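 *
 * Illustrative sketch only (not part of usbcore; example_probe and the use
 * of intfdata are hypothetical): an interface driver usually reaches its
 * usb_device through interface_to_usbdev() and pins it with usb_get_dev()
 * for as long as it keeps the pointer around:
 *
 *	static int example_probe(struct usb_interface *intf,
 *				 const struct usb_device_id *id)
 *	{
 *		struct usb_device *udev = usb_get_dev(interface_to_usbdev(intf));
 *
 *		dev_info(&intf->dev, "device %d on bus %s\n",
 *			 udev->devnum, udev->bus->bus_name);
 *		usb_set_intfdata(intf, udev);
 *		return 0;
 *	}
 *
 * The matching disconnect() would drop the reference with usb_put_dev().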
*/ struct usb_device { int devnum; char devpath[16]; u32 route; enum usb_device_state state; enum usb_device_speed speed; unsigned int rx_lanes; unsigned int tx_lanes; enum usb_ssp_rate ssp_rate; struct usb_tt *tt; int ttport; unsigned int toggle[2]; struct usb_device *parent; struct usb_bus *bus; struct usb_host_endpoint ep0; struct device dev; struct usb_device_descriptor descriptor; struct usb_host_bos *bos; struct usb_host_config *config; struct usb_host_config *actconfig; struct usb_host_endpoint *ep_in[16]; struct usb_host_endpoint *ep_out[16]; char **rawdescriptors; unsigned short bus_mA; u8 portnum; u8 level; u8 devaddr; unsigned can_submit:1; unsigned persist_enabled:1; unsigned reset_in_progress:1; unsigned have_langid:1; unsigned authorized:1; unsigned authenticated:1; unsigned lpm_capable:1; unsigned lpm_devinit_allow:1; unsigned usb2_hw_lpm_capable:1; unsigned usb2_hw_lpm_besl_capable:1; unsigned usb2_hw_lpm_enabled:1; unsigned usb2_hw_lpm_allowed:1; unsigned usb3_lpm_u1_enabled:1; unsigned usb3_lpm_u2_enabled:1; int string_langid; /* static strings from the device */ char *product; char *manufacturer; char *serial; struct list_head filelist; int maxchild; u32 quirks; atomic_t urbnum; unsigned long active_duration; unsigned long connect_time; unsigned do_remote_wakeup:1; unsigned reset_resume:1; unsigned port_is_suspended:1; unsigned offload_pm_locked:1; int offload_usage; spinlock_t offload_lock; enum usb_link_tunnel_mode tunnel_mode; struct device_link *usb4_link; int slot_id; struct usb2_lpm_parameters l1_params; struct usb3_lpm_parameters u1_params; struct usb3_lpm_parameters u2_params; unsigned lpm_disable_count; u16 hub_delay; unsigned use_generic_driver:1; }; #define to_usb_device(__dev) container_of_const(__dev, struct usb_device, dev) static inline struct usb_device *__intf_to_usbdev(struct usb_interface *intf) { return to_usb_device(intf->dev.parent); } static inline const struct usb_device *__intf_to_usbdev_const(const struct usb_interface *intf) { return to_usb_device((const struct device *)intf->dev.parent); } #define interface_to_usbdev(intf) \ _Generic((intf), \ const struct usb_interface *: __intf_to_usbdev_const, \ struct usb_interface *: __intf_to_usbdev)(intf) extern struct usb_device *usb_get_dev(struct usb_device *dev); extern void usb_put_dev(struct usb_device *dev); extern struct usb_device *usb_hub_find_child(struct usb_device *hdev, int port1); /** * usb_hub_for_each_child - iterate over all child devices on the hub * @hdev: USB device belonging to the usb hub * @port1: portnum associated with child device * @child: child device pointer */ #define usb_hub_for_each_child(hdev, port1, child) \ for (port1 = 1, child = usb_hub_find_child(hdev, port1); \ port1 <= hdev->maxchild; \ child = usb_hub_find_child(hdev, ++port1)) \ if (!child) continue; else /* USB device locking */ #define usb_lock_device(udev) device_lock(&(udev)->dev) #define usb_unlock_device(udev) device_unlock(&(udev)->dev) #define usb_lock_device_interruptible(udev) device_lock_interruptible(&(udev)->dev) #define usb_trylock_device(udev) device_trylock(&(udev)->dev) extern int usb_lock_device_for_reset(struct usb_device *udev, const struct usb_interface *iface); /* USB port reset for device reinitialization */ extern int usb_reset_device(struct usb_device *dev); extern void usb_queue_reset_device(struct usb_interface *dev); extern struct device *usb_intf_get_dma_device(struct usb_interface *intf); #ifdef CONFIG_ACPI extern int usb_acpi_set_power_state(struct usb_device *hdev, int index, 
bool enable); extern bool usb_acpi_power_manageable(struct usb_device *hdev, int index); extern int usb_acpi_port_lpm_incapable(struct usb_device *hdev, int index); #else static inline int usb_acpi_set_power_state(struct usb_device *hdev, int index, bool enable) { return 0; } static inline bool usb_acpi_power_manageable(struct usb_device *hdev, int index) { return true; } static inline int usb_acpi_port_lpm_incapable(struct usb_device *hdev, int index) { return 0; } #endif /* USB autosuspend and autoresume */ #ifdef CONFIG_PM extern void usb_enable_autosuspend(struct usb_device *udev); extern void usb_disable_autosuspend(struct usb_device *udev); extern int usb_autopm_get_interface(struct usb_interface *intf); extern void usb_autopm_put_interface(struct usb_interface *intf); extern int usb_autopm_get_interface_async(struct usb_interface *intf); extern void usb_autopm_put_interface_async(struct usb_interface *intf); extern void usb_autopm_get_interface_no_resume(struct usb_interface *intf); extern void usb_autopm_put_interface_no_suspend(struct usb_interface *intf); static inline void usb_mark_last_busy(struct usb_device *udev) { pm_runtime_mark_last_busy(&udev->dev); } #else static inline void usb_enable_autosuspend(struct usb_device *udev) { } static inline void usb_disable_autosuspend(struct usb_device *udev) { } static inline int usb_autopm_get_interface(struct usb_interface *intf) { return 0; } static inline int usb_autopm_get_interface_async(struct usb_interface *intf) { return 0; } static inline void usb_autopm_put_interface(struct usb_interface *intf) { } static inline void usb_autopm_put_interface_async(struct usb_interface *intf) { } static inline void usb_autopm_get_interface_no_resume( struct usb_interface *intf) { } static inline void usb_autopm_put_interface_no_suspend( struct usb_interface *intf) { } static inline void usb_mark_last_busy(struct usb_device *udev) { } #endif #if IS_ENABLED(CONFIG_USB_XHCI_SIDEBAND) int usb_offload_get(struct usb_device *udev); int usb_offload_put(struct usb_device *udev); bool usb_offload_check(struct usb_device *udev); void usb_offload_set_pm_locked(struct usb_device *udev, bool locked); #else static inline int usb_offload_get(struct usb_device *udev) { return 0; } static inline int usb_offload_put(struct usb_device *udev) { return 0; } static inline bool usb_offload_check(struct usb_device *udev) { return false; } static inline void usb_offload_set_pm_locked(struct usb_device *udev, bool locked) { } #endif extern int usb_disable_lpm(struct usb_device *udev); extern void usb_enable_lpm(struct usb_device *udev); /* Same as above, but these functions lock/unlock the bandwidth_mutex. */ extern int usb_unlocked_disable_lpm(struct usb_device *udev); extern void usb_unlocked_enable_lpm(struct usb_device *udev); extern int usb_disable_ltm(struct usb_device *udev); extern void usb_enable_ltm(struct usb_device *udev); static inline bool usb_device_supports_ltm(struct usb_device *udev) { if (udev->speed < USB_SPEED_SUPER || !udev->bos || !udev->bos->ss_cap) return false; return udev->bos->ss_cap->bmAttributes & USB_LTM_SUPPORT; } static inline bool usb_device_no_sg_constraint(struct usb_device *udev) { return udev && udev->bus && udev->bus->no_sg_constraint; } /*-------------------------------------------------------------------------*/ /* for drivers using iso endpoints */ extern int usb_get_current_frame_number(struct usb_device *usb_dev); /* Sets up a group of bulk endpoints to support multiple stream IDs. 
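 *
 * For example (illustrative only; data_in_ep and data_out_ep are assumed to
 * have been looked up from the interface's current altsetting), a UAS-style
 * driver could request 16 stream IDs on both bulk endpoints:
 *
 *	struct usb_host_endpoint *eps[] = { data_in_ep, data_out_ep };
 *	int streams;
 *
 *	streams = usb_alloc_streams(intf, eps, ARRAY_SIZE(eps), 16, GFP_KERNEL);
 *	if (streams < 0)
 *		dev_err(&intf->dev, "stream setup failed: %d\n", streams);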
*/ extern int usb_alloc_streams(struct usb_interface *interface, struct usb_host_endpoint **eps, unsigned int num_eps, unsigned int num_streams, gfp_t mem_flags); /* Reverts a group of bulk endpoints back to not using stream IDs. */ extern int usb_free_streams(struct usb_interface *interface, struct usb_host_endpoint **eps, unsigned int num_eps, gfp_t mem_flags); /* used these for multi-interface device registration */ extern int usb_driver_claim_interface(struct usb_driver *driver, struct usb_interface *iface, void *data); /** * usb_interface_claimed - returns true iff an interface is claimed * @iface: the interface being checked * * Return: %true (nonzero) iff the interface is claimed, else %false * (zero). * * Note: * Callers must own the driver model's usb bus readlock. So driver * probe() entries don't need extra locking, but other call contexts * may need to explicitly claim that lock. * */ static inline int usb_interface_claimed(struct usb_interface *iface) { return (iface->dev.driver != NULL); } extern void usb_driver_release_interface(struct usb_driver *driver, struct usb_interface *iface); int usb_set_wireless_status(struct usb_interface *iface, enum usb_wireless_status status); const struct usb_device_id *usb_match_id(struct usb_interface *interface, const struct usb_device_id *id); extern int usb_match_one_id(struct usb_interface *interface, const struct usb_device_id *id); extern int usb_for_each_dev(void *data, int (*fn)(struct usb_device *, void *)); extern struct usb_interface *usb_find_interface(struct usb_driver *drv, int minor); extern struct usb_interface *usb_ifnum_to_if(const struct usb_device *dev, unsigned ifnum); extern struct usb_host_interface *usb_altnum_to_altsetting( const struct usb_interface *intf, unsigned int altnum); extern struct usb_host_interface *usb_find_alt_setting( struct usb_host_config *config, unsigned int iface_num, unsigned int alt_num); /* port claiming functions */ int usb_hub_claim_port(struct usb_device *hdev, unsigned port1, struct usb_dev_state *owner); int usb_hub_release_port(struct usb_device *hdev, unsigned port1, struct usb_dev_state *owner); /** * usb_make_path - returns stable device path in the usb tree * @dev: the device whose path is being constructed * @buf: where to put the string * @size: how big is "buf"? * * Return: Length of the string (> 0) or negative if size was too small. * * Note: * This identifier is intended to be "stable", reflecting physical paths in * hardware such as physical bus addresses for host controllers or ports on * USB hubs. That makes it stay the same until systems are physically * reconfigured, by re-cabling a tree of USB devices or by moving USB host * controllers. Adding and removing devices, including virtual root hubs * in host controller driver modules, does not change these path identifiers; * neither does rebooting or re-enumerating. These are more useful identifiers * than changeable ("unstable") ones like bus numbers or device addresses. * * With a partial exception for devices connected to USB 2.0 root hubs, these * identifiers are also predictable. So long as the device tree isn't changed, * plugging any USB device into a given hub port always gives it the same path. * Because of the use of "companion" controllers, devices connected to ports on * USB 2.0 root hubs (EHCI host controllers) will get one path ID if they are * high speed, and a different one if they are full or low speed. 
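 *
 * Illustrative use only (the buffer size is arbitrary): building a stable
 * identifier for log messages or persistent per-device settings:
 *
 *	char path[64];
 *
 *	if (usb_make_path(udev, path, sizeof(path)) >= 0)
 *		dev_info(&udev->dev, "stable path: %s\n", path);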
*/ static inline int usb_make_path(struct usb_device *dev, char *buf, size_t size) { int actual; actual = snprintf(buf, size, "usb-%s-%s", dev->bus->bus_name, dev->devpath); return (actual >= (int)size) ? -1 : actual; } /*-------------------------------------------------------------------------*/ #define USB_DEVICE_ID_MATCH_DEVICE \ (USB_DEVICE_ID_MATCH_VENDOR | USB_DEVICE_ID_MATCH_PRODUCT) #define USB_DEVICE_ID_MATCH_DEV_RANGE \ (USB_DEVICE_ID_MATCH_DEV_LO | USB_DEVICE_ID_MATCH_DEV_HI) #define USB_DEVICE_ID_MATCH_DEVICE_AND_VERSION \ (USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_DEV_RANGE) #define USB_DEVICE_ID_MATCH_DEV_INFO \ (USB_DEVICE_ID_MATCH_DEV_CLASS | \ USB_DEVICE_ID_MATCH_DEV_SUBCLASS | \ USB_DEVICE_ID_MATCH_DEV_PROTOCOL) #define USB_DEVICE_ID_MATCH_INT_INFO \ (USB_DEVICE_ID_MATCH_INT_CLASS | \ USB_DEVICE_ID_MATCH_INT_SUBCLASS | \ USB_DEVICE_ID_MATCH_INT_PROTOCOL) /** * USB_DEVICE - macro used to describe a specific usb device * @vend: the 16 bit USB Vendor ID * @prod: the 16 bit USB Product ID * * This macro is used to create a struct usb_device_id that matches a * specific device. */ #define USB_DEVICE(vend, prod) \ .match_flags = USB_DEVICE_ID_MATCH_DEVICE, \ .idVendor = (vend), \ .idProduct = (prod) /** * USB_DEVICE_VER - describe a specific usb device with a version range * @vend: the 16 bit USB Vendor ID * @prod: the 16 bit USB Product ID * @lo: the bcdDevice_lo value * @hi: the bcdDevice_hi value * * This macro is used to create a struct usb_device_id that matches a * specific device, with a version range. */ #define USB_DEVICE_VER(vend, prod, lo, hi) \ .match_flags = USB_DEVICE_ID_MATCH_DEVICE_AND_VERSION, \ .idVendor = (vend), \ .idProduct = (prod), \ .bcdDevice_lo = (lo), \ .bcdDevice_hi = (hi) /** * USB_DEVICE_INTERFACE_CLASS - describe a usb device with a specific interface class * @vend: the 16 bit USB Vendor ID * @prod: the 16 bit USB Product ID * @cl: bInterfaceClass value * * This macro is used to create a struct usb_device_id that matches a * specific interface class of devices. */ #define USB_DEVICE_INTERFACE_CLASS(vend, prod, cl) \ .match_flags = USB_DEVICE_ID_MATCH_DEVICE | \ USB_DEVICE_ID_MATCH_INT_CLASS, \ .idVendor = (vend), \ .idProduct = (prod), \ .bInterfaceClass = (cl) /** * USB_DEVICE_INTERFACE_PROTOCOL - describe a usb device with a specific interface protocol * @vend: the 16 bit USB Vendor ID * @prod: the 16 bit USB Product ID * @pr: bInterfaceProtocol value * * This macro is used to create a struct usb_device_id that matches a * specific interface protocol of devices. */ #define USB_DEVICE_INTERFACE_PROTOCOL(vend, prod, pr) \ .match_flags = USB_DEVICE_ID_MATCH_DEVICE | \ USB_DEVICE_ID_MATCH_INT_PROTOCOL, \ .idVendor = (vend), \ .idProduct = (prod), \ .bInterfaceProtocol = (pr) /** * USB_DEVICE_INTERFACE_NUMBER - describe a usb device with a specific interface number * @vend: the 16 bit USB Vendor ID * @prod: the 16 bit USB Product ID * @num: bInterfaceNumber value * * This macro is used to create a struct usb_device_id that matches a * specific interface number of devices. */ #define USB_DEVICE_INTERFACE_NUMBER(vend, prod, num) \ .match_flags = USB_DEVICE_ID_MATCH_DEVICE | \ USB_DEVICE_ID_MATCH_INT_NUMBER, \ .idVendor = (vend), \ .idProduct = (prod), \ .bInterfaceNumber = (num) /** * USB_DEVICE_INFO - macro used to describe a class of usb devices * @cl: bDeviceClass value * @sc: bDeviceSubClass value * @pr: bDeviceProtocol value * * This macro is used to create a struct usb_device_id that matches a * specific class of devices. 
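 *
 * Illustrative only (the vendor and product IDs are made up): entries built
 * with these macros are normally collected in a driver's match table:
 *
 *	static const struct usb_device_id example_id_table[] = {
 *		{ USB_DEVICE(0x1234, 0x5678) },
 *		{ USB_DEVICE_INFO(USB_CLASS_HUB, 0, 0) },
 *		{ }
 *	};
 *	MODULE_DEVICE_TABLE(usb, example_id_table);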
*/ #define USB_DEVICE_INFO(cl, sc, pr) \ .match_flags = USB_DEVICE_ID_MATCH_DEV_INFO, \ .bDeviceClass = (cl), \ .bDeviceSubClass = (sc), \ .bDeviceProtocol = (pr) /** * USB_INTERFACE_INFO - macro used to describe a class of usb interfaces * @cl: bInterfaceClass value * @sc: bInterfaceSubClass value * @pr: bInterfaceProtocol value * * This macro is used to create a struct usb_device_id that matches a * specific class of interfaces. */ #define USB_INTERFACE_INFO(cl, sc, pr) \ .match_flags = USB_DEVICE_ID_MATCH_INT_INFO, \ .bInterfaceClass = (cl), \ .bInterfaceSubClass = (sc), \ .bInterfaceProtocol = (pr) /** * USB_DEVICE_AND_INTERFACE_INFO - describe a specific usb device with a class of usb interfaces * @vend: the 16 bit USB Vendor ID * @prod: the 16 bit USB Product ID * @cl: bInterfaceClass value * @sc: bInterfaceSubClass value * @pr: bInterfaceProtocol value * * This macro is used to create a struct usb_device_id that matches a * specific device with a specific class of interfaces. * * This is especially useful when explicitly matching devices that have * vendor specific bDeviceClass values, but standards-compliant interfaces. */ #define USB_DEVICE_AND_INTERFACE_INFO(vend, prod, cl, sc, pr) \ .match_flags = USB_DEVICE_ID_MATCH_INT_INFO \ | USB_DEVICE_ID_MATCH_DEVICE, \ .idVendor = (vend), \ .idProduct = (prod), \ .bInterfaceClass = (cl), \ .bInterfaceSubClass = (sc), \ .bInterfaceProtocol = (pr) /** * USB_VENDOR_AND_INTERFACE_INFO - describe a specific usb vendor with a class of usb interfaces * @vend: the 16 bit USB Vendor ID * @cl: bInterfaceClass value * @sc: bInterfaceSubClass value * @pr: bInterfaceProtocol value * * This macro is used to create a struct usb_device_id that matches a * specific vendor with a specific class of interfaces. * * This is especially useful when explicitly matching devices that have * vendor specific bDeviceClass values, but standards-compliant interfaces. */ #define USB_VENDOR_AND_INTERFACE_INFO(vend, cl, sc, pr) \ .match_flags = USB_DEVICE_ID_MATCH_INT_INFO \ | USB_DEVICE_ID_MATCH_VENDOR, \ .idVendor = (vend), \ .bInterfaceClass = (cl), \ .bInterfaceSubClass = (sc), \ .bInterfaceProtocol = (pr) /* ----------------------------------------------------------------------- */ /* Stuff for dynamic usb ids */ extern struct mutex usb_dynids_lock; struct usb_dynids { struct list_head list; }; struct usb_dynid { struct list_head node; struct usb_device_id id; }; extern ssize_t usb_store_new_id(struct usb_dynids *dynids, const struct usb_device_id *id_table, struct device_driver *driver, const char *buf, size_t count); extern ssize_t usb_show_dynids(struct usb_dynids *dynids, char *buf); /** * struct usb_driver - identifies USB interface driver to usbcore * @name: The driver name should be unique among USB drivers, * and should normally be the same as the module name. * @probe: Called to see if the driver is willing to manage a particular * interface on a device. If it is, probe returns zero and uses * usb_set_intfdata() to associate driver-specific data with the * interface. It may also use usb_set_interface() to specify the * appropriate altsetting. If unwilling to manage the interface, * return -ENODEV, if genuine IO errors occurred, an appropriate * negative errno value. * @disconnect: Called when the interface is no longer accessible, usually * because its device has been (or is being) disconnected or the * driver module is being unloaded. * @unlocked_ioctl: Used for drivers that want to talk to userspace through * the "usbfs" filesystem. 
This lets devices provide ways to * expose information to user space regardless of where they * do (or don't) show up otherwise in the filesystem. * @suspend: Called when the device is going to be suspended by the * system either from system sleep or runtime suspend context. The * return value will be ignored in system sleep context, so do NOT * try to continue using the device if suspend fails in this case. * Instead, let the resume or reset-resume routine recover from * the failure. * @resume: Called when the device is being resumed by the system. * @reset_resume: Called when the suspended device has been reset instead * of being resumed. * @pre_reset: Called by usb_reset_device() when the device is about to be * reset. This routine must not return until the driver has no active * URBs for the device, and no more URBs may be submitted until the * post_reset method is called. * @post_reset: Called by usb_reset_device() after the device * has been reset * @shutdown: Called at shut-down time to quiesce the device. * @id_table: USB drivers use ID table to support hotplugging. * Export this with MODULE_DEVICE_TABLE(usb,...). This must be set * or your driver's probe function will never get called. * @dev_groups: Attributes attached to the device that will be created once it * is bound to the driver. * @dynids: used internally to hold the list of dynamically added device * ids for this driver. * @driver: The driver-model core driver structure. * @no_dynamic_id: if set to 1, the USB core will not allow dynamic ids to be * added to this driver by preventing the sysfs file from being created. * @supports_autosuspend: if set to 0, the USB core will not allow autosuspend * for interfaces bound to this driver. * @soft_unbind: if set to 1, the USB core will not kill URBs and disable * endpoints before calling the driver's disconnect method. * @disable_hub_initiated_lpm: if set to 1, the USB core will not allow hubs * to initiate lower power link state transitions when an idle timeout * occurs. Device-initiated USB 3.0 link PM will still be allowed. * * USB interface drivers must provide a name, probe() and disconnect() * methods, and an id_table. Other driver fields are optional. * * The id_table is used in hotplugging. It holds a set of descriptors, * and specialized data may be associated with each entry. That table * is used by both user and kernel mode hotplugging support. * * The probe() and disconnect() methods are called in a context where * they can sleep, but they should avoid abusing the privilege. Most * work to connect to a device should be done when the device is opened, * and undone at the last close. The disconnect code needs to address * concurrency issues with respect to open() and close() methods, as * well as forcing all pending I/O requests to complete (by unlinking * them as necessary, and blocking until the unlinks complete). 
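 *
 * A minimal, illustrative skeleton (all example_* names are hypothetical):
 *
 *	static struct usb_driver example_driver = {
 *		.name		= "example",
 *		.probe		= example_probe,
 *		.disconnect	= example_disconnect,
 *		.id_table	= example_id_table,
 *	};
 *	module_usb_driver(example_driver);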
*/ struct usb_driver { const char *name; int (*probe) (struct usb_interface *intf, const struct usb_device_id *id); void (*disconnect) (struct usb_interface *intf); int (*unlocked_ioctl) (struct usb_interface *intf, unsigned int code, void *buf); int (*suspend) (struct usb_interface *intf, pm_message_t message); int (*resume) (struct usb_interface *intf); int (*reset_resume)(struct usb_interface *intf); int (*pre_reset)(struct usb_interface *intf); int (*post_reset)(struct usb_interface *intf); void (*shutdown)(struct usb_interface *intf); const struct usb_device_id *id_table; const struct attribute_group **dev_groups; struct usb_dynids dynids; struct device_driver driver; unsigned int no_dynamic_id:1; unsigned int supports_autosuspend:1; unsigned int disable_hub_initiated_lpm:1; unsigned int soft_unbind:1; }; #define to_usb_driver(d) container_of_const(d, struct usb_driver, driver) /** * struct usb_device_driver - identifies USB device driver to usbcore * @name: The driver name should be unique among USB drivers, * and should normally be the same as the module name. * @match: If set, used for better device/driver matching. * @probe: Called to see if the driver is willing to manage a particular * device. If it is, probe returns zero and uses dev_set_drvdata() * to associate driver-specific data with the device. If unwilling * to manage the device, return a negative errno value. * @disconnect: Called when the device is no longer accessible, usually * because it has been (or is being) disconnected or the driver's * module is being unloaded. * @suspend: Called when the device is going to be suspended by the system. * @resume: Called when the device is being resumed by the system. * @choose_configuration: If non-NULL, called instead of the default * usb_choose_configuration(). If this returns an error then we'll go * on to call the normal usb_choose_configuration(). * @dev_groups: Attributes attached to the device that will be created once it * is bound to the driver. * @driver: The driver-model core driver structure. * @id_table: used with @match() to select better matching driver at * probe() time. * @supports_autosuspend: if set to 0, the USB core will not allow autosuspend * for devices bound to this driver. * @generic_subclass: if set to 1, the generic USB driver's probe, disconnect, * resume and suspend functions will be called in addition to the driver's * own, so this part of the setup does not need to be replicated. * * USB device drivers must provide a name, other driver fields are optional. */ struct usb_device_driver { const char *name; bool (*match) (struct usb_device *udev); int (*probe) (struct usb_device *udev); void (*disconnect) (struct usb_device *udev); int (*suspend) (struct usb_device *udev, pm_message_t message); int (*resume) (struct usb_device *udev, pm_message_t message); int (*choose_configuration) (struct usb_device *udev); const struct attribute_group **dev_groups; struct device_driver driver; const struct usb_device_id *id_table; unsigned int supports_autosuspend:1; unsigned int generic_subclass:1; }; #define to_usb_device_driver(d) container_of_const(d, struct usb_device_driver, driver) /** * struct usb_class_driver - identifies a USB driver that wants to use the USB major number * @name: the usb class device name for this driver. Will show up in sysfs. * @devnode: Callback to provide a naming hint for a possible * device node to create. * @fops: pointer to the struct file_operations of this driver. * @minor_base: the start of the minor range for this driver. 
* * This structure is used for the usb_register_dev() and * usb_deregister_dev() functions, to consolidate a number of the * parameters used for them. */ struct usb_class_driver { char *name; char *(*devnode)(const struct device *dev, umode_t *mode); const struct file_operations *fops; int minor_base; }; /* * use these in module_init()/module_exit() * and don't forget MODULE_DEVICE_TABLE(usb, ...) */ extern int usb_register_driver(struct usb_driver *, struct module *, const char *); /* use a define to avoid include chaining to get THIS_MODULE & friends */ #define usb_register(driver) \ usb_register_driver(driver, THIS_MODULE, KBUILD_MODNAME) extern void usb_deregister(struct usb_driver *); /** * module_usb_driver() - Helper macro for registering a USB driver * @__usb_driver: usb_driver struct * * Helper macro for USB drivers which do not do anything special in module * init/exit. This eliminates a lot of boilerplate. Each module may only * use this macro once, and calling it replaces module_init() and module_exit() */ #define module_usb_driver(__usb_driver) \ module_driver(__usb_driver, usb_register, \ usb_deregister) extern int usb_register_device_driver(struct usb_device_driver *, struct module *); extern void usb_deregister_device_driver(struct usb_device_driver *); extern int usb_register_dev(struct usb_interface *intf, struct usb_class_driver *class_driver); extern void usb_deregister_dev(struct usb_interface *intf, struct usb_class_driver *class_driver); extern int usb_disabled(void); /* ----------------------------------------------------------------------- */ /* * URB support, for asynchronous request completions */ /* * urb->transfer_flags: * * Note: URB_DIR_IN/OUT is automatically set in usb_submit_urb(). */ #define URB_SHORT_NOT_OK 0x0001 /* report short reads as errors */ #define URB_ISO_ASAP 0x0002 /* iso-only; use the first unexpired * slot in the schedule */ #define URB_NO_TRANSFER_DMA_MAP 0x0004 /* urb->transfer_dma valid on submit */ #define URB_ZERO_PACKET 0x0040 /* Finish bulk OUT with short packet */ #define URB_NO_INTERRUPT 0x0080 /* HINT: no non-error interrupt * needed */ #define URB_FREE_BUFFER 0x0100 /* Free transfer buffer with the URB */ /* The following flags are used internally by usbcore and HCDs */ #define URB_DIR_IN 0x0200 /* Transfer from device to host */ #define URB_DIR_OUT 0 #define URB_DIR_MASK URB_DIR_IN #define URB_DMA_MAP_SINGLE 0x00010000 /* Non-scatter-gather mapping */ #define URB_DMA_MAP_PAGE 0x00020000 /* HCD-unsupported S-G */ #define URB_DMA_MAP_SG 0x00040000 /* HCD-supported S-G */ #define URB_MAP_LOCAL 0x00080000 /* HCD-local-memory mapping */ #define URB_SETUP_MAP_SINGLE 0x00100000 /* Setup packet DMA mapped */ #define URB_SETUP_MAP_LOCAL 0x00200000 /* HCD-local setup packet */ #define URB_DMA_SG_COMBINED 0x00400000 /* S-G entries were combined */ #define URB_ALIGNED_TEMP_BUFFER 0x00800000 /* Temp buffer was alloc'd */ struct usb_iso_packet_descriptor { unsigned int offset; unsigned int length; /* expected length */ unsigned int actual_length; int status; }; struct urb; struct usb_anchor { struct list_head urb_list; wait_queue_head_t wait; spinlock_t lock; atomic_t suspend_wakeups; unsigned int poisoned:1; }; static inline void init_usb_anchor(struct usb_anchor *anchor) { memset(anchor, 0, sizeof(*anchor)); INIT_LIST_HEAD(&anchor->urb_list); init_waitqueue_head(&anchor->wait); spin_lock_init(&anchor->lock); } typedef void (*usb_complete_t)(struct urb *); /** * struct urb - USB Request Block * @urb_list: For use by current owner of the URB. 
* @anchor_list: membership in the list of an anchor * @anchor: to anchor URBs to a common mooring * @ep: Points to the endpoint's data structure. Will eventually * replace @pipe. * @pipe: Holds endpoint number, direction, type, and more. * Create these values with the eight macros available; * usb_{snd,rcv}TYPEpipe(dev,endpoint), where the TYPE is "ctrl" * (control), "bulk", "int" (interrupt), or "iso" (isochronous). * For example usb_sndbulkpipe() or usb_rcvintpipe(). Endpoint * numbers range from zero to fifteen. Note that "in" endpoint two * is a different endpoint (and pipe) from "out" endpoint two. * The current configuration controls the existence, type, and * maximum packet size of any given endpoint. * @stream_id: the endpoint's stream ID for bulk streams * @dev: Identifies the USB device to perform the request. * @status: This is read in non-iso completion functions to get the * status of the particular request. ISO requests only use it * to tell whether the URB was unlinked; detailed status for * each frame is in the fields of the iso_frame-desc. * @transfer_flags: A variety of flags may be used to affect how URB * submission, unlinking, or operation are handled. Different * kinds of URB can use different flags. * @transfer_buffer: This identifies the buffer to (or from) which the I/O * request will be performed unless URB_NO_TRANSFER_DMA_MAP is set * (however, do not leave garbage in transfer_buffer even then). * This buffer must be suitable for DMA; allocate it with * kmalloc() or equivalent. For transfers to "in" endpoints, contents * of this buffer will be modified. This buffer is used for the data * stage of control transfers. * @transfer_dma: When transfer_flags includes URB_NO_TRANSFER_DMA_MAP, * the device driver is saying that it provided this DMA address, * which the host controller driver should use in preference to the * transfer_buffer. * @sg: scatter gather buffer list, the buffer size of each element in * the list (except the last) must be divisible by the endpoint's * max packet size if no_sg_constraint isn't set in 'struct usb_bus' * @sgt: used to hold a scatter gather table returned by usb_alloc_noncoherent(), * which describes the allocated non-coherent and possibly non-contiguous * memory and is guaranteed to have 1 single DMA mapped segment. The * allocated memory needs to be freed by usb_free_noncoherent(). * @num_mapped_sgs: (internal) number of mapped sg entries * @num_sgs: number of entries in the sg list * @transfer_buffer_length: How big is transfer_buffer. The transfer may * be broken up into chunks according to the current maximum packet * size for the endpoint, which is a function of the configuration * and is encoded in the pipe. When the length is zero, neither * transfer_buffer nor transfer_dma is used. * @actual_length: This is read in non-iso completion functions, and * it tells how many bytes (out of transfer_buffer_length) were * transferred. It will normally be the same as requested, unless * either an error was reported or a short read was performed. * The URB_SHORT_NOT_OK transfer flag may be used to make such * short reads be reported as errors. * @setup_packet: Only used for control transfers, this points to eight bytes * of setup data. Control transfers always start by sending this data * to the device. Then transfer_buffer is read or written, if needed. * @setup_dma: DMA pointer for the setup packet. The caller must not use * this field; setup_packet must point to a valid buffer. 
* @start_frame: Returns the initial frame for isochronous transfers. * @number_of_packets: Lists the number of ISO transfer buffers. * @interval: Specifies the polling interval for interrupt or isochronous * transfers. The units are frames (milliseconds) for full and low * speed devices, and microframes (1/8 millisecond) for highspeed * and SuperSpeed devices. * @error_count: Returns the number of ISO transfers that reported errors. * @context: For use in completion functions. This normally points to * request-specific driver context. * @complete: Completion handler. This URB is passed as the parameter to the * completion function. The completion function may then do what * it likes with the URB, including resubmitting or freeing it. * @iso_frame_desc: Used to provide arrays of ISO transfer buffers and to * collect the transfer status for each buffer. * * This structure identifies USB transfer requests. URBs must be allocated by * calling usb_alloc_urb() and freed with a call to usb_free_urb(). * Initialization may be done using various usb_fill_*_urb() functions. URBs * are submitted using usb_submit_urb(), and pending requests may be canceled * using usb_unlink_urb() or usb_kill_urb(). * * Data Transfer Buffers: * * Normally drivers provide I/O buffers allocated with kmalloc() or otherwise * taken from the general page pool. That is provided by transfer_buffer * (control requests also use setup_packet), and host controller drivers * perform a dma mapping (and unmapping) for each buffer transferred. Those * mapping operations can be expensive on some platforms (perhaps using a dma * bounce buffer or talking to an IOMMU), * although they're cheap on commodity x86 and ppc hardware. * * Alternatively, drivers may pass the URB_NO_TRANSFER_DMA_MAP transfer flag, * which tells the host controller driver that no such mapping is needed for * the transfer_buffer since * the device driver is DMA-aware. For example, a device driver might * allocate a DMA buffer with usb_alloc_coherent() or call usb_buffer_map(). * When this transfer flag is provided, host controller drivers will * attempt to use the dma address found in the transfer_dma * field rather than determining a dma address themselves. * * Note that transfer_buffer must still be set if the controller * does not support DMA (as indicated by hcd_uses_dma()) and when talking * to root hub. If you have to transfer between highmem zone and the device * on such controller, create a bounce buffer or bail out with an error. * If transfer_buffer cannot be set (is in highmem) and the controller is DMA * capable, assign NULL to it, so that usbmon knows not to use the value. * The setup_packet must always be set, so it cannot be located in highmem. * * Initialization: * * All URBs submitted must initialize the dev, pipe, transfer_flags (may be * zero), and complete fields. All URBs must also initialize * transfer_buffer and transfer_buffer_length. They may provide the * URB_SHORT_NOT_OK transfer flag, indicating that short reads are * to be treated as errors; that flag is invalid for write requests. * * Bulk URBs may * use the URB_ZERO_PACKET transfer flag, indicating that bulk OUT transfers * should always terminate with a short packet, even if it means adding an * extra zero length packet. * * Control URBs must provide a valid pointer in the setup_packet field. * Unlike the transfer_buffer, the setup_packet may not be mapped for DMA * beforehand. 
* * Interrupt URBs must provide an interval, saying how often (in milliseconds * or, for highspeed devices, 125 microsecond units) * to poll for transfers. After the URB has been submitted, the interval * field reflects how the transfer was actually scheduled. * The polling interval may be more frequent than requested. * For example, some controllers have a maximum interval of 32 milliseconds, * while others support intervals of up to 1024 milliseconds. * Isochronous URBs also have transfer intervals. (Note that for isochronous * endpoints, as well as high speed interrupt endpoints, the encoding of * the transfer interval in the endpoint descriptor is logarithmic. * Device drivers must convert that value to linear units themselves.) * * If an isochronous endpoint queue isn't already running, the host * controller will schedule a new URB to start as soon as bandwidth * utilization allows. If the queue is running then a new URB will be * scheduled to start in the first transfer slot following the end of the * preceding URB, if that slot has not already expired. If the slot has * expired (which can happen when IRQ delivery is delayed for a long time), * the scheduling behavior depends on the URB_ISO_ASAP flag. If the flag * is clear then the URB will be scheduled to start in the expired slot, * implying that some of its packets will not be transferred; if the flag * is set then the URB will be scheduled in the first unexpired slot, * breaking the queue's synchronization. Upon URB completion, the * start_frame field will be set to the (micro)frame number in which the * transfer was scheduled. Ranges for frame counter values are HC-specific * and can go from as low as 256 to as high as 65536 frames. * * Isochronous URBs have a different data transfer model, in part because * the quality of service is only "best effort". Callers provide specially * allocated URBs, with number_of_packets worth of iso_frame_desc structures * at the end. Each such packet is an individual ISO transfer. Isochronous * URBs are normally queued, submitted by drivers to arrange that * transfers are at least double buffered, and then explicitly resubmitted * in completion handlers, so * that data (such as audio or video) streams at as constant a rate as the * host controller scheduler can support. * * Completion Callbacks: * * The completion callback is made in_interrupt(), and one of the first * things that a completion handler should do is check the status field. * The status field is provided for all URBs. It is used to report * unlinked URBs, and status for all non-ISO transfers. It should not * be examined before the URB is returned to the completion handler. * * The context field is normally used to link URBs back to the relevant * driver or request state. * * When the completion callback is invoked for non-isochronous URBs, the * actual_length field tells how many bytes were transferred. This field * is updated even when the URB terminated with an error or was unlinked. * * ISO transfer status is reported in the status and actual_length fields * of the iso_frame_desc array, and the number of errors is reported in * error_count. Completion callbacks for ISO transfers will normally * (re)submit URBs to ensure a constant transfer rate. * * Note that even fields marked "public" should not be touched by the driver * when the urb is owned by the hcd, that is, since the call to * usb_submit_urb() till the entry into the completion routine. 
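 *
 * Illustrative bulk-IN lifecycle (error handling trimmed; the endpoint
 * address 0x81, ctx and example_complete are assumptions, not usbcore
 * names):
 *
 *	struct urb *urb = usb_alloc_urb(0, GFP_KERNEL);
 *	void *buf = kmalloc(512, GFP_KERNEL);
 *	int ret;
 *
 *	usb_fill_bulk_urb(urb, udev, usb_rcvbulkpipe(udev, 0x81),
 *			  buf, 512, example_complete, ctx);
 *	ret = usb_submit_urb(urb, GFP_KERNEL);
 *	if (ret)
 *		usb_free_urb(urb);
 *
 * On completion the handler inspects urb->status and either resubmits the
 * URB or releases it with usb_free_urb().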
*/ struct urb { /* private: usb core and host controller only fields in the urb */ struct kref kref; /* reference count of the URB */ int unlinked; /* unlink error code */ void *hcpriv; /* private data for host controller */ atomic_t use_count; /* concurrent submissions counter */ atomic_t reject; /* submissions will fail */ /* public: documented fields in the urb that can be used by drivers */ struct list_head urb_list; /* list head for use by the urb's * current owner */ struct list_head anchor_list; /* the URB may be anchored */ struct usb_anchor *anchor; struct usb_device *dev; /* (in) pointer to associated device */ struct usb_host_endpoint *ep; /* (internal) pointer to endpoint */ unsigned int pipe; /* (in) pipe information */ unsigned int stream_id; /* (in) stream ID */ int status; /* (return) non-ISO status */ unsigned int transfer_flags; /* (in) URB_SHORT_NOT_OK | ...*/ void *transfer_buffer; /* (in) associated data buffer */ dma_addr_t transfer_dma; /* (in) dma addr for transfer_buffer */ struct scatterlist *sg; /* (in) scatter gather buffer list */ struct sg_table *sgt; /* (in) scatter gather table for noncoherent buffer */ int num_mapped_sgs; /* (internal) mapped sg entries */ int num_sgs; /* (in) number of entries in the sg list */ u32 transfer_buffer_length; /* (in) data buffer length */ u32 actual_length; /* (return) actual transfer length */ unsigned char *setup_packet; /* (in) setup packet (control only) */ dma_addr_t setup_dma; /* (in) dma addr for setup_packet */ int start_frame; /* (modify) start frame (ISO) */ int number_of_packets; /* (in) number of ISO packets */ int interval; /* (modify) transfer interval * (INT/ISO) */ int error_count; /* (return) number of ISO errors */ void *context; /* (in) context for completion */ usb_complete_t complete; /* (in) completion routine */ struct usb_iso_packet_descriptor iso_frame_desc[]; /* (in) ISO ONLY */ }; /* ----------------------------------------------------------------------- */ /** * usb_fill_control_urb - initializes a control urb * @urb: pointer to the urb to initialize. * @dev: pointer to the struct usb_device for this urb. * @pipe: the endpoint pipe * @setup_packet: pointer to the setup_packet buffer. The buffer must be * suitable for DMA. * @transfer_buffer: pointer to the transfer buffer. The buffer must be * suitable for DMA. * @buffer_length: length of the transfer buffer * @complete_fn: pointer to the usb_complete_t function * @context: what to set the urb context to. * * Initializes a control urb with the proper information needed to submit * it to a device. * * The transfer buffer and the setup_packet buffer will most likely be filled * or read via DMA. The simplest way to get a buffer that can be DMAed to is * allocating it via kmalloc() or equivalent, even for very small buffers. * If the buffers are embedded in a bigger structure, there is a risk that * the buffer itself, the previous fields and/or the next fields are corrupted * due to cache incoherencies; or slowed down if they are evicted from the * cache. For more information, check &struct urb. 
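 *
 * Illustrative only (urb, buf and len are assumed to be set up already; the
 * request shown is a standard GET_DESCRIPTOR): the setup packet is a
 * separately allocated struct usb_ctrlrequest:
 *
 *	struct usb_ctrlrequest *cr = kmalloc(sizeof(*cr), GFP_KERNEL);
 *
 *	cr->bRequestType = USB_DIR_IN;
 *	cr->bRequest = USB_REQ_GET_DESCRIPTOR;
 *	cr->wValue = cpu_to_le16(USB_DT_DEVICE << 8);
 *	cr->wIndex = cpu_to_le16(0);
 *	cr->wLength = cpu_to_le16(len);
 *	usb_fill_control_urb(urb, udev, usb_rcvctrlpipe(udev, 0),
 *			     (unsigned char *)cr, buf, len,
 *			     example_complete, NULL);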
* */ static inline void usb_fill_control_urb(struct urb *urb, struct usb_device *dev, unsigned int pipe, unsigned char *setup_packet, void *transfer_buffer, int buffer_length, usb_complete_t complete_fn, void *context) { urb->dev = dev; urb->pipe = pipe; urb->setup_packet = setup_packet; urb->transfer_buffer = transfer_buffer; urb->transfer_buffer_length = buffer_length; urb->complete = complete_fn; urb->context = context; } /** * usb_fill_bulk_urb - macro to help initialize a bulk urb * @urb: pointer to the urb to initialize. * @dev: pointer to the struct usb_device for this urb. * @pipe: the endpoint pipe * @transfer_buffer: pointer to the transfer buffer. The buffer must be * suitable for DMA. * @buffer_length: length of the transfer buffer * @complete_fn: pointer to the usb_complete_t function * @context: what to set the urb context to. * * Initializes a bulk urb with the proper information needed to submit it * to a device. * * Refer to usb_fill_control_urb() for a description of the requirements for * transfer_buffer. */ static inline void usb_fill_bulk_urb(struct urb *urb, struct usb_device *dev, unsigned int pipe, void *transfer_buffer, int buffer_length, usb_complete_t complete_fn, void *context) { urb->dev = dev; urb->pipe = pipe; urb->transfer_buffer = transfer_buffer; urb->transfer_buffer_length = buffer_length; urb->complete = complete_fn; urb->context = context; } /** * usb_fill_int_urb - macro to help initialize a interrupt urb * @urb: pointer to the urb to initialize. * @dev: pointer to the struct usb_device for this urb. * @pipe: the endpoint pipe * @transfer_buffer: pointer to the transfer buffer. The buffer must be * suitable for DMA. * @buffer_length: length of the transfer buffer * @complete_fn: pointer to the usb_complete_t function * @context: what to set the urb context to. * @interval: what to set the urb interval to, encoded like * the endpoint descriptor's bInterval value. * * Initializes a interrupt urb with the proper information needed to submit * it to a device. * * Refer to usb_fill_control_urb() for a description of the requirements for * transfer_buffer. * * Note that High Speed and SuperSpeed(+) interrupt endpoints use a logarithmic * encoding of the endpoint interval, and express polling intervals in * microframes (eight per millisecond) rather than in frames (one per * millisecond). 
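 *
 * Illustrative only (ep is an already-chosen struct usb_host_endpoint):
 * passing the descriptor's bInterval lets this helper apply the
 * speed-dependent encoding described above:
 *
 *	usb_fill_int_urb(urb, udev,
 *			 usb_rcvintpipe(udev, usb_endpoint_num(&ep->desc)),
 *			 buf, len, example_complete, ctx,
 *			 ep->desc.bInterval);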
*/ static inline void usb_fill_int_urb(struct urb *urb, struct usb_device *dev, unsigned int pipe, void *transfer_buffer, int buffer_length, usb_complete_t complete_fn, void *context, int interval) { urb->dev = dev; urb->pipe = pipe; urb->transfer_buffer = transfer_buffer; urb->transfer_buffer_length = buffer_length; urb->complete = complete_fn; urb->context = context; if (dev->speed == USB_SPEED_HIGH || dev->speed >= USB_SPEED_SUPER) { /* make sure interval is within allowed range */ interval = clamp(interval, 1, 16); urb->interval = 1 << (interval - 1); } else { urb->interval = interval; } urb->start_frame = -1; } extern void usb_init_urb(struct urb *urb); extern struct urb *usb_alloc_urb(int iso_packets, gfp_t mem_flags); extern void usb_free_urb(struct urb *urb); #define usb_put_urb usb_free_urb extern struct urb *usb_get_urb(struct urb *urb); extern int usb_submit_urb(struct urb *urb, gfp_t mem_flags); extern int usb_unlink_urb(struct urb *urb); extern void usb_kill_urb(struct urb *urb); extern void usb_poison_urb(struct urb *urb); extern void usb_unpoison_urb(struct urb *urb); extern void usb_block_urb(struct urb *urb); extern void usb_kill_anchored_urbs(struct usb_anchor *anchor); extern void usb_poison_anchored_urbs(struct usb_anchor *anchor); extern void usb_unpoison_anchored_urbs(struct usb_anchor *anchor); extern void usb_anchor_suspend_wakeups(struct usb_anchor *anchor); extern void usb_anchor_resume_wakeups(struct usb_anchor *anchor); extern void usb_anchor_urb(struct urb *urb, struct usb_anchor *anchor); extern void usb_unanchor_urb(struct urb *urb); extern int usb_wait_anchor_empty_timeout(struct usb_anchor *anchor, unsigned int timeout); extern struct urb *usb_get_from_anchor(struct usb_anchor *anchor); extern void usb_scuttle_anchored_urbs(struct usb_anchor *anchor); extern int usb_anchor_empty(struct usb_anchor *anchor); #define usb_unblock_urb usb_unpoison_urb /** * usb_urb_dir_in - check if an URB describes an IN transfer * @urb: URB to be checked * * Return: 1 if @urb describes an IN transfer (device-to-host), * otherwise 0. */ static inline int usb_urb_dir_in(struct urb *urb) { return (urb->transfer_flags & URB_DIR_MASK) == URB_DIR_IN; } /** * usb_urb_dir_out - check if an URB describes an OUT transfer * @urb: URB to be checked * * Return: 1 if @urb describes an OUT transfer (host-to-device), * otherwise 0. 
*/ static inline int usb_urb_dir_out(struct urb *urb) { return (urb->transfer_flags & URB_DIR_MASK) == URB_DIR_OUT; } int usb_pipe_type_check(struct usb_device *dev, unsigned int pipe); int usb_urb_ep_type_check(const struct urb *urb); void *usb_alloc_coherent(struct usb_device *dev, size_t size, gfp_t mem_flags, dma_addr_t *dma); void usb_free_coherent(struct usb_device *dev, size_t size, void *addr, dma_addr_t dma); enum dma_data_direction; void *usb_alloc_noncoherent(struct usb_device *dev, size_t size, gfp_t mem_flags, dma_addr_t *dma, enum dma_data_direction dir, struct sg_table **table); void usb_free_noncoherent(struct usb_device *dev, size_t size, void *addr, enum dma_data_direction dir, struct sg_table *table); /*-------------------------------------------------------------------* * SYNCHRONOUS CALL SUPPORT * *-------------------------------------------------------------------*/ /* Maximum value allowed for timeout in synchronous routines below */ #define USB_MAX_SYNCHRONOUS_TIMEOUT 60000 /* ms */ extern int usb_control_msg(struct usb_device *dev, unsigned int pipe, __u8 request, __u8 requesttype, __u16 value, __u16 index, void *data, __u16 size, int timeout); extern int usb_interrupt_msg(struct usb_device *usb_dev, unsigned int pipe, void *data, int len, int *actual_length, int timeout); extern int usb_bulk_msg(struct usb_device *usb_dev, unsigned int pipe, void *data, int len, int *actual_length, int timeout); extern int usb_bulk_msg_killable(struct usb_device *usb_dev, unsigned int pipe, void *data, int len, int *actual_length, int timeout); /* wrappers around usb_control_msg() for the most common standard requests */ int usb_control_msg_send(struct usb_device *dev, __u8 endpoint, __u8 request, __u8 requesttype, __u16 value, __u16 index, const void *data, __u16 size, int timeout, gfp_t memflags); int usb_control_msg_recv(struct usb_device *dev, __u8 endpoint, __u8 request, __u8 requesttype, __u16 value, __u16 index, void *data, __u16 size, int timeout, gfp_t memflags); extern int usb_get_descriptor(struct usb_device *dev, unsigned char desctype, unsigned char descindex, void *buf, int size); extern int usb_get_status(struct usb_device *dev, int recip, int type, int target, void *data); static inline int usb_get_std_status(struct usb_device *dev, int recip, int target, void *data) { return usb_get_status(dev, recip, USB_STATUS_TYPE_STANDARD, target, data); } static inline int usb_get_ptm_status(struct usb_device *dev, void *data) { return usb_get_status(dev, USB_RECIP_DEVICE, USB_STATUS_TYPE_PTM, 0, data); } extern int usb_string(struct usb_device *dev, int index, char *buf, size_t size); extern char *usb_cache_string(struct usb_device *udev, int index); /* wrappers that also update important state inside usbcore */ extern int usb_clear_halt(struct usb_device *dev, int pipe); extern int usb_reset_configuration(struct usb_device *dev); extern int usb_set_interface(struct usb_device *dev, int ifnum, int alternate); extern void usb_reset_endpoint(struct usb_device *dev, unsigned int epaddr); /* this request isn't really synchronous, but it belongs with the others */ extern int usb_driver_set_configuration(struct usb_device *udev, int config); /* choose and set configuration for device */ extern int usb_choose_configuration(struct usb_device *udev); extern int usb_set_configuration(struct usb_device *dev, int configuration); /* * timeouts, in milliseconds, used for sending/receiving control messages * they typically complete within a few frames (msec) after they're issued * USB 
identifies 5 second timeouts, maybe more in a few cases, and a few * slow devices (like some MGE Ellipse UPSes) actually push that limit. */ #define USB_CTRL_GET_TIMEOUT 5000 #define USB_CTRL_SET_TIMEOUT 5000 /** * struct usb_sg_request - support for scatter/gather I/O * @status: zero indicates success, else negative errno * @bytes: counts bytes transferred. * * These requests are initialized using usb_sg_init(), and then are used * as request handles passed to usb_sg_wait() or usb_sg_cancel(). Most * members of the request object aren't for driver access. * * The status and bytecount values are valid only after usb_sg_wait() * returns. If the status is zero, then the bytecount matches the total * from the request. * * After an error completion, drivers may need to clear a halt condition * on the endpoint. */ struct usb_sg_request { int status; size_t bytes; /* private: * members below are private to usbcore, * and are not provided for driver access! */ spinlock_t lock; struct usb_device *dev; int pipe; int entries; struct urb **urbs; int count; struct completion complete; }; int usb_sg_init( struct usb_sg_request *io, struct usb_device *dev, unsigned pipe, unsigned period, struct scatterlist *sg, int nents, size_t length, gfp_t mem_flags ); void usb_sg_cancel(struct usb_sg_request *io); void usb_sg_wait(struct usb_sg_request *io); /* ----------------------------------------------------------------------- */ /* * For various legacy reasons, Linux has a small cookie that's paired with * a struct usb_device to identify an endpoint queue. Queue characteristics * are defined by the endpoint's descriptor. This cookie is called a "pipe", * an unsigned int encoded as: * * - direction: bit 7 (0 = Host-to-Device [Out], * 1 = Device-to-Host [In] ... * like endpoint bEndpointAddress) * - device address: bits 8-14 ... bit positions known to uhci-hcd * - endpoint: bits 15-18 ... bit positions known to uhci-hcd * - pipe type: bits 30-31 (00 = isochronous, 01 = interrupt, * 10 = control, 11 = bulk) * * Given the device address and endpoint descriptor, pipes are redundant. */ /* NOTE: these are not the standard USB_ENDPOINT_XFER_* values!! */ /* (yet ... they're the values used by usbfs) */ #define PIPE_ISOCHRONOUS 0 #define PIPE_INTERRUPT 1 #define PIPE_CONTROL 2 #define PIPE_BULK 3 #define usb_pipein(pipe) ((pipe) & USB_DIR_IN) #define usb_pipeout(pipe) (!usb_pipein(pipe)) #define usb_pipedevice(pipe) (((pipe) >> 8) & 0x7f) #define usb_pipeendpoint(pipe) (((pipe) >> 15) & 0xf) #define usb_pipetype(pipe) (((pipe) >> 30) & 3) #define usb_pipeisoc(pipe) (usb_pipetype((pipe)) == PIPE_ISOCHRONOUS) #define usb_pipeint(pipe) (usb_pipetype((pipe)) == PIPE_INTERRUPT) #define usb_pipecontrol(pipe) (usb_pipetype((pipe)) == PIPE_CONTROL) #define usb_pipebulk(pipe) (usb_pipetype((pipe)) == PIPE_BULK) static inline unsigned int __create_pipe(struct usb_device *dev, unsigned int endpoint) { return (dev->devnum << 8) | (endpoint << 15); } /* Create various pipes... 
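 *
 * For example (illustrative; buf and len are assumed), a synchronous bulk
 * write to OUT endpoint 2 pairs one of these macros with usb_bulk_msg():
 *
 *	int actual = 0;
 *	int ret = usb_bulk_msg(udev, usb_sndbulkpipe(udev, 2),
 *			       buf, len, &actual, 1000);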
*/ #define usb_sndctrlpipe(dev, endpoint) \ ((PIPE_CONTROL << 30) | __create_pipe(dev, endpoint)) #define usb_rcvctrlpipe(dev, endpoint) \ ((PIPE_CONTROL << 30) | __create_pipe(dev, endpoint) | USB_DIR_IN) #define usb_sndisocpipe(dev, endpoint) \ ((PIPE_ISOCHRONOUS << 30) | __create_pipe(dev, endpoint)) #define usb_rcvisocpipe(dev, endpoint) \ ((PIPE_ISOCHRONOUS << 30) | __create_pipe(dev, endpoint) | USB_DIR_IN) #define usb_sndbulkpipe(dev, endpoint) \ ((PIPE_BULK << 30) | __create_pipe(dev, endpoint)) #define usb_rcvbulkpipe(dev, endpoint) \ ((PIPE_BULK << 30) | __create_pipe(dev, endpoint) | USB_DIR_IN) #define usb_sndintpipe(dev, endpoint) \ ((PIPE_INTERRUPT << 30) | __create_pipe(dev, endpoint)) #define usb_rcvintpipe(dev, endpoint) \ ((PIPE_INTERRUPT << 30) | __create_pipe(dev, endpoint) | USB_DIR_IN) static inline struct usb_host_endpoint * usb_pipe_endpoint(struct usb_device *dev, unsigned int pipe) { struct usb_host_endpoint **eps; eps = usb_pipein(pipe) ? dev->ep_in : dev->ep_out; return eps[usb_pipeendpoint(pipe)]; } static inline u16 usb_maxpacket(struct usb_device *udev, int pipe) { struct usb_host_endpoint *ep = usb_pipe_endpoint(udev, pipe); if (!ep) return 0; /* NOTE: only 0x07ff bits are for packet size... */ return usb_endpoint_maxp(&ep->desc); } u32 usb_endpoint_max_periodic_payload(struct usb_device *udev, const struct usb_host_endpoint *ep); bool usb_endpoint_is_hs_isoc_double(struct usb_device *udev, const struct usb_host_endpoint *ep); /* translate USB error codes to codes user space understands */ static inline int usb_translate_errors(int error_code) { switch (error_code) { case 0: case -ENOMEM: case -ENODEV: case -EOPNOTSUPP: return error_code; case -ENOSPC: return -EBUSY; default: return -EIO; } } /* Events from the usb core */ #define USB_DEVICE_ADD 0x0001 #define USB_DEVICE_REMOVE 0x0002 #define USB_BUS_ADD 0x0003 #define USB_BUS_REMOVE 0x0004 extern void usb_register_notify(struct notifier_block *nb); extern void usb_unregister_notify(struct notifier_block *nb); /* debugfs stuff */ extern struct dentry *usb_debug_root; /* LED triggers */ enum usb_led_event { USB_LED_EVENT_HOST = 0, USB_LED_EVENT_GADGET = 1, }; #ifdef CONFIG_USB_LED_TRIG extern void usb_led_activity(enum usb_led_event ev); #else static inline void usb_led_activity(enum usb_led_event ev) {} #endif #endif /* __KERNEL__ */ #endif |
| 19 19 21 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 1994 Linus Torvalds * * Pentium III FXSR, SSE support * General FPU state handling cleanups * Gareth Hughes <gareth@valinux.com>, May 2000 * x86-64 work by Andi Kleen 2002 */ #ifndef _ASM_X86_FPU_API_H #define _ASM_X86_FPU_API_H #include <linux/bottom_half.h> #include <asm/fpu/types.h> /* * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It * disables preemption and softirq processing, so be careful if you intend to * use it for long periods of time. Kernel-mode FPU cannot be used in all * contexts -- see irq_fpu_usable() for details. */ /* Kernel FPU states to initialize in kernel_fpu_begin_mask() */ #define KFPU_387 _BITUL(0) /* 387 state will be initialized */ #define KFPU_MXCSR _BITUL(1) /* MXCSR will be initialized */ extern void kernel_fpu_begin_mask(unsigned int kfpu_mask); extern void kernel_fpu_end(void); extern bool irq_fpu_usable(void); extern void fpregs_mark_activate(void); /* Code that is unaware of kernel_fpu_begin_mask() can use this */ static inline void kernel_fpu_begin(void) { #ifdef CONFIG_X86_64 /* * Any 64-bit code that uses 387 instructions must explicitly request * KFPU_387. */ kernel_fpu_begin_mask(KFPU_MXCSR); #else /* * 32-bit kernel code may use 387 operations as well as SSE2, etc, * as long as it checks that the CPU has the required capability. */ kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR); #endif } /* * Use fpregs_lock() while editing CPU's FPU registers or fpu->fpstate, or while * using the FPU in kernel mode. A context switch will (and softirq might) save * CPU's FPU registers to fpu->fpstate.regs and set TIF_NEED_FPU_LOAD leaving * CPU's FPU registers in a random state. * * local_bh_disable() protects against both preemption and soft interrupts * on !RT kernels. * * On RT kernels local_bh_disable() is not sufficient because it only * serializes soft interrupt related sections via a local lock, but stays * preemptible. Disabling preemption is the right choice here as bottom * half processing is always in thread context on RT kernels so it * implicitly prevents bottom half processing as well. */ static inline void fpregs_lock(void) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_bh_disable(); else preempt_disable(); } static inline void fpregs_unlock(void) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_bh_enable(); else preempt_enable(); } /* * FPU state gets lazily restored before returning to userspace. So when in the * kernel, the valid FPU state may be kept in the buffer. This function will force * restore all the fpu state to the registers early if needed, and lock them from * being automatically saved/restored. Then FPU state can be modified safely in the * registers, before unlocking with fpregs_unlock(). 
*/ void fpregs_lock_and_load(void); #ifdef CONFIG_X86_DEBUG_FPU extern void fpregs_assert_state_consistent(void); #else static inline void fpregs_assert_state_consistent(void) { } #endif /* * Load the task FPU state before returning to userspace. */ extern void switch_fpu_return(void); /* * Query the presence of one or more xfeatures. Works on any legacy CPU as well. * * If 'feature_name' is set then put a human-readable description of * the feature there as well - this can be used to print error (or success) * messages. */ extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name); /* Trap handling */ extern int fpu__exception_code(struct fpu *fpu, int trap_nr); extern void fpu_sync_fpstate(struct fpu *fpu); extern void fpu_reset_from_exception_fixup(void); /* Boot, hotplug and resume */ extern void fpu__init_cpu(void); extern void fpu__init_system(void); extern void fpu__init_check_bugs(void); extern void fpu__resume_cpu(void); #ifdef CONFIG_MATH_EMULATION extern void fpstate_init_soft(struct swregs_state *soft); #else static inline void fpstate_init_soft(struct swregs_state *soft) {} #endif /* State tracking */ DECLARE_PER_CPU(bool, kernel_fpu_allowed); DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); /* Process cleanup */ #ifdef CONFIG_X86_64 extern void fpstate_free(struct fpu *fpu); #else static inline void fpstate_free(struct fpu *fpu) { } #endif /* fpstate-related functions which are exported to KVM */ extern void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeature); extern u64 xstate_get_guest_group_perm(void); extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); /* KVM specific functions */ extern bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu); extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu); extern int fpu_swap_kvm_fpstate(struct fpu_guest *gfpu, bool enter_guest); extern int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures); #ifdef CONFIG_X86_64 extern void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd); extern void fpu_sync_guest_vmexit_xfd_state(void); #else static inline void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { } static inline void fpu_sync_guest_vmexit_xfd_state(void) { } #endif extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u64 xfeatures, u32 pkru); extern int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru); static inline void fpstate_set_confidential(struct fpu_guest *gfpu) { gfpu->fpstate->is_confidential = true; } static inline bool fpstate_is_confidential(struct fpu_guest *gfpu) { return gfpu->fpstate->is_confidential; } /* prctl */ extern long fpu_xstate_prctl(int option, unsigned long arg2); extern void fpu_idle_fpregs(void); #endif /* _ASM_X86_FPU_API_H */ |
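/*
 * Illustrative sketch of the usage pattern documented above: in-kernel
 * FPU/SIMD work is bracketed by kernel_fpu_begin()/kernel_fpu_end(), and
 * callers that might run from unusual contexts check irq_fpu_usable()
 * first.  example_simd_work() is a hypothetical stand-in for real
 * SSE/AVX code.
 */
static void example_simd_work(void)
{
	/* hypothetical vectorized routine */
}

static void example_use_fpu(void)
{
	if (!irq_fpu_usable())
		return;		/* fall back to a scalar code path */

	kernel_fpu_begin();	/* disables preemption and softirqs */
	example_simd_work();
	kernel_fpu_end();
}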
| 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the UDP protocol. * * Version: @(#)udp.h 1.0.2 04/28/93 * * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_UDP_H #define _LINUX_UDP_H #include <net/inet_sock.h> #include <linux/skbuff.h> #include <net/netns/hash.h> #include <uapi/linux/udp.h> static inline struct udphdr *udp_hdr(const struct sk_buff *skb) { return (struct udphdr *)skb_transport_header(skb); } #define UDP_HTABLE_SIZE_MIN_PERNET 128 #define UDP_HTABLE_SIZE_MIN (IS_ENABLED(CONFIG_BASE_SMALL) ? 128 : 256) #define UDP_HTABLE_SIZE_MAX 65536 static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask) { return (num + net_hash_mix(net)) & mask; } enum { UDP_FLAGS_CORK, /* Cork is required */ UDP_FLAGS_NO_CHECK6_TX, /* Send zero UDP6 checksums on TX? */ UDP_FLAGS_NO_CHECK6_RX, /* Allow zero UDP6 checksums on RX? */ UDP_FLAGS_GRO_ENABLED, /* Request GRO aggregation */ UDP_FLAGS_ACCEPT_FRAGLIST, UDP_FLAGS_ACCEPT_L4, UDP_FLAGS_ENCAP_ENABLED, /* This socket enabled encap */ }; /* per NUMA structure for lockless producer usage. */ struct udp_prod_queue { struct llist_head ll_root ____cacheline_aligned_in_smp; atomic_t rmem_alloc; }; struct udp_sock { /* inet_sock has to be the first member */ struct inet_sock inet; #define udp_port_hash inet.sk.__sk_common.skc_u16hashes[0] #define udp_portaddr_hash inet.sk.__sk_common.skc_u16hashes[1] #define udp_portaddr_node inet.sk.__sk_common.skc_portaddr_node unsigned long udp_flags; int pending; /* Any pending frames ? */ __u8 encap_type; /* Is this an Encapsulation socket? */ #if !IS_ENABLED(CONFIG_BASE_SMALL) /* For UDP 4-tuple hash */ __u16 udp_lrpa_hash; struct hlist_nulls_node udp_lrpa_node; #endif /* * Following member retains the information to create a UDP header * when the socket is uncorked. */ __u16 len; /* total length of pending frames */ __u16 gso_size; /* * For encapsulation sockets. 
*/ int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); void (*encap_err_rcv)(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload); int (*encap_err_lookup)(struct sock *sk, struct sk_buff *skb); void (*encap_destroy)(struct sock *sk); /* GRO functions for UDP socket */ struct sk_buff * (*gro_receive)(struct sock *sk, struct list_head *head, struct sk_buff *skb); int (*gro_complete)(struct sock *sk, struct sk_buff *skb, int nhoff); struct udp_prod_queue *udp_prod_queue; /* udp_recvmsg try to use this before splicing sk_receive_queue */ struct sk_buff_head reader_queue ____cacheline_aligned_in_smp; /* This field is dirtied by udp_recvmsg() */ int forward_deficit; /* This fields follows rcvbuf value, and is touched by udp_recvmsg */ int forward_threshold; /* Cache friendly copy of sk->sk_peek_off >= 0 */ bool peeking_with_offset; /* * Accounting for the tunnel GRO fastpath. * Unprotected by compilers guard, as it uses space available in * the last UDP socket cacheline. */ struct hlist_node tunnel_list; struct numa_drop_counters drop_counters; }; #define udp_test_bit(nr, sk) \ test_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags) #define udp_set_bit(nr, sk) \ set_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags) #define udp_test_and_set_bit(nr, sk) \ test_and_set_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags) #define udp_clear_bit(nr, sk) \ clear_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags) #define udp_assign_bit(nr, sk, val) \ assign_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags, val) #define UDP_MAX_SEGMENTS (1 << 7UL) #define udp_sk(ptr) container_of_const(ptr, struct udp_sock, inet.sk) static inline int udp_set_peek_off(struct sock *sk, int val) { sk_set_peek_off(sk, val); WRITE_ONCE(udp_sk(sk)->peeking_with_offset, val >= 0); return 0; } static inline void udp_set_no_check6_tx(struct sock *sk, bool val) { udp_assign_bit(NO_CHECK6_TX, sk, val); } static inline void udp_set_no_check6_rx(struct sock *sk, bool val) { udp_assign_bit(NO_CHECK6_RX, sk, val); } static inline bool udp_get_no_check6_tx(const struct sock *sk) { return udp_test_bit(NO_CHECK6_TX, sk); } static inline bool udp_get_no_check6_rx(const struct sock *sk) { return udp_test_bit(NO_CHECK6_RX, sk); } static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { int gso_size; if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) { gso_size = skb_shinfo(skb)->gso_size; put_cmsg(msg, SOL_UDP, UDP_GRO, sizeof(gso_size), &gso_size); } } DECLARE_STATIC_KEY_FALSE(udp_encap_needed_key); #if IS_ENABLED(CONFIG_IPV6) DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key); #endif static inline bool udp_encap_needed(void) { if (static_branch_unlikely(&udp_encap_needed_key)) return true; #if IS_ENABLED(CONFIG_IPV6) if (static_branch_unlikely(&udpv6_encap_needed_key)) return true; #endif return false; } static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb) { if (!skb_is_gso(skb)) return false; if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && !udp_test_bit(ACCEPT_L4, sk)) return true; if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST && !udp_test_bit(ACCEPT_FRAGLIST, sk)) return true; /* GSO packets lacking the SKB_GSO_UDP_TUNNEL/_CSUM bits might still * land in a tunnel as the socket check in udp_gro_receive cannot be * foolproof. 
*/ if (udp_encap_needed() && READ_ONCE(udp_sk(sk)->encap_rcv) && !(skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM))) return true; return false; } static inline void udp_allow_gso(struct sock *sk) { udp_set_bit(ACCEPT_L4, sk); udp_set_bit(ACCEPT_FRAGLIST, sk); } #define udp_portaddr_for_each_entry(__sk, list) \ hlist_for_each_entry(__sk, list, __sk_common.skc_portaddr_node) #define udp_portaddr_for_each_entry_from(__sk) \ hlist_for_each_entry_from(__sk, __sk_common.skc_portaddr_node) #define udp_portaddr_for_each_entry_rcu(__sk, list) \ hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node) #if !IS_ENABLED(CONFIG_BASE_SMALL) #define udp_lrpa_for_each_entry_rcu(__up, node, list) \ hlist_nulls_for_each_entry_rcu(__up, node, list, udp_lrpa_node) #endif static inline struct sock *udp_tunnel_sk(const struct net *net, bool is_ipv6) { #if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) return rcu_dereference(net->ipv4.udp_tunnel_gro[is_ipv6].sk); #else return NULL; #endif } #endif /* _LINUX_UDP_H */ |
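/*
 * Illustrative sketch (not part of this header): the per-socket UDP flags
 * above are plain bits in udp_flags, so enabling or querying a feature
 * such as GRO reduces to the atomic bitops wrapped by udp_set_bit() and
 * udp_test_bit().  The function names are hypothetical.
 */
static void example_enable_udp_gro(struct sock *sk)
{
	udp_set_bit(GRO_ENABLED, sk);		/* UDP_FLAGS_GRO_ENABLED */
}

static bool example_udp_gro_enabled(const struct sock *sk)
{
	return udp_test_bit(GRO_ENABLED, sk);
}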
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SIGNAL_H #define _LINUX_SIGNAL_H #include <linux/bug.h> #include <linux/list.h> #include <linux/signal_types.h> #include <linux/string.h> struct task_struct; /* for sysctl */ extern int print_fatal_signals; static inline void copy_siginfo(kernel_siginfo_t *to, const kernel_siginfo_t *from) { memcpy(to, from, sizeof(*to)); } static inline void clear_siginfo(kernel_siginfo_t *info) { memset(info, 0, sizeof(*info)); } #define SI_EXPANSION_SIZE (sizeof(struct siginfo) - sizeof(struct kernel_siginfo)) static inline void copy_siginfo_to_external(siginfo_t *to, const kernel_siginfo_t *from) { memcpy(to, from, sizeof(*from)); memset(((char *)to) + sizeof(struct kernel_siginfo), 0, SI_EXPANSION_SIZE); } int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from); int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from); enum siginfo_layout { SIL_KILL, SIL_TIMER, SIL_POLL, SIL_FAULT, SIL_FAULT_TRAPNO, SIL_FAULT_MCEERR, SIL_FAULT_BNDERR, SIL_FAULT_PKUERR, SIL_FAULT_PERF_EVENT, SIL_CHLD, SIL_RT, SIL_SYS, }; enum siginfo_layout siginfo_layout(unsigned sig, int si_code); /* * Define some primitives to manipulate sigset_t. */ #ifndef __HAVE_ARCH_SIG_BITOPS #include <linux/bitops.h> /* We don't use <linux/bitops.h> for these because there is no need to be atomic. 
*/ static inline void sigaddset(sigset_t *set, int _sig) { unsigned long sig = _sig - 1; if (_NSIG_WORDS == 1) set->sig[0] |= 1UL << sig; else set->sig[sig / _NSIG_BPW] |= 1UL << (sig % _NSIG_BPW); } static inline void sigdelset(sigset_t *set, int _sig) { unsigned long sig = _sig - 1; if (_NSIG_WORDS == 1) set->sig[0] &= ~(1UL << sig); else set->sig[sig / _NSIG_BPW] &= ~(1UL << (sig % _NSIG_BPW)); } static inline int sigismember(sigset_t *set, int _sig) { unsigned long sig = _sig - 1; if (_NSIG_WORDS == 1) return 1 & (set->sig[0] >> sig); else return 1 & (set->sig[sig / _NSIG_BPW] >> (sig % _NSIG_BPW)); } #endif /* __HAVE_ARCH_SIG_BITOPS */ static inline int sigisemptyset(sigset_t *set) { switch (_NSIG_WORDS) { case 4: return (set->sig[3] | set->sig[2] | set->sig[1] | set->sig[0]) == 0; case 2: return (set->sig[1] | set->sig[0]) == 0; case 1: return set->sig[0] == 0; default: BUILD_BUG(); return 0; } } static inline int sigequalsets(const sigset_t *set1, const sigset_t *set2) { switch (_NSIG_WORDS) { case 4: return (set1->sig[3] == set2->sig[3]) && (set1->sig[2] == set2->sig[2]) && (set1->sig[1] == set2->sig[1]) && (set1->sig[0] == set2->sig[0]); case 2: return (set1->sig[1] == set2->sig[1]) && (set1->sig[0] == set2->sig[0]); case 1: return set1->sig[0] == set2->sig[0]; } return 0; } #define sigmask(sig) (1UL << ((sig) - 1)) #ifndef __HAVE_ARCH_SIG_SETOPS #define _SIG_SET_BINOP(name, op) \ static inline void name(sigset_t *r, const sigset_t *a, const sigset_t *b) \ { \ unsigned long a0, a1, a2, a3, b0, b1, b2, b3; \ \ switch (_NSIG_WORDS) { \ case 4: \ a3 = a->sig[3]; a2 = a->sig[2]; \ b3 = b->sig[3]; b2 = b->sig[2]; \ r->sig[3] = op(a3, b3); \ r->sig[2] = op(a2, b2); \ fallthrough; \ case 2: \ a1 = a->sig[1]; b1 = b->sig[1]; \ r->sig[1] = op(a1, b1); \ fallthrough; \ case 1: \ a0 = a->sig[0]; b0 = b->sig[0]; \ r->sig[0] = op(a0, b0); \ break; \ default: \ BUILD_BUG(); \ } \ } #define _sig_or(x,y) ((x) | (y)) _SIG_SET_BINOP(sigorsets, _sig_or) #define _sig_and(x,y) ((x) & (y)) _SIG_SET_BINOP(sigandsets, _sig_and) #define _sig_andn(x,y) ((x) & ~(y)) _SIG_SET_BINOP(sigandnsets, _sig_andn) #undef _SIG_SET_BINOP #undef _sig_or #undef _sig_and #undef _sig_andn #define _SIG_SET_OP(name, op) \ static inline void name(sigset_t *set) \ { \ switch (_NSIG_WORDS) { \ case 4: set->sig[3] = op(set->sig[3]); \ set->sig[2] = op(set->sig[2]); \ fallthrough; \ case 2: set->sig[1] = op(set->sig[1]); \ fallthrough; \ case 1: set->sig[0] = op(set->sig[0]); \ break; \ default: \ BUILD_BUG(); \ } \ } #define _sig_not(x) (~(x)) _SIG_SET_OP(signotset, _sig_not) #undef _SIG_SET_OP #undef _sig_not static inline void sigemptyset(sigset_t *set) { switch (_NSIG_WORDS) { default: memset(set, 0, sizeof(sigset_t)); break; case 2: set->sig[1] = 0; fallthrough; case 1: set->sig[0] = 0; break; } } static inline void sigfillset(sigset_t *set) { switch (_NSIG_WORDS) { default: memset(set, -1, sizeof(sigset_t)); break; case 2: set->sig[1] = -1; fallthrough; case 1: set->sig[0] = -1; break; } } /* Some extensions for manipulating the low 32 signals in particular. 
*/ static inline void sigaddsetmask(sigset_t *set, unsigned long mask) { set->sig[0] |= mask; } static inline void sigdelsetmask(sigset_t *set, unsigned long mask) { set->sig[0] &= ~mask; } static inline int sigtestsetmask(sigset_t *set, unsigned long mask) { return (set->sig[0] & mask) != 0; } static inline void siginitset(sigset_t *set, unsigned long mask) { set->sig[0] = mask; switch (_NSIG_WORDS) { default: memset(&set->sig[1], 0, sizeof(long)*(_NSIG_WORDS-1)); break; case 2: set->sig[1] = 0; break; case 1: ; } } static inline void siginitsetinv(sigset_t *set, unsigned long mask) { set->sig[0] = ~mask; switch (_NSIG_WORDS) { default: memset(&set->sig[1], -1, sizeof(long)*(_NSIG_WORDS-1)); break; case 2: set->sig[1] = -1; break; case 1: ; } } #endif /* __HAVE_ARCH_SIG_SETOPS */ static inline void init_sigpending(struct sigpending *sig) { sigemptyset(&sig->signal); INIT_LIST_HEAD(&sig->list); } extern void flush_sigqueue(struct sigpending *queue); /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */ static inline int valid_signal(unsigned long sig) { return sig <= _NSIG ? 1 : 0; } struct timespec; struct pt_regs; enum pid_type; extern int next_signal(struct sigpending *pending, sigset_t *mask); extern int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type); extern int group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type); extern int send_signal_locked(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type); extern int sigprocmask(int, sigset_t *, sigset_t *); extern void set_current_blocked(sigset_t *); extern void __set_current_blocked(const sigset_t *); extern int show_unhandled_signals; extern bool get_signal(struct ksignal *ksig); extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping); extern void exit_signals(struct task_struct *tsk); extern void kernel_sigaction(int, __sighandler_t); #define SIG_KTHREAD ((__force __sighandler_t)2) #define SIG_KTHREAD_KERNEL ((__force __sighandler_t)3) static inline void allow_signal(int sig) { /* * Kernel threads handle their own signals. Let the signal code * know it'll be handled, so that they don't get converted to * SIGKILL or just silently dropped. */ kernel_sigaction(sig, SIG_KTHREAD); } static inline void allow_kernel_signal(int sig) { /* * Kernel threads handle their own signals. Let the signal code * know signals sent by the kernel will be handled, so that they * don't get silently dropped. */ kernel_sigaction(sig, SIG_KTHREAD_KERNEL); } static inline void disallow_signal(int sig) { kernel_sigaction(sig, SIG_IGN); } extern struct kmem_cache *sighand_cachep; extern bool unhandled_signal(struct task_struct *tsk, int sig); /* * In POSIX a signal is sent either to a specific thread (Linux task) * or to the process as a whole (Linux thread group). How the signal * is sent determines whether it's to one thread or the whole group, * which determines which signal mask(s) are involved in blocking it * from being delivered until later. When the signal is delivered, * either it's caught or ignored by a user handler or it has a default * effect that applies to the whole thread group (POSIX process). * * The possible effects an unblocked signal set to SIG_DFL can have are: * ignore - Nothing Happens * terminate - kill the process, i.e. all threads in the group, * similar to exit_group. The group leader (only) reports * WIFSIGNALED status to its parent. 
* coredump - write a core dump file describing all threads using * the same mm and then kill all those threads * stop - stop all the threads in the group, i.e. TASK_STOPPED state * * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored. * Other signals when not blocked and set to SIG_DFL behaves as follows. * The job control signals also have other special effects. * * +--------------------+------------------+ * | POSIX signal | default action | * +--------------------+------------------+ * | SIGHUP | terminate | * | SIGINT | terminate | * | SIGQUIT | coredump | * | SIGILL | coredump | * | SIGTRAP | coredump | * | SIGABRT/SIGIOT | coredump | * | SIGBUS | coredump | * | SIGFPE | coredump | * | SIGKILL | terminate(+) | * | SIGUSR1 | terminate | * | SIGSEGV | coredump | * | SIGUSR2 | terminate | * | SIGPIPE | terminate | * | SIGALRM | terminate | * | SIGTERM | terminate | * | SIGCHLD | ignore | * | SIGCONT | ignore(*) | * | SIGSTOP | stop(*)(+) | * | SIGTSTP | stop(*) | * | SIGTTIN | stop(*) | * | SIGTTOU | stop(*) | * | SIGURG | ignore | * | SIGXCPU | coredump | * | SIGXFSZ | coredump | * | SIGVTALRM | terminate | * | SIGPROF | terminate | * | SIGPOLL/SIGIO | terminate | * | SIGSYS/SIGUNUSED | coredump | * | SIGSTKFLT | terminate | * | SIGWINCH | ignore | * | SIGPWR | terminate | * | SIGRTMIN-SIGRTMAX | terminate | * +--------------------+------------------+ * | non-POSIX signal | default action | * +--------------------+------------------+ * | SIGEMT | coredump | * +--------------------+------------------+ * * (+) For SIGKILL and SIGSTOP the action is "always", not just "default". * (*) Special job control effects: * When SIGCONT is sent, it resumes the process (all threads in the group) * from TASK_STOPPED state and also clears any pending/queued stop signals * (any of those marked with "stop(*)"). This happens regardless of blocking, * catching, or ignoring SIGCONT. When any stop signal is sent, it clears * any pending/queued SIGCONT signals; this happens regardless of blocking, * catching, or ignored the stop signal, though (except for SIGSTOP) the * default action of stopping the process may happen later or never. 
*/ #ifdef SIGEMT #define SIGEMT_MASK rt_sigmask(SIGEMT) #else #define SIGEMT_MASK 0 #endif #if SIGRTMIN > BITS_PER_LONG #define rt_sigmask(sig) (1ULL << ((sig)-1)) #else #define rt_sigmask(sig) sigmask(sig) #endif #define siginmask(sig, mask) \ ((sig) > 0 && (sig) < SIGRTMIN && (rt_sigmask(sig) & (mask))) #define SIG_KERNEL_ONLY_MASK (\ rt_sigmask(SIGKILL) | rt_sigmask(SIGSTOP)) #define SIG_KERNEL_STOP_MASK (\ rt_sigmask(SIGSTOP) | rt_sigmask(SIGTSTP) | \ rt_sigmask(SIGTTIN) | rt_sigmask(SIGTTOU) ) #define SIG_KERNEL_COREDUMP_MASK (\ rt_sigmask(SIGQUIT) | rt_sigmask(SIGILL) | \ rt_sigmask(SIGTRAP) | rt_sigmask(SIGABRT) | \ rt_sigmask(SIGFPE) | rt_sigmask(SIGSEGV) | \ rt_sigmask(SIGBUS) | rt_sigmask(SIGSYS) | \ rt_sigmask(SIGXCPU) | rt_sigmask(SIGXFSZ) | \ SIGEMT_MASK ) #define SIG_KERNEL_IGNORE_MASK (\ rt_sigmask(SIGCONT) | rt_sigmask(SIGCHLD) | \ rt_sigmask(SIGWINCH) | rt_sigmask(SIGURG) ) #define SIG_SPECIFIC_SICODES_MASK (\ rt_sigmask(SIGILL) | rt_sigmask(SIGFPE) | \ rt_sigmask(SIGSEGV) | rt_sigmask(SIGBUS) | \ rt_sigmask(SIGTRAP) | rt_sigmask(SIGCHLD) | \ rt_sigmask(SIGPOLL) | rt_sigmask(SIGSYS) | \ SIGEMT_MASK ) #define sig_kernel_only(sig) siginmask(sig, SIG_KERNEL_ONLY_MASK) #define sig_kernel_coredump(sig) siginmask(sig, SIG_KERNEL_COREDUMP_MASK) #define sig_kernel_ignore(sig) siginmask(sig, SIG_KERNEL_IGNORE_MASK) #define sig_kernel_stop(sig) siginmask(sig, SIG_KERNEL_STOP_MASK) #define sig_specific_sicodes(sig) siginmask(sig, SIG_SPECIFIC_SICODES_MASK) #define sig_fatal(t, signr) \ (!siginmask(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \ (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL) void signals_init(void); int restore_altstack(const stack_t __user *); int __save_altstack(stack_t __user *, unsigned long); #define unsafe_save_altstack(uss, sp, label) do { \ stack_t __user *__uss = uss; \ struct task_struct *t = current; \ unsafe_put_user((void __user *)t->sas_ss_sp, &__uss->ss_sp, label); \ unsafe_put_user(t->sas_ss_flags, &__uss->ss_flags, label); \ unsafe_put_user(t->sas_ss_size, &__uss->ss_size, label); \ } while (0); #ifdef CONFIG_DYNAMIC_SIGFRAME bool sigaltstack_size_valid(size_t ss_size); #else static inline bool sigaltstack_size_valid(size_t size) { return true; } #endif /* !CONFIG_DYNAMIC_SIGFRAME */ #ifdef CONFIG_PROC_FS struct seq_file; extern void render_sigset_t(struct seq_file *, const char *, sigset_t *); #endif #ifndef arch_untagged_si_addr /* * Given a fault address and a signal and si_code which correspond to the * _sigfault union member, returns the address that must appear in si_addr if * the signal handler does not have SA_EXPOSE_TAGBITS enabled in sa_flags. */ static inline void __user *arch_untagged_si_addr(void __user *addr, unsigned long sig, unsigned long si_code) { return addr; } #endif #endif /* _LINUX_SIGNAL_H */ |
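/*
 * Illustrative sketch using the sigset primitives defined above: build a
 * mask containing SIGINT and SIGTERM with siginitset()/sigaddset(), then
 * test membership with sigismember().  The helper name is hypothetical.
 */
static bool example_sig_in_shutdown_set(int sig)
{
	sigset_t set;

	siginitset(&set, sigmask(SIGINT));	/* start with SIGINT only */
	sigaddset(&set, SIGTERM);

	return sigismember(&set, sig);
}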
| 3 2 1 3 3 2 1 2 4 2 2 2 1 1 1 1 1 4 4 4 3 2 1 2 2 2 1 1 3 4 1 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 | // SPDX-License-Identifier: GPL-2.0 /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. 
* * The options processing module for ip.c * * Authors: A.N.Kuznetsov * */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/capability.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/uaccess.h> #include <linux/unaligned.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/icmp.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/route.h> #include <net/cipso_ipv4.h> #include <net/ip_fib.h> /* * Write options to IP header, record destination address to * source route option, address of outgoing interface * (we should already know it, so that this function is allowed be * called only after routing decision) and timestamp, * if we originate this datagram. * * daddr is real destination address, next hop is recorded in IP header. * saddr is address of outgoing interface. */ void ip_options_build(struct sk_buff *skb, struct ip_options *opt, __be32 daddr, struct rtable *rt) { unsigned char *iph = skb_network_header(skb); memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options)); memcpy(iph + sizeof(struct iphdr), opt->__data, opt->optlen); opt = &(IPCB(skb)->opt); if (opt->srr) memcpy(iph + opt->srr + iph[opt->srr + 1] - 4, &daddr, 4); if (opt->rr_needaddr) ip_rt_get_source(iph + opt->rr + iph[opt->rr + 2] - 5, skb, rt); if (opt->ts_needaddr) ip_rt_get_source(iph + opt->ts + iph[opt->ts + 2] - 9, skb, rt); if (opt->ts_needtime) { __be32 midtime; midtime = inet_current_timestamp(); memcpy(iph + opt->ts + iph[opt->ts + 2] - 5, &midtime, 4); } } /* * Provided (sopt, skb) points to received options, * build in dopt compiled option set appropriate for answering. * i.e. invert SRR option, copy anothers, * and grab room in RR/TS options. * * NOTE: dopt cannot point to skb. 
*/ int __ip_options_echo(struct net *net, struct ip_options *dopt, struct sk_buff *skb, const struct ip_options *sopt) { unsigned char *sptr, *dptr; int soffset, doffset; int optlen; memset(dopt, 0, sizeof(struct ip_options)); if (sopt->optlen == 0) return 0; sptr = skb_network_header(skb); dptr = dopt->__data; if (sopt->rr) { optlen = sptr[sopt->rr+1]; soffset = sptr[sopt->rr+2]; dopt->rr = dopt->optlen + sizeof(struct iphdr); memcpy(dptr, sptr+sopt->rr, optlen); if (sopt->rr_needaddr && soffset <= optlen) { if (soffset + 3 > optlen) return -EINVAL; dptr[2] = soffset + 4; dopt->rr_needaddr = 1; } dptr += optlen; dopt->optlen += optlen; } if (sopt->ts) { optlen = sptr[sopt->ts+1]; soffset = sptr[sopt->ts+2]; dopt->ts = dopt->optlen + sizeof(struct iphdr); memcpy(dptr, sptr+sopt->ts, optlen); if (soffset <= optlen) { if (sopt->ts_needaddr) { if (soffset + 3 > optlen) return -EINVAL; dopt->ts_needaddr = 1; soffset += 4; } if (sopt->ts_needtime) { if (soffset + 3 > optlen) return -EINVAL; if ((dptr[3]&0xF) != IPOPT_TS_PRESPEC) { dopt->ts_needtime = 1; soffset += 4; } else { dopt->ts_needtime = 0; if (soffset + 7 <= optlen) { __be32 addr; memcpy(&addr, dptr+soffset-1, 4); if (inet_addr_type(net, addr) != RTN_UNICAST) { dopt->ts_needtime = 1; soffset += 8; } } } } dptr[2] = soffset; } dptr += optlen; dopt->optlen += optlen; } if (sopt->srr) { unsigned char *start = sptr+sopt->srr; __be32 faddr; optlen = start[1]; soffset = start[2]; doffset = 0; if (soffset > optlen) soffset = optlen + 1; soffset -= 4; if (soffset > 3) { memcpy(&faddr, &start[soffset-1], 4); for (soffset -= 4, doffset = 4; soffset > 3; soffset -= 4, doffset += 4) memcpy(&dptr[doffset-1], &start[soffset-1], 4); /* * RFC1812 requires to fix illegal source routes. */ if (memcmp(&ip_hdr(skb)->saddr, &start[soffset + 3], 4) == 0) doffset -= 4; } if (doffset > 3) { dopt->faddr = faddr; dptr[0] = start[0]; dptr[1] = doffset+3; dptr[2] = 4; dptr += doffset+3; dopt->srr = dopt->optlen + sizeof(struct iphdr); dopt->optlen += doffset+3; dopt->is_strictroute = sopt->is_strictroute; } } if (sopt->cipso) { optlen = sptr[sopt->cipso+1]; dopt->cipso = dopt->optlen+sizeof(struct iphdr); memcpy(dptr, sptr+sopt->cipso, optlen); dptr += optlen; dopt->optlen += optlen; } while (dopt->optlen & 3) { *dptr++ = IPOPT_END; dopt->optlen++; } return 0; } /* * Options "fragmenting", just fill options not * allowed in fragments with NOOPs. * Simple and stupid 8), but the most efficient way. */ void ip_options_fragment(struct sk_buff *skb) { unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr); struct ip_options *opt = &(IPCB(skb)->opt); int l = opt->optlen; int optlen; while (l > 0) { switch (*optptr) { case IPOPT_END: return; case IPOPT_NOOP: l--; optptr++; continue; } optlen = optptr[1]; if (optlen < 2 || optlen > l) return; if (!IPOPT_COPIED(*optptr)) memset(optptr, IPOPT_NOOP, optlen); l -= optlen; optptr += optlen; } opt->ts = 0; opt->rr = 0; opt->rr_needaddr = 0; opt->ts_needaddr = 0; opt->ts_needtime = 0; } /* helper used by ip_options_compile() to call fib_compute_spec_dst() * at most one time. */ static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb) { if (*spec_dst == htonl(INADDR_ANY)) *spec_dst = fib_compute_spec_dst(skb); } /* * Verify options and fill pointers in struct options. * Caller should clear *opt, and set opt->data. * If opt == NULL, then skb->data should point to IP header. 
*/ int __ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb, __be32 *info) { __be32 spec_dst = htonl(INADDR_ANY); unsigned char *pp_ptr = NULL; struct rtable *rt = NULL; unsigned char *optptr; unsigned char *iph; int optlen, l; if (skb) { rt = skb_rtable(skb); optptr = (unsigned char *)&(ip_hdr(skb)[1]); } else optptr = opt->__data; iph = optptr - sizeof(struct iphdr); for (l = opt->optlen; l > 0; ) { switch (*optptr) { case IPOPT_END: for (optptr++, l--; l > 0; optptr++, l--) { if (*optptr != IPOPT_END) { *optptr = IPOPT_END; opt->is_changed = 1; } } goto eol; case IPOPT_NOOP: l--; optptr++; continue; } if (unlikely(l < 2)) { pp_ptr = optptr; goto error; } optlen = optptr[1]; if (optlen < 2 || optlen > l) { pp_ptr = optptr; goto error; } switch (*optptr) { case IPOPT_SSRR: case IPOPT_LSRR: if (optlen < 3) { pp_ptr = optptr + 1; goto error; } if (optptr[2] < 4) { pp_ptr = optptr + 2; goto error; } /* NB: cf RFC-1812 5.2.4.1 */ if (opt->srr) { pp_ptr = optptr; goto error; } if (!skb) { if (optptr[2] != 4 || optlen < 7 || ((optlen-3) & 3)) { pp_ptr = optptr + 1; goto error; } memcpy(&opt->faddr, &optptr[3], 4); if (optlen > 7) memmove(&optptr[3], &optptr[7], optlen-7); } opt->is_strictroute = (optptr[0] == IPOPT_SSRR); opt->srr = optptr - iph; break; case IPOPT_RR: if (opt->rr) { pp_ptr = optptr; goto error; } if (optlen < 3) { pp_ptr = optptr + 1; goto error; } if (optptr[2] < 4) { pp_ptr = optptr + 2; goto error; } if (optptr[2] <= optlen) { if (optptr[2]+3 > optlen) { pp_ptr = optptr + 2; goto error; } if (rt) { spec_dst_fill(&spec_dst, skb); memcpy(&optptr[optptr[2]-1], &spec_dst, 4); opt->is_changed = 1; } optptr[2] += 4; opt->rr_needaddr = 1; } opt->rr = optptr - iph; break; case IPOPT_TIMESTAMP: if (opt->ts) { pp_ptr = optptr; goto error; } if (optlen < 4) { pp_ptr = optptr + 1; goto error; } if (optptr[2] < 5) { pp_ptr = optptr + 2; goto error; } if (optptr[2] <= optlen) { unsigned char *timeptr = NULL; if (optptr[2]+3 > optlen) { pp_ptr = optptr + 2; goto error; } switch (optptr[3]&0xF) { case IPOPT_TS_TSONLY: if (skb) timeptr = &optptr[optptr[2]-1]; opt->ts_needtime = 1; optptr[2] += 4; break; case IPOPT_TS_TSANDADDR: if (optptr[2]+7 > optlen) { pp_ptr = optptr + 2; goto error; } if (rt) { spec_dst_fill(&spec_dst, skb); memcpy(&optptr[optptr[2]-1], &spec_dst, 4); timeptr = &optptr[optptr[2]+3]; } opt->ts_needaddr = 1; opt->ts_needtime = 1; optptr[2] += 8; break; case IPOPT_TS_PRESPEC: if (optptr[2]+7 > optlen) { pp_ptr = optptr + 2; goto error; } { __be32 addr; memcpy(&addr, &optptr[optptr[2]-1], 4); if (inet_addr_type(net, addr) == RTN_UNICAST) break; if (skb) timeptr = &optptr[optptr[2]+3]; } opt->ts_needtime = 1; optptr[2] += 8; break; default: if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) { pp_ptr = optptr + 3; goto error; } break; } if (timeptr) { __be32 midtime; midtime = inet_current_timestamp(); memcpy(timeptr, &midtime, 4); opt->is_changed = 1; } } else if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) { unsigned int overflow = optptr[3]>>4; if (overflow == 15) { pp_ptr = optptr + 3; goto error; } if (skb) { optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4); opt->is_changed = 1; } } opt->ts = optptr - iph; break; case IPOPT_RA: if (optlen < 4) { pp_ptr = optptr + 1; goto error; } if (optptr[2] == 0 && optptr[3] == 0) opt->router_alert = optptr - iph; break; case IPOPT_CIPSO: if ((!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) || opt->cipso) { pp_ptr = optptr; goto error; } opt->cipso = optptr - iph; if (cipso_v4_validate(skb, &optptr)) { pp_ptr = 
optptr; goto error; } break; case IPOPT_SEC: case IPOPT_SID: default: if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) { pp_ptr = optptr; goto error; } break; } l -= optlen; optptr += optlen; } eol: if (!pp_ptr) return 0; error: if (info) *info = htonl((pp_ptr-iph)<<24); return -EINVAL; } EXPORT_SYMBOL(__ip_options_compile); int ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb) { int ret; __be32 info; ret = __ip_options_compile(net, opt, skb, &info); if (ret != 0 && skb) icmp_send(skb, ICMP_PARAMETERPROB, 0, info); return ret; } EXPORT_SYMBOL(ip_options_compile); /* * Undo all the changes done by ip_options_compile(). */ void ip_options_undo(struct ip_options *opt) { if (opt->srr) { unsigned char *optptr = opt->__data + opt->srr - sizeof(struct iphdr); memmove(optptr + 7, optptr + 3, optptr[1] - 7); memcpy(optptr + 3, &opt->faddr, 4); } if (opt->rr_needaddr) { unsigned char *optptr = opt->__data + opt->rr - sizeof(struct iphdr); optptr[2] -= 4; memset(&optptr[optptr[2] - 1], 0, 4); } if (opt->ts) { unsigned char *optptr = opt->__data + opt->ts - sizeof(struct iphdr); if (opt->ts_needtime) { optptr[2] -= 4; memset(&optptr[optptr[2] - 1], 0, 4); if ((optptr[3] & 0xF) == IPOPT_TS_PRESPEC) optptr[2] -= 4; } if (opt->ts_needaddr) { optptr[2] -= 4; memset(&optptr[optptr[2] - 1], 0, 4); } } } int ip_options_get(struct net *net, struct ip_options_rcu **optp, sockptr_t data, int optlen) { struct ip_options_rcu *opt; opt = kzalloc(sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3), GFP_KERNEL); if (!opt) return -ENOMEM; if (optlen && copy_from_sockptr(opt->opt.__data, data, optlen)) { kfree(opt); return -EFAULT; } while (optlen & 3) opt->opt.__data[optlen++] = IPOPT_END; opt->opt.optlen = optlen; if (optlen && ip_options_compile(net, &opt->opt, NULL)) { kfree(opt); return -EINVAL; } kfree(*optp); *optp = opt; return 0; } void ip_forward_options(struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); unsigned char *optptr; struct rtable *rt = skb_rtable(skb); unsigned char *raw = skb_network_header(skb); if (opt->rr_needaddr) { optptr = (unsigned char *)raw + opt->rr; ip_rt_get_source(&optptr[optptr[2]-5], skb, rt); opt->is_changed = 1; } if (opt->srr_is_hit) { int srrptr, srrspace; optptr = raw + opt->srr; for ( srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4 ) { if (srrptr + 3 > srrspace) break; if (memcmp(&opt->nexthop, &optptr[srrptr-1], 4) == 0) break; } if (srrptr + 3 <= srrspace) { opt->is_changed = 1; ip_hdr(skb)->daddr = opt->nexthop; ip_rt_get_source(&optptr[srrptr-1], skb, rt); optptr[2] = srrptr+4; } else { net_crit_ratelimited("%s(): Argh! 
Destination lost!\n", __func__); } if (opt->ts_needaddr) { optptr = raw + opt->ts; ip_rt_get_source(&optptr[optptr[2]-9], skb, rt); opt->is_changed = 1; } } if (opt->is_changed) { opt->is_changed = 0; ip_send_check(ip_hdr(skb)); } } int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev) { struct ip_options *opt = &(IPCB(skb)->opt); int srrspace, srrptr; __be32 nexthop; struct iphdr *iph = ip_hdr(skb); unsigned char *optptr = skb_network_header(skb) + opt->srr; struct rtable *rt = skb_rtable(skb); struct rtable *rt2; unsigned long orefdst; int err; if (!rt) return 0; if (skb->pkt_type != PACKET_HOST) return -EINVAL; if (rt->rt_type == RTN_UNICAST) { if (!opt->is_strictroute) return 0; icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl(16<<24)); return -EINVAL; } if (rt->rt_type != RTN_LOCAL) return -EINVAL; for (srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) { if (srrptr + 3 > srrspace) { icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24)); return -EINVAL; } memcpy(&nexthop, &optptr[srrptr-1], 4); orefdst = skb_dstref_steal(skb); err = ip_route_input(skb, nexthop, iph->saddr, ip4h_dscp(iph), dev) ? -EINVAL : 0; rt2 = skb_rtable(skb); if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { skb_dst_drop(skb); skb_dstref_restore(skb, orefdst); return -EINVAL; } refdst_drop(orefdst); if (rt2->rt_type != RTN_LOCAL) break; /* Superfast 8) loopback forward */ iph->daddr = nexthop; opt->is_changed = 1; } if (srrptr <= srrspace) { opt->srr_is_hit = 1; opt->nexthop = nexthop; opt->is_changed = 1; } return 0; } EXPORT_SYMBOL(ip_options_rcv_srr); |
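/*
 * Illustrative sketch of the option-walk pattern shared by the functions
 * above: IP options are a sequence of (type, length) pairs in which
 * IPOPT_END terminates the area, IPOPT_NOOP is a one-byte filler, and any
 * other option carries its length in the second byte.  This helper merely
 * locates one option type; it is not part of the kernel API.
 */
static unsigned char *example_find_ip_option(unsigned char *optptr, int optlen,
					     unsigned char type)
{
	int l = optlen;

	while (l > 0) {
		if (*optptr == IPOPT_END)
			break;
		if (*optptr == IPOPT_NOOP) {
			l--;
			optptr++;
			continue;
		}
		if (optptr[1] < 2 || optptr[1] > l)
			break;			/* malformed length */
		if (*optptr == type)
			return optptr;
		l -= optptr[1];
		optptr += optptr[1];
	}
	return NULL;
}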
| 20 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_BRIDGE_NETFILTER_H #define __LINUX_BRIDGE_NETFILTER_H #include <uapi/linux/netfilter_bridge.h> #include <linux/skbuff.h> struct nf_bridge_frag_data { char mac[ETH_HLEN]; bool vlan_present; u16 vlan_tci; __be16 vlan_proto; }; #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb); static inline void br_drop_fake_rtable(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); if (dst && (dst->flags & DST_FAKE_RTABLE)) skb_dst_drop(skb); } static inline struct nf_bridge_info * nf_bridge_info_get(const struct sk_buff *skb) { return skb_ext_find(skb, SKB_EXT_BRIDGE_NF); } static inline bool nf_bridge_info_exists(const struct sk_buff *skb) { return skb_ext_exist(skb, SKB_EXT_BRIDGE_NF); } static inline int nf_bridge_get_physinif(const struct sk_buff *skb) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); if (!nf_bridge) return 0; return nf_bridge->physinif; } static inline int nf_bridge_get_physoutif(const struct sk_buff *skb) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); if (!nf_bridge) return 0; return nf_bridge->physoutdev ? nf_bridge->physoutdev->ifindex : 0; } static inline struct net_device * nf_bridge_get_physindev(const struct sk_buff *skb, struct net *net) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); return nf_bridge ? dev_get_by_index_rcu(net, nf_bridge->physinif) : NULL; } static inline struct net_device * nf_bridge_get_physoutdev(const struct sk_buff *skb) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); return nf_bridge ? nf_bridge->physoutdev : NULL; } static inline bool nf_bridge_in_prerouting(const struct sk_buff *skb) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); return nf_bridge && nf_bridge->in_prerouting; } #else #define br_drop_fake_rtable(skb) do { } while (0) static inline bool nf_bridge_in_prerouting(const struct sk_buff *skb) { return false; } #endif /* CONFIG_BRIDGE_NETFILTER */ #endif |
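/*
 * Illustrative sketch (hypothetical hook, not from this header): a
 * netfilter hook can use the accessors above to find out whether an skb
 * traversed the bridge and which physical port it arrived on.
 */
static unsigned int example_nf_hook(void *priv, struct sk_buff *skb,
				    const struct nf_hook_state *state)
{
	if (nf_bridge_info_exists(skb))
		pr_debug("bridged skb, physical in-ifindex %d\n",
			 nf_bridge_get_physinif(skb));

	return NF_ACCEPT;
}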
| 1 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 | // SPDX-License-Identifier: GPL-2.0-only /* * Input Multitouch Library * * Copyright (c) 2008-2010 Henrik Rydberg */ #include <linux/input/mt.h> #include <linux/export.h> #include <linux/slab.h> #include "input-core-private.h" #define TRKID_SGN ((TRKID_MAX + 1) >> 1) static void copy_abs(struct input_dev *dev, unsigned int dst, unsigned int src) { if (dev->absinfo && test_bit(src, dev->absbit)) { dev->absinfo[dst] = dev->absinfo[src]; dev->absinfo[dst].fuzz = 0; __set_bit(dst, dev->absbit); } } /** * input_mt_init_slots() - initialize MT input slots * @dev: input device supporting MT events and finger tracking * @num_slots: number of slots used by the device * @flags: mt tasks to handle in core * * This function allocates all necessary memory for MT slot handling * in the input device, prepares the ABS_MT_SLOT and * ABS_MT_TRACKING_ID events for use and sets up appropriate buffers. * Depending on the flags set, it also performs pointer emulation and * frame synchronization. * * May be called repeatedly. Returns -EINVAL if attempting to * reinitialize with a different number of slots. */ int input_mt_init_slots(struct input_dev *dev, unsigned int num_slots, unsigned int flags) { if (!num_slots) return 0; if (dev->mt) return dev->mt->num_slots != num_slots ? -EINVAL : 0; /* Arbitrary limit for avoiding too large memory allocation. 
*/ if (num_slots > 1024) return -EINVAL; struct input_mt *mt __free(kfree) = kzalloc_flex(*mt, slots, num_slots); if (!mt) return -ENOMEM; mt->num_slots = num_slots; mt->flags = flags; input_set_abs_params(dev, ABS_MT_SLOT, 0, num_slots - 1, 0, 0); input_set_abs_params(dev, ABS_MT_TRACKING_ID, 0, TRKID_MAX, 0, 0); if (flags & (INPUT_MT_POINTER | INPUT_MT_DIRECT)) { __set_bit(EV_KEY, dev->evbit); __set_bit(BTN_TOUCH, dev->keybit); copy_abs(dev, ABS_X, ABS_MT_POSITION_X); copy_abs(dev, ABS_Y, ABS_MT_POSITION_Y); copy_abs(dev, ABS_PRESSURE, ABS_MT_PRESSURE); } if (flags & INPUT_MT_POINTER) { __set_bit(BTN_TOOL_FINGER, dev->keybit); __set_bit(BTN_TOOL_DOUBLETAP, dev->keybit); if (num_slots >= 3) __set_bit(BTN_TOOL_TRIPLETAP, dev->keybit); if (num_slots >= 4) __set_bit(BTN_TOOL_QUADTAP, dev->keybit); if (num_slots >= 5) __set_bit(BTN_TOOL_QUINTTAP, dev->keybit); __set_bit(INPUT_PROP_POINTER, dev->propbit); } if (flags & INPUT_MT_DIRECT) __set_bit(INPUT_PROP_DIRECT, dev->propbit); if (flags & INPUT_MT_SEMI_MT) __set_bit(INPUT_PROP_SEMI_MT, dev->propbit); if (flags & INPUT_MT_TRACK) { unsigned int n2 = num_slots * num_slots; mt->red = kzalloc_objs(*mt->red, n2); if (!mt->red) return -ENOMEM; } /* Mark slots as 'inactive' */ for (unsigned int i = 0; i < num_slots; i++) input_mt_set_value(&mt->slots[i], ABS_MT_TRACKING_ID, -1); /* Mark slots as 'unused' */ mt->frame = 1; dev->mt = no_free_ptr(mt); return 0; } EXPORT_SYMBOL(input_mt_init_slots); /** * input_mt_destroy_slots() - frees the MT slots of the input device * @dev: input device with allocated MT slots * * This function is only needed in error path as the input core will * automatically free the MT slots when the device is destroyed. */ void input_mt_destroy_slots(struct input_dev *dev) { if (dev->mt) { kfree(dev->mt->red); kfree(dev->mt); } dev->mt = NULL; } EXPORT_SYMBOL(input_mt_destroy_slots); /** * input_mt_report_slot_state() - report contact state * @dev: input device with allocated MT slots * @tool_type: the tool type to use in this slot * @active: true if contact is active, false otherwise * * Reports a contact via ABS_MT_TRACKING_ID, and optionally * ABS_MT_TOOL_TYPE. If active is true and the slot is currently * inactive, or if the tool type is changed, a new tracking id is * assigned to the slot. The tool type is only reported if the * corresponding absbit field is set. * * Returns true if contact is active. */ bool input_mt_report_slot_state(struct input_dev *dev, unsigned int tool_type, bool active) { struct input_mt *mt = dev->mt; struct input_mt_slot *slot; int id; if (!mt) return false; slot = &mt->slots[mt->slot]; slot->frame = mt->frame; if (!active) { input_event(dev, EV_ABS, ABS_MT_TRACKING_ID, -1); return false; } id = input_mt_get_value(slot, ABS_MT_TRACKING_ID); if (id < 0) id = input_mt_new_trkid(mt); input_event(dev, EV_ABS, ABS_MT_TRACKING_ID, id); input_event(dev, EV_ABS, ABS_MT_TOOL_TYPE, tool_type); return true; } EXPORT_SYMBOL(input_mt_report_slot_state); /** * input_mt_report_finger_count() - report contact count * @dev: input device with allocated MT slots * @count: the number of contacts * * Reports the contact count via BTN_TOOL_FINGER, BTN_TOOL_DOUBLETAP, * BTN_TOOL_TRIPLETAP and BTN_TOOL_QUADTAP. * * The input core ensures only the KEY events already setup for * this device will produce output. 
*/ void input_mt_report_finger_count(struct input_dev *dev, int count) { input_event(dev, EV_KEY, BTN_TOOL_FINGER, count == 1); input_event(dev, EV_KEY, BTN_TOOL_DOUBLETAP, count == 2); input_event(dev, EV_KEY, BTN_TOOL_TRIPLETAP, count == 3); input_event(dev, EV_KEY, BTN_TOOL_QUADTAP, count == 4); input_event(dev, EV_KEY, BTN_TOOL_QUINTTAP, count == 5); } EXPORT_SYMBOL(input_mt_report_finger_count); /** * input_mt_report_pointer_emulation() - common pointer emulation * @dev: input device with allocated MT slots * @use_count: report number of active contacts as finger count * * Performs legacy pointer emulation via BTN_TOUCH, ABS_X, ABS_Y and * ABS_PRESSURE. Touchpad finger count is emulated if use_count is true. * * The input core ensures only the KEY and ABS axes already setup for * this device will produce output. */ void input_mt_report_pointer_emulation(struct input_dev *dev, bool use_count) { struct input_mt *mt = dev->mt; struct input_mt_slot *oldest; int oldid, count, i; int p, reported_p = 0; if (!mt) return; oldest = NULL; oldid = mt->trkid; count = 0; for (i = 0; i < mt->num_slots; ++i) { struct input_mt_slot *ps = &mt->slots[i]; int id = input_mt_get_value(ps, ABS_MT_TRACKING_ID); if (id < 0) continue; if ((id - oldid) & TRKID_SGN) { oldest = ps; oldid = id; } if (test_bit(ABS_MT_PRESSURE, dev->absbit)) { p = input_mt_get_value(ps, ABS_MT_PRESSURE); if (mt->flags & INPUT_MT_TOTAL_FORCE) reported_p += p; else if (oldid == id) reported_p = p; } count++; } input_event(dev, EV_KEY, BTN_TOUCH, count > 0); if (use_count) { if (count == 0 && !test_bit(ABS_MT_DISTANCE, dev->absbit) && test_bit(ABS_DISTANCE, dev->absbit) && input_abs_get_val(dev, ABS_DISTANCE) != 0) { /* * Force reporting BTN_TOOL_FINGER for devices that * only report general hover (and not per-contact * distance) when contact is in proximity but not * on the surface. */ count = 1; } input_mt_report_finger_count(dev, count); } if (oldest) { int x = input_mt_get_value(oldest, ABS_MT_POSITION_X); int y = input_mt_get_value(oldest, ABS_MT_POSITION_Y); input_event(dev, EV_ABS, ABS_X, x); input_event(dev, EV_ABS, ABS_Y, y); if (test_bit(ABS_MT_PRESSURE, dev->absbit)) input_event(dev, EV_ABS, ABS_PRESSURE, reported_p); } else { if (test_bit(ABS_MT_PRESSURE, dev->absbit)) input_event(dev, EV_ABS, ABS_PRESSURE, 0); } } EXPORT_SYMBOL(input_mt_report_pointer_emulation); static void __input_mt_drop_unused(struct input_dev *dev, struct input_mt *mt) { int i; lockdep_assert_held(&dev->event_lock); for (i = 0; i < mt->num_slots; i++) { if (input_mt_is_active(&mt->slots[i]) && !input_mt_is_used(mt, &mt->slots[i])) { input_handle_event(dev, EV_ABS, ABS_MT_SLOT, i); input_handle_event(dev, EV_ABS, ABS_MT_TRACKING_ID, -1); } } } /** * input_mt_drop_unused() - Inactivate slots not seen in this frame * @dev: input device with allocated MT slots * * Lift all slots not seen since the last call to this function. */ void input_mt_drop_unused(struct input_dev *dev) { struct input_mt *mt = dev->mt; if (mt) { guard(spinlock_irqsave)(&dev->event_lock); __input_mt_drop_unused(dev, mt); mt->frame++; } } EXPORT_SYMBOL(input_mt_drop_unused); /** * input_mt_release_slots() - Deactivate all slots * @dev: input device with allocated MT slots * * Lift all active slots. */ void input_mt_release_slots(struct input_dev *dev) { struct input_mt *mt = dev->mt; lockdep_assert_held(&dev->event_lock); if (mt) { /* This will effectively mark all slots unused. 
*/ mt->frame++; __input_mt_drop_unused(dev, mt); if (test_bit(ABS_PRESSURE, dev->absbit)) input_handle_event(dev, EV_ABS, ABS_PRESSURE, 0); mt->frame++; } } /** * input_mt_sync_frame() - synchronize mt frame * @dev: input device with allocated MT slots * * Close the frame and prepare the internal state for a new one. * Depending on the flags, marks unused slots as inactive and performs * pointer emulation. */ void input_mt_sync_frame(struct input_dev *dev) { struct input_mt *mt = dev->mt; bool use_count = false; if (!mt) return; if (mt->flags & INPUT_MT_DROP_UNUSED) { guard(spinlock_irqsave)(&dev->event_lock); __input_mt_drop_unused(dev, mt); } if ((mt->flags & INPUT_MT_POINTER) && !(mt->flags & INPUT_MT_SEMI_MT)) use_count = true; input_mt_report_pointer_emulation(dev, use_count); mt->frame++; } EXPORT_SYMBOL(input_mt_sync_frame); static int adjust_dual(int *begin, int step, int *end, int eq, int mu) { int f, *p, s, c; if (begin == end) return 0; f = *begin; p = begin + step; s = p == end ? f + 1 : *p; for (; p != end; p += step) { if (*p < f) { s = f; f = *p; } else if (*p < s) { s = *p; } } c = (f + s + 1) / 2; if (c == 0 || (c > mu && (!eq || mu > 0))) return 0; /* Improve convergence for positive matrices by penalizing overcovers */ if (s < 0 && mu <= 0) c *= 2; for (p = begin; p != end; p += step) *p -= c; return (c < s && s <= 0) || (f >= 0 && f < c); } static void find_reduced_matrix(int *w, int nr, int nc, int nrc, int mu) { int i, k, sum; for (k = 0; k < nrc; k++) { for (i = 0; i < nr; i++) adjust_dual(w + i, nr, w + i + nrc, nr <= nc, mu); sum = 0; for (i = 0; i < nrc; i += nr) sum += adjust_dual(w + i, 1, w + i + nr, nc <= nr, mu); if (!sum) break; } } static int input_mt_set_matrix(struct input_mt *mt, const struct input_mt_pos *pos, int num_pos, int mu) { const struct input_mt_pos *p; struct input_mt_slot *s; int *w = mt->red; int x, y; for (s = mt->slots; s != mt->slots + mt->num_slots; s++) { if (!input_mt_is_active(s)) continue; x = input_mt_get_value(s, ABS_MT_POSITION_X); y = input_mt_get_value(s, ABS_MT_POSITION_Y); for (p = pos; p != pos + num_pos; p++) { int dx = x - p->x, dy = y - p->y; *w++ = dx * dx + dy * dy - mu; } } return w - mt->red; } static void input_mt_set_slots(struct input_mt *mt, int *slots, int num_pos) { struct input_mt_slot *s; int *w = mt->red, j; for (j = 0; j != num_pos; j++) slots[j] = -1; for (s = mt->slots; s != mt->slots + mt->num_slots; s++) { if (!input_mt_is_active(s)) continue; for (j = 0; j != num_pos; j++) { if (w[j] < 0) { slots[j] = s - mt->slots; break; } } w += num_pos; } for (s = mt->slots; s != mt->slots + mt->num_slots; s++) { if (input_mt_is_active(s)) continue; for (j = 0; j != num_pos; j++) { if (slots[j] < 0) { slots[j] = s - mt->slots; break; } } } } /** * input_mt_assign_slots() - perform a best-match assignment * @dev: input device with allocated MT slots * @slots: the slot assignment to be filled * @pos: the position array to match * @num_pos: number of positions * @dmax: maximum ABS_MT_POSITION displacement (zero for infinite) * * Performs a best match against the current contacts and returns * the slot assignment list. New contacts are assigned to unused * slots. * * The assignments are balanced so that all coordinate displacements are * below the euclidian distance dmax. If no such assignment can be found, * some contacts are assigned to unused slots. * * Returns zero on success, or negative error in case of failure. 
*/
int input_mt_assign_slots(struct input_dev *dev, int *slots,
			  const struct input_mt_pos *pos, int num_pos,
			  int dmax)
{
	struct input_mt *mt = dev->mt;
	int mu = 2 * dmax * dmax;
	int nrc;

	if (!mt || !mt->red)
		return -ENXIO;
	if (num_pos > mt->num_slots)
		return -EINVAL;
	if (num_pos < 1)
		return 0;

	nrc = input_mt_set_matrix(mt, pos, num_pos, mu);
	find_reduced_matrix(mt->red, num_pos, nrc / num_pos, nrc, mu);
	input_mt_set_slots(mt, slots, num_pos);

	return 0;
}
EXPORT_SYMBOL(input_mt_assign_slots);

/**
 * input_mt_get_slot_by_key() - return slot matching key
 * @dev: input device with allocated MT slots
 * @key: the key of the sought slot
 *
 * Returns the slot of the given key, if it exists; otherwise assigns the
 * key to the first unused slot and returns its index.
 *
 * If no available slot can be found, -1 is returned.
 * Note that for this function to work properly, input_mt_sync_frame() has
 * to be called at each frame.
 */
int input_mt_get_slot_by_key(struct input_dev *dev, int key)
{
	struct input_mt *mt = dev->mt;
	struct input_mt_slot *s;

	if (!mt)
		return -1;

	for (s = mt->slots; s != mt->slots + mt->num_slots; s++)
		if (input_mt_is_active(s) && s->key == key)
			return s - mt->slots;

	for (s = mt->slots; s != mt->slots + mt->num_slots; s++)
		if (!input_mt_is_active(s) && !input_mt_is_used(mt, s)) {
			s->key = key;
			return s - mt->slots;
		}

	return -1;
}
EXPORT_SYMBOL(input_mt_get_slot_by_key);
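/*
 * Illustrative usage sketch (hypothetical, not part of input-mt.c): roughly
 * how a touch controller driver might drive the slot helpers above from its
 * report path. example_report_frame(), NUM_CONTACTS and DMAX are made-up
 * names; the input_mt_*() and input_*() calls are the real APIs from this
 * file and <linux/input/mt.h>. It assumes the device was set up in probe
 * with input_mt_init_slots(input, NUM_CONTACTS, INPUT_MT_POINTER |
 * INPUT_MT_TRACK | INPUT_MT_DROP_UNUSED) so the assignment matrix exists.
 */
#include <linux/input/mt.h>

#define NUM_CONTACTS	5
#define DMAX		64	/* max expected per-frame movement, device units */

static void example_report_frame(struct input_dev *input,
				 const struct input_mt_pos *pos, int count)
{
	int slots[NUM_CONTACTS];
	int i;

	/* Match this frame's contacts against the currently active slots. */
	input_mt_assign_slots(input, slots, pos, count, DMAX);

	for (i = 0; i < count; i++) {
		input_mt_slot(input, slots[i]);
		input_mt_report_slot_state(input, MT_TOOL_FINGER, true);
		input_report_abs(input, ABS_MT_POSITION_X, pos[i].x);
		input_report_abs(input, ABS_MT_POSITION_Y, pos[i].y);
	}

	/* Drop slots not seen this frame, emulate the pointer, close frame. */
	input_mt_sync_frame(input);
	input_sync(input);
}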
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/uio.h>
#include <linux/vmstat.h> #include <linux/sizes.h> #include <linux/mmu_notifier.h> #include <linux/iomap.h> #include <linux/rmap.h> #include <linux/pgalloc.h> #define CREATE_TRACE_POINTS #include <trace/events/fs_dax.h> /* We choose 4096 entries - same as per-zone page wait tables */ #define DAX_WAIT_TABLE_BITS 12 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) /* The 'colour' (ie low bits) within a PMD of a page offset. */ #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) #define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT) static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; static int __init init_dax_wait_table(void) { int i; for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++) init_waitqueue_head(wait_table + i); return 0; } fs_initcall(init_dax_wait_table); /* * DAX pagecache entries use XArray value entries so they can't be mistaken * for pages. We use one bit for locking, one bit for the entry size (PMD) * and two more to tell us if the entry is a zero page or an empty entry that * is just used for locking. In total four special bits. * * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem * block allocation. */ #define DAX_SHIFT (4) #define DAX_LOCKED (1UL << 0) #define DAX_PMD (1UL << 1) #define DAX_ZERO_PAGE (1UL << 2) #define DAX_EMPTY (1UL << 3) static unsigned long dax_to_pfn(void *entry) { return xa_to_value(entry) >> DAX_SHIFT; } static struct folio *dax_to_folio(void *entry) { return page_folio(pfn_to_page(dax_to_pfn(entry))); } static void *dax_make_entry(unsigned long pfn, unsigned long flags) { return xa_mk_value(flags | (pfn << DAX_SHIFT)); } static bool dax_is_locked(void *entry) { return xa_to_value(entry) & DAX_LOCKED; } static unsigned int dax_entry_order(void *entry) { if (xa_to_value(entry) & DAX_PMD) return PMD_ORDER; return 0; } static unsigned long dax_is_pmd_entry(void *entry) { return xa_to_value(entry) & DAX_PMD; } static bool dax_is_pte_entry(void *entry) { return !(xa_to_value(entry) & DAX_PMD); } static int dax_is_zero_entry(void *entry) { return xa_to_value(entry) & DAX_ZERO_PAGE; } static int dax_is_empty_entry(void *entry) { return xa_to_value(entry) & DAX_EMPTY; } /* * true if the entry that was found is of a smaller order than the entry * we were looking for */ static bool dax_is_conflict(void *entry) { return entry == XA_RETRY_ENTRY; } /* * DAX page cache entry locking */ struct exceptional_entry_key { struct xarray *xa; pgoff_t entry_start; }; struct wait_exceptional_entry_queue { wait_queue_entry_t wait; struct exceptional_entry_key key; }; /** * enum dax_wake_mode: waitqueue wakeup behaviour * @WAKE_ALL: wake all waiters in the waitqueue * @WAKE_NEXT: wake only the first waiter in the waitqueue */ enum dax_wake_mode { WAKE_ALL, WAKE_NEXT, }; static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas, void *entry, struct exceptional_entry_key *key) { unsigned long hash; unsigned long index = xas->xa_index; /* * If 'entry' is a PMD, align the 'index' that we use for the wait * queue to the start of that PMD. This ensures that all offsets in * the range covered by the PMD map to the same bit lock. 
*/ if (dax_is_pmd_entry(entry)) index &= ~PG_PMD_COLOUR; key->xa = xas->xa; key->entry_start = index; hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS); return wait_table + hash; } static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode, int sync, void *keyp) { struct exceptional_entry_key *key = keyp; struct wait_exceptional_entry_queue *ewait = container_of(wait, struct wait_exceptional_entry_queue, wait); if (key->xa != ewait->key.xa || key->entry_start != ewait->key.entry_start) return 0; return autoremove_wake_function(wait, mode, sync, NULL); } /* * @entry may no longer be the entry at the index in the mapping. * The important information it's conveying is whether the entry at * this index used to be a PMD entry. */ static void dax_wake_entry(struct xa_state *xas, void *entry, enum dax_wake_mode mode) { struct exceptional_entry_key key; wait_queue_head_t *wq; wq = dax_entry_waitqueue(xas, entry, &key); /* * Checking for locked entry and prepare_to_wait_exclusive() happens * under the i_pages lock, ditto for entry handling in our callers. * So at this point all tasks that could have seen our entry locked * must be in the waitqueue and the following check will see them. */ if (waitqueue_active(wq)) __wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key); } /* * Look up entry in page cache, wait for it to become unlocked if it * is a DAX entry and return it. The caller must subsequently call * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry() * if it did. The entry returned may have a larger order than @order. * If @order is larger than the order of the entry found in i_pages, this * function returns a dax_is_conflict entry. * * Must be called with the i_pages lock held. */ static void *get_next_unlocked_entry(struct xa_state *xas, unsigned int order) { void *entry; struct wait_exceptional_entry_queue ewait; wait_queue_head_t *wq; init_wait(&ewait.wait); ewait.wait.func = wake_exceptional_entry_func; for (;;) { entry = xas_find_conflict(xas); if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) return entry; if (dax_entry_order(entry) < order) return XA_RETRY_ENTRY; if (!dax_is_locked(entry)) return entry; wq = dax_entry_waitqueue(xas, entry, &ewait.key); prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); xas_unlock_irq(xas); xas_reset(xas); schedule(); finish_wait(wq, &ewait.wait); xas_lock_irq(xas); } } /* * Wait for the given entry to become unlocked. Caller must hold the i_pages * lock and call either put_unlocked_entry() if it did not lock the entry or * dax_unlock_entry() if it did. Returns an unlocked entry if still present. */ static void *wait_entry_unlocked_exclusive(struct xa_state *xas, void *entry) { struct wait_exceptional_entry_queue ewait; wait_queue_head_t *wq; init_wait(&ewait.wait); ewait.wait.func = wake_exceptional_entry_func; while (unlikely(dax_is_locked(entry))) { wq = dax_entry_waitqueue(xas, entry, &ewait.key); prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); xas_reset(xas); xas_unlock_irq(xas); schedule(); finish_wait(wq, &ewait.wait); xas_lock_irq(xas); entry = xas_load(xas); } if (xa_is_internal(entry)) return NULL; return entry; } /* * The only thing keeping the address space around is the i_pages lock * (it's cycled in clear_inode() after removing the entries from i_pages) * After we call xas_unlock_irq(), we cannot touch xas->xa. 
*/ static void wait_entry_unlocked(struct xa_state *xas, void *entry) { struct wait_exceptional_entry_queue ewait; wait_queue_head_t *wq; init_wait(&ewait.wait); ewait.wait.func = wake_exceptional_entry_func; wq = dax_entry_waitqueue(xas, entry, &ewait.key); /* * Unlike get_next_unlocked_entry() there is no guarantee that this * path ever successfully retrieves an unlocked entry before an * inode dies. Perform a non-exclusive wait in case this path * never successfully performs its own wake up. */ prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); xas_unlock_irq(xas); schedule(); finish_wait(wq, &ewait.wait); } static void put_unlocked_entry(struct xa_state *xas, void *entry, enum dax_wake_mode mode) { if (entry && !dax_is_conflict(entry)) dax_wake_entry(xas, entry, mode); } /* * We used the xa_state to get the entry, but then we locked the entry and * dropped the xa_lock, so we know the xa_state is stale and must be reset * before use. */ static void dax_unlock_entry(struct xa_state *xas, void *entry) { void *old; BUG_ON(dax_is_locked(entry)); xas_reset(xas); xas_lock_irq(xas); old = xas_store(xas, entry); xas_unlock_irq(xas); BUG_ON(!dax_is_locked(old)); dax_wake_entry(xas, entry, WAKE_NEXT); } /* * Return: The entry stored at this location before it was locked. */ static void *dax_lock_entry(struct xa_state *xas, void *entry) { unsigned long v = xa_to_value(entry); return xas_store(xas, xa_mk_value(v | DAX_LOCKED)); } static unsigned long dax_entry_size(void *entry) { if (dax_is_zero_entry(entry)) return 0; else if (dax_is_empty_entry(entry)) return 0; else if (dax_is_pmd_entry(entry)) return PMD_SIZE; else return PAGE_SIZE; } /* * A DAX folio is considered shared if it has no mapping set and ->share (which * shares the ->index field) is non-zero. Note this may return false even if the * page is shared between multiple files but has not yet actually been mapped * into multiple address spaces. */ static inline bool dax_folio_is_shared(struct folio *folio) { return !folio->mapping && folio->share; } /* * When it is called by dax_insert_entry(), the shared flag will indicate * whether this entry is shared by multiple files. If the page has not * previously been associated with any mappings the ->mapping and ->index * fields will be set. If it has already been associated with a mapping * the mapping will be cleared and the share count set. It's then up to * reverse map users like memory_failure() to call back into the filesystem to * recover ->mapping and ->index information. For example by implementing * dax_holder_operations. */ static void dax_folio_make_shared(struct folio *folio) { /* * folio is not currently shared so mark it as shared by clearing * folio->mapping. */ folio->mapping = NULL; /* * folio has previously been mapped into one address space so set the * share count. */ folio->share = 1; } static inline unsigned long dax_folio_put(struct folio *folio) { unsigned long ref; int order, i; if (!dax_folio_is_shared(folio)) ref = 0; else ref = --folio->share; if (ref) return ref; folio->mapping = NULL; order = folio_order(folio); if (!order) return 0; folio_reset_order(folio); for (i = 0; i < (1UL << order); i++) { struct dev_pagemap *pgmap = page_pgmap(&folio->page); struct page *page = folio_page(folio, i); struct folio *new_folio = (struct folio *)page; ClearPageHead(page); clear_compound_head(page); new_folio->mapping = NULL; /* * Reset pgmap which was over-written by * prep_compound_page(). 
*/ new_folio->pgmap = pgmap; new_folio->share = 0; WARN_ON_ONCE(folio_ref_count(new_folio)); } return ref; } static void dax_folio_init(void *entry) { struct folio *folio = dax_to_folio(entry); int order = dax_entry_order(entry); /* * Folio should have been split back to order-0 pages in * dax_folio_put() when they were removed from their * final mapping. */ WARN_ON_ONCE(folio_order(folio)); if (order > 0) { prep_compound_page(&folio->page, order); if (order > 1) INIT_LIST_HEAD(&folio->_deferred_list); WARN_ON_ONCE(folio_ref_count(folio)); } } static void dax_associate_entry(void *entry, struct address_space *mapping, struct vm_area_struct *vma, unsigned long address, bool shared) { unsigned long size = dax_entry_size(entry), index; struct folio *folio = dax_to_folio(entry); if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) return; index = linear_page_index(vma, address & ~(size - 1)); if (shared && (folio->mapping || dax_folio_is_shared(folio))) { if (folio->mapping) dax_folio_make_shared(folio); WARN_ON_ONCE(!folio->share); WARN_ON_ONCE(dax_entry_order(entry) != folio_order(folio)); folio->share++; } else { WARN_ON_ONCE(folio->mapping); dax_folio_init(entry); folio = dax_to_folio(entry); folio->mapping = mapping; folio->index = index; } } static void dax_disassociate_entry(void *entry, struct address_space *mapping, bool trunc) { struct folio *folio = dax_to_folio(entry); if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) return; dax_folio_put(folio); } static struct page *dax_busy_page(void *entry) { struct folio *folio = dax_to_folio(entry); if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) return NULL; if (folio_ref_count(folio) - folio_mapcount(folio)) return &folio->page; else return NULL; } /** * dax_lock_folio - Lock the DAX entry corresponding to a folio * @folio: The folio whose entry we want to lock * * Context: Process context. * Return: A cookie to pass to dax_unlock_folio() or 0 if the entry could * not be locked. */ dax_entry_t dax_lock_folio(struct folio *folio) { XA_STATE(xas, NULL, 0); void *entry; /* Ensure folio->mapping isn't freed while we look at it */ rcu_read_lock(); for (;;) { struct address_space *mapping = READ_ONCE(folio->mapping); entry = NULL; if (!mapping || !dax_mapping(mapping)) break; /* * In the device-dax case there's no need to lock, a * struct dev_pagemap pin is sufficient to keep the * inode alive, and we assume we have dev_pagemap pin * otherwise we would not have a valid pfn_to_page() * translation. */ entry = (void *)~0UL; if (S_ISCHR(mapping->host->i_mode)) break; xas.xa = &mapping->i_pages; xas_lock_irq(&xas); if (mapping != folio->mapping) { xas_unlock_irq(&xas); continue; } xas_set(&xas, folio->index); entry = xas_load(&xas); if (dax_is_locked(entry)) { rcu_read_unlock(); wait_entry_unlocked(&xas, entry); rcu_read_lock(); continue; } dax_lock_entry(&xas, entry); xas_unlock_irq(&xas); break; } rcu_read_unlock(); return (dax_entry_t)entry; } void dax_unlock_folio(struct folio *folio, dax_entry_t cookie) { struct address_space *mapping = folio->mapping; XA_STATE(xas, &mapping->i_pages, folio->index); if (S_ISCHR(mapping->host->i_mode)) return; dax_unlock_entry(&xas, (void *)cookie); } /* * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping * @mapping: the file's mapping whose entry we want to lock * @index: the offset within this file * @page: output the dax page corresponding to this dax entry * * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry * could not be locked. 
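 *
 * For example, a reverse-map user such as the memory-failure path would
 * typically pin the entry across its walk of the mapping and drop it again
 * afterwards (sketch only, error handling elided):
 *
 *	cookie = dax_lock_mapping_entry(mapping, index, &page);
 *	if (!cookie)
 *		return -EBUSY;
 *	...act on the page and its mappings...
 *	dax_unlock_mapping_entry(mapping, index, cookie);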
*/ dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index, struct page **page) { XA_STATE(xas, NULL, 0); void *entry; rcu_read_lock(); for (;;) { entry = NULL; if (!dax_mapping(mapping)) break; xas.xa = &mapping->i_pages; xas_lock_irq(&xas); xas_set(&xas, index); entry = xas_load(&xas); if (dax_is_locked(entry)) { rcu_read_unlock(); wait_entry_unlocked(&xas, entry); rcu_read_lock(); continue; } if (!entry || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { /* * Because we are looking for entry from file's mapping * and index, so the entry may not be inserted for now, * or even a zero/empty entry. We don't think this is * an error case. So, return a special value and do * not output @page. */ entry = (void *)~0UL; } else { *page = pfn_to_page(dax_to_pfn(entry)); dax_lock_entry(&xas, entry); } xas_unlock_irq(&xas); break; } rcu_read_unlock(); return (dax_entry_t)entry; } void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index, dax_entry_t cookie) { XA_STATE(xas, &mapping->i_pages, index); if (cookie == ~0UL) return; dax_unlock_entry(&xas, (void *)cookie); } /* * Find page cache entry at given index. If it is a DAX entry, return it * with the entry locked. If the page cache doesn't contain an entry at * that index, add a locked empty entry. * * When requesting an entry with size DAX_PMD, grab_mapping_entry() will * either return that locked entry or will return VM_FAULT_FALLBACK. * This will happen if there are any PTE entries within the PMD range * that we are requesting. * * We always favor PTE entries over PMD entries. There isn't a flow where we * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD * insertion will fail if it finds any PTE entries already in the tree, and a * PTE insertion will cause an existing PMD entry to be unmapped and * downgraded to PTE entries. This happens for both PMD zero pages as * well as PMD empty entries. * * The exception to this downgrade path is for PMD entries that have * real storage backing them. We will leave these real PMD entries in * the tree, and PTE writes will simply dirty the entire PMD entry. * * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For * persistent memory the benefit is doubtful. We can add that later if we can * show it helps. * * On error, this function does not return an ERR_PTR. Instead it returns * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values * overlap with xarray value entries. */ static void *grab_mapping_entry(struct xa_state *xas, struct address_space *mapping, unsigned int order) { unsigned long index = xas->xa_index; bool pmd_downgrade; /* splitting PMD entry into PTE entries? */ void *entry; retry: pmd_downgrade = false; xas_lock_irq(xas); entry = get_next_unlocked_entry(xas, order); if (entry) { if (dax_is_conflict(entry)) goto fallback; if (!xa_is_value(entry)) { xas_set_err(xas, -EIO); goto out_unlock; } if (order == 0) { if (dax_is_pmd_entry(entry) && (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))) { pmd_downgrade = true; } } } if (pmd_downgrade) { /* * Make sure 'entry' remains valid while we drop * the i_pages lock. */ dax_lock_entry(xas, entry); /* * Besides huge zero pages the only other thing that gets * downgraded are empty entries which don't need to be * unmapped. 
*/ if (dax_is_zero_entry(entry)) { xas_unlock_irq(xas); unmap_mapping_pages(mapping, xas->xa_index & ~PG_PMD_COLOUR, PG_PMD_NR, false); xas_reset(xas); xas_lock_irq(xas); } dax_disassociate_entry(entry, mapping, false); xas_store(xas, NULL); /* undo the PMD join */ dax_wake_entry(xas, entry, WAKE_ALL); mapping->nrpages -= PG_PMD_NR; entry = NULL; xas_set(xas, index); } if (entry) { dax_lock_entry(xas, entry); } else { unsigned long flags = DAX_EMPTY; if (order > 0) flags |= DAX_PMD; entry = dax_make_entry(0, flags); dax_lock_entry(xas, entry); if (xas_error(xas)) goto out_unlock; mapping->nrpages += 1UL << order; } out_unlock: xas_unlock_irq(xas); if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM)) goto retry; if (xas->xa_node == XA_ERROR(-ENOMEM)) return xa_mk_internal(VM_FAULT_OOM); if (xas_error(xas)) return xa_mk_internal(VM_FAULT_SIGBUS); return entry; fallback: xas_unlock_irq(xas); return xa_mk_internal(VM_FAULT_FALLBACK); } /** * dax_layout_busy_page_range - find first pinned page in @mapping * @mapping: address space to scan for a page with ref count > 1 * @start: Starting offset. Page containing 'start' is included. * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX, * pages from 'start' till the end of file are included. * * DAX requires ZONE_DEVICE mapped pages. These pages are never * 'onlined' to the page allocator so they are considered idle when * page->count == 1. A filesystem uses this interface to determine if * any page in the mapping is busy, i.e. for DMA, or other * get_user_pages() usages. * * It is expected that the filesystem is holding locks to block the * establishment of new mappings in this address_space. I.e. it expects * to be able to run unmap_mapping_range() and subsequently not race * mapping_mapped() becoming true. */ struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end) { void *entry; unsigned int scanned = 0; struct page *page = NULL; pgoff_t start_idx = start >> PAGE_SHIFT; pgoff_t end_idx; XA_STATE(xas, &mapping->i_pages, start_idx); if (!dax_mapping(mapping)) return NULL; /* If end == LLONG_MAX, all pages from start to till end of file */ if (end == LLONG_MAX) end_idx = ULONG_MAX; else end_idx = end >> PAGE_SHIFT; /* * If we race get_user_pages_fast() here either we'll see the * elevated page count in the iteration and wait, or * get_user_pages_fast() will see that the page it took a reference * against is no longer mapped in the page tables and bail to the * get_user_pages() slow path. The slow path is protected by * pte_lock() and pmd_lock(). New references are not taken without * holding those locks, and unmap_mapping_pages() will not zero the * pte or pmd without holding the respective lock, so we are * guaranteed to either see new references or prevent new * references from being established. 
*/ unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0); xas_lock_irq(&xas); xas_for_each(&xas, entry, end_idx) { if (WARN_ON_ONCE(!xa_is_value(entry))) continue; entry = wait_entry_unlocked_exclusive(&xas, entry); if (entry) page = dax_busy_page(entry); put_unlocked_entry(&xas, entry, WAKE_NEXT); if (page) break; if (++scanned % XA_CHECK_SCHED) continue; xas_pause(&xas); xas_unlock_irq(&xas); cond_resched(); xas_lock_irq(&xas); } xas_unlock_irq(&xas); return page; } EXPORT_SYMBOL_GPL(dax_layout_busy_page_range); struct page *dax_layout_busy_page(struct address_space *mapping) { return dax_layout_busy_page_range(mapping, 0, LLONG_MAX); } EXPORT_SYMBOL_GPL(dax_layout_busy_page); static int __dax_invalidate_entry(struct address_space *mapping, pgoff_t index, bool trunc) { XA_STATE(xas, &mapping->i_pages, index); int ret = 0; void *entry; xas_lock_irq(&xas); entry = get_next_unlocked_entry(&xas, 0); if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) goto out; if (!trunc && (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) || xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE))) goto out; dax_disassociate_entry(entry, mapping, trunc); xas_store(&xas, NULL); mapping->nrpages -= 1UL << dax_entry_order(entry); ret = 1; out: put_unlocked_entry(&xas, entry, WAKE_ALL); xas_unlock_irq(&xas); return ret; } static int __dax_clear_dirty_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { XA_STATE(xas, &mapping->i_pages, start); unsigned int scanned = 0; void *entry; xas_lock_irq(&xas); xas_for_each(&xas, entry, end) { entry = wait_entry_unlocked_exclusive(&xas, entry); if (!entry) continue; xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); put_unlocked_entry(&xas, entry, WAKE_NEXT); if (++scanned % XA_CHECK_SCHED) continue; xas_pause(&xas); xas_unlock_irq(&xas); cond_resched(); xas_lock_irq(&xas); } xas_unlock_irq(&xas); return 0; } /* * Delete DAX entry at @index from @mapping. Wait for it * to be unlocked before deleting it. */ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) { int ret = __dax_invalidate_entry(mapping, index, true); /* * This gets called from truncate / punch_hole path. As such, the caller * must hold locks protecting against concurrent modifications of the * page cache (usually fs-private i_mmap_sem for writing). Since the * caller has seen a DAX entry for this index, we better find it * at that index as well... 
*/ WARN_ON_ONCE(!ret); return ret; } void dax_delete_mapping_range(struct address_space *mapping, loff_t start, loff_t end) { void *entry; pgoff_t start_idx = start >> PAGE_SHIFT; pgoff_t end_idx; XA_STATE(xas, &mapping->i_pages, start_idx); /* If end == LLONG_MAX, all pages from start to till end of file */ if (end == LLONG_MAX) end_idx = ULONG_MAX; else end_idx = end >> PAGE_SHIFT; xas_lock_irq(&xas); xas_for_each(&xas, entry, end_idx) { if (!xa_is_value(entry)) continue; entry = wait_entry_unlocked_exclusive(&xas, entry); if (!entry) continue; dax_disassociate_entry(entry, mapping, true); xas_store(&xas, NULL); mapping->nrpages -= 1UL << dax_entry_order(entry); put_unlocked_entry(&xas, entry, WAKE_ALL); } xas_unlock_irq(&xas); } EXPORT_SYMBOL_GPL(dax_delete_mapping_range); static int wait_page_idle(struct page *page, void (cb)(struct inode *), struct inode *inode) { return ___wait_var_event(page, dax_page_is_idle(page), TASK_INTERRUPTIBLE, 0, 0, cb(inode)); } static void wait_page_idle_uninterruptible(struct page *page, struct inode *inode) { ___wait_var_event(page, dax_page_is_idle(page), TASK_UNINTERRUPTIBLE, 0, 0, schedule()); } /* * Unmaps the inode and waits for any DMA to complete prior to deleting the * DAX mapping entries for the range. * * For NOWAIT behavior, pass @cb as NULL to early-exit on first found * busy page */ int dax_break_layout(struct inode *inode, loff_t start, loff_t end, void (cb)(struct inode *)) { struct page *page; int error = 0; if (!dax_mapping(inode->i_mapping)) return 0; do { page = dax_layout_busy_page_range(inode->i_mapping, start, end); if (!page) break; if (!cb) { error = -ERESTARTSYS; break; } error = wait_page_idle(page, cb, inode); } while (error == 0); if (!page) dax_delete_mapping_range(inode->i_mapping, start, end); return error; } EXPORT_SYMBOL_GPL(dax_break_layout); void dax_break_layout_final(struct inode *inode) { struct page *page; if (!dax_mapping(inode->i_mapping)) return; do { page = dax_layout_busy_page_range(inode->i_mapping, 0, LLONG_MAX); if (!page) break; wait_page_idle_uninterruptible(page, inode); } while (true); if (!page) dax_delete_mapping_range(inode->i_mapping, 0, LLONG_MAX); } EXPORT_SYMBOL_GPL(dax_break_layout_final); /* * Invalidate DAX entry if it is clean. */ int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index) { return __dax_invalidate_entry(mapping, index, false); } static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos) { return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset); } static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter) { pgoff_t pgoff = dax_iomap_pgoff(&iter->iomap, iter->pos); void *vto, *kaddr; long rc; int id; id = dax_read_lock(); rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL); if (rc < 0) { dax_read_unlock(id); return rc; } vto = kmap_atomic(vmf->cow_page); copy_user_page(vto, kaddr, vmf->address, vmf->cow_page); kunmap_atomic(vto); dax_read_unlock(id); return 0; } /* * MAP_SYNC on a dax mapping guarantees dirty metadata is * flushed on write-faults (non-cow), but not read-faults. */ static bool dax_fault_is_synchronous(const struct iomap_iter *iter, struct vm_area_struct *vma) { return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) && (iter->iomap.flags & IOMAP_F_DIRTY); } /* * By this point grab_mapping_entry() has ensured that we have a locked entry * of the appropriate size so we don't have to worry about downgrading PMDs to * PTEs. 
If we happen to be trying to insert a PTE and there is a PMD * already in the tree, we will skip the insertion and just dirty the PMD as * appropriate. */ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, const struct iomap_iter *iter, void *entry, unsigned long pfn, unsigned long flags) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; void *new_entry = dax_make_entry(pfn, flags); bool write = iter->flags & IOMAP_WRITE; bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma); bool shared = iter->iomap.flags & IOMAP_F_SHARED; if (dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) { unsigned long index = xas->xa_index; /* we are replacing a zero page with block mapping */ if (dax_is_pmd_entry(entry)) unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, PG_PMD_NR, false); else /* pte entry */ unmap_mapping_pages(mapping, index, 1, false); } xas_reset(xas); xas_lock_irq(xas); if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { void *old; dax_disassociate_entry(entry, mapping, false); dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address, shared); /* * Only swap our new entry into the page cache if the current * entry is a zero page or an empty entry. If a normal PTE or * PMD entry is already in the cache, we leave it alone. This * means that if we are trying to insert a PTE and the * existing entry is a PMD, we will just leave the PMD in the * tree and dirty it if necessary. */ old = dax_lock_entry(xas, new_entry); WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) | DAX_LOCKED)); entry = new_entry; } else { xas_load(xas); /* Walk the xa_state */ } if (dirty) xas_set_mark(xas, PAGECACHE_TAG_DIRTY); if (write && shared) xas_set_mark(xas, PAGECACHE_TAG_TOWRITE); xas_unlock_irq(xas); return entry; } static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, struct address_space *mapping, void *entry) { unsigned long pfn, index, count, end; long ret = 0; struct vm_area_struct *vma; /* * A page got tagged dirty in DAX mapping? Something is seriously * wrong. */ if (WARN_ON(!xa_is_value(entry))) return -EIO; if (unlikely(dax_is_locked(entry))) { void *old_entry = entry; entry = get_next_unlocked_entry(xas, 0); /* Entry got punched out / reallocated? */ if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) goto put_unlocked; /* * Entry got reallocated elsewhere? No need to writeback. * We have to compare pfns as we must not bail out due to * difference in lockbit or entry type. */ if (dax_to_pfn(old_entry) != dax_to_pfn(entry)) goto put_unlocked; if (WARN_ON_ONCE(dax_is_empty_entry(entry) || dax_is_zero_entry(entry))) { ret = -EIO; goto put_unlocked; } /* Another fsync thread may have already done this entry */ if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE)) goto put_unlocked; } /* Lock the entry to serialize with page faults */ dax_lock_entry(xas, entry); /* * We can clear the tag now but we have to be careful so that concurrent * dax_writeback_one() calls for the same index cannot finish before we * actually flush the caches. This is achieved as the calls will look * at the entry only under the i_pages lock and once they do that * they will see the entry locked and wait for it to unlock. */ xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE); xas_unlock_irq(xas); /* * If dax_writeback_mapping_range() was given a wbc->range_start * in the middle of a PMD, the 'index' we use needs to be * aligned to the start of the PMD. 
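 * (Illustrative numbers, assuming 4k pages and 2M PMD entries: count below
 * is then 512, so an xa_index of 0x2ff rounds down to index 0x200 and end
 * becomes 0x3ff, i.e. the whole PMD-sized range is flushed together.)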
* This allows us to flush for PMD_SIZE and not have to worry about * partial PMD writebacks. */ pfn = dax_to_pfn(entry); count = 1UL << dax_entry_order(entry); index = xas->xa_index & ~(count - 1); end = index + count - 1; /* Walk all mappings of a given index of a file and writeprotect them */ i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) { pfn_mkclean_range(pfn, count, index, vma); cond_resched(); } i_mmap_unlock_read(mapping); dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE); /* * After we have flushed the cache, we can clear the dirty tag. There * cannot be new dirty data in the pfn after the flush has completed as * the pfn mappings are writeprotected and fault waits for mapping * entry lock. */ xas_reset(xas); xas_lock_irq(xas); xas_store(xas, entry); xas_clear_mark(xas, PAGECACHE_TAG_DIRTY); dax_wake_entry(xas, entry, WAKE_NEXT); trace_dax_writeback_one(mapping->host, index, count); return ret; put_unlocked: put_unlocked_entry(xas, entry, WAKE_NEXT); return ret; } /* * Flush the mapping to the persistent domain within the byte range of [start, * end]. This is required by data integrity operations to ensure file data is * on persistent storage prior to completion of the operation. */ int dax_writeback_mapping_range(struct address_space *mapping, struct dax_device *dax_dev, struct writeback_control *wbc) { XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT); struct inode *inode = mapping->host; pgoff_t end_index = wbc->range_end >> PAGE_SHIFT; void *entry; int ret = 0; unsigned int scanned = 0; if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) return -EIO; if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL) return 0; trace_dax_writeback_range(inode, xas.xa_index, end_index); tag_pages_for_writeback(mapping, xas.xa_index, end_index); xas_lock_irq(&xas); xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) { ret = dax_writeback_one(&xas, dax_dev, mapping, entry); if (ret < 0) { mapping_set_error(mapping, ret); break; } if (++scanned % XA_CHECK_SCHED) continue; xas_pause(&xas); xas_unlock_irq(&xas); cond_resched(); xas_lock_irq(&xas); } xas_unlock_irq(&xas); trace_dax_writeback_range_done(inode, xas.xa_index, end_index); return ret; } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos, size_t size, void **kaddr, unsigned long *pfnp) { pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); int id, rc = 0; long length; id = dax_read_lock(); length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size), DAX_ACCESS, kaddr, pfnp); if (length < 0) { rc = length; goto out; } if (!pfnp) goto out_check_addr; rc = -EINVAL; if (PFN_PHYS(length) < size) goto out; if (*pfnp & (PHYS_PFN(size)-1)) goto out; rc = 0; out_check_addr: if (!kaddr) goto out; if (!*kaddr) rc = -EFAULT; out: dax_read_unlock(id); return rc; } /** * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page * by copying the data before and after the range to be written. * @pos: address to do copy from. * @length: size of copy operation. * @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE) * @srcmap: iomap srcmap * @daddr: destination address to copy to. * * This can be called from two places. Either during DAX write fault (page * aligned), to copy the length size data to daddr. Or, while doing normal DAX * write operation, dax_iomap_iter() might call this to do the copy of either * start or end unaligned address. 
In the latter case the rest of the copy of * aligned ranges is taken care by dax_iomap_iter() itself. * If the srcmap contains invalid data, such as HOLE and UNWRITTEN, zero the * area to make sure no old data remains. */ static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size, const struct iomap *srcmap, void *daddr) { loff_t head_off = pos & (align_size - 1); size_t size = ALIGN(head_off + length, align_size); loff_t end = pos + length; loff_t pg_end = round_up(end, align_size); /* copy_all is usually in page fault case */ bool copy_all = head_off == 0 && end == pg_end; /* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */ bool zero_edge = srcmap->flags & IOMAP_F_SHARED || srcmap->type == IOMAP_UNWRITTEN; void *saddr = NULL; int ret = 0; if (!zero_edge) { ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL); if (ret) return dax_mem2blk_err(ret); } if (copy_all) { if (zero_edge) memset(daddr, 0, size); else ret = copy_mc_to_kernel(daddr, saddr, length); goto out; } /* Copy the head part of the range */ if (head_off) { if (zero_edge) memset(daddr, 0, head_off); else { ret = copy_mc_to_kernel(daddr, saddr, head_off); if (ret) return -EIO; } } /* Copy the tail part of the range */ if (end < pg_end) { loff_t tail_off = head_off + length; loff_t tail_len = pg_end - end; if (zero_edge) memset(daddr + tail_off, 0, tail_len); else { ret = copy_mc_to_kernel(daddr + tail_off, saddr + tail_off, tail_len); if (ret) return -EIO; } } out: if (zero_edge) dax_flush(srcmap->dax_dev, daddr, size); return ret ? -EIO : 0; } /* * The user has performed a load from a hole in the file. Allocating a new * page in the file would cause excessive storage usage for workloads with * sparse files. Instead we insert a read-only mapping of the 4k zero page. * If this page is ever written to we will re-fault and change the mapping to * point to real DAX storage instead. 
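 * (The entry inserted here carries the DAX_ZERO_PAGE flag, so a later write
 * fault reaches dax_insert_entry(), which spots the zero-page entry, unmaps
 * the zero mapping and replaces the entry with one backed by real storage.)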
*/ static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf, const struct iomap_iter *iter, void **entry) { struct inode *inode = iter->inode; unsigned long vaddr = vmf->address; unsigned long pfn = zero_pfn(vaddr); vm_fault_t ret; *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE); ret = vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn), false); trace_dax_load_hole(inode, vmf, ret); return ret; } #ifdef CONFIG_FS_DAX_PMD static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, const struct iomap_iter *iter, void **entry) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct inode *inode = mapping->host; struct folio *zero_folio; vm_fault_t ret; zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm); if (unlikely(!zero_folio)) { trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry); return VM_FAULT_FALLBACK; } *entry = dax_insert_entry(xas, vmf, iter, *entry, folio_pfn(zero_folio), DAX_PMD | DAX_ZERO_PAGE); ret = vmf_insert_folio_pmd(vmf, zero_folio, false); if (ret == VM_FAULT_NOPAGE) trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry); return ret; } #else static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, const struct iomap_iter *iter, void **entry) { return VM_FAULT_FALLBACK; } #endif /* CONFIG_FS_DAX_PMD */ static int dax_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); loff_t copy_pos = iter->pos; u64 copy_len = iomap_length(iter); u32 mod; int id = 0; s64 ret; void *daddr = NULL, *saddr = NULL; if (!iomap_want_unshare_iter(iter)) return iomap_iter_advance_full(iter); /* * Extend the file range to be aligned to fsblock/pagesize, because * we need to copy entire blocks, not just the byte range specified. * Invalidate the mapping because we're about to CoW. 
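 * (Illustrative numbers on 4k pages: copy_pos 5000 with copy_len 100 first
 * becomes copy_pos 4096 / copy_len 1004 via the head adjustment, and the
 * tail adjustment then pads copy_len to 4096, i.e. the containing page.)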
*/ mod = offset_in_page(copy_pos); if (mod) { copy_len += mod; copy_pos -= mod; } mod = offset_in_page(copy_pos + copy_len); if (mod) copy_len += PAGE_SIZE - mod; invalidate_inode_pages2_range(iter->inode->i_mapping, copy_pos >> PAGE_SHIFT, (copy_pos + copy_len - 1) >> PAGE_SHIFT); id = dax_read_lock(); ret = dax_iomap_direct_access(iomap, copy_pos, copy_len, &daddr, NULL); if (ret < 0) goto out_unlock; ret = dax_iomap_direct_access(srcmap, copy_pos, copy_len, &saddr, NULL); if (ret < 0) goto out_unlock; if (copy_mc_to_kernel(daddr, saddr, copy_len) != 0) ret = -EIO; out_unlock: dax_read_unlock(id); if (ret < 0) return dax_mem2blk_err(ret); return iomap_iter_advance_full(iter); } int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops) { struct iomap_iter iter = { .inode = inode, .pos = pos, .flags = IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX, }; loff_t size = i_size_read(inode); int ret; if (pos < 0 || pos >= size) return 0; iter.len = min(len, size - pos); while ((ret = iomap_iter(&iter, ops)) > 0) iter.status = dax_unshare_iter(&iter); return ret; } EXPORT_SYMBOL_GPL(dax_file_unshare); static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size) { const struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); unsigned offset = offset_in_page(pos); pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); void *kaddr; long ret; ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL); if (ret < 0) return dax_mem2blk_err(ret); memset(kaddr + offset, 0, size); if (iomap->flags & IOMAP_F_SHARED) ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap, kaddr); else dax_flush(iomap->dax_dev, kaddr + offset, size); return ret; } static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero) { const struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); u64 length = iomap_length(iter); int ret; /* already zeroed? we're done. */ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) return iomap_iter_advance(iter, length); /* * invalidate the pages whose sharing state is to be changed * because of CoW. */ if (iomap->flags & IOMAP_F_SHARED) invalidate_inode_pages2_range(iter->inode->i_mapping, iter->pos >> PAGE_SHIFT, (iter->pos + length - 1) >> PAGE_SHIFT); do { loff_t pos = iter->pos; unsigned offset = offset_in_page(pos); pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); int id; length = min_t(u64, PAGE_SIZE - offset, length); id = dax_read_lock(); if (IS_ALIGNED(pos, PAGE_SIZE) && length == PAGE_SIZE) ret = dax_zero_page_range(iomap->dax_dev, pgoff, 1); else ret = dax_memzero(iter, pos, length); dax_read_unlock(id); if (ret < 0) return ret; ret = iomap_iter_advance(iter, length); if (ret) return ret; } while ((length = iomap_length(iter)) > 0); if (did_zero) *did_zero = true; return ret; } int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops) { struct iomap_iter iter = { .inode = inode, .pos = pos, .len = len, .flags = IOMAP_DAX | IOMAP_ZERO, }; int ret; while ((ret = iomap_iter(&iter, ops)) > 0) iter.status = dax_zero_iter(&iter, did_zero); return ret; } EXPORT_SYMBOL_GPL(dax_zero_range); int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops) { unsigned int blocksize = i_blocksize(inode); unsigned int off = pos & (blocksize - 1); /* Block boundary? 
Nothing to do */ if (!off) return 0; return dax_zero_range(inode, pos, blocksize - off, did_zero, ops); } EXPORT_SYMBOL_GPL(dax_truncate_page); static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter) { const struct iomap *iomap = &iomi->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iomi); loff_t length = iomap_length(iomi); loff_t pos = iomi->pos; struct dax_device *dax_dev = iomap->dax_dev; loff_t end = pos + length, done = 0; bool write = iov_iter_rw(iter) == WRITE; bool cow = write && iomap->flags & IOMAP_F_SHARED; ssize_t ret = 0; size_t xfer; int id; if (!write) { end = min(end, i_size_read(iomi->inode)); if (pos >= end) return 0; if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) { done = iov_iter_zero(min(length, end - pos), iter); return iomap_iter_advance(iomi, done); } } /* * In DAX mode, enforce either pure overwrites of written extents, or * writes to unwritten extents as part of a copy-on-write operation. */ if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED))) return -EIO; /* * Write can allocate block for an area which has a hole page mapped * into page tables. We have to tear down these mappings so that data * written by write(2) is visible in mmap. */ if (iomap->flags & IOMAP_F_NEW || cow) { /* * Filesystem allows CoW on non-shared extents. The src extents * may have been mmapped with dirty mark before. To be able to * invalidate its dax entries, we need to clear the dirty mark * in advance. */ if (cow) __dax_clear_dirty_range(iomi->inode->i_mapping, pos >> PAGE_SHIFT, (end - 1) >> PAGE_SHIFT); invalidate_inode_pages2_range(iomi->inode->i_mapping, pos >> PAGE_SHIFT, (end - 1) >> PAGE_SHIFT); } id = dax_read_lock(); while ((pos = iomi->pos) < end) { unsigned offset = pos & (PAGE_SIZE - 1); const size_t size = ALIGN(length + offset, PAGE_SIZE); pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); ssize_t map_len; bool recovery = false; void *kaddr; if (fatal_signal_pending(current)) { ret = -EINTR; break; } map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), DAX_ACCESS, &kaddr, NULL); if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) { map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), DAX_RECOVERY_WRITE, &kaddr, NULL); if (map_len > 0) recovery = true; } if (map_len < 0) { ret = dax_mem2blk_err(map_len); break; } if (cow) { ret = dax_iomap_copy_around(pos, length, PAGE_SIZE, srcmap, kaddr); if (ret) break; } map_len = PFN_PHYS(map_len); kaddr += offset; map_len -= offset; if (map_len > end - pos) map_len = end - pos; if (recovery) xfer = dax_recovery_write(dax_dev, pgoff, kaddr, map_len, iter); else if (write) xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr, map_len, iter); else xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr, map_len, iter); ret = iomap_iter_advance(iomi, xfer); if (!ret && xfer == 0) ret = -EFAULT; if (xfer < map_len) break; length = iomap_length(iomi); } dax_read_unlock(id); return ret; } /** * dax_iomap_rw - Perform I/O to a DAX file * @iocb: The control block for this I/O * @iter: The addresses to do I/O from or to * @ops: iomap ops passed from the file system * * This function performs read and write operations to directly mapped * persistent memory. The callers needs to take care of read/write exclusion * and evicting any page cache pages in the region under I/O. 
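 *
 * A filesystem typically calls this from its ->read_iter()/->write_iter()
 * handlers once the inode lock is held, roughly (sketch only, the iomap_ops
 * name is illustrative):
 *
 *	inode_lock_shared(inode);
 *	ret = dax_iomap_rw(iocb, to, &example_iomap_ops);
 *	inode_unlock_shared(inode);
 *	file_accessed(iocb->ki_filp);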
*/ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops) { struct iomap_iter iomi = { .inode = iocb->ki_filp->f_mapping->host, .pos = iocb->ki_pos, .len = iov_iter_count(iter), .flags = IOMAP_DAX, }; loff_t done = 0; int ret; if (WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC)) return -EIO; if (!iomi.len) return 0; if (iov_iter_rw(iter) == WRITE) { lockdep_assert_held_write(&iomi.inode->i_rwsem); iomi.flags |= IOMAP_WRITE; } else if (!sb_rdonly(iomi.inode->i_sb)) { lockdep_assert_held(&iomi.inode->i_rwsem); } if (iocb->ki_flags & IOCB_NOWAIT) iomi.flags |= IOMAP_NOWAIT; while ((ret = iomap_iter(&iomi, ops)) > 0) iomi.status = dax_iomap_iter(&iomi, iter); done = iomi.pos - iocb->ki_pos; iocb->ki_pos = iomi.pos; return done ? done : ret; } EXPORT_SYMBOL_GPL(dax_iomap_rw); static vm_fault_t dax_fault_return(int error) { if (error == 0) return VM_FAULT_NOPAGE; return vmf_error(error); } /* * When handling a synchronous page fault and the inode need a fsync, we can * insert the PTE/PMD into page tables only after that fsync happened. Skip * insertion for now and return the pfn so that caller can insert it after the * fsync is done. */ static vm_fault_t dax_fault_synchronous_pfnp(unsigned long *pfnp, unsigned long pfn) { if (WARN_ON_ONCE(!pfnp)) return VM_FAULT_SIGBUS; *pfnp = pfn; return VM_FAULT_NEEDDSYNC; } static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf, const struct iomap_iter *iter) { vm_fault_t ret; int error = 0; switch (iter->iomap.type) { case IOMAP_HOLE: case IOMAP_UNWRITTEN: clear_user_highpage(vmf->cow_page, vmf->address); break; case IOMAP_MAPPED: error = copy_cow_page_dax(vmf, iter); break; default: WARN_ON_ONCE(1); error = -EIO; break; } if (error) return dax_fault_return(error); __SetPageUptodate(vmf->cow_page); ret = finish_fault(vmf); if (!ret) return VM_FAULT_DONE_COW; return ret; } /** * dax_fault_iter - Common actor to handle pfn insertion in PTE/PMD fault. * @vmf: vm fault instance * @iter: iomap iter * @pfnp: pfn to be returned * @xas: the dax mapping tree of a file * @entry: an unlocked dax entry to be inserted * @pmd: distinguish whether it is a pmd fault */ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, const struct iomap_iter *iter, unsigned long *pfnp, struct xa_state *xas, void **entry, bool pmd) { const struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); size_t size = pmd ? PMD_SIZE : PAGE_SIZE; loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT; bool write = iter->flags & IOMAP_WRITE; unsigned long entry_flags = pmd ? DAX_PMD : 0; struct folio *folio; int ret, err = 0; unsigned long pfn; void *kaddr; if (!pmd && vmf->cow_page) return dax_fault_cow_page(vmf, iter); /* if we are reading UNWRITTEN and HOLE, return a hole. */ if (!write && (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) { if (!pmd) return dax_load_hole(xas, vmf, iter, entry); return dax_pmd_load_hole(xas, vmf, iter, entry); } if (iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED)) { WARN_ON_ONCE(1); return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS; } err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn); if (err) return pmd ? 
VM_FAULT_FALLBACK : dax_fault_return(err); *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags); if (write && iomap->flags & IOMAP_F_SHARED) { err = dax_iomap_copy_around(pos, size, size, srcmap, kaddr); if (err) return dax_fault_return(err); } folio = dax_to_folio(*entry); if (dax_fault_is_synchronous(iter, vmf->vma)) return dax_fault_synchronous_pfnp(pfnp, pfn); folio_ref_inc(folio); if (pmd) ret = vmf_insert_folio_pmd(vmf, pfn_folio(pfn), write); else ret = vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn), write); folio_put(folio); return ret; } static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, unsigned long *pfnp, int *iomap_errp, const struct iomap_ops *ops) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; XA_STATE(xas, &mapping->i_pages, vmf->pgoff); struct iomap_iter iter = { .inode = mapping->host, .pos = (loff_t)vmf->pgoff << PAGE_SHIFT, .len = PAGE_SIZE, .flags = IOMAP_DAX | IOMAP_FAULT, }; vm_fault_t ret = 0; void *entry; int error; trace_dax_pte_fault(iter.inode, vmf, ret); /* * Check whether offset isn't beyond end of file now. Caller is supposed * to hold locks serializing us with truncate / punch hole so this is * a reliable test. */ if (iter.pos >= i_size_read(iter.inode)) { ret = VM_FAULT_SIGBUS; goto out; } if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) iter.flags |= IOMAP_WRITE; entry = grab_mapping_entry(&xas, mapping, 0); if (xa_is_internal(entry)) { ret = xa_to_internal(entry); goto out; } /* * It is possible, particularly with mixed reads & writes to private * mappings, that we have raced with a PMD fault that overlaps with * the PTE we need to set up. If so just return and the fault will be * retried. */ if (pmd_trans_huge(*vmf->pmd)) { ret = VM_FAULT_NOPAGE; goto unlock_entry; } while ((error = iomap_iter(&iter, ops)) > 0) { if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) { iter.status = -EIO; /* fs corruption? */ continue; } ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, false); if (ret != VM_FAULT_SIGBUS && (iter.iomap.flags & IOMAP_F_NEW)) { count_vm_event(PGMAJFAULT); count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); ret |= VM_FAULT_MAJOR; } if (!(ret & VM_FAULT_ERROR)) iter.status = iomap_iter_advance(&iter, PAGE_SIZE); } if (iomap_errp) *iomap_errp = error; if (!ret && error) ret = dax_fault_return(error); unlock_entry: dax_unlock_entry(&xas, entry); out: trace_dax_pte_fault_done(iter.inode, vmf, ret); return ret; } #ifdef CONFIG_FS_DAX_PMD static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas, pgoff_t max_pgoff) { unsigned long pmd_addr = vmf->address & PMD_MASK; bool write = vmf->flags & FAULT_FLAG_WRITE; /* * Make sure that the faulting address's PMD offset (color) matches * the PMD offset from the start of the file. This is necessary so * that a PMD range in the page table overlaps exactly with a PMD * range in the page cache. 
*/ if ((vmf->pgoff & PG_PMD_COLOUR) != ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR)) return true; /* Fall back to PTEs if we're going to COW */ if (write && !(vmf->vma->vm_flags & VM_SHARED)) return true; /* If the PMD would extend outside the VMA */ if (pmd_addr < vmf->vma->vm_start) return true; if ((pmd_addr + PMD_SIZE) > vmf->vma->vm_end) return true; /* If the PMD would extend beyond the file size */ if ((xas->xa_index | PG_PMD_COLOUR) >= max_pgoff) return true; return false; } static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp, const struct iomap_ops *ops) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER); struct iomap_iter iter = { .inode = mapping->host, .len = PMD_SIZE, .flags = IOMAP_DAX | IOMAP_FAULT, }; vm_fault_t ret = VM_FAULT_FALLBACK; pgoff_t max_pgoff; void *entry; if (vmf->flags & FAULT_FLAG_WRITE) iter.flags |= IOMAP_WRITE; /* * Check whether offset isn't beyond end of file now. Caller is * supposed to hold locks serializing us with truncate / punch hole so * this is a reliable test. */ max_pgoff = DIV_ROUND_UP(i_size_read(iter.inode), PAGE_SIZE); trace_dax_pmd_fault(iter.inode, vmf, max_pgoff, 0); if (xas.xa_index >= max_pgoff) { ret = VM_FAULT_SIGBUS; goto out; } if (dax_fault_check_fallback(vmf, &xas, max_pgoff)) goto fallback; /* * grab_mapping_entry() will make sure we get an empty PMD entry, * a zero PMD entry or a DAX PMD. If it can't (because a PTE * entry is already in the array, for instance), it will return * VM_FAULT_FALLBACK. */ entry = grab_mapping_entry(&xas, mapping, PMD_ORDER); if (xa_is_internal(entry)) { ret = xa_to_internal(entry); goto fallback; } /* * It is possible, particularly with mixed reads & writes to private * mappings, that we have raced with a PTE fault that overlaps with * the PMD we need to set up. If so just return and the fault will be * retried. */ if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd)) { ret = 0; goto unlock_entry; } iter.pos = (loff_t)xas.xa_index << PAGE_SHIFT; while (iomap_iter(&iter, ops) > 0) { if (iomap_length(&iter) < PMD_SIZE) continue; /* actually breaks out of the loop */ ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true); if (ret != VM_FAULT_FALLBACK) iter.status = iomap_iter_advance(&iter, PMD_SIZE); } unlock_entry: dax_unlock_entry(&xas, entry); fallback: if (ret == VM_FAULT_FALLBACK) { split_huge_pmd(vmf->vma, vmf->pmd, vmf->address); count_vm_event(THP_FAULT_FALLBACK); } out: trace_dax_pmd_fault_done(iter.inode, vmf, max_pgoff, ret); return ret; } #else static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp, const struct iomap_ops *ops) { return VM_FAULT_FALLBACK; } #endif /* CONFIG_FS_DAX_PMD */ /** * dax_iomap_fault - handle a page fault on a DAX file * @vmf: The description of the fault * @order: Order of the page to fault in * @pfnp: PFN to insert for synchronous faults if fsync is required * @iomap_errp: Storage for detailed error code in case of error * @ops: Iomap ops passed from the file system * * When a page fault occurs, filesystems may call this helper in * their fault handler for DAX files. dax_iomap_fault() assumes the caller * has done all the necessary locking for page fault to proceed * successfully. 
*/ vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order, unsigned long *pfnp, int *iomap_errp, const struct iomap_ops *ops) { if (order == 0) return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops); else if (order == PMD_ORDER) return dax_iomap_pmd_fault(vmf, pfnp, ops); else return VM_FAULT_FALLBACK; } EXPORT_SYMBOL_GPL(dax_iomap_fault); /* * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables * @vmf: The description of the fault * @pfn: PFN to insert * @order: Order of entry to insert. * * This function inserts a writeable PTE or PMD entry into the page tables * for an mmaped DAX file. It also marks the page cache entry as dirty. */ static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, unsigned long pfn, unsigned int order) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order); struct folio *folio; void *entry; vm_fault_t ret; xas_lock_irq(&xas); entry = get_next_unlocked_entry(&xas, order); /* Did we race with someone splitting entry or so? */ if (!entry || dax_is_conflict(entry) || (order == 0 && !dax_is_pte_entry(entry))) { put_unlocked_entry(&xas, entry, WAKE_NEXT); xas_unlock_irq(&xas); trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } xas_set_mark(&xas, PAGECACHE_TAG_DIRTY); dax_lock_entry(&xas, entry); xas_unlock_irq(&xas); folio = pfn_folio(pfn); folio_ref_inc(folio); if (order == 0) ret = vmf_insert_page_mkwrite(vmf, &folio->page, true); #ifdef CONFIG_FS_DAX_PMD else if (order == PMD_ORDER) ret = vmf_insert_folio_pmd(vmf, folio, FAULT_FLAG_WRITE); #endif else ret = VM_FAULT_FALLBACK; folio_put(folio); dax_unlock_entry(&xas, entry); trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret); return ret; } /** * dax_finish_sync_fault - finish synchronous page fault * @vmf: The description of the fault * @order: Order of entry to be inserted * @pfn: PFN to insert * * This function ensures that the file range touched by the page fault is * stored persistently on the media and handles inserting of appropriate page * table entry. 
*/ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order, unsigned long pfn) { int err; loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; size_t len = PAGE_SIZE << order; err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1); if (err) return VM_FAULT_SIGBUS; return dax_insert_pfn_mkwrite(vmf, pfn, order); } EXPORT_SYMBOL_GPL(dax_finish_sync_fault); static int dax_range_compare_iter(struct iomap_iter *it_src, struct iomap_iter *it_dest, u64 len, bool *same) { const struct iomap *smap = &it_src->iomap; const struct iomap *dmap = &it_dest->iomap; loff_t pos1 = it_src->pos, pos2 = it_dest->pos; void *saddr, *daddr; int id, ret; len = min(len, min(smap->length, dmap->length)); if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) { *same = true; goto advance; } if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) { *same = false; return 0; } id = dax_read_lock(); ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE), &saddr, NULL); if (ret < 0) goto out_unlock; ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE), &daddr, NULL); if (ret < 0) goto out_unlock; *same = !memcmp(saddr, daddr, len); if (!*same) len = 0; dax_read_unlock(id); advance: ret = iomap_iter_advance(it_src, len); if (!ret) ret = iomap_iter_advance(it_dest, len); return ret; out_unlock: dax_read_unlock(id); return -EIO; } int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, struct inode *dst, loff_t dstoff, loff_t len, bool *same, const struct iomap_ops *ops) { struct iomap_iter src_iter = { .inode = src, .pos = srcoff, .len = len, .flags = IOMAP_DAX, }; struct iomap_iter dst_iter = { .inode = dst, .pos = dstoff, .len = len, .flags = IOMAP_DAX, }; int ret, status; while ((ret = iomap_iter(&src_iter, ops)) > 0 && (ret = iomap_iter(&dst_iter, ops)) > 0) { status = dax_range_compare_iter(&src_iter, &dst_iter, min(src_iter.len, dst_iter.len), same); if (status < 0) return ret; src_iter.status = dst_iter.status = status; } return ret; } int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags, const struct iomap_ops *ops) { return __generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, len, remap_flags, ops); } EXPORT_SYMBOL_GPL(dax_remap_file_range_prep); |
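The helpers above are meant to be driven by the filesystem: dax_iomap_rw() from its read/write paths with the inode's i_rwsem held, and dax_iomap_fault()/dax_finish_sync_fault() from its page-fault handlers. The following is a minimal sketch of that wiring, not part of fs/dax.c; my_iomap_ops, my_file_read_iter and my_filemap_huge_fault are hypothetical names standing in for a filesystem's own implementations, and a real filesystem would also handle write faults, sb_start_pagefault() and friends.

/*
 * Illustrative sketch only: how a filesystem might drive dax_iomap_rw(),
 * dax_iomap_fault() and dax_finish_sync_fault().  "my_iomap_ops" is a
 * placeholder for the filesystem's real iomap_ops.
 */
static const struct iomap_ops my_iomap_ops;

static ssize_t my_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	/* dax_iomap_rw() expects the caller to hold i_rwsem (shared for reads) */
	inode_lock_shared(inode);
	ret = dax_iomap_rw(iocb, to, &my_iomap_ops);
	inode_unlock_shared(inode);
	return ret;
}

static vm_fault_t my_filemap_huge_fault(struct vm_fault *vmf, unsigned int order)
{
	unsigned long pfn;
	vm_fault_t ret;

	ret = dax_iomap_fault(vmf, order, &pfn, NULL, &my_iomap_ops);

	/*
	 * For synchronous faults the PTE/PMD has not been inserted yet;
	 * flush the range and insert it once the metadata is persistent.
	 */
	if (ret & VM_FAULT_NEEDDSYNC)
		ret = dax_finish_sync_fault(vmf, order, pfn);

	return ret;
}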
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TIMENS_H #define _LINUX_TIMENS_H #include <linux/sched.h> #include <linux/nsproxy.h> #include <linux/ns_common.h> #include <linux/err.h> #include <linux/time64.h> #include <linux/cleanup.h> struct user_namespace; extern struct user_namespace init_user_ns; struct seq_file; struct vm_area_struct; struct timens_offsets { struct timespec64 monotonic; struct timespec64 boottime; }; struct time_namespace { struct user_namespace *user_ns; struct ucounts *ucounts; struct ns_common ns; struct timens_offsets offsets; #ifdef CONFIG_TIME_NS_VDSO struct page *vvar_page; #endif /* If set prevents changing offsets after any task joined namespace. */ bool frozen_offsets; } __randomize_layout; extern struct time_namespace init_time_ns; #ifdef CONFIG_TIME_NS static inline struct time_namespace *to_time_ns(struct ns_common *ns) { return container_of(ns, struct time_namespace, ns); } void __init time_ns_init(void); static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { ns_ref_inc(ns); return ns; } struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns); void free_time_ns(struct time_namespace *ns); void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk); static inline void put_time_ns(struct time_namespace *ns) { if (ns_ref_put(ns)) free_time_ns(ns); } void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m); struct proc_timens_offset { int clockid; struct timespec64 val; }; int proc_timens_set_offset(struct file *file, struct task_struct *p, struct proc_timens_offset *offsets, int n); static inline void timens_add_monotonic(struct timespec64 *ts) { struct timens_offsets *ns_offsets = &current->nsproxy->time_ns->offsets; *ts = timespec64_add(*ts, ns_offsets->monotonic); } static inline void timens_add_boottime(struct timespec64 *ts) { struct timens_offsets *ns_offsets = &current->nsproxy->time_ns->offsets; *ts = timespec64_add(*ts, ns_offsets->boottime); } static inline u64 timens_add_boottime_ns(u64 nsec) { struct timens_offsets *ns_offsets = &current->nsproxy->time_ns->offsets; return nsec + timespec64_to_ns(&ns_offsets->boottime); } static inline void timens_sub_boottime(struct timespec64 *ts) { struct timens_offsets *ns_offsets = &current->nsproxy->time_ns->offsets; *ts = timespec64_sub(*ts, ns_offsets->boottime); } ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, struct timens_offsets *offsets); static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim) { struct time_namespace *ns = current->nsproxy->time_ns; if (likely(ns == &init_time_ns)) return tim; return do_timens_ktime_to_host(clockid, tim, &ns->offsets); } #else static inline void __init time_ns_init(void) { } static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { return NULL; } static inline void put_time_ns(struct time_namespace *ns) { } static inline struct
time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns) { if (flags & CLONE_NEWTIME) return ERR_PTR(-EINVAL); return old_ns; } static inline void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) { return; } static inline void timens_add_monotonic(struct timespec64 *ts) { } static inline void timens_add_boottime(struct timespec64 *ts) { } static inline u64 timens_add_boottime_ns(u64 nsec) { return nsec; } static inline void timens_sub_boottime(struct timespec64 *ts) { } static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim) { return tim; } #endif #ifdef CONFIG_TIME_NS_VDSO extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns); struct page *find_timens_vvar_page(struct vm_area_struct *vma); #else /* !CONFIG_TIME_NS_VDSO */ static inline void timens_commit(struct task_struct *tsk, struct time_namespace *ns) { } static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma) { return NULL; } #endif /* CONFIG_TIME_NS_VDSO */ DEFINE_FREE(time_ns, struct time_namespace *, if (_T) put_time_ns(_T)) #endif /* _LINUX_TIMENS_H */ |
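To make the offset helpers concrete, here is a minimal sketch (not part of this header) of how a hypothetical clock path could use them: raw host values are shifted into the caller's time namespace on the way out, and namespace-relative absolute expiries are converted back to host time before a timer is armed. The function names are illustrative only.

/* Illustrative sketch: applying time-namespace offsets in a clock path. */
static void my_get_boottime(struct timespec64 *ts)
{
	/*
	 * ts currently holds the host CLOCK_BOOTTIME value; shift it into the
	 * caller's time namespace (a no-op when CONFIG_TIME_NS is disabled).
	 */
	timens_add_boottime(ts);
}

static ktime_t my_timer_expiry_to_host(clockid_t clockid, ktime_t expiry)
{
	/* absolute expiry as seen inside the namespace -> host clock value */
	return timens_ktime_to_host(clockid, expiry);
}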
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __X86_KERNEL_FPU_CONTEXT_H #define __X86_KERNEL_FPU_CONTEXT_H #include <asm/fpu/xstate.h> #include <asm/trace/fpu.h> /* Functions related to FPU context tracking */ /* * The in-register FPU state for an FPU context on a CPU is assumed to be * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx * matches the FPU. * * If the FPU register state is valid, the kernel can skip restoring the * FPU state from memory. * * Any code that clobbers the FPU registers or updates the in-memory * FPU state for a task MUST let the rest of the kernel know that the * FPU registers are no longer valid for this task. * * Invalidate a resource you control: CPU if using the CPU for something else * (with preemption disabled), FPU for the current task, or a task that * is prevented from running by the current task. */ static inline void __cpu_invalidate_fpregs_state(void) { __this_cpu_write(fpu_fpregs_owner_ctx, NULL); } static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) { fpu->last_cpu = -1; } static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) { return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; } static inline void fpregs_deactivate(struct fpu *fpu) { __this_cpu_write(fpu_fpregs_owner_ctx, NULL); trace_x86_fpu_regs_deactivated(fpu); } static inline void fpregs_activate(struct fpu *fpu) { __this_cpu_write(fpu_fpregs_owner_ctx, fpu); trace_x86_fpu_regs_activated(fpu); } /* Internal helper for switch_fpu_return() and signal frame setup */ static inline void fpregs_restore_userregs(void) { struct fpu *fpu = x86_task_fpu(current); int cpu = smp_processor_id(); if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_USER_WORKER))) return; if (!fpregs_state_valid(fpu, cpu)) { /* * This restores _all_ xstate which has not been * established yet. * * If PKRU is enabled, then the PKRU value is already * correct because it was either set in switch_to() or in * flush_thread(). So it is excluded because it might be * not up to date in current->thread.fpu->xsave state. * * XFD state is handled in restore_fpregs_from_fpstate(). */ restore_fpregs_from_fpstate(fpu->fpstate, XFEATURE_MASK_FPSTATE); fpregs_activate(fpu); fpu->last_cpu = cpu; } clear_thread_flag(TIF_NEED_FPU_LOAD); } #endif |
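As a concrete illustration of the invalidation rule spelled out above, the sketch below (not part of this header) shows a hypothetical helper that clobbers the FPU registers on the local CPU and therefore clears the per-CPU owner context, so that fpregs_state_valid() fails for every task and the next fpregs_restore_userregs() reloads the register state from memory.

/*
 * Illustrative sketch: invalidating the per-CPU FPU register ownership
 * before clobbering the registers for kernel-internal work.
 */
static void my_use_fpu_for_kernel_work(void)
{
	/*
	 * The caller is assumed to have disabled preemption while the
	 * registers are in use (in real code this is what helpers such as
	 * kernel_fpu_begin() arrange).
	 */
	__cpu_invalidate_fpregs_state();

	/* ... clobber the FPU registers for kernel-internal work ... */
}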
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HIGHMEM_H #define _LINUX_HIGHMEM_H #include <linux/fs.h> #include <linux/kernel.h> #include <linux/bug.h> #include <linux/cacheflush.h> #include <linux/kmsan.h> #include <linux/mm.h> #include <linux/uaccess.h> #include <linux/hardirq.h> #include "highmem-internal.h" /** * kmap - Map a page for long term usage * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * * Can only be invoked from preemptible task context because
on 32bit * systems with CONFIG_HIGHMEM enabled this function might sleep. * * For systems with CONFIG_HIGHMEM=n and for pages in the low memory area * this returns the virtual address of the direct kernel mapping. * * The returned virtual address is globally visible and valid up to the * point where it is unmapped via kunmap(). The pointer can be handed to * other contexts. * * For highmem pages on 32bit systems this can be slow as the mapping space * is limited and protected by a global lock. In case that there is no * mapping slot available the function blocks until a slot is released via * kunmap(). */ static inline void *kmap(struct page *page); /** * kunmap - Unmap the virtual address mapped by kmap() * @page: Pointer to the page which was mapped by kmap() * * Counterpart to kmap(). A NOOP for CONFIG_HIGHMEM=n and for mappings of * pages in the low memory area. */ static inline void kunmap(const struct page *page); /** * kmap_to_page - Get the page for a kmap'ed address * @addr: The address to look up * * Returns: The page which is mapped to @addr. */ static inline struct page *kmap_to_page(void *addr); /** * kmap_flush_unused - Flush all unused kmap mappings in order to * remove stray mappings */ static inline void kmap_flush_unused(void); /** * kmap_local_page - Map a page for temporary usage * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * * Can be invoked from any context, including interrupts. * * Requires careful handling when nesting multiple mappings because the map * management is stack based. The unmap has to be in the reverse order of * the map operation: * * addr1 = kmap_local_page(page1); * addr2 = kmap_local_page(page2); * ... * kunmap_local(addr2); * kunmap_local(addr1); * * Unmapping addr1 before addr2 is invalid and causes malfunction. * * Contrary to kmap() mappings the mapping is only valid in the context of * the caller and cannot be handed to other contexts. * * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the * virtual address of the direct mapping. Only real highmem pages are * temporarily mapped. * * While kmap_local_page() is significantly faster than kmap() for the highmem * case it comes with restrictions about the pointer validity. * * On HIGHMEM enabled systems mapping a highmem page has the side effect of * disabling migration in order to keep the virtual address stable across * preemption. No caller of kmap_local_page() can rely on this side effect. */ static inline void *kmap_local_page(const struct page *page); /** * kmap_local_folio - Map a page in this folio for temporary usage * @folio: The folio containing the page. * @offset: The byte offset within the folio which identifies the page. * * Requires careful handling when nesting multiple mappings because the map * management is stack based. The unmap has to be in the reverse order of * the map operation:: * * addr1 = kmap_local_folio(folio1, offset1); * addr2 = kmap_local_folio(folio2, offset2); * ... * kunmap_local(addr2); * kunmap_local(addr1); * * Unmapping addr1 before addr2 is invalid and causes malfunction. * * Contrary to kmap() mappings the mapping is only valid in the context of * the caller and cannot be handed to other contexts. * * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the * virtual address of the direct mapping. Only real highmem pages are * temporarily mapped. * * While it is significantly faster than kmap() for the highmem case it * comes with restrictions about the pointer validity. 
* * On HIGHMEM enabled systems mapping a highmem page has the side effect of * disabling migration in order to keep the virtual address stable across * preemption. No caller of kmap_local_folio() can rely on this side effect. * * Context: Can be invoked from any context. * Return: The virtual address of @offset. */ static inline void *kmap_local_folio(const struct folio *folio, size_t offset); /** * kmap_atomic - Atomically map a page for temporary usage - Deprecated! * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * * In fact a wrapper around kmap_local_page() which also disables pagefaults * and, depending on PREEMPT_RT configuration, also CPU migration and * preemption. Therefore users should not count on the latter two side effects. * * Mappings should always be released by kunmap_atomic(). * * Do not use in new code. Use kmap_local_page() instead. * * It is used in atomic context when code wants to access the contents of a * page that might be allocated from high memory (see __GFP_HIGHMEM), for * example a page in the pagecache. The API has two functions, and they * can be used in a manner similar to the following:: * * // Find the page of interest. * struct page *page = find_get_page(mapping, offset); * * // Gain access to the contents of that page. * void *vaddr = kmap_atomic(page); * * // Do something to the contents of that page. * memset(vaddr, 0, PAGE_SIZE); * * // Unmap that page. * kunmap_atomic(vaddr); * * Note that the kunmap_atomic() call takes the result of the kmap_atomic() * call, not the argument. * * If you need to map two pages because you want to copy from one page to * another you need to keep the kmap_atomic calls strictly nested, like: * * vaddr1 = kmap_atomic(page1); * vaddr2 = kmap_atomic(page2); * * memcpy(vaddr1, vaddr2, PAGE_SIZE); * * kunmap_atomic(vaddr2); * kunmap_atomic(vaddr1); */ static inline void *kmap_atomic(const struct page *page); /* Highmem related interfaces for management code */ static inline unsigned long nr_free_highpages(void); static inline unsigned long totalhigh_pages(void); #ifndef ARCH_HAS_FLUSH_ANON_PAGE static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr) { } #endif #ifndef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE static inline void flush_kernel_vmap_range(void *vaddr, int size) { } static inline void invalidate_kernel_vmap_range(void *vaddr, int size) { } #endif #ifndef clear_user_highpage #ifndef clear_user_page /** * clear_user_page() - clear a page to be mapped to user space * @addr: the address of the page * @vaddr: the address of the user mapping * @page: the page * * We condition the definition of clear_user_page() on the architecture * not having a custom clear_user_highpage(). That's because if there * is some special flushing needed for clear_user_highpage() then it * is likely that clear_user_page() also needs some magic. And, since * our only caller is the generic clear_user_highpage(), not defining * is not much of a loss. */ static inline void clear_user_page(void *addr, unsigned long vaddr, struct page *page) { clear_page(addr); } #endif /** * clear_user_pages() - clear a page range to be mapped to user space * @addr: start address * @vaddr: start address of the user mapping * @page: start page * @npages: number of pages * * Assumes that the region (@addr, +@npages) has been validated * already so this does no exception handling. 
* * If the architecture provides a clear_user_page(), use that; * otherwise, we can safely use clear_pages(). */ static inline void clear_user_pages(void *addr, unsigned long vaddr, struct page *page, unsigned int npages) { #ifdef clear_user_page do { clear_user_page(addr, vaddr, page); addr += PAGE_SIZE; vaddr += PAGE_SIZE; page++; } while (--npages); #else /* * Prefer clear_pages() to allow for architectural optimizations * when operating on contiguous page ranges. */ clear_pages(addr, npages); #endif } /** * clear_user_highpage() - clear a page to be mapped to user space * @page: start page * @vaddr: start address of the user mapping * * With !CONFIG_HIGHMEM this (and the copy_user_highpage() below) will * be plain clear_user_page() (and copy_user_page()). */ static inline void clear_user_highpage(struct page *page, unsigned long vaddr) { void *addr = kmap_local_page(page); clear_user_page(addr, vaddr, page); kunmap_local(addr); } #endif /* clear_user_highpage */ /** * clear_user_highpages() - clear a page range to be mapped to user space * @page: start page * @vaddr: start address of the user mapping * @npages: number of pages * * Assumes that all the pages in the region (@page, +@npages) are valid * so this does no exception handling. */ static inline void clear_user_highpages(struct page *page, unsigned long vaddr, unsigned int npages) { #if defined(clear_user_highpage) || defined(CONFIG_HIGHMEM) /* * An architecture defined clear_user_highpage() implies special * handling is needed. * * So we use that or, the generic variant if CONFIG_HIGHMEM is * enabled. */ do { clear_user_highpage(page, vaddr); vaddr += PAGE_SIZE; page++; } while (--npages); #else /* * Prefer clear_user_pages() to allow for architectural optimizations * when operating on contiguous page ranges. */ clear_user_pages(page_address(page), vaddr, page, npages); #endif } #ifndef vma_alloc_zeroed_movable_folio /** * vma_alloc_zeroed_movable_folio - Allocate a zeroed page for a VMA. * @vma: The VMA the page is to be allocated for. * @vaddr: The virtual address the page will be inserted into. * * This function will allocate a page suitable for inserting into this * VMA at this virtual address. It may be allocated from highmem or * the movable zone. An architecture may provide its own implementation. * * Return: A folio containing one allocated and zeroed page or NULL if * we are out of memory. */ static inline struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr) { struct folio *folio; folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr); if (folio && user_alloc_needs_zeroing()) clear_user_highpage(&folio->page, vaddr); return folio; } #endif static inline void clear_highpage(struct page *page) { void *kaddr = kmap_local_page(page); clear_page(kaddr); kunmap_local(kaddr); } static inline void clear_highpage_kasan_tagged(struct page *page) { void *kaddr = kmap_local_page(page); clear_page(kasan_reset_tag(kaddr)); kunmap_local(kaddr); } #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGES /* Return false to let people know we did not initialize the pages */ static inline bool tag_clear_highpages(struct page *page, int numpages) { return false; } #endif /* * If we pass in a base or tail page, we can zero up to PAGE_SIZE. * If we pass in a head page, we can zero up to the size of the compound page. 
*/ #ifdef CONFIG_HIGHMEM void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2); #else static inline void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2) { void *kaddr = kmap_local_page(page); unsigned int i; BUG_ON(end1 > page_size(page) || end2 > page_size(page)); if (end1 > start1) memset(kaddr + start1, 0, end1 - start1); if (end2 > start2) memset(kaddr + start2, 0, end2 - start2); kunmap_local(kaddr); for (i = 0; i < compound_nr(page); i++) flush_dcache_page(page + i); } #endif static inline void zero_user_segment(struct page *page, unsigned start, unsigned end) { zero_user_segments(page, start, end, 0, 0); } #ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE static inline void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_user_page(vto, vfrom, vaddr, to); kmsan_unpoison_memory(page_address(to), PAGE_SIZE); kunmap_local(vto); kunmap_local(vfrom); } #endif #ifndef __HAVE_ARCH_COPY_HIGHPAGE static inline void copy_highpage(struct page *to, struct page *from) { char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_page(vto, vfrom); kmsan_copy_page_meta(to, from); kunmap_local(vto); kunmap_local(vfrom); } #endif #ifdef copy_mc_to_kernel /* * If architecture supports machine check exception handling, define the * #MC versions of copy_user_highpage and copy_highpage. They copy a memory * page with #MC in source page (@from) handled, and return the number * of bytes not copied if there was a #MC, otherwise 0 for success. */ static inline int copy_mc_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { unsigned long ret; char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE); if (!ret) kmsan_unpoison_memory(page_address(to), PAGE_SIZE); kunmap_local(vto); kunmap_local(vfrom); if (ret) memory_failure_queue(page_to_pfn(from), 0); return ret; } static inline int copy_mc_highpage(struct page *to, struct page *from) { unsigned long ret; char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE); if (!ret) kmsan_copy_page_meta(to, from); kunmap_local(vto); kunmap_local(vfrom); if (ret) memory_failure_queue(page_to_pfn(from), 0); return ret; } #else static inline int copy_mc_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { copy_user_highpage(to, from, vaddr, vma); return 0; } static inline int copy_mc_highpage(struct page *to, struct page *from) { copy_highpage(to, from); return 0; } #endif static inline void memcpy_page(struct page *dst_page, size_t dst_off, struct page *src_page, size_t src_off, size_t len) { char *dst = kmap_local_page(dst_page); char *src = kmap_local_page(src_page); VM_BUG_ON(dst_off + len > PAGE_SIZE || src_off + len > PAGE_SIZE); memcpy(dst + dst_off, src + src_off, len); kunmap_local(src); kunmap_local(dst); } static inline void memcpy_folio(struct folio *dst_folio, size_t dst_off, struct folio *src_folio, size_t src_off, size_t len) { VM_BUG_ON(dst_off + len > folio_size(dst_folio)); VM_BUG_ON(src_off + len > folio_size(src_folio)); do { char *dst = kmap_local_folio(dst_folio, dst_off); const char *src = kmap_local_folio(src_folio, src_off); size_t chunk = len; if 
(folio_test_highmem(dst_folio) && chunk > PAGE_SIZE - offset_in_page(dst_off)) chunk = PAGE_SIZE - offset_in_page(dst_off); if (folio_test_highmem(src_folio) && chunk > PAGE_SIZE - offset_in_page(src_off)) chunk = PAGE_SIZE - offset_in_page(src_off); memcpy(dst, src, chunk); kunmap_local(src); kunmap_local(dst); dst_off += chunk; src_off += chunk; len -= chunk; } while (len > 0); } static inline void memset_page(struct page *page, size_t offset, int val, size_t len) { char *addr = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memset(addr + offset, val, len); kunmap_local(addr); } static inline void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len) { char *from = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memcpy(to, from + offset, len); kunmap_local(from); } static inline void memcpy_to_page(struct page *page, size_t offset, const char *from, size_t len) { char *to = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memcpy(to + offset, from, len); flush_dcache_page(page); kunmap_local(to); } static inline void memzero_page(struct page *page, size_t offset, size_t len) { char *addr = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memset(addr + offset, 0, len); flush_dcache_page(page); kunmap_local(addr); } /** * memcpy_from_folio - Copy a range of bytes from a folio. * @to: The memory to copy to. * @folio: The folio to read from. * @offset: The first byte in the folio to read. * @len: The number of bytes to copy. */ static inline void memcpy_from_folio(char *to, struct folio *folio, size_t offset, size_t len) { VM_BUG_ON(offset + len > folio_size(folio)); do { const char *from = kmap_local_folio(folio, offset); size_t chunk = len; if (folio_test_partial_kmap(folio) && chunk > PAGE_SIZE - offset_in_page(offset)) chunk = PAGE_SIZE - offset_in_page(offset); memcpy(to, from, chunk); kunmap_local(from); to += chunk; offset += chunk; len -= chunk; } while (len > 0); } /** * memcpy_to_folio - Copy a range of bytes to a folio. * @folio: The folio to write to. * @offset: The first byte in the folio to store to. * @from: The memory to copy from. * @len: The number of bytes to copy. */ static inline void memcpy_to_folio(struct folio *folio, size_t offset, const char *from, size_t len) { VM_BUG_ON(offset + len > folio_size(folio)); do { char *to = kmap_local_folio(folio, offset); size_t chunk = len; if (folio_test_partial_kmap(folio) && chunk > PAGE_SIZE - offset_in_page(offset)) chunk = PAGE_SIZE - offset_in_page(offset); memcpy(to, from, chunk); kunmap_local(to); from += chunk; offset += chunk; len -= chunk; } while (len > 0); flush_dcache_folio(folio); } /** * folio_zero_tail - Zero the tail of a folio. * @folio: The folio to zero. * @offset: The byte offset in the folio to start zeroing at. * @kaddr: The address the folio is currently mapped to. * * If you have already used kmap_local_folio() to map a folio, written * some data to it and now need to zero the end of the folio (and flush * the dcache), you can use this function. If you do not have the * folio kmapped (eg the folio has been partially populated by DMA), * use folio_zero_range() or folio_zero_segment() instead. * * Return: An address which can be passed to kunmap_local(). 
*/ static inline __must_check void *folio_zero_tail(struct folio *folio, size_t offset, void *kaddr) { size_t len = folio_size(folio) - offset; if (folio_test_partial_kmap(folio)) { size_t max = PAGE_SIZE - offset_in_page(offset); while (len > max) { memset(kaddr, 0, max); kunmap_local(kaddr); len -= max; offset += max; max = PAGE_SIZE; kaddr = kmap_local_folio(folio, offset); } } memset(kaddr, 0, len); flush_dcache_folio(folio); return kaddr; } /** * folio_fill_tail - Copy some data to a folio and pad with zeroes. * @folio: The destination folio. * @offset: The offset into @folio at which to start copying. * @from: The data to copy. * @len: How many bytes of data to copy. * * This function is most useful for filesystems which support inline data. * When they want to copy data from the inode into the page cache, this * function does everything for them. It supports large folios even on * HIGHMEM configurations. */ static inline void folio_fill_tail(struct folio *folio, size_t offset, const char *from, size_t len) { char *to = kmap_local_folio(folio, offset); VM_BUG_ON(offset + len > folio_size(folio)); if (folio_test_partial_kmap(folio)) { size_t max = PAGE_SIZE - offset_in_page(offset); while (len > max) { memcpy(to, from, max); kunmap_local(to); len -= max; from += max; offset += max; max = PAGE_SIZE; to = kmap_local_folio(folio, offset); } } memcpy(to, from, len); to = folio_zero_tail(folio, offset + len, to + len); kunmap_local(to); } /** * memcpy_from_file_folio - Copy some bytes from a file folio. * @to: The destination buffer. * @folio: The folio to copy from. * @pos: The position in the file. * @len: The maximum number of bytes to copy. * * Copy up to @len bytes from this folio. This may be limited by PAGE_SIZE * if the folio comes from HIGHMEM, and by the size of the folio. * * Return: The number of bytes copied from the folio. */ static inline size_t memcpy_from_file_folio(char *to, struct folio *folio, loff_t pos, size_t len) { size_t offset = offset_in_folio(folio, pos); char *from = kmap_local_folio(folio, offset); if (folio_test_partial_kmap(folio)) { offset = offset_in_page(offset); len = min_t(size_t, len, PAGE_SIZE - offset); } else len = min(len, folio_size(folio) - offset); memcpy(to, from, len); kunmap_local(from); return len; } /** * folio_zero_segments() - Zero two byte ranges in a folio. * @folio: The folio to write to. * @start1: The first byte to zero. * @xend1: One more than the last byte in the first range. * @start2: The first byte to zero in the second range. * @xend2: One more than the last byte in the second range. */ static inline void folio_zero_segments(struct folio *folio, size_t start1, size_t xend1, size_t start2, size_t xend2) { zero_user_segments(&folio->page, start1, xend1, start2, xend2); } /** * folio_zero_segment() - Zero a byte range in a folio. * @folio: The folio to write to. * @start: The first byte to zero. * @xend: One more than the last byte to zero. */ static inline void folio_zero_segment(struct folio *folio, size_t start, size_t xend) { zero_user_segments(&folio->page, start, xend, 0, 0); } /** * folio_zero_range() - Zero a byte range in a folio. * @folio: The folio to write to. * @start: The first byte to zero. * @length: The number of bytes to zero. */ static inline void folio_zero_range(struct folio *folio, size_t start, size_t length) { zero_user_segments(&folio->page, start, start + length, 0, 0); } /** * folio_release_kmap - Unmap a folio and drop a refcount. * @folio: The folio to release. 
* @addr: The address previously returned by a call to kmap_local_folio(). * * It is common, eg in directory handling to kmap a folio. This function * unmaps the folio and drops the refcount that was being held to keep the * folio alive while we accessed it. */ static inline void folio_release_kmap(struct folio *folio, void *addr) { kunmap_local(addr); folio_put(folio); } #endif /* _LINUX_HIGHMEM_H */ |
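folio_release_kmap() pairs a kunmap_local() with the folio_put() that balances the reference taken when the folio was looked up. A minimal sketch of that pattern follows (not part of this header); read_mapping_folio() comes from <linux/pagemap.h> and my_read_dir_block is a hypothetical caller.

/*
 * Illustrative sketch: map a folio from the page cache, use it, then drop
 * both the mapping and the reference in one call.
 */
static int my_read_dir_block(struct address_space *mapping, pgoff_t index)
{
	struct folio *folio = read_mapping_folio(mapping, index, NULL);
	void *kaddr;

	if (IS_ERR(folio))
		return PTR_ERR(folio);

	kaddr = kmap_local_folio(folio, 0);
	/* ... parse directory entries at kaddr ... */

	/* kunmap_local() plus folio_put() for the lookup reference */
	folio_release_kmap(folio, kaddr);
	return 0;
}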
// SPDX-License-Identifier: GPL-2.0 /* * message.c - synchronous message handling * * Released under the GPLv2 only. */ #include <linux/acpi.h> #include <linux/pci.h> /* for scatterlist macros */ #include <linux/usb.h> #include <linux/module.h> #include <linux/of.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/timer.h> #include <linux/ctype.h> #include <linux/nls.h> #include <linux/device.h> #include <linux/scatterlist.h> #include <linux/usb/cdc.h> #include <linux/usb/quirks.h> #include <linux/usb/hcd.h> /* for usbcore internals */ #include <linux/usb/of.h> #include <asm/byteorder.h> #include "usb.h" static void cancel_async_set_config(struct usb_device *udev); struct api_context { struct completion done; int status; }; static void usb_api_blocking_completion(struct urb *urb) { struct api_context *ctx = urb->context; ctx->status = urb->status; complete(&ctx->done); } /* * Starts urb and waits for completion or timeout. * Whether or not the wait is killable depends on the flag passed in. * For example, compare usb_bulk_msg() and usb_bulk_msg_killable(). * * For non-killable waits, we enforce a maximum limit on the timeout value. */ static int usb_start_wait_urb(struct urb *urb, int timeout, int *actual_length, bool killable) { struct api_context ctx; unsigned long expire; int retval; long rc; init_completion(&ctx.done); urb->context = &ctx; urb->actual_length = 0; retval = usb_submit_urb(urb, GFP_NOIO); if (unlikely(retval)) goto out; if (!killable && (timeout <= 0 || timeout > USB_MAX_SYNCHRONOUS_TIMEOUT)) timeout = USB_MAX_SYNCHRONOUS_TIMEOUT; expire = (timeout > 0) ? msecs_to_jiffies(timeout) : MAX_SCHEDULE_TIMEOUT; if (killable) rc = wait_for_completion_killable_timeout(&ctx.done, expire); else rc = wait_for_completion_timeout(&ctx.done, expire); if (rc <= 0) { usb_kill_urb(urb); if (ctx.status != -ENOENT) retval = ctx.status; else if (rc == 0) retval = -ETIMEDOUT; else retval = rc; dev_dbg(&urb->dev->dev, "%s timed out or killed on ep%d%s len=%u/%u\n", current->comm, usb_endpoint_num(&urb->ep->desc), usb_urb_dir_in(urb) ?
"in" : "out", urb->actual_length, urb->transfer_buffer_length); } else retval = ctx.status; out: if (actual_length) *actual_length = urb->actual_length; usb_free_urb(urb); return retval; } /*-------------------------------------------------------------------*/ /* returns status (negative) or length (positive) */ static int usb_internal_control_msg(struct usb_device *usb_dev, unsigned int pipe, struct usb_ctrlrequest *cmd, void *data, int len, int timeout) { struct urb *urb; int retv; int length; urb = usb_alloc_urb(0, GFP_NOIO); if (!urb) return -ENOMEM; usb_fill_control_urb(urb, usb_dev, pipe, (unsigned char *)cmd, data, len, usb_api_blocking_completion, NULL); retv = usb_start_wait_urb(urb, timeout, &length, false); if (retv < 0) return retv; else return length; } /** * usb_control_msg - Builds a control urb, sends it off and waits for completion * @dev: pointer to the usb device to send the message to * @pipe: endpoint "pipe" to send the message to * @request: USB message request value * @requesttype: USB message request type value * @value: USB message value * @index: USB message index value * @data: pointer to the data to send * @size: length in bytes of the data to send * @timeout: time in msecs to wait for the message to complete before timing out * * Context: task context, might sleep. * * This function sends a simple control message to a specified endpoint and * waits for the message to complete, or timeout. * * Don't use this function from within an interrupt context. If you need * an asynchronous message, or need to send a message from within interrupt * context, use usb_submit_urb(). If a thread in your driver uses this call, * make sure your disconnect() method can wait for it to complete. Since you * don't have a handle on the URB used, you can't cancel the request. * * Return: If successful, the number of bytes transferred. Otherwise, a negative * error number. */ int usb_control_msg(struct usb_device *dev, unsigned int pipe, __u8 request, __u8 requesttype, __u16 value, __u16 index, void *data, __u16 size, int timeout) { struct usb_ctrlrequest *dr; int ret; dr = kmalloc_obj(struct usb_ctrlrequest, GFP_NOIO); if (!dr) return -ENOMEM; dr->bRequestType = requesttype; dr->bRequest = request; dr->wValue = cpu_to_le16(value); dr->wIndex = cpu_to_le16(index); dr->wLength = cpu_to_le16(size); ret = usb_internal_control_msg(dev, pipe, dr, data, size, timeout); /* Linger a bit, prior to the next control message. */ if (dev->quirks & USB_QUIRK_DELAY_CTRL_MSG) msleep(200); kfree(dr); return ret; } EXPORT_SYMBOL_GPL(usb_control_msg); /** * usb_control_msg_send - Builds a control "send" message, sends it off and waits for completion * @dev: pointer to the usb device to send the message to * @endpoint: endpoint to send the message to * @request: USB message request value * @requesttype: USB message request type value * @value: USB message value * @index: USB message index value * @driver_data: pointer to the data to send * @size: length in bytes of the data to send * @timeout: time in msecs to wait for the message to complete before timing out * @memflags: the flags for memory allocation for buffers * * Context: !in_interrupt () * * This function sends a control message to a specified endpoint that is not * expected to fill in a response (i.e. a "send message") and waits for the * message to complete, or timeout. * * Do not use this function from within an interrupt context. 
If you need * an asynchronous message, or need to send a message from within interrupt * context, use usb_submit_urb(). If a thread in your driver uses this call, * make sure your disconnect() method can wait for it to complete. Since you * don't have a handle on the URB used, you can't cancel the request. * * The data pointer can be made to a reference on the stack, or anywhere else, * as it will not be modified at all. This does not have the restriction that * usb_control_msg() has where the data pointer must be to dynamically allocated * memory (i.e. memory that can be successfully DMAed to a device). * * Return: If successful, 0 is returned, Otherwise, a negative error number. */ int usb_control_msg_send(struct usb_device *dev, __u8 endpoint, __u8 request, __u8 requesttype, __u16 value, __u16 index, const void *driver_data, __u16 size, int timeout, gfp_t memflags) { unsigned int pipe = usb_sndctrlpipe(dev, endpoint); int ret; u8 *data = NULL; if (size) { data = kmemdup(driver_data, size, memflags); if (!data) return -ENOMEM; } ret = usb_control_msg(dev, pipe, request, requesttype, value, index, data, size, timeout); kfree(data); if (ret < 0) return ret; return 0; } EXPORT_SYMBOL_GPL(usb_control_msg_send); /** * usb_control_msg_recv - Builds a control "receive" message, sends it off and waits for completion * @dev: pointer to the usb device to send the message to * @endpoint: endpoint to send the message to * @request: USB message request value * @requesttype: USB message request type value * @value: USB message value * @index: USB message index value * @driver_data: pointer to the data to be filled in by the message * @size: length in bytes of the data to be received * @timeout: time in msecs to wait for the message to complete before timing out * @memflags: the flags for memory allocation for buffers * * Context: !in_interrupt () * * This function sends a control message to a specified endpoint that is * expected to fill in a response (i.e. a "receive message") and waits for the * message to complete, or timeout. * * Do not use this function from within an interrupt context. If you need * an asynchronous message, or need to send a message from within interrupt * context, use usb_submit_urb(). If a thread in your driver uses this call, * make sure your disconnect() method can wait for it to complete. Since you * don't have a handle on the URB used, you can't cancel the request. * * The data pointer can be made to a reference on the stack, or anywhere else * that can be successfully written to. This function does not have the * restriction that usb_control_msg() has where the data pointer must be to * dynamically allocated memory (i.e. memory that can be successfully DMAed to a * device). * * The "whole" message must be properly received from the device in order for * this function to be successful. If a device returns less than the expected * amount of data, then the function will fail. Do not use this for messages * where a variable amount of data might be returned. * * Return: If successful, 0 is returned, Otherwise, a negative error number. 
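 *
 * A usage sketch (hypothetical driver code; the foo_ name and the vendor
 * request value 0x01 are invented, not part of any real protocol):
 *
 *	static int foo_read_status(struct usb_device *udev, u8 *status)
 *	{
 *		return usb_control_msg_recv(udev, 0, 0x01,
 *					    USB_DIR_IN | USB_TYPE_VENDOR |
 *					    USB_RECIP_DEVICE,
 *					    0, 0, status, sizeof(*status),
 *					    USB_CTRL_GET_TIMEOUT, GFP_KERNEL);
 *	}
 *
 * Because the transfer goes through an internal bounce buffer, the status
 * argument may live on the caller's stack.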
*/ int usb_control_msg_recv(struct usb_device *dev, __u8 endpoint, __u8 request, __u8 requesttype, __u16 value, __u16 index, void *driver_data, __u16 size, int timeout, gfp_t memflags) { unsigned int pipe = usb_rcvctrlpipe(dev, endpoint); int ret; u8 *data; if (!size || !driver_data) return -EINVAL; data = kmalloc(size, memflags); if (!data) return -ENOMEM; ret = usb_control_msg(dev, pipe, request, requesttype, value, index, data, size, timeout); if (ret < 0) goto exit; if (ret == size) { memcpy(driver_data, data, size); ret = 0; } else { ret = -EREMOTEIO; } exit: kfree(data); return ret; } EXPORT_SYMBOL_GPL(usb_control_msg_recv); /** * usb_interrupt_msg - Builds an interrupt urb, sends it off and waits for completion * @usb_dev: pointer to the usb device to send the message to * @pipe: endpoint "pipe" to send the message to * @data: pointer to the data to send * @len: length in bytes of the data to send * @actual_length: pointer to a location to put the actual length transferred * in bytes * @timeout: time in msecs to wait for the message to complete before timing out * * Context: task context, might sleep. * * This function sends a simple interrupt message to a specified endpoint and * waits for the message to complete, or timeout. * * Don't use this function from within an interrupt context. If you need * an asynchronous message, or need to send a message from within interrupt * context, use usb_submit_urb() If a thread in your driver uses this call, * make sure your disconnect() method can wait for it to complete. Since you * don't have a handle on the URB used, you can't cancel the request. * * Return: * If successful, 0. Otherwise a negative error number. The number of actual * bytes transferred will be stored in the @actual_length parameter. */ int usb_interrupt_msg(struct usb_device *usb_dev, unsigned int pipe, void *data, int len, int *actual_length, int timeout) { return usb_bulk_msg(usb_dev, pipe, data, len, actual_length, timeout); } EXPORT_SYMBOL_GPL(usb_interrupt_msg); /** * usb_bulk_msg - Builds a bulk urb, sends it off and waits for completion * @usb_dev: pointer to the usb device to send the message to * @pipe: endpoint "pipe" to send the message to * @data: pointer to the data to send * @len: length in bytes of the data to send * @actual_length: pointer to a location to put the actual length transferred * in bytes * @timeout: time in msecs to wait for the message to complete before timing out * * Context: task context, might sleep. * * This function sends a simple bulk message to a specified endpoint * and waits for the message to complete, or timeout. * * Don't use this function from within an interrupt context. If you need * an asynchronous message, or need to send a message from within interrupt * context, use usb_submit_urb() If a thread in your driver uses this call, * make sure your disconnect() method can wait for it to complete. Since you * don't have a handle on the URB used, you can't cancel the request. * * Because there is no usb_interrupt_msg() and no USBDEVFS_INTERRUPT ioctl, * users are forced to abuse this routine by using it to submit URBs for * interrupt endpoints. We will take the liberty of creating an interrupt URB * (with the default interval) if the target is an interrupt endpoint. * * Return: * If successful, 0. Otherwise a negative error number. The number of actual * bytes transferred will be stored in the @actual_length parameter. 
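 *
 * A usage sketch (hypothetical driver code; endpoint 2 and the foo_ name
 * are invented, and the buffer must be DMA-able, e.g. from kmalloc()):
 *
 *	static int foo_send(struct usb_device *udev, void *buf, int len)
 *	{
 *		int actual = 0;
 *		int ret;
 *
 *		ret = usb_bulk_msg(udev, usb_sndbulkpipe(udev, 2),
 *				   buf, len, &actual, 5000);
 *		if (ret)
 *			return ret;
 *		return actual == len ? 0 : -EIO;
 *	}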
* */ int usb_bulk_msg(struct usb_device *usb_dev, unsigned int pipe, void *data, int len, int *actual_length, int timeout) { struct urb *urb; struct usb_host_endpoint *ep; ep = usb_pipe_endpoint(usb_dev, pipe); if (!ep || len < 0) return -EINVAL; urb = usb_alloc_urb(0, GFP_KERNEL); if (!urb) return -ENOMEM; if ((ep->desc.bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_INT) { pipe = (pipe & ~(3 << 30)) | (PIPE_INTERRUPT << 30); usb_fill_int_urb(urb, usb_dev, pipe, data, len, usb_api_blocking_completion, NULL, ep->desc.bInterval); } else usb_fill_bulk_urb(urb, usb_dev, pipe, data, len, usb_api_blocking_completion, NULL); return usb_start_wait_urb(urb, timeout, actual_length, false); } EXPORT_SYMBOL_GPL(usb_bulk_msg); /** * usb_bulk_msg_killable - Builds a bulk urb, sends it off and waits for completion in a killable state * @usb_dev: pointer to the usb device to send the message to * @pipe: endpoint "pipe" to send the message to * @data: pointer to the data to send * @len: length in bytes of the data to send * @actual_length: pointer to a location to put the actual length transferred * in bytes * @timeout: time in msecs to wait for the message to complete before * timing out (if <= 0, the wait is as long as possible) * * Context: task context, might sleep. * * This function is just like usb_bulk_msg(), except that it waits in a * killable state and there is no limit on the timeout length. * * Return: * If successful, 0. Otherwise a negative error number. The number of actual * bytes transferred will be stored in the @actual_length parameter. * */ int usb_bulk_msg_killable(struct usb_device *usb_dev, unsigned int pipe, void *data, int len, int *actual_length, int timeout) { struct urb *urb; struct usb_host_endpoint *ep; ep = usb_pipe_endpoint(usb_dev, pipe); if (!ep || len < 0) return -EINVAL; urb = usb_alloc_urb(0, GFP_KERNEL); if (!urb) return -ENOMEM; if ((ep->desc.bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_INT) { pipe = (pipe & ~(3 << 30)) | (PIPE_INTERRUPT << 30); usb_fill_int_urb(urb, usb_dev, pipe, data, len, usb_api_blocking_completion, NULL, ep->desc.bInterval); } else usb_fill_bulk_urb(urb, usb_dev, pipe, data, len, usb_api_blocking_completion, NULL); return usb_start_wait_urb(urb, timeout, actual_length, true); } EXPORT_SYMBOL_GPL(usb_bulk_msg_killable); /*-------------------------------------------------------------------*/ static void sg_clean(struct usb_sg_request *io) { if (io->urbs) { while (io->entries--) usb_free_urb(io->urbs[io->entries]); kfree(io->urbs); io->urbs = NULL; } io->dev = NULL; } static void sg_complete(struct urb *urb) { unsigned long flags; struct usb_sg_request *io = urb->context; int status = urb->status; spin_lock_irqsave(&io->lock, flags); /* In 2.5 we require hcds' endpoint queues not to progress after fault * reports, until the completion callback (this!) returns. That lets * device driver code (like this routine) unlink queued urbs first, * if it needs to, since the HC won't work on them at all. So it's * not possible for page N+1 to overwrite page N, and so on. * * That's only for "hard" faults; "soft" faults (unlinks) sometimes * complete before the HCD can get requests away from hardware, * though never during cleanup after a hard fault. */ if (io->status && (io->status != -ECONNRESET || status != -ECONNRESET) && urb->actual_length) { dev_err(io->dev->bus->controller, "dev %s ep%d%s scatterlist error %d/%d\n", io->dev->devpath, usb_endpoint_num(&urb->ep->desc), usb_urb_dir_in(urb) ?
"in" : "out", status, io->status); /* BUG (); */ } if (io->status == 0 && status && status != -ECONNRESET) { int i, found, retval; io->status = status; /* the previous urbs, and this one, completed already. * unlink pending urbs so they won't rx/tx bad data. * careful: unlink can sometimes be synchronous... */ spin_unlock_irqrestore(&io->lock, flags); for (i = 0, found = 0; i < io->entries; i++) { if (!io->urbs[i]) continue; if (found) { usb_block_urb(io->urbs[i]); retval = usb_unlink_urb(io->urbs[i]); if (retval != -EINPROGRESS && retval != -ENODEV && retval != -EBUSY && retval != -EIDRM) dev_err(&io->dev->dev, "%s, unlink --> %d\n", __func__, retval); } else if (urb == io->urbs[i]) found = 1; } spin_lock_irqsave(&io->lock, flags); } /* on the last completion, signal usb_sg_wait() */ io->bytes += urb->actual_length; io->count--; if (!io->count) complete(&io->complete); spin_unlock_irqrestore(&io->lock, flags); } /** * usb_sg_init - initializes scatterlist-based bulk/interrupt I/O request * @io: request block being initialized. until usb_sg_wait() returns, * treat this as a pointer to an opaque block of memory, * @dev: the usb device that will send or receive the data * @pipe: endpoint "pipe" used to transfer the data * @period: polling rate for interrupt endpoints, in frames or * (for high speed endpoints) microframes; ignored for bulk * @sg: scatterlist entries * @nents: how many entries in the scatterlist * @length: how many bytes to send from the scatterlist, or zero to * send every byte identified in the list. * @mem_flags: SLAB_* flags affecting memory allocations in this call * * This initializes a scatter/gather request, allocating resources such as * I/O mappings and urb memory (except maybe memory used by USB controller * drivers). * * The request must be issued using usb_sg_wait(), which waits for the I/O to * complete (or to be canceled) and then cleans up all resources allocated by * usb_sg_init(). * * The request may be canceled with usb_sg_cancel(), either before or after * usb_sg_wait() is called. * * Return: Zero for success, else a negative errno value. */ int usb_sg_init(struct usb_sg_request *io, struct usb_device *dev, unsigned pipe, unsigned period, struct scatterlist *sg, int nents, size_t length, gfp_t mem_flags) { int i; int urb_flags; int use_sg; if (!io || !dev || !sg || usb_pipecontrol(pipe) || usb_pipeisoc(pipe) || nents <= 0) return -EINVAL; spin_lock_init(&io->lock); io->dev = dev; io->pipe = pipe; if (dev->bus->sg_tablesize > 0) { use_sg = true; io->entries = 1; } else { use_sg = false; io->entries = nents; } /* initialize all the urbs we'll use */ io->urbs = kmalloc_objs(*io->urbs, io->entries, mem_flags); if (!io->urbs) goto nomem; urb_flags = URB_NO_INTERRUPT; if (usb_pipein(pipe)) urb_flags |= URB_SHORT_NOT_OK; for_each_sg(sg, sg, io->entries, i) { struct urb *urb; unsigned len; urb = usb_alloc_urb(0, mem_flags); if (!urb) { io->entries = i; goto nomem; } io->urbs[i] = urb; urb->dev = NULL; urb->pipe = pipe; urb->interval = period; urb->transfer_flags = urb_flags; urb->complete = sg_complete; urb->context = io; urb->sg = sg; if (use_sg) { /* There is no single transfer buffer */ urb->transfer_buffer = NULL; urb->num_sgs = nents; /* A length of zero means transfer the whole sg list */ len = length; if (len == 0) { struct scatterlist *sg2; int j; for_each_sg(sg, sg2, nents, j) len += sg2->length; } } else { /* * Some systems can't use DMA; they use PIO instead. * For their sakes, transfer_buffer is set whenever * possible. 
*/ if (!PageHighMem(sg_page(sg))) urb->transfer_buffer = sg_virt(sg); else urb->transfer_buffer = NULL; len = sg->length; if (length) { len = min_t(size_t, len, length); length -= len; if (length == 0) io->entries = i + 1; } } urb->transfer_buffer_length = len; } io->urbs[--i]->transfer_flags &= ~URB_NO_INTERRUPT; /* transaction state */ io->count = io->entries; io->status = 0; io->bytes = 0; init_completion(&io->complete); return 0; nomem: sg_clean(io); return -ENOMEM; } EXPORT_SYMBOL_GPL(usb_sg_init); /** * usb_sg_wait - synchronously execute scatter/gather request * @io: request block handle, as initialized with usb_sg_init(). * some fields become accessible when this call returns. * * Context: task context, might sleep. * * This function blocks until the specified I/O operation completes. It * leverages the grouping of the related I/O requests to get good transfer * rates, by queueing the requests. At higher speeds, such queuing can * significantly improve USB throughput. * * There are three kinds of completion for this function. * * (1) success, where io->status is zero. The number of io->bytes * transferred is as requested. * (2) error, where io->status is a negative errno value. The number * of io->bytes transferred before the error is usually less * than requested, and can be nonzero. * (3) cancellation, a type of error with status -ECONNRESET that * is initiated by usb_sg_cancel(). * * When this function returns, all memory allocated through usb_sg_init() or * this call will have been freed. The request block parameter may still be * passed to usb_sg_cancel(), or it may be freed. It could also be * reinitialized and then reused. * * Data Transfer Rates: * * Bulk transfers are valid for full or high speed endpoints. * The best full speed data rate is 19 packets of 64 bytes each * per frame, or 1216 bytes per millisecond. * The best high speed data rate is 13 packets of 512 bytes each * per microframe, or 52 KBytes per millisecond. * * The reason to use interrupt transfers through this API would most likely * be to reserve high speed bandwidth, where up to 24 KBytes per millisecond * could be transferred. That capability is less useful for low or full * speed interrupt endpoints, which allow at most one packet per millisecond, * of at most 8 or 64 bytes (respectively). * * It is not necessary to call this function to reserve bandwidth for devices * under an xHCI host controller, as the bandwidth is reserved when the * configuration or interface alt setting is selected. */ void usb_sg_wait(struct usb_sg_request *io) { int i; int entries = io->entries; /* queue the urbs. */ spin_lock_irq(&io->lock); i = 0; while (i < entries && !io->status) { int retval; io->urbs[i]->dev = io->dev; spin_unlock_irq(&io->lock); retval = usb_submit_urb(io->urbs[i], GFP_NOIO); switch (retval) { /* maybe we retrying will recover */ case -ENXIO: /* hc didn't queue this one */ case -EAGAIN: case -ENOMEM: retval = 0; yield(); break; /* no error? continue immediately. * * NOTE: to work better with UHCI (4K I/O buffer may * need 3K of TDs) it may be good to limit how many * URBs are queued at once; N milliseconds? 
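 *
 * For reference, a caller-side sketch of this API (hypothetical driver
 * code; udev, sgl and nents are assumed to be set up elsewhere, and
 * endpoint 1 is invented). usb_sg_wait() blocks and frees the request's
 * resources on return; io.status and io.bytes then hold the outcome:
 *
 *	struct usb_sg_request io;
 *	int ret;
 *
 *	ret = usb_sg_init(&io, udev, usb_rcvbulkpipe(udev, 1), 0,
 *			  sgl, nents, 0, GFP_KERNEL);
 *	if (ret)
 *		return ret;
 *	usb_sg_wait(&io);
 *	if (io.status)
 *		return io.status;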
*/ case 0: ++i; cpu_relax(); break; /* fail any uncompleted urbs */ default: io->urbs[i]->status = retval; dev_dbg(&io->dev->dev, "%s, submit --> %d\n", __func__, retval); usb_sg_cancel(io); } spin_lock_irq(&io->lock); if (retval && (io->status == 0 || io->status == -ECONNRESET)) io->status = retval; } io->count -= entries - i; if (io->count == 0) complete(&io->complete); spin_unlock_irq(&io->lock); /* OK, yes, this could be packaged as non-blocking. * So could the submit loop above ... but it's easier to * solve neither problem than to solve both! */ wait_for_completion(&io->complete); sg_clean(io); } EXPORT_SYMBOL_GPL(usb_sg_wait); /** * usb_sg_cancel - stop scatter/gather i/o issued by usb_sg_wait() * @io: request block, initialized with usb_sg_init() * * This stops a request after it has been started by usb_sg_wait(). * It can also prevents one initialized by usb_sg_init() from starting, * so that call just frees resources allocated to the request. */ void usb_sg_cancel(struct usb_sg_request *io) { unsigned long flags; int i, retval; spin_lock_irqsave(&io->lock, flags); if (io->status || io->count == 0) { spin_unlock_irqrestore(&io->lock, flags); return; } /* shut everything down */ io->status = -ECONNRESET; io->count++; /* Keep the request alive until we're done */ spin_unlock_irqrestore(&io->lock, flags); for (i = io->entries - 1; i >= 0; --i) { usb_block_urb(io->urbs[i]); retval = usb_unlink_urb(io->urbs[i]); if (retval != -EINPROGRESS && retval != -ENODEV && retval != -EBUSY && retval != -EIDRM) dev_warn(&io->dev->dev, "%s, unlink --> %d\n", __func__, retval); } spin_lock_irqsave(&io->lock, flags); io->count--; if (!io->count) complete(&io->complete); spin_unlock_irqrestore(&io->lock, flags); } EXPORT_SYMBOL_GPL(usb_sg_cancel); /*-------------------------------------------------------------------*/ /** * usb_get_descriptor - issues a generic GET_DESCRIPTOR request * @dev: the device whose descriptor is being retrieved * @type: the descriptor type (USB_DT_*) * @index: the number of the descriptor * @buf: where to put the descriptor * @size: how big is "buf"? * * Context: task context, might sleep. * * Gets a USB descriptor. Convenience functions exist to simplify * getting some types of descriptors. Use * usb_get_string() or usb_string() for USB_DT_STRING. * Device (USB_DT_DEVICE) and configuration descriptors (USB_DT_CONFIG) * are part of the device structure. * In addition to a number of USB-standard descriptors, some * devices also use class-specific or vendor-specific descriptors. * * This call is synchronous, and may not be used in an interrupt context. * * Return: The number of bytes received on success, or else the status code * returned by the underlying usb_control_msg() call. 
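 *
 * A usage sketch (illustrative; assumes a struct usb_device *udev from
 * the caller and reads only the 9-byte configuration descriptor header):
 *
 *	struct usb_config_descriptor *cfg;
 *	int ret;
 *
 *	cfg = kmalloc(sizeof(*cfg), GFP_KERNEL);
 *	if (!cfg)
 *		return -ENOMEM;
 *	ret = usb_get_descriptor(udev, USB_DT_CONFIG, 0, cfg, sizeof(*cfg));
 *	if (ret >= (int)sizeof(*cfg))
 *		dev_dbg(&udev->dev, "config 0 wTotalLength=%u\n",
 *			le16_to_cpu(cfg->wTotalLength));
 *	kfree(cfg);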
*/ int usb_get_descriptor(struct usb_device *dev, unsigned char type, unsigned char index, void *buf, int size) { int i; int result; if (size <= 0) /* No point in asking for no data */ return -EINVAL; memset(buf, 0, size); /* Make sure we parse really received data */ for (i = 0; i < 3; ++i) { /* retry on length 0 or error; some devices are flakey */ result = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), USB_REQ_GET_DESCRIPTOR, USB_DIR_IN, (type << 8) + index, 0, buf, size, USB_CTRL_GET_TIMEOUT); if (result <= 0 && result != -ETIMEDOUT) continue; if (result > 1 && ((u8 *)buf)[1] != type) { result = -ENODATA; continue; } break; } return result; } EXPORT_SYMBOL_GPL(usb_get_descriptor); /** * usb_get_string - gets a string descriptor * @dev: the device whose string descriptor is being retrieved * @langid: code for language chosen (from string descriptor zero) * @index: the number of the descriptor * @buf: where to put the string * @size: how big is "buf"? * * Context: task context, might sleep. * * Retrieves a string, encoded using UTF-16LE (Unicode, 16 bits per character, * in little-endian byte order). * The usb_string() function will often be a convenient way to turn * these strings into kernel-printable form. * * Strings may be referenced in device, configuration, interface, or other * descriptors, and could also be used in vendor-specific ways. * * This call is synchronous, and may not be used in an interrupt context. * * Return: The number of bytes received on success, or else the status code * returned by the underlying usb_control_msg() call. */ static int usb_get_string(struct usb_device *dev, unsigned short langid, unsigned char index, void *buf, int size) { int i; int result; if (size <= 0) /* No point in asking for no data */ return -EINVAL; for (i = 0; i < 3; ++i) { /* retry on length 0 or stall; some devices are flakey */ result = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), USB_REQ_GET_DESCRIPTOR, USB_DIR_IN, (USB_DT_STRING << 8) + index, langid, buf, size, USB_CTRL_GET_TIMEOUT); if (result == 0 || result == -EPIPE) continue; if (result > 1 && ((u8 *) buf)[1] != USB_DT_STRING) { result = -ENODATA; continue; } break; } return result; } static void usb_try_string_workarounds(unsigned char *buf, int *length) { int newlength, oldlength = *length; for (newlength = 2; newlength + 1 < oldlength; newlength += 2) if (!isprint(buf[newlength]) || buf[newlength + 1]) break; if (newlength > 2) { buf[0] = newlength; *length = newlength; } } static int usb_string_sub(struct usb_device *dev, unsigned int langid, unsigned int index, unsigned char *buf) { int rc; /* Try to read the string descriptor by asking for the maximum * possible number of bytes */ if (dev->quirks & USB_QUIRK_STRING_FETCH_255) rc = -EIO; else rc = usb_get_string(dev, langid, index, buf, 255); /* If that failed try to read the descriptor length, then * ask for just that many bytes */ if (rc < 2) { rc = usb_get_string(dev, langid, index, buf, 2); if (rc == 2) rc = usb_get_string(dev, langid, index, buf, buf[0]); } if (rc >= 2) { if (!buf[0] && !buf[1]) usb_try_string_workarounds(buf, &rc); /* There might be extra junk at the end of the descriptor */ if (buf[0] < rc) rc = buf[0]; rc = rc - (rc & 1); /* force a multiple of two */ } if (rc < 2) rc = (rc < 0 ? 
rc : -EINVAL); return rc; } static int usb_get_langid(struct usb_device *dev, unsigned char *tbuf) { int err; if (dev->have_langid) return 0; if (dev->string_langid < 0) return -EPIPE; err = usb_string_sub(dev, 0, 0, tbuf); /* If the string was reported but is malformed, default to english * (0x0409) */ if (err == -ENODATA || (err > 0 && err < 4)) { dev->string_langid = 0x0409; dev->have_langid = 1; dev_err(&dev->dev, "language id specifier not provided by device, defaulting to English\n"); return 0; } /* In case of all other errors, we assume the device is not able to * deal with strings at all. Set string_langid to -1 in order to * prevent any string to be retrieved from the device */ if (err < 0) { dev_info(&dev->dev, "string descriptor 0 read error: %d\n", err); dev->string_langid = -1; return -EPIPE; } /* always use the first langid listed */ dev->string_langid = tbuf[2] | (tbuf[3] << 8); dev->have_langid = 1; dev_dbg(&dev->dev, "default language 0x%04x\n", dev->string_langid); return 0; } /** * usb_string - returns UTF-8 version of a string descriptor * @dev: the device whose string descriptor is being retrieved * @index: the number of the descriptor * @buf: where to put the string * @size: how big is "buf"? * * Context: task context, might sleep. * * This converts the UTF-16LE encoded strings returned by devices, from * usb_get_string_descriptor(), to null-terminated UTF-8 encoded ones * that are more usable in most kernel contexts. Note that this function * chooses strings in the first language supported by the device. * * This call is synchronous, and may not be used in an interrupt context. * * Return: length of the string (>= 0) or usb_control_msg status (< 0). */ int usb_string(struct usb_device *dev, int index, char *buf, size_t size) { unsigned char *tbuf; int err; if (dev->state == USB_STATE_SUSPENDED) return -EHOSTUNREACH; if (size <= 0 || !buf) return -EINVAL; buf[0] = 0; if (index <= 0 || index >= 256) return -EINVAL; tbuf = kmalloc(256, GFP_NOIO); if (!tbuf) return -ENOMEM; err = usb_get_langid(dev, tbuf); if (err < 0) goto errout; err = usb_string_sub(dev, dev->string_langid, index, tbuf); if (err < 0) goto errout; size--; /* leave room for trailing NULL char in output buffer */ err = utf16s_to_utf8s((wchar_t *) &tbuf[2], (err - 2) / 2, UTF16_LITTLE_ENDIAN, buf, size); buf[err] = 0; if (tbuf[1] != USB_DT_STRING) dev_dbg(&dev->dev, "wrong descriptor type %02x for string %d (\"%s\")\n", tbuf[1], index, buf); errout: kfree(tbuf); return err; } EXPORT_SYMBOL_GPL(usb_string); /* one 16-bit character, when UTF-8-encoded, has at most three bytes */ #define MAX_USB_STRING_SIZE (127 * 3 + 1) /** * usb_cache_string - read a string descriptor and cache it for later use * @udev: the device whose string descriptor is being read * @index: the descriptor index * * Return: A pointer to a kmalloc'ed buffer containing the descriptor string, * or %NULL if the index is 0 or the string could not be read. 
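 *
 * For example, usbcore itself uses this to keep descriptor strings such
 * as the serial number around for the lifetime of the device:
 *
 *	udev->serial = usb_cache_string(udev, udev->descriptor.iSerialNumber);
 *
 * The returned buffer is kfree()d when it is no longer needed.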
*/ char *usb_cache_string(struct usb_device *udev, int index) { char *buf; char *smallbuf = NULL; int len; if (index <= 0) return NULL; buf = kmalloc(MAX_USB_STRING_SIZE, GFP_NOIO); if (!buf) return NULL; len = usb_string(udev, index, buf, MAX_USB_STRING_SIZE); if (len <= 0) { kfree(buf); return NULL; } smallbuf = krealloc(buf, len + 1, GFP_NOIO); if (unlikely(!smallbuf)) return buf; return smallbuf; } EXPORT_SYMBOL_GPL(usb_cache_string); /* * usb_get_device_descriptor - read the device descriptor * @udev: the device whose device descriptor should be read * * Context: task context, might sleep. * * Not exported, only for use by the core. If drivers really want to read * the device descriptor directly, they can call usb_get_descriptor() with * type = USB_DT_DEVICE and index = 0. * * Returns: a pointer to a dynamically allocated usb_device_descriptor * structure (which the caller must deallocate), or an ERR_PTR value. */ struct usb_device_descriptor *usb_get_device_descriptor(struct usb_device *udev) { struct usb_device_descriptor *desc; int ret; desc = kmalloc_obj(*desc, GFP_NOIO); if (!desc) return ERR_PTR(-ENOMEM); ret = usb_get_descriptor(udev, USB_DT_DEVICE, 0, desc, sizeof(*desc)); if (ret == sizeof(*desc)) return desc; if (ret >= 0) ret = -EMSGSIZE; kfree(desc); return ERR_PTR(ret); } /* * usb_set_isoch_delay - informs the device of the packet transmit delay * @dev: the device whose delay is to be informed * Context: task context, might sleep * * Since this is an optional request, we don't bother if it fails. */ int usb_set_isoch_delay(struct usb_device *dev) { /* skip hub devices */ if (dev->descriptor.bDeviceClass == USB_CLASS_HUB) return 0; /* skip non-SS/non-SSP devices */ if (dev->speed < USB_SPEED_SUPER) return 0; return usb_control_msg_send(dev, 0, USB_REQ_SET_ISOCH_DELAY, USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, dev->hub_delay, 0, NULL, 0, USB_CTRL_SET_TIMEOUT, GFP_NOIO); } /** * usb_get_status - issues a GET_STATUS call * @dev: the device whose status is being checked * @recip: USB_RECIP_*; for device, interface, or endpoint * @type: USB_STATUS_TYPE_*; for standard or PTM status types * @target: zero (for device), else interface or endpoint number * @data: pointer to two bytes of bitmap data * * Context: task context, might sleep. * * Returns device, interface, or endpoint status. Normally only of * interest to see if the device is self powered, or has enabled the * remote wakeup facility; or whether a bulk or interrupt endpoint * is halted ("stalled"). * * Bits in these status bitmaps are set using the SET_FEATURE request, * and cleared using the CLEAR_FEATURE request. The usb_clear_halt() * function should be used to clear halt ("stall") status. * * This call is synchronous, and may not be used in an interrupt context. * * Returns 0 and the status value in *@data (in host byte order) on success, * or else the status code from the underlying usb_control_msg() call. 
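 *
 * A usage sketch (illustrative): checking whether the device reports
 * itself as self-powered:
 *
 *	u16 devstat;
 *	int ret;
 *
 *	ret = usb_get_status(udev, USB_RECIP_DEVICE, USB_STATUS_TYPE_STANDARD,
 *			     0, &devstat);
 *	if (ret == 0 && (devstat & (1 << USB_DEVICE_SELF_POWERED)))
 *		dev_dbg(&udev->dev, "device is self-powered\n");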
*/ int usb_get_status(struct usb_device *dev, int recip, int type, int target, void *data) { int ret; void *status; int length; switch (type) { case USB_STATUS_TYPE_STANDARD: length = 2; break; case USB_STATUS_TYPE_PTM: if (recip != USB_RECIP_DEVICE) return -EINVAL; length = 4; break; default: return -EINVAL; } status = kmalloc(length, GFP_KERNEL); if (!status) return -ENOMEM; ret = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), USB_REQ_GET_STATUS, USB_DIR_IN | recip, USB_STATUS_TYPE_STANDARD, target, status, length, USB_CTRL_GET_TIMEOUT); switch (ret) { case 4: if (type != USB_STATUS_TYPE_PTM) { ret = -EIO; break; } *(u32 *) data = le32_to_cpu(*(__le32 *) status); ret = 0; break; case 2: if (type != USB_STATUS_TYPE_STANDARD) { ret = -EIO; break; } *(u16 *) data = le16_to_cpu(*(__le16 *) status); ret = 0; break; default: ret = -EIO; } kfree(status); return ret; } EXPORT_SYMBOL_GPL(usb_get_status); /** * usb_clear_halt - tells device to clear endpoint halt/stall condition * @dev: device whose endpoint is halted * @pipe: endpoint "pipe" being cleared * * Context: task context, might sleep. * * This is used to clear halt conditions for bulk and interrupt endpoints, * as reported by URB completion status. Endpoints that are halted are * sometimes referred to as being "stalled". Such endpoints are unable * to transmit or receive data until the halt status is cleared. Any URBs * queued for such an endpoint should normally be unlinked by the driver * before clearing the halt condition, as described in sections 5.7.5 * and 5.8.5 of the USB 2.0 spec. * * Note that control and isochronous endpoints don't halt, although control * endpoints report "protocol stall" (for unsupported requests) using the * same status code used to report a true stall. * * This call is synchronous, and may not be used in an interrupt context. * If a thread in your driver uses this call, make sure your disconnect() * method can wait for it to complete. * * Return: Zero on success, or else the status code returned by the * underlying usb_control_msg() call. */ int usb_clear_halt(struct usb_device *dev, int pipe) { int result; int endp = usb_pipeendpoint(pipe); if (usb_pipein(pipe)) endp |= USB_DIR_IN; /* we don't care if it wasn't halted first. in fact some devices * (like some ibmcam model 1 units) seem to expect hosts to make * this request for iso endpoints, which can't halt! */ result = usb_control_msg_send(dev, 0, USB_REQ_CLEAR_FEATURE, USB_RECIP_ENDPOINT, USB_ENDPOINT_HALT, endp, NULL, 0, USB_CTRL_SET_TIMEOUT, GFP_NOIO); /* don't un-halt or force to DATA0 except on success */ if (result) return result; /* NOTE: seems like Microsoft and Apple don't bother verifying * the clear "took", so some devices could lock up if you check... * such as the Hagiwara FlashGate DUAL. So we won't bother. * * NOTE: make sure the logic here doesn't diverge much from * the copy in usb-storage, for as long as we need two copies. 
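 *
 * For reference, the usual driver-side pattern (illustrative; pipe, buf
 * and len come from the caller): when a synchronous transfer fails with
 * -EPIPE, clear the stall from process context before retrying:
 *
 *	ret = usb_bulk_msg(udev, pipe, buf, len, &actual, 5000);
 *	if (ret == -EPIPE)
 *		ret = usb_clear_halt(udev, pipe);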
*/ usb_reset_endpoint(dev, endp); return 0; } EXPORT_SYMBOL_GPL(usb_clear_halt); static int create_intf_ep_devs(struct usb_interface *intf) { struct usb_device *udev = interface_to_usbdev(intf); struct usb_host_interface *alt = intf->cur_altsetting; int i; if (intf->ep_devs_created || intf->unregistering) return 0; for (i = 0; i < alt->desc.bNumEndpoints; ++i) (void) usb_create_ep_devs(&intf->dev, &alt->endpoint[i], udev); intf->ep_devs_created = 1; return 0; } static void remove_intf_ep_devs(struct usb_interface *intf) { struct usb_host_interface *alt = intf->cur_altsetting; int i; if (!intf->ep_devs_created) return; for (i = 0; i < alt->desc.bNumEndpoints; ++i) usb_remove_ep_devs(&alt->endpoint[i]); intf->ep_devs_created = 0; } /** * usb_disable_endpoint -- Disable an endpoint by address * @dev: the device whose endpoint is being disabled * @epaddr: the endpoint's address. Endpoint number for output, * endpoint number + USB_DIR_IN for input * @reset_hardware: flag to erase any endpoint state stored in the * controller hardware * * Disables the endpoint for URB submission and nukes all pending URBs. * If @reset_hardware is set then also deallocates hcd/hardware state * for the endpoint. */ void usb_disable_endpoint(struct usb_device *dev, unsigned int epaddr, bool reset_hardware) { unsigned int epnum = epaddr & USB_ENDPOINT_NUMBER_MASK; struct usb_host_endpoint *ep; if (!dev) return; if (usb_endpoint_out(epaddr)) { ep = dev->ep_out[epnum]; if (reset_hardware && epnum != 0) dev->ep_out[epnum] = NULL; } else { ep = dev->ep_in[epnum]; if (reset_hardware && epnum != 0) dev->ep_in[epnum] = NULL; } if (ep) { ep->enabled = 0; usb_hcd_flush_endpoint(dev, ep); if (reset_hardware) usb_hcd_disable_endpoint(dev, ep); } } /** * usb_reset_endpoint - Reset an endpoint's state. * @dev: the device whose endpoint is to be reset * @epaddr: the endpoint's address. Endpoint number for output, * endpoint number + USB_DIR_IN for input * * Resets any host-side endpoint state such as the toggle bit, * sequence number or current window. */ void usb_reset_endpoint(struct usb_device *dev, unsigned int epaddr) { unsigned int epnum = epaddr & USB_ENDPOINT_NUMBER_MASK; struct usb_host_endpoint *ep; if (usb_endpoint_out(epaddr)) ep = dev->ep_out[epnum]; else ep = dev->ep_in[epnum]; if (ep) usb_hcd_reset_endpoint(dev, ep); } EXPORT_SYMBOL_GPL(usb_reset_endpoint); /** * usb_disable_interface -- Disable all endpoints for an interface * @dev: the device whose interface is being disabled * @intf: pointer to the interface descriptor * @reset_hardware: flag to erase any endpoint state stored in the * controller hardware * * Disables all the endpoints for the interface's current altsetting. */ void usb_disable_interface(struct usb_device *dev, struct usb_interface *intf, bool reset_hardware) { struct usb_host_interface *alt = intf->cur_altsetting; int i; for (i = 0; i < alt->desc.bNumEndpoints; ++i) { usb_disable_endpoint(dev, alt->endpoint[i].desc.bEndpointAddress, reset_hardware); } } /* * usb_disable_device_endpoints -- Disable all endpoints for a device * @dev: the device whose endpoints are being disabled * @skip_ep0: 0 to disable endpoint 0, 1 to skip it. */ static void usb_disable_device_endpoints(struct usb_device *dev, int skip_ep0) { struct usb_hcd *hcd = bus_to_hcd(dev->bus); int i; if (hcd->driver->check_bandwidth) { /* First pass: Cancel URBs, leave endpoint pointers intact. 
*/ for (i = skip_ep0; i < 16; ++i) { usb_disable_endpoint(dev, i, false); usb_disable_endpoint(dev, i + USB_DIR_IN, false); } /* Remove endpoints from the host controller internal state */ mutex_lock(hcd->bandwidth_mutex); usb_hcd_alloc_bandwidth(dev, NULL, NULL, NULL); mutex_unlock(hcd->bandwidth_mutex); } /* Second pass: remove endpoint pointers */ for (i = skip_ep0; i < 16; ++i) { usb_disable_endpoint(dev, i, true); usb_disable_endpoint(dev, i + USB_DIR_IN, true); } } /** * usb_disable_device - Disable all the endpoints for a USB device * @dev: the device whose endpoints are being disabled * @skip_ep0: 0 to disable endpoint 0, 1 to skip it. * * Disables all the device's endpoints, potentially including endpoint 0. * Deallocates hcd/hardware state for the endpoints (nuking all or most * pending urbs) and usbcore state for the interfaces, so that usbcore * must usb_set_configuration() before any interfaces could be used. */ void usb_disable_device(struct usb_device *dev, int skip_ep0) { int i; /* getting rid of interfaces will disconnect * any drivers bound to them (a key side effect) */ if (dev->actconfig) { /* * FIXME: In order to avoid self-deadlock involving the * bandwidth_mutex, we have to mark all the interfaces * before unregistering any of them. */ for (i = 0; i < dev->actconfig->desc.bNumInterfaces; i++) dev->actconfig->interface[i]->unregistering = 1; for (i = 0; i < dev->actconfig->desc.bNumInterfaces; i++) { struct usb_interface *interface; /* remove this interface if it has been registered */ interface = dev->actconfig->interface[i]; if (!device_is_registered(&interface->dev)) continue; dev_dbg(&dev->dev, "unregistering interface %s\n", dev_name(&interface->dev)); remove_intf_ep_devs(interface); device_del(&interface->dev); } /* Now that the interfaces are unbound, nobody should * try to access them. */ for (i = 0; i < dev->actconfig->desc.bNumInterfaces; i++) { put_device(&dev->actconfig->interface[i]->dev); dev->actconfig->interface[i] = NULL; } usb_disable_usb2_hardware_lpm(dev); usb_unlocked_disable_lpm(dev); usb_disable_ltm(dev); dev->actconfig = NULL; if (dev->state == USB_STATE_CONFIGURED) usb_set_device_state(dev, USB_STATE_ADDRESS); } dev_dbg(&dev->dev, "%s nuking %s URBs\n", __func__, skip_ep0 ? "non-ep0" : "all"); usb_disable_device_endpoints(dev, skip_ep0); } /** * usb_enable_endpoint - Enable an endpoint for USB communications * @dev: the device whose interface is being enabled * @ep: the endpoint * @reset_ep: flag to reset the endpoint state * * Resets the endpoint state if asked, and sets dev->ep_{in,out} pointers. * For control endpoints, both the input and output sides are handled. */ void usb_enable_endpoint(struct usb_device *dev, struct usb_host_endpoint *ep, bool reset_ep) { int epnum = usb_endpoint_num(&ep->desc); int is_out = usb_endpoint_dir_out(&ep->desc); int is_control = usb_endpoint_xfer_control(&ep->desc); if (reset_ep) usb_hcd_reset_endpoint(dev, ep); if (is_out || is_control) dev->ep_out[epnum] = ep; if (!is_out || is_control) dev->ep_in[epnum] = ep; ep->enabled = 1; } /** * usb_enable_interface - Enable all the endpoints for an interface * @dev: the device whose interface is being enabled * @intf: pointer to the interface descriptor * @reset_eps: flag to reset the endpoints' state * * Enables all the endpoints for the interface's current altsetting. 
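 *
 * Driver-side sketch of the related flow (hypothetical foo_ driver;
 * altsetting 1 is invented): after usb_set_interface() below installs a
 * new altsetting, the endpoints to use are found in cur_altsetting:
 *
 *	struct usb_host_interface *alt;
 *	int i, ret;
 *
 *	ret = usb_set_interface(udev,
 *			intf->cur_altsetting->desc.bInterfaceNumber, 1);
 *	if (ret)
 *		return ret;
 *	alt = intf->cur_altsetting;
 *	for (i = 0; i < alt->desc.bNumEndpoints; i++)
 *		if (usb_endpoint_is_bulk_in(&alt->endpoint[i].desc))
 *			foo->bulk_in_ep = alt->endpoint[i].desc.bEndpointAddress;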
*/ void usb_enable_interface(struct usb_device *dev, struct usb_interface *intf, bool reset_eps) { struct usb_host_interface *alt = intf->cur_altsetting; int i; for (i = 0; i < alt->desc.bNumEndpoints; ++i) usb_enable_endpoint(dev, &alt->endpoint[i], reset_eps); } /** * usb_set_interface - Makes a particular alternate setting be current * @dev: the device whose interface is being updated * @interface: the interface being updated * @alternate: the setting being chosen. * * Context: task context, might sleep. * * This is used to enable data transfers on interfaces that may not * be enabled by default. Not all devices support such configurability. * Only the driver bound to an interface may change its setting. * * Within any given configuration, each interface may have several * alternative settings. These are often used to control levels of * bandwidth consumption. For example, the default setting for a high * speed interrupt endpoint may not send more than 64 bytes per microframe, * while interrupt transfers of up to 3KBytes per microframe are legal. * Also, isochronous endpoints may never be part of an * interface's default setting. To access such bandwidth, alternate * interface settings must be made current. * * Note that in the Linux USB subsystem, bandwidth associated with * an endpoint in a given alternate setting is not reserved until an URB * is submitted that needs that bandwidth. Some other operating systems * allocate bandwidth early, when a configuration is chosen. * * xHCI reserves bandwidth and configures the alternate setting in * usb_hcd_alloc_bandwidth(). If it fails the original interface altsetting * may be disabled. Drivers cannot rely on any particular alternate * setting being in effect after a failure. * * This call is synchronous, and may not be used in an interrupt context. * Also, drivers must not change altsettings while urbs are scheduled for * endpoints in that interface; all such urbs must first be completed * (perhaps forced by unlinking). If a thread in your driver uses this call, * make sure your disconnect() method can wait for it to complete. * * Return: Zero on success, or else the status code returned by the * underlying usb_control_msg() call. */ int usb_set_interface(struct usb_device *dev, int interface, int alternate) { struct usb_interface *iface; struct usb_host_interface *alt; struct usb_hcd *hcd = bus_to_hcd(dev->bus); int i, ret, manual = 0; unsigned int epaddr; unsigned int pipe; if (dev->state == USB_STATE_SUSPENDED) return -EHOSTUNREACH; iface = usb_ifnum_to_if(dev, interface); if (!iface) { dev_dbg(&dev->dev, "selecting invalid interface %d\n", interface); return -EINVAL; } if (iface->unregistering) return -ENODEV; alt = usb_altnum_to_altsetting(iface, alternate); if (!alt) { dev_warn(&dev->dev, "selecting invalid altsetting %d\n", alternate); return -EINVAL; } /* * usb3 hosts configure the interface in usb_hcd_alloc_bandwidth, * including freeing dropped endpoint ring buffers. * Make sure the interface endpoints are flushed before that */ usb_disable_interface(dev, iface, false); /* Make sure we have enough bandwidth for this alternate interface. * Remove the current alt setting and add the new alt setting. */ mutex_lock(hcd->bandwidth_mutex); /* Disable LPM, and re-enable it once the new alt setting is installed, * so that the xHCI driver can recalculate the U1/U2 timeouts. 
*/ if (usb_disable_lpm(dev)) { dev_err(&iface->dev, "%s Failed to disable LPM\n", __func__); mutex_unlock(hcd->bandwidth_mutex); return -ENOMEM; } /* Changing alt-setting also frees any allocated streams */ for (i = 0; i < iface->cur_altsetting->desc.bNumEndpoints; i++) iface->cur_altsetting->endpoint[i].streams = 0; ret = usb_hcd_alloc_bandwidth(dev, NULL, iface->cur_altsetting, alt); if (ret < 0) { dev_info(&dev->dev, "Not enough bandwidth for altsetting %d\n", alternate); usb_enable_lpm(dev); mutex_unlock(hcd->bandwidth_mutex); return ret; } if (dev->quirks & USB_QUIRK_NO_SET_INTF) ret = -EPIPE; else ret = usb_control_msg_send(dev, 0, USB_REQ_SET_INTERFACE, USB_RECIP_INTERFACE, alternate, interface, NULL, 0, 5000, GFP_NOIO); /* 9.4.10 says devices don't need this and are free to STALL the * request if the interface only has one alternate setting. */ if (ret == -EPIPE && iface->num_altsetting == 1) { dev_dbg(&dev->dev, "manual set_interface for iface %d, alt %d\n", interface, alternate); manual = 1; } else if (ret) { /* Re-instate the old alt setting */ usb_hcd_alloc_bandwidth(dev, NULL, alt, iface->cur_altsetting); usb_enable_lpm(dev); mutex_unlock(hcd->bandwidth_mutex); return ret; } mutex_unlock(hcd->bandwidth_mutex); /* FIXME drivers shouldn't need to replicate/bugfix the logic here * when they implement async or easily-killable versions of this or * other "should-be-internal" functions (like clear_halt). * should hcd+usbcore postprocess control requests? */ /* prevent submissions using previous endpoint settings */ if (iface->cur_altsetting != alt) { remove_intf_ep_devs(iface); usb_remove_sysfs_intf_files(iface); } usb_disable_interface(dev, iface, true); iface->cur_altsetting = alt; /* Now that the interface is installed, re-enable LPM. */ usb_unlocked_enable_lpm(dev); /* If the interface only has one altsetting and the device didn't * accept the request, we attempt to carry out the equivalent action * by manually clearing the HALT feature for each endpoint in the * new altsetting. */ if (manual) { for (i = 0; i < alt->desc.bNumEndpoints; i++) { epaddr = alt->endpoint[i].desc.bEndpointAddress; pipe = __create_pipe(dev, USB_ENDPOINT_NUMBER_MASK & epaddr) | (usb_endpoint_out(epaddr) ? USB_DIR_OUT : USB_DIR_IN); usb_clear_halt(dev, pipe); } } /* 9.1.1.5: reset toggles for all endpoints in the new altsetting * * Note: * Despite EP0 is always present in all interfaces/AS, the list of * endpoints from the descriptor does not contain EP0. Due to its * omnipresence one might expect EP0 being considered "affected" by * any SetInterface request and hence assume toggles need to be reset. * However, EP0 toggles are re-synced for every individual transfer * during the SETUP stage - hence EP0 toggles are "don't care" here. * (Likewise, EP0 never "halts" on well designed devices.) */ usb_enable_interface(dev, iface, true); if (device_is_registered(&iface->dev)) { usb_create_sysfs_intf_files(iface); create_intf_ep_devs(iface); } return 0; } EXPORT_SYMBOL_GPL(usb_set_interface); /** * usb_reset_configuration - lightweight device reset * @dev: the device whose configuration is being reset * * This issues a standard SET_CONFIGURATION request to the device using * the current configuration. The effect is to reset most USB-related * state in the device, including interface altsettings (reset to zero), * endpoint halts (cleared), and endpoint state (only for bulk and interrupt * endpoints). Other usbcore state is unchanged, including bindings of * usb device drivers to interfaces. 
* * Because this affects multiple interfaces, avoid using this with composite * (multi-interface) devices. Instead, the driver for each interface may * use usb_set_interface() on the interfaces it claims. Be careful though; * some devices don't support the SET_INTERFACE request, and others won't * reset all the interface state (notably endpoint state). Resetting the whole * configuration would affect other drivers' interfaces. * * The caller must own the device lock. * * Return: Zero on success, else a negative error code. * * If this routine fails the device will probably be in an unusable state * with endpoints disabled, and interfaces only partially enabled. */ int usb_reset_configuration(struct usb_device *dev) { int i, retval; struct usb_host_config *config; struct usb_hcd *hcd = bus_to_hcd(dev->bus); if (dev->state == USB_STATE_SUSPENDED) return -EHOSTUNREACH; /* caller must have locked the device and must own * the usb bus readlock (so driver bindings are stable); * calls during probe() are fine */ usb_disable_device_endpoints(dev, 1); /* skip ep0*/ config = dev->actconfig; retval = 0; mutex_lock(hcd->bandwidth_mutex); /* Disable LPM, and re-enable it once the configuration is reset, so * that the xHCI driver can recalculate the U1/U2 timeouts. */ if (usb_disable_lpm(dev)) { dev_err(&dev->dev, "%s Failed to disable LPM\n", __func__); mutex_unlock(hcd->bandwidth_mutex); return -ENOMEM; } /* xHCI adds all endpoints in usb_hcd_alloc_bandwidth */ retval = usb_hcd_alloc_bandwidth(dev, config, NULL, NULL); if (retval < 0) { usb_enable_lpm(dev); mutex_unlock(hcd->bandwidth_mutex); return retval; } retval = usb_control_msg_send(dev, 0, USB_REQ_SET_CONFIGURATION, 0, config->desc.bConfigurationValue, 0, NULL, 0, USB_CTRL_SET_TIMEOUT, GFP_NOIO); if (retval) { usb_hcd_alloc_bandwidth(dev, NULL, NULL, NULL); usb_enable_lpm(dev); mutex_unlock(hcd->bandwidth_mutex); return retval; } mutex_unlock(hcd->bandwidth_mutex); /* re-init hc/hcd interface/endpoint state */ for (i = 0; i < config->desc.bNumInterfaces; i++) { struct usb_interface *intf = config->interface[i]; struct usb_host_interface *alt; alt = usb_altnum_to_altsetting(intf, 0); /* No altsetting 0? We'll assume the first altsetting. * We could use a GetInterface call, but if a device is * so non-compliant that it doesn't have altsetting 0 * then I wouldn't trust its reply anyway. */ if (!alt) alt = &intf->altsetting[0]; if (alt != intf->cur_altsetting) { remove_intf_ep_devs(intf); usb_remove_sysfs_intf_files(intf); } intf->cur_altsetting = alt; usb_enable_interface(dev, intf, true); if (device_is_registered(&intf->dev)) { usb_create_sysfs_intf_files(intf); create_intf_ep_devs(intf); } } /* Now that the interfaces are installed, re-enable LPM. 
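 *
 * Caller-side sketch (illustrative): the device lock must already be
 * held, as it is inside probe() or after usb_lock_device_for_reset():
 *
 *	ret = usb_reset_configuration(udev);
 *	if (ret)
 *		dev_err(&udev->dev, "reset_configuration failed: %d\n", ret);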
*/ usb_unlocked_enable_lpm(dev); return 0; } EXPORT_SYMBOL_GPL(usb_reset_configuration); static void usb_release_interface(struct device *dev) { struct usb_interface *intf = to_usb_interface(dev); struct usb_interface_cache *intfc = altsetting_to_usb_interface_cache(intf->altsetting); kref_put(&intfc->ref, usb_release_interface_cache); usb_put_dev(interface_to_usbdev(intf)); of_node_put(dev->of_node); kfree(intf); } /* * usb_deauthorize_interface - deauthorize an USB interface * * @intf: USB interface structure */ void usb_deauthorize_interface(struct usb_interface *intf) { struct device *dev = &intf->dev; device_lock(dev->parent); if (intf->authorized) { device_lock(dev); intf->authorized = 0; device_unlock(dev); usb_forced_unbind_intf(intf); } device_unlock(dev->parent); } /* * usb_authorize_interface - authorize an USB interface * * @intf: USB interface structure */ void usb_authorize_interface(struct usb_interface *intf) { struct device *dev = &intf->dev; if (!intf->authorized) { device_lock(dev); intf->authorized = 1; /* authorize interface */ device_unlock(dev); } } static int usb_if_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct usb_device *usb_dev; const struct usb_interface *intf; const struct usb_host_interface *alt; intf = to_usb_interface(dev); usb_dev = interface_to_usbdev(intf); alt = intf->cur_altsetting; if (add_uevent_var(env, "INTERFACE=%d/%d/%d", alt->desc.bInterfaceClass, alt->desc.bInterfaceSubClass, alt->desc.bInterfaceProtocol)) return -ENOMEM; if (add_uevent_var(env, "MODALIAS=usb:" "v%04Xp%04Xd%04Xdc%02Xdsc%02Xdp%02Xic%02Xisc%02Xip%02Xin%02X", le16_to_cpu(usb_dev->descriptor.idVendor), le16_to_cpu(usb_dev->descriptor.idProduct), le16_to_cpu(usb_dev->descriptor.bcdDevice), usb_dev->descriptor.bDeviceClass, usb_dev->descriptor.bDeviceSubClass, usb_dev->descriptor.bDeviceProtocol, alt->desc.bInterfaceClass, alt->desc.bInterfaceSubClass, alt->desc.bInterfaceProtocol, alt->desc.bInterfaceNumber)) return -ENOMEM; return 0; } const struct device_type usb_if_device_type = { .name = "usb_interface", .release = usb_release_interface, .uevent = usb_if_uevent, }; static struct usb_interface_assoc_descriptor *find_iad(struct usb_device *dev, struct usb_host_config *config, u8 inum) { struct usb_interface_assoc_descriptor *retval = NULL; struct usb_interface_assoc_descriptor *intf_assoc; int first_intf; int last_intf; int i; for (i = 0; (i < USB_MAXIADS && config->intf_assoc[i]); i++) { intf_assoc = config->intf_assoc[i]; if (intf_assoc->bInterfaceCount == 0) continue; first_intf = intf_assoc->bFirstInterface; last_intf = first_intf + (intf_assoc->bInterfaceCount - 1); if (inum >= first_intf && inum <= last_intf) { if (!retval) retval = intf_assoc; else dev_err(&dev->dev, "Interface #%d referenced" " by multiple IADs\n", inum); } } return retval; } /* * Internal function to queue a device reset * See usb_queue_reset_device() for more details */ static void __usb_queue_reset_device(struct work_struct *ws) { int rc; struct usb_interface *iface = container_of(ws, struct usb_interface, reset_ws); struct usb_device *udev = interface_to_usbdev(iface); rc = usb_lock_device_for_reset(udev, iface); if (rc >= 0) { usb_reset_device(udev); usb_unlock_device(udev); } usb_put_intf(iface); /* Undo _get_ in usb_queue_reset_device() */ } /* * Internal function to set the wireless_status sysfs attribute * See usb_set_wireless_status() for more details */ static void __usb_wireless_status_intf(struct work_struct *ws) { struct usb_interface *iface = container_of(ws, 
struct usb_interface, wireless_status_work); device_lock(iface->dev.parent); if (iface->sysfs_files_created) usb_update_wireless_status_attr(iface); device_unlock(iface->dev.parent); usb_put_intf(iface); /* Undo _get_ in usb_set_wireless_status() */ } /** * usb_set_wireless_status - sets the wireless_status struct member * @iface: the interface to modify * @status: the new wireless status * * Set the wireless_status struct member to the new value, and emit * sysfs changes as necessary. * * Returns: 0 on success, -EALREADY if already set. */ int usb_set_wireless_status(struct usb_interface *iface, enum usb_wireless_status status) { if (iface->wireless_status == status) return -EALREADY; usb_get_intf(iface); iface->wireless_status = status; schedule_work(&iface->wireless_status_work); return 0; } EXPORT_SYMBOL_GPL(usb_set_wireless_status); /* * usb_set_configuration - Makes a particular device setting be current * @dev: the device whose configuration is being updated * @configuration: the configuration being chosen. * * Context: task context, might sleep. Caller holds device lock. * * This is used to enable non-default device modes. Not all devices * use this kind of configurability; many devices only have one * configuration. * * @configuration is the value of the configuration to be installed. * According to the USB spec (e.g. section 9.1.1.5), configuration values * must be non-zero; a value of zero indicates that the device in * unconfigured. However some devices erroneously use 0 as one of their * configuration values. To help manage such devices, this routine will * accept @configuration = -1 as indicating the device should be put in * an unconfigured state. * * USB device configurations may affect Linux interoperability, * power consumption and the functionality available. For example, * the default configuration is limited to using 100mA of bus power, * so that when certain device functionality requires more power, * and the device is bus powered, that functionality should be in some * non-default device configuration. Other device modes may also be * reflected as configuration options, such as whether two ISDN * channels are available independently; and choosing between open * standard device protocols (like CDC) or proprietary ones. * * Note that a non-authorized device (dev->authorized == 0) will only * be put in unconfigured mode. * * Note that USB has an additional level of device configurability, * associated with interfaces. That configurability is accessed using * usb_set_interface(). * * This call is synchronous. The calling context must be able to sleep, * must own the device lock, and must not hold the driver model's USB * bus mutex; usb interface driver probe() methods cannot use this routine. * * Returns zero on success, or else the status code returned by the * underlying call that failed. On successful completion, each interface * in the original device configuration has been destroyed, and each one * in the new configuration has been probed by all relevant usb device * drivers currently known to the kernel. 
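 *
 * A bound interface driver that needs a different configuration should
 * not call this directly (its probe() cannot, per the locking rules
 * above); it can queue the change instead (sketch; configuration 2 is
 * invented):
 *
 *	usb_driver_set_configuration(interface_to_usbdev(intf), 2);
 *
 * which performs the switch from a work thread, outside the caller's
 * locking context.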
*/ int usb_set_configuration(struct usb_device *dev, int configuration) { int i, ret; struct usb_host_config *cp = NULL; struct usb_interface **new_interfaces = NULL; struct usb_hcd *hcd = bus_to_hcd(dev->bus); int n, nintf; if (dev->authorized == 0 || configuration == -1) configuration = 0; else { for (i = 0; i < dev->descriptor.bNumConfigurations; i++) { if (dev->config[i].desc.bConfigurationValue == configuration) { cp = &dev->config[i]; break; } } } if ((!cp && configuration != 0)) return -EINVAL; /* The USB spec says configuration 0 means unconfigured. * But if a device includes a configuration numbered 0, * we will accept it as a correctly configured state. * Use -1 if you really want to unconfigure the device. */ if (cp && configuration == 0) dev_warn(&dev->dev, "config 0 descriptor??\n"); /* Allocate memory for new interfaces before doing anything else, * so that if we run out then nothing will have changed. */ n = nintf = 0; if (cp) { nintf = cp->desc.bNumInterfaces; new_interfaces = kmalloc_objs(*new_interfaces, nintf, GFP_NOIO); if (!new_interfaces) return -ENOMEM; for (; n < nintf; ++n) { new_interfaces[n] = kzalloc_obj(struct usb_interface, GFP_NOIO); if (!new_interfaces[n]) { ret = -ENOMEM; free_interfaces: while (--n >= 0) kfree(new_interfaces[n]); kfree(new_interfaces); return ret; } } i = dev->bus_mA - usb_get_max_power(dev, cp); if (i < 0) dev_warn(&dev->dev, "new config #%d exceeds power " "limit by %dmA\n", configuration, -i); } /* Wake up the device so we can send it the Set-Config request */ ret = usb_autoresume_device(dev); if (ret) goto free_interfaces; /* if it's already configured, clear out old state first. * getting rid of old interfaces means unbinding their drivers. */ if (dev->state != USB_STATE_ADDRESS) usb_disable_device(dev, 1); /* Skip ep0 */ /* Get rid of pending async Set-Config requests for this device */ cancel_async_set_config(dev); /* Make sure we have bandwidth (and available HCD resources) for this * configuration. Remove endpoints from the schedule if we're dropping * this configuration to set configuration 0. After this point, the * host controller will not allow submissions to dropped endpoints. If * this call fails, the device state is unchanged. */ mutex_lock(hcd->bandwidth_mutex); /* Disable LPM, and re-enable it once the new configuration is * installed, so that the xHCI driver can recalculate the U1/U2 * timeouts. */ if (dev->actconfig && usb_disable_lpm(dev)) { dev_err(&dev->dev, "%s Failed to disable LPM\n", __func__); mutex_unlock(hcd->bandwidth_mutex); ret = -ENOMEM; goto free_interfaces; } ret = usb_hcd_alloc_bandwidth(dev, cp, NULL, NULL); if (ret < 0) { if (dev->actconfig) usb_enable_lpm(dev); mutex_unlock(hcd->bandwidth_mutex); usb_autosuspend_device(dev); goto free_interfaces; } /* * Initialize the new interface structures and the * hc/hcd/usbcore interface/endpoint state. */ for (i = 0; i < nintf; ++i) { struct usb_interface_cache *intfc; struct usb_interface *intf; struct usb_host_interface *alt; u8 ifnum; cp->interface[i] = intf = new_interfaces[i]; intfc = cp->intf_cache[i]; intf->altsetting = intfc->altsetting; intf->num_altsetting = intfc->num_altsetting; intf->authorized = !!HCD_INTF_AUTHORIZED(hcd); kref_get(&intfc->ref); alt = usb_altnum_to_altsetting(intf, 0); /* No altsetting 0? We'll assume the first altsetting. * We could use a GetInterface call, but if a device is * so non-compliant that it doesn't have altsetting 0 * then I wouldn't trust its reply anyway. 
*/ if (!alt) alt = &intf->altsetting[0]; ifnum = alt->desc.bInterfaceNumber; intf->intf_assoc = find_iad(dev, cp, ifnum); intf->cur_altsetting = alt; usb_enable_interface(dev, intf, true); intf->dev.parent = &dev->dev; if (usb_of_has_combined_node(dev)) { device_set_of_node_from_dev(&intf->dev, &dev->dev); } else { intf->dev.of_node = usb_of_get_interface_node(dev, configuration, ifnum); } ACPI_COMPANION_SET(&intf->dev, ACPI_COMPANION(&dev->dev)); intf->dev.driver = NULL; intf->dev.bus = &usb_bus_type; intf->dev.type = &usb_if_device_type; intf->dev.groups = usb_interface_groups; INIT_WORK(&intf->reset_ws, __usb_queue_reset_device); INIT_WORK(&intf->wireless_status_work, __usb_wireless_status_intf); intf->minor = -1; device_initialize(&intf->dev); pm_runtime_no_callbacks(&intf->dev); dev_set_name(&intf->dev, "%d-%s:%d.%d", dev->bus->busnum, dev->devpath, configuration, ifnum); usb_get_dev(dev); } kfree(new_interfaces); ret = usb_control_msg_send(dev, 0, USB_REQ_SET_CONFIGURATION, 0, configuration, 0, NULL, 0, USB_CTRL_SET_TIMEOUT, GFP_NOIO); if (ret && cp) { /* * All the old state is gone, so what else can we do? * The device is probably useless now anyway. */ usb_hcd_alloc_bandwidth(dev, NULL, NULL, NULL); for (i = 0; i < nintf; ++i) { usb_disable_interface(dev, cp->interface[i], true); put_device(&cp->interface[i]->dev); cp->interface[i] = NULL; } cp = NULL; } dev->actconfig = cp; mutex_unlock(hcd->bandwidth_mutex); if (!cp) { usb_set_device_state(dev, USB_STATE_ADDRESS); /* Leave LPM disabled while the device is unconfigured. */ usb_autosuspend_device(dev); return ret; } usb_set_device_state(dev, USB_STATE_CONFIGURED); if (cp->string == NULL && !(dev->quirks & USB_QUIRK_CONFIG_INTF_STRINGS)) cp->string = usb_cache_string(dev, cp->desc.iConfiguration); /* Now that the interfaces are installed, re-enable LPM. */ usb_unlocked_enable_lpm(dev); /* Enable LTM if it was turned off by usb_disable_device. */ usb_enable_ltm(dev); /* Now that all the interfaces are set up, register them * to trigger binding of drivers to interfaces. probe() * routines may install different altsettings and may * claim() any interfaces not yet bound. Many class drivers * need that: CDC, audio, video, etc. */ for (i = 0; i < nintf; ++i) { struct usb_interface *intf = cp->interface[i]; if (intf->dev.of_node && !of_device_is_available(intf->dev.of_node)) { dev_info(&dev->dev, "skipping disabled interface %d\n", intf->cur_altsetting->desc.bInterfaceNumber); continue; } dev_dbg(&dev->dev, "adding %s (config #%d, interface %d)\n", dev_name(&intf->dev), configuration, intf->cur_altsetting->desc.bInterfaceNumber); device_enable_async_suspend(&intf->dev); ret = device_add(&intf->dev); if (ret != 0) { dev_err(&dev->dev, "device_add(%s) --> %d\n", dev_name(&intf->dev), ret); continue; } create_intf_ep_devs(intf); } usb_autosuspend_device(dev); return 0; } EXPORT_SYMBOL_GPL(usb_set_configuration); static LIST_HEAD(set_config_list); static DEFINE_SPINLOCK(set_config_lock); struct set_config_request { struct usb_device *udev; int config; struct work_struct work; struct list_head node; }; /* Worker routine for usb_driver_set_configuration() */ static void driver_set_config_work(struct work_struct *work) { struct set_config_request *req = container_of(work, struct set_config_request, work); struct usb_device *udev = req->udev; usb_lock_device(udev); spin_lock(&set_config_lock); list_del(&req->node); spin_unlock(&set_config_lock); if (req->config >= -1) /* Is req still valid? 
*/ usb_set_configuration(udev, req->config); usb_unlock_device(udev); usb_put_dev(udev); kfree(req); } /* Cancel pending Set-Config requests for a device whose configuration * was just changed */ static void cancel_async_set_config(struct usb_device *udev) { struct set_config_request *req; spin_lock(&set_config_lock); list_for_each_entry(req, &set_config_list, node) { if (req->udev == udev) req->config = -999; /* Mark as cancelled */ } spin_unlock(&set_config_lock); } /** * usb_driver_set_configuration - Provide a way for drivers to change device configurations * @udev: the device whose configuration is being updated * @config: the configuration being chosen. * Context: In process context, must be able to sleep * * Device interface drivers are not allowed to change device configurations. * This is because changing configurations will destroy the interface the * driver is bound to and create new ones; it would be like a floppy-disk * driver telling the computer to replace the floppy-disk drive with a * tape drive! * * Still, in certain specialized circumstances the need may arise. This * routine gets around the normal restrictions by using a work thread to * submit the change-config request. * * Return: 0 if the request was successfully queued, error code otherwise. * The caller has no way to know whether the queued request will eventually * succeed. */ int usb_driver_set_configuration(struct usb_device *udev, int config) { struct set_config_request *req; req = kmalloc_obj(*req); if (!req) return -ENOMEM; req->udev = udev; req->config = config; INIT_WORK(&req->work, driver_set_config_work); spin_lock(&set_config_lock); list_add(&req->node, &set_config_list); spin_unlock(&set_config_lock); usb_get_dev(udev); schedule_work(&req->work); return 0; } EXPORT_SYMBOL_GPL(usb_driver_set_configuration); /** * cdc_parse_cdc_header - parse the extra headers present in CDC devices * @hdr: the place to put the results of the parsing * @intf: the interface for which parsing is requested * @buffer: pointer to the extra headers to be parsed * @buflen: length of the extra headers * * This evaluates the extra headers present in CDC devices which * bind the interfaces for data and control and provide details * about the capabilities of the device. 
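 *
 * A hypothetical calling sketch from a class driver's probe(), feeding in
 * the class-specific descriptors that trail the control interface:
 *
 *	struct usb_cdc_parsed_header hdr;
 *	int cnt = cdc_parse_cdc_header(&hdr, intf,
 *				       intf->cur_altsetting->extra,
 *				       intf->cur_altsetting->extralen);
 *	if (cnt < 0 || !hdr.usb_cdc_union_desc)
 *		return -ENODEV;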
* * Return: number of descriptors parsed or -EINVAL * if the header is contradictory beyond salvage */ int cdc_parse_cdc_header(struct usb_cdc_parsed_header *hdr, struct usb_interface *intf, u8 *buffer, int buflen) { /* duplicates are ignored */ struct usb_cdc_union_desc *union_header = NULL; /* duplicates are not tolerated */ struct usb_cdc_header_desc *header = NULL; struct usb_cdc_ether_desc *ether = NULL; struct usb_cdc_mdlm_detail_desc *detail = NULL; struct usb_cdc_mdlm_desc *desc = NULL; unsigned int elength; int cnt = 0; memset(hdr, 0x00, sizeof(struct usb_cdc_parsed_header)); hdr->phonet_magic_present = false; while (buflen > 0) { elength = buffer[0]; if (!elength) { dev_err(&intf->dev, "skipping garbage byte\n"); elength = 1; goto next_desc; } if ((buflen < elength) || (elength < 3)) { dev_err(&intf->dev, "invalid descriptor buffer length\n"); break; } if (buffer[1] != USB_DT_CS_INTERFACE) { dev_err(&intf->dev, "skipping garbage\n"); goto next_desc; } switch (buffer[2]) { case USB_CDC_UNION_TYPE: /* we've found it */ if (elength < sizeof(struct usb_cdc_union_desc)) goto next_desc; if (union_header) { dev_err(&intf->dev, "More than one union descriptor, skipping ...\n"); goto next_desc; } union_header = (struct usb_cdc_union_desc *)buffer; break; case USB_CDC_COUNTRY_TYPE: if (elength < sizeof(struct usb_cdc_country_functional_desc)) goto next_desc; hdr->usb_cdc_country_functional_desc = (struct usb_cdc_country_functional_desc *)buffer; break; case USB_CDC_HEADER_TYPE: if (elength != sizeof(struct usb_cdc_header_desc)) goto next_desc; if (header) return -EINVAL; header = (struct usb_cdc_header_desc *)buffer; break; case USB_CDC_ACM_TYPE: if (elength < sizeof(struct usb_cdc_acm_descriptor)) goto next_desc; hdr->usb_cdc_acm_descriptor = (struct usb_cdc_acm_descriptor *)buffer; break; case USB_CDC_ETHERNET_TYPE: if (elength != sizeof(struct usb_cdc_ether_desc)) goto next_desc; if (ether) return -EINVAL; ether = (struct usb_cdc_ether_desc *)buffer; break; case USB_CDC_CALL_MANAGEMENT_TYPE: if (elength < sizeof(struct usb_cdc_call_mgmt_descriptor)) goto next_desc; hdr->usb_cdc_call_mgmt_descriptor = (struct usb_cdc_call_mgmt_descriptor *)buffer; break; case USB_CDC_DMM_TYPE: if (elength < sizeof(struct usb_cdc_dmm_desc)) goto next_desc; hdr->usb_cdc_dmm_desc = (struct usb_cdc_dmm_desc *)buffer; break; case USB_CDC_MDLM_TYPE: if (elength < sizeof(struct usb_cdc_mdlm_desc)) goto next_desc; if (desc) return -EINVAL; desc = (struct usb_cdc_mdlm_desc *)buffer; break; case USB_CDC_MDLM_DETAIL_TYPE: if (elength < sizeof(struct usb_cdc_mdlm_detail_desc)) goto next_desc; if (detail) return -EINVAL; detail = (struct usb_cdc_mdlm_detail_desc *)buffer; break; case USB_CDC_NCM_TYPE: if (elength < sizeof(struct usb_cdc_ncm_desc)) goto next_desc; hdr->usb_cdc_ncm_desc = (struct usb_cdc_ncm_desc *)buffer; break; case USB_CDC_MBIM_TYPE: if (elength < sizeof(struct usb_cdc_mbim_desc)) goto next_desc; hdr->usb_cdc_mbim_desc = (struct usb_cdc_mbim_desc *)buffer; break; case USB_CDC_MBIM_EXTENDED_TYPE: if (elength < sizeof(struct usb_cdc_mbim_extended_desc)) goto next_desc; hdr->usb_cdc_mbim_extended_desc = (struct usb_cdc_mbim_extended_desc *)buffer; break; case CDC_PHONET_MAGIC_NUMBER: hdr->phonet_magic_present = true; break; default: /* * there are LOTS more CDC descriptors that * could legitimately be found here. 
*/ dev_dbg(&intf->dev, "Ignoring descriptor: type %02x, length %u\n", buffer[2], elength); goto next_desc; } cnt++; next_desc: buflen -= elength; buffer += elength; } hdr->usb_cdc_union_desc = union_header; hdr->usb_cdc_header_desc = header; hdr->usb_cdc_mdlm_detail_desc = detail; hdr->usb_cdc_mdlm_desc = desc; hdr->usb_cdc_ether_desc = ether; return cnt; } EXPORT_SYMBOL(cdc_parse_cdc_header);
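Because interface drivers are not allowed to switch device configurations directly, a short hypothetical sketch may help show the intended pattern: the probe() routine below (the driver name and the configuration value 2 are assumptions for illustration only) queues the change through usb_driver_set_configuration() and lets the work thread apply it later.

#include <linux/usb.h>

/* Hypothetical driver that wants the device's vendor configuration #2. */
static int example_probe(struct usb_interface *intf,
			 const struct usb_device_id *id)
{
	struct usb_device *udev = interface_to_usbdev(intf);

	/*
	 * probe() must not call usb_set_configuration() directly: switching
	 * configurations destroys the very interface this driver is being
	 * bound to.  Queue the request instead; driver_set_config_work()
	 * performs the actual change later, under the device lock.
	 */
	return usb_driver_set_configuration(udev, 2);
}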
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/xattr.c * * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> * * Fix by Harrison Xing <harrison@mountainviewdata.com>. * Ext4 code with a lot of help from Eric Jarman <ejarman@acm.org>. * Extended attributes for symlinks and special files added per * suggestion of Luka Renko <luka.renko@hermes.si>. * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>, * Red Hat Inc. * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz * and Andreas Gruenbacher <agruen@suse.de>. */ /* * Extended attributes are stored directly in inodes (on file systems with * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl * field contains the block number if an inode uses an additional block. All * attributes must fit in the inode and one additional block. Blocks that * contain the identical set of attributes may be shared among several inodes. * Identical blocks are detected by keeping a cache of blocks that have * recently been accessed. * * The attributes in inodes and on blocks have a different header; the entries * are stored in the same format: * * +------------------+ * | header | * | entry 1 | | * | entry 2 | | growing downwards * | entry 3 | v * | four null bytes | * | . . . | * | value 1 | ^ * | value 3 | | growing upwards * | value 2 | | * +------------------+ * * The header is followed by multiple entry descriptors. In disk blocks, the * entry descriptors are kept sorted. In inodes, they are unsorted. The * attribute values are aligned to the end of the block in no specific order. * * Locking strategy * ---------------- * EXT4_I(inode)->i_file_acl is protected by EXT4_I(inode)->xattr_sem. * EA blocks are only changed if they are exclusive to an inode, so * holding xattr_sem also means that nothing but the EA block's reference * count can change. Multiple writers to the same block are synchronized * by the buffer lock. */ #include <linux/init.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/mbcache.h> #include <linux/quotaops.h> #include <linux/iversion.h> #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" #include "acl.h" #ifdef EXT4_XATTR_DEBUG # define ea_idebug(inode, fmt, ...) \ printk(KERN_DEBUG "inode %s:%llu: " fmt "\n", \ inode->i_sb->s_id, inode->i_ino, ##__VA_ARGS__) # define ea_bdebug(bh, fmt, ...) \ printk(KERN_DEBUG "block %pg:%lu: " fmt "\n", \ bh->b_bdev, (unsigned long)bh->b_blocknr, ##__VA_ARGS__) #else # define ea_idebug(inode, fmt, ...) no_printk(fmt, ##__VA_ARGS__) # define ea_bdebug(bh, fmt, ...)
no_printk(fmt, ##__VA_ARGS__) #endif static void ext4_xattr_block_cache_insert(struct mb_cache *, struct buffer_head *); static struct buffer_head * ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *, struct mb_cache_entry **); static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value, size_t value_count); static __le32 ext4_xattr_hash_entry_signed(char *name, size_t name_len, __le32 *value, size_t value_count); static void ext4_xattr_rehash(struct ext4_xattr_header *); static const struct xattr_handler * const ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, #endif [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_SECURITY [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler, #endif [EXT4_XATTR_INDEX_HURD] = &ext4_xattr_hurd_handler, }; const struct xattr_handler * const ext4_xattr_handlers[] = { &ext4_xattr_user_handler, &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_SECURITY &ext4_xattr_security_handler, #endif &ext4_xattr_hurd_handler, NULL }; #define EA_BLOCK_CACHE(inode) (((struct ext4_sb_info *) \ inode->i_sb->s_fs_info)->s_ea_block_cache) #define EA_INODE_CACHE(inode) (((struct ext4_sb_info *) \ inode->i_sb->s_fs_info)->s_ea_inode_cache) static int ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, struct inode *inode); #ifdef CONFIG_LOCKDEP void ext4_xattr_inode_set_class(struct inode *ea_inode) { struct ext4_inode_info *ei = EXT4_I(ea_inode); lockdep_set_subclass(&ea_inode->i_rwsem, 1); (void) ei; /* shut up clang warning if !CONFIG_LOCKDEP */ lockdep_set_subclass(&ei->i_data_sem, I_DATA_SEM_EA); } #endif static __le32 ext4_xattr_block_csum(struct inode *inode, sector_t block_nr, struct ext4_xattr_header *hdr) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __le64 dsk_block_nr = cpu_to_le64(block_nr); __u32 dummy_csum = 0; int offset = offsetof(struct ext4_xattr_header, h_checksum); csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); csum = ext4_chksum(csum, (__u8 *)hdr, offset); csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); offset += sizeof(dummy_csum); csum = ext4_chksum(csum, (__u8 *)hdr + offset, EXT4_BLOCK_SIZE(inode->i_sb) - offset); return cpu_to_le32(csum); } static int ext4_xattr_block_csum_verify(struct inode *inode, struct buffer_head *bh) { struct ext4_xattr_header *hdr = BHDR(bh); int ret = 1; if (ext4_has_feature_metadata_csum(inode->i_sb)) { lock_buffer(bh); ret = (hdr->h_checksum == ext4_xattr_block_csum(inode, bh->b_blocknr, hdr)); unlock_buffer(bh); } return ret; } static void ext4_xattr_block_csum_set(struct inode *inode, struct buffer_head *bh) { if (ext4_has_feature_metadata_csum(inode->i_sb)) BHDR(bh)->h_checksum = ext4_xattr_block_csum(inode, bh->b_blocknr, BHDR(bh)); } static inline const char *ext4_xattr_prefix(int name_index, struct dentry *dentry) { const struct xattr_handler *handler = NULL; if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map)) handler = ext4_xattr_handler_map[name_index]; if (!xattr_handler_can_list(handler, dentry)) return NULL; return xattr_prefix(handler); } static int check_xattrs(struct inode *inode, struct buffer_head *bh, struct ext4_xattr_entry *entry, void *end, void *value_start, const char *function, unsigned int line) { struct ext4_xattr_entry *e = entry; 
int err = -EFSCORRUPTED; char *err_str; if (bh) { if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) { err_str = "invalid header"; goto errout; } if (buffer_verified(bh)) return 0; if (!ext4_xattr_block_csum_verify(inode, bh)) { err = -EFSBADCRC; err_str = "invalid checksum"; goto errout; } } else { struct ext4_xattr_ibody_header *header = value_start; header -= 1; if (end - (void *)header < sizeof(*header) + sizeof(u32)) { err_str = "in-inode xattr block too small"; goto errout; } if (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { err_str = "bad magic number in in-inode xattr"; goto errout; } } /* Find the end of the names list */ while (!IS_LAST_ENTRY(e)) { struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); if ((void *)next + sizeof(u32) > end) { err_str = "e_name out of bounds"; goto errout; } if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) { err_str = "bad e_name length"; goto errout; } e = next; } /* Check the values */ while (!IS_LAST_ENTRY(entry)) { u32 size = le32_to_cpu(entry->e_value_size); unsigned long ea_ino = le32_to_cpu(entry->e_value_inum); if (!ext4_has_feature_ea_inode(inode->i_sb) && ea_ino) { err_str = "ea_inode specified without ea_inode feature enabled"; goto errout; } if (ea_ino && ((ea_ino == EXT4_ROOT_INO) || !ext4_valid_inum(inode->i_sb, ea_ino))) { err_str = "invalid ea_ino"; goto errout; } if (ea_ino && !size) { err_str = "invalid size in ea xattr"; goto errout; } if (size > EXT4_XATTR_SIZE_MAX) { err_str = "e_value size too large"; goto errout; } if (size != 0 && entry->e_value_inum == 0) { u16 offs = le16_to_cpu(entry->e_value_offs); void *value; /* * The value cannot overlap the names, and the value * with padding cannot extend beyond 'end'. Check both * the padded and unpadded sizes, since the size may * overflow to 0 when adding padding. 
*/ if (offs > end - value_start) { err_str = "e_value out of bounds"; goto errout; } value = value_start + offs; if (value < (void *)e + sizeof(u32) || size > end - value || EXT4_XATTR_SIZE(size) > end - value) { err_str = "overlapping e_value "; goto errout; } } entry = EXT4_XATTR_NEXT(entry); } if (bh) set_buffer_verified(bh); return 0; errout: if (bh) __ext4_error_inode(inode, function, line, 0, -err, "corrupted xattr block %llu: %s", (unsigned long long) bh->b_blocknr, err_str); else __ext4_error_inode(inode, function, line, 0, -err, "corrupted in-inode xattr: %s", err_str); return err; } static inline int __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh, const char *function, unsigned int line) { return check_xattrs(inode, bh, BFIRST(bh), bh->b_data + bh->b_size, bh->b_data, function, line); } #define ext4_xattr_check_block(inode, bh) \ __ext4_xattr_check_block((inode), (bh), __func__, __LINE__) int __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, void *end, const char *function, unsigned int line) { return check_xattrs(inode, NULL, IFIRST(header), end, IFIRST(header), function, line); } static int xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry, void *end, int name_index, const char *name, int sorted) { struct ext4_xattr_entry *entry, *next; size_t name_len; int cmp = 1; if (name == NULL) return -EINVAL; name_len = strlen(name); for (entry = *pentry; !IS_LAST_ENTRY(entry); entry = next) { next = EXT4_XATTR_NEXT(entry); if ((void *) next >= end) { EXT4_ERROR_INODE(inode, "corrupted xattr entries"); return -EFSCORRUPTED; } cmp = name_index - entry->e_name_index; if (!cmp) cmp = name_len - entry->e_name_len; if (!cmp) cmp = memcmp(name, entry->e_name, name_len); if (!cmp || (cmp < 0 && sorted)) break; } *pentry = entry; return cmp ? -ENODATA : 0; } static u32 ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size) { return ext4_chksum(sbi->s_csum_seed, buffer, size); } static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode) { return ((u64) inode_get_ctime_sec(ea_inode) << 32) | (u32) inode_peek_iversion_raw(ea_inode); } static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count) { inode_set_ctime(ea_inode, (u32)(ref_count >> 32), 0); inode_set_iversion_raw(ea_inode, ref_count & 0xffffffff); } static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode) { return (u32) inode_get_atime_sec(ea_inode); } static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash) { inode_set_atime(ea_inode, hash, 0); } /* * Read the EA value from an inode. */ static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size) { int blocksize = 1 << ea_inode->i_blkbits; int bh_count = (size + blocksize - 1) >> ea_inode->i_blkbits; int tail_size = (size % blocksize) ?: blocksize; struct buffer_head *bhs_inline[8]; struct buffer_head **bhs = bhs_inline; int i, ret; if (bh_count > ARRAY_SIZE(bhs_inline)) { bhs = kmalloc_objs(*bhs, bh_count, GFP_NOFS); if (!bhs) return -ENOMEM; } ret = ext4_bread_batch(ea_inode, 0 /* block */, bh_count, true /* wait */, bhs); if (ret) goto free_bhs; for (i = 0; i < bh_count; i++) { /* There shouldn't be any holes in ea_inode. */ if (!bhs[i]) { ret = -EFSCORRUPTED; goto put_bhs; } memcpy((char *)buf + blocksize * i, bhs[i]->b_data, i < bh_count - 1 ? 
blocksize : tail_size); } ret = 0; put_bhs: for (i = 0; i < bh_count; i++) brelse(bhs[i]); free_bhs: if (bhs != bhs_inline) kfree(bhs); return ret; } #define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode_get_mtime_sec(inode))) static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, u32 ea_inode_hash, struct inode **ea_inode) { struct inode *inode; int err; /* * We have to check for this corruption early as otherwise * iget_locked() could wait indefinitely for the state of our * parent inode. */ if (parent->i_ino == ea_ino) { ext4_error(parent->i_sb, "Parent and EA inode have the same ino %lu", ea_ino); return -EFSCORRUPTED; } inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_EA_INODE); if (IS_ERR(inode)) { err = PTR_ERR(inode); ext4_error(parent->i_sb, "error while reading EA inode %lu err=%d", ea_ino, err); return err; } ext4_xattr_inode_set_class(inode); /* * Check whether this is an old Lustre-style xattr inode. Lustre * implementation does not have hash validation, rather it has a * backpointer from ea_inode to the parent inode. */ if (ea_inode_hash != ext4_xattr_inode_get_hash(inode) && EXT4_XATTR_INODE_GET_PARENT(inode) == parent->i_ino && inode->i_generation == parent->i_generation) { ext4_set_inode_state(inode, EXT4_STATE_LUSTRE_EA_INODE); ext4_xattr_inode_set_ref(inode, 1); } else { inode_lock_nested(inode, I_MUTEX_XATTR); inode->i_flags |= S_NOQUOTA; inode_unlock(inode); } *ea_inode = inode; return 0; } /* Remove entry from mbcache when EA inode is getting evicted */ void ext4_evict_ea_inode(struct inode *inode) { struct mb_cache_entry *oe; if (!EA_INODE_CACHE(inode)) return; /* Wait for entry to get unused so that we can remove it */ while ((oe = mb_cache_entry_delete_or_get(EA_INODE_CACHE(inode), ext4_xattr_inode_get_hash(inode), inode->i_ino))) { mb_cache_entry_wait_unused(oe); mb_cache_entry_put(EA_INODE_CACHE(inode), oe); } } static int ext4_xattr_inode_verify_hashes(struct inode *ea_inode, struct ext4_xattr_entry *entry, void *buffer, size_t size) { u32 hash; /* Verify stored hash matches calculated hash. */ hash = ext4_xattr_inode_hash(EXT4_SB(ea_inode->i_sb), buffer, size); if (hash != ext4_xattr_inode_get_hash(ea_inode)) return -EFSCORRUPTED; if (entry) { __le32 e_hash, tmp_data; /* Verify entry hash. */ tmp_data = cpu_to_le32(hash); e_hash = ext4_xattr_hash_entry(entry->e_name, entry->e_name_len, &tmp_data, 1); /* All good? */ if (e_hash == entry->e_hash) return 0; /* * Not good. Maybe the entry hash was calculated * using the buggy signed char version? */ e_hash = ext4_xattr_hash_entry_signed(entry->e_name, entry->e_name_len, &tmp_data, 1); /* Still no match - bad */ if (e_hash != entry->e_hash) return -EFSCORRUPTED; /* Let people know about old hash */ pr_warn_once("ext4: filesystem with signed xattr name hash"); } return 0; } /* * Read xattr value from the EA inode. 
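 *
 * The value lives in a separate EA inode: the inode is looked up with
 * ext4_xattr_inode_iget(), its i_size is checked against the entry's
 * e_value_size, the data is read with ext4_xattr_inode_read(), and (except
 * for old Lustre-style EA inodes) the stored hashes are verified before
 * the inode is added to the ea_inode mbcache.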
*/ static int ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry, void *buffer, size_t size) { struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode); struct inode *ea_inode; int err; err = ext4_xattr_inode_iget(inode, le32_to_cpu(entry->e_value_inum), le32_to_cpu(entry->e_hash), &ea_inode); if (err) { ea_inode = NULL; goto out; } if (i_size_read(ea_inode) != size) { ext4_warning_inode(ea_inode, "ea_inode file size=%llu entry size=%zu", i_size_read(ea_inode), size); err = -EFSCORRUPTED; goto out; } err = ext4_xattr_inode_read(ea_inode, buffer, size); if (err) goto out; if (!ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE)) { err = ext4_xattr_inode_verify_hashes(ea_inode, entry, buffer, size); if (err) { ext4_warning_inode(ea_inode, "EA inode hash validation failed"); goto out; } if (ea_inode_cache) mb_cache_entry_create(ea_inode_cache, GFP_NOFS, ext4_xattr_inode_get_hash(ea_inode), ea_inode->i_ino, true /* reusable */); } out: iput(ea_inode); return err; } static int ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) { struct buffer_head *bh = NULL; struct ext4_xattr_entry *entry; size_t size; void *end; int error; struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); if (!EXT4_I(inode)->i_file_acl) return -ENODATA; ea_idebug(inode, "reading block %llu", (unsigned long long)EXT4_I(inode)->i_file_acl); bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) return PTR_ERR(bh); ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); error = ext4_xattr_check_block(inode, bh); if (error) goto cleanup; ext4_xattr_block_cache_insert(ea_block_cache, bh); entry = BFIRST(bh); end = bh->b_data + bh->b_size; error = xattr_find_entry(inode, &entry, end, name_index, name, 1); if (error) goto cleanup; size = le32_to_cpu(entry->e_value_size); error = -ERANGE; if (unlikely(size > EXT4_XATTR_SIZE_MAX)) goto cleanup; if (buffer) { if (size > buffer_size) goto cleanup; if (entry->e_value_inum) { error = ext4_xattr_inode_get(inode, entry, buffer, size); if (error) goto cleanup; } else { u16 offset = le16_to_cpu(entry->e_value_offs); void *p = bh->b_data + offset; if (unlikely(p + size > end)) goto cleanup; memcpy(buffer, p, size); } } error = size; cleanup: brelse(bh); return error; } int ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) { struct ext4_xattr_ibody_header *header; struct ext4_xattr_entry *entry; struct ext4_inode *raw_inode; struct ext4_iloc iloc; size_t size; void *end; int error; if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) return -ENODATA; error = ext4_get_inode_loc(inode, &iloc); if (error) return error; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); end = ITAIL(inode, raw_inode); entry = IFIRST(header); error = xattr_find_entry(inode, &entry, end, name_index, name, 0); if (error) goto cleanup; size = le32_to_cpu(entry->e_value_size); error = -ERANGE; if (unlikely(size > EXT4_XATTR_SIZE_MAX)) goto cleanup; if (buffer) { if (size > buffer_size) goto cleanup; if (entry->e_value_inum) { error = ext4_xattr_inode_get(inode, entry, buffer, size); if (error) goto cleanup; } else { u16 offset = le16_to_cpu(entry->e_value_offs); void *p = (void *)IFIRST(header) + offset; if (unlikely(p + size > end)) goto cleanup; memcpy(buffer, p, 
size); } } error = size; cleanup: brelse(iloc.bh); return error; } /* * ext4_xattr_get() * * Copy an extended attribute into the buffer * provided, or compute the buffer size required. * Buffer is NULL to compute the size of the buffer required. * * Returns a negative error number on failure, or the number of bytes * used / required on success. */ int ext4_xattr_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) { int error; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (strlen(name) > 255) return -ERANGE; down_read(&EXT4_I(inode)->xattr_sem); error = ext4_xattr_ibody_get(inode, name_index, name, buffer, buffer_size); if (error == -ENODATA) error = ext4_xattr_block_get(inode, name_index, name, buffer, buffer_size); up_read(&EXT4_I(inode)->xattr_sem); return error; } static int ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, char *buffer, size_t buffer_size) { size_t rest = buffer_size; for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { const char *prefix; prefix = ext4_xattr_prefix(entry->e_name_index, dentry); if (prefix) { size_t prefix_len = strlen(prefix); size_t size = prefix_len + entry->e_name_len + 1; if (buffer) { if (size > rest) return -ERANGE; memcpy(buffer, prefix, prefix_len); buffer += prefix_len; memcpy(buffer, entry->e_name, entry->e_name_len); buffer += entry->e_name_len; *buffer++ = 0; } rest -= size; } } return buffer_size - rest; /* total size */ } static int ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) { struct inode *inode = d_inode(dentry); struct buffer_head *bh = NULL; int error; ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); if (!EXT4_I(inode)->i_file_acl) return 0; ea_idebug(inode, "reading block %llu", (unsigned long long)EXT4_I(inode)->i_file_acl); bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) return PTR_ERR(bh); ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); error = ext4_xattr_check_block(inode, bh); if (error) goto cleanup; ext4_xattr_block_cache_insert(EA_BLOCK_CACHE(inode), bh); error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); cleanup: brelse(bh); return error; } static int ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) { struct inode *inode = d_inode(dentry); struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; struct ext4_iloc iloc; int error; if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) return 0; error = ext4_get_inode_loc(inode, &iloc); if (error) return error; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); error = ext4_xattr_list_entries(dentry, IFIRST(header), buffer, buffer_size); brelse(iloc.bh); return error; } /* * Inode operation listxattr() * * d_inode(dentry)->i_rwsem: don't care * * Copy a list of attribute names into the buffer * provided, or compute the buffer size required. * Buffer is NULL to compute the size of the buffer required. * * Returns a negative error number on failure, or the number of bytes * used / required on success. 
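 *
 * A typical two-pass calling sketch (the first call, with a NULL buffer,
 * only computes the required size; buf is assumed to have been allocated
 * with at least that many bytes):
 *
 *	ssize_t len = ext4_listxattr(dentry, NULL, 0);
 *	if (len > 0)
 *		len = ext4_listxattr(dentry, buf, len);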
*/ ssize_t ext4_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { int ret, ret2; down_read(&EXT4_I(d_inode(dentry))->xattr_sem); ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size); if (ret < 0) goto errout; if (buffer) { buffer += ret; buffer_size -= ret; } ret = ext4_xattr_block_list(dentry, buffer, buffer_size); if (ret < 0) goto errout; ret += ret2; errout: up_read(&EXT4_I(d_inode(dentry))->xattr_sem); return ret; } /* * If the EXT4_FEATURE_COMPAT_EXT_ATTR feature of this file system is * not set, set it. */ static void ext4_xattr_update_super_block(handle_t *handle, struct super_block *sb) { if (ext4_has_feature_xattr(sb)) return; BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); if (ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, EXT4_JTR_NONE) == 0) { lock_buffer(EXT4_SB(sb)->s_sbh); ext4_set_feature_xattr(sb); ext4_superblock_csum_set(sb); unlock_buffer(EXT4_SB(sb)->s_sbh); ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); } } int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) { struct ext4_iloc iloc = { .bh = NULL }; struct buffer_head *bh = NULL; struct ext4_inode *raw_inode; struct ext4_xattr_ibody_header *header; struct ext4_xattr_entry *entry; qsize_t ea_inode_refs = 0; int ret; lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem); if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { ret = ext4_get_inode_loc(inode, &iloc); if (ret) goto out; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); for (entry = IFIRST(header); !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) if (entry->e_value_inum) ea_inode_refs++; } if (EXT4_I(inode)->i_file_acl) { bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) { ret = PTR_ERR(bh); bh = NULL; goto out; } ret = ext4_xattr_check_block(inode, bh); if (ret) goto out; for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) if (entry->e_value_inum) ea_inode_refs++; } *usage = ea_inode_refs + 1; ret = 0; out: brelse(iloc.bh); brelse(bh); return ret; } static inline size_t round_up_cluster(struct inode *inode, size_t length) { struct super_block *sb = inode->i_sb; size_t cluster_size = 1 << (EXT4_SB(sb)->s_cluster_bits + inode->i_blkbits); size_t mask = ~(cluster_size - 1); return (length + cluster_size - 1) & mask; } static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len) { int err; err = dquot_alloc_inode(inode); if (err) return err; err = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len)); if (err) dquot_free_inode(inode); return err; } static void ext4_xattr_inode_free_quota(struct inode *parent, struct inode *ea_inode, size_t len) { if (ea_inode && ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE)) return; dquot_free_space_nodirty(parent, round_up_cluster(parent, len)); dquot_free_inode(parent); } int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, struct buffer_head *block_bh, size_t value_len, bool is_create) { int credits; int blocks; /* * 1) Owner inode update * 2) Ref count update on old xattr block * 3) new xattr block * 4) block bitmap update for new xattr block * 5) group descriptor for new xattr block * 6) block bitmap update for old xattr block * 7) group descriptor for old block * * 6 & 7 can happen if we have two racing threads T_a and T_b * which are each trying to set an xattr on inodes I_a and I_b * which were both initially sharing an xattr block. */ credits = 7; /* Quota updates. 
*/ credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb); /* * In case of inline data, we may push out the data to a block, * so we need to reserve credits for this eventuality */ if (inode && ext4_has_inline_data(inode)) credits += ext4_chunk_trans_extent(inode, 1) + 1; /* We are done if ea_inode feature is not enabled. */ if (!ext4_has_feature_ea_inode(sb)) return credits; /* New ea_inode, inode map, block bitmap, group descriptor. */ credits += 4; /* Data blocks. */ blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits; /* Indirection block or one level of extent tree. */ blocks += 1; /* Block bitmap and group descriptor updates for each block. */ credits += blocks * 2; /* Blocks themselves. */ credits += blocks; if (!is_create) { /* Dereference ea_inode holding old xattr value. * Old ea_inode, inode map, block bitmap, group descriptor. */ credits += 4; /* Data blocks for old ea_inode. */ blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits; /* Indirection block or one level of extent tree for old * ea_inode. */ blocks += 1; /* Block bitmap and group descriptor updates for each block. */ credits += blocks * 2; } /* We may need to clone the existing xattr block in which case we need * to increment ref counts for existing ea_inodes referenced by it. */ if (block_bh) { struct ext4_xattr_entry *entry = BFIRST(block_bh); for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) if (entry->e_value_inum) /* Ref count update on ea_inode. */ credits += 1; } return credits; } static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, int ref_change) { struct ext4_iloc iloc; u64 ref_count; int ret; inode_lock_nested(ea_inode, I_MUTEX_XATTR); ret = ext4_reserve_inode_write(handle, ea_inode, &iloc); if (ret) goto out; ref_count = ext4_xattr_inode_get_ref(ea_inode); if ((ref_count == 0 && ref_change < 0) || (ref_count == U64_MAX && ref_change > 0)) { ext4_error_inode(ea_inode, __func__, __LINE__, 0, "EA inode %llu ref wraparound: ref_count=%lld ref_change=%d", ea_inode->i_ino, ref_count, ref_change); brelse(iloc.bh); ret = -EFSCORRUPTED; goto out; } ref_count += ref_change; ext4_xattr_inode_set_ref(ea_inode, ref_count); if (ref_change > 0) { if (ref_count == 1) { WARN_ONCE(ea_inode->i_nlink, "EA inode %llu i_nlink=%u", ea_inode->i_ino, ea_inode->i_nlink); set_nlink(ea_inode, 1); ext4_orphan_del(handle, ea_inode); } } else { if (ref_count == 0) { WARN_ONCE(ea_inode->i_nlink != 1, "EA inode %llu i_nlink=%u", ea_inode->i_ino, ea_inode->i_nlink); clear_nlink(ea_inode); ext4_orphan_add(handle, ea_inode); } } ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc); if (ret) ext4_warning_inode(ea_inode, "ext4_mark_iloc_dirty() failed ret=%d", ret); out: inode_unlock(ea_inode); return ret; } static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode) { return ext4_xattr_inode_update_ref(handle, ea_inode, 1); } static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode) { return ext4_xattr_inode_update_ref(handle, ea_inode, -1); } static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent, struct ext4_xattr_entry *first) { struct inode *ea_inode; struct ext4_xattr_entry *entry; struct ext4_xattr_entry *failed_entry; unsigned int ea_ino; int err, saved_err; for (entry = first; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); err = ext4_xattr_inode_iget(parent, ea_ino, le32_to_cpu(entry->e_hash), &ea_inode); if (err) goto cleanup; err = 
ext4_xattr_inode_inc_ref(handle, ea_inode); if (err) { ext4_warning_inode(ea_inode, "inc ref error %d", err); iput(ea_inode); goto cleanup; } iput(ea_inode); } return 0; cleanup: saved_err = err; failed_entry = entry; for (entry = first; entry != failed_entry; entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); err = ext4_xattr_inode_iget(parent, ea_ino, le32_to_cpu(entry->e_hash), &ea_inode); if (err) { ext4_warning(parent->i_sb, "cleanup ea_ino %u iget error %d", ea_ino, err); continue; } err = ext4_xattr_inode_dec_ref(handle, ea_inode); if (err) ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err); iput(ea_inode); } return saved_err; } static int ext4_xattr_restart_fn(handle_t *handle, struct inode *inode, struct buffer_head *bh, bool block_csum, bool dirty) { int error; if (bh && dirty) { if (block_csum) ext4_xattr_block_csum_set(inode, bh); error = ext4_handle_dirty_metadata(handle, NULL, bh); if (error) { ext4_warning(inode->i_sb, "Handle metadata (error %d)", error); return error; } } return 0; } static void ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, struct buffer_head *bh, struct ext4_xattr_entry *first, bool block_csum, struct ext4_xattr_inode_array **ea_inode_array, int extra_credits, bool skip_quota) { struct inode *ea_inode; struct ext4_xattr_entry *entry; struct ext4_iloc iloc = { .bh = NULL }; bool dirty = false; unsigned int ea_ino; int err; int credits; void *end; if (block_csum) end = (void *)bh->b_data + bh->b_size; else { err = ext4_get_inode_loc(parent, &iloc); if (err) { EXT4_ERROR_INODE(parent, "parent inode loc (error %d)", err); return; } end = (void *)ext4_raw_inode(&iloc) + EXT4_SB(parent->i_sb)->s_inode_size; } /* One credit for dec ref on ea_inode, one for orphan list addition, */ credits = 2 + extra_credits; for (entry = first; (void *)entry < end && !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); err = ext4_xattr_inode_iget(parent, ea_ino, le32_to_cpu(entry->e_hash), &ea_inode); if (err) continue; err = ext4_expand_inode_array(ea_inode_array, ea_inode); if (err) { ext4_warning_inode(ea_inode, "Expand inode array err=%d", err); iput(ea_inode); continue; } err = ext4_journal_ensure_credits_fn(handle, credits, credits, ext4_free_metadata_revoke_credits(parent->i_sb, 1), ext4_xattr_restart_fn(handle, parent, bh, block_csum, dirty)); if (err < 0) { ext4_warning_inode(ea_inode, "Ensure credits err=%d", err); continue; } if (err > 0) { err = ext4_journal_get_write_access(handle, parent->i_sb, bh, EXT4_JTR_NONE); if (err) { ext4_warning_inode(ea_inode, "Re-get write access err=%d", err); continue; } } err = ext4_xattr_inode_dec_ref(handle, ea_inode); if (err) { ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d", err); continue; } if (!skip_quota) ext4_xattr_inode_free_quota(parent, ea_inode, le32_to_cpu(entry->e_value_size)); /* * Forget about ea_inode within the same transaction that * decrements the ref count. This avoids duplicate decrements in * case the rest of the work spills over to subsequent * transactions. */ entry->e_value_inum = 0; entry->e_value_size = 0; dirty = true; } if (dirty) { /* * Note that we are deliberately skipping csum calculation for * the final update because we do not expect any journal * restarts until xattr block is freed. 
*/ err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) ext4_warning_inode(parent, "handle dirty metadata err=%d", err); } brelse(iloc.bh); } /* * Release the xattr block BH: If the reference count is > 1, decrement it; * otherwise free the block. */ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, struct buffer_head *bh, struct ext4_xattr_inode_array **ea_inode_array, int extra_credits) { struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); u32 hash, ref; int error = 0; BUFFER_TRACE(bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (error) goto out; retry_ref: lock_buffer(bh); hash = le32_to_cpu(BHDR(bh)->h_hash); ref = le32_to_cpu(BHDR(bh)->h_refcount); if (ref == 1) { ea_bdebug(bh, "refcount now=0; freeing"); /* * This must happen under buffer lock for * ext4_xattr_block_set() to reliably detect freed block */ if (ea_block_cache) { struct mb_cache_entry *oe; oe = mb_cache_entry_delete_or_get(ea_block_cache, hash, bh->b_blocknr); if (oe) { unlock_buffer(bh); mb_cache_entry_wait_unused(oe); mb_cache_entry_put(ea_block_cache, oe); goto retry_ref; } } get_bh(bh); unlock_buffer(bh); if (ext4_has_feature_ea_inode(inode->i_sb)) ext4_xattr_inode_dec_ref_all(handle, inode, bh, BFIRST(bh), true /* block_csum */, ea_inode_array, extra_credits, true /* skip_quota */); ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } else { ref--; BHDR(bh)->h_refcount = cpu_to_le32(ref); if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) { struct mb_cache_entry *ce; if (ea_block_cache) { ce = mb_cache_entry_get(ea_block_cache, hash, bh->b_blocknr); if (ce) { set_bit(MBE_REUSABLE_B, &ce->e_flags); mb_cache_entry_put(ea_block_cache, ce); } } } ext4_xattr_block_csum_set(inode, bh); /* * Beware of this ugliness: Releasing of xattr block references * from different inodes can race and so we have to protect * from a race where someone else frees the block (and releases * its journal_head) before we are done dirtying the buffer. In * nojournal mode this race is harmless and we actually cannot * call ext4_handle_dirty_metadata() with locked buffer as * that function can call sync_dirty_buffer() so for that case * we handle the dirtying after unlocking the buffer. */ if (ext4_handle_valid(handle)) error = ext4_handle_dirty_metadata(handle, inode, bh); unlock_buffer(bh); if (!ext4_handle_valid(handle)) error = ext4_handle_dirty_metadata(handle, inode, bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1)); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); } out: ext4_std_error(inode->i_sb, error); return; } /* * Find the available free space for EAs. This also returns the total number of * bytes used by EA entries. */ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, size_t *min_offs, void *base, int *total) { for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { if (!last->e_value_inum && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < *min_offs) *min_offs = offs; } if (total) *total += EXT4_XATTR_LEN(last->e_name_len); } return (*min_offs - ((void *)last - base) - sizeof(__u32)); } /* * Write the value of the EA in an inode. 
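 *
 * Blocks are allocated up front with ext4_map_blocks(EXT4_GET_BLOCKS_CREATE)
 * (retrying on ENOSPC), then the value is copied in block-sized chunks
 * through ext4_getblk(); the tail of the last block is zeroed so that no
 * uninitialized memory reaches disk, and i_size/i_disksize are updated last.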
*/ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode, const void *buf, int bufsize) { struct buffer_head *bh = NULL; unsigned long block = 0; int blocksize = ea_inode->i_sb->s_blocksize; int max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits; int csize, wsize = 0; int ret = 0, ret2 = 0; int retries = 0; retry: while (ret >= 0 && ret < max_blocks) { struct ext4_map_blocks map; map.m_lblk = block += ret; map.m_len = max_blocks -= ret; ret = ext4_map_blocks(handle, ea_inode, &map, EXT4_GET_BLOCKS_CREATE); if (ret <= 0) { ext4_mark_inode_dirty(handle, ea_inode); if (ret == -ENOSPC && ext4_should_retry_alloc(ea_inode->i_sb, &retries)) { ret = 0; goto retry; } break; } } if (ret < 0) return ret; block = 0; while (wsize < bufsize) { brelse(bh); csize = (bufsize - wsize) > blocksize ? blocksize : bufsize - wsize; bh = ext4_getblk(handle, ea_inode, block, 0); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) { WARN_ON_ONCE(1); EXT4_ERROR_INODE(ea_inode, "ext4_getblk() return bh = NULL"); return -EFSCORRUPTED; } ret = ext4_journal_get_write_access(handle, ea_inode->i_sb, bh, EXT4_JTR_NONE); if (ret) goto out; memcpy(bh->b_data, buf, csize); /* * Zero out block tail to avoid writing uninitialized memory * to disk. */ if (csize < blocksize) memset(bh->b_data + csize, 0, blocksize - csize); set_buffer_uptodate(bh); ext4_handle_dirty_metadata(handle, ea_inode, bh); buf += csize; wsize += csize; block += 1; } inode_lock(ea_inode); i_size_write(ea_inode, wsize); ext4_update_i_disksize(ea_inode, wsize); inode_unlock(ea_inode); ret2 = ext4_mark_inode_dirty(handle, ea_inode); if (unlikely(ret2 && !ret)) ret = ret2; out: brelse(bh); return ret; } /* * Create an inode to store the value of a large EA. */ static struct inode *ext4_xattr_inode_create(handle_t *handle, struct inode *inode, u32 hash) { struct inode *ea_inode = NULL; uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) }; int err; if (inode->i_sb->s_root == NULL) { ext4_warning(inode->i_sb, "refuse to create EA inode when umounting"); WARN_ON(1); return ERR_PTR(-EINVAL); } /* * Let the next inode be the goal, so we try and allocate the EA inode * in the same group, or nearby one. */ ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, S_IFREG | 0600, NULL, inode->i_ino + 1, owner, EXT4_EA_INODE_FL); if (!IS_ERR(ea_inode)) { ea_inode->i_op = &ext4_file_inode_operations; ea_inode->i_fop = &ext4_file_operations; ext4_set_aops(ea_inode); ext4_xattr_inode_set_class(ea_inode); unlock_new_inode(ea_inode); ext4_xattr_inode_set_ref(ea_inode, 1); ext4_xattr_inode_set_hash(ea_inode, hash); err = ext4_mark_inode_dirty(handle, ea_inode); if (!err) err = ext4_inode_attach_jinode(ea_inode); if (err) { if (ext4_xattr_inode_dec_ref(handle, ea_inode)) ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err); iput(ea_inode); return ERR_PTR(err); } /* * Xattr inodes are shared therefore quota charging is performed * at a higher level. 
*/ dquot_free_inode(ea_inode); dquot_drop(ea_inode); inode_lock(ea_inode); ea_inode->i_flags |= S_NOQUOTA; inode_unlock(ea_inode); } return ea_inode; } static struct inode * ext4_xattr_inode_cache_find(struct inode *inode, const void *value, size_t value_len, u32 hash) { struct inode *ea_inode; struct mb_cache_entry *ce; struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode); void *ea_data; if (!ea_inode_cache) return NULL; ce = mb_cache_entry_find_first(ea_inode_cache, hash); if (!ce) return NULL; WARN_ON_ONCE(ext4_handle_valid(journal_current_handle()) && !(current->flags & PF_MEMALLOC_NOFS)); ea_data = kvmalloc(value_len, GFP_NOFS); if (!ea_data) { mb_cache_entry_put(ea_inode_cache, ce); return NULL; } while (ce) { ea_inode = ext4_iget(inode->i_sb, ce->e_value, EXT4_IGET_EA_INODE); if (IS_ERR(ea_inode)) goto next_entry; ext4_xattr_inode_set_class(ea_inode); if (i_size_read(ea_inode) == value_len && !ext4_xattr_inode_read(ea_inode, ea_data, value_len) && !ext4_xattr_inode_verify_hashes(ea_inode, NULL, ea_data, value_len) && !memcmp(value, ea_data, value_len)) { mb_cache_entry_touch(ea_inode_cache, ce); mb_cache_entry_put(ea_inode_cache, ce); kvfree(ea_data); return ea_inode; } iput(ea_inode); next_entry: ce = mb_cache_entry_find_next(ea_inode_cache, ce); } kvfree(ea_data); return NULL; } /* * Add value of the EA in an inode. */ static struct inode *ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, const void *value, size_t value_len) { struct inode *ea_inode; u32 hash; int err; /* Account inode & space to quota even if sharing... */ err = ext4_xattr_inode_alloc_quota(inode, value_len); if (err) return ERR_PTR(err); hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len); ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash); if (ea_inode) { err = ext4_xattr_inode_inc_ref(handle, ea_inode); if (err) goto out_err; return ea_inode; } /* Create an inode for the EA value */ ea_inode = ext4_xattr_inode_create(handle, inode, hash); if (IS_ERR(ea_inode)) { ext4_xattr_inode_free_quota(inode, NULL, value_len); return ea_inode; } err = ext4_xattr_inode_write(handle, ea_inode, value, value_len); if (err) { if (ext4_xattr_inode_dec_ref(handle, ea_inode)) ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err); goto out_err; } if (EA_INODE_CACHE(inode)) mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash, ea_inode->i_ino, true /* reusable */); return ea_inode; out_err: iput(ea_inode); ext4_xattr_inode_free_quota(inode, NULL, value_len); return ERR_PTR(err); } /* * Reserve min(block_size/8, 1024) bytes for xattr entries/names if ea_inode * feature is enabled. */ #define EXT4_XATTR_BLOCK_RESERVE(inode) min(i_blocksize(inode)/8, 1024U) static int ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s, handle_t *handle, struct inode *inode, struct inode *new_ea_inode, bool is_block) { struct ext4_xattr_entry *last, *next; struct ext4_xattr_entry *here = s->here; size_t min_offs = s->end - s->base, name_len = strlen(i->name); int in_inode = i->in_inode; struct inode *old_ea_inode = NULL; size_t old_size, new_size; int ret; /* Space used by old and new values. */ old_size = (!s->not_found && !here->e_value_inum) ? EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) : 0; new_size = (i->value && !in_inode) ? EXT4_XATTR_SIZE(i->value_len) : 0; /* * Optimization for the simple case when old and new values have the * same padded sizes. Not applicable if external inodes are involved. 
*/ if (new_size && new_size == old_size) { size_t offs = le16_to_cpu(here->e_value_offs); void *val = s->base + offs; here->e_value_size = cpu_to_le32(i->value_len); if (i->value == EXT4_ZERO_XATTR_VALUE) { memset(val, 0, new_size); } else { memcpy(val, i->value, i->value_len); /* Clear padding bytes. */ memset(val + i->value_len, 0, new_size - i->value_len); } goto update_hash; } /* Compute min_offs and last. */ last = s->first; for (; !IS_LAST_ENTRY(last); last = next) { next = EXT4_XATTR_NEXT(last); if ((void *)next >= s->end) { EXT4_ERROR_INODE(inode, "corrupted xattr entries"); ret = -EFSCORRUPTED; goto out; } if (!last->e_value_inum && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < min_offs) min_offs = offs; } } /* Check whether we have enough space. */ if (i->value) { size_t free; free = min_offs - ((void *)last - s->base) - sizeof(__u32); if (!s->not_found) free += EXT4_XATTR_LEN(name_len) + old_size; if (free < EXT4_XATTR_LEN(name_len) + new_size) { ret = -ENOSPC; goto out; } /* * If storing the value in an external inode is an option, * reserve space for xattr entries/names in the external * attribute block so that a long value does not occupy the * whole space and prevent further entries being added. */ if (ext4_has_feature_ea_inode(inode->i_sb) && new_size && is_block && (min_offs + old_size - new_size) < EXT4_XATTR_BLOCK_RESERVE(inode)) { ret = -ENOSPC; goto out; } } /* * Getting access to old and new ea inodes is subject to failures. * Finish that work before doing any modifications to the xattr data. */ if (!s->not_found && here->e_value_inum) { ret = ext4_xattr_inode_iget(inode, le32_to_cpu(here->e_value_inum), le32_to_cpu(here->e_hash), &old_ea_inode); if (ret) { old_ea_inode = NULL; goto out; } /* We are ready to release ref count on the old_ea_inode. */ ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode); if (ret) goto out; ext4_xattr_inode_free_quota(inode, old_ea_inode, le32_to_cpu(here->e_value_size)); } /* No failures allowed past this point. */ if (!s->not_found && here->e_value_size && !here->e_value_inum) { /* Remove the old value. */ void *first_val = s->base + min_offs; size_t offs = le16_to_cpu(here->e_value_offs); void *val = s->base + offs; memmove(first_val + old_size, first_val, val - first_val); memset(first_val, 0, old_size); min_offs += old_size; /* Adjust all value offsets. */ last = s->first; while (!IS_LAST_ENTRY(last)) { size_t o = le16_to_cpu(last->e_value_offs); if (!last->e_value_inum && last->e_value_size && o < offs) last->e_value_offs = cpu_to_le16(o + old_size); last = EXT4_XATTR_NEXT(last); } } if (!i->value) { /* Remove old name. */ size_t size = EXT4_XATTR_LEN(name_len); last = ENTRY((void *)last - size); memmove(here, (void *)here + size, (void *)last - (void *)here + sizeof(__u32)); memset(last, 0, size); /* * Update i_inline_off - moved ibody region might contain * system.data attribute. Handling a failure here won't * cause other complications for setting an xattr. */ if (!is_block && ext4_has_inline_data(inode)) { ret = ext4_find_inline_data_nolock(inode); if (ret) { ext4_warning_inode(inode, "unable to update i_inline_off"); goto out; } } } else if (s->not_found) { /* Insert new name. */ size_t size = EXT4_XATTR_LEN(name_len); size_t rest = (void *)last - (void *)here + sizeof(__u32); memmove((void *)here + size, here, rest); memset(here, 0, size); here->e_name_index = i->name_index; here->e_name_len = name_len; memcpy(here->e_name, i->name, name_len); } else { /* This is an update, reset value info. 
*/ here->e_value_inum = 0; here->e_value_offs = 0; here->e_value_size = 0; } if (i->value) { /* Insert new value. */ if (in_inode) { here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino); } else if (i->value_len) { void *val = s->base + min_offs - new_size; here->e_value_offs = cpu_to_le16(min_offs - new_size); if (i->value == EXT4_ZERO_XATTR_VALUE) { memset(val, 0, new_size); } else { memcpy(val, i->value, i->value_len); /* Clear padding bytes. */ memset(val + i->value_len, 0, new_size - i->value_len); } } here->e_value_size = cpu_to_le32(i->value_len); } update_hash: if (i->value) { __le32 hash = 0; /* Entry hash calculation. */ if (in_inode) { __le32 crc32c_hash; /* * Feed crc32c hash instead of the raw value for entry * hash calculation. This is to avoid walking * potentially long value buffer again. */ crc32c_hash = cpu_to_le32( ext4_xattr_inode_get_hash(new_ea_inode)); hash = ext4_xattr_hash_entry(here->e_name, here->e_name_len, &crc32c_hash, 1); } else if (is_block) { __le32 *value = s->base + le16_to_cpu( here->e_value_offs); hash = ext4_xattr_hash_entry(here->e_name, here->e_name_len, value, new_size >> 2); } here->e_hash = hash; } if (is_block) ext4_xattr_rehash((struct ext4_xattr_header *)s->base); ret = 0; out: iput(old_ea_inode); return ret; } struct ext4_xattr_block_find { struct ext4_xattr_search s; struct buffer_head *bh; }; static int ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_block_find *bs) { struct super_block *sb = inode->i_sb; int error; ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", i->name_index, i->name, i->value, (long)i->value_len); if (EXT4_I(inode)->i_file_acl) { /* The inode already has an extended attribute block. */ bs->bh = ext4_sb_bread(sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bs->bh)) { error = PTR_ERR(bs->bh); bs->bh = NULL; return error; } ea_bdebug(bs->bh, "b_count=%d, refcount=%d", atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); error = ext4_xattr_check_block(inode, bs->bh); if (error) return error; /* Find the named attribute. 
*/ bs->s.base = BHDR(bs->bh); bs->s.first = BFIRST(bs->bh); bs->s.end = bs->bh->b_data + bs->bh->b_size; bs->s.here = bs->s.first; error = xattr_find_entry(inode, &bs->s.here, bs->s.end, i->name_index, i->name, 1); if (error && error != -ENODATA) return error; bs->s.not_found = error; } return 0; } static int ext4_xattr_block_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_block_find *bs) { struct super_block *sb = inode->i_sb; struct buffer_head *new_bh = NULL; struct ext4_xattr_search s_copy = bs->s; struct ext4_xattr_search *s = &s_copy; struct mb_cache_entry *ce = NULL; int error = 0; struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); struct inode *ea_inode = NULL, *tmp_inode; size_t old_ea_inode_quota = 0; unsigned int ea_ino; #define header(x) ((struct ext4_xattr_header *)(x)) /* If we need EA inode, prepare it before locking the buffer */ if (i->value && i->in_inode) { WARN_ON_ONCE(!i->value_len); ea_inode = ext4_xattr_inode_lookup_create(handle, inode, i->value, i->value_len); if (IS_ERR(ea_inode)) { error = PTR_ERR(ea_inode); ea_inode = NULL; goto cleanup; } } if (s->base) { int offset = (char *)s->here - bs->bh->b_data; BUFFER_TRACE(bs->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, sb, bs->bh, EXT4_JTR_NONE); if (error) goto cleanup; lock_buffer(bs->bh); if (header(s->base)->h_refcount == cpu_to_le32(1)) { __u32 hash = le32_to_cpu(BHDR(bs->bh)->h_hash); /* * This must happen under buffer lock for * ext4_xattr_block_set() to reliably detect modified * block */ if (ea_block_cache) { struct mb_cache_entry *oe; oe = mb_cache_entry_delete_or_get(ea_block_cache, hash, bs->bh->b_blocknr); if (oe) { /* * Xattr block is getting reused. Leave * it alone. */ mb_cache_entry_put(ea_block_cache, oe); goto clone_block; } } ea_bdebug(bs->bh, "modifying in-place"); error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode, true /* is_block */); ext4_xattr_block_csum_set(inode, bs->bh); unlock_buffer(bs->bh); if (error == -EFSCORRUPTED) goto bad_block; if (!error) error = ext4_handle_dirty_metadata(handle, inode, bs->bh); if (error) goto cleanup; goto inserted; } clone_block: unlock_buffer(bs->bh); ea_bdebug(bs->bh, "cloning"); s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS); error = -ENOMEM; if (s->base == NULL) goto cleanup; s->first = ENTRY(header(s->base)+1); header(s->base)->h_refcount = cpu_to_le32(1); s->here = ENTRY(s->base + offset); s->end = s->base + bs->bh->b_size; /* * If existing entry points to an xattr inode, we need * to prevent ext4_xattr_set_entry() from decrementing * ref count on it because the reference belongs to the * original block. In this case, make the entry look * like it has an empty value. */ if (!s->not_found && s->here->e_value_inum) { ea_ino = le32_to_cpu(s->here->e_value_inum); error = ext4_xattr_inode_iget(inode, ea_ino, le32_to_cpu(s->here->e_hash), &tmp_inode); if (error) goto cleanup; if (!ext4_test_inode_state(tmp_inode, EXT4_STATE_LUSTRE_EA_INODE)) { /* * Defer quota free call for previous * inode until success is guaranteed. */ old_ea_inode_quota = le32_to_cpu( s->here->e_value_size); } iput(tmp_inode); s->here->e_value_inum = 0; s->here->e_value_size = 0; } } else { /* Allocate a buffer where we construct the new block. 
*/ s->base = kzalloc(sb->s_blocksize, GFP_NOFS); error = -ENOMEM; if (s->base == NULL) goto cleanup; header(s->base)->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); header(s->base)->h_blocks = cpu_to_le32(1); header(s->base)->h_refcount = cpu_to_le32(1); s->first = ENTRY(header(s->base)+1); s->here = ENTRY(header(s->base)+1); s->end = s->base + sb->s_blocksize; } error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode, true /* is_block */); if (error == -EFSCORRUPTED) goto bad_block; if (error) goto cleanup; inserted: if (!IS_LAST_ENTRY(s->first)) { new_bh = ext4_xattr_block_cache_find(inode, header(s->base), &ce); if (IS_ERR(new_bh)) { error = PTR_ERR(new_bh); new_bh = NULL; goto cleanup; } if (new_bh) { /* We found an identical block in the cache. */ if (new_bh == bs->bh) ea_bdebug(new_bh, "keeping"); else { u32 ref; #ifdef EXT4_XATTR_DEBUG WARN_ON_ONCE(dquot_initialize_needed(inode)); #endif /* The old block is released after updating the inode. */ error = dquot_alloc_block(inode, EXT4_C2B(EXT4_SB(sb), 1)); if (error) goto cleanup; BUFFER_TRACE(new_bh, "get_write_access"); error = ext4_journal_get_write_access( handle, sb, new_bh, EXT4_JTR_NONE); if (error) goto cleanup_dquot; lock_buffer(new_bh); /* * We have to be careful about races with * adding references to xattr block. Once we * hold buffer lock xattr block's state is * stable so we can check the additional * reference fits. */ ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1; if (ref > EXT4_XATTR_REFCOUNT_MAX) { /* * Undo everything and check mbcache * again. */ unlock_buffer(new_bh); dquot_free_block(inode, EXT4_C2B(EXT4_SB(sb), 1)); brelse(new_bh); mb_cache_entry_put(ea_block_cache, ce); ce = NULL; new_bh = NULL; goto inserted; } BHDR(new_bh)->h_refcount = cpu_to_le32(ref); if (ref == EXT4_XATTR_REFCOUNT_MAX) clear_bit(MBE_REUSABLE_B, &ce->e_flags); ea_bdebug(new_bh, "reusing; refcount now=%d", ref); ext4_xattr_block_csum_set(inode, new_bh); unlock_buffer(new_bh); error = ext4_handle_dirty_metadata(handle, inode, new_bh); if (error) goto cleanup_dquot; } mb_cache_entry_touch(ea_block_cache, ce); mb_cache_entry_put(ea_block_cache, ce); ce = NULL; } else if (bs->bh && s->base == bs->bh->b_data) { /* We were modifying this block in-place. */ ea_bdebug(bs->bh, "keeping this block"); ext4_xattr_block_cache_insert(ea_block_cache, bs->bh); new_bh = bs->bh; get_bh(new_bh); } else { /* We need to allocate a new block */ ext4_fsblk_t goal, block; #ifdef EXT4_XATTR_DEBUG WARN_ON_ONCE(dquot_initialize_needed(inode)); #endif goal = ext4_group_first_block_no(sb, EXT4_I(inode)->i_block_group); block = ext4_new_meta_blocks(handle, inode, goal, 0, NULL, &error); if (error) goto cleanup; ea_idebug(inode, "creating block %llu", (unsigned long long)block); new_bh = sb_getblk(sb, block); if (unlikely(!new_bh)) { error = -ENOMEM; getblk_failed: ext4_free_blocks(handle, inode, NULL, block, 1, EXT4_FREE_BLOCKS_METADATA); goto cleanup; } error = ext4_xattr_inode_inc_ref_all(handle, inode, ENTRY(header(s->base)+1)); if (error) goto getblk_failed; if (ea_inode) { /* Drop the extra ref on ea_inode. 
*/ error = ext4_xattr_inode_dec_ref(handle, ea_inode); if (error) ext4_warning_inode(ea_inode, "dec ref error=%d", error); iput(ea_inode); ea_inode = NULL; } lock_buffer(new_bh); error = ext4_journal_get_create_access(handle, sb, new_bh, EXT4_JTR_NONE); if (error) { unlock_buffer(new_bh); error = -EIO; goto getblk_failed; } memcpy(new_bh->b_data, s->base, new_bh->b_size); ext4_xattr_block_csum_set(inode, new_bh); set_buffer_uptodate(new_bh); unlock_buffer(new_bh); ext4_xattr_block_cache_insert(ea_block_cache, new_bh); error = ext4_handle_dirty_metadata(handle, inode, new_bh); if (error) goto cleanup; } } if (old_ea_inode_quota) ext4_xattr_inode_free_quota(inode, NULL, old_ea_inode_quota); /* Update the inode. */ EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; /* Drop the previous xattr block. */ if (bs->bh && bs->bh != new_bh) { struct ext4_xattr_inode_array *ea_inode_array = NULL; ext4_xattr_release_block(handle, inode, bs->bh, &ea_inode_array, 0 /* extra_credits */); ext4_xattr_inode_array_free(ea_inode_array); } error = 0; cleanup: if (ea_inode) { if (error) { int error2; error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); if (error2) ext4_warning_inode(ea_inode, "dec ref error=%d", error2); ext4_xattr_inode_free_quota(inode, ea_inode, i_size_read(ea_inode)); } iput(ea_inode); } if (ce) mb_cache_entry_put(ea_block_cache, ce); brelse(new_bh); if (!(bs->bh && s->base == bs->bh->b_data)) kfree(s->base); return error; cleanup_dquot: dquot_free_block(inode, EXT4_C2B(EXT4_SB(sb), 1)); goto cleanup; bad_block: EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); goto cleanup; #undef header } int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; int error; if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) return 0; raw_inode = ext4_raw_inode(&is->iloc); header = IHDR(inode, raw_inode); is->s.base = is->s.first = IFIRST(header); is->s.here = is->s.first; is->s.end = ITAIL(inode, raw_inode); if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { /* Find the named attribute. 
*/ error = xattr_find_entry(inode, &is->s.here, is->s.end, i->name_index, i->name, 0); if (error && error != -ENODATA) return error; is->s.not_found = error; } return 0; } int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_xattr_search *s = &is->s; struct inode *ea_inode = NULL; int error; if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) return -ENOSPC; /* If we need EA inode, prepare it before locking the buffer */ if (i->value && i->in_inode) { WARN_ON_ONCE(!i->value_len); ea_inode = ext4_xattr_inode_lookup_create(handle, inode, i->value, i->value_len); if (IS_ERR(ea_inode)) return PTR_ERR(ea_inode); } error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode, false /* is_block */); if (error) { if (ea_inode) { int error2; error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); if (error2) ext4_warning_inode(ea_inode, "dec ref error=%d", error2); ext4_xattr_inode_free_quota(inode, ea_inode, i_size_read(ea_inode)); iput(ea_inode); } return error; } header = IHDR(inode, ext4_raw_inode(&is->iloc)); if (!IS_LAST_ENTRY(s->first)) { header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); ext4_set_inode_state(inode, EXT4_STATE_XATTR); } else { header->h_magic = cpu_to_le32(0); ext4_clear_inode_state(inode, EXT4_STATE_XATTR); } iput(ea_inode); return 0; } static int ext4_xattr_value_same(struct ext4_xattr_search *s, struct ext4_xattr_info *i) { void *value; /* When e_value_inum is set the value is stored externally. */ if (s->here->e_value_inum) return 0; if (le32_to_cpu(s->here->e_value_size) != i->value_len) return 0; value = ((void *)s->base) + le16_to_cpu(s->here->e_value_offs); return !memcmp(value, i->value, i->value_len); } static struct buffer_head *ext4_xattr_get_block(struct inode *inode) { struct buffer_head *bh; int error; if (!EXT4_I(inode)->i_file_acl) return NULL; bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) return bh; error = ext4_xattr_check_block(inode, bh); if (error) { brelse(bh); return ERR_PTR(error); } return bh; } /* * ext4_xattr_set_handle() * * Create, replace or remove an extended attribute for this inode. Value * is NULL to remove an existing extended attribute, and non-NULL to * either replace an existing extended attribute, or create a new extended * attribute. The flags XATTR_REPLACE and XATTR_CREATE * specify that an extended attribute must exist and must not exist * previous to the call, respectively. * * Returns 0, or a negative error number on failure. */ int ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, const char *name, const void *value, size_t value_len, int flags) { struct ext4_xattr_info i = { .name_index = name_index, .name = name, .value = value, .value_len = value_len, .in_inode = 0, }; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_block_find bs = { .s = { .not_found = -ENODATA, }, }; int no_expand; int error; if (!name) return -EINVAL; if (strlen(name) > 255) return -ERANGE; ext4_write_lock_xattr(inode, &no_expand); /* Check journal credits under write lock. 
*/ if (ext4_handle_valid(handle)) { struct buffer_head *bh; int credits; bh = ext4_xattr_get_block(inode); if (IS_ERR(bh)) { error = PTR_ERR(bh); goto cleanup; } credits = __ext4_xattr_set_credits(inode->i_sb, inode, bh, value_len, flags & XATTR_CREATE); brelse(bh); if (jbd2_handle_buffer_credits(handle) < credits) { error = -ENOSPC; goto cleanup; } WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS)); } error = ext4_reserve_inode_write(handle, inode, &is.iloc); if (error) goto cleanup; if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) { struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); ext4_clear_inode_state(inode, EXT4_STATE_NEW); } error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto cleanup; if (is.s.not_found) error = ext4_xattr_block_find(inode, &i, &bs); if (error) goto cleanup; if (is.s.not_found && bs.s.not_found) { error = -ENODATA; if (flags & XATTR_REPLACE) goto cleanup; error = 0; if (!value) goto cleanup; } else { error = -EEXIST; if (flags & XATTR_CREATE) goto cleanup; } if (!value) { if (!is.s.not_found) error = ext4_xattr_ibody_set(handle, inode, &i, &is); else if (!bs.s.not_found) error = ext4_xattr_block_set(handle, inode, &i, &bs); } else { error = 0; /* Xattr value did not change? Save us some work and bail out */ if (!is.s.not_found && ext4_xattr_value_same(&is.s, &i)) goto cleanup; if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i)) goto cleanup; if (ext4_has_feature_ea_inode(inode->i_sb) && (EXT4_XATTR_SIZE(i.value_len) > EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize))) i.in_inode = 1; retry_inode: error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (!error && !bs.s.not_found) { i.value = NULL; error = ext4_xattr_block_set(handle, inode, &i, &bs); } else if (error == -ENOSPC) { if (EXT4_I(inode)->i_file_acl && !bs.s.base) { brelse(bs.bh); bs.bh = NULL; error = ext4_xattr_block_find(inode, &i, &bs); if (error) goto cleanup; } error = ext4_xattr_block_set(handle, inode, &i, &bs); if (!error && !is.s.not_found) { i.value = NULL; error = ext4_xattr_ibody_set(handle, inode, &i, &is); } else if (error == -ENOSPC) { /* * Xattr does not fit in the block, store at * external inode if possible. */ if (ext4_has_feature_ea_inode(inode->i_sb) && i.value_len && !i.in_inode) { i.in_inode = 1; goto retry_inode; } } } } if (!error) { ext4_xattr_update_super_block(handle, inode->i_sb); inode_set_ctime_current(inode); inode_inc_iversion(inode); if (!value) no_expand = 0; error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); /* * The bh is consumed by ext4_mark_iloc_dirty, even with * error != 0. */ is.iloc.bh = NULL; if (IS_SYNC(inode)) ext4_handle_sync(handle); } ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); cleanup: brelse(is.iloc.bh); brelse(bs.bh); ext4_write_unlock_xattr(inode, &no_expand); return error; } int ext4_xattr_set_credits(struct inode *inode, size_t value_len, bool is_create, int *credits) { struct buffer_head *bh; int err; *credits = 0; if (!EXT4_SB(inode->i_sb)->s_journal) return 0; down_read(&EXT4_I(inode)->xattr_sem); bh = ext4_xattr_get_block(inode); if (IS_ERR(bh)) { err = PTR_ERR(bh); } else { *credits = __ext4_xattr_set_credits(inode->i_sb, inode, bh, value_len, is_create); brelse(bh); err = 0; } up_read(&EXT4_I(inode)->xattr_sem); return err; } /* * ext4_xattr_set() * * Like ext4_xattr_set_handle, but start from an inode. This extended * attribute modification is a filesystem transaction by itself. 
* * Returns 0, or a negative error number on failure. */ int ext4_xattr_set(struct inode *inode, int name_index, const char *name, const void *value, size_t value_len, int flags) { handle_t *handle; struct super_block *sb = inode->i_sb; int error, retries = 0; int credits; error = dquot_initialize(inode); if (error) return error; retry: error = ext4_xattr_set_credits(inode, value_len, flags & XATTR_CREATE, &credits); if (error) return error; handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) { error = PTR_ERR(handle); } else { int error2; error = ext4_xattr_set_handle(handle, inode, name_index, name, value, value_len, flags); ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); error2 = ext4_journal_stop(handle); if (error == -ENOSPC && ext4_should_retry_alloc(sb, &retries)) goto retry; if (error == 0) error = error2; } return error; } /* * Shift the EA entries in the inode to create space for the increased * i_extra_isize. */ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, int value_offs_shift, void *to, void *from, size_t n) { struct ext4_xattr_entry *last = entry; int new_offs; /* We always shift xattr headers further thus offsets get lower */ BUG_ON(value_offs_shift > 0); /* Adjust the value offsets of the entries */ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { if (!last->e_value_inum && last->e_value_size) { new_offs = le16_to_cpu(last->e_value_offs) + value_offs_shift; last->e_value_offs = cpu_to_le16(new_offs); } } /* Shift the entries by n bytes */ memmove(to, from, n); } /* * Move xattr pointed to by 'entry' from inode into external xattr block */ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, struct ext4_inode *raw_inode, struct ext4_xattr_entry *entry) { struct ext4_xattr_ibody_find *is = NULL; struct ext4_xattr_block_find *bs = NULL; char *buffer = NULL, *b_entry_name = NULL; size_t value_size = le32_to_cpu(entry->e_value_size); struct ext4_xattr_info i = { .value = NULL, .value_len = 0, .name_index = entry->e_name_index, .in_inode = !!entry->e_value_inum, }; struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode); int needs_kvfree = 0; int error; is = kzalloc_obj(struct ext4_xattr_ibody_find, GFP_NOFS); bs = kzalloc_obj(struct ext4_xattr_block_find, GFP_NOFS); b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); if (!is || !bs || !b_entry_name) { error = -ENOMEM; goto out; } is->s.not_found = -ENODATA; bs->s.not_found = -ENODATA; is->iloc.bh = NULL; bs->bh = NULL; /* Save the entry name and the entry value */ if (entry->e_value_inum) { buffer = kvmalloc(value_size, GFP_NOFS); if (!buffer) { error = -ENOMEM; goto out; } needs_kvfree = 1; error = ext4_xattr_inode_get(inode, entry, buffer, value_size); if (error) goto out; } else { size_t value_offs = le16_to_cpu(entry->e_value_offs); buffer = (void *)IFIRST(header) + value_offs; } memcpy(b_entry_name, entry->e_name, entry->e_name_len); b_entry_name[entry->e_name_len] = '\0'; i.name = b_entry_name; error = ext4_get_inode_loc(inode, &is->iloc); if (error) goto out; error = ext4_xattr_ibody_find(inode, &i, is); if (error) goto out; i.value = buffer; i.value_len = value_size; error = ext4_xattr_block_find(inode, &i, bs); if (error) goto out; /* Move ea entry from the inode into the block */ error = ext4_xattr_block_set(handle, inode, &i, bs); if (error) goto out; /* Remove the chosen entry from the inode */ i.value = NULL; i.value_len = 0; error = ext4_xattr_ibody_set(handle, inode, &i, is); out: 
kfree(b_entry_name); if (needs_kvfree && buffer) kvfree(buffer); if (is) brelse(is->iloc.bh); if (bs) brelse(bs->bh); kfree(is); kfree(bs); return error; } static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode, struct ext4_inode *raw_inode, int isize_diff, size_t ifree, size_t bfree, int *total_ino) { struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode); struct ext4_xattr_entry *small_entry; struct ext4_xattr_entry *entry; struct ext4_xattr_entry *last; unsigned int entry_size; /* EA entry size */ unsigned int total_size; /* EA entry size + value size */ unsigned int min_total_size; int error; while (isize_diff > ifree) { entry = NULL; small_entry = NULL; min_total_size = ~0U; last = IFIRST(header); /* Find the entry best suited to be pushed into EA block */ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { /* never move system.data out of the inode */ if ((last->e_name_len == 4) && (last->e_name_index == EXT4_XATTR_INDEX_SYSTEM) && !memcmp(last->e_name, "data", 4)) continue; total_size = EXT4_XATTR_LEN(last->e_name_len); if (!last->e_value_inum) total_size += EXT4_XATTR_SIZE( le32_to_cpu(last->e_value_size)); if (total_size <= bfree && total_size < min_total_size) { if (total_size + ifree < isize_diff) { small_entry = last; } else { entry = last; min_total_size = total_size; } } } if (entry == NULL) { if (small_entry == NULL) return -ENOSPC; entry = small_entry; } entry_size = EXT4_XATTR_LEN(entry->e_name_len); total_size = entry_size; if (!entry->e_value_inum) total_size += EXT4_XATTR_SIZE( le32_to_cpu(entry->e_value_size)); error = ext4_xattr_move_to_block(handle, inode, raw_inode, entry); if (error) return error; *total_ino -= entry_size; ifree += total_size; bfree -= total_size; } return 0; } /* * Expand an inode by new_extra_isize bytes when EAs are present. * Returns 0 on success or negative error number on failure. */ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle) { struct ext4_xattr_ibody_header *header; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); static unsigned int mnt_count; size_t min_offs; size_t ifree, bfree; int total_ino; void *base, *end; int error = 0, tried_min_extra_isize = 0; int s_min_extra_isize = le16_to_cpu(sbi->s_es->s_min_extra_isize); int isize_diff; /* How much do we need to grow i_extra_isize */ retry: isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize; if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) return 0; header = IHDR(inode, raw_inode); /* * Check if enough free space is available in the inode to shift the * entries ahead by new_extra_isize. */ base = IFIRST(header); end = ITAIL(inode, raw_inode); min_offs = end - base; total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32); ifree = ext4_xattr_free_space(base, &min_offs, base, &total_ino); if (ifree >= isize_diff) goto shift; /* * Enough free space isn't available in the inode, check if * EA block can hold new_extra_isize bytes. 
*/ if (EXT4_I(inode)->i_file_acl) { struct buffer_head *bh; bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) { error = PTR_ERR(bh); goto cleanup; } error = ext4_xattr_check_block(inode, bh); if (error) { brelse(bh); goto cleanup; } base = BHDR(bh); end = bh->b_data + bh->b_size; min_offs = end - base; bfree = ext4_xattr_free_space(BFIRST(bh), &min_offs, base, NULL); brelse(bh); if (bfree + ifree < isize_diff) { if (!tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; goto retry; } error = -ENOSPC; goto cleanup; } } else { bfree = inode->i_sb->s_blocksize; } error = ext4_xattr_make_inode_space(handle, inode, raw_inode, isize_diff, ifree, bfree, &total_ino); if (error) { if (error == -ENOSPC && !tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; goto retry; } goto cleanup; } shift: /* Adjust the offsets and shift the remaining entries ahead */ ext4_xattr_shift_entries(IFIRST(header), EXT4_I(inode)->i_extra_isize - new_extra_isize, (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, (void *)header, total_ino); EXT4_I(inode)->i_extra_isize = new_extra_isize; if (ext4_has_inline_data(inode)) error = ext4_find_inline_data_nolock(inode); cleanup: if (error && (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count))) { ext4_warning(inode->i_sb, "Unable to expand inode %llu. Delete some EAs or run e2fsck.", inode->i_ino); mnt_count = le16_to_cpu(sbi->s_es->s_mnt_count); } return error; } #define EIA_INCR 16 /* must be 2^n */ #define EIA_MASK (EIA_INCR - 1) /* Add the large xattr @inode into @ea_inode_array for deferred iput(). * If @ea_inode_array is new or full it will be grown and the old * contents copied over. */ static int ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, struct inode *inode) { if (*ea_inode_array == NULL) { /* * Start with 15 inodes, so it fits into a power-of-two size. */ (*ea_inode_array) = kmalloc_flex(**ea_inode_array, inodes, EIA_MASK, GFP_NOFS); if (*ea_inode_array == NULL) return -ENOMEM; (*ea_inode_array)->count = 0; } else if (((*ea_inode_array)->count & EIA_MASK) == EIA_MASK) { /* expand the array once all 15 + n * 16 slots are full */ struct ext4_xattr_inode_array *new_array = NULL; new_array = kmalloc_flex(**ea_inode_array, inodes, (*ea_inode_array)->count + EIA_INCR, GFP_NOFS); if (new_array == NULL) return -ENOMEM; memcpy(new_array, *ea_inode_array, struct_size(*ea_inode_array, inodes, (*ea_inode_array)->count)); kfree(*ea_inode_array); *ea_inode_array = new_array; } (*ea_inode_array)->count++; (*ea_inode_array)->inodes[(*ea_inode_array)->count - 1] = inode; return 0; } /* * ext4_xattr_delete_inode() * * Free extended attribute resources associated with this inode. Traverse * all entries and decrement reference on any xattr inodes associated with this * inode. This is called immediately before an inode is freed. We have exclusive * access to the inode. If an orphan inode is deleted it will also release its * references on xattr block and xattr inodes. 
*/ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, struct ext4_xattr_inode_array **ea_inode_array, int extra_credits) { struct buffer_head *bh = NULL; struct ext4_xattr_ibody_header *header; struct ext4_iloc iloc = { .bh = NULL }; struct ext4_xattr_entry *entry; struct inode *ea_inode; int error; error = ext4_journal_ensure_credits(handle, extra_credits, ext4_free_metadata_revoke_credits(inode->i_sb, 1)); if (error < 0) { EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error); goto cleanup; } if (ext4_has_feature_ea_inode(inode->i_sb) && ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { error = ext4_get_inode_loc(inode, &iloc); if (error) { EXT4_ERROR_INODE(inode, "inode loc (error %d)", error); goto cleanup; } error = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, EXT4_JTR_NONE); if (error) { EXT4_ERROR_INODE(inode, "write access (error %d)", error); goto cleanup; } header = IHDR(inode, ext4_raw_inode(&iloc)); if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC)) ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh, IFIRST(header), false /* block_csum */, ea_inode_array, extra_credits, false /* skip_quota */); } if (EXT4_I(inode)->i_file_acl) { bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) { error = PTR_ERR(bh); if (error == -EIO) { EXT4_ERROR_INODE_ERR(inode, EIO, "block %llu read error", EXT4_I(inode)->i_file_acl); } bh = NULL; goto cleanup; } error = ext4_xattr_check_block(inode, bh); if (error) goto cleanup; if (ext4_has_feature_ea_inode(inode->i_sb)) { for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; error = ext4_xattr_inode_iget(inode, le32_to_cpu(entry->e_value_inum), le32_to_cpu(entry->e_hash), &ea_inode); if (error) continue; ext4_xattr_inode_free_quota(inode, ea_inode, le32_to_cpu(entry->e_value_size)); iput(ea_inode); } } ext4_xattr_release_block(handle, inode, bh, ea_inode_array, extra_credits); /* * Update i_file_acl value in the same transaction that releases * block. */ EXT4_I(inode)->i_file_acl = 0; error = ext4_mark_inode_dirty(handle, inode); if (error) { EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)", error); goto cleanup; } ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); } error = 0; cleanup: brelse(iloc.bh); brelse(bh); return error; } void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array) { int idx; if (ea_inode_array == NULL) return; for (idx = 0; idx < ea_inode_array->count; ++idx) iput(ea_inode_array->inodes[idx]); kfree(ea_inode_array); } /* * ext4_xattr_block_cache_insert() * * Create a new entry in the extended attribute block cache, and insert * it unless such an entry is already in the cache. */ static void ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache, struct buffer_head *bh) { struct ext4_xattr_header *header = BHDR(bh); __u32 hash = le32_to_cpu(header->h_hash); int reusable = le32_to_cpu(header->h_refcount) < EXT4_XATTR_REFCOUNT_MAX; int error; if (!ea_block_cache) return; error = mb_cache_entry_create(ea_block_cache, GFP_NOFS, hash, bh->b_blocknr, reusable); if (error) { if (error == -EBUSY) ea_bdebug(bh, "already in cache"); } else ea_bdebug(bh, "inserting [%x]", (int)hash); } /* * ext4_xattr_cmp() * * Compare two extended attribute blocks for equality. * * Returns 0 if the blocks are equal, 1 if they differ. 
 */
static int
ext4_xattr_cmp(struct ext4_xattr_header *header1,
	       struct ext4_xattr_header *header2)
{
	struct ext4_xattr_entry *entry1, *entry2;

	entry1 = ENTRY(header1+1);
	entry2 = ENTRY(header2+1);
	while (!IS_LAST_ENTRY(entry1)) {
		if (IS_LAST_ENTRY(entry2))
			return 1;
		if (entry1->e_hash != entry2->e_hash ||
		    entry1->e_name_index != entry2->e_name_index ||
		    entry1->e_name_len != entry2->e_name_len ||
		    entry1->e_value_size != entry2->e_value_size ||
		    entry1->e_value_inum != entry2->e_value_inum ||
		    memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
			return 1;
		if (!entry1->e_value_inum &&
		    memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
			   (char *)header2 + le16_to_cpu(entry2->e_value_offs),
			   le32_to_cpu(entry1->e_value_size)))
			return 1;

		entry1 = EXT4_XATTR_NEXT(entry1);
		entry2 = EXT4_XATTR_NEXT(entry2);
	}
	if (!IS_LAST_ENTRY(entry2))
		return 1;
	return 0;
}

/*
 * ext4_xattr_block_cache_find()
 *
 * Find an identical extended attribute block.
 *
 * Returns a pointer to the block found, or NULL if such a block was not
 * found, or an error pointer if an error occurred while reading ea block.
 */
static struct buffer_head *
ext4_xattr_block_cache_find(struct inode *inode,
			    struct ext4_xattr_header *header,
			    struct mb_cache_entry **pce)
{
	__u32 hash = le32_to_cpu(header->h_hash);
	struct mb_cache_entry *ce;
	struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);

	if (!ea_block_cache)
		return NULL;
	if (!header->h_hash)
		return NULL;  /* never share */
	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
	ce = mb_cache_entry_find_first(ea_block_cache, hash);
	while (ce) {
		struct buffer_head *bh;

		bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO);
		if (IS_ERR(bh)) {
			if (PTR_ERR(bh) != -ENOMEM)
				EXT4_ERROR_INODE(inode, "block %lu read error",
						 (unsigned long)ce->e_value);
			mb_cache_entry_put(ea_block_cache, ce);
			return bh;
		} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
			*pce = ce;
			return bh;
		}
		brelse(bh);
		ce = mb_cache_entry_find_next(ea_block_cache, ce);
	}
	return NULL;
}

#define NAME_HASH_SHIFT 5
#define VALUE_HASH_SHIFT 16

/*
 * ext4_xattr_hash_entry()
 *
 * Compute the hash of an extended attribute.
 */
static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value,
				    size_t value_count)
{
	__u32 hash = 0;

	while (name_len--) {
		hash = (hash << NAME_HASH_SHIFT) ^
		       (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
		       (unsigned char)*name++;
	}
	while (value_count--) {
		hash = (hash << VALUE_HASH_SHIFT) ^
		       (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
		       le32_to_cpu(*value++);
	}
	return cpu_to_le32(hash);
}

/*
 * ext4_xattr_hash_entry_signed()
 *
 * Compute the hash of an extended attribute incorrectly.
 */
static __le32 ext4_xattr_hash_entry_signed(char *name, size_t name_len,
					   __le32 *value, size_t value_count)
{
	__u32 hash = 0;

	while (name_len--) {
		hash = (hash << NAME_HASH_SHIFT) ^
		       (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
		       (signed char)*name++;
	}
	while (value_count--) {
		hash = (hash << VALUE_HASH_SHIFT) ^
		       (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
		       le32_to_cpu(*value++);
	}
	return cpu_to_le32(hash);
}

#undef NAME_HASH_SHIFT
#undef VALUE_HASH_SHIFT

#define BLOCK_HASH_SHIFT 16

/*
 * ext4_xattr_rehash()
 *
 * Re-compute the extended attribute hash value after an entry has changed.
 */
static void ext4_xattr_rehash(struct ext4_xattr_header *header)
{
	struct ext4_xattr_entry *here;
	__u32 hash = 0;

	here = ENTRY(header+1);
	while (!IS_LAST_ENTRY(here)) {
		if (!here->e_hash) {
			/* Block is not shared if an entry's hash value == 0 */
			hash = 0;
			break;
		}
		hash = (hash << BLOCK_HASH_SHIFT) ^
		       (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
		       le32_to_cpu(here->e_hash);
		here = EXT4_XATTR_NEXT(here);
	}
	header->h_hash = cpu_to_le32(hash);
}

#undef BLOCK_HASH_SHIFT

#define	HASH_BUCKET_BITS	10

struct mb_cache *
ext4_xattr_create_cache(void)
{
	return mb_cache_create(HASH_BUCKET_BITS);
}

void ext4_xattr_destroy_cache(struct mb_cache *cache)
{
	if (cache)
		mb_cache_destroy(cache);
}
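/*
 * Illustrative userspace sketch (not part of fs/ext4/xattr.c): the
 * XATTR_CREATE / XATTR_REPLACE semantics documented above for
 * ext4_xattr_set_handle() are what setxattr(2) passes down from user
 * space.  The file path and the "user.comment" attribute name below are
 * arbitrary examples; the file must already exist on an xattr-capable
 * filesystem for the calls to succeed.
 */
#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/tmp/xattr-demo";	/* arbitrary example file */
	const char *name = "user.comment";	/* attribute in the "user." namespace */
	char buf[64];
	ssize_t len;

	/* XATTR_CREATE: fails with EEXIST if the attribute already exists. */
	if (setxattr(path, name, "hello", 5, XATTR_CREATE) != 0)
		perror("setxattr(XATTR_CREATE)");

	/* XATTR_REPLACE: fails with ENODATA if the attribute does not exist. */
	if (setxattr(path, name, "world", 5, XATTR_REPLACE) != 0)
		perror("setxattr(XATTR_REPLACE)");

	/*
	 * Read the value back.  On ext4, a sufficiently large value would be
	 * stored in an external xattr block or an EA inode, transparently to
	 * user space.
	 */
	len = getxattr(path, name, buf, sizeof(buf));
	if (len >= 0)
		printf("%s = %.*s\n", name, (int)len, buf);
	else
		perror("getxattr");

	return 0;
}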
2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 | // SPDX-License-Identifier: GPL-2.0+ /* * User-space Probes (UProbes) * * Copyright (C) IBM Corporation, 2008-2012 * Authors: * Srikar Dronamraju * Jim Keniston * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra */ #include <linux/kernel.h> #include <linux/highmem.h> #include <linux/pagemap.h> /* read_mapping_page */ #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/export.h> #include <linux/rmap.h> /* anon_vma_prepare */ #include <linux/mmu_notifier.h> #include <linux/swap.h> /* folio_free_swap */ #include <linux/ptrace.h> /* user_enable_single_step */ #include <linux/kdebug.h> /* notifier mechanism */ #include <linux/percpu-rwsem.h> #include <linux/task_work.h> #include 
#include <linux/khugepaged.h>
#include <linux/rcupdate_trace.h>
#include <linux/workqueue.h>
#include <linux/srcu.h>
#include <linux/oom.h>		/* check_stable_address_space */
#include <linux/pagewalk.h>

#include <linux/uprobes.h>

#define UINSNS_PER_PAGE			(PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
#define MAX_UPROBE_XOL_SLOTS		UINSNS_PER_PAGE

static struct rb_root uprobes_tree = RB_ROOT;
/*
 * allows us to skip the uprobe_mmap if there are no uprobe events active
 * at this time.  Probably a fine grained per inode count is better?
 */
#define no_uprobe_events()	RB_EMPTY_ROOT(&uprobes_tree)

static DEFINE_RWLOCK(uprobes_treelock);	/* serialize rbtree access */
static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock);

#define UPROBES_HASH_SZ	13
/* serialize uprobe->pending_list */
static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
#define uprobes_mmap_hash(v)	(&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])

DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);

/* Covers return_instance's uprobe lifetime. */
DEFINE_STATIC_SRCU(uretprobes_srcu);

/* Have a copy of original instruction */
#define UPROBE_COPY_INSN	0

struct uprobe {
	struct rb_node		rb_node;	/* node in the rb tree */
	refcount_t		ref;
	struct rw_semaphore	register_rwsem;
	struct rw_semaphore	consumer_rwsem;
	struct list_head	pending_list;
	struct list_head	consumers;
	struct inode		*inode;		/* Also hold a ref to inode */
	union {
		struct rcu_head		rcu;
		struct work_struct	work;
	};
	loff_t			offset;
	loff_t			ref_ctr_offset;
	unsigned long		flags;		/* "unsigned long" so bitops work */

	/*
	 * The generic code assumes that it has two members of unknown type
	 * owned by the arch-specific code:
	 *
	 *	insn -	copy_insn() saves the original instruction here for
	 *		arch_uprobe_analyze_insn().
	 *
	 *	ixol -	potentially modified instruction to execute out of
	 *		line, copied to xol_area by xol_get_insn_slot().
	 */
	struct arch_uprobe	arch;
};

struct delayed_uprobe {
	struct list_head list;
	struct uprobe *uprobe;
	struct mm_struct *mm;
};

static DEFINE_MUTEX(delayed_uprobe_lock);
static LIST_HEAD(delayed_uprobe_list);

/*
 * Execute out of line area: anonymous executable mapping installed
 * by the probed task to execute the copy of the original instruction
 * mangled by set_swbp().
 *
 * On a breakpoint hit, thread contests for a slot.  It frees the
 * slot after singlestep. Currently a fixed number of slots are
 * allocated.
 */
struct xol_area {
	wait_queue_head_t		wq;		/* if all slots are busy */
	unsigned long			*bitmap;	/* 0 = free slot */

	struct page			*page;
	/*
	 * We keep the vma's vm_start rather than a pointer to the vma
	 * itself.  The probed process or a naughty kernel module could make
	 * the vma go away, and we must handle that reasonably gracefully.
	 */
	unsigned long			vaddr;		/* Page(s) of instruction slots */
};

static void uprobe_warn(struct task_struct *t, const char *msg)
{
	pr_warn("uprobe: %s:%d failed to %s\n", t->comm, t->pid, msg);
}

/*
 * valid_vma: Verify if the specified vma is an executable vma
 * Relax restrictions while unregistering: vm_flags might have
 * changed after breakpoint was inserted.
 *	- is_register: indicates if we are in register context.
 *	- Return 1 if the specified virtual address is in an
 *	  executable vma.
 */
static bool valid_vma(struct vm_area_struct *vma, bool is_register)
{
	vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;

	if (is_register)
		flags |= VM_WRITE;

	return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC;
}

static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
{
	return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
}

static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
{
	return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
}

/**
 * is_swbp_insn - check if instruction is breakpoint instruction.
 * @insn: instruction to be checked.
 * Default implementation of is_swbp_insn
 * Returns true if @insn is a breakpoint instruction.
 */
bool __weak is_swbp_insn(uprobe_opcode_t *insn)
{
	return *insn == UPROBE_SWBP_INSN;
}

/**
 * is_trap_insn - check if instruction is breakpoint instruction.
 * @insn: instruction to be checked.
 * Default implementation of is_trap_insn
 * Returns true if @insn is a breakpoint instruction.
 *
 * This function is needed for the case where an architecture has multiple
 * trap instructions (like powerpc).
 */
bool __weak is_trap_insn(uprobe_opcode_t *insn)
{
	return is_swbp_insn(insn);
}

void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
{
	void *kaddr = kmap_local_page(page);
	memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
	kunmap_local(kaddr);
}

static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
{
	void *kaddr = kmap_local_page(page);
	memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
	kunmap_local(kaddr);
}

static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn,
			 int nbytes, void *data)
{
	uprobe_opcode_t old_opcode;
	bool is_swbp;

	/*
	 * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here.
	 * We do not check if it is any other 'trap variant' which could
	 * be conditional trap instruction such as the one powerpc supports.
	 *
	 * The logic is that we do not care if the underlying instruction
	 * is a trap variant; uprobes always wins over any other (gdb)
	 * breakpoint.
	 */
	uprobe_copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
	is_swbp = is_swbp_insn(&old_opcode);

	if (is_swbp_insn(insn)) {
		if (is_swbp)		/* register: already installed? */
			return 0;
	} else {
		if (!is_swbp)		/* unregister: was it changed by us?
*/ return 0; } return 1; } static struct delayed_uprobe * delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm) { struct delayed_uprobe *du; list_for_each_entry(du, &delayed_uprobe_list, list) if (du->uprobe == uprobe && du->mm == mm) return du; return NULL; } static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm) { struct delayed_uprobe *du; if (delayed_uprobe_check(uprobe, mm)) return 0; du = kzalloc_obj(*du); if (!du) return -ENOMEM; du->uprobe = uprobe; du->mm = mm; list_add(&du->list, &delayed_uprobe_list); return 0; } static void delayed_uprobe_delete(struct delayed_uprobe *du) { if (WARN_ON(!du)) return; list_del(&du->list); kfree(du); } static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm) { struct list_head *pos, *q; struct delayed_uprobe *du; if (!uprobe && !mm) return; list_for_each_safe(pos, q, &delayed_uprobe_list) { du = list_entry(pos, struct delayed_uprobe, list); if (uprobe && du->uprobe != uprobe) continue; if (mm && du->mm != mm) continue; delayed_uprobe_delete(du); } } static bool valid_ref_ctr_vma(struct uprobe *uprobe, struct vm_area_struct *vma) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset); return uprobe->ref_ctr_offset && vma->vm_file && file_inode(vma->vm_file) == uprobe->inode && (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && vma->vm_start <= vaddr && vma->vm_end > vaddr; } static struct vm_area_struct * find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm) { VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *tmp; for_each_vma(vmi, tmp) if (valid_ref_ctr_vma(uprobe, tmp)) return tmp; return NULL; } static int __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) { void *kaddr; struct page *page; int ret; short *ptr; if (!vaddr || !d) return -EINVAL; ret = get_user_pages_remote(mm, vaddr, 1, FOLL_WRITE, &page, NULL); if (unlikely(ret <= 0)) { /* * We are asking for 1 page. If get_user_pages_remote() fails, * it may return 0, in that case we have to return error. */ return ret == 0 ? -EBUSY : ret; } kaddr = kmap_local_page(page); ptr = kaddr + (vaddr & ~PAGE_MASK); if (unlikely(*ptr + d < 0)) { pr_warn("ref_ctr going negative. vaddr: 0x%lx, " "curr val: %d, delta: %d\n", vaddr, *ptr, d); ret = -EINVAL; goto out; } *ptr += d; ret = 0; out: kunmap_local(kaddr); put_page(page); return ret; } static void update_ref_ctr_warn(struct uprobe *uprobe, struct mm_struct *mm, short d) { pr_warn("ref_ctr %s failed for inode: 0x%llx offset: " "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n", d > 0 ? 
"increment" : "decrement", uprobe->inode->i_ino, (unsigned long long) uprobe->offset, (unsigned long long) uprobe->ref_ctr_offset, mm); } static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, short d) { struct vm_area_struct *rc_vma; unsigned long rc_vaddr; int ret = 0; rc_vma = find_ref_ctr_vma(uprobe, mm); if (rc_vma) { rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset); ret = __update_ref_ctr(mm, rc_vaddr, d); if (ret) update_ref_ctr_warn(uprobe, mm, d); if (d > 0) return ret; } mutex_lock(&delayed_uprobe_lock); if (d > 0) ret = delayed_uprobe_add(uprobe, mm); else delayed_uprobe_remove(uprobe, mm); mutex_unlock(&delayed_uprobe_lock); return ret; } static bool orig_page_is_identical(struct vm_area_struct *vma, unsigned long vaddr, struct page *page, bool *pmd_mappable) { const pgoff_t index = vaddr_to_offset(vma, vaddr) >> PAGE_SHIFT; struct folio *orig_folio = filemap_get_folio(vma->vm_file->f_mapping, index); struct page *orig_page; bool identical; if (IS_ERR(orig_folio)) return false; orig_page = folio_file_page(orig_folio, index); *pmd_mappable = folio_test_pmd_mappable(orig_folio); identical = folio_test_uptodate(orig_folio) && pages_identical(page, orig_page); folio_put(orig_folio); return identical; } static int __uprobe_write(struct vm_area_struct *vma, struct folio_walk *fw, struct folio *folio, unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, bool is_register) { const unsigned long vaddr = insn_vaddr & PAGE_MASK; bool pmd_mappable; /* For now, we'll only handle PTE-mapped folios. */ if (fw->level != FW_LEVEL_PTE) return -EFAULT; /* * See can_follow_write_pte(): we'd actually prefer a writable PTE here, * but the VMA might not be writable. */ if (!pte_write(fw->pte)) { if (!PageAnonExclusive(fw->page)) return -EFAULT; if (unlikely(userfaultfd_pte_wp(vma, fw->pte))) return -EFAULT; /* SOFTDIRTY is handled via pte_mkdirty() below. */ } /* * We'll temporarily unmap the page and flush the TLB, such that we can * modify the page atomically. */ flush_cache_page(vma, vaddr, pte_pfn(fw->pte)); fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep); copy_to_page(fw->page, insn_vaddr, insn, nbytes); /* * When unregistering, we may only zap a PTE if uffd is disabled and * there are no unexpected folio references ... */ if (is_register || userfaultfd_missing(vma) || (folio_ref_count(folio) != folio_expected_ref_count(folio) + 1)) goto remap; /* * ... and the mapped page is identical to the original page that * would get faulted in on next access. */ if (!orig_page_is_identical(vma, vaddr, fw->page, &pmd_mappable)) goto remap; dec_mm_counter(vma->vm_mm, MM_ANONPAGES); folio_remove_rmap_pte(folio, fw->page, vma); if (!folio_mapped(folio) && folio_test_swapcache(folio) && folio_trylock(folio)) { folio_free_swap(folio); folio_unlock(folio); } folio_put(folio); return pmd_mappable; remap: /* * Make sure that our copy_to_page() changes become visible before the * set_pte_at() write. */ smp_wmb(); /* We modified the page. Make sure to mark the PTE dirty. */ set_pte_at(vma->vm_mm, vaddr, fw->ptep, pte_mkdirty(fw->pte)); return 0; } /* * NOTE: * Expect the breakpoint instruction to be the smallest size instruction for * the architecture. If an arch has variable length instruction and the * breakpoint instruction is not of the smallest length instruction * supported by that architecture then we need to modify is_trap_at_addr and * uprobe_write_opcode accordingly. This would never be a problem for archs * that have fixed length instructions. 
* * uprobe_write_opcode - write the opcode at a given virtual address. * @auprobe: arch specific probepoint information. * @vma: the probed virtual memory area. * @opcode_vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @opcode_vaddr. * * Called with mm->mmap_lock held for write. * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, uprobe_opcode_t opcode, bool is_register) { return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, verify_opcode, is_register, true /* do_update_ref_ctr */, NULL); } int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr, void *data) { const unsigned long vaddr = insn_vaddr & PAGE_MASK; struct mm_struct *mm = vma->vm_mm; struct uprobe *uprobe; int ret, ref_ctr_updated = 0; unsigned int gup_flags = FOLL_FORCE; struct mmu_notifier_range range; struct folio_walk fw; struct folio *folio; struct page *page; uprobe = container_of(auprobe, struct uprobe, arch); if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags))) return -EINVAL; /* * When registering, we have to break COW to get an exclusive anonymous * page that we can safely modify. Use FOLL_WRITE to trigger a write * fault if required. When unregistering, we might be lucky and the * anon page is already gone. So defer write faults until really * required. Use FOLL_SPLIT_PMD, because __uprobe_write() * cannot deal with PMDs yet. */ if (is_register) gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD; retry: ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &page, NULL); if (ret <= 0) goto out; folio = page_folio(page); ret = verify(page, insn_vaddr, insn, nbytes, data); if (ret <= 0) { folio_put(folio); goto out; } /* We are going to replace instruction, update ref_ctr. */ if (do_update_ref_ctr && !ref_ctr_updated && uprobe->ref_ctr_offset) { ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); if (ret) { folio_put(folio); goto out; } ref_ctr_updated = 1; } ret = 0; if (unlikely(!folio_test_anon(folio) || folio_is_zone_device(folio))) { VM_WARN_ON_ONCE(is_register); folio_put(folio); goto out; } if (!is_register) { /* * In the common case, we'll be able to zap the page when * unregistering. So trigger MMU notifiers now, as we won't * be able to do it under PTL. */ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vaddr, vaddr + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); } ret = -EAGAIN; /* Walk the page tables again, to perform the actual update. */ if (folio_walk_start(&fw, vma, vaddr, 0)) { if (fw.page == page) ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes, is_register); folio_walk_end(&fw, vma); } if (!is_register) mmu_notifier_invalidate_range_end(&range); folio_put(folio); switch (ret) { case -EFAULT: gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD; fallthrough; case -EAGAIN: goto retry; default: break; } out: /* Revert back reference counter if instruction update failed. */ if (do_update_ref_ctr && ret < 0 && ref_ctr_updated) update_ref_ctr(uprobe, mm, is_register ? -1 : 1); /* try collapse pmd for compound page */ if (ret > 0) collapse_pte_mapped_thp(mm, vaddr, false); return ret < 0 ? ret : 0; } /** * set_swbp - store breakpoint at a given address. * @auprobe: arch specific probepoint information. * @vma: the probed virtual memory area. 
* @vaddr: the virtual address to insert the opcode. * * For mm @mm, store the breakpoint instruction at @vaddr. * Return 0 (success) or a negative errno. */ int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, true); } /** * set_orig_insn - Restore the original instruction. * @vma: the probed virtual memory area. * @auprobe: arch specific probepoint information. * @vaddr: the virtual address to insert the opcode. * * For mm @mm, restore the original opcode (opcode) at @vaddr. * Return 0 (success) or a negative errno. */ int __weak set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn, false); } /* uprobe should have guaranteed positive refcount */ static struct uprobe *get_uprobe(struct uprobe *uprobe) { refcount_inc(&uprobe->ref); return uprobe; } /* * uprobe should have guaranteed lifetime, which can be either of: * - caller already has refcount taken (and wants an extra one); * - uprobe is RCU protected and won't be freed until after grace period; * - we are holding uprobes_treelock (for read or write, doesn't matter). */ static struct uprobe *try_get_uprobe(struct uprobe *uprobe) { if (refcount_inc_not_zero(&uprobe->ref)) return uprobe; return NULL; } static inline bool uprobe_is_active(struct uprobe *uprobe) { return !RB_EMPTY_NODE(&uprobe->rb_node); } static void uprobe_free_rcu_tasks_trace(struct rcu_head *rcu) { struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); kfree(uprobe); } static void uprobe_free_srcu(struct rcu_head *rcu) { struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu_tasks_trace); } static void uprobe_free_deferred(struct work_struct *work) { struct uprobe *uprobe = container_of(work, struct uprobe, work); write_lock(&uprobes_treelock); if (uprobe_is_active(uprobe)) { write_seqcount_begin(&uprobes_seqcount); rb_erase(&uprobe->rb_node, &uprobes_tree); write_seqcount_end(&uprobes_seqcount); } write_unlock(&uprobes_treelock); /* * If application munmap(exec_vma) before uprobe_unregister() * gets called, we don't get a chance to remove uprobe from * delayed_uprobe_list from remove_breakpoint(). Do it here. */ mutex_lock(&delayed_uprobe_lock); delayed_uprobe_remove(uprobe, NULL); mutex_unlock(&delayed_uprobe_lock); /* start srcu -> rcu_tasks_trace -> kfree chain */ call_srcu(&uretprobes_srcu, &uprobe->rcu, uprobe_free_srcu); } static void put_uprobe(struct uprobe *uprobe) { if (!refcount_dec_and_test(&uprobe->ref)) return; INIT_WORK(&uprobe->work, uprobe_free_deferred); schedule_work(&uprobe->work); } /* Initialize hprobe as SRCU-protected "leased" uprobe */ static void hprobe_init_leased(struct hprobe *hprobe, struct uprobe *uprobe, int srcu_idx) { WARN_ON(!uprobe); hprobe->state = HPROBE_LEASED; hprobe->uprobe = uprobe; hprobe->srcu_idx = srcu_idx; } /* Initialize hprobe as refcounted ("stable") uprobe (uprobe can be NULL). */ static void hprobe_init_stable(struct hprobe *hprobe, struct uprobe *uprobe) { hprobe->state = uprobe ? HPROBE_STABLE : HPROBE_GONE; hprobe->uprobe = uprobe; hprobe->srcu_idx = -1; } /* * hprobe_consume() fetches hprobe's underlying uprobe and detects whether * uprobe is SRCU protected or is refcounted. hprobe_consume() can be * used only once for a given hprobe. 
* * Caller has to call hprobe_finalize() and pass previous hprobe_state, so * that hprobe_finalize() can perform SRCU unlock or put uprobe, whichever * is appropriate. */ static inline struct uprobe *hprobe_consume(struct hprobe *hprobe, enum hprobe_state *hstate) { *hstate = xchg(&hprobe->state, HPROBE_CONSUMED); switch (*hstate) { case HPROBE_LEASED: case HPROBE_STABLE: return hprobe->uprobe; case HPROBE_GONE: /* uprobe is NULL, no SRCU */ case HPROBE_CONSUMED: /* uprobe was finalized already, do nothing */ return NULL; default: WARN(1, "hprobe invalid state %d", *hstate); return NULL; } } /* * Reset hprobe state and, if hprobe was LEASED, release SRCU lock. * hprobe_finalize() can only be used from current context after * hprobe_consume() call (which determines uprobe and hstate value). */ static void hprobe_finalize(struct hprobe *hprobe, enum hprobe_state hstate) { switch (hstate) { case HPROBE_LEASED: __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx); break; case HPROBE_STABLE: put_uprobe(hprobe->uprobe); break; case HPROBE_GONE: case HPROBE_CONSUMED: break; default: WARN(1, "hprobe invalid state %d", hstate); break; } } /* * Attempt to switch (atomically) uprobe from being SRCU protected (LEASED) * to refcounted (STABLE) state. Competes with hprobe_consume(); only one of * them can win the race to perform SRCU unlocking. Whoever wins must perform * SRCU unlock. * * Returns underlying valid uprobe or NULL, if there was no underlying uprobe * to begin with or we failed to bump its refcount and it's going away. * * Returned non-NULL uprobe can be still safely used within an ongoing SRCU * locked region. If `get` is true, it's guaranteed that non-NULL uprobe has * an extra refcount for caller to assume and use. Otherwise, it's not * guaranteed that returned uprobe has a positive refcount, so caller has to * attempt try_get_uprobe(), if it needs to preserve uprobe beyond current * SRCU lock region. See dup_utask(). */ static struct uprobe *hprobe_expire(struct hprobe *hprobe, bool get) { enum hprobe_state hstate; /* * Caller should guarantee that return_instance is not going to be * freed from under us. This can be achieved either through holding * rcu_read_lock() or by owning return_instance in the first place. * * Underlying uprobe is itself protected from reuse by SRCU, so ensure * SRCU lock is held properly. */ lockdep_assert(srcu_read_lock_held(&uretprobes_srcu)); hstate = READ_ONCE(hprobe->state); switch (hstate) { case HPROBE_STABLE: /* uprobe has positive refcount, bump refcount, if necessary */ return get ? get_uprobe(hprobe->uprobe) : hprobe->uprobe; case HPROBE_GONE: /* * SRCU was unlocked earlier and we didn't manage to take * uprobe refcnt, so it's effectively NULL */ return NULL; case HPROBE_CONSUMED: /* * uprobe was consumed, so it's effectively NULL as far as * uretprobe processing logic is concerned */ return NULL; case HPROBE_LEASED: { struct uprobe *uprobe = try_get_uprobe(hprobe->uprobe); /* * Try to switch hprobe state, guarding against * hprobe_consume() or another hprobe_expire() racing with us. * Note, if we failed to get uprobe refcount, we use special * HPROBE_GONE state to signal that hprobe->uprobe shouldn't * be used as it will be freed after SRCU is unlocked. */ if (try_cmpxchg(&hprobe->state, &hstate, uprobe ? HPROBE_STABLE : HPROBE_GONE)) { /* We won the race, we are the ones to unlock SRCU */ __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx); return get ? 
get_uprobe(uprobe) : uprobe; } /* * We lost the race, undo refcount bump (if it ever happened), * unless caller would like an extra refcount anyways. */ if (uprobe && !get) put_uprobe(uprobe); /* * Even if hprobe_consume() or another hprobe_expire() wins * the state update race and unlocks SRCU from under us, we * still have a guarantee that underyling uprobe won't be * freed due to ongoing caller's SRCU lock region, so we can * return it regardless. Also, if `get` was true, we also have * an extra ref for the caller to own. This is used in dup_utask(). */ return uprobe; } default: WARN(1, "unknown hprobe state %d", hstate); return NULL; } } static __always_inline int uprobe_cmp(const struct inode *l_inode, const loff_t l_offset, const struct uprobe *r) { if (l_inode < r->inode) return -1; if (l_inode > r->inode) return 1; if (l_offset < r->offset) return -1; if (l_offset > r->offset) return 1; return 0; } #define __node_2_uprobe(node) \ rb_entry((node), struct uprobe, rb_node) struct __uprobe_key { struct inode *inode; loff_t offset; }; static inline int __uprobe_cmp_key(const void *key, const struct rb_node *b) { const struct __uprobe_key *a = key; return uprobe_cmp(a->inode, a->offset, __node_2_uprobe(b)); } static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b) { struct uprobe *u = __node_2_uprobe(a); return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b)); } /* * Assumes being inside RCU protected region. * No refcount is taken on returned uprobe. */ static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset) { struct __uprobe_key key = { .inode = inode, .offset = offset, }; struct rb_node *node; unsigned int seq; lockdep_assert(rcu_read_lock_trace_held()); do { seq = read_seqcount_begin(&uprobes_seqcount); node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key); /* * Lockless RB-tree lookups can result only in false negatives. * If the element is found, it is correct and can be returned * under RCU protection. If we find nothing, we need to * validate that seqcount didn't change. If it did, we have to * try again as we might have missed the element (false * negative). If seqcount is unchanged, search truly failed. */ if (node) return __node_2_uprobe(node); } while (read_seqcount_retry(&uprobes_seqcount, seq)); return NULL; } /* * Attempt to insert a new uprobe into uprobes_tree. * * If uprobe already exists (for given inode+offset), we just increment * refcount of previously existing uprobe. * * If not, a provided new instance of uprobe is inserted into the tree (with * assumed initial refcount == 1). * * In any case, we return a uprobe instance that ends up being in uprobes_tree. * Caller has to clean up new uprobe instance, if it ended up not being * inserted into the tree. * * We assume that uprobes_treelock is held for writing. */ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) { struct rb_node *node; again: node = rb_find_add_rcu(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp); if (node) { struct uprobe *u = __node_2_uprobe(node); if (!try_get_uprobe(u)) { rb_erase(node, &uprobes_tree); RB_CLEAR_NODE(&u->rb_node); goto again; } return u; } return uprobe; } /* * Acquire uprobes_treelock and insert uprobe into uprobes_tree * (or reuse existing one, see __insert_uprobe() comments above). 
*/ static struct uprobe *insert_uprobe(struct uprobe *uprobe) { struct uprobe *u; write_lock(&uprobes_treelock); write_seqcount_begin(&uprobes_seqcount); u = __insert_uprobe(uprobe); write_seqcount_end(&uprobes_seqcount); write_unlock(&uprobes_treelock); return u; } static void ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe) { pr_warn("ref_ctr_offset mismatch. inode: 0x%llx offset: 0x%llx " "ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n", uprobe->inode->i_ino, (unsigned long long) uprobe->offset, (unsigned long long) cur_uprobe->ref_ctr_offset, (unsigned long long) uprobe->ref_ctr_offset); } static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, loff_t ref_ctr_offset) { struct uprobe *uprobe, *cur_uprobe; uprobe = kzalloc_obj(struct uprobe); if (!uprobe) return ERR_PTR(-ENOMEM); uprobe->inode = inode; uprobe->offset = offset; uprobe->ref_ctr_offset = ref_ctr_offset; INIT_LIST_HEAD(&uprobe->consumers); init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); RB_CLEAR_NODE(&uprobe->rb_node); refcount_set(&uprobe->ref, 1); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); /* a uprobe exists for this inode:offset combination */ if (cur_uprobe != uprobe) { if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) { ref_ctr_mismatch_warn(cur_uprobe, uprobe); put_uprobe(cur_uprobe); kfree(uprobe); return ERR_PTR(-EINVAL); } kfree(uprobe); uprobe = cur_uprobe; } return uprobe; } static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) { static atomic64_t id; down_write(&uprobe->consumer_rwsem); list_add_rcu(&uc->cons_node, &uprobe->consumers); uc->id = (__u64) atomic64_inc_return(&id); up_write(&uprobe->consumer_rwsem); } /* * For uprobe @uprobe, delete the consumer @uc. * Should never be called with consumer that's not part of @uprobe->consumers. */ static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) { down_write(&uprobe->consumer_rwsem); list_del_rcu(&uc->cons_node); up_write(&uprobe->consumer_rwsem); } static int __copy_insn(struct address_space *mapping, struct file *filp, void *insn, int nbytes, loff_t offset) { struct page *page; /* * Ensure that the page that has the original instruction is populated * and in page-cache. If ->read_folio == NULL it must be shmem_mapping(), * see uprobe_register(). */ if (mapping->a_ops->read_folio) page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp); else page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT); if (IS_ERR(page)) return PTR_ERR(page); uprobe_copy_from_page(page, offset, insn, nbytes); put_page(page); return 0; } static int copy_insn(struct uprobe *uprobe, struct file *filp) { struct address_space *mapping = uprobe->inode->i_mapping; loff_t offs = uprobe->offset; void *insn = &uprobe->arch.insn; int size = sizeof(uprobe->arch.insn); int len, err = -EIO; /* Copy only available bytes, -EIO if nothing was read */ do { if (offs >= i_size_read(uprobe->inode)) break; len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK)); err = __copy_insn(mapping, filp, insn, len, offs); if (err) break; insn += len; offs += len; size -= len; } while (size); return err; } static int prepare_uprobe(struct uprobe *uprobe, struct file *file, struct mm_struct *mm, unsigned long vaddr) { int ret = 0; if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) return ret; /* TODO: move this into _register, until then we abuse this sem. 
*/ down_write(&uprobe->consumer_rwsem); if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) goto out; ret = copy_insn(uprobe, file); if (ret) goto out; ret = -ENOTSUPP; if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn)) goto out; ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); if (ret) goto out; smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */ set_bit(UPROBE_COPY_INSN, &uprobe->flags); out: up_write(&uprobe->consumer_rwsem); return ret; } static inline bool consumer_filter(struct uprobe_consumer *uc, struct mm_struct *mm) { return !uc->filter || uc->filter(uc, mm); } static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm) { struct uprobe_consumer *uc; bool ret = false; down_read(&uprobe->consumer_rwsem); list_for_each_entry(uc, &uprobe->consumers, cons_node) { ret = consumer_filter(uc, mm); if (ret) break; } up_read(&uprobe->consumer_rwsem); return ret; } static int install_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long vaddr) { struct mm_struct *mm = vma->vm_mm; bool first_uprobe; int ret; ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); if (ret) return ret; /* * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(), * the task can hit this breakpoint right after __replace_page(). */ first_uprobe = !mm_flags_test(MMF_HAS_UPROBES, mm); if (first_uprobe) mm_flags_set(MMF_HAS_UPROBES, mm); ret = set_swbp(&uprobe->arch, vma, vaddr); if (!ret) mm_flags_clear(MMF_RECALC_UPROBES, mm); else if (first_uprobe) mm_flags_clear(MMF_HAS_UPROBES, mm); return ret; } static int remove_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long vaddr) { struct mm_struct *mm = vma->vm_mm; mm_flags_set(MMF_RECALC_UPROBES, mm); return set_orig_insn(&uprobe->arch, vma, vaddr); } struct map_info { struct map_info *next; struct mm_struct *mm; unsigned long vaddr; }; static inline struct map_info *free_map_info(struct map_info *info) { struct map_info *next = info->next; kfree(info); return next; } static struct map_info * build_map_info(struct address_space *mapping, loff_t offset, bool is_register) { unsigned long pgoff = offset >> PAGE_SHIFT; struct vm_area_struct *vma; struct map_info *curr = NULL; struct map_info *prev = NULL; struct map_info *info; int more = 0; again: i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (!valid_vma(vma, is_register)) continue; if (!prev && !more) { /* * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through * reclaim. This is optimistic, no harm done if it fails. 
*/ prev = kmalloc_obj(struct map_info, GFP_NOWAIT | __GFP_NOMEMALLOC); if (prev) prev->next = NULL; } if (!prev) { more++; continue; } if (!mmget_not_zero(vma->vm_mm)) continue; info = prev; prev = prev->next; info->next = curr; curr = info; info->mm = vma->vm_mm; info->vaddr = offset_to_vaddr(vma, offset); } i_mmap_unlock_read(mapping); if (!more) goto out; prev = curr; while (curr) { mmput(curr->mm); curr = curr->next; } do { info = kmalloc_obj(struct map_info); if (!info) { curr = ERR_PTR(-ENOMEM); goto out; } info->next = prev; prev = info; } while (--more); goto again; out: while (prev) prev = free_map_info(prev); return curr; } static int register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) { bool is_register = !!new; struct map_info *info; int err = 0; percpu_down_write(&dup_mmap_sem); info = build_map_info(uprobe->inode->i_mapping, uprobe->offset, is_register); if (IS_ERR(info)) { err = PTR_ERR(info); goto out; } while (info) { struct mm_struct *mm = info->mm; struct vm_area_struct *vma; if (err && is_register) goto free; /* * We take mmap_lock for writing to avoid the race with * find_active_uprobe_rcu() which takes mmap_lock for reading. * Thus this install_breakpoint() can not make * is_trap_at_addr() true right after find_uprobe_rcu() * returns NULL in find_active_uprobe_rcu(). */ mmap_write_lock(mm); if (check_stable_address_space(mm)) goto unlock; vma = find_vma(mm, info->vaddr); if (!vma || !valid_vma(vma, is_register) || file_inode(vma->vm_file) != uprobe->inode) goto unlock; if (vma->vm_start > info->vaddr || vaddr_to_offset(vma, info->vaddr) != uprobe->offset) goto unlock; if (is_register) { /* consult only the "caller", new consumer. */ if (consumer_filter(new, mm)) err = install_breakpoint(uprobe, vma, info->vaddr); } else if (mm_flags_test(MMF_HAS_UPROBES, mm)) { if (!filter_chain(uprobe, mm)) err |= remove_breakpoint(uprobe, vma, info->vaddr); } unlock: mmap_write_unlock(mm); free: mmput(mm); info = free_map_info(info); } out: percpu_up_write(&dup_mmap_sem); return err; } /** * uprobe_unregister_nosync - unregister an already registered probe. * @uprobe: uprobe to remove * @uc: identify which probe if multiple probes are colocated. */ void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc) { int err; down_write(&uprobe->register_rwsem); consumer_del(uprobe, uc); err = register_for_each_vma(uprobe, NULL); up_write(&uprobe->register_rwsem); /* TODO : cant unregister? schedule a worker thread */ if (unlikely(err)) { uprobe_warn(current, "unregister, leaking uprobe"); return; } put_uprobe(uprobe); } EXPORT_SYMBOL_GPL(uprobe_unregister_nosync); void uprobe_unregister_sync(void) { /* * Now that handler_chain() and handle_uretprobe_chain() iterate over * uprobe->consumers list under RCU protection without holding * uprobe->register_rwsem, we need to wait for RCU grace period to * make sure that we can't call into just unregistered * uprobe_consumer's callbacks anymore. If we don't do that, fast and * unlucky enough caller can free consumer's memory and cause * handler_chain() or handle_uretprobe_chain() to do an use-after-free. */ synchronize_rcu_tasks_trace(); synchronize_srcu(&uretprobes_srcu); } EXPORT_SYMBOL_GPL(uprobe_unregister_sync); /** * uprobe_register - register a probe * @inode: the file in which the probe has to be placed. * @offset: offset from the start of the file. * @ref_ctr_offset: offset of SDT marker / reference counter * @uc: information on howto handle the probe.. 
* * Apart from the access refcount, uprobe_register() takes a creation * refcount (thro alloc_uprobe) if and only if this @uprobe is getting * inserted into the rbtree (i.e first consumer for a @inode:@offset * tuple). Creation refcount stops uprobe_unregister from freeing the * @uprobe even before the register operation is complete. Creation * refcount is released when the last @uc for the @uprobe * unregisters. Caller of uprobe_register() is required to keep @inode * (and the containing mount) referenced. * * Return: pointer to the new uprobe on success or an ERR_PTR on failure. */ struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) { struct uprobe *uprobe; int ret; /* Uprobe must have at least one set consumer */ if (!uc->handler && !uc->ret_handler) return ERR_PTR(-EINVAL); /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */ if (!inode->i_mapping->a_ops->read_folio && !shmem_mapping(inode->i_mapping)) return ERR_PTR(-EIO); /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) return ERR_PTR(-EINVAL); /* * This ensures that uprobe_copy_from_page(), copy_to_page() and * __update_ref_ctr() can't cross page boundary. */ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) return ERR_PTR(-EINVAL); if (!IS_ALIGNED(ref_ctr_offset, sizeof(short))) return ERR_PTR(-EINVAL); uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); if (IS_ERR(uprobe)) return uprobe; down_write(&uprobe->register_rwsem); consumer_add(uprobe, uc); ret = register_for_each_vma(uprobe, uc); up_write(&uprobe->register_rwsem); if (ret) { uprobe_unregister_nosync(uprobe, uc); /* * Registration might have partially succeeded, so we can have * this consumer being called right at this time. We need to * sync here. It's ok, it's unlikely slow path. */ uprobe_unregister_sync(); return ERR_PTR(ret); } return uprobe; } EXPORT_SYMBOL_GPL(uprobe_register); /** * uprobe_apply - add or remove the breakpoints according to @uc->filter * @uprobe: uprobe which "owns" the breakpoint * @uc: consumer which wants to add more or remove some breakpoints * @add: add or remove the breakpoints * Return: 0 on success or negative error code. */ int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add) { struct uprobe_consumer *con; int ret = -ENOENT; down_write(&uprobe->register_rwsem); rcu_read_lock_trace(); list_for_each_entry_rcu(con, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { if (con == uc) { ret = register_for_each_vma(uprobe, add ? 
uc : NULL); break; } } rcu_read_unlock_trace(); up_write(&uprobe->register_rwsem); return ret; } static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) { VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; int err = 0; mmap_write_lock(mm); for_each_vma(vmi, vma) { unsigned long vaddr; loff_t offset; if (!valid_vma(vma, false) || file_inode(vma->vm_file) != uprobe->inode) continue; offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; if (uprobe->offset < offset || uprobe->offset >= offset + vma->vm_end - vma->vm_start) continue; vaddr = offset_to_vaddr(vma, uprobe->offset); err |= remove_breakpoint(uprobe, vma, vaddr); } mmap_write_unlock(mm); return err; } static struct rb_node * find_node_in_range(struct inode *inode, loff_t min, loff_t max) { struct rb_node *n = uprobes_tree.rb_node; while (n) { struct uprobe *u = rb_entry(n, struct uprobe, rb_node); if (inode < u->inode) { n = n->rb_left; } else if (inode > u->inode) { n = n->rb_right; } else { if (max < u->offset) n = n->rb_left; else if (min > u->offset) n = n->rb_right; else break; } } return n; } /* * For a given range in vma, build a list of probes that need to be inserted. */ static void build_probe_list(struct inode *inode, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *head) { loff_t min, max; struct rb_node *n, *t; struct uprobe *u; INIT_LIST_HEAD(head); min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; read_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); if (n) { for (t = n; t; t = rb_prev(t)) { u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset < min) break; /* if uprobe went away, it's safe to ignore it */ if (try_get_uprobe(u)) list_add(&u->pending_list, head); } for (t = n; (t = rb_next(t)); ) { u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset > max) break; /* if uprobe went away, it's safe to ignore it */ if (try_get_uprobe(u)) list_add(&u->pending_list, head); } } read_unlock(&uprobes_treelock); } /* @vma contains reference counter, not the probed instruction. */ static int delayed_ref_ctr_inc(struct vm_area_struct *vma) { struct list_head *pos, *q; struct delayed_uprobe *du; unsigned long vaddr; int ret = 0, err = 0; mutex_lock(&delayed_uprobe_lock); list_for_each_safe(pos, q, &delayed_uprobe_list) { du = list_entry(pos, struct delayed_uprobe, list); if (du->mm != vma->vm_mm || !valid_ref_ctr_vma(du->uprobe, vma)) continue; vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset); ret = __update_ref_ctr(vma->vm_mm, vaddr, 1); if (ret) { update_ref_ctr_warn(du->uprobe, vma->vm_mm, 1); if (!err) err = ret; } delayed_uprobe_delete(du); } mutex_unlock(&delayed_uprobe_lock); return err; } /* * Called from mmap_region/vma_merge with mm->mmap_lock acquired. * * Currently we ignore all errors and always return 0, the callers * can't handle the failure anyway. */ int uprobe_mmap(struct vm_area_struct *vma) { struct list_head tmp_list; struct uprobe *uprobe, *u; struct inode *inode; if (no_uprobe_events()) return 0; if (vma->vm_file && (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && mm_flags_test(MMF_HAS_UPROBES, vma->vm_mm)) delayed_ref_ctr_inc(vma); if (!valid_vma(vma, true)) return 0; inode = file_inode(vma->vm_file); if (!inode) return 0; mutex_lock(uprobes_mmap_hash(inode)); build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); /* * We can race with uprobe_unregister(), this uprobe can be already * removed. 
But in this case filter_chain() must return false, all * consumers have gone away. */ list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { if (!fatal_signal_pending(current) && filter_chain(uprobe, vma->vm_mm)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); install_breakpoint(uprobe, vma, vaddr); } put_uprobe(uprobe); } mutex_unlock(uprobes_mmap_hash(inode)); return 0; } static bool vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end) { loff_t min, max; struct inode *inode; struct rb_node *n; inode = file_inode(vma->vm_file); min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; read_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); read_unlock(&uprobes_treelock); return !!n; } /* * Called in context of a munmap of a vma. */ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) { if (no_uprobe_events() || !valid_vma(vma, false)) return; if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ return; if (!mm_flags_test(MMF_HAS_UPROBES, vma->vm_mm) || mm_flags_test(MMF_RECALC_UPROBES, vma->vm_mm)) return; if (vma_has_uprobes(vma, start, end)) mm_flags_set(MMF_RECALC_UPROBES, vma->vm_mm); } static vm_fault_t xol_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { struct xol_area *area = vma->vm_mm->uprobes_state.xol_area; vmf->page = area->page; get_page(vmf->page); return 0; } static int xol_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { return -EPERM; } static const struct vm_special_mapping xol_mapping = { .name = "[uprobes]", .fault = xol_fault, .mremap = xol_mremap, }; unsigned long __weak arch_uprobe_get_xol_area(void) { /* Try to map as high as possible, this is only a hint. 
*/ return get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); } /* Slot allocation for XOL */ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) { struct vm_area_struct *vma; int ret; if (mmap_write_lock_killable(mm)) return -EINTR; if (mm->uprobes_state.xol_area) { ret = -EALREADY; goto fail; } if (!area->vaddr) { area->vaddr = arch_uprobe_get_xol_area(); if (IS_ERR_VALUE(area->vaddr)) { ret = area->vaddr; goto fail; } } vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE, VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO| VM_SEALED_SYSMAP, &xol_mapping); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto fail; } ret = 0; /* pairs with get_xol_area() */ smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */ fail: mmap_write_unlock(mm); return ret; } void * __weak arch_uretprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; *psize = UPROBE_SWBP_INSN_SIZE; return &insn; } static struct xol_area *__create_xol_area(unsigned long vaddr) { struct mm_struct *mm = current->mm; unsigned long insns_size; struct xol_area *area; void *insns; area = kzalloc_obj(*area); if (unlikely(!area)) goto out; area->bitmap = kcalloc(BITS_TO_LONGS(UINSNS_PER_PAGE), sizeof(long), GFP_KERNEL); if (!area->bitmap) goto free_area; area->page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); if (!area->page) goto free_bitmap; area->vaddr = vaddr; init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); insns = arch_uretprobe_trampoline(&insns_size); arch_uprobe_copy_ixol(area->page, 0, insns, insns_size); if (!xol_add_vma(mm, area)) return area; __free_page(area->page); free_bitmap: kfree(area->bitmap); free_area: kfree(area); out: return NULL; } /* * get_xol_area - Allocate process's xol_area if necessary. * This area will be used for storing instructions for execution out of line. * * Returns the allocated area or NULL. */ static struct xol_area *get_xol_area(void) { struct mm_struct *mm = current->mm; struct xol_area *area; if (!mm->uprobes_state.xol_area) __create_xol_area(0); /* Pairs with xol_add_vma() smp_store_release() */ area = READ_ONCE(mm->uprobes_state.xol_area); /* ^^^ */ return area; } void __weak arch_uprobe_clear_state(struct mm_struct *mm) { } void __weak arch_uprobe_init_state(struct mm_struct *mm) { } /* * uprobe_clear_state - Free the area allocated for slots. */ void uprobe_clear_state(struct mm_struct *mm) { struct xol_area *area = mm->uprobes_state.xol_area; mutex_lock(&delayed_uprobe_lock); delayed_uprobe_remove(NULL, mm); mutex_unlock(&delayed_uprobe_lock); arch_uprobe_clear_state(mm); if (!area) return; put_page(area->page); kfree(area->bitmap); kfree(area); } void uprobe_start_dup_mmap(void) { percpu_down_read(&dup_mmap_sem); } void uprobe_end_dup_mmap(void) { percpu_up_read(&dup_mmap_sem); } void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) { if (mm_flags_test(MMF_HAS_UPROBES, oldmm)) { mm_flags_set(MMF_HAS_UPROBES, newmm); /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */ mm_flags_set(MMF_RECALC_UPROBES, newmm); } } static unsigned long xol_get_slot_nr(struct xol_area *area) { unsigned long slot_nr; slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE); if (slot_nr < UINSNS_PER_PAGE) { if (!test_and_set_bit(slot_nr, area->bitmap)) return slot_nr; } return UINSNS_PER_PAGE; } /* * xol_get_insn_slot - allocate a slot for xol. 
*/ static bool xol_get_insn_slot(struct uprobe *uprobe, struct uprobe_task *utask) { struct xol_area *area = get_xol_area(); unsigned long slot_nr; if (!area) return false; wait_event(area->wq, (slot_nr = xol_get_slot_nr(area)) < UINSNS_PER_PAGE); utask->xol_vaddr = area->vaddr + slot_nr * UPROBE_XOL_SLOT_BYTES; arch_uprobe_copy_ixol(area->page, utask->xol_vaddr, &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); return true; } /* * xol_free_insn_slot - free the slot allocated by xol_get_insn_slot() */ static void xol_free_insn_slot(struct uprobe_task *utask) { struct xol_area *area = current->mm->uprobes_state.xol_area; unsigned long offset = utask->xol_vaddr - area->vaddr; unsigned int slot_nr; utask->xol_vaddr = 0; /* xol_vaddr must fit into [area->vaddr, area->vaddr + PAGE_SIZE) */ if (WARN_ON_ONCE(offset >= PAGE_SIZE)) return; slot_nr = offset / UPROBE_XOL_SLOT_BYTES; clear_bit(slot_nr, area->bitmap); smp_mb__after_atomic(); /* pairs with prepare_to_wait() */ if (waitqueue_active(&area->wq)) wake_up(&area->wq); } void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len) { /* Initialize the slot */ copy_to_page(page, vaddr, src, len); /* * We probably need flush_icache_user_page() but it needs vma. * This should work on most of architectures by default. If * architecture needs to do something different it can define * its own version of the function. */ flush_dcache_page(page); } /** * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs * @regs: Reflects the saved state of the task after it has hit a breakpoint * instruction. * Return the address of the breakpoint instruction. */ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs) { return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE; } unsigned long uprobe_get_trap_addr(struct pt_regs *regs) { struct uprobe_task *utask = current->utask; if (unlikely(utask && utask->active_uprobe)) return utask->vaddr; return instruction_pointer(regs); } static void ri_pool_push(struct uprobe_task *utask, struct return_instance *ri) { ri->cons_cnt = 0; ri->next = utask->ri_pool; utask->ri_pool = ri; } static struct return_instance *ri_pool_pop(struct uprobe_task *utask) { struct return_instance *ri = utask->ri_pool; if (likely(ri)) utask->ri_pool = ri->next; return ri; } static void ri_free(struct return_instance *ri) { kfree(ri->extra_consumers); kfree_rcu(ri, rcu); } static void free_ret_instance(struct uprobe_task *utask, struct return_instance *ri, bool cleanup_hprobe) { unsigned seq; if (cleanup_hprobe) { enum hprobe_state hstate; (void)hprobe_consume(&ri->hprobe, &hstate); hprobe_finalize(&ri->hprobe, hstate); } /* * At this point return_instance is unlinked from utask's * return_instances list and this has become visible to ri_timer(). * If seqcount now indicates that ri_timer's return instance * processing loop isn't active, we can return ri into the pool of * to-be-reused return instances for future uretprobes. If ri_timer() * happens to be running right now, though, we fallback to safety and * just perform RCU-delated freeing of ri. * Admittedly, this is a rather simple use of seqcount, but it nicely * abstracts away all the necessary memory barriers, so we use * a well-supported kernel primitive here. */ if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) { /* immediate reuse of ri without RCU GP is OK */ ri_pool_push(utask, ri); } else { /* we might be racing with ri_timer(), so play it safe */ ri_free(ri); } } /* * Called with no locks held. 
* Called in context of an exiting or an exec-ing thread. */ void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; struct return_instance *ri, *ri_next; if (!utask) return; t->utask = NULL; WARN_ON_ONCE(utask->active_uprobe || utask->xol_vaddr); timer_delete_sync(&utask->ri_timer); ri = utask->return_instances; while (ri) { ri_next = ri->next; free_ret_instance(utask, ri, true /* cleanup_hprobe */); ri = ri_next; } /* free_ret_instance() above might add to ri_pool, so this loop should come last */ ri = utask->ri_pool; while (ri) { ri_next = ri->next; ri_free(ri); ri = ri_next; } kfree(utask); } #define RI_TIMER_PERIOD (HZ / 10) /* 100 ms */ #define for_each_ret_instance_rcu(pos, head) \ for (pos = rcu_dereference_raw(head); pos; pos = rcu_dereference_raw(pos->next)) static void ri_timer(struct timer_list *timer) { struct uprobe_task *utask = container_of(timer, struct uprobe_task, ri_timer); struct return_instance *ri; /* SRCU protects uprobe from reuse for the cmpxchg() inside hprobe_expire(). */ guard(srcu)(&uretprobes_srcu); /* RCU protects return_instance from freeing. */ guard(rcu)(); /* * See free_ret_instance() for notes on seqcount use. * We also employ raw API variants to avoid lockdep false-positive * warning complaining about enabled preemption. The timer can only be * invoked once for a uprobe_task. Therefore there can only be one * writer. The reader does not require an even sequence count to make * progress, so it is OK to remain preemptible on PREEMPT_RT. */ raw_write_seqcount_begin(&utask->ri_seqcount); for_each_ret_instance_rcu(ri, utask->return_instances) hprobe_expire(&ri->hprobe, false); raw_write_seqcount_end(&utask->ri_seqcount); } static struct uprobe_task *alloc_utask(void) { struct uprobe_task *utask; utask = kzalloc_obj(*utask); if (!utask) return NULL; timer_setup(&utask->ri_timer, ri_timer, 0); seqcount_init(&utask->ri_seqcount); return utask; } /* * Allocate a uprobe_task object for the task if necessary. * Called when the thread hits a breakpoint. * * Returns: * - pointer to new uprobe_task on success * - NULL otherwise */ static struct uprobe_task *get_utask(void) { if (!current->utask) current->utask = alloc_utask(); return current->utask; } static struct return_instance *alloc_return_instance(struct uprobe_task *utask) { struct return_instance *ri; ri = ri_pool_pop(utask); if (ri) return ri; ri = kzalloc_obj(*ri); if (!ri) return ZERO_SIZE_PTR; return ri; } static struct return_instance *dup_return_instance(struct return_instance *old) { struct return_instance *ri; ri = kmemdup(old, sizeof(*ri), GFP_KERNEL); if (!ri) return NULL; if (unlikely(old->cons_cnt > 1)) { ri->extra_consumers = kmemdup(old->extra_consumers, sizeof(ri->extra_consumers[0]) * (old->cons_cnt - 1), GFP_KERNEL); if (!ri->extra_consumers) { kfree(ri); return NULL; } } return ri; } static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) { struct uprobe_task *n_utask; struct return_instance **p, *o, *n; struct uprobe *uprobe; n_utask = alloc_utask(); if (!n_utask) return -ENOMEM; t->utask = n_utask; /* protect uprobes from freeing, we'll need try_get_uprobe() them */ guard(srcu)(&uretprobes_srcu); p = &n_utask->return_instances; for (o = o_utask->return_instances; o; o = o->next) { n = dup_return_instance(o); if (!n) return -ENOMEM; /* if uprobe is non-NULL, we'll have an extra refcount for uprobe */ uprobe = hprobe_expire(&o->hprobe, true); /* * New utask will have stable properly refcounted uprobe or * NULL. 
Even if we failed to get refcounted uprobe, we still * need to preserve full set of return_instances for proper * uretprobe handling and nesting in forked task. */ hprobe_init_stable(&n->hprobe, uprobe); n->next = NULL; rcu_assign_pointer(*p, n); p = &n->next; n_utask->depth++; } return 0; } static void dup_xol_work(struct callback_head *work) { if (current->flags & PF_EXITING) return; if (!__create_xol_area(current->utask->dup_xol_addr) && !fatal_signal_pending(current)) uprobe_warn(current, "dup xol area"); } /* * Called in context of a new clone/fork from copy_process. */ void uprobe_copy_process(struct task_struct *t, u64 flags) { struct uprobe_task *utask = current->utask; struct mm_struct *mm = current->mm; struct xol_area *area; t->utask = NULL; if (!utask || !utask->return_instances) return; if (mm == t->mm && !(flags & CLONE_VFORK)) return; if (dup_utask(t, utask)) return uprobe_warn(t, "dup ret instances"); /* The task can fork() after dup_xol_work() fails */ area = mm->uprobes_state.xol_area; if (!area) return uprobe_warn(t, "dup xol area"); if (mm == t->mm) return; t->utask->dup_xol_addr = area->vaddr; init_task_work(&t->utask->dup_xol_work, dup_xol_work); task_work_add(t, &t->utask->dup_xol_work, TWA_RESUME); } /* * Current area->vaddr notion assume the trampoline address is always * equal area->vaddr. * * Returns -1 in case the xol_area is not allocated. */ unsigned long uprobe_get_trampoline_vaddr(void) { unsigned long trampoline_vaddr = UPROBE_NO_TRAMPOLINE_VADDR; struct xol_area *area; /* Pairs with xol_add_vma() smp_store_release() */ area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */ if (area) trampoline_vaddr = area->vaddr; return trampoline_vaddr; } static void cleanup_return_instances(struct uprobe_task *utask, bool chained, struct pt_regs *regs) { struct return_instance *ri = utask->return_instances, *ri_next; enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL; while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { ri_next = ri->next; rcu_assign_pointer(utask->return_instances, ri_next); utask->depth--; free_ret_instance(utask, ri, true /* cleanup_hprobe */); ri = ri_next; } } static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs, struct return_instance *ri) { struct uprobe_task *utask = current->utask; unsigned long orig_ret_vaddr, trampoline_vaddr; bool chained; int srcu_idx; if (!get_xol_area()) goto free; if (utask->depth >= MAX_URETPROBE_DEPTH) { printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to" " nestedness limit pid/tgid=%d/%d\n", current->pid, current->tgid); goto free; } trampoline_vaddr = uprobe_get_trampoline_vaddr(); orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); if (orig_ret_vaddr == -1) goto free; /* drop the entries invalidated by longjmp() */ chained = (orig_ret_vaddr == trampoline_vaddr); cleanup_return_instances(utask, chained, regs); /* * We don't want to keep trampoline address in stack, rather keep the * original return address of first caller thru all the consequent * instances. This also makes breakpoint unwrapping easier. */ if (chained) { if (!utask->return_instances) { /* * This situation is not possible. Likely we have an * attack from user-space. 
*/ uprobe_warn(current, "handle tail call"); goto free; } orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; } /* __srcu_read_lock() because SRCU lock survives switch to user space */ srcu_idx = __srcu_read_lock(&uretprobes_srcu); ri->func = instruction_pointer(regs); ri->stack = user_stack_pointer(regs); ri->orig_ret_vaddr = orig_ret_vaddr; ri->chained = chained; utask->depth++; hprobe_init_leased(&ri->hprobe, uprobe, srcu_idx); ri->next = utask->return_instances; rcu_assign_pointer(utask->return_instances, ri); mod_timer(&utask->ri_timer, jiffies + RI_TIMER_PERIOD); return; free: ri_free(ri); } /* Prepare to single-step probed instruction out of line. */ static int pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) { struct uprobe_task *utask = current->utask; int err; if (!try_get_uprobe(uprobe)) return -EINVAL; if (!xol_get_insn_slot(uprobe, utask)) { err = -ENOMEM; goto err_out; } utask->vaddr = bp_vaddr; err = arch_uprobe_pre_xol(&uprobe->arch, regs); if (unlikely(err)) { xol_free_insn_slot(utask); goto err_out; } utask->active_uprobe = uprobe; utask->state = UTASK_SSTEP; return 0; err_out: put_uprobe(uprobe); return err; } /* * If we are singlestepping, then ensure this thread is not connected to * non-fatal signals until completion of singlestep. When xol insn itself * triggers the signal, restart the original insn even if the task is * already SIGKILL'ed (since coredump should report the correct ip). This * is even more important if the task has a handler for SIGSEGV/etc, The * _same_ instruction should be repeated again after return from the signal * handler, and SSTEP can never finish in this case. */ bool uprobe_deny_signal(void) { struct task_struct *t = current; struct uprobe_task *utask = t->utask; if (likely(!utask || !utask->active_uprobe)) return false; WARN_ON_ONCE(utask->state != UTASK_SSTEP); if (task_sigpending(t)) { utask->signal_denied = true; clear_tsk_thread_flag(t, TIF_SIGPENDING); if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { utask->state = UTASK_SSTEP_TRAPPED; set_tsk_thread_flag(t, TIF_UPROBE); } } return true; } static void mmf_recalc_uprobes(struct mm_struct *mm) { VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; for_each_vma(vmi, vma) { if (!valid_vma(vma, false)) continue; /* * This is not strictly accurate, we can race with * uprobe_unregister() and see the already removed * uprobe if delete_uprobe() was not yet called. * Or this uprobe can be filtered out. 
*/ if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end)) return; } mm_flags_clear(MMF_HAS_UPROBES, mm); } static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) { struct page *page; uprobe_opcode_t opcode; int result; if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE))) return -EINVAL; pagefault_disable(); result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr); pagefault_enable(); if (likely(result == 0)) goto out; result = get_user_pages(vaddr, 1, FOLL_FORCE, &page); if (result < 0) return result; uprobe_copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); put_page(page); out: /* This needs to return true for any variant of the trap insn */ return is_trap_insn(&opcode); } static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr) { struct mm_struct *mm = current->mm; struct uprobe *uprobe = NULL; struct vm_area_struct *vma; struct file *vm_file; loff_t offset; unsigned int seq; guard(rcu)(); if (!mmap_lock_speculate_try_begin(mm, &seq)) return NULL; vma = vma_lookup(mm, bp_vaddr); if (!vma) return NULL; /* * vm_file memory can be reused for another instance of struct file, * but can't be freed from under us, so it's safe to read fields from * it, even if the values are some garbage values; ultimately * find_uprobe_rcu() + mmap_lock_speculation_end() check will ensure * that whatever we speculatively found is correct */ vm_file = READ_ONCE(vma->vm_file); if (!vm_file) return NULL; offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start); uprobe = find_uprobe_rcu(vm_file->f_inode, offset); if (!uprobe) return NULL; /* now double check that nothing about MM changed */ if (mmap_lock_speculate_retry(mm, seq)) return NULL; return uprobe; } /* assumes being inside RCU protected region */ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp) { struct mm_struct *mm = current->mm; struct uprobe *uprobe = NULL; struct vm_area_struct *vma; uprobe = find_active_uprobe_speculative(bp_vaddr); if (uprobe) return uprobe; mmap_read_lock(mm); vma = vma_lookup(mm, bp_vaddr); if (vma) { if (vma->vm_file) { struct inode *inode = file_inode(vma->vm_file); loff_t offset = vaddr_to_offset(vma, bp_vaddr); uprobe = find_uprobe_rcu(inode, offset); } if (!uprobe) *is_swbp = is_trap_at_addr(mm, bp_vaddr); } else { *is_swbp = -EFAULT; } if (!uprobe && mm_flags_test_and_clear(MMF_RECALC_UPROBES, mm)) mmf_recalc_uprobes(mm); mmap_read_unlock(mm); return uprobe; } static struct return_instance *push_consumer(struct return_instance *ri, __u64 id, __u64 cookie) { struct return_consumer *ric; if (unlikely(ri == ZERO_SIZE_PTR)) return ri; if (unlikely(ri->cons_cnt > 0)) { ric = krealloc(ri->extra_consumers, sizeof(*ric) * ri->cons_cnt, GFP_KERNEL); if (!ric) { ri_free(ri); return ZERO_SIZE_PTR; } ri->extra_consumers = ric; } ric = likely(ri->cons_cnt == 0) ? &ri->consumer : &ri->extra_consumers[ri->cons_cnt - 1]; ric->id = id; ric->cookie = cookie; ri->cons_cnt++; return ri; } static struct return_consumer * return_consumer_find(struct return_instance *ri, int *iter, int id) { struct return_consumer *ric; int idx; for (idx = *iter; idx < ri->cons_cnt; idx++) { ric = likely(idx == 0) ? 
&ri->consumer : &ri->extra_consumers[idx - 1]; if (ric->id == id) { *iter = idx + 1; return ric; } } return NULL; } static bool ignore_ret_handler(int rc) { return rc == UPROBE_HANDLER_REMOVE || rc == UPROBE_HANDLER_IGNORE; } static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) { struct uprobe_consumer *uc; bool has_consumers = false, remove = true; struct return_instance *ri = NULL; struct uprobe_task *utask = current->utask; utask->auprobe = &uprobe->arch; list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { bool session = uc->handler && uc->ret_handler; __u64 cookie = 0; int rc = 0; if (uc->handler) { rc = uc->handler(uc, regs, &cookie); WARN(rc < 0 || rc > 2, "bad rc=0x%x from %ps()\n", rc, uc->handler); } remove &= rc == UPROBE_HANDLER_REMOVE; has_consumers = true; if (!uc->ret_handler || ignore_ret_handler(rc)) continue; if (!ri) ri = alloc_return_instance(utask); if (session) ri = push_consumer(ri, uc->id, cookie); } utask->auprobe = NULL; if (!ZERO_OR_NULL_PTR(ri)) prepare_uretprobe(uprobe, regs, ri); if (remove && has_consumers) { down_read(&uprobe->register_rwsem); /* re-check that removal is still required, this time under lock */ if (!filter_chain(uprobe, current->mm)) { WARN_ON(!uprobe_is_active(uprobe)); unapply_uprobe(uprobe, current->mm); } up_read(&uprobe->register_rwsem); } } static void handle_uretprobe_chain(struct return_instance *ri, struct uprobe *uprobe, struct pt_regs *regs) { struct return_consumer *ric; struct uprobe_consumer *uc; int ric_idx = 0; /* all consumers unsubscribed meanwhile */ if (unlikely(!uprobe)) return; rcu_read_lock_trace(); list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { bool session = uc->handler && uc->ret_handler; if (uc->ret_handler) { ric = return_consumer_find(ri, &ric_idx, uc->id); if (!session || ric) uc->ret_handler(uc, ri->func, regs, ric ? &ric->cookie : NULL); } } rcu_read_unlock_trace(); } static struct return_instance *find_next_ret_chain(struct return_instance *ri) { bool chained; do { chained = ri->chained; ri = ri->next; /* can't be NULL if chained */ } while (chained); return ri; } void uprobe_handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; struct return_instance *ri, *ri_next, *next_chain; struct uprobe *uprobe; enum hprobe_state hstate; bool valid; utask = current->utask; if (!utask) goto sigill; ri = utask->return_instances; if (!ri) goto sigill; do { /* * We should throw out the frames invalidated by longjmp(). * If this chain is valid, then the next one should be alive * or NULL; the latter case means that nobody but ri->func * could hit this trampoline on return. TODO: sigaltstack(). 
*/ next_chain = find_next_ret_chain(ri); valid = !next_chain || arch_uretprobe_is_alive(next_chain, RP_CHECK_RET, regs); instruction_pointer_set(regs, ri->orig_ret_vaddr); do { /* pop current instance from the stack of pending return instances, * as it's not pending anymore: we just fixed up original * instruction pointer in regs and are about to call handlers; * this allows fixup_uretprobe_trampoline_entries() to properly fix up * captured stack traces from uretprobe handlers, in which pending * trampoline addresses on the stack are replaced with correct * original return addresses */ ri_next = ri->next; rcu_assign_pointer(utask->return_instances, ri_next); utask->depth--; uprobe = hprobe_consume(&ri->hprobe, &hstate); if (valid) handle_uretprobe_chain(ri, uprobe, regs); hprobe_finalize(&ri->hprobe, hstate); /* We already took care of hprobe, no need to waste more time on that. */ free_ret_instance(utask, ri, false /* !cleanup_hprobe */); ri = ri_next; } while (ri != next_chain); } while (!valid); return; sigill: uprobe_warn(current, "handle uretprobe, sending SIGILL."); force_sig(SIGILL); } bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) { return false; } bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, struct pt_regs *regs) { return true; } void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) { } /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. */ static void handle_swbp(struct pt_regs *regs) { struct uprobe *uprobe; unsigned long bp_vaddr; int is_swbp; bp_vaddr = uprobe_get_swbp_addr(regs); if (bp_vaddr == uprobe_get_trampoline_vaddr()) return uprobe_handle_trampoline(regs); rcu_read_lock_trace(); uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); if (!uprobe) { if (is_swbp > 0) { /* No matching uprobe; signal SIGTRAP. */ force_sig(SIGTRAP); } else { /* * Either we raced with uprobe_unregister() or we can't * access this memory. The latter is only possible if * another thread plays with our ->mm. In both cases * we can simply restart. If this vma was unmapped we * can pretend this insn was not executed yet and get * the (correct) SIGSEGV after restart. */ instruction_pointer_set(regs, bp_vaddr); } goto out; } /* change it in advance for ->handler() and restart */ instruction_pointer_set(regs, bp_vaddr); /* * TODO: move copy_insn/etc into _register and remove this hack. * After we hit the bp, _unregister + _register can install the * new and not-yet-analyzed uprobe at the same address, restart. */ if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) goto out; /* * Pairs with the smp_wmb() in prepare_uprobe(). * * Guarantees that if we see the UPROBE_COPY_INSN bit set, then * we must also see the stores to &uprobe->arch performed by the * prepare_uprobe() call. */ smp_rmb(); /* Tracing handlers use ->utask to communicate with fetch methods */ if (!get_utask()) goto out; if (arch_uprobe_ignore(&uprobe->arch, regs)) goto out; handler_chain(uprobe, regs); /* Try to optimize after first hit. */ arch_uprobe_optimize(&uprobe->arch, bp_vaddr); /* * If user decided to take execution elsewhere, it makes little sense * to execute the original instruction, so let's skip it. 
*/ if (instruction_pointer(regs) != bp_vaddr) goto out; if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; if (pre_ssout(uprobe, regs, bp_vaddr)) goto out; out: /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ rcu_read_unlock_trace(); } void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr) { struct uprobe *uprobe; int is_swbp; guard(rcu_tasks_trace)(); uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); if (!uprobe) return; if (!get_utask()) return; if (arch_uprobe_ignore(&uprobe->arch, regs)) return; handler_chain(uprobe, regs); } /* * Perform required fix-ups and disable singlestep. * Allow pending signals to take effect. */ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) { struct uprobe *uprobe; int err = 0; uprobe = utask->active_uprobe; if (utask->state == UTASK_SSTEP_ACK) err = arch_uprobe_post_xol(&uprobe->arch, regs); else if (utask->state == UTASK_SSTEP_TRAPPED) arch_uprobe_abort_xol(&uprobe->arch, regs); else WARN_ON_ONCE(1); put_uprobe(uprobe); utask->active_uprobe = NULL; utask->state = UTASK_RUNNING; xol_free_insn_slot(utask); if (utask->signal_denied) { set_thread_flag(TIF_SIGPENDING); utask->signal_denied = false; } if (unlikely(err)) { uprobe_warn(current, "execute the probed insn, sending SIGILL."); force_sig(SIGILL); } } /* * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and * allows the thread to return from interrupt. After that handle_swbp() * sets utask->active_uprobe. * * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag * and allows the thread to return from interrupt. * * While returning to userspace, thread notices the TIF_UPROBE flag and calls * uprobe_notify_resume(). */ void uprobe_notify_resume(struct pt_regs *regs) { struct uprobe_task *utask; clear_thread_flag(TIF_UPROBE); utask = current->utask; if (utask && utask->active_uprobe) handle_singlestep(utask, regs); else handle_swbp(regs); } /* * uprobe_pre_sstep_notifier gets called from interrupt context as part of * notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit. */ int uprobe_pre_sstep_notifier(struct pt_regs *regs) { if (!current->mm) return 0; if (!mm_flags_test(MMF_HAS_UPROBES, current->mm) && (!current->utask || !current->utask->return_instances)) return 0; set_thread_flag(TIF_UPROBE); return 1; } /* * uprobe_post_sstep_notifier gets called in interrupt context as part of notifier * mechanism. Set TIF_UPROBE flag and indicate completion of singlestep. */ int uprobe_post_sstep_notifier(struct pt_regs *regs) { struct uprobe_task *utask = current->utask; if (!current->mm || !utask || !utask->active_uprobe) /* task is currently not uprobed */ return 0; utask->state = UTASK_SSTEP_ACK; set_thread_flag(TIF_UPROBE); return 1; } static struct notifier_block uprobe_exception_nb = { .notifier_call = arch_uprobe_exception_notify, .priority = INT_MAX-1, /* notified after kprobes, kgdb */ }; void __init uprobes_init(void) { int i; for (i = 0; i < UPROBES_HASH_SZ; i++) mutex_init(&uprobes_mmap_mutex[i]); BUG_ON(register_die_notifier(&uprobe_exception_nb)); } |
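
/*
 * Illustrative sketch only (not part of the file above): what a consumer
 * looks like from the point of view of handler_chain() and
 * handle_uretprobe_chain() above. A "session" consumer sets both callbacks,
 * so the cookie written by the entry handler is stashed via push_consumer()
 * and handed back to the matching return handler. Registration and filtering
 * are intentionally omitted; only the callback signatures visible above are
 * assumed, and the return-code semantics follow handler_chain(): 0 keeps the
 * probe, UPROBE_HANDLER_REMOVE asks for removal, and UPROBE_HANDLER_IGNORE
 * skips the return handler for this hit.
 */
static int example_entry_handler(struct uprobe_consumer *uc,
				 struct pt_regs *regs, __u64 *cookie)
{
	/* Whatever is stored here comes back in the return handler. */
	*cookie = instruction_pointer(regs);
	return 0;
}

static int example_ret_handler(struct uprobe_consumer *uc, unsigned long func,
			       struct pt_regs *regs, __u64 *cookie)
{
	/*
	 * @func is ri->func, i.e. the probed function's entry IP recorded by
	 * prepare_uretprobe(); @cookie may be NULL for non-session consumers.
	 */
	return 0;
}

static struct uprobe_consumer example_consumer = {
	.handler	= example_entry_handler,
	.ret_handler	= example_ret_handler,
};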
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PGTABLE_H
#define _LINUX_PGTABLE_H

#include <linux/pfn.h>
#include <asm/pgtable.h>

#define PMD_ORDER	(PMD_SHIFT - PAGE_SHIFT)
#define PUD_ORDER	(PUD_SHIFT - PAGE_SHIFT)

#ifndef __ASSEMBLY__
#ifdef CONFIG_MMU

#include <linux/mm_types.h>
#include <linux/bug.h>
#include <linux/errno.h>
#include <asm-generic/pgtable_uffd.h>
#include <linux/page_table_check.h>

#if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) -	\
	defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS
#error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{P4D,PUD,PMD}_FOLDED
#endif

/*
 * This defines the generic helper for accessing PMD page
 * table page. Although platforms can still override this
 * via their respective <asm/pgtable.h>.
 */
#ifndef pmd_pgtable
#define pmd_pgtable(pmd) pmd_page(pmd)
#endif

#define pmd_folio(pmd) page_folio(pmd_page(pmd))

/*
 * A page table page can be thought of an array like this: pXd_t[PTRS_PER_PxD]
 *
 * The pXx_index() functions return the index of the entry in the page
 * table page which would control the given virtual address
 *
 * As these functions may be used by the same code for different levels of
 * the page table folding, they are always available, regardless of
 * CONFIG_PGTABLE_LEVELS value. For the folded levels they simply return 0
 * because in such cases PTRS_PER_PxD equals 1.
 */

static inline unsigned long pte_index(unsigned long address)
{
	return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
}

#ifndef pmd_index
static inline unsigned long pmd_index(unsigned long address)
{
	return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
}
#define pmd_index pmd_index
#endif

#ifndef pud_index
static inline unsigned long pud_index(unsigned long address)
{
	return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
}
#define pud_index pud_index
#endif

#ifndef pgd_index
/* Must be a compile-time constant, so implement it as a macro */
#define pgd_index(a)  (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
#endif

#ifndef kernel_pte_init
static inline void kernel_pte_init(void *addr)
{
}
#define kernel_pte_init kernel_pte_init
#endif

#ifndef pmd_init
static inline void pmd_init(void *addr)
{
}
#define pmd_init pmd_init
#endif

#ifndef pud_init
static inline void pud_init(void *addr)
{
}
#define pud_init pud_init
#endif

#ifndef pte_offset_kernel
static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
{
	return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address);
}
#define pte_offset_kernel pte_offset_kernel
#endif

#ifdef CONFIG_HIGHPTE
#define __pte_map(pmd, address) \
	((pte_t *)kmap_local_page(pmd_page(*(pmd))) + pte_index((address)))
#define pte_unmap(pte)	do {	\
	kunmap_local((pte));	\
	rcu_read_unlock();	\
} while (0)
#else
static inline pte_t *__pte_map(pmd_t *pmd, unsigned long address)
{
	return pte_offset_kernel(pmd, address);
}
static inline void pte_unmap(pte_t *pte)
{
	rcu_read_unlock();
}
#endif

void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable);

/* Find an entry in the second-level page table..
*/ #ifndef pmd_offset static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) { return pud_pgtable(*pud) + pmd_index(address); } #define pmd_offset pmd_offset #endif #ifndef pud_offset static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) { return p4d_pgtable(*p4d) + pud_index(address); } #define pud_offset pud_offset #endif static inline pgd_t *pgd_offset_pgd(pgd_t *pgd, unsigned long address) { return (pgd + pgd_index(address)); }; /* * a shortcut to get a pgd_t in a given mm */ #ifndef pgd_offset #define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address)) #endif /* * a shortcut which implies the use of the kernel's pgd, instead * of a process's */ #define pgd_offset_k(address) pgd_offset(&init_mm, (address)) /* * In many cases it is known that a virtual address is mapped at PMD or PTE * level, so instead of traversing all the page table levels, we can get a * pointer to the PMD entry in user or kernel page table or translate a virtual * address to the pointer in the PTE in the kernel page tables with simple * helpers. */ static inline pmd_t *pmd_off(struct mm_struct *mm, unsigned long va) { return pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, va), va), va), va); } static inline pmd_t *pmd_off_k(unsigned long va) { return pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va); } static inline pte_t *virt_to_kpte(unsigned long vaddr) { pmd_t *pmd = pmd_off_k(vaddr); return pmd_none(*pmd) ? NULL : pte_offset_kernel(pmd, vaddr); } #ifndef pmd_young static inline int pmd_young(pmd_t pmd) { return 0; } #endif #ifndef pmd_dirty static inline int pmd_dirty(pmd_t pmd) { return 0; } #endif /* * A facility to provide lazy MMU batching. This allows PTE updates and * page invalidations to be delayed until a call to leave lazy MMU mode * is issued. Some architectures may benefit from doing this, and it is * beneficial for both shadow and direct mode hypervisors, which may batch * the PTE updates which happen during this window. Note that using this * interface requires that read hazards be removed from the code. A read * hazard could result in the direct mode hypervisor case, since the actual * write to the page tables may not yet have taken place, so reads though * a raw PTE pointer after it has been modified are not guaranteed to be * up to date. * * In the general case, no lock is guaranteed to be held between entry and exit * of the lazy mode. (In practice, for user PTE updates, the appropriate page * table lock(s) are held, but for kernel PTE updates, no lock is held). * The implementation must therefore assume preemption may be enabled upon * entry to the mode and cpu migration is possible; it must take steps to be * robust against this. An implementation may handle this by disabling * preemption, as a consequence generic code may not sleep while the lazy MMU * mode is active. * * The mode is disabled in interrupt context and calls to the lazy_mmu API have * no effect. * * The lazy MMU mode is enabled for a given block of code using: * * lazy_mmu_mode_enable(); * <code> * lazy_mmu_mode_disable(); * * Nesting is permitted: <code> may itself use an enable()/disable() pair. * A nested call to enable() has no functional effect; however disable() causes * any batched architectural state to be flushed regardless of nesting. After a * call to disable(), the caller can therefore rely on all previous page table * modifications to have taken effect, but the lazy MMU mode may still be * enabled. 
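 *
 * Purely as an illustration, a caller nesting two sections sees:
 *
 *	lazy_mmu_mode_enable();		mode becomes active
 *	lazy_mmu_mode_enable();		nested: no functional effect
 *	lazy_mmu_mode_disable();	nested exit: batched state is flushed
 *	lazy_mmu_mode_disable();	outermost exit: the mode is left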
 *
 * In certain cases, it may be desirable to temporarily pause the lazy MMU mode.
 * This can be done using:
 *
 *	lazy_mmu_mode_pause();
 *	<code>
 *	lazy_mmu_mode_resume();
 *
 * pause() ensures that the mode is exited regardless of the nesting level;
 * resume() re-enters the mode at the same nesting level. Any call to the
 * lazy_mmu_mode_* API between those two calls has no effect. In particular,
 * this means that pause()/resume() pairs may nest.
 *
 * is_lazy_mmu_mode_active() can be used to check whether the lazy MMU mode is
 * currently enabled.
 */

#ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE

/**
 * lazy_mmu_mode_enable() - Enable the lazy MMU mode.
 *
 * Enters a new lazy MMU mode section; if the mode was not already enabled,
 * enables it and calls arch_enter_lazy_mmu_mode().
 *
 * Must be paired with a call to lazy_mmu_mode_disable().
 *
 * Has no effect if called:
 * - While paused - see lazy_mmu_mode_pause()
 * - In interrupt context
 */
static inline void lazy_mmu_mode_enable(void)
{
	struct lazy_mmu_state *state = &current->lazy_mmu_state;

	if (in_interrupt() || state->pause_count > 0)
		return;

	VM_WARN_ON_ONCE(state->enable_count == U8_MAX);

	if (state->enable_count++ == 0)
		arch_enter_lazy_mmu_mode();
}

/**
 * lazy_mmu_mode_disable() - Disable the lazy MMU mode.
 *
 * Exits the current lazy MMU mode section. If it is the outermost section,
 * disables the mode and calls arch_leave_lazy_mmu_mode(). Otherwise (nested
 * section), calls arch_flush_lazy_mmu_mode().
 *
 * Must match a call to lazy_mmu_mode_enable().
 *
 * Has no effect if called:
 * - While paused - see lazy_mmu_mode_pause()
 * - In interrupt context
 */
static inline void lazy_mmu_mode_disable(void)
{
	struct lazy_mmu_state *state = &current->lazy_mmu_state;

	if (in_interrupt() || state->pause_count > 0)
		return;

	VM_WARN_ON_ONCE(state->enable_count == 0);

	if (--state->enable_count == 0)
		arch_leave_lazy_mmu_mode();
	else /* Exiting a nested section */
		arch_flush_lazy_mmu_mode();
}

/**
 * lazy_mmu_mode_pause() - Pause the lazy MMU mode.
 *
 * Pauses the lazy MMU mode; if it is currently active, disables it and calls
 * arch_leave_lazy_mmu_mode().
 *
 * Must be paired with a call to lazy_mmu_mode_resume(). Calls to the
 * lazy_mmu_mode_* API have no effect until the matching resume() call.
 *
 * Has no effect if called:
 * - While paused (inside another pause()/resume() pair)
 * - In interrupt context
 */
static inline void lazy_mmu_mode_pause(void)
{
	struct lazy_mmu_state *state = &current->lazy_mmu_state;

	if (in_interrupt())
		return;

	VM_WARN_ON_ONCE(state->pause_count == U8_MAX);

	if (state->pause_count++ == 0 && state->enable_count > 0)
		arch_leave_lazy_mmu_mode();
}

/**
 * lazy_mmu_mode_resume() - Resume the lazy MMU mode.
 *
 * Resumes the lazy MMU mode; if it was active at the point where the matching
 * call to lazy_mmu_mode_pause() was made, re-enables it and calls
 * arch_enter_lazy_mmu_mode().
 *
 * Must match a call to lazy_mmu_mode_pause().
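 *
 * A purely illustrative pairing (names other than the lazy_mmu_mode_* and
 * set_pte_at() calls are placeholders):
 *
 *	lazy_mmu_mode_pause();
 *	set_pte_at(mm, addr, ptep, pte);	takes effect immediately, not batched
 *	lazy_mmu_mode_resume();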
 *
 * Has no effect if called:
 * - While paused (inside another pause()/resume() pair)
 * - In interrupt context
 */
static inline void lazy_mmu_mode_resume(void)
{
	struct lazy_mmu_state *state = &current->lazy_mmu_state;

	if (in_interrupt())
		return;

	VM_WARN_ON_ONCE(state->pause_count == 0);

	if (--state->pause_count == 0 && state->enable_count > 0)
		arch_enter_lazy_mmu_mode();
}
#else
static inline void lazy_mmu_mode_enable(void) {}
static inline void lazy_mmu_mode_disable(void) {}
static inline void lazy_mmu_mode_pause(void) {}
static inline void lazy_mmu_mode_resume(void) {}
#endif

#ifndef pte_batch_hint
/**
 * pte_batch_hint - Number of pages that can be added to batch without scanning.
 * @ptep: Page table pointer for the entry.
 * @pte: Page table entry.
 *
 * Some architectures know that a set of contiguous ptes all map the same
 * contiguous memory with the same permissions. In this case, it can provide a
 * hint to aid pte batching without the core code needing to scan every pte.
 *
 * An architecture implementation may ignore the PTE accessed state. Further,
 * the dirty state must apply atomically to all the PTEs described by the hint.
 *
 * May be overridden by the architecture, else pte_batch_hint is always 1.
 */
static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte)
{
	return 1;
}
#endif

#ifndef pte_advance_pfn
static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
{
	return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT));
}
#endif

#define pte_next_pfn(pte) pte_advance_pfn(pte, 1)

#ifndef set_ptes
/**
 * set_ptes - Map consecutive pages to a contiguous range of addresses.
 * @mm: Address space to map the pages into.
 * @addr: Address to map the first page at.
 * @ptep: Page table pointer for the first entry.
 * @pte: Page table entry for the first page.
 * @nr: Number of pages to map.
 *
 * When nr==1, initial state of pte may be present or not present, and new state
 * may be present or not present. When nr>1, initial state of all ptes must be
 * not present, and new state must be present.
 *
 * May be overridden by the architecture, or the architecture can define
 * set_pte() and PFN_PTE_SHIFT.
 *
 * Context: The caller holds the page table lock. The pages all belong
 * to the same folio. The PTEs are all in the same PMD.
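 *
 * A purely illustrative call, mapping a 4-page folio (names other than
 * set_ptes()/mk_pte() are placeholders):
 *
 *	set_ptes(vma->vm_mm, addr, ptep, mk_pte(&folio->page, vma->vm_page_prot), 4);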
*/ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { page_table_check_ptes_set(mm, addr, ptep, pte, nr); for (;;) { set_pte(ptep, pte); if (--nr == 0) break; ptep++; pte = pte_next_pfn(pte); } } #endif #define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1) #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); #endif #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty); extern int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty); #else static inline int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { BUILD_BUG(); return 0; } static inline int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef ptep_get static inline pte_t ptep_get(pte_t *ptep) { return READ_ONCE(*ptep); } #endif #ifndef pmdp_get static inline pmd_t pmdp_get(pmd_t *pmdp) { return READ_ONCE(*pmdp); } #endif #ifndef pudp_get static inline pud_t pudp_get(pud_t *pudp) { return READ_ONCE(*pudp); } #endif #ifndef p4dp_get static inline p4d_t p4dp_get(p4d_t *p4dp) { return READ_ONCE(*p4dp); } #endif #ifndef pgdp_get static inline pgd_t pgdp_get(pgd_t *pgdp) { return READ_ONCE(*pgdp); } #endif #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { pte_t pte = ptep_get(ptep); bool young = true; if (!pte_young(pte)) young = false; else set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte)); return young; } #endif #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) static inline bool pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_t pmd = *pmdp; bool young = true; if (!pmd_young(pmd)) young = false; else set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd)); return young; } #else static inline bool pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return false; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH bool ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); #endif #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH #ifdef CONFIG_TRANSPARENT_HUGEPAGE bool pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #else /* * Despite relevant to THP only, this API is called from generic rmap code * under PageTransHuge(), hence needs a dummy implementation for !THP */ static inline bool pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return false; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef arch_has_hw_nonleaf_pmd_young /* * Return whether the accessed bit in non-leaf PMD entries is supported on the * local CPU. 
*/ static inline bool arch_has_hw_nonleaf_pmd_young(void) { return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG); } #endif #ifndef arch_has_hw_pte_young /* * Return whether the accessed bit is supported on the local CPU. * * This stub assumes accessing through an old PTE triggers a page fault. * Architectures that automatically set the access bit should overwrite it. */ static inline bool arch_has_hw_pte_young(void) { return IS_ENABLED(CONFIG_ARCH_HAS_HW_PTE_YOUNG); } #endif #ifndef exec_folio_order /* * Returns preferred minimum folio order for executable file-backed memory. Must * be in range [0, PMD_ORDER). Default to order-0. */ static inline unsigned int exec_folio_order(void) { return 0; } #endif #ifndef arch_check_zapped_pte static inline void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte) { } #endif #ifndef arch_check_zapped_pmd static inline void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd) { } #endif #ifndef arch_check_zapped_pud static inline void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud) { } #endif #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t pte = ptep_get(ptep); pte_clear(mm, address, ptep); page_table_check_pte_clear(mm, address, pte); return pte; } #endif #ifndef clear_young_dirty_ptes /** * clear_young_dirty_ptes - Mark PTEs that map consecutive pages of the * same folio as old/clean. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to mark old/clean. * @flags: Flags to modify the PTE batch semantics. * * May be overridden by the architecture; otherwise, implemented by * get_and_clear/modify/set for each pte in the range. * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline void clear_young_dirty_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr, cydp_t flags) { pte_t pte; for (;;) { if (flags == CYDP_CLEAR_YOUNG) ptep_test_and_clear_young(vma, addr, ptep); else { pte = ptep_get_and_clear(vma->vm_mm, addr, ptep); if (flags & CYDP_CLEAR_YOUNG) pte = pte_mkold(pte); if (flags & CYDP_CLEAR_DIRTY) pte = pte_mkclean(pte); set_pte_at(vma->vm_mm, addr, ptep, pte); } if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #endif static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = ptep_get(ptep); pte_clear(mm, addr, ptep); /* * No need for ptep_get_and_clear(): page table check doesn't care about * any bits that could have been set by HW concurrently. */ page_table_check_pte_clear(mm, addr, pte); } #ifdef CONFIG_GUP_GET_PXX_LOW_HIGH /* * For walking the pagetables without holding any locks. Some architectures * (eg x86-32 PAE) cannot load the entries atomically without using expensive * instructions. We are guaranteed that a PTE will only either go from not * present to present, or present to not present -- it will not switch to a * completely different present page without a TLB flush inbetween; which we * are blocking by holding interrupts off. 
* * Setting ptes from not present to present goes: * * ptep->pte_high = h; * smp_wmb(); * ptep->pte_low = l; * * And present to not present goes: * * ptep->pte_low = 0; * smp_wmb(); * ptep->pte_high = 0; * * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'. * We load pte_high *after* loading pte_low, which ensures we don't see an older * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't * picked up a changed pte high. We might have gotten rubbish values from * pte_low and pte_high, but we are guaranteed that pte_low will not have the * present bit set *unless* it is 'l'. Because get_user_pages_fast() only * operates on present ptes we're safe. */ static inline pte_t ptep_get_lockless(pte_t *ptep) { pte_t pte; do { pte.pte_low = ptep->pte_low; smp_rmb(); pte.pte_high = ptep->pte_high; smp_rmb(); } while (unlikely(pte.pte_low != ptep->pte_low)); return pte; } #define ptep_get_lockless ptep_get_lockless #if CONFIG_PGTABLE_LEVELS > 2 static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) { pmd_t pmd; do { pmd.pmd_low = pmdp->pmd_low; smp_rmb(); pmd.pmd_high = pmdp->pmd_high; smp_rmb(); } while (unlikely(pmd.pmd_low != pmdp->pmd_low)); return pmd; } #define pmdp_get_lockless pmdp_get_lockless #define pmdp_get_lockless_sync() tlb_remove_table_sync_one() #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */ /* * We require that the PTE can be read atomically. */ #ifndef ptep_get_lockless static inline pte_t ptep_get_lockless(pte_t *ptep) { return ptep_get(ptep); } #endif #ifndef pmdp_get_lockless static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) { return pmdp_get(pmdp); } static inline void pmdp_get_lockless_sync(void) { } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { pmd_t pmd = *pmdp; pmd_clear(pmdp); page_table_check_pmd_clear(mm, address, pmd); return pmd; } #endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */ #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pud_t *pudp) { pud_t pud = *pudp; pud_clear(pudp); page_table_check_pud_clear(mm, address, pud); return pud; } #endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, int full) { return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); } #endif #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL static inline pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, int full) { return pudp_huge_get_and_clear(vma->vm_mm, address, pudp); } #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long address, pte_t *ptep, int full) { return ptep_get_and_clear(mm, address, ptep); } #endif #ifndef get_and_clear_full_ptes /** * get_and_clear_full_ptes - Clear present PTEs that map consecutive pages of * the same folio, collecting dirty/accessed bits. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * @full: Whether we are clearing a full mm. 
* * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_get_and_clear_full(), merging dirty/accessed bits into the * returned PTE. * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { pte_t pte, tmp_pte; pte = ptep_get_and_clear_full(mm, addr, ptep, full); while (--nr) { ptep++; addr += PAGE_SIZE; tmp_pte = ptep_get_and_clear_full(mm, addr, ptep, full); if (pte_dirty(tmp_pte)) pte = pte_mkdirty(pte); if (pte_young(tmp_pte)) pte = pte_mkyoung(pte); } return pte; } #endif /** * get_and_clear_ptes - Clear present PTEs that map consecutive pages of * the same folio, collecting dirty/accessed bits. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * * Use this instead of get_and_clear_full_ptes() if it is known that we don't * need to clear the full mm, which is mostly the case. * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline pte_t get_and_clear_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr) { return get_and_clear_full_ptes(mm, addr, ptep, nr, 0); } #ifndef clear_full_ptes /** * clear_full_ptes - Clear present PTEs that map consecutive pages of the same * folio. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * @full: Whether we are clearing a full mm. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_get_and_clear_full(). * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { for (;;) { ptep_get_and_clear_full(mm, addr, ptep, full); if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #endif /** * clear_ptes - Clear present PTEs that map consecutive pages of the same folio. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * * Use this instead of clear_full_ptes() if it is known that we don't need to * clear the full mm, which is mostly the case. * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. 
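 *
 * Illustrative only: clear_ptes() discards the old PTE state; a caller that
 * still needs the folio's dirty/accessed bits would use get_and_clear_ptes()
 * instead and inspect the returned PTE, e.g.:
 *
 *	pte = get_and_clear_ptes(mm, addr, ptep, nr);
 *	if (pte_dirty(pte))
 *		folio_mark_dirty(folio);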
*/ static inline void clear_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr) { clear_full_ptes(mm, addr, ptep, nr, 0); } /* * If two threads concurrently fault at the same page, the thread that * won the race updates the PTE and its local TLB/Cache. The other thread * gives up, simply does nothing, and continues; on architectures where * software can update TLB, local TLB can be updated here to avoid next page * fault. This function updates TLB only, do nothing with cache or others. * It is the difference with function update_mmu_cache. */ #ifndef update_mmu_tlb_range static inline void update_mmu_tlb_range(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, unsigned int nr) { } #endif static inline void update_mmu_tlb(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { update_mmu_tlb_range(vma, address, ptep, 1); } /* * Some architectures may be able to avoid expensive synchronization * primitives when modifications are made to PTE's which are already * not present, or in the process of an address space destruction. */ #ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL static inline void pte_clear_not_present_full(struct mm_struct *mm, unsigned long address, pte_t *ptep, int full) { pte_clear(mm, address, ptep); } #endif #ifndef clear_not_present_full_ptes /** * clear_not_present_full_ptes - Clear multiple not present PTEs which are * consecutive in the pgtable. * @mm: Address space the ptes represent. * @addr: Address of the first pte. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * @full: Whether we are clearing a full mm. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over pte_clear_not_present_full(). * * Context: The caller holds the page table lock. The PTEs are all not present. * The PTEs are all in the same PMD. */ static inline void clear_not_present_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { for (;;) { pte_clear_not_present_full(mm, addr, ptep, full); if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH extern pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); #endif #ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pud_t *pudp); #endif #ifndef pte_mkwrite static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) { return pte_mkwrite_novma(pte); } #endif #if defined(CONFIG_ARCH_WANT_PMD_MKWRITE) && !defined(pmd_mkwrite) static inline pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { return pmd_mkwrite_novma(pmd); } #endif #ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT struct mm_struct; static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t old_pte = ptep_get(ptep); set_pte_at(mm, address, ptep, pte_wrprotect(old_pte)); } #endif #ifndef wrprotect_ptes /** * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same * folio. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to write-protect. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_set_wrprotect(). 
* * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr) { for (;;) { ptep_set_wrprotect(mm, addr, ptep); if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #endif #ifndef clear_flush_young_ptes /** * clear_flush_young_ptes - Mark PTEs that map consecutive pages of the same * folio as old and flush the TLB. * @vma: The virtual memory area the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear access bit. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_clear_flush_young(). * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline bool clear_flush_young_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { bool young = false; for (;;) { young |= ptep_clear_flush_young(vma, addr, ptep); if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } return young; } #endif #ifndef test_and_clear_young_ptes /** * test_and_clear_young_ptes - Mark PTEs that map consecutive pages of the same * folio as old * @vma: The virtual memory area the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear access bit. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_test_and_clear_young(). * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. * * Returns: whether any PTE was young. */ static inline bool test_and_clear_young_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { bool young = false; for (;;) { young |= ptep_test_and_clear_young(vma, addr, ptep); if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } return young; } #endif /* * On some architectures hardware does not set page access bit when accessing * memory page, it is responsibility of software setting this bit. It brings * out extra page fault penalty to track page access bit. For optimization page * access bit can be set during all page fault flow on these arches. * To be differentiate with macro pte_mkyoung, this macro is used on platforms * where software maintains page access bit. 
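 *
 * Illustrative fault-path usage (mirroring how generic code might apply it;
 * the surrounding names are placeholders):
 *
 *	entry = pte_sw_mkyoung(entry);
 *	set_pte_at(vma->vm_mm, addr, ptep, entry);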
*/ #ifndef pte_sw_mkyoung static inline pte_t pte_sw_mkyoung(pte_t pte) { return pte; } #define pte_sw_mkyoung pte_sw_mkyoung #endif #ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { pmd_t old_pmd = *pmdp; set_pmd_at(mm, address, pmdp, pmd_wrprotect(old_pmd)); } #else static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long address, pud_t *pudp) { pud_t old_pud = *pudp; set_pud_at(mm, address, pudp, pud_wrprotect(old_pud)); } #else static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long address, pud_t *pudp) { BUILD_BUG(); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #endif #ifndef pmdp_collapse_flush #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #else static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return *pmdp; } #define pmdp_collapse_flush pmdp_collapse_flush #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable); #endif #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #endif #ifndef arch_needs_pgtable_deposit #define arch_needs_pgtable_deposit() (false) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * This is an implementation of pmdp_establish() that is only suitable for an * architecture that doesn't have hardware dirty/accessed bits. In this case we * can't race with CPU which sets these bits and non-atomic approach is fine. */ static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { pmd_t old_pmd = *pmdp; set_pmd_at(vma->vm_mm, address, pmdp, pmd); return old_pmd; } #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD /* * pmdp_invalidate_ad() invalidates the PMD while changing a transparent * hugepage mapping in the page tables. This function is similar to * pmdp_invalidate(), but should only be used if the access and dirty bits would * not be cleared by the software in the new PMD value. The function ensures * that hardware changes of the access and dirty bits updates would not be lost. * * Doing so can allow in certain architectures to avoid a TLB flush in most * cases. Yet, another TLB flush might be necessary later if the PMD update * itself requires such flush (e.g., if protection was set to be stricter). Yet, * even when a TLB flush is needed because of the update, the caller may be able * to batch these TLB flushing operations, so fewer TLB flush operations are * needed. 
*/ extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) { return pte_val(pte_a) == pte_val(pte_b); } #endif #ifndef __HAVE_ARCH_PTE_UNUSED /* * Some architectures provide facilities to virtualization guests * so that they can flag allocated pages as unused. This allows the * host to transparently reclaim unused pages. This function returns * whether the pte's page is unused. */ static inline int pte_unused(pte_t pte) { return 0; } #endif #ifndef pte_access_permitted #define pte_access_permitted(pte, write) \ (pte_present(pte) && (!(write) || pte_write(pte))) #endif #ifndef pmd_access_permitted #define pmd_access_permitted(pmd, write) \ (pmd_present(pmd) && (!(write) || pmd_write(pmd))) #endif #ifndef pud_access_permitted #define pud_access_permitted(pud, write) \ (pud_present(pud) && (!(write) || pud_write(pud))) #endif #ifndef p4d_access_permitted #define p4d_access_permitted(p4d, write) \ (p4d_present(p4d) && (!(write) || p4d_write(p4d))) #endif #ifndef pgd_access_permitted #define pgd_access_permitted(pgd, write) \ (pgd_present(pgd) && (!(write) || pgd_write(pgd))) #endif #ifndef __HAVE_ARCH_PMD_SAME static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) { return pmd_val(pmd_a) == pmd_val(pmd_b); } #endif #ifndef pud_same static inline int pud_same(pud_t pud_a, pud_t pud_b) { return pud_val(pud_a) == pud_val(pud_b); } #define pud_same pud_same #endif #ifndef __HAVE_ARCH_P4D_SAME static inline int p4d_same(p4d_t p4d_a, p4d_t p4d_b) { return p4d_val(p4d_a) == p4d_val(p4d_b); } #endif #ifndef __HAVE_ARCH_PGD_SAME static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) { return pgd_val(pgd_a) == pgd_val(pgd_b); } #endif #ifndef __HAVE_ARCH_DO_SWAP_PAGE static inline void arch_do_swap_page_nr(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte, pte_t oldpte, int nr) { } #else /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be * restored when the page is swapped back in. SPARC M7 and newer * processors support an ADI (Application Data Integrity) tag for the * page as metadata for the page. arch_do_swap_page() can restore this * metadata when a page is swapped back in. */ static inline void arch_do_swap_page_nr(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte, pte_t oldpte, int nr) { for (int i = 0; i < nr; i++) { arch_do_swap_page(vma->vm_mm, vma, addr + i * PAGE_SIZE, pte_advance_pfn(pte, i), pte_advance_pfn(oldpte, i)); } } #endif #ifndef __HAVE_ARCH_UNMAP_ONE /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be * restored when the page is swapped back in. SPARC M7 and newer * processors support an ADI (Application Data Integrity) tag for the * page as metadata for the page. arch_unmap_one() can save this * metadata on a swap-out of a page. */ static inline int arch_unmap_one(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t orig_pte) { return 0; } #endif /* * Allow architectures to preserve additional metadata associated with * swapped-out pages. The corresponding __HAVE_ARCH_SWAP_* macros and function * prototypes must be defined in the arch-specific asm/pgtable.h file. 
*/ #ifndef __HAVE_ARCH_PREPARE_TO_SWAP static inline int arch_prepare_to_swap(struct folio *folio) { return 0; } #endif #ifndef __HAVE_ARCH_SWAP_INVALIDATE static inline void arch_swap_invalidate_page(int type, pgoff_t offset) { } static inline void arch_swap_invalidate_area(int type) { } #endif #ifndef __HAVE_ARCH_SWAP_RESTORE static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) { } #endif #ifndef __HAVE_ARCH_MOVE_PTE #define move_pte(pte, old_addr, new_addr) (pte) #endif #ifndef pte_accessible # define pte_accessible(mm, pte) ((void)(pte), 1) #endif #ifndef flush_tlb_fix_spurious_fault #define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address) #endif #ifndef flush_tlb_fix_spurious_fault_pmd #define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) do { } while (0) #endif /* * When walking page tables, get the address of the next boundary, * or the end address of the range if that comes earlier. Although no * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout. */ #define pgd_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #ifndef p4d_addr_end #define p4d_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + P4D_SIZE) & P4D_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif #ifndef pud_addr_end #define pud_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PUD_SIZE) & PUD_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif #ifndef pmd_addr_end #define pmd_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif /* * When walking page tables, we usually want to skip any p?d_none entries; * and any p?d_bad entries - reporting the error before resetting to none. * Do the tests inline, but report and clear the bad entry in mm/memory.c. */ void pgd_clear_bad(pgd_t *); #ifndef __PAGETABLE_P4D_FOLDED void p4d_clear_bad(p4d_t *); #else #define p4d_clear_bad(p4d) do { } while (0) #endif #ifndef __PAGETABLE_PUD_FOLDED void pud_clear_bad(pud_t *); #else #define pud_clear_bad(p4d) do { } while (0) #endif void pmd_clear_bad(pmd_t *); static inline int pgd_none_or_clear_bad(pgd_t *pgd) { if (pgd_none(*pgd)) return 1; if (unlikely(pgd_bad(*pgd))) { pgd_clear_bad(pgd); return 1; } return 0; } static inline int p4d_none_or_clear_bad(p4d_t *p4d) { if (p4d_none(*p4d)) return 1; if (unlikely(p4d_bad(*p4d))) { p4d_clear_bad(p4d); return 1; } return 0; } static inline int pud_none_or_clear_bad(pud_t *pud) { if (pud_none(*pud)) return 1; if (unlikely(pud_bad(*pud))) { pud_clear_bad(pud); return 1; } return 0; } static inline int pmd_none_or_clear_bad(pmd_t *pmd) { if (pmd_none(*pmd)) return 1; if (unlikely(pmd_bad(*pmd))) { pmd_clear_bad(pmd); return 1; } return 0; } static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { /* * Get the current pte state, but zero it out to make it * non-present, preventing the hardware from asynchronously * updating it. */ return ptep_get_and_clear(vma->vm_mm, addr, ptep); } static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte) { /* * The pte is non-present, so there's no hardware state to * preserve. 
*/ set_pte_at(vma->vm_mm, addr, ptep, pte); } #ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION /* * Start a pte protection read-modify-write transaction, which * protects against asynchronous hardware modifications to the pte. * The intention is not to prevent the hardware from making pte * updates, but to prevent any updates it may make from being lost. * * This does not protect against other software modifications of the * pte; the appropriate pte lock must be held over the transaction. * * Note that this interface is intended to be batchable, meaning that * ptep_modify_prot_commit may not actually update the pte, but merely * queue the update to be done at some later time. The update must be * actually committed before the pte lock is released, however. */ static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return __ptep_modify_prot_start(vma, addr, ptep); } /* * Commit an update to a pte, leaving any hardware-controlled bits in * the PTE unmodified. The pte returned from ptep_modify_prot_start() may * additionally have young and/or dirty bits set where previously they were not, * so the updated pte may have these additional changes. */ static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { __ptep_modify_prot_commit(vma, addr, ptep, pte); } #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */ /** * modify_prot_start_ptes - Start a pte protection read-modify-write transaction * over a batch of ptes, which protects against asynchronous hardware * modifications to the ptes. The intention is not to prevent the hardware from * making pte updates, but to prevent any updates it may make from being lost. * Please see the comment above ptep_modify_prot_start() for full description. * * @vma: The virtual memory area the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_modify_prot_start(), collecting the a/d bits from each pte * in the batch. * * Note that PTE bits in the PTE batch besides the PFN can differ. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. All other PTE bits must be identical for * all PTEs in the batch except for young and dirty bits. The PTEs are all in * the same PMD. */ #ifndef modify_prot_start_ptes static inline pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { pte_t pte, tmp_pte; pte = ptep_modify_prot_start(vma, addr, ptep); while (--nr) { ptep++; addr += PAGE_SIZE; tmp_pte = ptep_modify_prot_start(vma, addr, ptep); if (pte_dirty(tmp_pte)) pte = pte_mkdirty(pte); if (pte_young(tmp_pte)) pte = pte_mkyoung(pte); } return pte; } #endif /** * modify_prot_commit_ptes - Commit an update to a batch of ptes, leaving any * hardware-controlled bits in the PTE unmodified. * * @vma: The virtual memory area the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @old_pte: Old page table entry (for the first entry) which is now cleared. * @pte: New page table entry to be set. * @nr: Number of entries. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_modify_prot_commit(). * * Context: The caller holds the page table lock. 
The PTEs are all in the same * PMD. On exit, the set ptes in the batch map the same folio. The ptes set by * ptep_modify_prot_start() may additionally have young and/or dirty bits set * where previously they were not, so the updated ptes may have these * additional changes. */ #ifndef modify_prot_commit_ptes static inline void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte, unsigned int nr) { int i; for (i = 0; i < nr; ++i, ++ptep, addr += PAGE_SIZE) { ptep_modify_prot_commit(vma, addr, ptep, old_pte, pte); /* Advance PFN only, set same prot */ old_pte = pte_next_pfn(old_pte); pte = pte_next_pfn(pte); } } #endif /* * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values * and let generic vmalloc, ioremap and page table update code know when * arch_sync_kernel_mappings() needs to be called. */ #ifndef ARCH_PAGE_TABLE_SYNC_MASK #define ARCH_PAGE_TABLE_SYNC_MASK 0 #endif /* * There is no default implementation for arch_sync_kernel_mappings(). It is * relied upon the compiler to optimize calls out if ARCH_PAGE_TABLE_SYNC_MASK * is 0. */ void arch_sync_kernel_mappings(unsigned long start, unsigned long end); #endif /* CONFIG_MMU */ /* * On almost all architectures and configurations, 0 can be used as the * upper ceiling to free_pgtables(): on many architectures it has the same * effect as using TASK_SIZE. However, there is one configuration which * must impose a more careful limit, to avoid freeing kernel pgtables. */ #ifndef USER_PGTABLES_CEILING #define USER_PGTABLES_CEILING 0UL #endif /* * This defines the first usable user address. Platforms * can override its value with custom FIRST_USER_ADDRESS * defined in their respective <asm/pgtable.h>. */ #ifndef FIRST_USER_ADDRESS #define FIRST_USER_ADDRESS 0UL #endif /* * No-op macros that just return the current protection value. Defined here * because these macros can be used even if CONFIG_MMU is not defined. */ #ifndef pgprot_nx #define pgprot_nx(prot) (prot) #endif #ifndef pgprot_noncached #define pgprot_noncached(prot) (prot) #endif #ifndef pgprot_writecombine #define pgprot_writecombine pgprot_noncached #endif #ifndef pgprot_writethrough #define pgprot_writethrough pgprot_noncached #endif #ifndef pgprot_device #define pgprot_device pgprot_noncached #endif #ifndef pgprot_mhp #define pgprot_mhp(prot) (prot) #endif #ifdef CONFIG_MMU #ifndef pgprot_modify #define pgprot_modify pgprot_modify static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) { if (pgprot_val(oldprot) == pgprot_val(pgprot_noncached(oldprot))) newprot = pgprot_noncached(newprot); if (pgprot_val(oldprot) == pgprot_val(pgprot_writecombine(oldprot))) newprot = pgprot_writecombine(newprot); if (pgprot_val(oldprot) == pgprot_val(pgprot_device(oldprot))) newprot = pgprot_device(newprot); return newprot; } #endif #endif /* CONFIG_MMU */ #ifndef pgprot_encrypted #define pgprot_encrypted(prot) (prot) #endif #ifndef pgprot_decrypted #define pgprot_decrypted(prot) (prot) #endif /* * A facility to provide batching of the reload of page tables and * other process state with the actual context switch code for * paravirtualized guests. By convention, only one of the batched * update (lazy) modes (CPU, MMU) should be active at any given time, * entry should never be nested, and entry and exits should always be * paired. This is for sanity of maintaining and reasoning about the * kernel code. 
In this case, the exit (end of the context switch) is * in architecture-specific code, and so doesn't need a generic * definition. */ #ifndef __HAVE_ARCH_START_CONTEXT_SWITCH #define arch_start_context_switch(prev) do {} while (0) #endif /* * Some platforms can customize the PTE soft-dirty bit making it unavailable * even if the architecture provides the resource. * Adding this API allows architectures to add their own checks for the * devices on which the kernel is running. * Note: When overriding it, please make sure the CONFIG_MEM_SOFT_DIRTY * is part of this macro. */ #ifndef pgtable_supports_soft_dirty #define pgtable_supports_soft_dirty() IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) #endif #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY #ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) { return pmd; } static inline int pmd_swp_soft_dirty(pmd_t pmd) { return 0; } static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) { return pmd; } #endif #else /* !CONFIG_HAVE_ARCH_SOFT_DIRTY */ static inline int pte_soft_dirty(pte_t pte) { return 0; } static inline int pmd_soft_dirty(pmd_t pmd) { return 0; } static inline pte_t pte_mksoft_dirty(pte_t pte) { return pte; } static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) { return pmd; } static inline pte_t pte_clear_soft_dirty(pte_t pte) { return pte; } static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) { return pmd; } static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { return pte; } static inline int pte_swp_soft_dirty(pte_t pte) { return 0; } static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { return pte; } static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) { return pmd; } static inline int pmd_swp_soft_dirty(pmd_t pmd) { return 0; } static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) { return pmd; } #endif #ifndef __HAVE_PFNMAP_TRACKING /* * Interfaces that can be used by architecture code to keep track of * memory type of pfn mappings specified by the remap_pfn_range, * vmf_insert_pfn. */ static inline int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size, pgprot_t *prot) { return 0; } static inline int pfnmap_track(unsigned long pfn, unsigned long size, pgprot_t *prot) { return 0; } static inline void pfnmap_untrack(unsigned long pfn, unsigned long size) { } #else /** * pfnmap_setup_cachemode - setup the cachemode in the pgprot for a pfn range * @pfn: the start of the pfn range * @size: the size of the pfn range in bytes * @prot: the pgprot to modify * * Lookup the cachemode for the pfn range starting at @pfn with the size * @size and store it in @prot, leaving other data in @prot unchanged. * * This allows for a hardware implementation to have fine-grained control of * memory cache behavior at page level granularity. Without a hardware * implementation, this function does nothing. * * Currently there is only one implementation for this - x86 Page Attribute * Table (PAT). See Documentation/arch/x86/pat.rst for more details. * * This function can fail if the pfn range spans pfns that require differing * cachemodes. If the pfn range was previously verified to have a single * cachemode, it is sufficient to query only a single pfn. The assumption is * that this is the case for drivers using the vmf_insert_pfn*() interface. * * Returns 0 on success and -EINVAL on error. 
*/ int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size, pgprot_t *prot); /** * pfnmap_track - track a pfn range * @pfn: the start of the pfn range * @size: the size of the pfn range in bytes * @prot: the pgprot to track * * Requested the pfn range to be 'tracked' by a hardware implementation and * setup the cachemode in @prot similar to pfnmap_setup_cachemode(). * * This allows for fine-grained control of memory cache behaviour at page * level granularity. Tracking memory this way is persisted across VMA splits * (VMA merging does not apply for VM_PFNMAP). * * Currently, there is only one implementation for this - x86 Page Attribute * Table (PAT). See Documentation/arch/x86/pat.rst for more details. * * Returns 0 on success and -EINVAL on error. */ int pfnmap_track(unsigned long pfn, unsigned long size, pgprot_t *prot); /** * pfnmap_untrack - untrack a pfn range * @pfn: the start of the pfn range * @size: the size of the pfn range in bytes * * Untrack a pfn range previously tracked through pfnmap_track(). */ void pfnmap_untrack(unsigned long pfn, unsigned long size); #endif /** * pfnmap_setup_cachemode_pfn - setup the cachemode in the pgprot for a pfn * @pfn: the pfn * @prot: the pgprot to modify * * Lookup the cachemode for @pfn and store it in @prot, leaving other * data in @prot unchanged. * * See pfnmap_setup_cachemode() for details. */ static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot) { pfnmap_setup_cachemode(pfn, PAGE_SIZE, prot); } /* * ZERO_PAGE() is global shared page(s) that is always zero. It is used for * zero-mapped memory areas, CoW etc. * * On architectures that __HAVE_COLOR_ZERO_PAGE there are several such pages * for different ranges in the virtual address space. * * zero_page_pfn identifies the first (or the only) pfn for these pages. * * For architectures that don't __HAVE_COLOR_ZERO_PAGE the zero page lives in * empty_zero_page in BSS. 
*/ void arch_setup_zero_pages(void); #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) { extern unsigned long zero_page_pfn; unsigned long offset_from_zero_pfn = pfn - zero_page_pfn; return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); } #define zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) #else static inline int is_zero_pfn(unsigned long pfn) { extern unsigned long zero_page_pfn; return pfn == zero_page_pfn; } static inline unsigned long zero_pfn(unsigned long addr) { extern unsigned long zero_page_pfn; return zero_page_pfn; } extern uint8_t empty_zero_page[PAGE_SIZE]; extern struct page *__zero_page; static inline struct page *_zero_page(unsigned long addr) { return __zero_page; } #define ZERO_PAGE(vaddr) _zero_page(vaddr) #endif /* __HAVE_COLOR_ZERO_PAGE */ #ifdef CONFIG_MMU #ifndef CONFIG_TRANSPARENT_HUGEPAGE static inline int pmd_trans_huge(pmd_t pmd) { return 0; } #ifndef pmd_write static inline int pmd_write(pmd_t pmd) { BUG(); return 0; } #endif /* pmd_write */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifndef pud_write static inline int pud_write(pud_t pud) { BUG(); return 0; } #endif /* pud_write */ #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \ !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) static inline int pud_trans_huge(pud_t pud) { return 0; } #endif static inline int pud_trans_unstable(pud_t *pud) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) pud_t pudval = pudp_get(pud); if (pud_none(pudval) || pud_trans_huge(pudval)) return 1; if (unlikely(pud_bad(pudval))) { pud_clear_bad(pud); return 1; } #endif return 0; } #ifndef CONFIG_NUMA_BALANCING /* * In an inaccessible (PROT_NONE) VMA, pte_protnone() may indicate "yes". It is * perfectly valid to indicate "no" in that case, which is why our default * implementation defaults to "always no". * * In an accessible VMA, however, pte_protnone() reliably indicates PROT_NONE * page protection due to NUMA hinting. NUMA hinting faults only apply in * accessible VMAs. * * So, to reliably identify PROT_NONE PTEs that require a NUMA hinting fault, * looking at the VMA accessibility is sufficient. 
*/ static inline int pte_protnone(pte_t pte) { return 0; } static inline int pmd_protnone(pmd_t pmd) { return 0; } #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_MMU */ #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP #ifndef __PAGETABLE_P4D_FOLDED int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot); void p4d_clear_huge(p4d_t *p4d); #else static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } static inline void p4d_clear_huge(p4d_t *p4d) { } #endif /* !__PAGETABLE_P4D_FOLDED */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot); int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot); int pud_clear_huge(pud_t *pud); int pmd_clear_huge(pmd_t *pmd); int p4d_free_pud_page(p4d_t *p4d, unsigned long addr); int pud_free_pmd_page(pud_t *pud, unsigned long addr); int pmd_free_pte_page(pmd_t *pmd, unsigned long addr); #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { return 0; } static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { return 0; } static inline void p4d_clear_huge(p4d_t *p4d) { } static inline int pud_clear_huge(pud_t *pud) { return 0; } static inline int pmd_clear_huge(pmd_t *pmd) { return 0; } static inline int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) { return 0; } static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr) { return 0; } static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { return 0; } #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ #ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * ARCHes with special requirements for evicting THP backing TLB entries can * implement this. Otherwise also, it can help optimize normal TLB flush in * THP regime. Stock flush_tlb_range() typically has optimization to nuke the * entire TLB if flush span is greater than a threshold, which will * likely be true for a single huge page. Thus a single THP flush will * invalidate the entire TLB which is not desirable. * e.g. see arch/arc: flush_pmd_tlb_range */ #define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #define flush_pud_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #else #define flush_pmd_tlb_range(vma, addr, end) BUILD_BUG() #define flush_pud_tlb_range(vma, addr, end) BUILD_BUG() #endif #endif struct file; int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot); #ifndef CONFIG_X86_ESPFIX64 static inline void init_espfix_bsp(void) { } #endif extern void __init pgtable_cache_init(void); #ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) { return true; } static inline bool arch_has_pfn_modify_check(void) { return false; } #endif /* !_HAVE_ARCH_PFN_MODIFY_ALLOWED */ /* * Architecture PAGE_KERNEL_* fallbacks * * Some architectures don't define certain PAGE_KERNEL_* flags. This is either * because they really don't support them, or the port needs to be updated to * reflect the required functionality. Below are a set of relatively safe * fallbacks, as best effort, which we can count on in lieu of the architectures * not defining them on their own yet. */ #ifndef PAGE_KERNEL_RO # define PAGE_KERNEL_RO PAGE_KERNEL #endif #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL #endif /* * Page Table Modification bits for pgtbl_mod_mask. 
* * These are used by the p?d_alloc_track*() and p*d_populate_kernel() * functions in the generic vmalloc, ioremap and page table update code * to track at which page-table levels entries have been modified. * Based on that the code can better decide when page table changes need * to be synchronized to other page-tables in the system. */ #define __PGTBL_PGD_MODIFIED 0 #define __PGTBL_P4D_MODIFIED 1 #define __PGTBL_PUD_MODIFIED 2 #define __PGTBL_PMD_MODIFIED 3 #define __PGTBL_PTE_MODIFIED 4 #define PGTBL_PGD_MODIFIED BIT(__PGTBL_PGD_MODIFIED) #define PGTBL_P4D_MODIFIED BIT(__PGTBL_P4D_MODIFIED) #define PGTBL_PUD_MODIFIED BIT(__PGTBL_PUD_MODIFIED) #define PGTBL_PMD_MODIFIED BIT(__PGTBL_PMD_MODIFIED) #define PGTBL_PTE_MODIFIED BIT(__PGTBL_PTE_MODIFIED) /* Page-Table Modification Mask */ typedef unsigned int pgtbl_mod_mask; enum pgtable_level { PGTABLE_LEVEL_PTE = 0, PGTABLE_LEVEL_PMD, PGTABLE_LEVEL_PUD, PGTABLE_LEVEL_P4D, PGTABLE_LEVEL_PGD, }; static inline const char *pgtable_level_to_str(enum pgtable_level level) { switch (level) { case PGTABLE_LEVEL_PTE: return "pte"; case PGTABLE_LEVEL_PMD: return "pmd"; case PGTABLE_LEVEL_PUD: return "pud"; case PGTABLE_LEVEL_P4D: return "p4d"; case PGTABLE_LEVEL_PGD: return "pgd"; default: return "unknown"; } } #endif /* !__ASSEMBLY__ */ #if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT) #ifdef CONFIG_PHYS_ADDR_T_64BIT /* * ZSMALLOC needs to know the highest PFN on 32-bit architectures * with physical address space extension, but falls back to * BITS_PER_LONG otherwise. */ #error Missing MAX_POSSIBLE_PHYSMEM_BITS definition #else #define MAX_POSSIBLE_PHYSMEM_BITS 32 #endif #endif #ifndef has_transparent_hugepage #define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE) #endif #ifndef has_transparent_pud_hugepage #define has_transparent_pud_hugepage() IS_BUILTIN(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) #endif /* * On some architectures it depends on the mm if the p4d/pud or pmd * layer of the page table hierarchy is folded or not. */ #ifndef mm_p4d_folded #define mm_p4d_folded(mm) __is_defined(__PAGETABLE_P4D_FOLDED) #endif #ifndef mm_pud_folded #define mm_pud_folded(mm) __is_defined(__PAGETABLE_PUD_FOLDED) #endif #ifndef mm_pmd_folded #define mm_pmd_folded(mm) __is_defined(__PAGETABLE_PMD_FOLDED) #endif #ifndef p4d_offset_lockless #define p4d_offset_lockless(pgdp, pgd, address) p4d_offset(&(pgd), address) #endif #ifndef pud_offset_lockless #define pud_offset_lockless(p4dp, p4d, address) pud_offset(&(p4d), address) #endif #ifndef pmd_offset_lockless #define pmd_offset_lockless(pudp, pud, address) pmd_offset(&(pud), address) #endif /* * pXd_leaf() is the API to check whether a pgtable entry is a huge page * mapping. It should work globally across all archs, without any * dependency on CONFIG_* options. For architectures that do not support * huge mappings on specific levels, below fallbacks will be used. * * A leaf pgtable entry should always imply the following: * * - It is a "present" entry. IOW, before using this API, please check it * with pXd_present() first. NOTE: it may not always mean the "present * bit" is set. For example, PROT_NONE entries are always "present". * * - It should _never_ be a swap entry of any type. Above "present" check * should have guarded this, but let's be crystal clear on this. * * - It should contain a huge PFN, which points to a huge page larger than * PAGE_SIZE of the platform. The PFN format isn't important here. * * - It should cover all kinds of huge mappings (i.e. 
pXd_trans_huge()
 *   or hugetlb mappings).
 */
#ifndef pgd_leaf
#define pgd_leaf(x)	false
#endif
#ifndef p4d_leaf
#define p4d_leaf(x)	false
#endif
#ifndef pud_leaf
#define pud_leaf(x)	false
#endif
#ifndef pmd_leaf
#define pmd_leaf(x)	false
#endif

#ifndef pgd_leaf_size
#define pgd_leaf_size(x) (1ULL << PGDIR_SHIFT)
#endif
#ifndef p4d_leaf_size
#define p4d_leaf_size(x) P4D_SIZE
#endif
#ifndef pud_leaf_size
#define pud_leaf_size(x) PUD_SIZE
#endif
#ifndef pmd_leaf_size
#define pmd_leaf_size(x) PMD_SIZE
#endif
#ifndef __pte_leaf_size
#ifndef pte_leaf_size
#define pte_leaf_size(x) PAGE_SIZE
#endif
#define __pte_leaf_size(x,y) pte_leaf_size(y)
#endif

/*
 * We always define pmd_pfn for all archs as it's used in lots of generic
 * code.  Now it happens too for pud_pfn (and can happen for larger
 * mappings too in the future; we're not there yet).  Instead of defining
 * it for all archs (like pmd_pfn), provide a fallback.
 *
 * Note that returning 0 here means any arch that didn't define this can
 * get severely wrong when it hits a real pud leaf.  It's arch's
 * responsibility to properly define it when a huge pud is possible.
 */
#ifndef pud_pfn
#define pud_pfn(x) 0
#endif

/*
 * Some architectures have MMUs that are configurable or selectable at boot
 * time. These lead to variable PTRS_PER_x. For statically allocated arrays it
 * helps to have a static maximum value.
 */
#ifndef MAX_PTRS_PER_PTE
#define MAX_PTRS_PER_PTE PTRS_PER_PTE
#endif

#ifndef MAX_PTRS_PER_PMD
#define MAX_PTRS_PER_PMD PTRS_PER_PMD
#endif

#ifndef MAX_PTRS_PER_PUD
#define MAX_PTRS_PER_PUD PTRS_PER_PUD
#endif

#ifndef MAX_PTRS_PER_P4D
#define MAX_PTRS_PER_P4D PTRS_PER_P4D
#endif

#ifndef pte_pgprot
#define pte_pgprot(x) ((pgprot_t) {0})
#endif

#ifndef pmd_pgprot
#define pmd_pgprot(x) ((pgprot_t) {0})
#endif

#ifndef pud_pgprot
#define pud_pgprot(x) ((pgprot_t) {0})
#endif

/* description of effects of mapping type and prot in current implementation.
 * this is due to the limited x86 page protection hardware.  The expected
 * behavior is in parens:
 *
 * map_type	prot
 *		PROT_NONE	PROT_READ	PROT_WRITE	PROT_EXEC
 * MAP_SHARED	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
 *		w: (no) no	w: (no) no	w: (yes) yes	w: (no) no
 *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
 *
 * MAP_PRIVATE	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
 *		w: (no) no	w: (no) no	w: (copy) copy	w: (no) no
 *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
 *
 * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and
 * MAP_PRIVATE (with Enhanced PAN supported):
 *								r: (no) no
 *								w: (no) no
 *								x: (yes) yes
 */
#define DECLARE_VM_GET_PAGE_PROT					\
pgprot_t vm_get_page_prot(vm_flags_t vm_flags)				\
{									\
	return protection_map[vm_flags &				\
		(VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)];		\
}									\
EXPORT_SYMBOL(vm_get_page_prot);

#endif /* _LINUX_PGTABLE_H */
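/*
 * Editor's note: the sketch below is not part of <linux/pgtable.h>.  It is a
 * minimal, illustrative example of how the pXd_leaf() predicates and the
 * p?d_leaf_size() fallbacks above are typically combined to report the
 * mapping size that covers an address, in the spirit of the page-size walker
 * used by the perf subsystem.  The function name is invented for this
 * example, and READ_ONCE()/locking details are omitted; the caller is
 * assumed to hold the relevant page table locks.
 */
static u64 example_mapping_size(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset(mm, addr);
	if (pgd_none(*pgd))
		return 0;
	if (pgd_leaf(*pgd))
		return pgd_leaf_size(*pgd);

	p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d))
		return 0;
	if (p4d_leaf(*p4d))
		return p4d_leaf_size(*p4d);

	pud = pud_offset(p4d, addr);
	if (pud_none(*pud))
		return 0;
	if (pud_leaf(*pud))
		return pud_leaf_size(*pud);

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;
	if (pmd_leaf(*pmd))
		return pmd_leaf_size(*pmd);

	/* Not a leaf at any upper level: mapped at the base page size. */
	return PAGE_SIZE;
}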
#ifndef _LINUX_HASH_H
#define _LINUX_HASH_H
/* Fast hashing routine for ints, longs and pointers.
   (C) 2002 Nadia Yvette Chambers, IBM */

#include <asm/types.h>
#include <linux/compiler.h>

/*
 * The "GOLDEN_RATIO_PRIME" is used in fs/btrfs/btrfs_inode.h and
 * fs/inode.c.  It's not actually prime any more (the previous primes
 * were actively bad for hashing), but the name remains.
 */
#if BITS_PER_LONG == 32
#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_32
#define hash_long(val, bits) hash_32(val, bits)
#elif BITS_PER_LONG == 64
#define hash_long(val, bits) hash_64(val, bits)
#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_64
#else
#error Wordsize not 32 or 64
#endif

/*
 * This hash multiplies the input by a large odd number and takes the
 * high bits.  Since multiplication propagates changes to the most
 * significant end only, it is essential that the high bits of the
 * product be used for the hash value.
 *
 * Chuck Lever verified the effectiveness of this technique:
 * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
 *
 * Although a random odd number will do, it turns out that the golden
 * ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice
 * properties.  (See Knuth vol 3, section 6.4, exercise 9.)
 *
 * These are the negative, (1 - phi) = phi**2 = (3 - sqrt(5))/2,
 * which is very slightly easier to multiply by and makes no
 * difference to the hash distribution.
 */
#define GOLDEN_RATIO_32 0x61C88647
#define GOLDEN_RATIO_64 0x61C8864680B583EBull

#ifdef CONFIG_HAVE_ARCH_HASH
/* This header may use the GOLDEN_RATIO_xx constants */
#include <asm/hash.h>
#endif

/*
 * The _generic versions exist only so lib/test_hash.c can compare
 * the arch-optimized versions with the generic.
 *
 * Note that if you change these, any <asm/hash.h> that aren't updated
 * to match need to have their HAVE_ARCH_* define values updated so the
 * self-test will not false-positive.
 */
#ifndef HAVE_ARCH__HASH_32
#define __hash_32 __hash_32_generic
#endif
static inline u32 __hash_32_generic(u32 val)
{
	return val * GOLDEN_RATIO_32;
}

static inline u32 hash_32(u32 val, unsigned int bits)
{
	/* High bits are more random, so use them. */
	return __hash_32(val) >> (32 - bits);
}

#ifndef HAVE_ARCH_HASH_64
#define hash_64 hash_64_generic
#endif
static __always_inline u32 hash_64_generic(u64 val, unsigned int bits)
{
#if BITS_PER_LONG == 64
	/* 64x64-bit multiply is efficient on all 64-bit processors */
	return val * GOLDEN_RATIO_64 >> (64 - bits);
#else
	/* Hash 64 bits using only 32x32-bit multiply. */
	return hash_32((u32)val ^ __hash_32(val >> 32), bits);
#endif
}

static inline u32 hash_ptr(const void *ptr, unsigned int bits)
{
	return hash_long((unsigned long)ptr, bits);
}

/* This really should be called fold32_ptr; it does no hashing to speak of. */
static inline u32 hash32_ptr(const void *ptr)
{
	unsigned long val = (unsigned long)ptr;

#if BITS_PER_LONG == 64
	val ^= (val >> 32);
#endif
	return (u32)val;
}

#endif /* _LINUX_HASH_H */
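/*
 * Editor's note: this usage sketch is not part of <linux/hash.h>.  It shows
 * the common pattern of using hash_long()/hash_ptr() to index a power-of-two
 * bucket array: the multiplicative hash keeps the strongest (high) bits, so
 * asking for 'bits' bits directly yields a value in [0, 2^bits).  The macro
 * and function names below are invented for the example.
 */
#define EXAMPLE_HASH_BITS	7			/* 128 buckets */

static unsigned int example_bucket_for_key(unsigned long key)
{
	return hash_long(key, EXAMPLE_HASH_BITS);	/* 0 ... 127 */
}

static unsigned int example_bucket_for_ptr(const void *object)
{
	return hash_ptr(object, EXAMPLE_HASH_BITS);
}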
// SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/util.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include <linux/slab.h> #include <linux/rculist.h> #include "common.h" /* Lock for protecting policy. */ DEFINE_MUTEX(tomoyo_policy_lock); /* Has /sbin/init started? */ bool tomoyo_policy_loaded; /* * Mapping table from "enum tomoyo_mac_index" to * "enum tomoyo_mac_category_index". */ const u8 tomoyo_index2category[TOMOYO_MAX_MAC_INDEX] = { /* CONFIG::file group */ [TOMOYO_MAC_FILE_EXECUTE] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_OPEN] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_CREATE] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_UNLINK] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_GETATTR] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_MKDIR] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_RMDIR] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_MKFIFO] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_MKSOCK] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_TRUNCATE] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_SYMLINK] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_MKBLOCK] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_MKCHAR] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_LINK] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_RENAME] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_CHMOD] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_CHOWN] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_CHGRP] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_IOCTL] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_CHROOT] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_MOUNT] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_UMOUNT] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_PIVOT_ROOT] = TOMOYO_MAC_CATEGORY_FILE, /* CONFIG::network group */ [TOMOYO_MAC_NETWORK_INET_STREAM_BIND] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_INET_STREAM_LISTEN] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_INET_STREAM_CONNECT] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_INET_DGRAM_BIND] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_INET_DGRAM_SEND] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_INET_RAW_BIND] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_INET_RAW_SEND] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_UNIX_STREAM_BIND] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_UNIX_STREAM_LISTEN] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_UNIX_STREAM_CONNECT] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_UNIX_DGRAM_BIND] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_UNIX_DGRAM_SEND] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_BIND] =
TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_LISTEN] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_CONNECT] = TOMOYO_MAC_CATEGORY_NETWORK, /* CONFIG::misc group */ [TOMOYO_MAC_ENVIRON] = TOMOYO_MAC_CATEGORY_MISC, }; /** * tomoyo_convert_time - Convert time_t to YYYY/MM/DD hh/mm/ss. * * @time64: Seconds since 1970/01/01 00:00:00. * @stamp: Pointer to "struct tomoyo_time". * * Returns nothing. */ void tomoyo_convert_time(time64_t time64, struct tomoyo_time *stamp) { struct tm tm; time64_to_tm(time64, 0, &tm); stamp->sec = tm.tm_sec; stamp->min = tm.tm_min; stamp->hour = tm.tm_hour; stamp->day = tm.tm_mday; stamp->month = tm.tm_mon + 1; stamp->year = tm.tm_year + 1900; } /** * tomoyo_permstr - Find permission keywords. * * @string: String representation for permissions in foo/bar/buz format. * @keyword: Keyword to find from @string/ * * Returns true if @keyword was found in @string, false otherwise. * * This function assumes that strncmp(w1, w2, strlen(w1)) != 0 if w1 != w2. */ bool tomoyo_permstr(const char *string, const char *keyword) { const char *cp = strstr(string, keyword); if (cp) return cp == string || *(cp - 1) == '/'; return false; } /** * tomoyo_read_token - Read a word from a line. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns a word on success, "" otherwise. * * To allow the caller to skip NULL check, this function returns "" rather than * NULL if there is no more words to read. */ char *tomoyo_read_token(struct tomoyo_acl_param *param) { char *pos = param->data; char *del = strchr(pos, ' '); if (del) *del++ = '\0'; else del = pos + strlen(pos); param->data = del; return pos; } static bool tomoyo_correct_path2(const char *filename, const size_t len); /** * tomoyo_get_domainname - Read a domainname from a line. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns a domainname on success, NULL otherwise. */ const struct tomoyo_path_info *tomoyo_get_domainname (struct tomoyo_acl_param *param) { char *start = param->data; char *pos = start; while (*pos) { if (*pos++ != ' ' || tomoyo_correct_path2(pos, strchrnul(pos, ' ') - pos)) continue; *(pos - 1) = '\0'; break; } param->data = pos; if (tomoyo_correct_domain(start)) return tomoyo_get_name(start); return NULL; } /** * tomoyo_parse_ulong - Parse an "unsigned long" value. * * @result: Pointer to "unsigned long". * @str: Pointer to string to parse. * * Returns one of values in "enum tomoyo_value_type". * * The @src is updated to point the first character after the value * on success. */ u8 tomoyo_parse_ulong(unsigned long *result, char **str) { const char *cp = *str; char *ep; int base = 10; if (*cp == '0') { char c = *(cp + 1); if (c == 'x' || c == 'X') { base = 16; cp += 2; } else if (c >= '0' && c <= '7') { base = 8; cp++; } } *result = simple_strtoul(cp, &ep, base); if (cp == ep) return TOMOYO_VALUE_TYPE_INVALID; *str = ep; switch (base) { case 16: return TOMOYO_VALUE_TYPE_HEXADECIMAL; case 8: return TOMOYO_VALUE_TYPE_OCTAL; default: return TOMOYO_VALUE_TYPE_DECIMAL; } } /** * tomoyo_print_ulong - Print an "unsigned long" value. * * @buffer: Pointer to buffer. * @buffer_len: Size of @buffer. * @value: An "unsigned long" value. * @type: Type of @value. * * Returns nothing. 
*/ void tomoyo_print_ulong(char *buffer, const int buffer_len, const unsigned long value, const u8 type) { if (type == TOMOYO_VALUE_TYPE_DECIMAL) snprintf(buffer, buffer_len, "%lu", value); else if (type == TOMOYO_VALUE_TYPE_OCTAL) snprintf(buffer, buffer_len, "0%lo", value); else if (type == TOMOYO_VALUE_TYPE_HEXADECIMAL) snprintf(buffer, buffer_len, "0x%lX", value); else snprintf(buffer, buffer_len, "type(%u)", type); } /** * tomoyo_parse_name_union - Parse a tomoyo_name_union. * * @param: Pointer to "struct tomoyo_acl_param". * @ptr: Pointer to "struct tomoyo_name_union". * * Returns true on success, false otherwise. */ bool tomoyo_parse_name_union(struct tomoyo_acl_param *param, struct tomoyo_name_union *ptr) { char *filename; if (param->data[0] == '@') { param->data++; ptr->group = tomoyo_get_group(param, TOMOYO_PATH_GROUP); return ptr->group != NULL; } filename = tomoyo_read_token(param); if (!tomoyo_correct_word(filename)) return false; ptr->filename = tomoyo_get_name(filename); return ptr->filename != NULL; } /** * tomoyo_parse_number_union - Parse a tomoyo_number_union. * * @param: Pointer to "struct tomoyo_acl_param". * @ptr: Pointer to "struct tomoyo_number_union". * * Returns true on success, false otherwise. */ bool tomoyo_parse_number_union(struct tomoyo_acl_param *param, struct tomoyo_number_union *ptr) { char *data; u8 type; unsigned long v; memset(ptr, 0, sizeof(*ptr)); if (param->data[0] == '@') { param->data++; ptr->group = tomoyo_get_group(param, TOMOYO_NUMBER_GROUP); return ptr->group != NULL; } data = tomoyo_read_token(param); type = tomoyo_parse_ulong(&v, &data); if (type == TOMOYO_VALUE_TYPE_INVALID) return false; ptr->values[0] = v; ptr->value_type[0] = type; if (!*data) { ptr->values[1] = v; ptr->value_type[1] = type; return true; } if (*data++ != '-') return false; type = tomoyo_parse_ulong(&v, &data); if (type == TOMOYO_VALUE_TYPE_INVALID || *data || ptr->values[0] > v) return false; ptr->values[1] = v; ptr->value_type[1] = type; return true; } /** * tomoyo_byte_range - Check whether the string is a \ooo style octal value. * * @str: Pointer to the string. * * Returns true if @str is a \ooo style octal value, false otherwise. * * TOMOYO uses \ooo style representation for 0x01 - 0x20 and 0x7F - 0xFF. * This function verifies that \ooo is in valid range. */ static inline bool tomoyo_byte_range(const char *str) { return *str >= '0' && *str++ <= '3' && *str >= '0' && *str++ <= '7' && *str >= '0' && *str <= '7'; } /** * tomoyo_alphabet_char - Check whether the character is an alphabet. * * @c: The character to check. * * Returns true if @c is an alphabet character, false otherwise. */ static inline bool tomoyo_alphabet_char(const char c) { return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); } /** * tomoyo_make_byte - Make byte value from three octal characters. * * @c1: The first character. * @c2: The second character. * @c3: The third character. * * Returns byte value. */ static inline u8 tomoyo_make_byte(const u8 c1, const u8 c2, const u8 c3) { return ((c1 - '0') << 6) + ((c2 - '0') << 3) + (c3 - '0'); } /** * tomoyo_valid - Check whether the character is a valid char. * * @c: The character to check. * * Returns true if @c is a valid character, false otherwise. */ static inline bool tomoyo_valid(const unsigned char c) { return c > ' ' && c < 127; } /** * tomoyo_invalid - Check whether the character is an invalid char. * * @c: The character to check. * * Returns true if @c is an invalid character, false otherwise. 
*/ static inline bool tomoyo_invalid(const unsigned char c) { return c && (c <= ' ' || c >= 127); } /** * tomoyo_str_starts - Check whether the given string starts with the given keyword. * * @src: Pointer to pointer to the string. * @find: Pointer to the keyword. * * Returns true if @src starts with @find, false otherwise. * * The @src is updated to point the first character after the @find * if @src starts with @find. */ bool tomoyo_str_starts(char **src, const char *find) { const int len = strlen(find); char *tmp = *src; if (strncmp(tmp, find, len)) return false; tmp += len; *src = tmp; return true; } /** * tomoyo_normalize_line - Format string. * * @buffer: The line to normalize. * * Leading and trailing whitespaces are removed. * Multiple whitespaces are packed into single space. * * Returns nothing. */ void tomoyo_normalize_line(unsigned char *buffer) { unsigned char *sp = buffer; unsigned char *dp = buffer; bool first = true; while (tomoyo_invalid(*sp)) sp++; while (*sp) { if (!first) *dp++ = ' '; first = false; while (tomoyo_valid(*sp)) *dp++ = *sp++; while (tomoyo_invalid(*sp)) sp++; } *dp = '\0'; } /** * tomoyo_correct_word2 - Validate a string. * * @string: The string to check. Maybe non-'\0'-terminated. * @len: Length of @string. * * Check whether the given string follows the naming rules. * Returns true if @string follows the naming rules, false otherwise. */ static bool tomoyo_correct_word2(const char *string, size_t len) { u8 recursion = 20; const char *const start = string; bool in_repetition = false; if (!len) goto out; while (len--) { unsigned char c = *string++; if (c == '\\') { if (!len--) goto out; c = *string++; if (c >= '0' && c <= '3') { unsigned char d; unsigned char e; if (!len-- || !len--) goto out; d = *string++; e = *string++; if (d < '0' || d > '7' || e < '0' || e > '7') goto out; c = tomoyo_make_byte(c, d, e); if (c <= ' ' || c >= 127) continue; goto out; } switch (c) { case '\\': /* "\\" */ case '+': /* "\+" */ case '?': /* "\?" */ case 'x': /* "\x" */ case 'a': /* "\a" */ case '-': /* "\-" */ continue; } if (!recursion--) goto out; switch (c) { case '*': /* "\*" */ case '@': /* "\@" */ case '$': /* "\$" */ case 'X': /* "\X" */ case 'A': /* "\A" */ continue; case '{': /* "/\{" */ if (string - 3 < start || *(string - 3) != '/') goto out; in_repetition = true; continue; case '}': /* "\}/" */ if (*string != '/') goto out; if (!in_repetition) goto out; in_repetition = false; continue; } goto out; } else if (in_repetition && c == '/') { goto out; } else if (c <= ' ' || c >= 127) { goto out; } } if (in_repetition) goto out; return true; out: return false; } /** * tomoyo_correct_word - Validate a string. * * @string: The string to check. * * Check whether the given string follows the naming rules. * Returns true if @string follows the naming rules, false otherwise. */ bool tomoyo_correct_word(const char *string) { return tomoyo_correct_word2(string, strlen(string)); } /** * tomoyo_correct_path2 - Check whether the given pathname follows the naming rules. * * @filename: The pathname to check. * @len: Length of @filename. * * Returns true if @filename follows the naming rules, false otherwise. */ static bool tomoyo_correct_path2(const char *filename, const size_t len) { const char *cp1 = memchr(filename, '/', len); const char *cp2 = memchr(filename, '.', len); return cp1 && (!cp2 || (cp1 < cp2)) && tomoyo_correct_word2(filename, len); } /** * tomoyo_correct_path - Validate a pathname. * * @filename: The pathname to check. 
* * Check whether the given pathname follows the naming rules. * Returns true if @filename follows the naming rules, false otherwise. */ bool tomoyo_correct_path(const char *filename) { return tomoyo_correct_path2(filename, strlen(filename)); } /** * tomoyo_correct_domain - Check whether the given domainname follows the naming rules. * * @domainname: The domainname to check. * * Returns true if @domainname follows the naming rules, false otherwise. */ bool tomoyo_correct_domain(const unsigned char *domainname) { if (!domainname || !tomoyo_domain_def(domainname)) return false; domainname = strchr(domainname, ' '); if (!domainname++) return true; while (1) { const unsigned char *cp = strchr(domainname, ' '); if (!cp) break; if (!tomoyo_correct_path2(domainname, cp - domainname)) return false; domainname = cp + 1; } return tomoyo_correct_path(domainname); } /** * tomoyo_domain_def - Check whether the given token can be a domainname. * * @buffer: The token to check. * * Returns true if @buffer possibly be a domainname, false otherwise. */ bool tomoyo_domain_def(const unsigned char *buffer) { const unsigned char *cp; int len; if (*buffer != '<') return false; cp = strchr(buffer, ' '); if (!cp) len = strlen(buffer); else len = cp - buffer; if (buffer[len - 1] != '>' || !tomoyo_correct_word2(buffer + 1, len - 2)) return false; return true; } /** * tomoyo_find_domain - Find a domain by the given name. * * @domainname: The domainname to find. * * Returns pointer to "struct tomoyo_domain_info" if found, NULL otherwise. * * Caller holds tomoyo_read_lock(). */ struct tomoyo_domain_info *tomoyo_find_domain(const char *domainname) { struct tomoyo_domain_info *domain; struct tomoyo_path_info name; name.name = domainname; tomoyo_fill_path_info(&name); list_for_each_entry_rcu(domain, &tomoyo_domain_list, list, srcu_read_lock_held(&tomoyo_ss)) { if (!domain->is_deleted && !tomoyo_pathcmp(&name, domain->domainname)) return domain; } return NULL; } /** * tomoyo_const_part_length - Evaluate the initial length without a pattern in a token. * * @filename: The string to evaluate. * * Returns the initial length without a pattern in @filename. */ static int tomoyo_const_part_length(const char *filename) { char c; int len = 0; if (!filename) return 0; while ((c = *filename++) != '\0') { if (c != '\\') { len++; continue; } c = *filename++; switch (c) { case '\\': /* "\\" */ len += 2; continue; case '0': /* "\ooo" */ case '1': case '2': case '3': c = *filename++; if (c < '0' || c > '7') break; c = *filename++; if (c < '0' || c > '7') break; len += 4; continue; } break; } return len; } /** * tomoyo_fill_path_info - Fill in "struct tomoyo_path_info" members. * * @ptr: Pointer to "struct tomoyo_path_info" to fill in. * * The caller sets "struct tomoyo_path_info"->name. */ void tomoyo_fill_path_info(struct tomoyo_path_info *ptr) { const char *name = ptr->name; const int len = strlen(name); ptr->const_len = tomoyo_const_part_length(name); ptr->is_dir = len && (name[len - 1] == '/'); ptr->is_patterned = (ptr->const_len < len); ptr->hash = full_name_hash(NULL, name, len); } /** * tomoyo_file_matches_pattern2 - Pattern matching without '/' character and "\-" pattern. * * @filename: The start of string to check. * @filename_end: The end of string to check. * @pattern: The start of pattern to compare. * @pattern_end: The end of pattern to compare. * * Returns true if @filename matches @pattern, false otherwise. 
*/ static bool tomoyo_file_matches_pattern2(const char *filename, const char *filename_end, const char *pattern, const char *pattern_end) { while (filename < filename_end && pattern < pattern_end) { char c; int i; int j; if (*pattern != '\\') { if (*filename++ != *pattern++) return false; continue; } c = *filename; pattern++; switch (*pattern) { case '?': if (c == '/') { return false; } else if (c == '\\') { if (filename[1] == '\\') filename++; else if (tomoyo_byte_range(filename + 1)) filename += 3; else return false; } break; case '\\': if (c != '\\') return false; if (*++filename != '\\') return false; break; case '+': if (!isdigit(c)) return false; break; case 'x': if (!isxdigit(c)) return false; break; case 'a': if (!tomoyo_alphabet_char(c)) return false; break; case '0': case '1': case '2': case '3': if (c == '\\' && tomoyo_byte_range(filename + 1) && strncmp(filename + 1, pattern, 3) == 0) { filename += 3; pattern += 2; break; } return false; /* Not matched. */ case '*': case '@': for (i = 0; i <= filename_end - filename; i++) { if (tomoyo_file_matches_pattern2( filename + i, filename_end, pattern + 1, pattern_end)) return true; c = filename[i]; if (c == '.' && *pattern == '@') break; if (c != '\\') continue; if (filename[i + 1] == '\\') i++; else if (tomoyo_byte_range(filename + i + 1)) i += 3; else break; /* Bad pattern. */ } return false; /* Not matched. */ default: j = 0; c = *pattern; if (c == '$') { while (isdigit(filename[j])) j++; } else if (c == 'X') { while (isxdigit(filename[j])) j++; } else if (c == 'A') { while (tomoyo_alphabet_char(filename[j])) j++; } for (i = 1; i <= j; i++) { if (tomoyo_file_matches_pattern2( filename + i, filename_end, pattern + 1, pattern_end)) return true; } return false; /* Not matched or bad pattern. */ } filename++; pattern++; } while (*pattern == '\\' && (*(pattern + 1) == '*' || *(pattern + 1) == '@')) pattern += 2; return filename == filename_end && pattern == pattern_end; } /** * tomoyo_file_matches_pattern - Pattern matching without '/' character. * * @filename: The start of string to check. * @filename_end: The end of string to check. * @pattern: The start of pattern to compare. * @pattern_end: The end of pattern to compare. * * Returns true if @filename matches @pattern, false otherwise. */ static bool tomoyo_file_matches_pattern(const char *filename, const char *filename_end, const char *pattern, const char *pattern_end) { const char *pattern_start = pattern; bool first = true; bool result; while (pattern < pattern_end - 1) { /* Split at "\-" pattern. */ if (*pattern++ != '\\' || *pattern++ != '-') continue; result = tomoyo_file_matches_pattern2(filename, filename_end, pattern_start, pattern - 2); if (first) result = !result; if (result) return false; first = false; pattern_start = pattern; } result = tomoyo_file_matches_pattern2(filename, filename_end, pattern_start, pattern_end); return first ? result : !result; } /** * tomoyo_path_matches_pattern2 - Do pathname pattern matching. * * @f: The start of string to check. * @p: The start of pattern to compare. * * Returns true if @f matches @p, false otherwise. 
*/ static bool tomoyo_path_matches_pattern2(const char *f, const char *p) { const char *f_delimiter; const char *p_delimiter; while (*f && *p) { f_delimiter = strchr(f, '/'); if (!f_delimiter) f_delimiter = f + strlen(f); p_delimiter = strchr(p, '/'); if (!p_delimiter) p_delimiter = p + strlen(p); if (*p == '\\' && *(p + 1) == '{') goto recursive; if (!tomoyo_file_matches_pattern(f, f_delimiter, p, p_delimiter)) return false; f = f_delimiter; if (*f) f++; p = p_delimiter; if (*p) p++; } /* Ignore trailing "\*" and "\@" in @pattern. */ while (*p == '\\' && (*(p + 1) == '*' || *(p + 1) == '@')) p += 2; return !*f && !*p; recursive: /* * The "\{" pattern is permitted only after '/' character. * This guarantees that below "*(p - 1)" is safe. * Also, the "\}" pattern is permitted only before '/' character * so that "\{" + "\}" pair will not break the "\-" operator. */ if (*(p - 1) != '/' || p_delimiter <= p + 3 || *p_delimiter != '/' || *(p_delimiter - 1) != '}' || *(p_delimiter - 2) != '\\') return false; /* Bad pattern. */ do { /* Compare current component with pattern. */ if (!tomoyo_file_matches_pattern(f, f_delimiter, p + 2, p_delimiter - 2)) break; /* Proceed to next component. */ f = f_delimiter; if (!*f) break; f++; /* Continue comparison. */ if (tomoyo_path_matches_pattern2(f, p_delimiter + 1)) return true; f_delimiter = strchr(f, '/'); } while (f_delimiter); return false; /* Not matched. */ } /** * tomoyo_path_matches_pattern - Check whether the given filename matches the given pattern. * * @filename: The filename to check. * @pattern: The pattern to compare. * * Returns true if matches, false otherwise. * * The following patterns are available. * \\ \ itself. * \ooo Octal representation of a byte. * \* Zero or more repetitions of characters other than '/'. * \@ Zero or more repetitions of characters other than '/' or '.'. * \? 1 byte character other than '/'. * \$ One or more repetitions of decimal digits. * \+ 1 decimal digit. * \X One or more repetitions of hexadecimal digits. * \x 1 hexadecimal digit. * \A One or more repetitions of alphabet characters. * \a 1 alphabet character. * * \- Subtraction operator. * * /\{dir\}/ '/' + 'One or more repetitions of dir/' (e.g. /dir/ /dir/dir/ * /dir/dir/dir/ ). */ bool tomoyo_path_matches_pattern(const struct tomoyo_path_info *filename, const struct tomoyo_path_info *pattern) { const char *f = filename->name; const char *p = pattern->name; const int len = pattern->const_len; /* If @pattern doesn't contain pattern, I can use strcmp(). */ if (!pattern->is_patterned) return !tomoyo_pathcmp(filename, pattern); /* Don't compare directory and non-directory. */ if (filename->is_dir != pattern->is_dir) return false; /* Compare the initial length without patterns. */ if (strncmp(f, p, len)) return false; f += len; p += len; return tomoyo_path_matches_pattern2(f, p); } /** * tomoyo_get_exe - Get tomoyo_realpath() of current process. * * Returns the tomoyo_realpath() of current process on success, NULL otherwise. * * This function uses kzalloc(), so the caller must call kfree() * if this function didn't return NULL. */ const char *tomoyo_get_exe(void) { struct file *exe_file; const char *cp; struct mm_struct *mm = current->mm; if (!mm) return NULL; exe_file = get_mm_exe_file(mm); if (!exe_file) return NULL; cp = tomoyo_realpath_from_path(&exe_file->f_path); fput(exe_file); return cp; } /** * tomoyo_get_mode - Get MAC mode. * * @ns: Pointer to "struct tomoyo_policy_namespace". * @profile: Profile number. * @index: Index number of functionality. 
* * Returns mode. */ int tomoyo_get_mode(const struct tomoyo_policy_namespace *ns, const u8 profile, const u8 index) { u8 mode; struct tomoyo_profile *p; if (!tomoyo_policy_loaded) return TOMOYO_CONFIG_DISABLED; p = tomoyo_profile(ns, profile); mode = p->config[index]; if (mode == TOMOYO_CONFIG_USE_DEFAULT) mode = p->config[tomoyo_index2category[index] + TOMOYO_MAX_MAC_INDEX]; if (mode == TOMOYO_CONFIG_USE_DEFAULT) mode = p->default_config; return mode & 3; } /** * tomoyo_init_request_info - Initialize "struct tomoyo_request_info" members. * * @r: Pointer to "struct tomoyo_request_info" to initialize. * @domain: Pointer to "struct tomoyo_domain_info". NULL for tomoyo_domain(). * @index: Index number of functionality. * * Returns mode. */ int tomoyo_init_request_info(struct tomoyo_request_info *r, struct tomoyo_domain_info *domain, const u8 index) { u8 profile; memset(r, 0, sizeof(*r)); if (!domain) domain = tomoyo_domain(); r->domain = domain; profile = domain->profile; r->profile = profile; r->type = index; r->mode = tomoyo_get_mode(domain->ns, profile, index); return r->mode; } /** * tomoyo_domain_quota_is_ok - Check for domain's quota. * * @r: Pointer to "struct tomoyo_request_info". * * Returns true if the domain is not exceeded quota, false otherwise. * * Caller holds tomoyo_read_lock(). */ bool tomoyo_domain_quota_is_ok(struct tomoyo_request_info *r) { unsigned int count = 0; struct tomoyo_domain_info *domain = r->domain; struct tomoyo_acl_info *ptr; if (r->mode != TOMOYO_CONFIG_LEARNING) return false; if (!domain) return true; if (READ_ONCE(domain->flags[TOMOYO_DIF_QUOTA_WARNED])) return false; list_for_each_entry_rcu(ptr, &domain->acl_info_list, list, srcu_read_lock_held(&tomoyo_ss)) { u16 perm; if (ptr->is_deleted) continue; /* * Reading perm bitmap might race with tomoyo_merge_*() because * caller does not hold tomoyo_policy_lock mutex. But exceeding * max_learning_entry parameter by a few entries does not harm. */ switch (ptr->type) { case TOMOYO_TYPE_PATH_ACL: perm = data_race(container_of(ptr, struct tomoyo_path_acl, head)->perm); break; case TOMOYO_TYPE_PATH2_ACL: perm = data_race(container_of(ptr, struct tomoyo_path2_acl, head)->perm); break; case TOMOYO_TYPE_PATH_NUMBER_ACL: perm = data_race(container_of(ptr, struct tomoyo_path_number_acl, head) ->perm); break; case TOMOYO_TYPE_MKDEV_ACL: perm = data_race(container_of(ptr, struct tomoyo_mkdev_acl, head)->perm); break; case TOMOYO_TYPE_INET_ACL: perm = data_race(container_of(ptr, struct tomoyo_inet_acl, head)->perm); break; case TOMOYO_TYPE_UNIX_ACL: perm = data_race(container_of(ptr, struct tomoyo_unix_acl, head)->perm); break; case TOMOYO_TYPE_MANUAL_TASK_ACL: perm = 0; break; default: perm = 1; } count += hweight16(perm); } if (count < tomoyo_profile(domain->ns, domain->profile)-> pref[TOMOYO_PREF_MAX_LEARNING_ENTRY]) return true; WRITE_ONCE(domain->flags[TOMOYO_DIF_QUOTA_WARNED], true); /* r->granted = false; */ tomoyo_write_log(r, "%s", tomoyo_dif[TOMOYO_DIF_QUOTA_WARNED]); #ifndef CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING pr_warn("WARNING: Domain '%s' has too many ACLs to hold. Stopped learning mode.\n", domain->domainname->name); #endif return false; } |
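/*
 * Illustrative sketch, not part of the original file: how a caller might
 * combine the helpers above.  The function name example_match() and its
 * string arguments are hypothetical; real callers obtain both strings from
 * policy and from tomoyo_realpath*(), already in TOMOYO's escaped form.
 */
static bool example_match(const char *filename_str, const char *pattern_str)
{
	struct tomoyo_path_info filename;
	struct tomoyo_path_info pattern;

	filename.name = filename_str;
	pattern.name = pattern_str;
	/* Pre-compute const_len, is_dir, is_patterned and hash. */
	tomoyo_fill_path_info(&filename);
	tomoyo_fill_path_info(&pattern);
	/* e.g. "/tmp/log1" matches the pattern "/tmp/log\$". */
	return tomoyo_path_matches_pattern(&filename, &pattern);
}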
// SPDX-License-Identifier: GPL-2.0 /* * drivers/usb/core/usb.c * * (C) Copyright Linus Torvalds 1999 * (C) Copyright Johannes Erdfelt 1999-2001 * (C) Copyright Andreas Gal 1999 * (C) Copyright Gregory P. Smith 1999 * (C) Copyright Deti Fliegl 1999 (new USB architecture) * (C) Copyright Randy Dunlap 2000 * (C) Copyright David Brownell 2000-2004 * (C) Copyright Yggdrasil Computing, Inc. 2000 * (usb_device_id matching changes by Adam J. Richter) * (C) Copyright Greg Kroah-Hartman 2002-2003 * * Released under the GPLv2 only. * * NOTE! This is not actually a driver at all, rather this is * just a collection of helper routines that implement the * generic USB things that the real drivers can use.. * * Think of this as a "USB library" rather than anything else, * with no callbacks. Callbacks are evil.
*/ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/of.h> #include <linux/string.h> #include <linux/bitops.h> #include <linux/slab.h> #include <linux/kmod.h> #include <linux/init.h> #include <linux/spinlock.h> #include <linux/errno.h> #include <linux/usb.h> #include <linux/usb/hcd.h> #include <linux/mutex.h> #include <linux/workqueue.h> #include <linux/debugfs.h> #include <linux/usb/of.h> #include <asm/io.h> #include <linux/scatterlist.h> #include <linux/mm.h> #include <linux/dma-mapping.h> #include "hub.h" #include "trace.h" const char *usbcore_name = "usbcore"; static bool nousb; /* Disable USB when built into kernel image */ module_param(nousb, bool, 0444); /* * for external read access to <nousb> */ int usb_disabled(void) { return nousb; } EXPORT_SYMBOL_GPL(usb_disabled); #ifdef CONFIG_PM /* Default delay value, in seconds */ static int usb_autosuspend_delay = CONFIG_USB_AUTOSUSPEND_DELAY; module_param_named(autosuspend, usb_autosuspend_delay, int, 0644); MODULE_PARM_DESC(autosuspend, "default autosuspend delay"); #else #define usb_autosuspend_delay 0 #endif static bool match_endpoint(struct usb_endpoint_descriptor *epd, struct usb_endpoint_descriptor **bulk_in, struct usb_endpoint_descriptor **bulk_out, struct usb_endpoint_descriptor **int_in, struct usb_endpoint_descriptor **int_out) { switch (usb_endpoint_type(epd)) { case USB_ENDPOINT_XFER_BULK: if (usb_endpoint_dir_in(epd)) { if (bulk_in && !*bulk_in) { *bulk_in = epd; break; } } else { if (bulk_out && !*bulk_out) { *bulk_out = epd; break; } } return false; case USB_ENDPOINT_XFER_INT: if (usb_endpoint_dir_in(epd)) { if (int_in && !*int_in) { *int_in = epd; break; } } else { if (int_out && !*int_out) { *int_out = epd; break; } } return false; default: return false; } return (!bulk_in || *bulk_in) && (!bulk_out || *bulk_out) && (!int_in || *int_in) && (!int_out || *int_out); } /** * usb_find_common_endpoints() -- look up common endpoint descriptors * @alt: alternate setting to search * @bulk_in: pointer to descriptor pointer, or NULL * @bulk_out: pointer to descriptor pointer, or NULL * @int_in: pointer to descriptor pointer, or NULL * @int_out: pointer to descriptor pointer, or NULL * * Search the alternate setting's endpoint descriptors for the first bulk-in, * bulk-out, interrupt-in and interrupt-out endpoints and return them in the * provided pointers (unless they are NULL). * * If a requested endpoint is not found, the corresponding pointer is set to * NULL. * * Return: Zero if all requested descriptors were found, or -ENXIO otherwise. 
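 *
 * Example (illustrative, not from the original kernel-doc) of typical use in
 * a driver's probe() routine, assuming "intf" is the probed interface:
 *
 *	struct usb_endpoint_descriptor *bulk_in, *bulk_out;
 *
 *	if (usb_find_common_endpoints(intf->cur_altsetting,
 *				      &bulk_in, &bulk_out, NULL, NULL))
 *		return -ENXIO;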
*/ int usb_find_common_endpoints(struct usb_host_interface *alt, struct usb_endpoint_descriptor **bulk_in, struct usb_endpoint_descriptor **bulk_out, struct usb_endpoint_descriptor **int_in, struct usb_endpoint_descriptor **int_out) { struct usb_endpoint_descriptor *epd; int i; if (bulk_in) *bulk_in = NULL; if (bulk_out) *bulk_out = NULL; if (int_in) *int_in = NULL; if (int_out) *int_out = NULL; for (i = 0; i < alt->desc.bNumEndpoints; ++i) { epd = &alt->endpoint[i].desc; if (match_endpoint(epd, bulk_in, bulk_out, int_in, int_out)) return 0; } return -ENXIO; } EXPORT_SYMBOL_GPL(usb_find_common_endpoints); /** * usb_find_common_endpoints_reverse() -- look up common endpoint descriptors * @alt: alternate setting to search * @bulk_in: pointer to descriptor pointer, or NULL * @bulk_out: pointer to descriptor pointer, or NULL * @int_in: pointer to descriptor pointer, or NULL * @int_out: pointer to descriptor pointer, or NULL * * Search the alternate setting's endpoint descriptors for the last bulk-in, * bulk-out, interrupt-in and interrupt-out endpoints and return them in the * provided pointers (unless they are NULL). * * If a requested endpoint is not found, the corresponding pointer is set to * NULL. * * Return: Zero if all requested descriptors were found, or -ENXIO otherwise. */ int usb_find_common_endpoints_reverse(struct usb_host_interface *alt, struct usb_endpoint_descriptor **bulk_in, struct usb_endpoint_descriptor **bulk_out, struct usb_endpoint_descriptor **int_in, struct usb_endpoint_descriptor **int_out) { struct usb_endpoint_descriptor *epd; int i; if (bulk_in) *bulk_in = NULL; if (bulk_out) *bulk_out = NULL; if (int_in) *int_in = NULL; if (int_out) *int_out = NULL; for (i = alt->desc.bNumEndpoints - 1; i >= 0; --i) { epd = &alt->endpoint[i].desc; if (match_endpoint(epd, bulk_in, bulk_out, int_in, int_out)) return 0; } return -ENXIO; } EXPORT_SYMBOL_GPL(usb_find_common_endpoints_reverse); /** * usb_find_endpoint() - Given an endpoint address, search for the endpoint's * usb_host_endpoint structure in an interface's current altsetting. * @intf: the interface whose current altsetting should be searched * @ep_addr: the endpoint address (number and direction) to find * * Search the altsetting's list of endpoints for one with the specified address. * * Return: Pointer to the usb_host_endpoint if found, %NULL otherwise. */ static const struct usb_host_endpoint *usb_find_endpoint( const struct usb_interface *intf, unsigned int ep_addr) { int n; const struct usb_host_endpoint *ep; n = intf->cur_altsetting->desc.bNumEndpoints; ep = intf->cur_altsetting->endpoint; for (; n > 0; (--n, ++ep)) { if (ep->desc.bEndpointAddress == ep_addr) return ep; } return NULL; } /** * usb_check_bulk_endpoints - Check whether an interface's current altsetting * contains a set of bulk endpoints with the given addresses. * @intf: the interface whose current altsetting should be searched * @ep_addrs: 0-terminated array of the endpoint addresses (number and * direction) to look for * * Search for endpoints with the specified addresses and check their types. * * Return: %true if all the endpoints are found and are bulk, %false otherwise. 
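 *
 * Example (illustrative, not from the original kernel-doc): verifying that a
 * claimed interface really provides bulk endpoints 0x81 (IN) and 0x02 (OUT):
 *
 *	static const u8 ep_addrs[] = { 0x81, 0x02, 0 };
 *
 *	if (!usb_check_bulk_endpoints(intf, ep_addrs))
 *		return -ENODEV;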
*/ bool usb_check_bulk_endpoints( const struct usb_interface *intf, const u8 *ep_addrs) { const struct usb_host_endpoint *ep; for (; *ep_addrs; ++ep_addrs) { ep = usb_find_endpoint(intf, *ep_addrs); if (!ep || !usb_endpoint_xfer_bulk(&ep->desc)) return false; } return true; } EXPORT_SYMBOL_GPL(usb_check_bulk_endpoints); /** * usb_check_int_endpoints - Check whether an interface's current altsetting * contains a set of interrupt endpoints with the given addresses. * @intf: the interface whose current altsetting should be searched * @ep_addrs: 0-terminated array of the endpoint addresses (number and * direction) to look for * * Search for endpoints with the specified addresses and check their types. * * Return: %true if all the endpoints are found and are interrupt, * %false otherwise. */ bool usb_check_int_endpoints( const struct usb_interface *intf, const u8 *ep_addrs) { const struct usb_host_endpoint *ep; for (; *ep_addrs; ++ep_addrs) { ep = usb_find_endpoint(intf, *ep_addrs); if (!ep || !usb_endpoint_xfer_int(&ep->desc)) return false; } return true; } EXPORT_SYMBOL_GPL(usb_check_int_endpoints); /** * usb_find_alt_setting() - Given a configuration, find the alternate setting * for the given interface. * @config: the configuration to search (not necessarily the current config). * @iface_num: interface number to search in * @alt_num: alternate interface setting number to search for. * * Search the configuration's interface cache for the given alt setting. * * Return: The alternate setting, if found. %NULL otherwise. */ struct usb_host_interface *usb_find_alt_setting( struct usb_host_config *config, unsigned int iface_num, unsigned int alt_num) { struct usb_interface_cache *intf_cache = NULL; int i; if (!config) return NULL; for (i = 0; i < config->desc.bNumInterfaces; i++) { if (config->intf_cache[i]->altsetting[0].desc.bInterfaceNumber == iface_num) { intf_cache = config->intf_cache[i]; break; } } if (!intf_cache) return NULL; for (i = 0; i < intf_cache->num_altsetting; i++) if (intf_cache->altsetting[i].desc.bAlternateSetting == alt_num) return &intf_cache->altsetting[i]; printk(KERN_DEBUG "Did not find alt setting %u for intf %u, " "config %u\n", alt_num, iface_num, config->desc.bConfigurationValue); return NULL; } EXPORT_SYMBOL_GPL(usb_find_alt_setting); /** * usb_ifnum_to_if - get the interface object with a given interface number * @dev: the device whose current configuration is considered * @ifnum: the desired interface * * This walks the device descriptor for the currently active configuration * to find the interface object with the particular interface number. * * Note that configuration descriptors are not required to assign interface * numbers sequentially, so that it would be incorrect to assume that * the first interface in that descriptor corresponds to interface zero. * This routine helps device drivers avoid such mistakes. * However, you should make sure that you do the right thing with any * alternate settings available for this interfaces. * * Don't call this function unless you are bound to one of the interfaces * on this device or you have locked the device! * * Return: A pointer to the interface that has @ifnum as interface number, * if found. %NULL otherwise. 
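 *
 * Example (illustrative, not from the original kernel-doc): a driver bound to
 * one interface looking up another interface of the same device ("intf" is
 * the bound interface; interface number 1 is just an example):
 *
 *	struct usb_device *udev = interface_to_usbdev(intf);
 *	struct usb_interface *data_intf = usb_ifnum_to_if(udev, 1);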
*/ struct usb_interface *usb_ifnum_to_if(const struct usb_device *dev, unsigned ifnum) { struct usb_host_config *config = dev->actconfig; int i; if (!config) return NULL; for (i = 0; i < config->desc.bNumInterfaces; i++) if (config->interface[i]->altsetting[0] .desc.bInterfaceNumber == ifnum) return config->interface[i]; return NULL; } EXPORT_SYMBOL_GPL(usb_ifnum_to_if); /** * usb_altnum_to_altsetting - get the altsetting structure with a given alternate setting number. * @intf: the interface containing the altsetting in question * @altnum: the desired alternate setting number * * This searches the altsetting array of the specified interface for * an entry with the correct bAlternateSetting value. * * Note that altsettings need not be stored sequentially by number, so * it would be incorrect to assume that the first altsetting entry in * the array corresponds to altsetting zero. This routine helps device * drivers avoid such mistakes. * * Don't call this function unless you are bound to the intf interface * or you have locked the device! * * Return: A pointer to the entry of the altsetting array of @intf that * has @altnum as the alternate setting number. %NULL if not found. */ struct usb_host_interface *usb_altnum_to_altsetting( const struct usb_interface *intf, unsigned int altnum) { int i; for (i = 0; i < intf->num_altsetting; i++) { if (intf->altsetting[i].desc.bAlternateSetting == altnum) return &intf->altsetting[i]; } return NULL; } EXPORT_SYMBOL_GPL(usb_altnum_to_altsetting); struct find_interface_arg { int minor; struct device_driver *drv; }; static int __find_interface(struct device *dev, const void *data) { const struct find_interface_arg *arg = data; struct usb_interface *intf; if (!is_usb_interface(dev)) return 0; if (dev->driver != arg->drv) return 0; intf = to_usb_interface(dev); return intf->minor == arg->minor; } /** * usb_find_interface - find usb_interface pointer for driver and device * @drv: the driver whose current configuration is considered * @minor: the minor number of the desired device * * This walks the bus device list and returns a pointer to the interface * with the matching minor and driver. Note, this only works for devices * that share the USB major number. * * Return: A pointer to the interface with the matching major and @minor. */ struct usb_interface *usb_find_interface(struct usb_driver *drv, int minor) { struct find_interface_arg argb; struct device *dev; argb.minor = minor; argb.drv = &drv->driver; dev = bus_find_device(&usb_bus_type, NULL, &argb, __find_interface); /* Drop reference count from bus_find_device */ put_device(dev); return dev ? to_usb_interface(dev) : NULL; } EXPORT_SYMBOL_GPL(usb_find_interface); struct each_dev_arg { void *data; int (*fn)(struct usb_device *, void *); }; static int __each_dev(struct device *dev, void *data) { struct each_dev_arg *arg = (struct each_dev_arg *)data; /* There are struct usb_interface on the same bus, filter them out */ if (!is_usb_device(dev)) return 0; return arg->fn(to_usb_device(dev), arg->data); } /** * usb_for_each_dev - iterate over all USB devices in the system * @data: data pointer that will be handed to the callback function * @fn: callback function to be called for each USB device * * Iterate over all USB devices and call @fn for each, passing it @data. If it * returns anything other than 0, we break the iteration prematurely and return * that value. 
*/ int usb_for_each_dev(void *data, int (*fn)(struct usb_device *, void *)) { struct each_dev_arg arg = {data, fn}; return bus_for_each_dev(&usb_bus_type, NULL, &arg, __each_dev); } EXPORT_SYMBOL_GPL(usb_for_each_dev); /** * usb_release_dev - free a usb device structure when all users of it are finished. * @dev: device that's been disconnected * * Will be called only by the device core when all users of this usb device are * done. */ static void usb_release_dev(struct device *dev) { struct usb_device *udev; struct usb_hcd *hcd; udev = to_usb_device(dev); hcd = bus_to_hcd(udev->bus); usb_destroy_configuration(udev); usb_release_bos_descriptor(udev); of_node_put(dev->of_node); usb_put_hcd(hcd); kfree(udev->product); kfree(udev->manufacturer); kfree(udev->serial); kfree(udev); } static int usb_dev_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct usb_device *usb_dev; usb_dev = to_usb_device(dev); if (add_uevent_var(env, "BUSNUM=%03d", usb_dev->bus->busnum)) return -ENOMEM; if (add_uevent_var(env, "DEVNUM=%03d", usb_dev->devnum)) return -ENOMEM; return 0; } #ifdef CONFIG_PM /* USB device Power-Management thunks. * There's no need to distinguish here between quiescing a USB device * and powering it down; the generic_suspend() routine takes care of * it by skipping the usb_port_suspend() call for a quiesce. And for * USB interfaces there's no difference at all. */ static int usb_dev_prepare(struct device *dev) { return 0; /* Implement eventually? */ } static void usb_dev_complete(struct device *dev) { /* Currently used only for rebinding interfaces */ usb_resume_complete(dev); } static int usb_dev_suspend(struct device *dev) { return usb_suspend(dev, PMSG_SUSPEND); } static int usb_dev_resume(struct device *dev) { return usb_resume(dev, PMSG_RESUME); } static int usb_dev_freeze(struct device *dev) { return usb_suspend(dev, PMSG_FREEZE); } static int usb_dev_thaw(struct device *dev) { return usb_resume(dev, PMSG_THAW); } static int usb_dev_poweroff(struct device *dev) { return usb_suspend(dev, PMSG_HIBERNATE); } static int usb_dev_restore(struct device *dev) { return usb_resume(dev, PMSG_RESTORE); } static const struct dev_pm_ops usb_device_pm_ops = { .prepare = usb_dev_prepare, .complete = usb_dev_complete, .suspend = usb_dev_suspend, .resume = usb_dev_resume, .freeze = usb_dev_freeze, .thaw = usb_dev_thaw, .poweroff = usb_dev_poweroff, .restore = usb_dev_restore, .runtime_suspend = usb_runtime_suspend, .runtime_resume = usb_runtime_resume, .runtime_idle = usb_runtime_idle, }; #endif /* CONFIG_PM */ static char *usb_devnode(const struct device *dev, umode_t *mode, kuid_t *uid, kgid_t *gid) { const struct usb_device *usb_dev; usb_dev = to_usb_device(dev); return kasprintf(GFP_KERNEL, "bus/usb/%03d/%03d", usb_dev->bus->busnum, usb_dev->devnum); } const struct device_type usb_device_type = { .name = "usb_device", .release = usb_release_dev, .uevent = usb_dev_uevent, .devnode = usb_devnode, #ifdef CONFIG_PM .pm = &usb_device_pm_ops, #endif }; static bool usb_dev_authorized(struct usb_device *dev, struct usb_hcd *hcd) { struct usb_hub *hub; if (!dev->parent) return true; /* Root hub always ok [and always wired] */ switch (hcd->dev_policy) { case USB_DEVICE_AUTHORIZE_NONE: default: return false; case USB_DEVICE_AUTHORIZE_ALL: return true; case USB_DEVICE_AUTHORIZE_INTERNAL: hub = usb_hub_to_struct_hub(dev->parent); return hub->ports[dev->portnum - 1]->connect_type == USB_PORT_CONNECT_TYPE_HARD_WIRED; } } /** * usb_alloc_dev - usb device constructor (usbcore-internal) * 
@parent: hub to which device is connected; null to allocate a root hub * @bus: bus used to access the device * @port1: one-based index of port; ignored for root hubs * * Context: task context, might sleep. * * Only hub drivers (including virtual root hub drivers for host * controllers) should ever call this. * * This call may not be used in a non-sleeping context. * * Return: On success, a pointer to the allocated usb device. %NULL on * failure. */ struct usb_device *usb_alloc_dev(struct usb_device *parent, struct usb_bus *bus, unsigned port1) { struct usb_device *dev; struct usb_hcd *usb_hcd = bus_to_hcd(bus); unsigned raw_port = port1; dev = kzalloc_obj(*dev); if (!dev) return NULL; if (!usb_get_hcd(usb_hcd)) { kfree(dev); return NULL; } /* Root hubs aren't true devices, so don't allocate HCD resources */ if (usb_hcd->driver->alloc_dev && parent && !usb_hcd->driver->alloc_dev(usb_hcd, dev)) { usb_put_hcd(bus_to_hcd(bus)); kfree(dev); return NULL; } device_initialize(&dev->dev); dev->dev.bus = &usb_bus_type; dev->dev.type = &usb_device_type; dev->dev.groups = usb_device_groups; set_dev_node(&dev->dev, dev_to_node(bus->sysdev)); dev->state = USB_STATE_ATTACHED; dev->lpm_disable_count = 1; spin_lock_init(&dev->offload_lock); dev->offload_usage = 0; atomic_set(&dev->urbnum, 0); INIT_LIST_HEAD(&dev->ep0.urb_list); dev->ep0.desc.bLength = USB_DT_ENDPOINT_SIZE; dev->ep0.desc.bDescriptorType = USB_DT_ENDPOINT; /* ep0 maxpacket comes later, from device descriptor */ usb_enable_endpoint(dev, &dev->ep0, false); dev->can_submit = 1; /* Save readable and stable topology id, distinguishing devices * by location for diagnostics, tools, driver model, etc. The * string is a path along hub ports, from the root. Each device's * dev->devpath will be stable until USB is re-cabled, and hubs * are often labeled with these port numbers. The name isn't * as stable: bus->busnum changes easily from modprobe order, * cardbus or pci hotplugging, and so on. 
*/ if (unlikely(!parent)) { dev->devpath[0] = '0'; dev->route = 0; dev->dev.parent = bus->controller; device_set_of_node_from_dev(&dev->dev, bus->sysdev); dev_set_name(&dev->dev, "usb%d", bus->busnum); } else { int n; /* match any labeling on the hubs; it's one-based */ if (parent->devpath[0] == '0') { n = snprintf(dev->devpath, sizeof(dev->devpath), "%d", port1); /* Root ports are not counted in route string */ dev->route = 0; } else { n = snprintf(dev->devpath, sizeof(dev->devpath), "%s.%d", parent->devpath, port1); /* Route string assumes hubs have less than 16 ports */ if (port1 < 15) dev->route = parent->route + (port1 << ((parent->level - 1)*4)); else dev->route = parent->route + (15 << ((parent->level - 1)*4)); } if (n >= sizeof(dev->devpath)) { usb_put_hcd(bus_to_hcd(bus)); usb_put_dev(dev); return NULL; } dev->dev.parent = &parent->dev; dev_set_name(&dev->dev, "%d-%s", bus->busnum, dev->devpath); if (!parent->parent) { /* device under root hub's port */ raw_port = usb_hcd_find_raw_port_number(usb_hcd, port1); } dev->dev.of_node = usb_of_get_device_node(parent, raw_port); /* hub driver sets up TT records */ } dev->portnum = port1; dev->bus = bus; dev->parent = parent; INIT_LIST_HEAD(&dev->filelist); #ifdef CONFIG_PM pm_runtime_set_autosuspend_delay(&dev->dev, usb_autosuspend_delay * 1000); dev->connect_time = jiffies; dev->active_duration = -jiffies; #endif dev->authorized = usb_dev_authorized(dev, usb_hcd); trace_usb_alloc_dev(dev); return dev; } EXPORT_SYMBOL_GPL(usb_alloc_dev); /** * usb_get_dev - increments the reference count of the usb device structure * @dev: the device being referenced * * Each live reference to a device should be refcounted. * * Drivers for USB interfaces should normally record such references in * their probe() methods, when they bind to an interface, and release * them by calling usb_put_dev(), in their disconnect() methods. * However, if a driver does not access the usb_device structure after * its disconnect() method returns then refcounting is not necessary, * because the USB core guarantees that a usb_device will not be * deallocated until after all of its interface drivers have been unbound. * * Return: A pointer to the device with the incremented reference counter. */ struct usb_device *usb_get_dev(struct usb_device *dev) { if (dev) get_device(&dev->dev); return dev; } EXPORT_SYMBOL_GPL(usb_get_dev); /** * usb_put_dev - release a use of the usb device structure * @dev: device that's been disconnected * * Must be called when a user of a device is finished with it. When the last * user of the device calls this function, the memory of the device is freed. */ void usb_put_dev(struct usb_device *dev) { if (dev) put_device(&dev->dev); } EXPORT_SYMBOL_GPL(usb_put_dev); /** * usb_get_intf - increments the reference count of the usb interface structure * @intf: the interface being referenced * * Each live reference to a interface must be refcounted. * * Drivers for USB interfaces should normally record such references in * their probe() methods, when they bind to an interface, and release * them by calling usb_put_intf(), in their disconnect() methods. * However, if a driver does not access the usb_interface structure after * its disconnect() method returns then refcounting is not necessary, * because the USB core guarantees that a usb_interface will not be * deallocated until after its driver has been unbound. * * Return: A pointer to the interface with the incremented reference counter. 
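 *
 * Example (illustrative, not from the original kernel-doc; "priv" is a
 * hypothetical driver-private structure): take the reference in probe() with
 *
 *	priv->intf = usb_get_intf(intf);
 *
 * and drop it again in disconnect() with usb_put_intf(priv->intf).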
*/ struct usb_interface *usb_get_intf(struct usb_interface *intf) { if (intf) get_device(&intf->dev); return intf; } EXPORT_SYMBOL_GPL(usb_get_intf); /** * usb_put_intf - release a use of the usb interface structure * @intf: interface that's been decremented * * Must be called when a user of an interface is finished with it. When the * last user of the interface calls this function, the memory of the interface * is freed. */ void usb_put_intf(struct usb_interface *intf) { if (intf) put_device(&intf->dev); } EXPORT_SYMBOL_GPL(usb_put_intf); /** * usb_intf_get_dma_device - acquire a reference on the usb interface's DMA endpoint * @intf: the usb interface * * While a USB device cannot perform DMA operations by itself, many USB * controllers can. A call to usb_intf_get_dma_device() returns the DMA endpoint * for the given USB interface, if any. The returned device structure must be * released with put_device(). * * See also usb_get_dma_device(). * * Returns: A reference to the usb interface's DMA endpoint; or NULL if none * exists. */ struct device *usb_intf_get_dma_device(struct usb_interface *intf) { struct usb_device *udev = interface_to_usbdev(intf); struct device *dmadev; if (!udev->bus) return NULL; dmadev = get_device(udev->bus->sysdev); if (!dmadev || !dmadev->dma_mask) { put_device(dmadev); return NULL; } return dmadev; } EXPORT_SYMBOL_GPL(usb_intf_get_dma_device); /* USB device locking * * USB devices and interfaces are locked using the semaphore in their * embedded struct device. The hub driver guarantees that whenever a * device is connected or disconnected, drivers are called with the * USB device locked as well as their particular interface. * * Complications arise when several devices are to be locked at the same * time. Only hub-aware drivers that are part of usbcore ever have to * do this; nobody else needs to worry about it. The rule for locking * is simple: * * When locking both a device and its parent, always lock the * parent first. */ /** * usb_lock_device_for_reset - cautiously acquire the lock for a usb device structure * @udev: device that's being locked * @iface: interface bound to the driver making the request (optional) * * Attempts to acquire the device lock, but fails if the device is * NOTATTACHED or SUSPENDED, or if iface is specified and the interface * is neither BINDING nor BOUND. Rather than sleeping to wait for the * lock, the routine polls repeatedly. This is to prevent deadlock with * disconnect; in some drivers (such as usb-storage) the disconnect() * or suspend() method will block waiting for a device reset to complete. * * Return: A negative error code for failure, otherwise 0. 
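 *
 * Example (illustrative, not from the original kernel-doc) of the usual
 * pairing with usb_reset_device() and usb_unlock_device(), assuming "udev",
 * "intf" and "ret" are locals of the calling driver:
 *
 *	ret = usb_lock_device_for_reset(udev, intf);
 *	if (ret == 0) {
 *		ret = usb_reset_device(udev);
 *		usb_unlock_device(udev);
 *	}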
*/ int usb_lock_device_for_reset(struct usb_device *udev, const struct usb_interface *iface) { unsigned long jiffies_expire = jiffies + HZ; if (udev->state == USB_STATE_NOTATTACHED) return -ENODEV; if (udev->state == USB_STATE_SUSPENDED) return -EHOSTUNREACH; if (iface && (iface->condition == USB_INTERFACE_UNBINDING || iface->condition == USB_INTERFACE_UNBOUND)) return -EINTR; while (!usb_trylock_device(udev)) { /* If we can't acquire the lock after waiting one second, * we're probably deadlocked */ if (time_after(jiffies, jiffies_expire)) return -EBUSY; msleep(15); if (udev->state == USB_STATE_NOTATTACHED) return -ENODEV; if (udev->state == USB_STATE_SUSPENDED) return -EHOSTUNREACH; if (iface && (iface->condition == USB_INTERFACE_UNBINDING || iface->condition == USB_INTERFACE_UNBOUND)) return -EINTR; } return 0; } EXPORT_SYMBOL_GPL(usb_lock_device_for_reset); /** * usb_get_current_frame_number - return current bus frame number * @dev: the device whose bus is being queried * * Return: The current frame number for the USB host controller used * with the given USB device. This can be used when scheduling * isochronous requests. * * Note: Different kinds of host controller have different "scheduling * horizons". While one type might support scheduling only 32 frames * into the future, others could support scheduling up to 1024 frames * into the future. * */ int usb_get_current_frame_number(struct usb_device *dev) { return usb_hcd_get_frame_number(dev); } EXPORT_SYMBOL_GPL(usb_get_current_frame_number); /*-------------------------------------------------------------------*/ /* * __usb_get_extra_descriptor() finds a descriptor of specific type in the * extra field of the interface and endpoint descriptor structs. */ int __usb_get_extra_descriptor(char *buffer, unsigned size, unsigned char type, void **ptr, size_t minsize) { struct usb_descriptor_header *header; while (size >= sizeof(struct usb_descriptor_header)) { header = (struct usb_descriptor_header *)buffer; if (header->bLength < 2 || header->bLength > size) { printk(KERN_ERR "%s: bogus descriptor, type %d length %d\n", usbcore_name, header->bDescriptorType, header->bLength); return -1; } if (header->bDescriptorType == type && header->bLength >= minsize) { *ptr = header; return 0; } buffer += header->bLength; size -= header->bLength; } return -1; } EXPORT_SYMBOL_GPL(__usb_get_extra_descriptor); /** * usb_alloc_coherent - allocate dma-consistent buffer for URB_NO_xxx_DMA_MAP * @dev: device the buffer will be used with * @size: requested buffer size * @mem_flags: affect whether allocation may block * @dma: used to return DMA address of buffer * * Return: Either null (indicating no buffer could be allocated), or the * cpu-space pointer to a buffer that may be used to perform DMA to the * specified device. Such cpu-space buffers are returned along with the DMA * address (through the pointer provided). * * Note: * These buffers are used with URB_NO_xxx_DMA_MAP set in urb->transfer_flags * to avoid behaviors like using "DMA bounce buffers", or thrashing IOMMU * hardware during URB completion/resubmit. The implementation varies between * platforms, depending on details of how DMA will work to this device. * Using these buffers also eliminates cacheline sharing problems on * architectures where CPU caches are not DMA-coherent. On systems without * bus-snooping caches, these buffers are uncached. * * When the buffer is no longer used, free it with usb_free_coherent(). 
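 *
 * Example (illustrative, not from the original kernel-doc; "udev" and "len"
 * are assumed locals of the caller):
 *
 *	dma_addr_t dma;
 *	void *buf = usb_alloc_coherent(udev, len, GFP_KERNEL, &dma);
 *
 *	if (!buf)
 *		return -ENOMEM;
 *	...
 *	usb_free_coherent(udev, len, buf, dma);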
*/ void *usb_alloc_coherent(struct usb_device *dev, size_t size, gfp_t mem_flags, dma_addr_t *dma) { if (!dev || !dev->bus) return NULL; return hcd_buffer_alloc(dev->bus, size, mem_flags, dma); } EXPORT_SYMBOL_GPL(usb_alloc_coherent); /** * usb_free_coherent - free memory allocated with usb_alloc_coherent() * @dev: device the buffer was used with * @size: requested buffer size * @addr: CPU address of buffer * @dma: DMA address of buffer * * This reclaims an I/O buffer, letting it be reused. The memory must have * been allocated using usb_alloc_coherent(), and the parameters must match * those provided in that allocation request. */ void usb_free_coherent(struct usb_device *dev, size_t size, void *addr, dma_addr_t dma) { if (!dev || !dev->bus) return; if (!addr) return; hcd_buffer_free(dev->bus, size, addr, dma); } EXPORT_SYMBOL_GPL(usb_free_coherent); /** * usb_alloc_noncoherent - allocate dma-noncoherent buffer for URB_NO_xxx_DMA_MAP * @dev: device the buffer will be used with * @size: requested buffer size * @mem_flags: affect whether allocation may block * @dma: used to return DMA address of buffer * @dir: DMA transfer direction * @table: used to return sg_table of allocated memory * * To explicit manage the memory ownership for the kernel vs the device by * USB core, the user needs save sg_table to urb->sgt. Then USB core will * do DMA sync for CPU and device properly. * * When the buffer is no longer used, free it with usb_free_noncoherent(). * * Return: Either null (indicating no buffer could be allocated), or the * cpu-space pointer to a buffer that may be used to perform DMA to the * specified device. Such cpu-space buffers are returned along with the DMA * address (through the pointer provided). */ void *usb_alloc_noncoherent(struct usb_device *dev, size_t size, gfp_t mem_flags, dma_addr_t *dma, enum dma_data_direction dir, struct sg_table **table) { struct device *dmadev; struct sg_table *sgt; void *buffer; if (!dev || !dev->bus) return NULL; dmadev = bus_to_hcd(dev->bus)->self.sysdev; sgt = dma_alloc_noncontiguous(dmadev, size, dir, mem_flags, 0); if (!sgt) return NULL; buffer = dma_vmap_noncontiguous(dmadev, size, sgt); if (!buffer) { dma_free_noncontiguous(dmadev, size, sgt, dir); return NULL; } *table = sgt; *dma = sg_dma_address(sgt->sgl); return buffer; } EXPORT_SYMBOL_GPL(usb_alloc_noncoherent); /** * usb_free_noncoherent - free memory allocated with usb_alloc_noncoherent() * @dev: device the buffer was used with * @size: requested buffer size * @addr: CPU address of buffer * @dir: DMA transfer direction * @table: describe the allocated and DMA mapped memory, * * This reclaims an I/O buffer, letting it be reused. The memory must have * been allocated using usb_alloc_noncoherent(), and the parameters must match * those provided in that allocation request. */ void usb_free_noncoherent(struct usb_device *dev, size_t size, void *addr, enum dma_data_direction dir, struct sg_table *table) { struct device *dmadev; if (!dev || !dev->bus) return; if (!addr) return; dmadev = bus_to_hcd(dev->bus)->self.sysdev; dma_vunmap_noncontiguous(dmadev, addr); dma_free_noncontiguous(dmadev, size, table, dir); } EXPORT_SYMBOL_GPL(usb_free_noncoherent); /** * usb_endpoint_max_periodic_payload - Get maximum payload bytes per service * interval * @udev: The USB device * @ep: The endpoint * * Returns: the maximum number of bytes isochronous or interrupt endpoint @ep * can transfer during a service interval, or 0 for other endpoints. 
*/ u32 usb_endpoint_max_periodic_payload(struct usb_device *udev, const struct usb_host_endpoint *ep) { if (!usb_endpoint_xfer_isoc(&ep->desc) && !usb_endpoint_xfer_int(&ep->desc)) return 0; switch (udev->speed) { case USB_SPEED_SUPER_PLUS: if (USB_SS_SSP_ISOC_COMP(ep->ss_ep_comp.bmAttributes)) return le32_to_cpu(ep->ssp_isoc_ep_comp.dwBytesPerInterval); fallthrough; case USB_SPEED_SUPER: return le16_to_cpu(ep->ss_ep_comp.wBytesPerInterval); default: if (usb_endpoint_is_hs_isoc_double(udev, ep)) return le32_to_cpu(ep->eusb2_isoc_ep_comp.dwBytesPerInterval); return usb_endpoint_maxp(&ep->desc) * usb_endpoint_maxp_mult(&ep->desc); } } EXPORT_SYMBOL_GPL(usb_endpoint_max_periodic_payload); /** * usb_endpoint_is_hs_isoc_double - Tell whether an endpoint uses USB 2 * Isochronous Double IN Bandwidth * @udev: The USB device * @ep: The endpoint * * Returns: true if an endpoint @ep conforms to USB 2 Isochronous Double IN * Bandwidth ECN, false otherwise. */ bool usb_endpoint_is_hs_isoc_double(struct usb_device *udev, const struct usb_host_endpoint *ep) { return ep->eusb2_isoc_ep_comp.bDescriptorType && le16_to_cpu(udev->descriptor.bcdUSB) == 0x220 && usb_endpoint_is_isoc_in(&ep->desc) && !le16_to_cpu(ep->desc.wMaxPacketSize); } EXPORT_SYMBOL_GPL(usb_endpoint_is_hs_isoc_double); /* * Notifications of device and interface registration */ static int usb_bus_notify(struct notifier_block *nb, unsigned long action, void *data) { struct device *dev = data; switch (action) { case BUS_NOTIFY_ADD_DEVICE: if (dev->type == &usb_device_type) (void) usb_create_sysfs_dev_files(to_usb_device(dev)); else if (dev->type == &usb_if_device_type) usb_create_sysfs_intf_files(to_usb_interface(dev)); break; case BUS_NOTIFY_DEL_DEVICE: if (dev->type == &usb_device_type) usb_remove_sysfs_dev_files(to_usb_device(dev)); else if (dev->type == &usb_if_device_type) usb_remove_sysfs_intf_files(to_usb_interface(dev)); break; } return 0; } static struct notifier_block usb_bus_nb = { .notifier_call = usb_bus_notify, }; static void usb_debugfs_init(void) { debugfs_create_file("devices", 0444, usb_debug_root, NULL, &usbfs_devices_fops); } static void usb_debugfs_cleanup(void) { debugfs_lookup_and_remove("devices", usb_debug_root); } /* * Init */ static int __init usb_init(void) { int retval; if (usb_disabled()) { pr_info("%s: USB support disabled\n", usbcore_name); return 0; } usb_init_pool_max(); usb_debugfs_init(); usb_acpi_register(); retval = bus_register(&usb_bus_type); if (retval) goto bus_register_failed; retval = bus_register_notifier(&usb_bus_type, &usb_bus_nb); if (retval) goto bus_notifier_failed; retval = usb_major_init(); if (retval) goto major_init_failed; retval = class_register(&usbmisc_class); if (retval) goto class_register_failed; retval = usb_register(&usbfs_driver); if (retval) goto driver_register_failed; retval = usb_devio_init(); if (retval) goto usb_devio_init_failed; retval = usb_hub_init(); if (retval) goto hub_init_failed; retval = usb_register_device_driver(&usb_generic_driver, THIS_MODULE); if (!retval) goto out; usb_hub_cleanup(); hub_init_failed: usb_devio_cleanup(); usb_devio_init_failed: usb_deregister(&usbfs_driver); driver_register_failed: class_unregister(&usbmisc_class); class_register_failed: usb_major_cleanup(); major_init_failed: bus_unregister_notifier(&usb_bus_type, &usb_bus_nb); bus_notifier_failed: bus_unregister(&usb_bus_type); bus_register_failed: usb_acpi_unregister(); usb_debugfs_cleanup(); out: return retval; } /* * Cleanup */ static void __exit usb_exit(void) { /* This will matter if 
shutdown/reboot does exitcalls. */ if (usb_disabled()) return; usb_release_quirk_list(); usb_deregister_device_driver(&usb_generic_driver); usb_major_cleanup(); usb_deregister(&usbfs_driver); usb_devio_cleanup(); usb_hub_cleanup(); class_unregister(&usbmisc_class); bus_unregister_notifier(&usb_bus_type, &usb_bus_nb); bus_unregister(&usb_bus_type); usb_acpi_unregister(); usb_debugfs_cleanup(); idr_destroy(&usb_bus_idr); } subsys_initcall(usb_init); module_exit(usb_exit); MODULE_DESCRIPTION("USB core host-side support"); MODULE_LICENSE("GPL");
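/*
 * Illustrative sketch, not part of the original file: driving the exported
 * usb_for_each_dev() iterator documented above.  The names count_devices()
 * and report_usb_devices() are hypothetical and not wired up anywhere.
 */
static int count_devices(struct usb_device *udev, void *data)
{
	unsigned int *count = data;

	(*count)++;
	return 0;	/* returning non-zero would stop the iteration */
}

static void report_usb_devices(void)
{
	unsigned int count = 0;

	usb_for_each_dev(&count, count_devices);
	pr_info("%u USB devices registered\n", count);
}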
2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 
3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 
4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 
5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 
5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 
// SPDX-License-Identifier: GPL-2.0
/*
 * USB hub driver.
 *
 * (C) Copyright 1999 Linus Torvalds
 * (C) Copyright 1999 Johannes Erdfelt
 * (C) Copyright 1999 Gregory P. Smith
 * (C) Copyright 2001 Brad Hards (bhards@bigpond.net.au)
 *
 * Released under the GPLv2 only.
 */

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/completion.h>
#include <linux/sched/mm.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/string_choices.h>
#include <linux/kcov.h>
#include <linux/ioctl.h>
#include <linux/usb.h>
#include <linux/usbdevice_fs.h>
#include <linux/usb/hcd.h>
#include <linux/usb/onboard_dev.h>
#include <linux/usb/otg.h>
#include <linux/usb/quirks.h>
#include <linux/workqueue.h>
#include <linux/minmax.h>
#include <linux/mutex.h>
#include <linux/random.h>
#include <linux/pm_qos.h>
#include <linux/kobject.h>
#include <linux/bitfield.h>
#include <linux/uaccess.h>
#include <asm/byteorder.h>

#include "hub.h"
#include "phy.h"
#include "otg_productlist.h"
#include "trace.h"

#define USB_VENDOR_GENESYS_LOGIC		0x05e3

#define USB_VENDOR_SMSC				0x0424
#define USB_PRODUCT_USB5534B			0x5534

#define USB_VENDOR_CYPRESS			0x04b4
#define USB_PRODUCT_CY7C65632			0x6570

#define USB_VENDOR_TEXAS_INSTRUMENTS		0x0451
#define USB_PRODUCT_TUSB8041_USB3		0x8140
#define USB_PRODUCT_TUSB8041_USB2		0x8142

#define USB_VENDOR_MICROCHIP			0x0424
#define USB_PRODUCT_USB4913			0x4913
#define USB_PRODUCT_USB4914			0x4914
#define USB_PRODUCT_USB4915			0x4915

#define HUB_QUIRK_CHECK_PORT_AUTOSUSPEND	BIT(0)
#define HUB_QUIRK_DISABLE_AUTOSUSPEND		BIT(1)
#define HUB_QUIRK_REDUCE_FRAME_INTR_BINTERVAL	BIT(2)

#define USB_TP_TRANSMISSION_DELAY	40	/* ns */
#define USB_TP_TRANSMISSION_DELAY_MAX	65535	/* ns */
#define USB_PING_RESPONSE_TIME		400	/* ns */
#define USB_REDUCE_FRAME_INTR_BINTERVAL	9

/*
 * The SET_ADDRESS request timeout will be 500 ms when
 * USB_QUIRK_SHORT_SET_ADDRESS_REQ_TIMEOUT quirk flag is set.
 */
#define USB_SHORT_SET_ADDRESS_REQ_TIMEOUT	500	/* ms */

/*
 * Give SS hubs 200ms time after wake to train downstream links before
 * assuming no port activity and allowing hub to runtime suspend back.
 */
#define USB_SS_PORT_U0_WAKE_TIME	200	/* ms */

/* Protect struct usb_device->state and ->children members
 * Note: Both are also protected by ->dev.sem, except that ->state can
 * change to USB_STATE_NOTATTACHED even when the semaphore isn't held.
 */
static DEFINE_SPINLOCK(device_state_lock);

/* workqueue to process hub events */
static struct workqueue_struct *hub_wq;
static void hub_event(struct work_struct *work);

/* synchronize hub-port add/remove and peering operations */
DEFINE_MUTEX(usb_port_peer_mutex);

/* cycle leds on hubs that aren't blinking for attention */
static bool blinkenlights;
module_param(blinkenlights, bool, S_IRUGO);
MODULE_PARM_DESC(blinkenlights, "true to cycle leds on hubs");

/*
 * Device SATA8000 FW1.0 from DATAST0R Technology Corp requires about
 * 10 seconds to send reply for the initial 64-byte descriptor request.
 */
/* define initial 64-byte descriptor request timeout in milliseconds */
static int initial_descriptor_timeout = USB_CTRL_GET_TIMEOUT;
module_param(initial_descriptor_timeout, int, S_IRUGO|S_IWUSR);
MODULE_PARM_DESC(initial_descriptor_timeout,
		"initial 64-byte descriptor request timeout in milliseconds "
		"(default 5000 - 5.0 seconds)");

/*
 * As of 2.6.10 we introduce a new USB device initialization scheme which
 * closely resembles the way Windows works. Hopefully it will be compatible
 * with a wider range of devices than the old scheme. However some previously
 * working devices may start giving rise to "device not accepting address"
 * errors; if that happens the user can try the old scheme by adjusting the
 * following module parameters.
 *
 * For maximum flexibility there are two boolean parameters to control the
 * hub driver's behavior. On the first initialization attempt, if the
 * "old_scheme_first" parameter is set then the old scheme will be used,
 * otherwise the new scheme is used. If that fails and "use_both_schemes"
 * is set, then the driver will make another attempt, using the other scheme.
 */
static bool old_scheme_first;
module_param(old_scheme_first, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(old_scheme_first,
		"start with the old device initialization scheme");

static bool use_both_schemes = true;
module_param(use_both_schemes, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(use_both_schemes,
		"try the other device initialization scheme if the "
		"first one fails");

/* Mutual exclusion for EHCI CF initialization. This interferes with
 * port reset on some companion controllers.
 */
DECLARE_RWSEM(ehci_cf_port_reset_rwsem);
EXPORT_SYMBOL_GPL(ehci_cf_port_reset_rwsem);

#define HUB_DEBOUNCE_TIMEOUT	2000
#define HUB_DEBOUNCE_STEP	25
#define HUB_DEBOUNCE_STABLE	100

static int usb_reset_and_verify_device(struct usb_device *udev);
static int hub_port_disable(struct usb_hub *hub, int port1, int set_state);
static bool hub_port_warm_reset_required(struct usb_hub *hub, int port1,
		u16 portstatus);

static inline char *portspeed(struct usb_hub *hub, int portstatus)
{
	if (hub_is_superspeedplus(hub->hdev))
		return "10.0 Gb/s";
	if (hub_is_superspeed(hub->hdev))
		return "5.0 Gb/s";
	if (portstatus & USB_PORT_STAT_HIGH_SPEED)
		return "480 Mb/s";
	else if (portstatus & USB_PORT_STAT_LOW_SPEED)
		return "1.5 Mb/s";
	else
		return "12 Mb/s";
}

/* Note that hdev or one of its children must be locked! */
struct usb_hub *usb_hub_to_struct_hub(struct usb_device *hdev)
{
	if (!hdev || !hdev->actconfig || !hdev->maxchild)
		return NULL;
	return usb_get_intfdata(hdev->actconfig->interface[0]);
}

int usb_device_supports_lpm(struct usb_device *udev)
{
	/* Some devices have trouble with LPM */
	if (udev->quirks & USB_QUIRK_NO_LPM)
		return 0;

	/* Skip if the device BOS descriptor couldn't be read */
	if (!udev->bos)
		return 0;

	/* USB 2.1 (and greater) devices indicate LPM support through
	 * their USB 2.0 Extended Capabilities BOS descriptor.
	 */
	if (udev->speed == USB_SPEED_HIGH || udev->speed == USB_SPEED_FULL) {
		if (udev->bos->ext_cap &&
		    (USB_LPM_SUPPORT &
		     le32_to_cpu(udev->bos->ext_cap->bmAttributes)))
			return 1;
		return 0;
	}

	/*
	 * According to the USB 3.0 spec, all USB 3.0 devices must support LPM.
	 * However, there are some that don't, and they set the U1/U2 exit
	 * latencies to zero.
	 */
	if (!udev->bos->ss_cap) {
		dev_info(&udev->dev, "No LPM exit latency info found, disabling LPM.\n");
		return 0;
	}

	if (udev->bos->ss_cap->bU1devExitLat == 0 &&
	    udev->bos->ss_cap->bU2DevExitLat == 0) {
		if (udev->parent)
			dev_info(&udev->dev, "LPM exit latency is zeroed, disabling LPM.\n");
		else
			dev_info(&udev->dev, "We don't know the algorithms for LPM for this host, disabling LPM.\n");
		return 0;
	}

	if (!udev->parent || udev->parent->lpm_capable)
		return 1;
	return 0;
}

/*
 * Set the Maximum Exit Latency (MEL) for the host to wake up the path from
 * U1/U2, send a PING to the device and receive a PING_RESPONSE.
 * See USB 3.1 section C.1.5.2
 */
static void usb_set_lpm_mel(struct usb_device *udev,
		struct usb3_lpm_parameters *udev_lpm_params,
		unsigned int udev_exit_latency,
		struct usb_hub *hub,
		struct usb3_lpm_parameters *hub_lpm_params,
		unsigned int hub_exit_latency)
{
	unsigned int total_mel;

	/*
	 * tMEL1. time to transition path from host to device into U0.
	 * MEL for parent already contains the delay up to parent, so only add
	 * the exit latency for the last link (pick the slower exit latency),
	 * and the hub header decode latency. See USB 3.1 section C 2.2.1
	 * Store MEL in nanoseconds
	 */
	total_mel = hub_lpm_params->mel +
		max(udev_exit_latency, hub_exit_latency) * 1000 +
		hub->descriptor->u.ss.bHubHdrDecLat * 100;

	/*
	 * tMEL2. Time to submit PING packet. Sum of tTPTransmissionDelay for
	 * each link + wHubDelay for each hub. Add only for last link.
	 * tMEL4, the time for PING_RESPONSE to traverse upstream is similar.
	 * Multiply by 2 to include it as well.
	 */
	total_mel += (__le16_to_cpu(hub->descriptor->u.ss.wHubDelay) +
		      USB_TP_TRANSMISSION_DELAY) * 2;

	/*
	 * tMEL3, tPingResponse. Time taken by device to generate PING_RESPONSE
	 * after receiving PING. Also add 2100ns as stated in USB 3.1 C 1.5.2.4
	 * to cover the delay if the PING_RESPONSE is queued behind a Max Packet
	 * Size DP.
	 * Note these delays should be added only once for the entire path, so
	 * add them to the MEL of the device connected to the roothub.
	 */
	if (!hub->hdev->parent)
		total_mel += USB_PING_RESPONSE_TIME + 2100;

	udev_lpm_params->mel = total_mel;
}

/*
 * Set the maximum Device to Host Exit Latency (PEL) for the device to initiate
 * a transition from either U1 or U2.
 */
static void usb_set_lpm_pel(struct usb_device *udev,
		struct usb3_lpm_parameters *udev_lpm_params,
		unsigned int udev_exit_latency,
		struct usb_hub *hub,
		struct usb3_lpm_parameters *hub_lpm_params,
		unsigned int hub_exit_latency,
		unsigned int port_to_port_exit_latency)
{
	unsigned int first_link_pel;
	unsigned int hub_pel;

	/*
	 * First, the device sends an LFPS to transition the link between the
	 * device and the parent hub into U0. The exit latency is the bigger of
	 * the device exit latency or the hub exit latency.
	 */
	first_link_pel = max(udev_exit_latency, hub_exit_latency) * 1000;

	/*
	 * When the hub starts to receive the LFPS, there is a slight delay for
	 * it to figure out that one of the ports is sending an LFPS. Then it
	 * will forward the LFPS to its upstream link. The exit latency is the
	 * delay, plus the PEL that we calculated for this hub.
	 */
	hub_pel = port_to_port_exit_latency * 1000 + hub_lpm_params->pel;

	/*
	 * According to figure C-7 in the USB 3.0 spec, the PEL for this device
	 * is the greater of the two exit latencies.
	 */
	udev_lpm_params->pel = max(first_link_pel, hub_pel);
}

/*
 * Set the System Exit Latency (SEL) to indicate the total worst-case time from
 * when a device initiates a transition to U0, until when it will receive the
 * first packet from the host controller.
* * Section C.1.5.1 describes the four components to this: * - t1: device PEL * - t2: time for the ERDY to make it from the device to the host. * - t3: a host-specific delay to process the ERDY. * - t4: time for the packet to make it from the host to the device. * * t3 is specific to both the xHCI host and the platform the host is integrated * into. The Intel HW folks have said it's negligible, FIXME if a different * vendor says otherwise. */ static void usb_set_lpm_sel(struct usb_device *udev, struct usb3_lpm_parameters *udev_lpm_params) { struct usb_device *parent; unsigned int num_hubs; unsigned int total_sel; /* t1 = device PEL */ total_sel = udev_lpm_params->pel; /* How many external hubs are in between the device & the root port. */ for (parent = udev->parent, num_hubs = 0; parent->parent; parent = parent->parent) num_hubs++; /* t2 = 2.1us + 250ns * (num_hubs - 1) */ if (num_hubs > 0) total_sel += 2100 + 250 * (num_hubs - 1); /* t4 = 250ns * num_hubs */ total_sel += 250 * num_hubs; udev_lpm_params->sel = total_sel; } static void usb_set_lpm_parameters(struct usb_device *udev) { struct usb_hub *hub; unsigned int port_to_port_delay; unsigned int udev_u1_del; unsigned int udev_u2_del; unsigned int hub_u1_del; unsigned int hub_u2_del; if (!udev->lpm_capable || udev->speed < USB_SPEED_SUPER) return; /* Skip if the device BOS descriptor couldn't be read */ if (!udev->bos) return; hub = usb_hub_to_struct_hub(udev->parent); /* It doesn't take time to transition the roothub into U0, since it * doesn't have an upstream link. */ if (!hub) return; udev_u1_del = udev->bos->ss_cap->bU1devExitLat; udev_u2_del = le16_to_cpu(udev->bos->ss_cap->bU2DevExitLat); hub_u1_del = udev->parent->bos->ss_cap->bU1devExitLat; hub_u2_del = le16_to_cpu(udev->parent->bos->ss_cap->bU2DevExitLat); usb_set_lpm_mel(udev, &udev->u1_params, udev_u1_del, hub, &udev->parent->u1_params, hub_u1_del); usb_set_lpm_mel(udev, &udev->u2_params, udev_u2_del, hub, &udev->parent->u2_params, hub_u2_del); /* * Appendix C, section C.2.2.2, says that there is a slight delay from * when the parent hub notices the downstream port is trying to * transition to U0 to when the hub initiates a U0 transition on its * upstream port. The section says the delays are tPort2PortU1EL and * tPort2PortU2EL, but it doesn't define what they are. * * The hub chapter, sections 10.4.2.4 and 10.4.2.5 seem to be talking * about the same delays. Use the maximum delay calculations from those * sections. For U1, it's tHubPort2PortExitLat, which is 1us max. For * U2, it's tHubPort2PortExitLat + U2DevExitLat - U1DevExitLat. I * assume the device exit latencies they are talking about are the hub * exit latencies. * * What do we do if the U2 exit latency is less than the U1 exit * latency? It's possible, although not likely... */ port_to_port_delay = 1; usb_set_lpm_pel(udev, &udev->u1_params, udev_u1_del, hub, &udev->parent->u1_params, hub_u1_del, port_to_port_delay); if (hub_u2_del > hub_u1_del) port_to_port_delay = 1 + hub_u2_del - hub_u1_del; else port_to_port_delay = 1 + hub_u1_del; usb_set_lpm_pel(udev, &udev->u2_params, udev_u2_del, hub, &udev->parent->u2_params, hub_u2_del, port_to_port_delay); /* Now that we've got PEL, calculate SEL. 
*/ usb_set_lpm_sel(udev, &udev->u1_params); usb_set_lpm_sel(udev, &udev->u2_params); } /* USB 2.0 spec Section 11.24.4.5 */ static int get_hub_descriptor(struct usb_device *hdev, struct usb_hub_descriptor *desc) { int i, ret, size; unsigned dtype; if (hub_is_superspeed(hdev)) { dtype = USB_DT_SS_HUB; size = USB_DT_SS_HUB_SIZE; } else { dtype = USB_DT_HUB; size = sizeof(struct usb_hub_descriptor); } for (i = 0; i < 3; i++) { ret = usb_control_msg(hdev, usb_rcvctrlpipe(hdev, 0), USB_REQ_GET_DESCRIPTOR, USB_DIR_IN | USB_RT_HUB, dtype << 8, 0, desc, size, USB_CTRL_GET_TIMEOUT); if (hub_is_superspeed(hdev)) { if (ret == size) return ret; } else if (ret >= USB_DT_HUB_NONVAR_SIZE + 2) { /* Make sure we have the DeviceRemovable field. */ size = USB_DT_HUB_NONVAR_SIZE + desc->bNbrPorts / 8 + 1; if (ret < size) return -EMSGSIZE; return ret; } } return -EINVAL; } /* * USB 2.0 spec Section 11.24.2.1 */ static int clear_hub_feature(struct usb_device *hdev, int feature) { return usb_control_msg(hdev, usb_sndctrlpipe(hdev, 0), USB_REQ_CLEAR_FEATURE, USB_RT_HUB, feature, 0, NULL, 0, 1000); } /* * USB 2.0 spec Section 11.24.2.2 */ int usb_clear_port_feature(struct usb_device *hdev, int port1, int feature) { return usb_control_msg(hdev, usb_sndctrlpipe(hdev, 0), USB_REQ_CLEAR_FEATURE, USB_RT_PORT, feature, port1, NULL, 0, 1000); } /* * USB 2.0 spec Section 11.24.2.13 */ static int set_port_feature(struct usb_device *hdev, int port1, int feature) { return usb_control_msg(hdev, usb_sndctrlpipe(hdev, 0), USB_REQ_SET_FEATURE, USB_RT_PORT, feature, port1, NULL, 0, 1000); } static char *to_led_name(int selector) { switch (selector) { case HUB_LED_AMBER: return "amber"; case HUB_LED_GREEN: return "green"; case HUB_LED_OFF: return "off"; case HUB_LED_AUTO: return "auto"; default: return "??"; } } /* * USB 2.0 spec Section 11.24.2.7.1.10 and table 11-7 * for info about using port indicators */ static void set_port_led(struct usb_hub *hub, int port1, int selector) { struct usb_port *port_dev = hub->ports[port1 - 1]; int status; status = set_port_feature(hub->hdev, (selector << 8) | port1, USB_PORT_FEAT_INDICATOR); dev_dbg(&port_dev->dev, "indicator %s status %d\n", to_led_name(selector), status); } #define LED_CYCLE_PERIOD ((2*HZ)/3) static void led_work(struct work_struct *work) { struct usb_hub *hub = container_of(work, struct usb_hub, leds.work); struct usb_device *hdev = hub->hdev; unsigned i; unsigned changed = 0; int cursor = -1; if (hdev->state != USB_STATE_CONFIGURED || hub->quiescing) return; for (i = 0; i < hdev->maxchild; i++) { unsigned selector, mode; /* 30%-50% duty cycle */ switch (hub->indicator[i]) { /* cycle marker */ case INDICATOR_CYCLE: cursor = i; selector = HUB_LED_AUTO; mode = INDICATOR_AUTO; break; /* blinking green = sw attention */ case INDICATOR_GREEN_BLINK: selector = HUB_LED_GREEN; mode = INDICATOR_GREEN_BLINK_OFF; break; case INDICATOR_GREEN_BLINK_OFF: selector = HUB_LED_OFF; mode = INDICATOR_GREEN_BLINK; break; /* blinking amber = hw attention */ case INDICATOR_AMBER_BLINK: selector = HUB_LED_AMBER; mode = INDICATOR_AMBER_BLINK_OFF; break; case INDICATOR_AMBER_BLINK_OFF: selector = HUB_LED_OFF; mode = INDICATOR_AMBER_BLINK; break; /* blink green/amber = reserved */ case INDICATOR_ALT_BLINK: selector = HUB_LED_GREEN; mode = INDICATOR_ALT_BLINK_OFF; break; case INDICATOR_ALT_BLINK_OFF: selector = HUB_LED_AMBER; mode = INDICATOR_ALT_BLINK; break; default: continue; } if (selector != HUB_LED_AUTO) changed = 1; set_port_led(hub, i + 1, selector); hub->indicator[i] = mode; } if (!changed && 
blinkenlights) { cursor++; cursor %= hdev->maxchild; set_port_led(hub, cursor + 1, HUB_LED_GREEN); hub->indicator[cursor] = INDICATOR_CYCLE; changed++; } if (changed) queue_delayed_work(system_power_efficient_wq, &hub->leds, LED_CYCLE_PERIOD); } /* use a short timeout for hub/port status fetches */ #define USB_STS_TIMEOUT 1000 #define USB_STS_RETRIES 5 /* * USB 2.0 spec Section 11.24.2.6 */ static int get_hub_status(struct usb_device *hdev, struct usb_hub_status *data) { int i, status = -ETIMEDOUT; for (i = 0; i < USB_STS_RETRIES && (status == -ETIMEDOUT || status == -EPIPE); i++) { status = usb_control_msg(hdev, usb_rcvctrlpipe(hdev, 0), USB_REQ_GET_STATUS, USB_DIR_IN | USB_RT_HUB, 0, 0, data, sizeof(*data), USB_STS_TIMEOUT); } return status; } /* * USB 2.0 spec Section 11.24.2.7 * USB 3.1 takes into use the wValue and wLength fields, spec Section 10.16.2.6 */ static int get_port_status(struct usb_device *hdev, int port1, void *data, u16 value, u16 length) { int i, status = -ETIMEDOUT; for (i = 0; i < USB_STS_RETRIES && (status == -ETIMEDOUT || status == -EPIPE); i++) { status = usb_control_msg(hdev, usb_rcvctrlpipe(hdev, 0), USB_REQ_GET_STATUS, USB_DIR_IN | USB_RT_PORT, value, port1, data, length, USB_STS_TIMEOUT); } return status; } static int hub_ext_port_status(struct usb_hub *hub, int port1, int type, u16 *status, u16 *change, u32 *ext_status) { int ret; int len = 4; if (type != HUB_PORT_STATUS) len = 8; mutex_lock(&hub->status_mutex); ret = get_port_status(hub->hdev, port1, &hub->status->port, type, len); if (ret < len) { if (ret != -ENODEV) dev_err(hub->intfdev, "%s failed (err = %d)\n", __func__, ret); if (ret >= 0) ret = -EIO; } else { *status = le16_to_cpu(hub->status->port.wPortStatus); *change = le16_to_cpu(hub->status->port.wPortChange); if (type != HUB_PORT_STATUS && ext_status) *ext_status = le32_to_cpu( hub->status->port.dwExtPortStatus); ret = 0; } mutex_unlock(&hub->status_mutex); /* * There is no need to lock status_mutex here, because status_mutex * protects hub->status, and the phy driver only checks the port * status without changing the status. */ if (!ret) { struct usb_device *hdev = hub->hdev; /* * Only roothub will be notified of connection changes, * since the USB PHY only cares about changes at the next * level. 
*/ if (is_root_hub(hdev)) { struct usb_hcd *hcd = bus_to_hcd(hdev->bus); bool connect; bool connect_change; connect_change = *change & USB_PORT_STAT_C_CONNECTION; connect = *status & USB_PORT_STAT_CONNECTION; if (connect_change && connect) usb_phy_roothub_notify_connect(hcd->phy_roothub, port1 - 1); else if (connect_change) usb_phy_roothub_notify_disconnect(hcd->phy_roothub, port1 - 1); } } return ret; } int usb_hub_port_status(struct usb_hub *hub, int port1, u16 *status, u16 *change) { return hub_ext_port_status(hub, port1, HUB_PORT_STATUS, status, change, NULL); } static void hub_resubmit_irq_urb(struct usb_hub *hub) { unsigned long flags; int status; spin_lock_irqsave(&hub->irq_urb_lock, flags); if (hub->quiescing) { spin_unlock_irqrestore(&hub->irq_urb_lock, flags); return; } status = usb_submit_urb(hub->urb, GFP_ATOMIC); if (status && status != -ENODEV && status != -EPERM && status != -ESHUTDOWN) { dev_err(hub->intfdev, "resubmit --> %d\n", status); mod_timer(&hub->irq_urb_retry, jiffies + HZ); } spin_unlock_irqrestore(&hub->irq_urb_lock, flags); } static void hub_retry_irq_urb(struct timer_list *t) { struct usb_hub *hub = timer_container_of(hub, t, irq_urb_retry); hub_resubmit_irq_urb(hub); } static void kick_hub_wq(struct usb_hub *hub) { struct usb_interface *intf; if (hub->disconnected || work_pending(&hub->events)) return; /* * Suppress autosuspend until the event is proceed. * * Be careful and make sure that the symmetric operation is * always called. We are here only when there is no pending * work for this hub. Therefore put the interface either when * the new work is called or when it is canceled. */ intf = to_usb_interface(hub->intfdev); usb_autopm_get_interface_no_resume(intf); hub_get(hub); if (queue_work(hub_wq, &hub->events)) return; /* the work has already been scheduled */ usb_autopm_put_interface_async(intf); hub_put(hub); } void usb_kick_hub_wq(struct usb_device *hdev) { struct usb_hub *hub = usb_hub_to_struct_hub(hdev); if (hub) kick_hub_wq(hub); } /* * Let the USB core know that a USB 3.0 device has sent a Function Wake Device * Notification, which indicates it had initiated remote wakeup. * * USB 3.0 hubs do not report the port link state change from U3 to U0 when the * device initiates resume, so the USB core will not receive notice of the * resume through the normal hub interrupt URB. 
*/ void usb_wakeup_notification(struct usb_device *hdev, unsigned int portnum) { struct usb_hub *hub; struct usb_port *port_dev; if (!hdev) return; hub = usb_hub_to_struct_hub(hdev); if (hub) { port_dev = hub->ports[portnum - 1]; if (port_dev && port_dev->child) pm_wakeup_event(&port_dev->child->dev, 0); set_bit(portnum, hub->wakeup_bits); kick_hub_wq(hub); } } EXPORT_SYMBOL_GPL(usb_wakeup_notification); /* completion function, fires on port status changes and various faults */ static void hub_irq(struct urb *urb) { struct usb_hub *hub = urb->context; int status = urb->status; unsigned i; unsigned long bits; switch (status) { case -ENOENT: /* synchronous unlink */ case -ECONNRESET: /* async unlink */ case -ESHUTDOWN: /* hardware going away */ return; default: /* presumably an error */ /* Cause a hub reset after 10 consecutive errors */ dev_dbg(hub->intfdev, "transfer --> %d\n", status); if ((++hub->nerrors < 10) || hub->error) goto resubmit; hub->error = status; fallthrough; /* let hub_wq handle things */ case 0: /* we got data: port status changed */ bits = 0; for (i = 0; i < urb->actual_length; ++i) bits |= ((unsigned long) ((*hub->buffer)[i])) << (i*8); hub->event_bits[0] = bits; break; } hub->nerrors = 0; /* Something happened, let hub_wq figure it out */ kick_hub_wq(hub); resubmit: hub_resubmit_irq_urb(hub); } /* USB 2.0 spec Section 11.24.2.3 */ static inline int hub_clear_tt_buffer(struct usb_device *hdev, u16 devinfo, u16 tt) { /* Need to clear both directions for control ep */ if (((devinfo >> 11) & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_CONTROL) { int status = usb_control_msg(hdev, usb_sndctrlpipe(hdev, 0), HUB_CLEAR_TT_BUFFER, USB_RT_PORT, devinfo ^ 0x8000, tt, NULL, 0, 1000); if (status) return status; } return usb_control_msg(hdev, usb_sndctrlpipe(hdev, 0), HUB_CLEAR_TT_BUFFER, USB_RT_PORT, devinfo, tt, NULL, 0, 1000); } /* * enumeration blocks hub_wq for a long time. we use keventd instead, since * long blocking there is the exception, not the rule. accordingly, HCDs * talking to TTs must queue control transfers (not just bulk and iso), so * both can talk to the same hub concurrently. */ static void hub_tt_work(struct work_struct *work) { struct usb_hub *hub = container_of(work, struct usb_hub, tt.clear_work); unsigned long flags; spin_lock_irqsave(&hub->tt.lock, flags); while (!list_empty(&hub->tt.clear_list)) { struct list_head *next; struct usb_tt_clear *clear; struct usb_device *hdev = hub->hdev; const struct hc_driver *drv; int status; next = hub->tt.clear_list.next; clear = list_entry(next, struct usb_tt_clear, clear_list); list_del(&clear->clear_list); /* drop lock so HCD can concurrently report other TT errors */ spin_unlock_irqrestore(&hub->tt.lock, flags); status = hub_clear_tt_buffer(hdev, clear->devinfo, clear->tt); if (status && status != -ENODEV) dev_err(&hdev->dev, "clear tt %d (%04x) error %d\n", clear->tt, clear->devinfo, status); /* Tell the HCD, even if the operation failed */ drv = clear->hcd->driver; if (drv->clear_tt_buffer_complete) (drv->clear_tt_buffer_complete)(clear->hcd, clear->ep); kfree(clear); spin_lock_irqsave(&hub->tt.lock, flags); } spin_unlock_irqrestore(&hub->tt.lock, flags); } /** * usb_hub_set_port_power - control hub port's power state * @hdev: USB device belonging to the usb hub * @hub: target hub * @port1: port index * @set: expected status * * call this function to control port's power via setting or * clearing the port's PORT_POWER feature. * * Return: 0 if successful. A negative error code otherwise. 
*/ int usb_hub_set_port_power(struct usb_device *hdev, struct usb_hub *hub, int port1, bool set) { int ret; if (set) ret = set_port_feature(hdev, port1, USB_PORT_FEAT_POWER); else ret = usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_POWER); if (ret) return ret; if (set) set_bit(port1, hub->power_bits); else clear_bit(port1, hub->power_bits); return 0; } /** * usb_hub_clear_tt_buffer - clear control/bulk TT state in high speed hub * @urb: an URB associated with the failed or incomplete split transaction * * High speed HCDs use this to tell the hub driver that some split control or * bulk transaction failed in a way that requires clearing internal state of * a transaction translator. This is normally detected (and reported) from * interrupt context. * * It may not be possible for that hub to handle additional full (or low) * speed transactions until that state is fully cleared out. * * Return: 0 if successful. A negative error code otherwise. */ int usb_hub_clear_tt_buffer(struct urb *urb) { struct usb_device *udev = urb->dev; int pipe = urb->pipe; struct usb_tt *tt = udev->tt; unsigned long flags; struct usb_tt_clear *clear; /* we've got to cope with an arbitrary number of pending TT clears, * since each TT has "at least two" buffers that can need it (and * there can be many TTs per hub). even if they're uncommon. */ clear = kmalloc_obj(*clear, GFP_ATOMIC); if (clear == NULL) { dev_err(&udev->dev, "can't save CLEAR_TT_BUFFER state\n"); /* FIXME recover somehow ... RESET_TT? */ return -ENOMEM; } /* info that CLEAR_TT_BUFFER needs */ clear->tt = tt->multi ? udev->ttport : 1; clear->devinfo = usb_pipeendpoint (pipe); clear->devinfo |= ((u16)udev->devaddr) << 4; clear->devinfo |= usb_pipecontrol(pipe) ? (USB_ENDPOINT_XFER_CONTROL << 11) : (USB_ENDPOINT_XFER_BULK << 11); if (usb_pipein(pipe)) clear->devinfo |= 1 << 15; /* info for completion callback */ clear->hcd = bus_to_hcd(udev->bus); clear->ep = urb->ep; /* tell keventd to clear state for this TT */ spin_lock_irqsave(&tt->lock, flags); list_add_tail(&clear->clear_list, &tt->clear_list); schedule_work(&tt->clear_work); spin_unlock_irqrestore(&tt->lock, flags); return 0; } EXPORT_SYMBOL_GPL(usb_hub_clear_tt_buffer); static void hub_power_on(struct usb_hub *hub, bool do_delay) { int port1; /* Enable power on each port. Some hubs have reserved values * of LPSM (> 2) in their descriptors, even though they are * USB 2.0 hubs. Some hubs do not implement port-power switching * but only emulate it. In all cases, the ports won't work * unless we send these messages to the hub. 
*/ if (hub_is_port_power_switchable(hub)) dev_dbg(hub->intfdev, "enabling power on all ports\n"); else dev_dbg(hub->intfdev, "trying to enable port power on " "non-switchable hub\n"); for (port1 = 1; port1 <= hub->hdev->maxchild; port1++) if (test_bit(port1, hub->power_bits)) set_port_feature(hub->hdev, port1, USB_PORT_FEAT_POWER); else usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_POWER); if (do_delay) msleep(hub_power_on_good_delay(hub)); } static int hub_hub_status(struct usb_hub *hub, u16 *status, u16 *change) { int ret; mutex_lock(&hub->status_mutex); ret = get_hub_status(hub->hdev, &hub->status->hub); if (ret < 0) { if (ret != -ENODEV) dev_err(hub->intfdev, "%s failed (err = %d)\n", __func__, ret); } else { *status = le16_to_cpu(hub->status->hub.wHubStatus); *change = le16_to_cpu(hub->status->hub.wHubChange); ret = 0; } mutex_unlock(&hub->status_mutex); return ret; } static int hub_set_port_link_state(struct usb_hub *hub, int port1, unsigned int link_status) { return set_port_feature(hub->hdev, port1 | (link_status << 3), USB_PORT_FEAT_LINK_STATE); } /* * Disable a port and mark a logical connect-change event, so that some * time later hub_wq will disconnect() any existing usb_device on the port * and will re-enumerate if there actually is a device attached. */ static void hub_port_logical_disconnect(struct usb_hub *hub, int port1) { dev_dbg(&hub->ports[port1 - 1]->dev, "logical disconnect\n"); hub_port_disable(hub, port1, 1); /* FIXME let caller ask to power down the port: * - some devices won't enumerate without a VBUS power cycle * - SRP saves power that way * - ... new call, TBD ... * That's easy if this hub can switch power per-port, and * hub_wq reactivates the port later (timer, SRP, etc). * Powerdown must be optional, because of reset/DFU. */ set_bit(port1, hub->change_bits); kick_hub_wq(hub); } /** * usb_remove_device - disable a device's port on its parent hub * @udev: device to be disabled and removed * Context: @udev locked, must be able to sleep. * * After @udev's port has been disabled, hub_wq is notified and it will * see that the device has been disconnected. When the device is * physically unplugged and something is plugged in, the events will * be received and processed normally. * * Return: 0 if successful. A negative error code otherwise. */ int usb_remove_device(struct usb_device *udev) { struct usb_hub *hub; struct usb_interface *intf; int ret; if (!udev->parent) /* Can't remove a root hub */ return -EINVAL; hub = usb_hub_to_struct_hub(udev->parent); intf = to_usb_interface(hub->intfdev); ret = usb_autopm_get_interface(intf); if (ret < 0) return ret; set_bit(udev->portnum, hub->removed_bits); hub_port_logical_disconnect(hub, udev->portnum); usb_autopm_put_interface(intf); return 0; } enum hub_activation_type { HUB_INIT, HUB_INIT2, HUB_INIT3, /* INITs must come first */ HUB_POST_RESET, HUB_RESUME, HUB_RESET_RESUME, }; static void hub_init_func2(struct work_struct *ws); static void hub_init_func3(struct work_struct *ws); static void hub_activate(struct usb_hub *hub, enum hub_activation_type type) { struct usb_device *hdev = hub->hdev; struct usb_hcd *hcd; int ret; int port1; int status; bool need_debounce_delay = false; unsigned delay; /* Continue a partial initialization */ if (type == HUB_INIT2 || type == HUB_INIT3) { device_lock(&hdev->dev); /* Was the hub disconnected while we were waiting? 
*/ if (hub->disconnected) goto disconnected; if (type == HUB_INIT2) goto init2; goto init3; } hub_get(hub); /* The superspeed hub except for root hub has to use Hub Depth * value as an offset into the route string to locate the bits * it uses to determine the downstream port number. So hub driver * should send a set hub depth request to superspeed hub after * the superspeed hub is set configuration in initialization or * reset procedure. * * After a resume, port power should still be on. * For any other type of activation, turn it on. */ if (type != HUB_RESUME) { if (hdev->parent && hub_is_superspeed(hdev)) { ret = usb_control_msg(hdev, usb_sndctrlpipe(hdev, 0), HUB_SET_DEPTH, USB_RT_HUB, hdev->level - 1, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); if (ret < 0) dev_err(hub->intfdev, "set hub depth failed\n"); } /* Speed up system boot by using a delayed_work for the * hub's initial power-up delays. This is pretty awkward * and the implementation looks like a home-brewed sort of * setjmp/longjmp, but it saves at least 100 ms for each * root hub (assuming usbcore is compiled into the kernel * rather than as a module). It adds up. * * This can't be done for HUB_RESUME or HUB_RESET_RESUME * because for those activation types the ports have to be * operational when we return. In theory this could be done * for HUB_POST_RESET, but it's easier not to. */ if (type == HUB_INIT) { delay = hub_power_on_good_delay(hub); hub_power_on(hub, false); INIT_DELAYED_WORK(&hub->init_work, hub_init_func2); queue_delayed_work(system_power_efficient_wq, &hub->init_work, msecs_to_jiffies(delay)); /* Suppress autosuspend until init is done */ usb_autopm_get_interface_no_resume( to_usb_interface(hub->intfdev)); return; /* Continues at init2: below */ } else if (type == HUB_RESET_RESUME) { /* The internal host controller state for the hub device * may be gone after a host power loss on system resume. * Update the device's info so the HW knows it's a hub. */ hcd = bus_to_hcd(hdev->bus); if (hcd->driver->update_hub_device) { ret = hcd->driver->update_hub_device(hcd, hdev, &hub->tt, GFP_NOIO); if (ret < 0) { dev_err(hub->intfdev, "Host not accepting hub info update\n"); dev_err(hub->intfdev, "LS/FS devices and hubs may not work under this hub\n"); } } hub_power_on(hub, true); } else { hub_power_on(hub, true); } /* Give some time on remote wakeup to let links to transit to U0 */ } else if (hub_is_superspeed(hub->hdev)) msleep(20); init2: /* * Check each port and set hub->change_bits to let hub_wq know * which ports need attention. */ for (port1 = 1; port1 <= hdev->maxchild; ++port1) { struct usb_port *port_dev = hub->ports[port1 - 1]; struct usb_device *udev = port_dev->child; u16 portstatus, portchange; portstatus = portchange = 0; status = usb_hub_port_status(hub, port1, &portstatus, &portchange); if (status) goto abort; if (udev || (portstatus & USB_PORT_STAT_CONNECTION)) dev_dbg(&port_dev->dev, "status %04x change %04x\n", portstatus, portchange); /* * After anything other than HUB_RESUME (i.e., initialization * or any sort of reset), every port should be disabled. * Unconnected ports should likewise be disabled (paranoia), * and so should ports for which we have no usb_device. */ if ((portstatus & USB_PORT_STAT_ENABLE) && ( type != HUB_RESUME || !(portstatus & USB_PORT_STAT_CONNECTION) || !udev || udev->state == USB_STATE_NOTATTACHED)) { /* * USB3 protocol ports will automatically transition * to Enabled state when detect an USB3.0 device attach. 
* Do not disable USB3 protocol ports, just pretend * power was lost */ portstatus &= ~USB_PORT_STAT_ENABLE; if (!hub_is_superspeed(hdev)) usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_ENABLE); } /* Make sure a warm-reset request is handled by port_event */ if (type == HUB_RESUME && hub_port_warm_reset_required(hub, port1, portstatus)) set_bit(port1, hub->event_bits); /* * Add debounce if USB3 link is in polling/link training state. * Link will automatically transition to Enabled state after * link training completes. */ if (hub_is_superspeed(hdev) && ((portstatus & USB_PORT_STAT_LINK_STATE) == USB_SS_PORT_LS_POLLING)) need_debounce_delay = true; /* Clear status-change flags; we'll debounce later */ if (portchange & USB_PORT_STAT_C_CONNECTION) { need_debounce_delay = true; usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_CONNECTION); } if (portchange & USB_PORT_STAT_C_ENABLE) { need_debounce_delay = true; usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_ENABLE); } if (portchange & USB_PORT_STAT_C_RESET) { need_debounce_delay = true; usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_RESET); } if ((portchange & USB_PORT_STAT_C_BH_RESET) && hub_is_superspeed(hub->hdev)) { need_debounce_delay = true; usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_BH_PORT_RESET); } /* We can forget about a "removed" device when there's a * physical disconnect or the connect status changes. */ if (!(portstatus & USB_PORT_STAT_CONNECTION) || (portchange & USB_PORT_STAT_C_CONNECTION)) clear_bit(port1, hub->removed_bits); if (!udev || udev->state == USB_STATE_NOTATTACHED) { /* Tell hub_wq to disconnect the device or * check for a new connection or over current condition. * Based on USB2.0 Spec Section 11.12.5, * C_PORT_OVER_CURRENT could be set while * PORT_OVER_CURRENT is not. So check for any of them. */ if (udev || (portstatus & USB_PORT_STAT_CONNECTION) || (portchange & USB_PORT_STAT_C_CONNECTION) || (portstatus & USB_PORT_STAT_OVERCURRENT) || (portchange & USB_PORT_STAT_C_OVERCURRENT)) set_bit(port1, hub->change_bits); } else if (portstatus & USB_PORT_STAT_ENABLE) { bool port_resumed = (portstatus & USB_PORT_STAT_LINK_STATE) == USB_SS_PORT_LS_U0; /* The power session apparently survived the resume. * If there was an overcurrent or suspend change * (i.e., remote wakeup request), have hub_wq * take care of it. Look at the port link state * for USB 3.0 hubs, since they don't have a suspend * change bit, and they don't set the port link change * bit on device-initiated resume. */ if (portchange || (hub_is_superspeed(hub->hdev) && port_resumed)) set_bit(port1, hub->event_bits); } else if (udev->persist_enabled) { #ifdef CONFIG_PM udev->reset_resume = 1; #endif /* Don't set the change_bits when the device * was powered off. */ if (test_bit(port1, hub->power_bits)) set_bit(port1, hub->change_bits); } else { /* The power session is gone; tell hub_wq */ usb_set_device_state(udev, USB_STATE_NOTATTACHED); set_bit(port1, hub->change_bits); } } /* If no port-status-change flags were set, we don't need any * debouncing. If flags were set we can try to debounce the * ports all at once right now, instead of letting hub_wq do them * one at a time later on. * * If any port-status changes do occur during this delay, hub_wq * will see them later and handle them normally. 
*/ if (need_debounce_delay) { delay = HUB_DEBOUNCE_STABLE; /* Don't do a long sleep inside a workqueue routine */ if (type == HUB_INIT2) { INIT_DELAYED_WORK(&hub->init_work, hub_init_func3); queue_delayed_work(system_power_efficient_wq, &hub->init_work, msecs_to_jiffies(delay)); device_unlock(&hdev->dev); return; /* Continues at init3: below */ } else { msleep(delay); } } init3: hub->quiescing = 0; status = usb_submit_urb(hub->urb, GFP_NOIO); if (status < 0) dev_err(hub->intfdev, "activate --> %d\n", status); if (hub->has_indicators && blinkenlights) queue_delayed_work(system_power_efficient_wq, &hub->leds, LED_CYCLE_PERIOD); /* Scan all ports that need attention */ kick_hub_wq(hub); abort: if (type == HUB_INIT2 || type == HUB_INIT3) { /* Allow autosuspend if it was suppressed */ disconnected: usb_autopm_put_interface_async(to_usb_interface(hub->intfdev)); device_unlock(&hdev->dev); } if (type == HUB_RESUME && hub_is_superspeed(hub->hdev)) { /* give usb3 downstream links training time after hub resume */ usb_autopm_get_interface_no_resume( to_usb_interface(hub->intfdev)); queue_delayed_work(system_power_efficient_wq, &hub->post_resume_work, msecs_to_jiffies(USB_SS_PORT_U0_WAKE_TIME)); return; } hub_put(hub); } /* Implement the continuations for the delays above */ static void hub_init_func2(struct work_struct *ws) { struct usb_hub *hub = container_of(ws, struct usb_hub, init_work.work); hub_activate(hub, HUB_INIT2); } static void hub_init_func3(struct work_struct *ws) { struct usb_hub *hub = container_of(ws, struct usb_hub, init_work.work); hub_activate(hub, HUB_INIT3); } static void hub_post_resume(struct work_struct *ws) { struct usb_hub *hub = container_of(ws, struct usb_hub, post_resume_work.work); usb_autopm_put_interface_async(to_usb_interface(hub->intfdev)); hub_put(hub); } enum hub_quiescing_type { HUB_DISCONNECT, HUB_PRE_RESET, HUB_SUSPEND }; static void hub_quiesce(struct usb_hub *hub, enum hub_quiescing_type type) { struct usb_device *hdev = hub->hdev; unsigned long flags; int i; /* hub_wq and related activity won't re-trigger */ spin_lock_irqsave(&hub->irq_urb_lock, flags); hub->quiescing = 1; spin_unlock_irqrestore(&hub->irq_urb_lock, flags); if (type != HUB_SUSPEND) { /* Disconnect all the children */ for (i = 0; i < hdev->maxchild; ++i) { if (hub->ports[i]->child) usb_disconnect(&hub->ports[i]->child); } } /* Stop hub_wq and related activity */ timer_delete_sync(&hub->irq_urb_retry); flush_delayed_work(&hub->post_resume_work); usb_kill_urb(hub->urb); if (hub->has_indicators) cancel_delayed_work_sync(&hub->leds); if (hub->tt.hub) flush_work(&hub->tt.clear_work); } static void hub_pm_barrier_for_all_ports(struct usb_hub *hub) { int i; for (i = 0; i < hub->hdev->maxchild; ++i) pm_runtime_barrier(&hub->ports[i]->dev); } /* caller has locked the hub device */ static int hub_pre_reset(struct usb_interface *intf) { struct usb_hub *hub = usb_get_intfdata(intf); hub_quiesce(hub, HUB_PRE_RESET); hub->in_reset = 1; hub_pm_barrier_for_all_ports(hub); return 0; } /* caller has locked the hub device */ static int hub_post_reset(struct usb_interface *intf) { struct usb_hub *hub = usb_get_intfdata(intf); hub->in_reset = 0; hub_pm_barrier_for_all_ports(hub); hub_activate(hub, HUB_POST_RESET); return 0; } static int hub_configure(struct usb_hub *hub, struct usb_endpoint_descriptor *endpoint) { struct usb_hcd *hcd; struct usb_device *hdev = hub->hdev; struct device *hub_dev = hub->intfdev; u16 hubstatus, hubchange; u16 wHubCharacteristics; unsigned int pipe; int maxp, ret, i; char *message = 
"out of memory"; unsigned unit_load; unsigned full_load; unsigned maxchild; hub->buffer = kmalloc_obj(*hub->buffer); if (!hub->buffer) { ret = -ENOMEM; goto fail; } hub->status = kmalloc_obj(*hub->status); if (!hub->status) { ret = -ENOMEM; goto fail; } mutex_init(&hub->status_mutex); hub->descriptor = kzalloc_obj(*hub->descriptor); if (!hub->descriptor) { ret = -ENOMEM; goto fail; } /* Request the entire hub descriptor. * hub->descriptor can handle USB_MAXCHILDREN ports, * but a (non-SS) hub can/will return fewer bytes here. */ ret = get_hub_descriptor(hdev, hub->descriptor); if (ret < 0) { message = "can't read hub descriptor"; goto fail; } maxchild = USB_MAXCHILDREN; if (hub_is_superspeed(hdev)) maxchild = min_t(unsigned, maxchild, USB_SS_MAXPORTS); if (hub->descriptor->bNbrPorts > maxchild) { message = "hub has too many ports!"; ret = -ENODEV; goto fail; } else if (hub->descriptor->bNbrPorts == 0) { message = "hub doesn't have any ports!"; ret = -ENODEV; goto fail; } /* * Accumulate wHubDelay + 40ns for every hub in the tree of devices. * The resulting value will be used for SetIsochDelay() request. */ if (hub_is_superspeed(hdev) || hub_is_superspeedplus(hdev)) { u32 delay = __le16_to_cpu(hub->descriptor->u.ss.wHubDelay); if (hdev->parent) delay += hdev->parent->hub_delay; delay += USB_TP_TRANSMISSION_DELAY; hdev->hub_delay = min_t(u32, delay, USB_TP_TRANSMISSION_DELAY_MAX); } maxchild = hub->descriptor->bNbrPorts; dev_info(hub_dev, "%d port%s detected\n", maxchild, str_plural(maxchild)); hub->ports = kzalloc_objs(struct usb_port *, maxchild); if (!hub->ports) { ret = -ENOMEM; goto fail; } wHubCharacteristics = le16_to_cpu(hub->descriptor->wHubCharacteristics); if (hub_is_superspeed(hdev)) { unit_load = 150; full_load = 900; } else { unit_load = 100; full_load = 500; } /* FIXME for USB 3.0, skip for now */ if ((wHubCharacteristics & HUB_CHAR_COMPOUND) && !(hub_is_superspeed(hdev))) { char portstr[USB_MAXCHILDREN + 1]; for (i = 0; i < maxchild; i++) portstr[i] = hub->descriptor->u.hs.DeviceRemovable [((i + 1) / 8)] & (1 << ((i + 1) % 8)) ? 
'F' : 'R'; portstr[maxchild] = 0; dev_dbg(hub_dev, "compound device; port removable status: %s\n", portstr); } else dev_dbg(hub_dev, "standalone hub\n"); switch (wHubCharacteristics & HUB_CHAR_LPSM) { case HUB_CHAR_COMMON_LPSM: dev_dbg(hub_dev, "ganged power switching\n"); break; case HUB_CHAR_INDV_PORT_LPSM: dev_dbg(hub_dev, "individual port power switching\n"); break; case HUB_CHAR_NO_LPSM: case HUB_CHAR_LPSM: dev_dbg(hub_dev, "no power switching (usb 1.0)\n"); break; } switch (wHubCharacteristics & HUB_CHAR_OCPM) { case HUB_CHAR_COMMON_OCPM: dev_dbg(hub_dev, "global over-current protection\n"); break; case HUB_CHAR_INDV_PORT_OCPM: dev_dbg(hub_dev, "individual port over-current protection\n"); break; case HUB_CHAR_NO_OCPM: case HUB_CHAR_OCPM: dev_dbg(hub_dev, "no over-current protection\n"); break; } spin_lock_init(&hub->tt.lock); INIT_LIST_HEAD(&hub->tt.clear_list); INIT_WORK(&hub->tt.clear_work, hub_tt_work); switch (hdev->descriptor.bDeviceProtocol) { case USB_HUB_PR_FS: break; case USB_HUB_PR_HS_SINGLE_TT: dev_dbg(hub_dev, "Single TT\n"); hub->tt.hub = hdev; break; case USB_HUB_PR_HS_MULTI_TT: ret = usb_set_interface(hdev, 0, 1); if (ret == 0) { dev_dbg(hub_dev, "TT per port\n"); hub->tt.multi = 1; } else dev_err(hub_dev, "Using single TT (err %d)\n", ret); hub->tt.hub = hdev; break; case USB_HUB_PR_SS: /* USB 3.0 hubs don't have a TT */ break; default: dev_dbg(hub_dev, "Unrecognized hub protocol %d\n", hdev->descriptor.bDeviceProtocol); break; } /* Note 8 FS bit times == (8 bits / 12000000 bps) ~= 666ns */ switch (wHubCharacteristics & HUB_CHAR_TTTT) { case HUB_TTTT_8_BITS: if (hdev->descriptor.bDeviceProtocol != 0) { hub->tt.think_time = 666; dev_dbg(hub_dev, "TT requires at most %d " "FS bit times (%d ns)\n", 8, hub->tt.think_time); } break; case HUB_TTTT_16_BITS: hub->tt.think_time = 666 * 2; dev_dbg(hub_dev, "TT requires at most %d " "FS bit times (%d ns)\n", 16, hub->tt.think_time); break; case HUB_TTTT_24_BITS: hub->tt.think_time = 666 * 3; dev_dbg(hub_dev, "TT requires at most %d " "FS bit times (%d ns)\n", 24, hub->tt.think_time); break; case HUB_TTTT_32_BITS: hub->tt.think_time = 666 * 4; dev_dbg(hub_dev, "TT requires at most %d " "FS bit times (%d ns)\n", 32, hub->tt.think_time); break; } /* probe() zeroes hub->indicator[] */ if (wHubCharacteristics & HUB_CHAR_PORTIND) { hub->has_indicators = 1; dev_dbg(hub_dev, "Port indicators are supported\n"); } dev_dbg(hub_dev, "power on to power good time: %dms\n", hub->descriptor->bPwrOn2PwrGood * 2); /* power budgeting mostly matters with bus-powered hubs, * and battery-powered root hubs (may provide just 8 mA). 
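* A unit load is 100 mA for USB 2.0 ports and 150 mA for SuperSpeed ports, matching the unit_load/full_load values chosen above, so a bus-powered hub usually has only a single unit load to offer on each port (see 7.2.1).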
*/ ret = usb_get_std_status(hdev, USB_RECIP_DEVICE, 0, &hubstatus); if (ret) { message = "can't get hub status"; goto fail; } hcd = bus_to_hcd(hdev->bus); if (hdev == hdev->bus->root_hub) { if (hcd->power_budget > 0) hdev->bus_mA = hcd->power_budget; else hdev->bus_mA = full_load * maxchild; if (hdev->bus_mA >= full_load) hub->mA_per_port = full_load; else { hub->mA_per_port = hdev->bus_mA; hub->limited_power = 1; } } else if ((hubstatus & (1 << USB_DEVICE_SELF_POWERED)) == 0) { int remaining = hdev->bus_mA - hub->descriptor->bHubContrCurrent; dev_dbg(hub_dev, "hub controller current requirement: %dmA\n", hub->descriptor->bHubContrCurrent); hub->limited_power = 1; if (remaining < maxchild * unit_load) dev_warn(hub_dev, "insufficient power available " "to use all downstream ports\n"); hub->mA_per_port = unit_load; /* 7.2.1 */ } else { /* Self-powered external hub */ /* FIXME: What about battery-powered external hubs that * provide less current per port? */ hub->mA_per_port = full_load; } if (hub->mA_per_port < full_load) dev_dbg(hub_dev, "%umA bus power budget for each child\n", hub->mA_per_port); ret = hub_hub_status(hub, &hubstatus, &hubchange); if (ret < 0) { message = "can't get hub status"; goto fail; } /* local power status reports aren't always correct */ if (hdev->actconfig->desc.bmAttributes & USB_CONFIG_ATT_SELFPOWER) dev_dbg(hub_dev, "local power source is %s\n", (hubstatus & HUB_STATUS_LOCAL_POWER) ? "lost (inactive)" : "good"); if ((wHubCharacteristics & HUB_CHAR_OCPM) == 0) dev_dbg(hub_dev, "%sover-current condition exists\n", (hubstatus & HUB_STATUS_OVERCURRENT) ? "" : "no "); /* set up the interrupt endpoint * We use the EP's maxpacket size instead of (PORTS+1+7)/8 * bytes as USB2.0[11.12.3] says because some hubs are known * to send more data (and thus cause overflow). For root hubs, * maxpktsize is defined in hcd.c's fake endpoint descriptors * to be big enough for at least USB_MAXCHILDREN ports. */ pipe = usb_rcvintpipe(hdev, endpoint->bEndpointAddress); maxp = usb_maxpacket(hdev, pipe); if (maxp > sizeof(*hub->buffer)) maxp = sizeof(*hub->buffer); hub->urb = usb_alloc_urb(0, GFP_KERNEL); if (!hub->urb) { ret = -ENOMEM; goto fail; } usb_fill_int_urb(hub->urb, hdev, pipe, *hub->buffer, maxp, hub_irq, hub, endpoint->bInterval); /* maybe cycle the hub leds */ if (hub->has_indicators && blinkenlights) hub->indicator[0] = INDICATOR_CYCLE; mutex_lock(&usb_port_peer_mutex); for (i = 0; i < maxchild; i++) { ret = usb_hub_create_port_device(hub, i + 1); if (ret < 0) { dev_err(hub->intfdev, "couldn't create port%d device.\n", i + 1); break; } } hdev->maxchild = i; for (i = 0; i < hdev->maxchild; i++) { struct usb_port *port_dev = hub->ports[i]; pm_runtime_put(&port_dev->dev); } mutex_unlock(&usb_port_peer_mutex); if (ret < 0) goto fail; /* Update the HCD's internal representation of this hub before hub_wq * starts getting port status changes for devices under the hub. 
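* (For example, the xHCI driver uses this hook to record hub fields such as the number of ports and TT details in the hub's slot context.)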
*/ if (hcd->driver->update_hub_device) { ret = hcd->driver->update_hub_device(hcd, hdev, &hub->tt, GFP_KERNEL); if (ret < 0) { message = "can't update HCD hub info"; goto fail; } } usb_hub_adjust_deviceremovable(hdev, hub->descriptor); hub_activate(hub, HUB_INIT); return 0; fail: dev_err(hub_dev, "config failed, %s (err %d)\n", message, ret); /* hub_disconnect() frees urb and descriptor */ return ret; } static void hub_release(struct kref *kref) { struct usb_hub *hub = container_of(kref, struct usb_hub, kref); usb_put_dev(hub->hdev); usb_put_intf(to_usb_interface(hub->intfdev)); kfree(hub); } void hub_get(struct usb_hub *hub) { kref_get(&hub->kref); } void hub_put(struct usb_hub *hub) { kref_put(&hub->kref, hub_release); } static unsigned highspeed_hubs; static void hub_disconnect(struct usb_interface *intf) { struct usb_hub *hub = usb_get_intfdata(intf); struct usb_device *hdev = interface_to_usbdev(intf); int port1; /* * Stop adding new hub events. We do not want to block here and thus * will not try to remove any pending work item. */ hub->disconnected = 1; /* Disconnect all children and quiesce the hub */ hub->error = 0; hub_quiesce(hub, HUB_DISCONNECT); mutex_lock(&usb_port_peer_mutex); /* Avoid races with recursively_mark_NOTATTACHED() */ spin_lock_irq(&device_state_lock); port1 = hdev->maxchild; hdev->maxchild = 0; usb_set_intfdata(intf, NULL); spin_unlock_irq(&device_state_lock); for (; port1 > 0; --port1) usb_hub_remove_port_device(hub, port1); mutex_unlock(&usb_port_peer_mutex); if (hub->hdev->speed == USB_SPEED_HIGH) highspeed_hubs--; usb_free_urb(hub->urb); kfree(hub->ports); kfree(hub->descriptor); kfree(hub->status); kfree(hub->buffer); pm_suspend_ignore_children(&intf->dev, false); if (hub->quirk_disable_autosuspend) usb_autopm_put_interface(intf); onboard_dev_destroy_pdevs(&hub->onboard_devs); hub_put(hub); } static bool hub_descriptor_is_sane(struct usb_host_interface *desc) { /* Some hubs have a subclass of 1, which AFAICT according to the */ /* specs is not defined, but it works */ if (desc->desc.bInterfaceSubClass != 0 && desc->desc.bInterfaceSubClass != 1) return false; /* Multiple endpoints? What kind of mutant ninja-hub is this? */ if (desc->desc.bNumEndpoints != 1) return false; /* If the first endpoint is not interrupt IN, we'd better punt! */ if (!usb_endpoint_is_int_in(&desc->endpoint[0].desc)) return false; return true; } static int hub_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_host_interface *desc; struct usb_device *hdev; struct usb_hub *hub; desc = intf->cur_altsetting; hdev = interface_to_usbdev(intf); /* * The USB 2.0 spec prohibits hubs from having more than one * configuration or interface, and we rely on this prohibition. * Refuse to accept a device that violates it. */ if (hdev->descriptor.bNumConfigurations > 1 || hdev->actconfig->desc.bNumInterfaces > 1) { dev_err(&intf->dev, "Invalid hub with more than one config or interface\n"); return -EINVAL; } /* * Set default autosuspend delay as 0 to speedup bus suspend, * based on the below considerations: * * - Unlike other drivers, the hub driver does not rely on the * autosuspend delay to provide enough time to handle a wakeup * event, and the submitted status URB is just to check future * change on hub downstream ports, so it is safe to do it. 
* * - The patch might cause one or more auto supend/resume for * below very rare devices when they are plugged into hub * first time: * * devices having trouble initializing, and disconnect * themselves from the bus and then reconnect a second * or so later * * devices just for downloading firmware, and disconnects * themselves after completing it * * For these quite rare devices, their drivers may change the * autosuspend delay of their parent hub in the probe() to one * appropriate value to avoid the subtle problem if someone * does care it. * * - The patch may cause one or more auto suspend/resume on * hub during running 'lsusb', but it is probably too * infrequent to worry about. * * - Change autosuspend delay of hub can avoid unnecessary auto * suspend timer for hub, also may decrease power consumption * of USB bus. * * - If user has indicated to prevent autosuspend by passing * usbcore.autosuspend = -1 then keep autosuspend disabled. */ #ifdef CONFIG_PM if (hdev->dev.power.autosuspend_delay >= 0) pm_runtime_set_autosuspend_delay(&hdev->dev, 0); #endif /* * Hubs have proper suspend/resume support, except for root hubs * where the controller driver doesn't have bus_suspend and * bus_resume methods. */ if (hdev->parent) { /* normal device */ usb_enable_autosuspend(hdev); } else { /* root hub */ const struct hc_driver *drv = bus_to_hcd(hdev->bus)->driver; if (drv->bus_suspend && drv->bus_resume) usb_enable_autosuspend(hdev); } if (hdev->level == MAX_TOPO_LEVEL) { dev_err(&intf->dev, "Unsupported bus topology: hub nested too deep\n"); return -E2BIG; } #ifdef CONFIG_USB_OTG_DISABLE_EXTERNAL_HUB if (hdev->parent) { dev_warn(&intf->dev, "ignoring external hub\n"); return -ENODEV; } #endif if (!hub_descriptor_is_sane(desc)) { dev_err(&intf->dev, "bad descriptor, ignoring hub\n"); return -EIO; } /* We found a hub */ dev_info(&intf->dev, "USB hub found\n"); hub = kzalloc_obj(*hub); if (!hub) return -ENOMEM; kref_init(&hub->kref); hub->intfdev = &intf->dev; hub->hdev = hdev; INIT_DELAYED_WORK(&hub->leds, led_work); INIT_DELAYED_WORK(&hub->init_work, NULL); INIT_DELAYED_WORK(&hub->post_resume_work, hub_post_resume); INIT_WORK(&hub->events, hub_event); INIT_LIST_HEAD(&hub->onboard_devs); spin_lock_init(&hub->irq_urb_lock); timer_setup(&hub->irq_urb_retry, hub_retry_irq_urb, 0); usb_get_intf(intf); usb_get_dev(hdev); usb_set_intfdata(intf, hub); intf->needs_remote_wakeup = 1; pm_suspend_ignore_children(&intf->dev, true); if (hdev->speed == USB_SPEED_HIGH) highspeed_hubs++; if (id->driver_info & HUB_QUIRK_CHECK_PORT_AUTOSUSPEND) hub->quirk_check_port_auto_suspend = 1; if (id->driver_info & HUB_QUIRK_DISABLE_AUTOSUSPEND) { hub->quirk_disable_autosuspend = 1; usb_autopm_get_interface_no_resume(intf); } if ((id->driver_info & HUB_QUIRK_REDUCE_FRAME_INTR_BINTERVAL) && desc->endpoint[0].desc.bInterval > USB_REDUCE_FRAME_INTR_BINTERVAL) { desc->endpoint[0].desc.bInterval = USB_REDUCE_FRAME_INTR_BINTERVAL; /* Tell the HCD about the interrupt ep's new bInterval */ usb_set_interface(hdev, 0, 0); } if (hub_configure(hub, &desc->endpoint[0].desc) >= 0) { onboard_dev_create_pdevs(hdev, &hub->onboard_devs); return 0; } hub_disconnect(intf); return -ENODEV; } static int hub_ioctl(struct usb_interface *intf, unsigned int code, void *user_data) { struct usb_device *hdev = interface_to_usbdev(intf); struct usb_hub *hub = usb_hub_to_struct_hub(hdev); /* assert ifno == 0 (part of hub spec) */ switch (code) { case USBDEVFS_HUB_PORTINFO: { struct usbdevfs_hub_portinfo *info = user_data; int i; 
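/* Report the devnum of the child attached to each port, or 0 if the port is empty. */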
spin_lock_irq(&device_state_lock); if (hdev->devnum <= 0) info->nports = 0; else { info->nports = hdev->maxchild; for (i = 0; i < info->nports; i++) { if (hub->ports[i]->child == NULL) info->port[i] = 0; else info->port[i] = hub->ports[i]->child->devnum; } } spin_unlock_irq(&device_state_lock); return info->nports + 1; } default: return -ENOSYS; } } /* * Allow user programs to claim ports on a hub. When a device is attached * to one of these "claimed" ports, the program will "own" the device. */ static int find_port_owner(struct usb_device *hdev, unsigned port1, struct usb_dev_state ***ppowner) { struct usb_hub *hub = usb_hub_to_struct_hub(hdev); if (hdev->state == USB_STATE_NOTATTACHED) return -ENODEV; if (port1 == 0 || port1 > hdev->maxchild) return -EINVAL; /* Devices not managed by the hub driver * will always have maxchild equal to 0. */ *ppowner = &(hub->ports[port1 - 1]->port_owner); return 0; } /* In the following three functions, the caller must hold hdev's lock */ int usb_hub_claim_port(struct usb_device *hdev, unsigned port1, struct usb_dev_state *owner) { int rc; struct usb_dev_state **powner; rc = find_port_owner(hdev, port1, &powner); if (rc) return rc; if (*powner) return -EBUSY; *powner = owner; return rc; } EXPORT_SYMBOL_GPL(usb_hub_claim_port); int usb_hub_release_port(struct usb_device *hdev, unsigned port1, struct usb_dev_state *owner) { int rc; struct usb_dev_state **powner; rc = find_port_owner(hdev, port1, &powner); if (rc) return rc; if (*powner != owner) return -ENOENT; *powner = NULL; return rc; } EXPORT_SYMBOL_GPL(usb_hub_release_port); void usb_hub_release_all_ports(struct usb_device *hdev, struct usb_dev_state *owner) { struct usb_hub *hub = usb_hub_to_struct_hub(hdev); int n; for (n = 0; n < hdev->maxchild; n++) { if (hub->ports[n]->port_owner == owner) hub->ports[n]->port_owner = NULL; } } /* The caller must hold udev's lock */ bool usb_device_is_owned(struct usb_device *udev) { struct usb_hub *hub; if (udev->state == USB_STATE_NOTATTACHED || !udev->parent) return false; hub = usb_hub_to_struct_hub(udev->parent); return !!hub->ports[udev->portnum - 1]->port_owner; } static void update_port_device_state(struct usb_device *udev) { struct usb_hub *hub; struct usb_port *port_dev; if (udev->parent) { hub = usb_hub_to_struct_hub(udev->parent); /* * The Link Layer Validation System Driver (lvstest) * has a test step to unbind the hub before running the * rest of the procedure. This triggers hub_disconnect * which will set the hub's maxchild to 0, further * resulting in usb_hub_to_struct_hub returning NULL. 
*/ if (hub) { port_dev = hub->ports[udev->portnum - 1]; WRITE_ONCE(port_dev->state, udev->state); sysfs_notify_dirent(port_dev->state_kn); } } } static void update_usb_device_state(struct usb_device *udev, enum usb_device_state new_state) { if (udev->state == USB_STATE_SUSPENDED && new_state != USB_STATE_SUSPENDED) udev->active_duration -= jiffies; else if (new_state == USB_STATE_SUSPENDED && udev->state != USB_STATE_SUSPENDED) udev->active_duration += jiffies; udev->state = new_state; update_port_device_state(udev); trace_usb_set_device_state(udev); } static void recursively_mark_NOTATTACHED(struct usb_device *udev) { struct usb_hub *hub = usb_hub_to_struct_hub(udev); int i; for (i = 0; i < udev->maxchild; ++i) { if (hub->ports[i]->child) recursively_mark_NOTATTACHED(hub->ports[i]->child); } update_usb_device_state(udev, USB_STATE_NOTATTACHED); } /** * usb_set_device_state - change a device's current state (usbcore, hcds) * @udev: pointer to device whose state should be changed * @new_state: new state value to be stored * * udev->state is _not_ fully protected by the device lock. Although * most transitions are made only while holding the lock, the state can * change to USB_STATE_NOTATTACHED at almost any time. This * is so that devices can be marked as disconnected as soon as possible, * without having to wait for any semaphores to be released. As a result, * all changes to any device's state must be protected by the * device_state_lock spinlock. * * Once a device has been added to the device tree, all changes to its state * should be made using this routine. The state should _not_ be set directly. * * If udev->state is already USB_STATE_NOTATTACHED then no change is made. * Otherwise udev->state is set to new_state, and if new_state is * USB_STATE_NOTATTACHED then all of udev's descendants' states are also set * to USB_STATE_NOTATTACHED. */ void usb_set_device_state(struct usb_device *udev, enum usb_device_state new_state) { unsigned long flags; int wakeup = -1; spin_lock_irqsave(&device_state_lock, flags); if (udev->state == USB_STATE_NOTATTACHED) ; /* do nothing */ else if (new_state != USB_STATE_NOTATTACHED) { /* root hub wakeup capabilities are managed out-of-band * and may involve silicon errata ... ignore them here. */ if (udev->parent) { if (udev->state == USB_STATE_SUSPENDED || new_state == USB_STATE_SUSPENDED) ; /* No change to wakeup settings */ else if (new_state == USB_STATE_CONFIGURED) wakeup = (udev->quirks & USB_QUIRK_IGNORE_REMOTE_WAKEUP) ? 0 : udev->actconfig->desc.bmAttributes & USB_CONFIG_ATT_WAKEUP; else wakeup = 0; } update_usb_device_state(udev, new_state); } else recursively_mark_NOTATTACHED(udev); spin_unlock_irqrestore(&device_state_lock, flags); if (wakeup >= 0) device_set_wakeup_capable(&udev->dev, wakeup); } EXPORT_SYMBOL_GPL(usb_set_device_state); /* * Choose a device number. * * Device numbers are used as filenames in usbfs. On USB-1.1 and * USB-2.0 buses they are also used as device addresses, however on * USB-3.0 buses the address is assigned by the controller hardware * and it usually is not the same as the device number. * * Devices connected under xHCI are not as simple. The host controller * supports virtualization, so the hardware assigns device addresses and * the HCD must set up data structures before issuing a set address * command to the hardware.
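* Device numbers run from 1 to 127; the search below starts at * bus->devnum_next and wraps around to 1 when the end of the devmap * bitmap is reached.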
*/ static void choose_devnum(struct usb_device *udev) { int devnum; struct usb_bus *bus = udev->bus; /* be safe when more hub events are proceed in parallel */ mutex_lock(&bus->devnum_next_mutex); /* Try to allocate the next devnum beginning at bus->devnum_next. */ devnum = find_next_zero_bit(bus->devmap, 128, bus->devnum_next); if (devnum >= 128) devnum = find_next_zero_bit(bus->devmap, 128, 1); bus->devnum_next = (devnum >= 127 ? 1 : devnum + 1); if (devnum < 128) { set_bit(devnum, bus->devmap); udev->devnum = devnum; } mutex_unlock(&bus->devnum_next_mutex); } static void release_devnum(struct usb_device *udev) { if (udev->devnum > 0) { clear_bit(udev->devnum, udev->bus->devmap); udev->devnum = -1; } } static void update_devnum(struct usb_device *udev, int devnum) { udev->devnum = devnum; if (!udev->devaddr) udev->devaddr = (u8)devnum; } static void hub_free_dev(struct usb_device *udev) { struct usb_hcd *hcd = bus_to_hcd(udev->bus); /* Root hubs aren't real devices, so don't free HCD resources */ if (hcd->driver->free_dev && udev->parent) hcd->driver->free_dev(hcd, udev); } static void hub_disconnect_children(struct usb_device *udev) { struct usb_hub *hub = usb_hub_to_struct_hub(udev); int i; /* Free up all the children before we remove this device */ for (i = 0; i < udev->maxchild; i++) { if (hub->ports[i]->child) usb_disconnect(&hub->ports[i]->child); } } /** * usb_disconnect - disconnect a device (usbcore-internal) * @pdev: pointer to device being disconnected * * Context: task context, might sleep * * Something got disconnected. Get rid of it and all of its children. * * If *pdev is a normal device then the parent hub must already be locked. * If *pdev is a root hub then the caller must hold the usb_bus_idr_lock, * which protects the set of root hubs as well as the list of buses. * * Only hub drivers (including virtual root hub drivers for host * controllers) should ever call this. * * This call is synchronous, and may not be used in an interrupt context. */ void usb_disconnect(struct usb_device **pdev) { struct usb_port *port_dev = NULL; struct usb_device *udev = *pdev; struct usb_hub *hub = NULL; int port1 = 1; /* mark the device as inactive, so any further urb submissions for * this device (and any of its children) will fail immediately. * this quiesces everything except pending urbs. */ usb_set_device_state(udev, USB_STATE_NOTATTACHED); dev_info(&udev->dev, "USB disconnect, device number %d\n", udev->devnum); /* * Ensure that the pm runtime code knows that the USB device * is in the process of being disconnected. */ pm_runtime_barrier(&udev->dev); usb_lock_device(udev); hub_disconnect_children(udev); /* deallocate hcd/hardware state ... nuking all pending urbs and * cleaning up all state associated with the current configuration * so that the hardware is now fully quiesced. */ dev_dbg(&udev->dev, "unregistering device\n"); usb_disable_device(udev, 0); usb_hcd_synchronize_unlinks(udev); if (udev->parent) { port1 = udev->portnum; hub = usb_hub_to_struct_hub(udev->parent); port_dev = hub->ports[port1 - 1]; sysfs_remove_link(&udev->dev.kobj, "port"); sysfs_remove_link(&port_dev->dev.kobj, "device"); /* * As usb_port_runtime_resume() de-references udev, make * sure no resumes occur during removal */ if (!test_and_set_bit(port1, hub->child_usage_bits)) pm_runtime_get_sync(&port_dev->dev); typec_deattach(port_dev->connector, &udev->dev); } usb_remove_ep_devs(&udev->ep0); usb_unlock_device(udev); if (udev->usb4_link) device_link_del(udev->usb4_link); /* Unregister the device. 
The device driver is responsible * for de-configuring the device and invoking the remove-device * notifier chain (used by usbfs and possibly others). */ device_del(&udev->dev); /* Free the device number and delete the parent's children[] * (or root_hub) pointer. */ release_devnum(udev); /* Avoid races with recursively_mark_NOTATTACHED() */ spin_lock_irq(&device_state_lock); *pdev = NULL; spin_unlock_irq(&device_state_lock); if (port_dev && test_and_clear_bit(port1, hub->child_usage_bits)) pm_runtime_put(&port_dev->dev); hub_free_dev(udev); put_device(&udev->dev); } #ifdef CONFIG_USB_ANNOUNCE_NEW_DEVICES static void show_string(struct usb_device *udev, char *id, char *string) { if (!string) return; dev_info(&udev->dev, "%s: %s\n", id, string); } static void announce_device(struct usb_device *udev) { u16 bcdDevice = le16_to_cpu(udev->descriptor.bcdDevice); dev_info(&udev->dev, "New USB device found, idVendor=%04x, idProduct=%04x, bcdDevice=%2x.%02x\n", le16_to_cpu(udev->descriptor.idVendor), le16_to_cpu(udev->descriptor.idProduct), bcdDevice >> 8, bcdDevice & 0xff); dev_info(&udev->dev, "New USB device strings: Mfr=%d, Product=%d, SerialNumber=%d\n", udev->descriptor.iManufacturer, udev->descriptor.iProduct, udev->descriptor.iSerialNumber); show_string(udev, "Product", udev->product); show_string(udev, "Manufacturer", udev->manufacturer); show_string(udev, "SerialNumber", udev->serial); } #else static inline void announce_device(struct usb_device *udev) { } #endif /** * usb_enumerate_device_otg - FIXME (usbcore-internal) * @udev: newly addressed device (in ADDRESS state) * * Finish enumeration for On-The-Go devices * * Return: 0 if successful. A negative error code otherwise. */ static int usb_enumerate_device_otg(struct usb_device *udev) { int err = 0; #ifdef CONFIG_USB_OTG /* * OTG-aware devices on OTG-capable root hubs may be able to use SRP, * to wake us after we've powered off VBUS; and HNP, switching roles * "host" to "peripheral". The OTG descriptor helps figure this out. */ if (!udev->bus->is_b_host && udev->config && udev->parent == udev->bus->root_hub) { struct usb_otg_descriptor *desc = NULL; struct usb_bus *bus = udev->bus; unsigned port1 = udev->portnum; /* descriptor may appear anywhere in config */ err = __usb_get_extra_descriptor(udev->rawdescriptors[0], le16_to_cpu(udev->config[0].desc.wTotalLength), USB_DT_OTG, (void **) &desc, sizeof(*desc)); if (err || !(desc->bmAttributes & USB_OTG_HNP)) return 0; dev_info(&udev->dev, "Dual-Role OTG device on %sHNP port\n", (port1 == bus->otg_port) ? "" : "non-"); /* enable HNP before suspend, it's simpler */ if (port1 == bus->otg_port) { bus->b_hnp_enable = 1; err = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), USB_REQ_SET_FEATURE, 0, USB_DEVICE_B_HNP_ENABLE, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); if (err < 0) { /* * OTG MESSAGE: report errors here, * customize to match your product. 
*/ dev_err(&udev->dev, "can't set HNP mode: %d\n", err); bus->b_hnp_enable = 0; } } else if (desc->bLength == sizeof (struct usb_otg_descriptor)) { /* * We are operating on a legacy OTP device * These should be told that they are operating * on the wrong port if we have another port that does * support HNP */ if (bus->otg_port != 0) { /* Set a_alt_hnp_support for legacy otg device */ err = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), USB_REQ_SET_FEATURE, 0, USB_DEVICE_A_ALT_HNP_SUPPORT, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); if (err < 0) dev_err(&udev->dev, "set a_alt_hnp_support failed: %d\n", err); } } } #endif return err; } /** * usb_enumerate_device - Read device configs/intfs/otg (usbcore-internal) * @udev: newly addressed device (in ADDRESS state) * * This is only called by usb_new_device() -- all comments that apply there * apply here wrt to environment. * * If the device is WUSB and not authorized, we don't attempt to read * the string descriptors, as they will be errored out by the device * until it has been authorized. * * Return: 0 if successful. A negative error code otherwise. */ static int usb_enumerate_device(struct usb_device *udev) { int err; struct usb_hcd *hcd = bus_to_hcd(udev->bus); if (udev->config == NULL) { err = usb_get_configuration(udev); if (err < 0) { if (err != -ENODEV) dev_err(&udev->dev, "can't read configurations, error %d\n", err); return err; } } /* read the standard strings and cache them if present */ udev->product = usb_cache_string(udev, udev->descriptor.iProduct); udev->manufacturer = usb_cache_string(udev, udev->descriptor.iManufacturer); udev->serial = usb_cache_string(udev, udev->descriptor.iSerialNumber); err = usb_enumerate_device_otg(udev); if (err < 0) return err; if (IS_ENABLED(CONFIG_USB_OTG_PRODUCTLIST) && hcd->tpl_support && !is_targeted(udev)) { /* Maybe it can talk to us, though we can't talk to it. * (Includes HNP test device.) */ if (IS_ENABLED(CONFIG_USB_OTG) && (udev->bus->b_hnp_enable || udev->bus->is_b_host)) { err = usb_port_suspend(udev, PMSG_AUTO_SUSPEND); if (err < 0) dev_dbg(&udev->dev, "HNP fail, %d\n", err); } return -ENOTSUPP; } usb_detect_interface_quirks(udev); return 0; } static void set_usb_port_removable(struct usb_device *udev) { struct usb_device *hdev = udev->parent; struct usb_hub *hub; u8 port = udev->portnum; u16 wHubCharacteristics; bool removable = true; dev_set_removable(&udev->dev, DEVICE_REMOVABLE_UNKNOWN); if (!hdev) return; hub = usb_hub_to_struct_hub(udev->parent); /* * If the platform firmware has provided information about a port, * use that to determine whether it's removable. 
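* (On ACPI systems the port's connect_type is typically derived from the * firmware's _UPC and _PLD objects.)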
*/ switch (hub->ports[udev->portnum - 1]->connect_type) { case USB_PORT_CONNECT_TYPE_HOT_PLUG: dev_set_removable(&udev->dev, DEVICE_REMOVABLE); return; case USB_PORT_CONNECT_TYPE_HARD_WIRED: case USB_PORT_NOT_USED: dev_set_removable(&udev->dev, DEVICE_FIXED); return; default: break; } /* * Otherwise, check whether the hub knows whether a port is removable * or not */ wHubCharacteristics = le16_to_cpu(hub->descriptor->wHubCharacteristics); if (!(wHubCharacteristics & HUB_CHAR_COMPOUND)) return; if (hub_is_superspeed(hdev)) { if (le16_to_cpu(hub->descriptor->u.ss.DeviceRemovable) & (1 << port)) removable = false; } else { if (hub->descriptor->u.hs.DeviceRemovable[port / 8] & (1 << (port % 8))) removable = false; } if (removable) dev_set_removable(&udev->dev, DEVICE_REMOVABLE); else dev_set_removable(&udev->dev, DEVICE_FIXED); } /** * usb_new_device - perform initial device setup (usbcore-internal) * @udev: newly addressed device (in ADDRESS state) * * This is called with devices which have been detected but not fully * enumerated. The device descriptor is available, but not descriptors * for any device configuration. The caller must have locked either * the parent hub (if udev is a normal device) or else the * usb_bus_idr_lock (if udev is a root hub). The parent's pointer to * udev has already been installed, but udev is not yet visible through * sysfs or other filesystem code. * * This call is synchronous, and may not be used in an interrupt context. * * Only the hub driver or root-hub registrar should ever call this. * * Return: Whether the device is configured properly or not. Zero if the * interface was registered with the driver core; else a negative errno * value. * */ int usb_new_device(struct usb_device *udev) { int err; if (udev->parent) { /* Initialize non-root-hub device wakeup to disabled; * device (un)configuration controls wakeup capable * sysfs power/wakeup controls wakeup enabled/disabled */ device_init_wakeup(&udev->dev, 0); } /* Tell the runtime-PM framework the device is active */ pm_runtime_set_active(&udev->dev); pm_runtime_get_noresume(&udev->dev); pm_runtime_use_autosuspend(&udev->dev); pm_runtime_enable(&udev->dev); /* By default, forbid autosuspend for all devices. It will be * allowed for hubs during binding. */ usb_disable_autosuspend(udev); err = usb_enumerate_device(udev); /* Read descriptors */ if (err < 0) goto fail; dev_dbg(&udev->dev, "udev %d, busnum %d, minor = %d\n", udev->devnum, udev->bus->busnum, (((udev->bus->busnum-1) * 128) + (udev->devnum-1))); /* export the usbdev device-node for libusb */ udev->dev.devt = MKDEV(USB_DEVICE_MAJOR, (((udev->bus->busnum-1) * 128) + (udev->devnum-1))); /* Tell the world! */ announce_device(udev); if (udev->serial) add_device_randomness(udev->serial, strlen(udev->serial)); if (udev->product) add_device_randomness(udev->product, strlen(udev->product)); if (udev->manufacturer) add_device_randomness(udev->manufacturer, strlen(udev->manufacturer)); device_enable_async_suspend(&udev->dev); /* check whether the hub or firmware marks this port as non-removable */ set_usb_port_removable(udev); /* Register the device. The device driver is responsible * for configuring the device and invoking the add-device * notifier chain (used by usbfs and possibly others). */ err = device_add(&udev->dev); if (err) { dev_err(&udev->dev, "can't device_add, error %d\n", err); goto fail; } /* Create link files between child device and usb port device. 
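* (a "port" link under the child device and a "device" link under the port device)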
*/ if (udev->parent) { struct usb_hub *hub = usb_hub_to_struct_hub(udev->parent); int port1 = udev->portnum; struct usb_port *port_dev = hub->ports[port1 - 1]; err = sysfs_create_link(&udev->dev.kobj, &port_dev->dev.kobj, "port"); if (err) goto out_del_dev; err = sysfs_create_link(&port_dev->dev.kobj, &udev->dev.kobj, "device"); if (err) { sysfs_remove_link(&udev->dev.kobj, "port"); goto out_del_dev; } if (!test_and_set_bit(port1, hub->child_usage_bits)) pm_runtime_get_sync(&port_dev->dev); typec_attach(port_dev->connector, &udev->dev); } (void) usb_create_ep_devs(&udev->dev, &udev->ep0, udev); usb_mark_last_busy(udev); pm_runtime_put_sync_autosuspend(&udev->dev); return err; out_del_dev: device_del(&udev->dev); fail: usb_set_device_state(udev, USB_STATE_NOTATTACHED); pm_runtime_disable(&udev->dev); pm_runtime_set_suspended(&udev->dev); return err; } /** * usb_deauthorize_device - deauthorize a device (usbcore-internal) * @usb_dev: USB device * * Move the USB device to a very basic state where interfaces are disabled * and the device is in fact unconfigured and unusable. * * We share a lock (that we have) with device_del(), so we need to * defer its call. * * Return: 0. */ int usb_deauthorize_device(struct usb_device *usb_dev) { usb_lock_device(usb_dev); if (usb_dev->authorized == 0) goto out_unauthorized; usb_dev->authorized = 0; usb_set_configuration(usb_dev, -1); out_unauthorized: usb_unlock_device(usb_dev); return 0; } int usb_authorize_device(struct usb_device *usb_dev) { int result = 0, c; usb_lock_device(usb_dev); if (usb_dev->authorized == 1) goto out_authorized; result = usb_autoresume_device(usb_dev); if (result < 0) { dev_err(&usb_dev->dev, "can't autoresume for authorization: %d\n", result); goto error_autoresume; } usb_dev->authorized = 1; /* Choose and set the configuration. This registers the interfaces * with the driver core and lets interface drivers bind to them. */ c = usb_choose_configuration(usb_dev); if (c >= 0) { result = usb_set_configuration(usb_dev, c); if (result) { dev_err(&usb_dev->dev, "can't set config #%d, error %d\n", c, result); /* This need not be fatal. The user can try to * set other configurations. */ } } dev_info(&usb_dev->dev, "authorized to connect\n"); usb_autosuspend_device(usb_dev); error_autoresume: out_authorized: usb_unlock_device(usb_dev); /* complements locktree */ return result; } /** * get_port_ssp_rate - Match the extended port status to SSP rate * @hdev: The hub device * @ext_portstatus: extended port status * * Match the extended port status speed id to the SuperSpeed Plus sublink speed * capability attributes. Base on the number of connected lanes and speed, * return the corresponding enum usb_ssp_rate. 
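* For example, a 10 Gbps sublink speed on two lanes maps to USB_SSP_GEN_2x2, * while a 5 Gbps speed on two lanes maps to USB_SSP_GEN_1x2.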
*/ static enum usb_ssp_rate get_port_ssp_rate(struct usb_device *hdev, u32 ext_portstatus) { struct usb_ssp_cap_descriptor *ssp_cap; u32 attr; u8 speed_id; u8 ssac; u8 lanes; int i; if (!hdev->bos) goto out; ssp_cap = hdev->bos->ssp_cap; if (!ssp_cap) goto out; speed_id = ext_portstatus & USB_EXT_PORT_STAT_RX_SPEED_ID; lanes = USB_EXT_PORT_RX_LANES(ext_portstatus) + 1; ssac = le32_to_cpu(ssp_cap->bmAttributes) & USB_SSP_SUBLINK_SPEED_ATTRIBS; for (i = 0; i <= ssac; i++) { u8 ssid; attr = le32_to_cpu(ssp_cap->bmSublinkSpeedAttr[i]); ssid = FIELD_GET(USB_SSP_SUBLINK_SPEED_SSID, attr); if (speed_id == ssid) { u16 mantissa; u8 lse; u8 type; /* * Note: currently asymmetric lane types are only * applicable for SSIC operate in SuperSpeed protocol */ type = FIELD_GET(USB_SSP_SUBLINK_SPEED_ST, attr); if (type == USB_SSP_SUBLINK_SPEED_ST_ASYM_RX || type == USB_SSP_SUBLINK_SPEED_ST_ASYM_TX) goto out; if (FIELD_GET(USB_SSP_SUBLINK_SPEED_LP, attr) != USB_SSP_SUBLINK_SPEED_LP_SSP) goto out; lse = FIELD_GET(USB_SSP_SUBLINK_SPEED_LSE, attr); mantissa = FIELD_GET(USB_SSP_SUBLINK_SPEED_LSM, attr); /* Convert to Gbps */ for (; lse < USB_SSP_SUBLINK_SPEED_LSE_GBPS; lse++) mantissa /= 1000; if (mantissa >= 10 && lanes == 1) return USB_SSP_GEN_2x1; if (mantissa >= 10 && lanes == 2) return USB_SSP_GEN_2x2; if (mantissa >= 5 && lanes == 2) return USB_SSP_GEN_1x2; goto out; } } out: return USB_SSP_GEN_UNKNOWN; } #ifdef CONFIG_USB_FEW_INIT_RETRIES #define PORT_RESET_TRIES 2 #define SET_ADDRESS_TRIES 1 #define GET_DESCRIPTOR_TRIES 1 #define GET_MAXPACKET0_TRIES 1 #define PORT_INIT_TRIES 4 #else #define PORT_RESET_TRIES 5 #define SET_ADDRESS_TRIES 2 #define GET_DESCRIPTOR_TRIES 2 #define GET_MAXPACKET0_TRIES 3 #define PORT_INIT_TRIES 4 #endif /* CONFIG_USB_FEW_INIT_RETRIES */ #define DETECT_DISCONNECT_TRIES 5 #define HUB_ROOT_RESET_TIME 60 /* times are in msec */ #define HUB_SHORT_RESET_TIME 10 #define HUB_BH_RESET_TIME 50 #define HUB_LONG_RESET_TIME 200 #define HUB_RESET_TIMEOUT 800 static bool use_new_scheme(struct usb_device *udev, int retry, struct usb_port *port_dev) { int old_scheme_first_port = (port_dev->quirks & USB_PORT_QUIRK_OLD_SCHEME) || old_scheme_first; /* * "New scheme" enumeration causes an extra state transition to be * exposed to an xhci host and causes USB3 devices to receive control * commands in the default state. This has been seen to cause * enumeration failures, so disable this enumeration scheme for USB3 * devices. */ if (udev->speed >= USB_SPEED_SUPER) return false; /* * If use_both_schemes is set, use the first scheme (whichever * it is) for the larger half of the retries, then use the other * scheme. Otherwise, use the first scheme for all the retries. */ if (use_both_schemes && retry >= (PORT_INIT_TRIES + 1) / 2) return old_scheme_first_port; /* Second half */ return !old_scheme_first_port; /* First half or all */ } /* Is a USB 3.0 port in the Inactive or Compliance Mode state? 
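* (SS.Inactive is an error state; a port may also end up in Compliance Mode * when link training fails.)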
* Port warm reset is required to recover */ static bool hub_port_warm_reset_required(struct usb_hub *hub, int port1, u16 portstatus) { u16 link_state; if (!hub_is_superspeed(hub->hdev)) return false; if (test_bit(port1, hub->warm_reset_bits)) return true; link_state = portstatus & USB_PORT_STAT_LINK_STATE; return link_state == USB_SS_PORT_LS_SS_INACTIVE || link_state == USB_SS_PORT_LS_COMP_MOD; } static int hub_port_wait_reset(struct usb_hub *hub, int port1, struct usb_device *udev, unsigned int delay, bool warm) { int delay_time, ret; u16 portstatus; u16 portchange; u32 ext_portstatus = 0; for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT; delay_time += delay) { /* wait to give the device a chance to reset */ msleep(delay); /* read and decode port status */ if (hub_is_superspeedplus(hub->hdev)) ret = hub_ext_port_status(hub, port1, HUB_EXT_PORT_STATUS, &portstatus, &portchange, &ext_portstatus); else ret = usb_hub_port_status(hub, port1, &portstatus, &portchange); if (ret < 0) return ret; /* * The port state is unknown until the reset completes. * * On top of that, some chips may require additional time * to re-establish a connection after the reset is complete, * so also wait for the connection to be re-established. */ if (!(portstatus & USB_PORT_STAT_RESET) && (portstatus & USB_PORT_STAT_CONNECTION)) break; /* switch to the long delay after two short delay failures */ if (delay_time >= 2 * HUB_SHORT_RESET_TIME) delay = HUB_LONG_RESET_TIME; dev_dbg(&hub->ports[port1 - 1]->dev, "not %sreset yet, waiting %dms\n", warm ? "warm " : "", delay); } if ((portstatus & USB_PORT_STAT_RESET)) return -EBUSY; if (hub_port_warm_reset_required(hub, port1, portstatus)) return -ENOTCONN; /* Device went away? */ if (!(portstatus & USB_PORT_STAT_CONNECTION)) return -ENOTCONN; /* Retry if connect change is set but status is still connected. * A USB 3.0 connection may bounce if multiple warm resets were issued, * but the device may have successfully re-connected. Ignore it. */ if (!hub_is_superspeed(hub->hdev) && (portchange & USB_PORT_STAT_C_CONNECTION)) { usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_CONNECTION); return -EAGAIN; } if (!(portstatus & USB_PORT_STAT_ENABLE)) return -EBUSY; if (!udev) return 0; if (hub_is_superspeedplus(hub->hdev)) { /* extended portstatus Rx and Tx lane count are zero based */ udev->rx_lanes = USB_EXT_PORT_RX_LANES(ext_portstatus) + 1; udev->tx_lanes = USB_EXT_PORT_TX_LANES(ext_portstatus) + 1; udev->ssp_rate = get_port_ssp_rate(hub->hdev, ext_portstatus); } else { udev->rx_lanes = 1; udev->tx_lanes = 1; udev->ssp_rate = USB_SSP_GEN_UNKNOWN; } if (udev->ssp_rate != USB_SSP_GEN_UNKNOWN) udev->speed = USB_SPEED_SUPER_PLUS; else if (hub_is_superspeed(hub->hdev)) udev->speed = USB_SPEED_SUPER; else if (portstatus & USB_PORT_STAT_HIGH_SPEED) udev->speed = USB_SPEED_HIGH; else if (portstatus & USB_PORT_STAT_LOW_SPEED) udev->speed = USB_SPEED_LOW; else udev->speed = USB_SPEED_FULL; return 0; } /* Handle port reset and port warm(BH) reset (for USB3 protocol ports) */ static int hub_port_reset(struct usb_hub *hub, int port1, struct usb_device *udev, unsigned int delay, bool warm) { int i, status; u16 portchange, portstatus; struct usb_port *port_dev = hub->ports[port1 - 1]; int reset_recovery_time; if (!hub_is_superspeed(hub->hdev)) { if (warm) { dev_err(hub->intfdev, "only USB3 hub support " "warm reset\n"); return -EINVAL; } /* Block EHCI CF initialization during the port reset. * Some companion controllers don't like it when they mix. 
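* (The EHCI Configure Flag hands ports over from the UHCI/OHCI companion * controllers to EHCI, so the rwsem below keeps that handoff from racing * with this reset.)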
*/ down_read(&ehci_cf_port_reset_rwsem); } else if (!warm) { /* * If the caller hasn't explicitly requested a warm reset, * double check and see if one is needed. */ if (usb_hub_port_status(hub, port1, &portstatus, &portchange) == 0) if (hub_port_warm_reset_required(hub, port1, portstatus)) warm = true; } clear_bit(port1, hub->warm_reset_bits); /* Reset the port */ for (i = 0; i < PORT_RESET_TRIES; i++) { status = set_port_feature(hub->hdev, port1, (warm ? USB_PORT_FEAT_BH_PORT_RESET : USB_PORT_FEAT_RESET)); if (status == -ENODEV) { ; /* The hub is gone */ } else if (status) { dev_err(&port_dev->dev, "cannot %sreset (err = %d)\n", warm ? "warm " : "", status); } else { status = hub_port_wait_reset(hub, port1, udev, delay, warm); if (status && status != -ENOTCONN && status != -ENODEV) dev_dbg(hub->intfdev, "port_wait_reset: err = %d\n", status); } /* * Check for disconnect or reset, and bail out after several * reset attempts to avoid warm reset loop. */ if (status == 0 || status == -ENOTCONN || status == -ENODEV || (status == -EBUSY && i == PORT_RESET_TRIES - 1)) { usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_RESET); if (!hub_is_superspeed(hub->hdev)) goto done; usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_BH_PORT_RESET); usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_PORT_LINK_STATE); if (udev) usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_CONNECTION); /* * If a USB 3.0 device migrates from reset to an error * state, re-issue the warm reset. */ if (usb_hub_port_status(hub, port1, &portstatus, &portchange) < 0) goto done; if (!hub_port_warm_reset_required(hub, port1, portstatus)) goto done; /* * If the port is in SS.Inactive or Compliance Mode, the * hot or warm reset failed. Try another warm reset. */ if (!warm) { dev_dbg(&port_dev->dev, "hot reset failed, warm reset\n"); warm = true; } } dev_dbg(&port_dev->dev, "not enabled, trying %sreset again...\n", warm ? "warm " : ""); delay = HUB_LONG_RESET_TIME; } dev_err(&port_dev->dev, "Cannot enable. Maybe the USB cable is bad?\n"); done: if (status == 0) { if (port_dev->quirks & USB_PORT_QUIRK_FAST_ENUM) usleep_range(10000, 12000); else { /* TRSTRCY = 10 ms; plus some extra */ reset_recovery_time = 10 + 40; /* Hub needs extra delay after resetting its port. */ if (hub->hdev->quirks & USB_QUIRK_HUB_SLOW_RESET) reset_recovery_time += 100; msleep(reset_recovery_time); } if (udev) { struct usb_hcd *hcd = bus_to_hcd(udev->bus); update_devnum(udev, 0); /* The xHC may think the device is already reset, * so ignore the status. */ if (hcd->driver->reset_device) hcd->driver->reset_device(hcd, udev); usb_set_device_state(udev, USB_STATE_DEFAULT); } } else { if (udev) usb_set_device_state(udev, USB_STATE_NOTATTACHED); } if (!hub_is_superspeed(hub->hdev)) up_read(&ehci_cf_port_reset_rwsem); return status; } /* * hub_port_stop_enumerate - stop USB enumeration or ignore port events * @hub: target hub * @port1: port num of the port * @retries: port retries number of hub_port_init() * * Return: * true: ignore port actions/events or give up connection attempts. * false: keep original behavior. * * This function will be based on retries to check whether the port which is * marked with early_stop attribute would stop enumeration or ignore events. * * Note: * This function didn't change anything if early_stop is not set, and it will * prevent all connection attempts when early_stop is set and the attempts of * the port are more than 1. 
*/ static bool hub_port_stop_enumerate(struct usb_hub *hub, int port1, int retries) { struct usb_port *port_dev = hub->ports[port1 - 1]; if (port_dev->early_stop) { if (port_dev->ignore_event) return true; /* * We want unsuccessful attempts to fail quickly. * Since some devices may need one failure during * port initialization, we allow two tries but no * more. */ if (retries < 2) return false; port_dev->ignore_event = 1; } else port_dev->ignore_event = 0; return port_dev->ignore_event; } /* Check if a port is power on */ int usb_port_is_power_on(struct usb_hub *hub, unsigned int portstatus) { int ret = 0; if (hub_is_superspeed(hub->hdev)) { if (portstatus & USB_SS_PORT_STAT_POWER) ret = 1; } else { if (portstatus & USB_PORT_STAT_POWER) ret = 1; } return ret; } static void usb_lock_port(struct usb_port *port_dev) __acquires(&port_dev->status_lock) { mutex_lock(&port_dev->status_lock); __acquire(&port_dev->status_lock); } static void usb_unlock_port(struct usb_port *port_dev) __releases(&port_dev->status_lock) { mutex_unlock(&port_dev->status_lock); __release(&port_dev->status_lock); } #ifdef CONFIG_PM /* Check if a port is suspended(USB2.0 port) or in U3 state(USB3.0 port) */ static int port_is_suspended(struct usb_hub *hub, unsigned portstatus) { int ret = 0; if (hub_is_superspeed(hub->hdev)) { if ((portstatus & USB_PORT_STAT_LINK_STATE) == USB_SS_PORT_LS_U3) ret = 1; } else { if (portstatus & USB_PORT_STAT_SUSPEND) ret = 1; } return ret; } /* Determine whether the device on a port is ready for a normal resume, * is ready for a reset-resume, or should be disconnected. */ static int check_port_resume_type(struct usb_device *udev, struct usb_hub *hub, int port1, int status, u16 portchange, u16 portstatus) { struct usb_port *port_dev = hub->ports[port1 - 1]; int retries = 3; retry: /* Is a warm reset needed to recover the connection? */ if (status == 0 && udev->reset_resume && hub_port_warm_reset_required(hub, port1, portstatus)) { /* pass */; } /* Is the device still present? */ else if (status || port_is_suspended(hub, portstatus) || !usb_port_is_power_on(hub, portstatus)) { if (status >= 0) status = -ENODEV; } else if (!(portstatus & USB_PORT_STAT_CONNECTION)) { if (retries--) { usleep_range(200, 300); status = usb_hub_port_status(hub, port1, &portstatus, &portchange); goto retry; } status = -ENODEV; } /* Can't do a normal resume if the port isn't enabled, * so try a reset-resume instead. */ else if (!(portstatus & USB_PORT_STAT_ENABLE) && !udev->reset_resume) { if (udev->persist_enabled) udev->reset_resume = 1; else status = -ENODEV; } if (status) { dev_dbg(&port_dev->dev, "status %04x.%04x after resume, %d\n", portchange, portstatus, status); } else if (udev->reset_resume) { /* Late port handoff can set status-change bits */ if (portchange & USB_PORT_STAT_C_CONNECTION) usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_CONNECTION); if (portchange & USB_PORT_STAT_C_ENABLE) usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_ENABLE); /* * Whatever made this reset-resume necessary may have * turned on the port1 bit in hub->change_bits. But after * a successful reset-resume we want the bit to be clear; * if it was on it would indicate that something happened * following the reset-resume. */ clear_bit(port1, hub->change_bits); } return status; } int usb_disable_ltm(struct usb_device *udev) { struct usb_hcd *hcd = bus_to_hcd(udev->bus); /* Check if the roothub and device supports LTM. 
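* (LTM, Latency Tolerance Messaging, is a USB 3.0 power-management feature * that lets a device report how much latency it can tolerate.)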
*/ if (!usb_device_supports_ltm(hcd->self.root_hub) || !usb_device_supports_ltm(udev)) return 0; /* Clear Feature LTM Enable can only be sent if the device is * configured. */ if (!udev->actconfig) return 0; return usb_control_msg(udev, usb_sndctrlpipe(udev, 0), USB_REQ_CLEAR_FEATURE, USB_RECIP_DEVICE, USB_DEVICE_LTM_ENABLE, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); } EXPORT_SYMBOL_GPL(usb_disable_ltm); void usb_enable_ltm(struct usb_device *udev) { struct usb_hcd *hcd = bus_to_hcd(udev->bus); /* Check if the roothub and device supports LTM. */ if (!usb_device_supports_ltm(hcd->self.root_hub) || !usb_device_supports_ltm(udev)) return; /* Set Feature LTM Enable can only be sent if the device is * configured. */ if (!udev->actconfig) return; usb_control_msg(udev, usb_sndctrlpipe(udev, 0), USB_REQ_SET_FEATURE, USB_RECIP_DEVICE, USB_DEVICE_LTM_ENABLE, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); } EXPORT_SYMBOL_GPL(usb_enable_ltm); /* * usb_enable_remote_wakeup - enable remote wakeup for a device * @udev: target device * * For USB-2 devices: Set the device's remote wakeup feature. * * For USB-3 devices: Assume there's only one function on the device and * enable remote wake for the first interface. FIXME if the interface * association descriptor shows there's more than one function. */ static int usb_enable_remote_wakeup(struct usb_device *udev) { if (udev->speed < USB_SPEED_SUPER) return usb_control_msg(udev, usb_sndctrlpipe(udev, 0), USB_REQ_SET_FEATURE, USB_RECIP_DEVICE, USB_DEVICE_REMOTE_WAKEUP, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); else return usb_control_msg(udev, usb_sndctrlpipe(udev, 0), USB_REQ_SET_FEATURE, USB_RECIP_INTERFACE, USB_INTRF_FUNC_SUSPEND, USB_INTRF_FUNC_SUSPEND_RW | USB_INTRF_FUNC_SUSPEND_LP, NULL, 0, USB_CTRL_SET_TIMEOUT); } /* * usb_disable_remote_wakeup - disable remote wakeup for a device * @udev: target device * * For USB-2 devices: Clear the device's remote wakeup feature. * * For USB-3 devices: Assume there's only one function on the device and * disable remote wake for the first interface. FIXME if the interface * association descriptor shows there's more than one function. */ static int usb_disable_remote_wakeup(struct usb_device *udev) { if (udev->speed < USB_SPEED_SUPER) return usb_control_msg(udev, usb_sndctrlpipe(udev, 0), USB_REQ_CLEAR_FEATURE, USB_RECIP_DEVICE, USB_DEVICE_REMOTE_WAKEUP, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); else return usb_control_msg(udev, usb_sndctrlpipe(udev, 0), USB_REQ_SET_FEATURE, USB_RECIP_INTERFACE, USB_INTRF_FUNC_SUSPEND, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); } /* Count of wakeup-enabled devices at or below udev */ unsigned usb_wakeup_enabled_descendants(struct usb_device *udev) { struct usb_hub *hub = usb_hub_to_struct_hub(udev); return udev->do_remote_wakeup + (hub ? hub->wakeup_enabled_descendants : 0); } EXPORT_SYMBOL_GPL(usb_wakeup_enabled_descendants); /* * usb_port_suspend - suspend a usb device's upstream port * @udev: device that's no longer in active use, not a root hub * Context: must be able to sleep; device not locked; pm locks held * * Suspends a USB device that isn't in active use, conserving power. * Devices may wake out of a suspend, if anything important happens, * using the remote wakeup mechanism. They may also be taken out of * suspend by the host, using usb_port_resume(). It's also routine * to disconnect devices while they are suspended. * * This only affects the USB hardware for a device; its interfaces * (and, for hubs, child devices) must already have been suspended. 
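* * For USB-3 devices the upstream port is put into the U3 link state with a * SetPortFeature(PORT_LINK_STATE) request, while USB-2 ports use the * PORT_SUSPEND feature (see 7.1.7.6 and the code below).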
* * Selective port suspend reduces power; most suspended devices draw * less than 500 uA. It's also used in OTG, along with remote wakeup. * All devices below the suspended port are also suspended. * * Devices leave suspend state when the host wakes them up. Some devices * also support "remote wakeup", where the device can activate the USB * tree above them to deliver data, such as a keypress or packet. In * some cases, this wakes the USB host. * * Suspending OTG devices may trigger HNP, if that's been enabled * between a pair of dual-role devices. That will change roles, such * as from A-Host to A-Peripheral or from B-Host back to B-Peripheral. * * Devices on USB hub ports have only one "suspend" state, corresponding * to ACPI D2, "may cause the device to lose some context". * State transitions include: * * - suspend, resume ... when the VBUS power link stays live * - suspend, disconnect ... VBUS lost * * Once VBUS drop breaks the circuit, the port it's using has to go through * normal re-enumeration procedures, starting with enabling VBUS power. * Other than re-initializing the hub (plug/unplug, except for root hubs), * Linux (2.6) currently has NO mechanisms to initiate that: no hub_wq * timer, no SRP, no requests through sysfs. * * If Runtime PM isn't enabled or used, non-SuperSpeed devices may not get * suspended until their bus goes into global suspend (i.e., the root * hub is suspended). Nevertheless, we change @udev->state to * USB_STATE_SUSPENDED as this is the device's "logical" state. The actual * upstream port setting is stored in @udev->port_is_suspended. * * Returns 0 on success, else negative errno. */ int usb_port_suspend(struct usb_device *udev, pm_message_t msg) { struct usb_hub *hub = usb_hub_to_struct_hub(udev->parent); struct usb_port *port_dev = hub->ports[udev->portnum - 1]; int port1 = udev->portnum; int status; bool really_suspend = true; usb_lock_port(port_dev); /* enable remote wakeup when appropriate; this lets the device * wake up the upstream hub (including maybe the root hub). * * NOTE: OTG devices may issue remote wakeup (or SRP) even when * we don't explicitly enable it here. */ if (udev->do_remote_wakeup) { status = usb_enable_remote_wakeup(udev); if (status) { dev_dbg(&udev->dev, "won't remote wakeup, status %d\n", status); /* bail if autosuspend is requested */ if (PMSG_IS_AUTO(msg)) goto err_wakeup; } } /* disable USB2 hardware LPM */ usb_disable_usb2_hardware_lpm(udev); if (usb_disable_ltm(udev)) { dev_err(&udev->dev, "Failed to disable LTM before suspend\n"); status = -ENOMEM; if (PMSG_IS_AUTO(msg)) goto err_ltm; } /* see 7.1.7.6 */ if (hub_is_superspeed(hub->hdev)) status = hub_set_port_link_state(hub, port1, USB_SS_PORT_LS_U3); /* * For system suspend, we do not need to enable the suspend feature * on individual USB-2 ports. The devices will automatically go * into suspend a few ms after the root hub stops sending packets. * The USB 2.0 spec calls this "global suspend". * * However, many USB hubs have a bug: They don't relay wakeup requests * from a downstream port if the port's suspend feature isn't on. * Therefore we will turn on the suspend feature if udev or any of its * descendants is enabled for remote wakeup. */ else if (PMSG_IS_AUTO(msg) || usb_wakeup_enabled_descendants(udev) > 0) status = set_port_feature(hub->hdev, port1, USB_PORT_FEAT_SUSPEND); else { really_suspend = false; status = 0; } if (status) { /* Check if the port has been suspended for the timeout case * to prevent the suspended port from incorrect handling. 
*/ if (status == -ETIMEDOUT) { int ret; u16 portstatus, portchange; portstatus = portchange = 0; ret = usb_hub_port_status(hub, port1, &portstatus, &portchange); dev_dbg(&port_dev->dev, "suspend timeout, status %04x\n", portstatus); if (ret == 0 && port_is_suspended(hub, portstatus)) { status = 0; goto suspend_done; } } dev_dbg(&port_dev->dev, "can't suspend, status %d\n", status); /* Try to enable USB3 LTM again */ usb_enable_ltm(udev); err_ltm: /* Try to enable USB2 hardware LPM again */ usb_enable_usb2_hardware_lpm(udev); if (udev->do_remote_wakeup) (void) usb_disable_remote_wakeup(udev); err_wakeup: /* System sleep transitions should never fail */ if (!PMSG_IS_AUTO(msg)) status = 0; } else { suspend_done: dev_dbg(&udev->dev, "usb %ssuspend, wakeup %d\n", (PMSG_IS_AUTO(msg) ? "auto-" : ""), udev->do_remote_wakeup); if (really_suspend) { udev->port_is_suspended = 1; /* device has up to 10 msec to fully suspend */ msleep(10); } usb_set_device_state(udev, USB_STATE_SUSPENDED); } if (status == 0 && !udev->do_remote_wakeup && udev->persist_enabled && test_and_clear_bit(port1, hub->child_usage_bits)) pm_runtime_put_sync(&port_dev->dev); usb_mark_last_busy(hub->hdev); usb_unlock_port(port_dev); return status; } /* * If the USB "suspend" state is in use (rather than "global suspend"), * many devices will be individually taken out of suspend state using * special "resume" signaling. This routine kicks in shortly after * hardware resume signaling is finished, either because of selective * resume (by host) or remote wakeup (by device) ... now see what changed * in the tree that's rooted at this device. * * If @udev->reset_resume is set then the device is reset before the * status check is done. */ static int finish_port_resume(struct usb_device *udev) { int status = 0; u16 devstatus = 0; /* caller owns the udev device lock */ dev_dbg(&udev->dev, "%s\n", udev->reset_resume ? "finish reset-resume" : "finish resume"); /* usb ch9 identifies four variants of SUSPENDED, based on what * state the device resumes to. Linux currently won't see the * first two on the host side; they'd be inside hub_port_init() * during many timeouts, but hub_wq can't suspend until later. */ usb_set_device_state(udev, udev->actconfig ? USB_STATE_CONFIGURED : USB_STATE_ADDRESS); /* 10.5.4.5 says not to reset a suspended port if the attached * device is enabled for remote wakeup. Hence the reset * operation is carried out here, after the port has been * resumed. */ if (udev->reset_resume) { /* * If the device morphs or switches modes when it is reset, * we don't want to perform a reset-resume. We'll fail the * resume, which will cause a logical disconnect, and then * the device will be rediscovered. */ retry_reset_resume: if (udev->quirks & USB_QUIRK_RESET) status = -ENODEV; else status = usb_reset_and_verify_device(udev); } /* 10.5.4.5 says be sure devices in the tree are still there. * For now let's assume the device didn't go crazy on resume, * and device drivers will know about any resume quirks. */ if (status == 0) { devstatus = 0; status = usb_get_std_status(udev, USB_RECIP_DEVICE, 0, &devstatus); /* If a normal resume failed, try doing a reset-resume */ if (status && !udev->reset_resume && udev->persist_enabled) { dev_dbg(&udev->dev, "retry with reset-resume\n"); udev->reset_resume = 1; goto retry_reset_resume; } } if (status) { dev_dbg(&udev->dev, "gone after usb resume? 
status %d\n", status); /* * There are a few quirky devices which violate the standard * by claiming to have remote wakeup enabled after a reset, * which crash if the feature is cleared, hence check for * udev->reset_resume */ } else if (udev->actconfig && !udev->reset_resume) { if (udev->speed < USB_SPEED_SUPER) { if (devstatus & (1 << USB_DEVICE_REMOTE_WAKEUP)) status = usb_disable_remote_wakeup(udev); } else { status = usb_get_std_status(udev, USB_RECIP_INTERFACE, 0, &devstatus); if (!status && devstatus & (USB_INTRF_STAT_FUNC_RW_CAP | USB_INTRF_STAT_FUNC_RW)) status = usb_disable_remote_wakeup(udev); } if (status) dev_dbg(&udev->dev, "disable remote wakeup, status %d\n", status); status = 0; } return status; } /* * There are some SS USB devices which take longer time for link training. * XHCI specs 4.19.4 says that when Link training is successful, port * sets CCS bit to 1. So if SW reads port status before successful link * training, then it will not find device to be present. * USB Analyzer log with such buggy devices show that in some cases * device switch on the RX termination after long delay of host enabling * the VBUS. In few other cases it has been seen that device fails to * negotiate link training in first attempt. It has been * reported till now that few devices take as long as 2000 ms to train * the link after host enabling its VBUS and termination. Following * routine implements a 2000 ms timeout for link training. If in a case * link trains before timeout, loop will exit earlier. * * There are also some 2.0 hard drive based devices and 3.0 thumb * drives that, when plugged into a 2.0 only port, take a long * time to set CCS after VBUS enable. * * FIXME: If a device was connected before suspend, but was removed * while system was asleep, then the loop in the following routine will * only exit at timeout. * * This routine should only be called when persist is enabled. */ static int wait_for_connected(struct usb_device *udev, struct usb_hub *hub, int port1, u16 *portchange, u16 *portstatus) { int status = 0, delay_ms = 0; while (delay_ms < 2000) { if (status || *portstatus & USB_PORT_STAT_CONNECTION) break; if (!usb_port_is_power_on(hub, *portstatus)) { status = -ENODEV; break; } msleep(20); delay_ms += 20; status = usb_hub_port_status(hub, port1, portstatus, portchange); } dev_dbg(&udev->dev, "Waited %dms for CONNECT\n", delay_ms); return status; } /* * usb_port_resume - re-activate a suspended usb device's upstream port * @udev: device to re-activate, not a root hub * Context: must be able to sleep; device not locked; pm locks held * * This will re-activate the suspended device, increasing power usage * while letting drivers communicate again with its endpoints. * USB resume explicitly guarantees that the power session between * the host and the device is the same as it was when the device * suspended. * * If @udev->reset_resume is set then this routine won't check that the * port is still enabled. Furthermore, finish_port_resume() above will * reset @udev. The end result is that a broken power session can be * recovered and @udev will appear to persist across a loss of VBUS power. * * For example, if a host controller doesn't maintain VBUS suspend current * during a system sleep or is reset when the system wakes up, all the USB * power sessions below it will be broken. This is especially troublesome * for mass-storage devices containing mounted filesystems, since the * device will appear to have disconnected and all the memory mappings * to it will be lost. 
Using the USB_PERSIST facility, the device can be * made to appear as if it had not disconnected. * * This facility can be dangerous. Although usb_reset_and_verify_device() makes * every effort to insure that the same device is present after the * reset as before, it cannot provide a 100% guarantee. Furthermore it's * quite possible for a device to remain unaltered but its media to be * changed. If the user replaces a flash memory card while the system is * asleep, he will have only himself to blame when the filesystem on the * new card is corrupted and the system crashes. * * Returns 0 on success, else negative errno. */ int usb_port_resume(struct usb_device *udev, pm_message_t msg) { struct usb_hub *hub = usb_hub_to_struct_hub(udev->parent); struct usb_port *port_dev = hub->ports[udev->portnum - 1]; int port1 = udev->portnum; int status; u16 portchange, portstatus; if (!test_and_set_bit(port1, hub->child_usage_bits)) { status = pm_runtime_resume_and_get(&port_dev->dev); if (status < 0) { dev_dbg(&udev->dev, "can't resume usb port, status %d\n", status); return status; } } usb_lock_port(port_dev); /* Skip the initial Clear-Suspend step for a remote wakeup */ status = usb_hub_port_status(hub, port1, &portstatus, &portchange); if (status == 0 && !port_is_suspended(hub, portstatus)) { if (portchange & USB_PORT_STAT_C_SUSPEND) pm_wakeup_event(&udev->dev, 0); goto SuspendCleared; } /* see 7.1.7.7; affects power usage, but not budgeting */ if (hub_is_superspeed(hub->hdev)) status = hub_set_port_link_state(hub, port1, USB_SS_PORT_LS_U0); else status = usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_SUSPEND); if (status) { dev_dbg(&port_dev->dev, "can't resume, status %d\n", status); } else { /* drive resume for USB_RESUME_TIMEOUT msec */ dev_dbg(&udev->dev, "usb %sresume\n", (PMSG_IS_AUTO(msg) ? "auto-" : "")); msleep(USB_RESUME_TIMEOUT); /* Virtual root hubs can trigger on GET_PORT_STATUS to * stop resume signaling. Then finish the resume * sequence. */ status = usb_hub_port_status(hub, port1, &portstatus, &portchange); } SuspendCleared: if (status == 0) { udev->port_is_suspended = 0; if (hub_is_superspeed(hub->hdev)) { if (portchange & USB_PORT_STAT_C_LINK_STATE) usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_PORT_LINK_STATE); } else { if (portchange & USB_PORT_STAT_C_SUSPEND) usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_SUSPEND); } /* TRSMRCY = 10 msec */ msleep(10); } if (udev->persist_enabled) status = wait_for_connected(udev, hub, port1, &portchange, &portstatus); status = check_port_resume_type(udev, hub, port1, status, portchange, portstatus); if (status == 0) status = finish_port_resume(udev); if (status < 0) { dev_dbg(&udev->dev, "can't resume, status %d\n", status); hub_port_logical_disconnect(hub, port1); } else { /* Try to enable USB2 hardware LPM */ usb_enable_usb2_hardware_lpm(udev); /* Try to enable USB3 LTM */ usb_enable_ltm(udev); } usb_unlock_port(port_dev); return status; } int usb_remote_wakeup(struct usb_device *udev) { int status = 0; usb_lock_device(udev); if (udev->state == USB_STATE_SUSPENDED) { dev_dbg(&udev->dev, "usb %sresume\n", "wakeup-"); status = usb_autoresume_device(udev); if (status == 0) { /* Let the drivers do their thing, then... */ usb_autosuspend_device(udev); } } usb_unlock_device(udev); return status; } /* Returns 1 if there was a remote wakeup and a connect status change. 
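 *
 * A non-zero return tells the caller (port_event()) to run its
 * connect-change handling, so a wakeup that fails to resume the device is
 * treated like a disconnect and the port is re-enumerated.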
*/ static int hub_handle_remote_wakeup(struct usb_hub *hub, unsigned int port, u16 portstatus, u16 portchange) __must_hold(&port_dev->status_lock) { struct usb_port *port_dev = hub->ports[port - 1]; struct usb_device *hdev; struct usb_device *udev; int connect_change = 0; u16 link_state; int ret; hdev = hub->hdev; udev = port_dev->child; if (!hub_is_superspeed(hdev)) { if (!(portchange & USB_PORT_STAT_C_SUSPEND)) return 0; usb_clear_port_feature(hdev, port, USB_PORT_FEAT_C_SUSPEND); } else { link_state = portstatus & USB_PORT_STAT_LINK_STATE; if (!udev || udev->state != USB_STATE_SUSPENDED || (link_state != USB_SS_PORT_LS_U0 && link_state != USB_SS_PORT_LS_U1 && link_state != USB_SS_PORT_LS_U2)) return 0; } if (udev) { /* TRSMRCY = 10 msec */ msleep(10); usb_unlock_port(port_dev); ret = usb_remote_wakeup(udev); usb_lock_port(port_dev); if (ret < 0) connect_change = 1; } else { ret = -ENODEV; hub_port_disable(hub, port, 1); } dev_dbg(&port_dev->dev, "resume, status %d\n", ret); return connect_change; } static int check_ports_changed(struct usb_hub *hub) { int port1; for (port1 = 1; port1 <= hub->hdev->maxchild; ++port1) { u16 portstatus, portchange; int status; status = usb_hub_port_status(hub, port1, &portstatus, &portchange); if (!status && portchange) return 1; } return 0; } static int hub_suspend(struct usb_interface *intf, pm_message_t msg) { struct usb_hub *hub = usb_get_intfdata(intf); struct usb_device *hdev = hub->hdev; unsigned port1; /* * Warn if children aren't already suspended. * Also, add up the number of wakeup-enabled descendants. */ hub->wakeup_enabled_descendants = 0; for (port1 = 1; port1 <= hdev->maxchild; port1++) { struct usb_port *port_dev = hub->ports[port1 - 1]; struct usb_device *udev = port_dev->child; if (udev && udev->can_submit) { dev_warn(&port_dev->dev, "device %s not suspended yet\n", dev_name(&udev->dev)); if (PMSG_IS_AUTO(msg)) return -EBUSY; } if (udev) hub->wakeup_enabled_descendants += usb_wakeup_enabled_descendants(udev); } if (hdev->do_remote_wakeup && hub->quirk_check_port_auto_suspend) { /* check if there are changes pending on hub ports */ if (check_ports_changed(hub)) { if (PMSG_IS_AUTO(msg)) return -EBUSY; pm_wakeup_event(&hdev->dev, 2000); } } if (hub_is_superspeed(hdev) && hdev->do_remote_wakeup) { /* Enable hub to send remote wakeup for all ports. */ for (port1 = 1; port1 <= hdev->maxchild; port1++) { set_port_feature(hdev, port1 | USB_PORT_FEAT_REMOTE_WAKE_CONNECT | USB_PORT_FEAT_REMOTE_WAKE_DISCONNECT | USB_PORT_FEAT_REMOTE_WAKE_OVER_CURRENT, USB_PORT_FEAT_REMOTE_WAKE_MASK); } } dev_dbg(&intf->dev, "%s\n", __func__); /* stop hub_wq and related activity */ hub_quiesce(hub, HUB_SUSPEND); return 0; } /* Report wakeup requests from the ports of a resuming root hub */ static void report_wakeup_requests(struct usb_hub *hub) { struct usb_device *hdev = hub->hdev; struct usb_device *udev; struct usb_hcd *hcd; unsigned long resuming_ports; int i; if (hdev->parent) return; /* Not a root hub */ hcd = bus_to_hcd(hdev->bus); if (hcd->driver->get_resuming_ports) { /* * The get_resuming_ports() method returns a bitmap (origin 0) * of ports which have started wakeup signaling but have not * yet finished resuming. During system resume we will * resume all the enabled ports, regardless of any wakeup * signals, which means the wakeup requests would be lost. * To prevent this, report them to the PM core here. 
*/ resuming_ports = hcd->driver->get_resuming_ports(hcd); for (i = 0; i < hdev->maxchild; ++i) { if (test_bit(i, &resuming_ports)) { udev = hub->ports[i]->child; if (udev) pm_wakeup_event(&udev->dev, 0); } } } } static int hub_resume(struct usb_interface *intf) { struct usb_hub *hub = usb_get_intfdata(intf); dev_dbg(&intf->dev, "%s\n", __func__); hub_activate(hub, HUB_RESUME); /* * This should be called only for system resume, not runtime resume. * We can't tell the difference here, so some wakeup requests will be * reported at the wrong time or more than once. This shouldn't * matter much, so long as they do get reported. */ report_wakeup_requests(hub); return 0; } static int hub_reset_resume(struct usb_interface *intf) { struct usb_hub *hub = usb_get_intfdata(intf); dev_dbg(&intf->dev, "%s\n", __func__); hub_activate(hub, HUB_RESET_RESUME); return 0; } /** * usb_root_hub_lost_power - called by HCD if the root hub lost Vbus power * @rhdev: struct usb_device for the root hub * * The USB host controller driver calls this function when its root hub * is resumed and Vbus power has been interrupted or the controller * has been reset. The routine marks @rhdev as having lost power. * When the hub driver is resumed it will take notice and carry out * power-session recovery for all the "USB-PERSIST"-enabled child devices; * the others will be disconnected. */ void usb_root_hub_lost_power(struct usb_device *rhdev) { dev_notice(&rhdev->dev, "root hub lost power or was reset\n"); rhdev->reset_resume = 1; } EXPORT_SYMBOL_GPL(usb_root_hub_lost_power); static const char * const usb3_lpm_names[] = { "U0", "U1", "U2", "U3", }; /* * Send a Set SEL control transfer to the device, prior to enabling * device-initiated U1 or U2. This lets the device know the exit latencies from * the time the device initiates a U1 or U2 exit, to the time it will receive a * packet from the host. * * This function will fail if the SEL or PEL values for udev are greater than * the maximum allowed values for the link state to be enabled. */ static int usb_req_set_sel(struct usb_device *udev) { struct usb_set_sel_req *sel_values; unsigned long long u1_sel; unsigned long long u1_pel; unsigned long long u2_sel; unsigned long long u2_pel; int ret; if (!udev->parent || udev->speed < USB_SPEED_SUPER || !udev->lpm_capable) return 0; /* Convert SEL and PEL stored in ns to us */ u1_sel = DIV_ROUND_UP(udev->u1_params.sel, 1000); u1_pel = DIV_ROUND_UP(udev->u1_params.pel, 1000); u2_sel = DIV_ROUND_UP(udev->u2_params.sel, 1000); u2_pel = DIV_ROUND_UP(udev->u2_params.pel, 1000); /* * Make sure that the calculated SEL and PEL values for the link * state we're enabling aren't bigger than the max SEL/PEL * value that will fit in the SET SEL control transfer. * Otherwise the device would get an incorrect idea of the exit * latency for the link state, and could start a device-initiated * U1/U2 when the exit latencies are too high. */ if (u1_sel > USB3_LPM_MAX_U1_SEL_PEL || u1_pel > USB3_LPM_MAX_U1_SEL_PEL || u2_sel > USB3_LPM_MAX_U2_SEL_PEL || u2_pel > USB3_LPM_MAX_U2_SEL_PEL) { dev_dbg(&udev->dev, "Device-initiated U1/U2 disabled due to long SEL or PEL\n"); return -EINVAL; } /* * usb_enable_lpm() can be called as part of a failed device reset, * which may be initiated by an error path of a mass storage driver. * Therefore, use GFP_NOIO. 
	 */
	sel_values = kmalloc(sizeof(*sel_values), GFP_NOIO);
	if (!sel_values)
		return -ENOMEM;

	sel_values->u1_sel = u1_sel;
	sel_values->u1_pel = u1_pel;
	sel_values->u2_sel = cpu_to_le16(u2_sel);
	sel_values->u2_pel = cpu_to_le16(u2_pel);

	ret = usb_control_msg(udev, usb_sndctrlpipe(udev, 0),
			USB_REQ_SET_SEL,
			USB_RECIP_DEVICE,
			0, 0,
			sel_values, sizeof(*sel_values),
			USB_CTRL_SET_TIMEOUT);
	kfree(sel_values);

	if (ret > 0)
		udev->lpm_devinit_allow = 1;

	return ret;
}

/*
 * Enable or disable device-initiated U1 or U2 transitions.
 */
static int usb_set_device_initiated_lpm(struct usb_device *udev,
		enum usb3_link_state state, bool enable)
{
	int ret;
	int feature;

	switch (state) {
	case USB3_LPM_U1:
		feature = USB_DEVICE_U1_ENABLE;
		break;
	case USB3_LPM_U2:
		feature = USB_DEVICE_U2_ENABLE;
		break;
	default:
		dev_warn(&udev->dev, "%s: Can't %s non-U1 or U2 state.\n",
				__func__, str_enable_disable(enable));
		return -EINVAL;
	}

	if (udev->state != USB_STATE_CONFIGURED) {
		dev_dbg(&udev->dev, "%s: Can't %s %s state "
				"for unconfigured device.\n",
				__func__, str_enable_disable(enable),
				usb3_lpm_names[state]);
		return -EINVAL;
	}

	if (enable) {
		/*
		 * Now send the control transfer to enable device-initiated LPM
		 * for either U1 or U2.
		 */
		ret = usb_control_msg(udev, usb_sndctrlpipe(udev, 0),
				USB_REQ_SET_FEATURE,
				USB_RECIP_DEVICE,
				feature,
				0, NULL, 0,
				USB_CTRL_SET_TIMEOUT);
	} else {
		ret = usb_control_msg(udev, usb_sndctrlpipe(udev, 0),
				USB_REQ_CLEAR_FEATURE,
				USB_RECIP_DEVICE,
				feature,
				0, NULL, 0,
				USB_CTRL_SET_TIMEOUT);
	}
	if (ret < 0) {
		dev_warn(&udev->dev, "%s of device-initiated %s failed.\n",
				str_enable_disable(enable),
				usb3_lpm_names[state]);
		return -EBUSY;
	}
	return 0;
}

static int usb_set_lpm_timeout(struct usb_device *udev,
		enum usb3_link_state state, int timeout)
{
	int ret;
	int feature;

	switch (state) {
	case USB3_LPM_U1:
		feature = USB_PORT_FEAT_U1_TIMEOUT;
		break;
	case USB3_LPM_U2:
		feature = USB_PORT_FEAT_U2_TIMEOUT;
		break;
	default:
		dev_warn(&udev->dev, "%s: Can't set timeout for non-U1 or U2 state.\n",
				__func__);
		return -EINVAL;
	}

	if (state == USB3_LPM_U1 && timeout > USB3_LPM_U1_MAX_TIMEOUT &&
			timeout != USB3_LPM_DEVICE_INITIATED) {
		dev_warn(&udev->dev, "Failed to set %s timeout to 0x%x, "
				"which is a reserved value.\n",
				usb3_lpm_names[state], timeout);
		return -EINVAL;
	}

	ret = set_port_feature(udev->parent,
			USB_PORT_LPM_TIMEOUT(timeout) | udev->portnum,
			feature);
	if (ret < 0) {
		dev_warn(&udev->dev, "Failed to set %s timeout to 0x%x, "
				"error code %i\n",
				usb3_lpm_names[state], timeout, ret);
		return -EBUSY;
	}

	if (state == USB3_LPM_U1)
		udev->u1_params.timeout = timeout;
	else
		udev->u2_params.timeout = timeout;
	return 0;
}

/*
 * Don't allow device-initiated U1/U2 if the device isn't in the configured
 * state, or if the system exit latency + one bus interval is greater than
 * the minimum service interval of any active periodic endpoint.
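 * For example (illustrative numbers): an interrupt endpoint with
 * bInterval = 4 has a service interval of (1 << (4 - 1)) * 125 = 1000 us,
 * so device-initiated LPM stays allowed only while SEL + 125 us <= 1000 us.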
See USB 3.2 section 9.4.9 */ static bool usb_device_may_initiate_lpm(struct usb_device *udev, enum usb3_link_state state) { unsigned int sel; /* us */ int i, j; if (!udev->lpm_devinit_allow || !udev->actconfig) return false; if (state == USB3_LPM_U1) sel = DIV_ROUND_UP(udev->u1_params.sel, 1000); else if (state == USB3_LPM_U2) sel = DIV_ROUND_UP(udev->u2_params.sel, 1000); else return false; for (i = 0; i < udev->actconfig->desc.bNumInterfaces; i++) { struct usb_interface *intf; struct usb_endpoint_descriptor *desc; unsigned int interval; intf = udev->actconfig->interface[i]; if (!intf) continue; for (j = 0; j < intf->cur_altsetting->desc.bNumEndpoints; j++) { desc = &intf->cur_altsetting->endpoint[j].desc; if (usb_endpoint_xfer_int(desc) || usb_endpoint_xfer_isoc(desc)) { interval = (1 << (desc->bInterval - 1)) * 125; if (sel + 125 > interval) return false; } } } return true; } /* * Enable the hub-initiated U1/U2 idle timeouts, and enable device-initiated * U1/U2 entry. * * We will attempt to enable U1 or U2, but there are no guarantees that the * control transfers to set the hub timeout or enable device-initiated U1/U2 * will be successful. * * If the control transfer to enable device-initiated U1/U2 entry fails, then * hub-initiated U1/U2 will be disabled. * * If we cannot set the parent hub U1/U2 timeout, we attempt to let the xHCI * driver know about it. If that call fails, it should be harmless, and just * take up more slightly more bus bandwidth for unnecessary U1/U2 exit latency. */ static int usb_enable_link_state(struct usb_hcd *hcd, struct usb_device *udev, enum usb3_link_state state) { int timeout; __u8 u1_mel; __le16 u2_mel; /* Skip if the device BOS descriptor couldn't be read */ if (!udev->bos) return -EINVAL; u1_mel = udev->bos->ss_cap->bU1devExitLat; u2_mel = udev->bos->ss_cap->bU2DevExitLat; /* If the device says it doesn't have *any* exit latency to come out of * U1 or U2, it's probably lying. Assume it doesn't implement that link * state. */ if ((state == USB3_LPM_U1 && u1_mel == 0) || (state == USB3_LPM_U2 && u2_mel == 0)) return -EINVAL; /* We allow the host controller to set the U1/U2 timeout internally * first, so that it can change its schedule to account for the * additional latency to send data to a device in a lower power * link state. */ timeout = hcd->driver->enable_usb3_lpm_timeout(hcd, udev, state); /* xHCI host controller doesn't want to enable this LPM state. */ if (timeout == 0) return -EINVAL; if (timeout < 0) { dev_warn(&udev->dev, "Could not enable %s link state, " "xHCI error %i.\n", usb3_lpm_names[state], timeout); return timeout; } if (usb_set_lpm_timeout(udev, state, timeout)) { /* If we can't set the parent hub U1/U2 timeout, * device-initiated LPM won't be allowed either, so let the xHCI * host know that this link state won't be enabled. */ hcd->driver->disable_usb3_lpm_timeout(hcd, udev, state); return -EBUSY; } if (state == USB3_LPM_U1) udev->usb3_lpm_u1_enabled = 1; else if (state == USB3_LPM_U2) udev->usb3_lpm_u2_enabled = 1; return 0; } /* * Disable the hub-initiated U1/U2 idle timeouts, and disable device-initiated * U1/U2 entry. * * If this function returns -EBUSY, the parent hub will still allow U1/U2 entry. * If zero is returned, the parent will not allow the link to go into U1/U2. * * If zero is returned, device-initiated U1/U2 entry may still be enabled, but * it won't have an effect on the bus link state because the parent hub will * still disallow device-initiated U1/U2 entry. 
* * If zero is returned, the xHCI host controller may still think U1/U2 entry is * possible. The result will be slightly more bus bandwidth will be taken up * (to account for U1/U2 exit latency), but it should be harmless. */ static int usb_disable_link_state(struct usb_hcd *hcd, struct usb_device *udev, enum usb3_link_state state) { switch (state) { case USB3_LPM_U1: case USB3_LPM_U2: break; default: dev_warn(&udev->dev, "%s: Can't disable non-U1 or U2 state.\n", __func__); return -EINVAL; } if (usb_set_lpm_timeout(udev, state, 0)) return -EBUSY; if (hcd->driver->disable_usb3_lpm_timeout(hcd, udev, state)) dev_warn(&udev->dev, "Could not disable xHCI %s timeout, " "bus schedule bandwidth may be impacted.\n", usb3_lpm_names[state]); /* As soon as usb_set_lpm_timeout(0) return 0, hub initiated LPM * is disabled. Hub will disallows link to enter U1/U2 as well, * even device is initiating LPM. Hence LPM is disabled if hub LPM * timeout set to 0, no matter device-initiated LPM is disabled or * not. */ if (state == USB3_LPM_U1) udev->usb3_lpm_u1_enabled = 0; else if (state == USB3_LPM_U2) udev->usb3_lpm_u2_enabled = 0; return 0; } /* * Disable hub-initiated and device-initiated U1 and U2 entry. * Caller must own the bandwidth_mutex. * * This will call usb_enable_lpm() on failure, which will decrement * lpm_disable_count, and will re-enable LPM if lpm_disable_count reaches zero. */ int usb_disable_lpm(struct usb_device *udev) { struct usb_hcd *hcd; int err; if (!udev || !udev->parent || udev->speed < USB_SPEED_SUPER || !udev->lpm_capable || udev->state < USB_STATE_CONFIGURED) return 0; hcd = bus_to_hcd(udev->bus); if (!hcd || !hcd->driver->disable_usb3_lpm_timeout) return 0; udev->lpm_disable_count++; if ((udev->u1_params.timeout == 0 && udev->u2_params.timeout == 0)) return 0; /* If LPM is enabled, attempt to disable it. */ if (usb_disable_link_state(hcd, udev, USB3_LPM_U1)) goto disable_failed; if (usb_disable_link_state(hcd, udev, USB3_LPM_U2)) goto disable_failed; err = usb_set_device_initiated_lpm(udev, USB3_LPM_U1, false); if (!err) usb_set_device_initiated_lpm(udev, USB3_LPM_U2, false); return 0; disable_failed: udev->lpm_disable_count--; return -EBUSY; } EXPORT_SYMBOL_GPL(usb_disable_lpm); /* Grab the bandwidth_mutex before calling usb_disable_lpm() */ int usb_unlocked_disable_lpm(struct usb_device *udev) { struct usb_hcd *hcd = bus_to_hcd(udev->bus); int ret; if (!hcd) return -EINVAL; mutex_lock(hcd->bandwidth_mutex); ret = usb_disable_lpm(udev); mutex_unlock(hcd->bandwidth_mutex); return ret; } EXPORT_SYMBOL_GPL(usb_unlocked_disable_lpm); /* * Attempt to enable device-initiated and hub-initiated U1 and U2 entry. The * xHCI host policy may prevent U1 or U2 from being enabled. * * Other callers may have disabled link PM, so U1 and U2 entry will be disabled * until the lpm_disable_count drops to zero. Caller must own the * bandwidth_mutex. */ void usb_enable_lpm(struct usb_device *udev) { struct usb_hcd *hcd; struct usb_hub *hub; struct usb_port *port_dev; if (!udev || !udev->parent || udev->speed < USB_SPEED_SUPER || !udev->lpm_capable || udev->state < USB_STATE_CONFIGURED) return; udev->lpm_disable_count--; hcd = bus_to_hcd(udev->bus); /* Double check that we can both enable and disable LPM. * Device must be configured to accept set feature U1/U2 timeout. 
*/ if (!hcd || !hcd->driver->enable_usb3_lpm_timeout || !hcd->driver->disable_usb3_lpm_timeout) return; if (udev->lpm_disable_count > 0) return; hub = usb_hub_to_struct_hub(udev->parent); if (!hub) return; port_dev = hub->ports[udev->portnum - 1]; if (port_dev->usb3_lpm_u1_permit) if (usb_enable_link_state(hcd, udev, USB3_LPM_U1)) return; if (port_dev->usb3_lpm_u2_permit) if (usb_enable_link_state(hcd, udev, USB3_LPM_U2)) return; /* * Enable device initiated U1/U2 with a SetFeature(U1/U2_ENABLE) request * if system exit latency is short enough and device is configured */ if (usb_device_may_initiate_lpm(udev, USB3_LPM_U1)) { if (usb_set_device_initiated_lpm(udev, USB3_LPM_U1, true)) return; if (usb_device_may_initiate_lpm(udev, USB3_LPM_U2)) usb_set_device_initiated_lpm(udev, USB3_LPM_U2, true); } } EXPORT_SYMBOL_GPL(usb_enable_lpm); /* Grab the bandwidth_mutex before calling usb_enable_lpm() */ void usb_unlocked_enable_lpm(struct usb_device *udev) { struct usb_hcd *hcd = bus_to_hcd(udev->bus); if (!hcd) return; mutex_lock(hcd->bandwidth_mutex); usb_enable_lpm(udev); mutex_unlock(hcd->bandwidth_mutex); } EXPORT_SYMBOL_GPL(usb_unlocked_enable_lpm); /* usb3 devices use U3 for disabled, make sure remote wakeup is disabled */ static void hub_usb3_port_prepare_disable(struct usb_hub *hub, struct usb_port *port_dev) { struct usb_device *udev = port_dev->child; int ret; if (udev && udev->port_is_suspended && udev->do_remote_wakeup) { ret = hub_set_port_link_state(hub, port_dev->portnum, USB_SS_PORT_LS_U0); if (!ret) { msleep(USB_RESUME_TIMEOUT); ret = usb_disable_remote_wakeup(udev); } if (ret) dev_warn(&udev->dev, "Port disable: can't disable remote wake\n"); udev->do_remote_wakeup = 0; } } #else /* CONFIG_PM */ #define hub_suspend NULL #define hub_resume NULL #define hub_reset_resume NULL static inline void hub_usb3_port_prepare_disable(struct usb_hub *hub, struct usb_port *port_dev) { } int usb_disable_lpm(struct usb_device *udev) { return 0; } EXPORT_SYMBOL_GPL(usb_disable_lpm); void usb_enable_lpm(struct usb_device *udev) { } EXPORT_SYMBOL_GPL(usb_enable_lpm); int usb_unlocked_disable_lpm(struct usb_device *udev) { return 0; } EXPORT_SYMBOL_GPL(usb_unlocked_disable_lpm); void usb_unlocked_enable_lpm(struct usb_device *udev) { } EXPORT_SYMBOL_GPL(usb_unlocked_enable_lpm); int usb_disable_ltm(struct usb_device *udev) { return 0; } EXPORT_SYMBOL_GPL(usb_disable_ltm); void usb_enable_ltm(struct usb_device *udev) { } EXPORT_SYMBOL_GPL(usb_enable_ltm); static int hub_handle_remote_wakeup(struct usb_hub *hub, unsigned int port, u16 portstatus, u16 portchange) { return 0; } static int usb_req_set_sel(struct usb_device *udev) { return 0; } #endif /* CONFIG_PM */ /* * USB-3 does not have a similar link state as USB-2 that will avoid negotiating * a connection with a plugged-in cable but will signal the host when the cable * is unplugged. 
Disable remote wake and set link state to U3 for USB-3 devices */ static int hub_port_disable(struct usb_hub *hub, int port1, int set_state) { struct usb_port *port_dev = hub->ports[port1 - 1]; struct usb_device *hdev = hub->hdev; int ret = 0; if (!hub->error) { if (hub_is_superspeed(hub->hdev)) { hub_usb3_port_prepare_disable(hub, port_dev); ret = hub_set_port_link_state(hub, port_dev->portnum, USB_SS_PORT_LS_U3); } else { ret = usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_ENABLE); } } if (port_dev->child && set_state) usb_set_device_state(port_dev->child, USB_STATE_NOTATTACHED); if (ret && ret != -ENODEV) dev_err(&port_dev->dev, "cannot disable (err = %d)\n", ret); return ret; } /* * usb_port_disable - disable a usb device's upstream port * @udev: device to disable * Context: @udev locked, must be able to sleep. * * Disables a USB device that isn't in active use. */ int usb_port_disable(struct usb_device *udev) { struct usb_hub *hub = usb_hub_to_struct_hub(udev->parent); return hub_port_disable(hub, udev->portnum, 0); } /* USB 2.0 spec, 7.1.7.3 / fig 7-29: * * Between connect detection and reset signaling there must be a delay * of 100ms at least for debounce and power-settling. The corresponding * timer shall restart whenever the downstream port detects a disconnect. * * Apparently there are some bluetooth and irda-dongles and a number of * low-speed devices for which this debounce period may last over a second. * Not covered by the spec - but easy to deal with. * * This implementation uses a 1500ms total debounce timeout; if the * connection isn't stable by then it returns -ETIMEDOUT. It checks * every 25ms for transient disconnects. When the port status has been * unchanged for 100ms it returns the port status. */ int hub_port_debounce(struct usb_hub *hub, int port1, bool must_be_connected) { int ret; u16 portchange, portstatus; unsigned connection = 0xffff; int total_time, stable_time = 0; struct usb_port *port_dev = hub->ports[port1 - 1]; for (total_time = 0; ; total_time += HUB_DEBOUNCE_STEP) { ret = usb_hub_port_status(hub, port1, &portstatus, &portchange); if (ret < 0) return ret; if (!(portchange & USB_PORT_STAT_C_CONNECTION) && (portstatus & USB_PORT_STAT_CONNECTION) == connection) { if (!must_be_connected || (connection == USB_PORT_STAT_CONNECTION)) stable_time += HUB_DEBOUNCE_STEP; if (stable_time >= HUB_DEBOUNCE_STABLE) break; } else { stable_time = 0; connection = portstatus & USB_PORT_STAT_CONNECTION; } if (portchange & USB_PORT_STAT_C_CONNECTION) { usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_CONNECTION); } if (total_time >= HUB_DEBOUNCE_TIMEOUT) break; msleep(HUB_DEBOUNCE_STEP); } dev_dbg(&port_dev->dev, "debounce total %dms stable %dms status 0x%x\n", total_time, stable_time, portstatus); if (stable_time < HUB_DEBOUNCE_STABLE) return -ETIMEDOUT; return portstatus; } void usb_ep0_reinit(struct usb_device *udev) { usb_disable_endpoint(udev, 0 + USB_DIR_IN, true); usb_disable_endpoint(udev, 0 + USB_DIR_OUT, true); usb_enable_endpoint(udev, &udev->ep0, true); } EXPORT_SYMBOL_GPL(usb_ep0_reinit); static int hub_set_address(struct usb_device *udev, int devnum) { int retval; unsigned int timeout_ms = USB_CTRL_SET_TIMEOUT; struct usb_hcd *hcd = bus_to_hcd(udev->bus); struct usb_hub *hub = usb_hub_to_struct_hub(udev->parent); if (hub->hdev->quirks & USB_QUIRK_SHORT_SET_ADDRESS_REQ_TIMEOUT) timeout_ms = USB_SHORT_SET_ADDRESS_REQ_TIMEOUT; /* * The host controller will choose the device address, * instead of the core having chosen it earlier */ if 
(!hcd->driver->address_device && devnum <= 1) return -EINVAL; if (udev->state == USB_STATE_ADDRESS) return 0; if (udev->state != USB_STATE_DEFAULT) return -EINVAL; if (hcd->driver->address_device) retval = hcd->driver->address_device(hcd, udev, timeout_ms); else retval = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), USB_REQ_SET_ADDRESS, 0, devnum, 0, NULL, 0, timeout_ms); if (retval == 0) { update_devnum(udev, devnum); /* Device now using proper address. */ usb_set_device_state(udev, USB_STATE_ADDRESS); usb_ep0_reinit(udev); } return retval; } /* * There are reports of USB 3.0 devices that say they support USB 2.0 Link PM * when they're plugged into a USB 2.0 port, but they don't work when LPM is * enabled. * * Only enable USB 2.0 Link PM if the port is internal (hardwired), or the * device says it supports the new USB 2.0 Link PM errata by setting the BESL * support bit in the BOS descriptor. */ static void hub_set_initial_usb2_lpm_policy(struct usb_device *udev) { struct usb_hub *hub = usb_hub_to_struct_hub(udev->parent); int connect_type = USB_PORT_CONNECT_TYPE_UNKNOWN; if (!udev->usb2_hw_lpm_capable || !udev->bos) return; if (hub) connect_type = hub->ports[udev->portnum - 1]->connect_type; if ((udev->bos->ext_cap->bmAttributes & cpu_to_le32(USB_BESL_SUPPORT)) || connect_type == USB_PORT_CONNECT_TYPE_HARD_WIRED) { udev->usb2_hw_lpm_allowed = 1; usb_enable_usb2_hardware_lpm(udev); } } static int hub_enable_device(struct usb_device *udev) { struct usb_hcd *hcd = bus_to_hcd(udev->bus); if (!hcd->driver->enable_device) return 0; if (udev->state == USB_STATE_ADDRESS) return 0; if (udev->state != USB_STATE_DEFAULT) return -EINVAL; return hcd->driver->enable_device(hcd, udev); } /* * Get the bMaxPacketSize0 value during initialization by reading the * device's device descriptor. Since we don't already know this value, * the transfer is unsafe and it ignores I/O errors, only testing for * reasonable received values. * * For "old scheme" initialization, size will be 8 so we read just the * start of the device descriptor, which should work okay regardless of * the actual bMaxPacketSize0 value. For "new scheme" initialization, * size will be 64 (and buf will point to a sufficiently large buffer), * which might not be kosher according to the USB spec but it's what * Windows does and what many devices expect. * * Returns: bMaxPacketSize0 or a negative error code. */ static int get_bMaxPacketSize0(struct usb_device *udev, struct usb_device_descriptor *buf, int size, bool first_time) { int i, rc; /* * Retry on all errors; some devices are flakey. * 255 is for WUSB devices, we actually need to use * 512 (WUSB1.0[4.8.1]). */ for (i = 0; i < GET_MAXPACKET0_TRIES; ++i) { /* Start with invalid values in case the transfer fails */ buf->bDescriptorType = buf->bMaxPacketSize0 = 0; rc = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), USB_REQ_GET_DESCRIPTOR, USB_DIR_IN, USB_DT_DEVICE << 8, 0, buf, size, initial_descriptor_timeout); switch (buf->bMaxPacketSize0) { case 8: case 16: case 32: case 64: case 9: if (buf->bDescriptorType == USB_DT_DEVICE) { rc = buf->bMaxPacketSize0; break; } fallthrough; default: if (rc >= 0) rc = -EPROTO; break; } /* * Some devices time out if they are powered on * when already connected. They need a second * reset, so return early. But only on the first * attempt, lest we get into a time-out/reset loop. 
*/ if (rc > 0 || (rc == -ETIMEDOUT && first_time && udev->speed > USB_SPEED_FULL)) break; } return rc; } #define GET_DESCRIPTOR_BUFSIZE 64 /* Reset device, (re)assign address, get device descriptor. * Device connection must be stable, no more debouncing needed. * Returns device in USB_STATE_ADDRESS, except on error. * * If this is called for an already-existing device (as part of * usb_reset_and_verify_device), the caller must own the device lock and * the port lock. For a newly detected device that is not accessible * through any global pointers, it's not necessary to lock the device, * but it is still necessary to lock the port. * * For a newly detected device, @dev_descr must be NULL. The device * descriptor retrieved from the device will then be stored in * @udev->descriptor. For an already existing device, @dev_descr * must be non-NULL. The device descriptor will be stored there, * not in @udev->descriptor, because descriptors for registered * devices are meant to be immutable. */ static int hub_port_init(struct usb_hub *hub, struct usb_device *udev, int port1, int retry_counter, struct usb_device_descriptor *dev_descr) { struct usb_device *hdev = hub->hdev; struct usb_hcd *hcd = bus_to_hcd(hdev->bus); struct usb_port *port_dev = hub->ports[port1 - 1]; int retries, operations, retval, i; unsigned delay = HUB_SHORT_RESET_TIME; enum usb_device_speed oldspeed = udev->speed; const char *speed; int devnum = udev->devnum; const char *driver_name; bool do_new_scheme; const bool initial = !dev_descr; int maxp0; struct usb_device_descriptor *buf, *descr; buf = kmalloc(GET_DESCRIPTOR_BUFSIZE, GFP_NOIO); if (!buf) return -ENOMEM; /* root hub ports have a slightly longer reset period * (from USB 2.0 spec, section 7.1.7.5) */ if (!hdev->parent) { delay = HUB_ROOT_RESET_TIME; if (port1 == hdev->bus->otg_port) hdev->bus->b_hnp_enable = 0; } /* Some low speed devices have problems with the quick delay, so */ /* be a bit pessimistic with those devices. RHbug #23670 */ if (oldspeed == USB_SPEED_LOW) delay = HUB_LONG_RESET_TIME; /* Reset the device; full speed may morph to high speed */ /* FIXME a USB 2.0 device may morph into SuperSpeed on reset. */ retval = hub_port_reset(hub, port1, udev, delay, false); if (retval < 0) /* error or disconnect */ goto fail; /* success, speed is known */ retval = -ENODEV; /* Don't allow speed changes at reset, except usb 3.0 to faster */ if (oldspeed != USB_SPEED_UNKNOWN && oldspeed != udev->speed && !(oldspeed == USB_SPEED_SUPER && udev->speed > oldspeed)) { dev_dbg(&udev->dev, "device reset changed speed!\n"); goto fail; } oldspeed = udev->speed; if (initial) { /* USB 2.0 section 5.5.3 talks about ep0 maxpacket ... * it's fixed size except for full speed devices. */ switch (udev->speed) { case USB_SPEED_SUPER_PLUS: case USB_SPEED_SUPER: udev->ep0.desc.wMaxPacketSize = cpu_to_le16(512); break; case USB_SPEED_HIGH: /* fixed at 64 */ udev->ep0.desc.wMaxPacketSize = cpu_to_le16(64); break; case USB_SPEED_FULL: /* 8, 16, 32, or 64 */ /* to determine the ep0 maxpacket size, try to read * the device descriptor to get bMaxPacketSize0 and * then correct our initial guess. */ udev->ep0.desc.wMaxPacketSize = cpu_to_le16(64); break; case USB_SPEED_LOW: /* fixed at 8 */ udev->ep0.desc.wMaxPacketSize = cpu_to_le16(8); break; default: goto fail; } } speed = usb_speed_string(udev->speed); /* * The controller driver may be NULL if the controller device * is the middle device between platform device and roothub. 
* This middle device may not need a device driver due to * all hardware control can be at platform device driver, this * platform device is usually a dual-role USB controller device. */ if (udev->bus->controller->driver) driver_name = udev->bus->controller->driver->name; else driver_name = udev->bus->sysdev->driver->name; if (udev->speed < USB_SPEED_SUPER) dev_info(&udev->dev, "%s %s USB device number %d using %s\n", (initial ? "new" : "reset"), speed, devnum, driver_name); if (initial) { /* Set up TT records, if needed */ if (hdev->tt) { udev->tt = hdev->tt; udev->ttport = hdev->ttport; } else if (udev->speed != USB_SPEED_HIGH && hdev->speed == USB_SPEED_HIGH) { if (!hub->tt.hub) { dev_err(&udev->dev, "parent hub has no TT\n"); retval = -EINVAL; goto fail; } udev->tt = &hub->tt; udev->ttport = port1; } } /* Why interleave GET_DESCRIPTOR and SET_ADDRESS this way? * Because device hardware and firmware is sometimes buggy in * this area, and this is how Linux has done it for ages. * Change it cautiously. * * NOTE: If use_new_scheme() is true we will start by issuing * a 64-byte GET_DESCRIPTOR request. This is what Windows does, * so it may help with some non-standards-compliant devices. * Otherwise we start with SET_ADDRESS and then try to read the * first 8 bytes of the device descriptor to get the ep0 maxpacket * value. */ do_new_scheme = use_new_scheme(udev, retry_counter, port_dev); for (retries = 0; retries < GET_DESCRIPTOR_TRIES; (++retries, msleep(100))) { if (hub_port_stop_enumerate(hub, port1, retries)) { retval = -ENODEV; break; } if (do_new_scheme) { retval = hub_enable_device(udev); if (retval < 0) { dev_err(&udev->dev, "hub failed to enable device, error %d\n", retval); goto fail; } maxp0 = get_bMaxPacketSize0(udev, buf, GET_DESCRIPTOR_BUFSIZE, retries == 0); if (maxp0 > 0 && !initial && maxp0 != udev->descriptor.bMaxPacketSize0) { dev_err(&udev->dev, "device reset changed ep0 maxpacket size!\n"); retval = -ENODEV; goto fail; } retval = hub_port_reset(hub, port1, udev, delay, false); if (retval < 0) /* error or disconnect */ goto fail; if (oldspeed != udev->speed) { dev_dbg(&udev->dev, "device reset changed speed!\n"); retval = -ENODEV; goto fail; } if (maxp0 < 0) { if (maxp0 != -ENODEV) dev_err(&udev->dev, "device descriptor read/64, error %d\n", maxp0); retval = maxp0; continue; } } for (operations = 0; operations < SET_ADDRESS_TRIES; ++operations) { retval = hub_set_address(udev, devnum); if (retval >= 0) break; msleep(200); } if (retval < 0) { if (retval != -ENODEV) dev_err(&udev->dev, "device not accepting address %d, error %d\n", devnum, retval); goto fail; } if (udev->speed >= USB_SPEED_SUPER) { devnum = udev->devnum; dev_info(&udev->dev, "%s SuperSpeed%s%s USB device number %d using %s\n", (udev->config) ? "reset" : "new", (udev->speed == USB_SPEED_SUPER_PLUS) ? " Plus" : "", (udev->ssp_rate == USB_SSP_GEN_2x2) ? " Gen 2x2" : (udev->ssp_rate == USB_SSP_GEN_2x1) ? " Gen 2x1" : (udev->ssp_rate == USB_SSP_GEN_1x2) ? 
" Gen 1x2" : "", devnum, driver_name); } /* * cope with hardware quirkiness: * - let SET_ADDRESS settle, some device hardware wants it * - read ep0 maxpacket even for high and low speed, */ msleep(10); if (do_new_scheme) break; maxp0 = get_bMaxPacketSize0(udev, buf, 8, retries == 0); if (maxp0 < 0) { retval = maxp0; if (retval != -ENODEV) dev_err(&udev->dev, "device descriptor read/8, error %d\n", retval); } else { u32 delay; if (!initial && maxp0 != udev->descriptor.bMaxPacketSize0) { dev_err(&udev->dev, "device reset changed ep0 maxpacket size!\n"); retval = -ENODEV; goto fail; } delay = udev->parent->hub_delay; udev->hub_delay = min_t(u32, delay, USB_TP_TRANSMISSION_DELAY_MAX); retval = usb_set_isoch_delay(udev); if (retval) { dev_dbg(&udev->dev, "Failed set isoch delay, error %d\n", retval); retval = 0; } break; } } if (retval) goto fail; /* * Check the ep0 maxpacket guess and correct it if necessary. * maxp0 is the value stored in the device descriptor; * i is the value it encodes (logarithmic for SuperSpeed or greater). */ i = maxp0; if (udev->speed >= USB_SPEED_SUPER) { if (maxp0 <= 16) i = 1 << maxp0; else i = 0; /* Invalid */ } if (usb_endpoint_maxp(&udev->ep0.desc) == i) { ; /* Initial ep0 maxpacket guess is right */ } else if (((udev->speed == USB_SPEED_FULL || udev->speed == USB_SPEED_HIGH) && (i == 8 || i == 16 || i == 32 || i == 64)) || (udev->speed >= USB_SPEED_SUPER && i > 0)) { /* Initial guess is wrong; use the descriptor's value */ if (udev->speed == USB_SPEED_FULL) dev_dbg(&udev->dev, "ep0 maxpacket = %d\n", i); else dev_warn(&udev->dev, "Using ep0 maxpacket: %d\n", i); udev->ep0.desc.wMaxPacketSize = cpu_to_le16(i); usb_ep0_reinit(udev); } else { /* Initial guess is wrong and descriptor's value is invalid */ dev_err(&udev->dev, "Invalid ep0 maxpacket: %d\n", maxp0); retval = -EMSGSIZE; goto fail; } descr = usb_get_device_descriptor(udev); if (IS_ERR(descr)) { retval = PTR_ERR(descr); if (retval != -ENODEV) dev_err(&udev->dev, "device descriptor read/all, error %d\n", retval); goto fail; } if (initial) udev->descriptor = *descr; else *dev_descr = *descr; kfree(descr); /* * Some superspeed devices have finished the link training process * and attached to a superspeed hub port, but the device descriptor * got from those devices show they aren't superspeed devices. Warm * reset the port attached by the devices can fix them. 
 */
	if ((udev->speed >= USB_SPEED_SUPER) &&
			(le16_to_cpu(udev->descriptor.bcdUSB) < 0x0300)) {
		dev_err(&udev->dev, "got a wrong device descriptor, warm reset device\n");
		hub_port_reset(hub, port1, udev, HUB_BH_RESET_TIME, true);
		retval = -EINVAL;
		goto fail;
	}

	usb_detect_quirks(udev);

	if (le16_to_cpu(udev->descriptor.bcdUSB) >= 0x0201) {
		retval = usb_get_bos_descriptor(udev);
		if (!retval) {
			udev->lpm_capable = usb_device_supports_lpm(udev);
			udev->lpm_disable_count = 1;
			usb_set_lpm_parameters(udev);
			usb_req_set_sel(udev);
		}
	}

	retval = 0;
	/* notify HCD that we have a device connected and addressed */
	if (hcd->driver->update_device)
		hcd->driver->update_device(hcd, udev);
	hub_set_initial_usb2_lpm_policy(udev);
fail:
	if (retval) {
		hub_port_disable(hub, port1, 0);
		update_devnum(udev, devnum);	/* for disconnect processing */
	}
	kfree(buf);
	return retval;
}

static void check_highspeed(struct usb_hub *hub, struct usb_device *udev,
		int port1)
{
	struct usb_qualifier_descriptor *qual;
	int status;

	if (udev->quirks & USB_QUIRK_DEVICE_QUALIFIER)
		return;

	qual = kmalloc(sizeof(*qual), GFP_KERNEL);
	if (qual == NULL)
		return;

	status = usb_get_descriptor(udev, USB_DT_DEVICE_QUALIFIER, 0,
			qual, sizeof(*qual));
	if (status == sizeof(*qual)) {
		dev_info(&udev->dev, "not running at top speed; "
			"connect to a high speed hub\n");
		/* hub LEDs are probably harder to miss than syslog */
		if (hub->has_indicators) {
			hub->indicator[port1-1] = INDICATOR_GREEN_BLINK;
			queue_delayed_work(system_power_efficient_wq,
					&hub->leds, 0);
		}
	}
	kfree(qual);
}

static unsigned hub_power_remaining(struct usb_hub *hub)
{
	struct usb_device *hdev = hub->hdev;
	int remaining;
	int port1;

	if (!hub->limited_power)
		return 0;

	remaining = hdev->bus_mA - hub->descriptor->bHubContrCurrent;
	for (port1 = 1; port1 <= hdev->maxchild; ++port1) {
		struct usb_port *port_dev = hub->ports[port1 - 1];
		struct usb_device *udev = port_dev->child;
		unsigned unit_load;
		int delta;

		if (!udev)
			continue;
		if (hub_is_superspeed(udev))
			unit_load = 150;
		else
			unit_load = 100;

		/*
		 * Unconfigured devices may not use more than one unit load,
		 * or 8mA for OTG ports
		 */
		if (udev->actconfig)
			delta = usb_get_max_power(udev, udev->actconfig);
		else if (port1 != udev->bus->otg_port || hdev->parent)
			delta = unit_load;
		else
			delta = 8;
		if (delta > hub->mA_per_port)
			dev_warn(&port_dev->dev, "%dmA is over %umA budget!\n",
					delta, hub->mA_per_port);
		remaining -= delta;
	}
	if (remaining < 0) {
		dev_warn(hub->intfdev, "%dmA over power budget!\n",
			-remaining);
		remaining = 0;
	}
	return remaining;
}

static int descriptors_changed(struct usb_device *udev,
		struct usb_device_descriptor *new_device_descriptor,
		struct usb_host_bos *old_bos)
{
	int changed = 0;
	unsigned index;
	unsigned serial_len = 0;
	unsigned len;
	unsigned old_length;
	int length;
	char *buf;

	if (memcmp(&udev->descriptor, new_device_descriptor,
			sizeof(*new_device_descriptor)) != 0)
		return 1;

	if ((old_bos && !udev->bos) || (!old_bos && udev->bos))
		return 1;
	if (udev->bos) {
		len = le16_to_cpu(udev->bos->desc->wTotalLength);
		if (len != le16_to_cpu(old_bos->desc->wTotalLength))
			return 1;
		if (memcmp(udev->bos->desc, old_bos->desc, len))
			return 1;
	}

	/* Since the idVendor, idProduct, and bcdDevice values in the
	 * device descriptor haven't changed, we will assume the
	 * Manufacturer and Product strings haven't changed either.
	 * But the SerialNumber string could be different (e.g., a
	 * different flash card of the same brand).
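	 *
	 * The scratch buffer allocated below is sized to the longer of the
	 * serial string and the largest cached config descriptor, so a
	 * single buffer serves both comparisons.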
*/ if (udev->serial) serial_len = strlen(udev->serial) + 1; len = serial_len; for (index = 0; index < udev->descriptor.bNumConfigurations; index++) { old_length = le16_to_cpu(udev->config[index].desc.wTotalLength); len = max(len, old_length); } buf = kmalloc(len, GFP_NOIO); if (!buf) /* assume the worst */ return 1; for (index = 0; index < udev->descriptor.bNumConfigurations; index++) { old_length = le16_to_cpu(udev->config[index].desc.wTotalLength); length = usb_get_descriptor(udev, USB_DT_CONFIG, index, buf, old_length); if (length != old_length) { dev_dbg(&udev->dev, "config index %d, error %d\n", index, length); changed = 1; break; } if (memcmp(buf, udev->rawdescriptors[index], old_length) != 0) { dev_dbg(&udev->dev, "config index %d changed (#%d)\n", index, ((struct usb_config_descriptor *) buf)-> bConfigurationValue); changed = 1; break; } } if (!changed && serial_len) { length = usb_string(udev, udev->descriptor.iSerialNumber, buf, serial_len); if (length + 1 != serial_len) { dev_dbg(&udev->dev, "serial string error %d\n", length); changed = 1; } else if (memcmp(buf, udev->serial, length) != 0) { dev_dbg(&udev->dev, "serial string changed\n"); changed = 1; } } kfree(buf); return changed; } static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, u16 portchange) { int status = -ENODEV; int i; unsigned unit_load; struct usb_device *hdev = hub->hdev; struct usb_hcd *hcd = bus_to_hcd(hdev->bus); struct usb_port *port_dev = hub->ports[port1 - 1]; struct usb_device *udev = port_dev->child; static int unreliable_port = -1; bool retry_locked; /* Disconnect any existing devices under this port */ if (udev) { if (hcd->usb_phy && !hdev->parent) usb_phy_notify_disconnect(hcd->usb_phy, udev->speed); usb_disconnect(&port_dev->child); } /* We can forget about a "removed" device when there's a physical * disconnect or the connect status changes. */ if (!(portstatus & USB_PORT_STAT_CONNECTION) || (portchange & USB_PORT_STAT_C_CONNECTION)) clear_bit(port1, hub->removed_bits); if (portchange & (USB_PORT_STAT_C_CONNECTION | USB_PORT_STAT_C_ENABLE)) { status = hub_port_debounce_be_stable(hub, port1); if (status < 0) { if (status != -ENODEV && port1 != unreliable_port && printk_ratelimit()) dev_err(&port_dev->dev, "connect-debounce failed\n"); portstatus &= ~USB_PORT_STAT_CONNECTION; unreliable_port = port1; } else { portstatus = status; } } /* Return now if debouncing failed or nothing is connected or * the device was "removed". */ if (!(portstatus & USB_PORT_STAT_CONNECTION) || test_bit(port1, hub->removed_bits)) { /* * maybe switch power back on (e.g. root hub was reset) * but only if the port isn't owned by someone else. 
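		 * ("Owned by someone else" means port_dev->port_owner is set,
		 * e.g. because user space claimed the port through usbfs.)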
*/ if (hub_is_port_power_switchable(hub) && !usb_port_is_power_on(hub, portstatus) && !port_dev->port_owner) set_port_feature(hdev, port1, USB_PORT_FEAT_POWER); if (portstatus & USB_PORT_STAT_ENABLE) goto done; return; } if (hub_is_superspeed(hub->hdev)) unit_load = 150; else unit_load = 100; status = 0; for (i = 0; i < PORT_INIT_TRIES; i++) { if (hub_port_stop_enumerate(hub, port1, i)) { status = -ENODEV; break; } usb_lock_port(port_dev); mutex_lock(hcd->address0_mutex); retry_locked = true; /* reallocate for each attempt, since references * to the previous one can escape in various ways */ udev = usb_alloc_dev(hdev, hdev->bus, port1); if (!udev) { dev_err(&port_dev->dev, "couldn't allocate usb_device\n"); mutex_unlock(hcd->address0_mutex); usb_unlock_port(port_dev); goto done; } usb_set_device_state(udev, USB_STATE_POWERED); udev->bus_mA = hub->mA_per_port; udev->level = hdev->level + 1; /* Devices connected to SuperSpeed hubs are USB 3.0 or later */ if (hub_is_superspeed(hub->hdev)) udev->speed = USB_SPEED_SUPER; else udev->speed = USB_SPEED_UNKNOWN; choose_devnum(udev); if (udev->devnum <= 0) { status = -ENOTCONN; /* Don't retry */ goto loop; } /* reset (non-USB 3.0 devices) and get descriptor */ status = hub_port_init(hub, udev, port1, i, NULL); if (status < 0) goto loop; mutex_unlock(hcd->address0_mutex); usb_unlock_port(port_dev); retry_locked = false; if (udev->quirks & USB_QUIRK_DELAY_INIT) msleep(2000); /* consecutive bus-powered hubs aren't reliable; they can * violate the voltage drop budget. if the new child has * a "powered" LED, users should notice we didn't enable it * (without reading syslog), even without per-port LEDs * on the parent. */ if (udev->descriptor.bDeviceClass == USB_CLASS_HUB && udev->bus_mA <= unit_load) { u16 devstat; status = usb_get_std_status(udev, USB_RECIP_DEVICE, 0, &devstat); if (status) { dev_dbg(&udev->dev, "get status %d ?\n", status); goto loop_disable; } if ((devstat & (1 << USB_DEVICE_SELF_POWERED)) == 0) { dev_err(&udev->dev, "can't connect bus-powered hub " "to this port\n"); if (hub->has_indicators) { hub->indicator[port1-1] = INDICATOR_AMBER_BLINK; queue_delayed_work( system_power_efficient_wq, &hub->leds, 0); } status = -ENOTCONN; /* Don't retry */ goto loop_disable; } } /* check for devices running slower than they could */ if (le16_to_cpu(udev->descriptor.bcdUSB) >= 0x0200 && udev->speed == USB_SPEED_FULL && highspeed_hubs != 0) check_highspeed(hub, udev, port1); /* Store the parent's children[] pointer. At this point * udev becomes globally accessible, although presumably * no one will look at it until hdev is unlocked. */ status = 0; mutex_lock(&usb_port_peer_mutex); /* We mustn't add new devices if the parent hub has * been disconnected; we would race with the * recursively_mark_NOTATTACHED() routine. 
*/ spin_lock_irq(&device_state_lock); if (hdev->state == USB_STATE_NOTATTACHED) status = -ENOTCONN; else port_dev->child = udev; spin_unlock_irq(&device_state_lock); mutex_unlock(&usb_port_peer_mutex); /* Run it through the hoops (find a driver, etc) */ if (!status) { status = usb_new_device(udev); if (status) { mutex_lock(&usb_port_peer_mutex); spin_lock_irq(&device_state_lock); port_dev->child = NULL; spin_unlock_irq(&device_state_lock); mutex_unlock(&usb_port_peer_mutex); } else { if (hcd->usb_phy && !hdev->parent) usb_phy_notify_connect(hcd->usb_phy, udev->speed); } } if (status) goto loop_disable; status = hub_power_remaining(hub); if (status) dev_dbg(hub->intfdev, "%dmA power budget left\n", status); return; loop_disable: hub_port_disable(hub, port1, 1); loop: usb_ep0_reinit(udev); release_devnum(udev); hub_free_dev(udev); if (retry_locked) { mutex_unlock(hcd->address0_mutex); usb_unlock_port(port_dev); } usb_put_dev(udev); if ((status == -ENOTCONN) || (status == -ENOTSUPP)) break; /* When halfway through our retry count, power-cycle the port */ if (i == (PORT_INIT_TRIES - 1) / 2) { dev_info(&port_dev->dev, "attempt power cycle\n"); usb_hub_set_port_power(hdev, hub, port1, false); msleep(2 * hub_power_on_good_delay(hub)); usb_hub_set_port_power(hdev, hub, port1, true); msleep(hub_power_on_good_delay(hub)); } } if (hub->hdev->parent || !hcd->driver->port_handed_over || !(hcd->driver->port_handed_over)(hcd, port1)) { if (status != -ENOTCONN && status != -ENODEV) dev_err(&port_dev->dev, "unable to enumerate USB device\n"); } done: hub_port_disable(hub, port1, 1); if (hcd->driver->relinquish_port && !hub->hdev->parent) { if (status != -ENOTCONN && status != -ENODEV) hcd->driver->relinquish_port(hcd, port1); } } /* Handle physical or logical connection change events. * This routine is called when: * a port connection-change occurs; * a port enable-change occurs (often caused by EMI); * usb_reset_and_verify_device() encounters changed descriptors (as from * a firmware download) * caller already locked the hub */ static void hub_port_connect_change(struct usb_hub *hub, int port1, u16 portstatus, u16 portchange) __must_hold(&port_dev->status_lock) { struct usb_port *port_dev = hub->ports[port1 - 1]; struct usb_device *udev = port_dev->child; struct usb_device_descriptor *descr; int status = -ENODEV; dev_dbg(&port_dev->dev, "status %04x, change %04x, %s\n", portstatus, portchange, portspeed(hub, portstatus)); if (hub->has_indicators) { set_port_led(hub, port1, HUB_LED_AUTO); hub->indicator[port1-1] = INDICATOR_AUTO; } #ifdef CONFIG_USB_OTG /* during HNP, don't repeat the debounce */ if (hub->hdev->bus->is_b_host) portchange &= ~(USB_PORT_STAT_C_CONNECTION | USB_PORT_STAT_C_ENABLE); #endif /* Try to resuscitate an existing device */ if ((portstatus & USB_PORT_STAT_CONNECTION) && udev && udev->state != USB_STATE_NOTATTACHED) { if (portstatus & USB_PORT_STAT_ENABLE) { /* * USB-3 connections are initialized automatically by * the hostcontroller hardware. Therefore check for * changed device descriptors before resuscitating the * device. 
*/ descr = usb_get_device_descriptor(udev); if (IS_ERR(descr)) { dev_dbg(&udev->dev, "can't read device descriptor %ld\n", PTR_ERR(descr)); } else { if (descriptors_changed(udev, descr, udev->bos)) { dev_dbg(&udev->dev, "device descriptor has changed\n"); } else { status = 0; /* Nothing to do */ } kfree(descr); } #ifdef CONFIG_PM } else if (udev->state == USB_STATE_SUSPENDED && udev->persist_enabled) { /* For a suspended device, treat this as a * remote wakeup event. */ usb_unlock_port(port_dev); status = usb_remote_wakeup(udev); usb_lock_port(port_dev); #endif } else { /* Don't resuscitate */; } } clear_bit(port1, hub->change_bits); /* successfully revalidated the connection */ if (status == 0) return; usb_unlock_port(port_dev); hub_port_connect(hub, port1, portstatus, portchange); usb_lock_port(port_dev); } /* Handle notifying userspace about hub over-current events */ static void port_over_current_notify(struct usb_port *port_dev) { char *envp[3] = { NULL, NULL, NULL }; struct device *hub_dev; char *port_dev_path; sysfs_notify(&port_dev->dev.kobj, NULL, "over_current_count"); hub_dev = port_dev->dev.parent; if (!hub_dev) return; port_dev_path = kobject_get_path(&port_dev->dev.kobj, GFP_KERNEL); if (!port_dev_path) return; envp[0] = kasprintf(GFP_KERNEL, "OVER_CURRENT_PORT=%s", port_dev_path); if (!envp[0]) goto exit; envp[1] = kasprintf(GFP_KERNEL, "OVER_CURRENT_COUNT=%u", port_dev->over_current_count); if (!envp[1]) goto exit; kobject_uevent_env(&hub_dev->kobj, KOBJ_CHANGE, envp); exit: kfree(envp[1]); kfree(envp[0]); kfree(port_dev_path); } static void port_event(struct usb_hub *hub, int port1) __must_hold(&port_dev->status_lock) { int connect_change; struct usb_port *port_dev = hub->ports[port1 - 1]; struct usb_device *udev = port_dev->child; struct usb_device *hdev = hub->hdev; u16 portstatus, portchange; int i = 0; int err; connect_change = test_bit(port1, hub->change_bits); clear_bit(port1, hub->event_bits); clear_bit(port1, hub->wakeup_bits); if (usb_hub_port_status(hub, port1, &portstatus, &portchange) < 0) return; if (portchange & USB_PORT_STAT_C_CONNECTION) { usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_CONNECTION); connect_change = 1; } if (portchange & USB_PORT_STAT_C_ENABLE) { if (!connect_change) dev_dbg(&port_dev->dev, "enable change, status %08x\n", portstatus); usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_ENABLE); /* * EM interference sometimes causes badly shielded USB devices * to be shutdown by the hub, this hack enables them again. * Works at least with mouse driver. 
*/ if (!(portstatus & USB_PORT_STAT_ENABLE) && !connect_change && udev) { dev_err(&port_dev->dev, "disabled by hub (EMI?), re-enabling...\n"); connect_change = 1; } } if (portchange & USB_PORT_STAT_C_OVERCURRENT) { u16 status = 0, unused; port_dev->over_current_count++; port_over_current_notify(port_dev); dev_dbg(&port_dev->dev, "over-current change #%u\n", port_dev->over_current_count); usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_OVER_CURRENT); msleep(100); /* Cool down */ hub_power_on(hub, true); usb_hub_port_status(hub, port1, &status, &unused); if (status & USB_PORT_STAT_OVERCURRENT) dev_err(&port_dev->dev, "over-current condition\n"); } if (portchange & USB_PORT_STAT_C_RESET) { dev_dbg(&port_dev->dev, "reset change\n"); usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_RESET); } if ((portchange & USB_PORT_STAT_C_BH_RESET) && hub_is_superspeed(hdev)) { dev_dbg(&port_dev->dev, "warm reset change\n"); usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_BH_PORT_RESET); } if (portchange & USB_PORT_STAT_C_LINK_STATE) { dev_dbg(&port_dev->dev, "link state change\n"); usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_PORT_LINK_STATE); } if (portchange & USB_PORT_STAT_C_CONFIG_ERROR) { dev_warn(&port_dev->dev, "config error\n"); usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_PORT_CONFIG_ERROR); } /* skip port actions that require the port to be powered on */ if (!pm_runtime_active(&port_dev->dev)) return; /* skip port actions if ignore_event and early_stop are true */ if (port_dev->ignore_event && port_dev->early_stop) return; if (hub_handle_remote_wakeup(hub, port1, portstatus, portchange)) connect_change = 1; /* * Avoid trying to recover a USB3 SS.Inactive port with a warm reset if * the device was disconnected. A 12ms disconnect detect timer in * SS.Inactive state transitions the port to RxDetect automatically. * SS.Inactive link error state is common during device disconnect. */ while (hub_port_warm_reset_required(hub, port1, portstatus)) { if ((i++ < DETECT_DISCONNECT_TRIES) && udev) { u16 unused; msleep(20); usb_hub_port_status(hub, port1, &portstatus, &unused); dev_dbg(&port_dev->dev, "Wait for inactive link disconnect detect\n"); continue; } else if (!udev || !(portstatus & USB_PORT_STAT_CONNECTION) || udev->state == USB_STATE_NOTATTACHED) { dev_dbg(&port_dev->dev, "do warm reset, port only\n"); err = hub_port_reset(hub, port1, NULL, HUB_BH_RESET_TIME, true); if (!udev && err == -ENOTCONN) connect_change = 0; else if (err < 0) hub_port_disable(hub, port1, 1); } else { dev_dbg(&port_dev->dev, "do warm reset, full device\n"); usb_unlock_port(port_dev); usb_lock_device(udev); usb_reset_device(udev); usb_unlock_device(udev); usb_lock_port(port_dev); connect_change = 0; } break; } if (connect_change) hub_port_connect_change(hub, port1, portstatus, portchange); } static void hub_event(struct work_struct *work) { struct usb_device *hdev; struct usb_interface *intf; struct usb_hub *hub; struct device *hub_dev; u16 hubstatus; u16 hubchange; int i, ret; hub = container_of(work, struct usb_hub, events); hdev = hub->hdev; hub_dev = hub->intfdev; intf = to_usb_interface(hub_dev); kcov_remote_start_usb((u64)hdev->bus->busnum); dev_dbg(hub_dev, "state %d ports %d chg %04x evt %04x\n", hdev->state, hdev->maxchild, /* NOTE: expects max 15 ports... */ (u16) hub->change_bits[0], (u16) hub->event_bits[0]); /* Lock the device, then check to see if we were * disconnected while waiting for the lock to succeed. 
*/ usb_lock_device(hdev); if (unlikely(hub->disconnected)) goto out_hdev_lock; /* If the hub has died, clean up after it */ if (hdev->state == USB_STATE_NOTATTACHED) { hub->error = -ENODEV; hub_quiesce(hub, HUB_DISCONNECT); goto out_hdev_lock; } /* Autoresume */ ret = usb_autopm_get_interface(intf); if (ret) { dev_dbg(hub_dev, "Can't autoresume: %d\n", ret); goto out_hdev_lock; } /* If this is an inactive hub, do nothing */ if (hub->quiescing) goto out_autopm; if (hub->error) { dev_dbg(hub_dev, "resetting for error %d\n", hub->error); ret = usb_reset_device(hdev); if (ret) { dev_dbg(hub_dev, "error resetting hub: %d\n", ret); goto out_autopm; } hub->nerrors = 0; hub->error = 0; } /* deal with port status changes */ for (i = 1; i <= hdev->maxchild; i++) { struct usb_port *port_dev = hub->ports[i - 1]; if (test_bit(i, hub->event_bits) || test_bit(i, hub->change_bits) || test_bit(i, hub->wakeup_bits)) { /* * The get_noresume and barrier ensure that if * the port was in the process of resuming, we * flush that work and keep the port active for * the duration of the port_event(). However, * if the port is runtime pm suspended * (powered-off), we leave it in that state, run * an abbreviated port_event(), and move on. */ pm_runtime_get_noresume(&port_dev->dev); pm_runtime_barrier(&port_dev->dev); usb_lock_port(port_dev); port_event(hub, i); usb_unlock_port(port_dev); pm_runtime_put_sync(&port_dev->dev); } } /* deal with hub status changes */ if (test_and_clear_bit(0, hub->event_bits) == 0) ; /* do nothing */ else if (hub_hub_status(hub, &hubstatus, &hubchange) < 0) dev_err(hub_dev, "get_hub_status failed\n"); else { if (hubchange & HUB_CHANGE_LOCAL_POWER) { dev_dbg(hub_dev, "power change\n"); clear_hub_feature(hdev, C_HUB_LOCAL_POWER); if (hubstatus & HUB_STATUS_LOCAL_POWER) /* FIXME: Is this always true? 
*/ hub->limited_power = 1; else hub->limited_power = 0; } if (hubchange & HUB_CHANGE_OVERCURRENT) { u16 status = 0; u16 unused; dev_dbg(hub_dev, "over-current change\n"); clear_hub_feature(hdev, C_HUB_OVER_CURRENT); msleep(500); /* Cool down */ hub_power_on(hub, true); hub_hub_status(hub, &status, &unused); if (status & HUB_STATUS_OVERCURRENT) dev_err(hub_dev, "over-current condition\n"); } } out_autopm: /* Balance the usb_autopm_get_interface() above */ usb_autopm_put_interface_no_suspend(intf); out_hdev_lock: usb_unlock_device(hdev); /* Balance the stuff in kick_hub_wq() and allow autosuspend */ usb_autopm_put_interface(intf); hub_put(hub); kcov_remote_stop(); } static const struct usb_device_id hub_id_table[] = { { .match_flags = USB_DEVICE_ID_MATCH_VENDOR | USB_DEVICE_ID_MATCH_PRODUCT | USB_DEVICE_ID_MATCH_INT_CLASS, .idVendor = USB_VENDOR_SMSC, .idProduct = USB_PRODUCT_USB5534B, .bInterfaceClass = USB_CLASS_HUB, .driver_info = HUB_QUIRK_DISABLE_AUTOSUSPEND}, { .match_flags = USB_DEVICE_ID_MATCH_VENDOR | USB_DEVICE_ID_MATCH_PRODUCT, .idVendor = USB_VENDOR_CYPRESS, .idProduct = USB_PRODUCT_CY7C65632, .driver_info = HUB_QUIRK_DISABLE_AUTOSUSPEND}, { .match_flags = USB_DEVICE_ID_MATCH_VENDOR | USB_DEVICE_ID_MATCH_INT_CLASS, .idVendor = USB_VENDOR_GENESYS_LOGIC, .bInterfaceClass = USB_CLASS_HUB, .driver_info = HUB_QUIRK_CHECK_PORT_AUTOSUSPEND}, { .match_flags = USB_DEVICE_ID_MATCH_VENDOR | USB_DEVICE_ID_MATCH_PRODUCT, .idVendor = USB_VENDOR_TEXAS_INSTRUMENTS, .idProduct = USB_PRODUCT_TUSB8041_USB2, .driver_info = HUB_QUIRK_DISABLE_AUTOSUSPEND}, { .match_flags = USB_DEVICE_ID_MATCH_VENDOR | USB_DEVICE_ID_MATCH_PRODUCT, .idVendor = USB_VENDOR_TEXAS_INSTRUMENTS, .idProduct = USB_PRODUCT_TUSB8041_USB3, .driver_info = HUB_QUIRK_DISABLE_AUTOSUSPEND}, { .match_flags = USB_DEVICE_ID_MATCH_VENDOR | USB_DEVICE_ID_MATCH_PRODUCT, .idVendor = USB_VENDOR_MICROCHIP, .idProduct = USB_PRODUCT_USB4913, .driver_info = HUB_QUIRK_REDUCE_FRAME_INTR_BINTERVAL}, { .match_flags = USB_DEVICE_ID_MATCH_VENDOR | USB_DEVICE_ID_MATCH_PRODUCT, .idVendor = USB_VENDOR_MICROCHIP, .idProduct = USB_PRODUCT_USB4914, .driver_info = HUB_QUIRK_REDUCE_FRAME_INTR_BINTERVAL}, { .match_flags = USB_DEVICE_ID_MATCH_VENDOR | USB_DEVICE_ID_MATCH_PRODUCT, .idVendor = USB_VENDOR_MICROCHIP, .idProduct = USB_PRODUCT_USB4915, .driver_info = HUB_QUIRK_REDUCE_FRAME_INTR_BINTERVAL}, { .match_flags = USB_DEVICE_ID_MATCH_DEV_CLASS, .bDeviceClass = USB_CLASS_HUB}, { .match_flags = USB_DEVICE_ID_MATCH_INT_CLASS, .bInterfaceClass = USB_CLASS_HUB}, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, hub_id_table); static struct usb_driver hub_driver = { .name = "hub", .probe = hub_probe, .disconnect = hub_disconnect, .suspend = hub_suspend, .resume = hub_resume, .reset_resume = hub_reset_resume, .pre_reset = hub_pre_reset, .post_reset = hub_post_reset, .unlocked_ioctl = hub_ioctl, .id_table = hub_id_table, .supports_autosuspend = 1, }; int usb_hub_init(void) { if (usb_register(&hub_driver) < 0) { printk(KERN_ERR "%s: can't register hub driver\n", usbcore_name); return -1; } /* * The workqueue needs to be freezable to avoid interfering with * USB-PERSIST port handover. Otherwise it might see that a full-speed * device was gone before the EHCI controller had handed its port * over to the companion full-speed controller. 
*/ hub_wq = alloc_workqueue("usb_hub_wq", WQ_FREEZABLE | WQ_PERCPU, 0); if (hub_wq) return 0; /* Fall through if kernel_thread failed */ usb_deregister(&hub_driver); pr_err("%s: can't allocate workqueue for usb hub\n", usbcore_name); return -1; } void usb_hub_cleanup(void) { destroy_workqueue(hub_wq); /* * Hub resources are freed for us by usb_deregister. It calls * usb_driver_purge on every device which in turn calls that * devices disconnect function if it is using this driver. * The hub_disconnect function takes care of releasing the * individual hub resources. -greg */ usb_deregister(&hub_driver); } /* usb_hub_cleanup() */ /** * hub_hc_release_resources - clear resources used by host controller * @udev: pointer to device being released * * Context: task context, might sleep * * Function releases the host controller resources in correct order before * making any operation on resuming usb device. The host controller resources * allocated for devices in tree should be released starting from the last * usb device in tree toward the root hub. This function is used only during * resuming device when usb device require reinitialization – that is, when * flag udev->reset_resume is set. * * This call is synchronous, and may not be used in an interrupt context. */ static void hub_hc_release_resources(struct usb_device *udev) { struct usb_hub *hub = usb_hub_to_struct_hub(udev); struct usb_hcd *hcd = bus_to_hcd(udev->bus); int i; /* Release up resources for all children before this device */ for (i = 0; i < udev->maxchild; i++) if (hub->ports[i]->child) hub_hc_release_resources(hub->ports[i]->child); if (hcd->driver->reset_device) hcd->driver->reset_device(hcd, udev); } /** * usb_reset_and_verify_device - perform a USB port reset to reinitialize a device * @udev: device to reset (not in SUSPENDED or NOTATTACHED state) * * WARNING - don't use this routine to reset a composite device * (one with multiple interfaces owned by separate drivers)! * Use usb_reset_device() instead. * * Do a port reset, reassign the device's address, and establish its * former operating configuration. If the reset fails, or the device's * descriptors change from their values before the reset, or the original * configuration and altsettings cannot be restored, a flag will be set * telling hub_wq to pretend the device has been disconnected and then * re-connected. All drivers will be unbound, and the device will be * re-enumerated and probed all over again. * * Return: 0 if the reset succeeded, -ENODEV if the device has been * flagged for logical disconnection, or some other negative error code * if the reset wasn't even attempted. * * Note: * The caller must own the device lock and the port lock, the latter is * taken by usb_reset_device(). For example, it's safe to use * usb_reset_device() from a driver probe() routine after downloading * new firmware. For calls that might not occur during probe(), drivers * should lock the device using usb_lock_device_for_reset(). * * Locking exception: This routine may also be called from within an * autoresume handler. Such usage won't conflict with other tasks * holding the device lock because these tasks should always call * usb_autopm_resume_device(), thereby preventing any unwanted * autoresume. The autoresume handler is expected to have already * acquired the port lock before calling this routine. 
*/ static int usb_reset_and_verify_device(struct usb_device *udev) { struct usb_device *parent_hdev = udev->parent; struct usb_hub *parent_hub; struct usb_hcd *hcd = bus_to_hcd(udev->bus); struct usb_device_descriptor descriptor; struct usb_interface *intf; struct usb_host_bos *bos; int i, j, ret = 0; int port1 = udev->portnum; if (udev->state == USB_STATE_NOTATTACHED || udev->state == USB_STATE_SUSPENDED) { dev_dbg(&udev->dev, "device reset not allowed in state %d\n", udev->state); return -EINVAL; } if (!parent_hdev) return -EISDIR; parent_hub = usb_hub_to_struct_hub(parent_hdev); /* Disable USB2 hardware LPM. * It will be re-enabled by the enumeration process. */ usb_disable_usb2_hardware_lpm(udev); bos = udev->bos; udev->bos = NULL; if (udev->reset_resume) hub_hc_release_resources(udev); mutex_lock(hcd->address0_mutex); for (i = 0; i < PORT_INIT_TRIES; ++i) { if (hub_port_stop_enumerate(parent_hub, port1, i)) { ret = -ENODEV; break; } /* ep0 maxpacket size may change; let the HCD know about it. * Other endpoints will be handled by re-enumeration. */ usb_ep0_reinit(udev); ret = hub_port_init(parent_hub, udev, port1, i, &descriptor); if (ret >= 0 || ret == -ENOTCONN || ret == -ENODEV) break; } mutex_unlock(hcd->address0_mutex); if (ret < 0) goto re_enumerate; /* Device might have changed firmware (DFU or similar) */ if (descriptors_changed(udev, &descriptor, bos)) { dev_info(&udev->dev, "device firmware changed\n"); goto re_enumerate; } /* Restore the device's previous configuration */ if (!udev->actconfig) goto done; /* * Some devices can't handle setting default altsetting 0 with a * Set-Interface request. Disable host-side endpoints of those * interfaces here. Enable and reset them back after host has set * its internal endpoint structures during usb_hcd_alloc_bandwith() */ for (i = 0; i < udev->actconfig->desc.bNumInterfaces; i++) { intf = udev->actconfig->interface[i]; if (intf->cur_altsetting->desc.bAlternateSetting == 0) usb_disable_interface(udev, intf, true); } mutex_lock(hcd->bandwidth_mutex); ret = usb_hcd_alloc_bandwidth(udev, udev->actconfig, NULL, NULL); if (ret < 0) { dev_warn(&udev->dev, "Busted HC? Not enough HCD resources for " "old configuration.\n"); mutex_unlock(hcd->bandwidth_mutex); goto re_enumerate; } ret = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), USB_REQ_SET_CONFIGURATION, 0, udev->actconfig->desc.bConfigurationValue, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); if (ret < 0) { dev_err(&udev->dev, "can't restore configuration #%d (error=%d)\n", udev->actconfig->desc.bConfigurationValue, ret); mutex_unlock(hcd->bandwidth_mutex); goto re_enumerate; } mutex_unlock(hcd->bandwidth_mutex); usb_set_device_state(udev, USB_STATE_CONFIGURED); /* Put interfaces back into the same altsettings as before. * Don't bother to send the Set-Interface request for interfaces * that were already in altsetting 0; besides being unnecessary, * many devices can't handle it. Instead just reset the host-side * endpoint state. */ for (i = 0; i < udev->actconfig->desc.bNumInterfaces; i++) { struct usb_host_config *config = udev->actconfig; struct usb_interface_descriptor *desc; intf = config->interface[i]; desc = &intf->cur_altsetting->desc; if (desc->bAlternateSetting == 0) { usb_enable_interface(udev, intf, true); ret = 0; } else { /* Let the bandwidth allocation function know that this * device has been reset, and it will have to use * alternate setting 0 as the current alternate setting. 
*/ intf->resetting_device = 1; ret = usb_set_interface(udev, desc->bInterfaceNumber, desc->bAlternateSetting); intf->resetting_device = 0; } if (ret < 0) { dev_err(&udev->dev, "failed to restore interface %d " "altsetting %d (error=%d)\n", desc->bInterfaceNumber, desc->bAlternateSetting, ret); goto re_enumerate; } /* Resetting also frees any allocated streams */ for (j = 0; j < intf->cur_altsetting->desc.bNumEndpoints; j++) intf->cur_altsetting->endpoint[j].streams = 0; } done: /* Now that the alt settings are re-installed, enable LTM and LPM. */ usb_enable_usb2_hardware_lpm(udev); usb_unlocked_enable_lpm(udev); usb_enable_ltm(udev); usb_release_bos_descriptor(udev); udev->bos = bos; return 0; re_enumerate: usb_release_bos_descriptor(udev); udev->bos = bos; hub_port_logical_disconnect(parent_hub, port1); return -ENODEV; } /** * usb_reset_device - warn interface drivers and perform a USB port reset * @udev: device to reset (not in NOTATTACHED state) * * Warns all drivers bound to registered interfaces (using their pre_reset * method), performs the port reset, and then lets the drivers know that * the reset is over (using their post_reset method). * * Return: The same as for usb_reset_and_verify_device(). * However, if a reset is already in progress (for instance, if a * driver doesn't have pre_reset() or post_reset() callbacks, and while * being unbound or re-bound during the ongoing reset its disconnect() * or probe() routine tries to perform a second, nested reset), the * routine returns -EINPROGRESS. * * Note: * The caller must own the device lock. For example, it's safe to use * this from a driver probe() routine after downloading new firmware. * For calls that might not occur during probe(), drivers should lock * the device using usb_lock_device_for_reset(). * * If an interface is currently being probed or disconnected, we assume * its driver knows how to handle resets. For all other interfaces, * if the driver doesn't have pre_reset and post_reset methods then * we attempt to unbind it and rebind afterward. */ int usb_reset_device(struct usb_device *udev) { int ret; int i; unsigned int noio_flag; struct usb_port *port_dev; struct usb_host_config *config = udev->actconfig; struct usb_hub *hub = usb_hub_to_struct_hub(udev->parent); if (udev->state == USB_STATE_NOTATTACHED) { dev_dbg(&udev->dev, "device reset not allowed in state %d\n", udev->state); return -EINVAL; } if (!udev->parent) { /* this requires hcd-specific logic; see ohci_restart() */ dev_dbg(&udev->dev, "%s for root hub!\n", __func__); return -EISDIR; } if (udev->reset_in_progress) return -EINPROGRESS; udev->reset_in_progress = 1; port_dev = hub->ports[udev->portnum - 1]; /* * Don't allocate memory with GFP_KERNEL in current * context to avoid possible deadlock if usb mass * storage interface or usbnet interface(iSCSI case) * is included in current configuration. The easist * approach is to do it for every device reset, * because the device 'memalloc_noio' flag may have * not been set before reseting the usb device. 
*/ noio_flag = memalloc_noio_save(); /* Prevent autosuspend during the reset */ usb_autoresume_device(udev); if (config) { for (i = 0; i < config->desc.bNumInterfaces; ++i) { struct usb_interface *cintf = config->interface[i]; struct usb_driver *drv; int unbind = 0; if (cintf->dev.driver) { drv = to_usb_driver(cintf->dev.driver); if (drv->pre_reset && drv->post_reset) unbind = (drv->pre_reset)(cintf); else if (cintf->condition == USB_INTERFACE_BOUND) unbind = 1; if (unbind) usb_forced_unbind_intf(cintf); } } } usb_lock_port(port_dev); ret = usb_reset_and_verify_device(udev); usb_unlock_port(port_dev); if (config) { for (i = config->desc.bNumInterfaces - 1; i >= 0; --i) { struct usb_interface *cintf = config->interface[i]; struct usb_driver *drv; int rebind = cintf->needs_binding; if (!rebind && cintf->dev.driver) { drv = to_usb_driver(cintf->dev.driver); if (drv->post_reset) rebind = (drv->post_reset)(cintf); else if (cintf->condition == USB_INTERFACE_BOUND) rebind = 1; if (rebind) cintf->needs_binding = 1; } } /* If the reset failed, hub_wq will unbind drivers later */ if (ret == 0) usb_unbind_and_rebind_marked_interfaces(udev); } usb_autosuspend_device(udev); memalloc_noio_restore(noio_flag); udev->reset_in_progress = 0; return ret; } EXPORT_SYMBOL_GPL(usb_reset_device); /** * usb_queue_reset_device - Reset a USB device from an atomic context * @iface: USB interface belonging to the device to reset * * This function can be used to reset a USB device from an atomic * context, where usb_reset_device() won't work (as it blocks). * * Doing a reset via this method is functionally equivalent to calling * usb_reset_device(), except for the fact that it is delayed to a * workqueue. This means that any drivers bound to other interfaces * might be unbound, as well as users from usbfs in user space. * * Corner cases: * * - Scheduling two resets at the same time from two different drivers * attached to two different interfaces of the same device is * possible; depending on how the driver attached to each interface * handles ->pre_reset(), the second reset might happen or not. * * - If the reset is delayed so long that the interface is unbound from * its driver, the reset will be skipped. * * - This function can be called during .probe(). It can also be called * during .disconnect(), but doing so is pointless because the reset * will not occur. If you really want to reset the device during * .disconnect(), call usb_reset_device() directly -- but watch out * for nested unbinding issues! */ void usb_queue_reset_device(struct usb_interface *iface) { if (schedule_work(&iface->reset_ws)) usb_get_intf(iface); } EXPORT_SYMBOL_GPL(usb_queue_reset_device); /** * usb_hub_find_child - Get the pointer of child device * attached to the port which is specified by @port1. * @hdev: USB device belonging to the usb hub * @port1: port num to indicate which port the child device * is attached to. * * USB drivers call this function to get hub's child device * pointer. * * Return: %NULL if input param is invalid and * child's usb_device pointer if non-NULL. 
*/ struct usb_device *usb_hub_find_child(struct usb_device *hdev, int port1) { struct usb_hub *hub = usb_hub_to_struct_hub(hdev); if (port1 < 1 || port1 > hdev->maxchild) return NULL; return hub->ports[port1 - 1]->child; } EXPORT_SYMBOL_GPL(usb_hub_find_child); void usb_hub_adjust_deviceremovable(struct usb_device *hdev, struct usb_hub_descriptor *desc) { struct usb_hub *hub = usb_hub_to_struct_hub(hdev); enum usb_port_connect_type connect_type; int i; if (!hub) return; if (!hub_is_superspeed(hdev)) { for (i = 1; i <= hdev->maxchild; i++) { struct usb_port *port_dev = hub->ports[i - 1]; connect_type = port_dev->connect_type; if (connect_type == USB_PORT_CONNECT_TYPE_HARD_WIRED) { u8 mask = 1 << (i%8); if (!(desc->u.hs.DeviceRemovable[i/8] & mask)) { dev_dbg(&port_dev->dev, "DeviceRemovable is changed to 1 according to platform information.\n"); desc->u.hs.DeviceRemovable[i/8] |= mask; } } } } else { u16 port_removable = le16_to_cpu(desc->u.ss.DeviceRemovable); for (i = 1; i <= hdev->maxchild; i++) { struct usb_port *port_dev = hub->ports[i - 1]; connect_type = port_dev->connect_type; if (connect_type == USB_PORT_CONNECT_TYPE_HARD_WIRED) { u16 mask = 1 << i; if (!(port_removable & mask)) { dev_dbg(&port_dev->dev, "DeviceRemovable is changed to 1 according to platform information.\n"); port_removable |= mask; } } } desc->u.ss.DeviceRemovable = cpu_to_le16(port_removable); } } #ifdef CONFIG_ACPI /** * usb_get_hub_port_acpi_handle - Get the usb port's acpi handle * @hdev: USB device belonging to the usb hub * @port1: port num of the port * * Return: Port's acpi handle if successful, %NULL if params are * invalid. */ acpi_handle usb_get_hub_port_acpi_handle(struct usb_device *hdev, int port1) { struct usb_hub *hub = usb_hub_to_struct_hub(hdev); if (!hub) return NULL; return ACPI_HANDLE(&hub->ports[port1 - 1]->dev); } #endif |
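/*
 * Illustrative sketch, not part of hub.c: one way a caller could walk a
 * hub's ports with the exported usb_hub_find_child() helper above. The
 * function name example_list_hub_children() and its placement are
 * hypothetical; the caller is assumed to hold the hub's device lock so
 * the child pointers cannot change underneath it.
 */
static void example_list_hub_children(struct usb_device *hdev)
{
	int port1;

	for (port1 = 1; port1 <= hdev->maxchild; port1++) {
		struct usb_device *child = usb_hub_find_child(hdev, port1);

		if (child)
			dev_info(&hdev->dev, "port %d: %s\n",
				 port1, dev_name(&child->dev));
	}
}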
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM readahead #if !defined(_TRACE_FILEMAP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_READAHEAD_H #include <linux/types.h> #include <linux/tracepoint.h> #include <linux/mm.h> #include <linux/fs.h> #include <linux/pagemap.h> TRACE_EVENT(page_cache_ra_unbounded, TP_PROTO(struct inode *inode, pgoff_t index, unsigned long nr_to_read, unsigned long lookahead_size), TP_ARGS(inode, index, nr_to_read, lookahead_size), TP_STRUCT__entry( __field(u64, i_ino) __field(dev_t, s_dev) __field(pgoff_t, index) __field(unsigned long, nr_to_read) __field(unsigned long, lookahead_size) ), TP_fast_assign( __entry->i_ino = inode->i_ino; __entry->s_dev = inode->i_sb->s_dev; __entry->index = index; __entry->nr_to_read = nr_to_read; __entry->lookahead_size = lookahead_size; ), TP_printk( "dev=%d:%d ino=%llx index=%lu nr_to_read=%lu lookahead_size=%lu", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->index, __entry->nr_to_read, __entry->lookahead_size ) ); TRACE_EVENT(page_cache_ra_order, TP_PROTO(struct inode *inode, pgoff_t index, struct file_ra_state *ra), TP_ARGS(inode, index, ra), TP_STRUCT__entry( __field(u64, i_ino) __field(dev_t, s_dev) __field(pgoff_t, index) __field(unsigned int, order) __field(unsigned int, size) __field(unsigned int, async_size) __field(unsigned int, ra_pages) ), TP_fast_assign( __entry->i_ino = inode->i_ino; __entry->s_dev = inode->i_sb->s_dev; __entry->index = index; __entry->order = ra->order; __entry->size = ra->size; __entry->async_size = ra->async_size; __entry->ra_pages = ra->ra_pages; ), TP_printk( "dev=%d:%d ino=%llx index=%lu order=%u size=%u async_size=%u ra_pages=%u", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->index, __entry->order, __entry->size, __entry->async_size, __entry->ra_pages ) ); DECLARE_EVENT_CLASS(page_cache_ra_op, TP_PROTO(struct inode *inode, pgoff_t index, struct file_ra_state *ra, unsigned long req_count), TP_ARGS(inode, index, ra, req_count), TP_STRUCT__entry( __field(u64, i_ino) __field(loff_t, prev_pos) __field(pgoff_t, index) __field(unsigned long, req_count) __field(dev_t, s_dev) __field(unsigned int, order) __field(unsigned int, size) __field(unsigned int, async_size) __field(unsigned int, ra_pages) __field(unsigned int, mmap_miss) ), TP_fast_assign( __entry->i_ino = inode->i_ino; __entry->s_dev = inode->i_sb->s_dev; __entry->index = index; __entry->order = ra->order; __entry->size = ra->size; __entry->async_size = ra->async_size; __entry->ra_pages = ra->ra_pages; __entry->mmap_miss = ra->mmap_miss; __entry->prev_pos = ra->prev_pos; __entry->req_count = req_count; ), TP_printk( "dev=%d:%d ino=%llx index=%lu req_count=%lu order=%u size=%u async_size=%u ra_pages=%u mmap_miss=%u prev_pos=%lld", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->index, __entry->req_count, __entry->order, __entry->size, __entry->async_size, __entry->ra_pages, __entry->mmap_miss, __entry->prev_pos ) ); DEFINE_EVENT(page_cache_ra_op, page_cache_sync_ra, TP_PROTO(struct inode *inode, pgoff_t index, 
struct file_ra_state *ra, unsigned long req_count), TP_ARGS(inode, index, ra, req_count) ); DEFINE_EVENT(page_cache_ra_op, page_cache_async_ra, TP_PROTO(struct inode *inode, pgoff_t index, struct file_ra_state *ra, unsigned long req_count), TP_ARGS(inode, index, ra, req_count) ); #endif /* _TRACE_FILEMAP_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
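/*
 * Illustrative sketch, not part of this header: how one of the events
 * defined above would be emitted. TRACE_EVENT()/DEFINE_EVENT() generate a
 * trace_<name>() helper matching TP_PROTO(), so a (hypothetical) call site
 * in the readahead code would look roughly like this; the helper compiles
 * to a no-op unless the tracepoint is enabled at runtime.
 */
static void example_emit_ra_event(struct inode *inode, pgoff_t index,
				  unsigned long nr_to_read,
				  unsigned long lookahead_size)
{
	trace_page_cache_ra_unbounded(inode, index, nr_to_read,
				      lookahead_size);
}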
| 36 5 39 1 2 3 3 2 3 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 | // SPDX-License-Identifier: GPL-2.0 /* * Fast batching percpu counters. */ #include <linux/percpu_counter.h> #include <linux/mutex.h> #include <linux/init.h> #include <linux/cpu.h> #include <linux/module.h> #include <linux/debugobjects.h> #ifdef CONFIG_HOTPLUG_CPU static LIST_HEAD(percpu_counters); static DEFINE_SPINLOCK(percpu_counters_lock); #endif #ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER static const struct debug_obj_descr percpu_counter_debug_descr; static bool percpu_counter_fixup_free(void *addr, enum debug_obj_state state) { struct percpu_counter *fbc = addr; switch (state) { case ODEBUG_STATE_ACTIVE: percpu_counter_destroy(fbc); debug_object_free(fbc, &percpu_counter_debug_descr); return true; default: return false; } } static const struct debug_obj_descr percpu_counter_debug_descr = { .name = "percpu_counter", .fixup_free = percpu_counter_fixup_free, }; static inline void debug_percpu_counter_activate(struct percpu_counter *fbc) { debug_object_init(fbc, &percpu_counter_debug_descr); debug_object_activate(fbc, &percpu_counter_debug_descr); } static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc) { debug_object_deactivate(fbc, &percpu_counter_debug_descr); debug_object_free(fbc, &percpu_counter_debug_descr); } #else /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */ static inline void debug_percpu_counter_activate(struct percpu_counter *fbc) { } static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc) { } #endif /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */ void percpu_counter_set(struct percpu_counter *fbc, s64 amount) { int cpu; unsigned long flags; raw_spin_lock_irqsave(&fbc->lock, flags); for_each_possible_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); *pcount = 0; } fbc->count = amount; raw_spin_unlock_irqrestore(&fbc->lock, flags); } EXPORT_SYMBOL(percpu_counter_set); /* * Add to a counter while respecting batch size. 
* * There are 2 implementations, both dealing with the following problem: * * The decision slow path/fast path and the actual update must be atomic. * Otherwise a call in process context could check the current values and * decide that the fast path can be used. If now an interrupt occurs before * the this_cpu_add(), and the interrupt updates this_cpu(*fbc->counters), * then the this_cpu_add() that is executed after the interrupt has completed * can produce values larger than "batch" or even overflows. */ #ifdef CONFIG_HAVE_CMPXCHG_LOCAL /* * Safety against interrupts is achieved in 2 ways: * 1. the fast path uses local cmpxchg (note: no lock prefix) * 2. the slow path operates with interrupts disabled */ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) { s64 count; unsigned long flags; count = this_cpu_read(*fbc->counters); do { if (unlikely(abs(count + amount) >= batch)) { raw_spin_lock_irqsave(&fbc->lock, flags); /* * Note: by now we might have migrated to another CPU * or the value might have changed. */ count = __this_cpu_read(*fbc->counters); fbc->count += count + amount; __this_cpu_sub(*fbc->counters, count); raw_spin_unlock_irqrestore(&fbc->lock, flags); return; } } while (!this_cpu_try_cmpxchg(*fbc->counters, &count, count + amount)); } #else /* * local_irq_save() is used to make the function irq safe: * - The slow path would be ok as protected by an irq-safe spinlock. * - this_cpu_add would be ok as it is irq-safe by definition. */ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) { s64 count; unsigned long flags; local_irq_save(flags); count = __this_cpu_read(*fbc->counters) + amount; if (abs(count) >= batch) { raw_spin_lock(&fbc->lock); fbc->count += count; __this_cpu_sub(*fbc->counters, count - amount); raw_spin_unlock(&fbc->lock); } else { this_cpu_add(*fbc->counters, amount); } local_irq_restore(flags); } #endif EXPORT_SYMBOL(percpu_counter_add_batch); /* * For percpu_counter with a big batch, the devication of its count could * be big, and there is requirement to reduce the deviation, like when the * counter's batch could be runtime decreased to get a better accuracy, * which can be achieved by running this sync function on each CPU. */ void percpu_counter_sync(struct percpu_counter *fbc) { unsigned long flags; s64 count; raw_spin_lock_irqsave(&fbc->lock, flags); count = __this_cpu_read(*fbc->counters); fbc->count += count; __this_cpu_sub(*fbc->counters, count); raw_spin_unlock_irqrestore(&fbc->lock, flags); } EXPORT_SYMBOL(percpu_counter_sync); /* * Add up all the per-cpu counts, return the result. This is a more accurate * but much slower version of percpu_counter_read_positive(). * * We use the cpu mask of (cpu_online_mask | cpu_dying_mask) to capture sums * from CPUs that are in the process of being taken offline. Dying cpus have * been removed from the online mask, but may not have had the hotplug dead * notifier called to fold the percpu count back into the global counter sum. * By including dying CPUs in the iteration mask, we avoid this race condition * so __percpu_counter_sum() just does the right thing when CPUs are being taken * offline. 
*/ s64 __percpu_counter_sum(struct percpu_counter *fbc) { s64 ret; int cpu; unsigned long flags; raw_spin_lock_irqsave(&fbc->lock, flags); ret = fbc->count; for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; } raw_spin_unlock_irqrestore(&fbc->lock, flags); return ret; } EXPORT_SYMBOL(__percpu_counter_sum); int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount, gfp_t gfp, u32 nr_counters, struct lock_class_key *key) { unsigned long flags __maybe_unused; size_t counter_size; s32 __percpu *counters; u32 i; counter_size = ALIGN(sizeof(*counters), __alignof__(*counters)); counters = __alloc_percpu_gfp(nr_counters * counter_size, __alignof__(*counters), gfp); if (!counters) { fbc[0].counters = NULL; return -ENOMEM; } for (i = 0; i < nr_counters; i++) { raw_spin_lock_init(&fbc[i].lock); lockdep_set_class(&fbc[i].lock, key); #ifdef CONFIG_HOTPLUG_CPU INIT_LIST_HEAD(&fbc[i].list); #endif fbc[i].count = amount; fbc[i].counters = (void __percpu *)counters + i * counter_size; debug_percpu_counter_activate(&fbc[i]); } #ifdef CONFIG_HOTPLUG_CPU spin_lock_irqsave(&percpu_counters_lock, flags); for (i = 0; i < nr_counters; i++) list_add(&fbc[i].list, &percpu_counters); spin_unlock_irqrestore(&percpu_counters_lock, flags); #endif return 0; } EXPORT_SYMBOL(__percpu_counter_init_many); void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters) { unsigned long flags __maybe_unused; u32 i; if (WARN_ON_ONCE(!fbc)) return; if (!fbc[0].counters) return; for (i = 0; i < nr_counters; i++) debug_percpu_counter_deactivate(&fbc[i]); #ifdef CONFIG_HOTPLUG_CPU spin_lock_irqsave(&percpu_counters_lock, flags); for (i = 0; i < nr_counters; i++) list_del(&fbc[i].list); spin_unlock_irqrestore(&percpu_counters_lock, flags); #endif free_percpu(fbc[0].counters); for (i = 0; i < nr_counters; i++) fbc[i].counters = NULL; } EXPORT_SYMBOL(percpu_counter_destroy_many); int percpu_counter_batch __read_mostly = 32; EXPORT_SYMBOL(percpu_counter_batch); static int compute_batch_value(unsigned int cpu) { int nr = num_online_cpus(); percpu_counter_batch = max(32, nr*2); return 0; } static int percpu_counter_cpu_dead(unsigned int cpu) { #ifdef CONFIG_HOTPLUG_CPU struct percpu_counter *fbc; compute_batch_value(cpu); spin_lock_irq(&percpu_counters_lock); list_for_each_entry(fbc, &percpu_counters, list) { s32 *pcount; raw_spin_lock(&fbc->lock); pcount = per_cpu_ptr(fbc->counters, cpu); fbc->count += *pcount; *pcount = 0; raw_spin_unlock(&fbc->lock); } spin_unlock_irq(&percpu_counters_lock); #endif return 0; } /* * Compare counter against given value. * Return 1 if greater, 0 if equal and -1 if less */ int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch) { s64 count; count = percpu_counter_read(fbc); /* Check to see if rough count will be sufficient for comparison */ if (abs(count - rhs) > (batch * num_online_cpus())) { if (count > rhs) return 1; else return -1; } /* Need to use precise count */ count = percpu_counter_sum(fbc); if (count > rhs) return 1; else if (count < rhs) return -1; else return 0; } EXPORT_SYMBOL(__percpu_counter_compare); /* * Compare counter, and add amount if total is: less than or equal to limit if * amount is positive, or greater than or equal to limit if amount is negative. * Return true if amount is added, or false if total would be beyond the limit. * * Negative limit is allowed, but unusual. 
* When negative amounts (subs) are given to percpu_counter_limited_add(), * the limit would most naturally be 0 - but other limits are also allowed. * * Overflow beyond S64_MAX is not allowed for: counter, limit and amount * are all assumed to be sane (far from S64_MIN and S64_MAX). */ bool __percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount, s32 batch) { s64 count; s64 unknown; unsigned long flags; bool good = false; if (amount == 0) return true; local_irq_save(flags); unknown = batch * num_online_cpus(); count = __this_cpu_read(*fbc->counters); /* Skip taking the lock when safe */ if (abs(count + amount) <= batch && ((amount > 0 && fbc->count + unknown <= limit) || (amount < 0 && fbc->count - unknown >= limit))) { this_cpu_add(*fbc->counters, amount); local_irq_restore(flags); return true; } raw_spin_lock(&fbc->lock); count = fbc->count + amount; /* Skip percpu_counter_sum() when safe */ if (amount > 0) { if (count - unknown > limit) goto out; if (count + unknown <= limit) good = true; } else { if (count + unknown < limit) goto out; if (count - unknown >= limit) good = true; } if (!good) { s32 *pcount; int cpu; for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) { pcount = per_cpu_ptr(fbc->counters, cpu); count += *pcount; } if (amount > 0) { if (count > limit) goto out; } else { if (count < limit) goto out; } good = true; } count = __this_cpu_read(*fbc->counters); fbc->count += count + amount; __this_cpu_sub(*fbc->counters, count); out: raw_spin_unlock(&fbc->lock); local_irq_restore(flags); return good; } static int __init percpu_counter_startup(void) { int ret; ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "lib/percpu_cnt:online", compute_batch_value, NULL); WARN_ON(ret < 0); ret = cpuhp_setup_state_nocalls(CPUHP_PERCPU_CNT_DEAD, "lib/percpu_cnt:dead", NULL, percpu_counter_cpu_dead); WARN_ON(ret < 0); return 0; } module_init(percpu_counter_startup); |
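/*
 * Illustrative sketch, not part of this file: typical use of the
 * percpu_counter API implemented above, from a hypothetical caller.
 * percpu_counter_add() takes the cheap per-cpu fast path and only folds
 * into fbc->count once a local delta reaches the batch size;
 * percpu_counter_sum() walks all online and dying CPUs for an exact value
 * when the approximate percpu_counter_read() is not good enough.
 */
static struct percpu_counter example_counter;

static int example_counter_use(void)
{
	int err = percpu_counter_init(&example_counter, 0, GFP_KERNEL);

	if (err)
		return err;

	percpu_counter_add(&example_counter, 1);	/* fast, approximate */
	pr_info("approx %lld, exact %lld\n",
		percpu_counter_read(&example_counter),
		percpu_counter_sum(&example_counter));

	percpu_counter_destroy(&example_counter);
	return 0;
}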
2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_H #define _LINUX_SCHED_H /* * Define 'struct task_struct' and provide the main scheduler * APIs (schedule(), wakeup variants, etc.) */ #include <uapi/linux/sched.h> #include <asm/current.h> #include <asm/processor.h> #include <linux/thread_info.h> #include <linux/preempt.h> #include <linux/cpumask_types.h> #include <linux/cache.h> #include <linux/irqflags_types.h> #include <linux/smp_types.h> #include <linux/pid_types.h> #include <linux/sem_types.h> #include <linux/shm.h> #include <linux/kmsan_types.h> #include <linux/mutex_types.h> #include <linux/plist_types.h> #include <linux/hrtimer_types.h> #include <linux/timer_types.h> #include <linux/seccomp_types.h> #include <linux/nodemask_types.h> #include <linux/refcount_types.h> #include <linux/resource.h> #include <linux/latencytop.h> #include <linux/sched/prio.h> #include <linux/sched/types.h> #include <linux/signal_types.h> #include <linux/spinlock.h> #include <linux/syscall_user_dispatch_types.h> #include <linux/mm_types_task.h> #include <linux/netdevice_xmit.h> #include <linux/task_io_accounting.h> #include <linux/posix-timers_types.h> #include <linux/restart_block.h> #include <linux/rseq_types.h> #include <linux/seqlock_types.h> #include <linux/kcsan.h> #include <linux/rv.h> #include <linux/uidgid_types.h> #include <linux/tracepoint-defs.h> #include <linux/unwind_deferred_types.h> #include <asm/kmap_size.h> #include <linux/time64.h> #ifndef COMPILE_OFFSETS #include <generated/rq-offsets.h> #endif /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; struct bio_list; struct blk_plug; struct bpf_local_storage; struct bpf_run_ctx; struct bpf_net_context; struct capture_control; struct cfs_rq; struct fs_struct; struct futex_pi_state; struct io_context; struct io_uring_task; struct mempolicy; struct nameidata; struct nsproxy; struct perf_event_context; struct perf_ctx_data; struct pid_namespace; struct pipe_inode_info; struct rcu_node; struct reclaim_state; struct robust_list_head; struct root_domain; struct rq; struct sched_attr; struct sched_dl_entity; struct seq_file; struct sighand_struct; struct signal_struct; struct task_delay_info; struct task_group; struct task_struct; struct timespec64; struct user_event_mm; #include <linux/sched/ext.h> /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). * * We have two separate sets of flags: task->__state * is about runnability, while task->exit_state are * about the task exiting. Confusing, but this way * modifying one set can't modify the other one by * mistake. 
*/ /* Used in tsk->__state: */ #define TASK_RUNNING 0x00000000 #define TASK_INTERRUPTIBLE 0x00000001 #define TASK_UNINTERRUPTIBLE 0x00000002 #define __TASK_STOPPED 0x00000004 #define __TASK_TRACED 0x00000008 /* Used in tsk->exit_state: */ #define EXIT_DEAD 0x00000010 #define EXIT_ZOMBIE 0x00000020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* Used in tsk->__state again: */ #define TASK_PARKED 0x00000040 #define TASK_DEAD 0x00000080 #define TASK_WAKEKILL 0x00000100 #define TASK_WAKING 0x00000200 #define TASK_NOLOAD 0x00000400 #define TASK_NEW 0x00000800 #define TASK_RTLOCK_WAIT 0x00001000 #define TASK_FREEZABLE 0x00002000 #define __TASK_FREEZABLE_UNSAFE (0x00004000 * IS_ENABLED(CONFIG_LOCKDEP)) #define TASK_FROZEN 0x00008000 #define TASK_STATE_MAX 0x00010000 #define TASK_ANY (TASK_STATE_MAX-1) /* * DO NOT ADD ANY NEW USERS ! */ #define TASK_FREEZABLE_UNSAFE (TASK_FREEZABLE | __TASK_FREEZABLE_UNSAFE) /* Convenience macros for the sake of set_current_state: */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) #define TASK_TRACED __TASK_TRACED #define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD) /* Convenience macros for the sake of wake_up(): */ #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) /* get_task_state(): */ #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ TASK_PARKED) #define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING) #define task_is_traced(task) ((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0) #define task_is_stopped(task) ((READ_ONCE(task->jobctl) & JOBCTL_STOPPED) != 0) #define task_is_stopped_or_traced(task) ((READ_ONCE(task->jobctl) & (JOBCTL_STOPPED | JOBCTL_TRACED)) != 0) /* * Special states are those that do not use the normal wait-loop pattern. See * the comment with set_special_state(). 
*/ #define is_special_task_state(state) \ ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | \ TASK_DEAD | TASK_FROZEN)) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP # define debug_normal_state_change(state_value) \ do { \ WARN_ON_ONCE(is_special_task_state(state_value)); \ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_special_state_change(state_value) \ do { \ WARN_ON_ONCE(!is_special_task_state(state_value)); \ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_rtlock_wait_set_state() \ do { \ current->saved_state_change = current->task_state_change;\ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_rtlock_wait_restore_state() \ do { \ current->task_state_change = current->saved_state_change;\ } while (0) #else # define debug_normal_state_change(cond) do { } while (0) # define debug_special_state_change(cond) do { } while (0) # define debug_rtlock_wait_set_state() do { } while (0) # define debug_rtlock_wait_restore_state() do { } while (0) #endif #define trace_set_current_state(state_value) \ do { \ if (tracepoint_enabled(sched_set_state_tp)) \ __trace_set_current_state(state_value); \ } while (0) /* * set_current_state() includes a barrier so that the write of current->__state * is correctly serialised wrt the caller's subsequent test of whether to * actually sleep: * * for (;;) { * set_current_state(TASK_UNINTERRUPTIBLE); * if (CONDITION) * break; * * schedule(); * } * __set_current_state(TASK_RUNNING); * * If the caller does not need such serialisation (because, for instance, the * CONDITION test and condition change and wakeup are under the same lock) then * use __set_current_state(). * * The above is typically ordered against the wakeup, which does: * * CONDITION = 1; * wake_up_state(p, TASK_UNINTERRUPTIBLE); * * where wake_up_state()/try_to_wake_up() executes a full memory barrier before * accessing p->__state. * * Wakeup will do: if (@state & p->__state) p->__state = TASK_RUNNING, that is, * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING). * * However, with slightly different timing the wakeup TASK_RUNNING store can * also collide with the TASK_UNINTERRUPTIBLE store. Losing that store is not * a problem either because that will result in one extra go around the loop * and our @cond test will save the day. * * Also see the comments of try_to_wake_up(). */ #define __set_current_state(state_value) \ do { \ debug_normal_state_change((state_value)); \ trace_set_current_state(state_value); \ WRITE_ONCE(current->__state, (state_value)); \ } while (0) #define set_current_state(state_value) \ do { \ debug_normal_state_change((state_value)); \ trace_set_current_state(state_value); \ smp_store_mb(current->__state, (state_value)); \ } while (0) /* * set_special_state() should be used for those states when the blocking task * can not use the regular condition based wait-loop. In that case we must * serialize against wakeups such that any possible in-flight TASK_RUNNING * stores will not collide with our state change. 
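 *
 * A minimal usage sketch (illustrative, loosely mirroring do_signal_stop()):
 *
 *	set_special_state(TASK_STOPPED);
 *	spin_unlock_irq(&current->sighand->siglock);
 *	schedule();
 *
 * Note the absence of a condition re-check loop around schedule().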
 */
#define set_special_state(state_value)					\
	do {								\
		unsigned long flags; /* may shadow */			\
									\
		raw_spin_lock_irqsave(&current->pi_lock, flags);	\
		debug_special_state_change((state_value));		\
		trace_set_current_state(state_value);			\
		WRITE_ONCE(current->__state, (state_value));		\
		raw_spin_unlock_irqrestore(&current->pi_lock, flags);	\
	} while (0)

/*
 * PREEMPT_RT specific variants for "sleeping" spin/rwlocks
 *
 * RT's spin/rwlock substitutions are state preserving. The state of the
 * task when blocking on the lock is saved in task_struct::saved_state and
 * restored after the lock has been acquired. These operations are
 * serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT
 * lock related wakeups while the task is blocked on the lock are
 * redirected to operate on task_struct::saved_state to ensure that these
 * are not dropped. On restore task_struct::saved_state is set to
 * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail.
 *
 * The lock operation looks like this:
 *
 *	current_save_and_set_rtlock_wait_state();
 *	for (;;) {
 *		if (try_lock())
 *			break;
 *		raw_spin_unlock_irq(&lock->wait_lock);
 *		schedule_rtlock();
 *		raw_spin_lock_irq(&lock->wait_lock);
 *		set_current_state(TASK_RTLOCK_WAIT);
 *	}
 *	current_restore_rtlock_saved_state();
 */
#define current_save_and_set_rtlock_wait_state()			\
	do {								\
		lockdep_assert_irqs_disabled();				\
		raw_spin_lock(&current->pi_lock);			\
		current->saved_state = current->__state;		\
		debug_rtlock_wait_set_state();				\
		trace_set_current_state(TASK_RTLOCK_WAIT);		\
		WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT);		\
		raw_spin_unlock(&current->pi_lock);			\
	} while (0);

#define current_restore_rtlock_saved_state()				\
	do {								\
		lockdep_assert_irqs_disabled();				\
		raw_spin_lock(&current->pi_lock);			\
		debug_rtlock_wait_restore_state();			\
		trace_set_current_state(current->saved_state);		\
		WRITE_ONCE(current->__state, current->saved_state);	\
		current->saved_state = TASK_RUNNING;			\
		raw_spin_unlock(&current->pi_lock);			\
	} while (0);

#define get_current_state()	READ_ONCE(current->__state)

/*
 * Define the task command name length as enum, then it can be visible to
 * BPF programs.
 */
enum {
	TASK_COMM_LEN = 16,
};

extern void sched_tick(void);

#define	MAX_SCHEDULE_TIMEOUT		LONG_MAX

extern long schedule_timeout(long timeout);
extern long schedule_timeout_interruptible(long timeout);
extern long schedule_timeout_killable(long timeout);
extern long schedule_timeout_uninterruptible(long timeout);
extern long schedule_timeout_idle(long timeout);
asmlinkage void schedule(void);
extern void schedule_preempt_disabled(void);
asmlinkage void preempt_schedule_irq(void);

#ifdef CONFIG_PREEMPT_RT
extern void schedule_rtlock(void);
#endif

extern int __must_check io_schedule_prepare(void);
extern void io_schedule_finish(int token);
extern long io_schedule_timeout(long timeout);
extern void io_schedule(void);

/* wrapper functions to trace from this header file */
DECLARE_TRACEPOINT(sched_set_state_tp);
extern void __trace_set_current_state(int state_value);
DECLARE_TRACEPOINT(sched_set_need_resched_tp);
extern void __trace_set_need_resched(struct task_struct *curr, int tif);

/**
 * struct prev_cputime - snapshot of system and user cputime
 * @utime: time spent in user mode
 * @stime: time spent in system mode
 * @lock: protects the above two fields
 *
 * Stores previous user/system time values such that we can guarantee
 * monotonicity.
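 *
 * Illustration (made-up numbers): if a previous sample reported
 * utime=100ms and a later raw split of the same total would yield
 * utime=95ms, cputime_adjust() uses the values cached here to keep
 * reporting utime >= 100ms, so times exposed to userspace never move
 * backwards.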
*/ struct prev_cputime { #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE u64 utime; u64 stime; raw_spinlock_t lock; #endif }; enum vtime_state { /* Task is sleeping or running in a CPU with VTIME inactive: */ VTIME_INACTIVE = 0, /* Task is idle */ VTIME_IDLE, /* Task runs in kernelspace in a CPU with VTIME active: */ VTIME_SYS, /* Task runs in userspace in a CPU with VTIME active: */ VTIME_USER, /* Task runs as guests in a CPU with VTIME active: */ VTIME_GUEST, }; struct vtime { seqcount_t seqcount; unsigned long long starttime; enum vtime_state state; unsigned int cpu; u64 utime; u64 stime; u64 gtime; }; /* * Utilization clamp constraints. * @UCLAMP_MIN: Minimum utilization * @UCLAMP_MAX: Maximum utilization * @UCLAMP_CNT: Utilization clamp constraints count */ enum uclamp_id { UCLAMP_MIN = 0, UCLAMP_MAX, UCLAMP_CNT }; extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; extern void sched_domains_mutex_lock(void); extern void sched_domains_mutex_unlock(void); struct sched_param { int sched_priority; }; struct sched_info { #ifdef CONFIG_SCHED_INFO /* Cumulative counters: */ /* # of times we have run on this CPU: */ unsigned long pcount; /* Time spent waiting on a runqueue: */ unsigned long long run_delay; /* Max time spent waiting on a runqueue: */ unsigned long long max_run_delay; /* Min time spent waiting on a runqueue: */ unsigned long long min_run_delay; /* Timestamps: */ /* When did we last run on a CPU? */ unsigned long long last_arrival; /* When were we last queued to run? */ unsigned long long last_queued; /* Timestamp of max time spent waiting on a runqueue: */ struct timespec64 max_run_delay_ts; #endif /* CONFIG_SCHED_INFO */ }; /* * Integer metrics need fixed point arithmetic, e.g., sched/fair * has a few: load, load_avg, util_avg, freq, and capacity. * * We define a basic fixed point arithmetic range, and then formalize * all these metrics based on that basic range. */ # define SCHED_FIXEDPOINT_SHIFT 10 # define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) /* Increase resolution of cpu_capacity calculations */ # define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT # define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) struct load_weight { unsigned long weight; u32 inv_weight; }; /* * The load/runnable/util_avg accumulates an infinite geometric series * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c). * * [load_avg definition] * * load_avg = runnable% * scale_load_down(load) * * [runnable_avg definition] * * runnable_avg = runnable% * SCHED_CAPACITY_SCALE * * [util_avg definition] * * util_avg = running% * SCHED_CAPACITY_SCALE * * where runnable% is the time ratio that a sched_entity is runnable and * running% the time ratio that a sched_entity is running. * * For cfs_rq, they are the aggregated values of all runnable and blocked * sched_entities. * * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU * capacity scaling. The scaling is done through the rq_clock_pelt that is used * for computing those signals (see update_rq_clock_pelt()) * * N.B., the above ratios (runnable% and running%) themselves are in the * range of [0, 1]. To do fixed point arithmetics, we therefore scale them * to as large a range as necessary. This is for example reflected by * util_avg's SCHED_CAPACITY_SCALE. 
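 *
 * Worked example (illustrative): with SCHED_FIXEDPOINT_SHIFT == 10,
 * SCHED_CAPACITY_SCALE is 1024, so a task that is running 25% of the
 * time on a full-capacity CPU converges towards
 *
 *	util_avg ~= 0.25 * 1024 = 256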
* * [Overflow issue] * * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities * with the highest load (=88761), always runnable on a single cfs_rq, * and should not overflow as the number already hits PID_MAX_LIMIT. * * For all other cases (including 32-bit kernels), struct load_weight's * weight will overflow first before we do, because: * * Max(load_avg) <= Max(load.weight) * * Then it is the load_weight's responsibility to consider overflow * issues. */ struct sched_avg { u64 last_update_time; u64 load_sum; u64 runnable_sum; u32 util_sum; u32 period_contrib; unsigned long load_avg; unsigned long runnable_avg; unsigned long util_avg; unsigned int util_est; } ____cacheline_aligned; /* * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg * updates. When a task is dequeued, its util_est should not be updated if its * util_avg has not been updated in the meantime. * This information is mapped into the MSB bit of util_est at dequeue time. * Since max value of util_est for a task is 1024 (PELT util_avg for a task) * it is safe to use MSB. */ #define UTIL_EST_WEIGHT_SHIFT 2 #define UTIL_AVG_UNCHANGED 0x80000000 struct sched_statistics { #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; u64 wait_count; u64 wait_sum; u64 iowait_count; u64 iowait_sum; u64 sleep_start; u64 sleep_max; s64 sum_sleep_runtime; u64 block_start; u64 block_max; s64 sum_block_runtime; s64 exec_max; u64 slice_max; u64 nr_migrations_cold; u64 nr_failed_migrations_affine; u64 nr_failed_migrations_running; u64 nr_failed_migrations_hot; u64 nr_forced_migrations; u64 nr_wakeups; u64 nr_wakeups_sync; u64 nr_wakeups_migrate; u64 nr_wakeups_local; u64 nr_wakeups_remote; u64 nr_wakeups_affine; u64 nr_wakeups_affine_attempts; u64 nr_wakeups_passive; u64 nr_wakeups_idle; #ifdef CONFIG_SCHED_CORE u64 core_forceidle_sum; #endif #endif /* CONFIG_SCHEDSTATS */ } ____cacheline_aligned; struct sched_entity { /* For load-balancing: */ struct load_weight load; struct rb_node run_node; u64 deadline; u64 min_vruntime; u64 min_slice; u64 max_slice; struct list_head group_node; unsigned char on_rq; unsigned char sched_delayed; unsigned char rel_deadline; unsigned char custom_slice; /* hole */ u64 exec_start; u64 sum_exec_runtime; u64 prev_sum_exec_runtime; u64 vruntime; /* Approximated virtual lag: */ s64 vlag; /* 'Protected' deadline, to give out minimum quantums: */ u64 vprot; u64 slice; u64 nr_migrations; #ifdef CONFIG_FAIR_GROUP_SCHED int depth; struct sched_entity *parent; /* rq on which this entity is (to be) queued: */ struct cfs_rq *cfs_rq; /* rq "owned" by this entity/group: */ struct cfs_rq *my_q; /* cached value of my_q->h_nr_running */ unsigned long runnable_weight; #endif /* * Per entity load average tracking. * * Put into separate cache line so it does not * collide with read-mostly values above. */ struct sched_avg avg; }; struct sched_rt_entity { struct list_head run_list; unsigned long timeout; unsigned long watchdog_stamp; unsigned int time_slice; unsigned short on_rq; unsigned short on_list; struct sched_rt_entity *back; #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity *parent; /* rq on which this entity is (to be) queued: */ struct rt_rq *rt_rq; /* rq "owned" by this entity/group: */ struct rt_rq *my_q; #endif } __randomize_layout; struct rq_flags; typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *, struct rq_flags *rf); struct sched_dl_entity { struct rb_node rb_node; /* * Original scheduling parameters. 
Copied here from sched_attr * during sched_setattr(), they will remain the same until * the next sched_setattr(). */ u64 dl_runtime; /* Maximum runtime for each instance */ u64 dl_deadline; /* Relative deadline of each instance */ u64 dl_period; /* Separation of two instances (period) */ u64 dl_bw; /* dl_runtime / dl_period */ u64 dl_density; /* dl_runtime / dl_deadline */ /* * Actual scheduling parameters. Initialized with the values above, * they are continuously updated during task execution. Note that * the remaining runtime could be < 0 in case we are in overrun. */ s64 runtime; /* Remaining runtime for this instance */ u64 deadline; /* Absolute deadline for this instance */ unsigned int flags; /* Specifying the scheduler behaviour */ /* * Some bool flags: * * @dl_throttled tells if we exhausted the runtime. If so, the * task has to wait for a replenishment to be performed at the * next firing of dl_timer. * * @dl_yielded tells if task gave up the CPU before consuming * all its available runtime during the last job. * * @dl_non_contending tells if the task is inactive while still * contributing to the active utilization. In other words, it * indicates if the inactive timer has been armed and its handler * has not been executed yet. This flag is useful to avoid race * conditions between the inactive timer handler and the wakeup * code. * * @dl_overrun tells if the task asked to be informed about runtime * overruns. * * @dl_server tells if this is a server entity. * * @dl_server_active tells if the dlserver is active(started). * dlserver is started on first cfs enqueue on an idle runqueue * and is stopped when a dequeue results in 0 cfs tasks on the * runqueue. In other words, dlserver is active only when cpu's * runqueue has atleast one cfs task. * * @dl_defer tells if this is a deferred or regular server. For * now only defer server exists. * * @dl_defer_armed tells if the deferrable server is waiting * for the replenishment timer to activate it. * * @dl_defer_running tells if the deferrable server is actually * running, skipping the defer phase. * * @dl_defer_idle tracks idle state */ unsigned int dl_throttled : 1; unsigned int dl_yielded : 1; unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; unsigned int dl_server : 1; unsigned int dl_server_active : 1; unsigned int dl_defer : 1; unsigned int dl_defer_armed : 1; unsigned int dl_defer_running : 1; unsigned int dl_defer_idle : 1; /* * Bandwidth enforcement timer. Each -deadline task has its * own bandwidth to be enforced, thus we need one timer per task. */ struct hrtimer dl_timer; /* * Inactive timer, responsible for decreasing the active utilization * at the "0-lag time". When a -deadline task blocks, it contributes * to GRUB's active utilization until the "0-lag time", hence a * timer is needed to decrease the active utilization at the correct * time. */ struct hrtimer inactive_timer; /* * Bits for DL-server functionality. Also see the comment near * dl_server_update(). * * @rq the runqueue this server is for */ struct rq *rq; dl_server_pick_f server_pick_task; #ifdef CONFIG_RT_MUTEXES /* * Priority Inheritance. When a DEADLINE scheduling entity is boosted * pi_se points to the donor, otherwise points to the dl_se it belongs * to (the original one/itself). 
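 *
 * Illustration: if a lower-class task owns an rt_mutex that a
 * SCHED_DEADLINE task blocks on, the owner is deadline-boosted and its
 * pi_se is redirected to the donor's dl_se, so it temporarily runs with
 * the donor's runtime/deadline parameters until the lock is released.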
*/ struct sched_dl_entity *pi_se; #endif }; #ifdef CONFIG_UCLAMP_TASK /* Number of utilization clamp buckets (shorter alias) */ #define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT /* * Utilization clamp for a scheduling entity * @value: clamp value "assigned" to a se * @bucket_id: bucket index corresponding to the "assigned" value * @active: the se is currently refcounted in a rq's bucket * @user_defined: the requested clamp value comes from user-space * * The bucket_id is the index of the clamp bucket matching the clamp value * which is pre-computed and stored to avoid expensive integer divisions from * the fast path. * * The active bit is set whenever a task has got an "effective" value assigned, * which can be different from the clamp value "requested" from user-space. * This allows to know a task is refcounted in the rq's bucket corresponding * to the "effective" bucket_id. * * The user_defined bit is set whenever a task has got a task-specific clamp * value requested from userspace, i.e. the system defaults apply to this task * just as a restriction. This allows to relax default clamps when a less * restrictive task-specific value has been requested, thus allowing to * implement a "nice" semantic. For example, a task running with a 20% * default boost can still drop its own boosting to 0%. */ struct uclamp_se { unsigned int value : bits_per(SCHED_CAPACITY_SCALE); unsigned int bucket_id : bits_per(UCLAMP_BUCKETS); unsigned int active : 1; unsigned int user_defined : 1; }; #endif /* CONFIG_UCLAMP_TASK */ union rcu_special { struct { u8 blocked; u8 need_qs; u8 exp_hint; /* Hint for performance. */ u8 need_mb; /* Readers need smp_mb(). */ } b; /* Bits. */ u32 s; /* Set of bits. */ }; enum perf_event_task_context { perf_invalid_context = -1, perf_hw_context = 0, perf_sw_context, perf_nr_task_contexts, }; /* * Number of contexts where an event can trigger: * task, softirq, hardirq, nmi. */ #define PERF_NR_CONTEXTS 4 struct wake_q_node { struct wake_q_node *next; }; struct kmap_ctrl { #ifdef CONFIG_KMAP_LOCAL int idx; pte_t pteval[KM_MAX_IDX]; #endif }; struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* * For reasons of header soup (see current_thread_info()), this * must be the first element of task_struct. */ struct thread_info thread_info; #endif unsigned int __state; /* saved state for "spinlock sleepers" */ unsigned int saved_state; /* * This begins the randomizable portion of task_struct. Only * scheduling-critical items should be added above here. */ randomized_struct_fields_start void *stack; refcount_t usage; /* Per task flags (PF_*), defined further below: */ unsigned int flags; unsigned int ptrace; #ifdef CONFIG_MEM_ALLOC_PROFILING struct alloc_tag *alloc_tag; #endif int on_cpu; struct __call_single_node wake_entry; unsigned int wakee_flips; unsigned long wakee_flip_decay_ts; struct task_struct *last_wakee; /* * recent_used_cpu is initially set as the last CPU used by a task * that wakes affine another task. Waker/wakee relationships can * push tasks around a CPU where each wakeup moves to the next one. * Tracking a recently used CPU allows a quick search for a recently * used CPU that may be idle. 
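 *
 * Illustration: a dispatcher thread that keeps waking different workers
 * makes wakee_flips grow, affine wakeup placement backs off, and
 * recent_used_cpu gives select_idle_sibling() a cheap candidate to try
 * before scanning the LLC domain for an idle CPU.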
*/ int recent_used_cpu; int wake_cpu; int on_rq; int prio; int static_prio; int normal_prio; unsigned int rt_priority; struct sched_entity se; struct sched_rt_entity rt; struct sched_dl_entity dl; struct sched_dl_entity *dl_server; #ifdef CONFIG_SCHED_CLASS_EXT struct sched_ext_entity scx; #endif const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE struct rb_node core_node; unsigned long core_cookie; unsigned int core_occupation; #endif #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #ifdef CONFIG_CFS_BANDWIDTH struct callback_head sched_throttle_work; struct list_head throttle_node; bool throttled; #endif #endif #ifdef CONFIG_UCLAMP_TASK /* * Clamp values requested for a scheduling entity. * Must be updated with task_rq_lock() held. */ struct uclamp_se uclamp_req[UCLAMP_CNT]; /* * Effective clamp values used for a scheduling entity. * Must be updated with task_rq_lock() held. */ struct uclamp_se uclamp[UCLAMP_CNT]; #endif struct sched_statistics stats; #ifdef CONFIG_PREEMPT_NOTIFIERS /* List of struct preempt_notifier: */ struct hlist_head preempt_notifiers; #endif #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif unsigned int policy; unsigned long max_allowed_capacity; int nr_cpus_allowed; const cpumask_t *cpus_ptr; cpumask_t *user_cpus_ptr; cpumask_t cpus_mask; void *migration_pending; unsigned short migration_disabled; unsigned short migration_flags; #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; union rcu_special rcu_read_unlock_special; struct list_head rcu_node_entry; struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TASKS_RCU unsigned long rcu_tasks_nvcsw; u8 rcu_tasks_holdout; u8 rcu_tasks_idx; int rcu_tasks_idle_cpu; struct list_head rcu_tasks_holdout_list; int rcu_tasks_exit_cpu; struct list_head rcu_tasks_exit_list; #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU int trc_reader_nesting; struct srcu_ctr __percpu *trc_reader_scp; #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ #ifdef CONFIG_TRIVIAL_PREEMPT_RCU int rcu_trivial_preempt_nesting; #endif /* #ifdef CONFIG_TRIVIAL_PREEMPT_RCU */ struct sched_info sched_info; struct list_head tasks; struct plist_node pushable_tasks; struct rb_node pushable_dl_tasks; struct mm_struct *mm; struct mm_struct *active_mm; int exit_state; int exit_code; int exit_signal; /* The signal sent when the parent dies: */ int pdeath_signal; /* JOBCTL_*, siglock protected: */ unsigned long jobctl; /* Used for emulating ABI behavior of previous Linux versions: */ unsigned int personality; /* Scheduler bits, serialized by scheduler locks: */ unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; unsigned sched_task_hot:1; /* Force alignment to the next boundary: */ unsigned :0; /* Unserialized, strictly 'current' */ /* * This field must not be in the scheduler word above due to wakelist * queueing no longer being serialized by p->on_cpu. However: * * p->XXX = X; ttwu() * schedule() if (p->on_rq && ..) // false * smp_mb__after_spinlock(); if (smp_load_acquire(&p->on_cpu) && //true * deactivate_task() ttwu_queue_wakelist()) * p->on_rq = 0; p->sched_remote_wakeup = Y; * * guarantees all stores of 'current' are visible before * ->sched_remote_wakeup gets used, so it can be in this word. 
*/ unsigned sched_remote_wakeup:1; #ifdef CONFIG_RT_MUTEXES unsigned sched_rt_mutex:1; #endif /* Bit to tell TOMOYO we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; #ifndef TIF_RESTORE_SIGMASK unsigned restore_sigmask:1; #endif #ifdef CONFIG_MEMCG_V1 unsigned in_user_fault:1; #endif #ifdef CONFIG_LRU_GEN /* whether the LRU algorithm may apply to this access */ unsigned in_lru_fault:1; #endif #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif #ifdef CONFIG_CGROUPS /* disallow userland-initiated cgroup migration */ unsigned no_cgroup_migration:1; /* task is frozen/stopped (used by the cgroup freezer) */ unsigned frozen:1; #endif #ifdef CONFIG_BLK_CGROUP unsigned use_memdelay:1; #endif #ifdef CONFIG_PSI /* Stalled due to lack of memory */ unsigned in_memstall:1; #endif #ifdef CONFIG_PAGE_OWNER /* Used by page_owner=on to detect recursion in page tracking. */ unsigned in_page_owner:1; #endif #ifdef CONFIG_EVENTFD /* Recursion prevention for eventfd_signal() */ unsigned in_eventfd:1; #endif #ifdef CONFIG_ARCH_HAS_CPU_PASID unsigned pasid_activated:1; #endif #ifdef CONFIG_X86_BUS_LOCK_DETECT unsigned reported_split_lock:1; #endif #ifdef CONFIG_TASK_DELAY_ACCT /* delay due to memory thrashing */ unsigned in_thrashing:1; #endif unsigned in_nf_duplicate:1; #ifdef CONFIG_PREEMPT_RT struct netdev_xmit net_xmit; #endif unsigned long atomic_flags; /* Flags requiring atomic access. */ struct restart_block restart_block; pid_t pid; pid_t tgid; #ifdef CONFIG_STACKPROTECTOR /* Canary value for the -fstack-protector GCC feature: */ unsigned long stack_canary; #endif /* * Pointers to the (original) parent process, youngest child, younger sibling, * older sibling, respectively. (p->father can be replaced with * p->real_parent->pid) */ /* Real parent process: */ struct task_struct __rcu *real_parent; /* Recipient of SIGCHLD, wait4() reports: */ struct task_struct __rcu *parent; /* * Children/sibling form the list of natural children: */ struct list_head children; struct list_head sibling; struct task_struct *group_leader; /* * 'ptraced' is the list of tasks this task is using ptrace() on. * * This includes both natural children and PTRACE_ATTACH targets. * 'ptrace_entry' is this task's link on the p->parent->ptraced list. */ struct list_head ptraced; struct list_head ptrace_entry; /* PID/PID hash table linkage. 
*/ struct pid *thread_pid; struct hlist_node pid_links[PIDTYPE_MAX]; struct list_head thread_node; struct completion *vfork_done; /* CLONE_CHILD_SETTID: */ int __user *set_child_tid; /* CLONE_CHILD_CLEARTID: */ int __user *clear_child_tid; /* PF_KTHREAD | PF_IO_WORKER */ void *worker_private; u64 utime; u64 stime; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME u64 utimescaled; u64 stimescaled; #endif u64 gtime; struct prev_cputime prev_cputime; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN struct vtime vtime; #endif #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; #endif /* Context switch counts: */ unsigned long nvcsw; unsigned long nivcsw; /* Monotonic time in nsecs: */ u64 start_time; /* Boot based time in nsecs: */ u64 start_boottime; /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */ unsigned long min_flt; unsigned long maj_flt; /* Empty if CONFIG_POSIX_CPUTIMERS=n */ struct posix_cputimers posix_cputimers; #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK struct posix_cputimers_work posix_cputimers_work; #endif /* Process credentials: */ /* Tracer's credentials at attach: */ const struct cred __rcu *ptracer_cred; /* Objective and real subjective task credentials (COW): */ const struct cred __rcu *real_cred; /* Effective (overridable) subjective task credentials (COW): */ const struct cred __rcu *cred; #ifdef CONFIG_KEYS /* Cached requested key. */ struct key *cached_requested_key; #endif /* * executable name, excluding path. * * - normally initialized by begin_new_exec() * - set it with set_task_comm() to ensure it is always * NUL-terminated and zero-padded */ char comm[TASK_COMM_LEN]; struct nameidata *nameidata; #ifdef CONFIG_SYSVIPC struct sysv_sem sysvsem; struct sysv_shm sysvshm; #endif #ifdef CONFIG_DETECT_HUNG_TASK unsigned long last_switch_count; unsigned long last_switch_time; #endif /* Filesystem information: */ struct fs_struct *fs; /* Open file information: */ struct files_struct *files; #ifdef CONFIG_IO_URING struct io_uring_task *io_uring; struct io_restriction *io_uring_restrict; #endif /* Namespaces: */ struct nsproxy *nsproxy; /* Signal handlers: */ struct signal_struct *signal; struct sighand_struct __rcu *sighand; sigset_t blocked; sigset_t real_blocked; /* Restored if set_restore_sigmask() was used: */ sigset_t saved_sigmask; struct sigpending pending; unsigned long sas_ss_sp; size_t sas_ss_size; unsigned int sas_ss_flags; struct callback_head *task_works; #ifdef CONFIG_AUDIT #ifdef CONFIG_AUDITSYSCALL struct audit_context *audit_context; #endif kuid_t loginuid; unsigned int sessionid; #endif struct seccomp seccomp; struct syscall_user_dispatch syscall_dispatch; /* Thread group tracking: */ u64 parent_exec_id; u64 self_exec_id; /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */ spinlock_t alloc_lock; /* Protection of the PI data structures: */ raw_spinlock_t pi_lock; struct wake_q_node wake_q; #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task: */ struct rb_root_cached pi_waiters; /* Updated under owner's pi_lock and rq lock */ struct task_struct *pi_top_task; /* Deadlock detection and priority inheritance handling: */ struct rt_mutex_waiter *pi_blocked_on; #endif struct mutex *blocked_on; /* lock we're blocked on */ raw_spinlock_t blocked_lock; #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER /* * Encoded lock address causing task block (lower 2 bits = type from * <linux/hung_task.h>). Accessed via hung_task_*() helpers. 
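 *
 * Encoding sketch (illustrative only): lock objects are at least 4-byte
 * aligned, so the two low bits of the address are free to carry the
 * blocker type, roughly
 *
 *	p->blocker = (unsigned long)lock | type;	(type in 0..3)
 *	lock       = (void *)(p->blocker & ~0x3UL);
 *
 * The real accessors and type values are in <linux/hung_task.h>.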
*/ unsigned long blocker; #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP int non_block_count; #endif #ifdef CONFIG_TRACE_IRQFLAGS struct irqtrace_events irqtrace; unsigned int hardirq_threaded; u64 hardirq_chain_key; int softirqs_enabled; int softirq_context; int irq_config; #endif #ifdef CONFIG_PREEMPT_RT int softirq_disable_cnt; #endif #ifdef CONFIG_LOCKDEP # define MAX_LOCK_DEPTH 48UL u64 curr_chain_key; int lockdep_depth; unsigned int lockdep_recursion; struct held_lock held_locks[MAX_LOCK_DEPTH]; #endif #if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP) unsigned int in_ubsan; #endif /* Journalling filesystem info: */ void *journal_info; /* Stacked block device info: */ struct bio_list *bio_list; /* Stack plugging: */ struct blk_plug *plug; /* VM state: */ struct reclaim_state *reclaim_state; struct io_context *io_context; #ifdef CONFIG_COMPACTION struct capture_control *capture_control; #endif /* Ptrace state: */ unsigned long ptrace_message; kernel_siginfo_t *last_siginfo; struct task_io_accounting ioac; #ifdef CONFIG_PSI /* Pressure stall state */ unsigned int psi_flags; #endif #ifdef CONFIG_TASK_XACCT /* Accumulated RSS usage: */ u64 acct_rss_mem1; /* Accumulated virtual memory usage: */ u64 acct_vm_mem1; /* stime + utime since last update: */ u64 acct_timexpd; #endif #ifdef CONFIG_CPUSETS /* Protected by ->alloc_lock: */ nodemask_t mems_allowed; /* Sequence number to catch updates: */ seqcount_spinlock_t mems_allowed_seq; int cpuset_mem_spread_rotor; #endif #ifdef CONFIG_CGROUPS /* Control Group info protected by css_set_lock: */ struct css_set __rcu *cgroups; /* cg_list protected by css_set_lock and tsk->alloc_lock: */ struct list_head cg_list; #ifdef CONFIG_PREEMPT_RT struct llist_node cg_dead_lnode; #endif /* CONFIG_PREEMPT_RT */ #endif /* CONFIG_CGROUPS */ #ifdef CONFIG_X86_CPU_RESCTRL u32 closid; u32 rmid; #endif #ifdef CONFIG_FUTEX struct robust_list_head __user *robust_list; #ifdef CONFIG_COMPAT struct compat_robust_list_head __user *compat_robust_list; #endif struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; struct mutex futex_exit_mutex; unsigned int futex_state; #endif #ifdef CONFIG_PERF_EVENTS u8 perf_recursion[PERF_NR_CONTEXTS]; struct perf_event_context *perf_event_ctxp; struct mutex perf_event_mutex; struct list_head perf_event_list; struct perf_ctx_data __rcu *perf_ctx_data; #endif #ifdef CONFIG_DEBUG_PREEMPT unsigned long preempt_disable_ip; #endif #ifdef CONFIG_NUMA /* Protected by alloc_lock: */ struct mempolicy *mempolicy; short il_prev; u8 il_weight; short pref_node_fork; #endif #ifdef CONFIG_NUMA_BALANCING int numa_scan_seq; unsigned int numa_scan_period; unsigned int numa_scan_period_max; int numa_preferred_nid; unsigned long numa_migrate_retry; /* Migration stamp: */ u64 node_stamp; u64 last_task_numa_placement; u64 last_sum_exec_runtime; struct callback_head numa_work; /* * This pointer is only modified for current in syscall and * pagefault context (and for tasks being destroyed), so it can be read * from any of the following contexts: * - RCU read-side critical section * - current->numa_group from everywhere * - task's runqueue locked, task not running */ struct numa_group __rcu *numa_group; /* * numa_faults is an array split into four regions: * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer * in this precise order. * * faults_memory: Exponential decaying average of faults on a per-node * basis. Scheduling placement decisions are made based on these * counts. The values remain static for the duration of a PTE scan. 
* faults_cpu: Track the nodes the process was running on when a NUMA * hinting fault was incurred. * faults_memory_buffer and faults_cpu_buffer: Record faults per node * during the current scan window. When the scan completes, the counts * in faults_memory and faults_cpu decay and these values are copied. */ unsigned long *numa_faults; unsigned long total_numa_faults; /* * numa_faults_locality tracks if faults recorded during the last * scan window were remote/local or failed to migrate. The task scan * period is adapted based on the locality of the faults with different * weights depending on whether they were shared or private faults */ unsigned long numa_faults_locality[3]; unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ struct rseq_data rseq; struct sched_mm_cid mm_cid; struct tlbflush_unmap_batch tlb_ubc; /* Cache last used pipe for splice(): */ struct pipe_inode_info *splice_pipe; struct page_frag task_frag; #ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE struct lazy_mmu_state lazy_mmu_state; #endif #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info *delays; #endif #ifdef CONFIG_FAULT_INJECTION int make_it_fail; unsigned int fail_nth; #endif /* * When (nr_dirtied >= nr_dirtied_pause), it's time to call * balance_dirty_pages() for a dirty throttling pause: */ int nr_dirtied; int nr_dirtied_pause; /* Start of a write-and-pause period: */ unsigned long dirty_paused_when; #ifdef CONFIG_LATENCYTOP int latency_record_count; struct latency_record latency_record[LT_SAVECOUNT]; #endif /* * Time slack values; these are used to round up poll() and * select() etc timeout values. These are in nanoseconds. */ u64 timer_slack_ns; u64 default_timer_slack_ns; #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) unsigned int kasan_depth; #endif #ifdef CONFIG_KCSAN struct kcsan_ctx kcsan_ctx; #ifdef CONFIG_TRACE_IRQFLAGS struct irqtrace_events kcsan_save_irqtrace; #endif #ifdef CONFIG_KCSAN_WEAK_MEMORY int kcsan_stack_depth; #endif #endif #ifdef CONFIG_KMSAN struct kmsan_ctx kmsan_ctx; #endif #if IS_ENABLED(CONFIG_KUNIT) struct kunit *kunit_test; #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored address in ret_stack: */ int curr_ret_stack; int curr_ret_depth; /* Stack of return addresses for return function tracing: */ unsigned long *ret_stack; /* Timestamp for last schedule: */ unsigned long long ftrace_timestamp; unsigned long long ftrace_sleeptime; /* * Number of functions that haven't been traced * because of depth overrun: */ atomic_t trace_overrun; /* Pause tracing: */ atomic_t tracing_graph_pause; #endif #ifdef CONFIG_TRACING /* Bitmask and counter of trace recursion: */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ #ifdef CONFIG_KCOV /* See kernel/kcov.c for more details. 
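 *
 * Userspace side (illustrative, see Documentation/dev-tools/kcov.rst;
 * COVER_SIZE is a user-chosen buffer size in words): a fuzzer typically
 * does
 *
 *	fd = open("/sys/kernel/debug/kcov", O_RDWR);
 *	ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE);
 *	cover = mmap(NULL, COVER_SIZE * sizeof(unsigned long),
 *		     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	ioctl(fd, KCOV_ENABLE, KCOV_TRACE_PC);
 *
 * which is what populates kcov_mode/kcov_size/kcov_area below.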
*/ /* Coverage collection mode enabled for this task (0 if disabled): */ unsigned int kcov_mode; /* Size of the kcov_area: */ unsigned int kcov_size; /* Buffer for coverage collection: */ void *kcov_area; /* KCOV descriptor wired with this task or NULL: */ struct kcov *kcov; /* KCOV common handle for remote coverage collection: */ u64 kcov_handle; /* KCOV sequence number: */ int kcov_sequence; /* Collect coverage from softirq context: */ unsigned int kcov_softirq; #endif #ifdef CONFIG_MEMCG_V1 struct mem_cgroup *memcg_in_oom; #endif #ifdef CONFIG_MEMCG /* Number of pages to reclaim on returning to userland: */ unsigned int memcg_nr_pages_over_high; /* Used by memcontrol for targeted memcg charge: */ struct mem_cgroup *active_memcg; /* Cache for current->cgroups->memcg->nodeinfo[nid]->objcg lookups: */ struct obj_cgroup *objcg; #endif #ifdef CONFIG_BLK_CGROUP struct gendisk *throttle_disk; #endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; #endif #if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE) unsigned int sequential_io; unsigned int sequential_io_avg; #endif struct kmap_ctrl kmap_ctrl; #ifdef CONFIG_DEBUG_ATOMIC_SLEEP unsigned long task_state_change; # ifdef CONFIG_PREEMPT_RT unsigned long saved_state_change; # endif #endif struct rcu_head rcu; refcount_t rcu_users; int pagefault_disabled; #ifdef CONFIG_MMU struct task_struct *oom_reaper_list; struct timer_list oom_reaper_timer; #endif #ifdef CONFIG_VMAP_STACK struct vm_struct *stack_vm_area; #endif #ifdef CONFIG_THREAD_INFO_IN_TASK /* A live task holds one reference: */ refcount_t stack_refcount; #endif #ifdef CONFIG_LIVEPATCH int patch_state; #endif #ifdef CONFIG_SECURITY /* Used by LSM modules for access restriction: */ void *security; #endif #ifdef CONFIG_BPF_SYSCALL /* Used by BPF task local storage */ struct bpf_local_storage __rcu *bpf_storage; /* Used for BPF run context */ struct bpf_run_ctx *bpf_ctx; #endif /* Used by BPF for per-TASK xdp storage */ struct bpf_net_context *bpf_net_context; #ifdef CONFIG_KSTACK_ERASE unsigned long lowest_stack; #endif #ifdef CONFIG_KSTACK_ERASE_METRICS unsigned long prev_lowest_stack; #endif #ifdef CONFIG_X86_MCE void __user *mce_vaddr; __u64 mce_kflags; u64 mce_addr; __u64 mce_ripv : 1, mce_whole_page : 1, __mce_reserved : 62; struct callback_head mce_kill_me; int mce_count; #endif #ifdef CONFIG_KRETPROBES struct llist_head kretprobe_instances; #endif #ifdef CONFIG_RETHOOK struct llist_head rethooks; #endif #ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH /* * If L1D flush is supported on mm context switch * then we use this callback head to queue kill work * to kill tasks that are not running on SMT disabled * cores */ struct callback_head l1d_flush_kill; #endif #ifdef CONFIG_RV /* * Per-task RV monitor, fixed in CONFIG_RV_PER_TASK_MONITORS. * If memory becomes a concern, we can think about a dynamic method. */ union rv_task_monitor rv[CONFIG_RV_PER_TASK_MONITORS]; #endif #ifdef CONFIG_USER_EVENTS struct user_event_mm *user_event_mm; #endif #ifdef CONFIG_UNWIND_USER struct unwind_task_info unwind_info; #endif /* CPU-specific state of this task: */ struct thread_struct thread; /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. 
*/ randomized_struct_fields_end } __attribute__ ((aligned (64))); #ifdef CONFIG_SCHED_PROXY_EXEC DECLARE_STATIC_KEY_TRUE(__sched_proxy_exec); static inline bool sched_proxy_exec(void) { return static_branch_likely(&__sched_proxy_exec); } #else static inline bool sched_proxy_exec(void) { return false; } #endif #define TASK_REPORT_IDLE (TASK_REPORT + 1) #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) static inline unsigned int __task_state_index(unsigned int tsk_state, unsigned int tsk_exit_state) { unsigned int state = (tsk_state | tsk_exit_state) & TASK_REPORT; BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); if ((tsk_state & TASK_IDLE) == TASK_IDLE) state = TASK_REPORT_IDLE; /* * We're lying here, but rather than expose a completely new task state * to userspace, we can make this appear as if the task has gone through * a regular rt_mutex_lock() call. * Report frozen tasks as uninterruptible. */ if ((tsk_state & TASK_RTLOCK_WAIT) || (tsk_state & TASK_FROZEN)) state = TASK_UNINTERRUPTIBLE; return fls(state); } static inline unsigned int task_state_index(struct task_struct *tsk) { return __task_state_index(READ_ONCE(tsk->__state), tsk->exit_state); } static inline char task_index_to_char(unsigned int state) { static const char state_char[] = "RSDTtXZPI"; BUILD_BUG_ON(TASK_REPORT_MAX * 2 != 1 << (sizeof(state_char) - 1)); return state_char[state]; } static inline char task_state_to_char(struct task_struct *tsk) { return task_index_to_char(task_state_index(tsk)); } #ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE /** * __task_lazy_mmu_mode_active() - Test the lazy MMU mode state for a task. * @tsk: The task to check. * * Test whether @tsk has its lazy MMU mode state set to active (i.e. enabled * and not paused). * * This function only considers the state saved in task_struct; to test whether * current actually is in lazy MMU mode, is_lazy_mmu_mode_active() should be * used instead. * * This function is intended for architectures that implement the lazy MMU * mode; it must not be called from generic code. */ static inline bool __task_lazy_mmu_mode_active(struct task_struct *tsk) { struct lazy_mmu_state *state = &tsk->lazy_mmu_state; return state->enable_count > 0 && state->pause_count == 0; } /** * is_lazy_mmu_mode_active() - Test whether we are currently in lazy MMU mode. * * Test whether the current context is in lazy MMU mode. This is true if both: * 1. We are not in interrupt context * 2. Lazy MMU mode is active for the current task * * This function is intended for architectures that implement the lazy MMU * mode; it must not be called from generic code. */ static inline bool is_lazy_mmu_mode_active(void) { if (in_interrupt()) return false; return __task_lazy_mmu_mode_active(current); } #endif extern struct pid *cad_pid; /* * Per process flags */ #define PF_VCPU 0x00000001 /* I'm a virtual CPU */ #define PF_IDLE 0x00000002 /* I am an IDLE thread */ #define PF_EXITING 0x00000004 /* Getting shut down */ #define PF_POSTCOREDUMP 0x00000008 /* Coredumps should ignore this task */ #define PF_IO_WORKER 0x00000010 /* Task is an IO worker */ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */ #define PF_MCE_PROCESS 0x00000080 /* Process policy on mce errors */ #define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* Dumped core */ #define PF_SIGNALED 0x00000400 /* Killed by a signal */ #define PF_MEMALLOC 0x00000800 /* Allocating memory to free memory. 
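 * A typical (illustrative) pattern in reclaim-context code is
 *
 *	unsigned int noreclaim_flag = memalloc_noreclaim_save();
 *	...allocate without recursing into direct reclaim...
 *	memalloc_noreclaim_restore(noreclaim_flag);
 *
 * which sets and clears PF_MEMALLOC around the allocation.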
See memalloc_noreclaim_save() */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ #define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF_KCOMPACTD 0x00010000 /* I am kcompactd */ #define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_MEMALLOC_NOFS 0x00040000 /* All allocations inherit GFP_NOFS. See memalloc_nfs_save() */ #define PF_MEMALLOC_NOIO 0x00080000 /* All allocations inherit GFP_NOIO. See memalloc_noio_save() */ #define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to, * I am cleaning dirty pages from some other bdi. */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF__HOLE__00800000 0x00800000 #define PF__HOLE__01000000 0x01000000 #define PF__HOLE__02000000 0x02000000 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_PIN 0x10000000 /* Allocations constrained to zones which allow long term pinning. * See memalloc_pin_save() */ #define PF_BLOCK_TS 0x20000000 /* plug has ts that needs updating */ #define PF__HOLE__40000000 0x40000000 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ /* * Only the _current_ task can read/write to tsk->flags, but other * tasks can access tsk->flags in readonly mode for example * with tsk_used_math (like during threaded core dumping). * There is however an exception to this rule during ptrace * or during fork: the ptracer task is allowed to write to the * child->flags of its traced child (same goes for fork, the parent * can write to the child->flags), because we're guaranteed the * child is not running and in turn not changing child->flags * at the same time the parent does it. */ #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) #define clear_used_math() clear_stopped_child_used_math(current) #define set_used_math() set_stopped_child_used_math(current) #define conditional_stopped_child_used_math(condition, child) \ do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0) #define conditional_used_math(condition) conditional_stopped_child_used_math(condition, current) #define copy_to_stopped_child_used_math(child) \ do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0) /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */ #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) static __always_inline bool is_percpu_thread(void) { return (current->flags & PF_NO_SETAFFINITY) && (current->nr_cpus_allowed == 1); } static __always_inline bool is_user_task(struct task_struct *task) { return task->mm && !(task->flags & (PF_KTHREAD | PF_USER_WORKER)); } /* Per-process atomic flags. */ #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. 
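 * Set from userspace (illustrative) with
 *
 *	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 *
 * after which execve() will not grant setuid/setgid or file capability
 * privileges; the kernel side queries it via task_no_new_privs() below.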
*/ #define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ #define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */ #define PFA_SPEC_SSB_DISABLE 3 /* Speculative Store Bypass disabled */ #define PFA_SPEC_SSB_FORCE_DISABLE 4 /* Speculative Store Bypass force disabled*/ #define PFA_SPEC_IB_DISABLE 5 /* Indirect branch speculation restricted */ #define PFA_SPEC_IB_FORCE_DISABLE 6 /* Indirect branch speculation permanently restricted */ #define PFA_SPEC_SSB_NOEXEC 7 /* Speculative Store Bypass clear on execve() */ #define TASK_PFA_TEST(name, func) \ static inline bool task_##func(struct task_struct *p) \ { return test_bit(PFA_##name, &p->atomic_flags); } #define TASK_PFA_SET(name, func) \ static inline void task_set_##func(struct task_struct *p) \ { set_bit(PFA_##name, &p->atomic_flags); } #define TASK_PFA_CLEAR(name, func) \ static inline void task_clear_##func(struct task_struct *p) \ { clear_bit(PFA_##name, &p->atomic_flags); } TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs) TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs) TASK_PFA_TEST(SPREAD_PAGE, spread_page) TASK_PFA_SET(SPREAD_PAGE, spread_page) TASK_PFA_CLEAR(SPREAD_PAGE, spread_page) TASK_PFA_TEST(SPREAD_SLAB, spread_slab) TASK_PFA_SET(SPREAD_SLAB, spread_slab) TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab) TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) static inline void current_restore_flags(unsigned long orig_flags, unsigned long flags) { current->flags &= ~flags; current->flags |= orig_flags & flags; } extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); extern int task_can_attach(struct task_struct *p); extern int dl_bw_alloc(int cpu, u64 dl_bw); extern void dl_bw_free(int cpu, u64 dl_bw); /* set_cpus_allowed_force() - consider using set_cpus_allowed_ptr() instead */ extern void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask); /** * set_cpus_allowed_ptr - set CPU affinity mask of a task * @p: the task * @new_mask: CPU affinity mask * * Return: zero if successful, or a negative error code */ extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node); extern void release_user_cpus_ptr(struct task_struct *p); extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask); extern void force_compatible_cpus_allowed_ptr(struct task_struct *p); extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p); extern int yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); extern int task_prio(const struct task_struct *p); /** * task_nice - return the nice value of a given task. * @p: the task in question. * * Return: The nice value [ -20 ... 0 ... 19 ]. 
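 *
 * Worked example (illustrative): static_prio is NICE_TO_PRIO(nice), i.e.
 * MAX_RT_PRIO + nice + 20, so the default task has static_prio 120 and
 * task_nice() maps it back to 0; static_prio 100 maps to -20.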
*/ static inline int task_nice(const struct task_struct *p) { return PRIO_TO_NICE((p)->static_prio); } extern int can_nice(const struct task_struct *p, const int nice); extern int task_curr(const struct task_struct *p); extern int idle_cpu(int cpu); extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); extern void sched_set_fifo(struct task_struct *p); extern void sched_set_fifo_low(struct task_struct *p); extern void sched_set_fifo_secondary(struct task_struct *p); extern void sched_set_normal(struct task_struct *p, int nice); extern int sched_setattr(struct task_struct *, const struct sched_attr *); extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *); extern struct task_struct *idle_task(int cpu); /** * is_idle_task - is the specified task an idle task? * @p: the task in question. * * Return: 1 if @p is an idle task. 0 otherwise. */ static __always_inline bool is_idle_task(const struct task_struct *p) { return !!(p->flags & PF_IDLE); } extern struct task_struct *curr_task(int cpu); extern void ia64_set_curr_task(int cpu, struct task_struct *p); void yield(void); union thread_union { struct task_struct task; #ifndef CONFIG_THREAD_INFO_IN_TASK struct thread_info thread_info; #endif unsigned long stack[THREAD_SIZE/sizeof(long)]; }; #ifndef CONFIG_THREAD_INFO_IN_TASK extern struct thread_info init_thread_info; #endif extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)]; #ifdef CONFIG_THREAD_INFO_IN_TASK # define task_thread_info(task) (&(task)->thread_info) #else # define task_thread_info(task) ((struct thread_info *)(task)->stack) #endif /* * find a task by one of its numerical ids * * find_task_by_pid_ns(): * finds a task by its pid in the specified namespace * find_task_by_vpid(): * finds a task by its virtual pid * * see also find_vpid() etc in include/linux/pid.h */ extern struct task_struct *find_task_by_vpid(pid_t nr); extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns); /* * find a task by its virtual pid and get the task struct */ extern struct task_struct *find_get_task_by_vpid(pid_t nr); extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); extern void wake_up_new_task(struct task_struct *tsk); extern void kick_process(struct task_struct *tsk); extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); #define set_task_comm(tsk, from) ({ \ BUILD_BUG_ON(sizeof(from) != TASK_COMM_LEN); \ __set_task_comm(tsk, from, false); \ }) /* * - Why not use task_lock()? * User space can randomly change their names anyway, so locking for readers * doesn't make sense. For writers, locking is probably necessary, as a race * condition could lead to long-term mixed results. * The strscpy_pad() in __set_task_comm() can ensure that the task comm is * always NUL-terminated and zero-padded. Therefore the race condition between * reader and writer is not an issue. * * - BUILD_BUG_ON() can help prevent the buf from being truncated. * Since the callers don't perform any return value checks, this safeguard is * necessary. 
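 *
 * Usage sketch (illustrative):
 *
 *	char comm[TASK_COMM_LEN];
 *
 *	pr_info("comm: %s\n", get_task_comm(comm, current));
 *
 * Passing a plain pointer rather than an on-stack array trips the
 * BUILD_BUG_ON(), because sizeof(buf) must be at least TASK_COMM_LEN.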
*/ #define get_task_comm(buf, tsk) ({ \ BUILD_BUG_ON(sizeof(buf) < TASK_COMM_LEN); \ strscpy_pad(buf, (tsk)->comm); \ buf; \ }) static __always_inline void scheduler_ipi(void) { /* * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting * TIF_NEED_RESCHED remotely (for the first time) will also send * this IPI. */ preempt_fold_need_resched(); } extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state); /* * Set thread flags in other task's structures. * See asm/thread_info.h for TIF_xxxx flags available: */ static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) { set_ti_thread_flag(task_thread_info(tsk), flag); } static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) { clear_ti_thread_flag(task_thread_info(tsk), flag); } static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag, bool value) { update_ti_thread_flag(task_thread_info(tsk), flag, value); } static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); } static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); } static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_ti_thread_flag(task_thread_info(tsk), flag); } static inline void set_tsk_need_resched(struct task_struct *tsk) { if (tracepoint_enabled(sched_set_need_resched_tp) && !test_tsk_thread_flag(tsk, TIF_NEED_RESCHED)) __trace_set_need_resched(tsk, TIF_NEED_RESCHED); set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); } static inline void clear_tsk_need_resched(struct task_struct *tsk) { atomic_long_andnot(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY, (atomic_long_t *)&task_thread_info(tsk)->flags); } static inline int test_tsk_need_resched(struct task_struct *tsk) { return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } static inline void set_need_resched_current(void) { lockdep_assert_irqs_disabled(); set_tsk_need_resched(current); set_preempt_need_resched(); } /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return * value indicates whether a reschedule was done in fact. 
* cond_resched_lock() will drop the spinlock before scheduling, */ #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) extern int __cond_resched(void); #if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) DECLARE_STATIC_CALL(cond_resched, __cond_resched); static __always_inline int _cond_resched(void) { return static_call_mod(cond_resched)(); } #elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) extern int dynamic_cond_resched(void); static __always_inline int _cond_resched(void) { return dynamic_cond_resched(); } #else /* !CONFIG_PREEMPTION */ static inline int _cond_resched(void) { return __cond_resched(); } #endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ #else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */ static inline int _cond_resched(void) { return 0; } #endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */ #define cond_resched() ({ \ __might_resched(__FILE__, __LINE__, 0); \ _cond_resched(); \ }) extern int __cond_resched_lock(spinlock_t *lock) __must_hold(lock); extern int __cond_resched_rwlock_read(rwlock_t *lock) __must_hold_shared(lock); extern int __cond_resched_rwlock_write(rwlock_t *lock) __must_hold(lock); #define MIGHT_RESCHED_RCU_SHIFT 8 #define MIGHT_RESCHED_PREEMPT_MASK ((1U << MIGHT_RESCHED_RCU_SHIFT) - 1) #ifndef CONFIG_PREEMPT_RT /* * Non RT kernels have an elevated preempt count due to the held lock, * but are not allowed to be inside a RCU read side critical section */ # define PREEMPT_LOCK_RESCHED_OFFSETS PREEMPT_LOCK_OFFSET #else /* * spin/rw_lock() on RT implies rcu_read_lock(). The might_sleep() check in * cond_resched*lock() has to take that into account because it checks for * preempt_count() and rcu_preempt_depth(). */ # define PREEMPT_LOCK_RESCHED_OFFSETS \ (PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT)) #endif #define cond_resched_lock(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_lock(lock); \ }) #define cond_resched_rwlock_read(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_rwlock_read(lock); \ }) #define cond_resched_rwlock_write(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_rwlock_write(lock); \ }) #ifndef CONFIG_PREEMPT_RT /* * With proxy exec, if a task has been proxy-migrated, it may be a donor * on a cpu that it can't actually run on. Thus we need a special state * to denote that the task is being woken, but that it needs to be * evaluated for return-migration before it is run. So if the task is * blocked_on PROXY_WAKING, return migrate it before running it. */ #define PROXY_WAKING ((struct mutex *)(-1L)) static inline struct mutex *__get_task_blocked_on(struct task_struct *p) { lockdep_assert_held_once(&p->blocked_lock); return p->blocked_on == PROXY_WAKING ? NULL : p->blocked_on; } static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) { WARN_ON_ONCE(!m); /* The task should only be setting itself as blocked */ WARN_ON_ONCE(p != current); /* Currently we serialize blocked_on under the task::blocked_lock */ lockdep_assert_held_once(&p->blocked_lock); /* * Check ensure we don't overwrite existing mutex value * with a different mutex. Note, setting it to the same * lock repeatedly is ok. 
*/ WARN_ON_ONCE(p->blocked_on && p->blocked_on != m); p->blocked_on = m; } static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m) { /* Currently we serialize blocked_on under the task::blocked_lock */ lockdep_assert_held_once(&p->blocked_lock); /* * There may be cases where we re-clear already cleared * blocked_on relationships, but make sure we are not * clearing the relationship with a different lock. */ WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m && p->blocked_on != PROXY_WAKING); p->blocked_on = NULL; } static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m) { guard(raw_spinlock_irqsave)(&p->blocked_lock); __clear_task_blocked_on(p, m); } static inline void __set_task_blocked_on_waking(struct task_struct *p, struct mutex *m) { /* Currently we serialize blocked_on under the task::blocked_lock */ lockdep_assert_held_once(&p->blocked_lock); if (!sched_proxy_exec()) { __clear_task_blocked_on(p, m); return; } /* Don't set PROXY_WAKING if blocked_on was already cleared */ if (!p->blocked_on) return; /* * There may be cases where we set PROXY_WAKING on tasks that were * already set to waking, but make sure we are not changing * the relationship with a different lock. */ WARN_ON_ONCE(m && p->blocked_on != m && p->blocked_on != PROXY_WAKING); p->blocked_on = PROXY_WAKING; } static inline void set_task_blocked_on_waking(struct task_struct *p, struct mutex *m) { guard(raw_spinlock_irqsave)(&p->blocked_lock); __set_task_blocked_on_waking(p, m); } #else static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) { } static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) { } static inline void __set_task_blocked_on_waking(struct task_struct *p, struct rt_mutex *m) { } static inline void set_task_blocked_on_waking(struct task_struct *p, struct rt_mutex *m) { } #endif /* !CONFIG_PREEMPT_RT */ static __always_inline bool need_resched(void) { return unlikely(tif_need_resched()); } /* * Wrappers for p->thread_info->cpu access. No-op on UP. */ #ifdef CONFIG_SMP static inline unsigned int task_cpu(const struct task_struct *p) { return READ_ONCE(task_thread_info(p)->cpu); } extern void set_task_cpu(struct task_struct *p, unsigned int cpu); #else static inline unsigned int task_cpu(const struct task_struct *p) { return 0; } static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) { } #endif /* CONFIG_SMP */ static inline bool task_is_runnable(struct task_struct *p) { return p->on_rq && !p->se.sched_delayed; } extern bool sched_task_on_rq(struct task_struct *p); extern unsigned long get_wchan(struct task_struct *p); extern struct task_struct *cpu_curr_snapshot(int cpu); /* * In order to reduce various lock holder preemption latencies provide an * interface to see if a vCPU is currently running or not. * * This allows us to terminate optimistic spin loops and block, analogous to * the native optimistic spin heuristic of testing if the lock owner task is * running or not. 
*/ #ifndef vcpu_is_preempted static inline bool vcpu_is_preempted(int cpu) { return false; } #endif extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); extern long sched_getaffinity(pid_t pid, struct cpumask *mask); #ifndef TASK_SIZE_OF #define TASK_SIZE_OF(tsk) TASK_SIZE #endif static inline bool owner_on_cpu(struct task_struct *owner) { /* * As lock holder preemption issue, we both skip spinning if * task is not on cpu or its cpu is preempted */ return READ_ONCE(owner->on_cpu) && !vcpu_is_preempted(task_cpu(owner)); } /* Returns effective CPU energy utilization, as seen by the scheduler */ unsigned long sched_cpu_util(int cpu); #ifdef CONFIG_SCHED_CORE extern void sched_core_free(struct task_struct *tsk); extern void sched_core_fork(struct task_struct *p); extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, unsigned long uaddr); extern int sched_core_idle_cpu(int cpu); #else static inline void sched_core_free(struct task_struct *tsk) { } static inline void sched_core_fork(struct task_struct *p) { } static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } #endif extern void sched_set_stop_task(int cpu, struct task_struct *stop); #ifdef CONFIG_MEM_ALLOC_PROFILING static __always_inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) { swap(current->alloc_tag, tag); return tag; } static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old) { #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n"); #endif current->alloc_tag = old; } #else #define alloc_tag_save(_tag) NULL #define alloc_tag_restore(_tag, _old) do {} while (0) #endif /* Avoids recursive inclusion hell */ #ifdef CONFIG_SCHED_MM_CID void sched_mm_cid_before_execve(struct task_struct *t); void sched_mm_cid_after_execve(struct task_struct *t); void sched_mm_cid_exit(struct task_struct *t); static __always_inline int task_mm_cid(struct task_struct *t) { return t->mm_cid.cid & ~(MM_CID_ONCPU | MM_CID_TRANSIT); } #else static inline void sched_mm_cid_before_execve(struct task_struct *t) { } static inline void sched_mm_cid_after_execve(struct task_struct *t) { } static inline void sched_mm_cid_exit(struct task_struct *t) { } static __always_inline int task_mm_cid(struct task_struct *t) { /* * Use the processor id as a fall-back when the mm cid feature is * disabled. This provides functional per-cpu data structure accesses * in user-space, althrough it won't provide the memory usage benefits. */ return task_cpu(t); } #endif #ifndef MODULE #ifndef COMPILE_OFFSETS extern void ___migrate_enable(void); struct rq; DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); /* * The "struct rq" is not available here, so we can't access the * "runqueues" with this_cpu_ptr(), as the compilation will fail in * this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr(): * typeof((ptr) + 0) * * So use arch_raw_cpu_ptr()/PERCPU_PTR() directly here. */ #ifdef CONFIG_SMP #define this_rq_raw() arch_raw_cpu_ptr(&runqueues) #else #define this_rq_raw() PERCPU_PTR(&runqueues) #endif #define this_rq_pinned() (*(unsigned int *)((void *)this_rq_raw() + RQ_nr_pinned)) static inline void __migrate_enable(void) { struct task_struct *p = current; #ifdef CONFIG_DEBUG_PREEMPT /* * Check both overflow from migrate_disable() and superfluous * migrate_enable(). 
*/ if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) return; #endif if (p->migration_disabled > 1) { p->migration_disabled--; return; } /* * Ensure stop_task runs either before or after this, and that * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). */ guard(preempt)(); if (unlikely(p->cpus_ptr != &p->cpus_mask)) ___migrate_enable(); /* * Mustn't clear migration_disabled() until cpus_ptr points back at the * regular cpus_mask, otherwise things that race (eg. * select_fallback_rq) get confused. */ barrier(); p->migration_disabled = 0; this_rq_pinned()--; } static inline void __migrate_disable(void) { struct task_struct *p = current; if (p->migration_disabled) { #ifdef CONFIG_DEBUG_PREEMPT /* *Warn about overflow half-way through the range. */ WARN_ON_ONCE((s16)p->migration_disabled < 0); #endif p->migration_disabled++; return; } guard(preempt)(); this_rq_pinned()++; p->migration_disabled = 1; } #else /* !COMPILE_OFFSETS */ static inline void __migrate_disable(void) { } static inline void __migrate_enable(void) { } #endif /* !COMPILE_OFFSETS */ /* * So that it is possible to not export the runqueues variable, define and * export migrate_enable/migrate_disable in kernel/sched/core.c too, and use * them for the modules. The macro "INSTANTIATE_EXPORTED_MIGRATE_DISABLE" will * be defined in kernel/sched/core.c. */ #ifndef INSTANTIATE_EXPORTED_MIGRATE_DISABLE static __always_inline void migrate_disable(void) { __migrate_disable(); } static __always_inline void migrate_enable(void) { __migrate_enable(); } #else /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ extern void migrate_disable(void); extern void migrate_enable(void); #endif /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ #else /* MODULE */ extern void migrate_disable(void); extern void migrate_enable(void); #endif /* MODULE */ DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) #endif |
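
As a usage illustration of cond_resched() and migrate_disable()/migrate_enable(), here is a minimal, hypothetical driver-style sketch. All example_* names and struct example_pcpu_stats are invented for the illustration; this_cpu_read(), smp_processor_id() and pr_debug() are assumed to be available from the usual percpu/smp/printk headers.

#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/sched.h>
#include <linux/smp.h>

/* Hypothetical per-CPU statistics block, used only by this sketch. */
struct example_pcpu_stats {
        unsigned long entries_seen;
};

/*
 * Walk a large table in process context. cond_resched() lets other work
 * run on non-preemptible kernels; with full preemption it is (nearly) a
 * no-op, as the _cond_resched() variants above show.
 */
static void example_scrub_table(u64 *table, size_t nr_entries)
{
        size_t i;

        for (i = 0; i < nr_entries; i++) {
                table[i] ^= (u64)i;     /* stand-in for real per-entry work */
                cond_resched();
        }
}

/*
 * Read this CPU's counter while staying on the same CPU, yet remaining
 * preemptible (unlike get_cpu()/put_cpu(), which disable preemption).
 */
static unsigned long example_read_local(struct example_pcpu_stats __percpu *stats)
{
        unsigned long val;

        migrate_disable();
        val = this_cpu_read(stats->entries_seen);
        pr_debug("example: CPU%d has seen %lu entries\n",
                 smp_processor_id(), val);
        migrate_enable();

        return val;
}

The second helper relies on the property implemented by __migrate_disable()/__migrate_enable() above: the task may still be preempted inside the section, but it is not moved to another CPU until migrate_enable() is called.
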
| 3 10 13 14 15 28 25 23 24 26 12 13 20 7 14 15 4 16 5 18 3 7 6 7 13 13 11 13 12 13 10 12 11 12 12 12 13 11 12 3 12 36 41 10 11 9 2 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/file_table.c * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) */ #include <linux/string.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/init.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/eventpoll.h> #include <linux/rcupdate.h> #include <linux/mount.h> #include <linux/capability.h> #include <linux/cdev.h> #include <linux/fsnotify.h> #include <linux/sysctl.h> #include <linux/percpu_counter.h> #include <linux/percpu.h> #include <linux/task_work.h> #include <linux/swap.h> #include <linux/kmemleak.h> #include <linux/atomic.h> #include <asm/runtime-const.h> #include "internal.h" /* sysctl tunables... 
*/ static struct files_stat_struct files_stat = { .max_files = NR_FILE }; /* SLAB cache for file structures */ static struct kmem_cache *__filp_cache __ro_after_init; #define filp_cache runtime_const_ptr(__filp_cache) static struct kmem_cache *__bfilp_cache __ro_after_init; #define bfilp_cache runtime_const_ptr(__bfilp_cache) static struct percpu_counter nr_files __cacheline_aligned_in_smp; /* Container for backing file with optional user path */ struct backing_file { struct file file; union { struct path user_path; freeptr_t bf_freeptr; }; #ifdef CONFIG_SECURITY void *security; #endif }; #define backing_file(f) container_of(f, struct backing_file, file) const struct path *backing_file_user_path(const struct file *f) { return &backing_file(f)->user_path; } EXPORT_SYMBOL_GPL(backing_file_user_path); void backing_file_set_user_path(struct file *f, const struct path *path) { backing_file(f)->user_path = *path; } EXPORT_SYMBOL_GPL(backing_file_set_user_path); #ifdef CONFIG_SECURITY void *backing_file_security(const struct file *f) { return backing_file(f)->security; } void backing_file_set_security(struct file *f, void *security) { backing_file(f)->security = security; } #endif /* CONFIG_SECURITY */ static inline void backing_file_free(struct backing_file *ff) { security_backing_file_free(&ff->file); path_put(&ff->user_path); kmem_cache_free(bfilp_cache, ff); } static inline void file_free(struct file *f) { security_file_free(f); if (likely(!(f->f_mode & FMODE_NOACCOUNT))) percpu_counter_dec(&nr_files); put_cred(f->f_cred); if (unlikely(f->f_mode & FMODE_BACKING)) { backing_file_free(backing_file(f)); } else { kmem_cache_free(filp_cache, f); } } /* * Return the total number of open files in the system */ static long get_nr_files(void) { return percpu_counter_read_positive(&nr_files); } /* * Return the maximum number of open files in the system */ unsigned long get_max_files(void) { return files_stat.max_files; } EXPORT_SYMBOL_GPL(get_max_files); #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) /* * Handle nr_files sysctl */ static int proc_nr_files(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { files_stat.nr_files = percpu_counter_sum_positive(&nr_files); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } static const struct ctl_table fs_stat_sysctls[] = { { .procname = "file-nr", .data = &files_stat, .maxlen = sizeof(files_stat), .mode = 0444, .proc_handler = proc_nr_files, }, { .procname = "file-max", .data = &files_stat.max_files, .maxlen = sizeof(files_stat.max_files), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra1 = SYSCTL_LONG_ZERO, .extra2 = SYSCTL_LONG_MAX, }, { .procname = "nr_open", .data = &sysctl_nr_open, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec_minmax, .extra1 = &sysctl_nr_open_min, .extra2 = &sysctl_nr_open_max, }, }; static int __init init_fs_stat_sysctls(void) { register_sysctl_init("fs", fs_stat_sysctls); if (IS_ENABLED(CONFIG_BINFMT_MISC)) { struct ctl_table_header *hdr; hdr = register_sysctl_mount_point("fs/binfmt_misc"); kmemleak_not_leak(hdr); } return 0; } fs_initcall(init_fs_stat_sysctls); #endif static int init_file(struct file *f, int flags, const struct cred *cred) { int error; f->f_cred = get_cred(cred); error = security_file_alloc(f); if (unlikely(error)) { put_cred(f->f_cred); return error; } spin_lock_init(&f->f_lock); /* * Note that f_pos_lock is only used for files raising * FMODE_ATOMIC_POS and directories. 
Other files such as pipes * don't need it and since f_pos_lock is in a union may reuse * the space for other purposes. They are expected to initialize * the respective member when opening the file. */ mutex_init(&f->f_pos_lock); memset(&f->__f_path, 0, sizeof(f->f_path)); memset(&f->f_ra, 0, sizeof(f->f_ra)); f->f_flags = flags; f->f_mode = OPEN_FMODE(flags); /* * Disable permission and pre-content events for all files by default. * They may be enabled later by fsnotify_open_perm_and_set_mode(). */ file_set_fsnotify_mode(f, FMODE_NONOTIFY_PERM); f->f_op = NULL; f->f_mapping = NULL; f->private_data = NULL; f->f_inode = NULL; f->f_owner = NULL; #ifdef CONFIG_EPOLL f->f_ep = NULL; #endif f->f_iocb_flags = 0; f->f_pos = 0; f->f_wb_err = 0; f->f_sb_err = 0; /* * We're SLAB_TYPESAFE_BY_RCU so initialize f_ref last. While * fget-rcu pattern users need to be able to handle spurious * refcount bumps we should reinitialize the reused file first. */ file_ref_init(&f->f_ref, 1); return 0; } /* Find an unused file structure and return a pointer to it. * Returns an error pointer if some error happend e.g. we over file * structures limit, run out of memory or operation is not permitted. * * Be very careful using this. You are responsible for * getting write access to any mount that you might assign * to this filp, if it is opened for write. If this is not * done, you will imbalance int the mount's writer count * and a warning at __fput() time. */ struct file *alloc_empty_file(int flags, const struct cred *cred) { static long old_max; struct file *f; int error; /* * Privileged users can go above max_files */ if (unlikely(get_nr_files() >= files_stat.max_files) && !capable(CAP_SYS_ADMIN)) { /* * percpu_counters are inaccurate. Do an expensive check before * we go and fail. */ if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files) goto over; } f = kmem_cache_alloc(filp_cache, GFP_KERNEL); if (unlikely(!f)) return ERR_PTR(-ENOMEM); error = init_file(f, flags, cred); if (unlikely(error)) { kmem_cache_free(filp_cache, f); return ERR_PTR(error); } percpu_counter_inc(&nr_files); return f; over: /* Ran out of filps - report that */ if (get_nr_files() > old_max) { pr_info("VFS: file-max limit %lu reached\n", get_max_files()); old_max = get_nr_files(); } return ERR_PTR(-ENFILE); } /* * Variant of alloc_empty_file() that doesn't check and modify nr_files. * * This is only for kernel internal use, and the allocate file must not be * installed into file tables or such. */ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred) { struct file *f; int error; f = kmem_cache_alloc(filp_cache, GFP_KERNEL); if (unlikely(!f)) return ERR_PTR(-ENOMEM); error = init_file(f, flags, cred); if (unlikely(error)) { kmem_cache_free(filp_cache, f); return ERR_PTR(error); } f->f_mode |= FMODE_NOACCOUNT; return f; } static int init_backing_file(struct backing_file *ff, const struct file *user_file) { memset(&ff->user_path, 0, sizeof(ff->user_path)); backing_file_set_security(&ff->file, NULL); return security_backing_file_alloc(&ff->file, user_file); } /* * Variant of alloc_empty_file() that allocates a backing_file container * and doesn't check and modify nr_files. * * This is only for kernel internal use, and the allocate file must not be * installed into file tables or such. 
*/ struct file *alloc_empty_backing_file(int flags, const struct cred *cred, const struct file *user_file) { struct backing_file *ff; int error; ff = kmem_cache_alloc(bfilp_cache, GFP_KERNEL); if (unlikely(!ff)) return ERR_PTR(-ENOMEM); error = init_file(&ff->file, flags, cred); if (unlikely(error)) { kmem_cache_free(bfilp_cache, ff); return ERR_PTR(error); } /* The f_mode flags must be set before fput(). */ ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT; error = init_backing_file(ff, user_file); if (unlikely(error)) { fput(&ff->file); return ERR_PTR(error); } return &ff->file; } EXPORT_SYMBOL_GPL(alloc_empty_backing_file); /** * file_init_path - initialize a 'struct file' based on path * * @file: the file to set up * @path: the (dentry, vfsmount) pair for the new file * @fop: the 'struct file_operations' for the new file */ static void file_init_path(struct file *file, const struct path *path, const struct file_operations *fop) { file->__f_path = *path; file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; file->f_wb_err = filemap_sample_wb_err(file->f_mapping); file->f_sb_err = file_sample_sb_err(file); if (fop->llseek) file->f_mode |= FMODE_LSEEK; if ((file->f_mode & FMODE_READ) && likely(fop->read || fop->read_iter)) file->f_mode |= FMODE_CAN_READ; if ((file->f_mode & FMODE_WRITE) && likely(fop->write || fop->write_iter)) file->f_mode |= FMODE_CAN_WRITE; file->f_iocb_flags = iocb_flags(file); file->f_mode |= FMODE_OPENED; file->f_op = fop; if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_inc(path->dentry->d_inode); } /** * alloc_file - allocate and initialize a 'struct file' * * @path: the (dentry, vfsmount) pair for the new file * @flags: O_... flags with which the new file will be opened * @fop: the 'struct file_operations' for the new file */ static struct file *alloc_file(const struct path *path, int flags, const struct file_operations *fop) { struct file *file; file = alloc_empty_file(flags, current_cred()); if (!IS_ERR(file)) file_init_path(file, path, fop); return file; } static inline int alloc_path_pseudo(const char *name, struct inode *inode, struct vfsmount *mnt, struct path *path) { path->dentry = d_alloc_pseudo(mnt->mnt_sb, &QSTR(name)); if (!path->dentry) return -ENOMEM; path->mnt = mntget(mnt); d_instantiate(path->dentry, inode); return 0; } struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, const char *name, int flags, const struct file_operations *fops) { int ret; struct path path; struct file *file; ret = alloc_path_pseudo(name, inode, mnt, &path); if (ret) return ERR_PTR(ret); file = alloc_file(&path, flags, fops); if (IS_ERR(file)) { ihold(inode); path_put(&path); return file; } /* * Disable all fsnotify events for pseudo files by default. * They may be enabled by caller with file_set_fsnotify_mode(). */ file_set_fsnotify_mode(file, FMODE_NONOTIFY); return file; } EXPORT_SYMBOL(alloc_file_pseudo); struct file *alloc_file_pseudo_noaccount(struct inode *inode, struct vfsmount *mnt, const char *name, int flags, const struct file_operations *fops) { int ret; struct path path; struct file *file; ret = alloc_path_pseudo(name, inode, mnt, &path); if (ret) return ERR_PTR(ret); file = alloc_empty_file_noaccount(flags, current_cred()); if (IS_ERR(file)) { ihold(inode); path_put(&path); return file; } file_init_path(file, &path, fops); /* * Disable all fsnotify events for pseudo files by default. * They may be enabled by caller with file_set_fsnotify_mode(). 
*/ file_set_fsnotify_mode(file, FMODE_NONOTIFY); return file; } EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount); struct file *alloc_file_clone(struct file *base, int flags, const struct file_operations *fops) { struct file *f; f = alloc_file(&base->f_path, flags, fops); if (!IS_ERR(f)) { path_get(&f->f_path); f->f_mapping = base->f_mapping; } return f; } /* the real guts of fput() - releasing the last reference to file */ static void __fput(struct file *file) { struct dentry *dentry = file->f_path.dentry; struct vfsmount *mnt = file->f_path.mnt; struct inode *inode = file->f_inode; fmode_t mode = file->f_mode; if (unlikely(!(file->f_mode & FMODE_OPENED))) goto out; might_sleep(); fsnotify_close(file); /* * The function eventpoll_release() should be the first called * in the file cleanup chain. */ eventpoll_release(file); locks_remove_file(file); security_file_release(file); if (unlikely(file->f_flags & FASYNC)) { if (file->f_op->fasync) file->f_op->fasync(-1, file, 0); } if (file->f_op->release) file->f_op->release(inode, file); if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL && !(mode & FMODE_PATH))) { cdev_put(inode->i_cdev); } fops_put(file->f_op); file_f_owner_release(file); put_file_access(file); dput(dentry); if (unlikely(mode & FMODE_NEED_UNMOUNT)) dissolve_on_fput(mnt); mntput(mnt); out: file_free(file); } static LLIST_HEAD(delayed_fput_list); static void delayed_fput(struct work_struct *unused) { struct llist_node *node = llist_del_all(&delayed_fput_list); struct file *f, *t; llist_for_each_entry_safe(f, t, node, f_llist) __fput(f); } static void ____fput(struct callback_head *work) { __fput(container_of(work, struct file, f_task_work)); } static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); /* * If kernel thread really needs to have the final fput() it has done * to complete, call this. The only user right now is the boot - we * *do* need to make sure our writes to binaries on initramfs has * not left us with opened struct file waiting for __fput() - execve() * won't work without that. Please, don't add more callers without * very good reasons; in particular, never call that with locks * held and never call that from a thread that might need to do * some work on any kind of umount. */ void flush_delayed_fput(void) { delayed_fput(NULL); flush_delayed_work(&delayed_fput_work); } EXPORT_SYMBOL_GPL(flush_delayed_fput); static void __fput_deferred(struct file *file) { struct task_struct *task = current; if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) { file_free(file); return; } if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { init_task_work(&file->f_task_work, ____fput); if (!task_work_add(task, &file->f_task_work, TWA_RESUME)) return; /* * After this task has run exit_task_work(), * task_work_add() will fail. Fall through to delayed * fput to avoid leaking *file. */ } if (llist_add(&file->f_llist, &delayed_fput_list)) schedule_delayed_work(&delayed_fput_work, 1); } void fput(struct file *file) { if (unlikely(file_ref_put(&file->f_ref))) __fput_deferred(file); } EXPORT_SYMBOL(fput); /* * synchronous analog of fput(); for kernel threads that might be needed * in some umount() (and thus can't use flush_delayed_fput() without * risking deadlocks), need to wait for completion of __fput() and know * for this specific struct file it won't involve anything that would * need them. Use only if you really need it - at the very least, * don't blindly convert fput() by kernel thread to that. 
 */
void __fput_sync(struct file *file)
{
        if (file_ref_put(&file->f_ref))
                __fput(file);
}
EXPORT_SYMBOL(__fput_sync);

/*
 * Equivalent to __fput_sync(), but optimized for being called with the last
 * reference.
 *
 * See file_ref_put_close() for details.
 */
void fput_close_sync(struct file *file)
{
        if (likely(file_ref_put_close(&file->f_ref)))
                __fput(file);
}

/*
 * Equivalent to fput(), but optimized for being called with the last
 * reference.
 *
 * See file_ref_put_close() for details.
 */
void fput_close(struct file *file)
{
        if (file_ref_put_close(&file->f_ref))
                __fput_deferred(file);
}

void __init files_init(void)
{
        struct kmem_cache_args args = {
                .use_freeptr_offset = true,
                .freeptr_offset = offsetof(struct file, f_freeptr),
        };

        __filp_cache = kmem_cache_create("filp", sizeof(struct file), &args,
                                SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
        runtime_const_init(ptr, __filp_cache);

        args.freeptr_offset = offsetof(struct backing_file, bf_freeptr);
        __bfilp_cache = kmem_cache_create("bfilp", sizeof(struct backing_file),
                                &args, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
        runtime_const_init(ptr, __bfilp_cache);

        percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}

/*
 * One file with associated inode and dcache is very roughly 1K. By default
 * do not use more than 10% of our memory for files.
 */
void __init files_maxfiles_init(void)
{
        unsigned long n;
        unsigned long nr_pages = totalram_pages();
        unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2;

        memreserve = min(memreserve, nr_pages - 1);
        n = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;

        files_stat.max_files = max_t(unsigned long, n, NR_FILE);
}
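
To put the fput() variants above in context, here is a small, hypothetical sketch of kernel code that opens a file and drops its reference. example_file_size() is invented; filp_open(), file_inode() and i_size_read() are assumed to come from <linux/fs.h> as usual. The point is the release side: ordinary code calls fput(), whose final __fput() is deferred to task_work (or the delayed-fput list for kernel threads) exactly as implemented above, while only callers that must guarantee completion, such as the boot-time initramfs case, use __fput_sync() or flush_delayed_fput().

#include <linux/err.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/fs.h>

/* Hypothetical helper: return the current size of the file at @path. */
static loff_t example_file_size(const char *path)
{
        struct file *filp;
        loff_t size;

        filp = filp_open(path, O_RDONLY, 0);
        if (IS_ERR(filp))
                return PTR_ERR(filp);

        size = i_size_read(file_inode(filp));

        /*
         * Drop the reference taken by filp_open(). If it is the last one,
         * __fput() runs later from task_work, not synchronously here.
         */
        fput(filp);
        return size;
}
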
| 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 | // SPDX-License-Identifier: GPL-2.0 /* * The USB Monitor, inspired by Dave Harding's USBMon. * * mon_main.c: Main file, module initiation and exit, registrations, etc. * * Copyright (C) 2005 Pete Zaitcev (zaitcev@redhat.com) */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/usb.h> #include <linux/usb/hcd.h> #include <linux/slab.h> #include <linux/notifier.h> #include <linux/mutex.h> #include "usb_mon.h" static void mon_stop(struct mon_bus *mbus); static void mon_dissolve(struct mon_bus *mbus, struct usb_bus *ubus); static void mon_bus_drop(struct kref *r); static void mon_bus_init(struct usb_bus *ubus); DEFINE_MUTEX(mon_lock); struct mon_bus mon_bus0; /* Pseudo bus meaning "all buses" */ static LIST_HEAD(mon_buses); /* All buses we know: struct mon_bus */ /* * Link a reader into the bus. * * This must be called with mon_lock taken because of mbus->ref. */ void mon_reader_add(struct mon_bus *mbus, struct mon_reader *r) { unsigned long flags; struct list_head *p; spin_lock_irqsave(&mbus->lock, flags); if (mbus->nreaders == 0) { if (mbus == &mon_bus0) { list_for_each (p, &mon_buses) { struct mon_bus *m1; m1 = list_entry(p, struct mon_bus, bus_link); m1->u_bus->monitored = 1; } } else { mbus->u_bus->monitored = 1; } } mbus->nreaders++; list_add_tail(&r->r_link, &mbus->r_list); spin_unlock_irqrestore(&mbus->lock, flags); kref_get(&mbus->ref); } /* * Unlink reader from the bus. * * This is called with mon_lock taken, so we can decrement mbus->ref. 
*/ void mon_reader_del(struct mon_bus *mbus, struct mon_reader *r) { unsigned long flags; spin_lock_irqsave(&mbus->lock, flags); list_del(&r->r_link); --mbus->nreaders; if (mbus->nreaders == 0) mon_stop(mbus); spin_unlock_irqrestore(&mbus->lock, flags); kref_put(&mbus->ref, mon_bus_drop); } /* */ static void mon_bus_submit(struct mon_bus *mbus, struct urb *urb) { unsigned long flags; struct mon_reader *r; spin_lock_irqsave(&mbus->lock, flags); mbus->cnt_events++; list_for_each_entry(r, &mbus->r_list, r_link) r->rnf_submit(r->r_data, urb); spin_unlock_irqrestore(&mbus->lock, flags); } static void mon_submit(struct usb_bus *ubus, struct urb *urb) { struct mon_bus *mbus; mbus = ubus->mon_bus; if (mbus != NULL) mon_bus_submit(mbus, urb); mon_bus_submit(&mon_bus0, urb); } /* */ static void mon_bus_submit_error(struct mon_bus *mbus, struct urb *urb, int error) { unsigned long flags; struct mon_reader *r; spin_lock_irqsave(&mbus->lock, flags); mbus->cnt_events++; list_for_each_entry(r, &mbus->r_list, r_link) r->rnf_error(r->r_data, urb, error); spin_unlock_irqrestore(&mbus->lock, flags); } static void mon_submit_error(struct usb_bus *ubus, struct urb *urb, int error) { struct mon_bus *mbus; mbus = ubus->mon_bus; if (mbus != NULL) mon_bus_submit_error(mbus, urb, error); mon_bus_submit_error(&mon_bus0, urb, error); } /* */ static void mon_bus_complete(struct mon_bus *mbus, struct urb *urb, int status) { unsigned long flags; struct mon_reader *r; spin_lock_irqsave(&mbus->lock, flags); mbus->cnt_events++; list_for_each_entry(r, &mbus->r_list, r_link) r->rnf_complete(r->r_data, urb, status); spin_unlock_irqrestore(&mbus->lock, flags); } static void mon_complete(struct usb_bus *ubus, struct urb *urb, int status) { struct mon_bus *mbus; mbus = ubus->mon_bus; if (mbus != NULL) mon_bus_complete(mbus, urb, status); mon_bus_complete(&mon_bus0, urb, status); } /* int (*unlink_urb) (struct urb *urb, int status); */ /* * Stop monitoring. */ static void mon_stop(struct mon_bus *mbus) { struct usb_bus *ubus; if (mbus == &mon_bus0) { list_for_each_entry(mbus, &mon_buses, bus_link) { /* * We do not change nreaders here, so rely on mon_lock. */ if (mbus->nreaders == 0 && (ubus = mbus->u_bus) != NULL) ubus->monitored = 0; } } else { /* * A stop can be called for a dissolved mon_bus in case of * a reader staying across an rmmod foo_hcd, so test ->u_bus. */ if (mon_bus0.nreaders == 0 && (ubus = mbus->u_bus) != NULL) { ubus->monitored = 0; mb(); } } } /* * Add a USB bus (usually by a modprobe foo-hcd) * * This does not return an error code because the core cannot care less * if monitoring is not established. */ static void mon_bus_add(struct usb_bus *ubus) { mon_bus_init(ubus); mutex_lock(&mon_lock); if (mon_bus0.nreaders != 0) ubus->monitored = 1; mutex_unlock(&mon_lock); } /* * Remove a USB bus (either from rmmod foo-hcd or from a hot-remove event). 
*/ static void mon_bus_remove(struct usb_bus *ubus) { struct mon_bus *mbus = ubus->mon_bus; mutex_lock(&mon_lock); list_del(&mbus->bus_link); if (mbus->text_inited) mon_text_del(mbus); if (mbus->bin_inited) mon_bin_del(mbus); mon_dissolve(mbus, ubus); kref_put(&mbus->ref, mon_bus_drop); mutex_unlock(&mon_lock); } static int mon_notify(struct notifier_block *self, unsigned long action, void *dev) { switch (action) { case USB_BUS_ADD: mon_bus_add(dev); break; case USB_BUS_REMOVE: mon_bus_remove(dev); } return NOTIFY_OK; } static struct notifier_block mon_nb = { .notifier_call = mon_notify, }; /* * Ops */ static const struct usb_mon_operations mon_ops_0 = { .urb_submit = mon_submit, .urb_submit_error = mon_submit_error, .urb_complete = mon_complete, }; /* * Tear usb_bus and mon_bus apart. */ static void mon_dissolve(struct mon_bus *mbus, struct usb_bus *ubus) { if (ubus->monitored) { ubus->monitored = 0; mb(); } ubus->mon_bus = NULL; mbus->u_bus = NULL; mb(); /* We want synchronize_irq() here, but that needs an argument. */ } /* */ static void mon_bus_drop(struct kref *r) { struct mon_bus *mbus = container_of(r, struct mon_bus, ref); kfree(mbus); } /* * Initialize a bus for us: * - allocate mon_bus * - refcount USB bus struct * - link */ static void mon_bus_init(struct usb_bus *ubus) { struct mon_bus *mbus; mbus = kzalloc_obj(struct mon_bus); if (mbus == NULL) goto err_alloc; kref_init(&mbus->ref); spin_lock_init(&mbus->lock); INIT_LIST_HEAD(&mbus->r_list); /* * We don't need to take a reference to ubus, because we receive * a notification if the bus is about to be removed. */ mbus->u_bus = ubus; ubus->mon_bus = mbus; mbus->text_inited = mon_text_add(mbus, ubus); mbus->bin_inited = mon_bin_add(mbus, ubus); mutex_lock(&mon_lock); list_add_tail(&mbus->bus_link, &mon_buses); mutex_unlock(&mon_lock); return; err_alloc: return; } static void mon_bus0_init(void) { struct mon_bus *mbus = &mon_bus0; kref_init(&mbus->ref); spin_lock_init(&mbus->lock); INIT_LIST_HEAD(&mbus->r_list); mbus->text_inited = mon_text_add(mbus, NULL); mbus->bin_inited = mon_bin_add(mbus, NULL); } /* * Search a USB bus by number. Notice that USB bus numbers start from one, * which we may later use to identify "all" with zero. * * This function must be called with mon_lock held. * * This is obviously inefficient and may be revised in the future. 
*/ struct mon_bus *mon_bus_lookup(unsigned int num) { struct mon_bus *mbus; if (num == 0) { return &mon_bus0; } list_for_each_entry(mbus, &mon_buses, bus_link) { if (mbus->u_bus->busnum == num) { return mbus; } } return NULL; } static int __init mon_init(void) { struct usb_bus *ubus; int rc, id; if ((rc = mon_text_init()) != 0) goto err_text; if ((rc = mon_bin_init()) != 0) goto err_bin; mon_bus0_init(); if (usb_mon_register(&mon_ops_0) != 0) { printk(KERN_NOTICE TAG ": unable to register with the core\n"); rc = -ENODEV; goto err_reg; } // MOD_INC_USE_COUNT(which_module?); mutex_lock(&usb_bus_idr_lock); idr_for_each_entry(&usb_bus_idr, ubus, id) mon_bus_init(ubus); usb_register_notify(&mon_nb); mutex_unlock(&usb_bus_idr_lock); return 0; err_reg: mon_bin_exit(); err_bin: mon_text_exit(); err_text: return rc; } static void __exit mon_exit(void) { struct mon_bus *mbus; struct list_head *p; usb_unregister_notify(&mon_nb); usb_mon_deregister(); mutex_lock(&mon_lock); while (!list_empty(&mon_buses)) { p = mon_buses.next; mbus = list_entry(p, struct mon_bus, bus_link); list_del(p); if (mbus->text_inited) mon_text_del(mbus); if (mbus->bin_inited) mon_bin_del(mbus); /* * This never happens, because the open/close paths in * file level maintain module use counters and so rmmod fails * before reaching here. However, better be safe... */ if (mbus->nreaders) { printk(KERN_ERR TAG ": Outstanding opens (%d) on usb%d, leaking...\n", mbus->nreaders, mbus->u_bus->busnum); kref_get(&mbus->ref); /* Force leak */ } mon_dissolve(mbus, mbus->u_bus); kref_put(&mbus->ref, mon_bus_drop); } mbus = &mon_bus0; if (mbus->text_inited) mon_text_del(mbus); if (mbus->bin_inited) mon_bin_del(mbus); mutex_unlock(&mon_lock); mon_text_exit(); mon_bin_exit(); } module_init(mon_init); module_exit(mon_exit); MODULE_DESCRIPTION("USB Monitor"); MODULE_LICENSE("GPL"); |
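
The mon_notify()/usb_register_notify() pairing above is the generic pattern for tracking host-controller buses; the hypothetical module below shows it in isolation. All example_* names are invented; usb_register_notify(), usb_unregister_notify(), USB_BUS_ADD and USB_BUS_REMOVE are the same interfaces used by mon_init() and mon_exit() above, and, as in mon_notify(), the void *dev argument is the struct usb_bus being added or removed.

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/usb.h>

static int example_bus_notify(struct notifier_block *self,
                              unsigned long action, void *dev)
{
        struct usb_bus *ubus = dev;

        switch (action) {
        case USB_BUS_ADD:
                pr_info("example: usb%d added\n", ubus->busnum);
                break;
        case USB_BUS_REMOVE:
                pr_info("example: usb%d removed\n", ubus->busnum);
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block example_nb = {
        .notifier_call = example_bus_notify,
};

static int __init example_init(void)
{
        usb_register_notify(&example_nb);
        return 0;
}

static void __exit example_exit(void)
{
        usb_unregister_notify(&example_nb);
}

module_init(example_init);
module_exit(example_exit);
MODULE_DESCRIPTION("Example USB bus notifier (illustrative only)");
MODULE_LICENSE("GPL");
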
| 2 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_USB_TYPEC_H #define __LINUX_USB_TYPEC_H #include <linux/types.h> /* USB Type-C Specification releases */ #define USB_TYPEC_REV_1_0 0x100 /* 1.0 */ #define USB_TYPEC_REV_1_1 0x110 /* 1.1 */ #define USB_TYPEC_REV_1_2 0x120 /* 1.2 */ #define USB_TYPEC_REV_1_3 0x130 /* 1.3 */ #define USB_TYPEC_REV_1_4 0x140 /* 1.4 */ #define USB_TYPEC_REV_2_0 0x200 /* 2.0 */ struct typec_partner; struct typec_cable; struct typec_plug; struct typec_port; struct typec_altmode_ops; struct typec_cable_ops; struct bus_type; struct fwnode_handle; struct device; struct usb_power_delivery; struct usb_power_delivery_desc; extern const struct bus_type typec_bus; enum typec_port_type { TYPEC_PORT_SRC, TYPEC_PORT_SNK, TYPEC_PORT_DRP, }; enum typec_port_data { TYPEC_PORT_DFP, TYPEC_PORT_UFP, TYPEC_PORT_DRD, }; enum typec_plug_type { USB_PLUG_NONE, USB_PLUG_TYPE_A, USB_PLUG_TYPE_B, USB_PLUG_TYPE_C, USB_PLUG_CAPTIVE, }; enum typec_data_role { TYPEC_DEVICE, TYPEC_HOST, }; enum typec_role { TYPEC_SINK, TYPEC_SOURCE, }; static inline int is_sink(enum typec_role role) { return role == TYPEC_SINK; } static inline int is_source(enum typec_role role) { return role == TYPEC_SOURCE; } enum typec_pwr_opmode { TYPEC_PWR_MODE_USB, TYPEC_PWR_MODE_1_5A, TYPEC_PWR_MODE_3_0A, TYPEC_PWR_MODE_PD, }; enum typec_accessory { TYPEC_ACCESSORY_NONE, TYPEC_ACCESSORY_AUDIO, TYPEC_ACCESSORY_DEBUG, }; #define TYPEC_MAX_ACCESSORY 3 enum typec_orientation { TYPEC_ORIENTATION_NONE, TYPEC_ORIENTATION_NORMAL, TYPEC_ORIENTATION_REVERSE, }; enum usb_mode { USB_MODE_NONE, USB_MODE_USB2, USB_MODE_USB3, USB_MODE_USB4 }; #define USB_CAPABILITY_USB2 BIT(0) #define USB_CAPABILITY_USB3 BIT(1) #define USB_CAPABILITY_USB4 BIT(2) /* * struct enter_usb_data - Enter_USB Message details * @eudo: Enter_USB Data Object * @active_link_training: Active Cable Plug Link Training * * @active_link_training is a flag that should be set with uni-directional SBRX * communication, and left 0 
with passive cables and with bi-directional SBRX * communication. */ struct enter_usb_data { u32 eudo; unsigned char active_link_training:1; }; /* * struct usb_pd_identity - USB Power Delivery identity data * @id_header: ID Header VDO * @cert_stat: Cert Stat VDO * @product: Product VDO * @vdo: Product Type Specific VDOs * * USB power delivery Discover Identity command response data. * * REVISIT: This is USB Power Delivery specific information, so this structure * probable belongs to USB Power Delivery header file once we have them. */ struct usb_pd_identity { u32 id_header; u32 cert_stat; u32 product; u32 vdo[3]; }; int typec_partner_set_identity(struct typec_partner *partner); int typec_cable_set_identity(struct typec_cable *cable); /* * struct typec_altmode_desc - USB Type-C Alternate Mode Descriptor * @svid: Standard or Vendor ID * @mode: Index of the Mode * @vdo: VDO returned by Discover Modes USB PD command * @roles: Only for ports. DRP if the mode is available in both roles * @inactive: Only for ports. Make this port inactive (default is active). * * Description of an Alternate Mode which a connector, cable plug or partner * supports. */ struct typec_altmode_desc { u16 svid; u8 mode; u32 vdo; /* Only used with ports */ enum typec_port_data roles; bool inactive; bool mode_selection; }; void typec_partner_set_pd_revision(struct typec_partner *partner, u16 pd_revision); int typec_partner_set_num_altmodes(struct typec_partner *partner, int num_altmodes); struct typec_altmode *typec_partner_register_altmode(struct typec_partner *partner, const struct typec_altmode_desc *desc); int typec_plug_set_num_altmodes(struct typec_plug *plug, int num_altmodes); struct typec_altmode *typec_plug_register_altmode(struct typec_plug *plug, const struct typec_altmode_desc *desc); struct typec_altmode *typec_port_register_altmode(struct typec_port *port, const struct typec_altmode_desc *desc); void typec_port_register_altmodes(struct typec_port *port, const struct typec_altmode_ops *ops, void *drvdata, struct typec_altmode **altmodes, size_t n); void typec_port_register_cable_ops(struct typec_altmode **altmodes, int max_altmodes, const struct typec_cable_ops *ops); void typec_unregister_altmode(struct typec_altmode *altmode); struct typec_port *typec_altmode2port(struct typec_altmode *alt); void typec_altmode_update_active(struct typec_altmode *alt, bool active); void typec_altmode_set_ops(struct typec_altmode *alt, const struct typec_altmode_ops *ops); enum typec_plug_index { TYPEC_PLUG_SOP_P, TYPEC_PLUG_SOP_PP, }; /* * struct typec_plug_desc - USB Type-C Cable Plug Descriptor * @index: SOP Prime for the plug connected to DFP and SOP Double Prime for the * plug connected to UFP * * Represents USB Type-C Cable Plug. */ struct typec_plug_desc { enum typec_plug_index index; }; /* * struct typec_cable_desc - USB Type-C Cable Descriptor * @type: The plug type from USB PD Cable VDO * @active: Is the cable active or passive * @identity: Result of Discover Identity command * @pd_revision: USB Power Delivery Specification revision if supported * * Represents USB Type-C Cable attached to USB Type-C port. */ struct typec_cable_desc { enum typec_plug_type type; unsigned int active:1; struct usb_pd_identity *identity; u16 pd_revision; /* 0300H = "3.0" */ }; /* * struct typec_partner_desc - USB Type-C Partner Descriptor * @usb_pd: USB Power Delivery support * @accessory: Audio, Debug or none. 
* @identity: Discover Identity command data * @pd_revision: USB Power Delivery Specification Revision if supported * @usb_capability: Supported USB Modes * @attach: Notification about attached USB device * @deattach: Notification about removed USB device * * Details about a partner that is attached to USB Type-C port. If @identity * member exists when partner is registered, a directory named "identity" is * created to sysfs for the partner device. * * @pd_revision is based on the setting of the "Specification Revision" field * in the message header on the initial "Source Capabilities" message received * from the partner, or a "Request" message received from the partner, depending * on whether our port is a Sink or a Source. */ struct typec_partner_desc { unsigned int usb_pd:1; enum typec_accessory accessory; struct usb_pd_identity *identity; u16 pd_revision; /* 0300H = "3.0" */ u8 usb_capability; void (*attach)(struct typec_partner *partner, struct device *dev); void (*deattach)(struct typec_partner *partner, struct device *dev); }; /** * struct typec_operations - USB Type-C Port Operations * @try_role: Set data role preference for DRP port * @dr_set: Set Data Role * @pr_set: Set Power Role * @vconn_set: Source VCONN * @port_type_set: Set port type * @pd_get: Get available USB Power Delivery Capabilities. * @pd_set: Set USB Power Delivery Capabilities. * @default_usb_mode_set: USB Mode to be used by default with Enter_USB Message * @enter_usb_mode: Change the active USB Mode */ struct typec_operations { int (*try_role)(struct typec_port *port, int role); int (*dr_set)(struct typec_port *port, enum typec_data_role role); int (*pr_set)(struct typec_port *port, enum typec_role role); int (*vconn_set)(struct typec_port *port, enum typec_role role); int (*port_type_set)(struct typec_port *port, enum typec_port_type type); struct usb_power_delivery **(*pd_get)(struct typec_port *port); int (*pd_set)(struct typec_port *port, struct usb_power_delivery *pd); int (*default_usb_mode_set)(struct typec_port *port, enum usb_mode mode); int (*enter_usb_mode)(struct typec_port *port, enum usb_mode mode); }; enum usb_pd_svdm_ver { SVDM_VER_1_0 = 0, SVDM_VER_2_0 = 1, SVDM_VER_MAX = SVDM_VER_2_0, }; /* * struct typec_capability - USB Type-C Port Capabilities * @type: Supported power role of the port * @data: Supported data role of the port * @revision: USB Type-C Specification release. Binary coded decimal * @pd_revision: USB Power Delivery Specification revision if supported * @svdm_version: USB PD Structured VDM version if supported * @prefer_role: Initial role preference (DRP ports). * @accessory: Supported Accessory Modes * @usb_capability: Supported USB Modes * @no_mode_control: Ability to manage Alternate Modes * @fwnode: Optional fwnode of the port * @driver_data: Private pointer for driver specific info * @pd: Optional USB Power Delivery Support * @ops: Port operations vector * * Static capabilities of a single USB Type-C port. */ struct typec_capability { enum typec_port_type type; enum typec_port_data data; u16 revision; /* 0120H = "1.2" */ u16 pd_revision; /* 0300H = "3.0" */ enum usb_pd_svdm_ver svdm_version; int prefer_role; enum typec_accessory accessory[TYPEC_MAX_ACCESSORY]; unsigned int orientation_aware:1; u8 usb_capability; bool no_mode_control; struct fwnode_handle *fwnode; void *driver_data; struct usb_power_delivery *pd; const struct typec_operations *ops; }; /* Specific to try_role(). Indicates the user want's to clear the preference. 
*/ #define TYPEC_NO_PREFERRED_ROLE (-1) struct typec_port *typec_register_port(struct device *parent, const struct typec_capability *cap); void typec_unregister_port(struct typec_port *port); struct typec_partner *typec_register_partner(struct typec_port *port, struct typec_partner_desc *desc); void typec_unregister_partner(struct typec_partner *partner); struct typec_cable *typec_register_cable(struct typec_port *port, struct typec_cable_desc *desc); void typec_unregister_cable(struct typec_cable *cable); struct typec_cable *typec_cable_get(struct typec_port *port); void typec_cable_put(struct typec_cable *cable); int typec_cable_is_active(struct typec_cable *cable); struct typec_plug *typec_register_plug(struct typec_cable *cable, struct typec_plug_desc *desc); void typec_unregister_plug(struct typec_plug *plug); void typec_set_data_role(struct typec_port *port, enum typec_data_role role); enum typec_data_role typec_get_data_role(struct typec_port *port); void typec_set_pwr_role(struct typec_port *port, enum typec_role role); void typec_set_vconn_role(struct typec_port *port, enum typec_role role); void typec_set_pwr_opmode(struct typec_port *port, enum typec_pwr_opmode mode); int typec_set_orientation(struct typec_port *port, enum typec_orientation orientation); enum typec_orientation typec_get_orientation(struct typec_port *port); int typec_set_mode(struct typec_port *port, int mode); void *typec_get_drvdata(struct typec_port *port); int typec_get_fw_cap(struct typec_capability *cap, struct fwnode_handle *fwnode); int typec_find_pwr_opmode(const char *name); int typec_find_orientation(const char *name); int typec_find_port_power_role(const char *name); int typec_find_power_role(const char *name); int typec_find_port_data_role(const char *name); void typec_partner_set_svdm_version(struct typec_partner *partner, enum usb_pd_svdm_ver svdm_version); int typec_get_negotiated_svdm_version(struct typec_port *port); int typec_get_cable_svdm_version(struct typec_port *port); void typec_cable_set_svdm_version(struct typec_cable *cable, enum usb_pd_svdm_ver svdm_version); struct usb_power_delivery *typec_partner_usb_power_delivery_register(struct typec_partner *partner, struct usb_power_delivery_desc *desc); int typec_port_set_usb_power_delivery(struct typec_port *port, struct usb_power_delivery *pd); int typec_partner_set_usb_power_delivery(struct typec_partner *partner, struct usb_power_delivery *pd); void typec_partner_set_usb_mode(struct typec_partner *partner, enum usb_mode usb_mode); void typec_port_set_usb_mode(struct typec_port *port, enum usb_mode mode); /** * struct typec_connector - Representation of Type-C port for external drivers * @attach: notification about device removal * @deattach: notification about device removal * * Drivers that control the USB and other ports (DisplayPorts, etc.), that are * connected to the Type-C connectors, can use these callbacks to inform the * Type-C connector class about connections and disconnections. That information * can then be used by the typec-port drivers to power on or off parts that are * needed or not needed - as an example, in USB mode if USB2 device is * enumerated, USB3 components (retimers, phys, and what have you) do not need * to be powered on. * * The attached (enumerated) devices will be liked with the typec-partner device. 
 */
struct typec_connector {
        void (*attach)(struct typec_connector *con, struct device *dev);
        void (*deattach)(struct typec_connector *con, struct device *dev);
};

static inline void typec_attach(struct typec_connector *con, struct device *dev)
{
        if (con && con->attach)
                con->attach(con, dev);
}

static inline void typec_deattach(struct typec_connector *con, struct device *dev)
{
        if (con && con->deattach)
                con->deattach(con, dev);
}

#endif /* __LINUX_USB_TYPEC_H */
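
To show how the declarations above fit together, here is a condensed, hypothetical port-driver sketch: fill in struct typec_capability, point it at a struct typec_operations vector, and hand both to typec_register_port(). The example_* names are invented and the callbacks are empty stubs; a real driver would program its controller in them, check the returned pointer with IS_ERR(), keep the struct typec_port for typec_unregister_port(), and register partners and cables against it with the functions declared earlier in the header.

#include <linux/usb/typec.h>

static int example_dr_set(struct typec_port *port, enum typec_data_role role)
{
        /* Program the controller for host or device data role here. */
        return 0;
}

static int example_pr_set(struct typec_port *port, enum typec_role role)
{
        /* Switch between source and sink power roles here. */
        return 0;
}

static const struct typec_operations example_typec_ops = {
        .dr_set = example_dr_set,
        .pr_set = example_pr_set,
};

static struct typec_port *example_register(struct device *parent, void *drvdata)
{
        struct typec_capability cap = {
                .type           = TYPEC_PORT_DRP,
                .data           = TYPEC_PORT_DRD,
                .revision       = USB_TYPEC_REV_1_3,
                .pd_revision    = 0x300,                /* "3.0" */
                .prefer_role    = TYPEC_NO_PREFERRED_ROLE,
                .accessory      = { TYPEC_ACCESSORY_AUDIO, TYPEC_ACCESSORY_DEBUG },
                .driver_data    = drvdata,
                .ops            = &example_typec_ops,
        };

        return typec_register_port(parent, &cap);
}
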
| 14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BIO_INTEGRITY_H #define _LINUX_BIO_INTEGRITY_H #include <linux/bio.h> enum bip_flags { BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ BIP_DISK_NOCHECK = 1 << 2, /* disable disk integrity checking */ BIP_IP_CHECKSUM = 1 << 3, /* IP checksum */ BIP_COPY_USER = 1 << 4, /* Kernel bounce buffer in use */ BIP_CHECK_GUARD = 1 << 5, /* guard check */ BIP_CHECK_REFTAG = 1 << 6, /* reftag check */ BIP_CHECK_APPTAG = 1 << 7, /* apptag check */ BIP_MEMPOOL = 1 << 15, /* buffer backed by mempool */ }; struct bio_integrity_payload { struct bvec_iter bip_iter; unsigned short bip_vcnt; /* # of integrity bio_vecs */ unsigned short bip_max_vcnt; /* integrity bio_vec slots */ unsigned short bip_flags; /* control flags */ u16 app_tag; /* application tag value */ struct bio_vec *bip_vec; }; #define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_IP_CHECKSUM | \ BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG) #ifdef CONFIG_BLK_DEV_INTEGRITY #define bip_for_each_vec(bvl, bip, iter) \ for_each_bvec(bvl, (bip)->bip_vec, iter, (bip)->bip_iter) #define bio_for_each_integrity_vec(_bvl, _bio, _iter) \ for_each_bio(_bio) \ bip_for_each_vec(_bvl, _bio->bi_integrity, _iter) static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) { if (bio->bi_opf & REQ_INTEGRITY) return bio->bi_integrity; return NULL; } static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) { struct bio_integrity_payload *bip = bio_integrity(bio); if (bip) return bip->bip_flags & flag; return false; } static inline sector_t bip_get_seed(struct bio_integrity_payload *bip) { return bip->bip_iter.bi_sector; } static inline void bip_set_seed(struct bio_integrity_payload *bip, sector_t seed) { bip->bip_iter.bi_sector = seed; } void bio_integrity_init(struct bio *bio, struct bio_integrity_payload *bip, struct bio_vec *bvecs, unsigned int nr_vecs); struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp, unsigned int nr); int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset); int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter); int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta); void bio_integrity_unmap_user(struct bio *bio); void bio_integrity_prep(struct bio *bio, unsigned int action); void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); void bio_integrity_trim(struct bio *bio); int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask); #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) { return NULL; } static inline int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) { return -EINVAL; } static inline int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta) { return -EINVAL; } static inline void bio_integrity_unmap_user(struct bio *bio) { } static 
inline void bio_integrity_prep(struct bio *bio, unsigned int action)
{
}

static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
                                      gfp_t gfp_mask)
{
        return 0;
}

static inline void bio_integrity_advance(struct bio *bio,
                                         unsigned int bytes_done)
{
}

static inline void bio_integrity_trim(struct bio *bio)
{
}

static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag)
{
        return false;
}

static inline struct bio_integrity_payload *
bio_integrity_alloc(struct bio *bio, gfp_t gfp, unsigned int nr)
{
        return ERR_PTR(-EINVAL);
}

static inline int bio_integrity_add_page(struct bio *bio, struct page *page,
                                         unsigned int len, unsigned int offset)
{
        return 0;
}

#endif /* CONFIG_BLK_DEV_INTEGRITY */

void bio_integrity_alloc_buf(struct bio *bio, bool zero_buffer);
void bio_integrity_free_buf(struct bio_integrity_payload *bip);
void bio_integrity_setup_default(struct bio *bio);

unsigned int fs_bio_integrity_alloc(struct bio *bio);
void fs_bio_integrity_free(struct bio *bio);
void fs_bio_integrity_generate(struct bio *bio);
int fs_bio_integrity_verify(struct bio *bio, sector_t sector, unsigned int size);

#endif /* _LINUX_BIO_INTEGRITY_H */
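
As a usage illustration of the payload API above, the sketch below shows the pattern a caller that owns its protection-information buffer might follow: allocate the payload with bio_integrity_alloc() and attach the metadata page with bio_integrity_add_page(). example_attach_pi() is invented; the sketch assumes bio_integrity_alloc() returns an ERR_PTR on failure and that bio_integrity_add_page() reports the number of bytes it accepted, matching the stubs declared above and the way in-tree callers use it.

#include <linux/bio-integrity.h>
#include <linux/bio.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/gfp.h>

/*
 * Hypothetical helper: attach caller-provided integrity metadata to a
 * bio whose data pages have already been added.
 */
static int example_attach_pi(struct bio *bio, struct page *meta_page,
                             unsigned int meta_len)
{
        struct bio_integrity_payload *bip;

        bip = bio_integrity_alloc(bio, GFP_NOIO, 1);
        if (IS_ERR(bip))
                return PTR_ERR(bip);

        if (bio_integrity_add_page(bio, meta_page, meta_len, 0) < meta_len)
                return -ENOMEM;

        return 0;
}
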
| 3 3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 | // SPDX-License-Identifier: GPL-2.0 /* * USB-ACPI glue code * * Copyright 2012 Red Hat <mjg@redhat.com> */ #include <linux/module.h> #include <linux/usb.h> #include <linux/device.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/acpi.h> #include <linux/pci.h> #include <linux/usb/hcd.h> #include "hub.h" /** * usb_acpi_power_manageable - check whether usb port has * acpi power resource. * @hdev: USB device belonging to the usb hub * @index: port index based zero * * Return true if the port has acpi power resource and false if no. */ bool usb_acpi_power_manageable(struct usb_device *hdev, int index) { acpi_handle port_handle; int port1 = index + 1; port_handle = usb_get_hub_port_acpi_handle(hdev, port1); if (port_handle) return acpi_bus_power_manageable(port_handle); else return false; } EXPORT_SYMBOL_GPL(usb_acpi_power_manageable); #define UUID_USB_CONTROLLER_DSM "ce2ee385-00e6-48cb-9f05-2edb927c4899" #define USB_DSM_DISABLE_U1_U2_FOR_PORT 5 /** * usb_acpi_port_lpm_incapable - check if lpm should be disabled for a port. * @hdev: USB device belonging to the usb hub * @index: zero based port index * * Some USB3 ports may not support USB3 link power management U1/U2 states * due to different retimer setup. ACPI provides _DSM method which returns 0x01 * if U1 and U2 states should be disabled. 
Evaluate _DSM with: * Arg0: UUID = ce2ee385-00e6-48cb-9f05-2edb927c4899 * Arg1: Revision ID = 0 * Arg2: Function Index = 5 * Arg3: (empty) * * Return 1 if USB3 port is LPM incapable, negative on error, otherwise 0 */ int usb_acpi_port_lpm_incapable(struct usb_device *hdev, int index) { union acpi_object *obj; acpi_handle port_handle; int port1 = index + 1; guid_t guid; int ret; ret = guid_parse(UUID_USB_CONTROLLER_DSM, &guid); if (ret) return ret; port_handle = usb_get_hub_port_acpi_handle(hdev, port1); if (!port_handle) { dev_dbg(&hdev->dev, "port-%d no acpi handle\n", port1); return -ENODEV; } if (!acpi_check_dsm(port_handle, &guid, 0, BIT(USB_DSM_DISABLE_U1_U2_FOR_PORT))) { dev_dbg(&hdev->dev, "port-%d no _DSM function %d\n", port1, USB_DSM_DISABLE_U1_U2_FOR_PORT); return -ENODEV; } obj = acpi_evaluate_dsm_typed(port_handle, &guid, 0, USB_DSM_DISABLE_U1_U2_FOR_PORT, NULL, ACPI_TYPE_INTEGER); if (!obj) { dev_dbg(&hdev->dev, "evaluate port-%d _DSM failed\n", port1); return -EINVAL; } if (obj->integer.value == 0x01) ret = 1; ACPI_FREE(obj); return ret; } EXPORT_SYMBOL_GPL(usb_acpi_port_lpm_incapable); /** * usb_acpi_set_power_state - control usb port's power via acpi power * resource * @hdev: USB device belonging to the usb hub * @index: port index based zero * @enable: power state expected to be set * * Notice to use usb_acpi_power_manageable() to check whether the usb port * has acpi power resource before invoking this function. * * Returns 0 on success, else negative errno. */ int usb_acpi_set_power_state(struct usb_device *hdev, int index, bool enable) { struct usb_hub *hub = usb_hub_to_struct_hub(hdev); struct usb_port *port_dev; acpi_handle port_handle; unsigned char state; int port1 = index + 1; int error = -EINVAL; if (!hub) return -ENODEV; port_dev = hub->ports[port1 - 1]; port_handle = (acpi_handle) usb_get_hub_port_acpi_handle(hdev, port1); if (!port_handle) return error; if (enable) state = ACPI_STATE_D0; else state = ACPI_STATE_D3_COLD; error = acpi_bus_set_power(port_handle, state); if (!error) dev_dbg(&port_dev->dev, "acpi: power was set to %d\n", enable); else dev_dbg(&port_dev->dev, "acpi: power failed to be set\n"); return error; } EXPORT_SYMBOL_GPL(usb_acpi_set_power_state); /** * usb_acpi_add_usb4_devlink - add device link to USB4 Host Interface for tunneled USB3 devices * * @udev: Tunneled USB3 device connected to a roothub. * * Adds a device link between a tunneled USB3 device and the USB4 Host Interface * device to ensure correct runtime PM suspend and resume order. This function * should only be called for tunneled USB3 devices. * The USB4 Host Interface this tunneled device depends on is found from the roothub * port ACPI device specific data _DSD entry. 
* * Return: negative error code on failure, 0 otherwise */ static int usb_acpi_add_usb4_devlink(struct usb_device *udev) { struct device_link *link; struct usb_port *port_dev; struct usb_hub *hub; if (!udev->parent || udev->parent->parent) return 0; hub = usb_hub_to_struct_hub(udev->parent); if (!hub) return 0; port_dev = hub->ports[udev->portnum - 1]; struct fwnode_handle *nhi_fwnode __free(fwnode_handle) = fwnode_find_reference(dev_fwnode(&port_dev->dev), "usb4-host-interface", 0); if (IS_ERR(nhi_fwnode) || !nhi_fwnode->dev) return 0; link = device_link_add(&port_dev->child->dev, nhi_fwnode->dev, DL_FLAG_STATELESS | DL_FLAG_RPM_ACTIVE | DL_FLAG_PM_RUNTIME); if (!link) { dev_err(&port_dev->dev, "Failed to created device link from %s to %s\n", dev_name(&port_dev->child->dev), dev_name(nhi_fwnode->dev)); return -EINVAL; } dev_dbg(&port_dev->dev, "Created device link from %s to %s\n", dev_name(&port_dev->child->dev), dev_name(nhi_fwnode->dev)); udev->usb4_link = link; return 0; } /* * Private to usb-acpi, all the core needs to know is that * port_dev->location is non-zero when it has been set by the firmware. */ #define USB_ACPI_LOCATION_VALID (1 << 31) static void usb_acpi_get_connect_type(struct usb_port *port_dev, acpi_handle *handle) { enum usb_port_connect_type connect_type = USB_PORT_CONNECT_TYPE_UNKNOWN; struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; union acpi_object *upc = NULL; struct acpi_pld_info *pld = NULL; acpi_status status; /* * According to 9.14 in ACPI Spec 6.2. _PLD indicates whether usb port * is user visible and _UPC indicates whether it is connectable. If * the port was visible and connectable, it could be freely connected * and disconnected with USB devices. If no visible and connectable, * a usb device is directly hard-wired to the port. If no visible and * no connectable, the port would be not used. */ if (acpi_get_physical_device_location(handle, &pld) && pld) port_dev->location = USB_ACPI_LOCATION_VALID | pld->group_token << 8 | pld->group_position; status = acpi_evaluate_object(handle, "_UPC", NULL, &buffer); if (ACPI_FAILURE(status)) goto out; upc = buffer.pointer; if (!upc || (upc->type != ACPI_TYPE_PACKAGE) || upc->package.count != 4) goto out; /* UPC states port is connectable */ if (upc->package.elements[0].integer.value) if (!pld) ; /* keep connect_type as unknown */ else if (pld->user_visible) connect_type = USB_PORT_CONNECT_TYPE_HOT_PLUG; else connect_type = USB_PORT_CONNECT_TYPE_HARD_WIRED; else connect_type = USB_PORT_NOT_USED; out: port_dev->connect_type = connect_type; kfree(upc); ACPI_FREE(pld); } static struct acpi_device * usb_acpi_get_companion_for_port(struct usb_port *port_dev) { struct usb_device *udev; struct acpi_device *adev; acpi_handle *parent_handle; int port1; /* Get the struct usb_device point of port's hub */ udev = to_usb_device(port_dev->dev.parent->parent); /* * The root hub ports' parent is the root hub. The non-root-hub * ports' parent is the parent hub port which the hub is * connected to. 
*/ if (!udev->parent) { adev = ACPI_COMPANION(&udev->dev); port1 = usb_hcd_find_raw_port_number(bus_to_hcd(udev->bus), port_dev->portnum); } else { parent_handle = usb_get_hub_port_acpi_handle(udev->parent, udev->portnum); if (!parent_handle) return NULL; adev = acpi_fetch_acpi_dev(parent_handle); port1 = port_dev->portnum; } return acpi_find_child_by_adr(adev, port1); } static struct acpi_device * usb_acpi_find_companion_for_port(struct usb_port *port_dev) { struct acpi_device *adev; adev = usb_acpi_get_companion_for_port(port_dev); if (!adev) return NULL; usb_acpi_get_connect_type(port_dev, adev->handle); return adev; } static struct acpi_device * usb_acpi_find_companion_for_device(struct usb_device *udev) { struct acpi_device *adev; struct usb_port *port_dev; struct usb_hub *hub; if (!udev->parent) { /* * root hub is only child (_ADR=0) under its parent, the HC. * sysdev pointer is the HC as seen from firmware. */ adev = ACPI_COMPANION(udev->bus->sysdev); return acpi_find_child_device(adev, 0, false); } hub = usb_hub_to_struct_hub(udev->parent); if (!hub) return NULL; /* Tunneled USB3 devices depend on USB4 Host Interface, set device link to it */ if (udev->speed >= USB_SPEED_SUPER && udev->tunnel_mode != USB_LINK_NATIVE) usb_acpi_add_usb4_devlink(udev); /* * This is an embedded USB device connected to a port and such * devices share port's ACPI companion. */ port_dev = hub->ports[udev->portnum - 1]; return usb_acpi_get_companion_for_port(port_dev); } static struct acpi_device *usb_acpi_find_companion(struct device *dev) { /* * The USB hierarchy like following: * * Device (EHC1) * Device (HUBN) * Device (PR01) * Device (PR11) * Device (PR12) * Device (FN12) * Device (FN13) * Device (PR13) * ... * where HUBN is root hub, and PRNN are USB ports and devices * connected to them, and FNNN are individualk functions for * connected composite USB devices. PRNN and FNNN may contain * _CRS and other methods describing sideband resources for * the connected device. * * On the kernel side both root hub and embedded USB devices are * represented as instances of usb_device structure, and ports * are represented as usb_port structures, so the whole process * is split into 2 parts: finding companions for devices and * finding companions for ports. * * Note that we do not handle individual functions of composite * devices yet, for that we would need to assign companions to * devices corresponding to USB interfaces. */ if (is_usb_device(dev)) return usb_acpi_find_companion_for_device(to_usb_device(dev)); else if (is_usb_port(dev)) return usb_acpi_find_companion_for_port(to_usb_port(dev)); return NULL; } static bool usb_acpi_bus_match(struct device *dev) { return is_usb_device(dev) || is_usb_port(dev); } static struct acpi_bus_type usb_acpi_bus = { .name = "USB", .match = usb_acpi_bus_match, .find_companion = usb_acpi_find_companion, }; int usb_acpi_register(void) { return register_acpi_bus_type(&usb_acpi_bus); } void usb_acpi_unregister(void) { unregister_acpi_bus_type(&usb_acpi_bus); } |
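usb_acpi_get_connect_type() above reduces two ACPI objects to one port attribute: _PLD says whether the port is user visible and the first _UPC package element says whether it is connectable. The resulting table is: connectable and visible -> hot-plug, connectable but hidden -> hard-wired, not connectable -> not used, and unknown when _PLD is missing. The stand-alone C sketch below captures just that mapping; enum port_connect_type, struct pld_info and classify_port() are simplified stand-ins for the kernel types, not the real API.

#include <stdbool.h>
#include <stdio.h>

enum port_connect_type { PORT_UNKNOWN, PORT_HOT_PLUG, PORT_HARD_WIRED, PORT_NOT_USED };

struct pld_info { bool user_visible; };	/* stand-in for struct acpi_pld_info */

/*
 * Mirror of the _UPC/_PLD decision table: 'connectable' comes from the
 * first _UPC package element, 'pld' is NULL when _PLD was not provided.
 */
static enum port_connect_type classify_port(bool connectable, const struct pld_info *pld)
{
	if (!connectable)
		return PORT_NOT_USED;
	if (!pld)
		return PORT_UNKNOWN;		/* connectable, but visibility unknown */
	return pld->user_visible ? PORT_HOT_PLUG : PORT_HARD_WIRED;
}

int main(void)
{
	struct pld_info visible = { .user_visible = true };
	struct pld_info hidden  = { .user_visible = false };

	printf("%d %d %d %d\n",
	       classify_port(true, &visible),	/* hot-plug   */
	       classify_port(true, &hidden),	/* hard-wired */
	       classify_port(false, &visible),	/* not used   */
	       classify_port(true, NULL));	/* unknown    */
	return 0;
}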
// SPDX-License-Identifier: GPL-2.0 /* * drivers/usb/core/generic.c - generic driver for USB devices (not interfaces) * * (C) Copyright 2005 Greg Kroah-Hartman <gregkh@suse.de> * * based on drivers/usb/usb.c which had the following copyrights: * (C) Copyright Linus Torvalds 1999 * (C) Copyright Johannes Erdfelt 1999-2001 * (C) Copyright Andreas Gal 1999 * (C) Copyright Gregory P. Smith 1999 * (C) Copyright Deti Fliegl 1999 (new USB architecture) * (C) Copyright Randy Dunlap 2000 * (C) Copyright David Brownell 2000-2004 * (C) Copyright Yggdrasil Computing, Inc. 2000 * (usb_device_id matching changes by Adam J. Richter) * (C) Copyright Greg Kroah-Hartman 2002-2003 * * Released under the GPLv2 only. */ #include <linux/usb.h> #include <linux/usb/hcd.h> #include <linux/string_choices.h> #include <uapi/linux/usb/audio.h> #include "usb.h" static int is_rndis(struct usb_interface_descriptor *desc) { return desc->bInterfaceClass == USB_CLASS_COMM && desc->bInterfaceSubClass == 2 && desc->bInterfaceProtocol == 0xff; } static int is_activesync(struct usb_interface_descriptor *desc) { return desc->bInterfaceClass == USB_CLASS_MISC && desc->bInterfaceSubClass == 1 && desc->bInterfaceProtocol == 1; } static bool is_audio(struct usb_interface_descriptor *desc) { return desc->bInterfaceClass == USB_CLASS_AUDIO; } static bool is_uac3_config(struct usb_interface_descriptor *desc) { return desc->bInterfaceProtocol == UAC_VERSION_3; } int usb_choose_configuration(struct usb_device *udev) { int i; int num_configs; int insufficient_power = 0; struct usb_host_config *c, *best; struct usb_device_driver *udriver; /* * If a USB device (not an interface) doesn't have a driver then the * kernel has no business trying to select or install a configuration * for it. */ if (!udev->dev.driver) return -1; udriver = to_usb_device_driver(udev->dev.driver); if (usb_device_is_owned(udev)) return 0; if (udriver->choose_configuration) { i = udriver->choose_configuration(udev); if (i >= 0) return i; } best = NULL; c = udev->config; num_configs = udev->descriptor.bNumConfigurations; for (i = 0; i < num_configs; (i++, c++)) { struct usb_interface_descriptor *desc = NULL; /* It's possible that a config has no interfaces!
*/ if (c->desc.bNumInterfaces > 0) desc = &c->intf_cache[0]->altsetting->desc; /* * HP's USB bus-powered keyboard has only one configuration * and it claims to be self-powered; other devices may have * similar errors in their descriptors. If the next test * were allowed to execute, such configurations would always * be rejected and the devices would not work as expected. * In the meantime, we run the risk of selecting a config * that requires external power at a time when that power * isn't available. It seems to be the lesser of two evils. * * Bugzilla #6448 reports a device that appears to crash * when it receives a GET_DEVICE_STATUS request! We don't * have any other way to tell whether a device is self-powered, * but since we don't use that information anywhere but here, * the call has been removed. * * Maybe the GET_DEVICE_STATUS call and the test below can * be reinstated when device firmwares become more reliable. * Don't hold your breath. */ #if 0 /* Rule out self-powered configs for a bus-powered device */ if (bus_powered && (c->desc.bmAttributes & USB_CONFIG_ATT_SELFPOWER)) continue; #endif /* * The next test may not be as effective as it should be. * Some hubs have errors in their descriptor, claiming * to be self-powered when they are really bus-powered. * We will overestimate the amount of current such hubs * make available for each port. * * This is a fairly benign sort of failure. It won't * cause us to reject configurations that we should have * accepted. */ /* Rule out configs that draw too much bus current */ if (usb_get_max_power(udev, c) > udev->bus_mA) { insufficient_power++; continue; } /* * Select first configuration as default for audio so that * devices that don't comply with UAC3 protocol are supported. * But, still iterate through other configurations and * select UAC3 compliant config if present. */ if (desc && is_audio(desc)) { /* Always prefer the first found UAC3 config */ if (is_uac3_config(desc)) { best = c; break; } /* If there is no UAC3 config, prefer the first config */ else if (i == 0) best = c; /* Unconditional continue, because the rest of the code * in the loop is irrelevant for audio devices, and * because it can reassign best, which for audio devices * we don't want. */ continue; } /* When the first config's first interface is one of Microsoft's * pet nonstandard Ethernet-over-USB protocols, ignore it unless * this kernel has enabled the necessary host side driver. * But: Don't ignore it if it's the only config. */ if (i == 0 && num_configs > 1 && desc && (is_rndis(desc) || is_activesync(desc))) { #if !defined(CONFIG_USB_NET_RNDIS_HOST) && !defined(CONFIG_USB_NET_RNDIS_HOST_MODULE) continue; #else best = c; #endif } /* From the remaining configs, choose the first one whose * first interface is for a non-vendor-specific class. * Reason: Linux is more likely to have a class driver * than a vendor-specific driver. */ else if (udev->descriptor.bDeviceClass != USB_CLASS_VENDOR_SPEC && (desc && desc->bInterfaceClass != USB_CLASS_VENDOR_SPEC)) { best = c; break; } /* If all the remaining configs are vendor-specific, * choose the first one. 
*/ else if (!best) best = c; } if (insufficient_power > 0) dev_info(&udev->dev, "rejected %d configuration%s " "due to insufficient available bus power\n", insufficient_power, str_plural(insufficient_power)); if (best) { i = best->desc.bConfigurationValue; dev_dbg(&udev->dev, "configuration #%d chosen from %d choice%s\n", i, num_configs, str_plural(num_configs)); } else { i = -1; dev_warn(&udev->dev, "no configuration chosen from %d choice%s\n", num_configs, str_plural(num_configs)); } return i; } EXPORT_SYMBOL_GPL(usb_choose_configuration); static int __check_for_non_generic_match(struct device_driver *drv, void *data) { struct usb_device *udev = data; struct usb_device_driver *udrv; if (!is_usb_device_driver(drv)) return 0; udrv = to_usb_device_driver(drv); if (udrv == &usb_generic_driver) return 0; return usb_driver_applicable(udev, udrv); } static bool usb_generic_driver_match(struct usb_device *udev) { if (udev->use_generic_driver) return true; /* * If any other driver wants the device, leave the device to this other * driver. */ if (bus_for_each_drv(&usb_bus_type, NULL, udev, __check_for_non_generic_match)) return false; return true; } int usb_generic_driver_probe(struct usb_device *udev) { int err, c; /* Choose and set the configuration. This registers the interfaces * with the driver core and lets interface drivers bind to them. */ if (udev->authorized == 0) dev_info(&udev->dev, "Device is not authorized for usage\n"); else { c = usb_choose_configuration(udev); if (c >= 0) { err = usb_set_configuration(udev, c); if (err && err != -ENODEV) { dev_err(&udev->dev, "can't set config #%d, error %d\n", c, err); /* This need not be fatal. The user can try to * set other configurations. */ } } } /* USB device state == configured ... usable */ usb_notify_add_device(udev); return 0; } void usb_generic_driver_disconnect(struct usb_device *udev) { usb_notify_remove_device(udev); /* if this is only an unbind, not a physical disconnect, then * unconfigure the device */ if (udev->actconfig) usb_set_configuration(udev, -1); } #ifdef CONFIG_PM int usb_generic_driver_suspend(struct usb_device *udev, pm_message_t msg) { int rc; /* Normal USB devices suspend through their upstream port. * Root hubs don't have upstream ports to suspend, * so we have to shut down their downstream HC-to-USB * interfaces manually by doing a bus (or "global") suspend. */ if (!udev->parent) rc = hcd_bus_suspend(udev, msg); /* * Non-root USB2 devices don't need to do anything for FREEZE * or PRETHAW. USB3 devices don't support global suspend and * needs to be selectively suspended. */ else if ((msg.event == PM_EVENT_FREEZE || msg.event == PM_EVENT_PRETHAW) && (udev->speed < USB_SPEED_SUPER)) rc = 0; else rc = usb_port_suspend(udev, msg); if (rc == 0) usbfs_notify_suspend(udev); return rc; } int usb_generic_driver_resume(struct usb_device *udev, pm_message_t msg) { int rc; /* Normal USB devices resume/reset through their upstream port. * Root hubs don't have upstream ports to resume or reset, * so we have to start up their downstream HC-to-USB * interfaces manually by doing a bus (or "global") resume. 
*/ if (!udev->parent) rc = hcd_bus_resume(udev, msg); else rc = usb_port_resume(udev, msg); if (rc == 0) usbfs_notify_resume(udev); return rc; } #endif /* CONFIG_PM */ struct usb_device_driver usb_generic_driver = { .name = "usb", .match = usb_generic_driver_match, .probe = usb_generic_driver_probe, .disconnect = usb_generic_driver_disconnect, #ifdef CONFIG_PM .suspend = usb_generic_driver_suspend, .resume = usb_generic_driver_resume, #endif .supports_autosuspend = 1, }; |
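usb_choose_configuration() above scans every configuration once with a fixed priority order: drop configurations whose bMaxPower exceeds the available bus current, prefer a UAC3 configuration for audio-class devices, skip an RNDIS/ActiveSync first configuration when no host-side driver is built in (unless it is the only one), pick the first configuration whose first interface is a non-vendor-specific class, and otherwise fall back to the first acceptable configuration. The sketch below models that ordering in stand-alone C; struct cfg, choose_config() and the boolean flags are invented simplifications (the audio/UAC3 branch is omitted), not the kernel data structures.

#include <stdbool.h>
#include <stdio.h>

struct cfg {
	int  value;		/* bConfigurationValue                    */
	int  max_power_ma;	/* current drawn by this configuration    */
	bool vendor_specific;	/* first interface is vendor specific     */
	bool rndis_like;	/* RNDIS/ActiveSync style first interface */
};

/* Simplified priority order modelled on usb_choose_configuration(). */
static int choose_config(const struct cfg *c, int n, int bus_ma, bool have_rndis_driver)
{
	const struct cfg *best = NULL;
	int i;

	for (i = 0; i < n; i++) {
		if (c[i].max_power_ma > bus_ma)
			continue;			/* insufficient bus power  */
		if (i == 0 && n > 1 && c[i].rndis_like && !have_rndis_driver)
			continue;			/* no host-side driver     */
		if (!c[i].vendor_specific) {
			best = &c[i];			/* class driver likely     */
			break;
		}
		if (!best)
			best = &c[i];			/* fallback: first usable  */
	}
	return best ? best->value : -1;
}

int main(void)
{
	struct cfg configs[] = {
		{ .value = 1, .max_power_ma = 100, .rndis_like = true },
		{ .value = 2, .max_power_ma = 100, .vendor_specific = true },
		{ .value = 3, .max_power_ma = 500 },
	};

	/* Config 1 is skipped (RNDIS, no driver), config 3 draws too much at a
	 * 100 mA budget, so the vendor-specific config 2 is chosen as fallback. */
	printf("chosen: %d\n", choose_config(configs, 3, 100, false));
	return 0;
}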
/* SPDX-License-Identifier: GPL-2.0 */ /* * Block data types and constants. Directly include this file only to * break include dependency loop. */ #ifndef __LINUX_BLK_TYPES_H #define __LINUX_BLK_TYPES_H #include <linux/types.h> #include <linux/bvec.h> #include <linux/device.h> #include <linux/ktime.h> #include <linux/rw_hint.h> struct bio_set; struct bio; struct bio_integrity_payload; struct page; struct io_context; struct cgroup_subsys_state; typedef void (bio_end_io_t) (struct bio *); struct bio_crypt_ctx; /* * The basic unit of block I/O is a sector. It is used in a number of contexts * in Linux (blk, bio, genhd). The size of one sector is 512 = 2**9 * bytes. Variables of type sector_t represent an offset or size that is a * multiple of 512 bytes. Hence these two constants.
*/ #ifndef SECTOR_SHIFT #define SECTOR_SHIFT 9 #endif #ifndef SECTOR_SIZE #define SECTOR_SIZE (1 << SECTOR_SHIFT) #endif #define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) #define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) #define SECTOR_MASK (PAGE_SECTORS - 1) struct block_device { sector_t bd_start_sect; sector_t bd_nr_sectors; struct gendisk * bd_disk; struct request_queue * bd_queue; struct disk_stats __percpu *bd_stats; unsigned long bd_stamp; atomic_t __bd_flags; // partition number + flags #define BD_PARTNO 255 // lower 8 bits; assign-once #define BD_READ_ONLY (1u<<8) // read-only policy #define BD_WRITE_HOLDER (1u<<9) #define BD_HAS_SUBMIT_BIO (1u<<10) #define BD_RO_WARNED (1u<<11) #ifdef CONFIG_FAIL_MAKE_REQUEST #define BD_MAKE_IT_FAIL (1u<<12) #endif dev_t bd_dev; struct address_space *bd_mapping; /* page cache */ atomic_t bd_openers; spinlock_t bd_size_lock; /* for bd_inode->i_size updates */ void * bd_claiming; void * bd_holder; const struct blk_holder_ops *bd_holder_ops; struct mutex bd_holder_lock; int bd_holders; struct kobject *bd_holder_dir; atomic_t bd_fsfreeze_count; /* number of freeze requests */ struct mutex bd_fsfreeze_mutex; /* serialize freeze/thaw */ struct partition_meta_info *bd_meta_info; int bd_writers; #ifdef CONFIG_SECURITY void *bd_security; #endif /* * keep this out-of-line as it's both big and not needed in the fast * path */ struct device bd_device; } __randomize_layout; #define bdev_whole(_bdev) \ ((_bdev)->bd_disk->part0) #define dev_to_bdev(device) \ container_of((device), struct block_device, bd_device) #define bdev_kobj(_bdev) \ (&((_bdev)->bd_device.kobj)) /* * Block error status values. See block/blk-core:blk_errors for the details. */ typedef u8 __bitwise blk_status_t; typedef u16 blk_short_t; #define BLK_STS_OK 0 #define BLK_STS_NOTSUPP ((__force blk_status_t)1) #define BLK_STS_TIMEOUT ((__force blk_status_t)2) #define BLK_STS_NOSPC ((__force blk_status_t)3) #define BLK_STS_TRANSPORT ((__force blk_status_t)4) #define BLK_STS_TARGET ((__force blk_status_t)5) #define BLK_STS_RESV_CONFLICT ((__force blk_status_t)6) #define BLK_STS_MEDIUM ((__force blk_status_t)7) #define BLK_STS_PROTECTION ((__force blk_status_t)8) #define BLK_STS_RESOURCE ((__force blk_status_t)9) #define BLK_STS_IOERR ((__force blk_status_t)10) /* hack for device mapper, don't use elsewhere: */ #define BLK_STS_DM_REQUEUE ((__force blk_status_t)11) /* * BLK_STS_AGAIN should only be returned if RQF_NOWAIT is set * and the bio would block (cf bio_wouldblock_error()) */ #define BLK_STS_AGAIN ((__force blk_status_t)12) /* * BLK_STS_DEV_RESOURCE is returned from the driver to the block layer if * device related resources are unavailable, but the driver can guarantee * that the queue will be rerun in the future once resources become * available again. This is typically the case for device specific * resources that are consumed for IO. If the driver fails allocating these * resources, we know that inflight (or pending) IO will free these * resource upon completion. * * This is different from BLK_STS_RESOURCE in that it explicitly references * a device specific resource. For resources of wider scope, allocation * failure can happen without having pending IO. This means that we can't * rely on request completions freeing these resources, as IO may not be in * flight. Examples of that are kernel memory allocations, DMA mappings, or * any other system wide resources. 
*/ #define BLK_STS_DEV_RESOURCE ((__force blk_status_t)13) /* * BLK_STS_ZONE_OPEN_RESOURCE is returned from the driver in the completion * path if the device returns a status indicating that too many zone resources * are currently open. The same command should be successful if resubmitted * after the number of open zones decreases below the device's limits, which is * reported in the request_queue's max_open_zones. */ #define BLK_STS_ZONE_OPEN_RESOURCE ((__force blk_status_t)14) /* * BLK_STS_ZONE_ACTIVE_RESOURCE is returned from the driver in the completion * path if the device returns a status indicating that too many zone resources * are currently active. The same command should be successful if resubmitted * after the number of active zones decreases below the device's limits, which * is reported in the request_queue's max_active_zones. */ #define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)15) /* * BLK_STS_OFFLINE is returned from the driver when the target device is offline * or is being taken offline. This could help differentiate the case where a * device is intentionally being shut down from a real I/O error. */ #define BLK_STS_OFFLINE ((__force blk_status_t)16) /* * BLK_STS_DURATION_LIMIT is returned from the driver when the target device * aborted the command because it exceeded one of its Command Duration Limits. */ #define BLK_STS_DURATION_LIMIT ((__force blk_status_t)17) /* * Invalid size or alignment. */ #define BLK_STS_INVAL ((__force blk_status_t)19) /** * blk_path_error - returns true if error may be path related * @error: status the request was completed with * * Description: * This classifies block error status into non-retryable errors and ones * that may be successful if retried on a failover path. * * Return: * %false - retrying failover path will not help * %true - may succeed if retried */ static inline bool blk_path_error(blk_status_t error) { switch (error) { case BLK_STS_NOTSUPP: case BLK_STS_NOSPC: case BLK_STS_TARGET: case BLK_STS_RESV_CONFLICT: case BLK_STS_MEDIUM: case BLK_STS_PROTECTION: return false; } /* Anything else could be a path failure, so should be retried */ return true; } typedef __u32 __bitwise blk_opf_t; typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U /* * main unit of I/O for the block layer and lower layers (ie drivers and * stacking drivers) */ struct bio { struct bio *bi_next; /* request queue link */ struct block_device *bi_bdev; blk_opf_t bi_opf; /* bottom bits REQ_OP, top bits * req_flags. */ unsigned short bi_flags; /* BIO_* below */ unsigned short bi_ioprio; enum rw_hint bi_write_hint; u8 bi_write_stream; blk_status_t bi_status; /* * The bvec gap bit indicates the lowest set bit in any address offset * between all bi_io_vecs. This field is initialized only after the bio * is split to the hardware limits (see bio_split_io_at()). The value * may be used to consider DMA optimization when performing that * mapping. The value is compared to a power of two mask where the * result depends on any bit set within the mask, so saving the lowest * bit is sufficient to know if any segment gap collides with the mask. */ u8 bi_bvec_gap_bit; atomic_t __bi_remaining; /* The actual vec list, preserved by bio_reset() */ struct bio_vec *bi_io_vec; struct bvec_iter bi_iter; union { /* for polled bios: */ blk_qc_t bi_cookie; /* for plugged zoned writes only: */ unsigned int __bi_nr_segments; }; bio_end_io_t *bi_end_io; void *bi_private; #ifdef CONFIG_BLK_CGROUP /* * Represents the association of the css and request_queue for the bio. 
* If a bio goes direct to device, it will not have a blkg as it will * not have a request_queue associated with it. The reference is put * on release of the bio. */ struct blkcg_gq *bi_blkg; /* Time that this bio was issued. */ u64 issue_time_ns; #ifdef CONFIG_BLK_CGROUP_IOCOST u64 bi_iocost_cost; #endif #endif #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct bio_crypt_ctx *bi_crypt_context; #endif #if defined(CONFIG_BLK_DEV_INTEGRITY) struct bio_integrity_payload *bi_integrity; /* data integrity */ #endif unsigned short bi_vcnt; /* how many bio_vec's */ /* * Everything starting with bi_max_vecs will be preserved by bio_reset() */ /* * Number of elements in `bi_io_vec` that were allocated for this bio. * Only used by the bio submitter to make `bio_add_page` fail once full * and to free the `bi_io_vec` allocation. Must not be used in drivers * and does not hold a useful value for cloned bios. */ unsigned short bi_max_vecs; atomic_t __bi_cnt; /* pin count */ struct bio_set *bi_pool; }; #define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs) #define BIO_MAX_SIZE UINT_MAX /* max value of bi_iter.bi_size */ #define BIO_MAX_SECTORS (BIO_MAX_SIZE >> SECTOR_SHIFT) static inline struct bio_vec *bio_inline_vecs(struct bio *bio) { return (struct bio_vec *)(bio + 1); } /* * bio flags */ enum { BIO_PAGE_PINNED, /* Unpin pages in bio_release_pages() */ BIO_CLONED, /* doesn't own data */ BIO_QUIET, /* Make BIO Quiet */ BIO_CHAIN, /* chained bio, ->bi_remaining in effect */ BIO_REFFED, /* bio has elevated ->bi_cnt */ BIO_BPS_THROTTLED, /* This bio has already been subjected to * throttling rules. Don't do it again. */ BIO_TRACE_COMPLETION, /* bio_endio() should trace the final completion * of this bio. */ BIO_CGROUP_ACCT, /* has been accounted to a cgroup */ BIO_QOS_THROTTLED, /* bio went through rq_qos throttle path */ /* * This bio has completed bps throttling at the single tg granularity, * which is different from BIO_BPS_THROTTLED. When the bio is enqueued * into the sq->queued of the upper tg, or is about to be dispatched, * this flag needs to be cleared. Since blk-throttle and rq_qos are not * on the same hierarchical level, reuse the value. */ BIO_TG_BPS_THROTTLED = BIO_QOS_THROTTLED, BIO_QOS_MERGED, /* but went through rq_qos merge path */ BIO_REMAPPED, BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */ BIO_EMULATES_ZONE_APPEND, /* bio emulates a zone append operation */ BIO_FLAG_LAST }; typedef __u32 __bitwise blk_mq_req_flags_t; #define REQ_OP_BITS 8 #define REQ_OP_MASK (__force blk_opf_t)((1 << REQ_OP_BITS) - 1) #define REQ_FLAG_BITS 24 /** * enum req_op - Operations common to the bio and request structures. * We use 8 bits for encoding the operation, and the remaining 24 for flags. * * The least significant bit of the operation number indicates the data * transfer direction: * * - if the least significant bit is set transfers are TO the device * - if the least significant bit is not set transfers are FROM the device * * If a operation does not transfer data the least significant bit has no * meaning. 
*/ enum req_op { /** @REQ_OP_READ: read sectors from the device */ REQ_OP_READ = (__force blk_opf_t)0, /** @REQ_OP_WRITE: write sectors to the device */ REQ_OP_WRITE = (__force blk_opf_t)1, /** @REQ_OP_FLUSH: flush the volatile write cache */ REQ_OP_FLUSH = (__force blk_opf_t)2, /** @REQ_OP_DISCARD: discard sectors */ REQ_OP_DISCARD = (__force blk_opf_t)3, /** @REQ_OP_SECURE_ERASE: securely erase sectors */ REQ_OP_SECURE_ERASE = (__force blk_opf_t)5, /** @REQ_OP_ZONE_APPEND: write data at the current zone write pointer */ REQ_OP_ZONE_APPEND = (__force blk_opf_t)7, /** @REQ_OP_WRITE_ZEROES: write the zero filled sector many times */ REQ_OP_WRITE_ZEROES = (__force blk_opf_t)9, /** @REQ_OP_ZONE_OPEN: Open a zone */ REQ_OP_ZONE_OPEN = (__force blk_opf_t)11, /** @REQ_OP_ZONE_CLOSE: Close a zone */ REQ_OP_ZONE_CLOSE = (__force blk_opf_t)13, /** @REQ_OP_ZONE_FINISH: Transition a zone to full */ REQ_OP_ZONE_FINISH = (__force blk_opf_t)15, /** @REQ_OP_ZONE_RESET: reset a zone write pointer */ REQ_OP_ZONE_RESET = (__force blk_opf_t)17, /** @REQ_OP_ZONE_RESET_ALL: reset all the zone present on the device */ REQ_OP_ZONE_RESET_ALL = (__force blk_opf_t)19, /* Driver private requests */ /* private: */ REQ_OP_DRV_IN = (__force blk_opf_t)34, REQ_OP_DRV_OUT = (__force blk_opf_t)35, REQ_OP_LAST = (__force blk_opf_t)36, }; /* Keep cmd_flag_name[] in sync with the definitions below */ enum req_flag_bits { __REQ_FAILFAST_DEV = /* no driver retries of device errors */ REQ_OP_BITS, __REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */ __REQ_FAILFAST_DRIVER, /* no driver retries of driver errors */ __REQ_SYNC, /* request is sync (sync write or read) */ __REQ_META, /* metadata io request */ __REQ_PRIO, /* boost priority in cfq */ __REQ_NOMERGE, /* don't touch this for merging */ __REQ_IDLE, /* anticipate more IO after this one */ __REQ_INTEGRITY, /* I/O includes block integrity payload */ __REQ_FUA, /* forced unit access */ __REQ_PREFLUSH, /* request for cache flush */ __REQ_RAHEAD, /* read ahead, can fail anytime */ __REQ_BACKGROUND, /* background IO */ __REQ_NOWAIT, /* Don't wait if request will block */ __REQ_POLLED, /* caller polls for completion using bio_poll */ __REQ_ALLOC_CACHE, /* allocate IO from cache if available */ __REQ_SWAP, /* swap I/O */ __REQ_DRV, /* for driver use */ __REQ_FS_PRIVATE, /* for file system (submitter) use */ __REQ_ATOMIC, /* for atomic write operations */ /* * Command specific flags, keep last: */ /* for REQ_OP_WRITE_ZEROES: */ __REQ_NOUNMAP, /* do not free blocks when zeroing */ __REQ_NR_BITS, /* stops here */ }; #define REQ_FAILFAST_DEV \ (__force blk_opf_t)(1ULL << __REQ_FAILFAST_DEV) #define REQ_FAILFAST_TRANSPORT \ (__force blk_opf_t)(1ULL << __REQ_FAILFAST_TRANSPORT) #define REQ_FAILFAST_DRIVER \ (__force blk_opf_t)(1ULL << __REQ_FAILFAST_DRIVER) #define REQ_SYNC (__force blk_opf_t)(1ULL << __REQ_SYNC) #define REQ_META (__force blk_opf_t)(1ULL << __REQ_META) #define REQ_PRIO (__force blk_opf_t)(1ULL << __REQ_PRIO) #define REQ_NOMERGE (__force blk_opf_t)(1ULL << __REQ_NOMERGE) #define REQ_IDLE (__force blk_opf_t)(1ULL << __REQ_IDLE) #define REQ_INTEGRITY (__force blk_opf_t)(1ULL << __REQ_INTEGRITY) #define REQ_FUA (__force blk_opf_t)(1ULL << __REQ_FUA) #define REQ_PREFLUSH (__force blk_opf_t)(1ULL << __REQ_PREFLUSH) #define REQ_RAHEAD (__force blk_opf_t)(1ULL << __REQ_RAHEAD) #define REQ_BACKGROUND (__force blk_opf_t)(1ULL << __REQ_BACKGROUND) #define REQ_NOWAIT (__force blk_opf_t)(1ULL << __REQ_NOWAIT) #define REQ_POLLED (__force blk_opf_t)(1ULL << 
__REQ_POLLED) #define REQ_ALLOC_CACHE (__force blk_opf_t)(1ULL << __REQ_ALLOC_CACHE) #define REQ_SWAP (__force blk_opf_t)(1ULL << __REQ_SWAP) #define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) #define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC) #define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP) #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) #define REQ_NOMERGE_FLAGS \ (REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA) enum stat_group { STAT_READ, STAT_WRITE, STAT_DISCARD, STAT_FLUSH, NR_STAT_GROUPS }; static inline enum req_op bio_op(const struct bio *bio) { return bio->bi_opf & REQ_OP_MASK; } static inline bool op_is_write(blk_opf_t op) { return !!(op & (__force blk_opf_t)1); } /* * Check if the bio or request is one that needs special treatment in the * flush state machine. */ static inline bool op_is_flush(blk_opf_t op) { return op & (REQ_FUA | REQ_PREFLUSH); } /* * Reads are always treated as synchronous, as are requests with the FUA or * PREFLUSH flag. Other operations may be marked as synchronous using the * REQ_SYNC flag. */ static inline bool op_is_sync(blk_opf_t op) { return (op & REQ_OP_MASK) == REQ_OP_READ || (op & (REQ_SYNC | REQ_FUA | REQ_PREFLUSH)); } static inline bool op_is_discard(blk_opf_t op) { return (op & REQ_OP_MASK) == REQ_OP_DISCARD; } /* * Check if a bio or request operation is a zone management operation. */ static inline bool op_is_zone_mgmt(enum req_op op) { switch (op & REQ_OP_MASK) { case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_RESET_ALL: case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: case REQ_OP_ZONE_FINISH: return true; default: return false; } } static inline int op_stat_group(enum req_op op) { if (op_is_discard(op)) return STAT_DISCARD; return op_is_write(op); } struct blk_rq_stat { u64 mean; u64 min; u64 max; u32 nr_samples; u64 batch; }; #endif /* __LINUX_BLK_TYPES_H */ |
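blk_opf_t packs the operation and its modifiers into one word: the low REQ_OP_BITS (8) bits hold the opcode from enum req_op, the least significant bit of the opcode encodes the transfer direction, and the upper 24 bits carry the __REQ_* flags, which is exactly what bio_op(), op_is_write(), op_is_flush() and op_is_sync() mask out. The stand-alone sketch below mirrors that layout with a plain unsigned int; the OP_*/F_* constants are illustrative (the real flag bit positions differ) and only the masking logic matches the header.

#include <stdbool.h>
#include <stdio.h>

#define OP_BITS   8u
#define OP_MASK   ((1u << OP_BITS) - 1)		/* low 8 bits: opcode */

/* A few opcodes modelled on enum req_op; odd numbers transfer data to the device. */
#define OP_READ   0u
#define OP_WRITE  1u
#define OP_FLUSH  2u

/* Illustrative modifier flags placed just above the opcode field. */
#define F_SYNC      (1u << (OP_BITS + 0))
#define F_FUA       (1u << (OP_BITS + 1))
#define F_PREFLUSH  (1u << (OP_BITS + 2))

static unsigned int op_of(unsigned int opf)	{ return opf & OP_MASK; }
static bool is_write(unsigned int opf)		{ return opf & 1u; }
static bool is_flush(unsigned int opf)		{ return opf & (F_FUA | F_PREFLUSH); }
static bool is_sync(unsigned int opf)
{
	/* reads are implicitly sync; otherwise a sync/fua/preflush flag is needed */
	return op_of(opf) == OP_READ || (opf & (F_SYNC | F_FUA | F_PREFLUSH));
}

int main(void)
{
	unsigned int opf = OP_WRITE | F_FUA;	/* write with forced unit access */

	printf("op=%u write=%d flush=%d sync=%d\n",
	       op_of(opf), is_write(opf), is_flush(opf), is_sync(opf));
	return 0;
}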
// SPDX-License-Identifier: GPL-2.0-only /* * Packet matching code. * * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org> * Copyright (C) 2006-2010 Patrick McHardy <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/cache.h> #include <linux/capability.h> #include <linux/skbuff.h> #include <linux/kmod.h> #include <linux/vmalloc.h> #include <linux/netdevice.h> #include <linux/module.h> #include <net/ip.h> #include <net/compat.h> #include <linux/uaccess.h> #include <linux/mutex.h> #include <linux/proc_fs.h> #include <linux/err.h> #include <linux/cpumask.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <net/netfilter/nf_log.h> #include "../../netfilter/xt_repldata.h" MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("IPv4 packet filter"); void *ipt_alloc_initial_table(const struct xt_table *info) { return xt_alloc_initial_table(ipt, IPT); } EXPORT_SYMBOL_GPL(ipt_alloc_initial_table); /* Returns whether the packet matches the rule or not.
*/ /* Performance critical - called for every packet */ static inline bool ip_packet_match(const struct iphdr *ip, const char *indev, const char *outdev, const struct ipt_ip *ipinfo, int isfrag) { unsigned long ret; if (NF_INVF(ipinfo, IPT_INV_SRCIP, (ip->saddr & ipinfo->smsk.s_addr) != ipinfo->src.s_addr) || NF_INVF(ipinfo, IPT_INV_DSTIP, (ip->daddr & ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr)) return false; ret = ifname_compare_aligned(indev, ipinfo->iniface, ipinfo->iniface_mask); if (NF_INVF(ipinfo, IPT_INV_VIA_IN, ret != 0)) return false; ret = ifname_compare_aligned(outdev, ipinfo->outiface, ipinfo->outiface_mask); if (NF_INVF(ipinfo, IPT_INV_VIA_OUT, ret != 0)) return false; /* Check specific protocol */ if (ipinfo->proto && NF_INVF(ipinfo, IPT_INV_PROTO, ip->protocol != ipinfo->proto)) return false; /* If we have a fragment rule but the packet is not a fragment * then we return zero */ if (NF_INVF(ipinfo, IPT_INV_FRAG, (ipinfo->flags & IPT_F_FRAG) && !isfrag)) return false; return true; } static bool ip_checkentry(const struct ipt_ip *ip) { if (ip->flags & ~IPT_F_MASK) return false; if (ip->invflags & ~IPT_INV_MASK) return false; return true; } static unsigned int ipt_error(struct sk_buff *skb, const struct xt_action_param *par) { net_info_ratelimited("error: `%s'\n", (const char *)par->targinfo); return NF_DROP; } /* Performance critical */ static inline struct ipt_entry * get_entry(const void *base, unsigned int offset) { return (struct ipt_entry *)(base + offset); } /* All zeroes == unconditional rule. */ /* Mildly perf critical (only if packet tracing is on) */ static inline bool unconditional(const struct ipt_entry *e) { static const struct ipt_ip uncond; return e->target_offset == sizeof(struct ipt_entry) && memcmp(&e->ip, &uncond, sizeof(uncond)) == 0; } /* for const-correctness */ static inline const struct xt_entry_target * ipt_get_target_c(const struct ipt_entry *e) { return ipt_get_target((struct ipt_entry *)e); } #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) static const char *const hooknames[] = { [NF_INET_PRE_ROUTING] = "PREROUTING", [NF_INET_LOCAL_IN] = "INPUT", [NF_INET_FORWARD] = "FORWARD", [NF_INET_LOCAL_OUT] = "OUTPUT", [NF_INET_POST_ROUTING] = "POSTROUTING", }; enum nf_ip_trace_comments { NF_IP_TRACE_COMMENT_RULE, NF_IP_TRACE_COMMENT_RETURN, NF_IP_TRACE_COMMENT_POLICY, }; static const char *const comments[] = { [NF_IP_TRACE_COMMENT_RULE] = "rule", [NF_IP_TRACE_COMMENT_RETURN] = "return", [NF_IP_TRACE_COMMENT_POLICY] = "policy", }; static const struct nf_loginfo trace_loginfo = { .type = NF_LOG_TYPE_LOG, .u = { .log = { .level = 4, .logflags = NF_LOG_DEFAULT_MASK, }, }, }; /* Mildly perf critical (only if packet tracing is on) */ static inline int get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e, const char *hookname, const char **chainname, const char **comment, unsigned int *rulenum) { const struct xt_standard_target *t = (void *)ipt_get_target_c(s); if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) { /* Head of user chain: ERROR target with chainname */ *chainname = t->target.data; (*rulenum) = 0; } else if (s == e) { (*rulenum)++; if (unconditional(s) && strcmp(t->target.u.kernel.target->name, XT_STANDARD_TARGET) == 0 && t->verdict < 0) { /* Tail of chains: STANDARD target (return/policy) */ *comment = *chainname == hookname ? 
comments[NF_IP_TRACE_COMMENT_POLICY] : comments[NF_IP_TRACE_COMMENT_RETURN]; } return 1; } else (*rulenum)++; return 0; } static void trace_packet(struct net *net, const struct sk_buff *skb, unsigned int hook, const struct net_device *in, const struct net_device *out, const char *tablename, const struct xt_table_info *private, const struct ipt_entry *e) { const struct ipt_entry *root; const char *hookname, *chainname, *comment; const struct ipt_entry *iter; unsigned int rulenum = 0; root = get_entry(private->entries, private->hook_entry[hook]); hookname = chainname = hooknames[hook]; comment = comments[NF_IP_TRACE_COMMENT_RULE]; xt_entry_foreach(iter, root, private->size - private->hook_entry[hook]) if (get_chainname_rulenum(iter, e, hookname, &chainname, &comment, &rulenum) != 0) break; nf_log_trace(net, AF_INET, hook, skb, in, out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", tablename, chainname, comment, rulenum); } #endif static inline struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry) { return (void *)entry + entry->next_offset; } /* Returns one of the generic firewall policies, like NF_ACCEPT. */ unsigned int ipt_do_table(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { const struct xt_table *table = priv; unsigned int hook = state->hook; static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); const struct iphdr *ip; /* Initializing verdict to NF_DROP keeps gcc happy. */ unsigned int verdict = NF_DROP; const char *indev, *outdev; const void *table_base; struct ipt_entry *e, **jumpstack; unsigned int stackidx, cpu; const struct xt_table_info *private; struct xt_action_param acpar; unsigned int addend; /* Initialization */ stackidx = 0; ip = ip_hdr(skb); indev = state->in ? state->in->name : nulldevname; outdev = state->out ? state->out->name : nulldevname; /* We handle fragments by dealing with the first fragment as * if it was a normal packet. All other fragments are treated * normally, except that they will NEVER match rules that ask * things we don't know, ie. tcp syn flag or ports). If the * rule is also a fragment-specific rule, non-fragments won't * match it. */ acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET; acpar.thoff = ip_hdrlen(skb); acpar.hotdrop = false; acpar.state = state; WARN_ON(!(table->valid_hooks & (1 << hook))); local_bh_disable(); addend = xt_write_recseq_begin(); private = READ_ONCE(table->private); /* Address dependency. */ cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; /* Switch to alternate jumpstack if we're being invoked via TEE. * TEE issues XT_CONTINUE verdict on original skb so we must not * clobber the jumpstack. * * For recursion via REJECT or SYNPROXY the stack will be clobbered * but it is no problem since absolute verdict is issued by these. 
*/ if (static_key_false(&xt_tee_enabled)) jumpstack += private->stacksize * current->in_nf_duplicate; e = get_entry(table_base, private->hook_entry[hook]); do { const struct xt_entry_target *t; const struct xt_entry_match *ematch; struct xt_counters *counter; WARN_ON(!e); if (!ip_packet_match(ip, indev, outdev, &e->ip, acpar.fragoff)) { no_match: e = ipt_next_entry(e); continue; } xt_ematch_foreach(ematch, e) { acpar.match = ematch->u.kernel.match; acpar.matchinfo = ematch->data; if (!acpar.match->match(skb, &acpar)) goto no_match; } counter = xt_get_this_cpu_counter(&e->counters); ADD_COUNTER(*counter, skb->len, 1); t = ipt_get_target_c(e); WARN_ON(!t->u.kernel.target); #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) /* The packet is traced: log it */ if (unlikely(skb->nf_trace)) trace_packet(state->net, skb, hook, state->in, state->out, table->name, private, e); #endif /* Standard target? */ if (!t->u.kernel.target->target) { int v; v = ((struct xt_standard_target *)t)->verdict; if (v < 0) { /* Pop from stack? */ if (v != XT_RETURN) { verdict = (unsigned int)(-v) - 1; break; } if (stackidx == 0) { e = get_entry(table_base, private->underflow[hook]); } else { e = jumpstack[--stackidx]; e = ipt_next_entry(e); } continue; } if (table_base + v != ipt_next_entry(e) && !(e->ip.flags & IPT_F_GOTO)) { if (unlikely(stackidx >= private->stacksize)) { verdict = NF_DROP; break; } jumpstack[stackidx++] = e; } e = get_entry(table_base, v); continue; } acpar.target = t->u.kernel.target; acpar.targinfo = t->data; verdict = t->u.kernel.target->target(skb, &acpar); if (verdict == XT_CONTINUE) { /* Target might have changed stuff. */ ip = ip_hdr(skb); e = ipt_next_entry(e); } else { /* Verdict */ break; } } while (!acpar.hotdrop); xt_write_recseq_end(addend); local_bh_enable(); if (acpar.hotdrop) return NF_DROP; else return verdict; } /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. */ static int mark_source_chains(const struct xt_table_info *newinfo, unsigned int valid_hooks, void *entry0, unsigned int *offsets) { unsigned int hook; /* No recursion; use packet counter to save back ptrs (reset to 0 as we leave), and comefrom to save source hook bitmask */ for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct ipt_entry *e = entry0 + pos; if (!(valid_hooks & (1 << hook))) continue; /* Set initial back pointer. */ e->counters.pcnt = pos; for (;;) { const struct xt_standard_target *t = (void *)ipt_get_target_c(e); int visited = e->comefrom & (1 << hook); if (e->comefrom & (1 << NF_INET_NUMHOOKS)) return 0; e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. */ if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && t->verdict < 0) || visited) { unsigned int oldpos, size; /* Return: backtrack through the last big jump. */ do { e->comefrom ^= (1<<NF_INET_NUMHOOKS); oldpos = pos; pos = e->counters.pcnt; e->counters.pcnt = 0; /* We're at the start. */ if (pos == oldpos) goto next; e = entry0 + pos; } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; e = entry0 + pos + size; if (pos + size >= newinfo->size) return 0; e->counters.pcnt = pos; pos += size; } else { int newpos = t->verdict; if (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0 && newpos >= 0) { /* This a jump; chase it. */ if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; } else { /* ... 
this is a fallthru */ newpos = pos + e->next_offset; if (newpos >= newinfo->size) return 0; } e = entry0 + newpos; e->counters.pcnt = pos; pos = newpos; } } next: ; } return 1; } static void cleanup_match(struct xt_entry_match *m, struct net *net) { struct xt_mtdtor_param par; par.net = net; par.match = m->u.kernel.match; par.matchinfo = m->data; par.family = NFPROTO_IPV4; if (par.match->destroy != NULL) par.match->destroy(&par); module_put(par.match->me); } static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { const struct ipt_ip *ip = par->entryinfo; par->match = m->u.kernel.match; par->matchinfo = m->data; return xt_check_match(par, m->u.match_size - sizeof(*m), ip->proto, ip->invflags & IPT_INV_PROTO); } static int find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { struct xt_match *match; int ret; match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name, m->u.user.revision); if (IS_ERR(match)) return PTR_ERR(match); m->u.kernel.match = match; ret = check_match(m, par); if (ret) goto err; return 0; err: module_put(m->u.kernel.match->me); return ret; } static int check_target(struct ipt_entry *e, struct net *net, const char *name) { struct xt_entry_target *t = ipt_get_target(e); struct xt_tgchk_param par = { .net = net, .table = name, .entryinfo = e, .target = t->u.kernel.target, .targinfo = t->data, .hook_mask = e->comefrom, .family = NFPROTO_IPV4, }; return xt_check_target(&par, t->u.target_size - sizeof(*t), e->ip.proto, e->ip.invflags & IPT_INV_PROTO); } static int find_check_entry(struct ipt_entry *e, struct net *net, const char *name, unsigned int size, struct xt_percpu_counter_alloc_state *alloc_state) { struct xt_entry_target *t; struct xt_target *target; int ret; unsigned int j; struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; if (!xt_percpu_counter_alloc(alloc_state, &e->counters)) return -ENOMEM; j = 0; memset(&mtpar, 0, sizeof(mtpar)); mtpar.net = net; mtpar.table = name; mtpar.entryinfo = &e->ip; mtpar.hook_mask = e->comefrom; mtpar.family = NFPROTO_IPV4; xt_ematch_foreach(ematch, e) { ret = find_check_match(ematch, &mtpar); if (ret != 0) goto cleanup_matches; ++j; } t = ipt_get_target(e); target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { ret = PTR_ERR(target); goto cleanup_matches; } t->u.kernel.target = target; ret = check_target(e, net, name); if (ret) goto err; return 0; err: module_put(t->u.kernel.target->me); cleanup_matches: xt_ematch_foreach(ematch, e) { if (j-- == 0) break; cleanup_match(ematch, net); } xt_percpu_counter_free(&e->counters); return ret; } static bool check_underflow(const struct ipt_entry *e) { const struct xt_entry_target *t; unsigned int verdict; if (!unconditional(e)) return false; t = ipt_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) return false; verdict = ((struct xt_standard_target *)t)->verdict; verdict = -verdict - 1; return verdict == NF_DROP || verdict == NF_ACCEPT; } static int check_entry_size_and_hooks(struct ipt_entry *e, struct xt_table_info *newinfo, const unsigned char *base, const unsigned char *limit, const unsigned int *hook_entries, const unsigned int *underflows, unsigned int valid_hooks) { unsigned int h; int err; if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 || (unsigned char *)e + sizeof(struct ipt_entry) >= limit || (unsigned char *)e + e->next_offset > limit) return -EINVAL; if (e->next_offset < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) return -EINVAL; if 
(!ip_checkentry(&e->ip)) return -EINVAL; err = xt_check_entry_offsets(e, e->elems, e->target_offset, e->next_offset); if (err) return err; /* Check hooks & underflows */ for (h = 0; h < NF_INET_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) continue; if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) return -EINVAL; newinfo->underflow[h] = underflows[h]; } } /* Clear counters and comefrom */ e->counters = ((struct xt_counters) { 0, 0 }); e->comefrom = 0; return 0; } static void cleanup_entry(struct ipt_entry *e, struct net *net) { struct xt_tgdtor_param par; struct xt_entry_target *t; struct xt_entry_match *ematch; /* Cleanup all matches */ xt_ematch_foreach(ematch, e) cleanup_match(ematch, net); t = ipt_get_target(e); par.net = net; par.target = t->u.kernel.target; par.targinfo = t->data; par.family = NFPROTO_IPV4; if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); xt_percpu_counter_free(&e->counters); } /* Checks and translates the user-supplied table segment (held in newinfo) */ static int translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, const struct ipt_replace *repl) { struct xt_percpu_counter_alloc_state alloc_state = { 0 }; struct ipt_entry *iter; unsigned int *offsets; unsigned int i; int ret = 0; newinfo->size = repl->size; newinfo->number = repl->num_entries; /* Init all hooks to impossible value. */ for (i = 0; i < NF_INET_NUMHOOKS; i++) { newinfo->hook_entry[i] = 0xFFFFFFFF; newinfo->underflow[i] = 0xFFFFFFFF; } offsets = xt_alloc_entry_offsets(newinfo->number); if (!offsets) return -ENOMEM; i = 0; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter, entry0, newinfo->size) { ret = check_entry_size_and_hooks(iter, newinfo, entry0, entry0 + repl->size, repl->hook_entry, repl->underflow, repl->valid_hooks); if (ret != 0) goto out_free; if (i < repl->num_entries) offsets[i] = (void *)iter - entry0; ++i; if (strcmp(ipt_get_target(iter)->u.user.name, XT_ERROR_TARGET) == 0) ++newinfo->stacksize; } ret = -EINVAL; if (i != repl->num_entries) goto out_free; ret = xt_check_table_hooks(newinfo, repl->valid_hooks); if (ret) goto out_free; if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { ret = -ELOOP; goto out_free; } kvfree(offsets); /* Finally, each sanity check must pass */ i = 0; xt_entry_foreach(iter, entry0, newinfo->size) { ret = find_check_entry(iter, net, repl->name, repl->size, &alloc_state); if (ret != 0) break; ++i; } if (ret != 0) { xt_entry_foreach(iter, entry0, newinfo->size) { if (i-- == 0) break; cleanup_entry(iter, net); } return ret; } return ret; out_free: kvfree(offsets); return ret; } static void get_counters(const struct xt_table_info *t, struct xt_counters counters[]) { struct ipt_entry *iter; unsigned int cpu; unsigned int i; for_each_possible_cpu(cpu) { seqcount_t *s = &per_cpu(xt_recseq, cpu); i = 0; xt_entry_foreach(iter, t->entries, t->size) { struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; tmp = xt_get_per_cpu_counter(&iter->counters, cpu); do { start = read_seqcount_begin(s); bcnt = tmp->bcnt; pcnt = tmp->pcnt; } while (read_seqcount_retry(s, start)); ADD_COUNTER(counters[i], bcnt, pcnt); ++i; /* macro does multi eval of i */ cond_resched(); } } } static void get_old_counters(const struct xt_table_info *t, struct xt_counters counters[]) { struct ipt_entry *iter; unsigned int cpu, i; for_each_possible_cpu(cpu) { i = 0; 
xt_entry_foreach(iter, t->entries, t->size) { const struct xt_counters *tmp; tmp = xt_get_per_cpu_counter(&iter->counters, cpu); ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt); ++i; /* macro does multi eval of i */ } cond_resched(); } } static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; const struct xt_table_info *private = table->private; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care about). */ countersize = sizeof(struct xt_counters) * private->number; counters = vzalloc(countersize); if (counters == NULL) return ERR_PTR(-ENOMEM); get_counters(private, counters); return counters; } static int copy_entries_to_user(unsigned int total_size, const struct xt_table *table, void __user *userptr) { unsigned int off, num; const struct ipt_entry *e; struct xt_counters *counters; const struct xt_table_info *private = table->private; int ret = 0; const void *loc_cpu_entry; counters = alloc_counters(table); if (IS_ERR(counters)) return PTR_ERR(counters); loc_cpu_entry = private->entries; /* FIXME: use iterator macros --RR */ /* ... then go back and fix counters and names */ for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ unsigned int i; const struct xt_entry_match *m; const struct xt_entry_target *t; e = loc_cpu_entry + off; if (copy_to_user(userptr + off, e, sizeof(*e))) { ret = -EFAULT; goto free_counters; } if (copy_to_user(userptr + off + offsetof(struct ipt_entry, counters), &counters[num], sizeof(counters[num])) != 0) { ret = -EFAULT; goto free_counters; } for (i = sizeof(struct ipt_entry); i < e->target_offset; i += m->u.match_size) { m = (void *)e + i; if (xt_match_to_user(m, userptr + off + i)) { ret = -EFAULT; goto free_counters; } } t = ipt_get_target_c(e); if (xt_target_to_user(t, userptr + off + e->target_offset)) { ret = -EFAULT; goto free_counters; } } free_counters: vfree(counters); return ret; } #ifdef CONFIG_NETFILTER_XTABLES_COMPAT static void compat_standard_from_user(void *dst, const void *src) { int v = *(compat_int_t *)src; if (v > 0) v += xt_compat_calc_jump(AF_INET, v); memcpy(dst, &v, sizeof(v)); } static int compat_standard_to_user(void __user *dst, const void *src) { compat_int_t cv = *(int *)src; if (cv > 0) cv -= xt_compat_calc_jump(AF_INET, cv); return copy_to_user(dst, &cv, sizeof(cv)) ? 
-EFAULT : 0; } static int compat_calc_entry(const struct ipt_entry *e, const struct xt_table_info *info, const void *base, struct xt_table_info *newinfo) { const struct xt_entry_match *ematch; const struct xt_entry_target *t; unsigned int entry_offset; int off, i, ret; off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); entry_offset = (void *)e - base; xt_ematch_foreach(ematch, e) off += xt_compat_match_offset(ematch->u.kernel.match); t = ipt_get_target_c(e); off += xt_compat_target_offset(t->u.kernel.target); newinfo->size -= off; ret = xt_compat_add_offset(AF_INET, entry_offset, off); if (ret) return ret; for (i = 0; i < NF_INET_NUMHOOKS; i++) { if (info->hook_entry[i] && (e < (struct ipt_entry *)(base + info->hook_entry[i]))) newinfo->hook_entry[i] -= off; if (info->underflow[i] && (e < (struct ipt_entry *)(base + info->underflow[i]))) newinfo->underflow[i] -= off; } return 0; } static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { struct ipt_entry *iter; const void *loc_cpu_entry; int ret; if (!newinfo || !info) return -EINVAL; /* we dont care about newinfo->entries */ memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries; ret = xt_compat_init_offsets(AF_INET, info->number); if (ret) return ret; xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) return ret; } return 0; } #endif static int get_info(struct net *net, void __user *user, const int *len) { char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; int ret; if (*len != sizeof(struct ipt_getinfo)) return -EINVAL; if (copy_from_user(name, user, sizeof(name)) != 0) return -EFAULT; name[XT_TABLE_MAXNAMELEN-1] = '\0'; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_lock(AF_INET); #endif t = xt_request_find_table_lock(net, AF_INET, name); if (!IS_ERR(t)) { struct ipt_getinfo info; const struct xt_table_info *private = t->private; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct xt_table_info tmp; if (in_compat_syscall()) { ret = compat_table_info(private, &tmp); xt_compat_flush_offsets(AF_INET); private = &tmp; } #endif memset(&info, 0, sizeof(info)); info.valid_hooks = t->valid_hooks; memcpy(info.hook_entry, private->hook_entry, sizeof(info.hook_entry)); memcpy(info.underflow, private->underflow, sizeof(info.underflow)); info.num_entries = private->number; info.size = private->size; strscpy(info.name, name); if (copy_to_user(user, &info, *len) != 0) ret = -EFAULT; else ret = 0; xt_table_unlock(t); module_put(t->me); } else ret = PTR_ERR(t); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_unlock(AF_INET); #endif return ret; } static int get_entries(struct net *net, struct ipt_get_entries __user *uptr, const int *len) { int ret; struct ipt_get_entries get; struct xt_table *t; if (*len < sizeof(get)) return -EINVAL; if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; if (*len != sizeof(struct ipt_get_entries) + get.size) return -EINVAL; get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, AF_INET, get.name); if (!IS_ERR(t)) { const struct xt_table_info *private = t->private; if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); else ret = -EAGAIN; module_put(t->me); xt_table_unlock(t); } else ret = PTR_ERR(t); return ret; } static int __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct 
xt_table_info *newinfo, unsigned int num_counters, void __user *counters_ptr) { int ret; struct xt_table *t; struct xt_table_info *oldinfo; struct xt_counters *counters; struct ipt_entry *iter; counters = xt_counters_alloc(num_counters); if (!counters) { ret = -ENOMEM; goto out; } t = xt_request_find_table_lock(net, AF_INET, name); if (IS_ERR(t)) { ret = PTR_ERR(t); goto free_newinfo_counters_untrans; } /* You lied! */ if (valid_hooks != t->valid_hooks) { ret = -EINVAL; goto put_module; } oldinfo = xt_replace_table(t, num_counters, newinfo, &ret); if (!oldinfo) goto put_module; /* Update module usage count based on number of rules */ if ((oldinfo->number > oldinfo->initial_entries) || (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); if ((oldinfo->number > oldinfo->initial_entries) && (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); xt_table_unlock(t); get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ xt_entry_foreach(iter, oldinfo->entries, oldinfo->size) cleanup_entry(iter, net); xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, sizeof(struct xt_counters) * num_counters) != 0) { /* Silent error, can't fail, new table is already in place */ net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n"); } vfree(counters); return 0; put_module: module_put(t->me); xt_table_unlock(t); free_newinfo_counters_untrans: vfree(counters); out: return ret; } static int do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct ipt_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; struct ipt_entry *iter; if (len < sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; if ((u64)len < (u64)tmp.size + sizeof(tmp)) return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; loc_cpu_entry = newinfo->entries; if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } ret = translate_table(net, newinfo, loc_cpu_entry, &tmp); if (ret != 0) goto free_newinfo; ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, tmp.counters); if (ret) goto free_newinfo_untrans; return 0; free_newinfo_untrans: xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; } static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len) { unsigned int i; struct xt_counters_info tmp; struct xt_counters *paddc; struct xt_table *t; const struct xt_table_info *private; int ret = 0; struct ipt_entry *iter; unsigned int addend; paddc = xt_copy_counters(arg, len, &tmp); if (IS_ERR(paddc)) return PTR_ERR(paddc); t = xt_find_table_lock(net, AF_INET, tmp.name); if (IS_ERR(t)) { ret = PTR_ERR(t); goto free; } local_bh_disable(); private = t->private; if (private->number != tmp.num_counters) { ret = -EINVAL; goto unlock_up_free; } i = 0; addend = xt_write_recseq_begin(); xt_entry_foreach(iter, private->entries, private->size) { struct xt_counters *tmp; tmp = xt_get_this_cpu_counter(&iter->counters); ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt); ++i; } xt_write_recseq_end(addend); unlock_up_free: local_bh_enable(); xt_table_unlock(t); module_put(t->me); free: vfree(paddc); return ret; 
} #ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_ipt_replace { char name[XT_TABLE_MAXNAMELEN]; u32 valid_hooks; u32 num_entries; u32 size; u32 hook_entry[NF_INET_NUMHOOKS]; u32 underflow[NF_INET_NUMHOOKS]; u32 num_counters; compat_uptr_t counters; /* struct xt_counters * */ struct compat_ipt_entry entries[]; }; static int compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr, unsigned int *size, struct xt_counters *counters, unsigned int i) { struct xt_entry_target *t; struct compat_ipt_entry __user *ce; u_int16_t target_offset, next_offset; compat_uint_t origsize; const struct xt_entry_match *ematch; int ret = 0; origsize = *size; ce = *dstptr; if (copy_to_user(ce, e, sizeof(struct ipt_entry)) != 0 || copy_to_user(&ce->counters, &counters[i], sizeof(counters[i])) != 0) return -EFAULT; *dstptr += sizeof(struct compat_ipt_entry); *size -= sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); xt_ematch_foreach(ematch, e) { ret = xt_compat_match_to_user(ematch, dstptr, size); if (ret != 0) return ret; } target_offset = e->target_offset - (origsize - *size); t = ipt_get_target(e); ret = xt_compat_target_to_user(t, dstptr, size); if (ret) return ret; next_offset = e->next_offset - (origsize - *size); if (put_user(target_offset, &ce->target_offset) != 0 || put_user(next_offset, &ce->next_offset) != 0) return -EFAULT; return 0; } static int compat_find_calc_match(struct xt_entry_match *m, const struct ipt_ip *ip, int *size) { struct xt_match *match; match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name, m->u.user.revision); if (IS_ERR(match)) return PTR_ERR(match); m->u.kernel.match = match; *size += xt_compat_match_offset(match); return 0; } static void compat_release_entry(struct compat_ipt_entry *e) { struct xt_entry_target *t; struct xt_entry_match *ematch; /* Cleanup all matches */ xt_ematch_foreach(ematch, e) module_put(ematch->u.kernel.match->me); t = compat_ipt_get_target(e); module_put(t->u.kernel.target->me); } static int check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, struct xt_table_info *newinfo, unsigned int *size, const unsigned char *base, const unsigned char *limit) { struct xt_entry_match *ematch; struct xt_entry_target *t; struct xt_target *target; unsigned int entry_offset; unsigned int j; int ret, off; if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 || (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit || (unsigned char *)e + e->next_offset > limit) return -EINVAL; if (e->next_offset < sizeof(struct compat_ipt_entry) + sizeof(struct compat_xt_entry_target)) return -EINVAL; if (!ip_checkentry(&e->ip)) return -EINVAL; ret = xt_compat_check_entry_offsets(e, e->elems, e->target_offset, e->next_offset); if (ret) return ret; off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); entry_offset = (void *)e - (void *)base; j = 0; xt_ematch_foreach(ematch, e) { ret = compat_find_calc_match(ematch, &e->ip, &off); if (ret != 0) goto release_matches; ++j; } t = compat_ipt_get_target(e); target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { ret = PTR_ERR(target); goto release_matches; } t->u.kernel.target = target; off += xt_compat_target_offset(target); *size += off; ret = xt_compat_add_offset(AF_INET, entry_offset, off); if (ret) goto out; return 0; out: module_put(t->u.kernel.target->me); release_matches: xt_ematch_foreach(ematch, e) { if (j-- == 0) break; module_put(ematch->u.kernel.match->me); } return ret; } static void 
compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr, unsigned int *size, struct xt_table_info *newinfo, unsigned char *base) { struct xt_entry_target *t; struct ipt_entry *de; unsigned int origsize; int h; struct xt_entry_match *ematch; origsize = *size; de = *dstptr; memcpy(de, e, sizeof(struct ipt_entry)); memcpy(&de->counters, &e->counters, sizeof(e->counters)); *dstptr += sizeof(struct ipt_entry); *size += sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); xt_ematch_foreach(ematch, e) xt_compat_match_from_user(ematch, dstptr, size); de->target_offset = e->target_offset - (origsize - *size); t = compat_ipt_get_target(e); xt_compat_target_from_user(t, dstptr, size); de->next_offset = e->next_offset - (origsize - *size); for (h = 0; h < NF_INET_NUMHOOKS; h++) { if ((unsigned char *)de - base < newinfo->hook_entry[h]) newinfo->hook_entry[h] -= origsize - *size; if ((unsigned char *)de - base < newinfo->underflow[h]) newinfo->underflow[h] -= origsize - *size; } } static int translate_compat_table(struct net *net, struct xt_table_info **pinfo, void **pentry0, const struct compat_ipt_replace *compatr) { unsigned int i, j; struct xt_table_info *newinfo, *info; void *pos, *entry0, *entry1; struct compat_ipt_entry *iter0; struct ipt_replace repl; unsigned int size; int ret; info = *pinfo; entry0 = *pentry0; size = compatr->size; info->number = compatr->num_entries; j = 0; xt_compat_lock(AF_INET); ret = xt_compat_init_offsets(AF_INET, compatr->num_entries); if (ret) goto out_unlock; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, compatr->size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, entry0, entry0 + compatr->size); if (ret != 0) goto out_unlock; ++j; } ret = -EINVAL; if (j != compatr->num_entries) goto out_unlock; ret = -ENOMEM; newinfo = xt_alloc_table_info(size); if (!newinfo) goto out_unlock; memset(newinfo->entries, 0, size); newinfo->number = compatr->num_entries; for (i = 0; i < NF_INET_NUMHOOKS; i++) { newinfo->hook_entry[i] = compatr->hook_entry[i]; newinfo->underflow[i] = compatr->underflow[i]; } entry1 = newinfo->entries; pos = entry1; size = compatr->size; xt_entry_foreach(iter0, entry0, compatr->size) compat_copy_entry_from_user(iter0, &pos, &size, newinfo, entry1); /* all module references in entry0 are now gone. * entry1/newinfo contains a 64bit ruleset that looks exactly as * generated by 64bit userspace. * * Call standard translate_table() to validate all hook_entrys, * underflows, check for loops, etc. 
*/ xt_compat_flush_offsets(AF_INET); xt_compat_unlock(AF_INET); memcpy(&repl, compatr, sizeof(*compatr)); for (i = 0; i < NF_INET_NUMHOOKS; i++) { repl.hook_entry[i] = newinfo->hook_entry[i]; repl.underflow[i] = newinfo->underflow[i]; } repl.num_counters = 0; repl.counters = NULL; repl.size = newinfo->size; ret = translate_table(net, newinfo, entry1, &repl); if (ret) goto free_newinfo; *pinfo = newinfo; *pentry0 = entry1; xt_free_table_info(info); return 0; free_newinfo: xt_free_table_info(newinfo); return ret; out_unlock: xt_compat_flush_offsets(AF_INET); xt_compat_unlock(AF_INET); xt_entry_foreach(iter0, entry0, compatr->size) { if (j-- == 0) break; compat_release_entry(iter0); } return ret; } static int compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct compat_ipt_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; struct ipt_entry *iter; if (len < sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; if ((u64)len < (u64)tmp.size + sizeof(tmp)) return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; loc_cpu_entry = newinfo->entries; if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } ret = translate_compat_table(net, &newinfo, &loc_cpu_entry, &tmp); if (ret != 0) goto free_newinfo; ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, compat_ptr(tmp.counters)); if (ret) goto free_newinfo_untrans; return 0; free_newinfo_untrans: xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; } struct compat_ipt_get_entries { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; struct compat_ipt_entry entrytable[]; }; static int compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *userptr) { struct xt_counters *counters; const struct xt_table_info *private = table->private; void __user *pos; unsigned int size; int ret = 0; unsigned int i = 0; struct ipt_entry *iter; counters = alloc_counters(table); if (IS_ERR(counters)) return PTR_ERR(counters); pos = userptr; size = total_size; xt_entry_foreach(iter, private->entries, total_size) { ret = compat_copy_entry_to_user(iter, &pos, &size, counters, i++); if (ret != 0) break; } vfree(counters); return ret; } static int compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, int *len) { int ret; struct compat_ipt_get_entries get; struct xt_table *t; if (*len < sizeof(get)) return -EINVAL; if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; if (*len != sizeof(struct compat_ipt_get_entries) + get.size) return -EINVAL; get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(AF_INET); t = xt_find_table_lock(net, AF_INET, get.name); if (!IS_ERR(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; ret = compat_table_info(private, &info); if (!ret && get.size == info.size) ret = compat_copy_entries_to_user(private->size, t, uptr->entrytable); else if (!ret) ret = -EAGAIN; xt_compat_flush_offsets(AF_INET); module_put(t->me); xt_table_unlock(t); } else ret = PTR_ERR(t); xt_compat_unlock(AF_INET); return ret; } #endif static int do_ipt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len) { int 
ret; if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { case IPT_SO_SET_REPLACE: #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_do_replace(sock_net(sk), arg, len); else #endif ret = do_replace(sock_net(sk), arg, len); break; case IPT_SO_SET_ADD_COUNTERS: ret = do_add_counters(sock_net(sk), arg, len); break; default: ret = -EINVAL; } return ret; } static int do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { int ret; if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { case IPT_SO_GET_INFO: ret = get_info(sock_net(sk), user, len); break; case IPT_SO_GET_ENTRIES: #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_get_entries(sock_net(sk), user, len); else #endif ret = get_entries(sock_net(sk), user, len); break; case IPT_SO_GET_REVISION_MATCH: case IPT_SO_GET_REVISION_TARGET: { struct xt_get_revision rev; int target; if (*len != sizeof(rev)) { ret = -EINVAL; break; } if (copy_from_user(&rev, user, sizeof(rev)) != 0) { ret = -EFAULT; break; } rev.name[sizeof(rev.name)-1] = 0; if (cmd == IPT_SO_GET_REVISION_TARGET) target = 1; else target = 0; try_then_request_module(xt_find_revision(AF_INET, rev.name, rev.revision, target, &ret), "ipt_%s", rev.name); break; } default: ret = -EINVAL; } return ret; } static void __ipt_unregister_table(struct net *net, struct xt_table *table) { struct xt_table_info *private; void *loc_cpu_entry; struct module *table_owner = table->me; struct ipt_entry *iter; private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) cleanup_entry(iter, net); if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); } int ipt_register_table(struct net *net, const struct xt_table *table, const struct ipt_replace *repl, const struct nf_hook_ops *template_ops) { struct nf_hook_ops *ops; unsigned int num_ops; int ret, i; struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; void *loc_cpu_entry; struct xt_table *new_table; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) return -ENOMEM; loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(net, newinfo, loc_cpu_entry, repl); if (ret != 0) { xt_free_table_info(newinfo); return ret; } new_table = xt_register_table(net, table, &bootstrap, newinfo); if (IS_ERR(new_table)) { struct ipt_entry *iter; xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) cleanup_entry(iter, net); xt_free_table_info(newinfo); return PTR_ERR(new_table); } /* No template? No need to do anything. This is used by 'nat' table, it registers * with the nat core instead of the netfilter core. 
*/ if (!template_ops) return 0; num_ops = hweight32(table->valid_hooks); if (num_ops == 0) { ret = -EINVAL; goto out_free; } ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); if (!ops) { ret = -ENOMEM; goto out_free; } for (i = 0; i < num_ops; i++) ops[i].priv = new_table; new_table->ops = ops; ret = nf_register_net_hooks(net, ops, num_ops); if (ret != 0) goto out_free; return ret; out_free: __ipt_unregister_table(net, new_table); return ret; } void ipt_unregister_table_pre_exit(struct net *net, const char *name) { struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name); if (table) nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); } void ipt_unregister_table_exit(struct net *net, const char *name) { struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name); if (table) __ipt_unregister_table(net, table); } static struct xt_target ipt_builtin_tg[] __read_mostly = { { .name = XT_STANDARD_TARGET, .targetsize = sizeof(int), .family = NFPROTO_IPV4, #ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(compat_int_t), .compat_from_user = compat_standard_from_user, .compat_to_user = compat_standard_to_user, #endif }, { .name = XT_ERROR_TARGET, .target = ipt_error, .targetsize = XT_FUNCTION_MAXNAMELEN, .family = NFPROTO_IPV4, }, }; static struct nf_sockopt_ops ipt_sockopts = { .pf = PF_INET, .set_optmin = IPT_BASE_CTL, .set_optmax = IPT_SO_SET_MAX+1, .set = do_ipt_set_ctl, .get_optmin = IPT_BASE_CTL, .get_optmax = IPT_SO_GET_MAX+1, .get = do_ipt_get_ctl, .owner = THIS_MODULE, }; static int __net_init ip_tables_net_init(struct net *net) { return xt_proto_init(net, NFPROTO_IPV4); } static void __net_exit ip_tables_net_exit(struct net *net) { xt_proto_fini(net, NFPROTO_IPV4); } static struct pernet_operations ip_tables_net_ops = { .init = ip_tables_net_init, .exit = ip_tables_net_exit, }; static int __init ip_tables_init(void) { int ret; ret = register_pernet_subsys(&ip_tables_net_ops); if (ret < 0) goto err1; /* No one else will be downing sem now, so we won't sleep */ ret = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg)); if (ret < 0) goto err2; /* Register setsockopt */ ret = nf_register_sockopt(&ipt_sockopts); if (ret < 0) goto err4; return 0; err4: xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg)); err2: unregister_pernet_subsys(&ip_tables_net_ops); err1: return ret; } static void __exit ip_tables_fini(void) { nf_unregister_sockopt(&ipt_sockopts); xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg)); unregister_pernet_subsys(&ip_tables_net_ops); } EXPORT_SYMBOL(ipt_register_table); EXPORT_SYMBOL(ipt_unregister_table_pre_exit); EXPORT_SYMBOL(ipt_unregister_table_exit); EXPORT_SYMBOL(ipt_do_table); module_init(ip_tables_init); module_exit(ip_tables_fini); |
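/*
 * Illustrative aside, not part of the kernel source above: the get_info() and
 * get_entries() handlers reached through do_ipt_get_ctl() are what userspace
 * hits when it snapshots a table over getsockopt(). Below is a minimal
 * userspace sketch of that two-step snapshot, assuming the uapi
 * <linux/netfilter_ipv4/ip_tables.h> layout and a process with CAP_NET_ADMIN;
 * dump_table() is a hypothetical helper and error handling is trimmed.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/netfilter_ipv4/ip_tables.h>

static int dump_table(const char *name)
{
        struct ipt_get_entries *entries;
        struct ipt_getinfo info;
        socklen_t len;
        int fd;

        fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
        if (fd < 0)
                return -1;

        /* Step 1: get_info() insists on exactly sizeof(struct ipt_getinfo). */
        memset(&info, 0, sizeof(info));
        strncpy(info.name, name, sizeof(info.name) - 1);
        len = sizeof(info);
        if (getsockopt(fd, IPPROTO_IP, IPT_SO_GET_INFO, &info, &len) < 0)
                goto err;

        /* Step 2: get_entries() wants sizeof(*entries) + info.size exactly;
         * if the table was replaced in between, the kernel returns -EAGAIN. */
        len = sizeof(*entries) + info.size;
        entries = calloc(1, len);
        if (!entries)
                goto err;
        strncpy(entries->name, name, sizeof(entries->name) - 1);
        entries->size = info.size;
        if (getsockopt(fd, IPPROTO_IP, IPT_SO_GET_ENTRIES, entries, &len) < 0) {
                free(entries);
                goto err;
        }

        printf("%s: %u entries, %u bytes of rules\n",
               info.name, info.num_entries, info.size);
        free(entries);
        close(fd);
        return 0;
err:
        close(fd);
        return -1;
}

/* Example use: dump_table("filter"); */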
// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * The Internet Protocol (IP) module. * * Authors: Ross Biro * Fred N.
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Donald Becker, <becker@super.org> * Alan Cox, <alan@lxorguk.ukuu.org.uk> * Richard Underwood * Stefan Becker, <stefanb@yello.ping.de> * Jorge Cwik, <jorge@laser.satlink.net> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * * Fixes: * Alan Cox : Commented a couple of minor bits of surplus code * Alan Cox : Undefining IP_FORWARD doesn't include the code * (just stops a compiler warning). * Alan Cox : Frames with >=MAX_ROUTE record routes, strict routes or loose routes * are junked rather than corrupting things. * Alan Cox : Frames to bad broadcast subnets are dumped * We used to process them non broadcast and * boy could that cause havoc. * Alan Cox : ip_forward sets the free flag on the * new frame it queues. Still crap because * it copies the frame but at least it * doesn't eat memory too. * Alan Cox : Generic queue code and memory fixes. * Fred Van Kempen : IP fragment support (borrowed from NET2E) * Gerhard Koerting: Forward fragmented frames correctly. * Gerhard Koerting: Fixes to my fix of the above 8-). * Gerhard Koerting: IP interface addressing fix. * Linus Torvalds : More robustness checks * Alan Cox : Even more checks: Still not as robust as it ought to be * Alan Cox : Save IP header pointer for later * Alan Cox : ip option setting * Alan Cox : Use ip_tos/ip_ttl settings * Alan Cox : Fragmentation bogosity removed * (Thanks to Mark.Bush@prg.ox.ac.uk) * Dmitry Gorodchanin : Send of a raw packet crash fix. * Alan Cox : Silly ip bug when an overlength * fragment turns up. Now frees the * queue. * Linus Torvalds/ : Memory leakage on fragmentation * Alan Cox : handling. * Gerhard Koerting: Forwarding uses IP priority hints * Teemu Rantanen : Fragment problems. * Alan Cox : General cleanup, comments and reformat * Alan Cox : SNMP statistics * Alan Cox : BSD address rule semantics. Also see * UDP as there is a nasty checksum issue * if you do things the wrong way. * Alan Cox : Always defrag, moved IP_FORWARD to the config.in file * Alan Cox : IP options adjust sk->priority. * Pedro Roque : Fix mtu/length error in ip_forward. * Alan Cox : Avoid ip_chk_addr when possible. * Richard Underwood : IP multicasting. * Alan Cox : Cleaned up multicast handlers. * Alan Cox : RAW sockets demultiplex in the BSD style. * Gunther Mayer : Fix the SNMP reporting typo * Alan Cox : Always in group 224.0.0.1 * Pauline Middelink : Fast ip_checksum update when forwarding * Masquerading support. * Alan Cox : Multicast loopback error for 224.0.0.1 * Alan Cox : IP_MULTICAST_LOOP option. * Alan Cox : Use notifiers. * Bjorn Ekwall : Removed ip_csum (from slhc.c too) * Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!) * Stefan Becker : Send out ICMP HOST REDIRECT * Arnt Gulbrandsen : ip_build_xmit * Alan Cox : Per socket routing cache * Alan Cox : Fixed routing cache, added header cache. * Alan Cox : Loopback didn't work right in original ip_build_xmit - fixed it. * Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net. * Alan Cox : Incoming IP option handling. * Alan Cox : Set saddr on raw output frames as per BSD. * Alan Cox : Stopped broadcast source route explosions. * Alan Cox : Can disable source routing * Takeshi Sone : Masquerading didn't work. * Dave Bonn,Alan Cox : Faster IP forwarding whenever possible. * Alan Cox : Memory leaks, tramples, misc debugging. 
* Alan Cox : Fixed multicast (by popular demand 8)) * Alan Cox : Fixed forwarding (by even more popular demand 8)) * Alan Cox : Fixed SNMP statistics [I think] * Gerhard Koerting : IP fragmentation forwarding fix * Alan Cox : Device lock against page fault. * Alan Cox : IP_HDRINCL facility. * Werner Almesberger : Zero fragment bug * Alan Cox : RAW IP frame length bug * Alan Cox : Outgoing firewall on build_xmit * A.N.Kuznetsov : IP_OPTIONS support throughout the kernel * Alan Cox : Multicast routing hooks * Jos Vos : Do accounting *before* call_in_firewall * Willy Konynenberg : Transparent proxying support * * To Fix: * IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient * and could be made very efficient with the addition of some virtual memory hacks to permit * the allocation of a buffer that can then be 'grown' by twiddling page tables. * Output fragmentation wants updating along with the buffer management to use a single * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause * fragmentation anyway. */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/net.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/indirect_call_wrapper.h> #include <net/snmp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/arp.h> #include <net/icmp.h> #include <net/raw.h> #include <net/checksum.h> #include <net/inet_ecn.h> #include <linux/netfilter_ipv4.h> #include <net/xfrm.h> #include <linux/mroute.h> #include <linux/netlink.h> #include <net/dst_metadata.h> #include <net/udp.h> #include <net/tcp.h> /* * Process Router Attention IP option (RFC 2113) */ bool ip_call_ra_chain(struct sk_buff *skb) { struct ip_ra_chain *ra; u8 protocol = ip_hdr(skb)->protocol; struct sock *last = NULL; struct net_device *dev = skb->dev; struct net *net = dev_net(dev); for (ra = rcu_dereference(net->ipv4.ra_chain); ra; ra = rcu_dereference(ra->next)) { struct sock *sk = ra->sk; /* If socket is bound to an interface, only report * the packet if it came from that interface. 
*/ if (sk && inet_sk(sk)->inet_num == protocol && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dev->ifindex)) { if (ip_is_fragment(ip_hdr(skb))) { if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN)) return true; } if (last) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) raw_rcv(last, skb2); } last = sk; } } if (last) { raw_rcv(last, skb); return true; } return false; } INDIRECT_CALLABLE_DECLARE(int udp_rcv(struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int tcp_v4_rcv(struct sk_buff *)); void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol) { const struct net_protocol *ipprot; int raw, ret; resubmit: raw = raw_local_deliver(skb, protocol); ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot) { if (!ipprot->no_policy) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { kfree_skb_reason(skb, SKB_DROP_REASON_XFRM_POLICY); return; } nf_reset_ct(skb); } ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv, skb); if (ret < 0) { protocol = -ret; goto resubmit; } __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); } else { if (!raw) { if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); } kfree_skb_reason(skb, SKB_DROP_REASON_IP_NOPROTO); } else { __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); consume_skb(skb); } } } static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) { __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); kfree_skb_reason(skb, SKB_DROP_REASON_NOMEM); return 0; } skb_clear_delivery_time(skb); __skb_pull(skb, skb_network_header_len(skb)); rcu_read_lock(); ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol); rcu_read_unlock(); return 0; } /* * Deliver IP Packets to the higher protocol layers. */ int ip_local_deliver(struct sk_buff *skb) { /* * Reassemble IP fragments. */ struct net *net = dev_net(skb->dev); if (ip_is_fragment(ip_hdr(skb))) { if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER)) return 0; } return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, net, NULL, skb, skb->dev, NULL, ip_local_deliver_finish); } EXPORT_SYMBOL(ip_local_deliver); static inline enum skb_drop_reason ip_rcv_options(struct sk_buff *skb, struct net_device *dev) { const struct iphdr *iph; struct ip_options *opt; /* It looks as overkill, because not all IP options require packet mangling. But it is the easiest for now, especially taking into account that combination of IP options and running sniffer is extremely rare condition. 
--ANK (980813) */ if (skb_cow(skb, skb_headroom(skb))) { __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INDISCARDS); return SKB_DROP_REASON_NOMEM; } iph = ip_hdr(skb); opt = &(IPCB(skb)->opt); opt->optlen = iph->ihl*4 - sizeof(struct iphdr); if (ip_options_compile(dev_net(dev), opt, skb)) { __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INHDRERRORS); return SKB_DROP_REASON_IP_INHDR; } if (unlikely(opt->srr)) { struct in_device *in_dev = __in_dev_get_rcu(dev); if (in_dev) { if (!IN_DEV_SOURCE_ROUTE(in_dev)) { if (IN_DEV_LOG_MARTIANS(in_dev)) net_info_ratelimited("source route option %pI4 -> %pI4\n", &iph->saddr, &iph->daddr); return SKB_DROP_REASON_NOT_SPECIFIED; } } if (ip_options_rcv_srr(skb, dev)) return SKB_DROP_REASON_NOT_SPECIFIED; } return SKB_NOT_DROPPED_YET; } static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, const struct sk_buff *hint) { return hint && !skb_dst(skb) && ip_hdr(hint)->daddr == iph->daddr && ip_hdr(hint)->tos == iph->tos; } static int tcp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net_rcu(skb->dev); const struct iphdr *iph; const struct tcphdr *th; struct sock *sk; if (skb->pkt_type != PACKET_HOST) return 0; if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) return 0; iph = ip_hdr(skb); th = tcp_hdr(skb); if (th->doff < sizeof(struct tcphdr) / 4) return 0; sk = __inet_lookup_established(net, iph->saddr, th->source, iph->daddr, ntohs(th->dest), skb->skb_iif, inet_sdif(skb)); if (sk) { skb->sk = sk; skb->destructor = sock_edemux; if (sk_fullsock(sk)) { struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); if (dst) dst = dst_check(dst, 0); if (dst && sk->sk_rx_dst_ifindex == skb->skb_iif) skb_dst_set_noref(skb, dst); } } return 0; } static int ip_rcv_finish_core(struct net *net, struct sk_buff *skb, struct net_device *dev, const struct sk_buff *hint) { const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; int drop_reason; if (ip_can_use_hint(skb, iph, hint)) { drop_reason = ip_route_use_hint(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), dev, hint); if (unlikely(drop_reason)) goto drop_error; } if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) && !skb_dst(skb) && !skb->sk && !ip_is_fragment(iph)) { switch (iph->protocol) { case IPPROTO_TCP: if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) { tcp_v4_early_demux(skb); /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } break; case IPPROTO_UDP: if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) { drop_reason = udp_v4_early_demux(skb); if (unlikely(drop_reason)) goto drop_error; /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } break; } } /* * Initialise the virtual path cache for the packet. It describes * how the packet travels inside Linux networking. 
*/ if (!skb_valid_dst(skb)) { drop_reason = ip_route_input_noref(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), dev); if (unlikely(drop_reason)) goto drop_error; } else { struct in_device *in_dev = __in_dev_get_rcu(dev); if (in_dev && IN_DEV_ORCONF(in_dev, NOPOLICY)) IPCB(skb)->flags |= IPSKB_NOPOLICY; } #ifdef CONFIG_IP_ROUTE_CLASSID if (unlikely(skb_dst(skb)->tclassid)) { struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct); u32 idx = skb_dst(skb)->tclassid; st[idx&0xFF].o_packets++; st[idx&0xFF].o_bytes += skb->len; st[(idx>>16)&0xFF].i_packets++; st[(idx>>16)&0xFF].i_bytes += skb->len; } #endif if (iph->ihl > 5) { drop_reason = ip_rcv_options(skb, dev); if (drop_reason) goto drop; } rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { __IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) { __IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len); } else if (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) { struct in_device *in_dev = __in_dev_get_rcu(dev); /* RFC 1122 3.3.6: * * When a host sends a datagram to a link-layer broadcast * address, the IP destination address MUST be a legal IP * broadcast or IP multicast address. * * A host SHOULD silently discard a datagram that is received * via a link-layer broadcast (see Section 2.4) but does not * specify an IP multicast or broadcast destination address. * * This doesn't explicitly say L2 *broadcast*, but broadcast is * in a way a form of multicast and the most common use case for * this is 802.11 protecting against cross-station spoofing (the * so-called "hole-196" attack) so do it for both. */ if (in_dev && IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) { drop_reason = SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST; goto drop; } } return NET_RX_SUCCESS; drop: kfree_skb_reason(skb, drop_reason); return NET_RX_DROP; drop_error: if (drop_reason == SKB_DROP_REASON_IP_RPFILTER) __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); goto drop; } static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb->dev; int ret; /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip_rcv(skb); if (!skb) return NET_RX_SUCCESS; ret = ip_rcv_finish_core(net, skb, dev, NULL); if (ret != NET_RX_DROP) ret = dst_input(skb); return ret; } /* * Main IP Receive routine. */ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) { const struct iphdr *iph; int drop_reason; u32 len; /* When the interface is in promisc. mode, drop all the crap * that it receives, do not try to analyse it. */ if (skb->pkt_type == PACKET_OTHERHOST) { dev_core_stats_rx_otherhost_dropped_inc(skb->dev); drop_reason = SKB_DROP_REASON_OTHERHOST; goto drop; } __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len); skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) { __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto out; } drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto inhdr_error; iph = ip_hdr(skb); /* * RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum. * * Is the datagram acceptable? * * 1. Length at least the size of an ip header * 2. Version of 4 * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums] * 4. 
Doesn't have a bogus length */ if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1); BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0); BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE); __IP_ADD_STATS(net, IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK), max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error; iph = ip_hdr(skb); if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto csum_error; len = iph_totlen(skb, iph); if (skb->len < len) { drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error; /* Our transport medium may have padded the buffer out. Now we know it * is IP we can trim to the true length of the frame. * Note this now means skb->len holds ntohs(iph->tot_len). */ if (pskb_trim_rcsum(skb, len)) { __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto drop; } iph = ip_hdr(skb); skb->transport_header = skb->network_header + iph->ihl*4; /* Remove any debris in the socket control block */ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); IPCB(skb)->iif = skb->skb_iif; /* Must drop socket now because of tproxy. */ if (!skb_sk_is_prefetched(skb)) skb_orphan(skb); return skb; csum_error: drop_reason = SKB_DROP_REASON_IP_CSUM; __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS); inhdr_error: if (drop_reason == SKB_DROP_REASON_NOT_SPECIFIED) drop_reason = SKB_DROP_REASON_IP_INHDR; __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); drop: kfree_skb_reason(skb, drop_reason); out: return NULL; } /* * IP receive entry point */ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct net *net = dev_net(dev); skb = ip_rcv_core(skb, net); if (skb == NULL) return NET_RX_DROP; return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, skb, dev, NULL, ip_rcv_finish); } static void ip_sublist_rcv_finish(struct list_head *head) { struct sk_buff *skb, *next; list_for_each_entry_safe(skb, next, head, list) { skb_list_del_init(skb); dst_input(skb); } } static struct sk_buff *ip_extract_route_hint(const struct net *net, struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); if (fib4_has_custom_rules(net) || ipv4_is_lbcast(iph->daddr) || ipv4_is_zeronet(iph->daddr) || IPCB(skb)->flags & IPSKB_MULTIPATH) return NULL; return skb; } static void ip_list_rcv_finish(struct net *net, struct list_head *head) { struct sk_buff *skb, *next, *hint = NULL; struct dst_entry *curr_dst = NULL; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *dev = skb->dev; struct dst_entry *dst; skb_list_del_init(skb); /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip_rcv(skb); if (!skb) continue; if (ip_rcv_finish_core(net, skb, dev, hint) == NET_RX_DROP) continue; dst = skb_dst(skb); if (curr_dst != dst) { hint = ip_extract_route_hint(net, skb); /* dispatch old sublist */ if (!list_empty(&sublist)) ip_sublist_rcv_finish(&sublist); /* start new sublist */ INIT_LIST_HEAD(&sublist); curr_dst = dst; } list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ ip_sublist_rcv_finish(&sublist); } static void ip_sublist_rcv(struct list_head *head, struct net_device *dev, struct net *net) { NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, head, dev, NULL, ip_rcv_finish); 
ip_list_rcv_finish(net, head); } /* Receive a list of IP packets */ void ip_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *orig_dev) { struct net_device *curr_dev = NULL; struct net *curr_net = NULL; struct sk_buff *skb, *next; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *dev = skb->dev; struct net *net = dev_net(dev); skb_list_del_init(skb); skb = ip_rcv_core(skb, net); if (skb == NULL) continue; if (curr_dev != dev || curr_net != net) { /* dispatch old sublist */ if (!list_empty(&sublist)) ip_sublist_rcv(&sublist, curr_dev, curr_net); /* start new sublist */ INIT_LIST_HEAD(&sublist); curr_dev = dev; curr_net = net; } list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ if (!list_empty(&sublist)) ip_sublist_rcv(&sublist, curr_dev, curr_net); } |
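/*
 * Illustrative aside, not part of the kernel source above: ip_rcv_core()
 * enforces the RFC 1122 header checks enumerated in its comment (minimum
 * length, version 4, ihl, header checksum, sane total length) before a
 * packet is allowed further up the stack. The standalone sketch below
 * restates those checks over a raw buffer; ipv4_header_ok() and csum16()
 * are hypothetical helpers written for illustration and use the plain
 * one's-complement sum rather than the kernel's optimised ip_fast_csum().
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* One's complement sum of the header taken as big-endian 16-bit words. */
static uint16_t csum16(const uint8_t *p, size_t len)
{
        uint32_t sum = 0;
        size_t i;

        for (i = 0; i + 1 < len; i += 2)
                sum += ((uint32_t)p[i] << 8) | p[i + 1];
        if (len & 1)
                sum += (uint32_t)p[len - 1] << 8;
        while (sum >> 16)       /* fold carries back into 16 bits */
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}

static bool ipv4_header_ok(const uint8_t *pkt, size_t caplen)
{
        uint8_t version, ihl;
        uint16_t tot_len;

        /* 1. Length at least the size of an IP header. */
        if (caplen < 20)
                return false;
        version = pkt[0] >> 4;
        ihl = pkt[0] & 0x0f;
        /* 2. Version of 4, and ihl covers at least the fixed header. */
        if (version != 4 || ihl < 5 || caplen < (size_t)ihl * 4)
                return false;
        /* 3. Checksums correctly: a valid header sums to all-ones. */
        if (csum16(pkt, (size_t)ihl * 4) != 0)
                return false;
        /* 4. Doesn't have a bogus length. */
        tot_len = (uint16_t)(((uint16_t)pkt[2] << 8) | pkt[3]);
        if (tot_len < (uint16_t)(ihl * 4) || tot_len > caplen)
                return false;
        return true;
}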
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef BLK_MQ_SCHED_H #define BLK_MQ_SCHED_H #include "elevator.h" #include "blk-mq.h" #define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ) bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **merged_request); bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, struct list_head *free); void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e, struct elevator_resources *res); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); void blk_mq_sched_free_rqs(struct request_queue *q); struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, unsigned int nr_hw_queues, unsigned int nr_requests); int blk_mq_alloc_sched_res(struct request_queue *q, struct elevator_type *type, struct elevator_resources *res, unsigned int nr_hw_queues); int blk_mq_alloc_sched_res_batch(struct xarray *elv_tbl, struct blk_mq_tag_set *set, unsigned int nr_hw_queues); int blk_mq_alloc_sched_ctx_batch(struct xarray *elv_tbl, struct blk_mq_tag_set *set); void blk_mq_free_sched_ctx_batch(struct xarray *elv_tbl); void blk_mq_free_sched_tags(struct elevator_tags *et, struct blk_mq_tag_set *set); void blk_mq_free_sched_res(struct elevator_resources *res, struct elevator_type *type, struct blk_mq_tag_set *set); void blk_mq_free_sched_res_batch(struct xarray *et_table, struct blk_mq_tag_set *set); /* * blk_mq_alloc_sched_data() - Allocates scheduler specific data * Returns: * - Pointer to allocated data on success * - NULL if no allocation needed * - ERR_PTR(-ENOMEM) in case of failure */ static inline void *blk_mq_alloc_sched_data(struct request_queue *q, struct elevator_type *e) { void *sched_data; if (!e || !e->ops.alloc_sched_data) return NULL; sched_data = e->ops.alloc_sched_data(q); return (sched_data) ?: ERR_PTR(-ENOMEM); } static inline void blk_mq_free_sched_data(struct elevator_type *e, void *data) { if (e && e->ops.free_sched_data) e->ops.free_sched_data(data); } static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) __blk_mq_sched_restart(hctx); } static inline bool bio_mergeable(struct bio *bio) { return !(bio->bi_opf & REQ_NOMERGE_FLAGS); } static inline bool blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, struct bio *bio) { if (rq->rq_flags & RQF_USE_SCHED) { struct elevator_queue *e = q->elevator; if (e->type->ops.allow_merge) return e->type->ops.allow_merge(q, rq, bio); } return true; } static inline void blk_mq_sched_completed_request(struct request *rq, u64 now) { if (rq->rq_flags & RQF_USE_SCHED) { struct elevator_queue *e = rq->q->elevator; if (e->type->ops.completed_request)
e->type->ops.completed_request(rq, now); } } static inline void blk_mq_sched_requeue_request(struct request *rq) { if (rq->rq_flags & RQF_USE_SCHED) { struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; if (e->type->ops.requeue_request) e->type->ops.requeue_request(rq); } } static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) { struct elevator_queue *e = hctx->queue->elevator; if (e && e->type->ops.has_work) return e->type->ops.has_work(hctx); return false; } static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) { return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } static inline void blk_mq_set_min_shallow_depth(struct request_queue *q, unsigned int depth) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags, depth); } static inline bool blk_mq_is_sync_read(blk_opf_t opf) { return op_is_sync(opf) && !op_is_write(opf); } #endif |
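/*
 * Illustrative aside, not part of the header above: blk_mq_alloc_sched_data()
 * distinguishes three outcomes - a valid pointer on success, NULL when the
 * elevator defines no ->alloc_sched_data() hook and so needs no private data,
 * and ERR_PTR(-ENOMEM) on allocation failure - so callers must test with
 * IS_ERR() rather than for NULL. setup_sched_data() below is a hypothetical
 * caller sketching that pattern (it assumes IS_ERR()/PTR_ERR() from
 * <linux/err.h>) and is not a function from this header.
 */
static int setup_sched_data(struct request_queue *q, struct elevator_type *e,
                            void **sched_data)
{
        void *data = blk_mq_alloc_sched_data(q, e);

        if (IS_ERR(data))
                return PTR_ERR(data);   /* allocation was attempted and failed */

        /* NULL is success here: the scheduler simply keeps no private data. */
        *sched_data = data;
        return 0;
}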
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_IP6_ROUTE_H #define _NET_IP6_ROUTE_H #include <net/addrconf.h> #include <net/flow.h> #include <net/ip6_fib.h> #include <net/sock.h> #include <net/lwtunnel.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/route.h> #include <net/nexthop.h> struct route_info { __u8 type; __u8 length; __u8 prefix_len; #if defined(__BIG_ENDIAN_BITFIELD) __u8 reserved_h:3, route_pref:2, reserved_l:3; #elif defined(__LITTLE_ENDIAN_BITFIELD) __u8 reserved_l:3, route_pref:2, reserved_h:3; #endif __be32 lifetime; __u8 prefix[]; /* 0,8 or 16 */ }; #define RT6_LOOKUP_F_IFACE 0x00000001 #define RT6_LOOKUP_F_REACHABLE 0x00000002 #define RT6_LOOKUP_F_HAS_SADDR 0x00000004 #define RT6_LOOKUP_F_SRCPREF_TMP 0x00000008 #define RT6_LOOKUP_F_SRCPREF_PUBLIC 0x00000010 #define RT6_LOOKUP_F_SRCPREF_COA 0x00000020 #define RT6_LOOKUP_F_IGNORE_LINKSTATE 0x00000040 #define RT6_LOOKUP_F_DST_NOREF 0x00000080 /* We do not (yet ?) support IPv6 jumbograms (RFC 2675) * Unlike IPv4, hdr->seg_len doesn't include the IPv6 header */ #define IP6_MAX_MTU (0xFFFF + sizeof(struct ipv6hdr)) /* * rt6_srcprefs2flags() and rt6_flags2srcprefs() translate * between IPV6_ADDR_PREFERENCES socket option values * IPV6_PREFER_SRC_TMP = 0x1 * IPV6_PREFER_SRC_PUBLIC = 0x2 * IPV6_PREFER_SRC_COA = 0x4 * and above RT6_LOOKUP_F_SRCPREF_xxx flags.
*/ static inline int rt6_srcprefs2flags(unsigned int srcprefs) { return (srcprefs & IPV6_PREFER_SRC_MASK) << 3; } static inline unsigned int rt6_flags2srcprefs(int flags) { return (flags >> 3) & IPV6_PREFER_SRC_MASK; } static inline bool rt6_need_strict(const struct in6_addr *daddr) { return ipv6_addr_type(daddr) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); } /* fib entries using a nexthop object can not be coalesced into * a multipath route */ static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i) { /* the RTF_ADDRCONF flag filters out RA's */ return !(f6i->fib6_flags & RTF_ADDRCONF) && !f6i->nh && f6i->fib6_nh->fib_nh_gw_family; } #if IS_ENABLED(CONFIG_IPV6) void ip6_route_input(struct sk_buff *skb); #else static inline void ip6_route_input(struct sk_buff *skb) { } #endif struct dst_entry *ip6_route_input_lookup(struct net *net, struct net_device *dev, struct flowi6 *fl6, const struct sk_buff *skb, int flags); struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags); static inline struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk, struct flowi6 *fl6) { return ip6_route_output_flags(net, sk, fl6, 0); } /* Only conditionally release dst if flags indicates * !RT6_LOOKUP_F_DST_NOREF or dst is in uncached_list. */ static inline void ip6_rt_put_flags(struct rt6_info *rt, int flags) { if (!(flags & RT6_LOOKUP_F_DST_NOREF) || !list_empty(&rt->dst.rt_uncached)) ip6_rt_put(rt); } struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags); struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int ifindex, struct flowi6 *fl6, const struct sk_buff *skb, int flags); void ip6_route_init_special_entries(void); int ip6_route_init(void); void ip6_route_cleanup(void); int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg); int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); int ip6_ins_rt(struct net *net, struct fib6_info *f6i); #if IS_ENABLED(CONFIG_IPV6) int ip6_del_rt(struct net *net, struct fib6_info *f6i, bool skip_notify); #else static inline int ip6_del_rt(struct net *net, struct fib6_info *f6i, bool skip_notify) { return -EAFNOSUPPORT; } #endif void rt6_flush_exceptions(struct fib6_info *f6i); void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args, unsigned long now); static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *f6i, const struct in6_addr *daddr, unsigned int prefs, int l3mdev_index, struct in6_addr *saddr) { struct net_device *l3mdev; struct net_device *dev; bool same_vrf; int err = 0; rcu_read_lock(); l3mdev = dev_get_by_index_rcu(net, l3mdev_index); if (!f6i || !f6i->fib6_prefsrc.plen || l3mdev) dev = f6i ? fib6_info_nh_dev(f6i) : NULL; same_vrf = !l3mdev || l3mdev_master_dev_rcu(dev) == l3mdev; if (f6i && f6i->fib6_prefsrc.plen && same_vrf) *saddr = f6i->fib6_prefsrc.addr; else err = ipv6_dev_get_saddr(net, same_vrf ? 
dev : l3mdev, daddr, prefs, saddr); rcu_read_unlock(); return err; } struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr, int oif, const struct sk_buff *skb, int flags); u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, const struct sk_buff *skb, struct flow_keys *hkeys); struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); void fib6_force_start_gc(struct net *net); struct fib6_info *addrconf_f6i_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, bool anycast, gfp_t gfp_flags, struct netlink_ext_ack *extack); struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, int flags); /* * support functions for ND * */ struct fib6_info *rt6_get_dflt_router(struct net *net, const struct in6_addr *addr, struct net_device *dev); struct fib6_info *rt6_add_dflt_router(struct net *net, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref, u32 defrtr_usr_metric, int lifetime); void rt6_purge_dflt_routers(struct net *net); int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, const struct in6_addr *gwaddr); void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif, u32 mark, kuid_t uid); void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu); void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, kuid_t uid); void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif); void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk); struct netlink_callback; struct rt6_rtnl_dump_arg { struct sk_buff *skb; struct netlink_callback *cb; struct net *net; struct fib_dump_filter filter; }; int rt6_dump_route(struct fib6_info *f6i, void *p_arg, unsigned int skip); void rt6_mtu_change(struct net_device *dev, unsigned int mtu); void rt6_remove_prefsrc(struct inet6_ifaddr *ifp); void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); void rt6_sync_up(struct net_device *dev, unsigned char nh_flags); void rt6_disable_ip(struct net_device *dev, unsigned long event); void rt6_sync_down_dev(struct net_device *dev, unsigned long event); void rt6_multipath_rebalance(struct fib6_info *f6i); void rt6_uncached_list_add(struct rt6_info *rt); void rt6_uncached_list_del(struct rt6_info *rt); static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) { const struct dst_entry *dst = skb_dst(skb); if (dst) return dst_rt6_info(dst); return NULL; } /* * Store a destination cache entry in a socket */ static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, bool daddr_set, bool saddr_set) { struct ipv6_pinfo *np = inet6_sk(sk); np->dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); sk_setup_caps(sk, dst); np->daddr_cache = daddr_set; #ifdef CONFIG_IPV6_SUBTREES np->saddr_cache = saddr_set; #endif } void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, const struct flowi6 *fl6); static inline bool ipv6_unicast_destination(const struct sk_buff *skb) { const struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); return rt->rt6i_flags & RTF_LOCAL; } static inline bool __ipv6_anycast_destination(const struct rt6key *rt6i_dst, u32 rt6i_flags, const struct in6_addr *daddr) { return rt6i_flags & RTF_ANYCAST || (rt6i_dst->plen < 127 && !(rt6i_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) && ipv6_addr_equal(&rt6i_dst->addr, daddr)); } static inline bool ipv6_anycast_destination(const struct dst_entry *dst, const struct in6_addr *daddr) { const struct rt6_info *rt = 
dst_rt6_info(dst); return __ipv6_anycast_destination(&rt->rt6i_dst, rt->rt6i_flags, daddr); } #if IS_ENABLED(CONFIG_IPV6) int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)); #else static inline int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { kfree_skb(skb); return -EAFNOSUPPORT; } #endif /* Variant of dst_mtu() for IPv6 users */ static inline u32 dst6_mtu(const struct dst_entry *dst) { return INDIRECT_CALL_1(dst->ops->mtu, ip6_mtu, dst); } static inline unsigned int ip6_skb_dst_mtu(const struct sk_buff *skb) { const struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? inet6_sk(skb->sk) : NULL; const struct dst_entry *dst = skb_dst(skb); unsigned int mtu; if (np && READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE) { mtu = READ_ONCE(dst_dev(dst)->mtu); mtu -= lwtunnel_headroom(dst->lwtstate, mtu); } else { mtu = dst_mtu(dst); } return mtu; } static inline bool ip6_sk_accept_pmtu(const struct sock *sk) { u8 pmtudisc = READ_ONCE(inet6_sk(sk)->pmtudisc); return pmtudisc != IPV6_PMTUDISC_INTERFACE && pmtudisc != IPV6_PMTUDISC_OMIT; } static inline bool ip6_sk_ignore_df(const struct sock *sk) { u8 pmtudisc = READ_ONCE(inet6_sk(sk)->pmtudisc); return pmtudisc < IPV6_PMTUDISC_DO || pmtudisc == IPV6_PMTUDISC_OMIT; } static inline const struct in6_addr *rt6_nexthop(const struct rt6_info *rt, const struct in6_addr *daddr) { if (rt->rt6i_flags & RTF_GATEWAY) return &rt->rt6i_gateway; else if (unlikely(rt->rt6i_flags & RTF_CACHE)) return &rt->rt6i_dst.addr; else return daddr; } static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b) { struct fib6_nh *nha, *nhb; if (a->nh || b->nh) return nexthop_cmp(a->nh, b->nh); nha = a->fib6_nh; nhb = b->fib6_nh; return nha->fib_nh_dev == nhb->fib_nh_dev && ipv6_addr_equal(&nha->fib_nh_gw6, &nhb->fib_nh_gw6) && !lwtunnel_cmp_encap(nha->fib_nh_lws, nhb->fib_nh_lws); } static inline unsigned int ip6_dst_mtu_maybe_forward(const struct dst_entry *dst, bool forwarding) { struct inet6_dev *idev; unsigned int mtu; if (!forwarding || dst_metric_locked(dst, RTAX_MTU)) { mtu = dst_metric_raw(dst, RTAX_MTU); if (mtu) goto out; } mtu = IPV6_MIN_MTU; rcu_read_lock(); idev = __in6_dev_get(dst_dev_rcu(dst)); if (idev) mtu = READ_ONCE(idev->cnf.mtu6); rcu_read_unlock(); out: return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } u32 ip6_mtu_from_fib6(const struct fib6_result *res, const struct in6_addr *daddr, const struct in6_addr *saddr); struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, struct net_device *dev, struct sk_buff *skb, const void *daddr); #endif |
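Editor's illustration (not part of ip6_route.h): the left-shift by three in rt6_srcprefs2flags() and the matching right-shift in rt6_flags2srcprefs(), defined earlier in this header, line the IPV6_PREFER_SRC_* socket-option bits (0x1, 0x2, 0x4, as listed in the comment above those helpers) up with the RT6_LOOKUP_F_SRCPREF_* lookup flags (0x08, 0x10, 0x20). A hedged compile-time check of that correspondence, assuming static_assert() from <linux/build_bug.h> and the IPV6_PREFER_SRC_* values from <uapi/linux/in6.h>, could look like this:

static_assert(IPV6_PREFER_SRC_TMP    << 3 == RT6_LOOKUP_F_SRCPREF_TMP);
static_assert(IPV6_PREFER_SRC_PUBLIC << 3 == RT6_LOOKUP_F_SRCPREF_PUBLIC);
static_assert(IPV6_PREFER_SRC_COA    << 3 == RT6_LOOKUP_F_SRCPREF_COA);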
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_KCOV_H #define _LINUX_KCOV_H #include <linux/sched.h> #include <uapi/linux/kcov.h> struct task_struct; #ifdef CONFIG_KCOV enum kcov_mode { /* Coverage collection is not enabled yet. */ KCOV_MODE_DISABLED = 0, /* KCOV was initialized, but tracing mode hasn't been chosen yet. */ KCOV_MODE_INIT = 1, /* * Tracing coverage collection mode. * Covered PCs are collected in a per-task buffer. */ KCOV_MODE_TRACE_PC = 2, /* Collecting comparison operands mode. */ KCOV_MODE_TRACE_CMP = 3, /* The process owns a KCOV remote reference. */ KCOV_MODE_REMOTE = 4, }; #define KCOV_IN_CTXSW (1 << 30) void kcov_task_init(struct task_struct *t); void kcov_task_exit(struct task_struct *t); #define kcov_prepare_switch(t) \ do { \ (t)->kcov_mode |= KCOV_IN_CTXSW; \ } while (0) #define kcov_finish_switch(t) \ do { \ (t)->kcov_mode &= ~KCOV_IN_CTXSW; \ } while (0) /* See Documentation/dev-tools/kcov.rst for usage details. */ void kcov_remote_start(u64 handle); void kcov_remote_stop(void); u64 kcov_common_handle(void); static inline void kcov_remote_start_common(u64 id) { kcov_remote_start(kcov_remote_handle(KCOV_SUBSYSTEM_COMMON, id)); } static inline void kcov_remote_start_usb(u64 id) { kcov_remote_start(kcov_remote_handle(KCOV_SUBSYSTEM_USB, id)); } /* * The softirq flavor of kcov_remote_*() functions is introduced as a temporary * work around for kcov's lack of nested remote coverage sections support in * task context. Adding support for nested sections is tracked in: * https://bugzilla.kernel.org/show_bug.cgi?id=210337 */ static inline void kcov_remote_start_usb_softirq(u64 id) { if (in_serving_softirq() && !in_hardirq()) kcov_remote_start_usb(id); } static inline void kcov_remote_stop_softirq(void) { if (in_serving_softirq() && !in_hardirq()) kcov_remote_stop(); } #ifdef CONFIG_64BIT typedef unsigned long kcov_u64; #else typedef unsigned long long kcov_u64; #endif void __sanitizer_cov_trace_pc(void); void __sanitizer_cov_trace_cmp1(u8 arg1, u8 arg2); void __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2); void __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2); void __sanitizer_cov_trace_cmp8(kcov_u64 arg1, kcov_u64 arg2); void __sanitizer_cov_trace_const_cmp1(u8 arg1, u8 arg2); void __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2); void __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2); void __sanitizer_cov_trace_const_cmp8(kcov_u64 arg1, kcov_u64 arg2); void __sanitizer_cov_trace_switch(kcov_u64 val, void *cases); #else static inline void kcov_task_init(struct task_struct *t) {} static inline void kcov_task_exit(struct task_struct *t) {} static inline void kcov_prepare_switch(struct task_struct *t) {} static inline void kcov_finish_switch(struct task_struct *t) {} static inline void kcov_remote_start(u64 handle) {} static inline void kcov_remote_stop(void) {} static inline u64 kcov_common_handle(void) { return 0; } static inline void kcov_remote_start_common(u64 id) {} static inline void kcov_remote_start_usb(u64 id) {} static inline void kcov_remote_start_usb_softirq(u64 id) {} static inline void kcov_remote_stop_softirq(void) {} #endif /* CONFIG_KCOV */ #endif /* _LINUX_KCOV_H */ |
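Editor's illustration (not part of kcov.h): the intended shape of a remote-coverage annotation in kernel code, modelled on the usage described in Documentation/dev-tools/kcov.rst, which the header points to. The worker function, the work item and the instance id are hypothetical; only kcov_remote_start_common() and kcov_remote_stop() come from the header above, and struct work_struct is assumed from <linux/workqueue.h>.

static void example_worker(struct work_struct *work)
{
	/*
	 * Bracket the whole work item so coverage collected in this kernel
	 * thread is attributed to the userspace process that registered a
	 * matching KCOV_SUBSYSTEM_COMMON handle via KCOV_REMOTE_ENABLE.
	 */
	kcov_remote_start_common(42);	/* 42: hypothetical instance id */

	/* ... handle the work item ... */

	kcov_remote_stop();
}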
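Editor's illustration: an abridged userspace sketch of the basic (non-remote) kcov flow that the header's KCOV_MODE_TRACE_PC mode serves, following the example in Documentation/dev-tools/kcov.rst. COVER_SIZE and the error handling are simplified; the ioctl names come from the installed <linux/kcov.h> uapi header.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kcov.h>

#define COVER_SIZE (64 << 10)	/* number of unsigned long entries */

int main(void)
{
	int fd = open("/sys/kernel/debug/kcov", O_RDWR);
	unsigned long *cover, n, i;

	if (fd < 0 || ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE))
		return 1;
	cover = mmap(NULL, COVER_SIZE * sizeof(unsigned long),
		     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (cover == MAP_FAILED || ioctl(fd, KCOV_ENABLE, KCOV_TRACE_PC))
		return 1;

	__atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED);	/* reset PC count */
	read(-1, NULL, 0);					/* syscall under test */
	n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED);	/* PCs recorded */

	for (i = 0; i < n; i++)
		printf("0x%lx\n", cover[i + 1]);

	ioctl(fd, KCOV_DISABLE, 0);
	return 0;
}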
// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * PF_INET protocol family socket handler. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Florian La Roche, <flla@stud.uni-sb.de> * Alan Cox, <A.Cox@swansea.ac.uk> * * Changes (see also sock.c) * * piggy, * Karl Knutson : Socket protocol table * A.N.Kuznetsov : Socket death error in accept(). * John Richardson : Fix non blocking error in connect() * so sockets that fail to connect * don't return -EINPROGRESS. * Alan Cox : Asynchronous I/O support * Alan Cox : Keep correct socket pointer on sock * structures * when accept() ed * Alan Cox : Semantics of SO_LINGER aren't state * moved to close when you look carefully. * With this fixed and the accept bug fixed * some RPC stuff seems happier. * Niibe Yutaka : 4.4BSD style write async I/O * Alan Cox, * Tony Gale : Fixed reuse semantics. * Alan Cox : bind() shouldn't abort existing but dead * sockets. Stops FTP netin:.. I hope. * Alan Cox : bind() works correctly for RAW sockets. * Note that FreeBSD at least was broken * in this respect so be careful with * compatibility tests... * Alan Cox : routing cache support * Alan Cox : memzero the socket structure for * compactness.
* Matt Day : nonblock connect error handler * Alan Cox : Allow large numbers of pending sockets * (eg for big web sites), but only if * specifically application requested. * Alan Cox : New buffering throughout IP. Used * dumbly. * Alan Cox : New buffering now used smartly. * Alan Cox : BSD rather than common sense * interpretation of listen. * Germano Caronni : Assorted small races. * Alan Cox : sendmsg/recvmsg basic support. * Alan Cox : Only sendmsg/recvmsg now supported. * Alan Cox : Locked down bind (see security list). * Alan Cox : Loosened bind a little. * Mike McLagan : ADD/DEL DLCI Ioctls * Willy Konynenberg : Transparent proxying support. * David S. Miller : New socket lookup architecture. * Some other random speedups. * Cyrus Durgin : Cleaned up file for kmod hacks. * Andi Kleen : Fix inet_stream_connect TCP race. */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/err.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/sched.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/capability.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/stat.h> #include <linux/init.h> #include <linux/poll.h> #include <linux/netfilter_ipv4.h> #include <linux/random.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/inet.h> #include <linux/igmp.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <net/checksum.h> #include <net/ip.h> #include <net/protocol.h> #include <net/arp.h> #include <net/route.h> #include <net/ip_fib.h> #include <net/inet_connection_sock.h> #include <net/gro.h> #include <net/gso.h> #include <net/tcp.h> #include <net/psp.h> #include <net/udp.h> #include <net/ping.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/raw.h> #include <net/icmp.h> #include <net/inet_common.h> #include <net/ip_tunnels.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/secure_seq.h> #ifdef CONFIG_IP_MROUTE #include <linux/mroute.h> #endif #include <net/l3mdev.h> #include <net/compat.h> #include <net/rps.h> #include <trace/events/sock.h> /* Keep the definition of IPv6 disable here for now, to avoid annoying linker * issues in case IPv6=m */ int disable_ipv6_mod; EXPORT_SYMBOL(disable_ipv6_mod); /* The inetsw table contains everything that inet_create needs to * build a new socket. */ static struct list_head inetsw[SOCK_MAX]; static DEFINE_SPINLOCK(inetsw_lock); /* New destruction routine */ void inet_sock_destruct(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); __skb_queue_purge(&sk->sk_receive_queue); __skb_queue_purge(&sk->sk_error_queue); sk_mem_reclaim_final(sk); if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) { pr_err("Attempt to release TCP socket in state %d %p\n", sk->sk_state, sk); return; } if (!sock_flag(sk, SOCK_DEAD)) { pr_err("Attempt to release alive inet socket %p\n", sk); return; } WARN_ON_ONCE(atomic_read(&sk->sk_rmem_alloc)); WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); WARN_ON_ONCE(sk->sk_wmem_queued); WARN_ON_ONCE(sk->sk_forward_alloc); kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1)); psp_sk_assoc_free(sk); } EXPORT_SYMBOL(inet_sock_destruct); /* * The routines beyond this point handle the behaviour of an AF_INET * socket object. 
Mostly it punts to the subprotocols of IP to do * the work. */ /* * Automatically bind an unbound socket. */ static int inet_autobind(struct sock *sk) { struct inet_sock *inet; /* We may need to bind the socket. */ lock_sock(sk); inet = inet_sk(sk); if (!inet->inet_num) { if (sk->sk_prot->get_port(sk, 0)) { release_sock(sk); return -EAGAIN; } inet->inet_sport = htons(inet->inet_num); } release_sock(sk); return 0; } int __inet_listen_sk(struct sock *sk, int backlog) { unsigned char old_state = sk->sk_state; int err, tcp_fastopen; if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) return -EINVAL; WRITE_ONCE(sk->sk_max_ack_backlog, backlog); /* Really, if the socket is already in listen state * we can only allow the backlog to be adjusted. */ if (old_state != TCP_LISTEN) { /* Enable TFO w/o requiring TCP_FASTOPEN socket option. * Note that only TCP sockets (SOCK_STREAM) will reach here. * Also fastopen backlog may already been set via the option * because the socket was in TCP_LISTEN state previously but * was shutdown() rather than close(). */ tcp_fastopen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen); if ((tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && (tcp_fastopen & TFO_SERVER_ENABLE) && !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { fastopen_queue_tune(sk, backlog); tcp_fastopen_init_key_once(sock_net(sk)); } err = inet_csk_listen_start(sk); if (err) return err; tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL); } return 0; } /* * Move a socket into listening state. */ int inet_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int err = -EINVAL; lock_sock(sk); if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) goto out; err = __inet_listen_sk(sk, backlog); out: release_sock(sk); return err; } EXPORT_SYMBOL(inet_listen); /* * Create an inet socket. */ static int inet_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct inet_protosw *answer; struct inet_sock *inet; struct proto *answer_prot; unsigned char answer_flags; int try_loading_module = 0; int err; if (protocol < 0 || protocol >= IPPROTO_MAX) return -EINVAL; sock->state = SS_UNCONNECTED; /* Look for the requested type/protocol pair. */ lookup_protocol: err = -ESOCKTNOSUPPORT; rcu_read_lock(); list_for_each_entry_rcu(answer, &inetsw[sock->type], list) { err = 0; /* Check the non-wild match. */ if (protocol == answer->protocol) { if (protocol != IPPROTO_IP) break; } else { /* Check for the two wild cases. */ if (IPPROTO_IP == protocol) { protocol = answer->protocol; break; } if (IPPROTO_IP == answer->protocol) break; } err = -EPROTONOSUPPORT; } if (unlikely(err)) { if (try_loading_module < 2) { rcu_read_unlock(); /* * Be more specific, e.g. net-pf-2-proto-132-type-1 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM) */ if (++try_loading_module == 1) request_module("net-pf-%d-proto-%d-type-%d", PF_INET, protocol, sock->type); /* * Fall back to generic, e.g. 
net-pf-2-proto-132 * (net-pf-PF_INET-proto-IPPROTO_SCTP) */ else request_module("net-pf-%d-proto-%d", PF_INET, protocol); goto lookup_protocol; } else goto out_rcu_unlock; } err = -EPERM; if (sock->type == SOCK_RAW && !kern && !ns_capable(net->user_ns, CAP_NET_RAW)) goto out_rcu_unlock; sock->ops = answer->ops; answer_prot = answer->prot; answer_flags = answer->flags; rcu_read_unlock(); WARN_ON(!answer_prot->slab); err = -ENOMEM; sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern); if (!sk) goto out; err = 0; if (INET_PROTOSW_REUSE & answer_flags) sk->sk_reuse = SK_CAN_REUSE; if (INET_PROTOSW_ICSK & answer_flags) inet_init_csk_locks(sk); inet = inet_sk(sk); inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags); inet_clear_bit(NODEFRAG, sk); if (SOCK_RAW == sock->type) { inet->inet_num = protocol; if (IPPROTO_RAW == protocol) inet_set_bit(HDRINCL, sk); } if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc)) inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT; atomic_set(&inet->inet_id, 0); sock_init_data(sock, sk); sk->sk_destruct = inet_sock_destruct; sk->sk_protocol = protocol; sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash); inet->uc_ttl = -1; inet_set_bit(MC_LOOP, sk); inet->mc_ttl = 1; inet_set_bit(MC_ALL, sk); inet->mc_index = 0; inet->mc_list = NULL; inet->rcv_tos = 0; if (inet->inet_num) { /* It assumes that any protocol which allows * the user to assign a number at socket * creation time automatically * shares. */ inet->inet_sport = htons(inet->inet_num); /* Add to protocol hash chains. */ err = sk->sk_prot->hash(sk); if (err) goto out_sk_release; } if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); if (err) goto out_sk_release; } if (!kern) { err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); if (err) goto out_sk_release; } out: return err; out_rcu_unlock: rcu_read_unlock(); goto out; out_sk_release: sk_common_release(sk); sock->sk = NULL; goto out; } /* * The peer socket should always be NULL (or else). When we call this * function we are destroying the object and from then on nobody * should refer to it. */ int inet_release(struct socket *sock) { struct sock *sk = sock->sk; if (sk) { long timeout; if (!sk->sk_kern_sock) BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk); /* Applications forget to leave groups before exiting */ ip_mc_drop_socket(sk); /* If linger is set, we don't return until the close * is complete. Otherwise we return immediately. The * actually closing is done the same either way. * * If the close is due to the process exiting, we never * linger.. */ timeout = 0; if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING)) timeout = sk->sk_lingertime; sk->sk_prot->close(sk, timeout); sock->sk = NULL; } return 0; } EXPORT_SYMBOL(inet_release); int inet_bind_sk(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { u32 flags = BIND_WITH_LOCK; int err; /* If the socket has its own bind function then use it. (RAW) */ if (sk->sk_prot->bind) { return sk->sk_prot->bind(sk, uaddr, addr_len); } if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. 
*/ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, &addr_len, CGROUP_INET4_BIND, &flags); if (err) return err; return __inet_bind(sk, uaddr, addr_len, flags); } int inet_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { return inet_bind_sk(sock->sk, uaddr, addr_len); } EXPORT_SYMBOL(inet_bind); int __inet_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len, u32 flags) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); unsigned short snum; int chk_addr_ret; u32 tb_id = RT_TABLE_LOCAL; int err; if (addr->sin_family != AF_INET) { /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET) * only if s_addr is INADDR_ANY. */ err = -EAFNOSUPPORT; if (addr->sin_family != AF_UNSPEC || addr->sin_addr.s_addr != htonl(INADDR_ANY)) goto out; } tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); /* Not specified by any standard per-se, however it breaks too * many applications when removed. It is unfortunate since * allowing applications to make a non-local bind solves * several problems with systems using dynamic addressing. * (ie. your servers still start up even if your ISDN link * is temporarily down) */ err = -EADDRNOTAVAIL; if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr, chk_addr_ret)) goto out; snum = ntohs(addr->sin_port); err = -EACCES; if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) && snum && inet_port_requires_bind_service(net, snum) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) goto out; /* We keep a pair of addresses. rcv_saddr is the one * used by hash lookups, and saddr is used for transmit. * * In the BSD API these are the same except where it * would be illegal to use them (multicast/broadcast) in * which case the sending device address is used. */ if (flags & BIND_WITH_LOCK) lock_sock(sk); /* Check these errors (active socket, double bind). */ err = -EINVAL; if (sk->sk_state != TCP_CLOSE || inet->inet_num) goto out_release_sock; inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) inet->inet_saddr = 0; /* Use device */ /* Make sure we are allowed to bind here. */ if (snum || !(inet_test_bit(BIND_ADDRESS_NO_PORT, sk) || (flags & BIND_FORCE_ADDRESS_NO_PORT))) { err = sk->sk_prot->get_port(sk, snum); if (err) { inet->inet_saddr = inet->inet_rcv_saddr = 0; goto out_release_sock; } if (!(flags & BIND_FROM_BPF)) { err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); if (err) { inet->inet_saddr = inet->inet_rcv_saddr = 0; if (sk->sk_prot->put_port) sk->sk_prot->put_port(sk); goto out_release_sock; } } } if (inet->inet_rcv_saddr) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; inet->inet_sport = htons(inet->inet_num); inet->inet_daddr = 0; inet->inet_dport = 0; sk_dst_reset(sk); err = 0; out_release_sock: if (flags & BIND_WITH_LOCK) release_sock(sk); out: return err; } int inet_dgram_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; const struct proto *prot; int err; if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; /* IPV6_ADDRFORM can change sk->sk_prot under us. 
*/ prot = READ_ONCE(sk->sk_prot); if (uaddr->sa_family == AF_UNSPEC) return prot->disconnect(sk, flags); if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { err = prot->pre_connect(sk, uaddr, addr_len); if (err) return err; } if (data_race(!inet_sk(sk)->inet_num) && inet_autobind(sk)) return -EAGAIN; return prot->connect(sk, uaddr, addr_len); } EXPORT_SYMBOL(inet_dgram_connect); static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) { DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending += writebias; /* Basic assumption: if someone sets sk->sk_err, he _must_ * change state of the socket from TCP_SYN_*. * Connect() does not allow to get error notifications * without closing the socket. */ while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { release_sock(sk); timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); lock_sock(sk); if (signal_pending(current) || !timeo) break; } remove_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending -= writebias; return timeo; } /* * Connect to a remote host. There is regrettably still a little * TCP 'magic' in here. */ int __inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags, int is_sendmsg) { struct sock *sk = sock->sk; int err; long timeo; /* * uaddr can be NULL and addr_len can be 0 if: * sk is a TCP fastopen active socket and * TCP_FASTOPEN_CONNECT sockopt is set and * we already have a valid cookie for this socket. * In this case, user can call write() after connect(). * write() will invoke tcp_sendmsg_fastopen() which calls * __inet_stream_connect(). */ if (uaddr) { if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; if (uaddr->sa_family == AF_UNSPEC) { sk->sk_disconnects++; err = sk->sk_prot->disconnect(sk, flags); sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; goto out; } } switch (sock->state) { default: err = -EINVAL; goto out; case SS_CONNECTED: err = -EISCONN; goto out; case SS_CONNECTING: if (inet_test_bit(DEFER_CONNECT, sk)) err = is_sendmsg ? -EINPROGRESS : -EISCONN; else err = -EALREADY; /* Fall out of switch with err, set for this state */ break; case SS_UNCONNECTED: err = -EISCONN; if (sk->sk_state != TCP_CLOSE) goto out; if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); if (err) goto out; } err = sk->sk_prot->connect(sk, uaddr, addr_len); if (err < 0) goto out; sock->state = SS_CONNECTING; if (!err && inet_test_bit(DEFER_CONNECT, sk)) goto out; /* Just entered SS_CONNECTING state; the only * difference is that return value in non-blocking * case is EINPROGRESS, rather than EALREADY. */ err = -EINPROGRESS; break; } timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { int writebias = (sk->sk_protocol == IPPROTO_TCP) && tcp_sk(sk)->fastopen_req && tcp_sk(sk)->fastopen_req->data ? 1 : 0; int dis = sk->sk_disconnects; /* Error code is set above */ if (!timeo || !inet_wait_for_connect(sk, timeo, writebias)) goto out; err = sock_intr_errno(timeo); if (signal_pending(current)) goto out; if (dis != sk->sk_disconnects) { err = -EPIPE; goto out; } } /* Connection was closed by RST, timeout, ICMP error * or another process disconnected us. */ if (sk->sk_state == TCP_CLOSE) goto sock_error; /* sk->sk_err may be not zero now, if RECVERR was ordered by user * and error was received after socket entered established state. * Hence, it is handled normally after connect() return successfully. 
*/ sock->state = SS_CONNECTED; err = 0; out: return err; sock_error: err = sock_error(sk) ? : -ECONNABORTED; sock->state = SS_UNCONNECTED; sk->sk_disconnects++; if (sk->sk_prot->disconnect(sk, flags)) sock->state = SS_DISCONNECTING; goto out; } EXPORT_SYMBOL(__inet_stream_connect); int inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { int err; lock_sock(sock->sk); err = __inet_stream_connect(sock, uaddr, addr_len, flags, 0); release_sock(sock->sk); return err; } EXPORT_SYMBOL(inet_stream_connect); void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk) { if (mem_cgroup_sockets_enabled) { mem_cgroup_sk_alloc(newsk); __sk_charge(newsk, GFP_KERNEL); } sock_rps_record_flow(newsk); WARN_ON(!((1 << newsk->sk_state) & (TCPF_ESTABLISHED | TCPF_SYN_RECV | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING | TCPF_CLOSE_WAIT | TCPF_CLOSE))); if (test_bit(SOCK_SUPPORT_ZC, &sock->flags)) set_bit(SOCK_SUPPORT_ZC, &newsock->flags); sock_graft(newsk, newsock); newsock->state = SS_CONNECTED; } EXPORT_SYMBOL_GPL(__inet_accept); /* * Accept a pending connection. The TCP layer now gives BSD semantics. */ int inet_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct sock *sk1 = sock->sk, *sk2; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ arg->err = -EINVAL; sk2 = READ_ONCE(sk1->sk_prot)->accept(sk1, arg); if (!sk2) return arg->err; lock_sock(sk2); __inet_accept(sock, newsock, sk2); release_sock(sk2); return 0; } EXPORT_SYMBOL(inet_accept); /* * This does both peername and sockname. */ int inet_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr); int sin_addr_len = sizeof(*sin); sin->sin_family = AF_INET; lock_sock(sk); if (peer) { if (!inet->inet_dport || (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && peer == 1)) { release_sock(sk); return -ENOTCONN; } sin->sin_port = inet->inet_dport; sin->sin_addr.s_addr = inet->inet_daddr; BPF_CGROUP_RUN_SA_PROG(sk, sin, &sin_addr_len, CGROUP_INET4_GETPEERNAME); } else { __be32 addr = inet->inet_rcv_saddr; if (!addr) addr = inet->inet_saddr; sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; BPF_CGROUP_RUN_SA_PROG(sk, sin, &sin_addr_len, CGROUP_INET4_GETSOCKNAME); } release_sock(sk); memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return sin_addr_len; } EXPORT_SYMBOL(inet_getname); int inet_send_prepare(struct sock *sk) { sock_rps_record_flow(sk); /* We may need to bind the socket. */ if (data_race(!inet_sk(sk)->inet_num) && !sk->sk_prot->no_autobind && inet_autobind(sk)) return -EAGAIN; return 0; } EXPORT_SYMBOL_GPL(inet_send_prepare); int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; const struct proto *prot; if (unlikely(inet_send_prepare(sk))) return -EAGAIN; prot = READ_ONCE(sk->sk_prot); return INDIRECT_CALL_2(prot->sendmsg, tcp_sendmsg, udp_sendmsg, sk, msg, size); } EXPORT_SYMBOL(inet_sendmsg); void inet_splice_eof(struct socket *sock) { const struct proto *prot; struct sock *sk = sock->sk; if (unlikely(inet_send_prepare(sk))) return; /* IPV6_ADDRFORM can change sk->sk_prot under us. 
*/ prot = READ_ONCE(sk->sk_prot); if (prot->splice_eof) prot->splice_eof(sock); } EXPORT_SYMBOL_GPL(inet_splice_eof); int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; const struct proto *prot; if (likely(!(flags & MSG_ERRQUEUE))) sock_rps_record_flow(sk); prot = READ_ONCE(sk->sk_prot); return INDIRECT_CALL_2(prot->recvmsg, tcp_recvmsg, udp_recvmsg, sk, msg, size, flags); } EXPORT_SYMBOL(inet_recvmsg); int inet_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; int err = 0; /* This should really check to make sure * the socket is a TCP socket. (WHY AC...) */ how++; /* maps 0->1 has the advantage of making bit 1 rcvs and 1->2 bit 2 snds. 2->3 */ if ((how & ~SHUTDOWN_MASK) || !how) /* MAXINT->0 */ return -EINVAL; lock_sock(sk); if (sock->state == SS_CONNECTING) { if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)) sock->state = SS_DISCONNECTING; else sock->state = SS_CONNECTED; } switch (sk->sk_state) { case TCP_CLOSE: err = -ENOTCONN; /* Hack to wake up other listeners, who can poll for EPOLLHUP, even on eg. unconnected UDP sockets -- RR */ fallthrough; default: WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | how); if (sk->sk_prot->shutdown) sk->sk_prot->shutdown(sk, how); break; /* Remaining two branches are temporary solution for missing * close() in multithreaded environment. It is _not_ a good idea, * but we have no choice until close() is repaired at VFS level. */ case TCP_LISTEN: if (!(how & RCV_SHUTDOWN)) break; fallthrough; case TCP_SYN_SENT: err = sk->sk_prot->disconnect(sk, O_NONBLOCK); sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; break; } /* Wake up anyone sleeping in poll. */ sk->sk_state_change(sk); release_sock(sk); return err; } EXPORT_SYMBOL(inet_shutdown); /* * ioctl() calls you can issue on an INET socket. Most of these are * device configuration and stuff and very rarely used. Some ioctls * pass on to the socket itself. * * NOTE: I like the idea of a module for the config stuff. ie ifconfig * loads the devconfigure module does its configuring and unloads it. * There's a good 20K of config code hanging around the kernel. 
*/ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; int err = 0; struct net *net = sock_net(sk); void __user *p = (void __user *)arg; struct ifreq ifr; struct rtentry rt; switch (cmd) { case SIOCADDRT: case SIOCDELRT: if (copy_from_user(&rt, p, sizeof(struct rtentry))) return -EFAULT; err = ip_rt_ioctl(net, cmd, &rt); break; case SIOCRTMSG: err = -EINVAL; break; case SIOCDARP: case SIOCGARP: case SIOCSARP: err = arp_ioctl(net, cmd, (void __user *)arg); break; case SIOCGIFADDR: case SIOCGIFBRDADDR: case SIOCGIFNETMASK: case SIOCGIFDSTADDR: case SIOCGIFPFLAGS: if (get_user_ifreq(&ifr, NULL, p)) return -EFAULT; err = devinet_ioctl(net, cmd, &ifr); if (!err && put_user_ifreq(&ifr, p)) err = -EFAULT; break; case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFNETMASK: case SIOCSIFDSTADDR: case SIOCSIFPFLAGS: case SIOCSIFFLAGS: if (get_user_ifreq(&ifr, NULL, p)) return -EFAULT; err = devinet_ioctl(net, cmd, &ifr); break; default: if (sk->sk_prot->ioctl) err = sk_ioctl(sk, cmd, (void __user *)arg); else err = -ENOIOCTLCMD; break; } return err; } EXPORT_SYMBOL(inet_ioctl); #ifdef CONFIG_COMPAT static int inet_compat_routing_ioctl(struct sock *sk, unsigned int cmd, struct compat_rtentry __user *ur) { compat_uptr_t rtdev; struct rtentry rt; if (copy_from_user(&rt.rt_dst, &ur->rt_dst, 3 * sizeof(struct sockaddr)) || get_user(rt.rt_flags, &ur->rt_flags) || get_user(rt.rt_metric, &ur->rt_metric) || get_user(rt.rt_mtu, &ur->rt_mtu) || get_user(rt.rt_window, &ur->rt_window) || get_user(rt.rt_irtt, &ur->rt_irtt) || get_user(rtdev, &ur->rt_dev)) return -EFAULT; rt.rt_dev = compat_ptr(rtdev); return ip_rt_ioctl(sock_net(sk), cmd, &rt); } static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = compat_ptr(arg); struct sock *sk = sock->sk; switch (cmd) { case SIOCADDRT: case SIOCDELRT: return inet_compat_routing_ioctl(sk, cmd, argp); default: if (!sk->sk_prot->compat_ioctl) return -ENOIOCTLCMD; return sk->sk_prot->compat_ioctl(sk, cmd, arg); } } #endif /* CONFIG_COMPAT */ const struct proto_ops inet_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, .connect = inet_stream_connect, .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet_getname, .poll = tcp_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, .listen = inet_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, #ifdef CONFIG_MMU .mmap = tcp_mmap, #endif .splice_eof = inet_splice_eof, .splice_read = tcp_splice_read, .set_peek_off = sk_set_peek_off, .read_sock = tcp_read_sock, .read_skb = tcp_read_skb, .sendmsg_locked = tcp_sendmsg_locked, .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT .compat_ioctl = inet_compat_ioctl, #endif .set_rcvlowat = tcp_set_rcvlowat, .set_rcvbuf = tcp_set_rcvbuf, }; EXPORT_SYMBOL(inet_stream_ops); const struct proto_ops inet_dgram_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, .connect = inet_dgram_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = inet_getname, .poll = udp_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .read_skb = udp_read_skb, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .splice_eof = 
inet_splice_eof, .set_peek_off = udp_set_peek_off, #ifdef CONFIG_COMPAT .compat_ioctl = inet_compat_ioctl, #endif }; EXPORT_SYMBOL(inet_dgram_ops); /* * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without * udp_poll */ static const struct proto_ops inet_sockraw_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, .connect = inet_dgram_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = inet_getname, .poll = datagram_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .splice_eof = inet_splice_eof, #ifdef CONFIG_COMPAT .compat_ioctl = inet_compat_ioctl, #endif }; static const struct net_proto_family inet_family_ops = { .family = PF_INET, .create = inet_create, .owner = THIS_MODULE, }; /* Upon startup we insert all the elements in inetsw_array[] into * the linked list inetsw. */ static struct inet_protosw inetsw_array[] = { { .type = SOCK_STREAM, .protocol = IPPROTO_TCP, .prot = &tcp_prot, .ops = &inet_stream_ops, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, }, { .type = SOCK_DGRAM, .protocol = IPPROTO_UDP, .prot = &udp_prot, .ops = &inet_dgram_ops, .flags = INET_PROTOSW_PERMANENT, }, { .type = SOCK_DGRAM, .protocol = IPPROTO_ICMP, .prot = &ping_prot, .ops = &inet_sockraw_ops, .flags = INET_PROTOSW_REUSE, }, { .type = SOCK_RAW, .protocol = IPPROTO_IP, /* wild card */ .prot = &raw_prot, .ops = &inet_sockraw_ops, .flags = INET_PROTOSW_REUSE, } }; #define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array) void inet_register_protosw(struct inet_protosw *p) { struct list_head *lh; struct inet_protosw *answer; int protocol = p->protocol; struct list_head *last_perm; spin_lock_bh(&inetsw_lock); if (p->type >= SOCK_MAX) goto out_illegal; /* If we are trying to override a permanent protocol, bail. */ last_perm = &inetsw[p->type]; list_for_each(lh, &inetsw[p->type]) { answer = list_entry(lh, struct inet_protosw, list); /* Check only the non-wild match. */ if ((INET_PROTOSW_PERMANENT & answer->flags) == 0) break; if (protocol == answer->protocol) goto out_permanent; last_perm = lh; } /* Add the new entry after the last permanent entry if any, so that * the new entry does not override a permanent entry when matched with * a wild-card protocol. But it is allowed to override any existing * non-permanent entry. This means that when we remove this entry, the * system automatically returns to the old behavior. 
*/ list_add_rcu(&p->list, last_perm); out: spin_unlock_bh(&inetsw_lock); return; out_permanent: pr_err("Attempt to override permanent protocol %d\n", protocol); goto out; out_illegal: pr_err("Ignoring attempt to register invalid socket type %d\n", p->type); goto out; } EXPORT_SYMBOL(inet_register_protosw); void inet_unregister_protosw(struct inet_protosw *p) { if (INET_PROTOSW_PERMANENT & p->flags) { pr_err("Attempt to unregister permanent protocol %d\n", p->protocol); } else { spin_lock_bh(&inetsw_lock); list_del_rcu(&p->list); spin_unlock_bh(&inetsw_lock); synchronize_net(); } } EXPORT_SYMBOL(inet_unregister_protosw); static int inet_sk_reselect_saddr(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); __be32 old_saddr = inet->inet_saddr; __be32 daddr = inet->inet_daddr; struct flowi4 *fl4; struct rtable *rt; __be32 new_saddr; struct ip_options_rcu *inet_opt; int err; inet_opt = rcu_dereference_protected(inet->inet_opt, lockdep_sock_is_held(sk)); if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; /* Query new route. */ fl4 = &inet->cork.fl.u.ip4; rt = ip_route_connect(fl4, daddr, 0, sk->sk_bound_dev_if, sk->sk_protocol, inet->inet_sport, inet->inet_dport, sk); if (IS_ERR(rt)) return PTR_ERR(rt); new_saddr = fl4->saddr; if (new_saddr == old_saddr) { sk_setup_caps(sk, &rt->dst); return 0; } err = inet_bhash2_update_saddr(sk, &new_saddr, AF_INET); if (err) { ip_rt_put(rt); return err; } sk_setup_caps(sk, &rt->dst); if (READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) > 1) { pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n", __func__, &old_saddr, &new_saddr); } /* * XXX The only one ugly spot where we need to * XXX really change the sockets identity after * XXX it has entered the hashes. -DaveM * * Besides that, it does not check for connection * uniqueness. Wait for troubles. */ return __sk_prot_rehash(sk); } int inet_sk_rebuild_header(struct sock *sk) { struct rtable *rt = dst_rtable(__sk_dst_check(sk, 0)); struct inet_sock *inet = inet_sk(sk); struct flowi4 *fl4; int err; /* Route is OK, nothing to do. */ if (rt) return 0; /* Reroute. */ fl4 = &inet->cork.fl.u.ip4; inet_sk_init_flowi4(inet, fl4); rt = ip_route_output_flow(sock_net(sk), fl4, sk); if (!IS_ERR(rt)) { err = 0; sk_setup_caps(sk, &rt->dst); } else { err = PTR_ERR(rt); /* Routing failed... 
*/ sk->sk_route_caps = 0; if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) || sk->sk_state != TCP_SYN_SENT || (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || (err = inet_sk_reselect_saddr(sk)) != 0) WRITE_ONCE(sk->sk_err_soft, -err); } return err; } EXPORT_SYMBOL(inet_sk_rebuild_header); void inet_sk_set_state(struct sock *sk, int state) { trace_inet_sock_set_state(sk, sk->sk_state, state); sk->sk_state = state; } EXPORT_SYMBOL(inet_sk_set_state); void inet_sk_state_store(struct sock *sk, int newstate) { trace_inet_sock_set_state(sk, sk->sk_state, newstate); smp_store_release(&sk->sk_state, newstate); } struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features) { bool udpfrag = false, fixedid = false, gso_partial, encap; struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; unsigned int offset = 0; struct iphdr *iph; int proto, tot_len; int nhoff; int ihl; int id; skb_reset_network_header(skb); nhoff = skb_network_header(skb) - skb_mac_header(skb); if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) goto out; iph = ip_hdr(skb); ihl = iph->ihl * 4; if (ihl < sizeof(*iph)) goto out; id = ntohs(iph->id); proto = iph->protocol; /* Warning: after this point, iph might be no longer valid */ if (unlikely(!pskb_may_pull(skb, ihl))) goto out; __skb_pull(skb, ihl); encap = SKB_GSO_CB(skb)->encap_level > 0; if (encap) features &= skb->dev->hw_enc_features; SKB_GSO_CB(skb)->encap_level += ihl; skb_reset_transport_header(skb); segs = ERR_PTR(-EPROTONOSUPPORT); fixedid = !!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCP_FIXEDID << encap)); if (!skb->encapsulation || encap) udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); ops = rcu_dereference(inet_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) { segs = ops->callbacks.gso_segment(skb, features); if (!segs) skb->network_header = skb_mac_header(skb) + nhoff - skb->head; } if (IS_ERR_OR_NULL(segs)) goto out; gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); skb = segs; do { iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); if (udpfrag) { iph->frag_off = htons(offset >> 3); if (skb->next) iph->frag_off |= htons(IP_MF); offset += skb->len - nhoff - ihl; tot_len = skb->len - nhoff; } else if (skb_is_gso(skb)) { if (!fixedid) { iph->id = htons(id); id += skb_shinfo(skb)->gso_segs; } if (gso_partial) tot_len = skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)iph; else tot_len = skb->len - nhoff; } else { if (!fixedid) iph->id = htons(id++); tot_len = skb->len - nhoff; } iph->tot_len = htons(tot_len); ip_send_check(iph); if (encap) skb_reset_inner_headers(skb); skb->network_header = (u8 *)iph - skb->head; skb_reset_mac_len(skb); } while ((skb = skb->next)); out: return segs; } static struct sk_buff *ipip_gso_segment(struct sk_buff *skb, netdev_features_t features) { if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP4)) return ERR_PTR(-EINVAL); return inet_gso_segment(skb, features); } struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) { const struct net_offload *ops; struct sk_buff *pp = NULL; const struct iphdr *iph; struct sk_buff *p; unsigned int hlen; unsigned int off; int flush = 1; int proto; off = skb_gro_offset(skb); hlen = off + sizeof(*iph); iph = skb_gro_header(skb, hlen, off); if (unlikely(!iph)) goto out; proto = iph->protocol; ops = rcu_dereference(inet_offloads[proto]); if (!ops || !ops->callbacks.gro_receive) goto out; if (*(u8 *)iph != 0x45) goto out; if (ip_is_fragment(iph)) goto out; if 
(unlikely(ip_fast_csum((u8 *)iph, 5))) goto out; NAPI_GRO_CB(skb)->proto = proto; flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (ntohl(*(__be32 *)&iph->id) & ~IP_DF)); list_for_each_entry(p, head, list) { struct iphdr *iph2; if (!NAPI_GRO_CB(p)->same_flow) continue; iph2 = (struct iphdr *)(p->data + off); /* The above works because, with the exception of the top * (inner most) layer, we only aggregate pkts with the same * hdr length so all the hdrs we'll need to verify will start * at the same offset. */ if ((iph->protocol ^ iph2->protocol) | ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) | ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } } NAPI_GRO_CB(skb)->flush |= flush; NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = off; /* Note : No need to call skb_gro_postpull_rcsum() here, * as we already checked checksum over ipv4 header was 0 */ skb_gro_pull(skb, sizeof(*iph)); skb_set_transport_header(skb, skb_gro_offset(skb)); pp = indirect_call_gro_receive(tcp4_gro_receive, udp4_gro_receive, ops->callbacks.gro_receive, head, skb); out: skb_gro_flush_final(skb, pp, flush); return pp; } static struct sk_buff *ipip_gro_receive(struct list_head *head, struct sk_buff *skb) { if (NAPI_GRO_CB(skb)->encap_mark) { NAPI_GRO_CB(skb)->flush = 1; return NULL; } NAPI_GRO_CB(skb)->encap_mark = 1; return inet_gro_receive(head, skb); } #define SECONDS_PER_DAY 86400 /* inet_current_timestamp - Return IP network timestamp * * Return milliseconds since midnight in network byte order. */ __be32 inet_current_timestamp(void) { u32 secs; u32 msecs; struct timespec64 ts; ktime_get_real_ts64(&ts); /* Get secs since midnight. */ (void)div_u64_rem(ts.tv_sec, SECONDS_PER_DAY, &secs); /* Convert to msecs. */ msecs = secs * MSEC_PER_SEC; /* Convert nsec to msec. */ msecs += (u32)ts.tv_nsec / NSEC_PER_MSEC; /* Convert to network byte order. */ return htonl(msecs); } EXPORT_SYMBOL(inet_current_timestamp); int inet_recv_error(struct sock *sk, struct msghdr *msg, int len) { unsigned int family = READ_ONCE(sk->sk_family); if (family == AF_INET) return ip_recv_error(sk, msg, len); #if IS_ENABLED(CONFIG_IPV6) if (family == AF_INET6) return pingv6_ops.ipv6_recv_error(sk, msg, len); #endif return -EINVAL; } EXPORT_SYMBOL(inet_recv_error); int inet_gro_complete(struct sk_buff *skb, int nhoff) { struct iphdr *iph = (struct iphdr *)(skb->data + nhoff); const struct net_offload *ops; __be16 totlen = iph->tot_len; int proto = iph->protocol; int err = -ENOSYS; if (skb->encapsulation) { skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IP)); skb_set_inner_network_header(skb, nhoff); } iph_set_totlen(iph, skb->len - nhoff); csum_replace2(&iph->check, totlen, iph->tot_len); ops = rcu_dereference(inet_offloads[proto]); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out; /* Only need to add sizeof(*iph) to get to the next hdr below * because any hdr with option will have been flushed in * inet_gro_receive(). 
*/ err = INDIRECT_CALL_2(ops->callbacks.gro_complete, tcp4_gro_complete, udp4_gro_complete, skb, nhoff + sizeof(*iph)); out: return err; } static int ipip_gro_complete(struct sk_buff *skb, int nhoff) { skb->encapsulation = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4; return inet_gro_complete(skb, nhoff); } int inet_ctl_sock_create(struct sock **sk, unsigned short family, unsigned short type, unsigned char protocol, struct net *net) { struct socket *sock; int rc = sock_create_kern(net, family, type, protocol, &sock); if (rc == 0) { *sk = sock->sk; (*sk)->sk_allocation = GFP_ATOMIC; (*sk)->sk_use_task_frag = false; /* * Unhash it so that IP input processing does not even see it, * we do not wish this socket to see incoming packets. */ (*sk)->sk_prot->unhash(*sk); } return rc; } EXPORT_SYMBOL_GPL(inet_ctl_sock_create); unsigned long snmp_fold_field(void __percpu *mib, int offt) { unsigned long res = 0; int i; for_each_possible_cpu(i) res += snmp_get_cpu_field(mib, i, offt); return res; } EXPORT_SYMBOL_GPL(snmp_fold_field); #if BITS_PER_LONG==32 u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offt, size_t syncp_offset) { void *bhptr; struct u64_stats_sync *syncp; u64 v; unsigned int start; bhptr = per_cpu_ptr(mib, cpu); syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); do { start = u64_stats_fetch_begin(syncp); v = *(((u64 *)bhptr) + offt); } while (u64_stats_fetch_retry(syncp, start)); return v; } EXPORT_SYMBOL_GPL(snmp_get_cpu_field64); u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset) { u64 res = 0; int cpu; for_each_possible_cpu(cpu) { res += snmp_get_cpu_field64(mib, cpu, offt, syncp_offset); } return res; } EXPORT_SYMBOL_GPL(snmp_fold_field64); #endif #ifdef CONFIG_IP_MULTICAST static const struct net_protocol igmp_protocol = { .handler = igmp_rcv, }; #endif static const struct net_protocol icmp_protocol = { .handler = icmp_rcv, .err_handler = icmp_err, .no_policy = 1, }; static __net_init int ipv4_mib_init_net(struct net *net) { int i; net->mib.tcp_statistics = alloc_percpu(struct tcp_mib); if (!net->mib.tcp_statistics) goto err_tcp_mib; net->mib.ip_statistics = alloc_percpu(struct ipstats_mib); if (!net->mib.ip_statistics) goto err_ip_mib; for_each_possible_cpu(i) { struct ipstats_mib *af_inet_stats; af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i); u64_stats_init(&af_inet_stats->syncp); } net->mib.net_statistics = alloc_percpu(struct linux_mib); if (!net->mib.net_statistics) goto err_net_mib; net->mib.udp_statistics = alloc_percpu(struct udp_mib); if (!net->mib.udp_statistics) goto err_udp_mib; net->mib.icmp_statistics = alloc_percpu(struct icmp_mib); if (!net->mib.icmp_statistics) goto err_icmp_mib; net->mib.icmpmsg_statistics = kzalloc_obj(struct icmpmsg_mib); if (!net->mib.icmpmsg_statistics) goto err_icmpmsg_mib; tcp_mib_init(net); return 0; err_icmpmsg_mib: free_percpu(net->mib.icmp_statistics); err_icmp_mib: free_percpu(net->mib.udp_statistics); err_udp_mib: free_percpu(net->mib.net_statistics); err_net_mib: free_percpu(net->mib.ip_statistics); err_ip_mib: free_percpu(net->mib.tcp_statistics); err_tcp_mib: return -ENOMEM; } static __net_exit void ipv4_mib_exit_net(struct net *net) { kfree(net->mib.icmpmsg_statistics); free_percpu(net->mib.icmp_statistics); free_percpu(net->mib.udp_statistics); free_percpu(net->mib.net_statistics); free_percpu(net->mib.ip_statistics); free_percpu(net->mib.tcp_statistics); #ifdef CONFIG_MPTCP /* allocated on demand, see mptcp_init_sock() */ free_percpu(net->mib.mptcp_statistics); #endif } static 
__net_initdata struct pernet_operations ipv4_mib_ops = { .init = ipv4_mib_init_net, .exit = ipv4_mib_exit_net, }; static int __init init_ipv4_mibs(void) { return register_pernet_subsys(&ipv4_mib_ops); } static __net_init int inet_init_net(struct net *net) { /* * Set defaults for local port range */ net->ipv4.ip_local_ports.range = 60999u << 16 | 32768u; seqlock_init(&net->ipv4.ping_group_range.lock); /* * Sane defaults - nobody may create ping sockets. * Boot scripts should set this to distro-specific group. */ net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1); net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0); /* Default values for sysctl-controlled parameters. * We set them here, in case sysctl is not compiled. */ net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; net->ipv4.sysctl_ip_fwd_update_priority = 1; net->ipv4.sysctl_ip_dynaddr = 0; net->ipv4.sysctl_ip_early_demux = 1; net->ipv4.sysctl_udp_early_demux = 1; net->ipv4.sysctl_tcp_early_demux = 1; net->ipv4.sysctl_nexthop_compat_mode = 1; #ifdef CONFIG_SYSCTL net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; #endif /* Some igmp sysctl, whose values are always used */ net->ipv4.sysctl_igmp_max_memberships = 20; net->ipv4.sysctl_igmp_max_msf = 10; /* IGMP reports for link-local multicast groups are enabled by default */ net->ipv4.sysctl_igmp_llm_reports = 1; net->ipv4.sysctl_igmp_qrv = 2; net->ipv4.sysctl_fib_notify_on_flag_change = 0; return 0; } static __net_initdata struct pernet_operations af_inet_ops = { .init = inet_init_net, }; static int __init init_inet_pernet_ops(void) { return register_pernet_subsys(&af_inet_ops); } static int ipv4_proc_init(void); /* * IP protocol layer initialiser */ static const struct net_offload ipip_offload = { .callbacks = { .gso_segment = ipip_gso_segment, .gro_receive = ipip_gro_receive, .gro_complete = ipip_gro_complete, }, }; static int __init ipip_offload_init(void) { return inet_add_offload(&ipip_offload, IPPROTO_IPIP); } static int __init ipv4_offload_init(void) { /* * Add offloads */ if (udpv4_offload_init() < 0) pr_crit("%s: Cannot add UDP protocol offload\n", __func__); if (tcpv4_offload_init() < 0) pr_crit("%s: Cannot add TCP protocol offload\n", __func__); if (ipip_offload_init() < 0) pr_crit("%s: Cannot add IPIP protocol offload\n", __func__); net_hotdata.ip_packet_offload = (struct packet_offload) { .type = cpu_to_be16(ETH_P_IP), .callbacks = { .gso_segment = inet_gso_segment, .gro_receive = inet_gro_receive, .gro_complete = inet_gro_complete, }, }; dev_add_offload(&net_hotdata.ip_packet_offload); return 0; } fs_initcall(ipv4_offload_init); static struct packet_type ip_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IP), .func = ip_rcv, .list_func = ip_list_rcv, }; static int __init inet_init(void) { struct inet_protosw *q; struct list_head *r; int rc; sock_skb_cb_check_size(sizeof(struct inet_skb_parm)); raw_hashinfo_init(&raw_v4_hashinfo); rc = proto_register(&tcp_prot, 1); if (rc) goto out; rc = proto_register(&udp_prot, 1); if (rc) goto out_unregister_tcp_proto; rc = proto_register(&raw_prot, 1); if (rc) goto out_unregister_udp_proto; rc = proto_register(&ping_prot, 1); if (rc) goto out_unregister_raw_proto; /* * Tell SOCKET that we are alive... */ (void)sock_register(&inet_family_ops); #ifdef CONFIG_SYSCTL ip_static_sysctl_init(); #endif /* * Add all the base protocols. 
*/ if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) pr_crit("%s: Cannot add ICMP protocol\n", __func__); net_hotdata.udp_protocol = (struct net_protocol) { .handler = udp_rcv, .err_handler = udp_err, .no_policy = 1, }; if (inet_add_protocol(&net_hotdata.udp_protocol, IPPROTO_UDP) < 0) pr_crit("%s: Cannot add UDP protocol\n", __func__); net_hotdata.tcp_protocol = (struct net_protocol) { .handler = tcp_v4_rcv, .err_handler = tcp_v4_err, .no_policy = 1, .icmp_strict_tag_validation = 1, }; if (inet_add_protocol(&net_hotdata.tcp_protocol, IPPROTO_TCP) < 0) pr_crit("%s: Cannot add TCP protocol\n", __func__); #ifdef CONFIG_IP_MULTICAST if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) pr_crit("%s: Cannot add IGMP protocol\n", __func__); #endif /* Register the socket-side information for inet_create. */ for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r) INIT_LIST_HEAD(r); for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q) inet_register_protosw(q); /* * Set the ARP module up */ arp_init(); /* * Set the IP module up */ ip_init(); /* Initialise per-cpu ipv4 mibs */ if (init_ipv4_mibs()) panic("%s: Cannot init ipv4 mibs\n", __func__); /* Setup TCP slab cache for open requests. */ tcp_init(); /* Setup UDP memory threshold */ udp_init(); raw_init(); ping_init(); /* * Set the ICMP layer up */ if (icmp_init() < 0) panic("Failed to create the ICMP control socket.\n"); /* * Initialise the multicast router */ #if defined(CONFIG_IP_MROUTE) if (ip_mr_init()) pr_crit("%s: Cannot init ipv4 mroute\n", __func__); #endif if (init_inet_pernet_ops()) pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__); ipv4_proc_init(); ipfrag_init(); dev_add_pack(&ip_packet_type); ip_tunnel_core_init(); rc = 0; out: return rc; out_unregister_raw_proto: proto_unregister(&raw_prot); out_unregister_udp_proto: proto_unregister(&udp_prot); out_unregister_tcp_proto: proto_unregister(&tcp_prot); goto out; } fs_initcall(inet_init); /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS static int __init ipv4_proc_init(void) { int rc = 0; if (raw_proc_init()) goto out_raw; if (tcp4_proc_init()) goto out_tcp; if (udp4_proc_init()) goto out_udp; if (ping_proc_init()) goto out_ping; if (ip_misc_proc_init()) goto out_misc; out: return rc; out_misc: ping_proc_exit(); out_ping: udp4_proc_exit(); out_udp: tcp4_proc_exit(); out_tcp: raw_proc_exit(); out_raw: rc = -ENOMEM; goto out; } #else /* CONFIG_PROC_FS */ static int __init ipv4_proc_init(void) { return 0; } #endif /* CONFIG_PROC_FS */ |
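/*
 * A minimal sketch (not part of af_inet.c above) of how an out-of-tree
 * transport protocol might hook into the two registration paths this file
 * exports: inet_add_protocol() for the receive demux and
 * inet_register_protosw() for socket(2) lookup. The names myproto_rcv,
 * myproto_prot and IPPROTO_MYPROTO are placeholders, and error unwinding
 * is omitted for brevity.
 */
#if 0	/* illustrative only */
static struct proto myproto_prot;	/* real definition elided */

static int myproto_rcv(struct sk_buff *skb)
{
	/* ... deliver skb to a matching socket ... */
	kfree_skb(skb);
	return 0;
}

static const struct net_protocol myproto_protocol = {
	.handler	= myproto_rcv,	/* called for IPPROTO_MYPROTO packets */
	.no_policy	= 1,
};

static struct inet_protosw myproto_protosw = {
	.type		= SOCK_DGRAM,
	.protocol	= IPPROTO_MYPROTO,
	.prot		= &myproto_prot,
	.ops		= &inet_dgram_ops,
	.flags		= INET_PROTOSW_REUSE,
};

static int __init myproto_init(void)
{
	int rc;

	rc = proto_register(&myproto_prot, 1);
	if (rc)
		return rc;

	rc = inet_add_protocol(&myproto_protocol, IPPROTO_MYPROTO);
	if (rc < 0)
		return rc;

	inet_register_protosw(&myproto_protosw);
	return 0;
}
#endif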
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/kernfs/mount.c - kernfs mount implementation
 *
 * Copyright (c) 2001-3 Patrick Mochel
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
 */

#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/init.h>
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
#include <linux/seq_file.h>
#include <linux/exportfs.h>
#include <linux/uuid.h>
#include <linux/statfs.h>

#include "kernfs-internal.h"

struct kmem_cache *kernfs_node_cache __ro_after_init;
struct kmem_cache *kernfs_iattrs_cache __ro_after_init;
struct kernfs_global_locks *kernfs_locks __ro_after_init;

static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
{
	struct kernfs_root *root = kernfs_root(kernfs_dentry_node(dentry));
	struct kernfs_syscall_ops *scops = root->syscall_ops;

	if (scops && scops->show_options)
		return scops->show_options(sf, root);
	return 0;
}

static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry)
{
	struct kernfs_node *node = kernfs_dentry_node(dentry);
	struct kernfs_root *root = kernfs_root(node);
	struct kernfs_syscall_ops *scops = root->syscall_ops;

	if (scops && scops->show_path)
		return scops->show_path(sf, node, root);

	seq_dentry(sf, dentry, " \t\n\\");
	return 0;
}

static int kernfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
	simple_statfs(dentry, buf);
	buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b);
	return 0;
}

const struct super_operations kernfs_sops = {
	.statfs		= kernfs_statfs,
	.drop_inode	= inode_just_drop,
	.evict_inode	= kernfs_evict_inode,

	.show_options	= kernfs_sop_show_options,
	.show_path	= kernfs_sop_show_path,

	/*
	 * sysfs is built on
top of kernfs and sysfs provides the power * management infrastructure to support suspend/hibernate by * writing to various files in /sys/power/. As filesystems may * be automatically frozen during suspend/hibernate implementing * freeze/thaw support for kernfs generically will cause * deadlocks as the suspending/hibernation initiating task will * hold a VFS lock that it will then wait upon to be released. * If freeze/thaw for kernfs is needed talk to the VFS. */ .freeze_fs = NULL, .unfreeze_fs = NULL, .freeze_super = NULL, .thaw_super = NULL, }; static int kernfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len, struct inode *parent) { struct kernfs_node *kn = inode->i_private; if (*max_len < 2) { *max_len = 2; return FILEID_INVALID; } *max_len = 2; *(u64 *)fh = kn->id; return FILEID_KERNFS; } static struct dentry *__kernfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type, bool get_parent) { struct kernfs_super_info *info = kernfs_info(sb); struct kernfs_node *kn; struct inode *inode; u64 id; if (fh_len < 2) return NULL; switch (fh_type) { case FILEID_KERNFS: id = *(u64 *)fid; break; case FILEID_INO32_GEN: case FILEID_INO32_GEN_PARENT: /* * blk_log_action() exposes "LOW32,HIGH32" pair without * type and userland can call us with generic fid * constructed from them. Combine it back to ID. See * blk_log_action(). */ id = ((u64)fid->i32.gen << 32) | fid->i32.ino; break; default: return NULL; } kn = kernfs_find_and_get_node_by_id(info->root, id); if (!kn) return ERR_PTR(-ESTALE); if (get_parent) { struct kernfs_node *parent; parent = kernfs_get_parent(kn); kernfs_put(kn); kn = parent; if (!kn) return ERR_PTR(-ESTALE); } inode = kernfs_get_inode(sb, kn); kernfs_put(kn); return d_obtain_alias(inode); } static struct dentry *kernfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return __kernfs_fh_to_dentry(sb, fid, fh_len, fh_type, false); } static struct dentry *kernfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return __kernfs_fh_to_dentry(sb, fid, fh_len, fh_type, true); } static struct dentry *kernfs_get_parent_dentry(struct dentry *child) { struct kernfs_node *kn = kernfs_dentry_node(child); struct kernfs_root *root = kernfs_root(kn); guard(rwsem_read)(&root->kernfs_rwsem); return d_obtain_alias(kernfs_get_inode(child->d_sb, kernfs_parent(kn))); } static const struct export_operations kernfs_export_ops = { .encode_fh = kernfs_encode_fh, .fh_to_dentry = kernfs_fh_to_dentry, .fh_to_parent = kernfs_fh_to_parent, .get_parent = kernfs_get_parent_dentry, }; /** * kernfs_root_from_sb - determine kernfs_root associated with a super_block * @sb: the super_block in question * * Return: the kernfs_root associated with @sb. If @sb is not a kernfs one, * %NULL is returned. */ struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) { if (sb->s_op == &kernfs_sops) return kernfs_info(sb)->root; return NULL; } /* * find the next ancestor in the path down to @child, where @parent was the * ancestor whose descendant we want to find. * * Say the path is /a/b/c/d. @child is d, @parent is %NULL. We return the root * node. If @parent is b, then we return the node for c. * Passing in d as @parent is not ok. 
*/ static struct kernfs_node *find_next_ancestor(struct kernfs_node *child, struct kernfs_node *parent) { if (child == parent) { pr_crit_once("BUG in find_next_ancestor: called with parent == child"); return NULL; } while (kernfs_parent(child) != parent) { child = kernfs_parent(child); if (!child) return NULL; } return child; } /** * kernfs_node_dentry - get a dentry for the given kernfs_node * @kn: kernfs_node for which a dentry is needed * @sb: the kernfs super_block * * Return: the dentry pointer */ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, struct super_block *sb) { struct dentry *dentry; struct kernfs_node *knparent; struct kernfs_root *root; BUG_ON(sb->s_op != &kernfs_sops); dentry = dget(sb->s_root); /* Check if this is the root kernfs_node */ if (!rcu_access_pointer(kn->__parent)) return dentry; root = kernfs_root(kn); /* * As long as kn is valid, its parent can not vanish. This is cgroup's * kn so it can't have its parent replaced. Therefore it is safe to use * the ancestor node outside of the RCU or locked section. */ if (WARN_ON_ONCE(!(root->flags & KERNFS_ROOT_INVARIANT_PARENT))) return ERR_PTR(-EINVAL); scoped_guard(rcu) { knparent = find_next_ancestor(kn, NULL); } if (WARN_ON(!knparent)) { dput(dentry); return ERR_PTR(-EINVAL); } do { struct dentry *dtmp; struct kernfs_node *kntmp; const char *name; if (kn == knparent) return dentry; scoped_guard(rwsem_read, &root->kernfs_rwsem) { kntmp = find_next_ancestor(kn, knparent); if (WARN_ON(!kntmp)) { dput(dentry); return ERR_PTR(-EINVAL); } name = kstrdup(kernfs_rcu_name(kntmp), GFP_KERNEL); } if (!name) { dput(dentry); return ERR_PTR(-ENOMEM); } dtmp = lookup_noperm_positive_unlocked(&QSTR(name), dentry); dput(dentry); kfree(name); if (IS_ERR(dtmp)) return dtmp; knparent = kntmp; dentry = dtmp; } while (true); } static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *kfc) { struct kernfs_super_info *info = kernfs_info(sb); struct kernfs_root *kf_root = kfc->root; struct inode *inode; struct dentry *root; info->sb = sb; /* Userspace would break if executables or devices appear on sysfs */ sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = kfc->magic; sb->s_op = &kernfs_sops; sb->s_xattr = kernfs_xattr_handlers; if (info->root->flags & KERNFS_ROOT_SUPPORT_EXPORTOP) sb->s_export_op = &kernfs_export_ops; sb->s_time_gran = 1; sb->s_maxbytes = MAX_LFS_FILESIZE; /* sysfs dentries and inodes don't require IO to create */ sb->s_shrink->seeks = 0; /* get root inode, initialize and unlock it */ down_read(&kf_root->kernfs_rwsem); inode = kernfs_get_inode(sb, info->root->kn); up_read(&kf_root->kernfs_rwsem); if (!inode) { pr_debug("kernfs: could not get root inode\n"); return -ENOMEM; } /* instantiate and link root dentry */ root = d_make_root(inode); if (!root) { pr_debug("%s: could not get root dentry!\n", __func__); return -ENOMEM; } sb->s_root = root; set_default_d_op(sb, &kernfs_dops); return 0; } static int kernfs_test_super(struct super_block *sb, struct fs_context *fc) { struct kernfs_super_info *sb_info = kernfs_info(sb); struct kernfs_super_info *info = fc->s_fs_info; return sb_info->root == info->root && sb_info->ns == info->ns; } static int kernfs_set_super(struct super_block *sb, struct fs_context *fc) { struct kernfs_fs_context *kfc = fc->fs_private; kfc->ns_tag = NULL; return set_anon_super_fc(sb, fc); } /** * kernfs_super_ns - determine the namespace tag of a kernfs super_block * @sb: super_block of interest * * Return: 
the namespace tag associated with kernfs super_block @sb. */ const struct ns_common *kernfs_super_ns(struct super_block *sb) { struct kernfs_super_info *info = kernfs_info(sb); return info->ns; } /** * kernfs_get_tree - kernfs filesystem access/retrieval helper * @fc: The filesystem context. * * This is to be called from each kernfs user's fs_context->ops->get_tree() * implementation, which should set the specified ->@fs_type and ->@flags, and * specify the hierarchy and namespace tag to mount via ->@root and ->@ns, * respectively. * * Return: %0 on success, -errno on failure. */ int kernfs_get_tree(struct fs_context *fc) { struct kernfs_fs_context *kfc = fc->fs_private; struct super_block *sb; struct kernfs_super_info *info; int error; info = kzalloc_obj(*info); if (!info) return -ENOMEM; info->root = kfc->root; info->ns = kfc->ns_tag; INIT_LIST_HEAD(&info->node); fc->s_fs_info = info; sb = sget_fc(fc, kernfs_test_super, kernfs_set_super); if (IS_ERR(sb)) return PTR_ERR(sb); if (!sb->s_root) { struct kernfs_super_info *info = kernfs_info(sb); struct kernfs_root *root = kfc->root; kfc->new_sb_created = true; error = kernfs_fill_super(sb, kfc); if (error) { deactivate_locked_super(sb); return error; } sb->s_flags |= SB_ACTIVE; uuid_t uuid; uuid_gen(&uuid); super_set_uuid(sb, uuid.b, sizeof(uuid)); down_write(&root->kernfs_supers_rwsem); list_add(&info->node, &info->root->supers); up_write(&root->kernfs_supers_rwsem); } fc->root = dget(sb->s_root); return 0; } void kernfs_free_fs_context(struct fs_context *fc) { /* Note that we don't deal with kfc->ns_tag here. */ kfree(fc->s_fs_info); fc->s_fs_info = NULL; } /** * kernfs_kill_sb - kill_sb for kernfs * @sb: super_block being killed * * This can be used directly for file_system_type->kill_sb(). If a kernfs * user needs extra cleanup, it can implement its own kill_sb() and call * this function at the end. */ void kernfs_kill_sb(struct super_block *sb) { struct kernfs_super_info *info = kernfs_info(sb); struct kernfs_root *root = info->root; down_write(&root->kernfs_supers_rwsem); list_del(&info->node); up_write(&root->kernfs_supers_rwsem); /* * Remove the superblock from fs_supers/s_instances * so we can't find it, before freeing kernfs_super_info. */ kill_anon_super(sb); kfree(info); } static void __init kernfs_mutex_init(void) { int count; for (count = 0; count < NR_KERNFS_LOCKS; count++) mutex_init(&kernfs_locks->open_file_mutex[count]); } static void __init kernfs_lock_init(void) { kernfs_locks = kmalloc_obj(struct kernfs_global_locks); WARN_ON(!kernfs_locks); kernfs_mutex_init(); } void __init kernfs_init(void) { kernfs_node_cache = kmem_cache_create("kernfs_node_cache", sizeof(struct kernfs_node), 0, SLAB_PANIC, NULL); /* Creates slab cache for kernfs inode attributes */ kernfs_iattrs_cache = kmem_cache_create("kernfs_iattrs_cache", sizeof(struct kernfs_iattrs), 0, SLAB_PANIC, NULL); kernfs_lock_init(); } |
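/*
 * A minimal sketch (not part of mount.c above) of how a kernfs user wires
 * these helpers into its filesystem type, loosely modelled on sysfs. The
 * example_* names and EXAMPLEFS_MAGIC are placeholders; a real user sets
 * kfc->root and kfc->magic in its get_tree() before calling
 * kernfs_get_tree(), and can use kernfs_kill_sb() directly as kill_sb.
 */
#if 0	/* illustrative only */
#define EXAMPLEFS_MAGIC 0x12345678		/* placeholder magic number */

static struct kernfs_root *example_kf_root;	/* from kernfs_create_root() */

static int examplefs_get_tree(struct fs_context *fc)
{
	struct kernfs_fs_context *kfc = fc->fs_private;

	kfc->root = example_kf_root;
	kfc->magic = EXAMPLEFS_MAGIC;
	return kernfs_get_tree(fc);
}

static const struct fs_context_operations examplefs_fc_ops = {
	.get_tree	= examplefs_get_tree,
	.free		= kernfs_free_fs_context,
};

static int examplefs_init_fs_context(struct fs_context *fc)
{
	struct kernfs_fs_context *kfc;

	kfc = kzalloc(sizeof(*kfc), GFP_KERNEL);
	if (!kfc)
		return -ENOMEM;

	fc->fs_private = kfc;
	fc->ops = &examplefs_fc_ops;
	return 0;
}

static struct file_system_type examplefs_fs_type = {
	.name			= "examplefs",
	.init_fs_context	= examplefs_init_fs_context,
	.kill_sb		= kernfs_kill_sb,
};
#endif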
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * linux/ipc/util.h
 * Copyright (C) 1999 Christoph Rohland
 *
 * ipc helper functions (c) 1999 Manfred Spraul <manfred@colorfullife.com>
 * namespaces support. 2006 OpenVZ, SWsoft Inc.
 *                     Pavel Emelianov <xemul@openvz.org>
 */

#ifndef _IPC_UTIL_H
#define _IPC_UTIL_H

#include <linux/unistd.h>
#include <linux/err.h>
#include <linux/ipc_namespace.h>
#include <linux/pid.h>

/*
 * The IPC ID contains 2 separate numbers - index and sequence number.
 * By default,
 *   bits  0-14: index (32k, 15 bits)
 *   bits 15-30: sequence number (64k, 16 bits)
 *
 * When IPCMNI extension mode is turned on, the composition changes:
 *   bits  0-23: index (16M, 24 bits)
 *   bits 24-30: sequence number (128, 7 bits)
 */
#define IPCMNI_SHIFT		15
#define IPCMNI_EXTEND_SHIFT	24
#define IPCMNI_EXTEND_MIN_CYCLE	(RADIX_TREE_MAP_SIZE * RADIX_TREE_MAP_SIZE)
#define IPCMNI			(1 << IPCMNI_SHIFT)
#define IPCMNI_EXTEND		(1 << IPCMNI_EXTEND_SHIFT)

#ifdef CONFIG_SYSVIPC_SYSCTL
extern int ipc_mni;
extern int ipc_mni_shift;
extern int ipc_min_cycle;

#define ipcmni_seq_shift()	ipc_mni_shift
#define IPCMNI_IDX_MASK		((1 << ipc_mni_shift) - 1)

#else /* CONFIG_SYSVIPC_SYSCTL */

#define ipc_mni			IPCMNI
#define ipc_min_cycle		((int)RADIX_TREE_MAP_SIZE)
#define ipcmni_seq_shift()	IPCMNI_SHIFT
#define IPCMNI_IDX_MASK		((1 << IPCMNI_SHIFT) - 1)
#endif /* CONFIG_SYSVIPC_SYSCTL */

void sem_init(void);
void msg_init(void);
void shm_init(void);

struct ipc_namespace;
struct pid_namespace;

#ifdef CONFIG_POSIX_MQUEUE
extern void mq_clear_sbinfo(struct ipc_namespace *ns);
#else
static inline void mq_clear_sbinfo(struct ipc_namespace *ns) { }
#endif

#ifdef CONFIG_SYSVIPC
void sem_init_ns(struct ipc_namespace *ns);
int msg_init_ns(struct ipc_namespace *ns);
void shm_init_ns(struct ipc_namespace *ns);

void sem_exit_ns(struct ipc_namespace *ns);
void msg_exit_ns(struct ipc_namespace *ns);
void shm_exit_ns(struct ipc_namespace *ns);
#else
static inline void sem_init_ns(struct ipc_namespace *ns) { }
static inline int msg_init_ns(struct ipc_namespace *ns) { return 0; }
static inline void shm_init_ns(struct ipc_namespace *ns) { }

static inline void sem_exit_ns(struct ipc_namespace *ns) { }
static inline void msg_exit_ns(struct ipc_namespace *ns) { }
static inline void shm_exit_ns(struct ipc_namespace *ns) { }
#endif

/*
 * Structure that holds the parameters needed by the ipc operations
 * (see after)
 */
struct ipc_params {
	key_t key;
	int flg;
	union {
		size_t size;	/* for shared memories */
		int
nsems; /* for semaphores */ } u; /* holds the getnew() specific param */ }; /* * Structure that holds some ipc operations. This structure is used to unify * the calls to sys_msgget(), sys_semget(), sys_shmget() * . routine to call to create a new ipc object. Can be one of newque, * newary, newseg * . routine to call to check permissions for a new ipc object. * Can be one of security_msg_associate, security_sem_associate, * security_shm_associate * . routine to call for an extra check if needed */ struct ipc_ops { int (*getnew)(struct ipc_namespace *, struct ipc_params *); int (*associate)(struct kern_ipc_perm *, int); int (*more_checks)(struct kern_ipc_perm *, struct ipc_params *); }; struct seq_file; struct ipc_ids; void ipc_init_ids(struct ipc_ids *ids); #ifdef CONFIG_PROC_FS void __init ipc_init_proc_interface(const char *path, const char *header, int ids, int (*show)(struct seq_file *, void *)); struct pid_namespace *ipc_seq_pid_ns(struct seq_file *); #else #define ipc_init_proc_interface(path, header, ids, show) do {} while (0) #endif #define IPC_SEM_IDS 0 #define IPC_MSG_IDS 1 #define IPC_SHM_IDS 2 #define ipcid_to_idx(id) ((id) & IPCMNI_IDX_MASK) #define ipcid_to_seqx(id) ((id) >> ipcmni_seq_shift()) #define ipcid_seq_max() (INT_MAX >> ipcmni_seq_shift()) /* must be called with ids->rwsem acquired for writing */ int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int); /* must be called with both locks acquired. */ void ipc_rmid(struct ipc_ids *, struct kern_ipc_perm *); /* must be called with both locks acquired. */ void ipc_set_key_private(struct ipc_ids *, struct kern_ipc_perm *); /* must be called with ipcp locked */ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg); /** * ipc_get_maxidx - get the highest assigned index * @ids: ipc identifier set * * The function returns the highest assigned index for @ids. The function * doesn't scan the idr tree, it uses a cached value. * * Called with ipc_ids.rwsem held for reading. */ static inline int ipc_get_maxidx(struct ipc_ids *ids) { if (ids->in_use == 0) return -1; if (ids->in_use == ipc_mni) return ipc_mni - 1; return ids->max_idx; } /* * For allocation that need to be freed by RCU. * Objects are reference counted, they start with reference count 1. * getref increases the refcount, the putref call that reduces the recount * to 0 schedules the rcu destruction. Caller must guarantee locking. * * refcount is initialized by ipc_addid(), before that point call_rcu() * must be used. 
*/ bool ipc_rcu_getref(struct kern_ipc_perm *ptr); void ipc_rcu_putref(struct kern_ipc_perm *ptr, void (*func)(struct rcu_head *head)); struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id); void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out); void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out); int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out); struct kern_ipc_perm *ipcctl_obtain_check(struct ipc_namespace *ns, struct ipc_ids *ids, int id, int cmd, struct ipc64_perm *perm, int extra_perm); static inline void ipc_update_pid(struct pid **pos, struct pid *pid) { struct pid *old = *pos; if (old != pid) { *pos = get_pid(pid); put_pid(old); } } #ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION int ipc_parse_version(int *cmd); #endif extern void free_msg(struct msg_msg *msg); extern struct msg_msg *load_msg(const void __user *src, size_t len); extern struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst); extern int store_msg(void __user *dest, struct msg_msg *msg, size_t len); static inline int ipc_checkid(struct kern_ipc_perm *ipcp, int id) { return ipcid_to_seqx(id) != ipcp->seq; } static inline void ipc_lock_object(struct kern_ipc_perm *perm) { spin_lock(&perm->lock); } static inline void ipc_unlock_object(struct kern_ipc_perm *perm) { spin_unlock(&perm->lock); } static inline void ipc_assert_locked_object(struct kern_ipc_perm *perm) { assert_spin_locked(&perm->lock); } static inline void ipc_unlock(struct kern_ipc_perm *perm) { ipc_unlock_object(perm); rcu_read_unlock(); } /* * ipc_valid_object() - helper to sort out IPC_RMID races for codepaths * where the respective ipc_ids.rwsem is not being held down. * Checks whether the ipc object is still around or if it's gone already, as * ipc_rmid() may have already freed the ID while the ipc lock was spinning. * Needs to be called with kern_ipc_perm.lock held -- exception made for one * checkpoint case at sys_semtimedop() as noted in code commentary. */ static inline bool ipc_valid_object(struct kern_ipc_perm *perm) { return !perm->deleted; } struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id); int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids, const struct ipc_ops *ops, struct ipc_params *params); void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, void (*free)(struct ipc_namespace *, struct kern_ipc_perm *)); static inline int sem_check_semmni(struct ipc_namespace *ns) { /* * Check semmni range [0, ipc_mni] * semmni is the last element of sem_ctls[4] array */ return ((ns->sem_ctls[3] < 0) || (ns->sem_ctls[3] > ipc_mni)) ? 
		-ERANGE : 0;
}

#ifdef CONFIG_COMPAT
#include <linux/compat.h>
struct compat_ipc_perm {
	key_t key;
	__compat_uid_t uid;
	__compat_gid_t gid;
	__compat_uid_t cuid;
	__compat_gid_t cgid;
	compat_mode_t mode;
	unsigned short seq;
};

void to_compat_ipc_perm(struct compat_ipc_perm *, struct ipc64_perm *);
void to_compat_ipc64_perm(struct compat_ipc64_perm *, struct ipc64_perm *);
int get_compat_ipc_perm(struct ipc64_perm *, struct compat_ipc_perm __user *);
int get_compat_ipc64_perm(struct ipc64_perm *,
			  struct compat_ipc64_perm __user *);

static inline int compat_ipc_parse_version(int *cmd)
{
	int version = *cmd & IPC_64;
	*cmd &= ~IPC_64;
	return version;
}

long compat_ksys_old_semctl(int semid, int semnum, int cmd, int arg);
long compat_ksys_old_msgctl(int msqid, int cmd, void __user *uptr);
long compat_ksys_msgrcv(int msqid, compat_uptr_t msgp, compat_ssize_t msgsz,
			compat_long_t msgtyp, int msgflg);
long compat_ksys_msgsnd(int msqid, compat_uptr_t msgp, compat_ssize_t msgsz,
			int msgflg);
long compat_ksys_old_shmctl(int shmid, int cmd, void __user *uptr);

#endif

#endif
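/*
 * A minimal sketch (not part of util.h above) of how the ipc_ops /
 * ipc_params pair defined in this header is consumed by one of the *get()
 * syscalls, modelled on ksys_msgget() in ipc/msg.c. newque(), msg_ids()
 * and security_msg_associate() are that code's helpers and are assumed
 * here; they are not declared in this header.
 */
#if 0	/* illustrative only */
long example_msgget(key_t key, int msgflg)
{
	struct ipc_namespace *ns = current->nsproxy->ipc_ns;
	static const struct ipc_ops msg_ops = {
		.getnew    = newque,			/* creates a new queue */
		.associate = security_msg_associate,	/* LSM permission check */
	};
	struct ipc_params msg_params = {
		.key = key,
		.flg = msgflg,
	};

	/* ipcget() decides between looking up an existing id and .getnew() */
	return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params);
}
#endif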
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Integrity Measurement Architecture
 *
 * Copyright (C) 2005,2006,2007,2008 IBM Corporation
 *
 * Authors:
 * Reiner Sailer <sailer@watson.ibm.com>
 * Serge Hallyn <serue@us.ibm.com>
 * Kylene Hall <kylene@us.ibm.com>
 * Mimi Zohar <zohar@us.ibm.com>
 *
 * File: ima_main.c
 *	implements the IMA hooks: ima_bprm_check, ima_file_mmap,
 *	and ima_file_check.
*/ #include <linux/module.h> #include <linux/file.h> #include <linux/binfmts.h> #include <linux/kernel_read_file.h> #include <linux/mount.h> #include <linux/mman.h> #include <linux/slab.h> #include <linux/xattr.h> #include <linux/ima.h> #include <linux/fs.h> #include <linux/iversion.h> #include <linux/evm.h> #include <linux/crash_dump.h> #include "ima.h" #ifdef CONFIG_IMA_APPRAISE int ima_appraise = IMA_APPRAISE_ENFORCE; #else int ima_appraise; #endif int __ro_after_init ima_hash_algo = HASH_ALGO_SHA1; static int hash_setup_done; static int ima_disabled __ro_after_init; static struct notifier_block ima_lsm_policy_notifier = { .notifier_call = ima_lsm_policy_change, }; static int __init ima_setup(char *str) { if (!is_kdump_kernel()) { pr_info("Warning: ima setup option only permitted in kdump"); return 1; } if (strncmp(str, "off", 3) == 0) ima_disabled = 1; else if (strncmp(str, "on", 2) == 0) ima_disabled = 0; else pr_err("Invalid ima setup option: \"%s\" , please specify ima=on|off.", str); return 1; } __setup("ima=", ima_setup); static int __init hash_setup(char *str) { struct ima_template_desc *template_desc = ima_template_desc_current(); int i; if (hash_setup_done) return 1; if (strcmp(template_desc->name, IMA_TEMPLATE_IMA_NAME) == 0) { if (strncmp(str, "sha1", 4) == 0) { ima_hash_algo = HASH_ALGO_SHA1; } else if (strncmp(str, "md5", 3) == 0) { ima_hash_algo = HASH_ALGO_MD5; } else { pr_err("invalid hash algorithm \"%s\" for template \"%s\"", str, IMA_TEMPLATE_IMA_NAME); return 1; } goto out; } i = match_string(hash_algo_name, HASH_ALGO__LAST, str); if (i < 0) { pr_err("invalid hash algorithm \"%s\"", str); return 1; } ima_hash_algo = i; out: hash_setup_done = 1; return 1; } __setup("ima_hash=", hash_setup); enum hash_algo ima_get_current_hash_algo(void) { return ima_hash_algo; } /* Prevent mmap'ing a file execute that is already mmap'ed write */ static int mmap_violation_check(enum ima_hooks func, struct file *file, char **pathbuf, const char **pathname, char *filename) { struct inode *inode; int rc = 0; if ((func == MMAP_CHECK || func == MMAP_CHECK_REQPROT) && mapping_writably_mapped(file->f_mapping)) { rc = -ETXTBSY; inode = file_inode(file); if (!*pathbuf) /* ima_rdwr_violation possibly pre-fetched */ *pathname = ima_d_path(&file->f_path, pathbuf, filename); integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode, *pathname, "mmap_file", "mmapped_writers", rc, 0); } return rc; } /* * ima_rdwr_violation_check * * Only invalidate the PCR for measured files: * - Opening a file for write when already open for read, * results in a time of measure, time of use (ToMToU) error. * - Opening a file for read when already open for write, * could result in a file measurement error. 
* */ static void ima_rdwr_violation_check(struct file *file, struct ima_iint_cache *iint, int must_measure, char **pathbuf, const char **pathname, char *filename) { struct inode *inode = file_inode(file); fmode_t mode = file->f_mode; bool send_tomtou = false, send_writers = false; if (mode & FMODE_WRITE) { if (atomic_read(&inode->i_readcount) && IS_IMA(inode)) { if (!iint) iint = ima_iint_find(inode); /* IMA_MEASURE is set from reader side */ if (iint && test_and_clear_bit(IMA_MAY_EMIT_TOMTOU, &iint->atomic_flags)) send_tomtou = true; } } else { if (must_measure) set_bit(IMA_MAY_EMIT_TOMTOU, &iint->atomic_flags); /* Limit number of open_writers violations */ if (inode_is_open_for_write(inode) && must_measure) { if (!test_and_set_bit(IMA_EMITTED_OPENWRITERS, &iint->atomic_flags)) send_writers = true; } } if (!send_tomtou && !send_writers) return; *pathname = ima_d_path(&file->f_path, pathbuf, filename); if (send_tomtou) ima_add_violation(file, *pathname, iint, "invalid_pcr", "ToMToU"); if (send_writers) ima_add_violation(file, *pathname, iint, "invalid_pcr", "open_writers"); } /* * Detect file change based on STATX_CHANGE_COOKIE, when supported, and * fallback to detecting file change based on i_version. On filesystems * which do not support either, assume the file changed. */ static bool ima_detect_file_change(struct ima_iint_cache *iint, struct inode *inode, struct file *file) { struct kstat stat; int result; result = vfs_getattr_nosec(&file->f_path, &stat, STATX_CHANGE_COOKIE, AT_STATX_SYNC_AS_STAT); if (!result && stat.result_mask & STATX_CHANGE_COOKIE) return stat.change_cookie != iint->real_inode.version; if (IS_I_VERSION(inode)) return !inode_eq_iversion(inode, iint->real_inode.version); return true; } static void ima_check_last_writer(struct ima_iint_cache *iint, struct inode *inode, struct file *file) { fmode_t mode = file->f_mode; bool update; if (!(mode & FMODE_WRITE)) return; mutex_lock(&iint->mutex); if (atomic_read(&inode->i_writecount) == 1) { clear_bit(IMA_EMITTED_OPENWRITERS, &iint->atomic_flags); update = test_and_clear_bit(IMA_UPDATE_XATTR, &iint->atomic_flags); if (iint->flags & IMA_NEW_FILE || ima_detect_file_change(iint, inode, file)) { iint->flags &= ~(IMA_DONE_MASK | IMA_NEW_FILE); iint->measured_pcrs = 0; if (update) ima_update_xattr(iint, file); } } mutex_unlock(&iint->mutex); } /** * ima_file_free - called on __fput() * @file: pointer to file structure being freed * * Flag files that changed, based on i_version */ static void ima_file_free(struct file *file) { struct inode *inode = file_inode(file); struct ima_iint_cache *iint; if (!ima_policy_flag || !S_ISREG(inode->i_mode)) return; iint = ima_iint_find(inode); if (!iint) return; ima_check_last_writer(iint, inode, file); } static int process_measurement(struct file *file, const struct cred *cred, struct lsm_prop *prop, char *buf, loff_t size, int mask, enum ima_hooks func, enum kernel_read_file_id read_id, bool bprm_is_check) { struct inode *real_inode, *inode = file_inode(file); struct ima_iint_cache *iint = NULL; struct ima_template_desc *template_desc = NULL; struct inode *metadata_inode; char *pathbuf = NULL; char filename[NAME_MAX]; const char *pathname = NULL; int rc = 0, action, must_appraise = 0; int pcr = CONFIG_IMA_MEASURE_PCR_IDX; struct evm_ima_xattr_data *xattr_value = NULL; struct modsig *modsig = NULL; int xattr_len = 0; bool violation_check; enum hash_algo hash_algo; unsigned int allowed_algos = 0; if (!ima_policy_flag || !S_ISREG(inode->i_mode)) return 0; /* Return an IMA_MEASURE, IMA_APPRAISE, 
IMA_AUDIT action * bitmask based on the appraise/audit/measurement policy. * Included is the appraise submask. */ action = ima_get_action(file_mnt_idmap(file), inode, cred, prop, mask, func, &pcr, &template_desc, NULL, &allowed_algos); violation_check = ((func == FILE_CHECK || func == MMAP_CHECK || func == MMAP_CHECK_REQPROT) && (ima_policy_flag & IMA_MEASURE) && ((action & IMA_MEASURE) || (file->f_mode & FMODE_WRITE))); if (!action && !violation_check) return 0; must_appraise = action & IMA_APPRAISE; /* Is the appraise rule hook specific? */ if (action & IMA_FILE_APPRAISE) func = FILE_CHECK; inode_lock(inode); if (action) { iint = ima_inode_get(inode); if (!iint) rc = -ENOMEM; } if (!rc && violation_check) ima_rdwr_violation_check(file, iint, action & IMA_MEASURE, &pathbuf, &pathname, filename); inode_unlock(inode); if (rc) goto out; if (!action) goto out; mutex_lock(&iint->mutex); if (test_and_clear_bit(IMA_CHANGE_ATTR, &iint->atomic_flags)) /* * Reset appraisal flags (action and non-action rule-specific) * if ima_inode_post_setattr was called. */ iint->flags &= ~(IMA_APPRAISE | IMA_APPRAISED | IMA_APPRAISE_SUBMASK | IMA_APPRAISED_SUBMASK | IMA_NONACTION_RULE_FLAGS); /* * Re-evaulate the file if either the xattr has changed or the * kernel has no way of detecting file change on the filesystem. * (Limited to privileged mounted filesystems.) */ if (test_and_clear_bit(IMA_CHANGE_XATTR, &iint->atomic_flags) || ((inode->i_sb->s_iflags & SB_I_IMA_UNVERIFIABLE_SIGNATURE) && !(inode->i_sb->s_iflags & SB_I_UNTRUSTED_MOUNTER) && !(action & IMA_FAIL_UNVERIFIABLE_SIGS))) { iint->flags &= ~IMA_DONE_MASK; iint->measured_pcrs = 0; } /* * On stacked filesystems, detect and re-evaluate file data and * metadata changes. */ real_inode = d_real_inode(file_dentry(file)); if (real_inode != inode && (action & IMA_DO_MASK) && (iint->flags & IMA_DONE_MASK)) { if (!IS_I_VERSION(real_inode) || integrity_inode_attrs_changed(&iint->real_inode, real_inode)) { iint->flags &= ~IMA_DONE_MASK; iint->measured_pcrs = 0; } /* * Reset the EVM status when metadata changed. 
*/ metadata_inode = d_inode(d_real(file_dentry(file), D_REAL_METADATA)); if (evm_metadata_changed(inode, metadata_inode)) iint->flags &= ~(IMA_APPRAISED | IMA_APPRAISED_SUBMASK); } /* Determine if already appraised/measured based on bitmask * (IMA_MEASURE, IMA_MEASURED, IMA_XXXX_APPRAISE, IMA_XXXX_APPRAISED, * IMA_AUDIT, IMA_AUDITED) */ iint->flags |= action; action &= IMA_DO_MASK; action &= ~((iint->flags & (IMA_DONE_MASK ^ IMA_MEASURED)) >> 1); /* If target pcr is already measured, unset IMA_MEASURE action */ if ((action & IMA_MEASURE) && (iint->measured_pcrs & (0x1 << pcr))) action ^= IMA_MEASURE; /* HASH sets the digital signature and update flags, nothing else */ if ((action & IMA_HASH) && !(test_bit(IMA_DIGSIG, &iint->atomic_flags))) { xattr_len = ima_read_xattr(file_dentry(file), &xattr_value, xattr_len); if ((xattr_value && xattr_len > 2) && (xattr_value->type == EVM_IMA_XATTR_DIGSIG)) set_bit(IMA_DIGSIG, &iint->atomic_flags); iint->flags |= IMA_HASHED; action ^= IMA_HASH; set_bit(IMA_UPDATE_XATTR, &iint->atomic_flags); } /* Nothing to do, just return existing appraised status */ if (!action) { if (must_appraise) { rc = mmap_violation_check(func, file, &pathbuf, &pathname, filename); if (!rc) rc = ima_get_cache_status(iint, func); } goto out_locked; } if ((action & IMA_APPRAISE_SUBMASK) || strcmp(template_desc->name, IMA_TEMPLATE_IMA_NAME) != 0) { /* read 'security.ima' */ xattr_len = ima_read_xattr(file_dentry(file), &xattr_value, xattr_len); /* * Read the appended modsig if allowed by the policy, and allow * an additional measurement list entry, if needed, based on the * template format and whether the file was already measured. */ if (iint->flags & IMA_MODSIG_ALLOWED) { rc = ima_read_modsig(func, buf, size, &modsig); if (!rc && ima_template_has_modsig(template_desc) && iint->flags & IMA_MEASURED) action |= IMA_MEASURE; } } hash_algo = ima_get_hash_algo(xattr_value, xattr_len); rc = ima_collect_measurement(iint, file, buf, size, hash_algo, modsig); if (rc != 0 && rc != -EBADF && rc != -EINVAL) goto out_locked; /* Defer measuring/appraising kernel modules to READING_MODULE */ if (read_id == READING_MODULE_COMPRESSED) { must_appraise = 0; goto out_locked; } if (!pathbuf) /* ima_rdwr_violation possibly pre-fetched */ pathname = ima_d_path(&file->f_path, &pathbuf, filename); if (action & IMA_MEASURE) ima_store_measurement(iint, file, pathname, xattr_value, xattr_len, modsig, pcr, template_desc); if (rc == 0 && (action & IMA_APPRAISE_SUBMASK)) { rc = ima_check_blacklist(iint, modsig, pcr); if (rc != -EPERM) { inode_lock(inode); rc = ima_appraise_measurement(func, iint, file, pathname, xattr_value, xattr_len, modsig, bprm_is_check); inode_unlock(inode); } if (!rc) rc = mmap_violation_check(func, file, &pathbuf, &pathname, filename); } if (action & IMA_AUDIT) ima_audit_measurement(iint, pathname); if ((file->f_flags & O_DIRECT) && (iint->flags & IMA_PERMIT_DIRECTIO)) rc = 0; /* Ensure the digest was generated using an allowed algorithm */ if (rc == 0 && must_appraise && allowed_algos != 0 && (allowed_algos & (1U << hash_algo)) == 0) { rc = -EACCES; integrity_audit_msg(AUDIT_INTEGRITY_DATA, file_inode(file), pathname, "collect_data", "denied-hash-algorithm", rc, 0); } out_locked: if ((mask & MAY_WRITE) && test_bit(IMA_DIGSIG, &iint->atomic_flags) && !(iint->flags & IMA_NEW_FILE)) rc = -EACCES; mutex_unlock(&iint->mutex); kfree(xattr_value); ima_free_modsig(modsig); out: if (pathbuf) __putname(pathbuf); if (must_appraise) { if (rc && (ima_appraise & IMA_APPRAISE_ENFORCE)) return -EACCES; 
if (file->f_mode & FMODE_WRITE) set_bit(IMA_UPDATE_XATTR, &iint->atomic_flags); } return 0; } /** * ima_file_mmap - based on policy, collect/store measurement. * @file: pointer to the file to be measured (May be NULL) * @reqprot: protection requested by the application * @prot: protection that will be applied by the kernel * @flags: operational flags * * Measure files being mmapped executable based on the ima_must_measure() * policy decision. * * On success return 0. On integrity appraisal error, assuming the file * is in policy and IMA-appraisal is in enforcing mode, return -EACCES. */ static int ima_file_mmap(struct file *file, unsigned long reqprot, unsigned long prot, unsigned long flags) { struct lsm_prop prop; int ret; if (!file) return 0; security_current_getlsmprop_subj(&prop); if (reqprot & PROT_EXEC) { ret = process_measurement(file, current_cred(), &prop, NULL, 0, MAY_EXEC, MMAP_CHECK_REQPROT, 0, false); if (ret) return ret; } if (prot & PROT_EXEC) return process_measurement(file, current_cred(), &prop, NULL, 0, MAY_EXEC, MMAP_CHECK, 0, false); return 0; } /** * ima_file_mprotect - based on policy, limit mprotect change * @vma: vm_area_struct protection is set to * @reqprot: protection requested by the application * @prot: protection that will be applied by the kernel * * Files can be mmap'ed read/write and later changed to execute to circumvent * IMA's mmap appraisal policy rules. Due to locking issues (mmap semaphore * would be taken before i_mutex), files can not be measured or appraised at * this point. Eliminate this integrity gap by denying the mprotect * PROT_EXECUTE change, if an mmap appraise policy rule exists. * * On mprotect change success, return 0. On failure, return -EACESS. */ static int ima_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot) { struct ima_template_desc *template = NULL; struct file *file; char filename[NAME_MAX]; char *pathbuf = NULL; const char *pathname = NULL; struct inode *inode; struct lsm_prop prop; int result = 0; int action; int pcr; /* Is mprotect making an mmap'ed file executable? */ if (!(ima_policy_flag & IMA_APPRAISE) || !vma->vm_file || !(prot & PROT_EXEC) || (vma->vm_flags & VM_EXEC)) return 0; security_current_getlsmprop_subj(&prop); inode = file_inode(vma->vm_file); action = ima_get_action(file_mnt_idmap(vma->vm_file), inode, current_cred(), &prop, MAY_EXEC, MMAP_CHECK, &pcr, &template, NULL, NULL); action |= ima_get_action(file_mnt_idmap(vma->vm_file), inode, current_cred(), &prop, MAY_EXEC, MMAP_CHECK_REQPROT, &pcr, &template, NULL, NULL); /* Is the mmap'ed file in policy? */ if (!(action & (IMA_MEASURE | IMA_APPRAISE_SUBMASK))) return 0; if (action & IMA_APPRAISE_SUBMASK) result = -EPERM; file = vma->vm_file; pathname = ima_d_path(&file->f_path, &pathbuf, filename); integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode, pathname, "collect_data", "failed-mprotect", result, 0); if (pathbuf) __putname(pathbuf); return result; } /** * ima_bprm_check - based on policy, collect/store measurement. * @bprm: contains the linux_binprm structure * * The OS protects against an executable file, already open for write, * from being executed in deny_write_access() and an executable file, * already open for execute, from being modified in get_write_access(). * So we can be certain that what we verify and measure here is actually * what is being executed. * * On success return 0. On integrity appraisal error, assuming the file * is in policy and IMA-appraisal is in enforcing mode, return -EACCES. 
*/ static int ima_bprm_check(struct linux_binprm *bprm) { struct lsm_prop prop; security_current_getlsmprop_subj(&prop); return process_measurement(bprm->file, current_cred(), &prop, NULL, 0, MAY_EXEC, BPRM_CHECK, 0, bprm->is_check); } /** * ima_creds_check - based on policy, collect/store measurement. * @bprm: contains the linux_binprm structure * @file: contains the file descriptor of the binary being executed * * The OS protects against an executable file, already open for write, * from being executed in deny_write_access() and an executable file, * already open for execute, from being modified in get_write_access(). * So we can be certain that what we verify and measure here is actually * what is being executed. * * The difference from ima_bprm_check() is that ima_creds_check() is invoked * only after determining the final binary to be executed without interpreter, * and not when searching for intermediate binaries. The reason is that since * commit 56305aa9b6fab ("exec: Compute file based creds only once"), the * credentials to be applied to the process are calculated only at that stage * (bprm_creds_from_file security hook instead of bprm_check_security). * * On success return 0. On integrity appraisal error, assuming the file * is in policy and IMA-appraisal is in enforcing mode, return -EACCES. */ static int ima_creds_check(struct linux_binprm *bprm, const struct file *file) { struct lsm_prop prop; security_current_getlsmprop_subj(&prop); return process_measurement((struct file *)file, bprm->cred, &prop, NULL, 0, MAY_EXEC, CREDS_CHECK, 0, false); } /** * ima_bprm_creds_for_exec - collect/store/appraise measurement. * @bprm: contains the linux_binprm structure * * Based on the IMA policy and the execveat(2) AT_EXECVE_CHECK flag, measure * and appraise the integrity of a file to be executed by script interpreters. * Unlike any of the other LSM hooks where the kernel enforces file integrity, * enforcing file integrity is left up to the discretion of the script * interpreter (userspace). * * On success return 0. On integrity appraisal error, assuming the file * is in policy and IMA-appraisal is in enforcing mode, return -EACCES. */ static int ima_bprm_creds_for_exec(struct linux_binprm *bprm) { /* * As security_bprm_check() is called multiple times, both * the script and the shebang interpreter are measured, appraised, * and audited. Limit usage of this LSM hook to just measuring, * appraising, and auditing the indirect script execution * (e.g. ./sh example.sh). */ if (!bprm->is_check) return 0; return ima_bprm_check(bprm); } /** * ima_file_check - based on policy, collect/store measurement. * @file: pointer to the file to be measured * @mask: contains MAY_READ, MAY_WRITE, MAY_EXEC or MAY_APPEND * * Measure files based on the ima_must_measure() policy decision. * * On success return 0. On integrity appraisal error, assuming the file * is in policy and IMA-appraisal is in enforcing mode, return -EACCES. 
*/ static int ima_file_check(struct file *file, int mask) { struct lsm_prop prop; security_current_getlsmprop_subj(&prop); return process_measurement(file, current_cred(), &prop, NULL, 0, mask & (MAY_READ | MAY_WRITE | MAY_EXEC | MAY_APPEND), FILE_CHECK, 0, false); } static int __ima_inode_hash(struct inode *inode, struct file *file, char *buf, size_t buf_size) { struct ima_iint_cache *iint = NULL, tmp_iint; int rc, hash_algo; if (ima_policy_flag) { iint = ima_iint_find(inode); if (iint) mutex_lock(&iint->mutex); } if ((!iint || !(iint->flags & IMA_COLLECTED)) && file) { if (iint) mutex_unlock(&iint->mutex); memset(&tmp_iint, 0, sizeof(tmp_iint)); mutex_init(&tmp_iint.mutex); rc = ima_collect_measurement(&tmp_iint, file, NULL, 0, ima_hash_algo, NULL); if (rc < 0) { /* ima_hash could be allocated in case of failure. */ if (rc != -ENOMEM) kfree(tmp_iint.ima_hash); return -EOPNOTSUPP; } iint = &tmp_iint; mutex_lock(&iint->mutex); } if (!iint) return -EOPNOTSUPP; /* * ima_file_hash can be called when ima_collect_measurement has still * not been called, we might not always have a hash. */ if (!iint->ima_hash || !(iint->flags & IMA_COLLECTED)) { mutex_unlock(&iint->mutex); return -EOPNOTSUPP; } if (buf) { size_t copied_size; copied_size = min_t(size_t, iint->ima_hash->length, buf_size); memcpy(buf, iint->ima_hash->digest, copied_size); } hash_algo = iint->ima_hash->algo; mutex_unlock(&iint->mutex); if (iint == &tmp_iint) kfree(iint->ima_hash); return hash_algo; } /** * ima_file_hash - return a measurement of the file * @file: pointer to the file * @buf: buffer in which to store the hash * @buf_size: length of the buffer * * On success, return the hash algorithm (as defined in the enum hash_algo). * If buf is not NULL, this function also outputs the hash into buf. * If the hash is larger than buf_size, then only buf_size bytes will be copied. * It generally just makes sense to pass a buffer capable of holding the largest * possible hash: IMA_MAX_DIGEST_SIZE. * The file hash returned is based on the entire file, including the appended * signature. * * If the measurement cannot be performed, return -EOPNOTSUPP. * If the parameters are incorrect, return -EINVAL. */ int ima_file_hash(struct file *file, char *buf, size_t buf_size) { if (!file) return -EINVAL; return __ima_inode_hash(file_inode(file), file, buf, buf_size); } EXPORT_SYMBOL_GPL(ima_file_hash); /** * ima_inode_hash - return the stored measurement if the inode has been hashed * and is in the iint cache. * @inode: pointer to the inode * @buf: buffer in which to store the hash * @buf_size: length of the buffer * * On success, return the hash algorithm (as defined in the enum hash_algo). * If buf is not NULL, this function also outputs the hash into buf. * If the hash is larger than buf_size, then only buf_size bytes will be copied. * It generally just makes sense to pass a buffer capable of holding the largest * possible hash: IMA_MAX_DIGEST_SIZE. * The hash returned is based on the entire contents, including the appended * signature. * * If IMA is disabled or if no measurement is available, return -EOPNOTSUPP. * If the parameters are incorrect, return -EINVAL. 
*/ int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size) { if (!inode) return -EINVAL; return __ima_inode_hash(inode, NULL, buf, buf_size); } EXPORT_SYMBOL_GPL(ima_inode_hash); /** * ima_post_create_tmpfile - mark newly created tmpfile as new * @idmap: idmap of the mount the inode was found from * @inode: inode of the newly created tmpfile * * No measuring, appraising or auditing of newly created tmpfiles is needed. * Skip calling process_measurement(), but indicate which newly, created * tmpfiles are in policy. */ static void ima_post_create_tmpfile(struct mnt_idmap *idmap, struct inode *inode) { struct ima_iint_cache *iint; int must_appraise; if (!ima_policy_flag || !S_ISREG(inode->i_mode)) return; must_appraise = ima_must_appraise(idmap, inode, MAY_ACCESS, FILE_CHECK); if (!must_appraise) return; /* Nothing to do if we can't allocate memory */ iint = ima_inode_get(inode); if (!iint) return; /* needed for writing the security xattrs */ set_bit(IMA_UPDATE_XATTR, &iint->atomic_flags); iint->ima_file_status = INTEGRITY_PASS; } /** * ima_post_path_mknod - mark as a new inode * @idmap: idmap of the mount the inode was found from * @dentry: newly created dentry * * Mark files created via the mknodat syscall as new, so that the * file data can be written later. */ static void ima_post_path_mknod(struct mnt_idmap *idmap, struct dentry *dentry) { struct ima_iint_cache *iint; struct inode *inode = dentry->d_inode; int must_appraise; if (!ima_policy_flag || !S_ISREG(inode->i_mode)) return; must_appraise = ima_must_appraise(idmap, inode, MAY_ACCESS, FILE_CHECK); if (!must_appraise) return; /* Nothing to do if we can't allocate memory */ iint = ima_inode_get(inode); if (!iint) return; /* needed for re-opening empty files */ iint->flags |= IMA_NEW_FILE; } /** * ima_read_file - pre-measure/appraise hook decision based on policy * @file: pointer to the file to be measured/appraised/audit * @read_id: caller identifier * @contents: whether a subsequent call will be made to ima_post_read_file() * * Permit reading a file based on policy. The policy rules are written * in terms of the policy identifier. Appraising the integrity of * a file requires a file descriptor. * * For permission return 0, otherwise return -EACCES. */ static int ima_read_file(struct file *file, enum kernel_read_file_id read_id, bool contents) { enum ima_hooks func; struct lsm_prop prop; /* * Do devices using pre-allocated memory run the risk of the * firmware being accessible to the device prior to the completion * of IMA's signature verification any more than when using two * buffers? It may be desirable to include the buffer address * in this API and walk all the dma_map_single() mappings to check. */ /* * There will be a call made to ima_post_read_file() with * a filled buffer, so we don't need to perform an extra * read early here. */ if (contents) return 0; /* Read entire file for all partial reads. 
*/ func = read_idmap[read_id] ?: FILE_CHECK; security_current_getlsmprop_subj(&prop); return process_measurement(file, current_cred(), &prop, NULL, 0, MAY_READ, func, 0, false); } const int read_idmap[READING_MAX_ID] = { [READING_FIRMWARE] = FIRMWARE_CHECK, [READING_MODULE] = MODULE_CHECK, [READING_MODULE_COMPRESSED] = MODULE_CHECK, [READING_KEXEC_IMAGE] = KEXEC_KERNEL_CHECK, [READING_KEXEC_INITRAMFS] = KEXEC_INITRAMFS_CHECK, [READING_POLICY] = POLICY_CHECK }; /** * ima_post_read_file - in memory collect/appraise/audit measurement * @file: pointer to the file to be measured/appraised/audit * @buf: pointer to in memory file contents * @size: size of in memory file contents * @read_id: caller identifier * * Measure/appraise/audit in memory file based on policy. Policy rules * are written in terms of a policy identifier. * * On success return 0. On integrity appraisal error, assuming the file * is in policy and IMA-appraisal is in enforcing mode, return -EACCES. */ static int ima_post_read_file(struct file *file, char *buf, loff_t size, enum kernel_read_file_id read_id) { enum ima_hooks func; struct lsm_prop prop; /* permit signed certs */ if (!file && read_id == READING_X509_CERTIFICATE) return 0; if (!file || !buf || size == 0) { /* should never happen */ if (ima_appraise & IMA_APPRAISE_ENFORCE) return -EACCES; return 0; } func = read_idmap[read_id] ?: FILE_CHECK; security_current_getlsmprop_subj(&prop); return process_measurement(file, current_cred(), &prop, buf, size, MAY_READ, func, read_id, false); } /** * ima_load_data - appraise decision based on policy * @id: kernel load data caller identifier * @contents: whether the full contents will be available in a later * call to ima_post_load_data(). * * Callers of this LSM hook can not measure, appraise, or audit the * data provided by userspace. Enforce policy rules requiring a file * signature (eg. kexec'ed kernel image). * * For permission return 0, otherwise return -EACCES. */ static int ima_load_data(enum kernel_load_data_id id, bool contents) { bool ima_enforce, sig_enforce; ima_enforce = (ima_appraise & IMA_APPRAISE_ENFORCE) == IMA_APPRAISE_ENFORCE; switch (id) { case LOADING_KEXEC_IMAGE: if (IS_ENABLED(CONFIG_KEXEC_SIG) && arch_get_secureboot()) { pr_err("impossible to appraise a kernel image without a file descriptor; try using kexec_file_load syscall.\n"); return -EACCES; } if (ima_enforce && (ima_appraise & IMA_APPRAISE_KEXEC)) { pr_err("impossible to appraise a kernel image without a file descriptor; try using kexec_file_load syscall.\n"); return -EACCES; /* INTEGRITY_UNKNOWN */ } break; case LOADING_FIRMWARE: if (ima_enforce && (ima_appraise & IMA_APPRAISE_FIRMWARE) && !contents) { pr_err("Prevent firmware sysfs fallback loading.\n"); return -EACCES; /* INTEGRITY_UNKNOWN */ } break; case LOADING_MODULE: sig_enforce = is_module_sig_enforced(); if (ima_enforce && (!sig_enforce && (ima_appraise & IMA_APPRAISE_MODULES))) { pr_err("impossible to appraise a module without a file descriptor. sig_enforce kernel parameter might help\n"); return -EACCES; /* INTEGRITY_UNKNOWN */ } break; default: break; } return 0; } /** * ima_post_load_data - appraise decision based on policy * @buf: pointer to in memory file contents * @size: size of in memory file contents * @load_id: kernel load data caller identifier * @description: @load_id-specific description of contents * * Measure/appraise/audit in memory buffer based on policy. Policy rules * are written in terms of a policy identifier. * * On success return 0. 
On integrity appraisal error, assuming the file * is in policy and IMA-appraisal is in enforcing mode, return -EACCES. */ static int ima_post_load_data(char *buf, loff_t size, enum kernel_load_data_id load_id, char *description) { if (load_id == LOADING_FIRMWARE) { if ((ima_appraise & IMA_APPRAISE_FIRMWARE) && (ima_appraise & IMA_APPRAISE_ENFORCE)) { pr_err("Prevent firmware loading_store.\n"); return -EACCES; /* INTEGRITY_UNKNOWN */ } return 0; } /* * Measure the init_module syscall buffer containing the ELF image. */ if (load_id == LOADING_MODULE) ima_measure_critical_data("modules", "init_module", buf, size, true, NULL, 0); return 0; } /** * process_buffer_measurement - Measure the buffer or the buffer data hash * @idmap: idmap of the mount the inode was found from * @inode: inode associated with the object being measured (NULL for KEY_CHECK) * @buf: pointer to the buffer that needs to be added to the log. * @size: size of buffer(in bytes). * @eventname: event name to be used for the buffer entry. * @func: IMA hook * @pcr: pcr to extend the measurement * @func_data: func specific data, may be NULL * @buf_hash: measure buffer data hash * @digest: buffer digest will be written to * @digest_len: buffer length * * Based on policy, either the buffer data or buffer data hash is measured * * Return: 0 if the buffer has been successfully measured, 1 if the digest * has been written to the passed location but not added to a measurement entry, * a negative value otherwise. */ int process_buffer_measurement(struct mnt_idmap *idmap, struct inode *inode, const void *buf, int size, const char *eventname, enum ima_hooks func, int pcr, const char *func_data, bool buf_hash, u8 *digest, size_t digest_len) { int ret = 0; const char *audit_cause = "ENOMEM"; struct ima_template_entry *entry = NULL; struct ima_iint_cache iint = {}; struct ima_event_data event_data = {.iint = &iint, .filename = eventname, .buf = buf, .buf_len = size}; struct ima_template_desc *template; struct ima_max_digest_data hash; struct ima_digest_data *hash_hdr = container_of(&hash.hdr, struct ima_digest_data, hdr); char digest_hash[IMA_MAX_DIGEST_SIZE]; int digest_hash_len = hash_digest_size[ima_hash_algo]; int violation = 0; int action = 0; struct lsm_prop prop; if (digest && digest_len < digest_hash_len) return -EINVAL; if (!ima_policy_flag && !digest) return -ENOENT; template = ima_template_desc_buf(); if (!template) { ret = -EINVAL; audit_cause = "ima_template_desc_buf"; goto out; } /* * Both LSM hooks and auxiliary based buffer measurements are * based on policy. To avoid code duplication, differentiate * between the LSM hooks and auxiliary buffer measurements, * retrieving the policy rule information only for the LSM hook * buffer measurements. 
*/ if (func) { security_current_getlsmprop_subj(&prop); action = ima_get_action(idmap, inode, current_cred(), &prop, 0, func, &pcr, &template, func_data, NULL); if (!(action & IMA_MEASURE) && !digest) return -ENOENT; } if (!pcr) pcr = CONFIG_IMA_MEASURE_PCR_IDX; iint.ima_hash = hash_hdr; iint.ima_hash->algo = ima_hash_algo; iint.ima_hash->length = hash_digest_size[ima_hash_algo]; ret = ima_calc_buffer_hash(buf, size, iint.ima_hash); if (ret < 0) { audit_cause = "hashing_error"; goto out; } if (buf_hash) { memcpy(digest_hash, hash_hdr->digest, digest_hash_len); ret = ima_calc_buffer_hash(digest_hash, digest_hash_len, iint.ima_hash); if (ret < 0) { audit_cause = "hashing_error"; goto out; } event_data.buf = digest_hash; event_data.buf_len = digest_hash_len; } if (digest) memcpy(digest, iint.ima_hash->digest, digest_hash_len); if (!ima_policy_flag || (func && !(action & IMA_MEASURE))) return 1; ret = ima_alloc_init_template(&event_data, &entry, template); if (ret < 0) { audit_cause = "alloc_entry"; goto out; } ret = ima_store_template(entry, violation, NULL, event_data.buf, pcr); if (ret < 0) { audit_cause = "store_entry"; ima_free_template_entry(entry); } out: if (ret < 0) integrity_audit_message(AUDIT_INTEGRITY_PCR, NULL, eventname, func_measure_str(func), audit_cause, ret, 0, ret); return ret; } /** * ima_kexec_cmdline - measure kexec cmdline boot args * @kernel_fd: file descriptor of the kexec kernel being loaded * @buf: pointer to buffer * @size: size of buffer * * Buffers can only be measured, not appraised. */ void ima_kexec_cmdline(int kernel_fd, const void *buf, int size) { if (!buf || !size) return; CLASS(fd, f)(kernel_fd); if (fd_empty(f)) return; process_buffer_measurement(file_mnt_idmap(fd_file(f)), file_inode(fd_file(f)), buf, size, "kexec-cmdline", KEXEC_CMDLINE, 0, NULL, false, NULL, 0); } /** * ima_measure_critical_data - measure kernel integrity critical data * @event_label: unique event label for grouping and limiting critical data * @event_name: event name for the record in the IMA measurement list * @buf: pointer to buffer data * @buf_len: length of buffer data (in bytes) * @hash: measure buffer data hash * @digest: buffer digest will be written to * @digest_len: buffer length * * Measure data critical to the integrity of the kernel into the IMA log * and extend the pcr. Examples of critical data could be various data * structures, policies, and states stored in kernel memory that can * impact the integrity of the system. * * Return: 0 if the buffer has been successfully measured, 1 if the digest * has been written to the passed location but not added to a measurement entry, * a negative value otherwise. */ int ima_measure_critical_data(const char *event_label, const char *event_name, const void *buf, size_t buf_len, bool hash, u8 *digest, size_t digest_len) { if (!event_name || !event_label || !buf || !buf_len) return -ENOPARAM; return process_buffer_measurement(&nop_mnt_idmap, NULL, buf, buf_len, event_name, CRITICAL_DATA, 0, event_label, hash, digest, digest_len); } EXPORT_SYMBOL_GPL(ima_measure_critical_data); #ifdef CONFIG_INTEGRITY_ASYMMETRIC_KEYS /** * ima_kernel_module_request - Prevent crypto-pkcs1(rsa,*) requests * @kmod_name: kernel module name * * Avoid a verification loop where verifying the signature of the modprobe * binary requires executing modprobe itself. 
Since the modprobe iint->mutex * is already held when the signature verification is performed, a deadlock * occurs as soon as modprobe is executed within the critical region, since * the same lock cannot be taken again. * * This happens when public_key_verify_signature(), in case of RSA algorithm, * use alg_name to store internal information in order to construct an * algorithm on the fly, but crypto_larval_lookup() will try to use alg_name * in order to load a kernel module with same name. * * Since we don't have any real "crypto-pkcs1(rsa,*)" kernel modules, * we are safe to fail such module request from crypto_larval_lookup(), and * avoid the verification loop. * * Return: Zero if it is safe to load the kernel module, -EINVAL otherwise. */ static int ima_kernel_module_request(char *kmod_name) { if (strncmp(kmod_name, "crypto-pkcs1(rsa,", 17) == 0) return -EINVAL; return 0; } #endif /* CONFIG_INTEGRITY_ASYMMETRIC_KEYS */ static int __init init_ima(void) { int error; /*Note that turning IMA off is intentionally limited to kdump kernel.*/ if (ima_disabled && is_kdump_kernel()) { pr_info("IMA functionality is disabled"); return 0; } ima_appraise_parse_cmdline(); ima_init_template_list(); hash_setup(CONFIG_IMA_DEFAULT_HASH); error = ima_init(); if (error && strcmp(hash_algo_name[ima_hash_algo], CONFIG_IMA_DEFAULT_HASH) != 0) { pr_info("Allocating %s failed, going to use default hash algorithm %s\n", hash_algo_name[ima_hash_algo], CONFIG_IMA_DEFAULT_HASH); hash_setup_done = 0; hash_setup(CONFIG_IMA_DEFAULT_HASH); error = ima_init(); } if (error) return error; error = register_blocking_lsm_notifier(&ima_lsm_policy_notifier); if (error) pr_warn("Couldn't register LSM notifier, error %d\n", error); if (!error) ima_update_policy_flags(); return error; } static struct security_hook_list ima_hooks[] __ro_after_init = { LSM_HOOK_INIT(bprm_check_security, ima_bprm_check), LSM_HOOK_INIT(bprm_creds_for_exec, ima_bprm_creds_for_exec), LSM_HOOK_INIT(bprm_creds_from_file, ima_creds_check), LSM_HOOK_INIT(file_post_open, ima_file_check), LSM_HOOK_INIT(inode_post_create_tmpfile, ima_post_create_tmpfile), LSM_HOOK_INIT(file_release, ima_file_free), LSM_HOOK_INIT(mmap_file, ima_file_mmap), LSM_HOOK_INIT(file_mprotect, ima_file_mprotect), LSM_HOOK_INIT(kernel_load_data, ima_load_data), LSM_HOOK_INIT(kernel_post_load_data, ima_post_load_data), LSM_HOOK_INIT(kernel_read_file, ima_read_file), LSM_HOOK_INIT(kernel_post_read_file, ima_post_read_file), LSM_HOOK_INIT(path_post_mknod, ima_post_path_mknod), #ifdef CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS LSM_HOOK_INIT(key_post_create_or_update, ima_post_key_create_or_update), #endif #ifdef CONFIG_INTEGRITY_ASYMMETRIC_KEYS LSM_HOOK_INIT(kernel_module_request, ima_kernel_module_request), #endif LSM_HOOK_INIT(inode_free_security_rcu, ima_inode_free_rcu), }; static const struct lsm_id ima_lsmid = { .name = "ima", .id = LSM_ID_IMA, }; static int __init init_ima_lsm(void) { ima_iintcache_init(); security_add_hooks(ima_hooks, ARRAY_SIZE(ima_hooks), &ima_lsmid); init_ima_appraise_lsm(&ima_lsmid); return 0; } struct lsm_blob_sizes ima_blob_sizes __ro_after_init = { .lbs_inode = sizeof(struct ima_iint_cache *), }; DEFINE_LSM(ima) = { .id = &ima_lsmid, .init = init_ima_lsm, .order = LSM_ORDER_LAST, .blobs = &ima_blob_sizes, /* Start IMA after the TPM is available */ .initcall_late = init_ima, }; |
/* * linux/fs/hfs/catalog.c * * Copyright (C) 1995-1997 Paul H. Hargrove * (C) 2003 Ardis Technologies <roman@ardistech.com> * This file may be distributed under the terms of the GNU General Public License. * * This file contains the functions related to the catalog B-tree. * * Cache code shamelessly stolen from * linux/fs/inode.c Copyright (C) 1991, 1992 Linus Torvalds * re-shamelessly stolen Copyright (C) 1997 Linus Torvalds */ #include "hfs_fs.h" #include "btree.h" /* * hfs_cat_build_key() * * Given the ID of the parent and the name build a search key.
*/ void hfs_cat_build_key(struct super_block *sb, btree_key *key, u32 parent, const struct qstr *name) { key->cat.reserved = 0; key->cat.ParID = cpu_to_be32(parent); if (name) { hfs_asc2mac(sb, &key->cat.CName, name); key->key_len = 6 + key->cat.CName.len; } else { memset(&key->cat.CName, 0, sizeof(struct hfs_name)); key->key_len = 6; } } static int hfs_cat_build_record(hfs_cat_rec *rec, u32 cnid, struct inode *inode) { __be32 mtime = hfs_mtime(); memset(rec, 0, sizeof(*rec)); if (S_ISDIR(inode->i_mode)) { rec->type = HFS_CDR_DIR; rec->dir.DirID = cpu_to_be32(cnid); rec->dir.CrDat = mtime; rec->dir.MdDat = mtime; rec->dir.BkDat = 0; rec->dir.UsrInfo.frView = cpu_to_be16(0xff); return sizeof(struct hfs_cat_dir); } else { /* init some fields for the file record */ rec->type = HFS_CDR_FIL; rec->file.Flags = HFS_FIL_USED | HFS_FIL_THD; if (!(inode->i_mode & S_IWUSR)) rec->file.Flags |= HFS_FIL_LOCK; rec->file.FlNum = cpu_to_be32(cnid); rec->file.CrDat = mtime; rec->file.MdDat = mtime; rec->file.BkDat = 0; rec->file.UsrWds.fdType = HFS_SB(inode->i_sb)->s_type; rec->file.UsrWds.fdCreator = HFS_SB(inode->i_sb)->s_creator; return sizeof(struct hfs_cat_file); } } static int hfs_cat_build_thread(struct super_block *sb, hfs_cat_rec *rec, int type, u32 parentid, const struct qstr *name) { rec->type = type; memset(rec->thread.reserved, 0, sizeof(rec->thread.reserved)); rec->thread.ParID = cpu_to_be32(parentid); hfs_asc2mac(sb, &rec->thread.CName, name); return sizeof(struct hfs_cat_thread); } /* * create_entry() * * Add a new file or directory to the catalog B-tree and * return a (struct hfs_cat_entry) for it in '*result'. */ int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct inode *inode) { struct hfs_find_data fd; struct super_block *sb; union hfs_cat_rec entry; int entry_size; int err; hfs_dbg("name %s, cnid %u, i_nlink %d\n", str->name, cnid, inode->i_nlink); if (dir->i_size >= HFS_MAX_VALENCE) return -ENOSPC; sb = dir->i_sb; err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); if (err) return err; /* * Fail early and avoid ENOSPC during the btree operations. We may * have to split the root node at most once. */ err = hfs_bmap_reserve(fd.tree, 2 * fd.tree->depth); if (err) goto err2; hfs_cat_build_key(sb, fd.search_key, cnid, NULL); entry_size = hfs_cat_build_thread(sb, &entry, S_ISDIR(inode->i_mode) ? HFS_CDR_THD : HFS_CDR_FTH, dir->i_ino, str); err = hfs_brec_find(&fd); if (err != -ENOENT) { if (!err) err = -EEXIST; goto err2; } err = hfs_brec_insert(&fd, &entry, entry_size); if (err) goto err2; hfs_cat_build_key(sb, fd.search_key, dir->i_ino, str); entry_size = hfs_cat_build_record(&entry, cnid, inode); err = hfs_brec_find(&fd); if (err != -ENOENT) { /* panic? */ if (!err) err = -EEXIST; goto err1; } err = hfs_brec_insert(&fd, &entry, entry_size); if (err) goto err1; dir->i_size++; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); hfs_find_exit(&fd); return 0; err1: hfs_cat_build_key(sb, fd.search_key, cnid, NULL); if (!hfs_brec_find(&fd)) hfs_brec_remove(&fd); err2: hfs_find_exit(&fd); return err; } /* * hfs_cat_compare() * * Description: * This is the comparison function used for the catalog B-tree. In * comparing catalog B-tree entries, the parent id is the most * significant field (compared as unsigned ints). 
The name field is * the least significant (compared in "Macintosh lexical order", * see hfs_strcmp() in string.c) * Input Variable(s): * struct hfs_cat_key *key1: pointer to the first key to compare * struct hfs_cat_key *key2: pointer to the second key to compare * Output Variable(s): * NONE * Returns: * int: negative if key1<key2, positive if key1>key2, and 0 if key1==key2 * Preconditions: * key1 and key2 point to "valid" (struct hfs_cat_key)s. * Postconditions: * This function has no side-effects */ int hfs_cat_keycmp(const btree_key *key1, const btree_key *key2) { __be32 k1p, k2p; k1p = key1->cat.ParID; k2p = key2->cat.ParID; if (k1p != k2p) return be32_to_cpu(k1p) < be32_to_cpu(k2p) ? -1 : 1; return hfs_strcmp(key1->cat.CName.name, key1->cat.CName.len, key2->cat.CName.name, key2->cat.CName.len); } /* Try to get a catalog entry for given catalog id */ // move to read_super??? int hfs_cat_find_brec(struct super_block *sb, u32 cnid, struct hfs_find_data *fd) { hfs_cat_rec rec; int res, len, type; hfs_cat_build_key(sb, fd->search_key, cnid, NULL); res = hfs_brec_read(fd, &rec, sizeof(rec)); if (res) return res; type = rec.type; if (type != HFS_CDR_THD && type != HFS_CDR_FTH) { pr_err("found bad thread record in catalog\n"); return -EIO; } fd->search_key->cat.ParID = rec.thread.ParID; len = fd->search_key->cat.CName.len = rec.thread.CName.len; if (len > HFS_NAMELEN) { pr_err("bad catalog namelength\n"); return -EIO; } memcpy(fd->search_key->cat.CName.name, rec.thread.CName.name, len); return hfs_brec_find(fd); } static inline void hfs_set_next_unused_CNID(struct super_block *sb, u32 deleted_cnid, u32 found_cnid) { if (found_cnid < HFS_FIRSTUSER_CNID) { atomic64_cmpxchg(&HFS_SB(sb)->next_id, deleted_cnid + 1, HFS_FIRSTUSER_CNID); } else { atomic64_cmpxchg(&HFS_SB(sb)->next_id, deleted_cnid + 1, found_cnid + 1); } } /* * hfs_correct_next_unused_CNID() * * Correct the next unused CNID of Catalog Tree. 
*/ static int hfs_correct_next_unused_CNID(struct super_block *sb, u32 cnid) { struct hfs_btree *cat_tree; struct hfs_bnode *node; s64 leaf_head; s64 leaf_tail; s64 node_id; hfs_dbg("cnid %u, next_id %lld\n", cnid, atomic64_read(&HFS_SB(sb)->next_id)); if ((cnid + 1) < atomic64_read(&HFS_SB(sb)->next_id)) { /* next ID should be unchanged */ return 0; } cat_tree = HFS_SB(sb)->cat_tree; leaf_head = cat_tree->leaf_head; leaf_tail = cat_tree->leaf_tail; if (leaf_head > leaf_tail) { pr_err("node is corrupted: leaf_head %lld, leaf_tail %lld\n", leaf_head, leaf_tail); return -ERANGE; } node = hfs_bnode_find(cat_tree, leaf_tail); if (IS_ERR(node)) { pr_err("fail to find leaf node: node ID %lld\n", leaf_tail); return -ENOENT; } node_id = leaf_tail; do { int i; if (node_id != leaf_tail) { node = hfs_bnode_find(cat_tree, node_id); if (IS_ERR(node)) return -ENOENT; } hfs_dbg("node %lld, leaf_tail %lld, leaf_head %lld\n", node_id, leaf_tail, leaf_head); hfs_bnode_dump(node); for (i = node->num_recs - 1; i >= 0; i--) { hfs_cat_rec rec; u16 off, len, keylen; int entryoffset; int entrylength; u32 found_cnid; len = hfs_brec_lenoff(node, i, &off); keylen = hfs_brec_keylen(node, i); if (keylen == 0) { pr_err("fail to get the keylen: " "node_id %lld, record index %d\n", node_id, i); return -EINVAL; } entryoffset = off + keylen; entrylength = len - keylen; if (entrylength > sizeof(rec)) { pr_err("unexpected record length: " "entrylength %d\n", entrylength); return -EINVAL; } hfs_bnode_read(node, &rec, entryoffset, entrylength); if (rec.type == HFS_CDR_DIR) { found_cnid = be32_to_cpu(rec.dir.DirID); hfs_dbg("found_cnid %u\n", found_cnid); hfs_set_next_unused_CNID(sb, cnid, found_cnid); hfs_bnode_put(node); return 0; } else if (rec.type == HFS_CDR_FIL) { found_cnid = be32_to_cpu(rec.file.FlNum); hfs_dbg("found_cnid %u\n", found_cnid); hfs_set_next_unused_CNID(sb, cnid, found_cnid); hfs_bnode_put(node); return 0; } } node_id = node->prev; hfs_bnode_put(node); } while (node_id >= leaf_head); return -ENOENT; } /* * hfs_cat_delete() * * Delete the indicated file or directory. * The associated thread is also removed unless ('with_thread'==0). */ int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str) { struct super_block *sb; struct hfs_find_data fd; struct hfs_readdir_data *rd; int res, type; hfs_dbg("name %s, cnid %u\n", str ? 
str->name : NULL, cnid); sb = dir->i_sb; res = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); if (res) return res; hfs_cat_build_key(sb, fd.search_key, dir->i_ino, str); res = hfs_brec_find(&fd); if (res) goto out; type = hfs_bnode_read_u8(fd.bnode, fd.entryoffset); if (type == HFS_CDR_FIL) { struct hfs_cat_file file; hfs_bnode_read(fd.bnode, &file, fd.entryoffset, sizeof(file)); if (be32_to_cpu(file.FlNum) == cnid) { #if 0 hfs_free_fork(sb, &file, HFS_FK_DATA); #endif hfs_free_fork(sb, &file, HFS_FK_RSRC); } } /* we only need to take spinlock for exclusion with ->release() */ spin_lock(&HFS_I(dir)->open_dir_lock); list_for_each_entry(rd, &HFS_I(dir)->open_dir_list, list) { if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0) rd->file->f_pos--; } spin_unlock(&HFS_I(dir)->open_dir_lock); res = hfs_brec_remove(&fd); if (res) goto out; hfs_cat_build_key(sb, fd.search_key, cnid, NULL); res = hfs_brec_find(&fd); if (!res) { res = hfs_brec_remove(&fd); if (res) goto out; } dir->i_size--; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); res = hfs_correct_next_unused_CNID(sb, cnid); if (res) goto out; res = 0; out: hfs_find_exit(&fd); return res; } /* * hfs_cat_move() * * Rename a file or directory, possibly to a new directory. * If the destination exists it is removed and a * (struct hfs_cat_entry) for it is returned in '*result'. */ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name, struct inode *dst_dir, const struct qstr *dst_name) { struct super_block *sb; struct hfs_find_data src_fd, dst_fd; union hfs_cat_rec entry; int entry_size, type; int err; hfs_dbg("cnid %u - (ino %llu, name %s) - (ino %llu, name %s)\n", cnid, src_dir->i_ino, src_name->name, dst_dir->i_ino, dst_name->name); sb = src_dir->i_sb; err = hfs_find_init(HFS_SB(sb)->cat_tree, &src_fd); if (err) return err; dst_fd = src_fd; /* * Fail early and avoid ENOSPC during the btree operations. We may * have to split the root node at most once. 
*/ err = hfs_bmap_reserve(src_fd.tree, 2 * src_fd.tree->depth); if (err) goto out; /* find the old dir entry and read the data */ hfs_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); err = hfs_brec_find(&src_fd); if (err) goto out; if (src_fd.entrylength > sizeof(entry) || src_fd.entrylength < 0) { err = -EIO; goto out; } hfs_bnode_read(src_fd.bnode, &entry, src_fd.entryoffset, src_fd.entrylength); /* create new dir entry with the data from the old entry */ hfs_cat_build_key(sb, dst_fd.search_key, dst_dir->i_ino, dst_name); err = hfs_brec_find(&dst_fd); if (err != -ENOENT) { if (!err) err = -EEXIST; goto out; } err = hfs_brec_insert(&dst_fd, &entry, src_fd.entrylength); if (err) goto out; dst_dir->i_size++; inode_set_mtime_to_ts(dst_dir, inode_set_ctime_current(dst_dir)); mark_inode_dirty(dst_dir); /* finally remove the old entry */ hfs_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); err = hfs_brec_find(&src_fd); if (err) goto out; err = hfs_brec_remove(&src_fd); if (err) goto out; src_dir->i_size--; inode_set_mtime_to_ts(src_dir, inode_set_ctime_current(src_dir)); mark_inode_dirty(src_dir); type = entry.type; if (type == HFS_CDR_FIL && !(entry.file.Flags & HFS_FIL_THD)) goto out; /* remove old thread entry */ hfs_cat_build_key(sb, src_fd.search_key, cnid, NULL); err = hfs_brec_find(&src_fd); if (err) goto out; err = hfs_brec_remove(&src_fd); if (err) goto out; /* create new thread entry */ hfs_cat_build_key(sb, dst_fd.search_key, cnid, NULL); entry_size = hfs_cat_build_thread(sb, &entry, type == HFS_CDR_FIL ? HFS_CDR_FTH : HFS_CDR_THD, dst_dir->i_ino, dst_name); err = hfs_brec_find(&dst_fd); if (err != -ENOENT) { if (!err) err = -EEXIST; goto out; } err = hfs_brec_insert(&dst_fd, &entry, entry_size); out: hfs_bnode_put(dst_fd.bnode); hfs_find_exit(&src_fd); return err; } |
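For context, here is a sketch of how a directory operation elsewhere in the hfs module typically drives hfs_cat_create(): allocate the in-core inode first, then insert the matching catalog record, and tear the inode down again if the insertion fails. The hfs_new_inode()/hfs_delete_inode() helpers and the exact error unwinding are modeled on fs/hfs/dir.c and are assumptions here rather than part of catalog.c.

/* Illustrative sketch of a typical hfs_cat_create() caller. */
static int example_hfs_create(struct inode *dir, struct dentry *dentry,
			      umode_t mode)
{
	struct inode *inode;
	int res;

	/* Allocate and initialize the in-core inode for the new file. */
	inode = hfs_new_inode(dir, &dentry->d_name, mode);
	if (!inode)
		return -ENOMEM;

	/* Add the catalog record (and thread record) for the new CNID. */
	res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode);
	if (res) {
		clear_nlink(inode);
		hfs_delete_inode(inode);
		iput(inode);
		return res;
	}
	d_instantiate(dentry, inode);
	mark_inode_dirty(inode);
	return 0;
}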
5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 
5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 
6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic hugetlb support.
 * (C) Nadia Yvette Chambers, April 2004
 */
#include <linux/list.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/seq_file.h>
#include <linux/highmem.h>
#include <linux/mmu_notifier.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/compiler.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/memblock.h>
#include <linux/minmax.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
#include <linux/string_choices.h>
#include <linux/string_helpers.h>
#include <linux/swap.h>
#include <linux/leafops.h>
#include <linux/jhash.h>
#include <linux/numa.h>
#include <linux/llist.h>
#include <linux/cma.h>
#include <linux/migrate.h>
#include <linux/nospec.h>
#include <linux/delayacct.h>
#include <linux/memory.h>
#include <linux/mm_inline.h>
#include <linux/padata.h>
#include <linux/pgalloc.h>

#include <asm/page.h>
#include <asm/tlb.h>
#include <asm/setup.h>

#include <linux/io.h>
#include <linux/node.h>
#include <linux/page_owner.h>
#include "internal.h"
#include "hugetlb_vmemmap.h"
#include "hugetlb_cma.h"
#include "hugetlb_internal.h"
#include <linux/page-isolation.h>

int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];

__initdata nodemask_t hugetlb_bootmem_nodes;
__initdata struct list_head huge_boot_pages[MAX_NUMNODES];
static unsigned long hstate_boot_nrinvalid[HUGE_MAX_HSTATE] __initdata;

/*
 * Due to ordering constraints across the init code for various
 * architectures, hugetlb hstate cmdline parameters can't simply
 * be early_param. early_param might call the setup function
 * before valid hugetlb page sizes are determined, leading to
 * incorrect rejection of valid hugepagesz= options.
 *
 * So, record the parameters early and consume them whenever the
 * init code is ready for them, by calling hugetlb_parse_params().
 */

/* one (hugepagesz=,hugepages=) pair per hstate, one default_hugepagesz */
#define HUGE_MAX_CMDLINE_ARGS	(2 * HUGE_MAX_HSTATE + 1)
struct hugetlb_cmdline {
	char *val;
	int (*setup)(char *val);
};

/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static bool __initdata parsed_valid_hugepagesz = true;
static bool __initdata parsed_default_hugepagesz;
static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata;
static unsigned long hugepage_allocation_threads __initdata;

static char hstate_cmdline_buf[COMMAND_LINE_SIZE] __initdata;
static int hstate_cmdline_index __initdata;
static struct hugetlb_cmdline hugetlb_params[HUGE_MAX_CMDLINE_ARGS] __initdata;
static int hugetlb_param_index __initdata;
static __init int hugetlb_add_param(char *s, int (*setup)(char *val));
static __init void hugetlb_parse_params(void);

#define hugetlb_early_param(str, func) \
static __init int func##args(char *s) \
{ \
	return hugetlb_add_param(s, func); \
} \
early_param(str, func##args)

/*
 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
 * free_huge_pages, and surplus_huge_pages.
*/ __cacheline_aligned_in_smp DEFINE_SPINLOCK(hugetlb_lock); /* * Serializes faults on the same logical page. This is used to * prevent spurious OOMs when the hugepage pool is fully utilized. */ static int num_fault_mutexes __ro_after_init; struct mutex *hugetlb_fault_mutex_table __ro_after_init; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); static void hugetlb_vma_lock_free(struct vm_area_struct *vma); static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool take_locks); static struct resv_map *vma_resv_map(struct vm_area_struct *vma); static inline bool subpool_is_free(struct hugepage_subpool *spool) { if (spool->count) return false; if (spool->max_hpages != -1) return spool->used_hpages == 0; if (spool->min_hpages != -1) return spool->rsv_hpages == spool->min_hpages; return true; } static inline void unlock_or_release_subpool(struct hugepage_subpool *spool, unsigned long irq_flags) { spin_unlock_irqrestore(&spool->lock, irq_flags); /* If no pages are used, and no other handles to the subpool * remain, give up any reservations based on minimum size and * free the subpool */ if (subpool_is_free(spool)) { if (spool->min_hpages != -1) hugetlb_acct_memory(spool->hstate, -spool->min_hpages); kfree(spool); } } struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, long min_hpages) { struct hugepage_subpool *spool; spool = kzalloc_obj(*spool); if (!spool) return NULL; spin_lock_init(&spool->lock); spool->count = 1; spool->max_hpages = max_hpages; spool->hstate = h; spool->min_hpages = min_hpages; if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) { kfree(spool); return NULL; } spool->rsv_hpages = min_hpages; return spool; } void hugepage_put_subpool(struct hugepage_subpool *spool) { unsigned long flags; spin_lock_irqsave(&spool->lock, flags); BUG_ON(!spool->count); spool->count--; unlock_or_release_subpool(spool, flags); } /* * Subpool accounting for allocating and reserving pages. * Return -ENOMEM if there are not enough resources to satisfy the * request. Otherwise, return the number of pages by which the * global pools must be adjusted (upward). The returned value may * only be different than the passed value (delta) in the case where * a subpool minimum size must be maintained. */ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, long delta) { long ret = delta; if (!spool) return ret; spin_lock_irq(&spool->lock); if (spool->max_hpages != -1) { /* maximum size accounting */ if ((spool->used_hpages + delta) <= spool->max_hpages) spool->used_hpages += delta; else { ret = -ENOMEM; goto unlock_ret; } } /* minimum size accounting */ if (spool->min_hpages != -1 && spool->rsv_hpages) { if (delta > spool->rsv_hpages) { /* * Asking for more reserves than those already taken on * behalf of subpool. Return difference. */ ret = delta - spool->rsv_hpages; spool->rsv_hpages = 0; } else { ret = 0; /* reserves already accounted for */ spool->rsv_hpages -= delta; } } unlock_ret: spin_unlock_irq(&spool->lock); return ret; } /* * Subpool accounting for freeing and unreserving pages. * Return the number of global page reservations that must be dropped. * The return value may only be different than the passed value (delta) * in the case where a subpool minimum size must be maintained. 
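 *
 * For example, with min_hpages == 8 and rsv_hpages == 6, putting back
 * delta == 4 pages while used_hpages has dropped below min_hpages tops
 * rsv_hpages back up to the minimum of 8 and returns 2: only the excess
 * beyond the subpool minimum releases global reservations.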
*/ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, long delta) { long ret = delta; unsigned long flags; if (!spool) return delta; spin_lock_irqsave(&spool->lock, flags); if (spool->max_hpages != -1) /* maximum size accounting */ spool->used_hpages -= delta; /* minimum size accounting */ if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) { if (spool->rsv_hpages + delta <= spool->min_hpages) ret = 0; else ret = spool->rsv_hpages + delta - spool->min_hpages; spool->rsv_hpages += delta; if (spool->rsv_hpages > spool->min_hpages) spool->rsv_hpages = spool->min_hpages; } /* * If hugetlbfs_put_super couldn't free spool due to an outstanding * quota reference, free it now. */ unlock_or_release_subpool(spool, flags); return ret; } static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) { return subpool_inode(file_inode(vma->vm_file)); } /* * hugetlb vma_lock helper routines */ void hugetlb_vma_lock_read(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; down_read(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); down_read(&resv_map->rw_sema); } } void hugetlb_vma_unlock_read(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; up_read(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); up_read(&resv_map->rw_sema); } } void hugetlb_vma_lock_write(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; down_write(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); down_write(&resv_map->rw_sema); } } void hugetlb_vma_unlock_write(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; up_write(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); up_write(&resv_map->rw_sema); } } int hugetlb_vma_trylock_write(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; return down_write_trylock(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); return down_write_trylock(&resv_map->rw_sema); } return 1; } void hugetlb_vma_assert_locked(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; lockdep_assert_held(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); lockdep_assert_held(&resv_map->rw_sema); } } void hugetlb_vma_lock_release(struct kref *kref) { struct hugetlb_vma_lock *vma_lock = container_of(kref, struct hugetlb_vma_lock, refs); kfree(vma_lock); } static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock) { struct vm_area_struct *vma = vma_lock->vma; /* * vma_lock structure may or not be released as a result of put, * it certainly will no longer be attached to vma so clear pointer. * Semaphore synchronizes access to vma_lock->vma field. 
*/ vma_lock->vma = NULL; vma->vm_private_data = NULL; up_write(&vma_lock->rw_sema); kref_put(&vma_lock->refs, hugetlb_vma_lock_release); } static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; __hugetlb_vma_unlock_write_put(vma_lock); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); /* no free for anon vmas, but still need to unlock */ up_write(&resv_map->rw_sema); } } static void hugetlb_vma_lock_free(struct vm_area_struct *vma) { /* * Only present in sharable vmas. */ if (!vma || !__vma_shareable_lock(vma)) return; if (vma->vm_private_data) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; down_write(&vma_lock->rw_sema); __hugetlb_vma_unlock_write_put(vma_lock); } } /* * vma specific semaphore used for pmd sharing and fault/truncation * synchronization */ int hugetlb_vma_lock_alloc(struct vm_area_struct *vma) { struct hugetlb_vma_lock *vma_lock; /* Only establish in (flags) sharable vmas */ if (!vma || !(vma->vm_flags & VM_MAYSHARE)) return 0; /* Should never get here with non-NULL vm_private_data */ if (vma->vm_private_data) return -EINVAL; vma_lock = kmalloc_obj(*vma_lock); if (!vma_lock) { /* * If we can not allocate structure, then vma can not * participate in pmd sharing. This is only a possible * performance enhancement and memory saving issue. * However, the lock is also used to synchronize page * faults with truncation. If the lock is not present, * unlikely races could leave pages in a file past i_size * until the file is removed. Warn in the unlikely case of * allocation failure. */ pr_warn_once("HugeTLB: unable to allocate vma specific lock\n"); return -EINVAL; } kref_init(&vma_lock->refs); init_rwsem(&vma_lock->rw_sema); vma_lock->vma = vma; vma->vm_private_data = vma_lock; return 0; } /* Helper that removes a struct file_region from the resv_map cache and returns * it for use. */ static struct file_region * get_file_region_entry_from_cache(struct resv_map *resv, long from, long to) { struct file_region *nrg; VM_BUG_ON(resv->region_cache_count <= 0); resv->region_cache_count--; nrg = list_first_entry(&resv->region_cache, struct file_region, link); list_del(&nrg->link); nrg->from = from; nrg->to = to; return nrg; } static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg, struct file_region *rg) { #ifdef CONFIG_CGROUP_HUGETLB nrg->reservation_counter = rg->reservation_counter; nrg->css = rg->css; if (rg->css) css_get(rg->css); #endif } /* Helper that records hugetlb_cgroup uncharge info. */ static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg, struct hstate *h, struct resv_map *resv, struct file_region *nrg) { #ifdef CONFIG_CGROUP_HUGETLB if (h_cg) { nrg->reservation_counter = &h_cg->rsvd_hugepage[hstate_index(h)]; nrg->css = &h_cg->css; /* * The caller will hold exactly one h_cg->css reference for the * whole contiguous reservation region. But this area might be * scattered when there are already some file_regions reside in * it. As a result, many file_regions may share only one css * reference. In order to ensure that one file_region must hold * exactly one h_cg->css reference, we should do css_get for * each file_region and leave the reference held by caller * untouched. */ css_get(&h_cg->css); if (!resv->pages_per_hpage) resv->pages_per_hpage = pages_per_huge_page(h); /* pages_per_hpage should be the same for all entries in * a resv_map. 
*/ VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h)); } else { nrg->reservation_counter = NULL; nrg->css = NULL; } #endif } static void put_uncharge_info(struct file_region *rg) { #ifdef CONFIG_CGROUP_HUGETLB if (rg->css) css_put(rg->css); #endif } static bool has_same_uncharge_info(struct file_region *rg, struct file_region *org) { #ifdef CONFIG_CGROUP_HUGETLB return rg->reservation_counter == org->reservation_counter && rg->css == org->css; #else return true; #endif } static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) { struct file_region *nrg, *prg; prg = list_prev_entry(rg, link); if (&prg->link != &resv->regions && prg->to == rg->from && has_same_uncharge_info(prg, rg)) { prg->to = rg->to; list_del(&rg->link); put_uncharge_info(rg); kfree(rg); rg = prg; } nrg = list_next_entry(rg, link); if (&nrg->link != &resv->regions && nrg->from == rg->to && has_same_uncharge_info(nrg, rg)) { nrg->from = rg->from; list_del(&rg->link); put_uncharge_info(rg); kfree(rg); } } static inline long hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from, long to, struct hstate *h, struct hugetlb_cgroup *cg, long *regions_needed) { struct file_region *nrg; if (!regions_needed) { nrg = get_file_region_entry_from_cache(map, from, to); record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg); list_add(&nrg->link, rg); coalesce_file_region(map, nrg); } else { *regions_needed += 1; } return to - from; } /* * Must be called with resv->lock held. * * Calling this with regions_needed != NULL will count the number of pages * to be added but will not modify the linked list. And regions_needed will * indicate the number of file_regions needed in the cache to carry out to add * the regions for this range. */ static long add_reservation_in_range(struct resv_map *resv, long f, long t, struct hugetlb_cgroup *h_cg, struct hstate *h, long *regions_needed) { long add = 0; struct list_head *head = &resv->regions; long last_accounted_offset = f; struct file_region *iter, *trg = NULL; struct list_head *rg = NULL; if (regions_needed) *regions_needed = 0; /* In this loop, we essentially handle an entry for the range * [last_accounted_offset, iter->from), at every iteration, with some * bounds checking. */ list_for_each_entry_safe(iter, trg, head, link) { /* Skip irrelevant regions that start before our range. */ if (iter->from < f) { /* If this region ends after the last accounted offset, * then we need to update last_accounted_offset. */ if (iter->to > last_accounted_offset) last_accounted_offset = iter->to; continue; } /* When we find a region that starts beyond our range, we've * finished. */ if (iter->from >= t) { rg = iter->link.prev; break; } /* Add an entry for last_accounted_offset -> iter->from, and * update last_accounted_offset. */ if (iter->from > last_accounted_offset) add += hugetlb_resv_map_add(resv, iter->link.prev, last_accounted_offset, iter->from, h, h_cg, regions_needed); last_accounted_offset = iter->to; } /* Handle the case where our range extends beyond * last_accounted_offset. */ if (!rg) rg = head->prev; if (last_accounted_offset < t) add += hugetlb_resv_map_add(resv, rg, last_accounted_offset, t, h, h_cg, regions_needed); return add; } /* Must be called with resv->lock acquired. Will drop lock to allocate entries. 
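 *
 * The cache must end up holding at least (adds_in_progress +
 * regions_needed) entries; e.g. with adds_in_progress == 2 and
 * regions_needed == 3 the loop below keeps allocating, re-checking each
 * time the lock is retaken, until region_cache_count reaches 5.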
*/ static int allocate_file_region_entries(struct resv_map *resv, int regions_needed) __must_hold(&resv->lock) { LIST_HEAD(allocated_regions); int to_allocate = 0, i = 0; struct file_region *trg = NULL, *rg = NULL; VM_BUG_ON(regions_needed < 0); /* * Check for sufficient descriptors in the cache to accommodate * the number of in progress add operations plus regions_needed. * * This is a while loop because when we drop the lock, some other call * to region_add or region_del may have consumed some region_entries, * so we keep looping here until we finally have enough entries for * (adds_in_progress + regions_needed). */ while (resv->region_cache_count < (resv->adds_in_progress + regions_needed)) { to_allocate = resv->adds_in_progress + regions_needed - resv->region_cache_count; /* At this point, we should have enough entries in the cache * for all the existing adds_in_progress. We should only be * needing to allocate for regions_needed. */ VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress); spin_unlock(&resv->lock); for (i = 0; i < to_allocate; i++) { trg = kmalloc_obj(*trg); if (!trg) goto out_of_memory; list_add(&trg->link, &allocated_regions); } spin_lock(&resv->lock); list_splice(&allocated_regions, &resv->region_cache); resv->region_cache_count += to_allocate; } return 0; out_of_memory: list_for_each_entry_safe(rg, trg, &allocated_regions, link) { list_del(&rg->link); kfree(rg); } return -ENOMEM; } /* * Add the huge page range represented by [f, t) to the reserve * map. Regions will be taken from the cache to fill in this range. * Sufficient regions should exist in the cache due to the previous * call to region_chg with the same range, but in some cases the cache will not * have sufficient entries due to races with other code doing region_add or * region_del. The extra needed entries will be allocated. * * regions_needed is the out value provided by a previous call to region_chg. * * Return the number of new huge pages added to the map. This number is greater * than or equal to zero. If file_region entries needed to be allocated for * this operation and we were not able to allocate, it returns -ENOMEM. * region_add of regions of length 1 never allocate file_regions and cannot * fail; region_chg will always allocate at least 1 entry and a region_add for * 1 page will only require at most 1 entry. */ static long region_add(struct resv_map *resv, long f, long t, long in_regions_needed, struct hstate *h, struct hugetlb_cgroup *h_cg) { long add = 0, actual_regions_needed = 0; spin_lock(&resv->lock); retry: /* Count how many regions are actually needed to execute this add. */ add_reservation_in_range(resv, f, t, NULL, NULL, &actual_regions_needed); /* * Check for sufficient descriptors in the cache to accommodate * this add operation. Note that actual_regions_needed may be greater * than in_regions_needed, as the resv_map may have been modified since * the region_chg call. In this case, we need to make sure that we * allocate extra entries, such that we have enough for all the * existing adds_in_progress, plus the excess needed for this * operation. */ if (actual_regions_needed > in_regions_needed && resv->region_cache_count < resv->adds_in_progress + (actual_regions_needed - in_regions_needed)) { /* region_add operation of range 1 should never need to * allocate file_region entries. 
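 * (region_chg leaves at least one cached entry behind and a single-page
 * add consumes at most one, so this slow path is only reachable when
 * t - f > 1, as the assertion below checks.)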
*/ VM_BUG_ON(t - f <= 1); if (allocate_file_region_entries( resv, actual_regions_needed - in_regions_needed)) { return -ENOMEM; } goto retry; } add = add_reservation_in_range(resv, f, t, h_cg, h, NULL); resv->adds_in_progress -= in_regions_needed; spin_unlock(&resv->lock); return add; } /* * Examine the existing reserve map and determine how many * huge pages in the specified range [f, t) are NOT currently * represented. This routine is called before a subsequent * call to region_add that will actually modify the reserve * map to add the specified range [f, t). region_chg does * not change the number of huge pages represented by the * map. A number of new file_region structures is added to the cache as a * placeholder, for the subsequent region_add call to use. At least 1 * file_region structure is added. * * out_regions_needed is the number of regions added to the * resv->adds_in_progress. This value needs to be provided to a follow up call * to region_add or region_abort for proper accounting. * * Returns the number of huge pages that need to be added to the existing * reservation map for the range [f, t). This number is greater or equal to * zero. -ENOMEM is returned if a new file_region structure or cache entry * is needed and can not be allocated. */ static long region_chg(struct resv_map *resv, long f, long t, long *out_regions_needed) { long chg = 0; spin_lock(&resv->lock); /* Count how many hugepages in this range are NOT represented. */ chg = add_reservation_in_range(resv, f, t, NULL, NULL, out_regions_needed); if (*out_regions_needed == 0) *out_regions_needed = 1; if (allocate_file_region_entries(resv, *out_regions_needed)) return -ENOMEM; resv->adds_in_progress += *out_regions_needed; spin_unlock(&resv->lock); return chg; } /* * Abort the in progress add operation. The adds_in_progress field * of the resv_map keeps track of the operations in progress between * calls to region_chg and region_add. Operations are sometimes * aborted after the call to region_chg. In such cases, region_abort * is called to decrement the adds_in_progress counter. regions_needed * is the value returned by the region_chg call, it is used to decrement * the adds_in_progress counter. * * NOTE: The range arguments [f, t) are not needed or used in this * routine. They are kept to make reading the calling code easier as * arguments will match the associated region_chg call. */ static void region_abort(struct resv_map *resv, long f, long t, long regions_needed) { spin_lock(&resv->lock); VM_BUG_ON(!resv->region_cache_count); resv->adds_in_progress -= regions_needed; spin_unlock(&resv->lock); } /* * Delete the specified range [f, t) from the reserve map. If the * t parameter is LONG_MAX, this indicates that ALL regions after f * should be deleted. Locate the regions which intersect [f, t) * and either trim, delete or split the existing regions. * * Returns the number of huge pages deleted from the reserve map. * In the normal case, the return value is zero or more. In the * case where a region must be split, a new region descriptor must * be allocated. If the allocation fails, -ENOMEM will be returned. * NOTE: If the parameter t == LONG_MAX, then we will never split * a region and possibly return -ENOMEM. Callers specifying * t == LONG_MAX do not need to check for -ENOMEM error. 
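 *
 * For example, deleting [3, 6) from a map holding the single region
 * [0, 10) trims that region to [0, 3), consumes one extra file_region
 * (taken from the cache or freshly allocated) for [6, 10), and returns 3.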
*/ static long region_del(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; struct file_region *rg, *trg; struct file_region *nrg = NULL; long del = 0; retry: spin_lock(&resv->lock); list_for_each_entry_safe(rg, trg, head, link) { /* * Skip regions before the range to be deleted. file_region * ranges are normally of the form [from, to). However, there * may be a "placeholder" entry in the map which is of the form * (from, to) with from == to. Check for placeholder entries * at the beginning of the range to be deleted. */ if (rg->to <= f && (rg->to != rg->from || rg->to != f)) continue; if (rg->from >= t) break; if (f > rg->from && t < rg->to) { /* Must split region */ /* * Check for an entry in the cache before dropping * lock and attempting allocation. */ if (!nrg && resv->region_cache_count > resv->adds_in_progress) { nrg = list_first_entry(&resv->region_cache, struct file_region, link); list_del(&nrg->link); resv->region_cache_count--; } if (!nrg) { spin_unlock(&resv->lock); nrg = kmalloc_obj(*nrg); if (!nrg) return -ENOMEM; goto retry; } del += t - f; hugetlb_cgroup_uncharge_file_region( resv, rg, t - f, false); /* New entry for end of split region */ nrg->from = t; nrg->to = rg->to; copy_hugetlb_cgroup_uncharge_info(nrg, rg); INIT_LIST_HEAD(&nrg->link); /* Original entry is trimmed */ rg->to = f; list_add(&nrg->link, &rg->link); nrg = NULL; break; } if (f <= rg->from && t >= rg->to) { /* Remove entire region */ del += rg->to - rg->from; hugetlb_cgroup_uncharge_file_region(resv, rg, rg->to - rg->from, true); list_del(&rg->link); kfree(rg); continue; } if (f <= rg->from) { /* Trim beginning of region */ hugetlb_cgroup_uncharge_file_region(resv, rg, t - rg->from, false); del += t - rg->from; rg->from = t; } else { /* Trim end of region */ hugetlb_cgroup_uncharge_file_region(resv, rg, rg->to - f, false); del += rg->to - f; rg->to = f; } } spin_unlock(&resv->lock); kfree(nrg); return del; } /* * A rare out of memory error was encountered which prevented removal of * the reserve map region for a page. The huge page itself was free'ed * and removed from the page cache. This routine will adjust the subpool * usage count, and the global reserve count if needed. By incrementing * these counts, the reserve map entry which could not be deleted will * appear as a "reserved" entry instead of simply dangling with incorrect * counts. */ void hugetlb_fix_reserve_counts(struct inode *inode) { struct hugepage_subpool *spool = subpool_inode(inode); long rsv_adjust; bool reserved = false; rsv_adjust = hugepage_subpool_get_pages(spool, 1); if (rsv_adjust > 0) { struct hstate *h = hstate_inode(inode); if (!hugetlb_acct_memory(h, 1)) reserved = true; } else if (!rsv_adjust) { reserved = true; } if (!reserved) pr_warn("hugetlb: Huge Page Reserved count may go negative.\n"); } /* * Count and return the number of huge pages in the reserve map * that intersect with the range [f, t). */ static long region_count(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; struct file_region *rg; long chg = 0; spin_lock(&resv->lock); /* Locate each segment we overlap with, and count that overlap. */ list_for_each_entry(rg, head, link) { long seg_from; long seg_to; if (rg->to <= f) continue; if (rg->from >= t) break; seg_from = max(rg->from, f); seg_to = min(rg->to, t); chg += seg_to - seg_from; } spin_unlock(&resv->lock); return chg; } /* * Convert the address within this vma to the page offset within * the mapping, huge page units here. 
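 *
 * For example, with 2MB huge pages on a 4K base page size, vm_start at
 * 0x40000000 and vm_pgoff == 512 (2MB into the file in base pages), an
 * address 4MB past vm_start yields huge page offset 2 + 1 = 3.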
*/ static pgoff_t vma_hugecache_offset(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { return ((address - vma->vm_start) >> huge_page_shift(h)) + (vma->vm_pgoff >> huge_page_order(h)); } /* * Flags for MAP_PRIVATE reservations. These are stored in the bottom * bits of the reservation map pointer, which are always clear due to * alignment. */ #define HPAGE_RESV_OWNER (1UL << 0) #define HPAGE_RESV_UNMAPPED (1UL << 1) #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) /* * These helpers are used to track how many pages are reserved for * faults in a MAP_PRIVATE mapping. Only the process that called mmap() * is guaranteed to have their future faults succeed. * * With the exception of hugetlb_dup_vma_private() which is called at fork(), * the reserve counters are updated with the hugetlb_lock held. It is safe * to reset the VMA at fork() time as it is not in use yet and there is no * chance of the global counters getting corrupted as a result of the values. * * The private mapping reservation is represented in a subtly different * manner to a shared mapping. A shared mapping has a region map associated * with the underlying file, this region map represents the backing file * pages which have ever had a reservation assigned which this persists even * after the page is instantiated. A private mapping has a region map * associated with the original mmap which is attached to all VMAs which * reference it, this region map represents those offsets which have consumed * reservation ie. where pages have been instantiated. */ static unsigned long get_vma_private_data(struct vm_area_struct *vma) { return (unsigned long)vma->vm_private_data; } static void set_vma_private_data(struct vm_area_struct *vma, unsigned long value) { vma->vm_private_data = (void *)value; } static void resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map, struct hugetlb_cgroup *h_cg, struct hstate *h) { #ifdef CONFIG_CGROUP_HUGETLB if (!h_cg || !h) { resv_map->reservation_counter = NULL; resv_map->pages_per_hpage = 0; resv_map->css = NULL; } else { resv_map->reservation_counter = &h_cg->rsvd_hugepage[hstate_index(h)]; resv_map->pages_per_hpage = pages_per_huge_page(h); resv_map->css = &h_cg->css; } #endif } struct resv_map *resv_map_alloc(void) { struct resv_map *resv_map = kmalloc_obj(*resv_map); struct file_region *rg = kmalloc_obj(*rg); if (!resv_map || !rg) { kfree(resv_map); kfree(rg); return NULL; } kref_init(&resv_map->refs); spin_lock_init(&resv_map->lock); INIT_LIST_HEAD(&resv_map->regions); init_rwsem(&resv_map->rw_sema); resv_map->adds_in_progress = 0; /* * Initialize these to 0. On shared mappings, 0's here indicate these * fields don't do cgroup accounting. On private mappings, these will be * re-initialized to the proper values, to indicate that hugetlb cgroup * reservations are to be un-charged from here. */ resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL); INIT_LIST_HEAD(&resv_map->region_cache); list_add(&rg->link, &resv_map->region_cache); resv_map->region_cache_count = 1; return resv_map; } void resv_map_release(struct kref *ref) { struct resv_map *resv_map = container_of(ref, struct resv_map, refs); struct list_head *head = &resv_map->region_cache; struct file_region *rg, *trg; /* Clear out any active regions before we release the map. */ region_del(resv_map, 0, LONG_MAX); /* ... 
and any entries left in the cache */ list_for_each_entry_safe(rg, trg, head, link) { list_del(&rg->link); kfree(rg); } VM_BUG_ON(resv_map->adds_in_progress); kfree(resv_map); } static inline struct resv_map *inode_resv_map(struct inode *inode) { return HUGETLBFS_I(inode)->resv_map; } static struct resv_map *vma_resv_map(struct vm_area_struct *vma) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); if (vma->vm_flags & VM_MAYSHARE) { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; return inode_resv_map(inode); } else { return (struct resv_map *)(get_vma_private_data(vma) & ~HPAGE_RESV_MASK); } } static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) { VM_WARN_ON_ONCE_VMA(!is_vm_hugetlb_page(vma), vma); VM_WARN_ON_ONCE_VMA(vma->vm_flags & VM_MAYSHARE, vma); set_vma_private_data(vma, get_vma_private_data(vma) | flags); } static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map) { VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); VM_WARN_ON_ONCE(vma_desc_test(desc, VMA_MAYSHARE_BIT)); desc->private_data = map; } static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags) { VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); VM_WARN_ON_ONCE(vma_desc_test(desc, VMA_MAYSHARE_BIT)); desc->private_data = (void *)((unsigned long)desc->private_data | flags); } static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); return (get_vma_private_data(vma) & flag) != 0; } static bool is_vma_desc_resv_set(struct vm_area_desc *desc, unsigned long flag) { VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); return ((unsigned long)desc->private_data) & flag; } bool __vma_private_lock(struct vm_area_struct *vma) { return !(vma->vm_flags & VM_MAYSHARE) && get_vma_private_data(vma) & ~HPAGE_RESV_MASK && is_vma_resv_set(vma, HPAGE_RESV_OWNER); } void hugetlb_dup_vma_private(struct vm_area_struct *vma) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); /* * Clear vm_private_data * - For shared mappings this is a per-vma semaphore that may be * allocated in a subsequent call to hugetlb_vm_op_open. * Before clearing, make sure pointer is not associated with vma * as this will leak the structure. This is the case when called * via clear_vma_resv_huge_pages() and hugetlb_vm_op_open has already * been called to allocate a new structure. * - For MAP_PRIVATE mappings, this is the reserve map which does * not apply to children. Faults generated by the children are * not guaranteed to succeed, even if read-only. */ if (vma->vm_flags & VM_MAYSHARE) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; if (vma_lock && vma_lock->vma != vma) vma->vm_private_data = NULL; } else { vma->vm_private_data = NULL; } } /* * Reset and decrement one ref on hugepage private reservation. * Called with mm->mmap_lock writer semaphore held. * This function should be only used by mremap and operate on * same sized vma. It should never come here with last ref on the * reservation. */ void clear_vma_resv_huge_pages(struct vm_area_struct *vma) { /* * Clear the old hugetlb private page reservation. * It has already been transferred to new_vma. * * During a mremap() operation of a hugetlb vma we call move_vma() * which copies vma into new_vma and unmaps vma. After the copy * operation both new_vma and vma share a reference to the resv_map * struct, and at that point vma is about to be unmapped. 
We don't * want to return the reservation to the pool at unmap of vma because * the reservation still lives on in new_vma, so simply decrement the * ref here and remove the resv_map reference from this vma. */ struct resv_map *reservations = vma_resv_map(vma); if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { resv_map_put_hugetlb_cgroup_uncharge_info(reservations); kref_put(&reservations->refs, resv_map_release); } hugetlb_dup_vma_private(vma); } static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio) { int nid = folio_nid(folio); lockdep_assert_held(&hugetlb_lock); VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); list_move(&folio->lru, &h->hugepage_freelists[nid]); h->free_huge_pages++; h->free_huge_pages_node[nid]++; folio_set_hugetlb_freed(folio); } static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h, int nid) { struct folio *folio; bool pin = !!(current->flags & PF_MEMALLOC_PIN); lockdep_assert_held(&hugetlb_lock); list_for_each_entry(folio, &h->hugepage_freelists[nid], lru) { if (pin && !folio_is_longterm_pinnable(folio)) continue; if (folio_test_hwpoison(folio)) continue; if (is_migrate_isolate_page(&folio->page)) continue; list_move(&folio->lru, &h->hugepage_activelist); folio_ref_unfreeze(folio, 1); folio_clear_hugetlb_freed(folio); h->free_huge_pages--; h->free_huge_pages_node[nid]--; return folio; } return NULL; } static struct folio *dequeue_hugetlb_folio_nodemask(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { unsigned int cpuset_mems_cookie; struct zonelist *zonelist; struct zone *zone; struct zoneref *z; int node = NUMA_NO_NODE; /* 'nid' should not be NUMA_NO_NODE. Try to catch any misuse of it and rectifiy. */ if (nid == NUMA_NO_NODE) nid = numa_node_id(); zonelist = node_zonelist(nid, gfp_mask); retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) { struct folio *folio; if (!cpuset_zone_allowed(zone, gfp_mask)) continue; /* * no need to ask again on the same node. Pool is node rather than * zone aware */ if (zone_to_nid(zone) == node) continue; node = zone_to_nid(zone); folio = dequeue_hugetlb_folio_node_exact(h, node); if (folio) return folio; } if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return NULL; } static unsigned long available_huge_pages(struct hstate *h) { return h->free_huge_pages - h->resv_huge_pages; } static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, long gbl_chg) { struct folio *folio = NULL; struct mempolicy *mpol; gfp_t gfp_mask; nodemask_t *nodemask; int nid; /* * gbl_chg==1 means the allocation requires a new page that was not * reserved before. Making sure there's at least one free page. 
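 * available_huge_pages() is free_huge_pages - resv_huge_pages, so an
 * allocation arriving without a reservation must not dip into pages
 * that back someone else's reservation.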
*/ if (gbl_chg && !available_huge_pages(h)) goto err; gfp_mask = htlb_alloc_mask(h); nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); if (mpol_is_preferred_many(mpol)) { folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, nid, nodemask); /* Fallback to all nodes if page==NULL */ nodemask = NULL; } if (!folio) folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); return folio; err: return NULL; } #if defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) && defined(CONFIG_CONTIG_ALLOC) static struct folio *alloc_gigantic_frozen_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { struct folio *folio; folio = hugetlb_cma_alloc_frozen_folio(order, gfp_mask, nid, nodemask); if (folio) return folio; if (hugetlb_cma_exclusive_alloc()) return NULL; folio = (struct folio *)alloc_contig_frozen_pages(1 << order, gfp_mask, nid, nodemask); return folio; } #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE || !CONFIG_CONTIG_ALLOC */ static struct folio *alloc_gigantic_frozen_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { return NULL; } #endif /* * Remove hugetlb folio from lists. * If vmemmap exists for the folio, clear the hugetlb flag so that the * folio appears as just a compound page. Otherwise, wait until after * allocating vmemmap to clear the flag. * * Must be called with hugetlb lock held. */ void remove_hugetlb_folio(struct hstate *h, struct folio *folio, bool adjust_surplus) { int nid = folio_nid(folio); VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio(folio), folio); VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio); lockdep_assert_held(&hugetlb_lock); if (hstate_is_gigantic_no_runtime(h)) return; list_del(&folio->lru); if (folio_test_hugetlb_freed(folio)) { folio_clear_hugetlb_freed(folio); h->free_huge_pages--; h->free_huge_pages_node[nid]--; } if (adjust_surplus) { h->surplus_huge_pages--; h->surplus_huge_pages_node[nid]--; } /* * We can only clear the hugetlb flag after allocating vmemmap * pages. Otherwise, someone (memory error handling) may try to write * to tail struct pages. */ if (!folio_test_hugetlb_vmemmap_optimized(folio)) __folio_clear_hugetlb(folio); h->nr_huge_pages--; h->nr_huge_pages_node[nid]--; } void add_hugetlb_folio(struct hstate *h, struct folio *folio, bool adjust_surplus) { int nid = folio_nid(folio); VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio); lockdep_assert_held(&hugetlb_lock); INIT_LIST_HEAD(&folio->lru); h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; if (adjust_surplus) { h->surplus_huge_pages++; h->surplus_huge_pages_node[nid]++; } __folio_set_hugetlb(folio); folio_change_private(folio, NULL); /* * We have to set hugetlb_vmemmap_optimized again as above * folio_change_private(folio, NULL) cleared it. */ folio_set_hugetlb_vmemmap_optimized(folio); arch_clear_hugetlb_flags(folio); enqueue_hugetlb_folio(h, folio); } static void __update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio) { bool clear_flag = folio_test_hugetlb_vmemmap_optimized(folio); if (hstate_is_gigantic_no_runtime(h)) return; /* * If we don't know which subpages are hwpoisoned, we can't free * the hugepage, so it's leaked intentionally. */ if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return; /* * If folio is not vmemmap optimized (!clear_flag), then the folio * is no longer identified as a hugetlb page. hugetlb_vmemmap_restore_folio * can only be passed hugetlb pages and will BUG otherwise. 
*/ if (clear_flag && hugetlb_vmemmap_restore_folio(h, folio)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the * page and put the page back on the hugetlb free list and treat * as a surplus page. */ add_hugetlb_folio(h, folio, true); spin_unlock_irq(&hugetlb_lock); return; } /* * If vmemmap pages were allocated above, then we need to clear the * hugetlb flag under the hugetlb lock. */ if (folio_test_hugetlb(folio)) { spin_lock_irq(&hugetlb_lock); __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); } /* * Move PageHWPoison flag from head page to the raw error pages, * which makes any healthy subpages reusable. */ if (unlikely(folio_test_hwpoison(folio))) folio_clear_hugetlb_hwpoison(folio); VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); if (folio_test_hugetlb_cma(folio)) hugetlb_cma_free_frozen_folio(folio); else free_frozen_pages(&folio->page, folio_order(folio)); } /* * As update_and_free_hugetlb_folio() can be called under any context, so we cannot * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate * the vmemmap pages. * * free_hpage_workfn() locklessly retrieves the linked list of pages to be * freed and frees them one-by-one. As the page->mapping pointer is going * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node * structure of a lockless linked list of huge pages to be freed. */ static LLIST_HEAD(hpage_freelist); static void free_hpage_workfn(struct work_struct *work) { struct llist_node *node; node = llist_del_all(&hpage_freelist); while (node) { struct folio *folio; struct hstate *h; folio = container_of((struct address_space **)node, struct folio, mapping); node = node->next; folio->mapping = NULL; /* * The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in * folio_hstate() is going to trigger because a previous call to * remove_hugetlb_folio() will clear the hugetlb bit, so do * not use folio_hstate() directly. */ h = size_to_hstate(folio_size(folio)); __update_and_free_hugetlb_folio(h, folio); cond_resched(); } } static DECLARE_WORK(free_hpage_work, free_hpage_workfn); static inline void flush_free_hpage_work(struct hstate *h) { if (hugetlb_vmemmap_optimizable(h)) flush_work(&free_hpage_work); } static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio, bool atomic) { if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) { __update_and_free_hugetlb_folio(h, folio); return; } /* * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages. * * Only call schedule_work() if hpage_freelist is previously * empty. Otherwise, schedule_work() had been called but the workfn * hasn't retrieved the list yet. */ if (llist_add((struct llist_node *)&folio->mapping, &hpage_freelist)) schedule_work(&free_hpage_work); } static void bulk_vmemmap_restore_error(struct hstate *h, struct list_head *folio_list, struct list_head *non_hvo_folios) { struct folio *folio, *t_folio; if (!list_empty(non_hvo_folios)) { /* * Free any restored hugetlb pages so that restore of the * entire list can be retried. * The idea is that in the common case of ENOMEM errors freeing * hugetlb pages with vmemmap we will free up memory so that we * can allocate vmemmap for more hugetlb pages. 
*/ list_for_each_entry_safe(folio, t_folio, non_hvo_folios, lru) { list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); update_and_free_hugetlb_folio(h, folio, false); cond_resched(); } } else { /* * In the case where there are no folios which can be * immediately freed, we loop through the list trying to restore * vmemmap individually in the hope that someone elsewhere may * have done something to cause success (such as freeing some * memory). If unable to restore a hugetlb page, the hugetlb * page is made a surplus page and removed from the list. * If are able to restore vmemmap and free one hugetlb page, we * quit processing the list to retry the bulk operation. */ list_for_each_entry_safe(folio, t_folio, folio_list, lru) if (hugetlb_vmemmap_restore_folio(h, folio)) { list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); add_hugetlb_folio(h, folio, true); spin_unlock_irq(&hugetlb_lock); } else { list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); update_and_free_hugetlb_folio(h, folio, false); cond_resched(); break; } } } static void update_and_free_pages_bulk(struct hstate *h, struct list_head *folio_list) { long ret; struct folio *folio, *t_folio; LIST_HEAD(non_hvo_folios); /* * First allocate required vmemmmap (if necessary) for all folios. * Carefully handle errors and free up any available hugetlb pages * in an effort to make forward progress. */ retry: ret = hugetlb_vmemmap_restore_folios(h, folio_list, &non_hvo_folios); if (ret < 0) { bulk_vmemmap_restore_error(h, folio_list, &non_hvo_folios); goto retry; } /* * At this point, list should be empty, ret should be >= 0 and there * should only be pages on the non_hvo_folios list. * Do note that the non_hvo_folios list could be empty. * Without HVO enabled, ret will be 0 and there is no need to call * __folio_clear_hugetlb as this was done previously. */ VM_WARN_ON(!list_empty(folio_list)); VM_WARN_ON(ret < 0); if (!list_empty(&non_hvo_folios) && ret) { spin_lock_irq(&hugetlb_lock); list_for_each_entry(folio, &non_hvo_folios, lru) __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); } list_for_each_entry_safe(folio, t_folio, &non_hvo_folios, lru) { update_and_free_hugetlb_folio(h, folio, false); cond_resched(); } } struct hstate *size_to_hstate(unsigned long size) { struct hstate *h; for_each_hstate(h) { if (huge_page_size(h) == size) return h; } return NULL; } void free_huge_folio(struct folio *folio) { /* * Can't pass hstate in here because it is called from the * generic mm code. */ struct hstate *h = folio_hstate(folio); int nid = folio_nid(folio); struct hugepage_subpool *spool = hugetlb_folio_subpool(folio); bool restore_reserve; unsigned long flags; VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); VM_BUG_ON_FOLIO(folio_mapcount(folio), folio); hugetlb_set_folio_subpool(folio, NULL); if (folio_test_anon(folio)) __ClearPageAnonExclusive(&folio->page); folio->mapping = NULL; restore_reserve = folio_test_hugetlb_restore_reserve(folio); folio_clear_hugetlb_restore_reserve(folio); /* * If HPageRestoreReserve was set on page, page allocation consumed a * reservation. If the page was associated with a subpool, there * would have been a page reserved in the subpool before allocation * via hugepage_subpool_get_pages(). Since we are 'restoring' the * reservation, do not call hugepage_subpool_put_pages() as this will * remove the reserved page from the subpool. 
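* Instead, the global resv_huge_pages count is incremented further below so the reservation can be consumed again by a later allocation.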
*/ if (!restore_reserve) { /* * A return code of zero implies that the subpool will be * under its minimum size if the reservation is not restored * after page is free. Therefore, force restore_reserve * operation. */ if (hugepage_subpool_put_pages(spool, 1) == 0) restore_reserve = true; } spin_lock_irqsave(&hugetlb_lock, flags); folio_clear_hugetlb_migratable(folio); hugetlb_cgroup_uncharge_folio(hstate_index(h), pages_per_huge_page(h), folio); hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); lruvec_stat_mod_folio(folio, NR_HUGETLB, -pages_per_huge_page(h)); mem_cgroup_uncharge(folio); if (restore_reserve) h->resv_huge_pages++; if (folio_test_hugetlb_temporary(folio)) { remove_hugetlb_folio(h, folio, false); spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_hugetlb_folio(h, folio, true); } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ remove_hugetlb_folio(h, folio, true); spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_hugetlb_folio(h, folio, true); } else { arch_clear_hugetlb_flags(folio); enqueue_hugetlb_folio(h, folio); spin_unlock_irqrestore(&hugetlb_lock, flags); } } /* * Must be called with the hugetlb lock held */ static void account_new_hugetlb_folio(struct hstate *h, struct folio *folio) { lockdep_assert_held(&hugetlb_lock); h->nr_huge_pages++; h->nr_huge_pages_node[folio_nid(folio)]++; } void init_new_hugetlb_folio(struct folio *folio) { __folio_set_hugetlb(folio); INIT_LIST_HEAD(&folio->lru); hugetlb_set_folio_subpool(folio, NULL); set_hugetlb_cgroup(folio, NULL); set_hugetlb_cgroup_rsvd(folio, NULL); } /* * Find and lock address space (mapping) in write mode. * * Upon entry, the folio is locked which means that folio_mapping() is * stable. Due to locking order, we can only trylock_write. If we can * not get the lock, simply return NULL to caller. */ struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio) { struct address_space *mapping = folio_mapping(folio); if (!mapping) return mapping; if (i_mmap_trylock_write(mapping)) return mapping; return NULL; } static struct folio *alloc_buddy_frozen_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { struct folio *folio; bool alloc_try_hard = true; /* * By default we always try hard to allocate the folio with * __GFP_RETRY_MAYFAIL flag. However, if we are allocating folios in * a loop (to adjust global huge page counts) and previous allocation * failed, do not continue to try hard on the same node. Use the * node_alloc_noretry bitmap to manage this state information. */ if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry)) alloc_try_hard = false; if (alloc_try_hard) gfp_mask |= __GFP_RETRY_MAYFAIL; folio = (struct folio *)__alloc_frozen_pages(gfp_mask, order, nid, nmask); /* * If we did not specify __GFP_RETRY_MAYFAIL, but still got a * folio this indicates an overall state change. Clear bit so * that we resume normal 'try hard' allocations. */ if (node_alloc_noretry && folio && !alloc_try_hard) node_clear(nid, *node_alloc_noretry); /* * If we tried hard to get a folio but failed, set bit so that * subsequent attempts will not try as hard until there is an * overall state change. 
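* Note that the noretry mask only lives for the duration of a single bulk pool adjustment; callers such as set_max_huge_pages() start each resize with a cleared mask.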
*/ if (node_alloc_noretry && !folio && alloc_try_hard) node_set(nid, *node_alloc_noretry); if (!folio) { __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); return NULL; } __count_vm_event(HTLB_BUDDY_PGALLOC); return folio; } static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { struct folio *folio; int order = huge_page_order(h); if (nid == NUMA_NO_NODE) nid = numa_mem_id(); if (order_is_gigantic(order)) folio = alloc_gigantic_frozen_folio(order, gfp_mask, nid, nmask); else folio = alloc_buddy_frozen_folio(order, gfp_mask, nid, nmask, node_alloc_noretry); if (folio) init_new_hugetlb_folio(folio); return folio; } /* * Common helper to allocate a fresh hugetlb folio. All specific allocators * should use this function to get new hugetlb folio * * Note that returned folio is 'frozen': ref count of head page and all tail * pages is zero, and the accounting must be done in the caller. */ static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct folio *folio; folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); if (folio) hugetlb_vmemmap_optimize_folio(h, folio); return folio; } void prep_and_add_allocated_folios(struct hstate *h, struct list_head *folio_list) { unsigned long flags; struct folio *folio, *tmp_f; /* Send list for bulk vmemmap optimization processing */ hugetlb_vmemmap_optimize_folios(h, folio_list); /* Add all new pool pages to free lists in one lock cycle */ spin_lock_irqsave(&hugetlb_lock, flags); list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { account_new_hugetlb_folio(h, folio); enqueue_hugetlb_folio(h, folio); } spin_unlock_irqrestore(&hugetlb_lock, flags); } /* * Allocates a fresh hugetlb page in a node interleaved manner. The page * will later be added to the appropriate hugetlb pool. */ static struct folio *alloc_pool_huge_folio(struct hstate *h, nodemask_t *nodes_allowed, nodemask_t *node_alloc_noretry, int *next_node) { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; int nr_nodes, node; for_each_node_mask_to_alloc(next_node, nr_nodes, node, nodes_allowed) { struct folio *folio; folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, node, nodes_allowed, node_alloc_noretry); if (folio) return folio; } return NULL; } /* * Remove huge page from pool from next node to free. Attempt to keep * persistent huge pages more or less balanced over allowed nodes. * This routine only 'removes' the hugetlb page. The caller must make * an additional call to free the page to low level allocators. * Called with hugetlb_lock locked. */ static struct folio *remove_pool_hugetlb_folio(struct hstate *h, nodemask_t *nodes_allowed, bool acct_surplus) { int nr_nodes, node; struct folio *folio = NULL; lockdep_assert_held(&hugetlb_lock); for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { /* * If we're returning unused surplus pages, only examine * nodes with surplus pages. */ if ((!acct_surplus || h->surplus_huge_pages_node[node]) && !list_empty(&h->hugepage_freelists[node])) { folio = list_entry(h->hugepage_freelists[node].next, struct folio, lru); remove_hugetlb_folio(h, folio, acct_surplus); break; } } return folio; } /* * Dissolve a given free hugetlb folio into free buddy pages. This function * does nothing for in-use hugetlb folios and non-hugetlb folios. 
* This function returns values like below: * * -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages * when the system is under memory pressure and the feature of * freeing unused vmemmap pages associated with each hugetlb page * is enabled. * -EBUSY: failed to dissolve free hugepages or the hugepage is in-use * (allocated or reserved). * 0: successfully dissolved free hugepages or the page is not a * hugepage (considered as already dissolved) */ int dissolve_free_hugetlb_folio(struct folio *folio) { int rc = -EBUSY; retry: /* Not to disrupt normal path by vainly holding hugetlb_lock */ if (!folio_test_hugetlb(folio)) return 0; spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(folio)) { rc = 0; goto out; } if (!folio_ref_count(folio)) { struct hstate *h = folio_hstate(folio); bool adjust_surplus = false; if (!available_huge_pages(h)) goto out; /* * We should make sure that the page is already on the free list * when it is dissolved. */ if (unlikely(!folio_test_hugetlb_freed(folio))) { spin_unlock_irq(&hugetlb_lock); cond_resched(); /* * Theoretically, we should return -EBUSY when we * encounter this race. In fact, we have a chance * to successfully dissolve the page if we do a * retry, because the race window is quite small. * If we seize this opportunity, it is an optimization * for increasing the success rate of dissolving the page. */ goto retry; } if (h->surplus_huge_pages_node[folio_nid(folio)]) adjust_surplus = true; remove_hugetlb_folio(h, folio, adjust_surplus); h->max_huge_pages--; spin_unlock_irq(&hugetlb_lock); /* * Normally update_and_free_hugetlb_folio will allocate required vmemmap * before freeing the page. update_and_free_hugetlb_folio will fail to * free the page if it cannot allocate the required vmemmap. We * need to adjust max_huge_pages if the page is not freed. * Attempt to allocate vmemmap here so that we can take * appropriate action on failure. * * The folio_test_hugetlb check here is because * remove_hugetlb_folio will clear hugetlb folio flag for * non-vmemmap optimized hugetlb folios. */ if (folio_test_hugetlb(folio)) { rc = hugetlb_vmemmap_restore_folio(h, folio); if (rc) { spin_lock_irq(&hugetlb_lock); add_hugetlb_folio(h, folio, adjust_surplus); h->max_huge_pages++; goto out; } } else { rc = 0; } update_and_free_hugetlb_folio(h, folio, false); return rc; } out: spin_unlock_irq(&hugetlb_lock); return rc; } /* * Dissolve free hugepages in a given pfn range. Used by memory hotplug to * make specified memory blocks removable from the system. * Note that this will dissolve a free gigantic hugepage completely, if any * part of it lies within the given range. * Also note that if dissolve_free_hugetlb_folio() returns with an error, all * free hugetlb folios that were dissolved before that error are lost. */ int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; struct folio *folio; int rc = 0; unsigned int order; struct hstate *h; if (!hugepages_supported()) return rc; order = huge_page_order(&default_hstate); for_each_hstate(h) order = min(order, huge_page_order(h)); for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) { folio = pfn_folio(pfn); rc = dissolve_free_hugetlb_folio(folio); if (rc) break; } return rc; } /* * Allocates a fresh surplus page from the page allocator.
*/ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct folio *folio = NULL; if (hstate_is_gigantic_no_runtime(h)) return NULL; spin_lock_irq(&hugetlb_lock); if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) goto out_unlock; spin_unlock_irq(&hugetlb_lock); folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask); if (!folio) return NULL; spin_lock_irq(&hugetlb_lock); /* * nr_huge_pages needs to be adjusted within the same lock cycle * as surplus_pages, otherwise it might confuse * persistent_huge_pages() momentarily. */ account_new_hugetlb_folio(h, folio); /* * We could have raced with the pool size change. * Double check that and simply deallocate the new page * if we would end up overcommiting the surpluses. Abuse * temporary page to workaround the nasty free_huge_folio * codeflow */ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { folio_set_hugetlb_temporary(folio); spin_unlock_irq(&hugetlb_lock); free_huge_folio(folio); return NULL; } h->surplus_huge_pages++; h->surplus_huge_pages_node[folio_nid(folio)]++; out_unlock: spin_unlock_irq(&hugetlb_lock); return folio; } static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct folio *folio; if (hstate_is_gigantic(h)) return NULL; folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask); if (!folio) return NULL; spin_lock_irq(&hugetlb_lock); account_new_hugetlb_folio(h, folio); spin_unlock_irq(&hugetlb_lock); /* fresh huge pages are frozen */ folio_ref_unfreeze(folio, 1); /* * We do not account these pages as surplus because they are only * temporary and will be released properly on the last reference */ folio_set_hugetlb_temporary(folio); return folio; } /* * Use the VMA's mpolicy to allocate a huge page from the buddy. */ static struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { struct folio *folio = NULL; struct mempolicy *mpol; gfp_t gfp_mask = htlb_alloc_mask(h); int nid; nodemask_t *nodemask; nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); if (mpol_is_preferred_many(mpol)) { gfp_t gfp = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); folio = alloc_surplus_hugetlb_folio(h, gfp, nid, nodemask); /* Fallback to all nodes if page==NULL */ nodemask = NULL; } if (!folio) folio = alloc_surplus_hugetlb_folio(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); return folio; } struct folio *alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { struct folio *folio; spin_lock_irq(&hugetlb_lock); if (!h->resv_huge_pages) { spin_unlock_irq(&hugetlb_lock); return NULL; } folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, preferred_nid, nmask); if (folio) h->resv_huge_pages--; spin_unlock_irq(&hugetlb_lock); return folio; } /* folio migration callback function */ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback) { spin_lock_irq(&hugetlb_lock); if (available_huge_pages(h)) { struct folio *folio; folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, preferred_nid, nmask); if (folio) { spin_unlock_irq(&hugetlb_lock); return folio; } } spin_unlock_irq(&hugetlb_lock); /* We cannot fallback to other nodes, as we could break the per-node pool. 
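* When fallback is not allowed, pin the allocation to the preferred node with __GFP_THISNODE below.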
*/ if (!allow_alloc_fallback) gfp_mask |= __GFP_THISNODE; return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask); } static nodemask_t *policy_mbind_nodemask(gfp_t gfp) { #ifdef CONFIG_NUMA struct mempolicy *mpol = get_task_policy(current); /* * Only enforce MPOL_BIND policy which overlaps with cpuset policy * (from policy_nodemask) specifically for hugetlb case */ if (mpol->mode == MPOL_BIND && (apply_policy_zone(mpol, gfp_zone(gfp)) && cpuset_nodemask_valid_mems_allowed(&mpol->nodes))) return &mpol->nodes; #endif return NULL; } /* * Increase the hugetlb pool such that it can accommodate a reservation * of size 'delta'. */ static int gather_surplus_pages(struct hstate *h, long delta) __must_hold(&hugetlb_lock) { LIST_HEAD(surplus_list); struct folio *folio, *tmp; int ret; long i; long needed, allocated; bool alloc_ok = true; nodemask_t *mbind_nodemask, alloc_nodemask; mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h)); if (mbind_nodemask) nodes_and(alloc_nodemask, *mbind_nodemask, cpuset_current_mems_allowed); else alloc_nodemask = cpuset_current_mems_allowed; lockdep_assert_held(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - h->free_huge_pages; if (needed <= 0) { h->resv_huge_pages += delta; return 0; } allocated = 0; ret = -ENOMEM; retry: spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { folio = NULL; /* * It is okay to use NUMA_NO_NODE because we use numa_mem_id() * down the road to pick the current node if that is the case. */ folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), NUMA_NO_NODE, &alloc_nodemask); if (!folio) { alloc_ok = false; break; } list_add(&folio->lru, &surplus_list); cond_resched(); } allocated += i; /* * After retaking hugetlb_lock, we need to recalculate 'needed' * because either resv_huge_pages or free_huge_pages may have changed. */ spin_lock_irq(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - (h->free_huge_pages + allocated); if (needed > 0) { if (alloc_ok) goto retry; /* * We were not able to allocate enough pages to * satisfy the entire reservation so we free what * we've allocated so far. */ goto free; } /* * The surplus_list now contains _at_least_ the number of extra pages * needed to accommodate the reservation. Add the appropriate number * of pages to the hugetlb pool and free the extras back to the buddy * allocator. Commit the entire reservation here to prevent another * process from stealing the pages as they are added to the pool but * before they are reserved. */ needed += allocated; h->resv_huge_pages += delta; ret = 0; /* Free the needed pages to the hugetlb pool */ list_for_each_entry_safe(folio, tmp, &surplus_list, lru) { if ((--needed) < 0) break; /* Add the page to the hugetlb allocator */ enqueue_hugetlb_folio(h, folio); } free: spin_unlock_irq(&hugetlb_lock); /* * Free unnecessary surplus pages to the buddy allocator. * Pages have no ref count, call free_huge_folio directly. */ list_for_each_entry_safe(folio, tmp, &surplus_list, lru) free_huge_folio(folio); spin_lock_irq(&hugetlb_lock); return ret; } /* * This routine has two main purposes: * 1) Decrement the reservation count (resv_huge_pages) by the value passed * in unused_resv_pages. This corresponds to the prior adjustments made * to the associated reservation map. * 2) Free any unused surplus pages that may have been allocated to satisfy * the reservation. As many as unused_resv_pages may be freed. 
*/ static void return_unused_surplus_pages(struct hstate *h, unsigned long unused_resv_pages) { unsigned long nr_pages; LIST_HEAD(page_list); lockdep_assert_held(&hugetlb_lock); /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; if (hstate_is_gigantic_no_runtime(h)) goto out; /* * Part (or even all) of the reservation could have been backed * by pre-allocated pages. Only free surplus pages. */ nr_pages = min(unused_resv_pages, h->surplus_huge_pages); /* * We want to release as many surplus pages as possible, spread * evenly across all nodes with memory. Iterate across these nodes * until we can no longer free unreserved surplus pages. This occurs * when the nodes with surplus pages have no free pages. * remove_pool_hugetlb_folio() will balance the freed pages across the * on-line nodes with memory and will handle the hstate accounting. */ while (nr_pages--) { struct folio *folio; folio = remove_pool_hugetlb_folio(h, &node_states[N_MEMORY], 1); if (!folio) goto out; list_add(&folio->lru, &page_list); } out: spin_unlock_irq(&hugetlb_lock); update_and_free_pages_bulk(h, &page_list); spin_lock_irq(&hugetlb_lock); } /* * vma_needs_reservation, vma_commit_reservation and vma_end_reservation * are used by the huge page allocation routines to manage reservations. * * vma_needs_reservation is called to determine if the huge page at addr * within the vma has an associated reservation. If a reservation is * needed, the value 1 is returned. The caller is then responsible for * managing the global reservation and subpool usage counts. After * the huge page has been allocated, vma_commit_reservation is called * to add the page to the reservation map. If the page allocation fails, * the reservation must be ended instead of committed. vma_end_reservation * is called in such cases. * * In the normal case, vma_commit_reservation returns the same value * as the preceding vma_needs_reservation call. The only time this * is not the case is if a reserve map was changed between calls. It * is the responsibility of the caller to notice the difference and * take appropriate action. * * vma_add_reservation is used in error paths where a reservation must * be restored when a newly allocated huge page must be freed. It is * to be called after calling vma_needs_reservation to determine if a * reservation exists. * * vma_del_reservation is used in error paths where an entry in the reserve * map was created during huge page allocation and must be removed. It is to * be called after calling vma_needs_reservation to determine if a reservation * exists. */ enum vma_resv_mode { VMA_NEEDS_RESV, VMA_COMMIT_RESV, VMA_END_RESV, VMA_ADD_RESV, VMA_DEL_RESV, }; static long __vma_reservation_common(struct hstate *h, struct vm_area_struct *vma, unsigned long addr, enum vma_resv_mode mode) { struct resv_map *resv; pgoff_t idx; long ret; long dummy_out_regions_needed; resv = vma_resv_map(vma); if (!resv) return 1; idx = vma_hugecache_offset(h, vma, addr); switch (mode) { case VMA_NEEDS_RESV: ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed); /* We assume that vma_reservation_* routines always operate on * 1 page, and that adding to resv map a 1 page entry can only * ever require 1 region. */ VM_BUG_ON(dummy_out_regions_needed != 1); break; case VMA_COMMIT_RESV: ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); /* region_add calls of range 1 should never fail. 
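* A single-page range needs at most one region descriptor, which region_chg() (via vma_needs_reservation()) has already set aside.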
*/ VM_BUG_ON(ret < 0); break; case VMA_END_RESV: region_abort(resv, idx, idx + 1, 1); ret = 0; break; case VMA_ADD_RESV: if (vma->vm_flags & VM_MAYSHARE) { ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); /* region_add calls of range 1 should never fail. */ VM_BUG_ON(ret < 0); } else { region_abort(resv, idx, idx + 1, 1); ret = region_del(resv, idx, idx + 1); } break; case VMA_DEL_RESV: if (vma->vm_flags & VM_MAYSHARE) { region_abort(resv, idx, idx + 1, 1); ret = region_del(resv, idx, idx + 1); } else { ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); /* region_add calls of range 1 should never fail. */ VM_BUG_ON(ret < 0); } break; default: BUG(); } if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV) return ret; /* * We know private mapping must have HPAGE_RESV_OWNER set. * * In most cases, reserves always exist for private mappings. * However, a file associated with mapping could have been * hole punched or truncated after reserves were consumed. * As subsequent fault on such a range will not use reserves. * Subtle - The reserve map for private mappings has the * opposite meaning than that of shared mappings. If NO * entry is in the reserve map, it means a reservation exists. * If an entry exists in the reserve map, it means the * reservation has already been consumed. As a result, the * return value of this routine is the opposite of the * value returned from reserve map manipulation routines above. */ if (ret > 0) return 0; if (ret == 0) return 1; return ret; } static long vma_needs_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV); } static long vma_commit_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV); } static void vma_end_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV); } static long vma_add_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV); } static long vma_del_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV); } /* * This routine is called to restore reservation information on error paths. * It should ONLY be called for folios allocated via alloc_hugetlb_folio(), * and the hugetlb mutex should remain held when calling this routine. * * It handles two specific cases: * 1) A reservation was in place and the folio consumed the reservation. * hugetlb_restore_reserve is set in the folio. * 2) No reservation was in place for the page, so hugetlb_restore_reserve is * not set. However, alloc_hugetlb_folio always updates the reserve map. * * In case 1, free_huge_folio later in the error path will increment the * global reserve count. But, free_huge_folio does not have enough context * to adjust the reservation map. This case deals primarily with private * mappings. Adjust the reserve map here to be consistent with global * reserve count adjustments to be made by free_huge_folio. Make sure the * reserve map indicates there is a reservation present. * * In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio. 
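* i.e. bring the reserve map back to a consistent state so that a later fault on this address behaves as if the failed allocation had never happened.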
*/ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct folio *folio) { long rc = vma_needs_reservation(h, vma, address); if (folio_test_hugetlb_restore_reserve(folio)) { if (unlikely(rc < 0)) /* * Rare out of memory condition in reserve map * manipulation. Clear hugetlb_restore_reserve so * that global reserve count will not be incremented * by free_huge_folio. This will make it appear * as though the reservation for this folio was * consumed. This may prevent the task from * faulting in the folio at a later time. This * is better than inconsistent global huge page * accounting of reserve counts. */ folio_clear_hugetlb_restore_reserve(folio); else if (rc) (void)vma_add_reservation(h, vma, address); else vma_end_reservation(h, vma, address); } else { if (!rc) { /* * This indicates there is an entry in the reserve map * not added by alloc_hugetlb_folio. We know it was added * before the alloc_hugetlb_folio call, otherwise * hugetlb_restore_reserve would be set on the folio. * Remove the entry so that a subsequent allocation * does not consume a reservation. */ rc = vma_del_reservation(h, vma, address); if (rc < 0) /* * VERY rare out of memory condition. Since * we can not delete the entry, set * hugetlb_restore_reserve so that the reserve * count will be incremented when the folio * is freed. This reserve will be consumed * on a subsequent allocation. */ folio_set_hugetlb_restore_reserve(folio); } else if (rc < 0) { /* * Rare out of memory condition from * vma_needs_reservation call. Memory allocation is * only attempted if a new entry is needed. Therefore, * this implies there is not an entry in the * reserve map. * * For shared mappings, no entry in the map indicates * no reservation. We are done. */ if (!(vma->vm_flags & VM_MAYSHARE)) /* * For private mappings, no entry indicates * a reservation is present. Since we can * not add an entry, set hugetlb_restore_reserve * on the folio so reserve count will be * incremented when freed. This reserve will * be consumed on a subsequent allocation. */ folio_set_hugetlb_restore_reserve(folio); } else { /* * No reservation present, do nothing */ vma_end_reservation(h, vma, address); } } } /* * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve * the old one * @old_folio: Old folio to dissolve * @list: List to isolate the page in case we need to * Returns 0 on success, otherwise negated error. */ static int alloc_and_dissolve_hugetlb_folio(struct folio *old_folio, struct list_head *list) { gfp_t gfp_mask; struct hstate *h; int nid = folio_nid(old_folio); struct folio *new_folio = NULL; int ret = 0; retry: /* * The old_folio might have been dissolved from under our feet, so make sure * to carefully check the state under the lock. */ spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(old_folio)) { /* * Freed from under us. Drop new_folio too. */ goto free_new; } else if (folio_ref_count(old_folio)) { bool isolated; /* * Someone has grabbed the folio, try to isolate it here. * Fail with -EBUSY if not possible. */ spin_unlock_irq(&hugetlb_lock); isolated = folio_isolate_hugetlb(old_folio, list); ret = isolated ? 0 : -EBUSY; spin_lock_irq(&hugetlb_lock); goto free_new; } else if (!folio_test_hugetlb_freed(old_folio)) { /* * Folio's refcount is 0 but it has not been enqueued in the * freelist yet. Race window is small, so we can succeed here if * we retry. 
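* This mirrors the retry done in dissolve_free_hugetlb_folio() for the same enqueue race.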
*/ spin_unlock_irq(&hugetlb_lock); cond_resched(); goto retry; } else { h = folio_hstate(old_folio); if (!new_folio) { spin_unlock_irq(&hugetlb_lock); gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; new_folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, NULL); if (!new_folio) return -ENOMEM; goto retry; } /* * Ok, old_folio is still a genuine free hugepage. Remove it from * the freelist and decrease the counters. These will be * incremented again when calling account_new_hugetlb_folio() * and enqueue_hugetlb_folio() for new_folio. The counters will * remain stable since this happens under the lock. */ remove_hugetlb_folio(h, old_folio, false); /* * Ref count on new_folio is already zero as it was dropped * earlier. It can be directly added to the pool free list. */ account_new_hugetlb_folio(h, new_folio); enqueue_hugetlb_folio(h, new_folio); /* * Folio has been replaced, we can safely free the old one. */ spin_unlock_irq(&hugetlb_lock); update_and_free_hugetlb_folio(h, old_folio, false); } return ret; free_new: spin_unlock_irq(&hugetlb_lock); if (new_folio) update_and_free_hugetlb_folio(h, new_folio, false); return ret; } int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list) { int ret = -EBUSY; /* Not to disrupt normal path by vainly holding hugetlb_lock */ if (!folio_test_hugetlb(folio)) return 0; /* * Fence off gigantic pages as there is a cyclic dependency between * alloc_contig_range and them. Return -ENOMEM as this has the effect * of bailing out right away without further retrying. */ if (order_is_gigantic(folio_order(folio))) return -ENOMEM; if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list)) ret = 0; else if (!folio_ref_count(folio)) ret = alloc_and_dissolve_hugetlb_folio(folio, list); return ret; } /* * replace_free_hugepage_folios - Replace free hugepage folios in a given pfn * range with new folios. * @start_pfn: start pfn of the given pfn range * @end_pfn: end pfn of the given pfn range * Returns 0 on success, otherwise negated error. */ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) { unsigned long nr = 0; struct page *page; struct hstate *h; LIST_HEAD(list); int ret = 0; /* Avoid pfn iterations if no free non-gigantic huge pages */ for_each_hstate(h) { if (hstate_is_gigantic(h)) continue; nr += h->free_huge_pages; if (nr) break; } if (!nr) return 0; while (start_pfn < end_pfn) { page = pfn_to_page(start_pfn); nr = 1; if (PageHuge(page) || PageCompound(page)) { struct folio *folio = page_folio(page); nr = folio_nr_pages(folio) - folio_page_idx(folio, page); /* * Don't disrupt normal path by vainly holding * hugetlb_lock */ if (folio_test_hugetlb(folio) && !folio_ref_count(folio)) { if (order_is_gigantic(folio_order(folio))) { ret = -ENOMEM; break; } ret = alloc_and_dissolve_hugetlb_folio(folio, &list); if (ret) break; putback_movable_pages(&list); } } else if (PageBuddy(page)) { /* * Buddy order check without zone lock is unsafe and * the order is maybe invalid, but race should be * small, and the worst thing is skipping free hugetlb. */ const unsigned int order = buddy_order_unsafe(page); if (order <= MAX_PAGE_ORDER) nr = 1UL << order; } start_pfn += nr; } return ret; } void wait_for_freed_hugetlb_folios(void) { if (llist_empty(&hpage_freelist)) return; flush_work(&free_hpage_work); } typedef enum { /* * For either 0/1: we checked the per-vma resv map, and one resv * count either can be reused (0), or an extra needed (1). 
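* The numeric values are chosen so that a plain truth test (if (map_chg)) means a new reservation/charge is needed.
*/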
*/ MAP_CHG_REUSE = 0, MAP_CHG_NEEDED = 1, /* * Cannot use per-vma resv count can be used, hence a new resv * count is enforced. * * NOTE: This is mostly identical to MAP_CHG_NEEDED, except * that currently vma_needs_reservation() has an unwanted side * effect to either use end() or commit() to complete the * transaction. Hence it needs to differentiate from NEEDED. */ MAP_CHG_ENFORCED = 2, } map_chg_state; /* * NOTE! "cow_from_owner" represents a very hacky usage only used in CoW * faults of hugetlb private mappings on top of a non-page-cache folio (in * which case even if there's a private vma resv map it won't cover such * allocation). New call sites should (probably) never set it to true!! * When it's set, the allocation will bypass all vma level reservations. */ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, bool cow_from_owner) { struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct folio *folio; long retval, gbl_chg, gbl_reserve; map_chg_state map_chg; int ret, idx; struct hugetlb_cgroup *h_cg = NULL; gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL; idx = hstate_index(h); /* Whether we need a separate per-vma reservation? */ if (cow_from_owner) { /* * Special case! Since it's a CoW on top of a reserved * page, the private resv map doesn't count. So it cannot * consume the per-vma resv map even if it's reserved. */ map_chg = MAP_CHG_ENFORCED; } else { /* * Examine the region/reserve map to determine if the process * has a reservation for the page to be allocated. A return * code of zero indicates a reservation exists (no change). */ retval = vma_needs_reservation(h, vma, addr); if (retval < 0) return ERR_PTR(-ENOMEM); map_chg = retval ? MAP_CHG_NEEDED : MAP_CHG_REUSE; } /* * Whether we need a separate global reservation? * * Processes that did not create the mapping will have no * reserves as indicated by the region/reserve map. Check * that the allocation will not exceed the subpool limit. * Or if it can get one from the pool reservation directly. */ if (map_chg) { gbl_chg = hugepage_subpool_get_pages(spool, 1); if (gbl_chg < 0) goto out_end_reservation; } else { /* * If we have the vma reservation ready, no need for extra * global reservation. */ gbl_chg = 0; } /* * If this allocation is not consuming a per-vma reservation, * charge the hugetlb cgroup now. */ if (map_chg) { ret = hugetlb_cgroup_charge_cgroup_rsvd( idx, pages_per_huge_page(h), &h_cg); if (ret) goto out_subpool_put; } ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); if (ret) goto out_uncharge_cgroup_reservation; spin_lock_irq(&hugetlb_lock); /* * glb_chg is passed to indicate whether or not a page must be taken * from the global free pool (global change). gbl_chg == 0 indicates * a reservation exists for the allocation. */ folio = dequeue_hugetlb_folio_vma(h, vma, addr, gbl_chg); if (!folio) { spin_unlock_irq(&hugetlb_lock); folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr); if (!folio) goto out_uncharge_cgroup; spin_lock_irq(&hugetlb_lock); list_add(&folio->lru, &h->hugepage_activelist); folio_ref_unfreeze(folio, 1); /* Fall through */ } /* * Either dequeued or buddy-allocated folio needs to add special * mark to the folio when it consumes a global reservation. 
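* When gbl_chg == 0 the folio is backed by an existing global reservation: set hugetlb_restore_reserve and consume one resv_huge_pages so that freeing the folio later gives the reservation back.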
*/ if (!gbl_chg) { folio_set_hugetlb_restore_reserve(folio); h->resv_huge_pages--; } hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio); /* If allocation is not consuming a reservation, also store the * hugetlb_cgroup pointer on the page. */ if (map_chg) { hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), h_cg, folio); } spin_unlock_irq(&hugetlb_lock); hugetlb_set_folio_subpool(folio, spool); if (map_chg != MAP_CHG_ENFORCED) { /* commit() is only needed if the map_chg is not enforced */ retval = vma_commit_reservation(h, vma, addr); /* * Check for possible race conditions. When it happens.. * The page was added to the reservation map between * vma_needs_reservation and vma_commit_reservation. * This indicates a race with hugetlb_reserve_pages. * Adjust for the subpool count incremented above AND * in hugetlb_reserve_pages for the same page. Also, * the reservation count added in hugetlb_reserve_pages * no longer applies. */ if (unlikely(map_chg == MAP_CHG_NEEDED && retval == 0)) { long rsv_adjust; rsv_adjust = hugepage_subpool_put_pages(spool, 1); hugetlb_acct_memory(h, -rsv_adjust); spin_lock_irq(&hugetlb_lock); hugetlb_cgroup_uncharge_folio_rsvd( hstate_index(h), pages_per_huge_page(h), folio); spin_unlock_irq(&hugetlb_lock); } } ret = mem_cgroup_charge_hugetlb(folio, gfp); /* * Unconditionally increment NR_HUGETLB here. If it turns out that * mem_cgroup_charge_hugetlb failed, then immediately free the page and * decrement NR_HUGETLB. */ lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h)); if (ret == -ENOMEM) { free_huge_folio(folio); return ERR_PTR(-ENOMEM); } return folio; out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); out_uncharge_cgroup_reservation: if (map_chg) hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), h_cg); out_subpool_put: /* * put page to subpool iff the quota of subpool's rsv_hpages is used * during hugepage_subpool_get_pages. */ if (map_chg && !gbl_chg) { gbl_reserve = hugepage_subpool_put_pages(spool, 1); hugetlb_acct_memory(h, -gbl_reserve); } out_end_reservation: if (map_chg != MAP_CHG_ENFORCED) vma_end_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); } static __init void *alloc_bootmem(struct hstate *h, int nid, bool node_exact) { struct huge_bootmem_page *m; int listnode = nid; if (hugetlb_early_cma(h)) m = hugetlb_cma_alloc_bootmem(h, &listnode, node_exact); else { if (node_exact) m = memblock_alloc_exact_nid_raw(huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid); else { m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid); /* * For pre-HVO to work correctly, pages need to be on * the list for the node they were actually allocated * from. That node may be different in the case of * fallback by memblock_alloc_try_nid_raw. So, * extract the actual node first. */ if (m) listnode = early_pfn_to_nid(PHYS_PFN(__pa(m))); } if (m) { m->flags = 0; m->cma = NULL; } } if (m) { /* * Use the beginning of the huge page to store the * huge_bootmem_page struct (until gather_bootmem * puts them into the mem_map). * * Put them into a private list first because mem_map * is not up yet. 
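* gather_bootmem_prealloc() walks these per-node lists later, once struct pages exist, and turns each entry into a proper hugetlb folio.
*/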
*/ INIT_LIST_HEAD(&m->list); list_add(&m->list, &huge_boot_pages[listnode]); m->hstate = h; } return m; } int alloc_bootmem_huge_page(struct hstate *h, int nid) __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); int __alloc_bootmem_huge_page(struct hstate *h, int nid) { struct huge_bootmem_page *m = NULL; /* initialize for clang */ int nr_nodes, node = nid; /* do node specific alloc */ if (nid != NUMA_NO_NODE) { m = alloc_bootmem(h, node, true); if (!m) return 0; goto found; } /* allocate from next node when distributing huge pages */ for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &hugetlb_bootmem_nodes) { m = alloc_bootmem(h, node, false); if (!m) return 0; goto found; } found: /* * Only initialize the head struct page in memmap_init_reserved_pages, * rest of the struct pages will be initialized by the HugeTLB * subsystem itself. * The head struct page is used to get folio information by the HugeTLB * subsystem like zone id and node id. */ memblock_reserved_mark_noinit(__pa((void *)m + PAGE_SIZE), huge_page_size(h) - PAGE_SIZE); return 1; } /* Initialize [start_page:end_page_number] tail struct pages of a hugepage */ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, struct hstate *h, unsigned long start_page_number, unsigned long end_page_number) { enum zone_type zone = folio_zonenum(folio); int nid = folio_nid(folio); struct page *page = folio_page(folio, start_page_number); unsigned long head_pfn = folio_pfn(folio); unsigned long pfn, end_pfn = head_pfn + end_page_number; unsigned int order = huge_page_order(h); /* * As we marked all tail pages with memblock_reserved_mark_noinit(), * we must initialize them ourselves here. */ for (pfn = head_pfn + start_page_number; pfn < end_pfn; page++, pfn++) { __init_single_page(page, pfn, zone, nid); prep_compound_tail(page, &folio->page, order); set_page_count(page, 0); } } static void __init hugetlb_folio_init_vmemmap(struct folio *folio, struct hstate *h, unsigned long nr_pages) { int ret; /* * This is an open-coded prep_compound_page() whereby we avoid * walking pages twice by initializing/preparing+freezing them in the * same go. */ __folio_clear_reserved(folio); __folio_set_head(folio); ret = folio_ref_freeze(folio, 1); VM_BUG_ON(!ret); hugetlb_folio_init_tail_vmemmap(folio, h, 1, nr_pages); prep_compound_head(&folio->page, huge_page_order(h)); } static bool __init hugetlb_bootmem_page_prehvo(struct huge_bootmem_page *m) { return m->flags & HUGE_BOOTMEM_HVO; } static bool __init hugetlb_bootmem_page_earlycma(struct huge_bootmem_page *m) { return m->flags & HUGE_BOOTMEM_CMA; } /* * memblock-allocated pageblocks might not have the migrate type set * if marked with the 'noinit' flag. Set it to the default (MIGRATE_MOVABLE) * here, or MIGRATE_CMA if this was a page allocated through an early CMA * reservation. * * In case of vmemmap optimized folios, the tail vmemmap pages are mapped * read-only, but that's ok - for sparse vmemmap this does not write to * the page structure. 
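* (The migratetype lives in the pageblock bitmap, not in the struct pages themselves.)
*/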
*/ static void __init hugetlb_bootmem_init_migratetype(struct folio *folio, struct hstate *h) { unsigned long nr_pages = pages_per_huge_page(h), i; WARN_ON_ONCE(!pageblock_aligned(folio_pfn(folio))); for (i = 0; i < nr_pages; i += pageblock_nr_pages) { if (folio_test_hugetlb_cma(folio)) init_cma_pageblock(folio_page(folio, i)); else init_pageblock_migratetype(folio_page(folio, i), MIGRATE_MOVABLE, false); } } static void __init prep_and_add_bootmem_folios(struct hstate *h, struct list_head *folio_list) { unsigned long flags; struct folio *folio, *tmp_f; /* Send list for bulk vmemmap optimization processing */ hugetlb_vmemmap_optimize_bootmem_folios(h, folio_list); list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { if (!folio_test_hugetlb_vmemmap_optimized(folio)) { /* * If HVO fails, initialize all tail struct pages * We do not worry about potential long lock hold * time as this is early in boot and there should * be no contention. */ hugetlb_folio_init_tail_vmemmap(folio, h, HUGETLB_VMEMMAP_RESERVE_PAGES, pages_per_huge_page(h)); } hugetlb_bootmem_init_migratetype(folio, h); /* Subdivide locks to achieve better parallel performance */ spin_lock_irqsave(&hugetlb_lock, flags); account_new_hugetlb_folio(h, folio); enqueue_hugetlb_folio(h, folio); spin_unlock_irqrestore(&hugetlb_lock, flags); } } bool __init hugetlb_bootmem_page_zones_valid(int nid, struct huge_bootmem_page *m) { unsigned long start_pfn; bool valid; if (m->flags & HUGE_BOOTMEM_ZONES_VALID) { /* * Already validated, skip check. */ return true; } if (hugetlb_bootmem_page_earlycma(m)) { valid = cma_validate_zones(m->cma); goto out; } start_pfn = virt_to_phys(m) >> PAGE_SHIFT; valid = !pfn_range_intersects_zones(nid, start_pfn, pages_per_huge_page(m->hstate)); out: if (!valid) hstate_boot_nrinvalid[hstate_index(m->hstate)]++; return valid; } /* * Free a bootmem page that was found to be invalid (intersecting with * multiple zones). * * Since it intersects with multiple zones, we can't just do a free * operation on all pages at once, but instead have to walk all * pages, freeing them one by one. */ static void __init hugetlb_bootmem_free_invalid_page(int nid, struct page *page, struct hstate *h) { unsigned long npages = pages_per_huge_page(h); unsigned long pfn; while (npages--) { pfn = page_to_pfn(page); __init_page_from_nid(pfn, nid); free_reserved_page(page); page++; } } /* * Put bootmem huge pages into the standard lists after mem_map is up. * Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages. */ static void __init gather_bootmem_prealloc_node(unsigned long nid) { LIST_HEAD(folio_list); struct huge_bootmem_page *m, *tm; struct hstate *h = NULL, *prev_h = NULL; list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) { struct page *page = virt_to_page(m); struct folio *folio = (void *)page; h = m->hstate; if (!hugetlb_bootmem_page_zones_valid(nid, m)) { /* * Can't use this page. Initialize the * page structures if that hasn't already * been done, and give them to the page * allocator. */ hugetlb_bootmem_free_invalid_page(nid, page, h); continue; } /* * It is possible to have multiple huge page sizes (hstates) * in this list. If so, process each size separately. 
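* i.e. flush the folios accumulated so far to the previous hstate before collecting folios of the next size.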
*/ if (h != prev_h && prev_h != NULL) prep_and_add_bootmem_folios(prev_h, &folio_list); prev_h = h; VM_BUG_ON(!hstate_is_gigantic(h)); WARN_ON(folio_ref_count(folio) != 1); hugetlb_folio_init_vmemmap(folio, h, HUGETLB_VMEMMAP_RESERVE_PAGES); init_new_hugetlb_folio(folio); if (hugetlb_bootmem_page_prehvo(m)) /* * If pre-HVO was done, just set the * flag, the HVO code will then skip * this folio. */ folio_set_hugetlb_vmemmap_optimized(folio); if (hugetlb_bootmem_page_earlycma(m)) folio_set_hugetlb_cma(folio); list_add(&folio->lru, &folio_list); /* * We need to restore the 'stolen' pages to totalram_pages * in order to fix confusing memory reports from free(1) and * other side-effects, like CommitLimit going negative. * * For CMA pages, this is done in init_cma_pageblock * (via hugetlb_bootmem_init_migratetype), so skip it here. */ if (!folio_test_hugetlb_cma(folio)) adjust_managed_page_count(page, pages_per_huge_page(h)); cond_resched(); } prep_and_add_bootmem_folios(h, &folio_list); } static void __init gather_bootmem_prealloc_parallel(unsigned long start, unsigned long end, void *arg) { int nid; for (nid = start; nid < end; nid++) gather_bootmem_prealloc_node(nid); } static void __init gather_bootmem_prealloc(void) { struct padata_mt_job job = { .thread_fn = gather_bootmem_prealloc_parallel, .fn_arg = NULL, .start = 0, .size = nr_node_ids, .align = 1, .min_chunk = 1, .max_threads = num_node_state(N_MEMORY), .numa_aware = true, }; padata_do_multithreaded(&job); } static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) { unsigned long i; char buf[32]; LIST_HEAD(folio_list); for (i = 0; i < h->max_huge_pages_node[nid]; ++i) { if (hstate_is_gigantic(h)) { if (!alloc_bootmem_huge_page(h, nid)) break; } else { struct folio *folio; gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, &node_states[N_MEMORY], NULL); if (!folio && !list_empty(&folio_list) && hugetlb_vmemmap_optimizable_size(h)) { prep_and_add_allocated_folios(h, &folio_list); INIT_LIST_HEAD(&folio_list); folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, &node_states[N_MEMORY], NULL); } if (!folio) break; list_add(&folio->lru, &folio_list); } cond_resched(); } if (!list_empty(&folio_list)) prep_and_add_allocated_folios(h, &folio_list); if (i == h->max_huge_pages_node[nid]) return; string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); pr_warn("HugeTLB: allocating %u of page size %s failed node%d. Only allocated %lu hugepages.\n", h->max_huge_pages_node[nid], buf, nid, i); h->max_huge_pages -= (h->max_huge_pages_node[nid] - i); h->max_huge_pages_node[nid] = i; } static bool __init hugetlb_hstate_alloc_pages_specific_nodes(struct hstate *h) { int i; bool node_specific_alloc = false; for_each_online_node(i) { if (h->max_huge_pages_node[i] > 0) { hugetlb_hstate_alloc_pages_onenode(h, i); node_specific_alloc = true; } } return node_specific_alloc; } static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated, struct hstate *h) { if (allocated < h->max_huge_pages) { char buf[32]; string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); pr_warn("HugeTLB: allocating %lu of page size %s failed. 
Only allocated %lu hugepages.\n", h->max_huge_pages, buf, allocated); h->max_huge_pages = allocated; } } static void __init hugetlb_pages_alloc_boot_node(unsigned long start, unsigned long end, void *arg) { struct hstate *h = (struct hstate *)arg; int i, num = end - start; nodemask_t node_alloc_noretry; LIST_HEAD(folio_list); int next_node = first_online_node; /* Bit mask controlling how hard we retry per-node allocations. */ nodes_clear(node_alloc_noretry); for (i = 0; i < num; ++i) { struct folio *folio; if (hugetlb_vmemmap_optimizable_size(h) && (si_mem_available() == 0) && !list_empty(&folio_list)) { prep_and_add_allocated_folios(h, &folio_list); INIT_LIST_HEAD(&folio_list); } folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], &node_alloc_noretry, &next_node); if (!folio) break; list_move(&folio->lru, &folio_list); cond_resched(); } prep_and_add_allocated_folios(h, &folio_list); } static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h) { unsigned long i; for (i = 0; i < h->max_huge_pages; ++i) { if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) break; cond_resched(); } return i; } static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h) { struct padata_mt_job job = { .fn_arg = h, .align = 1, .numa_aware = true }; unsigned long jiffies_start; unsigned long jiffies_end; unsigned long remaining; job.thread_fn = hugetlb_pages_alloc_boot_node; /* * job.max_threads is 25% of the available cpu threads by default. * * On large servers with terabytes of memory, huge page allocation * can consume a considerable amount of time. * * Tests below show how long it takes to allocate 1 TiB of memory with * 2MiB huge pages. Using more threads can significantly improve allocation time. * * +-----------------------+-------+-------+-------+-------+-------+ * | threads | 8 | 16 | 32 | 64 | 128 | * +-----------------------+-------+-------+-------+-------+-------+ * | skylake 144 cpus | 44s | 22s | 16s | 19s | 20s | * | cascade lake 192 cpus | 39s | 20s | 11s | 10s | 9s | * +-----------------------+-------+-------+-------+-------+-------+ */ if (hugepage_allocation_threads == 0) { hugepage_allocation_threads = num_online_cpus() / 4; hugepage_allocation_threads = max(hugepage_allocation_threads, 1); } job.max_threads = hugepage_allocation_threads; jiffies_start = jiffies; do { remaining = h->max_huge_pages - h->nr_huge_pages; job.start = h->nr_huge_pages; job.size = remaining; job.min_chunk = remaining / hugepage_allocation_threads; padata_do_multithreaded(&job); if (h->nr_huge_pages == h->max_huge_pages) break; /* * Retry only if the vmemmap optimization might have been able to free * some memory back to the system. */ if (!hugetlb_vmemmap_optimizable(h)) break; /* Continue if progress was made in last iteration */ } while (remaining != (h->max_huge_pages - h->nr_huge_pages)); jiffies_end = jiffies; pr_info("HugeTLB: allocation took %dms with hugepage_allocation_threads=%ld\n", jiffies_to_msecs(jiffies_end - jiffies_start), hugepage_allocation_threads); return h->nr_huge_pages; } /* * NOTE: this routine is called in different contexts for gigantic and * non-gigantic pages. * - For gigantic pages, this is called early in the boot process and * pages are allocated from memblock or something similar. * Gigantic pages are actually added to pools later with the routine * gather_bootmem_prealloc. * - For non-gigantic pages, this is called later in the boot process after * all of mm is up and functional.
Pages are allocated from buddy and * then added to hugetlb pools. */ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) { unsigned long allocated; /* * Skip gigantic hugepages allocation if early CMA * reservations are not available. */ if (hstate_is_gigantic(h) && hugetlb_cma_total_size() && !hugetlb_early_cma(h)) { pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n"); return; } if (!h->max_huge_pages) return; /* do node specific alloc */ if (hugetlb_hstate_alloc_pages_specific_nodes(h)) return; /* below will do all node balanced alloc */ if (hstate_is_gigantic(h)) allocated = hugetlb_gigantic_pages_alloc_boot(h); else allocated = hugetlb_pages_alloc_boot(h); hugetlb_hstate_alloc_pages_errcheck(allocated, h); } static void __init hugetlb_init_hstates(void) { struct hstate *h, *h2; for_each_hstate(h) { /* * Always reset to first_memory_node here, even if * next_nid_to_alloc was set before - we can't * reference hugetlb_bootmem_nodes after init, and * first_memory_node is right for all further allocations. */ h->next_nid_to_alloc = first_memory_node; h->next_nid_to_free = first_memory_node; /* oversize hugepages were init'ed in early boot */ if (!hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); /* * Set demote order for each hstate. Note that * h->demote_order is initially 0. * - We can not demote gigantic pages if runtime freeing * is not supported, so skip this. * - If CMA allocation is possible, we can not demote * HUGETLB_PAGE_ORDER or smaller size pages. */ if (hstate_is_gigantic_no_runtime(h)) continue; if (hugetlb_cma_total_size() && h->order <= HUGETLB_PAGE_ORDER) continue; for_each_hstate(h2) { if (h2 == h) continue; if (h2->order < h->order && h2->order > h->demote_order) h->demote_order = h2->order; } } } static void __init report_hugepages(void) { struct hstate *h; unsigned long nrinvalid; for_each_hstate(h) { char buf[32]; nrinvalid = hstate_boot_nrinvalid[hstate_index(h)]; h->max_huge_pages -= nrinvalid; string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n", buf, h->nr_huge_pages); if (nrinvalid) pr_info("HugeTLB: %s page size: %lu invalid page%s discarded\n", buf, nrinvalid, str_plural(nrinvalid)); pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n", hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf); } } #ifdef CONFIG_HIGHMEM static void try_to_free_low(struct hstate *h, unsigned long count, nodemask_t *nodes_allowed) { int i; LIST_HEAD(page_list); lockdep_assert_held(&hugetlb_lock); if (hstate_is_gigantic(h)) return; /* * Collect pages to be freed on a list, and free after dropping lock */ for_each_node_mask(i, *nodes_allowed) { struct folio *folio, *next; struct list_head *freel = &h->hugepage_freelists[i]; list_for_each_entry_safe(folio, next, freel, lru) { if (count >= h->nr_huge_pages) goto out; if (folio_test_highmem(folio)) continue; remove_hugetlb_folio(h, folio, false); list_add(&folio->lru, &page_list); } } out: spin_unlock_irq(&hugetlb_lock); update_and_free_pages_bulk(h, &page_list); spin_lock_irq(&hugetlb_lock); } #else static inline void try_to_free_low(struct hstate *h, unsigned long count, nodemask_t *nodes_allowed) { } #endif /* * Increment or decrement surplus_huge_pages. Keep node-specific counters * balanced by operating on them in a round-robin fashion. * Returns 1 if an adjustment was made. 
*/ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, int delta) { int nr_nodes, node; lockdep_assert_held(&hugetlb_lock); VM_BUG_ON(delta != -1 && delta != 1); if (delta < 0) { for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, nodes_allowed) { if (h->surplus_huge_pages_node[node]) goto found; } } else { for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { if (h->surplus_huge_pages_node[node] < h->nr_huge_pages_node[node]) goto found; } } return 0; found: h->surplus_huge_pages += delta; h->surplus_huge_pages_node[node] += delta; return 1; } #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) { unsigned long persistent_free_count; unsigned long min_count; unsigned long allocated; struct folio *folio; LIST_HEAD(page_list); NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); /* * Bit mask controlling how hard we retry per-node allocations. * If we can not allocate the bit mask, do not attempt to allocate * the requested huge pages. */ if (node_alloc_noretry) nodes_clear(*node_alloc_noretry); else return -ENOMEM; /* * resize_lock mutex prevents concurrent adjustments to number of * pages in hstate via the proc/sysfs interfaces. */ mutex_lock(&h->resize_lock); flush_free_hpage_work(h); spin_lock_irq(&hugetlb_lock); /* * Check for a node specific request. * Changing node specific huge page count may require a corresponding * change to the global count. In any case, the passed node mask * (nodes_allowed) will restrict alloc/free to the specified node. */ if (nid != NUMA_NO_NODE) { unsigned long old_count = count; count += persistent_huge_pages(h) - (h->nr_huge_pages_node[nid] - h->surplus_huge_pages_node[nid]); /* * User may have specified a large count value which caused the * above calculation to overflow. In this case, they wanted * to allocate as many huge pages as possible. Set count to * largest possible value to align with their intention. */ if (count < old_count) count = ULONG_MAX; } /* * Gigantic pages runtime allocation depend on the capability for large * page range allocation. * If the system does not provide this feature, return an error when * the user tries to allocate gigantic pages but let the user free the * boottime allocated gigantic pages. */ if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { if (count > persistent_huge_pages(h)) { spin_unlock_irq(&hugetlb_lock); mutex_unlock(&h->resize_lock); NODEMASK_FREE(node_alloc_noretry); return -EINVAL; } /* Fall through to decrease pool */ } /* * Increase the pool size * First take pages out of surplus state. Then make up the * remaining difference by allocating fresh huge pages. * * We might race with alloc_surplus_hugetlb_folio() here and be unable * to convert a surplus huge page to a normal huge page. That is * not critical, though, it just means the overall size of the * pool might be one hugepage larger than it needs to be, but * within all the constraints specified by the sysctls. */ while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, -1)) break; } allocated = 0; while (count > (persistent_huge_pages(h) + allocated)) { /* * If this allocation races such that we no longer need the * page, free_huge_folio will handle it by freeing the page * and reducing the surplus. 
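 *
 * (For reference, this resize path is what userspace reaches through the
 * nr_hugepages interfaces mentioned above, e.g.
 * "echo 512 > /proc/sys/vm/nr_hugepages" or the per-size
 * /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages file; the values
 * here are illustrative only.)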
*/ spin_unlock_irq(&hugetlb_lock); /* yield cpu to avoid soft lockup */ cond_resched(); folio = alloc_pool_huge_folio(h, nodes_allowed, node_alloc_noretry, &h->next_nid_to_alloc); if (!folio) { prep_and_add_allocated_folios(h, &page_list); spin_lock_irq(&hugetlb_lock); goto out; } list_add(&folio->lru, &page_list); allocated++; /* Bail for signals. Probably ctrl-c from user */ if (signal_pending(current)) { prep_and_add_allocated_folios(h, &page_list); spin_lock_irq(&hugetlb_lock); goto out; } spin_lock_irq(&hugetlb_lock); } /* Add allocated pages to the pool */ if (!list_empty(&page_list)) { spin_unlock_irq(&hugetlb_lock); prep_and_add_allocated_folios(h, &page_list); spin_lock_irq(&hugetlb_lock); } /* * Decrease the pool size * First return free pages to the buddy allocator (being careful * to keep enough around to satisfy reservations). Then place * pages into surplus state as needed so the pool will shrink * to the desired size as pages become free. * * By placing pages into the surplus state independent of the * overcommit value, we are allowing the surplus pool size to * exceed overcommit. There are few sane options here. Since * alloc_surplus_hugetlb_folio() is checking the global counter, * though, we'll note that we're not allowed to exceed surplus * and won't grow the pool anywhere else. Not until one of the * sysctls are changed, or the surplus pages go out of use. * * min_count is the expected number of persistent pages, we * shouldn't calculate min_count by using * resv_huge_pages + persistent_huge_pages() - free_huge_pages, * because there may exist free surplus huge pages, and this will * lead to subtracting twice. Free surplus huge pages come from HVO * failing to restore vmemmap, see comments in the callers of * hugetlb_vmemmap_restore_folio(). Thus, we should calculate * persistent free count first. */ persistent_free_count = h->free_huge_pages; if (h->free_huge_pages > persistent_huge_pages(h)) { if (h->free_huge_pages > h->surplus_huge_pages) persistent_free_count -= h->surplus_huge_pages; else persistent_free_count = 0; } min_count = h->resv_huge_pages + persistent_huge_pages(h) - persistent_free_count; min_count = max(count, min_count); try_to_free_low(h, min_count, nodes_allowed); /* * Collect pages to be removed on list without dropping lock */ while (min_count < persistent_huge_pages(h)) { folio = remove_pool_hugetlb_folio(h, nodes_allowed, 0); if (!folio) break; list_add(&folio->lru, &page_list); } /* free the pages after dropping lock */ spin_unlock_irq(&hugetlb_lock); update_and_free_pages_bulk(h, &page_list); flush_free_hpage_work(h); spin_lock_irq(&hugetlb_lock); while (count < persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, 1)) break; } out: h->max_huge_pages = persistent_huge_pages(h); spin_unlock_irq(&hugetlb_lock); mutex_unlock(&h->resize_lock); NODEMASK_FREE(node_alloc_noretry); return 0; } static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst, struct list_head *src_list) { long rc; struct folio *folio, *next; LIST_HEAD(dst_list); LIST_HEAD(ret_list); rc = hugetlb_vmemmap_restore_folios(src, src_list, &ret_list); list_splice_init(&ret_list, src_list); /* * Taking target hstate mutex synchronizes with set_max_huge_pages. * Without the mutex, pages added to target hstate could be marked * as surplus. * * Note that we already hold src->resize_lock. To prevent deadlock, * use the convention of always taking larger size hstate mutex first. 
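 *
 * For context, demotion is normally driven from sysfs; an illustrative
 * sequence (sizes and counts are examples only) would be:
 *     echo 2048kB > /sys/kernel/mm/hugepages/hugepages-1048576kB/demote_size
 *     echo 4      > /sys/kernel/mm/hugepages/hugepages-1048576kB/demote
 * which asks for four free 1G pages to be split into 2M pages via this path.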
*/ mutex_lock(&dst->resize_lock); list_for_each_entry_safe(folio, next, src_list, lru) { int i; bool cma; if (folio_test_hugetlb_vmemmap_optimized(folio)) continue; cma = folio_test_hugetlb_cma(folio); list_del(&folio->lru); split_page_owner(&folio->page, huge_page_order(src), huge_page_order(dst)); pgalloc_tag_split(folio, huge_page_order(src), huge_page_order(dst)); for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) { struct page *page = folio_page(folio, i); /* Careful: see __split_huge_page_tail() */ struct folio *new_folio = (struct folio *)page; clear_compound_head(page); prep_compound_page(page, dst->order); new_folio->mapping = NULL; init_new_hugetlb_folio(new_folio); /* Copy the CMA flag so that it is freed correctly */ if (cma) folio_set_hugetlb_cma(new_folio); list_add(&new_folio->lru, &dst_list); } } prep_and_add_allocated_folios(dst, &dst_list); mutex_unlock(&dst->resize_lock); return rc; } long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed, unsigned long nr_to_demote) __must_hold(&hugetlb_lock) { int nr_nodes, node; struct hstate *dst; long rc = 0; long nr_demoted = 0; lockdep_assert_held(&hugetlb_lock); /* We should never get here if no demote order */ if (!src->demote_order) { pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n"); return -EINVAL; /* internal error */ } dst = size_to_hstate(PAGE_SIZE << src->demote_order); for_each_node_mask_to_free(src, nr_nodes, node, nodes_allowed) { LIST_HEAD(list); struct folio *folio, *next; list_for_each_entry_safe(folio, next, &src->hugepage_freelists[node], lru) { if (folio_test_hwpoison(folio)) continue; remove_hugetlb_folio(src, folio, false); list_add(&folio->lru, &list); if (++nr_demoted == nr_to_demote) break; } spin_unlock_irq(&hugetlb_lock); rc = demote_free_hugetlb_folios(src, dst, &list); spin_lock_irq(&hugetlb_lock); list_for_each_entry_safe(folio, next, &list, lru) { list_del(&folio->lru); add_hugetlb_folio(src, folio, false); nr_demoted--; } if (rc < 0 || nr_demoted == nr_to_demote) break; } /* * Not absolutely necessary, but for consistency update max_huge_pages * based on pool changes for the demoted page. */ src->max_huge_pages -= nr_demoted; dst->max_huge_pages += nr_demoted << (huge_page_order(src) - huge_page_order(dst)); if (rc < 0) return rc; if (nr_demoted) return nr_demoted; /* * Only way to get here is if all pages on free lists are poisoned. * Return -EBUSY so that caller will not retry. */ return -EBUSY; } ssize_t __nr_hugepages_store_common(bool obey_mempolicy, struct hstate *h, int nid, unsigned long count, size_t len) { int err; nodemask_t nodes_allowed, *n_mask; if (hstate_is_gigantic_no_runtime(h)) return -EINVAL; if (nid == NUMA_NO_NODE) { /* * global hstate attribute */ if (!(obey_mempolicy && init_nodemask_of_mempolicy(&nodes_allowed))) n_mask = &node_states[N_MEMORY]; else n_mask = &nodes_allowed; } else { /* * Node specific request. count adjustment happens in * set_max_huge_pages() after acquiring hugetlb_lock. */ init_nodemask_of_node(&nodes_allowed, nid); n_mask = &nodes_allowed; } err = set_max_huge_pages(h, count, nid, n_mask); return err ? 
err : len; } static int __init hugetlb_init(void) { int i; BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE < __NR_HPAGEFLAGS); BUILD_BUG_ON_INVALID(HUGETLB_PAGE_ORDER > MAX_FOLIO_ORDER); if (!hugepages_supported()) { if (hugetlb_max_hstate || default_hstate_max_huge_pages) pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n"); return 0; } /* * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some * architectures depend on setup being done here. */ hugetlb_add_hstate(HUGETLB_PAGE_ORDER); if (!parsed_default_hugepagesz) { /* * If we did not parse a default huge page size, set * default_hstate_idx to HPAGE_SIZE hstate. And, if the * number of huge pages for this default size was implicitly * specified, set that here as well. * Note that the implicit setting will overwrite an explicit * setting. A warning will be printed in this case. */ default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE)); if (default_hstate_max_huge_pages) { if (default_hstate.max_huge_pages) { char buf[32]; string_get_size(huge_page_size(&default_hstate), 1, STRING_UNITS_2, buf, 32); pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n", default_hstate.max_huge_pages, buf); pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n", default_hstate_max_huge_pages); } default_hstate.max_huge_pages = default_hstate_max_huge_pages; for_each_online_node(i) default_hstate.max_huge_pages_node[i] = default_hugepages_in_node[i]; } } hugetlb_init_hstates(); gather_bootmem_prealloc(); report_hugepages(); hugetlb_sysfs_init(); hugetlb_cgroup_file_init(); hugetlb_sysctl_init(); #ifdef CONFIG_SMP num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); #else num_fault_mutexes = 1; #endif hugetlb_fault_mutex_table = kmalloc_objs(struct mutex, num_fault_mutexes); BUG_ON(!hugetlb_fault_mutex_table); for (i = 0; i < num_fault_mutexes; i++) mutex_init(&hugetlb_fault_mutex_table[i]); return 0; } subsys_initcall(hugetlb_init); /* Overwritten by architectures with more huge page sizes */ bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size) { return size == HPAGE_SIZE; } void __init hugetlb_add_hstate(unsigned int order) { struct hstate *h; unsigned long i; if (size_to_hstate(PAGE_SIZE << order)) { return; } BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); BUG_ON(order < order_base_2(__NR_USED_SUBPAGE)); WARN_ON(order > MAX_FOLIO_ORDER); h = &hstates[hugetlb_max_hstate++]; __mutex_init(&h->resize_lock, "resize mutex", &h->resize_key); h->order = order; h->mask = ~(huge_page_size(h) - 1); for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); INIT_LIST_HEAD(&h->hugepage_activelist); snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/SZ_1K); parsed_hstate = h; } bool __init __weak hugetlb_node_alloc_supported(void) { return true; } static void __init hugepages_clear_pages_in_node(void) { if (!hugetlb_max_hstate) { default_hstate_max_huge_pages = 0; memset(default_hugepages_in_node, 0, sizeof(default_hugepages_in_node)); } else { parsed_hstate->max_huge_pages = 0; memset(parsed_hstate->max_huge_pages_node, 0, sizeof(parsed_hstate->max_huge_pages_node)); } } static __init int hugetlb_add_param(char *s, int (*setup)(char *)) { size_t len; char *p; if (!s) return -EINVAL; if (hugetlb_param_index >= HUGE_MAX_CMDLINE_ARGS) return -EINVAL; len = strlen(s) + 1; if (len + hstate_cmdline_index > sizeof(hstate_cmdline_buf)) return -EINVAL; p = 
&hstate_cmdline_buf[hstate_cmdline_index]; memcpy(p, s, len); hstate_cmdline_index += len; hugetlb_params[hugetlb_param_index].val = p; hugetlb_params[hugetlb_param_index].setup = setup; hugetlb_param_index++; return 0; } static __init void hugetlb_parse_params(void) { int i; struct hugetlb_cmdline *hcp; for (i = 0; i < hugetlb_param_index; i++) { hcp = &hugetlb_params[i]; hcp->setup(hcp->val); } hugetlb_cma_validate_params(); } /* * hugepages command line processing * hugepages normally follows a valid hugepagsz or default_hugepagsz * specification. If not, ignore the hugepages value. hugepages can also * be the first huge page command line option in which case it implicitly * specifies the number of huge pages for the default size. */ static int __init hugepages_setup(char *s) { unsigned long *mhp; static unsigned long *last_mhp; int node = NUMA_NO_NODE; int count; unsigned long tmp; char *p = s; if (!hugepages_supported()) { pr_warn("HugeTLB: hugepages unsupported, ignoring hugepages=%s cmdline\n", s); return 0; } if (!parsed_valid_hugepagesz) { pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s); parsed_valid_hugepagesz = true; return -EINVAL; } /* * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter * yet, so this hugepages= parameter goes to the "default hstate". * Otherwise, it goes with the previously parsed hugepagesz or * default_hugepagesz. */ else if (!hugetlb_max_hstate) mhp = &default_hstate_max_huge_pages; else mhp = &parsed_hstate->max_huge_pages; if (mhp == last_mhp) { pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s); return 1; } while (*p) { count = 0; if (sscanf(p, "%lu%n", &tmp, &count) != 1) goto invalid; /* Parameter is node format */ if (p[count] == ':') { if (!hugetlb_node_alloc_supported()) { pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n"); return 1; } if (tmp >= MAX_NUMNODES || !node_online(tmp)) goto invalid; node = array_index_nospec(tmp, MAX_NUMNODES); p += count + 1; /* Parse hugepages */ if (sscanf(p, "%lu%n", &tmp, &count) != 1) goto invalid; if (!hugetlb_max_hstate) default_hugepages_in_node[node] = tmp; else parsed_hstate->max_huge_pages_node[node] = tmp; *mhp += tmp; /* Go to parse next node*/ if (p[count] == ',') p += count + 1; else break; } else { if (p != s) goto invalid; *mhp = tmp; break; } } last_mhp = mhp; return 0; invalid: pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p); hugepages_clear_pages_in_node(); return -EINVAL; } hugetlb_early_param("hugepages", hugepages_setup); /* * hugepagesz command line processing * A specific huge page size can only be specified once with hugepagesz. * hugepagesz is followed by hugepages on the command line. The global * variable 'parsed_valid_hugepagesz' is used to determine if prior * hugepagesz argument was valid. */ static int __init hugepagesz_setup(char *s) { unsigned long size; struct hstate *h; if (!hugepages_supported()) { pr_warn("HugeTLB: hugepages unsupported, ignoring hugepagesz=%s cmdline\n", s); return 0; } parsed_valid_hugepagesz = false; size = (unsigned long)memparse(s, NULL); if (!arch_hugetlb_valid_size(size)) { pr_err("HugeTLB: unsupported hugepagesz=%s\n", s); return -EINVAL; } h = size_to_hstate(size); if (h) { /* * hstate for this size already exists. This is normally * an error, but is allowed if the existing hstate is the * default hstate. 
More specifically, it is only allowed if * the number of huge pages for the default hstate was not * previously specified. */ if (!parsed_default_hugepagesz || h != &default_hstate || default_hstate.max_huge_pages) { pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s); return -EINVAL; } /* * No need to call hugetlb_add_hstate() as hstate already * exists. But, do set parsed_hstate so that a following * hugepages= parameter will be applied to this hstate. */ parsed_hstate = h; parsed_valid_hugepagesz = true; return 0; } hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); parsed_valid_hugepagesz = true; return 0; } hugetlb_early_param("hugepagesz", hugepagesz_setup); /* * default_hugepagesz command line input * Only one instance of default_hugepagesz allowed on command line. */ static int __init default_hugepagesz_setup(char *s) { unsigned long size; int i; if (!hugepages_supported()) { pr_warn("HugeTLB: hugepages unsupported, ignoring default_hugepagesz=%s cmdline\n", s); return 0; } parsed_valid_hugepagesz = false; if (parsed_default_hugepagesz) { pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s); return -EINVAL; } size = (unsigned long)memparse(s, NULL); if (!arch_hugetlb_valid_size(size)) { pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s); return -EINVAL; } hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); parsed_valid_hugepagesz = true; parsed_default_hugepagesz = true; default_hstate_idx = hstate_index(size_to_hstate(size)); /* * The number of default huge pages (for this size) could have been * specified as the first hugetlb parameter: hugepages=X. If so, * then default_hstate_max_huge_pages is set. If the default huge * page size is gigantic (> MAX_PAGE_ORDER), then the pages must be * allocated here from bootmem allocator. */ if (default_hstate_max_huge_pages) { default_hstate.max_huge_pages = default_hstate_max_huge_pages; /* * Since this is an early parameter, we can't check * NUMA node state yet, so loop through MAX_NUMNODES. */ for (i = 0; i < MAX_NUMNODES; i++) { if (default_hugepages_in_node[i] != 0) default_hstate.max_huge_pages_node[i] = default_hugepages_in_node[i]; } default_hstate_max_huge_pages = 0; } return 0; } hugetlb_early_param("default_hugepagesz", default_hugepagesz_setup); void __init hugetlb_bootmem_set_nodes(void) { int i, nid; unsigned long start_pfn, end_pfn; if (!nodes_empty(hugetlb_bootmem_nodes)) return; for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { if (end_pfn > start_pfn) node_set(nid, hugetlb_bootmem_nodes); } } void __init hugetlb_bootmem_alloc(void) { struct hstate *h; int i; hugetlb_bootmem_set_nodes(); for (i = 0; i < MAX_NUMNODES; i++) INIT_LIST_HEAD(&huge_boot_pages[i]); hugetlb_parse_params(); for_each_hstate(h) { h->next_nid_to_alloc = first_online_node; if (hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); } } /* * hugepage_alloc_threads command line parsing. * * When set, use this specific number of threads for the boot * allocation of hugepages. 
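 *
 * Example (illustrative): booting with "hugepage_alloc_threads=16" makes
 * the boot-time pool allocation above use 16 padata worker threads instead
 * of the default of roughly one quarter of the online CPUs (minimum 1).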
*/ static int __init hugepage_alloc_threads_setup(char *s) { unsigned long allocation_threads; if (kstrtoul(s, 0, &allocation_threads) != 0) return 1; if (allocation_threads == 0) return 1; hugepage_allocation_threads = allocation_threads; return 1; } __setup("hugepage_alloc_threads=", hugepage_alloc_threads_setup); static unsigned int allowed_mems_nr(struct hstate *h) { int node; unsigned int nr = 0; nodemask_t *mbind_nodemask; unsigned int *array = h->free_huge_pages_node; gfp_t gfp_mask = htlb_alloc_mask(h); mbind_nodemask = policy_mbind_nodemask(gfp_mask); for_each_node_mask(node, cpuset_current_mems_allowed) { if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) nr += array[node]; } return nr; } void hugetlb_report_meminfo(struct seq_file *m) { struct hstate *h; unsigned long total = 0; if (!hugepages_supported()) return; for_each_hstate(h) { unsigned long count = h->nr_huge_pages; total += huge_page_size(h) * count; if (h == &default_hstate) seq_printf(m, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" "HugePages_Rsvd: %5lu\n" "HugePages_Surp: %5lu\n" "Hugepagesize: %8lu kB\n", count, h->free_huge_pages, h->resv_huge_pages, h->surplus_huge_pages, huge_page_size(h) / SZ_1K); } seq_printf(m, "Hugetlb: %8lu kB\n", total / SZ_1K); } int hugetlb_report_node_meminfo(char *buf, int len, int nid) { struct hstate *h = &default_hstate; if (!hugepages_supported()) return 0; return sysfs_emit_at(buf, len, "Node %d HugePages_Total: %5u\n" "Node %d HugePages_Free: %5u\n" "Node %d HugePages_Surp: %5u\n", nid, h->nr_huge_pages_node[nid], nid, h->free_huge_pages_node[nid], nid, h->surplus_huge_pages_node[nid]); } void hugetlb_show_meminfo_node(int nid) { struct hstate *h; if (!hugepages_supported()) return; for_each_hstate(h) printk("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", nid, h->nr_huge_pages_node[nid], h->free_huge_pages_node[nid], h->surplus_huge_pages_node[nid], huge_page_size(h) / SZ_1K); } void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm) { seq_printf(m, "HugetlbPages:\t%8lu kB\n", K(atomic_long_read(&mm->hugetlb_usage))); } /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ unsigned long hugetlb_total_pages(void) { struct hstate *h; unsigned long nr_total_pages = 0; for_each_hstate(h) nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h); return nr_total_pages; } static int hugetlb_acct_memory(struct hstate *h, long delta) { int ret = -ENOMEM; if (!delta) return 0; spin_lock_irq(&hugetlb_lock); /* * When cpuset is configured, it breaks the strict hugetlb page * reservation as the accounting is done on a global variable. Such * reservation is completely rubbish in the presence of cpuset because * the reservation is not checked against page availability for the * current cpuset. Application can still potentially OOM'ed by kernel * with lack of free htlb page in cpuset that the task is in. * Attempt to enforce strict accounting with cpuset is almost * impossible (or too ugly) because cpuset is too fluid that * task or memory node can be dynamically moved between cpusets. * * The change of semantics for shared hugetlb mapping with cpuset is * undesirable. However, in order to preserve some of the semantics, * we fall back to check against current free page availability as * a best attempt and hopefully to minimize the impact of changing * semantics that cpuset has. 
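 *
 * As a rough illustrative example: if a task's cpuset only allows node 1,
 * which currently has 10 free huge pages, then a reservation of 16 pages
 * is refused here (delta > allowed_mems_nr()) even though other nodes may
 * still have free pages, and the surplus pages gathered for it are returned.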
* * Apart from cpuset, we also have memory policy mechanism that * also determines from which node the kernel will allocate memory * in a NUMA system. So similar to cpuset, we also should consider * the memory policy of the current task. Similar to the description * above. */ if (delta > 0) { if (gather_surplus_pages(h, delta) < 0) goto out; if (delta > allowed_mems_nr(h)) { return_unused_surplus_pages(h, delta); goto out; } } ret = 0; if (delta < 0) return_unused_surplus_pages(h, (unsigned long) -delta); out: spin_unlock_irq(&hugetlb_lock); return ret; } static void hugetlb_vm_op_open(struct vm_area_struct *vma) { struct resv_map *resv = vma_resv_map(vma); /* * HPAGE_RESV_OWNER indicates a private mapping. * This new VMA should share its siblings reservation map if present. * The VMA will only ever have a valid reservation map pointer where * it is being copied for another still existing VMA. As that VMA * has a reference to the reservation map it cannot disappear until * after this open call completes. It is therefore safe to take a * new reference here without additional locking. */ if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { resv_map_dup_hugetlb_cgroup_uncharge_info(resv); kref_get(&resv->refs); } /* * vma_lock structure for sharable mappings is vma specific. * Clear old pointer (if copied via vm_area_dup) and allocate * new structure. Before clearing, make sure vma_lock is not * for this vma. */ if (vma->vm_flags & VM_MAYSHARE) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; if (vma_lock) { if (vma_lock->vma != vma) { vma->vm_private_data = NULL; hugetlb_vma_lock_alloc(vma); } else { pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__); } } else { hugetlb_vma_lock_alloc(vma); } } } static void hugetlb_vm_op_close(struct vm_area_struct *vma) { struct hstate *h = hstate_vma(vma); struct resv_map *resv; struct hugepage_subpool *spool = subpool_vma(vma); unsigned long reserve, start, end; long gbl_reserve; hugetlb_vma_lock_free(vma); resv = vma_resv_map(vma); if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return; start = vma_hugecache_offset(h, vma, vma->vm_start); end = vma_hugecache_offset(h, vma, vma->vm_end); reserve = (end - start) - region_count(resv, start, end); hugetlb_cgroup_uncharge_counter(resv, start, end); if (reserve) { /* * Decrement reserve counts. The global reserve count may be * adjusted if the subpool has a minimum size. */ gbl_reserve = hugepage_subpool_put_pages(spool, reserve); hugetlb_acct_memory(h, -gbl_reserve); } kref_put(&resv->refs, resv_map_release); } static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) { if (addr & ~(huge_page_mask(hstate_vma(vma)))) return -EINVAL; return 0; } void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) { /* * PMD sharing is only possible for PUD_SIZE-aligned address ranges * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this * split, unshare PMDs in the PUD_SIZE interval surrounding addr now. * This function is called in the middle of a VMA split operation, with * MM, VMA and rmap all write-locked to prevent concurrent page table * walks (except hardware and gup_fast()). */ vma_assert_write_locked(vma); i_mmap_assert_write_locked(vma->vm_file->f_mapping); if (addr & ~PUD_MASK) { unsigned long floor = addr & PUD_MASK; unsigned long ceil = floor + PUD_SIZE; if (floor >= vma->vm_start && ceil <= vma->vm_end) { /* * Locking: * Use take_locks=false here. * The file rmap lock is already held. 
* The hugetlb VMA lock can't be taken when we already * hold the file rmap lock, and we don't need it because * its purpose is to synchronize against concurrent page * table walks, which are not possible thanks to the * locks held by our caller. */ hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false); } } } static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) { return huge_page_size(hstate_vma(vma)); } /* * We cannot handle pagefaults against hugetlb pages at all. They cause * handle_mm_fault() to try to instantiate regular-sized pages in the * hugepage VMA. do_page_fault() is supposed to trap this, so BUG is we get * this far. */ static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) { BUG(); return 0; } #ifdef CONFIG_USERFAULTFD static bool hugetlb_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags) { return true; } static const struct vm_uffd_ops hugetlb_uffd_ops = { .can_userfault = hugetlb_can_userfault, }; #endif /* * When a new function is introduced to vm_operations_struct and added * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops. * This is because under System V memory model, mappings created via * shmget/shmat with "huge page" specified are backed by hugetlbfs files, * their original vm_ops are overwritten with shm_vm_ops. */ const struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, .open = hugetlb_vm_op_open, .close = hugetlb_vm_op_close, .may_split = hugetlb_vm_op_split, .pagesize = hugetlb_vm_op_pagesize, #ifdef CONFIG_USERFAULTFD .uffd_ops = &hugetlb_uffd_ops, #endif }; static pte_t make_huge_pte(struct vm_area_struct *vma, struct folio *folio, bool try_mkwrite) { pte_t entry = folio_mk_pte(folio, vma->vm_page_prot); unsigned int shift = huge_page_shift(hstate_vma(vma)); if (try_mkwrite && (vma->vm_flags & VM_WRITE)) { entry = pte_mkwrite_novma(pte_mkdirty(entry)); } else { entry = pte_wrprotect(entry); } entry = pte_mkyoung(entry); entry = arch_make_huge_pte(entry, shift, vma->vm_flags); return entry; } static void set_huge_ptep_writable(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { pte_t entry; entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(vma->vm_mm, address, ptep))); if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) update_mmu_cache(vma, address, ptep); } static void set_huge_ptep_maybe_writable(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { if (vma->vm_flags & VM_WRITE) set_huge_ptep_writable(vma, address, ptep); } static void hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, struct folio *new_folio, pte_t old, unsigned long sz) { pte_t newpte = make_huge_pte(vma, new_folio, true); __folio_mark_uptodate(new_folio); hugetlb_add_new_anon_rmap(new_folio, vma, addr); if (userfaultfd_wp(vma) && huge_pte_uffd_wp(old)) newpte = huge_pte_mkuffd_wp(newpte); set_huge_pte_at(vma->vm_mm, addr, ptep, newpte, sz); hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); folio_set_hugetlb_migratable(new_folio); } int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { pte_t *src_pte, *dst_pte, entry; struct folio *pte_folio; unsigned long addr; bool cow = is_cow_mapping(src_vma->vm_flags); struct hstate *h = hstate_vma(src_vma); unsigned long sz = huge_page_size(h); unsigned long npages = pages_per_huge_page(h); struct mmu_notifier_range range; unsigned long last_addr_mask; softleaf_t softleaf; int ret = 
0; if (cow) { mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src, src_vma->vm_start, src_vma->vm_end); mmu_notifier_invalidate_range_start(&range); vma_assert_write_locked(src_vma); raw_write_seqcount_begin(&src->write_protect_seq); } else { /* * For shared mappings the vma lock must be held before * calling hugetlb_walk() in the src vma. Otherwise, the * returned ptep could go away if part of a shared pmd and * another thread calls huge_pmd_unshare. */ hugetlb_vma_lock_read(src_vma); } last_addr_mask = hugetlb_mask_last_page(h); for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; src_pte = hugetlb_walk(src_vma, addr, sz); if (!src_pte) { addr |= last_addr_mask; continue; } dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz); if (!dst_pte) { ret = -ENOMEM; break; } #ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING /* If the pagetables are shared, there is nothing to do */ if (ptdesc_pmd_is_shared(virt_to_ptdesc(dst_pte))) { addr |= last_addr_mask; continue; } #endif dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte); again: if (huge_pte_none(entry)) { /* Skip if src entry none. */ goto next; } softleaf = softleaf_from_pte(entry); if (unlikely(softleaf_is_hwpoison(softleaf))) { if (!userfaultfd_wp(dst_vma)) entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry, sz); } else if (unlikely(softleaf_is_migration(softleaf))) { bool uffd_wp = pte_swp_uffd_wp(entry); if (!softleaf_is_migration_read(softleaf) && cow) { /* * COW mappings require pages in both * parent and child to be set to read. */ softleaf = make_readable_migration_entry( swp_offset(softleaf)); entry = swp_entry_to_pte(softleaf); if (userfaultfd_wp(src_vma) && uffd_wp) entry = pte_swp_mkuffd_wp(entry); set_huge_pte_at(src, addr, src_pte, entry, sz); } if (!userfaultfd_wp(dst_vma)) entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry, sz); } else if (unlikely(pte_is_marker(entry))) { const pte_marker marker = copy_pte_marker(softleaf, dst_vma); if (marker) set_huge_pte_at(dst, addr, dst_pte, make_pte_marker(marker), sz); } else { entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte); pte_folio = page_folio(pte_page(entry)); folio_get(pte_folio); /* * Failing to duplicate the anon rmap is a rare case * where we see pinned hugetlb pages while they're * prone to COW. We need to do the COW earlier during * fork. * * When pre-allocating the page or copying data, we * need to be without the pgtable locks since we could * sleep during the process. 
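 *
 * The sequence below is roughly: drop both page table locks, allocate a
 * new folio with alloc_hugetlb_folio(), copy the data with
 * copy_user_large_folio(), retake the locks, and re-check pte_same();
 * if the source pte changed in the meantime, restore the reservation
 * and retry from the "again:" label.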
*/ if (!folio_test_anon(pte_folio)) { hugetlb_add_file_rmap(pte_folio); } else if (hugetlb_try_dup_anon_rmap(pte_folio, src_vma)) { pte_t src_pte_old = entry; struct folio *new_folio; spin_unlock(src_ptl); spin_unlock(dst_ptl); /* Do not use reserve as it's private owned */ new_folio = alloc_hugetlb_folio(dst_vma, addr, false); if (IS_ERR(new_folio)) { folio_put(pte_folio); ret = PTR_ERR(new_folio); break; } ret = copy_user_large_folio(new_folio, pte_folio, addr, dst_vma); folio_put(pte_folio); if (ret) { folio_put(new_folio); break; } /* Install the new hugetlb folio if src pte stable */ dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte); if (!pte_same(src_pte_old, entry)) { restore_reserve_on_error(h, dst_vma, addr, new_folio); folio_put(new_folio); /* huge_ptep of dst_pte won't change as in child */ goto again; } hugetlb_install_folio(dst_vma, dst_pte, addr, new_folio, src_pte_old, sz); goto next; } if (cow) { /* * No need to notify as we are downgrading page * table protection not changing it to point * to a new page. * * See Documentation/mm/mmu_notifier.rst */ huge_ptep_set_wrprotect(src, addr, src_pte); entry = huge_pte_wrprotect(entry); } if (!userfaultfd_wp(dst_vma)) entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry, sz); hugetlb_count_add(npages, dst); } next: spin_unlock(src_ptl); spin_unlock(dst_ptl); } if (cow) { raw_write_seqcount_end(&src->write_protect_seq); mmu_notifier_invalidate_range_end(&range); } else { hugetlb_vma_unlock_read(src_vma); } return ret; } static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte, unsigned long sz) { bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma); struct hstate *h = hstate_vma(vma); struct mm_struct *mm = vma->vm_mm; spinlock_t *src_ptl, *dst_ptl; pte_t pte; dst_ptl = huge_pte_lock(h, mm, dst_pte); src_ptl = huge_pte_lockptr(h, mm, src_pte); /* * We don't have to worry about the ordering of src and dst ptlocks * because exclusive mmap_lock (or the i_mmap_lock) prevents deadlock. */ if (src_ptl != dst_ptl) spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz); if (need_clear_uffd_wp && pte_is_uffd_wp_marker(pte)) { huge_pte_clear(mm, new_addr, dst_pte, sz); } else { if (need_clear_uffd_wp) { if (pte_present(pte)) pte = huge_pte_clear_uffd_wp(pte); else pte = pte_swp_clear_uffd_wp(pte); } set_huge_pte_at(mm, new_addr, dst_pte, pte, sz); } if (src_ptl != dst_ptl) spin_unlock(src_ptl); spin_unlock(dst_ptl); } int move_hugetlb_page_tables(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long old_addr, unsigned long new_addr, unsigned long len) { struct hstate *h = hstate_vma(vma); struct address_space *mapping = vma->vm_file->f_mapping; unsigned long sz = huge_page_size(h); struct mm_struct *mm = vma->vm_mm; unsigned long old_end = old_addr + len; unsigned long last_addr_mask; pte_t *src_pte, *dst_pte; struct mmu_notifier_range range; struct mmu_gather tlb; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr, old_end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); /* * In case of shared PMDs, we should cover the maximum possible * range. 
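 *
 * For example (x86-64 with 2 MiB pages), if the moved range only spans
 * part of a 1 GiB PUD region in which PMDs may be shared,
 * adjust_range_if_pmd_sharing_possible() above widens range.start/end to
 * the surrounding PUD_SIZE boundaries so the notifier covers every page
 * table that huge_pmd_unshare() might touch.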
*/ flush_cache_range(vma, range.start, range.end); tlb_gather_mmu_vma(&tlb, vma); mmu_notifier_invalidate_range_start(&range); last_addr_mask = hugetlb_mask_last_page(h); /* Prevent race with file truncation */ hugetlb_vma_lock_write(vma); i_mmap_lock_write(mapping); for (; old_addr < old_end; old_addr += sz, new_addr += sz) { src_pte = hugetlb_walk(vma, old_addr, sz); if (!src_pte) { old_addr |= last_addr_mask; new_addr |= last_addr_mask; continue; } if (huge_pte_none(huge_ptep_get(mm, old_addr, src_pte))) continue; if (huge_pmd_unshare(&tlb, vma, old_addr, src_pte)) { old_addr |= last_addr_mask; new_addr |= last_addr_mask; continue; } dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz); if (!dst_pte) break; move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz); tlb_remove_huge_tlb_entry(h, &tlb, src_pte, old_addr); } tlb_flush_mmu_tlbonly(&tlb); huge_pmd_unshare_flush(&tlb, vma); mmu_notifier_invalidate_range_end(&range); i_mmap_unlock_write(mapping); hugetlb_vma_unlock_write(vma); tlb_finish_mmu(&tlb); return len + old_addr - old_end; } void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct folio *folio, zap_flags_t zap_flags) { struct mm_struct *mm = vma->vm_mm; const bool folio_provided = !!folio; unsigned long address; pte_t *ptep; pte_t pte; spinlock_t *ptl; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); bool adjust_reservation; unsigned long last_addr_mask; WARN_ON(!is_vm_hugetlb_page(vma)); BUG_ON(start & ~huge_page_mask(h)); BUG_ON(end & ~huge_page_mask(h)); /* * This is a hugetlb vma, all the pte entries should point * to huge page. */ tlb_change_page_size(tlb, sz); tlb_start_vma(tlb, vma); last_addr_mask = hugetlb_mask_last_page(h); address = start; for (; address < end; address += sz) { ptep = hugetlb_walk(vma, address, sz); if (!ptep) { address |= last_addr_mask; continue; } ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(tlb, vma, address, ptep)) { spin_unlock(ptl); address |= last_addr_mask; continue; } pte = huge_ptep_get(mm, address, ptep); if (huge_pte_none(pte)) { spin_unlock(ptl); continue; } /* * Migrating hugepage or HWPoisoned hugepage is already * unmapped and its refcount is dropped, so just clear pte here. */ if (unlikely(!pte_present(pte))) { /* * If the pte was wr-protected by uffd-wp in any of the * swap forms, meanwhile the caller does not want to * drop the uffd-wp bit in this zap, then replace the * pte with a marker. */ if (pte_swp_uffd_wp_any(pte) && !(zap_flags & ZAP_FLAG_DROP_MARKER)) set_huge_pte_at(mm, address, ptep, make_pte_marker(PTE_MARKER_UFFD_WP), sz); else huge_pte_clear(mm, address, ptep, sz); spin_unlock(ptl); continue; } /* * If a folio is supplied, it is because a specific * folio is being unmapped, not a range. Ensure the folio we * are about to unmap is the actual folio of interest. 
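 *
 * (unmap_ref_private() further down is one such caller: it passes the old
 * folio so that only mappings of that particular folio in sibling VMAs
 * are torn down, leaving everything else in the range untouched.)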
*/ if (folio_provided) { if (folio != page_folio(pte_page(pte))) { spin_unlock(ptl); continue; } /* * Mark the VMA as having unmapped its page so that * future faults in this VMA will fail rather than * looking like data was lost */ set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); } else { folio = page_folio(pte_page(pte)); } pte = huge_ptep_get_and_clear(mm, address, ptep, sz); tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) folio_mark_dirty(folio); /* Leave a uffd-wp pte marker if needed */ if (huge_pte_uffd_wp(pte) && !(zap_flags & ZAP_FLAG_DROP_MARKER)) set_huge_pte_at(mm, address, ptep, make_pte_marker(PTE_MARKER_UFFD_WP), sz); hugetlb_count_sub(pages_per_huge_page(h), mm); hugetlb_remove_rmap(folio); spin_unlock(ptl); /* * Restore the reservation for an anonymous page, otherwise the * backing page could be stolen by someone. * If we are freeing a surplus, do not set the restore * reservation bit. */ adjust_reservation = false; spin_lock_irq(&hugetlb_lock); if (!h->surplus_huge_pages && __vma_private_lock(vma) && folio_test_anon(folio)) { folio_set_hugetlb_restore_reserve(folio); /* Reservation to be adjusted after the spin lock */ adjust_reservation = true; } spin_unlock_irq(&hugetlb_lock); /* * Adjust the reservation for the region that will have the * reserve restored. Keep in mind that vma_needs_reservation() changes * resv->adds_in_progress if it succeeds. If this is not done, * do_exit() will not see it, and will keep the reservation * forever. */ if (adjust_reservation) { int rc = vma_needs_reservation(h, vma, address); if (rc < 0) /* Presumably allocate_file_region_entries failed * to allocate a file_region struct. Clear * hugetlb_restore_reserve so that global reserve * count will not be incremented by free_huge_folio. * Act as if we consumed the reservation. */ folio_clear_hugetlb_restore_reserve(folio); else if (rc) vma_add_reservation(h, vma, address); } tlb_remove_page_size(tlb, folio_page(folio, 0), folio_size(folio)); /* * If we were instructed to unmap a specific folio, we're done. */ if (folio_provided) break; } tlb_end_vma(tlb, vma); huge_pmd_unshare_flush(tlb, vma); } void __hugetlb_zap_begin(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { if (!vma->vm_file) /* hugetlbfs_file_mmap error */ return; adjust_range_if_pmd_sharing_possible(vma, start, end); hugetlb_vma_lock_write(vma); if (vma->vm_file) i_mmap_lock_write(vma->vm_file->f_mapping); } void __hugetlb_zap_end(struct vm_area_struct *vma, struct zap_details *details) { zap_flags_t zap_flags = details ? details->zap_flags : 0; if (!vma->vm_file) /* hugetlbfs_file_mmap error */ return; if (zap_flags & ZAP_FLAG_UNMAP) { /* final unmap */ /* * Unlock and free the vma lock before releasing i_mmap_rwsem. * When the vma_lock is freed, this makes the vma ineligible * for pmd sharing. And, i_mmap_rwsem is required to set up * pmd sharing. This is important as page tables for this * unmapped range will be asynchronously deleted. If the page * tables are shared, there will be issues when accessed by * someone else.
*/ __hugetlb_vma_unlock_write_free(vma); } else { hugetlb_vma_unlock_write(vma); } if (vma->vm_file) i_mmap_unlock_write(vma->vm_file->f_mapping); } void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct folio *folio, zap_flags_t zap_flags) { struct mmu_notifier_range range; struct mmu_gather tlb; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, start, end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); mmu_notifier_invalidate_range_start(&range); tlb_gather_mmu(&tlb, vma->vm_mm); __unmap_hugepage_range(&tlb, vma, start, end, folio, zap_flags); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); } /* * This is called when the original mapper is failing to COW a MAP_PRIVATE * mapping it owns the reserve page for. The intention is to unmap the page * from other VMAs and let the children be SIGKILLed if they are faulting the * same region. */ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, struct folio *folio, unsigned long address) { struct hstate *h = hstate_vma(vma); struct vm_area_struct *iter_vma; struct address_space *mapping; pgoff_t pgoff; /* * vm_pgoff is in PAGE_SIZE units, hence the different calculation * from page cache lookup which is in HPAGE_SIZE units. */ address = address & huge_page_mask(h); pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; mapping = vma->vm_file->f_mapping; /* * Take the mapping lock for the duration of the table walk. As * this mapping should be shared between all the VMAs, * __unmap_hugepage_range() is called as the lock is already held */ i_mmap_lock_write(mapping); vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { /* Do not unmap the current VMA */ if (iter_vma == vma) continue; /* * Shared VMAs have their own reserves and do not affect * MAP_PRIVATE accounting but it is possible that a shared * VMA is using the same page so check and skip such VMAs. */ if (iter_vma->vm_flags & VM_MAYSHARE) continue; /* * Unmap the page from other VMAs without their own reserves. * They get marked to be SIGKILLed if they fault in these * areas. This is because a future no-page fault on this VMA * could insert a zeroed page instead of the data existing * from the time of fork. This would look like data corruption */ if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) unmap_hugepage_range(iter_vma, address, address + huge_page_size(h), folio, 0); } i_mmap_unlock_write(mapping); } /* * hugetlb_wp() should be called with page lock of the original hugepage held. * Called with hugetlb_fault_mutex_table held and pte_page locked so we * cannot race with other handlers or page migration. * Keep the pte_same checks anyway to make transition from the mutex easier. */ static vm_fault_t hugetlb_wp(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; pte_t pte = huge_ptep_get(mm, vmf->address, vmf->pte); struct hstate *h = hstate_vma(vma); struct folio *old_folio; struct folio *new_folio; bool cow_from_owner = 0; vm_fault_t ret = 0; struct mmu_notifier_range range; /* * Never handle CoW for uffd-wp protected pages. It should be only * handled when the uffd-wp protection is removed. * * Note that only the CoW optimization path (in hugetlb_no_page()) * can trigger this, because hugetlb_fault() will always resolve * uffd-wp bit first. 
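 *
 * As a rough summary of the paths below: MAP_SHARED mappings simply get
 * the pte marked writable; an anonymous folio mapped exactly once is
 * reused in place; otherwise a fresh folio is allocated and copied, and
 * if the reservation owner cannot allocate, unmap_ref_private() evicts
 * the page from the child mappings so the owner can retry.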
*/ if (!unshare && huge_pte_uffd_wp(pte)) return 0; /* Let's take out MAP_SHARED mappings first. */ if (vma->vm_flags & VM_MAYSHARE) { set_huge_ptep_writable(vma, vmf->address, vmf->pte); return 0; } old_folio = page_folio(pte_page(pte)); delayacct_wpcopy_start(); retry_avoidcopy: /* * If no-one else is actually using this page, we're the exclusive * owner and can reuse this page. * * Note that we don't rely on the (safer) folio refcount here, because * copying the hugetlb folio when there are unexpected (temporary) * folio references could harm simple fork()+exit() users when * we run out of free hugetlb folios: we would have to kill processes * in scenarios that used to work. As a side effect, there can still * be leaks between processes, for example, with FOLL_GET users. */ if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) { if (!PageAnonExclusive(&old_folio->page)) { folio_move_anon_rmap(old_folio, vma); SetPageAnonExclusive(&old_folio->page); } if (likely(!unshare)) set_huge_ptep_maybe_writable(vma, vmf->address, vmf->pte); delayacct_wpcopy_end(); return 0; } VM_BUG_ON_PAGE(folio_test_anon(old_folio) && PageAnonExclusive(&old_folio->page), &old_folio->page); /* * If the process that created a MAP_PRIVATE mapping is about to perform * a COW due to a shared page count, attempt to satisfy the allocation * without using the existing reserves. * In order to determine whether this is a COW on a MAP_PRIVATE mapping it * is enough to check whether the old_folio is anonymous. This means that * the reserve for this address was consumed. If reserves were used, a * partially faulted mapping at the time of fork() could consume its reserves * on COW instead of the full address range. */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && folio_test_anon(old_folio)) cow_from_owner = true; folio_get(old_folio); /* * Drop page table lock as buddy allocator may be called. It will * be acquired again before returning to the caller, as expected. */ spin_unlock(vmf->ptl); new_folio = alloc_hugetlb_folio(vma, vmf->address, cow_from_owner); if (IS_ERR(new_folio)) { /* * If a process owning a MAP_PRIVATE mapping fails to COW, * it is due to references held by a child and an insufficient * huge page pool. To guarantee the original mapper's * reliability, unmap the page from child processes. The child * may get SIGKILLed if it later faults. */ if (cow_from_owner) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx; u32 hash; folio_put(old_folio); /* * Drop hugetlb_fault_mutex and vma_lock before * unmapping. unmapping needs to hold vma_lock * in write mode. Dropping vma_lock in read mode * here is OK as COW mappings do not interact with * PMD sharing. * * Reacquire both after unmap operation. */ idx = vma_hugecache_offset(h, vma, vmf->address); hash = hugetlb_fault_mutex_hash(mapping, idx); hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); unmap_ref_private(mm, vma, old_folio, vmf->address); mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vma_lock_read(vma); spin_lock(vmf->ptl); vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h)); if (likely(vmf->pte && pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) goto retry_avoidcopy; /* * race occurs while re-acquiring page table * lock, and our job is done. */ delayacct_wpcopy_end(); return 0; } ret = vmf_error(PTR_ERR(new_folio)); goto out_release_old; } /* * When the original hugepage is a shared one, it does not have * anon_vma prepared.
*/ ret = __vmf_anon_prepare(vmf); if (unlikely(ret)) goto out_release_all; if (copy_user_large_folio(new_folio, old_folio, vmf->real_address, vma)) { ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_release_all; } __folio_mark_uptodate(new_folio); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address, vmf->address + huge_page_size(h)); mmu_notifier_invalidate_range_start(&range); /* * Retake the page table lock to check for racing updates * before the page tables are altered */ spin_lock(vmf->ptl); vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h)); if (likely(vmf->pte && pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) { pte_t newpte = make_huge_pte(vma, new_folio, !unshare); /* Break COW or unshare */ huge_ptep_clear_flush(vma, vmf->address, vmf->pte); hugetlb_remove_rmap(old_folio); hugetlb_add_new_anon_rmap(new_folio, vma, vmf->address); if (huge_pte_uffd_wp(pte)) newpte = huge_pte_mkuffd_wp(newpte); set_huge_pte_at(mm, vmf->address, vmf->pte, newpte, huge_page_size(h)); folio_set_hugetlb_migratable(new_folio); /* Make the old page be freed below */ new_folio = old_folio; } spin_unlock(vmf->ptl); mmu_notifier_invalidate_range_end(&range); out_release_all: /* * No restore in case of successful pagetable update (Break COW or * unshare) */ if (new_folio != old_folio) restore_reserve_on_error(h, vma, vmf->address, new_folio); folio_put(new_folio); out_release_old: folio_put(old_folio); spin_lock(vmf->ptl); /* Caller expects lock to be held */ delayacct_wpcopy_end(); return ret; } /* * Return whether there is a pagecache page to back given address within VMA. */ bool hugetlbfs_pagecache_present(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx = linear_page_index(vma, address); struct folio *folio; folio = filemap_get_folio(mapping, idx); if (IS_ERR(folio)) return false; folio_put(folio); return true; } int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx) { struct inode *inode = mapping->host; struct hstate *h = hstate_inode(inode); int err; idx <<= huge_page_order(h); __folio_set_locked(folio); err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL); if (unlikely(err)) { __folio_clear_locked(folio); return err; } folio_clear_hugetlb_restore_reserve(folio); /* * mark folio dirty so that it will not be removed from cache/file * by non-hugetlbfs specific code paths. */ folio_mark_dirty(folio); spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock); return 0; } static inline vm_fault_t hugetlb_handle_userfault(struct vm_fault *vmf, struct address_space *mapping, unsigned long reason) { u32 hash; /* * vma_lock and hugetlb_fault_mutex must be dropped before handling * userfault. Also mmap_lock could be dropped due to handling * userfault, any vma operation should be careful from here. */ hugetlb_vma_unlock_read(vmf->vma); hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return handle_userfault(vmf, reason); } /* * Recheck pte with pgtable lock. Returns true if pte didn't change, or * false if pte changed or is changing. 
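 *
 * Typical use in the fault paths below (sketch):
 *     if (!hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte))
 *             return 0;       /* raced with another update, let the fault retry */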
*/ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t old_pte) { spinlock_t *ptl; bool same; ptl = huge_pte_lock(h, mm, ptep); same = pte_same(huge_ptep_get(mm, addr, ptep), old_pte); spin_unlock(ptl); return same; } static vm_fault_t hugetlb_no_page(struct address_space *mapping, struct vm_fault *vmf) { u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff); bool new_folio, new_anon_folio = false; struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; struct hstate *h = hstate_vma(vma); vm_fault_t ret = VM_FAULT_SIGBUS; bool folio_locked = true; struct folio *folio; unsigned long size; pte_t new_pte; /* * Currently, we are forced to kill the process in the event the * original mapper has unmapped pages from the child due to a failed * COW/unsharing. Warn that such a situation has occurred as it may not * be obvious. */ if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", current->pid); goto out; } /* * Use page lock to guard against racing truncation * before we get page_table_lock. */ new_folio = false; folio = filemap_lock_hugetlb_folio(h, mapping, vmf->pgoff); if (IS_ERR(folio)) { size = i_size_read(mapping->host) >> huge_page_shift(h); if (vmf->pgoff >= size) goto out; /* Check for page in userfault range */ if (userfaultfd_missing(vma)) { /* * Since hugetlb_no_page() was examining pte * without pgtable lock, we need to re-test under * lock because the pte may not be stable and could * have changed from under us. Try to detect * either changed or during-changing ptes and retry * properly when needed. * * Note that userfaultfd is actually fine with * false positives (e.g. caused by pte changed), * but not wrong logical events (e.g. caused by * reading a pte during changing). The latter can * confuse the userspace, so the strictness is very * much preferred. E.g., MISSING event should * never happen on the page after UFFDIO_COPY has * correctly installed the page and returned. */ if (!hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte)) { ret = 0; goto out; } return hugetlb_handle_userfault(vmf, mapping, VM_UFFD_MISSING); } if (!(vma->vm_flags & VM_MAYSHARE)) { ret = __vmf_anon_prepare(vmf); if (unlikely(ret)) goto out; } folio = alloc_hugetlb_folio(vma, vmf->address, false); if (IS_ERR(folio)) { /* * Returning error will result in faulting task being * sent SIGBUS. The hugetlb fault mutex prevents two * tasks from racing to fault in the same page which * could result in false unable to allocate errors. * Page migration does not take the fault mutex, but * does a clear then write of pte's under page table * lock. Page fault code could race with migration, * notice the clear pte and try to allocate a page * here. Before returning error, get ptl and make * sure there really is no pte entry. */ if (hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte)) ret = vmf_error(PTR_ERR(folio)); else ret = 0; goto out; } folio_zero_user(folio, vmf->real_address); __folio_mark_uptodate(folio); new_folio = true; if (vma->vm_flags & VM_MAYSHARE) { int err = hugetlb_add_to_page_cache(folio, mapping, vmf->pgoff); if (err) { /* * err can't be -EEXIST which implies someone * else consumed the reservation since hugetlb * fault mutex is held when add a hugetlb page * to the page cache. So it's safe to call * restore_reserve_on_error() here. 
*/ restore_reserve_on_error(h, vma, vmf->address, folio); folio_put(folio); ret = VM_FAULT_SIGBUS; goto out; } } else { new_anon_folio = true; folio_lock(folio); } } else { /* * If memory error occurs between mmap() and fault, some process * don't have hwpoisoned swap entry for errored virtual address. * So we need to block hugepage fault by PG_hwpoison bit check. */ if (unlikely(folio_test_hwpoison(folio))) { ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto backout_unlocked; } /* Check for page in userfault range. */ if (userfaultfd_minor(vma)) { folio_unlock(folio); folio_put(folio); /* See comment in userfaultfd_missing() block above */ if (!hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte)) { ret = 0; goto out; } return hugetlb_handle_userfault(vmf, mapping, VM_UFFD_MINOR); } } /* * If we are going to COW a private mapping later, we examine the * pending reservations for this page now. This will ensure that * any allocations necessary to record that reservation occur outside * the spinlock. */ if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { if (vma_needs_reservation(h, vma, vmf->address) < 0) { ret = VM_FAULT_OOM; goto backout_unlocked; } /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, vmf->address); } vmf->ptl = huge_pte_lock(h, mm, vmf->pte); ret = 0; /* If pte changed from under us, retry */ if (!pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), vmf->orig_pte)) goto backout; if (new_anon_folio) hugetlb_add_new_anon_rmap(folio, vma, vmf->address); else hugetlb_add_file_rmap(folio); new_pte = make_huge_pte(vma, folio, vma->vm_flags & VM_SHARED); /* * If this pte was previously wr-protected, keep it wr-protected even * if populated. */ if (unlikely(pte_is_uffd_wp_marker(vmf->orig_pte))) new_pte = huge_pte_mkuffd_wp(new_pte); set_huge_pte_at(mm, vmf->address, vmf->pte, new_pte, huge_page_size(h)); hugetlb_count_add(pages_per_huge_page(h), mm); if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* * No need to keep file folios locked. See comment in * hugetlb_fault(). */ if (!new_anon_folio) { folio_locked = false; folio_unlock(folio); } /* Optimization, do the COW without a second fault */ ret = hugetlb_wp(vmf); } spin_unlock(vmf->ptl); /* * Only set hugetlb_migratable in newly allocated pages. Existing pages * found in the pagecache may not have hugetlb_migratable if they have * been isolated for migration. */ if (new_folio) folio_set_hugetlb_migratable(folio); if (folio_locked) folio_unlock(folio); out: hugetlb_vma_unlock_read(vma); /* * We must check to release the per-VMA lock. __vmf_anon_prepare() is * the only way ret can be set to VM_FAULT_RETRY. */ if (unlikely(ret & VM_FAULT_RETRY)) vma_end_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return ret; backout: spin_unlock(vmf->ptl); backout_unlocked: /* We only need to restore reservations for private mappings */ if (new_anon_folio) restore_reserve_on_error(h, vma, vmf->address, folio); folio_unlock(folio); folio_put(folio); goto out; } #ifdef CONFIG_SMP u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) { unsigned long key[2]; u32 hash; key[0] = (unsigned long) mapping; key[1] = idx; hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0); return hash & (num_fault_mutexes - 1); } #else /* * For uniprocessor systems we always use a single mutex, so just * return 0 and avoid the hashing overhead. 
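 *
 * On SMP the table sized earlier in hugetlb_init() is
 * roundup_pow_of_two(8 * num_possible_cpus()); as an illustrative example,
 * 16 possible CPUs give 128 mutexes, and the jhash2() of {mapping, index}
 * is masked with 127 to pick the slot.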
*/ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) { return 0; } #endif vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { vm_fault_t ret; u32 hash; struct folio *folio = NULL; struct hstate *h = hstate_vma(vma); struct address_space *mapping; bool need_wait_lock = false; struct vm_fault vmf = { .vma = vma, .address = address & huge_page_mask(h), .real_address = address, .flags = flags, .pgoff = vma_hugecache_offset(h, vma, address & huge_page_mask(h)), /* TODO: Track hugetlb faults using vm_fault */ /* * Some fields may not be initialized, be careful as it may * be hard to debug if called functions make assumptions */ }; /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ mapping = vma->vm_file->f_mapping; hash = hugetlb_fault_mutex_hash(mapping, vmf.pgoff); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* * Acquire vma lock before calling huge_pte_alloc and hold * until finished with vmf.pte. This prevents huge_pmd_unshare from * being called elsewhere and making the vmf.pte no longer valid. */ hugetlb_vma_lock_read(vma); vmf.pte = huge_pte_alloc(mm, vma, vmf.address, huge_page_size(h)); if (!vmf.pte) { hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return VM_FAULT_OOM; } vmf.orig_pte = huge_ptep_get(mm, vmf.address, vmf.pte); if (huge_pte_none(vmf.orig_pte)) /* * hugetlb_no_page will drop vma lock and hugetlb fault * mutex internally, which make us return immediately. */ return hugetlb_no_page(mapping, &vmf); if (pte_is_marker(vmf.orig_pte)) { const pte_marker marker = softleaf_to_marker(softleaf_from_pte(vmf.orig_pte)); if (marker & PTE_MARKER_POISONED) { ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_mutex; } else if (WARN_ON_ONCE(marker & PTE_MARKER_GUARD)) { /* This isn't supported in hugetlb. */ ret = VM_FAULT_SIGSEGV; goto out_mutex; } return hugetlb_no_page(mapping, &vmf); } ret = 0; /* Not present, either a migration or a hwpoisoned entry */ if (!pte_present(vmf.orig_pte) && !huge_pte_none(vmf.orig_pte)) { const softleaf_t softleaf = softleaf_from_pte(vmf.orig_pte); if (softleaf_is_migration(softleaf)) { /* * Release the hugetlb fault lock now, but retain * the vma lock, because it is needed to guard the * huge_pte_lockptr() later in * migration_entry_wait_huge(). The vma lock will * be released there. */ mutex_unlock(&hugetlb_fault_mutex_table[hash]); migration_entry_wait_huge(vma, vmf.address, vmf.pte); return 0; } if (softleaf_is_hwpoison(softleaf)) { ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); } goto out_mutex; } /* * If we are going to COW/unshare the mapping later, we examine the * pending reservations for this page now. This will ensure that any * allocations necessary to record that reservation occur outside the * spinlock. 
*/ if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) { if (vma_needs_reservation(h, vma, vmf.address) < 0) { ret = VM_FAULT_OOM; goto out_mutex; } /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, vmf.address); } vmf.ptl = huge_pte_lock(h, mm, vmf.pte); /* Check for a racing update before calling hugetlb_wp() */ if (unlikely(!pte_same(vmf.orig_pte, huge_ptep_get(mm, vmf.address, vmf.pte)))) goto out_ptl; /* Handle userfault-wp first, before trying to lock more pages */ if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(mm, vmf.address, vmf.pte)) && (flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) { if (!userfaultfd_wp_async(vma)) { spin_unlock(vmf.ptl); hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return handle_userfault(&vmf, VM_UFFD_WP); } vmf.orig_pte = huge_pte_clear_uffd_wp(vmf.orig_pte); set_huge_pte_at(mm, vmf.address, vmf.pte, vmf.orig_pte, huge_page_size(hstate_vma(vma))); /* Fallthrough to CoW */ } if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { if (!huge_pte_write(vmf.orig_pte)) { /* * Anonymous folios need to be lock since hugetlb_wp() * checks whether we can re-use the folio exclusively * for us in case we are the only user of it. */ folio = page_folio(pte_page(vmf.orig_pte)); if (folio_test_anon(folio) && !folio_trylock(folio)) { need_wait_lock = true; goto out_ptl; } folio_get(folio); ret = hugetlb_wp(&vmf); if (folio_test_anon(folio)) folio_unlock(folio); folio_put(folio); goto out_ptl; } else if (likely(flags & FAULT_FLAG_WRITE)) { vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte); } } vmf.orig_pte = pte_mkyoung(vmf.orig_pte); if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte, flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, vmf.address, vmf.pte); out_ptl: spin_unlock(vmf.ptl); out_mutex: hugetlb_vma_unlock_read(vma); /* * We must check to release the per-VMA lock. __vmf_anon_prepare() in * hugetlb_wp() is the only way ret can be set to VM_FAULT_RETRY. */ if (unlikely(ret & VM_FAULT_RETRY)) vma_end_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); /* * hugetlb_wp drops all the locks, but the folio lock, before trying to * unmap the folio from other processes. During that window, if another * process mapping that folio faults in, it will take the mutex and then * it will wait on folio_lock, causing an ABBA deadlock. * Use trylock instead and bail out if we fail. * * Ideally, we should hold a refcount on the folio we wait for, but we do * not want to use the folio after it becomes unlocked, but rather just * wait for it to become unlocked, so hopefully next fault successes on * the trylock. */ if (need_wait_lock) folio_wait_locked(folio); return ret; } #ifdef CONFIG_USERFAULTFD /* * Can probably be eliminated, but still used by hugetlb_mfill_atomic_pte(). */ static struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { struct mempolicy *mpol; nodemask_t *nodemask; struct folio *folio; gfp_t gfp_mask; int node; gfp_mask = htlb_alloc_mask(h); node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); /* * This is used to allocate a temporary hugetlb to hold the copied * content, which will then be copied again to the final hugetlb * consuming a reservation. Set the alloc_fallback to false to indicate * that breaking the per-node hugetlb pool is not allowed in this case. 
*/ folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask, false); mpol_cond_put(mpol); return folio; } /* * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte * with modifications for hugetlb pages. */ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, struct folio **foliop) { struct mm_struct *dst_mm = dst_vma->vm_mm; bool is_continue = uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE); bool wp_enabled = (flags & MFILL_ATOMIC_WP); struct hstate *h = hstate_vma(dst_vma); struct address_space *mapping = dst_vma->vm_file->f_mapping; pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr); unsigned long size = huge_page_size(h); int vm_shared = dst_vma->vm_flags & VM_SHARED; pte_t _dst_pte; spinlock_t *ptl; int ret = -ENOMEM; struct folio *folio; bool folio_in_pagecache = false; pte_t dst_ptep; if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { ptl = huge_pte_lock(h, dst_mm, dst_pte); /* Don't overwrite any existing PTEs (even markers) */ if (!huge_pte_none(huge_ptep_get(dst_mm, dst_addr, dst_pte))) { spin_unlock(ptl); return -EEXIST; } _dst_pte = make_pte_marker(PTE_MARKER_POISONED); set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, size); /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); spin_unlock(ptl); return 0; } if (is_continue) { ret = -EFAULT; folio = filemap_lock_hugetlb_folio(h, mapping, idx); if (IS_ERR(folio)) goto out; folio_in_pagecache = true; } else if (!*foliop) { /* If a folio already exists, then it's UFFDIO_COPY for * a non-missing case. Return -EEXIST. */ if (vm_shared && hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { ret = -EEXIST; goto out; } folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); if (IS_ERR(folio)) { pte_t *actual_pte = hugetlb_walk(dst_vma, dst_addr, PMD_SIZE); if (actual_pte) { ret = -EEXIST; goto out; } ret = -ENOMEM; goto out; } ret = copy_folio_from_user(folio, (const void __user *) src_addr, false); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { ret = -ENOENT; /* Free the allocated folio which may have * consumed a reservation. */ restore_reserve_on_error(h, dst_vma, dst_addr, folio); folio_put(folio); /* Allocate a temporary folio to hold the copied * contents. */ folio = alloc_hugetlb_folio_vma(h, dst_vma, dst_addr); if (!folio) { ret = -ENOMEM; goto out; } *foliop = folio; /* Set the outparam foliop and return to the caller to * copy the contents outside the lock. Don't free the * folio. */ goto out; } } else { if (vm_shared && hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { folio_put(*foliop); ret = -EEXIST; *foliop = NULL; goto out; } folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); if (IS_ERR(folio)) { folio_put(*foliop); ret = -ENOMEM; *foliop = NULL; goto out; } ret = copy_user_large_folio(folio, *foliop, dst_addr, dst_vma); folio_put(*foliop); *foliop = NULL; if (ret) { folio_put(folio); goto out; } } /* * If we just allocated a new page, we need a memory barrier to ensure * that preceding stores to the page become visible before the * set_pte_at() write. The memory barrier inside __folio_mark_uptodate * is what we need. * * In the case where we have not allocated a new page (is_continue), * the page must already be uptodate. UFFDIO_CONTINUE already includes * an earlier smp_wmb() to ensure that prior stores will be visible * before the set_pte_at() write. 
*/ if (!is_continue) __folio_mark_uptodate(folio); else WARN_ON_ONCE(!folio_test_uptodate(folio)); /* Add shared, newly allocated pages to the page cache. */ if (vm_shared && !is_continue) { ret = -EFAULT; if (idx >= (i_size_read(mapping->host) >> huge_page_shift(h))) goto out_release_nounlock; /* * Serialization between remove_inode_hugepages() and * hugetlb_add_to_page_cache() below happens through the * hugetlb_fault_mutex_table that here must be hold by * the caller. */ ret = hugetlb_add_to_page_cache(folio, mapping, idx); if (ret) goto out_release_nounlock; folio_in_pagecache = true; } ptl = huge_pte_lock(h, dst_mm, dst_pte); ret = -EIO; if (folio_test_hwpoison(folio)) goto out_release_unlock; ret = -EEXIST; dst_ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte); /* * See comment about UFFD marker overwriting in * mfill_atomic_install_pte(). */ if (!huge_pte_none(dst_ptep) && !pte_is_uffd_marker(dst_ptep)) goto out_release_unlock; if (folio_in_pagecache) hugetlb_add_file_rmap(folio); else hugetlb_add_new_anon_rmap(folio, dst_vma, dst_addr); /* * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY * with wp flag set, don't set pte write bit. */ _dst_pte = make_huge_pte(dst_vma, folio, !wp_enabled && !(is_continue && !vm_shared)); /* * Always mark UFFDIO_COPY page dirty; note that this may not be * extremely important for hugetlbfs for now since swapping is not * supported, but we should still be clear in that this page cannot be * thrown away at will, even if write bit not set. */ _dst_pte = huge_pte_mkdirty(_dst_pte); _dst_pte = pte_mkyoung(_dst_pte); if (wp_enabled) _dst_pte = huge_pte_mkuffd_wp(_dst_pte); set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, size); hugetlb_count_add(pages_per_huge_page(h), dst_mm); /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); spin_unlock(ptl); if (!is_continue) folio_set_hugetlb_migratable(folio); if (vm_shared || is_continue) folio_unlock(folio); ret = 0; out: return ret; out_release_unlock: spin_unlock(ptl); if (vm_shared || is_continue) folio_unlock(folio); out_release_nounlock: if (!folio_in_pagecache) restore_reserve_on_error(h, dst_vma, dst_addr, folio); folio_put(folio); goto out; } #endif /* CONFIG_USERFAULTFD */ long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { struct mm_struct *mm = vma->vm_mm; unsigned long start = address; pte_t *ptep; pte_t pte; struct hstate *h = hstate_vma(vma); long pages = 0, psize = huge_page_size(h); struct mmu_notifier_range range; unsigned long last_addr_mask; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; struct mmu_gather tlb; /* * In the case of shared PMDs, the area to flush could be beyond * start/end. Set range.start/range.end to cover the maximum possible * range if PMD sharing is possible. 
*/ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, mm, start, end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); BUG_ON(address >= end); flush_cache_range(vma, range.start, range.end); tlb_gather_mmu_vma(&tlb, vma); mmu_notifier_invalidate_range_start(&range); hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); last_addr_mask = hugetlb_mask_last_page(h); for (; address < end; address += psize) { softleaf_t entry; spinlock_t *ptl; ptep = hugetlb_walk(vma, address, psize); if (!ptep) { if (!uffd_wp) { address |= last_addr_mask; continue; } /* * Userfaultfd wr-protect requires pgtable * pre-allocations to install pte markers. */ ptep = huge_pte_alloc(mm, vma, address, psize); if (!ptep) { pages = -ENOMEM; break; } } ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(&tlb, vma, address, ptep)) { /* * When uffd-wp is enabled on the vma, unshare * shouldn't happen at all. Warn about it if it * happened due to some reason. */ WARN_ON_ONCE(uffd_wp || uffd_wp_resolve); pages++; spin_unlock(ptl); address |= last_addr_mask; continue; } pte = huge_ptep_get(mm, address, ptep); if (huge_pte_none(pte)) { if (unlikely(uffd_wp)) /* Safe to modify directly (none->non-present). */ set_huge_pte_at(mm, address, ptep, make_pte_marker(PTE_MARKER_UFFD_WP), psize); goto next; } entry = softleaf_from_pte(pte); if (unlikely(softleaf_is_hwpoison(entry))) { /* Nothing to do. */ } else if (unlikely(softleaf_is_migration(entry))) { struct folio *folio = softleaf_to_folio(entry); pte_t newpte = pte; if (softleaf_is_migration_write(entry)) { if (folio_test_anon(folio)) entry = make_readable_exclusive_migration_entry( swp_offset(entry)); else entry = make_readable_migration_entry( swp_offset(entry)); newpte = swp_entry_to_pte(entry); pages++; } if (uffd_wp) newpte = pte_swp_mkuffd_wp(newpte); else if (uffd_wp_resolve) newpte = pte_swp_clear_uffd_wp(newpte); if (!pte_same(pte, newpte)) set_huge_pte_at(mm, address, ptep, newpte, psize); } else if (unlikely(pte_is_marker(pte))) { /* * Do nothing on a poison marker; page is * corrupted, permissions do not apply. Here * pte_marker_uffd_wp()==true implies !poison * because they're mutual exclusive. */ if (pte_is_uffd_wp_marker(pte) && uffd_wp_resolve) /* Safe to modify directly (non-present->none). */ huge_pte_clear(mm, address, ptep, psize); } else { pte_t old_pte; unsigned int shift = huge_page_shift(hstate_vma(vma)); old_pte = huge_ptep_modify_prot_start(vma, address, ptep); pte = huge_pte_modify(old_pte, newprot); pte = arch_make_huge_pte(pte, shift, vma->vm_flags); if (uffd_wp) pte = huge_pte_mkuffd_wp(pte); else if (uffd_wp_resolve) pte = huge_pte_clear_uffd_wp(pte); huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); pages++; tlb_remove_huge_tlb_entry(h, &tlb, ptep, address); } next: spin_unlock(ptl); cond_resched(); } tlb_flush_mmu_tlbonly(&tlb); huge_pmd_unshare_flush(&tlb, vma); /* * No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are * downgrading page table protection not changing it to point to a new * page. * * See Documentation/mm/mmu_notifier.rst */ i_mmap_unlock_write(vma->vm_file->f_mapping); hugetlb_vma_unlock_write(vma); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); return pages > 0 ? (pages << h->order) : pages; } /* * Update the reservation map for the range [from, to]. * * Returns the number of entries that would be added to the reservation map * associated with the range [from, to]. This number is greater or equal to * zero. 
-EINVAL or -ENOMEM is returned in case of any errors. */ long hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_desc *desc, vma_flags_t vma_flags) { long chg = -1, add = -1, spool_resv, gbl_resv; struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); struct resv_map *resv_map; struct hugetlb_cgroup *h_cg = NULL; long gbl_reserve, regions_needed = 0; int err; /* This should never happen */ if (from > to) { VM_WARN(1, "%s called with a negative range\n", __func__); return -EINVAL; } /* * Only apply hugepage reservation if asked. At fault time, an * attempt will be made for VM_NORESERVE to allocate a page * without using reserves */ if (vma_flags_test(&vma_flags, VMA_NORESERVE_BIT)) return 0; /* * Shared mappings base their reservation on the number of pages that * are already allocated on behalf of the file. Private mappings need * to reserve the full area even if read-only as mprotect() may be * called to make the mapping read-write. Assume !desc is a shm mapping */ if (!desc || vma_desc_test(desc, VMA_MAYSHARE_BIT)) { /* * resv_map can not be NULL as hugetlb_reserve_pages is only * called for inodes for which resv_maps were created (see * hugetlbfs_get_inode). */ resv_map = inode_resv_map(inode); chg = region_chg(resv_map, from, to, ®ions_needed); } else { /* Private mapping. */ resv_map = resv_map_alloc(); if (!resv_map) { err = -ENOMEM; goto out_err; } chg = to - from; set_vma_desc_resv_map(desc, resv_map); set_vma_desc_resv_flags(desc, HPAGE_RESV_OWNER); } if (chg < 0) { /* region_chg() above can return -ENOMEM */ err = (chg == -ENOMEM) ? -ENOMEM : -EINVAL; goto out_err; } err = hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), &h_cg); if (err < 0) goto out_err; if (desc && !vma_desc_test(desc, VMA_MAYSHARE_BIT) && h_cg) { /* For private mappings, the hugetlb_cgroup uncharge info hangs * of the resv_map. */ resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h); } /* * There must be enough pages in the subpool for the mapping. If * the subpool has a minimum size, there may be some global * reservations already in place (gbl_reserve). */ gbl_reserve = hugepage_subpool_get_pages(spool, chg); if (gbl_reserve < 0) { err = gbl_reserve; goto out_uncharge_cgroup; } /* * Check enough hugepages are available for the reservation. * Hand the pages back to the subpool if there are not */ err = hugetlb_acct_memory(h, gbl_reserve); if (err < 0) goto out_put_pages; /* * Account for the reservations made. Shared mappings record regions * that have reservations as they are shared by multiple VMAs. * When the last VMA disappears, the region map says how much * the reservation was and the page cache tells how much of * the reservation was consumed. Private mappings are per-VMA and * only the consumed reservations are tracked. When the VMA * disappears, the original reservation is the VMA size and the * consumed reservations are stored in the map. Hence, nothing * else has to be done for private mappings here */ if (!desc || vma_desc_test(desc, VMA_MAYSHARE_BIT)) { add = region_add(resv_map, from, to, regions_needed, h, h_cg); if (unlikely(add < 0)) { hugetlb_acct_memory(h, -gbl_reserve); err = add; goto out_put_pages; } else if (unlikely(chg > add)) { /* * pages in this range were added to the reserve * map between region_chg and region_add. This * indicates a race with alloc_hugetlb_folio. Adjust * the subpool and reserve counts modified above * based on the difference. 
*/ long rsv_adjust; /* * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the * reference to h_cg->css. See comment below for detail. */ hugetlb_cgroup_uncharge_cgroup_rsvd( hstate_index(h), (chg - add) * pages_per_huge_page(h), h_cg); rsv_adjust = hugepage_subpool_put_pages(spool, chg - add); hugetlb_acct_memory(h, -rsv_adjust); } else if (h_cg) { /* * The file_regions will hold their own reference to * h_cg->css. So we should release the reference held * via hugetlb_cgroup_charge_cgroup_rsvd() when we are * done. */ hugetlb_cgroup_put_rsvd_cgroup(h_cg); } } return chg; out_put_pages: spool_resv = chg - gbl_reserve; if (spool_resv) { /* put sub pool's reservation back, chg - gbl_reserve */ gbl_resv = hugepage_subpool_put_pages(spool, spool_resv); /* * subpool's reserved pages can not be put back due to race, * return to hstate. */ hugetlb_acct_memory(h, -gbl_resv); } /* Restore used_hpages for pages that failed global reservation */ if (gbl_reserve && spool) { unsigned long flags; spin_lock_irqsave(&spool->lock, flags); if (spool->max_hpages != -1) spool->used_hpages -= gbl_reserve; unlock_or_release_subpool(spool, flags); } out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); out_err: if (!desc || vma_desc_test(desc, VMA_MAYSHARE_BIT)) /* Only call region_abort if the region_chg succeeded but the * region_add failed or didn't run. */ if (chg >= 0 && add < 0) region_abort(resv_map, from, to, regions_needed); if (desc && is_vma_desc_resv_set(desc, HPAGE_RESV_OWNER)) { kref_put(&resv_map->refs, resv_map_release); set_vma_desc_resv_map(desc, NULL); } return err; } long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed) { struct hstate *h = hstate_inode(inode); struct resv_map *resv_map = inode_resv_map(inode); long chg = 0; struct hugepage_subpool *spool = subpool_inode(inode); long gbl_reserve; /* * Since this routine can be called in the evict inode path for all * hugetlbfs inodes, resv_map could be NULL. */ if (resv_map) { chg = region_del(resv_map, start, end); /* * region_del() can fail in the rare case where a region * must be split and another region descriptor can not be * allocated. If end == LONG_MAX, it will not fail. */ if (chg < 0) return chg; } spin_lock(&inode->i_lock); inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); /* * If the subpool has a minimum size, the number of global * reservations to be released may be adjusted. * * Note that !resv_map implies freed == 0. So (chg - freed) * won't go negative. */ gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); hugetlb_acct_memory(h, -gbl_reserve); return 0; } #ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING static unsigned long page_table_shareable(struct vm_area_struct *svma, struct vm_area_struct *vma, unsigned long addr, pgoff_t idx) { unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + svma->vm_start; unsigned long sbase = saddr & PUD_MASK; unsigned long s_end = sbase + PUD_SIZE; /* Allow segments to share if only one is marked locked */ vm_flags_t vm_flags = vma->vm_flags & ~VM_LOCKED_MASK; vm_flags_t svm_flags = svma->vm_flags & ~VM_LOCKED_MASK; /* * match the virtual addresses, permission and the alignment of the * page table page. * * Also, vma_lock (vm_private_data) is required for sharing. 
*/ if (pmd_index(addr) != pmd_index(saddr) || vm_flags != svm_flags || !range_in_vma(svma, sbase, s_end) || !svma->vm_private_data) return 0; return saddr; } bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) { unsigned long start = addr & PUD_MASK; unsigned long end = start + PUD_SIZE; #ifdef CONFIG_USERFAULTFD if (uffd_disable_huge_pmd_share(vma)) return false; #endif /* * check on proper vm_flags and page table alignment */ if (!(vma->vm_flags & VM_MAYSHARE)) return false; if (!vma->vm_private_data) /* vma lock required for sharing */ return false; if (!range_in_vma(vma, start, end)) return false; return true; } /* * Determine if start,end range within vma could be mapped by shared pmd. * If yes, adjust start and end to cover range associated with possible * shared pmd mappings. */ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE), v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); /* * vma needs to span at least one aligned PUD size, and the range * must be at least partially within in. */ if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) || (*end <= v_start) || (*start >= v_end)) return; /* Extend the range to be PUD aligned for a worst case scenario */ if (*start > v_start) *start = ALIGN_DOWN(*start, PUD_SIZE); if (*end < v_end) *end = ALIGN(*end, PUD_SIZE); } /* * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() * and returns the corresponding pte. While this is not necessary for the * !shared pmd case because we can allocate the pmd later as well, it makes the * code much cleaner. pmd allocation is essential for the shared case because * pud has to be populated inside the same i_mmap_rwsem section - otherwise * racing tasks could either miss the sharing (see huge_pte_offset) or select a * bad pmd for sharing. */ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; struct vm_area_struct *svma; unsigned long saddr; pte_t *spte = NULL; pte_t *pte; i_mmap_lock_read(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; saddr = page_table_shareable(svma, vma, addr, idx); if (saddr) { spte = hugetlb_walk(svma, saddr, vma_mmu_pagesize(svma)); if (spte) { ptdesc_pmd_pts_inc(virt_to_ptdesc(spte)); break; } } } if (!spte) goto out; spin_lock(&mm->page_table_lock); if (pud_none(*pud)) { pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); mm_inc_nr_pmds(mm); } else { ptdesc_pmd_pts_dec(virt_to_ptdesc(spte)); } spin_unlock(&mm->page_table_lock); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); i_mmap_unlock_read(mapping); return pte; } /** * huge_pmd_unshare - Unmap a pmd table if it is shared by multiple users * @tlb: the current mmu_gather. * @vma: the vma covering the pmd table. * @addr: the address we are trying to unshare. * @ptep: pointer into the (pmd) page table. * * Called with the page table lock held, the i_mmap_rwsem held in write mode * and the hugetlb vma lock held in write mode. * * Note: The caller must call huge_pmd_unshare_flush() before dropping the * i_mmap_rwsem. * * Returns: 1 if it was a shared PMD table and it got unmapped, or 0 if it * was not a shared PMD table. 
*/ int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { unsigned long sz = huge_page_size(hstate_vma(vma)); struct mm_struct *mm = vma->vm_mm; pgd_t *pgd = pgd_offset(mm, addr); p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); if (sz != PMD_SIZE) return 0; if (!ptdesc_pmd_is_shared(virt_to_ptdesc(ptep))) return 0; i_mmap_assert_write_locked(vma->vm_file->f_mapping); hugetlb_vma_assert_locked(vma); pud_clear(pud); tlb_unshare_pmd_ptdesc(tlb, virt_to_ptdesc(ptep), addr); mm_dec_nr_pmds(mm); return 1; } /* * huge_pmd_unshare_flush - Complete a sequence of huge_pmd_unshare() calls * @tlb: the current mmu_gather. * @vma: the vma covering the pmd table. * * Perform necessary TLB flushes or IPI broadcasts to synchronize PMD table * unsharing with concurrent page table walkers. * * This function must be called after a sequence of huge_pmd_unshare() * calls while still holding the i_mmap_rwsem. */ void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma) { /* * We must synchronize page table unsharing such that nobody will * try reusing a previously-shared page table while it might still * be in use by previous sharers (TLB, GUP_fast). */ i_mmap_assert_write_locked(vma->vm_file->f_mapping); tlb_flush_unshared_tables(tlb); } #else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { return NULL; } int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return 0; } void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma) { } void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { } bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) { return false; } #endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pte_t *pte = NULL; pgd = pgd_offset(mm, addr); p4d = p4d_alloc(mm, pgd, addr); if (!p4d) return NULL; pud = pud_alloc(mm, p4d, addr); if (pud) { if (sz == PUD_SIZE) { pte = (pte_t *)pud; } else { BUG_ON(sz != PMD_SIZE); if (want_pmd_share(vma, addr) && pud_none(*pud)) pte = huge_pmd_share(mm, vma, addr, pud); else pte = (pte_t *)pmd_alloc(mm, pud, addr); } } if (pte) { pte_t pteval = ptep_get_lockless(pte); BUG_ON(pte_present(pteval) && !pte_huge(pteval)); } return pte; } /* * huge_pte_offset() - Walk the page table to resolve the hugepage * entry at address @addr * * Return: Pointer to page table entry (PUD or PMD) for * address @addr, or NULL if a !p*d_present() entry is encountered and the * size @sz doesn't match the hugepage size at this level of the page * table. 
*/ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = pgd_offset(mm, addr); if (!pgd_present(*pgd)) return NULL; p4d = p4d_offset(pgd, addr); if (!p4d_present(*p4d)) return NULL; pud = pud_offset(p4d, addr); if (sz == PUD_SIZE) /* must be pud huge, non-present or none */ return (pte_t *)pud; if (!pud_present(*pud)) return NULL; /* must have a valid entry and size to go further */ pmd = pmd_offset(pud, addr); /* must be pmd huge, non-present or none */ return (pte_t *)pmd; } /* * Return a mask that can be used to update an address to the last huge * page in a page table page mapping size. Used to skip non-present * page table entries when linearly scanning address ranges. Architectures * with unique huge page to page table relationships can define their own * version of this routine. */ unsigned long hugetlb_mask_last_page(struct hstate *h) { unsigned long hp_size = huge_page_size(h); if (hp_size == PUD_SIZE) return P4D_SIZE - PUD_SIZE; else if (hp_size == PMD_SIZE) return PUD_SIZE - PMD_SIZE; else return 0UL; } #else /* See description above. Architectures can provide their own version. */ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) { #ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING if (huge_page_size(h) == PMD_SIZE) return PUD_SIZE - PMD_SIZE; #endif return 0UL; } #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ /** * folio_isolate_hugetlb - try to isolate an allocated hugetlb folio * @folio: the folio to isolate * @list: the list to add the folio to on success * * Isolate an allocated (refcount > 0) hugetlb folio, marking it as * isolated/non-migratable, and moving it from the active list to the * given list. * * Isolation will fail if @folio is not an allocated hugetlb folio, or if * it is already isolated/non-migratable. * * On success, an additional folio reference is taken that must be dropped * using folio_putback_hugetlb() to undo the isolation. * * Return: True if isolation worked, otherwise False. */ bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list) { bool ret = true; spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(folio) || !folio_test_hugetlb_migratable(folio) || !folio_try_get(folio)) { ret = false; goto unlock; } folio_clear_hugetlb_migratable(folio); list_move_tail(&folio->lru, list); unlock: spin_unlock_irq(&hugetlb_lock); return ret; } int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison) { int ret = 0; *hugetlb = false; spin_lock_irq(&hugetlb_lock); if (folio_test_hugetlb(folio)) { *hugetlb = true; if (folio_test_hugetlb_freed(folio)) ret = 0; else if (folio_test_hugetlb_migratable(folio) || unpoison) ret = folio_try_get(folio); else ret = -EBUSY; } spin_unlock_irq(&hugetlb_lock); return ret; } int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { int ret; spin_lock_irq(&hugetlb_lock); ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared); spin_unlock_irq(&hugetlb_lock); return ret; } /** * folio_putback_hugetlb - unisolate a hugetlb folio * @folio: the isolated hugetlb folio * * Putback/un-isolate the hugetlb folio that was previous isolated using * folio_isolate_hugetlb(): marking it non-isolated/migratable and putting it * back onto the active list. * * Will drop the additional folio reference obtained through * folio_isolate_hugetlb(). 
*/ void folio_putback_hugetlb(struct folio *folio) { spin_lock_irq(&hugetlb_lock); folio_set_hugetlb_migratable(folio); list_move_tail(&folio->lru, &(folio_hstate(folio))->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); folio_put(folio); } void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason) { struct hstate *h = folio_hstate(old_folio); hugetlb_cgroup_migrate(old_folio, new_folio); folio_set_owner_migrate_reason(new_folio, reason); /* * transfer temporary state of the new hugetlb folio. This is * reverse to other transitions because the newpage is going to * be final while the old one will be freed so it takes over * the temporary status. * * Also note that we have to transfer the per-node surplus state * here as well otherwise the global surplus count will not match * the per-node's. */ if (folio_test_hugetlb_temporary(new_folio)) { int old_nid = folio_nid(old_folio); int new_nid = folio_nid(new_folio); folio_set_hugetlb_temporary(old_folio); folio_clear_hugetlb_temporary(new_folio); /* * There is no need to transfer the per-node surplus state * when we do not cross the node. */ if (new_nid == old_nid) return; spin_lock_irq(&hugetlb_lock); if (h->surplus_huge_pages_node[old_nid]) { h->surplus_huge_pages_node[old_nid]--; h->surplus_huge_pages_node[new_nid]++; } spin_unlock_irq(&hugetlb_lock); } /* * Our old folio is isolated and has "migratable" cleared until it * is putback. As migration succeeded, set the new folio "migratable" * and add it to the active list. */ spin_lock_irq(&hugetlb_lock); folio_set_hugetlb_migratable(new_folio); list_move_tail(&new_folio->lru, &(folio_hstate(new_folio))->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); } /* * If @take_locks is false, the caller must ensure that no concurrent page table * access can happen (except for gup_fast() and hardware page walks). * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like * concurrent page fault handling) and the file rmap lock. */ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool take_locks) { struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); struct mm_struct *mm = vma->vm_mm; struct mmu_notifier_range range; struct mmu_gather tlb; unsigned long address; spinlock_t *ptl; pte_t *ptep; if (!(vma->vm_flags & VM_MAYSHARE)) return; if (start >= end) return; flush_cache_range(vma, start, end); tlb_gather_mmu_vma(&tlb, vma); /* * No need to call adjust_range_if_pmd_sharing_possible(), because * we have already done the PUD_SIZE alignment. */ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start, end); mmu_notifier_invalidate_range_start(&range); if (take_locks) { hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); } else { i_mmap_assert_write_locked(vma->vm_file->f_mapping); } for (address = start; address < end; address += PUD_SIZE) { ptep = hugetlb_walk(vma, address, sz); if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); huge_pmd_unshare(&tlb, vma, address, ptep); spin_unlock(ptl); } huge_pmd_unshare_flush(&tlb, vma); if (take_locks) { i_mmap_unlock_write(vma->vm_file->f_mapping); hugetlb_vma_unlock_write(vma); } /* * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see * Documentation/mm/mmu_notifier.rst. */ mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); } /* * This function will unconditionally remove all the shared pmd pgtable entries * within the specific vma for a hugetlbfs memory range. 
 */
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
{
	hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
			ALIGN_DOWN(vma->vm_end, PUD_SIZE),
			/* take_locks = */ true);
}

/*
 * For hugetlb, mremap() is an odd edge case - while the VMA copying is
 * performed, we permit both the old and new VMAs to reference the same
 * reservation.
 *
 * We fix this up after the operation succeeds, or if a newly allocated VMA
 * is closed as a result of a failure to allocate memory.
 */
void fixup_hugetlb_reservations(struct vm_area_struct *vma)
{
	if (is_vm_hugetlb_page(vma))
		clear_vma_resv_huge_pages(vma);
}
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_MM_H
#define _LINUX_SCHED_MM_H

#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/mm_types.h>
#include <linux/gfp.h>
#include <linux/sync_core.h>
#include <linux/sched/coredump.h>

/*
 * Routines for handling mm_structs
 */
extern struct mm_struct *mm_alloc(void);

/**
 * mmgrab() - Pin a &struct mm_struct.
 * @mm: The &struct mm_struct to pin.
 *
 * Make sure that @mm will not get freed even after the owning task
 * exits. This doesn't guarantee that the associated address space
 * will still exist later on and mmget_not_zero() has to be used before
 * accessing it.
 *
 * This is a preferred way to pin @mm for a longer/unbounded amount
 * of time.
 *
 * Use mmdrop() to release the reference acquired by mmgrab().
 *
 * See also <Documentation/mm/active_mm.rst> for an in-depth explanation
 * of &mm_struct.mm_count vs &mm_struct.mm_users.
*/ static inline void mmgrab(struct mm_struct *mm) { atomic_inc(&mm->mm_count); } static inline void smp_mb__after_mmgrab(void) { smp_mb__after_atomic(); } extern void __mmdrop(struct mm_struct *mm); static inline void mmdrop(struct mm_struct *mm) { /* * The implicit full barrier implied by atomic_dec_and_test() is * required by the membarrier system call before returning to * user-space, after storing to rq->curr. */ if (unlikely(atomic_dec_and_test(&mm->mm_count))) __mmdrop(mm); } #ifdef CONFIG_PREEMPT_RT /* * RCU callback for delayed mm drop. Not strictly RCU, but call_rcu() is * by far the least expensive way to do that. */ static inline void __mmdrop_delayed(struct rcu_head *rhp) { struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop); __mmdrop(mm); } /* * Invoked from finish_task_switch(). Delegates the heavy lifting on RT * kernels via RCU. */ static inline void mmdrop_sched(struct mm_struct *mm) { /* Provides a full memory barrier. See mmdrop() */ if (atomic_dec_and_test(&mm->mm_count)) call_rcu(&mm->delayed_drop, __mmdrop_delayed); } #else static inline void mmdrop_sched(struct mm_struct *mm) { mmdrop(mm); } #endif /* Helpers for lazy TLB mm refcounting */ static inline void mmgrab_lazy_tlb(struct mm_struct *mm) { if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) mmgrab(mm); } static inline void mmdrop_lazy_tlb(struct mm_struct *mm) { if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) { mmdrop(mm); } else { /* * mmdrop_lazy_tlb must provide a full memory barrier, see the * membarrier comment finish_task_switch which relies on this. */ smp_mb(); } } static inline void mmdrop_lazy_tlb_sched(struct mm_struct *mm) { if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) mmdrop_sched(mm); else smp_mb(); /* see mmdrop_lazy_tlb() above */ } /** * mmget() - Pin the address space associated with a &struct mm_struct. * @mm: The address space to pin. * * Make sure that the address space of the given &struct mm_struct doesn't * go away. This does not protect against parts of the address space being * modified or freed, however. * * Never use this function to pin this address space for an * unbounded/indefinite amount of time. * * Use mmput() to release the reference acquired by mmget(). * * See also <Documentation/mm/active_mm.rst> for an in-depth explanation * of &mm_struct.mm_count vs &mm_struct.mm_users. */ static inline void mmget(struct mm_struct *mm) { atomic_inc(&mm->mm_users); } static inline bool mmget_not_zero(struct mm_struct *mm) { return atomic_inc_not_zero(&mm->mm_users); } /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); #if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH) /* same as above but performs the slow path from the async context. Can * be called from the atomic context as well */ void mmput_async(struct mm_struct *); #endif /* Grab a reference to a task's mm, if it is not already going away */ extern struct mm_struct *get_task_mm(struct task_struct *task); /* * Grab a reference to a task's mm, if it is not already going away * and ptrace_may_access with the mode parameter passed to it * succeeds. 
*/ extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); /* Remove the current tasks stale references to the old mm_struct on exit() */ extern void exit_mm_release(struct task_struct *, struct mm_struct *); /* Remove the current tasks stale references to the old mm_struct on exec() */ extern void exec_mm_release(struct task_struct *, struct mm_struct *); #ifdef CONFIG_MEMCG extern void mm_update_next_owner(struct mm_struct *mm); #else static inline void mm_update_next_owner(struct mm_struct *mm) { } #endif /* CONFIG_MEMCG */ #ifdef CONFIG_MMU #ifndef arch_get_mmap_end #define arch_get_mmap_end(addr, len, flags) (TASK_SIZE) #endif #ifndef arch_get_mmap_base #define arch_get_mmap_base(addr, base) (base) #endif extern void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack); unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t); unsigned long mm_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); unsigned long mm_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); unsigned long generic_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); unsigned long generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); #else static inline void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) {} #endif static inline bool in_vfork(struct task_struct *tsk) { bool ret; /* * need RCU to access ->real_parent if CLONE_VM was used along with * CLONE_PARENT. * * We check real_parent->mm == tsk->mm because CLONE_VFORK does not * imply CLONE_VM * * CLONE_VFORK can be used with CLONE_PARENT/CLONE_THREAD and thus * ->real_parent is not necessarily the task doing vfork(), so in * theory we can't rely on task_lock() if we want to dereference it. * * And in this case we can't trust the real_parent->mm == tsk->mm * check, it can be false negative. But we do not care, if init or * another oom-unkillable task does this it should blame itself. */ rcu_read_lock(); ret = tsk->vfork_done && rcu_dereference(tsk->real_parent)->mm == tsk->mm; rcu_read_unlock(); return ret; } /* * Applies per-task gfp context to the given allocation flags. 
* PF_MEMALLOC_NOIO implies GFP_NOIO * PF_MEMALLOC_NOFS implies GFP_NOFS * PF_MEMALLOC_PIN implies !GFP_MOVABLE */ static inline gfp_t current_gfp_context(gfp_t flags) { unsigned int pflags = READ_ONCE(current->flags); if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) { /* * NOIO implies both NOIO and NOFS and it is a weaker context * so always make sure it makes precedence */ if (pflags & PF_MEMALLOC_NOIO) flags &= ~(__GFP_IO | __GFP_FS); else if (pflags & PF_MEMALLOC_NOFS) flags &= ~__GFP_FS; if (pflags & PF_MEMALLOC_PIN) flags &= ~__GFP_MOVABLE; } return flags; } #ifdef CONFIG_LOCKDEP extern void __fs_reclaim_acquire(unsigned long ip); extern void __fs_reclaim_release(unsigned long ip); extern void fs_reclaim_acquire(gfp_t gfp_mask); extern void fs_reclaim_release(gfp_t gfp_mask); #else static inline void __fs_reclaim_acquire(unsigned long ip) { } static inline void __fs_reclaim_release(unsigned long ip) { } static inline void fs_reclaim_acquire(gfp_t gfp_mask) { } static inline void fs_reclaim_release(gfp_t gfp_mask) { } #endif /* Any memory-allocation retry loop should use * memalloc_retry_wait(), and pass the flags for the most * constrained allocation attempt that might have failed. * This provides useful documentation of where loops are, * and a central place to fine tune the waiting as the MM * implementation changes. */ static inline void memalloc_retry_wait(gfp_t gfp_flags) { /* We use io_schedule_timeout because waiting for memory * typically included waiting for dirty pages to be * written out, which requires IO. */ __set_current_state(TASK_UNINTERRUPTIBLE); gfp_flags = current_gfp_context(gfp_flags); if (gfpflags_allow_blocking(gfp_flags) && !(gfp_flags & __GFP_NORETRY)) /* Probably waited already, no need for much more */ io_schedule_timeout(1); else /* Probably didn't wait, and has now released a lock, * so now is a good time to wait */ io_schedule_timeout(HZ/50); } /** * might_alloc - Mark possible allocation sites * @gfp_mask: gfp_t flags that would be used to allocate * * Similar to might_sleep() and other annotations, this can be used in functions * that might allocate, but often don't. Compiles to nothing without * CONFIG_LOCKDEP. Includes a conditional might_sleep() if @gfp allows blocking. */ static inline void might_alloc(gfp_t gfp_mask) { fs_reclaim_acquire(gfp_mask); fs_reclaim_release(gfp_mask); if (current->flags & PF_MEMALLOC) return; might_sleep_if(gfpflags_allow_blocking(gfp_mask)); } /** * memalloc_flags_save - Add a PF_* flag to current->flags, save old value * @flags: Flags to add. * * This allows PF_* flags to be conveniently added, irrespective of current * value, and then the old version restored with memalloc_flags_restore(). */ static inline unsigned memalloc_flags_save(unsigned flags) { unsigned oldflags = ~current->flags & flags; current->flags |= flags; return oldflags; } static inline void memalloc_flags_restore(unsigned flags) { current->flags &= ~flags; } /** * memalloc_noio_save - Marks implicit GFP_NOIO allocation scope. * * This functions marks the beginning of the GFP_NOIO allocation scope. * All further allocations will implicitly drop __GFP_IO flag and so * they are safe for the IO critical section from the allocation recursion * point of view. Use memalloc_noio_restore to end the scope with flags * returned by this function. * * Context: This function is safe to be used from any context. * Return: The saved flags to be passed to memalloc_noio_restore. 
*/ static inline unsigned int memalloc_noio_save(void) { return memalloc_flags_save(PF_MEMALLOC_NOIO); } /** * memalloc_noio_restore - Ends the implicit GFP_NOIO scope. * @flags: Flags to restore. * * Ends the implicit GFP_NOIO scope started by memalloc_noio_save function. * Always make sure that the given flags is the return value from the * pairing memalloc_noio_save call. */ static inline void memalloc_noio_restore(unsigned int flags) { memalloc_flags_restore(flags); } /** * memalloc_nofs_save - Marks implicit GFP_NOFS allocation scope. * * This functions marks the beginning of the GFP_NOFS allocation scope. * All further allocations will implicitly drop __GFP_FS flag and so * they are safe for the FS critical section from the allocation recursion * point of view. Use memalloc_nofs_restore to end the scope with flags * returned by this function. * * Context: This function is safe to be used from any context. * Return: The saved flags to be passed to memalloc_nofs_restore. */ static inline unsigned int memalloc_nofs_save(void) { return memalloc_flags_save(PF_MEMALLOC_NOFS); } /** * memalloc_nofs_restore - Ends the implicit GFP_NOFS scope. * @flags: Flags to restore. * * Ends the implicit GFP_NOFS scope started by memalloc_nofs_save function. * Always make sure that the given flags is the return value from the * pairing memalloc_nofs_save call. */ static inline void memalloc_nofs_restore(unsigned int flags) { memalloc_flags_restore(flags); } /** * memalloc_noreclaim_save - Marks implicit __GFP_MEMALLOC scope. * * This function marks the beginning of the __GFP_MEMALLOC allocation scope. * All further allocations will implicitly add the __GFP_MEMALLOC flag, which * prevents entering reclaim and allows access to all memory reserves. This * should only be used when the caller guarantees the allocation will allow more * memory to be freed very shortly, i.e. it needs to allocate some memory in * the process of freeing memory, and cannot reclaim due to potential recursion. * * Users of this scope have to be extremely careful to not deplete the reserves * completely and implement a throttling mechanism which controls the * consumption of the reserve based on the amount of freed memory. Usage of a * pre-allocated pool (e.g. mempool) should be always considered before using * this scope. * * Individual allocations under the scope can opt out using __GFP_NOMEMALLOC * * Context: This function should not be used in an interrupt context as that one * does not give PF_MEMALLOC access to reserves. * See __gfp_pfmemalloc_flags(). * Return: The saved flags to be passed to memalloc_noreclaim_restore. */ static inline unsigned int memalloc_noreclaim_save(void) { return memalloc_flags_save(PF_MEMALLOC); } /** * memalloc_noreclaim_restore - Ends the implicit __GFP_MEMALLOC scope. * @flags: Flags to restore. * * Ends the implicit __GFP_MEMALLOC scope started by memalloc_noreclaim_save * function. Always make sure that the given flags is the return value from the * pairing memalloc_noreclaim_save call. */ static inline void memalloc_noreclaim_restore(unsigned int flags) { memalloc_flags_restore(flags); } /** * memalloc_pin_save - Marks implicit ~__GFP_MOVABLE scope. * * This function marks the beginning of the ~__GFP_MOVABLE allocation scope. * All further allocations will implicitly remove the __GFP_MOVABLE flag, which * will constraint the allocations to zones that allow long term pinning, i.e. * not ZONE_MOVABLE zones. * * Return: The saved flags to be passed to memalloc_pin_restore. 
 */
static inline unsigned int memalloc_pin_save(void)
{
	return memalloc_flags_save(PF_MEMALLOC_PIN);
}

/**
 * memalloc_pin_restore - Ends the implicit ~__GFP_MOVABLE scope.
 * @flags: Flags to restore.
 *
 * Ends the implicit ~__GFP_MOVABLE scope started by memalloc_pin_save function.
 * Always make sure that the given flags is the return value from the pairing
 * memalloc_pin_save call.
 */
static inline void memalloc_pin_restore(unsigned int flags)
{
	memalloc_flags_restore(flags);
}

#ifdef CONFIG_MEMCG
DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
/**
 * set_active_memcg - Starts the remote memcg charging scope.
 * @memcg: memcg to charge.
 *
 * This function marks the beginning of the remote memcg charging scope. All the
 * __GFP_ACCOUNT allocations till the end of the scope will be charged to the
 * given memcg.
 *
 * Please, make sure that caller has a reference to the passed memcg structure,
 * so its lifetime is guaranteed to exceed the scope between two
 * set_active_memcg() calls.
 *
 * NOTE: This function can nest. Users must save the return value and
 * reset the previous value after their own charging scope is over.
 */
static inline struct mem_cgroup *
set_active_memcg(struct mem_cgroup *memcg)
{
	struct mem_cgroup *old;

	if (!in_task()) {
		old = this_cpu_read(int_active_memcg);
		this_cpu_write(int_active_memcg, memcg);
	} else {
		old = current->active_memcg;
		current->active_memcg = memcg;
	}

	return old;
}
#else
static inline struct mem_cgroup *
set_active_memcg(struct mem_cgroup *memcg)
{
	return NULL;
}
#endif

#ifdef CONFIG_MEMBARRIER
enum {
	MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY		= (1U << 0),
	MEMBARRIER_STATE_PRIVATE_EXPEDITED			= (1U << 1),
	MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY			= (1U << 2),
	MEMBARRIER_STATE_GLOBAL_EXPEDITED			= (1U << 3),
	MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY	= (1U << 4),
	MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE		= (1U << 5),
	MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY		= (1U << 6),
	MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ			= (1U << 7),
};

enum {
	MEMBARRIER_FLAG_SYNC_CORE	= (1U << 0),
	MEMBARRIER_FLAG_RSEQ		= (1U << 1),
};

#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
#include <asm/membarrier.h>
#endif

static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
{
	/*
	 * The atomic_read() below prevents CSE. The following should
	 * help the compiler generate more efficient code on architectures
	 * where sync_core_before_usermode() is a no-op.
	 */
	if (!IS_ENABLED(CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE))
		return;
	if (current->mm != mm)
		return;
	if (likely(!(atomic_read(&mm->membarrier_state) &
		     MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE)))
		return;
	sync_core_before_usermode();
}

extern void membarrier_exec_mmap(struct mm_struct *mm);

extern void membarrier_update_current_mm(struct mm_struct *next_mm);

#else
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
					     struct mm_struct *next,
					     struct task_struct *tsk)
{
}
#endif
static inline void membarrier_exec_mmap(struct mm_struct *mm)
{
}
static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
{
}
static inline void membarrier_update_current_mm(struct mm_struct *next_mm)
{
}
#endif
#endif /* _LINUX_SCHED_MM_H */
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
*/ #ifndef __XFS_MOUNT_H__ #define __XFS_MOUNT_H__ struct xlog; struct xfs_inode; struct xfs_mru_cache; struct xfs_ail; struct xfs_quotainfo; struct xfs_da_geometry; struct xfs_perag; struct xfs_healthmon; /* dynamic preallocation free space thresholds, 5% down to 1% */ enum { XFS_LOWSP_1_PCNT = 0, XFS_LOWSP_2_PCNT, XFS_LOWSP_3_PCNT, XFS_LOWSP_4_PCNT, XFS_LOWSP_5_PCNT, XFS_LOWSP_MAX, }; /* * Error Configuration * * Error classes define the subsystem the configuration belongs to. * Error numbers define the errors that are configurable. */ enum { XFS_ERR_METADATA, XFS_ERR_CLASS_MAX, }; enum { XFS_ERR_DEFAULT, XFS_ERR_EIO, XFS_ERR_ENOSPC, XFS_ERR_ENODEV, XFS_ERR_ERRNO_MAX, }; #define XFS_ERR_RETRY_FOREVER -1 /* * Although retry_timeout is in jiffies which is normally an unsigned long, * we limit the retry timeout to 86400 seconds, or one day. So even a * signed 32-bit long is sufficient for a HZ value up to 24855. Making it * signed lets us store the special "-1" value, meaning retry forever. */ struct xfs_error_cfg { struct xfs_kobj kobj; int max_retries; long retry_timeout; /* in jiffies, -1 = infinite */ }; /* * Per-cpu deferred inode inactivation GC lists. */ struct xfs_inodegc { struct xfs_mount *mp; struct llist_head list; struct delayed_work work; int error; /* approximate count of inodes in the list */ unsigned int items; unsigned int shrinker_hits; unsigned int cpu; }; /* * Container for each type of groups, used to look up individual groups and * describes the geometry. */ struct xfs_groups { struct xarray xa; /* * Maximum capacity of the group in FSBs. * * Each group is laid out densely in the daddr space. For the * degenerate case of a pre-rtgroups filesystem, the incore rtgroup * pretends to have a zero-block and zero-blklog rtgroup. */ uint32_t blocks; /* * Log(2) of the logical size of each group. * * Compared to the blocks field above this is rounded up to the next * power of two, and thus lays out the xfs_fsblock_t/xfs_rtblock_t * space sparsely with a hole from blocks to (1 << blklog) at the end * of each group. */ uint8_t blklog; /* * Zoned devices can have gaps beyond the usable capacity of a zone and * the end in the LBA/daddr address space. In other words, the hardware * equivalent to the RT groups already takes care of the power of 2 * alignment for us. In this case the sparse FSB/RTB address space maps * 1:1 to the device address space. */ bool has_daddr_gaps; /* * Mask to extract the group-relative block number from a FSB. * For a pre-rtgroups filesystem we pretend to have one very large * rtgroup, so this mask must be 64-bit. */ uint64_t blkmask; /* * Start of the first group in the device. This is used to support a * RT device following the data device on the same block device for * SMR hard drives. */ xfs_fsblock_t start_fsb; /* * Maximum length of an atomic write for files stored in this * collection of allocation groups, in fsblocks. */ xfs_extlen_t awu_max; }; struct xfs_freecounter { /* free blocks for general use: */ struct percpu_counter count; /* total reserved blocks: */ uint64_t res_total; /* available reserved blocks: */ uint64_t res_avail; /* reserved blks @ remount,ro: */ uint64_t res_saved; }; /* * The struct xfsmount layout is optimised to separate read-mostly variables * from variables that are frequently modified. We put the read-mostly variables * first, then place all the other variables at the end. 
* * Typically, read-mostly variables are those that are set at mount time and * never changed again, or only change rarely as a result of things like sysfs * knobs being tweaked. */ typedef struct xfs_mount { struct xfs_sb m_sb; /* copy of fs superblock */ struct super_block *m_super; struct xfs_ail *m_ail; /* fs active log item list */ struct xfs_buf *m_sb_bp; /* buffer for superblock */ struct xfs_buf *m_rtsb_bp; /* realtime superblock */ char *m_rtname; /* realtime device name */ char *m_logname; /* external log device name */ struct xfs_da_geometry *m_dir_geo; /* directory block geometry */ struct xfs_da_geometry *m_attr_geo; /* attribute block geometry */ struct xlog *m_log; /* log specific stuff */ struct xfs_inode *m_rootip; /* pointer to root directory */ struct xfs_inode *m_metadirip; /* ptr to metadata directory */ struct xfs_inode *m_rtdirip; /* ptr to realtime metadir */ struct xfs_quotainfo *m_quotainfo; /* disk quota information */ struct xfs_buftarg *m_ddev_targp; /* data device */ struct xfs_buftarg *m_logdev_targp;/* log device */ struct xfs_buftarg *m_rtdev_targp; /* rt device */ void __percpu *m_inodegc; /* percpu inodegc structures */ struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ struct workqueue_struct *m_buf_workqueue; struct workqueue_struct *m_unwritten_workqueue; struct workqueue_struct *m_reclaim_workqueue; struct workqueue_struct *m_sync_workqueue; struct workqueue_struct *m_blockgc_wq; struct workqueue_struct *m_inodegc_wq; int m_bsize; /* fs logical block size */ uint8_t m_blkbit_log; /* blocklog + NBBY */ uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ uint8_t m_agno_log; /* log #ag's */ uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ int8_t m_rtxblklog; /* log2 of rextsize, if possible */ uint m_blockmask; /* sb_blocksize-1 */ uint m_blockwsize; /* sb_blocksize in words */ /* number of rt extents per rt bitmap block if rtgroups enabled */ unsigned int m_rtx_per_rbmblock; uint m_alloc_mxr[2]; /* max alloc btree records */ uint m_alloc_mnr[2]; /* min alloc btree records */ uint m_bmap_dmxr[2]; /* max bmap btree records */ uint m_bmap_dmnr[2]; /* min bmap btree records */ uint m_rmap_mxr[2]; /* max rmap btree records */ uint m_rmap_mnr[2]; /* min rmap btree records */ uint m_rtrmap_mxr[2]; /* max rtrmap btree records */ uint m_rtrmap_mnr[2]; /* min rtrmap btree records */ uint m_refc_mxr[2]; /* max refc btree records */ uint m_refc_mnr[2]; /* min refc btree records */ uint m_rtrefc_mxr[2]; /* max rtrefc btree records */ uint m_rtrefc_mnr[2]; /* min rtrefc btree records */ uint m_alloc_maxlevels; /* max alloc btree levels */ uint m_bm_maxlevels[2]; /* max bmap btree levels */ uint m_rmap_maxlevels; /* max rmap btree levels */ uint m_rtrmap_maxlevels; /* max rtrmap btree level */ uint m_refc_maxlevels; /* max refcount btree level */ uint m_rtrefc_maxlevels; /* max rtrefc btree level */ unsigned int m_agbtree_maxlevels; /* max level of all AG btrees */ unsigned int m_rtbtree_maxlevels; /* max level of all rt btrees */ xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */ uint m_alloc_set_aside; /* space we can't use */ uint m_ag_max_usable; /* max space per AG */ int m_dalign; /* stripe unit */ int m_swidth; /* stripe width */ xfs_agnumber_t m_maxagi; /* highest inode alloc group */ uint m_allocsize_log;/* min write size log bytes */ uint m_allocsize_blocks; /* min write size blocks */ int m_logbufs; /* number of log buffers */ int m_logbsize; /* size of each log buffer */ unsigned int m_rsumlevels; /* rt summary levels */ xfs_filblks_t 
m_rsumblocks; /* size of rt summary, FSBs */ int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_qflags; /* quota status flags */ uint64_t m_features; /* active filesystem features */ uint64_t m_low_space[XFS_LOWSP_MAX]; uint64_t m_low_rtexts[XFS_LOWSP_MAX]; uint64_t m_rtxblkmask; /* rt extent block mask */ struct xfs_ino_geometry m_ino_geo; /* inode geometry */ struct xfs_trans_resv m_resv; /* precomputed res values */ /* low free space thresholds */ unsigned long m_opstate; /* dynamic state flags */ bool m_always_cow; bool m_fail_unmount; bool m_finobt_nores; /* no per-AG finobt resv. */ bool m_update_sb; /* sb needs update in mount */ unsigned int m_max_open_zones; unsigned int m_zonegc_low_space; /* max_atomic_write mount option value */ unsigned long long m_awu_max_bytes; /* * Bitsets of per-fs metadata that have been checked and/or are sick. * Callers must hold m_sb_lock to access these two fields. */ uint8_t m_fs_checked; uint8_t m_fs_sick; /* * Bitsets of rt metadata that have been checked and/or are sick. * Callers must hold m_sb_lock to access this field. */ uint8_t m_rt_checked; uint8_t m_rt_sick; /* * End of read-mostly variables. Frequently written variables and locks * should be placed below this comment from now on. The first variable * here is marked as cacheline aligned so they it is separated from * the read-mostly variables. */ spinlock_t ____cacheline_aligned m_sb_lock; /* sb counter lock */ struct percpu_counter m_icount; /* allocated inodes counter */ struct percpu_counter m_ifree; /* free inodes counter */ struct xfs_freecounter m_free[XC_FREE_NR]; /* * Count of data device blocks reserved for delayed allocations, * including indlen blocks. Does not include allocated CoW staging * extents or anything related to the rt device. */ struct percpu_counter m_delalloc_blks; /* * RT version of the above. */ struct percpu_counter m_delalloc_rtextents; /* * Global count of allocation btree blocks in use across all AGs. Only * used when perag reservation is enabled. Helps prevent block * reservation from attempting to reserve allocation btree blocks. */ atomic64_t m_allocbt_blks; struct xfs_groups m_groups[XG_TYPE_MAX]; struct delayed_work m_reclaim_work; /* background inode reclaim */ struct xfs_zone_info *m_zone_info; /* zone allocator information */ struct dentry *m_debugfs; /* debugfs parent */ struct xfs_kobj m_kobj; struct xfs_kobj m_error_kobj; struct xfs_kobj m_error_meta_kobj; struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX]; struct xstats m_stats; /* per-fs stats */ #ifdef CONFIG_XFS_ONLINE_SCRUB_STATS struct xchk_stats *m_scrub_stats; #endif struct xfs_kobj m_zoned_kobj; xfs_agnumber_t m_agfrotor; /* last ag where space found */ atomic_t m_agirotor; /* last ag dir inode alloced */ atomic_t m_rtgrotor; /* last rtgroup rtpicked */ struct mutex m_metafile_resv_lock; uint64_t m_metafile_resv_target; uint64_t m_metafile_resv_used; uint64_t m_metafile_resv_avail; /* Memory shrinker to throttle and reprioritize inodegc */ struct shrinker *m_inodegc_shrinker; /* * Workqueue item so that we can coalesce multiple inode flush attempts * into a single flush. */ struct work_struct m_flush_inodes_work; /* * Generation of the filesysyem layout. This is incremented by each * growfs, and used by the pNFS server to ensure the client updates * its view of the block device once it gets a layout that might * reference the newly added blocks. 
Does not need to be persistent * as long as we only allow file system size increments, but if we * ever support shrinks it would have to be persisted in addition * to various other kinds of pain inflicted on the pNFS server. */ uint32_t m_generation; struct mutex m_growlock; /* growfs mutex */ #ifdef DEBUG /* * Frequency with which errors are injected. Replaces xfs_etest; the * value stored in here is the inverse of the frequency with which the * error triggers. 1 = always, 2 = half the time, etc. */ unsigned int *m_errortag; struct xfs_kobj m_errortag_kobj; #endif /* cpus that have inodes queued for inactivation */ struct cpumask m_inodegc_cpumask; /* Hook to feed dirent updates to an active online repair. */ struct xfs_hooks m_dir_update_hooks; /* Private data referring to a health monitor object. */ struct xfs_healthmon __rcu *m_healthmon; /* Index of uuid record in the uuid xarray. */ unsigned int m_uuid_table_index; } xfs_mount_t; #define M_IGEO(mp) (&(mp)->m_ino_geo) /* * Flags for m_features. * * These are all the active features in the filesystem, regardless of how * they are configured. */ #define XFS_FEAT_ATTR (1ULL << 0) /* xattrs present in fs */ #define XFS_FEAT_NLINK (1ULL << 1) /* 32 bit link counts */ #define XFS_FEAT_QUOTA (1ULL << 2) /* quota active */ #define XFS_FEAT_ALIGN (1ULL << 3) /* inode alignment */ #define XFS_FEAT_DALIGN (1ULL << 4) /* data alignment */ #define XFS_FEAT_LOGV2 (1ULL << 5) /* version 2 logs */ #define XFS_FEAT_SECTOR (1ULL << 6) /* sector size > 512 bytes */ #define XFS_FEAT_EXTFLG (1ULL << 7) /* unwritten extents */ #define XFS_FEAT_ASCIICI (1ULL << 8) /* ASCII only case-insens. */ #define XFS_FEAT_LAZYSBCOUNT (1ULL << 9) /* Superblk counters */ #define XFS_FEAT_PARENT (1ULL << 11) /* parent pointers */ #define XFS_FEAT_PROJID32 (1ULL << 12) /* 32 bit project id */ #define XFS_FEAT_CRC (1ULL << 13) /* metadata CRCs */ #define XFS_FEAT_V3INODES (1ULL << 14) /* Version 3 inodes */ #define XFS_FEAT_PQUOTINO (1ULL << 15) /* non-shared proj/grp quotas */ #define XFS_FEAT_FTYPE (1ULL << 16) /* inode type in dir */ #define XFS_FEAT_FINOBT (1ULL << 17) /* free inode btree */ #define XFS_FEAT_RMAPBT (1ULL << 18) /* reverse map btree */ #define XFS_FEAT_REFLINK (1ULL << 19) /* reflinked files */ #define XFS_FEAT_SPINODES (1ULL << 20) /* sparse inode chunks */ #define XFS_FEAT_META_UUID (1ULL << 21) /* metadata UUID */ #define XFS_FEAT_REALTIME (1ULL << 22) /* realtime device present */ #define XFS_FEAT_INOBTCNT (1ULL << 23) /* inobt block counts */ #define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */ #define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */ #define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */ #define XFS_FEAT_EXCHANGE_RANGE (1ULL << 27) /* exchange range */ #define XFS_FEAT_METADIR (1ULL << 28) /* metadata directory tree */ #define XFS_FEAT_ZONED (1ULL << 29) /* zoned RT device */ /* Mount features */ #define XFS_FEAT_NOLIFETIME (1ULL << 47) /* disable lifetime hints */ #define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */ #define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */ #define XFS_FEAT_LARGE_IOSIZE (1ULL << 51) /* report large preferred * I/O size in stat() */ #define XFS_FEAT_WSYNC (1ULL << 52) /* synchronous metadata ops */ #define XFS_FEAT_DIRSYNC (1ULL << 53) /* synchronous directory ops */ #define XFS_FEAT_DISCARD (1ULL << 54) /* discard unused blocks */ #define XFS_FEAT_GRPID (1ULL << 55) /* group-ID assigned from directory */ #define XFS_FEAT_SMALL_INUMS (1ULL << 56) 
/* user wants 32bit inodes */ #define XFS_FEAT_SWALLOC (1ULL << 58) /* stripe width allocation */ #define XFS_FEAT_FILESTREAMS (1ULL << 59) /* use filestreams allocator */ #define XFS_FEAT_DAX_ALWAYS (1ULL << 60) /* DAX always enabled */ #define XFS_FEAT_DAX_NEVER (1ULL << 61) /* DAX never enabled */ #define XFS_FEAT_NORECOVERY (1ULL << 62) /* no recovery - dirty fs */ #define XFS_FEAT_NOUUID (1ULL << 63) /* ignore uuid during mount */ #define __XFS_HAS_FEAT(name, NAME) \ static inline bool xfs_has_ ## name (const struct xfs_mount *mp) \ { \ return mp->m_features & XFS_FEAT_ ## NAME; \ } /* Some features can be added dynamically so they need a set wrapper, too. */ #define __XFS_ADD_FEAT(name, NAME) \ __XFS_HAS_FEAT(name, NAME); \ static inline void xfs_add_ ## name (struct xfs_mount *mp) \ { \ mp->m_features |= XFS_FEAT_ ## NAME; \ xfs_sb_version_add ## name(&mp->m_sb); \ } /* Superblock features */ __XFS_ADD_FEAT(attr, ATTR) __XFS_HAS_FEAT(nlink, NLINK) __XFS_ADD_FEAT(quota, QUOTA) __XFS_HAS_FEAT(dalign, DALIGN) __XFS_HAS_FEAT(sector, SECTOR) __XFS_HAS_FEAT(asciici, ASCIICI) __XFS_HAS_FEAT(parent, PARENT) __XFS_HAS_FEAT(ftype, FTYPE) __XFS_HAS_FEAT(finobt, FINOBT) __XFS_HAS_FEAT(rmapbt, RMAPBT) __XFS_HAS_FEAT(reflink, REFLINK) __XFS_HAS_FEAT(sparseinodes, SPINODES) __XFS_HAS_FEAT(metauuid, META_UUID) __XFS_HAS_FEAT(realtime, REALTIME) __XFS_HAS_FEAT(inobtcounts, INOBTCNT) __XFS_HAS_FEAT(bigtime, BIGTIME) __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR) __XFS_HAS_FEAT(large_extent_counts, NREXT64) __XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE) __XFS_HAS_FEAT(metadir, METADIR) __XFS_HAS_FEAT(zoned, ZONED) __XFS_HAS_FEAT(nolifetime, NOLIFETIME) static inline bool xfs_has_rtgroups(const struct xfs_mount *mp) { /* all metadir file systems also allow rtgroups */ return xfs_has_metadir(mp); } static inline bool xfs_has_rtsb(const struct xfs_mount *mp) { /* all rtgroups filesystems with an rt section have an rtsb */ return xfs_has_rtgroups(mp) && xfs_has_realtime(mp) && !xfs_has_zoned(mp); } static inline bool xfs_has_rtrmapbt(const struct xfs_mount *mp) { return xfs_has_rtgroups(mp) && xfs_has_realtime(mp) && xfs_has_rmapbt(mp); } static inline bool xfs_has_rtreflink(const struct xfs_mount *mp) { return xfs_has_metadir(mp) && xfs_has_realtime(mp) && xfs_has_reflink(mp); } static inline bool xfs_has_nonzoned(const struct xfs_mount *mp) { return !xfs_has_zoned(mp); } static inline bool xfs_can_sw_atomic_write(struct xfs_mount *mp) { return xfs_has_reflink(mp); } /* * Some features are always on for v5 file systems, allow the compiler to * eliminiate dead code when building without v4 support. 
*/ #define __XFS_HAS_V4_FEAT(name, NAME) \ static inline bool xfs_has_ ## name (struct xfs_mount *mp) \ { \ return !IS_ENABLED(CONFIG_XFS_SUPPORT_V4) || \ (mp->m_features & XFS_FEAT_ ## NAME); \ } #define __XFS_ADD_V4_FEAT(name, NAME) \ __XFS_HAS_V4_FEAT(name, NAME); \ static inline void xfs_add_ ## name (struct xfs_mount *mp) \ { \ if (IS_ENABLED(CONFIG_XFS_SUPPORT_V4)) { \ mp->m_features |= XFS_FEAT_ ## NAME; \ xfs_sb_version_add ## name(&mp->m_sb); \ } \ } __XFS_HAS_V4_FEAT(align, ALIGN) __XFS_HAS_V4_FEAT(logv2, LOGV2) __XFS_HAS_V4_FEAT(extflg, EXTFLG) __XFS_HAS_V4_FEAT(lazysbcount, LAZYSBCOUNT) __XFS_ADD_V4_FEAT(projid32, PROJID32) __XFS_HAS_V4_FEAT(v3inodes, V3INODES) __XFS_HAS_V4_FEAT(crc, CRC) __XFS_HAS_V4_FEAT(pquotino, PQUOTINO) static inline void xfs_add_attr2(struct xfs_mount *mp) { if (IS_ENABLED(CONFIG_XFS_SUPPORT_V4)) xfs_sb_version_addattr2(&mp->m_sb); } /* * Mount features * * These do not change dynamically - features that can come and go, such as 32 * bit inodes and read-only state, are kept as operational state rather than * features. */ __XFS_HAS_FEAT(noalign, NOALIGN) __XFS_HAS_FEAT(allocsize, ALLOCSIZE) __XFS_HAS_FEAT(large_iosize, LARGE_IOSIZE) __XFS_HAS_FEAT(wsync, WSYNC) __XFS_HAS_FEAT(dirsync, DIRSYNC) __XFS_HAS_FEAT(discard, DISCARD) __XFS_HAS_FEAT(grpid, GRPID) __XFS_HAS_FEAT(small_inums, SMALL_INUMS) __XFS_HAS_FEAT(swalloc, SWALLOC) __XFS_HAS_FEAT(filestreams, FILESTREAMS) __XFS_HAS_FEAT(dax_always, DAX_ALWAYS) __XFS_HAS_FEAT(dax_never, DAX_NEVER) __XFS_HAS_FEAT(norecovery, NORECOVERY) __XFS_HAS_FEAT(nouuid, NOUUID) /* * Operational mount state flags * * Use these with atomic bit ops only! */ #define XFS_OPSTATE_UNMOUNTING 0 /* filesystem is unmounting */ #define XFS_OPSTATE_CLEAN 1 /* mount was clean */ #define XFS_OPSTATE_SHUTDOWN 2 /* stop all fs operations */ #define XFS_OPSTATE_INODE32 3 /* inode32 allocator active */ #define XFS_OPSTATE_READONLY 4 /* read-only fs */ /* * If set, inactivation worker threads will be scheduled to process queued * inodegc work. If not, queued inodes remain in memory waiting to be * processed. */ #define XFS_OPSTATE_INODEGC_ENABLED 5 /* * If set, background speculative prealloc gc worker threads will be scheduled * to process queued blockgc work. If not, inodes retain their preallocations * until explicitly deleted. */ #define XFS_OPSTATE_BLOCKGC_ENABLED 6 /* Kernel has logged a warning about shrink being used on this fs. */ #define XFS_OPSTATE_WARNED_SHRINK 9 /* Kernel has logged a warning about logged xattr updates being used. */ #define XFS_OPSTATE_WARNED_LARP 10 /* Mount time quotacheck is running */ #define XFS_OPSTATE_QUOTACHECK_RUNNING 11 /* Do we want to clear log incompat flags? */ #define XFS_OPSTATE_UNSET_LOG_INCOMPAT 12 /* Filesystem can use logged extended attributes */ #define XFS_OPSTATE_USE_LARP 13 /* Kernel has logged a warning about blocksize > pagesize on this fs. */ #define XFS_OPSTATE_WARNED_LBS 14 /* Kernel has logged a warning about metadata dirs being used on this fs. */ #define XFS_OPSTATE_WARNED_METADIR 17 /* Filesystem should use qflags to determine quotaon status */ #define XFS_OPSTATE_RESUMING_QUOTAON 18 /* Kernel has logged a warning about zoned RT device being used on this fs. 
*/ #define XFS_OPSTATE_WARNED_ZONED 19 /* (Zoned) GC is in progress */ #define XFS_OPSTATE_ZONEGC_RUNNING 20 #define __XFS_IS_OPSTATE(name, NAME) \ static inline bool xfs_is_ ## name (struct xfs_mount *mp) \ { \ return test_bit(XFS_OPSTATE_ ## NAME, &mp->m_opstate); \ } \ static inline bool xfs_clear_ ## name (struct xfs_mount *mp) \ { \ return test_and_clear_bit(XFS_OPSTATE_ ## NAME, &mp->m_opstate); \ } \ static inline bool xfs_set_ ## name (struct xfs_mount *mp) \ { \ return test_and_set_bit(XFS_OPSTATE_ ## NAME, &mp->m_opstate); \ } __XFS_IS_OPSTATE(unmounting, UNMOUNTING) __XFS_IS_OPSTATE(clean, CLEAN) __XFS_IS_OPSTATE(shutdown, SHUTDOWN) __XFS_IS_OPSTATE(inode32, INODE32) __XFS_IS_OPSTATE(readonly, READONLY) __XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED) __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED) #ifdef CONFIG_XFS_QUOTA __XFS_IS_OPSTATE(quotacheck_running, QUOTACHECK_RUNNING) __XFS_IS_OPSTATE(resuming_quotaon, RESUMING_QUOTAON) #else static inline bool xfs_is_quotacheck_running(struct xfs_mount *mp) { return false; } static inline bool xfs_is_resuming_quotaon(struct xfs_mount *mp) { return false; } static inline void xfs_set_resuming_quotaon(struct xfs_mount *m) { } static inline bool xfs_clear_resuming_quotaon(struct xfs_mount *mp) { return false; } #endif /* CONFIG_XFS_QUOTA */ __XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT) __XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP) __XFS_IS_OPSTATE(zonegc_running, ZONEGC_RUNNING) static inline bool xfs_should_warn(struct xfs_mount *mp, long nr) { return !test_and_set_bit(nr, &mp->m_opstate); } #define XFS_OPSTATE_STRINGS \ { (1UL << XFS_OPSTATE_UNMOUNTING), "unmounting" }, \ { (1UL << XFS_OPSTATE_CLEAN), "clean" }, \ { (1UL << XFS_OPSTATE_SHUTDOWN), "shutdown" }, \ { (1UL << XFS_OPSTATE_INODE32), "inode32" }, \ { (1UL << XFS_OPSTATE_READONLY), "read_only" }, \ { (1UL << XFS_OPSTATE_INODEGC_ENABLED), "inodegc" }, \ { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \ { (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \ { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }, \ { (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" }, \ { (1UL << XFS_OPSTATE_UNSET_LOG_INCOMPAT), "unset_log_incompat" }, \ { (1UL << XFS_OPSTATE_USE_LARP), "logged_xattrs" } /* * Max and min values for mount-option defined I/O * preallocation sizes. 
*/ #define XFS_MAX_IO_LOG 30 /* 1G */ #define XFS_MIN_IO_LOG PAGE_SHIFT void xfs_do_force_shutdown(struct xfs_mount *mp, uint32_t flags, char *fname, int lnnum); #define xfs_force_shutdown(m,f) \ xfs_do_force_shutdown(m, f, __FILE__, __LINE__) #define SHUTDOWN_META_IO_ERROR (1u << 0) /* write attempt to metadata failed */ #define SHUTDOWN_LOG_IO_ERROR (1u << 1) /* write attempt to the log failed */ #define SHUTDOWN_FORCE_UMOUNT (1u << 2) /* shutdown from a forced unmount */ #define SHUTDOWN_CORRUPT_INCORE (1u << 3) /* corrupt in-memory structures */ #define SHUTDOWN_CORRUPT_ONDISK (1u << 4) /* corrupt metadata on device */ #define SHUTDOWN_DEVICE_REMOVED (1u << 5) /* device removed underneath us */ #define XFS_SHUTDOWN_STRINGS \ { SHUTDOWN_META_IO_ERROR, "metadata_io" }, \ { SHUTDOWN_LOG_IO_ERROR, "log_io" }, \ { SHUTDOWN_FORCE_UMOUNT, "force_umount" }, \ { SHUTDOWN_CORRUPT_INCORE, "corruption" }, \ { SHUTDOWN_DEVICE_REMOVED, "device_removed" } /* * Flags for xfs_mountfs */ #define XFS_MFSI_QUIET 0x40 /* Be silent if mount errors found */ static inline xfs_agnumber_t xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d) { xfs_rfsblock_t ld = XFS_BB_TO_FSBT(mp, d); do_div(ld, mp->m_sb.sb_agblocks); return (xfs_agnumber_t) ld; } static inline xfs_agblock_t xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) { xfs_rfsblock_t ld = XFS_BB_TO_FSBT(mp, d); return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks); } extern void xfs_uuid_table_free(void); uint64_t xfs_default_resblks(struct xfs_mount *mp, enum xfs_free_counter ctr); extern int xfs_mountfs(xfs_mount_t *mp); extern void xfs_unmountfs(xfs_mount_t *); /* * Deltas for the block count can vary from 1 to very large, but lock contention * only occurs on frequent small block count updates such as in the delayed * allocation path for buffered writes (page a time updates). Hence we set * a large batch count (1024) to minimise global counter updates except when * we get near to ENOSPC and we have to be very accurate with our updates. */ #define XFS_FDBLOCKS_BATCH 1024 uint64_t xfs_freecounter_unavailable(struct xfs_mount *mp, enum xfs_free_counter ctr); /* * Sum up the freecount, but never return negative values. */ static inline s64 xfs_sum_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr) { return percpu_counter_sum_positive(&mp->m_free[ctr].count); } /* * Same as above, but does return negative values. Mostly useful for * special cases like repair and tracing. */ static inline s64 xfs_sum_freecounter_raw(struct xfs_mount *mp, enum xfs_free_counter ctr) { return percpu_counter_sum(&mp->m_free[ctr].count); } /* * This just provides and estimate without the cpu-local updates, use * xfs_sum_freecounter for the exact value. 
*/ static inline s64 xfs_estimate_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr) { return percpu_counter_read_positive(&mp->m_free[ctr].count); } static inline int xfs_compare_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr, s64 rhs, s32 batch) { return __percpu_counter_compare(&mp->m_free[ctr].count, rhs, batch); } static inline void xfs_set_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr, uint64_t val) { percpu_counter_set(&mp->m_free[ctr].count, val); } int xfs_dec_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr, uint64_t delta, bool rsvd); void xfs_add_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr, uint64_t delta); static inline int xfs_dec_fdblocks(struct xfs_mount *mp, uint64_t delta, bool reserved) { return xfs_dec_freecounter(mp, XC_FREE_BLOCKS, delta, reserved); } static inline void xfs_add_fdblocks(struct xfs_mount *mp, uint64_t delta) { xfs_add_freecounter(mp, XC_FREE_BLOCKS, delta); } static inline int xfs_dec_frextents(struct xfs_mount *mp, uint64_t delta) { return xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, delta, false); } static inline void xfs_add_frextents(struct xfs_mount *mp, uint64_t delta) { xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, delta); } extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); extern bool xfs_fs_writable(struct xfs_mount *mp, int level); extern int xfs_sb_validate_fsb_count(struct xfs_sb *, uint64_t); extern int xfs_dev_is_read_only(struct xfs_mount *, char *); extern void xfs_set_low_space_thresholds(struct xfs_mount *); int xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb, xfs_off_t count_fsb); struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp, int error_class, int error); void xfs_force_summary_recalc(struct xfs_mount *mp); int xfs_add_incompat_log_feature(struct xfs_mount *mp, uint32_t feature); bool xfs_clear_incompat_log_features(struct xfs_mount *mp); void xfs_mod_delalloc(struct xfs_inode *ip, int64_t data_delta, int64_t ind_delta); static inline void xfs_mod_sb_delalloc(struct xfs_mount *mp, int64_t delta) { percpu_counter_add(&mp->m_delalloc_blks, delta); } int xfs_set_max_atomic_write_opt(struct xfs_mount *mp, unsigned long long new_max_bytes); static inline struct xfs_buftarg * xfs_group_type_buftarg( struct xfs_mount *mp, enum xfs_group_type type) { switch (type) { case XG_TYPE_AG: return mp->m_ddev_targp; case XG_TYPE_RTG: return mp->m_rtdev_targp; default: ASSERT(0); break; } return NULL; } #endif /* __XFS_MOUNT_H__ */ |
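To make the generated helpers above concrete, here is a hedged sketch of how callers combine the feature and opstate predicates; example_can_share_blocks() is hypothetical, while the expansion shown in the comment follows directly from the macro definitions above:

/*
 * __XFS_HAS_FEAT(reflink, REFLINK) expands to:
 *
 *	static inline bool xfs_has_reflink(const struct xfs_mount *mp)
 *	{
 *		return mp->m_features & XFS_FEAT_REFLINK;
 *	}
 *
 * and __XFS_IS_OPSTATE(shutdown, SHUTDOWN) generates xfs_is_shutdown(),
 * xfs_set_shutdown() and xfs_clear_shutdown() on top of the bit ops
 * against mp->m_opstate.
 */
static inline bool example_can_share_blocks(struct xfs_mount *mp)
{
	/* Feature bits describe fs capabilities; opstate bits track dynamic state. */
	return xfs_has_reflink(mp) && !xfs_is_shutdown(mp) &&
	       !xfs_is_readonly(mp);
}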
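The free-space counter wrappers above are likewise used as a reserve/release pair. A minimal sketch, where example_with_block_reservation() is purely hypothetical and nothing is actually consumed from the reservation:

static int example_with_block_reservation(struct xfs_mount *mp, uint64_t nblocks)
{
	int error;

	/*
	 * Pull @nblocks out of the global free-block pool; this fails with
	 * -ENOSPC when the pool is exhausted.  Passing true for the last
	 * argument would allow dipping into the reserved pool instead.
	 */
	error = xfs_dec_fdblocks(mp, nblocks, false);
	if (error)
		return error;

	/* ... the reservation would normally be consumed here ... */

	/* Nothing was allocated in this sketch, so return it all. */
	xfs_add_fdblocks(mp, nblocks);
	return 0;
}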
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* internal.h: mm/ internal definitions * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef __MM_INTERNAL_H #define __MM_INTERNAL_H #include <linux/fs.h> #include <linux/khugepaged.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/pagemap.h> #include <linux/pagewalk.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/leafops.h> #include <linux/swap_cgroup.h> #include <linux/tracepoint-defs.h> /* Internal core VMA manipulation functions. */ #include "vma.h" struct folio_batch; /* * Maintains state across a page table move. The operation assumes both source * and destination VMAs already exist and are specified by the user. * * Partial moves are permitted, but the old and new ranges must both reside * within a VMA. * * mmap lock must be held in write and VMA write locks must be held on any VMA * that is visible. * * Use the PAGETABLE_MOVE() macro to initialise this struct. * * The old_addr and new_addr fields are updated as the page table move is * executed. * * NOTE: The page table move is affected by reading from [old_addr, old_end), * and old_addr may be updated for better page table alignment, so len_in * represents the length of the range being copied as specified by the user. */ struct pagetable_move_control { struct vm_area_struct *old; /* Source VMA. */ struct vm_area_struct *new; /* Destination VMA. */ unsigned long old_addr; /* Address from which the move begins. */ unsigned long old_end; /* Exclusive address at which old range ends. */ unsigned long new_addr; /* Address to move page tables to. */ unsigned long len_in; /* Bytes to remap specified by user. */ bool need_rmap_locks; /* Do rmap locks need to be taken? */ bool for_stack; /* Is this an early temp stack being moved?
*/ }; #define PAGETABLE_MOVE(name, old_, new_, old_addr_, new_addr_, len_) \ struct pagetable_move_control name = { \ .old = old_, \ .new = new_, \ .old_addr = old_addr_, \ .old_end = (old_addr_) + (len_), \ .new_addr = new_addr_, \ .len_in = len_, \ } /* * The set of flags that only affect watermark checking and reclaim * behaviour. This is used by the MM to obey the caller constraints * about IO, FS and watermark checking while ignoring placement * hints such as HIGHMEM usage. */ #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ __GFP_NOLOCKDEP) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) /* Control allocation cpuset and node placement constraints */ #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) /* Do not use these with a slab allocator */ #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) /* * Different from WARN_ON_ONCE(), no warning will be issued * when we specify __GFP_NOWARN. */ #define WARN_ON_ONCE_GFP(cond, gfp) ({ \ static bool __section(".data..once") __warned; \ int __ret_warn_once = !!(cond); \ \ if (unlikely(!(gfp & __GFP_NOWARN) && __ret_warn_once && !__warned)) { \ __warned = true; \ WARN_ON(1); \ } \ unlikely(__ret_warn_once); \ }) void page_writeback_init(void); /* * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages, * its nr_pages_mapped would be 0x400000: choose the ENTIRELY_MAPPED bit * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently * leaves nr_pages_mapped at 0, but avoid surprise if it participates later. */ #define ENTIRELY_MAPPED 0x800000 #define FOLIO_PAGES_MAPPED (ENTIRELY_MAPPED - 1) /* * Flags passed to __show_mem() and show_free_areas() to suppress output in * various contexts. */ #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ /* * How many individual pages have an elevated _mapcount. Excludes * the folio's entire_mapcount. * * Don't use this function outside of debugging code. */ static inline int folio_nr_pages_mapped(const struct folio *folio) { if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) return -1; return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED; } /* * Retrieve the first entry of a folio based on a provided entry within the * folio. We cannot rely on folio->swap as there is no guarantee that it has * been initialized. Used for calling arch_swap_restore() */ static inline swp_entry_t folio_swap(swp_entry_t entry, const struct folio *folio) { swp_entry_t swap = { .val = ALIGN_DOWN(entry.val, folio_nr_pages(folio)), }; return swap; } static inline void *folio_raw_mapping(const struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; return (void *)(mapping & ~FOLIO_MAPPING_FLAGS); } /* * This is a file-backed mapping, and is about to be memory mapped - invoke its * mmap hook and safely handle error conditions. On error, VMA hooks will be * mutated. * * @file: File which backs the mapping. * @vma: VMA which we are mapping. * * Returns: 0 if success, error otherwise. */ static inline int mmap_file(struct file *file, struct vm_area_struct *vma) { int err = vfs_mmap(file, vma); if (likely(!err)) return 0; /* * OK, we tried to call the file hook for mmap(), but an error * arose. The mapping is in an inconsistent state and we must not invoke * any further hooks on it. 
*/ vma->vm_ops = &vma_dummy_vm_ops; return err; } /* * If the VMA has a close hook then close it, and since closing it might leave * it in an inconsistent state which makes the use of any hooks suspect, clear * them down by installing dummy empty hooks. */ static inline void vma_close(struct vm_area_struct *vma) { if (vma->vm_ops && vma->vm_ops->close) { vma->vm_ops->close(vma); /* * The mapping is in an inconsistent state, and no further hooks * may be invoked upon it. */ vma->vm_ops = &vma_dummy_vm_ops; } } /* unmap_vmas is in mm/memory.c */ void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap); #ifdef CONFIG_MMU static inline void get_anon_vma(struct anon_vma *anon_vma) { atomic_inc(&anon_vma->refcount); } void __put_anon_vma(struct anon_vma *anon_vma); static inline void put_anon_vma(struct anon_vma *anon_vma) { if (atomic_dec_and_test(&anon_vma->refcount)) __put_anon_vma(anon_vma); } static inline void anon_vma_lock_write(struct anon_vma *anon_vma) { down_write(&anon_vma->root->rwsem); } static inline int anon_vma_trylock_write(struct anon_vma *anon_vma) { return down_write_trylock(&anon_vma->root->rwsem); } static inline void anon_vma_unlock_write(struct anon_vma *anon_vma) { up_write(&anon_vma->root->rwsem); } static inline void anon_vma_lock_read(struct anon_vma *anon_vma) { down_read(&anon_vma->root->rwsem); } static inline int anon_vma_trylock_read(struct anon_vma *anon_vma) { return down_read_trylock(&anon_vma->root->rwsem); } static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) { up_read(&anon_vma->root->rwsem); } struct anon_vma *folio_get_anon_vma(const struct folio *folio); /* Operations which modify VMAs. */ enum vma_operation { VMA_OP_SPLIT, VMA_OP_MERGE_UNFAULTED, VMA_OP_REMAP, VMA_OP_FORK, }; int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src, enum vma_operation operation); int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma); int __anon_vma_prepare(struct vm_area_struct *vma); void unlink_anon_vmas(struct vm_area_struct *vma); static inline int anon_vma_prepare(struct vm_area_struct *vma) { if (likely(vma->anon_vma)) return 0; return __anon_vma_prepare(vma); } /* Flags for folio_pte_batch(). */ typedef int __bitwise fpb_t; /* Compare PTEs respecting the dirty bit. */ #define FPB_RESPECT_DIRTY ((__force fpb_t)BIT(0)) /* Compare PTEs respecting the soft-dirty bit. */ #define FPB_RESPECT_SOFT_DIRTY ((__force fpb_t)BIT(1)) /* Compare PTEs respecting the writable bit. */ #define FPB_RESPECT_WRITE ((__force fpb_t)BIT(2)) /* * Merge PTE write bits: if any PTE in the batch is writable, modify the * PTE at @ptentp to be writable. */ #define FPB_MERGE_WRITE ((__force fpb_t)BIT(3)) /* * Merge PTE young and dirty bits: if any PTE in the batch is young or dirty, * modify the PTE at @ptentp to be young or dirty, respectively. */ #define FPB_MERGE_YOUNG_DIRTY ((__force fpb_t)BIT(4)) static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) { if (!(flags & FPB_RESPECT_DIRTY)) pte = pte_mkclean(pte); if (likely(!(flags & FPB_RESPECT_SOFT_DIRTY))) pte = pte_clear_soft_dirty(pte); if (likely(!(flags & FPB_RESPECT_WRITE))) pte = pte_wrprotect(pte); return pte_mkold(pte); } /** * folio_pte_batch_flags - detect a PTE batch for a large folio * @folio: The large folio to detect a PTE batch for. * @vma: The VMA. Only relevant with FPB_MERGE_WRITE, otherwise can be NULL. * @ptep: Page table pointer for the first entry. 
* @ptentp: Pointer to a COPY of the first page table entry whose flags this * function updates based on @flags if appropriate. * @max_nr: The maximum number of table entries to consider. * @flags: Flags to modify the PTE batch semantics. * * Detect a PTE batch: consecutive (present) PTEs that map consecutive * pages of the same large folio in a single VMA and a single page table. * * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, * the accessed bit, writable bit, dirty bit (unless FPB_RESPECT_DIRTY is set) * and soft-dirty bit (unless FPB_RESPECT_SOFT_DIRTY is set). * * @ptep must map any page of the folio. max_nr must be at least one and * must be limited by the caller so scanning cannot exceed a single VMA and * a single page table. * * Depending on the FPB_MERGE_* flags, the pte stored at @ptentp will * be updated: it's crucial that a pointer to a COPY of the first * page table entry, obtained through ptep_get(), is provided as @ptentp. * * This function will be inlined to optimize based on the input parameters; * consider using folio_pte_batch() instead if applicable. * * Return: the number of table entries in the batch. */ static inline unsigned int folio_pte_batch_flags(struct folio *folio, struct vm_area_struct *vma, pte_t *ptep, pte_t *ptentp, unsigned int max_nr, fpb_t flags) { bool any_writable = false, any_young = false, any_dirty = false; pte_t expected_pte, pte = *ptentp; unsigned int nr, cur_nr; VM_WARN_ON_FOLIO(!pte_present(pte), folio); VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio); VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio); /* * Ensure this is a pointer to a copy not a pointer into a page table. * If this is a stack value, it won't be a valid virtual address, but * that's fine because it also cannot be pointing into the page table. */ VM_WARN_ON(virt_addr_valid(ptentp) && PageTable(virt_to_page(ptentp))); /* Limit max_nr to the actual remaining PFNs in the folio we could batch. */ max_nr = min_t(unsigned long, max_nr, folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte)); nr = pte_batch_hint(ptep, pte); expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags); ptep = ptep + nr; while (nr < max_nr) { pte = ptep_get(ptep); if (!pte_same(__pte_batch_clear_ignored(pte, flags), expected_pte)) break; if (flags & FPB_MERGE_WRITE) any_writable |= pte_write(pte); if (flags & FPB_MERGE_YOUNG_DIRTY) { any_young |= pte_young(pte); any_dirty |= pte_dirty(pte); } cur_nr = pte_batch_hint(ptep, pte); expected_pte = pte_advance_pfn(expected_pte, cur_nr); ptep += cur_nr; nr += cur_nr; } if (any_writable) *ptentp = pte_mkwrite(*ptentp, vma); if (any_young) *ptentp = pte_mkyoung(*ptentp); if (any_dirty) *ptentp = pte_mkdirty(*ptentp); return min(nr, max_nr); } unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte, unsigned int max_nr); /** * pte_move_swp_offset - Move the swap entry offset field of a swap pte * forward or backward by delta * @pte: The initial pte state; must be a swap entry * @delta: The direction and the offset we are moving; forward if delta * is positive; backward if delta is negative * * Moves the swap offset, while maintaining all other fields, including * swap type, and any swp pte bits. The resulting pte is returned. 
*/ static inline pte_t pte_move_swp_offset(pte_t pte, long delta) { const softleaf_t entry = softleaf_from_pte(pte); pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry), (swp_offset(entry) + delta))); if (pte_swp_soft_dirty(pte)) new = pte_swp_mksoft_dirty(new); if (pte_swp_exclusive(pte)) new = pte_swp_mkexclusive(new); if (pte_swp_uffd_wp(pte)) new = pte_swp_mkuffd_wp(new); return new; } /** * pte_next_swp_offset - Increment the swap entry offset field of a swap pte. * @pte: The initial pte state; must be a swap entry. * * Increments the swap offset, while maintaining all other fields, including * swap type, and any swp pte bits. The resulting pte is returned. */ static inline pte_t pte_next_swp_offset(pte_t pte) { return pte_move_swp_offset(pte, 1); } /** * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries * @start_ptep: Page table pointer for the first entry. * @max_nr: The maximum number of table entries to consider. * @pte: Page table entry for the first entry. * * Detect a batch of contiguous swap entries: consecutive (non-present) PTEs * containing swap entries all with consecutive offsets and targeting the same * swap type, all with matching swp pte bits. * * max_nr must be at least one and must be limited by the caller so scanning * cannot exceed a single page table. * * Return: the number of table entries in the batch. */ static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte) { pte_t expected_pte = pte_next_swp_offset(pte); const pte_t *end_ptep = start_ptep + max_nr; const softleaf_t entry = softleaf_from_pte(pte); pte_t *ptep = start_ptep + 1; unsigned short cgroup_id; VM_WARN_ON(max_nr < 1); VM_WARN_ON(!softleaf_is_swap(entry)); cgroup_id = lookup_swap_cgroup_id(entry); while (ptep < end_ptep) { softleaf_t entry; pte = ptep_get(ptep); if (!pte_same(pte, expected_pte)) break; entry = softleaf_from_pte(pte); if (lookup_swap_cgroup_id(entry) != cgroup_id) break; expected_pte = pte_next_swp_offset(expected_pte); ptep++; } return ptep - start_ptep; } #endif /* CONFIG_MMU */ void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, int nr_throttled); static inline void acct_reclaim_writeback(struct folio *folio) { pg_data_t *pgdat = folio_pgdat(folio); int nr_throttled = atomic_read(&pgdat->nr_writeback_throttled); if (nr_throttled) __acct_reclaim_writeback(pgdat, folio, nr_throttled); } static inline void wake_throttle_isolated(pg_data_t *pgdat) { wait_queue_head_t *wqh; wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED]; if (waitqueue_active(wqh)) wake_up(wqh); } vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf); static inline vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) { vm_fault_t ret = __vmf_anon_prepare(vmf); if (unlikely(ret & VM_FAULT_RETRY)) vma_end_read(vmf->vma); return ret; } vm_fault_t do_swap_page(struct vm_fault *vmf); void folio_rotate_reclaimable(struct folio *folio); bool __folio_end_writeback(struct folio *folio); void deactivate_file_folio(struct folio *folio); void folio_activate(struct folio *folio); void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *desc); void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); /** * sync_with_folio_pmd_zap - sync with concurrent zapping of a folio PMD * @mm: The mm_struct. * @pmdp: Pointer to the pmd that was found to be pmd_none(). * * When we find a pmd_none() while unmapping a folio without holding the PTL, * zap_huge_pmd() may have cleared the PMD but not yet modified the folio to * indicate that it's unmapped. 
Skipping the PMD without synchronization could * make folio unmapping code assume that unmapping failed. * * Wait for concurrent zapping to complete by grabbing the PTL. */ static inline void sync_with_folio_pmd_zap(struct mm_struct *mm, pmd_t *pmdp) { spinlock_t *ptl = pmd_lock(mm, pmdp); spin_unlock(ptl); } struct zap_details; void zap_vma_range_batched(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long size, struct zap_details *details); int zap_vma_for_reaping(struct vm_area_struct *vma); int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, gfp_t gfp); void page_cache_ra_order(struct readahead_control *, struct file_ra_state *); void force_page_cache_ra(struct readahead_control *, unsigned long nr); static inline void force_page_cache_readahead(struct address_space *mapping, struct file *file, pgoff_t index, unsigned long nr_to_read) { DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index); force_page_cache_ra(&ractl, nr_to_read); } unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); int truncate_inode_folio(struct address_space *mapping, struct folio *folio); bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end); long mapping_evict_folio(struct address_space *mapping, struct folio *folio); unsigned long mapping_try_invalidate(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_failed); /** * folio_evictable - Test whether a folio is evictable. * @folio: The folio to test. * * Test whether @folio is evictable -- i.e., should be placed on * active/inactive lists vs unevictable list. * * Reasons folio might not be evictable: * 1. folio's mapping marked unevictable * 2. One of the pages in the folio is part of an mlocked VMA */ static inline bool folio_evictable(struct folio *folio) { bool ret; /* Prevent address_space of inode and swap cache from being freed */ rcu_read_lock(); ret = !mapping_unevictable(folio_mapping(folio)) && !folio_test_mlocked(folio); rcu_read_unlock(); return ret; } /* * Turn a non-refcounted page (->_refcount == 0) into refcounted with * a count of one. */ static inline void set_page_refcounted(struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(page_ref_count(page), page); set_page_count(page, 1); } static inline void set_pages_refcounted(struct page *page, unsigned long nr_pages) { unsigned long pfn = page_to_pfn(page); for (; nr_pages--; pfn++) set_page_refcounted(pfn_to_page(pfn)); } /* * Return true if a folio needs ->release_folio() calling upon it. */ static inline bool folio_needs_release(struct folio *folio) { struct address_space *mapping = folio_mapping(folio); return folio_has_private(folio) || (mapping && mapping_release_always(mapping)); } extern unsigned long highest_memmap_pfn; /* * Maximum number of reclaim retries without progress before the OOM * killer is consider the only way forward. 
*/ #define MAX_RECLAIM_RETRIES 16 /* * in mm/vmscan.c: */ bool folio_isolate_lru(struct folio *folio); void folio_putback_lru(struct folio *folio); extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); int user_proactive_reclaim(char *buf, struct mem_cgroup *memcg, pg_data_t *pgdat); /* * in mm/rmap.c: */ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); /* * in mm/khugepaged.c */ void set_recommended_min_free_kbytes(void); /* * in mm/page_alloc.c */ #define K(x) ((x) << (PAGE_SHIFT-10)) extern char * const zone_names[MAX_NR_ZONES]; /* perform sanity checks on struct pages being allocated or freed */ DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled); extern int min_free_kbytes; extern int defrag_mode; void setup_per_zone_wmarks(void); void calculate_min_free_kbytes(void); int __meminit init_per_zone_wmark_min(void); void page_alloc_sysctl_init(void); /* * Structure for holding the mostly immutable allocation parameters passed * between functions involved in allocations, including the alloc_pages* * family of functions. * * nodemask, migratetype and highest_zoneidx are initialized only once in * __alloc_pages() and then never change. * * zonelist, preferred_zone and highest_zoneidx are set first in * __alloc_pages() for the fast path, and might be later changed * in __alloc_pages_slowpath(). All other functions pass the whole structure * by a const pointer. */ struct alloc_context { struct zonelist *zonelist; nodemask_t *nodemask; struct zoneref *preferred_zoneref; int migratetype; /* * highest_zoneidx represents highest usable zone index of * the allocation request. Due to the nature of the zone, * memory on lower zone than the highest_zoneidx will be * protected by lowmem_reserve[highest_zoneidx]. * * highest_zoneidx is also used by reclaim/compaction to limit * the target zone since higher zone than this index cannot be * usable for this allocation request. */ enum zone_type highest_zoneidx; bool spread_dirty_pages; }; /* * This function returns the order of a free page in the buddy system. In * general, page_zone(page)->lock must be held by the caller to prevent the * page from being allocated in parallel and returning garbage as the order. * If a caller does not hold page_zone(page)->lock, it must guarantee that the * page cannot be allocated or merged in parallel. Alternatively, it must * handle invalid values gracefully, and use buddy_order_unsafe() below. */ static inline unsigned int buddy_order(struct page *page) { /* PageBuddy() must be checked by the caller */ return page_private(page); } /* * Like buddy_order(), but for callers who cannot afford to hold the zone lock. * PageBuddy() should be checked first by the caller to minimize race window, * and invalid values must be handled gracefully. * * READ_ONCE is used so that if the caller assigns the result into a local * variable and e.g. tests it for valid range before using, the compiler cannot * decide to remove the variable and inline the page_private(page) multiple * times, potentially observing different values in the tests and the actual * use of the result. */ #define buddy_order_unsafe(page) READ_ONCE(page_private(page)) /* * This function checks whether a page is free && is the buddy * we can coalesce a page and its buddy if * (a) the buddy is not in a hole (check before calling!) && * (b) the buddy is in the buddy system && * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. 
* * For recording whether a page is in the buddy system, we set PageBuddy. * Setting, clearing, and testing PageBuddy is serialized by zone->lock. * * For recording page's order, we use page_private(page). */ static inline bool page_is_buddy(struct page *page, struct page *buddy, unsigned int order) { if (!page_is_guard(buddy) && !PageBuddy(buddy)) return false; if (buddy_order(buddy) != order) return false; /* * zone check is done late to avoid uselessly calculating * zone/node ids for pages that could never merge. */ if (page_zone_id(page) != page_zone_id(buddy)) return false; VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); return true; } /* * Locate the struct page for both the matching buddy in our * pair (buddy1) and the combined O(n+1) page they form (page). * * 1) Any buddy B1 will have an order O twin B2 which satisfies * the following equation: * B2 = B1 ^ (1 << O) * For example, if the starting buddy (buddy2) is #8 its order * 1 buddy is #10: * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 * * 2) Any buddy B will have an order O+1 parent P which * satisfies the following equation: * P = B & ~(1 << O) * * Assumption: *_mem_map is contiguous at least up to MAX_PAGE_ORDER */ static inline unsigned long __find_buddy_pfn(unsigned long page_pfn, unsigned int order) { return page_pfn ^ (1 << order); } /* * Find the buddy of @page and validate it. * @page: The input page * @pfn: The pfn of the page, it saves a call to page_to_pfn() when the * function is used in the performance-critical __free_one_page(). * @order: The order of the page * @buddy_pfn: The output pointer to the buddy pfn, it also saves a call to * page_to_pfn(). * * The found buddy can be a non PageBuddy, out of @page's zone, or its order is * not the same as @page. The validation is necessary before use it. * * Return: the found buddy page or NULL if not found. */ static inline struct page *find_buddy_page_pfn(struct page *page, unsigned long pfn, unsigned int order, unsigned long *buddy_pfn) { unsigned long __buddy_pfn = __find_buddy_pfn(pfn, order); struct page *buddy; buddy = page + (__buddy_pfn - pfn); if (buddy_pfn) *buddy_pfn = __buddy_pfn; if (page_is_buddy(page, buddy, order)) return buddy; return NULL; } extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn, unsigned long end_pfn, struct zone *zone); static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, unsigned long end_pfn, struct zone *zone) { if (zone->contiguous) return pfn_to_page(start_pfn); return __pageblock_pfn_to_page(start_pfn, end_pfn, zone); } void set_zone_contiguous(struct zone *zone); bool pfn_range_intersects_zones(int nid, unsigned long start_pfn, unsigned long nr_pages); static inline void clear_zone_contiguous(struct zone *zone) { zone->contiguous = false; } extern int __isolate_free_page(struct page *page, unsigned int order); extern void __putback_isolated_page(struct page *page, unsigned int order, int mt); extern void memblock_free_pages(unsigned long pfn, unsigned int order); extern void __free_pages_core(struct page *page, unsigned int order, enum meminit_context context); /* * This will have no effect, other than possibly generating a warning, if the * caller passes in a non-large folio. 
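 *
 * Illustrative example (not part of the original comment): for a PMD-sized
 * folio with 4K base pages, folio_set_order(folio, 9) stores 9 in the low
 * byte of folio->_flags_1 and, when NR_PAGES_IN_LARGE_FOLIO is defined,
 * sets folio->_nr_pages to 1 << 9 = 512.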
*/ static inline void folio_set_order(struct folio *folio, unsigned int order) { if (WARN_ON_ONCE(!order || !folio_test_large(folio))) return; VM_WARN_ON_ONCE(order > MAX_FOLIO_ORDER); folio->_flags_1 = (folio->_flags_1 & ~0xffUL) | order; #ifdef NR_PAGES_IN_LARGE_FOLIO folio->_nr_pages = 1U << order; #endif } bool __folio_unqueue_deferred_split(struct folio *folio); static inline bool folio_unqueue_deferred_split(struct folio *folio) { if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio)) return false; /* * At this point, there is no one trying to add the folio to * deferred_list. If folio is not in deferred_list, it's safe * to check without acquiring the split_queue_lock. */ if (data_race(list_empty(&folio->_deferred_list))) return false; return __folio_unqueue_deferred_split(folio); } static inline struct folio *page_rmappable_folio(struct page *page) { struct folio *folio = (struct folio *)page; if (folio && folio_test_large(folio)) folio_set_large_rmappable(folio); return folio; } static inline void prep_compound_head(struct page *page, unsigned int order) { struct folio *folio = (struct folio *)page; folio_set_order(folio, order); atomic_set(&folio->_large_mapcount, -1); if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) atomic_set(&folio->_nr_pages_mapped, 0); if (IS_ENABLED(CONFIG_MM_ID)) { folio->_mm_ids = 0; folio->_mm_id_mapcount[0] = -1; folio->_mm_id_mapcount[1] = -1; } if (IS_ENABLED(CONFIG_64BIT) || order > 1) { atomic_set(&folio->_pincount, 0); atomic_set(&folio->_entire_mapcount, -1); } if (order > 1) INIT_LIST_HEAD(&folio->_deferred_list); } static inline void prep_compound_tail(struct page *tail, const struct page *head, unsigned int order) { tail->mapping = TAIL_MAPPING; set_compound_head(tail, head, order); set_page_private(tail, 0); } static inline void init_compound_tail(struct page *tail, const struct page *head, unsigned int order, struct zone *zone) { atomic_set(&tail->_mapcount, -1); set_page_node(tail, zone_to_nid(zone)); set_page_zone(tail, zone_idx(zone)); prep_compound_tail(tail, head, order); } void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern bool free_pages_prepare(struct page *page, unsigned int order); extern int user_min_free_kbytes; struct page *__alloc_frozen_pages_noprof(gfp_t, unsigned int order, int nid, nodemask_t *); #define __alloc_frozen_pages(...) \ alloc_hooks(__alloc_frozen_pages_noprof(__VA_ARGS__)) void free_frozen_pages(struct page *page, unsigned int order); void free_unref_folios(struct folio_batch *fbatch); #ifdef CONFIG_NUMA struct page *alloc_frozen_pages_noprof(gfp_t, unsigned int order); #else static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order) { return __alloc_frozen_pages_noprof(gfp, order, numa_node_id(), NULL); } #endif #define alloc_frozen_pages(...) \ alloc_hooks(alloc_frozen_pages_noprof(__VA_ARGS__)) struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order); #define alloc_frozen_pages_nolock(...) 
\ alloc_hooks(alloc_frozen_pages_nolock_noprof(__VA_ARGS__)) void free_frozen_pages_nolock(struct page *page, unsigned int order); extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone); extern void zone_pcp_init(struct zone *zone); extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, int nid, bool exact_nid); void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int, bool); /* * mm/sparse.c */ #ifdef CONFIG_SPARSEMEM void sparse_init(void); int sparse_index_init(unsigned long section_nr, int nid); static inline void sparse_init_one_section(struct mem_section *ms, unsigned long pnum, struct page *mem_map, struct mem_section_usage *usage, unsigned long flags) { unsigned long coded_mem_map; BUILD_BUG_ON(SECTION_MAP_LAST_BIT > PFN_SECTION_SHIFT); /* * We encode the start PFN of the section into the mem_map such that * page_to_pfn() on !CONFIG_SPARSEMEM_VMEMMAP can simply subtract it * from the page pointer to obtain the PFN. */ coded_mem_map = (unsigned long)(mem_map - section_nr_to_pfn(pnum)); VM_WARN_ON_ONCE(coded_mem_map & ~SECTION_MAP_MASK); ms->section_mem_map &= ~SECTION_MAP_MASK; ms->section_mem_map |= coded_mem_map; ms->section_mem_map |= flags | SECTION_HAS_MEM_MAP; ms->usage = usage; } static inline void __section_mark_present(struct mem_section *ms, unsigned long section_nr) { if (section_nr > __highest_present_section_nr) __highest_present_section_nr = section_nr; ms->section_mem_map |= SECTION_MARKED_PRESENT; } #else static inline void sparse_init(void) {} #endif /* CONFIG_SPARSEMEM */ /* * mm/sparse-vmemmap.c */ #ifdef CONFIG_SPARSEMEM_VMEMMAP void sparse_init_subsection_map(unsigned long pfn, unsigned long nr_pages); #else static inline void sparse_init_subsection_map(unsigned long pfn, unsigned long nr_pages) { } #endif /* CONFIG_SPARSEMEM_VMEMMAP */ #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* * in mm/compaction.c */ /* * compact_control is used to track pages being migrated and the free pages * they are being migrated to during memory compaction. The free_pfn starts * at the end of a zone and migrate_pfn begins at the start. Movable pages * are moved to the end of a zone during a compaction run and the run * completes when free_pfn <= migrate_pfn */ struct compact_control { struct list_head freepages[NR_PAGE_ORDERS]; /* List of free pages to migrate to */ struct list_head migratepages; /* List of pages being migrated */ unsigned int nr_freepages; /* Number of isolated free pages */ unsigned int nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ /* * Acts as an in/out parameter to page isolation for migration. * isolate_migratepages uses it as a search base. * isolate_migratepages_block will update the value to the next pfn * after the last isolated one. 
*/ unsigned long migrate_pfn; unsigned long fast_start_pfn; /* a pfn to start linear scan from */ struct zone *zone; unsigned long total_migrate_scanned; unsigned long total_free_scanned; unsigned short fast_search_fail;/* failures to use free list searches */ short search_order; /* order to start a fast search at */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ int order; /* order a direct compactor needs */ int migratetype; /* migratetype of direct compactor */ const unsigned int alloc_flags; /* alloc flags of a direct compactor */ const int highest_zoneidx; /* zone index of a direct compactor */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool no_set_skip_hint; /* Don't mark blocks for skipping */ bool ignore_block_suitable; /* Scan blocks considered unsuitable */ bool direct_compaction; /* False from kcompactd or /proc/... */ bool proactive_compaction; /* kcompactd proactive compaction */ bool whole_zone; /* Whole zone should/has been scanned */ bool contended; /* Signal lock contention */ bool finish_pageblock; /* Scan the remainder of a pageblock. Used * when there are potentially transient * isolation or migration failures to * ensure forward progress. */ bool alloc_contig; /* alloc_contig_range allocation */ }; /* * Used in direct compaction when a page should be taken from the freelists * immediately when one is created during the free path. */ struct capture_control { struct compact_control *cc; struct page *page; }; unsigned long isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn); int isolate_migratepages_range(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void init_cma_reserved_pageblock(struct page *page); #endif /* CONFIG_COMPACTION || CONFIG_CMA */ struct cma; #ifdef CONFIG_CMA bool cma_validate_zones(struct cma *cma); void *cma_reserve_early(struct cma *cma, unsigned long size); void init_cma_pageblock(struct page *page); #else static inline bool cma_validate_zones(struct cma *cma) { return false; } static inline void *cma_reserve_early(struct cma *cma, unsigned long size) { return NULL; } static inline void init_cma_pageblock(struct page *page) { } #endif int find_suitable_fallback(struct free_area *area, unsigned int order, int migratetype, bool claimable); static inline bool free_area_empty(struct free_area *area, int migratetype) { return list_empty(&area->free_list[migratetype]); } /* mm/util.c */ struct anon_vma *folio_anon_vma(const struct folio *folio); #ifdef CONFIG_MMU void unmap_mapping_folio(struct folio *folio); extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked); extern long faultin_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, bool write, int *locked); bool mlock_future_ok(const struct mm_struct *mm, bool is_vma_locked, unsigned long bytes); /* * NOTE: This function can't tell whether the folio is "fully mapped" in the * range. * "fully mapped" means all the pages of folio is associated with the page * table of range while this function just check whether the folio range is * within the range [start, end). Function caller needs to do page table * check if it cares about the page table association. 
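 *
 * An illustrative caller-side sketch (the walker is hypothetical and the
 * page table walk is elided) of the typical usage described next:
 *
 *	if (!folio_within_range(folio, vma, start, end))
 *		return false;
 *	(then walk the page tables to confirm the folio is fully mapped)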
* * Typical usage (like mlock or madvise) is: * Caller knows at least 1 page of folio is associated with page table of VMA * and the range [start, end) is intersect with the VMA range. Caller wants * to know whether the folio is fully associated with the range. It calls * this function to check whether the folio is in the range first. Then checks * the page table to know whether the folio is fully mapped to the range. */ static inline bool folio_within_range(struct folio *folio, struct vm_area_struct *vma, unsigned long start, unsigned long end) { pgoff_t pgoff, addr; unsigned long vma_pglen = vma_pages(vma); VM_WARN_ON_FOLIO(folio_test_ksm(folio), folio); if (start > end) return false; if (start < vma->vm_start) start = vma->vm_start; if (end > vma->vm_end) end = vma->vm_end; pgoff = folio_pgoff(folio); /* if folio start address is not in vma range */ if (!in_range(pgoff, vma->vm_pgoff, vma_pglen)) return false; addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); return !(addr < start || end - addr < folio_size(folio)); } static inline bool folio_within_vma(struct folio *folio, struct vm_area_struct *vma) { return folio_within_range(folio, vma, vma->vm_start, vma->vm_end); } /* * mlock_vma_folio() and munlock_vma_folio(): * should be called with vma's mmap_lock held for read or write, * under page table lock for the pte/pmd being added or removed. * * mlock is usually called at the end of folio_add_*_rmap_*(), munlock at * the end of folio_remove_rmap_*(); but new anon folios are managed by * folio_add_lru_vma() calling mlock_new_folio(). */ void mlock_folio(struct folio *folio); static inline void mlock_vma_folio(struct folio *folio, struct vm_area_struct *vma) { /* * The VM_SPECIAL check here serves two purposes. * 1) VM_IO check prevents migration from double-counting during mlock. * 2) Although mmap_region() and mlock_fixup() take care that VM_LOCKED * is never left set on a VM_SPECIAL vma, there is an interval while * file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may * still be set while VM_SPECIAL bits are added: so ignore it then. */ if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED)) mlock_folio(folio); } void munlock_folio(struct folio *folio); static inline void munlock_vma_folio(struct folio *folio, struct vm_area_struct *vma) { /* * munlock if the function is called. Ideally, we should only * do munlock if any page of folio is unmapped from VMA and * cause folio not fully mapped to VMA. * * But it's not easy to confirm that's the situation. So we * always munlock the folio and page reclaim will correct it * if it's wrong. */ if (unlikely(vma->vm_flags & VM_LOCKED)) munlock_folio(folio); } void mlock_new_folio(struct folio *folio); bool need_mlock_drain(int cpu); void mlock_drain_local(void); void mlock_drain_remote(int cpu); extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); /** * vma_address - Find the virtual address a page range is mapped at * @vma: The vma which maps this object. * @pgoff: The page offset within its object. * @nr_pages: The number of pages to consider. * * If any page in this range is mapped by this VMA, return the first address * where any of these pages appear. Otherwise, return -EFAULT. */ static inline unsigned long vma_address(const struct vm_area_struct *vma, pgoff_t pgoff, unsigned long nr_pages) { unsigned long address; if (pgoff >= vma->vm_pgoff) { address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); /* Check for address beyond vma (or wrapped through 0?) 
*/ if (address < vma->vm_start || address >= vma->vm_end) address = -EFAULT; } else if (pgoff + nr_pages - 1 >= vma->vm_pgoff) { /* Test above avoids possibility of wrap to 0 on 32-bit */ address = vma->vm_start; } else { address = -EFAULT; } return address; } /* * Then at what user virtual address will none of the range be found in vma? * Assumes that vma_address() already returned a good starting address. */ static inline unsigned long vma_address_end(struct page_vma_mapped_walk *pvmw) { struct vm_area_struct *vma = pvmw->vma; pgoff_t pgoff; unsigned long address; /* Common case, plus ->pgoff is invalid for KSM */ if (pvmw->nr_pages == 1) return pvmw->address + PAGE_SIZE; pgoff = pvmw->pgoff + pvmw->nr_pages; address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); /* Check for address beyond vma (or wrapped through 0?) */ if (address < vma->vm_start || address > vma->vm_end) address = vma->vm_end; return address; } static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, struct file *fpin) { int flags = vmf->flags; if (fpin) return fpin; /* * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or * anything, so we only pin the file and drop the mmap_lock if only * FAULT_FLAG_ALLOW_RETRY is set, while this is the first attempt. */ if (fault_flag_allow_retry_first(flags) && !(flags & FAULT_FLAG_RETRY_NOWAIT)) { fpin = get_file(vmf->vma->vm_file); release_fault_lock(vmf); } return fpin; } static inline bool vma_supports_mlock(const struct vm_area_struct *vma) { if (vma_test_any_mask(vma, VMA_SPECIAL_FLAGS)) return false; if (vma_test_single_mask(vma, VMA_DROPPABLE)) return false; if (vma_is_dax(vma) || is_vm_hugetlb_page(vma)) return false; return vma != get_gate_vma(current->mm); } #else /* !CONFIG_MMU */ static inline void unmap_mapping_folio(struct folio *folio) { } static inline void mlock_new_folio(struct folio *folio) { } static inline bool need_mlock_drain(int cpu) { return false; } static inline void mlock_drain_local(void) { } static inline void mlock_drain_remote(int cpu) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) { } #endif /* !CONFIG_MMU */ /* Memory initialisation debug and verification */ #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT DECLARE_STATIC_KEY_TRUE(deferred_pages); static inline bool deferred_pages_enabled(void) { return static_branch_unlikely(&deferred_pages); } bool __init deferred_grow_zone(struct zone *zone, unsigned int order); #else static inline bool deferred_pages_enabled(void) { return false; } #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ void init_deferred_page(unsigned long pfn, int nid); enum mminit_level { MMINIT_WARNING, MMINIT_VERIFY, MMINIT_TRACE }; #ifdef CONFIG_DEBUG_MEMORY_INIT extern int mminit_loglevel; #define mminit_dprintk(level, prefix, fmt, arg...) \ do { \ if (level < mminit_loglevel) { \ if (level <= MMINIT_WARNING) \ pr_warn("mminit::" prefix " " fmt, ##arg); \ else \ printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \ } \ } while (0) extern void mminit_verify_pageflags_layout(void); extern void mminit_verify_zonelist(void); #else static inline void mminit_dprintk(enum mminit_level level, const char *prefix, const char *fmt, ...) 
{ } static inline void mminit_verify_pageflags_layout(void) { } static inline void mminit_verify_zonelist(void) { } #endif /* CONFIG_DEBUG_MEMORY_INIT */ #define NODE_RECLAIM_NOSCAN -2 #define NODE_RECLAIM_FULL -1 #define NODE_RECLAIM_SOME 0 #define NODE_RECLAIM_SUCCESS 1 #ifdef CONFIG_NUMA extern int node_reclaim_mode; extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int); extern int find_next_best_node(int node, nodemask_t *used_node_mask); #else #define node_reclaim_mode 0 static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask, unsigned int order) { return NODE_RECLAIM_NOSCAN; } static inline int find_next_best_node(int node, nodemask_t *used_node_mask) { return NUMA_NO_NODE; } #endif static inline bool node_reclaim_enabled(void) { /* Is any node_reclaim_mode bit set? */ return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP); } /* * mm/memory-failure.c */ #ifdef CONFIG_MEMORY_FAILURE int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill); void shake_folio(struct folio *folio); typedef int hwpoison_filter_func_t(struct page *p); void hwpoison_filter_register(hwpoison_filter_func_t *filter); void hwpoison_filter_unregister(void); #define MAGIC_HWPOISON 0x48575053U /* HWPS */ void SetPageHWPoisonTakenOff(struct page *page); void ClearPageHWPoisonTakenOff(struct page *page); bool take_page_off_buddy(struct page *page); bool put_page_back_buddy(struct page *page); struct task_struct *task_early_kill(struct task_struct *tsk, int force_early); void add_to_kill_ksm(struct task_struct *tsk, const struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, unsigned long ksm_addr); unsigned long page_mapped_in_vma(const struct page *page, struct vm_area_struct *vma); #else static inline int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill) { return -EBUSY; } #endif extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); extern void set_pageblock_order(void); unsigned long reclaim_pages(struct list_head *folio_list); unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *folio_list); /* The ALLOC_WMARK bits are used as an index to zone->watermark */ #define ALLOC_WMARK_MIN WMARK_MIN #define ALLOC_WMARK_LOW WMARK_LOW #define ALLOC_WMARK_HIGH WMARK_HIGH #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ /* Mask to get the watermark bits */ #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) /* * Only MMU archs have async oom victim reclaim - aka oom_reaper so we * cannot assume a reduced access to memory reserves is sufficient for * !MMU */ #ifdef CONFIG_MMU #define ALLOC_OOM 0x08 #else #define ALLOC_OOM ALLOC_NO_WATERMARKS #endif #define ALLOC_NON_BLOCK 0x10 /* Caller cannot block. Allow access * to 25% of the min watermark or * 62.5% if __GFP_HIGH is set. */ #define ALLOC_MIN_RESERVE 0x20 /* __GFP_HIGH set. Allow access to 50% * of the min watermark. 
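 * (Illustrative: a zone whose min watermark is 128 pages is treated
 * as if the watermark were 64 pages for such an allocation, letting
 * __GFP_HIGH callers dip that far into the reserve.)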
*/ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ #ifdef CONFIG_ZONE_DMA32 #define ALLOC_NOFRAGMENT 0x100 /* avoid mixing pageblock types */ #else #define ALLOC_NOFRAGMENT 0x0 #endif #define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */ #define ALLOC_TRYLOCK 0x400 /* Only use spin_trylock in allocation path */ #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ /* Flags that allow allocations below the min watermark. */ #define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM) enum ttu_flags; struct tlbflush_unmap_batch; /* * only for MM internal work items which do not depend on * any allocations or locks which might depend on allocations */ extern struct workqueue_struct *mm_percpu_wq; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH void try_to_unmap_flush(void); void try_to_unmap_flush_dirty(void); void flush_tlb_batched_pending(struct mm_struct *mm); #else static inline void try_to_unmap_flush(void) { } static inline void try_to_unmap_flush_dirty(void) { } static inline void flush_tlb_batched_pending(struct mm_struct *mm) { } #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ extern const struct trace_print_flags pageflag_names[]; extern const struct trace_print_flags vmaflag_names[]; extern const struct trace_print_flags gfpflag_names[]; void setup_zone_pageset(struct zone *zone); struct migration_target_control { int nid; /* preferred node id */ nodemask_t *nmask; gfp_t gfp_mask; enum migrate_reason reason; }; /* * mm/filemap.c */ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, struct folio *folio, loff_t fpos, size_t size); /* * mm/vmalloc.c */ #ifdef CONFIG_MMU void __init vmalloc_init(void); int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift, gfp_t gfp_mask); unsigned int get_vm_area_page_order(struct vm_struct *vm); #else static inline void vmalloc_init(void) { } static inline int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift, gfp_t gfp_mask) { return -EINVAL; } #endif void clear_vm_uninitialized_flag(struct vm_struct *vm); int __must_check __vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift); void vunmap_range_noflush(unsigned long start, unsigned long end); void __vunmap_range_noflush(unsigned long start, unsigned long end); static inline bool vma_is_single_threaded_private(struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHARED) return false; return atomic_read(&vma->vm_mm->mm_users) == 1; } #ifdef CONFIG_NUMA_BALANCING bool folio_can_map_prot_numa(struct folio *folio, struct vm_area_struct *vma, bool is_private_single_threaded); #else static inline bool folio_can_map_prot_numa(struct folio *folio, struct vm_area_struct *vma, bool is_private_single_threaded) { return false; } #endif int numa_migrate_check(struct folio *folio, struct vm_fault *vmf, unsigned long addr, int *flags, bool writable, int *last_cpupid); void free_zone_device_folio(struct folio *folio); int migrate_device_coherent_folio(struct folio *folio); struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long shift, unsigned long vm_flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, const void *caller); /* * mm/gup.c */ int __must_check 
try_grab_folio(struct folio *folio, int refs, unsigned int flags); /* * mm/huge_memory.c */ void touch_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, bool write); bool touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, bool write); /* * Parses a string with mem suffixes into its order. Useful to parse kernel * parameters. */ static inline int get_order_from_str(const char *size_str, unsigned long valid_orders) { unsigned long size; char *endptr; int order; size = memparse(size_str, &endptr); if (!is_power_of_2(size)) return -EINVAL; order = get_order(size); if (BIT(order) & ~valid_orders) return -EINVAL; return order; } enum { /* mark page accessed */ FOLL_TOUCH = 1 << 16, /* a retry, previous pass started an IO */ FOLL_TRIED = 1 << 17, /* we are working on non-current tsk/mm */ FOLL_REMOTE = 1 << 18, /* pages must be released via unpin_user_page */ FOLL_PIN = 1 << 19, /* gup_fast: prevent fall-back to slow gup */ FOLL_FAST_ONLY = 1 << 20, /* allow unlocking the mmap lock */ FOLL_UNLOCKABLE = 1 << 21, /* VMA lookup+checks compatible with MADV_POPULATE_(READ|WRITE) */ FOLL_MADV_POPULATE = 1 << 22, }; #define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \ FOLL_FAST_ONLY | FOLL_UNLOCKABLE | \ FOLL_MADV_POPULATE) /* * Indicates for which pages that are write-protected in the page table, * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the * GUP pin will remain consistent with the pages mapped into the page tables * of the MM. * * Temporary unmapping of PageAnonExclusive() pages or clearing of * PageAnonExclusive() has to protect against concurrent GUP: * * Ordinary GUP: Using the PT lock * * GUP-fast and fork(): mm->write_protect_seq * * GUP-fast and KSM or temporary unmapping (swap, migration): see * folio_try_share_anon_rmap_*() * * Must be called with the (sub)page that's actually referenced via the * page table entry, which might not necessarily be the head page for a * PTE-mapped THP. * * If the vma is NULL, we're coming from the GUP-fast path and might have * to fallback to the slow path just to lookup the vma. */ static inline bool gup_must_unshare(struct vm_area_struct *vma, unsigned int flags, struct page *page) { /* * FOLL_WRITE is implicitly handled correctly as the page table entry * has to be writable -- and if it references (part of) an anonymous * folio, that part is required to be marked exclusive. */ if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN) return false; /* * Note: PageAnon(page) is stable until the page is actually getting * freed. */ if (!PageAnon(page)) { /* * We only care about R/O long-term pining: R/O short-term * pinning does not have the semantics to observe successive * changes through the process page tables. */ if (!(flags & FOLL_LONGTERM)) return false; /* We really need the vma ... */ if (!vma) return true; /* * ... because we only care about writable private ("COW") * mappings where we have to break COW early. */ return is_cow_mapping(vma->vm_flags); } /* Paired with a memory barrier in folio_try_share_anon_rmap_*(). */ if (IS_ENABLED(CONFIG_HAVE_GUP_FAST)) smp_rmb(); /* * Note that KSM pages cannot be exclusive, and consequently, * cannot get pinned. 
*/ return !PageAnonExclusive(page); } extern bool mirrored_kernelcore; bool memblock_has_mirror(void); void memblock_free_all(void); static __always_inline void vma_set_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff) { vma->vm_start = start; vma->vm_end = end; vma->vm_pgoff = pgoff; } static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) { /* * NOTE: we must check this before VM_SOFTDIRTY on soft-dirty * enablements, because when without soft-dirty being compiled in, * VM_SOFTDIRTY is defined as 0x0, then !(vm_flags & VM_SOFTDIRTY) * will be constantly true. */ if (!pgtable_supports_soft_dirty()) return false; /* * Soft-dirty is kind of special: its tracking is enabled when the * vma flags not set. */ return !(vma->vm_flags & VM_SOFTDIRTY); } static inline bool pmd_needs_soft_dirty_wp(struct vm_area_struct *vma, pmd_t pmd) { return vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd); } static inline bool pte_needs_soft_dirty_wp(struct vm_area_struct *vma, pte_t pte) { return vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte); } void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long zone, int nid); void __meminit __init_page_from_nid(unsigned long pfn, int nid); /* shrinker related functions */ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); int shmem_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t index, void *expected, gfp_t gfp); int shmem_inode_acct_blocks(struct inode *inode, long pages); bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped); #ifdef CONFIG_SHRINKER_DEBUG static inline __printf(2, 0) int shrinker_debugfs_name_alloc( struct shrinker *shrinker, const char *fmt, va_list ap) { shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); return shrinker->name ? 
0 : -ENOMEM; } static inline void shrinker_debugfs_name_free(struct shrinker *shrinker) { kfree_const(shrinker->name); shrinker->name = NULL; } extern int shrinker_debugfs_add(struct shrinker *shrinker); extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, int *debugfs_id); extern void shrinker_debugfs_remove(struct dentry *debugfs_entry, int debugfs_id); #else /* CONFIG_SHRINKER_DEBUG */ static inline int shrinker_debugfs_add(struct shrinker *shrinker) { return 0; } static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker, const char *fmt, va_list ap) { return 0; } static inline void shrinker_debugfs_name_free(struct shrinker *shrinker) { } static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, int *debugfs_id) { *debugfs_id = -1; return NULL; } static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, int debugfs_id) { } #endif /* CONFIG_SHRINKER_DEBUG */ /* Only track the nodes of mappings with shadow entries */ void workingset_update_node(struct xa_node *node); extern struct list_lru shadow_nodes; #define mapping_set_update(xas, mapping) do { \ if (!dax_mapping(mapping) && !shmem_mapping(mapping)) { \ xas_set_update(xas, workingset_update_node); \ xas_set_lru(xas, &shadow_nodes); \ } \ } while (0) /* mremap.c */ unsigned long move_page_tables(struct pagetable_move_control *pmc); #ifdef CONFIG_UNACCEPTED_MEMORY void accept_page(struct page *page); #else /* CONFIG_UNACCEPTED_MEMORY */ static inline void accept_page(struct page *page) { } #endif /* CONFIG_UNACCEPTED_MEMORY */ /* pagewalk.c */ int walk_page_range_mm_unsafe(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); int walk_page_range_vma_unsafe(struct vm_area_struct *vma, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); int walk_page_range_debug(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private); void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm); int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm); int remap_pfn_range_prepare(struct vm_area_desc *desc); int remap_pfn_range_complete(struct vm_area_struct *vma, struct mmap_action *action); int simple_ioremap_prepare(struct vm_area_desc *desc); static inline int io_remap_pfn_range_prepare(struct vm_area_desc *desc) { struct mmap_action *action = &desc->action; const unsigned long orig_pfn = action->remap.start_pfn; const pgprot_t orig_pgprot = action->remap.pgprot; const unsigned long size = action->remap.size; const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size); int err; action->remap.start_pfn = pfn; action->remap.pgprot = pgprot_decrypted(orig_pgprot); err = remap_pfn_range_prepare(desc); if (err) return err; /* Remap does the actual work. */ action->type = MMAP_REMAP_PFN; return 0; } /* * When we succeed an mmap action or just before we unmap a VMA on error, we * need to ensure any rmap lock held is released. On unmap it's required to * avoid a deadlock. 
*/ static inline void maybe_rmap_unlock_action(struct vm_area_struct *vma, struct mmap_action *action) { struct file *file; if (!action->hide_from_rmap_until_complete) return; VM_WARN_ON_ONCE(vma_is_anonymous(vma)); file = vma->vm_file; i_mmap_unlock_write(file->f_mapping); action->hide_from_rmap_until_complete = false; } #ifdef CONFIG_MMU_NOTIFIER static inline bool clear_flush_young_ptes_notify(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { bool young; young = clear_flush_young_ptes(vma, addr, ptep, nr); young |= mmu_notifier_clear_flush_young(vma->vm_mm, addr, addr + nr * PAGE_SIZE); return young; } static inline bool pmdp_clear_flush_young_notify(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { bool young; young = pmdp_clear_flush_young(vma, addr, pmdp); young |= mmu_notifier_clear_flush_young(vma->vm_mm, addr, addr + PMD_SIZE); return young; } static inline bool test_and_clear_young_ptes_notify(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { bool young; young = test_and_clear_young_ptes(vma, addr, ptep, nr); young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + nr * PAGE_SIZE); return young; } static inline bool pmdp_test_and_clear_young_notify(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { bool young; young = pmdp_test_and_clear_young(vma, addr, pmdp); young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PMD_SIZE); return young; } #else /* CONFIG_MMU_NOTIFIER */ #define clear_flush_young_ptes_notify clear_flush_young_ptes #define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define test_and_clear_young_ptes_notify test_and_clear_young_ptes #define pmdp_test_and_clear_young_notify pmdp_test_and_clear_young #endif /* CONFIG_MMU_NOTIFIER */ extern int sysctl_max_map_count; static inline int get_sysctl_max_map_count(void) { return READ_ONCE(sysctl_max_map_count); } bool may_expand_vm(struct mm_struct *mm, const vma_flags_t *vma_flags, unsigned long npages); #endif /* __MM_INTERNAL_H */ |
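/*
 * Illustrative usage sketch (not part of mm/internal.h): the pattern that
 * fault handlers such as filemap_fault() follow around
 * maybe_unlock_mmap_for_io() above.  The handler below is hypothetical and
 * assumes <linux/file.h> for fput(); only the helper's contract, pinning the
 * file and dropping the fault lock when a retry is permitted, comes from the
 * header.
 */
static vm_fault_t example_fault_with_io(struct vm_fault *vmf)
{
	struct file *fpin = NULL;

	/* May pin vmf->vma->vm_file and drop the mmap/VMA lock. */
	fpin = maybe_unlock_mmap_for_io(vmf, fpin);

	/* ... kick off readahead or other IO that may sleep ... */

	if (fpin) {
		/* The lock was dropped: ask the caller to retry the fault. */
		fput(fpin);
		return VM_FAULT_RETRY;
	}

	/* Lock still held: continue with normal fault handling (elided). */
	return 0;
}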
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef LINUX_RESUME_USER_MODE_H
#define LINUX_RESUME_USER_MODE_H

#include <linux/sched.h>
#include <linux/task_work.h>
#include <linux/memcontrol.h>
#include <linux/rseq.h>
#include <linux/blk-cgroup.h>

/**
 * set_notify_resume - cause resume_user_mode_work() to be called
 * @task: task that will call resume_user_mode_work()
 *
 * Calling this arranges that @task will call resume_user_mode_work()
 * before returning to user mode. If it's already running in user mode,
 * it will enter the kernel and call resume_user_mode_work() soon.
 * If it's blocked, it will not be woken.
 */
static inline void set_notify_resume(struct task_struct *task)
{
	if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_RESUME))
		kick_process(task);
}

/**
 * resume_user_mode_work - Perform work before returning to user mode
 * @regs: user-mode registers of @current task
 *
 * This is called when %TIF_NOTIFY_RESUME has been set. Now we are
 * about to return to user mode, and the user state in @regs can be
 * inspected or adjusted. The caller in arch code has cleared
 * %TIF_NOTIFY_RESUME before the call. If the flag gets set again
 * asynchronously, this will be called again before we return to
 * user mode.
 *
 * Called without locks.
 */
static inline void resume_user_mode_work(struct pt_regs *regs)
{
	clear_thread_flag(TIF_NOTIFY_RESUME);
	/*
	 * This barrier pairs with task_work_add()->set_notify_resume() after
	 * hlist_add_head(task->task_works);
	 */
	smp_mb__after_atomic();
	if (unlikely(task_work_pending(current)))
		task_work_run();

#ifdef CONFIG_KEYS_REQUEST_CACHE
	if (unlikely(current->cached_requested_key)) {
		key_put(current->cached_requested_key);
		current->cached_requested_key = NULL;
	}
#endif

	mem_cgroup_handle_over_high(GFP_KERNEL);
	blkcg_maybe_throttle_current();

	rseq_handle_slowpath(regs);
}

#endif /* LINUX_RESUME_USER_MODE_H */
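/*
 * Illustrative sketch (not part of the header above, and not any particular
 * architecture's code): roughly how an exit-to-user-mode path is expected to
 * consume TIF_NOTIFY_RESUME using the helpers above.  The loop shape and the
 * function name are assumptions; the generic entry code does the equivalent
 * with its own ti_work bookkeeping.
 */
static void example_exit_to_user_loop(struct pt_regs *regs)
{
	while (test_thread_flag(TIF_NOTIFY_RESUME)) {
		/*
		 * Clears TIF_NOTIFY_RESUME and runs pending task_work, memcg
		 * over-high handling, blk-cgroup throttling and rseq work
		 * before the task drops back to user mode.
		 */
		resume_user_mode_work(regs);
	}
}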
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/page-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
 *
 * Contains functions related to writing back dirty pages at the
 * address_space level.
 *
 * 10Apr2002	Andrew Morton
 *		Initial version
 */

#include <linux/kernel.h>
#include <linux/math64.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/init.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/blkdev.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/folio_batch.h>
#include <linux/timer.h>
#include <linux/sched/rt.h>
#include <linux/sched/signal.h>
#include <linux/mm_inline.h>
#include <linux/shmem_fs.h>
#include <trace/events/writeback.h>

#include "internal.h"

/*
 * Sleep at most 200ms at a time in balance_dirty_pages().
 */
#define MAX_PAUSE		max(HZ/5, 1)

/*
 * Try to keep balance_dirty_pages() call intervals higher than this many pages
 * by raising pause time to max_pause when falls below it.
 */
#define DIRTY_POLL_THRESH	(128 >> (PAGE_SHIFT - 10))

/*
 * Estimate write bandwidth or update dirty limit at 200ms intervals.
 */
#define BANDWIDTH_INTERVAL	max(HZ/5, 1)

#define RATELIMIT_CALC_SHIFT	10

/*
 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
 * will look to see if it needs to force writeback or throttling.
 */
static long ratelimit_pages = 32;

/* The following parameters are exported via /proc/sys/vm */

/*
 * Start background writeback (via writeback threads) at this percentage
 */
static int dirty_background_ratio = 10;

/*
 * dirty_background_bytes starts at 0 (disabled) so that it is a function of
 * dirty_background_ratio * the amount of dirtyable memory
 */
static unsigned long dirty_background_bytes;

/*
 * free highmem will not be subtracted from the total free memory
 * for calculating free ratios if vm_highmem_is_dirtyable is true
 */
static int vm_highmem_is_dirtyable;

/*
 * The generator of dirty data starts writeback at this percentage
 */
static int vm_dirty_ratio = 20;

/*
 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
 * vm_dirty_ratio * the amount of dirtyable memory
 */
static unsigned long vm_dirty_bytes;

/*
 * The interval between `kupdate'-style writebacks
 */
unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
EXPORT_SYMBOL_GPL(dirty_writeback_interval);

/*
 * The longest time for which data is allowed to remain dirty
 */
unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */

/* End of sysctl-exported parameters */

struct wb_domain global_wb_domain;

/*
 * Length of period for aging writeout fractions of bdis. This is an
 * arbitrarily chosen number. The longer the period, the slower fractions will
 * reflect changes in current writeout rate.
*/ #define VM_COMPLETIONS_PERIOD_LEN (3*HZ) #ifdef CONFIG_CGROUP_WRITEBACK #define GDTC_INIT(__wb) .wb = (__wb), \ .dom = &global_wb_domain, \ .wb_completions = &(__wb)->completions #define GDTC_INIT_NO_WB .dom = &global_wb_domain #define MDTC_INIT(__wb, __gdtc) .wb = (__wb), \ .dom = mem_cgroup_wb_domain(__wb), \ .wb_completions = &(__wb)->memcg_completions, \ .gdtc = __gdtc static bool mdtc_valid(struct dirty_throttle_control *dtc) { return dtc->dom; } static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) { return dtc->dom; } static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) { return mdtc->gdtc; } static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) { return &wb->memcg_completions; } static void wb_min_max_ratio(struct bdi_writeback *wb, unsigned long *minp, unsigned long *maxp) { unsigned long this_bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); unsigned long long min = wb->bdi->min_ratio; unsigned long long max = wb->bdi->max_ratio; /* * @wb may already be clean by the time control reaches here and * the total may not include its bw. */ if (this_bw < tot_bw) { if (min) { min *= this_bw; min = div64_ul(min, tot_bw); } if (max < 100 * BDI_RATIO_SCALE) { max *= this_bw; max = div64_ul(max, tot_bw); } } *minp = min; *maxp = max; } #else /* CONFIG_CGROUP_WRITEBACK */ #define GDTC_INIT(__wb) .wb = (__wb), \ .wb_completions = &(__wb)->completions #define GDTC_INIT_NO_WB #define MDTC_INIT(__wb, __gdtc) static bool mdtc_valid(struct dirty_throttle_control *dtc) { return false; } static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) { return &global_wb_domain; } static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) { return NULL; } static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) { return NULL; } static void wb_min_max_ratio(struct bdi_writeback *wb, unsigned long *minp, unsigned long *maxp) { *minp = wb->bdi->min_ratio; *maxp = wb->bdi->max_ratio; } #endif /* CONFIG_CGROUP_WRITEBACK */ /* * In a memory zone, there is a certain amount of pages we consider * available for the page cache, which is essentially the number of * free and reclaimable pages, minus some zone reserves to protect * lowmem and the ability to uphold the zone's watermarks without * requiring writeback. * * This number of dirtyable pages is the base value of which the * user-configurable dirty ratio is the effective number of pages that * are allowed to be actually dirtied. Per individual zone, or * globally by using the sum of dirtyable pages over all zones. * * Because the user is allowed to specify the dirty limit globally as * absolute number of bytes, calculating the per-zone dirty limit can * require translating the configured limit into a percentage of * global dirtyable memory first. */ /** * node_dirtyable_memory - number of dirtyable pages in a node * @pgdat: the node * * Return: the node's number of pages potentially available for dirty * page cache. This is the base value for the per-node dirty limits. 
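 *
 * (Illustrative summary of the calculation below: the node's free pages
 * minus its totalreserve_pages, plus its inactive and active file LRU
 * pages.)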
*/ static unsigned long node_dirtyable_memory(struct pglist_data *pgdat) { unsigned long nr_pages = 0; int z; for (z = 0; z < MAX_NR_ZONES; z++) { struct zone *zone = pgdat->node_zones + z; if (!populated_zone(zone)) continue; nr_pages += zone_page_state(zone, NR_FREE_PAGES); } /* * Pages reserved for the kernel should not be considered * dirtyable, to prevent a situation where reclaim has to * clean pages in order to balance the zones. */ nr_pages -= min(nr_pages, pgdat->totalreserve_pages); nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE); nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE); return nr_pages; } static unsigned long highmem_dirtyable_memory(unsigned long total) { #ifdef CONFIG_HIGHMEM int node; unsigned long x = 0; int i; for_each_node_state(node, N_HIGH_MEMORY) { for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) { struct zone *z; unsigned long nr_pages; if (!is_highmem_idx(i)) continue; z = &NODE_DATA(node)->node_zones[i]; if (!populated_zone(z)) continue; nr_pages = zone_page_state(z, NR_FREE_PAGES); /* watch for underflows */ nr_pages -= min(nr_pages, high_wmark_pages(z)); nr_pages += zone_page_state(z, NR_ZONE_INACTIVE_FILE); nr_pages += zone_page_state(z, NR_ZONE_ACTIVE_FILE); x += nr_pages; } } /* * Make sure that the number of highmem pages is never larger * than the number of the total dirtyable memory. This can only * occur in very strange VM situations but we want to make sure * that this does not occur. */ return min(x, total); #else return 0; #endif } /** * global_dirtyable_memory - number of globally dirtyable pages * * Return: the global number of pages potentially available for dirty * page cache. This is the base value for the global dirty limits. */ static unsigned long global_dirtyable_memory(void) { unsigned long x; x = global_zone_page_state(NR_FREE_PAGES); /* * Pages reserved for the kernel should not be considered * dirtyable, to prevent a situation where reclaim has to * clean pages in order to balance the zones. */ x -= min(x, totalreserve_pages); x += global_node_page_state(NR_INACTIVE_FILE); x += global_node_page_state(NR_ACTIVE_FILE); if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); return x + 1; /* Ensure that we never return 0 */ } /** * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain * @dtc: dirty_throttle_control of interest * * Calculate @dtc->thresh and ->bg_thresh considering * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller * must ensure that @dtc->avail is set before calling this function. The * dirty limits will be lifted by 1/4 for real-time tasks. */ static void domain_dirty_limits(struct dirty_throttle_control *dtc) { const unsigned long available_memory = dtc->avail; struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc); unsigned long bytes = vm_dirty_bytes; unsigned long bg_bytes = dirty_background_bytes; /* convert ratios to per-PAGE_SIZE for higher precision */ unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100; unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100; unsigned long thresh; unsigned long bg_thresh; struct task_struct *tsk; /* gdtc is !NULL iff @dtc is for memcg domain */ if (gdtc) { unsigned long global_avail = gdtc->avail; /* * The byte settings can't be applied directly to memcg * domains. Convert them to ratios by scaling against * globally available memory. As the ratios are in * per-PAGE_SIZE, they can be obtained by dividing bytes by * number of pages. 
*/ if (bytes) ratio = min(DIV_ROUND_UP(bytes, global_avail), PAGE_SIZE); if (bg_bytes) bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail), PAGE_SIZE); bytes = bg_bytes = 0; } if (bytes) thresh = DIV_ROUND_UP(bytes, PAGE_SIZE); else thresh = (ratio * available_memory) / PAGE_SIZE; if (bg_bytes) bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE); else bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; tsk = current; if (rt_or_dl_task(tsk)) { bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; } /* * Dirty throttling logic assumes the limits in page units fit into * 32-bits. This gives 16TB dirty limits max which is hopefully enough. */ if (thresh > UINT_MAX) thresh = UINT_MAX; /* This makes sure bg_thresh is within 32-bits as well */ if (bg_thresh >= thresh) bg_thresh = thresh / 2; dtc->thresh = thresh; dtc->bg_thresh = bg_thresh; /* we should eventually report the domain in the TP */ if (!gdtc) trace_global_dirty_state(bg_thresh, thresh); } /** * global_dirty_limits - background-writeback and dirty-throttling thresholds * @pbackground: out parameter for bg_thresh * @pdirty: out parameter for thresh * * Calculate bg_thresh and thresh for global_wb_domain. See * domain_dirty_limits() for details. */ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB }; gdtc.avail = global_dirtyable_memory(); domain_dirty_limits(&gdtc); *pbackground = gdtc.bg_thresh; *pdirty = gdtc.thresh; } /** * node_dirty_limit - maximum number of dirty pages allowed in a node * @pgdat: the node * * Return: the maximum number of dirty pages allowed in a node, based * on the node's dirtyable memory. */ static unsigned long node_dirty_limit(struct pglist_data *pgdat) { unsigned long node_memory = node_dirtyable_memory(pgdat); struct task_struct *tsk = current; unsigned long dirty; if (vm_dirty_bytes) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * node_memory / global_dirtyable_memory(); else dirty = vm_dirty_ratio * node_memory / 100; if (rt_or_dl_task(tsk)) dirty += dirty / 4; /* * Dirty throttling logic assumes the limits in page units fit into * 32-bits. This gives 16TB dirty limits max which is hopefully enough. */ return min_t(unsigned long, dirty, UINT_MAX); } /** * node_dirty_ok - tells whether a node is within its dirty limits * @pgdat: the node to check * * Return: %true when the dirty pages in @pgdat are within the node's * dirty limit, %false if the limit is exceeded. 
*/ bool node_dirty_ok(struct pglist_data *pgdat) { unsigned long limit = node_dirty_limit(pgdat); unsigned long nr_pages = 0; nr_pages += node_page_state(pgdat, NR_FILE_DIRTY); nr_pages += node_page_state(pgdat, NR_WRITEBACK); return nr_pages <= limit; } #ifdef CONFIG_SYSCTL static int dirty_background_ratio_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) dirty_background_bytes = 0; return ret; } static int dirty_background_bytes_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; unsigned long old_bytes = dirty_background_bytes; ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) { if (DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE) > UINT_MAX) { dirty_background_bytes = old_bytes; return -ERANGE; } dirty_background_ratio = 0; } return ret; } static int dirty_ratio_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { vm_dirty_bytes = 0; writeback_set_ratelimit(); } return ret; } static int dirty_bytes_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; int ret; ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_bytes != old_bytes) { if (DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) > UINT_MAX) { vm_dirty_bytes = old_bytes; return -ERANGE; } writeback_set_ratelimit(); vm_dirty_ratio = 0; } return ret; } #endif static unsigned long wp_next_time(unsigned long cur_time) { cur_time += VM_COMPLETIONS_PERIOD_LEN; /* 0 has a special meaning... */ if (!cur_time) return 1; return cur_time; } static void wb_domain_writeout_add(struct wb_domain *dom, struct fprop_local_percpu *completions, unsigned int max_prop_frac, long nr) { __fprop_add_percpu_max(&dom->completions, completions, max_prop_frac, nr); /* First event after period switching was turned off? */ if (unlikely(!dom->period_time)) { /* * We can race with other wb_domain_writeout_add calls here but * it does not cause any harm since the resulting time when * timer will fire and what is in writeout_period_time will be * roughly the same. */ dom->period_time = wp_next_time(jiffies); mod_timer(&dom->period_timer, dom->period_time); } } /* * Increment @wb's writeout completion count and the global writeout * completion count. Called from __folio_end_writeback(). */ static inline void __wb_writeout_add(struct bdi_writeback *wb, long nr) { struct wb_domain *cgdom; wb_stat_mod(wb, WB_WRITTEN, nr); wb_domain_writeout_add(&global_wb_domain, &wb->completions, wb->bdi->max_prop_frac, nr); cgdom = mem_cgroup_wb_domain(wb); if (cgdom) wb_domain_writeout_add(cgdom, wb_memcg_completions(wb), wb->bdi->max_prop_frac, nr); } void wb_writeout_inc(struct bdi_writeback *wb) { unsigned long flags; local_irq_save(flags); __wb_writeout_add(wb, 1); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(wb_writeout_inc); /* * On idle system, we can be called long after we scheduled because we use * deferred timers so count with missed periods. 
*/ static void writeout_period(struct timer_list *t) { struct wb_domain *dom = timer_container_of(dom, t, period_timer); int miss_periods = (jiffies - dom->period_time) / VM_COMPLETIONS_PERIOD_LEN; if (fprop_new_period(&dom->completions, miss_periods + 1)) { dom->period_time = wp_next_time(dom->period_time + miss_periods * VM_COMPLETIONS_PERIOD_LEN); mod_timer(&dom->period_timer, dom->period_time); } else { /* * Aging has zeroed all fractions. Stop wasting CPU on period * updates. */ dom->period_time = 0; } } int wb_domain_init(struct wb_domain *dom, gfp_t gfp) { memset(dom, 0, sizeof(*dom)); spin_lock_init(&dom->lock); timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE); dom->dirty_limit_tstamp = jiffies; return fprop_global_init(&dom->completions, gfp); } #ifdef CONFIG_CGROUP_WRITEBACK void wb_domain_exit(struct wb_domain *dom) { timer_delete_sync(&dom->period_timer); fprop_global_destroy(&dom->completions); } #endif /* * bdi_min_ratio keeps the sum of the minimum dirty shares of all * registered backing devices, which, for obvious reasons, can not * exceed 100%. */ static unsigned int bdi_min_ratio; static int bdi_check_pages_limit(unsigned long pages) { unsigned long max_dirty_pages = global_dirtyable_memory(); if (pages > max_dirty_pages) return -EINVAL; return 0; } static unsigned long bdi_ratio_from_pages(unsigned long pages) { unsigned long background_thresh; unsigned long dirty_thresh; unsigned long ratio; global_dirty_limits(&background_thresh, &dirty_thresh); if (!dirty_thresh) return -EINVAL; ratio = div64_u64(pages * 100ULL * BDI_RATIO_SCALE, dirty_thresh); return ratio; } static u64 bdi_get_bytes(unsigned int ratio) { unsigned long background_thresh; unsigned long dirty_thresh; u64 bytes; global_dirty_limits(&background_thresh, &dirty_thresh); bytes = (dirty_thresh * PAGE_SIZE * ratio) / BDI_RATIO_SCALE / 100; return bytes; } static int __bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { unsigned int delta; int ret = 0; if (min_ratio > 100 * BDI_RATIO_SCALE) return -EINVAL; spin_lock_bh(&bdi_lock); if (min_ratio > bdi->max_ratio) { ret = -EINVAL; } else { if (min_ratio < bdi->min_ratio) { delta = bdi->min_ratio - min_ratio; bdi_min_ratio -= delta; bdi->min_ratio = min_ratio; } else { delta = min_ratio - bdi->min_ratio; if (bdi_min_ratio + delta < 100 * BDI_RATIO_SCALE) { bdi_min_ratio += delta; bdi->min_ratio = min_ratio; } else { ret = -EINVAL; } } } spin_unlock_bh(&bdi_lock); return ret; } static int __bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio) { int ret = 0; if (max_ratio > 100 * BDI_RATIO_SCALE) return -EINVAL; spin_lock_bh(&bdi_lock); if (bdi->min_ratio > max_ratio) { ret = -EINVAL; } else { bdi->max_ratio = max_ratio; bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / (100 * BDI_RATIO_SCALE); } spin_unlock_bh(&bdi_lock); return ret; } int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio) { return __bdi_set_min_ratio(bdi, min_ratio); } int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio) { return __bdi_set_max_ratio(bdi, max_ratio); } int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { return __bdi_set_min_ratio(bdi, min_ratio * BDI_RATIO_SCALE); } int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio) { return __bdi_set_max_ratio(bdi, max_ratio * BDI_RATIO_SCALE); } EXPORT_SYMBOL(bdi_set_max_ratio); u64 bdi_get_min_bytes(struct backing_dev_info *bdi) { return bdi_get_bytes(bdi->min_ratio); } 
int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes) { int ret; unsigned long pages = min_bytes >> PAGE_SHIFT; long min_ratio; ret = bdi_check_pages_limit(pages); if (ret) return ret; min_ratio = bdi_ratio_from_pages(pages); if (min_ratio < 0) return min_ratio; return __bdi_set_min_ratio(bdi, min_ratio); } u64 bdi_get_max_bytes(struct backing_dev_info *bdi) { return bdi_get_bytes(bdi->max_ratio); } int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes) { int ret; unsigned long pages = max_bytes >> PAGE_SHIFT; long max_ratio; ret = bdi_check_pages_limit(pages); if (ret) return ret; max_ratio = bdi_ratio_from_pages(pages); if (max_ratio < 0) return max_ratio; return __bdi_set_max_ratio(bdi, max_ratio); } int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit) { if (strict_limit > 1) return -EINVAL; spin_lock_bh(&bdi_lock); if (strict_limit) bdi->capabilities |= BDI_CAP_STRICTLIMIT; else bdi->capabilities &= ~BDI_CAP_STRICTLIMIT; spin_unlock_bh(&bdi_lock); return 0; } static unsigned long dirty_freerun_ceiling(unsigned long thresh, unsigned long bg_thresh) { return (thresh + bg_thresh) / 2; } static unsigned long hard_dirty_limit(struct wb_domain *dom, unsigned long thresh) { return max(thresh, dom->dirty_limit); } /* * Memory which can be further allocated to a memcg domain is capped by * system-wide clean memory excluding the amount being used in the domain. */ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, unsigned long filepages, unsigned long headroom) { struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc); unsigned long clean = filepages - min(filepages, mdtc->dirty); unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty); unsigned long other_clean = global_clean - min(global_clean, clean); mdtc->avail = filepages + min(headroom, other_clean); } static inline bool dtc_is_global(struct dirty_throttle_control *dtc) { return mdtc_gdtc(dtc) == NULL; } /* * Dirty background will ignore pages being written as we're trying to * decide whether to put more under writeback. */ static void domain_dirty_avail(struct dirty_throttle_control *dtc, bool include_writeback) { if (dtc_is_global(dtc)) { dtc->avail = global_dirtyable_memory(); dtc->dirty = global_node_page_state(NR_FILE_DIRTY); if (include_writeback) dtc->dirty += global_node_page_state(NR_WRITEBACK); } else { unsigned long filepages = 0, headroom = 0, writeback = 0; mem_cgroup_wb_stats(dtc->wb, &filepages, &headroom, &dtc->dirty, &writeback); if (include_writeback) dtc->dirty += writeback; mdtc_calc_avail(dtc, filepages, headroom); } } /** * __wb_calc_thresh - @wb's share of dirty threshold * @dtc: dirty_throttle_context of interest * @thresh: dirty throttling or dirty background threshold of wb_domain in @dtc * * Note that balance_dirty_pages() will only seriously take dirty throttling * threshold as a hard limit when sleeping max_pause per page is not enough * to keep the dirty pages under control. For example, when the device is * completely stalled due to some error conditions, or when there are 1000 * dd tasks writing to a slow 10MB/s USB key. * In the other normal situations, it acts more gently by throttling the tasks * more (rather than completely block them) when the wb dirty pages go high. 
* * It allocates high/low dirty limits to fast/slow devices, in order to prevent * - starving fast devices * - piling up dirty pages (that will take long time to sync) on slow devices * * The wb's share of dirty limit will be adapting to its throughput and * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. * * Return: @wb's dirty limit in pages. For dirty throttling limit, the term * "dirty" in the context of dirty balancing includes all PG_dirty and * PG_writeback pages. */ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc, unsigned long thresh) { struct wb_domain *dom = dtc_dom(dtc); struct bdi_writeback *wb = dtc->wb; u64 wb_thresh; u64 wb_max_thresh; unsigned long numerator, denominator; unsigned long wb_min_ratio, wb_max_ratio; /* * Calculate this wb's share of the thresh ratio. */ fprop_fraction_percpu(&dom->completions, dtc->wb_completions, &numerator, &denominator); wb_thresh = (thresh * (100 * BDI_RATIO_SCALE - bdi_min_ratio)) / (100 * BDI_RATIO_SCALE); wb_thresh *= numerator; wb_thresh = div64_ul(wb_thresh, denominator); wb_min_max_ratio(wb, &wb_min_ratio, &wb_max_ratio); wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE); /* * It's very possible that wb_thresh is close to 0 not because the * device is slow, but that it has remained inactive for long time. * Honour such devices a reasonable good (hopefully IO efficient) * threshold, so that the occasional writes won't be blocked and active * writes can rampup the threshold quickly. */ if (thresh > dtc->dirty) { if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 100); else wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 8); } wb_max_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE); if (wb_thresh > wb_max_thresh) wb_thresh = wb_max_thresh; return wb_thresh; } unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; domain_dirty_avail(&gdtc, true); return __wb_calc_thresh(&gdtc, thresh); } unsigned long cgwb_calc_thresh(struct bdi_writeback *wb) { struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB }; struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) }; domain_dirty_avail(&gdtc, true); domain_dirty_avail(&mdtc, true); domain_dirty_limits(&mdtc); return __wb_calc_thresh(&mdtc, mdtc.thresh); } /* * setpoint - dirty 3 * f(dirty) := 1.0 + (----------------) * limit - setpoint * * it's a 3rd order polynomial that subjects to * * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast * (2) f(setpoint) = 1.0 => the balance point * (3) f(limit) = 0 => the hard limit * (4) df/dx <= 0 => negative feedback control * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) * => fast response on large errors; small oscillation near setpoint */ static long long pos_ratio_polynom(unsigned long setpoint, unsigned long dirty, unsigned long limit) { long long pos_ratio; long x; x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, (limit - setpoint) | 1); pos_ratio = x; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; pos_ratio += 1 << RATELIMIT_CALC_SHIFT; return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT); } /* * Dirty position control. * * (o) global/bdi setpoints * * We want the dirty pages be balanced around the global/wb setpoints. 
* When the number of dirty pages is higher/lower than the setpoint, the * dirty position control ratio (and hence task dirty ratelimit) will be * decreased/increased to bring the dirty pages back to the setpoint. * * pos_ratio = 1 << RATELIMIT_CALC_SHIFT * * if (dirty < setpoint) scale up pos_ratio * if (dirty > setpoint) scale down pos_ratio * * if (wb_dirty < wb_setpoint) scale up pos_ratio * if (wb_dirty > wb_setpoint) scale down pos_ratio * * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT * * (o) global control line * * ^ pos_ratio * | * | |<===== global dirty control scope ======>| * 2.0 * * * * * * * * | .* * | . * * | . * * | . * * | . * * | . * * 1.0 ................................* * | . . * * | . . * * | . . * * | . . * * | . . * * 0 +------------.------------------.----------------------*-------------> * freerun^ setpoint^ limit^ dirty pages * * (o) wb control line * * ^ pos_ratio * | * | * * | * * | * * | * * | * |<=========== span ============>| * 1.0 .......................* * | . * * | . * * | . * * | . * * | . * * | . * * | . * * | . * * | . * * | . * * | . * * 1/4 ...............................................* * * * * * * * * * * * * | . . * | . . * | . . * 0 +----------------------.-------------------------------.-------------> * wb_setpoint^ x_intercept^ * * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can * be smoothly throttled down to normal if it starts high in situations like * - start writing to a slow SD card and a fast disk at the same time. The SD * card's wb_dirty may rush to many times higher than wb_setpoint. * - the wb dirty thresh drops quickly due to change of JBOD workload */ static void wb_position_ratio(struct dirty_throttle_control *dtc) { struct bdi_writeback *wb = dtc->wb; unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); unsigned long limit = dtc->limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); unsigned long wb_thresh = dtc->wb_thresh; unsigned long x_intercept; unsigned long setpoint; /* dirty pages' target balance point */ unsigned long wb_setpoint; unsigned long span; long long pos_ratio; /* for scaling up/down the rate limit */ long x; dtc->pos_ratio = 0; if (unlikely(dtc->dirty >= limit)) return; /* * global setpoint * * See comment for pos_ratio_polynom(). */ setpoint = (freerun + limit) / 2; pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit); /* * The strictlimit feature is a tool preventing mistrusted filesystems * from growing a large number of dirty pages before throttling. For * such filesystems balance_dirty_pages always checks wb counters * against wb limits. Even if global "nr_dirty" is under "freerun". * This is especially important for fuse which sets bdi->max_ratio to * 1% by default. * * Here, in wb_position_ratio(), we calculate pos_ratio based on * two values: wb_dirty and wb_thresh. Let's consider an example: * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global * limits are set by default to 10% and 20% (background and throttle). * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is * about ~6K pages (as the average of background and throttle wb * limits). The 3rd order polynomial will provide positive feedback if * wb_dirty is under wb_setpoint and vice versa. 
* * Note, that we cannot use global counters in these calculations * because we want to throttle process writing to a strictlimit wb * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB * in the example above). */ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { long long wb_pos_ratio; if (dtc->wb_dirty >= wb_thresh) return; wb_setpoint = dirty_freerun_ceiling(wb_thresh, dtc->wb_bg_thresh); if (wb_setpoint == 0 || wb_setpoint == wb_thresh) return; wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty, wb_thresh); /* * Typically, for strictlimit case, wb_setpoint << setpoint * and pos_ratio >> wb_pos_ratio. In the other words global * state ("dirty") is not limiting factor and we have to * make decision based on wb counters. But there is an * important case when global pos_ratio should get precedence: * global limits are exceeded (e.g. due to activities on other * wb's) while given strictlimit wb is below limit. * * "pos_ratio * wb_pos_ratio" would work for the case above, * but it would look too non-natural for the case of all * activity in the system coming from a single strictlimit wb * with bdi->max_ratio == 100%. * * Note that min() below somewhat changes the dynamics of the * control system. Normally, pos_ratio value can be well over 3 * (when globally we are at freerun and wb is well below wb * setpoint). Now the maximum pos_ratio in the same situation * is 2. We might want to tweak this if we observe the control * system is too slow to adapt. */ dtc->pos_ratio = min(pos_ratio, wb_pos_ratio); return; } /* * We have computed basic pos_ratio above based on global situation. If * the wb is over/under its share of dirty pages, we want to scale * pos_ratio further down/up. That is done by the following mechanism. */ /* * wb setpoint * * f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint) * * x_intercept - wb_dirty * := -------------------------- * x_intercept - wb_setpoint * * The main wb control line is a linear function that subjects to * * (1) f(wb_setpoint) = 1.0 * (2) k = - 1 / (8 * write_bw) (in single wb case) * or equally: x_intercept = wb_setpoint + 8 * write_bw * * For single wb case, the dirty pages are observed to fluctuate * regularly within range * [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2] * for various filesystems, where (2) can yield in a reasonable 12.5% * fluctuation range for pos_ratio. * * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its * own size, so move the slope over accordingly and choose a slope that * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh. */ if (unlikely(wb_thresh > dtc->thresh)) wb_thresh = dtc->thresh; /* * scale global setpoint to wb's: * wb_setpoint = setpoint * wb_thresh / thresh */ x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1); wb_setpoint = setpoint * (u64)x >> 16; /* * Use span=(8*write_bw) in single wb case as indicated by * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case. * * wb_thresh thresh - wb_thresh * span = --------- * (8 * write_bw) + ------------------ * wb_thresh * thresh thresh */ span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16; x_intercept = wb_setpoint + span; if (dtc->wb_dirty < x_intercept - span / 4) { pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty), (x_intercept - wb_setpoint) | 1); } else pos_ratio /= 4; /* * wb reserve area, safeguard against dirty pool underrun and disk idle * It may push the desired control point of global dirty pages higher * than setpoint. 
*/ x_intercept = wb_thresh / 2; if (dtc->wb_dirty < x_intercept) { if (dtc->wb_dirty > x_intercept / 8) pos_ratio = div_u64(pos_ratio * x_intercept, dtc->wb_dirty); else pos_ratio *= 8; } dtc->pos_ratio = pos_ratio; } static void wb_update_write_bandwidth(struct bdi_writeback *wb, unsigned long elapsed, unsigned long written) { const unsigned long period = roundup_pow_of_two(3 * HZ); unsigned long avg = wb->avg_write_bandwidth; unsigned long old = wb->write_bandwidth; u64 bw; /* * bw = written * HZ / elapsed * * bw * elapsed + write_bandwidth * (period - elapsed) * write_bandwidth = --------------------------------------------------- * period * * @written may have decreased due to folio_redirty_for_writepage(). * Avoid underflowing @bw calculation. */ bw = written - min(written, wb->written_stamp); bw *= HZ; if (unlikely(elapsed > period)) { bw = div64_ul(bw, elapsed); avg = bw; goto out; } bw += (u64)wb->write_bandwidth * (period - elapsed); bw >>= ilog2(period); /* * one more level of smoothing, for filtering out sudden spikes */ if (avg > old && old >= (unsigned long)bw) avg -= (avg - old) >> 3; if (avg < old && old <= (unsigned long)bw) avg += (old - avg) >> 3; out: /* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */ avg = max(avg, 1LU); if (wb_has_dirty_io(wb)) { long delta = avg - wb->avg_write_bandwidth; WARN_ON_ONCE(atomic_long_add_return(delta, &wb->bdi->tot_write_bandwidth) <= 0); } wb->write_bandwidth = bw; WRITE_ONCE(wb->avg_write_bandwidth, avg); } static void update_dirty_limit(struct dirty_throttle_control *dtc) { struct wb_domain *dom = dtc_dom(dtc); unsigned long thresh = dtc->thresh; unsigned long limit = dom->dirty_limit; /* * Follow up in one step. */ if (limit < thresh) { limit = thresh; goto update; } /* * Follow down slowly. Use the higher one as the target, because thresh * may drop below dirty. This is exactly the reason to introduce * dom->dirty_limit which is guaranteed to lie above the dirty pages. */ thresh = max(thresh, dtc->dirty); if (limit > thresh) { limit -= (limit - thresh) >> 5; goto update; } return; update: dom->dirty_limit = limit; } static void domain_update_dirty_limit(struct dirty_throttle_control *dtc, unsigned long now) { struct wb_domain *dom = dtc_dom(dtc); /* * check locklessly first to optimize away locking for the most time */ if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) return; spin_lock(&dom->lock); if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) { update_dirty_limit(dtc); dom->dirty_limit_tstamp = now; } spin_unlock(&dom->lock); } /* * Maintain wb->dirty_ratelimit, the base dirty throttle rate. * * Normal wb tasks will be curbed at or below it in long term. * Obviously it should be around (write_bw / N) when there are N dd tasks. 
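 *
 * For example (illustrative): four dd tasks writing to a disk that sustains
 * ~100MB/s should each end up throttled to roughly 25MB/s in the long run.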
*/ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, unsigned long dirtied, unsigned long elapsed) { struct bdi_writeback *wb = dtc->wb; unsigned long dirty = dtc->dirty; unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); unsigned long setpoint = (freerun + limit) / 2; unsigned long write_bw = wb->avg_write_bandwidth; unsigned long dirty_ratelimit = wb->dirty_ratelimit; unsigned long dirty_rate; unsigned long task_ratelimit; unsigned long balanced_dirty_ratelimit; unsigned long step; unsigned long x; unsigned long shift; /* * The dirty rate will match the writeout rate in long term, except * when dirty pages are truncated by userspace or re-dirtied by FS. */ dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed; /* * task_ratelimit reflects each dd's dirty rate for the past 200ms. */ task_ratelimit = (u64)dirty_ratelimit * dtc->pos_ratio >> RATELIMIT_CALC_SHIFT; task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */ /* * A linear estimation of the "balanced" throttle rate. The theory is, * if there are N dd tasks, each throttled at task_ratelimit, the wb's * dirty_rate will be measured to be (N * task_ratelimit). So the below * formula will yield the balanced rate limit (write_bw / N). * * Note that the expanded form is not a pure rate feedback: * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) (1) * but also takes pos_ratio into account: * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio (2) * * (1) is not realistic because pos_ratio also takes part in balancing * the dirty rate. Consider the state * pos_ratio = 0.5 (3) * rate = 2 * (write_bw / N) (4) * If (1) is used, it will stuck in that state! Because each dd will * be throttled at * task_ratelimit = pos_ratio * rate = (write_bw / N) (5) * yielding * dirty_rate = N * task_ratelimit = write_bw (6) * put (6) into (1) we get * rate_(i+1) = rate_(i) (7) * * So we end up using (2) to always keep * rate_(i+1) ~= (write_bw / N) (8) * regardless of the value of pos_ratio. As long as (8) is satisfied, * pos_ratio is able to drive itself to 1.0, which is not only where * the dirty count meet the setpoint, but also where the slope of * pos_ratio is most flat and hence task_ratelimit is least fluctuated. */ balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw, dirty_rate | 1); /* * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw */ if (unlikely(balanced_dirty_ratelimit > write_bw)) balanced_dirty_ratelimit = write_bw; /* * We could safely do this and return immediately: * * wb->dirty_ratelimit = balanced_dirty_ratelimit; * * However to get a more stable dirty_ratelimit, the below elaborated * code makes use of task_ratelimit to filter out singular points and * limit the step size. * * The below code essentially only uses the relative value of * * task_ratelimit - dirty_ratelimit * = (pos_ratio - 1) * dirty_ratelimit * * which reflects the direction and size of dirty position error. */ /* * dirty_ratelimit will follow balanced_dirty_ratelimit iff * task_ratelimit is on the same side of dirty_ratelimit, too. * For example, when * - dirty_ratelimit > balanced_dirty_ratelimit * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint) * lowering dirty_ratelimit will help meet both the position and rate * control targets. Otherwise, don't update dirty_ratelimit if it will * only help meet the rate target. 
After all, what the users ultimately * feel and care are stable dirty rate and small position error. * * |task_ratelimit - dirty_ratelimit| is used to limit the step size * and filter out the singular points of balanced_dirty_ratelimit. Which * keeps jumping around randomly and can even leap far away at times * due to the small 200ms estimation period of dirty_rate (we want to * keep that period small to reduce time lags). */ step = 0; /* * For strictlimit case, calculations above were based on wb counters * and limits (starting from pos_ratio = wb_position_ratio() and up to * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). * Hence, to calculate "step" properly, we have to use wb_dirty as * "dirty" and wb_setpoint as "setpoint". */ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { dirty = dtc->wb_dirty; setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2; } if (dirty < setpoint) { x = min3(wb->balanced_dirty_ratelimit, balanced_dirty_ratelimit, task_ratelimit); if (dirty_ratelimit < x) step = x - dirty_ratelimit; } else { x = max3(wb->balanced_dirty_ratelimit, balanced_dirty_ratelimit, task_ratelimit); if (dirty_ratelimit > x) step = dirty_ratelimit - x; } /* * Don't pursue 100% rate matching. It's impossible since the balanced * rate itself is constantly fluctuating. So decrease the track speed * when it gets close to the target. Helps eliminate pointless tremors. */ shift = dirty_ratelimit / (2 * step + 1); if (shift < BITS_PER_LONG) step = DIV_ROUND_UP(step >> shift, 8); else step = 0; if (dirty_ratelimit < balanced_dirty_ratelimit) dirty_ratelimit += step; else dirty_ratelimit -= step; WRITE_ONCE(wb->dirty_ratelimit, max(dirty_ratelimit, 1UL)); wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit; trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit); } static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, struct dirty_throttle_control *mdtc, bool update_ratelimit) { struct bdi_writeback *wb = gdtc->wb; unsigned long now = jiffies; unsigned long elapsed; unsigned long dirtied; unsigned long written; spin_lock(&wb->list_lock); /* * Lockless checks for elapsed time are racy and delayed update after * IO completion doesn't do it at all (to make sure written pages are * accounted reasonably quickly). Make sure elapsed >= 1 to avoid * division errors. */ elapsed = max(now - wb->bw_time_stamp, 1UL); dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]); written = percpu_counter_read(&wb->stat[WB_WRITTEN]); if (update_ratelimit) { domain_update_dirty_limit(gdtc, now); wb_update_dirty_ratelimit(gdtc, dirtied, elapsed); /* * @mdtc is always NULL if !CGROUP_WRITEBACK but the * compiler has no way to figure that out. Help it. 
*/ if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) { domain_update_dirty_limit(mdtc, now); wb_update_dirty_ratelimit(mdtc, dirtied, elapsed); } } wb_update_write_bandwidth(wb, elapsed, written); wb->dirtied_stamp = dirtied; wb->written_stamp = written; WRITE_ONCE(wb->bw_time_stamp, now); spin_unlock(&wb->list_lock); } void wb_update_bandwidth(struct bdi_writeback *wb) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; __wb_update_bandwidth(&gdtc, NULL, false); } /* Interval after which we consider wb idle and don't estimate bandwidth */ #define WB_BANDWIDTH_IDLE_JIF (HZ) static void wb_bandwidth_estimate_start(struct bdi_writeback *wb) { unsigned long now = jiffies; unsigned long elapsed = now - READ_ONCE(wb->bw_time_stamp); if (elapsed > WB_BANDWIDTH_IDLE_JIF && !atomic_read(&wb->writeback_inodes)) { spin_lock(&wb->list_lock); wb->dirtied_stamp = wb_stat(wb, WB_DIRTIED); wb->written_stamp = wb_stat(wb, WB_WRITTEN); WRITE_ONCE(wb->bw_time_stamp, now); spin_unlock(&wb->list_lock); } } /* * After a task dirtied this many pages, balance_dirty_pages_ratelimited() * will look to see if it needs to start dirty throttling. * * If dirty_poll_interval is too low, big NUMA machines will call the expensive * global_zone_page_state() too often. So scale it near-sqrt to the safety margin * (the number of pages we may dirty without exceeding the dirty limits). */ static unsigned long dirty_poll_interval(unsigned long dirty, unsigned long thresh) { if (thresh > dirty) return 1UL << (ilog2(thresh - dirty) >> 1); return 1; } static unsigned long wb_max_pause(struct bdi_writeback *wb, unsigned long wb_dirty) { unsigned long bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long t; /* * Limit pause time for small memory systems. If sleeping for too long * time, a small pool of dirty/writeback pages may go empty and disk go * idle. * * 8 serves as the safety ratio. */ t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); t++; return min_t(unsigned long, t, MAX_PAUSE); } static long wb_min_pause(struct bdi_writeback *wb, long max_pause, unsigned long task_ratelimit, unsigned long dirty_ratelimit, int *nr_dirtied_pause) { long hi = ilog2(READ_ONCE(wb->avg_write_bandwidth)); long lo = ilog2(READ_ONCE(wb->dirty_ratelimit)); long t; /* target pause */ long pause; /* estimated next pause */ int pages; /* target nr_dirtied_pause */ /* target for 10ms pause on 1-dd case */ t = max(1, HZ / 100); /* * Scale up pause time for concurrent dirtiers in order to reduce CPU * overheads. * * (N * 10ms) on 2^N concurrent tasks. */ if (hi > lo) t += (hi - lo) * (10 * HZ) / 1024; /* * This is a bit convoluted. We try to base the next nr_dirtied_pause * on the much more stable dirty_ratelimit. However the next pause time * will be computed based on task_ratelimit and the two rate limits may * depart considerably at some time. Especially if task_ratelimit goes * below dirty_ratelimit/2 and the target pause is max_pause, the next * pause time will be max_pause*2 _trimmed down_ to max_pause. As a * result task_ratelimit won't be executed faithfully, which could * eventually bring down dirty_ratelimit. * * We apply two rules to fix it up: * 1) try to estimate the next pause time and if necessary, use a lower * nr_dirtied_pause so as not to exceed max_pause. When this happens, * nr_dirtied_pause will be "dancing" with task_ratelimit. 
* 2) limit the target pause time to max_pause/2, so that the normal * small fluctuations of task_ratelimit won't trigger rule (1) and * nr_dirtied_pause will remain as stable as dirty_ratelimit. */ t = min(t, 1 + max_pause / 2); pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); /* * Tiny nr_dirtied_pause is found to hurt I/O performance in the test * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}. * When the 16 consecutive reads are often interrupted by some dirty * throttling pause during the async writes, cfq will go into idles * (deadline is fine). So push nr_dirtied_pause as high as possible * until reaches DIRTY_POLL_THRESH=32 pages. */ if (pages < DIRTY_POLL_THRESH) { t = max_pause; pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); if (pages > DIRTY_POLL_THRESH) { pages = DIRTY_POLL_THRESH; t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit; } } pause = HZ * pages / (task_ratelimit + 1); if (pause > max_pause) { t = max_pause; pages = task_ratelimit * t / roundup_pow_of_two(HZ); } *nr_dirtied_pause = pages; /* * The minimal pause time will normally be half the target pause time. */ return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t; } static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) { struct bdi_writeback *wb = dtc->wb; unsigned long wb_reclaimable; /* * wb_thresh is not treated as some limiting factor as * dirty_thresh, due to reasons * - in JBOD setup, wb_thresh can fluctuate a lot * - in a system with HDD and USB key, the USB key may somehow * go into state (wb_dirty >> wb_thresh) either because * wb_dirty starts high, or because wb_thresh drops low. * In this case we don't want to hard throttle the USB key * dirtiers for 100 seconds until wb_dirty drops under * wb_thresh. Instead the auxiliary wb control line in * wb_position_ratio() will let the dirtier task progress * at some rate <= (write_bw / 2) for bringing down wb_dirty. */ dtc->wb_thresh = __wb_calc_thresh(dtc, dtc->thresh); dtc->wb_bg_thresh = dtc->thresh ? div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0; /* * In order to avoid the stacked BDI deadlock we need * to ensure we accurately count the 'dirty' pages when * the threshold is low. * * Otherwise it would be possible to get thresh+n pages * reported dirty, even though there are thresh-m pages * actually dirty; with m+n sitting in the percpu * deltas. */ if (dtc->wb_thresh < 2 * wb_stat_error()) { wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK); } else { wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE); dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK); } } static unsigned long domain_poll_intv(struct dirty_throttle_control *dtc, bool strictlimit) { unsigned long dirty, thresh; if (strictlimit) { dirty = dtc->wb_dirty; thresh = dtc->wb_thresh; } else { dirty = dtc->dirty; thresh = dtc->thresh; } return dirty_poll_interval(dirty, thresh); } /* * Throttle it only when the background writeback cannot catch-up. This avoids * (excessively) small writeouts when the wb limits are ramping up in case of * !strictlimit. * * In strictlimit case make decision based on the wb counters and limits. Small * writeouts when the wb limits are ramping up are the price we consciously pay * for strictlimit-ing. 
*/ static void domain_dirty_freerun(struct dirty_throttle_control *dtc, bool strictlimit) { unsigned long dirty, thresh, bg_thresh; if (unlikely(strictlimit)) { wb_dirty_limits(dtc); dirty = dtc->wb_dirty; thresh = dtc->wb_thresh; bg_thresh = dtc->wb_bg_thresh; } else { dirty = dtc->dirty; thresh = dtc->thresh; bg_thresh = dtc->bg_thresh; } dtc->freerun = dirty <= dirty_freerun_ceiling(thresh, bg_thresh); } static void balance_domain_limits(struct dirty_throttle_control *dtc, bool strictlimit) { domain_dirty_avail(dtc, true); domain_dirty_limits(dtc); domain_dirty_freerun(dtc, strictlimit); } static void wb_dirty_freerun(struct dirty_throttle_control *dtc, bool strictlimit) { dtc->freerun = false; /* was already handled in domain_dirty_freerun */ if (strictlimit) return; wb_dirty_limits(dtc); /* * LOCAL_THROTTLE tasks must not be throttled when below the per-wb * freerun ceiling. */ if (!(current->flags & PF_LOCAL_THROTTLE)) return; dtc->freerun = dtc->wb_dirty < dirty_freerun_ceiling(dtc->wb_thresh, dtc->wb_bg_thresh); } static inline void wb_dirty_exceeded(struct dirty_throttle_control *dtc, bool strictlimit) { dtc->dirty_exceeded = (dtc->wb_dirty > dtc->wb_thresh) && ((dtc->dirty > dtc->thresh) || strictlimit); } /* * The limits fields dirty_exceeded and pos_ratio won't be updated if wb is * in freerun state. Please don't use these invalid fields in freerun case. */ static void balance_wb_limits(struct dirty_throttle_control *dtc, bool strictlimit) { wb_dirty_freerun(dtc, strictlimit); if (dtc->freerun) return; wb_dirty_exceeded(dtc, strictlimit); wb_position_ratio(dtc); } /* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2. * If we're over `background_thresh' then the writeback threads are woken to * perform some writeout. */ static int balance_dirty_pages(struct bdi_writeback *wb, unsigned long pages_dirtied, unsigned int flags) { struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; struct dirty_throttle_control * const gdtc = &gdtc_stor; struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? &mdtc_stor : NULL; struct dirty_throttle_control *sdtc; unsigned long nr_dirty; long period; long pause; long max_pause; long min_pause; int nr_dirtied_pause; unsigned long task_ratelimit; unsigned long dirty_ratelimit; struct backing_dev_info *bdi = wb->bdi; bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; unsigned long start_time = jiffies; int ret = 0; for (;;) { unsigned long now = jiffies; nr_dirty = global_node_page_state(NR_FILE_DIRTY); balance_domain_limits(gdtc, strictlimit); if (mdtc) { /* * If @wb belongs to !root memcg, repeat the same * basic calculations for the memcg domain. */ balance_domain_limits(mdtc, strictlimit); } if (nr_dirty > gdtc->bg_thresh && !writeback_in_progress(wb)) wb_start_background_writeback(wb); /* * If memcg domain is in effect, @dirty should be under * both global and memcg freerun ceilings. 
*/ if (gdtc->freerun && (!mdtc || mdtc->freerun)) { unsigned long intv; unsigned long m_intv; free_running: intv = domain_poll_intv(gdtc, strictlimit); m_intv = ULONG_MAX; current->dirty_paused_when = now; current->nr_dirtied = 0; if (mdtc) m_intv = domain_poll_intv(mdtc, strictlimit); current->nr_dirtied_pause = min(intv, m_intv); break; } /* * Unconditionally start background writeback if it's not * already in progress. We need to do this because the global * dirty threshold check above (nr_dirty > gdtc->bg_thresh) * doesn't account for these cases: * * a) strictlimit BDIs: throttling is calculated using per-wb * thresholds. The per-wb threshold can be exceeded even when * nr_dirty < gdtc->bg_thresh * * b) memcg-based throttling: memcg uses its own dirty count and * thresholds and can trigger throttling even when global * nr_dirty < gdtc->bg_thresh * * Writeback needs to be started else the writer stalls in the * throttle loop waiting for dirty pages to be written back * while no writeback is running. */ if (unlikely(!writeback_in_progress(wb))) wb_start_background_writeback(wb); mem_cgroup_flush_foreign(wb); /* * Calculate global domain's pos_ratio and select the * global dtc by default. */ balance_wb_limits(gdtc, strictlimit); if (gdtc->freerun) goto free_running; sdtc = gdtc; if (mdtc) { /* * If memcg domain is in effect, calculate its * pos_ratio. @wb should satisfy constraints from * both global and memcg domains. Choose the one * w/ lower pos_ratio. */ balance_wb_limits(mdtc, strictlimit); if (mdtc->freerun) goto free_running; if (mdtc->pos_ratio < gdtc->pos_ratio) sdtc = mdtc; } wb->dirty_exceeded = gdtc->dirty_exceeded || (mdtc && mdtc->dirty_exceeded); if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + BANDWIDTH_INTERVAL)) __wb_update_bandwidth(gdtc, mdtc, true); /* throttle according to the chosen dtc */ dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit); task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >> RATELIMIT_CALC_SHIFT; max_pause = wb_max_pause(wb, sdtc->wb_dirty); min_pause = wb_min_pause(wb, max_pause, task_ratelimit, dirty_ratelimit, &nr_dirtied_pause); if (unlikely(task_ratelimit == 0)) { period = max_pause; pause = max_pause; goto pause; } period = HZ * pages_dirtied / task_ratelimit; pause = period; if (current->dirty_paused_when) pause -= now - current->dirty_paused_when; /* * For less than 1s think time (ext3/4 may block the dirtier * for up to 800ms from time to time on 1-HDD; so does xfs, * however at much less frequency), try to compensate it in * future periods by updating the virtual time; otherwise just * do a reset, as it may be a light dirtier. 
*/ if (pause < min_pause) { trace_balance_dirty_pages(wb, sdtc, dirty_ratelimit, task_ratelimit, pages_dirtied, period, min(pause, 0L), start_time); if (pause < -HZ) { current->dirty_paused_when = now; current->nr_dirtied = 0; } else if (period) { current->dirty_paused_when += period; current->nr_dirtied = 0; } else if (current->nr_dirtied_pause <= pages_dirtied) current->nr_dirtied_pause += pages_dirtied; break; } if (unlikely(pause > max_pause)) { /* for occasional dropped task_ratelimit */ now += min(pause - max_pause, max_pause); pause = max_pause; } pause: trace_balance_dirty_pages(wb, sdtc, dirty_ratelimit, task_ratelimit, pages_dirtied, period, pause, start_time); if (flags & BDP_ASYNC) { ret = -EAGAIN; break; } __set_current_state(TASK_KILLABLE); bdi->last_bdp_sleep = jiffies; io_schedule_timeout(pause); current->dirty_paused_when = now + pause; current->nr_dirtied = 0; current->nr_dirtied_pause = nr_dirtied_pause; /* * This is typically equal to (dirty < thresh) and can also * keep "1000+ dd on a slow USB stick" under control. */ if (task_ratelimit) break; /* * In the case of an unresponsive NFS server and the NFS dirty * pages exceeds dirty_thresh, give the other good wb's a pipe * to go through, so that tasks on them still remain responsive. * * In theory 1 page is enough to keep the consumer-producer * pipe going: the flusher cleans 1 page => the task dirties 1 * more page. However wb_dirty has accounting errors. So use * the larger and more IO friendly wb_stat_error. */ if (sdtc->wb_dirty <= wb_stat_error()) break; if (fatal_signal_pending(current)) break; } return ret; } static DEFINE_PER_CPU(int, bdp_ratelimits); /* * Normal tasks are throttled by * loop { * dirty tsk->nr_dirtied_pause pages; * take a snap in balance_dirty_pages(); * } * However there is a worst case. If every task exit immediately when dirtied * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be * called to throttle the page dirties. The solution is to save the not yet * throttled page dirties in dirty_throttle_leaks on task exit and charge them * randomly into the running tasks. This works well for the above worst case, * as the new task will pick up and accumulate the old task's leaked dirty * count and eventually get throttled. */ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; /** * balance_dirty_pages_ratelimited_flags - Balance dirty memory state. * @mapping: address_space which was dirtied. * @flags: BDP flags. * * Processes which are dirtying memory should call in here once for each page * which was newly dirtied. The function will periodically check the system's * dirty state and will initiate writeback if needed. * * See balance_dirty_pages_ratelimited() for details. * * Return: If @flags contains BDP_ASYNC, it may return -EAGAIN to * indicate that memory is out of balance and the caller must wait * for I/O to complete. Otherwise, it will return 0 to indicate * that either memory was already in balance, or it was able to sleep * until the amount of dirty memory returned to balance. 
*/ int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, unsigned int flags) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); struct bdi_writeback *wb = NULL; int ratelimit; int ret = 0; int *p; if (!(bdi->capabilities & BDI_CAP_WRITEBACK)) return ret; if (inode_cgwb_enabled(inode)) wb = wb_get_create_current(bdi, GFP_KERNEL); if (!wb) wb = &bdi->wb; ratelimit = current->nr_dirtied_pause; if (wb->dirty_exceeded) ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10)); preempt_disable(); /* * This prevents one CPU to accumulate too many dirtied pages without * calling into balance_dirty_pages(), which can happen when there are * 1000+ tasks, all of them start dirtying pages at exactly the same * time, hence all honoured too large initial task->nr_dirtied_pause. */ p = this_cpu_ptr(&bdp_ratelimits); if (unlikely(current->nr_dirtied >= ratelimit)) *p = 0; else if (unlikely(*p >= ratelimit_pages)) { *p = 0; ratelimit = 0; } /* * Pick up the dirtied pages by the exited tasks. This avoids lots of * short-lived tasks (eg. gcc invocations in a kernel build) escaping * the dirty throttling and livelock other long-run dirtiers. */ p = this_cpu_ptr(&dirty_throttle_leaks); if (*p > 0 && current->nr_dirtied < ratelimit) { unsigned long nr_pages_dirtied; nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); *p -= nr_pages_dirtied; current->nr_dirtied += nr_pages_dirtied; } preempt_enable(); if (unlikely(current->nr_dirtied >= ratelimit)) ret = balance_dirty_pages(wb, current->nr_dirtied, flags); wb_put(wb); return ret; } EXPORT_SYMBOL_GPL(balance_dirty_pages_ratelimited_flags); /** * balance_dirty_pages_ratelimited - balance dirty memory state. * @mapping: address_space which was dirtied. * * Processes which are dirtying memory should call in here once for each page * which was newly dirtied. The function will periodically check the system's * dirty state and will initiate writeback if needed. * * Once we're over the dirty memory limit we decrease the ratelimiting * by a lot, to prevent individual processes from overshooting the limit * by (ratelimit_pages) each. */ void balance_dirty_pages_ratelimited(struct address_space *mapping) { balance_dirty_pages_ratelimited_flags(mapping, 0); } EXPORT_SYMBOL(balance_dirty_pages_ratelimited); /* * Similar to wb_dirty_limits, wb_bg_dirty_limits also calculates dirty * and thresh, but it's for background writeback. */ static void wb_bg_dirty_limits(struct dirty_throttle_control *dtc) { struct bdi_writeback *wb = dtc->wb; dtc->wb_bg_thresh = __wb_calc_thresh(dtc, dtc->bg_thresh); if (dtc->wb_bg_thresh < 2 * wb_stat_error()) dtc->wb_dirty = wb_stat_sum(wb, WB_RECLAIMABLE); else dtc->wb_dirty = wb_stat(wb, WB_RECLAIMABLE); } static bool domain_over_bg_thresh(struct dirty_throttle_control *dtc) { domain_dirty_avail(dtc, false); domain_dirty_limits(dtc); if (dtc->dirty > dtc->bg_thresh) return true; wb_bg_dirty_limits(dtc); if (dtc->wb_dirty > dtc->wb_bg_thresh) return true; return false; } /** * wb_over_bg_thresh - does @wb need to be written back? * @wb: bdi_writeback of interest * * Determines whether background writeback should keep writing @wb or it's * clean enough. * * Return: %true if writeback should continue. 
*/ bool wb_over_bg_thresh(struct bdi_writeback *wb) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) }; if (domain_over_bg_thresh(&gdtc)) return true; if (mdtc_valid(&mdtc)) return domain_over_bg_thresh(&mdtc); return false; } #ifdef CONFIG_SYSCTL /* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ static int dirty_writeback_centisecs_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { unsigned int old_interval = dirty_writeback_interval; int ret; ret = proc_dointvec(table, write, buffer, length, ppos); /* * Writing 0 to dirty_writeback_interval will disable periodic writeback * and a different non-zero value will wakeup the writeback threads. * wb_wakeup_delayed() would be more appropriate, but it's a pain to * iterate over all bdis and wbs. * The reason we do this is to make the change take effect immediately. */ if (!ret && write && dirty_writeback_interval && dirty_writeback_interval != old_interval) wakeup_flusher_threads(WB_REASON_PERIODIC); return ret; } #endif /* * If ratelimit_pages is too high then we can get into dirty-data overload * if a large number of processes all perform writes at the same time. * * Here we set ratelimit_pages to a level which ensures that when all CPUs are * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory * thresholds. */ void writeback_set_ratelimit(void) { struct wb_domain *dom = &global_wb_domain; unsigned long background_thresh; unsigned long dirty_thresh; global_dirty_limits(&background_thresh, &dirty_thresh); dom->dirty_limit = dirty_thresh; ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); if (ratelimit_pages < 16) ratelimit_pages = 16; } static int page_writeback_cpu_online(unsigned int cpu) { writeback_set_ratelimit(); return 0; } #ifdef CONFIG_SYSCTL static int laptop_mode; static int laptop_mode_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_jiffies(table, write, buffer, lenp, ppos); if (!ret && write) pr_warn("%s: vm.laptop_mode is deprecated. 
Ignoring setting.\n", current->comm); return ret; } /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; static const struct ctl_table vm_page_writeback_sysctls[] = { { .procname = "dirty_background_ratio", .data = &dirty_background_ratio, .maxlen = sizeof(dirty_background_ratio), .mode = 0644, .proc_handler = dirty_background_ratio_handler, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "dirty_background_bytes", .data = &dirty_background_bytes, .maxlen = sizeof(dirty_background_bytes), .mode = 0644, .proc_handler = dirty_background_bytes_handler, .extra1 = SYSCTL_LONG_ONE, }, { .procname = "dirty_ratio", .data = &vm_dirty_ratio, .maxlen = sizeof(vm_dirty_ratio), .mode = 0644, .proc_handler = dirty_ratio_handler, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "dirty_bytes", .data = &vm_dirty_bytes, .maxlen = sizeof(vm_dirty_bytes), .mode = 0644, .proc_handler = dirty_bytes_handler, .extra1 = (void *)&dirty_bytes_min, }, { .procname = "dirty_writeback_centisecs", .data = &dirty_writeback_interval, .maxlen = sizeof(dirty_writeback_interval), .mode = 0644, .proc_handler = dirty_writeback_centisecs_handler, }, { .procname = "dirty_expire_centisecs", .data = &dirty_expire_interval, .maxlen = sizeof(dirty_expire_interval), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, #ifdef CONFIG_HIGHMEM { .procname = "highmem_is_dirtyable", .data = &vm_highmem_is_dirtyable, .maxlen = sizeof(vm_highmem_is_dirtyable), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif { .procname = "laptop_mode", .data = &laptop_mode, .maxlen = sizeof(laptop_mode), .mode = 0644, .proc_handler = laptop_mode_handler, }, }; #endif /* * Called early on to tune the page writeback dirty limits. * * We used to scale dirty pages according to how total memory * related to pages that could be allocated for buffers. * * However, that was when we used "dirty_ratio" to scale with * all memory, and we don't do that any more. "dirty_ratio" * is now applied to total non-HIGHPAGE memory, and as such we can't * get into the old insane situation any more where we had * large amounts of dirty pages compared to a small amount of * non-HIGHMEM memory. * * But we might still want to scale the dirty_ratio by how * much memory the box has.. */ void __init page_writeback_init(void) { BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL)); cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/writeback:online", page_writeback_cpu_online, NULL); cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL, page_writeback_cpu_online); #ifdef CONFIG_SYSCTL register_sysctl_init("vm", vm_page_writeback_sysctls); #endif } /** * tag_pages_for_writeback - tag pages to be written by writeback * @mapping: address space structure to write * @start: starting page index * @end: ending page index (inclusive) * * This function scans the page range from @start to @end (inclusive) and tags * all pages that have DIRTY tag set with a special TOWRITE tag. The caller * can then use the TOWRITE tag to identify pages eligible for writeback. * This mechanism is used to avoid livelocking of writeback by a process * steadily creating new dirty pages in the file (thus it is important for this * function to be quick so that it can tag pages faster than a dirtying process * can create them). 
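 *
 * This is typically invoked from writeback_iter() when the writeback_control
 * has sync_mode WB_SYNC_ALL or tagged_writepages set (see writeback_iter()
 * below).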
*/ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) { XA_STATE(xas, &mapping->i_pages, start); unsigned int tagged = 0; void *page; xas_lock_irq(&xas); xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) { xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE); if (++tagged % XA_CHECK_SCHED) continue; xas_pause(&xas); xas_unlock_irq(&xas); cond_resched(); xas_lock_irq(&xas); } xas_unlock_irq(&xas); } EXPORT_SYMBOL(tag_pages_for_writeback); static bool folio_prepare_writeback(struct address_space *mapping, struct writeback_control *wbc, struct folio *folio) { /* * Folio truncated or invalidated. We can freely skip it then, * even for data integrity operations: the folio has disappeared * concurrently, so there could be no real expectation of this * data integrity operation even if there is now a new, dirty * folio at the same pagecache index. */ if (unlikely(folio->mapping != mapping)) return false; /* * Did somebody else write it for us? */ if (!folio_test_dirty(folio)) return false; if (folio_test_writeback(folio)) { if (wbc->sync_mode == WB_SYNC_NONE) return false; folio_wait_writeback(folio); } BUG_ON(folio_test_writeback(folio)); if (!folio_clear_dirty_for_io(folio)) return false; return true; } static pgoff_t wbc_end(struct writeback_control *wbc) { if (wbc->range_cyclic) return -1; return wbc->range_end >> PAGE_SHIFT; } static struct folio *writeback_get_folio(struct address_space *mapping, struct writeback_control *wbc) { struct folio *folio; retry: folio = folio_batch_next(&wbc->fbatch); if (!folio) { folio_batch_release(&wbc->fbatch); cond_resched(); filemap_get_folios_tag(mapping, &wbc->index, wbc_end(wbc), wbc_to_tag(wbc), &wbc->fbatch); folio = folio_batch_next(&wbc->fbatch); if (!folio) return NULL; } folio_lock(folio); if (unlikely(!folio_prepare_writeback(mapping, wbc, folio))) { folio_unlock(folio); goto retry; } trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); return folio; } /** * writeback_iter - iterate folio of a mapping for writeback * @mapping: address space structure to write * @wbc: writeback context * @folio: previously iterated folio (%NULL to start) * @error: in-out pointer for writeback errors (see below) * * This function returns the next folio for the writeback operation described by * @wbc on @mapping and should be called in a while loop in the ->writepages * implementation. * * To start the writeback operation, %NULL is passed in the @folio argument, and * for every subsequent iteration the folio returned previously should be passed * back in. * * If there was an error in the per-folio writeback inside the writeback_iter() * loop, @error should be set to the error value. * * Once the writeback described in @wbc has finished, this function will return * %NULL and if there was an error in any iteration restore it to @error. * * Note: callers should not manually break out of the loop using break or goto * but must keep calling writeback_iter() until it returns %NULL. * * Return: the folio to write or %NULL if the loop is done. */ struct folio *writeback_iter(struct address_space *mapping, struct writeback_control *wbc, struct folio *folio, int *error) { if (!folio) { folio_batch_init(&wbc->fbatch); wbc->saved_err = *error = 0; /* * For range cyclic writeback we remember where we stopped so * that we can continue where we stopped. * * For non-cyclic writeback we always start at the beginning of * the passed in range. 
*/ if (wbc->range_cyclic) wbc->index = mapping->writeback_index; else wbc->index = wbc->range_start >> PAGE_SHIFT; /* * To avoid livelocks when other processes dirty new pages, we * first tag pages which should be written back and only then * start writing them. * * For data-integrity writeback we have to be careful so that we * do not miss some pages (e.g., because some other process has * cleared the TOWRITE tag we set). The rule we follow is that * TOWRITE tag can be cleared only by the process clearing the * DIRTY tag (and submitting the page for I/O). */ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, wbc->index, wbc_end(wbc)); } else { wbc->nr_to_write -= folio_nr_pages(folio); WARN_ON_ONCE(*error > 0); /* * For integrity writeback we have to keep going until we have * written all the folios we tagged for writeback above, even if * we run past wbc->nr_to_write or encounter errors. * We stash away the first error we encounter in wbc->saved_err * so that it can be retrieved when we're done. This is because * the file system may still have state to clear for each folio. * * For background writeback we exit as soon as we run past * wbc->nr_to_write or encounter the first error. */ if (wbc->sync_mode == WB_SYNC_ALL) { if (*error && !wbc->saved_err) wbc->saved_err = *error; } else { if (*error || wbc->nr_to_write <= 0) goto done; } } folio = writeback_get_folio(mapping, wbc); if (!folio) { /* * To avoid deadlocks between range_cyclic writeback and callers * that hold folios in writeback to aggregate I/O until * the writeback iteration finishes, we do not loop back to the * start of the file. Doing so causes a folio lock/folio * writeback access order inversion - we should only ever lock * multiple folios in ascending folio->index order, and looping * back to the start of the file violates that rule and causes * deadlocks. */ if (wbc->range_cyclic) mapping->writeback_index = 0; /* * Return the first error we encountered (if there was any) to * the caller. */ *error = wbc->saved_err; } return folio; done: if (wbc->range_cyclic) mapping->writeback_index = folio_next_index(folio); folio_batch_release(&wbc->fbatch); return NULL; } EXPORT_SYMBOL_GPL(writeback_iter); int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; struct bdi_writeback *wb; if (wbc->nr_to_write <= 0) return 0; wb = inode_to_wb_wbc(mapping->host, wbc); wb_bandwidth_estimate_start(wb); while (1) { if (mapping->a_ops->writepages) ret = mapping->a_ops->writepages(mapping, wbc); else /* deal with chardevs and other special files */ ret = 0; if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL) break; /* * Lacking an allocation context or the locality or writeback * state of any of the inode's pages, throttle based on * writeback activity on the local node. It's as good a * guess as any. */ reclaim_throttle(NODE_DATA(numa_node_id()), VMSCAN_THROTTLE_WRITEBACK); } /* * Usually few pages are written by now from those we've just submitted * but if there's constant writeback being submitted, this makes sure * writeback bandwidth is updated once in a while. */ if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + BANDWIDTH_INTERVAL)) wb_update_bandwidth(wb); return ret; } /* * For address_spaces which do not use buffers nor write back. 
*/ bool noop_dirty_folio(struct address_space *mapping, struct folio *folio) { if (!folio_test_dirty(folio)) return !folio_test_set_dirty(folio); return false; } EXPORT_SYMBOL(noop_dirty_folio); /* * Helper function for set_page_dirty family. * * NOTE: This relies on being atomic wrt interrupts. */ static void folio_account_dirtied(struct folio *folio, struct address_space *mapping) { struct inode *inode = mapping->host; trace_writeback_dirty_folio(folio, mapping); if (mapping_can_writeback(mapping)) { struct bdi_writeback *wb; long nr = folio_nr_pages(folio); inode_attach_wb(inode, folio); wb = inode_to_wb(inode); lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr); __zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); __node_stat_mod_folio(folio, NR_DIRTIED, nr); wb_stat_mod(wb, WB_RECLAIMABLE, nr); wb_stat_mod(wb, WB_DIRTIED, nr); task_io_account_write(nr * PAGE_SIZE); current->nr_dirtied += nr; __this_cpu_add(bdp_ratelimits, nr); mem_cgroup_track_foreign_dirty(folio, wb); } } /* * Helper function for deaccounting dirty page without writeback. * */ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) { long nr = folio_nr_pages(folio); lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr); zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); wb_stat_mod(wb, WB_RECLAIMABLE, -nr); task_io_account_cancelled_write(nr * PAGE_SIZE); } /* * Mark the folio dirty, and set it dirty in the page cache. * * If warn is true, then emit a warning if the folio is not uptodate and has * not been truncated. * * It is the caller's responsibility to prevent the folio from being truncated * while this function is in progress, although it may have been truncated * before this function is called. Most callers have the folio locked. * A few have the folio blocked from truncation through other means (e.g. * zap_vma() has it mapped and is holding the page table lock). * When called from mark_buffer_dirty(), the filesystem should hold a * reference to the buffer_head that is being marked dirty, which causes * try_to_free_buffers() to fail. */ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping, int warn) { unsigned long flags; /* * Shmem writeback relies on swap, and swap writeback is LRU based, * not using the dirty mark. */ VM_WARN_ON_ONCE(folio_test_swapcache(folio) || shmem_mapping(mapping)); xa_lock_irqsave(&mapping->i_pages, flags); if (folio->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !folio_test_uptodate(folio)); folio_account_dirtied(folio, mapping); __xa_set_mark(&mapping->i_pages, folio->index, PAGECACHE_TAG_DIRTY); } xa_unlock_irqrestore(&mapping->i_pages, flags); } /** * filemap_dirty_folio - Mark a folio dirty for filesystems which do not use buffer_heads. * @mapping: Address space this folio belongs to. * @folio: Folio to be marked as dirty. * * Filesystems which do not use buffer heads should call this function * from their dirty_folio address space operation. It ignores the * contents of folio_get_private(), so if the filesystem marks individual * blocks as dirty, the filesystem should handle that itself. * * This is also sometimes used by filesystems which use buffer_heads when * a single buffer is being dirtied: we want to set the folio dirty in * that case, but not all the buffers. This is a "bottom-up" dirtying, * whereas block_dirty_folio() is a "top-down" dirtying. * * The caller must ensure this doesn't race with truncation. Most will * simply hold the folio lock, but e.g. 
zap_pte_range() calls with the * folio mapped and the pte lock held, which also locks out truncation. */ bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio) { if (folio_test_set_dirty(folio)) return false; __folio_mark_dirty(folio, mapping, !folio_test_private(folio)); if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } return true; } EXPORT_SYMBOL(filemap_dirty_folio); /** * folio_redirty_for_writepage - Decline to write a dirty folio. * @wbc: The writeback control. * @folio: The folio. * * When a writepage implementation decides that it doesn't want to write * @folio for some reason, it should call this function, unlock @folio and * return 0. * * Return: True if we redirtied the folio. False if someone else dirtied * it first. */ bool folio_redirty_for_writepage(struct writeback_control *wbc, struct folio *folio) { struct address_space *mapping = folio->mapping; long nr = folio_nr_pages(folio); bool ret; wbc->pages_skipped += nr; ret = filemap_dirty_folio(mapping, folio); if (mapping && mapping_can_writeback(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; struct wb_lock_cookie cookie = {}; wb = unlocked_inode_to_wb_begin(inode, &cookie); current->nr_dirtied -= nr; node_stat_mod_folio(folio, NR_DIRTIED, -nr); wb_stat_mod(wb, WB_DIRTIED, -nr); unlocked_inode_to_wb_end(inode, &cookie); } return ret; } EXPORT_SYMBOL(folio_redirty_for_writepage); /** * folio_mark_dirty - Mark a folio as being modified. * @folio: The folio. * * The folio may not be truncated while this function is running. * Holding the folio lock is sufficient to prevent truncation, but some * callers cannot acquire a sleeping lock. These callers instead hold * the page table lock for a page table which contains at least one page * in this folio. Truncation will block on the page table lock as it * unmaps pages before removing the folio from its mapping. * * Return: True if the folio was newly dirtied, false if it was already dirty. */ bool folio_mark_dirty(struct folio *folio) { struct address_space *mapping = folio_mapping(folio); if (likely(mapping)) { /* * readahead/folio_deactivate could remain * PG_readahead/PG_reclaim due to race with folio_end_writeback * About readahead, if the folio is written, the flags would be * reset. So no problem. * About folio_deactivate, if the folio is redirtied, * the flag will be reset. So no problem. but if the * folio is used by readahead it will confuse readahead * and make it restart the size rampup process. But it's * a trivial problem. */ if (folio_test_reclaim(folio)) folio_clear_reclaim(folio); return mapping->a_ops->dirty_folio(mapping, folio); } return noop_dirty_folio(mapping, folio); } EXPORT_SYMBOL(folio_mark_dirty); /* * folio_mark_dirty() is racy if the caller has no reference against * folio->mapping->host, and if the folio is unlocked. This is because another * CPU could truncate the folio off the mapping and then free the mapping. * * Usually, the folio _is_ locked, or the caller is a user-space process which * holds a reference on the inode by having an open file. * * In other cases, the folio should be locked before running folio_mark_dirty(). */ bool folio_mark_dirty_lock(struct folio *folio) { bool ret; folio_lock(folio); ret = folio_mark_dirty(folio); folio_unlock(folio); return ret; } EXPORT_SYMBOL(folio_mark_dirty_lock); /* * This cancels just the dirty bit on the kernel page itself, it does NOT * actually remove dirty bits on any mmap's that may be around. 
It also * leaves the page tagged dirty, so any sync activity will still find it on * the dirty lists, and in particular, clear_page_dirty_for_io() will still * look at the dirty bits in the VM. * * Doing this should *normally* only ever be done when a page is truncated, * and is not actually mapped anywhere at all. However, fs/buffer.c does * this when it notices that somebody has cleaned out all the buffers on a * page without actually doing it through the VM. Can you say "ext3 is * horribly ugly"? Thought you could. */ void __folio_cancel_dirty(struct folio *folio) { struct address_space *mapping = folio_mapping(folio); if (mapping_can_writeback(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; struct wb_lock_cookie cookie = {}; wb = unlocked_inode_to_wb_begin(inode, &cookie); if (folio_test_clear_dirty(folio)) folio_account_cleaned(folio, wb); unlocked_inode_to_wb_end(inode, &cookie); } else { folio_clear_dirty(folio); } } EXPORT_SYMBOL(__folio_cancel_dirty); /* * Clear a folio's dirty flag, while caring for dirty memory accounting. * Returns true if the folio was previously dirty. * * This is for preparing to put the folio under writeout. We leave * the folio tagged as dirty in the xarray so that a concurrent * write-for-sync can discover it via a PAGECACHE_TAG_DIRTY walk. * The ->writepage implementation will run either folio_start_writeback() * or folio_mark_dirty(), at which stage we bring the folio's dirty flag * and xarray dirty tag back into sync. * * This incoherency between the folio's dirty flag and xarray tag is * unfortunate, but it only exists while the folio is locked. */ bool folio_clear_dirty_for_io(struct folio *folio) { struct address_space *mapping = folio_mapping(folio); bool ret = false; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (mapping && mapping_can_writeback(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; struct wb_lock_cookie cookie = {}; /* * Yes, Virginia, this is indeed insane. * * We use this sequence to make sure that * (a) we account for dirty stats properly * (b) we tell the low-level filesystem to * mark the whole folio dirty if it was * dirty in a pagetable. Only to then * (c) clean the folio again and return 1 to * cause the writeback. * * This way we avoid all nasty races with the * dirty bit in multiple places and clearing * them concurrently from different threads. * * Note! Normally the "folio_mark_dirty(folio)" * has no effect on the actual dirty bit - since * that will already usually be set. But we * need the side effects, and it can help us * avoid races. * * We basically use the folio "master dirty bit" * as a serialization point for all the different * threads doing their things. */ if (folio_mkclean(folio)) folio_mark_dirty(folio); /* * We carefully synchronise fault handlers against * installing a dirty pte and marking the folio dirty * at this point. We do this by having them hold the * page lock while dirtying the folio, and folios are * always locked coming in here, so we get the desired * exclusion. 
*/ wb = unlocked_inode_to_wb_begin(inode, &cookie); if (folio_test_clear_dirty(folio)) { long nr = folio_nr_pages(folio); lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr); zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); wb_stat_mod(wb, WB_RECLAIMABLE, -nr); ret = true; } unlocked_inode_to_wb_end(inode, &cookie); return ret; } return folio_test_clear_dirty(folio); } EXPORT_SYMBOL(folio_clear_dirty_for_io); static void wb_inode_writeback_start(struct bdi_writeback *wb) { atomic_inc(&wb->writeback_inodes); } static void wb_inode_writeback_end(struct bdi_writeback *wb) { unsigned long flags; atomic_dec(&wb->writeback_inodes); /* * Make sure estimate of writeback throughput gets updated after * writeback completed. We delay the update by BANDWIDTH_INTERVAL * (which is the interval other bandwidth updates use for batching) so * that if multiple inodes end writeback at a similar time, they get * batched into one bandwidth update. */ spin_lock_irqsave(&wb->work_lock, flags); if (test_bit(WB_registered, &wb->state)) queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL); spin_unlock_irqrestore(&wb->work_lock, flags); } bool __folio_end_writeback(struct folio *folio) { long nr = folio_nr_pages(folio); struct address_space *mapping = folio_mapping(folio); bool ret; if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback); __xa_clear_mark(&mapping->i_pages, folio->index, PAGECACHE_TAG_WRITEBACK); wb = inode_to_wb(inode); wb_stat_mod(wb, WB_WRITEBACK, -nr); __wb_writeout_add(wb, nr); if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { wb_inode_writeback_end(wb); if (mapping->host) sb_clear_inode_writeback(mapping->host); } xa_unlock_irqrestore(&mapping->i_pages, flags); } else { ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback); } lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr); zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); node_stat_mod_folio(folio, NR_WRITTEN, nr); return ret; } void __folio_start_writeback(struct folio *folio, bool keep_write) { long nr = folio_nr_pages(folio); struct address_space *mapping = folio_mapping(folio); int access_ret; VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (mapping && mapping_use_writeback_tags(mapping)) { XA_STATE(xas, &mapping->i_pages, folio->index); struct inode *inode = mapping->host; struct bdi_writeback *wb; unsigned long flags; bool on_wblist; xas_lock_irqsave(&xas, flags); xas_load(&xas); folio_test_set_writeback(folio); on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); wb = inode_to_wb(inode); wb_stat_mod(wb, WB_WRITEBACK, nr); if (!on_wblist) { wb_inode_writeback_start(wb); /* * We can come through here when swapping anonymous * folios, so we don't necessarily have an inode to * track for sync. 
*/ if (mapping->host) sb_mark_inode_writeback(mapping->host); } if (!folio_test_dirty(folio)) xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); if (!keep_write) xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); xas_unlock_irqrestore(&xas, flags); } else { folio_test_set_writeback(folio); } lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr); zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); access_ret = arch_make_folio_accessible(folio); /* * If writeback has been triggered on a page that cannot be made * accessible, it is too late to recover here. */ VM_BUG_ON_FOLIO(access_ret != 0, folio); } EXPORT_SYMBOL(__folio_start_writeback); /** * folio_wait_writeback - Wait for a folio to finish writeback. * @folio: The folio to wait for. * * If the folio is currently being written back to storage, wait for the * I/O to complete. * * Context: Sleeps. Must be called in process context and with * no spinlocks held. Caller should hold a reference on the folio. * If the folio is not locked, writeback may start again after writeback * has finished. */ void folio_wait_writeback(struct folio *folio) { while (folio_test_writeback(folio)) { trace_folio_wait_writeback(folio, folio_mapping(folio)); folio_wait_bit(folio, PG_writeback); } } EXPORT_SYMBOL_GPL(folio_wait_writeback); /** * folio_wait_writeback_killable - Wait for a folio to finish writeback. * @folio: The folio to wait for. * * If the folio is currently being written back to storage, wait for the * I/O to complete or a fatal signal to arrive. * * Context: Sleeps. Must be called in process context and with * no spinlocks held. Caller should hold a reference on the folio. * If the folio is not locked, writeback may start again after writeback * has finished. * Return: 0 on success, -EINTR if we get a fatal signal while waiting. */ int folio_wait_writeback_killable(struct folio *folio) { while (folio_test_writeback(folio)) { trace_folio_wait_writeback(folio, folio_mapping(folio)); if (folio_wait_bit_killable(folio, PG_writeback)) return -EINTR; } return 0; } EXPORT_SYMBOL_GPL(folio_wait_writeback_killable); /** * folio_wait_stable() - wait for writeback to finish, if necessary. * @folio: The folio to wait on. * * This function determines if the given folio is related to a backing * device that requires folio contents to be held stable during writeback. * If so, then it will wait for any pending writeback to complete. * * Context: Sleeps. Must be called in process context and with * no spinlocks held. Caller should hold a reference on the folio. * If the folio is not locked, writeback may start again after writeback * has finished. */ void folio_wait_stable(struct folio *folio) { if (mapping_stable_writes(folio_mapping(folio))) folio_wait_writeback(folio); } EXPORT_SYMBOL_GPL(folio_wait_stable); |
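/*
 * A minimal sketch of the calling pattern described in the writeback_iter()
 * kernel-doc above, not part of the file itself.  It assumes a hypothetical
 * per-folio writeout helper myfs_write_folio() that submits the I/O and
 * unlocks the folio; per-folio errors are fed back through the in-out
 * @error pointer as the documentation requires.
 */
static int myfs_writepages(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	struct folio *folio = NULL;
	int error = 0;

	/* Pass NULL on the first call, then feed each returned folio back in. */
	while ((folio = writeback_iter(mapping, wbc, folio, &error)))
		error = myfs_write_folio(folio, wbc);	/* hypothetical helper */

	/* writeback_iter() returns NULL when done and restores the first error. */
	return error;
}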
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_GENERIC_PGALLOC_H
#define __ASM_GENERIC_PGALLOC_H

#ifdef CONFIG_MMU

#define GFP_PGTABLE_KERNEL	(GFP_KERNEL | __GFP_ZERO)
#define GFP_PGTABLE_USER	(GFP_PGTABLE_KERNEL | __GFP_ACCOUNT)

/**
 * __pte_alloc_one_kernel - allocate memory for a PTE-level kernel page table
 * @mm: the mm_struct of the current context
 *
 * This function is intended for architectures that need
 * anything beyond simple page allocation.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
static inline pte_t *__pte_alloc_one_kernel_noprof(struct mm_struct *mm)
{
	struct ptdesc *ptdesc = pagetable_alloc_noprof(GFP_PGTABLE_KERNEL, 0);

	if (!ptdesc)
		return NULL;
	if (!pagetable_pte_ctor(mm, ptdesc)) {
		pagetable_free(ptdesc);
		return NULL;
	}

	ptdesc_set_kernel(ptdesc);
	return ptdesc_address(ptdesc);
}
#define __pte_alloc_one_kernel(...)	alloc_hooks(__pte_alloc_one_kernel_noprof(__VA_ARGS__))

#ifndef __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
/**
 * pte_alloc_one_kernel - allocate memory for a PTE-level kernel page table
 * @mm: the mm_struct of the current context
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
static inline pte_t *pte_alloc_one_kernel_noprof(struct mm_struct *mm)
{
	return __pte_alloc_one_kernel_noprof(mm);
}
#define pte_alloc_one_kernel(...)	alloc_hooks(pte_alloc_one_kernel_noprof(__VA_ARGS__))
#endif

/**
 * pte_free_kernel - free PTE-level kernel page table memory
 * @mm: the mm_struct of the current context
 * @pte: pointer to the memory containing the page table
 */
static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
{
	pagetable_dtor_free(virt_to_ptdesc(pte));
}

/**
 * __pte_alloc_one - allocate memory for a PTE-level user page table
 * @mm: the mm_struct of the current context
 * @gfp: GFP flags to use for the allocation
 *
 * Allocate memory for a page table and ptdesc and runs pagetable_pte_ctor().
 *
 * This function is intended for architectures that need
 * anything beyond simple page allocation or must have custom GFP flags.
* * Return: `struct page` referencing the ptdesc or %NULL on error */ static inline pgtable_t __pte_alloc_one_noprof(struct mm_struct *mm, gfp_t gfp) { struct ptdesc *ptdesc; ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) return NULL; if (!pagetable_pte_ctor(mm, ptdesc)) { pagetable_free(ptdesc); return NULL; } return ptdesc_page(ptdesc); } #define __pte_alloc_one(...) alloc_hooks(__pte_alloc_one_noprof(__VA_ARGS__)) #ifndef __HAVE_ARCH_PTE_ALLOC_ONE /** * pte_alloc_one - allocate a page for PTE-level user page table * @mm: the mm_struct of the current context * * Allocate memory for a page table and ptdesc and runs pagetable_pte_ctor(). * * Return: `struct page` referencing the ptdesc or %NULL on error */ static inline pgtable_t pte_alloc_one_noprof(struct mm_struct *mm) { return __pte_alloc_one_noprof(mm, GFP_PGTABLE_USER); } #define pte_alloc_one(...) alloc_hooks(pte_alloc_one_noprof(__VA_ARGS__)) #endif /* * Should really implement gc for free page table pages. This could be * done with a reference count in struct page. */ /** * pte_free - free PTE-level user page table memory * @mm: the mm_struct of the current context * @pte_page: the `struct page` referencing the ptdesc */ static inline void pte_free(struct mm_struct *mm, struct page *pte_page) { struct ptdesc *ptdesc = page_ptdesc(pte_page); pagetable_dtor_free(ptdesc); } #if CONFIG_PGTABLE_LEVELS > 2 #ifndef __HAVE_ARCH_PMD_ALLOC_ONE /** * pmd_alloc_one - allocate memory for a PMD-level page table * @mm: the mm_struct of the current context * * Allocate memory for a page table and ptdesc and runs pagetable_pmd_ctor(). * * Allocations use %GFP_PGTABLE_USER in user context and * %GFP_PGTABLE_KERNEL in kernel context. * * Return: pointer to the allocated memory or %NULL on error */ static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) { struct ptdesc *ptdesc; gfp_t gfp = GFP_PGTABLE_USER; if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) return NULL; if (!pagetable_pmd_ctor(mm, ptdesc)) { pagetable_free(ptdesc); return NULL; } if (mm == &init_mm) ptdesc_set_kernel(ptdesc); return ptdesc_address(ptdesc); } #define pmd_alloc_one(...) alloc_hooks(pmd_alloc_one_noprof(__VA_ARGS__)) #endif #ifndef __HAVE_ARCH_PMD_FREE static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { struct ptdesc *ptdesc = virt_to_ptdesc(pmd); BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); pagetable_dtor_free(ptdesc); } #endif #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 static inline pud_t *__pud_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) { gfp_t gfp = GFP_PGTABLE_USER; struct ptdesc *ptdesc; if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) return NULL; pagetable_pud_ctor(ptdesc); if (mm == &init_mm) ptdesc_set_kernel(ptdesc); return ptdesc_address(ptdesc); } #define __pud_alloc_one(...) alloc_hooks(__pud_alloc_one_noprof(__VA_ARGS__)) #ifndef __HAVE_ARCH_PUD_ALLOC_ONE /** * pud_alloc_one - allocate memory for a PUD-level page table * @mm: the mm_struct of the current context * * Allocate memory for a page table using %GFP_PGTABLE_USER for user context * and %GFP_PGTABLE_KERNEL for kernel context. * * Return: pointer to the allocated memory or %NULL on error */ static inline pud_t *pud_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) { return __pud_alloc_one_noprof(mm, addr); } #define pud_alloc_one(...) 
alloc_hooks(pud_alloc_one_noprof(__VA_ARGS__)) #endif static inline void __pud_free(struct mm_struct *mm, pud_t *pud) { struct ptdesc *ptdesc = virt_to_ptdesc(pud); BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); pagetable_dtor_free(ptdesc); } #ifndef __HAVE_ARCH_PUD_FREE static inline void pud_free(struct mm_struct *mm, pud_t *pud) { __pud_free(mm, pud); } #endif #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #if CONFIG_PGTABLE_LEVELS > 4 static inline p4d_t *__p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) { gfp_t gfp = GFP_PGTABLE_USER; struct ptdesc *ptdesc; if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) return NULL; pagetable_p4d_ctor(ptdesc); if (mm == &init_mm) ptdesc_set_kernel(ptdesc); return ptdesc_address(ptdesc); } #define __p4d_alloc_one(...) alloc_hooks(__p4d_alloc_one_noprof(__VA_ARGS__)) #ifndef __HAVE_ARCH_P4D_ALLOC_ONE static inline p4d_t *p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) { return __p4d_alloc_one_noprof(mm, addr); } #define p4d_alloc_one(...) alloc_hooks(p4d_alloc_one_noprof(__VA_ARGS__)) #endif static inline void __p4d_free(struct mm_struct *mm, p4d_t *p4d) { struct ptdesc *ptdesc = virt_to_ptdesc(p4d); BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); pagetable_dtor_free(ptdesc); } #ifndef __HAVE_ARCH_P4D_FREE static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) { if (!mm_p4d_folded(mm)) __p4d_free(mm, p4d); } #endif #endif /* CONFIG_PGTABLE_LEVELS > 4 */ static inline pgd_t *__pgd_alloc_noprof(struct mm_struct *mm, unsigned int order) { gfp_t gfp = GFP_PGTABLE_USER; struct ptdesc *ptdesc; if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; ptdesc = pagetable_alloc_noprof(gfp, order); if (!ptdesc) return NULL; pagetable_pgd_ctor(ptdesc); if (mm == &init_mm) ptdesc_set_kernel(ptdesc); return ptdesc_address(ptdesc); } #define __pgd_alloc(...) alloc_hooks(__pgd_alloc_noprof(__VA_ARGS__)) static inline void __pgd_free(struct mm_struct *mm, pgd_t *pgd) { struct ptdesc *ptdesc = virt_to_ptdesc(pgd); BUG_ON((unsigned long)pgd & (PAGE_SIZE-1)); pagetable_dtor_free(ptdesc); } #ifndef __HAVE_ARCH_PGD_FREE static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { __pgd_free(mm, pgd); } #endif #endif /* CONFIG_MMU */ #endif /* __ASM_GENERIC_PGALLOC_H */ |
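/*
 * A minimal sketch (not part of the header above) of how its generic helpers
 * are typically paired: allocate a PTE table with pte_alloc_one(), hook it
 * into the PMD with pmd_populate(), and release it with pte_free() if it is
 * not installed.  Real callers (e.g. __pte_alloc() in mm/memory.c) also take
 * the page table lock before the pmd_none() re-check; that locking is elided
 * here and the function name is purely illustrative.
 */
static inline int example_install_pte_table(struct mm_struct *mm, pmd_t *pmd)
{
	pgtable_t new = pte_alloc_one(mm);	/* GFP_PGTABLE_USER, ctor run */

	if (!new)
		return -ENOMEM;

	if (pmd_none(*pmd))
		pmd_populate(mm, pmd, new);	/* hand the table to the PMD */
	else
		pte_free(mm, new);		/* someone else installed one first */

	return 0;
}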
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Extension Header handling for IPv6
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *	Andi Kleen		<ak@muc.de>
 *	Alexey Kuznetsov	<kuznet@ms2.inr.ac.ru>
 */
/* Changes:
 *	yoshfuji		: ensure not to overrun while parsing
 *				  tlv options.
 *	Mitsuru KANDA @USAGI and: Remove ipv6_parse_exthdrs().
 *	YOSHIFUJI Hideaki @USAGI  Register inbound extension header
 *				  handlers as inet6_protocol{}.
*/ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/icmpv6.h> #include <linux/slab.h> #include <linux/export.h> #include <net/dst.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/transp_v6.h> #include <net/rawv6.h> #include <net/ndisc.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/calipso.h> #if IS_ENABLED(CONFIG_IPV6_MIP6) #include <net/xfrm.h> #endif #include <linux/seg6.h> #include <net/seg6.h> #ifdef CONFIG_IPV6_SEG6_HMAC #include <net/seg6_hmac.h> #endif #include <net/rpl.h> #include <linux/ioam6.h> #include <linux/ioam6_genl.h> #include <net/ioam6.h> #include <net/dst_metadata.h> #include <linux/uaccess.h> /********************* Generic functions *********************/ /* An unknown option is detected, decide what to do */ static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff, bool disallow_unknowns) { if (disallow_unknowns) { /* If unknown TLVs are disallowed by configuration * then always silently drop packet. Note this also * means no ICMP parameter problem is sent which * could be a good property to mitigate a reflection DOS * attack. */ goto drop; } switch ((skb_network_header(skb)[optoff] & 0xC0) >> 6) { case 0: /* ignore */ return true; case 1: /* drop packet */ break; case 3: /* Send ICMP if not a multicast address and drop packet */ /* Actually, it is redundant check. icmp_send will recheck in any case. */ if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) break; fallthrough; case 2: /* send ICMP PARM PROB regardless and drop packet */ icmpv6_param_prob_reason(skb, ICMPV6_UNK_OPTION, optoff, SKB_DROP_REASON_UNHANDLED_PROTO); return false; } drop: kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO); return false; } static bool ipv6_hop_ra(struct sk_buff *skb, int optoff); static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff); static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff); static bool ipv6_hop_calipso(struct sk_buff *skb, int optoff); #if IS_ENABLED(CONFIG_IPV6_MIP6) static bool ipv6_dest_hao(struct sk_buff *skb, int optoff); #endif /* Parse tlv encoded option header (hop-by-hop or destination) */ static bool ip6_parse_tlv(bool hopbyhop, struct sk_buff *skb, int max_count) { int len = (skb_transport_header(skb)[1] + 1) << 3; const unsigned char *nh = skb_network_header(skb); int off = skb_network_header_len(skb); bool disallow_unknowns = false; int tlv_count = 0; int padlen = 0; if (unlikely(max_count < 0)) { disallow_unknowns = true; max_count = -max_count; } off += 2; len -= 2; while (len > 0) { int optlen, i; if (nh[off] == IPV6_TLV_PAD1) { padlen++; if (padlen > 7) goto bad; off++; len--; continue; } if (len < 2) goto bad; optlen = nh[off + 1] + 2; if (optlen > len) goto bad; if (nh[off] == IPV6_TLV_PADN) { /* RFC 2460 states that the purpose of PadN is * to align the containing header to multiples * of 8. 7 is therefore the highest valid value. * See also RFC 4942, Section 2.1.9.5. */ padlen += optlen; if (padlen > 7) goto bad; /* RFC 4942 recommends receiving hosts to * actively check PadN payload to contain * only zeroes. 
*/ for (i = 2; i < optlen; i++) { if (nh[off + i] != 0) goto bad; } } else { tlv_count++; if (tlv_count > max_count) goto bad; if (hopbyhop) { switch (nh[off]) { case IPV6_TLV_ROUTERALERT: if (!ipv6_hop_ra(skb, off)) return false; break; case IPV6_TLV_IOAM: if (!ipv6_hop_ioam(skb, off)) return false; nh = skb_network_header(skb); break; case IPV6_TLV_JUMBO: if (!ipv6_hop_jumbo(skb, off)) return false; break; case IPV6_TLV_CALIPSO: if (!ipv6_hop_calipso(skb, off)) return false; break; default: if (!ip6_tlvopt_unknown(skb, off, disallow_unknowns)) return false; break; } } else { switch (nh[off]) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_TLV_HAO: if (!ipv6_dest_hao(skb, off)) return false; break; #endif default: if (!ip6_tlvopt_unknown(skb, off, disallow_unknowns)) return false; break; } } padlen = 0; } off += optlen; len -= optlen; } if (len == 0) return true; bad: kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR); return false; } /***************************** Destination options header. *****************************/ #if IS_ENABLED(CONFIG_IPV6_MIP6) static bool ipv6_dest_hao(struct sk_buff *skb, int optoff) { struct ipv6_destopt_hao *hao; struct inet6_skb_parm *opt = IP6CB(skb); struct ipv6hdr *ipv6h = ipv6_hdr(skb); SKB_DR(reason); int ret; if (opt->dsthao) { net_dbg_ratelimited("hao duplicated\n"); goto discard; } opt->dsthao = opt->dst1; opt->dst1 = 0; hao = (struct ipv6_destopt_hao *)(skb_network_header(skb) + optoff); if (hao->length != 16) { net_dbg_ratelimited("hao invalid option length = %d\n", hao->length); SKB_DR_SET(reason, IP_INHDR); goto discard; } if (!(ipv6_addr_type(&hao->addr) & IPV6_ADDR_UNICAST)) { net_dbg_ratelimited("hao is not an unicast addr: %pI6\n", &hao->addr); SKB_DR_SET(reason, INVALID_PROTO); goto discard; } ret = xfrm6_input_addr(skb, (xfrm_address_t *)&ipv6h->daddr, (xfrm_address_t *)&hao->addr, IPPROTO_DSTOPTS); if (unlikely(ret < 0)) { SKB_DR_SET(reason, XFRM_POLICY); goto discard; } if (skb_cloned(skb)) { if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) goto discard; /* update all variable using below by copied skbuff */ hao = (struct ipv6_destopt_hao *)(skb_network_header(skb) + optoff); ipv6h = ipv6_hdr(skb); } if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; swap(ipv6h->saddr, hao->addr); if (skb->tstamp == 0) __net_timestamp(skb); return true; discard: kfree_skb_reason(skb, reason); return false; } #endif static int ipv6_destopt_rcv(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); struct inet6_skb_parm *opt = IP6CB(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) __u16 dstbuf; #endif struct dst_entry *dst = skb_dst(skb); struct net *net = dev_net(skb->dev); int extlen; if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { __IP6_INC_STATS(dev_net(dst_dev(dst)), idev, IPSTATS_MIB_INHDRERRORS); fail_and_free: kfree_skb(skb); return -1; } extlen = (skb_transport_header(skb)[1] + 1) << 3; if (extlen > READ_ONCE(net->ipv6.sysctl.max_dst_opts_len)) goto fail_and_free; opt->lastopt = opt->dst1 = skb_network_header_len(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) dstbuf = opt->dst1; #endif if (ip6_parse_tlv(false, skb, READ_ONCE(net->ipv6.sysctl.max_dst_opts_cnt))) { skb->transport_header += extlen; opt = IP6CB(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) opt->nhoff = dstbuf; #else opt->nhoff = opt->dst1; #endif return 1; } __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); return -1; } static void seg6_update_csum(struct sk_buff *skb) { 
struct ipv6_sr_hdr *hdr; struct in6_addr *addr; __be32 from, to; /* srh is at transport offset and seg_left is already decremented * but daddr is not yet updated with next segment */ hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb); addr = hdr->segments + hdr->segments_left; hdr->segments_left++; from = *(__be32 *)hdr; hdr->segments_left--; to = *(__be32 *)hdr; /* update skb csum with diff resulting from seg_left decrement */ update_csum_diff4(skb, from, to); /* compute csum diff between current and next segment and update */ update_csum_diff16(skb, (__be32 *)(&ipv6_hdr(skb)->daddr), (__be32 *)addr); } static int ipv6_srh_rcv(struct sk_buff *skb) { struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net(skb->dev); struct ipv6_sr_hdr *hdr; struct inet6_dev *idev; struct in6_addr *addr; int accept_seg6; hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb); idev = __in6_dev_get(skb->dev); if (!idev) { kfree_skb(skb); return -1; } accept_seg6 = min(READ_ONCE(net->ipv6.devconf_all->seg6_enabled), READ_ONCE(idev->cnf.seg6_enabled)); if (!accept_seg6) { kfree_skb(skb); return -1; } #ifdef CONFIG_IPV6_SEG6_HMAC if (!seg6_hmac_validate_skb(skb)) { kfree_skb(skb); return -1; } #endif looped_back: if (hdr->segments_left == 0) { if (hdr->nexthdr == NEXTHDR_IPV6 || hdr->nexthdr == NEXTHDR_IPV4) { int offset = (hdr->hdrlen + 1) << 3; skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); skb_pull(skb, offset); skb_postpull_rcsum(skb, skb_transport_header(skb), offset); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb->encapsulation = 0; if (hdr->nexthdr == NEXTHDR_IPV4) skb->protocol = htons(ETH_P_IP); __skb_tunnel_rx(skb, skb->dev, net); netif_rx(skb); return -1; } opt->srcrt = skb_network_header_len(skb); opt->lastopt = opt->srcrt; skb->transport_header += (hdr->hdrlen + 1) << 3; opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb); return 1; } if (hdr->segments_left >= (hdr->hdrlen >> 1)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - skb_network_header(skb))); return -1; } if (skb_cloned(skb)) { if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) { __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return -1; } hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb); } hdr->segments_left--; addr = hdr->segments + hdr->segments_left; skb_push(skb, sizeof(struct ipv6hdr)); if (skb->ip_summed == CHECKSUM_COMPLETE) seg6_update_csum(skb); ipv6_hdr(skb)->daddr = *addr; ip6_route_input(skb); if (skb_dst(skb)->error) { dst_input(skb); return -1; } if (skb_dst_dev(skb)->flags & IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); kfree_skb(skb); return -1; } ipv6_hdr(skb)->hop_limit--; skb_pull(skb, sizeof(struct ipv6hdr)); goto looped_back; } dst_input(skb); return -1; } static int ipv6_rpl_srh_rcv(struct sk_buff *skb) { struct ipv6_rpl_sr_hdr *hdr, *ohdr, *chdr; struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net(skb->dev); struct inet6_dev *idev; struct ipv6hdr *oldhdr; unsigned char *buf; int accept_rpl_seg; int i, err; u64 n = 0; u32 r; idev = __in6_dev_get(skb->dev); accept_rpl_seg = min(READ_ONCE(net->ipv6.devconf_all->rpl_seg_enabled), READ_ONCE(idev->cnf.rpl_seg_enabled)); if (!accept_rpl_seg) { kfree_skb(skb); return -1; } looped_back: hdr = (struct ipv6_rpl_sr_hdr *)skb_transport_header(skb); if 
(hdr->segments_left == 0) { if (hdr->nexthdr == NEXTHDR_IPV6) { int offset = (hdr->hdrlen + 1) << 3; skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); skb_pull(skb, offset); skb_postpull_rcsum(skb, skb_transport_header(skb), offset); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb->encapsulation = 0; __skb_tunnel_rx(skb, skb->dev, net); netif_rx(skb); return -1; } opt->srcrt = skb_network_header_len(skb); opt->lastopt = opt->srcrt; skb->transport_header += (hdr->hdrlen + 1) << 3; opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb); return 1; } n = (hdr->hdrlen << 3) - hdr->pad - (16 - hdr->cmpre); r = do_div(n, (16 - hdr->cmpri)); /* checks if calculation was without remainder and n fits into * unsigned char which is segments_left field. Should not be * higher than that. */ if (r || (n + 1) > 255) { kfree_skb(skb); return -1; } if (hdr->segments_left > n + 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - skb_network_header(skb))); return -1; } hdr->segments_left--; i = n - hdr->segments_left; buf = kcalloc(struct_size(hdr, segments.addr, n + 2), 2, GFP_ATOMIC); if (unlikely(!buf)) { kfree_skb(skb); return -1; } ohdr = (struct ipv6_rpl_sr_hdr *)buf; ipv6_rpl_srh_decompress(ohdr, hdr, &ipv6_hdr(skb)->daddr, n); chdr = (struct ipv6_rpl_sr_hdr *)(buf + ((ohdr->hdrlen + 1) << 3)); if (ipv6_addr_is_multicast(&ohdr->rpl_segaddr[i])) { kfree_skb(skb); kfree(buf); return -1; } err = ipv6_chk_rpl_srh_loop(net, ohdr->rpl_segaddr, n + 1); if (err) { icmpv6_send(skb, ICMPV6_PARAMPROB, 0, 0); kfree_skb(skb); kfree(buf); return -1; } swap(ipv6_hdr(skb)->daddr, ohdr->rpl_segaddr[i]); ipv6_rpl_srh_compress(chdr, ohdr, &ipv6_hdr(skb)->daddr, n); oldhdr = ipv6_hdr(skb); skb_pull(skb, ((hdr->hdrlen + 1) << 3)); skb_postpull_rcsum(skb, oldhdr, sizeof(struct ipv6hdr) + ((hdr->hdrlen + 1) << 3)); if (unlikely(!hdr->segments_left)) { if (pskb_expand_head(skb, sizeof(struct ipv6hdr) + ((chdr->hdrlen + 1) << 3), 0, GFP_ATOMIC)) { __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); kfree(buf); return -1; } oldhdr = ipv6_hdr(skb); } skb_push(skb, ((chdr->hdrlen + 1) << 3) + sizeof(struct ipv6hdr)); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); skb_set_transport_header(skb, sizeof(struct ipv6hdr)); memmove(ipv6_hdr(skb), oldhdr, sizeof(struct ipv6hdr)); memcpy(skb_transport_header(skb), chdr, (chdr->hdrlen + 1) << 3); ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_postpush_rcsum(skb, ipv6_hdr(skb), sizeof(struct ipv6hdr) + ((chdr->hdrlen + 1) << 3)); kfree(buf); ip6_route_input(skb); if (skb_dst(skb)->error) { dst_input(skb); return -1; } if (skb_dst_dev(skb)->flags & IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); kfree_skb(skb); return -1; } ipv6_hdr(skb)->hop_limit--; skb_pull(skb, sizeof(struct ipv6hdr)); goto looped_back; } dst_input(skb); return -1; } /******************************** Routing header. 
********************************/ /* called with rcu_read_lock() */ static int ipv6_rthdr_rcv(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); struct inet6_skb_parm *opt = IP6CB(skb); struct in6_addr *addr = NULL; int n, i; struct ipv6_rt_hdr *hdr; struct rt0_hdr *rthdr; struct net *net = dev_net(skb->dev); int accept_source_route; accept_source_route = READ_ONCE(net->ipv6.devconf_all->accept_source_route); if (idev) accept_source_route = min(accept_source_route, READ_ONCE(idev->cnf.accept_source_route)); if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; } hdr = (struct ipv6_rt_hdr *)skb_transport_header(skb); if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) || skb->pkt_type != PACKET_HOST) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } switch (hdr->type) { case IPV6_SRCRT_TYPE_4: /* segment routing */ return ipv6_srh_rcv(skb); case IPV6_SRCRT_TYPE_3: /* rpl segment routing */ return ipv6_rpl_srh_rcv(skb); default: break; } looped_back: if (hdr->segments_left == 0) { switch (hdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: /* Silently discard type 2 header unless it was * processed by own */ if (!addr) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } break; #endif default: break; } opt->lastopt = opt->srcrt = skb_network_header_len(skb); skb->transport_header += (hdr->hdrlen + 1) << 3; opt->dst0 = opt->dst1; opt->dst1 = 0; opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb); return 1; } switch (hdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (accept_source_route < 0) goto unknown_rh; /* Silently discard invalid RTH type 2 */ if (hdr->hdrlen != 2 || hdr->segments_left != 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; } break; #endif default: goto unknown_rh; } /* * This is the routing header forwarding algorithm from * RFC 2460, page 16. */ n = hdr->hdrlen >> 1; if (hdr->segments_left > n) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - skb_network_header(skb))); return -1; } /* We are about to mangle packet header. Be careful! Do not damage packets queued somewhere. 
*/ if (skb_cloned(skb)) { /* the copy is a forwarded packet */ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) { __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return -1; } hdr = (struct ipv6_rt_hdr *)skb_transport_header(skb); } if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; i = n - --hdr->segments_left; rthdr = (struct rt0_hdr *) hdr; addr = rthdr->addr; addr += i - 1; switch (hdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (xfrm6_input_addr(skb, (xfrm_address_t *)addr, (xfrm_address_t *)&ipv6_hdr(skb)->saddr, IPPROTO_ROUTING) < 0) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } if (!ipv6_chk_home_addr(skb_dst_dev_net(skb), addr)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } break; #endif default: break; } if (ipv6_addr_is_multicast(addr)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } swap(*addr, ipv6_hdr(skb)->daddr); ip6_route_input(skb); if (skb_dst(skb)->error) { skb_push(skb, -skb_network_offset(skb)); dst_input(skb); return -1; } if (skb_dst_dev(skb)->flags & IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); kfree_skb(skb); return -1; } ipv6_hdr(skb)->hop_limit--; goto looped_back; } skb_push(skb, -skb_network_offset(skb)); dst_input(skb); return -1; unknown_rh: __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->type) - skb_network_header(skb)); return -1; } static const struct inet6_protocol rthdr_protocol = { .handler = ipv6_rthdr_rcv, .flags = INET6_PROTO_NOPOLICY, }; static const struct inet6_protocol destopt_protocol = { .handler = ipv6_destopt_rcv, .flags = INET6_PROTO_NOPOLICY, }; static const struct inet6_protocol nodata_protocol = { .handler = dst_discard, .flags = INET6_PROTO_NOPOLICY, }; int __init ipv6_exthdrs_init(void) { int ret; ret = inet6_add_protocol(&rthdr_protocol, IPPROTO_ROUTING); if (ret) goto out; ret = inet6_add_protocol(&destopt_protocol, IPPROTO_DSTOPTS); if (ret) goto out_rthdr; ret = inet6_add_protocol(&nodata_protocol, IPPROTO_NONE); if (ret) goto out_destopt; out: return ret; out_destopt: inet6_del_protocol(&destopt_protocol, IPPROTO_DSTOPTS); out_rthdr: inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING); goto out; }; void ipv6_exthdrs_exit(void) { inet6_del_protocol(&nodata_protocol, IPPROTO_NONE); inet6_del_protocol(&destopt_protocol, IPPROTO_DSTOPTS); inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING); } /********************************** Hop-by-hop options. 
**********************************/ /* Router Alert as of RFC 2711 */ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff) { const unsigned char *nh = skb_network_header(skb); if (nh[optoff + 1] == 2) { IP6CB(skb)->flags |= IP6SKB_ROUTERALERT; memcpy(&IP6CB(skb)->ra, nh + optoff + 2, sizeof(IP6CB(skb)->ra)); return true; } net_dbg_ratelimited("ipv6_hop_ra: wrong RA length %d\n", nh[optoff + 1]); kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR); return false; } /* IOAM */ static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff) { struct ioam6_trace_hdr *trace; struct ioam6_namespace *ns; struct ioam6_hdr *hdr; /* Bad alignment (must be 4n-aligned) */ if (optoff & 3) goto drop; /* Ignore if IOAM is not enabled on ingress */ if (!READ_ONCE(__in6_dev_get(skb->dev)->cnf.ioam6_enabled)) goto ignore; /* Truncated Option header */ hdr = (struct ioam6_hdr *)(skb_network_header(skb) + optoff); if (hdr->opt_len < 2) goto drop; switch (hdr->type) { case IOAM6_TYPE_PREALLOC: /* Truncated Pre-allocated Trace header */ if (hdr->opt_len < 2 + sizeof(*trace)) goto drop; /* Malformed Pre-allocated Trace header */ trace = (struct ioam6_trace_hdr *)((u8 *)hdr + sizeof(*hdr)); if (hdr->opt_len < 2 + sizeof(*trace) + trace->remlen * 4) goto drop; /* Inconsistent Pre-allocated Trace header */ if (trace->nodelen != ioam6_trace_compute_nodelen(be32_to_cpu(trace->type_be32))) goto drop; /* Ignore if the IOAM namespace is unknown */ ns = ioam6_namespace(dev_net(skb->dev), trace->namespace_id); if (!ns) goto ignore; if (!skb_valid_dst(skb)) ip6_route_input(skb); /* About to mangle packet header */ if (skb_ensure_writable(skb, optoff + 2 + hdr->opt_len)) goto drop; /* Trace pointer may have changed */ trace = (struct ioam6_trace_hdr *)(skb_network_header(skb) + optoff + sizeof(*hdr)); ioam6_fill_trace_data(skb, ns, trace, true); ioam6_event(IOAM6_EVENT_TRACE, dev_net(skb->dev), GFP_ATOMIC, (void *)trace, hdr->opt_len - 2); break; default: break; } ignore: return true; drop: kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR); return false; } /* Jumbo payload */ static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff) { const unsigned char *nh = skb_network_header(skb); SKB_DR(reason); u32 pkt_len; if (nh[optoff + 1] != 4 || (optoff & 3) != 2) { net_dbg_ratelimited("ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", nh[optoff+1]); SKB_DR_SET(reason, IP_INHDR); goto drop; } pkt_len = ntohl(*(__be32 *)(nh + optoff + 2)); if (pkt_len <= IPV6_MAXPLEN) { icmpv6_param_prob_reason(skb, ICMPV6_HDR_FIELD, optoff + 2, SKB_DROP_REASON_IP_INHDR); return false; } if (ipv6_hdr(skb)->payload_len) { icmpv6_param_prob_reason(skb, ICMPV6_HDR_FIELD, optoff, SKB_DROP_REASON_IP_INHDR); return false; } if (pkt_len > skb->len - sizeof(struct ipv6hdr)) { SKB_DR_SET(reason, PKT_TOO_SMALL); goto drop; } if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) goto drop; IP6CB(skb)->flags |= IP6SKB_JUMBOGRAM; return true; drop: kfree_skb_reason(skb, reason); return false; } /* CALIPSO RFC 5570 */ static bool ipv6_hop_calipso(struct sk_buff *skb, int optoff) { const unsigned char *nh = skb_network_header(skb); if (nh[optoff + 1] < 8) goto drop; if (nh[optoff + 6] * 4 + 8 > nh[optoff + 1]) goto drop; if (!calipso_validate(skb, nh + optoff)) goto drop; return true; drop: kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR); return false; } int ipv6_parse_hopopts(struct sk_buff *skb) { struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net(skb->dev); int extlen; /* * skb_network_header(skb) is equal to skb->data, and * 
skb_network_header_len(skb) is always equal to * sizeof(struct ipv6hdr) by definition of * hop-by-hop options. */ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + 8) || !pskb_may_pull(skb, (sizeof(struct ipv6hdr) + ((skb_transport_header(skb)[1] + 1) << 3)))) { fail_and_free: kfree_skb(skb); return -1; } extlen = (skb_transport_header(skb)[1] + 1) << 3; if (extlen > READ_ONCE(net->ipv6.sysctl.max_hbh_opts_len)) goto fail_and_free; opt->flags |= IP6SKB_HOPBYHOP; if (ip6_parse_tlv(true, skb, READ_ONCE(net->ipv6.sysctl.max_hbh_opts_cnt))) { skb->transport_header += extlen; opt = IP6CB(skb); opt->nhoff = sizeof(struct ipv6hdr); return 1; } return -1; } /* * Creating outbound headers. * * "build" functions work when skb is filled from head to tail (datagram) * "push" functions work when headers are added from tail to head (tcp) * * In both cases we assume, that caller reserved enough room * for headers. */ static u8 ipv6_push_rthdr0(struct sk_buff *skb, u8 proto, struct ipv6_rt_hdr *opt, struct in6_addr **addr_p, struct in6_addr *saddr) { struct rt0_hdr *phdr, *ihdr; int hops; ihdr = (struct rt0_hdr *) opt; phdr = skb_push(skb, (ihdr->rt_hdr.hdrlen + 1) << 3); memcpy(phdr, ihdr, sizeof(struct rt0_hdr)); hops = ihdr->rt_hdr.hdrlen >> 1; if (hops > 1) memcpy(phdr->addr, ihdr->addr + 1, (hops - 1) * sizeof(struct in6_addr)); phdr->addr[hops - 1] = **addr_p; *addr_p = ihdr->addr; phdr->rt_hdr.nexthdr = proto; return NEXTHDR_ROUTING; } static u8 ipv6_push_rthdr4(struct sk_buff *skb, u8 proto, struct ipv6_rt_hdr *opt, struct in6_addr **addr_p, struct in6_addr *saddr) { struct ipv6_sr_hdr *sr_phdr, *sr_ihdr; int plen, hops; sr_ihdr = (struct ipv6_sr_hdr *)opt; plen = (sr_ihdr->hdrlen + 1) << 3; sr_phdr = skb_push(skb, plen); memcpy(sr_phdr, sr_ihdr, sizeof(struct ipv6_sr_hdr)); hops = sr_ihdr->first_segment + 1; memcpy(sr_phdr->segments + 1, sr_ihdr->segments + 1, (hops - 1) * sizeof(struct in6_addr)); sr_phdr->segments[0] = **addr_p; *addr_p = &sr_ihdr->segments[sr_ihdr->segments_left]; if (sr_ihdr->hdrlen > hops * 2) { int tlvs_offset, tlvs_length; tlvs_offset = (1 + hops * 2) << 3; tlvs_length = (sr_ihdr->hdrlen - hops * 2) << 3; memcpy((char *)sr_phdr + tlvs_offset, (char *)sr_ihdr + tlvs_offset, tlvs_length); } #ifdef CONFIG_IPV6_SEG6_HMAC if (sr_has_hmac(sr_phdr)) { struct net *net = NULL; if (skb->dev) net = dev_net(skb->dev); else if (skb->sk) net = sock_net(skb->sk); WARN_ON(!net); if (net) seg6_push_hmac(net, saddr, sr_phdr); } #endif sr_phdr->nexthdr = proto; return NEXTHDR_ROUTING; } static u8 ipv6_push_rthdr(struct sk_buff *skb, u8 proto, struct ipv6_rt_hdr *opt, struct in6_addr **addr_p, struct in6_addr *saddr) { switch (opt->type) { case IPV6_SRCRT_TYPE_0: case IPV6_SRCRT_STRICT: case IPV6_SRCRT_TYPE_2: proto = ipv6_push_rthdr0(skb, proto, opt, addr_p, saddr); break; case IPV6_SRCRT_TYPE_4: proto = ipv6_push_rthdr4(skb, proto, opt, addr_p, saddr); break; default: break; } return proto; } static u8 ipv6_push_exthdr(struct sk_buff *skb, u8 proto, u8 type, struct ipv6_opt_hdr *opt) { struct ipv6_opt_hdr *h = skb_push(skb, ipv6_optlen(opt)); memcpy(h, opt, ipv6_optlen(opt)); h->nexthdr = proto; return type; } u8 ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 proto, struct in6_addr **daddr, struct in6_addr *saddr) { if (opt->srcrt) { proto = ipv6_push_rthdr(skb, proto, opt->srcrt, daddr, saddr); /* * IPV6_RTHDRDSTOPTS is ignored * unless IPV6_RTHDR is set (RFC3542). 
*/ if (opt->dst0opt) proto = ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst0opt); } if (opt->hopopt) proto = ipv6_push_exthdr(skb, proto, NEXTHDR_HOP, opt->hopopt); return proto; } u8 ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 proto) { if (opt->dst1opt) proto = ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst1opt); return proto; } EXPORT_SYMBOL(ipv6_push_frag_opts); struct ipv6_txoptions * ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) { struct ipv6_txoptions *opt2; opt2 = sock_kmemdup(sk, opt, opt->tot_len, GFP_ATOMIC); if (opt2) { long dif = (char *)opt2 - (char *)opt; if (opt2->hopopt) *((char **)&opt2->hopopt) += dif; if (opt2->dst0opt) *((char **)&opt2->dst0opt) += dif; if (opt2->dst1opt) *((char **)&opt2->dst1opt) += dif; if (opt2->srcrt) *((char **)&opt2->srcrt) += dif; refcount_set(&opt2->refcnt, 1); } return opt2; } EXPORT_SYMBOL_GPL(ipv6_dup_options); static void ipv6_renew_option(int renewtype, struct ipv6_opt_hdr **dest, struct ipv6_opt_hdr *old, struct ipv6_opt_hdr *new, int newtype, char **p) { struct ipv6_opt_hdr *src; src = (renewtype == newtype ? new : old); if (!src) return; memcpy(*p, src, ipv6_optlen(src)); *dest = (struct ipv6_opt_hdr *)*p; *p += CMSG_ALIGN(ipv6_optlen(*dest)); } /** * ipv6_renew_options - replace a specific ext hdr with a new one. * * @sk: sock from which to allocate memory * @opt: original options * @newtype: option type to replace in @opt * @newopt: new option of type @newtype to replace (user-mem) * * Returns a new set of options which is a copy of @opt with the * option type @newtype replaced with @newopt. * * @opt may be NULL, in which case a new set of options is returned * containing just @newopt. * * @newopt may be NULL, in which case the specified option type is * not copied into the new set of options. * * The new set of options is allocated from the socket option memory * buffer of @sk. */ struct ipv6_txoptions * ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, int newtype, struct ipv6_opt_hdr *newopt) { int tot_len = 0; char *p; struct ipv6_txoptions *opt2; if (opt) { if (newtype != IPV6_HOPOPTS && opt->hopopt) tot_len += CMSG_ALIGN(ipv6_optlen(opt->hopopt)); if (newtype != IPV6_RTHDRDSTOPTS && opt->dst0opt) tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst0opt)); if (newtype != IPV6_RTHDR && opt->srcrt) tot_len += CMSG_ALIGN(ipv6_optlen(opt->srcrt)); if (newtype != IPV6_DSTOPTS && opt->dst1opt) tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst1opt)); } if (newopt) tot_len += CMSG_ALIGN(ipv6_optlen(newopt)); if (!tot_len) return NULL; tot_len += sizeof(*opt2); opt2 = sock_kmalloc(sk, tot_len, GFP_ATOMIC); if (!opt2) return ERR_PTR(-ENOBUFS); memset(opt2, 0, tot_len); refcount_set(&opt2->refcnt, 1); opt2->tot_len = tot_len; p = (char *)(opt2 + 1); ipv6_renew_option(IPV6_HOPOPTS, &opt2->hopopt, (opt ? opt->hopopt : NULL), newopt, newtype, &p); ipv6_renew_option(IPV6_RTHDRDSTOPTS, &opt2->dst0opt, (opt ? opt->dst0opt : NULL), newopt, newtype, &p); ipv6_renew_option(IPV6_RTHDR, (struct ipv6_opt_hdr **)&opt2->srcrt, (opt ? (struct ipv6_opt_hdr *)opt->srcrt : NULL), newopt, newtype, &p); ipv6_renew_option(IPV6_DSTOPTS, &opt2->dst1opt, (opt ? opt->dst1opt : NULL), newopt, newtype, &p); opt2->opt_nflen = (opt2->hopopt ? ipv6_optlen(opt2->hopopt) : 0) + (opt2->dst0opt ? ipv6_optlen(opt2->dst0opt) : 0) + (opt2->srcrt ? ipv6_optlen(opt2->srcrt) : 0); opt2->opt_flen = (opt2->dst1opt ? 
ipv6_optlen(opt2->dst1opt) : 0); return opt2; } struct ipv6_txoptions *__ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt) { /* * ignore the dest before srcrt unless srcrt is being included. * --yoshfuji */ if (opt->dst0opt && !opt->srcrt) { if (opt_space != opt) { memcpy(opt_space, opt, sizeof(*opt_space)); opt = opt_space; } opt->opt_nflen -= ipv6_optlen(opt->dst0opt); opt->dst0opt = NULL; } return opt; } EXPORT_SYMBOL_GPL(__ipv6_fixup_options); /** * __fl6_update_dst - update flowi destination address with info given * by srcrt option, if any. * * @fl6: flowi6 for which daddr is to be updated * @opt: struct ipv6_txoptions in which to look for srcrt opt * @orig: copy of original daddr address if modified * * Return: NULL if no srcrt or invalid srcrt type, otherwise returns orig * and initial value of fl6->daddr set in orig */ struct in6_addr *__fl6_update_dst(struct flowi6 *fl6, const struct ipv6_txoptions *opt, struct in6_addr *orig) { if (!opt->srcrt) return NULL; *orig = fl6->daddr; switch (opt->srcrt->type) { case IPV6_SRCRT_TYPE_0: case IPV6_SRCRT_STRICT: case IPV6_SRCRT_TYPE_2: fl6->daddr = *((struct rt0_hdr *)opt->srcrt)->addr; break; case IPV6_SRCRT_TYPE_4: { struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)opt->srcrt; fl6->daddr = srh->segments[srh->segments_left]; break; } default: return NULL; } return orig; } EXPORT_SYMBOL_GPL(__fl6_update_dst); |
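The __fl6_update_dst() helper above rewrites the flow destination to the first hop of a source route and hands the caller back the original destination. A minimal, hypothetical sketch of how a transmit path could use it; example_route_with_srcrt and its locals are illustrative only, and ip6_route_output() is assumed from <net/ip6_route.h>:

/* Illustrative only: route towards the first source-route hop. */
static struct dst_entry *example_route_with_srcrt(struct sock *sk,
						  struct flowi6 *fl6,
						  struct ipv6_txoptions *opt)
{
	struct in6_addr final_dst, *orig;

	/* If opt->srcrt is present, fl6->daddr now points at the first
	 * intermediate hop and the real destination is saved in final_dst. */
	orig = __fl6_update_dst(fl6, opt, &final_dst);

	/* Look up a route for the (possibly rewritten) destination;
	 * 'orig', when non-NULL, still holds the final destination. */
	(void)orig;
	return ip6_route_output(sock_net(sk), sk, fl6);
}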
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_MMU_CONTEXT_H
#define _ASM_X86_MMU_CONTEXT_H

#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/pkeys.h>

#include <trace/events/tlb.h>

#include <asm/tlbflush.h>
#include <asm/paravirt.h>
#include <asm/debugreg.h>
#include <asm/gsseg.h>
#include <asm/desc.h>

extern atomic64_t last_mm_ctx_id;

#ifdef CONFIG_PERF_EVENTS
DECLARE_STATIC_KEY_FALSE(rdpmc_never_available_key);
DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key);
void cr4_update_pce(void *ignored);
#endif

#ifdef CONFIG_MODIFY_LDT_SYSCALL
/*
 * ldt_structs can be allocated, used, and freed, but they are never
 * modified while live.
 */
struct ldt_struct {
	/*
	 * Xen requires page-aligned LDTs with special permissions.  This is
	 * needed to prevent us from installing evil descriptors such as
	 * call gates.  On native, we could merge the ldt_struct and LDT
	 * allocations, but it's not worth trying to optimize.
	 */
	struct desc_struct	*entries;
	unsigned int		nr_entries;

	/*
	 * If PTI is in use, then the entries array is not mapped while we're
	 * in user mode.  The whole array will be aliased at the address
	 * given by ldt_slot_va(slot).  We use two slots so that we can
	 * allocate and map, and enable a new LDT without invalidating the
	 * mapping of an older, still-in-use LDT.
	 *
	 * slot will be -1 if this LDT doesn't have an alias mapping.
	 */
	int			slot;
};

/*
 * Used for LDT copy/destruction.
*/ static inline void init_new_context_ldt(struct mm_struct *mm) { mm->context.ldt = NULL; init_rwsem(&mm->context.ldt_usr_sem); } int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm); void destroy_context_ldt(struct mm_struct *mm); void ldt_arch_exit_mmap(struct mm_struct *mm); #else /* CONFIG_MODIFY_LDT_SYSCALL */ static inline void init_new_context_ldt(struct mm_struct *mm) { } static inline int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm) { return 0; } static inline void destroy_context_ldt(struct mm_struct *mm) { } static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { } #endif #ifdef CONFIG_MODIFY_LDT_SYSCALL extern void load_mm_ldt(struct mm_struct *mm); extern void switch_ldt(struct mm_struct *prev, struct mm_struct *next); #else static inline void load_mm_ldt(struct mm_struct *mm) { clear_LDT(); } static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) { DEBUG_LOCKS_WARN_ON(preemptible()); } #endif #ifdef CONFIG_ADDRESS_MASKING static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm) { /* * When switch_mm_irqs_off() is called for a kthread, it may race with * LAM enablement. switch_mm_irqs_off() uses the LAM mask to do two * things: populate CR3 and populate 'cpu_tlbstate.lam'. Make sure it * reads a single value for both. */ return READ_ONCE(mm->context.lam_cr3_mask); } static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm) { mm->context.lam_cr3_mask = oldmm->context.lam_cr3_mask; mm->context.untag_mask = oldmm->context.untag_mask; } #define mm_untag_mask mm_untag_mask static inline unsigned long mm_untag_mask(struct mm_struct *mm) { return mm->context.untag_mask; } static inline void mm_reset_untag_mask(struct mm_struct *mm) { mm->context.untag_mask = -1UL; } #define arch_pgtable_dma_compat arch_pgtable_dma_compat static inline bool arch_pgtable_dma_compat(struct mm_struct *mm) { return !mm_lam_cr3_mask(mm) || test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags); } #else static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm) { return 0; } static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm) { } static inline void mm_reset_untag_mask(struct mm_struct *mm) { } #endif extern void mm_init_global_asid(struct mm_struct *mm); extern void mm_free_global_asid(struct mm_struct *mm); /* * Init a new mm. Used on mm copies, like at fork() * and on mm's that are brand-new, like at execve(). 
*/ #define init_new_context init_new_context static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { mutex_init(&mm->context.lock); mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); atomic64_set(&mm->context.tlb_gen, 0); mm->context.next_trim_cpumask = jiffies + HZ; #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { /* pkey 0 is the default and allocated implicitly */ mm->context.pkey_allocation_map = 0x1; /* -1 means unallocated or invalid */ mm->context.execute_only_pkey = -1; } #endif mm_init_global_asid(mm); mm_reset_untag_mask(mm); init_new_context_ldt(mm); return 0; } #define destroy_context destroy_context static inline void destroy_context(struct mm_struct *mm) { destroy_context_ldt(mm); mm_free_global_asid(mm); } extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk); extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk); #define switch_mm_irqs_off switch_mm_irqs_off #define activate_mm(prev, next) \ do { \ paravirt_enter_mmap(next); \ switch_mm_irqs_off((prev), (next), NULL); \ } while (0); #ifdef CONFIG_X86_32 #define deactivate_mm(tsk, mm) \ do { \ loadsegment(gs, 0); \ } while (0) #else #define deactivate_mm(tsk, mm) \ do { \ shstk_free(tsk); \ load_gs_index(0); \ loadsegment(fs, 0); \ } while (0) #endif static inline void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm) { #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return; /* Duplicate the oldmm pkey state in mm: */ mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map; mm->context.execute_only_pkey = oldmm->context.execute_only_pkey; #endif } static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { arch_dup_pkeys(oldmm, mm); paravirt_enter_mmap(mm); dup_lam(oldmm, mm); return ldt_dup_context(oldmm, mm); } static inline void arch_exit_mmap(struct mm_struct *mm) { paravirt_arch_exit_mmap(mm); ldt_arch_exit_mmap(mm); } #ifdef CONFIG_X86_64 static inline bool is_64bit_mm(struct mm_struct *mm) { return !IS_ENABLED(CONFIG_IA32_EMULATION) || !test_bit(MM_CONTEXT_UPROBE_IA32, &mm->context.flags); } #else static inline bool is_64bit_mm(struct mm_struct *mm) { return false; } #endif static inline bool is_notrack_mm(struct mm_struct *mm) { return test_bit(MM_CONTEXT_NOTRACK, &mm->context.flags); } static inline void set_notrack_mm(struct mm_struct *mm) { set_bit(MM_CONTEXT_NOTRACK, &mm->context.flags); } /* * We only want to enforce protection keys on the current process * because we effectively have no access to PKRU for other * processes or any way to tell *which * PKRU in a threaded * process we could use. * * So do not enforce things if the VMA is not from the current * mm, or if we are in a kernel thread. */ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write, bool execute, bool foreign) { /* pkeys never affect instruction fetches */ if (execute) return true; /* allow access if the VMA is not one from this process */ if (foreign || vma_is_foreign(vma)) return true; return __pkru_allows_pkey(vma_pkey(vma), write); } unsigned long __get_current_cr3_fast(void); #include <asm-generic/mmu_context.h> extern struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm); extern void unuse_temporary_mm(struct mm_struct *prev_mm); #endif /* _ASM_X86_MMU_CONTEXT_H */ |
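The use_temporary_mm()/unuse_temporary_mm() pair declared above switches the current CPU onto a private mm and back. A hedged sketch of the calling pattern follows; example_patch_with_temp_mm is hypothetical, and the real callers additionally manage preemption and the page tables backing the temporary mm:

static void example_patch_with_temp_mm(struct mm_struct *patch_mm)
{
	struct mm_struct *prev;

	/* Install the private mm on this CPU; the previous mm is returned
	 * so it can be restored afterwards. */
	prev = use_temporary_mm(patch_mm);

	/* ... access mappings that exist only in patch_mm ... */

	unuse_temporary_mm(prev);
}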
/* SPDX-License-Identifier: GPL-2.0 */
/* linux/net/inet/arp.h */
#ifndef _ARP_H
#define _ARP_H

#include <linux/if_arp.h>
#include <linux/hash.h>
#include <net/neighbour.h>

extern struct neigh_table arp_tbl;

static inline u32 arp_hashfn(const void *pkey, const struct net_device *dev, u32 *hash_rnd)
{
	u32 key = *(const u32 *)pkey;
	u32 val = key ^ hash32_ptr(dev);

	return val * hash_rnd[0];
}

#ifdef CONFIG_INET
static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key)
{
	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
		key = INADDR_ANY;

	return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev);
}
#else
static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key)
{
	return NULL;
}
#endif

static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32 key)
{
	struct neighbour *n;

	rcu_read_lock();
	n = __ipv4_neigh_lookup_noref(dev, key);
	if (n && !refcount_inc_not_zero(&n->refcnt))
		n = NULL;
	rcu_read_unlock();

	return n;
}

static inline void __ipv4_confirm_neigh(struct net_device *dev, u32 key)
{
	struct neighbour *n;

	rcu_read_lock();
	n = __ipv4_neigh_lookup_noref(dev, key);
	neigh_confirm(n);
	rcu_read_unlock();
}

void arp_init(void);
int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg);
void arp_send(int type, int ptype, __be32 dest_ip,
	      struct net_device *dev, __be32 src_ip,
	      const unsigned char *dest_hw,
	      const unsigned char *src_hw, const unsigned char *th);
int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir);
void arp_ifdown(struct net_device *dev);
int arp_invalidate(struct net_device *dev, __be32 ip, bool force);

struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
			   struct net_device *dev, __be32 src_ip,
			   const unsigned char *dest_hw,
			   const unsigned char *src_hw,
			   const unsigned char *target_hw);
void arp_xmit(struct sk_buff *skb);

#endif	/* _ARP_H */
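A hedged sketch of how an output path might use the lookup helper above to find or create the neighbour entry for an IPv4 next hop; example_resolve_neigh is illustrative, and __neigh_create() is assumed from <net/neighbour.h>:

static struct neighbour *example_resolve_neigh(struct net_device *dev,
					       __be32 next_hop)
{
	struct neighbour *n;

	/* Fast path: existing cache entry (takes a reference). */
	n = __ipv4_neigh_lookup(dev, (__force u32)next_hop);
	if (!n)
		/* Slow path: allocate and insert a new entry in arp_tbl. */
		n = __neigh_create(&arp_tbl, &next_hop, dev, false);
	return n;
}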
| 21 22 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 | /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef TUN_VNET_H #define TUN_VNET_H /* High bits in flags field are unused. */ #define TUN_VNET_LE 0x80000000 #define TUN_VNET_BE 0x40000000 #define TUN_VNET_TNL_SIZE sizeof(struct virtio_net_hdr_v1_hash_tunnel) static inline bool tun_vnet_legacy_is_little_endian(unsigned int flags) { bool be = IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) && (flags & TUN_VNET_BE); return !be && virtio_legacy_is_little_endian(); } static inline long tun_get_vnet_be(unsigned int flags, int __user *argp) { int be = !!(flags & TUN_VNET_BE); if (!IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE)) return -EINVAL; if (put_user(be, argp)) return -EFAULT; return 0; } static inline long tun_set_vnet_be(unsigned int *flags, int __user *argp) { int be; if (!IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE)) return -EINVAL; if (get_user(be, argp)) return -EFAULT; if (be) *flags |= TUN_VNET_BE; else *flags &= ~TUN_VNET_BE; return 0; } static inline bool tun_vnet_is_little_endian(unsigned int flags) { return flags & TUN_VNET_LE || tun_vnet_legacy_is_little_endian(flags); } static inline u16 tun_vnet16_to_cpu(unsigned int flags, __virtio16 val) { return __virtio16_to_cpu(tun_vnet_is_little_endian(flags), val); } static inline __virtio16 cpu_to_tun_vnet16(unsigned int flags, u16 val) { return __cpu_to_virtio16(tun_vnet_is_little_endian(flags), val); } static inline long tun_vnet_ioctl(int *vnet_hdr_sz, unsigned int *flags, unsigned int cmd, int __user *sp) { int s; switch (cmd) { case TUNGETVNETHDRSZ: s = *vnet_hdr_sz; if (put_user(s, sp)) return -EFAULT; return 0; case TUNSETVNETHDRSZ: if (get_user(s, sp)) return -EFAULT; if (s < (int)sizeof(struct virtio_net_hdr)) return -EINVAL; *vnet_hdr_sz = s; return 0; case TUNGETVNETLE: s = !!(*flags & TUN_VNET_LE); if (put_user(s, sp)) return -EFAULT; return 0; case TUNSETVNETLE: if (get_user(s, sp)) return -EFAULT; if (s) *flags |= TUN_VNET_LE; else *flags &= ~TUN_VNET_LE; return 0; case TUNGETVNETBE: return tun_get_vnet_be(*flags, sp); case TUNSETVNETBE: return tun_set_vnet_be(flags, sp); default: return -EINVAL; } } static inline unsigned int tun_vnet_parse_size(netdev_features_t features) { if (!(features & NETIF_F_GSO_UDP_TUNNEL)) return sizeof(struct virtio_net_hdr); return TUN_VNET_TNL_SIZE; } static inline int __tun_vnet_hdr_get(int sz, unsigned int flags, netdev_features_t features, struct iov_iter *from, struct virtio_net_hdr *hdr) { unsigned int parsed_size = tun_vnet_parse_size(features); u16 hdr_len; if (iov_iter_count(from) < sz) return -EINVAL; if (!copy_from_iter_full(hdr, parsed_size, 
from)) return -EFAULT; hdr_len = tun_vnet16_to_cpu(flags, hdr->hdr_len); if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { hdr_len = max(tun_vnet16_to_cpu(flags, hdr->csum_start) + tun_vnet16_to_cpu(flags, hdr->csum_offset) + 2, hdr_len); hdr->hdr_len = cpu_to_tun_vnet16(flags, hdr_len); } if (hdr_len > iov_iter_count(from)) return -EINVAL; iov_iter_advance(from, sz - parsed_size); return hdr_len; } static inline int tun_vnet_hdr_get(int sz, unsigned int flags, struct iov_iter *from, struct virtio_net_hdr *hdr) { return __tun_vnet_hdr_get(sz, flags, 0, from, hdr); } static inline int __tun_vnet_hdr_put(int sz, netdev_features_t features, struct iov_iter *iter, const struct virtio_net_hdr *hdr) { unsigned int parsed_size = tun_vnet_parse_size(features); if (unlikely(iov_iter_count(iter) < sz)) return -EINVAL; if (unlikely(copy_to_iter(hdr, parsed_size, iter) != parsed_size)) return -EFAULT; if (iov_iter_zero(sz - parsed_size, iter) != sz - parsed_size) return -EFAULT; return 0; } static inline int tun_vnet_hdr_put(int sz, struct iov_iter *iter, const struct virtio_net_hdr *hdr) { return __tun_vnet_hdr_put(sz, 0, iter, hdr); } static inline int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb, const struct virtio_net_hdr *hdr) { return virtio_net_hdr_to_skb(skb, hdr, tun_vnet_is_little_endian(flags)); } /* * Tun is not aware of the negotiated guest features, guess them from the * virtio net hdr size */ static inline netdev_features_t tun_vnet_hdr_guest_features(int vnet_hdr_sz) { if (vnet_hdr_sz >= TUN_VNET_TNL_SIZE) return NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_UDP_TUNNEL_CSUM; return 0; } static inline int tun_vnet_hdr_tnl_to_skb(unsigned int flags, netdev_features_t features, struct sk_buff *skb, const struct virtio_net_hdr_v1_hash_tunnel *hdr) { return virtio_net_hdr_tnl_to_skb(skb, hdr, features & NETIF_F_GSO_UDP_TUNNEL, features & NETIF_F_GSO_UDP_TUNNEL_CSUM, tun_vnet_is_little_endian(flags)); } static inline int tun_vnet_hdr_from_skb(unsigned int flags, const struct net_device *dev, const struct sk_buff *skb, struct virtio_net_hdr *hdr) { int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0; if (virtio_net_hdr_from_skb(skb, hdr, tun_vnet_is_little_endian(flags), true, vlan_hlen)) { struct skb_shared_info *sinfo = skb_shinfo(skb); if (net_ratelimit()) { netdev_err(dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->gso_size), tun_vnet16_to_cpu(flags, hdr->hdr_len)); print_hex_dump(KERN_ERR, "tun: ", DUMP_PREFIX_NONE, 16, 1, skb->head, min(tun_vnet16_to_cpu(flags, hdr->hdr_len), 64), true); } WARN_ON_ONCE(1); return -EINVAL; } return 0; } static inline int tun_vnet_hdr_tnl_from_skb(unsigned int flags, const struct net_device *dev, const struct sk_buff *skb, struct virtio_net_hdr_v1_hash_tunnel *tnl_hdr) { bool has_tnl_offload = !!(dev->features & NETIF_F_GSO_UDP_TUNNEL); int vlan_hlen = skb_vlan_tag_present(skb) ? 
VLAN_HLEN : 0; if (virtio_net_hdr_tnl_from_skb(skb, tnl_hdr, has_tnl_offload, tun_vnet_is_little_endian(flags), vlan_hlen, true, false)) { struct virtio_net_hdr_v1 *hdr = &tnl_hdr->hash_hdr.hdr; struct skb_shared_info *sinfo = skb_shinfo(skb); if (net_ratelimit()) { int hdr_len = tun_vnet16_to_cpu(flags, hdr->hdr_len); netdev_err(dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->gso_size), tun_vnet16_to_cpu(flags, hdr->hdr_len)); print_hex_dump(KERN_ERR, "tun: ", DUMP_PREFIX_NONE, 16, 1, skb->head, min(hdr_len, 64), true); } WARN_ON_ONCE(1); return -EINVAL; } return 0; } #endif /* TUN_VNET_H */ |
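As a usage illustration (not part of this header), a driver's transmit-from-userspace path could combine the helpers above roughly as follows; example_vnet_to_skb and its parameters are hypothetical:

static int example_vnet_to_skb(int vnet_hdr_sz, unsigned int flags,
			       struct iov_iter *from, struct sk_buff *skb)
{
	struct virtio_net_hdr hdr;
	int hdr_len;

	/* Copy and validate the vnet header, advancing 'from' past it. */
	hdr_len = tun_vnet_hdr_get(vnet_hdr_sz, flags, from, &hdr);
	if (hdr_len < 0)
		return hdr_len;

	/* Apply checksum offsets and GSO metadata to the skb. */
	return tun_vnet_hdr_to_skb(flags, skb, &hdr);
}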
894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 | // SPDX-License-Identifier: GPL-2.0-only /* * mm/truncate.c - code for taking down pages from address_spaces * * Copyright (C) 2002, Linus Torvalds * * 10Sep2002 Andrew Morton * Initial version. */ #include <linux/kernel.h> #include <linux/backing-dev.h> #include <linux/dax.h> #include <linux/gfp.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/export.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/folio_batch.h> #include <linux/task_io_accounting_ops.h> #include <linux/shmem_fs.h> #include <linux/rmap.h> #include "internal.h" static void clear_shadow_entries(struct address_space *mapping, unsigned long start, unsigned long max) { XA_STATE(xas, &mapping->i_pages, start); struct folio *folio; /* Handled by shmem itself, or for DAX we do nothing. */ if (shmem_mapping(mapping) || dax_mapping(mapping)) return; xas_set_update(&xas, workingset_update_node); spin_lock(&mapping->host->i_lock); xas_lock_irq(&xas); /* Clear all shadow entries from start to max */ xas_for_each(&xas, folio, max) { if (xa_is_value(folio)) xas_store(&xas, NULL); } xas_unlock_irq(&xas); if (mapping_shrinkable(mapping)) inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); } /* * Unconditionally remove exceptional entries. Usually called from truncate * path. Note that the folio_batch may be altered by this function by removing * exceptional entries similar to what folio_batch_remove_exceptionals() does. * Please note that indices[] has entries in ascending order as guaranteed by * either find_get_entries() or find_lock_entries(). */ static void truncate_folio_batch_exceptionals(struct address_space *mapping, struct folio_batch *fbatch, pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, indices[0]); int nr = folio_batch_count(fbatch); struct folio *folio; int i, j; /* Handled by shmem itself */ if (shmem_mapping(mapping)) return; for (j = 0; j < nr; j++) if (xa_is_value(fbatch->folios[j])) break; if (j == nr) return; if (dax_mapping(mapping)) { for (i = j; i < nr; i++) { if (xa_is_value(fbatch->folios[i])) { /* * File systems should already have called * dax_break_layout_entry() to remove all DAX * entries while holding a lock to prevent * establishing new entries. Therefore we * shouldn't find any here. */ WARN_ON_ONCE(1); /* * Delete the mapping so truncate_pagecache() * doesn't loop forever. */ dax_delete_mapping_entry(mapping, indices[i]); } } goto out; } xas_set(&xas, indices[j]); xas_set_update(&xas, workingset_update_node); spin_lock(&mapping->host->i_lock); xas_lock_irq(&xas); xas_for_each(&xas, folio, indices[nr-1]) { if (xa_is_value(folio)) xas_store(&xas, NULL); } xas_unlock_irq(&xas); if (mapping_shrinkable(mapping)) inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); out: folio_batch_remove_exceptionals(fbatch); } /** * folio_invalidate - Invalidate part or all of a folio. * @folio: The folio which is affected. * @offset: start of the range to invalidate * @length: length of the range to invalidate * * folio_invalidate() is called when all or part of the folio has become * invalidated by a truncate operation. * * folio_invalidate() does not have to release all buffers, but it must * ensure that no dirty buffer is left outside @offset and that no I/O * is underway against any of the blocks which are outside the truncation * point. 
Because the caller is about to free (and possibly reuse) those * blocks on-disk. */ void folio_invalidate(struct folio *folio, size_t offset, size_t length) { const struct address_space_operations *aops = folio->mapping->a_ops; if (aops->invalidate_folio) aops->invalidate_folio(folio, offset, length); } EXPORT_SYMBOL_GPL(folio_invalidate); /* * If truncate cannot remove the fs-private metadata from the page, the page * becomes orphaned. It will be left on the LRU and may even be mapped into * user pagetables if we're racing with filemap_fault(). * * We need to bail out if page->mapping is no longer equal to the original * mapping. This happens a) when the VM reclaimed the page while we waited on * its lock, b) when a concurrent invalidate_mapping_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. */ static void truncate_cleanup_folio(struct folio *folio) { if (folio_mapped(folio)) unmap_mapping_folio(folio); if (folio_needs_release(folio)) folio_invalidate(folio, 0, folio_size(folio)); /* * Some filesystems seem to re-dirty the page even after * the VM has canceled the dirty bit (eg ext3 journaling). * Hence dirty accounting check is placed after invalidation. */ folio_cancel_dirty(folio); } int truncate_inode_folio(struct address_space *mapping, struct folio *folio) { if (folio->mapping != mapping) return -EIO; truncate_cleanup_folio(folio); filemap_remove_folio(folio); return 0; } static int try_folio_split_or_unmap(struct folio *folio, struct page *split_at, unsigned long min_order) { enum ttu_flags ttu_flags = TTU_SYNC | TTU_SPLIT_HUGE_PMD | TTU_IGNORE_MLOCK; int ret; ret = try_folio_split_to_order(folio, split_at, min_order); /* * If the split fails, unmap the folio, so it will be refaulted * with PTEs to respect SIGBUS semantics. * * Make an exception for shmem/tmpfs that for long time * intentionally mapped with PMDs across i_size. */ if (ret && !shmem_mapping(folio->mapping)) { try_to_unmap(folio, ttu_flags); WARN_ON(folio_mapped(folio)); } return ret; } /* * Handle partial folios. The folio may be entirely within the * range if a split has raced with us. If not, we zero the part of the * folio that's within the [start, end] range, and then split the folio if * it's large. split_page_range() will discard pages which now lie beyond * i_size, and we rely on the caller to discard pages which lie within a * newly created hole. * * Returns false if splitting failed so the caller can avoid * discarding the entire folio which is stubbornly unsplit. */ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) { loff_t pos = folio_pos(folio); size_t size = folio_size(folio); unsigned int offset, length; struct page *split_at, *split_at2; unsigned int min_order; if (pos < start) offset = start - pos; else offset = 0; if (pos + size <= (u64)end) length = size - offset; else length = end + 1 - pos - offset; folio_wait_writeback(folio); if (length == size) { truncate_inode_folio(folio->mapping, folio); return true; } /* * We may be zeroing pages we're about to discard, but it avoids * doing a complex calculation here, and then doing the zeroing * anyway if the page split fails. 
*/ if (!mapping_inaccessible(folio->mapping)) folio_zero_range(folio, offset, length); if (folio_needs_release(folio)) folio_invalidate(folio, offset, length); if (!folio_test_large(folio)) return true; min_order = mapping_min_folio_order(folio->mapping); split_at = folio_page(folio, PAGE_ALIGN_DOWN(offset) / PAGE_SIZE); if (!try_folio_split_or_unmap(folio, split_at, min_order)) { /* * try to split at offset + length to make sure folios within * the range can be dropped, especially to avoid memory waste * for shmem truncate */ struct folio *folio2; if (offset + length == size) goto no_split; split_at2 = folio_page(folio, PAGE_ALIGN_DOWN(offset + length) / PAGE_SIZE); folio2 = page_folio(split_at2); if (!folio_try_get(folio2)) goto no_split; if (!folio_test_large(folio2)) goto out; if (!folio_trylock(folio2)) goto out; /* make sure folio2 is large and does not change its mapping */ if (folio_test_large(folio2) && folio2->mapping == folio->mapping) try_folio_split_or_unmap(folio2, split_at2, min_order); folio_unlock(folio2); out: folio_put(folio2); no_split: return true; } if (folio_test_dirty(folio)) return false; truncate_inode_folio(folio->mapping, folio); return true; } /* * Used to get rid of pages on hardware memory corruption. */ int generic_error_remove_folio(struct address_space *mapping, struct folio *folio) { if (!mapping) return -EINVAL; /* * Only punch for normal data pages for now. * Handling other types like directories would need more auditing. */ if (!S_ISREG(mapping->host->i_mode)) return -EIO; return truncate_inode_folio(mapping, folio); } EXPORT_SYMBOL(generic_error_remove_folio); /** * mapping_evict_folio() - Remove an unused folio from the page-cache. * @mapping: The mapping this folio belongs to. * @folio: The folio to remove. * * Safely remove one folio from the page cache. * It only drops clean, unused folios. * * Context: Folio must be locked. * Return: The number of pages successfully removed. */ long mapping_evict_folio(struct address_space *mapping, struct folio *folio) { /* The page may have been truncated before it was locked */ if (!mapping) return 0; if (folio_test_dirty(folio) || folio_test_writeback(folio)) return 0; /* The refcount will be elevated if any page in the folio is mapped */ if (folio_ref_count(folio) > folio_nr_pages(folio) + folio_has_private(folio) + 1) return 0; if (!filemap_release_folio(folio, 0)) return 0; return remove_mapping(mapping, folio); } /** * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets * @mapping: mapping to truncate * @lstart: offset from which to truncate * @lend: offset to which to truncate (inclusive) * * Truncate the page cache, removing the pages that are between * specified offsets (and zeroing out partial pages * if lstart or lend + 1 is not page aligned). * * Truncate takes two passes - the first pass is nonblocking. It will not * block on page locks and it will not block on writeback. The second pass * will wait. This is to prevent as much IO as possible in the affected region. * The first pass will remove most pages, so the search cost of the second pass * is low. * * We pass down the cache-hot hint to the page freeing code. Even if the * mapping is large, it is probably the case that the final pages are the most * recently touched, and freeing happens in ascending file offset order. * * Note that since ->invalidate_folio() accepts range to invalidate * truncate_inode_pages_range is able to handle cases where lend + 1 is not * page aligned properly. 
*/ void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart, uoff_t lend) { pgoff_t start; /* inclusive */ pgoff_t end; /* exclusive */ struct folio_batch fbatch; pgoff_t indices[FOLIO_BATCH_SIZE]; pgoff_t index; int i; struct folio *folio; bool same_folio; if (mapping_empty(mapping)) return; /* * 'start' and 'end' always covers the range of pages to be fully * truncated. Partial pages are covered with 'partial_start' at the * start of the range and 'partial_end' at the end of the range. * Note that 'end' is exclusive while 'lend' is inclusive. */ start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; if (lend == -1) /* * lend == -1 indicates end-of-file so we have to set 'end' * to the highest possible pgoff_t and since the type is * unsigned we're using -1. */ end = -1; else end = (lend + 1) >> PAGE_SHIFT; folio_batch_init(&fbatch); index = start; while (index < end && find_lock_entries(mapping, &index, end - 1, &fbatch, indices)) { truncate_folio_batch_exceptionals(mapping, &fbatch, indices); for (i = 0; i < folio_batch_count(&fbatch); i++) truncate_cleanup_folio(fbatch.folios[i]); delete_from_page_cache_batch(mapping, &fbatch); for (i = 0; i < folio_batch_count(&fbatch); i++) folio_unlock(fbatch.folios[i]); folio_batch_release(&fbatch); cond_resched(); } same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0); if (!IS_ERR(folio)) { same_folio = lend < folio_next_pos(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) { start = folio_next_index(folio); if (same_folio) end = folio->index; } folio_unlock(folio); folio_put(folio); folio = NULL; } if (!same_folio) { folio = __filemap_get_folio(mapping, lend >> PAGE_SHIFT, FGP_LOCK, 0); if (!IS_ERR(folio)) { if (!truncate_inode_partial_folio(folio, lstart, lend)) end = folio->index; folio_unlock(folio); folio_put(folio); } } index = start; while (index < end) { cond_resched(); if (!find_get_entries(mapping, &index, end - 1, &fbatch, indices)) { /* If all gone from start onwards, we're done */ if (index == start) break; /* Otherwise restart to make sure all gone */ index = start; continue; } for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; /* We rely upon deletion not changing folio->index */ if (xa_is_value(folio)) continue; folio_lock(folio); VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio); folio_wait_writeback(folio); truncate_inode_folio(mapping, folio); folio_unlock(folio); } truncate_folio_batch_exceptionals(mapping, &fbatch, indices); folio_batch_release(&fbatch); } } EXPORT_SYMBOL(truncate_inode_pages_range); /** * truncate_inode_pages - truncate *all* the pages from an offset * @mapping: mapping to truncate * @lstart: offset from which to truncate * * Called under (and serialised by) inode->i_rwsem and * mapping->invalidate_lock. * * Note: When this function returns, there can be a page in the process of * deletion (inside __filemap_remove_folio()) in the specified range. Thus * mapping->nrpages can be non-zero when this function returns even after * truncation of the whole mapping. */ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) { truncate_inode_pages_range(mapping, lstart, (loff_t)-1); } EXPORT_SYMBOL(truncate_inode_pages); /** * truncate_inode_pages_final - truncate *all* pages before inode dies * @mapping: mapping to truncate * * Called under (and serialized by) inode->i_rwsem. 
* * Filesystems have to use this in the .evict_inode path to inform the * VM that this is the final truncate and the inode is going away. */ void truncate_inode_pages_final(struct address_space *mapping) { /* * Page reclaim can not participate in regular inode lifetime * management (can't call iput()) and thus can race with the * inode teardown. Tell it when the address space is exiting, * so that it does not install eviction information after the * final truncate has begun. */ mapping_set_exiting(mapping); if (!mapping_empty(mapping)) { /* * As truncation uses a lockless tree lookup, cycle * the tree lock to make sure any ongoing tree * modification that does not see AS_EXITING is * completed before starting the final truncate. */ xa_lock_irq(&mapping->i_pages); xa_unlock_irq(&mapping->i_pages); } truncate_inode_pages(mapping, 0); } EXPORT_SYMBOL(truncate_inode_pages_final); /** * mapping_try_invalidate - Invalidate all the evictable folios of one inode * @mapping: the address_space which holds the folios to invalidate * @start: the offset 'from' which to invalidate * @end: the offset 'to' which to invalidate (inclusive) * @nr_failed: How many folio invalidations failed * * This function is similar to invalidate_mapping_pages(), except that it * returns the number of folios which could not be evicted in @nr_failed. */ unsigned long mapping_try_invalidate(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_failed) { pgoff_t indices[FOLIO_BATCH_SIZE]; struct folio_batch fbatch; pgoff_t index = start; unsigned long ret; unsigned long count = 0; int i; folio_batch_init(&fbatch); while (find_lock_entries(mapping, &index, end, &fbatch, indices)) { bool xa_has_values = false; int nr = folio_batch_count(&fbatch); for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; /* We rely upon deletion not changing folio->index */ if (xa_is_value(folio)) { xa_has_values = true; count++; continue; } ret = mapping_evict_folio(mapping, folio); folio_unlock(folio); /* * Invalidation is a hint that the folio is no longer * of interest and try to speed up its reclaim. */ if (!ret) { deactivate_file_folio(folio); /* Likely in the lru cache of a remote CPU */ if (nr_failed) (*nr_failed)++; } count += ret; } if (xa_has_values) clear_shadow_entries(mapping, indices[0], indices[nr-1]); folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); cond_resched(); } return count; } /** * invalidate_mapping_pages - Invalidate all clean, unlocked cache of one inode * @mapping: the address_space which holds the cache to invalidate * @start: the offset 'from' which to invalidate * @end: the offset 'to' which to invalidate (inclusive) * * This function removes pages that are clean, unmapped and unlocked, * as well as shadow entries. It will not block on IO activity. * * If you want to remove all the pages of one inode, regardless of * their use and writeback state, use truncate_inode_pages(). 
* * Return: The number of indices that had their contents invalidated */ unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) { return mapping_try_invalidate(mapping, start, end, NULL); } EXPORT_SYMBOL(invalidate_mapping_pages); static int folio_launder(struct address_space *mapping, struct folio *folio) { if (!folio_test_dirty(folio)) return 0; if (folio->mapping != mapping || mapping->a_ops->launder_folio == NULL) return 0; return mapping->a_ops->launder_folio(folio); } /* * This is like mapping_evict_folio(), except it ignores the folio's * refcount. We do this because invalidate_inode_pages2() needs stronger * invalidation guarantees, and cannot afford to leave folios behind because * shrink_folio_list() has a temp ref on them, or because they're transiently * sitting in the folio_add_lru() caches. */ int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, gfp_t gfp) { void (*free_folio)(struct folio *); int ret; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (folio_mapped(folio)) unmap_mapping_folio(folio); BUG_ON(folio_mapped(folio)); ret = folio_launder(mapping, folio); if (ret) return ret; if (folio->mapping != mapping) return -EBUSY; if (!filemap_release_folio(folio, gfp)) return -EBUSY; spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); if (folio_test_dirty(folio)) goto failed; BUG_ON(folio_has_private(folio)); __filemap_remove_folio(folio, NULL); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_lru_list_add(mapping->host); free_folio = mapping->a_ops->free_folio; spin_unlock(&mapping->host->i_lock); if (free_folio) free_folio(folio); folio_put_refs(folio, folio_nr_pages(folio)); return 1; failed: xa_unlock_irq(&mapping->i_pages); spin_unlock(&mapping->host->i_lock); return -EBUSY; } /** * invalidate_inode_pages2_range - remove range of pages from an address_space * @mapping: the address_space * @start: the page offset 'from' which to invalidate * @end: the page offset 'to' which to invalidate (inclusive) * * Any pages which are found to be mapped into pagetables are unmapped prior to * invalidation. * * Return: -EBUSY if any pages could not be invalidated. */ int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { pgoff_t indices[FOLIO_BATCH_SIZE]; struct folio_batch fbatch; pgoff_t index; int i; int ret = 0; int ret2 = 0; int did_range_unmap = 0; if (mapping_empty(mapping)) return 0; folio_batch_init(&fbatch); index = start; while (find_get_entries(mapping, &index, end, &fbatch, indices)) { bool xa_has_values = false; int nr = folio_batch_count(&fbatch); for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; /* We rely upon deletion not changing folio->index */ if (xa_is_value(folio)) { xa_has_values = true; if (dax_mapping(mapping) && !dax_invalidate_mapping_entry_sync(mapping, indices[i])) ret = -EBUSY; continue; } if (!did_range_unmap && folio_mapped(folio)) { /* * If folio is mapped, before taking its lock, * zap the rest of the file in one hit. 
*/ unmap_mapping_pages(mapping, indices[i], (1 + end - indices[i]), false); did_range_unmap = 1; } folio_lock(folio); if (unlikely(folio->mapping != mapping)) { folio_unlock(folio); continue; } VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio); folio_wait_writeback(folio); ret2 = folio_unmap_invalidate(mapping, folio, GFP_KERNEL); if (ret2 < 0) ret = ret2; folio_unlock(folio); } if (xa_has_values) clear_shadow_entries(mapping, indices[0], indices[nr-1]); folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); cond_resched(); } /* * For DAX we invalidate page tables after invalidating page cache. We * could invalidate page tables while invalidating each entry however * that would be expensive. And doing range unmapping before doesn't * work as we have no cheap way to find whether page cache entry didn't * get remapped later. */ if (dax_mapping(mapping)) { unmap_mapping_pages(mapping, start, end - start + 1, false); } return ret; } EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); /** * invalidate_inode_pages2 - remove all pages from an address_space * @mapping: the address_space * * Any pages which are found to be mapped into pagetables are unmapped prior to * invalidation. * * Return: -EBUSY if any pages could not be invalidated. */ int invalidate_inode_pages2(struct address_space *mapping) { return invalidate_inode_pages2_range(mapping, 0, -1); } EXPORT_SYMBOL_GPL(invalidate_inode_pages2); /** * truncate_pagecache - unmap and remove pagecache that has been truncated * @inode: inode * @newsize: new file size * * inode's new i_size must already be written before truncate_pagecache * is called. * * This function should typically be called before the filesystem * releases resources associated with the freed range (eg. deallocates * blocks). This way, pagecache will always stay logically coherent * with on-disk format, and the filesystem would not have to deal with * situations such as writepage being called for a page that has already * had its underlying blocks deallocated. */ void truncate_pagecache(struct inode *inode, loff_t newsize) { struct address_space *mapping = inode->i_mapping; loff_t holebegin = round_up(newsize, PAGE_SIZE); /* * unmap_mapping_range is called twice, first simply for * efficiency so that truncate_inode_pages does fewer * single-page unmaps. However after this first call, and * before truncate_inode_pages finishes, it is possible for * private pages to be COWed, which remain after * truncate_inode_pages finishes, hence the second * unmap_mapping_range call must be made for correctness. */ unmap_mapping_range(mapping, holebegin, 0, 1); truncate_inode_pages(mapping, newsize); unmap_mapping_range(mapping, holebegin, 0, 1); } EXPORT_SYMBOL(truncate_pagecache); /** * truncate_setsize - update inode and pagecache for a new file size * @inode: inode * @newsize: new file size * * truncate_setsize updates i_size and performs pagecache truncation (if * necessary) to @newsize. It will be typically be called from the filesystem's * setattr function when ATTR_SIZE is passed in. * * Must be called with a lock serializing truncates and writes (generally * i_rwsem but e.g. xfs uses a different lock) and before all filesystem * specific block truncation has been performed. 
*/ void truncate_setsize(struct inode *inode, loff_t newsize) { loff_t oldsize = inode->i_size; i_size_write(inode, newsize); if (newsize > oldsize) pagecache_isize_extended(inode, oldsize, newsize); truncate_pagecache(inode, newsize); } EXPORT_SYMBOL(truncate_setsize); /** * pagecache_isize_extended - update pagecache after extension of i_size * @inode: inode for which i_size was extended * @from: original inode size * @to: new inode size * * Handle extension of inode size either caused by extending truncate or * by write starting after current i_size. We mark the page straddling * current i_size RO so that page_mkwrite() is called on the first * write access to the page. The filesystem will update its per-block * information before user writes to the page via mmap after the i_size * has been changed. * * The function must be called after i_size is updated so that page fault * coming after we unlock the folio will already see the new i_size. * The function must be called while we still hold i_rwsem - this not only * makes sure i_size is stable but also that userspace cannot observe new * i_size value before we are prepared to store mmap writes at new inode size. */ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) { int bsize = i_blocksize(inode); loff_t rounded_from; struct folio *folio; WARN_ON(to > inode->i_size); if (from >= to || bsize >= PAGE_SIZE) return; /* Page straddling @from will not have any hole block created? */ rounded_from = round_up(from, bsize); if (to <= rounded_from || !(rounded_from & (PAGE_SIZE - 1))) return; folio = filemap_lock_folio(inode->i_mapping, from / PAGE_SIZE); /* Folio not cached? Nothing to do */ if (IS_ERR(folio)) return; /* * See folio_clear_dirty_for_io() for details why folio_mark_dirty() * is needed. */ if (folio_mkclean(folio)) folio_mark_dirty(folio); /* * The post-eof range of the folio must be zeroed before it is exposed * to the file. Writeback normally does this, but since i_size has been * increased we handle it here. */ if (folio_test_dirty(folio)) { unsigned int offset, end; offset = from - folio_pos(folio); end = min_t(unsigned int, to - folio_pos(folio), folio_size(folio)); folio_zero_segment(folio, offset, end); } folio_unlock(folio); folio_put(folio); } EXPORT_SYMBOL(pagecache_isize_extended); /** * truncate_pagecache_range - unmap and remove pagecache that is hole-punched * @inode: inode * @lstart: offset of beginning of hole * @lend: offset of last byte of hole * * This function should typically be called before the filesystem * releases resources associated with the freed range (eg. deallocates * blocks). This way, pagecache will always stay logically coherent * with on-disk format, and the filesystem would not have to deal with * situations such as writepage being called for a page that has already * had its underlying blocks deallocated. */ void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend) { struct address_space *mapping = inode->i_mapping; loff_t unmap_start = round_up(lstart, PAGE_SIZE); loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1; /* * This rounding is currently just for example: unmap_mapping_range * expands its hole outwards, whereas we want it to contract the hole * inwards. However, existing callers of truncate_pagecache_range are * doing their own page rounding first. Note that unmap_mapping_range * allows holelen 0 for all, and we allow lend -1 for end of file. 
*/ /* * Unlike in truncate_pagecache, unmap_mapping_range is called only * once (before truncating pagecache), and without "even_cows" flag: * hole-punching should not remove private COWed pages from the hole. */ if ((u64)unmap_end > (u64)unmap_start) unmap_mapping_range(mapping, unmap_start, 1 + unmap_end - unmap_start, 0); truncate_inode_pages_range(mapping, lstart, lend); } EXPORT_SYMBOL(truncate_pagecache_range); |
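As a rough illustration of the ordering rules documented above for truncate_setsize() and truncate_pagecache_range(), a simplified filesystem path might look like this; example_setsize and example_punch_hole are hypothetical, and the caller is assumed to hold i_rwsem:

static void example_setsize(struct inode *inode, loff_t newsize)
{
	/* Updates i_size, then truncates/zeroes the page cache. Any
	 * filesystem-specific block freeing happens after this call. */
	truncate_setsize(inode, newsize);
}

static void example_punch_hole(struct inode *inode, loff_t lstart, loff_t lend)
{
	/* Remove page cache over the hole (lend is the last byte, inclusive)
	 * before freeing blocks, so the cache never refers to deallocated
	 * on-disk data. */
	truncate_pagecache_range(inode, lstart, lend);
}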
| 12 12 11 1 3 2 12 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PERCPU_RWSEM_H #define _LINUX_PERCPU_RWSEM_H #include <linux/atomic.h> #include <linux/percpu.h> #include <linux/rcuwait.h> #include <linux/wait.h> #include <linux/rcu_sync.h> #include <linux/lockdep.h> #include <linux/cleanup.h> struct percpu_rw_semaphore { struct rcu_sync rss; unsigned int __percpu *read_count; struct rcuwait writer; wait_queue_head_t waiters; atomic_t block; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif }; #ifdef CONFIG_DEBUG_LOCK_ALLOC #define __PERCPU_RWSEM_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }, #else #define __PERCPU_RWSEM_DEP_MAP_INIT(lockname) #endif #define __DEFINE_PERCPU_RWSEM(name, is_static) \ static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name); \ is_static struct percpu_rw_semaphore name = { \ .rss = __RCU_SYNC_INITIALIZER(name.rss), \ .read_count = &__percpu_rwsem_rc_##name, \ .writer = __RCUWAIT_INITIALIZER(name.writer), \ .waiters = __WAIT_QUEUE_HEAD_INITIALIZER(name.waiters), \ .block = ATOMIC_INIT(0), \ __PERCPU_RWSEM_DEP_MAP_INIT(name) \ } #define DEFINE_PERCPU_RWSEM(name) \ __DEFINE_PERCPU_RWSEM(name, /* not static */) #define DEFINE_STATIC_PERCPU_RWSEM(name) \ __DEFINE_PERCPU_RWSEM(name, static) extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool, bool); static inline void percpu_down_read_internal(struct percpu_rw_semaphore *sem, bool freezable) { might_sleep(); rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); preempt_disable(); /* * We are in an RCU-sched read-side critical section, so the writer * cannot both change sem->state from readers_fast and start checking * counters while we are here. So if we see !sem->state, we know that * the writer won't be checking until we're past the preempt_enable() * and that once the synchronize_rcu() is done, the writer will see * anything we did within this RCU-sched read-size critical section. */ if (likely(rcu_sync_is_idle(&sem->rss))) this_cpu_inc(*sem->read_count); else __percpu_down_read(sem, false, freezable); /* Unconditional memory barrier */ /* * The preempt_enable() prevents the compiler from * bleeding the critical section out. */ preempt_enable(); } static inline void percpu_down_read(struct percpu_rw_semaphore *sem) { percpu_down_read_internal(sem, false); } static inline void percpu_down_read_freezable(struct percpu_rw_semaphore *sem, bool freeze) { percpu_down_read_internal(sem, freeze); } static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem) { bool ret = true; preempt_disable(); /* * Same as in percpu_down_read(). */ if (likely(rcu_sync_is_idle(&sem->rss))) this_cpu_inc(*sem->read_count); else ret = __percpu_down_read(sem, true, false); /* Unconditional memory barrier */ preempt_enable(); /* * The barrier() from preempt_enable() prevents the compiler from * bleeding the critical section out. 
*/ if (ret) rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); return ret; } static inline void percpu_up_read(struct percpu_rw_semaphore *sem) { rwsem_release(&sem->dep_map, _RET_IP_); preempt_disable(); /* * Same as in percpu_down_read(). */ if (likely(rcu_sync_is_idle(&sem->rss))) { this_cpu_dec(*sem->read_count); } else { /* * slowpath; reader will only ever wake a single blocked * writer. */ smp_mb(); /* B matches C */ /* * In other words, if they see our decrement (presumably to * aggregate zero, as that is the only time it matters) they * will also see our critical section. */ this_cpu_dec(*sem->read_count); rcuwait_wake_up(&sem->writer); } preempt_enable(); } extern bool percpu_is_read_locked(struct percpu_rw_semaphore *); extern void percpu_down_write(struct percpu_rw_semaphore *); extern void percpu_up_write(struct percpu_rw_semaphore *); DEFINE_GUARD(percpu_read, struct percpu_rw_semaphore *, percpu_down_read(_T), percpu_up_read(_T)) DEFINE_GUARD_COND(percpu_read, _try, percpu_down_read_trylock(_T)) DEFINE_GUARD(percpu_write, struct percpu_rw_semaphore *, percpu_down_write(_T), percpu_up_write(_T)) static inline bool percpu_is_write_locked(struct percpu_rw_semaphore *sem) { return atomic_read(&sem->block); } extern int __percpu_init_rwsem(struct percpu_rw_semaphore *, const char *, struct lock_class_key *); extern void percpu_free_rwsem(struct percpu_rw_semaphore *); #define percpu_init_rwsem(sem) \ ({ \ static struct lock_class_key rwsem_key; \ __percpu_init_rwsem(sem, #sem, &rwsem_key); \ }) #define percpu_rwsem_is_write_held(sem) lockdep_is_held_type(sem, 0) #define percpu_rwsem_is_held(sem) lockdep_is_held(sem) #define percpu_rwsem_assert_held(sem) lockdep_assert_held(sem) static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem, unsigned long ip) { lock_release(&sem->dep_map, ip); } static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem, bool read, unsigned long ip) { lock_acquire(&sem->dep_map, 0, 1, read, 1, NULL, ip); } #endif |
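/*
 * Hedged usage sketch, not part of the header above: a static per-CPU
 * rw-semaphore protecting a read-mostly value. example_sem, example_config
 * and the accessors are illustrative names only.
 */
DEFINE_STATIC_PERCPU_RWSEM(example_sem);

static int example_config;

static int example_read_config(void)
{
	int val;

	/* Fast path: usually just a preempt-disabled per-CPU increment. */
	percpu_down_read(&example_sem);
	val = example_config;
	percpu_up_read(&example_sem);

	return val;
}

static void example_write_config(int val)
{
	/* Slow path: blocks new readers and waits for existing ones. */
	percpu_down_write(&example_sem);
	example_config = val;
	percpu_up_write(&example_sem);
}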
908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 | /* * Compressed rom filesystem for Linux. * * Copyright (C) 1999 Linus Torvalds. * * This file is released under the GPL. */ /* * These are the VFS interfaces to the compressed rom filesystem. * The actual compression is based on zlib, see the other files. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/filelock.h> #include <linux/pagemap.h> #include <linux/ramfs.h> #include <linux/init.h> #include <linux/string.h> #include <linux/blkdev.h> #include <linux/mtd/mtd.h> #include <linux/mtd/super.h> #include <linux/fs_context.h> #include <linux/slab.h> #include <linux/vfs.h> #include <linux/mutex.h> #include <uapi/linux/cramfs_fs.h> #include <linux/uaccess.h> #include "internal.h" /* * cramfs super-block data in memory */ struct cramfs_sb_info { unsigned long magic; unsigned long size; unsigned long blocks; unsigned long files; unsigned long flags; void *linear_virt_addr; resource_size_t linear_phys_addr; size_t mtd_point_size; }; static inline struct cramfs_sb_info *CRAMFS_SB(struct super_block *sb) { return sb->s_fs_info; } static const struct super_operations cramfs_ops; static const struct inode_operations cramfs_dir_inode_operations; static const struct file_operations cramfs_directory_operations; static const struct file_operations cramfs_physmem_fops; static const struct address_space_operations cramfs_aops; static DEFINE_MUTEX(read_mutex); /* These macros may change in future, to provide better st_ino semantics. */ #define OFFSET(x) ((x)->i_ino) static unsigned long cramino(const struct cramfs_inode *cino, unsigned int offset) { if (!cino->offset) return offset + 1; if (!cino->size) return offset + 1; /* * The file mode test fixes buggy mkcramfs implementations where * cramfs_inode->offset is set to a non zero value for entries * which did not contain data, like devices node and fifos. 
*/ switch (cino->mode & S_IFMT) { case S_IFREG: case S_IFDIR: case S_IFLNK: return cino->offset << 2; default: break; } return offset + 1; } static struct inode *get_cramfs_inode(struct super_block *sb, const struct cramfs_inode *cramfs_inode, unsigned int offset) { struct inode *inode; static struct timespec64 zerotime; inode = iget_locked(sb, cramino(cramfs_inode, offset)); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode_state_read_once(inode) & I_NEW)) return inode; switch (cramfs_inode->mode & S_IFMT) { case S_IFREG: inode->i_fop = &generic_ro_fops; inode->i_data.a_ops = &cramfs_aops; if (IS_ENABLED(CONFIG_CRAMFS_MTD) && CRAMFS_SB(sb)->flags & CRAMFS_FLAG_EXT_BLOCK_POINTERS && CRAMFS_SB(sb)->linear_phys_addr) inode->i_fop = &cramfs_physmem_fops; break; case S_IFDIR: inode->i_op = &cramfs_dir_inode_operations; inode->i_fop = &cramfs_directory_operations; break; case S_IFLNK: inode->i_op = &page_symlink_inode_operations; inode_nohighmem(inode); inode->i_data.a_ops = &cramfs_aops; break; case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: init_special_inode(inode, cramfs_inode->mode, old_decode_dev(cramfs_inode->size)); break; default: printk(KERN_DEBUG "CRAMFS: Invalid file type 0%04o for inode %llu.\n", inode->i_mode, inode->i_ino); iget_failed(inode); return ERR_PTR(-EIO); } inode->i_mode = cramfs_inode->mode; i_uid_write(inode, cramfs_inode->uid); i_gid_write(inode, cramfs_inode->gid); /* if the lower 2 bits are zero, the inode contains data */ if (!(inode->i_ino & 3)) { inode->i_size = cramfs_inode->size; inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; } /* Struct copy intentional */ inode_set_mtime_to_ts(inode, inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, zerotime))); /* inode->i_nlink is left 1 - arguably wrong for directories, but it's the best we can do without reading the directory contents. 1 yields the right result in GNU find, even without -noleaf option. */ unlock_new_inode(inode); return inode; } /* * We have our own block cache: don't fill up the buffer cache * with the rom-image, because the way the filesystem is set * up the accesses should be fairly regular and cached in the * page cache and dentry tree anyway.. * * This also acts as a way to guarantee contiguous areas of up to * BLKS_PER_BUF*PAGE_SIZE, so that the caller doesn't need to * worry about end-of-buffer issues even when decompressing a full * page cache. * * Note: This is all optimized away at compile time when * CONFIG_CRAMFS_BLOCKDEV=n. */ #define READ_BUFFERS (2) /* NEXT_BUFFER(): Loop over [0..(READ_BUFFERS-1)]. */ #define NEXT_BUFFER(_ix) ((_ix) ^ 1) /* * BLKS_PER_BUF_SHIFT should be at least 2 to allow for "compressed" * data that takes up more space than the original and with unlucky * alignment. */ #define BLKS_PER_BUF_SHIFT (2) #define BLKS_PER_BUF (1 << BLKS_PER_BUF_SHIFT) #define BUFFER_SIZE (BLKS_PER_BUF*PAGE_SIZE) static unsigned char read_buffers[READ_BUFFERS][BUFFER_SIZE]; static unsigned buffer_blocknr[READ_BUFFERS]; static struct super_block *buffer_dev[READ_BUFFERS]; static int next_buffer; /* * Populate our block cache and return a pointer to it. */ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset, unsigned int len) { struct address_space *mapping = sb->s_bdev->bd_mapping; struct file_ra_state ra = {}; struct page *pages[BLKS_PER_BUF]; unsigned i, blocknr, buffer; unsigned long devsize; char *data; if (!len) return NULL; blocknr = offset >> PAGE_SHIFT; offset &= PAGE_SIZE - 1; /* Check if an existing buffer already has the data.. 
*/ for (i = 0; i < READ_BUFFERS; i++) { unsigned int blk_offset; if (buffer_dev[i] != sb) continue; if (blocknr < buffer_blocknr[i]) continue; blk_offset = (blocknr - buffer_blocknr[i]) << PAGE_SHIFT; blk_offset += offset; if (blk_offset > BUFFER_SIZE || blk_offset + len > BUFFER_SIZE) continue; return read_buffers[i] + blk_offset; } devsize = bdev_nr_bytes(sb->s_bdev) >> PAGE_SHIFT; /* Ok, read in BLKS_PER_BUF pages completely first. */ file_ra_state_init(&ra, mapping); page_cache_sync_readahead(mapping, &ra, NULL, blocknr, BLKS_PER_BUF); for (i = 0; i < BLKS_PER_BUF; i++) { struct page *page = NULL; if (blocknr + i < devsize) { page = read_mapping_page(mapping, blocknr + i, NULL); /* synchronous error? */ if (IS_ERR(page)) page = NULL; } pages[i] = page; } buffer = next_buffer; next_buffer = NEXT_BUFFER(buffer); buffer_blocknr[buffer] = blocknr; buffer_dev[buffer] = sb; data = read_buffers[buffer]; for (i = 0; i < BLKS_PER_BUF; i++) { struct page *page = pages[i]; if (page) { memcpy_from_page(data, page, 0, PAGE_SIZE); put_page(page); } else memset(data, 0, PAGE_SIZE); data += PAGE_SIZE; } return read_buffers[buffer] + offset; } /* * Return a pointer to the linearly addressed cramfs image in memory. */ static void *cramfs_direct_read(struct super_block *sb, unsigned int offset, unsigned int len) { struct cramfs_sb_info *sbi = CRAMFS_SB(sb); if (!len) return NULL; if (len > sbi->size || offset > sbi->size - len) return page_address(ZERO_PAGE(0)); return sbi->linear_virt_addr + offset; } /* * Returns a pointer to a buffer containing at least LEN bytes of * filesystem starting at byte offset OFFSET into the filesystem. */ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned int len) { struct cramfs_sb_info *sbi = CRAMFS_SB(sb); if (IS_ENABLED(CONFIG_CRAMFS_MTD) && sbi->linear_virt_addr) return cramfs_direct_read(sb, offset, len); else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV)) return cramfs_blkdev_read(sb, offset, len); else return NULL; } /* * For a mapping to be possible, we need a range of uncompressed and * contiguous blocks. Return the offset for the first block and number of * valid blocks for which that is true, or zero otherwise. */ static u32 cramfs_get_block_range(struct inode *inode, u32 pgoff, u32 *pages) { struct cramfs_sb_info *sbi = CRAMFS_SB(inode->i_sb); int i; u32 *blockptrs, first_block_addr; /* * We can dereference memory directly here as this code may be * reached only when there is a direct filesystem image mapping * available in memory. */ blockptrs = (u32 *)(sbi->linear_virt_addr + OFFSET(inode) + pgoff * 4); first_block_addr = blockptrs[0] & ~CRAMFS_BLK_FLAGS; i = 0; do { u32 block_off = i * (PAGE_SIZE >> CRAMFS_BLK_DIRECT_PTR_SHIFT); u32 expect = (first_block_addr + block_off) | CRAMFS_BLK_FLAG_DIRECT_PTR | CRAMFS_BLK_FLAG_UNCOMPRESSED; if (blockptrs[i] != expect) { pr_debug("range: block %d/%d got %#x expects %#x\n", pgoff+i, pgoff + *pages - 1, blockptrs[i], expect); if (i == 0) return 0; break; } } while (++i < *pages); *pages = i; return first_block_addr << CRAMFS_BLK_DIRECT_PTR_SHIFT; } #ifdef CONFIG_MMU /* * Return true if the last page of a file in the filesystem image contains * some other data that doesn't belong to that file. It is assumed that the * last block is CRAMFS_BLK_FLAG_DIRECT_PTR | CRAMFS_BLK_FLAG_UNCOMPRESSED * (verified by cramfs_get_block_range() and directly accessible in memory. 
*/ static bool cramfs_last_page_is_shared(struct inode *inode) { struct cramfs_sb_info *sbi = CRAMFS_SB(inode->i_sb); u32 partial, last_page, blockaddr, *blockptrs; char *tail_data; partial = offset_in_page(inode->i_size); if (!partial) return false; last_page = inode->i_size >> PAGE_SHIFT; blockptrs = (u32 *)(sbi->linear_virt_addr + OFFSET(inode)); blockaddr = blockptrs[last_page] & ~CRAMFS_BLK_FLAGS; blockaddr <<= CRAMFS_BLK_DIRECT_PTR_SHIFT; tail_data = sbi->linear_virt_addr + blockaddr + partial; return memchr_inv(tail_data, 0, PAGE_SIZE - partial) ? true : false; } static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); struct cramfs_sb_info *sbi = CRAMFS_SB(inode->i_sb); unsigned int pages, max_pages, offset; unsigned long address, pgoff = vma->vm_pgoff; char *bailout_reason; int ret; ret = generic_file_readonly_mmap(file, vma); if (ret) return ret; /* * Now try to pre-populate ptes for this vma with a direct * mapping avoiding memory allocation when possible. */ /* Could COW work here? */ bailout_reason = "vma is writable"; if (vma->vm_flags & VM_WRITE) goto bailout; max_pages = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; bailout_reason = "beyond file limit"; if (pgoff >= max_pages) goto bailout; pages = min(vma_pages(vma), max_pages - pgoff); offset = cramfs_get_block_range(inode, pgoff, &pages); bailout_reason = "unsuitable block layout"; if (!offset) goto bailout; address = sbi->linear_phys_addr + offset; bailout_reason = "data is not page aligned"; if (!PAGE_ALIGNED(address)) goto bailout; /* Don't map the last page if it contains some other data */ if (pgoff + pages == max_pages && cramfs_last_page_is_shared(inode)) { pr_debug("mmap: %pD: last page is shared\n", file); pages--; } if (!pages) { bailout_reason = "no suitable block remaining"; goto bailout; } if (pages == vma_pages(vma)) { /* * The entire vma is mappable. remap_pfn_range() will * make it distinguishable from a non-direct mapping * in /proc/<pid>/maps by substituting the file offset * with the actual physical address. */ ret = remap_pfn_range(vma, vma->vm_start, address >> PAGE_SHIFT, pages * PAGE_SIZE, vma->vm_page_prot); } else { /* * Let's create a mixed map if we can't map it all. * The normal paging machinery will take care of the * unpopulated ptes via cramfs_read_folio(). */ int i; vm_flags_set(vma, VM_MIXEDMAP); for (i = 0; i < pages && !ret; i++) { vm_fault_t vmf; unsigned long off = i * PAGE_SIZE; vmf = vmf_insert_mixed(vma, vma->vm_start + off, PHYS_PFN(address + off)); if (vmf & VM_FAULT_ERROR) ret = vm_fault_to_errno(vmf, 0); } } if (!ret) pr_debug("mapped %pD[%lu] at 0x%08lx (%u/%lu pages) " "to vma 0x%08lx, page_prot 0x%llx\n", file, pgoff, address, pages, vma_pages(vma), vma->vm_start, (unsigned long long)pgprot_val(vma->vm_page_prot)); return ret; bailout: pr_debug("%pD[%lu]: direct mmap impossible: %s\n", file, pgoff, bailout_reason); /* Didn't manage any direct map, but normal paging is still possible */ return 0; } #else /* CONFIG_MMU */ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) { return is_nommu_shared_mapping(vma->vm_flags) ? 
0 : -ENOSYS; } static unsigned long cramfs_physmem_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct cramfs_sb_info *sbi = CRAMFS_SB(sb); unsigned int pages, block_pages, max_pages, offset; pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; max_pages = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; if (pgoff >= max_pages || pages > max_pages - pgoff) return -EINVAL; block_pages = pages; offset = cramfs_get_block_range(inode, pgoff, &block_pages); if (!offset || block_pages != pages) return -ENOSYS; addr = sbi->linear_phys_addr + offset; pr_debug("get_unmapped for %pD ofs %#lx siz %lu at 0x%08lx\n", file, pgoff*PAGE_SIZE, len, addr); return addr; } static unsigned int cramfs_physmem_mmap_capabilities(struct file *file) { return NOMMU_MAP_COPY | NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_EXEC; } #endif /* CONFIG_MMU */ static const struct file_operations cramfs_physmem_fops = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .splice_read = filemap_splice_read, .mmap = cramfs_physmem_mmap, #ifndef CONFIG_MMU .get_unmapped_area = cramfs_physmem_get_unmapped_area, .mmap_capabilities = cramfs_physmem_mmap_capabilities, #endif }; static void cramfs_kill_sb(struct super_block *sb) { struct cramfs_sb_info *sbi = CRAMFS_SB(sb); generic_shutdown_super(sb); if (IS_ENABLED(CONFIG_CRAMFS_MTD) && sb->s_mtd) { if (sbi && sbi->mtd_point_size) mtd_unpoint(sb->s_mtd, 0, sbi->mtd_point_size); put_mtd_device(sb->s_mtd); sb->s_mtd = NULL; } else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) { sync_blockdev(sb->s_bdev); bdev_fput(sb->s_bdev_file); } kfree(sbi); } static int cramfs_reconfigure(struct fs_context *fc) { sync_filesystem(fc->root->d_sb); fc->sb_flags |= SB_RDONLY; return 0; } static int cramfs_read_super(struct super_block *sb, struct fs_context *fc, struct cramfs_super *super) { struct cramfs_sb_info *sbi = CRAMFS_SB(sb); unsigned long root_offset; bool silent = fc->sb_flags & SB_SILENT; /* We don't know the real size yet */ sbi->size = PAGE_SIZE; /* Read the first block and get the superblock from it */ mutex_lock(&read_mutex); memcpy(super, cramfs_read(sb, 0, sizeof(*super)), sizeof(*super)); mutex_unlock(&read_mutex); /* Do sanity checks on the superblock */ if (super->magic != CRAMFS_MAGIC) { /* check for wrong endianness */ if (super->magic == CRAMFS_MAGIC_WEND) { if (!silent) errorfc(fc, "wrong endianness"); return -EINVAL; } /* check at 512 byte offset */ mutex_lock(&read_mutex); memcpy(super, cramfs_read(sb, 512, sizeof(*super)), sizeof(*super)); mutex_unlock(&read_mutex); if (super->magic != CRAMFS_MAGIC) { if (super->magic == CRAMFS_MAGIC_WEND && !silent) errorfc(fc, "wrong endianness"); else if (!silent) errorfc(fc, "wrong magic"); return -EINVAL; } } /* get feature flags first */ if (super->flags & ~CRAMFS_SUPPORTED_FLAGS) { errorfc(fc, "unsupported filesystem features"); return -EINVAL; } /* Check that the root inode is in a sane state */ if (!S_ISDIR(super->root.mode)) { errorfc(fc, "root is not a directory"); return -EINVAL; } /* correct strange, hard-coded permissions of mkcramfs */ super->root.mode |= 0555; root_offset = super->root.offset << 2; if (super->flags & CRAMFS_FLAG_FSID_VERSION_2) { sbi->size = super->size; sbi->blocks = super->fsid.blocks; sbi->files = super->fsid.files; } else { sbi->size = 1<<28; sbi->blocks = 0; sbi->files = 0; } sbi->magic = super->magic; sbi->flags = super->flags; if 
(root_offset == 0) infofc(fc, "empty filesystem"); else if (!(super->flags & CRAMFS_FLAG_SHIFTED_ROOT_OFFSET) && ((root_offset != sizeof(struct cramfs_super)) && (root_offset != 512 + sizeof(struct cramfs_super)))) { errorfc(fc, "bad root offset %lu", root_offset); return -EINVAL; } return 0; } static int cramfs_finalize_super(struct super_block *sb, struct cramfs_inode *cramfs_root) { struct inode *root; /* Set it all up.. */ sb->s_flags |= SB_RDONLY; sb->s_time_min = 0; sb->s_time_max = 0; sb->s_op = &cramfs_ops; root = get_cramfs_inode(sb, cramfs_root, 0); if (IS_ERR(root)) return PTR_ERR(root); sb->s_root = d_make_root(root); if (!sb->s_root) return -ENOMEM; return 0; } static int cramfs_blkdev_fill_super(struct super_block *sb, struct fs_context *fc) { struct cramfs_sb_info *sbi; struct cramfs_super super; int i, err; sbi = kzalloc_obj(struct cramfs_sb_info); if (!sbi) return -ENOMEM; sb->s_fs_info = sbi; /* Invalidate the read buffers on mount: think disk change.. */ for (i = 0; i < READ_BUFFERS; i++) buffer_blocknr[i] = -1; err = cramfs_read_super(sb, fc, &super); if (err) return err; return cramfs_finalize_super(sb, &super.root); } static int cramfs_mtd_fill_super(struct super_block *sb, struct fs_context *fc) { struct cramfs_sb_info *sbi; struct cramfs_super super; int err; sbi = kzalloc_obj(struct cramfs_sb_info); if (!sbi) return -ENOMEM; sb->s_fs_info = sbi; /* Map only one page for now. Will remap it when fs size is known. */ err = mtd_point(sb->s_mtd, 0, PAGE_SIZE, &sbi->mtd_point_size, &sbi->linear_virt_addr, &sbi->linear_phys_addr); if (err || sbi->mtd_point_size != PAGE_SIZE) { pr_err("unable to get direct memory access to mtd:%s\n", sb->s_mtd->name); return err ? : -ENODATA; } pr_info("checking physical address %pap for linear cramfs image\n", &sbi->linear_phys_addr); err = cramfs_read_super(sb, fc, &super); if (err) return err; /* Remap the whole filesystem now */ pr_info("linear cramfs image on mtd:%s appears to be %lu KB in size\n", sb->s_mtd->name, sbi->size/1024); mtd_unpoint(sb->s_mtd, 0, PAGE_SIZE); err = mtd_point(sb->s_mtd, 0, sbi->size, &sbi->mtd_point_size, &sbi->linear_virt_addr, &sbi->linear_phys_addr); if (err || sbi->mtd_point_size != sbi->size) { pr_err("unable to get direct memory access to mtd:%s\n", sb->s_mtd->name); return err ? : -ENODATA; } return cramfs_finalize_super(sb, &super.root); } static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; u64 id = 0; if (sb->s_bdev) id = huge_encode_dev(sb->s_bdev->bd_dev); else if (sb->s_dev) id = huge_encode_dev(sb->s_dev); buf->f_type = CRAMFS_MAGIC; buf->f_bsize = PAGE_SIZE; buf->f_blocks = CRAMFS_SB(sb)->blocks; buf->f_bfree = 0; buf->f_bavail = 0; buf->f_files = CRAMFS_SB(sb)->files; buf->f_ffree = 0; buf->f_fsid = u64_to_fsid(id); buf->f_namelen = CRAMFS_MAXPATHLEN; return 0; } /* * Read a cramfs directory entry. */ static int cramfs_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; char *buf; unsigned int offset; /* Offset within the thing. 
*/ if (ctx->pos >= inode->i_size) return 0; offset = ctx->pos; /* Directory entries are always 4-byte aligned */ if (offset & 3) return -EINVAL; buf = kmalloc(CRAMFS_MAXPATHLEN, GFP_KERNEL); if (!buf) return -ENOMEM; while (offset < inode->i_size) { struct cramfs_inode *de; unsigned long nextoffset; char *name; ino_t ino; umode_t mode; int namelen; mutex_lock(&read_mutex); de = cramfs_read(sb, OFFSET(inode) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN); name = (char *)(de+1); /* * Namelengths on disk are shifted by two * and the name padded out to 4-byte boundaries * with zeroes. */ namelen = de->namelen << 2; memcpy(buf, name, namelen); ino = cramino(de, OFFSET(inode) + offset); mode = de->mode; mutex_unlock(&read_mutex); nextoffset = offset + sizeof(*de) + namelen; for (;;) { if (!namelen) { kfree(buf); return -EIO; } if (buf[namelen-1]) break; namelen--; } if (!dir_emit(ctx, buf, namelen, ino, mode >> 12)) break; ctx->pos = offset = nextoffset; } kfree(buf); return 0; } /* * Lookup and fill in the inode data.. */ static struct dentry *cramfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { unsigned int offset = 0; struct inode *inode = NULL; int sorted; mutex_lock(&read_mutex); sorted = CRAMFS_SB(dir->i_sb)->flags & CRAMFS_FLAG_SORTED_DIRS; while (offset < dir->i_size) { struct cramfs_inode *de; char *name; int namelen, retval; int dir_off = OFFSET(dir) + offset; de = cramfs_read(dir->i_sb, dir_off, sizeof(*de)+CRAMFS_MAXPATHLEN); name = (char *)(de+1); /* Try to take advantage of sorted directories */ if (sorted && (dentry->d_name.name[0] < name[0])) break; namelen = de->namelen << 2; offset += sizeof(*de) + namelen; /* Quick check that the name is roughly the right length */ if (((dentry->d_name.len + 3) & ~3) != namelen) continue; for (;;) { if (!namelen) { inode = ERR_PTR(-EIO); goto out; } if (name[namelen-1]) break; namelen--; } if (namelen != dentry->d_name.len) continue; retval = memcmp(dentry->d_name.name, name, namelen); if (retval > 0) continue; if (!retval) { inode = get_cramfs_inode(dir->i_sb, de, dir_off); break; } /* else (retval < 0) */ if (sorted) break; } out: mutex_unlock(&read_mutex); return d_splice_alias(inode, dentry); } static int cramfs_read_folio(struct file *file, struct folio *folio) { struct inode *inode = folio->mapping->host; u32 maxblock; int bytes_filled; void *pgdata; bool success = false; maxblock = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; bytes_filled = 0; pgdata = kmap_local_folio(folio, 0); if (folio->index < maxblock) { struct super_block *sb = inode->i_sb; u32 blkptr_offset = OFFSET(inode) + folio->index * 4; u32 block_ptr, block_start, block_len; bool uncompressed, direct; mutex_lock(&read_mutex); block_ptr = *(u32 *) cramfs_read(sb, blkptr_offset, 4); uncompressed = (block_ptr & CRAMFS_BLK_FLAG_UNCOMPRESSED); direct = (block_ptr & CRAMFS_BLK_FLAG_DIRECT_PTR); block_ptr &= ~CRAMFS_BLK_FLAGS; if (direct) { /* * The block pointer is an absolute start pointer, * shifted by 2 bits. The size is included in the * first 2 bytes of the data block when compressed, * or PAGE_SIZE otherwise. */ block_start = block_ptr << CRAMFS_BLK_DIRECT_PTR_SHIFT; if (uncompressed) { block_len = PAGE_SIZE; /* if last block: cap to file length */ if (folio->index == maxblock - 1) block_len = offset_in_page(inode->i_size); } else { block_len = *(u16 *) cramfs_read(sb, block_start, 2); block_start += 2; } } else { /* * The block pointer indicates one past the end of * the current block (start of next block). 
If this * is the first block then it starts where the block * pointer table ends, otherwise its start comes * from the previous block's pointer. */ block_start = OFFSET(inode) + maxblock * 4; if (folio->index) block_start = *(u32 *) cramfs_read(sb, blkptr_offset - 4, 4); /* Beware... previous ptr might be a direct ptr */ if (unlikely(block_start & CRAMFS_BLK_FLAG_DIRECT_PTR)) { /* See comments on earlier code. */ u32 prev_start = block_start; block_start = prev_start & ~CRAMFS_BLK_FLAGS; block_start <<= CRAMFS_BLK_DIRECT_PTR_SHIFT; if (prev_start & CRAMFS_BLK_FLAG_UNCOMPRESSED) { block_start += PAGE_SIZE; } else { block_len = *(u16 *) cramfs_read(sb, block_start, 2); block_start += 2 + block_len; } } block_start &= ~CRAMFS_BLK_FLAGS; block_len = block_ptr - block_start; } if (block_len == 0) ; /* hole */ else if (unlikely(block_len > 2*PAGE_SIZE || (uncompressed && block_len > PAGE_SIZE))) { mutex_unlock(&read_mutex); pr_err("bad data blocksize %u\n", block_len); goto err; } else if (uncompressed) { memcpy(pgdata, cramfs_read(sb, block_start, block_len), block_len); bytes_filled = block_len; } else { bytes_filled = cramfs_uncompress_block(pgdata, PAGE_SIZE, cramfs_read(sb, block_start, block_len), block_len); } mutex_unlock(&read_mutex); if (unlikely(bytes_filled < 0)) goto err; } memset(pgdata + bytes_filled, 0, PAGE_SIZE - bytes_filled); flush_dcache_folio(folio); success = true; err: kunmap_local(pgdata); folio_end_read(folio, success); return 0; } static const struct address_space_operations cramfs_aops = { .read_folio = cramfs_read_folio }; /* * Our operations: */ /* * A directory can only readdir */ static const struct file_operations cramfs_directory_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = cramfs_readdir, .setlease = generic_setlease, }; static const struct inode_operations cramfs_dir_inode_operations = { .lookup = cramfs_lookup, }; static const struct super_operations cramfs_ops = { .statfs = cramfs_statfs, }; static int cramfs_get_tree(struct fs_context *fc) { int ret = -ENOPROTOOPT; if (IS_ENABLED(CONFIG_CRAMFS_MTD)) { ret = get_tree_mtd(fc, cramfs_mtd_fill_super); if (!ret) return 0; } if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV)) ret = get_tree_bdev(fc, cramfs_blkdev_fill_super); return ret; } static const struct fs_context_operations cramfs_context_ops = { .get_tree = cramfs_get_tree, .reconfigure = cramfs_reconfigure, }; /* * Set up the filesystem mount context. */ static int cramfs_init_fs_context(struct fs_context *fc) { fc->ops = &cramfs_context_ops; return 0; } static struct file_system_type cramfs_fs_type = { .owner = THIS_MODULE, .name = "cramfs", .init_fs_context = cramfs_init_fs_context, .kill_sb = cramfs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("cramfs"); static int __init init_cramfs_fs(void) { int rv; rv = cramfs_uncompress_init(); if (rv < 0) return rv; rv = register_filesystem(&cramfs_fs_type); if (rv < 0) cramfs_uncompress_exit(); return rv; } static void __exit exit_cramfs_fs(void) { cramfs_uncompress_exit(); unregister_filesystem(&cramfs_fs_type); } module_init(init_cramfs_fs) module_exit(exit_cramfs_fs) MODULE_DESCRIPTION("Compressed ROM file system support"); MODULE_LICENSE("GPL"); |
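/*
 * Hedged sketch, not part of the file above: how a raw on-disk cramfs
 * block pointer decodes, mirroring the logic in cramfs_read_folio().
 * example_describe_blkptr is an illustrative helper, not a cramfs API.
 */
static void example_describe_blkptr(u32 block_ptr)
{
	bool uncompressed = block_ptr & CRAMFS_BLK_FLAG_UNCOMPRESSED;
	bool direct = block_ptr & CRAMFS_BLK_FLAG_DIRECT_PTR;
	u32 val = block_ptr & ~CRAMFS_BLK_FLAGS;

	if (direct)
		/* Extended pointer: absolute block start, shifted by 2 bits. */
		pr_debug("block starts at byte %u (%scompressed)\n",
			 val << CRAMFS_BLK_DIRECT_PTR_SHIFT,
			 uncompressed ? "un" : "");
	else
		/* Legacy pointer: one past the end of this compressed block. */
		pr_debug("block ends at byte %u\n", val);
}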
| 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RCULIST_BL_H #define _LINUX_RCULIST_BL_H /* * RCU-protected bl list version. See include/linux/list_bl.h. */ #include <linux/list_bl.h> #include <linux/rcupdate.h> /* return the first ptr or next element in an RCU protected list */ #define hlist_bl_first_rcu(head) \ (*((struct hlist_bl_node __rcu **)(&(head)->first))) #define hlist_bl_next_rcu(node) \ (*((struct hlist_bl_node __rcu **)(&(node)->next))) static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h, struct hlist_bl_node *n) { LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) != LIST_BL_LOCKMASK); rcu_assign_pointer(hlist_bl_first_rcu(h), (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK)); } #define hlist_bl_first_rcu_dereference(head) \ ({ \ struct hlist_bl_head *__head = (head); \ \ (struct hlist_bl_node *) \ ((unsigned long)rcu_dereference_check(hlist_bl_first_rcu(__head), \ hlist_bl_is_locked(__head)) & \ ~LIST_BL_LOCKMASK); \ }) /** * hlist_bl_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. * * Note: hlist_bl_unhashed() on entry does not return true after this, * the entry is in an undefined state. It is useful for RCU based * lockfree traversal. * * In particular, it means that we can not poison the forward * pointers that may still be used for walking the hash list. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_bl_add_head_rcu() * or hlist_bl_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_bl_for_each_entry(). */ static inline void hlist_bl_del_rcu(struct hlist_bl_node *n) { __hlist_bl_del(n); n->pprev = LIST_POISON2; } /** * hlist_bl_add_head_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist_bl, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_bl_add_head_rcu() * or hlist_bl_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_bl_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). 
*/ static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n, struct hlist_bl_head *h) { struct hlist_bl_node *first; /* don't need hlist_bl_first_rcu* because we're under lock */ first = hlist_bl_first(h); n->next = first; if (first) first->pprev = &n->next; n->pprev = &h->first; /* need _rcu because we can have concurrent lock free readers */ hlist_bl_set_first_rcu(h, n); } /** * hlist_bl_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_bl_node to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_bl_node within the struct. * */ #define hlist_bl_for_each_entry_rcu(tpos, pos, head, member) \ for (pos = hlist_bl_first_rcu_dereference(head); \ pos && \ ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference_raw(hlist_bl_next_rcu(pos))) /** * hlist_bl_for_each_entry_continue_rcu - continue iteration over list of given * type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_bl_node to use as a loop cursor. * @member: the name of the hlist_bl_node within the struct. * * Continue to iterate over list of given type, continuing after * the current position which must have been in the list when the RCU read * lock was taken. * This would typically require either that you obtained the node from a * previous walk of the list in the same RCU read-side critical section, or * that you held some sort of non-RCU reference (such as a reference count) * to keep the node alive *and* in the list. */ #define hlist_bl_for_each_entry_continue_rcu(tpos, pos, member) \ for (pos = rcu_dereference_raw(hlist_bl_next_rcu(&(tpos)->member)); \ pos && \ ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference_raw(hlist_bl_next_rcu(pos))) #endif |
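/*
 * Hedged usage sketch, not part of the header above: a hypothetical hash
 * bucket whose writers serialize on the head's bit lock while readers walk
 * the chain locklessly under RCU. struct example_obj, example_insert and
 * example_contains are illustrative names, not kernel APIs.
 */
struct example_obj {
	int key;
	struct hlist_bl_node node;
};

static void example_insert(struct hlist_bl_head *bucket,
			   struct example_obj *obj)
{
	hlist_bl_lock(bucket);			/* bit 0 of ->first is the lock */
	hlist_bl_add_head_rcu(&obj->node, bucket);
	hlist_bl_unlock(bucket);
}

static bool example_contains(struct hlist_bl_head *bucket, int key)
{
	struct example_obj *obj;
	struct hlist_bl_node *pos;
	bool found = false;

	rcu_read_lock();			/* required for the _rcu traversal */
	hlist_bl_for_each_entry_rcu(obj, pos, bucket, node) {
		if (obj->key == key) {
			found = true;
			break;
		}
	}
	rcu_read_unlock();
	return found;
}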
// SPDX-License-Identifier: GPL-2.0 /* * memory buffer pool support. Such pools are mostly used * for guaranteed, deadlock-free memory allocations during * extreme VM load. 
* * started by Ingo Molnar, Copyright (C) 2001 * debugging by David Rientjes, Copyright (C) 2015 */ #include <linux/fault-inject.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/highmem.h> #include <linux/kasan.h> #include <linux/kmemleak.h> #include <linux/export.h> #include <linux/mempool.h> #include <linux/writeback.h> #include "slab.h" static DECLARE_FAULT_ATTR(fail_mempool_alloc); static DECLARE_FAULT_ATTR(fail_mempool_alloc_bulk); static int __init mempool_faul_inject_init(void) { int error; error = PTR_ERR_OR_ZERO(fault_create_debugfs_attr("fail_mempool_alloc", NULL, &fail_mempool_alloc)); if (error) return error; /* booting will fail on error return here, don't bother to cleanup */ return PTR_ERR_OR_ZERO( fault_create_debugfs_attr("fail_mempool_alloc_bulk", NULL, &fail_mempool_alloc_bulk)); } late_initcall(mempool_faul_inject_init); #ifdef CONFIG_SLUB_DEBUG_ON static void poison_error(struct mempool *pool, void *element, size_t size, size_t byte) { const int nr = pool->curr_nr; const int start = max_t(int, byte - (BITS_PER_LONG / 8), 0); const int end = min_t(int, byte + (BITS_PER_LONG / 8), size); int i; pr_err("BUG: mempool element poison mismatch\n"); pr_err("Mempool %p size %zu\n", pool, size); pr_err(" nr=%d @ %p: %s0x", nr, element, start > 0 ? "... " : ""); for (i = start; i < end; i++) pr_cont("%x ", *(u8 *)(element + i)); pr_cont("%s\n", end < size ? "..." : ""); dump_stack(); } static void __check_element(struct mempool *pool, void *element, size_t size) { u8 *obj = element; size_t i; for (i = 0; i < size; i++) { u8 exp = (i < size - 1) ? POISON_FREE : POISON_END; if (obj[i] != exp) { poison_error(pool, element, size, i); return; } } memset(obj, POISON_INUSE, size); } static void check_element(struct mempool *pool, void *element) { /* Skip checking: KASAN might save its metadata in the element. */ if (kasan_enabled()) return; /* Mempools backed by slab allocator */ if (pool->free == mempool_kfree) { __check_element(pool, element, (size_t)pool->pool_data); } else if (pool->free == mempool_free_slab) { __check_element(pool, element, kmem_cache_size(pool->pool_data)); } else if (pool->free == mempool_free_pages) { /* Mempools backed by page allocator */ int order = (int)(long)pool->pool_data; #ifdef CONFIG_HIGHMEM for (int i = 0; i < (1 << order); i++) { struct page *page = (struct page *)element; void *addr = kmap_local_page(page + i); __check_element(pool, addr, PAGE_SIZE); kunmap_local(addr); } #else void *addr = page_address((struct page *)element); __check_element(pool, addr, PAGE_SIZE << order); #endif } } static void __poison_element(void *element, size_t size) { u8 *obj = element; memset(obj, POISON_FREE, size - 1); obj[size - 1] = POISON_END; } static void poison_element(struct mempool *pool, void *element) { /* Skip poisoning: KASAN might save its metadata in the element. 
*/ if (kasan_enabled()) return; /* Mempools backed by slab allocator */ if (pool->alloc == mempool_kmalloc) { __poison_element(element, (size_t)pool->pool_data); } else if (pool->alloc == mempool_alloc_slab) { __poison_element(element, kmem_cache_size(pool->pool_data)); } else if (pool->alloc == mempool_alloc_pages) { /* Mempools backed by page allocator */ int order = (int)(long)pool->pool_data; #ifdef CONFIG_HIGHMEM for (int i = 0; i < (1 << order); i++) { struct page *page = (struct page *)element; void *addr = kmap_local_page(page + i); __poison_element(addr, PAGE_SIZE); kunmap_local(addr); } #else void *addr = page_address((struct page *)element); __poison_element(addr, PAGE_SIZE << order); #endif } } #else /* CONFIG_SLUB_DEBUG_ON */ static inline void check_element(struct mempool *pool, void *element) { } static inline void poison_element(struct mempool *pool, void *element) { } #endif /* CONFIG_SLUB_DEBUG_ON */ static __always_inline bool kasan_poison_element(struct mempool *pool, void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) return kasan_mempool_poison_object(element); else if (pool->alloc == mempool_alloc_pages) return kasan_mempool_poison_pages(element, (unsigned long)pool->pool_data); return true; } static void kasan_unpoison_element(struct mempool *pool, void *element) { if (pool->alloc == mempool_kmalloc) kasan_mempool_unpoison_object(element, (size_t)pool->pool_data); else if (pool->alloc == mempool_alloc_slab) kasan_mempool_unpoison_object(element, kmem_cache_size(pool->pool_data)); else if (pool->alloc == mempool_alloc_pages) kasan_mempool_unpoison_pages(element, (unsigned long)pool->pool_data); } static __always_inline void add_element(struct mempool *pool, void *element) { BUG_ON(pool->min_nr != 0 && pool->curr_nr >= pool->min_nr); poison_element(pool, element); if (kasan_poison_element(pool, element)) pool->elements[pool->curr_nr++] = element; } static void *remove_element(struct mempool *pool) { void *element = pool->elements[--pool->curr_nr]; BUG_ON(pool->curr_nr < 0); kasan_unpoison_element(pool, element); check_element(pool, element); return element; } /** * mempool_exit - exit a mempool initialized with mempool_init() * @pool: pointer to the memory pool which was initialized with * mempool_init(). * * Free all reserved elements in @pool and @pool itself. This function * only sleeps if the free_fn() function sleeps. * * May be called on a zeroed but uninitialized mempool (i.e. allocated with * kzalloc()). */ void mempool_exit(struct mempool *pool) { while (pool->curr_nr) { void *element = remove_element(pool); pool->free(element, pool->pool_data); } kfree(pool->elements); pool->elements = NULL; } EXPORT_SYMBOL(mempool_exit); /** * mempool_destroy - deallocate a memory pool * @pool: pointer to the memory pool which was allocated via * mempool_create(). * * Free all reserved elements in @pool and @pool itself. This function * only sleeps if the free_fn() function sleeps. 
*/ void mempool_destroy(struct mempool *pool) { if (unlikely(!pool)) return; mempool_exit(pool); kfree(pool); } EXPORT_SYMBOL(mempool_destroy); int mempool_init_node(struct mempool *pool, int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data, gfp_t gfp_mask, int node_id) { spin_lock_init(&pool->lock); pool->min_nr = min_nr; pool->pool_data = pool_data; pool->alloc = alloc_fn; pool->free = free_fn; init_waitqueue_head(&pool->wait); /* * max() used here to ensure storage for at least 1 element to support * zero minimum pool */ pool->elements = kmalloc_array_node(max(1, min_nr), sizeof(void *), gfp_mask, node_id); if (!pool->elements) return -ENOMEM; /* * First pre-allocate the guaranteed number of buffers, * also pre-allocate 1 element for zero minimum pool. */ while (pool->curr_nr < max(1, pool->min_nr)) { void *element; element = pool->alloc(gfp_mask, pool->pool_data); if (unlikely(!element)) { mempool_exit(pool); return -ENOMEM; } add_element(pool, element); } return 0; } EXPORT_SYMBOL(mempool_init_node); /** * mempool_init - initialize a memory pool * @pool: pointer to the memory pool that should be initialized * @min_nr: the minimum number of elements guaranteed to be * allocated for this pool. * @alloc_fn: user-defined element-allocation function. * @free_fn: user-defined element-freeing function. * @pool_data: optional private data available to the user-defined functions. * * Like mempool_create(), but initializes the pool in (i.e. embedded in another * structure). * * Return: %0 on success, negative error code otherwise. */ int mempool_init_noprof(struct mempool *pool, int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data) { return mempool_init_node(pool, min_nr, alloc_fn, free_fn, pool_data, GFP_KERNEL, NUMA_NO_NODE); } EXPORT_SYMBOL(mempool_init_noprof); /** * mempool_create_node - create a memory pool * @min_nr: the minimum number of elements guaranteed to be * allocated for this pool. * @alloc_fn: user-defined element-allocation function. * @free_fn: user-defined element-freeing function. * @pool_data: optional private data available to the user-defined functions. * @gfp_mask: memory allocation flags * @node_id: numa node to allocate on * * this function creates and allocates a guaranteed size, preallocated * memory pool. The pool can be used from the mempool_alloc() and mempool_free() * functions. This function might sleep. Both the alloc_fn() and the free_fn() * functions might sleep - as long as the mempool_alloc() function is not called * from IRQ contexts. * * Return: pointer to the created memory pool object or %NULL on error. */ struct mempool *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data, gfp_t gfp_mask, int node_id) { struct mempool *pool; pool = kmalloc_node_noprof(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id); if (!pool) return NULL; if (mempool_init_node(pool, min_nr, alloc_fn, free_fn, pool_data, gfp_mask, node_id)) { kfree(pool); return NULL; } return pool; } EXPORT_SYMBOL(mempool_create_node_noprof); /** * mempool_resize - resize an existing memory pool * @pool: pointer to the memory pool which was allocated via * mempool_create(). * @new_min_nr: the new minimum number of elements guaranteed to be * allocated for this pool. * * This function shrinks/grows the pool. In the case of growing, * it cannot be guaranteed that the pool will be grown to the new * size immediately, but new mempool_free() calls will refill it. * This function may sleep. 
* * Note, the caller must guarantee that no mempool_destroy is called * while this function is running. mempool_alloc() & mempool_free() * might be called (eg. from IRQ contexts) while this function executes. * * Return: %0 on success, negative error code otherwise. */ int mempool_resize(struct mempool *pool, int new_min_nr) { void *element; void **new_elements; unsigned long flags; BUG_ON(new_min_nr <= 0); might_sleep(); spin_lock_irqsave(&pool->lock, flags); if (new_min_nr <= pool->min_nr) { while (new_min_nr < pool->curr_nr) { element = remove_element(pool); spin_unlock_irqrestore(&pool->lock, flags); pool->free(element, pool->pool_data); spin_lock_irqsave(&pool->lock, flags); } pool->min_nr = new_min_nr; goto out_unlock; } spin_unlock_irqrestore(&pool->lock, flags); /* Grow the pool */ new_elements = kmalloc_objs(*new_elements, new_min_nr); if (!new_elements) return -ENOMEM; spin_lock_irqsave(&pool->lock, flags); if (unlikely(new_min_nr <= pool->min_nr)) { /* Raced, other resize will do our work */ spin_unlock_irqrestore(&pool->lock, flags); kfree(new_elements); goto out; } memcpy(new_elements, pool->elements, pool->curr_nr * sizeof(*new_elements)); kfree(pool->elements); pool->elements = new_elements; pool->min_nr = new_min_nr; while (pool->curr_nr < pool->min_nr) { spin_unlock_irqrestore(&pool->lock, flags); element = pool->alloc(GFP_KERNEL, pool->pool_data); if (!element) goto out; spin_lock_irqsave(&pool->lock, flags); if (pool->curr_nr < pool->min_nr) { add_element(pool, element); } else { spin_unlock_irqrestore(&pool->lock, flags); pool->free(element, pool->pool_data); /* Raced */ goto out; } } out_unlock: spin_unlock_irqrestore(&pool->lock, flags); out: return 0; } EXPORT_SYMBOL(mempool_resize); static unsigned int mempool_alloc_from_pool(struct mempool *pool, void **elems, unsigned int count, unsigned int allocated, gfp_t gfp_mask) { unsigned long flags; unsigned int i; spin_lock_irqsave(&pool->lock, flags); if (unlikely(pool->curr_nr < count - allocated)) goto fail; for (i = 0; i < count; i++) { if (!elems[i]) { elems[i] = remove_element(pool); allocated++; } } spin_unlock_irqrestore(&pool->lock, flags); /* Paired with rmb in mempool_free(), read comment there. */ smp_wmb(); /* * Update the allocation stack trace as this is more useful for * debugging. */ for (i = 0; i < count; i++) kmemleak_update_trace(elems[i]); return allocated; fail: if (gfp_mask & __GFP_DIRECT_RECLAIM) { DEFINE_WAIT(wait); prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); spin_unlock_irqrestore(&pool->lock, flags); /* * Wait for someone else to return an element to @pool, but wake * up occasionally as memory pressure might have reduced even * and the normal allocation in alloc_fn could succeed even if * no element was returned. */ io_schedule_timeout(5 * HZ); finish_wait(&pool->wait, &wait); } else { /* We must not sleep if __GFP_DIRECT_RECLAIM is not set. */ spin_unlock_irqrestore(&pool->lock, flags); } return allocated; } /* * Adjust the gfp flags for mempool allocations, as we never want to dip into * the global emergency reserves or retry in the page allocator. * * The first pass also doesn't want to go reclaim, but the next passes do, so * return a separate subset for that first iteration. 
*/ static inline gfp_t mempool_adjust_gfp(gfp_t *gfp_mask) { *gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; return *gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO); } /** * mempool_alloc_bulk - allocate multiple elements from a memory pool * @pool: pointer to the memory pool * @elems: partially or fully populated elements array * @count: number of entries in @elem that need to be allocated * @allocated: number of entries in @elem already allocated * * Allocate elements for each slot in @elem that is non-%NULL. This is done by * first calling into the alloc_fn supplied at pool initialization time, and * dipping into the reserved pool when alloc_fn fails to allocate an element. * * On return all @count elements in @elems will be populated. * * Return: Always 0. If it wasn't for %$#^$ alloc tags, it would return void. */ int mempool_alloc_bulk_noprof(struct mempool *pool, void **elems, unsigned int count, unsigned int allocated) { gfp_t gfp_mask = GFP_KERNEL; gfp_t gfp_temp = mempool_adjust_gfp(&gfp_mask); unsigned int i = 0; VM_WARN_ON_ONCE(count > pool->min_nr); might_alloc(gfp_mask); /* * If an error is injected, fail all elements in a bulk allocation so * that we stress the multiple elements missing path. */ if (should_fail_ex(&fail_mempool_alloc_bulk, 1, FAULT_NOWARN)) { pr_info("forcing mempool usage for %pS\n", (void *)_RET_IP_); goto use_pool; } repeat_alloc: /* * Try to allocate the elements using the allocation callback first as * that might succeed even when the caller's bulk allocation did not. */ for (i = 0; i < count; i++) { if (elems[i]) continue; elems[i] = pool->alloc(gfp_temp, pool->pool_data); if (unlikely(!elems[i])) goto use_pool; allocated++; } return 0; use_pool: allocated = mempool_alloc_from_pool(pool, elems, count, allocated, gfp_temp); gfp_temp = gfp_mask; goto repeat_alloc; } EXPORT_SYMBOL_GPL(mempool_alloc_bulk_noprof); /** * mempool_alloc - allocate an element from a memory pool * @pool: pointer to the memory pool * @gfp_mask: GFP_* flags. %__GFP_ZERO is not supported. * * Allocate an element from @pool. This is done by first calling into the * alloc_fn supplied at pool initialization time, and dipping into the reserved * pool when alloc_fn fails to allocate an element. * * This function only sleeps if the alloc_fn callback sleeps, or when waiting * for elements to become available in the pool. * * Return: pointer to the allocated element or %NULL when failing to allocate * an element. Allocation failure can only happen when @gfp_mask does not * include %__GFP_DIRECT_RECLAIM. */ void *mempool_alloc_noprof(struct mempool *pool, gfp_t gfp_mask) { gfp_t gfp_temp = mempool_adjust_gfp(&gfp_mask); void *element; VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); might_alloc(gfp_mask); repeat_alloc: if (should_fail_ex(&fail_mempool_alloc, 1, FAULT_NOWARN)) { pr_info("forcing mempool usage for %pS\n", (void *)_RET_IP_); element = NULL; } else { element = pool->alloc(gfp_temp, pool->pool_data); } if (unlikely(!element)) { /* * Try to allocate an element from the pool. * * The first pass won't have __GFP_DIRECT_RECLAIM and won't * sleep in mempool_alloc_from_pool. Retry the allocation * with all flags set in that case. 
*/ if (!mempool_alloc_from_pool(pool, &element, 1, 0, gfp_temp)) { if (gfp_temp != gfp_mask) { gfp_temp = gfp_mask; goto repeat_alloc; } if (gfp_mask & __GFP_DIRECT_RECLAIM) { goto repeat_alloc; } } } return element; } EXPORT_SYMBOL(mempool_alloc_noprof); /** * mempool_alloc_preallocated - allocate an element from preallocated elements * belonging to a memory pool * @pool: pointer to the memory pool * * This function is similar to mempool_alloc(), but it only attempts allocating * an element from the preallocated elements. It only takes a single spinlock_t * and immediately returns if no preallocated elements are available. * * Return: pointer to the allocated element or %NULL if no elements are * available. */ void *mempool_alloc_preallocated(struct mempool *pool) { void *element = NULL; mempool_alloc_from_pool(pool, &element, 1, 0, GFP_NOWAIT); return element; } EXPORT_SYMBOL(mempool_alloc_preallocated); /** * mempool_free_bulk - return elements to a mempool * @pool: pointer to the memory pool * @elems: elements to return * @count: number of elements to return * * Returns a number of elements from the start of @elem to @pool if @pool needs * replenishing and sets their slots in @elem to NULL. Other elements are left * in @elem. * * Return: number of elements transferred to @pool. Elements are always * transferred from the beginning of @elem, so the return value can be used as * an offset into @elem for the freeing the remaining elements in the caller. */ unsigned int mempool_free_bulk(struct mempool *pool, void **elems, unsigned int count) { unsigned long flags; unsigned int freed = 0; bool added = false; /* * Paired with the wmb in mempool_alloc(). The preceding read is * for @element and the following @pool->curr_nr. This ensures * that the visible value of @pool->curr_nr is from after the * allocation of @element. This is necessary for fringe cases * where @element was passed to this task without going through * barriers. * * For example, assume @p is %NULL at the beginning and one task * performs "p = mempool_alloc(...);" while another task is doing * "while (!p) cpu_relax(); mempool_free(p, ...);". This function * may end up using curr_nr value which is from before allocation * of @p without the following rmb. */ smp_rmb(); /* * For correctness, we need a test which is guaranteed to trigger * if curr_nr + #allocated == min_nr. Testing curr_nr < min_nr * without locking achieves that and refilling as soon as possible * is desirable. * * Because curr_nr visible here is always a value after the * allocation of @element, any task which decremented curr_nr below * min_nr is guaranteed to see curr_nr < min_nr unless curr_nr gets * incremented to min_nr afterwards. If curr_nr gets incremented * to min_nr after the allocation of @element, the elements * allocated after that are subject to the same guarantee. * * Waiters happen iff curr_nr is 0 and the above guarantee also * ensures that there will be frees which return elements to the * pool waking up the waiters. * * For zero-minimum pools, curr_nr < min_nr (0 < 0) never succeeds, * so waiters sleeping on pool->wait would never be woken by the * wake-up path of previous test. This explicit check ensures the * allocation of element when both min_nr and curr_nr are 0, and * any active waiters are properly awakened. 
*/ if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) { spin_lock_irqsave(&pool->lock, flags); while (pool->curr_nr < pool->min_nr && freed < count) { add_element(pool, elems[freed++]); added = true; } spin_unlock_irqrestore(&pool->lock, flags); } else if (unlikely(pool->min_nr == 0 && READ_ONCE(pool->curr_nr) == 0)) { /* Handle the min_nr = 0 edge case: */ spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr == 0)) { add_element(pool, elems[freed++]); added = true; } spin_unlock_irqrestore(&pool->lock, flags); } if (unlikely(added) && wq_has_sleeper(&pool->wait)) wake_up(&pool->wait); return freed; } EXPORT_SYMBOL_GPL(mempool_free_bulk); /** * mempool_free - return an element to the pool. * @element: element to return * @pool: pointer to the memory pool * * Returns @element to @pool if it needs replenishing, else frees it using * the free_fn callback in @pool. * * This function only sleeps if the free_fn callback sleeps. */ void mempool_free(void *element, struct mempool *pool) { if (likely(element) && !mempool_free_bulk(pool, &element, 1)) pool->free(element, pool->pool_data); } EXPORT_SYMBOL(mempool_free); /* * A commonly used alloc and free fn. */ void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) { struct kmem_cache *mem = pool_data; VM_BUG_ON(mem->ctor); return kmem_cache_alloc_noprof(mem, gfp_mask); } EXPORT_SYMBOL(mempool_alloc_slab); void mempool_free_slab(void *element, void *pool_data) { struct kmem_cache *mem = pool_data; kmem_cache_free(mem, element); } EXPORT_SYMBOL(mempool_free_slab); /* * A commonly used alloc and free fn that kmalloc/kfrees the amount of memory * specified by pool_data */ void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) { size_t size = (size_t)pool_data; return kmalloc_noprof(size, gfp_mask); } EXPORT_SYMBOL(mempool_kmalloc); void mempool_kfree(void *element, void *pool_data) { kfree(element); } EXPORT_SYMBOL(mempool_kfree); /* * A simple mempool-backed page allocator that allocates pages * of the order specified by pool_data. */ void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data) { int order = (int)(long)pool_data; return alloc_pages_noprof(gfp_mask, order); } EXPORT_SYMBOL(mempool_alloc_pages); void mempool_free_pages(void *element, void *pool_data) { int order = (int)(long)pool_data; __free_pages(element, order); } EXPORT_SYMBOL(mempool_free_pages); |
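/*
 * A minimal usage sketch, not part of this file: how a driver might combine
 * the slab-backed helpers above with a mempool to guarantee forward progress
 * under memory pressure.  The names "struct foo", foo_cache, foo_pool and
 * FOO_MIN_RESERVE are hypothetical and exist only for this illustration; the
 * mempool_create()/mempool_alloc()/mempool_free() calls are the long-standing
 * public API that wraps the alloc/free callbacks defined in this file.
 */
#include <linux/mempool.h>
#include <linux/slab.h>

#define FOO_MIN_RESERVE	4	/* elements kept in reserve for the worst case */

struct foo {
	int id;
};

static struct kmem_cache *foo_cache;
static mempool_t *foo_pool;

static int foo_setup(void)
{
	foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0, 0, NULL);
	if (!foo_cache)
		return -ENOMEM;

	/* Back the pool with the cache via mempool_alloc_slab/mempool_free_slab. */
	foo_pool = mempool_create(FOO_MIN_RESERVE, mempool_alloc_slab,
				  mempool_free_slab, foo_cache);
	if (!foo_pool) {
		kmem_cache_destroy(foo_cache);
		return -ENOMEM;
	}
	return 0;
}

static struct foo *foo_get(void)
{
	/*
	 * With a gfp_mask that allows direct reclaim this cannot fail: when
	 * the slab allocation fails, mempool_alloc() falls back to the
	 * reserved elements and, if necessary, sleeps until one is freed.
	 */
	return mempool_alloc(foo_pool, GFP_NOIO);
}

static void foo_put(struct foo *f)
{
	/* Refills the reserve if it is depleted, otherwise frees to the cache. */
	mempool_free(f, foo_pool);
}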
/* SPDX-License-Identifier: GPL-2.0 */ /* * Variant of atomic_t specialized for reference counts. * * The interface matches the atomic_t interface (to aid in porting) but only * provides the few functions one should use for reference counting. * * Saturation semantics * ==================== * * refcount_t differs from atomic_t in that the counter saturates at * REFCOUNT_SATURATED and will not move once there. This avoids wrapping the * counter and causing 'spurious' use-after-free issues. In order to avoid the * cost associated with introducing cmpxchg() loops into all of the saturating * operations, we temporarily allow the counter to take on an unchecked value * and then explicitly set it to REFCOUNT_SATURATED on detecting that underflow * or overflow has occurred. Although this is racy when multiple threads * access the refcount concurrently, by placing REFCOUNT_SATURATED roughly * equidistant from 0 and INT_MAX we minimise the scope for error: * * INT_MAX REFCOUNT_SATURATED UINT_MAX * 0 (0x7fff_ffff) (0xc000_0000) (0xffff_ffff) * +--------------------------------+----------------+----------------+ * <---------- bad value! ----------> * * (in a signed view of the world, the "bad value" range corresponds to * a negative counter value). * * As an example, consider a refcount_inc() operation that causes the counter * to overflow: * * int old = atomic_fetch_add_relaxed(r); * // old is INT_MAX, refcount now INT_MIN (0x8000_0000) * if (old < 0) * atomic_set(r, REFCOUNT_SATURATED); * * If another thread also performs a refcount_inc() operation between the two * atomic operations, then the count will continue to edge closer to 0.
If it * reaches a value of 1 before /any/ of the threads reset it to the saturated * value, then a concurrent refcount_dec_and_test() may erroneously free the * underlying object. * Linux limits the maximum number of tasks to PID_MAX_LIMIT, which is currently * 0x400000 (and can't easily be raised in the future beyond FUTEX_TID_MASK). * With the current PID limit, if no batched refcounting operations are used and * the attacker can't repeatedly trigger kernel oopses in the middle of refcount * operations, this makes it impossible for a saturated refcount to leave the * saturation range, even if it is possible for multiple uses of the same * refcount to nest in the context of a single task: * * (UINT_MAX+1-REFCOUNT_SATURATED) / PID_MAX_LIMIT = * 0x40000000 / 0x400000 = 0x100 = 256 * * If hundreds of references are added/removed with a single refcounting * operation, it may potentially be possible to leave the saturation range; but * given the precise timing details involved with the round-robin scheduling of * each thread manipulating the refcount and the need to hit the race multiple * times in succession, there doesn't appear to be a practical avenue of attack * even if using refcount_add() operations with larger increments. * * Memory ordering * =============== * * Memory ordering rules are slightly relaxed wrt regular atomic_t functions * and provide only what is strictly required for refcounts. * * The increments are fully relaxed; these will not provide ordering. The * rationale is that whatever is used to obtain the object we're increasing the * reference count on will provide the ordering. For locked data structures, * its the lock acquire, for RCU/lockless data structures its the dependent * load. * * Do note that inc_not_zero() provides a control dependency which will order * future stores against the inc, this ensures we'll never modify the object * if we did not in fact acquire a reference. * * The decrements will provide release order, such that all the prior loads and * stores will be issued before, it also provides a control dependency, which * will order us against the subsequent free(). * * The control dependency is against the load of the cmpxchg (ll/sc) that * succeeded. This means the stores aren't fully ordered, but this is fine * because the 1->0 transition indicates no concurrency. * * Note that the allocator is responsible for ordering things between free() * and alloc(). * * The decrements dec_and_test() and sub_and_test() also provide acquire * ordering on success. * * refcount_{add|inc}_not_zero_acquire() and refcount_set_release() provide * acquire and release ordering for cases when the memory occupied by the * object might be reused to store another object. This is important for the * cases where secondary validation is required to detect such reuse, e.g. * SLAB_TYPESAFE_BY_RCU. The secondary validation checks have to happen after * the refcount is taken, hence acquire order is necessary. Similarly, when the * object is initialized, all stores to its attributes should be visible before * the refcount is set, otherwise a stale attribute value might be used by * another task which succeeds in taking a refcount to the new object. 
*/ #ifndef _LINUX_REFCOUNT_H #define _LINUX_REFCOUNT_H #include <linux/atomic.h> #include <linux/bug.h> #include <linux/compiler.h> #include <linux/limits.h> #include <linux/refcount_types.h> #include <linux/spinlock_types.h> struct mutex; #define REFCOUNT_INIT(n) { .refs = ATOMIC_INIT(n), } #define REFCOUNT_MAX INT_MAX #define REFCOUNT_SATURATED (INT_MIN / 2) enum refcount_saturation_type { REFCOUNT_ADD_NOT_ZERO_OVF, REFCOUNT_ADD_OVF, REFCOUNT_ADD_UAF, REFCOUNT_SUB_UAF, REFCOUNT_DEC_LEAK, }; void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t); /** * refcount_set - set a refcount's value * @r: the refcount * @n: value to which the refcount will be set */ static inline void refcount_set(refcount_t *r, int n) { atomic_set(&r->refs, n); } /** * refcount_set_release - set a refcount's value with release ordering * @r: the refcount * @n: value to which the refcount will be set * * This function should be used when memory occupied by the object might be * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU. * * Provides release memory ordering which will order previous memory operations * against this store. This ensures all updates to this object are visible * once the refcount is set and stale values from the object previously * occupying this memory are overwritten with new ones. * * This function should be called only after new object is fully initialized. * After this call the object should be considered visible to other tasks even * if it was not yet added into an object collection normally used to discover * it. This is because other tasks might have discovered the object previously * occupying the same memory and after memory reuse they can succeed in taking * refcount to the new object and start using it. */ static inline void refcount_set_release(refcount_t *r, int n) { atomic_set_release(&r->refs, n); } /** * refcount_read - get a refcount's value * @r: the refcount * * Return: the refcount's value */ static inline unsigned int refcount_read(const refcount_t *r) { return atomic_read(&r->refs); } static inline __must_check bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp) { int old = refcount_read(r); do { if (!old) break; } while (!atomic_try_cmpxchg_relaxed(&r->refs, &old, old + i)); if (oldp) *oldp = old; if (unlikely(old < 0 || old + i < 0)) refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF); return old; } /** * refcount_add_not_zero - add a value to a refcount unless it is 0 * @i: the value to add to the refcount * @r: the refcount * * Will saturate at REFCOUNT_SATURATED and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See the comment on top. * * Use of this function is not recommended for the normal reference counting * use case in which references are taken and released one at a time. In these * cases, refcount_inc(), or one of its variants, should instead be used to * increment a reference count. 
* * Return: false if the passed refcount is 0, true otherwise */ static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) { return __refcount_add_not_zero(i, r, NULL); } static inline __must_check bool __refcount_add_not_zero_limited_acquire(int i, refcount_t *r, int *oldp, int limit) { int old = refcount_read(r); do { if (!old) break; if (i > limit - old) { if (oldp) *oldp = old; return false; } } while (!atomic_try_cmpxchg_acquire(&r->refs, &old, old + i)); if (oldp) *oldp = old; if (unlikely(old < 0 || old + i < 0)) refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF); return old; } static inline __must_check bool __refcount_inc_not_zero_limited_acquire(refcount_t *r, int *oldp, int limit) { return __refcount_add_not_zero_limited_acquire(1, r, oldp, limit); } static inline __must_check bool __refcount_add_not_zero_acquire(int i, refcount_t *r, int *oldp) { return __refcount_add_not_zero_limited_acquire(i, r, oldp, INT_MAX); } /** * refcount_add_not_zero_acquire - add a value to a refcount with acquire ordering unless it is 0 * * @i: the value to add to the refcount * @r: the refcount * * Will saturate at REFCOUNT_SATURATED and WARN. * * This function should be used when memory occupied by the object might be * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU. * * Provides acquire memory ordering on success, it is assumed the caller has * guaranteed the object memory to be stable (RCU, etc.). It does provide a * control dependency and thereby orders future stores. See the comment on top. * * Use of this function is not recommended for the normal reference counting * use case in which references are taken and released one at a time. In these * cases, refcount_inc_not_zero_acquire() should instead be used to increment a * reference count. * * Return: false if the passed refcount is 0, true otherwise */ static inline __must_check bool refcount_add_not_zero_acquire(int i, refcount_t *r) { return __refcount_add_not_zero_acquire(i, r, NULL); } static inline void __refcount_add(int i, refcount_t *r, int *oldp) { int old = atomic_fetch_add_relaxed(i, &r->refs); if (oldp) *oldp = old; if (unlikely(!old)) refcount_warn_saturate(r, REFCOUNT_ADD_UAF); else if (unlikely(old < 0 || old + i < 0)) refcount_warn_saturate(r, REFCOUNT_ADD_OVF); } /** * refcount_add - add a value to a refcount * @i: the value to add to the refcount * @r: the refcount * * Similar to atomic_add(), but will saturate at REFCOUNT_SATURATED and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See the comment on top. * * Use of this function is not recommended for the normal reference counting * use case in which references are taken and released one at a time. In these * cases, refcount_inc(), or one of its variants, should instead be used to * increment a reference count. */ static inline void refcount_add(int i, refcount_t *r) { __refcount_add(i, r, NULL); } static inline __must_check bool __refcount_inc_not_zero(refcount_t *r, int *oldp) { return __refcount_add_not_zero(1, r, oldp); } /** * refcount_inc_not_zero - increment a refcount unless it is 0 * @r: the refcount to increment * * Similar to atomic_inc_not_zero(), but will saturate at REFCOUNT_SATURATED * and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). 
It does provide a control dependency * and thereby orders future stores. See the comment on top. * * Return: true if the increment was successful, false otherwise */ static inline __must_check bool refcount_inc_not_zero(refcount_t *r) { return __refcount_inc_not_zero(r, NULL); } static inline __must_check bool __refcount_inc_not_zero_acquire(refcount_t *r, int *oldp) { return __refcount_add_not_zero_acquire(1, r, oldp); } /** * refcount_inc_not_zero_acquire - increment a refcount with acquire ordering unless it is 0 * @r: the refcount to increment * * Similar to refcount_inc_not_zero(), but provides acquire memory ordering on * success. * * This function should be used when memory occupied by the object might be * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU. * * Provides acquire memory ordering on success, it is assumed the caller has * guaranteed the object memory to be stable (RCU, etc.). It does provide a * control dependency and thereby orders future stores. See the comment on top. * * Return: true if the increment was successful, false otherwise */ static inline __must_check bool refcount_inc_not_zero_acquire(refcount_t *r) { return __refcount_inc_not_zero_acquire(r, NULL); } static inline void __refcount_inc(refcount_t *r, int *oldp) { __refcount_add(1, r, oldp); } /** * refcount_inc - increment a refcount * @r: the refcount to increment * * Similar to atomic_inc(), but will saturate at REFCOUNT_SATURATED and WARN. * * Provides no memory ordering, it is assumed the caller already has a * reference on the object. * * Will WARN if the refcount is 0, as this represents a possible use-after-free * condition. */ static inline void refcount_inc(refcount_t *r) { __refcount_inc(r, NULL); } static inline __must_check bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp) { int old = atomic_fetch_sub_release(i, &r->refs); if (oldp) *oldp = old; if (old > 0 && old == i) { smp_acquire__after_ctrl_dep(); return true; } if (unlikely(old <= 0 || old - i < 0)) refcount_warn_saturate(r, REFCOUNT_SUB_UAF); return false; } /** * refcount_sub_and_test - subtract from a refcount and test if it is 0 * @i: amount to subtract from the refcount * @r: the refcount * * Similar to atomic_dec_and_test(), but it will WARN, return false and * ultimately leak on underflow and will fail to decrement when saturated * at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. * * Use of this function is not recommended for the normal reference counting * use case in which references are taken and released one at a time. In these * cases, refcount_dec(), or one of its variants, should instead be used to * decrement a reference count. * * Return: true if the resulting refcount is 0, false otherwise */ static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) { return __refcount_sub_and_test(i, r, NULL); } static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp) { return __refcount_sub_and_test(1, r, oldp); } /** * refcount_dec_and_test - decrement a refcount and test if it is 0 * @r: the refcount * * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to * decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. 
* * Return: true if the resulting refcount is 0, false otherwise */ static inline __must_check bool refcount_dec_and_test(refcount_t *r) { return __refcount_dec_and_test(r, NULL); } static inline void __refcount_dec(refcount_t *r, int *oldp) { int old = atomic_fetch_sub_release(1, &r->refs); if (oldp) *oldp = old; if (unlikely(old <= 1)) refcount_warn_saturate(r, REFCOUNT_DEC_LEAK); } /** * refcount_dec - decrement a refcount * @r: the refcount * * Similar to atomic_dec(), it will WARN on underflow and fail to decrement * when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before. */ static inline void refcount_dec(refcount_t *r) { __refcount_dec(r, NULL); } extern __must_check bool refcount_dec_if_one(refcount_t *r); extern __must_check bool refcount_dec_not_one(refcount_t *r); extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) __cond_acquires(true, lock); extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) __cond_acquires(true, lock); extern __must_check bool refcount_dec_and_lock_irqsave(refcount_t *r, spinlock_t *lock, unsigned long *flags) __cond_acquires(true, lock); #endif /* _LINUX_REFCOUNT_H */ |
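/*
 * A minimal usage sketch, not part of this header: a reference-counted
 * object whose last reference frees it.  "struct widget" and the widget_*
 * helpers are hypothetical names used only for illustration; the refcount_*
 * calls are the API declared above.
 */
#include <linux/refcount.h>
#include <linux/slab.h>

struct widget {
	refcount_t refs;
	int value;
};

static struct widget *widget_alloc(void)
{
	struct widget *w = kzalloc(sizeof(*w), GFP_KERNEL);

	if (w)
		refcount_set(&w->refs, 1);	/* caller owns the initial reference */
	return w;
}

/* Take an extra reference; the caller must already hold one. */
static struct widget *widget_get(struct widget *w)
{
	refcount_inc(&w->refs);
	return w;
}

/*
 * For lookups in RCU/lockless structures, where the object may already be on
 * its way to being freed, use refcount_inc_not_zero() instead: it fails once
 * the count has dropped to zero and never resurrects the object.
 */
static bool widget_tryget(struct widget *w)
{
	return refcount_inc_not_zero(&w->refs);
}

static void widget_put(struct widget *w)
{
	/* The 1 -> 0 transition means we were the last user; free the object. */
	if (refcount_dec_and_test(&w->refs))
		kfree(w);
}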
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * linux/include/linux/jbd2.h * * Written by Stephen C. Tweedie <sct@redhat.com> * * Copyright 1998-2000 Red Hat, Inc --- All Rights Reserved * * Definitions for transaction data structures for the buffer cache * filesystem journaling support. */ #ifndef _LINUX_JBD2_H #define _LINUX_JBD2_H /* Allow this file to be included directly into e2fsprogs */ #ifndef __KERNEL__ #include "jfs_compat.h" #define JBD2_DEBUG #else #include <linux/types.h> #include <linux/buffer_head.h> #include <linux/journal-head.h> #include <linux/stddef.h> #include <linux/mutex.h> #include <linux/timer.h> #include <linux/slab.h> #include <linux/bit_spinlock.h> #include <linux/blkdev.h> #include <linux/crc32c.h> #endif #define journal_oom_retry 1 /* * Define JBD2_PARANOID_IOFAIL to cause a kernel BUG() if ext4 finds * certain classes of error which can occur due to failed IOs. Under * normal use we want ext4 to continue after such errors, because * hardware _can_ fail, but for debugging purposes when running tests on * known-good hardware we may want to trap these errors. */ #undef JBD2_PARANOID_IOFAIL /* * The default maximum commit age, in seconds. */ #define JBD2_DEFAULT_MAX_COMMIT_AGE 5 #ifdef CONFIG_JBD2_DEBUG /* * Define JBD2_EXPENSIVE_CHECKING to enable more expensive internal * consistency checks. By default we don't do this unless * CONFIG_JBD2_DEBUG is on. */ #define JBD2_EXPENSIVE_CHECKING void __jbd2_debug(int level, const char *file, const char *func, unsigned int line, const char *fmt, ...); #define jbd2_debug(n, fmt, a...) \ __jbd2_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a) #else #define jbd2_debug(n, fmt, a...) no_printk(fmt, ##a) #endif extern void *jbd2_alloc(size_t size, gfp_t flags); extern void jbd2_free(void *ptr, size_t size); #define JBD2_MIN_JOURNAL_BLOCKS 1024 #define JBD2_DEFAULT_FAST_COMMIT_BLOCKS 256 #ifdef __KERNEL__ /** * typedef handle_t - The handle_t type represents a single atomic update being performed by some process. * * All filesystem modifications made by the process go * through this handle. Recursive operations (such as quota operations) * are gathered into a single update. * * The buffer credits field is used to account for journaled buffers * being modified by the running process. To ensure that there is * enough log space for all outstanding operations, we need to limit the * number of outstanding buffers possible at any time.
When the * operation completes, any buffer credits not used are credited back to * the transaction, so that at all times we know how many buffers the * outstanding updates on a transaction might possibly touch. * * This is an opaque datatype. **/ typedef struct jbd2_journal_handle handle_t; /* Atomic operation type */ /** * typedef journal_t - The journal_t maintains all of the journaling state information for a single filesystem. * * journal_t is linked to from the fs superblock structure. * * We use the journal_t to keep track of all outstanding transaction * activity on the filesystem, and to manage the state of the log * writing process. * * This is an opaque datatype. **/ typedef struct journal_s journal_t; /* Journal control structure */ #endif /* * Internal structures used by the logging mechanism: */ #define JBD2_MAGIC_NUMBER 0xc03b3998U /* The first 4 bytes of /dev/random! */ /* * On-disk structures */ /* * Descriptor block types: */ #define JBD2_DESCRIPTOR_BLOCK 1 #define JBD2_COMMIT_BLOCK 2 #define JBD2_SUPERBLOCK_V1 3 #define JBD2_SUPERBLOCK_V2 4 #define JBD2_REVOKE_BLOCK 5 /* * Standard header for all descriptor blocks: */ typedef struct journal_header_s { __be32 h_magic; __be32 h_blocktype; __be32 h_sequence; } journal_header_t; /* * Checksum types. */ #define JBD2_CRC32_CHKSUM 1 #define JBD2_MD5_CHKSUM 2 #define JBD2_SHA1_CHKSUM 3 #define JBD2_CRC32C_CHKSUM 4 #define JBD2_CRC32_CHKSUM_SIZE 4 #define JBD2_CHECKSUM_BYTES (32 / sizeof(u32)) /* * Commit block header for storing transactional checksums: * * NOTE: If FEATURE_COMPAT_CHECKSUM (checksum v1) is set, the h_chksum* * fields are used to store a checksum of the descriptor and data blocks. * * If FEATURE_INCOMPAT_CSUM_V2 (checksum v2) is set, then the h_chksum * field is used to store crc32c(uuid+commit_block). Each journal metadata * block gets its own checksum, and data block checksums are stored in * journal_block_tag (in the descriptor). The other h_chksum* fields are * not used. * * If FEATURE_INCOMPAT_CSUM_V3 is set, the descriptor block uses * journal_block_tag3_t to store a full 32-bit checksum. Everything else * is the same as v2. * * Checksum v1, v2, and v3 are mutually exclusive features. */ struct commit_header { __be32 h_magic; __be32 h_blocktype; __be32 h_sequence; unsigned char h_chksum_type; unsigned char h_chksum_size; unsigned char h_padding[2]; __be32 h_chksum[JBD2_CHECKSUM_BYTES]; __be64 h_commit_sec; __be32 h_commit_nsec; }; /* * The block tag: used to describe a single buffer in the journal. * t_blocknr_high is only used if INCOMPAT_64BIT is set, so this * raw struct shouldn't be used for pointer math or sizeof() - use * journal_tag_bytes(journal) instead to compute this. */ typedef struct journal_block_tag3_s { __be32 t_blocknr; /* The on-disk block number */ __be32 t_flags; /* See below */ __be32 t_blocknr_high; /* most-significant high 32bits. */ __be32 t_checksum; /* crc32c(uuid+seq+block) */ } journal_block_tag3_t; typedef struct journal_block_tag_s { __be32 t_blocknr; /* The on-disk block number */ __be16 t_checksum; /* truncated crc32c(uuid+seq+block) */ __be16 t_flags; /* See below */ __be32 t_blocknr_high; /* most-significant high 32bits. 
*/ } journal_block_tag_t; /* Tail of descriptor or revoke block, for checksumming */ struct jbd2_journal_block_tail { __be32 t_checksum; /* crc32c(uuid+descr_block) */ }; /* * The revoke descriptor: used on disk to describe a series of blocks to * be revoked from the log */ typedef struct jbd2_journal_revoke_header_s { journal_header_t r_header; __be32 r_count; /* Count of bytes used in the block */ } jbd2_journal_revoke_header_t; /* Definitions for the journal tag flags word: */ #define JBD2_FLAG_ESCAPE 1 /* on-disk block is escaped */ #define JBD2_FLAG_SAME_UUID 2 /* block has same uuid as previous */ #define JBD2_FLAG_DELETED 4 /* block deleted by this transaction */ #define JBD2_FLAG_LAST_TAG 8 /* last tag in this descriptor block */ /* * The journal superblock. All fields are in big-endian byte order. */ typedef struct journal_superblock_s { /* 0x0000 */ journal_header_t s_header; /* 0x000C */ /* Static information describing the journal */ __be32 s_blocksize; /* journal device blocksize */ __be32 s_maxlen; /* total blocks in journal file */ __be32 s_first; /* first block of log information */ /* 0x0018 */ /* Dynamic information describing the current state of the log */ __be32 s_sequence; /* first commit ID expected in log */ __be32 s_start; /* blocknr of start of log */ /* 0x0020 */ /* Error value, as set by jbd2_journal_abort(). */ __be32 s_errno; /* 0x0024 */ /* Remaining fields are only valid in a version-2 superblock */ __be32 s_feature_compat; /* compatible feature set */ __be32 s_feature_incompat; /* incompatible feature set */ __be32 s_feature_ro_compat; /* readonly-compatible feature set */ /* 0x0030 */ __u8 s_uuid[16]; /* 128-bit uuid for journal */ /* 0x0040 */ __be32 s_nr_users; /* Nr of filesystems sharing log */ __be32 s_dynsuper; /* Blocknr of dynamic superblock copy*/ /* 0x0048 */ __be32 s_max_transaction; /* Limit of journal blocks per trans.*/ __be32 s_max_trans_data; /* Limit of data blocks per trans. 
*/ /* 0x0050 */ __u8 s_checksum_type; /* checksum type */ __u8 s_padding2[3]; /* 0x0054 */ __be32 s_num_fc_blks; /* Number of fast commit blocks */ __be32 s_head; /* blocknr of head of log, only uptodate * while the filesystem is clean */ /* 0x005C */ __u32 s_padding[40]; __be32 s_checksum; /* crc32c(superblock) */ /* 0x0100 */ __u8 s_users[16*48]; /* ids of all fs'es sharing the log */ /* 0x0400 */ } journal_superblock_t; #define JBD2_FEATURE_COMPAT_CHECKSUM 0x00000001 #define JBD2_FEATURE_INCOMPAT_REVOKE 0x00000001 #define JBD2_FEATURE_INCOMPAT_64BIT 0x00000002 #define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004 #define JBD2_FEATURE_INCOMPAT_CSUM_V2 0x00000008 #define JBD2_FEATURE_INCOMPAT_CSUM_V3 0x00000010 #define JBD2_FEATURE_INCOMPAT_FAST_COMMIT 0x00000020 /* See "journal feature predicate functions" below */ /* Features known to this kernel version: */ #define JBD2_KNOWN_COMPAT_FEATURES JBD2_FEATURE_COMPAT_CHECKSUM #define JBD2_KNOWN_ROCOMPAT_FEATURES 0 #define JBD2_KNOWN_INCOMPAT_FEATURES (JBD2_FEATURE_INCOMPAT_REVOKE | \ JBD2_FEATURE_INCOMPAT_64BIT | \ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | \ JBD2_FEATURE_INCOMPAT_CSUM_V2 | \ JBD2_FEATURE_INCOMPAT_CSUM_V3 | \ JBD2_FEATURE_INCOMPAT_FAST_COMMIT) #ifdef __KERNEL__ #include <linux/fs.h> #include <linux/sched.h> enum jbd_state_bits { BH_JBD /* Has an attached ext3 journal_head */ = BH_PrivateStart, BH_JWrite, /* Being written to log (@@@ DEBUGGING) */ BH_Freed, /* Has been freed (truncated) */ BH_Revoked, /* Has been revoked from the log */ BH_RevokeValid, /* Revoked flag is valid */ BH_JBDDirty, /* Is dirty but journaled */ BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ BH_Shadow, /* IO on shadow buffer is running */ BH_Verified, /* Metadata block has been verified ok */ BH_JBDPrivateStart, /* First bit available for private use by FS */ }; BUFFER_FNS(JBD, jbd) BUFFER_FNS(JWrite, jwrite) BUFFER_FNS(JBDDirty, jbddirty) TAS_BUFFER_FNS(JBDDirty, jbddirty) BUFFER_FNS(Revoked, revoked) TAS_BUFFER_FNS(Revoked, revoked) BUFFER_FNS(RevokeValid, revokevalid) TAS_BUFFER_FNS(RevokeValid, revokevalid) BUFFER_FNS(Freed, freed) BUFFER_FNS(Shadow, shadow) BUFFER_FNS(Verified, verified) static inline struct buffer_head *jh2bh(struct journal_head *jh) { return jh->b_bh; } static inline struct journal_head *bh2jh(struct buffer_head *bh) { return bh->b_private; } static inline void jbd_lock_bh_journal_head(struct buffer_head *bh) { bit_spin_lock(BH_JournalHead, &bh->b_state); } static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) { bit_spin_unlock(BH_JournalHead, &bh->b_state); } #define J_ASSERT(assert) BUG_ON(!(assert)) #define J_ASSERT_BH(bh, expr) J_ASSERT(expr) #define J_ASSERT_JH(jh, expr) J_ASSERT(expr) #if defined(JBD2_PARANOID_IOFAIL) #define J_EXPECT(expr, why...) J_ASSERT(expr) #define J_EXPECT_BH(bh, expr, why...) J_ASSERT_BH(bh, expr) #define J_EXPECT_JH(jh, expr, why...) J_ASSERT_JH(jh, expr) #else #define __journal_expect(expr, why...) \ ({ \ int val = (expr); \ if (!val) { \ printk(KERN_ERR \ "JBD2 unexpected failure: %s: %s;\n", \ __func__, #expr); \ printk(KERN_ERR why "\n"); \ } \ val; \ }) #define J_EXPECT(expr, why...) __journal_expect(expr, ## why) #define J_EXPECT_BH(bh, expr, why...) __journal_expect(expr, ## why) #define J_EXPECT_JH(jh, expr, why...) __journal_expect(expr, ## why) #endif /* Flags in jbd_inode->i_flags */ #define __JI_COMMIT_RUNNING 0 #define __JI_WRITE_DATA 1 #define __JI_WAIT_DATA 2 /* * Commit of the inode data in progress. 
We use this flag to protect us from * concurrent deletion of inode. We cannot use reference to inode for this * since we cannot afford doing last iput() on behalf of kjournald */ #define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING) /* Write allocated dirty buffers in this inode before commit */ #define JI_WRITE_DATA (1 << __JI_WRITE_DATA) /* Wait for outstanding data writes for this inode before commit */ #define JI_WAIT_DATA (1 << __JI_WAIT_DATA) /** * struct jbd2_inode - The jbd_inode type is the structure linking inodes in * ordered mode present in a transaction so that we can sync them during commit. */ struct jbd2_inode { /** * @i_transaction: * * Which transaction does this inode belong to? Either the running * transaction or the committing one. [j_list_lock] */ transaction_t *i_transaction; /** * @i_next_transaction: * * Pointer to the running transaction modifying inode's data in case * there is already a committing transaction touching it. [j_list_lock] */ transaction_t *i_next_transaction; /** * @i_list: List of inodes in the i_transaction [j_list_lock] */ struct list_head i_list; /** * @i_vfs_inode: * * VFS inode this inode belongs to [constant for lifetime of structure] */ struct inode *i_vfs_inode; /** * @i_flags: Flags of inode [j_list_lock] */ unsigned long i_flags; /** * @i_dirty_start_page: * * Dirty range start in PAGE_SIZE units. * * The dirty range is empty if @i_dirty_start_page is greater than or * equal to @i_dirty_end_page. * * [j_list_lock] */ pgoff_t i_dirty_start_page; /** * @i_dirty_end_page: * * Dirty range end in PAGE_SIZE units (exclusive). * * [j_list_lock] */ pgoff_t i_dirty_end_page; }; /* * Lockless readers treat start_page >= end_page as an empty range. * Writers publish a new non-empty range by storing i_dirty_end_page before * i_dirty_start_page. */ static inline bool jbd2_jinode_get_dirty_range(const struct jbd2_inode *jinode, loff_t *start, loff_t *end) { pgoff_t start_page = READ_ONCE(jinode->i_dirty_start_page); pgoff_t end_page = READ_ONCE(jinode->i_dirty_end_page); if (start_page >= end_page) return false; *start = (loff_t)start_page << PAGE_SHIFT; *end = ((loff_t)end_page << PAGE_SHIFT) - 1; return true; } struct jbd2_revoke_table_s; /** * struct jbd2_journal_handle - The jbd2_journal_handle type is the concrete * type associated with handle_t. * @h_transaction: Which compound transaction is this update a part of? * @h_journal: Which journal handle belongs to - used iff h_reserved set. * @h_rsv_handle: Handle reserved for finishing the logical operation. * @h_total_credits: Number of remaining buffers we are allowed to add to * journal. These are dirty buffers and revoke descriptor blocks. * @h_revoke_credits: Number of remaining revoke records available for handle * @h_ref: Reference count on this handle. * @h_err: Field for caller's use to track errors through large fs operations. * @h_sync: Flag for sync-on-close. * @h_reserved: Flag for handle for reserved credits. * @h_aborted: Flag indicating fatal error on handle. * @h_type: For handle statistics. * @h_line_no: For handle statistics. * @h_start_jiffies: Handle Start time. * @h_requested_credits: Holds @h_total_credits after handle is started. * @h_revoke_credits_requested: Holds @h_revoke_credits after handle is started. * @saved_alloc_context: Saved context while transaction is open. **/ /* Docbook can't yet cope with the bit fields, but will leave the documentation * in so it can be fixed later. 
*/ struct jbd2_journal_handle { union { transaction_t *h_transaction; /* Which journal handle belongs to - used iff h_reserved set */ journal_t *h_journal; }; handle_t *h_rsv_handle; int h_total_credits; int h_revoke_credits; int h_revoke_credits_requested; int h_ref; int h_err; /* Flags [no locking] */ unsigned int h_sync: 1; unsigned int h_reserved: 1; unsigned int h_aborted: 1; unsigned int h_type: 8; unsigned int h_line_no: 16; unsigned long h_start_jiffies; unsigned int h_requested_credits; unsigned int saved_alloc_context; }; /* * Some stats for checkpoint phase */ struct transaction_chp_stats_s { unsigned long cs_chp_time; __u32 cs_forced_to_close; __u32 cs_written; __u32 cs_dropped; }; /* The transaction_t type is the guts of the journaling mechanism. It * tracks a compound transaction through its various states: * * RUNNING: accepting new updates * LOCKED: Updates still running but we don't accept new ones * RUNDOWN: Updates are tidying up but have finished requesting * new buffers to modify (state not used for now) * FLUSH: All updates complete, but we are still writing to disk * COMMIT: All data on disk, writing commit record * FINISHED: We still have to keep the transaction for checkpointing. * * The transaction keeps track of all of the buffers modified by a * running transaction, and all of the buffers committed but not yet * flushed to home for finished transactions. * (Locking Documentation improved by LockDoc) */ /* * Lock ranking: * * j_list_lock * ->jbd_lock_bh_journal_head() (This is "innermost") * * j_state_lock * ->b_state_lock * * b_state_lock * ->j_list_lock * * j_state_lock * ->j_list_lock (journal_unmap_buffer) * */ struct transaction_s { /* Pointer to the journal for this transaction. [no locking] */ journal_t *t_journal; /* Sequence number for this transaction [no locking] */ tid_t t_tid; /* * Transaction's current state * [no locking - only kjournald2 alters this] * [j_list_lock] guards transition of a transaction into T_FINISHED * state and subsequent call of __jbd2_journal_drop_transaction() * FIXME: needs barriers * KLUDGE: [use j_state_lock] */ enum { T_RUNNING, T_LOCKED, T_SWITCH, T_FLUSH, T_COMMIT, T_COMMIT_DFLUSH, T_COMMIT_JFLUSH, T_COMMIT_CALLBACK, T_FINISHED } t_state; /* * Where in the log does this transaction's commit start? [no locking] */ unsigned long t_log_start; /* * Number of buffers on the t_buffers list [j_list_lock, no locks * needed for jbd2 thread] */ int t_nr_buffers; /* * Doubly-linked circular list of all buffers reserved but not yet * modified by this transaction [j_list_lock, no locks needed fo * jbd2 thread] */ struct journal_head *t_reserved_list; /* * Doubly-linked circular list of all metadata buffers owned by this * transaction [j_list_lock, no locks needed for jbd2 thread] */ struct journal_head *t_buffers; /* * Doubly-linked circular list of all forget buffers (superseded * buffers which we can un-checkpoint once this transaction commits) * [j_list_lock] */ struct journal_head *t_forget; /* * Doubly-linked circular list of all buffers still to be flushed before * this transaction can be checkpointed. [j_list_lock] */ struct journal_head *t_checkpoint_list; /* * Doubly-linked circular list of metadata buffers being * shadowed by log IO. The IO buffers on the iobuf list and * the shadow buffers on this list match each other one for * one at all times. 
[j_list_lock, no locks needed for jbd2 * thread] */ struct journal_head *t_shadow_list; /* * List of inodes associated with the transaction; e.g., ext4 uses * this to track inodes in data=ordered and data=journal mode that * need special handling on transaction commit; also used by ocfs2. * [j_list_lock] */ struct list_head t_inode_list; /* * Longest time some handle had to wait for running transaction */ unsigned long t_max_wait; /* * When transaction started */ unsigned long t_start; /* * When commit was requested [j_state_lock] */ unsigned long t_requested; /* * Checkpointing stats [j_list_lock] */ struct transaction_chp_stats_s t_chp_stats; /* * Number of outstanding updates running on this transaction * [none] */ atomic_t t_updates; /* * Number of blocks reserved for this transaction in the journal. * This is including all credits reserved when starting transaction * handles as well as all journal descriptor blocks needed for this * transaction. [none] */ atomic_t t_outstanding_credits; /* * Number of revoke records for this transaction added by already * stopped handles. [none] */ atomic_t t_outstanding_revokes; /* * How many handles used this transaction? [none] */ atomic_t t_handle_count; /* * Forward and backward links for the circular list of all transactions * awaiting checkpoint. [j_list_lock] */ transaction_t *t_cpnext, *t_cpprev; /* * When will the transaction expire (become due for commit), in jiffies? * [no locking] */ unsigned long t_expires; /* * When this transaction started, in nanoseconds [no locking] */ ktime_t t_start_time; /* * This transaction is being forced and some process is * waiting for it to finish. */ unsigned int t_synchronous_commit:1; /* Disk flush needs to be sent to fs partition [no locking] */ int t_need_data_flush; }; struct transaction_run_stats_s { unsigned long rs_wait; unsigned long rs_request_delay; unsigned long rs_running; unsigned long rs_locked; unsigned long rs_flushing; unsigned long rs_logging; __u32 rs_handle_count; __u32 rs_blocks; __u32 rs_blocks_logged; }; struct transaction_stats_s { unsigned long ts_tid; unsigned long ts_requested; struct transaction_run_stats_s run; }; static inline unsigned long jbd2_time_diff(unsigned long start, unsigned long end) { if (end >= start) return end - start; return end + (MAX_JIFFY_OFFSET - start); } #define JBD2_NR_BATCH 64 enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; #define JBD2_FC_REPLAY_STOP 0 #define JBD2_FC_REPLAY_CONTINUE 1 /** * struct journal_s - The journal_s type is the concrete type associated with * journal_t. */ struct journal_s { /** * @j_flags: General journaling state flags [j_state_lock, * no lock for quick racy checks] */ unsigned long j_flags; /** * @j_errno: * * Is there an outstanding uncleared error on the journal (from a prior * abort)? [j_state_lock] */ int j_errno; /** * @j_abort_mutex: Lock the whole aborting procedure. */ struct mutex j_abort_mutex; /** * @j_sb_buffer: The first part of the superblock buffer. */ struct buffer_head *j_sb_buffer; /** * @j_superblock: The second part of the superblock buffer. */ journal_superblock_t *j_superblock; /** * @j_state_lock: Protect the various scalars in the journal. */ rwlock_t j_state_lock; /** * @j_barrier_count: * * Number of processes waiting to create a barrier lock [j_state_lock, * no lock for quick racy checks] */ int j_barrier_count; /** * @j_barrier: The barrier lock itself. */ struct mutex j_barrier; /** * @j_running_transaction: * * Transactions: The current running transaction... 
* [j_state_lock, no lock for quick racy checks] [caller holding * open handle] */ transaction_t *j_running_transaction; /** * @j_committing_transaction: * * the transaction we are pushing to disk * [j_state_lock] [caller holding open handle] */ transaction_t *j_committing_transaction; /** * @j_checkpoint_transactions: * * ... and a linked circular list of all transactions waiting for * checkpointing. [j_list_lock] */ transaction_t *j_checkpoint_transactions; /** * @j_wait_transaction_locked: * * Wait queue for waiting for a locked transaction to start committing, * or for a barrier lock to be released. */ wait_queue_head_t j_wait_transaction_locked; /** * @j_wait_done_commit: Wait queue for waiting for commit to complete. */ wait_queue_head_t j_wait_done_commit; /** * @j_wait_commit: Wait queue to trigger commit. */ wait_queue_head_t j_wait_commit; /** * @j_wait_updates: Wait queue to wait for updates to complete. */ wait_queue_head_t j_wait_updates; /** * @j_wait_reserved: * * Wait queue to wait for reserved buffer credits to drop. */ wait_queue_head_t j_wait_reserved; /** * @j_fc_wait: * * Wait queue to wait for completion of async fast commits. */ wait_queue_head_t j_fc_wait; /** * @j_checkpoint_mutex: * * Semaphore for locking against concurrent checkpoints. */ struct mutex j_checkpoint_mutex; /** * @j_chkpt_bhs: * * List of buffer heads used by the checkpoint routine. This * was moved from jbd2_log_do_checkpoint() to reduce stack * usage. Access to this array is controlled by the * @j_checkpoint_mutex. [j_checkpoint_mutex] */ struct buffer_head *j_chkpt_bhs[JBD2_NR_BATCH]; /** * @j_shrinker: * * Journal head shrinker, reclaim buffer's journal head which * has been written back. */ struct shrinker *j_shrinker; /** * @j_checkpoint_jh_count: * * Number of journal buffers on the checkpoint list. [j_list_lock] */ struct percpu_counter j_checkpoint_jh_count; /** * @j_shrink_transaction: * * Record next transaction will shrink on the checkpoint list. * [j_list_lock] */ transaction_t *j_shrink_transaction; /** * @j_head: * * Journal head: identifies the first unused block in the journal. * [j_state_lock] */ unsigned long j_head; /** * @j_tail: * * Journal tail: identifies the oldest still-used block in the journal. * [j_state_lock] */ unsigned long j_tail; /** * @j_free: * * Journal free: how many free blocks are there in the journal? * [j_state_lock] */ unsigned long j_free; /** * @j_first: * * The block number of the first usable block in the journal * [j_state_lock]. */ unsigned long j_first; /** * @j_last: * * The block number one beyond the last usable block in the journal * [j_state_lock]. */ unsigned long j_last; /** * @j_fc_first: * * The block number of the first fast commit block in the journal * [j_state_lock]. */ unsigned long j_fc_first; /** * @j_fc_off: * * Number of fast commit blocks currently allocated. Accessed only * during fast commit. Currently only process can do fast commit, so * this field is not protected by any lock. */ unsigned long j_fc_off; /** * @j_fc_last: * * The block number one beyond the last fast commit block in the journal * [j_state_lock]. */ unsigned long j_fc_last; /** * @j_dev: Device where we store the journal. */ struct block_device *j_dev; /** * @j_blocksize: Block size for the location where we store the journal. */ int j_blocksize; /** * @j_blk_offset: * * Starting block offset into the device where we store the journal. */ unsigned long long j_blk_offset; /** * @j_devname: Journal device name. 
*/ char j_devname[BDEVNAME_SIZE+24]; /** * @j_fs_dev: * * Device which holds the client fs. For internal journal this will be * equal to j_dev. */ struct block_device *j_fs_dev; /** * @j_fs_dev_wb_err: * * Records the errseq of the client fs's backing block device. */ errseq_t j_fs_dev_wb_err; /** * @j_total_len: Total maximum capacity of the journal region on disk. */ unsigned int j_total_len; /** * @j_reserved_credits: * * Number of buffers reserved from the running transaction. */ atomic_t j_reserved_credits; /** * @j_list_lock: Protects the buffer lists and internal buffer state. */ spinlock_t j_list_lock; /** * @j_inode: * * Optional inode where we store the journal. If present, all * journal block numbers are mapped into this inode via bmap(). */ struct inode *j_inode; /** * @j_tail_sequence: * * Sequence number of the oldest transaction in the log [j_state_lock] */ tid_t j_tail_sequence; /** * @j_transaction_sequence: * * Sequence number of the next transaction to grant [j_state_lock] */ tid_t j_transaction_sequence; /** * @j_commit_sequence: * * Sequence number of the most recently committed transaction * [j_state_lock, no lock for quick racy checks] */ tid_t j_commit_sequence; /** * @j_commit_request: * * Sequence number of the most recent transaction wanting commit * [j_state_lock, no lock for quick racy checks] */ tid_t j_commit_request; /** * @j_uuid: * * Journal uuid: identifies the object (filesystem, LVM volume etc) * backed by this journal. This will eventually be replaced by an array * of uuids, allowing us to index multiple devices within a single * journal and to perform atomic updates across them. */ __u8 j_uuid[16]; /** * @j_task: Pointer to the current commit thread for this journal. */ struct task_struct *j_task; /** * @j_max_transaction_buffers: * * Maximum number of metadata buffers to allow in a single compound * commit transaction. */ int j_max_transaction_buffers; /** * @j_revoke_records_per_block: * * Number of revoke records that fit in one descriptor block. */ int j_revoke_records_per_block; /** * @j_transaction_overhead_buffers: * * Number of blocks each transaction needs for its own bookkeeping */ int j_transaction_overhead_buffers; /** * @j_commit_interval: * * What is the maximum transaction lifetime before we begin a commit? */ unsigned long j_commit_interval; /** * @j_commit_timer: The timer used to wakeup the commit thread. */ struct timer_list j_commit_timer; /** * @j_revoke_lock: Protect the revoke table. */ spinlock_t j_revoke_lock; /** * @j_revoke: * * The revoke table - maintains the list of revoked blocks in the * current transaction. */ struct jbd2_revoke_table_s *j_revoke; /** * @j_revoke_table: Alternate revoke tables for j_revoke. */ struct jbd2_revoke_table_s *j_revoke_table[2]; /** * @j_wbuf: Array of bhs for jbd2_journal_commit_transaction. */ struct buffer_head **j_wbuf; /** * @j_fc_wbuf: Array of fast commit bhs for fast commit. Accessed only * during a fast commit. Currently only process can do fast commit, so * this field is not protected by any lock. */ struct buffer_head **j_fc_wbuf; /** * @j_wbufsize: * * Size of @j_wbuf array. */ int j_wbufsize; /** * @j_fc_wbufsize: * * Size of @j_fc_wbuf array. */ int j_fc_wbufsize; /** * @j_last_sync_writer: * * The pid of the last person to run a synchronous operation * through the journal. */ pid_t j_last_sync_writer; /** * @j_average_commit_time: * * The average amount of time in nanoseconds it takes to commit a * transaction to disk. 
[j_state_lock] */ u64 j_average_commit_time; /** * @j_min_batch_time: * * Minimum time that we should wait for additional filesystem operations * to get batched into a synchronous handle in microseconds. */ u32 j_min_batch_time; /** * @j_max_batch_time: * * Maximum time that we should wait for additional filesystem operations * to get batched into a synchronous handle in microseconds. */ u32 j_max_batch_time; /** * @j_commit_callback: * * This function is called when a transaction is closed. */ void (*j_commit_callback)(journal_t *, transaction_t *); /** * @j_submit_inode_data_buffers: * * This function is called for all inodes associated with the * committing transaction marked with JI_WRITE_DATA flag * before we start to write out the transaction to the journal. */ int (*j_submit_inode_data_buffers) (struct jbd2_inode *); /** * @j_finish_inode_data_buffers: * * This function is called for all inodes associated with the * committing transaction marked with JI_WAIT_DATA flag * after we have written the transaction to the journal * but before we write out the commit block. */ int (*j_finish_inode_data_buffers) (struct jbd2_inode *); /* * Journal statistics */ /** * @j_history_lock: Protect the transactions statistics history. */ spinlock_t j_history_lock; /** * @j_proc_entry: procfs entry for the jbd statistics directory. */ struct proc_dir_entry *j_proc_entry; /** * @j_stats: Overall statistics. */ struct transaction_stats_s j_stats; /** * @j_failed_commit: Failed journal commit ID. */ unsigned int j_failed_commit; /** * @j_private: * * An opaque pointer to fs-private information. ext3 puts its * superblock pointer here. */ void *j_private; /** * @j_csum_seed: * * Precomputed journal UUID checksum for seeding other checksums. */ __u32 j_csum_seed; #ifdef CONFIG_DEBUG_LOCK_ALLOC /** * @j_trans_commit_map: * * Lockdep entity to track transaction commit dependencies. Handles * hold this "lock" for read, when we wait for commit, we acquire the * "lock" for writing. This matches the properties of jbd2 journalling * where the running transaction has to wait for all handles to be * dropped to commit that transaction and also acquiring a handle may * require transaction commit to finish. */ struct lockdep_map j_trans_commit_map; #endif /** * @jbd2_trans_commit_key: * * "struct lock_class_key" for @j_trans_commit_map */ struct lock_class_key jbd2_trans_commit_key; /** * @j_fc_cleanup_callback: * * Clean-up after fast commit or full commit. JBD2 calls this function * after every commit operation. */ void (*j_fc_cleanup_callback)(struct journal_s *journal, int full, tid_t tid); /** * @j_fc_replay_callback: * * File-system specific function that performs replay of a fast * commit. JBD2 calls this function for each fast commit block found in * the journal. This function should return JBD2_FC_REPLAY_CONTINUE * to indicate that the block was processed correctly and more fast * commit replay should continue. Return value of JBD2_FC_REPLAY_STOP * indicates the end of replay (no more blocks remaining). A negative * return value indicates error. */ int (*j_fc_replay_callback)(struct journal_s *journal, struct buffer_head *bh, enum passtype pass, int off, tid_t expected_commit_id); /** * @j_bmap: * * Bmap function that should be used instead of the generic * VFS bmap function. 
*/ int (*j_bmap)(struct journal_s *journal, sector_t *block); }; #define jbd2_might_wait_for_commit(j) \ do { \ rwsem_acquire(&j->j_trans_commit_map, 0, 0, _THIS_IP_); \ rwsem_release(&j->j_trans_commit_map, _THIS_IP_); \ } while (0) /* * We can support any known requested features iff the * superblock is not in version 1. Otherwise we fail to support any * extended sb features. */ static inline bool jbd2_format_support_feature(journal_t *j) { return j->j_superblock->s_header.h_blocktype != cpu_to_be32(JBD2_SUPERBLOCK_V1); } /* journal feature predicate functions */ #define JBD2_FEATURE_COMPAT_FUNCS(name, flagname) \ static inline bool jbd2_has_feature_##name(journal_t *j) \ { \ return (jbd2_format_support_feature(j) && \ ((j)->j_superblock->s_feature_compat & \ cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname)) != 0); \ } \ static inline void jbd2_set_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_compat |= \ cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname); \ } \ static inline void jbd2_clear_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_compat &= \ ~cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname); \ } #define JBD2_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ static inline bool jbd2_has_feature_##name(journal_t *j) \ { \ return (jbd2_format_support_feature(j) && \ ((j)->j_superblock->s_feature_ro_compat & \ cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname)) != 0); \ } \ static inline void jbd2_set_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_ro_compat |= \ cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname); \ } \ static inline void jbd2_clear_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_ro_compat &= \ ~cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname); \ } #define JBD2_FEATURE_INCOMPAT_FUNCS(name, flagname) \ static inline bool jbd2_has_feature_##name(journal_t *j) \ { \ return (jbd2_format_support_feature(j) && \ ((j)->j_superblock->s_feature_incompat & \ cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname)) != 0); \ } \ static inline void jbd2_set_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_incompat |= \ cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname); \ } \ static inline void jbd2_clear_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_incompat &= \ ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname); \ } JBD2_FEATURE_COMPAT_FUNCS(checksum, CHECKSUM) JBD2_FEATURE_INCOMPAT_FUNCS(revoke, REVOKE) JBD2_FEATURE_INCOMPAT_FUNCS(64bit, 64BIT) JBD2_FEATURE_INCOMPAT_FUNCS(async_commit, ASYNC_COMMIT) JBD2_FEATURE_INCOMPAT_FUNCS(csum2, CSUM_V2) JBD2_FEATURE_INCOMPAT_FUNCS(csum3, CSUM_V3) JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit, FAST_COMMIT) /* Journal high priority write IO operation flags */ #define JBD2_JOURNAL_REQ_FLAGS (REQ_META | REQ_SYNC | REQ_IDLE) /* * Journal flag definitions */ #define JBD2_UNMOUNT 0x001 /* Journal thread is being destroyed */ #define JBD2_ABORT 0x002 /* Journaling has been aborted for errors. 
*/ #define JBD2_ACK_ERR 0x004 /* The errno in the sb has been acked */ #define JBD2_FLUSHED 0x008 /* The journal superblock has been flushed */ #define JBD2_LOADED 0x010 /* The journal superblock has been loaded */ #define JBD2_BARRIER 0x020 /* Use IDE barriers */ #define JBD2_CYCLE_RECORD 0x080 /* Journal cycled record log on * clean and empty filesystem * logging area */ #define JBD2_FAST_COMMIT_ONGOING 0x100 /* Fast commit is ongoing */ #define JBD2_FULL_COMMIT_ONGOING 0x200 /* Full commit is ongoing */ #define JBD2_JOURNAL_FLUSH_DISCARD 0x0001 #define JBD2_JOURNAL_FLUSH_ZEROOUT 0x0002 #define JBD2_JOURNAL_FLUSH_VALID (JBD2_JOURNAL_FLUSH_DISCARD | \ JBD2_JOURNAL_FLUSH_ZEROOUT) /* * Function declarations for the journaling transaction and buffer * management */ /* Filing buffers */ extern bool __jbd2_journal_refile_buffer(struct journal_head *); extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *); extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); extern void jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); static inline void jbd2_file_log_bh(struct list_head *head, struct buffer_head *bh) { list_add_tail(&bh->b_assoc_buffers, head); } static inline void jbd2_unfile_log_bh(struct buffer_head *bh) { list_del_init(&bh->b_assoc_buffers); } /* Log buffer allocation */ struct buffer_head *jbd2_journal_get_descriptor_buffer(transaction_t *, int); void jbd2_descriptor_block_csum_set(journal_t *, struct buffer_head *); int jbd2_journal_next_log_block(journal_t *, unsigned long long *); int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, unsigned long *block); int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); /* Commit management */ extern void jbd2_journal_commit_transaction(journal_t *); /* Checkpoint list management */ enum jbd2_shrink_type {JBD2_SHRINK_DESTROY, JBD2_SHRINK_BUSY_STOP, JBD2_SHRINK_BUSY_SKIP}; void __jbd2_journal_clean_checkpoint_list(journal_t *journal, enum jbd2_shrink_type type); unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan); int __jbd2_journal_remove_checkpoint(struct journal_head *); int jbd2_journal_try_remove_checkpoint(struct journal_head *jh); void jbd2_journal_destroy_checkpoint(journal_t *journal); void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *); /* * Triggers */ struct jbd2_buffer_trigger_type { /* * Fired a the moment data to write to the journal are known to be * stable - so either at the moment b_frozen_data is created or just * before a buffer is written to the journal. mapped_data is a mapped * buffer that is the frozen data for commit. */ void (*t_frozen)(struct jbd2_buffer_trigger_type *type, struct buffer_head *bh, void *mapped_data, size_t size); /* * Fired during journal abort for dirty buffers that will not be * committed. 
*/ void (*t_abort)(struct jbd2_buffer_trigger_type *type, struct buffer_head *bh); }; extern void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, struct jbd2_buffer_trigger_type *triggers); extern void jbd2_buffer_abort_trigger(struct journal_head *jh, struct jbd2_buffer_trigger_type *triggers); /* Buffer IO */ extern int jbd2_journal_write_metadata_buffer(transaction_t *transaction, struct journal_head *jh_in, struct buffer_head **bh_out, sector_t blocknr); /* Transaction cache support */ extern void jbd2_journal_destroy_transaction_cache(void); extern int __init jbd2_journal_init_transaction_cache(void); extern void jbd2_journal_free_transaction(transaction_t *); /* * Journal locking. * * We need to lock the journal during transaction state changes so that nobody * ever tries to take a handle on the running transaction while we are in the * middle of moving it to the commit phase. j_state_lock does this. * * Note that the locking is completely interrupt unsafe. We never touch * journal structures from interrupts. */ static inline handle_t *journal_current_handle(void) { return current->journal_info; } /* The journaling code user interface: * * Create and destroy handles * Register buffer modifications against the current transaction. */ extern handle_t *jbd2_journal_start(journal_t *, int nblocks); extern handle_t *jbd2__journal_start(journal_t *, int blocks, int rsv_blocks, int revoke_records, gfp_t gfp_mask, unsigned int type, unsigned int line_no); extern int jbd2_journal_restart(handle_t *, int nblocks); extern int jbd2__journal_restart(handle_t *, int nblocks, int revoke_records, gfp_t gfp_mask); extern int jbd2_journal_start_reserved(handle_t *handle, unsigned int type, unsigned int line_no); extern void jbd2_journal_free_reserved(handle_t *handle); extern int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records); extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *); extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *); extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *); void jbd2_journal_set_triggers(struct buffer_head *, struct jbd2_buffer_trigger_type *type); extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *); extern int jbd2_journal_forget (handle_t *, struct buffer_head *); int jbd2_journal_invalidate_folio(journal_t *, struct folio *, size_t offset, size_t length); bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio); extern int jbd2_journal_stop(handle_t *); extern int jbd2_journal_flush(journal_t *journal, unsigned int flags); extern void jbd2_journal_lock_updates (journal_t *); extern void jbd2_journal_unlock_updates (journal_t *); void jbd2_journal_wait_updates(journal_t *); extern journal_t * jbd2_journal_init_dev(struct block_device *bdev, struct block_device *fs_dev, unsigned long long start, int len, int bsize); extern journal_t * jbd2_journal_init_inode (struct inode *); extern int jbd2_journal_update_format (journal_t *); extern int jbd2_journal_check_used_features (journal_t *, unsigned long, unsigned long, unsigned long); extern int jbd2_journal_check_available_features (journal_t *, unsigned long, unsigned long, unsigned long); extern int jbd2_journal_set_features (journal_t *, unsigned long, unsigned long, unsigned long); extern void jbd2_journal_clear_features (journal_t *, unsigned long, unsigned long, unsigned long); extern int jbd2_journal_load (journal_t *journal); extern int jbd2_journal_destroy 
(journal_t *); extern int jbd2_journal_recover (journal_t *journal); extern int jbd2_journal_wipe (journal_t *, int); extern int jbd2_journal_skip_recovery (journal_t *); extern void jbd2_journal_update_sb_errno(journal_t *); extern int jbd2_journal_update_sb_log_tail (journal_t *, tid_t, unsigned long, blk_opf_t); extern void jbd2_journal_abort (journal_t *, int); extern int jbd2_journal_errno (journal_t *); extern void jbd2_journal_ack_err (journal_t *); extern int jbd2_journal_clear_err (journal_t *); extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); extern int jbd2_journal_force_commit(journal_t *); extern int jbd2_journal_force_commit_nested(journal_t *); extern int jbd2_journal_inode_ranged_write(handle_t *handle, struct jbd2_inode *inode, loff_t start_byte, loff_t length); extern int jbd2_journal_inode_ranged_wait(handle_t *handle, struct jbd2_inode *inode, loff_t start_byte, loff_t length); extern int jbd2_journal_finish_inode_data_buffers( struct jbd2_inode *jinode); extern int jbd2_journal_begin_ordered_truncate(journal_t *journal, struct jbd2_inode *inode, loff_t new_size); extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode); extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode); /* * journal_head management */ struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh); struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh); void jbd2_journal_put_journal_head(struct journal_head *jh); /* * handle management */ extern struct kmem_cache *jbd2_handle_cache; /* * This specialized allocator has to be a macro for its allocations to be * accounted separately (to have a separate alloc_tag). The typecast is * intentional to enforce typesafety. */ #define jbd2_alloc_handle(_gfp_flags) \ ((handle_t *)kmem_cache_zalloc(jbd2_handle_cache, _gfp_flags)) static inline void jbd2_free_handle(handle_t *handle) { kmem_cache_free(jbd2_handle_cache, handle); } /* * jbd2_inode management (optional, for those file systems that want to use * dynamically allocated jbd2_inode structures) */ extern struct kmem_cache *jbd2_inode_cache; /* * This specialized allocator has to be a macro for its allocations to be * accounted separately (to have a separate alloc_tag). The typecast is * intentional to enforce typesafety. 
*/ #define jbd2_alloc_inode(_gfp_flags) \ ((struct jbd2_inode *)kmem_cache_alloc(jbd2_inode_cache, _gfp_flags)) static inline void jbd2_free_inode(struct jbd2_inode *jinode) { kmem_cache_free(jbd2_inode_cache, jinode); } /* Primary revoke support */ #define JOURNAL_REVOKE_DEFAULT_HASH 256 extern int jbd2_journal_init_revoke(journal_t *, int); extern void jbd2_journal_destroy_revoke_record_cache(void); extern void jbd2_journal_destroy_revoke_table_cache(void); extern int __init jbd2_journal_init_revoke_record_cache(void); extern int __init jbd2_journal_init_revoke_table_cache(void); struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size); void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table); extern void jbd2_journal_destroy_revoke(journal_t *); extern int jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *); extern void jbd2_journal_cancel_revoke(handle_t *, struct journal_head *); extern void jbd2_journal_write_revoke_records(transaction_t *transaction, struct list_head *log_bufs); /* Recovery revoke support */ extern int jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t); extern int jbd2_journal_test_revoke(journal_t *, unsigned long long, tid_t); extern void jbd2_journal_clear_revoke(journal_t *); extern void jbd2_journal_switch_revoke_table(journal_t *journal); extern void jbd2_clear_buffer_revoked_flags(journal_t *journal); /* * The log thread user interface: * * Request space in the current transaction, and force transaction commit * transitions on demand. */ int jbd2_log_start_commit(journal_t *journal, tid_t tid); int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); int jbd2_log_wait_commit(journal_t *journal, tid_t tid); int jbd2_transaction_committed(journal_t *journal, tid_t tid); int jbd2_complete_transaction(journal_t *journal, tid_t tid); int jbd2_log_do_checkpoint(journal_t *journal); int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid); void __jbd2_log_wait_for_space(journal_t *journal); extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); extern int jbd2_cleanup_journal_tail(journal_t *); /* Fast commit related APIs */ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid); int jbd2_fc_end_commit(journal_t *journal); int jbd2_fc_end_commit_fallback(journal_t *journal); int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out); int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode); int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode); int jbd2_fc_wait_bufs(journal_t *journal, int num_blks); void jbd2_fc_release_bufs(journal_t *journal); /* * is_journal_abort * * Simple test wrapper function to test the JBD2_ABORT state flag. This * bit, when set, indicates that we have had a fatal error somewhere, * either inside the journaling layer or indicated to us by the client * (eg. ext3), and that we and should not commit any further * transactions. 
*/ static inline int is_journal_aborted(journal_t *journal) { return journal->j_flags & JBD2_ABORT; } static inline int is_handle_aborted(handle_t *handle) { if (handle->h_aborted || !handle->h_transaction) return 1; return is_journal_aborted(handle->h_transaction->t_journal); } static inline void jbd2_journal_abort_handle(handle_t *handle) { handle->h_aborted = 1; } static inline void jbd2_init_fs_dev_write_error(journal_t *journal) { struct address_space *mapping = journal->j_fs_dev->bd_mapping; /* * Save the original wb_err value of client fs's bdev mapping which * could be used to detect the client fs's metadata async write error. */ errseq_check_and_advance(&mapping->wb_err, &journal->j_fs_dev_wb_err); } static inline int jbd2_check_fs_dev_write_error(journal_t *journal) { struct address_space *mapping = journal->j_fs_dev->bd_mapping; return errseq_check(&mapping->wb_err, READ_ONCE(journal->j_fs_dev_wb_err)); } #endif /* __KERNEL__ */ /* Comparison functions for transaction IDs: perform comparisons using * modulo arithmetic so that they work over sequence number wraps. */ static inline int tid_gt(tid_t x, tid_t y) { int difference = (x - y); return (difference > 0); } static inline int tid_geq(tid_t x, tid_t y) { int difference = (x - y); return (difference >= 0); } extern int jbd2_journal_blocks_per_folio(struct inode *inode); extern size_t journal_tag_bytes(journal_t *journal); static inline int jbd2_journal_has_csum_v2or3(journal_t *journal) { return jbd2_has_feature_csum2(journal) || jbd2_has_feature_csum3(journal); } static inline int jbd2_journal_get_num_fc_blks(journal_superblock_t *jsb) { int num_fc_blocks = be32_to_cpu(jsb->s_num_fc_blks); return num_fc_blocks ? num_fc_blocks : JBD2_DEFAULT_FAST_COMMIT_BLOCKS; } /* * Return number of free blocks in the log. Must be called under j_state_lock. 
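 *
 * The calculation below keeps a 32-block safety margin for rounding errors
 * and subtracts the outstanding credits of a committing transaction, if any,
 * clamping a negative intermediate result to zero.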
*/ static inline unsigned long jbd2_log_space_left(journal_t *journal) { /* Allow for rounding errors */ long free = journal->j_free - 32; if (journal->j_committing_transaction) { free -= atomic_read(&journal-> j_committing_transaction->t_outstanding_credits); } return max_t(long, free, 0); } /* * Definitions which augment the buffer_head layer */ /* journaling buffer types */ #define BJ_None 0 /* Not journaled */ #define BJ_Metadata 1 /* Normal journaled metadata */ #define BJ_Forget 2 /* Buffer superseded by this transaction */ #define BJ_Shadow 3 /* Buffer contents being shadowed to the log */ #define BJ_Reserved 4 /* Buffer is reserved for access by journal */ #define BJ_Types 5 static inline u32 jbd2_chksum(u32 crc, const void *address, unsigned int length) { return crc32c(crc, address, length); } /* Return most recent uncommitted transaction */ static inline tid_t jbd2_get_latest_transaction(journal_t *journal) { tid_t tid; read_lock(&journal->j_state_lock); tid = journal->j_commit_request; if (journal->j_running_transaction) tid = journal->j_running_transaction->t_tid; read_unlock(&journal->j_state_lock); return tid; } static inline int jbd2_handle_buffer_credits(handle_t *handle) { journal_t *journal; if (!handle->h_reserved) journal = handle->h_transaction->t_journal; else journal = handle->h_journal; return handle->h_total_credits - DIV_ROUND_UP(handle->h_revoke_credits_requested, journal->j_revoke_records_per_block); } #ifdef __KERNEL__ #define buffer_trace_init(bh) do {} while (0) #define print_buffer_fields(bh) do {} while (0) #define print_buffer_trace(bh) do {} while (0) #define BUFFER_TRACE(bh, info) do {} while (0) #define BUFFER_TRACE2(bh, bh2, info) do {} while (0) #define JBUFFER_TRACE(jh, info) do {} while (0) #endif /* __KERNEL__ */ #endif /* _LINUX_JBD2_H */ |
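/*
 * Illustrative sketch, not part of jbd2.h: the tid_gt()/tid_geq() helpers
 * defined above compare transaction IDs through signed modulo arithmetic, so
 * the ordering stays correct even after the 32-bit sequence counter wraps.
 * The standalone program below only demonstrates that idea; the demo_tid_t
 * typedef and demo_tid_gt() helper are assumptions made to keep it
 * self-contained, not kernel code.
 */
#include <stdio.h>

typedef unsigned int demo_tid_t;	/* assumed 32-bit counter, like tid_t */

static int demo_tid_gt(demo_tid_t x, demo_tid_t y)
{
	int difference = (x - y);	/* unsigned subtraction, viewed as signed */

	return difference > 0;
}

int main(void)
{
	demo_tid_t old_tid = 0xfffffffeU;	/* just before wraparound */
	demo_tid_t new_tid = 0x00000003U;	/* shortly after wraparound */

	/* Prints 1: new_tid orders after old_tid despite being numerically smaller. */
	printf("%d\n", demo_tid_gt(new_tid, old_tid));
	return 0;
}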
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/interval_tree.c - interval tree for mapping->i_mmap
 *
 * Copyright (C) 2012, Michel Lespinasse <walken@google.com>
 */

#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/rmap.h>
#include <linux/interval_tree_generic.h>

static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
{
	return v->vm_pgoff;
}

/* The last page offset covered by the VMA. */
static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
{
	return v->vm_pgoff + vma_pages(v) - 1;
}

INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb,
		     unsigned long, shared.rb_subtree_last,
		     vma_start_pgoff, vma_last_pgoff, /* empty */, vma_interval_tree)

/* Insert node immediately after prev in the interval tree */
void vma_interval_tree_insert_after(struct vm_area_struct *node,
				    struct vm_area_struct *prev,
				    struct rb_root_cached *root)
{
	struct rb_node **link;
	struct vm_area_struct *parent;
	unsigned long last = vma_last_pgoff(node);

	VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node);

	if (!prev->shared.rb.rb_right) {
		parent = prev;
		link = &prev->shared.rb.rb_right;
	} else {
		parent = rb_entry(prev->shared.rb.rb_right,
				  struct vm_area_struct, shared.rb);
		if (parent->shared.rb_subtree_last < last)
			parent->shared.rb_subtree_last = last;
		while (parent->shared.rb.rb_left) {
			parent = rb_entry(parent->shared.rb.rb_left,
					  struct vm_area_struct, shared.rb);
			if (parent->shared.rb_subtree_last < last)
				parent->shared.rb_subtree_last = last;
		}
		link = &parent->shared.rb.rb_left;
	}

	node->shared.rb_subtree_last = last;
	rb_link_node(&node->shared.rb, &parent->shared.rb, link);
	rb_insert_augmented(&node->shared.rb, &root->rb_root,
			    &vma_interval_tree_augment);
}

static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc)
{
	return vma_start_pgoff(avc->vma);
}

static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc)
{
	return vma_last_pgoff(avc->vma);
}

INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last,
		     avc_start_pgoff, avc_last_pgoff,
		     static inline, __anon_vma_interval_tree)

void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
				   struct rb_root_cached *root)
{
#ifdef CONFIG_DEBUG_VM_RB
	node->cached_vma_start = avc_start_pgoff(node);
	node->cached_vma_last = avc_last_pgoff(node);
#endif
	__anon_vma_interval_tree_insert(node, root);
}

void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
				   struct rb_root_cached *root)
{
	__anon_vma_interval_tree_remove(node, root);
}

struct anon_vma_chain *
anon_vma_interval_tree_iter_first(struct rb_root_cached *root,
				  unsigned long first, unsigned long last)
{
	return __anon_vma_interval_tree_iter_first(root, first, last);
}

struct anon_vma_chain *
anon_vma_interval_tree_iter_next(struct anon_vma_chain *node,
				 unsigned long first, unsigned long last)
{
	return __anon_vma_interval_tree_iter_next(node, first, last);
}

#ifdef CONFIG_DEBUG_VM_RB
void anon_vma_interval_tree_verify(struct anon_vma_chain *node)
{
	WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node));
	WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node));
}
#endif
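/*
 * Illustrative sketch, not part of mm/interval_tree.c: INTERVAL_TREE_DEFINE()
 * above generates vma_interval_tree_iter_first() and
 * vma_interval_tree_iter_next(), which rmap code uses to visit every VMA whose
 * file range overlaps [first, last].  The helper below is an assumed example
 * showing only the calling pattern, not a function that exists in mm/.
 */
static unsigned long count_overlapping_vmas(struct rb_root_cached *root,
					    unsigned long first,
					    unsigned long last)
{
	struct vm_area_struct *vma;
	unsigned long n = 0;

	for (vma = vma_interval_tree_iter_first(root, first, last);
	     vma;
	     vma = vma_interval_tree_iter_next(vma, first, last))
		n++;	/* each iteration yields a VMA overlapping the range */

	return n;
}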
// SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/common.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include <linux/uaccess.h> #include <linux/slab.h> #include
<linux/security.h> #include <linux/string_helpers.h> #include "common.h" /* String table for operation mode. */ const char * const tomoyo_mode[TOMOYO_CONFIG_MAX_MODE] = { [TOMOYO_CONFIG_DISABLED] = "disabled", [TOMOYO_CONFIG_LEARNING] = "learning", [TOMOYO_CONFIG_PERMISSIVE] = "permissive", [TOMOYO_CONFIG_ENFORCING] = "enforcing" }; /* String table for /sys/kernel/security/tomoyo/profile */ const char * const tomoyo_mac_keywords[TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX] = { /* CONFIG::file group */ [TOMOYO_MAC_FILE_EXECUTE] = "execute", [TOMOYO_MAC_FILE_OPEN] = "open", [TOMOYO_MAC_FILE_CREATE] = "create", [TOMOYO_MAC_FILE_UNLINK] = "unlink", [TOMOYO_MAC_FILE_GETATTR] = "getattr", [TOMOYO_MAC_FILE_MKDIR] = "mkdir", [TOMOYO_MAC_FILE_RMDIR] = "rmdir", [TOMOYO_MAC_FILE_MKFIFO] = "mkfifo", [TOMOYO_MAC_FILE_MKSOCK] = "mksock", [TOMOYO_MAC_FILE_TRUNCATE] = "truncate", [TOMOYO_MAC_FILE_SYMLINK] = "symlink", [TOMOYO_MAC_FILE_MKBLOCK] = "mkblock", [TOMOYO_MAC_FILE_MKCHAR] = "mkchar", [TOMOYO_MAC_FILE_LINK] = "link", [TOMOYO_MAC_FILE_RENAME] = "rename", [TOMOYO_MAC_FILE_CHMOD] = "chmod", [TOMOYO_MAC_FILE_CHOWN] = "chown", [TOMOYO_MAC_FILE_CHGRP] = "chgrp", [TOMOYO_MAC_FILE_IOCTL] = "ioctl", [TOMOYO_MAC_FILE_CHROOT] = "chroot", [TOMOYO_MAC_FILE_MOUNT] = "mount", [TOMOYO_MAC_FILE_UMOUNT] = "unmount", [TOMOYO_MAC_FILE_PIVOT_ROOT] = "pivot_root", /* CONFIG::network group */ [TOMOYO_MAC_NETWORK_INET_STREAM_BIND] = "inet_stream_bind", [TOMOYO_MAC_NETWORK_INET_STREAM_LISTEN] = "inet_stream_listen", [TOMOYO_MAC_NETWORK_INET_STREAM_CONNECT] = "inet_stream_connect", [TOMOYO_MAC_NETWORK_INET_DGRAM_BIND] = "inet_dgram_bind", [TOMOYO_MAC_NETWORK_INET_DGRAM_SEND] = "inet_dgram_send", [TOMOYO_MAC_NETWORK_INET_RAW_BIND] = "inet_raw_bind", [TOMOYO_MAC_NETWORK_INET_RAW_SEND] = "inet_raw_send", [TOMOYO_MAC_NETWORK_UNIX_STREAM_BIND] = "unix_stream_bind", [TOMOYO_MAC_NETWORK_UNIX_STREAM_LISTEN] = "unix_stream_listen", [TOMOYO_MAC_NETWORK_UNIX_STREAM_CONNECT] = "unix_stream_connect", [TOMOYO_MAC_NETWORK_UNIX_DGRAM_BIND] = "unix_dgram_bind", [TOMOYO_MAC_NETWORK_UNIX_DGRAM_SEND] = "unix_dgram_send", [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_BIND] = "unix_seqpacket_bind", [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_LISTEN] = "unix_seqpacket_listen", [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_CONNECT] = "unix_seqpacket_connect", /* CONFIG::misc group */ [TOMOYO_MAC_ENVIRON] = "env", /* CONFIG group */ [TOMOYO_MAX_MAC_INDEX + TOMOYO_MAC_CATEGORY_FILE] = "file", [TOMOYO_MAX_MAC_INDEX + TOMOYO_MAC_CATEGORY_NETWORK] = "network", [TOMOYO_MAX_MAC_INDEX + TOMOYO_MAC_CATEGORY_MISC] = "misc", }; /* String table for conditions. 
*/ const char * const tomoyo_condition_keyword[TOMOYO_MAX_CONDITION_KEYWORD] = { [TOMOYO_TASK_UID] = "task.uid", [TOMOYO_TASK_EUID] = "task.euid", [TOMOYO_TASK_SUID] = "task.suid", [TOMOYO_TASK_FSUID] = "task.fsuid", [TOMOYO_TASK_GID] = "task.gid", [TOMOYO_TASK_EGID] = "task.egid", [TOMOYO_TASK_SGID] = "task.sgid", [TOMOYO_TASK_FSGID] = "task.fsgid", [TOMOYO_TASK_PID] = "task.pid", [TOMOYO_TASK_PPID] = "task.ppid", [TOMOYO_EXEC_ARGC] = "exec.argc", [TOMOYO_EXEC_ENVC] = "exec.envc", [TOMOYO_TYPE_IS_SOCKET] = "socket", [TOMOYO_TYPE_IS_SYMLINK] = "symlink", [TOMOYO_TYPE_IS_FILE] = "file", [TOMOYO_TYPE_IS_BLOCK_DEV] = "block", [TOMOYO_TYPE_IS_DIRECTORY] = "directory", [TOMOYO_TYPE_IS_CHAR_DEV] = "char", [TOMOYO_TYPE_IS_FIFO] = "fifo", [TOMOYO_MODE_SETUID] = "setuid", [TOMOYO_MODE_SETGID] = "setgid", [TOMOYO_MODE_STICKY] = "sticky", [TOMOYO_MODE_OWNER_READ] = "owner_read", [TOMOYO_MODE_OWNER_WRITE] = "owner_write", [TOMOYO_MODE_OWNER_EXECUTE] = "owner_execute", [TOMOYO_MODE_GROUP_READ] = "group_read", [TOMOYO_MODE_GROUP_WRITE] = "group_write", [TOMOYO_MODE_GROUP_EXECUTE] = "group_execute", [TOMOYO_MODE_OTHERS_READ] = "others_read", [TOMOYO_MODE_OTHERS_WRITE] = "others_write", [TOMOYO_MODE_OTHERS_EXECUTE] = "others_execute", [TOMOYO_EXEC_REALPATH] = "exec.realpath", [TOMOYO_SYMLINK_TARGET] = "symlink.target", [TOMOYO_PATH1_UID] = "path1.uid", [TOMOYO_PATH1_GID] = "path1.gid", [TOMOYO_PATH1_INO] = "path1.ino", [TOMOYO_PATH1_MAJOR] = "path1.major", [TOMOYO_PATH1_MINOR] = "path1.minor", [TOMOYO_PATH1_PERM] = "path1.perm", [TOMOYO_PATH1_TYPE] = "path1.type", [TOMOYO_PATH1_DEV_MAJOR] = "path1.dev_major", [TOMOYO_PATH1_DEV_MINOR] = "path1.dev_minor", [TOMOYO_PATH2_UID] = "path2.uid", [TOMOYO_PATH2_GID] = "path2.gid", [TOMOYO_PATH2_INO] = "path2.ino", [TOMOYO_PATH2_MAJOR] = "path2.major", [TOMOYO_PATH2_MINOR] = "path2.minor", [TOMOYO_PATH2_PERM] = "path2.perm", [TOMOYO_PATH2_TYPE] = "path2.type", [TOMOYO_PATH2_DEV_MAJOR] = "path2.dev_major", [TOMOYO_PATH2_DEV_MINOR] = "path2.dev_minor", [TOMOYO_PATH1_PARENT_UID] = "path1.parent.uid", [TOMOYO_PATH1_PARENT_GID] = "path1.parent.gid", [TOMOYO_PATH1_PARENT_INO] = "path1.parent.ino", [TOMOYO_PATH1_PARENT_PERM] = "path1.parent.perm", [TOMOYO_PATH2_PARENT_UID] = "path2.parent.uid", [TOMOYO_PATH2_PARENT_GID] = "path2.parent.gid", [TOMOYO_PATH2_PARENT_INO] = "path2.parent.ino", [TOMOYO_PATH2_PARENT_PERM] = "path2.parent.perm", }; /* String table for PREFERENCE keyword. */ static const char * const tomoyo_pref_keywords[TOMOYO_MAX_PREF] = { [TOMOYO_PREF_MAX_AUDIT_LOG] = "max_audit_log", [TOMOYO_PREF_MAX_LEARNING_ENTRY] = "max_learning_entry", }; /* String table for path operation. */ const char * const tomoyo_path_keyword[TOMOYO_MAX_PATH_OPERATION] = { [TOMOYO_TYPE_EXECUTE] = "execute", [TOMOYO_TYPE_READ] = "read", [TOMOYO_TYPE_WRITE] = "write", [TOMOYO_TYPE_APPEND] = "append", [TOMOYO_TYPE_UNLINK] = "unlink", [TOMOYO_TYPE_GETATTR] = "getattr", [TOMOYO_TYPE_RMDIR] = "rmdir", [TOMOYO_TYPE_TRUNCATE] = "truncate", [TOMOYO_TYPE_SYMLINK] = "symlink", [TOMOYO_TYPE_CHROOT] = "chroot", [TOMOYO_TYPE_UMOUNT] = "unmount", }; /* String table for socket's operation. */ const char * const tomoyo_socket_keyword[TOMOYO_MAX_NETWORK_OPERATION] = { [TOMOYO_NETWORK_BIND] = "bind", [TOMOYO_NETWORK_LISTEN] = "listen", [TOMOYO_NETWORK_CONNECT] = "connect", [TOMOYO_NETWORK_SEND] = "send", }; /* String table for categories. 
*/ static const char * const tomoyo_category_keywords [TOMOYO_MAX_MAC_CATEGORY_INDEX] = { [TOMOYO_MAC_CATEGORY_FILE] = "file", [TOMOYO_MAC_CATEGORY_NETWORK] = "network", [TOMOYO_MAC_CATEGORY_MISC] = "misc", }; /* Permit policy management by non-root user? */ static bool tomoyo_manage_by_non_root; /* Utility functions. */ /** * tomoyo_addprintf - strncat()-like-snprintf(). * * @buffer: Buffer to write to. Must be '\0'-terminated. * @len: Size of @buffer. * @fmt: The printf()'s format string, followed by parameters. * * Returns nothing. */ __printf(3, 4) static void tomoyo_addprintf(char *buffer, int len, const char *fmt, ...) { va_list args; const int pos = strlen(buffer); va_start(args, fmt); vsnprintf(buffer + pos, len - pos - 1, fmt, args); va_end(args); } /** * tomoyo_flush - Flush queued string to userspace's buffer. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns true if all data was flushed, false otherwise. */ static bool tomoyo_flush(struct tomoyo_io_buffer *head) { while (head->r.w_pos) { const char *w = head->r.w[0]; size_t len = strlen(w); if (len) { if (len > head->read_user_buf_avail) len = head->read_user_buf_avail; if (!len) return false; if (copy_to_user(head->read_user_buf, w, len)) return false; head->read_user_buf_avail -= len; head->read_user_buf += len; w += len; } head->r.w[0] = w; if (*w) return false; /* Add '\0' for audit logs and query. */ if (head->poll) { if (!head->read_user_buf_avail || copy_to_user(head->read_user_buf, "", 1)) return false; head->read_user_buf_avail--; head->read_user_buf++; } head->r.w_pos--; for (len = 0; len < head->r.w_pos; len++) head->r.w[len] = head->r.w[len + 1]; } head->r.avail = 0; return true; } /** * tomoyo_set_string - Queue string to "struct tomoyo_io_buffer" structure. * * @head: Pointer to "struct tomoyo_io_buffer". * @string: String to print. * * Note that @string has to be kept valid until @head is kfree()d. * This means that char[] allocated on stack memory cannot be passed to * this function. Use tomoyo_io_printf() for char[] allocated on stack memory. */ static void tomoyo_set_string(struct tomoyo_io_buffer *head, const char *string) { if (head->r.w_pos < TOMOYO_MAX_IO_READ_QUEUE) { head->r.w[head->r.w_pos++] = string; tomoyo_flush(head); } else WARN_ON(1); } static void tomoyo_io_printf(struct tomoyo_io_buffer *head, const char *fmt, ...) __printf(2, 3); /** * tomoyo_io_printf - printf() to "struct tomoyo_io_buffer" structure. * * @head: Pointer to "struct tomoyo_io_buffer". * @fmt: The printf()'s format string, followed by parameters. */ static void tomoyo_io_printf(struct tomoyo_io_buffer *head, const char *fmt, ...) __must_hold(&head->io_sem) { va_list args; size_t len; size_t pos = head->r.avail; int size = head->readbuf_size - pos; if (size <= 0) return; va_start(args, fmt); len = vsnprintf(head->read_buf + pos, size, fmt, args) + 1; va_end(args); if (pos + len >= head->readbuf_size) { WARN_ON(1); return; } head->r.avail += len; tomoyo_set_string(head, head->read_buf + pos); } /** * tomoyo_set_space - Put a space to "struct tomoyo_io_buffer" structure. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns nothing. */ static void tomoyo_set_space(struct tomoyo_io_buffer *head) { tomoyo_set_string(head, " "); } /** * tomoyo_set_lf - Put a line feed to "struct tomoyo_io_buffer" structure. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns nothing. 
*/ static bool tomoyo_set_lf(struct tomoyo_io_buffer *head) { tomoyo_set_string(head, "\n"); return !head->r.w_pos; } /** * tomoyo_set_slash - Put a shash to "struct tomoyo_io_buffer" structure. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns nothing. */ static void tomoyo_set_slash(struct tomoyo_io_buffer *head) { tomoyo_set_string(head, "/"); } /* List of namespaces. */ LIST_HEAD(tomoyo_namespace_list); /* True if namespace other than tomoyo_kernel_namespace is defined. */ static bool tomoyo_namespace_enabled; /** * tomoyo_init_policy_namespace - Initialize namespace. * * @ns: Pointer to "struct tomoyo_policy_namespace". * * Returns nothing. */ void tomoyo_init_policy_namespace(struct tomoyo_policy_namespace *ns) { unsigned int idx; for (idx = 0; idx < TOMOYO_MAX_ACL_GROUPS; idx++) INIT_LIST_HEAD(&ns->acl_group[idx]); for (idx = 0; idx < TOMOYO_MAX_GROUP; idx++) INIT_LIST_HEAD(&ns->group_list[idx]); for (idx = 0; idx < TOMOYO_MAX_POLICY; idx++) INIT_LIST_HEAD(&ns->policy_list[idx]); ns->profile_version = 20150505; tomoyo_namespace_enabled = !list_empty(&tomoyo_namespace_list); list_add_tail_rcu(&ns->namespace_list, &tomoyo_namespace_list); } /** * tomoyo_print_namespace - Print namespace header. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns nothing. */ static void tomoyo_print_namespace(struct tomoyo_io_buffer *head) { if (!tomoyo_namespace_enabled) return; tomoyo_set_string(head, container_of(head->r.ns, struct tomoyo_policy_namespace, namespace_list)->name); tomoyo_set_space(head); } /** * tomoyo_print_name_union - Print a tomoyo_name_union. * * @head: Pointer to "struct tomoyo_io_buffer". * @ptr: Pointer to "struct tomoyo_name_union". */ static void tomoyo_print_name_union(struct tomoyo_io_buffer *head, const struct tomoyo_name_union *ptr) { tomoyo_set_space(head); if (ptr->group) { tomoyo_set_string(head, "@"); tomoyo_set_string(head, ptr->group->group_name->name); } else { tomoyo_set_string(head, ptr->filename->name); } } /** * tomoyo_print_name_union_quoted - Print a tomoyo_name_union with a quote. * * @head: Pointer to "struct tomoyo_io_buffer". * @ptr: Pointer to "struct tomoyo_name_union". * * Returns nothing. */ static void tomoyo_print_name_union_quoted(struct tomoyo_io_buffer *head, const struct tomoyo_name_union *ptr) { if (ptr->group) { tomoyo_set_string(head, "@"); tomoyo_set_string(head, ptr->group->group_name->name); } else { tomoyo_set_string(head, "\""); tomoyo_set_string(head, ptr->filename->name); tomoyo_set_string(head, "\""); } } /** * tomoyo_print_number_union_nospace - Print a tomoyo_number_union without a space. * * @head: Pointer to "struct tomoyo_io_buffer". * @ptr: Pointer to "struct tomoyo_number_union". * * Returns nothing. 
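 *
 * A single value is printed in the radix it was recorded with (decimal,
 * octal, or "0x" hexadecimal); when the two stored values differ, they are
 * emitted as a "min-max" range in that same style.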
*/ static void tomoyo_print_number_union_nospace(struct tomoyo_io_buffer *head, const struct tomoyo_number_union *ptr) __must_hold(&head->io_sem) { if (ptr->group) { tomoyo_set_string(head, "@"); tomoyo_set_string(head, ptr->group->group_name->name); } else { int i; unsigned long min = ptr->values[0]; const unsigned long max = ptr->values[1]; u8 min_type = ptr->value_type[0]; const u8 max_type = ptr->value_type[1]; char buffer[128]; buffer[0] = '\0'; for (i = 0; i < 2; i++) { switch (min_type) { case TOMOYO_VALUE_TYPE_HEXADECIMAL: tomoyo_addprintf(buffer, sizeof(buffer), "0x%lX", min); break; case TOMOYO_VALUE_TYPE_OCTAL: tomoyo_addprintf(buffer, sizeof(buffer), "0%lo", min); break; default: tomoyo_addprintf(buffer, sizeof(buffer), "%lu", min); break; } if (min == max && min_type == max_type) break; tomoyo_addprintf(buffer, sizeof(buffer), "-"); min_type = max_type; min = max; } tomoyo_io_printf(head, "%s", buffer); } } /** * tomoyo_print_number_union - Print a tomoyo_number_union. * * @head: Pointer to "struct tomoyo_io_buffer". * @ptr: Pointer to "struct tomoyo_number_union". * * Returns nothing. */ static void tomoyo_print_number_union(struct tomoyo_io_buffer *head, const struct tomoyo_number_union *ptr) __must_hold(&head->io_sem) { tomoyo_set_space(head); tomoyo_print_number_union_nospace(head, ptr); } /** * tomoyo_assign_profile - Create a new profile. * * @ns: Pointer to "struct tomoyo_policy_namespace". * @profile: Profile number to create. * * Returns pointer to "struct tomoyo_profile" on success, NULL otherwise. */ static struct tomoyo_profile *tomoyo_assign_profile (struct tomoyo_policy_namespace *ns, const unsigned int profile) { struct tomoyo_profile *ptr; struct tomoyo_profile *entry; if (profile >= TOMOYO_MAX_PROFILES) return NULL; ptr = ns->profile_ptr[profile]; if (ptr) return ptr; entry = kzalloc_obj(*entry, GFP_NOFS | __GFP_NOWARN); if (mutex_lock_interruptible(&tomoyo_policy_lock)) goto out; ptr = ns->profile_ptr[profile]; if (!ptr && tomoyo_memory_ok(entry)) { ptr = entry; ptr->default_config = TOMOYO_CONFIG_DISABLED | TOMOYO_CONFIG_WANT_GRANT_LOG | TOMOYO_CONFIG_WANT_REJECT_LOG; memset(ptr->config, TOMOYO_CONFIG_USE_DEFAULT, sizeof(ptr->config)); ptr->pref[TOMOYO_PREF_MAX_AUDIT_LOG] = CONFIG_SECURITY_TOMOYO_MAX_AUDIT_LOG; ptr->pref[TOMOYO_PREF_MAX_LEARNING_ENTRY] = CONFIG_SECURITY_TOMOYO_MAX_ACCEPT_ENTRY; mb(); /* Avoid out-of-order execution. */ ns->profile_ptr[profile] = ptr; entry = NULL; } mutex_unlock(&tomoyo_policy_lock); out: kfree(entry); return ptr; } /** * tomoyo_profile - Find a profile. * * @ns: Pointer to "struct tomoyo_policy_namespace". * @profile: Profile number to find. * * Returns pointer to "struct tomoyo_profile". */ struct tomoyo_profile *tomoyo_profile(const struct tomoyo_policy_namespace *ns, const u8 profile) { static struct tomoyo_profile tomoyo_null_profile; struct tomoyo_profile *ptr = ns->profile_ptr[profile]; if (!ptr) ptr = &tomoyo_null_profile; return ptr; } /** * tomoyo_find_yesno - Find values for specified keyword. * * @string: String to check. * @find: Name of keyword. * * Returns 1 if "@find=yes" was found, 0 if "@find=no" was found, -1 otherwise. */ static s8 tomoyo_find_yesno(const char *string, const char *find) { const char *cp = strstr(string, find); if (cp) { cp += strlen(find); if (!strncmp(cp, "=yes", 4)) return 1; else if (!strncmp(cp, "=no", 3)) return 0; } return -1; } /** * tomoyo_set_uint - Set value for specified preference. * * @i: Pointer to "unsigned int". * @string: String to check. * @find: Name of keyword. 
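 *
 * For example, with @string "max_audit_log=1024 max_learning_entry=2048" and
 * @find "max_audit_log", the value 1024 is stored into *@i.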
* * Returns nothing. */ static void tomoyo_set_uint(unsigned int *i, const char *string, const char *find) { const char *cp = strstr(string, find); if (cp) sscanf(cp + strlen(find), "=%u", i); } /** * tomoyo_set_mode - Set mode for specified profile. * * @name: Name of functionality. * @value: Mode for @name. * @profile: Pointer to "struct tomoyo_profile". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_set_mode(char *name, const char *value, struct tomoyo_profile *profile) { u8 i; u8 config; if (!strcmp(name, "CONFIG")) { i = TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX; config = profile->default_config; } else if (tomoyo_str_starts(&name, "CONFIG::")) { config = 0; for (i = 0; i < TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX; i++) { int len = 0; if (i < TOMOYO_MAX_MAC_INDEX) { const u8 c = tomoyo_index2category[i]; const char *category = tomoyo_category_keywords[c]; len = strlen(category); if (strncmp(name, category, len) || name[len++] != ':' || name[len++] != ':') continue; } if (strcmp(name + len, tomoyo_mac_keywords[i])) continue; config = profile->config[i]; break; } if (i == TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX) return -EINVAL; } else { return -EINVAL; } if (strstr(value, "use_default")) { config = TOMOYO_CONFIG_USE_DEFAULT; } else { u8 mode; for (mode = 0; mode < 4; mode++) if (strstr(value, tomoyo_mode[mode])) /* * Update lower 3 bits in order to distinguish * 'config' from 'TOMOYO_CONFIG_USE_DEFAULT'. */ config = (config & ~7) | mode; if (config != TOMOYO_CONFIG_USE_DEFAULT) { switch (tomoyo_find_yesno(value, "grant_log")) { case 1: config |= TOMOYO_CONFIG_WANT_GRANT_LOG; break; case 0: config &= ~TOMOYO_CONFIG_WANT_GRANT_LOG; break; } switch (tomoyo_find_yesno(value, "reject_log")) { case 1: config |= TOMOYO_CONFIG_WANT_REJECT_LOG; break; case 0: config &= ~TOMOYO_CONFIG_WANT_REJECT_LOG; break; } } } if (i < TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX) profile->config[i] = config; else if (config != TOMOYO_CONFIG_USE_DEFAULT) profile->default_config = config; return 0; } /** * tomoyo_write_profile - Write profile table. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_write_profile(struct tomoyo_io_buffer *head) __must_hold(&head->io_sem) { char *data = head->write_buf; unsigned int i; char *cp; struct tomoyo_profile *profile; if (sscanf(data, "PROFILE_VERSION=%u", &head->w.ns->profile_version) == 1) return 0; i = simple_strtoul(data, &cp, 10); if (*cp != '-') return -EINVAL; data = cp + 1; profile = tomoyo_assign_profile(head->w.ns, i); if (!profile) return -EINVAL; cp = strchr(data, '='); if (!cp) return -EINVAL; *cp++ = '\0'; if (!strcmp(data, "COMMENT")) { static DEFINE_SPINLOCK(lock); const struct tomoyo_path_info *new_comment = tomoyo_get_name(cp); const struct tomoyo_path_info *old_comment; if (!new_comment) return -ENOMEM; spin_lock(&lock); old_comment = profile->comment; profile->comment = new_comment; spin_unlock(&lock); tomoyo_put_name(old_comment); return 0; } if (!strcmp(data, "PREFERENCE")) { for (i = 0; i < TOMOYO_MAX_PREF; i++) tomoyo_set_uint(&profile->pref[i], cp, tomoyo_pref_keywords[i]); return 0; } return tomoyo_set_mode(data, cp, profile); } /** * tomoyo_print_config - Print mode for specified functionality. * * @head: Pointer to "struct tomoyo_io_buffer". * @config: Mode for that functionality. * * Returns nothing. * * Caller prints functionality's name. 
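 *
 * The emitted fragment looks like "={ mode=enforcing grant_log=no reject_log=yes }",
 * matching the format string used in the body below.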
*/ static void tomoyo_print_config(struct tomoyo_io_buffer *head, const u8 config) __must_hold(&head->io_sem) { tomoyo_io_printf(head, "={ mode=%s grant_log=%s reject_log=%s }\n", tomoyo_mode[config & 3], str_yes_no(config & TOMOYO_CONFIG_WANT_GRANT_LOG), str_yes_no(config & TOMOYO_CONFIG_WANT_REJECT_LOG)); } /** * tomoyo_read_profile - Read profile table. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns nothing. */ static void tomoyo_read_profile(struct tomoyo_io_buffer *head) __must_hold(&head->io_sem) { u8 index; struct tomoyo_policy_namespace *ns = container_of(head->r.ns, typeof(*ns), namespace_list); const struct tomoyo_profile *profile; if (head->r.eof) return; next: index = head->r.index; profile = ns->profile_ptr[index]; switch (head->r.step) { case 0: tomoyo_print_namespace(head); tomoyo_io_printf(head, "PROFILE_VERSION=%u\n", ns->profile_version); head->r.step++; break; case 1: for ( ; head->r.index < TOMOYO_MAX_PROFILES; head->r.index++) if (ns->profile_ptr[head->r.index]) break; if (head->r.index == TOMOYO_MAX_PROFILES) { head->r.eof = true; return; } head->r.step++; break; case 2: { u8 i; const struct tomoyo_path_info *comment = profile->comment; tomoyo_print_namespace(head); tomoyo_io_printf(head, "%u-COMMENT=", index); tomoyo_set_string(head, comment ? comment->name : ""); tomoyo_set_lf(head); tomoyo_print_namespace(head); tomoyo_io_printf(head, "%u-PREFERENCE={ ", index); for (i = 0; i < TOMOYO_MAX_PREF; i++) tomoyo_io_printf(head, "%s=%u ", tomoyo_pref_keywords[i], profile->pref[i]); tomoyo_set_string(head, "}\n"); head->r.step++; } break; case 3: { tomoyo_print_namespace(head); tomoyo_io_printf(head, "%u-%s", index, "CONFIG"); tomoyo_print_config(head, profile->default_config); head->r.bit = 0; head->r.step++; } break; case 4: for ( ; head->r.bit < TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX; head->r.bit++) { const u8 i = head->r.bit; const u8 config = profile->config[i]; if (config == TOMOYO_CONFIG_USE_DEFAULT) continue; tomoyo_print_namespace(head); if (i < TOMOYO_MAX_MAC_INDEX) tomoyo_io_printf(head, "%u-CONFIG::%s::%s", index, tomoyo_category_keywords [tomoyo_index2category[i]], tomoyo_mac_keywords[i]); else tomoyo_io_printf(head, "%u-CONFIG::%s", index, tomoyo_mac_keywords[i]); tomoyo_print_config(head, config); head->r.bit++; break; } if (head->r.bit == TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX) { head->r.index++; head->r.step = 1; } break; } if (tomoyo_flush(head)) goto next; } /** * tomoyo_same_manager - Check for duplicated "struct tomoyo_manager" entry. * * @a: Pointer to "struct tomoyo_acl_head". * @b: Pointer to "struct tomoyo_acl_head". * * Returns true if @a == @b, false otherwise. */ static bool tomoyo_same_manager(const struct tomoyo_acl_head *a, const struct tomoyo_acl_head *b) { return container_of(a, struct tomoyo_manager, head)->manager == container_of(b, struct tomoyo_manager, head)->manager; } /** * tomoyo_update_manager_entry - Add a manager entry. * * @manager: The path to manager or the domainnamme. * @is_delete: True if it is a delete request. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_update_manager_entry(const char *manager, const bool is_delete) __must_hold_shared(&tomoyo_ss) { struct tomoyo_manager e = { }; struct tomoyo_acl_param param = { /* .ns = &tomoyo_kernel_namespace, */ .is_delete = is_delete, .list = &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER], }; int error = is_delete ? 
-ENOENT : -ENOMEM; if (!tomoyo_correct_domain(manager) && !tomoyo_correct_word(manager)) return -EINVAL; e.manager = tomoyo_get_name(manager); if (e.manager) { error = tomoyo_update_policy(&e.head, sizeof(e), ¶m, tomoyo_same_manager); tomoyo_put_name(e.manager); } return error; } /** * tomoyo_write_manager - Write manager policy. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_write_manager(struct tomoyo_io_buffer *head) __must_hold_shared(&tomoyo_ss) __must_hold(&head->io_sem) { char *data = head->write_buf; if (!strcmp(data, "manage_by_non_root")) { tomoyo_manage_by_non_root = !head->w.is_delete; return 0; } return tomoyo_update_manager_entry(data, head->w.is_delete); } /** * tomoyo_read_manager - Read manager policy. * * @head: Pointer to "struct tomoyo_io_buffer". * * Caller holds tomoyo_read_lock(). */ static void tomoyo_read_manager(struct tomoyo_io_buffer *head) __must_hold_shared(&tomoyo_ss) { if (head->r.eof) return; list_for_each_cookie(head->r.acl, &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER]) { struct tomoyo_manager *ptr = list_entry(head->r.acl, typeof(*ptr), head.list); if (ptr->head.is_deleted) continue; if (!tomoyo_flush(head)) return; tomoyo_set_string(head, ptr->manager->name); tomoyo_set_lf(head); } head->r.eof = true; } /** * tomoyo_manager - Check whether the current process is a policy manager. * * Returns true if the current process is permitted to modify policy * via /sys/kernel/security/tomoyo/ interface. * * Caller holds tomoyo_read_lock(). */ static bool tomoyo_manager(void) __must_hold_shared(&tomoyo_ss) { struct tomoyo_manager *ptr; const char *exe; const struct task_struct *task = current; const struct tomoyo_path_info *domainname = tomoyo_domain()->domainname; bool found = IS_ENABLED(CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING); if (!tomoyo_policy_loaded) return true; if (!tomoyo_manage_by_non_root && (!uid_eq(task->cred->uid, GLOBAL_ROOT_UID) || !uid_eq(task->cred->euid, GLOBAL_ROOT_UID))) return false; exe = tomoyo_get_exe(); if (!exe) return false; list_for_each_entry_rcu(ptr, &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER], head.list, srcu_read_lock_held(&tomoyo_ss)) { if (!ptr->head.is_deleted && (!tomoyo_pathcmp(domainname, ptr->manager) || !strcmp(exe, ptr->manager->name))) { found = true; break; } } if (!found) { /* Reduce error messages. */ static pid_t last_pid; const pid_t pid = current->pid; if (last_pid != pid) { pr_warn("%s ( %s ) is not permitted to update policies.\n", domainname->name, exe); last_pid = pid; } } kfree(exe); return found; } static struct tomoyo_domain_info *tomoyo_find_domain_by_qid (unsigned int serial); /** * tomoyo_select_domain - Parse select command. * * @head: Pointer to "struct tomoyo_io_buffer". * @data: String to parse. * * Returns true on success, false otherwise. * * Caller holds tomoyo_read_lock(). 
*/ static bool tomoyo_select_domain(struct tomoyo_io_buffer *head, const char *data) __must_hold_shared(&tomoyo_ss) __must_hold(&head->io_sem) { unsigned int pid; struct tomoyo_domain_info *domain = NULL; bool global_pid = false; if (strncmp(data, "select ", 7)) return false; data += 7; if (sscanf(data, "pid=%u", &pid) == 1 || (global_pid = true, sscanf(data, "global-pid=%u", &pid) == 1)) { struct task_struct *p; rcu_read_lock(); if (global_pid) p = find_task_by_pid_ns(pid, &init_pid_ns); else p = find_task_by_vpid(pid); if (p) domain = tomoyo_task(p)->domain_info; rcu_read_unlock(); } else if (!strncmp(data, "domain=", 7)) { if (tomoyo_domain_def(data + 7)) domain = tomoyo_find_domain(data + 7); } else if (sscanf(data, "Q=%u", &pid) == 1) { domain = tomoyo_find_domain_by_qid(pid); } else return false; head->w.domain = domain; /* Accessing read_buf is safe because head->io_sem is held. */ if (!head->read_buf) return true; /* Do nothing if open(O_WRONLY). */ memset(&head->r, 0, sizeof(head->r)); head->r.print_this_domain_only = true; if (domain) head->r.domain = &domain->list; else head->r.eof = true; tomoyo_io_printf(head, "# select %s\n", data); if (domain && domain->is_deleted) tomoyo_io_printf(head, "# This is a deleted domain.\n"); return true; } /** * tomoyo_same_task_acl - Check for duplicated "struct tomoyo_task_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b, false otherwise. */ static bool tomoyo_same_task_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_task_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_task_acl *p2 = container_of(b, typeof(*p2), head); return p1->domainname == p2->domainname; } /** * tomoyo_write_task - Update task related list. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_write_task(struct tomoyo_acl_param *param) __must_hold_shared(&tomoyo_ss) { int error = -EINVAL; if (tomoyo_str_starts(¶m->data, "manual_domain_transition ")) { struct tomoyo_task_acl e = { .head.type = TOMOYO_TYPE_MANUAL_TASK_ACL, .domainname = tomoyo_get_domainname(param), }; if (e.domainname) error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_task_acl, NULL); tomoyo_put_name(e.domainname); } return error; } /** * tomoyo_delete_domain - Delete a domain. * * @domainname: The name of domain. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_delete_domain(char *domainname) __must_hold_shared(&tomoyo_ss) { struct tomoyo_domain_info *domain; struct tomoyo_path_info name; name.name = domainname; tomoyo_fill_path_info(&name); if (mutex_lock_interruptible(&tomoyo_policy_lock)) return -EINTR; /* Is there an active domain? */ list_for_each_entry_rcu(domain, &tomoyo_domain_list, list, srcu_read_lock_held(&tomoyo_ss)) { /* Never delete tomoyo_kernel_domain */ if (domain == &tomoyo_kernel_domain) continue; if (domain->is_deleted || tomoyo_pathcmp(domain->domainname, &name)) continue; domain->is_deleted = true; break; } mutex_unlock(&tomoyo_policy_lock); return 0; } /** * tomoyo_write_domain2 - Write domain policy. * * @ns: Pointer to "struct tomoyo_policy_namespace". * @list: Pointer to "struct list_head". * @data: Policy to be interpreted. * @is_delete: True if it is a delete request. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). 
*/ static int tomoyo_write_domain2(struct tomoyo_policy_namespace *ns, struct list_head *list, char *data, const bool is_delete) __must_hold_shared(&tomoyo_ss) { struct tomoyo_acl_param param = { .ns = ns, .list = list, .data = data, .is_delete = is_delete, }; static const struct { const char *keyword; int (*write)(struct tomoyo_acl_param *param); } tomoyo_callback[5] = { { "file ", tomoyo_write_file }, { "network inet ", tomoyo_write_inet_network }, { "network unix ", tomoyo_write_unix_network }, { "misc ", tomoyo_write_misc }, { "task ", tomoyo_write_task }, }; u8 i; for (i = 0; i < ARRAY_SIZE(tomoyo_callback); i++) { if (!tomoyo_str_starts(¶m.data, tomoyo_callback[i].keyword)) continue; return tomoyo_callback[i].write(¶m); } return -EINVAL; } /* String table for domain flags. */ const char * const tomoyo_dif[TOMOYO_MAX_DOMAIN_INFO_FLAGS] = { [TOMOYO_DIF_QUOTA_WARNED] = "quota_exceeded\n", [TOMOYO_DIF_TRANSITION_FAILED] = "transition_failed\n", }; /** * tomoyo_write_domain - Write domain policy. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_write_domain(struct tomoyo_io_buffer *head) __must_hold_shared(&tomoyo_ss) __must_hold(&head->io_sem) { char *data = head->write_buf; struct tomoyo_policy_namespace *ns; struct tomoyo_domain_info *domain = head->w.domain; const bool is_delete = head->w.is_delete; bool is_select = !is_delete && tomoyo_str_starts(&data, "select "); unsigned int idx; if (*data == '<') { int ret = 0; domain = NULL; if (is_delete) ret = tomoyo_delete_domain(data); else if (is_select) domain = tomoyo_find_domain(data); else domain = tomoyo_assign_domain(data, false); head->w.domain = domain; return ret; } if (!domain) return -EINVAL; ns = domain->ns; if (sscanf(data, "use_profile %u", &idx) == 1 && idx < TOMOYO_MAX_PROFILES) { if (!tomoyo_policy_loaded || ns->profile_ptr[idx]) if (!is_delete) domain->profile = (u8) idx; return 0; } if (sscanf(data, "use_group %u\n", &idx) == 1 && idx < TOMOYO_MAX_ACL_GROUPS) { if (!is_delete) set_bit(idx, domain->group); else clear_bit(idx, domain->group); return 0; } for (idx = 0; idx < TOMOYO_MAX_DOMAIN_INFO_FLAGS; idx++) { const char *cp = tomoyo_dif[idx]; if (strncmp(data, cp, strlen(cp) - 1)) continue; domain->flags[idx] = !is_delete; return 0; } return tomoyo_write_domain2(ns, &domain->acl_info_list, data, is_delete); } /** * tomoyo_print_condition - Print condition part. * * @head: Pointer to "struct tomoyo_io_buffer". * @cond: Pointer to "struct tomoyo_condition". * * Returns true on success, false otherwise. 
*/ static bool tomoyo_print_condition(struct tomoyo_io_buffer *head, const struct tomoyo_condition *cond) __must_hold(&head->io_sem) { switch (head->r.cond_step) { case 0: head->r.cond_index = 0; head->r.cond_step++; if (cond->transit) { tomoyo_set_space(head); tomoyo_set_string(head, cond->transit->name); } fallthrough; case 1: { const u16 condc = cond->condc; const struct tomoyo_condition_element *condp = (typeof(condp)) (cond + 1); const struct tomoyo_number_union *numbers_p = (typeof(numbers_p)) (condp + condc); const struct tomoyo_name_union *names_p = (typeof(names_p)) (numbers_p + cond->numbers_count); const struct tomoyo_argv *argv = (typeof(argv)) (names_p + cond->names_count); const struct tomoyo_envp *envp = (typeof(envp)) (argv + cond->argc); u16 skip; for (skip = 0; skip < head->r.cond_index; skip++) { const u8 left = condp->left; const u8 right = condp->right; condp++; switch (left) { case TOMOYO_ARGV_ENTRY: argv++; continue; case TOMOYO_ENVP_ENTRY: envp++; continue; case TOMOYO_NUMBER_UNION: numbers_p++; break; } switch (right) { case TOMOYO_NAME_UNION: names_p++; break; case TOMOYO_NUMBER_UNION: numbers_p++; break; } } while (head->r.cond_index < condc) { const u8 match = condp->equals; const u8 left = condp->left; const u8 right = condp->right; if (!tomoyo_flush(head)) return false; condp++; head->r.cond_index++; tomoyo_set_space(head); switch (left) { case TOMOYO_ARGV_ENTRY: tomoyo_io_printf(head, "exec.argv[%lu]%s=\"", argv->index, argv->is_not ? "!" : ""); tomoyo_set_string(head, argv->value->name); tomoyo_set_string(head, "\""); argv++; continue; case TOMOYO_ENVP_ENTRY: tomoyo_set_string(head, "exec.envp[\""); tomoyo_set_string(head, envp->name->name); tomoyo_io_printf(head, "\"]%s=", envp->is_not ? "!" : ""); if (envp->value) { tomoyo_set_string(head, "\""); tomoyo_set_string(head, envp->value->name); tomoyo_set_string(head, "\""); } else { tomoyo_set_string(head, "NULL"); } envp++; continue; case TOMOYO_NUMBER_UNION: tomoyo_print_number_union_nospace (head, numbers_p++); break; default: tomoyo_set_string(head, tomoyo_condition_keyword[left]); break; } tomoyo_set_string(head, match ? "=" : "!="); switch (right) { case TOMOYO_NAME_UNION: tomoyo_print_name_union_quoted (head, names_p++); break; case TOMOYO_NUMBER_UNION: tomoyo_print_number_union_nospace (head, numbers_p++); break; default: tomoyo_set_string(head, tomoyo_condition_keyword[right]); break; } } } head->r.cond_step++; fallthrough; case 2: if (!tomoyo_flush(head)) break; head->r.cond_step++; fallthrough; case 3: if (cond->grant_log != TOMOYO_GRANTLOG_AUTO) tomoyo_io_printf(head, " grant_log=%s", str_yes_no(cond->grant_log == TOMOYO_GRANTLOG_YES)); tomoyo_set_lf(head); return true; } return false; } /** * tomoyo_set_group - Print "acl_group " header keyword and category name. * * @head: Pointer to "struct tomoyo_io_buffer". * @category: Category name. * * Returns nothing. */ static void tomoyo_set_group(struct tomoyo_io_buffer *head, const char *category) __must_hold(&head->io_sem) { if (head->type == TOMOYO_EXCEPTIONPOLICY) { tomoyo_print_namespace(head); tomoyo_io_printf(head, "acl_group %u ", head->r.acl_group_index); } tomoyo_set_string(head, category); } /** * tomoyo_print_entry - Print an ACL entry. * * @head: Pointer to "struct tomoyo_io_buffer". * @acl: Pointer to an ACL entry. * * Returns true on success, false otherwise. 
*/ static bool tomoyo_print_entry(struct tomoyo_io_buffer *head, struct tomoyo_acl_info *acl) __must_hold(&head->io_sem) { const u8 acl_type = acl->type; bool first = true; u8 bit; if (head->r.print_cond_part) goto print_cond_part; if (acl->is_deleted) return true; if (!tomoyo_flush(head)) return false; else if (acl_type == TOMOYO_TYPE_PATH_ACL) { struct tomoyo_path_acl *ptr = container_of(acl, typeof(*ptr), head); const u16 perm = ptr->perm; for (bit = 0; bit < TOMOYO_MAX_PATH_OPERATION; bit++) { if (!(perm & (1 << bit))) continue; if (head->r.print_transition_related_only && bit != TOMOYO_TYPE_EXECUTE) continue; if (first) { tomoyo_set_group(head, "file "); first = false; } else { tomoyo_set_slash(head); } tomoyo_set_string(head, tomoyo_path_keyword[bit]); } if (first) return true; tomoyo_print_name_union(head, &ptr->name); } else if (acl_type == TOMOYO_TYPE_MANUAL_TASK_ACL) { struct tomoyo_task_acl *ptr = container_of(acl, typeof(*ptr), head); tomoyo_set_group(head, "task "); tomoyo_set_string(head, "manual_domain_transition "); tomoyo_set_string(head, ptr->domainname->name); } else if (head->r.print_transition_related_only) { return true; } else if (acl_type == TOMOYO_TYPE_PATH2_ACL) { struct tomoyo_path2_acl *ptr = container_of(acl, typeof(*ptr), head); const u8 perm = ptr->perm; for (bit = 0; bit < TOMOYO_MAX_PATH2_OPERATION; bit++) { if (!(perm & (1 << bit))) continue; if (first) { tomoyo_set_group(head, "file "); first = false; } else { tomoyo_set_slash(head); } tomoyo_set_string(head, tomoyo_mac_keywords [tomoyo_pp2mac[bit]]); } if (first) return true; tomoyo_print_name_union(head, &ptr->name1); tomoyo_print_name_union(head, &ptr->name2); } else if (acl_type == TOMOYO_TYPE_PATH_NUMBER_ACL) { struct tomoyo_path_number_acl *ptr = container_of(acl, typeof(*ptr), head); const u8 perm = ptr->perm; for (bit = 0; bit < TOMOYO_MAX_PATH_NUMBER_OPERATION; bit++) { if (!(perm & (1 << bit))) continue; if (first) { tomoyo_set_group(head, "file "); first = false; } else { tomoyo_set_slash(head); } tomoyo_set_string(head, tomoyo_mac_keywords [tomoyo_pn2mac[bit]]); } if (first) return true; tomoyo_print_name_union(head, &ptr->name); tomoyo_print_number_union(head, &ptr->number); } else if (acl_type == TOMOYO_TYPE_MKDEV_ACL) { struct tomoyo_mkdev_acl *ptr = container_of(acl, typeof(*ptr), head); const u8 perm = ptr->perm; for (bit = 0; bit < TOMOYO_MAX_MKDEV_OPERATION; bit++) { if (!(perm & (1 << bit))) continue; if (first) { tomoyo_set_group(head, "file "); first = false; } else { tomoyo_set_slash(head); } tomoyo_set_string(head, tomoyo_mac_keywords [tomoyo_pnnn2mac[bit]]); } if (first) return true; tomoyo_print_name_union(head, &ptr->name); tomoyo_print_number_union(head, &ptr->mode); tomoyo_print_number_union(head, &ptr->major); tomoyo_print_number_union(head, &ptr->minor); } else if (acl_type == TOMOYO_TYPE_INET_ACL) { struct tomoyo_inet_acl *ptr = container_of(acl, typeof(*ptr), head); const u8 perm = ptr->perm; for (bit = 0; bit < TOMOYO_MAX_NETWORK_OPERATION; bit++) { if (!(perm & (1 << bit))) continue; if (first) { tomoyo_set_group(head, "network inet "); tomoyo_set_string(head, tomoyo_proto_keyword [ptr->protocol]); tomoyo_set_space(head); first = false; } else { tomoyo_set_slash(head); } tomoyo_set_string(head, tomoyo_socket_keyword[bit]); } if (first) return true; tomoyo_set_space(head); if (ptr->address.group) { tomoyo_set_string(head, "@"); tomoyo_set_string(head, ptr->address.group->group_name ->name); } else { char buf[128]; tomoyo_print_ip(buf, sizeof(buf), &ptr->address); 
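			/* tomoyo_print_ip() formats the address (or address range) as text into buf. */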
tomoyo_io_printf(head, "%s", buf); } tomoyo_print_number_union(head, &ptr->port); } else if (acl_type == TOMOYO_TYPE_UNIX_ACL) { struct tomoyo_unix_acl *ptr = container_of(acl, typeof(*ptr), head); const u8 perm = ptr->perm; for (bit = 0; bit < TOMOYO_MAX_NETWORK_OPERATION; bit++) { if (!(perm & (1 << bit))) continue; if (first) { tomoyo_set_group(head, "network unix "); tomoyo_set_string(head, tomoyo_proto_keyword [ptr->protocol]); tomoyo_set_space(head); first = false; } else { tomoyo_set_slash(head); } tomoyo_set_string(head, tomoyo_socket_keyword[bit]); } if (first) return true; tomoyo_print_name_union(head, &ptr->name); } else if (acl_type == TOMOYO_TYPE_MOUNT_ACL) { struct tomoyo_mount_acl *ptr = container_of(acl, typeof(*ptr), head); tomoyo_set_group(head, "file mount"); tomoyo_print_name_union(head, &ptr->dev_name); tomoyo_print_name_union(head, &ptr->dir_name); tomoyo_print_name_union(head, &ptr->fs_type); tomoyo_print_number_union(head, &ptr->flags); } else if (acl_type == TOMOYO_TYPE_ENV_ACL) { struct tomoyo_env_acl *ptr = container_of(acl, typeof(*ptr), head); tomoyo_set_group(head, "misc env "); tomoyo_set_string(head, ptr->env->name); } if (acl->cond) { head->r.print_cond_part = true; head->r.cond_step = 0; if (!tomoyo_flush(head)) return false; print_cond_part: if (!tomoyo_print_condition(head, acl->cond)) return false; head->r.print_cond_part = false; } else { tomoyo_set_lf(head); } return true; } /** * tomoyo_read_domain2 - Read domain policy. * * @head: Pointer to "struct tomoyo_io_buffer". * @list: Pointer to "struct list_head". * * Caller holds tomoyo_read_lock(). * * Returns true on success, false otherwise. */ static bool tomoyo_read_domain2(struct tomoyo_io_buffer *head, struct list_head *list) __must_hold_shared(&tomoyo_ss) __must_hold(&head->io_sem) { list_for_each_cookie(head->r.acl, list) { struct tomoyo_acl_info *ptr = list_entry(head->r.acl, typeof(*ptr), list); if (!tomoyo_print_entry(head, ptr)) return false; } head->r.acl = NULL; return true; } /** * tomoyo_read_domain - Read domain policy. * * @head: Pointer to "struct tomoyo_io_buffer". * * Caller holds tomoyo_read_lock(). */ static void tomoyo_read_domain(struct tomoyo_io_buffer *head) __must_hold_shared(&tomoyo_ss) __must_hold(&head->io_sem) { if (head->r.eof) return; list_for_each_cookie(head->r.domain, &tomoyo_domain_list) { struct tomoyo_domain_info *domain = list_entry(head->r.domain, typeof(*domain), list); u8 i; switch (head->r.step) { case 0: if (domain->is_deleted && !head->r.print_this_domain_only) continue; /* Print domainname and flags. */ tomoyo_set_string(head, domain->domainname->name); tomoyo_set_lf(head); tomoyo_io_printf(head, "use_profile %u\n", domain->profile); for (i = 0; i < TOMOYO_MAX_DOMAIN_INFO_FLAGS; i++) if (domain->flags[i]) tomoyo_set_string(head, tomoyo_dif[i]); head->r.index = 0; head->r.step++; fallthrough; case 1: while (head->r.index < TOMOYO_MAX_ACL_GROUPS) { i = head->r.index++; if (!test_bit(i, domain->group)) continue; tomoyo_io_printf(head, "use_group %u\n", i); if (!tomoyo_flush(head)) return; } head->r.index = 0; head->r.step++; tomoyo_set_lf(head); fallthrough; case 2: if (!tomoyo_read_domain2(head, &domain->acl_info_list)) return; head->r.step++; if (!tomoyo_set_lf(head)) return; fallthrough; case 3: head->r.step = 0; if (head->r.print_this_domain_only) goto done; } } done: head->r.eof = true; } /** * tomoyo_write_pid: Specify PID to obtain domainname. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns 0. 
*/ static int tomoyo_write_pid(struct tomoyo_io_buffer *head) { head->r.eof = false; return 0; } /** * tomoyo_read_pid - Get domainname of the specified PID. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns the domainname which the specified PID is in on success, * empty string otherwise. * The PID is specified by tomoyo_write_pid() so that the user can obtain * using read()/write() interface rather than sysctl() interface. */ static void tomoyo_read_pid(struct tomoyo_io_buffer *head) __must_hold(&head->io_sem) { char *buf = head->write_buf; bool global_pid = false; unsigned int pid; struct task_struct *p; struct tomoyo_domain_info *domain = NULL; /* Accessing write_buf is safe because head->io_sem is held. */ if (!buf) { head->r.eof = true; return; /* Do nothing if open(O_RDONLY). */ } if (head->r.w_pos || head->r.eof) return; head->r.eof = true; if (tomoyo_str_starts(&buf, "global-pid ")) global_pid = true; if (kstrtouint(buf, 10, &pid)) return; rcu_read_lock(); if (global_pid) p = find_task_by_pid_ns(pid, &init_pid_ns); else p = find_task_by_vpid(pid); if (p) domain = tomoyo_task(p)->domain_info; rcu_read_unlock(); if (!domain) return; tomoyo_io_printf(head, "%u %u ", pid, domain->profile); tomoyo_set_string(head, domain->domainname->name); } /* String table for domain transition control keywords. */ static const char *tomoyo_transition_type[TOMOYO_MAX_TRANSITION_TYPE] = { [TOMOYO_TRANSITION_CONTROL_NO_RESET] = "no_reset_domain ", [TOMOYO_TRANSITION_CONTROL_RESET] = "reset_domain ", [TOMOYO_TRANSITION_CONTROL_NO_INITIALIZE] = "no_initialize_domain ", [TOMOYO_TRANSITION_CONTROL_INITIALIZE] = "initialize_domain ", [TOMOYO_TRANSITION_CONTROL_NO_KEEP] = "no_keep_domain ", [TOMOYO_TRANSITION_CONTROL_KEEP] = "keep_domain ", }; /* String table for grouping keywords. */ static const char *tomoyo_group_name[TOMOYO_MAX_GROUP] = { [TOMOYO_PATH_GROUP] = "path_group ", [TOMOYO_NUMBER_GROUP] = "number_group ", [TOMOYO_ADDRESS_GROUP] = "address_group ", }; /** * tomoyo_write_exception - Write exception policy. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_write_exception(struct tomoyo_io_buffer *head) __must_hold_shared(&tomoyo_ss) __must_hold(&head->io_sem) { const bool is_delete = head->w.is_delete; struct tomoyo_acl_param param = { .ns = head->w.ns, .is_delete = is_delete, .data = head->write_buf, }; u8 i; if (tomoyo_str_starts(¶m.data, "aggregator ")) return tomoyo_write_aggregator(¶m); for (i = 0; i < TOMOYO_MAX_TRANSITION_TYPE; i++) if (tomoyo_str_starts(¶m.data, tomoyo_transition_type[i])) return tomoyo_write_transition_control(¶m, i); for (i = 0; i < TOMOYO_MAX_GROUP; i++) if (tomoyo_str_starts(¶m.data, tomoyo_group_name[i])) return tomoyo_write_group(¶m, i); if (tomoyo_str_starts(¶m.data, "acl_group ")) { unsigned int group; char *data; group = simple_strtoul(param.data, &data, 10); if (group < TOMOYO_MAX_ACL_GROUPS && *data++ == ' ') return tomoyo_write_domain2 (head->w.ns, &head->w.ns->acl_group[group], data, is_delete); } return -EINVAL; } /** * tomoyo_read_group - Read "struct tomoyo_path_group"/"struct tomoyo_number_group"/"struct tomoyo_address_group" list. * * @head: Pointer to "struct tomoyo_io_buffer". * @idx: Index number. * * Returns true on success, false otherwise. * * Caller holds tomoyo_read_lock(). 
*/ static bool tomoyo_read_group(struct tomoyo_io_buffer *head, const int idx) __must_hold_shared(&tomoyo_ss) __must_hold(&head->io_sem) { struct tomoyo_policy_namespace *ns = container_of(head->r.ns, typeof(*ns), namespace_list); struct list_head *list = &ns->group_list[idx]; list_for_each_cookie(head->r.group, list) { struct tomoyo_group *group = list_entry(head->r.group, typeof(*group), head.list); list_for_each_cookie(head->r.acl, &group->member_list) { struct tomoyo_acl_head *ptr = list_entry(head->r.acl, typeof(*ptr), list); if (ptr->is_deleted) continue; if (!tomoyo_flush(head)) return false; tomoyo_print_namespace(head); tomoyo_set_string(head, tomoyo_group_name[idx]); tomoyo_set_string(head, group->group_name->name); if (idx == TOMOYO_PATH_GROUP) { tomoyo_set_space(head); tomoyo_set_string(head, container_of (ptr, struct tomoyo_path_group, head)->member_name->name); } else if (idx == TOMOYO_NUMBER_GROUP) { tomoyo_print_number_union(head, &container_of (ptr, struct tomoyo_number_group, head)->number); } else if (idx == TOMOYO_ADDRESS_GROUP) { char buffer[128]; struct tomoyo_address_group *member = container_of(ptr, typeof(*member), head); tomoyo_print_ip(buffer, sizeof(buffer), &member->address); tomoyo_io_printf(head, " %s", buffer); } tomoyo_set_lf(head); } head->r.acl = NULL; } head->r.group = NULL; return true; } /** * tomoyo_read_policy - Read "struct tomoyo_..._entry" list. * * @head: Pointer to "struct tomoyo_io_buffer". * @idx: Index number. * * Returns true on success, false otherwise. * * Caller holds tomoyo_read_lock(). */ static bool tomoyo_read_policy(struct tomoyo_io_buffer *head, const int idx) __must_hold_shared(&tomoyo_ss) { struct tomoyo_policy_namespace *ns = container_of(head->r.ns, typeof(*ns), namespace_list); struct list_head *list = &ns->policy_list[idx]; list_for_each_cookie(head->r.acl, list) { struct tomoyo_acl_head *acl = container_of(head->r.acl, typeof(*acl), list); if (acl->is_deleted) continue; if (!tomoyo_flush(head)) return false; switch (idx) { case TOMOYO_ID_TRANSITION_CONTROL: { struct tomoyo_transition_control *ptr = container_of(acl, typeof(*ptr), head); tomoyo_print_namespace(head); tomoyo_set_string(head, tomoyo_transition_type [ptr->type]); tomoyo_set_string(head, ptr->program ? ptr->program->name : "any"); tomoyo_set_string(head, " from "); tomoyo_set_string(head, ptr->domainname ? ptr->domainname->name : "any"); } break; case TOMOYO_ID_AGGREGATOR: { struct tomoyo_aggregator *ptr = container_of(acl, typeof(*ptr), head); tomoyo_print_namespace(head); tomoyo_set_string(head, "aggregator "); tomoyo_set_string(head, ptr->original_name->name); tomoyo_set_space(head); tomoyo_set_string(head, ptr->aggregated_name->name); } break; default: continue; } tomoyo_set_lf(head); } head->r.acl = NULL; return true; } /** * tomoyo_read_exception - Read exception policy. * * @head: Pointer to "struct tomoyo_io_buffer". * * Caller holds tomoyo_read_lock(). 
*/ static void tomoyo_read_exception(struct tomoyo_io_buffer *head) __must_hold_shared(&tomoyo_ss) __must_hold(&head->io_sem) { struct tomoyo_policy_namespace *ns = container_of(head->r.ns, typeof(*ns), namespace_list); if (head->r.eof) return; while (head->r.step < TOMOYO_MAX_POLICY && tomoyo_read_policy(head, head->r.step)) head->r.step++; if (head->r.step < TOMOYO_MAX_POLICY) return; while (head->r.step < TOMOYO_MAX_POLICY + TOMOYO_MAX_GROUP && tomoyo_read_group(head, head->r.step - TOMOYO_MAX_POLICY)) head->r.step++; if (head->r.step < TOMOYO_MAX_POLICY + TOMOYO_MAX_GROUP) return; while (head->r.step < TOMOYO_MAX_POLICY + TOMOYO_MAX_GROUP + TOMOYO_MAX_ACL_GROUPS) { head->r.acl_group_index = head->r.step - TOMOYO_MAX_POLICY - TOMOYO_MAX_GROUP; if (!tomoyo_read_domain2(head, &ns->acl_group [head->r.acl_group_index])) return; head->r.step++; } head->r.eof = true; } /* Wait queue for kernel -> userspace notification. */ static DECLARE_WAIT_QUEUE_HEAD(tomoyo_query_wait); /* Wait queue for userspace -> kernel notification. */ static DECLARE_WAIT_QUEUE_HEAD(tomoyo_answer_wait); /* Structure for query. */ struct tomoyo_query { struct list_head list; struct tomoyo_domain_info *domain; char *query; size_t query_len; unsigned int serial; u8 timer; u8 answer; u8 retry; }; /* The list for "struct tomoyo_query". */ static LIST_HEAD(tomoyo_query_list); /* Lock for manipulating tomoyo_query_list. */ static DEFINE_SPINLOCK(tomoyo_query_list_lock); /* * Number of "struct file" referring /sys/kernel/security/tomoyo/query * interface. */ static atomic_t tomoyo_query_observers = ATOMIC_INIT(0); /** * tomoyo_truncate - Truncate a line. * * @str: String to truncate. * * Returns length of truncated @str. */ static int tomoyo_truncate(char *str) { char *start = str; while (*(unsigned char *) str > (unsigned char) ' ') str++; *str = '\0'; return strlen(start) + 1; } /** * tomoyo_numscan - sscanf() which stores the length of a decimal integer value. * * @str: String to scan. * @head: Leading string that must start with. * @width: Pointer to "int" for storing length of a decimal integer value after @head. * @tail: Optional character that must match after a decimal integer value. * * Returns whether @str starts with @head and a decimal value follows @head. */ static bool tomoyo_numscan(const char *str, const char *head, int *width, const char tail) { const char *cp; const int n = strlen(head); if (!strncmp(str, head, n)) { cp = str + n; while (*cp && *cp >= '0' && *cp <= '9') cp++; if (*cp == tail || !tail) { *width = cp - (str + n); return *width != 0; } } *width = 0; return 0; } /** * tomoyo_patternize_path - Make patterns for file path. Used by learning mode. * * @buffer: Destination buffer. * @len: Size of @buffer. * @entry: Original line. * * Returns nothing. */ static void tomoyo_patternize_path(char *buffer, const int len, char *entry) { int width; char *cp = entry; /* Nothing to do if this line is not for "file" related entry. */ if (strncmp(entry, "file ", 5)) goto flush; /* * Nothing to do if there is no colon in this line, for this rewriting * applies to only filesystems where numeric values in the path are volatile. */ cp = strchr(entry + 5, ':'); if (!cp) { cp = entry; goto flush; } /* Flush e.g. "file ioctl" part. */ while (*cp != ' ') cp--; *cp++ = '\0'; tomoyo_addprintf(buffer, len, "%s ", entry); /* e.g. file ioctl pipe:[$INO] $CMD */ if (tomoyo_numscan(cp, "pipe:[", &width, ']')) { cp += width + 7; tomoyo_addprintf(buffer, len, "pipe:[\\$]"); goto flush; } /* e.g. 
file ioctl socket:[$INO] $CMD */ if (tomoyo_numscan(cp, "socket:[", &width, ']')) { cp += width + 9; tomoyo_addprintf(buffer, len, "socket:[\\$]"); goto flush; } if (!strncmp(cp, "proc:/self", 10)) { /* e.g. file read proc:/self/task/$TID/fdinfo/$FD */ cp += 10; tomoyo_addprintf(buffer, len, "proc:/self"); } else if (tomoyo_numscan(cp, "proc:/", &width, 0)) { /* e.g. file read proc:/$PID/task/$TID/fdinfo/$FD */ /* * Don't patternize $PID part if $PID == 1, for several * programs access only files in /proc/1/ directory. */ cp += width + 6; if (width == 1 && *(cp - 1) == '1') tomoyo_addprintf(buffer, len, "proc:/1"); else tomoyo_addprintf(buffer, len, "proc:/\\$"); } else { goto flush; } /* Patternize $TID part if "/task/" follows. */ if (tomoyo_numscan(cp, "/task/", &width, 0)) { cp += width + 6; tomoyo_addprintf(buffer, len, "/task/\\$"); } /* Patternize $FD part if "/fd/" or "/fdinfo/" follows. */ if (tomoyo_numscan(cp, "/fd/", &width, 0)) { cp += width + 4; tomoyo_addprintf(buffer, len, "/fd/\\$"); } else if (tomoyo_numscan(cp, "/fdinfo/", &width, 0)) { cp += width + 8; tomoyo_addprintf(buffer, len, "/fdinfo/\\$"); } flush: /* Flush remaining part if any. */ if (*cp) tomoyo_addprintf(buffer, len, "%s", cp); } /** * tomoyo_add_entry - Add an ACL to current thread's domain. Used by learning mode. * * @domain: Pointer to "struct tomoyo_domain_info". * @header: Lines containing ACL. * * Returns nothing. */ static void tomoyo_add_entry(struct tomoyo_domain_info *domain, char *header) __must_hold_shared(&tomoyo_ss) { char *buffer; char *realpath = NULL; char *argv0 = NULL; char *symlink = NULL; char *cp = strchr(header, '\n'); int len; if (!cp) return; cp = strchr(cp + 1, '\n'); if (!cp) return; *cp++ = '\0'; /* Reserve some space for potentially using patterns. */ len = strlen(cp) + 16; /* strstr() will return NULL if ordering is wrong. */ if (*cp == 'f') { argv0 = strstr(header, " argv[]={ \""); if (argv0) { argv0 += 10; len += tomoyo_truncate(argv0) + 14; } realpath = strstr(header, " exec={ realpath=\""); if (realpath) { realpath += 8; len += tomoyo_truncate(realpath) + 6; } symlink = strstr(header, " symlink.target=\""); if (symlink) len += tomoyo_truncate(symlink + 1) + 1; } buffer = kmalloc(len, GFP_NOFS | __GFP_ZERO); if (!buffer) return; tomoyo_patternize_path(buffer, len, cp); if (realpath) tomoyo_addprintf(buffer, len, " exec.%s", realpath); if (argv0) tomoyo_addprintf(buffer, len, " exec.argv[0]=%s", argv0); if (symlink) tomoyo_addprintf(buffer, len, "%s", symlink); tomoyo_normalize_line(buffer); if (!tomoyo_write_domain2(domain->ns, &domain->acl_info_list, buffer, false)) tomoyo_update_stat(TOMOYO_STAT_POLICY_UPDATES); kfree(buffer); } /** * tomoyo_supervisor - Ask for the supervisor's decision. * * @r: Pointer to "struct tomoyo_request_info". * @fmt: The printf()'s format string, followed by parameters. * * Returns 0 if the supervisor decided to permit the access request which * violated the policy in enforcing mode, TOMOYO_RETRY_REQUEST if the * supervisor decided to retry the access request which violated the policy in * enforcing mode, 0 if it is not in enforcing mode, -EPERM otherwise. */ int tomoyo_supervisor(struct tomoyo_request_info *r, const char *fmt, ...) { va_list args; int error; int len; static unsigned int tomoyo_serial; struct tomoyo_query entry = { }; bool quota_exceeded = false; va_start(args, fmt); len = vsnprintf(NULL, 0, fmt, args) + 1; va_end(args); /* Write /sys/kernel/security/tomoyo/audit. 
*/ va_start(args, fmt); tomoyo_write_log2(r, len, fmt, args); va_end(args); /* Nothing more to do if granted. */ if (r->granted) return 0; if (r->mode) tomoyo_update_stat(r->mode); switch (r->mode) { case TOMOYO_CONFIG_ENFORCING: error = -EPERM; if (atomic_read(&tomoyo_query_observers)) break; goto out; case TOMOYO_CONFIG_LEARNING: error = 0; /* Check max_learning_entry parameter. */ if (tomoyo_domain_quota_is_ok(r)) break; fallthrough; default: return 0; } /* Get message. */ va_start(args, fmt); entry.query = tomoyo_init_log(r, len, fmt, args); va_end(args); if (!entry.query) goto out; entry.query_len = strlen(entry.query) + 1; if (!error) { tomoyo_add_entry(r->domain, entry.query); goto out; } len = kmalloc_size_roundup(entry.query_len); entry.domain = r->domain; spin_lock(&tomoyo_query_list_lock); if (tomoyo_memory_quota[TOMOYO_MEMORY_QUERY] && tomoyo_memory_used[TOMOYO_MEMORY_QUERY] + len >= tomoyo_memory_quota[TOMOYO_MEMORY_QUERY]) { quota_exceeded = true; } else { entry.serial = tomoyo_serial++; entry.retry = r->retry; tomoyo_memory_used[TOMOYO_MEMORY_QUERY] += len; list_add_tail(&entry.list, &tomoyo_query_list); } spin_unlock(&tomoyo_query_list_lock); if (quota_exceeded) goto out; /* Give 10 seconds for supervisor's opinion. */ while (entry.timer < 10) { wake_up_all(&tomoyo_query_wait); if (wait_event_interruptible_timeout (tomoyo_answer_wait, entry.answer || !atomic_read(&tomoyo_query_observers), HZ)) break; entry.timer++; } spin_lock(&tomoyo_query_list_lock); list_del(&entry.list); tomoyo_memory_used[TOMOYO_MEMORY_QUERY] -= len; spin_unlock(&tomoyo_query_list_lock); switch (entry.answer) { case 3: /* Asked to retry by administrator. */ error = TOMOYO_RETRY_REQUEST; r->retry++; break; case 1: /* Granted by administrator. */ error = 0; break; default: /* Timed out or rejected by administrator. */ break; } out: kfree(entry.query); return error; } /** * tomoyo_find_domain_by_qid - Get domain by query id. * * @serial: Query ID assigned by tomoyo_supervisor(). * * Returns pointer to "struct tomoyo_domain_info" if found, NULL otherwise. */ static struct tomoyo_domain_info *tomoyo_find_domain_by_qid (unsigned int serial) { struct tomoyo_query *ptr; struct tomoyo_domain_info *domain = NULL; spin_lock(&tomoyo_query_list_lock); list_for_each_entry(ptr, &tomoyo_query_list, list) { if (ptr->serial != serial) continue; domain = ptr->domain; break; } spin_unlock(&tomoyo_query_list_lock); return domain; } /** * tomoyo_poll_query - poll() for /sys/kernel/security/tomoyo/query. * * @file: Pointer to "struct file". * @wait: Pointer to "poll_table". * * Returns EPOLLIN | EPOLLRDNORM when ready to read, 0 otherwise. * * Waits for access requests which violated policy in enforcing mode. */ static __poll_t tomoyo_poll_query(struct file *file, poll_table *wait) { if (!list_empty(&tomoyo_query_list)) return EPOLLIN | EPOLLRDNORM; poll_wait(file, &tomoyo_query_wait, wait); if (!list_empty(&tomoyo_query_list)) return EPOLLIN | EPOLLRDNORM; return 0; } /** * tomoyo_read_query - Read access requests which violated policy in enforcing mode. * * @head: Pointer to "struct tomoyo_io_buffer". 
*/ static void tomoyo_read_query(struct tomoyo_io_buffer *head) __must_hold(&head->io_sem) { struct list_head *tmp; unsigned int pos = 0; size_t len = 0; char *buf; if (head->r.w_pos) return; kfree(head->read_buf); head->read_buf = NULL; spin_lock(&tomoyo_query_list_lock); list_for_each(tmp, &tomoyo_query_list) { struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list); if (pos++ != head->r.query_index) continue; len = ptr->query_len; break; } spin_unlock(&tomoyo_query_list_lock); if (!len) { head->r.query_index = 0; return; } buf = kzalloc(len + 32, GFP_NOFS); if (!buf) return; pos = 0; spin_lock(&tomoyo_query_list_lock); list_for_each(tmp, &tomoyo_query_list) { struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list); if (pos++ != head->r.query_index) continue; /* * Some query can be skipped because tomoyo_query_list * can change, but I don't care. */ if (len == ptr->query_len) snprintf(buf, len + 31, "Q%u-%hu\n%s", ptr->serial, ptr->retry, ptr->query); break; } spin_unlock(&tomoyo_query_list_lock); if (buf[0]) { head->read_buf = buf; head->r.w[head->r.w_pos++] = buf; head->r.query_index++; } else { kfree(buf); } } /** * tomoyo_write_answer - Write the supervisor's decision. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns 0 on success, -EINVAL otherwise. */ static int tomoyo_write_answer(struct tomoyo_io_buffer *head) __must_hold(&head->io_sem) { char *data = head->write_buf; struct list_head *tmp; unsigned int serial; unsigned int answer; spin_lock(&tomoyo_query_list_lock); list_for_each(tmp, &tomoyo_query_list) { struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list); ptr->timer = 0; } spin_unlock(&tomoyo_query_list_lock); if (sscanf(data, "A%u=%u", &serial, &answer) != 2) return -EINVAL; spin_lock(&tomoyo_query_list_lock); list_for_each(tmp, &tomoyo_query_list) { struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list); if (ptr->serial != serial) continue; ptr->answer = answer; /* Remove from tomoyo_query_list. */ if (ptr->answer) list_del_init(&ptr->list); break; } spin_unlock(&tomoyo_query_list_lock); return 0; } /** * tomoyo_read_version: Get version. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns version information. */ static void tomoyo_read_version(struct tomoyo_io_buffer *head) __must_hold(&head->io_sem) { if (!head->r.eof) { tomoyo_io_printf(head, "2.6.0"); head->r.eof = true; } } /* String table for /sys/kernel/security/tomoyo/stat interface. */ static const char * const tomoyo_policy_headers[TOMOYO_MAX_POLICY_STAT] = { [TOMOYO_STAT_POLICY_UPDATES] = "update:", [TOMOYO_STAT_POLICY_LEARNING] = "violation in learning mode:", [TOMOYO_STAT_POLICY_PERMISSIVE] = "violation in permissive mode:", [TOMOYO_STAT_POLICY_ENFORCING] = "violation in enforcing mode:", }; /* String table for /sys/kernel/security/tomoyo/stat interface. */ static const char * const tomoyo_memory_headers[TOMOYO_MAX_MEMORY_STAT] = { [TOMOYO_MEMORY_POLICY] = "policy:", [TOMOYO_MEMORY_AUDIT] = "audit log:", [TOMOYO_MEMORY_QUERY] = "query message:", }; /* Counter for number of updates. */ static atomic_t tomoyo_stat_updated[TOMOYO_MAX_POLICY_STAT]; /* Timestamp counter for last updated. */ static time64_t tomoyo_stat_modified[TOMOYO_MAX_POLICY_STAT]; /** * tomoyo_update_stat - Update statistic counters. * * @index: Index for policy type. * * Returns nothing. */ void tomoyo_update_stat(const u8 index) { atomic_inc(&tomoyo_stat_updated[index]); tomoyo_stat_modified[index] = ktime_get_real_seconds(); } /** * tomoyo_read_stat - Read statistic data. 
* * @head: Pointer to "struct tomoyo_io_buffer". * * Returns nothing. */ static void tomoyo_read_stat(struct tomoyo_io_buffer *head) __must_hold(&head->io_sem) { u8 i; unsigned int total = 0; if (head->r.eof) return; for (i = 0; i < TOMOYO_MAX_POLICY_STAT; i++) { tomoyo_io_printf(head, "Policy %-30s %10u", tomoyo_policy_headers[i], atomic_read(&tomoyo_stat_updated[i])); if (tomoyo_stat_modified[i]) { struct tomoyo_time stamp; tomoyo_convert_time(tomoyo_stat_modified[i], &stamp); tomoyo_io_printf(head, " (Last: %04u/%02u/%02u %02u:%02u:%02u)", stamp.year, stamp.month, stamp.day, stamp.hour, stamp.min, stamp.sec); } tomoyo_set_lf(head); } for (i = 0; i < TOMOYO_MAX_MEMORY_STAT; i++) { unsigned int used = tomoyo_memory_used[i]; total += used; tomoyo_io_printf(head, "Memory used by %-22s %10u", tomoyo_memory_headers[i], used); used = tomoyo_memory_quota[i]; if (used) tomoyo_io_printf(head, " (Quota: %10u)", used); tomoyo_set_lf(head); } tomoyo_io_printf(head, "Total memory used: %10u\n", total); head->r.eof = true; } /** * tomoyo_write_stat - Set memory quota. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns 0. */ static int tomoyo_write_stat(struct tomoyo_io_buffer *head) __must_hold(&head->io_sem) { char *data = head->write_buf; u8 i; if (tomoyo_str_starts(&data, "Memory used by ")) for (i = 0; i < TOMOYO_MAX_MEMORY_STAT; i++) if (tomoyo_str_starts(&data, tomoyo_memory_headers[i])) sscanf(data, "%u", &tomoyo_memory_quota[i]); return 0; } /** * tomoyo_open_control - open() for /sys/kernel/security/tomoyo/ interface. * * @type: Type of interface. * @file: Pointer to "struct file". * * Returns 0 on success, negative value otherwise. */ int tomoyo_open_control(const u8 type, struct file *file) { struct tomoyo_io_buffer *head = kzalloc_obj(*head, GFP_NOFS); if (!head) return -ENOMEM; guard(mutex_init)(&head->io_sem); head->type = type; switch (type) { case TOMOYO_DOMAINPOLICY: /* /sys/kernel/security/tomoyo/domain_policy */ head->write = tomoyo_write_domain; head->read = tomoyo_read_domain; break; case TOMOYO_EXCEPTIONPOLICY: /* /sys/kernel/security/tomoyo/exception_policy */ head->write = tomoyo_write_exception; head->read = tomoyo_read_exception; break; case TOMOYO_AUDIT: /* /sys/kernel/security/tomoyo/audit */ head->poll = tomoyo_poll_log; head->read = tomoyo_read_log; break; case TOMOYO_PROCESS_STATUS: /* /sys/kernel/security/tomoyo/.process_status */ head->write = tomoyo_write_pid; head->read = tomoyo_read_pid; break; case TOMOYO_VERSION: /* /sys/kernel/security/tomoyo/version */ head->read = tomoyo_read_version; head->readbuf_size = 128; break; case TOMOYO_STAT: /* /sys/kernel/security/tomoyo/stat */ head->write = tomoyo_write_stat; head->read = tomoyo_read_stat; head->readbuf_size = 1024; break; case TOMOYO_PROFILE: /* /sys/kernel/security/tomoyo/profile */ head->write = tomoyo_write_profile; head->read = tomoyo_read_profile; break; case TOMOYO_QUERY: /* /sys/kernel/security/tomoyo/query */ head->poll = tomoyo_poll_query; head->write = tomoyo_write_answer; head->read = tomoyo_read_query; break; case TOMOYO_MANAGER: /* /sys/kernel/security/tomoyo/manager */ head->write = tomoyo_write_manager; head->read = tomoyo_read_manager; break; } if (!(file->f_mode & FMODE_READ)) { /* * No need to allocate read_buf since it is not opened * for reading. */ head->read = NULL; head->poll = NULL; } else if (!head->poll) { /* Don't allocate read_buf for poll() access. 
*/ if (!head->readbuf_size) head->readbuf_size = 4096 * 2; head->read_buf = kzalloc(head->readbuf_size, GFP_NOFS); if (!head->read_buf) { kfree(head); return -ENOMEM; } } if (!(file->f_mode & FMODE_WRITE)) { /* * No need to allocate write_buf since it is not opened * for writing. */ head->write = NULL; } else if (head->write) { head->writebuf_size = 4096 * 2; head->write_buf = kzalloc(head->writebuf_size, GFP_NOFS); if (!head->write_buf) { kfree(head->read_buf); kfree(head); return -ENOMEM; } } /* * If the file is /sys/kernel/security/tomoyo/query , increment the * observer counter. * The obserber counter is used by tomoyo_supervisor() to see if * there is some process monitoring /sys/kernel/security/tomoyo/query. */ if (type == TOMOYO_QUERY) atomic_inc(&tomoyo_query_observers); file->private_data = head; tomoyo_notify_gc(head, true); return 0; } /** * tomoyo_poll_control - poll() for /sys/kernel/security/tomoyo/ interface. * * @file: Pointer to "struct file". * @wait: Pointer to "poll_table". Maybe NULL. * * Returns EPOLLIN | EPOLLRDNORM | EPOLLOUT | EPOLLWRNORM if ready to read/write, * EPOLLOUT | EPOLLWRNORM otherwise. */ __poll_t tomoyo_poll_control(struct file *file, poll_table *wait) { struct tomoyo_io_buffer *head = file->private_data; if (head->poll) return head->poll(file, wait) | EPOLLOUT | EPOLLWRNORM; return EPOLLIN | EPOLLRDNORM | EPOLLOUT | EPOLLWRNORM; } /** * tomoyo_set_namespace_cursor - Set namespace to read. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns nothing. */ static inline void tomoyo_set_namespace_cursor(struct tomoyo_io_buffer *head) { struct list_head *ns; if (head->type != TOMOYO_EXCEPTIONPOLICY && head->type != TOMOYO_PROFILE) return; /* * If this is the first read, or reading previous namespace finished * and has more namespaces to read, update the namespace cursor. */ ns = head->r.ns; if (!ns || (head->r.eof && ns->next != &tomoyo_namespace_list)) { /* Clearing is OK because tomoyo_flush() returned true. */ memset(&head->r, 0, sizeof(head->r)); head->r.ns = ns ? ns->next : tomoyo_namespace_list.next; } } /** * tomoyo_has_more_namespace - Check for unread namespaces. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns true if we have more entries to print, false otherwise. */ static inline bool tomoyo_has_more_namespace(struct tomoyo_io_buffer *head) { return (head->type == TOMOYO_EXCEPTIONPOLICY || head->type == TOMOYO_PROFILE) && head->r.eof && head->r.ns->next != &tomoyo_namespace_list; } /** * tomoyo_read_control - read() for /sys/kernel/security/tomoyo/ interface. * * @head: Pointer to "struct tomoyo_io_buffer". * @buffer: Pointer to buffer to write to. * @buffer_len: Size of @buffer. * * Returns bytes read on success, negative value otherwise. */ ssize_t tomoyo_read_control(struct tomoyo_io_buffer *head, char __user *buffer, const int buffer_len) { int len; int idx; if (!head->read) return -EINVAL; if (mutex_lock_interruptible(&head->io_sem)) return -EINTR; head->read_user_buf = buffer; head->read_user_buf_avail = buffer_len; idx = tomoyo_read_lock(); if (tomoyo_flush(head)) /* Call the policy handler. */ do { tomoyo_set_namespace_cursor(head); head->read(head); } while (tomoyo_flush(head) && tomoyo_has_more_namespace(head)); tomoyo_read_unlock(idx); len = head->read_user_buf - buffer; mutex_unlock(&head->io_sem); return len; } /** * tomoyo_parse_policy - Parse a policy line. * * @head: Pointer to "struct tomoyo_io_buffer". * @line: Line to parse. * * Returns 0 on success, negative value otherwise. 
* * Caller holds tomoyo_read_lock(). */ static int tomoyo_parse_policy(struct tomoyo_io_buffer *head, char *line) __must_hold_shared(&tomoyo_ss) __must_hold(&head->io_sem) { /* Delete request? */ head->w.is_delete = !strncmp(line, "delete ", 7); if (head->w.is_delete) memmove(line, line + 7, strlen(line + 7) + 1); /* Selecting namespace to update. */ if (head->type == TOMOYO_EXCEPTIONPOLICY || head->type == TOMOYO_PROFILE) { if (*line == '<') { char *cp = strchr(line, ' '); if (cp) { *cp++ = '\0'; head->w.ns = tomoyo_assign_namespace(line); memmove(line, cp, strlen(cp) + 1); } else head->w.ns = NULL; } else head->w.ns = &tomoyo_kernel_namespace; /* Don't allow updating if namespace is invalid. */ if (!head->w.ns) return -ENOENT; } /* Do the update. */ return head->write(head); } /** * tomoyo_write_control - write() for /sys/kernel/security/tomoyo/ interface. * * @head: Pointer to "struct tomoyo_io_buffer". * @buffer: Pointer to buffer to read from. * @buffer_len: Size of @buffer. * * Returns @buffer_len on success, negative value otherwise. */ ssize_t tomoyo_write_control(struct tomoyo_io_buffer *head, const char __user *buffer, const int buffer_len) { int error = buffer_len; size_t avail_len = buffer_len; char *cp0; int idx; if (!head->write) return -EINVAL; if (mutex_lock_interruptible(&head->io_sem)) return -EINTR; cp0 = head->write_buf; head->read_user_buf_avail = 0; idx = tomoyo_read_lock(); /* Read a line and dispatch it to the policy handler. */ while (avail_len > 0) { char c; if (head->w.avail >= head->writebuf_size - 1) { const int len = head->writebuf_size * 2; char *cp = kzalloc(len, GFP_NOFS | __GFP_NOWARN); if (!cp) { error = -ENOMEM; break; } memmove(cp, cp0, head->w.avail); kfree(cp0); head->write_buf = cp; cp0 = cp; head->writebuf_size = len; } if (get_user(c, buffer)) { error = -EFAULT; break; } buffer++; avail_len--; cp0[head->w.avail++] = c; if (c != '\n') continue; cp0[head->w.avail - 1] = '\0'; head->w.avail = 0; tomoyo_normalize_line(cp0); if (!strcmp(cp0, "reset")) { head->w.ns = &tomoyo_kernel_namespace; head->w.domain = NULL; memset(&head->r, 0, sizeof(head->r)); continue; } /* Don't allow updating policies by non manager programs. */ switch (head->type) { case TOMOYO_PROCESS_STATUS: /* This does not write anything. */ break; case TOMOYO_DOMAINPOLICY: if (tomoyo_select_domain(head, cp0)) continue; fallthrough; case TOMOYO_EXCEPTIONPOLICY: if (!strcmp(cp0, "select transition_only")) { head->r.print_transition_related_only = true; continue; } fallthrough; default: if (!tomoyo_manager()) { error = -EPERM; goto out; } } switch (tomoyo_parse_policy(head, cp0)) { case -EPERM: error = -EPERM; goto out; case 0: switch (head->type) { case TOMOYO_DOMAINPOLICY: case TOMOYO_EXCEPTIONPOLICY: case TOMOYO_STAT: case TOMOYO_PROFILE: case TOMOYO_MANAGER: tomoyo_update_stat(TOMOYO_STAT_POLICY_UPDATES); break; default: break; } break; } } out: tomoyo_read_unlock(idx); mutex_unlock(&head->io_sem); return error; } /** * tomoyo_close_control - close() for /sys/kernel/security/tomoyo/ interface. * * @head: Pointer to "struct tomoyo_io_buffer". */ void tomoyo_close_control(struct tomoyo_io_buffer *head) { /* * If the file is /sys/kernel/security/tomoyo/query , decrement the * observer counter. */ if (head->type == TOMOYO_QUERY && atomic_dec_and_test(&tomoyo_query_observers)) wake_up_all(&tomoyo_answer_wait); tomoyo_notify_gc(head, false); } /** * tomoyo_check_profile - Check all profiles currently assigned to domains are defined. 
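 *
 * Returns nothing.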
*/ void tomoyo_check_profile(void) { struct tomoyo_domain_info *domain; const int idx = tomoyo_read_lock(); tomoyo_policy_loaded = true; pr_info("TOMOYO: 2.6.0\n"); list_for_each_entry_rcu(domain, &tomoyo_domain_list, list, srcu_read_lock_held(&tomoyo_ss)) { const u8 profile = domain->profile; struct tomoyo_policy_namespace *ns = domain->ns; if (ns->profile_version == 20110903) { pr_info_once("Converting profile version from %u to %u.\n", 20110903, 20150505); ns->profile_version = 20150505; } if (ns->profile_version != 20150505) pr_err("Profile version %u is not supported.\n", ns->profile_version); else if (!ns->profile_ptr[profile]) pr_err("Profile %u (used by '%s') is not defined.\n", profile, domain->domainname->name); else continue; pr_err("Userland tools for TOMOYO 2.6 must be installed and policy must be initialized.\n"); pr_err("Please see https://tomoyo.sourceforge.net/2.6/ for more information.\n"); panic("STOP!"); } tomoyo_read_unlock(idx); pr_info("Mandatory Access Control activated.\n"); } /** * tomoyo_load_builtin_policy - Load built-in policy. * * Returns nothing. */ void __init tomoyo_load_builtin_policy(void) { #ifdef CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING static char tomoyo_builtin_profile[] __initdata = "PROFILE_VERSION=20150505\n" "0-CONFIG={ mode=learning grant_log=no reject_log=yes }\n"; static char tomoyo_builtin_exception_policy[] __initdata = "aggregator proc:/self/exe /proc/self/exe\n"; static char tomoyo_builtin_domain_policy[] __initdata = ""; static char tomoyo_builtin_manager[] __initdata = ""; static char tomoyo_builtin_stat[] __initdata = ""; #else /* * This include file is manually created and contains built-in policy * named "tomoyo_builtin_profile", "tomoyo_builtin_exception_policy", * "tomoyo_builtin_domain_policy", "tomoyo_builtin_manager", * "tomoyo_builtin_stat" in the form of "static char [] __initdata". */ #include "builtin-policy.h" #endif u8 i; const int idx = tomoyo_read_lock(); for (i = 0; i < 5; i++) { struct tomoyo_io_buffer head = { }; char *start = ""; switch (i) { case 0: start = tomoyo_builtin_profile; head.type = TOMOYO_PROFILE; head.write = tomoyo_write_profile; break; case 1: start = tomoyo_builtin_exception_policy; head.type = TOMOYO_EXCEPTIONPOLICY; head.write = tomoyo_write_exception; break; case 2: start = tomoyo_builtin_domain_policy; head.type = TOMOYO_DOMAINPOLICY; head.write = tomoyo_write_domain; break; case 3: start = tomoyo_builtin_manager; head.type = TOMOYO_MANAGER; head.write = tomoyo_write_manager; break; case 4: start = tomoyo_builtin_stat; head.type = TOMOYO_STAT; head.write = tomoyo_write_stat; break; } while (1) { char *end = strchr(start, '\n'); if (!end) break; *end = '\0'; tomoyo_normalize_line(start); /* head is stack-local and not shared. */ context_unsafe( head.write_buf = start; tomoyo_parse_policy(&head, start); ); start = end + 1; } } tomoyo_read_unlock(idx); #ifdef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER tomoyo_check_profile(); #endif } |
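/*
 * A minimal userspace sketch of the line-oriented protocol accepted by
 * tomoyo_write_control() above: each newline-terminated directive is passed to
 * tomoyo_parse_policy() and dispatched to the per-interface write handler
 * (here tomoyo_write_profile()). The profile lines mirror the built-in
 * learning-mode profile used by tomoyo_load_builtin_policy(). It assumes
 * securityfs is mounted at /sys/kernel/security and that the calling program
 * is a registered policy manager (see tomoyo_manager()).
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* One directive per line; tomoyo_write_control() dispatches each line separately. */
	static const char profile[] =
		"PROFILE_VERSION=20150505\n"
		"0-COMMENT=example learning profile\n"
		"0-CONFIG={ mode=learning grant_log=no reject_log=yes }\n";
	int fd = open("/sys/kernel/security/tomoyo/profile", O_WRONLY);

	if (fd < 0) {
		perror("open /sys/kernel/security/tomoyo/profile");
		return 1;
	}
	if (write(fd, profile, sizeof(profile) - 1) < 0)
		perror("write");
	close(fd);
	return 0;
}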
// SPDX-License-Identifier: GPL-2.0
/*
 * class.c - basic device class management
 *
 * Copyright (c) 2002-3 Patrick Mochel
 * Copyright (c) 2002-3 Open Source Development Labs
 * Copyright (c) 2003-2004 Greg Kroah-Hartman
 * Copyright (c) 2003-2004 IBM Corp.
 */

#include <linux/device/class.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/kdev_t.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/mutex.h>
#include "base.h"

/* /sys/class */
static struct kset *class_kset;

#define to_class_attr(_attr) container_of(_attr, struct class_attribute, attr)

/**
 * class_to_subsys - Turn a struct class into a struct subsys_private
 *
 * @class: pointer to the struct class to look up
 *
 * The driver core internals need to work on the subsys_private structure, not
 * the external struct class pointer.
This function walks the list of * registered classes in the system and finds the matching one and returns the * internal struct subsys_private that relates to that class. * * Note, the reference count of the return value is INCREMENTED if it is not * NULL. A call to subsys_put() must be done when finished with the pointer in * order for it to be properly freed. */ struct subsys_private *class_to_subsys(const struct class *class) { struct subsys_private *sp = NULL; struct kobject *kobj; if (!class || !class_kset) return NULL; spin_lock(&class_kset->list_lock); if (list_empty(&class_kset->list)) goto done; list_for_each_entry(kobj, &class_kset->list, entry) { struct kset *kset = container_of(kobj, struct kset, kobj); sp = container_of_const(kset, struct subsys_private, subsys); if (sp->class == class) goto done; } sp = NULL; done: sp = subsys_get(sp); spin_unlock(&class_kset->list_lock); return sp; } static ssize_t class_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct class_attribute *class_attr = to_class_attr(attr); struct subsys_private *cp = to_subsys_private(kobj); ssize_t ret = -EIO; if (class_attr->show) ret = class_attr->show(cp->class, class_attr, buf); return ret; } static ssize_t class_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct class_attribute *class_attr = to_class_attr(attr); struct subsys_private *cp = to_subsys_private(kobj); ssize_t ret = -EIO; if (class_attr->store) ret = class_attr->store(cp->class, class_attr, buf, count); return ret; } static void class_release(struct kobject *kobj) { struct subsys_private *cp = to_subsys_private(kobj); const struct class *class = cp->class; pr_debug("class '%s': release.\n", class->name); if (class->class_release) class->class_release(class); else pr_debug("class '%s' does not have a release() function, " "be careful\n", class->name); lockdep_unregister_key(&cp->lock_key); kfree(cp); } static const struct kobj_ns_type_operations *class_child_ns_type(const struct kobject *kobj) { const struct subsys_private *cp = to_subsys_private(kobj); const struct class *class = cp->class; return class->ns_type; } static const struct sysfs_ops class_sysfs_ops = { .show = class_attr_show, .store = class_attr_store, }; static const struct kobj_type class_ktype = { .sysfs_ops = &class_sysfs_ops, .release = class_release, .child_ns_type = class_child_ns_type, }; int class_create_file_ns(const struct class *cls, const struct class_attribute *attr, const struct ns_common *ns) { struct subsys_private *sp = class_to_subsys(cls); int error; if (!sp) return -EINVAL; error = sysfs_create_file_ns(&sp->subsys.kobj, &attr->attr, ns); subsys_put(sp); return error; } EXPORT_SYMBOL_GPL(class_create_file_ns); void class_remove_file_ns(const struct class *cls, const struct class_attribute *attr, const struct ns_common *ns) { struct subsys_private *sp = class_to_subsys(cls); if (!sp) return; sysfs_remove_file_ns(&sp->subsys.kobj, &attr->attr, ns); subsys_put(sp); } EXPORT_SYMBOL_GPL(class_remove_file_ns); static struct device *klist_class_to_dev(struct klist_node *n) { struct device_private *p = to_device_private_class(n); return p->device; } static void klist_class_dev_get(struct klist_node *n) { struct device *dev = klist_class_to_dev(n); get_device(dev); } static void klist_class_dev_put(struct klist_node *n) { struct device *dev = klist_class_to_dev(n); put_device(dev); } int class_register(const struct class *cls) { struct subsys_private *cp; struct lock_class_key *key; int error; 
pr_debug("device class '%s': registering\n", cls->name); if (cls->ns_type && !cls->namespace) { pr_err("%s: class '%s' does not have namespace\n", __func__, cls->name); return -EINVAL; } if (!cls->ns_type && cls->namespace) { pr_err("%s: class '%s' does not have ns_type\n", __func__, cls->name); return -EINVAL; } cp = kzalloc_obj(*cp); if (!cp) return -ENOMEM; klist_init(&cp->klist_devices, klist_class_dev_get, klist_class_dev_put); INIT_LIST_HEAD(&cp->interfaces); kset_init(&cp->glue_dirs); key = &cp->lock_key; lockdep_register_key(key); __mutex_init(&cp->mutex, "subsys mutex", key); error = kobject_set_name(&cp->subsys.kobj, "%s", cls->name); if (error) goto err_out; cp->subsys.kobj.kset = class_kset; cp->subsys.kobj.ktype = &class_ktype; cp->class = cls; error = kset_register(&cp->subsys); if (error) goto err_out; error = sysfs_create_groups(&cp->subsys.kobj, cls->class_groups); if (error) { kobject_del(&cp->subsys.kobj); kfree_const(cp->subsys.kobj.name); goto err_out; } return 0; err_out: lockdep_unregister_key(key); kfree(cp); return error; } EXPORT_SYMBOL_GPL(class_register); void class_unregister(const struct class *cls) { struct subsys_private *sp = class_to_subsys(cls); if (!sp) return; pr_debug("device class '%s': unregistering\n", cls->name); sysfs_remove_groups(&sp->subsys.kobj, cls->class_groups); kset_unregister(&sp->subsys); subsys_put(sp); } EXPORT_SYMBOL_GPL(class_unregister); static void class_create_release(const struct class *cls) { pr_debug("%s called for %s\n", __func__, cls->name); kfree(cls); } /** * class_create - create a struct class structure * @name: pointer to a string for the name of this class. * * This is used to create a struct class pointer that can then be used * in calls to device_create(). * * Returns &struct class pointer on success, or ERR_PTR() on error. * * Note, the pointer created here is to be destroyed when finished by * making a call to class_destroy(). */ struct class *class_create(const char *name) { struct class *cls; int retval; cls = kzalloc_obj(*cls); if (!cls) { retval = -ENOMEM; goto error; } cls->name = name; cls->class_release = class_create_release; retval = class_register(cls); if (retval) goto error; return cls; error: kfree(cls); return ERR_PTR(retval); } EXPORT_SYMBOL_GPL(class_create); /** * class_destroy - destroys a struct class structure * @cls: pointer to the struct class that is to be destroyed * * Note, the pointer to be destroyed must have been created with a call * to class_create(). */ void class_destroy(const struct class *cls) { if (IS_ERR_OR_NULL(cls)) return; class_unregister(cls); } EXPORT_SYMBOL_GPL(class_destroy); /** * class_dev_iter_init - initialize class device iterator * @iter: class iterator to initialize * @class: the class we wanna iterate over * @start: the device to start iterating from, if any * @type: device_type of the devices to iterate over, NULL for all * * Initialize class iterator @iter such that it iterates over devices * of @class. If @start is set, the list iteration will start there, * otherwise if it is NULL, the iteration starts at the beginning of * the list. 
*/ void class_dev_iter_init(struct class_dev_iter *iter, const struct class *class, const struct device *start, const struct device_type *type) { struct subsys_private *sp = class_to_subsys(class); struct klist_node *start_knode = NULL; memset(iter, 0, sizeof(*iter)); if (!sp) { pr_crit("%s: class %p was not registered yet\n", __func__, class); return; } if (start) start_knode = &start->p->knode_class; klist_iter_init_node(&sp->klist_devices, &iter->ki, start_knode); iter->type = type; iter->sp = sp; } EXPORT_SYMBOL_GPL(class_dev_iter_init); /** * class_dev_iter_next - iterate to the next device * @iter: class iterator to proceed * * Proceed @iter to the next device and return it. Returns NULL if * iteration is complete. * * The returned device is referenced and won't be released till * iterator is proceed to the next device or exited. The caller is * free to do whatever it wants to do with the device including * calling back into class code. */ struct device *class_dev_iter_next(struct class_dev_iter *iter) { struct klist_node *knode; struct device *dev; if (!iter->sp) return NULL; while (1) { knode = klist_next(&iter->ki); if (!knode) return NULL; dev = klist_class_to_dev(knode); if (!iter->type || iter->type == dev->type) return dev; } } EXPORT_SYMBOL_GPL(class_dev_iter_next); /** * class_dev_iter_exit - finish iteration * @iter: class iterator to finish * * Finish an iteration. Always call this function after iteration is * complete whether the iteration ran till the end or not. */ void class_dev_iter_exit(struct class_dev_iter *iter) { klist_iter_exit(&iter->ki); subsys_put(iter->sp); } EXPORT_SYMBOL_GPL(class_dev_iter_exit); /** * class_for_each_device - device iterator * @class: the class we're iterating * @start: the device to start with in the list, if any. * @data: data for the callback * @fn: function to be called for each device * * Iterate over @class's list of devices, and call @fn for each, * passing it @data. If @start is set, the list iteration will start * there, otherwise if it is NULL, the iteration starts at the * beginning of the list. * * We check the return of @fn each time. If it returns anything * other than 0, we break out and return that value. * * @fn is allowed to do anything including calling back into class * code. There's no locking restriction. */ int class_for_each_device(const struct class *class, const struct device *start, void *data, device_iter_t fn) { struct subsys_private *sp = class_to_subsys(class); struct class_dev_iter iter; struct device *dev; int error = 0; if (!class) return -EINVAL; if (!sp) { WARN(1, "%s called for class '%s' before it was registered", __func__, class->name); return -EINVAL; } class_dev_iter_init(&iter, class, start, NULL); while ((dev = class_dev_iter_next(&iter))) { error = fn(dev, data); if (error) break; } class_dev_iter_exit(&iter); subsys_put(sp); return error; } EXPORT_SYMBOL_GPL(class_for_each_device); /** * class_find_device - device iterator for locating a particular device * @class: the class we're iterating * @start: Device to begin with * @data: data for the match function * @match: function to check device * * This is similar to the class_for_each_dev() function above, but it * returns a reference to a device that is 'found' for later use, as * determined by the @match callback. * * The callback should return 0 if the device doesn't match and non-zero * if it does. If the callback returns non-zero, this function will * return to the caller and not iterate over any more devices. 
* * Note, you will need to drop the reference with put_device() after use. * * @match is allowed to do anything including calling back into class * code. There's no locking restriction. */ struct device *class_find_device(const struct class *class, const struct device *start, const void *data, device_match_t match) { struct subsys_private *sp = class_to_subsys(class); struct class_dev_iter iter; struct device *dev; if (!class) return NULL; if (!sp) { WARN(1, "%s called for class '%s' before it was registered", __func__, class->name); return NULL; } class_dev_iter_init(&iter, class, start, NULL); while ((dev = class_dev_iter_next(&iter))) { if (match(dev, data)) { get_device(dev); break; } } class_dev_iter_exit(&iter); subsys_put(sp); return dev; } EXPORT_SYMBOL_GPL(class_find_device); int class_interface_register(struct class_interface *class_intf) { struct subsys_private *sp; const struct class *parent; struct class_dev_iter iter; struct device *dev; if (!class_intf || !class_intf->class) return -ENODEV; parent = class_intf->class; sp = class_to_subsys(parent); if (!sp) return -EINVAL; /* * Reference in sp is now incremented and will be dropped when * the interface is removed in the call to class_interface_unregister() */ mutex_lock(&sp->mutex); list_add_tail(&class_intf->node, &sp->interfaces); if (class_intf->add_dev) { class_dev_iter_init(&iter, parent, NULL, NULL); while ((dev = class_dev_iter_next(&iter))) class_intf->add_dev(dev); class_dev_iter_exit(&iter); } mutex_unlock(&sp->mutex); return 0; } EXPORT_SYMBOL_GPL(class_interface_register); void class_interface_unregister(struct class_interface *class_intf) { struct subsys_private *sp; const struct class *parent = class_intf->class; struct class_dev_iter iter; struct device *dev; if (!parent) return; sp = class_to_subsys(parent); if (!sp) return; mutex_lock(&sp->mutex); list_del_init(&class_intf->node); if (class_intf->remove_dev) { class_dev_iter_init(&iter, parent, NULL, NULL); while ((dev = class_dev_iter_next(&iter))) class_intf->remove_dev(dev); class_dev_iter_exit(&iter); } mutex_unlock(&sp->mutex); /* * Decrement the reference count twice, once for the class_to_subsys() * call in the start of this function, and the second one from the * reference increment in class_interface_register() */ subsys_put(sp); subsys_put(sp); } EXPORT_SYMBOL_GPL(class_interface_unregister); ssize_t show_class_attr_string(const struct class *class, const struct class_attribute *attr, char *buf) { struct class_attribute_string *cs; cs = container_of(attr, struct class_attribute_string, attr); return sysfs_emit(buf, "%s\n", cs->str); } EXPORT_SYMBOL_GPL(show_class_attr_string); struct class_compat { struct kobject *kobj; }; /** * class_compat_register - register a compatibility class * @name: the name of the class * * Compatibility class are meant as a temporary user-space compatibility * workaround when converting a family of class devices to a bus devices. 
*/ struct class_compat *class_compat_register(const char *name) { struct class_compat *cls; cls = kmalloc_obj(struct class_compat); if (!cls) return NULL; cls->kobj = kobject_create_and_add(name, &class_kset->kobj); if (!cls->kobj) { kfree(cls); return NULL; } return cls; } EXPORT_SYMBOL_GPL(class_compat_register); /** * class_compat_unregister - unregister a compatibility class * @cls: the class to unregister */ void class_compat_unregister(struct class_compat *cls) { kobject_put(cls->kobj); kfree(cls); } EXPORT_SYMBOL_GPL(class_compat_unregister); /** * class_compat_create_link - create a compatibility class device link to * a bus device * @cls: the compatibility class * @dev: the target bus device */ int class_compat_create_link(struct class_compat *cls, struct device *dev) { return sysfs_create_link(cls->kobj, &dev->kobj, dev_name(dev)); } EXPORT_SYMBOL_GPL(class_compat_create_link); /** * class_compat_remove_link - remove a compatibility class device link to * a bus device * @cls: the compatibility class * @dev: the target bus device */ void class_compat_remove_link(struct class_compat *cls, struct device *dev) { sysfs_remove_link(cls->kobj, dev_name(dev)); } EXPORT_SYMBOL_GPL(class_compat_remove_link); /** * class_is_registered - determine if at this moment in time, a class is * registered in the driver core or not. * @class: the class to check * * Returns a boolean to state if the class is registered in the driver core * or not. Note that the value could switch right after this call is made, * so only use this in places where you "know" it is safe to do so (usually * to determine if the specific class has been registered yet or not). * * Be careful in using this. */ bool class_is_registered(const struct class *class) { struct subsys_private *sp = class_to_subsys(class); bool is_initialized = false; if (sp) { is_initialized = true; subsys_put(sp); } return is_initialized; } EXPORT_SYMBOL_GPL(class_is_registered); int __init classes_init(void) { class_kset = kset_create_and_add("class", NULL, NULL); if (!class_kset) return -ENOMEM; return 0; } |
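End of class.c. As a quick orientation for the class_create()/class_destroy() lifecycle documented above, here is a minimal usage sketch; it is not part of the file, and the example_* names are hypothetical placeholders.

/* Hypothetical consumer of the class API above. */
#include <linux/module.h>
#include <linux/device/class.h>

static struct class *example_class;

static int __init example_init(void)
{
	/* class_create() returns ERR_PTR() on failure, never NULL. */
	example_class = class_create("example");
	if (IS_ERR(example_class))
		return PTR_ERR(example_class);
	/* Devices would then be added with device_create(example_class, ...). */
	return 0;
}

static void __exit example_exit(void)
{
	/* Pairs with class_create(); see class_destroy() above. */
	class_destroy(example_class);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");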
// SPDX-License-Identifier: GPL-2.0-only
/* tunnel4.c: Generic IP tunnel transformer.
 *
 * Copyright (C) 2003 David S. Miller (davem@redhat.com)
 */

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/mpls.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/xfrm.h>

static struct xfrm_tunnel __rcu *tunnel4_handlers __read_mostly;
static struct xfrm_tunnel __rcu *tunnel64_handlers __read_mostly;
static struct xfrm_tunnel __rcu *tunnelmpls4_handlers __read_mostly;

static DEFINE_MUTEX(tunnel4_mutex);

static inline struct xfrm_tunnel __rcu **fam_handlers(unsigned short family)
{
	return (family == AF_INET) ? &tunnel4_handlers :
		(family == AF_INET6) ?
&tunnel64_handlers : &tunnelmpls4_handlers; } int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family) { struct xfrm_tunnel __rcu **pprev; struct xfrm_tunnel *t; int ret = -EEXIST; int priority = handler->priority; mutex_lock(&tunnel4_mutex); for (pprev = fam_handlers(family); (t = rcu_dereference_protected(*pprev, lockdep_is_held(&tunnel4_mutex))) != NULL; pprev = &t->next) { if (t->priority > priority) break; if (t->priority == priority) goto err; } handler->next = *pprev; rcu_assign_pointer(*pprev, handler); ret = 0; err: mutex_unlock(&tunnel4_mutex); return ret; } EXPORT_SYMBOL(xfrm4_tunnel_register); int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family) { struct xfrm_tunnel __rcu **pprev; struct xfrm_tunnel *t; int ret = -ENOENT; mutex_lock(&tunnel4_mutex); for (pprev = fam_handlers(family); (t = rcu_dereference_protected(*pprev, lockdep_is_held(&tunnel4_mutex))) != NULL; pprev = &t->next) { if (t == handler) { *pprev = handler->next; ret = 0; break; } } mutex_unlock(&tunnel4_mutex); synchronize_net(); return ret; } EXPORT_SYMBOL(xfrm4_tunnel_deregister); #define for_each_tunnel_rcu(head, handler) \ for (handler = rcu_dereference(head); \ handler != NULL; \ handler = rcu_dereference(handler->next)) \ static int tunnel4_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto drop; for_each_tunnel_rcu(tunnel4_handlers, handler) if (!handler->handler(skb)) return 0; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) static int tunnel4_rcv_cb(struct sk_buff *skb, u8 proto, int err) { struct xfrm_tunnel __rcu *head; struct xfrm_tunnel *handler; int ret; head = (proto == IPPROTO_IPIP) ? 
tunnel4_handlers : tunnel64_handlers; for_each_tunnel_rcu(head, handler) { if (handler->cb_handler) { ret = handler->cb_handler(skb, err); if (ret <= 0) return ret; } } return 0; } static const struct xfrm_input_afinfo tunnel4_input_afinfo = { .family = AF_INET, .is_ipip = true, .callback = tunnel4_rcv_cb, }; #endif #if IS_ENABLED(CONFIG_IPV6) static int tunnel64_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; for_each_tunnel_rcu(tunnel64_handlers, handler) if (!handler->handler(skb)) return 0; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } #endif #if IS_ENABLED(CONFIG_MPLS) static int tunnelmpls4_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; if (!pskb_may_pull(skb, sizeof(struct mpls_label))) goto drop; for_each_tunnel_rcu(tunnelmpls4_handlers, handler) if (!handler->handler(skb)) return 0; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } #endif static int tunnel4_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnel4_handlers, handler) if (!handler->err_handler(skb, info)) return 0; return -ENOENT; } #if IS_ENABLED(CONFIG_IPV6) static int tunnel64_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnel64_handlers, handler) if (!handler->err_handler(skb, info)) return 0; return -ENOENT; } #endif #if IS_ENABLED(CONFIG_MPLS) static int tunnelmpls4_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnelmpls4_handlers, handler) if (!handler->err_handler(skb, info)) return 0; return -ENOENT; } #endif static const struct net_protocol tunnel4_protocol = { .handler = tunnel4_rcv, .err_handler = tunnel4_err, .no_policy = 1, }; #if IS_ENABLED(CONFIG_IPV6) static const struct net_protocol tunnel64_protocol = { .handler = tunnel64_rcv, .err_handler = tunnel64_err, .no_policy = 1, }; #endif #if IS_ENABLED(CONFIG_MPLS) static const struct net_protocol tunnelmpls4_protocol = { .handler = tunnelmpls4_rcv, .err_handler = tunnelmpls4_err, .no_policy = 1, }; #endif static int __init tunnel4_init(void) { if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP)) goto err; #if IS_ENABLED(CONFIG_IPV6) if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) { inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); goto err; } #endif #if IS_ENABLED(CONFIG_MPLS) if (inet_add_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS)) { inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); #if IS_ENABLED(CONFIG_IPV6) inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6); #endif goto err; } #endif #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) if (xfrm_input_register_afinfo(&tunnel4_input_afinfo)) { inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); #if IS_ENABLED(CONFIG_IPV6) inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6); #endif #if IS_ENABLED(CONFIG_MPLS) inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS); #endif goto err; } #endif return 0; err: pr_err("%s: can't add protocol\n", __func__); return -EAGAIN; } static void __exit tunnel4_fini(void) { #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) if (xfrm_input_unregister_afinfo(&tunnel4_input_afinfo)) pr_err("tunnel4 close: can't remove input afinfo\n"); #endif #if IS_ENABLED(CONFIG_MPLS) if (inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS)) pr_err("tunnelmpls4 close: can't remove protocol\n"); #endif #if IS_ENABLED(CONFIG_IPV6) if (inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6)) pr_err("tunnel64 close: 
can't remove protocol\n");
#endif
	if (inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP))
		pr_err("tunnel4 close: can't remove protocol\n");
}

module_init(tunnel4_init);
module_exit(tunnel4_fini);
MODULE_DESCRIPTION("IPv4 XFRM tunnel library");
MODULE_LICENSE("GPL");
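For context on how this library is consumed: a tunnel driver registers a struct xfrm_tunnel with xfrm4_tunnel_register(); handlers are kept sorted by ->priority and tried in order until one returns 0. Below is a minimal, hypothetical registration sketch with stub handler bodies and an illustrative priority value.

/* Hypothetical xfrm_tunnel consumer; handler bodies are stubs. */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <net/xfrm.h>

static int example_tunnel_rcv(struct sk_buff *skb)
{
	/* Non-zero means "not consumed": tunnel4_rcv() tries the next handler. */
	return -1;
}

static int example_tunnel_err(struct sk_buff *skb, u32 info)
{
	return -ENOENT;
}

static struct xfrm_tunnel example_tunnel_handler __read_mostly = {
	.handler	= example_tunnel_rcv,
	.err_handler	= example_tunnel_err,
	/* Lower value is tried first; an equal value makes registration fail with -EEXIST. */
	.priority	= 1,
};

static int __init example_init(void)
{
	return xfrm4_tunnel_register(&example_tunnel_handler, AF_INET);
}

static void __exit example_exit(void)
{
	xfrm4_tunnel_deregister(&example_tunnel_handler, AF_INET);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");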
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/signal.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  1997-11-02  Modified for POSIX.1b signals by Richard Henderson
 *
 *  2003-06-02  Jim Houston - Concurrent Computer Corp.
 *		Changes to use preallocated sigqueue structures
 *		to allow signals to be sent reliably.
*/ #include <linux/slab.h> #include <linux/export.h> #include <linux/init.h> #include <linux/sched/mm.h> #include <linux/sched/user.h> #include <linux/sched/debug.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/proc_fs.h> #include <linux/tty.h> #include <linux/binfmts.h> #include <linux/coredump.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/ptrace.h> #include <linux/signal.h> #include <linux/signalfd.h> #include <linux/ratelimit.h> #include <linux/task_work.h> #include <linux/capability.h> #include <linux/freezer.h> #include <linux/pid_namespace.h> #include <linux/nsproxy.h> #include <linux/user_namespace.h> #include <linux/uprobes.h> #include <linux/compat.h> #include <linux/cn_proc.h> #include <linux/compiler.h> #include <linux/posix-timers.h> #include <linux/cgroup.h> #include <linux/audit.h> #include <linux/sysctl.h> #include <uapi/linux/pidfd.h> #define CREATE_TRACE_POINTS #include <trace/events/signal.h> #include <asm/param.h> #include <linux/uaccess.h> #include <asm/unistd.h> #include <asm/siginfo.h> #include <asm/cacheflush.h> #include <asm/syscall.h> /* for syscall_get_* */ #include "time/posix-timers.h" /* * SLAB caches for signal bits. */ static struct kmem_cache *sigqueue_cachep; int print_fatal_signals __read_mostly; static void __user *sig_handler(struct task_struct *t, int sig) { return t->sighand->action[sig - 1].sa.sa_handler; } static inline bool sig_handler_ignored(void __user *handler, int sig) { /* Is it explicitly or implicitly ignored? */ return handler == SIG_IGN || (handler == SIG_DFL && sig_kernel_ignore(sig)); } static bool sig_task_ignored(struct task_struct *t, int sig, bool force) { void __user *handler; handler = sig_handler(t, sig); /* SIGKILL and SIGSTOP may not be sent to the global init */ if (unlikely(is_global_init(t) && sig_kernel_only(sig))) return true; if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && handler == SIG_DFL && !(force && sig_kernel_only(sig))) return true; /* Only allow kernel generated signals to this kthread */ if (unlikely((t->flags & PF_KTHREAD) && (handler == SIG_KTHREAD_KERNEL) && !force)) return true; return sig_handler_ignored(handler, sig); } static bool sig_ignored(struct task_struct *t, int sig, bool force) { /* * Blocked signals are never ignored, since the * signal handler may change by the time it is * unblocked. */ if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) return false; /* * Tracers may want to know about even ignored signal unless it * is SIGKILL which can't be reported anyway but can be ignored * by SIGNAL_UNKILLABLE task. */ if (t->ptrace && sig != SIGKILL) return false; return sig_task_ignored(t, sig, force); } /* * Re-calculate pending state from the set of locally pending * signals, globally pending signals, and blocked signals. 
*/ static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked) { unsigned long ready; long i; switch (_NSIG_WORDS) { default: for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;) ready |= signal->sig[i] &~ blocked->sig[i]; break; case 4: ready = signal->sig[3] &~ blocked->sig[3]; ready |= signal->sig[2] &~ blocked->sig[2]; ready |= signal->sig[1] &~ blocked->sig[1]; ready |= signal->sig[0] &~ blocked->sig[0]; break; case 2: ready = signal->sig[1] &~ blocked->sig[1]; ready |= signal->sig[0] &~ blocked->sig[0]; break; case 1: ready = signal->sig[0] &~ blocked->sig[0]; } return ready != 0; } #define PENDING(p,b) has_pending_signals(&(p)->signal, (b)) static bool recalc_sigpending_tsk(struct task_struct *t) { if ((t->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) || PENDING(&t->pending, &t->blocked) || PENDING(&t->signal->shared_pending, &t->blocked) || cgroup_task_frozen(t)) { set_tsk_thread_flag(t, TIF_SIGPENDING); return true; } /* * We must never clear the flag in another thread, or in current * when it's possible the current syscall is returning -ERESTART*. * So we don't clear it here, and only callers who know they should do. */ return false; } void recalc_sigpending(void) { if (!recalc_sigpending_tsk(current) && !freezing(current)) { if (unlikely(test_thread_flag(TIF_SIGPENDING))) clear_thread_flag(TIF_SIGPENDING); } } EXPORT_SYMBOL(recalc_sigpending); void calculate_sigpending(void) { /* Have any signals or users of TIF_SIGPENDING been delayed * until after fork? */ spin_lock_irq(¤t->sighand->siglock); set_tsk_thread_flag(current, TIF_SIGPENDING); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); } /* Given the mask, find the first available signal that should be serviced. */ #define SYNCHRONOUS_MASK \ (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS)) int next_signal(struct sigpending *pending, sigset_t *mask) { unsigned long i, *s, *m, x; int sig = 0; s = pending->signal.sig; m = mask->sig; /* * Handle the first word specially: it contains the * synchronous signals that need to be dequeued first. */ x = *s &~ *m; if (x) { if (x & SYNCHRONOUS_MASK) x &= SYNCHRONOUS_MASK; sig = ffz(~x) + 1; return sig; } switch (_NSIG_WORDS) { default: for (i = 1; i < _NSIG_WORDS; ++i) { x = *++s &~ *++m; if (!x) continue; sig = ffz(~x) + i*_NSIG_BPW + 1; break; } break; case 2: x = s[1] &~ m[1]; if (!x) break; sig = ffz(~x) + _NSIG_BPW + 1; break; case 1: /* Nothing to do */ break; } return sig; } static inline void print_dropped_signal(int sig) { static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); if (!print_fatal_signals) return; if (!__ratelimit(&ratelimit_state)) return; pr_info("%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n", current->comm, current->pid, sig); } /** * task_set_jobctl_pending - set jobctl pending bits * @task: target task * @mask: pending bits to set * * Clear @mask from @task->jobctl. @mask must be subset of * %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK | * %JOBCTL_TRAPPING. If stop signo is being set, the existing signo is * cleared. If @task is already being killed or exiting, this function * becomes noop. * * CONTEXT: * Must be called with @task->sighand->siglock held. * * RETURNS: * %true if @mask is set, %false if made noop because @task was dying. 
*/ bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask) { BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME | JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING)); BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK)); if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING))) return false; if (mask & JOBCTL_STOP_SIGMASK) task->jobctl &= ~JOBCTL_STOP_SIGMASK; task->jobctl |= mask; return true; } /** * task_clear_jobctl_trapping - clear jobctl trapping bit * @task: target task * * If JOBCTL_TRAPPING is set, a ptracer is waiting for us to enter TRACED. * Clear it and wake up the ptracer. Note that we don't need any further * locking. @task->siglock guarantees that @task->parent points to the * ptracer. * * CONTEXT: * Must be called with @task->sighand->siglock held. */ void task_clear_jobctl_trapping(struct task_struct *task) { if (unlikely(task->jobctl & JOBCTL_TRAPPING)) { task->jobctl &= ~JOBCTL_TRAPPING; smp_mb(); /* advised by wake_up_bit() */ wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT); } } /** * task_clear_jobctl_pending - clear jobctl pending bits * @task: target task * @mask: pending bits to clear * * Clear @mask from @task->jobctl. @mask must be subset of * %JOBCTL_PENDING_MASK. If %JOBCTL_STOP_PENDING is being cleared, other * STOP bits are cleared together. * * If clearing of @mask leaves no stop or trap pending, this function calls * task_clear_jobctl_trapping(). * * CONTEXT: * Must be called with @task->sighand->siglock held. */ void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask) { BUG_ON(mask & ~JOBCTL_PENDING_MASK); if (mask & JOBCTL_STOP_PENDING) mask |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED; task->jobctl &= ~mask; if (!(task->jobctl & JOBCTL_PENDING_MASK)) task_clear_jobctl_trapping(task); } /** * task_participate_group_stop - participate in a group stop * @task: task participating in a group stop * * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop. * Group stop states are cleared and the group stop count is consumed if * %JOBCTL_STOP_CONSUME was set. If the consumption completes the group * stop, the appropriate `SIGNAL_*` flags are set. * * CONTEXT: * Must be called with @task->sighand->siglock held. * * RETURNS: * %true if group stop completion should be notified to the parent, %false * otherwise. */ static bool task_participate_group_stop(struct task_struct *task) { struct signal_struct *sig = task->signal; bool consume = task->jobctl & JOBCTL_STOP_CONSUME; WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING)); task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING); if (!consume) return false; if (!WARN_ON_ONCE(sig->group_stop_count == 0)) sig->group_stop_count--; /* * Tell the caller to notify completion iff we are entering into a * fresh group stop. Read comment in do_signal_stop() for details. 
*/ if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) { signal_set_stop_flags(sig, SIGNAL_STOP_STOPPED); return true; } return false; } void task_join_group_stop(struct task_struct *task) { unsigned long mask = current->jobctl & JOBCTL_STOP_SIGMASK; struct signal_struct *sig = current->signal; if (sig->group_stop_count) { sig->group_stop_count++; mask |= JOBCTL_STOP_CONSUME; } else if (!(sig->flags & SIGNAL_STOP_STOPPED)) return; /* Have the new thread join an on-going signal group stop */ task_set_jobctl_pending(task, mask | JOBCTL_STOP_PENDING); } static struct ucounts *sig_get_ucounts(struct task_struct *t, int sig, int override_rlimit) { struct ucounts *ucounts; long sigpending; /* * Protect access to @t credentials. This can go away when all * callers hold rcu read lock. * * NOTE! A pending signal will hold on to the user refcount, * and we get/put the refcount only when the sigpending count * changes from/to zero. */ rcu_read_lock(); ucounts = task_ucounts(t); sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, override_rlimit); rcu_read_unlock(); if (!sigpending) return NULL; if (unlikely(!override_rlimit && sigpending > task_rlimit(t, RLIMIT_SIGPENDING))) { dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING); print_dropped_signal(sig); return NULL; } return ucounts; } static void __sigqueue_init(struct sigqueue *q, struct ucounts *ucounts, const unsigned int sigqueue_flags) { INIT_LIST_HEAD(&q->list); q->flags = sigqueue_flags; q->ucounts = ucounts; } /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an * appropriate lock must be held to stop the target task from exiting */ static struct sigqueue *sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, int override_rlimit) { struct ucounts *ucounts = sig_get_ucounts(t, sig, override_rlimit); struct sigqueue *q; if (!ucounts) return NULL; q = kmem_cache_alloc(sigqueue_cachep, gfp_flags); if (!q) { dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING); return NULL; } __sigqueue_init(q, ucounts, 0); return q; } static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) { posixtimer_sigqueue_putref(q); return; } if (q->ucounts) { dec_rlimit_put_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING); q->ucounts = NULL; } kmem_cache_free(sigqueue_cachep, q); } void flush_sigqueue(struct sigpending *queue) { struct sigqueue *q; sigemptyset(&queue->signal); while (!list_empty(&queue->list)) { q = list_entry(queue->list.next, struct sigqueue , list); list_del_init(&q->list); __sigqueue_free(q); } } /* * Flush all pending signals for this kthread. */ void flush_signals(struct task_struct *t) { unsigned long flags; spin_lock_irqsave(&t->sighand->siglock, flags); clear_tsk_thread_flag(t, TIF_SIGPENDING); flush_sigqueue(&t->pending); flush_sigqueue(&t->signal->shared_pending); spin_unlock_irqrestore(&t->sighand->siglock, flags); } EXPORT_SYMBOL(flush_signals); void ignore_signals(struct task_struct *t) { int i; for (i = 0; i < _NSIG; ++i) t->sighand->action[i].sa.sa_handler = SIG_IGN; flush_signals(t); } /* * Flush all handlers for a task. 
*/ void flush_signal_handlers(struct task_struct *t, int force_default) { int i; struct k_sigaction *ka = &t->sighand->action[0]; for (i = _NSIG ; i != 0 ; i--) { if (force_default || ka->sa.sa_handler != SIG_IGN) ka->sa.sa_handler = SIG_DFL; ka->sa.sa_flags = 0; #ifdef __ARCH_HAS_SA_RESTORER ka->sa.sa_restorer = NULL; #endif sigemptyset(&ka->sa.sa_mask); ka++; } } bool unhandled_signal(struct task_struct *tsk, int sig) { void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler; if (is_global_init(tsk)) return true; if (handler != SIG_IGN && handler != SIG_DFL) return false; /* If dying, we handle all new signals by ignoring them */ if (fatal_signal_pending(tsk)) return false; /* if ptraced, let the tracer determine */ return !tsk->ptrace; } static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *info, struct sigqueue **timer_sigq) { struct sigqueue *q, *first = NULL; /* * Collect the siginfo appropriate to this signal. Check if * there is another siginfo for the same signal. */ list_for_each_entry(q, &list->list, list) { if (q->info.si_signo == sig) { if (first) goto still_pending; first = q; } } sigdelset(&list->signal, sig); if (first) { still_pending: list_del_init(&first->list); copy_siginfo(info, &first->info); /* * posix-timer signals are preallocated and freed when the last * reference count is dropped in posixtimer_deliver_signal() or * immediately on timer deletion when the signal is not pending. * Spare the extra round through __sigqueue_free() which is * ignoring preallocated signals. */ if (unlikely((first->flags & SIGQUEUE_PREALLOC) && (info->si_code == SI_TIMER))) *timer_sigq = first; else __sigqueue_free(first); } else { /* * Ok, it wasn't in the queue. This must be * a fast-pathed signal or we must have been * out of queue space. So zero out the info. */ clear_siginfo(info); info->si_signo = sig; info->si_errno = 0; info->si_code = SI_USER; info->si_pid = 0; info->si_uid = 0; } } static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, kernel_siginfo_t *info, struct sigqueue **timer_sigq) { int sig = next_signal(pending, mask); if (sig) collect_signal(sig, pending, info, timer_sigq); return sig; } /* * Try to dequeue a signal. If a deliverable signal is found fill in the * caller provided siginfo and return the signal number. Otherwise return * 0. */ int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type) { struct task_struct *tsk = current; struct sigqueue *timer_sigq; int signr; lockdep_assert_held(&tsk->sighand->siglock); again: *type = PIDTYPE_PID; timer_sigq = NULL; signr = __dequeue_signal(&tsk->pending, mask, info, &timer_sigq); if (!signr) { *type = PIDTYPE_TGID; signr = __dequeue_signal(&tsk->signal->shared_pending, mask, info, &timer_sigq); if (unlikely(signr == SIGALRM)) posixtimer_rearm_itimer(tsk); } recalc_sigpending(); if (!signr) return 0; if (unlikely(sig_kernel_stop(signr))) { /* * Set a marker that we have dequeued a stop signal. Our * caller might release the siglock and then the pending * stop signal it is about to process is no longer in the * pending bitmasks, but must still be cleared by a SIGCONT * (and overruled by a SIGKILL). So those cases clear this * shared flag after we've set it. Note that this flag may * remain set after the signal we return is ignored or * handled. That doesn't matter because its only purpose * is to alert stop-signal processing code when another * processor has come along and cleared the flag. 
*/ current->jobctl |= JOBCTL_STOP_DEQUEUED; } if (IS_ENABLED(CONFIG_POSIX_TIMERS) && unlikely(timer_sigq)) { if (!posixtimer_deliver_signal(info, timer_sigq)) goto again; } return signr; } EXPORT_SYMBOL_GPL(dequeue_signal); static int dequeue_synchronous_signal(kernel_siginfo_t *info) { struct task_struct *tsk = current; struct sigpending *pending = &tsk->pending; struct sigqueue *q, *sync = NULL; /* * Might a synchronous signal be in the queue? */ if (!((pending->signal.sig[0] & ~tsk->blocked.sig[0]) & SYNCHRONOUS_MASK)) return 0; /* * Return the first synchronous signal in the queue. */ list_for_each_entry(q, &pending->list, list) { /* Synchronous signals have a positive si_code */ if ((q->info.si_code > SI_USER) && (sigmask(q->info.si_signo) & SYNCHRONOUS_MASK)) { sync = q; goto next; } } return 0; next: /* * Check if there is another siginfo for the same signal. */ list_for_each_entry_continue(q, &pending->list, list) { if (q->info.si_signo == sync->info.si_signo) goto still_pending; } sigdelset(&pending->signal, sync->info.si_signo); recalc_sigpending(); still_pending: list_del_init(&sync->list); copy_siginfo(info, &sync->info); __sigqueue_free(sync); return info->si_signo; } /* * Tell a process that it has a new active signal.. * * NOTE! we rely on the previous spin_lock to * lock interrupts for us! We can only be called with * "siglock" held, and the local interrupt must * have been disabled when that got acquired! * * No need to set need_resched since signal event passing * goes through ->blocked */ void signal_wake_up_state(struct task_struct *t, unsigned int state) { lockdep_assert_held(&t->sighand->siglock); set_tsk_thread_flag(t, TIF_SIGPENDING); /* * TASK_WAKEKILL also means wake it up in the stopped/traced/killable * case. We don't check t->state here because there is a race with it * executing another processor and just now entering stopped state. * By using wake_up_state, we ensure the process will wake up and * handle its death signal. */ if (!wake_up_state(t, state | TASK_INTERRUPTIBLE)) kick_process(t); } static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q); static void sigqueue_free_ignored(struct task_struct *tsk, struct sigqueue *q) { if (likely(!(q->flags & SIGQUEUE_PREALLOC) || q->info.si_code != SI_TIMER)) __sigqueue_free(q); else posixtimer_sig_ignore(tsk, q); } /* Remove signals in mask from the pending set and queue. 
*/ static void flush_sigqueue_mask(struct task_struct *p, sigset_t *mask, struct sigpending *s) { struct sigqueue *q, *n; sigset_t m; lockdep_assert_held(&p->sighand->siglock); sigandsets(&m, mask, &s->signal); if (sigisemptyset(&m)) return; sigandnsets(&s->signal, &s->signal, mask); list_for_each_entry_safe(q, n, &s->list, list) { if (sigismember(mask, q->info.si_signo)) { list_del_init(&q->list); sigqueue_free_ignored(p, q); } } } static inline int is_si_special(const struct kernel_siginfo *info) { return info <= SEND_SIG_PRIV; } static inline bool si_fromuser(const struct kernel_siginfo *info) { return info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info)); } /* * called with RCU read lock from check_kill_permission() */ static bool kill_ok_by_cred(struct task_struct *t) { const struct cred *cred = current_cred(); const struct cred *tcred = __task_cred(t); return uid_eq(cred->euid, tcred->suid) || uid_eq(cred->euid, tcred->uid) || uid_eq(cred->uid, tcred->suid) || uid_eq(cred->uid, tcred->uid) || ns_capable(tcred->user_ns, CAP_KILL); } /* * Bad permissions for sending the signal * - the caller must hold the RCU read lock */ static int check_kill_permission(int sig, struct kernel_siginfo *info, struct task_struct *t) { struct pid *sid; int error; if (!valid_signal(sig)) return -EINVAL; if (!si_fromuser(info)) return 0; error = audit_signal_info(sig, t); /* Let audit system see the signal */ if (error) return error; if (!same_thread_group(current, t) && !kill_ok_by_cred(t)) { switch (sig) { case SIGCONT: sid = task_session(t); /* * We don't return the error if sid == NULL. The * task was unhashed, the caller must notice this. */ if (!sid || sid == task_session(current)) break; fallthrough; default: return -EPERM; } } return security_task_kill(t, info, sig, NULL); } /** * ptrace_trap_notify - schedule trap to notify ptracer * @t: tracee wanting to notify tracer * * This function schedules sticky ptrace trap which is cleared on the next * TRAP_STOP to notify ptracer of an event. @t must have been seized by * ptracer. * * If @t is running, STOP trap will be taken. If trapped for STOP and * ptracer is listening for events, tracee is woken up so that it can * re-trap for the new event. If trapped otherwise, STOP trap will be * eventually taken without returning to userland after the existing traps * are finished by PTRACE_CONT. * * CONTEXT: * Must be called with @task->sighand->siglock held. */ static void ptrace_trap_notify(struct task_struct *t) { WARN_ON_ONCE(!(t->ptrace & PT_SEIZED)); lockdep_assert_held(&t->sighand->siglock); task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY); ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); } /* * Handle magic process-wide effects of stop/continue signals. Unlike * the signal actions, these happen immediately at signal-generation * time regardless of blocking, ignoring, or handling. This does the * actual continuing for SIGCONT, but not the actual stopping for stop * signals. The process stop is done as a signal action for SIG_DFL. * * Returns true if the signal should be actually delivered, otherwise * it should be dropped. */ static bool prepare_signal(int sig, struct task_struct *p, bool force) { struct signal_struct *signal = p->signal; struct task_struct *t; sigset_t flush; if (signal->flags & SIGNAL_GROUP_EXIT) { if (signal->core_state) return sig == SIGKILL; /* * The process is in the middle of dying, drop the signal. */ return false; } else if (sig_kernel_stop(sig)) { /* * This is a stop signal. Remove SIGCONT from all queues. 
*/ siginitset(&flush, sigmask(SIGCONT)); flush_sigqueue_mask(p, &flush, &signal->shared_pending); for_each_thread(p, t) flush_sigqueue_mask(p, &flush, &t->pending); } else if (sig == SIGCONT) { unsigned int why; /* * Remove all stop signals from all queues, wake all threads. */ siginitset(&flush, SIG_KERNEL_STOP_MASK); flush_sigqueue_mask(p, &flush, &signal->shared_pending); for_each_thread(p, t) { flush_sigqueue_mask(p, &flush, &t->pending); task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); if (likely(!(t->ptrace & PT_SEIZED))) { t->jobctl &= ~JOBCTL_STOPPED; wake_up_state(t, __TASK_STOPPED); } else ptrace_trap_notify(t); } /* * Notify the parent with CLD_CONTINUED if we were stopped. * * If we were in the middle of a group stop, we pretend it * was already finished, and then continued. Since SIGCHLD * doesn't queue we report only CLD_STOPPED, as if the next * CLD_CONTINUED was dropped. */ why = 0; if (signal->flags & SIGNAL_STOP_STOPPED) why |= SIGNAL_CLD_CONTINUED; else if (signal->group_stop_count) why |= SIGNAL_CLD_STOPPED; if (why) { /* * The first thread which returns from do_signal_stop() * will take ->siglock, notice SIGNAL_CLD_MASK, and * notify its parent. See get_signal(). */ signal_set_stop_flags(signal, why | SIGNAL_STOP_CONTINUED); signal->group_stop_count = 0; signal->group_exit_code = 0; } } return !sig_ignored(p, sig, force); } /* * Test if P wants to take SIG. After we've checked all threads with this, * it's equivalent to finding no threads not blocking SIG. Any threads not * blocking SIG were ruled out because they are not running and already * have pending signals. Such threads will dequeue from the shared queue * as soon as they're available, so putting the signal on the shared queue * will be equivalent to sending it to one such thread. */ static inline bool wants_signal(int sig, struct task_struct *p) { if (sigismember(&p->blocked, sig)) return false; if (p->flags & PF_EXITING) return false; if (sig == SIGKILL) return true; if (task_is_stopped_or_traced(p)) return false; return task_curr(p) || !task_sigpending(p); } static void complete_signal(int sig, struct task_struct *p, enum pid_type type) { struct signal_struct *signal = p->signal; struct task_struct *t; /* * Now find a thread we can wake up to take the signal off the queue. * * Try the suggested task first (may or may not be the main thread). */ if (wants_signal(sig, p)) t = p; else if ((type == PIDTYPE_PID) || thread_group_empty(p)) /* * There is just one thread and it does not need to be woken. * It will dequeue unblocked signals before it runs again. */ return; else { /* * Otherwise try to find a suitable thread. */ t = signal->curr_target; while (!wants_signal(sig, t)) { t = next_thread(t); if (t == signal->curr_target) /* * No thread needs to be woken. * Any eligible threads will see * the signal in the queue soon. */ return; } signal->curr_target = t; } /* * Found a killable thread. If the signal will be fatal, * then start taking the whole group down immediately. */ if (sig_fatal(p, sig) && !sigismember(&t->real_blocked, sig) && (sig == SIGKILL || !p->ptrace)) { /* * This signal will be fatal to the whole group. */ if (!sig_kernel_coredump(sig)) { /* * Start a group exit and wake everybody up. * This way we don't have other threads * running and doing things after a slower * thread has the fatal signal pending. 
*/ signal->flags = SIGNAL_GROUP_EXIT; signal->group_exit_code = sig; signal->group_stop_count = 0; __for_each_thread(signal, t) { task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); } return; } } /* * The signal is already in the shared-pending queue. * Tell the chosen thread to wake up and dequeue it. */ signal_wake_up(t, sig == SIGKILL); return; } static inline bool legacy_queue(struct sigpending *signals, int sig) { return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); } static int __send_signal_locked(int sig, struct kernel_siginfo *info, struct task_struct *t, enum pid_type type, bool force) { struct sigpending *pending; struct sigqueue *q; int override_rlimit; int ret = 0, result; lockdep_assert_held(&t->sighand->siglock); result = TRACE_SIGNAL_IGNORED; if (!prepare_signal(sig, t, force)) goto ret; pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; /* * Short-circuit ignored signals and support queuing * exactly one non-rt signal, so that we can get more * detailed information about the cause of the signal. */ result = TRACE_SIGNAL_ALREADY_PENDING; if (legacy_queue(pending, sig)) goto ret; result = TRACE_SIGNAL_DELIVERED; /* * Skip useless siginfo allocation for SIGKILL and kernel threads. */ if ((sig == SIGKILL) || (t->flags & PF_KTHREAD)) goto out_set; /* * Real-time signals must be queued if sent by sigqueue, or * some other real-time mechanism. It is implementation * defined whether kill() does so. We attempt to do so, on * the principle of least surprise, but since kill is not * allowed to fail with EAGAIN when low on memory we just * make sure at least one signal gets delivered and don't * pass on the info struct. */ if (sig < SIGRTMIN) override_rlimit = (is_si_special(info) || info->si_code >= 0); else override_rlimit = 0; q = sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit); if (q) { list_add_tail(&q->list, &pending->list); switch ((unsigned long) info) { case (unsigned long) SEND_SIG_NOINFO: clear_siginfo(&q->info); q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_USER; q->info.si_pid = task_tgid_nr_ns(current, task_active_pid_ns(t)); rcu_read_lock(); q->info.si_uid = from_kuid_munged(task_cred_xxx(t, user_ns), current_uid()); rcu_read_unlock(); break; case (unsigned long) SEND_SIG_PRIV: clear_siginfo(&q->info); q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_KERNEL; q->info.si_pid = 0; q->info.si_uid = 0; break; default: copy_siginfo(&q->info, info); break; } } else if (!is_si_special(info) && sig >= SIGRTMIN && info->si_code != SI_USER) { /* * Queue overflow, abort. We may abort if the * signal was rt and sent by user using something * other than kill(). */ result = TRACE_SIGNAL_OVERFLOW_FAIL; ret = -EAGAIN; goto ret; } else { /* * This is a silent loss of information. We still * send the signal, but the *info bits are lost. 
*/ result = TRACE_SIGNAL_LOSE_INFO; } out_set: signalfd_notify(t, sig); sigaddset(&pending->signal, sig); /* Let multiprocess signals appear after on-going forks */ if (type > PIDTYPE_TGID) { struct multiprocess_signals *delayed; hlist_for_each_entry(delayed, &t->signal->multiprocess, node) { sigset_t *signal = &delayed->signal; /* Can't queue both a stop and a continue signal */ if (sig == SIGCONT) sigdelsetmask(signal, SIG_KERNEL_STOP_MASK); else if (sig_kernel_stop(sig)) sigdelset(signal, SIGCONT); sigaddset(signal, sig); } } complete_signal(sig, t, type); ret: trace_signal_generate(sig, info, t, type != PIDTYPE_PID, result); return ret; } static inline bool has_si_pid_and_uid(struct kernel_siginfo *info) { bool ret = false; switch (siginfo_layout(info->si_signo, info->si_code)) { case SIL_KILL: case SIL_CHLD: case SIL_RT: ret = true; break; case SIL_TIMER: case SIL_POLL: case SIL_FAULT: case SIL_FAULT_TRAPNO: case SIL_FAULT_MCEERR: case SIL_FAULT_BNDERR: case SIL_FAULT_PKUERR: case SIL_FAULT_PERF_EVENT: case SIL_SYS: ret = false; break; } return ret; } int send_signal_locked(int sig, struct kernel_siginfo *info, struct task_struct *t, enum pid_type type) { /* Should SIGKILL or SIGSTOP be received by a pid namespace init? */ bool force = false; if (info == SEND_SIG_NOINFO) { /* Force if sent from an ancestor pid namespace */ force = !task_pid_nr_ns(current, task_active_pid_ns(t)); } else if (info == SEND_SIG_PRIV) { /* Don't ignore kernel generated signals */ force = true; } else if (has_si_pid_and_uid(info)) { /* SIGKILL and SIGSTOP is special or has ids */ struct user_namespace *t_user_ns; rcu_read_lock(); t_user_ns = task_cred_xxx(t, user_ns); if (current_user_ns() != t_user_ns) { kuid_t uid = make_kuid(current_user_ns(), info->si_uid); info->si_uid = from_kuid_munged(t_user_ns, uid); } rcu_read_unlock(); /* A kernel generated signal? */ force = (info->si_code == SI_KERNEL); /* From an ancestor pid namespace? */ if (!task_pid_nr_ns(current, task_active_pid_ns(t))) { info->si_pid = 0; force = true; } } return __send_signal_locked(sig, info, t, type, force); } static void print_fatal_signal(int signr) { struct pt_regs *regs = task_pt_regs(current); struct file *exe_file; exe_file = get_task_exe_file(current); if (exe_file) { pr_info("%pD: %s: potentially unexpected fatal signal %d.\n", exe_file, current->comm, signr); fput(exe_file); } else { pr_info("%s: potentially unexpected fatal signal %d.\n", current->comm, signr); } #if defined(__i386__) && !defined(__arch_um__) pr_info("code at %08lx: ", regs->ip); { int i; for (i = 0; i < 16; i++) { unsigned char insn; if (get_user(insn, (unsigned char *)(regs->ip + i))) break; pr_cont("%02x ", insn); } } pr_cont("\n"); #endif preempt_disable(); show_regs(regs); preempt_enable(); } static int __init setup_print_fatal_signals(char *str) { get_option (&str, &print_fatal_signals); return 1; } __setup("print-fatal-signals=", setup_print_fatal_signals); int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type) { unsigned long flags; int ret = -ESRCH; if (lock_task_sighand(p, &flags)) { ret = send_signal_locked(sig, info, p, type); unlock_task_sighand(p, &flags); } return ret; } enum sig_handler { HANDLER_CURRENT, /* If reachable use the current handler */ HANDLER_SIG_DFL, /* Always use SIG_DFL handler semantics */ HANDLER_EXIT, /* Only visible as the process exit code */ }; /* * Force a signal that the process can't ignore: if necessary * we unblock the signal and change any SIG_IGN to SIG_DFL. 
* * Note: If we unblock the signal, we always reset it to SIG_DFL, * since we do not want to have a signal handler that was blocked * be invoked when user space had explicitly blocked it. * * We don't want to have recursive SIGSEGV's etc, for example, * that is why we also clear SIGNAL_UNKILLABLE. */ static int force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, enum sig_handler handler) { unsigned long int flags; int ret, blocked, ignored; struct k_sigaction *action; int sig = info->si_signo; spin_lock_irqsave(&t->sighand->siglock, flags); action = &t->sighand->action[sig-1]; ignored = action->sa.sa_handler == SIG_IGN; blocked = sigismember(&t->blocked, sig); if (blocked || ignored || (handler != HANDLER_CURRENT)) { action->sa.sa_handler = SIG_DFL; if (handler == HANDLER_EXIT) action->sa.sa_flags |= SA_IMMUTABLE; if (blocked) sigdelset(&t->blocked, sig); } /* * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect * debugging to leave init killable. But HANDLER_EXIT is always fatal. */ if (action->sa.sa_handler == SIG_DFL && (!t->ptrace || (handler == HANDLER_EXIT))) t->signal->flags &= ~SIGNAL_UNKILLABLE; ret = send_signal_locked(sig, info, t, PIDTYPE_PID); /* This can happen if the signal was already pending and blocked */ if (!task_sigpending(t)) signal_wake_up(t, 0); spin_unlock_irqrestore(&t->sighand->siglock, flags); return ret; } int force_sig_info(struct kernel_siginfo *info) { return force_sig_info_to_task(info, current, HANDLER_CURRENT); } /* * Nuke all other threads in the group. */ int zap_other_threads(struct task_struct *p) { struct task_struct *t; int count = 0; p->signal->group_stop_count = 0; for_other_threads(p, t) { task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); count++; /* Don't bother with already dead threads */ if (t->exit_state) continue; sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); } return count; } struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) { struct sighand_struct *sighand; rcu_read_lock(); for (;;) { sighand = rcu_dereference(tsk->sighand); if (unlikely(sighand == NULL)) break; /* * This sighand can be already freed and even reused, but * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which * initializes ->siglock: this slab can't go away, it has * the same object type, ->siglock can't be reinitialized. * * We need to ensure that tsk->sighand is still the same * after we take the lock, we can race with de_thread() or * __exit_signal(). In the latter case the next iteration * must see ->sighand == NULL. */ spin_lock_irqsave(&sighand->siglock, *flags); if (likely(sighand == rcu_access_pointer(tsk->sighand))) break; spin_unlock_irqrestore(&sighand->siglock, *flags); } rcu_read_unlock(); return sighand; } #ifdef CONFIG_LOCKDEP void lockdep_assert_task_sighand_held(struct task_struct *task) { struct sighand_struct *sighand; rcu_read_lock(); sighand = rcu_dereference(task->sighand); if (sighand) lockdep_assert_held(&sighand->siglock); else WARN_ON_ONCE(1); rcu_read_unlock(); } #endif /* * send signal info to all the members of a thread group or to the * individual thread if type == PIDTYPE_PID. 
*/ int group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type) { int ret; rcu_read_lock(); ret = check_kill_permission(sig, info, p); rcu_read_unlock(); if (!ret && sig) ret = do_send_sig_info(sig, info, p, type); return ret; } /* * __kill_pgrp_info() sends a signal to a process group: this is what the tty * control characters do (^C, ^Z etc) * - the caller must hold at least a readlock on tasklist_lock */ int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp) { struct task_struct *p = NULL; int ret = -ESRCH; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { int err = group_send_sig_info(sig, info, p, PIDTYPE_PGID); /* * If group_send_sig_info() succeeds at least once ret * becomes 0 and after that the code below has no effect. * Otherwise we return the last err or -ESRCH if this * process group is empty. */ if (ret) ret = err; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return ret; } static int kill_pid_info_type(int sig, struct kernel_siginfo *info, struct pid *pid, enum pid_type type) { int error = -ESRCH; struct task_struct *p; for (;;) { rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (p) error = group_send_sig_info(sig, info, p, type); rcu_read_unlock(); if (likely(!p || error != -ESRCH)) return error; /* * The task was unhashed in between, try again. If it * is dead, pid_task() will return NULL, if we race with * de_thread() it will find the new leader. */ } } int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid) { return kill_pid_info_type(sig, info, pid, PIDTYPE_TGID); } static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid) { int error; rcu_read_lock(); error = kill_pid_info(sig, info, find_vpid(pid)); rcu_read_unlock(); return error; } static inline bool kill_as_cred_perm(const struct cred *cred, struct task_struct *target) { const struct cred *pcred = __task_cred(target); return uid_eq(cred->euid, pcred->suid) || uid_eq(cred->euid, pcred->uid) || uid_eq(cred->uid, pcred->suid) || uid_eq(cred->uid, pcred->uid); } /* * The usb asyncio usage of siginfo is wrong. The glibc support * for asyncio which uses SI_ASYNCIO assumes the layout is SIL_RT. * AKA after the generic fields: * kernel_pid_t si_pid; * kernel_uid32_t si_uid; * sigval_t si_value; * * Unfortunately when usb generates SI_ASYNCIO it assumes the layout * after the generic fields is: * void __user *si_addr; * * This is a practical problem when there is a 64bit big endian kernel * and a 32bit userspace, as the 32bit address will be encoded in the low * 32bits of the pointer. Those low 32bits will be stored at a higher * address than they would appear in a 32bit pointer. So userspace will not * see the address it was expecting for its completions. * * There is nothing in the encoding that can allow * copy_siginfo_to_user32 to detect this confusion of formats, so * handle this by requiring the caller of kill_pid_usb_asyncio to * notice when this situation takes place and to store the 32bit * pointer in sival_int, instead of sival_addr of the sigval_t addr * parameter.
*/ int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, struct pid *pid, const struct cred *cred) { struct kernel_siginfo info; struct task_struct *p; unsigned long flags; int ret = -EINVAL; if (!valid_signal(sig)) return ret; clear_siginfo(&info); info.si_signo = sig; info.si_errno = errno; info.si_code = SI_ASYNCIO; *((sigval_t *)&info.si_pid) = addr; rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (!p) { ret = -ESRCH; goto out_unlock; } if (!kill_as_cred_perm(cred, p)) { ret = -EPERM; goto out_unlock; } ret = security_task_kill(p, &info, sig, cred); if (ret) goto out_unlock; if (sig) { if (lock_task_sighand(p, &flags)) { ret = __send_signal_locked(sig, &info, p, PIDTYPE_TGID, false); unlock_task_sighand(p, &flags); } else ret = -ESRCH; } out_unlock: rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(kill_pid_usb_asyncio); /* * kill_something_info() interprets pid in interesting ways just like kill(2). * * POSIX specifies that kill(-1,sig) is unspecified, but what we have * is probably wrong. Should make it like BSD or SYSV. */ static int kill_something_info(int sig, struct kernel_siginfo *info, pid_t pid) { int ret; if (pid > 0) return kill_proc_info(sig, info, pid); /* -INT_MIN is undefined. Exclude this case to avoid a UBSAN warning */ if (pid == INT_MIN) return -ESRCH; read_lock(&tasklist_lock); if (pid != -1) { ret = __kill_pgrp_info(sig, info, pid ? find_vpid(-pid) : task_pgrp(current)); } else { int retval = 0, count = 0; struct task_struct * p; for_each_process(p) { if (task_pid_vnr(p) > 1 && !same_thread_group(p, current)) { int err = group_send_sig_info(sig, info, p, PIDTYPE_MAX); ++count; if (err != -EPERM) retval = err; } } ret = count ? retval : -ESRCH; } read_unlock(&tasklist_lock); return ret; } /* * These are for backward compatibility with the rest of the kernel source. */ int send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p) { /* * Make sure legacy kernel users don't send in bad values * (normal paths check this in check_kill_permission). */ if (!valid_signal(sig)) return -EINVAL; return do_send_sig_info(sig, info, p, PIDTYPE_PID); } EXPORT_SYMBOL(send_sig_info); #define __si_special(priv) \ ((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO) int send_sig(int sig, struct task_struct *p, int priv) { return send_sig_info(sig, __si_special(priv), p); } EXPORT_SYMBOL(send_sig); void force_sig(int sig) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = SI_KERNEL; info.si_pid = 0; info.si_uid = 0; force_sig_info(&info); } EXPORT_SYMBOL(force_sig); void force_fatal_sig(int sig) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = SI_KERNEL; info.si_pid = 0; info.si_uid = 0; force_sig_info_to_task(&info, current, HANDLER_SIG_DFL); } void force_exit_sig(int sig) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = SI_KERNEL; info.si_pid = 0; info.si_uid = 0; force_sig_info_to_task(&info, current, HANDLER_EXIT); } /* * When things go south during signal handling, we * will force a SIGSEGV. And if the signal that caused * the problem was already a SIGSEGV, we'll want to * make sure we don't even try to deliver the signal.. 
*/ void force_sigsegv(int sig) { if (sig == SIGSEGV) force_fatal_sig(SIGSEGV); else force_sig(SIGSEGV); } int force_sig_fault_to_task(int sig, int code, void __user *addr, struct task_struct *t) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = code; info.si_addr = addr; return force_sig_info_to_task(&info, t, HANDLER_CURRENT); } int force_sig_fault(int sig, int code, void __user *addr) { return force_sig_fault_to_task(sig, code, addr, current); } int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = code; info.si_addr = addr; return send_sig_info(info.si_signo, &info, t); } int force_sig_mceerr(int code, void __user *addr, short lsb) { struct kernel_siginfo info; WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR)); clear_siginfo(&info); info.si_signo = SIGBUS; info.si_errno = 0; info.si_code = code; info.si_addr = addr; info.si_addr_lsb = lsb; return force_sig_info(&info); } int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) { struct kernel_siginfo info; WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR)); clear_siginfo(&info); info.si_signo = SIGBUS; info.si_errno = 0; info.si_code = code; info.si_addr = addr; info.si_addr_lsb = lsb; return send_sig_info(info.si_signo, &info, t); } EXPORT_SYMBOL(send_sig_mceerr); int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGSEGV; info.si_errno = 0; info.si_code = SEGV_BNDERR; info.si_addr = addr; info.si_lower = lower; info.si_upper = upper; return force_sig_info(&info); } #ifdef SEGV_PKUERR int force_sig_pkuerr(void __user *addr, u32 pkey) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGSEGV; info.si_errno = 0; info.si_code = SEGV_PKUERR; info.si_addr = addr; info.si_pkey = pkey; return force_sig_info(&info); } #endif int send_sig_perf(void __user *addr, u32 type, u64 sig_data) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGTRAP; info.si_errno = 0; info.si_code = TRAP_PERF; info.si_addr = addr; info.si_perf_data = sig_data; info.si_perf_type = type; /* * Signals generated by perf events should not terminate the whole * process if SIGTRAP is blocked, however, delivering the signal * asynchronously is better than not delivering at all. But tell user * space if the signal was asynchronous, so it can clearly be * distinguished from normal synchronous ones. */ info.si_perf_flags = sigismember(&current->blocked, info.si_signo) ? TRAP_PERF_FLAG_ASYNC : 0; return send_sig_info(info.si_signo, &info, current); } /** * force_sig_seccomp - signals the task to allow in-process syscall emulation * @syscall: syscall number to send to userland * @reason: filter-supplied reason code to send to userland (via si_errno) * @force_coredump: true to trigger a coredump * * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info. */ int force_sig_seccomp(int syscall, int reason, bool force_coredump) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGSYS; info.si_code = SYS_SECCOMP; info.si_call_addr = (void __user *)KSTK_EIP(current); info.si_errno = reason; info.si_arch = syscall_get_arch(current); info.si_syscall = syscall; return force_sig_info_to_task(&info, current, force_coredump ?
HANDLER_EXIT : HANDLER_CURRENT); } /* For the crazy architectures that include trap information in * the errno field, instead of an actual errno value. */ int force_sig_ptrace_errno_trap(int errno, void __user *addr) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGTRAP; info.si_errno = errno; info.si_code = TRAP_HWBKPT; info.si_addr = addr; return force_sig_info(&info); } /* For the rare architectures that include trap information using * si_trapno. */ int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = code; info.si_addr = addr; info.si_trapno = trapno; return force_sig_info(&info); } /* For the rare architectures that include trap information using * si_trapno. */ int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno, struct task_struct *t) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = code; info.si_addr = addr; info.si_trapno = trapno; return send_sig_info(info.si_signo, &info, t); } static int kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp) { int ret; read_lock(&tasklist_lock); ret = __kill_pgrp_info(sig, info, pgrp); read_unlock(&tasklist_lock); return ret; } int kill_pgrp(struct pid *pid, int sig, int priv) { return kill_pgrp_info(sig, __si_special(priv), pid); } EXPORT_SYMBOL(kill_pgrp); int kill_pid(struct pid *pid, int sig, int priv) { return kill_pid_info(sig, __si_special(priv), pid); } EXPORT_SYMBOL(kill_pid); #ifdef CONFIG_POSIX_TIMERS /* * These functions handle POSIX timer signals. POSIX timers use * preallocated sigqueue structs for sending signals. */ static void __flush_itimer_signals(struct sigpending *pending) { sigset_t signal, retain; struct sigqueue *q, *n; signal = pending->signal; sigemptyset(&retain); list_for_each_entry_safe(q, n, &pending->list, list) { int sig = q->info.si_signo; if (likely(q->info.si_code != SI_TIMER)) { sigaddset(&retain, sig); } else { sigdelset(&signal, sig); list_del_init(&q->list); __sigqueue_free(q); } } sigorsets(&pending->signal, &signal, &retain); } void flush_itimer_signals(void) { struct task_struct *tsk = current; guard(spinlock_irqsave)(&tsk->sighand->siglock); __flush_itimer_signals(&tsk->pending); __flush_itimer_signals(&tsk->signal->shared_pending); } bool posixtimer_init_sigqueue(struct sigqueue *q) { struct ucounts *ucounts = sig_get_ucounts(current, -1, 0); if (!ucounts) return false; clear_siginfo(&q->info); __sigqueue_init(q, ucounts, SIGQUEUE_PREALLOC); return true; } static void posixtimer_queue_sigqueue(struct sigqueue *q, struct task_struct *t, enum pid_type type) { struct sigpending *pending; int sig = q->info.si_signo; signalfd_notify(t, sig); pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; list_add_tail(&q->list, &pending->list); sigaddset(&pending->signal, sig); complete_signal(sig, t, type); } /* * This function is used by POSIX timers to deliver a timer signal. * Where type is PIDTYPE_PID (such as for timers with SIGEV_THREAD_ID * set), the signal must be delivered to the specific thread (queues * into t->pending). * * Where type is not PIDTYPE_PID, signals must be delivered to the * process. In this case, prefer to deliver to current if it is in * the same thread group as the target process and its sighand is * stable, which avoids unnecessarily waking up a potentially idle task. 
*/ static inline struct task_struct *posixtimer_get_target(struct k_itimer *tmr) { struct task_struct *t = pid_task(tmr->it_pid, tmr->it_pid_type); if (t && tmr->it_pid_type != PIDTYPE_PID && same_thread_group(t, current) && !current->exit_state) t = current; return t; } void posixtimer_send_sigqueue(struct k_itimer *tmr) { struct sigqueue *q = &tmr->sigq; int sig = q->info.si_signo; struct task_struct *t; unsigned long flags; int result; guard(rcu)(); t = posixtimer_get_target(tmr); if (!t) return; if (!likely(lock_task_sighand(t, &flags))) return; /* * Update @tmr::sigqueue_seq for posix timer signals with sighand * locked to prevent a race against dequeue_signal(). */ tmr->it_sigqueue_seq = tmr->it_signal_seq; /* * Set the signal delivery status under sighand lock, so that the * ignored signal handling can distinguish between a periodic and a * non-periodic timer. */ tmr->it_sig_periodic = tmr->it_status == POSIX_TIMER_REQUEUE_PENDING; if (!prepare_signal(sig, t, false)) { result = TRACE_SIGNAL_IGNORED; if (!list_empty(&q->list)) { /* * The signal was ignored and blocked. The timer * expiry queued it because blocked signals are * queued independent of the ignored state. * * The unblocking set SIGPENDING, but the signal * was not yet dequeued from the pending list. * So prepare_signal() sees unblocked and ignored, * which ends up here. Leave it queued like a * regular signal. * * The same happens when the task group is exiting * and the signal is already queued. * prepare_signal() treats SIGNAL_GROUP_EXIT as * ignored independent of its queued state. This * gets cleaned up in __exit_signal(). */ goto out; } /* Periodic timers with SIG_IGN are queued on the ignored list */ if (tmr->it_sig_periodic) { /* * Already queued means the timer was rearmed after * the previous expiry got it on the ignore list. * Nothing to do for that case. */ if (hlist_unhashed(&tmr->ignored_list)) { /* * Take a signal reference and queue it on * the ignored list. */ posixtimer_sigqueue_getref(q); posixtimer_sig_ignore(t, q); } } else if (!hlist_unhashed(&tmr->ignored_list)) { /* * Covers the case where a timer was periodic and * then the signal was ignored. Later it was rearmed * as oneshot timer. The previous signal is invalid * now, and this oneshot signal has to be dropped. * Remove it from the ignored list and drop the * reference count as the signal is not longer * queued. */ hlist_del_init(&tmr->ignored_list); posixtimer_putref(tmr); } goto out; } if (unlikely(!list_empty(&q->list))) { /* This holds a reference count already */ result = TRACE_SIGNAL_ALREADY_PENDING; goto out; } /* * If the signal is on the ignore list, it got blocked after it was * ignored earlier. But nothing lifted the ignore. Move it back to * the pending list to be consistent with the regular signal * handling. This already holds a reference count. * * If it's not on the ignore list acquire a reference count. */ if (likely(hlist_unhashed(&tmr->ignored_list))) posixtimer_sigqueue_getref(q); else hlist_del_init(&tmr->ignored_list); posixtimer_queue_sigqueue(q, t, tmr->it_pid_type); result = TRACE_SIGNAL_DELIVERED; out: trace_signal_generate(sig, &q->info, t, tmr->it_pid_type != PIDTYPE_PID, result); unlock_task_sighand(t, &flags); } static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q) { struct k_itimer *tmr = container_of(q, struct k_itimer, sigq); /* * If the timer is marked deleted already or the signal originates * from a non-periodic timer, then just drop the reference * count. 
Otherwise queue it on the ignored list. */ if (posixtimer_valid(tmr) && tmr->it_sig_periodic) hlist_add_head(&tmr->ignored_list, &tsk->signal->ignored_posix_timers); else posixtimer_putref(tmr); } static void posixtimer_sig_unignore(struct task_struct *tsk, int sig) { struct hlist_head *head = &tsk->signal->ignored_posix_timers; struct hlist_node *tmp; struct k_itimer *tmr; if (likely(hlist_empty(head))) return; /* * Rearming a timer with sighand lock held is not possible due to * lock ordering vs. tmr::it_lock. Just stick the sigqueue back and * let the signal delivery path deal with it whether it needs to be * rearmed or not. This cannot be decided here w/o dropping sighand * lock and creating a loop retry horror show. */ hlist_for_each_entry_safe(tmr, tmp , head, ignored_list) { struct task_struct *target; /* * tmr::sigq.info.si_signo is immutable, so accessing it * without holding tmr::it_lock is safe. */ if (tmr->sigq.info.si_signo != sig) continue; hlist_del_init(&tmr->ignored_list); /* This should never happen and leaks a reference count */ if (WARN_ON_ONCE(!list_empty(&tmr->sigq.list))) continue; /* * Get the target for the signal. If target is a thread and * has exited by now, drop the reference count. */ guard(rcu)(); target = posixtimer_get_target(tmr); if (target) posixtimer_queue_sigqueue(&tmr->sigq, target, tmr->it_pid_type); else posixtimer_putref(tmr); } } #else /* CONFIG_POSIX_TIMERS */ static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q) { } static inline void posixtimer_sig_unignore(struct task_struct *tsk, int sig) { } #endif /* !CONFIG_POSIX_TIMERS */ void do_notify_pidfd(struct task_struct *task) { struct pid *pid = task_pid(task); WARN_ON(task->exit_state == 0); __wake_up(&pid->wait_pidfd, TASK_NORMAL, 0, poll_to_key(EPOLLIN | EPOLLRDNORM)); } /* * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. * * Returns true if our parent ignored us and so we've switched to * self-reaping. */ bool do_notify_parent(struct task_struct *tsk, int sig) { struct kernel_siginfo info; unsigned long flags; struct sighand_struct *psig; bool autoreap = false; u64 utime, stime; if (WARN_ON_ONCE(!valid_signal(sig))) return false; /* do_notify_parent_cldstop should have been called instead. */ WARN_ON_ONCE(task_is_stopped_or_traced(tsk)); WARN_ON_ONCE(!tsk->ptrace && !thread_group_empty(tsk)); /* ptraced, or group-leader without sub-threads */ do_notify_pidfd(tsk); if (sig != SIGCHLD) { /* * This is only possible if parent == real_parent. * Check if it has changed security domain. */ if (tsk->parent_exec_id != READ_ONCE(tsk->parent->self_exec_id)) sig = SIGCHLD; } clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; /* * We are under tasklist_lock here so our parent is tied to * us and cannot change. * * task_active_pid_ns will always return the same pid namespace * until a task passes through release_task. 
* * write_lock() currently calls preempt_disable() which is the * same as rcu_read_lock(), but according to Oleg, this is not * correct to rely on this */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(tsk->parent)); info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns), task_uid(tsk)); rcu_read_unlock(); task_cputime(tsk, &utime, &stime); info.si_utime = nsec_to_clock_t(utime + tsk->signal->utime); info.si_stime = nsec_to_clock_t(stime + tsk->signal->stime); info.si_status = tsk->exit_code & 0x7f; if (tsk->exit_code & 0x80) info.si_code = CLD_DUMPED; else if (tsk->exit_code & 0x7f) info.si_code = CLD_KILLED; else { info.si_code = CLD_EXITED; info.si_status = tsk->exit_code >> 8; } psig = tsk->parent->sighand; spin_lock_irqsave(&psig->siglock, flags); if (!tsk->ptrace && sig == SIGCHLD && (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { /* * We are exiting and our parent doesn't care. POSIX.1 * defines special semantics for setting SIGCHLD to SIG_IGN * or setting the SA_NOCLDWAIT flag: we should be reaped * automatically and not left for our parent's wait4 call. * Rather than having the parent do it as a magic kind of * signal handler, we just set this to tell do_exit that we * can be cleaned up without becoming a zombie. Note that * we still call __wake_up_parent in this case, because a * blocked sys_wait4 might now return -ECHILD. * * Whether we send SIGCHLD or not for SA_NOCLDWAIT * is implementation-defined: we do (if you don't want * it, just use SIG_IGN instead). */ autoreap = true; if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) sig = 0; } if (!tsk->ptrace && tsk->signal->autoreap) { autoreap = true; sig = 0; } /* * Send with __send_signal as si_pid and si_uid are in the * parent's namespaces. */ if (sig) __send_signal_locked(sig, &info, tsk->parent, PIDTYPE_TGID, false); __wake_up_parent(tsk, tsk->parent); spin_unlock_irqrestore(&psig->siglock, flags); return autoreap; } /** * do_notify_parent_cldstop - notify parent of stopped/continued state change * @tsk: task reporting the state change * @for_ptracer: the notification is for ptracer * @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report * * Notify @tsk's parent that the stopped/continued state has changed. If * @for_ptracer is %false, @tsk's group leader notifies to its real parent. * If %true, @tsk reports to @tsk->parent which should be the ptracer. * * CONTEXT: * Must be called with tasklist_lock at least read locked. 
*/ static void do_notify_parent_cldstop(struct task_struct *tsk, bool for_ptracer, int why) { struct kernel_siginfo info; unsigned long flags; struct task_struct *parent; struct sighand_struct *sighand; u64 utime, stime; if (for_ptracer) { parent = tsk->parent; } else { tsk = tsk->group_leader; parent = tsk->real_parent; } clear_siginfo(&info); info.si_signo = SIGCHLD; info.si_errno = 0; /* * see comment in do_notify_parent() about the following 4 lines */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent)); info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); rcu_read_unlock(); task_cputime(tsk, &utime, &stime); info.si_utime = nsec_to_clock_t(utime); info.si_stime = nsec_to_clock_t(stime); info.si_code = why; switch (why) { case CLD_CONTINUED: info.si_status = SIGCONT; break; case CLD_STOPPED: info.si_status = tsk->signal->group_exit_code & 0x7f; break; case CLD_TRAPPED: info.si_status = tsk->exit_code & 0x7f; break; default: BUG(); } sighand = parent->sighand; spin_lock_irqsave(&sighand->siglock, flags); if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN && !(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP)) send_signal_locked(SIGCHLD, &info, parent, PIDTYPE_TGID); /* * Even if SIGCHLD is not generated, we must wake up wait4 calls. */ __wake_up_parent(tsk, parent); spin_unlock_irqrestore(&sighand->siglock, flags); } /* * This must be called with current->sighand->siglock held. * * This should be the path for all ptrace stops. * We always set current->last_siginfo while stopped here. * That makes it a way to test a stopped process for * being ptrace-stopped vs being job-control-stopped. * * Returns the signal the ptracer requested the code resume * with. If the code did not stop because the tracer is gone, * the stop signal remains unchanged unless clear_code. */ static int ptrace_stop(int exit_code, int why, unsigned long message, kernel_siginfo_t *info) __releases(&current->sighand->siglock) __acquires(&current->sighand->siglock) { bool gstop_done = false; if (arch_ptrace_stop_needed()) { /* * The arch code has something special to do before a * ptrace stop. This is allowed to block, e.g. for faults * on user stack pages. We can't keep the siglock while * calling arch_ptrace_stop, so we must release it now. * To preserve proper semantics, we must do this before * any signal bookkeeping like checking group_stop_count. */ spin_unlock_irq(&current->sighand->siglock); arch_ptrace_stop(); spin_lock_irq(&current->sighand->siglock); } /* * After this point ptrace_signal_wake_up or signal_wake_up * will clear TASK_TRACED if ptrace_unlink happens or a fatal * signal comes in. Handle previous ptrace_unlinks and fatal * signals here to prevent ptrace_stop sleeping in schedule. */ if (!current->ptrace || __fatal_signal_pending(current)) return exit_code; set_special_state(TASK_TRACED); current->jobctl |= JOBCTL_TRACED; /* * We're committing to trapping. TRACED should be visible before * TRAPPING is cleared; otherwise, the tracer might fail do_wait(). * Also, transition to TRACED and updates to ->jobctl should be * atomic with respect to siglock and should be done after the arch * hook as siglock is released and regrabbed across it.
* * TRACER TRACEE * * ptrace_attach() * [L] wait_on_bit(JOBCTL_TRAPPING) [S] set_special_state(TRACED) * do_wait() * set_current_state() smp_wmb(); * ptrace_do_wait() * wait_task_stopped() * task_stopped_code() * [L] task_is_traced() [S] task_clear_jobctl_trapping(); */ smp_wmb(); current->ptrace_message = message; current->last_siginfo = info; current->exit_code = exit_code; /* * If @why is CLD_STOPPED, we're trapping to participate in a group * stop. Do the bookkeeping. Note that if SIGCONT was delivered * across siglock relocks since INTERRUPT was scheduled, PENDING * could be clear now. We act as if SIGCONT is received after * TASK_TRACED is entered - ignore it. */ if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING)) gstop_done = task_participate_group_stop(current); /* any trap clears pending STOP trap, STOP trap clears NOTIFY */ task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP); if (info && info->si_code >> 8 == PTRACE_EVENT_STOP) task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY); /* entering a trap, clear TRAPPING */ task_clear_jobctl_trapping(current); spin_unlock_irq(&current->sighand->siglock); read_lock(&tasklist_lock); /* * Notify parents of the stop. * * While ptraced, there are two parents - the ptracer and * the real_parent of the group_leader. The ptracer should * know about every stop while the real parent is only * interested in the completion of group stop. The states * for the two don't interact with each other. Notify * separately unless they're gonna be duplicates. */ if (current->ptrace) do_notify_parent_cldstop(current, true, why); if (gstop_done && (!current->ptrace || ptrace_reparented(current))) do_notify_parent_cldstop(current, false, why); /* * The previous do_notify_parent_cldstop() invocation woke ptracer. * On a PREEMPTION kernel this can result in a preemption requirement * which will be fulfilled after read_unlock() and the ptracer will be * put on the CPU. * The ptracer is in wait_task_inactive(, __TASK_TRACED) waiting for * this task to wait in schedule(). If this task gets preempted then it * remains enqueued on the runqueue. The ptracer will observe this and * then sleep for a delay of one HZ tick. In the meantime this task * gets scheduled, enters schedule() and will wait for the ptracer. * * This preemption point is not bad from a correctness point of * view but extends the runtime by one HZ tick time due to the * ptracer's sleep. The preempt-disable section ensures that there * will be no preemption between unlock and schedule(), thus * improving the performance since the ptracer will observe that * the tracee is scheduled out once it gets on the CPU. * * On PREEMPT_RT locking tasklist_lock does not disable preemption. * Therefore the task can be preempted after do_notify_parent_cldstop() * before unlocking tasklist_lock so there is no benefit in doing this. * * In fact disabling preemption is harmful on PREEMPT_RT because * the spinlock_t in cgroup_enter_frozen() must not be acquired * with preemption disabled due to the 'sleeping' spinlock * substitution of RT. */ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_disable(); read_unlock(&tasklist_lock); cgroup_enter_frozen(); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable_no_resched(); schedule(); cgroup_leave_frozen(true); /* * We are back. Now reacquire the siglock before touching * last_siginfo, so that we are sure to have synchronized with * any signal-sending on another CPU that wants to examine it.
*/ spin_lock_irq(&current->sighand->siglock); exit_code = current->exit_code; current->last_siginfo = NULL; current->ptrace_message = 0; current->exit_code = 0; /* LISTENING can be set only during STOP traps, clear it */ current->jobctl &= ~(JOBCTL_LISTENING | JOBCTL_PTRACE_FROZEN); /* * Queued signals ignored us while we were stopped for tracing. * So check for any that we should take before resuming user mode. * This sets TIF_SIGPENDING, but never clears it. */ recalc_sigpending_tsk(current); return exit_code; } static int ptrace_do_notify(int signr, int exit_code, int why, unsigned long message) { kernel_siginfo_t info; clear_siginfo(&info); info.si_signo = signr; info.si_code = exit_code; info.si_pid = task_pid_vnr(current); info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); /* Let the debugger run. */ return ptrace_stop(exit_code, why, message, &info); } int ptrace_notify(int exit_code, unsigned long message) { int signr; BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); if (unlikely(task_work_pending(current))) task_work_run(); spin_lock_irq(&current->sighand->siglock); signr = ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED, message); spin_unlock_irq(&current->sighand->siglock); return signr; } /** * do_signal_stop - handle group stop for SIGSTOP and other stop signals * @signr: signr causing group stop if initiating * * If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr * and participate in it. If already set, participate in the existing * group stop. If participated in a group stop (and thus slept), %true is * returned with siglock released. * * If ptraced, this function doesn't handle stop itself. Instead, * %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock * untouched. The caller must ensure that INTERRUPT trap handling takes * place afterwards. * * CONTEXT: * Must be called with @current->sighand->siglock held, which is released * on %true return. * * RETURNS: * %false if group stop is already cancelled or ptrace trap is scheduled. * %true if participated in group stop. */ static bool do_signal_stop(int signr) __releases(&current->sighand->siglock) { struct signal_struct *sig = current->signal; if (!(current->jobctl & JOBCTL_STOP_PENDING)) { unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; struct task_struct *t; /* signr will be recorded in task->jobctl for retries */ WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK); if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) || unlikely(sig->flags & SIGNAL_GROUP_EXIT) || unlikely(sig->group_exec_task)) return false; /* * There is no group stop already in progress. We must * initiate one now. * * While ptraced, a task may be resumed while group stop is * still in effect and then receive a stop signal and * initiate another group stop. This deviates from the * usual behavior as two consecutive stop signals can't * cause two group stops when !ptraced. That is why we * also check !task_is_stopped(t) below. * * The condition can be distinguished by testing whether * SIGNAL_STOP_STOPPED is already set. Don't generate * group_exit_code in such case. * * This is not necessary for SIGNAL_STOP_CONTINUED because * an intervening stop signal is required to cause two * continued events regardless of ptrace.
*/ if (!(sig->flags & SIGNAL_STOP_STOPPED)) sig->group_exit_code = signr; sig->group_stop_count = 0; if (task_set_jobctl_pending(current, signr | gstop)) sig->group_stop_count++; for_other_threads(current, t) { /* * Setting state to TASK_STOPPED for a group * stop is always done with the siglock held, * so this check has no races. */ if (!task_is_stopped(t) && task_set_jobctl_pending(t, signr | gstop)) { sig->group_stop_count++; if (likely(!(t->ptrace & PT_SEIZED))) signal_wake_up(t, 0); else ptrace_trap_notify(t); } } } if (likely(!current->ptrace)) { int notify = 0; /* * If there are no other threads in the group, or if there * is a group stop in progress and we are the last to stop, * report to the parent. */ if (task_participate_group_stop(current)) notify = CLD_STOPPED; current->jobctl |= JOBCTL_STOPPED; set_special_state(TASK_STOPPED); spin_unlock_irq(&current->sighand->siglock); /* * Notify the parent of the group stop completion. Because * we're not holding either the siglock or tasklist_lock * here, ptracer may attach in between; however, this is for * group stop and should always be delivered to the real * parent of the group leader. The new ptracer will get * its notification when this task transitions into * TASK_TRACED. */ if (notify) { read_lock(&tasklist_lock); do_notify_parent_cldstop(current, false, notify); read_unlock(&tasklist_lock); } /* Now we don't run again until woken by SIGCONT or SIGKILL */ cgroup_enter_frozen(); schedule(); return true; } else { /* * While ptraced, group stop is handled by STOP trap. * Schedule it and let the caller deal with it. */ task_set_jobctl_pending(current, JOBCTL_TRAP_STOP); return false; } } /** * do_jobctl_trap - take care of ptrace jobctl traps * * When PT_SEIZED, it's used for both group stop and explicit * SEIZE/INTERRUPT traps. Both generate PTRACE_EVENT_STOP trap with * accompanying siginfo. If stopped, lower eight bits of exit_code contain * the stop signal; otherwise, %SIGTRAP. * * When !PT_SEIZED, it's used only for group stop trap with stop signal * number as exit_code and no siginfo. * * CONTEXT: * Must be called with @current->sighand->siglock held, which may be * released and re-acquired before returning with intervening sleep. */ static void do_jobctl_trap(void) { struct signal_struct *signal = current->signal; int signr = current->jobctl & JOBCTL_STOP_SIGMASK; if (current->ptrace & PT_SEIZED) { if (!signal->group_stop_count && !(signal->flags & SIGNAL_STOP_STOPPED)) signr = SIGTRAP; WARN_ON_ONCE(!signr); ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8), CLD_STOPPED, 0); } else { WARN_ON_ONCE(!signr); ptrace_stop(signr, CLD_STOPPED, 0, NULL); } } /** * do_freezer_trap - handle the freezer jobctl trap * * Puts the task into frozen state, if only the task is not about to quit. * In this case it drops JOBCTL_TRAP_FREEZE. * * CONTEXT: * Must be called with @current->sighand->siglock held, * which is always released before returning. */ static void do_freezer_trap(void) __releases(&current->sighand->siglock) { /* * If there are other trap bits pending except JOBCTL_TRAP_FREEZE, * let's make another loop to give it a chance to be handled. * In any case, we'll return back. */ if ((current->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) != JOBCTL_TRAP_FREEZE) { spin_unlock_irq(&current->sighand->siglock); return; } /* * Now we're sure that there is no pending fatal signal and no * pending traps. Clear TIF_SIGPENDING to not get out of schedule() * immediately (if there is a non-fatal signal pending), and * put the task into sleep.
*/ __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); clear_thread_flag(TIF_SIGPENDING); spin_unlock_irq(¤t->sighand->siglock); cgroup_enter_frozen(); schedule(); /* * We could've been woken by task_work, run it to clear * TIF_NOTIFY_SIGNAL. The caller will retry if necessary. */ clear_notify_signal(); if (unlikely(task_work_pending(current))) task_work_run(); } static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type) { /* * We do not check sig_kernel_stop(signr) but set this marker * unconditionally because we do not know whether debugger will * change signr. This flag has no meaning unless we are going * to stop after return from ptrace_stop(). In this case it will * be checked in do_signal_stop(), we should only stop if it was * not cleared by SIGCONT while we were sleeping. See also the * comment in dequeue_signal(). */ current->jobctl |= JOBCTL_STOP_DEQUEUED; signr = ptrace_stop(signr, CLD_TRAPPED, 0, info); /* We're back. Did the debugger cancel the sig? */ if (signr == 0) return signr; /* * Update the siginfo structure if the signal has * changed. If the debugger wanted something * specific in the siginfo structure then it should * have updated *info via PTRACE_SETSIGINFO. */ if (signr != info->si_signo) { clear_siginfo(info); info->si_signo = signr; info->si_errno = 0; info->si_code = SI_USER; rcu_read_lock(); info->si_pid = task_pid_vnr(current->parent); info->si_uid = from_kuid_munged(current_user_ns(), task_uid(current->parent)); rcu_read_unlock(); } /* If the (new) signal is now blocked, requeue it. */ if (sigismember(¤t->blocked, signr) || fatal_signal_pending(current)) { send_signal_locked(signr, info, current, type); signr = 0; } return signr; } static void hide_si_addr_tag_bits(struct ksignal *ksig) { switch (siginfo_layout(ksig->sig, ksig->info.si_code)) { case SIL_FAULT: case SIL_FAULT_TRAPNO: case SIL_FAULT_MCEERR: case SIL_FAULT_BNDERR: case SIL_FAULT_PKUERR: case SIL_FAULT_PERF_EVENT: ksig->info.si_addr = arch_untagged_si_addr( ksig->info.si_addr, ksig->sig, ksig->info.si_code); break; case SIL_KILL: case SIL_TIMER: case SIL_POLL: case SIL_CHLD: case SIL_RT: case SIL_SYS: break; } } bool get_signal(struct ksignal *ksig) { struct sighand_struct *sighand = current->sighand; struct signal_struct *signal = current->signal; int signr; clear_notify_signal(); if (unlikely(task_work_pending(current))) task_work_run(); if (!task_sigpending(current)) return false; if (unlikely(uprobe_deny_signal())) return false; /* * Do this once, we can't return to user-mode if freezing() == T. * do_signal_stop() and ptrace_stop() set TASK_STOPPED/TASK_TRACED * and the freezer handles those states via TASK_FROZEN, thus they * do not need another check after return. */ try_to_freeze(); relock: spin_lock_irq(&sighand->siglock); /* * Every stopped thread goes here after wakeup. Check to see if * we should notify the parent, prepare_signal(SIGCONT) encodes * the CLD_ si_code into SIGNAL_CLD_MASK bits. */ if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { int why; if (signal->flags & SIGNAL_CLD_CONTINUED) why = CLD_CONTINUED; else why = CLD_STOPPED; signal->flags &= ~SIGNAL_CLD_MASK; spin_unlock_irq(&sighand->siglock); /* * Notify the parent that we're continuing. This event is * always per-process and doesn't make whole lot of sense * for ptracers, who shouldn't consume the state via * wait(2) either, but, for backward compatibility, notify * the ptracer of the group leader too unless it's gonna be * a duplicate. 
*/ read_lock(&tasklist_lock); do_notify_parent_cldstop(current, false, why); if (ptrace_reparented(current->group_leader)) do_notify_parent_cldstop(current->group_leader, true, why); read_unlock(&tasklist_lock); goto relock; } for (;;) { struct k_sigaction *ka; enum pid_type type; /* Has this task already been marked for death? */ if ((signal->flags & SIGNAL_GROUP_EXIT) || signal->group_exec_task) { signr = SIGKILL; sigdelset(¤t->pending.signal, SIGKILL); trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, &sighand->action[SIGKILL-1]); recalc_sigpending(); /* * implies do_group_exit() or return to PF_USER_WORKER, * no need to initialize ksig->info/etc. */ goto fatal; } if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) && do_signal_stop(0)) goto relock; if (unlikely(current->jobctl & (JOBCTL_TRAP_MASK | JOBCTL_TRAP_FREEZE))) { if (current->jobctl & JOBCTL_TRAP_MASK) { do_jobctl_trap(); spin_unlock_irq(&sighand->siglock); } else if (current->jobctl & JOBCTL_TRAP_FREEZE) do_freezer_trap(); goto relock; } /* * If the task is leaving the frozen state, let's update * cgroup counters and reset the frozen bit. */ if (unlikely(cgroup_task_frozen(current))) { spin_unlock_irq(&sighand->siglock); cgroup_leave_frozen(false); goto relock; } /* * Signals generated by the execution of an instruction * need to be delivered before any other pending signals * so that the instruction pointer in the signal stack * frame points to the faulting instruction. */ type = PIDTYPE_PID; signr = dequeue_synchronous_signal(&ksig->info); if (!signr) signr = dequeue_signal(¤t->blocked, &ksig->info, &type); if (!signr) break; /* will return 0 */ if (unlikely(current->ptrace) && (signr != SIGKILL) && !(sighand->action[signr -1].sa.sa_flags & SA_IMMUTABLE)) { signr = ptrace_signal(signr, &ksig->info, type); if (!signr) continue; } ka = &sighand->action[signr-1]; /* Trace actually delivered signals. */ trace_signal_deliver(signr, &ksig->info, ka); if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ continue; if (ka->sa.sa_handler != SIG_DFL) { /* Run the handler. */ ksig->ka = *ka; if (ka->sa.sa_flags & SA_ONESHOT) ka->sa.sa_handler = SIG_DFL; break; /* will return non-zero "signr" value */ } /* * Now we are doing the default action for this signal. */ if (sig_kernel_ignore(signr)) /* Default is nothing. */ continue; /* * Global init gets no signals it doesn't want. * Container-init gets no signals it doesn't want from same * container. * * Note that if global/container-init sees a sig_kernel_only() * signal here, the signal must have been generated internally * or must have come from an ancestor namespace. In either * case, the signal cannot be dropped. */ if (unlikely(signal->flags & SIGNAL_UNKILLABLE) && !sig_kernel_only(signr)) continue; if (sig_kernel_stop(signr)) { /* * The default action is to stop all threads in * the thread group. The job control signals * do nothing in an orphaned pgrp, but SIGSTOP * always works. Note that siglock needs to be * dropped during the call to is_orphaned_pgrp() * because of lock ordering with tasklist_lock. * This allows an intervening SIGCONT to be posted. * We need to check for that and bail out if necessary. */ if (signr != SIGSTOP) { spin_unlock_irq(&sighand->siglock); /* signals can be posted during this window */ if (is_current_pgrp_orphaned()) goto relock; spin_lock_irq(&sighand->siglock); } if (likely(do_signal_stop(signr))) { /* It released the siglock. */ goto relock; } /* * We didn't actually stop, due to a race * with SIGCONT or something like that. 
*/ continue; } fatal: spin_unlock_irq(&sighand->siglock); if (unlikely(cgroup_task_frozen(current))) cgroup_leave_frozen(true); /* * Anything else is fatal, maybe with a core dump. */ current->flags |= PF_SIGNALED; if (sig_kernel_coredump(signr)) { if (print_fatal_signals) print_fatal_signal(signr); proc_coredump_connector(current); /* * If it was able to dump core, this kills all * other threads in the group and synchronizes with * their demise. If we lost the race with another * thread getting here, it set group_exit_code * first and our do_group_exit call below will use * that value and ignore the one we pass it. */ vfs_coredump(&ksig->info); } /* * PF_USER_WORKER threads will catch and exit on fatal signals * themselves. They have cleanup that must be performed, so we * cannot call do_exit() on their behalf. Note that ksig won't * be properly initialized, PF_USER_WORKER's shouldn't use it. */ if (current->flags & PF_USER_WORKER) goto out; /* * Death signals, no core dump. */ do_group_exit(signr); /* NOTREACHED */ } spin_unlock_irq(&sighand->siglock); ksig->sig = signr; if (signr && !(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS)) hide_si_addr_tag_bits(ksig); out: return signr > 0; } /** * signal_delivered - called after signal delivery to update blocked signals * @ksig: kernel signal struct * @stepping: nonzero if debugger single-step or block-step in use * * This function should be called when a signal has successfully been * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask * is always blocked), and the signal itself is blocked unless %SA_NODEFER * is set in @ksig->ka.sa.sa_flags. Tracing is notified. */ static void signal_delivered(struct ksignal *ksig, int stepping) { sigset_t blocked; /* A signal was successfully delivered, and the saved sigmask was stored on the signal frame, and will be restored by sigreturn. So we can simply clear the restore sigmask flag. */ clear_restore_sigmask(); sigorsets(&blocked, ¤t->blocked, &ksig->ka.sa.sa_mask); if (!(ksig->ka.sa.sa_flags & SA_NODEFER)) sigaddset(&blocked, ksig->sig); set_current_blocked(&blocked); if (current->sas_ss_flags & SS_AUTODISARM) sas_ss_reset(current); if (stepping) ptrace_notify(SIGTRAP, 0); } void signal_setup_done(int failed, struct ksignal *ksig, int stepping) { if (failed) force_sigsegv(ksig->sig); else signal_delivered(ksig, stepping); } /* * It could be that complete_signal() picked us to notify about the * group-wide signal. Other threads should be notified now to take * the shared signals in @which since we will not. */ static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which) { sigset_t retarget; struct task_struct *t; sigandsets(&retarget, &tsk->signal->shared_pending.signal, which); if (sigisemptyset(&retarget)) return; for_other_threads(tsk, t) { if (t->flags & PF_EXITING) continue; if (!has_pending_signals(&retarget, &t->blocked)) continue; /* Remove the signals this thread can handle. */ sigandsets(&retarget, &retarget, &t->blocked); if (!task_sigpending(t)) signal_wake_up(t, 0); if (sigisemptyset(&retarget)) break; } } void exit_signals(struct task_struct *tsk) { int group_stop = 0; sigset_t unblocked; /* * @tsk is about to have PF_EXITING set - lock out users which * expect stable threadgroup. 
*/ cgroup_threadgroup_change_begin(tsk); if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) { tsk->flags |= PF_EXITING; cgroup_threadgroup_change_end(tsk); return; } spin_lock_irq(&tsk->sighand->siglock); /* * From now this task is not visible for group-wide signals, * see wants_signal(), do_signal_stop(). */ tsk->flags |= PF_EXITING; cgroup_threadgroup_change_end(tsk); if (!task_sigpending(tsk)) goto out; unblocked = tsk->blocked; signotset(&unblocked); retarget_shared_pending(tsk, &unblocked); if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) && task_participate_group_stop(tsk)) group_stop = CLD_STOPPED; out: spin_unlock_irq(&tsk->sighand->siglock); /* * If group stop has completed, deliver the notification. This * should always go to the real parent of the group leader. */ if (unlikely(group_stop)) { read_lock(&tasklist_lock); do_notify_parent_cldstop(tsk, false, group_stop); read_unlock(&tasklist_lock); } } /* * System call entry points. */ /** * sys_restart_syscall - restart a system call */ SYSCALL_DEFINE0(restart_syscall) { struct restart_block *restart = ¤t->restart_block; return restart->fn(restart); } long do_no_restart_syscall(struct restart_block *param) { return -EINTR; } static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) { if (task_sigpending(tsk) && !thread_group_empty(tsk)) { sigset_t newblocked; /* A set of now blocked but previously unblocked signals. */ sigandnsets(&newblocked, newset, ¤t->blocked); retarget_shared_pending(tsk, &newblocked); } tsk->blocked = *newset; recalc_sigpending(); } /** * set_current_blocked - change current->blocked mask * @newset: new mask * * It is wrong to change ->blocked directly, this helper should be used * to ensure the process can't miss a shared signal we are going to block. */ void set_current_blocked(sigset_t *newset) { sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP)); __set_current_blocked(newset); } void __set_current_blocked(const sigset_t *newset) { struct task_struct *tsk = current; /* * In case the signal mask hasn't changed, there is nothing we need * to do. The current->blocked shouldn't be modified by other task. */ if (sigequalsets(&tsk->blocked, newset)) return; spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, newset); spin_unlock_irq(&tsk->sighand->siglock); } /* * This is also useful for kernel threads that want to temporarily * (or permanently) block certain signals. * * NOTE! Unlike the user-mode sys_sigprocmask(), the kernel * interface happily blocks "unblockable" signals like SIGKILL * and friends. */ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) { struct task_struct *tsk = current; sigset_t newset; /* Lockless, only current can change ->blocked, never from irq */ if (oldset) *oldset = tsk->blocked; switch (how) { case SIG_BLOCK: sigorsets(&newset, &tsk->blocked, set); break; case SIG_UNBLOCK: sigandnsets(&newset, &tsk->blocked, set); break; case SIG_SETMASK: newset = *set; break; default: return -EINVAL; } __set_current_blocked(&newset); return 0; } EXPORT_SYMBOL(sigprocmask); /* * The api helps set app-provided sigmasks. * * This is useful for syscalls such as ppoll, pselect, io_pgetevents and * epoll_pwait where a new sigmask is passed from userland for the syscalls. * * Note that it does set_restore_sigmask() in advance, so it must be always * paired with restore_saved_sigmask_unless() before return from syscall. 
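 *
 * A minimal sketch of the expected calling pattern (wait_for_events() is a
 * hypothetical stand-in for the ppoll/pselect-style body of the syscall):
 *
 *	ret = set_user_sigmask(umask, sigsetsize);
 *	if (ret)
 *		return ret;
 *	ret = wait_for_events(...);
 *	restore_saved_sigmask_unless(ret == -EINTR);
 *	return ret;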
 */
int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize)
{
	sigset_t kmask;

	if (!umask)
		return 0;
	if (sigsetsize != sizeof(sigset_t))
		return -EINVAL;
	if (copy_from_user(&kmask, umask, sizeof(sigset_t)))
		return -EFAULT;

	set_restore_sigmask();
	current->saved_sigmask = current->blocked;
	set_current_blocked(&kmask);

	return 0;
}

#ifdef CONFIG_COMPAT
int set_compat_user_sigmask(const compat_sigset_t __user *umask,
			    size_t sigsetsize)
{
	sigset_t kmask;

	if (!umask)
		return 0;
	if (sigsetsize != sizeof(compat_sigset_t))
		return -EINVAL;
	if (get_compat_sigset(&kmask, umask))
		return -EFAULT;

	set_restore_sigmask();
	current->saved_sigmask = current->blocked;
	set_current_blocked(&kmask);

	return 0;
}
#endif

/**
 *  sys_rt_sigprocmask - change the list of currently blocked signals
 *  @how: whether to add, remove, or set signals
 *  @nset: signals to add or remove (if non-null)
 *  @oset: previous value of signal mask if non-null
 *  @sigsetsize: size of sigset_t type
 */
SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
		sigset_t __user *, oset, size_t, sigsetsize)
{
	sigset_t old_set, new_set;
	int error;

	/* XXX: Don't preclude handling different sized sigset_t's.  */
	if (sigsetsize != sizeof(sigset_t))
		return -EINVAL;

	old_set = current->blocked;

	if (nset) {
		if (copy_from_user(&new_set, nset, sizeof(sigset_t)))
			return -EFAULT;
		sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));

		error = sigprocmask(how, &new_set, NULL);
		if (error)
			return error;
	}

	if (oset) {
		if (copy_to_user(oset, &old_set, sizeof(sigset_t)))
			return -EFAULT;
	}

	return 0;
}

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
		compat_sigset_t __user *, oset, compat_size_t, sigsetsize)
{
	sigset_t old_set = current->blocked;

	/* XXX: Don't preclude handling different sized sigset_t's.  */
	if (sigsetsize != sizeof(sigset_t))
		return -EINVAL;

	if (nset) {
		sigset_t new_set;
		int error;

		if (get_compat_sigset(&new_set, nset))
			return -EFAULT;
		sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));

		error = sigprocmask(how, &new_set, NULL);
		if (error)
			return error;
	}
	return oset ? put_compat_sigset(oset, &old_set, sizeof(*oset)) : 0;
}
#endif

static void do_sigpending(sigset_t *set)
{
	spin_lock_irq(&current->sighand->siglock);
	sigorsets(set, &current->pending.signal,
		  &current->signal->shared_pending.signal);
	spin_unlock_irq(&current->sighand->siglock);

	/* Outside the lock because only this thread touches it.
*/ sigandsets(set, ¤t->blocked, set); } /** * sys_rt_sigpending - examine a pending signal that has been raised * while blocked * @uset: stores pending signals * @sigsetsize: size of sigset_t type or larger */ SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) { sigset_t set; if (sigsetsize > sizeof(*uset)) return -EINVAL; do_sigpending(&set); if (copy_to_user(uset, &set, sigsetsize)) return -EFAULT; return 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, compat_size_t, sigsetsize) { sigset_t set; if (sigsetsize > sizeof(*uset)) return -EINVAL; do_sigpending(&set); return put_compat_sigset(uset, &set, sigsetsize); } #endif static const struct { unsigned char limit, layout; } sig_sicodes[] = { [SIGILL] = { NSIGILL, SIL_FAULT }, [SIGFPE] = { NSIGFPE, SIL_FAULT }, [SIGSEGV] = { NSIGSEGV, SIL_FAULT }, [SIGBUS] = { NSIGBUS, SIL_FAULT }, [SIGTRAP] = { NSIGTRAP, SIL_FAULT }, #if defined(SIGEMT) [SIGEMT] = { NSIGEMT, SIL_FAULT }, #endif [SIGCHLD] = { NSIGCHLD, SIL_CHLD }, [SIGPOLL] = { NSIGPOLL, SIL_POLL }, [SIGSYS] = { NSIGSYS, SIL_SYS }, }; static bool known_siginfo_layout(unsigned sig, int si_code) { if (si_code == SI_KERNEL) return true; else if ((si_code > SI_USER)) { if (sig_specific_sicodes(sig)) { if (si_code <= sig_sicodes[sig].limit) return true; } else if (si_code <= NSIGPOLL) return true; } else if (si_code >= SI_DETHREAD) return true; else if (si_code == SI_ASYNCNL) return true; return false; } enum siginfo_layout siginfo_layout(unsigned sig, int si_code) { enum siginfo_layout layout = SIL_KILL; if ((si_code > SI_USER) && (si_code < SI_KERNEL)) { if ((sig < ARRAY_SIZE(sig_sicodes)) && (si_code <= sig_sicodes[sig].limit)) { layout = sig_sicodes[sig].layout; /* Handle the exceptions */ if ((sig == SIGBUS) && (si_code >= BUS_MCEERR_AR) && (si_code <= BUS_MCEERR_AO)) layout = SIL_FAULT_MCEERR; else if ((sig == SIGSEGV) && (si_code == SEGV_BNDERR)) layout = SIL_FAULT_BNDERR; #ifdef SEGV_PKUERR else if ((sig == SIGSEGV) && (si_code == SEGV_PKUERR)) layout = SIL_FAULT_PKUERR; #endif else if ((sig == SIGTRAP) && (si_code == TRAP_PERF)) layout = SIL_FAULT_PERF_EVENT; else if (IS_ENABLED(CONFIG_SPARC) && (sig == SIGILL) && (si_code == ILL_ILLTRP)) layout = SIL_FAULT_TRAPNO; else if (IS_ENABLED(CONFIG_ALPHA) && ((sig == SIGFPE) || ((sig == SIGTRAP) && (si_code == TRAP_UNK)))) layout = SIL_FAULT_TRAPNO; } else if (si_code <= NSIGPOLL) layout = SIL_POLL; } else { if (si_code == SI_TIMER) layout = SIL_TIMER; else if (si_code == SI_SIGIO) layout = SIL_POLL; else if (si_code < 0) layout = SIL_RT; } return layout; } static inline char __user *si_expansion(const siginfo_t __user *info) { return ((char __user *)info) + sizeof(struct kernel_siginfo); } int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from) { char __user *expansion = si_expansion(to); if (copy_to_user(to, from , sizeof(struct kernel_siginfo))) return -EFAULT; if (clear_user(expansion, SI_EXPANSION_SIZE)) return -EFAULT; return 0; } static int post_copy_siginfo_from_user(kernel_siginfo_t *info, const siginfo_t __user *from) { if (unlikely(!known_siginfo_layout(info->si_signo, info->si_code))) { char __user *expansion = si_expansion(from); char buf[SI_EXPANSION_SIZE]; int i; /* * An unknown si_code might need more than * sizeof(struct kernel_siginfo) bytes. Verify all of the * extra bytes are 0. This guarantees copy_siginfo_to_user * will return this data to userspace exactly. 
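		 *
		 * For example, an unrecognized si_code accompanied by
		 * non-zero bytes in the expansion area is rejected below
		 * with -E2BIG rather than being silently truncated on a
		 * later copy back to userspace.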
*/ if (copy_from_user(&buf, expansion, SI_EXPANSION_SIZE)) return -EFAULT; for (i = 0; i < SI_EXPANSION_SIZE; i++) { if (buf[i] != 0) return -E2BIG; } } return 0; } static int __copy_siginfo_from_user(int signo, kernel_siginfo_t *to, const siginfo_t __user *from) { if (copy_from_user(to, from, sizeof(struct kernel_siginfo))) return -EFAULT; to->si_signo = signo; return post_copy_siginfo_from_user(to, from); } int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from) { if (copy_from_user(to, from, sizeof(struct kernel_siginfo))) return -EFAULT; return post_copy_siginfo_from_user(to, from); } #ifdef CONFIG_COMPAT /** * copy_siginfo_to_external32 - copy a kernel siginfo into a compat user siginfo * @to: compat siginfo destination * @from: kernel siginfo source * * Note: This function does not work properly for the SIGCHLD on x32, but * fortunately it doesn't have to. The only valid callers for this function are * copy_siginfo_to_user32, which is overriden for x32 and the coredump code. * The latter does not care because SIGCHLD will never cause a coredump. */ void copy_siginfo_to_external32(struct compat_siginfo *to, const struct kernel_siginfo *from) { memset(to, 0, sizeof(*to)); to->si_signo = from->si_signo; to->si_errno = from->si_errno; to->si_code = from->si_code; switch(siginfo_layout(from->si_signo, from->si_code)) { case SIL_KILL: to->si_pid = from->si_pid; to->si_uid = from->si_uid; break; case SIL_TIMER: to->si_tid = from->si_tid; to->si_overrun = from->si_overrun; to->si_int = from->si_int; break; case SIL_POLL: to->si_band = from->si_band; to->si_fd = from->si_fd; break; case SIL_FAULT: to->si_addr = ptr_to_compat(from->si_addr); break; case SIL_FAULT_TRAPNO: to->si_addr = ptr_to_compat(from->si_addr); to->si_trapno = from->si_trapno; break; case SIL_FAULT_MCEERR: to->si_addr = ptr_to_compat(from->si_addr); to->si_addr_lsb = from->si_addr_lsb; break; case SIL_FAULT_BNDERR: to->si_addr = ptr_to_compat(from->si_addr); to->si_lower = ptr_to_compat(from->si_lower); to->si_upper = ptr_to_compat(from->si_upper); break; case SIL_FAULT_PKUERR: to->si_addr = ptr_to_compat(from->si_addr); to->si_pkey = from->si_pkey; break; case SIL_FAULT_PERF_EVENT: to->si_addr = ptr_to_compat(from->si_addr); to->si_perf_data = from->si_perf_data; to->si_perf_type = from->si_perf_type; to->si_perf_flags = from->si_perf_flags; break; case SIL_CHLD: to->si_pid = from->si_pid; to->si_uid = from->si_uid; to->si_status = from->si_status; to->si_utime = from->si_utime; to->si_stime = from->si_stime; break; case SIL_RT: to->si_pid = from->si_pid; to->si_uid = from->si_uid; to->si_int = from->si_int; break; case SIL_SYS: to->si_call_addr = ptr_to_compat(from->si_call_addr); to->si_syscall = from->si_syscall; to->si_arch = from->si_arch; break; } } int __copy_siginfo_to_user32(struct compat_siginfo __user *to, const struct kernel_siginfo *from) { struct compat_siginfo new; copy_siginfo_to_external32(&new, from); if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) return -EFAULT; return 0; } static int post_copy_siginfo_from_user32(kernel_siginfo_t *to, const struct compat_siginfo *from) { clear_siginfo(to); to->si_signo = from->si_signo; to->si_errno = from->si_errno; to->si_code = from->si_code; switch(siginfo_layout(from->si_signo, from->si_code)) { case SIL_KILL: to->si_pid = from->si_pid; to->si_uid = from->si_uid; break; case SIL_TIMER: to->si_tid = from->si_tid; to->si_overrun = from->si_overrun; to->si_int = from->si_int; break; case SIL_POLL: to->si_band = from->si_band; 
to->si_fd = from->si_fd; break; case SIL_FAULT: to->si_addr = compat_ptr(from->si_addr); break; case SIL_FAULT_TRAPNO: to->si_addr = compat_ptr(from->si_addr); to->si_trapno = from->si_trapno; break; case SIL_FAULT_MCEERR: to->si_addr = compat_ptr(from->si_addr); to->si_addr_lsb = from->si_addr_lsb; break; case SIL_FAULT_BNDERR: to->si_addr = compat_ptr(from->si_addr); to->si_lower = compat_ptr(from->si_lower); to->si_upper = compat_ptr(from->si_upper); break; case SIL_FAULT_PKUERR: to->si_addr = compat_ptr(from->si_addr); to->si_pkey = from->si_pkey; break; case SIL_FAULT_PERF_EVENT: to->si_addr = compat_ptr(from->si_addr); to->si_perf_data = from->si_perf_data; to->si_perf_type = from->si_perf_type; to->si_perf_flags = from->si_perf_flags; break; case SIL_CHLD: to->si_pid = from->si_pid; to->si_uid = from->si_uid; to->si_status = from->si_status; #ifdef CONFIG_X86_X32_ABI if (in_x32_syscall()) { to->si_utime = from->_sifields._sigchld_x32._utime; to->si_stime = from->_sifields._sigchld_x32._stime; } else #endif { to->si_utime = from->si_utime; to->si_stime = from->si_stime; } break; case SIL_RT: to->si_pid = from->si_pid; to->si_uid = from->si_uid; to->si_int = from->si_int; break; case SIL_SYS: to->si_call_addr = compat_ptr(from->si_call_addr); to->si_syscall = from->si_syscall; to->si_arch = from->si_arch; break; } return 0; } static int __copy_siginfo_from_user32(int signo, struct kernel_siginfo *to, const struct compat_siginfo __user *ufrom) { struct compat_siginfo from; if (copy_from_user(&from, ufrom, sizeof(struct compat_siginfo))) return -EFAULT; from.si_signo = signo; return post_copy_siginfo_from_user32(to, &from); } int copy_siginfo_from_user32(struct kernel_siginfo *to, const struct compat_siginfo __user *ufrom) { struct compat_siginfo from; if (copy_from_user(&from, ufrom, sizeof(struct compat_siginfo))) return -EFAULT; return post_copy_siginfo_from_user32(to, &from); } #endif /* CONFIG_COMPAT */ /** * do_sigtimedwait - wait for queued signals specified in @which * @which: queued signals to wait for * @info: if non-null, the signal's siginfo is returned here * @ts: upper bound on process time suspension */ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, const struct timespec64 *ts) { ktime_t *to = NULL, timeout = KTIME_MAX; struct task_struct *tsk = current; sigset_t mask = *which; enum pid_type type; int sig, ret = 0; if (ts) { if (!timespec64_valid(ts)) return -EINVAL; timeout = timespec64_to_ktime(*ts); to = &timeout; } /* * Invert the set of allowed signals to get those we want to block. */ sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); signotset(&mask); spin_lock_irq(&tsk->sighand->siglock); sig = dequeue_signal(&mask, info, &type); if (!sig && timeout) { /* * None ready, temporarily unblock those we're interested * while we are sleeping in so that we'll be awakened when * they arrive. Unblocking is always fine, we can avoid * set_current_blocked(). */ tsk->real_blocked = tsk->blocked; sigandsets(&tsk->blocked, &tsk->blocked, &mask); recalc_sigpending(); spin_unlock_irq(&tsk->sighand->siglock); __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); ret = schedule_hrtimeout_range(to, tsk->timer_slack_ns, HRTIMER_MODE_REL); spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, &tsk->real_blocked); sigemptyset(&tsk->real_blocked); sig = dequeue_signal(&mask, info, &type); } spin_unlock_irq(&tsk->sighand->siglock); if (sig) return sig; return ret ? 
-EINTR : -EAGAIN; } /** * sys_rt_sigtimedwait - synchronously wait for queued signals specified * in @uthese * @uthese: queued signals to wait for * @uinfo: if non-null, the signal's siginfo is returned here * @uts: upper bound on process time suspension * @sigsetsize: size of sigset_t type */ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, siginfo_t __user *, uinfo, const struct __kernel_timespec __user *, uts, size_t, sigsetsize) { sigset_t these; struct timespec64 ts; kernel_siginfo_t info; int ret; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (copy_from_user(&these, uthese, sizeof(these))) return -EFAULT; if (uts) { if (get_timespec64(&ts, uts)) return -EFAULT; } ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL); if (ret > 0 && uinfo) { if (copy_siginfo_to_user(uinfo, &info)) ret = -EFAULT; } return ret; } #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE4(rt_sigtimedwait_time32, const sigset_t __user *, uthese, siginfo_t __user *, uinfo, const struct old_timespec32 __user *, uts, size_t, sigsetsize) { sigset_t these; struct timespec64 ts; kernel_siginfo_t info; int ret; if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (copy_from_user(&these, uthese, sizeof(these))) return -EFAULT; if (uts) { if (get_old_timespec32(&ts, uts)) return -EFAULT; } ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL); if (ret > 0 && uinfo) { if (copy_siginfo_to_user(uinfo, &info)) ret = -EFAULT; } return ret; } #endif #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time64, compat_sigset_t __user *, uthese, struct compat_siginfo __user *, uinfo, struct __kernel_timespec __user *, uts, compat_size_t, sigsetsize) { sigset_t s; struct timespec64 t; kernel_siginfo_t info; long ret; if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (get_compat_sigset(&s, uthese)) return -EFAULT; if (uts) { if (get_timespec64(&t, uts)) return -EFAULT; } ret = do_sigtimedwait(&s, &info, uts ? &t : NULL); if (ret > 0 && uinfo) { if (copy_siginfo_to_user32(uinfo, &info)) ret = -EFAULT; } return ret; } #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese, struct compat_siginfo __user *, uinfo, struct old_timespec32 __user *, uts, compat_size_t, sigsetsize) { sigset_t s; struct timespec64 t; kernel_siginfo_t info; long ret; if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (get_compat_sigset(&s, uthese)) return -EFAULT; if (uts) { if (get_old_timespec32(&t, uts)) return -EFAULT; } ret = do_sigtimedwait(&s, &info, uts ? &t : NULL); if (ret > 0 && uinfo) { if (copy_siginfo_to_user32(uinfo, &info)) ret = -EFAULT; } return ret; } #endif #endif static void prepare_kill_siginfo(int sig, struct kernel_siginfo *info, enum pid_type type) { clear_siginfo(info); info->si_signo = sig; info->si_errno = 0; info->si_code = (type == PIDTYPE_PID) ? SI_TKILL : SI_USER; info->si_pid = task_tgid_vnr(current); info->si_uid = from_kuid_munged(current_user_ns(), current_uid()); } /** * sys_kill - send a signal to a process * @pid: the PID of the process * @sig: signal to be sent */ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct kernel_siginfo info; prepare_kill_siginfo(sig, &info, PIDTYPE_TGID); return kill_something_info(sig, &info, pid); } /* * Verify that the signaler and signalee either are in the same pid namespace * or that the signaler's pid namespace is an ancestor of the signalee's pid * namespace. 
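 *
 * For example, a task in the initial pid namespace may signal a task inside
 * a nested container's pid namespace, but not the other way around: the walk
 * below only follows the signalee's namespace ->parent chain looking for the
 * signaler's active namespace.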
*/ static bool access_pidfd_pidns(struct pid *pid) { struct pid_namespace *active = task_active_pid_ns(current); struct pid_namespace *p = ns_of_pid(pid); for (;;) { if (!p) return false; if (p == active) break; p = p->parent; } return true; } static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t __user *info) { #ifdef CONFIG_COMPAT /* * Avoid hooking up compat syscalls and instead handle necessary * conversions here. Note, this is a stop-gap measure and should not be * considered a generic solution. */ if (in_compat_syscall()) return copy_siginfo_from_user32( kinfo, (struct compat_siginfo __user *)info); #endif return copy_siginfo_from_user(kinfo, info); } static struct pid *pidfd_to_pid(const struct file *file) { struct pid *pid; pid = pidfd_pid(file); if (!IS_ERR(pid)) return pid; return tgid_pidfd_to_pid(file); } #define PIDFD_SEND_SIGNAL_FLAGS \ (PIDFD_SIGNAL_THREAD | PIDFD_SIGNAL_THREAD_GROUP | \ PIDFD_SIGNAL_PROCESS_GROUP) static int do_pidfd_send_signal(struct pid *pid, int sig, enum pid_type type, siginfo_t __user *info, unsigned int flags) { kernel_siginfo_t kinfo; switch (flags) { case PIDFD_SIGNAL_THREAD: type = PIDTYPE_PID; break; case PIDFD_SIGNAL_THREAD_GROUP: type = PIDTYPE_TGID; break; case PIDFD_SIGNAL_PROCESS_GROUP: type = PIDTYPE_PGID; break; } if (info) { int ret; ret = copy_siginfo_from_user_any(&kinfo, info); if (unlikely(ret)) return ret; if (unlikely(sig != kinfo.si_signo)) return -EINVAL; /* Only allow sending arbitrary signals to yourself. */ if ((task_pid(current) != pid || type > PIDTYPE_TGID) && (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) return -EPERM; } else { prepare_kill_siginfo(sig, &kinfo, type); } if (type == PIDTYPE_PGID) return kill_pgrp_info(sig, &kinfo, pid); return kill_pid_info_type(sig, &kinfo, pid, type); } /** * sys_pidfd_send_signal - Signal a process through a pidfd * @pidfd: file descriptor of the process * @sig: signal to send * @info: signal info * @flags: future flags * * Send the signal to the thread group or to the individual thread depending * on PIDFD_THREAD. * In the future extension to @flags may be used to override the default scope * of @pidfd. * * Return: 0 on success, negative errno on failure */ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, siginfo_t __user *, info, unsigned int, flags) { struct pid *pid; enum pid_type type; int ret; /* Enforce flags be set to 0 until we add an extension. */ if (flags & ~PIDFD_SEND_SIGNAL_FLAGS) return -EINVAL; /* Ensure that only a single signal scope determining flag is set. */ if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1) return -EINVAL; switch (pidfd) { case PIDFD_SELF_THREAD: pid = get_task_pid(current, PIDTYPE_PID); type = PIDTYPE_PID; break; case PIDFD_SELF_THREAD_GROUP: pid = get_task_pid(current, PIDTYPE_TGID); type = PIDTYPE_TGID; break; default: { CLASS(fd, f)(pidfd); if (fd_empty(f)) return -EBADF; /* Is this a pidfd? */ pid = pidfd_to_pid(fd_file(f)); if (IS_ERR(pid)) return PTR_ERR(pid); if (!access_pidfd_pidns(pid)) return -EINVAL; /* Infer scope from the type of pidfd. 
*/ if (fd_file(f)->f_flags & PIDFD_THREAD) type = PIDTYPE_PID; else type = PIDTYPE_TGID; return do_pidfd_send_signal(pid, sig, type, info, flags); } } ret = do_pidfd_send_signal(pid, sig, type, info, flags); put_pid(pid); return ret; } static int do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info) { struct task_struct *p; int error = -ESRCH; rcu_read_lock(); p = find_task_by_vpid(pid); if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) { error = check_kill_permission(sig, info, p); /* * The null signal is a permissions and process existence * probe. No signal is actually delivered. */ if (!error && sig) { error = do_send_sig_info(sig, info, p, PIDTYPE_PID); /* * If lock_task_sighand() failed we pretend the task * dies after receiving the signal. The window is tiny, * and the signal is private anyway. */ if (unlikely(error == -ESRCH)) error = 0; } } rcu_read_unlock(); return error; } static int do_tkill(pid_t tgid, pid_t pid, int sig) { struct kernel_siginfo info; prepare_kill_siginfo(sig, &info, PIDTYPE_PID); return do_send_specific(tgid, pid, sig, &info); } /** * sys_tgkill - send signal to one specific thread * @tgid: the thread group ID of the thread * @pid: the PID of the thread * @sig: signal to be sent * * This syscall also checks the @tgid and returns -ESRCH even if the PID * exists but it's not belonging to the target process anymore. This * method solves the problem of threads exiting and PIDs getting reused. */ SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig) { /* This is only valid for single tasks */ if (pid <= 0 || tgid <= 0) return -EINVAL; return do_tkill(tgid, pid, sig); } /** * sys_tkill - send signal to one specific task * @pid: the PID of the task * @sig: signal to be sent * * Send a signal to only one task, even if it's a CLONE_THREAD task. */ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) { /* This is only valid for single tasks */ if (pid <= 0) return -EINVAL; return do_tkill(0, pid, sig); } static int do_rt_sigqueueinfo(pid_t pid, int sig, kernel_siginfo_t *info) { /* Not even root can pretend to send signals from the kernel. * Nor can they impersonate a kill()/tgkill(), which adds source info. */ if ((info->si_code >= 0 || info->si_code == SI_TKILL) && (task_pid_vnr(current) != pid)) return -EPERM; /* POSIX.1b doesn't mention process groups. */ return kill_proc_info(sig, info, pid); } /** * sys_rt_sigqueueinfo - send signal information to a signal * @pid: the PID of the thread * @sig: signal to be sent * @uinfo: signal info to be sent */ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, siginfo_t __user *, uinfo) { kernel_siginfo_t info; int ret = __copy_siginfo_from_user(sig, &info, uinfo); if (unlikely(ret)) return ret; return do_rt_sigqueueinfo(pid, sig, &info); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo, compat_pid_t, pid, int, sig, struct compat_siginfo __user *, uinfo) { kernel_siginfo_t info; int ret = __copy_siginfo_from_user32(sig, &info, uinfo); if (unlikely(ret)) return ret; return do_rt_sigqueueinfo(pid, sig, &info); } #endif static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, kernel_siginfo_t *info) { /* This is only valid for single tasks */ if (pid <= 0 || tgid <= 0) return -EINVAL; /* Not even root can pretend to send signals from the kernel. * Nor can they impersonate a kill()/tgkill(), which adds source info. 
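	 *
	 * Concretely, a forged siginfo with si_code >= 0 (e.g. SI_USER) or
	 * SI_TKILL aimed at another task is rejected below with -EPERM,
	 * while queued signals with a negative si_code such as SI_QUEUE,
	 * as produced by sigqueue(3), are allowed.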
*/ if ((info->si_code >= 0 || info->si_code == SI_TKILL) && (task_pid_vnr(current) != pid)) return -EPERM; return do_send_specific(tgid, pid, sig, info); } SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig, siginfo_t __user *, uinfo) { kernel_siginfo_t info; int ret = __copy_siginfo_from_user(sig, &info, uinfo); if (unlikely(ret)) return ret; return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, compat_pid_t, tgid, compat_pid_t, pid, int, sig, struct compat_siginfo __user *, uinfo) { kernel_siginfo_t info; int ret = __copy_siginfo_from_user32(sig, &info, uinfo); if (unlikely(ret)) return ret; return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); } #endif /* * For kthreads only, must not be used if cloned with CLONE_SIGHAND */ void kernel_sigaction(int sig, __sighandler_t action) { spin_lock_irq(¤t->sighand->siglock); current->sighand->action[sig - 1].sa.sa_handler = action; if (action == SIG_IGN) { sigset_t mask; sigemptyset(&mask); sigaddset(&mask, sig); flush_sigqueue_mask(current, &mask, ¤t->signal->shared_pending); flush_sigqueue_mask(current, &mask, ¤t->pending); recalc_sigpending(); } spin_unlock_irq(¤t->sighand->siglock); } EXPORT_SYMBOL(kernel_sigaction); void __weak sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) { } int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { struct task_struct *p = current, *t; struct k_sigaction *k; sigset_t mask; if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) return -EINVAL; k = &p->sighand->action[sig-1]; spin_lock_irq(&p->sighand->siglock); if (k->sa.sa_flags & SA_IMMUTABLE) { spin_unlock_irq(&p->sighand->siglock); return -EINVAL; } if (oact) *oact = *k; /* * Make sure that we never accidentally claim to support SA_UNSUPPORTED, * e.g. by having an architecture use the bit in their uapi. */ BUILD_BUG_ON(UAPI_SA_FLAGS & SA_UNSUPPORTED); /* * Clear unknown flag bits in order to allow userspace to detect missing * support for flag bits and to allow the kernel to use non-uapi bits * internally. */ if (act) act->sa.sa_flags &= UAPI_SA_FLAGS; if (oact) oact->sa.sa_flags &= UAPI_SA_FLAGS; sigaction_compat_abi(act, oact); if (act) { bool was_ignored = k->sa.sa_handler == SIG_IGN; sigdelsetmask(&act->sa.sa_mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); *k = *act; /* * POSIX 3.3.1.3: * "Setting a signal action to SIG_IGN for a signal that is * pending shall cause the pending signal to be discarded, * whether or not it is blocked." 
* * "Setting a signal action to SIG_DFL for a signal that is * pending and whose default action is to ignore the signal * (for example, SIGCHLD), shall cause the pending signal to * be discarded, whether or not it is blocked" */ if (sig_handler_ignored(sig_handler(p, sig), sig)) { sigemptyset(&mask); sigaddset(&mask, sig); flush_sigqueue_mask(p, &mask, &p->signal->shared_pending); for_each_thread(p, t) flush_sigqueue_mask(p, &mask, &t->pending); } else if (was_ignored) { posixtimer_sig_unignore(p, sig); } } spin_unlock_irq(&p->sighand->siglock); return 0; } #ifdef CONFIG_DYNAMIC_SIGFRAME static inline void sigaltstack_lock(void) __acquires(¤t->sighand->siglock) { spin_lock_irq(¤t->sighand->siglock); } static inline void sigaltstack_unlock(void) __releases(¤t->sighand->siglock) { spin_unlock_irq(¤t->sighand->siglock); } #else static inline void sigaltstack_lock(void) { } static inline void sigaltstack_unlock(void) { } #endif static int do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp, size_t min_ss_size) { struct task_struct *t = current; int ret = 0; if (oss) { memset(oss, 0, sizeof(stack_t)); oss->ss_sp = (void __user *) t->sas_ss_sp; oss->ss_size = t->sas_ss_size; oss->ss_flags = sas_ss_flags(sp) | (current->sas_ss_flags & SS_FLAG_BITS); } if (ss) { void __user *ss_sp = ss->ss_sp; size_t ss_size = ss->ss_size; unsigned ss_flags = ss->ss_flags; int ss_mode; if (unlikely(on_sig_stack(sp))) return -EPERM; ss_mode = ss_flags & ~SS_FLAG_BITS; if (unlikely(ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK && ss_mode != 0)) return -EINVAL; /* * Return before taking any locks if no actual * sigaltstack changes were requested. */ if (t->sas_ss_sp == (unsigned long)ss_sp && t->sas_ss_size == ss_size && t->sas_ss_flags == ss_flags) return 0; sigaltstack_lock(); if (ss_mode == SS_DISABLE) { ss_size = 0; ss_sp = NULL; } else { if (unlikely(ss_size < min_ss_size)) ret = -ENOMEM; if (!sigaltstack_size_valid(ss_size)) ret = -ENOMEM; } if (!ret) { t->sas_ss_sp = (unsigned long) ss_sp; t->sas_ss_size = ss_size; t->sas_ss_flags = ss_flags; } sigaltstack_unlock(); } return ret; } SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) { stack_t new, old; int err; if (uss && copy_from_user(&new, uss, sizeof(stack_t))) return -EFAULT; err = do_sigaltstack(uss ? &new : NULL, uoss ? &old : NULL, current_user_stack_pointer(), MINSIGSTKSZ); if (!err && uoss && copy_to_user(uoss, &old, sizeof(stack_t))) err = -EFAULT; return err; } int restore_altstack(const stack_t __user *uss) { stack_t new; if (copy_from_user(&new, uss, sizeof(stack_t))) return -EFAULT; (void)do_sigaltstack(&new, NULL, current_user_stack_pointer(), MINSIGSTKSZ); /* squash all but EFAULT for now */ return 0; } int __save_altstack(stack_t __user *uss, unsigned long sp) { struct task_struct *t = current; int err = __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) | __put_user(t->sas_ss_flags, &uss->ss_flags) | __put_user(t->sas_ss_size, &uss->ss_size); return err; } #ifdef CONFIG_COMPAT static int do_compat_sigaltstack(const compat_stack_t __user *uss_ptr, compat_stack_t __user *uoss_ptr) { stack_t uss, uoss; int ret; if (uss_ptr) { compat_stack_t uss32; if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t))) return -EFAULT; uss.ss_sp = compat_ptr(uss32.ss_sp); uss.ss_flags = uss32.ss_flags; uss.ss_size = uss32.ss_size; } ret = do_sigaltstack(uss_ptr ? 
&uss : NULL, &uoss, compat_user_stack_pointer(), COMPAT_MINSIGSTKSZ); if (ret >= 0 && uoss_ptr) { compat_stack_t old; memset(&old, 0, sizeof(old)); old.ss_sp = ptr_to_compat(uoss.ss_sp); old.ss_flags = uoss.ss_flags; old.ss_size = uoss.ss_size; if (copy_to_user(uoss_ptr, &old, sizeof(compat_stack_t))) ret = -EFAULT; } return ret; } COMPAT_SYSCALL_DEFINE2(sigaltstack, const compat_stack_t __user *, uss_ptr, compat_stack_t __user *, uoss_ptr) { return do_compat_sigaltstack(uss_ptr, uoss_ptr); } int compat_restore_altstack(const compat_stack_t __user *uss) { int err = do_compat_sigaltstack(uss, NULL); /* squash all but -EFAULT for now */ return err == -EFAULT ? err : 0; } int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) { int err; struct task_struct *t = current; err = __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) | __put_user(t->sas_ss_flags, &uss->ss_flags) | __put_user(t->sas_ss_size, &uss->ss_size); return err; } #endif #ifdef __ARCH_WANT_SYS_SIGPENDING /** * sys_sigpending - examine pending signals * @uset: where mask of pending signal is returned */ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, uset) { sigset_t set; if (sizeof(old_sigset_t) > sizeof(*uset)) return -EINVAL; do_sigpending(&set); if (copy_to_user(uset, &set, sizeof(old_sigset_t))) return -EFAULT; return 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32) { sigset_t set; do_sigpending(&set); return put_user(set.sig[0], set32); } #endif #endif #ifdef __ARCH_WANT_SYS_SIGPROCMASK /** * sys_sigprocmask - examine and change blocked signals * @how: whether to add, remove, or set signals * @nset: signals to add or remove (if non-null) * @oset: previous value of signal mask if non-null * * Some platforms have their own version with special arguments; * others support only sys_rt_sigprocmask. */ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, old_sigset_t __user *, oset) { old_sigset_t old_set, new_set; sigset_t new_blocked; old_set = current->blocked.sig[0]; if (nset) { if (copy_from_user(&new_set, nset, sizeof(*nset))) return -EFAULT; new_blocked = current->blocked; switch (how) { case SIG_BLOCK: sigaddsetmask(&new_blocked, new_set); break; case SIG_UNBLOCK: sigdelsetmask(&new_blocked, new_set); break; case SIG_SETMASK: new_blocked.sig[0] = new_set; break; default: return -EINVAL; } set_current_blocked(&new_blocked); } if (oset) { if (copy_to_user(oset, &old_set, sizeof(*oset))) return -EFAULT; } return 0; } #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ #ifndef CONFIG_ODD_RT_SIGACTION /** * sys_rt_sigaction - alter an action taken by a process * @sig: signal to be sent * @act: new sigaction * @oact: used to save the previous sigaction * @sigsetsize: size of sigset_t type */ SYSCALL_DEFINE4(rt_sigaction, int, sig, const struct sigaction __user *, act, struct sigaction __user *, oact, size_t, sigsetsize) { struct k_sigaction new_sa, old_sa; int ret; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (act && copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa))) return -EFAULT; ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? 
&old_sa : NULL); if (ret) return ret; if (oact && copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa))) return -EFAULT; return 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, const struct compat_sigaction __user *, act, struct compat_sigaction __user *, oact, compat_size_t, sigsetsize) { struct k_sigaction new_ka, old_ka; #ifdef __ARCH_HAS_SA_RESTORER compat_uptr_t restorer; #endif int ret; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; if (act) { compat_uptr_t handler; ret = get_user(handler, &act->sa_handler); new_ka.sa.sa_handler = compat_ptr(handler); #ifdef __ARCH_HAS_SA_RESTORER ret |= get_user(restorer, &act->sa_restorer); new_ka.sa.sa_restorer = compat_ptr(restorer); #endif ret |= get_compat_sigset(&new_ka.sa.sa_mask, &act->sa_mask); ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags); if (ret) return -EFAULT; } ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler); ret |= put_compat_sigset(&oact->sa_mask, &old_ka.sa.sa_mask, sizeof(oact->sa_mask)); ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags); #ifdef __ARCH_HAS_SA_RESTORER ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer); #endif } return ret; } #endif #endif /* !CONFIG_ODD_RT_SIGACTION */ #ifdef CONFIG_OLD_SIGACTION SYSCALL_DEFINE3(sigaction, int, sig, const struct old_sigaction __user *, act, struct old_sigaction __user *, oact) { struct k_sigaction new_ka, old_ka; int ret; if (act) { old_sigset_t mask; if (!access_ok(act, sizeof(*act)) || __get_user(new_ka.sa.sa_handler, &act->sa_handler) || __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) || __get_user(new_ka.sa.sa_flags, &act->sa_flags) || __get_user(mask, &act->sa_mask)) return -EFAULT; #ifdef __ARCH_HAS_KA_RESTORER new_ka.ka_restorer = NULL; #endif siginitset(&new_ka.sa.sa_mask, mask); } ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { if (!access_ok(oact, sizeof(*oact)) || __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) || __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) return -EFAULT; } return ret; } #endif #ifdef CONFIG_COMPAT_OLD_SIGACTION COMPAT_SYSCALL_DEFINE3(sigaction, int, sig, const struct compat_old_sigaction __user *, act, struct compat_old_sigaction __user *, oact) { struct k_sigaction new_ka, old_ka; int ret; compat_old_sigset_t mask; compat_uptr_t handler, restorer; if (act) { if (!access_ok(act, sizeof(*act)) || __get_user(handler, &act->sa_handler) || __get_user(restorer, &act->sa_restorer) || __get_user(new_ka.sa.sa_flags, &act->sa_flags) || __get_user(mask, &act->sa_mask)) return -EFAULT; #ifdef __ARCH_HAS_KA_RESTORER new_ka.ka_restorer = NULL; #endif new_ka.sa.sa_handler = compat_ptr(handler); new_ka.sa.sa_restorer = compat_ptr(restorer); siginitset(&new_ka.sa.sa_mask, mask); } ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { if (!access_ok(oact, sizeof(*oact)) || __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) || __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) || __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) return -EFAULT; } return ret; } #endif #ifdef CONFIG_SGETMASK_SYSCALL /* * For backwards compatibility. 
Functionality superseded by sigprocmask. */ SYSCALL_DEFINE0(sgetmask) { /* SMP safe */ return current->blocked.sig[0]; } SYSCALL_DEFINE1(ssetmask, int, newmask) { int old = current->blocked.sig[0]; sigset_t newset; siginitset(&newset, newmask); set_current_blocked(&newset); return old; } #endif /* CONFIG_SGETMASK_SYSCALL */ #ifdef __ARCH_WANT_SYS_SIGNAL /* * For backwards compatibility. Functionality superseded by sigaction. */ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler) { struct k_sigaction new_sa, old_sa; int ret; new_sa.sa.sa_handler = handler; new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK; sigemptyset(&new_sa.sa.sa_mask); ret = do_sigaction(sig, &new_sa, &old_sa); return ret ? ret : (unsigned long)old_sa.sa.sa_handler; } #endif /* __ARCH_WANT_SYS_SIGNAL */ #ifdef __ARCH_WANT_SYS_PAUSE SYSCALL_DEFINE0(pause) { while (!signal_pending(current)) { __set_current_state(TASK_INTERRUPTIBLE); schedule(); } return -ERESTARTNOHAND; } #endif static int sigsuspend(sigset_t *set) { current->saved_sigmask = current->blocked; set_current_blocked(set); while (!signal_pending(current)) { __set_current_state(TASK_INTERRUPTIBLE); schedule(); } set_restore_sigmask(); return -ERESTARTNOHAND; } /** * sys_rt_sigsuspend - replace the signal mask for a value with the * @unewset value until a signal is received * @unewset: new signal mask value * @sigsetsize: size of sigset_t type */ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) { sigset_t newset; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (copy_from_user(&newset, unewset, sizeof(newset))) return -EFAULT; return sigsuspend(&newset); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize) { sigset_t newset; /* XXX: Don't preclude handling different sized sigset_t's. 
*/ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (get_compat_sigset(&newset, unewset)) return -EFAULT; return sigsuspend(&newset); } #endif #ifdef CONFIG_OLD_SIGSUSPEND SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask) { sigset_t blocked; siginitset(&blocked, mask); return sigsuspend(&blocked); } #endif #ifdef CONFIG_OLD_SIGSUSPEND3 SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask) { sigset_t blocked; siginitset(&blocked, mask); return sigsuspend(&blocked); } #endif __weak const char *arch_vma_name(struct vm_area_struct *vma) { return NULL; } static inline void siginfo_buildtime_checks(void) { BUILD_BUG_ON(sizeof(struct siginfo) != SI_MAX_SIZE); /* Verify the offsets in the two siginfos match */ #define CHECK_OFFSET(field) \ BUILD_BUG_ON(offsetof(siginfo_t, field) != offsetof(kernel_siginfo_t, field)) /* kill */ CHECK_OFFSET(si_pid); CHECK_OFFSET(si_uid); /* timer */ CHECK_OFFSET(si_tid); CHECK_OFFSET(si_overrun); CHECK_OFFSET(si_value); /* rt */ CHECK_OFFSET(si_pid); CHECK_OFFSET(si_uid); CHECK_OFFSET(si_value); /* sigchld */ CHECK_OFFSET(si_pid); CHECK_OFFSET(si_uid); CHECK_OFFSET(si_status); CHECK_OFFSET(si_utime); CHECK_OFFSET(si_stime); /* sigfault */ CHECK_OFFSET(si_addr); CHECK_OFFSET(si_trapno); CHECK_OFFSET(si_addr_lsb); CHECK_OFFSET(si_lower); CHECK_OFFSET(si_upper); CHECK_OFFSET(si_pkey); CHECK_OFFSET(si_perf_data); CHECK_OFFSET(si_perf_type); CHECK_OFFSET(si_perf_flags); /* sigpoll */ CHECK_OFFSET(si_band); CHECK_OFFSET(si_fd); /* sigsys */ CHECK_OFFSET(si_call_addr); CHECK_OFFSET(si_syscall); CHECK_OFFSET(si_arch); #undef CHECK_OFFSET /* usb asyncio */ BUILD_BUG_ON(offsetof(struct siginfo, si_pid) != offsetof(struct siginfo, si_addr)); if (sizeof(int) == sizeof(void __user *)) { BUILD_BUG_ON(sizeof_field(struct siginfo, si_pid) != sizeof(void __user *)); } else { BUILD_BUG_ON((sizeof_field(struct siginfo, si_pid) + sizeof_field(struct siginfo, si_uid)) != sizeof(void __user *)); BUILD_BUG_ON(offsetofend(struct siginfo, si_pid) != offsetof(struct siginfo, si_uid)); } #ifdef CONFIG_COMPAT BUILD_BUG_ON(offsetof(struct compat_siginfo, si_pid) != offsetof(struct compat_siginfo, si_addr)); BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) != sizeof(compat_uptr_t)); BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) != sizeof_field(struct siginfo, si_pid)); #endif } #if defined(CONFIG_SYSCTL) static const struct ctl_table signal_debug_table[] = { #ifdef CONFIG_SYSCTL_EXCEPTION_TRACE { .procname = "exception-trace", .data = &show_unhandled_signals, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, #endif }; static const struct ctl_table signal_table[] = { { .procname = "print-fatal-signals", .data = &print_fatal_signals, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, }; static int __init init_signal_sysctls(void) { register_sysctl_init("debug", signal_debug_table); register_sysctl_init("kernel", signal_table); return 0; } early_initcall(init_signal_sysctls); #endif /* CONFIG_SYSCTL */ void __init signals_init(void) { siginfo_buildtime_checks(); sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC | SLAB_ACCOUNT); } #ifdef CONFIG_KGDB_KDB #include <linux/kdb.h> /* * kdb_send_sig - Allows kdb to send signals without exposing * signal internals. This function checks if the required locks are * available before calling the main signal code, to avoid kdb * deadlocks. 
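 *
 * It also refuses to signal a task that is not currently running unless the
 * same task is targeted twice in a row, since waking a sleeping task from
 * the kdb context risks deadlocking on the run queue locks; see the
 * kdb_prev_t handling below.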
 */
void kdb_send_sig(struct task_struct *t, int sig)
{
	static struct task_struct *kdb_prev_t;
	int new_t, ret;

	if (!spin_trylock(&t->sighand->siglock)) {
		kdb_printf("Can't do kill command now.\n"
			   "The sigmask lock is held somewhere else in "
			   "kernel, try again later\n");
		return;
	}
	new_t = kdb_prev_t != t;
	kdb_prev_t = t;
	if (!task_is_running(t) && new_t) {
		spin_unlock(&t->sighand->siglock);
		kdb_printf("Process is not RUNNING, sending a signal from "
			   "kdb risks deadlock\n"
			   "on the run queue locks. "
			   "The signal has _not_ been sent.\n"
			   "Reissue the kill command if you want to risk "
			   "the deadlock.\n");
		return;
	}
	ret = send_signal_locked(sig, SEND_SIG_PRIV, t, PIDTYPE_PID);
	spin_unlock(&t->sighand->siglock);
	if (ret)
		kdb_printf("Fail to deliver Signal %d to process %d.\n",
			   sig, t->pid);
	else
		kdb_printf("Signal %d is sent to process %d.\n", sig, t->pid);
}
#endif	/* CONFIG_KGDB_KDB */
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * AppArmor security module
 *
 * This file contains AppArmor label definitions
 *
 * Copyright 2017 Canonical Ltd.
 */

#ifndef __AA_LABEL_H
#define __AA_LABEL_H

#include <linux/atomic.h>
#include <linux/audit.h>
#include <linux/rbtree.h>
#include <linux/rcupdate.h>

#include "apparmor.h"
#include "lib.h"

struct aa_ns;
struct aa_ruleset;

#define LOCAL_VEC_ENTRIES 8
#define DEFINE_VEC(T, V)						\
	struct aa_ ## T *(_ ## V ## _localtmp)[LOCAL_VEC_ENTRIES];	\
	struct aa_ ## T **(V)

#define vec_setup(T, V, N, GFP)						\
({									\
	if ((N) <= LOCAL_VEC_ENTRIES) {					\
		typeof(N) i;						\
		(V) = (_ ## V ## _localtmp);				\
		for (i = 0; i < (N); i++)				\
			(V)[i] = NULL;					\
	} else								\
		(V) = kzalloc(sizeof(struct aa_ ## T *) * (N), (GFP));	\
	(V) ? 0 : -ENOMEM;						\
})

#define vec_cleanup(T, V, N)						\
do {									\
	int i;								\
	for (i = 0; i < (N); i++) {					\
		if (!IS_ERR_OR_NULL((V)[i]))				\
			aa_put_ ## T((V)[i]);				\
	}								\
	if ((V) != _ ## V ## _localtmp)					\
		kfree(V);						\
} while (0)

#define vec_last(VEC, SIZE) ((VEC)[(SIZE) - 1])
#define vec_ns(VEC, SIZE) (vec_last((VEC), (SIZE))->ns)
#define vec_labelset(VEC, SIZE) (&vec_ns((VEC), (SIZE))->labels)
#define cleanup_domain_vec(V, L) cleanup_label_vec((V), (L)->size)

struct aa_profile;
#define VEC_FLAG_TERMINATE 1
int aa_vec_unique(struct aa_profile **vec, int n, int flags);
struct aa_label *aa_vec_find_or_create_label(struct aa_profile **vec, int len,
					     gfp_t gfp);
#define aa_sort_and_merge_vec(N, V) \
	aa_sort_and_merge_profiles((N), (struct aa_profile **)(V))


/* struct aa_labelset - set of labels for a namespace
 *
 * Labels are reference counted; aa_labelset does not contribute to label
 * reference counts. Once a label's last refcount is put it is removed from
 * the set.
*/ struct aa_labelset { rwlock_t lock; struct rb_root root; }; #define __labelset_for_each(LS, N) \ for ((N) = rb_first(&(LS)->root); (N); (N) = rb_next(N)) enum label_flags { FLAG_HAT = 1, /* profile is a hat */ FLAG_UNCONFINED = 2, /* label unconfined only if all */ FLAG_NULL = 4, /* profile is null learning profile */ FLAG_IX_ON_NAME_ERROR = 8, /* fallback to ix on name lookup fail */ FLAG_IMMUTIBLE = 0x10, /* don't allow changes/replacement */ FLAG_USER_DEFINED = 0x20, /* user based profile - lower privs */ FLAG_NO_LIST_REF = 0x40, /* list doesn't keep profile ref */ FLAG_NS_COUNT = 0x80, /* carries NS ref count */ FLAG_IN_TREE = 0x100, /* label is in tree */ FLAG_PROFILE = 0x200, /* label is a profile */ FLAG_EXPLICIT = 0x400, /* explicit static label */ FLAG_STALE = 0x800, /* replaced/removed */ FLAG_RENAMED = 0x1000, /* label has renaming in it */ FLAG_REVOKED = 0x2000, /* label has revocation in it */ FLAG_DEBUG1 = 0x4000, FLAG_DEBUG2 = 0x8000, /* These flags must correspond with PATH_flags */ /* TODO: add new path flags */ }; struct aa_label; struct aa_proxy { struct aa_common_ref count; struct aa_label __rcu *label; }; struct label_it { int i, j; }; /* struct aa_label_base - base info of label * @count: ref count of active users * @node: rbtree position * @rcu: rcu callback struct * @proxy: is set to the label that replaced this label * @hname: text representation of the label (MAYBE_NULL) * @flags: stale and other flags - values may change under label set lock * @secid: secid that references this label * @size: number of entries in @ent[] * @mediates: bitmask for label_mediates * profile: label vec when embedded in a profile FLAG_PROFILE is set * rules: variable length rules in a profile FLAG_PROFILE is set * vec: vector of profiles comprising the compound label */ struct aa_label { struct aa_common_ref count; struct rb_node node; struct rcu_head rcu; struct aa_proxy *proxy; __counted char *hname; long flags; u32 secid; int size; u64 mediates; union { struct { /* only used is the label is a profile, size of * rules[] is determined by the profile * profile[1] is poison or null as guard */ struct aa_profile *profile[2]; DECLARE_FLEX_ARRAY(struct aa_ruleset *, rules); }; DECLARE_FLEX_ARRAY(struct aa_profile *, vec); }; }; #define last_error(E, FN) \ do { \ int __subE = (FN); \ if (__subE) \ (E) = __subE; \ } while (0) #define label_isprofile(X) ((X)->flags & FLAG_PROFILE) #define label_unconfined(X) ((X)->flags & FLAG_UNCONFINED) #define unconfined(X) label_unconfined(X) #define label_is_stale(X) ((X)->flags & FLAG_STALE) #define __label_make_stale(X) ((X)->flags |= FLAG_STALE) #define labels_ns(X) (vec_ns(&((X)->vec[0]), (X)->size)) #define labels_set(X) (&labels_ns(X)->labels) #define labels_view(X) labels_ns(X) #define labels_profile(X) ((X)->vec[(X)->size - 1]) int aa_label_next_confined(struct aa_label *l, int i); /* for each profile in a label */ #define label_for_each(I, L, P) \ for ((I).i = 0; ((P) = (L)->vec[(I).i]); ++((I).i)) /* assumes break/goto ended label_for_each */ #define label_for_each_cont(I, L, P) \ for (++((I).i); ((P) = (L)->vec[(I).i]); ++((I).i)) /* for each profile that is enforcing confinement in a label */ #define label_for_each_confined(I, L, P) \ for ((I).i = aa_label_next_confined((L), 0); \ ((P) = (L)->vec[(I).i]); \ (I).i = aa_label_next_confined((L), (I).i + 1)) #define label_for_each_in_merge(I, A, B, P) \ for ((I).i = (I).j = 0; \ ((P) = aa_label_next_in_merge(&(I), (A), (B))); \ ) #define label_for_each_not_in_set(I, SET, SUB, P) \ for ((I).i = 
(I).j = 0; \ ((P) = __aa_label_next_not_in_set(&(I), (SET), (SUB))); \ ) #define next_in_ns(i, NS, L) \ ({ \ typeof(i) ___i = (i); \ while ((L)->vec[___i] && (L)->vec[___i]->ns != (NS)) \ (___i)++; \ (___i); \ }) #define label_for_each_in_ns(I, NS, L, P) \ for ((I).i = next_in_ns(0, (NS), (L)); \ ((P) = (L)->vec[(I).i]); \ (I).i = next_in_ns((I).i + 1, (NS), (L))) #define fn_for_each_in_ns(L, P, FN) \ ({ \ struct label_it __i; \ struct aa_ns *__ns = labels_ns(L); \ int __E = 0; \ label_for_each_in_ns(__i, __ns, (L), (P)) { \ last_error(__E, (FN)); \ } \ __E; \ }) #define fn_for_each_XXX(L, P, FN, ...) \ ({ \ struct label_it i; \ int __E = 0; \ label_for_each ## __VA_ARGS__(i, (L), (P)) { \ last_error(__E, (FN)); \ } \ __E; \ }) #define fn_for_each(L, P, FN) fn_for_each_XXX(L, P, FN) #define fn_for_each_confined(L, P, FN) fn_for_each_XXX(L, P, FN, _confined) #define fn_for_each2_XXX(L1, L2, P, FN, ...) \ ({ \ struct label_it i; \ int __E = 0; \ label_for_each ## __VA_ARGS__(i, (L1), (L2), (P)) { \ last_error(__E, (FN)); \ } \ __E; \ }) #define fn_for_each_in_merge(L1, L2, P, FN) \ fn_for_each2_XXX((L1), (L2), P, FN, _in_merge) #define fn_for_each_not_in_set(L1, L2, P, FN) \ fn_for_each2_XXX((L1), (L2), P, FN, _not_in_set) static inline bool label_mediates(struct aa_label *L, unsigned char C) { return (L)->mediates & (((u64) 1) << (C)); } static inline bool label_mediates_safe(struct aa_label *L, unsigned char C) { if (C > AA_CLASS_LAST) return false; return label_mediates(L, C); } void aa_labelset_destroy(struct aa_labelset *ls); void aa_labelset_init(struct aa_labelset *ls); void __aa_labelset_update_subtree(struct aa_ns *ns); void aa_label_destroy(struct aa_label *label); void aa_label_free(struct aa_label *label); void aa_label_kref(struct kref *kref); bool aa_label_init(struct aa_label *label, int size, gfp_t gfp); struct aa_label *aa_label_alloc(int size, struct aa_proxy *proxy, gfp_t gfp); bool aa_label_is_subset(struct aa_label *set, struct aa_label *sub); bool aa_label_is_unconfined_subset(struct aa_label *set, struct aa_label *sub); struct aa_profile *__aa_label_next_not_in_set(struct label_it *I, struct aa_label *set, struct aa_label *sub); bool aa_label_remove(struct aa_label *label); struct aa_label *aa_label_insert(struct aa_labelset *ls, struct aa_label *l); bool aa_label_replace(struct aa_label *old, struct aa_label *new); bool aa_label_make_newest(struct aa_labelset *ls, struct aa_label *old, struct aa_label *new); struct aa_profile *aa_label_next_in_merge(struct label_it *I, struct aa_label *a, struct aa_label *b); struct aa_label *aa_label_find_merge(struct aa_label *a, struct aa_label *b); struct aa_label *aa_label_merge(struct aa_label *a, struct aa_label *b, gfp_t gfp); bool aa_update_label_name(struct aa_ns *ns, struct aa_label *label, gfp_t gfp); #define FLAGS_NONE 0 #define FLAG_SHOW_MODE 1 #define FLAG_VIEW_SUBNS 2 #define FLAG_HIDDEN_UNCONFINED 4 #define FLAG_ABS_ROOT 8 int aa_label_snxprint(char *str, size_t size, struct aa_ns *view, struct aa_label *label, int flags); int aa_label_asxprint(char **strp, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp); int aa_label_acntsxprint(char __counted **strp, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp); void aa_label_xaudit(struct audit_buffer *ab, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp); void aa_label_seq_xprint(struct seq_file *f, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp); void aa_label_xprintk(struct aa_ns *ns, struct aa_label *label, int 
flags, gfp_t gfp); void aa_label_printk(struct aa_label *label, gfp_t gfp); struct aa_label *aa_label_strn_parse(struct aa_label *base, const char *str, size_t n, gfp_t gfp, bool create, bool force_stack); struct aa_label *aa_label_parse(struct aa_label *base, const char *str, gfp_t gfp, bool create, bool force_stack); static inline const char *aa_label_strn_split(const char *str, int n) { const char *pos; aa_state_t state; state = aa_dfa_matchn_until(stacksplitdfa, DFA_START, str, n, &pos); if (!ACCEPT_TABLE(stacksplitdfa)[state]) return NULL; return pos - 3; } static inline const char *aa_label_str_split(const char *str) { const char *pos; aa_state_t state; state = aa_dfa_match_until(stacksplitdfa, DFA_START, str, &pos); if (!ACCEPT_TABLE(stacksplitdfa)[state]) return NULL; return pos - 3; } struct aa_perms; struct aa_ruleset; int aa_label_match(struct aa_profile *profile, struct aa_ruleset *rules, struct aa_label *label, aa_state_t state, bool subns, u32 request, struct aa_perms *perms); /** * __aa_get_label - get a reference count to uncounted label reference * @l: reference to get a count on * * Returns: pointer to reference OR NULL if race is lost and reference is * being repeated. * Requires: lock held, and the return code MUST be checked */ static inline struct aa_label *__aa_get_label(struct aa_label *l) { if (l && kref_get_unless_zero(&l->count.count)) return l; return NULL; } static inline struct aa_label *aa_get_label(struct aa_label *l) { if (l) kref_get(&(l->count.count)); return l; } /** * aa_get_label_rcu - increment refcount on a label that can be replaced * @l: pointer to label that can be replaced (NOT NULL) * * Returns: pointer to a refcounted label. * else NULL if no label */ static inline struct aa_label *aa_get_label_rcu(struct aa_label __rcu **l) { struct aa_label *c; rcu_read_lock(); do { c = rcu_dereference(*l); } while (c && !kref_get_unless_zero(&c->count.count)); rcu_read_unlock(); return c; } /** * aa_get_newest_label - find the newest version of @l * @l: the label to check for newer versions of * * Returns: refcounted newest version of @l taking into account * replacement, renames and removals * return @l. */ static inline struct aa_label *aa_get_newest_label(struct aa_label *l) { if (!l) return NULL; if (label_is_stale(l)) { struct aa_label *tmp; AA_BUG(!l->proxy); AA_BUG(!l->proxy->label); /* BUG: only way this can happen is @l ref count and its * replacement count have gone to 0 and are on their way * to destruction. ie. we have a refcounting error */ tmp = aa_get_label_rcu(&l->proxy->label); AA_BUG(!tmp); return tmp; } return aa_get_label(l); } static inline void aa_put_label(struct aa_label *l) { if (l) kref_put(&l->count.count, aa_label_kref); } /* wrapper fn to indicate semantics of the check */ static inline bool __aa_subj_label_is_cached(struct aa_label *subj_label, struct aa_label *obj_label) { return aa_label_is_subset(obj_label, subj_label); } struct aa_proxy *aa_alloc_proxy(struct aa_label *l, gfp_t gfp); void aa_proxy_kref(struct kref *kref); static inline struct aa_proxy *aa_get_proxy(struct aa_proxy *proxy) { if (proxy) kref_get(&(proxy->count.count)); return proxy; } static inline void aa_put_proxy(struct aa_proxy *proxy) { if (proxy) kref_put(&proxy->count.count, aa_proxy_kref); } void __aa_proxy_redirect(struct aa_label *orig, struct aa_label *new); #endif /* __AA_LABEL_H */ |
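/*
 * Illustrative sketch, not part of the original header: the reference
 * counting and iteration pattern the declarations above are meant to be
 * used with. aa_get_newest_label() follows a stale label's proxy to its
 * replacement, label_for_each() walks the profile vector, and
 * aa_put_label() drops the reference when done. The function name and the
 * pr_info() message are hypothetical.
 */
static inline void example_walk_label(struct aa_label *label)
{
	struct aa_label *l = aa_get_newest_label(label);
	struct aa_profile *profile;
	struct label_it it;

	if (!l)
		return;

	/* Visit every profile that makes up the (possibly stacked) label. */
	label_for_each(it, l, profile)
		pr_info("label entry %d: profile %p\n", it.i, profile);

	aa_put_label(l);
}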
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the AF_INET socket handler. * * Version: @(#)sock.h 1.0.4 05/13/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche <flla@stud.uni-sb.de> * * Fixes: * Alan Cox : Volatiles in skbuff pointers. See * skbuff comments. May be overdone, * better to prove they can be removed * than the reverse. * Alan Cox : Added a zapped field for tcp to note * a socket is reset and must stay shut up * Alan Cox : New fields for options * Pauline Middelink : identd support * Alan Cox : Eliminate low level recv/recvfrom * David S. Miller : New socket lookup architecture. * Steve Whitehouse: Default routines for sock_ops * Arnaldo C. Melo : removed net_pinfo, tp_pinfo and made * protinfo be just a void pointer, as the * protocol specific parts were moved to * respective headers and ipv4/v6, etc now * use private slabcaches for its socks * Pedro Hortas : New flags field for socket options */ #ifndef _SOCK_H #define _SOCK_H #include <linux/hardirq.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/list_nulls.h> #include <linux/timer.h> #include <linux/cache.h> #include <linux/bitops.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/skbuff.h> /* struct sk_buff */ #include <linux/mm.h> #include <linux/security.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/page_counter.h> #include <linux/memcontrol.h> #include <linux/static_key.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/cgroup-defs.h> #include <linux/rbtree.h> #include <linux/rculist_nulls.h> #include <linux/poll.h> #include <linux/sockptr.h> #include <linux/indirect_call_wrapper.h> #include <linux/atomic.h> #include <linux/refcount.h> #include <linux/llist.h> #include <net/dst.h> #include <net/checksum.h> #include <net/tcp_states.h> #include <linux/net_tstamp.h> #include <net/l3mdev.h> #include <uapi/linux/socket.h> /* * This structure really needs to be cleaned up. * Most of it is for TCP, and not used by any of * the other protocols. */ /* This is the per-socket lock. The spinlock provides a synchronization * between user contexts and software interrupt processing, whereas the * mini-semaphore synchronizes multiple users amongst themselves.
*/ typedef struct { union { struct slock_owned { int owned; spinlock_t slock; }; long combined; }; wait_queue_head_t wq; /* * We express the mutex-alike socket_lock semantics * to the lock validator by explicitly managing * the slock as a lock variant (in addition to * the slock itself): */ #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif } socket_lock_t; struct sock; struct proto; struct net; typedef __u32 __bitwise __portpair; typedef __u64 __bitwise __addrpair; /** * struct sock_common - minimal network layer representation of sockets * @skc_daddr: Foreign IPv4 addr * @skc_rcv_saddr: Bound local IPv4 addr * @skc_addrpair: 8-byte-aligned __u64 union of @skc_daddr & @skc_rcv_saddr * @skc_hash: hash value used with various protocol lookup tables * @skc_u16hashes: two u16 hash values used by UDP lookup tables * @skc_dport: placeholder for inet_dport/tw_dport * @skc_num: placeholder for inet_num/tw_num * @skc_portpair: __u32 union of @skc_dport & @skc_num * @skc_family: network address family * @skc_state: Connection state * @skc_reuse: %SO_REUSEADDR setting * @skc_reuseport: %SO_REUSEPORT setting * @skc_ipv6only: socket is IPV6 only * @skc_net_refcnt: socket is using net ref counting * @skc_bypass_prot_mem: bypass the per-protocol memory accounting for skb * @skc_bound_dev_if: bound device index if != 0 * @skc_bind_node: bind hash linkage for various protocol lookup tables * @skc_portaddr_node: second hash linkage for UDP * @skc_prot: protocol handlers inside a network family * @skc_net: reference to the network namespace of this socket * @skc_v6_daddr: IPV6 destination address * @skc_v6_rcv_saddr: IPV6 source address * @skc_cookie: socket's cookie value * @skc_node: main hash linkage for various protocol lookup tables * @skc_nulls_node: main hash linkage for TCP * @skc_tx_queue_mapping: tx queue number for this connection * @skc_rx_queue_mapping: rx queue number for this connection * @skc_flags: place holder for sk_flags * %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings * @skc_listener: connection request listener socket (aka rsk_listener) * [union with @skc_flags] * @skc_tw_dr: (aka tw_dr) ptr to &struct inet_timewait_death_row * [union with @skc_flags] * @skc_incoming_cpu: record/match cpu processing incoming packets * @skc_rcv_wnd: (aka rsk_rcv_wnd) TCP receive window size (possibly scaled) * [union with @skc_incoming_cpu] * @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number * [union with @skc_incoming_cpu] * @skc_refcnt: reference count * * This is the minimal network layer representation of sockets, the header * for struct sock and struct inet_timewait_sock. 
*/ struct sock_common { union { __addrpair skc_addrpair; struct { __be32 skc_daddr; __be32 skc_rcv_saddr; }; }; union { unsigned int skc_hash; __u16 skc_u16hashes[2]; }; /* skc_dport && skc_num must be grouped as well */ union { __portpair skc_portpair; struct { __be16 skc_dport; __u16 skc_num; }; }; unsigned short skc_family; volatile unsigned char skc_state; unsigned char skc_reuse:4; unsigned char skc_reuseport:1; unsigned char skc_ipv6only:1; unsigned char skc_net_refcnt:1; unsigned char skc_bypass_prot_mem:1; int skc_bound_dev_if; union { struct hlist_node skc_bind_node; struct hlist_node skc_portaddr_node; }; struct proto *skc_prot; possible_net_t skc_net; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr skc_v6_daddr; struct in6_addr skc_v6_rcv_saddr; #endif atomic64_t skc_cookie; /* following fields are padding to force * offset(struct sock, sk_refcnt) == 128 on 64bit arches * assuming IPV6 is enabled. We use this padding differently * for different kind of 'sockets' */ union { unsigned long skc_flags; struct sock *skc_listener; /* request_sock */ struct inet_timewait_death_row *skc_tw_dr; /* inet_timewait_sock */ }; /* * fields between dontcopy_begin/dontcopy_end * are not copied in sock_copy() */ /* private: */ int skc_dontcopy_begin[0]; /* public: */ union { struct hlist_node skc_node; struct hlist_nulls_node skc_nulls_node; }; unsigned short skc_tx_queue_mapping; #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING unsigned short skc_rx_queue_mapping; #endif union { int skc_incoming_cpu; u32 skc_rcv_wnd; u32 skc_tw_rcv_nxt; /* struct tcp_timewait_sock */ }; refcount_t skc_refcnt; /* private: */ int skc_dontcopy_end[0]; union { u32 skc_rxhash; u32 skc_window_clamp; u32 skc_tw_snd_nxt; /* struct tcp_timewait_sock */ }; /* public: */ }; struct bpf_local_storage; struct sk_filter; /** * struct sock - network layer representation of sockets * @__sk_common: shared layout with inet_timewait_sock * @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings * @sk_lock: synchronizer * @sk_kern_sock: True if sock is using kernel lock classes * @sk_rcvbuf: size of receive buffer in bytes * @sk_wq: sock wait queue and async head * @sk_rx_dst: receive input route used by early demux * @sk_rx_dst_ifindex: ifindex for @sk_rx_dst * @sk_rx_dst_cookie: cookie for @sk_rx_dst * @sk_dst_cache: destination cache * @sk_dst_pending_confirm: need to confirm neighbour * @sk_policy: flow policy * @psp_assoc: PSP association, if socket is PSP-secured * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed * @sk_tsq_flags: TCP Small Queues flags * @sk_write_queue: Packet sending queue * @sk_omem_alloc: "o" is "option" or "other" * @sk_wmem_queued: persistent queue size * @sk_forward_alloc: space allocated forward * @sk_reserved_mem: space reserved and non-reclaimable for the socket * @sk_napi_id: id of the last napi context to receive data for sk * @sk_ll_usec: usecs to busypoll when there is no data * @sk_allocation: allocation mode * @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler) * @sk_pacing_status: Pacing status (requested, handled by sch_fq) * @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE) * @sk_sndbuf: size of send buffer in bytes * @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets * @sk_no_check_rx: allow zero checksum in RX packets * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO) * @sk_gso_disabled: if set, NETIF_F_GSO_MASK is forbidden. * @sk_gso_type: GSO type (e.g. 
%SKB_GSO_TCPV4) * @sk_gso_max_size: Maximum GSO segment size to build * @sk_gso_max_segs: Maximum number of GSO segments * @sk_pacing_shift: scaling factor for TCP Small Queues * @sk_lingertime: %SO_LINGER l_linger setting * @sk_backlog: always used with the per-socket spinlock held * @sk_callback_lock: used with the callbacks in the end of this struct * @sk_error_queue: rarely used * @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt, * IPV6_ADDRFORM for instance) * @sk_err: last error * @sk_err_soft: errors that don't cause failure but are the cause of a * persistent failure not just 'timed out' * @sk_drops: raw/udp drops counter * @sk_drop_counters: optional pointer to numa_drop_counters * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner * @sk_ino: inode number (zero if orphaned) * @sk_prefer_busy_poll: prefer busypolling over softirq processing * @sk_busy_poll_budget: napi processing budget when busypolling * @sk_priority: %SO_PRIORITY setting * @sk_type: socket type (%SOCK_STREAM, etc) * @sk_protocol: which protocol this socket belongs in this network family * @sk_peer_lock: lock protecting @sk_peer_pid and @sk_peer_cred * @sk_peer_pid: &struct pid for this socket's peer * @sk_peer_cred: %SO_PEERCRED setting * @sk_rcvlowat: %SO_RCVLOWAT setting * @sk_rcvtimeo: %SO_RCVTIMEO setting * @sk_sndtimeo: %SO_SNDTIMEO setting * @sk_txhash: computed flow hash for use on transmit * @sk_txrehash: enable TX hash rethink * @sk_filter: socket filtering instructions * @sk_timer: sock cleanup timer * @tcp_retransmit_timer: tcp retransmit timer * @mptcp_retransmit_timer: mptcp retransmit timer * @sk_stamp: time stamp of last packet received * @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only * @sk_tsflags: SO_TIMESTAMPING flags * @sk_bpf_cb_flags: used in bpf_setsockopt() * @sk_use_task_frag: allow sk_page_frag() to use current->task_frag. * Sockets that can be used under memory reclaim should * set this to false. * @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock * for timestamping * @sk_tskey: counter to disambiguate concurrent tstamp requests * @sk_tx_queue_mapping_jiffies: time in jiffies of last @sk_tx_queue_mapping refresh. * @sk_zckey: counter to order MSG_ZEROCOPY notifications * @sk_socket: Identd and reporting IO signals * @sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock. * @sk_frag: cached page frag * @sk_peek_off: current peek_offset value * @sk_send_head: front of stuff to transmit * @tcp_rtx_queue: TCP re-transmit queue [union with @sk_send_head] * @sk_security: used by security modules * @sk_mark: generic packet mark * @sk_cgrp_data: cgroup data for this cgroup * @sk_memcg: this socket's memory cgroup association * @sk_write_pending: a write to stream socket waits to start * @sk_disconnects: number of disconnect operations performed on this sock * @sk_state_change: callback to indicate change in the state of the sock * @sk_data_ready: callback to indicate there is data to be processed * @sk_write_space: callback to indicate there is bf sending space available * @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE) * @sk_backlog_rcv: callback to process the backlog * @sk_validate_xmit_skb: ptr to an optional validate function * @sk_destruct: called at sock freeing time, i.e. 
when all refcnt == 0 * @sk_reuseport_cb: reuseport group container * @sk_bpf_storage: ptr to cache and control for bpf_sk_storage * @sk_rcu: used during RCU grace period * @sk_freeptr: used for SLAB_TYPESAFE_BY_RCU managed sockets * @sk_clockid: clockid used by time-based scheduling (SO_TXTIME) * @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME * @sk_txtime_report_errors: set report errors mode for SO_TXTIME * @sk_txtime_unused: unused txtime flags * @sk_scm_recv_flags: all flags used by scm_recv() * @sk_scm_credentials: flagged by SO_PASSCRED to recv SCM_CREDENTIALS * @sk_scm_security: flagged by SO_PASSSEC to recv SCM_SECURITY * @sk_scm_pidfd: flagged by SO_PASSPIDFD to recv SCM_PIDFD * @sk_scm_rights: flagged by SO_PASSRIGHTS to recv SCM_RIGHTS * @sk_scm_unused: unused flags for scm_recv() * @ns_tracker: tracker for netns reference * @sk_user_frags: xarray of pages the user is holding a reference on. * @sk_owner: reference to the real owner of the socket that calls * sock_lock_init_class_and_name(). */ struct sock { /* * Now struct inet_timewait_sock also uses sock_common, so please just * don't add nothing before this first member (__sk_common) --acme */ struct sock_common __sk_common; #define sk_node __sk_common.skc_node #define sk_nulls_node __sk_common.skc_nulls_node #define sk_refcnt __sk_common.skc_refcnt #define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING #define sk_rx_queue_mapping __sk_common.skc_rx_queue_mapping #endif #define sk_dontcopy_begin __sk_common.skc_dontcopy_begin #define sk_dontcopy_end __sk_common.skc_dontcopy_end #define sk_hash __sk_common.skc_hash #define sk_portpair __sk_common.skc_portpair #define sk_num __sk_common.skc_num #define sk_dport __sk_common.skc_dport #define sk_addrpair __sk_common.skc_addrpair #define sk_daddr __sk_common.skc_daddr #define sk_rcv_saddr __sk_common.skc_rcv_saddr #define sk_family __sk_common.skc_family #define sk_state __sk_common.skc_state #define sk_reuse __sk_common.skc_reuse #define sk_reuseport __sk_common.skc_reuseport #define sk_ipv6only __sk_common.skc_ipv6only #define sk_net_refcnt __sk_common.skc_net_refcnt #define sk_bypass_prot_mem __sk_common.skc_bypass_prot_mem #define sk_bound_dev_if __sk_common.skc_bound_dev_if #define sk_bind_node __sk_common.skc_bind_node #define sk_prot __sk_common.skc_prot #define sk_net __sk_common.skc_net #define sk_v6_daddr __sk_common.skc_v6_daddr #define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr #define sk_cookie __sk_common.skc_cookie #define sk_incoming_cpu __sk_common.skc_incoming_cpu #define sk_flags __sk_common.skc_flags #define sk_rxhash __sk_common.skc_rxhash __cacheline_group_begin(sock_write_rx); atomic_t sk_drops; __s32 sk_peek_off; struct sk_buff_head sk_error_queue; struct sk_buff_head sk_receive_queue; /* * The backlog queue is special, it is always used with * the per-socket spinlock held and requires low latency * access. Therefore we special case it's implementation. * Note : rmem_alloc is in this structure to fill a hole * on 64bit arches, not because its logically part of * backlog. 
*/ struct { atomic_t rmem_alloc; int len; struct sk_buff *head; struct sk_buff *tail; } sk_backlog; #define sk_rmem_alloc sk_backlog.rmem_alloc __cacheline_group_end(sock_write_rx); __cacheline_group_begin(sock_read_rx); /* early demux fields */ struct dst_entry __rcu *sk_rx_dst; int sk_rx_dst_ifindex; u32 sk_rx_dst_cookie; #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sk_ll_usec; unsigned int sk_napi_id; u16 sk_busy_poll_budget; u8 sk_prefer_busy_poll; #endif u8 sk_userlocks; int sk_rcvbuf; struct sk_filter __rcu *sk_filter; union { struct socket_wq __rcu *sk_wq; /* private: */ struct socket_wq *sk_wq_raw; /* public: */ }; void (*sk_data_ready)(struct sock *sk); long sk_rcvtimeo; int sk_rcvlowat; __cacheline_group_end(sock_read_rx); __cacheline_group_begin(sock_read_rxtx); int sk_err; struct socket *sk_socket; #ifdef CONFIG_MEMCG struct mem_cgroup *sk_memcg; #endif #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif #if IS_ENABLED(CONFIG_INET_PSP) struct psp_assoc __rcu *psp_assoc; #endif __cacheline_group_end(sock_read_rxtx); __cacheline_group_begin(sock_write_rxtx); socket_lock_t sk_lock; u32 sk_reserved_mem; int sk_forward_alloc; u32 sk_tsflags; __cacheline_group_end(sock_write_rxtx); __cacheline_group_begin(sock_write_tx); int sk_write_pending; atomic_t sk_omem_alloc; int sk_err_soft; int sk_wmem_queued; refcount_t sk_wmem_alloc; unsigned long sk_tsq_flags; union { struct sk_buff *sk_send_head; struct rb_root tcp_rtx_queue; }; struct sk_buff_head sk_write_queue; struct page_frag sk_frag; union { struct timer_list sk_timer; struct timer_list tcp_retransmit_timer; struct timer_list mptcp_retransmit_timer; }; unsigned long sk_pacing_rate; /* bytes per second */ atomic_t sk_zckey; atomic_t sk_tskey; unsigned long sk_tx_queue_mapping_jiffies; __cacheline_group_end(sock_write_tx); __cacheline_group_begin(sock_read_tx); u32 sk_dst_pending_confirm; u32 sk_pacing_status; /* see enum sk_pacing */ unsigned long sk_max_pacing_rate; long sk_sndtimeo; u32 sk_priority; u32 sk_mark; kuid_t sk_uid; u16 sk_protocol; u16 sk_type; struct dst_entry __rcu *sk_dst_cache; netdev_features_t sk_route_caps; #ifdef CONFIG_SOCK_VALIDATE_XMIT struct sk_buff* (*sk_validate_xmit_skb)(struct sock *sk, struct net_device *dev, struct sk_buff *skb); #endif u16 sk_gso_type; u16 sk_gso_max_segs; unsigned int sk_gso_max_size; gfp_t sk_allocation; u32 sk_txhash; int sk_sndbuf; u8 sk_pacing_shift; bool sk_use_task_frag; __cacheline_group_end(sock_read_tx); /* * Because of non atomicity rules, all * changes are protected by socket lock. 
*/ u8 sk_gso_disabled : 1, sk_kern_sock : 1, sk_no_check_tx : 1, sk_no_check_rx : 1; u8 sk_shutdown; unsigned long sk_lingertime; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; u32 sk_ack_backlog; u32 sk_max_ack_backlog; u64 sk_ino; spinlock_t sk_peer_lock; int sk_bind_phc; struct pid *sk_peer_pid; const struct cred *sk_peer_cred; ktime_t sk_stamp; #if BITS_PER_LONG==32 seqlock_t sk_stamp_seq; #endif int sk_disconnects; union { u8 sk_txrehash; u8 sk_scm_recv_flags; struct { u8 sk_scm_credentials : 1, sk_scm_security : 1, sk_scm_pidfd : 1, sk_scm_rights : 1, sk_scm_unused : 4; }; }; u8 sk_clockid; u8 sk_txtime_deadline_mode : 1, sk_txtime_report_errors : 1, sk_txtime_unused : 6; #define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG)) u8 sk_bpf_cb_flags; void *sk_user_data; #ifdef CONFIG_SECURITY void *sk_security; #endif struct sock_cgroup_data sk_cgrp_data; void (*sk_state_change)(struct sock *sk); void (*sk_write_space)(struct sock *sk); void (*sk_error_report)(struct sock *sk); int (*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb); void (*sk_destruct)(struct sock *sk); struct sock_reuseport __rcu *sk_reuseport_cb; #ifdef CONFIG_BPF_SYSCALL struct bpf_local_storage __rcu *sk_bpf_storage; #endif struct numa_drop_counters *sk_drop_counters; /* sockets using SLAB_TYPESAFE_BY_RCU can use sk_freeptr. * By the time kfree() is called, sk_rcu can not be in * use and can be mangled. */ union { struct rcu_head sk_rcu; freeptr_t sk_freeptr; }; netns_tracker ns_tracker; struct xarray sk_user_frags; #if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES) struct module *sk_owner; #endif }; struct sock_bh_locked { struct sock *sock; local_lock_t bh_lock; }; enum sk_pacing { SK_PACING_NONE = 0, SK_PACING_NEEDED = 1, SK_PACING_FQ = 2, }; /* flag bits in sk_user_data * * - SK_USER_DATA_NOCOPY: Pointer stored in sk_user_data might * not be suitable for copying when cloning the socket. For instance, * it can point to a reference counted object. sk_user_data bottom * bit is set if pointer must not be copied. * * - SK_USER_DATA_BPF: Mark whether sk_user_data field is * managed/owned by a BPF reuseport array. This bit should be set * when sk_user_data's sk is added to the bpf's reuseport_array. * * - SK_USER_DATA_PSOCK: Mark whether pointer stored in * sk_user_data points to psock type. This bit should be set * when sk_user_data is assigned to a psock object. */ #define SK_USER_DATA_NOCOPY 1UL #define SK_USER_DATA_BPF 2UL #define SK_USER_DATA_PSOCK 4UL #define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF |\ SK_USER_DATA_PSOCK) /** * sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied * @sk: socket */ static inline bool sk_user_data_is_nocopy(const struct sock *sk) { return ((uintptr_t)sk->sk_user_data & SK_USER_DATA_NOCOPY); } #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data))) /** * __locked_read_sk_user_data_with_flags - return the pointer * only if argument flags all has been set in sk_user_data. Otherwise * return NULL * * @sk: socket * @flags: flag bits * * The caller must be holding sk->sk_callback_lock. 
*/ static inline void * __locked_read_sk_user_data_with_flags(const struct sock *sk, uintptr_t flags) { uintptr_t sk_user_data = (uintptr_t)rcu_dereference_check(__sk_user_data(sk), lockdep_is_held(&sk->sk_callback_lock)); WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK); if ((sk_user_data & flags) == flags) return (void *)(sk_user_data & SK_USER_DATA_PTRMASK); return NULL; } /** * __rcu_dereference_sk_user_data_with_flags - return the pointer * only if argument flags all has been set in sk_user_data. Otherwise * return NULL * * @sk: socket * @flags: flag bits */ static inline void * __rcu_dereference_sk_user_data_with_flags(const struct sock *sk, uintptr_t flags) { uintptr_t sk_user_data = (uintptr_t)rcu_dereference(__sk_user_data(sk)); WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK); if ((sk_user_data & flags) == flags) return (void *)(sk_user_data & SK_USER_DATA_PTRMASK); return NULL; } #define rcu_dereference_sk_user_data(sk) \ __rcu_dereference_sk_user_data_with_flags(sk, 0) #define __rcu_assign_sk_user_data_with_flags(sk, ptr, flags) \ ({ \ uintptr_t __tmp1 = (uintptr_t)(ptr), \ __tmp2 = (uintptr_t)(flags); \ WARN_ON_ONCE(__tmp1 & ~SK_USER_DATA_PTRMASK); \ WARN_ON_ONCE(__tmp2 & SK_USER_DATA_PTRMASK); \ rcu_assign_pointer(__sk_user_data((sk)), \ __tmp1 | __tmp2); \ }) #define rcu_assign_sk_user_data(sk, ptr) \ __rcu_assign_sk_user_data_with_flags(sk, ptr, 0) static inline struct net *sock_net(const struct sock *sk) { return read_pnet(&sk->sk_net); } static inline void sock_net_set(struct sock *sk, struct net *net) { write_pnet(&sk->sk_net, net); } /* * SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK * or not whether his port will be reused by someone else. SK_FORCE_REUSE * on a socket means that the socket will reuse everybody else's port * without looking at the other's sk_reuse value. */ #define SK_NO_REUSE 0 #define SK_CAN_REUSE 1 #define SK_FORCE_REUSE 2 int sk_set_peek_off(struct sock *sk, int val); static inline int sk_peek_offset(const struct sock *sk, int flags) { if (unlikely(flags & MSG_PEEK)) { return READ_ONCE(sk->sk_peek_off); } return 0; } static inline void sk_peek_offset_bwd(struct sock *sk, int val) { s32 off = READ_ONCE(sk->sk_peek_off); if (unlikely(off >= 0)) { off = max_t(s32, off - val, 0); WRITE_ONCE(sk->sk_peek_off, off); } } static inline void sk_peek_offset_fwd(struct sock *sk, int val) { sk_peek_offset_bwd(sk, -val); } /* * Hashed lists helper routines */ static inline struct sock *sk_entry(const struct hlist_node *node) { return hlist_entry(node, struct sock, sk_node); } static inline struct sock *__sk_head(const struct hlist_head *head) { return hlist_entry(head->first, struct sock, sk_node); } static inline struct sock *sk_head(const struct hlist_head *head) { return hlist_empty(head) ? NULL : __sk_head(head); } static inline struct sock *__sk_nulls_head(const struct hlist_nulls_head *head) { return hlist_nulls_entry(head->first, struct sock, sk_nulls_node); } static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head) { return hlist_nulls_empty(head) ? NULL : __sk_nulls_head(head); } static inline struct sock *sk_next(const struct sock *sk) { return hlist_entry_safe(sk->sk_node.next, struct sock, sk_node); } static inline struct sock *sk_nulls_next(const struct sock *sk) { return (!is_a_nulls(sk->sk_nulls_node.next)) ? 
hlist_nulls_entry(sk->sk_nulls_node.next, struct sock, sk_nulls_node) : NULL; } static inline bool sk_unhashed(const struct sock *sk) { return hlist_unhashed(&sk->sk_node); } static inline bool sk_hashed(const struct sock *sk) { return !sk_unhashed(sk); } static inline void sk_node_init(struct hlist_node *node) { node->pprev = NULL; } static inline void __sk_del_node(struct sock *sk) { __hlist_del(&sk->sk_node); } /* NB: equivalent to hlist_del_init_rcu */ static inline bool __sk_del_node_init(struct sock *sk) { if (sk_hashed(sk)) { __sk_del_node(sk); sk_node_init(&sk->sk_node); return true; } return false; } /* Grab socket reference count. This operation is valid only when sk is ALREADY grabbed f.e. it is found in hash table or a list and the lookup is made under lock preventing hash table modifications. */ static __always_inline void sock_hold(struct sock *sk) { refcount_inc(&sk->sk_refcnt); } /* Ungrab socket in the context, which assumes that socket refcnt cannot hit zero, f.e. it is true in context of any socketcall. */ static __always_inline void __sock_put(struct sock *sk) { refcount_dec(&sk->sk_refcnt); } static inline bool sk_del_node_init(struct sock *sk) { bool rc = __sk_del_node_init(sk); if (rc) __sock_put(sk); return rc; } #define sk_del_node_init_rcu(sk) sk_del_node_init(sk) static inline bool __sk_nulls_del_node_init_rcu(struct sock *sk) { if (sk_hashed(sk)) { hlist_nulls_del_init_rcu(&sk->sk_nulls_node); return true; } return false; } static inline bool sk_nulls_del_node_init_rcu(struct sock *sk) { bool rc = __sk_nulls_del_node_init_rcu(sk); if (rc) __sock_put(sk); return rc; } static inline bool sk_nulls_replace_node_init_rcu(struct sock *old, struct sock *new) { if (sk_hashed(old)) { hlist_nulls_replace_init_rcu(&old->sk_nulls_node, &new->sk_nulls_node); __sock_put(old); return true; } return false; } static inline void __sk_add_node(struct sock *sk, struct hlist_head *list) { hlist_add_head(&sk->sk_node, list); } static inline void sk_add_node(struct sock *sk, struct hlist_head *list) { sock_hold(sk); __sk_add_node(sk, list); } static inline void sk_add_node_rcu(struct sock *sk, struct hlist_head *list) { sock_hold(sk); if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && sk->sk_family == AF_INET6) hlist_add_tail_rcu(&sk->sk_node, list); else hlist_add_head_rcu(&sk->sk_node, list); } static inline void sk_add_node_tail_rcu(struct sock *sk, struct hlist_head *list) { sock_hold(sk); hlist_add_tail_rcu(&sk->sk_node, list); } static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) { hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); } static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nulls_head *list) { hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list); } static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) { sock_hold(sk); __sk_nulls_add_node_rcu(sk, list); } static inline void __sk_del_bind_node(struct sock *sk) { __hlist_del(&sk->sk_bind_node); } static inline void sk_add_bind_node(struct sock *sk, struct hlist_head *list) { hlist_add_head(&sk->sk_bind_node, list); } #define sk_for_each(__sk, list) \ hlist_for_each_entry(__sk, list, sk_node) #define sk_for_each_rcu(__sk, list) \ hlist_for_each_entry_rcu(__sk, list, sk_node) #define sk_nulls_for_each(__sk, node, list) \ hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node) #define sk_nulls_for_each_rcu(__sk, node, list) \ hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node) #define sk_for_each_from(__sk) \ 
hlist_for_each_entry_from(__sk, sk_node) #define sk_nulls_for_each_from(__sk, node) \ if (__sk && ({ node = &(__sk)->sk_nulls_node; 1; })) \ hlist_nulls_for_each_entry_from(__sk, node, sk_nulls_node) #define sk_for_each_safe(__sk, tmp, list) \ hlist_for_each_entry_safe(__sk, tmp, list, sk_node) #define sk_for_each_bound(__sk, list) \ hlist_for_each_entry(__sk, list, sk_bind_node) #define sk_for_each_bound_safe(__sk, tmp, list) \ hlist_for_each_entry_safe(__sk, tmp, list, sk_bind_node) /** * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @head: the head for your list. * @offset: offset of hlist_node within the struct. * */ #define sk_for_each_entry_offset_rcu(tpos, pos, head, offset) \ for (pos = rcu_dereference(hlist_first_rcu(head)); \ pos != NULL && \ ({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;}); \ pos = rcu_dereference(hlist_next_rcu(pos))) static inline struct user_namespace *sk_user_ns(const struct sock *sk) { /* Careful only use this in a context where these parameters * can not change and must all be valid, such as recvmsg from * userspace. */ return sk->sk_socket->file->f_cred->user_ns; } /* Sock flags */ enum sock_flags { SOCK_DEAD, SOCK_DONE, SOCK_URGINLINE, SOCK_KEEPOPEN, SOCK_LINGER, SOCK_DESTROY, SOCK_BROADCAST, SOCK_TIMESTAMP, SOCK_ZAPPED, SOCK_USE_WRITE_QUEUE, /* whether to call sk->sk_write_space in sock_wfree */ SOCK_DBG, /* %SO_DEBUG setting */ SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */ SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */ SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */ SOCK_MEMALLOC, /* VM depends on this socket for swapping */ SOCK_TIMESTAMPING_RX_SOFTWARE, /* %SOF_TIMESTAMPING_RX_SOFTWARE */ SOCK_FASYNC, /* fasync() active */ SOCK_RXQ_OVFL, SOCK_ZEROCOPY, /* buffers from userspace */ SOCK_WIFI_STATUS, /* push wifi status to userspace */ SOCK_NOFCS, /* Tell NIC not to do the Ethernet FCS. * Will use last 4 bytes of packet sent from * user-space instead. */ SOCK_FILTER_LOCKED, /* Filter cannot be changed anymore */ SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */ SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */ SOCK_TXTIME, SOCK_XDP, /* XDP is attached */ SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */ SOCK_RCVMARK, /* Receive SO_MARK ancillary data with packet */ SOCK_RCVPRIORITY, /* Receive SO_PRIORITY ancillary data with packet */ SOCK_TIMESTAMPING_ANY, /* Copy of sk_tsflags & TSFLAGS_ANY */ }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) /* * The highest bit of sk_tsflags is reserved for kernel-internal * SOCKCM_FLAG_TS_OPT_ID. 
There is a check in core/sock.c to control that * SOF_TIMESTAMPING* values do not reach this reserved area */ #define SOCKCM_FLAG_TS_OPT_ID BIT(31) static inline void sock_copy_flags(struct sock *nsk, const struct sock *osk) { nsk->sk_flags = osk->sk_flags; } static inline void sock_set_flag(struct sock *sk, enum sock_flags flag) { __set_bit(flag, &sk->sk_flags); } static inline void sock_reset_flag(struct sock *sk, enum sock_flags flag) { __clear_bit(flag, &sk->sk_flags); } static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit, int valbool) { if (valbool) sock_set_flag(sk, bit); else sock_reset_flag(sk, bit); } static inline bool sock_flag(const struct sock *sk, enum sock_flags flag) { return test_bit(flag, &sk->sk_flags); } #ifdef CONFIG_NET DECLARE_STATIC_KEY_FALSE(memalloc_socks_key); static inline int sk_memalloc_socks(void) { return static_branch_unlikely(&memalloc_socks_key); } void __receive_sock(struct file *file); #else static inline int sk_memalloc_socks(void) { return 0; } static inline void __receive_sock(struct file *file) { } #endif static inline gfp_t sk_gfp_mask(const struct sock *sk, gfp_t gfp_mask) { return gfp_mask | (sk->sk_allocation & __GFP_MEMALLOC); } static inline void sk_acceptq_removed(struct sock *sk) { WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog - 1); } static inline void sk_acceptq_added(struct sock *sk) { WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog + 1); } /* Note: If you think the test should be: * return READ_ONCE(sk->sk_ack_backlog) >= READ_ONCE(sk->sk_max_ack_backlog); * Then please take a look at commit 64a146513f8f ("[NET]: Revert incorrect accept queue backlog changes.") */ static inline bool sk_acceptq_is_full(const struct sock *sk) { return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog); } /* * Compute minimal free write space needed to queue new packets. */ static inline int sk_stream_min_wspace(const struct sock *sk) { return READ_ONCE(sk->sk_wmem_queued) >> 1; } static inline int sk_stream_wspace(const struct sock *sk) { return READ_ONCE(sk->sk_sndbuf) - READ_ONCE(sk->sk_wmem_queued); } static inline void sk_wmem_queued_add(struct sock *sk, int val) { WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val); } static inline void sk_forward_alloc_add(struct sock *sk, int val) { /* Paired with lockless reads of sk->sk_forward_alloc */ WRITE_ONCE(sk->sk_forward_alloc, sk->sk_forward_alloc + val); } void sk_stream_write_space(struct sock *sk); /* OOB backlog add */ static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) { /* dont let skb dst not refcounted, we are going to leave rcu lock */ skb_dst_force(skb); if (!sk->sk_backlog.tail) WRITE_ONCE(sk->sk_backlog.head, skb); else sk->sk_backlog.tail->next = skb; WRITE_ONCE(sk->sk_backlog.tail, skb); skb->next = NULL; } /* * Take into account size of receive queue and backlog queue * Do not take into account this skb truesize, * to allow even a single big packet to come. */ static inline bool sk_rcvqueues_full(const struct sock *sk, unsigned int limit) { unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc); return qsize > limit; } /* The per-socket spinlock must be held here. 
*/ static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb, unsigned int limit) { if (sk_rcvqueues_full(sk, limit)) return -ENOBUFS; /* * If the skb was allocated from pfmemalloc reserves, only * allow SOCK_MEMALLOC sockets to use it as this socket is * helping free memory */ if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) return -ENOMEM; __sk_add_backlog(sk, skb); sk->sk_backlog.len += skb->truesize; return 0; } int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb); INDIRECT_CALLABLE_DECLARE(int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)); static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { if (sk_memalloc_socks() && skb_pfmemalloc(skb)) return __sk_backlog_rcv(sk, skb); return INDIRECT_CALL_INET(sk->sk_backlog_rcv, tcp_v6_do_rcv, tcp_v4_do_rcv, sk, skb); } static inline void sk_incoming_cpu_update(struct sock *sk) { int cpu = raw_smp_processor_id(); if (unlikely(READ_ONCE(sk->sk_incoming_cpu) != cpu)) WRITE_ONCE(sk->sk_incoming_cpu, cpu); } static inline void sock_rps_save_rxhash(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_RPS /* The following WRITE_ONCE() is paired with the READ_ONCE() * here, and another one in sock_rps_record_flow(). */ if (unlikely(READ_ONCE(sk->sk_rxhash) != skb->hash)) WRITE_ONCE(sk->sk_rxhash, skb->hash); #endif } static inline void sock_rps_reset_rxhash(struct sock *sk) { #ifdef CONFIG_RPS /* Paired with READ_ONCE() in sock_rps_record_flow() */ WRITE_ONCE(sk->sk_rxhash, 0); #endif } #define sk_wait_event(__sk, __timeo, __condition, __wait) \ ({ int __rc, __dis = __sk->sk_disconnects; \ release_sock(__sk); \ __rc = __condition; \ if (!__rc) { \ *(__timeo) = wait_woken(__wait, \ TASK_INTERRUPTIBLE, \ *(__timeo)); \ } \ sched_annotate_sleep(); \ lock_sock(__sk); \ __rc = __dis == __sk->sk_disconnects ? __condition : -EPIPE; \ __rc; \ }) int sk_stream_wait_connect(struct sock *sk, long *timeo_p); int sk_stream_wait_memory(struct sock *sk, long *timeo_p); void sk_stream_wait_close(struct sock *sk, long timeo_p); int sk_stream_error(struct sock *sk, int flags, int err); void sk_stream_kill_queues(struct sock *sk); void sk_set_memalloc(struct sock *sk); void sk_clear_memalloc(struct sock *sk); void __sk_flush_backlog(struct sock *sk); static inline bool sk_flush_backlog(struct sock *sk) { if (unlikely(READ_ONCE(sk->sk_backlog.tail))) { __sk_flush_backlog(sk); return true; } return false; } int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb); struct request_sock_ops; struct timewait_sock_ops; struct inet_hashinfo; struct raw_hashinfo; struct smc_hashinfo; struct module; struct sk_psock; /* * caches using SLAB_TYPESAFE_BY_RCU should let .next pointer from nulls nodes * un-modified. Special care is taken when initializing object to zero. */ static inline void sk_prot_clear_nulls(struct sock *sk, int size) { if (offsetof(struct sock, sk_node.next) != 0) memset(sk, 0, offsetof(struct sock, sk_node.next)); memset(&sk->sk_node.pprev, 0, size - offsetof(struct sock, sk_node.pprev)); } struct proto_accept_arg { int flags; int err; int is_empty; bool kern; }; /* Networking protocol blocks we attach to sockets. 
* socket layer -> transport layer interface */ struct proto { void (*close)(struct sock *sk, long timeout); int (*pre_connect)(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len); int (*connect)(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len); int (*disconnect)(struct sock *sk, int flags); struct sock * (*accept)(struct sock *sk, struct proto_accept_arg *arg); int (*ioctl)(struct sock *sk, int cmd, int *karg); int (*init)(struct sock *sk); void (*destroy)(struct sock *sk); void (*shutdown)(struct sock *sk, int how); int (*setsockopt)(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int (*getsockopt)(struct sock *sk, int level, int optname, char __user *optval, int __user *option); void (*keepalive)(struct sock *sk, int valbool); #ifdef CONFIG_COMPAT int (*compat_ioctl)(struct sock *sk, unsigned int cmd, unsigned long arg); #endif int (*sendmsg)(struct sock *sk, struct msghdr *msg, size_t len); int (*recvmsg)(struct sock *sk, struct msghdr *msg, size_t len, int flags); void (*splice_eof)(struct socket *sock); int (*bind)(struct sock *sk, struct sockaddr_unsized *addr, int addr_len); int (*bind_add)(struct sock *sk, struct sockaddr_unsized *addr, int addr_len); int (*backlog_rcv) (struct sock *sk, struct sk_buff *skb); bool (*bpf_bypass_getsockopt)(int level, int optname); void (*release_cb)(struct sock *sk); /* Keeping track of sk's, looking them up, and port selection methods. */ int (*hash)(struct sock *sk); void (*unhash)(struct sock *sk); void (*rehash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); void (*put_port)(struct sock *sk); #ifdef CONFIG_BPF_SYSCALL int (*psock_update_sk_prot)(struct sock *sk, struct sk_psock *psock, bool restore); #endif /* Keeping track of sockets in use */ #ifdef CONFIG_PROC_FS unsigned int inuse_idx; #endif bool (*stream_memory_free)(const struct sock *sk, int wake); bool (*sock_is_readable)(struct sock *sk); /* Memory pressure */ void (*enter_memory_pressure)(struct sock *sk); void (*leave_memory_pressure)(struct sock *sk); atomic_long_t *memory_allocated; /* Current allocated memory. */ int __percpu *per_cpu_fw_alloc; struct percpu_counter *sockets_allocated; /* Current number of sockets. */ /* * Pressure flag: try to collapse. * Technical note: it is used by multiple contexts non atomically. * Make sure to use READ_ONCE()/WRITE_ONCE() for all reads/writes. * All the __sk_mem_schedule() is of this nature: accounting * is strict, actions are advisory and have some latency. 
*/ unsigned long *memory_pressure; long *sysctl_mem; int *sysctl_wmem; int *sysctl_rmem; u32 sysctl_wmem_offset; u32 sysctl_rmem_offset; int max_header; bool no_autobind; struct kmem_cache *slab; unsigned int obj_size; unsigned int freeptr_offset; unsigned int ipv6_pinfo_offset; slab_flags_t slab_flags; unsigned int useroffset; /* Usercopy region offset */ unsigned int usersize; /* Usercopy region size */ struct request_sock_ops *rsk_prot; struct timewait_sock_ops *twsk_prot; union { struct inet_hashinfo *hashinfo; struct raw_hashinfo *raw_hash; struct smc_hashinfo *smc_hash; } h; struct module *owner; char name[32]; struct list_head node; int (*diag_destroy)(struct sock *sk, int err); } __randomize_layout; int proto_register(struct proto *prot, int alloc_slab); void proto_unregister(struct proto *prot); int sock_load_diag_module(int family, int protocol); INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake)); static inline bool __sk_stream_memory_free(const struct sock *sk, int wake) { if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf)) return false; return sk->sk_prot->stream_memory_free ? INDIRECT_CALL_INET_1(sk->sk_prot->stream_memory_free, tcp_stream_memory_free, sk, wake) : true; } static inline bool sk_stream_memory_free(const struct sock *sk) { return __sk_stream_memory_free(sk, 0); } static inline bool __sk_stream_is_writeable(const struct sock *sk, int wake) { return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && __sk_stream_memory_free(sk, wake); } static inline bool sk_stream_is_writeable(const struct sock *sk) { return __sk_stream_is_writeable(sk, 0); } static inline int sk_under_cgroup_hierarchy(struct sock *sk, struct cgroup *ancestor) { #ifdef CONFIG_SOCK_CGROUP_DATA return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), ancestor); #else return -ENOTSUPP; #endif } #define SK_ALLOC_PERCPU_COUNTER_BATCH 16 static inline void sk_sockets_allocated_dec(struct sock *sk) { percpu_counter_add_batch(sk->sk_prot->sockets_allocated, -1, SK_ALLOC_PERCPU_COUNTER_BATCH); } static inline void sk_sockets_allocated_inc(struct sock *sk) { percpu_counter_add_batch(sk->sk_prot->sockets_allocated, 1, SK_ALLOC_PERCPU_COUNTER_BATCH); } static inline u64 sk_sockets_allocated_read_positive(struct sock *sk) { return percpu_counter_read_positive(sk->sk_prot->sockets_allocated); } static inline int proto_sockets_allocated_sum_positive(struct proto *prot) { return percpu_counter_sum_positive(prot->sockets_allocated); } #ifdef CONFIG_PROC_FS #define PROTO_INUSE_NR 64 /* should be enough for the first time */ struct prot_inuse { int all; int val[PROTO_INUSE_NR]; }; static inline void sock_prot_inuse_add(const struct net *net, const struct proto *prot, int val) { this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val); } static inline void sock_inuse_add(const struct net *net, int val) { this_cpu_add(net->core.prot_inuse->all, val); } int sock_prot_inuse_get(struct net *net, struct proto *proto); int sock_inuse_get(struct net *net); #else static inline void sock_prot_inuse_add(const struct net *net, const struct proto *prot, int val) { } static inline void sock_inuse_add(const struct net *net, int val) { } #endif /* With per-bucket locks this operation is not-atomic, so that * this version is not worse. 
*/ static inline int __sk_prot_rehash(struct sock *sk) { sk->sk_prot->unhash(sk); return sk->sk_prot->hash(sk); } /* About 10 seconds */ #define SOCK_DESTROY_TIME (10*HZ) /* Sockets 0-1023 can't be bound to unless you are superuser */ #define PROT_SOCK 1024 #define SHUTDOWN_MASK 3 #define RCV_SHUTDOWN 1 #define SEND_SHUTDOWN 2 #define SOCK_BINDADDR_LOCK 4 #define SOCK_BINDPORT_LOCK 8 /** * define SOCK_CONNECT_BIND - &sock->sk_userlocks flag for auto-bind at connect() time */ #define SOCK_CONNECT_BIND 16 struct socket_alloc { struct socket socket; struct inode vfs_inode; }; static inline struct socket *SOCKET_I(struct inode *inode) { return &container_of(inode, struct socket_alloc, vfs_inode)->socket; } static inline struct inode *SOCK_INODE(struct socket *socket) { return &container_of(socket, struct socket_alloc, socket)->vfs_inode; } /* * Functions for memory accounting */ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind); int __sk_mem_schedule(struct sock *sk, int size, int kind); void __sk_mem_reduce_allocated(struct sock *sk, int amount); void __sk_mem_reclaim(struct sock *sk, int amount); #define SK_MEM_SEND 0 #define SK_MEM_RECV 1 /* sysctl_mem values are in pages */ static inline long sk_prot_mem_limits(const struct sock *sk, int index) { return READ_ONCE(sk->sk_prot->sysctl_mem[index]); } static inline int sk_mem_pages(int amt) { return (amt + PAGE_SIZE - 1) >> PAGE_SHIFT; } static inline bool sk_has_account(struct sock *sk) { /* return true if protocol supports memory accounting */ return !!sk->sk_prot->memory_allocated; } static inline bool sk_wmem_schedule(struct sock *sk, int size) { int delta; if (!sk_has_account(sk)) return true; delta = size - sk->sk_forward_alloc; return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_SEND); } static inline bool __sk_rmem_schedule(struct sock *sk, int size, bool pfmemalloc) { int delta; if (!sk_has_account(sk)) return true; delta = size - sk->sk_forward_alloc; return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_RECV) || pfmemalloc; } static inline bool sk_rmem_schedule(struct sock *sk, const struct sk_buff *skb, int size) { return __sk_rmem_schedule(sk, size, skb_pfmemalloc(skb)); } static inline int sk_unused_reserved_mem(const struct sock *sk) { int unused_mem; if (likely(!sk->sk_reserved_mem)) return 0; unused_mem = sk->sk_reserved_mem - sk->sk_wmem_queued - atomic_read(&sk->sk_rmem_alloc); return unused_mem > 0 ? 
unused_mem : 0; } static inline void sk_mem_reclaim(struct sock *sk) { int reclaimable; if (!sk_has_account(sk)) return; reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk); if (reclaimable >= (int)PAGE_SIZE) __sk_mem_reclaim(sk, reclaimable); } static inline void sk_mem_reclaim_final(struct sock *sk) { sk->sk_reserved_mem = 0; sk_mem_reclaim(sk); } static inline void sk_mem_charge(struct sock *sk, int size) { if (!sk_has_account(sk)) return; sk_forward_alloc_add(sk, -size); } static inline void sk_mem_uncharge(struct sock *sk, int size) { if (!sk_has_account(sk)) return; sk_forward_alloc_add(sk, size); sk_mem_reclaim(sk); } void __sk_charge(struct sock *sk, gfp_t gfp); #if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES) static inline void sk_owner_set(struct sock *sk, struct module *owner) { __module_get(owner); sk->sk_owner = owner; } static inline void sk_owner_clear(struct sock *sk) { sk->sk_owner = NULL; } static inline void sk_owner_put(struct sock *sk) { module_put(sk->sk_owner); } #else static inline void sk_owner_set(struct sock *sk, struct module *owner) { } static inline void sk_owner_clear(struct sock *sk) { } static inline void sk_owner_put(struct sock *sk) { } #endif /* * Macro so as to not evaluate some arguments when * lockdep is not enabled. * * Mark both the sk_lock and the sk_lock.slock as a * per-address-family lock class. */ #define sock_lock_init_class_and_name(sk, sname, skey, name, key) \ do { \ sk_owner_set(sk, THIS_MODULE); \ sk->sk_lock.owned = 0; \ init_waitqueue_head(&sk->sk_lock.wq); \ spin_lock_init(&(sk)->sk_lock.slock); \ debug_check_no_locks_freed((void *)&(sk)->sk_lock, \ sizeof((sk)->sk_lock)); \ lockdep_set_class_and_name(&(sk)->sk_lock.slock, \ (skey), (sname)); \ lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \ } while (0) static inline bool lockdep_sock_is_held(const struct sock *sk) { return lockdep_is_held(&sk->sk_lock) || lockdep_is_held(&sk->sk_lock.slock); } void lock_sock_nested(struct sock *sk, int subclass); static inline void lock_sock(struct sock *sk) { lock_sock_nested(sk, 0); } void __release_sock(struct sock *sk); void release_sock(struct sock *sk); /* BH context may only use the following locking interface. */ #define bh_lock_sock(__sk) spin_lock(&((__sk)->sk_lock.slock)) #define bh_lock_sock_nested(__sk) \ spin_lock_nested(&((__sk)->sk_lock.slock), \ SINGLE_DEPTH_NESTING) #define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.slock)) bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock); /** * lock_sock_fast - fast version of lock_sock * @sk: socket * * This version should be used for very small section, where process won't block * return false if fast path is taken: * * sk_lock.slock locked, owned = 0, BH disabled * * return true if slow path is taken: * * sk_lock.slock unlocked, owned = 1, BH enabled */ static inline bool lock_sock_fast(struct sock *sk) { /* The sk_lock has mutex_lock() semantics here. */ mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); return __lock_sock_fast(sk); } /* fast socket lock variant for caller already holding a [different] socket lock */ static inline bool lock_sock_fast_nested(struct sock *sk) { mutex_acquire(&sk->sk_lock.dep_map, SINGLE_DEPTH_NESTING, 0, _RET_IP_); return __lock_sock_fast(sk); } /** * unlock_sock_fast - complement of lock_sock_fast * @sk: socket * @slow: slow mode * * fast unlock socket for user context. 
* If slow mode is on, we call regular release_sock() */ static inline void unlock_sock_fast(struct sock *sk, bool slow) __releases(&sk->sk_lock.slock) { if (slow) { release_sock(sk); __release(&sk->sk_lock.slock); } else { mutex_release(&sk->sk_lock.dep_map, _RET_IP_); spin_unlock_bh(&sk->sk_lock.slock); } } void sockopt_lock_sock(struct sock *sk); void sockopt_release_sock(struct sock *sk); bool sockopt_ns_capable(struct user_namespace *ns, int cap); bool sockopt_capable(int cap); /* Used by processes to "lock" a socket state, so that * interrupts and bottom half handlers won't change it * from under us. It essentially blocks any incoming * packets, so that we won't get any new data or any * packets that change the state of the socket. * * While locked, BH processing will add new packets to * the backlog queue. This queue is processed by the * owner of the socket lock right before it is released. * * Since ~2.3.5 it is also exclusive sleep lock serializing * accesses from user process context. */ static inline void sock_owned_by_me(const struct sock *sk) { #ifdef CONFIG_LOCKDEP WARN_ON_ONCE(!lockdep_sock_is_held(sk) && debug_locks); #endif } static inline void sock_not_owned_by_me(const struct sock *sk) { #ifdef CONFIG_LOCKDEP WARN_ON_ONCE(lockdep_sock_is_held(sk) && debug_locks); #endif } static inline bool sock_owned_by_user(const struct sock *sk) { sock_owned_by_me(sk); return sk->sk_lock.owned; } static inline bool sock_owned_by_user_nocheck(const struct sock *sk) { return sk->sk_lock.owned; } static inline void sock_release_ownership(struct sock *sk) { DEBUG_NET_WARN_ON_ONCE(!sock_owned_by_user_nocheck(sk)); sk->sk_lock.owned = 0; /* The sk_lock has mutex_unlock() semantics: */ mutex_release(&sk->sk_lock.dep_map, _RET_IP_); } /* no reclassification while locks are held */ static inline bool sock_allow_reclassification(const struct sock *csk) { struct sock *sk = (struct sock *)csk; return !sock_owned_by_user_nocheck(sk) && !spin_is_locked(&sk->sk_lock.slock); } struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern); void sk_free(struct sock *sk); void sk_net_refcnt_upgrade(struct sock *sk); void sk_destruct(struct sock *sk); struct sock *sk_clone(const struct sock *sk, const gfp_t priority, bool lock); static inline struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) { return sk_clone(sk, priority, true); } struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority); void __sock_wfree(struct sk_buff *skb); void sock_wfree(struct sk_buff *skb); struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, gfp_t priority); void skb_orphan_partial(struct sk_buff *skb); void sock_rfree(struct sk_buff *skb); void sock_efree(struct sk_buff *skb); #ifdef CONFIG_INET void sock_edemux(struct sk_buff *skb); void sock_pfree(struct sk_buff *skb); static inline void skb_set_owner_edemux(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); if (refcount_inc_not_zero(&sk->sk_refcnt)) { skb->sk = sk; skb->destructor = sock_edemux; } } #else #define sock_edemux sock_efree #endif int sk_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int sock_setsockopt(struct socket *sock, int level, int op, sockptr_t optval, unsigned int optlen); int do_sock_setsockopt(struct socket *sock, bool compat, int level, int optname, sockptr_t optval, int optlen); int do_sock_getsockopt(struct socket *sock, bool compat, int level, int optname, sockptr_t optval, sockptr_t optlen); 
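/*
 * Illustrative sketch, not part of the original header: one common way to
 * pair lock_sock_fast() with unlock_sock_fast() from process context, per
 * the comments above.  The helper name example_peek_rcvbuf() is hypothetical
 * and only demonstrates the pattern; the bool returned by lock_sock_fast()
 * must be handed back to unlock_sock_fast() so the slow path can fall back
 * to release_sock().
 */
static inline int example_peek_rcvbuf(struct sock *sk)
{
        bool slow = lock_sock_fast(sk);         /* spinlock + BH off on the fast path */
        int val = READ_ONCE(sk->sk_rcvbuf);     /* keep the locked section short */

        unlock_sock_fast(sk, slow);             /* release_sock() when slow == true */
        return val;
}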
int sk_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen); int sock_gettstamp(struct socket *sock, void __user *userstamp, bool timeval, bool time32); struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, int *errcode, int max_page_order); static inline struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode) { return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); } void *sock_kmalloc(struct sock *sk, int size, gfp_t priority); void *sock_kmemdup(struct sock *sk, const void *src, int size, gfp_t priority); void sock_kfree_s(struct sock *sk, void *mem, int size); void sock_kzfree_s(struct sock *sk, void *mem, int size); void sk_send_sigurg(struct sock *sk); static inline void sock_replace_proto(struct sock *sk, struct proto *proto) { if (sk->sk_socket) clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); WRITE_ONCE(sk->sk_prot, proto); } struct sockcm_cookie { u64 transmit_time; u32 mark; u32 tsflags; u32 ts_opt_id; u32 priority; u32 dmabuf_id; }; static inline void sockcm_init(struct sockcm_cookie *sockc, const struct sock *sk) { *sockc = (struct sockcm_cookie) { .mark = READ_ONCE(sk->sk_mark), .tsflags = READ_ONCE(sk->sk_tsflags), .priority = READ_ONCE(sk->sk_priority), }; } int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, struct sockcm_cookie *sockc); int sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct sockcm_cookie *sockc); /* * Functions to fill in entries in struct proto_ops when a protocol * does not implement a particular function. */ int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len); int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr, int len, int flags); int sock_no_socketpair(struct socket *, struct socket *); int sock_no_accept(struct socket *, struct socket *, struct proto_accept_arg *); int sock_no_getname(struct socket *, struct sockaddr *, int); int sock_no_ioctl(struct socket *, unsigned int, unsigned long); int sock_no_listen(struct socket *, int); int sock_no_shutdown(struct socket *, int); int sock_no_sendmsg(struct socket *, struct msghdr *, size_t); int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t len); int sock_no_recvmsg(struct socket *, struct msghdr *, size_t, int); int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma); /* * Functions to fill in entries in struct proto_ops when a protocol * uses the inet style. */ int sock_common_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen); int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); int sock_common_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen); void sk_common_release(struct sock *sk); /* * Default socket callbacks and setup code */ /* Initialise core socket variables using an explicit uid. */ void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid); /* Initialise core socket variables. * Assumes struct socket *sock is embedded in a struct socket_alloc. */ void sock_init_data(struct socket *sock, struct sock *sk); /* * Socket reference counting postulates. * * * Each user of socket SHOULD hold a reference count. * * Each access point to socket (an hash table bucket, reference from a list, * running timer, skb in flight MUST hold a reference count. 
* * When reference count hits 0, it means it will never increase back. * * When reference count hits 0, it means that no references from * outside exist to this socket and current process on current CPU * is last user and may/should destroy this socket. * * sk_free is called from any context: process, BH, IRQ. When * it is called, socket has no references from outside -> sk_free * may release descendant resources allocated by the socket, but * to the time when it is called, socket is NOT referenced by any * hash tables, lists etc. * * Packets, delivered from outside (from network or from another process) * and enqueued on receive/error queues SHOULD NOT grab reference count, * when they sit in queue. Otherwise, packets will leak to hole, when * socket is looked up by one cpu and unhasing is made by another CPU. * It is true for udp/raw, netlink (leak to receive and error queues), tcp * (leak to backlog). Packet socket does all the processing inside * BR_NETPROTO_LOCK, so that it has not this race condition. UNIX sockets * use separate SMP lock, so that they are prone too. */ /* Ungrab socket and destroy it, if it was the last reference. */ static inline void sock_put(struct sock *sk) { if (refcount_dec_and_test(&sk->sk_refcnt)) sk_free(sk); } /* Generic version of sock_put(), dealing with all sockets * (TCP_TIMEWAIT, TCP_NEW_SYN_RECV, ESTABLISHED...) */ void sock_gen_put(struct sock *sk); int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested, unsigned int trim_cap, bool refcounted); static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) { return __sk_receive_skb(sk, skb, nested, 1, true); } static inline void sk_tx_queue_set(struct sock *sk, int tx_queue) { /* sk_tx_queue_mapping accept only upto a 16-bit value */ if (WARN_ON_ONCE((unsigned short)tx_queue >= USHRT_MAX)) return; /* Paired with READ_ONCE() in sk_tx_queue_get() and * other WRITE_ONCE() because socket lock might be not held. */ if (READ_ONCE(sk->sk_tx_queue_mapping) != tx_queue) { WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue); WRITE_ONCE(sk->sk_tx_queue_mapping_jiffies, jiffies); return; } /* Refresh sk_tx_queue_mapping_jiffies if too old. */ if (time_is_before_jiffies(READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + HZ)) WRITE_ONCE(sk->sk_tx_queue_mapping_jiffies, jiffies); } #define NO_QUEUE_MAPPING USHRT_MAX static inline void sk_tx_queue_clear(struct sock *sk) { /* Paired with READ_ONCE() in sk_tx_queue_get() and * other WRITE_ONCE() because socket lock might be not held. 
*/ WRITE_ONCE(sk->sk_tx_queue_mapping, NO_QUEUE_MAPPING); } int sk_tx_queue_get(const struct sock *sk); static inline void __sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb, bool force_set) { #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING if (skb_rx_queue_recorded(skb)) { u16 rx_queue = skb_get_rx_queue(skb); if (force_set || unlikely(READ_ONCE(sk->sk_rx_queue_mapping) != rx_queue)) WRITE_ONCE(sk->sk_rx_queue_mapping, rx_queue); } #endif } static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb) { __sk_rx_queue_set(sk, skb, true); } static inline void sk_rx_queue_update(struct sock *sk, const struct sk_buff *skb) { __sk_rx_queue_set(sk, skb, false); } static inline void sk_rx_queue_clear(struct sock *sk) { #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING WRITE_ONCE(sk->sk_rx_queue_mapping, NO_QUEUE_MAPPING); #endif } static inline int sk_rx_queue_get(const struct sock *sk) { #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING if (sk) { int res = READ_ONCE(sk->sk_rx_queue_mapping); if (res != NO_QUEUE_MAPPING) return res; } #endif return -1; } static inline void sk_set_socket(struct sock *sk, struct socket *sock) { WRITE_ONCE(sk->sk_socket, sock); if (sock) { WRITE_ONCE(sk->sk_uid, SOCK_INODE(sock)->i_uid); WRITE_ONCE(sk->sk_ino, SOCK_INODE(sock)->i_ino); } else { /* Note: sk_uid is unchanged. */ WRITE_ONCE(sk->sk_ino, 0); } } static inline wait_queue_head_t *sk_sleep(struct sock *sk) { BUILD_BUG_ON(offsetof(struct socket_wq, wait) != 0); return &rcu_dereference_raw(sk->sk_wq)->wait; } /* Detach socket from process context. * Announce socket dead, detach it from wait queue and inode. * Note that parent inode held reference count on this struct sock, * we do not release it in this function, because protocol * probably wants some additional cleanups or even continuing * to work with this socket (TCP). */ static inline void sock_orphan(struct sock *sk) { write_lock_bh(&sk->sk_callback_lock); sock_set_flag(sk, SOCK_DEAD); sk_set_socket(sk, NULL); sk->sk_wq = NULL; write_unlock_bh(&sk->sk_callback_lock); } static inline void sock_graft(struct sock *sk, struct socket *parent) { WARN_ON(parent->sk); write_lock_bh(&sk->sk_callback_lock); rcu_assign_pointer(sk->sk_wq, &parent->wq); parent->sk = sk; sk_set_socket(sk, parent); security_sock_graft(sk, parent); write_unlock_bh(&sk->sk_callback_lock); } static inline u64 sock_i_ino(const struct sock *sk) { /* Paired with WRITE_ONCE() in sock_graft() and sock_orphan() */ return READ_ONCE(sk->sk_ino); } static inline kuid_t sk_uid(const struct sock *sk) { /* Paired with WRITE_ONCE() in sockfs_setattr() */ return READ_ONCE(sk->sk_uid); } static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk) { return sk ? 
sk_uid(sk) : make_kuid(net->user_ns, 0); } static inline u32 net_tx_rndhash(void) { u32 v = get_random_u32(); return v ?: 1; } static inline void sk_set_txhash(struct sock *sk) { /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */ WRITE_ONCE(sk->sk_txhash, net_tx_rndhash()); } static inline bool sk_rethink_txhash(struct sock *sk) { if (sk->sk_txhash && sk->sk_txrehash == SOCK_TXREHASH_ENABLED) { sk_set_txhash(sk); return true; } return false; } static inline struct dst_entry * __sk_dst_get(const struct sock *sk) { return rcu_dereference_check(sk->sk_dst_cache, lockdep_sock_is_held(sk)); } static inline struct dst_entry * sk_dst_get(const struct sock *sk) { struct dst_entry *dst; rcu_read_lock(); dst = rcu_dereference(sk->sk_dst_cache); if (dst && !rcuref_get(&dst->__rcuref)) dst = NULL; rcu_read_unlock(); return dst; } static inline void __dst_negative_advice(struct sock *sk) { struct dst_entry *dst = __sk_dst_get(sk); if (dst && dst->ops->negative_advice) dst->ops->negative_advice(sk, dst); } static inline void dst_negative_advice(struct sock *sk) { sk_rethink_txhash(sk); __dst_negative_advice(sk); } static inline void __sk_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old_dst; sk_tx_queue_clear(sk); WRITE_ONCE(sk->sk_dst_pending_confirm, 0); old_dst = rcu_dereference_protected(sk->sk_dst_cache, lockdep_sock_is_held(sk)); rcu_assign_pointer(sk->sk_dst_cache, dst); dst_release(old_dst); } static inline void sk_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old_dst; sk_tx_queue_clear(sk); WRITE_ONCE(sk->sk_dst_pending_confirm, 0); old_dst = unrcu_pointer(xchg(&sk->sk_dst_cache, RCU_INITIALIZER(dst))); dst_release(old_dst); } static inline void __sk_dst_reset(struct sock *sk) { __sk_dst_set(sk, NULL); } static inline void sk_dst_reset(struct sock *sk) { sk_dst_set(sk, NULL); } struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie); struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie); static inline void sk_dst_confirm(struct sock *sk) { if (!READ_ONCE(sk->sk_dst_pending_confirm)) WRITE_ONCE(sk->sk_dst_pending_confirm, 1); } static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n) { if (skb_get_dst_pending_confirm(skb)) { struct sock *sk = skb->sk; if (sk && READ_ONCE(sk->sk_dst_pending_confirm)) WRITE_ONCE(sk->sk_dst_pending_confirm, 0); neigh_confirm(n); } } bool sk_mc_loop(const struct sock *sk); static inline bool sk_can_gso(const struct sock *sk) { return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type); } void sk_setup_caps(struct sock *sk, struct dst_entry *dst); static inline void sk_gso_disable(struct sock *sk) { sk->sk_gso_disabled = 1; sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb, struct iov_iter *from, char *to, int copy, int offset) { if (skb->ip_summed == CHECKSUM_NONE) { __wsum csum = 0; if (!csum_and_copy_from_iter_full(to, copy, &csum, from)) return -EFAULT; skb->csum = csum_block_add(skb->csum, csum, offset); } else if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) { if (!copy_from_iter_full_nocache(to, copy, from)) return -EFAULT; } else if (!copy_from_iter_full(to, copy, from)) return -EFAULT; return 0; } static inline int skb_add_data_nocache(struct sock *sk, struct sk_buff *skb, struct iov_iter *from, int copy) { int err, offset = skb->len; err = skb_do_copy_data_nocache(sk, skb, from, skb_put(skb, copy), copy, offset); if (err) __skb_trim(skb, offset); return err; } static inline int 
skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *from, struct sk_buff *skb, struct page *page, int off, int copy) { int err; err = skb_do_copy_data_nocache(sk, skb, from, page_address(page) + off, copy, skb->len); if (err) return err; skb_len_add(skb, copy); sk_wmem_queued_add(sk, copy); sk_mem_charge(sk, copy); return 0; } #define SK_WMEM_ALLOC_BIAS 1 /** * sk_wmem_alloc_get - returns write allocations * @sk: socket * * Return: sk_wmem_alloc minus initial offset of one */ static inline int sk_wmem_alloc_get(const struct sock *sk) { return refcount_read(&sk->sk_wmem_alloc) - SK_WMEM_ALLOC_BIAS; } /** * sk_rmem_alloc_get - returns read allocations * @sk: socket * * Return: sk_rmem_alloc */ static inline int sk_rmem_alloc_get(const struct sock *sk) { return atomic_read(&sk->sk_rmem_alloc); } /** * sk_has_allocations - check if allocations are outstanding * @sk: socket * * Return: true if socket has write or read allocations */ static inline bool sk_has_allocations(const struct sock *sk) { return sk_wmem_alloc_get(sk) || sk_rmem_alloc_get(sk); } /** * skwq_has_sleeper - check if there are any waiting processes * @wq: struct socket_wq * * Return: true if socket_wq has waiting processes * * The purpose of the skwq_has_sleeper and sock_poll_wait is to wrap the memory * barrier call. They were added due to the race found within the tcp code. * * Consider following tcp code paths:: * * CPU1 CPU2 * sys_select receive packet * ... ... * __add_wait_queue update tp->rcv_nxt * ... ... * tp->rcv_nxt check sock_def_readable * ... { * schedule rcu_read_lock(); * wq = rcu_dereference(sk->sk_wq); * if (wq && waitqueue_active(&wq->wait)) * wake_up_interruptible(&wq->wait) * ... * } * * The race for tcp fires when the __add_wait_queue changes done by CPU1 stay * in its cache, and so does the tp->rcv_nxt update on CPU2 side. The CPU1 * could then endup calling schedule and sleep forever if there are no more * data on the socket. * */ static inline bool skwq_has_sleeper(struct socket_wq *wq) { return wq && wq_has_sleeper(&wq->wait); } /** * sock_poll_wait - wrapper for the poll_wait call. * @filp: file * @sock: socket to wait on * @p: poll_table * * See the comments in the wq_has_sleeper function. */ static inline void sock_poll_wait(struct file *filp, struct socket *sock, poll_table *p) { /* Provides a barrier we need to be sure we are in sync * with the socket flags modification. * * This memory barrier is paired in the wq_has_sleeper. */ poll_wait(filp, &sock->wq.wait, p); } static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk) { /* This pairs with WRITE_ONCE() in sk_set_txhash() */ u32 txhash = READ_ONCE(sk->sk_txhash); if (txhash) { skb->l4_hash = 1; skb->hash = txhash; } } void skb_set_owner_w(struct sk_buff *skb, struct sock *sk); /* * Queue a received datagram if it will fit. Stream and sequenced * protocols can't normally use this as they need to fit buffers in * and play with them. * * Inlined as it's very short and called for pretty much every * packet ever received. 
*/ static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); skb->sk = sk; skb->destructor = sock_rfree; atomic_add(skb->truesize, &sk->sk_rmem_alloc); sk_mem_charge(sk, skb->truesize); } static inline __must_check bool skb_set_owner_sk_safe(struct sk_buff *skb, struct sock *sk) { if (sk && refcount_inc_not_zero(&sk->sk_refcnt)) { skb_orphan(skb); skb->destructor = sock_efree; skb->sk = sk; return true; } return false; } static inline struct sk_buff *skb_clone_and_charge_r(struct sk_buff *skb, struct sock *sk) { skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC)); if (skb) { if (sk_rmem_schedule(sk, skb, skb->truesize)) { skb_set_owner_r(skb, sk); return skb; } __kfree_skb(skb); } return NULL; } static inline void skb_prepare_for_gro(struct sk_buff *skb) { if (skb->destructor != sock_wfree) { skb_orphan(skb); return; } skb->slow_gro = 1; } void sk_reset_timer(struct sock *sk, struct timer_list *timer, unsigned long expires); void sk_stop_timer(struct sock *sk, struct timer_list *timer); void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer); int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, struct sk_buff *skb, unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb)); int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); enum skb_drop_reason sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb); static inline int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason drop_reason = sock_queue_rcv_skb_reason(sk, skb); switch (drop_reason) { case SKB_DROP_REASON_SOCKET_RCVBUFF: return -ENOMEM; case SKB_DROP_REASON_PROTO_MEM: return -ENOBUFS; case 0: return 0; default: return -EPERM; } } int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb); struct sk_buff *sock_dequeue_err_skb(struct sock *sk); /* * Recover an error report and clear atomically */ static inline int sock_error(struct sock *sk) { int err; /* Avoid an atomic operation for the common case. * This is racy since another cpu/thread can change sk_err under us. */ if (likely(data_race(!sk->sk_err))) return 0; err = xchg(&sk->sk_err, 0); return -err; } void sk_error_report(struct sock *sk); static inline unsigned long sock_wspace(struct sock *sk) { int amt = 0; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { amt = sk->sk_sndbuf - refcount_read(&sk->sk_wmem_alloc); if (amt < 0) amt = 0; } return amt; } /* Note: * We use sk->sk_wq_raw, from contexts knowing this * pointer is not NULL and cannot disappear/change. */ static inline void sk_set_bit(int nr, struct sock *sk) { if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) && !sock_flag(sk, SOCK_FASYNC)) return; set_bit(nr, &sk->sk_wq_raw->flags); } static inline void sk_clear_bit(int nr, struct sock *sk) { if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) && !sock_flag(sk, SOCK_FASYNC)) return; clear_bit(nr, &sk->sk_wq_raw->flags); } static inline void sk_wake_async(const struct sock *sk, int how, int band) { if (sock_flag(sk, SOCK_FASYNC)) { rcu_read_lock(); sock_wake_async(rcu_dereference(sk->sk_wq), how, band); rcu_read_unlock(); } } static inline void sk_wake_async_rcu(const struct sock *sk, int how, int band) { if (unlikely(sock_flag(sk, SOCK_FASYNC))) sock_wake_async(rcu_dereference(sk->sk_wq), how, band); } /* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might * need sizeof(sk_buff) + MTU + padding, unless net driver perform copybreak. 
* Note: for send buffers, TCP works better if we can build two skbs at * minimum. */ #define TCP_SKB_MIN_TRUESIZE (2048 + SKB_DATA_ALIGN(sizeof(struct sk_buff))) #define SOCK_MIN_SNDBUF (TCP_SKB_MIN_TRUESIZE * 2) #define SOCK_MIN_RCVBUF TCP_SKB_MIN_TRUESIZE static inline void sk_stream_moderate_sndbuf(struct sock *sk) { u32 val; if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) return; val = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1); val = max_t(u32, val, sk_unused_reserved_mem(sk)); WRITE_ONCE(sk->sk_sndbuf, max_t(u32, val, SOCK_MIN_SNDBUF)); } /** * sk_page_frag - return an appropriate page_frag * @sk: socket * * Use the per task page_frag instead of the per socket one for * optimization when we know that we're in process context and own * everything that's associated with %current. * * Both direct reclaim and page faults can nest inside other * socket operations and end up recursing into sk_page_frag() * while it's already in use: explicitly avoid task page_frag * when users disable sk_use_task_frag. * * Return: a per task page_frag if context allows that, * otherwise a per socket one. */ static inline struct page_frag *sk_page_frag(struct sock *sk) { if (sk->sk_use_task_frag) return &current->task_frag; return &sk->sk_frag; } bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag); static inline bool __sock_writeable(const struct sock *sk, int wmem_alloc) { return wmem_alloc < (READ_ONCE(sk->sk_sndbuf) >> 1); } /* * Default write policy as shown to user space via poll/select/SIGIO */ static inline bool sock_writeable(const struct sock *sk) { return __sock_writeable(sk, refcount_read(&sk->sk_wmem_alloc)); } static inline gfp_t gfp_any(void) { return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; } static inline gfp_t gfp_memcg_charge(void) { return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; } #ifdef CONFIG_MEMCG static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk) { return sk->sk_memcg; } static inline bool mem_cgroup_sk_enabled(const struct sock *sk) { return mem_cgroup_sockets_enabled && mem_cgroup_from_sk(sk); } static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk) { struct mem_cgroup *memcg = mem_cgroup_from_sk(sk); #ifdef CONFIG_MEMCG_V1 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return !!memcg->tcpmem_pressure; #endif /* CONFIG_MEMCG_V1 */ do { if (time_before64(get_jiffies_64(), mem_cgroup_get_socket_pressure(memcg))) { memcg_memory_event(mem_cgroup_from_sk(sk), MEMCG_SOCK_THROTTLED); return true; } } while ((memcg = parent_mem_cgroup(memcg))); return false; } #else static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk) { return NULL; } static inline bool mem_cgroup_sk_enabled(const struct sock *sk) { return false; } static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk) { return false; } #endif static inline long sock_rcvtimeo(const struct sock *sk, bool noblock) { return noblock ? 0 : READ_ONCE(sk->sk_rcvtimeo); } static inline long sock_sndtimeo(const struct sock *sk, bool noblock) { return noblock ? 0 : READ_ONCE(sk->sk_sndtimeo); } static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len) { int v = waitall ? len : min_t(int, READ_ONCE(sk->sk_rcvlowat), len); return v ?: 1; } /* Alas, with timeout socket operations are not restartable. * Compare this to poll(). */ static inline int sock_intr_errno(long timeo) { return timeo == MAX_SCHEDULE_TIMEOUT ?
-ERESTARTSYS : -EINTR; } struct sock_skb_cb { u32 dropcount; }; /* Store sock_skb_cb at the end of skb->cb[] so protocol families * using skb->cb[] would keep using it directly and utilize its * alignment guarantee. */ #define SOCK_SKB_CB_OFFSET (sizeof_field(struct sk_buff, cb) - \ sizeof(struct sock_skb_cb)) #define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \ SOCK_SKB_CB_OFFSET)) #define sock_skb_cb_check_size(size) \ BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET) static inline void sk_drops_add(struct sock *sk, int segs) { struct numa_drop_counters *ndc = sk->sk_drop_counters; if (ndc) numa_drop_add(ndc, segs); else atomic_add(segs, &sk->sk_drops); } static inline void sk_drops_inc(struct sock *sk) { sk_drops_add(sk, 1); } static inline int sk_drops_read(const struct sock *sk) { const struct numa_drop_counters *ndc = sk->sk_drop_counters; if (ndc) { DEBUG_NET_WARN_ON_ONCE(atomic_read(&sk->sk_drops)); return numa_drop_read(ndc); } return atomic_read(&sk->sk_drops); } static inline void sk_drops_reset(struct sock *sk) { struct numa_drop_counters *ndc = sk->sk_drop_counters; if (ndc) numa_drop_reset(ndc); atomic_set(&sk->sk_drops, 0); } static inline void sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb) { SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? sk_drops_read(sk) : 0; } static inline void sk_drops_skbadd(struct sock *sk, const struct sk_buff *skb) { int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs); sk_drops_add(sk, segs); } static inline ktime_t sock_read_timestamp(struct sock *sk) { #if BITS_PER_LONG==32 unsigned int seq; ktime_t kt; do { seq = read_seqbegin(&sk->sk_stamp_seq); kt = sk->sk_stamp; } while (read_seqretry(&sk->sk_stamp_seq, seq)); return kt; #else return READ_ONCE(sk->sk_stamp); #endif } static inline void sock_write_timestamp(struct sock *sk, ktime_t kt) { #if BITS_PER_LONG==32 write_seqlock(&sk->sk_stamp_seq); sk->sk_stamp = kt; write_sequnlock(&sk->sk_stamp_seq); #else WRITE_ONCE(sk->sk_stamp, kt); #endif } void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); bool skb_has_tx_timestamp(struct sk_buff *skb, const struct sock *sk); int skb_get_tx_timestamp(struct sk_buff *skb, struct sock *sk, struct timespec64 *ts); static inline void sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb); u32 tsflags = READ_ONCE(sk->sk_tsflags); ktime_t kt = skb->tstamp; /* * generate control messages if * - receive time stamping in software requested * - software time stamp available and wanted * - hardware time stamps available and wanted */ if (sock_flag(sk, SOCK_RCVTSTAMP) || (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) || (kt && tsflags & SOF_TIMESTAMPING_SOFTWARE) || (hwtstamps->hwtstamp && (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE))) __sock_recv_timestamp(msg, sk, skb); else sock_write_timestamp(sk, kt); if (sock_flag(sk, SOCK_WIFI_STATUS) && skb_wifi_acked_valid(skb)) __sock_recv_wifi_status(msg, sk, skb); } void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); #define SK_DEFAULT_STAMP (-1L * NSEC_PER_SEC) static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { #define FLAGS_RECV_CMSGS ((1UL << SOCK_RXQ_OVFL) | \ (1UL << SOCK_RCVTSTAMP) | \ (1UL << SOCK_RCVMARK) | \ (1UL << SOCK_RCVPRIORITY) | \ (1UL << SOCK_TIMESTAMPING_ANY)) #define TSFLAGS_ANY 
(SOF_TIMESTAMPING_SOFTWARE | \ SOF_TIMESTAMPING_RAW_HARDWARE) if (READ_ONCE(sk->sk_flags) & FLAGS_RECV_CMSGS) __sock_recv_cmsgs(msg, sk, skb); else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP))) sock_write_timestamp(sk, skb->tstamp); else if (unlikely(sock_read_timestamp(sk) == SK_DEFAULT_STAMP)) sock_write_timestamp(sk, 0); } void __sock_tx_timestamp(__u32 tsflags, __u8 *tx_flags); /** * _sock_tx_timestamp - checks whether the outgoing packet is to be time stamped * @sk: socket sending this packet * @sockc: pointer to socket cmsg cookie to get timestamping info * @tx_flags: completed with instructions for time stamping * @tskey: filled in with next sk_tskey (not for TCP, which uses seqno) * * Note: callers should take care of initial ``*tx_flags`` value (usually 0) */ static inline void _sock_tx_timestamp(struct sock *sk, const struct sockcm_cookie *sockc, __u8 *tx_flags, __u32 *tskey) { __u32 tsflags = sockc->tsflags; if (unlikely(tsflags)) { __sock_tx_timestamp(tsflags, tx_flags); if (tsflags & SOF_TIMESTAMPING_OPT_ID && tskey && tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) { if (tsflags & SOCKCM_FLAG_TS_OPT_ID) *tskey = sockc->ts_opt_id; else *tskey = atomic_inc_return(&sk->sk_tskey) - 1; } } } static inline void sock_tx_timestamp(struct sock *sk, const struct sockcm_cookie *sockc, __u8 *tx_flags) { _sock_tx_timestamp(sk, sockc, tx_flags, NULL); } static inline void skb_setup_tx_timestamp(struct sk_buff *skb, const struct sockcm_cookie *sockc) { _sock_tx_timestamp(skb->sk, sockc, &skb_shinfo(skb)->tx_flags, &skb_shinfo(skb)->tskey); } static inline bool sk_is_inet(const struct sock *sk) { int family = READ_ONCE(sk->sk_family); return family == AF_INET || family == AF_INET6; } static inline bool sk_is_tcp(const struct sock *sk) { return sk_is_inet(sk) && sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP; } static inline bool sk_is_udp(const struct sock *sk) { return sk_is_inet(sk) && sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP; } static inline bool sk_is_unix(const struct sock *sk) { return sk->sk_family == AF_UNIX; } static inline bool sk_is_stream_unix(const struct sock *sk) { return sk_is_unix(sk) && sk->sk_type == SOCK_STREAM; } static inline bool sk_is_vsock(const struct sock *sk) { return sk->sk_family == AF_VSOCK; } static inline bool sk_may_scm_recv(const struct sock *sk) { return (IS_ENABLED(CONFIG_UNIX) && sk->sk_family == AF_UNIX) || sk->sk_family == AF_NETLINK || (IS_ENABLED(CONFIG_BT) && sk->sk_family == AF_BLUETOOTH); } /** * sk_eat_skb - Release a skb if it is no longer needed * @sk: socket to eat this skb from * @skb: socket buffer to eat * * This routine must be called with interrupts disabled or with the socket * locked so that the sk_buff queue operation is ok. */ static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) { __skb_unlink(skb, &sk->sk_receive_queue); __kfree_skb(skb); } static inline bool skb_sk_is_prefetched(struct sk_buff *skb) { #ifdef CONFIG_INET return skb->destructor == sock_pfree; #else return false; #endif /* CONFIG_INET */ } /* This helper checks if a socket is a full socket, * ie _not_ a timewait or request socket. */ static inline bool sk_fullsock(const struct sock *sk) { return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV); } static inline bool sk_is_refcounted(struct sock *sk) { /* Only full sockets have sk->sk_flags. 
*/ return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE); } static inline bool sk_requests_wifi_status(struct sock *sk) { return sk && sk_fullsock(sk) && sock_flag(sk, SOCK_WIFI_STATUS); } /* This helper checks if a socket is a LISTEN or NEW_SYN_RECV * SYNACK messages can be attached to either ones (depending on SYNCOOKIE) */ static inline bool sk_listener(const struct sock *sk) { return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV); } /* This helper checks if a socket is a LISTEN or NEW_SYN_RECV or TIME_WAIT * TCP SYNACK messages can be attached to LISTEN or NEW_SYN_RECV (depending on SYNCOOKIE) * TCP RST and ACK can be attached to TIME_WAIT. */ static inline bool sk_listener_or_tw(const struct sock *sk) { return (1 << READ_ONCE(sk->sk_state)) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV | TCPF_TIME_WAIT); } void sock_enable_timestamp(struct sock *sk, enum sock_flags flag); int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type); bool sk_ns_capable(const struct sock *sk, struct user_namespace *user_ns, int cap); bool sk_capable(const struct sock *sk, int cap); bool sk_net_capable(const struct sock *sk, int cap); void sk_get_meminfo(const struct sock *sk, u32 *meminfo); /* Take into consideration the size of the struct sk_buff overhead in the * determination of these values, since that is non-constant across * platforms. This makes socket queueing behavior and performance * not depend upon such differences. */ #define _SK_MEM_PACKETS 256 #define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) #define SK_WMEM_DEFAULT (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) #define SK_RMEM_DEFAULT (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; #define SKB_FRAG_PAGE_ORDER get_order(32768) DECLARE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto) { /* Does this proto have per netns sysctl_wmem ? */ if (proto->sysctl_wmem_offset) return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset)); return READ_ONCE(*proto->sysctl_wmem); } static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto) { /* Does this proto have per netns sysctl_rmem ? */ if (proto->sysctl_rmem_offset) return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset)); return READ_ONCE(*proto->sysctl_rmem); } /* Default TCP Small queue budget is ~1 ms of data (1sec >> 10) * Some wifi drivers need to tweak it to get more chunks. 
* They can use this helper from their ndo_start_xmit() */ static inline void sk_pacing_shift_update(struct sock *sk, int val) { if (!sk || !sk_fullsock(sk) || READ_ONCE(sk->sk_pacing_shift) == val) return; WRITE_ONCE(sk->sk_pacing_shift, val); } /* if a socket is bound to a device, check that the given device * index is either the same or that the socket is bound to an L3 * master device and the given device index is also enslaved to * that L3 master */ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) { int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); int mdif; if (!bound_dev_if || bound_dev_if == dif) return true; mdif = l3mdev_master_ifindex_by_index(sock_net(sk), dif); if (mdif && mdif == bound_dev_if) return true; return false; } void sock_def_readable(struct sock *sk); int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk); void sock_set_timestamp(struct sock *sk, int optname, bool valbool); int sock_set_timestamping(struct sock *sk, int optname, struct so_timestamping timestamping); #if defined(CONFIG_CGROUP_BPF) void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op); #else static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op) { } #endif void sock_no_linger(struct sock *sk); void sock_set_keepalive(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); void sock_set_rcvbuf(struct sock *sk, int val); void sock_set_mark(struct sock *sk, u32 val); void sock_set_reuseaddr(struct sock *sk); void sock_set_reuseport(struct sock *sk); void sock_set_sndtimeo(struct sock *sk, s64 secs); int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len); int sock_get_timeout(long timeo, void *optval, bool old_timeval); int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, sockptr_t optval, int optlen, bool old_timeval); int sock_ioctl_inout(struct sock *sk, unsigned int cmd, void __user *arg, void *karg, size_t size); int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); static inline bool sk_is_readable(struct sock *sk) { const struct proto *prot = READ_ONCE(sk->sk_prot); if (prot->sock_is_readable) return prot->sock_is_readable(sk); return false; } #endif /* _SOCK_H */
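/*
 * Illustrative sketch, not part of the original header: how in-kernel code
 * that owns a socket might combine the sock_set_*() helpers declared above.
 * example_tune_sock() is a hypothetical caller, shown only to make the call
 * pattern concrete; the option values are arbitrary.
 */
static inline void example_tune_sock(struct sock *sk)
{
        sock_set_reuseaddr(sk);         /* in-kernel equivalent of SO_REUSEADDR */
        sock_set_priority(sk, 6);       /* in-kernel equivalent of SO_PRIORITY */
        sock_set_rcvbuf(sk, 1 << 20);   /* set the receive buffer size from inside the kernel */
        sock_no_linger(sk);             /* SO_LINGER with a zero timeout */
}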
2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/buffer.c
 *
 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
 */

/*
 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
 *
 * Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 *
 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads.  SMP threading.  -DaveM
 *
 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
 *
 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
 */

#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/syscalls.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/capability.h>
#include <linux/blkdev.h>
#include <linux/blk-crypto.h>
#include <linux/file.h>
#include <linux/quotaops.h>
#include <linux/highmem.h>
#include <linux/export.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/hash.h>
#include <linux/suspend.h>
#include <linux/buffer_head.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/bio.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>
#include <linux/folio_batch.h>
#include <linux/sched/mm.h>
#include <trace/events/block.h>
#include <linux/fscrypt.h>
#include <linux/fsverity.h>
#include <linux/sched/isolation.h>

#include "internal.h"

static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
			  enum rw_hint hint, struct writeback_control *wbc);

#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)

inline void touch_buffer(struct buffer_head *bh)
{
	trace_block_touch_buffer(bh);
	folio_mark_accessed(bh->b_folio);
}
EXPORT_SYMBOL(touch_buffer);

void __lock_buffer(struct buffer_head *bh)
{
	wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_buffer);

void unlock_buffer(struct buffer_head *bh)
{
	clear_bit_unlock(BH_Lock, &bh->b_state);
	smp_mb__after_atomic();
	wake_up_bit(&bh->b_state, BH_Lock);
}
EXPORT_SYMBOL(unlock_buffer);

/*
 * Returns if the folio has dirty or writeback buffers. If all the buffers
 * are unlocked and clean then the folio_test_dirty information is stale. If
 * any of the buffers are locked, it is assumed they are locked for IO.
 */
void buffer_check_dirty_writeback(struct folio *folio,
				  bool *dirty, bool *writeback)
{
	struct buffer_head *head, *bh;
	*dirty = false;
	*writeback = false;

	BUG_ON(!folio_test_locked(folio));

	head = folio_buffers(folio);
	if (!head)
		return;

	if (folio_test_writeback(folio))
		*writeback = true;

	bh = head;
	do {
		if (buffer_locked(bh))
			*writeback = true;

		if (buffer_dirty(bh))
			*dirty = true;

		bh = bh->b_this_page;
	} while (bh != head);
}

/*
 * Block until a buffer comes unlocked.  This doesn't stop it
 * from becoming locked again - you have to lock it yourself
 * if you want to preserve its state.
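 *
 * A caller that cares about the result of the I/O therefore re-checks the
 * buffer state after waiting; the usual pattern (as used by mmb_sync()
 * below) is:
 *
 *	wait_on_buffer(bh);
 *	if (!buffer_uptodate(bh))
 *		err = -EIO;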
*/ void __wait_on_buffer(struct buffer_head * bh) { wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__wait_on_buffer); static void buffer_io_error(struct buffer_head *bh, char *msg) { if (!test_bit(BH_Quiet, &bh->b_state)) printk_ratelimited(KERN_ERR "Buffer I/O error on dev %pg, logical block %llu%s\n", bh->b_bdev, (unsigned long long)bh->b_blocknr, msg); } /* * End-of-IO handler helper function which does not touch the bh after * unlocking it. * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but * a race there is benign: unlock_buffer() only use the bh's address for * hashing after unlocking the buffer, so it doesn't actually touch the bh * itself. */ static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) { if (uptodate) { set_buffer_uptodate(bh); } else { /* This happens, due to failed read-ahead attempts. */ clear_buffer_uptodate(bh); } unlock_buffer(bh); } /* * Default synchronous end-of-IO handler.. Just mark it up-to-date and * unlock the buffer. */ void end_buffer_read_sync(struct buffer_head *bh, int uptodate) { put_bh(bh); __end_buffer_read_notouch(bh, uptodate); } EXPORT_SYMBOL(end_buffer_read_sync); void end_buffer_write_sync(struct buffer_head *bh, int uptodate) { if (uptodate) { set_buffer_uptodate(bh); } else { buffer_io_error(bh, ", lost sync page write"); mark_buffer_write_io_error(bh); clear_buffer_uptodate(bh); } unlock_buffer(bh); put_bh(bh); } EXPORT_SYMBOL(end_buffer_write_sync); static struct buffer_head * __find_get_block_slow(struct block_device *bdev, sector_t block, bool atomic) { struct address_space *bd_mapping = bdev->bd_mapping; const int blkbits = bd_mapping->host->i_blkbits; struct buffer_head *ret = NULL; pgoff_t index; struct buffer_head *bh; struct buffer_head *head; struct folio *folio; int all_mapped = 1; static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1); index = ((loff_t)block << blkbits) / PAGE_SIZE; folio = __filemap_get_folio(bd_mapping, index, FGP_ACCESSED, 0); if (IS_ERR(folio)) goto out; /* * Folio lock protects the buffers. Callers that cannot block * will fallback to serializing vs try_to_free_buffers() via * the i_private_lock. */ if (atomic) spin_lock(&bd_mapping->i_private_lock); else folio_lock(folio); head = folio_buffers(folio); if (!head) goto out_unlock; /* * Upon a noref migration, the folio lock serializes here; * otherwise bail. */ if (test_bit_acquire(BH_Migrate, &head->b_state)) { WARN_ON(!atomic); goto out_unlock; } bh = head; do { if (!buffer_mapped(bh)) all_mapped = 0; else if (bh->b_blocknr == block) { ret = bh; get_bh(bh); goto out_unlock; } bh = bh->b_this_page; } while (bh != head); /* we might be here because some of the buffers on this page are * not mapped. This is due to various races between * file io on the block device and getblk. It gets dealt with * elsewhere, don't buffer_error if we had some unmapped buffers */ ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE); if (all_mapped && __ratelimit(&last_warned)) { printk("__find_get_block_slow() failed. 
block=%llu, " "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, " "device %pg blocksize: %d\n", (unsigned long long)block, (unsigned long long)bh->b_blocknr, bh->b_state, bh->b_size, bdev, 1 << blkbits); } out_unlock: if (atomic) spin_unlock(&bd_mapping->i_private_lock); else folio_unlock(folio); folio_put(folio); out: return ret; } static void end_buffer_async_read(struct buffer_head *bh, int uptodate) { unsigned long flags; struct buffer_head *first; struct buffer_head *tmp; struct folio *folio; int folio_uptodate = 1; BUG_ON(!buffer_async_read(bh)); folio = bh->b_folio; if (uptodate) { set_buffer_uptodate(bh); } else { clear_buffer_uptodate(bh); buffer_io_error(bh, ", async page read"); } /* * Be _very_ careful from here on. Bad things can happen if * two buffer heads end IO at almost the same time and both * decide that the page is now completely done. */ first = folio_buffers(folio); spin_lock_irqsave(&first->b_uptodate_lock, flags); clear_buffer_async_read(bh); unlock_buffer(bh); tmp = bh; do { if (!buffer_uptodate(tmp)) folio_uptodate = 0; if (buffer_async_read(tmp)) { BUG_ON(!buffer_locked(tmp)); goto still_busy; } tmp = tmp->b_this_page; } while (tmp != bh); spin_unlock_irqrestore(&first->b_uptodate_lock, flags); folio_end_read(folio, folio_uptodate); return; still_busy: spin_unlock_irqrestore(&first->b_uptodate_lock, flags); } struct postprocess_bh_ctx { struct work_struct work; struct buffer_head *bh; struct fsverity_info *vi; }; static void verify_bh(struct work_struct *work) { struct postprocess_bh_ctx *ctx = container_of(work, struct postprocess_bh_ctx, work); struct buffer_head *bh = ctx->bh; bool valid; valid = fsverity_verify_blocks(ctx->vi, bh->b_folio, bh->b_size, bh_offset(bh)); end_buffer_async_read(bh, valid); kfree(ctx); } static void decrypt_bh(struct work_struct *work) { struct postprocess_bh_ctx *ctx = container_of(work, struct postprocess_bh_ctx, work); struct buffer_head *bh = ctx->bh; int err; err = fscrypt_decrypt_pagecache_blocks(bh->b_folio, bh->b_size, bh_offset(bh)); if (err == 0 && ctx->vi) { /* * We use different work queues for decryption and for verity * because verity may require reading metadata pages that need * decryption, and we shouldn't recurse to the same workqueue. */ INIT_WORK(&ctx->work, verify_bh); fsverity_enqueue_verify_work(&ctx->work); return; } end_buffer_async_read(bh, err == 0); kfree(ctx); } /* * I/O completion handler for block_read_full_folio() - pages * which come unlocked at the end of I/O. */ static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate) { struct inode *inode = bh->b_folio->mapping->host; bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode); struct fsverity_info *vi = NULL; /* needed by ext4 */ if (bh->b_folio->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE)) vi = fsverity_get_info(inode); /* Decrypt (with fscrypt) and/or verify (with fsverity) if needed. */ if (uptodate && (decrypt || vi)) { struct postprocess_bh_ctx *ctx = kmalloc_obj(*ctx, GFP_ATOMIC); if (ctx) { ctx->bh = bh; ctx->vi = vi; if (decrypt) { INIT_WORK(&ctx->work, decrypt_bh); fscrypt_enqueue_decrypt_work(&ctx->work); } else { INIT_WORK(&ctx->work, verify_bh); fsverity_enqueue_verify_work(&ctx->work); } return; } uptodate = 0; } end_buffer_async_read(bh, uptodate); } /* * Completion handler for block_write_full_folio() - folios which are unlocked * during I/O, and which have the writeback flag cleared upon I/O completion. 
*/ static void end_buffer_async_write(struct buffer_head *bh, int uptodate) { unsigned long flags; struct buffer_head *first; struct buffer_head *tmp; struct folio *folio; BUG_ON(!buffer_async_write(bh)); folio = bh->b_folio; if (uptodate) { set_buffer_uptodate(bh); } else { buffer_io_error(bh, ", lost async page write"); mark_buffer_write_io_error(bh); clear_buffer_uptodate(bh); } first = folio_buffers(folio); spin_lock_irqsave(&first->b_uptodate_lock, flags); clear_buffer_async_write(bh); unlock_buffer(bh); tmp = bh->b_this_page; while (tmp != bh) { if (buffer_async_write(tmp)) { BUG_ON(!buffer_locked(tmp)); goto still_busy; } tmp = tmp->b_this_page; } spin_unlock_irqrestore(&first->b_uptodate_lock, flags); folio_end_writeback(folio); return; still_busy: spin_unlock_irqrestore(&first->b_uptodate_lock, flags); } /* * If a page's buffers are under async readin (end_buffer_async_read * completion) then there is a possibility that another thread of * control could lock one of the buffers after it has completed * but while some of the other buffers have not completed. This * locked buffer would confuse end_buffer_async_read() into not unlocking * the page. So the absence of BH_Async_Read tells end_buffer_async_read() * that this buffer is not under async I/O. * * The page comes unlocked when it has no locked buffer_async buffers * left. * * PageLocked prevents anyone starting new async I/O reads any of * the buffers. * * PageWriteback is used to prevent simultaneous writeout of the same * page. * * PageLocked prevents anyone from starting writeback of a page which is * under read I/O (PageWriteback is only ever set against a locked page). */ static void mark_buffer_async_read(struct buffer_head *bh) { bh->b_end_io = end_buffer_async_read_io; set_buffer_async_read(bh); } static void mark_buffer_async_write_endio(struct buffer_head *bh, bh_end_io_t *handler) { bh->b_end_io = handler; set_buffer_async_write(bh); } void mark_buffer_async_write(struct buffer_head *bh) { mark_buffer_async_write_endio(bh, end_buffer_async_write); } EXPORT_SYMBOL(mark_buffer_async_write); /* * fs/buffer.c contains helper functions for buffer-backed address space's * fsync functions. A common requirement for buffer-based filesystems is * that certain data from the backing blockdev needs to be written out for * a successful fsync(). For example, ext2 indirect blocks need to be * written back and waited upon before fsync() returns. * * The functions mmb_mark_buffer_dirty(), mmb_sync(), mmb_has_buffers() * and mmb_invalidate() are provided for the management of a list of dependent * buffers in mapping_metadata_bhs struct. * * The locking is a little subtle: The list of buffer heads is protected by * the lock in mapping_metadata_bhs so functions coming from bdev mapping * (such as try_to_free_buffers()) need to safely get to mapping_metadata_bhs * using RCU, grab the lock, verify we didn't race with somebody detaching the * bh / moving it to different inode and only then proceeding. */ void mmb_init(struct mapping_metadata_bhs *mmb, struct address_space *mapping) { spin_lock_init(&mmb->lock); INIT_LIST_HEAD(&mmb->list); mmb->mapping = mapping; } EXPORT_SYMBOL(mmb_init); static void __remove_assoc_queue(struct mapping_metadata_bhs *mmb, struct buffer_head *bh) { lockdep_assert_held(&mmb->lock); list_del_init(&bh->b_assoc_buffers); WARN_ON(!bh->b_mmb); bh->b_mmb = NULL; } static void remove_assoc_queue(struct buffer_head *bh) { struct mapping_metadata_bhs *mmb; /* * The locking dance is ugly here. 
We need to acquire the lock * protecting the metadata bh list while possibly racing with bh * being removed from the list or moved to a different one. We * use RCU to pin mapping_metadata_bhs in memory to * opportunistically acquire the lock and then recheck the bh * didn't move under us. */ while (bh->b_mmb) { rcu_read_lock(); mmb = READ_ONCE(bh->b_mmb); if (mmb) { spin_lock(&mmb->lock); if (bh->b_mmb == mmb) __remove_assoc_queue(mmb, bh); spin_unlock(&mmb->lock); } rcu_read_unlock(); } } bool mmb_has_buffers(struct mapping_metadata_bhs *mmb) { return !list_empty(&mmb->list); } EXPORT_SYMBOL_GPL(mmb_has_buffers); /** * mmb_sync - write out & wait upon all buffers in a list * @mmb: the list of buffers to write * * Starts I/O against the buffers in the given list and waits upon * that I/O. Basically, this is a convenience function for fsync(). @mmb is * for a file or directory which needs those buffers to be written for a * successful fsync(). * * We have conflicting pressures: we want to make sure that all * initially dirty buffers get waited on, but that any subsequently * dirtied buffers don't. After all, we don't want fsync to last * forever if somebody is actively writing to the file. * * Do this in two main stages: first we copy dirty buffers to a * temporary inode list, queueing the writes as we go. Then we clean * up, waiting for those writes to complete. mark_buffer_dirty_inode() * doesn't touch b_assoc_buffers list if b_mmb is not NULL so we are sure the * buffer stays on our list until IO completes (at which point it can be * reaped). */ int mmb_sync(struct mapping_metadata_bhs *mmb) { struct buffer_head *bh; int err = 0; struct blk_plug plug; LIST_HEAD(tmp); if (!mmb_has_buffers(mmb)) return 0; blk_start_plug(&plug); spin_lock(&mmb->lock); while (!list_empty(&mmb->list)) { bh = BH_ENTRY(mmb->list.next); WARN_ON_ONCE(bh->b_mmb != mmb); __remove_assoc_queue(mmb, bh); /* Avoid race with mark_buffer_dirty_inode() which does * a lockless check and we rely on seeing the dirty bit */ smp_mb(); if (buffer_dirty(bh) || buffer_locked(bh)) { list_add(&bh->b_assoc_buffers, &tmp); bh->b_mmb = mmb; if (buffer_dirty(bh)) { get_bh(bh); spin_unlock(&mmb->lock); /* * Ensure any pending I/O completes so that * write_dirty_buffer() actually writes the * current contents - it is a noop if I/O is * still in flight on potentially older * contents. */ write_dirty_buffer(bh, REQ_SYNC); /* * Kick off IO for the previous mapping. Note * that we will not run the very last mapping, * wait_on_buffer() will do that for us * through sync_buffer(). 
*/ brelse(bh); spin_lock(&mmb->lock); } } } spin_unlock(&mmb->lock); blk_finish_plug(&plug); spin_lock(&mmb->lock); while (!list_empty(&tmp)) { bh = BH_ENTRY(tmp.prev); get_bh(bh); __remove_assoc_queue(mmb, bh); /* Avoid race with mark_buffer_dirty_inode() which does * a lockless check and we rely on seeing the dirty bit */ smp_mb(); if (buffer_dirty(bh)) { list_add(&bh->b_assoc_buffers, &mmb->list); bh->b_mmb = mmb; } spin_unlock(&mmb->lock); wait_on_buffer(bh); if (!buffer_uptodate(bh)) err = -EIO; brelse(bh); spin_lock(&mmb->lock); } spin_unlock(&mmb->lock); return err; } EXPORT_SYMBOL(mmb_sync); /** * mmb_fsync_noflush - fsync implementation for simple filesystems with * metadata buffers list * * @file: file to synchronize * @mmb: list of metadata bhs to flush * @start: start offset in bytes * @end: end offset in bytes (inclusive) * @datasync: only synchronize essential metadata if true * * This is an implementation of the fsync method for simple filesystems which * track all non-inode metadata in the buffers list hanging off the @mmb * structure. */ int mmb_fsync_noflush(struct file *file, struct mapping_metadata_bhs *mmb, loff_t start, loff_t end, bool datasync) { struct inode *inode = file->f_mapping->host; int err; int ret = 0; err = file_write_and_wait_range(file, start, end); if (err) return err; if (mmb) ret = mmb_sync(mmb); if (!(inode_state_read_once(inode) & I_DIRTY_ALL)) goto out; if (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC)) goto out; err = sync_inode_metadata(inode, 1); if (ret == 0) ret = err; out: /* check and advance again to catch errors after syncing out buffers */ err = file_check_and_advance_wb_err(file); if (ret == 0) ret = err; return ret; } EXPORT_SYMBOL(mmb_fsync_noflush); /** * mmb_fsync - fsync implementation for simple filesystems with metadata * buffers list * * @file: file to synchronize * @mmb: list of metadata bhs to flush * @start: start offset in bytes * @end: end offset in bytes (inclusive) * @datasync: only synchronize essential metadata if true * * This is an implementation of the fsync method for simple filesystems which * track all non-inode metadata in the buffers list hanging off the @mmb * structure. This also makes sure that a device cache flush operation is * called at the end. */ int mmb_fsync(struct file *file, struct mapping_metadata_bhs *mmb, loff_t start, loff_t end, bool datasync) { struct inode *inode = file->f_mapping->host; int ret; ret = mmb_fsync_noflush(file, mmb, start, end, datasync); if (!ret) ret = blkdev_issue_flush(inode->i_sb->s_bdev); return ret; } EXPORT_SYMBOL(mmb_fsync); /* * Called when we've recently written block `bblock', and it is known that * `bblock' was for a buffer_boundary() buffer. This means that the block at * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's * dirty, schedule it for IO. So that indirects merge nicely with their data. */ void write_boundary_block(struct block_device *bdev, sector_t bblock, unsigned blocksize) { struct buffer_head *bh; bh = __find_get_block_nonatomic(bdev, bblock + 1, blocksize); if (bh) { if (buffer_dirty(bh)) write_dirty_buffer(bh, 0); put_bh(bh); } } void mmb_mark_buffer_dirty(struct buffer_head *bh, struct mapping_metadata_bhs *mmb) { mark_buffer_dirty(bh); if (!bh->b_mmb) { spin_lock(&mmb->lock); list_move_tail(&bh->b_assoc_buffers, &mmb->list); bh->b_mmb = mmb; spin_unlock(&mmb->lock); } } EXPORT_SYMBOL(mmb_mark_buffer_dirty); /** * block_dirty_folio - Mark a folio as dirty. 
* @mapping: The address space containing this folio. * @folio: The folio to mark dirty. * * Filesystems which use buffer_heads can use this function as their * ->dirty_folio implementation. Some filesystems need to do a little * work before calling this function. Filesystems which do not use * buffer_heads should call filemap_dirty_folio() instead. * * If the folio has buffers, the uptodate buffers are set dirty, to * preserve dirty-state coherency between the folio and the buffers. * Buffers added to a dirty folio are created dirty. * * The buffers are dirtied before the folio is dirtied. There's a small * race window in which writeback may see the folio cleanness but not the * buffer dirtiness. That's fine. If this code were to set the folio * dirty before the buffers, writeback could clear the folio dirty flag, * see a bunch of clean buffers and we'd end up with dirty buffers/clean * folio on the dirty folio list. * * We use i_private_lock to lock against try_to_free_buffers() while * using the folio's buffer list. This also prevents clean buffers * being added to the folio after it was set dirty. * * Context: May only be called from process context. Does not sleep. * Caller must ensure that @folio cannot be truncated during this call, * typically by holding the folio lock or having a page in the folio * mapped and holding the page table lock. * * Return: True if the folio was dirtied; false if it was already dirtied. */ bool block_dirty_folio(struct address_space *mapping, struct folio *folio) { struct buffer_head *head; bool newly_dirty; spin_lock(&mapping->i_private_lock); head = folio_buffers(folio); if (head) { struct buffer_head *bh = head; do { set_buffer_dirty(bh); bh = bh->b_this_page; } while (bh != head); } /* * Lock out page's memcg migration to keep PageDirty * synchronized with per-memcg dirty page counters. */ newly_dirty = !folio_test_set_dirty(folio); spin_unlock(&mapping->i_private_lock); if (newly_dirty) __folio_mark_dirty(folio, mapping, 1); if (newly_dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); return newly_dirty; } EXPORT_SYMBOL(block_dirty_folio); /* * Invalidate any and all dirty buffers on a given buffers list. We are * probably unmounting the fs, but that doesn't mean we have already * done a sync(). Just drop the buffers from the inode list. */ void mmb_invalidate(struct mapping_metadata_bhs *mmb) { if (mmb_has_buffers(mmb)) { spin_lock(&mmb->lock); while (!list_empty(&mmb->list)) __remove_assoc_queue(mmb, BH_ENTRY(mmb->list.next)); spin_unlock(&mmb->lock); } } EXPORT_SYMBOL(mmb_invalidate); /* * Create the appropriate buffers when given a folio for data area and * the size of each buffer.. Use the bh->b_this_page linked list to * follow the buffers created. Return NULL if unable to create more * buffers. * * The retry flag is used to differentiate async IO (paging, swapping) * which may not fail from ordinary buffer allocations. 
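 *
 * Note that the chain returned here is still NULL-terminated; callers such
 * as create_empty_buffers() and grow_dev_folio() (via link_dev_buffers())
 * close it into the usual b_this_page ring and attach it to the folio with
 * folio_attach_private().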
*/ struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, gfp_t gfp) { struct buffer_head *bh, *head; long offset; struct mem_cgroup *memcg, *old_memcg; memcg = get_mem_cgroup_from_folio(folio); old_memcg = set_active_memcg(memcg); head = NULL; offset = folio_size(folio); while ((offset -= size) >= 0) { bh = alloc_buffer_head(gfp); if (!bh) goto no_grow; bh->b_this_page = head; bh->b_blocknr = -1; head = bh; bh->b_size = size; /* Link the buffer to its folio */ folio_set_bh(bh, folio, offset); } out: set_active_memcg(old_memcg); mem_cgroup_put(memcg); return head; /* * In case anything failed, we just free everything we got. */ no_grow: if (head) { do { bh = head; head = head->b_this_page; free_buffer_head(bh); } while (head); } goto out; } EXPORT_SYMBOL_GPL(folio_alloc_buffers); struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size) { gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT; return folio_alloc_buffers(page_folio(page), size, gfp); } EXPORT_SYMBOL_GPL(alloc_page_buffers); static inline void link_dev_buffers(struct folio *folio, struct buffer_head *head) { struct buffer_head *bh, *tail; bh = head; do { tail = bh; bh = bh->b_this_page; } while (bh); tail->b_this_page = head; folio_attach_private(folio, head); } static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size) { sector_t retval = ~((sector_t)0); loff_t sz = bdev_nr_bytes(bdev); if (sz) { unsigned int sizebits = blksize_bits(size); retval = (sz >> sizebits); } return retval; } /* * Initialise the state of a blockdev folio's buffers. */ static sector_t folio_init_buffers(struct folio *folio, struct block_device *bdev, unsigned size) { struct buffer_head *head = folio_buffers(folio); struct buffer_head *bh = head; bool uptodate = folio_test_uptodate(folio); sector_t block = div_u64(folio_pos(folio), size); sector_t end_block = blkdev_max_block(bdev, size); do { if (!buffer_mapped(bh)) { bh->b_end_io = NULL; bh->b_private = NULL; bh->b_bdev = bdev; bh->b_blocknr = block; if (uptodate) set_buffer_uptodate(bh); if (block < end_block) set_buffer_mapped(bh); } block++; bh = bh->b_this_page; } while (bh != head); /* * Caller needs to validate requested block against end of device. */ return end_block; } /* * Create the page-cache folio that contains the requested block. * * This is used purely for blockdev mappings. * * Returns false if we have a failure which cannot be cured by retrying * without sleeping. Returns true if we succeeded, or the caller should retry. */ static bool grow_dev_folio(struct block_device *bdev, sector_t block, pgoff_t index, unsigned size, gfp_t gfp) { struct address_space *mapping = bdev->bd_mapping; struct folio *folio; struct buffer_head *bh; sector_t end_block = 0; folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (IS_ERR(folio)) return false; bh = folio_buffers(folio); if (bh) { if (bh->b_size == size) { end_block = folio_init_buffers(folio, bdev, size); goto unlock; } /* * Retrying may succeed; for example the folio may finish * writeback, or buffers may be cleaned. This should not * happen very often; maybe we have old buffers attached to * this blockdev's page cache and we're trying to change * the block size? */ if (!try_to_free_buffers(folio)) { end_block = ~0ULL; goto unlock; } } bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT); if (!bh) goto unlock; /* * Link the folio to the buffers and initialise them. 
Take the * lock to be atomic wrt __find_get_block(), which does not * run under the folio lock. */ spin_lock(&mapping->i_private_lock); link_dev_buffers(folio, bh); end_block = folio_init_buffers(folio, bdev, size); spin_unlock(&mapping->i_private_lock); unlock: folio_unlock(folio); folio_put(folio); return block < end_block; } /* * Create buffers for the specified block device block's folio. If * that folio was dirty, the buffers are set dirty also. Returns false * if we've hit a permanent error. */ static bool grow_buffers(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp) { loff_t pos; /* * Check for a block which lies outside our maximum possible * pagecache index. */ if (check_mul_overflow(block, (sector_t)size, &pos) || pos > MAX_LFS_FILESIZE) { printk(KERN_ERR "%s: requested out-of-range block %llu for device %pg\n", __func__, (unsigned long long)block, bdev); return false; } /* Create a folio with the proper size buffers */ return grow_dev_folio(bdev, block, pos / PAGE_SIZE, size, gfp); } static struct buffer_head * __getblk_slow(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp) { bool blocking = gfpflags_allow_blocking(gfp); if (WARN_ON_ONCE(!IS_ALIGNED(size, bdev_logical_block_size(bdev)))) { printk(KERN_ERR "getblk(): block size %d not aligned to logical block size %d\n", size, bdev_logical_block_size(bdev)); return NULL; } for (;;) { struct buffer_head *bh; if (!grow_buffers(bdev, block, size, gfp)) return NULL; if (blocking) bh = __find_get_block_nonatomic(bdev, block, size); else bh = __find_get_block(bdev, block, size); if (bh) return bh; } } /* * The relationship between dirty buffers and dirty pages: * * Whenever a page has any dirty buffers, the page's dirty bit is set, and * the page is tagged dirty in the page cache. * * At all times, the dirtiness of the buffers represents the dirtiness of * subsections of the page. If the page has buffers, the page dirty bit is * merely a hint about the true dirty state. * * When a page is set dirty in its entirety, all its buffers are marked dirty * (if the page has buffers). * * When a buffer is marked dirty, its page is dirtied, but the page's other * buffers are not. * * Also. When blockdev buffers are explicitly read with bread(), they * individually become uptodate. But their backing page remains not * uptodate - even if all of its buffers are uptodate. A subsequent * block_read_full_folio() against that folio will discover all the uptodate * buffers, will set the folio uptodate and will perform no I/O. */ /** * mark_buffer_dirty - mark a buffer_head as needing writeout * @bh: the buffer_head to mark dirty * * mark_buffer_dirty() will set the dirty bit against the buffer, then set * its backing page dirty, then tag the page as dirty in the page cache * and then attach the address_space's inode to its superblock's dirty * inode list. * * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->i_private_lock, * i_pages lock and mapping->host->i_lock. */ void mark_buffer_dirty(struct buffer_head *bh) { WARN_ON_ONCE(!buffer_uptodate(bh)); trace_block_dirty_buffer(bh); /* * Very *carefully* optimize the it-is-already-dirty case. * * Don't let the final "is it dirty" escape to before we * perhaps modified the buffer. 
*/ if (buffer_dirty(bh)) { smp_mb(); if (buffer_dirty(bh)) return; } if (!test_set_buffer_dirty(bh)) { struct folio *folio = bh->b_folio; struct address_space *mapping = NULL; if (!folio_test_set_dirty(folio)) { mapping = folio->mapping; if (mapping) __folio_mark_dirty(folio, mapping, 0); } if (mapping) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } } EXPORT_SYMBOL(mark_buffer_dirty); void mark_buffer_write_io_error(struct buffer_head *bh) { set_buffer_write_io_error(bh); /* FIXME: do we need to set this in both places? */ if (bh->b_folio && bh->b_folio->mapping) mapping_set_error(bh->b_folio->mapping, -EIO); if (bh->b_mmb) mapping_set_error(bh->b_mmb->mapping, -EIO); } EXPORT_SYMBOL(mark_buffer_write_io_error); /** * __brelse - Release a buffer. * @bh: The buffer to release. * * This variant of brelse() can be called if @bh is guaranteed to not be NULL. */ void __brelse(struct buffer_head *bh) { if (atomic_read(&bh->b_count)) { put_bh(bh); return; } WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n"); } EXPORT_SYMBOL(__brelse); /** * __bforget - Discard any dirty data in a buffer. * @bh: The buffer to forget. * * This variant of bforget() can be called if @bh is guaranteed to not * be NULL. */ void __bforget(struct buffer_head *bh) { clear_buffer_dirty(bh); remove_assoc_queue(bh); __brelse(bh); } EXPORT_SYMBOL(__bforget); static struct buffer_head *__bread_slow(struct buffer_head *bh) { lock_buffer(bh); if (buffer_uptodate(bh)) { unlock_buffer(bh); return bh; } else { get_bh(bh); bh->b_end_io = end_buffer_read_sync; submit_bh(REQ_OP_READ, bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; } brelse(bh); return NULL; } /* * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block(). * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their * refcount elevated by one when they're in an LRU. A buffer can only appear * once in a particular CPU's LRU. A single buffer can be present in multiple * CPU's LRUs at the same time. * * This is a transparent caching front-end to sb_bread(), sb_getblk() and * sb_find_get_block(). * * The LRUs themselves only need locking against invalidate_bh_lrus. We use * a local interrupt disable for that. */ #define BH_LRU_SIZE 16 struct bh_lru { struct buffer_head *bhs[BH_LRU_SIZE]; }; static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }}; #ifdef CONFIG_SMP #define bh_lru_lock() local_irq_disable() #define bh_lru_unlock() local_irq_enable() #else #define bh_lru_lock() preempt_disable() #define bh_lru_unlock() preempt_enable() #endif static inline void check_irqs_on(void) { #ifdef irqs_disabled BUG_ON(irqs_disabled()); #endif } /* * Install a buffer_head into this cpu's LRU. If not already in the LRU, it is * inserted at the front, and the buffer_head at the back if any is evicted. * Or, if already in the LRU it is moved to the front. */ static void bh_lru_install(struct buffer_head *bh) { struct buffer_head *evictee = bh; struct bh_lru *b; int i; check_irqs_on(); bh_lru_lock(); /* * the refcount of buffer_head in bh_lru prevents dropping the * attached page(i.e., try_to_free_buffers) so it could cause * failing page migration. * Skip putting upcoming bh into bh_lru until migration is done. 
*/ if (lru_cache_disabled() || cpu_is_isolated(smp_processor_id())) { bh_lru_unlock(); return; } b = this_cpu_ptr(&bh_lrus); for (i = 0; i < BH_LRU_SIZE; i++) { swap(evictee, b->bhs[i]); if (evictee == bh) { bh_lru_unlock(); return; } } get_bh(bh); bh_lru_unlock(); brelse(evictee); } /* * Look up the bh in this cpu's LRU. If it's there, move it to the head. */ static struct buffer_head * lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) { struct buffer_head *ret = NULL; unsigned int i; check_irqs_on(); bh_lru_lock(); if (cpu_is_isolated(smp_processor_id())) { bh_lru_unlock(); return NULL; } for (i = 0; i < BH_LRU_SIZE; i++) { struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]); if (bh && bh->b_blocknr == block && bh->b_bdev == bdev && bh->b_size == size) { if (i) { while (i) { __this_cpu_write(bh_lrus.bhs[i], __this_cpu_read(bh_lrus.bhs[i - 1])); i--; } __this_cpu_write(bh_lrus.bhs[0], bh); } get_bh(bh); ret = bh; break; } } bh_lru_unlock(); return ret; } /* * Perform a pagecache lookup for the matching buffer. If it's there, refresh * it in the LRU and mark it as accessed. If it is not present then return * NULL. Atomic context callers may also return NULL if the buffer is being * migrated; similarly the page is not marked accessed either. */ static struct buffer_head * find_get_block_common(struct block_device *bdev, sector_t block, unsigned size, bool atomic) { struct buffer_head *bh = lookup_bh_lru(bdev, block, size); if (bh == NULL) { /* __find_get_block_slow will mark the page accessed */ bh = __find_get_block_slow(bdev, block, atomic); if (bh) bh_lru_install(bh); } else touch_buffer(bh); return bh; } struct buffer_head * __find_get_block(struct block_device *bdev, sector_t block, unsigned size) { return find_get_block_common(bdev, block, size, true); } EXPORT_SYMBOL(__find_get_block); /* same as __find_get_block() but allows sleeping contexts */ struct buffer_head * __find_get_block_nonatomic(struct block_device *bdev, sector_t block, unsigned size) { return find_get_block_common(bdev, block, size, false); } EXPORT_SYMBOL(__find_get_block_nonatomic); /** * bdev_getblk - Get a buffer_head in a block device's buffer cache. * @bdev: The block device. * @block: The block number. * @size: The size of buffer_heads for this @bdev. * @gfp: The memory allocation flags to use. * * The returned buffer head has its reference count incremented, but is * not locked. The caller should call brelse() when it has finished * with the buffer. The buffer may not be uptodate. If needed, the * caller can bring it uptodate either by reading it or overwriting it. * * Return: The buffer head, or NULL if memory could not be allocated. */ struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp) { struct buffer_head *bh; if (gfpflags_allow_blocking(gfp)) bh = __find_get_block_nonatomic(bdev, block, size); else bh = __find_get_block(bdev, block, size); might_alloc(gfp); if (bh) return bh; return __getblk_slow(bdev, block, size, gfp); } EXPORT_SYMBOL(bdev_getblk); /* * Do async read-ahead on a buffer.. */ void __breadahead(struct block_device *bdev, sector_t block, unsigned size) { struct buffer_head *bh = bdev_getblk(bdev, block, size, GFP_NOWAIT | __GFP_MOVABLE); if (likely(bh)) { bh_readahead(bh, REQ_RAHEAD); brelse(bh); } } EXPORT_SYMBOL(__breadahead); /** * __bread_gfp() - Read a block. * @bdev: The block device to read from. * @block: Block number in units of block size. * @size: The block size of this device in bytes. 
* @gfp: Not page allocation flags; see below. * * You are not expected to call this function. You should use one of * sb_bread(), sb_bread_unmovable() or __bread(). * * Read a specified block, and return the buffer head that refers to it. * If @gfp is 0, the memory will be allocated using the block device's * default GFP flags. If @gfp is __GFP_MOVABLE, the memory may be * allocated from a movable area. Do not pass in a complete set of * GFP flags. * * The returned buffer head has its refcount increased. The caller should * call brelse() when it has finished with the buffer. * * Context: May sleep waiting for I/O. * Return: NULL if the block was unreadable. */ struct buffer_head *__bread_gfp(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp) { struct buffer_head *bh; gfp |= mapping_gfp_constraint(bdev->bd_mapping, ~__GFP_FS); /* * Prefer looping in the allocator rather than here, at least that * code knows what it's doing. */ gfp |= __GFP_NOFAIL; bh = bdev_getblk(bdev, block, size, gfp); if (likely(bh) && !buffer_uptodate(bh)) bh = __bread_slow(bh); return bh; } EXPORT_SYMBOL(__bread_gfp); static void __invalidate_bh_lrus(struct bh_lru *b) { int i; for (i = 0; i < BH_LRU_SIZE; i++) { brelse(b->bhs[i]); b->bhs[i] = NULL; } } /* * invalidate_bh_lrus() is called rarely - but not only at unmount. * This doesn't race because it runs in each cpu either in irq * or with preempt disabled. */ static void invalidate_bh_lru(void *arg) { struct bh_lru *b = &get_cpu_var(bh_lrus); __invalidate_bh_lrus(b); put_cpu_var(bh_lrus); } bool has_bh_in_lru(int cpu, void *dummy) { struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu); int i; for (i = 0; i < BH_LRU_SIZE; i++) { if (b->bhs[i]) return true; } return false; } void invalidate_bh_lrus(void) { on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1); } EXPORT_SYMBOL_GPL(invalidate_bh_lrus); /* * It's called from workqueue context so we need a bh_lru_lock to close * the race with preemption/irq. */ void invalidate_bh_lrus_cpu(void) { struct bh_lru *b; bh_lru_lock(); b = this_cpu_ptr(&bh_lrus); __invalidate_bh_lrus(b); bh_lru_unlock(); } void folio_set_bh(struct buffer_head *bh, struct folio *folio, unsigned long offset) { bh->b_folio = folio; BUG_ON(offset >= folio_size(folio)); if (folio_test_highmem(folio)) /* * This catches illegal uses and preserves the offset: */ bh->b_data = (char *)(0 + offset); else bh->b_data = folio_address(folio) + offset; } EXPORT_SYMBOL(folio_set_bh); /* * Called when truncating a buffer on a page completely. */ /* Bits that are cleared during an invalidate */ #define BUFFER_FLAGS_DISCARD \ (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \ 1 << BH_Delay | 1 << BH_Unwritten) static void discard_buffer(struct buffer_head * bh) { unsigned long b_state; lock_buffer(bh); clear_buffer_dirty(bh); bh->b_bdev = NULL; b_state = READ_ONCE(bh->b_state); do { } while (!try_cmpxchg_relaxed(&bh->b_state, &b_state, b_state & ~BUFFER_FLAGS_DISCARD)); unlock_buffer(bh); } /** * block_invalidate_folio - Invalidate part or all of a buffer-backed folio. * @folio: The folio which is affected. * @offset: start of the range to invalidate * @length: length of the range to invalidate * * block_invalidate_folio() is called when all or part of the folio has been * invalidated by a truncate operation. * * block_invalidate_folio() does not have to release all buffers, but it must * ensure that no dirty buffer is left outside @offset and that no I/O * is underway against any of the blocks which are outside the truncation * point. 
Because the caller is about to free (and possibly reuse) those * blocks on-disk. */ void block_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct buffer_head *head, *bh, *next; size_t curr_off = 0; size_t stop = length + offset; BUG_ON(!folio_test_locked(folio)); /* * Check for overflow */ BUG_ON(stop > folio_size(folio) || stop < length); head = folio_buffers(folio); if (!head) return; bh = head; do { size_t next_off = curr_off + bh->b_size; next = bh->b_this_page; /* * Are we still fully in range ? */ if (next_off > stop) goto out; /* * is this block fully invalidated? */ if (offset <= curr_off) discard_buffer(bh); curr_off = next_off; bh = next; } while (bh != head); /* * We release buffers only if the entire folio is being invalidated. * The get_block cached value has been unconditionally invalidated, * so real IO is not possible anymore. */ if (length == folio_size(folio)) filemap_release_folio(folio, 0); out: folio_clear_mappedtodisk(folio); } EXPORT_SYMBOL(block_invalidate_folio); /* * We attach and possibly dirty the buffers atomically wrt * block_dirty_folio() via i_private_lock. try_to_free_buffers * is already excluded via the folio lock. */ struct buffer_head *create_empty_buffers(struct folio *folio, unsigned long blocksize, unsigned long b_state) { struct buffer_head *bh, *head, *tail; gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL; head = folio_alloc_buffers(folio, blocksize, gfp); bh = head; do { bh->b_state |= b_state; tail = bh; bh = bh->b_this_page; } while (bh); tail->b_this_page = head; spin_lock(&folio->mapping->i_private_lock); if (folio_test_uptodate(folio) || folio_test_dirty(folio)) { bh = head; do { if (folio_test_dirty(folio)) set_buffer_dirty(bh); if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); bh = bh->b_this_page; } while (bh != head); } folio_attach_private(folio, head); spin_unlock(&folio->mapping->i_private_lock); return head; } EXPORT_SYMBOL(create_empty_buffers); /** * clean_bdev_aliases: clean a range of buffers in block device * @bdev: Block device to clean buffers in * @block: Start of a range of blocks to clean * @len: Number of blocks to clean * * We are taking a range of blocks for data and we don't want writeback of any * buffer-cache aliases starting from return from this function and until the * moment when something will explicitly mark the buffer dirty (hopefully that * will not happen until we will free that block ;-) We don't even need to mark * it not-uptodate - nobody can expect anything from a newly allocated buffer * anyway. We used to use unmap_buffer() for such invalidation, but that was * wrong. We definitely don't want to mark the alias unmapped, for example - it * would confuse anyone who might pick it with bread() afterwards... * * Also.. Note that bforget() doesn't lock the buffer. So there can be * writeout I/O going on against recently-freed buffers. We don't wait on that * I/O in bforget() - it's more efficient to wait on the I/O only if we really * need to. That happens here. 
*/ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) { struct address_space *bd_mapping = bdev->bd_mapping; const int blkbits = bd_mapping->host->i_blkbits; struct folio_batch fbatch; pgoff_t index = ((loff_t)block << blkbits) / PAGE_SIZE; pgoff_t end; int i, count; struct buffer_head *bh; struct buffer_head *head; end = ((loff_t)(block + len - 1) << blkbits) / PAGE_SIZE; folio_batch_init(&fbatch); while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) { count = folio_batch_count(&fbatch); for (i = 0; i < count; i++) { struct folio *folio = fbatch.folios[i]; if (!folio_buffers(folio)) continue; /* * We use folio lock instead of bd_mapping->i_private_lock * to pin buffers here since we can afford to sleep and * it scales better than a global spinlock lock. */ folio_lock(folio); /* Recheck when the folio is locked which pins bhs */ head = folio_buffers(folio); if (!head) goto unlock_page; bh = head; do { if (!buffer_mapped(bh) || (bh->b_blocknr < block)) goto next; if (bh->b_blocknr >= block + len) break; clear_buffer_dirty(bh); wait_on_buffer(bh); clear_buffer_req(bh); next: bh = bh->b_this_page; } while (bh != head); unlock_page: folio_unlock(folio); } folio_batch_release(&fbatch); cond_resched(); /* End of range already reached? */ if (index > end || !index) break; } } EXPORT_SYMBOL(clean_bdev_aliases); static struct buffer_head *folio_create_buffers(struct folio *folio, struct inode *inode, unsigned int b_state) { struct buffer_head *bh; BUG_ON(!folio_test_locked(folio)); bh = folio_buffers(folio); if (!bh) bh = create_empty_buffers(folio, 1 << READ_ONCE(inode->i_blkbits), b_state); return bh; } /* * NOTE! All mapped/uptodate combinations are valid: * * Mapped Uptodate Meaning * * No No "unknown" - must do get_block() * No Yes "hole" - zero-filled * Yes No "allocated" - allocated on disk, not read in * Yes Yes "valid" - allocated and up-to-date in memory. * * "Dirty" is valid only with the last case (mapped+uptodate). */ /* * While block_write_full_folio is writing back the dirty buffers under * the page lock, whoever dirtied the buffers may decide to clean them * again at any time. We handle that by only looking at the buffer * state inside lock_buffer(). * * If block_write_full_folio() is called for regular writeback * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a * locked buffer. This only can happen if someone has written the buffer * directly, with submit_bh(). At the address_space level PageWriteback * prevents this contention from occurring. * * If block_write_full_folio() is called with wbc->sync_mode == * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this * causes the writes to be flagged as synchronous writes. */ int __block_write_full_folio(struct inode *inode, struct folio *folio, get_block_t *get_block, struct writeback_control *wbc) { int err; sector_t block; sector_t last_block; struct buffer_head *bh, *head; size_t blocksize; int nr_underway = 0; blk_opf_t write_flags = wbc_to_write_flags(wbc); head = folio_create_buffers(folio, inode, (1 << BH_Dirty) | (1 << BH_Uptodate)); /* * Be very careful. We have no exclusion from block_dirty_folio * here, and the (potentially unmapped) buffers may become dirty at * any time. If a buffer becomes dirty here after we've inspected it * then we just miss that fact, and the folio stays dirty. * * Buffers outside i_size may be dirtied by block_dirty_folio; * handle that here by just cleaning them. 
*/ bh = head; blocksize = bh->b_size; block = div_u64(folio_pos(folio), blocksize); last_block = div_u64(i_size_read(inode) - 1, blocksize); /* * Get all the dirty buffers mapped to disk addresses and * handle any aliases from the underlying blockdev's mapping. */ do { if (block > last_block) { /* * mapped buffers outside i_size will occur, because * this folio can be outside i_size when there is a * truncate in progress. */ /* * The buffer was zeroed by block_write_full_folio() */ clear_buffer_dirty(bh); set_buffer_uptodate(bh); } else if ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh)) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, block, bh, 1); if (err) goto recover; clear_buffer_delay(bh); if (buffer_new(bh)) { /* blockdev mappings never come here */ clear_buffer_new(bh); clean_bdev_bh_alias(bh); } } bh = bh->b_this_page; block++; } while (bh != head); do { if (!buffer_mapped(bh)) continue; /* * If it's a fully non-blocking write attempt and we cannot * lock the buffer then redirty the folio. Note that this can * potentially cause a busy-wait loop from writeback threads * and kswapd activity, but those code paths have their own * higher-level throttling. */ if (wbc->sync_mode != WB_SYNC_NONE) { lock_buffer(bh); } else if (!trylock_buffer(bh)) { folio_redirty_for_writepage(wbc, folio); continue; } if (test_clear_buffer_dirty(bh)) { mark_buffer_async_write_endio(bh, end_buffer_async_write); } else { unlock_buffer(bh); } } while ((bh = bh->b_this_page) != head); /* * The folio and its buffers are protected by the writeback flag, * so we can drop the bh refcounts early. */ BUG_ON(folio_test_writeback(folio)); folio_start_writeback(folio); do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, inode->i_write_hint, wbc); nr_underway++; } bh = next; } while (bh != head); folio_unlock(folio); err = 0; done: if (nr_underway == 0) { /* * The folio was marked dirty, but the buffers were * clean. Someone wrote them back by hand with * write_dirty_buffer/submit_bh. A rare case. */ folio_end_writeback(folio); /* * The folio and buffer_heads can be released at any time from * here on. */ } return err; recover: /* * ENOSPC, or some other error. We may already have added some * blocks to the file, so we need to write these out to avoid * exposing stale data. * The folio is currently locked and not marked for writeback */ bh = head; /* Recovery: lock and submit the mapped buffers */ do { if (buffer_mapped(bh) && buffer_dirty(bh) && !buffer_delay(bh)) { lock_buffer(bh); mark_buffer_async_write_endio(bh, end_buffer_async_write); } else { /* * The buffer may have been set dirty during * attachment to a dirty folio. */ clear_buffer_dirty(bh); } } while ((bh = bh->b_this_page) != head); BUG_ON(folio_test_writeback(folio)); mapping_set_error(folio->mapping, err); folio_start_writeback(folio); do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, inode->i_write_hint, wbc); nr_underway++; } bh = next; } while (bh != head); folio_unlock(folio); goto done; } EXPORT_SYMBOL(__block_write_full_folio); /* * If a folio has any new buffers, zero them out here, and mark them uptodate * and dirty so they'll be written out (in order to prevent uninitialised * block data from leaking). And clear the new bit. 
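 *
 * This is what __block_write_begin_int() falls back to when reading in the
 * surrounding blocks fails, and what block_write_end() relies on when a
 * short copy leaves part of a not-uptodate folio unwritten.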
*/ void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to) { size_t block_start, block_end; struct buffer_head *head, *bh; BUG_ON(!folio_test_locked(folio)); head = folio_buffers(folio); if (!head) return; bh = head; block_start = 0; do { block_end = block_start + bh->b_size; if (buffer_new(bh)) { if (block_end > from && block_start < to) { if (!folio_test_uptodate(folio)) { size_t start, xend; start = max(from, block_start); xend = min(to, block_end); folio_zero_segment(folio, start, xend); set_buffer_uptodate(bh); } clear_buffer_new(bh); mark_buffer_dirty(bh); } } block_start = block_end; bh = bh->b_this_page; } while (bh != head); } EXPORT_SYMBOL(folio_zero_new_buffers); static int iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, const struct iomap *iomap) { loff_t offset = (loff_t)block << inode->i_blkbits; bh->b_bdev = iomap->bdev; /* * Block points to offset in file we need to map, iomap contains * the offset at which the map starts. If the map ends before the * current block, then do not map the buffer and let the caller * handle it. */ if (offset >= iomap->offset + iomap->length) return -EIO; switch (iomap->type) { case IOMAP_HOLE: /* * If the buffer is not up to date or beyond the current EOF, * we need to mark it as new to ensure sub-block zeroing is * executed if necessary. */ if (!buffer_uptodate(bh) || (offset >= i_size_read(inode))) set_buffer_new(bh); return 0; case IOMAP_DELALLOC: if (!buffer_uptodate(bh) || (offset >= i_size_read(inode))) set_buffer_new(bh); set_buffer_uptodate(bh); set_buffer_mapped(bh); set_buffer_delay(bh); return 0; case IOMAP_UNWRITTEN: /* * For unwritten regions, we always need to ensure that regions * in the block we are not writing to are zeroed. Mark the * buffer as new to ensure this. */ set_buffer_new(bh); set_buffer_unwritten(bh); fallthrough; case IOMAP_MAPPED: if ((iomap->flags & IOMAP_F_NEW) || offset >= i_size_read(inode)) { /* * This can happen if truncating the block device races * with the check in the caller as i_size updates on * block devices aren't synchronized by i_rwsem for * block devices. 
*/ if (S_ISBLK(inode->i_mode)) return -EIO; set_buffer_new(bh); } bh->b_blocknr = (iomap->addr + offset - iomap->offset) >> inode->i_blkbits; set_buffer_mapped(bh); return 0; default: WARN_ON_ONCE(1); return -EIO; } } int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block, const struct iomap *iomap) { size_t from = offset_in_folio(folio, pos); size_t to = from + len; struct inode *inode = folio->mapping->host; size_t block_start, block_end; sector_t block; int err = 0; size_t blocksize; struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; BUG_ON(!folio_test_locked(folio)); BUG_ON(to > folio_size(folio)); BUG_ON(from > to); head = folio_create_buffers(folio, inode, 0); blocksize = head->b_size; block = div_u64(folio_pos(folio), blocksize); for (bh = head, block_start = 0; bh != head || !block_start; block++, block_start=block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (folio_test_uptodate(folio)) { if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); } continue; } if (buffer_new(bh)) clear_buffer_new(bh); if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); if (get_block) err = get_block(inode, block, bh, 1); else err = iomap_to_bh(inode, block, bh, iomap); if (err) break; if (buffer_new(bh)) { clean_bdev_bh_alias(bh); if (folio_test_uptodate(folio)) { clear_buffer_new(bh); set_buffer_uptodate(bh); mark_buffer_dirty(bh); continue; } if (block_end > to || block_start < from) folio_zero_segments(folio, to, block_end, block_start, from); continue; } } if (folio_test_uptodate(folio)) { if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); continue; } if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh) && (block_start < from || block_end > to)) { bh_read_nowait(bh, 0); *wait_bh++=bh; } } /* * If we issued read requests - let them complete. */ while(wait_bh > wait) { wait_on_buffer(*--wait_bh); if (!buffer_uptodate(*wait_bh)) err = -EIO; } if (unlikely(err)) folio_zero_new_buffers(folio, from, to); return err; } int __block_write_begin(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block) { return __block_write_begin_int(folio, pos, len, get_block, NULL); } EXPORT_SYMBOL(__block_write_begin); void block_commit_write(struct folio *folio, size_t from, size_t to) { size_t block_start, block_end; bool partial = false; unsigned blocksize; struct buffer_head *bh, *head; bh = head = folio_buffers(folio); if (!bh) return; blocksize = bh->b_size; block_start = 0; do { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (!buffer_uptodate(bh)) partial = true; } else { set_buffer_uptodate(bh); mark_buffer_dirty(bh); } if (buffer_new(bh)) clear_buffer_new(bh); block_start = block_end; bh = bh->b_this_page; } while (bh != head); /* * If this is a partial write which happened to make all buffers * uptodate then we can optimize away a bogus read_folio() for * the next read(). Here we 'discover' whether the folio went * uptodate as a result of this (potentially partial) write. */ if (!partial) folio_mark_uptodate(folio); } EXPORT_SYMBOL(block_commit_write); /* * block_write_begin takes care of the basic task of block allocation and * bringing partial write blocks uptodate first. * * The filesystem needs to handle block truncation upon failure. 
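 *
 *	A hypothetical ->write_begin built on this helper (myfs_write_begin
 *	and myfs_get_block are invented names) might look roughly like:
 *
 *		static int myfs_write_begin(const struct kiocb *iocb,
 *				struct address_space *mapping, loff_t pos,
 *				unsigned len, struct folio **foliop,
 *				void **fsdata)
 *		{
 *			return block_write_begin(mapping, pos, len, foliop,
 *						 myfs_get_block);
 *		}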
*/ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, get_block_t *get_block) { pgoff_t index = pos >> PAGE_SHIFT; struct folio *folio; int status; folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); status = __block_write_begin_int(folio, pos, len, get_block, NULL); if (unlikely(status)) { folio_unlock(folio); folio_put(folio); folio = NULL; } *foliop = folio; return status; } EXPORT_SYMBOL(block_write_begin); int block_write_end(loff_t pos, unsigned len, unsigned copied, struct folio *folio) { size_t start = pos - folio_pos(folio); if (unlikely(copied < len)) { /* * The buffers that were written will now be uptodate, so * we don't have to worry about a read_folio reading them * and overwriting a partial write. However if we have * encountered a short write and only partially written * into a buffer, it will not be marked uptodate, so a * read_folio might come in and destroy our partial write. * * Do the simplest thing, and just treat any short write to a * non uptodate folio as a zero-length write, and force the * caller to redo the whole thing. */ if (!folio_test_uptodate(folio)) copied = 0; folio_zero_new_buffers(folio, start+copied, start+len); } flush_dcache_folio(folio); /* This could be a short (even 0-length) commit */ block_commit_write(folio, start, start + copied); return copied; } EXPORT_SYMBOL(block_write_end); int generic_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; loff_t old_size = inode->i_size; bool i_size_changed = false; copied = block_write_end(pos, len, copied, folio); /* * No need to use i_size_read() here, the i_size cannot change under us * because we hold i_rwsem. * * But it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. */ if (pos + copied > inode->i_size) { i_size_write(inode, pos + copied); i_size_changed = true; } folio_unlock(folio); folio_put(folio); if (old_size < pos) pagecache_isize_extended(inode, old_size, pos); /* * Don't mark the inode dirty under page lock. First, it unnecessarily * makes the holding time of page lock longer. Second, it forces lock * ordering of page lock and transaction start for journaling * filesystems. */ if (i_size_changed) mark_inode_dirty(inode); return copied; } EXPORT_SYMBOL(generic_write_end); /* * block_is_partially_uptodate checks whether buffers within a folio are * uptodate or not. * * Returns true if all buffers which correspond to the specified part * of the folio are uptodate. 
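 *
 * As an illustration only, a buffer-head based filesystem would typically
 * plug this straight into its address_space_operations (the myfs_* names
 * are invented):
 *
 *	static const struct address_space_operations myfs_aops = {
 *		.read_folio		= myfs_read_folio,
 *		.is_partially_uptodate	= block_is_partially_uptodate,
 *	};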
*/ bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count) { unsigned block_start, block_end, blocksize; unsigned to; struct buffer_head *bh, *head; bool ret = true; head = folio_buffers(folio); if (!head) return false; blocksize = head->b_size; to = min(folio_size(folio) - from, count); to = from + to; if (from < blocksize && to > folio_size(folio) - blocksize) return false; bh = head; block_start = 0; do { block_end = block_start + blocksize; if (block_end > from && block_start < to) { if (!buffer_uptodate(bh)) { ret = false; break; } if (block_end >= to) break; } block_start = block_end; bh = bh->b_this_page; } while (bh != head); return ret; } EXPORT_SYMBOL(block_is_partially_uptodate); /* * Generic "read_folio" function for block devices that have the normal * get_block functionality. This is most of the block device filesystems. * Reads the folio asynchronously --- the unlock_buffer() and * set/clear_buffer_uptodate() functions propagate buffer state into the * folio once IO has completed. */ int block_read_full_folio(struct folio *folio, get_block_t *get_block) { struct inode *inode = folio->mapping->host; sector_t iblock, lblock; struct buffer_head *bh, *head, *prev = NULL; size_t blocksize; int fully_mapped = 1; bool page_error = false; loff_t limit = i_size_read(inode); /* This is needed for ext4. */ if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) limit = inode->i_sb->s_maxbytes; head = folio_create_buffers(folio, inode, 0); blocksize = head->b_size; iblock = div_u64(folio_pos(folio), blocksize); lblock = div_u64(limit + blocksize - 1, blocksize); bh = head; do { if (buffer_uptodate(bh)) continue; if (!buffer_mapped(bh)) { int err = 0; fully_mapped = 0; if (iblock < lblock) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, iblock, bh, 0); if (err) page_error = true; } if (!buffer_mapped(bh)) { folio_zero_range(folio, bh_offset(bh), blocksize); if (!err) set_buffer_uptodate(bh); continue; } /* * get_block() might have updated the buffer * synchronously */ if (buffer_uptodate(bh)) continue; } lock_buffer(bh); if (buffer_uptodate(bh)) { unlock_buffer(bh); continue; } mark_buffer_async_read(bh); if (prev) submit_bh(REQ_OP_READ, prev); prev = bh; } while (iblock++, (bh = bh->b_this_page) != head); if (fully_mapped) folio_set_mappedtodisk(folio); /* * All buffers are uptodate or get_block() returned an error * when trying to map them - we must finish the read because * end_buffer_async_read() will never be called on any buffer * in this folio. */ if (prev) submit_bh(REQ_OP_READ, prev); else folio_end_read(folio, !page_error); return 0; } EXPORT_SYMBOL(block_read_full_folio); /* utility function for filesystems that need to do work on expanding * truncates. Uses filesystem pagecache writes to allow the filesystem to * deal with the hole. 
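 *
 * Illustrative caller, e.g. a ->setattr path growing the file (attr is the
 * usual struct iattr; error handling elided):
 *
 *	if ((attr->ia_valid & ATTR_SIZE) &&
 *	    attr->ia_size > i_size_read(inode))
 *		err = generic_cont_expand_simple(inode, attr->ia_size);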
*/ int generic_cont_expand_simple(struct inode *inode, loff_t size) { struct address_space *mapping = inode->i_mapping; const struct address_space_operations *aops = mapping->a_ops; struct folio *folio; void *fsdata = NULL; int err; err = inode_newsize_ok(inode, size); if (err) goto out; err = aops->write_begin(NULL, mapping, size, 0, &folio, &fsdata); if (err) goto out; err = aops->write_end(NULL, mapping, size, 0, 0, folio, fsdata); BUG_ON(err > 0); out: return err; } EXPORT_SYMBOL(generic_cont_expand_simple); static int cont_expand_zero(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, loff_t *bytes) { struct inode *inode = mapping->host; const struct address_space_operations *aops = mapping->a_ops; unsigned int blocksize = i_blocksize(inode); struct folio *folio; void *fsdata = NULL; pgoff_t index, curidx; loff_t curpos; unsigned zerofrom, offset, len; int err = 0; index = pos >> PAGE_SHIFT; offset = pos & ~PAGE_MASK; while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) { zerofrom = curpos & ~PAGE_MASK; if (zerofrom & (blocksize-1)) { *bytes |= (blocksize-1); (*bytes)++; } len = PAGE_SIZE - zerofrom; err = aops->write_begin(iocb, mapping, curpos, len, &folio, &fsdata); if (err) goto out; folio_zero_range(folio, offset_in_folio(folio, curpos), len); err = aops->write_end(iocb, mapping, curpos, len, len, folio, fsdata); if (err < 0) goto out; BUG_ON(err != len); err = 0; balance_dirty_pages_ratelimited(mapping); if (fatal_signal_pending(current)) { err = -EINTR; goto out; } } /* page covers the boundary, find the boundary offset */ if (index == curidx) { zerofrom = curpos & ~PAGE_MASK; /* if we will expand the thing last block will be filled */ if (offset <= zerofrom) { goto out; } if (zerofrom & (blocksize-1)) { *bytes |= (blocksize-1); (*bytes)++; } len = offset - zerofrom; err = aops->write_begin(iocb, mapping, curpos, len, &folio, &fsdata); if (err) goto out; folio_zero_range(folio, offset_in_folio(folio, curpos), len); err = aops->write_end(iocb, mapping, curpos, len, len, folio, fsdata); if (err < 0) goto out; BUG_ON(err != len); err = 0; } out: return err; } /* * For moronic filesystems that do not allow holes in file. * We may have to extend the file. */ int cont_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata, get_block_t *get_block, loff_t *bytes) { struct inode *inode = mapping->host; unsigned int blocksize = i_blocksize(inode); unsigned int zerofrom; int err; err = cont_expand_zero(iocb, mapping, pos, bytes); if (err) return err; zerofrom = *bytes & ~PAGE_MASK; if (pos+len > *bytes && zerofrom & (blocksize-1)) { *bytes |= (blocksize-1); (*bytes)++; } return block_write_begin(mapping, pos, len, foliop, get_block); } EXPORT_SYMBOL(cont_write_begin); /* * block_page_mkwrite() is not allowed to change the file size as it gets * called from a page fault handler when a page is first dirtied. Hence we must * be careful to check for EOF conditions here. We set the page up correctly * for a written page which means we get ENOSPC checking when writing into * holes and correct delalloc and unwritten extent mapping on filesystems that * support these features. * * We are not allowed to take the i_rwsem here so we have to play games to * protect against truncate races as the page could now be beyond EOF. Because * truncate writes the inode size before removing pages, once we have the * page lock we can determine safely if the page is beyond EOF. 
If it is not * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. * * Direct callers of this function should protect against filesystem freezing * using sb_start_pagefault() - sb_end_pagefault() functions. */ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block) { struct folio *folio = page_folio(vmf->page); struct inode *inode = file_inode(vma->vm_file); unsigned long end; loff_t size; int ret; folio_lock(folio); size = i_size_read(inode); if ((folio->mapping != inode->i_mapping) || (folio_pos(folio) >= size)) { /* We overload EFAULT to mean page got truncated */ ret = -EFAULT; goto out_unlock; } end = folio_size(folio); /* folio is wholly or partially inside EOF */ if (folio_pos(folio) + end > size) end = size - folio_pos(folio); ret = __block_write_begin_int(folio, 0, end, get_block, NULL); if (unlikely(ret)) goto out_unlock; block_commit_write(folio, 0, end); folio_mark_dirty(folio); folio_wait_stable(folio); return 0; out_unlock: folio_unlock(folio); return ret; } EXPORT_SYMBOL(block_page_mkwrite); int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block) { pgoff_t index = from >> PAGE_SHIFT; unsigned blocksize; sector_t iblock; size_t offset, length, pos; struct inode *inode = mapping->host; struct folio *folio; struct buffer_head *bh; int err = 0; blocksize = i_blocksize(inode); length = from & (blocksize - 1); /* Block boundary? Nothing to do */ if (!length) return 0; length = blocksize - length; iblock = ((loff_t)index * PAGE_SIZE) >> inode->i_blkbits; folio = filemap_grab_folio(mapping, index); if (IS_ERR(folio)) return PTR_ERR(folio); bh = folio_buffers(folio); if (!bh) bh = create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ offset = offset_in_folio(folio, from); pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; iblock++; pos += blocksize; } if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, iblock, bh, 0); if (err) goto unlock; /* unmapped? It's a hole - nothing to do */ if (!buffer_mapped(bh)) goto unlock; } /* Ok, it's mapped. Make sure it's up-to-date */ if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { err = bh_read(bh, 0); /* Uhhuh. Read error. Complain and punt. */ if (err < 0) goto unlock; } folio_zero_range(folio, offset, length); mark_buffer_dirty(bh); unlock: folio_unlock(folio); folio_put(folio); return err; } EXPORT_SYMBOL(block_truncate_page); /* * The generic write folio function for buffer-backed address_spaces */ int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, void *get_block) { struct inode * const inode = folio->mapping->host; loff_t i_size = i_size_read(inode); /* Is the folio fully inside i_size? */ if (folio_next_pos(folio) <= i_size) return __block_write_full_folio(inode, folio, get_block, wbc); /* Is the folio fully outside i_size? (truncate in progress) */ if (folio_pos(folio) >= i_size) { folio_unlock(folio); return 0; /* don't care */ } /* * The folio straddles i_size. It must be zeroed out on each and every * writeback invocation because it may be mmapped. "A file is mapped * in multiples of the page size. For a file that is not a multiple of * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." 
*/ folio_zero_segment(folio, offset_in_folio(folio, i_size), folio_size(folio)); return __block_write_full_folio(inode, folio, get_block, wbc); } sector_t generic_block_bmap(struct address_space *mapping, sector_t block, get_block_t *get_block) { struct inode *inode = mapping->host; struct buffer_head tmp = { .b_size = i_blocksize(inode), }; get_block(inode, block, &tmp, 0); return tmp.b_blocknr; } EXPORT_SYMBOL(generic_block_bmap); static void end_bio_bh_io_sync(struct bio *bio) { struct buffer_head *bh = bio->bi_private; if (unlikely(bio_flagged(bio, BIO_QUIET))) set_bit(BH_Quiet, &bh->b_state); bh->b_end_io(bh, !bio->bi_status); bio_put(bio); } static void buffer_set_crypto_ctx(struct bio *bio, const struct buffer_head *bh, gfp_t gfp_mask) { const struct address_space *mapping = folio_mapping(bh->b_folio); /* * The ext4 journal (jbd2) can submit a buffer_head it directly created * for a non-pagecache page. fscrypt doesn't care about these. */ if (!mapping) return; fscrypt_set_bio_crypt_ctx(bio, mapping->host, folio_pos(bh->b_folio) + bh_offset(bh), gfp_mask); } static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, enum rw_hint write_hint, struct writeback_control *wbc) { const enum req_op op = opf & REQ_OP_MASK; struct bio *bio; BUG_ON(!buffer_locked(bh)); BUG_ON(!buffer_mapped(bh)); BUG_ON(!bh->b_end_io); BUG_ON(buffer_delay(bh)); BUG_ON(buffer_unwritten(bh)); /* * Only clear out a write error when rewriting */ if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE)) clear_buffer_write_io_error(bh); if (buffer_meta(bh)) opf |= REQ_META; if (buffer_prio(bh)) opf |= REQ_PRIO; bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO); if (IS_ENABLED(CONFIG_FS_ENCRYPTION)) buffer_set_crypto_ctx(bio, bh, GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_write_hint = write_hint; bio_add_folio_nofail(bio, bh->b_folio, bh->b_size, bh_offset(bh)); bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; /* Take care of bh's that straddle the end of the device */ guard_bio_eod(bio); if (wbc) { wbc_init_bio(wbc, bio); wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size); } blk_crypto_submit_bio(bio); } void submit_bh(blk_opf_t opf, struct buffer_head *bh) { submit_bh_wbc(opf, bh, WRITE_LIFE_NOT_SET, NULL); } EXPORT_SYMBOL(submit_bh); void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags) { lock_buffer(bh); if (!test_clear_buffer_dirty(bh)) { unlock_buffer(bh); return; } bh->b_end_io = end_buffer_write_sync; get_bh(bh); submit_bh(REQ_OP_WRITE | op_flags, bh); } EXPORT_SYMBOL(write_dirty_buffer); /* * For a data-integrity writeout, we need to wait upon any in-progress I/O * and then start new I/O and then wait upon it. The caller must have a ref on * the buffer_head. */ int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags) { WARN_ON(atomic_read(&bh->b_count) < 1); lock_buffer(bh); if (test_clear_buffer_dirty(bh)) { /* * The bh should be mapped, but it might not be if the * device was hot-removed. Not much we can do but fail the I/O. 
*/ if (!buffer_mapped(bh)) { unlock_buffer(bh); return -EIO; } get_bh(bh); bh->b_end_io = end_buffer_write_sync; submit_bh(REQ_OP_WRITE | op_flags, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) return -EIO; } else { unlock_buffer(bh); } return 0; } EXPORT_SYMBOL(__sync_dirty_buffer); int sync_dirty_buffer(struct buffer_head *bh) { return __sync_dirty_buffer(bh, REQ_SYNC); } EXPORT_SYMBOL(sync_dirty_buffer); static inline int buffer_busy(struct buffer_head *bh) { return atomic_read(&bh->b_count) | (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); } static bool drop_buffers(struct folio *folio, struct buffer_head **buffers_to_free) { struct buffer_head *head = folio_buffers(folio); struct buffer_head *bh; bh = head; do { if (buffer_busy(bh)) goto failed; bh = bh->b_this_page; } while (bh != head); do { struct buffer_head *next = bh->b_this_page; remove_assoc_queue(bh); bh = next; } while (bh != head); *buffers_to_free = head; folio_detach_private(folio); return true; failed: return false; } /** * try_to_free_buffers - Release buffers attached to this folio. * @folio: The folio. * * If any buffers are in use (dirty, under writeback, elevated refcount), * no buffers will be freed. * * If the folio is dirty but all the buffers are clean then we need to * be sure to mark the folio clean as well. This is because the folio * may be against a block device, and a later reattachment of buffers * to a dirty folio will set *all* buffers dirty. Which would corrupt * filesystem data on the same device. * * The same applies to regular filesystem folios: if all the buffers are * clean then we set the folio clean and proceed. To do that, we require * total exclusion from block_dirty_folio(). That is obtained with * i_private_lock. * * Exclusion against try_to_free_buffers may be obtained by either * locking the folio or by holding its mapping's i_private_lock. * * Context: Process context. @folio must be locked. Will not sleep. * Return: true if all buffers attached to this folio were freed. */ bool try_to_free_buffers(struct folio *folio) { struct address_space * const mapping = folio->mapping; struct buffer_head *buffers_to_free = NULL; bool ret = 0; BUG_ON(!folio_test_locked(folio)); if (folio_test_writeback(folio)) return false; /* Misconfigured folio check */ if (WARN_ON_ONCE(!folio_buffers(folio))) return true; if (mapping == NULL) { /* can this still happen? */ ret = drop_buffers(folio, &buffers_to_free); goto out; } spin_lock(&mapping->i_private_lock); ret = drop_buffers(folio, &buffers_to_free); /* * If the filesystem writes its buffers by hand (eg ext3) * then we can have clean buffers against a dirty folio. We * clean the folio here; otherwise the VM will never notice * that the filesystem did any IO at all. * * Also, during truncate, discard_buffer will have marked all * the folio's buffers clean. We discover that here and clean * the folio also. * * i_private_lock must be held over this entire operation in order * to synchronise against block_dirty_folio and prevent the * dirty bit from being lost. 
*/ if (ret) folio_cancel_dirty(folio); spin_unlock(&mapping->i_private_lock); out: if (buffers_to_free) { struct buffer_head *bh = buffers_to_free; do { struct buffer_head *next = bh->b_this_page; free_buffer_head(bh); bh = next; } while (bh != buffers_to_free); } return ret; } EXPORT_SYMBOL(try_to_free_buffers); /* * Buffer-head allocation */ static struct kmem_cache *bh_cachep __ro_after_init; /* * Once the number of bh's in the machine exceeds this level, we start * stripping them in writeback. */ static unsigned long max_buffer_heads __ro_after_init; int buffer_heads_over_limit; struct bh_accounting { int nr; /* Number of live bh's */ int ratelimit; /* Limit cacheline bouncing */ }; static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0}; static void recalc_bh_state(void) { int i; int tot = 0; if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096) return; __this_cpu_write(bh_accounting.ratelimit, 0); for_each_online_cpu(i) tot += per_cpu(bh_accounting, i).nr; buffer_heads_over_limit = (tot > max_buffer_heads); } struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) { struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags); if (ret) { INIT_LIST_HEAD(&ret->b_assoc_buffers); spin_lock_init(&ret->b_uptodate_lock); preempt_disable(); __this_cpu_inc(bh_accounting.nr); recalc_bh_state(); preempt_enable(); } return ret; } EXPORT_SYMBOL(alloc_buffer_head); void free_buffer_head(struct buffer_head *bh) { BUG_ON(!list_empty(&bh->b_assoc_buffers)); kmem_cache_free(bh_cachep, bh); preempt_disable(); __this_cpu_dec(bh_accounting.nr); recalc_bh_state(); preempt_enable(); } EXPORT_SYMBOL(free_buffer_head); static int buffer_exit_cpu_dead(unsigned int cpu) { int i; struct bh_lru *b = &per_cpu(bh_lrus, cpu); for (i = 0; i < BH_LRU_SIZE; i++) { brelse(b->bhs[i]); b->bhs[i] = NULL; } this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr); per_cpu(bh_accounting, cpu).nr = 0; return 0; } /** * bh_uptodate_or_lock - Test whether the buffer is uptodate * @bh: struct buffer_head * * Return true if the buffer is up-to-date and false, * with the buffer locked, if not. */ int bh_uptodate_or_lock(struct buffer_head *bh) { if (!buffer_uptodate(bh)) { lock_buffer(bh); if (!buffer_uptodate(bh)) return 0; unlock_buffer(bh); } return 1; } EXPORT_SYMBOL(bh_uptodate_or_lock); /** * __bh_read - Submit read for a locked buffer * @bh: struct buffer_head * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ * @wait: wait until reading finish * * Returns zero on success or don't wait, and -EIO on error. */ int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait) { int ret = 0; BUG_ON(!buffer_locked(bh)); get_bh(bh); bh->b_end_io = end_buffer_read_sync; submit_bh(REQ_OP_READ | op_flags, bh); if (wait) { wait_on_buffer(bh); if (!buffer_uptodate(bh)) ret = -EIO; } return ret; } EXPORT_SYMBOL(__bh_read); /** * __bh_read_batch - Submit read for a batch of unlocked buffers * @nr: entry number of the buffer batch * @bhs: a batch of struct buffer_head * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ * @force_lock: force to get a lock on the buffer if set, otherwise drops any * buffer that cannot lock. * * Returns zero on success or don't wait, and -EIO on error. 
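 *
 * An illustrative batch read of two already-mapped buffers (bh1/bh2 are
 * invented names); the caller waits for and checks the buffers itself:
 *
 *	struct buffer_head *bhs[] = { bh1, bh2 };
 *
 *	__bh_read_batch(ARRAY_SIZE(bhs), bhs, 0, false);
 *	wait_on_buffer(bh1);
 *	wait_on_buffer(bh2);
 *	if (!buffer_uptodate(bh1) || !buffer_uptodate(bh2))
 *		goto io_error;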
*/ void __bh_read_batch(int nr, struct buffer_head *bhs[], blk_opf_t op_flags, bool force_lock) { int i; for (i = 0; i < nr; i++) { struct buffer_head *bh = bhs[i]; if (buffer_uptodate(bh)) continue; if (force_lock) lock_buffer(bh); else if (!trylock_buffer(bh)) continue; if (buffer_uptodate(bh)) { unlock_buffer(bh); continue; } bh->b_end_io = end_buffer_read_sync; get_bh(bh); submit_bh(REQ_OP_READ | op_flags, bh); } } EXPORT_SYMBOL(__bh_read_batch); void __init buffer_init(void) { unsigned long nrpages; int ret; bh_cachep = KMEM_CACHE(buffer_head, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC); /* * Limit the bh occupancy to 10% of ZONE_NORMAL */ nrpages = (nr_free_buffer_pages() * 10) / 100; max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead", NULL, buffer_exit_cpu_dead); WARN_ON(ret < 0); }
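/*
 * Illustrative sketch only (not part of fs/buffer.c): a filesystem metadata
 * read helper could combine sb_getblk(), bh_uptodate_or_lock() and
 * __bh_read() roughly as below; "myfs_read_block" is an invented name.
 */
static struct buffer_head *myfs_read_block(struct super_block *sb,
					   sector_t block)
{
	struct buffer_head *bh = sb_getblk(sb, block);

	if (!bh)
		return NULL;
	/* Already uptodate? Otherwise this returns with the buffer locked. */
	if (bh_uptodate_or_lock(bh))
		return bh;
	/* Locked and not uptodate: read it synchronously. */
	if (__bh_read(bh, 0, true)) {
		brelse(bh);
		return NULL;
	}
	return bh;
}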
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_WAIT_H #define _LINUX_WAIT_H /* * Linux wait queue related types and methods */ #include <linux/list.h> #include <linux/stddef.h> #include <linux/spinlock.h> #include <asm/current.h> typedef struct wait_queue_entry wait_queue_entry_t; typedef int (*wait_queue_func_t)(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key); int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key); /* wait_queue_entry::flags */ #define WQ_FLAG_EXCLUSIVE 0x01 #define WQ_FLAG_WOKEN 0x02 #define WQ_FLAG_CUSTOM 0x04 #define WQ_FLAG_DONE 0x08 #define WQ_FLAG_PRIORITY 0x10 /* * A single wait-queue entry structure: */ struct wait_queue_entry { unsigned int flags; void *private; wait_queue_func_t func; struct list_head entry; }; struct wait_queue_head { spinlock_t lock; struct list_head head; }; typedef struct wait_queue_head wait_queue_head_t; struct task_struct; /* * Macros for declaration and initialisation of the datatypes */ #define __WAITQUEUE_INITIALIZER(name, tsk) { \ .private = tsk, \ .func = default_wake_function, \ .entry = { NULL, NULL } } #define DECLARE_WAITQUEUE(name, tsk) \ struct wait_queue_entry name = __WAITQUEUE_INITIALIZER(name, tsk) #define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ .head = LIST_HEAD_INIT(name.head) } #define DECLARE_WAIT_QUEUE_HEAD(name) \ struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name) extern void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *); #define init_waitqueue_head(wq_head) \ do { \ static struct lock_class_key __key; \ \ __init_waitqueue_head((wq_head), #wq_head, &__key); \ } while (0) #ifdef CONFIG_LOCKDEP # define __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) \ ({ init_waitqueue_head(&name); name; }) # define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) \ struct
wait_queue_head name = __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) #else # define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) DECLARE_WAIT_QUEUE_HEAD(name) #endif static inline void init_waitqueue_entry(struct wait_queue_entry *wq_entry, struct task_struct *p) { wq_entry->flags = 0; wq_entry->private = p; wq_entry->func = default_wake_function; } static inline void init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t func) { wq_entry->flags = 0; wq_entry->private = NULL; wq_entry->func = func; } /** * waitqueue_active -- locklessly test for waiters on the queue * @wq_head: the waitqueue to test for waiters * * returns true if the wait list is not empty * * NOTE: this function is lockless and requires care, incorrect usage _will_ * lead to sporadic and non-obvious failure. * * Use either while holding wait_queue_head::lock or when used for wakeups * with an extra smp_mb() like:: * * CPU0 - waker CPU1 - waiter * * for (;;) { * @cond = true; prepare_to_wait(&wq_head, &wait, state); * smp_mb(); // smp_mb() from set_current_state() * if (waitqueue_active(wq_head)) if (@cond) * wake_up(wq_head); break; * schedule(); * } * finish_wait(&wq_head, &wait); * * Because without the explicit smp_mb() it's possible for the * waitqueue_active() load to get hoisted over the @cond store such that we'll * observe an empty wait list while the waiter might not observe @cond. * * Also note that this 'optimization' trades a spin_lock() for an smp_mb(), * which (when the lock is uncontended) are of roughly equal cost. */ static inline int waitqueue_active(struct wait_queue_head *wq_head) { return !list_empty(&wq_head->head); } /** * wq_has_single_sleeper - check if there is only one sleeper * @wq_head: wait queue head * * Returns true of wq_head has only one sleeper on the list. * * Please refer to the comment for waitqueue_active. */ static inline bool wq_has_single_sleeper(struct wait_queue_head *wq_head) { return list_is_singular(&wq_head->head); } /** * wq_has_sleeper - check if there are any waiting processes * @wq_head: wait queue head * * Returns true if wq_head has waiting processes * * Please refer to the comment for waitqueue_active. */ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head) { /* * We need to be sure we are in sync with the * add_wait_queue modifications to the wait queue. * * This memory barrier should be paired with one on the * waiting side. 
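 *
 * An illustrative waker that relies on the barrier below to order its
 * condition store before the sleeper check (@cond is whatever the waiter
 * tests):
 *
 *	WRITE_ONCE(cond, true);
 *	if (wq_has_sleeper(&wq_head))
 *		wake_up(&wq_head);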
*/ smp_mb(); return waitqueue_active(wq_head); } extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { struct list_head *head = &wq_head->head; struct wait_queue_entry *wq; list_for_each_entry(wq, &wq_head->head, entry) { if (!(wq->flags & WQ_FLAG_PRIORITY)) break; head = &wq->entry; } list_add(&wq_entry->entry, head); } /* * Used for wake-one threads: */ static inline void __add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { wq_entry->flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue(wq_head, wq_entry); } static inline void __add_wait_queue_entry_tail(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { list_add_tail(&wq_entry->entry, &wq_head->head); } static inline void __add_wait_queue_entry_tail_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { wq_entry->flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_entry_tail(wq_head, wq_entry); } static inline void __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { list_del(&wq_entry->entry); } int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key); void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr); void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode); void __wake_up_pollfree(struct wait_queue_head *wq_head); #define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL) #define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL) #define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL) #define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL, 1) #define wake_up_all_locked(x) __wake_up_locked((x), TASK_NORMAL, 0) #define wake_up_sync(x) __wake_up_sync(x, TASK_NORMAL) #define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL) #define wake_up_interruptible_nr(x, nr) __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL) #define wake_up_interruptible_all(x) __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL) #define wake_up_interruptible_sync(x) __wake_up_sync((x), TASK_INTERRUPTIBLE) /* * Wakeup macros to be used to report events to the targets. 
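 *
 * For instance (illustrative only; "dev" is an invented driver structure),
 * a driver with new data for readers and pollers might report it as:
 *
 *	wake_up_interruptible_poll(&dev->read_wait, EPOLLIN | EPOLLRDNORM);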
*/ #define poll_to_key(m) ((void *)(__force uintptr_t)(__poll_t)(m)) #define key_to_poll(m) ((__force __poll_t)(uintptr_t)(void *)(m)) #define wake_up_poll(x, m) \ __wake_up(x, TASK_NORMAL, 1, poll_to_key(m)) #define wake_up_poll_on_current_cpu(x, m) \ __wake_up_on_current_cpu(x, TASK_NORMAL, poll_to_key(m)) #define wake_up_locked_poll(x, m) \ __wake_up_locked_key((x), TASK_NORMAL, poll_to_key(m)) #define wake_up_interruptible_poll(x, m) \ __wake_up(x, TASK_INTERRUPTIBLE, 1, poll_to_key(m)) #define wake_up_interruptible_sync_poll(x, m) \ __wake_up_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m)) #define wake_up_interruptible_sync_poll_locked(x, m) \ __wake_up_locked_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m)) /** * wake_up_pollfree - signal that a polled waitqueue is going away * @wq_head: the wait queue head * * In the very rare cases where a ->poll() implementation uses a waitqueue whose * lifetime is tied to a task rather than to the 'struct file' being polled, * this function must be called before the waitqueue is freed so that * non-blocking polls (e.g. epoll) are notified that the queue is going away. * * The caller must also RCU-delay the freeing of the wait_queue_head, e.g. via * an explicit synchronize_rcu() or call_rcu(), or via SLAB_TYPESAFE_BY_RCU. */ static inline void wake_up_pollfree(struct wait_queue_head *wq_head) { /* * For performance reasons, we don't always take the queue lock here. * Therefore, we might race with someone removing the last entry from * the queue, and proceed while they still hold the queue lock. * However, rcu_read_lock() is required to be held in such cases, so we * can safely proceed with an RCU-delayed free. */ if (waitqueue_active(wq_head)) __wake_up_pollfree(wq_head); } #define ___wait_cond_timeout(condition) \ ({ \ bool __cond = (condition); \ if (__cond && !__ret) \ __ret = 1; \ __cond || !__ret; \ }) #define ___wait_is_interruptible(state) \ (!__builtin_constant_p(state) || \ (state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags); /* * The below macro ___wait_event() has an explicit shadow of the __ret * variable when used from the wait_event_*() macros. * * This is so that both can use the ___wait_cond_timeout() construct * to wrap the condition. * * The type inconsistency of the wait_event_*() __ret variable is also * on purpose; we use long where we can return timeout values and int * otherwise. */ #define ___wait_event(wq_head, condition, state, exclusive, ret, cmd) \ ({ \ __label__ __out; \ struct wait_queue_entry __wq_entry; \ long __ret = ret; /* explicit shadow */ \ \ init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0); \ for (;;) { \ long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);\ \ if (condition) \ break; \ \ if (___wait_is_interruptible(state) && __int) { \ __ret = __int; \ goto __out; \ } \ \ cmd; \ \ if (condition) \ break; \ } \ finish_wait(&wq_head, &__wq_entry); \ __out: __ret; \ }) #define __wait_event(wq_head, condition) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ schedule()) /** * wait_event - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. 
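 *
 * A minimal illustration (all names invented) - the sleeper:
 *
 *	static DECLARE_WAIT_QUEUE_HEAD(my_wq);
 *	static bool my_cond;
 *
 *	wait_event(my_wq, my_cond);
 *
 * and the corresponding waker:
 *
 *	my_cond = true;
 *	wake_up(&my_wq);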
*/ #define wait_event(wq_head, condition) \ do { \ might_sleep(); \ if (condition) \ break; \ __wait_event(wq_head, condition); \ } while (0) #define __io_wait_event(wq_head, condition) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ io_schedule()) /* * io_wait_event() -- like wait_event() but with io_schedule() */ #define io_wait_event(wq_head, condition) \ do { \ might_sleep(); \ if (condition) \ break; \ __io_wait_event(wq_head, condition); \ } while (0) #define __wait_event_freezable(wq_head, condition) \ ___wait_event(wq_head, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), \ 0, 0, schedule()) /** * wait_event_freezable - sleep (or freeze) until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE -- so as not to contribute * to system load) until the @condition evaluates to true. The * @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. */ #define wait_event_freezable(wq_head, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_freezable(wq_head, condition); \ __ret; \ }) #define __wait_event_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_UNINTERRUPTIBLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_timeout - sleep until a condition gets true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * or the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed. */ #define wait_event_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_timeout(wq_head, condition, timeout); \ __ret; \ }) #define __wait_event_freezable_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 0, timeout, \ __ret = schedule_timeout(__ret)) /* * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid * increasing load and is freezable. 
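 *
 * Illustrative use from a background kthread that must remain freezable for
 * system suspend (work_ready() is an invented predicate):
 *
 *	while (!kthread_should_stop())
 *		wait_event_freezable_timeout(my_wq,
 *				kthread_should_stop() || work_ready(), HZ);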
*/ #define wait_event_freezable_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_freezable_timeout(wq_head, condition, timeout); \ __ret; \ }) #define __wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 1, 0, \ cmd1; schedule(); cmd2) /* * Just like wait_event_cmd(), except it sets exclusive flag */ #define wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2) \ do { \ if (condition) \ break; \ __wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2); \ } while (0) #define __wait_event_cmd(wq_head, condition, cmd1, cmd2) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ cmd1; schedule(); cmd2) /** * wait_event_cmd - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @cmd1: the command will be executed before sleep * @cmd2: the command will be executed after sleep * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. */ #define wait_event_cmd(wq_head, condition, cmd1, cmd2) \ do { \ if (condition) \ break; \ __wait_event_cmd(wq_head, condition, cmd1, cmd2); \ } while (0) #define __wait_event_interruptible(wq_head, condition) \ ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \ schedule()) /** * wait_event_interruptible - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible(wq_head, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_interruptible(wq_head, condition); \ __ret; \ }) #define __wait_event_interruptible_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_INTERRUPTIBLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was * interrupted by a signal. 
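 *
 * Illustrative return-value handling (names invented): 0 means the condition
 * was still false when the timeout expired, a negative value is
 * -%ERESTARTSYS, and a positive value is the remaining number of jiffies:
 *
 *	long ret = wait_event_interruptible_timeout(my_wq, done,
 *						    msecs_to_jiffies(100));
 *	if (!ret)
 *		return -ETIMEDOUT;
 *	if (ret < 0)
 *		return ret;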
*/ #define wait_event_interruptible_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_interruptible_timeout(wq_head, \ condition, timeout); \ __ret; \ }) #define __wait_event_hrtimeout(wq_head, condition, timeout, state) \ ({ \ int __ret = 0; \ struct hrtimer_sleeper __t; \ \ hrtimer_setup_sleeper_on_stack(&__t, CLOCK_MONOTONIC, \ HRTIMER_MODE_REL); \ if ((timeout) != KTIME_MAX) { \ hrtimer_set_expires_range_ns(&__t.timer, timeout, \ current->timer_slack_ns); \ hrtimer_sleeper_start_expires(&__t, HRTIMER_MODE_REL); \ } \ \ __ret = ___wait_event(wq_head, condition, state, 0, 0, \ if (!__t.task) { \ __ret = -ETIME; \ break; \ } \ schedule()); \ \ hrtimer_cancel(&__t.timer); \ destroy_hrtimer_on_stack(&__t.timer); \ __ret; \ }) /** * wait_event_hrtimeout - sleep until a condition gets true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, as a ktime_t * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function returns 0 if @condition became true, or -ETIME if the timeout * elapsed. */ #define wait_event_hrtimeout(wq_head, condition, timeout) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_hrtimeout(wq_head, condition, timeout, \ TASK_UNINTERRUPTIBLE); \ __ret; \ }) /** * wait_event_interruptible_hrtimeout - sleep until a condition gets true or a timeout elapses * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, as a ktime_t * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function returns 0 if @condition became true, -ERESTARTSYS if it was * interrupted by a signal, or -ETIME if the timeout elapsed. 
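 *
 * Note, from the __wait_event_hrtimeout() implementation above, that a
 * @timeout of KTIME_MAX never arms the hrtimer, i.e. the wait is unbounded.
 * An illustrative bounded call (names invented):
 *
 *	err = wait_event_interruptible_hrtimeout(my_wq, done, ms_to_ktime(10));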
*/ #define wait_event_interruptible_hrtimeout(wq, condition, timeout) \ ({ \ long __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_hrtimeout(wq, condition, timeout, \ TASK_INTERRUPTIBLE); \ __ret; \ }) #define __wait_event_interruptible_exclusive(wq, condition) \ ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \ schedule()) #define wait_event_interruptible_exclusive(wq, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_interruptible_exclusive(wq, condition); \ __ret; \ }) #define __wait_event_killable_exclusive(wq, condition) \ ___wait_event(wq, condition, TASK_KILLABLE, 1, 0, \ schedule()) #define wait_event_killable_exclusive(wq, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_killable_exclusive(wq, condition); \ __ret; \ }) #define __wait_event_freezable_exclusive(wq, condition) \ ___wait_event(wq, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 1, 0,\ schedule()) #define wait_event_freezable_exclusive(wq, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_freezable_exclusive(wq, condition); \ __ret; \ }) /** * wait_event_idle - wait for a condition without contributing to system load * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_IDLE) until the * @condition evaluates to true. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * */ #define wait_event_idle(wq_head, condition) \ do { \ might_sleep(); \ if (!(condition)) \ ___wait_event(wq_head, condition, TASK_IDLE, 0, 0, schedule()); \ } while (0) /** * wait_event_idle_exclusive - wait for a condition with contributing to system load * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_IDLE) until the * @condition evaluates to true. * The @condition is checked each time the waitqueue @wq_head is woken up. * * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag * set thus if other processes wait on the same list, when this * process is woken further processes are not considered. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * */ #define wait_event_idle_exclusive(wq_head, condition) \ do { \ might_sleep(); \ if (!(condition)) \ ___wait_event(wq_head, condition, TASK_IDLE, 1, 0, schedule()); \ } while (0) #define __wait_event_idle_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_IDLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_idle_timeout - sleep without load until a condition becomes true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_IDLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. 
* * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * or the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed. */ #define wait_event_idle_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_idle_timeout(wq_head, condition, timeout); \ __ret; \ }) #define __wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_IDLE, 1, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_idle_exclusive_timeout - sleep without load until a condition becomes true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_IDLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag * set thus if other processes wait on the same list, when this * process is woken further processes are not considered. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * or the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed. */ #define wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_idle_exclusive_timeout(wq_head, condition, timeout);\ __ret; \ }) extern int do_wait_intr(wait_queue_head_t *, wait_queue_entry_t *); extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *); #define __wait_event_interruptible_locked(wq, condition, exclusive, fn) \ ({ \ int __ret; \ DEFINE_WAIT(__wait); \ if (exclusive) \ __wait.flags |= WQ_FLAG_EXCLUSIVE; \ do { \ __ret = fn(&(wq), &__wait); \ if (__ret) \ break; \ } while (!(condition)); \ __remove_wait_queue(&(wq), &__wait); \ __set_current_state(TASK_RUNNING); \ __ret; \ }) /** * wait_event_interruptible_locked - sleep until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * It must be called with wq.lock being held. This spinlock is * unlocked while sleeping but @condition testing is done while lock * is held and when this macro exits the lock is held. * * The lock is locked/unlocked using spin_lock()/spin_unlock() * functions which must match the way they are locked/unlocked outside * of this macro. * * wake_up_locked() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible_locked(wq, condition) \ ((condition) \ ? 
0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr)) /** * wait_event_interruptible_locked_irq - sleep until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * It must be called with wq.lock being held. This spinlock is * unlocked while sleeping but @condition testing is done while lock * is held and when this macro exits the lock is held. * * The lock is locked/unlocked using spin_lock_irq()/spin_unlock_irq() * functions which must match the way they are locked/unlocked outside * of this macro. * * wake_up_locked() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible_locked_irq(wq, condition) \ ((condition) \ ? 0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr_irq)) /** * wait_event_interruptible_exclusive_locked - sleep exclusively until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * It must be called with wq.lock being held. This spinlock is * unlocked while sleeping but @condition testing is done while lock * is held and when this macro exits the lock is held. * * The lock is locked/unlocked using spin_lock()/spin_unlock() * functions which must match the way they are locked/unlocked outside * of this macro. * * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag * set thus when other process waits process on the list if this * process is awaken further processes are not considered. * * wake_up_locked() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible_exclusive_locked(wq, condition) \ ((condition) \ ? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr)) /** * wait_event_interruptible_exclusive_locked_irq - sleep until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * It must be called with wq.lock being held. This spinlock is * unlocked while sleeping but @condition testing is done while lock * is held and when this macro exits the lock is held. * * The lock is locked/unlocked using spin_lock_irq()/spin_unlock_irq() * functions which must match the way they are locked/unlocked outside * of this macro. * * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag * set thus when other process waits process on the list if this * process is awaken further processes are not considered. * * wake_up_locked() has to be called after changing any variable that could * change the result of the wait condition. 
* * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible_exclusive_locked_irq(wq, condition) \ ((condition) \ ? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr_irq)) #define __wait_event_killable(wq, condition) \ ___wait_event(wq, condition, TASK_KILLABLE, 0, 0, schedule()) /** * wait_event_killable - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_KILLABLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_killable(wq_head, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_killable(wq_head, condition); \ __ret; \ }) #define __io_wait_event_killable(wq, condition) \ ___wait_event(wq, condition, TASK_KILLABLE, 0, 0, io_schedule()) /* * wait_event_killable() - link wait_event_killable but with io_schedule() */ #define io_wait_event_killable(wq_head, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __io_wait_event_killable(wq_head, condition); \ __ret; \ }) #define __wait_event_state(wq, condition, state) \ ___wait_event(wq, condition, state, 0, 0, schedule()) /** * wait_event_state - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @state: state to sleep in * * The process is put to sleep (@state) until the @condition evaluates to true * or a signal is received (when allowed by @state). The @condition is checked * each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a signal * (when allowed by @state) and 0 if @condition evaluated to true. */ #define wait_event_state(wq_head, condition, state) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_state(wq_head, condition, state); \ __ret; \ }) #define __wait_event_state_exclusive(wq, condition, state) \ ___wait_event(wq, condition, state, 1, 0, schedule()) #define wait_event_state_exclusive(wq, condition, state) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_state_exclusive(wq, condition, state); \ __ret; \ }) #define __wait_event_killable_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_KILLABLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_killable_timeout - sleep until a condition gets true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_KILLABLE) until the * @condition evaluates to true or a kill signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. 
* * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was * interrupted by a kill signal. * * Only kill signals interrupt this process. */ #define wait_event_killable_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_killable_timeout(wq_head, \ condition, timeout); \ __ret; \ }) #define __wait_event_lock_irq(wq_head, condition, lock, cmd) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ spin_unlock_irq(&lock); \ cmd; \ schedule(); \ spin_lock_irq(&lock)) /** * wait_event_lock_irq_cmd - sleep until a condition gets true. The * condition is checked under the lock. This * is expected to be called with the lock * taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before cmd * and schedule() and reacquired afterwards. * @cmd: a command which is invoked outside the critical section before * sleep * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before invoking the cmd and going to sleep and is reacquired * afterwards. */ #define wait_event_lock_irq_cmd(wq_head, condition, lock, cmd) \ do { \ if (condition) \ break; \ __wait_event_lock_irq(wq_head, condition, lock, cmd); \ } while (0) /** * wait_event_lock_irq - sleep until a condition gets true. The * condition is checked under the lock. This * is expected to be called with the lock * taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before schedule() * and reacquired afterwards. * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before going to sleep and is reacquired afterwards. */ #define wait_event_lock_irq(wq_head, condition, lock) \ do { \ if (condition) \ break; \ __wait_event_lock_irq(wq_head, condition, lock, ); \ } while (0) #define __wait_event_interruptible_lock_irq(wq_head, condition, lock, cmd) \ ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \ spin_unlock_irq(&lock); \ cmd; \ schedule(); \ spin_lock_irq(&lock)) /** * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true. * The condition is checked under the lock. This is expected to * be called with the lock taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before cmd and * schedule() and reacquired afterwards. 
* @cmd: a command which is invoked outside the critical section before * sleep * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. The @condition is * checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before invoking the cmd and going to sleep and is reacquired * afterwards. * * The macro will return -ERESTARTSYS if it was interrupted by a signal * and 0 if @condition evaluated to true. */ #define wait_event_interruptible_lock_irq_cmd(wq_head, condition, lock, cmd) \ ({ \ int __ret = 0; \ if (!(condition)) \ __ret = __wait_event_interruptible_lock_irq(wq_head, \ condition, lock, cmd); \ __ret; \ }) /** * wait_event_interruptible_lock_irq - sleep until a condition gets true. * The condition is checked under the lock. This is expected * to be called with the lock taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before schedule() * and reacquired afterwards. * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or signal is received. The @condition is * checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before going to sleep and is reacquired afterwards. * * The macro will return -ERESTARTSYS if it was interrupted by a signal * and 0 if @condition evaluated to true. */ #define wait_event_interruptible_lock_irq(wq_head, condition, lock) \ ({ \ int __ret = 0; \ if (!(condition)) \ __ret = __wait_event_interruptible_lock_irq(wq_head, \ condition, lock,); \ __ret; \ }) #define __wait_event_lock_irq_timeout(wq_head, condition, lock, timeout, state) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ state, 0, timeout, \ spin_unlock_irq(&lock); \ __ret = schedule_timeout(__ret); \ spin_lock_irq(&lock)); /** * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets * true or a timeout elapses. The condition is checked under * the lock. This is expected to be called with the lock taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before schedule() * and reacquired afterwards. * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or signal is received. The @condition is * checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before going to sleep and is reacquired afterwards. * * The function returns 0 if the @timeout elapsed, -ERESTARTSYS if it * was interrupted by a signal, and the remaining jiffies otherwise * if the condition evaluated to true before the timeout elapsed. 
*/ #define wait_event_interruptible_lock_irq_timeout(wq_head, condition, lock, \ timeout) \ ({ \ long __ret = timeout; \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_lock_irq_timeout( \ wq_head, condition, lock, timeout, \ TASK_INTERRUPTIBLE); \ __ret; \ }) #define wait_event_lock_irq_timeout(wq_head, condition, lock, timeout) \ ({ \ long __ret = timeout; \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_lock_irq_timeout( \ wq_head, condition, lock, timeout, \ TASK_UNINTERRUPTIBLE); \ __ret; \ }) /* * Waitqueues which are removed from the waitqueue_head at wakeup time */ void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout); int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); #define DEFINE_WAIT_FUNC(name, function) \ struct wait_queue_entry name = { \ .private = current, \ .func = function, \ .entry = LIST_HEAD_INIT((name).entry), \ } #define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function) #define init_wait_func(wait, function) \ do { \ (wait)->private = current; \ (wait)->func = function; \ INIT_LIST_HEAD(&(wait)->entry); \ (wait)->flags = 0; \ } while (0) #define init_wait(wait) init_wait_func(wait, autoremove_wake_function) typedef int (*task_call_f)(struct task_struct *p, void *arg); extern int task_call_func(struct task_struct *p, task_call_f func, void *arg); #endif /* _LINUX_WAIT_H */ |
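/*
 * Editorial usage sketch (not part of the original header): how the
 * wait_event*() macros above are typically paired with wake_up().  The
 * names my_wq, my_data_ready, my_consumer(), my_producer() and
 * my_consume() are hypothetical and only illustrate the calling pattern.
 *
 *	static DECLARE_WAIT_QUEUE_HEAD(my_wq);
 *	static bool my_data_ready;
 *
 *	static int my_consumer(void)
 *	{
 *		int ret;
 *
 *		ret = wait_event_interruptible(my_wq, my_data_ready);
 *		if (ret)
 *			return ret;	// -ERESTARTSYS: woken by a signal
 *		my_consume();
 *		return 0;
 *	}
 *
 *	static void my_producer(void)
 *	{
 *		my_data_ready = true;	// update the condition first...
 *		wake_up(&my_wq);	// ...then wake the queue
 *	}
 *
 * Any real user would protect my_data_ready with appropriate locking or
 * memory barriers; the sketch omits that to keep the pairing visible.
 */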
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_LIST_H #define _LINUX_LIST_H #include <linux/container_of.h> #include <linux/types.h> #include <linux/stddef.h> #include <linux/poison.h> #include <linux/const.h> #include <asm/barrier.h> /* * Circular doubly linked list implementation. * * Some of the internal functions ("__xxx") are useful when * manipulating whole lists rather than single entries, as * sometimes we already know the next/prev entries and we can * generate better code by using them directly rather than * using the generic single-entry routines. */ /** * LIST_HEAD_INIT - initialize a &struct list_head's links to point to itself * @name: name of the list_head */ #define LIST_HEAD_INIT(name) { &(name), &(name) } /** * LIST_HEAD - definition of a &struct list_head with initialization values * @name: name of the list_head */ #define LIST_HEAD(name) \ struct list_head name = LIST_HEAD_INIT(name) /** * INIT_LIST_HEAD - Initialize a list_head structure * @list: list_head structure to be initialized. * * Initializes the list_head to point to itself. If it is a list header, * the result is an empty list. */ static inline void INIT_LIST_HEAD(struct list_head *list) { WRITE_ONCE(list->next, list); WRITE_ONCE(list->prev, list); } #ifdef CONFIG_LIST_HARDENED #ifdef CONFIG_DEBUG_LIST # define __list_valid_slowpath #else # define __list_valid_slowpath __cold __preserve_most #endif /* * Performs the full set of list corruption checks before __list_add(). * On list corruption reports a warning, and returns false. */ bool __list_valid_slowpath __list_add_valid_or_report(struct list_head *new, struct list_head *prev, struct list_head *next); /* * Performs list corruption checks before __list_add(). Returns false if a * corruption is detected, true otherwise. * * With CONFIG_LIST_HARDENED only, performs minimal list integrity checking * inline to catch non-faulting corruptions, and only if a corruption is * detected calls the reporting function __list_add_valid_or_report(). 
*/ static __always_inline bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { bool ret = true; if (!IS_ENABLED(CONFIG_DEBUG_LIST)) { /* * With the hardening version, elide checking if next and prev * are NULL, since the immediate dereference of them below would * result in a fault if NULL. * * With the reduced set of checks, we can afford to inline the * checks, which also gives the compiler a chance to elide some * of them completely if they can be proven at compile-time. If * one of the pre-conditions does not hold, the slow-path will * show a report which pre-condition failed. */ if (likely(next->prev == prev && prev->next == next && new != prev && new != next)) return true; ret = false; } ret &= __list_add_valid_or_report(new, prev, next); return ret; } /* * Performs the full set of list corruption checks before __list_del_entry(). * On list corruption reports a warning, and returns false. */ bool __list_valid_slowpath __list_del_entry_valid_or_report(struct list_head *entry); /* * Performs list corruption checks before __list_del_entry(). Returns false if a * corruption is detected, true otherwise. * * With CONFIG_LIST_HARDENED only, performs minimal list integrity checking * inline to catch non-faulting corruptions, and only if a corruption is * detected calls the reporting function __list_del_entry_valid_or_report(). */ static __always_inline bool __list_del_entry_valid(struct list_head *entry) { bool ret = true; if (!IS_ENABLED(CONFIG_DEBUG_LIST)) { struct list_head *prev = entry->prev; struct list_head *next = entry->next; /* * With the hardening version, elide checking if next and prev * are NULL, LIST_POISON1 or LIST_POISON2, since the immediate * dereference of them below would result in a fault. */ if (likely(prev->next == entry && next->prev == entry)) return true; ret = false; } ret &= __list_del_entry_valid_or_report(entry); return ret; } #else static inline bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { return true; } static inline bool __list_del_entry_valid(struct list_head *entry) { return true; } #endif /* * Insert a new entry between two known consecutive entries. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { if (!__list_add_valid(new, prev, next)) return; next->prev = new; new->next = next; new->prev = prev; WRITE_ONCE(prev->next, new); } /** * list_add - add a new entry * @new: new entry to be added * @head: list head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. */ static inline void list_add(struct list_head *new, struct list_head *head) { __list_add(new, head, head->next); } /** * list_add_tail - add a new entry * @new: new entry to be added * @head: list head to add it before * * Insert a new entry before the specified head. * This is useful for implementing queues. */ static inline void list_add_tail(struct list_head *new, struct list_head *head) { __list_add(new, head->prev, head); } /* * Delete a list entry by making the prev/next entries * point to each other. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_del(struct list_head * prev, struct list_head * next) { next->prev = prev; WRITE_ONCE(prev->next, next); } /* * Delete a list entry and clear the 'prev' pointer. 
* * This is a special-purpose list clearing method used in the networking code * for lists allocated as per-cpu, where we don't want to incur the extra * WRITE_ONCE() overhead of a regular list_del_init(). The code that uses this * needs to check the node 'prev' pointer instead of calling list_empty(). */ static inline void __list_del_clearprev(struct list_head *entry) { __list_del(entry->prev, entry->next); entry->prev = NULL; } static inline void __list_del_entry(struct list_head *entry) { if (!__list_del_entry_valid(entry)) return; __list_del(entry->prev, entry->next); } /** * list_del - deletes entry from list. * @entry: the element to delete from the list. * Note: list_empty() on entry does not return true after this, the entry is * in an undefined state. */ static inline void list_del(struct list_head *entry) { __list_del_entry(entry); entry->next = LIST_POISON1; entry->prev = LIST_POISON2; } /** * list_replace - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * If @old was empty, it will be overwritten. */ static inline void list_replace(struct list_head *old, struct list_head *new) { new->next = old->next; new->next->prev = new; new->prev = old->prev; new->prev->next = new; } /** * list_replace_init - replace old entry by new one and initialize the old one * @old : the element to be replaced * @new : the new element to insert * * If @old was empty, it will be overwritten. */ static inline void list_replace_init(struct list_head *old, struct list_head *new) { list_replace(old, new); INIT_LIST_HEAD(old); } /** * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position * @entry1: the location to place entry2 * @entry2: the location to place entry1 */ static inline void list_swap(struct list_head *entry1, struct list_head *entry2) { struct list_head *pos = entry2->prev; list_del(entry2); list_replace(entry1, entry2); if (pos == entry1) pos = entry2; list_add(entry1, pos); } /** * list_del_init - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. */ static inline void list_del_init(struct list_head *entry) { __list_del_entry(entry); INIT_LIST_HEAD(entry); } /** * list_move - delete from one list and add as another's head * @list: the entry to move * @head: the head that will precede our entry */ static inline void list_move(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add(list, head); } /** * list_move_tail - delete from one list and add as another's tail * @list: the entry to move * @head: the head that will follow our entry */ static inline void list_move_tail(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add_tail(list, head); } /** * list_bulk_move_tail - move a subsection of a list to its tail * @head: the head that will follow our entry * @first: first entry to move * @last: last entry to move, can be the same as first * * Move all entries between @first and including @last before @head. * All three entries must belong to the same linked list. 
*/ static inline void list_bulk_move_tail(struct list_head *head, struct list_head *first, struct list_head *last) { first->prev->next = last->next; last->next->prev = first->prev; head->prev->next = first; first->prev = head->prev; last->next = head; head->prev = last; } /** * list_is_first -- tests whether @list is the first entry in list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_first(const struct list_head *list, const struct list_head *head) { return list->prev == head; } /** * list_is_last - tests whether @list is the last entry in list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_last(const struct list_head *list, const struct list_head *head) { return list->next == head; } /** * list_is_head - tests whether @list is the list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_head(const struct list_head *list, const struct list_head *head) { return list == head; } /** * list_empty - tests whether a list is empty * @head: the list to test. */ static inline int list_empty(const struct list_head *head) { return READ_ONCE(head->next) == head; } /** * list_del_init_careful - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. * * This is the same as list_del_init(), except designed to be used * together with list_empty_careful() in a way to guarantee ordering * of other memory operations. * * Any memory operations done before a list_del_init_careful() are * guaranteed to be visible after a list_empty_careful() test. */ static inline void list_del_init_careful(struct list_head *entry) { __list_del_entry(entry); WRITE_ONCE(entry->prev, entry); smp_store_release(&entry->next, entry); } /** * list_empty_careful - tests whether a list is empty and not being modified * @head: the list to test * * Description: * tests whether a list is empty _and_ checks that no other CPU might be * in the process of modifying either member (next or prev) * * NOTE: using list_empty_careful() without synchronization * can only be safe if the only activity that can happen * to the list entry is list_del_init(). Eg. it cannot be used * if another CPU could re-list_add() it. */ static inline int list_empty_careful(const struct list_head *head) { struct list_head *next = smp_load_acquire(&head->next); return list_is_head(next, head) && (next == READ_ONCE(head->prev)); } /** * list_rotate_left - rotate the list to the left * @head: the head of the list */ static inline void list_rotate_left(struct list_head *head) { struct list_head *first; if (!list_empty(head)) { first = head->next; list_move_tail(first, head); } } /** * list_rotate_to_front() - Rotate list to specific item. * @list: The desired new front of the list. * @head: The head of the list. * * Rotates list so that @list becomes the new front of the list. */ static inline void list_rotate_to_front(struct list_head *list, struct list_head *head) { /* * Deletes the list head from the list denoted by @head and * places it as the tail of @list, this effectively rotates the * list so that @list is at the front. */ list_move_tail(head, list); } /** * list_is_singular - tests whether a list has just one entry. * @head: the list to test. 
*/ static inline int list_is_singular(const struct list_head *head) { return !list_empty(head) && (head->next == head->prev); } static inline void __list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { struct list_head *new_first = entry->next; list->next = head->next; list->next->prev = list; list->prev = entry; entry->next = list; head->next = new_first; new_first->prev = head; } /** * list_cut_position - cut a list into two * @list: a new list to add all removed entries * @head: a list with entries * @entry: an entry within head, could be the head itself * and if so we won't cut the list * * This helper moves the initial part of @head, up to and * including @entry, from @head to @list. You should * pass on @entry an element you know is on @head. @list * should be an empty list or a list you do not care about * losing its data. * */ static inline void list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { if (list_empty(head)) return; if (list_is_singular(head) && !list_is_head(entry, head) && (entry != head->next)) return; if (list_is_head(entry, head)) INIT_LIST_HEAD(list); else __list_cut_position(list, head, entry); } /** * list_cut_before - cut a list into two, before given entry * @list: a new list to add all removed entries * @head: a list with entries * @entry: an entry within head, could be the head itself * * This helper moves the initial part of @head, up to but * excluding @entry, from @head to @list. You should pass * in @entry an element you know is on @head. @list should * be an empty list or a list you do not care about losing * its data. * If @entry == @head, all entries on @head are moved to * @list. */ static inline void list_cut_before(struct list_head *list, struct list_head *head, struct list_head *entry) { if (head->next == entry) { INIT_LIST_HEAD(list); return; } list->next = head->next; list->next->prev = list; list->prev = entry->prev; list->prev->next = list; head->next = entry; entry->prev = head; } static inline void __list_splice(const struct list_head *list, struct list_head *prev, struct list_head *next) { struct list_head *first = list->next; struct list_head *last = list->prev; first->prev = prev; prev->next = first; last->next = next; next->prev = last; } /** * list_splice - join two lists, this is designed for stacks * @list: the new list to add. * @head: the place to add it in the first list. */ static inline void list_splice(const struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head, head->next); } /** * list_splice_tail - join two lists, each list being a queue * @list: the new list to add. * @head: the place to add it in the first list. */ static inline void list_splice_tail(struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head->prev, head); } /** * list_splice_init - join two lists and reinitialise the emptied list. * @list: the new list to add. * @head: the place to add it in the first list. * * The list at @list is reinitialised */ static inline void list_splice_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head, head->next); INIT_LIST_HEAD(list); } } /** * list_splice_tail_init - join two lists and reinitialise the emptied list * @list: the new list to add. * @head: the place to add it in the first list. * * Each of the lists is a queue. 
* The list at @list is reinitialised */ static inline void list_splice_tail_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head->prev, head); INIT_LIST_HEAD(list); } } /** * list_entry - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. */ #define list_entry(ptr, type, member) \ container_of(ptr, type, member) /** * list_first_entry - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note, that list is expected to be not empty. */ #define list_first_entry(ptr, type, member) \ list_entry((ptr)->next, type, member) /** * list_last_entry - get the last element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note, that list is expected to be not empty. */ #define list_last_entry(ptr, type, member) \ list_entry((ptr)->prev, type, member) /** * list_first_entry_or_null - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note that if the list is empty, it returns NULL. */ #define list_first_entry_or_null(ptr, type, member) ({ \ struct list_head *head__ = (ptr); \ struct list_head *pos__ = READ_ONCE(head__->next); \ pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ }) /** * list_last_entry_or_null - get the last element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note that if the list is empty, it returns NULL. */ #define list_last_entry_or_null(ptr, type, member) ({ \ struct list_head *head__ = (ptr); \ struct list_head *pos__ = READ_ONCE(head__->prev); \ pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ }) /** * list_next_entry - get the next element in list * @pos: the type * to cursor * @member: the name of the list_head within the struct. */ #define list_next_entry(pos, member) \ list_entry((pos)->member.next, typeof(*(pos)), member) /** * list_next_entry_circular - get the next element in list * @pos: the type * to cursor. * @head: the list head to take the element from. * @member: the name of the list_head within the struct. * * Wraparound if pos is the last element (return the first element). * Note, that list is expected to be not empty. */ #define list_next_entry_circular(pos, head, member) \ (list_is_last(&(pos)->member, head) ? \ list_first_entry(head, typeof(*(pos)), member) : list_next_entry(pos, member)) /** * list_prev_entry - get the prev element in list * @pos: the type * to cursor * @member: the name of the list_head within the struct. */ #define list_prev_entry(pos, member) \ list_entry((pos)->member.prev, typeof(*(pos)), member) /** * list_prev_entry_circular - get the prev element in list * @pos: the type * to cursor. * @head: the list head to take the element from. * @member: the name of the list_head within the struct. * * Wraparound if pos is the first element (return the last element). * Note, that list is expected to be not empty. */ #define list_prev_entry_circular(pos, head, member) \ (list_is_first(&(pos)->member, head) ? 
\ list_last_entry(head, typeof(*(pos)), member) : list_prev_entry(pos, member)) /** * list_for_each - iterate over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each(pos, head) \ for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next) /** * list_for_each_continue - continue iteration over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. * * Continue to iterate over a list, continuing after the current position. */ #define list_for_each_continue(pos, head) \ for (pos = pos->next; !list_is_head(pos, (head)); pos = pos->next) /** * list_for_each_prev - iterate over a list backwards * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each_prev(pos, head) \ for (pos = (head)->prev; !list_is_head(pos, (head)); pos = pos->prev) /** * list_for_each_safe - iterate over a list safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_safe(pos, n, head) \ for (pos = (head)->next, n = pos->next; \ !list_is_head(pos, (head)); \ pos = n, n = pos->next) /** * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_prev_safe(pos, n, head) \ for (pos = (head)->prev, n = pos->prev; \ !list_is_head(pos, (head)); \ pos = n, n = pos->prev) /** * list_count_nodes - count nodes in the list * @head: the head for your list. */ static inline size_t list_count_nodes(struct list_head *head) { struct list_head *pos; size_t count = 0; list_for_each(pos, head) count++; return count; } /** * list_entry_is_head - test if the entry points to the head of the list * @pos: the type * to cursor * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_entry_is_head(pos, head, member) \ list_is_head(&pos->member, (head)) /** * list_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry(pos, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member); \ !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_reverse - iterate backwards over list of given type. * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry_reverse(pos, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member); \ !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue() * @pos: the type * to use as a start point * @head: the head of the list * @member: the name of the list_head within the struct. * * Prepares a pos entry for use as a start point in list_for_each_entry_continue(). */ #define list_prepare_entry(pos, head, member) \ ((pos) ? : list_entry(head, typeof(*pos), member)) /** * list_for_each_entry_continue - continue iteration over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. 
* @member: the name of the list_head within the struct. * * Continue to iterate over list of given type, continuing after * the current position. */ #define list_for_each_entry_continue(pos, head, member) \ for (pos = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_continue_reverse - iterate backwards from the given point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Start to iterate over list of given type backwards, continuing after * the current position. */ #define list_for_each_entry_continue_reverse(pos, head, member) \ for (pos = list_prev_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_for_each_entry_from - iterate over list of given type from the current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type, continuing from current position. */ #define list_for_each_entry_from(pos, head, member) \ for (; !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_from_reverse - iterate backwards over list of given type * from the current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate backwards over list of given type, continuing from current position. */ #define list_for_each_entry_from_reverse(pos, head, member) \ for (; !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry_safe(pos, n, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member), \ n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_continue - continue list iteration safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type, continuing after current point, * safe against removal of list entry. */ #define list_for_each_entry_safe_continue(pos, n, head, member) \ for (pos = list_next_entry(pos, member), \ n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_from - iterate over list from current point safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type from current point, safe against * removal of list entry. 
*/ #define list_for_each_entry_safe_from(pos, n, head, member) \ for (n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate backwards over list of given type, safe against removal * of list entry. */ #define list_for_each_entry_safe_reverse(pos, n, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member), \ n = list_prev_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_prev_entry(n, member)) /** * list_safe_reset_next - reset a stale list_for_each_entry_safe loop * @pos: the loop cursor used in the list_for_each_entry_safe loop * @n: temporary storage used in list_for_each_entry_safe * @member: the name of the list_head within the struct. * * list_safe_reset_next is not safe to use in general if the list may be * modified concurrently (eg. the lock is dropped in the loop body). An * exception to this is if the cursor element (pos) is pinned in the list, * and list_safe_reset_next is called after re-taking the lock and before * completing the current iteration of the loop body. */ #define list_safe_reset_next(pos, n, member) \ n = list_next_entry(pos, member) /* * Double linked lists with a single pointer list head. * Mostly useful for hash tables where the two pointer list head is * too wasteful. * You lose the ability to access the tail in O(1). */ #define HLIST_HEAD_INIT { .first = NULL } #define HLIST_HEAD(name) struct hlist_head name = { .first = NULL } #define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) static inline void INIT_HLIST_NODE(struct hlist_node *h) { h->next = NULL; h->pprev = NULL; } /** * hlist_unhashed - Has node been removed from list and reinitialized? * @h: Node to be checked * * Not that not all removal functions will leave a node in unhashed * state. For example, hlist_nulls_del_init_rcu() does leave the * node in unhashed state, but hlist_nulls_del() does not. */ static inline int hlist_unhashed(const struct hlist_node *h) { return !h->pprev; } /** * hlist_unhashed_lockless - Version of hlist_unhashed for lockless use * @h: Node to be checked * * This variant of hlist_unhashed() must be used in lockless contexts * to avoid potential load-tearing. The READ_ONCE() is paired with the * various WRITE_ONCE() in hlist helpers that are defined below. */ static inline int hlist_unhashed_lockless(const struct hlist_node *h) { return !READ_ONCE(h->pprev); } /** * hlist_empty - Is the specified hlist_head structure an empty hlist? * @h: Structure to check. */ static inline int hlist_empty(const struct hlist_head *h) { return !READ_ONCE(h->first); } static inline void __hlist_del(struct hlist_node *n) { struct hlist_node *next = n->next; struct hlist_node **pprev = n->pprev; WRITE_ONCE(*pprev, next); if (next) WRITE_ONCE(next->pprev, pprev); } /** * hlist_del - Delete the specified hlist_node from its list * @n: Node to delete. * * Note that this function leaves the node in hashed state. Use * hlist_del_init() or similar instead to unhash @n. */ static inline void hlist_del(struct hlist_node *n) { __hlist_del(n); n->next = LIST_POISON1; n->pprev = LIST_POISON2; } /** * hlist_del_init - Delete the specified hlist_node from its list and initialize * @n: Node to delete. 
* * Note that this function leaves the node in unhashed state. */ static inline void hlist_del_init(struct hlist_node *n) { if (!hlist_unhashed(n)) { __hlist_del(n); INIT_HLIST_NODE(n); } } /** * hlist_add_head - add a new entry at the beginning of the hlist * @n: new entry to be added * @h: hlist head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. */ static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; WRITE_ONCE(n->next, first); if (first) WRITE_ONCE(first->pprev, &n->next); WRITE_ONCE(h->first, n); WRITE_ONCE(n->pprev, &h->first); } /** * hlist_add_before - add a new entry before the one specified * @n: new entry to be added * @next: hlist node to add it before, which must be non-NULL */ static inline void hlist_add_before(struct hlist_node *n, struct hlist_node *next) { WRITE_ONCE(n->pprev, next->pprev); WRITE_ONCE(n->next, next); WRITE_ONCE(next->pprev, &n->next); WRITE_ONCE(*(n->pprev), n); } /** * hlist_add_behind - add a new entry after the one specified * @n: new entry to be added * @prev: hlist node to add it after, which must be non-NULL */ static inline void hlist_add_behind(struct hlist_node *n, struct hlist_node *prev) { WRITE_ONCE(n->next, prev->next); WRITE_ONCE(prev->next, n); WRITE_ONCE(n->pprev, &prev->next); if (n->next) WRITE_ONCE(n->next->pprev, &n->next); } /** * hlist_add_fake - create a fake hlist consisting of a single headless node * @n: Node to make a fake list out of * * This makes @n appear to be its own predecessor on a headless hlist. * The point of this is to allow things like hlist_del() to work correctly * in cases where there is no list. */ static inline void hlist_add_fake(struct hlist_node *n) { n->pprev = &n->next; } /** * hlist_fake: Is this node a fake hlist? * @h: Node to check for being a self-referential fake hlist. */ static inline bool hlist_fake(struct hlist_node *h) { return h->pprev == &h->next; } /** * hlist_is_singular_node - is node the only element of the specified hlist? * @n: Node to check for singularity. * @h: Header for potentially singular list. * * Check whether the node is the only node of the head without * accessing head, thus avoiding unnecessary cache misses. */ static inline bool hlist_is_singular_node(struct hlist_node *n, struct hlist_head *h) { return !n->next && n->pprev == &h->first; } /** * hlist_move_list - Move an hlist * @old: hlist_head for old list. * @new: hlist_head for new list. * * Move a list from one list head to another. Fixup the pprev * reference of the first entry if it exists. */ static inline void hlist_move_list(struct hlist_head *old, struct hlist_head *new) { new->first = old->first; if (new->first) new->first->pprev = &new->first; old->first = NULL; } /** * hlist_splice_init() - move all entries from one list to another * @from: hlist_head from which entries will be moved * @last: last entry on the @from list * @to: hlist_head to which entries will be moved * * @to can be empty, @from must contain at least @last. 
*/ static inline void hlist_splice_init(struct hlist_head *from, struct hlist_node *last, struct hlist_head *to) { if (to->first) to->first->pprev = &last->next; last->next = to->first; to->first = from->first; from->first->pprev = &to->first; from->first = NULL; } #define hlist_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_for_each(pos, head) \ for (pos = (head)->first; pos ; pos = pos->next) #define hlist_for_each_safe(pos, n, head) \ for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \ pos = n) #define hlist_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ ____ptr ? hlist_entry(____ptr, type, member) : NULL; \ }) /** * hlist_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry(pos, head, member) \ for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\ pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_continue - iterate over a hlist continuing after current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_continue(pos, member) \ for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\ pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_from - iterate over a hlist continuing from current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_from(pos, member) \ for (; pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: a &struct hlist_node to use as temporary storage * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_safe(pos, n, head, member) \ for (pos = hlist_entry_safe((head)->first, typeof(*pos), member);\ pos && ({ n = pos->member.next; 1; }); \ pos = hlist_entry_safe(n, typeof(*pos), member)) /** * hlist_count_nodes - count nodes in the hlist * @head: the head for your hlist. */ static inline size_t hlist_count_nodes(struct hlist_head *head) { struct hlist_node *pos; size_t count = 0; hlist_for_each(pos, head) count++; return count; } #endif |
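/*
 * Editorial usage sketch (not part of the original header): embedding a
 * list_head in a structure and driving it with the helpers above.  The
 * names my_item, my_list, my_add(), my_walk() and my_drain() are
 * hypothetical; kfree() and pr_info() come from <linux/slab.h> and
 * <linux/printk.h> respectively.
 *
 *	struct my_item {
 *		int value;
 *		struct list_head node;
 *	};
 *
 *	static LIST_HEAD(my_list);
 *
 *	static void my_add(struct my_item *item)
 *	{
 *		list_add_tail(&item->node, &my_list);	// enqueue at the tail
 *	}
 *
 *	static void my_walk(void)
 *	{
 *		struct my_item *pos;
 *
 *		list_for_each_entry(pos, &my_list, node)
 *			pr_info("value=%d\n", pos->value);
 *	}
 *
 *	static void my_drain(void)
 *	{
 *		struct my_item *pos, *tmp;
 *
 *		// the _safe variant tolerates list_del() of the current entry
 *		list_for_each_entry_safe(pos, tmp, &my_list, node) {
 *			list_del(&pos->node);
 *			kfree(pos);
 *		}
 *	}
 */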
/* SPDX-License-Identifier: GPL-2.0 */ /* * sysfs.h - definitions for the device driver filesystem * * Copyright (c) 2001,2002 Patrick Mochel * Copyright (c) 2004 Silicon Graphics, Inc. * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * * Please see Documentation/filesystems/sysfs.rst for more information. 
*/ #ifndef _SYSFS_H_ #define _SYSFS_H_ #include <linux/kernfs.h> #include <linux/compiler.h> #include <linux/errno.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/kobject_ns.h> #include <linux/stat.h> #include <linux/atomic.h> struct kobject; struct module; struct bin_attribute; enum kobj_ns_type; struct attribute { const char *name; umode_t mode; #ifdef CONFIG_DEBUG_LOCK_ALLOC bool ignore_lockdep:1; struct lock_class_key *key; struct lock_class_key skey; #endif }; /** * sysfs_attr_init - initialize a dynamically allocated sysfs attribute * @attr: struct attribute to initialize * * Initialize a dynamically allocated struct attribute so we can * make lockdep happy. This is a new requirement for attributes * and initially this is only needed when lockdep is enabled. * Lockdep gives a nice error when your attribute is added to * sysfs if you don't have this. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC #define sysfs_attr_init(attr) \ do { \ static struct lock_class_key __key; \ \ (attr)->key = &__key; \ } while (0) #else #define sysfs_attr_init(attr) do {} while (0) #endif #ifdef CONFIG_CFI #define __SYSFS_FUNCTION_ALTERNATIVE(MEMBERS...) struct { MEMBERS } #else #define __SYSFS_FUNCTION_ALTERNATIVE(MEMBERS...) union { MEMBERS } #endif /** * struct attribute_group - data structure used to declare an attribute group. * @name: Optional: Attribute group name * If specified, the attribute group will be created in a * new subdirectory with this name. Additionally when a * group is named, @is_visible and @is_bin_visible may * return SYSFS_GROUP_INVISIBLE to control visibility of * the directory itself. * @is_visible: Optional: Function to return permissions associated with an * attribute of the group. Will be called repeatedly for * each non-binary attribute in the group. Only read/write * permissions as well as SYSFS_PREALLOC are accepted. Must * return 0 if an attribute is not visible. The returned * value will replace static permissions defined in struct * attribute. Use SYSFS_GROUP_VISIBLE() when assigning this * callback to specify separate _group_visible() and * _attr_visible() handlers. * @is_bin_visible: * Optional: Function to return permissions associated with a * binary attribute of the group. Will be called repeatedly * for each binary attribute in the group. Only read/write * permissions as well as SYSFS_PREALLOC (and the * visibility flags for named groups) are accepted. Must * return 0 if a binary attribute is not visible. The * returned value will replace static permissions defined * in struct bin_attribute. If @is_visible is not set, Use * SYSFS_GROUP_VISIBLE() when assigning this callback to * specify separate _group_visible() and _attr_visible() * handlers. * @bin_size: * Optional: Function to return the size of a binary attribute * of the group. Will be called repeatedly for each binary * attribute in the group. Overwrites the size field embedded * inside the attribute itself. * @attrs: Pointer to NULL terminated list of attributes. * @bin_attrs: Pointer to NULL terminated list of binary attributes. * Either attrs or bin_attrs or both must be provided. 
*/ struct attribute_group { const char *name; __SYSFS_FUNCTION_ALTERNATIVE( umode_t (*is_visible)(struct kobject *, struct attribute *, int); umode_t (*is_visible_const)(struct kobject *, const struct attribute *, int); ); umode_t (*is_bin_visible)(struct kobject *, const struct bin_attribute *, int); size_t (*bin_size)(struct kobject *, const struct bin_attribute *, int); union { struct attribute **attrs; const struct attribute *const *attrs_const; }; const struct bin_attribute *const *bin_attrs; }; #define SYSFS_PREALLOC 010000 #define SYSFS_GROUP_INVISIBLE 020000 /* * DEFINE_SYSFS_GROUP_VISIBLE(name): * A helper macro to pair with the assignment of ".is_visible = * SYSFS_GROUP_VISIBLE(name)", that arranges for the directory * associated with a named attribute_group to optionally be hidden. * This allows for static declaration of attribute_groups, and the * simplification of attribute visibility lifetime that implies, * without polluting sysfs with empty attribute directories. * Ex. * * static umode_t example_attr_visible(struct kobject *kobj, * struct attribute *attr, int n) * { * if (example_attr_condition) * return 0; * else if (ro_attr_condition) * return 0444; * return a->mode; * } * * static bool example_group_visible(struct kobject *kobj) * { * if (example_group_condition) * return false; * return true; * } * * DEFINE_SYSFS_GROUP_VISIBLE(example); * * static struct attribute_group example_group = { * .name = "example", * .is_visible = SYSFS_GROUP_VISIBLE(example), * .attrs = &example_attrs, * }; * * Note that it expects <name>_attr_visible and <name>_group_visible to * be defined. For cases where individual attributes do not need * separate visibility consideration, only entire group visibility at * once, see DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(). */ #define DEFINE_SYSFS_GROUP_VISIBLE(name) \ static inline umode_t sysfs_group_visible_##name( \ struct kobject *kobj, struct attribute *attr, int n) \ { \ if (n == 0 && !name##_group_visible(kobj)) \ return SYSFS_GROUP_INVISIBLE; \ return name##_attr_visible(kobj, attr, n); \ } /* * DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(name): * A helper macro to pair with SYSFS_GROUP_VISIBLE() that like * DEFINE_SYSFS_GROUP_VISIBLE() controls group visibility, but does * not require the implementation of a per-attribute visibility * callback. * Ex. * * static bool example_group_visible(struct kobject *kobj) * { * if (example_group_condition) * return false; * return true; * } * * DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(example); * * static struct attribute_group example_group = { * .name = "example", * .is_visible = SYSFS_GROUP_VISIBLE(example), * .attrs = &example_attrs, * }; */ #define DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(name) \ static inline umode_t sysfs_group_visible_##name( \ struct kobject *kobj, struct attribute *a, int n) \ { \ if (n == 0 && !name##_group_visible(kobj)) \ return SYSFS_GROUP_INVISIBLE; \ return a->mode; \ } /* * Same as DEFINE_SYSFS_GROUP_VISIBLE, but for groups with only binary * attributes. 
If an attribute_group defines both text and binary * attributes, the group visibility is determined by the function * specified to is_visible() not is_bin_visible() */ #define DEFINE_SYSFS_BIN_GROUP_VISIBLE(name) \ static inline umode_t sysfs_group_visible_##name( \ struct kobject *kobj, const struct bin_attribute *attr, int n) \ { \ if (n == 0 && !name##_group_visible(kobj)) \ return SYSFS_GROUP_INVISIBLE; \ return name##_attr_visible(kobj, attr, n); \ } #define DEFINE_SIMPLE_SYSFS_BIN_GROUP_VISIBLE(name) \ static inline umode_t sysfs_group_visible_##name( \ struct kobject *kobj, const struct bin_attribute *a, int n) \ { \ if (n == 0 && !name##_group_visible(kobj)) \ return SYSFS_GROUP_INVISIBLE; \ return a->mode; \ } #define SYSFS_GROUP_VISIBLE(fn) sysfs_group_visible_##fn /* * Use these macros to make defining attributes easier. * See include/linux/device.h for examples.. */ #define __ATTR(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _show, \ .store = _store, \ } #define __ATTR_PREALLOC(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), \ .mode = SYSFS_PREALLOC | VERIFY_OCTAL_PERMISSIONS(_mode) },\ .show = _show, \ .store = _store, \ } #define __ATTR_RO_MODE(_name, _mode) { \ .attr = { .name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _name##_show, \ } #define __ATTR_RO(_name) \ __ATTR_RO_MODE(_name, 0444) #define __ATTR_RW_MODE(_name, _mode) \ __ATTR(_name, _mode, _name##_show, _name##_store) #define __ATTR_WO(_name) \ __ATTR(_name, 0200, NULL, _name##_store) #define __ATTR_RW(_name) __ATTR(_name, 0644, _name##_show, _name##_store) #define __ATTR_NULL { .attr = { .name = NULL } } #ifdef CONFIG_DEBUG_LOCK_ALLOC #define __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), .mode = _mode, \ .ignore_lockdep = true }, \ .show = _show, \ .store = _store, \ } #else #define __ATTR_IGNORE_LOCKDEP __ATTR #endif #define __ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group *_name##_groups[] = { \ &_name##_group, \ NULL, \ } #define ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group _name##_group = { \ .attrs = _Generic(_name##_attrs, \ struct attribute **: \ _name##_attrs, \ const struct attribute *const *: \ (void *)_name##_attrs \ ), \ }; \ __ATTRIBUTE_GROUPS(_name) #define BIN_ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group _name##_group = { \ .bin_attrs = _name##_attrs, \ }; \ __ATTRIBUTE_GROUPS(_name) struct file; struct vm_area_struct; struct address_space; struct bin_attribute { struct attribute attr; size_t size; void *private; struct address_space *(*f_mapping)(void); ssize_t (*read)(struct file *, struct kobject *, const struct bin_attribute *, char *, loff_t, size_t); ssize_t (*write)(struct file *, struct kobject *, const struct bin_attribute *, char *, loff_t, size_t); loff_t (*llseek)(struct file *, struct kobject *, const struct bin_attribute *, loff_t, int); int (*mmap)(struct file *, struct kobject *, const struct bin_attribute *attr, struct vm_area_struct *vma); }; /** * sysfs_bin_attr_init - initialize a dynamically allocated bin_attribute * @attr: struct bin_attribute to initialize * * Initialize a dynamically allocated struct bin_attribute so we * can make lockdep happy. This is a new requirement for * attributes and initially this is only needed when lockdep is * enabled. Lockdep gives a nice error when your attribute is * added to sysfs if you don't have this. 
*/ #define sysfs_bin_attr_init(bin_attr) sysfs_attr_init(&(bin_attr)->attr) /* macros to create static binary attributes easier */ #define __BIN_ATTR(_name, _mode, _read, _write, _size) { \ .attr = { .name = __stringify(_name), .mode = _mode }, \ .read = _read, \ .write = _write, \ .size = _size, \ } #define __BIN_ATTR_RO(_name, _size) \ __BIN_ATTR(_name, 0444, _name##_read, NULL, _size) #define __BIN_ATTR_WO(_name, _size) \ __BIN_ATTR(_name, 0200, NULL, _name##_write, _size) #define __BIN_ATTR_RW(_name, _size) \ __BIN_ATTR(_name, 0644, _name##_read, _name##_write, _size) #define __BIN_ATTR_NULL __ATTR_NULL #define BIN_ATTR(_name, _mode, _read, _write, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR(_name, _mode, _read, \ _write, _size) #define BIN_ATTR_RO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_RO(_name, _size) #define BIN_ATTR_WO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_WO(_name, _size) #define BIN_ATTR_RW(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_RW(_name, _size) #define __BIN_ATTR_ADMIN_RO(_name, _size) \ __BIN_ATTR(_name, 0400, _name##_read, NULL, _size) #define __BIN_ATTR_ADMIN_RW(_name, _size) \ __BIN_ATTR(_name, 0600, _name##_read, _name##_write, _size) #define BIN_ATTR_ADMIN_RO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RO(_name, _size) #define BIN_ATTR_ADMIN_RW(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RW(_name, _size) #define __BIN_ATTR_SIMPLE_RO(_name, _mode) \ __BIN_ATTR(_name, _mode, sysfs_bin_attr_simple_read, NULL, 0) #define BIN_ATTR_SIMPLE_RO(_name) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_SIMPLE_RO(_name, 0444) #define BIN_ATTR_SIMPLE_ADMIN_RO(_name) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_SIMPLE_RO(_name, 0400) struct sysfs_ops { ssize_t (*show)(struct kobject *, struct attribute *, char *); ssize_t (*store)(struct kobject *, struct attribute *, const char *, size_t); }; #ifdef CONFIG_SYSFS int __must_check sysfs_create_dir_ns(struct kobject *kobj, const struct ns_common *ns); void sysfs_remove_dir(struct kobject *kobj); int __must_check sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const struct ns_common *new_ns); int __must_check sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const struct ns_common *new_ns); int __must_check sysfs_create_mount_point(struct kobject *parent_kobj, const char *name); void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name); int __must_check sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, const struct ns_common *ns); int __must_check sysfs_create_files(struct kobject *kobj, const struct attribute * const *attr); int __must_check sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode); struct kernfs_node *sysfs_break_active_protection(struct kobject *kobj, const struct attribute *attr); void sysfs_unbreak_active_protection(struct kernfs_node *kn); void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const struct ns_common *ns); bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr); void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *attr); int __must_check sysfs_create_bin_file(struct kobject *kobj, const struct bin_attribute *attr); void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr); int __must_check sysfs_create_link(struct kobject *kobj, struct 
kobject *target, const char *name); int __must_check sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, const char *name); void sysfs_remove_link(struct kobject *kobj, const char *name); int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *target, const char *old_name, const char *new_name, const struct ns_common *new_ns); void sysfs_delete_link(struct kobject *dir, struct kobject *targ, const char *name); int __must_check sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp); int __must_check sysfs_create_groups(struct kobject *kobj, const struct attribute_group *const *groups); int __must_check sysfs_update_groups(struct kobject *kobj, const struct attribute_group *const *groups); int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_remove_groups(struct kobject *kobj, const struct attribute_group *const *groups); int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group); void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group); int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp); int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name); void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name); int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, struct kobject *target_kobj, const char *target_name, const char *symlink_name); void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr); int __must_check sysfs_init(void); static inline void sysfs_enable_ns(struct kernfs_node *kn) { return kernfs_enable_ns(kn); } int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, kgid_t kgid); int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid); int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid); int sysfs_groups_change_owner(struct kobject *kobj, const struct attribute_group *const *groups, kuid_t kuid, kgid_t kgid); int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group *groups, kuid_t kuid, kgid_t kgid); __printf(2, 3) int sysfs_emit(char *buf, const char *fmt, ...); __printf(3, 4) int sysfs_emit_at(char *buf, int at, const char *fmt, ...); ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj, const struct bin_attribute *attr, char *buf, loff_t off, size_t count); #else /* CONFIG_SYSFS */ static inline int sysfs_create_dir_ns(struct kobject *kobj, const struct ns_common *ns) { return 0; } static inline void sysfs_remove_dir(struct kobject *kobj) { } static inline int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const struct ns_common *new_ns) { return 0; } static inline int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const struct ns_common *new_ns) { return 0; } static inline int sysfs_create_mount_point(struct kobject *parent_kobj, const char *name) { return 0; } static inline void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name) { } static inline int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, const struct ns_common *ns) { return 0; } static inline 
int sysfs_create_files(struct kobject *kobj, const struct attribute * const *attr) { return 0; } static inline int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode) { return 0; } static inline struct kernfs_node * sysfs_break_active_protection(struct kobject *kobj, const struct attribute *attr) { return NULL; } static inline void sysfs_unbreak_active_protection(struct kernfs_node *kn) { } static inline void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const struct ns_common *ns) { } static inline bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr) { return false; } static inline void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *attr) { } static inline int sysfs_create_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { return 0; } static inline void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { } static inline int sysfs_create_link(struct kobject *kobj, struct kobject *target, const char *name) { return 0; } static inline int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, const char *name) { return 0; } static inline void sysfs_remove_link(struct kobject *kobj, const char *name) { } static inline int sysfs_rename_link_ns(struct kobject *k, struct kobject *t, const char *old_name, const char *new_name, const struct ns_common *ns) { return 0; } static inline void sysfs_delete_link(struct kobject *k, struct kobject *t, const char *name) { } static inline int sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline int sysfs_create_groups(struct kobject *kobj, const struct attribute_group *const *groups) { return 0; } static inline int sysfs_update_groups(struct kobject *kobj, const struct attribute_group *const *groups) { return 0; } static inline int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp) { } static inline void sysfs_remove_groups(struct kobject *kobj, const struct attribute_group *const *groups) { } static inline int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group) { return 0; } static inline void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group) { } static inline int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp) { } static inline int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name) { return 0; } static inline void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name) { } static inline int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, struct kobject *target_kobj, const char *target_name, const char *symlink_name) { return 0; } static inline void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr) { } static inline int __must_check sysfs_init(void) { return 0; } static inline void sysfs_enable_ns(struct kernfs_node *kn) { } static inline int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, 
kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_groups_change_owner(struct kobject *kobj, const struct attribute_group *const *groups, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group *groups, kuid_t kuid, kgid_t kgid) { return 0; } __printf(2, 3) static inline int sysfs_emit(char *buf, const char *fmt, ...) { return 0; } __printf(3, 4) static inline int sysfs_emit_at(char *buf, int at, const char *fmt, ...) { return 0; } static inline ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj, const struct bin_attribute *attr, char *buf, loff_t off, size_t count) { return 0; } #endif /* CONFIG_SYSFS */ static inline int __must_check sysfs_create_file(struct kobject *kobj, const struct attribute *attr) { return sysfs_create_file_ns(kobj, attr, NULL); } static inline void sysfs_remove_file(struct kobject *kobj, const struct attribute *attr) { sysfs_remove_file_ns(kobj, attr, NULL); } static inline int sysfs_rename_link(struct kobject *kobj, struct kobject *target, const char *old_name, const char *new_name) { return sysfs_rename_link_ns(kobj, target, old_name, new_name, NULL); } static inline void sysfs_notify_dirent(struct kernfs_node *kn) { kernfs_notify(kn); } static inline struct kernfs_node *sysfs_get_dirent(struct kernfs_node *parent, const char *name) { return kernfs_find_and_get(parent, name); } static inline struct kernfs_node *sysfs_get(struct kernfs_node *kn) { kernfs_get(kn); return kn; } static inline void sysfs_put(struct kernfs_node *kn) { kernfs_put(kn); } /* Permissions on a sysfs file: you didn't miss the 0 prefix did you? */ #define VERIFY_OCTAL_PERMISSIONS(perms) \ (BUILD_BUG_ON_ZERO((perms) < 0) + \ BUILD_BUG_ON_ZERO((perms) > 0777) + \ /* USER_READABLE >= GROUP_READABLE >= OTHER_READABLE */ \ BUILD_BUG_ON_ZERO((((perms) >> 6) & 4) < (((perms) >> 3) & 4)) + \ BUILD_BUG_ON_ZERO((((perms) >> 3) & 4) < ((perms) & 4)) + \ /* USER_WRITABLE >= GROUP_WRITABLE */ \ BUILD_BUG_ON_ZERO((((perms) >> 6) & 2) < (((perms) >> 3) & 2)) + \ /* OTHER_WRITABLE? Generally considered a bad idea. */ \ BUILD_BUG_ON_ZERO((perms) & 2) + \ (perms)) #endif /* _SYSFS_H_ */ |
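As a rough illustration of how the helpers above fit together, here is a minimal sketch of a read/write attribute; it assumes struct kobj_attribute from <linux/kobject.h> and kstrtoint() from <linux/kstrtox.h>, and every example_* name is hypothetical rather than taken from sysfs.h.

static int example_value;	/* hypothetical state exposed via sysfs */

static ssize_t example_show(struct kobject *kobj, struct kobj_attribute *attr,
			    char *buf)
{
	/* sysfs_emit() enforces the one-PAGE_SIZE contract of show() callbacks */
	return sysfs_emit(buf, "%d\n", example_value);
}

static ssize_t example_store(struct kobject *kobj, struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	int ret = kstrtoint(buf, 10, &example_value);

	return ret ? ret : count;
}

/* __ATTR_RW() wires up example_show/example_store with mode 0644 */
static struct kobj_attribute example_attribute = __ATTR_RW(example);

static struct attribute *example_attrs[] = {
	&example_attribute.attr,
	NULL,				/* attribute lists are NULL-terminated */
};
/* Emits example_group and the NULL-terminated example_groups[] array */
ATTRIBUTE_GROUPS(example);

A caller would then typically register the whole set with sysfs_create_groups(kobj, example_groups) and tear it down with sysfs_remove_groups().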
/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/pm_qos.h> static inline void device_pm_init_common(struct device *dev) { if (!dev->power.early_init) { spin_lock_init(&dev->power.lock); dev->power.qos = NULL; dev->power.early_init = true; } } #ifdef CONFIG_PM static inline void pm_runtime_early_init(struct device *dev) { dev->power.disable_depth = 1; device_pm_init_common(dev); } extern void pm_runtime_init(struct device *dev); extern void pm_runtime_reinit(struct device *dev); extern void pm_runtime_remove(struct device *dev); extern u64 pm_runtime_active_time(struct device *dev); #define WAKE_IRQ_DEDICATED_ALLOCATED BIT(0) #define WAKE_IRQ_DEDICATED_MANAGED BIT(1) #define WAKE_IRQ_DEDICATED_REVERSE BIT(2) #define WAKE_IRQ_DEDICATED_MASK (WAKE_IRQ_DEDICATED_ALLOCATED | \ WAKE_IRQ_DEDICATED_MANAGED | \ WAKE_IRQ_DEDICATED_REVERSE) #define WAKE_IRQ_DEDICATED_ENABLED BIT(3) struct wake_irq { struct device *dev; unsigned int status; int irq; const char *name; }; extern void dev_pm_arm_wake_irq(struct wake_irq *wirq); extern void dev_pm_disarm_wake_irq(struct wake_irq *wirq); extern void dev_pm_enable_wake_irq_check(struct device *dev, bool can_change_status); extern void dev_pm_disable_wake_irq_check(struct device *dev, bool cond_disable); extern void dev_pm_enable_wake_irq_complete(struct device *dev); #ifdef CONFIG_PM_SLEEP extern void device_wakeup_attach_irq(struct device *dev, struct wake_irq *wakeirq); extern void device_wakeup_detach_irq(struct device *dev); extern void device_wakeup_arm_wake_irqs(void); extern void device_wakeup_disarm_wake_irqs(void); #else static inline void device_wakeup_attach_irq(struct device *dev, struct wake_irq *wakeirq) {} static inline void device_wakeup_detach_irq(struct device *dev) { } #endif /* CONFIG_PM_SLEEP */ /* * sysfs.c */ extern int dpm_sysfs_add(struct device *dev); extern void dpm_sysfs_remove(struct device *dev); extern void rpm_sysfs_remove(struct device *dev); extern int wakeup_sysfs_add(struct device *dev); extern void wakeup_sysfs_remove(struct device *dev); extern int pm_qos_sysfs_add_resume_latency(struct device *dev); extern void pm_qos_sysfs_remove_resume_latency(struct device *dev); extern int pm_qos_sysfs_add_flags(struct device *dev); extern void pm_qos_sysfs_remove_flags(struct device *dev); extern int pm_qos_sysfs_add_latency_tolerance(struct device *dev); extern void pm_qos_sysfs_remove_latency_tolerance(struct device *dev); extern int dpm_sysfs_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid); #else /* CONFIG_PM */ static inline void pm_runtime_early_init(struct device *dev) { device_pm_init_common(dev); } static inline void pm_runtime_init(struct device *dev) {} static inline void pm_runtime_reinit(struct device *dev) {} static inline void pm_runtime_remove(struct device *dev) {} static inline int dpm_sysfs_add(struct device *dev) { return 0; } static inline void dpm_sysfs_remove(struct device *dev) {} static inline int
dpm_sysfs_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid) { return 0; } #endif #ifdef CONFIG_PM_SLEEP /* kernel/power/main.c */ extern int pm_async_enabled; /* drivers/base/power/main.c */ extern struct list_head dpm_list; /* The active device list */ static inline struct device *to_device(struct list_head *entry) { return container_of(entry, struct device, power.entry); } extern void device_pm_sleep_init(struct device *dev); extern void device_pm_add(struct device *); extern void device_pm_remove(struct device *); extern void device_pm_move_before(struct device *, struct device *); extern void device_pm_move_after(struct device *, struct device *); extern void device_pm_move_last(struct device *); extern void device_pm_check_callbacks(struct device *dev); static inline bool device_pm_initialized(struct device *dev) { return dev->power.in_dpm_list; } /* drivers/base/power/wakeup_stats.c */ extern int wakeup_source_sysfs_add(struct device *parent, struct wakeup_source *ws); extern void wakeup_source_sysfs_remove(struct wakeup_source *ws); extern int pm_wakeup_source_sysfs_add(struct device *parent); #else /* !CONFIG_PM_SLEEP */ static inline void device_pm_sleep_init(struct device *dev) {} static inline void device_pm_add(struct device *dev) {} static inline void device_pm_remove(struct device *dev) { pm_runtime_remove(dev); } static inline void device_pm_move_before(struct device *deva, struct device *devb) {} static inline void device_pm_move_after(struct device *deva, struct device *devb) {} static inline void device_pm_move_last(struct device *dev) {} static inline void device_pm_check_callbacks(struct device *dev) {} static inline bool device_pm_initialized(struct device *dev) { return device_is_registered(dev); } static inline int pm_wakeup_source_sysfs_add(struct device *parent) { return 0; } #endif /* !CONFIG_PM_SLEEP */ static inline void device_pm_init(struct device *dev) { device_pm_init_common(dev); device_pm_sleep_init(dev); pm_runtime_init(dev); } |
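A brief sketch of how the dpm_list helpers above (to_device() and friends) are meant to be used inside drivers/base/power, assuming CONFIG_PM_SLEEP; the dpm_list_mtx locking done by the real callers in main.c is omitted, and the function name is invented for illustration.

static void example_dpm_walk(void)
{
	struct list_head *entry;

	/* Every entry on dpm_list is the power.entry field of some device */
	list_for_each(entry, &dpm_list) {
		struct device *dev = to_device(entry);

		dev_dbg(dev, "on the PM device list\n");
	}
}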
Use this with two other drivers: * * - Gadget driver, responding to requests (device); * - Host-side device driver, as already familiar in Linux. * * Having this all in one kernel can help some stages of development, * bypassing some hardware (and driver) issues. UML could help too. * * Note: The emulation does not include isochronous transfers! */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/delay.h> #include <linux/ioport.h> #include <linux/slab.h> #include <linux/string_choices.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/hrtimer.h> #include <linux/list.h> #include <linux/interrupt.h> #include <linux/platform_device.h> #include <linux/usb.h> #include <linux/usb/gadget.h> #include <linux/usb/hcd.h> #include <linux/scatterlist.h> #include <asm/byteorder.h> #include <linux/io.h> #include <asm/irq.h> #include <linux/unaligned.h> #define DRIVER_DESC "USB Host+Gadget Emulator" #define DRIVER_VERSION "02 May 2005" #define POWER_BUDGET 500 /* in mA; use 8 for low-power port testing */ #define POWER_BUDGET_3 900 /* in mA */ #define DUMMY_TIMER_INT_NSECS 125000 /* 1 microframe */ static const char driver_name[] = "dummy_hcd"; static const char driver_desc[] = "USB Host+Gadget Emulator"; static const char gadget_name[] = "dummy_udc"; MODULE_DESCRIPTION(DRIVER_DESC); MODULE_AUTHOR("David Brownell"); MODULE_LICENSE("GPL"); struct dummy_hcd_module_parameters { bool is_super_speed; bool is_high_speed; unsigned int num; }; static struct dummy_hcd_module_parameters mod_data = { .is_super_speed = false, .is_high_speed = true, .num = 1, }; module_param_named(is_super_speed, mod_data.is_super_speed, bool, S_IRUGO); MODULE_PARM_DESC(is_super_speed, "true to simulate SuperSpeed connection"); module_param_named(is_high_speed, mod_data.is_high_speed, bool, S_IRUGO); MODULE_PARM_DESC(is_high_speed, "true to simulate HighSpeed connection"); module_param_named(num, mod_data.num, uint, S_IRUGO); MODULE_PARM_DESC(num, "number of emulated controllers"); /*-------------------------------------------------------------------------*/ /* gadget side driver data structures */ struct dummy_ep { struct list_head queue; unsigned long last_io; /* jiffies timestamp */ struct usb_gadget *gadget; const struct usb_endpoint_descriptor *desc; struct usb_ep ep; unsigned halted:1; unsigned wedged:1; unsigned already_seen:1; unsigned setup_stage:1; unsigned stream_en:1; }; struct dummy_request { struct list_head queue; /* ep's requests */ struct usb_request req; }; static inline struct dummy_ep *usb_ep_to_dummy_ep(struct usb_ep *_ep) { return container_of(_ep, struct dummy_ep, ep); } static inline struct dummy_request *usb_request_to_dummy_request (struct usb_request *_req) { return container_of(_req, struct dummy_request, req); } /*-------------------------------------------------------------------------*/ /* * Every device has ep0 for control requests, plus up to 30 more endpoints, * in one of two types: * * - Configurable: direction (in/out), type (bulk, iso, etc), and endpoint * number can be changed. Names like "ep-a" are used for this type. * * - Fixed Function: in other cases. some characteristics may be mutable; * that'd be hardware-specific. Names like "ep12out-bulk" are used. * * Gadget drivers are responsible for not setting up conflicting endpoint * configurations, illegal or unsupported packet lengths, and so on. 
*/ static const char ep0name[] = "ep0"; static const struct { const char *name; const struct usb_ep_caps caps; } ep_info[] = { #define EP_INFO(_name, _caps) \ { \ .name = _name, \ .caps = _caps, \ } /* we don't provide isochronous endpoints since we don't support them */ #define TYPE_BULK_OR_INT (USB_EP_CAPS_TYPE_BULK | USB_EP_CAPS_TYPE_INT) /* everyone has ep0 */ EP_INFO(ep0name, USB_EP_CAPS(USB_EP_CAPS_TYPE_CONTROL, USB_EP_CAPS_DIR_ALL)), /* act like a pxa250: fifteen fixed function endpoints */ EP_INFO("ep1in-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_IN)), EP_INFO("ep2out-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_OUT)), /* EP_INFO("ep3in-iso", USB_EP_CAPS(USB_EP_CAPS_TYPE_ISO, USB_EP_CAPS_DIR_IN)), EP_INFO("ep4out-iso", USB_EP_CAPS(USB_EP_CAPS_TYPE_ISO, USB_EP_CAPS_DIR_OUT)), */ EP_INFO("ep5in-int", USB_EP_CAPS(USB_EP_CAPS_TYPE_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep6in-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_IN)), EP_INFO("ep7out-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_OUT)), /* EP_INFO("ep8in-iso", USB_EP_CAPS(USB_EP_CAPS_TYPE_ISO, USB_EP_CAPS_DIR_IN)), EP_INFO("ep9out-iso", USB_EP_CAPS(USB_EP_CAPS_TYPE_ISO, USB_EP_CAPS_DIR_OUT)), */ EP_INFO("ep10in-int", USB_EP_CAPS(USB_EP_CAPS_TYPE_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep11in-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_IN)), EP_INFO("ep12out-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_OUT)), /* EP_INFO("ep13in-iso", USB_EP_CAPS(USB_EP_CAPS_TYPE_ISO, USB_EP_CAPS_DIR_IN)), EP_INFO("ep14out-iso", USB_EP_CAPS(USB_EP_CAPS_TYPE_ISO, USB_EP_CAPS_DIR_OUT)), */ EP_INFO("ep15in-int", USB_EP_CAPS(USB_EP_CAPS_TYPE_INT, USB_EP_CAPS_DIR_IN)), /* or like sa1100: two fixed function endpoints */ EP_INFO("ep1out-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep2in-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_IN)), /* and now some generic EPs so we have enough in multi config */ EP_INFO("ep-aout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-bin", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep-cout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-dout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-ein", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep-fout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-gin", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep-hout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-iout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-jin", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep-kout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-lin", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep-mout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), #undef EP_INFO }; #define DUMMY_ENDPOINTS ARRAY_SIZE(ep_info) /*-------------------------------------------------------------------------*/ #define FIFO_SIZE 64 struct urbp { struct urb *urb; struct list_head urbp_list; struct sg_mapping_iter miter; u32 miter_started; }; enum dummy_rh_state { DUMMY_RH_RESET, DUMMY_RH_SUSPENDED, DUMMY_RH_RUNNING }; struct dummy_hcd { struct dummy *dum; enum dummy_rh_state rh_state; struct hrtimer timer; u32 port_status; u32 old_status; unsigned long re_timeout; struct usb_device *udev; struct list_head urbp_list; struct urbp *next_frame_urbp; u32 stream_en_ep; u8 num_stream[30 / 2]; unsigned 
timer_pending:1; unsigned active:1; unsigned old_active:1; unsigned resuming:1; }; struct dummy { spinlock_t lock; /* * DEVICE/GADGET side support */ struct dummy_ep ep[DUMMY_ENDPOINTS]; int address; int callback_usage; struct usb_gadget gadget; struct usb_gadget_driver *driver; struct dummy_request fifo_req; u8 fifo_buf[FIFO_SIZE]; u16 devstatus; unsigned ints_enabled:1; unsigned udc_suspended:1; unsigned pullup:1; /* * HOST side support */ struct dummy_hcd *hs_hcd; struct dummy_hcd *ss_hcd; }; static inline struct dummy_hcd *hcd_to_dummy_hcd(struct usb_hcd *hcd) { return (struct dummy_hcd *) (hcd->hcd_priv); } static inline struct usb_hcd *dummy_hcd_to_hcd(struct dummy_hcd *dum) { return container_of((void *) dum, struct usb_hcd, hcd_priv); } static inline struct device *dummy_dev(struct dummy_hcd *dum) { return dummy_hcd_to_hcd(dum)->self.controller; } static inline struct device *udc_dev(struct dummy *dum) { return dum->gadget.dev.parent; } static inline struct dummy *ep_to_dummy(struct dummy_ep *ep) { return container_of(ep->gadget, struct dummy, gadget); } static inline struct dummy_hcd *gadget_to_dummy_hcd(struct usb_gadget *gadget) { struct dummy *dum = container_of(gadget, struct dummy, gadget); if (dum->gadget.speed == USB_SPEED_SUPER) return dum->ss_hcd; else return dum->hs_hcd; } static inline struct dummy *gadget_dev_to_dummy(struct device *dev) { return container_of(dev, struct dummy, gadget.dev); } /*-------------------------------------------------------------------------*/ /* DEVICE/GADGET SIDE UTILITY ROUTINES */ /* called with spinlock held */ static void nuke(struct dummy *dum, struct dummy_ep *ep) { while (!list_empty(&ep->queue)) { struct dummy_request *req; req = list_entry(ep->queue.next, struct dummy_request, queue); list_del_init(&req->queue); req->req.status = -ESHUTDOWN; spin_unlock(&dum->lock); usb_gadget_giveback_request(&ep->ep, &req->req); spin_lock(&dum->lock); } } /* caller must hold lock */ static void stop_activity(struct dummy *dum) { int i; /* prevent any more requests */ dum->address = 0; /* The timer is left running so that outstanding URBs can fail */ /* nuke any pending requests first, so driver i/o is quiesced */ for (i = 0; i < DUMMY_ENDPOINTS; ++i) nuke(dum, &dum->ep[i]); /* driver now does any non-usb quiescing necessary */ } /** * set_link_state_by_speed() - Sets the current state of the link according to * the hcd speed * @dum_hcd: pointer to the dummy_hcd structure to update the link state for * * This function updates the port_status according to the link state and the * speed of the hcd. 
*/ static void set_link_state_by_speed(struct dummy_hcd *dum_hcd) { struct dummy *dum = dum_hcd->dum; if (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3) { if ((dum_hcd->port_status & USB_SS_PORT_STAT_POWER) == 0) { dum_hcd->port_status = 0; } else if (!dum->pullup || dum->udc_suspended) { /* UDC suspend must cause a disconnect */ dum_hcd->port_status &= ~(USB_PORT_STAT_CONNECTION | USB_PORT_STAT_ENABLE); if ((dum_hcd->old_status & USB_PORT_STAT_CONNECTION) != 0) dum_hcd->port_status |= (USB_PORT_STAT_C_CONNECTION << 16); } else { /* device is connected and not suspended */ dum_hcd->port_status |= (USB_PORT_STAT_CONNECTION | USB_PORT_STAT_SPEED_5GBPS) ; if ((dum_hcd->old_status & USB_PORT_STAT_CONNECTION) == 0) dum_hcd->port_status |= (USB_PORT_STAT_C_CONNECTION << 16); if ((dum_hcd->port_status & USB_PORT_STAT_ENABLE) && (dum_hcd->port_status & USB_PORT_STAT_LINK_STATE) == USB_SS_PORT_LS_U0 && dum_hcd->rh_state != DUMMY_RH_SUSPENDED) dum_hcd->active = 1; } } else { if ((dum_hcd->port_status & USB_PORT_STAT_POWER) == 0) { dum_hcd->port_status = 0; } else if (!dum->pullup || dum->udc_suspended) { /* UDC suspend must cause a disconnect */ dum_hcd->port_status &= ~(USB_PORT_STAT_CONNECTION | USB_PORT_STAT_ENABLE | USB_PORT_STAT_LOW_SPEED | USB_PORT_STAT_HIGH_SPEED | USB_PORT_STAT_SUSPEND); if ((dum_hcd->old_status & USB_PORT_STAT_CONNECTION) != 0) dum_hcd->port_status |= (USB_PORT_STAT_C_CONNECTION << 16); } else { dum_hcd->port_status |= USB_PORT_STAT_CONNECTION; if ((dum_hcd->old_status & USB_PORT_STAT_CONNECTION) == 0) dum_hcd->port_status |= (USB_PORT_STAT_C_CONNECTION << 16); if ((dum_hcd->port_status & USB_PORT_STAT_ENABLE) == 0) dum_hcd->port_status &= ~USB_PORT_STAT_SUSPEND; else if ((dum_hcd->port_status & USB_PORT_STAT_SUSPEND) == 0 && dum_hcd->rh_state != DUMMY_RH_SUSPENDED) dum_hcd->active = 1; } } } /* caller must hold lock */ static void set_link_state(struct dummy_hcd *dum_hcd) __must_hold(&dum->lock) { struct dummy *dum = dum_hcd->dum; unsigned int power_bit; dum_hcd->active = 0; if (dum->pullup) if ((dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3 && dum->gadget.speed != USB_SPEED_SUPER) || (dummy_hcd_to_hcd(dum_hcd)->speed != HCD_USB3 && dum->gadget.speed == USB_SPEED_SUPER)) return; set_link_state_by_speed(dum_hcd); power_bit = (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3 ? USB_SS_PORT_STAT_POWER : USB_PORT_STAT_POWER); if ((dum_hcd->port_status & USB_PORT_STAT_ENABLE) == 0 || dum_hcd->active) dum_hcd->resuming = 0; /* Currently !connected or in reset */ if ((dum_hcd->port_status & power_bit) == 0 || (dum_hcd->port_status & USB_PORT_STAT_RESET) != 0) { unsigned int disconnect = power_bit & dum_hcd->old_status & (~dum_hcd->port_status); unsigned int reset = USB_PORT_STAT_RESET & (~dum_hcd->old_status) & dum_hcd->port_status; /* Report reset and disconnect events to the driver */ if (dum->ints_enabled && (disconnect || reset)) { ++dum->callback_usage; /* * stop_activity() can drop dum->lock, so it must * not come between the dum->ints_enabled test * and the ++dum->callback_usage. 
*/ stop_activity(dum); spin_unlock(&dum->lock); if (reset) usb_gadget_udc_reset(&dum->gadget, dum->driver); else dum->driver->disconnect(&dum->gadget); spin_lock(&dum->lock); --dum->callback_usage; } } else if (dum_hcd->active != dum_hcd->old_active && dum->ints_enabled) { ++dum->callback_usage; spin_unlock(&dum->lock); if (dum_hcd->old_active && dum->driver->suspend) dum->driver->suspend(&dum->gadget); else if (!dum_hcd->old_active && dum->driver->resume) dum->driver->resume(&dum->gadget); spin_lock(&dum->lock); --dum->callback_usage; } dum_hcd->old_status = dum_hcd->port_status; dum_hcd->old_active = dum_hcd->active; } /*-------------------------------------------------------------------------*/ /* DEVICE/GADGET SIDE DRIVER * * This only tracks gadget state. All the work is done when the host * side tries some (emulated) i/o operation. Real device controller * drivers would do real i/o using dma, fifos, irqs, timers, etc. */ #define is_enabled(dum) \ (dum->port_status & USB_PORT_STAT_ENABLE) static int dummy_enable(struct usb_ep *_ep, const struct usb_endpoint_descriptor *desc) { struct dummy *dum; struct dummy_hcd *dum_hcd; struct dummy_ep *ep; unsigned max; int retval; ep = usb_ep_to_dummy_ep(_ep); if (!_ep || !desc || ep->desc || _ep->name == ep0name || desc->bDescriptorType != USB_DT_ENDPOINT) return -EINVAL; dum = ep_to_dummy(ep); if (!dum->driver) return -ESHUTDOWN; dum_hcd = gadget_to_dummy_hcd(&dum->gadget); if (!is_enabled(dum_hcd)) return -ESHUTDOWN; /* * For HS/FS devices only bits 0..10 of the wMaxPacketSize represent the * maximum packet size. * For SS devices the wMaxPacketSize is limited by 1024. */ max = usb_endpoint_maxp(desc); /* drivers must not request bad settings, since lower levels * (hardware or its drivers) may not check. some endpoints * can't do iso, many have maxpacket limitations, etc. * * since this "hardware" driver is here to help debugging, we * have some extra sanity checks. (there could be more though, * especially for "ep9out" style fixed function ones.) 
*/ retval = -EINVAL; switch (usb_endpoint_type(desc)) { case USB_ENDPOINT_XFER_BULK: if (strstr(ep->ep.name, "-iso") || strstr(ep->ep.name, "-int")) { goto done; } switch (dum->gadget.speed) { case USB_SPEED_SUPER: if (max == 1024) break; goto done; case USB_SPEED_HIGH: if (max == 512) break; goto done; case USB_SPEED_FULL: if (max == 8 || max == 16 || max == 32 || max == 64) /* we'll fake any legal size */ break; /* save a return statement */ fallthrough; default: goto done; } break; case USB_ENDPOINT_XFER_INT: if (strstr(ep->ep.name, "-iso")) /* bulk is ok */ goto done; /* real hardware might not handle all packet sizes */ switch (dum->gadget.speed) { case USB_SPEED_SUPER: case USB_SPEED_HIGH: if (max <= 1024) break; /* save a return statement */ fallthrough; case USB_SPEED_FULL: if (max <= 64) break; /* save a return statement */ fallthrough; default: if (max <= 8) break; goto done; } break; case USB_ENDPOINT_XFER_ISOC: if (strstr(ep->ep.name, "-bulk") || strstr(ep->ep.name, "-int")) goto done; /* real hardware might not handle all packet sizes */ switch (dum->gadget.speed) { case USB_SPEED_SUPER: case USB_SPEED_HIGH: if (max <= 1024) break; /* save a return statement */ fallthrough; case USB_SPEED_FULL: if (max <= 1023) break; /* save a return statement */ fallthrough; default: goto done; } break; default: /* few chips support control except on ep0 */ goto done; } _ep->maxpacket = max; if (usb_ss_max_streams(_ep->comp_desc)) { if (!usb_endpoint_xfer_bulk(desc)) { dev_err(udc_dev(dum), "Can't enable stream support on " "non-bulk ep %s\n", _ep->name); return -EINVAL; } ep->stream_en = 1; } ep->desc = desc; dev_dbg(udc_dev(dum), "enabled %s (ep%d%s-%s) maxpacket %d stream %s\n", _ep->name, usb_endpoint_num(desc), (desc->bEndpointAddress & USB_DIR_IN) ? "in" : "out", usb_ep_type_string(usb_endpoint_type(desc)), max, str_enabled_disabled(ep->stream_en)); /* at this point real hardware should be NAKing transfers * to that endpoint, until a buffer is queued to it. 
*/ ep->halted = ep->wedged = 0; retval = 0; done: return retval; } static int dummy_disable(struct usb_ep *_ep) { struct dummy_ep *ep; struct dummy *dum; unsigned long flags; ep = usb_ep_to_dummy_ep(_ep); if (!_ep || !ep->desc || _ep->name == ep0name) return -EINVAL; dum = ep_to_dummy(ep); spin_lock_irqsave(&dum->lock, flags); ep->desc = NULL; ep->stream_en = 0; nuke(dum, ep); spin_unlock_irqrestore(&dum->lock, flags); dev_dbg(udc_dev(dum), "disabled %s\n", _ep->name); return 0; } static struct usb_request *dummy_alloc_request(struct usb_ep *_ep, gfp_t mem_flags) { struct dummy_request *req; if (!_ep) return NULL; req = kzalloc_obj(*req, mem_flags); if (!req) return NULL; INIT_LIST_HEAD(&req->queue); return &req->req; } static void dummy_free_request(struct usb_ep *_ep, struct usb_request *_req) { struct dummy_request *req; if (!_ep || !_req) { WARN_ON(1); return; } req = usb_request_to_dummy_request(_req); WARN_ON(!list_empty(&req->queue)); kfree(req); } static void fifo_complete(struct usb_ep *ep, struct usb_request *req) { } static int dummy_queue(struct usb_ep *_ep, struct usb_request *_req, gfp_t mem_flags) { struct dummy_ep *ep; struct dummy_request *req; struct dummy *dum; struct dummy_hcd *dum_hcd; unsigned long flags; req = usb_request_to_dummy_request(_req); if (!_req || !list_empty(&req->queue) || !_req->complete) return -EINVAL; ep = usb_ep_to_dummy_ep(_ep); if (!_ep || (!ep->desc && _ep->name != ep0name)) return -EINVAL; dum = ep_to_dummy(ep); dum_hcd = gadget_to_dummy_hcd(&dum->gadget); if (!dum->driver || !is_enabled(dum_hcd)) return -ESHUTDOWN; #if 0 dev_dbg(udc_dev(dum), "ep %p queue req %p to %s, len %d buf %p\n", ep, _req, _ep->name, _req->length, _req->buf); #endif _req->status = -EINPROGRESS; _req->actual = 0; spin_lock_irqsave(&dum->lock, flags); /* implement an emulated single-request FIFO */ if (ep->desc && (ep->desc->bEndpointAddress & USB_DIR_IN) && list_empty(&dum->fifo_req.queue) && list_empty(&ep->queue) && _req->length <= FIFO_SIZE) { req = &dum->fifo_req; req->req = *_req; req->req.buf = dum->fifo_buf; memcpy(dum->fifo_buf, _req->buf, _req->length); req->req.context = dum; req->req.complete = fifo_complete; list_add_tail(&req->queue, &ep->queue); spin_unlock(&dum->lock); _req->actual = _req->length; _req->status = 0; usb_gadget_giveback_request(_ep, _req); spin_lock(&dum->lock); } else list_add_tail(&req->queue, &ep->queue); spin_unlock_irqrestore(&dum->lock, flags); /* real hardware would likely enable transfers here, in case * it'd been left NAKing. 
*/ return 0; } static int dummy_dequeue(struct usb_ep *_ep, struct usb_request *_req) { struct dummy_ep *ep; struct dummy *dum; int retval = -EINVAL; unsigned long flags; struct dummy_request *req = NULL, *iter; if (!_ep || !_req) return retval; ep = usb_ep_to_dummy_ep(_ep); dum = ep_to_dummy(ep); if (!dum->driver) return -ESHUTDOWN; spin_lock_irqsave(&dum->lock, flags); list_for_each_entry(iter, &ep->queue, queue) { if (&iter->req != _req) continue; list_del_init(&iter->queue); _req->status = -ECONNRESET; req = iter; retval = 0; break; } if (retval == 0) { dev_dbg(udc_dev(dum), "dequeued req %p from %s, len %d buf %p\n", req, _ep->name, _req->length, _req->buf); spin_unlock(&dum->lock); usb_gadget_giveback_request(_ep, _req); spin_lock(&dum->lock); } spin_unlock_irqrestore(&dum->lock, flags); return retval; } static int dummy_set_halt_and_wedge(struct usb_ep *_ep, int value, int wedged) { struct dummy_ep *ep; struct dummy *dum; if (!_ep) return -EINVAL; ep = usb_ep_to_dummy_ep(_ep); dum = ep_to_dummy(ep); if (!dum->driver) return -ESHUTDOWN; if (!value) ep->halted = ep->wedged = 0; else if (ep->desc && (ep->desc->bEndpointAddress & USB_DIR_IN) && !list_empty(&ep->queue)) return -EAGAIN; else { ep->halted = 1; if (wedged) ep->wedged = 1; } /* FIXME clear emulated data toggle too */ return 0; } static int dummy_set_halt(struct usb_ep *_ep, int value) { return dummy_set_halt_and_wedge(_ep, value, 0); } static int dummy_set_wedge(struct usb_ep *_ep) { if (!_ep || _ep->name == ep0name) return -EINVAL; return dummy_set_halt_and_wedge(_ep, 1, 1); } static const struct usb_ep_ops dummy_ep_ops = { .enable = dummy_enable, .disable = dummy_disable, .alloc_request = dummy_alloc_request, .free_request = dummy_free_request, .queue = dummy_queue, .dequeue = dummy_dequeue, .set_halt = dummy_set_halt, .set_wedge = dummy_set_wedge, }; /*-------------------------------------------------------------------------*/ /* there are both host and device side versions of this call ... */ static int dummy_g_get_frame(struct usb_gadget *_gadget) { struct timespec64 ts64; ktime_get_ts64(&ts64); return ts64.tv_nsec / NSEC_PER_MSEC; } static int dummy_wakeup(struct usb_gadget *_gadget) { struct dummy_hcd *dum_hcd; dum_hcd = gadget_to_dummy_hcd(_gadget); if (!(dum_hcd->dum->devstatus & ((1 << USB_DEVICE_B_HNP_ENABLE) | (1 << USB_DEVICE_REMOTE_WAKEUP)))) return -EINVAL; if ((dum_hcd->port_status & USB_PORT_STAT_CONNECTION) == 0) return -ENOLINK; if ((dum_hcd->port_status & USB_PORT_STAT_SUSPEND) == 0 && dum_hcd->rh_state != DUMMY_RH_SUSPENDED) return -EIO; /* FIXME: What if the root hub is suspended but the port isn't? 
*/ /* hub notices our request, issues downstream resume, etc */ dum_hcd->resuming = 1; dum_hcd->re_timeout = jiffies + msecs_to_jiffies(20); mod_timer(&dummy_hcd_to_hcd(dum_hcd)->rh_timer, dum_hcd->re_timeout); return 0; } static int dummy_set_selfpowered(struct usb_gadget *_gadget, int value) { struct dummy *dum; _gadget->is_selfpowered = (value != 0); dum = gadget_to_dummy_hcd(_gadget)->dum; if (value) dum->devstatus |= (1 << USB_DEVICE_SELF_POWERED); else dum->devstatus &= ~(1 << USB_DEVICE_SELF_POWERED); return 0; } static void dummy_udc_update_ep0(struct dummy *dum) { if (dum->gadget.speed == USB_SPEED_SUPER) dum->ep[0].ep.maxpacket = 9; else dum->ep[0].ep.maxpacket = 64; } static int dummy_pullup(struct usb_gadget *_gadget, int value) { struct dummy_hcd *dum_hcd; struct dummy *dum; unsigned long flags; dum = gadget_dev_to_dummy(&_gadget->dev); dum_hcd = gadget_to_dummy_hcd(_gadget); spin_lock_irqsave(&dum->lock, flags); dum->pullup = (value != 0); set_link_state(dum_hcd); spin_unlock_irqrestore(&dum->lock, flags); usb_hcd_poll_rh_status(dummy_hcd_to_hcd(dum_hcd)); return 0; } static void dummy_udc_set_speed(struct usb_gadget *_gadget, enum usb_device_speed speed) { struct dummy *dum; dum = gadget_dev_to_dummy(&_gadget->dev); dum->gadget.speed = speed; dummy_udc_update_ep0(dum); } static void dummy_udc_async_callbacks(struct usb_gadget *_gadget, bool enable) { struct dummy *dum = gadget_dev_to_dummy(&_gadget->dev); spin_lock_irq(&dum->lock); dum->ints_enabled = enable; if (!enable) { /* * Emulate synchronize_irq(): wait for callbacks to finish. * This has to happen after emulated interrupts are disabled * (dum->ints_enabled is clear) and before the unbind callback, * just like the call to synchronize_irq() in * gadget/udc/core:gadget_unbind_driver(). */ while (dum->callback_usage > 0) { spin_unlock_irq(&dum->lock); usleep_range(1000, 2000); spin_lock_irq(&dum->lock); } } spin_unlock_irq(&dum->lock); } static int dummy_udc_start(struct usb_gadget *g, struct usb_gadget_driver *driver); static int dummy_udc_stop(struct usb_gadget *g); static const struct usb_gadget_ops dummy_ops = { .get_frame = dummy_g_get_frame, .wakeup = dummy_wakeup, .set_selfpowered = dummy_set_selfpowered, .pullup = dummy_pullup, .udc_start = dummy_udc_start, .udc_stop = dummy_udc_stop, .udc_set_speed = dummy_udc_set_speed, .udc_async_callbacks = dummy_udc_async_callbacks, }; /*-------------------------------------------------------------------------*/ /* "function" sysfs attribute */ static ssize_t function_show(struct device *dev, struct device_attribute *attr, char *buf) { struct dummy *dum = gadget_dev_to_dummy(dev); if (!dum->driver || !dum->driver->function) return 0; return scnprintf(buf, PAGE_SIZE, "%s\n", dum->driver->function); } static DEVICE_ATTR_RO(function); /*-------------------------------------------------------------------------*/ /* * Driver registration/unregistration. * * This is basically hardware-specific; there's usually only one real USB * device (not host) controller since that's how USB devices are intended * to work. So most implementations of these api calls will rely on the * fact that only one driver will ever bind to the hardware. But curious * hardware can be built with discrete components, so the gadget API doesn't * require that assumption. * * For this emulator, it might be convenient to create a usb device * for each driver that registers: just add to a big root hub. 
*/ static int dummy_udc_start(struct usb_gadget *g, struct usb_gadget_driver *driver) { struct dummy_hcd *dum_hcd = gadget_to_dummy_hcd(g); struct dummy *dum = dum_hcd->dum; switch (g->speed) { /* All the speeds we support */ case USB_SPEED_LOW: case USB_SPEED_FULL: case USB_SPEED_HIGH: case USB_SPEED_SUPER: break; default: dev_err(dummy_dev(dum_hcd), "Unsupported driver max speed %d\n", driver->max_speed); return -EINVAL; } /* * DEVICE side init ... the layer above hardware, which * can't enumerate without help from the driver we're binding. */ spin_lock_irq(&dum->lock); dum->devstatus = 0; dum->driver = driver; spin_unlock_irq(&dum->lock); return 0; } static int dummy_udc_stop(struct usb_gadget *g) { struct dummy_hcd *dum_hcd = gadget_to_dummy_hcd(g); struct dummy *dum = dum_hcd->dum; spin_lock_irq(&dum->lock); dum->ints_enabled = 0; stop_activity(dum); dum->driver = NULL; spin_unlock_irq(&dum->lock); return 0; } #undef is_enabled /* The gadget structure is stored inside the hcd structure and will be * released along with it. */ static void init_dummy_udc_hw(struct dummy *dum) { int i; INIT_LIST_HEAD(&dum->gadget.ep_list); for (i = 0; i < DUMMY_ENDPOINTS; i++) { struct dummy_ep *ep = &dum->ep[i]; if (!ep_info[i].name) break; ep->ep.name = ep_info[i].name; ep->ep.caps = ep_info[i].caps; ep->ep.ops = &dummy_ep_ops; list_add_tail(&ep->ep.ep_list, &dum->gadget.ep_list); ep->halted = ep->wedged = ep->already_seen = ep->setup_stage = 0; usb_ep_set_maxpacket_limit(&ep->ep, ~0); ep->ep.max_streams = 16; ep->last_io = jiffies; ep->gadget = &dum->gadget; ep->desc = NULL; INIT_LIST_HEAD(&ep->queue); } dum->gadget.ep0 = &dum->ep[0].ep; list_del_init(&dum->ep[0].ep.ep_list); INIT_LIST_HEAD(&dum->fifo_req.queue); #ifdef CONFIG_USB_OTG dum->gadget.is_otg = 1; #endif } static int dummy_udc_probe(struct platform_device *pdev) { struct dummy *dum; int rc; dum = *((void **)dev_get_platdata(&pdev->dev)); /* Clear usb_gadget region for new registration to udc-core */ memzero_explicit(&dum->gadget, sizeof(struct usb_gadget)); dum->gadget.name = gadget_name; dum->gadget.ops = &dummy_ops; if (mod_data.is_super_speed) dum->gadget.max_speed = USB_SPEED_SUPER; else if (mod_data.is_high_speed) dum->gadget.max_speed = USB_SPEED_HIGH; else dum->gadget.max_speed = USB_SPEED_FULL; dum->gadget.dev.parent = &pdev->dev; init_dummy_udc_hw(dum); rc = usb_add_gadget_udc(&pdev->dev, &dum->gadget); if (rc < 0) goto err_udc; rc = device_create_file(&dum->gadget.dev, &dev_attr_function); if (rc < 0) goto err_dev; platform_set_drvdata(pdev, dum); return rc; err_dev: usb_del_gadget_udc(&dum->gadget); err_udc: return rc; } static void dummy_udc_remove(struct platform_device *pdev) { struct dummy *dum = platform_get_drvdata(pdev); device_remove_file(&dum->gadget.dev, &dev_attr_function); usb_del_gadget_udc(&dum->gadget); } static void dummy_udc_pm(struct dummy *dum, struct dummy_hcd *dum_hcd, int suspend) { spin_lock_irq(&dum->lock); dum->udc_suspended = suspend; set_link_state(dum_hcd); spin_unlock_irq(&dum->lock); } static int dummy_udc_suspend(struct platform_device *pdev, pm_message_t state) { struct dummy *dum = platform_get_drvdata(pdev); struct dummy_hcd *dum_hcd = gadget_to_dummy_hcd(&dum->gadget); dev_dbg(&pdev->dev, "%s\n", __func__); dummy_udc_pm(dum, dum_hcd, 1); usb_hcd_poll_rh_status(dummy_hcd_to_hcd(dum_hcd)); return 0; } static int dummy_udc_resume(struct platform_device *pdev) { struct dummy *dum = platform_get_drvdata(pdev); struct dummy_hcd *dum_hcd = gadget_to_dummy_hcd(&dum->gadget); dev_dbg(&pdev->dev, "%s\n", 
__func__); dummy_udc_pm(dum, dum_hcd, 0); usb_hcd_poll_rh_status(dummy_hcd_to_hcd(dum_hcd)); return 0; } static struct platform_driver dummy_udc_driver = { .probe = dummy_udc_probe, .remove = dummy_udc_remove, .suspend = dummy_udc_suspend, .resume = dummy_udc_resume, .driver = { .name = gadget_name, }, }; /*-------------------------------------------------------------------------*/ static unsigned int dummy_get_ep_idx(const struct usb_endpoint_descriptor *desc) { unsigned int index; index = usb_endpoint_num(desc) << 1; if (usb_endpoint_dir_in(desc)) index |= 1; return index; } /* HOST SIDE DRIVER * * this uses the hcd framework to hook up to host side drivers. * its root hub will only have one device, otherwise it acts like * a normal host controller. * * when urbs are queued, they're just stuck on a list that we * scan in a timer callback. that callback connects writes from * the host with reads from the device, and so on, based on the * usb 2.0 rules. */ static int dummy_ep_stream_en(struct dummy_hcd *dum_hcd, struct urb *urb) { const struct usb_endpoint_descriptor *desc = &urb->ep->desc; u32 index; if (!usb_endpoint_xfer_bulk(desc)) return 0; index = dummy_get_ep_idx(desc); return (1 << index) & dum_hcd->stream_en_ep; } /* * The max stream number is saved as a nibble so for the 30 possible endpoints * we only 15 bytes of memory. Therefore we are limited to max 16 streams (0 * means we use only 1 stream). The maximum according to the spec is 16bit so * if the 16 stream limit is about to go, the array size should be incremented * to 30 elements of type u16. */ static int get_max_streams_for_pipe(struct dummy_hcd *dum_hcd, unsigned int pipe) { int max_streams; max_streams = dum_hcd->num_stream[usb_pipeendpoint(pipe)]; if (usb_pipeout(pipe)) max_streams >>= 4; else max_streams &= 0xf; max_streams++; return max_streams; } static void set_max_streams_for_pipe(struct dummy_hcd *dum_hcd, unsigned int pipe, unsigned int streams) { int max_streams; streams--; max_streams = dum_hcd->num_stream[usb_pipeendpoint(pipe)]; if (usb_pipeout(pipe)) { streams <<= 4; max_streams &= 0xf; } else { max_streams &= 0xf0; } max_streams |= streams; dum_hcd->num_stream[usb_pipeendpoint(pipe)] = max_streams; } static int dummy_validate_stream(struct dummy_hcd *dum_hcd, struct urb *urb) { unsigned int max_streams; int enabled; enabled = dummy_ep_stream_en(dum_hcd, urb); if (!urb->stream_id) { if (enabled) return -EINVAL; return 0; } if (!enabled) return -EINVAL; max_streams = get_max_streams_for_pipe(dum_hcd, usb_pipeendpoint(urb->pipe)); if (urb->stream_id > max_streams) { dev_err(dummy_dev(dum_hcd), "Stream id %d is out of range.\n", urb->stream_id); BUG(); return -EINVAL; } return 0; } static int dummy_urb_enqueue( struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flags ) { struct dummy_hcd *dum_hcd; struct urbp *urbp; unsigned long flags; int rc; urbp = kmalloc_obj(*urbp, mem_flags); if (!urbp) return -ENOMEM; urbp->urb = urb; urbp->miter_started = 0; dum_hcd = hcd_to_dummy_hcd(hcd); spin_lock_irqsave(&dum_hcd->dum->lock, flags); rc = dummy_validate_stream(dum_hcd, urb); if (rc) { kfree(urbp); goto done; } rc = usb_hcd_link_urb_to_ep(hcd, urb); if (rc) { kfree(urbp); goto done; } if (!dum_hcd->udev) { dum_hcd->udev = urb->dev; usb_get_dev(dum_hcd->udev); } else if (unlikely(dum_hcd->udev != urb->dev)) dev_err(dummy_dev(dum_hcd), "usb_device address has changed!\n"); list_add_tail(&urbp->urbp_list, &dum_hcd->urbp_list); urb->hcpriv = urbp; if (!dum_hcd->next_frame_urbp) dum_hcd->next_frame_urbp = urbp; if 
(usb_pipetype(urb->pipe) == PIPE_CONTROL) urb->error_count = 1; /* mark as a new urb */ /* kick the scheduler, it'll do the rest */ if (!dum_hcd->timer_pending) { dum_hcd->timer_pending = 1; hrtimer_start(&dum_hcd->timer, ns_to_ktime(DUMMY_TIMER_INT_NSECS), HRTIMER_MODE_REL_SOFT); } done: spin_unlock_irqrestore(&dum_hcd->dum->lock, flags); return rc; } static int dummy_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) { struct dummy_hcd *dum_hcd; unsigned long flags; int rc; /* giveback happens automatically in timer callback, * so make sure the callback happens */ dum_hcd = hcd_to_dummy_hcd(hcd); spin_lock_irqsave(&dum_hcd->dum->lock, flags); rc = usb_hcd_check_unlink_urb(hcd, urb, status); if (rc == 0 && !dum_hcd->timer_pending) { dum_hcd->timer_pending = 1; hrtimer_start(&dum_hcd->timer, ns_to_ktime(0), HRTIMER_MODE_REL_SOFT); } spin_unlock_irqrestore(&dum_hcd->dum->lock, flags); return rc; } static int dummy_perform_transfer(struct urb *urb, struct dummy_request *req, u32 len) { void *ubuf, *rbuf; struct urbp *urbp = urb->hcpriv; int to_host; struct sg_mapping_iter *miter = &urbp->miter; u32 trans = 0; u32 this_sg; bool next_sg; to_host = usb_urb_dir_in(urb); rbuf = req->req.buf + req->req.actual; if (!urb->num_sgs) { ubuf = urb->transfer_buffer + urb->actual_length; if (to_host) memcpy(ubuf, rbuf, len); else memcpy(rbuf, ubuf, len); return len; } if (!urbp->miter_started) { u32 flags = SG_MITER_ATOMIC; if (to_host) flags |= SG_MITER_TO_SG; else flags |= SG_MITER_FROM_SG; sg_miter_start(miter, urb->sg, urb->num_sgs, flags); urbp->miter_started = 1; } next_sg = sg_miter_next(miter); if (next_sg == false) { WARN_ON_ONCE(1); return -EINVAL; } do { ubuf = miter->addr; this_sg = min_t(u32, len, miter->length); miter->consumed = this_sg; trans += this_sg; if (to_host) memcpy(ubuf, rbuf, this_sg); else memcpy(rbuf, ubuf, this_sg); len -= this_sg; if (!len) break; next_sg = sg_miter_next(miter); if (next_sg == false) { WARN_ON_ONCE(1); return -EINVAL; } rbuf += this_sg; } while (1); sg_miter_stop(miter); return trans; } /* transfer up to a frame's worth; caller must own lock */ static int transfer(struct dummy_hcd *dum_hcd, struct urb *urb, struct dummy_ep *ep, int limit, int *status) { struct dummy *dum = dum_hcd->dum; struct dummy_request *req; int sent = 0; top: /* if there's no request queued, the device is NAKing; return */ list_for_each_entry(req, &ep->queue, queue) { unsigned host_len, dev_len, len; int is_short, to_host; int rescan = 0; if (dummy_ep_stream_en(dum_hcd, urb)) { if ((urb->stream_id != req->req.stream_id)) continue; } /* 1..N packets of ep->ep.maxpacket each ... the last one * may be short (including zero length). * * writer can send a zlp explicitly (length 0) or implicitly * (length mod maxpacket zero, and 'zero' flag); they always * terminate reads. */ host_len = urb->transfer_buffer_length - urb->actual_length; dev_len = req->req.length - req->req.actual; len = min(host_len, dev_len); /* FIXME update emulated data toggle too */ to_host = usb_urb_dir_in(urb); if (unlikely(len == 0)) is_short = 1; else { /* not enough bandwidth left? 
*/ if (limit < ep->ep.maxpacket && limit < len) break; len = min_t(unsigned, len, limit); if (len == 0) break; /* send multiple of maxpacket first, then remainder */ if (len >= ep->ep.maxpacket) { is_short = 0; if (len % ep->ep.maxpacket) rescan = 1; len -= len % ep->ep.maxpacket; } else { is_short = 1; } len = dummy_perform_transfer(urb, req, len); ep->last_io = jiffies; if ((int)len < 0) { req->req.status = len; } else { limit -= len; sent += len; urb->actual_length += len; req->req.actual += len; } } /* short packets terminate, maybe with overflow/underflow. * it's only really an error to write too much. * * partially filling a buffer optionally blocks queue advances * (so completion handlers can clean up the queue) but we don't * need to emulate such data-in-flight. */ if (is_short) { if (host_len == dev_len) { req->req.status = 0; *status = 0; } else if (to_host) { req->req.status = 0; if (dev_len > host_len) *status = -EOVERFLOW; else *status = 0; } else { *status = 0; if (host_len > dev_len) req->req.status = -EOVERFLOW; else req->req.status = 0; } /* * many requests terminate without a short packet. * send a zlp if demanded by flags. */ } else { if (req->req.length == req->req.actual) { if (req->req.zero && to_host) rescan = 1; else req->req.status = 0; } if (urb->transfer_buffer_length == urb->actual_length) { if (urb->transfer_flags & URB_ZERO_PACKET && !to_host) rescan = 1; else *status = 0; } } /* device side completion --> continuable */ if (req->req.status != -EINPROGRESS) { list_del_init(&req->queue); spin_unlock(&dum->lock); usb_gadget_giveback_request(&ep->ep, &req->req); spin_lock(&dum->lock); /* requests might have been unlinked... */ rescan = 1; } /* host side completion --> terminate */ if (*status != -EINPROGRESS) break; /* rescan to continue with any other queued i/o */ if (rescan) goto top; /* request not fully transferred; stop iterating to * preserve data ordering across queued requests. */ if (req->req.actual < req->req.length) break; } return sent; } static int periodic_bytes(struct dummy *dum, struct dummy_ep *ep) { int limit = ep->ep.maxpacket; if (dum->gadget.speed == USB_SPEED_HIGH) { int tmp; /* high bandwidth mode */ tmp = usb_endpoint_maxp_mult(ep->desc); tmp *= 8 /* applies to entire frame */; limit += limit * tmp; } if (dum->gadget.speed == USB_SPEED_SUPER) { switch (usb_endpoint_type(ep->desc)) { case USB_ENDPOINT_XFER_ISOC: /* Sec. 4.4.8.2 USB3.0 Spec */ limit = 3 * 16 * 1024 * 8; break; case USB_ENDPOINT_XFER_INT: /* Sec. 4.4.7.2 USB3.0 Spec */ limit = 3 * 1024 * 8; break; case USB_ENDPOINT_XFER_BULK: default: break; } } return limit; } #define is_active(dum_hcd) ((dum_hcd->port_status & \ (USB_PORT_STAT_CONNECTION | USB_PORT_STAT_ENABLE | \ USB_PORT_STAT_SUSPEND)) \ == (USB_PORT_STAT_CONNECTION | USB_PORT_STAT_ENABLE)) static struct dummy_ep *find_endpoint(struct dummy *dum, u8 address) { int i; if (!is_active((dum->gadget.speed == USB_SPEED_SUPER ? 
dum->ss_hcd : dum->hs_hcd))) return NULL; if (!dum->ints_enabled) return NULL; if ((address & ~USB_DIR_IN) == 0) return &dum->ep[0]; for (i = 1; i < DUMMY_ENDPOINTS; i++) { struct dummy_ep *ep = &dum->ep[i]; if (!ep->desc) continue; if (ep->desc->bEndpointAddress == address) return ep; } return NULL; } #undef is_active #define Dev_Request (USB_TYPE_STANDARD | USB_RECIP_DEVICE) #define Dev_InRequest (Dev_Request | USB_DIR_IN) #define Intf_Request (USB_TYPE_STANDARD | USB_RECIP_INTERFACE) #define Intf_InRequest (Intf_Request | USB_DIR_IN) #define Ep_Request (USB_TYPE_STANDARD | USB_RECIP_ENDPOINT) #define Ep_InRequest (Ep_Request | USB_DIR_IN) /** * handle_control_request() - handles all control transfers * @dum_hcd: pointer to dummy (the_controller) * @urb: the urb request to handle * @setup: pointer to the setup data for a USB device control * request * @status: pointer to request handling status * * Return 0 - if the request was handled * 1 - if the request wasn't handles * error code on error */ static int handle_control_request(struct dummy_hcd *dum_hcd, struct urb *urb, struct usb_ctrlrequest *setup, int *status) { struct dummy_ep *ep2; struct dummy *dum = dum_hcd->dum; int ret_val = 1; unsigned w_index; unsigned w_value; w_index = le16_to_cpu(setup->wIndex); w_value = le16_to_cpu(setup->wValue); switch (setup->bRequest) { case USB_REQ_SET_ADDRESS: if (setup->bRequestType != Dev_Request) break; dum->address = w_value; *status = 0; dev_dbg(udc_dev(dum), "set_address = %d\n", w_value); ret_val = 0; break; case USB_REQ_SET_FEATURE: if (setup->bRequestType == Dev_Request) { ret_val = 0; switch (w_value) { case USB_DEVICE_REMOTE_WAKEUP: break; case USB_DEVICE_B_HNP_ENABLE: dum->gadget.b_hnp_enable = 1; break; case USB_DEVICE_A_HNP_SUPPORT: dum->gadget.a_hnp_support = 1; break; case USB_DEVICE_A_ALT_HNP_SUPPORT: dum->gadget.a_alt_hnp_support = 1; break; case USB_DEVICE_U1_ENABLE: if (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3) w_value = USB_DEV_STAT_U1_ENABLED; else ret_val = -EOPNOTSUPP; break; case USB_DEVICE_U2_ENABLE: if (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3) w_value = USB_DEV_STAT_U2_ENABLED; else ret_val = -EOPNOTSUPP; break; case USB_DEVICE_LTM_ENABLE: if (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3) w_value = USB_DEV_STAT_LTM_ENABLED; else ret_val = -EOPNOTSUPP; break; default: ret_val = -EOPNOTSUPP; } if (ret_val == 0) { dum->devstatus |= (1 << w_value); *status = 0; } } else if (setup->bRequestType == Ep_Request) { /* endpoint halt */ ep2 = find_endpoint(dum, w_index); if (!ep2 || ep2->ep.name == ep0name) { ret_val = -EOPNOTSUPP; break; } ep2->halted = 1; ret_val = 0; *status = 0; } break; case USB_REQ_CLEAR_FEATURE: if (setup->bRequestType == Dev_Request) { ret_val = 0; switch (w_value) { case USB_DEVICE_REMOTE_WAKEUP: w_value = USB_DEVICE_REMOTE_WAKEUP; break; case USB_DEVICE_U1_ENABLE: if (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3) w_value = USB_DEV_STAT_U1_ENABLED; else ret_val = -EOPNOTSUPP; break; case USB_DEVICE_U2_ENABLE: if (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3) w_value = USB_DEV_STAT_U2_ENABLED; else ret_val = -EOPNOTSUPP; break; case USB_DEVICE_LTM_ENABLE: if (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3) w_value = USB_DEV_STAT_LTM_ENABLED; else ret_val = -EOPNOTSUPP; break; default: ret_val = -EOPNOTSUPP; break; } if (ret_val == 0) { dum->devstatus &= ~(1 << w_value); *status = 0; } } else if (setup->bRequestType == Ep_Request) { /* endpoint halt */ ep2 = find_endpoint(dum, w_index); if (!ep2) { ret_val = -EOPNOTSUPP; break; } if (!ep2->wedged) 
ep2->halted = 0; ret_val = 0; *status = 0; } break; case USB_REQ_GET_STATUS: if (setup->bRequestType == Dev_InRequest || setup->bRequestType == Intf_InRequest || setup->bRequestType == Ep_InRequest) { char *buf; /* * device: remote wakeup, selfpowered * interface: nothing * endpoint: halt */ buf = (char *)urb->transfer_buffer; if (urb->transfer_buffer_length > 0) { if (setup->bRequestType == Ep_InRequest) { ep2 = find_endpoint(dum, w_index); if (!ep2) { ret_val = -EOPNOTSUPP; break; } buf[0] = ep2->halted; } else if (setup->bRequestType == Dev_InRequest) { buf[0] = (u8)dum->devstatus; } else buf[0] = 0; } if (urb->transfer_buffer_length > 1) buf[1] = 0; urb->actual_length = min_t(u32, 2, urb->transfer_buffer_length); ret_val = 0; *status = 0; } break; } return ret_val; } /* * Drive both sides of the transfers; looks like irq handlers to both * drivers except that the callbacks are invoked from soft interrupt * context. */ static enum hrtimer_restart dummy_timer(struct hrtimer *t) { struct dummy_hcd *dum_hcd = timer_container_of(dum_hcd, t, timer); struct dummy *dum = dum_hcd->dum; struct urbp *urbp, *tmp; unsigned long flags; int limit, total; int i; /* simplistic model for one frame's bandwidth */ /* FIXME: account for transaction and packet overhead */ switch (dum->gadget.speed) { case USB_SPEED_LOW: total = 8/*bytes*/ * 12/*packets*/; break; case USB_SPEED_FULL: total = 64/*bytes*/ * 19/*packets*/; break; case USB_SPEED_HIGH: total = 512/*bytes*/ * 13/*packets*/ * 8/*uframes*/; break; case USB_SPEED_SUPER: /* Bus speed is 500000 bytes/ms, so use a little less */ total = 490000; break; default: /* Can't happen */ dev_err(dummy_dev(dum_hcd), "bogus device speed\n"); total = 0; break; } /* look at each urb queued by the host side driver */ spin_lock_irqsave(&dum->lock, flags); dum_hcd->timer_pending = 0; if (!dum_hcd->udev) { dev_err(dummy_dev(dum_hcd), "timer fired with no URBs pending?\n"); spin_unlock_irqrestore(&dum->lock, flags); return HRTIMER_NORESTART; } dum_hcd->next_frame_urbp = NULL; for (i = 0; i < DUMMY_ENDPOINTS; i++) { if (!ep_info[i].name) break; dum->ep[i].already_seen = 0; } restart: list_for_each_entry_safe(urbp, tmp, &dum_hcd->urbp_list, urbp_list) { struct urb *urb; struct dummy_request *req; u8 address; struct dummy_ep *ep = NULL; int status = -EINPROGRESS; /* stop when we reach URBs queued after the timer interrupt */ if (urbp == dum_hcd->next_frame_urbp) break; urb = urbp->urb; if (urb->unlinked) goto return_urb; else if (dum_hcd->rh_state != DUMMY_RH_RUNNING) continue; /* Used up this frame's bandwidth? */ if (total <= 0) continue; /* find the gadget's ep for this request (if configured) */ address = usb_pipeendpoint (urb->pipe); if (usb_urb_dir_in(urb)) address |= USB_DIR_IN; ep = find_endpoint(dum, address); if (!ep) { /* set_configuration() disagreement */ dev_dbg(dummy_dev(dum_hcd), "no ep configured for urb %p\n", urb); status = -EPROTO; goto return_urb; } if (ep->already_seen) continue; ep->already_seen = 1; if (ep == &dum->ep[0] && urb->error_count) { ep->setup_stage = 1; /* a new urb */ urb->error_count = 0; } if (ep->halted && !ep->setup_stage) { /* NOTE: must not be iso! 
*/ dev_dbg(dummy_dev(dum_hcd), "ep %s halted, urb %p\n", ep->ep.name, urb); status = -EPIPE; goto return_urb; } /* FIXME make sure both ends agree on maxpacket */ /* handle control requests */ if (ep == &dum->ep[0] && ep->setup_stage) { struct usb_ctrlrequest setup; int value; setup = *(struct usb_ctrlrequest *) urb->setup_packet; /* paranoia, in case of stale queued data */ list_for_each_entry(req, &ep->queue, queue) { list_del_init(&req->queue); req->req.status = -EOVERFLOW; dev_dbg(udc_dev(dum), "stale req = %p\n", req); spin_unlock(&dum->lock); usb_gadget_giveback_request(&ep->ep, &req->req); spin_lock(&dum->lock); ep->already_seen = 0; goto restart; } /* gadget driver never sees set_address or operations * on standard feature flags. some hardware doesn't * even expose them. */ ep->last_io = jiffies; ep->setup_stage = 0; ep->halted = 0; value = handle_control_request(dum_hcd, urb, &setup, &status); /* gadget driver handles all other requests. block * until setup() returns; no reentrancy issues etc. */ if (value > 0) { ++dum->callback_usage; spin_unlock(&dum->lock); value = dum->driver->setup(&dum->gadget, &setup); spin_lock(&dum->lock); --dum->callback_usage; if (value >= 0) { /* no delays (max 64KB data stage) */ limit = 64*1024; goto treat_control_like_bulk; } /* error, see below */ } if (value < 0) { if (value != -EOPNOTSUPP) dev_dbg(udc_dev(dum), "setup --> %d\n", value); status = -EPIPE; urb->actual_length = 0; } goto return_urb; } /* non-control requests */ limit = total; switch (usb_pipetype(urb->pipe)) { case PIPE_ISOCHRONOUS: /* * We don't support isochronous. But if we did, * here are some of the issues we'd have to face: * * Is it urb->interval since the last xfer? * Use urb->iso_frame_desc[i]. * Complete whether or not ep has requests queued. * Report random errors, to debug drivers. */ limit = max(limit, periodic_bytes(dum, ep)); status = -EINVAL; /* fail all xfers */ break; case PIPE_INTERRUPT: /* FIXME is it urb->interval since the last xfer? * this almost certainly polls too fast. */ limit = max(limit, periodic_bytes(dum, ep)); fallthrough; default: treat_control_like_bulk: ep->last_io = jiffies; total -= transfer(dum_hcd, urb, ep, limit, &status); break; } /* incomplete transfer? 
*/ if (status == -EINPROGRESS) continue; return_urb: list_del(&urbp->urbp_list); kfree(urbp); if (ep) ep->already_seen = ep->setup_stage = 0; usb_hcd_unlink_urb_from_ep(dummy_hcd_to_hcd(dum_hcd), urb); spin_unlock(&dum->lock); usb_hcd_giveback_urb(dummy_hcd_to_hcd(dum_hcd), urb, status); spin_lock(&dum->lock); goto restart; } if (list_empty(&dum_hcd->urbp_list)) { usb_put_dev(dum_hcd->udev); dum_hcd->udev = NULL; } else if (!dum_hcd->timer_pending && dum_hcd->rh_state == DUMMY_RH_RUNNING) { /* want a 1 msec delay here */ dum_hcd->timer_pending = 1; hrtimer_start(&dum_hcd->timer, ns_to_ktime(DUMMY_TIMER_INT_NSECS), HRTIMER_MODE_REL_SOFT); } spin_unlock_irqrestore(&dum->lock, flags); return HRTIMER_NORESTART; } /*-------------------------------------------------------------------------*/ #define PORT_C_MASK \ ((USB_PORT_STAT_C_CONNECTION \ | USB_PORT_STAT_C_ENABLE \ | USB_PORT_STAT_C_SUSPEND \ | USB_PORT_STAT_C_OVERCURRENT \ | USB_PORT_STAT_C_RESET) << 16) static int dummy_hub_status(struct usb_hcd *hcd, char *buf) { struct dummy_hcd *dum_hcd; unsigned long flags; int retval = 0; dum_hcd = hcd_to_dummy_hcd(hcd); spin_lock_irqsave(&dum_hcd->dum->lock, flags); if (!HCD_HW_ACCESSIBLE(hcd)) goto done; if (dum_hcd->resuming && time_after_eq(jiffies, dum_hcd->re_timeout)) { dum_hcd->port_status |= (USB_PORT_STAT_C_SUSPEND << 16); dum_hcd->port_status &= ~USB_PORT_STAT_SUSPEND; set_link_state(dum_hcd); } if ((dum_hcd->port_status & PORT_C_MASK) != 0) { *buf = (1 << 1); dev_dbg(dummy_dev(dum_hcd), "port status 0x%08x has changes\n", dum_hcd->port_status); retval = 1; if (dum_hcd->rh_state == DUMMY_RH_SUSPENDED) usb_hcd_resume_root_hub(hcd); } done: spin_unlock_irqrestore(&dum_hcd->dum->lock, flags); return retval; } /* usb 3.0 root hub device descriptor */ static struct { struct usb_bos_descriptor bos; struct usb_ss_cap_descriptor ss_cap; } __packed usb3_bos_desc = { .bos = { .bLength = USB_DT_BOS_SIZE, .bDescriptorType = USB_DT_BOS, .wTotalLength = cpu_to_le16(sizeof(usb3_bos_desc)), .bNumDeviceCaps = 1, }, .ss_cap = { .bLength = USB_DT_USB_SS_CAP_SIZE, .bDescriptorType = USB_DT_DEVICE_CAPABILITY, .bDevCapabilityType = USB_SS_CAP_TYPE, .wSpeedSupported = cpu_to_le16(USB_5GBPS_OPERATION), .bFunctionalitySupport = ilog2(USB_5GBPS_OPERATION), }, }; static inline void ss_hub_descriptor(struct usb_hub_descriptor *desc) { memset(desc, 0, sizeof *desc); desc->bDescriptorType = USB_DT_SS_HUB; desc->bDescLength = 12; desc->wHubCharacteristics = cpu_to_le16( HUB_CHAR_INDV_PORT_LPSM | HUB_CHAR_COMMON_OCPM); desc->bNbrPorts = 1; desc->u.ss.bHubHdrDecLat = 0x04; /* Worst case: 0.4 micro sec*/ desc->u.ss.DeviceRemovable = 0; } static inline void hub_descriptor(struct usb_hub_descriptor *desc) { memset(desc, 0, sizeof *desc); desc->bDescriptorType = USB_DT_HUB; desc->bDescLength = 9; desc->wHubCharacteristics = cpu_to_le16( HUB_CHAR_INDV_PORT_LPSM | HUB_CHAR_COMMON_OCPM); desc->bNbrPorts = 1; desc->u.hs.DeviceRemovable[0] = 0; desc->u.hs.DeviceRemovable[1] = 0xff; /* PortPwrCtrlMask */ } static int dummy_hub_control( struct usb_hcd *hcd, u16 typeReq, u16 wValue, u16 wIndex, char *buf, u16 wLength ) { struct dummy_hcd *dum_hcd; int retval = 0; unsigned long flags; if (!HCD_HW_ACCESSIBLE(hcd)) return -ETIMEDOUT; dum_hcd = hcd_to_dummy_hcd(hcd); spin_lock_irqsave(&dum_hcd->dum->lock, flags); switch (typeReq) { case ClearHubFeature: break; case ClearPortFeature: switch (wValue) { case USB_PORT_FEAT_SUSPEND: if (hcd->speed == HCD_USB3) { dev_dbg(dummy_dev(dum_hcd), "USB_PORT_FEAT_SUSPEND req not " "supported for USB 
3.0 roothub\n"); goto error; } if (dum_hcd->port_status & USB_PORT_STAT_SUSPEND) { /* 20msec resume signaling */ dum_hcd->resuming = 1; dum_hcd->re_timeout = jiffies + msecs_to_jiffies(20); } break; case USB_PORT_FEAT_POWER: dev_dbg(dummy_dev(dum_hcd), "power-off\n"); if (hcd->speed == HCD_USB3) dum_hcd->port_status &= ~USB_SS_PORT_STAT_POWER; else dum_hcd->port_status &= ~USB_PORT_STAT_POWER; set_link_state(dum_hcd); break; case USB_PORT_FEAT_ENABLE: case USB_PORT_FEAT_C_ENABLE: case USB_PORT_FEAT_C_SUSPEND: /* Not allowed for USB-3 */ if (hcd->speed == HCD_USB3) goto error; fallthrough; case USB_PORT_FEAT_C_CONNECTION: case USB_PORT_FEAT_C_RESET: dum_hcd->port_status &= ~(1 << wValue); set_link_state(dum_hcd); break; default: /* Disallow INDICATOR and C_OVER_CURRENT */ goto error; } break; case GetHubDescriptor: if (hcd->speed == HCD_USB3 && (wLength < USB_DT_SS_HUB_SIZE || wValue != (USB_DT_SS_HUB << 8))) { dev_dbg(dummy_dev(dum_hcd), "Wrong hub descriptor type for " "USB 3.0 roothub.\n"); goto error; } if (hcd->speed == HCD_USB3) ss_hub_descriptor((struct usb_hub_descriptor *) buf); else hub_descriptor((struct usb_hub_descriptor *) buf); break; case DeviceRequest | USB_REQ_GET_DESCRIPTOR: if (hcd->speed != HCD_USB3) goto error; if ((wValue >> 8) != USB_DT_BOS) goto error; memcpy(buf, &usb3_bos_desc, sizeof(usb3_bos_desc)); retval = sizeof(usb3_bos_desc); break; case GetHubStatus: *(__le32 *) buf = cpu_to_le32(0); break; case GetPortStatus: if (wIndex != 1) retval = -EPIPE; /* whoever resets or resumes must GetPortStatus to * complete it!! */ if (dum_hcd->resuming && time_after_eq(jiffies, dum_hcd->re_timeout)) { dum_hcd->port_status |= (USB_PORT_STAT_C_SUSPEND << 16); dum_hcd->port_status &= ~USB_PORT_STAT_SUSPEND; } if ((dum_hcd->port_status & USB_PORT_STAT_RESET) != 0 && time_after_eq(jiffies, dum_hcd->re_timeout)) { dum_hcd->port_status |= (USB_PORT_STAT_C_RESET << 16); dum_hcd->port_status &= ~USB_PORT_STAT_RESET; if (dum_hcd->dum->pullup) { dum_hcd->port_status |= USB_PORT_STAT_ENABLE; if (hcd->speed < HCD_USB3) { switch (dum_hcd->dum->gadget.speed) { case USB_SPEED_HIGH: dum_hcd->port_status |= USB_PORT_STAT_HIGH_SPEED; break; case USB_SPEED_LOW: dum_hcd->dum->gadget.ep0-> maxpacket = 8; dum_hcd->port_status |= USB_PORT_STAT_LOW_SPEED; break; default: break; } } } } set_link_state(dum_hcd); ((__le16 *) buf)[0] = cpu_to_le16(dum_hcd->port_status); ((__le16 *) buf)[1] = cpu_to_le16(dum_hcd->port_status >> 16); break; case SetHubFeature: retval = -EPIPE; break; case SetPortFeature: switch (wValue) { case USB_PORT_FEAT_LINK_STATE: if (hcd->speed != HCD_USB3) { dev_dbg(dummy_dev(dum_hcd), "USB_PORT_FEAT_LINK_STATE req not " "supported for USB 2.0 roothub\n"); goto error; } /* * Since this is dummy we don't have an actual link so * there is nothing to do for the SET_LINK_STATE cmd */ break; case USB_PORT_FEAT_U1_TIMEOUT: case USB_PORT_FEAT_U2_TIMEOUT: /* TODO: add suspend/resume support! */ if (hcd->speed != HCD_USB3) { dev_dbg(dummy_dev(dum_hcd), "USB_PORT_FEAT_U1/2_TIMEOUT req not " "supported for USB 2.0 roothub\n"); goto error; } break; case USB_PORT_FEAT_SUSPEND: /* Applicable only for USB2.0 hub */ if (hcd->speed == HCD_USB3) { dev_dbg(dummy_dev(dum_hcd), "USB_PORT_FEAT_SUSPEND req not " "supported for USB 3.0 roothub\n"); goto error; } if (dum_hcd->active) { dum_hcd->port_status |= USB_PORT_STAT_SUSPEND; /* HNP would happen here; for now we * assume b_bus_req is always true. 
*/ set_link_state(dum_hcd); if (((1 << USB_DEVICE_B_HNP_ENABLE) & dum_hcd->dum->devstatus) != 0) dev_dbg(dummy_dev(dum_hcd), "no HNP yet!\n"); } break; case USB_PORT_FEAT_POWER: if (hcd->speed == HCD_USB3) dum_hcd->port_status |= USB_SS_PORT_STAT_POWER; else dum_hcd->port_status |= USB_PORT_STAT_POWER; set_link_state(dum_hcd); break; case USB_PORT_FEAT_BH_PORT_RESET: /* Applicable only for USB3.0 hub */ if (hcd->speed != HCD_USB3) { dev_dbg(dummy_dev(dum_hcd), "USB_PORT_FEAT_BH_PORT_RESET req not " "supported for USB 2.0 roothub\n"); goto error; } fallthrough; case USB_PORT_FEAT_RESET: if (!(dum_hcd->port_status & USB_PORT_STAT_CONNECTION)) break; /* if it's already enabled, disable */ if (hcd->speed == HCD_USB3) { dum_hcd->port_status = (USB_SS_PORT_STAT_POWER | USB_PORT_STAT_CONNECTION | USB_PORT_STAT_RESET); } else { dum_hcd->port_status &= ~(USB_PORT_STAT_ENABLE | USB_PORT_STAT_LOW_SPEED | USB_PORT_STAT_HIGH_SPEED); dum_hcd->port_status |= USB_PORT_STAT_RESET; } /* * We want to reset device status. All but the * Self powered feature */ dum_hcd->dum->devstatus &= (1 << USB_DEVICE_SELF_POWERED); /* * FIXME USB3.0: what is the correct reset signaling * interval? Is it still 50msec as for HS? */ dum_hcd->re_timeout = jiffies + msecs_to_jiffies(50); set_link_state(dum_hcd); break; case USB_PORT_FEAT_C_CONNECTION: case USB_PORT_FEAT_C_RESET: case USB_PORT_FEAT_C_ENABLE: case USB_PORT_FEAT_C_SUSPEND: /* Not allowed for USB-3, and ignored for USB-2 */ if (hcd->speed == HCD_USB3) goto error; break; default: /* Disallow TEST, INDICATOR, and C_OVER_CURRENT */ goto error; } break; case GetPortErrorCount: if (hcd->speed != HCD_USB3) { dev_dbg(dummy_dev(dum_hcd), "GetPortErrorCount req not " "supported for USB 2.0 roothub\n"); goto error; } /* We'll always return 0 since this is a dummy hub */ *(__le32 *) buf = cpu_to_le32(0); break; case SetHubDepth: if (hcd->speed != HCD_USB3) { dev_dbg(dummy_dev(dum_hcd), "SetHubDepth req not supported for " "USB 2.0 roothub\n"); goto error; } break; default: dev_dbg(dummy_dev(dum_hcd), "hub control req%04x v%04x i%04x l%d\n", typeReq, wValue, wIndex, wLength); error: /* "protocol stall" on error */ retval = -EPIPE; } spin_unlock_irqrestore(&dum_hcd->dum->lock, flags); if ((dum_hcd->port_status & PORT_C_MASK) != 0) usb_hcd_poll_rh_status(hcd); return retval; } static int dummy_bus_suspend(struct usb_hcd *hcd) { struct dummy_hcd *dum_hcd = hcd_to_dummy_hcd(hcd); dev_dbg(&hcd->self.root_hub->dev, "%s\n", __func__); spin_lock_irq(&dum_hcd->dum->lock); dum_hcd->rh_state = DUMMY_RH_SUSPENDED; set_link_state(dum_hcd); hcd->state = HC_STATE_SUSPENDED; spin_unlock_irq(&dum_hcd->dum->lock); return 0; } static int dummy_bus_resume(struct usb_hcd *hcd) { struct dummy_hcd *dum_hcd = hcd_to_dummy_hcd(hcd); int rc = 0; dev_dbg(&hcd->self.root_hub->dev, "%s\n", __func__); spin_lock_irq(&dum_hcd->dum->lock); if (!HCD_HW_ACCESSIBLE(hcd)) { rc = -ESHUTDOWN; } else { dum_hcd->rh_state = DUMMY_RH_RUNNING; set_link_state(dum_hcd); if (!list_empty(&dum_hcd->urbp_list)) { dum_hcd->timer_pending = 1; hrtimer_start(&dum_hcd->timer, ns_to_ktime(0), HRTIMER_MODE_REL_SOFT); } hcd->state = HC_STATE_RUNNING; } spin_unlock_irq(&dum_hcd->dum->lock); return rc; } /*-------------------------------------------------------------------------*/ static inline ssize_t show_urb(char *buf, size_t size, struct urb *urb) { int ep = usb_pipeendpoint(urb->pipe); return scnprintf(buf, size, "urb/%p %s ep%d%s%s len %d/%d\n", urb, ({ char *s; switch (urb->dev->speed) { case USB_SPEED_LOW: s = "ls"; break; case 
USB_SPEED_FULL: s = "fs"; break; case USB_SPEED_HIGH: s = "hs"; break; case USB_SPEED_SUPER: s = "ss"; break; default: s = "?"; break; } s; }), ep, ep ? (usb_urb_dir_in(urb) ? "in" : "out") : "", ({ char *s; \ switch (usb_pipetype(urb->pipe)) { \ case PIPE_CONTROL: \ s = ""; \ break; \ case PIPE_BULK: \ s = "-bulk"; \ break; \ case PIPE_INTERRUPT: \ s = "-int"; \ break; \ default: \ s = "-iso"; \ break; \ } s; }), urb->actual_length, urb->transfer_buffer_length); } static ssize_t urbs_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_hcd *hcd = dev_get_drvdata(dev); struct dummy_hcd *dum_hcd = hcd_to_dummy_hcd(hcd); struct urbp *urbp; size_t size = 0; unsigned long flags; spin_lock_irqsave(&dum_hcd->dum->lock, flags); list_for_each_entry(urbp, &dum_hcd->urbp_list, urbp_list) { size_t temp; temp = show_urb(buf, PAGE_SIZE - size, urbp->urb); buf += temp; size += temp; } spin_unlock_irqrestore(&dum_hcd->dum->lock, flags); return size; } static DEVICE_ATTR_RO(urbs); static int dummy_start_ss(struct dummy_hcd *dum_hcd) { hrtimer_setup(&dum_hcd->timer, dummy_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); dum_hcd->rh_state = DUMMY_RH_RUNNING; dum_hcd->stream_en_ep = 0; INIT_LIST_HEAD(&dum_hcd->urbp_list); dummy_hcd_to_hcd(dum_hcd)->power_budget = POWER_BUDGET_3; dummy_hcd_to_hcd(dum_hcd)->state = HC_STATE_RUNNING; dummy_hcd_to_hcd(dum_hcd)->uses_new_polling = 1; #ifdef CONFIG_USB_OTG dummy_hcd_to_hcd(dum_hcd)->self.otg_port = 1; #endif return 0; /* FIXME 'urbs' should be a per-device thing, maybe in usbcore */ return device_create_file(dummy_dev(dum_hcd), &dev_attr_urbs); } static int dummy_start(struct usb_hcd *hcd) { struct dummy_hcd *dum_hcd = hcd_to_dummy_hcd(hcd); /* * HOST side init ... we emulate a root hub that'll only ever * talk to one device (the gadget side). Also appears in sysfs, * just like more familiar pci-based HCDs. */ if (!usb_hcd_is_primary_hcd(hcd)) return dummy_start_ss(dum_hcd); spin_lock_init(&dum_hcd->dum->lock); hrtimer_setup(&dum_hcd->timer, dummy_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); dum_hcd->rh_state = DUMMY_RH_RUNNING; INIT_LIST_HEAD(&dum_hcd->urbp_list); hcd->power_budget = POWER_BUDGET; hcd->state = HC_STATE_RUNNING; hcd->uses_new_polling = 1; #ifdef CONFIG_USB_OTG hcd->self.otg_port = 1; #endif /* FIXME 'urbs' should be a per-device thing, maybe in usbcore */ return device_create_file(dummy_dev(dum_hcd), &dev_attr_urbs); } static void dummy_stop(struct usb_hcd *hcd) { struct dummy_hcd *dum_hcd = hcd_to_dummy_hcd(hcd); hrtimer_cancel(&dum_hcd->timer); dum_hcd->timer_pending = 0; device_remove_file(dummy_dev(dum_hcd), &dev_attr_urbs); dev_info(dummy_dev(dum_hcd), "stopped\n"); } /*-------------------------------------------------------------------------*/ static int dummy_h_get_frame(struct usb_hcd *hcd) { return dummy_g_get_frame(NULL); } static int dummy_setup(struct usb_hcd *hcd) { struct dummy *dum; dum = *((void **)dev_get_platdata(hcd->self.controller)); hcd->self.sg_tablesize = ~0; if (usb_hcd_is_primary_hcd(hcd)) { dum->hs_hcd = hcd_to_dummy_hcd(hcd); dum->hs_hcd->dum = dum; /* * Mark the first roothub as being USB 2.0. 
* The USB 3.0 roothub will be registered later by * dummy_hcd_probe() */ hcd->speed = HCD_USB2; hcd->self.root_hub->speed = USB_SPEED_HIGH; } else { dum->ss_hcd = hcd_to_dummy_hcd(hcd); dum->ss_hcd->dum = dum; hcd->speed = HCD_USB3; hcd->self.root_hub->speed = USB_SPEED_SUPER; } return 0; } /* Change a group of bulk endpoints to support multiple stream IDs */ static int dummy_alloc_streams(struct usb_hcd *hcd, struct usb_device *udev, struct usb_host_endpoint **eps, unsigned int num_eps, unsigned int num_streams, gfp_t mem_flags) { struct dummy_hcd *dum_hcd = hcd_to_dummy_hcd(hcd); unsigned long flags; int max_stream; int ret_streams = num_streams; unsigned int index; unsigned int i; if (!num_eps) return -EINVAL; spin_lock_irqsave(&dum_hcd->dum->lock, flags); for (i = 0; i < num_eps; i++) { index = dummy_get_ep_idx(&eps[i]->desc); if ((1 << index) & dum_hcd->stream_en_ep) { ret_streams = -EINVAL; goto out; } max_stream = usb_ss_max_streams(&eps[i]->ss_ep_comp); if (!max_stream) { ret_streams = -EINVAL; goto out; } if (max_stream < ret_streams) { dev_dbg(dummy_dev(dum_hcd), "Ep 0x%x only supports %u " "stream IDs.\n", eps[i]->desc.bEndpointAddress, max_stream); ret_streams = max_stream; } } for (i = 0; i < num_eps; i++) { index = dummy_get_ep_idx(&eps[i]->desc); dum_hcd->stream_en_ep |= 1 << index; set_max_streams_for_pipe(dum_hcd, usb_endpoint_num(&eps[i]->desc), ret_streams); } out: spin_unlock_irqrestore(&dum_hcd->dum->lock, flags); return ret_streams; } /* Reverts a group of bulk endpoints back to not using stream IDs. */ static int dummy_free_streams(struct usb_hcd *hcd, struct usb_device *udev, struct usb_host_endpoint **eps, unsigned int num_eps, gfp_t mem_flags) { struct dummy_hcd *dum_hcd = hcd_to_dummy_hcd(hcd); unsigned long flags; int ret; unsigned int index; unsigned int i; spin_lock_irqsave(&dum_hcd->dum->lock, flags); for (i = 0; i < num_eps; i++) { index = dummy_get_ep_idx(&eps[i]->desc); if (!((1 << index) & dum_hcd->stream_en_ep)) { ret = -EINVAL; goto out; } } for (i = 0; i < num_eps; i++) { index = dummy_get_ep_idx(&eps[i]->desc); dum_hcd->stream_en_ep &= ~(1 << index); set_max_streams_for_pipe(dum_hcd, usb_endpoint_num(&eps[i]->desc), 0); } ret = 0; out: spin_unlock_irqrestore(&dum_hcd->dum->lock, flags); return ret; } static struct hc_driver dummy_hcd = { .description = (char *) driver_name, .product_desc = "Dummy host controller", .hcd_priv_size = sizeof(struct dummy_hcd), .reset = dummy_setup, .start = dummy_start, .stop = dummy_stop, .urb_enqueue = dummy_urb_enqueue, .urb_dequeue = dummy_urb_dequeue, .get_frame_number = dummy_h_get_frame, .hub_status_data = dummy_hub_status, .hub_control = dummy_hub_control, .bus_suspend = dummy_bus_suspend, .bus_resume = dummy_bus_resume, .alloc_streams = dummy_alloc_streams, .free_streams = dummy_free_streams, }; static int dummy_hcd_probe(struct platform_device *pdev) { struct dummy *dum; struct usb_hcd *hs_hcd; struct usb_hcd *ss_hcd; int retval; dev_info(&pdev->dev, "%s, driver " DRIVER_VERSION "\n", driver_desc); dum = *((void **)dev_get_platdata(&pdev->dev)); if (mod_data.is_super_speed) dummy_hcd.flags = HCD_USB3 | HCD_SHARED; else if (mod_data.is_high_speed) dummy_hcd.flags = HCD_USB2; else dummy_hcd.flags = HCD_USB11; hs_hcd = usb_create_hcd(&dummy_hcd, &pdev->dev, dev_name(&pdev->dev)); if (!hs_hcd) return -ENOMEM; hs_hcd->has_tt = 1; retval = usb_add_hcd(hs_hcd, 0, 0); if (retval) goto put_usb2_hcd; if (mod_data.is_super_speed) { ss_hcd = usb_create_shared_hcd(&dummy_hcd, &pdev->dev, dev_name(&pdev->dev), hs_hcd); if 
(!ss_hcd) { retval = -ENOMEM; goto dealloc_usb2_hcd; } retval = usb_add_hcd(ss_hcd, 0, 0); if (retval) goto put_usb3_hcd; } return 0; put_usb3_hcd: usb_put_hcd(ss_hcd); dealloc_usb2_hcd: usb_remove_hcd(hs_hcd); put_usb2_hcd: usb_put_hcd(hs_hcd); dum->hs_hcd = dum->ss_hcd = NULL; return retval; } static void dummy_hcd_remove(struct platform_device *pdev) { struct dummy *dum; dum = hcd_to_dummy_hcd(platform_get_drvdata(pdev))->dum; if (dum->ss_hcd) { usb_remove_hcd(dummy_hcd_to_hcd(dum->ss_hcd)); usb_put_hcd(dummy_hcd_to_hcd(dum->ss_hcd)); } usb_remove_hcd(dummy_hcd_to_hcd(dum->hs_hcd)); usb_put_hcd(dummy_hcd_to_hcd(dum->hs_hcd)); dum->hs_hcd = NULL; dum->ss_hcd = NULL; } static int dummy_hcd_suspend(struct platform_device *pdev, pm_message_t state) { struct usb_hcd *hcd; struct dummy_hcd *dum_hcd; int rc = 0; dev_dbg(&pdev->dev, "%s\n", __func__); hcd = platform_get_drvdata(pdev); dum_hcd = hcd_to_dummy_hcd(hcd); if (dum_hcd->rh_state == DUMMY_RH_RUNNING) { dev_warn(&pdev->dev, "Root hub isn't suspended!\n"); rc = -EBUSY; } else clear_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags); return rc; } static int dummy_hcd_resume(struct platform_device *pdev) { struct usb_hcd *hcd; dev_dbg(&pdev->dev, "%s\n", __func__); hcd = platform_get_drvdata(pdev); set_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags); usb_hcd_poll_rh_status(hcd); return 0; } static struct platform_driver dummy_hcd_driver = { .probe = dummy_hcd_probe, .remove = dummy_hcd_remove, .suspend = dummy_hcd_suspend, .resume = dummy_hcd_resume, .driver = { .name = driver_name, }, }; /*-------------------------------------------------------------------------*/ #define MAX_NUM_UDC 32 static struct platform_device *the_udc_pdev[MAX_NUM_UDC]; static struct platform_device *the_hcd_pdev[MAX_NUM_UDC]; static int __init dummy_hcd_init(void) { int retval = -ENOMEM; int i; struct dummy *dum[MAX_NUM_UDC] = {}; if (usb_disabled()) return -ENODEV; if (!mod_data.is_high_speed && mod_data.is_super_speed) return -EINVAL; if (mod_data.num < 1 || mod_data.num > MAX_NUM_UDC) { pr_err("Number of emulated UDC must be in range of 1...%d\n", MAX_NUM_UDC); return -EINVAL; } for (i = 0; i < mod_data.num; i++) { the_hcd_pdev[i] = platform_device_alloc(driver_name, i); if (!the_hcd_pdev[i]) { i--; while (i >= 0) platform_device_put(the_hcd_pdev[i--]); return retval; } } for (i = 0; i < mod_data.num; i++) { the_udc_pdev[i] = platform_device_alloc(gadget_name, i); if (!the_udc_pdev[i]) { i--; while (i >= 0) platform_device_put(the_udc_pdev[i--]); goto err_alloc_udc; } } for (i = 0; i < mod_data.num; i++) { dum[i] = kzalloc_obj(struct dummy); if (!dum[i]) { retval = -ENOMEM; goto err_add_pdata; } retval = platform_device_add_data(the_hcd_pdev[i], &dum[i], sizeof(void *)); if (retval) goto err_add_pdata; retval = platform_device_add_data(the_udc_pdev[i], &dum[i], sizeof(void *)); if (retval) goto err_add_pdata; } retval = platform_driver_register(&dummy_hcd_driver); if (retval < 0) goto err_add_pdata; retval = platform_driver_register(&dummy_udc_driver); if (retval < 0) goto err_register_udc_driver; for (i = 0; i < mod_data.num; i++) { retval = platform_device_add(the_hcd_pdev[i]); if (retval < 0) { i--; while (i >= 0) platform_device_del(the_hcd_pdev[i--]); goto err_add_hcd; } } for (i = 0; i < mod_data.num; i++) { if (!dum[i]->hs_hcd || (!dum[i]->ss_hcd && mod_data.is_super_speed)) { /* * The hcd was added successfully but its probe * function failed for some reason. 
			 */
			retval = -EINVAL;
			goto err_add_udc;
		}
	}

	for (i = 0; i < mod_data.num; i++) {
		retval = platform_device_add(the_udc_pdev[i]);
		if (retval < 0) {
			i--;
			while (i >= 0)
				platform_device_del(the_udc_pdev[i--]);
			goto err_add_udc;
		}
	}

	for (i = 0; i < mod_data.num; i++) {
		if (!platform_get_drvdata(the_udc_pdev[i])) {
			/*
			 * The udc was added successfully but its probe
			 * function failed for some reason.
			 */
			retval = -EINVAL;
			goto err_probe_udc;
		}
	}
	return retval;

err_probe_udc:
	for (i = 0; i < mod_data.num; i++)
		platform_device_del(the_udc_pdev[i]);

err_add_udc:
	for (i = 0; i < mod_data.num; i++)
		platform_device_del(the_hcd_pdev[i]);

err_add_hcd:
	platform_driver_unregister(&dummy_udc_driver);

err_register_udc_driver:
	platform_driver_unregister(&dummy_hcd_driver);

err_add_pdata:
	for (i = 0; i < mod_data.num; i++)
		kfree(dum[i]);
	for (i = 0; i < mod_data.num; i++)
		platform_device_put(the_udc_pdev[i]);

err_alloc_udc:
	for (i = 0; i < mod_data.num; i++)
		platform_device_put(the_hcd_pdev[i]);

	return retval;
}
module_init(dummy_hcd_init);

static void __exit dummy_hcd_cleanup(void)
{
	int i;

	for (i = 0; i < mod_data.num; i++) {
		struct dummy *dum;

		dum = *((void **)dev_get_platdata(&the_udc_pdev[i]->dev));
		platform_device_unregister(the_udc_pdev[i]);
		platform_device_unregister(the_hcd_pdev[i]);
		kfree(dum);
	}
	platform_driver_unregister(&dummy_udc_driver);
	platform_driver_unregister(&dummy_hcd_driver);
}
module_exit(dummy_hcd_cleanup);
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/err.h>
#include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/sched/task_stack.h> #include <linux/security.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/sysctl.h> #include <linux/mman.h> #include <linux/hugetlb.h> #include <linux/vmalloc.h> #include <linux/userfaultfd_k.h> #include <linux/elf.h> #include <linux/elf-randomize.h> #include <linux/personality.h> #include <linux/random.h> #include <linux/processor.h> #include <linux/sizes.h> #include <linux/compat.h> #include <linux/fsnotify.h> #include <linux/page_idle.h> #include <linux/uaccess.h> #include <kunit/visibility.h> #include "internal.h" #include "swap.h" /** * kfree_const - conditionally free memory * @x: pointer to the memory * * Function calls kfree only if @x is not in .rodata section. */ void kfree_const(const void *x) { if (!is_kernel_rodata((unsigned long)x)) kfree(x); } EXPORT_SYMBOL(kfree_const); /** * __kmemdup_nul - Create a NUL-terminated string from @s, which might be unterminated. * @s: The data to copy * @len: The size of the data, not including the NUL terminator * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Return: newly allocated copy of @s with NUL-termination or %NULL in * case of error */ static __always_inline char *__kmemdup_nul(const char *s, size_t len, gfp_t gfp) { char *buf; /* '+1' for the NUL terminator */ buf = kmalloc_track_caller(len + 1, gfp); if (!buf) return NULL; memcpy(buf, s, len); /* Ensure the buf is always NUL-terminated, regardless of @s. */ buf[len] = '\0'; return buf; } /** * kstrdup - allocate space for and copy an existing string * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Return: newly allocated copy of @s or %NULL in case of error */ noinline char *kstrdup(const char *s, gfp_t gfp) { return s ? __kmemdup_nul(s, strlen(s), gfp) : NULL; } EXPORT_SYMBOL(kstrdup); /** * kstrdup_const - conditionally duplicate an existing const string * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Note: Strings allocated by kstrdup_const should be freed by kfree_const and * must not be passed to krealloc(). * * Return: source string if it is in .rodata section otherwise * fallback to kstrdup. */ const char *kstrdup_const(const char *s, gfp_t gfp) { if (is_kernel_rodata((unsigned long)s)) return s; return kstrdup(s, gfp); } EXPORT_SYMBOL(kstrdup_const); /** * kstrndup - allocate space for and copy an existing string * @s: the string to duplicate * @max: read at most @max chars from @s * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Note: Use kmemdup_nul() instead if the size is known exactly. * * Return: newly allocated copy of @s or %NULL in case of error */ char *kstrndup(const char *s, size_t max, gfp_t gfp) { return s ? __kmemdup_nul(s, strnlen(s, max), gfp) : NULL; } EXPORT_SYMBOL(kstrndup); /** * kmemdup - duplicate region of memory * * @src: memory region to duplicate * @len: memory region length * @gfp: GFP mask to use * * Return: newly allocated copy of @src or %NULL in case of error, * result is physically contiguous. Use kfree() to free. */ void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp) { void *p; p = kmalloc_node_track_caller_noprof(len, gfp, NUMA_NO_NODE, _RET_IP_); if (p) memcpy(p, src, len); return p; } EXPORT_SYMBOL(kmemdup_noprof); /** * kmemdup_array - duplicate a given array. * * @src: array to duplicate. 
* @count: number of elements to duplicate from array. * @element_size: size of each element of array. * @gfp: GFP mask to use. * * Return: duplicated array of @src or %NULL in case of error, * result is physically contiguous. Use kfree() to free. */ void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp) { return kmemdup(src, size_mul(element_size, count), gfp); } EXPORT_SYMBOL(kmemdup_array); /** * kvmemdup - duplicate region of memory * * @src: memory region to duplicate * @len: memory region length * @gfp: GFP mask to use * * Return: newly allocated copy of @src or %NULL in case of error, * result may be not physically contiguous. Use kvfree() to free. */ void *kvmemdup(const void *src, size_t len, gfp_t gfp) { void *p; p = kvmalloc(len, gfp); if (p) memcpy(p, src, len); return p; } EXPORT_SYMBOL(kvmemdup); /** * kmemdup_nul - Create a NUL-terminated string from unterminated data * @s: The data to stringify * @len: The size of the data * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Return: newly allocated copy of @s with NUL-termination or %NULL in * case of error */ char *kmemdup_nul(const char *s, size_t len, gfp_t gfp) { return s ? __kmemdup_nul(s, len, gfp) : NULL; } EXPORT_SYMBOL(kmemdup_nul); static kmem_buckets *user_buckets __ro_after_init; static int __init init_user_buckets(void) { user_buckets = kmem_buckets_create("memdup_user", 0, 0, INT_MAX, NULL); return 0; } subsys_initcall(init_user_buckets); /** * memdup_user - duplicate memory region from user space * * @src: source address in user space * @len: number of bytes to copy * * Return: an ERR_PTR() on failure. Result is physically * contiguous, to be freed by kfree(). */ void *memdup_user(const void __user *src, size_t len) { void *p; p = kmem_buckets_alloc_track_caller(user_buckets, len, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_user(p, src, len)) { kfree(p); return ERR_PTR(-EFAULT); } return p; } EXPORT_SYMBOL(memdup_user); /** * vmemdup_user - duplicate memory region from user space * * @src: source address in user space * @len: number of bytes to copy * * Return: an ERR_PTR() on failure. Result may be not * physically contiguous. Use kvfree() to free. */ void *vmemdup_user(const void __user *src, size_t len) { void *p; p = kmem_buckets_valloc(user_buckets, len, GFP_USER); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_user(p, src, len)) { kvfree(p); return ERR_PTR(-EFAULT); } return p; } EXPORT_SYMBOL(vmemdup_user); /** * strndup_user - duplicate an existing string from user space * @s: The string to duplicate * @n: Maximum number of bytes to copy, including the trailing NUL. * * Return: newly allocated copy of @s or an ERR_PTR() in case of error */ char *strndup_user(const char __user *s, long n) { char *p; long length; length = strnlen_user(s, n); if (!length) return ERR_PTR(-EFAULT); if (length > n) return ERR_PTR(-EINVAL); p = memdup_user(s, length); if (IS_ERR(p)) return p; p[length - 1] = '\0'; return p; } EXPORT_SYMBOL(strndup_user); /** * memdup_user_nul - duplicate memory region from user space and NUL-terminate * * @src: source address in user space * @len: number of bytes to copy * * Return: an ERR_PTR() on failure. 
*/ void *memdup_user_nul(const void __user *src, size_t len) { char *p; p = kmem_buckets_alloc_track_caller(user_buckets, len + 1, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_user(p, src, len)) { kfree(p); return ERR_PTR(-EFAULT); } p[len] = '\0'; return p; } EXPORT_SYMBOL(memdup_user_nul); /* Check if the vma is being used as a stack by this task */ int vma_is_stack_for_current(const struct vm_area_struct *vma) { struct task_struct * __maybe_unused t = current; return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); } /* * Change backing file, only valid to use during initial VMA setup. */ void vma_set_file(struct vm_area_struct *vma, struct file *file) { /* Changing an anonymous vma with this is illegal */ get_file(file); swap(vma->vm_file, file); fput(file); } EXPORT_SYMBOL(vma_set_file); #ifndef STACK_RND_MASK #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ #endif unsigned long randomize_stack_top(unsigned long stack_top) { unsigned long random_variable = 0; if (current->flags & PF_RANDOMIZE) { random_variable = get_random_long(); random_variable &= STACK_RND_MASK; random_variable <<= PAGE_SHIFT; } #ifdef CONFIG_STACK_GROWSUP return PAGE_ALIGN(stack_top) + random_variable; #else return PAGE_ALIGN(stack_top) - random_variable; #endif } /** * randomize_page - Generate a random, page aligned address * @start: The smallest acceptable address the caller will take. * @range: The size of the area, starting at @start, within which the * random address must fall. * * If @start + @range would overflow, @range is capped. * * NOTE: Historical use of randomize_range, which this replaces, presumed that * @start was already page aligned. We now align it regardless. * * Return: A page aligned address within [start, start + range). On error, * @start is returned. */ unsigned long randomize_page(unsigned long start, unsigned long range) { if (!PAGE_ALIGNED(start)) { range -= PAGE_ALIGN(start) - start; start = PAGE_ALIGN(start); } if (start > ULONG_MAX - range) range = ULONG_MAX - start; range >>= PAGE_SHIFT; if (range == 0) return start; return start + (get_random_long() % range << PAGE_SHIFT); } #ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT unsigned long __weak arch_randomize_brk(struct mm_struct *mm) { /* Is the current task 32bit ? */ if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task()) return randomize_page(mm->brk, SZ_32M); return randomize_page(mm->brk, SZ_1G); } unsigned long arch_mmap_rnd(void) { unsigned long rnd; #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS if (is_compat_task()) rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); else #endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */ rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); return rnd << PAGE_SHIFT; } static int mmap_is_legacy(const struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; /* On parisc the stack always grows up - so a unlimited stack should * not be an indicator to use the legacy memory layout. */ if (rlim_stack->rlim_cur == RLIM_INFINITY && !IS_ENABLED(CONFIG_STACK_GROWSUP)) return 1; return sysctl_legacy_va_layout; } /* * Leave enough space between the mmap area and the stack to honour ulimit in * the face of randomisation. */ #define MIN_GAP (SZ_128M) #define MAX_GAP (STACK_TOP / 6 * 5) static unsigned long mmap_base(const unsigned long rnd, const struct rlimit *rlim_stack) { #ifdef CONFIG_STACK_GROWSUP /* * For an upwards growing stack the calculation is much simpler. 
* Memory for the maximum stack size is reserved at the top of the * task. mmap_base starts directly below the stack and grows * downwards. */ return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd); #else unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_guard_gap; /* Account for stack randomization if necessary */ if (current->flags & PF_RANDOMIZE) pad += (STACK_RND_MASK << PAGE_SHIFT); /* Values close to RLIM_INFINITY can overflow. */ if (gap + pad > gap) gap += pad; if (gap < MIN_GAP && MIN_GAP < MAX_GAP) gap = MIN_GAP; else if (gap > MAX_GAP) gap = MAX_GAP; return PAGE_ALIGN(STACK_TOP - gap - rnd); #endif } void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; if (current->flags & PF_RANDOMIZE) random_factor = arch_mmap_rnd(); if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; mm_flags_clear(MMF_TOPDOWN, mm); } else { mm->mmap_base = mmap_base(random_factor, rlim_stack); mm_flags_set(MMF_TOPDOWN, mm); } } #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; mm_flags_clear(MMF_TOPDOWN, mm); } #endif #ifdef CONFIG_MMU EXPORT_SYMBOL_IF_KUNIT(arch_pick_mmap_layout); #endif /** * __account_locked_vm - account locked pages to an mm's locked_vm * @mm: mm to account against * @pages: number of pages to account * @inc: %true if @pages should be considered positive, %false if not * @task: task used to check RLIMIT_MEMLOCK * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped * * Assumes @task and @mm are valid (i.e. at least one reference on each), and * that mmap_lock is held as writer. * * Return: * * 0 on success * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded. */ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, const struct task_struct *task, bool bypass_rlim) { unsigned long locked_vm, limit; int ret = 0; mmap_assert_write_locked(mm); locked_vm = mm->locked_vm; if (inc) { if (!bypass_rlim) { limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; if (locked_vm + pages > limit) ret = -ENOMEM; } if (!ret) mm->locked_vm = locked_vm + pages; } else { WARN_ON_ONCE(pages > locked_vm); mm->locked_vm = locked_vm - pages; } pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid, (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT, locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK), ret ? " - exceeded" : ""); return ret; } EXPORT_SYMBOL_GPL(__account_locked_vm); /** * account_locked_vm - account locked pages to an mm's locked_vm * @mm: mm to account against, may be NULL * @pages: number of pages to account * @inc: %true if @pages should be considered positive, %false if not * * Assumes a non-NULL @mm is valid (i.e. at least one reference on it). * * Return: * * 0 on success, or if mm is NULL * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded. 
*/ int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc) { int ret; if (pages == 0 || !mm) return 0; mmap_write_lock(mm); ret = __account_locked_vm(mm, pages, inc, current, capable(CAP_IPC_LOCK)); mmap_write_unlock(mm); return ret; } EXPORT_SYMBOL_GPL(account_locked_vm); unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long pgoff) { loff_t off = (loff_t)pgoff << PAGE_SHIFT; unsigned long ret; struct mm_struct *mm = current->mm; unsigned long populate; LIST_HEAD(uf); ret = security_mmap_file(file, prot, flag); if (!ret) ret = fsnotify_mmap_perm(file, prot, off, len); if (!ret) { if (mmap_write_lock_killable(mm)) return -EINTR; ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate, &uf); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(ret, populate); } return ret; } /* * Perform a userland memory mapping into the current process address space. See * the comment for do_mmap() for more details on this operation in general. * * This differs from do_mmap() in that: * * a. An offset parameter is provided rather than pgoff, which is both checked * for overflow and page alignment. * b. mmap locking is performed on the caller's behalf. * c. Userfaultfd unmap events and memory population are handled. * * This means that this function performs essentially the same work as if * userland were invoking mmap (2). * * Returns either an error, or the address at which the requested mapping has * been performed. */ unsigned long vm_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long offset) { if (unlikely(offset + PAGE_ALIGN(len) < offset)) return -EINVAL; if (unlikely(offset_in_page(offset))) return -EINVAL; return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); } EXPORT_SYMBOL(vm_mmap); #ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK /* * Perform a userland memory mapping for a shadow stack into the current * process address space. This is intended to be used by architectures that * support user shadow stacks. */ unsigned long vm_mmap_shadow_stack(unsigned long addr, unsigned long len, unsigned long flags) { struct mm_struct *mm = current->mm; unsigned long ret, unused; vm_flags_t vm_flags = VM_SHADOW_STACK; flags |= MAP_ANONYMOUS | MAP_PRIVATE; if (addr) flags |= MAP_FIXED_NOREPLACE; if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) vm_flags |= VM_NOHUGEPAGE; mmap_write_lock(mm); ret = do_mmap(NULL, addr, len, PROT_READ | PROT_WRITE, flags, vm_flags, 0, &unused, NULL); mmap_write_unlock(mm); return ret; } #endif /* CONFIG_ARCH_HAS_USER_SHADOW_STACK */ /** * __vmalloc_array - allocate memory for a virtually contiguous array. * @n: number of elements. * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) { size_t bytes; if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; return __vmalloc_noprof(bytes, flags); } EXPORT_SYMBOL(__vmalloc_array_noprof); /** * vmalloc_array - allocate memory for a virtually contiguous array. * @n: number of elements. * @size: element size. */ void *vmalloc_array_noprof(size_t n, size_t size) { return __vmalloc_array_noprof(n, size, GFP_KERNEL); } EXPORT_SYMBOL(vmalloc_array_noprof); /** * __vcalloc - allocate and zero memory for a virtually contiguous array. * @n: number of elements. * @size: element size. 
* @flags: the type of memory to allocate (see kmalloc). */ void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) { return __vmalloc_array_noprof(n, size, flags | __GFP_ZERO); } EXPORT_SYMBOL(__vcalloc_noprof); /** * vcalloc - allocate and zero memory for a virtually contiguous array. * @n: number of elements. * @size: element size. */ void *vcalloc_noprof(size_t n, size_t size) { return __vmalloc_array_noprof(n, size, GFP_KERNEL | __GFP_ZERO); } EXPORT_SYMBOL(vcalloc_noprof); struct anon_vma *folio_anon_vma(const struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; if ((mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON) return NULL; return (void *)(mapping - FOLIO_MAPPING_ANON); } /** * folio_mapping - Find the mapping where this folio is stored. * @folio: The folio. * * For folios which are in the page cache, return the mapping that this * page belongs to. Folios in the swap cache return the swap mapping * this page is stored in (which is different from the mapping for the * swap file or swap device where the data is stored). * * You can call this for folios which aren't in the swap cache or page * cache and it will return NULL. */ struct address_space *folio_mapping(const struct folio *folio) { struct address_space *mapping; /* This happens if someone calls flush_dcache_page on slab page */ if (unlikely(folio_test_slab(folio))) return NULL; if (unlikely(folio_test_swapcache(folio))) return swap_address_space(folio->swap); mapping = folio->mapping; if ((unsigned long)mapping & FOLIO_MAPPING_FLAGS) return NULL; return mapping; } EXPORT_SYMBOL(folio_mapping); /** * folio_copy - Copy the contents of one folio to another. * @dst: Folio to copy to. * @src: Folio to copy from. * * The bytes in the folio represented by @src are copied to @dst. * Assumes the caller has validated that @dst is at least as large as @src. * Can be called in atomic context for order-0 folios, but if the folio is * larger, it may sleep. */ void folio_copy(struct folio *dst, struct folio *src) { long i = 0; long nr = folio_nr_pages(src); for (;;) { copy_highpage(folio_page(dst, i), folio_page(src, i)); if (++i == nr) break; cond_resched(); } } EXPORT_SYMBOL(folio_copy); int folio_mc_copy(struct folio *dst, struct folio *src) { long nr = folio_nr_pages(src); long i = 0; for (;;) { if (copy_mc_highpage(folio_page(dst, i), folio_page(src, i))) return -EHWPOISON; if (++i == nr) break; cond_resched(); } return 0; } EXPORT_SYMBOL(folio_mc_copy); int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; static int sysctl_overcommit_ratio __read_mostly = 50; static unsigned long sysctl_overcommit_kbytes __read_mostly; int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ #ifdef CONFIG_SYSCTL static int overcommit_ratio_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret == 0 && write) sysctl_overcommit_kbytes = 0; return ret; } static void sync_overcommit_as(struct work_struct *dummy) { percpu_counter_sync(&vm_committed_as); } static int overcommit_policy_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int new_policy = -1; int ret; /* * The deviation of sync_overcommit_as could be big with loose policy * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. 
When changing policy to * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply * with the strict "NEVER", and to avoid possible race condition (even * though user usually won't too frequently do the switching to policy * OVERCOMMIT_NEVER), the switch is done in the following order: * 1. changing the batch * 2. sync percpu count on each CPU * 3. switch the policy */ if (write) { t = *table; t.data = &new_policy; ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); if (ret || new_policy == -1) return ret; mm_compute_batch(new_policy); if (new_policy == OVERCOMMIT_NEVER) schedule_on_each_cpu(sync_overcommit_as); sysctl_overcommit_memory = new_policy; } else { ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); } return ret; } static int overcommit_kbytes_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) sysctl_overcommit_ratio = 0; return ret; } static const struct ctl_table util_sysctl_table[] = { { .procname = "overcommit_memory", .data = &sysctl_overcommit_memory, .maxlen = sizeof(sysctl_overcommit_memory), .mode = 0644, .proc_handler = overcommit_policy_handler, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "overcommit_ratio", .data = &sysctl_overcommit_ratio, .maxlen = sizeof(sysctl_overcommit_ratio), .mode = 0644, .proc_handler = overcommit_ratio_handler, }, { .procname = "overcommit_kbytes", .data = &sysctl_overcommit_kbytes, .maxlen = sizeof(sysctl_overcommit_kbytes), .mode = 0644, .proc_handler = overcommit_kbytes_handler, }, { .procname = "user_reserve_kbytes", .data = &sysctl_user_reserve_kbytes, .maxlen = sizeof(sysctl_user_reserve_kbytes), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "admin_reserve_kbytes", .data = &sysctl_admin_reserve_kbytes, .maxlen = sizeof(sysctl_admin_reserve_kbytes), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, }; static int __init init_vm_util_sysctls(void) { register_sysctl_init("vm", util_sysctl_table); return 0; } subsys_initcall(init_vm_util_sysctls); #endif /* CONFIG_SYSCTL */ /* * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used */ unsigned long vm_commit_limit(void) { unsigned long allowed; if (sysctl_overcommit_kbytes) allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); else allowed = ((totalram_pages() - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100); allowed += total_swap_pages; return allowed; } /* * Make sure vm_committed_as in one cacheline and not cacheline shared with * other variables. It can be updated by several CPUs frequently. */ struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; /* * The global memory commitment made in the system can be a metric * that can be used to drive ballooning decisions when Linux is hosted * as a guest. On Hyper-V, the host implements a policy engine for dynamically * balancing memory across competing virtual machines that are hosted. * Several metrics drive this policy engine including the guest reported * memory commitment. * * The time cost of this is very low for small platforms, and for big * platform like a 2S/36C/72T Skylake server, in worst case where * vm_committed_as's spinlock is under severe contention, the time cost * could be about 30~40 microseconds. 
*/ unsigned long vm_memory_committed(void) { return percpu_counter_sum_positive(&vm_committed_as); } EXPORT_SYMBOL_GPL(vm_memory_committed); /* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to * succeed and -ENOMEM implies there is not. * * We currently support three overcommit policies, which are set via the * vm.overcommit_memory sysctl. See Documentation/mm/overcommit-accounting.rst * * Strict overcommit modes added 2002 Feb 26 by Alan Cox. * Additional code 2002 Jul 20 by Robert Love. * * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. * * Note this is a helper function intended to be used by LSMs which * wish to use this logic. */ int __vm_enough_memory(const struct mm_struct *mm, long pages, int cap_sys_admin) { long allowed; unsigned long bytes_failed; vm_acct_memory(pages); /* * Sometimes we want to use more memory than we have */ if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) return 0; if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { if (pages > totalram_pages() + total_swap_pages) goto error; return 0; } allowed = vm_commit_limit(); /* * Reserve some for root */ if (!cap_sys_admin) allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); /* * Don't let a single process grow so big a user can't recover */ if (mm) { long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); allowed -= min_t(long, mm->total_vm / 32, reserve); } if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; error: bytes_failed = pages << PAGE_SHIFT; pr_warn_ratelimited("%s: pid: %d, comm: %s, bytes: %lu not enough memory for the allocation\n", __func__, current->pid, current->comm, bytes_failed); vm_unacct_memory(pages); return -ENOMEM; } /** * get_cmdline() - copy the cmdline value to a buffer. * @task: the task whose cmdline value to copy. * @buffer: the buffer to copy to. * @buflen: the length of the buffer. Larger cmdline values are truncated * to this length. * * Return: the size of the cmdline field copied. Note that the copy does * not guarantee an ending NULL byte. */ int get_cmdline(struct task_struct *task, char *buffer, int buflen) { int res = 0; unsigned int len; struct mm_struct *mm = get_task_mm(task); unsigned long arg_start, arg_end, env_start, env_end; if (!mm) goto out; if (!mm->arg_end) goto out_mm; /* Shh! No looking before we're done */ spin_lock(&mm->arg_lock); arg_start = mm->arg_start; arg_end = mm->arg_end; env_start = mm->env_start; env_end = mm->env_end; spin_unlock(&mm->arg_lock); len = arg_end - arg_start; if (len > buflen) len = buflen; res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE); /* * If the nul at the end of args has been overwritten, then * assume application is using setproctitle(3). */ if (res > 0 && buffer[res-1] != '\0' && len < buflen) { len = strnlen(buffer, res); if (len < res) { res = len; } else { len = env_end - env_start; if (len > buflen - res) len = buflen - res; res += access_process_vm(task, env_start, buffer+res, len, FOLL_FORCE); res = strnlen(buffer, res); } } out_mm: mmput(mm); out: return res; } int __weak memcmp_pages(struct page *page1, struct page *page2) { char *addr1, *addr2; int ret; addr1 = kmap_local_page(page1); addr2 = kmap_local_page(page2); ret = memcmp(addr1, addr2, PAGE_SIZE); kunmap_local(addr2); kunmap_local(addr1); return ret; } #ifdef CONFIG_PRINTK /** * mem_dump_obj - Print available provenance information * @object: object for which to find provenance information. 
* * This function uses pr_cont(), so that the caller is expected to have * printed out whatever preamble is appropriate. The provenance information * depends on the type of object and on how much debugging is enabled. * For example, for a slab-cache object, the slab name is printed, and, * if available, the return address and stack trace from the allocation * and last free path of that object. */ void mem_dump_obj(void *object) { const char *type; if (kmem_dump_obj(object)) return; if (vmalloc_dump_obj(object)) return; if (is_vmalloc_addr(object)) type = "vmalloc memory"; else if (virt_addr_valid(object)) type = "non-slab/vmalloc memory"; else if (object == NULL) type = "NULL pointer"; else if (object == ZERO_SIZE_PTR) type = "zero-size pointer"; else type = "non-paged memory"; pr_cont(" %s\n", type); } EXPORT_SYMBOL_GPL(mem_dump_obj); #endif /* * A driver might set a page logically offline -- PageOffline() -- and * turn the page inaccessible in the hypervisor; after that, access to page * content can be fatal. * * Some special PFN walkers -- i.e., /proc/kcore -- read content of random * pages after checking PageOffline(); however, these PFN walkers can race * with drivers that set PageOffline(). * * page_offline_freeze()/page_offline_thaw() allows for a subsystem to * synchronize with such drivers, achieving that a page cannot be set * PageOffline() while frozen. * * page_offline_begin()/page_offline_end() is used by drivers that care about * such races when setting a page PageOffline(). */ static DECLARE_RWSEM(page_offline_rwsem); void page_offline_freeze(void) { down_read(&page_offline_rwsem); } void page_offline_thaw(void) { up_read(&page_offline_rwsem); } void page_offline_begin(void) { down_write(&page_offline_rwsem); } EXPORT_SYMBOL(page_offline_begin); void page_offline_end(void) { up_write(&page_offline_rwsem); } EXPORT_SYMBOL(page_offline_end); #ifndef flush_dcache_folio void flush_dcache_folio(struct folio *folio) { long i, nr = folio_nr_pages(folio); for (i = 0; i < nr; i++) flush_dcache_page(folio_page(folio, i)); } EXPORT_SYMBOL(flush_dcache_folio); #endif /** * compat_set_desc_from_vma() - assigns VMA descriptor @desc fields from a VMA. * @desc: A VMA descriptor whose fields need to be set. * @file: The file object describing the file being mmap()'d. * @vma: The VMA whose fields we wish to assign to @desc. * * This is a compatibility function to allow an mmap() hook to call * mmap_prepare() hooks when drivers nest these. This function specifically * allows the construction of a vm_area_desc value, @desc, from a VMA @vma for * the purposes of doing this. * * Once the conversion of drivers is complete this function will no longer be * required and will be removed. */ void compat_set_desc_from_vma(struct vm_area_desc *desc, const struct file *file, const struct vm_area_struct *vma) { memset(desc, 0, sizeof(*desc)); desc->mm = vma->vm_mm; desc->file = (struct file *)file; desc->start = vma->vm_start; desc->end = vma->vm_end; desc->pgoff = vma->vm_pgoff; desc->vm_file = vma->vm_file; desc->vma_flags = vma->flags; desc->page_prot = vma->vm_page_prot; /* Default. */ desc->action.type = MMAP_NOTHING; } EXPORT_SYMBOL(compat_set_desc_from_vma); /** * __compat_vma_mmap() - Similar to compat_vma_mmap(), only it allows * flexibility as to how the mmap_prepare callback is invoked, which is useful * for drivers which invoke nested mmap_prepare callbacks in an mmap() hook. * @desc: A VMA descriptor upon which an mmap_prepare() hook has already been * executed. 
* @vma: The VMA to which @desc should be applied. * * The function assumes that you have obtained a VMA descriptor @desc from * compat_set_desc_from_vma(), and already executed the mmap_prepare() hook upon * it. * * It then performs any specified mmap actions, and invokes the vm_ops->mapped() * hook if one is present. * * See the description of compat_vma_mmap() for more details. * * Once the conversion of drivers is complete this function will no longer be * required and will be removed. * * Returns: 0 on success or error. */ int __compat_vma_mmap(struct vm_area_desc *desc, struct vm_area_struct *vma) { int err; /* Perform any preparatory tasks for mmap action. */ err = mmap_action_prepare(desc); if (err) return err; /* Update the VMA from the descriptor. */ compat_set_vma_from_desc(vma, desc); /* Complete any specified mmap actions. */ return mmap_action_complete(vma, &desc->action); } EXPORT_SYMBOL(__compat_vma_mmap); /** * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an * existing VMA and execute any requested actions. * @file: The file which possesss an f_op->mmap_prepare() hook. * @vma: The VMA to apply the .mmap_prepare() hook to. * * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain * stacked drivers invoke a nested mmap hook of an underlying file. * * Until all drivers are converted to use .mmap_prepare(), we must be * conservative and continue to invoke these stacked drivers using the * deprecated .mmap() hook. * * However we have a problem if the underlying file system possesses an * .mmap_prepare() hook, as we are in a different context when we invoke the * .mmap() hook, already having a VMA to deal with. * * compat_vma_mmap() is a compatibility function that takes VMA state, * establishes a struct vm_area_desc descriptor, passes to the underlying * .mmap_prepare() hook and applies any changes performed by it. * * Once the conversion of drivers is complete this function will no longer be * required and will be removed. * * Returns: 0 on success or error. */ int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) { struct vm_area_desc desc; struct mmap_action *action; int err; compat_set_desc_from_vma(&desc, file, vma); err = vfs_mmap_prepare(file, &desc); if (err) return err; action = &desc.action; /* being invoked from .mmmap means we don't have to enforce this. */ action->hide_from_rmap_until_complete = false; return __compat_vma_mmap(&desc, vma); } EXPORT_SYMBOL(compat_vma_mmap); static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio, const struct page *page) { /* * Only the first page of a high-order buddy page has PageBuddy() set. * So we have to check manually whether this page is part of a high- * order buddy page. */ if (PageBuddy(page)) ps->flags |= PAGE_SNAPSHOT_PG_BUDDY; else if (page_count(page) == 0 && is_free_buddy_page(page)) ps->flags |= PAGE_SNAPSHOT_PG_BUDDY; if (folio_test_idle(folio)) ps->flags |= PAGE_SNAPSHOT_PG_IDLE; } /** * snapshot_page() - Create a snapshot of a struct page * @ps: Pointer to a struct page_snapshot to store the page snapshot * @page: The page to snapshot * * Create a snapshot of the page and store both its struct page and struct * folio representations in @ps. * * A snapshot is marked as "faithful" if the compound state of @page was * stable and allowed safe reconstruction of the folio representation. In * rare cases where this is not possible (e.g. 
due to folio splitting), * snapshot_page() falls back to treating @page as a single page and the * snapshot is marked as "unfaithful". The snapshot_page_is_faithful() * helper can be used to check for this condition. */ void snapshot_page(struct page_snapshot *ps, const struct page *page) { unsigned long info, nr_pages = 1; struct folio *foliop; int loops = 5; ps->pfn = page_to_pfn(page); ps->flags = PAGE_SNAPSHOT_FAITHFUL; again: memset(&ps->folio_snapshot, 0, sizeof(struct folio)); memcpy(&ps->page_snapshot, page, sizeof(*page)); info = ps->page_snapshot.compound_info; if (!(info & 1)) { ps->idx = 0; foliop = (struct folio *)&ps->page_snapshot; if (!folio_test_large(foliop)) { set_ps_flags(ps, page_folio(page), page); memcpy(&ps->folio_snapshot, foliop, sizeof(struct page)); return; } foliop = (struct folio *)page; } else { /* See compound_head() */ if (compound_info_has_mask()) { unsigned long p = (unsigned long)page; foliop = (struct folio *)(p & info); } else { foliop = (struct folio *)(info - 1); } ps->idx = folio_page_idx(foliop, page); } if (ps->idx < MAX_FOLIO_NR_PAGES) { memcpy(&ps->folio_snapshot, foliop, 2 * sizeof(struct page)); nr_pages = folio_nr_pages(&ps->folio_snapshot); if (nr_pages > 1) memcpy(&ps->folio_snapshot.__page_2, &foliop->__page_2, sizeof(struct page)); set_ps_flags(ps, foliop, page); } if (ps->idx > nr_pages) { if (loops-- > 0) goto again; clear_compound_head(&ps->page_snapshot); foliop = (struct folio *)&ps->page_snapshot; memcpy(&ps->folio_snapshot, foliop, sizeof(struct page)); ps->flags = 0; ps->idx = 0; } } static int call_vma_mapped(struct vm_area_struct *vma) { const struct vm_operations_struct *vm_ops = vma->vm_ops; void *vm_private_data = vma->vm_private_data; int err; if (!vm_ops || !vm_ops->mapped) return 0; err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff, vma->vm_file, &vm_private_data); if (err) return err; if (vm_private_data != vma->vm_private_data) vma->vm_private_data = vm_private_data; return 0; } static int mmap_action_finish(struct vm_area_struct *vma, struct mmap_action *action, int err) { size_t len; if (!err) err = call_vma_mapped(vma); if (!err && action->success_hook) err = action->success_hook(vma); /* do_munmap() might take rmap lock, so release if held. */ maybe_rmap_unlock_action(vma, action); if (!err) return 0; /* * If an error occurs, unmap the VMA altogether and return an error. We * only clear the newly allocated VMA, since this function is only * invoked if we do NOT merge, so we only clean up the VMA we created. */ len = vma_pages(vma) << PAGE_SHIFT; do_munmap(current->mm, vma->vm_start, len, NULL); if (action->error_hook) { /* We may want to filter the error. */ err = action->error_hook(err); /* The caller should not clear the error. */ VM_WARN_ON_ONCE(!err); } return err; } #ifdef CONFIG_MMU /** * mmap_action_prepare - Perform preparatory setup for an VMA descriptor * action which need to be performed. * @desc: The VMA descriptor to prepare for its @desc->action. * * Returns: %0 on success, otherwise error. */ int mmap_action_prepare(struct vm_area_desc *desc) { switch (desc->action.type) { case MMAP_NOTHING: return 0; case MMAP_REMAP_PFN: return remap_pfn_range_prepare(desc); case MMAP_IO_REMAP_PFN: return io_remap_pfn_range_prepare(desc); case MMAP_SIMPLE_IO_REMAP: return simple_ioremap_prepare(desc); case MMAP_MAP_KERNEL_PAGES: return map_kernel_pages_prepare(desc); } WARN_ON_ONCE(1); return -EINVAL; } EXPORT_SYMBOL(mmap_action_prepare); /** * mmap_action_complete - Execute VMA descriptor action. 
* @vma: The VMA to perform the action upon. * @action: The action to perform. * * Similar to mmap_action_prepare(). * * Return: 0 on success, or error, at which point the VMA will be unmapped. */ int mmap_action_complete(struct vm_area_struct *vma, struct mmap_action *action) { int err = 0; switch (action->type) { case MMAP_NOTHING: break; case MMAP_REMAP_PFN: err = remap_pfn_range_complete(vma, action); break; case MMAP_MAP_KERNEL_PAGES: err = map_kernel_pages_complete(vma, action); break; case MMAP_IO_REMAP_PFN: case MMAP_SIMPLE_IO_REMAP: /* Should have been delegated. */ WARN_ON_ONCE(1); err = -EINVAL; break; } return mmap_action_finish(vma, action, err); } EXPORT_SYMBOL(mmap_action_complete); #else int mmap_action_prepare(struct vm_area_desc *desc) { switch (desc->action.type) { case MMAP_NOTHING: break; case MMAP_REMAP_PFN: case MMAP_IO_REMAP_PFN: case MMAP_SIMPLE_IO_REMAP: case MMAP_MAP_KERNEL_PAGES: WARN_ON_ONCE(1); /* nommu cannot handle these. */ break; } return 0; } EXPORT_SYMBOL(mmap_action_prepare); int mmap_action_complete(struct vm_area_struct *vma, struct mmap_action *action) { int err = 0; switch (action->type) { case MMAP_NOTHING: break; case MMAP_REMAP_PFN: case MMAP_IO_REMAP_PFN: case MMAP_SIMPLE_IO_REMAP: case MMAP_MAP_KERNEL_PAGES: WARN_ON_ONCE(1); /* nommu cannot handle this. */ err = -EINVAL; break; } return mmap_action_finish(vma, action, err); } EXPORT_SYMBOL(mmap_action_complete); #endif #ifdef CONFIG_MMU /** * folio_pte_batch - detect a PTE batch for a large folio * @folio: The large folio to detect a PTE batch for. * @ptep: Page table pointer for the first entry. * @pte: Page table entry for the first page. * @max_nr: The maximum number of table entries to consider. * * This is a simplified variant of folio_pte_batch_flags(). * * Detect a PTE batch: consecutive (present) PTEs that map consecutive * pages of the same large folio in a single VMA and a single page table. * * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, * the accessed bit, writable bit, dirt-bit and soft-dirty bit. * * ptep must map any page of the folio. max_nr must be at least one and * must be limited by the caller so scanning cannot exceed a single VMA and * a single page table. * * Return: the number of table entries in the batch. */ unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte, unsigned int max_nr) { return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, 0); } #endif /* CONFIG_MMU */ #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) /** * page_range_contiguous - test whether the page range is contiguous * @page: the start of the page range. * @nr_pages: the number of pages in the range. * * Test whether the page range is contiguous, such that they can be iterated * naively, corresponding to iterating a contiguous PFN range. * * This function should primarily only be used for debug checks, or when * working with page ranges that are not naturally contiguous (e.g., pages * within a folio are). * * Returns true if contiguous, otherwise false. */ bool page_range_contiguous(const struct page *page, unsigned long nr_pages) { const unsigned long start_pfn = page_to_pfn(page); const unsigned long end_pfn = start_pfn + nr_pages; unsigned long pfn; /* * The memmap is allocated per memory section, so no need to check * within the first section. However, we need to check each other * spanned memory section once, making sure the first page in a * section could similarly be reached by just iterating pages. 
	 */
	for (pfn = ALIGN(start_pfn, PAGES_PER_SECTION); pfn < end_pfn;
	     pfn += PAGES_PER_SECTION)
		if (unlikely(page + (pfn - start_pfn) != pfn_to_page(pfn)))
			return false;
	return true;
}
EXPORT_SYMBOL(page_range_contiguous);
#endif
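/*
 * Illustrative usage sketch (not part of mm/util.c): a typical driver
 * write() handler built on the user-copy helpers above. It copies and
 * NUL-terminates a user buffer with memdup_user_nul() and keeps a
 * kmalloc()-backed copy via kstrdup(). The names example_write and
 * example_label are hypothetical, shown only to clarify the calling
 * convention; the block is compiled out so it remains documentation only.
 */
#if 0
static char *example_label;

static ssize_t example_write(struct file *file, const char __user *ubuf,
			     size_t count, loff_t *ppos)
{
	char *kbuf, *label;

	if (count > PAGE_SIZE)
		return -EINVAL;

	/* Copy from user space and NUL-terminate in one step. */
	kbuf = memdup_user_nul(ubuf, count);
	if (IS_ERR(kbuf))
		return PTR_ERR(kbuf);

	/* Keep a private, kmalloc()-backed copy of the (trimmed) string. */
	label = kstrdup(strim(kbuf), GFP_KERNEL);
	kfree(kbuf);
	if (!label)
		return -ENOMEM;

	kfree(example_label);
	example_label = label;

	return count;
}
#endif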
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/fs.h>

#define DEVCG_ACC_MKNOD 1
#define DEVCG_ACC_READ  2
#define DEVCG_ACC_WRITE 4
#define DEVCG_ACC_MASK (DEVCG_ACC_MKNOD | DEVCG_ACC_READ | DEVCG_ACC_WRITE)

#define DEVCG_DEV_BLOCK 1
#define DEVCG_DEV_CHAR  2
#define DEVCG_DEV_ALL   4  /* this represents all devices */

#if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF)
int devcgroup_check_permission(short type, u32 major, u32 minor,
			       short access);

static inline int devcgroup_inode_permission(struct inode *inode, int mask)
{
	short type, access = 0;

	if (likely(!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode)))
		return 0;

	if (!inode->i_rdev)
		return 0;

	if (S_ISBLK(inode->i_mode))
		type = DEVCG_DEV_BLOCK;
	else /* S_ISCHR by the test above */
		type = DEVCG_DEV_CHAR;

	if (mask & MAY_WRITE)
		access |= DEVCG_ACC_WRITE;
	if (mask & MAY_READ)
		access |= DEVCG_ACC_READ;

	return devcgroup_check_permission(type, imajor(inode), iminor(inode),
					  access);
}

static inline int devcgroup_inode_mknod(int mode, dev_t dev)
{
	short type;

	if (!S_ISBLK(mode) && !S_ISCHR(mode))
		return 0;

	if (S_ISCHR(mode) && dev == WHITEOUT_DEV)
		return 0;

	if (S_ISBLK(mode))
		type = DEVCG_DEV_BLOCK;
	else
		type = DEVCG_DEV_CHAR;

	return devcgroup_check_permission(type, MAJOR(dev), MINOR(dev),
					  DEVCG_ACC_MKNOD);
}

#else
static inline int devcgroup_check_permission(short type, u32 major, u32 minor,
					     short access)
{
	return 0;
}

static inline int devcgroup_inode_permission(struct inode *inode, int mask)
{
	return 0;
}

static inline int devcgroup_inode_mknod(int mode, dev_t dev)
{
	return 0;
}
#endif
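/*
 * Illustrative usage sketch (not part of this header): the intended
 * callers are VFS paths that open device inodes or create device nodes.
 * The helpers below are hypothetical and only demonstrate the calling
 * convention of the two inline wrappers; the block is compiled out so
 * it remains documentation only.
 */
#if 0
static int example_device_open_check(struct inode *inode, int mask)
{
	/* Non-device inodes pass straight through with 0 inside the wrapper. */
	return devcgroup_inode_permission(inode, mask);
}

static int example_mknod_check(int mode, dev_t dev)
{
	/* Returns 0 if the device cgroup permits creating this node. */
	return devcgroup_inode_mknod(mode, dev);
}
#endif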
3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 
4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 
5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 
5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		reachable. otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
*/ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/capability.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/types.h> #include <linux/times.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/route.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/mroute6.h> #include <linux/init.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/jhash.h> #include <linux/siphash.h> #include <net/net_namespace.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/tcp.h> #include <linux/rtnetlink.h> #include <net/dst.h> #include <net/dst_metadata.h> #include <net/xfrm.h> #include <net/netevent.h> #include <net/netlink.h> #include <net/rtnh.h> #include <net/lwtunnel.h> #include <net/ip_tunnels.h> #include <net/l3mdev.h> #include <net/ip.h> #include <linux/uaccess.h> #include <linux/btf_ids.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif static int ip6_rt_type_to_error(u8 fib6_type); #define CREATE_TRACE_POINTS #include <trace/events/fib6.h> EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup); #undef CREATE_TRACE_POINTS enum rt6_nud_state { RT6_NUD_FAIL_HARD = -3, RT6_NUD_FAIL_PROBE = -2, RT6_NUD_FAIL_DO_RR = -1, RT6_NUD_SUCCEED = 1 }; INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ip6_default_advmss(const struct dst_entry *dst); INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst); static void ip6_negative_advice(struct sock *sk, struct dst_entry *dst); static void ip6_dst_destroy(struct dst_entry *); static void ip6_dst_ifdown(struct dst_entry *, struct net_device *dev); static void ip6_dst_gc(struct dst_ops *ops); static int ip6_pkt_discard(struct sk_buff *skb); static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); static int ip6_pkt_prohibit(struct sk_buff *skb); static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb); static void ip6_link_failure(struct sk_buff *skb); static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh); static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, int strict); static size_t rt6_nlmsg_size(struct fib6_info *f6i); static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct fib6_info *rt, struct dst_entry *dst, struct in6_addr *dest, struct in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags); static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, const struct in6_addr *daddr, const struct in6_addr *saddr); #ifdef CONFIG_IPV6_ROUTE_INFO static struct fib6_info *rt6_add_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref); static struct fib6_info *rt6_get_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev); #endif struct uncached_list { spinlock_t lock; struct list_head head; }; static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list); void rt6_uncached_list_add(struct rt6_info *rt) { struct uncached_list *ul = 
raw_cpu_ptr(&rt6_uncached_list); rt->dst.rt_uncached_list = ul; spin_lock_bh(&ul->lock); list_add_tail(&rt->dst.rt_uncached, &ul->head); spin_unlock_bh(&ul->lock); } void rt6_uncached_list_del(struct rt6_info *rt) { struct uncached_list *ul = rt->dst.rt_uncached_list; if (ul) { spin_lock_bh(&ul->lock); list_del_init(&rt->dst.rt_uncached); spin_unlock_bh(&ul->lock); } } static void rt6_uncached_list_flush_dev(struct net_device *dev) { int cpu; for_each_possible_cpu(cpu) { struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); struct rt6_info *rt, *safe; if (list_empty(&ul->head)) continue; spin_lock_bh(&ul->lock); list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) { struct inet6_dev *rt_idev = rt->rt6i_idev; struct net_device *rt_dev = rt->dst.dev; bool handled = false; if (rt_idev && rt_idev->dev == dev) { rt->rt6i_idev = in6_dev_get(blackhole_netdev); in6_dev_put(rt_idev); handled = true; } if (rt_dev == dev) { rt->dst.dev = blackhole_netdev; netdev_ref_replace(rt_dev, blackhole_netdev, &rt->dst.dev_tracker, GFP_ATOMIC); handled = true; } if (handled) list_del_init(&rt->dst.rt_uncached); } spin_unlock_bh(&ul->lock); } } static inline const void *choose_neigh_daddr(const struct in6_addr *p, struct sk_buff *skb, const void *daddr) { if (!ipv6_addr_any(p)) return (const void *) p; else if (skb) return &ipv6_hdr(skb)->daddr; return daddr; } struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, struct net_device *dev, struct sk_buff *skb, const void *daddr) { struct neighbour *n; daddr = choose_neigh_daddr(gw, skb, daddr); n = __ipv6_neigh_lookup(dev, daddr); if (n) return n; n = neigh_create(&nd_tbl, daddr, dev); return IS_ERR(n) ? NULL : n; } static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { const struct rt6_info *rt = dst_rt6_info(dst); return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any), dst_dev(dst), skb, daddr); } static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) { const struct rt6_info *rt = dst_rt6_info(dst); struct net_device *dev = dst_dev(dst); daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr); if (!daddr) return; if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) return; if (ipv6_addr_is_multicast((const struct in6_addr *)daddr)) return; __ipv6_confirm_neigh(dev, daddr); } static struct dst_ops ip6_dst_ops_template = { .family = AF_INET6, .gc = ip6_dst_gc, .gc_thresh = 1024, .check = ip6_dst_check, .default_advmss = ip6_default_advmss, .mtu = ip6_mtu, .cow_metrics = dst_cow_metrics_generic, .destroy = ip6_dst_destroy, .ifdown = ip6_dst_ifdown, .negative_advice = ip6_negative_advice, .link_failure = ip6_link_failure, .update_pmtu = ip6_rt_update_pmtu, .redirect = rt6_do_redirect, .local_out = __ip6_local_out, .neigh_lookup = ip6_dst_neigh_lookup, .confirm_neigh = ip6_confirm_neigh, }; static struct dst_ops ip6_dst_blackhole_ops = { .family = AF_INET6, .default_advmss = ip6_default_advmss, .neigh_lookup = ip6_dst_neigh_lookup, .check = ip6_dst_check, .destroy = ip6_dst_destroy, .cow_metrics = dst_cow_metrics_generic, .update_pmtu = dst_blackhole_update_pmtu, .redirect = dst_blackhole_redirect, .mtu = dst_blackhole_mtu, }; static const u32 ip6_template_metrics[RTAX_MAX] = { [RTAX_HOPLIMIT - 1] = 0, }; static const struct fib6_info fib6_null_entry_template = { .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP), .fib6_protocol = RTPROT_KERNEL, .fib6_metric = ~(u32)0, .fib6_ref = REFCOUNT_INIT(1), .fib6_type = RTN_UNREACHABLE, .fib6_metrics = (struct 
dst_metrics *)&dst_default_metrics, }; static const struct rt6_info ip6_null_entry_template = { .dst = { .__rcuref = RCUREF_INIT(1), .__use = 1, .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -ENETUNREACH, .input = ip6_pkt_discard, .output = ip6_pkt_discard_out, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), }; #ifdef CONFIG_IPV6_MULTIPLE_TABLES static const struct rt6_info ip6_prohibit_entry_template = { .dst = { .__rcuref = RCUREF_INIT(1), .__use = 1, .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -EACCES, .input = ip6_pkt_prohibit, .output = ip6_pkt_prohibit_out, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), }; static const struct rt6_info ip6_blk_hole_entry_template = { .dst = { .__rcuref = RCUREF_INIT(1), .__use = 1, .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -EINVAL, .input = dst_discard, .output = dst_discard_out, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), }; #endif static void rt6_info_init(struct rt6_info *rt) { memset_after(rt, 0, dst); } /* allocate dst with ip6_dst_ops */ struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, int flags) { struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, DST_OBSOLETE_FORCE_CHK, flags); if (rt) { rt6_info_init(rt); atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); } return rt; } EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = dst_rt6_info(dst); struct fib6_info *from; struct inet6_dev *idev; ip_dst_metrics_put(dst); rt6_uncached_list_del(rt); idev = rt->rt6i_idev; if (idev) { rt->rt6i_idev = NULL; in6_dev_put(idev); } from = unrcu_pointer(xchg(&rt->from, NULL)); fib6_info_release(from); } static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { struct rt6_info *rt = dst_rt6_info(dst); struct inet6_dev *idev = rt->rt6i_idev; struct fib6_info *from; if (idev && idev->dev != blackhole_netdev) { struct inet6_dev *blackhole_idev = in6_dev_get(blackhole_netdev); if (blackhole_idev) { rt->rt6i_idev = blackhole_idev; in6_dev_put(idev); } } from = unrcu_pointer(xchg(&rt->from, NULL)); fib6_info_release(from); } static bool __rt6_check_expired(const struct rt6_info *rt) { if (rt->rt6i_flags & RTF_EXPIRES) return time_after(jiffies, READ_ONCE(rt->dst.expires)); return false; } static bool rt6_check_expired(const struct rt6_info *rt) { struct fib6_info *from; from = rcu_dereference(rt->from); if (rt->rt6i_flags & RTF_EXPIRES) { if (time_after(jiffies, READ_ONCE(rt->dst.expires))) return true; } else if (from) { return READ_ONCE(rt->dst.obsolete) != DST_OBSOLETE_FORCE_CHK || fib6_check_expired(from); } return false; } static struct fib6_info * rt6_multipath_first_sibling_rcu(const struct fib6_info *rt) { struct fib6_info *iter; struct fib6_node *fn; fn = rcu_dereference(rt->fib6_node); if (!fn) goto out; iter = rcu_dereference(fn->leaf); if (!iter) goto out; while (iter) { if (iter->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(iter)) return iter; iter = rcu_dereference(iter->fib6_next); } out: return NULL; } void fib6_select_path(const struct net *net, struct fib6_result *res, struct flowi6 *fl6, int oif, bool have_oif_match, const struct sk_buff *skb, int strict) { struct fib6_info *first, *match = res->f6i; struct fib6_info *sibling; int hash; if (!match->nh && (!match->fib6_nsiblings || have_oif_match)) goto out; if (match->nh && have_oif_match && res->nh) return; if (skb) IP6CB(skb)->flags |= IP6SKB_MULTIPATH; /* We might have already computed the hash for ICMPv6 errors. In such * case it will always be non-zero. 
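 * (For an ICMPv6 error the hash is derived from the embedded, offending
 * packet rather than from the error itself, so the error is steered to the
 * same sibling nexthop as the flow that triggered it.)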
Otherwise now is the time to do it. */ if (!fl6->mp_hash && (!match->nh || nexthop_is_multipath(match->nh))) fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); if (unlikely(match->nh)) { nexthop_path_fib6_result(res, fl6->mp_hash); return; } first = rt6_multipath_first_sibling_rcu(match); if (!first) goto out; hash = fl6->mp_hash; if (hash <= atomic_read(&first->fib6_nh->fib_nh_upper_bound)) { if (rt6_score_route(first->fib6_nh, first->fib6_flags, oif, strict) >= 0) match = first; goto out; } list_for_each_entry_rcu(sibling, &first->fib6_siblings, fib6_siblings) { const struct fib6_nh *nh = sibling->fib6_nh; int nh_upper_bound; nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound); if (hash > nh_upper_bound) continue; if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0) break; match = sibling; break; } out: res->f6i = match; res->nh = match->fib6_nh; } /* * Route lookup. rcu_read_lock() should be held. */ static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh, const struct in6_addr *saddr, int oif, int flags) { const struct net_device *dev; if (nh->fib_nh_flags & RTNH_F_DEAD) return false; dev = nh->fib_nh_dev; if (oif) { if (dev->ifindex == oif) return true; } else { if (ipv6_chk_addr(net, saddr, dev, flags & RT6_LOOKUP_F_IFACE)) return true; } return false; } struct fib6_nh_dm_arg { struct net *net; const struct in6_addr *saddr; int oif; int flags; struct fib6_nh *nh; }; static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg) { struct fib6_nh_dm_arg *arg = _arg; arg->nh = nh; return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif, arg->flags); } /* returns fib6_nh from nexthop or NULL */ static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh, struct fib6_result *res, const struct in6_addr *saddr, int oif, int flags) { struct fib6_nh_dm_arg arg = { .net = net, .saddr = saddr, .oif = oif, .flags = flags, }; if (nexthop_is_blackhole(nh)) return NULL; if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg)) return arg.nh; return NULL; } static void rt6_device_match(struct net *net, struct fib6_result *res, const struct in6_addr *saddr, int oif, int flags) { struct fib6_info *f6i = res->f6i; struct fib6_info *spf6i; struct fib6_nh *nh; if (!oif && ipv6_addr_any(saddr)) { if (unlikely(f6i->nh)) { nh = nexthop_fib6_nh(f6i->nh); if (nexthop_is_blackhole(f6i->nh)) goto out_blackhole; } else { nh = f6i->fib6_nh; } if (!(nh->fib_nh_flags & RTNH_F_DEAD)) goto out; } for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) { bool matched = false; if (unlikely(spf6i->nh)) { nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr, oif, flags); if (nh) matched = true; } else { nh = spf6i->fib6_nh; if (__rt6_device_match(net, nh, saddr, oif, flags)) matched = true; } if (matched) { res->f6i = spf6i; goto out; } } if (oif && flags & RT6_LOOKUP_F_IFACE) { res->f6i = net->ipv6.fib6_null_entry; nh = res->f6i->fib6_nh; goto out; } if (unlikely(f6i->nh)) { nh = nexthop_fib6_nh(f6i->nh); if (nexthop_is_blackhole(f6i->nh)) goto out_blackhole; } else { nh = f6i->fib6_nh; } if (nh->fib_nh_flags & RTNH_F_DEAD) { res->f6i = net->ipv6.fib6_null_entry; nh = res->f6i->fib6_nh; } out: res->nh = nh; res->fib6_type = res->f6i->fib6_type; res->fib6_flags = res->f6i->fib6_flags; return; out_blackhole: res->fib6_flags |= RTF_REJECT; res->fib6_type = RTN_BLACKHOLE; res->nh = nh; } #ifdef CONFIG_IPV6_ROUTER_PREF struct __rt6_probe_work { struct work_struct work; struct in6_addr target; struct net_device *dev; netdevice_tracker dev_tracker; }; static void 
rt6_probe_deferred(struct work_struct *w) { struct in6_addr mcaddr; struct __rt6_probe_work *work = container_of(w, struct __rt6_probe_work, work); addrconf_addr_solict_mult(&work->target, &mcaddr); ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0); netdev_put(work->dev, &work->dev_tracker); kfree(work); } static void rt6_probe(struct fib6_nh *fib6_nh) { struct __rt6_probe_work *work = NULL; const struct in6_addr *nh_gw; unsigned long last_probe; struct neighbour *neigh; struct net_device *dev; struct inet6_dev *idev; /* * Okay, this does not seem to be appropriate * for now, however, we need to check if it * is really so; aka Router Reachability Probing. * * Router Reachability Probe MUST be rate-limited * to no more than one per minute. */ if (!fib6_nh->fib_nh_gw_family) return; nh_gw = &fib6_nh->fib_nh_gw6; dev = fib6_nh->fib_nh_dev; rcu_read_lock(); last_probe = READ_ONCE(fib6_nh->last_probe); idev = __in6_dev_get(dev); if (!idev) goto out; neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); if (neigh) { if (READ_ONCE(neigh->nud_state) & NUD_VALID) goto out; write_lock_bh(&neigh->lock); if (!(neigh->nud_state & NUD_VALID) && time_after(jiffies, neigh->updated + READ_ONCE(idev->cnf.rtr_probe_interval))) { work = kmalloc_obj(*work, GFP_ATOMIC); if (work) __neigh_set_probe_once(neigh); } write_unlock_bh(&neigh->lock); } else if (time_after(jiffies, last_probe + READ_ONCE(idev->cnf.rtr_probe_interval))) { work = kmalloc_obj(*work, GFP_ATOMIC); } if (!work || cmpxchg(&fib6_nh->last_probe, last_probe, jiffies) != last_probe) { kfree(work); } else { INIT_WORK(&work->work, rt6_probe_deferred); work->target = *nh_gw; netdev_hold(dev, &work->dev_tracker, GFP_ATOMIC); work->dev = dev; schedule_work(&work->work); } out: rcu_read_unlock(); } #else static inline void rt6_probe(struct fib6_nh *fib6_nh) { } #endif /* * Default Router Selection (RFC 2461 6.3.6) */ static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh) { enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; struct neighbour *neigh; rcu_read_lock(); neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev, &fib6_nh->fib_nh_gw6); if (neigh) { u8 nud_state = READ_ONCE(neigh->nud_state); if (nud_state & NUD_VALID) ret = RT6_NUD_SUCCEED; #ifdef CONFIG_IPV6_ROUTER_PREF else if (!(nud_state & NUD_FAILED)) ret = RT6_NUD_SUCCEED; else ret = RT6_NUD_FAIL_PROBE; #endif } else { ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ? 
RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR; } rcu_read_unlock(); return ret; } static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, int strict) { int m = 0; if (!oif || nh->fib_nh_dev->ifindex == oif) m = 2; if (!m && (strict & RT6_LOOKUP_F_IFACE)) return RT6_NUD_FAIL_HARD; #ifdef CONFIG_IPV6_ROUTER_PREF m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2; #endif if ((strict & RT6_LOOKUP_F_REACHABLE) && !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) { int n = rt6_check_neigh(nh); if (n < 0) return n; } return m; } static bool find_match(struct fib6_nh *nh, u32 fib6_flags, int oif, int strict, int *mpri, bool *do_rr) { bool match_do_rr = false; bool rc = false; int m; if (nh->fib_nh_flags & RTNH_F_DEAD) goto out; if (ip6_ignore_linkdown(nh->fib_nh_dev) && nh->fib_nh_flags & RTNH_F_LINKDOWN && !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) goto out; m = rt6_score_route(nh, fib6_flags, oif, strict); if (m == RT6_NUD_FAIL_DO_RR) { match_do_rr = true; m = 0; /* lowest valid score */ } else if (m == RT6_NUD_FAIL_HARD) { goto out; } if (strict & RT6_LOOKUP_F_REACHABLE) rt6_probe(nh); /* note that m can be RT6_NUD_FAIL_PROBE at this point */ if (m > *mpri) { *do_rr = match_do_rr; *mpri = m; rc = true; } out: return rc; } struct fib6_nh_frl_arg { u32 flags; int oif; int strict; int *mpri; bool *do_rr; struct fib6_nh *nh; }; static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg) { struct fib6_nh_frl_arg *arg = _arg; arg->nh = nh; return find_match(nh, arg->flags, arg->oif, arg->strict, arg->mpri, arg->do_rr); } static void __find_rr_leaf(struct fib6_info *f6i_start, struct fib6_info *nomatch, u32 metric, struct fib6_result *res, struct fib6_info **cont, int oif, int strict, bool *do_rr, int *mpri) { struct fib6_info *f6i; for (f6i = f6i_start; f6i && f6i != nomatch; f6i = rcu_dereference(f6i->fib6_next)) { bool matched = false; struct fib6_nh *nh; if (cont && f6i->fib6_metric != metric) { *cont = f6i; return; } if (fib6_check_expired(f6i)) continue; if (unlikely(f6i->nh)) { struct fib6_nh_frl_arg arg = { .flags = f6i->fib6_flags, .oif = oif, .strict = strict, .mpri = mpri, .do_rr = do_rr }; if (nexthop_is_blackhole(f6i->nh)) { res->fib6_flags = RTF_REJECT; res->fib6_type = RTN_BLACKHOLE; res->f6i = f6i; res->nh = nexthop_fib6_nh(f6i->nh); return; } if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match, &arg)) { matched = true; nh = arg.nh; } } else { nh = f6i->fib6_nh; if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) matched = true; } if (matched) { res->f6i = f6i; res->nh = nh; res->fib6_flags = f6i->fib6_flags; res->fib6_type = f6i->fib6_type; } } } static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf, struct fib6_info *rr_head, int oif, int strict, bool *do_rr, struct fib6_result *res) { u32 metric = rr_head->fib6_metric; struct fib6_info *cont = NULL; int mpri = -1; __find_rr_leaf(rr_head, NULL, metric, res, &cont, oif, strict, do_rr, &mpri); __find_rr_leaf(leaf, rr_head, metric, res, &cont, oif, strict, do_rr, &mpri); if (res->f6i || !cont) return; __find_rr_leaf(cont, NULL, metric, res, NULL, oif, strict, do_rr, &mpri); } static void rt6_select(struct net *net, struct fib6_node *fn, int oif, struct fib6_result *res, int strict) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct fib6_info *rt0; bool do_rr = false; int key_plen; /* make sure this function or its helpers sets f6i */ res->f6i = NULL; if (!leaf || leaf == net->ipv6.fib6_null_entry) goto out; rt0 = rcu_dereference(fn->rr_ptr); if (!rt0) rt0 = leaf; /* 
Double check to make sure fn is not an intermediate node * and fn->leaf does not points to its child's leaf * (This might happen if all routes under fn are deleted from * the tree and fib6_repair_tree() is called on the node.) */ key_plen = rt0->fib6_dst.plen; #ifdef CONFIG_IPV6_SUBTREES if (rt0->fib6_src.plen) key_plen = rt0->fib6_src.plen; #endif if (fn->fn_bit != key_plen) goto out; find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res); if (do_rr) { struct fib6_info *next = rcu_dereference(rt0->fib6_next); /* no entries matched; do round-robin */ if (!next || next->fib6_metric != rt0->fib6_metric) next = leaf; if (next != rt0) { spin_lock_bh(&leaf->fib6_table->tb6_lock); /* make sure next is not being deleted from the tree */ if (next->fib6_node) rcu_assign_pointer(fn->rr_ptr, next); spin_unlock_bh(&leaf->fib6_table->tb6_lock); } } out: if (!res->f6i) { res->f6i = net->ipv6.fib6_null_entry; res->nh = res->f6i->fib6_nh; res->fib6_flags = res->f6i->fib6_flags; res->fib6_type = res->f6i->fib6_type; } } static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res) { return (res->f6i->fib6_flags & RTF_NONEXTHOP) || res->nh->fib_nh_gw_family; } #ifdef CONFIG_IPV6_ROUTE_INFO int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, const struct in6_addr *gwaddr) { struct net *net = dev_net(dev); struct route_info *rinfo = (struct route_info *) opt; struct in6_addr prefix_buf, *prefix; struct fib6_table *table; unsigned int pref; unsigned long lifetime; struct fib6_info *rt; if (len < sizeof(struct route_info)) { return -EINVAL; } /* Sanity check for prefix_len and length */ if (rinfo->length > 3) { return -EINVAL; } else if (rinfo->prefix_len > 128) { return -EINVAL; } else if (rinfo->prefix_len > 64) { if (rinfo->length < 2) { return -EINVAL; } } else if (rinfo->prefix_len > 0) { if (rinfo->length < 1) { return -EINVAL; } } pref = rinfo->route_pref; if (pref == ICMPV6_ROUTER_PREF_INVALID) return -EINVAL; lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ); if (rinfo->length == 3) prefix = (struct in6_addr *)rinfo->prefix; else { /* this function is safe */ ipv6_addr_prefix(&prefix_buf, (struct in6_addr *)rinfo->prefix, rinfo->prefix_len); prefix = &prefix_buf; } if (rinfo->prefix_len == 0) rt = rt6_get_dflt_router(net, gwaddr, dev); else rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev); if (rt && !lifetime) { ip6_del_rt(net, rt, false); rt = NULL; } if (!rt && lifetime) rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev, pref); else if (rt) rt->fib6_flags = RTF_ROUTEINFO | (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); if (rt) { table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); if (!addrconf_finite_timeout(lifetime)) { fib6_clean_expires(rt); fib6_may_remove_gc_list(net, rt); } else { fib6_set_expires(rt, jiffies + HZ * lifetime); fib6_add_gc_list(rt); } spin_unlock_bh(&table->tb6_lock); fib6_info_release(rt); } return 0; } #endif /* * Misc support functions */ /* called with rcu_lock held */ static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res) { struct net_device *dev = res->nh->fib_nh_dev; if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) { /* for copies of local routes, dst->dev needs to be the * device if it is a master device, the master device if * device is enslaved, and the loopback as the default */ if (netif_is_l3_slave(dev) && !rt6_need_strict(&res->f6i->fib6_dst.addr)) dev = l3mdev_master_dev_rcu(dev) ? 
: dev_net(dev)->loopback_dev; else if (!netif_is_l3_master(dev)) dev = dev_net(dev)->loopback_dev; /* last case is netif_is_l3_master(dev) is true in which * case we want dev returned to be dev */ } return dev; } static const int fib6_prop[RTN_MAX + 1] = { [RTN_UNSPEC] = 0, [RTN_UNICAST] = 0, [RTN_LOCAL] = 0, [RTN_BROADCAST] = 0, [RTN_ANYCAST] = 0, [RTN_MULTICAST] = 0, [RTN_BLACKHOLE] = -EINVAL, [RTN_UNREACHABLE] = -EHOSTUNREACH, [RTN_PROHIBIT] = -EACCES, [RTN_THROW] = -EAGAIN, [RTN_NAT] = -EINVAL, [RTN_XRESOLVE] = -EINVAL, }; static int ip6_rt_type_to_error(u8 fib6_type) { return fib6_prop[fib6_type]; } static unsigned short fib6_info_dst_flags(struct fib6_info *rt) { unsigned short flags = 0; if (rt->dst_nocount) flags |= DST_NOCOUNT; if (rt->dst_nopolicy) flags |= DST_NOPOLICY; return flags; } static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type) { rt->dst.error = ip6_rt_type_to_error(fib6_type); switch (fib6_type) { case RTN_BLACKHOLE: rt->dst.output = dst_discard_out; rt->dst.input = dst_discard; break; case RTN_PROHIBIT: rt->dst.output = ip6_pkt_prohibit_out; rt->dst.input = ip6_pkt_prohibit; break; case RTN_THROW: case RTN_UNREACHABLE: default: rt->dst.output = ip6_pkt_discard_out; rt->dst.input = ip6_pkt_discard; break; } } static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res) { struct fib6_info *f6i = res->f6i; if (res->fib6_flags & RTF_REJECT) { ip6_rt_init_dst_reject(rt, res->fib6_type); return; } rt->dst.error = 0; rt->dst.output = ip6_output; if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) { rt->dst.input = ip6_input; } else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { rt->dst.input = ip6_mc_input; rt->dst.output = ip6_mr_output; } else { rt->dst.input = ip6_forward; } if (res->nh->fib_nh_lws) { rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws); lwtunnel_set_redirect(&rt->dst); } rt->dst.lastuse = jiffies; } /* Caller must already hold reference to @from */ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) { rt->rt6i_flags &= ~RTF_EXPIRES; rcu_assign_pointer(rt->from, from); ip_dst_init_metrics(&rt->dst, from->fib6_metrics); } /* Caller must already hold reference to f6i in result */ static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res) { const struct fib6_nh *nh = res->nh; const struct net_device *dev = nh->fib_nh_dev; struct fib6_info *f6i = res->f6i; ip6_rt_init_dst(rt, res); rt->rt6i_dst = f6i->fib6_dst; rt->rt6i_idev = dev ? 
in6_dev_get(dev) : NULL; rt->rt6i_flags = res->fib6_flags; if (nh->fib_nh_gw_family) { rt->rt6i_gateway = nh->fib_nh_gw6; rt->rt6i_flags |= RTF_GATEWAY; } rt6_set_from(rt, f6i); #ifdef CONFIG_IPV6_SUBTREES rt->rt6i_src = f6i->fib6_src; #endif } static struct fib6_node* fib6_backtrack(struct fib6_node *fn, struct in6_addr *saddr) { struct fib6_node *pn, *sn; while (1) { if (fn->fn_flags & RTN_TL_ROOT) return NULL; pn = rcu_dereference(fn->parent); sn = FIB6_SUBTREE(pn); if (sn && sn != fn) fn = fib6_node_lookup(sn, NULL, saddr); else fn = pn; if (fn->fn_flags & RTN_RTINFO) return fn; } } static bool ip6_hold_safe(struct net *net, struct rt6_info **prt) { struct rt6_info *rt = *prt; if (dst_hold_safe(&rt->dst)) return true; if (net) { rt = net->ipv6.ip6_null_entry; dst_hold(&rt->dst); } else { rt = NULL; } *prt = rt; return false; } /* called with rcu_lock held */ static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res) { struct net_device *dev = res->nh->fib_nh_dev; struct fib6_info *f6i = res->f6i; unsigned short flags; struct rt6_info *nrt; if (!fib6_info_hold_safe(f6i)) goto fallback; flags = fib6_info_dst_flags(f6i); nrt = ip6_dst_alloc(dev_net(dev), dev, flags); if (!nrt) { fib6_info_release(f6i); goto fallback; } ip6_rt_copy_init(nrt, res); return nrt; fallback: nrt = dev_net(dev)->ipv6.ip6_null_entry; dst_hold(&nrt->dst); return nrt; } INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { struct fib6_result res = {}; struct fib6_node *fn; struct rt6_info *rt; rcu_read_lock(); fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: res.f6i = rcu_dereference(fn->leaf); if (!res.f6i) res.f6i = net->ipv6.fib6_null_entry; else rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif, flags); if (res.f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto restart; rt = net->ipv6.ip6_null_entry; dst_hold(&rt->dst); goto out; } else if (res.fib6_flags & RTF_REJECT) { goto do_create; } fib6_select_path(net, &res, fl6, fl6->flowi6_oif, fl6->flowi6_oif != 0, skb, flags); /* Search through exception table */ rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); if (rt) { if (ip6_hold_safe(net, &rt)) dst_use_noref(&rt->dst, jiffies); } else { do_create: rt = ip6_create_rt_rcu(&res); } out: trace_fib6_table_lookup(net, &res, table, fl6); rcu_read_unlock(); return rt; } struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup); } EXPORT_SYMBOL_GPL(ip6_route_lookup); struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr, int oif, const struct sk_buff *skb, int strict) { struct flowi6 fl6 = { .flowi6_oif = oif, .daddr = *daddr, }; struct dst_entry *dst; int flags = strict ? RT6_LOOKUP_F_IFACE : 0; if (saddr) { memcpy(&fl6.saddr, saddr, sizeof(*saddr)); flags |= RT6_LOOKUP_F_HAS_SADDR; } dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup); if (dst->error == 0) return dst_rt6_info(dst); dst_release(dst); return NULL; } EXPORT_SYMBOL(rt6_lookup); /* ip6_ins_rt is called with FREE table->tb6_lock. * It takes new route entry, the addition fails by any reason the * route is released. * Caller must hold dst before calling it. 
*/ static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { int err; struct fib6_table *table; table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); err = fib6_add(&table->tb6_root, rt, info, extack); spin_unlock_bh(&table->tb6_lock); return err; } int ip6_ins_rt(struct net *net, struct fib6_info *rt) { struct nl_info info = { .nl_net = net, }; return __ip6_ins_rt(rt, &info, NULL); } static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct fib6_info *f6i = res->f6i; struct net_device *dev; struct rt6_info *rt; /* * Clone the route. */ if (!fib6_info_hold_safe(f6i)) return NULL; dev = ip6_rt_get_dev_rcu(res); rt = ip6_dst_alloc(dev_net(dev), dev, 0); if (!rt) { fib6_info_release(f6i); return NULL; } ip6_rt_copy_init(rt, res); rt->rt6i_flags |= RTF_CACHE; rt->rt6i_dst.addr = *daddr; rt->rt6i_dst.plen = 128; if (!rt6_is_gw_or_nonexthop(res)) { if (f6i->fib6_dst.plen != 128 && ipv6_addr_equal(&f6i->fib6_dst.addr, daddr)) rt->rt6i_flags |= RTF_ANYCAST; #ifdef CONFIG_IPV6_SUBTREES if (rt->rt6i_src.plen && saddr) { rt->rt6i_src.addr = *saddr; rt->rt6i_src.plen = 128; } #endif } return rt; } static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res) { struct fib6_info *f6i = res->f6i; unsigned short flags = fib6_info_dst_flags(f6i); struct net_device *dev; struct rt6_info *pcpu_rt; if (!fib6_info_hold_safe(f6i)) return NULL; rcu_read_lock(); dev = ip6_rt_get_dev_rcu(res); pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags | DST_NOCOUNT); rcu_read_unlock(); if (!pcpu_rt) { fib6_info_release(f6i); return NULL; } ip6_rt_copy_init(pcpu_rt, res); pcpu_rt->rt6i_flags |= RTF_PCPU; if (f6i->nh) pcpu_rt->sernum = rt_genid_ipv6(dev_net(dev)); return pcpu_rt; } static bool rt6_is_valid(const struct rt6_info *rt6) { return rt6->sernum == rt_genid_ipv6(dev_net(rt6->dst.dev)); } /* It should be called with rcu_read_lock() acquired */ static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res) { struct rt6_info *pcpu_rt; pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu); if (pcpu_rt && pcpu_rt->sernum && !rt6_is_valid(pcpu_rt)) { struct rt6_info *prev, **p; p = this_cpu_ptr(res->nh->rt6i_pcpu); /* Paired with READ_ONCE() in __fib6_drop_pcpu_from() */ prev = xchg(p, NULL); if (prev) { dst_dev_put(&prev->dst); dst_release(&prev->dst); } pcpu_rt = NULL; } return pcpu_rt; } static struct rt6_info *rt6_make_pcpu_route(struct net *net, const struct fib6_result *res) { struct rt6_info *pcpu_rt, *prev, **p; pcpu_rt = ip6_rt_pcpu_alloc(res); if (!pcpu_rt) return NULL; p = this_cpu_ptr(res->nh->rt6i_pcpu); prev = cmpxchg(p, NULL, pcpu_rt); if (unlikely(prev)) { /* * Another task on this CPU already installed a pcpu_rt. * This can happen on PREEMPT_RT where preemption is possible. * Free our allocation and return the existing one. 
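 * (The race is detected by the cmpxchg() above: only the loser sees a
 * non-NULL previous pointer, so only the loser puts and releases the
 * dst it just allocated.)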
*/ WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RT)); dst_dev_put(&pcpu_rt->dst); dst_release(&pcpu_rt->dst); return prev; } if (res->f6i->fib6_destroying) { struct fib6_info *from; from = unrcu_pointer(xchg(&pcpu_rt->from, NULL)); fib6_info_release(from); } return pcpu_rt; } /* exception hash table implementation */ static DEFINE_SPINLOCK(rt6_exception_lock); /* Remove rt6_ex from hash table and free the memory * Caller must hold rt6_exception_lock */ static void rt6_remove_exception(struct rt6_exception_bucket *bucket, struct rt6_exception *rt6_ex) { struct net *net; if (!bucket || !rt6_ex) return; net = dev_net(rt6_ex->rt6i->dst.dev); net->ipv6.rt6_stats->fib_rt_cache--; /* purge completely the exception to allow releasing the held resources: * some [sk] cache may keep the dst around for unlimited time */ dst_dev_put(&rt6_ex->rt6i->dst); hlist_del_rcu(&rt6_ex->hlist); dst_release(&rt6_ex->rt6i->dst); kfree_rcu(rt6_ex, rcu); WARN_ON_ONCE(!bucket->depth); bucket->depth--; } /* Remove oldest rt6_ex in bucket and free the memory * Caller must hold rt6_exception_lock */ static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket) { struct rt6_exception *rt6_ex, *oldest = NULL; if (!bucket) return; hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { if (!oldest || time_before(rt6_ex->stamp, oldest->stamp)) oldest = rt6_ex; } rt6_remove_exception(bucket, oldest); } static u32 rt6_exception_hash(const struct in6_addr *dst, const struct in6_addr *src) { static siphash_aligned_key_t rt6_exception_key; struct { struct in6_addr dst; struct in6_addr src; } __aligned(SIPHASH_ALIGNMENT) combined = { .dst = *dst, }; u64 val; net_get_random_once(&rt6_exception_key, sizeof(rt6_exception_key)); #ifdef CONFIG_IPV6_SUBTREES if (src) combined.src = *src; #endif val = siphash(&combined, sizeof(combined), &rt6_exception_key); return hash_64(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); } /* Helper function to find the cached rt in the hash table * and update bucket pointer to point to the bucket for this * (daddr, saddr) pair * Caller must hold rt6_exception_lock */ static struct rt6_exception * __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct rt6_exception *rt6_ex; u32 hval; if (!(*bucket) || !daddr) return NULL; hval = rt6_exception_hash(daddr, saddr); *bucket += hval; hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) { struct rt6_info *rt6 = rt6_ex->rt6i; bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); #ifdef CONFIG_IPV6_SUBTREES if (matched && saddr) matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); #endif if (matched) return rt6_ex; } return NULL; } /* Helper function to find the cached rt in the hash table * and update bucket pointer to point to the bucket for this * (daddr, saddr) pair * Caller must hold rcu_read_lock() */ static struct rt6_exception * __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct rt6_exception *rt6_ex; u32 hval; WARN_ON_ONCE(!rcu_read_lock_held()); if (!(*bucket) || !daddr) return NULL; hval = rt6_exception_hash(daddr, saddr); *bucket += hval; hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) { struct rt6_info *rt6 = rt6_ex->rt6i; bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); #ifdef CONFIG_IPV6_SUBTREES if (matched && saddr) matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); #endif if (matched) return rt6_ex; } return NULL; } static unsigned int fib6_mtu(const 
struct fib6_result *res) { const struct fib6_nh *nh = res->nh; unsigned int mtu; if (res->f6i->fib6_pmtu) { mtu = res->f6i->fib6_pmtu; } else { struct net_device *dev = nh->fib_nh_dev; struct inet6_dev *idev; rcu_read_lock(); idev = __in6_dev_get(dev); mtu = READ_ONCE(idev->cnf.mtu6); rcu_read_unlock(); } mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); } #define FIB6_EXCEPTION_BUCKET_FLUSHED 0x1UL /* used when the flushed bit is not relevant, only access to the bucket * (ie., all bucket users except rt6_insert_exception); * * called under rcu lock; sometimes called with rt6_exception_lock held */ static struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh, spinlock_t *lock) { struct rt6_exception_bucket *bucket; if (lock) bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, lockdep_is_held(lock)); else bucket = rcu_dereference(nh->rt6i_exception_bucket); /* remove bucket flushed bit if set */ if (bucket) { unsigned long p = (unsigned long)bucket; p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED; bucket = (struct rt6_exception_bucket *)p; } return bucket; } static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket) { unsigned long p = (unsigned long)bucket; return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED); } /* called with rt6_exception_lock held */ static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh, spinlock_t *lock) { struct rt6_exception_bucket *bucket; unsigned long p; bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, lockdep_is_held(lock)); p = (unsigned long)bucket; p |= FIB6_EXCEPTION_BUCKET_FLUSHED; bucket = (struct rt6_exception_bucket *)p; rcu_assign_pointer(nh->rt6i_exception_bucket, bucket); } static int rt6_insert_exception(struct rt6_info *nrt, const struct fib6_result *res) { struct net *net = dev_net(nrt->dst.dev); struct rt6_exception_bucket *bucket; struct fib6_info *f6i = res->f6i; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; struct fib6_nh *nh = res->nh; int max_depth; int err = 0; spin_lock_bh(&rt6_exception_lock); bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, lockdep_is_held(&rt6_exception_lock)); if (!bucket) { bucket = kzalloc_objs(*bucket, FIB6_EXCEPTION_BUCKET_SIZE, GFP_ATOMIC); if (!bucket) { err = -ENOMEM; goto out; } rcu_assign_pointer(nh->rt6i_exception_bucket, bucket); } else if (fib6_nh_excptn_bucket_flushed(bucket)) { err = -EINVAL; goto out; } #ifdef CONFIG_IPV6_SUBTREES /* fib6_src.plen != 0 indicates f6i is in subtree * and exception table is indexed by a hash of * both fib6_dst and fib6_src. * Otherwise, the exception table is indexed by * a hash of only fib6_dst. */ if (f6i->fib6_src.plen) src_key = &nrt->rt6i_src.addr; #endif /* rt6_mtu_change() might lower mtu on f6i. * Only insert this exception route if its mtu * is less than f6i's mtu value. */ if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) { err = -EINVAL; goto out; } rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr, src_key); if (rt6_ex) rt6_remove_exception(bucket, rt6_ex); rt6_ex = kzalloc_obj(*rt6_ex, GFP_ATOMIC); if (!rt6_ex) { err = -ENOMEM; goto out; } rt6_ex->rt6i = nrt; rt6_ex->stamp = jiffies; hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain); bucket->depth++; net->ipv6.rt6_stats->fib_rt_cache++; /* Randomize max depth to avoid some side channels attacks. 
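 * (A fixed eviction threshold would let a remote sender who can force
 * exception inserts learn when a bucket is full; the random slack below
 * presumably makes that boundary much harder to probe.)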
*/ max_depth = FIB6_MAX_DEPTH + get_random_u32_below(FIB6_MAX_DEPTH); while (bucket->depth > max_depth) rt6_exception_remove_oldest(bucket); out: spin_unlock_bh(&rt6_exception_lock); /* Update fn->fn_sernum to invalidate all cached dst */ if (!err) { spin_lock_bh(&f6i->fib6_table->tb6_lock); fib6_update_sernum(net, f6i); fib6_add_gc_list(f6i); spin_unlock_bh(&f6i->fib6_table->tb6_lock); fib6_force_start_gc(net); } return err; } static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct hlist_node *tmp; int i; spin_lock_bh(&rt6_exception_lock); bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (!bucket) goto out; /* Prevent rt6_insert_exception() to recreate the bucket list */ if (!from) fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock); for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) { if (!from || rcu_access_pointer(rt6_ex->rt6i->from) == from) rt6_remove_exception(bucket, rt6_ex); } WARN_ON_ONCE(!from && bucket->depth); bucket++; } out: spin_unlock_bh(&rt6_exception_lock); } static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg) { struct fib6_info *f6i = arg; fib6_nh_flush_exceptions(nh, f6i); return 0; } void rt6_flush_exceptions(struct fib6_info *f6i) { if (f6i->nh) { rcu_read_lock(); nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, f6i); rcu_read_unlock(); } else { fib6_nh_flush_exceptions(f6i->fib6_nh, f6i); } } /* Find cached rt in the hash table inside passed in rt * Caller has to hold rcu_read_lock() */ static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, const struct in6_addr *daddr, const struct in6_addr *saddr) { const struct in6_addr *src_key = NULL; struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct rt6_info *ret = NULL; #ifdef CONFIG_IPV6_SUBTREES /* fib6i_src.plen != 0 indicates f6i is in subtree * and exception table is indexed by a hash of * both fib6_dst and fib6_src. * However, the src addr used to create the hash * might not be exactly the passed in saddr which * is a /128 addr from the flow. * So we need to use f6i->fib6_src to redo lookup * if the passed in saddr does not find anything. * (See the logic in ip6_rt_cache_alloc() on how * rt->rt6i_src is updated.) */ if (res->f6i->fib6_src.plen) src_key = saddr; find_ex: #endif bucket = fib6_nh_get_excptn_bucket(res->nh, NULL); rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) ret = rt6_ex->rt6i; #ifdef CONFIG_IPV6_SUBTREES /* Use fib6_src as src_key and redo lookup */ if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) { src_key = &res->f6i->fib6_src.addr; goto find_ex; } #endif return ret; } /* Remove the passed in cached rt from the hash table that contains it */ static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen, const struct rt6_info *rt) { const struct in6_addr *src_key = NULL; struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; int err; if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return -ENOENT; spin_lock_bh(&rt6_exception_lock); bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); #ifdef CONFIG_IPV6_SUBTREES /* rt6i_src.plen != 0 indicates 'from' is in subtree * and exception table is indexed by a hash of * both rt6i_dst and rt6i_src. * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. 
*/ if (plen) src_key = &rt->rt6i_src.addr; #endif rt6_ex = __rt6_find_exception_spinlock(&bucket, &rt->rt6i_dst.addr, src_key); if (rt6_ex) { rt6_remove_exception(bucket, rt6_ex); err = 0; } else { err = -ENOENT; } spin_unlock_bh(&rt6_exception_lock); return err; } struct fib6_nh_excptn_arg { struct rt6_info *rt; int plen; }; static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg) { struct fib6_nh_excptn_arg *arg = _arg; int err; err = fib6_nh_remove_exception(nh, arg->plen, arg->rt); if (err == 0) return 1; return 0; } static int rt6_remove_exception_rt(struct rt6_info *rt) { struct fib6_info *from; from = rcu_dereference(rt->from); if (!from || !(rt->rt6i_flags & RTF_CACHE)) return -EINVAL; if (from->nh) { struct fib6_nh_excptn_arg arg = { .rt = rt, .plen = from->fib6_src.plen }; int rc; /* rc = 1 means an entry was found */ rc = nexthop_for_each_fib6_nh(from->nh, rt6_nh_remove_exception_rt, &arg); return rc ? 0 : -ENOENT; } return fib6_nh_remove_exception(from->fib6_nh, from->fib6_src.plen, rt); } /* Find rt6_ex which contains the passed in rt cache and * refresh its stamp */ static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen, const struct rt6_info *rt) { const struct in6_addr *src_key = NULL; struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; bucket = fib6_nh_get_excptn_bucket(nh, NULL); #ifdef CONFIG_IPV6_SUBTREES /* rt6i_src.plen != 0 indicates 'from' is in subtree * and exception table is indexed by a hash of * both rt6i_dst and rt6i_src. * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ if (plen) src_key = &rt->rt6i_src.addr; #endif rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key); if (rt6_ex) rt6_ex->stamp = jiffies; } struct fib6_nh_match_arg { const struct net_device *dev; const struct in6_addr *gw; struct fib6_nh *match; }; /* determine if fib6_nh has given device and gateway */ static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg) { struct fib6_nh_match_arg *arg = _arg; if (arg->dev != nh->fib_nh_dev || (arg->gw && !nh->fib_nh_gw_family) || (!arg->gw && nh->fib_nh_gw_family) || (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6))) return 0; arg->match = nh; /* found a match, break the loop */ return 1; } static void rt6_update_exception_stamp_rt(struct rt6_info *rt) { struct fib6_info *from; struct fib6_nh *fib6_nh; rcu_read_lock(); from = rcu_dereference(rt->from); if (!from || !(rt->rt6i_flags & RTF_CACHE)) goto unlock; if (from->nh) { struct fib6_nh_match_arg arg = { .dev = rt->dst.dev, .gw = &rt->rt6i_gateway, }; nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg); if (!arg.match) goto unlock; fib6_nh = arg.match; } else { fib6_nh = from->fib6_nh; } fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt); unlock: rcu_read_unlock(); } static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, struct rt6_info *rt, int mtu) { u32 dmtu = dst6_mtu(&rt->dst); /* If the new MTU is lower than the route PMTU, this new MTU will be the * lowest MTU in the path: always allow updating the route PMTU to * reflect PMTU decreases. * * If the new MTU is higher, and the route PMTU is equal to the local * MTU, this means the old MTU is the lowest in the path, so allow * updating it: if other nodes now have lower MTUs, PMTU discovery will * handle this. 
*/ if (dmtu >= mtu) return true; if (dmtu == idev->cnf.mtu6) return true; return false; } static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, const struct fib6_nh *nh, int mtu) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; int i; bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (!bucket) return; for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { struct rt6_info *entry = rt6_ex->rt6i; /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected * route), the metrics of its rt->from have already * been updated. */ if (dst_metric_raw(&entry->dst, RTAX_MTU) && rt6_mtu_change_route_allowed(idev, entry, mtu)) dst_metric_set(&entry->dst, RTAX_MTU, mtu); } bucket++; } } #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh, const struct in6_addr *gateway) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct hlist_node *tmp; int i; if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return; spin_lock_bh(&rt6_exception_lock); bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (bucket) { for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) { struct rt6_info *entry = rt6_ex->rt6i; if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY && ipv6_addr_equal(gateway, &entry->rt6i_gateway)) { rt6_remove_exception(bucket, rt6_ex); } } bucket++; } } spin_unlock_bh(&rt6_exception_lock); } static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, struct rt6_exception *rt6_ex, struct fib6_gc_args *gc_args, unsigned long now) { struct rt6_info *rt = rt6_ex->rt6i; /* we are pruning and obsoleting aged-out and non gateway exceptions * even if others have still references to them, so that on next * dst_check() such references can be dropped. * EXPIRES exceptions - e.g. 
pmtu-generated ones are pruned when * expired, independently from their aging, as per RFC 8201 section 4 */ if (!(rt->rt6i_flags & RTF_EXPIRES)) { if (time_after_eq(now, READ_ONCE(rt->dst.lastuse) + gc_args->timeout)) { pr_debug("aging clone %p\n", rt); rt6_remove_exception(bucket, rt6_ex); return; } } else if (time_after(jiffies, READ_ONCE(rt->dst.expires))) { pr_debug("purging expired route %p\n", rt); rt6_remove_exception(bucket, rt6_ex); return; } if (rt->rt6i_flags & RTF_GATEWAY) { struct neighbour *neigh; neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); if (!(neigh && (neigh->flags & NTF_ROUTER))) { pr_debug("purging route %p via non-router but gateway\n", rt); rt6_remove_exception(bucket, rt6_ex); return; } } gc_args->more++; } static void fib6_nh_age_exceptions(const struct fib6_nh *nh, struct fib6_gc_args *gc_args, unsigned long now) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct hlist_node *tmp; int i; if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return; rcu_read_lock_bh(); spin_lock(&rt6_exception_lock); bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (bucket) { for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) { rt6_age_examine_exception(bucket, rt6_ex, gc_args, now); } bucket++; } } spin_unlock(&rt6_exception_lock); rcu_read_unlock_bh(); } struct fib6_nh_age_excptn_arg { struct fib6_gc_args *gc_args; unsigned long now; }; static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg) { struct fib6_nh_age_excptn_arg *arg = _arg; fib6_nh_age_exceptions(nh, arg->gc_args, arg->now); return 0; } void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args, unsigned long now) { if (f6i->nh) { struct fib6_nh_age_excptn_arg arg = { .gc_args = gc_args, .now = now }; nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions, &arg); } else { fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now); } } /* must be called with rcu lock held */ int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, struct fib6_result *res, int strict) { struct fib6_node *fn, *saved_fn; fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); saved_fn = fn; redo_rt6_select: rt6_select(net, fn, oif, res, strict); if (res->f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto redo_rt6_select; else if (strict & RT6_LOOKUP_F_REACHABLE) { /* also consider unreachable route */ strict &= ~RT6_LOOKUP_F_REACHABLE; fn = saved_fn; goto redo_rt6_select; } } trace_fib6_table_lookup(net, res, table, fl6); return 0; } struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { struct fib6_result res = {}; struct rt6_info *rt = NULL; int strict = 0; WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) && !rcu_read_lock_held()); strict |= flags & RT6_LOOKUP_F_IFACE; strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0) strict |= RT6_LOOKUP_F_REACHABLE; rcu_read_lock(); fib6_table_lookup(net, table, oif, fl6, &res, strict); if (res.f6i == net->ipv6.fib6_null_entry) goto out; fib6_select_path(net, &res, fl6, oif, false, skb, strict); /*Search through exception table */ rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); if (rt) { goto out; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && !res.nh->fib_nh_gw_family)) { /* Create a RTF_CACHE clone which will not be 
* owned by the fib6 tree. It is for the special case where * the daddr in the skb during the neighbor look-up is different * from the fl6->daddr used to look-up route here. */ rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL); if (rt) { /* 1 refcnt is taken during ip6_rt_cache_alloc(). * As rt6_uncached_list_add() does not consume refcnt, * this refcnt is always returned to the caller even * if caller sets RT6_LOOKUP_F_DST_NOREF flag. */ rt6_uncached_list_add(rt); rcu_read_unlock(); return rt; } } else { /* Get a percpu copy */ local_bh_disable(); rt = rt6_get_pcpu_route(&res); if (!rt) rt = rt6_make_pcpu_route(net, &res); local_bh_enable(); } out: if (!rt) rt = net->ipv6.ip6_null_entry; if (!(flags & RT6_LOOKUP_F_DST_NOREF)) ip6_hold_safe(net, &rt); rcu_read_unlock(); return rt; } EXPORT_SYMBOL_GPL(ip6_pol_route); INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags); } struct dst_entry *ip6_route_input_lookup(struct net *net, struct net_device *dev, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) flags |= RT6_LOOKUP_F_IFACE; return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input); } EXPORT_SYMBOL_GPL(ip6_route_input_lookup); static void ip6_multipath_l3_keys(const struct sk_buff *skb, struct flow_keys *keys, struct flow_keys *flkeys) { const struct ipv6hdr *outer_iph = ipv6_hdr(skb); const struct ipv6hdr *key_iph = outer_iph; struct flow_keys *_flkeys = flkeys; const struct ipv6hdr *inner_iph; const struct icmp6hdr *icmph; struct ipv6hdr _inner_iph; struct icmp6hdr _icmph; if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6)) goto out; icmph = skb_header_pointer(skb, skb_transport_offset(skb), sizeof(_icmph), &_icmph); if (!icmph) goto out; if (!icmpv6_is_err(icmph->icmp6_type)) goto out; inner_iph = skb_header_pointer(skb, skb_transport_offset(skb) + sizeof(*icmph), sizeof(_inner_iph), &_inner_iph); if (!inner_iph) goto out; key_iph = inner_iph; _flkeys = NULL; out: if (_flkeys) { keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src; keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst; keys->tags.flow_label = _flkeys->tags.flow_label; keys->basic.ip_proto = _flkeys->basic.ip_proto; } else { keys->addrs.v6addrs.src = key_iph->saddr; keys->addrs.v6addrs.dst = key_iph->daddr; keys->tags.flow_label = ip6_flowlabel(key_iph); keys->basic.ip_proto = key_iph->nexthdr; } } static u32 rt6_multipath_custom_hash_outer(const struct net *net, const struct sk_buff *skb, bool *p_has_inner) { u32 hash_fields = ip6_multipath_hash_fields(net); struct flow_keys keys, hash_keys; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) hash_keys.basic.ip_proto = keys.basic.ip_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) hash_keys.tags.flow_label = keys.tags.flow_label; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) hash_keys.ports.src = keys.ports.src; if (hash_fields & 
FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = keys.ports.dst; *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION); return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 rt6_multipath_custom_hash_inner(const struct net *net, const struct sk_buff *skb, bool has_inner) { u32 hash_fields = ip6_multipath_hash_fields(net); struct flow_keys keys, hash_keys; /* We assume the packet carries an encapsulation, but if none was * encountered during dissection of the outer flow, then there is no * point in calling the flow dissector again. */ if (!has_inner) return 0; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); skb_flow_dissect_flow_keys(skb, &keys, 0); if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION)) return 0; if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL) hash_keys.tags.flow_label = keys.tags.flow_label; } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO) hash_keys.basic.ip_proto = keys.basic.ip_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT) hash_keys.ports.src = keys.ports.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) hash_keys.ports.dst = keys.ports.dst; return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 rt6_multipath_custom_hash_skb(const struct net *net, const struct sk_buff *skb) { u32 mhash, mhash_inner; bool has_inner = true; mhash = rt6_multipath_custom_hash_outer(net, skb, &has_inner); mhash_inner = rt6_multipath_custom_hash_inner(net, skb, has_inner); return jhash_2words(mhash, mhash_inner, 0); } static u32 rt6_multipath_custom_hash_fl6(const struct net *net, const struct flowi6 *fl6) { u32 hash_fields = ip6_multipath_hash_fields(net); struct flow_keys hash_keys; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) hash_keys.addrs.v6addrs.src = fl6->saddr; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) hash_keys.addrs.v6addrs.dst = fl6->daddr; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) hash_keys.basic.ip_proto = fl6->flowi6_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) { if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT) hash_keys.ports.src = (__force __be16)get_random_u16(); else hash_keys.ports.src = fl6->fl6_sport; } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = fl6->fl6_dport; return fib_multipath_hash_from_keys(net, &hash_keys); } /* if skb is set it will be used and fl6 can be NULL */ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, const struct sk_buff *skb, struct 
flow_keys *flkeys) { struct flow_keys hash_keys; u32 mhash = 0; switch (ip6_multipath_hash_policy(net)) { case 0: memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (skb) { ip6_multipath_l3_keys(skb, &hash_keys, flkeys); } else { hash_keys.addrs.v6addrs.src = fl6->saddr; hash_keys.addrs.v6addrs.dst = fl6->daddr; hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); hash_keys.basic.ip_proto = fl6->flowi6_proto; } mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 1: if (skb) { unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; struct flow_keys keys; /* short-circuit if we already have L4 hash present */ if (skb->l4_hash) return skb_get_hash_raw(skb) >> 1; memset(&hash_keys, 0, sizeof(hash_keys)); if (!flkeys) { skb_flow_dissect_flow_keys(skb, &keys, flag); flkeys = &keys; } hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; hash_keys.ports.src = flkeys->ports.src; hash_keys.ports.dst = flkeys->ports.dst; hash_keys.basic.ip_proto = flkeys->basic.ip_proto; } else { memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = fl6->saddr; hash_keys.addrs.v6addrs.dst = fl6->daddr; if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT) hash_keys.ports.src = (__force __be16)get_random_u16(); else hash_keys.ports.src = fl6->fl6_sport; hash_keys.ports.dst = fl6->fl6_dport; hash_keys.basic.ip_proto = fl6->flowi6_proto; } mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 2: memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (skb) { struct flow_keys keys; if (!flkeys) { skb_flow_dissect_flow_keys(skb, &keys, 0); flkeys = &keys; } /* Inner can be v4 or v6 */ if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src; hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst; } else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; hash_keys.tags.flow_label = flkeys->tags.flow_label; hash_keys.basic.ip_proto = flkeys->basic.ip_proto; } else { /* Same as case 0 */ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; ip6_multipath_l3_keys(skb, &hash_keys, flkeys); } } else { /* Same as case 0 */ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = fl6->saddr; hash_keys.addrs.v6addrs.dst = fl6->daddr; hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); hash_keys.basic.ip_proto = fl6->flowi6_proto; } mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 3: if (skb) mhash = rt6_multipath_custom_hash_skb(net, skb); else mhash = rt6_multipath_custom_hash_fl6(net, fl6); break; } return mhash >> 1; } /* Called with rcu held */ void ip6_route_input(struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct net *net = dev_net(skb->dev); int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF; struct ip_tunnel_info *tun_info; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), .flowi6_mark = skb->mark, 
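/* Flow key for the input route lookup, built from the received IPv6
 * header. mp_hash is filled in further below for ICMPv6 packets, where
 * ip6_multipath_l3_keys() hashes errors on the embedded packet's header
 * so they follow the same multipath leg as the original flow.
 */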
.flowi6_proto = iph->nexthdr, }; struct flow_keys *flkeys = NULL, _flkeys; tun_info = skb_tunnel_info(skb); if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys)) flkeys = &_flkeys; if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); skb_dst_drop(skb); skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); } EXPORT_SYMBOL_GPL(ip6_route_input); INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); } static struct dst_entry *ip6_route_output_flags_noref(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags) { bool any_src; if (ipv6_addr_type(&fl6->daddr) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) { struct dst_entry *dst; /* This function does not take refcnt on the dst */ dst = l3mdev_link_scope_lookup(net, fl6); if (dst) return dst; } fl6->flowi6_iif = LOOPBACK_IFINDEX; flags |= RT6_LOOKUP_F_DST_NOREF; any_src = ipv6_addr_any(&fl6->saddr); if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || (fl6->flowi6_oif && any_src)) flags |= RT6_LOOKUP_F_IFACE; if (!any_src) flags |= RT6_LOOKUP_F_HAS_SADDR; else if (sk) flags |= rt6_srcprefs2flags(READ_ONCE(inet6_sk(sk)->srcprefs)); return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); } struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags) { struct dst_entry *dst; struct rt6_info *rt6; rcu_read_lock(); dst = ip6_route_output_flags_noref(net, sk, fl6, flags); rt6 = dst_rt6_info(dst); /* For dst cached in uncached_list, refcnt is already taken. */ if (list_empty(&rt6->dst.rt_uncached) && !dst_hold_safe(dst)) { dst = &net->ipv6.ip6_null_entry->dst; dst_hold(dst); } rcu_read_unlock(); return dst; } EXPORT_SYMBOL_GPL(ip6_route_output_flags); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) { struct rt6_info *rt, *ort = dst_rt6_info(dst_orig); struct net_device *loopback_dev = net->loopback_dev; struct dst_entry *new = NULL; rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, DST_OBSOLETE_DEAD, 0); if (rt) { rt6_info_init(rt); atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); new = &rt->dst; new->__use = 1; new->input = dst_discard; new->output = dst_discard_out; dst_copy_metrics(new, &ort->dst); rt->rt6i_idev = in6_dev_get(loopback_dev); rt->rt6i_gateway = ort->rt6i_gateway; rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); #ifdef CONFIG_IPV6_SUBTREES memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); #endif } dst_release(dst_orig); return new ? 
new : ERR_PTR(-ENOMEM); } /* * Destination cache support functions */ static bool fib6_check(struct fib6_info *f6i, u32 cookie) { u32 rt_cookie = 0; if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) return false; if (fib6_check_expired(f6i)) return false; return true; } static struct dst_entry *rt6_check(struct rt6_info *rt, struct fib6_info *from, u32 cookie) { u32 rt_cookie = 0; if (!from || !fib6_get_cookie_safe(from, &rt_cookie) || rt_cookie != cookie) return NULL; if (rt6_check_expired(rt)) return NULL; return &rt->dst; } static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, struct fib6_info *from, u32 cookie) { if (!__rt6_check_expired(rt) && READ_ONCE(rt->dst.obsolete) == DST_OBSOLETE_FORCE_CHK && fib6_check(from, cookie)) return &rt->dst; return NULL; } INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) { struct dst_entry *dst_ret; struct fib6_info *from; struct rt6_info *rt; rt = dst_rt6_info(dst); if (rt->sernum) return rt6_is_valid(rt) ? dst : NULL; rcu_read_lock(); /* All IPV6 dsts are created with ->obsolete set to the value * DST_OBSOLETE_FORCE_CHK which forces validation calls down * into this function always. */ from = rcu_dereference(rt->from); if (from && (rt->rt6i_flags & RTF_PCPU || unlikely(!list_empty(&rt->dst.rt_uncached)))) dst_ret = rt6_dst_from_check(rt, from, cookie); else dst_ret = rt6_check(rt, from, cookie); rcu_read_unlock(); return dst_ret; } EXPORT_INDIRECT_CALLABLE(ip6_dst_check); static void ip6_negative_advice(struct sock *sk, struct dst_entry *dst) { struct rt6_info *rt = dst_rt6_info(dst); if (rt->rt6i_flags & RTF_CACHE) { rcu_read_lock(); if (rt6_check_expired(rt)) { /* rt/dst can not be destroyed yet, * because of rcu_read_lock() */ sk_dst_reset(sk); rt6_remove_exception_rt(rt); } rcu_read_unlock(); return; } sk_dst_reset(sk); } static void ip6_link_failure(struct sk_buff *skb) { struct rt6_info *rt; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); rt = dst_rt6_info(skb_dst(skb)); if (rt) { rcu_read_lock(); if (rt->rt6i_flags & RTF_CACHE) { rt6_remove_exception_rt(rt); } else { struct fib6_info *from; struct fib6_node *fn; from = rcu_dereference(rt->from); if (from) { fn = rcu_dereference(from->fib6_node); if (fn && (rt->rt6i_flags & RTF_DEFAULT)) WRITE_ONCE(fn->fn_sernum, -1); } } rcu_read_unlock(); } } static void rt6_update_expires(struct rt6_info *rt0, int timeout) { if (!(rt0->rt6i_flags & RTF_EXPIRES)) { struct fib6_info *from; rcu_read_lock(); from = rcu_dereference(rt0->from); if (from) WRITE_ONCE(rt0->dst.expires, from->expires); rcu_read_unlock(); } dst_set_expires(&rt0->dst, timeout); rt0->rt6i_flags |= RTF_EXPIRES; } static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) { struct net *net = dev_net(rt->dst.dev); dst_metric_set(&rt->dst, RTAX_MTU, mtu); rt->rt6i_flags |= RTF_MODIFIED; rt6_update_expires(rt, READ_ONCE(net->ipv6.sysctl.ip6_rt_mtu_expires)); } static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) { return !(rt->rt6i_flags & RTF_CACHE) && (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from)); } static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, const struct ipv6hdr *iph, u32 mtu, bool confirm_neigh) { const struct in6_addr *daddr, *saddr; struct rt6_info *rt6 = dst_rt6_info(dst); /* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU) * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it. 
* [see also comment in rt6_mtu_change_route()] */ if (iph) { daddr = &iph->daddr; saddr = &iph->saddr; } else if (sk) { daddr = &sk->sk_v6_daddr; saddr = &inet6_sk(sk)->saddr; } else { daddr = NULL; saddr = NULL; } if (confirm_neigh) dst_confirm_neigh(dst, daddr); if (mtu < IPV6_MIN_MTU) return; if (mtu >= dst6_mtu(dst)) return; if (!rt6_cache_allowed_for_pmtu(rt6)) { rt6_do_update_pmtu(rt6, mtu); /* update rt6_ex->stamp for cache */ if (rt6->rt6i_flags & RTF_CACHE) rt6_update_exception_stamp_rt(rt6); } else if (daddr) { struct fib6_result res = {}; struct rt6_info *nrt6; rcu_read_lock(); res.f6i = rcu_dereference(rt6->from); if (!res.f6i) goto out_unlock; res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; if (res.f6i->nh) { struct fib6_nh_match_arg arg = { .dev = dst_dev_rcu(dst), .gw = &rt6->rt6i_gateway, }; nexthop_for_each_fib6_nh(res.f6i->nh, fib6_nh_find_match, &arg); /* fib6_info uses a nexthop that does not have fib6_nh * using the dst->dev + gw. Should be impossible. */ if (!arg.match) goto out_unlock; res.nh = arg.match; } else { res.nh = res.f6i->fib6_nh; } nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); if (rt6_insert_exception(nrt6, &res)) dst_release_immediate(&nrt6->dst); } out_unlock: rcu_read_unlock(); } } static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu, confirm_neigh); } void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif, u32 mark, kuid_t uid) { const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; struct dst_entry *dst; struct flowi6 fl6 = { .flowi6_oif = oif, .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark), .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), .flowi6_uid = uid, }; dst = ip6_route_output(net, NULL, &fl6); if (!dst->error) __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true); dst_release(dst); } EXPORT_SYMBOL_GPL(ip6_update_pmtu); void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) { int oif = sk->sk_bound_dev_if; struct dst_entry *dst; if (!oif && skb->dev) oif = l3mdev_master_ifindex(skb->dev); ip6_update_pmtu(skb, sock_net(sk), mtu, oif, READ_ONCE(sk->sk_mark), sk_uid(sk)); dst = __sk_dst_get(sk); if (!dst || !READ_ONCE(dst->obsolete) || dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) return; bh_lock_sock(sk); if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) ip6_datagram_dst_update(sk, false); bh_unlock_sock(sk); } EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, const struct flowi6 *fl6) { #ifdef CONFIG_IPV6_SUBTREES struct ipv6_pinfo *np = inet6_sk(sk); #endif ip6_dst_store(sk, dst, ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr), #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_equal(&fl6->saddr, &np->saddr) ? true : #endif false); } static bool ip6_redirect_nh_match(const struct fib6_result *res, struct flowi6 *fl6, const struct in6_addr *gw, struct rt6_info **ret) { const struct fib6_nh *nh = res->nh; if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family || fl6->flowi6_oif != nh->fib_nh_dev->ifindex) return false; /* rt_cache's gateway might be different from its 'parent' * in the case of an ip redirect. * So we keep searching in the exception table if the gateway * is different. 
*/ if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) { struct rt6_info *rt_cache; rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr); if (rt_cache && ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) { *ret = rt_cache; return true; } return false; } return true; } struct fib6_nh_rd_arg { struct fib6_result *res; struct flowi6 *fl6; const struct in6_addr *gw; struct rt6_info **ret; }; static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg) { struct fib6_nh_rd_arg *arg = _arg; arg->res->nh = nh; return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret); } /* Handle redirects */ struct ip6rd_flowi { struct flowi6 fl6; struct in6_addr gateway; }; INDIRECT_CALLABLE_SCOPE struct rt6_info *__ip6_route_redirect(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; struct rt6_info *ret = NULL; struct fib6_result res = {}; struct fib6_nh_rd_arg arg = { .res = &res, .fl6 = fl6, .gw = &rdfl->gateway, .ret = &ret }; struct fib6_info *rt; struct fib6_node *fn; /* Get the "current" route for this destination and * check if the redirect has come from appropriate router. * * RFC 4861 specifies that redirects should only be * accepted if they come from the nexthop to the target. * Due to the way the routes are chosen, this notion * is a bit fuzzy and one might need to check all possible * routes. */ rcu_read_lock(); fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: for_each_fib6_node_rt_rcu(fn) { res.f6i = rt; if (fib6_check_expired(rt)) continue; if (rt->fib6_flags & RTF_REJECT) break; if (unlikely(rt->nh)) { if (nexthop_is_blackhole(rt->nh)) continue; /* on match, res->nh is filled in and potentially ret */ if (nexthop_for_each_fib6_nh(rt->nh, fib6_nh_redirect_match, &arg)) goto out; } else { res.nh = rt->fib6_nh; if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret)) goto out; } } if (!rt) rt = net->ipv6.fib6_null_entry; else if (rt->fib6_flags & RTF_REJECT) { ret = net->ipv6.ip6_null_entry; goto out; } if (rt == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto restart; } res.f6i = rt; res.nh = rt->fib6_nh; out: if (ret) { ip6_hold_safe(net, &ret); } else { res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; ret = ip6_create_rt_rcu(&res); } rcu_read_unlock(); trace_fib6_table_lookup(net, &res, table, fl6); return ret; }; static struct dst_entry *ip6_route_redirect(struct net *net, const struct flowi6 *fl6, const struct sk_buff *skb, const struct in6_addr *gateway) { int flags = RT6_LOOKUP_F_HAS_SADDR; struct ip6rd_flowi rdfl; rdfl.fl6 = *fl6; rdfl.gateway = *gateway; return fib6_rule_lookup(net, &rdfl.fl6, skb, flags, __ip6_route_redirect); } void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, kuid_t uid) { const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; struct dst_entry *dst; struct flowi6 fl6 = { .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_oif = oif, .flowi6_mark = mark, .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), .flowi6_uid = uid, }; dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); rt6_do_redirect(dst, NULL, skb); dst_release(dst); } EXPORT_SYMBOL_GPL(ip6_redirect); void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) { const struct ipv6hdr *iph = ipv6_hdr(skb); const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); struct dst_entry *dst; struct flowi6 fl6 = { .flowi6_iif = LOOPBACK_IFINDEX, 
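/* Flow key describing the redirected destination (msg->dest); the
 * resulting dst is handed to rt6_do_redirect(), which validates the
 * redirect and may install an RTF_CACHE exception for it.
 */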
.flowi6_oif = oif, .daddr = msg->dest, .saddr = iph->daddr, .flowi6_uid = sock_net_uid(net, NULL), }; dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); rt6_do_redirect(dst, NULL, skb); dst_release(dst); } void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) { ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark), sk_uid(sk)); } EXPORT_SYMBOL_GPL(ip6_sk_redirect); static unsigned int ip6_default_advmss(const struct dst_entry *dst) { unsigned int mtu = dst6_mtu(dst); struct net *net; mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); rcu_read_lock(); net = dst_dev_net_rcu(dst); mtu = max_t(unsigned int, mtu, READ_ONCE(net->ipv6.sysctl.ip6_rt_min_advmss)); rcu_read_unlock(); /* * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. * IPV6_MAXPLEN is also valid and means: "any MSS, * rely only on pmtu discovery" */ if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) mtu = IPV6_MAXPLEN; return mtu; } INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst) { return ip6_dst_mtu_maybe_forward(dst, false); } EXPORT_INDIRECT_CALLABLE(ip6_mtu); /* MTU selection: * 1. mtu on route is locked - use it * 2. mtu from nexthop exception * 3. mtu from egress device * * based on ip6_dst_mtu_forward and exception logic of * rt6_find_cached_rt; called with rcu_read_lock */ u32 ip6_mtu_from_fib6(const struct fib6_result *res, const struct in6_addr *daddr, const struct in6_addr *saddr) { const struct fib6_nh *nh = res->nh; struct fib6_info *f6i = res->f6i; struct inet6_dev *idev; struct rt6_info *rt; u32 mtu = 0; if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { mtu = f6i->fib6_pmtu; if (mtu) goto out; } rt = rt6_find_cached_rt(res, daddr, saddr); if (unlikely(rt)) { mtu = dst_metric_raw(&rt->dst, RTAX_MTU); } else { struct net_device *dev = nh->fib_nh_dev; mtu = IPV6_MIN_MTU; idev = __in6_dev_get(dev); if (idev) mtu = max_t(u32, mtu, READ_ONCE(idev->cnf.mtu6)); } mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); out: return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); } struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6) { struct dst_entry *dst; struct rt6_info *rt; struct inet6_dev *idev = in6_dev_get(dev); struct net *net = dev_net(dev); if (unlikely(!idev)) return ERR_PTR(-ENODEV); rt = ip6_dst_alloc(net, dev, 0); if (unlikely(!rt)) { in6_dev_put(idev); dst = ERR_PTR(-ENOMEM); goto out; } rt->dst.input = ip6_input; rt->dst.output = ip6_output; rt->rt6i_gateway = fl6->daddr; rt->rt6i_dst.addr = fl6->daddr; rt->rt6i_dst.plen = 128; rt->rt6i_idev = idev; dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); /* Add this dst into uncached_list so that rt6_disable_ip() can * do proper release of the net_device */ rt6_uncached_list_add(rt); dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); out: return dst; } static void ip6_dst_gc(struct dst_ops *ops) { struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); int rt_min_interval = READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_min_interval); int rt_elasticity = READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_elasticity); int rt_gc_timeout = READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_timeout); unsigned long rt_last_gc = READ_ONCE(net->ipv6.ip6_rt_last_gc); unsigned int val; int entries; if (time_after(rt_last_gc + rt_min_interval, jiffies)) goto out; fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); entries = dst_entries_get_slow(ops); if (entries < ops->gc_thresh) atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1); out: 
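/* ip6_rt_gc_expire is passed to fib6_run_gc() above as the gc timeout:
 * it is reset to half of ip6_rt_gc_timeout once the entry count drops
 * below gc_thresh, and decayed here by a 1/2^elasticity fraction of its
 * value on every call.
 */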
val = atomic_read(&net->ipv6.ip6_rt_gc_expire); atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); } static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, const struct in6_addr *gw_addr, u32 tbid, int flags, struct fib6_result *res) { struct flowi6 fl6 = { .flowi6_oif = cfg->fc_ifindex, .daddr = *gw_addr, .saddr = cfg->fc_prefsrc, }; struct fib6_table *table; int err; table = fib6_get_table(net, tbid); if (!table) return -EINVAL; if (!ipv6_addr_any(&cfg->fc_prefsrc)) flags |= RT6_LOOKUP_F_HAS_SADDR; flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags); if (!err && res->f6i != net->ipv6.fib6_null_entry) fib6_select_path(net, res, &fl6, cfg->fc_ifindex, cfg->fc_ifindex != 0, NULL, flags); return err; } static int ip6_route_check_nh_onlink(struct net *net, struct fib6_config *cfg, const struct net_device *dev, struct netlink_ext_ack *extack) { u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; const struct in6_addr *gw_addr = &cfg->fc_gateway; struct fib6_result res = {}; int err; err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res); if (!err && !(res.fib6_flags & RTF_REJECT) && res.fib6_type != RTN_UNICAST) { NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); err = -EINVAL; } return err; } static int ip6_route_check_nh(struct net *net, struct fib6_config *cfg, struct net_device **_dev, netdevice_tracker *dev_tracker, struct inet6_dev **idev) { const struct in6_addr *gw_addr = &cfg->fc_gateway; struct net_device *dev = _dev ? *_dev : NULL; int flags = RT6_LOOKUP_F_IFACE; struct fib6_result res = {}; int err = -EHOSTUNREACH; if (cfg->fc_table) { err = ip6_nh_lookup_table(net, cfg, gw_addr, cfg->fc_table, flags, &res); /* gw_addr can not require a gateway or resolve to a reject * route. If a device is given, it must match the result. */ if (err || res.fib6_flags & RTF_REJECT || res.nh->fib_nh_gw_family || (dev && dev != res.nh->fib_nh_dev)) err = -EHOSTUNREACH; } if (err < 0) { struct flowi6 fl6 = { .flowi6_oif = cfg->fc_ifindex, .daddr = *gw_addr, }; err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags); if (err || res.fib6_flags & RTF_REJECT || res.nh->fib_nh_gw_family) err = -EHOSTUNREACH; if (err) return err; fib6_select_path(net, &res, &fl6, cfg->fc_ifindex, cfg->fc_ifindex != 0, NULL, flags); } err = 0; if (dev) { if (dev != res.nh->fib_nh_dev) err = -EHOSTUNREACH; } else { *_dev = dev = res.nh->fib_nh_dev; netdev_hold(dev, dev_tracker, GFP_ATOMIC); *idev = in6_dev_get(dev); } return err; } static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, struct net_device **_dev, netdevice_tracker *dev_tracker, struct inet6_dev **idev, struct netlink_ext_ack *extack) { const struct in6_addr *gw_addr = &cfg->fc_gateway; int gwa_type = ipv6_addr_type(gw_addr); bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; const struct net_device *dev = *_dev; bool need_addr_check = !dev; int err = -EINVAL; /* if gw_addr is local we will fail to detect this in case * address is still TENTATIVE (DAD in progress). rt6_lookup() * will return already-added prefix route via interface that * prefix route was assigned to, which might be non-loopback. */ if (dev && ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); goto out; } if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { /* IPv6 strictly inhibits using not link-local * addresses as nexthop address. 
* Otherwise, router will not able to send redirects. * It is very good, but in some (rare!) circumstances * (SIT, PtP, NBMA NOARP links) it is handy to allow * some exceptions. --ANK * We allow IPv4-mapped nexthops to support RFC4798-type * addressing */ if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { NL_SET_ERR_MSG(extack, "Invalid gateway address"); goto out; } rcu_read_lock(); if (cfg->fc_flags & RTNH_F_ONLINK) err = ip6_route_check_nh_onlink(net, cfg, dev, extack); else err = ip6_route_check_nh(net, cfg, _dev, dev_tracker, idev); rcu_read_unlock(); if (err) goto out; } /* reload in case device was changed */ dev = *_dev; err = -EINVAL; if (!dev) { NL_SET_ERR_MSG(extack, "Egress device not specified"); goto out; } else if (dev->flags & IFF_LOOPBACK) { NL_SET_ERR_MSG(extack, "Egress device can not be loopback device for this route"); goto out; } /* if we did not check gw_addr above, do so now that the * egress device has been resolved. */ if (need_addr_check && ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); goto out; } err = 0; out: return err; } static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type) { if ((flags & RTF_REJECT) || (dev && (dev->flags & IFF_LOOPBACK) && !(addr_type & IPV6_ADDR_LOOPBACK) && !(flags & (RTF_ANYCAST | RTF_LOCAL)))) return true; return false; } int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { netdevice_tracker *dev_tracker = &fib6_nh->fib_nh_dev_tracker; struct net_device *dev = NULL; struct inet6_dev *idev = NULL; int err; if (!ipv6_mod_enabled()) { NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel"); return -EAFNOSUPPORT; } fib6_nh->fib_nh_family = AF_INET6; #ifdef CONFIG_IPV6_ROUTER_PREF fib6_nh->last_probe = jiffies; #endif if (cfg->fc_is_fdb) { fib6_nh->fib_nh_gw6 = cfg->fc_gateway; fib6_nh->fib_nh_gw_family = AF_INET6; return 0; } err = -ENODEV; if (cfg->fc_ifindex) { dev = netdev_get_by_index(net, cfg->fc_ifindex, dev_tracker, gfp_flags); if (!dev) goto out; idev = in6_dev_get(dev); if (!idev) goto out; } if (cfg->fc_flags & RTNH_F_ONLINK) { if (!dev) { NL_SET_ERR_MSG(extack, "Nexthop device required for onlink"); goto out; } if (!(dev->flags & IFF_UP)) { NL_SET_ERR_MSG(extack, "Nexthop device is not up"); err = -ENETDOWN; goto out; } fib6_nh->fib_nh_flags |= RTNH_F_ONLINK; } fib6_nh->fib_nh_weight = 1; /* Reset the nexthop device to the loopback device in case of reject * routes. */ if (cfg->fc_flags & RTF_REJECT) { /* hold loopback dev/idev if we haven't done so. 
*/ if (dev != net->loopback_dev) { if (dev) { netdev_put(dev, dev_tracker); in6_dev_put(idev); } dev = net->loopback_dev; netdev_hold(dev, dev_tracker, gfp_flags); idev = in6_dev_get(dev); if (!idev) { err = -ENODEV; goto out; } } goto pcpu_alloc; } if (cfg->fc_flags & RTF_GATEWAY) { err = ip6_validate_gw(net, cfg, &dev, dev_tracker, &idev, extack); if (err) goto out; fib6_nh->fib_nh_gw6 = cfg->fc_gateway; fib6_nh->fib_nh_gw_family = AF_INET6; } err = -ENODEV; if (!dev) goto out; if (!idev || idev->cnf.disable_ipv6) { NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); err = -EACCES; goto out; } if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) { NL_SET_ERR_MSG(extack, "Nexthop device is not up"); err = -ENETDOWN; goto out; } if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) && !netif_carrier_ok(dev)) fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; err = fib_nh_common_init(net, &fib6_nh->nh_common, cfg->fc_encap, cfg->fc_encap_type, cfg, gfp_flags, extack); if (err) goto out; pcpu_alloc: fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); if (!fib6_nh->rt6i_pcpu) { err = -ENOMEM; goto out; } fib6_nh->fib_nh_dev = dev; fib6_nh->fib_nh_oif = dev->ifindex; err = 0; out: if (idev) in6_dev_put(idev); if (err) { fib_nh_common_release(&fib6_nh->nh_common); fib6_nh->nh_common.nhc_pcpu_rth_output = NULL; fib6_nh->fib_nh_lws = NULL; netdev_put(dev, dev_tracker); } return err; } void fib6_nh_release(struct fib6_nh *fib6_nh) { struct rt6_exception_bucket *bucket; rcu_read_lock(); fib6_nh_flush_exceptions(fib6_nh, NULL); bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL); if (bucket) { rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL); kfree(bucket); } rcu_read_unlock(); fib6_nh_release_dsts(fib6_nh); free_percpu(fib6_nh->rt6i_pcpu); fib_nh_common_release(&fib6_nh->nh_common); } void fib6_nh_release_dsts(struct fib6_nh *fib6_nh) { int cpu; if (!fib6_nh->rt6i_pcpu) return; for_each_possible_cpu(cpu) { struct rt6_info *pcpu_rt, **ppcpu_rt; ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); pcpu_rt = xchg(ppcpu_rt, NULL); if (pcpu_rt) { dst_dev_put(&pcpu_rt->dst); dst_release(&pcpu_rt->dst); } } } static int fib6_config_validate(struct fib6_config *cfg, struct netlink_ext_ack *extack) { /* RTF_PCPU is an internal flag; can not be set by userspace */ if (cfg->fc_flags & RTF_PCPU) { NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); goto errout; } /* RTF_CACHE is an internal flag; can not be set by userspace */ if (cfg->fc_flags & RTF_CACHE) { NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); goto errout; } if (cfg->fc_type > RTN_MAX) { NL_SET_ERR_MSG(extack, "Invalid route type"); goto errout; } if (cfg->fc_dst_len > 128) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); goto errout; } #ifdef CONFIG_IPV6_SUBTREES if (cfg->fc_src_len > 128) { NL_SET_ERR_MSG(extack, "Invalid source address length"); goto errout; } if (cfg->fc_nh_id && cfg->fc_src_len) { NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); goto errout; } #else if (cfg->fc_src_len) { NL_SET_ERR_MSG(extack, "Specifying source address requires IPV6_SUBTREES to be enabled"); goto errout; } #endif return 0; errout: return -EINVAL; } static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { struct net *net = cfg->fc_nlinfo.nl_net; struct fib6_table *table; struct fib6_info *rt; int err; if (cfg->fc_nlinfo.nlh && !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { table = fib6_get_table(net, cfg->fc_table); if (!table) 
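/* For compatibility, a route add without NLM_F_CREATE that names a
 * non-existent table is still honoured: warn, then fall through to
 * creating the table.
 */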
{ pr_warn("NLM_F_CREATE should be specified when creating new route\n"); table = fib6_new_table(net, cfg->fc_table); } } else { table = fib6_new_table(net, cfg->fc_table); } if (!table) { err = -ENOBUFS; goto err; } rt = fib6_info_alloc(gfp_flags, !cfg->fc_nh_id); if (!rt) { err = -ENOMEM; goto err; } rt->fib6_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack); if (IS_ERR(rt->fib6_metrics)) { err = PTR_ERR(rt->fib6_metrics); goto free; } if (cfg->fc_flags & RTF_ADDRCONF) rt->dst_nocount = true; if (cfg->fc_flags & RTF_EXPIRES) fib6_set_expires(rt, jiffies + clock_t_to_jiffies(cfg->fc_expires)); if (cfg->fc_protocol == RTPROT_UNSPEC) cfg->fc_protocol = RTPROT_BOOT; rt->fib6_protocol = cfg->fc_protocol; rt->fib6_table = table; rt->fib6_metric = cfg->fc_metric; rt->fib6_type = cfg->fc_type ? : RTN_UNICAST; rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY; ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); rt->fib6_dst.plen = cfg->fc_dst_len; #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); rt->fib6_src.plen = cfg->fc_src_len; #endif return rt; free: kfree(rt); err: return ERR_PTR(err); } static int ip6_route_info_create_nh(struct fib6_info *rt, struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { struct net *net = cfg->fc_nlinfo.nl_net; struct fib6_nh *fib6_nh; int err; if (cfg->fc_nh_id) { struct nexthop *nh; rcu_read_lock(); nh = nexthop_find_by_id(net, cfg->fc_nh_id); if (!nh) { err = -EINVAL; NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); goto out_free; } err = fib6_check_nexthop(nh, cfg, extack); if (err) goto out_free; if (!nexthop_get(nh)) { NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); err = -ENOENT; goto out_free; } rt->nh = nh; fib6_nh = nexthop_fib6_nh(rt->nh); rcu_read_unlock(); } else { int addr_type; err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack); if (err) goto out_release; fib6_nh = rt->fib6_nh; /* We cannot add true routes via loopback here, they would * result in kernel looping; promote them to reject routes */ addr_type = ipv6_addr_type(&cfg->fc_dst); if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev, addr_type)) rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; } if (!ipv6_addr_any(&cfg->fc_prefsrc)) { struct net_device *dev = fib6_nh->fib_nh_dev; if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { NL_SET_ERR_MSG(extack, "Invalid source address"); err = -EINVAL; goto out_release; } rt->fib6_prefsrc.addr = cfg->fc_prefsrc; rt->fib6_prefsrc.plen = 128; } return 0; out_release: fib6_info_release(rt); return err; out_free: rcu_read_unlock(); ip_fib_metrics_put(rt->fib6_metrics); kfree(rt); return err; } int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { struct fib6_info *rt; int err; err = fib6_config_validate(cfg, extack); if (err) return err; rt = ip6_route_info_create(cfg, gfp_flags, extack); if (IS_ERR(rt)) return PTR_ERR(rt); err = ip6_route_info_create_nh(rt, cfg, gfp_flags, extack); if (err) return err; err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); fib6_info_release(rt); return err; } static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) { struct net *net = info->nl_net; struct fib6_table *table; int err; if (rt == net->ipv6.fib6_null_entry) { err = -ENOENT; goto out; } table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); err = fib6_del(rt, info); spin_unlock_bh(&table->tb6_lock); out: fib6_info_release(rt); return err; } int ip6_del_rt(struct net *net, struct fib6_info *rt, bool 
skip_notify) { struct nl_info info = { .nl_net = net, .skip_notify = skip_notify }; return __ip6_del_rt(rt, &info); } static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) { struct nl_info *info = &cfg->fc_nlinfo; struct net *net = info->nl_net; struct sk_buff *skb = NULL; struct fib6_table *table; int err = -ENOENT; if (rt == net->ipv6.fib6_null_entry) goto out_put; table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { struct fib6_info *sibling, *next_sibling; struct fib6_node *fn; /* prefer to send a single notification with all hops */ skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); if (skb) { u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; if (rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, RTM_DELROUTE, info->portid, seq, 0) < 0) { kfree_skb(skb); skb = NULL; } else info->skip_notify = 1; } /* 'rt' points to the first sibling route. If it is not the * leaf, then we do not need to send a notification. Otherwise, * we need to check if the last sibling has a next route or not * and emit a replace or delete notification, respectively. */ info->skip_notify_kernel = 1; fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&table->tb6_lock)); if (rcu_access_pointer(fn->leaf) == rt) { struct fib6_info *last_sibling, *replace_rt; last_sibling = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); replace_rt = rcu_dereference_protected( last_sibling->fib6_next, lockdep_is_held(&table->tb6_lock)); if (replace_rt) call_fib6_entry_notifiers_replace(net, replace_rt); else call_fib6_multipath_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, rt->fib6_nsiblings, NULL); } list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { err = fib6_del(sibling, info); if (err) goto out_unlock; } } err = fib6_del(rt, info); out_unlock: spin_unlock_bh(&table->tb6_lock); out_put: fib6_info_release(rt); if (skb) { rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, info->nlh, gfp_any()); } return err; } static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) { int rc = -ESRCH; if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) goto out; if (cfg->fc_flags & RTF_GATEWAY && !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) goto out; rc = rt6_remove_exception_rt(rt); out: return rc; } static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt, struct fib6_nh *nh) { struct fib6_result res = { .f6i = rt, .nh = nh, }; struct rt6_info *rt_cache; rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src); if (rt_cache) return __ip6_del_cached_rt(rt_cache, cfg); return 0; } struct fib6_nh_del_cached_rt_arg { struct fib6_config *cfg; struct fib6_info *f6i; }; static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg) { struct fib6_nh_del_cached_rt_arg *arg = _arg; int rc; rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh); return rc != -ESRCH ? 
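/* A -ESRCH from ip6_del_cached_rt() only means this particular nexthop
 * had no matching cached entry; map it to 0 so that
 * nexthop_for_each_fib6_nh() keeps walking the remaining nexthops, and
 * let any other error terminate the walk.
 */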
rc : 0; } static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i) { struct fib6_nh_del_cached_rt_arg arg = { .cfg = cfg, .f6i = f6i }; return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg); } static int ip6_route_del(struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct fib6_table *table; struct fib6_info *rt; struct fib6_node *fn; int err = -ESRCH; table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); if (!table) { NL_SET_ERR_MSG(extack, "FIB table does not exist"); return err; } rcu_read_lock(); fn = fib6_locate(&table->tb6_root, &cfg->fc_dst, cfg->fc_dst_len, &cfg->fc_src, cfg->fc_src_len, !(cfg->fc_flags & RTF_CACHE)); if (fn) { for_each_fib6_node_rt_rcu(fn) { struct fib6_nh *nh; if (rt->nh && cfg->fc_nh_id && rt->nh->id != cfg->fc_nh_id) continue; if (cfg->fc_flags & RTF_CACHE) { int rc = 0; if (rt->nh) { rc = ip6_del_cached_rt_nh(cfg, rt); } else if (cfg->fc_nh_id) { continue; } else { nh = rt->fib6_nh; rc = ip6_del_cached_rt(cfg, rt, nh); } if (rc != -ESRCH) { rcu_read_unlock(); return rc; } continue; } if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) continue; if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) continue; if (rt->nh) { if (!fib6_info_hold_safe(rt)) continue; err = __ip6_del_rt(rt, &cfg->fc_nlinfo); break; } if (cfg->fc_nh_id) continue; nh = rt->fib6_nh; if (cfg->fc_ifindex && (!nh->fib_nh_dev || nh->fib_nh_dev->ifindex != cfg->fc_ifindex)) continue; if (cfg->fc_flags & RTF_GATEWAY && !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6)) continue; if (!fib6_info_hold_safe(rt)) continue; /* if gateway was specified only delete the one hop */ if (cfg->fc_flags & RTF_GATEWAY) err = __ip6_del_rt(rt, &cfg->fc_nlinfo); else err = __ip6_del_rt_siblings(rt, cfg); break; } } rcu_read_unlock(); return err; } static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { struct netevent_redirect netevent; struct rt6_info *rt, *nrt = NULL; struct fib6_result res = {}; struct ndisc_options ndopts; struct inet6_dev *in6_dev; struct neighbour *neigh; struct rd_msg *msg; int optlen, on_link; u8 *lladdr; optlen = skb_tail_pointer(skb) - skb_transport_header(skb); optlen -= sizeof(*msg); if (optlen < 0) { net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); return; } msg = (struct rd_msg *)icmp6_hdr(skb); if (ipv6_addr_is_multicast(&msg->dest)) { net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); return; } on_link = 0; if (ipv6_addr_equal(&msg->dest, &msg->target)) { on_link = 1; } else if (ipv6_addr_type(&msg->target) != (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); return; } in6_dev = __in6_dev_get(skb->dev); if (!in6_dev) return; if (READ_ONCE(in6_dev->cnf.forwarding) || !READ_ONCE(in6_dev->cnf.accept_redirects)) return; /* RFC2461 8.1: * The IP source address of the Redirect MUST be the same as the current * first-hop router for the specified ICMP Destination Address. 
*/ if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); return; } lladdr = NULL; if (ndopts.nd_opts_tgt_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, skb->dev); if (!lladdr) { net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); return; } } rt = dst_rt6_info(dst); if (rt->rt6i_flags & RTF_REJECT) { net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); return; } /* Redirect received -> path was valid. * Look, redirects are sent only in response to data packets, * so that this nexthop apparently is reachable. --ANK */ dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); if (!neigh) return; /* * We have finally decided to accept it. */ ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE| (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| NEIGH_UPDATE_F_ISROUTER)), NDISC_REDIRECT, &ndopts); rcu_read_lock(); res.f6i = rcu_dereference(rt->from); if (!res.f6i) goto out; if (res.f6i->nh) { struct fib6_nh_match_arg arg = { .dev = dst_dev_rcu(dst), .gw = &rt->rt6i_gateway, }; nexthop_for_each_fib6_nh(res.f6i->nh, fib6_nh_find_match, &arg); /* fib6_info uses a nexthop that does not have fib6_nh * using the dst->dev. Should be impossible */ if (!arg.match) goto out; res.nh = arg.match; } else { res.nh = res.f6i->fib6_nh; } res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL); if (!nrt) goto out; nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; if (on_link) nrt->rt6i_flags &= ~RTF_GATEWAY; nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; /* rt6_insert_exception() will take care of duplicated exceptions */ if (rt6_insert_exception(nrt, &res)) { dst_release_immediate(&nrt->dst); goto out; } netevent.old = &rt->dst; netevent.new = &nrt->dst; netevent.daddr = &msg->dest; netevent.neigh = neigh; call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); out: rcu_read_unlock(); neigh_release(neigh); } #ifdef CONFIG_IPV6_ROUTE_INFO static struct fib6_info *rt6_get_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev) { u32 tb_id = l3mdev_fib_table(dev) ? 
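/* GNU "?:" shorthand: use the L3 master (VRF) table when the device is
 * enslaved, otherwise fall back to RT6_TABLE_INFO.
 */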
: RT6_TABLE_INFO; int ifindex = dev->ifindex; struct fib6_node *fn; struct fib6_info *rt = NULL; struct fib6_table *table; table = fib6_get_table(net, tb_id); if (!table) return NULL; rcu_read_lock(); fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); if (!fn) goto out; for_each_fib6_node_rt_rcu(fn) { /* these routes do not use nexthops */ if (rt->nh) continue; if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex) continue; if (!(rt->fib6_flags & RTF_ROUTEINFO) || !rt->fib6_nh->fib_nh_gw_family) continue; if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr)) continue; if (!fib6_info_hold_safe(rt)) continue; break; } out: rcu_read_unlock(); return rt; } static struct fib6_info *rt6_add_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref) { struct fib6_config cfg = { .fc_metric = IP6_RT_PRIO_USER, .fc_ifindex = dev->ifindex, .fc_dst_len = prefixlen, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref), .fc_protocol = RTPROT_RA, .fc_type = RTN_UNICAST, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = net, }; cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; cfg.fc_dst = *prefix; cfg.fc_gateway = *gwaddr; /* We should treat it as a default route if prefix length is 0. */ if (!prefixlen) cfg.fc_flags |= RTF_DEFAULT; ip6_route_add(&cfg, GFP_ATOMIC, NULL); return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); } #endif struct fib6_info *rt6_get_dflt_router(struct net *net, const struct in6_addr *addr, struct net_device *dev) { u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; struct fib6_info *rt; struct fib6_table *table; table = fib6_get_table(net, tb_id); if (!table) return NULL; rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { struct fib6_nh *nh; /* RA routes do not use nexthops */ if (rt->nh) continue; nh = rt->fib6_nh; if (dev == nh->fib_nh_dev && ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && ipv6_addr_equal(&nh->fib_nh_gw6, addr)) break; } if (rt && !fib6_info_hold_safe(rt)) rt = NULL; rcu_read_unlock(); return rt; } struct fib6_info *rt6_add_dflt_router(struct net *net, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref, u32 defrtr_usr_metric, int lifetime) { struct fib6_config cfg = { .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, .fc_metric = defrtr_usr_metric, .fc_ifindex = dev->ifindex, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES | RTF_PREF(pref), .fc_protocol = RTPROT_RA, .fc_type = RTN_UNICAST, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = net, .fc_expires = jiffies_to_clock_t(lifetime * HZ), }; cfg.fc_gateway = *gwaddr; if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { struct fib6_table *table; table = fib6_get_table(dev_net(dev), cfg.fc_table); if (table) table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; } return rt6_get_dflt_router(net, gwaddr, dev); } static void __rt6_purge_dflt_routers(struct net *net, struct fib6_table *table) { struct fib6_info *rt; restart: rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { struct net_device *dev = fib6_info_nh_dev(rt); struct inet6_dev *idev = dev ? 
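/* RA-learned default routes are removed below unless the interface is still
 * configured to accept router advertisements while forwarding
 * (accept_ra == 2).
 */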
__in6_dev_get(dev) : NULL; if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && (!idev || idev->cnf.accept_ra != 2) && fib6_info_hold_safe(rt)) { rcu_read_unlock(); ip6_del_rt(net, rt, false); goto restart; } } rcu_read_unlock(); table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; } void rt6_purge_dflt_routers(struct net *net) { struct fib6_table *table; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) __rt6_purge_dflt_routers(net, table); } } rcu_read_unlock(); } static void rtmsg_to_fib6_config(struct net *net, struct in6_rtmsg *rtmsg, struct fib6_config *cfg) { *cfg = (struct fib6_config){ .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? : RT6_TABLE_MAIN, .fc_ifindex = rtmsg->rtmsg_ifindex, .fc_metric = rtmsg->rtmsg_metric, .fc_expires = rtmsg->rtmsg_info, .fc_dst_len = rtmsg->rtmsg_dst_len, .fc_src_len = rtmsg->rtmsg_src_len, .fc_flags = rtmsg->rtmsg_flags, .fc_type = rtmsg->rtmsg_type, .fc_nlinfo.nl_net = net, .fc_dst = rtmsg->rtmsg_dst, .fc_src = rtmsg->rtmsg_src, .fc_gateway = rtmsg->rtmsg_gateway, }; } int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg) { struct fib6_config cfg; int err; if (cmd != SIOCADDRT && cmd != SIOCDELRT) return -EINVAL; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; rtmsg_to_fib6_config(net, rtmsg, &cfg); switch (cmd) { case SIOCADDRT: /* Only do the default setting of fc_metric in route adding */ if (cfg.fc_metric == 0) cfg.fc_metric = IP6_RT_PRIO_USER; err = ip6_route_add(&cfg, GFP_KERNEL, NULL); break; case SIOCDELRT: err = ip6_route_del(&cfg, NULL); break; } return err; } /* * Drop the packet on the floor */ static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) { struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst_dev(dst); struct net *net = dev_net(dev); struct inet6_dev *idev; SKB_DR(reason); int type; if (netif_is_l3_master(skb->dev) || dev == net->loopback_dev) idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); else idev = ip6_dst_idev(dst); switch (ipstats_mib_noroutes) { case IPSTATS_MIB_INNOROUTES: type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); if (type == IPV6_ADDR_ANY) { SKB_DR_SET(reason, IP_INADDRERRORS); IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); break; } SKB_DR_SET(reason, IP_INNOROUTES); fallthrough; case IPSTATS_MIB_OUTNOROUTES: SKB_DR_OR(reason, IP_OUTNOROUTES); IP6_INC_STATS(net, idev, ipstats_mib_noroutes); break; } /* Start over by dropping the dst for l3mdev case */ if (netif_is_l3_master(skb->dev)) skb_dst_drop(skb); icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); kfree_skb_reason(skb, reason); return 0; } static int ip6_pkt_discard(struct sk_buff *skb) { return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); } static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) { skb->dev = skb_dst_dev(skb); return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); } static int ip6_pkt_prohibit(struct sk_buff *skb) { return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); } static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) { skb->dev = skb_dst_dev(skb); return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); } /* * Allocate a dst for local (unicast / anycast) address. 
*/ struct fib6_info *addrconf_f6i_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, bool anycast, gfp_t gfp_flags, struct netlink_ext_ack *extack) { struct fib6_config cfg = { .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL, .fc_ifindex = idev->dev->ifindex, .fc_flags = RTF_UP | RTF_NONEXTHOP, .fc_dst = *addr, .fc_dst_len = 128, .fc_protocol = RTPROT_KERNEL, .fc_nlinfo.nl_net = net, .fc_ignore_dev_down = true, }; struct fib6_info *f6i; int err; if (anycast) { cfg.fc_type = RTN_ANYCAST; cfg.fc_flags |= RTF_ANYCAST; } else { cfg.fc_type = RTN_LOCAL; cfg.fc_flags |= RTF_LOCAL; } f6i = ip6_route_info_create(&cfg, gfp_flags, extack); if (IS_ERR(f6i)) return f6i; err = ip6_route_info_create_nh(f6i, &cfg, gfp_flags, extack); if (err) return ERR_PTR(err); f6i->dst_nocount = true; if (!anycast && (READ_ONCE(net->ipv6.devconf_all->disable_policy) || READ_ONCE(idev->cnf.disable_policy))) f6i->dst_nopolicy = true; return f6i; } /* remove deleted ip from prefsrc entries */ struct arg_dev_net_ip { struct net *net; struct in6_addr *addr; }; static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) { struct net *net = ((struct arg_dev_net_ip *)arg)->net; struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; if (!rt->nh && rt != net->ipv6.fib6_null_entry && ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr) && !ipv6_chk_addr(net, addr, rt->fib6_nh->fib_nh_dev, 0)) { spin_lock_bh(&rt6_exception_lock); /* remove prefsrc entry */ rt->fib6_prefsrc.plen = 0; spin_unlock_bh(&rt6_exception_lock); } return 0; } void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) { struct net *net = dev_net(ifp->idev->dev); struct arg_dev_net_ip adni = { .net = net, .addr = &ifp->addr, }; fib6_clean_all(net, fib6_remove_prefsrc, &adni); } #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT) /* Remove routers and update dst entries when gateway turn into host. */ static int fib6_clean_tohost(struct fib6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; struct fib6_nh *nh; /* RA routes do not use nexthops */ if (rt->nh) return 0; nh = rt->fib6_nh; if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6)) return -1; /* Further clean up cached routes in exception table. * This is needed because cached route may have a different * gateway than its 'parent' in the case of an ip redirect. 
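 * fib6_nh_exceptions_clean_tohost() below walks this nexthop's exception
 * bucket and drops those stale cached entries.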
*/ fib6_nh_exceptions_clean_tohost(nh, gateway); return 0; } void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) { fib6_clean_all(net, fib6_clean_tohost, gateway); } struct arg_netdev_event { const struct net_device *dev; union { unsigned char nh_flags; unsigned long event; }; }; static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) { struct fib6_info *iter; struct fib6_node *fn; fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&rt->fib6_table->tb6_lock)); iter = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { if (iter->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(iter)) return iter; iter = rcu_dereference_protected(iter->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); } return NULL; } /* only called for fib entries with builtin fib6_nh */ static bool rt6_is_dead(const struct fib6_info *rt) { if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD || (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN && ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev))) return true; return false; } static int rt6_multipath_total_weight(const struct fib6_info *rt) { struct fib6_info *iter; int total = 0; if (!rt6_is_dead(rt)) total += rt->fib6_nh->fib_nh_weight; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { if (!rt6_is_dead(iter)) total += iter->fib6_nh->fib_nh_weight; } return total; } static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) { int upper_bound = -1; if (!rt6_is_dead(rt)) { *weight += rt->fib6_nh->fib_nh_weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, total) - 1; } atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound); } static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) { struct fib6_info *iter; int weight = 0; rt6_upper_bound_set(rt, &weight, total); list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) rt6_upper_bound_set(iter, &weight, total); } void rt6_multipath_rebalance(struct fib6_info *rt) { struct fib6_info *first; int total; /* In case the entire multipath route was marked for flushing, * then there is no need to rebalance upon the removal of every * sibling route. */ if (!rt->fib6_nsiblings || rt->should_flush) return; /* During lookup routes are evaluated in order, so we need to * make sure upper bounds are assigned from the first sibling * onwards. 
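 * Each upper bound carves out a slice of the 31-bit multipath hash space
 * proportional to the nexthop's weight; see rt6_upper_bound_set().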
*/ first = rt6_multipath_first_sibling(rt); if (WARN_ON_ONCE(!first)) return; total = rt6_multipath_total_weight(first); rt6_multipath_upper_bound_set(first, total); } static int fib6_ifup(struct fib6_info *rt, void *p_arg) { const struct arg_netdev_event *arg = p_arg; struct net *net = dev_net(arg->dev); if (rt != net->ipv6.fib6_null_entry && !rt->nh && rt->fib6_nh->fib_nh_dev == arg->dev) { rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags; fib6_update_sernum_upto_root(net, rt); rt6_multipath_rebalance(rt); } return 0; } void rt6_sync_up(struct net_device *dev, unsigned char nh_flags) { struct arg_netdev_event arg = { .dev = dev, { .nh_flags = nh_flags, }, }; if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) arg.nh_flags |= RTNH_F_LINKDOWN; fib6_clean_all(dev_net(dev), fib6_ifup, &arg); } /* only called for fib entries with inline fib6_nh */ static bool rt6_multipath_uses_dev(const struct fib6_info *rt, const struct net_device *dev) { struct fib6_info *iter; if (rt->fib6_nh->fib_nh_dev == dev) return true; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) if (iter->fib6_nh->fib_nh_dev == dev) return true; return false; } static void rt6_multipath_flush(struct fib6_info *rt) { struct fib6_info *iter; rt->should_flush = 1; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) iter->should_flush = 1; } static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, const struct net_device *down_dev) { struct fib6_info *iter; unsigned int dead = 0; if (rt->fib6_nh->fib_nh_dev == down_dev || rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD) dead++; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) if (iter->fib6_nh->fib_nh_dev == down_dev || iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD) dead++; return dead; } static void rt6_multipath_nh_flags_set(struct fib6_info *rt, const struct net_device *dev, unsigned char nh_flags) { struct fib6_info *iter; if (rt->fib6_nh->fib_nh_dev == dev) rt->fib6_nh->fib_nh_flags |= nh_flags; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) if (iter->fib6_nh->fib_nh_dev == dev) iter->fib6_nh->fib_nh_flags |= nh_flags; } /* called with write lock held for table with rt */ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) { const struct arg_netdev_event *arg = p_arg; const struct net_device *dev = arg->dev; struct net *net = dev_net(dev); if (rt == net->ipv6.fib6_null_entry || rt->nh) return 0; switch (arg->event) { case NETDEV_UNREGISTER: return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; case NETDEV_DOWN: if (rt->should_flush) return -1; if (!rt->fib6_nsiblings) return rt->fib6_nh->fib_nh_dev == dev ? 
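/* -1 asks the fib6 tree walker to delete this route, 0 keeps it. */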
-1 : 0; if (rt6_multipath_uses_dev(rt, dev)) { unsigned int count; count = rt6_multipath_dead_count(rt, dev); if (rt->fib6_nsiblings + 1 == count) { rt6_multipath_flush(rt); return -1; } rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | RTNH_F_LINKDOWN); fib6_update_sernum(net, rt); rt6_multipath_rebalance(rt); } return -2; case NETDEV_CHANGE: if (rt->fib6_nh->fib_nh_dev != dev || rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) break; rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; rt6_multipath_rebalance(rt); break; } return 0; } void rt6_sync_down_dev(struct net_device *dev, unsigned long event) { struct arg_netdev_event arg = { .dev = dev, { .event = event, }, }; struct net *net = dev_net(dev); if (READ_ONCE(net->ipv6.sysctl.skip_notify_on_dev_down)) fib6_clean_all_skip_notify(net, fib6_ifdown, &arg); else fib6_clean_all(net, fib6_ifdown, &arg); } void rt6_disable_ip(struct net_device *dev, unsigned long event) { rt6_sync_down_dev(dev, event); rt6_uncached_list_flush_dev(dev); neigh_ifdown(&nd_tbl, dev); } struct rt6_mtu_change_arg { struct net_device *dev; unsigned int mtu; struct fib6_info *f6i; }; static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg) { struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg; struct fib6_info *f6i = arg->f6i; /* For administrative MTU increase, there is no way to discover * IPv6 PMTU increase, so PMTU increase should be updated here. * Since RFC 1981 doesn't include administrative MTU increase * update PMTU increase is a MUST. (i.e. jumbo frame) */ if (nh->fib_nh_dev == arg->dev) { struct inet6_dev *idev = __in6_dev_get(arg->dev); u32 mtu = f6i->fib6_pmtu; if (mtu >= arg->mtu || (mtu < arg->mtu && mtu == idev->cnf.mtu6)) fib6_metric_set(f6i, RTAX_MTU, arg->mtu); spin_lock_bh(&rt6_exception_lock); rt6_exceptions_update_pmtu(idev, nh, arg->mtu); spin_unlock_bh(&rt6_exception_lock); } return 0; } static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg) { struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; struct inet6_dev *idev; /* In IPv6 pmtu discovery is not optional, so that RTAX_MTU lock cannot disable it. We still use this lock to block changes caused by addrconf/ndisc. 
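   The per-nexthop PMTU update itself is handled in fib6_nh_mtu_change()
   above.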
*/ idev = __in6_dev_get(arg->dev); if (!idev) return 0; if (fib6_metric_locked(f6i, RTAX_MTU)) return 0; arg->f6i = f6i; if (f6i->nh) { /* fib6_nh_mtu_change only returns 0, so this is safe */ return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change, arg); } return fib6_nh_mtu_change(f6i->fib6_nh, arg); } void rt6_mtu_change(struct net_device *dev, unsigned int mtu) { struct rt6_mtu_change_arg arg = { .dev = dev, .mtu = mtu, }; fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); } static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, [RTA_OIF] = { .type = NLA_U32 }, [RTA_IIF] = { .type = NLA_U32 }, [RTA_PRIORITY] = { .type = NLA_U32 }, [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, [RTA_PREF] = { .type = NLA_U8 }, [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, [RTA_EXPIRES] = { .type = NLA_U32 }, [RTA_UID] = { .type = NLA_U32 }, [RTA_MARK] = { .type = NLA_U32 }, [RTA_TABLE] = { .type = NLA_U32 }, [RTA_IP_PROTO] = { .type = NLA_U8 }, [RTA_SPORT] = { .type = NLA_U16 }, [RTA_DPORT] = { .type = NLA_U16 }, [RTA_NH_ID] = { .type = NLA_U32 }, [RTA_FLOWLABEL] = { .type = NLA_BE32 }, }; static int rtm_to_fib6_multipath_config(struct fib6_config *cfg, struct netlink_ext_ack *extack, bool newroute) { struct rtnexthop *rtnh; int remaining; remaining = cfg->fc_mp_len; rtnh = (struct rtnexthop *)cfg->fc_mp; if (!rtnh_ok(rtnh, remaining)) { NL_SET_ERR_MSG(extack, "Invalid nexthop configuration - no valid nexthops"); return -EINVAL; } do { bool has_gateway = cfg->fc_flags & RTF_GATEWAY; int attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { struct nlattr *nla, *attrs; attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { if (nla_len(nla) < sizeof(cfg->fc_gateway)) { NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_GATEWAY"); return -EINVAL; } has_gateway = true; } } if (newroute && (cfg->fc_nh_id || !has_gateway)) { NL_SET_ERR_MSG(extack, "Device only routes can not be added for IPv6 using the multipath API."); return -EINVAL; } rtnh = rtnh_next(rtnh, &remaining); } while (rtnh_ok(rtnh, remaining)); return lwtunnel_valid_encap_type_attr(cfg->fc_mp, cfg->fc_mp_len, extack); } static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, struct fib6_config *cfg, struct netlink_ext_ack *extack) { bool newroute = nlh->nlmsg_type == RTM_NEWROUTE; struct nlattr *tb[RTA_MAX+1]; struct rtmsg *rtm; unsigned int pref; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, extack); if (err < 0) goto errout; err = -EINVAL; rtm = nlmsg_data(nlh); if (rtm->rtm_tos) { NL_SET_ERR_MSG(extack, "Invalid dsfield (tos): option not available for IPv6"); goto errout; } if (tb[RTA_FLOWLABEL]) { NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL], "Flow label cannot be specified for this operation"); goto errout; } *cfg = (struct fib6_config){ .fc_table = rtm->rtm_table, .fc_dst_len = rtm->rtm_dst_len, .fc_src_len = rtm->rtm_src_len, .fc_flags = RTF_UP, .fc_protocol = rtm->rtm_protocol, .fc_type = rtm->rtm_type, .fc_nlinfo.portid = NETLINK_CB(skb).portid, .fc_nlinfo.nlh = nlh, .fc_nlinfo.nl_net = sock_net(skb->sk), }; if (rtm->rtm_type == RTN_UNREACHABLE || rtm->rtm_type == RTN_BLACKHOLE || rtm->rtm_type == RTN_PROHIBIT || rtm->rtm_type == RTN_THROW) cfg->fc_flags |= RTF_REJECT; if (rtm->rtm_type == RTN_LOCAL) 
cfg->fc_flags |= RTF_LOCAL; if (rtm->rtm_flags & RTM_F_CLONED) cfg->fc_flags |= RTF_CACHE; cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); if (tb[RTA_NH_ID]) { if (tb[RTA_GATEWAY] || tb[RTA_OIF] || tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) { NL_SET_ERR_MSG(extack, "Nexthop specification and nexthop id are mutually exclusive"); goto errout; } cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]); } if (tb[RTA_GATEWAY]) { cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); cfg->fc_flags |= RTF_GATEWAY; } if (tb[RTA_VIA]) { NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute"); goto errout; } if (tb[RTA_DST]) { int plen = (rtm->rtm_dst_len + 7) >> 3; if (nla_len(tb[RTA_DST]) < plen) goto errout; nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); } if (tb[RTA_SRC]) { int plen = (rtm->rtm_src_len + 7) >> 3; if (nla_len(tb[RTA_SRC]) < plen) goto errout; nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); } if (tb[RTA_PREFSRC]) cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); if (tb[RTA_OIF]) cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); if (tb[RTA_PRIORITY]) cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); if (tb[RTA_METRICS]) { cfg->fc_mx = nla_data(tb[RTA_METRICS]); cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); } if (tb[RTA_TABLE]) cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); if (tb[RTA_MULTIPATH]) { cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); err = rtm_to_fib6_multipath_config(cfg, extack, newroute); if (err < 0) goto errout; } if (tb[RTA_PREF]) { pref = nla_get_u8(tb[RTA_PREF]); if (pref != ICMPV6_ROUTER_PREF_LOW && pref != ICMPV6_ROUTER_PREF_HIGH) pref = ICMPV6_ROUTER_PREF_MEDIUM; cfg->fc_flags |= RTF_PREF(pref); } if (tb[RTA_ENCAP]) cfg->fc_encap = tb[RTA_ENCAP]; if (tb[RTA_ENCAP_TYPE]) { cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); if (err < 0) goto errout; } if (tb[RTA_EXPIRES]) { unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); if (addrconf_finite_timeout(timeout)) { cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); cfg->fc_flags |= RTF_EXPIRES; } } err = 0; errout: return err; } struct rt6_nh { struct fib6_info *fib6_info; struct fib6_config r_cfg; struct list_head list; }; static int ip6_route_info_append(struct list_head *rt6_nh_list, struct fib6_info *rt, struct fib6_config *r_cfg) { struct rt6_nh *nh; list_for_each_entry(nh, rt6_nh_list, list) { /* check if fib6_info already exists */ if (rt6_duplicate_nexthop(nh->fib6_info, rt)) return -EEXIST; } nh = kzalloc_obj(*nh); if (!nh) return -ENOMEM; nh->fib6_info = rt; memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); list_add_tail(&nh->list, rt6_nh_list); return 0; } static void ip6_route_mpath_notify(struct fib6_info *rt, struct fib6_info *rt_last, struct nl_info *info, __u16 nlflags) { /* if this is an APPEND route, then rt points to the first route * inserted and rt_last points to last route inserted. Userspace * wants a consistent dump of the route which starts at the first * nexthop. 
Since sibling routes are always added at the end of * the list, find the first sibling of the last route appended */ rcu_read_lock(); if ((nlflags & NLM_F_APPEND) && rt_last && READ_ONCE(rt_last->fib6_nsiblings)) { rt = list_first_or_null_rcu(&rt_last->fib6_siblings, struct fib6_info, fib6_siblings); } if (rt) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); rcu_read_unlock(); } static bool ip6_route_mpath_should_notify(const struct fib6_info *rt) { bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); bool should_notify = false; struct fib6_info *leaf; struct fib6_node *fn; rcu_read_lock(); fn = rcu_dereference(rt->fib6_node); if (!fn) goto out; leaf = rcu_dereference(fn->leaf); if (!leaf) goto out; if (rt == leaf || (rt_can_ecmp && rt->fib6_metric == leaf->fib6_metric && rt6_qualify_for_ecmp(leaf))) should_notify = true; out: rcu_read_unlock(); return should_notify; } static int ip6_route_multipath_add(struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct fib6_info *rt_notif = NULL, *rt_last = NULL; struct nl_info *info = &cfg->fc_nlinfo; struct rt6_nh *nh, *nh_safe; struct fib6_config r_cfg; struct rtnexthop *rtnh; LIST_HEAD(rt6_nh_list); struct rt6_nh *err_nh; struct fib6_info *rt; __u16 nlflags; int remaining; int attrlen; int replace; int nhn = 0; int err; err = fib6_config_validate(cfg, extack); if (err) return err; replace = (cfg->fc_nlinfo.nlh && (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE; if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) nlflags |= NLM_F_APPEND; remaining = cfg->fc_mp_len; rtnh = (struct rtnexthop *)cfg->fc_mp; /* Parse a Multipath Entry and build a list (rt6_nh_list) of * fib6_info structs per nexthop */ while (rtnh_ok(rtnh, remaining)) { memcpy(&r_cfg, cfg, sizeof(*cfg)); if (rtnh->rtnh_ifindex) r_cfg.fc_ifindex = rtnh->rtnh_ifindex; attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { struct nlattr *nla, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { r_cfg.fc_gateway = nla_get_in6_addr(nla); r_cfg.fc_flags |= RTF_GATEWAY; } r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); if (nla) r_cfg.fc_encap_type = nla_get_u16(nla); } r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; goto cleanup; } err = ip6_route_info_create_nh(rt, &r_cfg, GFP_KERNEL, extack); if (err) { rt = NULL; goto cleanup; } rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { fib6_info_release(rt); goto cleanup; } rtnh = rtnh_next(rtnh, &remaining); } /* for add and replace send one notification with all nexthops. * Skip the notification in fib6_add_rt2node and send one with * the full route when done */ info->skip_notify = 1; /* For add and replace, send one notification with all nexthops. For * append, send one notification with all appended nexthops. 
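 * skip_notify_kernel suppresses the per-route in-kernel notifier calls; the
 * multipath notifier is invoked once below (when needed), after all nexthops
 * have been inserted.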
*/ info->skip_notify_kernel = 1; err_nh = NULL; list_for_each_entry(nh, &rt6_nh_list, list) { err = __ip6_ins_rt(nh->fib6_info, info, extack); if (err) { if (replace && nhn) NL_SET_ERR_MSG_MOD(extack, "multipath route replace failed (check consistency of installed routes)"); err_nh = nh; goto add_errout; } /* save reference to last route successfully inserted */ rt_last = nh->fib6_info; /* save reference to first route for notification */ if (!rt_notif) rt_notif = nh->fib6_info; /* Because each route is added like a single route we remove * these flags after the first nexthop: if there is a collision, * we have already failed to add the first nexthop: * fib6_add_rt2node() has rejected it; when replacing, old * nexthops have been replaced by first new, the rest should * be added to it. */ if (cfg->fc_nlinfo.nlh) { cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_REPLACE); cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE; } nhn++; } /* An in-kernel notification should only be sent in case the new * multipath route is added as the first route in the node, or if * it was appended to it. We pass 'rt_notif' since it is the first * sibling and might allow us to skip some checks in the replace case. */ if (ip6_route_mpath_should_notify(rt_notif)) { enum fib_event_type fib_event; if (rt_notif->fib6_nsiblings != nhn - 1) fib_event = FIB_EVENT_ENTRY_APPEND; else fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib6_multipath_entry_notifiers(info->nl_net, fib_event, rt_notif, nhn - 1, extack); if (err) { /* Delete all the siblings that were just added */ err_nh = NULL; goto add_errout; } } /* success ... tell user about new route */ ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); goto cleanup; add_errout: /* send notification for routes that were added so that * the delete notifications sent by ip6_route_del are * coherent */ if (rt_notif) ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); /* Delete routes that were already added */ list_for_each_entry(nh, &rt6_nh_list, list) { if (err_nh == nh) break; ip6_route_del(&nh->r_cfg, extack); } cleanup: list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, list) { fib6_info_release(nh->fib6_info); list_del(&nh->list); kfree(nh); } return err; } static int ip6_route_multipath_del(struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct fib6_config r_cfg; struct rtnexthop *rtnh; int last_err = 0; int remaining; int attrlen; int err; remaining = cfg->fc_mp_len; rtnh = (struct rtnexthop *)cfg->fc_mp; /* Parse a Multipath Entry */ while (rtnh_ok(rtnh, remaining)) { memcpy(&r_cfg, cfg, sizeof(*cfg)); if (rtnh->rtnh_ifindex) r_cfg.fc_ifindex = rtnh->rtnh_ifindex; attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { struct nlattr *nla, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { r_cfg.fc_gateway = nla_get_in6_addr(nla); r_cfg.fc_flags |= RTF_GATEWAY; } } err = ip6_route_del(&r_cfg, extack); if (err) last_err = err; rtnh = rtnh_next(rtnh, &remaining); } return last_err; } static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct fib6_config cfg; int err; err = rtm_to_fib6_config(skb, nlh, &cfg, extack); if (err < 0) return err; if (cfg.fc_nh_id) { rcu_read_lock(); err = !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id); rcu_read_unlock(); if (err) { NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); return -EINVAL; } } if (cfg.fc_mp) { return ip6_route_multipath_del(&cfg, extack); } else { cfg.fc_delete_all_nh = 1; return ip6_route_del(&cfg, 
extack); } } static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct fib6_config cfg; int err; err = rtm_to_fib6_config(skb, nlh, &cfg, extack); if (err < 0) return err; if (cfg.fc_metric == 0) cfg.fc_metric = IP6_RT_PRIO_USER; if (cfg.fc_mp) return ip6_route_multipath_add(&cfg, extack); else return ip6_route_add(&cfg, GFP_KERNEL, extack); } /* add the overhead of this fib6_nh to nexthop_len */ static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg) { int *nexthop_len = arg; *nexthop_len += nla_total_size(0) /* RTA_MULTIPATH */ + NLA_ALIGN(sizeof(struct rtnexthop)) + nla_total_size(16); /* RTA_GATEWAY */ if (nh->fib_nh_lws) { /* RTA_ENCAP_TYPE */ *nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); /* RTA_ENCAP */ *nexthop_len += nla_total_size(2); } return 0; } static size_t rt6_nlmsg_size(struct fib6_info *f6i) { struct fib6_info *sibling; struct fib6_nh *nh; int nexthop_len; if (f6i->nh) { nexthop_len = nla_total_size(4); /* RTA_NH_ID */ nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size, &nexthop_len); goto common; } rcu_read_lock(); retry: nh = f6i->fib6_nh; nexthop_len = 0; if (READ_ONCE(f6i->fib6_nsiblings)) { rt6_nh_nlmsg_size(nh, &nexthop_len); list_for_each_entry_rcu(sibling, &f6i->fib6_siblings, fib6_siblings) { rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len); if (!READ_ONCE(f6i->fib6_nsiblings)) goto retry; } } rcu_read_unlock(); nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); common: return NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(16) /* RTA_SRC */ + nla_total_size(16) /* RTA_DST */ + nla_total_size(16) /* RTA_GATEWAY */ + nla_total_size(16) /* RTA_PREFSRC */ + nla_total_size(4) /* RTA_TABLE */ + nla_total_size(4) /* RTA_IIF */ + nla_total_size(4) /* RTA_OIF */ + nla_total_size(4) /* RTA_PRIORITY */ + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ + nla_total_size(sizeof(struct rta_cacheinfo)) + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ + nla_total_size(1) /* RTA_PREF */ + nexthop_len; } static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh, unsigned char *flags) { if (nexthop_is_multipath(nh)) { struct nlattr *mp; mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); if (!mp) goto nla_put_failure; if (nexthop_mpath_fill_node(skb, nh, AF_INET6)) goto nla_put_failure; nla_nest_end(skb, mp); } else { struct fib6_nh *fib6_nh; fib6_nh = nexthop_fib6_nh(nh); if (fib_nexthop_info(skb, &fib6_nh->nh_common, AF_INET6, flags, false) < 0) goto nla_put_failure; } return 0; nla_put_failure: return -EMSGSIZE; } static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct fib6_info *rt, struct dst_entry *dst, struct in6_addr *dest, struct in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags) { struct rt6_info *rt6 = dst_rt6_info(dst); struct rt6key *rt6_dst, *rt6_src; u32 *pmetrics, table, rt6_flags; unsigned char nh_flags = 0; struct nlmsghdr *nlh; struct rtmsg *rtm; long expires = 0; nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); if (!nlh) return -EMSGSIZE; if (rt6) { rt6_dst = &rt6->rt6i_dst; rt6_src = &rt6->rt6i_src; rt6_flags = rt6->rt6i_flags; } else { rt6_dst = &rt->fib6_dst; rt6_src = &rt->fib6_src; rt6_flags = rt->fib6_flags; } rtm = nlmsg_data(nlh); rtm->rtm_family = AF_INET6; rtm->rtm_dst_len = rt6_dst->plen; rtm->rtm_src_len = rt6_src->plen; rtm->rtm_tos = 0; if (rt->fib6_table) table = rt->fib6_table->tb6_id; else table = RT6_TABLE_UNSPEC; rtm->rtm_table = table < 256 ? 
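/* The legacy rtm_table field is only 8 bits wide; larger table IDs are
 * reported as RT_TABLE_COMPAT and the full ID is carried in RTA_TABLE below.
 */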
table : RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, table)) goto nla_put_failure; rtm->rtm_type = rt->fib6_type; rtm->rtm_flags = 0; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = rt->fib6_protocol; if (rt6_flags & RTF_CACHE) rtm->rtm_flags |= RTM_F_CLONED; if (dest) { if (nla_put_in6_addr(skb, RTA_DST, dest)) goto nla_put_failure; rtm->rtm_dst_len = 128; } else if (rtm->rtm_dst_len) if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr)) goto nla_put_failure; #ifdef CONFIG_IPV6_SUBTREES if (src) { if (nla_put_in6_addr(skb, RTA_SRC, src)) goto nla_put_failure; rtm->rtm_src_len = 128; } else if (rtm->rtm_src_len && nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr)) goto nla_put_failure; #endif if (iif) { #ifdef CONFIG_IPV6_MROUTE if (ipv6_addr_is_multicast(&rt6_dst->addr)) { int err = ip6mr_get_route(net, skb, rtm, portid); if (err == 0) return 0; if (err < 0) goto nla_put_failure; } else #endif if (nla_put_u32(skb, RTA_IIF, iif)) goto nla_put_failure; } else if (dest) { struct in6_addr saddr_buf; if (ip6_route_get_saddr(net, rt, dest, 0, 0, &saddr_buf) == 0 && nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) goto nla_put_failure; } if (rt->fib6_prefsrc.plen) { struct in6_addr saddr_buf; saddr_buf = rt->fib6_prefsrc.addr; if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) goto nla_put_failure; } pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; if (rtnetlink_put_metrics(skb, pmetrics) < 0) goto nla_put_failure; if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) goto nla_put_failure; /* For multipath routes, walk the siblings list and add * each as a nexthop within RTA_MULTIPATH. */ if (rt6) { struct net_device *dev; if (rt6_flags & RTF_GATEWAY && nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway)) goto nla_put_failure; dev = dst_dev(dst); if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex)) goto nla_put_failure; if (lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) goto nla_put_failure; } else if (READ_ONCE(rt->fib6_nsiblings)) { struct fib6_info *sibling; struct nlattr *mp; mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); if (!mp) goto nla_put_failure; if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common, rt->fib6_nh->fib_nh_weight, AF_INET6, 0) < 0) goto nla_put_failure; rcu_read_lock(); list_for_each_entry_rcu(sibling, &rt->fib6_siblings, fib6_siblings) { if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common, sibling->fib6_nh->fib_nh_weight, AF_INET6, 0) < 0) { rcu_read_unlock(); goto nla_put_failure; } } rcu_read_unlock(); nla_nest_end(skb, mp); } else if (rt->nh) { if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id)) goto nla_put_failure; if (nexthop_is_blackhole(rt->nh)) rtm->rtm_type = RTN_BLACKHOLE; if (READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode) && rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0) goto nla_put_failure; rtm->rtm_flags |= nh_flags; } else { if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, AF_INET6, &nh_flags, false) < 0) goto nla_put_failure; rtm->rtm_flags |= nh_flags; } if (rt6_flags & RTF_EXPIRES) { expires = dst ? READ_ONCE(dst->expires) : rt->expires; expires -= jiffies; } if (!dst) { if (READ_ONCE(rt->offload)) rtm->rtm_flags |= RTM_F_OFFLOAD; if (READ_ONCE(rt->trap)) rtm->rtm_flags |= RTM_F_TRAP; if (READ_ONCE(rt->offload_failed)) rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED; } if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? 
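/* only cached dst entries have an error worth reporting; FIB entries pass 0 */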
dst->error : 0) < 0) goto nla_put_failure; if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags))) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg) { const struct net_device *dev = arg; if (nh->fib_nh_dev == dev) return 1; return 0; } static bool fib6_info_uses_dev(const struct fib6_info *f6i, const struct net_device *dev) { if (f6i->nh) { struct net_device *_dev = (struct net_device *)dev; return !!nexthop_for_each_fib6_nh(f6i->nh, fib6_info_nh_uses_dev, _dev); } if (f6i->fib6_nh->fib_nh_dev == dev) return true; if (READ_ONCE(f6i->fib6_nsiblings)) { const struct fib6_info *sibling; rcu_read_lock(); list_for_each_entry_rcu(sibling, &f6i->fib6_siblings, fib6_siblings) { if (sibling->fib6_nh->fib_nh_dev == dev) { rcu_read_unlock(); return true; } if (!READ_ONCE(f6i->fib6_nsiblings)) break; } rcu_read_unlock(); } return false; } struct fib6_nh_exception_dump_walker { struct rt6_rtnl_dump_arg *dump; struct fib6_info *rt; unsigned int flags; unsigned int skip; unsigned int count; }; static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg) { struct fib6_nh_exception_dump_walker *w = arg; struct rt6_rtnl_dump_arg *dump = w->dump; struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; int i, err; bucket = fib6_nh_get_excptn_bucket(nh, NULL); if (!bucket) return 0; for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { if (w->skip) { w->skip--; continue; } /* Expiration of entries doesn't bump sernum, insertion * does. Removal is triggered by insertion, so we can * rely on the fact that if entries change between two * partial dumps, this node is scanned again completely, * see rt6_insert_exception() and fib6_dump_table(). * * Count expired entries we go through as handled * entries that we'll skip next time, in case of partial * node dump. Otherwise, if entries expire meanwhile, * we'll skip the wrong amount. 
*/ if (rt6_check_expired(rt6_ex->rt6i)) { w->count++; continue; } err = rt6_fill_node(dump->net, dump->skb, w->rt, &rt6_ex->rt6i->dst, NULL, NULL, 0, RTM_NEWROUTE, NETLINK_CB(dump->cb->skb).portid, dump->cb->nlh->nlmsg_seq, w->flags); if (err) return err; w->count++; } bucket++; } return 0; } /* Return -1 if done with node, number of handled routes on partial dump */ int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip) { struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; struct fib_dump_filter *filter = &arg->filter; unsigned int flags = NLM_F_MULTI; struct net *net = arg->net; int count = 0; if (rt == net->ipv6.fib6_null_entry) return -1; if ((filter->flags & RTM_F_PREFIX) && !(rt->fib6_flags & RTF_PREFIX_RT)) { /* success since this is not a prefix route */ return -1; } if (filter->filter_set && ((filter->rt_type && rt->fib6_type != filter->rt_type) || (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || (filter->protocol && rt->fib6_protocol != filter->protocol))) { return -1; } if (filter->filter_set || !filter->dump_routes || !filter->dump_exceptions) { flags |= NLM_F_DUMP_FILTERED; } if (filter->dump_routes) { if (skip) { skip--; } else { if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq, flags)) { return 0; } count++; } } if (filter->dump_exceptions) { struct fib6_nh_exception_dump_walker w = { .dump = arg, .rt = rt, .flags = flags, .skip = skip, .count = 0 }; int err; rcu_read_lock(); if (rt->nh) { err = nexthop_for_each_fib6_nh(rt->nh, rt6_nh_dump_exceptions, &w); } else { err = rt6_nh_dump_exceptions(rt->fib6_nh, &w); } rcu_read_unlock(); if (err) return count + w.count; } return -1; } static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct rtmsg *rtm; int i, err; rtm = nlmsg_payload(nlh, sizeof(*rtm)); if (!rtm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for get route request"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, extack); if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || rtm->rtm_type) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request"); return -EINVAL; } if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) { NL_SET_ERR_MSG_MOD(extack, "Invalid flags for get route request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, extack); if (err) return err; if ((tb[RTA_SRC] && !rtm->rtm_src_len) || (tb[RTA_DST] && !rtm->rtm_dst_len)) { NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6"); return -EINVAL; } if (tb[RTA_FLOWLABEL] && (nla_get_be32(tb[RTA_FLOWLABEL]) & ~IPV6_FLOWLABEL_MASK)) { NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL], "Invalid flow label"); return -EINVAL; } for (i = 0; i <= RTA_MAX; i++) { if (!tb[i]) continue; switch (i) { case RTA_SRC: case RTA_DST: case RTA_IIF: case RTA_OIF: case RTA_MARK: case RTA_UID: case RTA_SPORT: case RTA_DPORT: case RTA_IP_PROTO: case RTA_FLOWLABEL: break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request"); return -EINVAL; } } return 0; } static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr 
*tb[RTA_MAX+1]; int err, iif = 0, oif = 0; struct fib6_info *from; struct dst_entry *dst; struct rt6_info *rt; struct sk_buff *skb; struct rtmsg *rtm; struct flowi6 fl6 = {}; __be32 flowlabel; bool fibmatch; err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack); if (err < 0) goto errout; err = -EINVAL; rtm = nlmsg_data(nlh); fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); if (tb[RTA_SRC]) { if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) goto errout; fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); } if (tb[RTA_DST]) { if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) goto errout; fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); } if (tb[RTA_IIF]) iif = nla_get_u32(tb[RTA_IIF]); if (tb[RTA_OIF]) oif = nla_get_u32(tb[RTA_OIF]); if (tb[RTA_MARK]) fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); if (tb[RTA_UID]) fl6.flowi6_uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID])); else fl6.flowi6_uid = iif ? INVALID_UID : current_uid(); if (tb[RTA_SPORT]) fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]); if (tb[RTA_DPORT]) fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]); if (tb[RTA_IP_PROTO]) { err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], &fl6.flowi6_proto, AF_INET6, extack); if (err) goto errout; } flowlabel = nla_get_be32_default(tb[RTA_FLOWLABEL], 0); fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, flowlabel); if (iif) { struct net_device *dev; int flags = 0; rcu_read_lock(); dev = dev_get_by_index_rcu(net, iif); if (!dev) { rcu_read_unlock(); err = -ENODEV; goto errout; } fl6.flowi6_iif = iif; if (!ipv6_addr_any(&fl6.saddr)) flags |= RT6_LOOKUP_F_HAS_SADDR; dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); rcu_read_unlock(); } else { fl6.flowi6_oif = oif; dst = ip6_route_output(net, NULL, &fl6); } rt = dst_rt6_info(dst); if (rt->dst.error) { err = rt->dst.error; ip6_rt_put(rt); goto errout; } if (rt == net->ipv6.ip6_null_entry) { err = rt->dst.error; ip6_rt_put(rt); goto errout; } skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) { ip6_rt_put(rt); err = -ENOBUFS; goto errout; } skb_dst_set(skb, &rt->dst); rcu_read_lock(); from = rcu_dereference(rt->from); if (from) { if (fibmatch) err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); else err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, &fl6.saddr, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); } else { err = -ENETUNREACH; } rcu_read_unlock(); if (err < 0) { kfree_skb(skb); goto errout; } err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: return err; } void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int nlm_flags) { struct net *net = info->nl_net; struct sk_buff *skb; size_t sz; u32 seq; int err; err = -ENOBUFS; seq = info->nlh ? info->nlh->nlmsg_seq : 0; rcu_read_lock(); sz = rt6_nlmsg_size(rt); retry: skb = nlmsg_new(sz, GFP_ATOMIC); if (!skb) goto errout; err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, event, info->portid, seq, nlm_flags); if (err < 0) { kfree_skb(skb); /* -EMSGSIZE implies needed space grew under us. */ if (err == -EMSGSIZE) { sz = max(rt6_nlmsg_size(rt), sz << 1); goto retry; } goto errout; } rcu_read_unlock(); rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, info->nlh, GFP_ATOMIC); return; errout: rcu_read_unlock(); rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } void fib6_rt_update(struct net *net, struct fib6_info *rt, struct nl_info *info) { u32 seq = info->nlh ? 
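/* echo the requester's netlink sequence number; kernel-initiated updates use 0 */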
info->nlh->nlmsg_seq : 0; struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); if (!skb) goto errout; err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE); if (err < 0) { /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, info->nlh, gfp_any()); return; errout: rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, bool offload, bool trap, bool offload_failed) { u8 fib_notify_on_flag_change; struct sk_buff *skb; int err; if (READ_ONCE(f6i->offload) == offload && READ_ONCE(f6i->trap) == trap && READ_ONCE(f6i->offload_failed) == offload_failed) return; WRITE_ONCE(f6i->offload, offload); WRITE_ONCE(f6i->trap, trap); fib_notify_on_flag_change = READ_ONCE(net->ipv6.sysctl.fib_notify_on_flag_change); /* 2 means send notifications only if offload_failed was changed. */ if (fib_notify_on_flag_change == 2 && READ_ONCE(f6i->offload_failed) == offload_failed) return; WRITE_ONCE(f6i->offload_failed, offload_failed); if (!rcu_access_pointer(f6i->fib6_node)) /* The route was removed from the tree, do not send * notification. */ return; if (!fib_notify_on_flag_change) return; skb = nlmsg_new(rt6_nlmsg_size(f6i), GFP_KERNEL); if (!skb) { err = -ENOBUFS; goto errout; } err = rt6_fill_node(net, skb, f6i, NULL, NULL, NULL, 0, RTM_NEWROUTE, 0, 0, 0); if (err < 0) { /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ROUTE, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } EXPORT_SYMBOL(fib6_info_hw_flags_set); static int ip6_route_dev_notify(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); if (!(dev->flags & IFF_LOOPBACK)) return NOTIFY_OK; if (event == NETDEV_REGISTER) { net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev; net->ipv6.ip6_null_entry->dst.dev = dev; net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.ip6_prohibit_entry->dst.dev = dev; net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); net->ipv6.ip6_blk_hole_entry->dst.dev = dev; net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); #endif } else if (event == NETDEV_UNREGISTER && dev->reg_state != NETREG_UNREGISTERED) { /* NETDEV_UNREGISTER could be fired for multiple times by * netdev_wait_allrefs(). Make sure we only call this once. 
*/ in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); #endif } return NOTIFY_OK; } /* * /proc */ #ifdef CONFIG_PROC_FS static int rt6_stats_seq_show(struct seq_file *seq, void *v) { struct net *net = (struct net *)seq->private; seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", net->ipv6.rt6_stats->fib_nodes, net->ipv6.rt6_stats->fib_route_nodes, atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), net->ipv6.rt6_stats->fib_rt_entries, net->ipv6.rt6_stats->fib_rt_cache, dst_entries_get_slow(&net->ipv6.ip6_dst_ops), net->ipv6.rt6_stats->fib_discarded_routes); return 0; } #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SYSCTL static int ipv6_sysctl_rtcache_flush(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int delay; int ret; if (!write) return -EINVAL; ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (ret) return ret; net = (struct net *)ctl->extra1; delay = READ_ONCE(net->ipv6.sysctl.flush_delay); fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0); return 0; } static struct ctl_table ipv6_route_table_template[] = { { .procname = "max_size", .data = &init_net.ipv6.sysctl.ip6_rt_max_size, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "gc_thresh", .data = &ip6_dst_ops_template.gc_thresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "flush", .data = &init_net.ipv6.sysctl.flush_delay, .maxlen = sizeof(int), .mode = 0200, .proc_handler = ipv6_sysctl_rtcache_flush }, { .procname = "gc_min_interval", .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "gc_timeout", .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "gc_interval", .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "gc_elasticity", .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "mtu_expires", .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "min_adv_mss", .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "gc_min_interval_ms", .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, { .procname = "skip_notify_on_dev_down", .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) { struct ctl_table *table; table = kmemdup(ipv6_route_table_template, sizeof(ipv6_route_table_template), GFP_KERNEL); if (table) { table[0].data = &net->ipv6.sysctl.ip6_rt_max_size; table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; table[2].data = &net->ipv6.sysctl.flush_delay; table[2].extra1 = net; table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; table[5].data = 
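/* entries are indexed in the same order as ipv6_route_table_template above */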
&net->ipv6.sysctl.ip6_rt_gc_interval; table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down; } return table; } size_t ipv6_route_sysctl_table_size(struct net *net) { /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) return 1; return ARRAY_SIZE(ipv6_route_table_template); } #endif static int __net_init ip6_route_net_init(struct net *net) { int ret = -ENOMEM; memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, sizeof(net->ipv6.ip6_dst_ops)); if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) goto out_ip6_dst_ops; net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true); if (!net->ipv6.fib6_null_entry) goto out_ip6_dst_entries; memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template, sizeof(*net->ipv6.fib6_null_entry)); net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, sizeof(*net->ipv6.ip6_null_entry), GFP_KERNEL); if (!net->ipv6.ip6_null_entry) goto out_fib6_null_entry; net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_null_entry->dst, ip6_template_metrics, true); INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->dst.rt_uncached); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_has_custom_rules = false; net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, sizeof(*net->ipv6.ip6_prohibit_entry), GFP_KERNEL); if (!net->ipv6.ip6_prohibit_entry) goto out_ip6_null_entry; net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, ip6_template_metrics, true); INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->dst.rt_uncached); net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, sizeof(*net->ipv6.ip6_blk_hole_entry), GFP_KERNEL); if (!net->ipv6.ip6_blk_hole_entry) goto out_ip6_prohibit_entry; net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, ip6_template_metrics, true); INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->dst.rt_uncached); #ifdef CONFIG_IPV6_SUBTREES net->ipv6.fib6_routes_require_src = 0; #endif #endif net->ipv6.sysctl.flush_delay = 0; net->ipv6.sysctl.ip6_rt_max_size = INT_MAX; net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; net->ipv6.sysctl.skip_notify_on_dev_down = 0; atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ); ret = 0; out: return ret; #ifdef CONFIG_IPV6_MULTIPLE_TABLES out_ip6_prohibit_entry: kfree(net->ipv6.ip6_prohibit_entry); out_ip6_null_entry: kfree(net->ipv6.ip6_null_entry); #endif out_fib6_null_entry: kfree(net->ipv6.fib6_null_entry); out_ip6_dst_entries: dst_entries_destroy(&net->ipv6.ip6_dst_ops); out_ip6_dst_ops: goto out; } static void __net_exit ip6_route_net_exit(struct net *net) { kfree(net->ipv6.fib6_null_entry); kfree(net->ipv6.ip6_null_entry); #ifdef CONFIG_IPV6_MULTIPLE_TABLES kfree(net->ipv6.ip6_prohibit_entry); kfree(net->ipv6.ip6_blk_hole_entry); #endif dst_entries_destroy(&net->ipv6.ip6_dst_ops); } static int __net_init ip6_route_net_init_late(struct net *net) { #ifdef CONFIG_PROC_FS if (!proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops, 
sizeof(struct ipv6_route_iter))) return -ENOMEM; if (!proc_create_net_single("rt6_stats", 0444, net->proc_net, rt6_stats_seq_show, NULL)) { remove_proc_entry("ipv6_route", net->proc_net); return -ENOMEM; } #endif return 0; } static void __net_exit ip6_route_net_exit_late(struct net *net) { #ifdef CONFIG_PROC_FS remove_proc_entry("ipv6_route", net->proc_net); remove_proc_entry("rt6_stats", net->proc_net); #endif } static struct pernet_operations ip6_route_net_ops = { .init = ip6_route_net_init, .exit = ip6_route_net_exit, }; static int __net_init ipv6_inetpeer_init(struct net *net) { struct inet_peer_base *bp = kmalloc_obj(*bp); if (!bp) return -ENOMEM; inet_peer_base_init(bp); net->ipv6.peers = bp; return 0; } static void __net_exit ipv6_inetpeer_exit(struct net *net) { struct inet_peer_base *bp = net->ipv6.peers; net->ipv6.peers = NULL; inetpeer_invalidate_tree(bp); kfree(bp); } static struct pernet_operations ipv6_inetpeer_ops = { .init = ipv6_inetpeer_init, .exit = ipv6_inetpeer_exit, }; static struct pernet_operations ip6_route_net_late_ops = { .init = ip6_route_net_init_late, .exit = ip6_route_net_exit_late, }; static struct notifier_block ip6_route_dev_notifier = { .notifier_call = ip6_route_dev_notify, .priority = ADDRCONF_NOTIFY_PRIORITY - 10, }; void __init ip6_route_init_special_entries(void) { /* Registering of the loopback is done before this portion of code, * the loopback reference in rt6_info will not be taken, do it * manually for init_net */ init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #endif } #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt) BTF_ID_LIST_SINGLE(btf_fib6_info_id, struct, fib6_info) static const struct bpf_iter_seq_info ipv6_route_seq_info = { .seq_ops = &ipv6_route_seq_ops, .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct ipv6_route_iter), }; static struct bpf_iter_reg ipv6_route_reg_info = { .target = "ipv6_route", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__ipv6_route, rt), PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &ipv6_route_seq_info, }; static int __init bpf_iter_register(void) { ipv6_route_reg_info.ctx_arg_info[0].btf_id = *btf_fib6_info_id; return bpf_iter_reg_target(&ipv6_route_reg_info); } static void bpf_iter_unregister(void) { bpf_iter_unreg_target(&ipv6_route_reg_info); } #endif static const struct rtnl_msg_handler ip6_route_rtnl_msg_handlers[] __initconst_or_module = { {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWROUTE, .doit = inet6_rtm_newroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELROUTE, .doit = inet6_rtm_delroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE, .doit = inet6_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, }; int __init ip6_route_init(void) { int ret; int cpu; ret = -ENOMEM; ip6_dst_ops_template.kmem_cachep = 
kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!ip6_dst_ops_template.kmem_cachep) goto out; ret = dst_entries_init(&ip6_dst_blackhole_ops); if (ret) goto out_kmem_cache; ret = register_pernet_subsys(&ipv6_inetpeer_ops); if (ret) goto out_dst_entries; ret = register_pernet_subsys(&ip6_route_net_ops); if (ret) goto out_register_inetpeer; ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; ret = fib6_init(); if (ret) goto out_register_subsys; ret = xfrm6_init(); if (ret) goto out_fib6_init; ret = fib6_rules_init(); if (ret) goto xfrm6_init; ret = register_pernet_subsys(&ip6_route_net_late_ops); if (ret) goto fib6_rules_init; ret = rtnl_register_many(ip6_route_rtnl_msg_handlers); if (ret < 0) goto out_register_late_subsys; ret = register_netdevice_notifier(&ip6_route_dev_notifier); if (ret) goto out_register_late_subsys; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) ret = bpf_iter_register(); if (ret) goto out_register_late_subsys; #endif for_each_possible_cpu(cpu) { struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); INIT_LIST_HEAD(&ul->head); spin_lock_init(&ul->lock); } out: return ret; out_register_late_subsys: rtnl_unregister_all(PF_INET6); unregister_pernet_subsys(&ip6_route_net_late_ops); fib6_rules_init: fib6_rules_cleanup(); xfrm6_init: xfrm6_fini(); out_fib6_init: fib6_gc_cleanup(); out_register_subsys: unregister_pernet_subsys(&ip6_route_net_ops); out_register_inetpeer: unregister_pernet_subsys(&ipv6_inetpeer_ops); out_dst_entries: dst_entries_destroy(&ip6_dst_blackhole_ops); out_kmem_cache: kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); goto out; } void ip6_route_cleanup(void) { #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) bpf_iter_unregister(); #endif unregister_netdevice_notifier(&ip6_route_dev_notifier); unregister_pernet_subsys(&ip6_route_net_late_ops); fib6_rules_cleanup(); xfrm6_fini(); fib6_gc_cleanup(); unregister_pernet_subsys(&ipv6_inetpeer_ops); unregister_pernet_subsys(&ip6_route_net_ops); dst_entries_destroy(&ip6_dst_blackhole_ops); kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); } |
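/*
 * Illustrative sketch, not part of the kernel sources above or below: the
 * same per-network-namespace registration pattern used by ip6_route_net_ops
 * and ipv6_inetpeer_ops, reduced to a minimal subsystem.  Every "example_"
 * name is hypothetical.
 */
#include <linux/module.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

struct example_pernet_state {
	int counter;			/* private per-netns state */
};

static unsigned int example_net_id __read_mostly;

static int __net_init example_net_init(struct net *net)
{
	struct example_pernet_state *st = net_generic(net, example_net_id);

	st->counter = 0;		/* runs once for every new netns */
	return 0;
}

static void __net_exit example_net_exit(struct net *net)
{
	/* undo whatever example_net_init() set up for this netns */
}

static struct pernet_operations example_net_ops = {
	.init	= example_net_init,
	.exit	= example_net_exit,
	.id	= &example_net_id,
	.size	= sizeof(struct example_pernet_state),
};

static int __init example_module_init(void)
{
	/* same register/goto-unwind ordering idea as ip6_route_init() */
	return register_pernet_subsys(&example_net_ops);
}

static void __exit example_module_exit(void)
{
	unregister_pernet_subsys(&example_net_ops);
}

module_init(example_module_init);
module_exit(example_module_exit);
MODULE_DESCRIPTION("Hypothetical pernet registration sketch");
MODULE_LICENSE("GPL");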
/* SPDX-License-Identifier: GPL-2.0 */ /* * Mutexes: blocking mutual exclusion locks * * started by Ingo Molnar: * * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * * This file contains the main data structure and API definitions. */ #ifndef __LINUX_MUTEX_H #define __LINUX_MUTEX_H #include <asm/current.h> #include <linux/list.h> #include <linux/spinlock_types.h> #include <linux/lockdep.h> #include <linux/atomic.h> #include <asm/processor.h> #include <linux/osq_lock.h> #include <linux/debug_locks.h> #include <linux/cleanup.h> #include <linux/mutex_types.h> struct device; #ifdef CONFIG_DEBUG_LOCK_ALLOC # define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ , .dep_map = { \ .name = #lockname, \ .wait_type_inner = LD_WAIT_SLEEP, \ } #else # define __DEP_MAP_MUTEX_INITIALIZER(lockname) #endif #ifdef CONFIG_DEBUG_MUTEXES # define __DEBUG_MUTEX_INITIALIZER(lockname) \ , .magic = &lockname extern void mutex_destroy(struct mutex *lock); #else # define __DEBUG_MUTEX_INITIALIZER(lockname) static inline void mutex_destroy(struct mutex *lock) {} #endif /** * mutex_init - initialize the mutex * @mutex: the mutex to be initialized * * Initialize the mutex to unlocked state. * * It is not allowed to initialize an already locked mutex. */ #define mutex_init(mutex) \ do { \ static struct lock_class_key __key; \ \ __mutex_init((mutex), #mutex, &__key); \ } while (0) /** * mutex_init_with_key - initialize a mutex with a given lockdep key * @mutex: the mutex to be initialized * @key: the lockdep key to be associated with the mutex * * Initialize the mutex to the unlocked state. * * It is not allowed to initialize an already locked mutex.
*/ #define mutex_init_with_key(mutex, key) __mutex_init((mutex), #mutex, (key)) #ifndef CONFIG_PREEMPT_RT #define __MUTEX_INITIALIZER(lockname) \ { .owner = ATOMIC_LONG_INIT(0) \ , .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ , .first_waiter = NULL \ __DEBUG_MUTEX_INITIALIZER(lockname) \ __DEP_MAP_MUTEX_INITIALIZER(lockname) } #define DEFINE_MUTEX(mutexname) \ struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) #ifdef CONFIG_DEBUG_LOCK_ALLOC void mutex_init_lockdep(struct mutex *lock, const char *name, struct lock_class_key *key); static inline void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { mutex_init_lockdep(lock, name, key); } #else extern void mutex_init_generic(struct mutex *lock); static inline void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { mutex_init_generic(lock); } #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ /** * mutex_is_locked - is the mutex locked * @lock: the mutex to be queried * * Returns true if the mutex is locked, false if unlocked. */ extern bool mutex_is_locked(struct mutex *lock); #else /* !CONFIG_PREEMPT_RT */ /* * Preempt-RT variant based on rtmutexes. */ #define __MUTEX_INITIALIZER(mutexname) \ { \ .rtmutex = __RT_MUTEX_BASE_INITIALIZER(mutexname.rtmutex) \ __DEP_MAP_MUTEX_INITIALIZER(mutexname) \ } #define DEFINE_MUTEX(mutexname) \ struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) #define mutex_is_locked(l) rt_mutex_base_is_locked(&(l)->rtmutex) #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void mutex_rt_init_lockdep(struct mutex *mutex, const char *name, struct lock_class_key *key); static inline void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { mutex_rt_init_lockdep(lock, name, key); } #else extern void mutex_rt_init_generic(struct mutex *mutex); static inline void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { mutex_rt_init_generic(lock); } #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ #endif /* CONFIG_PREEMPT_RT */ #ifdef CONFIG_DEBUG_MUTEXES int __must_check __devm_mutex_init(struct device *dev, struct mutex *lock); #else static inline int __must_check __devm_mutex_init(struct device *dev, struct mutex *lock) { /* * When CONFIG_DEBUG_MUTEXES is off mutex_destroy() is just a nop so * no really need to register it in the devm subsystem. */ return 0; } #endif #define __mutex_init_ret(mutex) \ ({ \ typeof(mutex) mutex_ = (mutex); \ \ mutex_init(mutex_); \ mutex_; \ }) #define devm_mutex_init(dev, mutex) \ __devm_mutex_init(dev, __mutex_init_ret(mutex)) /* * See kernel/locking/mutex.c for detailed documentation of these APIs. * Also see Documentation/locking/mutex-design.rst. 
*/ #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass) __acquires(lock); extern void _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock) __acquires(lock); extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) __cond_acquires(0, lock); extern int __must_check _mutex_lock_killable(struct mutex *lock, unsigned int subclass, struct lockdep_map *nest_lock) __cond_acquires(0, lock); extern void mutex_lock_io_nested(struct mutex *lock, unsigned int subclass) __acquires(lock); #define mutex_lock(lock) mutex_lock_nested(lock, 0) #define mutex_lock_interruptible(lock) mutex_lock_interruptible_nested(lock, 0) #define mutex_lock_killable(lock) _mutex_lock_killable(lock, 0, NULL) #define mutex_lock_io(lock) mutex_lock_io_nested(lock, 0) #define mutex_lock_nest_lock(lock, nest_lock) \ do { \ typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \ } while (0) #define mutex_lock_killable_nest_lock(lock, nest_lock) \ ( \ typecheck(struct lockdep_map *, &(nest_lock)->dep_map), \ _mutex_lock_killable(lock, 0, &(nest_lock)->dep_map) \ ) #define mutex_lock_killable_nested(lock, subclass) \ _mutex_lock_killable(lock, subclass, NULL) #else extern void mutex_lock(struct mutex *lock) __acquires(lock); extern int __must_check mutex_lock_interruptible(struct mutex *lock) __cond_acquires(0, lock); extern int __must_check mutex_lock_killable(struct mutex *lock) __cond_acquires(0, lock); extern void mutex_lock_io(struct mutex *lock) __acquires(lock); # define mutex_lock_nested(lock, subclass) mutex_lock(lock) # define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock) # define mutex_lock_killable_nested(lock, subclass) mutex_lock_killable(lock) # define mutex_lock_killable_nest_lock(lock, nest_lock) mutex_lock_killable(lock) # define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock) # define mutex_lock_io_nested(lock, subclass) mutex_lock_io(lock) #endif /* * NOTE: mutex_trylock() follows the spin_trylock() convention, * not the down_trylock() convention! * * Returns 1 if the mutex has been acquired successfully, and 0 on contention. 
*/ #ifdef CONFIG_DEBUG_LOCK_ALLOC extern int _mutex_trylock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock) __cond_acquires(true, lock); #define mutex_trylock_nest_lock(lock, nest_lock) \ ( \ typecheck(struct lockdep_map *, &(nest_lock)->dep_map), \ _mutex_trylock_nest_lock(lock, &(nest_lock)->dep_map) \ ) #define mutex_trylock(lock) _mutex_trylock_nest_lock(lock, NULL) #else extern int mutex_trylock(struct mutex *lock) __cond_acquires(true, lock); #define mutex_trylock_nest_lock(lock, nest_lock) mutex_trylock(lock) #endif extern void mutex_unlock(struct mutex *lock) __releases(lock); extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) __cond_acquires(true, lock); DEFINE_LOCK_GUARD_1(mutex, struct mutex, mutex_lock(_T->lock), mutex_unlock(_T->lock)) DEFINE_LOCK_GUARD_1_COND(mutex, _try, mutex_trylock(_T->lock)) DEFINE_LOCK_GUARD_1_COND(mutex, _intr, mutex_lock_interruptible(_T->lock), _RET == 0) DEFINE_LOCK_GUARD_1_COND(mutex, _kill, mutex_lock_killable(_T->lock), _RET == 0) DEFINE_LOCK_GUARD_1(mutex_init, struct mutex, mutex_init(_T->lock), /* */) DECLARE_LOCK_GUARD_1_ATTRS(mutex, __acquires(_T), __releases(*(struct mutex **)_T)) #define class_mutex_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(mutex, _T) DECLARE_LOCK_GUARD_1_ATTRS(mutex_try, __acquires(_T), __releases(*(struct mutex **)_T)) #define class_mutex_try_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(mutex_try, _T) DECLARE_LOCK_GUARD_1_ATTRS(mutex_intr, __acquires(_T), __releases(*(struct mutex **)_T)) #define class_mutex_intr_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(mutex_intr, _T) DECLARE_LOCK_GUARD_1_ATTRS(mutex_kill, __acquires(_T), __releases(*(struct mutex **)_T)) #define class_mutex_kill_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(mutex_kill, _T) DECLARE_LOCK_GUARD_1_ATTRS(mutex_init, __acquires(_T), __releases(*(struct mutex **)_T)) #define class_mutex_init_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(mutex_init, _T) extern unsigned long mutex_get_owner(struct mutex *lock); #endif /* __LINUX_MUTEX_H */ |
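/*
 * Illustrative sketch, not part of the header above: basic use of the API it
 * declares.  Note the spin_trylock()-style return convention of
 * mutex_trylock() and the scoped guard built on <linux/cleanup.h>.  The
 * "example_" names are hypothetical.
 */
#include <linux/mutex.h>

static DEFINE_MUTEX(example_lock);	/* statically initialized, unlocked */
static int example_count;

static void example_update(void)
{
	mutex_lock(&example_lock);	/* may sleep; never from atomic context */
	example_count++;
	mutex_unlock(&example_lock);
}

static bool example_try_update(void)
{
	/* returns 1 on success, 0 on contention (spin_trylock convention) */
	if (!mutex_trylock(&example_lock))
		return false;
	example_count++;
	mutex_unlock(&example_lock);
	return true;
}

static void example_guarded_update(void)
{
	guard(mutex)(&example_lock);	/* dropped automatically at scope exit */
	example_count++;
}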
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __VDSO_MATH64_H
#define __VDSO_MATH64_H

static __always_inline u32
__iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
{
	u32 ret = 0;

	while (dividend >= divisor) {
		/* The following asm() prevents the compiler from
		   optimising this loop into a modulo operation.  */
		asm("" : "+rm"(dividend));

		dividend -= divisor;
		ret++;
	}

	*remainder = dividend;

	return ret;
}

#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)

#ifndef mul_u64_u32_add_u64_shr
static __always_inline u64 mul_u64_u32_add_u64_shr(u64 a, u32 mul, u64 b, unsigned int shift)
{
	return (u64)((((unsigned __int128)a * mul) + b) >> shift);
}
#endif /* mul_u64_u32_add_u64_shr */

#else

#ifndef mul_u64_u32_add_u64_shr
#ifndef mul_u32_u32
static inline u64 mul_u32_u32(u32 a, u32 b)
{
	return (u64)a * b;
}
#define mul_u32_u32 mul_u32_u32
#endif
static __always_inline u64 mul_u64_u32_add_u64_shr(u64 a, u32 mul, u64 b, unsigned int shift)
{
	u32 ah = a >> 32, al = a;
	bool ovf;
	u64 ret;

	ovf = __builtin_add_overflow(mul_u32_u32(al, mul), b, &ret);
	ret >>= shift;
	if (ovf && shift)
		ret += 1ULL << (64 - shift);
	if (ah)
		ret += mul_u32_u32(ah, mul) << (32 - shift);

	return ret;
}
#endif /* mul_u64_u32_add_u64_shr */

#endif

#endif /* __VDSO_MATH64_H */
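/*
 * Illustrative, userspace-compilable sketch, not part of the header above:
 * the same divide-by-repeated-subtraction contract as __iter_div_u64_rem()
 * (quotient returned, remainder through the pointer), cross-checked against
 * plain / and %.  It only shows why the helper is cheap when the quotient is
 * small, e.g. when splitting nanoseconds into seconds.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t iter_div_u64_rem(uint64_t dividend, uint32_t divisor, uint64_t *remainder)
{
	uint32_t ret = 0;

	while (dividend >= divisor) {	/* few iterations when the quotient is small */
		dividend -= divisor;
		ret++;
	}
	*remainder = dividend;
	return ret;
}

int main(void)
{
	uint64_t ns = 3999999999ULL;	/* e.g. a timestamp in nanoseconds */
	uint64_t rem;
	uint32_t sec = iter_div_u64_rem(ns, 1000000000U, &rem);

	assert(sec == ns / 1000000000U);
	assert(rem == ns % 1000000000U);
	printf("%u s + %llu ns\n", sec, (unsigned long long)rem);
	return 0;
}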
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 Nicira, Inc.
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/in6.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/netfilter_ipv4.h> #include <linux/etherdevice.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/rculist.h> #include <linux/err.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/protocol.h> #include <net/ip_tunnels.h> #include <net/arp.h> #include <net/checksum.h> #include <net/dsfield.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/netdev_lock.h> #include <net/rtnetlink.h> #include <net/udp.h> #include <net/dst_metadata.h> #include <net/inet_dscp.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #endif static unsigned int ip_tunnel_hash(__be32 key, __be32 remote) { return hash_32((__force u32)key ^ (__force u32)remote, IP_TNL_HASH_BITS); } static bool ip_tunnel_key_match(const struct ip_tunnel_parm_kern *p, const unsigned long *flags, __be32 key) { if (!test_bit(IP_TUNNEL_KEY_BIT, flags)) return !test_bit(IP_TUNNEL_KEY_BIT, p->i_flags); return test_bit(IP_TUNNEL_KEY_BIT, p->i_flags) && p->i_key == key; } /* Fallback tunnel: no source, no destination, no key, no options Tunnel hash table: We require exact key match i.e. if a key is present in packet it will match only tunnel with the same key; if it is not present, it will match only keyless tunnel. All keysless packets, if not matched configured keyless tunnels will match fallback tunnel. Given src, dst and key, find appropriate for input tunnel. 
*/ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, int link, const unsigned long *flags, __be32 remote, __be32 local, __be32 key) { struct ip_tunnel *t, *cand = NULL; struct hlist_head *head; struct net_device *ndev; unsigned int hash; hash = ip_tunnel_hash(key, remote); head = &itn->tunnels[hash]; hlist_for_each_entry_rcu(t, head, hash_node) { if (local != t->parms.iph.saddr || remote != t->parms.iph.daddr || !(t->dev->flags & IFF_UP)) continue; if (!ip_tunnel_key_match(&t->parms, flags, key)) continue; if (READ_ONCE(t->parms.link) == link) return t; cand = t; } hlist_for_each_entry_rcu(t, head, hash_node) { if (remote != t->parms.iph.daddr || t->parms.iph.saddr != 0 || !(t->dev->flags & IFF_UP)) continue; if (!ip_tunnel_key_match(&t->parms, flags, key)) continue; if (READ_ONCE(t->parms.link) == link) return t; if (!cand) cand = t; } hash = ip_tunnel_hash(key, 0); head = &itn->tunnels[hash]; hlist_for_each_entry_rcu(t, head, hash_node) { if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) && (local != t->parms.iph.daddr || !ipv4_is_multicast(local))) continue; if (!(t->dev->flags & IFF_UP)) continue; if (!ip_tunnel_key_match(&t->parms, flags, key)) continue; if (READ_ONCE(t->parms.link) == link) return t; if (!cand) cand = t; } hlist_for_each_entry_rcu(t, head, hash_node) { if ((!test_bit(IP_TUNNEL_NO_KEY_BIT, flags) && t->parms.i_key != key) || t->parms.iph.saddr != 0 || t->parms.iph.daddr != 0 || !(t->dev->flags & IFF_UP)) continue; if (READ_ONCE(t->parms.link) == link) return t; if (!cand) cand = t; } if (cand) return cand; t = rcu_dereference(itn->collect_md_tun); if (t && t->dev->flags & IFF_UP) return t; ndev = READ_ONCE(itn->fb_tunnel_dev); if (ndev && ndev->flags & IFF_UP) return netdev_priv(ndev); return NULL; } EXPORT_SYMBOL_GPL(ip_tunnel_lookup); static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn, struct ip_tunnel_parm_kern *parms) { unsigned int h; __be32 remote; __be32 i_key = parms->i_key; if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr)) remote = parms->iph.daddr; else remote = 0; if (!test_bit(IP_TUNNEL_KEY_BIT, parms->i_flags) && test_bit(IP_TUNNEL_VTI_BIT, parms->i_flags)) i_key = 0; h = ip_tunnel_hash(i_key, remote); return &itn->tunnels[h]; } static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t) { struct hlist_head *head = ip_bucket(itn, &t->parms); if (t->collect_md) rcu_assign_pointer(itn->collect_md_tun, t); hlist_add_head_rcu(&t->hash_node, head); } static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t) { if (t->collect_md) rcu_assign_pointer(itn->collect_md_tun, NULL); hlist_del_init_rcu(&t->hash_node); } static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn, struct ip_tunnel_parm_kern *parms, int type) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; IP_TUNNEL_DECLARE_FLAGS(flags); __be32 key = parms->i_key; int link = parms->link; struct ip_tunnel *t = NULL; struct hlist_head *head = ip_bucket(itn, parms); ip_tunnel_flags_copy(flags, parms->i_flags); hlist_for_each_entry_rcu(t, head, hash_node, lockdep_rtnl_is_held()) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && link == READ_ONCE(t->parms.link) && type == t->dev->type && ip_tunnel_key_match(&t->parms, flags, key)) break; } return t; } static struct net_device *__ip_tunnel_create(struct net *net, const struct rtnl_link_ops *ops, struct ip_tunnel_parm_kern *parms) { int err; struct ip_tunnel *tunnel; struct net_device *dev; char name[IFNAMSIZ]; err = -E2BIG; if 
(parms->name[0]) { if (!dev_valid_name(parms->name)) goto failed; strscpy(name, parms->name); } else { if (strlen(ops->kind) > (IFNAMSIZ - 3)) goto failed; strscpy(name, ops->kind); strcat(name, "%d"); } ASSERT_RTNL(); dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup); if (!dev) { err = -ENOMEM; goto failed; } dev_net_set(dev, net); dev->rtnl_link_ops = ops; tunnel = netdev_priv(dev); tunnel->parms = *parms; tunnel->net = net; err = register_netdevice(dev); if (err) goto failed_free; return dev; failed_free: free_netdev(dev); failed: return ERR_PTR(err); } static int ip_tunnel_bind_dev(struct net_device *dev) { struct net_device *tdev = NULL; struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *iph; int hlen = LL_MAX_HEADER; int mtu = ETH_DATA_LEN; int t_hlen = tunnel->hlen + sizeof(struct iphdr); iph = &tunnel->parms.iph; /* Guess output device to choose reasonable mtu and needed_headroom */ if (iph->daddr) { struct flowi4 fl4; struct rtable *rt; ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr, iph->saddr, tunnel->parms.o_key, iph->tos & INET_DSCP_MASK, tunnel->net, tunnel->parms.link, tunnel->fwmark, 0, 0); rt = ip_route_output_key(tunnel->net, &fl4); if (!IS_ERR(rt)) { tdev = rt->dst.dev; ip_rt_put(rt); } if (dev->type != ARPHRD_ETHER) dev->flags |= IFF_POINTOPOINT; dst_cache_reset(&tunnel->dst_cache); } if (!tdev && tunnel->parms.link) tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link); if (tdev) { hlen = tdev->hard_header_len + tdev->needed_headroom; mtu = min(tdev->mtu, IP_MAX_MTU); } dev->needed_headroom = t_hlen + hlen; mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0); if (mtu < IPV4_MIN_MTU) mtu = IPV4_MIN_MTU; return mtu; } static struct ip_tunnel *ip_tunnel_create(struct net *net, struct ip_tunnel_net *itn, struct ip_tunnel_parm_kern *parms) { struct ip_tunnel *nt; struct net_device *dev; int t_hlen; int mtu; int err; dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms); if (IS_ERR(dev)) return ERR_CAST(dev); mtu = ip_tunnel_bind_dev(dev); err = dev_set_mtu(dev, mtu); if (err) goto err_dev_set_mtu; nt = netdev_priv(dev); t_hlen = nt->hlen + sizeof(struct iphdr); dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = IP_MAX_MTU - t_hlen; if (dev->type == ARPHRD_ETHER) dev->max_mtu -= dev->hard_header_len; ip_tunnel_add(itn, nt); return nt; err_dev_set_mtu: unregister_netdevice(dev); return ERR_PTR(err); } void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info) { const struct iphdr *iph = ip_hdr(skb); const struct udphdr *udph; if (iph->protocol != IPPROTO_UDP) return; udph = (struct udphdr *)((__u8 *)iph + (iph->ihl << 2)); info->encap.sport = udph->source; info->encap.dport = udph->dest; } EXPORT_SYMBOL(ip_tunnel_md_udp_encap); int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_error) { const struct iphdr *iph = ip_hdr(skb); int nh, err; #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(iph->daddr)) { DEV_STATS_INC(tunnel->dev, multicast); skb->pkt_type = PACKET_BROADCAST; } #endif if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) != test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) { DEV_STATS_INC(tunnel->dev, rx_crc_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) { if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) || (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { DEV_STATS_INC(tunnel->dev, rx_fifo_errors); 
DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } tunnel->i_seqno = ntohl(tpi->seq) + 1; } /* Save offset of outer header relative to skb->head, * because we are going to reset the network header to the inner header * and might change skb->head. */ nh = skb_network_header(skb) - skb->head; skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0); if (!pskb_inet_may_pull(skb)) { DEV_STATS_INC(tunnel->dev, rx_length_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } iph = (struct iphdr *)(skb->head + nh); err = IP_ECN_decapsulate(iph, skb); if (unlikely(err)) { if (log_ecn_error) net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", &iph->saddr, iph->tos); if (err > 1) { DEV_STATS_INC(tunnel->dev, rx_frame_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } } dev_sw_netstats_rx_add(tunnel->dev, skb->len); skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); if (tunnel->dev->type == ARPHRD_ETHER) { skb->protocol = eth_type_trans(skb, tunnel->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } else { skb->dev = tunnel->dev; } if (tun_dst) skb_dst_set(skb, (struct dst_entry *)tun_dst); gro_cells_receive(&tunnel->gro_cells, skb); return 0; drop: if (tun_dst) dst_release((struct dst_entry *)tun_dst); kfree_skb(skb); return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_rcv); int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops, unsigned int num) { if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; return !cmpxchg((const struct ip_tunnel_encap_ops **) &iptun_encaps[num], NULL, ops) ? 0 : -1; } EXPORT_SYMBOL(ip_tunnel_encap_add_ops); int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops, unsigned int num) { int ret; if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; ret = (cmpxchg((const struct ip_tunnel_encap_ops **) &iptun_encaps[num], ops, NULL) == ops) ? 0 : -1; synchronize_net(); return ret; } EXPORT_SYMBOL(ip_tunnel_encap_del_ops); int ip_tunnel_encap_setup(struct ip_tunnel *t, struct ip_tunnel_encap *ipencap) { int hlen; memset(&t->encap, 0, sizeof(t->encap)); hlen = ip_encap_hlen(ipencap); if (hlen < 0) return hlen; t->encap.type = ipencap->type; t->encap.sport = ipencap->sport; t->encap.dport = ipencap->dport; t->encap.flags = ipencap->flags; t->encap_hlen = hlen; t->hlen = t->encap_hlen + t->tun_hlen; return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup); static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, struct rtable *rt, __be16 df, const struct iphdr *inner_iph, int tunnel_hlen, __be32 dst, bool md) { struct ip_tunnel *tunnel = netdev_priv(dev); int pkt_size; int mtu; tunnel_hlen = md ? tunnel_hlen : tunnel->hlen; pkt_size = skb->len - tunnel_hlen; pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0; if (df) { mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen); mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0; } else { mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; } if (skb_valid_dst(skb)) skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IP)) { if (!skb_is_gso(skb) && (inner_iph->frag_off & htons(IP_DF)) && mtu < pkt_size) { icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); return -E2BIG; } } #if IS_ENABLED(CONFIG_IPV6) else if (skb->protocol == htons(ETH_P_IPV6)) { struct rt6_info *rt6; __be32 daddr; rt6 = skb_valid_dst(skb) ? dst_rt6_info(skb_dst(skb)) : NULL; daddr = md ? 
dst : tunnel->parms.iph.daddr; if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) { if ((daddr && !ipv4_is_multicast(daddr)) || rt6->rt6i_dst.plen == 128) { rt6->rt6i_flags |= RTF_MODIFIED; dst_metric_set(skb_dst(skb), RTAX_MTU, mtu); } } if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU && mtu < pkt_size) { icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); return -E2BIG; } } #endif return 0; } void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto, int tunnel_hlen) { struct ip_tunnel *tunnel = netdev_priv(dev); u32 headroom = sizeof(struct iphdr); struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; const struct iphdr *inner_iph; struct rtable *rt = NULL; struct flowi4 fl4; __be16 df = 0; u8 tos, ttl; bool use_cache; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || ip_tunnel_info_af(tun_info) != AF_INET)) goto tx_error; key = &tun_info->key; memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); inner_iph = (const struct iphdr *)skb_inner_network_header(skb); tos = key->tos; if (tos == 1) { if (skb->protocol == htons(ETH_P_IP)) tos = inner_iph->tos; else if (skb->protocol == htons(ETH_P_IPV6)) tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); } ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, tunnel_id_to_key32(key->tun_id), tos & INET_DSCP_MASK, tunnel->net, 0, skb->mark, skb_get_hash(skb), key->flow_flags); if (!tunnel_hlen) tunnel_hlen = ip_encap_hlen(&tun_info->encap); if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0) goto tx_error; use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); if (use_cache) rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr); if (!rt) { rt = ip_route_output_key(tunnel->net, &fl4); if (IS_ERR(rt)) { DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error; } if (use_cache) dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, fl4.saddr); } if (rt->dst.dev == dev) { ip_rt_put(rt); DEV_STATS_INC(dev, collisions); goto tx_error; } if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags)) df = htons(IP_DF); if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen, key->u.ipv4.dst, true)) { ip_rt_put(rt); goto tx_error; } tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); ttl = key->ttl; if (ttl == 0) { if (skb->protocol == htons(ETH_P_IP)) ttl = inner_iph->ttl; else if (skb->protocol == htons(ETH_P_IPV6)) ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit; else ttl = ip4_dst_hoplimit(&rt->dst); } headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; if (skb_cow_head(skb, headroom)) { ip_rt_put(rt); goto tx_dropped; } ip_tunnel_adj_headroom(dev, headroom); iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)), 0); return; tx_error: DEV_STATS_INC(dev, tx_errors); goto kfree; tx_dropped: DEV_STATS_INC(dev, tx_dropped); kfree: kfree_skb(skb); } EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, u8 protocol) { struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_info *tun_info = NULL; const struct iphdr *inner_iph; unsigned int max_headroom; /* The extra header space needed */ struct rtable *rt = NULL; /* Route to the other host */ __be16 payload_protocol; bool use_cache = false; struct flowi4 fl4; bool md = false; bool connected; u8 tos, ttl; __be32 dst; __be16 df; inner_iph = (const struct iphdr *)skb_inner_network_header(skb); connected = (tunnel->parms.iph.daddr != 0); 
payload_protocol = skb_protocol(skb, true); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); dst = tnl_params->daddr; if (dst == 0) { /* NBMA tunnel */ if (!skb_dst(skb)) { DEV_STATS_INC(dev, tx_fifo_errors); goto tx_error; } tun_info = skb_tunnel_info(skb); if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) && ip_tunnel_info_af(tun_info) == AF_INET && tun_info->key.u.ipv4.dst) { dst = tun_info->key.u.ipv4.dst; md = true; connected = true; } else if (payload_protocol == htons(ETH_P_IP)) { rt = skb_rtable(skb); dst = rt_nexthop(rt, inner_iph->daddr); } #if IS_ENABLED(CONFIG_IPV6) else if (payload_protocol == htons(ETH_P_IPV6)) { const struct in6_addr *addr6; struct neighbour *neigh; bool do_tx_error_icmp; int addr_type; neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr); if (!neigh) goto tx_error; addr6 = (const struct in6_addr *)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if (addr_type == IPV6_ADDR_ANY) { addr6 = &ipv6_hdr(skb)->daddr; addr_type = ipv6_addr_type(addr6); } if ((addr_type & IPV6_ADDR_COMPATv4) == 0) do_tx_error_icmp = true; else { do_tx_error_icmp = false; dst = addr6->s6_addr32[3]; } neigh_release(neigh); if (do_tx_error_icmp) goto tx_error_icmp; } #endif else goto tx_error; if (!md) connected = false; } tos = tnl_params->tos; if (tos & 0x1) { tos &= ~0x1; if (payload_protocol == htons(ETH_P_IP)) { tos = inner_iph->tos; connected = false; } else if (payload_protocol == htons(ETH_P_IPV6)) { tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); connected = false; } } ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr, tunnel->parms.o_key, tos & INET_DSCP_MASK, tunnel->net, READ_ONCE(tunnel->parms.link), tunnel->fwmark, skb_get_hash(skb), 0); if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0) goto tx_error; if (connected && md) { use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); if (use_cache) rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr); } else { rt = connected ? 
dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) : NULL; } if (!rt) { rt = ip_route_output_key(tunnel->net, &fl4); if (IS_ERR(rt)) { DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error; } if (use_cache) dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, fl4.saddr); else if (!md && connected) dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, fl4.saddr); } if (rt->dst.dev == dev) { ip_rt_put(rt); DEV_STATS_INC(dev, collisions); goto tx_error; } df = tnl_params->frag_off; if (payload_protocol == htons(ETH_P_IP) && !tunnel->ignore_df) df |= (inner_iph->frag_off & htons(IP_DF)); if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) { ip_rt_put(rt); goto tx_error; } if (tunnel->err_count > 0) { if (time_before(jiffies, tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { tunnel->err_count--; dst_link_failure(skb); } else tunnel->err_count = 0; } tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); ttl = tnl_params->ttl; if (ttl == 0) { if (payload_protocol == htons(ETH_P_IP)) ttl = inner_iph->ttl; #if IS_ENABLED(CONFIG_IPV6) else if (payload_protocol == htons(ETH_P_IPV6)) ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit; #endif else ttl = ip4_dst_hoplimit(&rt->dst); } max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) + rt->dst.header_len + ip_encap_hlen(&tunnel->encap); if (skb_cow_head(skb, max_headroom)) { ip_rt_put(rt); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return; } ip_tunnel_adj_headroom(dev, max_headroom); iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)), 0); return; #if IS_ENABLED(CONFIG_IPV6) tx_error_icmp: dst_link_failure(skb); #endif tx_error: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); } EXPORT_SYMBOL_GPL(ip_tunnel_xmit); static void ip_tunnel_update(struct ip_tunnel_net *itn, struct ip_tunnel *t, struct net_device *dev, struct ip_tunnel_parm_kern *p, bool set_mtu, __u32 fwmark) { ip_tunnel_del(itn, t); t->parms.iph.saddr = p->iph.saddr; t->parms.iph.daddr = p->iph.daddr; t->parms.i_key = p->i_key; t->parms.o_key = p->o_key; if (dev->type != ARPHRD_ETHER) { __dev_addr_set(dev, &p->iph.saddr, 4); memcpy(dev->broadcast, &p->iph.daddr, 4); } ip_tunnel_add(itn, t); t->parms.iph.ttl = p->iph.ttl; t->parms.iph.tos = p->iph.tos; t->parms.iph.frag_off = p->iph.frag_off; if (t->parms.link != p->link || t->fwmark != fwmark) { int mtu; WRITE_ONCE(t->parms.link, p->link); t->fwmark = fwmark; mtu = ip_tunnel_bind_dev(dev); if (set_mtu) WRITE_ONCE(dev->mtu, mtu); } dst_cache_reset(&t->dst_cache); netdev_state_change(dev); } int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) { int err = 0; struct ip_tunnel *t = netdev_priv(dev); struct net *net = t->net; struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id); switch (cmd) { case SIOCGETTUNNEL: if (dev == itn->fb_tunnel_dev) { t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); if (!t) t = netdev_priv(dev); } memcpy(p, &t->parms, sizeof(*p)); break; case SIOCADDTUNNEL: case SIOCCHGTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto done; if (p->iph.ttl) p->iph.frag_off |= htons(IP_DF); if (!test_bit(IP_TUNNEL_VTI_BIT, p->i_flags)) { if (!test_bit(IP_TUNNEL_KEY_BIT, p->i_flags)) p->i_key = 0; if (!test_bit(IP_TUNNEL_KEY_BIT, p->o_flags)) p->o_key = 0; } t = ip_tunnel_find(itn, p, itn->type); if (cmd == SIOCADDTUNNEL) { if (!t) { t = ip_tunnel_create(net, itn, p); err = PTR_ERR_OR_ZERO(t); break; } err = -EEXIST; break; } if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { if (t) { if (t->dev != 
dev) { err = -EEXIST; break; } } else { unsigned int nflags = 0; if (ipv4_is_multicast(p->iph.daddr)) nflags = IFF_BROADCAST; else if (p->iph.daddr) nflags = IFF_POINTOPOINT; if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) { err = -EINVAL; break; } t = netdev_priv(dev); } } if (t) { err = 0; ip_tunnel_update(itn, t, dev, p, true, 0); } else { err = -ENOENT; } break; case SIOCDELTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto done; if (dev == itn->fb_tunnel_dev) { err = -ENOENT; t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); if (!t) goto done; err = -EPERM; if (t == netdev_priv(itn->fb_tunnel_dev)) goto done; dev = t->dev; } unregister_netdevice(dev); err = 0; break; default: err = -EINVAL; } done: return err; } EXPORT_SYMBOL_GPL(ip_tunnel_ctl); bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp, const void __user *data) { struct ip_tunnel_parm p; if (copy_from_user(&p, data, sizeof(p))) return false; strscpy(kp->name, p.name); kp->link = p.link; ip_tunnel_flags_from_be16(kp->i_flags, p.i_flags); ip_tunnel_flags_from_be16(kp->o_flags, p.o_flags); kp->i_key = p.i_key; kp->o_key = p.o_key; memcpy(&kp->iph, &p.iph, min(sizeof(kp->iph), sizeof(p.iph))); return true; } EXPORT_SYMBOL_GPL(ip_tunnel_parm_from_user); bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp) { struct ip_tunnel_parm p; if (!ip_tunnel_flags_is_be16_compat(kp->i_flags) || !ip_tunnel_flags_is_be16_compat(kp->o_flags)) return false; memset(&p, 0, sizeof(p)); strscpy(p.name, kp->name); p.link = kp->link; p.i_flags = ip_tunnel_flags_to_be16(kp->i_flags); p.o_flags = ip_tunnel_flags_to_be16(kp->o_flags); p.i_key = kp->i_key; p.o_key = kp->o_key; memcpy(&p.iph, &kp->iph, min(sizeof(p.iph), sizeof(kp->iph))); return !copy_to_user(data, &p, sizeof(p)); } EXPORT_SYMBOL_GPL(ip_tunnel_parm_to_user); int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { struct ip_tunnel_parm_kern p; int err; if (!ip_tunnel_parm_from_user(&p, data)) return -EFAULT; err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd); if (!err && !ip_tunnel_parm_to_user(data, &p)) return -EFAULT; return err; } EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict) { struct ip_tunnel *tunnel = netdev_priv(dev); int t_hlen = tunnel->hlen + sizeof(struct iphdr); int max_mtu = IP_MAX_MTU - t_hlen; if (dev->type == ARPHRD_ETHER) max_mtu -= dev->hard_header_len; if (new_mtu < ETH_MIN_MTU) return -EINVAL; if (new_mtu > max_mtu) { if (strict) return -EINVAL; new_mtu = max_mtu; } WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu); int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu) { return __ip_tunnel_change_mtu(dev, new_mtu, true); } EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu); static void ip_tunnel_dev_free(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); gro_cells_destroy(&tunnel->gro_cells); dst_cache_destroy(&tunnel->dst_cache); } void ip_tunnel_dellink(struct net_device *dev, struct list_head *head) { struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_net *itn; itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id); if (itn->fb_tunnel_dev != dev) { ip_tunnel_del(itn, netdev_priv(dev)); unregister_netdevice_queue(dev, head); } } EXPORT_SYMBOL_GPL(ip_tunnel_dellink); struct net *ip_tunnel_get_link_net(const struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); return READ_ONCE(tunnel->net); } 
EXPORT_SYMBOL(ip_tunnel_get_link_net); int ip_tunnel_get_iflink(const struct net_device *dev) { const struct ip_tunnel *tunnel = netdev_priv(dev); return READ_ONCE(tunnel->parms.link); } EXPORT_SYMBOL(ip_tunnel_get_iflink); int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname) { struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id); struct ip_tunnel_parm_kern parms; unsigned int i; itn->rtnl_link_ops = ops; for (i = 0; i < IP_TNL_HASH_SIZE; i++) INIT_HLIST_HEAD(&itn->tunnels[i]); if (!ops || !net_has_fallback_tunnels(net)) { struct ip_tunnel_net *it_init_net; it_init_net = net_generic(&init_net, ip_tnl_net_id); itn->type = it_init_net->type; itn->fb_tunnel_dev = NULL; return 0; } memset(&parms, 0, sizeof(parms)); if (devname) strscpy(parms.name, devname, IFNAMSIZ); rtnl_lock(); itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms); /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ if (!IS_ERR(itn->fb_tunnel_dev)) { itn->fb_tunnel_dev->netns_immutable = true; itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev); ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev)); itn->type = itn->fb_tunnel_dev->type; } rtnl_unlock(); return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev); } EXPORT_SYMBOL_GPL(ip_tunnel_init_net); void ip_tunnel_delete_net(struct net *net, unsigned int id, struct rtnl_link_ops *ops, struct list_head *head) { struct ip_tunnel_net *itn = net_generic(net, id); struct net_device *dev, *aux; int h; ASSERT_RTNL_NET(net); for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == ops) unregister_netdevice_queue(dev, head); for (h = 0; h < IP_TNL_HASH_SIZE; h++) { struct ip_tunnel *t; struct hlist_node *n; struct hlist_head *thead = &itn->tunnels[h]; hlist_for_each_entry_safe(t, n, thead, hash_node) /* If dev is in the same netns, it has already * been added to the list by the previous loop. 
*/ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, head); } } EXPORT_SYMBOL_GPL(ip_tunnel_delete_net); int ip_tunnel_newlink(struct net *net, struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm_kern *p, __u32 fwmark) { struct ip_tunnel *nt; struct ip_tunnel_net *itn; int mtu; int err; nt = netdev_priv(dev); itn = net_generic(net, nt->ip_tnl_net_id); if (nt->collect_md) { if (rtnl_dereference(itn->collect_md_tun)) return -EEXIST; } else { if (ip_tunnel_find(itn, p, dev->type)) return -EEXIST; } nt->net = net; nt->parms = *p; nt->fwmark = fwmark; err = register_netdevice(dev); if (err) goto err_register_netdevice; if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) eth_hw_addr_random(dev); mtu = ip_tunnel_bind_dev(dev); if (tb[IFLA_MTU]) { unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr)); if (dev->type == ARPHRD_ETHER) max -= dev->hard_header_len; mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max); } err = dev_set_mtu(dev, mtu); if (err) goto err_dev_set_mtu; ip_tunnel_add(itn, nt); return 0; err_dev_set_mtu: unregister_netdevice(dev); err_register_netdevice: return err; } EXPORT_SYMBOL_GPL(ip_tunnel_newlink); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm_kern *p, __u32 fwmark) { struct ip_tunnel *t; struct ip_tunnel *tunnel = netdev_priv(dev); struct net *net = tunnel->net; struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id); if (dev == itn->fb_tunnel_dev) return -EINVAL; t = ip_tunnel_find(itn, p, dev->type); if (t) { if (t->dev != dev) return -EEXIST; } else { t = tunnel; if (dev->type != ARPHRD_ETHER) { unsigned int nflags = 0; if (ipv4_is_multicast(p->iph.daddr)) nflags = IFF_BROADCAST; else if (p->iph.daddr) nflags = IFF_POINTOPOINT; if ((dev->flags ^ nflags) & (IFF_POINTOPOINT | IFF_BROADCAST)) return -EINVAL; } } ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark); return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_changelink); int __ip_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; int err; dev->needs_free_netdev = true; dev->priv_destructor = ip_tunnel_dev_free; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (err) return err; err = gro_cells_init(&tunnel->gro_cells, dev); if (err) { dst_cache_destroy(&tunnel->dst_cache); return err; } tunnel->dev = dev; strscpy(tunnel->parms.name, dev->name); iph->version = 4; iph->ihl = 5; if (tunnel->collect_md) netif_keep_dst(dev); return 0; } EXPORT_SYMBOL_GPL(__ip_tunnel_init); void ip_tunnel_uninit(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct net *net = tunnel->net; struct ip_tunnel_net *itn; itn = net_generic(net, tunnel->ip_tnl_net_id); ip_tunnel_del(itn, netdev_priv(dev)); if (itn->fb_tunnel_dev == dev) WRITE_ONCE(itn->fb_tunnel_dev, NULL); dst_cache_reset(&tunnel->dst_cache); } EXPORT_SYMBOL_GPL(ip_tunnel_uninit); /* Do least required initialization, rest of init is done in tunnel_init call */ void ip_tunnel_setup(struct net_device *dev, unsigned int net_id) { struct ip_tunnel *tunnel = netdev_priv(dev); tunnel->ip_tnl_net_id = net_id; } EXPORT_SYMBOL_GPL(ip_tunnel_setup); MODULE_DESCRIPTION("IPv4 tunnel implementation library"); MODULE_LICENSE("GPL"); |
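/*
 * Illustrative sketch, not part of the file above: how a tunnel driver
 * typically wires its netdev and pernet hooks to this library.  Every
 * "example_" name is hypothetical; the ip_tunnel_* and __ip_tunnel_*
 * helpers are the exported functions defined above.
 */
#include <linux/in.h>
#include <linux/netdevice.h>
#include <net/ip_tunnels.h>
#include <net/rtnetlink.h>

static unsigned int example_net_id __read_mostly;

static int example_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->parms.iph.protocol = IPPROTO_IPIP;	/* hypothetical payload proto */
	return __ip_tunnel_init(dev);		/* dst cache, GRO cells, name copy */
}

static const struct net_device_ops example_netdev_ops = {
	.ndo_init		= example_tunnel_init,
	.ndo_uninit		= ip_tunnel_uninit,	/* unhash from per-netns table */
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_iflink		= ip_tunnel_get_iflink,
	.ndo_tunnel_ctl		= ip_tunnel_ctl,	/* SIOC{ADD,CHG,DEL,GET}TUNNEL */
	.ndo_siocdevprivate	= ip_tunnel_siocdevprivate,
};

static void example_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops = &example_netdev_ops;
	ip_tunnel_setup(dev, example_net_id);	/* remember which pernet table to use */
}

static struct rtnl_link_ops example_link_ops __read_mostly = {
	.kind		= "example",
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= example_tunnel_setup,
	.dellink	= ip_tunnel_dellink,
	.get_link_net	= ip_tunnel_get_link_net,
};

static int __net_init example_init_net(struct net *net)
{
	/* creates the per-netns hash table and the fallback "example0" device */
	return ip_tunnel_init_net(net, example_net_id, &example_link_ops, "example0");
}

static struct pernet_operations example_net_ops = {
	.init	= example_init_net,
	.id	= &example_net_id,
	.size	= sizeof(struct ip_tunnel_net),
};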
// SPDX-License-Identifier: GPL-2.0
/*
 * fs/ext4/extents_status.c
 *
 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
 * Modified by
 *	Allison Henderson <achender@linux.vnet.ibm.com>
 *	Hugh Dickins <hughd@google.com>
 *	Zheng Liu <wenqing.lz@taobao.com>
 *
 * Ext4 extents status tree core functions.
 */
#include <linux/list_sort.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include "ext4.h"

#include <trace/events/ext4.h>

#include <kunit/static_stub.h>

/*
 * According to previous discussion in the Ext4 Developer Workshop, we
 * will introduce a new structure called io tree to track all extent
 * status in order to solve some problems that we have met
 * (e.g. reservation space warning), and provide extent-level locking.
 * The delay extent tree is the first step to achieve this goal.  It was
 * originally built by Yongqiang Yang.  At that time it was called the
 * delay extent tree, whose goal was only to track delayed extents in
 * memory to simplify the implementation of fiemap and bigalloc, and to
 * introduce lseek SEEK_DATA/SEEK_HOLE support.  That is why it is still
 * called the delay extent tree in the first commit.  But to better
 * describe what it does, it has been renamed to the extent status tree.
 *
 * Step1:
 * Currently the first step has been done.  All delayed extents are
 * tracked in the tree.  It maintains the delayed extent when a delayed
 * allocation is issued, and the delayed extent is written out or
 * invalidated.  Therefore the implementation of fiemap and bigalloc
 * is simplified, and SEEK_DATA/SEEK_HOLE are introduced.
 *
 * The following comment describes the implementation of the extent
 * status tree and future work.
 *
 * Step2:
 * In this step all extent status is tracked by the extent status tree.
 * Thus, we can first try to look up a block mapping in this tree before
 * finding it in the extent tree.  Hence, the single extent cache can be
 * removed because the extent status tree can do a better job.  Extents
 * in the status tree are loaded on demand.  Therefore, the extent status
 * tree may not contain all of the extents in a file.  Meanwhile we define
 * a shrinker to reclaim memory from the extent status tree because a
 * fragmented extent tree will make the status tree cost too much memory.
 * Written/unwritten/hole extents in the tree will be reclaimed by this
 * shrinker when we are under high memory pressure.  Delayed extents will
 * not be reclaimed because fiemap, bigalloc, and seek_data/hole need them.
 */

/*
 * Extent status tree implementation for ext4.
 *
 *
 * ==========================================================================
 * Extent status tree tracks all extent status.
 *
 * 1. Why do we need to implement the extent status tree?
 *
 * Without the extent status tree, ext4 identifies a delayed extent by
 * looking up the page cache, which leads to complicated, buggy, and
 * inefficient code.
 *
 * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know whether
 * a block or a range of blocks belongs to a delayed extent.
 *
 * Let us have a look at how they work without the extent status tree.
 *
 *   --	FIEMAP
 *	FIEMAP looks up the page cache to identify delayed allocations
 *	from holes.
 *
 *   --	SEEK_HOLE/DATA
 *	SEEK_HOLE/DATA has the same problem as FIEMAP.
 *
 *   --	bigalloc
 *	bigalloc looks up the page cache to figure out whether a block is
 *	already under delayed allocation, to determine whether a quota
 *	reservation is needed for the cluster.
 *
 *   --	writeout
 *	Writeout looks up the whole page cache to see if a buffer is
 *	mapped; even if there are not very many delayed buffers, this is
 *	time consuming.
 *
 * With the extent status tree implementation, FIEMAP, SEEK_HOLE/DATA,
 * bigalloc and writeout can figure out whether a block or a range of
 * blocks is under delayed allocation (i.e. belongs to a delayed extent)
 * by searching the extent status tree.
 *
 *
 * ==========================================================================
 * 2. Ext4 extent status tree implementation
 *
 *   --	extent
 *	An extent is a range of blocks which are contiguous logically and
 *	physically.  Unlike an extent in the extent tree, this extent is an
 *	in-memory struct; there is no corresponding on-disk data.  There is
 *	no limit on the length of an extent, so an extent can contain as
 *	many blocks as are contiguous logically and physically.
 *
 *   --	extent status tree
 *	Every inode has an extent status tree and all blocks are added to
 *	the tree with their status.  The extents in the tree are ordered by
 *	logical block number.
 *
 *   --	operations on an extent status tree
 *	There are three important operations on an extent status tree:
 *	find the next extent, add an extent (a range of blocks), and remove
 *	an extent.
 *
 *   --	races on an extent status tree
 *	The extent status tree is protected by inode->i_es_lock.
 *
 *   --	memory consumption
 *	A fragmented extent tree will make the extent status tree cost too
 *	much memory.  Hence, we reclaim written/unwritten/hole extents from
 *	the tree under heavy memory pressure.
 *
 * ==========================================================================
 * 3. Assurance of Ext4 extent status tree consistency
 *
 * When mapping blocks, Ext4 queries the extent status tree first and should
 * always trust that the extent status tree is consistent and up to date.
 * Therefore, it is important to adhere to the following rules when creating,
 * modifying and removing extents.
 *
 * 1. Except during fastcommit replay, when Ext4 creates or queries block
 *    mappings, the extent information should always be processed through
 *    the extent status tree instead of being organized manually through
 *    the on-disk extent tree.
 *
 * 2. When updating the extent tree, Ext4 should acquire i_data_sem
 *    exclusively and update the extent status tree atomically.  If the
 *    extents to be modified are large enough to exceed the range that a
 *    single i_data_sem can process (as ext4_datasem_ensure_credits() may
 *    drop i_data_sem to restart a transaction), it must (e.g. as
 *    ext4_punch_hole() does):
 *
 *    a) Hold the i_rwsem and invalidate_lock exclusively.  This ensures
 *       exclusion against page faults, as well as reads and writes that
 *       may concurrently modify the extent status tree.
 *    b) Evict all page cache in the affected range and recommend rebuilding
 *       or dropping the extent status tree after modifying the on-disk
 *       extent tree.  This ensures exclusion against concurrent writebacks
 *       that do not hold those locks but only hold a folio lock.
 *
 * 3. Based on the rules above, when querying block mappings, Ext4 should at
 *    least hold the i_rwsem or the invalidate_lock or folio lock(s) for the
 *    specified query range.
 *
 * ==========================================================================
 * 4. Performance analysis
 *
 *   --	overhead
 *	1.
There is a cache extent for write access, so if writes are * not very random, adding space operaions are in O(1) time. * * -- gain * 2. Code is much simpler, more readable, more maintainable and * more efficient. * * * ========================================================================== * 5. TODO list * * -- Refactor delayed space reservation * * -- Extent-level locking */ static struct kmem_cache *ext4_es_cachep; static struct kmem_cache *ext4_pending_cachep; static int __es_insert_extent(struct inode *inode, struct extent_status *newes, struct extent_status *prealloc); static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t end, unsigned int status, int *reserved, struct extent_status *res, struct extent_status *prealloc); static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, struct ext4_inode_info *locked_ei); static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, struct pending_reservation **prealloc); int __init ext4_init_es(void) { ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT); if (ext4_es_cachep == NULL) return -ENOMEM; return 0; } void ext4_exit_es(void) { kmem_cache_destroy(ext4_es_cachep); } void ext4_es_init_tree(struct ext4_es_tree *tree) { tree->root = RB_ROOT; tree->cache_es = NULL; } #ifdef ES_DEBUG__ static void ext4_es_print_tree(struct inode *inode) { struct ext4_es_tree *tree; struct rb_node *node; printk(KERN_DEBUG "status extents for inode %llu:", inode->i_ino); tree = &EXT4_I(inode)->i_es_tree; node = rb_first(&tree->root); while (node) { struct extent_status *es; es = rb_entry(node, struct extent_status, rb_node); printk(KERN_DEBUG " [%u/%u) %llu %x", es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); node = rb_next(node); } printk(KERN_DEBUG "\n"); } #else #define ext4_es_print_tree(inode) #endif static inline ext4_lblk_t ext4_es_end(struct extent_status *es) { BUG_ON(es->es_lblk + es->es_len < es->es_lblk); return es->es_lblk + es->es_len - 1; } static inline void ext4_es_inc_seq(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); WRITE_ONCE(ei->i_es_seq, ei->i_es_seq + 1); } static inline int __es_check_extent_status(struct extent_status *es, unsigned int status, struct extent_status *res) { if (ext4_es_type(es) & status) return 0; if (res) { res->es_lblk = es->es_lblk; res->es_len = es->es_len; res->es_pblk = es->es_pblk; } return -EINVAL; } /* * search through the tree for an delayed extent with a given offset. If * it can't be found, try to find next extent. */ static struct extent_status *__es_tree_search(struct rb_root *root, ext4_lblk_t lblk) { struct rb_node *node = root->rb_node; struct extent_status *es = NULL; while (node) { es = rb_entry(node, struct extent_status, rb_node); if (lblk < es->es_lblk) node = node->rb_left; else if (lblk > ext4_es_end(es)) node = node->rb_right; else return es; } if (es && lblk < es->es_lblk) return es; if (es && lblk > ext4_es_end(es)) { node = rb_next(&es->rb_node); return node ? 
rb_entry(node, struct extent_status, rb_node) : NULL; } return NULL; } /* * ext4_es_find_extent_range - find extent with specified status within block * range or next extent following block range in * extents status tree * * @inode - file containing the range * @matching_fn - pointer to function that matches extents with desired status * @lblk - logical block defining start of range * @end - logical block defining end of range * @es - extent found, if any * * Find the first extent within the block range specified by @lblk and @end * in the extents status tree that satisfies @matching_fn. If a match * is found, it's returned in @es. If not, and a matching extent is found * beyond the block range, it's returned in @es. If no match is found, an * extent is returned in @es whose es_lblk, es_len, and es_pblk components * are 0. */ static void __es_find_extent_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es) { struct ext4_es_tree *tree = NULL; struct extent_status *es1 = NULL; struct rb_node *node; WARN_ON(es == NULL); WARN_ON(end < lblk); tree = &EXT4_I(inode)->i_es_tree; /* see if the extent has been cached */ es->es_lblk = es->es_len = es->es_pblk = 0; es1 = READ_ONCE(tree->cache_es); if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) { es_debug("%u cached by [%u/%u) %llu %x\n", lblk, es1->es_lblk, es1->es_len, ext4_es_pblock(es1), ext4_es_status(es1)); goto out; } es1 = __es_tree_search(&tree->root, lblk); out: if (es1 && !matching_fn(es1)) { while ((node = rb_next(&es1->rb_node)) != NULL) { es1 = rb_entry(node, struct extent_status, rb_node); if (es1->es_lblk > end) { es1 = NULL; break; } if (matching_fn(es1)) break; } } if (es1 && matching_fn(es1)) { WRITE_ONCE(tree->cache_es, es1); es->es_lblk = es1->es_lblk; es->es_len = es1->es_len; es->es_pblk = es1->es_pblk; } } /* * Locking for __es_find_extent_range() for external use */ void ext4_es_find_extent_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es) { es->es_lblk = es->es_len = es->es_pblk = 0; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; trace_ext4_es_find_extent_range_enter(inode, lblk); read_lock(&EXT4_I(inode)->i_es_lock); __es_find_extent_range(inode, matching_fn, lblk, end, es); read_unlock(&EXT4_I(inode)->i_es_lock); trace_ext4_es_find_extent_range_exit(inode, es); } /* * __es_scan_range - search block range for block with specified status * in extents status tree * * @inode - file containing the range * @matching_fn - pointer to function that matches extents with desired status * @lblk - logical block defining start of range * @end - logical block defining end of range * * Returns true if at least one block in the specified block range satisfies * the criterion specified by @matching_fn, and false if not. If at least * one extent has the specified status, then there is at least one block * in the cluster with that status. Should only be called by code that has * taken i_es_lock. 
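 *
 * A worked example of the return logic below (illustrative numbers only):
 * with lblk == 10 and end == 20, a matching extent [15/4) gives a true
 * return because 10 <= 15 <= 20, and a matching extent [5/10) gives true
 * because 5 <= 10 < 5 + 10; if the first matching extent at or beyond
 * lblk starts at block 21 or later, the result is false.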
*/ static bool __es_scan_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t start, ext4_lblk_t end) { struct extent_status es; __es_find_extent_range(inode, matching_fn, start, end, &es); if (es.es_len == 0) return false; /* no matching extent in the tree */ else if (es.es_lblk <= start && start < es.es_lblk + es.es_len) return true; else if (start <= es.es_lblk && es.es_lblk <= end) return true; else return false; } /* * Locking for __es_scan_range() for external use */ bool ext4_es_scan_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end) { bool ret; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return false; read_lock(&EXT4_I(inode)->i_es_lock); ret = __es_scan_range(inode, matching_fn, lblk, end); read_unlock(&EXT4_I(inode)->i_es_lock); return ret; } /* * __es_scan_clu - search cluster for block with specified status in * extents status tree * * @inode - file containing the cluster * @matching_fn - pointer to function that matches extents with desired status * @lblk - logical block in cluster to be searched * * Returns true if at least one extent in the cluster containing @lblk * satisfies the criterion specified by @matching_fn, and false if not. If at * least one extent has the specified status, then there is at least one block * in the cluster with that status. Should only be called by code that has * taken i_es_lock. */ static bool __es_scan_clu(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_lblk_t lblk_start, lblk_end; lblk_start = EXT4_LBLK_CMASK(sbi, lblk); lblk_end = lblk_start + sbi->s_cluster_ratio - 1; return __es_scan_range(inode, matching_fn, lblk_start, lblk_end); } /* * Locking for __es_scan_clu() for external use */ bool ext4_es_scan_clu(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk) { bool ret; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return false; read_lock(&EXT4_I(inode)->i_es_lock); ret = __es_scan_clu(inode, matching_fn, lblk); read_unlock(&EXT4_I(inode)->i_es_lock); return ret; } static void ext4_es_list_add(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (!list_empty(&ei->i_es_list)) return; spin_lock(&sbi->s_es_lock); if (list_empty(&ei->i_es_list)) { list_add_tail(&ei->i_es_list, &sbi->s_es_list); sbi->s_es_nr_inode++; } spin_unlock(&sbi->s_es_lock); } static void ext4_es_list_del(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); spin_lock(&sbi->s_es_lock); if (!list_empty(&ei->i_es_list)) { list_del_init(&ei->i_es_list); sbi->s_es_nr_inode--; WARN_ON_ONCE(sbi->s_es_nr_inode < 0); } spin_unlock(&sbi->s_es_lock); } static inline struct pending_reservation *__alloc_pending(bool nofail) { if (!nofail) return kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC); return kmem_cache_zalloc(ext4_pending_cachep, GFP_KERNEL | __GFP_NOFAIL); } static inline void __free_pending(struct pending_reservation *pr) { kmem_cache_free(ext4_pending_cachep, pr); } /* * Returns true if we cannot fail to allocate memory for this extent_status * entry and cannot reclaim it until its status changes. */ static inline bool ext4_es_must_keep(struct extent_status *es) { /* fiemap, bigalloc, and seek_data/hole need to use it. 
*/ if (ext4_es_is_delayed(es)) return true; return false; } static inline struct extent_status *__es_alloc_extent(bool nofail) { if (!nofail) return kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC); return kmem_cache_zalloc(ext4_es_cachep, GFP_KERNEL | __GFP_NOFAIL); } static void ext4_es_init_extent(struct inode *inode, struct extent_status *es, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk) { es->es_lblk = lblk; es->es_len = len; es->es_pblk = pblk; /* We never try to reclaim a must kept extent, so we don't count it. */ if (!ext4_es_must_keep(es)) { if (!EXT4_I(inode)->i_es_shk_nr++) ext4_es_list_add(inode); percpu_counter_inc(&EXT4_SB(inode->i_sb)-> s_es_stats.es_stats_shk_cnt); } EXT4_I(inode)->i_es_all_nr++; percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); } static inline void __es_free_extent(struct extent_status *es) { kmem_cache_free(ext4_es_cachep, es); } static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) { EXT4_I(inode)->i_es_all_nr--; percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); /* Decrease the shrink counter when we can reclaim the extent. */ if (!ext4_es_must_keep(es)) { BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0); if (!--EXT4_I(inode)->i_es_shk_nr) ext4_es_list_del(inode); percpu_counter_dec(&EXT4_SB(inode->i_sb)-> s_es_stats.es_stats_shk_cnt); } __es_free_extent(es); } /* * Check whether or not two extents can be merged * Condition: * - logical block number is contiguous * - physical block number is contiguous * - status is equal */ static int ext4_es_can_be_merged(struct extent_status *es1, struct extent_status *es2) { if (ext4_es_type(es1) != ext4_es_type(es2)) return 0; if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) { pr_warn("ES assertion failed when merging extents. 
" "The sum of lengths of es1 (%d) and es2 (%d) " "is bigger than allowed file size (%d)\n", es1->es_len, es2->es_len, EXT_MAX_BLOCKS); WARN_ON(1); return 0; } if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk) return 0; if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2))) return 1; if (ext4_es_is_hole(es1)) return 1; /* we need to check delayed extent */ if (ext4_es_is_delayed(es1)) return 1; return 0; } static struct extent_status * ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es) { struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; struct extent_status *es1; struct rb_node *node; node = rb_prev(&es->rb_node); if (!node) return es; es1 = rb_entry(node, struct extent_status, rb_node); if (ext4_es_can_be_merged(es1, es)) { es1->es_len += es->es_len; if (ext4_es_is_referenced(es)) ext4_es_set_referenced(es1); rb_erase(&es->rb_node, &tree->root); ext4_es_free_extent(inode, es); es = es1; } return es; } static struct extent_status * ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es) { struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; struct extent_status *es1; struct rb_node *node; node = rb_next(&es->rb_node); if (!node) return es; es1 = rb_entry(node, struct extent_status, rb_node); if (ext4_es_can_be_merged(es, es1)) { es->es_len += es1->es_len; if (ext4_es_is_referenced(es1)) ext4_es_set_referenced(es); rb_erase(node, &tree->root); ext4_es_free_extent(inode, es1); } return es; } #ifdef ES_AGGRESSIVE_TEST #include "ext4_extents.h" /* Needed when ES_AGGRESSIVE_TEST is defined */ static void ext4_es_insert_extent_ext_check(struct inode *inode, struct extent_status *es) { struct ext4_ext_path *path = NULL; struct ext4_extent *ex; ext4_lblk_t ee_block; ext4_fsblk_t ee_start; unsigned short ee_len; int depth, ee_status, es_status; path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) return; depth = ext_depth(inode); ex = path[depth].p_ext; if (ex) { ee_block = le32_to_cpu(ex->ee_block); ee_start = ext4_ext_pblock(ex); ee_len = ext4_ext_get_actual_len(ex); ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0; es_status = ext4_es_is_unwritten(es) ? 1 : 0; /* * Make sure ex and es are not overlap when we try to insert * a delayed/hole extent. */ if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) { if (in_range(es->es_lblk, ee_block, ee_len)) { pr_warn("ES insert assertion failed for " "inode: %llu we can find an extent " "at block [%d/%d/%llu/%c], but we " "want to add a delayed/hole extent " "[%d/%d/%llu/%x]\n", inode->i_ino, ee_block, ee_len, ee_start, ee_status ? 'u' : 'w', es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); } goto out; } /* * We don't check ee_block == es->es_lblk, etc. because es * might be a part of whole extent, vice versa. */ if (es->es_lblk < ee_block || ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) { pr_warn("ES insert assertion failed for inode: %llu " "ex_status [%d/%d/%llu/%c] != " "es_status [%d/%d/%llu/%c]\n", inode->i_ino, ee_block, ee_len, ee_start, ee_status ? 'u' : 'w', es->es_lblk, es->es_len, ext4_es_pblock(es), es_status ? 'u' : 'w'); goto out; } if (ee_status ^ es_status) { pr_warn("ES insert assertion failed for inode: %llu " "ex_status [%d/%d/%llu/%c] != " "es_status [%d/%d/%llu/%c]\n", inode->i_ino, ee_block, ee_len, ee_start, ee_status ? 'u' : 'w', es->es_lblk, es->es_len, ext4_es_pblock(es), es_status ? 'u' : 'w'); } } else { /* * We can't find an extent on disk. 
So we need to make sure * that we don't want to add an written/unwritten extent. */ if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { pr_warn("ES insert assertion failed for inode: %llu " "can't find an extent at block %d but we want " "to add a written/unwritten extent " "[%d/%d/%llu/%x]\n", inode->i_ino, es->es_lblk, es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); } } out: ext4_free_ext_path(path); } static void ext4_es_insert_extent_ind_check(struct inode *inode, struct extent_status *es) { struct ext4_map_blocks map; int retval; /* * Here we call ext4_ind_map_blocks to lookup a block mapping because * 'Indirect' structure is defined in indirect.c. So we couldn't * access direct/indirect tree from outside. It is too dirty to define * this function in indirect.c file. */ map.m_lblk = es->es_lblk; map.m_len = es->es_len; retval = ext4_ind_map_blocks(NULL, inode, &map, 0); if (retval > 0) { if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) { /* * We want to add a delayed/hole extent but this * block has been allocated. */ pr_warn("ES insert assertion failed for inode: %llu " "We can find blocks but we want to add a " "delayed/hole extent [%d/%d/%llu/%x]\n", inode->i_ino, es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); return; } else if (ext4_es_is_written(es)) { if (retval != es->es_len) { pr_warn("ES insert assertion failed for " "inode: %llu retval %d != es_len %d\n", inode->i_ino, retval, es->es_len); return; } if (map.m_pblk != ext4_es_pblock(es)) { pr_warn("ES insert assertion failed for " "inode: %llu m_pblk %llu != " "es_pblk %llu\n", inode->i_ino, map.m_pblk, ext4_es_pblock(es)); return; } } else { /* * We don't need to check unwritten extent because * indirect-based file doesn't have it. */ BUG(); } } else if (retval == 0) { if (ext4_es_is_written(es)) { pr_warn("ES insert assertion failed for inode: %llu " "We can't find the block but we want to add " "a written extent [%d/%d/%llu/%x]\n", inode->i_ino, es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); return; } } } static inline void ext4_es_insert_extent_check(struct inode *inode, struct extent_status *es) { /* * We don't need to worry about the race condition because * caller takes i_data_sem locking. */ BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ext4_es_insert_extent_ext_check(inode, es); else ext4_es_insert_extent_ind_check(inode, es); } #else static inline void ext4_es_insert_extent_check(struct inode *inode, struct extent_status *es) { } #endif static int __es_insert_extent(struct inode *inode, struct extent_status *newes, struct extent_status *prealloc) { struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; struct rb_node **p = &tree->root.rb_node; struct rb_node *parent = NULL; struct extent_status *es; while (*p) { parent = *p; es = rb_entry(parent, struct extent_status, rb_node); if (newes->es_lblk < es->es_lblk) { if (ext4_es_can_be_merged(newes, es)) { /* * Here we can modify es_lblk directly * because it isn't overlapped. 
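 *
 * For example (illustrative numbers only): inserting [5/5) in front of
 * an adjacent, mergeable [10/8) extent simply turns the existing node
 * into [5/13) instead of allocating a new one.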
*/ es->es_lblk = newes->es_lblk; es->es_len += newes->es_len; if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) ext4_es_store_pblock(es, newes->es_pblk); es = ext4_es_try_to_merge_left(inode, es); goto out; } p = &(*p)->rb_left; } else if (newes->es_lblk > ext4_es_end(es)) { if (ext4_es_can_be_merged(es, newes)) { es->es_len += newes->es_len; es = ext4_es_try_to_merge_right(inode, es); goto out; } p = &(*p)->rb_right; } else { BUG(); return -EINVAL; } } if (prealloc) es = prealloc; else es = __es_alloc_extent(false); if (!es) return -ENOMEM; ext4_es_init_extent(inode, es, newes->es_lblk, newes->es_len, newes->es_pblk); rb_link_node(&es->rb_node, parent, p); rb_insert_color(&es->rb_node, &tree->root); out: tree->cache_es = es; return 0; } /* * ext4_es_insert_extent() adds information to an inode's extent * status tree. This interface is used for modifying extents. To cache * on-disk extents, use ext4_es_cache_extent() instead. */ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status, bool delalloc_reserve_used) { struct extent_status newes; ext4_lblk_t end = lblk + len - 1; int err1 = 0, err2 = 0, err3 = 0; int resv_used = 0, pending = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct extent_status *es1 = NULL; struct extent_status *es2 = NULL; struct pending_reservation *pr = NULL; bool revise_pending = false; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; es_debug("add [%u/%u) %llu %x %d to extent status tree of inode %llu\n", lblk, len, pblk, status, delalloc_reserve_used, inode->i_ino); if (!len) return; BUG_ON(end < lblk); WARN_ON_ONCE(status & EXTENT_STATUS_DELAYED); newes.es_lblk = lblk; newes.es_len = len; ext4_es_store_pblock_status(&newes, pblk, status); ext4_es_insert_extent_check(inode, &newes); revise_pending = sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) && (status & (EXTENT_STATUS_WRITTEN | EXTENT_STATUS_UNWRITTEN)); retry: if (err1 && !es1) es1 = __es_alloc_extent(true); if ((err1 || err2) && !es2) es2 = __es_alloc_extent(true); if ((err1 || err2 || err3 < 0) && revise_pending && !pr) pr = __alloc_pending(true); write_lock(&EXT4_I(inode)->i_es_lock); err1 = __es_remove_extent(inode, lblk, end, 0, &resv_used, NULL, es1); if (err1 != 0) goto error; /* Free preallocated extent if it didn't get used. */ if (es1) { if (!es1->es_len) __es_free_extent(es1); es1 = NULL; } err2 = __es_insert_extent(inode, &newes, es2); if (err2 == -ENOMEM && !ext4_es_must_keep(&newes)) err2 = 0; if (err2 != 0) goto error; /* Free preallocated extent if it didn't get used. */ if (es2) { if (!es2->es_len) __es_free_extent(es2); es2 = NULL; } if (revise_pending) { err3 = __revise_pending(inode, lblk, len, &pr); if (err3 < 0) goto error; if (pr) { __free_pending(pr); pr = NULL; } pending = err3; } ext4_es_inc_seq(inode); error: write_unlock(&EXT4_I(inode)->i_es_lock); /* * Reduce the reserved cluster count to reflect successful deferred * allocation of delayed allocated clusters or direct allocation of * clusters discovered to be delayed allocated. Once allocated, a * cluster is not included in the reserved count. * * When direct allocating (from fallocate, filemap, DIO, or clusters * allocated when delalloc has been disabled by ext4_nonda_switch()) * an extent either 1) contains delayed blocks but start with * non-delayed allocated blocks (e.g. 
hole) or 2) contains non-delayed * allocated blocks which belong to delayed allocated clusters when * bigalloc feature is enabled, quota has already been claimed by * ext4_mb_new_blocks(), so release the quota reservations made for * any previously delayed allocated clusters instead of claim them * again. */ resv_used += pending; if (resv_used) ext4_da_update_reserve_space(inode, resv_used, delalloc_reserve_used); if (err1 || err2 || err3 < 0) goto retry; trace_ext4_es_insert_extent(inode, &newes); ext4_es_print_tree(inode); return; } /* * ext4_es_cache_extent() inserts information into the extent status tree * only if there is no existing information about the specified range or * if the existing extents have the same status. * * Note that this interface is only used for caching on-disk extent * information and cannot be used to convert existing extents in the extent * status tree. To convert existing extents, use ext4_es_insert_extent() * instead. */ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status) { struct extent_status *es; struct extent_status chkes, newes; ext4_lblk_t end = lblk + len - 1; bool conflict = false; int err; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; newes.es_lblk = lblk; newes.es_len = len; ext4_es_store_pblock_status(&newes, pblk, status); if (!len) return; BUG_ON(end < lblk); write_lock(&EXT4_I(inode)->i_es_lock); es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk); if (es && es->es_lblk <= end) { /* Found an extent that covers the entire range. */ if (es->es_lblk <= lblk && es->es_lblk + es->es_len > end) { if (__es_check_extent_status(es, status, &chkes)) conflict = true; goto unlock; } /* Check and remove all extents in range. */ err = __es_remove_extent(inode, lblk, end, status, NULL, &chkes, NULL); if (err) { if (err == -EINVAL) conflict = true; goto unlock; } } __es_insert_extent(inode, &newes, NULL); trace_ext4_es_cache_extent(inode, &newes); ext4_es_print_tree(inode); unlock: write_unlock(&EXT4_I(inode)->i_es_lock); if (!conflict) return; /* * A hole in the on-disk extent but a delayed extent in the extent * status tree, is allowed. */ if (status == EXTENT_STATUS_HOLE && ext4_es_type(&chkes) == EXTENT_STATUS_DELAYED) return; ext4_warning_inode(inode, "ES cache extent failed: add [%d,%d,%llu,0x%x] conflict with existing [%d,%d,%llu,0x%x]\n", lblk, len, pblk, status, chkes.es_lblk, chkes.es_len, ext4_es_pblock(&chkes), ext4_es_status(&chkes)); } /* * ext4_es_lookup_extent() looks up an extent in extent status tree. * * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks. 
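 *
 * A minimal caller sketch (illustrative only; everything except the
 * ext4_es_lookup_extent() call itself is hypothetical):
 *
 *	struct extent_status es;
 *
 *	if (ext4_es_lookup_extent(inode, lblk, NULL, &es, NULL))
 *		return map_from_cached_extent(&es);	// hypothetical helper
 *	return map_from_ondisk_extent_tree();		// hypothetical helper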
* * Return: 1 on found, 0 on not */ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t *next_lblk, struct extent_status *es, u64 *pseq) { struct ext4_es_tree *tree; struct ext4_es_stats *stats; struct extent_status *es1 = NULL; struct rb_node *node; int found = 0; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return 0; trace_ext4_es_lookup_extent_enter(inode, lblk); es_debug("lookup extent in block %u\n", lblk); tree = &EXT4_I(inode)->i_es_tree; read_lock(&EXT4_I(inode)->i_es_lock); /* find extent in cache firstly */ es->es_lblk = es->es_len = es->es_pblk = 0; es1 = READ_ONCE(tree->cache_es); if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) { es_debug("%u cached by [%u/%u)\n", lblk, es1->es_lblk, es1->es_len); found = 1; goto out; } node = tree->root.rb_node; while (node) { es1 = rb_entry(node, struct extent_status, rb_node); if (lblk < es1->es_lblk) node = node->rb_left; else if (lblk > ext4_es_end(es1)) node = node->rb_right; else { found = 1; break; } } out: stats = &EXT4_SB(inode->i_sb)->s_es_stats; if (found) { BUG_ON(!es1); es->es_lblk = es1->es_lblk; es->es_len = es1->es_len; es->es_pblk = es1->es_pblk; if (!ext4_es_is_referenced(es1)) ext4_es_set_referenced(es1); percpu_counter_inc(&stats->es_stats_cache_hits); if (next_lblk) { node = rb_next(&es1->rb_node); if (node) { es1 = rb_entry(node, struct extent_status, rb_node); *next_lblk = es1->es_lblk; } else *next_lblk = 0; } if (pseq) *pseq = EXT4_I(inode)->i_es_seq; } else { percpu_counter_inc(&stats->es_stats_cache_misses); } read_unlock(&EXT4_I(inode)->i_es_lock); trace_ext4_es_lookup_extent_exit(inode, es, found); return found; } struct rsvd_count { int ndelayed; bool first_do_lblk_found; ext4_lblk_t first_do_lblk; ext4_lblk_t last_do_lblk; struct extent_status *left_es; bool partial; ext4_lblk_t lclu; }; /* * init_rsvd - initialize reserved count data before removing block range * in file from extent status tree * * @inode - file containing range * @lblk - first block in range * @es - pointer to first extent in range * @rc - pointer to reserved count data * * Assumes es is not NULL */ static void init_rsvd(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es, struct rsvd_count *rc) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct rb_node *node; rc->ndelayed = 0; /* * for bigalloc, note the first delayed block in the range has not * been found, record the extent containing the block to the left of * the region to be removed, if any, and note that there's no partial * cluster to track */ if (sbi->s_cluster_ratio > 1) { rc->first_do_lblk_found = false; if (lblk > es->es_lblk) { rc->left_es = es; } else { node = rb_prev(&es->rb_node); rc->left_es = node ? 
rb_entry(node, struct extent_status, rb_node) : NULL; } rc->partial = false; } } /* * count_rsvd - count the clusters containing delayed blocks in a range * within an extent and add to the running tally in rsvd_count * * @inode - file containing extent * @lblk - first block in range * @len - length of range in blocks * @es - pointer to extent containing clusters to be counted * @rc - pointer to reserved count data * * Tracks partial clusters found at the beginning and end of extents so * they aren't overcounted when they span adjacent extents */ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, struct extent_status *es, struct rsvd_count *rc) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_lblk_t i, end, nclu; if (!ext4_es_is_delayed(es)) return; WARN_ON(len <= 0); if (sbi->s_cluster_ratio == 1) { rc->ndelayed += (int) len; return; } /* bigalloc */ i = (lblk < es->es_lblk) ? es->es_lblk : lblk; end = lblk + (ext4_lblk_t) len - 1; end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end; /* record the first block of the first delayed extent seen */ if (!rc->first_do_lblk_found) { rc->first_do_lblk = i; rc->first_do_lblk_found = true; } /* update the last lblk in the region seen so far */ rc->last_do_lblk = end; /* * if we're tracking a partial cluster and the current extent * doesn't start with it, count it and stop tracking */ if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) { rc->ndelayed++; rc->partial = false; } /* * if the first cluster doesn't start on a cluster boundary but * ends on one, count it */ if (EXT4_LBLK_COFF(sbi, i) != 0) { if (end >= EXT4_LBLK_CFILL(sbi, i)) { rc->ndelayed++; rc->partial = false; i = EXT4_LBLK_CFILL(sbi, i) + 1; } } /* * if the current cluster starts on a cluster boundary, count the * number of whole delayed clusters in the extent */ if ((i + sbi->s_cluster_ratio - 1) <= end) { nclu = (end - i + 1) >> sbi->s_cluster_bits; rc->ndelayed += nclu; i += nclu << sbi->s_cluster_bits; } /* * start tracking a partial cluster if there's a partial at the end * of the current extent and we're not already tracking one */ if (!rc->partial && i <= end) { rc->partial = true; rc->lclu = EXT4_B2C(sbi, i); } } /* * __pr_tree_search - search for a pending cluster reservation * * @root - root of pending reservation tree * @lclu - logical cluster to search for * * Returns the pending reservation for the cluster identified by @lclu * if found. If not, returns a reservation for the next cluster if any, * and if not, returns NULL. */ static struct pending_reservation *__pr_tree_search(struct rb_root *root, ext4_lblk_t lclu) { struct rb_node *node = root->rb_node; struct pending_reservation *pr = NULL; while (node) { pr = rb_entry(node, struct pending_reservation, rb_node); if (lclu < pr->lclu) node = node->rb_left; else if (lclu > pr->lclu) node = node->rb_right; else return pr; } if (pr && lclu < pr->lclu) return pr; if (pr && lclu > pr->lclu) { node = rb_next(&pr->rb_node); return node ? 
rb_entry(node, struct pending_reservation, rb_node) : NULL; } return NULL; } /* * get_rsvd - calculates and returns the number of cluster reservations to be * released when removing a block range from the extent status tree * and releases any pending reservations within the range * * @inode - file containing block range * @end - last block in range * @right_es - pointer to extent containing next block beyond end or NULL * @rc - pointer to reserved count data * * The number of reservations to be released is equal to the number of * clusters containing delayed blocks within the range, minus the number of * clusters still containing delayed blocks at the ends of the range, and * minus the number of pending reservations within the range. */ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, struct extent_status *right_es, struct rsvd_count *rc) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct pending_reservation *pr; struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; struct rb_node *node; ext4_lblk_t first_lclu, last_lclu; bool left_delayed, right_delayed, count_pending; struct extent_status *es; if (sbi->s_cluster_ratio > 1) { /* count any remaining partial cluster */ if (rc->partial) rc->ndelayed++; if (rc->ndelayed == 0) return 0; first_lclu = EXT4_B2C(sbi, rc->first_do_lblk); last_lclu = EXT4_B2C(sbi, rc->last_do_lblk); /* * decrease the delayed count by the number of clusters at the * ends of the range that still contain delayed blocks - * these clusters still need to be reserved */ left_delayed = right_delayed = false; es = rc->left_es; while (es && ext4_es_end(es) >= EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) { if (ext4_es_is_delayed(es)) { rc->ndelayed--; left_delayed = true; break; } node = rb_prev(&es->rb_node); if (!node) break; es = rb_entry(node, struct extent_status, rb_node); } if (right_es && (!left_delayed || first_lclu != last_lclu)) { if (end < ext4_es_end(right_es)) { es = right_es; } else { node = rb_next(&right_es->rb_node); es = node ? rb_entry(node, struct extent_status, rb_node) : NULL; } while (es && es->es_lblk <= EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) { if (ext4_es_is_delayed(es)) { rc->ndelayed--; right_delayed = true; break; } node = rb_next(&es->rb_node); if (!node) break; es = rb_entry(node, struct extent_status, rb_node); } } /* * Determine the block range that should be searched for * pending reservations, if any. Clusters on the ends of the * original removed range containing delayed blocks are * excluded. They've already been accounted for and it's not * possible to determine if an associated pending reservation * should be released with the information available in the * extents status tree. 
*/ if (first_lclu == last_lclu) { if (left_delayed | right_delayed) count_pending = false; else count_pending = true; } else { if (left_delayed) first_lclu++; if (right_delayed) last_lclu--; if (first_lclu <= last_lclu) count_pending = true; else count_pending = false; } /* * a pending reservation found between first_lclu and last_lclu * represents an allocated cluster that contained at least one * delayed block, so the delayed total must be reduced by one * for each pending reservation found and released */ if (count_pending) { pr = __pr_tree_search(&tree->root, first_lclu); while (pr && pr->lclu <= last_lclu) { rc->ndelayed--; node = rb_next(&pr->rb_node); rb_erase(&pr->rb_node, &tree->root); __free_pending(pr); if (!node) break; pr = rb_entry(node, struct pending_reservation, rb_node); } } } return rc->ndelayed; } /* * __es_remove_extent - removes block range from extent status tree * * @inode - file containing range * @lblk - first block in range * @end - last block in range * @status - the extent status to be checked * @reserved - number of cluster reservations released * @res - return the extent if the status is not match * @prealloc - pre-allocated es to avoid memory allocation failures * * If @reserved is not NULL and delayed allocation is enabled, counts * block/cluster reservations freed by removing range and if bigalloc * enabled cancels pending reservations as needed. If @status is not * zero, check extent status type while removing extent, return -EINVAL * and pass out the extent through @res if not match. Returns 0 on * success, error code on failure. */ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t end, unsigned int status, int *reserved, struct extent_status *res, struct extent_status *prealloc) { struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; struct rb_node *node; struct extent_status *es; struct extent_status orig_es; ext4_lblk_t len1, len2; ext4_fsblk_t block; int err; bool count_reserved = true; struct rsvd_count rc; if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC)) count_reserved = false; if (status == 0) status = ES_TYPE_MASK; es = __es_tree_search(&tree->root, lblk); if (!es) return 0; if (es->es_lblk > end) return 0; err = __es_check_extent_status(es, status, res); if (err) return err; /* Simply invalidate cache_es. */ tree->cache_es = NULL; if (count_reserved) init_rsvd(inode, lblk, es, &rc); orig_es.es_lblk = es->es_lblk; orig_es.es_len = es->es_len; orig_es.es_pblk = es->es_pblk; len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0; len2 = ext4_es_end(es) > end ? 
ext4_es_end(es) - end : 0; if (len1 > 0) es->es_len = len1; if (len2 > 0) { if (len1 > 0) { struct extent_status newes; newes.es_lblk = end + 1; newes.es_len = len2; block = 0x7FDEADBEEFULL; if (ext4_es_is_written(&orig_es) || ext4_es_is_unwritten(&orig_es)) block = ext4_es_pblock(&orig_es) + orig_es.es_len - len2; ext4_es_store_pblock_status(&newes, block, ext4_es_status(&orig_es)); err = __es_insert_extent(inode, &newes, prealloc); if (err) { if (!ext4_es_must_keep(&newes)) return 0; es->es_lblk = orig_es.es_lblk; es->es_len = orig_es.es_len; return err; } } else { es->es_lblk = end + 1; es->es_len = len2; if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) { block = orig_es.es_pblk + orig_es.es_len - len2; ext4_es_store_pblock(es, block); } } if (count_reserved) count_rsvd(inode, orig_es.es_lblk + len1, orig_es.es_len - len1 - len2, &orig_es, &rc); goto out; } if (len1 > 0) { if (count_reserved) count_rsvd(inode, lblk, orig_es.es_len - len1, &orig_es, &rc); node = rb_next(&es->rb_node); if (node) es = rb_entry(node, struct extent_status, rb_node); else es = NULL; } while (es && ext4_es_end(es) <= end) { err = __es_check_extent_status(es, status, res); if (err) return err; if (count_reserved) count_rsvd(inode, es->es_lblk, es->es_len, es, &rc); node = rb_next(&es->rb_node); rb_erase(&es->rb_node, &tree->root); ext4_es_free_extent(inode, es); if (!node) { es = NULL; break; } es = rb_entry(node, struct extent_status, rb_node); } if (es && es->es_lblk < end + 1) { ext4_lblk_t orig_len = es->es_len; err = __es_check_extent_status(es, status, res); if (err) return err; len1 = ext4_es_end(es) - end; if (count_reserved) count_rsvd(inode, es->es_lblk, orig_len - len1, es, &rc); es->es_lblk = end + 1; es->es_len = len1; if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) { block = es->es_pblk + orig_len - len1; ext4_es_store_pblock(es, block); } } out: if (count_reserved) *reserved = get_rsvd(inode, end, es, &rc); return 0; } /* * ext4_es_remove_extent - removes block range from extent status tree * * @inode - file containing range * @lblk - first block in range * @len - number of blocks to remove * * Reduces block/cluster reservation count and for bigalloc cancels pending * reservations as needed. */ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len) { ext4_lblk_t end; int err = 0; int reserved = 0; struct extent_status *es = NULL; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; es_debug("remove [%u/%u) from extent status tree of inode %llu\n", lblk, len, inode->i_ino); if (!len) return; end = lblk + len - 1; BUG_ON(end < lblk); retry: if (err && !es) es = __es_alloc_extent(true); /* * ext4_clear_inode() depends on us taking i_es_lock unconditionally * so that we are sure __es_shrink() is done with the inode before it * is reclaimed. */ write_lock(&EXT4_I(inode)->i_es_lock); err = __es_remove_extent(inode, lblk, end, 0, &reserved, NULL, es); if (err) goto error; /* Free preallocated extent if it didn't get used. 
*/ if (es) { if (!es->es_len) __es_free_extent(es); es = NULL; } ext4_es_inc_seq(inode); error: write_unlock(&EXT4_I(inode)->i_es_lock); if (err) goto retry; trace_ext4_es_remove_extent(inode, lblk, len); ext4_es_print_tree(inode); ext4_da_release_space(inode, reserved); } static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, struct ext4_inode_info *locked_ei) { struct ext4_inode_info *ei; struct ext4_es_stats *es_stats; ktime_t start_time; u64 scan_time; int nr_to_walk; int nr_shrunk = 0; int retried = 0, nr_skipped = 0; es_stats = &sbi->s_es_stats; start_time = ktime_get(); retry: spin_lock(&sbi->s_es_lock); nr_to_walk = sbi->s_es_nr_inode; while (nr_to_walk-- > 0) { if (list_empty(&sbi->s_es_list)) { spin_unlock(&sbi->s_es_lock); goto out; } ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info, i_es_list); /* Move the inode to the tail */ list_move_tail(&ei->i_es_list, &sbi->s_es_list); /* * Normally we try hard to avoid shrinking precached inodes, * but we will as a last resort. */ if (!retried && ext4_test_inode_state(&ei->vfs_inode, EXT4_STATE_EXT_PRECACHED)) { nr_skipped++; continue; } if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) { nr_skipped++; continue; } /* * Now we hold i_es_lock which protects us from inode reclaim * freeing inode under us */ spin_unlock(&sbi->s_es_lock); nr_shrunk += es_reclaim_extents(ei, &nr_to_scan); write_unlock(&ei->i_es_lock); if (nr_to_scan <= 0) goto out; spin_lock(&sbi->s_es_lock); } spin_unlock(&sbi->s_es_lock); /* * If we skipped any inodes, and we weren't able to make any * forward progress, try again to scan precached inodes. */ if ((nr_shrunk == 0) && nr_skipped && !retried) { retried++; goto retry; } if (locked_ei && nr_shrunk == 0) nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan); out: scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); if (likely(es_stats->es_stats_scan_time)) es_stats->es_stats_scan_time = (scan_time + es_stats->es_stats_scan_time*3) / 4; else es_stats->es_stats_scan_time = scan_time; if (scan_time > es_stats->es_stats_max_scan_time) es_stats->es_stats_max_scan_time = scan_time; if (likely(es_stats->es_stats_shrunk)) es_stats->es_stats_shrunk = (nr_shrunk + es_stats->es_stats_shrunk*3) / 4; else es_stats->es_stats_shrunk = nr_shrunk; trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, nr_skipped, retried); return nr_shrunk; } static unsigned long ext4_es_count(struct shrinker *shrink, struct shrink_control *sc) { unsigned long nr; struct ext4_sb_info *sbi; sbi = shrink->private_data; nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); return nr; } static unsigned long ext4_es_scan(struct shrinker *shrink, struct shrink_control *sc) { struct ext4_sb_info *sbi = shrink->private_data; int nr_to_scan = sc->nr_to_scan; int ret, nr_shrunk; ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret); nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL); ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret); return nr_shrunk; } int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v) { struct ext4_sb_info *sbi = EXT4_SB((struct super_block *) seq->private); struct ext4_es_stats *es_stats = &sbi->s_es_stats; struct ext4_inode_info *ei, *max = NULL; unsigned int inode_cnt = 0; if (v != SEQ_START_TOKEN) return 0; /* here we just find an inode that has the max nr. 
of objects */ spin_lock(&sbi->s_es_lock); list_for_each_entry(ei, &sbi->s_es_list, i_es_list) { inode_cnt++; if (max && max->i_es_all_nr < ei->i_es_all_nr) max = ei; else if (!max) max = ei; } spin_unlock(&sbi->s_es_lock); seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n", percpu_counter_sum_positive(&es_stats->es_stats_all_cnt), percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt)); seq_printf(seq, " %lld/%lld cache hits/misses\n", percpu_counter_sum_positive(&es_stats->es_stats_cache_hits), percpu_counter_sum_positive(&es_stats->es_stats_cache_misses)); if (inode_cnt) seq_printf(seq, " %d inodes on list\n", inode_cnt); seq_printf(seq, "average:\n %llu us scan time\n", div_u64(es_stats->es_stats_scan_time, 1000)); seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk); if (inode_cnt) seq_printf(seq, "maximum:\n %llu inode (%u objects, %u reclaimable)\n" " %llu us max scan time\n", max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr, div_u64(es_stats->es_stats_max_scan_time, 1000)); return 0; } int ext4_es_register_shrinker(struct ext4_sb_info *sbi) { int err; /* Make sure we have enough bits for physical block number */ BUILD_BUG_ON(ES_SHIFT < 48); INIT_LIST_HEAD(&sbi->s_es_list); sbi->s_es_nr_inode = 0; spin_lock_init(&sbi->s_es_lock); sbi->s_es_stats.es_stats_shrunk = 0; err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_hits, 0, GFP_KERNEL); if (err) return err; err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_misses, 0, GFP_KERNEL); if (err) goto err1; sbi->s_es_stats.es_stats_scan_time = 0; sbi->s_es_stats.es_stats_max_scan_time = 0; err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL); if (err) goto err2; err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL); if (err) goto err3; sbi->s_es_shrinker = shrinker_alloc(0, "ext4-es:%s", sbi->s_sb->s_id); if (!sbi->s_es_shrinker) { err = -ENOMEM; goto err4; } sbi->s_es_shrinker->scan_objects = ext4_es_scan; sbi->s_es_shrinker->count_objects = ext4_es_count; sbi->s_es_shrinker->private_data = sbi; shrinker_register(sbi->s_es_shrinker); return 0; err4: percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); err3: percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); err2: percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses); err1: percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits); return err; } void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) { percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits); percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses); percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); shrinker_free(sbi->s_es_shrinker); } /* * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at * most *nr_to_scan extents, update *nr_to_scan accordingly. * * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan. * Increment *nr_shrunk by the number of reclaimed extents. Also update * ei->i_es_shrink_lblk to where we should continue scanning. 
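 *
 * For example (illustrative numbers only): if a previous pass stopped at
 * ei->i_es_shrink_lblk == 1000, the caller es_reclaim_extents() first scans
 * [1000, EXT_MAX_BLOCKS] and, if the end of the tree is reached before
 * *nr_to_scan is exhausted, wraps around and scans [0, 999].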
*/ static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end, int *nr_to_scan, int *nr_shrunk) { struct inode *inode = &ei->vfs_inode; struct ext4_es_tree *tree = &ei->i_es_tree; struct extent_status *es; struct rb_node *node; es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk); if (!es) goto out_wrap; while (*nr_to_scan > 0) { if (es->es_lblk > end) { ei->i_es_shrink_lblk = end + 1; return 0; } (*nr_to_scan)--; node = rb_next(&es->rb_node); if (ext4_es_must_keep(es)) goto next; if (ext4_es_is_referenced(es)) { ext4_es_clear_referenced(es); goto next; } rb_erase(&es->rb_node, &tree->root); ext4_es_free_extent(inode, es); (*nr_shrunk)++; next: if (!node) goto out_wrap; es = rb_entry(node, struct extent_status, rb_node); } ei->i_es_shrink_lblk = es->es_lblk; return 1; out_wrap: ei->i_es_shrink_lblk = 0; return 0; } static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan) { struct inode *inode = &ei->vfs_inode; int nr_shrunk = 0; ext4_lblk_t start = ei->i_es_shrink_lblk; static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); if (ei->i_es_shk_nr == 0) return 0; if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) && __ratelimit(&_rs)) ext4_warning(inode->i_sb, "forced shrink of precached extents"); if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) && start != 0) es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk); ei->i_es_tree.cache_es = NULL; return nr_shrunk; } /* * Called to support EXT4_IOC_CLEAR_ES_CACHE. We can only remove * discretionary entries from the extent status cache. (Some entries * must be present for proper operations.) */ void ext4_clear_inode_es(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct extent_status *es; struct ext4_es_tree *tree; struct rb_node *node; write_lock(&ei->i_es_lock); tree = &EXT4_I(inode)->i_es_tree; tree->cache_es = NULL; node = rb_first(&tree->root); while (node) { es = rb_entry(node, struct extent_status, rb_node); node = rb_next(node); if (!ext4_es_must_keep(es)) { rb_erase(&es->rb_node, &tree->root); ext4_es_free_extent(inode, es); } } ext4_clear_inode_state(inode, EXT4_STATE_EXT_PRECACHED); write_unlock(&ei->i_es_lock); } #ifdef ES_DEBUG__ static void ext4_print_pending_tree(struct inode *inode) { struct ext4_pending_tree *tree; struct rb_node *node; struct pending_reservation *pr; printk(KERN_DEBUG "pending reservations for inode %llu:", inode->i_ino); tree = &EXT4_I(inode)->i_pending_tree; node = rb_first(&tree->root); while (node) { pr = rb_entry(node, struct pending_reservation, rb_node); printk(KERN_DEBUG " %u", pr->lclu); node = rb_next(node); } printk(KERN_DEBUG "\n"); } #else #define ext4_print_pending_tree(inode) #endif int __init ext4_init_pending(void) { ext4_pending_cachep = KMEM_CACHE(pending_reservation, SLAB_RECLAIM_ACCOUNT); if (ext4_pending_cachep == NULL) return -ENOMEM; return 0; } void ext4_exit_pending(void) { kmem_cache_destroy(ext4_pending_cachep); } void ext4_init_pending_tree(struct ext4_pending_tree *tree) { tree->root = RB_ROOT; } /* * __get_pending - retrieve a pointer to a pending reservation * * @inode - file containing the pending cluster reservation * @lclu - logical cluster of interest * * Returns a pointer to a pending reservation if it's a member of * the set, and NULL if not. Must be called holding i_es_lock. 
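 *
 * For example (illustrative numbers only): with a bigalloc cluster ratio
 * of 16 blocks per cluster, a caller interested in logical block 35 passes
 * lclu == EXT4_B2C(sbi, 35) == 2, as __remove_pending() and
 * ext4_is_pending() below do.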
*/ static struct pending_reservation *__get_pending(struct inode *inode, ext4_lblk_t lclu) { struct ext4_pending_tree *tree; struct rb_node *node; struct pending_reservation *pr = NULL; tree = &EXT4_I(inode)->i_pending_tree; node = (&tree->root)->rb_node; while (node) { pr = rb_entry(node, struct pending_reservation, rb_node); if (lclu < pr->lclu) node = node->rb_left; else if (lclu > pr->lclu) node = node->rb_right; else if (lclu == pr->lclu) return pr; } return NULL; } /* * __insert_pending - adds a pending cluster reservation to the set of * pending reservations * * @inode - file containing the cluster * @lblk - logical block in the cluster to be added * @prealloc - preallocated pending entry * * Returns 1 on successful insertion and -ENOMEM on failure. If the * pending reservation is already in the set, returns successfully. */ static int __insert_pending(struct inode *inode, ext4_lblk_t lblk, struct pending_reservation **prealloc) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; struct rb_node **p = &tree->root.rb_node; struct rb_node *parent = NULL; struct pending_reservation *pr; ext4_lblk_t lclu; int ret = 0; lclu = EXT4_B2C(sbi, lblk); /* search to find parent for insertion */ while (*p) { parent = *p; pr = rb_entry(parent, struct pending_reservation, rb_node); if (lclu < pr->lclu) { p = &(*p)->rb_left; } else if (lclu > pr->lclu) { p = &(*p)->rb_right; } else { /* pending reservation already inserted */ goto out; } } if (likely(*prealloc == NULL)) { pr = __alloc_pending(false); if (!pr) { ret = -ENOMEM; goto out; } } else { pr = *prealloc; *prealloc = NULL; } pr->lclu = lclu; rb_link_node(&pr->rb_node, parent, p); rb_insert_color(&pr->rb_node, &tree->root); ret = 1; out: return ret; } /* * __remove_pending - removes a pending cluster reservation from the set * of pending reservations * * @inode - file containing the cluster * @lblk - logical block in the pending cluster reservation to be removed * * Returns successfully if pending reservation is not a member of the set. */ static void __remove_pending(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct pending_reservation *pr; struct ext4_pending_tree *tree; pr = __get_pending(inode, EXT4_B2C(sbi, lblk)); if (pr != NULL) { tree = &EXT4_I(inode)->i_pending_tree; rb_erase(&pr->rb_node, &tree->root); __free_pending(pr); } } /* * ext4_remove_pending - removes a pending cluster reservation from the set * of pending reservations * * @inode - file containing the cluster * @lblk - logical block in the pending cluster reservation to be removed * * Locking for external use of __remove_pending. */ void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk) { struct ext4_inode_info *ei = EXT4_I(inode); write_lock(&ei->i_es_lock); __remove_pending(inode, lblk); write_unlock(&ei->i_es_lock); } /* * ext4_is_pending - determine whether a cluster has a pending reservation * on it * * @inode - file containing the cluster * @lblk - logical block in the cluster * * Returns true if there's a pending reservation for the cluster in the * set of pending reservations, and false if not. 
*/ bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); bool ret; read_lock(&ei->i_es_lock); ret = (bool)(__get_pending(inode, EXT4_B2C(sbi, lblk)) != NULL); read_unlock(&ei->i_es_lock); return ret; } /* * ext4_es_insert_delayed_extent - adds some delayed blocks to the extents * status tree, adding a pending reservation * where needed * * @inode - file containing the newly added block * @lblk - start logical block to be added * @len - length of blocks to be added * @lclu_allocated/end_allocated - indicates whether a physical cluster has * been allocated for the logical cluster * that contains the start/end block. Note that * end_allocated should always be set to false * if the start and the end block are in the * same cluster */ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, bool lclu_allocated, bool end_allocated) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct extent_status newes; ext4_lblk_t end = lblk + len - 1; int err1 = 0, err2 = 0, err3 = 0; struct extent_status *es1 = NULL; struct extent_status *es2 = NULL; struct pending_reservation *pr1 = NULL; struct pending_reservation *pr2 = NULL; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; es_debug("add [%u/%u) delayed to extent status tree of inode %llu\n", lblk, len, inode->i_ino); if (!len) return; WARN_ON_ONCE((EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) && end_allocated); newes.es_lblk = lblk; newes.es_len = len; ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED); ext4_es_insert_extent_check(inode, &newes); retry: if (err1 && !es1) es1 = __es_alloc_extent(true); if ((err1 || err2) && !es2) es2 = __es_alloc_extent(true); if (err1 || err2 || err3 < 0) { if (lclu_allocated && !pr1) pr1 = __alloc_pending(true); if (end_allocated && !pr2) pr2 = __alloc_pending(true); } write_lock(&EXT4_I(inode)->i_es_lock); err1 = __es_remove_extent(inode, lblk, end, 0, NULL, NULL, es1); if (err1 != 0) goto error; /* Free preallocated extent if it didn't get used. */ if (es1) { if (!es1->es_len) __es_free_extent(es1); es1 = NULL; } err2 = __es_insert_extent(inode, &newes, es2); if (err2 != 0) goto error; /* Free preallocated extent if it didn't get used. */ if (es2) { if (!es2->es_len) __es_free_extent(es2); es2 = NULL; } if (lclu_allocated) { err3 = __insert_pending(inode, lblk, &pr1); if (err3 < 0) goto error; if (pr1) { __free_pending(pr1); pr1 = NULL; } } if (end_allocated) { err3 = __insert_pending(inode, end, &pr2); if (err3 < 0) goto error; if (pr2) { __free_pending(pr2); pr2 = NULL; } } ext4_es_inc_seq(inode); error: write_unlock(&EXT4_I(inode)->i_es_lock); if (err1 || err2 || err3 < 0) goto retry; trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated, end_allocated); ext4_es_print_tree(inode); ext4_print_pending_tree(inode); return; } /* * __revise_pending - makes, cancels, or leaves unchanged pending cluster * reservations for a specified block range depending * upon the presence or absence of delayed blocks * outside the range within clusters at the ends of the * range * * @inode - file containing the range * @lblk - logical block defining the start of range * @len - length of range in blocks * @prealloc - preallocated pending entry * * Used after a newly allocated extent is added to the extents status tree. * Requires that the extents in the range have either written or unwritten * status. Must be called while holding i_es_lock. 
Returns number of new * inserts pending cluster on insert pendings, returns 0 on remove pendings, * return -ENOMEM on failure. */ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, struct pending_reservation **prealloc) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_lblk_t end = lblk + len - 1; ext4_lblk_t first, last; bool f_del = false, l_del = false; int pendings = 0; int ret = 0; if (len == 0) return 0; /* * Two cases - block range within single cluster and block range * spanning two or more clusters. Note that a cluster belonging * to a range starting and/or ending on a cluster boundary is treated * as if it does not contain a delayed extent. The new range may * have allocated space for previously delayed blocks out to the * cluster boundary, requiring that any pre-existing pending * reservation be canceled. Because this code only looks at blocks * outside the range, it should revise pending reservations * correctly even if the extent represented by the range can't be * inserted in the extents status tree due to ENOSPC. */ if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) { first = EXT4_LBLK_CMASK(sbi, lblk); if (first != lblk) f_del = __es_scan_range(inode, &ext4_es_is_delayed, first, lblk - 1); if (f_del) { ret = __insert_pending(inode, first, prealloc); if (ret < 0) goto out; pendings += ret; } else { last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1; if (last != end) l_del = __es_scan_range(inode, &ext4_es_is_delayed, end + 1, last); if (l_del) { ret = __insert_pending(inode, last, prealloc); if (ret < 0) goto out; pendings += ret; } else __remove_pending(inode, last); } } else { first = EXT4_LBLK_CMASK(sbi, lblk); if (first != lblk) f_del = __es_scan_range(inode, &ext4_es_is_delayed, first, lblk - 1); if (f_del) { ret = __insert_pending(inode, first, prealloc); if (ret < 0) goto out; pendings += ret; } else __remove_pending(inode, first); last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1; if (last != end) l_del = __es_scan_range(inode, &ext4_es_is_delayed, end + 1, last); if (l_del) { ret = __insert_pending(inode, last, prealloc); if (ret < 0) goto out; pendings += ret; } else __remove_pending(inode, last); } out: return (ret < 0) ? ret : pendings; } |
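/*
 * Illustrative sketch (not kernel code): the pending-reservation logic above
 * repeatedly maps a logical block range to the bigalloc clusters containing
 * its endpoints. This stand-alone program mirrors what EXT4_B2C() and
 * EXT4_LBLK_CMASK() compute for a power-of-two cluster ratio; the
 * cluster_bits value and the block range are arbitrary example numbers, not
 * values taken from any real filesystem.
 */
#include <stdio.h>

int main(void)
{
	unsigned int cluster_bits = 4;			/* example: 16 blocks per cluster */
	unsigned int ratio = 1U << cluster_bits;
	unsigned int lblk = 42, end = 100;		/* example logical block range */

	/* first block of lblk's cluster and last block of end's cluster */
	unsigned int first = lblk & ~(ratio - 1);
	unsigned int last = (end & ~(ratio - 1)) + ratio - 1;

	printf("range [%u, %u] spans clusters %u..%u\n",
	       lblk, end, lblk >> cluster_bits, end >> cluster_bits);
	printf("blocks scanned for delayed extents outside the range: [%u, %u] and [%u, %u]\n",
	       first, lblk - 1, end + 1, last);
	return 0;
}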
// SPDX-License-Identifier: GPL-2.0 /* * blk-integrity.c - Block layer data integrity extensions * * Copyright (C) 2007, 2008 Oracle Corporation * Written by: Martin K. Petersen <martin.petersen@oracle.com> */ #include <linux/blk-integrity.h> #include <linux/backing-dev.h> #include <linux/mempool.h> #include <linux/bio.h> #include <linux/scatterlist.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/t10-pi.h> #include "blk.h" /** * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements * @q: request queue * @bio: bio with integrity metadata attached * * Description: Returns the number of elements required in a * scatterlist corresponding to the integrity metadata in a bio.
*/ int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio) { struct bio_vec iv, ivprv = { NULL }; unsigned int segments = 0; unsigned int seg_size = 0; struct bvec_iter iter; int prev = 0; bio_for_each_integrity_vec(iv, bio, iter) { if (prev) { if (!biovec_phys_mergeable(q, &ivprv, &iv)) goto new_segment; if (seg_size + iv.bv_len > queue_max_segment_size(q)) goto new_segment; seg_size += iv.bv_len; } else { new_segment: segments++; seg_size = iv.bv_len; } prev = 1; ivprv = iv; } return segments; } int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd, struct logical_block_metadata_cap __user *argp) { struct blk_integrity *bi; struct logical_block_metadata_cap meta_cap = {}; size_t usize = _IOC_SIZE(cmd); if (!extensible_ioctl_valid(cmd, FS_IOC_GETLBMD_CAP, LBMD_SIZE_VER0)) return -ENOIOCTLCMD; bi = blk_get_integrity(bdev->bd_disk); if (!bi) goto out; if (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) meta_cap.lbmd_flags |= LBMD_PI_CAP_INTEGRITY; if (bi->flags & BLK_INTEGRITY_REF_TAG) meta_cap.lbmd_flags |= LBMD_PI_CAP_REFTAG; meta_cap.lbmd_interval = 1 << bi->interval_exp; meta_cap.lbmd_size = bi->metadata_size; meta_cap.lbmd_pi_size = bi->pi_tuple_size; meta_cap.lbmd_pi_offset = bi->pi_offset; meta_cap.lbmd_opaque_size = bi->metadata_size - bi->pi_tuple_size; if (meta_cap.lbmd_opaque_size && !bi->pi_offset) meta_cap.lbmd_opaque_offset = bi->pi_tuple_size; switch (bi->csum_type) { case BLK_INTEGRITY_CSUM_NONE: meta_cap.lbmd_guard_tag_type = LBMD_PI_CSUM_NONE; break; case BLK_INTEGRITY_CSUM_IP: meta_cap.lbmd_guard_tag_type = LBMD_PI_CSUM_IP; break; case BLK_INTEGRITY_CSUM_CRC: meta_cap.lbmd_guard_tag_type = LBMD_PI_CSUM_CRC16_T10DIF; break; case BLK_INTEGRITY_CSUM_CRC64: meta_cap.lbmd_guard_tag_type = LBMD_PI_CSUM_CRC64_NVME; break; } if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE) meta_cap.lbmd_app_tag_size = 2; if (bi->flags & BLK_INTEGRITY_REF_TAG) { switch (bi->csum_type) { case BLK_INTEGRITY_CSUM_CRC64: meta_cap.lbmd_ref_tag_size = sizeof_field(struct crc64_pi_tuple, ref_tag); break; case BLK_INTEGRITY_CSUM_CRC: case BLK_INTEGRITY_CSUM_IP: meta_cap.lbmd_ref_tag_size = sizeof_field(struct t10_pi_tuple, ref_tag); break; default: break; } } out: return copy_struct_to_user(argp, usize, &meta_cap, sizeof(meta_cap), NULL); } int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes) { int ret; struct iov_iter iter; iov_iter_ubuf(&iter, rq_data_dir(rq), ubuf, bytes); ret = bio_integrity_map_user(rq->bio, &iter); if (ret) return ret; rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q, rq->bio); rq->cmd_flags |= REQ_INTEGRITY; return 0; } EXPORT_SYMBOL_GPL(blk_rq_integrity_map_user); bool blk_integrity_merge_rq(struct request_queue *q, struct request *req, struct request *next) { struct bio_integrity_payload *bip, *bip_next; if (blk_integrity_rq(req) == 0 && blk_integrity_rq(next) == 0) return true; if (blk_integrity_rq(req) == 0 || blk_integrity_rq(next) == 0) return false; bip = bio_integrity(req->bio); bip_next = bio_integrity(next->bio); if (bip->bip_flags != bip_next->bip_flags) return false; if (bip->bip_flags & BIP_CHECK_APPTAG && bip->app_tag != bip_next->app_tag) return false; if (req->nr_integrity_segments + next->nr_integrity_segments > q->limits.max_integrity_segments) return false; if (integrity_req_gap_back_merge(req, next->bio)) return false; return true; } bool blk_integrity_merge_bio(struct request_queue *q, struct request *req, struct bio *bio) { struct bio_integrity_payload *bip, *bip_bio = bio_integrity(bio); int 
nr_integrity_segs; if (blk_integrity_rq(req) == 0 && bip_bio == NULL) return true; if (blk_integrity_rq(req) == 0 || bip_bio == NULL) return false; bip = bio_integrity(req->bio); if (bip->bip_flags != bip_bio->bip_flags) return false; if (bip->bip_flags & BIP_CHECK_APPTAG && bip->app_tag != bip_bio->app_tag) return false; nr_integrity_segs = blk_rq_count_integrity_sg(q, bio); if (req->nr_integrity_segments + nr_integrity_segs > q->limits.max_integrity_segments) return false; return true; } static inline struct blk_integrity *dev_to_bi(struct device *dev) { return &dev_to_disk(dev)->queue->limits.integrity; } const char *blk_integrity_profile_name(struct blk_integrity *bi) { switch (bi->csum_type) { case BLK_INTEGRITY_CSUM_IP: if (bi->flags & BLK_INTEGRITY_REF_TAG) return "T10-DIF-TYPE1-IP"; return "T10-DIF-TYPE3-IP"; case BLK_INTEGRITY_CSUM_CRC: if (bi->flags & BLK_INTEGRITY_REF_TAG) return "T10-DIF-TYPE1-CRC"; return "T10-DIF-TYPE3-CRC"; case BLK_INTEGRITY_CSUM_CRC64: if (bi->flags & BLK_INTEGRITY_REF_TAG) return "EXT-DIF-TYPE1-CRC64"; return "EXT-DIF-TYPE3-CRC64"; case BLK_INTEGRITY_CSUM_NONE: break; } return "nop"; } EXPORT_SYMBOL_GPL(blk_integrity_profile_name); static ssize_t flag_store(struct device *dev, const char *page, size_t count, unsigned char flag) { struct request_queue *q = dev_to_disk(dev)->queue; struct queue_limits lim; unsigned long val; int err; err = kstrtoul(page, 10, &val); if (err) return err; /* note that the flags are inverted vs the values in the sysfs files */ lim = queue_limits_start_update(q); if (val) lim.integrity.flags &= ~flag; else lim.integrity.flags |= flag; err = queue_limits_commit_update_frozen(q, &lim); if (err) return err; return count; } static ssize_t flag_show(struct device *dev, char *page, unsigned char flag) { struct blk_integrity *bi = dev_to_bi(dev); return sysfs_emit(page, "%d\n", !(bi->flags & flag)); } static ssize_t format_show(struct device *dev, struct device_attribute *attr, char *page) { struct blk_integrity *bi = dev_to_bi(dev); if (!bi->metadata_size) return sysfs_emit(page, "none\n"); return sysfs_emit(page, "%s\n", blk_integrity_profile_name(bi)); } static ssize_t tag_size_show(struct device *dev, struct device_attribute *attr, char *page) { struct blk_integrity *bi = dev_to_bi(dev); return sysfs_emit(page, "%u\n", bi->tag_size); } static ssize_t protection_interval_bytes_show(struct device *dev, struct device_attribute *attr, char *page) { struct blk_integrity *bi = dev_to_bi(dev); return sysfs_emit(page, "%u\n", bi->interval_exp ? 
1 << bi->interval_exp : 0); } static ssize_t read_verify_store(struct device *dev, struct device_attribute *attr, const char *page, size_t count) { return flag_store(dev, page, count, BLK_INTEGRITY_NOVERIFY); } static ssize_t read_verify_show(struct device *dev, struct device_attribute *attr, char *page) { return flag_show(dev, page, BLK_INTEGRITY_NOVERIFY); } static ssize_t write_generate_store(struct device *dev, struct device_attribute *attr, const char *page, size_t count) { return flag_store(dev, page, count, BLK_INTEGRITY_NOGENERATE); } static ssize_t write_generate_show(struct device *dev, struct device_attribute *attr, char *page) { return flag_show(dev, page, BLK_INTEGRITY_NOGENERATE); } static ssize_t device_is_integrity_capable_show(struct device *dev, struct device_attribute *attr, char *page) { struct blk_integrity *bi = dev_to_bi(dev); return sysfs_emit(page, "%u\n", !!(bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE)); } static DEVICE_ATTR_RO(format); static DEVICE_ATTR_RO(tag_size); static DEVICE_ATTR_RO(protection_interval_bytes); static DEVICE_ATTR_RW(read_verify); static DEVICE_ATTR_RW(write_generate); static DEVICE_ATTR_RO(device_is_integrity_capable); static struct attribute *integrity_attrs[] = { &dev_attr_format.attr, &dev_attr_tag_size.attr, &dev_attr_protection_interval_bytes.attr, &dev_attr_read_verify.attr, &dev_attr_write_generate.attr, &dev_attr_device_is_integrity_capable.attr, NULL }; const struct attribute_group blk_integrity_attr_group = { .name = "integrity", .attrs = integrity_attrs, }; |
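/*
 * Minimal userspace sketch of consuming the sysfs attributes that
 * blk_integrity_attr_group above exposes under /sys/block/<disk>/integrity/.
 * "sda" is only an example device name, and the files may simply be absent
 * when a disk registers no integrity profile.
 */
#include <stdio.h>

static void show_integrity_attr(const char *attr)
{
	char path[128], buf[64];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/block/sda/integrity/%s", attr);
	f = fopen(path, "r");
	if (!f)
		return;
	if (fgets(buf, sizeof(buf), f))
		printf("%s: %s", attr, buf);
	fclose(f);
}

int main(void)
{
	show_integrity_attr("format");
	show_integrity_attr("tag_size");
	show_integrity_attr("protection_interval_bytes");
	show_integrity_attr("read_verify");
	show_integrity_attr("write_generate");
	show_integrity_attr("device_is_integrity_capable");
	return 0;
}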
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef
__LINUX_BITMAP_H #define __LINUX_BITMAP_H #ifndef __ASSEMBLY__ #include <linux/align.h> #include <linux/bitops.h> #include <linux/cleanup.h> #include <linux/errno.h> #include <linux/find.h> #include <linux/limits.h> #include <linux/string.h> #include <linux/types.h> #include <linux/bitmap-str.h> struct device; /* * bitmaps provide bit arrays that consume one or more unsigned * longs. The bitmap interface and available operations are listed * here, in bitmap.h * * Function implementations generic to all architectures are in * lib/bitmap.c. Functions implementations that are architecture * specific are in various arch/<arch>/include/asm/bitops.h headers * and other arch/<arch> specific files. * * See lib/bitmap.c for more details. */ /** * DOC: bitmap overview * * The available bitmap operations and their rough meaning in the * case that the bitmap is a single unsigned long are thus: * * The generated code is more efficient when nbits is known at * compile-time and at most BITS_PER_LONG. * * :: * * bitmap_zero(dst, nbits) *dst = 0UL * bitmap_fill(dst, nbits) *dst = ~0UL * bitmap_copy(dst, src, nbits) *dst = *src * bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2 * bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2 * bitmap_weighted_or(dst, src1, src2, nbits) *dst = *src1 | *src2. Returns Hamming Weight of dst * bitmap_weighted_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2. Returns Hamming Weight of dst * bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2 * bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2) * bitmap_complement(dst, src, nbits) *dst = ~(*src) * bitmap_equal(src1, src2, nbits) Are *src1 and *src2 equal? * bitmap_intersects(src1, src2, nbits) Do *src1 and *src2 overlap? * bitmap_subset(src1, src2, nbits) Is *src1 a subset of *src2? * bitmap_empty(src, nbits) Are all bits zero in *src? * bitmap_full(src, nbits) Are all bits set in *src? 
* bitmap_weight(src, nbits) Hamming Weight: number set bits * bitmap_weight_and(src1, src2, nbits) Hamming Weight of and'ed bitmap * bitmap_weight_andnot(src1, src2, nbits) Hamming Weight of andnot'ed bitmap * bitmap_weight_from(src, start, end) Hamming Weight starting from @start * bitmap_set(dst, pos, nbits) Set specified bit area * bitmap_clear(dst, pos, nbits) Clear specified bit area * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area * bitmap_find_next_zero_area_off(buf, len, pos, n, mask, mask_off) as above * bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n * bitmap_cut(dst, src, first, n, nbits) Cut n bits from first, copy rest * bitmap_replace(dst, old, new, mask, nbits) *dst = (*old & ~(*mask)) | (*new & *mask) * bitmap_scatter(dst, src, mask, nbits) *dst = map(dense, sparse)(src) * bitmap_gather(dst, src, mask, nbits) *dst = map(sparse, dense)(src) * bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src) * bitmap_bitremap(oldbit, old, new, nbits) newbit = map(old, new)(oldbit) * bitmap_onto(dst, orig, relmap, nbits) *dst = orig relative to relmap * bitmap_fold(dst, orig, sz, nbits) dst bits = orig bits mod sz * bitmap_parse(buf, buflen, dst, nbits) Parse bitmap dst from kernel buf * bitmap_parse_user(ubuf, ulen, dst, nbits) Parse bitmap dst from user buf * bitmap_parselist(buf, dst, nbits) Parse bitmap dst from kernel buf * bitmap_parselist_user(buf, dst, nbits) Parse bitmap dst from user buf * bitmap_find_free_region(bitmap, bits, order) Find and allocate bit region * bitmap_release_region(bitmap, pos, order) Free specified bit region * bitmap_allocate_region(bitmap, pos, order) Allocate specified bit region * bitmap_from_arr32(dst, buf, nbits) Copy nbits from u32[] buf to dst * bitmap_from_arr64(dst, buf, nbits) Copy nbits from u64[] buf to dst * bitmap_to_arr32(buf, src, nbits) Copy nbits from buf to u32[] dst * bitmap_to_arr64(buf, src, nbits) Copy nbits from buf to u64[] dst * bitmap_get_value8(map, start) Get 8bit value from map at start * bitmap_set_value8(map, value, start) Set 8bit value to map at start * bitmap_read(map, start, nbits) Read an nbits-sized value from * map at start * bitmap_write(map, value, start, nbits) Write an nbits-sized value to * map at start * * Note, bitmap_zero() and bitmap_fill() operate over the region of * unsigned longs, that is, bits behind bitmap till the unsigned long * boundary will be zeroed or filled as well. Consider to use * bitmap_clear() or bitmap_set() to make explicit zeroing or filling * respectively. */ /** * DOC: bitmap bitops * * Also the following operations in asm/bitops.h apply to bitmaps.:: * * set_bit(bit, addr) *addr |= bit * clear_bit(bit, addr) *addr &= ~bit * change_bit(bit, addr) *addr ^= bit * test_bit(bit, addr) Is bit set in *addr? 
* test_and_set_bit(bit, addr) Set bit and return old value * test_and_clear_bit(bit, addr) Clear bit and return old value * test_and_change_bit(bit, addr) Change bit and return old value * find_first_zero_bit(addr, nbits) Position first zero bit in *addr * find_first_bit(addr, nbits) Position first set bit in *addr * find_next_zero_bit(addr, nbits, bit) * Position next zero bit in *addr >= bit * find_next_bit(addr, nbits, bit) Position next set bit in *addr >= bit * find_next_and_bit(addr1, addr2, nbits, bit) * Same as find_next_bit, but in * (*addr1 & *addr2) * */ /** * DOC: declare bitmap * The DECLARE_BITMAP(name,bits) macro, in linux/types.h, can be used * to declare an array named 'name' of just enough unsigned longs to * contain all bit positions from 0 to 'bits' - 1. */ /* * Allocation and deallocation of bitmap. * Provided in lib/bitmap.c to avoid circular dependency. */ unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags); unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags); unsigned long *bitmap_alloc_node(unsigned int nbits, gfp_t flags, int node); unsigned long *bitmap_zalloc_node(unsigned int nbits, gfp_t flags, int node); void bitmap_free(const unsigned long *bitmap); DEFINE_FREE(bitmap, unsigned long *, if (_T) bitmap_free(_T)) /* Managed variants of the above. */ unsigned long *devm_bitmap_alloc(struct device *dev, unsigned int nbits, gfp_t flags); unsigned long *devm_bitmap_zalloc(struct device *dev, unsigned int nbits, gfp_t flags); /* * lib/bitmap.c provides these functions: */ bool __bitmap_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); bool __pure __bitmap_or_equal(const unsigned long *src1, const unsigned long *src2, const unsigned long *src3, unsigned int nbits); void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int nbits); void __bitmap_shift_right(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits); void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits); void bitmap_cut(unsigned long *dst, const unsigned long *src, unsigned int first, unsigned int cut, unsigned int nbits); bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); unsigned int __bitmap_weighted_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); unsigned int __bitmap_weighted_xor(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); void __bitmap_replace(unsigned long *dst, const unsigned long *old, const unsigned long *new, const unsigned long *mask, unsigned int nbits); bool __bitmap_intersects(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); bool __bitmap_subset(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); unsigned int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits); unsigned int __bitmap_weight_and(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); unsigned int __bitmap_weight_andnot(const unsigned 
long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); void __bitmap_set(unsigned long *map, unsigned int start, int len); void __bitmap_clear(unsigned long *map, unsigned int start, int len); unsigned long bitmap_find_next_zero_area_off(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, unsigned long align_mask, unsigned long align_offset); /** * bitmap_find_next_zero_area - find a contiguous aligned zero area * @map: The address to base the search on * @size: The bitmap size in bits * @start: The bitnumber to start searching at * @nr: The number of zeroed bits we're looking for * @align_mask: Alignment mask for zero area * * The @align_mask should be one less than a power of 2; the effect is that * the bit offset of all zero areas this function finds is multiples of that * power of 2. A @align_mask of 0 means no alignment is required. */ static __always_inline unsigned long bitmap_find_next_zero_area(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, unsigned long align_mask) { return bitmap_find_next_zero_area_off(map, size, start, nr, align_mask, 0); } void bitmap_remap(unsigned long *dst, const unsigned long *src, const unsigned long *old, const unsigned long *new, unsigned int nbits); int bitmap_bitremap(int oldbit, const unsigned long *old, const unsigned long *new, int bits); void bitmap_onto(unsigned long *dst, const unsigned long *orig, const unsigned long *relmap, unsigned int bits); void bitmap_fold(unsigned long *dst, const unsigned long *orig, unsigned int sz, unsigned int nbits); #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) #define bitmap_size(nbits) (ALIGN(nbits, BITS_PER_LONG) / BITS_PER_BYTE) static __always_inline void bitmap_zero(unsigned long *dst, unsigned int nbits) { unsigned int len = bitmap_size(nbits); if (small_const_nbits(nbits)) *dst = 0; else memset(dst, 0, len); } static __always_inline void bitmap_fill(unsigned long *dst, unsigned int nbits) { unsigned int len = bitmap_size(nbits); if (small_const_nbits(nbits)) *dst = ~0UL; else memset(dst, 0xff, len); } static __always_inline void bitmap_copy(unsigned long *dst, const unsigned long *src, unsigned int nbits) { unsigned int len = bitmap_size(nbits); if (small_const_nbits(nbits)) *dst = *src; else memcpy(dst, src, len); } /* * Copy bitmap and clear tail bits in last word. */ static __always_inline void bitmap_copy_clear_tail(unsigned long *dst, const unsigned long *src, unsigned int nbits) { bitmap_copy(dst, src, nbits); if (nbits % BITS_PER_LONG) dst[nbits / BITS_PER_LONG] &= BITMAP_LAST_WORD_MASK(nbits); } static inline void bitmap_copy_and_extend(unsigned long *to, const unsigned long *from, unsigned int count, unsigned int size) { unsigned int copy = BITS_TO_LONGS(count); memcpy(to, from, copy * sizeof(long)); if (count % BITS_PER_LONG) to[copy - 1] &= BITMAP_LAST_WORD_MASK(count); memset(to + copy, 0, bitmap_size(size) - copy * sizeof(long)); } /* * On 32-bit systems bitmaps are represented as u32 arrays internally. On LE64 * machines the order of hi and lo parts of numbers match the bitmap structure. * In both cases conversion is not needed when copying data from/to arrays of * u32. But in LE64 case, typecast in bitmap_copy_clear_tail() may lead * to out-of-bound access. To avoid that, both LE and BE variants of 64-bit * architectures are not using bitmap_copy_clear_tail(). 
*/ #if BITS_PER_LONG == 64 void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, unsigned int nbits); void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap, unsigned int nbits); #else #define bitmap_from_arr32(bitmap, buf, nbits) \ bitmap_copy_clear_tail((unsigned long *) (bitmap), \ (const unsigned long *) (buf), (nbits)) #define bitmap_to_arr32(buf, bitmap, nbits) \ bitmap_copy_clear_tail((unsigned long *) (buf), \ (const unsigned long *) (bitmap), (nbits)) #endif /* * On 64-bit systems bitmaps are represented as u64 arrays internally. So, * the conversion is not needed when copying data from/to arrays of u64. */ #if BITS_PER_LONG == 32 void bitmap_from_arr64(unsigned long *bitmap, const u64 *buf, unsigned int nbits); void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits); #else #define bitmap_from_arr64(bitmap, buf, nbits) \ bitmap_copy_clear_tail((unsigned long *)(bitmap), (const unsigned long *)(buf), (nbits)) #define bitmap_to_arr64(buf, bitmap, nbits) \ bitmap_copy_clear_tail((unsigned long *)(buf), (const unsigned long *)(bitmap), (nbits)) #endif static __always_inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return (*dst = *src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)) != 0; return __bitmap_and(dst, src1, src2, nbits); } static __always_inline void bitmap_or(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = *src1 | *src2; else __bitmap_or(dst, src1, src2, nbits); } static __always_inline unsigned int bitmap_weighted_or(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) { *dst = *src1 | *src2; return hweight_long(*dst & BITMAP_LAST_WORD_MASK(nbits)); } else { return __bitmap_weighted_or(dst, src1, src2, nbits); } } static __always_inline unsigned int bitmap_weighted_xor(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) { *dst = *src1 ^ *src2; return hweight_long(*dst & BITMAP_LAST_WORD_MASK(nbits)); } else { return __bitmap_weighted_xor(dst, src1, src2, nbits); } } static __always_inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = *src1 ^ *src2; else __bitmap_xor(dst, src1, src2, nbits); } static __always_inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return (*dst = *src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0; return __bitmap_andnot(dst, src1, src2, nbits); } static __always_inline void bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = ~(*src); else __bitmap_complement(dst, src, nbits); } #ifdef __LITTLE_ENDIAN #define BITMAP_MEM_ALIGNMENT 8 #else #define BITMAP_MEM_ALIGNMENT (8 * sizeof(unsigned long)) #endif #define BITMAP_MEM_MASK (BITMAP_MEM_ALIGNMENT - 1) static __always_inline bool bitmap_equal(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return !((*src1 ^ *src2) & BITMAP_LAST_WORD_MASK(nbits)); if (__builtin_constant_p(nbits & BITMAP_MEM_MASK) && IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT)) return !memcmp(src1, src2, nbits / 8); return __bitmap_equal(src1, src2, 
nbits); } /** * bitmap_or_equal - Check whether the or of two bitmaps is equal to a third * @src1: Pointer to bitmap 1 * @src2: Pointer to bitmap 2 will be or'ed with bitmap 1 * @src3: Pointer to bitmap 3. Compare to the result of *@src1 | *@src2 * @nbits: number of bits in each of these bitmaps * * Returns: True if (*@src1 | *@src2) == *@src3, false otherwise */ static __always_inline bool bitmap_or_equal(const unsigned long *src1, const unsigned long *src2, const unsigned long *src3, unsigned int nbits) { if (!small_const_nbits(nbits)) return __bitmap_or_equal(src1, src2, src3, nbits); return !(((*src1 | *src2) ^ *src3) & BITMAP_LAST_WORD_MASK(nbits)); } static __always_inline bool bitmap_intersects(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return ((*src1 & *src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0; else return __bitmap_intersects(src1, src2, nbits); } static __always_inline bool bitmap_subset(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return ! ((*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits)); else return __bitmap_subset(src1, src2, nbits); } static __always_inline bool bitmap_empty(const unsigned long *src, unsigned nbits) { if (small_const_nbits(nbits)) return ! (*src & BITMAP_LAST_WORD_MASK(nbits)); return find_first_bit(src, nbits) == nbits; } static __always_inline bool bitmap_full(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits)); return find_first_zero_bit(src, nbits) == nbits; } static __always_inline unsigned int bitmap_weight(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits)); return __bitmap_weight(src, nbits); } static __always_inline unsigned long bitmap_weight_and(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return hweight_long(*src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)); return __bitmap_weight_and(src1, src2, nbits); } static __always_inline unsigned long bitmap_weight_andnot(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return hweight_long(*src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits)); return __bitmap_weight_andnot(src1, src2, nbits); } /** * bitmap_weight_from - Hamming weight for a memory region * @bitmap: The base address * @start: The bitnumber to starts weighting * @end: the bitmap size in bits * * Returns the number of set bits in the region. If @start >= @end, * return >= end. 
*/ static __always_inline unsigned long bitmap_weight_from(const unsigned long *bitmap, unsigned int start, unsigned int end) { unsigned long w; if (unlikely(start >= end)) return end; if (small_const_nbits(end)) return hweight_long(*bitmap & GENMASK(end - 1, start)); bitmap += start / BITS_PER_LONG; /* Opencode round_down() to not include math.h */ end -= start & ~(BITS_PER_LONG - 1); start %= BITS_PER_LONG; w = bitmap_weight(bitmap, end); if (start) w -= hweight_long(*bitmap & BITMAP_LAST_WORD_MASK(start)); return w; } static __always_inline void bitmap_set(unsigned long *map, unsigned int start, unsigned int nbits) { if (__builtin_constant_p(nbits) && nbits == 1) __set_bit(start, map); else if (small_const_nbits(start + nbits)) *map |= GENMASK(start + nbits - 1, start); else if (__builtin_constant_p(start & BITMAP_MEM_MASK) && IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) && __builtin_constant_p(nbits & BITMAP_MEM_MASK) && IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT)) memset((char *)map + start / 8, 0xff, nbits / 8); else __bitmap_set(map, start, nbits); } static __always_inline void bitmap_clear(unsigned long *map, unsigned int start, unsigned int nbits) { if (__builtin_constant_p(nbits) && nbits == 1) __clear_bit(start, map); else if (small_const_nbits(start + nbits)) *map &= ~GENMASK(start + nbits - 1, start); else if (__builtin_constant_p(start & BITMAP_MEM_MASK) && IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) && __builtin_constant_p(nbits & BITMAP_MEM_MASK) && IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT)) memset((char *)map + start / 8, 0, nbits / 8); else __bitmap_clear(map, start, nbits); } static __always_inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = (*src & BITMAP_LAST_WORD_MASK(nbits)) >> shift; else __bitmap_shift_right(dst, src, shift, nbits); } static __always_inline void bitmap_shift_left(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = (*src << shift) & BITMAP_LAST_WORD_MASK(nbits); else __bitmap_shift_left(dst, src, shift, nbits); } static __always_inline void bitmap_replace(unsigned long *dst, const unsigned long *old, const unsigned long *new, const unsigned long *mask, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = (*old & ~(*mask)) | (*new & *mask); else __bitmap_replace(dst, old, new, mask, nbits); } /** * bitmap_scatter - Scatter a bitmap according to the given mask * @dst: scattered bitmap * @src: gathered bitmap * @mask: mask representing bits to assign to in the scattered bitmap * @nbits: number of bits in each of these bitmaps * * Scatters bitmap with sequential bits according to the given @mask. * * Example: * If @src bitmap = 0x005a, with @mask = 0x1313, @dst will be 0x0302. * * Or in binary form * @src @mask @dst * 0000000001011010 0001001100010011 0000001100000010 * * (Bits 0, 1, 2, 3, 4, 5 are copied to the bits 0, 1, 4, 8, 9, 12) * * A more 'visual' description of the operation:: * * src: 0000000001011010 * |||||| * +------+||||| * | +----+|||| * | |+----+||| * | || +-+|| * | || | || * mask: ...v..vv...v..vv * ...0..11...0..10 * dst: 0000001100000010 * * A relationship exists between bitmap_scatter() and bitmap_gather(). See * bitmap_gather() for the bitmap gather detailed operations. TL;DR: * bitmap_gather() can be seen as the 'reverse' bitmap_scatter() operation. 
*/ static __always_inline void bitmap_scatter(unsigned long *dst, const unsigned long *src, const unsigned long *mask, unsigned int nbits) { unsigned int n = 0; unsigned int bit; bitmap_zero(dst, nbits); for_each_set_bit(bit, mask, nbits) __assign_bit(bit, dst, test_bit(n++, src)); } /** * bitmap_gather - Gather a bitmap according to given mask * @dst: gathered bitmap * @src: scattered bitmap * @mask: mask representing bits to extract from in the scattered bitmap * @nbits: number of bits in each of these bitmaps * * Gathers bitmap with sparse bits according to the given @mask. * * Example: * If @src bitmap = 0x0302, with @mask = 0x1313, @dst will be 0x001a. * * Or in binary form * @src @mask @dst * 0000001100000010 0001001100010011 0000000000011010 * * (Bits 0, 1, 4, 8, 9, 12 are copied to the bits 0, 1, 2, 3, 4, 5) * * A more 'visual' description of the operation:: * * mask: ...v..vv...v..vv * src: 0000001100000010 * ^ ^^ ^ 0 * | || | 10 * | || > 010 * | |+--> 1010 * | +--> 11010 * +----> 011010 * dst: 0000000000011010 * * A relationship exists between bitmap_gather() and bitmap_scatter(). See * bitmap_scatter() for the bitmap scatter detailed operations. TL;DR: * bitmap_scatter() can be seen as the 'reverse' bitmap_gather() operation. * * Suppose scattered computed using bitmap_scatter(scattered, src, mask, n). * The operation bitmap_gather(result, scattered, mask, n) leads to a result * equal or equivalent to src. * * The result can be 'equivalent' because bitmap_scatter() and bitmap_gather() * are not bijective. * The result and src values are equivalent in that sense that a call to * bitmap_scatter(res, src, mask, n) and a call to * bitmap_scatter(res, result, mask, n) will lead to the same res value. */ static __always_inline void bitmap_gather(unsigned long *dst, const unsigned long *src, const unsigned long *mask, unsigned int nbits) { unsigned int n = 0; unsigned int bit; bitmap_zero(dst, nbits); for_each_set_bit(bit, mask, nbits) __assign_bit(n++, dst, test_bit(bit, src)); } static __always_inline void bitmap_next_set_region(unsigned long *bitmap, unsigned int *rs, unsigned int *re, unsigned int end) { *rs = find_next_bit(bitmap, end, *rs); *re = find_next_zero_bit(bitmap, end, *rs + 1); } /** * bitmap_release_region - release allocated bitmap region * @bitmap: array of unsigned longs corresponding to the bitmap * @pos: beginning of bit region to release * @order: region size (log base 2 of number of bits) to release * * This is the complement to __bitmap_find_free_region() and releases * the found region (by clearing it in the bitmap). */ static __always_inline void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order) { bitmap_clear(bitmap, pos, BIT(order)); } /** * bitmap_allocate_region - allocate bitmap region * @bitmap: array of unsigned longs corresponding to the bitmap * @pos: beginning of bit region to allocate * @order: region size (log base 2 of number of bits) to allocate * * Allocate (set bits in) a specified region of a bitmap. * * Returns: 0 on success, or %-EBUSY if specified region wasn't * free (not all bits were zero). 
*/ static __always_inline int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order) { unsigned int len = BIT(order); if (find_next_bit(bitmap, pos + len, pos) < pos + len) return -EBUSY; bitmap_set(bitmap, pos, len); return 0; } /** * bitmap_find_free_region - find a contiguous aligned mem region * @bitmap: array of unsigned longs corresponding to the bitmap * @bits: number of bits in the bitmap * @order: region size (log base 2 of number of bits) to find * * Find a region of free (zero) bits in a @bitmap of @bits bits and * allocate them (set them to one). Only consider regions of length * a power (@order) of two, aligned to that power of two, which * makes the search algorithm much faster. * * Returns: the bit offset in bitmap of the allocated region, * or -errno on failure. */ static __always_inline int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order) { unsigned int pos, end; /* scans bitmap by regions of size order */ for (pos = 0; (end = pos + BIT(order)) <= bits; pos = end) { if (!bitmap_allocate_region(bitmap, pos, order)) return pos; } return -ENOMEM; } /** * BITMAP_FROM_U64() - Represent u64 value in the format suitable for bitmap. * @n: u64 value * * Linux bitmaps are internally arrays of unsigned longs, i.e. 32-bit * integers in 32-bit environment, and 64-bit integers in 64-bit one. * * There are four combinations of endianness and length of the word in linux * ABIs: LE64, BE64, LE32 and BE32. * * On 64-bit kernels 64-bit LE and BE numbers are naturally ordered in * bitmaps and therefore don't require any special handling. * * On 32-bit kernels 32-bit LE ABI orders lo word of 64-bit number in memory * prior to hi, and 32-bit BE orders hi word prior to lo. The bitmap on the * other hand is represented as an array of 32-bit words and the position of * bit N may therefore be calculated as: word #(N/32) and bit #(N%32) in that * word. For example, bit #42 is located at 10th position of 2nd word. * It matches 32-bit LE ABI, and we can simply let the compiler store 64-bit * values in memory as it usually does. But for BE we need to swap hi and lo * words manually. * * With all that, the macro BITMAP_FROM_U64() does explicit reordering of hi and * lo parts of u64. For LE32 it does nothing, and for BE environment it swaps * hi and lo words, as is expected by bitmap. */ #if __BITS_PER_LONG == 64 #define BITMAP_FROM_U64(n) (n) #else #define BITMAP_FROM_U64(n) ((unsigned long) ((u64)(n) & ULONG_MAX)), \ ((unsigned long) ((u64)(n) >> 32)) #endif /** * bitmap_from_u64 - Check and swap words within u64. * @mask: source bitmap * @dst: destination bitmap * * In 32-bit Big Endian kernel, when using ``(u32 *)(&val)[*]`` * to read u64 mask, we will get the wrong word. * That is ``(u32 *)(&val)[0]`` gets the upper 32 bits, * but we expect the lower 32-bits of u64. */ static __always_inline void bitmap_from_u64(unsigned long *dst, u64 mask) { bitmap_from_arr64(dst, &mask, 64); } /** * bitmap_read - read a value of n-bits from the memory region * @map: address to the bitmap memory region * @start: bit offset of the n-bit value * @nbits: size of value in bits, nonzero, up to BITS_PER_LONG * * Returns: value of @nbits bits located at the @start bit offset within the * @map memory region. For @nbits = 0 and @nbits > BITS_PER_LONG the return * value is undefined. 
*/ static __always_inline unsigned long bitmap_read(const unsigned long *map, unsigned long start, unsigned long nbits) { size_t index = BIT_WORD(start); unsigned long offset = start % BITS_PER_LONG; unsigned long space = BITS_PER_LONG - offset; unsigned long value_low, value_high; if (unlikely(!nbits || nbits > BITS_PER_LONG)) return 0; if (space >= nbits) return (map[index] >> offset) & BITMAP_LAST_WORD_MASK(nbits); value_low = map[index] & BITMAP_FIRST_WORD_MASK(start); value_high = map[index + 1] & BITMAP_LAST_WORD_MASK(start + nbits); return (value_low >> offset) | (value_high << space); } /** * bitmap_write - write n-bit value within a memory region * @map: address to the bitmap memory region * @value: value to write, clamped to nbits * @start: bit offset of the n-bit value * @nbits: size of value in bits, nonzero, up to BITS_PER_LONG. * * bitmap_write() behaves as-if implemented as @nbits calls of __assign_bit(), * i.e. bits beyond @nbits are ignored: * * for (bit = 0; bit < nbits; bit++) * __assign_bit(start + bit, bitmap, val & BIT(bit)); * * For @nbits == 0 and @nbits > BITS_PER_LONG no writes are performed. */ static __always_inline void bitmap_write(unsigned long *map, unsigned long value, unsigned long start, unsigned long nbits) { size_t index; unsigned long offset; unsigned long space; unsigned long mask; bool fit; if (unlikely(!nbits || nbits > BITS_PER_LONG)) return; mask = BITMAP_LAST_WORD_MASK(nbits); value &= mask; offset = start % BITS_PER_LONG; space = BITS_PER_LONG - offset; fit = space >= nbits; index = BIT_WORD(start); map[index] &= (fit ? (~(mask << offset)) : ~BITMAP_FIRST_WORD_MASK(start)); map[index] |= value << offset; if (fit) return; map[index + 1] &= BITMAP_FIRST_WORD_MASK(start + nbits); map[index + 1] |= (value >> space); } #define bitmap_get_value8(map, start) \ bitmap_read(map, start, BITS_PER_BYTE) #define bitmap_set_value8(map, value, start) \ bitmap_write(map, value, start, BITS_PER_BYTE) #endif /* __ASSEMBLY__ */ #endif /* __LINUX_BITMAP_H */ |
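/*
 * Sketch of typical in-kernel usage of the bitmap interfaces documented
 * above; it only compiles in a kernel build, and the 64-bit map size and
 * the bit ranges are arbitrary example values.
 */
#include <linux/bitmap.h>
#include <linux/printk.h>

static void bitmap_usage_example(void)
{
	DECLARE_BITMAP(map, 64);

	bitmap_zero(map, 64);		/* clear all 64 bits */
	bitmap_set(map, 4, 8);		/* set bits 4..11 */
	bitmap_clear(map, 6, 2);	/* clear bits 6 and 7 again */

	/* find_first_bit() -> 4, bitmap_weight() -> 6 */
	pr_info("first set bit %lu, weight %u\n",
		find_first_bit(map, 64), bitmap_weight(map, 64));
}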
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_CPUSET_H #define _LINUX_CPUSET_H /* * cpuset interface * * Copyright (C) 2003 BULL SA * Copyright (C) 2004-2006 Silicon Graphics, Inc. * */ #include <linux/sched.h> #include <linux/sched/topology.h> #include <linux/sched/task.h> #include <linux/cpumask.h> #include <linux/nodemask.h> #include <linux/mm.h> #include <linux/mmu_context.h> #include <linux/jump_label.h> extern bool lockdep_is_cpuset_held(void); #ifdef CONFIG_CPUSETS /* * Static branch rewrites can happen in an arbitrary order for a given * key. In code paths where we need to loop with read_mems_allowed_begin() and * read_mems_allowed_retry() to get a consistent view of mems_allowed, we need * to ensure that begin() always gets rewritten before retry() in the * disabled -> enabled transition. If not, then if local irqs are disabled * around the loop, we can deadlock since retry() would always be * comparing the latest value of the mems_allowed seqcount against 0 as * begin() still would see cpusets_enabled() as false. The enabled -> disabled * transition should happen in reverse order for the same reasons (want to stop * looking at real value of mems_allowed.sequence in retry() first). */ extern struct static_key_false cpusets_pre_enable_key; extern struct static_key_false cpusets_enabled_key; extern struct static_key_false cpusets_insane_config_key; static inline bool cpusets_enabled(void) { return static_branch_unlikely(&cpusets_enabled_key); } static inline void cpuset_inc(void) { static_branch_inc_cpuslocked(&cpusets_pre_enable_key); static_branch_inc_cpuslocked(&cpusets_enabled_key); } static inline void cpuset_dec(void) { static_branch_dec_cpuslocked(&cpusets_enabled_key); static_branch_dec_cpuslocked(&cpusets_pre_enable_key); } /* * This will get enabled whenever a cpuset configuration is considered * unsupportable in general. E.g. movable only node which cannot satisfy * any non movable allocations (see update_nodemask). Page allocator * needs to make additional checks for those configurations and this * check is meant to guard those checks without any overhead for sane * configurations.
*/ static inline bool cpusets_insane_config(void) { return static_branch_unlikely(&cpusets_insane_config_key); } extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_force_rebuild(void); extern void cpuset_update_active_cpus(void); extern void inc_dl_tasks_cs(struct task_struct *task); extern void dec_dl_tasks_cs(struct task_struct *task); extern void cpuset_lock(void); extern void cpuset_unlock(void); extern void lockdep_assert_cpuset_lock_held(void); extern void cpuset_cpus_allowed_locked(struct task_struct *p, struct cpumask *mask); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern bool cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask); extern bool cpuset_current_node_allowed(int node, gfp_t gfp_mask); static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { return cpuset_current_node_allowed(zone_to_nid(z), gfp_mask); } static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { if (cpusets_enabled()) return __cpuset_zone_allowed(z, gfp_mask); return true; } extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, const struct task_struct *tsk2); #ifdef CONFIG_CPUSETS_V1 #define cpuset_memory_pressure_bump() \ do { \ if (cpuset_memory_pressure_enabled) \ __cpuset_memory_pressure_bump(); \ } while (0) extern int cpuset_memory_pressure_enabled; extern void __cpuset_memory_pressure_bump(void); #else static inline void cpuset_memory_pressure_bump(void) { } #endif extern void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task); extern int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk); extern int cpuset_mem_spread_node(void); static inline int cpuset_do_page_mem_spread(void) { return task_spread_page(current); } extern bool current_cpuset_is_being_rebound(void); extern void dl_rebuild_rd_accounting(void); extern void rebuild_sched_domains(void); extern void cpuset_print_current_mems_allowed(void); extern void cpuset_reset_sched_domains(void); /* * read_mems_allowed_begin is required when making decisions involving * mems_allowed such as during page allocation. mems_allowed can be updated in * parallel and depending on the new value an operation can fail potentially * causing process failure. A retry loop with read_mems_allowed_begin and * read_mems_allowed_retry prevents these artificial failures. */ static inline unsigned int read_mems_allowed_begin(void) { if (!static_branch_unlikely(&cpusets_pre_enable_key)) return 0; return read_seqcount_begin(¤t->mems_allowed_seq); } /* * If this returns true, the operation that took place after * read_mems_allowed_begin may have failed artificially due to a concurrent * update of mems_allowed. It is up to the caller to retry the operation if * appropriate. 
*/ static inline bool read_mems_allowed_retry(unsigned int seq) { if (!static_branch_unlikely(&cpusets_enabled_key)) return false; return read_seqcount_retry(¤t->mems_allowed_seq, seq); } static inline void set_mems_allowed(nodemask_t nodemask) { unsigned long flags; task_lock(current); local_irq_save(flags); write_seqcount_begin(¤t->mems_allowed_seq); current->mems_allowed = nodemask; write_seqcount_end(¤t->mems_allowed_seq); local_irq_restore(flags); task_unlock(current); } extern void cpuset_nodes_allowed(struct cgroup *cgroup, nodemask_t *mask); #else /* !CONFIG_CPUSETS */ static inline bool cpusets_enabled(void) { return false; } static inline bool cpusets_insane_config(void) { return false; } static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} static inline void cpuset_force_rebuild(void) { } static inline void cpuset_update_active_cpus(void) { partition_sched_domains(1, NULL, NULL); } static inline void inc_dl_tasks_cs(struct task_struct *task) { } static inline void dec_dl_tasks_cs(struct task_struct *task) { } static inline void cpuset_lock(void) { } static inline void cpuset_unlock(void) { } static inline void lockdep_assert_cpuset_lock_held(void) { } static inline void cpuset_cpus_allowed_locked(struct task_struct *p, struct cpumask *mask) { cpumask_copy(mask, task_cpu_possible_mask(p)); } static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { cpuset_cpus_allowed_locked(p, mask); } static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p) { return false; } static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) { return node_possible_map; } #define cpuset_current_mems_allowed (node_states[N_MEMORY]) static inline void cpuset_init_current_mems_allowed(void) {} static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) { return 1; } static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { return true; } static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { return true; } static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, const struct task_struct *tsk2) { return 1; } static inline void cpuset_memory_pressure_bump(void) {} static inline void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { } static inline int cpuset_mem_spread_node(void) { return 0; } static inline int cpuset_do_page_mem_spread(void) { return 0; } static inline bool current_cpuset_is_being_rebound(void) { return false; } static inline void dl_rebuild_rd_accounting(void) { } static inline void rebuild_sched_domains(void) { partition_sched_domains(1, NULL, NULL); } static inline void cpuset_reset_sched_domains(void) { partition_sched_domains(1, NULL, NULL); } static inline void cpuset_print_current_mems_allowed(void) { } static inline void set_mems_allowed(nodemask_t nodemask) { } static inline unsigned int read_mems_allowed_begin(void) { return 0; } static inline bool read_mems_allowed_retry(unsigned int seq) { return false; } static inline void cpuset_nodes_allowed(struct cgroup *cgroup, nodemask_t *mask) { nodes_copy(*mask, node_states[N_MEMORY]); } #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */ |
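/*
 * Editor's illustrative sketch (not part of the header above): the retry-loop
 * shape that the read_mems_allowed_begin()/read_mems_allowed_retry() comments
 * describe. example_alloc_with_stable_mems() and try_alloc_from_mems() are
 * hypothetical names; the latter stands in for whatever work needs a stable
 * view of current->mems_allowed. If a concurrent cpuset update changes the
 * nodemask while that work runs, the cookie check fails and the work is
 * simply redone against the new mask.
 */
static struct page *example_alloc_with_stable_mems(gfp_t gfp_mask, unsigned int order)
{
	struct page *page;
	unsigned int cpuset_mems_cookie;

	do {
		/* Snapshot the mems_allowed seqcount (returns 0 when cpusets are off). */
		cpuset_mems_cookie = read_mems_allowed_begin();
		page = try_alloc_from_mems(gfp_mask, order,
					   &cpuset_current_mems_allowed);
		/* Retry only if the attempt failed *and* mems_allowed changed meanwhile. */
	} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));

	return page;
}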
// SPDX-License-Identifier: GPL-2.0-only /* Kernel thread helper functions. * Copyright (C) 2004 IBM Corporation, Rusty Russell. * Copyright (C) 2009 Red Hat, Inc. * * Creation is done via kthreadd, so that we get a clean environment * even if we're invoked from userspace (think modprobe, hotplug cpu, * etc.). */ #include <uapi/linux/sched/types.h> #include <linux/mm.h> #include <linux/mmu_context.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/kthread.h> #include <linux/completion.h> #include <linux/err.h> #include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/unistd.h> #include <linux/file.h> #include <linux/export.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/freezer.h> #include <linux/ptrace.h> #include <linux/uaccess.h> #include <linux/numa.h> #include <linux/sched/isolation.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; static LIST_HEAD(kthread_affinity_list); static DEFINE_MUTEX(kthread_affinity_lock); struct kthread_create_info { /* Information passed to kthread() from kthreadd. */ char *full_name; int (*threadfn)(void *data); void *data; int node; /* Result passed back to kthread_create() from kthreadd. */ struct task_struct *result; struct completion *done; struct list_head list; }; struct kthread { unsigned long flags; unsigned int cpu; unsigned int node; int started; int result; int (*threadfn)(void *); void *data; struct completion parked; struct completion exited; #ifdef CONFIG_BLK_CGROUP struct cgroup_subsys_state *blkcg_css; #endif /* To store the full name if task comm is truncated. */ char *full_name; struct task_struct *task; struct list_head affinity_node; struct cpumask *preferred_affinity; }; enum KTHREAD_BITS { KTHREAD_IS_PER_CPU = 0, KTHREAD_SHOULD_STOP, KTHREAD_SHOULD_PARK, }; static inline struct kthread *to_kthread(struct task_struct *k) { WARN_ON(!(k->flags & PF_KTHREAD)); return k->worker_private; } void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk) { struct kthread *kthread = to_kthread(tsk); if (!kthread || !kthread->full_name) { strscpy(buf, tsk->comm, buf_size); return; } strscpy_pad(buf, kthread->full_name, buf_size); } bool set_kthread_struct(struct task_struct *p) { struct kthread *kthread; if (WARN_ON_ONCE(to_kthread(p))) return false; kthread = kzalloc_obj(*kthread); if (!kthread) return false; init_completion(&kthread->exited); init_completion(&kthread->parked); INIT_LIST_HEAD(&kthread->affinity_node); p->vfork_done = &kthread->exited; kthread->task = p; kthread->node = tsk_fork_get_node(current); p->worker_private = kthread; return true; } void free_kthread_struct(struct task_struct *k) { struct kthread *kthread; /* * Can be NULL if kmalloc() in set_kthread_struct() failed.
*/ kthread = to_kthread(k); if (!kthread) return; #ifdef CONFIG_BLK_CGROUP WARN_ON_ONCE(kthread->blkcg_css); #endif k->worker_private = NULL; kfree(kthread->full_name); kfree(kthread); } /** * kthread_should_stop - should this kthread return now? * * When someone calls kthread_stop() on your kthread, it will be woken * and this will return true. You should then return, and your return * value will be passed through to kthread_stop(). */ bool kthread_should_stop(void) { return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags); } EXPORT_SYMBOL(kthread_should_stop); static bool __kthread_should_park(struct task_struct *k) { return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags); } /** * kthread_should_park - should this kthread park now? * * When someone calls kthread_park() on your kthread, it will be woken * and this will return true. You should then do the necessary * cleanup and call kthread_parkme() * * Similar to kthread_should_stop(), but this keeps the thread alive * and in a park position. kthread_unpark() "restarts" the thread and * calls the thread function again. */ bool kthread_should_park(void) { return __kthread_should_park(current); } EXPORT_SYMBOL_GPL(kthread_should_park); bool kthread_should_stop_or_park(void) { struct kthread *kthread = tsk_is_kthread(current); if (!kthread) return false; return kthread->flags & (BIT(KTHREAD_SHOULD_STOP) | BIT(KTHREAD_SHOULD_PARK)); } /** * kthread_freezable_should_stop - should this freezable kthread return now? * @was_frozen: optional out parameter, indicates whether %current was frozen * * kthread_should_stop() for freezable kthreads, which will enter * refrigerator if necessary. This function is safe from kthread_stop() / * freezer deadlock and freezable kthreads should use this function instead * of calling try_to_freeze() directly. */ bool kthread_freezable_should_stop(bool *was_frozen) { bool frozen = false; might_sleep(); if (unlikely(freezing(current))) frozen = __refrigerator(true); if (was_frozen) *was_frozen = frozen; return kthread_should_stop(); } EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); /** * kthread_func - return the function specified on kthread creation * @task: kthread task in question * * Returns NULL if the task is not a kthread. */ void *kthread_func(struct task_struct *task) { struct kthread *kthread = tsk_is_kthread(task); if (kthread) return kthread->threadfn; return NULL; } EXPORT_SYMBOL_GPL(kthread_func); /** * kthread_data - return data value specified on kthread creation * @task: kthread task in question * * Return the data value specified when kthread @task was created. * The caller is responsible for ensuring the validity of @task when * calling this function. */ void *kthread_data(struct task_struct *task) { return to_kthread(task)->data; } EXPORT_SYMBOL_GPL(kthread_data); /** * kthread_probe_data - speculative version of kthread_data() * @task: possible kthread task in question * * @task could be a kthread task. Return the data value specified when it * was created if accessible. If @task isn't a kthread task or its data is * inaccessible for any reason, %NULL is returned. This function requires * that @task itself is safe to dereference. 
*/ void *kthread_probe_data(struct task_struct *task) { struct kthread *kthread = tsk_is_kthread(task); void *data = NULL; if (kthread) copy_from_kernel_nofault(&data, &kthread->data, sizeof(data)); return data; } static void __kthread_parkme(struct kthread *self) { for (;;) { /* * TASK_PARKED is a special state; we must serialize against * possible pending wakeups to avoid store-store collisions on * task->state. * * Such a collision might possibly result in the task state * changin from TASK_PARKED and us failing the * wait_task_inactive() in kthread_park(). */ set_special_state(TASK_PARKED); if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) break; /* * Thread is going to call schedule(), do not preempt it, * or the caller of kthread_park() may spend more time in * wait_task_inactive(). */ preempt_disable(); complete(&self->parked); schedule_preempt_disabled(); preempt_enable(); } __set_current_state(TASK_RUNNING); } void kthread_parkme(void) { __kthread_parkme(to_kthread(current)); } EXPORT_SYMBOL_GPL(kthread_parkme); void kthread_do_exit(struct kthread *kthread, long result) { kthread->result = result; if (!list_empty(&kthread->affinity_node)) { mutex_lock(&kthread_affinity_lock); list_del(&kthread->affinity_node); mutex_unlock(&kthread_affinity_lock); if (kthread->preferred_affinity) { kfree(kthread->preferred_affinity); kthread->preferred_affinity = NULL; } } } /** * kthread_complete_and_exit - Exit the current kthread. * @comp: Completion to complete * @code: The integer value to return to kthread_stop(). * * If present, complete @comp and then return code to kthread_stop(). * * A kernel thread whose module may be removed after the completion of * @comp can use this function to exit safely. * * Does not return. */ void __noreturn kthread_complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); kthread_exit(code); } EXPORT_SYMBOL(kthread_complete_and_exit); static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpumask) { const struct cpumask *pref; guard(rcu)(); if (kthread->preferred_affinity) { pref = kthread->preferred_affinity; } else { if (kthread->node == NUMA_NO_NODE) pref = housekeeping_cpumask(HK_TYPE_DOMAIN); else pref = cpumask_of_node(kthread->node); } cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_DOMAIN)); if (cpumask_empty(cpumask)) cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN)); } static void kthread_affine_node(void) { struct kthread *kthread = to_kthread(current); cpumask_var_t affinity; if (WARN_ON_ONCE(kthread_is_per_cpu(current))) return; if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) { WARN_ON_ONCE(1); return; } mutex_lock(&kthread_affinity_lock); WARN_ON_ONCE(!list_empty(&kthread->affinity_node)); list_add_tail(&kthread->affinity_node, &kthread_affinity_list); /* * The node cpumask is racy when read from kthread() but: * - a racing CPU going down will either fail on the subsequent * call to set_cpus_allowed_ptr() or be migrated to housekeepers * afterwards by the scheduler. 
* - a racing CPU going up will be handled by kthreads_online_cpu() */ kthread_fetch_affinity(kthread, affinity); set_cpus_allowed_ptr(current, affinity); mutex_unlock(&kthread_affinity_lock); free_cpumask_var(affinity); } static int kthread(void *_create) { static const struct sched_param param = { .sched_priority = 0 }; /* Copy data: it's on kthread's stack */ struct kthread_create_info *create = _create; int (*threadfn)(void *data) = create->threadfn; void *data = create->data; struct completion *done; struct kthread *self; int ret; self = to_kthread(current); /* Release the structure when caller killed by a fatal signal. */ done = xchg(&create->done, NULL); if (!done) { kfree(create->full_name); kfree(create); kthread_exit(-EINTR); } self->full_name = create->full_name; self->threadfn = threadfn; self->data = data; /* * The new thread inherited kthreadd's priority and CPU mask. Reset * back to default in case they have been changed. */ sched_setscheduler_nocheck(current, SCHED_NORMAL, &param); /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; /* * Thread is going to call schedule(), do not preempt it, * or the creator may spend more time in wait_task_inactive(). */ preempt_disable(); complete(done); schedule_preempt_disabled(); preempt_enable(); self->started = 1; /* * Apply default node affinity if no call to kthread_bind[_mask]() nor * kthread_affine_preferred() was issued before the first wake-up. */ if (!(current->flags & PF_NO_SETAFFINITY) && !self->preferred_affinity) kthread_affine_node(); ret = -EINTR; if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { cgroup_kthread_ready(); __kthread_parkme(self); ret = threadfn(data); } kthread_exit(ret); } /* called from kernel_clone() to get node information for about to be created task */ int tsk_fork_get_node(struct task_struct *tsk) { #ifdef CONFIG_NUMA if (tsk == kthreadd_task) return tsk->pref_node_fork; #endif return NUMA_NO_NODE; } static void create_kthread(struct kthread_create_info *create) { int pid; #ifdef CONFIG_NUMA current->pref_node_fork = create->node; #endif /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, create->full_name, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { /* Release the structure when caller killed by a fatal signal. */ struct completion *done = xchg(&create->done, NULL); kfree(create->full_name); if (!done) { kfree(create); return; } create->result = ERR_PTR(pid); complete(done); } } static __printf(4, 0) struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], va_list args) { DECLARE_COMPLETION_ONSTACK(done); struct task_struct *task; struct kthread_create_info *create = kmalloc_obj(*create); if (!create) return ERR_PTR(-ENOMEM); create->threadfn = threadfn; create->data = data; create->node = node; create->done = &done; create->full_name = kvasprintf(GFP_KERNEL, namefmt, args); if (!create->full_name) { task = ERR_PTR(-ENOMEM); goto free_create; } spin_lock(&kthread_create_lock); list_add_tail(&create->list, &kthread_create_list); spin_unlock(&kthread_create_lock); wake_up_process(kthreadd_task); /* * Wait for completion in killable state, for I might be chosen by * the OOM killer while kthreadd is trying to allocate memory for * new kernel thread.
*/ if (unlikely(wait_for_completion_killable(&done))) { /* * If I was killed by a fatal signal before kthreadd (or new * kernel thread) calls complete(), leave the cleanup of this * structure to that thread. */ if (xchg(&create->done, NULL)) return ERR_PTR(-EINTR); /* * kthreadd (or new kernel thread) will call complete() * shortly. */ wait_for_completion(&done); } task = create->result; free_create: kfree(create); return task; } /** * kthread_create_on_node - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @node: task and thread structures for the thread are allocated on this node * @namefmt: printf-style name for the thread. * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and * is affine to all CPUs. * * If thread is going to be bound on a particular cpu, give its node * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either return directly if it is a * standalone thread for which no one will call kthread_stop(), or * return when 'kthread_should_stop()' is true (which means * kthread_stop() has been called). The return value should be zero * or a negative error number; it will be passed to kthread_stop(). * * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). */ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], ...) { struct task_struct *task; va_list args; va_start(args, namefmt); task = __kthread_create_on_node(threadfn, data, node, namefmt, args); va_end(args); return task; } EXPORT_SYMBOL(kthread_create_on_node); static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state) { if (!wait_task_inactive(p, state)) { WARN_ON(1); return; } scoped_guard (raw_spinlock_irqsave, &p->pi_lock) set_cpus_allowed_force(p, mask); /* It's safe because the task is inactive. */ p->flags |= PF_NO_SETAFFINITY; } static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state) { __kthread_bind_mask(p, cpumask_of(cpu), state); } void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask) { struct kthread *kthread = to_kthread(p); __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE); WARN_ON_ONCE(kthread->started); } /** * kthread_bind - bind a just-created kthread to a cpu. * @p: thread created by kthread_create(). * @cpu: cpu (might not be online, must be possible) for @k to run on. * * Description: This function is equivalent to set_cpus_allowed(), * except that @cpu doesn't need to be online, and the thread must be * stopped (i.e., just returned from kthread_create()). */ void kthread_bind(struct task_struct *p, unsigned int cpu) { struct kthread *kthread = to_kthread(p); __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE); WARN_ON_ONCE(kthread->started); } EXPORT_SYMBOL(kthread_bind); /** * kthread_create_on_cpu - Create a cpu bound kthread * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @cpu: The cpu on which the thread should be bound, * @namefmt: printf-style name for the thread. Format is restricted * to "name.*%u". Code fills in cpu number. 
* * Description: This helper function creates and names a kernel thread */ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), void *data, unsigned int cpu, const char *namefmt) { struct task_struct *p; p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, cpu); if (IS_ERR(p)) return p; kthread_bind(p, cpu); /* CPU hotplug need to bind once again when unparking the thread. */ to_kthread(p)->cpu = cpu; return p; } EXPORT_SYMBOL(kthread_create_on_cpu); void kthread_set_per_cpu(struct task_struct *k, int cpu) { struct kthread *kthread = to_kthread(k); if (!kthread) return; WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY)); if (cpu < 0) { clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags); return; } kthread->cpu = cpu; set_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } bool kthread_is_per_cpu(struct task_struct *p) { struct kthread *kthread = tsk_is_kthread(p); if (!kthread) return false; return test_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } /** * kthread_unpark - unpark a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return false, wakes it, and * waits for it to return. If the thread is marked percpu then its * bound to the cpu again. */ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_kthread(k); if (!test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)) return; /* * Newly created kthread was parked when the CPU was offline. * The binding was lost and we need to set it again. */ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) __kthread_bind(k, kthread->cpu, TASK_PARKED); clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); /* * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup. */ wake_up_state(k, TASK_PARKED); } EXPORT_SYMBOL_GPL(kthread_unpark); /** * kthread_park - park a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return true, wakes it, and * waits for it to return. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will park without * calling threadfn(). * * Returns 0 if the thread is parked, -ENOSYS if the thread exited. * If called by the kthread itself just the park bit is set. */ int kthread_park(struct task_struct *k) { struct kthread *kthread = to_kthread(k); if (WARN_ON(k->flags & PF_EXITING)) return -ENOSYS; if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))) return -EBUSY; set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); if (k != current) { wake_up_process(k); /* * Wait for __kthread_parkme() to complete(), this means we * _will_ have TASK_PARKED and are about to call schedule(). */ wait_for_completion(&kthread->parked); /* * Now wait for that schedule() to complete and the task to * get scheduled out. */ WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED)); } return 0; } EXPORT_SYMBOL_GPL(kthread_park); /** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_stop() for @k to return true, wakes it, and * waits for it to exit. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will exit without * calling threadfn(). * * If threadfn() may call kthread_exit() itself, the caller must ensure * task_struct can't go away. * * Returns the result of threadfn(), or %-EINTR if wake_up_process() * was never called. 
*/ int kthread_stop(struct task_struct *k) { struct kthread *kthread; int ret; trace_sched_kthread_stop(k); get_task_struct(k); kthread = to_kthread(k); set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); kthread_unpark(k); set_tsk_thread_flag(k, TIF_NOTIFY_SIGNAL); wake_up_process(k); wait_for_completion(&kthread->exited); ret = kthread->result; put_task_struct(k); trace_sched_kthread_stop_ret(ret); return ret; } EXPORT_SYMBOL(kthread_stop); /** * kthread_stop_put - stop a thread and put its task struct * @k: thread created by kthread_create(). * * Stops a thread created by kthread_create() and put its task_struct. * Only use when holding an extra task struct reference obtained by * calling get_task_struct(). */ int kthread_stop_put(struct task_struct *k) { int ret; ret = kthread_stop(k); put_task_struct(k); return ret; } EXPORT_SYMBOL(kthread_stop_put); int kthreadd(void *unused) { static const char comm[TASK_COMM_LEN] = "kthreadd"; struct task_struct *tsk = current; /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, comm); ignore_signals(tsk); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; cgroup_init_kthreadd(); kthread_affine_node(); for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (list_empty(&kthread_create_list)) schedule(); __set_current_state(TASK_RUNNING); spin_lock(&kthread_create_lock); while (!list_empty(&kthread_create_list)) { struct kthread_create_info *create; create = list_entry(kthread_create_list.next, struct kthread_create_info, list); list_del_init(&create->list); spin_unlock(&kthread_create_lock); create_kthread(create); spin_lock(&kthread_create_lock); } spin_unlock(&kthread_create_lock); } return 0; } /** * kthread_affine_preferred - Define a kthread's preferred affinity * @p: thread created by kthread_create(). * @mask: preferred mask of CPUs (might not be online, must be possible) for @p * to run on. * * Similar to kthread_bind_mask() except that the affinity is not a requirement * but rather a preference that can be constrained by CPU isolation or CPU hotplug. * Must be called before the first wakeup of the kthread. * * Returns 0 if the affinity has been applied. 
*/ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) { struct kthread *kthread = to_kthread(p); cpumask_var_t affinity; int ret = 0; if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) { WARN_ON(1); return -EINVAL; } WARN_ON_ONCE(kthread->preferred_affinity); if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) return -ENOMEM; kthread->preferred_affinity = kzalloc(sizeof(struct cpumask), GFP_KERNEL); if (!kthread->preferred_affinity) { ret = -ENOMEM; goto out; } mutex_lock(&kthread_affinity_lock); cpumask_copy(kthread->preferred_affinity, mask); WARN_ON_ONCE(!list_empty(&kthread->affinity_node)); list_add_tail(&kthread->affinity_node, &kthread_affinity_list); kthread_fetch_affinity(kthread, affinity); scoped_guard (raw_spinlock_irqsave, &p->pi_lock) set_cpus_allowed_force(p, affinity); mutex_unlock(&kthread_affinity_lock); out: free_cpumask_var(affinity); return ret; } EXPORT_SYMBOL_GPL(kthread_affine_preferred); static int kthreads_update_affinity(bool force) { cpumask_var_t affinity; struct kthread *k; int ret; guard(mutex)(&kthread_affinity_lock); if (list_empty(&kthread_affinity_list)) return 0; if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) return -ENOMEM; ret = 0; list_for_each_entry(k, &kthread_affinity_list, affinity_node) { if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) || kthread_is_per_cpu(k->task))) { ret = -EINVAL; continue; } /* * Unbound kthreads without preferred affinity are already affine * to housekeeping, whether those CPUs are online or not. So no need * to handle newly online CPUs for them. However housekeeping changes * have to be applied. * * But kthreads with a preferred affinity or node are different: * if none of their preferred CPUs are online and part of * housekeeping at the same time, they must be affine to housekeeping. * But as soon as one of their preferred CPU becomes online, they must * be affine to them. */ if (force || k->preferred_affinity || k->node != NUMA_NO_NODE) { kthread_fetch_affinity(k, affinity); set_cpus_allowed_ptr(k->task, affinity); } } free_cpumask_var(affinity); return ret; } /** * kthreads_update_housekeeping - Update kthreads affinity on cpuset change * * When cpuset changes a partition type to/from "isolated" or updates related * cpumasks, propagate the housekeeping cpumask change to preferred kthreads * affinity. * * Returns 0 if successful, -ENOMEM if temporary mask couldn't * be allocated or -EINVAL in case of internal error. */ int kthreads_update_housekeeping(void) { return kthreads_update_affinity(true); } /* * Re-affine kthreads according to their preferences * and the newly online CPU. The CPU down part is handled * by select_fallback_rq() which default re-affines to * housekeepers from other nodes in case the preferred * affinity doesn't apply anymore. 
*/ static int kthreads_online_cpu(unsigned int cpu) { return kthreads_update_affinity(false); } static int kthreads_init(void) { return cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online", kthreads_online_cpu, NULL); } early_initcall(kthreads_init); void __kthread_init_worker(struct kthread_worker *worker, const char *name, struct lock_class_key *key) { memset(worker, 0, sizeof(struct kthread_worker)); raw_spin_lock_init(&worker->lock); lockdep_set_class_and_name(&worker->lock, key, name); INIT_LIST_HEAD(&worker->work_list); INIT_LIST_HEAD(&worker->delayed_work_list); } EXPORT_SYMBOL_GPL(__kthread_init_worker); /** * kthread_worker_fn - kthread function to process kthread_worker * @worker_ptr: pointer to initialized kthread_worker * * This function implements the main cycle of kthread worker. It processes * work_list until it is stopped with kthread_stop(). It sleeps when the queue * is empty. * * The works are not allowed to keep any locks, disable preemption or interrupts * when they finish. There is defined a safe point for freezing when one work * finishes and before a new one is started. * * Also the works must not be handled by more than one worker at the same time, * see also kthread_queue_work(). */ int kthread_worker_fn(void *worker_ptr) { struct kthread_worker *worker = worker_ptr; struct kthread_work *work; /* * FIXME: Update the check and remove the assignment when all kthread * worker users are created using kthread_create_worker*() functions. */ WARN_ON(worker->task && worker->task != current); worker->task = current; if (worker->flags & KTW_FREEZABLE) set_freezable(); repeat: set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); raw_spin_lock_irq(&worker->lock); worker->task = NULL; raw_spin_unlock_irq(&worker->lock); return 0; } work = NULL; raw_spin_lock_irq(&worker->lock); if (!list_empty(&worker->work_list)) { work = list_first_entry(&worker->work_list, struct kthread_work, node); list_del_init(&work->node); } worker->current_work = work; raw_spin_unlock_irq(&worker->lock); if (work) { kthread_work_func_t func = work->func; __set_current_state(TASK_RUNNING); trace_sched_kthread_work_execute_start(work); work->func(work); /* * Avoid dereferencing work after this point. The trace * event only cares about the address. */ trace_sched_kthread_work_execute_end(work, func); } else if (!freezing(current)) { schedule(); } else { /* * Handle the case where the current remains * TASK_INTERRUPTIBLE. try_to_freeze() expects * the current to be TASK_RUNNING. */ __set_current_state(TASK_RUNNING); } try_to_freeze(); cond_resched(); goto repeat; } EXPORT_SYMBOL_GPL(kthread_worker_fn); static __printf(3, 0) struct kthread_worker * __kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], va_list args) { struct kthread_worker *worker; struct task_struct *task; worker = kzalloc_obj(*worker); if (!worker) return ERR_PTR(-ENOMEM); kthread_init_worker(worker); task = __kthread_create_on_node(kthread_worker_fn, worker, node, namefmt, args); if (IS_ERR(task)) goto fail_task; worker->flags = flags; worker->task = task; return worker; fail_task: kfree(worker); return ERR_CAST(task); } /** * kthread_create_worker_on_node - create a kthread worker * @flags: flags modifying the default behavior of the worker * @node: task structure for the thread is allocated on this node * @namefmt: printf-style name for the kthread worker (task). 
* * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], ...) { struct kthread_worker *worker; va_list args; va_start(args, namefmt); worker = __kthread_create_worker_on_node(flags, node, namefmt, args); va_end(args); return worker; } EXPORT_SYMBOL(kthread_create_worker_on_node); /** * kthread_create_worker_on_cpu - create a kthread worker and bind it * to a given CPU and the associated NUMA node. * @cpu: CPU number * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the thread. Format is restricted * to "name.*%u". Code fills in cpu number. * * Use a valid CPU number if you want to bind the kthread worker * to the given CPU and the associated NUMA node. * * A good practice is to add the cpu number also into the worker name. * For example, use kthread_create_worker_on_cpu(cpu, "helper/%d", cpu). * * CPU hotplug: * The kthread worker API is simple and generic. It just provides a way * to create, use, and destroy workers. * * It is up to the API user how to handle CPU hotplug. They have to decide * how to handle pending work items, prevent queuing new ones, and * restore the functionality when the CPU goes off and on. There are a * few catches: * * - CPU affinity gets lost when it is scheduled on an offline CPU. * * - The worker might not exist when the CPU was off when the user * created the workers. * * Good practice is to implement two CPU hotplug callbacks and to * destroy/create the worker when the CPU goes down/up. * * Return: * The pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[]) { struct kthread_worker *worker; worker = kthread_create_worker_on_node(flags, cpu_to_node(cpu), namefmt, cpu); if (!IS_ERR(worker)) kthread_bind(worker->task, cpu); return worker; } EXPORT_SYMBOL(kthread_create_worker_on_cpu); /* * Returns true when the work could not be queued at the moment. * It happens when it is already pending in a worker list * or when it is being cancelled. */ static inline bool queuing_blocked(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); return !list_empty(&work->node) || work->canceling; } static void kthread_insert_work_sanity_check(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); WARN_ON_ONCE(!list_empty(&work->node)); /* Do not use a work with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker && work->worker != worker); } /* insert @work before @pos in @worker */ static void kthread_insert_work(struct kthread_worker *worker, struct kthread_work *work, struct list_head *pos) { kthread_insert_work_sanity_check(worker, work); trace_sched_kthread_work_queue_work(worker, work); list_add_tail(&work->node, pos); work->worker = worker; if (!worker->current_work && likely(worker->task)) wake_up_process(worker->task); } /** * kthread_queue_work - queue a kthread_work * @worker: target kthread_worker * @work: kthread_work to queue * * Queue @work to work processor @task for async execution. @task * must have been created with kthread_create_worker(). 
Returns %true * if @work was successfully queued, %false if it was already pending. * * Reinitialize the work if it needs to be used by another worker. * For example, when the worker was stopped and started again. */ bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work) { bool ret = false; unsigned long flags; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { kthread_insert_work(worker, work, &worker->work_list); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_work); /** * kthread_delayed_work_timer_fn - callback that queues the associated kthread * delayed work when the timer expires. * @t: pointer to the expired timer * * The format of the function is defined by struct timer_list. * It should have been called from irqsafe timer with irq already off. */ void kthread_delayed_work_timer_fn(struct timer_list *t) { struct kthread_delayed_work *dwork = timer_container_of(dwork, t, timer); struct kthread_work *work = &dwork->work; struct kthread_worker *worker = work->worker; unsigned long flags; /* * This might happen when a pending work is reinitialized. * It means that it is used a wrong way. */ if (WARN_ON_ONCE(!worker)) return; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); /* Move the work from worker->delayed_work_list. */ WARN_ON_ONCE(list_empty(&work->node)); list_del_init(&work->node); if (!work->canceling) kthread_insert_work(worker, work, &worker->work_list); raw_spin_unlock_irqrestore(&worker->lock, flags); } EXPORT_SYMBOL(kthread_delayed_work_timer_fn); static void __kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn); /* * If @delay is 0, queue @dwork->work immediately. This is for * both optimization and correctness. The earliest @timer can * expire is on the closest next tick and delayed_work users depend * on that there's no such delay when @delay is 0. */ if (!delay) { kthread_insert_work(worker, work, &worker->work_list); return; } /* Be paranoid and try to detect possible races already now. */ kthread_insert_work_sanity_check(worker, work); list_add(&work->node, &worker->delayed_work_list); work->worker = worker; timer->expires = jiffies + delay; add_timer(timer); } /** * kthread_queue_delayed_work - queue the associated kthread work * after a delay. * @worker: target kthread_worker * @dwork: kthread_delayed_work to queue * @delay: number of jiffies to wait before queuing * * If the work has not been pending it starts a timer that will queue * the work after the given @delay. If @delay is zero, it queues the * work immediately. * * Return: %false if the @work has already been pending. It means that * either the timer was running or the work was queued. It returns %true * otherwise. 
*/ bool kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; bool ret = false; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { __kthread_queue_delayed_work(worker, dwork, delay); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_delayed_work); struct kthread_flush_work { struct kthread_work work; struct completion done; }; static void kthread_flush_work_fn(struct kthread_work *work) { struct kthread_flush_work *fwork = container_of(work, struct kthread_flush_work, work); complete(&fwork->done); } /** * kthread_flush_work - flush a kthread_work * @work: work to flush * * If @work is queued or executing, wait for it to finish execution. */ void kthread_flush_work(struct kthread_work *work) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; struct kthread_worker *worker; bool noop = false; worker = work->worker; if (!worker) return; raw_spin_lock_irq(&worker->lock); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); if (!list_empty(&work->node)) kthread_insert_work(worker, &fwork.work, work->node.next); else if (worker->current_work == work) kthread_insert_work(worker, &fwork.work, worker->work_list.next); else noop = true; raw_spin_unlock_irq(&worker->lock); if (!noop) wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_work); /* * Make sure that the timer is neither set nor running and could * not manipulate the work list_head any longer. * * The function is called under worker->lock. The lock is temporary * released but the timer can't be set again in the meantime. */ static void kthread_cancel_delayed_work_timer(struct kthread_work *work, unsigned long *flags) { struct kthread_delayed_work *dwork = container_of(work, struct kthread_delayed_work, work); struct kthread_worker *worker = work->worker; /* * timer_delete_sync() must be called to make sure that the timer * callback is not running. The lock must be temporary released * to avoid a deadlock with the callback. In the meantime, * any queuing is blocked by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, *flags); timer_delete_sync(&dwork->timer); raw_spin_lock_irqsave(&worker->lock, *flags); work->canceling--; } /* * This function removes the work from the worker queue. * * It is called under worker->lock. The caller must make sure that * the timer used by delayed work is not running, e.g. by calling * kthread_cancel_delayed_work_timer(). * * The work might still be in use when this function finishes. See the * current_work proceed by the worker. * * Return: %true if @work was pending and successfully canceled, * %false if @work was not pending */ static bool __kthread_cancel_work(struct kthread_work *work) { /* * Try to remove the work from a worker list. It might either * be from worker->work_list or from worker->delayed_work_list. */ if (!list_empty(&work->node)) { list_del_init(&work->node); return true; } return false; } /** * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work * @worker: kthread worker to use * @dwork: kthread delayed work to queue * @delay: number of jiffies to wait before queuing * * If @dwork is idle, equivalent to kthread_queue_delayed_work(). 
Otherwise, * modify @dwork's timer so that it expires after @delay. If @delay is zero, * @work is guaranteed to be queued immediately. * * Return: %false if @dwork was idle and queued, %true otherwise. * * A special case is when the work is being canceled in parallel. * It might be caused either by the real kthread_cancel_delayed_work_sync() * or yet another kthread_mod_delayed_work() call. We let the other command * win and return %true here. The return value can be used for reference * counting and the number of queued works stays the same. Anyway, the caller * is supposed to synchronize these operations a reasonable way. * * This function is safe to call from any context including IRQ handler. * See __kthread_cancel_work() and kthread_delayed_work_timer_fn() * for details. */ bool kthread_mod_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; int ret; raw_spin_lock_irqsave(&worker->lock, flags); /* Do not bother with canceling when never queued. */ if (!work->worker) { ret = false; goto fast_queue; } /* Work must not be used with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker != worker); /* * Temporary cancel the work but do not fight with another command * that is canceling the work as well. * * It is a bit tricky because of possible races with another * mod_delayed_work() and cancel_delayed_work() callers. * * The timer must be canceled first because worker->lock is released * when doing so. But the work can be removed from the queue (list) * only when it can be queued again so that the return value can * be used for reference counting. */ kthread_cancel_delayed_work_timer(work, &flags); if (work->canceling) { /* The number of works in the queue does not change. */ ret = true; goto out; } ret = __kthread_cancel_work(work); fast_queue: __kthread_queue_delayed_work(worker, dwork, delay); out: raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) { struct kthread_worker *worker = work->worker; unsigned long flags; int ret = false; if (!worker) goto out; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); if (is_dwork) kthread_cancel_delayed_work_timer(work, &flags); ret = __kthread_cancel_work(work); if (worker->current_work != work) goto out_fast; /* * The work is in progress and we need to wait with the lock released. * In the meantime, block any queuing by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, flags); kthread_flush_work(work); raw_spin_lock_irqsave(&worker->lock, flags); work->canceling--; out_fast: raw_spin_unlock_irqrestore(&worker->lock, flags); out: return ret; } /** * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish * @work: the kthread work to cancel * * Cancel @work and wait for its execution to finish. This function * can be used even if the work re-queues itself. On return from this * function, @work is guaranteed to be not pending or executing on any CPU. * * kthread_cancel_work_sync(&delayed_work->work) must not be used for * delayed_work's. Use kthread_cancel_delayed_work_sync() instead. * * The caller must ensure that the worker on which @work was last * queued can't be destroyed before this function returns. 
* * Return: %true if @work was pending, %false otherwise. */ bool kthread_cancel_work_sync(struct kthread_work *work) { return __kthread_cancel_work_sync(work, false); } EXPORT_SYMBOL_GPL(kthread_cancel_work_sync); /** * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and * wait for it to finish. * @dwork: the kthread delayed work to cancel * * This is kthread_cancel_work_sync() for delayed works. * * Return: %true if @dwork was pending, %false otherwise. */ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork) { return __kthread_cancel_work_sync(&dwork->work, true); } EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync); /** * kthread_flush_worker - flush all current works on a kthread_worker * @worker: worker to flush * * Wait until all currently executing or pending works on @worker are * finished. */ void kthread_flush_worker(struct kthread_worker *worker) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; kthread_queue_work(worker, &fwork.work); wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_worker); /** * kthread_destroy_worker - destroy a kthread worker * @worker: worker to be destroyed * * Flush and destroy @worker. The simple flush is enough because the kthread * worker API is used only in trivial scenarios. There are no multi-step state * machines needed. * * Note that this function is not responsible for handling delayed work, so * caller should be responsible for queuing or canceling all delayed work items * before invoke this function. */ void kthread_destroy_worker(struct kthread_worker *worker) { struct task_struct *task; task = worker->task; if (WARN_ON(!task)) return; kthread_flush_worker(worker); kthread_stop(task); WARN_ON(!list_empty(&worker->delayed_work_list)); WARN_ON(!list_empty(&worker->work_list)); kfree(worker); } EXPORT_SYMBOL(kthread_destroy_worker); /** * kthread_use_mm - make the calling kthread operate on an address space * @mm: address space to operate on */ void kthread_use_mm(struct mm_struct *mm) { struct mm_struct *active_mm; struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(tsk->mm); WARN_ON_ONCE(!mm->user_ns); /* * It is possible for mm to be the same as tsk->active_mm, but * we must still mmgrab(mm) and mmdrop_lazy_tlb(active_mm), * because these references are not equivalent. */ mmgrab(mm); task_lock(tsk); /* Hold off tlb flush IPIs while switching mm's */ local_irq_disable(); active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; membarrier_update_current_mm(mm); switch_mm_irqs_off(active_mm, mm, tsk); local_irq_enable(); task_unlock(tsk); #ifdef finish_arch_post_lock_switch finish_arch_post_lock_switch(); #endif /* * When a kthread starts operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm, and not issue an IPI. Membarrier requires a * memory barrier after storing to tsk->mm, before accessing * user-space memory. A full memory barrier for membarrier * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by * mmdrop_lazy_tlb(). 
*/ mmdrop_lazy_tlb(active_mm); } EXPORT_SYMBOL_GPL(kthread_use_mm); /** * kthread_unuse_mm - reverse the effect of kthread_use_mm() * @mm: address space to operate on */ void kthread_unuse_mm(struct mm_struct *mm) { struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(!tsk->mm); task_lock(tsk); /* * When a kthread stops operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm, and not issue an IPI. Membarrier requires a * memory barrier after accessing user-space memory, before * clearing tsk->mm. */ smp_mb__after_spinlock(); local_irq_disable(); tsk->mm = NULL; membarrier_update_current_mm(NULL); mmgrab_lazy_tlb(mm); /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); local_irq_enable(); task_unlock(tsk); mmdrop(mm); } EXPORT_SYMBOL_GPL(kthread_unuse_mm); #ifdef CONFIG_BLK_CGROUP /** * kthread_associate_blkcg - associate blkcg to current kthread * @css: the cgroup info * * Current thread must be a kthread. The thread is running jobs on behalf of * other threads. In some cases, we expect the jobs attach cgroup info of * original threads instead of that of current thread. This function stores * original thread's cgroup info in current kthread context for later * retrieval. */ void kthread_associate_blkcg(struct cgroup_subsys_state *css) { struct kthread *kthread; if (!(current->flags & PF_KTHREAD)) return; kthread = to_kthread(current); if (!kthread) return; if (kthread->blkcg_css) { css_put(kthread->blkcg_css); kthread->blkcg_css = NULL; } if (css) { css_get(css); kthread->blkcg_css = css; } } EXPORT_SYMBOL(kthread_associate_blkcg); /** * kthread_blkcg - get associated blkcg css of current kthread * * Current thread must be a kthread. */ struct cgroup_subsys_state *kthread_blkcg(void) { struct kthread *kthread; if (current->flags & PF_KTHREAD) { kthread = to_kthread(current); if (kthread) return kthread->blkcg_css; } return NULL; } #endif |
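/*
 * Editor's illustrative sketch (not part of kthread.c above): the thread
 * function shape described in the kthread_create_on_node() and kthread_stop()
 * kernel-doc. example_thread_fn(), example_task, example_start() and
 * example_stop() are hypothetical names; only the kthread_*() calls are real
 * API. kthread_stop() wakes the thread, waits for example_thread_fn() to
 * return, and hands its return value back to the caller.
 */
static struct task_struct *example_task;

static int example_thread_fn(void *data)
{
	while (!kthread_should_stop()) {
		/* Do one unit of work here, then sleep until woken or timed out. */
		if (kthread_should_park())
			kthread_parkme();
		schedule_timeout_interruptible(HZ);
	}
	/* This value is returned to whoever called kthread_stop(). */
	return 0;
}

static int example_start(void)
{
	example_task = kthread_run(example_thread_fn, NULL, "example_worker");
	if (IS_ERR(example_task)) {
		int err = PTR_ERR(example_task);

		example_task = NULL;
		return err;
	}
	return 0;
}

static void example_stop(void)
{
	if (example_task)
		kthread_stop(example_task);
}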
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 1999-2002 Vojtech Pavlik
 */
#ifndef _INPUT_H
#define _INPUT_H

#include <linux/time.h>
#include <linux/list.h>
#include <uapi/linux/input.h>

/* Implementation details, userspace should not care about these */
#define ABS_MT_FIRST		ABS_MT_TOUCH_MAJOR
#define ABS_MT_LAST		ABS_MT_TOOL_Y

/*
 * In-kernel definitions.
 */

#include <linux/device.h>
#include <linux/fs.h>
#include <linux/timer.h>
#include <linux/mod_devicetable.h>

struct input_dev_poller;

/**
 * struct input_value - input value representation
 * @type: type of value (EV_KEY, EV_ABS, etc)
 * @code: the value code
 * @value: the value
 */
struct input_value {
	__u16 type;
	__u16 code;
	__s32 value;
};

enum input_clock_type {
	INPUT_CLK_REAL = 0,
	INPUT_CLK_MONO,
	INPUT_CLK_BOOT,
	INPUT_CLK_MAX
};

/**
 * struct input_dev - represents an input device
 * @name: name of the device
 * @phys: physical path to the device in the system hierarchy
 * @uniq: unique identification code for the device (if device has it)
 * @id: id of the device (struct input_id)
 * @propbit: bitmap of device properties and quirks
 * @evbit: bitmap of types of events supported by the device (EV_KEY,
 *	EV_REL, etc.)
* @keybit: bitmap of keys/buttons this device has * @relbit: bitmap of relative axes for the device * @absbit: bitmap of absolute axes for the device * @mscbit: bitmap of miscellaneous events supported by the device * @ledbit: bitmap of leds present on the device * @sndbit: bitmap of sound effects supported by the device * @ffbit: bitmap of force feedback effects supported by the device * @swbit: bitmap of switches present on the device * @hint_events_per_packet: average number of events generated by the * device in a packet (between EV_SYN/SYN_REPORT events). Used by * event handlers to estimate size of the buffer needed to hold * events. * @keycodemax: size of keycode table * @keycodesize: size of elements in keycode table * @keycode: map of scancodes to keycodes for this device * @getkeycode: optional legacy method to retrieve current keymap. * @setkeycode: optional method to alter current keymap, used to implement * sparse keymaps. If not supplied default mechanism will be used. * The method is being called while holding event_lock and thus must * not sleep * @ff: force feedback structure associated with the device if device * supports force feedback effects * @poller: poller structure associated with the device if device is * set up to use polling mode * @repeat_key: stores key code of the last key pressed; used to implement * software autorepeat * @timer: timer for software autorepeat * @rep: current values for autorepeat parameters (delay, rate) * @mt: pointer to multitouch state * @absinfo: array of &struct input_absinfo elements holding information * about absolute axes (current value, min, max, flat, fuzz, * resolution) * @key: reflects current state of device's keys/buttons * @led: reflects current state of device's LEDs * @snd: reflects current state of sound effects * @sw: reflects current state of device's switches * @open: this method is called when the very first user calls * input_open_device(). The driver must prepare the device * to start generating events (start polling thread, * request an IRQ, submit URB, etc.). The meaning of open() is * to start providing events to the input core. * @close: this method is called when the very last user calls * input_close_device(). The meaning of close() is to stop * providing events to the input core. * @flush: purges the device. Most commonly used to get rid of force * feedback effects loaded into the device when disconnecting * from it * @event: event handler for events sent _to_ the device, like EV_LED * or EV_SND. The device is expected to carry out the requested * action (turn on a LED, play sound, etc.) The call is protected * by @event_lock and must not sleep * @grab: input handle that currently has the device grabbed (via * EVIOCGRAB ioctl). When a handle grabs a device it becomes sole * recipient for all input events coming from the device * @event_lock: this spinlock is taken when input core receives * and processes a new event for the device (in input_event()). * Code that accesses and/or modifies parameters of a device * (such as keymap or absmin, absmax, absfuzz, etc.) after device * has been registered with input core must take this lock. * @mutex: serializes calls to open(), close() and flush() methods * @users: stores number of users (input handlers) that opened this * device. 
It is used by input_open_device() and input_close_device() * to make sure that dev->open() is only called when the first * user opens device and dev->close() is called when the very * last user closes the device * @going_away: marks devices that are in a middle of unregistering and * causes input_open_device*() fail with -ENODEV. * @dev: driver model's view of this device * @h_list: list of input handles associated with the device. When * accessing the list dev->mutex must be held * @node: used to place the device onto input_dev_list * @num_vals: number of values queued in the current frame * @max_vals: maximum number of values queued in a frame * @vals: array of values queued in the current frame * @devres_managed: indicates that devices is managed with devres framework * and needs not be explicitly unregistered or freed. * @timestamp: storage for a timestamp set by input_set_timestamp called * by a driver * @inhibited: indicates that the input device is inhibited. If that is * the case then input core ignores any events generated by the device. * Device's close() is called when it is being inhibited and its open() * is called when it is being uninhibited. */ struct input_dev { const char *name; const char *phys; const char *uniq; struct input_id id; unsigned long propbit[BITS_TO_LONGS(INPUT_PROP_CNT)]; unsigned long evbit[BITS_TO_LONGS(EV_CNT)]; unsigned long keybit[BITS_TO_LONGS(KEY_CNT)]; unsigned long relbit[BITS_TO_LONGS(REL_CNT)]; unsigned long absbit[BITS_TO_LONGS(ABS_CNT)]; unsigned long mscbit[BITS_TO_LONGS(MSC_CNT)]; unsigned long ledbit[BITS_TO_LONGS(LED_CNT)]; unsigned long sndbit[BITS_TO_LONGS(SND_CNT)]; unsigned long ffbit[BITS_TO_LONGS(FF_CNT)]; unsigned long swbit[BITS_TO_LONGS(SW_CNT)]; unsigned int hint_events_per_packet; unsigned int keycodemax; unsigned int keycodesize; void *keycode; int (*setkeycode)(struct input_dev *dev, const struct input_keymap_entry *ke, unsigned int *old_keycode); int (*getkeycode)(struct input_dev *dev, struct input_keymap_entry *ke); struct ff_device *ff; struct input_dev_poller *poller; unsigned int repeat_key; struct timer_list timer; int rep[REP_CNT]; struct input_mt *mt; struct input_absinfo *absinfo; unsigned long key[BITS_TO_LONGS(KEY_CNT)]; unsigned long led[BITS_TO_LONGS(LED_CNT)]; unsigned long snd[BITS_TO_LONGS(SND_CNT)]; unsigned long sw[BITS_TO_LONGS(SW_CNT)]; int (*open)(struct input_dev *dev); void (*close)(struct input_dev *dev); int (*flush)(struct input_dev *dev, struct file *file); int (*event)(struct input_dev *dev, unsigned int type, unsigned int code, int value); struct input_handle __rcu *grab; spinlock_t event_lock; struct mutex mutex; unsigned int users; bool going_away; struct device dev; struct list_head h_list; struct list_head node; unsigned int num_vals; unsigned int max_vals; struct input_value *vals; bool devres_managed; ktime_t timestamp[INPUT_CLK_MAX]; bool inhibited; }; #define to_input_dev(d) container_of(d, struct input_dev, dev) /* * Verify that we are in sync with input_device_id mod_devicetable.h #defines */ #if EV_MAX != INPUT_DEVICE_ID_EV_MAX #error "EV_MAX and INPUT_DEVICE_ID_EV_MAX do not match" #endif #if KEY_MIN_INTERESTING != INPUT_DEVICE_ID_KEY_MIN_INTERESTING #error "KEY_MIN_INTERESTING and INPUT_DEVICE_ID_KEY_MIN_INTERESTING do not match" #endif #if KEY_MAX != INPUT_DEVICE_ID_KEY_MAX #error "KEY_MAX and INPUT_DEVICE_ID_KEY_MAX do not match" #endif #if REL_MAX != INPUT_DEVICE_ID_REL_MAX #error "REL_MAX and INPUT_DEVICE_ID_REL_MAX do not match" #endif #if ABS_MAX != INPUT_DEVICE_ID_ABS_MAX #error 
"ABS_MAX and INPUT_DEVICE_ID_ABS_MAX do not match" #endif #if MSC_MAX != INPUT_DEVICE_ID_MSC_MAX #error "MSC_MAX and INPUT_DEVICE_ID_MSC_MAX do not match" #endif #if LED_MAX != INPUT_DEVICE_ID_LED_MAX #error "LED_MAX and INPUT_DEVICE_ID_LED_MAX do not match" #endif #if SND_MAX != INPUT_DEVICE_ID_SND_MAX #error "SND_MAX and INPUT_DEVICE_ID_SND_MAX do not match" #endif #if FF_MAX != INPUT_DEVICE_ID_FF_MAX #error "FF_MAX and INPUT_DEVICE_ID_FF_MAX do not match" #endif #if SW_MAX != INPUT_DEVICE_ID_SW_MAX #error "SW_MAX and INPUT_DEVICE_ID_SW_MAX do not match" #endif #if INPUT_PROP_MAX != INPUT_DEVICE_ID_PROP_MAX #error "INPUT_PROP_MAX and INPUT_DEVICE_ID_PROP_MAX do not match" #endif #define INPUT_DEVICE_ID_MATCH_DEVICE \ (INPUT_DEVICE_ID_MATCH_BUS | INPUT_DEVICE_ID_MATCH_VENDOR | INPUT_DEVICE_ID_MATCH_PRODUCT) #define INPUT_DEVICE_ID_MATCH_DEVICE_AND_VERSION \ (INPUT_DEVICE_ID_MATCH_DEVICE | INPUT_DEVICE_ID_MATCH_VERSION) struct input_handle; /** * struct input_handler - implements one of interfaces for input devices * @private: driver-specific data * @event: event handler. This method is being called by input core with * interrupts disabled and dev->event_lock spinlock held and so * it may not sleep * @events: event sequence handler. This method is being called by * input core with interrupts disabled and dev->event_lock * spinlock held and so it may not sleep. The method must return * number of events passed to it. * @filter: similar to @event; separates normal event handlers from * "filters". * @match: called after comparing device's id with handler's id_table * to perform fine-grained matching between device and handler * @connect: called when attaching a handler to an input device * @disconnect: disconnects a handler from input device * @start: starts handler for given handle. This function is called by * input core right after connect() method and also when a process * that "grabbed" a device releases it * @passive_observer: set to %true by drivers only interested in observing * data stream from devices if there are other users present. Such * drivers will not result in starting underlying hardware device * when input_open_device() is called for their handles * @legacy_minors: set to %true by drivers using legacy minor ranges * @minor: beginning of range of 32 legacy minors for devices this driver * can provide * @name: name of the handler, to be shown in /proc/bus/input/handlers * @id_table: pointer to a table of input_device_ids this driver can * handle * @h_list: list of input handles associated with the handler * @node: for placing the driver onto input_handler_list * * Input handlers attach to input devices and create input handles. There * are likely several handlers attached to any given input device at the * same time. All of them will get their copy of input event generated by * the device. * * The very same structure is used to implement input filters. Input core * allows filters to run first and will not pass event to regular handlers * if any of the filters indicate that the event should be filtered (by * returning %true from their filter() method). * * Note that input core serializes calls to connect() and disconnect() * methods. 
*/ struct input_handler { void *private; void (*event)(struct input_handle *handle, unsigned int type, unsigned int code, int value); unsigned int (*events)(struct input_handle *handle, struct input_value *vals, unsigned int count); bool (*filter)(struct input_handle *handle, unsigned int type, unsigned int code, int value); bool (*match)(struct input_handler *handler, struct input_dev *dev); int (*connect)(struct input_handler *handler, struct input_dev *dev, const struct input_device_id *id); void (*disconnect)(struct input_handle *handle); void (*start)(struct input_handle *handle); bool passive_observer; bool legacy_minors; int minor; const char *name; const struct input_device_id *id_table; struct list_head h_list; struct list_head node; }; /** * struct input_handle - links input device with an input handler * @private: handler-specific data * @open: counter showing whether the handle is 'open', i.e. should deliver * events from its device * @name: name given to the handle by handler that created it * @dev: input device the handle is attached to * @handler: handler that works with the device through this handle * @handle_events: event sequence handler. It is set up by the input core * according to event handling method specified in the @handler. See * input_handle_setup_event_handler(). * This method is being called by the input core with interrupts disabled * and dev->event_lock spinlock held and so it may not sleep. * @d_node: used to put the handle on device's list of attached handles * @h_node: used to put the handle on handler's list of handles from which * it gets events */ struct input_handle { void *private; int open; const char *name; struct input_dev *dev; struct input_handler *handler; unsigned int (*handle_events)(struct input_handle *handle, struct input_value *vals, unsigned int count); struct list_head d_node; struct list_head h_node; }; struct input_dev __must_check *input_allocate_device(void); struct input_dev __must_check *devm_input_allocate_device(struct device *); void input_free_device(struct input_dev *dev); static inline struct input_dev *input_get_device(struct input_dev *dev) { return dev ? 
to_input_dev(get_device(&dev->dev)) : NULL; } static inline void input_put_device(struct input_dev *dev) { if (dev) put_device(&dev->dev); } static inline void *input_get_drvdata(struct input_dev *dev) { return dev_get_drvdata(&dev->dev); } static inline void input_set_drvdata(struct input_dev *dev, void *data) { dev_set_drvdata(&dev->dev, data); } int __must_check input_register_device(struct input_dev *); void input_unregister_device(struct input_dev *); void input_reset_device(struct input_dev *); int input_setup_polling(struct input_dev *dev, void (*poll_fn)(struct input_dev *dev)); void input_set_poll_interval(struct input_dev *dev, unsigned int interval); void input_set_min_poll_interval(struct input_dev *dev, unsigned int interval); void input_set_max_poll_interval(struct input_dev *dev, unsigned int interval); int input_get_poll_interval(struct input_dev *dev); int __must_check input_register_handler(struct input_handler *); void input_unregister_handler(struct input_handler *); int __must_check input_get_new_minor(int legacy_base, unsigned int legacy_num, bool allow_dynamic); void input_free_minor(unsigned int minor); int input_handler_for_each_handle(struct input_handler *, void *data, int (*fn)(struct input_handle *, void *)); int input_register_handle(struct input_handle *); void input_unregister_handle(struct input_handle *); int input_grab_device(struct input_handle *); void input_release_device(struct input_handle *); int input_open_device(struct input_handle *); void input_close_device(struct input_handle *); int input_flush_device(struct input_handle *handle, struct file *file); void input_set_timestamp(struct input_dev *dev, ktime_t timestamp); ktime_t *input_get_timestamp(struct input_dev *dev); void input_event(struct input_dev *dev, unsigned int type, unsigned int code, int value); void input_inject_event(struct input_handle *handle, unsigned int type, unsigned int code, int value); static inline void input_report_key(struct input_dev *dev, unsigned int code, int value) { input_event(dev, EV_KEY, code, !!value); } static inline void input_report_rel(struct input_dev *dev, unsigned int code, int value) { input_event(dev, EV_REL, code, value); } static inline void input_report_abs(struct input_dev *dev, unsigned int code, int value) { input_event(dev, EV_ABS, code, value); } static inline void input_report_ff_status(struct input_dev *dev, unsigned int code, int value) { input_event(dev, EV_FF_STATUS, code, value); } static inline void input_report_switch(struct input_dev *dev, unsigned int code, int value) { input_event(dev, EV_SW, code, !!value); } static inline void input_sync(struct input_dev *dev) { input_event(dev, EV_SYN, SYN_REPORT, 0); } static inline void input_mt_sync(struct input_dev *dev) { input_event(dev, EV_SYN, SYN_MT_REPORT, 0); } void input_set_capability(struct input_dev *dev, unsigned int type, unsigned int code); /** * input_set_events_per_packet - tell handlers about the driver event rate * @dev: the input device used by the driver * @n_events: the average number of events between calls to input_sync() * * If the event rate sent from a device is unusually large, use this * function to set the expected event rate. This will allow handlers * to set up an appropriate buffer size for the event stream, in order * to minimize information loss. 
*/ static inline void input_set_events_per_packet(struct input_dev *dev, int n_events) { dev->hint_events_per_packet = n_events; } void input_alloc_absinfo(struct input_dev *dev); void input_set_abs_params(struct input_dev *dev, unsigned int axis, int min, int max, int fuzz, int flat); void input_copy_abs(struct input_dev *dst, unsigned int dst_axis, const struct input_dev *src, unsigned int src_axis); #define INPUT_GENERATE_ABS_ACCESSORS(_suffix, _item) \ static inline int input_abs_get_##_suffix(struct input_dev *dev, \ unsigned int axis) \ { \ return dev->absinfo ? dev->absinfo[axis]._item : 0; \ } \ \ static inline void input_abs_set_##_suffix(struct input_dev *dev, \ unsigned int axis, int val) \ { \ input_alloc_absinfo(dev); \ if (dev->absinfo) \ dev->absinfo[axis]._item = val; \ } INPUT_GENERATE_ABS_ACCESSORS(val, value) INPUT_GENERATE_ABS_ACCESSORS(min, minimum) INPUT_GENERATE_ABS_ACCESSORS(max, maximum) INPUT_GENERATE_ABS_ACCESSORS(fuzz, fuzz) INPUT_GENERATE_ABS_ACCESSORS(flat, flat) INPUT_GENERATE_ABS_ACCESSORS(res, resolution) int input_scancode_to_scalar(const struct input_keymap_entry *ke, unsigned int *scancode); int input_get_keycode(struct input_dev *dev, struct input_keymap_entry *ke); int input_set_keycode(struct input_dev *dev, const struct input_keymap_entry *ke); bool input_match_device_id(const struct input_dev *dev, const struct input_device_id *id); void input_enable_softrepeat(struct input_dev *dev, int delay, int period); bool input_device_enabled(struct input_dev *dev); extern const struct class input_class; /** * struct ff_device - force-feedback part of an input device * @upload: Called to upload an new effect into device * @erase: Called to erase an effect from device * @playback: Called to request device to start playing specified effect * @set_gain: Called to set specified gain * @set_autocenter: Called to auto-center device * @destroy: called by input core when parent input device is being * destroyed * @private: driver-specific data, will be freed automatically * @ffbit: bitmap of force feedback capabilities truly supported by * device (not emulated like ones in input_dev->ffbit) * @mutex: mutex for serializing access to the device * @max_effects: maximum number of effects supported by device * @effects: pointer to an array of effects currently loaded into device * @effect_owners: array of effect owners; when file handle owning * an effect gets closed the effect is automatically erased * * Every force-feedback device must implement upload() and playback() * methods; erase() is optional. set_gain() and set_autocenter() need * only be implemented if driver sets up FF_GAIN and FF_AUTOCENTER * bits. * * Note that playback(), set_gain() and set_autocenter() are called with * dev->event_lock spinlock held and interrupts off and thus may not * sleep. 
 */
struct ff_device {
	int (*upload)(struct input_dev *dev, struct ff_effect *effect,
		      struct ff_effect *old);
	int (*erase)(struct input_dev *dev, int effect_id);

	int (*playback)(struct input_dev *dev, int effect_id, int value);
	void (*set_gain)(struct input_dev *dev, u16 gain);
	void (*set_autocenter)(struct input_dev *dev, u16 magnitude);

	void (*destroy)(struct ff_device *);

	void *private;

	unsigned long ffbit[BITS_TO_LONGS(FF_CNT)];

	struct mutex mutex;

	int max_effects;
	struct ff_effect *effects;
	struct file *effect_owners[] __counted_by(max_effects);
};

int input_ff_create(struct input_dev *dev, unsigned int max_effects);
void input_ff_destroy(struct input_dev *dev);

int input_ff_event(struct input_dev *dev, unsigned int type, unsigned int code, int value);

int input_ff_upload(struct input_dev *dev, struct ff_effect *effect, struct file *file);
int input_ff_erase(struct input_dev *dev, int effect_id, struct file *file);
int input_ff_flush(struct input_dev *dev, struct file *file);

int input_ff_create_memless(struct input_dev *dev, void *data,
		int (*play_effect)(struct input_dev *, void *, struct ff_effect *));

#endif
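Putting the declarations above together, a driver allocates a device, declares its capabilities, registers it, and then feeds events through the input_report_*() helpers followed by input_sync(). The module sketch below (not part of input.h) registers a hypothetical one-button device; the example_* identifiers, the device name and the choice of KEY_ENTER/BUS_VIRTUAL are assumptions made for the illustration, and a real driver would call example_report() from an interrupt or polling handler.

/*
 * Usage sketch for the input device API. All example_* names are
 * hypothetical.
 */
#include <linux/input.h>
#include <linux/module.h>

static struct input_dev *example_dev;

/* Report a press/release of the single button and flush the event frame. */
static void __maybe_unused example_report(bool pressed)
{
	input_report_key(example_dev, KEY_ENTER, pressed);
	input_sync(example_dev);
}

static int __init example_init(void)
{
	int error;

	example_dev = input_allocate_device();
	if (!example_dev)
		return -ENOMEM;

	example_dev->name = "Example Button";
	example_dev->phys = "example/input0";
	example_dev->id.bustype = BUS_VIRTUAL;

	/* Declares EV_KEY/KEY_ENTER in evbit/keybit for us. */
	input_set_capability(example_dev, EV_KEY, KEY_ENTER);

	error = input_register_device(example_dev);
	if (error) {
		/* Only unregistered devices may be freed directly. */
		input_free_device(example_dev);
		return error;
	}

	return 0;
}

static void __exit example_exit(void)
{
	/* Unregistering drops the reference; the device is freed afterwards. */
	input_unregister_device(example_dev);
}

module_init(example_init);
module_exit(example_exit);
MODULE_DESCRIPTION("input core usage sketch");
MODULE_LICENSE("GPL");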
// SPDX-License-Identifier: GPL-2.0-only
/*
 * kernel/workqueue.c - generic async execution with shared worker pool
 *
 * Copyright (C) 2002		Ingo Molnar
 *
 *   Derived from the taskqueue/keventd code by:
 *     David Woodhouse <dwmw2@infradead.org>
 *     Andrew Morton
 *     Kai Petzke <wpp@marie.physik.tu-berlin.de>
 *     Theodore Ts'o <tytso@mit.edu>
 *
 * Made to use alloc_percpu by Christoph Lameter.
 *
 * Copyright (C) 2010		SUSE Linux Products GmbH
 * Copyright (C) 2010		Tejun Heo <tj@kernel.org>
 *
 * This is the generic async execution mechanism.  Work items are
 * executed in process context.  The worker pool is shared and
 * automatically managed.  There are two worker pools for each CPU (one for
 * normal work items and the other for high priority ones) and some extra
 * pools for workqueues which are not bound to any specific CPU - the
 * number of these backing pools is dynamic.
 *
 * Please read Documentation/core-api/workqueue.rst for details.
 */

#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/signal.h>
#include <linux/completion.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kthread.h>
#include <linux/hardirq.h>
#include <linux/mempolicy.h>
#include <linux/freezer.h>
#include <linux/debug_locks.h>
#include <linux/device/devres.h>
#include <linux/lockdep.h>
#include <linux/idr.h>
#include <linux/jhash.h>
#include <linux/hashtable.h>
#include <linux/rculist.h>
#include <linux/nodemask.h>
#include <linux/moduleparam.h>
#include <linux/uaccess.h>
#include <linux/sched/isolation.h>
#include <linux/sched/debug.h>
#include <linux/nmi.h>
#include <linux/kvm_para.h>
#include <linux/delay.h>
#include <linux/irq_work.h>

#include "workqueue_internal.h"

enum worker_pool_flags {
	/*
	 * worker_pool flags
	 *
	 * A bound pool is either associated or disassociated with its CPU.
	 * While associated (!DISASSOCIATED), all workers are bound to the
	 * CPU and none has %WORKER_UNBOUND set and concurrency management
	 * is in effect.
	 *
	 * While DISASSOCIATED, the cpu may be offline and all workers have
	 * %WORKER_UNBOUND set and concurrency management disabled, and may
	 * be executing on any CPU.  The pool behaves as an unbound one.
	 *
	 * Note that DISASSOCIATED should be flipped only while holding
	 * wq_pool_attach_mutex to avoid changing binding state while
	 * worker_attach_to_pool() is in progress.
	 *
	 * As there can only be one concurrent BH execution context per CPU, a
	 * BH pool is per-CPU and always DISASSOCIATED.
	 */
	POOL_BH			= 1 << 0,	/* is a BH pool */
	POOL_MANAGER_ACTIVE	= 1 << 1,	/* being managed */
	POOL_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */
	POOL_BH_DRAINING	= 1 << 3,	/* draining after CPU offline */
};

enum worker_flags {
	/* worker flags */
	WORKER_DIE		= 1 << 1,	/* die die die */
	WORKER_IDLE		= 1 << 2,	/* is idle */
	WORKER_PREP		= 1 << 3,	/* preparing to run works */
	WORKER_CPU_INTENSIVE	= 1 << 6,	/* cpu intensive */
	WORKER_UNBOUND		= 1 << 7,	/* worker is unbound */
	WORKER_REBOUND		= 1 << 8,	/* worker was rebound */

	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_CPU_INTENSIVE |
				  WORKER_UNBOUND | WORKER_REBOUND,
};

enum work_cancel_flags {
	WORK_CANCEL_DELAYED	= 1 << 0,	/* canceling a delayed_work */
	WORK_CANCEL_DISABLE	= 1 << 1,	/* canceling to disable */
};

enum wq_internal_consts {
	NR_STD_WORKER_POOLS	= 2,		/* # standard pools per cpu */

	UNBOUND_POOL_HASH_ORDER	= 6,		/* hashed by pool->attrs */
	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */

	MAX_IDLE_WORKERS_RATIO	= 4,		/* 1/4 of busy can be idle */
	IDLE_WORKER_TIMEOUT	= 300 * HZ,	/* keep idle ones for 5 mins */

	MAYDAY_INITIAL_TIMEOUT	= HZ / 100 >= 2 ? HZ / 100 : 2,
						/* call for help after 10ms
						   (min two ticks) */
	MAYDAY_INTERVAL		= HZ / 10,	/* and then every 100ms */
	CREATE_COOLDOWN		= HZ,		/* time to breathe after fail */

	RESCUER_BATCH		= 16,		/* process items per turn */

	/*
	 * Rescue workers are used only on emergencies and shared by
	 * all cpus.  Give MIN_NICE.
	 */
	RESCUER_NICE_LEVEL	= MIN_NICE,
	HIGHPRI_NICE_LEVEL	= MIN_NICE,

	WQ_NAME_LEN		= 32,
	WORKER_ID_LEN		= 10 + WQ_NAME_LEN, /* "kworker/R-" + WQ_NAME_LEN */
};

/* Layout of shards within one LLC pod */
struct llc_shard_layout {
	int nr_large_shards;	/* number of large shards (cores_per_shard + 1) */
	int cores_per_shard;	/* base number of cores per default shard */
	int nr_shards;		/* total number of shards */
	/* nr_default shards = (nr_shards - nr_large_shards) */
};

/*
 * We don't want to trap softirq for too long. See MAX_SOFTIRQ_TIME and
 * MAX_SOFTIRQ_RESTART in kernel/softirq.c. These are macros because
 * msecs_to_jiffies() can't be an initializer.
 */
#define BH_WORKER_JIFFIES	msecs_to_jiffies(2)
#define BH_WORKER_RESTARTS	10

/*
 * Structure fields follow one of the following exclusion rules.
 *
 * I: Modifiable by initialization/destruction paths and read-only for
 *    everyone else.
 *
 * P: Preemption protected.  Disabling preemption is enough and should
 *    only be modified and accessed from the local cpu.
 *
 * L: pool->lock protected.  Access with pool->lock held.
 *
 * LN: pool->lock and wq_node_nr_active->lock protected for writes. Either for
 *     reads.
 *
 * K: Only modified by worker while holding pool->lock. Can be safely read by
 *    self, while holding pool->lock or from IRQ context if %current is the
 *    kworker.
 *
 * S: Only modified by worker self.
 *
 * A: wq_pool_attach_mutex protected.
 *
 * PL: wq_pool_mutex protected.
 *
 * PR: wq_pool_mutex protected for writes.  RCU protected for reads.
 *
 * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
 *
 * PWR: wq_pool_mutex and wq->mutex protected for writes.  Either or
 *      RCU for reads.
 *
 * WQ: wq->mutex protected.
 *
 * WR: wq->mutex protected for writes.  RCU protected for reads.
 *
 * WO: wq->mutex protected for writes. Updated with WRITE_ONCE() and can be read
 *     with READ_ONCE() without locking.
 *
 * MD: wq_mayday_lock protected.
 *
 * WD: Used internally by the watchdog.
 */

/* struct worker is defined in workqueue_internal.h */

struct worker_pool {
	raw_spinlock_t		lock;		/* the pool lock */
	int			cpu;		/* I: the associated cpu */
	int			node;		/* I: the associated node ID */
	int			id;		/* I: pool ID */
	unsigned int		flags;		/* L: flags */
	unsigned long		last_progress_ts; /* L: last forward progress timestamp */

	bool			cpu_stall;	/* WD: stalled cpu bound pool */

	/*
	 * The counter is incremented in a process context on the associated CPU
	 * w/ preemption disabled, and decremented or reset in the same context
	 * but w/ pool->lock held. The readers grab pool->lock and are
	 * guaranteed to see if the counter reached zero.
	 */
	int			nr_running;

	struct list_head	worklist;	/* L: list of pending works */

	int			nr_workers;	/* L: total number of workers */
	int			nr_idle;	/* L: currently idle workers */

	struct list_head	idle_list;	/* L: list of idle workers */
	struct timer_list	idle_timer;	/* L: worker idle timeout */
	struct work_struct	idle_cull_work;	/* L: worker idle cleanup */

	struct timer_list	mayday_timer;	/* L: SOS timer for workers */

	/* a worker is either on busy_hash or idle_list, or the manager */
	DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
						/* L: hash of busy workers */

	struct worker		*manager;	/* L: purely informational */
	struct list_head	workers;	/* A: attached workers */

	struct ida		worker_ida;	/* worker IDs for task name */

	struct workqueue_attrs	*attrs;		/* I: worker attributes */
	struct hlist_node	hash_node;	/* PL: unbound_pool_hash node */
	int			refcnt;		/* PL: refcnt for unbound pools */

#ifdef CONFIG_PREEMPT_RT
	spinlock_t		cb_lock;	/* BH worker cancel lock */
#endif

	/*
	 * Destruction of pool is RCU protected to allow dereferences
	 * from get_work_pool().
	 */
	struct rcu_head		rcu;
};

/*
 * Per-pool_workqueue statistics. These can be monitored using
 * tools/workqueue/wq_monitor.py.
 */
enum pool_workqueue_stats {
	PWQ_STAT_STARTED,	/* work items started execution */
	PWQ_STAT_COMPLETED,	/* work items completed execution */
	PWQ_STAT_CPU_TIME,	/* total CPU time consumed */
	PWQ_STAT_CPU_INTENSIVE,	/* wq_cpu_intensive_thresh_us violations */
	PWQ_STAT_CM_WAKEUP,	/* concurrency-management worker wakeups */
	PWQ_STAT_REPATRIATED,	/* unbound workers brought back into scope */
	PWQ_STAT_MAYDAY,	/* maydays to rescuer */
	PWQ_STAT_RESCUED,	/* linked work items executed by rescuer */

	PWQ_NR_STATS,
};

/*
 * The per-pool workqueue.  While queued, bits below WORK_PWQ_SHIFT
 * of work_struct->data are used for flags and the remaining high bits
 * point to the pwq; thus, pwqs need to be aligned at two's power of the
 * number of flag bits.
 */
struct pool_workqueue {
	struct worker_pool	*pool;		/* I: the associated pool */
	struct workqueue_struct	*wq;		/* I: the owning workqueue */
	int			work_color;	/* L: current color */
	int			flush_color;	/* L: flushing color */
	int			refcnt;		/* L: reference count */
	int			nr_in_flight[WORK_NR_COLORS];
						/* L: nr of in_flight works */
	bool			plugged;	/* L: execution suspended */

	/*
	 * nr_active management and WORK_STRUCT_INACTIVE:
	 *
	 * When pwq->nr_active >= max_active, new work item is queued to
	 * pwq->inactive_works instead of pool->worklist and marked with
	 * WORK_STRUCT_INACTIVE.
	 *
	 * All work items marked with WORK_STRUCT_INACTIVE do not participate in
	 * nr_active and all work items in pwq->inactive_works are marked with
	 * WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE work items are
	 * in pwq->inactive_works. Some of them are ready to run in
	 * pool->worklist or worker->scheduled. Those work items are only struct
	 * wq_barrier which is used for flush_work() and should not participate
	 * in nr_active. For non-barrier work item, it is marked with
	 * WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
	 */
	int			nr_active;	/* L: nr of active works */
	struct list_head	inactive_works;	/* L: inactive works */
	struct list_head	pending_node;	/* LN: node on wq_node_nr_active->pending_pwqs */
	struct list_head	pwqs_node;	/* WR: node on wq->pwqs */
	struct list_head	mayday_node;	/* MD: node on wq->maydays */
	struct work_struct	mayday_cursor;	/* L: cursor on pool->worklist */

	u64			stats[PWQ_NR_STATS];

	/*
	 * Release of unbound pwq is punted to a kthread_worker. See put_pwq()
	 * and pwq_release_workfn() for details.
	 * pool_workqueue itself is also RCU protected so that the first pwq
	 * can be determined without grabbing wq->mutex.
	 */
	struct kthread_work	release_work;
	struct rcu_head		rcu;
} __aligned(1 << WORK_STRUCT_PWQ_SHIFT);

/*
 * Structure used to wait for workqueue flush.
 */
struct wq_flusher {
	struct list_head	list;		/* WQ: list of flushers */
	int			flush_color;	/* WQ: flush color waiting for */
	struct completion	done;		/* flush completion */
};

struct wq_device;

/*
 * Unlike in a per-cpu workqueue where max_active limits its concurrency level
 * on each CPU, in an unbound workqueue, max_active applies to the whole system.
 * As sharing a single nr_active across multiple sockets can be very expensive,
 * the counting and enforcement is per NUMA node.
 *
 * The following struct is used to enforce per-node max_active. When a pwq wants
 * to start executing a work item, it should increment ->nr using
 * tryinc_node_nr_active(). If acquisition fails due to ->nr already being over
 * ->max, the pwq is queued on ->pending_pwqs. As in-flight work items finish
 * and decrement ->nr, node_activate_pending_pwq() activates the pending pwqs in
 * round-robin order.
 */
struct wq_node_nr_active {
	int			max;		/* per-node max_active */
	atomic_t		nr;		/* per-node nr_active */
	raw_spinlock_t		lock;		/* nests inside pool locks */
	struct list_head	pending_pwqs;	/* LN: pwqs with inactive works */
};

/*
 * The externally visible workqueue.  It relays the issued work items to
 * the appropriate worker_pool through its pool_workqueues.
 */
struct workqueue_struct {
	struct list_head	pwqs;		/* WR: all pwqs of this wq */
	struct list_head	list;		/* PR: list of all workqueues */

	struct mutex		mutex;		/* protects this wq */
	int			work_color;	/* WQ: current work color */
	int			flush_color;	/* WQ: current flush color */
	atomic_t		nr_pwqs_to_flush; /* flush in progress */
	struct wq_flusher	*first_flusher;	/* WQ: first flusher */
	struct list_head	flusher_queue;	/* WQ: flush waiters */
	struct list_head	flusher_overflow; /* WQ: flush overflow list */

	struct list_head	maydays;	/* MD: pwqs requesting rescue */
	struct worker		*rescuer;	/* MD: rescue worker */

	int			nr_drainers;	/* WQ: drain in progress */

	/* See alloc_workqueue() function comment for info on min/max_active */
	int			max_active;	/* WO: max active works */
	int			min_active;	/* WO: min active works */
	int			saved_max_active; /* WQ: saved max_active */
	int			saved_min_active; /* WQ: saved min_active */

	struct workqueue_attrs	*unbound_attrs;	/* PW: only for unbound wqs */
	struct pool_workqueue __rcu *dfl_pwq;	/* PW: only for unbound wqs */

#ifdef CONFIG_SYSFS
	struct wq_device	*wq_dev;	/* I: for sysfs interface */
#endif
#ifdef CONFIG_LOCKDEP
	char			*lock_name;
	struct lock_class_key	key;
	struct lockdep_map	__lockdep_map;
	struct lockdep_map	*lockdep_map;
#endif
	char			name[WQ_NAME_LEN]; /* I: workqueue name */

	/*
	 * Destruction of workqueue_struct is RCU protected to allow walking
	 * the workqueues list without grabbing wq_pool_mutex.
	 * This is used to dump all workqueues from sysrq.
	 */
	struct rcu_head		rcu;

	/* hot fields used during command issue, aligned to cacheline */
	unsigned int		flags ____cacheline_aligned; /* WQ: WQ_* flags */
	struct pool_workqueue __rcu * __percpu *cpu_pwq; /* I: per-cpu pwqs */
	struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */
};

/*
 * Each pod type describes how CPUs should be grouped for unbound workqueues.
 * See the comment above workqueue_attrs->affn_scope.
 */
struct wq_pod_type {
	int			nr_pods;	/* number of pods */
	cpumask_var_t		*pod_cpus;	/* pod -> cpus */
	int			*pod_node;	/* pod -> node */
	int			*cpu_pod;	/* cpu -> pod */
};

struct work_offq_data {
	u32			pool_id;
	u32			disable;
	u32			flags;
};

static const char * const wq_affn_names[WQ_AFFN_NR_TYPES] = {
	[WQ_AFFN_DFL]		= "default",
	[WQ_AFFN_CPU]		= "cpu",
	[WQ_AFFN_SMT]		= "smt",
	[WQ_AFFN_CACHE]		= "cache",
	[WQ_AFFN_CACHE_SHARD]	= "cache_shard",
	[WQ_AFFN_NUMA]		= "numa",
	[WQ_AFFN_SYSTEM]	= "system",
};

/*
 * Per-cpu work items which run for longer than the following threshold are
 * automatically considered CPU intensive and excluded from concurrency
 * management to prevent them from noticeably delaying other per-cpu work items.
 * ULONG_MAX indicates that the user hasn't overridden it with a boot parameter.
 * The actual value is initialized in wq_cpu_intensive_thresh_init().
 */
static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX;
module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);
#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT
static unsigned int wq_cpu_intensive_warning_thresh = 4;
module_param_named(cpu_intensive_warning_thresh, wq_cpu_intensive_warning_thresh, uint, 0644);
#endif

/* see the comment above the definition of WQ_POWER_EFFICIENT */
static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
module_param_named(power_efficient, wq_power_efficient, bool, 0444);

static unsigned int wq_cache_shard_size = 8;
module_param_named(cache_shard_size, wq_cache_shard_size, uint, 0444);

static bool wq_online;			/* can kworkers be created yet? */
static bool wq_topo_initialized __read_mostly = false;

static struct kmem_cache *pwq_cache;

static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES];
static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE_SHARD;

/* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */
static struct workqueue_attrs *unbound_wq_update_pwq_attrs_buf;

static DEFINE_MUTEX(wq_pool_mutex);	/* protects pools and workqueues list */
static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
static DEFINE_RAW_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
/* wait for manager to go away */
static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);

static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
static bool workqueue_freezing;		/* PL: have wqs started freezing? */

/* PL: mirror the cpu_online_mask excluding the CPU in the midst of hotplugging */
static cpumask_var_t wq_online_cpumask;

/* PL&A: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;

/* PL: user requested unbound cpumask via sysfs */
static cpumask_var_t wq_requested_unbound_cpumask;

/* PL: isolated cpumask to be excluded from unbound cpumask */
static cpumask_var_t wq_isolated_cpumask;

/* to further constrain wq_unbound_cpumask by cmdline parameter */
static struct cpumask wq_cmdline_cpumask __initdata;

/* CPU where unbound work was last round robin scheduled from this CPU */
static DEFINE_PER_CPU(int, wq_rr_cpu_last);

/*
 * Local execution of unbound work items is no longer guaranteed.  The
 * following always forces round-robin CPU selection on unbound work items
 * to uncover usages which depend on it.
 */
#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU
static bool wq_debug_force_rr_cpu = true;
#else
static bool wq_debug_force_rr_cpu = false;
#endif
module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);

/* to raise softirq for the BH worker pools on other CPUs */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_work [NR_STD_WORKER_POOLS],
				     bh_pool_irq_works);

/* the BH worker pools */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
				     bh_worker_pools);

/* the per-cpu worker pools */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
				     cpu_worker_pools);

static DEFINE_IDR(worker_pool_idr);	/* PR: idr of all pools */

/* PL: hash of all unbound pools keyed by pool->attrs */
static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);

/* I: attributes used when instantiating standard unbound pools on demand */
static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];

/* I: attributes used when instantiating ordered pools on demand */
static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];

/*
 * I: kthread_worker to release pwq's. pwq release needs to be bounced to a
 * process context while holding a pool lock. Bounce to a dedicated kthread
 * worker to avoid A-A deadlocks.
 */
static struct kthread_worker *pwq_release_worker __ro_after_init;

struct workqueue_struct *system_wq __ro_after_init;
EXPORT_SYMBOL(system_wq);
struct workqueue_struct *system_percpu_wq __ro_after_init;
EXPORT_SYMBOL(system_percpu_wq);
struct workqueue_struct *system_highpri_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_highpri_wq);
struct workqueue_struct *system_long_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_long_wq);
struct workqueue_struct *system_unbound_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_unbound_wq);
struct workqueue_struct *system_dfl_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_dfl_wq);
struct workqueue_struct *system_freezable_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_freezable_wq);
struct workqueue_struct *system_power_efficient_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_power_efficient_wq);
struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
struct workqueue_struct *system_bh_wq;
EXPORT_SYMBOL_GPL(system_bh_wq);
struct workqueue_struct *system_bh_highpri_wq;
EXPORT_SYMBOL_GPL(system_bh_highpri_wq);
struct workqueue_struct *system_dfl_long_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_dfl_long_wq);

static int worker_thread(void *__worker);
static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
static void show_pwq(struct pool_workqueue *pwq);
static void show_one_worker_pool(struct worker_pool *pool);

#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>

#define assert_rcu_or_pool_mutex()					\
	RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() &&			\
			 !lockdep_is_held(&wq_pool_mutex),		\
			 "RCU or wq_pool_mutex should be held")

#define for_each_bh_worker_pool(pool, cpu)				\
	for ((pool) = &per_cpu(bh_worker_pools, cpu)[0];		\
	     (pool) < &per_cpu(bh_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
	     (pool)++)

#define for_each_cpu_worker_pool(pool, cpu)				\
	for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];		\
	     (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
	     (pool)++)

/**
 * for_each_pool - iterate through all worker_pools in the system
 * @pool: iteration cursor
 * @pi: integer used for iteration
 *
 * This must be called either with wq_pool_mutex held or RCU read
 * locked.
If the pool needs to be used beyond the locking in effect, the * caller is responsible for guaranteeing that the pool stays online. * * The if/else clause exists only for the lockdep assertion and can be * ignored. */ #define for_each_pool(pool, pi) \ idr_for_each_entry(&worker_pool_idr, pool, pi) \ if (({ assert_rcu_or_pool_mutex(); false; })) { } \ else /** * for_each_pool_worker - iterate through all workers of a worker_pool * @worker: iteration cursor * @pool: worker_pool to iterate workers of * * This must be called with wq_pool_attach_mutex. * * The if/else clause exists only for the lockdep assertion and can be * ignored. */ #define for_each_pool_worker(worker, pool) \ list_for_each_entry((worker), &(pool)->workers, node) \ if (({ lockdep_assert_held(&wq_pool_attach_mutex); false; })) { } \ else /** * for_each_pwq - iterate through all pool_workqueues of the specified workqueue * @pwq: iteration cursor * @wq: the target workqueue * * This must be called either with wq->mutex held or RCU read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. * * The if/else clause exists only for the lockdep assertion and can be * ignored. */ #define for_each_pwq(pwq, wq) \ list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node, \ lockdep_is_held(&(wq->mutex))) #ifdef CONFIG_DEBUG_OBJECTS_WORK static const struct debug_obj_descr work_debug_descr; static void *work_debug_hint(void *addr) { return ((struct work_struct *) addr)->func; } static bool work_is_static_object(void *addr) { struct work_struct *work = addr; return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work)); } /* * fixup_init is called when: * - an active object is initialized */ static bool work_fixup_init(void *addr, enum debug_obj_state state) { struct work_struct *work = addr; switch (state) { case ODEBUG_STATE_ACTIVE: cancel_work_sync(work); debug_object_init(work, &work_debug_descr); return true; default: return false; } } /* * fixup_free is called when: * - an active object is freed */ static bool work_fixup_free(void *addr, enum debug_obj_state state) { struct work_struct *work = addr; switch (state) { case ODEBUG_STATE_ACTIVE: cancel_work_sync(work); debug_object_free(work, &work_debug_descr); return true; default: return false; } } static const struct debug_obj_descr work_debug_descr = { .name = "work_struct", .debug_hint = work_debug_hint, .is_static_object = work_is_static_object, .fixup_init = work_fixup_init, .fixup_free = work_fixup_free, }; static inline void debug_work_activate(struct work_struct *work) { debug_object_activate(work, &work_debug_descr); } static inline void debug_work_deactivate(struct work_struct *work) { debug_object_deactivate(work, &work_debug_descr); } void __init_work(struct work_struct *work, int onstack) { if (onstack) debug_object_init_on_stack(work, &work_debug_descr); else debug_object_init(work, &work_debug_descr); } EXPORT_SYMBOL_GPL(__init_work); void destroy_work_on_stack(struct work_struct *work) { debug_object_free(work, &work_debug_descr); } EXPORT_SYMBOL_GPL(destroy_work_on_stack); void destroy_delayed_work_on_stack(struct delayed_work *work) { timer_destroy_on_stack(&work->timer); debug_object_free(&work->work, &work_debug_descr); } EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack); #else static inline void debug_work_activate(struct work_struct *work) { } static inline void debug_work_deactivate(struct work_struct *work) { } #endif /** * worker_pool_assign_id - allocate ID and assign it to 
@pool * @pool: the pool pointer of interest * * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned * successfully, -errno on failure. */ static int worker_pool_assign_id(struct worker_pool *pool) { int ret; lockdep_assert_held(&wq_pool_mutex); ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE, GFP_KERNEL); if (ret >= 0) { pool->id = ret; return 0; } return ret; } static struct pool_workqueue __rcu ** unbound_pwq_slot(struct workqueue_struct *wq, int cpu) { if (cpu >= 0) return per_cpu_ptr(wq->cpu_pwq, cpu); else return &wq->dfl_pwq; } /* @cpu < 0 for dfl_pwq */ static struct pool_workqueue *unbound_pwq(struct workqueue_struct *wq, int cpu) { return rcu_dereference_check(*unbound_pwq_slot(wq, cpu), lockdep_is_held(&wq_pool_mutex) || lockdep_is_held(&wq->mutex)); } /** * unbound_effective_cpumask - effective cpumask of an unbound workqueue * @wq: workqueue of interest * * @wq->unbound_attrs->cpumask contains the cpumask requested by the user which * is masked with wq_unbound_cpumask to determine the effective cpumask. The * default pwq is always mapped to the pool with the current effective cpumask. */ static struct cpumask *unbound_effective_cpumask(struct workqueue_struct *wq) { return unbound_pwq(wq, -1)->pool->attrs->__pod_cpumask; } static unsigned int work_color_to_flags(int color) { return color << WORK_STRUCT_COLOR_SHIFT; } static int get_work_color(unsigned long work_data) { return (work_data >> WORK_STRUCT_COLOR_SHIFT) & ((1 << WORK_STRUCT_COLOR_BITS) - 1); } static int work_next_color(int color) { return (color + 1) % WORK_NR_COLORS; } static unsigned long pool_offq_flags(struct worker_pool *pool) { return (pool->flags & POOL_BH) ? WORK_OFFQ_BH : 0; } /* * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data * contain the pointer to the queued pwq. Once execution starts, the flag * is cleared and the high bits contain OFFQ flags and pool ID. * * set_work_pwq(), set_work_pool_and_clear_pending() and mark_work_canceling() * can be used to set the pwq, pool or clear work->data. These functions should * only be called while the work is owned - ie. while the PENDING bit is set. * * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq * corresponding to a work. Pool is available once the work has been * queued anywhere after initialization until it is sync canceled. pwq is * available only while the work item is queued. */ static inline void set_work_data(struct work_struct *work, unsigned long data) { WARN_ON_ONCE(!work_pending(work)); atomic_long_set(&work->data, data | work_static(work)); } static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq, unsigned long flags) { set_work_data(work, (unsigned long)pwq | WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | flags); } static void set_work_pool_and_keep_pending(struct work_struct *work, int pool_id, unsigned long flags) { set_work_data(work, ((unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT) | WORK_STRUCT_PENDING | flags); } static void set_work_pool_and_clear_pending(struct work_struct *work, int pool_id, unsigned long flags) { /* * The following wmb is paired with the implied mb in * test_and_set_bit(PENDING) and ensures all updates to @work made * here are visible to and precede any updates by the next PENDING * owner. 
*/ smp_wmb(); set_work_data(work, ((unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT) | flags); /* * The following mb guarantees that previous clear of a PENDING bit * will not be reordered with any speculative LOADS or STORES from * work->current_func, which is executed afterwards. This possible * reordering can lead to a missed execution on attempt to queue * the same @work. E.g. consider this case: * * CPU#0 CPU#1 * ---------------------------- -------------------------------- * * 1 STORE event_indicated * 2 queue_work_on() { * 3 test_and_set_bit(PENDING) * 4 } set_..._and_clear_pending() { * 5 set_work_data() # clear bit * 6 smp_mb() * 7 work->current_func() { * 8 LOAD event_indicated * } * * Without an explicit full barrier speculative LOAD on line 8 can * be executed before CPU#0 does STORE on line 1. If that happens, * CPU#0 observes the PENDING bit is still set and new execution of * a @work is not queued in a hope, that CPU#1 will eventually * finish the queued @work. Meanwhile CPU#1 does not see * event_indicated is set, because speculative LOAD was executed * before actual STORE. */ smp_mb(); } static inline struct pool_workqueue *work_struct_pwq(unsigned long data) { return (struct pool_workqueue *)(data & WORK_STRUCT_PWQ_MASK); } static struct pool_workqueue *get_work_pwq(struct work_struct *work) { unsigned long data = atomic_long_read(&work->data); if (data & WORK_STRUCT_PWQ) return work_struct_pwq(data); else return NULL; } /** * get_work_pool - return the worker_pool a given work was associated with * @work: the work item of interest * * Pools are created and destroyed under wq_pool_mutex, and allows read * access under RCU read lock. As such, this function should be * called under wq_pool_mutex or inside of a rcu_read_lock() region. * * All fields of the returned pool are accessible as long as the above * mentioned locking is in effect. If the returned pool needs to be used * beyond the critical section, the caller is responsible for ensuring the * returned pool is and stays online. * * Return: The worker_pool @work was last associated with. %NULL if none. */ static struct worker_pool *get_work_pool(struct work_struct *work) { unsigned long data = atomic_long_read(&work->data); int pool_id; assert_rcu_or_pool_mutex(); if (data & WORK_STRUCT_PWQ) return work_struct_pwq(data)->pool; pool_id = data >> WORK_OFFQ_POOL_SHIFT; if (pool_id == WORK_OFFQ_POOL_NONE) return NULL; return idr_find(&worker_pool_idr, pool_id); } static unsigned long shift_and_mask(unsigned long v, u32 shift, u32 bits) { return (v >> shift) & ((1U << bits) - 1); } static void work_offqd_unpack(struct work_offq_data *offqd, unsigned long data) { WARN_ON_ONCE(data & WORK_STRUCT_PWQ); offqd->pool_id = shift_and_mask(data, WORK_OFFQ_POOL_SHIFT, WORK_OFFQ_POOL_BITS); offqd->disable = shift_and_mask(data, WORK_OFFQ_DISABLE_SHIFT, WORK_OFFQ_DISABLE_BITS); offqd->flags = data & WORK_OFFQ_FLAG_MASK; } static unsigned long work_offqd_pack_flags(struct work_offq_data *offqd) { return ((unsigned long)offqd->disable << WORK_OFFQ_DISABLE_SHIFT) | ((unsigned long)offqd->flags); } /* * Policy functions. These define the policies on how the global worker * pools are managed. Unless noted otherwise, these functions assume that * they're being called with pool->lock held. */ /* * Need to wake up a worker? Called from anything but currently * running workers. * * Note that, because unbound workers never contribute to nr_running, this * function will always return %true for unbound pools as long as the * worklist isn't empty. 
*/ static bool need_more_worker(struct worker_pool *pool) { return !list_empty(&pool->worklist) && !pool->nr_running; } /* Can I start working? Called from busy but !running workers. */ static bool may_start_working(struct worker_pool *pool) { return pool->nr_idle; } /* Do I need to keep working? Called from currently running workers. */ static bool keep_working(struct worker_pool *pool) { return !list_empty(&pool->worklist) && (pool->nr_running <= 1); } /* Do we need a new worker? Called from manager. */ static bool need_to_create_worker(struct worker_pool *pool) { return need_more_worker(pool) && !may_start_working(pool); } /* Do we have too many workers and should some go away? */ static bool too_many_workers(struct worker_pool *pool) { bool managing = pool->flags & POOL_MANAGER_ACTIVE; int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } /** * worker_set_flags - set worker flags and adjust nr_running accordingly * @worker: self * @flags: flags to set * * Set @flags in @worker->flags and adjust nr_running accordingly. */ static inline void worker_set_flags(struct worker *worker, unsigned int flags) { struct worker_pool *pool = worker->pool; lockdep_assert_held(&pool->lock); /* If transitioning into NOT_RUNNING, adjust nr_running. */ if ((flags & WORKER_NOT_RUNNING) && !(worker->flags & WORKER_NOT_RUNNING)) { pool->nr_running--; } worker->flags |= flags; } /** * worker_clr_flags - clear worker flags and adjust nr_running accordingly * @worker: self * @flags: flags to clear * * Clear @flags in @worker->flags and adjust nr_running accordingly. */ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) { struct worker_pool *pool = worker->pool; unsigned int oflags = worker->flags; lockdep_assert_held(&pool->lock); worker->flags &= ~flags; /* * If transitioning out of NOT_RUNNING, increment nr_running. Note * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask * of multiple flags, not a single flag. */ if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) if (!(worker->flags & WORKER_NOT_RUNNING)) pool->nr_running++; } /* Return the first idle worker. Called with pool->lock held. */ static struct worker *first_idle_worker(struct worker_pool *pool) { if (unlikely(list_empty(&pool->idle_list))) return NULL; return list_first_entry(&pool->idle_list, struct worker, entry); } /** * worker_enter_idle - enter idle state * @worker: worker which is entering idle state * * @worker is entering idle state. Update stats and idle timer if * necessary. * * LOCKING: * raw_spin_lock_irq(pool->lock). */ static void worker_enter_idle(struct worker *worker) { struct worker_pool *pool = worker->pool; if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) || WARN_ON_ONCE(!list_empty(&worker->entry) && (worker->hentry.next || worker->hentry.pprev))) return; /* can't use worker_set_flags(), also called from create_worker() */ worker->flags |= WORKER_IDLE; pool->nr_idle++; worker->last_active = jiffies; /* idle_list is LIFO */ list_add(&worker->entry, &pool->idle_list); if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); /* Sanity check nr_running. */ WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running); } /** * worker_leave_idle - leave idle state * @worker: worker which is leaving idle state * * @worker is leaving idle state. Update stats. 
* * LOCKING: * raw_spin_lock_irq(pool->lock). */ static void worker_leave_idle(struct worker *worker) { struct worker_pool *pool = worker->pool; if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE))) return; worker_clr_flags(worker, WORKER_IDLE); pool->nr_idle--; list_del_init(&worker->entry); } /** * find_worker_executing_work - find worker which is executing a work * @pool: pool of interest * @work: work to find worker for * * Find a worker which is executing @work on @pool by searching * @pool->busy_hash which is keyed by the address of @work. For a worker * to match, its current execution should match the address of @work and * its work function. This is to avoid unwanted dependency between * unrelated work executions through a work item being recycled while still * being executed. * * This is a bit tricky. A work item may be freed once its execution * starts and nothing prevents the freed area from being recycled for * another work item. If the same work item address ends up being reused * before the original execution finishes, workqueue will identify the * recycled work item as currently executing and make it wait until the * current execution finishes, introducing an unwanted dependency. * * This function checks the work item address and work function to avoid * false positives. Note that this isn't complete as one may construct a * work function which can introduce dependency onto itself through a * recycled work item. Well, if somebody wants to shoot oneself in the * foot that badly, there's only so much we can do, and if such deadlock * actually occurs, it should be easy to locate the culprit work function. * * CONTEXT: * raw_spin_lock_irq(pool->lock). * * Return: * Pointer to worker which is executing @work if found, %NULL * otherwise. */ static struct worker *find_worker_executing_work(struct worker_pool *pool, struct work_struct *work) { struct worker *worker; hash_for_each_possible(pool->busy_hash, worker, hentry, (unsigned long)work) if (worker->current_work == work && worker->current_func == work->func) return worker; return NULL; } static void mayday_cursor_func(struct work_struct *work) { /* should not be processed, only for marking position */ BUG(); } /** * move_linked_works - move linked works to a list * @work: start of series of works to be scheduled * @head: target list to append @work to * @nextp: out parameter for nested worklist walking * * Schedule linked works starting from @work to @head. Work series to be * scheduled starts at @work and includes any consecutive work with * WORK_STRUCT_LINKED set in its predecessor. See assign_work() for details on * @nextp. * * CONTEXT: * raw_spin_lock_irq(pool->lock). */ static void move_linked_works(struct work_struct *work, struct list_head *head, struct work_struct **nextp) { struct work_struct *n; /* * Linked worklist will always end before the end of the list, * use NULL for list head. */ list_for_each_entry_safe_from(work, n, NULL, entry) { list_move_tail(&work->entry, head); if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) break; } /* * If we're already inside safe list traversal and have moved * multiple works to the scheduled queue, the next position * needs to be updated. */ if (nextp) *nextp = n; } /** * assign_work - assign a work item and its linked work items to a worker * @work: work to assign * @worker: worker to assign to * @nextp: out parameter for nested worklist walking * * Assign @work and its linked work items to @worker. 
If @work is already being * executed by another worker in the same pool, it'll be punted there. * * If @nextp is not NULL, it's updated to point to the next work of the last * scheduled work. This allows assign_work() to be nested inside * list_for_each_entry_safe(). * * Returns %true if @work was successfully assigned to @worker. %false if @work * was punted to another worker already executing it. */ static bool assign_work(struct work_struct *work, struct worker *worker, struct work_struct **nextp) { struct worker_pool *pool = worker->pool; struct worker *collision; lockdep_assert_held(&pool->lock); /* The cursor work should not be processed */ if (unlikely(work->func == mayday_cursor_func)) { /* only worker_thread() can possibly take this branch */ WARN_ON_ONCE(worker->rescue_wq); if (nextp) *nextp = list_next_entry(work, entry); list_del_init(&work->entry); return false; } /* * A single work shouldn't be executed concurrently by multiple workers. * __queue_work() ensures that @work doesn't jump to a different pool * while still running in the previous pool. Here, we should ensure that * @work is not executed concurrently by multiple workers from the same * pool. Check whether anyone is already processing the work. If so, * defer the work to the currently executing one. */ collision = find_worker_executing_work(pool, work); if (unlikely(collision)) { move_linked_works(work, &collision->scheduled, nextp); return false; } move_linked_works(work, &worker->scheduled, nextp); return true; } static struct irq_work *bh_pool_irq_work(struct worker_pool *pool) { int high = pool->attrs->nice == HIGHPRI_NICE_LEVEL ? 1 : 0; return &per_cpu(bh_pool_irq_works, pool->cpu)[high]; } static void kick_bh_pool(struct worker_pool *pool) { #ifdef CONFIG_SMP /* see drain_dead_softirq_workfn() for BH_DRAINING */ if (unlikely(pool->cpu != smp_processor_id() && !(pool->flags & POOL_BH_DRAINING))) { irq_work_queue_on(bh_pool_irq_work(pool), pool->cpu); return; } #endif if (pool->attrs->nice == HIGHPRI_NICE_LEVEL) raise_softirq_irqoff(HI_SOFTIRQ); else raise_softirq_irqoff(TASKLET_SOFTIRQ); } /** * kick_pool - wake up an idle worker if necessary * @pool: pool to kick * * @pool may have pending work items. Wake up worker if necessary. Returns * whether a worker was woken up. */ static bool kick_pool(struct worker_pool *pool) { struct worker *worker = first_idle_worker(pool); struct task_struct *p; lockdep_assert_held(&pool->lock); if (!need_more_worker(pool) || !worker) return false; if (pool->flags & POOL_BH) { kick_bh_pool(pool); return true; } p = worker->task; #ifdef CONFIG_SMP /* * Idle @worker is about to execute @work and waking up provides an * opportunity to migrate @worker at a lower cost by setting the task's * wake_cpu field. Let's see if we want to move @worker to improve * execution locality. * * We're waking the worker that went idle the latest and there's some * chance that @worker is marked idle but hasn't gone off CPU yet. If * so, setting the wake_cpu won't do anything. As this is a best-effort * optimization and the race window is narrow, let's leave as-is for * now. If this becomes pronounced, we can skip over workers which are * still on cpu when picking an idle worker. * * If @pool has non-strict affinity, @worker might have ended up outside * its affinity scope. Repatriate. 
*/ if (!pool->attrs->affn_strict && !cpumask_test_cpu(p->wake_cpu, pool->attrs->__pod_cpumask)) { struct work_struct *work = list_first_entry(&pool->worklist, struct work_struct, entry); int wake_cpu = cpumask_any_and_distribute(pool->attrs->__pod_cpumask, cpu_online_mask); if (wake_cpu < nr_cpu_ids) { p->wake_cpu = wake_cpu; get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++; } } #endif wake_up_process(p); return true; } #ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT /* * Concurrency-managed per-cpu work items that hog CPU for longer than * wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism, * which prevents them from stalling other concurrency-managed work items. If a * work function keeps triggering this mechanism, it's likely that the work item * should be using an unbound workqueue instead. * * wq_cpu_intensive_report() tracks work functions which trigger such conditions * and report them so that they can be examined and converted to use unbound * workqueues as appropriate. To avoid flooding the console, each violating work * function is tracked and reported with exponential backoff. */ #define WCI_MAX_ENTS 128 struct wci_ent { work_func_t func; atomic64_t cnt; struct hlist_node hash_node; }; static struct wci_ent wci_ents[WCI_MAX_ENTS]; static int wci_nr_ents; static DEFINE_RAW_SPINLOCK(wci_lock); static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS)); static struct wci_ent *wci_find_ent(work_func_t func) { struct wci_ent *ent; hash_for_each_possible_rcu(wci_hash, ent, hash_node, (unsigned long)func) { if (ent->func == func) return ent; } return NULL; } static void wq_cpu_intensive_report(work_func_t func) { struct wci_ent *ent; restart: ent = wci_find_ent(func); if (ent) { u64 cnt; /* * Start reporting from the warning_thresh and back off * exponentially. */ cnt = atomic64_inc_return_relaxed(&ent->cnt); if (wq_cpu_intensive_warning_thresh && cnt >= wq_cpu_intensive_warning_thresh && is_power_of_2(cnt + 1 - wq_cpu_intensive_warning_thresh)) printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n", ent->func, wq_cpu_intensive_thresh_us, atomic64_read(&ent->cnt)); return; } /* * @func is a new violation. Allocate a new entry for it. If wcn_ents[] * is exhausted, something went really wrong and we probably made enough * noise already. */ if (wci_nr_ents >= WCI_MAX_ENTS) return; raw_spin_lock(&wci_lock); if (wci_nr_ents >= WCI_MAX_ENTS) { raw_spin_unlock(&wci_lock); return; } if (wci_find_ent(func)) { raw_spin_unlock(&wci_lock); goto restart; } ent = &wci_ents[wci_nr_ents++]; ent->func = func; atomic64_set(&ent->cnt, 0); hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func); raw_spin_unlock(&wci_lock); goto restart; } #else /* CONFIG_WQ_CPU_INTENSIVE_REPORT */ static void wq_cpu_intensive_report(work_func_t func) {} #endif /* CONFIG_WQ_CPU_INTENSIVE_REPORT */ /** * wq_worker_running - a worker is running again * @task: task waking up * * This function is called when a worker returns from schedule() */ void wq_worker_running(struct task_struct *task) { struct worker *worker = kthread_data(task); if (!READ_ONCE(worker->sleeping)) return; /* * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check * and the nr_running increment below, we may ruin the nr_running reset * and leave with an unexpected pool->nr_running == 1 on the newly unbound * pool. Protect against such race. 
*/ preempt_disable(); if (!(worker->flags & WORKER_NOT_RUNNING)) worker->pool->nr_running++; preempt_enable(); /* * CPU intensive auto-detection cares about how long a work item hogged * CPU without sleeping. Reset the starting timestamp on wakeup. */ worker->current_at = worker->task->se.sum_exec_runtime; WRITE_ONCE(worker->sleeping, 0); } /** * wq_worker_sleeping - a worker is going to sleep * @task: task going to sleep * * This function is called from schedule() when a busy worker is * going to sleep. */ void wq_worker_sleeping(struct task_struct *task) { struct worker *worker = kthread_data(task); struct worker_pool *pool; /* * Rescuers, which may not have all the fields set up like normal * workers, also reach here, let's not access anything before * checking NOT_RUNNING. */ if (worker->flags & WORKER_NOT_RUNNING) return; pool = worker->pool; /* Return if preempted before wq_worker_running() was reached */ if (READ_ONCE(worker->sleeping)) return; WRITE_ONCE(worker->sleeping, 1); raw_spin_lock_irq(&pool->lock); /* * Recheck in case unbind_workers() preempted us. We don't * want to decrement nr_running after the worker is unbound * and nr_running has been reset. */ if (worker->flags & WORKER_NOT_RUNNING) { raw_spin_unlock_irq(&pool->lock); return; } pool->nr_running--; if (kick_pool(pool)) worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++; raw_spin_unlock_irq(&pool->lock); } /** * wq_worker_tick - a scheduler tick occurred while a kworker is running * @task: task currently running * * Called from sched_tick(). We're in the IRQ context and the current * worker's fields which follow the 'K' locking rule can be accessed safely. */ void wq_worker_tick(struct task_struct *task) { struct worker *worker = kthread_data(task); struct pool_workqueue *pwq = worker->current_pwq; struct worker_pool *pool = worker->pool; if (!pwq) return; pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC; if (!wq_cpu_intensive_thresh_us) return; /* * If the current worker is concurrency managed and hogged the CPU for * longer than wq_cpu_intensive_thresh_us, it's automatically marked * CPU_INTENSIVE to avoid stalling other concurrency-managed work items. * * Set @worker->sleeping means that @worker is in the process of * switching out voluntarily and won't be contributing to * @pool->nr_running until it wakes up. As wq_worker_sleeping() also * decrements ->nr_running, setting CPU_INTENSIVE here can lead to * double decrements. The task is releasing the CPU anyway. Let's skip. * We probably want to make this prettier in the future. */ if ((worker->flags & WORKER_NOT_RUNNING) || READ_ONCE(worker->sleeping) || worker->task->se.sum_exec_runtime - worker->current_at < wq_cpu_intensive_thresh_us * NSEC_PER_USEC) return; raw_spin_lock(&pool->lock); worker_set_flags(worker, WORKER_CPU_INTENSIVE); wq_cpu_intensive_report(worker->current_func); pwq->stats[PWQ_STAT_CPU_INTENSIVE]++; if (kick_pool(pool)) pwq->stats[PWQ_STAT_CM_WAKEUP]++; raw_spin_unlock(&pool->lock); } /** * wq_worker_last_func - retrieve worker's last work function * @task: Task to retrieve last work function of. * * Determine the last function a worker executed. This is called from * the scheduler to get a worker's last known identity. * * CONTEXT: * raw_spin_lock_irq(rq->lock) * * This function is called during schedule() when a kworker is going * to sleep. It's used by psi to identify aggregation workers during * dequeuing, to allow periodic aggregation to shut-off when that * worker is the last task in the system or cgroup to go to sleep. 
* * As this function doesn't involve any workqueue-related locking, it * only returns stable values when called from inside the scheduler's * queuing and dequeuing paths, when @task, which must be a kworker, * is guaranteed to not be processing any works. * * Return: * The last work function %current executed as a worker, NULL if it * hasn't executed any work yet. */ work_func_t wq_worker_last_func(struct task_struct *task) { struct worker *worker = kthread_data(task); return worker->last_func; } /** * wq_node_nr_active - Determine wq_node_nr_active to use * @wq: workqueue of interest * @node: NUMA node, can be %NUMA_NO_NODE * * Determine wq_node_nr_active to use for @wq on @node. Returns: * * - %NULL for per-cpu workqueues as they don't need to use shared nr_active. * * - node_nr_active[nr_node_ids] if @node is %NUMA_NO_NODE. * * - Otherwise, node_nr_active[@node]. */ static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq, int node) { if (!(wq->flags & WQ_UNBOUND)) return NULL; if (node == NUMA_NO_NODE) node = nr_node_ids; return wq->node_nr_active[node]; } /** * wq_update_node_max_active - Update per-node max_actives to use * @wq: workqueue to update * @off_cpu: CPU that's going down, -1 if a CPU is not going down * * Update @wq->node_nr_active[]->max. @wq must be unbound. max_active is * distributed among nodes according to the proportions of numbers of online * cpus. The result is always between @wq->min_active and max_active. */ static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu) { struct cpumask *effective = unbound_effective_cpumask(wq); int min_active = READ_ONCE(wq->min_active); int max_active = READ_ONCE(wq->max_active); int total_cpus, node; lockdep_assert_held(&wq->mutex); if (!wq_topo_initialized) return; if (off_cpu >= 0 && !cpumask_test_cpu(off_cpu, effective)) off_cpu = -1; total_cpus = cpumask_weight_and(effective, cpu_online_mask); if (off_cpu >= 0) total_cpus--; /* If all CPUs of the wq get offline, use the default values */ if (unlikely(!total_cpus)) { for_each_node(node) wq_node_nr_active(wq, node)->max = min_active; wq_node_nr_active(wq, NUMA_NO_NODE)->max = max_active; return; } for_each_node(node) { int node_cpus; node_cpus = cpumask_weight_and(effective, cpumask_of_node(node)); if (off_cpu >= 0 && cpu_to_node(off_cpu) == node) node_cpus--; wq_node_nr_active(wq, node)->max = clamp(DIV_ROUND_UP(max_active * node_cpus, total_cpus), min_active, max_active); } wq_node_nr_active(wq, NUMA_NO_NODE)->max = max_active; } /** * get_pwq - get an extra reference on the specified pool_workqueue * @pwq: pool_workqueue to get * * Obtain an extra reference on @pwq. The caller should guarantee that * @pwq has positive refcnt and be holding the matching pool->lock. */ static void get_pwq(struct pool_workqueue *pwq) { lockdep_assert_held(&pwq->pool->lock); WARN_ON_ONCE(pwq->refcnt <= 0); pwq->refcnt++; } /** * put_pwq - put a pool_workqueue reference * @pwq: pool_workqueue to put * * Drop a reference of @pwq. If its refcnt reaches zero, schedule its * destruction. The caller should be holding the matching pool->lock. */ static void put_pwq(struct pool_workqueue *pwq) { lockdep_assert_held(&pwq->pool->lock); if (likely(--pwq->refcnt)) return; /* * @pwq can't be released under pool->lock, bounce to a dedicated * kthread_worker to avoid A-A deadlocks. 
*/ kthread_queue_work(pwq_release_worker, &pwq->release_work); } /** * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock * @pwq: pool_workqueue to put (can be %NULL) * * put_pwq() with locking. This function also allows %NULL @pwq. */ static void put_pwq_unlocked(struct pool_workqueue *pwq) { if (pwq) { /* * As both pwqs and pools are RCU protected, the * following lock operations are safe. */ raw_spin_lock_irq(&pwq->pool->lock); put_pwq(pwq); raw_spin_unlock_irq(&pwq->pool->lock); } } static bool pwq_is_empty(struct pool_workqueue *pwq) { return !pwq->nr_active && list_empty(&pwq->inactive_works); } static void __pwq_activate_work(struct pool_workqueue *pwq, struct work_struct *work) { unsigned long *wdb = work_data_bits(work); WARN_ON_ONCE(!(*wdb & WORK_STRUCT_INACTIVE)); trace_workqueue_activate_work(work); if (list_empty(&pwq->pool->worklist)) pwq->pool->last_progress_ts = jiffies; move_linked_works(work, &pwq->pool->worklist, NULL); __clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb); } static bool tryinc_node_nr_active(struct wq_node_nr_active *nna) { int max = READ_ONCE(nna->max); int old = atomic_read(&nna->nr); do { if (old >= max) return false; } while (!atomic_try_cmpxchg_relaxed(&nna->nr, &old, old + 1)); return true; } /** * pwq_tryinc_nr_active - Try to increment nr_active for a pwq * @pwq: pool_workqueue of interest * @fill: max_active may have increased, try to increase concurrency level * * Try to increment nr_active for @pwq. Returns %true if an nr_active count is * successfully obtained. %false otherwise. */ static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq, bool fill) { struct workqueue_struct *wq = pwq->wq; struct worker_pool *pool = pwq->pool; struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node); bool obtained = false; lockdep_assert_held(&pool->lock); if (!nna) { /* BH or per-cpu workqueue, pwq->nr_active is sufficient */ obtained = pwq->nr_active < READ_ONCE(wq->max_active); goto out; } if (unlikely(pwq->plugged)) return false; /* * Unbound workqueue uses per-node shared nr_active $nna. If @pwq is * already waiting on $nna, pwq_dec_nr_active() will maintain the * concurrency level. Don't jump the line. * * We need to ignore the pending test after max_active has increased as * pwq_dec_nr_active() can only maintain the concurrency level but not * increase it. This is indicated by @fill. */ if (!list_empty(&pwq->pending_node) && likely(!fill)) goto out; obtained = tryinc_node_nr_active(nna); if (obtained) goto out; /* * Lockless acquisition failed. Lock, add ourself to $nna->pending_pwqs * and try again. The smp_mb() is paired with the implied memory barrier * of atomic_dec_return() in pwq_dec_nr_active() to ensure that either * we see the decremented $nna->nr or they see non-empty * $nna->pending_pwqs. */ raw_spin_lock(&nna->lock); if (list_empty(&pwq->pending_node)) list_add_tail(&pwq->pending_node, &nna->pending_pwqs); else if (likely(!fill)) goto out_unlock; smp_mb(); obtained = tryinc_node_nr_active(nna); /* * If @fill, @pwq might have already been pending. Being spuriously * pending in cold paths doesn't affect anything. Let's leave it be. 
*/ if (obtained && likely(!fill)) list_del_init(&pwq->pending_node); out_unlock: raw_spin_unlock(&nna->lock); out: if (obtained) pwq->nr_active++; return obtained; } /** * pwq_activate_first_inactive - Activate the first inactive work item on a pwq * @pwq: pool_workqueue of interest * @fill: max_active may have increased, try to increase concurrency level * * Activate the first inactive work item of @pwq if available and allowed by * max_active limit. * * Returns %true if an inactive work item has been activated. %false if no * inactive work item is found or max_active limit is reached. */ static bool pwq_activate_first_inactive(struct pool_workqueue *pwq, bool fill) { struct work_struct *work = list_first_entry_or_null(&pwq->inactive_works, struct work_struct, entry); if (work && pwq_tryinc_nr_active(pwq, fill)) { __pwq_activate_work(pwq, work); return true; } else { return false; } } /** * unplug_oldest_pwq - unplug the oldest pool_workqueue * @wq: workqueue_struct where its oldest pwq is to be unplugged * * This function should only be called for ordered workqueues where only the * oldest pwq is unplugged, the others are plugged to suspend execution to * ensure proper work item ordering:: * * dfl_pwq --------------+ [P] - plugged * | * v * pwqs -> A -> B [P] -> C [P] (newest) * | | | * 1 3 5 * | | | * 2 4 6 * * When the oldest pwq is drained and removed, this function should be called * to unplug the next oldest one to start its work item execution. Note that * pwq's are linked into wq->pwqs with the oldest first, so the first one in * the list is the oldest. */ static void unplug_oldest_pwq(struct workqueue_struct *wq) { struct pool_workqueue *pwq; lockdep_assert_held(&wq->mutex); /* Caller should make sure that pwqs isn't empty before calling */ pwq = list_first_entry_or_null(&wq->pwqs, struct pool_workqueue, pwqs_node); raw_spin_lock_irq(&pwq->pool->lock); if (pwq->plugged) { pwq->plugged = false; if (pwq_activate_first_inactive(pwq, true)) { /* * While plugged, queueing skips activation which * includes bumping the nr_active count and adding the * pwq to nna->pending_pwqs if the count can't be * obtained. We need to restore both for the pwq being * unplugged. The first call activates the first * inactive work item and the second, if there are more * inactive, puts the pwq on pending_pwqs. */ pwq_activate_first_inactive(pwq, false); kick_pool(pwq->pool); } } raw_spin_unlock_irq(&pwq->pool->lock); } /** * node_activate_pending_pwq - Activate a pending pwq on a wq_node_nr_active * @nna: wq_node_nr_active to activate a pending pwq for * @caller_pool: worker_pool the caller is locking * * Activate a pwq in @nna->pending_pwqs. Called with @caller_pool locked. * @caller_pool may be unlocked and relocked to lock other worker_pools. */ static void node_activate_pending_pwq(struct wq_node_nr_active *nna, struct worker_pool *caller_pool) { struct worker_pool *locked_pool = caller_pool; struct pool_workqueue *pwq; struct work_struct *work; lockdep_assert_held(&caller_pool->lock); raw_spin_lock(&nna->lock); retry: pwq = list_first_entry_or_null(&nna->pending_pwqs, struct pool_workqueue, pending_node); if (!pwq) goto out_unlock; /* * If @pwq is for a different pool than @locked_pool, we need to lock * @pwq->pool->lock. Let's trylock first. If unsuccessful, do the unlock * / lock dance. For that, we also need to release @nna->lock as it's * nested inside pool locks. 
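 *
 * The trylock-or-back-off below is the usual way to take a lock that
 * nests outside one we already hold without violating lock ordering.
 * In generic form (illustrative sketch only, hypothetical lock names):
 *
 *	if (!raw_spin_trylock(&outer_lock)) {
 *		raw_spin_unlock(&inner_lock);
 *		raw_spin_lock(&outer_lock);
 *		raw_spin_lock(&inner_lock);
 *		goto retry;
 *	}
 *
 * Anything observed before dropping the inner lock must be re-checked
 * after reacquiring it, hence the retry.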
*/ if (pwq->pool != locked_pool) { raw_spin_unlock(&locked_pool->lock); locked_pool = pwq->pool; if (!raw_spin_trylock(&locked_pool->lock)) { raw_spin_unlock(&nna->lock); raw_spin_lock(&locked_pool->lock); raw_spin_lock(&nna->lock); goto retry; } } /* * $pwq may not have any inactive work items due to e.g. cancellations. * Drop it from pending_pwqs and see if there's another one. */ work = list_first_entry_or_null(&pwq->inactive_works, struct work_struct, entry); if (!work) { list_del_init(&pwq->pending_node); goto retry; } /* * Acquire an nr_active count and activate the inactive work item. If * $pwq still has inactive work items, rotate it to the end of the * pending_pwqs so that we round-robin through them. This means that * inactive work items are not activated in queueing order which is fine * given that there has never been any ordering across different pwqs. */ if (likely(tryinc_node_nr_active(nna))) { pwq->nr_active++; __pwq_activate_work(pwq, work); if (list_empty(&pwq->inactive_works)) list_del_init(&pwq->pending_node); else list_move_tail(&pwq->pending_node, &nna->pending_pwqs); /* if activating a foreign pool, make sure it's running */ if (pwq->pool != caller_pool) kick_pool(pwq->pool); } out_unlock: raw_spin_unlock(&nna->lock); if (locked_pool != caller_pool) { raw_spin_unlock(&locked_pool->lock); raw_spin_lock(&caller_pool->lock); } } /** * pwq_dec_nr_active - Retire an active count * @pwq: pool_workqueue of interest * * Decrement @pwq's nr_active and try to activate the first inactive work item. * For unbound workqueues, this function may temporarily drop @pwq->pool->lock. */ static void pwq_dec_nr_active(struct pool_workqueue *pwq) { struct worker_pool *pool = pwq->pool; struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pool->node); lockdep_assert_held(&pool->lock); /* * @pwq->nr_active should be decremented for both percpu and unbound * workqueues. */ pwq->nr_active--; /* * For a percpu workqueue, it's simple. Just need to kick the first * inactive work item on @pwq itself. */ if (!nna) { pwq_activate_first_inactive(pwq, false); return; } /* * If @pwq is for an unbound workqueue, it's more complicated because * multiple pwqs and pools may be sharing the nr_active count. When a * pwq needs to wait for an nr_active count, it puts itself on * $nna->pending_pwqs. The following atomic_dec_return()'s implied * memory barrier is paired with smp_mb() in pwq_tryinc_nr_active() to * guarantee that either we see non-empty pending_pwqs or they see * decremented $nna->nr. * * $nna->max may change as CPUs come online/offline and @pwq->wq's * max_active gets updated. However, it is guaranteed to be equal to or * larger than @pwq->wq->min_active which is above zero unless freezing. * This maintains the forward progress guarantee. */ if (atomic_dec_return(&nna->nr) >= READ_ONCE(nna->max)) return; if (!list_empty(&nna->pending_pwqs)) node_activate_pending_pwq(nna, pool); } /** * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight * @pwq: pwq of interest * @work_data: work_data of work which left the queue * * A work either has completed or is removed from pending queue, * decrement nr_in_flight of its pwq and handle workqueue flushing. * * NOTE: * For unbound workqueues, this function may temporarily drop @pwq->pool->lock * and thus should be called after all other state updates for the in-flight * work item is complete. * * CONTEXT: * raw_spin_lock_irq(pool->lock). 
*/ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_data) { int color = get_work_color(work_data); if (!(work_data & WORK_STRUCT_INACTIVE)) pwq_dec_nr_active(pwq); pwq->nr_in_flight[color]--; /* is flush in progress and are we at the flushing tip? */ if (likely(pwq->flush_color != color)) goto out_put; /* are there still in-flight works? */ if (pwq->nr_in_flight[color]) goto out_put; /* this pwq is done, clear flush_color */ pwq->flush_color = -1; /* * If this was the last pwq, wake up the first flusher. It * will handle the rest. */ if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush)) complete(&pwq->wq->first_flusher->done); out_put: put_pwq(pwq); } /** * try_to_grab_pending - steal work item from worklist and disable irq * @work: work item to steal * @cflags: %WORK_CANCEL_ flags * @irq_flags: place to store irq state * * Try to grab PENDING bit of @work. This function can handle @work in any * stable state - idle, on timer or on worklist. * * Return: * * ======== ================================================================ * 1 if @work was pending and we successfully stole PENDING * 0 if @work was idle and we claimed PENDING * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry * ======== ================================================================ * * Note: * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting * interrupted while holding PENDING and @work off queue, irq must be * disabled on entry. This, combined with delayed_work->timer being * irqsafe, ensures that we return -EAGAIN for finite short period of time. * * On successful return, >= 0, irq is disabled and the caller is * responsible for releasing it using local_irq_restore(*@irq_flags). * * This function is safe to call from any context including IRQ handler. */ static int try_to_grab_pending(struct work_struct *work, u32 cflags, unsigned long *irq_flags) { struct worker_pool *pool; struct pool_workqueue *pwq; local_irq_save(*irq_flags); /* try to steal the timer if it exists */ if (cflags & WORK_CANCEL_DELAYED) { struct delayed_work *dwork = to_delayed_work(work); /* * dwork->timer is irqsafe. If timer_delete() fails, it's * guaranteed that the timer is not queued anywhere and not * running on the local CPU. */ if (likely(timer_delete(&dwork->timer))) return 1; } /* try to claim PENDING the normal way */ if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) return 0; rcu_read_lock(); /* * The queueing is in progress, or it is already queued. Try to * steal it from ->worklist without clearing WORK_STRUCT_PENDING. */ pool = get_work_pool(work); if (!pool) goto fail; raw_spin_lock(&pool->lock); /* * work->data is guaranteed to point to pwq only while the work * item is queued on pwq->wq, and both updating work->data to point * to pwq on queueing and to pool on dequeueing are done under * pwq->pool->lock. This in turn guarantees that, if work->data * points to pwq which is associated with a locked pool, the work * item is currently queued on that pool. */ pwq = get_work_pwq(work); if (pwq && pwq->pool == pool) { unsigned long work_data = *work_data_bits(work); debug_work_deactivate(work); /* * A cancelable inactive work item must be in the * pwq->inactive_works since a queued barrier can't be * canceled (see the comments in insert_wq_barrier()). 
* * An inactive work item cannot be deleted directly because * it might have linked barrier work items which, if left * on the inactive_works list, will confuse pwq->nr_active * management later on and cause stall. Move the linked * barrier work items to the worklist when deleting the grabbed * item. Also keep WORK_STRUCT_INACTIVE in work_data, so that * it doesn't participate in nr_active management in later * pwq_dec_nr_in_flight(). */ if (work_data & WORK_STRUCT_INACTIVE) move_linked_works(work, &pwq->pool->worklist, NULL); list_del_init(&work->entry); /* * work->data points to pwq iff queued. Let's point to pool. As * this destroys work->data needed by the next step, stash it. */ set_work_pool_and_keep_pending(work, pool->id, pool_offq_flags(pool)); /* must be the last step, see the function comment */ pwq_dec_nr_in_flight(pwq, work_data); raw_spin_unlock(&pool->lock); rcu_read_unlock(); return 1; } raw_spin_unlock(&pool->lock); fail: rcu_read_unlock(); local_irq_restore(*irq_flags); return -EAGAIN; } /** * work_grab_pending - steal work item from worklist and disable irq * @work: work item to steal * @cflags: %WORK_CANCEL_ flags * @irq_flags: place to store IRQ state * * Grab PENDING bit of @work. @work can be in any stable state - idle, on timer * or on worklist. * * Can be called from any context. IRQ is disabled on return with IRQ state * stored in *@irq_flags. The caller is responsible for re-enabling it using * local_irq_restore(). * * Returns %true if @work was pending. %false if idle. */ static bool work_grab_pending(struct work_struct *work, u32 cflags, unsigned long *irq_flags) { int ret; while (true) { ret = try_to_grab_pending(work, cflags, irq_flags); if (ret >= 0) return ret; cpu_relax(); } } /** * insert_work - insert a work into a pool * @pwq: pwq @work belongs to * @work: work to insert * @head: insertion point * @extra_flags: extra WORK_STRUCT_* flags to set * * Insert @work which belongs to @pwq after @head. @extra_flags is or'd to * work_struct flags. * * CONTEXT: * raw_spin_lock_irq(pool->lock). */ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, struct list_head *head, unsigned int extra_flags) { debug_work_activate(work); /* record the work call stack in order to print it in KASAN reports */ kasan_record_aux_stack(work); /* we own @work, set data and link */ set_work_pwq(work, pwq, extra_flags); list_add_tail(&work->entry, head); get_pwq(pwq); } /* * Test whether @work is being queued from another work executing on the * same workqueue. */ static bool is_chained_work(struct workqueue_struct *wq) { struct worker *worker; worker = current_wq_worker(); /* * Return %true iff I'm a worker executing a work item on @wq. If * I'm @worker, it's safe to dereference it without locking. */ return worker && worker->current_pwq->wq == wq; } /* * When queueing an unbound work item to a wq, prefer local CPU if allowed * by wq_unbound_cpumask. Otherwise, round robin among the allowed ones to * avoid perturbing sensitive tasks. 
*/ static int wq_select_unbound_cpu(int cpu) { int new_cpu; if (likely(!wq_debug_force_rr_cpu)) { if (cpumask_test_cpu(cpu, wq_unbound_cpumask)) return cpu; } else { pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n"); } new_cpu = __this_cpu_read(wq_rr_cpu_last); new_cpu = cpumask_next_and_wrap(new_cpu, wq_unbound_cpumask, cpu_online_mask); if (unlikely(new_cpu >= nr_cpu_ids)) return cpu; __this_cpu_write(wq_rr_cpu_last, new_cpu); return new_cpu; } static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work) { struct pool_workqueue *pwq; struct worker_pool *last_pool, *pool; unsigned int work_flags; unsigned int req_cpu = cpu; /* * While a work item is PENDING && off queue, a task trying to * steal the PENDING will busy-loop waiting for it to either get * queued or lose PENDING. Grabbing PENDING and queueing should * happen with IRQ disabled. */ lockdep_assert_irqs_disabled(); /* * For a draining wq, only works from the same workqueue are * allowed. The __WQ_DESTROYING helps to spot the issue that * queues a new work item to a wq after destroy_workqueue(wq). */ if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) && WARN_ONCE(!is_chained_work(wq), "workqueue: cannot queue %ps on wq %s\n", work->func, wq->name))) { return; } rcu_read_lock(); retry: /* pwq which will be used unless @work is executing elsewhere */ if (req_cpu == WORK_CPU_UNBOUND) { if (wq->flags & WQ_UNBOUND) cpu = wq_select_unbound_cpu(raw_smp_processor_id()); else cpu = raw_smp_processor_id(); } pwq = rcu_dereference(*per_cpu_ptr(wq->cpu_pwq, cpu)); pool = pwq->pool; /* * If @work was previously on a different pool, it might still be * running there, in which case the work needs to be queued on that * pool to guarantee non-reentrancy. * * For ordered workqueue, work items must be queued on the newest pwq * for accurate order management. Guaranteed order also guarantees * non-reentrancy. See the comments above unplug_oldest_pwq(). */ last_pool = get_work_pool(work); if (last_pool && last_pool != pool && !(wq->flags & __WQ_ORDERED)) { struct worker *worker; raw_spin_lock(&last_pool->lock); worker = find_worker_executing_work(last_pool, work); if (worker && worker->current_pwq->wq == wq) { pwq = worker->current_pwq; pool = pwq->pool; WARN_ON_ONCE(pool != last_pool); } else { /* meh... not running there, queue here */ raw_spin_unlock(&last_pool->lock); raw_spin_lock(&pool->lock); } } else { raw_spin_lock(&pool->lock); } /* * pwq is determined and locked. For unbound pools, we could have raced * with pwq release and it could already be dead. If its refcnt is zero, * repeat pwq selection. Note that unbound pwqs never die without * another pwq replacing it in cpu_pwq or while work items are executing * on it, so the retrying is guaranteed to make forward-progress. */ if (unlikely(!pwq->refcnt)) { if (wq->flags & WQ_UNBOUND) { raw_spin_unlock(&pool->lock); cpu_relax(); goto retry; } /* oops */ WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt", wq->name, cpu); } /* pwq determined, queue */ trace_workqueue_queue_work(req_cpu, pwq, work); if (WARN_ON(!list_empty(&work->entry))) goto out; pwq->nr_in_flight[pwq->work_color]++; work_flags = work_color_to_flags(pwq->work_color); /* * Limit the number of concurrently active work items to max_active. * @work must also queue behind existing inactive work items to maintain * ordering when max_active changes. See wq_adjust_max_active(). 
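 *
 * max_active is the cap the workqueue user picked at creation time. For
 * example, a workqueue created with (illustrative call, not from this
 * file):
 *
 *	wq = alloc_workqueue("my_wq", WQ_UNBOUND, 4);
 *
 * will have at most 4 work items executing concurrently; further items
 * queued here land on pwq->inactive_works until an active slot is
 * released by pwq_dec_nr_active().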
*/ if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) { if (list_empty(&pool->worklist)) pool->last_progress_ts = jiffies; trace_workqueue_activate_work(work); insert_work(pwq, work, &pool->worklist, work_flags); kick_pool(pool); } else { work_flags |= WORK_STRUCT_INACTIVE; insert_work(pwq, work, &pwq->inactive_works, work_flags); } out: raw_spin_unlock(&pool->lock); rcu_read_unlock(); } static bool clear_pending_if_disabled(struct work_struct *work) { unsigned long data = *work_data_bits(work); struct work_offq_data offqd; if (likely((data & WORK_STRUCT_PWQ) || !(data & WORK_OFFQ_DISABLE_MASK))) return false; work_offqd_unpack(&offqd, data); set_work_pool_and_clear_pending(work, offqd.pool_id, work_offqd_pack_flags(&offqd)); return true; } /** * queue_work_on - queue work on specific cpu * @cpu: CPU number to execute work on * @wq: workqueue to use * @work: work to queue * * We queue the work to a specific CPU, the caller must ensure it * can't go away. Callers that fail to ensure that the specified * CPU cannot go away will execute on a randomly chosen CPU. * But note well that callers specifying a CPU that never has been * online will get a splat. * * Return: %false if @work was already on a queue, %true otherwise. */ bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) { bool ret = false; unsigned long irq_flags; local_irq_save(irq_flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && !clear_pending_if_disabled(work)) { __queue_work(cpu, wq, work); ret = true; } local_irq_restore(irq_flags); return ret; } EXPORT_SYMBOL(queue_work_on); /** * select_numa_node_cpu - Select a CPU based on NUMA node * @node: NUMA node ID that we want to select a CPU from * * This function will attempt to find a "random" cpu available on a given * node. If there are no CPUs available on the given node it will return * WORK_CPU_UNBOUND indicating that we should just schedule to any * available CPU if we need to schedule this work. */ static int select_numa_node_cpu(int node) { int cpu; /* Delay binding to CPU if node is not valid or online */ if (node < 0 || node >= MAX_NUMNODES || !node_online(node)) return WORK_CPU_UNBOUND; /* Use local node/cpu if we are already there */ cpu = raw_smp_processor_id(); if (node == cpu_to_node(cpu)) return cpu; /* Use "random" otherwise know as "first" online CPU of node */ cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask); /* If CPU is valid return that, otherwise just defer */ return cpu < nr_cpu_ids ? cpu : WORK_CPU_UNBOUND; } /** * queue_work_node - queue work on a "random" cpu for a given NUMA node * @node: NUMA node that we are targeting the work for * @wq: workqueue to use * @work: work to queue * * We queue the work to a "random" CPU within a given NUMA node. The basic * idea here is to provide a way to somehow associate work with a given * NUMA node. * * This function will only make a best effort attempt at getting this onto * the right NUMA node. If no node is requested or the requested node is * offline then we just fall back to standard queue_work behavior. * * Currently the "random" CPU ends up being the first available CPU in the * intersection of cpu_online_mask and the cpumask of the node, unless we * are running on the node. In that case we just use the current CPU. * * Return: %false if @work was already on a queue, %true otherwise. 
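 *
 * Illustrative usage (hypothetical driver code, not from this file):
 *
 *	static void my_workfn(struct work_struct *work);
 *	static DECLARE_WORK(my_work, my_workfn);
 *
 *	queue_work_node(dev_to_node(&pdev->dev), system_dfl_wq, &my_work);
 *
 * which tries to run my_workfn() on a CPU local to the device's NUMA
 * node and otherwise falls back to the normal queue_work() placement.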
*/ bool queue_work_node(int node, struct workqueue_struct *wq, struct work_struct *work) { unsigned long irq_flags; bool ret = false; /* * This current implementation is specific to unbound workqueues. * Specifically we only return the first available CPU for a given * node instead of cycling through individual CPUs within the node. * * If this is used with a per-cpu workqueue then the logic in * workqueue_select_cpu_near would need to be updated to allow for * some round robin type logic. */ WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)); local_irq_save(irq_flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && !clear_pending_if_disabled(work)) { int cpu = select_numa_node_cpu(node); __queue_work(cpu, wq, work); ret = true; } local_irq_restore(irq_flags); return ret; } EXPORT_SYMBOL_GPL(queue_work_node); void delayed_work_timer_fn(struct timer_list *t) { struct delayed_work *dwork = timer_container_of(dwork, t, timer); /* should have been called from irqsafe timer with irq already off */ __queue_work(dwork->cpu, dwork->wq, &dwork->work); } EXPORT_SYMBOL(delayed_work_timer_fn); static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; WARN_ON_ONCE(timer->function != delayed_work_timer_fn); WARN_ON_ONCE(timer_pending(timer)); WARN_ON_ONCE(!list_empty(&work->entry)); /* * If @delay is 0, queue @dwork->work immediately. This is for * both optimization and correctness. The earliest @timer can * expire is on the closest next tick and delayed_work users depend * on that there's no such delay when @delay is 0. */ if (!delay) { __queue_work(cpu, wq, &dwork->work); return; } WARN_ON_ONCE(cpu != WORK_CPU_UNBOUND && !cpu_online(cpu)); dwork->wq = wq; dwork->cpu = cpu; timer->expires = jiffies + delay; if (housekeeping_enabled(HK_TYPE_TIMER)) { /* If the current cpu is a housekeeping cpu, use it. */ cpu = smp_processor_id(); if (!housekeeping_test_cpu(cpu, HK_TYPE_TIMER)) cpu = housekeeping_any_cpu(HK_TYPE_TIMER); add_timer_on(timer, cpu); } else { if (likely(cpu == WORK_CPU_UNBOUND)) add_timer_global(timer); else add_timer_on(timer, cpu); } } /** * queue_delayed_work_on - queue work on specific CPU after delay * @cpu: CPU number to execute work on * @wq: workqueue to use * @dwork: work to queue * @delay: number of jiffies to wait before queueing * * We queue the delayed_work to a specific CPU, for non-zero delays the * caller must ensure it is online and can't go away. Callers that fail * to ensure this, may get @dwork->timer queued to an offlined CPU and * this will prevent queueing of @dwork->work unless the offlined CPU * becomes online again. * * Return: %false if @work was already on a queue, %true otherwise. If * @delay is zero and @dwork is idle, it will be scheduled for immediate * execution. 
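 *
 * Illustrative usage (hypothetical names, not from this file):
 *
 *	static DECLARE_DELAYED_WORK(my_dwork, my_workfn);
 *
 *	queue_delayed_work_on(cpu, system_wq, &my_dwork,
 *			      msecs_to_jiffies(100));
 *
 * queues my_workfn() for execution on @cpu roughly 100ms from now,
 * provided my_dwork wasn't already pending.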
*/ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { struct work_struct *work = &dwork->work; bool ret = false; unsigned long irq_flags; /* read the comment in __queue_work() */ local_irq_save(irq_flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && !clear_pending_if_disabled(work)) { __queue_delayed_work(cpu, wq, dwork, delay); ret = true; } local_irq_restore(irq_flags); return ret; } EXPORT_SYMBOL(queue_delayed_work_on); /** * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU * @cpu: CPU number to execute work on * @wq: workqueue to use * @dwork: work to queue * @delay: number of jiffies to wait before queueing * * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise, * modify @dwork's timer so that it expires after @delay. If @delay is * zero, @work is guaranteed to be scheduled immediately regardless of its * current state. * * Return: %false if @dwork was idle and queued, %true if @dwork was * pending and its timer was modified. * * This function is safe to call from any context including IRQ handler. * See try_to_grab_pending() for details. */ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { unsigned long irq_flags; bool ret; ret = work_grab_pending(&dwork->work, WORK_CANCEL_DELAYED, &irq_flags); if (!clear_pending_if_disabled(&dwork->work)) __queue_delayed_work(cpu, wq, dwork, delay); local_irq_restore(irq_flags); return ret; } EXPORT_SYMBOL_GPL(mod_delayed_work_on); static void rcu_work_rcufn(struct rcu_head *rcu) { struct rcu_work *rwork = container_of(rcu, struct rcu_work, rcu); /* read the comment in __queue_work() */ local_irq_disable(); __queue_work(WORK_CPU_UNBOUND, rwork->wq, &rwork->work); local_irq_enable(); } /** * queue_rcu_work - queue work after a RCU grace period * @wq: workqueue to use * @rwork: work to queue * * Return: %false if @rwork was already pending, %true otherwise. Note * that a full RCU grace period is guaranteed only after a %true return. * While @rwork is guaranteed to be executed after a %false return, the * execution may happen before a full RCU grace period has passed. */ bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork) { struct work_struct *work = &rwork->work; /* * rcu_work can't be canceled or disabled. Warn if the user reached * inside @rwork and disabled the inner work. */ if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && !WARN_ON_ONCE(clear_pending_if_disabled(work))) { rwork->wq = wq; call_rcu_hurry(&rwork->rcu, rcu_work_rcufn); return true; } return false; } EXPORT_SYMBOL(queue_rcu_work); static struct worker *alloc_worker(int node) { struct worker *worker; worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node); if (worker) { INIT_LIST_HEAD(&worker->entry); INIT_LIST_HEAD(&worker->scheduled); INIT_LIST_HEAD(&worker->node); /* on creation a worker is in !idle && prep state */ worker->flags = WORKER_PREP; } return worker; } static cpumask_t *pool_allowed_cpus(struct worker_pool *pool) { if (pool->cpu < 0 && pool->attrs->affn_strict) return pool->attrs->__pod_cpumask; else return pool->attrs->cpumask; } /** * worker_attach_to_pool() - attach a worker to a pool * @worker: worker to be attached * @pool: the target pool * * Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and * cpu-binding of @worker are kept coordinated with the pool across * cpu-[un]hotplugs. 
*/ static void worker_attach_to_pool(struct worker *worker, struct worker_pool *pool) { mutex_lock(&wq_pool_attach_mutex); /* * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains stable * across this function. See the comments above the flag definition for * details. BH workers are, while per-CPU, always DISASSOCIATED. */ if (pool->flags & POOL_DISASSOCIATED) { worker->flags |= WORKER_UNBOUND; } else { WARN_ON_ONCE(pool->flags & POOL_BH); kthread_set_per_cpu(worker->task, pool->cpu); } if (worker->rescue_wq) set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool)); list_add_tail(&worker->node, &pool->workers); worker->pool = pool; mutex_unlock(&wq_pool_attach_mutex); } static void unbind_worker(struct worker *worker) { lockdep_assert_held(&wq_pool_attach_mutex); kthread_set_per_cpu(worker->task, -1); if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0); else WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); } static void detach_worker(struct worker *worker) { lockdep_assert_held(&wq_pool_attach_mutex); unbind_worker(worker); list_del(&worker->node); } /** * worker_detach_from_pool() - detach a worker from its pool * @worker: worker which is attached to its pool * * Undo the attaching which had been done in worker_attach_to_pool(). The * caller worker shouldn't access to the pool after detached except it has * other reference to the pool. */ static void worker_detach_from_pool(struct worker *worker) { struct worker_pool *pool = worker->pool; /* there is one permanent BH worker per CPU which should never detach */ WARN_ON_ONCE(pool->flags & POOL_BH); mutex_lock(&wq_pool_attach_mutex); detach_worker(worker); worker->pool = NULL; mutex_unlock(&wq_pool_attach_mutex); /* clear leftover flags without pool->lock after it is detached */ worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND); } static int format_worker_id(char *buf, size_t size, struct worker *worker, struct worker_pool *pool) { if (worker->rescue_wq) return scnprintf(buf, size, "kworker/R-%s", worker->rescue_wq->name); if (pool) { if (pool->cpu >= 0) return scnprintf(buf, size, "kworker/%d:%d%s", pool->cpu, worker->id, pool->attrs->nice < 0 ? "H" : ""); else return scnprintf(buf, size, "kworker/u%d:%d", pool->id, worker->id); } else { return scnprintf(buf, size, "kworker/dying"); } } /** * create_worker - create a new workqueue worker * @pool: pool the new worker will belong to * * Create and start a new worker which is attached to @pool. * * CONTEXT: * Might sleep. Does GFP_KERNEL allocations. * * Return: * Pointer to the newly created worker. 
*/ static struct worker *create_worker(struct worker_pool *pool) { struct worker *worker; int id; /* ID is needed to determine kthread name */ id = ida_alloc(&pool->worker_ida, GFP_KERNEL); if (id < 0) { pr_err_once("workqueue: Failed to allocate a worker ID: %pe\n", ERR_PTR(id)); return NULL; } worker = alloc_worker(pool->node); if (!worker) { pr_err_once("workqueue: Failed to allocate a worker\n"); goto fail; } worker->id = id; if (!(pool->flags & POOL_BH)) { char id_buf[WORKER_ID_LEN]; format_worker_id(id_buf, sizeof(id_buf), worker, pool); worker->task = kthread_create_on_node(worker_thread, worker, pool->node, "%s", id_buf); if (IS_ERR(worker->task)) { if (PTR_ERR(worker->task) == -EINTR) { pr_err("workqueue: Interrupted when creating a worker thread \"%s\"\n", id_buf); } else { pr_err_once("workqueue: Failed to create a worker thread: %pe", worker->task); } goto fail; } set_user_nice(worker->task, pool->attrs->nice); kthread_bind_mask(worker->task, pool_allowed_cpus(pool)); } /* successful, attach the worker to the pool */ worker_attach_to_pool(worker, pool); /* start the newly created worker */ raw_spin_lock_irq(&pool->lock); worker->pool->nr_workers++; worker_enter_idle(worker); /* * @worker is waiting on a completion in kthread() and will trigger hung * check if not woken up soon. As kick_pool() is noop if @pool is empty, * wake it up explicitly. */ if (worker->task) wake_up_process(worker->task); raw_spin_unlock_irq(&pool->lock); return worker; fail: ida_free(&pool->worker_ida, id); kfree(worker); return NULL; } static void detach_dying_workers(struct list_head *cull_list) { struct worker *worker; list_for_each_entry(worker, cull_list, entry) detach_worker(worker); } static void reap_dying_workers(struct list_head *cull_list) { struct worker *worker, *tmp; list_for_each_entry_safe(worker, tmp, cull_list, entry) { list_del_init(&worker->entry); kthread_stop_put(worker->task); kfree(worker); } } /** * set_worker_dying - Tag a worker for destruction * @worker: worker to be destroyed * @list: transfer worker away from its pool->idle_list and into list * * Tag @worker for destruction and adjust @pool stats accordingly. The worker * should be idle. * * CONTEXT: * raw_spin_lock_irq(pool->lock). */ static void set_worker_dying(struct worker *worker, struct list_head *list) { struct worker_pool *pool = worker->pool; lockdep_assert_held(&pool->lock); lockdep_assert_held(&wq_pool_attach_mutex); /* sanity check frenzy */ if (WARN_ON(worker->current_work) || WARN_ON(!list_empty(&worker->scheduled)) || WARN_ON(!(worker->flags & WORKER_IDLE))) return; pool->nr_workers--; pool->nr_idle--; worker->flags |= WORKER_DIE; list_move(&worker->entry, list); /* get an extra task struct reference for later kthread_stop_put() */ get_task_struct(worker->task); } /** * idle_worker_timeout - check if some idle workers can now be deleted. * @t: The pool's idle_timer that just expired * * The timer is armed in worker_enter_idle(). Note that it isn't disarmed in * worker_leave_idle(), as a worker flicking between idle and active while its * pool is at the too_many_workers() tipping point would cause too much timer * housekeeping overhead. Since IDLE_WORKER_TIMEOUT is long enough, we just let * it expire and re-evaluate things from there. 
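 *
 * The timer callback itself only checks the condition and punts the
 * actual culling to idle_cull_work; see idle_cull_fn() below for why a
 * sleepable context is needed. The shape of that split, in generic form
 * (illustrative sketch, hypothetical names):
 *
 *	static void my_timer_fn(struct timer_list *t)
 *	{
 *		if (should_cull())
 *			queue_work(system_dfl_wq, &my_cull_work);
 *	}
 *
 *	static void my_cull_fn(struct work_struct *work)
 *	{
 *		mutex_lock(&my_mutex);
 *		...
 *		mutex_unlock(&my_mutex);
 *	}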
*/ static void idle_worker_timeout(struct timer_list *t) { struct worker_pool *pool = timer_container_of(pool, t, idle_timer); bool do_cull = false; if (work_pending(&pool->idle_cull_work)) return; raw_spin_lock_irq(&pool->lock); if (too_many_workers(pool)) { struct worker *worker; unsigned long expires; /* idle_list is kept in LIFO order, check the last one */ worker = list_last_entry(&pool->idle_list, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; do_cull = !time_before(jiffies, expires); if (!do_cull) mod_timer(&pool->idle_timer, expires); } raw_spin_unlock_irq(&pool->lock); if (do_cull) queue_work(system_dfl_wq, &pool->idle_cull_work); } /** * idle_cull_fn - cull workers that have been idle for too long. * @work: the pool's work for handling these idle workers * * This goes through a pool's idle workers and gets rid of those that have been * idle for at least IDLE_WORKER_TIMEOUT seconds. * * We don't want to disturb isolated CPUs because of a pcpu kworker being * culled, so this also resets worker affinity. This requires a sleepable * context, hence the split between timer callback and work item. */ static void idle_cull_fn(struct work_struct *work) { struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work); LIST_HEAD(cull_list); /* * Grabbing wq_pool_attach_mutex here ensures an already-running worker * cannot proceed beyong set_pf_worker() in its self-destruct path. * This is required as a previously-preempted worker could run after * set_worker_dying() has happened but before detach_dying_workers() did. */ mutex_lock(&wq_pool_attach_mutex); raw_spin_lock_irq(&pool->lock); while (too_many_workers(pool)) { struct worker *worker; unsigned long expires; worker = list_last_entry(&pool->idle_list, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; if (time_before(jiffies, expires)) { mod_timer(&pool->idle_timer, expires); break; } set_worker_dying(worker, &cull_list); } raw_spin_unlock_irq(&pool->lock); detach_dying_workers(&cull_list); mutex_unlock(&wq_pool_attach_mutex); reap_dying_workers(&cull_list); } static void send_mayday(struct pool_workqueue *pwq) { struct workqueue_struct *wq = pwq->wq; lockdep_assert_held(&wq_mayday_lock); if (!wq->rescuer) return; /* mayday mayday mayday */ if (list_empty(&pwq->mayday_node)) { /* * If @pwq is for an unbound wq, its base ref may be put at * any time due to an attribute change. Pin @pwq until the * rescuer is done with it. */ get_pwq(pwq); list_add_tail(&pwq->mayday_node, &wq->maydays); wake_up_process(wq->rescuer->task); pwq->stats[PWQ_STAT_MAYDAY]++; } } static void pool_mayday_timeout(struct timer_list *t) { struct worker_pool *pool = timer_container_of(pool, t, mayday_timer); struct work_struct *work; raw_spin_lock_irq(&pool->lock); raw_spin_lock(&wq_mayday_lock); /* for wq->maydays */ if (need_to_create_worker(pool)) { /* * We've been trying to create a new worker but * haven't been successful. We might be hitting an * allocation deadlock. Send distress signals to * rescuers. */ list_for_each_entry(work, &pool->worklist, entry) send_mayday(get_work_pwq(work)); } raw_spin_unlock(&wq_mayday_lock); raw_spin_unlock_irq(&pool->lock); mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); } /** * maybe_create_worker - create a new worker if necessary * @pool: pool to create a new worker for * * Create a new worker for @pool if necessary. @pool is guaranteed to * have at least one idle worker on return from this function. 
If * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is * sent to all rescuers with works scheduled on @pool to resolve * possible allocation deadlock. * * On return, need_to_create_worker() is guaranteed to be %false and * may_start_working() %true. * * LOCKING: * raw_spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. Called only from * manager. */ static void maybe_create_worker(struct worker_pool *pool) __releases(&pool->lock) __acquires(&pool->lock) { restart: raw_spin_unlock_irq(&pool->lock); /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); while (true) { if (create_worker(pool) || !need_to_create_worker(pool)) break; schedule_timeout_interruptible(CREATE_COOLDOWN); if (!need_to_create_worker(pool)) break; } timer_delete_sync(&pool->mayday_timer); raw_spin_lock_irq(&pool->lock); /* * This is necessary even after a new worker was just successfully * created as @pool->lock was dropped and the new worker might have * already become busy. */ if (need_to_create_worker(pool)) goto restart; } #ifdef CONFIG_PREEMPT_RT static void worker_lock_callback(struct worker_pool *pool) { spin_lock(&pool->cb_lock); } static void worker_unlock_callback(struct worker_pool *pool) { spin_unlock(&pool->cb_lock); } static void workqueue_callback_cancel_wait_running(struct worker_pool *pool) { spin_lock(&pool->cb_lock); spin_unlock(&pool->cb_lock); } #else static void worker_lock_callback(struct worker_pool *pool) { } static void worker_unlock_callback(struct worker_pool *pool) { } static void workqueue_callback_cancel_wait_running(struct worker_pool *pool) { } #endif /** * manage_workers - manage worker pool * @worker: self * * Assume the manager role and manage the worker pool @worker belongs * to. At any given time, there can be only zero or one manager per * pool. The exclusion is handled automatically by this function. * * The caller can safely start processing works on false return. On * true return, it's guaranteed that need_to_create_worker() is false * and may_start_working() is true. * * CONTEXT: * raw_spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. * * Return: * %false if the pool doesn't need management and the caller can safely * start processing works, %true if management function was performed and * the conditions that the caller verified before calling the function may * no longer be true. */ static bool manage_workers(struct worker *worker) { struct worker_pool *pool = worker->pool; if (pool->flags & POOL_MANAGER_ACTIVE) return false; pool->flags |= POOL_MANAGER_ACTIVE; pool->manager = worker; maybe_create_worker(pool); pool->manager = NULL; pool->flags &= ~POOL_MANAGER_ACTIVE; rcuwait_wake_up(&manager_wait); return true; } /** * process_one_work - process single work * @worker: self * @work: work to process * * Process @work. This function contains all the logics necessary to * process a single work including synchronization against and * interaction with other workers on the same cpu, queueing and * flushing. As long as context requirement is met, any worker can * call this function to process a work. * * CONTEXT: * raw_spin_lock_irq(pool->lock) which is released and regrabbed. 
*/ static void process_one_work(struct worker *worker, struct work_struct *work) __releases(&pool->lock) __acquires(&pool->lock) { struct pool_workqueue *pwq = get_work_pwq(work); struct worker_pool *pool = worker->pool; unsigned long work_data; int lockdep_start_depth, rcu_start_depth; bool bh_draining = pool->flags & POOL_BH_DRAINING; #ifdef CONFIG_LOCKDEP /* * It is permissible to free the struct work_struct from * inside the function that is called from it, this we need to * take into account for lockdep too. To avoid bogus "held * lock freed" warnings as well as problems when looking into * work->lockdep_map, make a copy and use that here. */ struct lockdep_map lockdep_map; lockdep_copy_map(&lockdep_map, &work->lockdep_map); #endif /* ensure we're on the correct CPU */ WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) && raw_smp_processor_id() != pool->cpu); /* claim and dequeue */ debug_work_deactivate(work); hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work); worker->current_work = work; worker->current_func = work->func; worker->current_pwq = pwq; if (worker->task) worker->current_at = worker->task->se.sum_exec_runtime; worker->current_start = jiffies; work_data = *work_data_bits(work); worker->current_color = get_work_color(work_data); /* * Record wq name for cmdline and debug reporting, may get * overridden through set_worker_desc(). */ strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN); list_del_init(&work->entry); /* * CPU intensive works don't participate in concurrency management. * They're the scheduler's responsibility. This takes @worker out * of concurrency management and the next code block will chain * execution of the pending work items. */ if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE)) worker_set_flags(worker, WORKER_CPU_INTENSIVE); /* * Kick @pool if necessary. It's always noop for per-cpu worker pools * since nr_running would always be >= 1 at this point. This is used to * chain execution of the pending work items for WORKER_NOT_RUNNING * workers such as the UNBOUND and CPU_INTENSIVE ones. */ kick_pool(pool); /* * Record the last pool and clear PENDING which should be the last * update to @work. Also, do this inside @pool->lock so that * PENDING and queued state changes happen together while IRQ is * disabled. */ set_work_pool_and_clear_pending(work, pool->id, pool_offq_flags(pool)); pwq->stats[PWQ_STAT_STARTED]++; raw_spin_unlock_irq(&pool->lock); rcu_start_depth = rcu_preempt_depth(); lockdep_start_depth = lockdep_depth(current); /* see drain_dead_softirq_workfn() */ if (!bh_draining) lock_map_acquire(pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); /* * Strictly speaking we should mark the invariant state without holding * any locks, that is, before these two lock_map_acquire()'s. * * However, that would result in: * * A(W1) * WFC(C) * A(W1) * C(C) * * Which would create W1->C->W1 dependencies, even though there is no * actual deadlock possible. There are two solutions, using a * read-recursive acquire on the work(queue) 'locks', but this will then * hit the lockdep limitation on recursive locks, or simply discard * these locks. * * AFAICT there is no possible deadlock scenario between the * flush_work() and complete() primitives (except for single-threaded * workqueues), so hiding them isn't a problem. */ lockdep_invariant_state(true); trace_workqueue_execute_start(work); worker->current_func(work); /* * While we must be careful to not use "work" after this, the trace * point will only record its address. 
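 *
 * A work item is allowed to free the structure containing its own
 * work_struct from within its callback, which is why @work must not be
 * dereferenced once current_func() has returned. Illustrative example of
 * such a self-freeing work item (hypothetical names, not from this file):
 *
 *	static void my_release_workfn(struct work_struct *work)
 *	{
 *		struct my_obj *obj = container_of(work, struct my_obj, work);
 *
 *		my_obj_cleanup(obj);
 *		kfree(obj);
 *	}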
*/ trace_workqueue_execute_end(work, worker->current_func); lock_map_release(&lockdep_map); if (!bh_draining) lock_map_release(pwq->wq->lockdep_map); if (unlikely((worker->task && in_atomic()) || lockdep_depth(current) != lockdep_start_depth || rcu_preempt_depth() != rcu_start_depth)) { pr_err("BUG: workqueue leaked atomic, lock or RCU: %s[%d]\n" " preempt=0x%08x lock=%d->%d RCU=%d->%d workfn=%ps\n", current->comm, task_pid_nr(current), preempt_count(), lockdep_start_depth, lockdep_depth(current), rcu_start_depth, rcu_preempt_depth(), worker->current_func); debug_show_held_locks(current); dump_stack(); } /* * The following prevents a kworker from hogging CPU on !PREEMPTION * kernels, where a requeueing work item waiting for something to * happen could deadlock with stop_machine as such work item could * indefinitely requeue itself while all other CPUs are trapped in * stop_machine. At the same time, report a quiescent RCU state so * the same condition doesn't freeze RCU. */ if (worker->task) cond_resched(); raw_spin_lock_irq(&pool->lock); pwq->stats[PWQ_STAT_COMPLETED]++; /* * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked * CPU intensive by wq_worker_tick() if @work hogged CPU longer than * wq_cpu_intensive_thresh_us. Clear it. */ worker_clr_flags(worker, WORKER_CPU_INTENSIVE); /* tag the worker for identification in schedule() */ worker->last_func = worker->current_func; /* we're done with it, release */ hash_del(&worker->hentry); worker->current_work = NULL; worker->current_func = NULL; worker->current_pwq = NULL; worker->current_color = INT_MAX; /* must be the last step, see the function comment */ pwq_dec_nr_in_flight(pwq, work_data); } /** * process_scheduled_works - process scheduled works * @worker: self * * Process all scheduled works. Please note that the scheduled list * may change while processing a work, so this function repeatedly * fetches a work from the top and executes it. * * CONTEXT: * raw_spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. */ static void process_scheduled_works(struct worker *worker) { struct work_struct *work; bool first = true; while ((work = list_first_entry_or_null(&worker->scheduled, struct work_struct, entry))) { if (first) { worker->pool->last_progress_ts = jiffies; first = false; } process_one_work(worker, work); } } static void set_pf_worker(bool val) { mutex_lock(&wq_pool_attach_mutex); if (val) current->flags |= PF_WQ_WORKER; else current->flags &= ~PF_WQ_WORKER; mutex_unlock(&wq_pool_attach_mutex); } /** * worker_thread - the worker thread function * @__worker: self * * The worker thread function. All workers belong to a worker_pool - * either a per-cpu one or dynamic unbound one. These workers process all * work items regardless of their specific target workqueue. The only * exception is work items which belong to workqueues with a rescuer which * will be explained in rescuer_thread(). * * Return: 0 */ static int worker_thread(void *__worker) { struct worker *worker = __worker; struct worker_pool *pool = worker->pool; /* tell the scheduler that this is a workqueue worker */ set_pf_worker(true); woke_up: raw_spin_lock_irq(&pool->lock); /* am I supposed to die? */ if (unlikely(worker->flags & WORKER_DIE)) { raw_spin_unlock_irq(&pool->lock); set_pf_worker(false); /* * The worker is dead and PF_WQ_WORKER is cleared, worker->pool * shouldn't be accessed, reset it to NULL in case otherwise. 
*/ worker->pool = NULL; ida_free(&pool->worker_ida, worker->id); return 0; } worker_leave_idle(worker); recheck: /* no more worker necessary? */ if (!need_more_worker(pool)) goto sleep; /* do we need to manage? */ if (unlikely(!may_start_working(pool)) && manage_workers(worker)) goto recheck; /* * ->scheduled list can only be filled while a worker is * preparing to process a work or actually processing it. * Make sure nobody diddled with it while I was sleeping. */ WARN_ON_ONCE(!list_empty(&worker->scheduled)); /* * Finish PREP stage. We're guaranteed to have at least one idle * worker or that someone else has already assumed the manager * role. This is where @worker starts participating in concurrency * management if applicable and concurrency management is restored * after being rebound. See rebind_workers() for details. */ worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND); do { struct work_struct *work = list_first_entry(&pool->worklist, struct work_struct, entry); if (assign_work(work, worker, NULL)) process_scheduled_works(worker); } while (keep_working(pool)); worker_set_flags(worker, WORKER_PREP); sleep: /* * pool->lock is held and there's no work to process and no need to * manage, sleep. Workers are woken up only while holding * pool->lock or from local cpu, so setting the current state * before releasing pool->lock is enough to prevent losing any * event. */ worker_enter_idle(worker); __set_current_state(TASK_IDLE); raw_spin_unlock_irq(&pool->lock); schedule(); goto woke_up; } static bool assign_rescuer_work(struct pool_workqueue *pwq, struct worker *rescuer) { struct worker_pool *pool = pwq->pool; struct work_struct *cursor = &pwq->mayday_cursor; struct work_struct *work, *n; /* have work items to rescue? */ if (!pwq->nr_active) return false; /* need rescue? */ if (!need_to_create_worker(pool)) { /* * The pool has idle workers and doesn't need the rescuer, so it * could simply return false here. * * However, the memory pressure might not be fully relieved. * In PERCPU pool with concurrency enabled, having idle workers * does not necessarily mean memory pressure is gone; it may * simply mean regular workers have woken up, completed their * work, and gone idle again due to concurrency limits. * * In this case, those working workers may later sleep again, * the pool may run out of idle workers, and it will have to * allocate new ones and wait for the timer to send mayday, * causing unnecessary delay - especially if memory pressure * was never resolved throughout. * * Do more work if memory pressure is still on to reduce * relapse, using (pool->flags & POOL_MANAGER_ACTIVE), though * not precisely, unless there are other PWQs needing help. */ if (!(pool->flags & POOL_MANAGER_ACTIVE) || !list_empty(&pwq->wq->maydays)) return false; } /* search from the start or cursor if available */ if (list_empty(&cursor->entry)) work = list_first_entry(&pool->worklist, struct work_struct, entry); else work = list_next_entry(cursor, entry); /* find the next work item to rescue */ list_for_each_entry_safe_from(work, n, &pool->worklist, entry) { if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n)) { pwq->stats[PWQ_STAT_RESCUED]++; /* put the cursor for next search */ list_move_tail(&cursor->entry, &n->entry); return true; } } return false; } /** * rescuer_thread - the rescuer thread function * @__rescuer: self * * Workqueue rescuer thread function. There's one rescuer for each * workqueue which has WQ_MEM_RECLAIM set. 
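 *
 * Such a workqueue is created by passing %WQ_MEM_RECLAIM at creation
 * time, e.g. (illustrative call, not from this file):
 *
 *	wq = alloc_workqueue("myfs-writeback", WQ_MEM_RECLAIM, 0);
 *
 * which allocates the rescuer thread up front so the workqueue can make
 * forward progress even when no new worker can be created.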
* * Regular work processing on a pool may block trying to create a new * worker which uses GFP_KERNEL allocation which has slight chance of * developing into deadlock if some works currently on the same queue * need to be processed to satisfy the GFP_KERNEL allocation. This is * the problem rescuer solves. * * When such condition is possible, the pool summons rescuers of all * workqueues which have works queued on the pool and let them process * those works so that forward progress can be guaranteed. * * This should happen rarely. * * Return: 0 */ static int rescuer_thread(void *__rescuer) { struct worker *rescuer = __rescuer; struct workqueue_struct *wq = rescuer->rescue_wq; bool should_stop; set_user_nice(current, RESCUER_NICE_LEVEL); /* * Mark rescuer as worker too. As WORKER_PREP is never cleared, it * doesn't participate in concurrency management. */ set_pf_worker(true); repeat: set_current_state(TASK_IDLE); /* * By the time the rescuer is requested to stop, the workqueue * shouldn't have any work pending, but @wq->maydays may still have * pwq(s) queued. This can happen by non-rescuer workers consuming * all the work items before the rescuer got to them. Go through * @wq->maydays processing before acting on should_stop so that the * list is always empty on exit. */ should_stop = kthread_should_stop(); /* see whether any pwq is asking for help */ raw_spin_lock_irq(&wq_mayday_lock); while (!list_empty(&wq->maydays)) { struct pool_workqueue *pwq = list_first_entry(&wq->maydays, struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; unsigned int count = 0; __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); raw_spin_unlock_irq(&wq_mayday_lock); worker_attach_to_pool(rescuer, pool); raw_spin_lock_irq(&pool->lock); WARN_ON_ONCE(!list_empty(&rescuer->scheduled)); while (assign_rescuer_work(pwq, rescuer)) { process_scheduled_works(rescuer); /* * If the per-turn work item limit is reached and other * PWQs are in mayday, requeue mayday for this PWQ and * let the rescuer handle the other PWQs first. */ if (++count > RESCUER_BATCH && !list_empty(&pwq->wq->maydays) && pwq->nr_active && need_to_create_worker(pool)) { raw_spin_lock(&wq_mayday_lock); send_mayday(pwq); raw_spin_unlock(&wq_mayday_lock); break; } } /* The cursor can not be left behind without the rescuer watching it. */ if (!list_empty(&pwq->mayday_cursor.entry) && list_empty(&pwq->mayday_node)) list_del_init(&pwq->mayday_cursor.entry); /* * Leave this pool. Notify regular workers; otherwise, we end up * with 0 concurrency and stalling the execution. */ kick_pool(pool); raw_spin_unlock_irq(&pool->lock); worker_detach_from_pool(rescuer); /* * Put the reference grabbed by send_mayday(). @pool might * go away any time after it. */ put_pwq_unlocked(pwq); raw_spin_lock_irq(&wq_mayday_lock); } raw_spin_unlock_irq(&wq_mayday_lock); if (should_stop) { __set_current_state(TASK_RUNNING); set_pf_worker(false); return 0; } /* rescuers should never participate in concurrency management */ WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); schedule(); goto repeat; } static void bh_worker(struct worker *worker) { struct worker_pool *pool = worker->pool; int nr_restarts = BH_WORKER_RESTARTS; unsigned long end = jiffies + BH_WORKER_JIFFIES; worker_lock_callback(pool); raw_spin_lock_irq(&pool->lock); worker_leave_idle(worker); /* * This function follows the structure of worker_thread(). See there for * explanations on each step. 
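 *
 * The work items executed here were queued on a BH workqueue, e.g.
 * (illustrative):
 *
 *	queue_work(system_bh_wq, &my_work);
 *
 * my_work's callback then runs from softirq context on this path and
 * must not sleep.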
*/ if (!need_more_worker(pool)) goto done; WARN_ON_ONCE(!list_empty(&worker->scheduled)); worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND); do { struct work_struct *work = list_first_entry(&pool->worklist, struct work_struct, entry); if (assign_work(work, worker, NULL)) process_scheduled_works(worker); } while (keep_working(pool) && --nr_restarts && time_before(jiffies, end)); worker_set_flags(worker, WORKER_PREP); done: worker_enter_idle(worker); kick_pool(pool); raw_spin_unlock_irq(&pool->lock); worker_unlock_callback(pool); } /* * TODO: Convert all tasklet users to workqueue and use softirq directly. * * This is currently called from tasklet[_hi]action() and thus is also called * whenever there are tasklets to run. Let's do an early exit if there's nothing * queued. Once conversion from tasklet is complete, the need_more_worker() test * can be dropped. * * After full conversion, we'll add worker->softirq_action, directly use the * softirq action and obtain the worker pointer from the softirq_action pointer. */ void workqueue_softirq_action(bool highpri) { struct worker_pool *pool = &per_cpu(bh_worker_pools, smp_processor_id())[highpri]; if (need_more_worker(pool)) bh_worker(list_first_entry(&pool->workers, struct worker, node)); } struct wq_drain_dead_softirq_work { struct work_struct work; struct worker_pool *pool; struct completion done; }; static void drain_dead_softirq_workfn(struct work_struct *work) { struct wq_drain_dead_softirq_work *dead_work = container_of(work, struct wq_drain_dead_softirq_work, work); struct worker_pool *pool = dead_work->pool; bool repeat; /* * @pool's CPU is dead and we want to execute its still pending work * items from this BH work item which is running on a different CPU. As * its CPU is dead, @pool can't be kicked and, as work execution path * will be nested, a lockdep annotation needs to be suppressed. Mark * @pool with %POOL_BH_DRAINING for the special treatments. */ raw_spin_lock_irq(&pool->lock); pool->flags |= POOL_BH_DRAINING; raw_spin_unlock_irq(&pool->lock); bh_worker(list_first_entry(&pool->workers, struct worker, node)); raw_spin_lock_irq(&pool->lock); pool->flags &= ~POOL_BH_DRAINING; repeat = need_more_worker(pool); raw_spin_unlock_irq(&pool->lock); /* * bh_worker() might hit consecutive execution limit and bail. If there * still are pending work items, reschedule self and return so that we * don't hog this CPU's BH. */ if (repeat) { if (pool->attrs->nice == HIGHPRI_NICE_LEVEL) queue_work(system_bh_highpri_wq, work); else queue_work(system_bh_wq, work); } else { complete(&dead_work->done); } } /* * @cpu is dead. Drain the remaining BH work items on the current CPU. It's * possible to allocate dead_work per CPU and avoid flushing. However, then we * have to worry about draining overlapping with CPU coming back online or * nesting (one CPU's dead_work queued on another CPU which is also dead and so * on). Let's keep it simple and drain them synchronously. These are BH work * items which shouldn't be requeued on the same pool. Shouldn't take long. 
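 *
 * The function below uses the standard on-stack work + completion
 * pattern for this. In generic form (illustrative sketch, hypothetical
 * names):
 *
 *	struct my_req { struct work_struct work; struct completion done; };
 *	struct my_req req;
 *
 *	INIT_WORK_ONSTACK(&req.work, my_workfn);
 *	init_completion(&req.done);
 *	queue_work(some_wq, &req.work);
 *	wait_for_completion(&req.done);
 *	destroy_work_on_stack(&req.work);
 *
 * where my_workfn() does its thing and ends with complete(&req.done).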
*/ void workqueue_softirq_dead(unsigned int cpu) { int i; for (i = 0; i < NR_STD_WORKER_POOLS; i++) { struct worker_pool *pool = &per_cpu(bh_worker_pools, cpu)[i]; struct wq_drain_dead_softirq_work dead_work; if (!need_more_worker(pool)) continue; INIT_WORK_ONSTACK(&dead_work.work, drain_dead_softirq_workfn); dead_work.pool = pool; init_completion(&dead_work.done); if (pool->attrs->nice == HIGHPRI_NICE_LEVEL) queue_work(system_bh_highpri_wq, &dead_work.work); else queue_work(system_bh_wq, &dead_work.work); wait_for_completion(&dead_work.done); destroy_work_on_stack(&dead_work.work); } } /** * check_flush_dependency - check for flush dependency sanity * @target_wq: workqueue being flushed * @target_work: work item being flushed (NULL for workqueue flushes) * @from_cancel: are we called from the work cancel path * * %current is trying to flush the whole @target_wq or @target_work on it. * If this is not the cancel path (which implies work being flushed is either * already running, or will not be at all), check if @target_wq doesn't have * %WQ_MEM_RECLAIM and verify that %current is not reclaiming memory or running * on a workqueue which doesn't have %WQ_MEM_RECLAIM as that can break forward- * progress guarantee leading to a deadlock. */ static void check_flush_dependency(struct workqueue_struct *target_wq, struct work_struct *target_work, bool from_cancel) { work_func_t target_func; struct worker *worker; if (from_cancel || target_wq->flags & WQ_MEM_RECLAIM) return; worker = current_wq_worker(); target_func = target_work ? target_work->func : NULL; WARN_ONCE(current->flags & PF_MEMALLOC, "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps", current->pid, current->comm, target_wq->name, target_func); WARN_ONCE(worker && ((worker->current_pwq->wq->flags & (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps", worker->current_pwq->wq->name, worker->current_func, target_wq->name, target_func); } struct wq_barrier { struct work_struct work; struct completion done; struct task_struct *task; /* purely informational */ }; static void wq_barrier_func(struct work_struct *work) { struct wq_barrier *barr = container_of(work, struct wq_barrier, work); complete(&barr->done); } /** * insert_wq_barrier - insert a barrier work * @pwq: pwq to insert barrier into * @barr: wq_barrier to insert * @target: target work to attach @barr to * @worker: worker currently executing @target, NULL if @target is not executing * * @barr is linked to @target such that @barr is completed only after * @target finishes execution. Please note that the ordering * guarantee is observed only with respect to @target and on the local * cpu. * * Currently, a queued barrier can't be canceled. This is because * try_to_grab_pending() can't determine whether the work to be * grabbed is at the head of the queue and thus can't clear LINKED * flag of the previous work while there must be a valid next work * after a work with LINKED flag set. * * Note that when @worker is non-NULL, @target may be modified * underneath us, so we can't reliably determine pwq from @target. * * CONTEXT: * raw_spin_lock_irq(pool->lock). 
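 *
 * For orientation, the main consumer of this barrier is __flush_work()
 * further below, whose flow is roughly (simplified sketch, locking and
 * error handling omitted):
 *
 *	struct wq_barrier barr;
 *
 *	start_flush_work(work, &barr, from_cancel);	(calls insert_wq_barrier())
 *	wait_for_completion(&barr.done);		(completed by wq_barrier_func())
 *	destroy_work_on_stack(&barr.work);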
*/ static void insert_wq_barrier(struct pool_workqueue *pwq, struct wq_barrier *barr, struct work_struct *target, struct worker *worker) { static __maybe_unused struct lock_class_key bh_key, thr_key; unsigned int work_flags = 0; unsigned int work_color; struct list_head *head; /* * debugobject calls are safe here even with pool->lock locked * as we know for sure that this will not trigger any of the * checks and call back into the fixup functions where we * might deadlock. * * BH and threaded workqueues need separate lockdep keys to avoid * spuriously triggering "inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} * usage". */ INIT_WORK_ONSTACK_KEY(&barr->work, wq_barrier_func, (pwq->wq->flags & WQ_BH) ? &bh_key : &thr_key); __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); init_completion_map(&barr->done, &target->lockdep_map); barr->task = current; /* The barrier work item does not participate in nr_active. */ work_flags |= WORK_STRUCT_INACTIVE; /* * If @target is currently being executed, schedule the * barrier to the worker; otherwise, put it after @target. */ if (worker) { head = worker->scheduled.next; work_color = worker->current_color; } else { unsigned long *bits = work_data_bits(target); head = target->entry.next; /* there can already be other linked works, inherit and set */ work_flags |= *bits & WORK_STRUCT_LINKED; work_color = get_work_color(*bits); __set_bit(WORK_STRUCT_LINKED_BIT, bits); } pwq->nr_in_flight[work_color]++; work_flags |= work_color_to_flags(work_color); insert_work(pwq, &barr->work, head, work_flags); } /** * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing * @wq: workqueue being flushed * @flush_color: new flush color, < 0 for no-op * @work_color: new work color, < 0 for no-op * * Prepare pwqs for workqueue flushing. * * If @flush_color is non-negative, flush_color on all pwqs should be * -1. If no pwq has in-flight commands at the specified color, all * pwq->flush_color's stay at -1 and %false is returned. If any pwq * has in flight commands, its pwq->flush_color is set to * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq * wakeup logic is armed and %true is returned. * * The caller should have initialized @wq->first_flusher prior to * calling this function with non-negative @flush_color. If * @flush_color is negative, no flush color update is done and %false * is returned. * * If @work_color is non-negative, all pwqs should have the same * work_color which is previous to @work_color and all will be * advanced to @work_color. * * CONTEXT: * mutex_lock(wq->mutex). * * Return: * %true if @flush_color >= 0 and there's something to flush. %false * otherwise. */ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, int flush_color, int work_color) { bool wait = false; struct pool_workqueue *pwq; struct worker_pool *current_pool = NULL; if (flush_color >= 0) { WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush)); atomic_set(&wq->nr_pwqs_to_flush, 1); } /* * For unbound workqueue, pwqs will map to only a few pools. * Most of the time, pwqs within the same pool will be linked * sequentially to wq->pwqs by cpu index. So in the majority * of pwq iters, the pool is the same, only doing lock/unlock * if the pool has changed. This can largely reduce expensive * lock operations. 
*/ for_each_pwq(pwq, wq) { if (current_pool != pwq->pool) { if (likely(current_pool)) raw_spin_unlock_irq(&current_pool->lock); current_pool = pwq->pool; raw_spin_lock_irq(&current_pool->lock); } if (flush_color >= 0) { WARN_ON_ONCE(pwq->flush_color != -1); if (pwq->nr_in_flight[flush_color]) { pwq->flush_color = flush_color; atomic_inc(&wq->nr_pwqs_to_flush); wait = true; } } if (work_color >= 0) { WARN_ON_ONCE(work_color != work_next_color(pwq->work_color)); pwq->work_color = work_color; } } if (current_pool) raw_spin_unlock_irq(&current_pool->lock); if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush)) complete(&wq->first_flusher->done); return wait; } static void touch_wq_lockdep_map(struct workqueue_struct *wq) { #ifdef CONFIG_LOCKDEP if (unlikely(!wq->lockdep_map)) return; if (wq->flags & WQ_BH) local_bh_disable(); lock_map_acquire(wq->lockdep_map); lock_map_release(wq->lockdep_map); if (wq->flags & WQ_BH) local_bh_enable(); #endif } static void touch_work_lockdep_map(struct work_struct *work, struct workqueue_struct *wq) { #ifdef CONFIG_LOCKDEP if (wq->flags & WQ_BH) local_bh_disable(); lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); if (wq->flags & WQ_BH) local_bh_enable(); #endif } /** * __flush_workqueue - ensure that any scheduled work has run to completion. * @wq: workqueue to flush * * This function sleeps until all work items which were queued on entry * have finished execution, but it is not livelocked by new incoming ones. */ void __flush_workqueue(struct workqueue_struct *wq) { struct wq_flusher this_flusher = { .list = LIST_HEAD_INIT(this_flusher.list), .flush_color = -1, .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, (*wq->lockdep_map)), }; int next_color; if (WARN_ON(!wq_online)) return; touch_wq_lockdep_map(wq); mutex_lock(&wq->mutex); /* * Start-to-wait phase */ next_color = work_next_color(wq->work_color); if (next_color != wq->flush_color) { /* * Color space is not full. The current work_color * becomes our flush_color and work_color is advanced * by one. */ WARN_ON_ONCE(!list_empty(&wq->flusher_overflow)); this_flusher.flush_color = wq->work_color; wq->work_color = next_color; if (!wq->first_flusher) { /* no flush in progress, become the first flusher */ WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); wq->first_flusher = &this_flusher; if (!flush_workqueue_prep_pwqs(wq, wq->flush_color, wq->work_color)) { /* nothing to flush, done */ wq->flush_color = next_color; wq->first_flusher = NULL; goto out_unlock; } } else { /* wait in queue */ WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color); list_add_tail(&this_flusher.list, &wq->flusher_queue); flush_workqueue_prep_pwqs(wq, -1, wq->work_color); } } else { /* * Oops, color space is full, wait on overflow queue. * The next flush completion will assign us * flush_color and transfer to flusher_queue. */ list_add_tail(&this_flusher.list, &wq->flusher_overflow); } check_flush_dependency(wq, NULL, false); mutex_unlock(&wq->mutex); wait_for_completion(&this_flusher.done); /* * Wake-up-and-cascade phase * * First flushers are responsible for cascading flushes and * handling overflow. Non-first flushers can simply return. 
*/ if (READ_ONCE(wq->first_flusher) != &this_flusher) return; mutex_lock(&wq->mutex); /* we might have raced, check again with mutex held */ if (wq->first_flusher != &this_flusher) goto out_unlock; WRITE_ONCE(wq->first_flusher, NULL); WARN_ON_ONCE(!list_empty(&this_flusher.list)); WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); while (true) { struct wq_flusher *next, *tmp; /* complete all the flushers sharing the current flush color */ list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) { if (next->flush_color != wq->flush_color) break; list_del_init(&next->list); complete(&next->done); } WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) && wq->flush_color != work_next_color(wq->work_color)); /* this flush_color is finished, advance by one */ wq->flush_color = work_next_color(wq->flush_color); /* one color has been freed, handle overflow queue */ if (!list_empty(&wq->flusher_overflow)) { /* * Assign the same color to all overflowed * flushers, advance work_color and append to * flusher_queue. This is the start-to-wait * phase for these overflowed flushers. */ list_for_each_entry(tmp, &wq->flusher_overflow, list) tmp->flush_color = wq->work_color; wq->work_color = work_next_color(wq->work_color); list_splice_tail_init(&wq->flusher_overflow, &wq->flusher_queue); flush_workqueue_prep_pwqs(wq, -1, wq->work_color); } if (list_empty(&wq->flusher_queue)) { WARN_ON_ONCE(wq->flush_color != wq->work_color); break; } /* * Need to flush more colors. Make the next flusher * the new first flusher and arm pwqs. */ WARN_ON_ONCE(wq->flush_color == wq->work_color); WARN_ON_ONCE(wq->flush_color != next->flush_color); list_del_init(&next->list); wq->first_flusher = next; if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1)) break; /* * Meh... this color is already done, clear first * flusher and repeat cascading. */ wq->first_flusher = NULL; } out_unlock: mutex_unlock(&wq->mutex); } EXPORT_SYMBOL(__flush_workqueue); /** * drain_workqueue - drain a workqueue * @wq: workqueue to drain * * Wait until the workqueue becomes empty. While draining is in progress, * only chain queueing is allowed. IOW, only currently pending or running * work items on @wq can queue further work items on it. @wq is flushed * repeatedly until it becomes empty. The number of flushing is determined * by the depth of chaining and should be relatively short. Whine if it * takes too long. */ void drain_workqueue(struct workqueue_struct *wq) { unsigned int flush_cnt = 0; struct pool_workqueue *pwq; /* * __queue_work() needs to test whether there are drainers, is much * hotter than drain_workqueue() and already looks at @wq->flags. * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers. 
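 *
 * As a usage note (illustrative, not tied to a specific caller): a
 * subsystem typically drains once it has stopped issuing new, non-chained
 * work items, e.g. during its own shutdown:
 *
 *	drain_workqueue(my_wq);
 *
 * Callers heading straight for teardown don't need to do this themselves
 * as destroy_workqueue() drains the workqueue before destruction.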
*/ mutex_lock(&wq->mutex); if (!wq->nr_drainers++) wq->flags |= __WQ_DRAINING; mutex_unlock(&wq->mutex); reflush: __flush_workqueue(wq); mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) { bool drained; raw_spin_lock_irq(&pwq->pool->lock); drained = pwq_is_empty(pwq); raw_spin_unlock_irq(&pwq->pool->lock); if (drained) continue; if (++flush_cnt == 10 || (flush_cnt % 100 == 0 && flush_cnt <= 1000)) pr_warn("workqueue %s: %s() isn't complete after %u tries\n", wq->name, __func__, flush_cnt); mutex_unlock(&wq->mutex); goto reflush; } if (!--wq->nr_drainers) wq->flags &= ~__WQ_DRAINING; mutex_unlock(&wq->mutex); } EXPORT_SYMBOL_GPL(drain_workqueue); static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, bool from_cancel) { struct worker *worker = NULL; struct worker_pool *pool; struct pool_workqueue *pwq; struct workqueue_struct *wq; rcu_read_lock(); pool = get_work_pool(work); if (!pool) { rcu_read_unlock(); return false; } raw_spin_lock_irq(&pool->lock); /* see the comment in try_to_grab_pending() with the same code */ pwq = get_work_pwq(work); if (pwq) { if (unlikely(pwq->pool != pool)) goto already_gone; } else { worker = find_worker_executing_work(pool, work); if (!worker) goto already_gone; pwq = worker->current_pwq; } wq = pwq->wq; check_flush_dependency(wq, work, from_cancel); insert_wq_barrier(pwq, barr, work, worker); raw_spin_unlock_irq(&pool->lock); touch_work_lockdep_map(work, wq); /* * Force a lock recursion deadlock when using flush_work() inside a * single-threaded or rescuer equipped workqueue. * * For single threaded workqueues the deadlock happens when the work * is after the work issuing the flush_work(). For rescuer equipped * workqueues the deadlock happens when the rescuer stalls, blocking * forward progress. */ if (!from_cancel && (wq->saved_max_active == 1 || wq->rescuer)) touch_wq_lockdep_map(wq); rcu_read_unlock(); return true; already_gone: raw_spin_unlock_irq(&pool->lock); rcu_read_unlock(); return false; } static bool __flush_work(struct work_struct *work, bool from_cancel) { struct wq_barrier barr; if (WARN_ON(!wq_online)) return false; if (WARN_ON(!work->func)) return false; if (!start_flush_work(work, &barr, from_cancel)) return false; /* * start_flush_work() returned %true. If @from_cancel is set, we know * that @work must have been executing during start_flush_work() and * can't currently be queued. Its data must contain OFFQ bits. If @work * was queued on a BH workqueue, we also know that it was running in the * BH context and thus can be busy-waited. */ if (from_cancel) { unsigned long data = *work_data_bits(work); if (!WARN_ON_ONCE(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_BH)) { /* * On RT, prevent a live lock when %current preempted * soft interrupt processing by blocking on lock which * is owned by the thread invoking the callback. */ while (!try_wait_for_completion(&barr.done)) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) { struct worker_pool *pool; guard(rcu)(); pool = get_work_pool(work); if (pool) workqueue_callback_cancel_wait_running(pool); } else { cpu_relax(); } } goto out_destroy; } } wait_for_completion(&barr.done); out_destroy: destroy_work_on_stack(&barr.work); return true; } /** * flush_work - wait for a work to finish executing the last queueing instance * @work: the work to flush * * Wait until @work has finished execution. @work is guaranteed to be idle * on return if it hasn't been requeued since flush started. * * Return: * %true if flush_work() waited for the work to finish execution, * %false if it was already idle. 
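 *
 * Typical usage (illustrative; the structure and members are hypothetical):
 *
 *	static u64 my_dev_read_stats(struct my_dev *md)
 *	{
 *		flush_work(&md->stats_work);
 *		return md->cached_stats;
 *	}
 *
 * flush_work() above makes sure the most recently queued stats_work has
 * finished before the cached value is read.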
*/ bool flush_work(struct work_struct *work) { might_sleep(); return __flush_work(work, false); } EXPORT_SYMBOL_GPL(flush_work); /** * flush_delayed_work - wait for a dwork to finish executing the last queueing * @dwork: the delayed work to flush * * Delayed timer is cancelled and the pending work is queued for * immediate execution. Like flush_work(), this function only * considers the last queueing instance of @dwork. * * Return: * %true if flush_work() waited for the work to finish execution, * %false if it was already idle. */ bool flush_delayed_work(struct delayed_work *dwork) { local_irq_disable(); if (timer_delete_sync(&dwork->timer)) __queue_work(dwork->cpu, dwork->wq, &dwork->work); local_irq_enable(); return flush_work(&dwork->work); } EXPORT_SYMBOL(flush_delayed_work); /** * flush_rcu_work - wait for a rwork to finish executing the last queueing * @rwork: the rcu work to flush * * Return: * %true if flush_rcu_work() waited for the work to finish execution, * %false if it was already idle. */ bool flush_rcu_work(struct rcu_work *rwork) { if (test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&rwork->work))) { rcu_barrier(); flush_work(&rwork->work); return true; } else { return flush_work(&rwork->work); } } EXPORT_SYMBOL(flush_rcu_work); static void work_offqd_disable(struct work_offq_data *offqd) { const unsigned long max = (1lu << WORK_OFFQ_DISABLE_BITS) - 1; if (likely(offqd->disable < max)) offqd->disable++; else WARN_ONCE(true, "workqueue: work disable count overflowed\n"); } static void work_offqd_enable(struct work_offq_data *offqd) { if (likely(offqd->disable > 0)) offqd->disable--; else WARN_ONCE(true, "workqueue: work disable count underflowed\n"); } static bool __cancel_work(struct work_struct *work, u32 cflags) { struct work_offq_data offqd; unsigned long irq_flags; int ret; ret = work_grab_pending(work, cflags, &irq_flags); work_offqd_unpack(&offqd, *work_data_bits(work)); if (cflags & WORK_CANCEL_DISABLE) work_offqd_disable(&offqd); set_work_pool_and_clear_pending(work, offqd.pool_id, work_offqd_pack_flags(&offqd)); local_irq_restore(irq_flags); return ret; } static bool __cancel_work_sync(struct work_struct *work, u32 cflags) { bool ret; ret = __cancel_work(work, cflags | WORK_CANCEL_DISABLE); if (*work_data_bits(work) & WORK_OFFQ_BH) WARN_ON_ONCE(in_hardirq()); else might_sleep(); /* * Skip __flush_work() during early boot when we know that @work isn't * executing. This allows canceling during early boot. */ if (wq_online) __flush_work(work, true); if (!(cflags & WORK_CANCEL_DISABLE)) enable_work(work); return ret; } /* * See cancel_delayed_work() */ bool cancel_work(struct work_struct *work) { return __cancel_work(work, 0); } EXPORT_SYMBOL(cancel_work); /** * cancel_work_sync - cancel a work and wait for it to finish * @work: the work to cancel * * Cancel @work and wait for its execution to finish. This function can be used * even if the work re-queues itself or migrates to another workqueue. On return * from this function, @work is guaranteed to be not pending or executing on any * CPU as long as there aren't racing enqueues. * * cancel_work_sync(&delayed_work->work) must not be used for delayed_work's. * Use cancel_delayed_work_sync() instead. * * Must be called from a sleepable context if @work was last queued on a non-BH * workqueue. Can also be called from non-hardirq atomic contexts including BH * if @work was last queued on a BH workqueue. * * Returns %true if @work was pending, %false otherwise. 
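 *
 * Typical usage (illustrative; the structure and members are hypothetical):
 *
 *	static void my_dev_remove(struct my_dev *md)
 *	{
 *		cancel_work_sync(&md->event_work);
 *		kfree(md);
 *	}
 *
 * The cancel above guarantees that event_work is neither pending nor
 * running before the memory that embeds it is freed.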
*/ bool cancel_work_sync(struct work_struct *work) { return __cancel_work_sync(work, 0); } EXPORT_SYMBOL_GPL(cancel_work_sync); /** * cancel_delayed_work - cancel a delayed work * @dwork: delayed_work to cancel * * Kill off a pending delayed_work. * * Return: %true if @dwork was pending and canceled; %false if it wasn't * pending. * * Note: * The work callback function may still be running on return, unless * it returns %true and the work doesn't re-arm itself. Explicitly flush or * use cancel_delayed_work_sync() to wait on it. * * This function is safe to call from any context including IRQ handler. */ bool cancel_delayed_work(struct delayed_work *dwork) { return __cancel_work(&dwork->work, WORK_CANCEL_DELAYED); } EXPORT_SYMBOL(cancel_delayed_work); /** * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish * @dwork: the delayed work cancel * * This is cancel_work_sync() for delayed works. * * Return: * %true if @dwork was pending, %false otherwise. */ bool cancel_delayed_work_sync(struct delayed_work *dwork) { return __cancel_work_sync(&dwork->work, WORK_CANCEL_DELAYED); } EXPORT_SYMBOL(cancel_delayed_work_sync); /** * disable_work - Disable and cancel a work item * @work: work item to disable * * Disable @work by incrementing its disable count and cancel it if currently * pending. As long as the disable count is non-zero, any attempt to queue @work * will fail and return %false. The maximum supported disable depth is 2 to the * power of %WORK_OFFQ_DISABLE_BITS, currently 65536. * * Can be called from any context. Returns %true if @work was pending, %false * otherwise. */ bool disable_work(struct work_struct *work) { return __cancel_work(work, WORK_CANCEL_DISABLE); } EXPORT_SYMBOL_GPL(disable_work); /** * disable_work_sync - Disable, cancel and drain a work item * @work: work item to disable * * Similar to disable_work() but also wait for @work to finish if currently * executing. * * Must be called from a sleepable context if @work was last queued on a non-BH * workqueue. Can also be called from non-hardirq atomic contexts including BH * if @work was last queued on a BH workqueue. * * Returns %true if @work was pending, %false otherwise. */ bool disable_work_sync(struct work_struct *work) { return __cancel_work_sync(work, WORK_CANCEL_DISABLE); } EXPORT_SYMBOL_GPL(disable_work_sync); /** * enable_work - Enable a work item * @work: work item to enable * * Undo disable_work[_sync]() by decrementing @work's disable count. @work can * only be queued if its disable count is 0. * * Can be called from any context. Returns %true if the disable count reached 0. * Otherwise, %false. */ bool enable_work(struct work_struct *work) { struct work_offq_data offqd; unsigned long irq_flags; work_grab_pending(work, 0, &irq_flags); work_offqd_unpack(&offqd, *work_data_bits(work)); work_offqd_enable(&offqd); set_work_pool_and_clear_pending(work, offqd.pool_id, work_offqd_pack_flags(&offqd)); local_irq_restore(irq_flags); return !offqd.disable; } EXPORT_SYMBOL_GPL(enable_work); /** * disable_delayed_work - Disable and cancel a delayed work item * @dwork: delayed work item to disable * * disable_work() for delayed work items. */ bool disable_delayed_work(struct delayed_work *dwork) { return __cancel_work(&dwork->work, WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE); } EXPORT_SYMBOL_GPL(disable_delayed_work); /** * disable_delayed_work_sync - Disable, cancel and drain a delayed work item * @dwork: delayed work item to disable * * disable_work_sync() for delayed work items. 
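 *
 * Typical usage of the disable/enable pair (illustrative; the names are
 * hypothetical):
 *
 *	disable_delayed_work_sync(&md->poll_dwork);
 *	my_dev_reconfigure(md);
 *	enable_delayed_work(&md->poll_dwork);
 *	queue_delayed_work(system_wq, &md->poll_dwork, md->poll_interval);
 *
 * While disabled, any attempt to queue poll_dwork fails, so the
 * reconfiguration step cannot race with the poll callback.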
*/ bool disable_delayed_work_sync(struct delayed_work *dwork) { return __cancel_work_sync(&dwork->work, WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE); } EXPORT_SYMBOL_GPL(disable_delayed_work_sync); /** * enable_delayed_work - Enable a delayed work item * @dwork: delayed work item to enable * * enable_work() for delayed work items. */ bool enable_delayed_work(struct delayed_work *dwork) { return enable_work(&dwork->work); } EXPORT_SYMBOL_GPL(enable_delayed_work); /** * schedule_on_each_cpu - execute a function synchronously on each online CPU * @func: the function to call * * schedule_on_each_cpu() executes @func on each online CPU using the * system workqueue and blocks until all CPUs have completed. * schedule_on_each_cpu() is very slow. * * Return: * 0 on success, -errno on failure. */ int schedule_on_each_cpu(work_func_t func) { int cpu; struct work_struct __percpu *works; works = alloc_percpu(struct work_struct); if (!works) return -ENOMEM; cpus_read_lock(); for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); schedule_work_on(cpu, work); } for_each_online_cpu(cpu) flush_work(per_cpu_ptr(works, cpu)); cpus_read_unlock(); free_percpu(works); return 0; } /** * execute_in_process_context - reliably execute the routine with user context * @fn: the function to execute * @ew: guaranteed storage for the execute work structure (must * be available when the work executes) * * Executes the function immediately if process context is available, * otherwise schedules the function for delayed execution. * * Return: 0 - function was executed * 1 - function was scheduled for execution */ int execute_in_process_context(work_func_t fn, struct execute_work *ew) { if (!in_interrupt()) { fn(&ew->work); return 0; } INIT_WORK(&ew->work, fn); schedule_work(&ew->work); return 1; } EXPORT_SYMBOL_GPL(execute_in_process_context); /** * free_workqueue_attrs - free a workqueue_attrs * @attrs: workqueue_attrs to free * * Undo alloc_workqueue_attrs(). */ void free_workqueue_attrs(struct workqueue_attrs *attrs) { if (attrs) { free_cpumask_var(attrs->cpumask); free_cpumask_var(attrs->__pod_cpumask); kfree(attrs); } } /** * alloc_workqueue_attrs - allocate a workqueue_attrs * * Allocate a new workqueue_attrs, initialize with default settings and * return it. * * Return: The allocated new workqueue_attr on success. %NULL on failure. */ struct workqueue_attrs *alloc_workqueue_attrs_noprof(void) { struct workqueue_attrs *attrs; attrs = kzalloc_obj(*attrs); if (!attrs) goto fail; if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL)) goto fail; if (!alloc_cpumask_var(&attrs->__pod_cpumask, GFP_KERNEL)) goto fail; cpumask_copy(attrs->cpumask, cpu_possible_mask); attrs->affn_scope = WQ_AFFN_DFL; return attrs; fail: free_workqueue_attrs(attrs); return NULL; } static void copy_workqueue_attrs(struct workqueue_attrs *to, const struct workqueue_attrs *from) { to->nice = from->nice; cpumask_copy(to->cpumask, from->cpumask); cpumask_copy(to->__pod_cpumask, from->__pod_cpumask); to->affn_strict = from->affn_strict; /* * Unlike hash and equality test, copying shouldn't ignore wq-only * fields as copying is used for both pool and wq attrs. Instead, * get_unbound_pool() explicitly clears the fields. */ to->affn_scope = from->affn_scope; to->ordered = from->ordered; } /* * Some attrs fields are workqueue-only. Clear them for worker_pool's. See the * comments in 'struct workqueue_attrs' definition. 
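 *
 * For orientation, such attrs usually originate from a caller configuring
 * an unbound workqueue roughly as in the hedged sketch below (the cpumask
 * source and workqueue name are hypothetical), and are sanitized here
 * before being used for pool lookup and creation:
 *
 *	struct workqueue_attrs *attrs = alloc_workqueue_attrs();
 *
 *	if (attrs) {
 *		attrs->nice = -10;
 *		cpumask_copy(attrs->cpumask, my_housekeeping_mask);
 *		apply_workqueue_attrs(my_unbound_wq, attrs);
 *		free_workqueue_attrs(attrs);
 *	}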
*/ static void wqattrs_clear_for_pool(struct workqueue_attrs *attrs) { attrs->affn_scope = WQ_AFFN_NR_TYPES; attrs->ordered = false; if (attrs->affn_strict) cpumask_copy(attrs->cpumask, cpu_possible_mask); } /* hash value of the content of @attr */ static u32 wqattrs_hash(const struct workqueue_attrs *attrs) { u32 hash = 0; hash = jhash_1word(attrs->nice, hash); hash = jhash_1word(attrs->affn_strict, hash); hash = jhash(cpumask_bits(attrs->__pod_cpumask), BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); if (!attrs->affn_strict) hash = jhash(cpumask_bits(attrs->cpumask), BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); return hash; } /* content equality test */ static bool wqattrs_equal(const struct workqueue_attrs *a, const struct workqueue_attrs *b) { if (a->nice != b->nice) return false; if (a->affn_strict != b->affn_strict) return false; if (!cpumask_equal(a->__pod_cpumask, b->__pod_cpumask)) return false; if (!a->affn_strict && !cpumask_equal(a->cpumask, b->cpumask)) return false; return true; } /* Update @attrs with actually available CPUs */ static void wqattrs_actualize_cpumask(struct workqueue_attrs *attrs, const cpumask_t *unbound_cpumask) { /* * Calculate the effective CPU mask of @attrs given @unbound_cpumask. If * @attrs->cpumask doesn't overlap with @unbound_cpumask, we fallback to * @unbound_cpumask. */ cpumask_and(attrs->cpumask, attrs->cpumask, unbound_cpumask); if (unlikely(cpumask_empty(attrs->cpumask))) cpumask_copy(attrs->cpumask, unbound_cpumask); } /* find wq_pod_type to use for @attrs */ static const struct wq_pod_type * wqattrs_pod_type(const struct workqueue_attrs *attrs) { enum wq_affn_scope scope; struct wq_pod_type *pt; /* to synchronize access to wq_affn_dfl */ lockdep_assert_held(&wq_pool_mutex); if (attrs->affn_scope == WQ_AFFN_DFL) scope = wq_affn_dfl; else scope = attrs->affn_scope; pt = &wq_pod_types[scope]; if (!WARN_ON_ONCE(attrs->affn_scope == WQ_AFFN_NR_TYPES) && likely(pt->nr_pods)) return pt; /* * Before workqueue_init_topology(), only SYSTEM is available which is * initialized in workqueue_init_early(). */ pt = &wq_pod_types[WQ_AFFN_SYSTEM]; BUG_ON(!pt->nr_pods); return pt; } /** * init_worker_pool - initialize a newly zalloc'd worker_pool * @pool: worker_pool to initialize * * Initialize a newly zalloc'd @pool. It also allocates @pool->attrs. * * Return: 0 on success, -errno on failure. Even on failure, all fields * inside @pool proper are initialized and put_unbound_pool() can be called * on @pool safely to release it. 
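 *
 * The intended calling pattern, as used by get_unbound_pool() below
 * (simplified sketch):
 *
 *	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, node);
 *	if (!pool || init_worker_pool(pool) < 0)
 *		goto fail;
 *
 * where the failure path may call put_unbound_pool(pool) regardless of how
 * far initialization got, as long as @pool itself was allocated.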
*/ static int init_worker_pool(struct worker_pool *pool) { raw_spin_lock_init(&pool->lock); pool->id = -1; pool->cpu = -1; pool->node = NUMA_NO_NODE; pool->flags |= POOL_DISASSOCIATED; pool->last_progress_ts = jiffies; INIT_LIST_HEAD(&pool->worklist); INIT_LIST_HEAD(&pool->idle_list); hash_init(pool->busy_hash); timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE); INIT_WORK(&pool->idle_cull_work, idle_cull_fn); timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0); INIT_LIST_HEAD(&pool->workers); ida_init(&pool->worker_ida); INIT_HLIST_NODE(&pool->hash_node); pool->refcnt = 1; #ifdef CONFIG_PREEMPT_RT spin_lock_init(&pool->cb_lock); #endif /* shouldn't fail above this point */ pool->attrs = alloc_workqueue_attrs(); if (!pool->attrs) return -ENOMEM; wqattrs_clear_for_pool(pool->attrs); return 0; } #ifdef CONFIG_LOCKDEP static void wq_init_lockdep(struct workqueue_struct *wq) { char *lock_name; lockdep_register_key(&wq->key); lock_name = kasprintf(GFP_KERNEL, "%s%s", "(wq_completion)", wq->name); if (!lock_name) lock_name = wq->name; wq->lock_name = lock_name; wq->lockdep_map = &wq->__lockdep_map; lockdep_init_map(wq->lockdep_map, lock_name, &wq->key, 0); } static void wq_unregister_lockdep(struct workqueue_struct *wq) { if (wq->lockdep_map != &wq->__lockdep_map) return; lockdep_unregister_key(&wq->key); } static void wq_free_lockdep(struct workqueue_struct *wq) { if (wq->lockdep_map != &wq->__lockdep_map) return; if (wq->lock_name != wq->name) kfree(wq->lock_name); } #else static void wq_init_lockdep(struct workqueue_struct *wq) { } static void wq_unregister_lockdep(struct workqueue_struct *wq) { } static void wq_free_lockdep(struct workqueue_struct *wq) { } #endif static void free_node_nr_active(struct wq_node_nr_active **nna_ar) { int node; for_each_node(node) { kfree(nna_ar[node]); nna_ar[node] = NULL; } kfree(nna_ar[nr_node_ids]); nna_ar[nr_node_ids] = NULL; } static void init_node_nr_active(struct wq_node_nr_active *nna) { nna->max = WQ_DFL_MIN_ACTIVE; atomic_set(&nna->nr, 0); raw_spin_lock_init(&nna->lock); INIT_LIST_HEAD(&nna->pending_pwqs); } /* * Each node's nr_active counter will be accessed mostly from its own node and * should be allocated in the node. */ static int alloc_node_nr_active(struct wq_node_nr_active **nna_ar) { struct wq_node_nr_active *nna; int node; for_each_node(node) { nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, node); if (!nna) goto err_free; init_node_nr_active(nna); nna_ar[node] = nna; } /* [nr_node_ids] is used as the fallback */ nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, NUMA_NO_NODE); if (!nna) goto err_free; init_node_nr_active(nna); nna_ar[nr_node_ids] = nna; return 0; err_free: free_node_nr_active(nna_ar); return -ENOMEM; } static void rcu_free_wq(struct rcu_head *rcu) { struct workqueue_struct *wq = container_of(rcu, struct workqueue_struct, rcu); if (wq->flags & WQ_UNBOUND) free_node_nr_active(wq->node_nr_active); wq_free_lockdep(wq); free_percpu(wq->cpu_pwq); free_workqueue_attrs(wq->unbound_attrs); kfree(wq); } static void rcu_free_pool(struct rcu_head *rcu) { struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); ida_destroy(&pool->worker_ida); free_workqueue_attrs(pool->attrs); kfree(pool); } /** * put_unbound_pool - put a worker_pool * @pool: worker_pool to put * * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU * safe manner. 
get_unbound_pool() calls this function on its failure path * and this function should be able to release pools which went through, * successfully or not, init_worker_pool(). * * Should be called with wq_pool_mutex held. */ static void put_unbound_pool(struct worker_pool *pool) { struct worker *worker; LIST_HEAD(cull_list); lockdep_assert_held(&wq_pool_mutex); if (--pool->refcnt) return; /* sanity checks */ if (WARN_ON(!(pool->cpu < 0)) || WARN_ON(!list_empty(&pool->worklist))) return; /* release id and unhash */ if (pool->id >= 0) idr_remove(&worker_pool_idr, pool->id); hash_del(&pool->hash_node); /* * Become the manager and destroy all workers. This prevents * @pool's workers from blocking on attach_mutex. We're the last * manager and @pool gets freed with the flag set. * * Having a concurrent manager is quite unlikely to happen as we can * only get here with * pwq->refcnt == pool->refcnt == 0 * which implies no work queued to the pool, which implies no worker can * become the manager. However a worker could have taken the role of * manager before the refcnts dropped to 0, since maybe_create_worker() * drops pool->lock */ while (true) { rcuwait_wait_event(&manager_wait, !(pool->flags & POOL_MANAGER_ACTIVE), TASK_UNINTERRUPTIBLE); mutex_lock(&wq_pool_attach_mutex); raw_spin_lock_irq(&pool->lock); if (!(pool->flags & POOL_MANAGER_ACTIVE)) { pool->flags |= POOL_MANAGER_ACTIVE; break; } raw_spin_unlock_irq(&pool->lock); mutex_unlock(&wq_pool_attach_mutex); } while ((worker = first_idle_worker(pool))) set_worker_dying(worker, &cull_list); WARN_ON(pool->nr_workers || pool->nr_idle); raw_spin_unlock_irq(&pool->lock); detach_dying_workers(&cull_list); mutex_unlock(&wq_pool_attach_mutex); reap_dying_workers(&cull_list); /* shut down the timers */ timer_delete_sync(&pool->idle_timer); cancel_work_sync(&pool->idle_cull_work); timer_delete_sync(&pool->mayday_timer); /* RCU protected to allow dereferences from get_work_pool() */ call_rcu(&pool->rcu, rcu_free_pool); } /** * get_unbound_pool - get a worker_pool with the specified attributes * @attrs: the attributes of the worker_pool to get * * Obtain a worker_pool which has the same attributes as @attrs, bump the * reference count and return it. If there already is a matching * worker_pool, it will be used; otherwise, this function attempts to * create a new one. * * Should be called with wq_pool_mutex held. * * Return: On success, a worker_pool with the same attributes as @attrs. * On failure, %NULL. */ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) { struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_NUMA]; u32 hash = wqattrs_hash(attrs); struct worker_pool *pool; int pod, node = NUMA_NO_NODE; lockdep_assert_held(&wq_pool_mutex); /* do we already have a matching pool? 
*/ hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { if (wqattrs_equal(pool->attrs, attrs)) { pool->refcnt++; return pool; } } /* If __pod_cpumask is contained inside a NUMA pod, that's our node */ for (pod = 0; pod < pt->nr_pods; pod++) { if (cpumask_subset(attrs->__pod_cpumask, pt->pod_cpus[pod])) { node = pt->pod_node[pod]; break; } } /* nope, create a new one */ pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, node); if (!pool || init_worker_pool(pool) < 0) goto fail; pool->node = node; copy_workqueue_attrs(pool->attrs, attrs); wqattrs_clear_for_pool(pool->attrs); if (worker_pool_assign_id(pool) < 0) goto fail; /* create and start the initial worker */ if (wq_online && !create_worker(pool)) goto fail; /* install */ hash_add(unbound_pool_hash, &pool->hash_node, hash); return pool; fail: if (pool) put_unbound_pool(pool); return NULL; } /* * Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero * refcnt and needs to be destroyed. */ static void pwq_release_workfn(struct kthread_work *work) { struct pool_workqueue *pwq = container_of(work, struct pool_workqueue, release_work); struct workqueue_struct *wq = pwq->wq; struct worker_pool *pool = pwq->pool; bool is_last = false; /* * When @pwq is not linked, it doesn't hold any reference to the * @wq, and @wq is invalid to access. */ if (!list_empty(&pwq->pwqs_node)) { mutex_lock(&wq->mutex); list_del_rcu(&pwq->pwqs_node); is_last = list_empty(&wq->pwqs); /* * For ordered workqueue with a plugged dfl_pwq, restart it now. */ if (!is_last && (wq->flags & __WQ_ORDERED)) unplug_oldest_pwq(wq); mutex_unlock(&wq->mutex); } if (wq->flags & WQ_UNBOUND) { mutex_lock(&wq_pool_mutex); put_unbound_pool(pool); mutex_unlock(&wq_pool_mutex); } if (!list_empty(&pwq->pending_node)) { struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pwq->pool->node); raw_spin_lock_irq(&nna->lock); list_del_init(&pwq->pending_node); raw_spin_unlock_irq(&nna->lock); } kfree_rcu(pwq, rcu); /* * If we're the last pwq going away, @wq is already dead and no one * is gonna access it anymore. Schedule RCU free. */ if (is_last) { wq_unregister_lockdep(wq); call_rcu(&wq->rcu, rcu_free_wq); } } /* initialize newly allocated @pwq which is associated with @wq and @pool */ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, struct worker_pool *pool) { BUG_ON((unsigned long)pwq & ~WORK_STRUCT_PWQ_MASK); memset(pwq, 0, sizeof(*pwq)); pwq->pool = pool; pwq->wq = wq; pwq->flush_color = -1; pwq->refcnt = 1; INIT_LIST_HEAD(&pwq->inactive_works); INIT_LIST_HEAD(&pwq->pending_node); INIT_LIST_HEAD(&pwq->pwqs_node); INIT_LIST_HEAD(&pwq->mayday_node); kthread_init_work(&pwq->release_work, pwq_release_workfn); /* * Set the dummy cursor work with valid function and get_work_pwq(). * * The cursor work should only be in the pwq->pool->worklist, and * should not be treated as a processable work item. * * WORK_STRUCT_PENDING and WORK_STRUCT_INACTIVE just make it less * surprise for kernel debugging tools and reviewers. 
*/ INIT_WORK(&pwq->mayday_cursor, mayday_cursor_func); atomic_long_set(&pwq->mayday_cursor.data, (unsigned long)pwq | WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | WORK_STRUCT_INACTIVE); } /* sync @pwq with the current state of its associated wq and link it */ static void link_pwq(struct pool_workqueue *pwq) { struct workqueue_struct *wq = pwq->wq; lockdep_assert_held(&wq->mutex); /* may be called multiple times, ignore if already linked */ if (!list_empty(&pwq->pwqs_node)) return; /* set the matching work_color */ pwq->work_color = wq->work_color; /* link in @pwq */ list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); } /* obtain a pool matching @attr and create a pwq associating the pool and @wq */ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { struct worker_pool *pool; struct pool_workqueue *pwq; lockdep_assert_held(&wq_pool_mutex); pool = get_unbound_pool(attrs); if (!pool) return NULL; pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node); if (!pwq) { put_unbound_pool(pool); return NULL; } init_pwq(pwq, wq, pool); return pwq; } static void apply_wqattrs_lock(void) { mutex_lock(&wq_pool_mutex); } static void apply_wqattrs_unlock(void) { mutex_unlock(&wq_pool_mutex); } /** * wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod * @attrs: the wq_attrs of the default pwq of the target workqueue * @cpu: the target CPU * * Calculate the cpumask a workqueue with @attrs should use on @pod. * The result is stored in @attrs->__pod_cpumask. * * If pod affinity is not enabled, @attrs->cpumask is always used. If enabled * and @pod has online CPUs requested by @attrs, the returned cpumask is the * intersection of the possible CPUs of @pod and @attrs->cpumask. * * The caller is responsible for ensuring that the cpumask of @pod stays stable. */ static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu) { const struct wq_pod_type *pt = wqattrs_pod_type(attrs); int pod = pt->cpu_pod[cpu]; /* calculate possible CPUs in @pod that @attrs wants */ cpumask_and(attrs->__pod_cpumask, pt->pod_cpus[pod], attrs->cpumask); /* does @pod have any online CPUs @attrs wants? 
*/ if (!cpumask_intersects(attrs->__pod_cpumask, wq_online_cpumask)) { cpumask_copy(attrs->__pod_cpumask, attrs->cpumask); return; } } /* install @pwq into @wq and return the old pwq, @cpu < 0 for dfl_pwq */ static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq, int cpu, struct pool_workqueue *pwq) { struct pool_workqueue __rcu **slot = unbound_pwq_slot(wq, cpu); struct pool_workqueue *old_pwq; lockdep_assert_held(&wq_pool_mutex); lockdep_assert_held(&wq->mutex); /* link_pwq() can handle duplicate calls */ link_pwq(pwq); old_pwq = rcu_access_pointer(*slot); rcu_assign_pointer(*slot, pwq); return old_pwq; } /* context to store the prepared attrs & pwqs before applying */ struct apply_wqattrs_ctx { struct workqueue_struct *wq; /* target workqueue */ struct workqueue_attrs *attrs; /* attrs to apply */ struct list_head list; /* queued for batching commit */ struct pool_workqueue *dfl_pwq; struct pool_workqueue *pwq_tbl[]; }; /* free the resources after success or abort */ static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx) { if (ctx) { int cpu; for_each_possible_cpu(cpu) put_pwq_unlocked(ctx->pwq_tbl[cpu]); put_pwq_unlocked(ctx->dfl_pwq); free_workqueue_attrs(ctx->attrs); kfree(ctx); } } /* allocate the attrs and pwqs for later installation */ static struct apply_wqattrs_ctx * apply_wqattrs_prepare(struct workqueue_struct *wq, const struct workqueue_attrs *attrs, const cpumask_var_t unbound_cpumask) { struct apply_wqattrs_ctx *ctx; struct workqueue_attrs *new_attrs; int cpu; lockdep_assert_held(&wq_pool_mutex); if (WARN_ON(attrs->affn_scope < 0 || attrs->affn_scope >= WQ_AFFN_NR_TYPES)) return ERR_PTR(-EINVAL); ctx = kzalloc_flex(*ctx, pwq_tbl, nr_cpu_ids); new_attrs = alloc_workqueue_attrs(); if (!ctx || !new_attrs) goto out_free; /* * If something goes wrong during CPU up/down, we'll fall back to * the default pwq covering whole @attrs->cpumask. Always create * it even if we don't use it immediately. */ copy_workqueue_attrs(new_attrs, attrs); wqattrs_actualize_cpumask(new_attrs, unbound_cpumask); cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask); ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs); if (!ctx->dfl_pwq) goto out_free; for_each_possible_cpu(cpu) { if (new_attrs->ordered) { ctx->dfl_pwq->refcnt++; ctx->pwq_tbl[cpu] = ctx->dfl_pwq; } else { wq_calc_pod_cpumask(new_attrs, cpu); ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs); if (!ctx->pwq_tbl[cpu]) goto out_free; } } /* save the user configured attrs and sanitize it. */ copy_workqueue_attrs(new_attrs, attrs); cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask); ctx->attrs = new_attrs; /* * For initialized ordered workqueues, there should only be one pwq * (dfl_pwq). Set the plugged flag of ctx->dfl_pwq to suspend execution * of newly queued work items until execution of older work items in * the old pwq's have completed. 
*/ if ((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)) ctx->dfl_pwq->plugged = true; ctx->wq = wq; return ctx; out_free: free_workqueue_attrs(new_attrs); apply_wqattrs_cleanup(ctx); return ERR_PTR(-ENOMEM); } /* set attrs and install prepared pwqs, @ctx points to old pwqs on return */ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) { int cpu; /* all pwqs have been created successfully, let's install'em */ mutex_lock(&ctx->wq->mutex); copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs); /* save the previous pwqs and install the new ones */ for_each_possible_cpu(cpu) ctx->pwq_tbl[cpu] = install_unbound_pwq(ctx->wq, cpu, ctx->pwq_tbl[cpu]); ctx->dfl_pwq = install_unbound_pwq(ctx->wq, -1, ctx->dfl_pwq); /* update node_nr_active->max */ wq_update_node_max_active(ctx->wq, -1); mutex_unlock(&ctx->wq->mutex); } static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { struct apply_wqattrs_ctx *ctx; /* only unbound workqueues can change attributes */ if (WARN_ON(!(wq->flags & WQ_UNBOUND))) return -EINVAL; ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask); if (IS_ERR(ctx)) return PTR_ERR(ctx); /* the ctx has been prepared successfully, let's commit it */ apply_wqattrs_commit(ctx); apply_wqattrs_cleanup(ctx); return 0; } /** * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue * @wq: the target workqueue * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs() * * Apply @attrs to an unbound workqueue @wq. Unless disabled, this function maps * a separate pwq to each CPU pod with possibles CPUs in @attrs->cpumask so that * work items are affine to the pod it was issued on. Older pwqs are released as * in-flight work items finish. Note that a work item which repeatedly requeues * itself back-to-back will stay on its current pwq. * * Performs GFP_KERNEL allocations. * * Return: 0 on success and -errno on failure. */ int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { int ret; mutex_lock(&wq_pool_mutex); ret = apply_workqueue_attrs_locked(wq, attrs); mutex_unlock(&wq_pool_mutex); return ret; } /** * unbound_wq_update_pwq - update a pwq slot for CPU hot[un]plug * @wq: the target workqueue * @cpu: the CPU to update the pwq slot for * * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and * %CPU_DOWN_FAILED. @cpu is in the same pod of the CPU being hot[un]plugged. * * * If pod affinity can't be adjusted due to memory allocation failure, it falls * back to @wq->dfl_pwq which may not be optimal but is always correct. * * Note that when the last allowed CPU of a pod goes offline for a workqueue * with a cpumask spanning multiple pods, the workers which were already * executing the work items for the workqueue will lose their CPU affinity and * may execute on any CPU. This is similar to how per-cpu workqueues behave on * CPU_DOWN. If a workqueue user wants strict affinity, it's the user's * responsibility to flush the work item from CPU_DOWN_PREPARE. */ static void unbound_wq_update_pwq(struct workqueue_struct *wq, int cpu) { struct pool_workqueue *old_pwq = NULL, *pwq; struct workqueue_attrs *target_attrs; lockdep_assert_held(&wq_pool_mutex); if (!(wq->flags & WQ_UNBOUND) || wq->unbound_attrs->ordered) return; /* * We don't wanna alloc/free wq_attrs for each wq for each CPU. * Let's use a preallocated one. The following buf is protected by * CPU hotplug exclusion. 
*/ target_attrs = unbound_wq_update_pwq_attrs_buf; copy_workqueue_attrs(target_attrs, wq->unbound_attrs); wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask); /* nothing to do if the target cpumask matches the current pwq */ wq_calc_pod_cpumask(target_attrs, cpu); if (wqattrs_equal(target_attrs, unbound_pwq(wq, cpu)->pool->attrs)) return; /* create a new pwq */ pwq = alloc_unbound_pwq(wq, target_attrs); if (!pwq) { pr_warn("workqueue: allocation failed while updating CPU pod affinity of \"%s\"\n", wq->name); goto use_dfl_pwq; } /* Install the new pwq. */ mutex_lock(&wq->mutex); old_pwq = install_unbound_pwq(wq, cpu, pwq); goto out_unlock; use_dfl_pwq: mutex_lock(&wq->mutex); pwq = unbound_pwq(wq, -1); raw_spin_lock_irq(&pwq->pool->lock); get_pwq(pwq); raw_spin_unlock_irq(&pwq->pool->lock); old_pwq = install_unbound_pwq(wq, cpu, pwq); out_unlock: mutex_unlock(&wq->mutex); put_pwq_unlocked(old_pwq); } static int alloc_and_link_pwqs(struct workqueue_struct *wq) { bool highpri = wq->flags & WQ_HIGHPRI; int cpu, ret; lockdep_assert_held(&wq_pool_mutex); wq->cpu_pwq = alloc_percpu(struct pool_workqueue *); if (!wq->cpu_pwq) goto enomem; if (!(wq->flags & WQ_UNBOUND)) { struct worker_pool __percpu *pools; if (wq->flags & WQ_BH) pools = bh_worker_pools; else pools = cpu_worker_pools; for_each_possible_cpu(cpu) { struct pool_workqueue **pwq_p; struct worker_pool *pool; pool = &(per_cpu_ptr(pools, cpu)[highpri]); pwq_p = per_cpu_ptr(wq->cpu_pwq, cpu); *pwq_p = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node); if (!*pwq_p) goto enomem; init_pwq(*pwq_p, wq, pool); mutex_lock(&wq->mutex); link_pwq(*pwq_p); mutex_unlock(&wq->mutex); } return 0; } if (wq->flags & __WQ_ORDERED) { struct pool_workqueue *dfl_pwq; ret = apply_workqueue_attrs_locked(wq, ordered_wq_attrs[highpri]); /* there should only be single pwq for ordering guarantee */ dfl_pwq = rcu_access_pointer(wq->dfl_pwq); WARN(!ret && (wq->pwqs.next != &dfl_pwq->pwqs_node || wq->pwqs.prev != &dfl_pwq->pwqs_node), "ordering guarantee broken for workqueue %s\n", wq->name); } else { ret = apply_workqueue_attrs_locked(wq, unbound_std_wq_attrs[highpri]); } return ret; enomem: if (wq->cpu_pwq) { for_each_possible_cpu(cpu) { struct pool_workqueue *pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); if (pwq) { /* * Unlink pwq from wq->pwqs since link_pwq() * may have already added it. wq->mutex is not * needed as the wq has not been published yet. */ if (!list_empty(&pwq->pwqs_node)) list_del_rcu(&pwq->pwqs_node); kmem_cache_free(pwq_cache, pwq); } } free_percpu(wq->cpu_pwq); wq->cpu_pwq = NULL; } return -ENOMEM; } static int wq_clamp_max_active(int max_active, unsigned int flags, const char *name) { if (max_active < 1 || max_active > WQ_MAX_ACTIVE) pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n", max_active, name, 1, WQ_MAX_ACTIVE); return clamp_val(max_active, 1, WQ_MAX_ACTIVE); } /* * Workqueues which may be used during memory reclaim should have a rescuer * to guarantee forward progress. 
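 *
 * For example, a driver whose work items may sit in the memory reclaim
 * path would create its workqueue roughly as follows (illustrative; the
 * name is hypothetical), which makes init_rescuer() below attach a
 * rescuer kthread to it:
 *
 *	wq = alloc_workqueue("my_io_wq", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);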
*/ static int init_rescuer(struct workqueue_struct *wq) { struct worker *rescuer; char id_buf[WORKER_ID_LEN]; int ret; lockdep_assert_held(&wq_pool_mutex); if (!(wq->flags & WQ_MEM_RECLAIM)) return 0; rescuer = alloc_worker(NUMA_NO_NODE); if (!rescuer) { pr_err("workqueue: Failed to allocate a rescuer for wq \"%s\"\n", wq->name); return -ENOMEM; } rescuer->rescue_wq = wq; format_worker_id(id_buf, sizeof(id_buf), rescuer, NULL); rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", id_buf); if (IS_ERR(rescuer->task)) { ret = PTR_ERR(rescuer->task); pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe", wq->name, ERR_PTR(ret)); kfree(rescuer); return ret; } wq->rescuer = rescuer; /* initial cpumask is consistent with the detached rescuer and unbind_worker() */ if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) kthread_bind_mask(rescuer->task, wq_unbound_cpumask); else kthread_bind_mask(rescuer->task, cpu_possible_mask); wake_up_process(rescuer->task); return 0; } /** * wq_adjust_max_active - update a wq's max_active to the current setting * @wq: target workqueue * * If @wq isn't freezing, set @wq->max_active to the saved_max_active and * activate inactive work items accordingly. If @wq is freezing, clear * @wq->max_active to zero. */ static void wq_adjust_max_active(struct workqueue_struct *wq) { bool activated; int new_max, new_min; lockdep_assert_held(&wq->mutex); if ((wq->flags & WQ_FREEZABLE) && workqueue_freezing) { new_max = 0; new_min = 0; } else { new_max = wq->saved_max_active; new_min = wq->saved_min_active; } if (wq->max_active == new_max && wq->min_active == new_min) return; /* * Update @wq->max/min_active and then kick inactive work items if more * active work items are allowed. This doesn't break work item ordering * because new work items are always queued behind existing inactive * work items if there are any. */ WRITE_ONCE(wq->max_active, new_max); WRITE_ONCE(wq->min_active, new_min); if (wq->flags & WQ_UNBOUND) wq_update_node_max_active(wq, -1); if (new_max == 0) return; /* * Round-robin through pwq's activating the first inactive work item * until max_active is filled. */ do { struct pool_workqueue *pwq; activated = false; for_each_pwq(pwq, wq) { unsigned long irq_flags; /* can be called during early boot w/ irq disabled */ raw_spin_lock_irqsave(&pwq->pool->lock, irq_flags); if (pwq_activate_first_inactive(pwq, true)) { activated = true; kick_pool(pwq->pool); } raw_spin_unlock_irqrestore(&pwq->pool->lock, irq_flags); } } while (activated); } __printf(1, 0) static struct workqueue_struct *__alloc_workqueue(const char *fmt, unsigned int flags, int max_active, va_list args) { struct workqueue_struct *wq; size_t wq_size; int name_len; if (flags & WQ_BH) { if (WARN_ON_ONCE(flags & ~__WQ_BH_ALLOWS)) return NULL; if (WARN_ON_ONCE(max_active)) return NULL; } /* see the comment above the definition of WQ_POWER_EFFICIENT */ if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient) flags |= WQ_UNBOUND; /* allocate wq and format name */ if (flags & WQ_UNBOUND) wq_size = struct_size(wq, node_nr_active, nr_node_ids + 1); else wq_size = sizeof(*wq); wq = kzalloc_noprof(wq_size, GFP_KERNEL); if (!wq) return NULL; if (flags & WQ_UNBOUND) { wq->unbound_attrs = alloc_workqueue_attrs_noprof(); if (!wq->unbound_attrs) goto err_free_wq; } name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args); if (name_len >= WQ_NAME_LEN) pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. 
Truncating to: %s\n", wq->name); if (flags & WQ_BH) { /* * BH workqueues always share a single execution context per CPU * and don't impose any max_active limit. */ max_active = INT_MAX; } else { max_active = max_active ?: WQ_DFL_ACTIVE; max_active = wq_clamp_max_active(max_active, flags, wq->name); } /* init wq */ wq->flags = flags; wq->max_active = max_active; wq->min_active = min(max_active, WQ_DFL_MIN_ACTIVE); wq->saved_max_active = wq->max_active; wq->saved_min_active = wq->min_active; mutex_init(&wq->mutex); atomic_set(&wq->nr_pwqs_to_flush, 0); INIT_LIST_HEAD(&wq->pwqs); INIT_LIST_HEAD(&wq->flusher_queue); INIT_LIST_HEAD(&wq->flusher_overflow); INIT_LIST_HEAD(&wq->maydays); INIT_LIST_HEAD(&wq->list); if (flags & WQ_UNBOUND) { if (alloc_node_nr_active(wq->node_nr_active) < 0) goto err_free_wq; } /* * wq_pool_mutex protects the workqueues list, allocations of PWQs, * and the global freeze state. */ apply_wqattrs_lock(); if (alloc_and_link_pwqs(wq) < 0) goto err_unlock_free_node_nr_active; mutex_lock(&wq->mutex); wq_adjust_max_active(wq); mutex_unlock(&wq->mutex); list_add_tail_rcu(&wq->list, &workqueues); if (wq_online && init_rescuer(wq) < 0) goto err_unlock_destroy; apply_wqattrs_unlock(); if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) goto err_destroy; return wq; err_unlock_free_node_nr_active: apply_wqattrs_unlock(); /* * Failed alloc_and_link_pwqs() may leave pending pwq->release_work, * flushing the pwq_release_worker ensures that the pwq_release_workfn() * completes before calling kfree(wq). */ if (wq->flags & WQ_UNBOUND) { kthread_flush_worker(pwq_release_worker); free_node_nr_active(wq->node_nr_active); } err_free_wq: free_workqueue_attrs(wq->unbound_attrs); kfree(wq); return NULL; err_unlock_destroy: apply_wqattrs_unlock(); err_destroy: destroy_workqueue(wq); return NULL; } __printf(1, 4) struct workqueue_struct *alloc_workqueue_noprof(const char *fmt, unsigned int flags, int max_active, ...) { struct workqueue_struct *wq; va_list args; va_start(args, max_active); wq = __alloc_workqueue(fmt, flags, max_active, args); va_end(args); if (!wq) return NULL; wq_init_lockdep(wq); return wq; } EXPORT_SYMBOL_GPL(alloc_workqueue_noprof); static void devm_workqueue_release(void *res) { destroy_workqueue(res); } __printf(2, 5) struct workqueue_struct * devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags, int max_active, ...) { struct workqueue_struct *wq; va_list args; int ret; va_start(args, max_active); wq = alloc_workqueue(fmt, flags, max_active, args); va_end(args); if (!wq) return NULL; ret = devm_add_action_or_reset(dev, devm_workqueue_release, wq); if (ret) return NULL; return wq; } EXPORT_SYMBOL_GPL(devm_alloc_workqueue); #ifdef CONFIG_LOCKDEP __printf(1, 5) struct workqueue_struct * alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active, struct lockdep_map *lockdep_map, ...) { struct workqueue_struct *wq; va_list args; va_start(args, lockdep_map); wq = __alloc_workqueue(fmt, flags, max_active, args); va_end(args); if (!wq) return NULL; wq->lockdep_map = lockdep_map; return wq; } EXPORT_SYMBOL_GPL(alloc_workqueue_lockdep_map); #endif static bool pwq_busy(struct pool_workqueue *pwq) { int i; for (i = 0; i < WORK_NR_COLORS; i++) if (pwq->nr_in_flight[i]) return true; if ((pwq != rcu_access_pointer(pwq->wq->dfl_pwq)) && (pwq->refcnt > 1)) return true; if (!pwq_is_empty(pwq)) return true; return false; } /** * destroy_workqueue - safely terminate a workqueue * @wq: target workqueue * * Safely destroy a workqueue. 
All work currently pending will be done first. * * This function does NOT guarantee that non-pending work that has been * submitted with queue_delayed_work() and similar functions will be done * before destroying the workqueue. The fundamental problem is that, currently, * the workqueue has no way of accessing non-pending delayed_work. delayed_work * is only linked on the timer-side. All delayed_work must, therefore, be * canceled before calling this function. * * TODO: It would be better if the problem described above wouldn't exist and * destroy_workqueue() would cleanly cancel all pending and non-pending * delayed_work. */ void destroy_workqueue(struct workqueue_struct *wq) { struct pool_workqueue *pwq; int cpu; /* * Remove it from sysfs first so that sanity check failure doesn't * lead to sysfs name conflicts. */ workqueue_sysfs_unregister(wq); /* mark the workqueue destruction is in progress */ mutex_lock(&wq->mutex); wq->flags |= __WQ_DESTROYING; mutex_unlock(&wq->mutex); /* drain it before proceeding with destruction */ drain_workqueue(wq); /* kill rescuer, if sanity checks fail, leave it w/o rescuer */ if (wq->rescuer) { /* rescuer will empty maydays list before exiting */ kthread_stop(wq->rescuer->task); kfree(wq->rescuer); wq->rescuer = NULL; } /* * Sanity checks - grab all the locks so that we wait for all * in-flight operations which may do put_pwq(). */ mutex_lock(&wq_pool_mutex); mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) { raw_spin_lock_irq(&pwq->pool->lock); if (WARN_ON(pwq_busy(pwq))) { pr_warn("%s: %s has the following busy pwq\n", __func__, wq->name); show_pwq(pwq); raw_spin_unlock_irq(&pwq->pool->lock); mutex_unlock(&wq->mutex); mutex_unlock(&wq_pool_mutex); show_one_workqueue(wq); return; } raw_spin_unlock_irq(&pwq->pool->lock); } mutex_unlock(&wq->mutex); /* * wq list is used to freeze wq, remove from list after * flushing is complete in case freeze races us. */ list_del_rcu(&wq->list); mutex_unlock(&wq_pool_mutex); /* * We're the sole accessor of @wq. Directly access cpu_pwq and dfl_pwq * to put the base refs. @wq will be auto-destroyed from the last * pwq_put. RCU read lock prevents @wq from going away from under us. */ rcu_read_lock(); for_each_possible_cpu(cpu) { put_pwq_unlocked(unbound_pwq(wq, cpu)); RCU_INIT_POINTER(*unbound_pwq_slot(wq, cpu), NULL); } put_pwq_unlocked(unbound_pwq(wq, -1)); RCU_INIT_POINTER(*unbound_pwq_slot(wq, -1), NULL); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(destroy_workqueue); /** * workqueue_set_max_active - adjust max_active of a workqueue * @wq: target workqueue * @max_active: new max_active value. * * Set max_active of @wq to @max_active. See the alloc_workqueue() function * comment. * * CONTEXT: * Don't call from IRQ context. */ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) { /* max_active doesn't mean anything for BH workqueues */ if (WARN_ON(wq->flags & WQ_BH)) return; /* disallow meddling with max_active for ordered workqueues */ if (WARN_ON(wq->flags & __WQ_ORDERED)) return; max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); mutex_lock(&wq->mutex); wq->saved_max_active = max_active; if (wq->flags & WQ_UNBOUND) wq->saved_min_active = min(wq->saved_min_active, max_active); wq_adjust_max_active(wq); mutex_unlock(&wq->mutex); } EXPORT_SYMBOL_GPL(workqueue_set_max_active); /** * workqueue_set_min_active - adjust min_active of an unbound workqueue * @wq: target unbound workqueue * @min_active: new min_active value * * Set min_active of an unbound workqueue. 
Unlike other types of workqueues, an * unbound workqueue is not guaranteed to be able to process max_active * interdependent work items. Instead, an unbound workqueue is guaranteed to be * able to process min_active number of interdependent work items which is * %WQ_DFL_MIN_ACTIVE by default. * * Use this function to adjust the min_active value between 0 and the current * max_active. */ void workqueue_set_min_active(struct workqueue_struct *wq, int min_active) { /* min_active is only meaningful for non-ordered unbound workqueues */ if (WARN_ON((wq->flags & (WQ_BH | WQ_UNBOUND | __WQ_ORDERED)) != WQ_UNBOUND)) return; mutex_lock(&wq->mutex); wq->saved_min_active = clamp(min_active, 0, wq->saved_max_active); wq_adjust_max_active(wq); mutex_unlock(&wq->mutex); } /** * current_work - retrieve %current task's work struct * * Determine if %current task is a workqueue worker and what it's working on. * Useful to find out the context that the %current task is running in. * * Return: work struct if %current task is a workqueue worker, %NULL otherwise. */ struct work_struct *current_work(void) { struct worker *worker = current_wq_worker(); return worker ? worker->current_work : NULL; } EXPORT_SYMBOL(current_work); /** * current_is_workqueue_rescuer - is %current workqueue rescuer? * * Determine whether %current is a workqueue rescuer. Can be used from * work functions to determine whether it's being run off the rescuer task. * * Return: %true if %current is a workqueue rescuer. %false otherwise. */ bool current_is_workqueue_rescuer(void) { struct worker *worker = current_wq_worker(); return worker && worker->rescue_wq; } /** * workqueue_congested - test whether a workqueue is congested * @cpu: CPU in question * @wq: target workqueue * * Test whether @wq's cpu workqueue for @cpu is congested. There is * no synchronization around this function and the test result is * unreliable and only useful as advisory hints or for debugging. * * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU. * * With the exception of ordered workqueues, all workqueues have per-cpu * pool_workqueues, each with its own congested state. A workqueue being * congested on one CPU doesn't mean that the workqueue is congested on any * other CPUs. * * Return: * %true if congested, %false otherwise. */ bool workqueue_congested(int cpu, struct workqueue_struct *wq) { struct pool_workqueue *pwq; bool ret; preempt_disable(); if (cpu == WORK_CPU_UNBOUND) cpu = smp_processor_id(); pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); ret = !list_empty(&pwq->inactive_works); preempt_enable(); return ret; } EXPORT_SYMBOL_GPL(workqueue_congested); /** * work_busy - test whether a work is currently pending or running * @work: the work to be tested * * Test whether @work is currently pending or running. There is no * synchronization around this function and the test result is * unreliable and only useful as advisory hints or for debugging. * * Return: * OR'd bitmask of WORK_BUSY_* bits. 
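 *
 * Illustrative sketch of a caller (hypothetical names, not part of this
 * file); since the result is only advisory, treat it as a hint or a
 * debugging aid:
 *
 *	if (work_busy(&my_work) & WORK_BUSY_RUNNING)
 *		pr_info("my_work appears to be running\n");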
*/ unsigned int work_busy(struct work_struct *work) { struct worker_pool *pool; unsigned long irq_flags; unsigned int ret = 0; if (work_pending(work)) ret |= WORK_BUSY_PENDING; rcu_read_lock(); pool = get_work_pool(work); if (pool) { raw_spin_lock_irqsave(&pool->lock, irq_flags); if (find_worker_executing_work(pool, work)) ret |= WORK_BUSY_RUNNING; raw_spin_unlock_irqrestore(&pool->lock, irq_flags); } rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(work_busy); /** * set_worker_desc - set description for the current work item * @fmt: printf-style format string * @...: arguments for the format string * * This function can be called by a running work function to describe what * the work item is about. If the worker task gets dumped, this * information will be printed out together to help debugging. The * description can be at most WORKER_DESC_LEN including the trailing '\0'. */ void set_worker_desc(const char *fmt, ...) { struct worker *worker = current_wq_worker(); va_list args; if (worker) { va_start(args, fmt); vsnprintf(worker->desc, sizeof(worker->desc), fmt, args); va_end(args); } } EXPORT_SYMBOL_GPL(set_worker_desc); /** * print_worker_info - print out worker information and description * @log_lvl: the log level to use when printing * @task: target task * * If @task is a worker and currently executing a work item, print out the * name of the workqueue being serviced and worker description set with * set_worker_desc() by the currently executing work item. * * This function can be safely called on any task as long as the * task_struct itself is accessible. While safe, this function isn't * synchronized and may print out mixups or garbages of limited length. */ void print_worker_info(const char *log_lvl, struct task_struct *task) { work_func_t *fn = NULL; char name[WQ_NAME_LEN] = { }; char desc[WORKER_DESC_LEN] = { }; struct pool_workqueue *pwq = NULL; struct workqueue_struct *wq = NULL; struct worker *worker; if (!(task->flags & PF_WQ_WORKER)) return; /* * This function is called without any synchronization and @task * could be in any state. Be careful with dereferences. */ worker = kthread_probe_data(task); /* * Carefully copy the associated workqueue's workfn, name and desc. * Keep the original last '\0' in case the original is garbage. */ copy_from_kernel_nofault(&fn, &worker->current_func, sizeof(fn)); copy_from_kernel_nofault(&pwq, &worker->current_pwq, sizeof(pwq)); copy_from_kernel_nofault(&wq, &pwq->wq, sizeof(wq)); copy_from_kernel_nofault(name, wq->name, sizeof(name) - 1); copy_from_kernel_nofault(desc, worker->desc, sizeof(desc) - 1); if (fn || name[0] || desc[0]) { printk("%sWorkqueue: %s %ps", log_lvl, name, fn); if (strcmp(name, desc)) pr_cont(" (%s)", desc); pr_cont("\n"); } } static void pr_cont_pool_info(struct worker_pool *pool) { pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask); if (pool->node != NUMA_NO_NODE) pr_cont(" node=%d", pool->node); pr_cont(" flags=0x%x", pool->flags); if (pool->flags & POOL_BH) pr_cont(" bh%s", pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : ""); else pr_cont(" nice=%d", pool->attrs->nice); } static void pr_cont_worker_id(struct worker *worker) { struct worker_pool *pool = worker->pool; if (pool->flags & POOL_BH) pr_cont("bh%s", pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : ""); else pr_cont("%d%s", task_pid_nr(worker->task), worker->rescue_wq ? 
"(RESCUER)" : ""); } struct pr_cont_work_struct { bool comma; work_func_t func; long ctr; }; static void pr_cont_work_flush(bool comma, work_func_t func, struct pr_cont_work_struct *pcwsp) { if (!pcwsp->ctr) goto out_record; if (func == pcwsp->func) { pcwsp->ctr++; return; } if (pcwsp->ctr == 1) pr_cont("%s %ps", pcwsp->comma ? "," : "", pcwsp->func); else pr_cont("%s %ld*%ps", pcwsp->comma ? "," : "", pcwsp->ctr, pcwsp->func); pcwsp->ctr = 0; out_record: if ((long)func == -1L) return; pcwsp->comma = comma; pcwsp->func = func; pcwsp->ctr = 1; } static void pr_cont_work(bool comma, struct work_struct *work, struct pr_cont_work_struct *pcwsp) { if (work->func == wq_barrier_func) { struct wq_barrier *barr; barr = container_of(work, struct wq_barrier, work); pr_cont_work_flush(comma, (work_func_t)-1, pcwsp); pr_cont("%s BAR(%d)", comma ? "," : "", task_pid_nr(barr->task)); } else { if (!comma) pr_cont_work_flush(comma, (work_func_t)-1, pcwsp); pr_cont_work_flush(comma, work->func, pcwsp); } } static void show_pwq(struct pool_workqueue *pwq) { struct pr_cont_work_struct pcws = { .ctr = 0, }; struct worker_pool *pool = pwq->pool; struct work_struct *work; struct worker *worker; bool has_in_flight = false, has_pending = false; int bkt; pr_info(" pwq %d:", pool->id); pr_cont_pool_info(pool); pr_cont(" active=%d refcnt=%d%s\n", pwq->nr_active, pwq->refcnt, !list_empty(&pwq->mayday_node) ? " MAYDAY" : ""); hash_for_each(pool->busy_hash, bkt, worker, hentry) { if (worker->current_pwq == pwq) { has_in_flight = true; break; } } if (has_in_flight) { bool comma = false; pr_info(" in-flight:"); hash_for_each(pool->busy_hash, bkt, worker, hentry) { if (worker->current_pwq != pwq) continue; pr_cont(" %s", comma ? "," : ""); pr_cont_worker_id(worker); pr_cont(":%ps", worker->current_func); pr_cont(" for %us", jiffies_to_msecs(jiffies - worker->current_start) / 1000); list_for_each_entry(work, &worker->scheduled, entry) pr_cont_work(false, work, &pcws); pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); comma = true; } pr_cont("\n"); } list_for_each_entry(work, &pool->worklist, entry) { if (get_work_pwq(work) == pwq) { has_pending = true; break; } } if (has_pending) { bool comma = false; pr_info(" pending:"); list_for_each_entry(work, &pool->worklist, entry) { if (get_work_pwq(work) != pwq) continue; pr_cont_work(comma, work, &pcws); comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); } pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); pr_cont("\n"); } if (!list_empty(&pwq->inactive_works)) { bool comma = false; pr_info(" inactive:"); list_for_each_entry(work, &pwq->inactive_works, entry) { pr_cont_work(comma, work, &pcws); comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); } pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); pr_cont("\n"); } } /** * show_one_workqueue - dump state of specified workqueue * @wq: workqueue whose state will be printed */ void show_one_workqueue(struct workqueue_struct *wq) { struct pool_workqueue *pwq; bool idle = true; unsigned long irq_flags; for_each_pwq(pwq, wq) { if (!pwq_is_empty(pwq)) { idle = false; break; } } if (idle) /* Nothing to print for idle workqueue */ return; pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags); for_each_pwq(pwq, wq) { raw_spin_lock_irqsave(&pwq->pool->lock, irq_flags); if (!pwq_is_empty(pwq)) { /* * Defer printing to avoid deadlocks in console * drivers that queue work while holding locks * also taken in their write paths. 
*/ printk_deferred_enter(); show_pwq(pwq); printk_deferred_exit(); } raw_spin_unlock_irqrestore(&pwq->pool->lock, irq_flags); /* * We could be printing a lot from atomic context, e.g. * sysrq-t -> show_all_workqueues(). Avoid triggering * hard lockup. */ touch_nmi_watchdog(); } } /** * show_one_worker_pool - dump state of specified worker pool * @pool: worker pool whose state will be printed */ static void show_one_worker_pool(struct worker_pool *pool) { struct worker *worker; bool first = true; unsigned long irq_flags; unsigned long hung = 0; raw_spin_lock_irqsave(&pool->lock, irq_flags); if (pool->nr_workers == pool->nr_idle) goto next_pool; /* How long the first pending work is waiting for a worker. */ if (!list_empty(&pool->worklist)) hung = jiffies_to_msecs(jiffies - pool->last_progress_ts) / 1000; /* * Defer printing to avoid deadlocks in console drivers that * queue work while holding locks also taken in their write * paths. */ printk_deferred_enter(); pr_info("pool %d:", pool->id); pr_cont_pool_info(pool); pr_cont(" hung=%lus workers=%d", hung, pool->nr_workers); if (pool->manager) pr_cont(" manager: %d", task_pid_nr(pool->manager->task)); list_for_each_entry(worker, &pool->idle_list, entry) { pr_cont(" %s", first ? "idle: " : ""); pr_cont_worker_id(worker); first = false; } pr_cont("\n"); printk_deferred_exit(); next_pool: raw_spin_unlock_irqrestore(&pool->lock, irq_flags); /* * We could be printing a lot from atomic context, e.g. * sysrq-t -> show_all_workqueues(). Avoid triggering * hard lockup. */ touch_nmi_watchdog(); } /** * show_all_workqueues - dump workqueue state * * Called from a sysrq handler and prints out all busy workqueues and pools. */ void show_all_workqueues(void) { struct workqueue_struct *wq; struct worker_pool *pool; int pi; rcu_read_lock(); pr_info("Showing busy workqueues and worker pools:\n"); list_for_each_entry_rcu(wq, &workqueues, list) show_one_workqueue(wq); for_each_pool(pool, pi) show_one_worker_pool(pool); rcu_read_unlock(); } /** * show_freezable_workqueues - dump freezable workqueue state * * Called from try_to_freeze_tasks() and prints out all freezable workqueues * still busy. */ void show_freezable_workqueues(void) { struct workqueue_struct *wq; rcu_read_lock(); pr_info("Showing freezable workqueues that are still busy:\n"); list_for_each_entry_rcu(wq, &workqueues, list) { if (!(wq->flags & WQ_FREEZABLE)) continue; show_one_workqueue(wq); } rcu_read_unlock(); } /* used to show worker information through /proc/PID/{comm,stat,status} */ void wq_worker_comm(char *buf, size_t size, struct task_struct *task) { /* stabilize PF_WQ_WORKER and worker pool association */ mutex_lock(&wq_pool_attach_mutex); if (task->flags & PF_WQ_WORKER) { struct worker *worker = kthread_data(task); struct worker_pool *pool = worker->pool; int off; off = format_worker_id(buf, size, worker, pool); if (pool) { raw_spin_lock_irq(&pool->lock); /* * ->desc tracks information (wq name or * set_worker_desc()) for the latest execution. If * current, prepend '+', otherwise '-'. */ if (worker->desc[0] != '\0') { if (worker->current_work) scnprintf(buf + off, size - off, "+%s", worker->desc); else scnprintf(buf + off, size - off, "-%s", worker->desc); } raw_spin_unlock_irq(&pool->lock); } } else { strscpy(buf, task->comm, size); } mutex_unlock(&wq_pool_attach_mutex); } #ifdef CONFIG_SMP /* * CPU hotplug. * * There are two challenges in supporting CPU hotplug. 
Firstly, there * are a lot of assumptions on strong associations among work, pwq and * pool which make migrating pending and scheduled works very * difficult to implement without impacting hot paths. Secondly, * worker pools serve mix of short, long and very long running works making * blocked draining impractical. * * This is solved by allowing the pools to be disassociated from the CPU * running as an unbound one and allowing it to be reattached later if the * cpu comes back online. */ static void unbind_workers(int cpu) { struct worker_pool *pool; struct worker *worker; for_each_cpu_worker_pool(pool, cpu) { mutex_lock(&wq_pool_attach_mutex); raw_spin_lock_irq(&pool->lock); /* * We've blocked all attach/detach operations. Make all workers * unbound and set DISASSOCIATED. Before this, all workers * must be on the cpu. After this, they may become diasporas. * And the preemption disabled section in their sched callbacks * are guaranteed to see WORKER_UNBOUND since the code here * is on the same cpu. */ for_each_pool_worker(worker, pool) worker->flags |= WORKER_UNBOUND; pool->flags |= POOL_DISASSOCIATED; /* * The handling of nr_running in sched callbacks are disabled * now. Zap nr_running. After this, nr_running stays zero and * need_more_worker() and keep_working() are always true as * long as the worklist is not empty. This pool now behaves as * an unbound (in terms of concurrency management) pool which * are served by workers tied to the pool. */ pool->nr_running = 0; /* * With concurrency management just turned off, a busy * worker blocking could lead to lengthy stalls. Kick off * unbound chain execution of currently pending work items. */ kick_pool(pool); raw_spin_unlock_irq(&pool->lock); for_each_pool_worker(worker, pool) unbind_worker(worker); mutex_unlock(&wq_pool_attach_mutex); } } /** * rebind_workers - rebind all workers of a pool to the associated CPU * @pool: pool of interest * * @pool->cpu is coming online. Rebind all workers to the CPU. */ static void rebind_workers(struct worker_pool *pool) { struct worker *worker; lockdep_assert_held(&wq_pool_attach_mutex); /* * Restore CPU affinity of all workers. As all idle workers should * be on the run-queue of the associated CPU before any local * wake-ups for concurrency management happen, restore CPU affinity * of all workers first and then clear UNBOUND. As we're called * from CPU_ONLINE, the following shouldn't fail. */ for_each_pool_worker(worker, pool) { kthread_set_per_cpu(worker->task, pool->cpu); WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool)) < 0); } raw_spin_lock_irq(&pool->lock); pool->flags &= ~POOL_DISASSOCIATED; for_each_pool_worker(worker, pool) { unsigned int worker_flags = worker->flags; /* * We want to clear UNBOUND but can't directly call * worker_clr_flags() or adjust nr_running. Atomically * replace UNBOUND with another NOT_RUNNING flag REBOUND. * @worker will clear REBOUND using worker_clr_flags() when * it initiates the next execution cycle thus restoring * concurrency management. Note that when or whether * @worker clears REBOUND doesn't affect correctness. * * WRITE_ONCE() is necessary because @worker->flags may be * tested without holding any lock in * wq_worker_running(). Without it, NOT_RUNNING test may * fail incorrectly leading to premature concurrency * management operations. 
*/ WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND)); worker_flags |= WORKER_REBOUND; worker_flags &= ~WORKER_UNBOUND; WRITE_ONCE(worker->flags, worker_flags); } raw_spin_unlock_irq(&pool->lock); } /** * restore_unbound_workers_cpumask - restore cpumask of unbound workers * @pool: unbound pool of interest * @cpu: the CPU which is coming up * * An unbound pool may end up with a cpumask which doesn't have any online * CPUs. When a worker of such pool get scheduled, the scheduler resets * its cpus_allowed. If @cpu is in @pool's cpumask which didn't have any * online CPU before, cpus_allowed of all its workers should be restored. */ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) { static cpumask_t cpumask; struct worker *worker; lockdep_assert_held(&wq_pool_attach_mutex); /* is @cpu allowed for @pool? */ if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) return; cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask); /* as we're called from CPU_ONLINE, the following shouldn't fail */ for_each_pool_worker(worker, pool) WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0); } int workqueue_prepare_cpu(unsigned int cpu) { struct worker_pool *pool; for_each_cpu_worker_pool(pool, cpu) { if (pool->nr_workers) continue; if (!create_worker(pool)) return -ENOMEM; } return 0; } int workqueue_online_cpu(unsigned int cpu) { struct worker_pool *pool; struct workqueue_struct *wq; int pi; mutex_lock(&wq_pool_mutex); cpumask_set_cpu(cpu, wq_online_cpumask); for_each_pool(pool, pi) { /* BH pools aren't affected by hotplug */ if (pool->flags & POOL_BH) continue; mutex_lock(&wq_pool_attach_mutex); if (pool->cpu == cpu) rebind_workers(pool); else if (pool->cpu < 0) restore_unbound_workers_cpumask(pool, cpu); mutex_unlock(&wq_pool_attach_mutex); } /* update pod affinity of unbound workqueues */ list_for_each_entry(wq, &workqueues, list) { struct workqueue_attrs *attrs = wq->unbound_attrs; if (attrs) { const struct wq_pod_type *pt = wqattrs_pod_type(attrs); int tcpu; for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) unbound_wq_update_pwq(wq, tcpu); mutex_lock(&wq->mutex); wq_update_node_max_active(wq, -1); mutex_unlock(&wq->mutex); } } mutex_unlock(&wq_pool_mutex); return 0; } int workqueue_offline_cpu(unsigned int cpu) { struct workqueue_struct *wq; /* unbinding per-cpu workers should happen on the local CPU */ if (WARN_ON(cpu != smp_processor_id())) return -1; unbind_workers(cpu); /* update pod affinity of unbound workqueues */ mutex_lock(&wq_pool_mutex); cpumask_clear_cpu(cpu, wq_online_cpumask); list_for_each_entry(wq, &workqueues, list) { struct workqueue_attrs *attrs = wq->unbound_attrs; if (attrs) { const struct wq_pod_type *pt = wqattrs_pod_type(attrs); int tcpu; for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) unbound_wq_update_pwq(wq, tcpu); mutex_lock(&wq->mutex); wq_update_node_max_active(wq, cpu); mutex_unlock(&wq->mutex); } } mutex_unlock(&wq_pool_mutex); return 0; } struct work_for_cpu { struct work_struct work; long (*fn)(void *); void *arg; long ret; }; static void work_for_cpu_fn(struct work_struct *work) { struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work); wfc->ret = wfc->fn(wfc->arg); } /** * work_on_cpu_key - run a function in thread context on a particular cpu * @cpu: the cpu to run on * @fn: the function to run * @arg: the function arg * @key: The lock class key for lock debugging purposes * * It is up to the caller to ensure that the cpu doesn't go offline. 
* The caller must not hold any locks which would prevent @fn from completing. * * Return: The value @fn returns. */ long work_on_cpu_key(int cpu, long (*fn)(void *), void *arg, struct lock_class_key *key) { struct work_for_cpu wfc = { .fn = fn, .arg = arg }; INIT_WORK_ONSTACK_KEY(&wfc.work, work_for_cpu_fn, key); schedule_work_on(cpu, &wfc.work); flush_work(&wfc.work); destroy_work_on_stack(&wfc.work); return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu_key); #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER /** * freeze_workqueues_begin - begin freezing workqueues * * Start freezing workqueues. After this function returns, all freezable * workqueues will queue new works to their inactive_works list instead of * pool->worklist. * * CONTEXT: * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. */ void freeze_workqueues_begin(void) { struct workqueue_struct *wq; mutex_lock(&wq_pool_mutex); WARN_ON_ONCE(workqueue_freezing); workqueue_freezing = true; list_for_each_entry(wq, &workqueues, list) { mutex_lock(&wq->mutex); wq_adjust_max_active(wq); mutex_unlock(&wq->mutex); } mutex_unlock(&wq_pool_mutex); } /** * freeze_workqueues_busy - are freezable workqueues still busy? * * Check whether freezing is complete. This function must be called * between freeze_workqueues_begin() and thaw_workqueues(). * * CONTEXT: * Grabs and releases wq_pool_mutex. * * Return: * %true if some freezable workqueues are still busy. %false if freezing * is complete. */ bool freeze_workqueues_busy(void) { bool busy = false; struct workqueue_struct *wq; struct pool_workqueue *pwq; mutex_lock(&wq_pool_mutex); WARN_ON_ONCE(!workqueue_freezing); list_for_each_entry(wq, &workqueues, list) { if (!(wq->flags & WQ_FREEZABLE)) continue; /* * nr_active is monotonically decreasing. It's safe * to peek without lock. */ rcu_read_lock(); for_each_pwq(pwq, wq) { WARN_ON_ONCE(pwq->nr_active < 0); if (pwq->nr_active) { busy = true; rcu_read_unlock(); goto out_unlock; } } rcu_read_unlock(); } out_unlock: mutex_unlock(&wq_pool_mutex); return busy; } /** * thaw_workqueues - thaw workqueues * * Thaw workqueues. Normal queueing is restored and all collected * frozen works are transferred to their respective pool worklists. * * CONTEXT: * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. 
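 *
 * Rough sketch of how a freezer-side caller is expected to pair these
 * calls (simplified; retries, timeouts and error handling omitted):
 *
 *	freeze_workqueues_begin();
 *	while (freeze_workqueues_busy())
 *		msleep(10);
 *	... system is frozen, do suspend/resume work ...
 *	thaw_workqueues();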
*/ void thaw_workqueues(void) { struct workqueue_struct *wq; mutex_lock(&wq_pool_mutex); if (!workqueue_freezing) goto out_unlock; workqueue_freezing = false; /* restore max_active and repopulate worklist */ list_for_each_entry(wq, &workqueues, list) { mutex_lock(&wq->mutex); wq_adjust_max_active(wq); mutex_unlock(&wq->mutex); } out_unlock: mutex_unlock(&wq_pool_mutex); } #endif /* CONFIG_FREEZER */ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) { LIST_HEAD(ctxs); int ret = 0; struct workqueue_struct *wq; struct apply_wqattrs_ctx *ctx, *n; lockdep_assert_held(&wq_pool_mutex); list_for_each_entry(wq, &workqueues, list) { if (!(wq->flags & WQ_UNBOUND) || (wq->flags & __WQ_DESTROYING)) continue; ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask); if (IS_ERR(ctx)) { ret = PTR_ERR(ctx); break; } list_add_tail(&ctx->list, &ctxs); } list_for_each_entry_safe(ctx, n, &ctxs, list) { if (!ret) apply_wqattrs_commit(ctx); apply_wqattrs_cleanup(ctx); } if (!ret) { int cpu; struct worker_pool *pool; struct worker *worker; mutex_lock(&wq_pool_attach_mutex); cpumask_copy(wq_unbound_cpumask, unbound_cpumask); /* rescuer needs to respect cpumask changes when it is not attached */ list_for_each_entry(wq, &workqueues, list) { if (wq->rescuer && !wq->rescuer->pool) unbind_worker(wq->rescuer); } /* DISASSOCIATED worker needs to respect wq_unbound_cpumask */ for_each_possible_cpu(cpu) { for_each_cpu_worker_pool(pool, cpu) { if (!(pool->flags & POOL_DISASSOCIATED)) continue; for_each_pool_worker(worker, pool) unbind_worker(worker); } } mutex_unlock(&wq_pool_attach_mutex); } return ret; } /** * workqueue_unbound_housekeeping_update - Propagate housekeeping cpumask update * @hk: the new housekeeping cpumask * * Update the unbound workqueue cpumask on top of the new housekeeping cpumask such * that the effective unbound affinity is the intersection of the new housekeeping * with the requested affinity set via nohz_full=/isolcpus= or sysfs. * * Return: 0 on success and -errno on failure. */ int workqueue_unbound_housekeeping_update(const struct cpumask *hk) { cpumask_var_t cpumask; int ret = 0; if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) return -ENOMEM; mutex_lock(&wq_pool_mutex); /* * If the operation fails, it will fall back to * wq_requested_unbound_cpumask which is initially set to * HK_TYPE_DOMAIN house keeping mask and rewritten * by any subsequent write to workqueue/cpumask sysfs file. 
*/ if (!cpumask_and(cpumask, wq_requested_unbound_cpumask, hk)) cpumask_copy(cpumask, wq_requested_unbound_cpumask); if (!cpumask_equal(cpumask, wq_unbound_cpumask)) ret = workqueue_apply_unbound_cpumask(cpumask); /* Save the current isolated cpumask & export it via sysfs */ if (!ret) cpumask_andnot(wq_isolated_cpumask, cpu_possible_mask, hk); mutex_unlock(&wq_pool_mutex); free_cpumask_var(cpumask); return ret; } static int parse_affn_scope(const char *val) { return sysfs_match_string(wq_affn_names, val); } static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp) { struct workqueue_struct *wq; int affn, cpu; affn = parse_affn_scope(val); if (affn < 0) return affn; if (affn == WQ_AFFN_DFL) return -EINVAL; cpus_read_lock(); mutex_lock(&wq_pool_mutex); wq_affn_dfl = affn; list_for_each_entry(wq, &workqueues, list) { for_each_online_cpu(cpu) unbound_wq_update_pwq(wq, cpu); } mutex_unlock(&wq_pool_mutex); cpus_read_unlock(); return 0; } static int wq_affn_dfl_get(char *buffer, const struct kernel_param *kp) { return scnprintf(buffer, PAGE_SIZE, "%s\n", wq_affn_names[wq_affn_dfl]); } static const struct kernel_param_ops wq_affn_dfl_ops = { .set = wq_affn_dfl_set, .get = wq_affn_dfl_get, }; module_param_cb(default_affinity_scope, &wq_affn_dfl_ops, NULL, 0644); #ifdef CONFIG_SYSFS /* * Workqueues with WQ_SYSFS flag set is visible to userland via * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the * following attributes. * * per_cpu RO bool : whether the workqueue is per-cpu or unbound * max_active RW int : maximum number of in-flight work items * * Unbound workqueues have the following extra attributes. * * nice RW int : nice value of the workers * cpumask RW mask : bitmask of allowed CPUs for the workers * affinity_scope RW str : worker CPU affinity scope (cache, numa, none) * affinity_strict RW bool : worker CPU affinity is strict */ struct wq_device { struct workqueue_struct *wq; struct device dev; }; static struct workqueue_struct *dev_to_wq(struct device *dev) { struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); return wq_dev->wq; } static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); } static DEVICE_ATTR_RO(per_cpu); static ssize_t max_active_show(struct device *dev, struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); } static ssize_t max_active_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct workqueue_struct *wq = dev_to_wq(dev); int val; if (sscanf(buf, "%d", &val) != 1 || val <= 0) return -EINVAL; workqueue_set_max_active(wq, val); return count; } static DEVICE_ATTR_RW(max_active); static struct attribute *wq_sysfs_attrs[] = { &dev_attr_per_cpu.attr, &dev_attr_max_active.attr, NULL, }; static umode_t wq_sysfs_is_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = kobj_to_dev(kobj); struct workqueue_struct *wq = dev_to_wq(dev); /* * Adjusting max_active breaks ordering guarantee. Changing it has no * effect on BH worker. Limit max_active to RO in such case. 
*/ if (wq->flags & (WQ_BH | __WQ_ORDERED)) return 0444; return a->mode; } static const struct attribute_group wq_sysfs_group = { .is_visible = wq_sysfs_is_visible, .attrs = wq_sysfs_attrs, }; __ATTRIBUTE_GROUPS(wq_sysfs); static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); int written; mutex_lock(&wq->mutex); written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); mutex_unlock(&wq->mutex); return written; } /* prepare workqueue_attrs for sysfs store operations */ static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) { struct workqueue_attrs *attrs; lockdep_assert_held(&wq_pool_mutex); attrs = alloc_workqueue_attrs(); if (!attrs) return NULL; copy_workqueue_attrs(attrs, wq->unbound_attrs); return attrs; } static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct workqueue_struct *wq = dev_to_wq(dev); struct workqueue_attrs *attrs; int ret = -ENOMEM; apply_wqattrs_lock(); attrs = wq_sysfs_prep_attrs(wq); if (!attrs) goto out_unlock; if (sscanf(buf, "%d", &attrs->nice) == 1 && attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) ret = apply_workqueue_attrs_locked(wq, attrs); else ret = -EINVAL; out_unlock: apply_wqattrs_unlock(); free_workqueue_attrs(attrs); return ret ?: count; } static ssize_t wq_cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); int written; mutex_lock(&wq->mutex); written = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(wq->unbound_attrs->cpumask)); mutex_unlock(&wq->mutex); return written; } static ssize_t wq_cpumask_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct workqueue_struct *wq = dev_to_wq(dev); struct workqueue_attrs *attrs; int ret = -ENOMEM; apply_wqattrs_lock(); attrs = wq_sysfs_prep_attrs(wq); if (!attrs) goto out_unlock; ret = cpumask_parse(buf, attrs->cpumask); if (!ret) ret = apply_workqueue_attrs_locked(wq, attrs); out_unlock: apply_wqattrs_unlock(); free_workqueue_attrs(attrs); return ret ?: count; } static ssize_t wq_affn_scope_show(struct device *dev, struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); int written; mutex_lock(&wq->mutex); if (wq->unbound_attrs->affn_scope == WQ_AFFN_DFL) written = scnprintf(buf, PAGE_SIZE, "%s (%s)\n", wq_affn_names[WQ_AFFN_DFL], wq_affn_names[wq_affn_dfl]); else written = scnprintf(buf, PAGE_SIZE, "%s\n", wq_affn_names[wq->unbound_attrs->affn_scope]); mutex_unlock(&wq->mutex); return written; } static ssize_t wq_affn_scope_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct workqueue_struct *wq = dev_to_wq(dev); struct workqueue_attrs *attrs; int affn, ret = -ENOMEM; affn = parse_affn_scope(buf); if (affn < 0) return affn; apply_wqattrs_lock(); attrs = wq_sysfs_prep_attrs(wq); if (attrs) { attrs->affn_scope = affn; ret = apply_workqueue_attrs_locked(wq, attrs); } apply_wqattrs_unlock(); free_workqueue_attrs(attrs); return ret ?: count; } static ssize_t wq_affinity_strict_show(struct device *dev, struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); return scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->affn_strict); } static ssize_t wq_affinity_strict_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct workqueue_struct *wq = dev_to_wq(dev); 
struct workqueue_attrs *attrs; int v, ret = -ENOMEM; if (sscanf(buf, "%d", &v) != 1) return -EINVAL; apply_wqattrs_lock(); attrs = wq_sysfs_prep_attrs(wq); if (attrs) { attrs->affn_strict = (bool)v; ret = apply_workqueue_attrs_locked(wq, attrs); } apply_wqattrs_unlock(); free_workqueue_attrs(attrs); return ret ?: count; } static struct device_attribute wq_sysfs_unbound_attrs[] = { __ATTR(nice, 0644, wq_nice_show, wq_nice_store), __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), __ATTR(affinity_scope, 0644, wq_affn_scope_show, wq_affn_scope_store), __ATTR(affinity_strict, 0644, wq_affinity_strict_show, wq_affinity_strict_store), __ATTR_NULL, }; static const struct bus_type wq_subsys = { .name = "workqueue", .dev_groups = wq_sysfs_groups, }; /** * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask * @cpumask: the cpumask to set * * The low-level workqueues cpumask is a global cpumask that limits * the affinity of all unbound workqueues. This function check the @cpumask * and apply it to all unbound workqueues and updates all pwqs of them. * * Return: 0 - Success * -EINVAL - Invalid @cpumask * -ENOMEM - Failed to allocate memory for attrs or pwqs. */ static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) { int ret = -EINVAL; /* * Not excluding isolated cpus on purpose. * If the user wishes to include them, we allow that. */ cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { ret = 0; apply_wqattrs_lock(); if (!cpumask_equal(cpumask, wq_unbound_cpumask)) ret = workqueue_apply_unbound_cpumask(cpumask); if (!ret) cpumask_copy(wq_requested_unbound_cpumask, cpumask); apply_wqattrs_unlock(); } return ret; } static ssize_t __wq_cpumask_show(struct device *dev, struct device_attribute *attr, char *buf, cpumask_var_t mask) { int written; mutex_lock(&wq_pool_mutex); written = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask)); mutex_unlock(&wq_pool_mutex); return written; } static ssize_t cpumask_requested_show(struct device *dev, struct device_attribute *attr, char *buf) { return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask); } static DEVICE_ATTR_RO(cpumask_requested); static ssize_t cpumask_isolated_show(struct device *dev, struct device_attribute *attr, char *buf) { return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask); } static DEVICE_ATTR_RO(cpumask_isolated); static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) { return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask); } static ssize_t cpumask_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { cpumask_var_t cpumask; int ret; if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) return -ENOMEM; ret = cpumask_parse(buf, cpumask); if (!ret) ret = workqueue_set_unbound_cpumask(cpumask); free_cpumask_var(cpumask); return ret ? 
ret : count; } static DEVICE_ATTR_RW(cpumask); static struct attribute *wq_sysfs_cpumask_attrs[] = { &dev_attr_cpumask.attr, &dev_attr_cpumask_requested.attr, &dev_attr_cpumask_isolated.attr, NULL, }; ATTRIBUTE_GROUPS(wq_sysfs_cpumask); static int __init wq_sysfs_init(void) { return subsys_virtual_register(&wq_subsys, wq_sysfs_cpumask_groups); } core_initcall(wq_sysfs_init); static void wq_device_release(struct device *dev) { struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); kfree(wq_dev); } /** * workqueue_sysfs_register - make a workqueue visible in sysfs * @wq: the workqueue to register * * Expose @wq in sysfs under /sys/bus/workqueue/devices. * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set * which is the preferred method. * * Workqueue user should use this function directly iff it wants to apply * workqueue_attrs before making the workqueue visible in sysfs; otherwise, * apply_workqueue_attrs() may race against userland updating the * attributes. * * Return: 0 on success, -errno on failure. */ int workqueue_sysfs_register(struct workqueue_struct *wq) { struct wq_device *wq_dev; int ret; wq->wq_dev = wq_dev = kzalloc_obj(*wq_dev); if (!wq_dev) return -ENOMEM; wq_dev->wq = wq; wq_dev->dev.bus = &wq_subsys; wq_dev->dev.release = wq_device_release; dev_set_name(&wq_dev->dev, "%s", wq->name); /* * unbound_attrs are created separately. Suppress uevent until * everything is ready. */ dev_set_uevent_suppress(&wq_dev->dev, true); ret = device_register(&wq_dev->dev); if (ret) { put_device(&wq_dev->dev); wq->wq_dev = NULL; return ret; } if (wq->flags & WQ_UNBOUND) { struct device_attribute *attr; for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { ret = device_create_file(&wq_dev->dev, attr); if (ret) { device_unregister(&wq_dev->dev); wq->wq_dev = NULL; return ret; } } } dev_set_uevent_suppress(&wq_dev->dev, false); kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); return 0; } /** * workqueue_sysfs_unregister - undo workqueue_sysfs_register() * @wq: the workqueue to unregister * * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. */ static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { struct wq_device *wq_dev = wq->wq_dev; if (!wq->wq_dev) return; wq->wq_dev = NULL; device_unregister(&wq_dev->dev); } #else /* CONFIG_SYSFS */ static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } #endif /* CONFIG_SYSFS */ /* * Workqueue watchdog. * * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal * flush dependency, a concurrency managed work item which stays RUNNING * indefinitely. Workqueue stalls can be very difficult to debug as the * usual warning mechanisms don't trigger and internal workqueue state is * largely opaque. * * Workqueue watchdog monitors all worker pools periodically and dumps * state if some pools failed to make forward progress for a while where * forward progress is defined as the first item on ->worklist changing. * * This mechanism is controlled through the kernel parameter * "workqueue.watchdog_thresh" which can be updated at runtime through the * corresponding sysfs parameter file. 
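 *
 * For example (assuming the usual built-in module parameter layout under
 * sysfs), the threshold can be raised to 60 seconds at runtime with:
 *
 *	echo 60 > /sys/module/workqueue/parameters/watchdog_thresh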
*/ #ifdef CONFIG_WQ_WATCHDOG static unsigned long wq_watchdog_thresh = 30; static struct timer_list wq_watchdog_timer; static unsigned long wq_watchdog_touched = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; static unsigned int wq_panic_on_stall = CONFIG_BOOTPARAM_WQ_STALL_PANIC; module_param_named(panic_on_stall, wq_panic_on_stall, uint, 0644); static unsigned int wq_panic_on_stall_time; module_param_named(panic_on_stall_time, wq_panic_on_stall_time, uint, 0644); MODULE_PARM_DESC(panic_on_stall_time, "Panic if stall exceeds this many seconds (0=disabled)"); /* * Show workers that might prevent the processing of pending work items. * A busy worker that is not running on the CPU (e.g. sleeping in * wait_event_idle() with PF_WQ_WORKER cleared) can stall the pool just as * effectively as a CPU-bound one, so dump every in-flight worker. */ static void show_cpu_pool_busy_workers(struct worker_pool *pool) { struct worker *worker; unsigned long irq_flags; int bkt; raw_spin_lock_irqsave(&pool->lock, irq_flags); hash_for_each(pool->busy_hash, bkt, worker, hentry) { /* * Defer printing to avoid deadlocks in console * drivers that queue work while holding locks * also taken in their write paths. */ printk_deferred_enter(); pr_info("pool %d:\n", pool->id); sched_show_task(worker->task); printk_deferred_exit(); } raw_spin_unlock_irqrestore(&pool->lock, irq_flags); } static void show_cpu_pools_busy_workers(void) { struct worker_pool *pool; int pi; pr_info("Showing backtraces of busy workers in stalled worker pools:\n"); rcu_read_lock(); for_each_pool(pool, pi) { if (pool->cpu_stall) show_cpu_pool_busy_workers(pool); } rcu_read_unlock(); } /* * It triggers a panic in two scenarios: when the total number of stalls * exceeds a threshold, and when a stall lasts longer than * wq_panic_on_stall_time */ static void panic_on_wq_watchdog(unsigned int stall_time_sec) { static unsigned int wq_stall; if (wq_panic_on_stall) { wq_stall++; if (wq_stall >= wq_panic_on_stall) panic("workqueue: %u stall(s) exceeded threshold %u\n", wq_stall, wq_panic_on_stall); } if (wq_panic_on_stall_time && stall_time_sec >= wq_panic_on_stall_time) panic("workqueue: stall lasted %us, exceeding threshold %us\n", stall_time_sec, wq_panic_on_stall_time); } static void wq_watchdog_reset_touched(void) { int cpu; wq_watchdog_touched = jiffies; for_each_possible_cpu(cpu) per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; } static void wq_watchdog_timer_fn(struct timer_list *unused) { unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; unsigned int max_stall_time = 0; bool lockup_detected = false; bool cpu_pool_stall = false; unsigned long now = jiffies; struct worker_pool *pool; unsigned int stall_time; int pi; if (!thresh) return; for_each_pool(pool, pi) { unsigned long pool_ts, touched, ts; pool->cpu_stall = false; if (list_empty(&pool->worklist)) continue; /* * If a virtual machine is stopped by the host it can look to * the watchdog like a stall. */ kvm_check_and_clear_guest_paused(); /* get the latest of pool and touched timestamps */ if (pool->cpu >= 0) touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu)); else touched = READ_ONCE(wq_watchdog_touched); pool_ts = READ_ONCE(pool->last_progress_ts); if (time_after(pool_ts, touched)) ts = pool_ts; else ts = touched; /* * Did we stall? * * Do a lockless check first to do not disturb the system. * * Prevent false positives by double checking the timestamp * under pool->lock. 
The lock makes sure that the check reads * an updated pool->last_progress_ts when this CPU saw * an already updated pool->worklist above. It seems better * than adding another barrier into __queue_work() which * is a hotter path. */ if (time_after(now, ts + thresh)) { scoped_guard(raw_spinlock_irqsave, &pool->lock) { pool_ts = pool->last_progress_ts; if (time_after(pool_ts, touched)) ts = pool_ts; else ts = touched; } if (!time_after(now, ts + thresh)) continue; lockup_detected = true; stall_time = jiffies_to_msecs(now - pool_ts) / 1000; max_stall_time = max(max_stall_time, stall_time); if (pool->cpu >= 0 && !(pool->flags & POOL_BH)) { pool->cpu_stall = true; cpu_pool_stall = true; } pr_emerg("BUG: workqueue lockup - pool"); pr_cont_pool_info(pool); pr_cont(" stuck for %us!\n", stall_time); } } if (lockup_detected) show_all_workqueues(); if (cpu_pool_stall) show_cpu_pools_busy_workers(); if (lockup_detected) panic_on_wq_watchdog(max_stall_time); wq_watchdog_reset_touched(); mod_timer(&wq_watchdog_timer, jiffies + thresh); } notrace void wq_watchdog_touch(int cpu) { unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; unsigned long touch_ts = READ_ONCE(wq_watchdog_touched); unsigned long now = jiffies; if (cpu >= 0) per_cpu(wq_watchdog_touched_cpu, cpu) = now; else WARN_ONCE(1, "%s should be called with valid CPU", __func__); /* Don't unnecessarily store to global cacheline */ if (time_after(now, touch_ts + thresh / 4)) WRITE_ONCE(wq_watchdog_touched, jiffies); } static void wq_watchdog_set_thresh(unsigned long thresh) { wq_watchdog_thresh = 0; timer_delete_sync(&wq_watchdog_timer); if (thresh) { wq_watchdog_thresh = thresh; wq_watchdog_reset_touched(); mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ); } } static int wq_watchdog_param_set_thresh(const char *val, const struct kernel_param *kp) { unsigned long thresh; int ret; ret = kstrtoul(val, 0, &thresh); if (ret) return ret; if (system_percpu_wq) wq_watchdog_set_thresh(thresh); else wq_watchdog_thresh = thresh; return 0; } static const struct kernel_param_ops wq_watchdog_thresh_ops = { .set = wq_watchdog_param_set_thresh, .get = param_get_ulong, }; module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh, 0644); static void wq_watchdog_init(void) { timer_setup(&wq_watchdog_timer, wq_watchdog_timer_fn, TIMER_DEFERRABLE); wq_watchdog_set_thresh(wq_watchdog_thresh); } #else /* CONFIG_WQ_WATCHDOG */ static inline void wq_watchdog_init(void) { } #endif /* CONFIG_WQ_WATCHDOG */ static void bh_pool_kick_normal(struct irq_work *irq_work) { raise_softirq_irqoff(TASKLET_SOFTIRQ); } static void bh_pool_kick_highpri(struct irq_work *irq_work) { raise_softirq_irqoff(HI_SOFTIRQ); } static void __init restrict_unbound_cpumask(const char *name, const struct cpumask *mask) { if (!cpumask_intersects(wq_unbound_cpumask, mask)) { pr_warn("workqueue: Restricting unbound_cpumask (%*pb) with %s (%*pb) leaves no CPU, ignoring\n", cpumask_pr_args(wq_unbound_cpumask), name, cpumask_pr_args(mask)); return; } cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, mask); } static void __init init_cpu_worker_pool(struct worker_pool *pool, int cpu, int nice) { BUG_ON(init_worker_pool(pool)); pool->cpu = cpu; cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); cpumask_copy(pool->attrs->__pod_cpumask, cpumask_of(cpu)); pool->attrs->nice = nice; pool->attrs->affn_strict = true; pool->node = cpu_to_node(cpu); /* alloc pool ID */ mutex_lock(&wq_pool_mutex); BUG_ON(worker_pool_assign_id(pool)); mutex_unlock(&wq_pool_mutex); } /** * 
workqueue_init_early - early init for workqueue subsystem * * This is the first step of three-staged workqueue subsystem initialization and * invoked as soon as the bare basics - memory allocation, cpumasks and idr are * up. It sets up all the data structures and system workqueues and allows early * boot code to create workqueues and queue/cancel work items. Actual work item * execution starts only after kthreads can be created and scheduled right * before early initcalls. */ void __init workqueue_init_early(void) { struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM]; int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; void (*irq_work_fns[NR_STD_WORKER_POOLS])(struct irq_work *) = { bh_pool_kick_normal, bh_pool_kick_highpri }; int i, cpu; BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_online_cpumask, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL)); cpumask_copy(wq_online_cpumask, cpu_online_mask); cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN)); if (!cpumask_empty(&wq_cmdline_cpumask)) restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask); cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask); cpumask_andnot(wq_isolated_cpumask, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); unbound_wq_update_pwq_attrs_buf = alloc_workqueue_attrs(); BUG_ON(!unbound_wq_update_pwq_attrs_buf); /* * If nohz_full is enabled, set power efficient workqueue as unbound. * This allows workqueue items to be moved to HK CPUs. */ if (housekeeping_enabled(HK_TYPE_TICK)) wq_power_efficient = true; /* initialize WQ_AFFN_SYSTEM pods */ pt->pod_cpus = kzalloc_objs(pt->pod_cpus[0], 1); pt->pod_node = kzalloc_objs(pt->pod_node[0], 1); pt->cpu_pod = kzalloc_objs(pt->cpu_pod[0], nr_cpu_ids); BUG_ON(!pt->pod_cpus || !pt->pod_node || !pt->cpu_pod); BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE)); pt->nr_pods = 1; cpumask_copy(pt->pod_cpus[0], cpu_possible_mask); pt->pod_node[0] = NUMA_NO_NODE; pt->cpu_pod[0] = 0; /* initialize BH and CPU pools */ for_each_possible_cpu(cpu) { struct worker_pool *pool; i = 0; for_each_bh_worker_pool(pool, cpu) { init_cpu_worker_pool(pool, cpu, std_nice[i]); pool->flags |= POOL_BH; init_irq_work(bh_pool_irq_work(pool), irq_work_fns[i]); i++; } i = 0; for_each_cpu_worker_pool(pool, cpu) init_cpu_worker_pool(pool, cpu, std_nice[i++]); } /* create default unbound and ordered wq attrs */ for (i = 0; i < NR_STD_WORKER_POOLS; i++) { struct workqueue_attrs *attrs; BUG_ON(!(attrs = alloc_workqueue_attrs())); attrs->nice = std_nice[i]; unbound_std_wq_attrs[i] = attrs; /* * An ordered wq should have only one pwq as ordering is * guaranteed by max_active which is enforced by pwqs. 
*/ BUG_ON(!(attrs = alloc_workqueue_attrs())); attrs->nice = std_nice[i]; attrs->ordered = true; ordered_wq_attrs[i] = attrs; } system_wq = alloc_workqueue("events", WQ_PERCPU, 0); system_percpu_wq = alloc_workqueue("events", WQ_PERCPU, 0); system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI | WQ_PERCPU, 0); system_long_wq = alloc_workqueue("events_long", WQ_PERCPU, 0); system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE); system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE); system_freezable_wq = alloc_workqueue("events_freezable", WQ_FREEZABLE | WQ_PERCPU, 0); system_power_efficient_wq = alloc_workqueue("events_power_efficient", WQ_POWER_EFFICIENT | WQ_PERCPU, 0); system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient", WQ_FREEZABLE | WQ_POWER_EFFICIENT | WQ_PERCPU, 0); system_bh_wq = alloc_workqueue("events_bh", WQ_BH | WQ_PERCPU, 0); system_bh_highpri_wq = alloc_workqueue("events_bh_highpri", WQ_BH | WQ_HIGHPRI | WQ_PERCPU, 0); system_dfl_long_wq = alloc_workqueue("events_dfl_long", WQ_UNBOUND, WQ_MAX_ACTIVE); BUG_ON(!system_wq || !system_percpu_wq|| !system_highpri_wq || !system_long_wq || !system_unbound_wq || !system_freezable_wq || !system_dfl_wq || !system_power_efficient_wq || !system_freezable_power_efficient_wq || !system_bh_wq || !system_bh_highpri_wq || !system_dfl_long_wq); } static void __init wq_cpu_intensive_thresh_init(void) { unsigned long thresh; unsigned long bogo; pwq_release_worker = kthread_run_worker(0, "pool_workqueue_release"); BUG_ON(IS_ERR(pwq_release_worker)); /* if the user set it to a specific value, keep it */ if (wq_cpu_intensive_thresh_us != ULONG_MAX) return; /* * The default of 10ms is derived from the fact that most modern (as of * 2023) processors can do a lot in 10ms and that it's just below what * most consider human-perceivable. However, the kernel also runs on a * lot slower CPUs including microcontrollers where the threshold is way * too low. * * Let's scale up the threshold upto 1 second if BogoMips is below 4000. * This is by no means accurate but it doesn't have to be. The mechanism * is still useful even when the threshold is fully scaled up. Also, as * the reports would usually be applicable to everyone, some machines * operating on longer thresholds won't significantly diminish their * usefulness. */ thresh = 10 * USEC_PER_MSEC; /* see init/calibrate.c for lpj -> BogoMIPS calculation */ bogo = max_t(unsigned long, loops_per_jiffy / 500000 * HZ, 1); if (bogo < 4000) thresh = min_t(unsigned long, thresh * 4000 / bogo, USEC_PER_SEC); pr_debug("wq_cpu_intensive_thresh: lpj=%lu BogoMIPS=%lu thresh_us=%lu\n", loops_per_jiffy, bogo, thresh); wq_cpu_intensive_thresh_us = thresh; } /** * workqueue_init - bring workqueue subsystem fully online * * This is the second step of three-staged workqueue subsystem initialization * and invoked as soon as kthreads can be created and scheduled. Workqueues have * been created and work items queued on them, but there are no kworkers * executing the work items yet. Populate the worker pools with the initial * workers and enable future kworker creations. */ void __init workqueue_init(void) { struct workqueue_struct *wq; struct worker_pool *pool; int cpu, bkt; wq_cpu_intensive_thresh_init(); mutex_lock(&wq_pool_mutex); /* * Per-cpu pools created earlier could be missing node hint. Fix them * up. Also, create a rescuer for workqueues that requested it. 
*/ for_each_possible_cpu(cpu) { for_each_bh_worker_pool(pool, cpu) pool->node = cpu_to_node(cpu); for_each_cpu_worker_pool(pool, cpu) pool->node = cpu_to_node(cpu); } list_for_each_entry(wq, &workqueues, list) { WARN(init_rescuer(wq), "workqueue: failed to create early rescuer for %s", wq->name); } mutex_unlock(&wq_pool_mutex); /* * Create the initial workers. A BH pool has one pseudo worker that * represents the shared BH execution context and thus doesn't get * affected by hotplug events. Create the BH pseudo workers for all * possible CPUs here. */ for_each_possible_cpu(cpu) for_each_bh_worker_pool(pool, cpu) BUG_ON(!create_worker(pool)); for_each_online_cpu(cpu) { for_each_cpu_worker_pool(pool, cpu) { pool->flags &= ~POOL_DISASSOCIATED; BUG_ON(!create_worker(pool)); } } hash_for_each(unbound_pool_hash, bkt, pool, hash_node) BUG_ON(!create_worker(pool)); wq_online = true; wq_watchdog_init(); } /* * Initialize @pt by first initializing @pt->cpu_pod[] with pod IDs according to * @cpu_shares_pod(). Each subset of CPUs that share a pod is assigned a unique * and consecutive pod ID. The rest of @pt is initialized accordingly. */ static void __init init_pod_type(struct wq_pod_type *pt, bool (*cpus_share_pod)(int, int)) { int cur, pre, cpu, pod; pt->nr_pods = 0; /* init @pt->cpu_pod[] according to @cpus_share_pod() */ pt->cpu_pod = kzalloc_objs(pt->cpu_pod[0], nr_cpu_ids); BUG_ON(!pt->cpu_pod); for_each_possible_cpu(cur) { for_each_possible_cpu(pre) { if (pre >= cur) { pt->cpu_pod[cur] = pt->nr_pods++; break; } if (cpus_share_pod(cur, pre)) { pt->cpu_pod[cur] = pt->cpu_pod[pre]; break; } } } /* init the rest to match @pt->cpu_pod[] */ pt->pod_cpus = kzalloc_objs(pt->pod_cpus[0], pt->nr_pods); pt->pod_node = kzalloc_objs(pt->pod_node[0], pt->nr_pods); BUG_ON(!pt->pod_cpus || !pt->pod_node); for (pod = 0; pod < pt->nr_pods; pod++) BUG_ON(!zalloc_cpumask_var(&pt->pod_cpus[pod], GFP_KERNEL)); for_each_possible_cpu(cpu) { cpumask_set_cpu(cpu, pt->pod_cpus[pt->cpu_pod[cpu]]); pt->pod_node[pt->cpu_pod[cpu]] = cpu_to_node(cpu); } } static bool __init cpus_dont_share(int cpu0, int cpu1) { return false; } static bool __init cpus_share_smt(int cpu0, int cpu1) { #ifdef CONFIG_SCHED_SMT return cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1)); #else return false; #endif } static bool __init cpus_share_numa(int cpu0, int cpu1) { return cpu_to_node(cpu0) == cpu_to_node(cpu1); } /* Maps each CPU to its shard index within the LLC pod it belongs to */ static int cpu_shard_id[NR_CPUS] __initdata; /** * llc_count_cores - count distinct cores (SMT groups) within an LLC pod * @pod_cpus: the cpumask of CPUs in the LLC pod * @smt_pods: the SMT pod type, used to identify sibling groups * * A core is represented by the lowest-numbered CPU in its SMT group. Returns * the number of distinct cores found in @pod_cpus. */ static int __init llc_count_cores(const struct cpumask *pod_cpus, struct wq_pod_type *smt_pods) { const struct cpumask *sibling_cpus; int nr_cores = 0, c; /* * Count distinct cores by only counting the first CPU in each * SMT sibling group. */ for_each_cpu(c, pod_cpus) { sibling_cpus = smt_pods->pod_cpus[smt_pods->cpu_pod[c]]; if (cpumask_first(sibling_cpus) == c) nr_cores++; } return nr_cores; } /* * llc_shard_size - number of cores in a given shard * * Cores are spread as evenly as possible. The first @nr_large_shards shards are * "large shards" with (cores_per_shard + 1) cores; the rest are "default * shards" with cores_per_shard cores. 
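 *
 * Worked example (illustrative numbers): splitting 10 cores into 3 shards
 * gives cores_per_shard = 3 and nr_large_shards = 1, so shard 0 holds 4
 * cores and shards 1 and 2 hold 3 cores each.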
*/ static int __init llc_shard_size(int shard_id, int cores_per_shard, int nr_large_shards) { /* The first @nr_large_shards shards are large shards */ if (shard_id < nr_large_shards) return cores_per_shard + 1; /* The remaining shards are default shards */ return cores_per_shard; } /* * llc_calc_shard_layout - compute the shard layout for an LLC pod * @nr_cores: number of distinct cores in the LLC pod * * Chooses the number of shards that keeps average shard size closest to * wq_cache_shard_size. Returns a struct describing the total number of shards, * the base size of each, and how many are large shards. */ static struct llc_shard_layout __init llc_calc_shard_layout(int nr_cores) { struct llc_shard_layout layout; /* Ensure at least one shard; pick the count closest to the target size */ layout.nr_shards = max(1, DIV_ROUND_CLOSEST(nr_cores, wq_cache_shard_size)); layout.cores_per_shard = nr_cores / layout.nr_shards; layout.nr_large_shards = nr_cores % layout.nr_shards; return layout; } /* * llc_shard_is_full - check whether a shard has reached its core capacity * @cores_in_shard: number of cores already assigned to this shard * @shard_id: index of the shard being checked * @layout: the shard layout computed by llc_calc_shard_layout() * * Returns true if @cores_in_shard equals the expected size for @shard_id. */ static bool __init llc_shard_is_full(int cores_in_shard, int shard_id, const struct llc_shard_layout *layout) { return cores_in_shard == llc_shard_size(shard_id, layout->cores_per_shard, layout->nr_large_shards); } /** * llc_populate_cpu_shard_id - populate cpu_shard_id[] for each CPU in an LLC pod * @pod_cpus: the cpumask of CPUs in the LLC pod * @smt_pods: the SMT pod type, used to identify sibling groups * @nr_cores: number of distinct cores in @pod_cpus (from llc_count_cores()) * * Walks @pod_cpus in order. At each SMT group leader, advances to the next * shard once the current shard is full. Results are written to cpu_shard_id[]. */ static void __init llc_populate_cpu_shard_id(const struct cpumask *pod_cpus, struct wq_pod_type *smt_pods, int nr_cores) { struct llc_shard_layout layout = llc_calc_shard_layout(nr_cores); const struct cpumask *sibling_cpus; /* Count the number of cores in the current shard_id */ int cores_in_shard = 0; unsigned int leader; /* This is a cursor for the shards. Go from zero to nr_shards - 1*/ int shard_id = 0; int c; /* Iterate at every CPU for a given LLC pod, and assign it a shard */ for_each_cpu(c, pod_cpus) { sibling_cpus = smt_pods->pod_cpus[smt_pods->cpu_pod[c]]; if (cpumask_first(sibling_cpus) == c) { /* This is the CPU leader for the siblings */ if (llc_shard_is_full(cores_in_shard, shard_id, &layout)) { shard_id++; cores_in_shard = 0; } cores_in_shard++; cpu_shard_id[c] = shard_id; } else { /* * The siblings' shard MUST be the same as the leader. * never split threads in the same core. */ leader = cpumask_first(sibling_cpus); /* * This check silences a Warray-bounds warning on UP * configs where NR_CPUS=1 makes cpu_shard_id[] * a single-element array, and the compiler can't * prove the index is always 0. */ if (WARN_ON_ONCE(leader >= nr_cpu_ids)) continue; cpu_shard_id[c] = cpu_shard_id[leader]; } } WARN_ON_ONCE(shard_id != (layout.nr_shards - 1)); } /** * precompute_cache_shard_ids - assign each CPU its shard index within its LLC * * Iterates over all LLC pods. For each pod, counts distinct cores then assigns * shard indices to all CPUs in the pod. Must be called after WQ_AFFN_CACHE and * WQ_AFFN_SMT have been initialized. 
*/ static void __init precompute_cache_shard_ids(void) { struct wq_pod_type *llc_pods = &wq_pod_types[WQ_AFFN_CACHE]; struct wq_pod_type *smt_pods = &wq_pod_types[WQ_AFFN_SMT]; const struct cpumask *cpus_sharing_llc; int nr_cores; int pod; if (!wq_cache_shard_size) { pr_warn("workqueue: cache_shard_size must be > 0, setting to 1\n"); wq_cache_shard_size = 1; } for (pod = 0; pod < llc_pods->nr_pods; pod++) { cpus_sharing_llc = llc_pods->pod_cpus[pod]; /* Number of cores in this given LLC */ nr_cores = llc_count_cores(cpus_sharing_llc, smt_pods); llc_populate_cpu_shard_id(cpus_sharing_llc, smt_pods, nr_cores); } } /* * cpus_share_cache_shard - test whether two CPUs belong to the same cache shard * * Two CPUs share a cache shard if they are in the same LLC and have the same * shard index. Used as the pod affinity callback for WQ_AFFN_CACHE_SHARD. */ static bool __init cpus_share_cache_shard(int cpu0, int cpu1) { if (!cpus_share_cache(cpu0, cpu1)) return false; return cpu_shard_id[cpu0] == cpu_shard_id[cpu1]; } /** * workqueue_init_topology - initialize CPU pods for unbound workqueues * * This is the third step of three-staged workqueue subsystem initialization and * invoked after SMP and topology information are fully initialized. It * initializes the unbound CPU pods accordingly. */ void __init workqueue_init_topology(void) { struct workqueue_struct *wq; int cpu; init_pod_type(&wq_pod_types[WQ_AFFN_CPU], cpus_dont_share); init_pod_type(&wq_pod_types[WQ_AFFN_SMT], cpus_share_smt); init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache); precompute_cache_shard_ids(); init_pod_type(&wq_pod_types[WQ_AFFN_CACHE_SHARD], cpus_share_cache_shard); init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa); wq_topo_initialized = true; mutex_lock(&wq_pool_mutex); /* * Workqueues allocated earlier would have all CPUs sharing the default * worker pool. Explicitly call unbound_wq_update_pwq() on all workqueue * and CPU combinations to apply per-pod sharing. */ list_for_each_entry(wq, &workqueues, list) { for_each_online_cpu(cpu) unbound_wq_update_pwq(wq, cpu); if (wq->flags & WQ_UNBOUND) { mutex_lock(&wq->mutex); wq_update_node_max_active(wq, -1); mutex_unlock(&wq->mutex); } } mutex_unlock(&wq_pool_mutex); } void __warn_flushing_systemwide_wq(void) { pr_warn("WARNING: Flushing system-wide workqueues will be prohibited in near future.\n"); dump_stack(); } EXPORT_SYMBOL(__warn_flushing_systemwide_wq); static int __init workqueue_unbound_cpus_setup(char *str) { if (cpulist_parse(str, &wq_cmdline_cpumask) < 0) { cpumask_clear(&wq_cmdline_cpumask); pr_warn("workqueue.unbound_cpus: incorrect CPU range, using default\n"); } return 1; } __setup("workqueue.unbound_cpus=", workqueue_unbound_cpus_setup); |
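/*
 * For reference, the parameter above takes a cpulist; e.g. booting with
 * "workqueue.unbound_cpus=0-3" (an illustrative value) constrains unbound
 * workqueues to CPUs 0-3, while a malformed list falls back to the default
 * mask as handled in workqueue_unbound_cpus_setup().
 */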
// SPDX-License-Identifier: GPL-2.0-only /* * Memory merging support. * * This code enables dynamic sharing of identical pages found in different * memory areas, even if they are not shared by fork() * * Copyright (C) 2008-2009 Red Hat, Inc. * Authors: * Izik Eidus * Andrea Arcangeli * Chris Wright * Hugh Dickins */ #include <linux/errno.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/fs.h> #include <linux/mman.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/cputime.h> #include <linux/rwsem.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/spinlock.h> #include <linux/xxhash.h> #include <linux/delay.h> #include <linux/kthread.h> #include <linux/wait.h> #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/memory.h> #include <linux/mmu_notifier.h> #include <linux/swap.h> #include <linux/ksm.h> #include <linux/hashtable.h> #include <linux/freezer.h> #include <linux/oom.h> #include <linux/numa.h> #include <linux/pagewalk.h> #include <asm/tlbflush.h> #include "internal.h" #include "mm_slot.h" #define CREATE_TRACE_POINTS #include <trace/events/ksm.h> #ifdef CONFIG_NUMA #define NUMA(x) (x) #define DO_NUMA(x) do { (x); } while (0) #else #define NUMA(x) (0) #define DO_NUMA(x) do { } while (0) #endif typedef u8 rmap_age_t; /** * DOC: Overview * * A few notes about the KSM scanning process, * to make it easier to understand the data structures below: * * In order to reduce excessive scanning, KSM sorts the memory pages by their * contents into a data structure that holds pointers to the pages' locations. * * Since the contents of the pages may change at any moment, KSM cannot just * insert the pages into a normal sorted tree and expect it to find anything. * Therefore KSM uses two data structures - the stable and the unstable tree. * * The stable tree holds pointers to all the merged pages (ksm pages), sorted * by their contents. Because each such page is write-protected, searching on * this tree is fully assured to be working (except when pages are unmapped), * and therefore this tree is called the stable tree. * * The stable tree node includes information required for reverse * mapping from a KSM page to virtual addresses that map this page.
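 * (In the code below, a stable tree node is a struct ksm_stable_node and each
 * reverse-mapping entry hanging off its hlist is a struct ksm_rmap_item.)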
* * In order to avoid large latencies of the rmap walks on KSM pages, * KSM maintains two types of nodes in the stable tree: * * * the regular nodes that keep the reverse mapping structures in a * linked list * * the "chains" that link nodes ("dups") that represent the same * write protected memory content, but each "dup" corresponds to a * different KSM page copy of that content * * Internally, the regular nodes, "dups" and "chains" are represented * using the same struct ksm_stable_node structure. * * In addition to the stable tree, KSM uses a second data structure called the * unstable tree: this tree holds pointers to pages which have been found to * be "unchanged for a period of time". The unstable tree sorts these pages * by their contents, but since they are not write-protected, KSM cannot rely * upon the unstable tree to work correctly - the unstable tree is liable to * be corrupted as its contents are modified, and so it is called unstable. * * KSM solves this problem by several techniques: * * 1) The unstable tree is flushed every time KSM completes scanning all * memory areas, and then the tree is rebuilt again from the beginning. * 2) KSM will only insert into the unstable tree, pages whose hash value * has not changed since the previous scan of all memory areas. * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the * colors of the nodes and not on their contents, assuring that even when * the tree gets "corrupted" it won't get out of balance, so scanning time * remains the same (also, searching and inserting nodes in an rbtree uses * the same algorithm, so we have no overhead when we flush and rebuild). * 4) KSM never flushes the stable tree, which means that even if it were to * take 10 attempts to find a page in the unstable tree, once it is found, * it is secured in the stable tree. (When we scan a new page, we first * compare it against the stable tree, and then against the unstable tree.) * * If the merge_across_nodes tunable is unset, then KSM maintains multiple * stable trees and multiple unstable trees: one of each for each NUMA node. */ /** * struct ksm_mm_slot - ksm information per mm that is being scanned * @slot: hash lookup from mm to mm_slot * @rmap_list: head for this mm_slot's singly-linked list of rmap_items */ struct ksm_mm_slot { struct mm_slot slot; struct ksm_rmap_item *rmap_list; }; /** * struct ksm_scan - cursor for scanning * @mm_slot: the current mm_slot we are scanning * @address: the next address inside that to be scanned * @rmap_list: link to the next rmap to be scanned in the rmap_list * @seqnr: count of completed full scans (needed when removing unstable node) * * There is only the one ksm_scan instance of this cursor structure. 
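 *
 * As ksmd scans only a bounded batch of pages per wakeup (see
 * ksm_thread_pages_to_scan below), this cursor is what lets a full pass over
 * all mm_slots resume across many batches.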
*/ struct ksm_scan { struct ksm_mm_slot *mm_slot; unsigned long address; struct ksm_rmap_item **rmap_list; unsigned long seqnr; }; /** * struct ksm_stable_node - node of the stable rbtree * @node: rb node of this ksm page in the stable tree * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list * @hlist_dup: linked into the stable_node->hlist with a stable_node chain * @list: linked into migrate_nodes, pending placement in the proper node tree * @hlist: hlist head of rmap_items using this ksm page * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid) * @chain_prune_time: time of the last full garbage collection * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN * @nid: NUMA node id of stable tree in which linked (may not match kpfn) */ struct ksm_stable_node { union { struct rb_node node; /* when node of stable tree */ struct { /* when listed for migration */ struct list_head *head; struct { struct hlist_node hlist_dup; struct list_head list; }; }; }; struct hlist_head hlist; union { unsigned long kpfn; unsigned long chain_prune_time; }; /* * STABLE_NODE_CHAIN can be any negative number in * rmap_hlist_len negative range, but better not -1 to be able * to reliably detect underflows. */ #define STABLE_NODE_CHAIN -1024 int rmap_hlist_len; #ifdef CONFIG_NUMA int nid; #endif }; /** * struct ksm_rmap_item - reverse mapping item for virtual addresses * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree * @nid: NUMA node id of unstable tree in which linked (may not match page) * @mm: the memory structure this rmap_item is pointing into * @address: the virtual address this rmap_item tracks (+ flags in low bits) * @oldchecksum: previous checksum of the page at that virtual address * @node: rb node of this rmap_item in the unstable tree * @head: pointer to stable_node heading this list in the stable tree * @hlist: link into hlist of rmap_items hanging off that stable_node * @age: number of scan iterations since creation * @remaining_skips: how many scans to skip */ struct ksm_rmap_item { struct ksm_rmap_item *rmap_list; union { struct anon_vma *anon_vma; /* when stable */ #ifdef CONFIG_NUMA int nid; /* when node of unstable tree */ #endif }; struct mm_struct *mm; unsigned long address; /* + low bits used for flags below */ unsigned int oldchecksum; /* when unstable */ rmap_age_t age; rmap_age_t remaining_skips; union { struct rb_node node; /* when node of unstable tree */ struct { /* when listed from stable tree */ struct ksm_stable_node *head; struct hlist_node hlist; }; }; }; #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ #define STABLE_FLAG 0x200 /* is listed from the stable tree */ /* The stable and unstable tree heads */ static struct rb_root one_stable_tree[1] = { RB_ROOT }; static struct rb_root one_unstable_tree[1] = { RB_ROOT }; static struct rb_root *root_stable_tree = one_stable_tree; static struct rb_root *root_unstable_tree = one_unstable_tree; /* Recently migrated nodes of stable tree, pending proper placement */ static LIST_HEAD(migrate_nodes); #define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev) #define MM_SLOTS_HASH_BITS 10 static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct ksm_mm_slot ksm_mm_head = { .slot.mm_node = LIST_HEAD_INIT(ksm_mm_head.slot.mm_node), }; static struct ksm_scan ksm_scan = { 
.mm_slot = &ksm_mm_head, }; static struct kmem_cache *rmap_item_cache; static struct kmem_cache *stable_node_cache; static struct kmem_cache *mm_slot_cache; /* Default number of pages to scan per batch */ #define DEFAULT_PAGES_TO_SCAN 100 /* The number of pages scanned */ static unsigned long ksm_pages_scanned; /* The number of nodes in the stable tree */ static unsigned long ksm_pages_shared; /* The number of page slots additionally sharing those nodes */ static unsigned long ksm_pages_sharing; /* The number of nodes in the unstable tree */ static unsigned long ksm_pages_unshared; /* The number of rmap_items in use: to calculate pages_volatile */ static unsigned long ksm_rmap_items; /* The number of stable_node chains */ static unsigned long ksm_stable_node_chains; /* The number of stable_node dups linked to the stable_node chains */ static unsigned long ksm_stable_node_dups; /* Delay in pruning stale stable_node_dups in the stable_node_chains */ static unsigned int ksm_stable_node_chains_prune_millisecs = 2000; /* Maximum number of page slots sharing a stable node */ static int ksm_max_page_sharing = 256; /* Number of pages ksmd should scan in one batch */ static unsigned int ksm_thread_pages_to_scan = DEFAULT_PAGES_TO_SCAN; /* Milliseconds ksmd should sleep between batches */ static unsigned int ksm_thread_sleep_millisecs = 20; /* Checksum of an empty (zeroed) page */ static unsigned int zero_checksum __read_mostly; /* Whether to merge empty (zeroed) pages with actual zero pages */ static bool ksm_use_zero_pages __read_mostly; /* Skip pages that couldn't be de-duplicated previously */ /* Default to true at least temporarily, for testing */ static bool ksm_smart_scan = true; /* The number of zero pages which is placed by KSM */ atomic_long_t ksm_zero_pages = ATOMIC_LONG_INIT(0); /* The number of pages that have been skipped due to "smart scanning" */ static unsigned long ksm_pages_skipped; /* Don't scan more than max pages per batch. */ static unsigned long ksm_advisor_max_pages_to_scan = 30000; /* Min CPU for scanning pages per scan */ #define KSM_ADVISOR_MIN_CPU 10 /* Max CPU for scanning pages per scan */ static unsigned int ksm_advisor_max_cpu = 70; /* Target scan time in seconds to analyze all KSM candidate pages. */ static unsigned long ksm_advisor_target_scan_time = 200; /* Exponentially weighted moving average. */ #define EWMA_WEIGHT 30 /** * struct advisor_ctx - metadata for KSM advisor * @start_scan: start time of the current scan * @scan_time: scan time of previous scan * @change: change in percent to pages_to_scan parameter * @cpu_time: cpu time consumed by the ksmd thread in the previous scan */ struct advisor_ctx { ktime_t start_scan; unsigned long scan_time; unsigned long change; unsigned long long cpu_time; }; static struct advisor_ctx advisor_ctx; /* Define different advisor's */ enum ksm_advisor_type { KSM_ADVISOR_NONE, KSM_ADVISOR_SCAN_TIME, }; static enum ksm_advisor_type ksm_advisor; #ifdef CONFIG_SYSFS /* * Only called through the sysfs control interface: */ /* At least scan this many pages per batch. 
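 * When the scan time advisor is active, set_advisor_defaults() resets
 * pages_to_scan to this floor and scan_time_advisor() then adjusts it each
 * scan, bounded by the KSM_ADVISOR_MIN_CPU/ksm_advisor_max_cpu cost estimates
 * and ksm_advisor_max_pages_to_scan.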
*/ static unsigned long ksm_advisor_min_pages_to_scan = 500; static void set_advisor_defaults(void) { if (ksm_advisor == KSM_ADVISOR_NONE) { ksm_thread_pages_to_scan = DEFAULT_PAGES_TO_SCAN; } else if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) { advisor_ctx = (const struct advisor_ctx){ 0 }; ksm_thread_pages_to_scan = ksm_advisor_min_pages_to_scan; } } #endif /* CONFIG_SYSFS */ static inline void advisor_start_scan(void) { if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) advisor_ctx.start_scan = ktime_get(); } /* * Use previous scan time if available, otherwise use current scan time as an * approximation for the previous scan time. */ static inline unsigned long prev_scan_time(struct advisor_ctx *ctx, unsigned long scan_time) { return ctx->scan_time ? ctx->scan_time : scan_time; } /* Calculate exponential weighted moving average */ static unsigned long ewma(unsigned long prev, unsigned long curr) { return ((100 - EWMA_WEIGHT) * prev + EWMA_WEIGHT * curr) / 100; } /* * The scan time advisor is based on the current scan rate and the target * scan rate. * * new_pages_to_scan = pages_to_scan * (scan_time / target_scan_time) * * To avoid perturbations it calculates a change factor of previous changes. * A new change factor is calculated for each iteration and it uses an * exponentially weighted moving average. The new pages_to_scan value is * multiplied with that change factor: * * new_pages_to_scan *= change factor * * The new_pages_to_scan value is limited by the cpu min and max values. It * calculates the cpu percent for the last scan and calculates the new * estimated cpu percent cost for the next scan. That value is capped by the * cpu min and max setting. * * In addition the new pages_to_scan value is capped by the max and min * limits. */ static void scan_time_advisor(void) { unsigned int cpu_percent; unsigned long cpu_time; unsigned long cpu_time_diff; unsigned long cpu_time_diff_ms; unsigned long pages; unsigned long per_page_cost; unsigned long factor; unsigned long change; unsigned long last_scan_time; unsigned long scan_time; /* Convert scan time to seconds */ scan_time = div_s64(ktime_ms_delta(ktime_get(), advisor_ctx.start_scan), MSEC_PER_SEC); scan_time = scan_time ? scan_time : 1; /* Calculate CPU consumption of ksmd background thread */ cpu_time = task_sched_runtime(current); cpu_time_diff = cpu_time - advisor_ctx.cpu_time; cpu_time_diff_ms = cpu_time_diff / 1000 / 1000; cpu_percent = (cpu_time_diff_ms * 100) / (scan_time * 1000); cpu_percent = cpu_percent ? cpu_percent : 1; last_scan_time = prev_scan_time(&advisor_ctx, scan_time); /* Calculate scan time as percentage of target scan time */ factor = ksm_advisor_target_scan_time * 100 / scan_time; factor = factor ? factor : 1; /* * Calculate scan time as percentage of last scan time and use * exponentially weighted average to smooth it */ change = scan_time * 100 / last_scan_time; change = change ? change : 1; change = ewma(advisor_ctx.change, change); /* Calculate new scan rate based on target scan rate. */ pages = ksm_thread_pages_to_scan * 100 / factor; /* Update pages_to_scan by weighted change percentage. */ pages = pages * change / 100; /* Cap new pages_to_scan value */ per_page_cost = ksm_thread_pages_to_scan / cpu_percent; per_page_cost = per_page_cost ? 
per_page_cost : 1; pages = min(pages, per_page_cost * ksm_advisor_max_cpu); pages = max(pages, per_page_cost * KSM_ADVISOR_MIN_CPU); pages = min(pages, ksm_advisor_max_pages_to_scan); /* Update advisor context */ advisor_ctx.change = change; advisor_ctx.scan_time = scan_time; advisor_ctx.cpu_time = cpu_time; ksm_thread_pages_to_scan = pages; trace_ksm_advisor(scan_time, pages, cpu_percent); } static void advisor_stop_scan(void) { if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) scan_time_advisor(); } #ifdef CONFIG_NUMA /* Zeroed when merging across nodes is not allowed */ static unsigned int ksm_merge_across_nodes = 1; static int ksm_nr_node_ids = 1; #else #define ksm_merge_across_nodes 1U #define ksm_nr_node_ids 1 #endif #define KSM_RUN_STOP 0 #define KSM_RUN_MERGE 1 #define KSM_RUN_UNMERGE 2 #define KSM_RUN_OFFLINE 4 static unsigned long ksm_run = KSM_RUN_STOP; static void wait_while_offlining(void); static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait); static DEFINE_MUTEX(ksm_thread_mutex); static DEFINE_SPINLOCK(ksm_mmlist_lock); static int __init ksm_slab_init(void) { rmap_item_cache = KMEM_CACHE(ksm_rmap_item, 0); if (!rmap_item_cache) goto out; stable_node_cache = KMEM_CACHE(ksm_stable_node, 0); if (!stable_node_cache) goto out_free1; mm_slot_cache = KMEM_CACHE(ksm_mm_slot, 0); if (!mm_slot_cache) goto out_free2; return 0; out_free2: kmem_cache_destroy(stable_node_cache); out_free1: kmem_cache_destroy(rmap_item_cache); out: return -ENOMEM; } static void __init ksm_slab_free(void) { kmem_cache_destroy(mm_slot_cache); kmem_cache_destroy(stable_node_cache); kmem_cache_destroy(rmap_item_cache); mm_slot_cache = NULL; } static __always_inline bool is_stable_node_chain(struct ksm_stable_node *chain) { return chain->rmap_hlist_len == STABLE_NODE_CHAIN; } static __always_inline bool is_stable_node_dup(struct ksm_stable_node *dup) { return dup->head == STABLE_NODE_DUP_HEAD; } static inline void stable_node_chain_add_dup(struct ksm_stable_node *dup, struct ksm_stable_node *chain) { VM_BUG_ON(is_stable_node_dup(dup)); dup->head = STABLE_NODE_DUP_HEAD; VM_BUG_ON(!is_stable_node_chain(chain)); hlist_add_head(&dup->hlist_dup, &chain->hlist); ksm_stable_node_dups++; } static inline void __stable_node_dup_del(struct ksm_stable_node *dup) { VM_BUG_ON(!is_stable_node_dup(dup)); hlist_del(&dup->hlist_dup); ksm_stable_node_dups--; } static inline void stable_node_dup_del(struct ksm_stable_node *dup) { VM_BUG_ON(is_stable_node_chain(dup)); if (is_stable_node_dup(dup)) __stable_node_dup_del(dup); else rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid)); #ifdef CONFIG_DEBUG_VM dup->head = NULL; #endif } static inline struct ksm_rmap_item *alloc_rmap_item(void) { struct ksm_rmap_item *rmap_item; rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); if (rmap_item) ksm_rmap_items++; return rmap_item; } static inline void free_rmap_item(struct ksm_rmap_item *rmap_item) { ksm_rmap_items--; rmap_item->mm->ksm_rmap_items--; rmap_item->mm = NULL; /* debug safety */ kmem_cache_free(rmap_item_cache, rmap_item); } static inline struct ksm_stable_node *alloc_stable_node(void) { /* * The allocation can take too long with GFP_KERNEL when memory is under * pressure, which may lead to hung task warnings. Adding __GFP_HIGH * grants access to memory reserves, helping to avoid this problem. 
*/ return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH); } static inline void free_stable_node(struct ksm_stable_node *stable_node) { VM_BUG_ON(stable_node->rmap_hlist_len && !is_stable_node_chain(stable_node)); kmem_cache_free(stable_node_cache, stable_node); } /* * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's * page tables after it has passed through ksm_exit() - which, if necessary, * takes mmap_lock briefly to serialize against them. ksm_exit() does not set * a special flag: they can just back out as soon as mm_users goes to zero. * ksm_test_exit() is used throughout to make this test for exit: in some * places for correctness, in some places just to avoid unnecessary work. */ static inline bool ksm_test_exit(struct mm_struct *mm) { return atomic_read(&mm->mm_users) == 0; } static int break_ksm_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end, struct mm_walk *walk) { unsigned long *found_addr = (unsigned long *) walk->private; struct mm_struct *mm = walk->mm; pte_t *start_ptep, *ptep; spinlock_t *ptl; int found = 0; if (ksm_test_exit(walk->mm)) return 0; if (signal_pending(current)) return -ERESTARTSYS; start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); if (!start_ptep) return 0; for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) { pte_t pte = ptep_get(ptep); struct folio *folio = NULL; if (pte_present(pte)) { folio = vm_normal_folio(walk->vma, addr, pte); } else if (!pte_none(pte)) { const softleaf_t entry = softleaf_from_pte(pte); /* * As KSM pages remain KSM pages until freed, no need to wait * here for migration to end. */ if (softleaf_is_migration(entry)) folio = softleaf_to_folio(entry); } /* return 1 if the page is an normal ksm page or KSM-placed zero page */ found = (folio && folio_test_ksm(folio)) || (pte_present(pte) && is_ksm_zero_pte(pte)); if (found) { *found_addr = addr; goto out_unlock; } } out_unlock: pte_unmap_unlock(start_ptep, ptl); return found; } static const struct mm_walk_ops break_ksm_ops = { .pmd_entry = break_ksm_pmd_entry, .walk_lock = PGWALK_RDLOCK, }; static const struct mm_walk_ops break_ksm_lock_vma_ops = { .pmd_entry = break_ksm_pmd_entry, .walk_lock = PGWALK_WRLOCK, }; /* * Though it's very tempting to unmerge rmap_items from stable tree rather * than check every pte of a given vma, the locking doesn't quite work for * that - an rmap_item is assigned to the stable tree after inserting ksm * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing * rmap_items from parent to child at fork time (so as not to waste time * if exit comes before the next scan reaches it). * * Similarly, although we'd like to remove rmap_items (so updating counts * and freeing memory) when unmerging an area, it's easier to leave that * to the next pass of ksmd - consider, for example, how ksmd might be * in cmp_and_merge_page on one of the rmap_items we would be removing. * * We use break_ksm to break COW on a ksm page by triggering unsharing, * such that the ksm page will get replaced by an exclusive anonymous page. * * We take great care only to touch a ksm page, in a VM_MERGEABLE vma, * in case the application has unmapped and remapped mm,addr meanwhile. * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP * mmap of /dev/mem, where we would not want to touch it. * * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context * of the process that owns 'vma'. We also do not want to enforce * protection keys here anyway. 
*/ static int break_ksm(struct vm_area_struct *vma, unsigned long addr, unsigned long end, bool lock_vma) { vm_fault_t ret = 0; const struct mm_walk_ops *ops = lock_vma ? &break_ksm_lock_vma_ops : &break_ksm_ops; do { int ksm_page; cond_resched(); ksm_page = walk_page_range_vma(vma, addr, end, ops, &addr); if (ksm_page <= 0) return ksm_page; ret = handle_mm_fault(vma, addr, FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE, NULL); } while (!(ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); /* * We must loop until we no longer find a KSM page because * handle_mm_fault() may back out if there's any difficulty e.g. if * pte accessed bit gets updated concurrently. * * VM_FAULT_SIGBUS could occur if we race with truncation of the * backing file, which also invalidates anonymous pages: that's * okay, that truncation will have unmapped the KSM page for us. * * VM_FAULT_OOM: at the time of writing (late July 2009), setting * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the * current task has TIF_MEMDIE set, and will be OOM killed on return * to user; and ksmd, having no mm, would never be chosen for that. * * But if the mm is in a limited mem_cgroup, then the fault may fail * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and * even ksmd can fail in this way - though it's usually breaking ksm * just to undo a merge it made a moment before, so unlikely to oom. * * That's a pity: we might therefore have more kernel pages allocated * than we're counting as nodes in the stable tree; but ksm_do_scan * will retry to break_cow on each pass, so should recover the page * in due course. The important thing is to not let VM_MERGEABLE * be cleared while any such pages might remain in the area. */ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; } static bool ksm_compatible(const struct file *file, vma_flags_t vma_flags) { /* Just ignore the advice. */ if (vma_flags_test_any(&vma_flags, VMA_SHARED_BIT, VMA_MAYSHARE_BIT, VMA_HUGETLB_BIT)) return false; if (vma_flags_test_single_mask(&vma_flags, VMA_DROPPABLE)) return false; if (vma_flags_test_any_mask(&vma_flags, VMA_SPECIAL_FLAGS)) return false; if (file_is_dax(file)) return false; #ifdef VM_SAO if (vma_flags_test(&vma_flags, VMA_SAO_BIT)) return false; #endif #ifdef VM_SPARC_ADI if (vma_flags_test(&vma_flags, VMA_SPARC_ADI_BIT)) return false; #endif return true; } static bool vma_ksm_compatible(struct vm_area_struct *vma) { return ksm_compatible(vma->vm_file, vma->flags); } static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma; if (ksm_test_exit(mm)) return NULL; vma = vma_lookup(mm, addr); if (!vma || !(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) return NULL; return vma; } static void break_cow(struct ksm_rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; unsigned long addr = rmap_item->address; struct vm_area_struct *vma; /* * It is not an accident that whenever we want to break COW * to undo, we also need to drop a reference to the anon_vma. 
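 * (The reference dropped just below is the one this rmap_item took on the
 * vma's anon_vma when it was attached to the stable tree.)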
*/ put_anon_vma(rmap_item->anon_vma); mmap_read_lock(mm); vma = find_mergeable_vma(mm, addr); if (vma) break_ksm(vma, addr, addr + PAGE_SIZE, false); mmap_read_unlock(mm); } static struct page *get_mergeable_page(struct ksm_rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; unsigned long addr = rmap_item->address; struct vm_area_struct *vma; struct page *page = NULL; struct folio_walk fw; struct folio *folio; mmap_read_lock(mm); vma = find_mergeable_vma(mm, addr); if (!vma) goto out; folio = folio_walk_start(&fw, vma, addr, 0); if (folio) { if (!folio_is_zone_device(folio) && folio_test_anon(folio)) { folio_get(folio); page = fw.page; } folio_walk_end(&fw, vma); } out: if (page) { flush_anon_page(vma, page, addr); flush_dcache_page(page); } mmap_read_unlock(mm); return page; } /* * This helper is used for getting right index into array of tree roots. * When merge_across_nodes knob is set to 1, there are only two rb-trees for * stable and unstable pages from all nodes with roots in index 0. Otherwise, * every node has its own stable and unstable tree. */ static inline int get_kpfn_nid(unsigned long kpfn) { return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn)); } static struct ksm_stable_node *alloc_stable_node_chain(struct ksm_stable_node *dup, struct rb_root *root) { struct ksm_stable_node *chain = alloc_stable_node(); VM_BUG_ON(is_stable_node_chain(dup)); if (likely(chain)) { INIT_HLIST_HEAD(&chain->hlist); chain->chain_prune_time = jiffies; chain->rmap_hlist_len = STABLE_NODE_CHAIN; #if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA) chain->nid = NUMA_NO_NODE; /* debug */ #endif ksm_stable_node_chains++; /* * Put the stable node chain in the first dimension of * the stable tree and at the same time remove the old * stable node. */ rb_replace_node(&dup->node, &chain->node, root); /* * Move the old stable node to the second dimension * queued in the hlist_dup. The invariant is that all * dup stable_nodes in the chain->hlist point to pages * that are write protected and have the exact same * content. */ stable_node_chain_add_dup(dup, chain); } return chain; } static inline void free_stable_node_chain(struct ksm_stable_node *chain, struct rb_root *root) { rb_erase(&chain->node, root); free_stable_node(chain); ksm_stable_node_chains--; } static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node) { struct ksm_rmap_item *rmap_item; /* check it's not STABLE_NODE_CHAIN or negative */ BUG_ON(stable_node->rmap_hlist_len < 0); hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { if (rmap_item->hlist.next) { ksm_pages_sharing--; trace_ksm_remove_rmap_item(stable_node->kpfn, rmap_item, rmap_item->mm); } else { ksm_pages_shared--; } rmap_item->mm->ksm_merging_pages--; VM_BUG_ON(stable_node->rmap_hlist_len <= 0); stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); rmap_item->address &= PAGE_MASK; cond_resched(); } /* * We need the second aligned pointer of the migrate_nodes * list_head to stay clear from the rb_parent_color union * (aligned and different than any node) and also different * from &migrate_nodes. This will verify that future list.h changes * don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it. 
*/ BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes); BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1); trace_ksm_remove_ksm_page(stable_node->kpfn); if (stable_node->head == &migrate_nodes) list_del(&stable_node->list); else stable_node_dup_del(stable_node); free_stable_node(stable_node); } enum ksm_get_folio_flags { KSM_GET_FOLIO_NOLOCK, KSM_GET_FOLIO_LOCK, KSM_GET_FOLIO_TRYLOCK }; /* * ksm_get_folio: checks if the page indicated by the stable node * is still its ksm page, despite having held no reference to it. * In which case we can trust the content of the page, and it * returns the gotten page; but if the page has now been zapped, * remove the stale node from the stable tree and return NULL. * But beware, the stable node's page might be being migrated. * * You would expect the stable_node to hold a reference to the ksm page. * But if it increments the page's count, swapping out has to wait for * ksmd to come around again before it can free the page, which may take * seconds or even minutes: much too unresponsive. So instead we use a * "keyhole reference": access to the ksm page from the stable node peeps * out through its keyhole to see if that page still holds the right key, * pointing back to this stable node. This relies on freeing a PageAnon * page to reset its page->mapping to NULL, and relies on no other use of * a page to put something that might look like our key in page->mapping. * is on its way to being freed; but it is an anomaly to bear in mind. */ static struct folio *ksm_get_folio(struct ksm_stable_node *stable_node, enum ksm_get_folio_flags flags) { struct folio *folio; void *expected_mapping; unsigned long kpfn; expected_mapping = (void *)((unsigned long)stable_node | FOLIO_MAPPING_KSM); again: kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */ folio = pfn_folio(kpfn); if (READ_ONCE(folio->mapping) != expected_mapping) goto stale; /* * We cannot do anything with the page while its refcount is 0. * Usually 0 means free, or tail of a higher-order page: in which * case this node is no longer referenced, and should be freed; * however, it might mean that the page is under page_ref_freeze(). * The __remove_mapping() case is easy, again the node is now stale; * the same is in reuse_ksm_page() case; but if page is swapcache * in folio_migrate_mapping(), it might still be our page, * in which case it's essential to keep the node. */ while (!folio_try_get(folio)) { /* * Another check for folio->mapping != expected_mapping * would work here too. We have chosen to test the * swapcache flag to optimize the common case, when the * folio is or is about to be freed: the swapcache flag * is cleared (under spin_lock_irq) in the ref_freeze * section of __remove_mapping(); but anon folio->mapping * is reset to NULL later, in free_pages_prepare(). */ if (!folio_test_swapcache(folio)) goto stale; cpu_relax(); } if (READ_ONCE(folio->mapping) != expected_mapping) { folio_put(folio); goto stale; } if (flags == KSM_GET_FOLIO_TRYLOCK) { if (!folio_trylock(folio)) { folio_put(folio); return ERR_PTR(-EBUSY); } } else if (flags == KSM_GET_FOLIO_LOCK) folio_lock(folio); if (flags != KSM_GET_FOLIO_NOLOCK) { if (READ_ONCE(folio->mapping) != expected_mapping) { folio_unlock(folio); folio_put(folio); goto stale; } } return folio; stale: /* * We come here from above when folio->mapping or the swapcache flag * suggests that the node is stale; but it might be under migration. 
* We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(), * before checking whether node->kpfn has been changed. */ smp_rmb(); if (READ_ONCE(stable_node->kpfn) != kpfn) goto again; remove_node_from_stable_tree(stable_node); return NULL; } /* * Removing rmap_item from stable or unstable tree. * This function will clean the information from the stable/unstable tree. */ static void remove_rmap_item_from_tree(struct ksm_rmap_item *rmap_item) { if (rmap_item->address & STABLE_FLAG) { struct ksm_stable_node *stable_node; struct folio *folio; stable_node = rmap_item->head; folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_LOCK); if (!folio) goto out; hlist_del(&rmap_item->hlist); folio_unlock(folio); folio_put(folio); if (!hlist_empty(&stable_node->hlist)) ksm_pages_sharing--; else ksm_pages_shared--; rmap_item->mm->ksm_merging_pages--; VM_BUG_ON(stable_node->rmap_hlist_len <= 0); stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); rmap_item->head = NULL; rmap_item->address &= PAGE_MASK; } else if (rmap_item->address & UNSTABLE_FLAG) { unsigned char age; /* * Usually ksmd can and must skip the rb_erase, because * root_unstable_tree was already reset to RB_ROOT. * But be careful when an mm is exiting: do the rb_erase * if this rmap_item was inserted by this scan, rather * than left over from before. */ age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); BUG_ON(age > 1); if (!age) rb_erase(&rmap_item->node, root_unstable_tree + NUMA(rmap_item->nid)); ksm_pages_unshared--; rmap_item->address &= PAGE_MASK; } out: cond_resched(); /* we're called from many long loops */ } static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list) { while (*rmap_list) { struct ksm_rmap_item *rmap_item = *rmap_list; *rmap_list = rmap_item->rmap_list; remove_rmap_item_from_tree(rmap_item); free_rmap_item(rmap_item); } } static inline struct ksm_stable_node *folio_stable_node(const struct folio *folio) { return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL; } static inline void folio_set_stable_node(struct folio *folio, struct ksm_stable_node *stable_node) { VM_WARN_ON_FOLIO(folio_test_anon(folio) && PageAnonExclusive(&folio->page), folio); folio->mapping = (void *)((unsigned long)stable_node | FOLIO_MAPPING_KSM); } #ifdef CONFIG_SYSFS /* * Only called through the sysfs control interface: */ static int remove_stable_node(struct ksm_stable_node *stable_node) { struct folio *folio; int err; folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_LOCK); if (!folio) { /* * ksm_get_folio did remove_node_from_stable_tree itself. */ return 0; } /* * Page could be still mapped if this races with __mmput() running in * between ksm_exit() and exit_mmap(). Just refuse to let * merge_across_nodes/max_page_sharing be switched. */ err = -EBUSY; if (!folio_mapped(folio)) { /* * The stable node did not yet appear stale to ksm_get_folio(), * since that allows for an unmapped ksm folio to be recognized * right up until it is freed; but the node is safe to remove. * This folio might be in an LRU cache waiting to be freed, * or it might be in the swapcache (perhaps under writeback), * or it might have been removed from swapcache a moment ago. 
*/ folio_set_stable_node(folio, NULL); remove_node_from_stable_tree(stable_node); err = 0; } folio_unlock(folio); folio_put(folio); return err; } static int remove_stable_node_chain(struct ksm_stable_node *stable_node, struct rb_root *root) { struct ksm_stable_node *dup; struct hlist_node *hlist_safe; if (!is_stable_node_chain(stable_node)) { VM_BUG_ON(is_stable_node_dup(stable_node)); if (remove_stable_node(stable_node)) return true; else return false; } hlist_for_each_entry_safe(dup, hlist_safe, &stable_node->hlist, hlist_dup) { VM_BUG_ON(!is_stable_node_dup(dup)); if (remove_stable_node(dup)) return true; } BUG_ON(!hlist_empty(&stable_node->hlist)); free_stable_node_chain(stable_node, root); return false; } static int remove_all_stable_nodes(void) { struct ksm_stable_node *stable_node, *next; int nid; int err = 0; for (nid = 0; nid < ksm_nr_node_ids; nid++) { while (root_stable_tree[nid].rb_node) { stable_node = rb_entry(root_stable_tree[nid].rb_node, struct ksm_stable_node, node); if (remove_stable_node_chain(stable_node, root_stable_tree + nid)) { err = -EBUSY; break; /* proceed to next nid */ } cond_resched(); } } list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { if (remove_stable_node(stable_node)) err = -EBUSY; cond_resched(); } return err; } static int unmerge_and_remove_all_rmap_items(void) { struct ksm_mm_slot *mm_slot; struct mm_slot *slot; struct mm_struct *mm; struct vm_area_struct *vma; int err = 0; spin_lock(&ksm_mmlist_lock); slot = list_entry(ksm_mm_head.slot.mm_node.next, struct mm_slot, mm_node); ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); spin_unlock(&ksm_mmlist_lock); for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) { VMA_ITERATOR(vmi, mm_slot->slot.mm, 0); mm = mm_slot->slot.mm; mmap_read_lock(mm); /* * Exit right away if mm is exiting to avoid lockdep issue in * the maple tree */ if (ksm_test_exit(mm)) goto mm_exiting; for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) continue; err = break_ksm(vma, vma->vm_start, vma->vm_end, false); if (err) goto error; } mm_exiting: remove_trailing_rmap_items(&mm_slot->rmap_list); mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); slot = list_entry(mm_slot->slot.mm_node.next, struct mm_slot, mm_node); ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (ksm_test_exit(mm)) { hash_del(&mm_slot->slot.hash); list_del(&mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); mm_flags_clear(MMF_VM_MERGEABLE, mm); mm_flags_clear(MMF_VM_MERGE_ANY, mm); mmdrop(mm); } else spin_unlock(&ksm_mmlist_lock); } /* Clean up stable nodes, but don't worry if some are still busy */ remove_all_stable_nodes(); ksm_scan.seqnr = 0; return 0; error: mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = &ksm_mm_head; spin_unlock(&ksm_mmlist_lock); return err; } #endif /* CONFIG_SYSFS */ static u32 calc_checksum(struct page *page) { u32 checksum; void *addr = kmap_local_page(page); checksum = xxhash(addr, PAGE_SIZE, 0); kunmap_local(addr); return checksum; } static int write_protect_page(struct vm_area_struct *vma, struct folio *folio, pte_t *orig_pte) { struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, 0, 0); int swapped; int err = -EFAULT; struct mmu_notifier_range range; bool anon_exclusive; pte_t entry; if (WARN_ON_ONCE(folio_test_large(folio))) return err; pvmw.address = page_address_in_vma(folio, folio_page(folio, 0), vma); if (pvmw.address == -EFAULT) 
goto out; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, pvmw.address, pvmw.address + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); if (!page_vma_mapped_walk(&pvmw)) goto out_mn; if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?")) goto out_unlock; entry = ptep_get(pvmw.pte); /* * Handle PFN swap PTEs, such as device-exclusive ones, that actually * map pages: give up just like the next folio_walk would. */ if (unlikely(!pte_present(entry))) goto out_unlock; anon_exclusive = PageAnonExclusive(&folio->page); if (pte_write(entry) || pte_dirty(entry) || anon_exclusive || mm_tlb_flush_pending(mm)) { swapped = folio_test_swapcache(folio); flush_cache_page(vma, pvmw.address, folio_pfn(folio)); /* * Ok this is tricky, when get_user_pages_fast() run it doesn't * take any lock, therefore the check that we are going to make * with the pagecount against the mapcount is racy and * O_DIRECT can happen right after the check. * So we clear the pte and flush the tlb before the check * this assure us that no O_DIRECT can happen after the check * or in the middle of the check. * * No need to notify as we are downgrading page table to read * only not changing it to point to a new page. * * See Documentation/mm/mmu_notifier.rst */ entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte); /* * Check that no O_DIRECT or similar I/O is in progress on the * page */ if (folio_mapcount(folio) + 1 + swapped != folio_ref_count(folio)) { set_pte_at(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; } /* See folio_try_share_anon_rmap_pte(): clear PTE first. */ if (anon_exclusive && folio_try_share_anon_rmap_pte(folio, &folio->page)) { set_pte_at(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; } if (pte_dirty(entry)) folio_mark_dirty(folio); entry = pte_mkclean(entry); if (pte_write(entry)) entry = pte_wrprotect(entry); set_pte_at(mm, pvmw.address, pvmw.pte, entry); } *orig_pte = entry; err = 0; out_unlock: page_vma_mapped_walk_done(&pvmw); out_mn: mmu_notifier_invalidate_range_end(&range); out: return err; } /** * replace_page - replace page in vma by new ksm page * @vma: vma that holds the pte pointing to page * @page: the page we are replacing by kpage * @kpage: the ksm page we replace page by * @orig_pte: the original value of the pte * * Returns 0 on success, -EFAULT on failure. */ static int replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage, pte_t orig_pte) { struct folio *kfolio = page_folio(kpage); struct mm_struct *mm = vma->vm_mm; struct folio *folio = page_folio(page); pmd_t *pmd; pmd_t pmde; pte_t *ptep; pte_t newpte; spinlock_t *ptl; unsigned long addr; int err = -EFAULT; struct mmu_notifier_range range; addr = page_address_in_vma(folio, page, vma); if (addr == -EFAULT) goto out; pmd = mm_find_pmd(mm, addr); if (!pmd) goto out; /* * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at() * without holding anon_vma lock for write. So when looking for a * genuine pmde (in which to find pte), test present and !THP together. 
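 * For example (illustrative of the race only): between pmdp_huge_clear_flush()
 * and set_pmd_at() the entry is transiently empty, so the lockless read below
 * may observe pmd_none(); just before or after that window it may still
 * observe a pmd_trans_huge() entry.  Either way there is no pte table to map,
 * hence the combined check rather than pmd_present() alone.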
*/ pmde = pmdp_get_lockless(pmd); if (!pmd_present(pmde) || pmd_trans_huge(pmde)) goto out; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!ptep) goto out_mn; if (!pte_same(ptep_get(ptep), orig_pte)) { pte_unmap_unlock(ptep, ptl); goto out_mn; } VM_BUG_ON_PAGE(PageAnonExclusive(page), page); VM_BUG_ON_FOLIO(folio_test_anon(kfolio) && PageAnonExclusive(kpage), kfolio); /* * No need to check ksm_use_zero_pages here: we can only have a * zero_page here if ksm_use_zero_pages was enabled already. */ if (!is_zero_pfn(page_to_pfn(kpage))) { folio_get(kfolio); folio_add_anon_rmap_pte(kfolio, kpage, vma, addr, RMAP_NONE); newpte = mk_pte(kpage, vma->vm_page_prot); } else { /* * Use pte_mkdirty to mark the zero page mapped by KSM, and then * we can easily track all KSM-placed zero pages by checking if * the dirty bit in zero page's PTE is set. */ newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot))); ksm_map_zero_page(mm); /* * We're replacing an anonymous page with a zero page, which is * not anonymous. We need to do proper accounting otherwise we * will get wrong values in /proc, and a BUG message in dmesg * when tearing down the mm. */ dec_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, addr, pte_pfn(ptep_get(ptep))); /* * No need to notify as we are replacing a read only page with another * read only page with the same content. * * See Documentation/mm/mmu_notifier.rst */ ptep_clear_flush(vma, addr, ptep); set_pte_at(mm, addr, ptep, newpte); folio_remove_rmap_pte(folio, page, vma); if (!folio_mapped(folio)) folio_free_swap(folio); folio_put(folio); pte_unmap_unlock(ptep, ptl); err = 0; out_mn: mmu_notifier_invalidate_range_end(&range); out: return err; } /* * try_to_merge_one_page - take two pages and merge them into one * @vma: the vma that holds the pte pointing to page * @page: the PageAnon page that we want to replace with kpage * @kpage: the KSM page that we want to map instead of page, * or NULL the first time when we want to use page as kpage. * * This function returns 0 if the pages were merged, -EFAULT otherwise. */ static int try_to_merge_one_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) { struct folio *folio = page_folio(page); pte_t orig_pte = __pte(0); int err = -EFAULT; if (page == kpage) /* ksm page forked */ return 0; if (!folio_test_anon(folio)) goto out; /* * We need the folio lock to read a stable swapcache flag in * write_protect_page(). We trylock because we don't want to wait * here - we prefer to continue scanning and merging different * pages, then come back to this page when it is unlocked. */ if (!folio_trylock(folio)) goto out; if (folio_test_large(folio)) { if (split_huge_page(page)) goto out_unlock; folio = page_folio(page); } /* * If this anonymous page is mapped only here, its pte may need * to be write-protected. If it's mapped elsewhere, all of its * ptes are necessarily already write-protected. But in either * case, we need to lock and check page_count is not raised. */ if (write_protect_page(vma, folio, &orig_pte) == 0) { if (!kpage) { /* * While we hold folio lock, upgrade folio from * anon to a NULL stable_node with the KSM flag set: * stable_tree_insert() will update stable_node. */ folio_set_stable_node(folio, NULL); folio_mark_accessed(folio); /* * Page reclaim just frees a clean folio with no dirty * ptes: make sure that the ksm page would be swapped. 
*/ if (!folio_test_dirty(folio)) folio_mark_dirty(folio); err = 0; } else if (pages_identical(page, kpage)) err = replace_page(vma, page, kpage, orig_pte); } out_unlock: folio_unlock(folio); out: return err; } /* * This function returns 0 if the pages were merged or if they are * no longer merging candidates (e.g., VMA stale), -EFAULT otherwise. */ static int try_to_merge_with_zero_page(struct ksm_rmap_item *rmap_item, struct page *page) { struct mm_struct *mm = rmap_item->mm; int err = -EFAULT; /* * Same checksum as an empty page. We attempt to merge it with the * appropriate zero page if the user enabled this via sysfs. */ if (ksm_use_zero_pages && (rmap_item->oldchecksum == zero_checksum)) { struct vm_area_struct *vma; mmap_read_lock(mm); vma = find_mergeable_vma(mm, rmap_item->address); if (vma) { err = try_to_merge_one_page(vma, page, ZERO_PAGE(rmap_item->address)); trace_ksm_merge_one_page( page_to_pfn(ZERO_PAGE(rmap_item->address)), rmap_item, mm, err); } else { /* * If the vma is out of date, we do not need to * continue. */ err = 0; } mmap_read_unlock(mm); } return err; } /* * try_to_merge_with_ksm_page - like try_to_merge_two_pages, * but no new kernel page is allocated: kpage must already be a ksm page. * * This function returns 0 if the pages were merged, -EFAULT otherwise. */ static int try_to_merge_with_ksm_page(struct ksm_rmap_item *rmap_item, struct page *page, struct page *kpage) { struct mm_struct *mm = rmap_item->mm; struct vm_area_struct *vma; int err = -EFAULT; mmap_read_lock(mm); vma = find_mergeable_vma(mm, rmap_item->address); if (!vma) goto out; err = try_to_merge_one_page(vma, page, kpage); if (err) goto out; /* Unstable nid is in union with stable anon_vma: remove first */ remove_rmap_item_from_tree(rmap_item); /* Must get reference to anon_vma while still holding mmap_lock */ rmap_item->anon_vma = vma->anon_vma; get_anon_vma(vma->anon_vma); out: mmap_read_unlock(mm); trace_ksm_merge_with_ksm_page(kpage, page_to_pfn(kpage ? kpage : page), rmap_item, mm, err); return err; } /* * try_to_merge_two_pages - take two identical pages and prepare them * to be merged into one page. * * This function returns the kpage if we successfully merged two identical * pages into one ksm page, NULL otherwise. * * Note that this function upgrades page to ksm page: if one of the pages * is already a ksm page, try_to_merge_with_ksm_page should be used. */ static struct folio *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item, struct page *page, struct ksm_rmap_item *tree_rmap_item, struct page *tree_page) { int err; err = try_to_merge_with_ksm_page(rmap_item, page, NULL); if (!err) { err = try_to_merge_with_ksm_page(tree_rmap_item, tree_page, page); /* * If that fails, we have a ksm page with only one pte * pointing to it: so break it. */ if (err) break_cow(rmap_item); } return err ? NULL : page_folio(page); } static __always_inline bool __is_page_sharing_candidate(struct ksm_stable_node *stable_node, int offset) { VM_BUG_ON(stable_node->rmap_hlist_len < 0); /* * Check that at least one mapping still exists, otherwise * there's no much point to merge and share with this * stable_node, as the underlying tree_page of the other * sharer is going to be freed soon. 
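 * Worked example (illustrative, using the default ksm_max_page_sharing of 256):
 * a dup with rmap_hlist_len == 255 still qualifies for offset 0 (255 < 256) but
 * not for offset 1, which asks whether one more mapping would fit on top of the
 * merge already underway; a dup with rmap_hlist_len == 0 never qualifies.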
*/ return stable_node->rmap_hlist_len && stable_node->rmap_hlist_len + offset < ksm_max_page_sharing; } static __always_inline bool is_page_sharing_candidate(struct ksm_stable_node *stable_node) { return __is_page_sharing_candidate(stable_node, 0); } static struct folio *stable_node_dup(struct ksm_stable_node **_stable_node_dup, struct ksm_stable_node **_stable_node, struct rb_root *root, bool prune_stale_stable_nodes) { struct ksm_stable_node *dup, *found = NULL, *stable_node = *_stable_node; struct hlist_node *hlist_safe; struct folio *folio, *tree_folio = NULL; int found_rmap_hlist_len; if (!prune_stale_stable_nodes || time_before(jiffies, stable_node->chain_prune_time + msecs_to_jiffies( ksm_stable_node_chains_prune_millisecs))) prune_stale_stable_nodes = false; else stable_node->chain_prune_time = jiffies; hlist_for_each_entry_safe(dup, hlist_safe, &stable_node->hlist, hlist_dup) { cond_resched(); /* * We must walk all stable_node_dup to prune the stale * stable nodes during lookup. * * ksm_get_folio can drop the nodes from the * stable_node->hlist if they point to freed pages * (that's why we do a _safe walk). The "dup" * stable_node parameter itself will be freed from * under us if it returns NULL. */ folio = ksm_get_folio(dup, KSM_GET_FOLIO_NOLOCK); if (!folio) continue; /* Pick the best candidate if possible. */ if (!found || (is_page_sharing_candidate(dup) && (!is_page_sharing_candidate(found) || dup->rmap_hlist_len > found_rmap_hlist_len))) { if (found) folio_put(tree_folio); found = dup; found_rmap_hlist_len = found->rmap_hlist_len; tree_folio = folio; /* skip put_page for found candidate */ if (!prune_stale_stable_nodes && is_page_sharing_candidate(found)) break; continue; } folio_put(folio); } if (found) { if (hlist_is_singular_node(&found->hlist_dup, &stable_node->hlist)) { /* * If there's not just one entry it would * corrupt memory, better BUG_ON. In KSM * context with no lock held it's not even * fatal. */ BUG_ON(stable_node->hlist.first->next); /* * There's just one entry and it is below the * deduplication limit so drop the chain. */ rb_replace_node(&stable_node->node, &found->node, root); free_stable_node(stable_node); ksm_stable_node_chains--; ksm_stable_node_dups--; /* * NOTE: the caller depends on the stable_node * to be equal to stable_node_dup if the chain * was collapsed. */ *_stable_node = found; /* * Just for robustness, as stable_node is * otherwise left as a stable pointer, the * compiler shall optimize it away at build * time. */ stable_node = NULL; } else if (stable_node->hlist.first != &found->hlist_dup && __is_page_sharing_candidate(found, 1)) { /* * If the found stable_node dup can accept one * more future merge (in addition to the one * that is underway) and is not at the head of * the chain, put it there so next search will * be quicker in the !prune_stale_stable_nodes * case. * * NOTE: it would be inaccurate to use nr > 1 * instead of checking the hlist.first pointer * directly, because in the * prune_stale_stable_nodes case "nr" isn't * the position of the found dup in the chain, * but the total number of dups in the chain. */ hlist_del(&found->hlist_dup); hlist_add_head(&found->hlist_dup, &stable_node->hlist); } } else { /* Its hlist must be empty if no one found. */ free_stable_node_chain(stable_node, root); } *_stable_node_dup = found; return tree_folio; } /* * Like for ksm_get_folio, this function can free the *_stable_node and * *_stable_node_dup if the returned tree_page is NULL. 
* * It can also free and overwrite *_stable_node with the found * stable_node_dup if the chain is collapsed (in which case * *_stable_node will be equal to *_stable_node_dup like if the chain * never existed). It's up to the caller to verify tree_page is not * NULL before dereferencing *_stable_node or *_stable_node_dup. * * *_stable_node_dup is really a second output parameter of this * function and will be overwritten in all cases, the caller doesn't * need to initialize it. */ static struct folio *__stable_node_chain(struct ksm_stable_node **_stable_node_dup, struct ksm_stable_node **_stable_node, struct rb_root *root, bool prune_stale_stable_nodes) { struct ksm_stable_node *stable_node = *_stable_node; if (!is_stable_node_chain(stable_node)) { *_stable_node_dup = stable_node; return ksm_get_folio(stable_node, KSM_GET_FOLIO_NOLOCK); } return stable_node_dup(_stable_node_dup, _stable_node, root, prune_stale_stable_nodes); } static __always_inline struct folio *chain_prune(struct ksm_stable_node **s_n_d, struct ksm_stable_node **s_n, struct rb_root *root) { return __stable_node_chain(s_n_d, s_n, root, true); } static __always_inline struct folio *chain(struct ksm_stable_node **s_n_d, struct ksm_stable_node **s_n, struct rb_root *root) { return __stable_node_chain(s_n_d, s_n, root, false); } /* * stable_tree_search - search for page inside the stable tree * * This function checks if there is a page inside the stable tree * with identical content to the page that we are scanning right now. * * This function returns the stable tree node of identical content if found, * -EBUSY if the stable node's page is being migrated, NULL otherwise. */ static struct folio *stable_tree_search(struct page *page) { int nid; struct rb_root *root; struct rb_node **new; struct rb_node *parent; struct ksm_stable_node *stable_node, *stable_node_dup; struct ksm_stable_node *page_node; struct folio *folio; folio = page_folio(page); page_node = folio_stable_node(folio); if (page_node && page_node->head != &migrate_nodes) { /* ksm page forked */ folio_get(folio); return folio; } nid = get_kpfn_nid(folio_pfn(folio)); root = root_stable_tree + nid; again: new = &root->rb_node; parent = NULL; while (*new) { struct folio *tree_folio; int ret; cond_resched(); stable_node = rb_entry(*new, struct ksm_stable_node, node); tree_folio = chain_prune(&stable_node_dup, &stable_node, root); if (!tree_folio) { /* * If we walked over a stale stable_node, * ksm_get_folio() will call rb_erase() and it * may rebalance the tree from under us. So * restart the search from scratch. Returning * NULL would be safe too, but we'd generate * false negative insertions just because some * stable_node was stale. */ goto again; } ret = memcmp_pages(page, &tree_folio->page); folio_put(tree_folio); parent = *new; if (ret < 0) new = &parent->rb_left; else if (ret > 0) new = &parent->rb_right; else { if (page_node) { VM_BUG_ON(page_node->head != &migrate_nodes); /* * If the mapcount of our migrated KSM folio is * at most 1, we can merge it with another * KSM folio where we know that we have space * for one more mapping without exceeding the * ksm_max_page_sharing limit: see * chain_prune(). This way, we can avoid adding * this stable node to the chain. 
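 * For example (illustrative): the migrated folio is mapped exactly once and the
 * matching dup found by chain_prune() has room for one more sharer; rather than
 * parking our stable node on the chain as a second copy of the same content,
 * the single remaining mapping can simply be merged into that dup below.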
*/ if (folio_mapcount(folio) > 1) goto chain_append; } if (!is_page_sharing_candidate(stable_node_dup)) { /* * If the stable_node is a chain and * we got a payload match in memcmp * but we cannot merge the scanned * page in any of the existing * stable_node dups because they're * all full, we need to wait the * scanned page to find itself a match * in the unstable tree to create a * brand new KSM page to add later to * the dups of this stable_node. */ return NULL; } /* * Lock and unlock the stable_node's page (which * might already have been migrated) so that page * migration is sure to notice its raised count. * It would be more elegant to return stable_node * than kpage, but that involves more changes. */ tree_folio = ksm_get_folio(stable_node_dup, KSM_GET_FOLIO_TRYLOCK); if (PTR_ERR(tree_folio) == -EBUSY) return ERR_PTR(-EBUSY); if (unlikely(!tree_folio)) /* * The tree may have been rebalanced, * so re-evaluate parent and new. */ goto again; folio_unlock(tree_folio); if (get_kpfn_nid(stable_node_dup->kpfn) != NUMA(stable_node_dup->nid)) { folio_put(tree_folio); goto replace; } return tree_folio; } } if (!page_node) return NULL; list_del(&page_node->list); DO_NUMA(page_node->nid = nid); rb_link_node(&page_node->node, parent, new); rb_insert_color(&page_node->node, root); out: if (is_page_sharing_candidate(page_node)) { folio_get(folio); return folio; } else return NULL; replace: /* * If stable_node was a chain and chain_prune collapsed it, * stable_node has been updated to be the new regular * stable_node. A collapse of the chain is indistinguishable * from the case there was no chain in the stable * rbtree. Otherwise stable_node is the chain and * stable_node_dup is the dup to replace. */ if (stable_node_dup == stable_node) { VM_BUG_ON(is_stable_node_chain(stable_node_dup)); VM_BUG_ON(is_stable_node_dup(stable_node_dup)); /* there is no chain */ if (page_node) { VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); DO_NUMA(page_node->nid = nid); rb_replace_node(&stable_node_dup->node, &page_node->node, root); if (is_page_sharing_candidate(page_node)) folio_get(folio); else folio = NULL; } else { rb_erase(&stable_node_dup->node, root); folio = NULL; } } else { VM_BUG_ON(!is_stable_node_chain(stable_node)); __stable_node_dup_del(stable_node_dup); if (page_node) { VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); DO_NUMA(page_node->nid = nid); stable_node_chain_add_dup(page_node, stable_node); if (is_page_sharing_candidate(page_node)) folio_get(folio); else folio = NULL; } else { folio = NULL; } } stable_node_dup->head = &migrate_nodes; list_add(&stable_node_dup->list, stable_node_dup->head); return folio; chain_append: /* * If stable_node was a chain and chain_prune collapsed it, * stable_node has been updated to be the new regular * stable_node. A collapse of the chain is indistinguishable * from the case there was no chain in the stable * rbtree. Otherwise stable_node is the chain and * stable_node_dup is the dup to replace. */ if (stable_node_dup == stable_node) { VM_BUG_ON(is_stable_node_dup(stable_node_dup)); /* chain is missing so create it */ stable_node = alloc_stable_node_chain(stable_node_dup, root); if (!stable_node) return NULL; } /* * Add this stable_node dup that was * migrated to the stable_node chain * of the current nid for this page * content. 
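 * A rough picture of the result (illustrative layout only):
 *
 *   rbtree slot: stable_node chain
 *       hlist_dup: dup A (kpfn already on this nid)
 *       hlist_dup: page_node (the dup just migrated here)
 *
 * i.e. several physical KSM pages with identical content hang off one rbtree
 * slot, each dup keeping its own rmap_hlist of sharers.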
*/ VM_BUG_ON(!is_stable_node_dup(stable_node_dup)); VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); DO_NUMA(page_node->nid = nid); stable_node_chain_add_dup(page_node, stable_node); goto out; } /* * stable_tree_insert - insert stable tree node pointing to new ksm page * into the stable tree. * * This function returns the stable tree node just allocated on success, * NULL otherwise. */ static struct ksm_stable_node *stable_tree_insert(struct folio *kfolio) { int nid; unsigned long kpfn; struct rb_root *root; struct rb_node **new; struct rb_node *parent; struct ksm_stable_node *stable_node, *stable_node_dup; bool need_chain = false; kpfn = folio_pfn(kfolio); nid = get_kpfn_nid(kpfn); root = root_stable_tree + nid; again: parent = NULL; new = &root->rb_node; while (*new) { struct folio *tree_folio; int ret; cond_resched(); stable_node = rb_entry(*new, struct ksm_stable_node, node); tree_folio = chain(&stable_node_dup, &stable_node, root); if (!tree_folio) { /* * If we walked over a stale stable_node, * ksm_get_folio() will call rb_erase() and it * may rebalance the tree from under us. So * restart the search from scratch. Returning * NULL would be safe too, but we'd generate * false negative insertions just because some * stable_node was stale. */ goto again; } ret = memcmp_pages(&kfolio->page, &tree_folio->page); folio_put(tree_folio); parent = *new; if (ret < 0) new = &parent->rb_left; else if (ret > 0) new = &parent->rb_right; else { need_chain = true; break; } } stable_node_dup = alloc_stable_node(); if (!stable_node_dup) return NULL; INIT_HLIST_HEAD(&stable_node_dup->hlist); stable_node_dup->kpfn = kpfn; stable_node_dup->rmap_hlist_len = 0; DO_NUMA(stable_node_dup->nid = nid); if (!need_chain) { rb_link_node(&stable_node_dup->node, parent, new); rb_insert_color(&stable_node_dup->node, root); } else { if (!is_stable_node_chain(stable_node)) { struct ksm_stable_node *orig = stable_node; /* chain is missing so create it */ stable_node = alloc_stable_node_chain(orig, root); if (!stable_node) { free_stable_node(stable_node_dup); return NULL; } } stable_node_chain_add_dup(stable_node_dup, stable_node); } folio_set_stable_node(kfolio, stable_node_dup); return stable_node_dup; } /* * unstable_tree_search_insert - search for identical page, * else insert rmap_item into the unstable tree. * * This function searches for a page in the unstable tree identical to the * page currently being scanned; and if no identical page is found in the * tree, we insert rmap_item as a new object into the unstable tree. * * This function returns pointer to rmap_item found to be identical * to the currently scanned page, NULL otherwise. * * This function does both searching and inserting, because they share * the same walking algorithm in an rbtree. */ static struct ksm_rmap_item *unstable_tree_search_insert(struct ksm_rmap_item *rmap_item, struct page *page, struct page **tree_pagep) { struct rb_node **new; struct rb_root *root; struct rb_node *parent = NULL; int nid; nid = get_kpfn_nid(page_to_pfn(page)); root = root_unstable_tree + nid; new = &root->rb_node; while (*new) { struct ksm_rmap_item *tree_rmap_item; struct page *tree_page; int ret; cond_resched(); tree_rmap_item = rb_entry(*new, struct ksm_rmap_item, node); tree_page = get_mergeable_page(tree_rmap_item); if (!tree_page) return NULL; /* * Don't substitute a ksm page for a forked page. 
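 * (i.e. if the candidate sitting in the unstable tree turns out to be the very
 * same physical page we are scanning, typically because a fork left both mms
 * mapping it, there is nothing to merge and we bail out below.)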
*/ if (page == tree_page) { put_page(tree_page); return NULL; } ret = memcmp_pages(page, tree_page); parent = *new; if (ret < 0) { put_page(tree_page); new = &parent->rb_left; } else if (ret > 0) { put_page(tree_page); new = &parent->rb_right; } else if (!ksm_merge_across_nodes && page_to_nid(tree_page) != nid) { /* * If tree_page has been migrated to another NUMA node, * it will be flushed out and put in the right unstable * tree next time: only merge with it when across_nodes. */ put_page(tree_page); return NULL; } else { *tree_pagep = tree_page; return tree_rmap_item; } } rmap_item->address |= UNSTABLE_FLAG; rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); DO_NUMA(rmap_item->nid = nid); rb_link_node(&rmap_item->node, parent, new); rb_insert_color(&rmap_item->node, root); ksm_pages_unshared++; return NULL; } /* * stable_tree_append - add another rmap_item to the linked list of * rmap_items hanging off a given node of the stable tree, all sharing * the same ksm page. */ static void stable_tree_append(struct ksm_rmap_item *rmap_item, struct ksm_stable_node *stable_node, bool max_page_sharing_bypass) { /* * rmap won't find this mapping if we don't insert the * rmap_item in the right stable_node * duplicate. page_migration could break later if rmap breaks, * so we can as well crash here. We really need to check for * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check * for other negative values as an underflow if detected here * for the first time (and not when decreasing rmap_hlist_len) * would be sign of memory corruption in the stable_node. */ BUG_ON(stable_node->rmap_hlist_len < 0); stable_node->rmap_hlist_len++; if (!max_page_sharing_bypass) /* possibly non fatal but unexpected overflow, only warn */ WARN_ON_ONCE(stable_node->rmap_hlist_len > ksm_max_page_sharing); rmap_item->head = stable_node; rmap_item->address |= STABLE_FLAG; hlist_add_head(&rmap_item->hlist, &stable_node->hlist); if (rmap_item->hlist.next) ksm_pages_sharing++; else ksm_pages_shared++; rmap_item->mm->ksm_merging_pages++; } /* * cmp_and_merge_page - first see if page can be merged into the stable tree; * if not, compare checksum to previous and if it's the same, see if page can * be inserted into the unstable tree, or merged with a page already there and * both transferred to the stable tree. * * @page: the page that we are searching identical page to. * @rmap_item: the reverse mapping into the virtual address of this page */ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item) { struct folio *folio = page_folio(page); struct ksm_rmap_item *tree_rmap_item; struct page *tree_page = NULL; struct ksm_stable_node *stable_node; struct folio *kfolio; unsigned int checksum; int err; bool max_page_sharing_bypass = false; stable_node = folio_stable_node(folio); if (stable_node) { if (stable_node->head != &migrate_nodes && get_kpfn_nid(READ_ONCE(stable_node->kpfn)) != NUMA(stable_node->nid)) { stable_node_dup_del(stable_node); stable_node->head = &migrate_nodes; list_add(&stable_node->list, stable_node->head); } if (stable_node->head != &migrate_nodes && rmap_item->head == stable_node) return; /* * If it's a KSM fork, allow it to go over the sharing limit * without warnings. 
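 * (A forked KSM page already shares the stable node with its parent, so the
 * extra rmap_item is accepted even if the dup is formally full; the bypass flag
 * only tells stable_tree_append() below not to WARN about the overflow.)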
*/ if (!is_page_sharing_candidate(stable_node)) max_page_sharing_bypass = true; } else { remove_rmap_item_from_tree(rmap_item); /* * If the hash value of the page has changed from the last time * we calculated it, this page is changing frequently: therefore we * don't want to insert it in the unstable tree, and we don't want * to waste our time searching for something identical to it there. */ checksum = calc_checksum(page); if (rmap_item->oldchecksum != checksum) { rmap_item->oldchecksum = checksum; return; } if (!try_to_merge_with_zero_page(rmap_item, page)) return; } /* Start by searching for the folio in the stable tree */ kfolio = stable_tree_search(page); if (kfolio == folio && rmap_item->head == stable_node) { folio_put(kfolio); return; } remove_rmap_item_from_tree(rmap_item); if (kfolio) { if (kfolio == ERR_PTR(-EBUSY)) return; err = try_to_merge_with_ksm_page(rmap_item, page, &kfolio->page); if (!err) { /* * The page was successfully merged: * add its rmap_item to the stable tree. */ folio_lock(kfolio); stable_tree_append(rmap_item, folio_stable_node(kfolio), max_page_sharing_bypass); folio_unlock(kfolio); } folio_put(kfolio); return; } tree_rmap_item = unstable_tree_search_insert(rmap_item, page, &tree_page); if (tree_rmap_item) { bool split; kfolio = try_to_merge_two_pages(rmap_item, page, tree_rmap_item, tree_page); /* * If both pages we tried to merge belong to the same compound * page, then we actually ended up increasing the reference * count of the same compound page twice, and split_huge_page * failed. * Here we set a flag if that happened, and we use it later to * try split_huge_page again. Since we call put_page right * afterwards, the reference count will be correct and * split_huge_page should succeed. */ split = PageTransCompound(page) && compound_head(page) == compound_head(tree_page); put_page(tree_page); if (kfolio) { /* * The pages were successfully merged: insert new * node in the stable tree and add both rmap_items. */ folio_lock(kfolio); stable_node = stable_tree_insert(kfolio); if (stable_node) { stable_tree_append(tree_rmap_item, stable_node, false); stable_tree_append(rmap_item, stable_node, false); } folio_unlock(kfolio); /* * If we fail to insert the page into the stable tree, * we will have 2 virtual addresses that are pointing * to a ksm page left outside the stable tree, * in which case we need to break_cow on both. */ if (!stable_node) { break_cow(tree_rmap_item); break_cow(rmap_item); } } else if (split) { /* * We are here if we tried to merge two pages and * failed because they both belonged to the same * compound page. We will split the page now, but no * merging will take place. * We do not want to add the cost of a full lock; if * the page is locked, it is better to skip it and * perhaps try again later. 
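 * For example (illustrative): page and tree_page were two subpages of one THP
 * holding identical bytes; because we held a reference on each, the compound
 * page carried an unexpected extra pin and split_huge_page() failed inside
 * try_to_merge_one_page().  Now that tree_page has been released above, the
 * trylock-and-split below can succeed, and a later scan may merge the two
 * resulting small pages normally.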
*/ if (!folio_trylock(folio)) return; split_huge_page(page); folio = page_folio(page); folio_unlock(folio); } } } static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, struct ksm_rmap_item **rmap_list, unsigned long addr) { struct ksm_rmap_item *rmap_item; while (*rmap_list) { rmap_item = *rmap_list; if ((rmap_item->address & PAGE_MASK) == addr) return rmap_item; if (rmap_item->address > addr) break; *rmap_list = rmap_item->rmap_list; remove_rmap_item_from_tree(rmap_item); free_rmap_item(rmap_item); } rmap_item = alloc_rmap_item(); if (rmap_item) { /* It has already been zeroed */ rmap_item->mm = mm_slot->slot.mm; rmap_item->mm->ksm_rmap_items++; rmap_item->address = addr; rmap_item->rmap_list = *rmap_list; *rmap_list = rmap_item; } return rmap_item; } /* * Calculate skip age for the ksm page age. The age determines how often * de-duplicating has already been tried unsuccessfully. If the age is * smaller, the scanning of this page is skipped for less scans. * * @age: rmap_item age of page */ static unsigned int skip_age(rmap_age_t age) { if (age <= 3) return 1; if (age <= 5) return 2; if (age <= 8) return 4; return 8; } /* * Determines if a page should be skipped for the current scan. * * @folio: folio containing the page to check * @rmap_item: associated rmap_item of page */ static bool should_skip_rmap_item(struct folio *folio, struct ksm_rmap_item *rmap_item) { rmap_age_t age; if (!ksm_smart_scan) return false; /* * Never skip pages that are already KSM; pages cmp_and_merge_page() * will essentially ignore them, but we still have to process them * properly. */ if (folio_test_ksm(folio)) return false; age = rmap_item->age; if (age != U8_MAX) rmap_item->age++; /* * Smaller ages are not skipped, they need to get a chance to go * through the different phases of the KSM merging. */ if (age < 3) return false; /* * Are we still allowed to skip? If not, then don't skip it * and determine how much more often we are allowed to skip next. 
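 * Worked example (illustrative): a page whose contents keep changing reaches
 * age 4; skip_age(4) == 2, so it is scanned once, skipped for the next two
 * scans, then scanned again.  From age 9 onwards skip_age() returns 8, i.e.
 * such a page is only revisited roughly every ninth scan.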
*/ if (!rmap_item->remaining_skips) { rmap_item->remaining_skips = skip_age(age); return false; } /* Skip this page */ ksm_pages_skipped++; rmap_item->remaining_skips--; remove_rmap_item_from_tree(rmap_item); return true; } struct ksm_next_page_arg { struct folio *folio; struct page *page; unsigned long addr; }; static int ksm_next_page_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct ksm_next_page_arg *private = walk->private; struct vm_area_struct *vma = walk->vma; pte_t *start_ptep = NULL, *ptep, pte; struct mm_struct *mm = walk->mm; struct folio *folio; struct page *page; spinlock_t *ptl; pmd_t pmd; if (ksm_test_exit(mm)) return 0; cond_resched(); pmd = pmdp_get_lockless(pmdp); if (!pmd_present(pmd)) return 0; if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && pmd_leaf(pmd)) { ptl = pmd_lock(mm, pmdp); pmd = pmdp_get(pmdp); if (!pmd_present(pmd)) { goto not_found_unlock; } else if (pmd_leaf(pmd)) { page = vm_normal_page_pmd(vma, addr, pmd); if (!page) goto not_found_unlock; folio = page_folio(page); if (folio_is_zone_device(folio) || !folio_test_anon(folio)) goto not_found_unlock; page += ((addr & (PMD_SIZE - 1)) >> PAGE_SHIFT); goto found_unlock; } spin_unlock(ptl); } start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); if (!start_ptep) return 0; for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) { pte = ptep_get(ptep); if (!pte_present(pte)) continue; page = vm_normal_page(vma, addr, pte); if (!page) continue; folio = page_folio(page); if (folio_is_zone_device(folio) || !folio_test_anon(folio)) continue; goto found_unlock; } not_found_unlock: spin_unlock(ptl); if (start_ptep) pte_unmap(start_ptep); return 0; found_unlock: folio_get(folio); spin_unlock(ptl); if (start_ptep) pte_unmap(start_ptep); private->page = page; private->folio = folio; private->addr = addr; return 1; } static struct mm_walk_ops ksm_next_page_ops = { .pmd_entry = ksm_next_page_pmd_entry, .walk_lock = PGWALK_RDLOCK, }; static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) { struct mm_struct *mm; struct ksm_mm_slot *mm_slot; struct mm_slot *slot; struct vm_area_struct *vma; struct ksm_rmap_item *rmap_item; struct vma_iterator vmi; int nid; if (list_empty(&ksm_mm_head.slot.mm_node)) return NULL; mm_slot = ksm_scan.mm_slot; if (mm_slot == &ksm_mm_head) { advisor_start_scan(); trace_ksm_start_scan(ksm_scan.seqnr, ksm_rmap_items); /* * A number of pages can hang around indefinitely in per-cpu * LRU cache, raised page count preventing write_protect_page * from merging them. Though it doesn't really matter much, * it is puzzling to see some stuck in pages_volatile until * other activity jostles them out, and they also prevented * LTP's KSM test from succeeding deterministically; so drain * them here (here rather than on entry to ksm_do_scan(), * so we don't IPI too often when pages_to_scan is set low). */ lru_add_drain_all(); /* * Whereas stale stable_nodes on the stable_tree itself * get pruned in the regular course of stable_tree_search(), * those moved out to the migrate_nodes list can accumulate: * so prune them once before each full scan. 
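 * (The pruning below leans on a side effect: calling ksm_get_folio() on a node
 * whose page is gone makes it remove the stale node itself, so simply looking
 * each migrate_nodes entry up and dropping the folio again is all that is
 * needed here.)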
*/ if (!ksm_merge_across_nodes) { struct ksm_stable_node *stable_node, *next; struct folio *folio; list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_NOLOCK); if (folio) folio_put(folio); cond_resched(); } } for (nid = 0; nid < ksm_nr_node_ids; nid++) root_unstable_tree[nid] = RB_ROOT; spin_lock(&ksm_mmlist_lock); slot = list_entry(mm_slot->slot.mm_node.next, struct mm_slot, mm_node); mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); ksm_scan.mm_slot = mm_slot; spin_unlock(&ksm_mmlist_lock); /* * Although we tested list_empty() above, a racing __ksm_exit * of the last mm on the list may have removed it since then. */ if (mm_slot == &ksm_mm_head) return NULL; next_mm: ksm_scan.address = 0; ksm_scan.rmap_list = &mm_slot->rmap_list; } slot = &mm_slot->slot; mm = slot->mm; vma_iter_init(&vmi, mm, ksm_scan.address); mmap_read_lock(mm); if (ksm_test_exit(mm)) goto no_vmas; for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_MERGEABLE)) continue; if (ksm_scan.address < vma->vm_start) ksm_scan.address = vma->vm_start; if (!vma->anon_vma) ksm_scan.address = vma->vm_end; while (ksm_scan.address < vma->vm_end) { struct ksm_next_page_arg ksm_next_page_arg; struct page *tmp_page = NULL; struct folio *folio; if (ksm_test_exit(mm)) break; int found; found = walk_page_range_vma(vma, ksm_scan.address, vma->vm_end, &ksm_next_page_ops, &ksm_next_page_arg); if (found > 0) { folio = ksm_next_page_arg.folio; tmp_page = ksm_next_page_arg.page; ksm_scan.address = ksm_next_page_arg.addr; } else { VM_WARN_ON_ONCE(found < 0); ksm_scan.address = vma->vm_end - PAGE_SIZE; } if (tmp_page) { flush_anon_page(vma, tmp_page, ksm_scan.address); flush_dcache_page(tmp_page); rmap_item = get_next_rmap_item(mm_slot, ksm_scan.rmap_list, ksm_scan.address); if (rmap_item) { ksm_scan.rmap_list = &rmap_item->rmap_list; if (should_skip_rmap_item(folio, rmap_item)) { folio_put(folio); goto next_page; } ksm_scan.address += PAGE_SIZE; *page = tmp_page; } else { folio_put(folio); } mmap_read_unlock(mm); return rmap_item; } next_page: ksm_scan.address += PAGE_SIZE; cond_resched(); } } if (ksm_test_exit(mm)) { no_vmas: ksm_scan.address = 0; ksm_scan.rmap_list = &mm_slot->rmap_list; } /* * Nuke all the rmap_items that are above this current rmap: * because there were no VM_MERGEABLE vmas with such addresses. */ remove_trailing_rmap_items(ksm_scan.rmap_list); spin_lock(&ksm_mmlist_lock); slot = list_entry(mm_slot->slot.mm_node.next, struct mm_slot, mm_node); ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (ksm_scan.address == 0) { /* * We've completed a full scan of all vmas, holding mmap_lock * throughout, and found no VM_MERGEABLE: so do the same as * __ksm_exit does to remove this mm from all our lists now. * This applies either when cleaning up after __ksm_exit * (but beware: we can reach here even before __ksm_exit), * or when all VM_MERGEABLE areas have been unmapped (and * mmap_lock then protects against race with MADV_MERGEABLE). */ hash_del(&mm_slot->slot.hash); list_del(&mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); /* * Only clear MMF_VM_MERGEABLE. We must not clear * MMF_VM_MERGE_ANY, because for those MMF_VM_MERGE_ANY process, * perhaps their mm_struct has just been added to ksm_mm_slot * list, and its process has not yet officially started running * or has not yet performed mmap/brk to allocate anonymous VMAS. 
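 * (If such a process does create an anonymous mapping later, ksm_vma_flags()
 * still sees MMF_VM_MERGE_ANY, marks the new VMA mergeable and re-enters the
 * mm through __ksm_enter(), so clearing only MMF_VM_MERGEABLE here loses
 * nothing.)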
*/ mm_flags_clear(MMF_VM_MERGEABLE, mm); mmap_read_unlock(mm); mmdrop(mm); } else { mmap_read_unlock(mm); /* * mmap_read_unlock(mm) first because after * spin_unlock(&ksm_mmlist_lock) run, the "mm" may * already have been freed under us by __ksm_exit() * because the "mm_slot" is still hashed and * ksm_scan.mm_slot doesn't point to it anymore. */ spin_unlock(&ksm_mmlist_lock); } /* Repeat until we've completed scanning the whole list */ mm_slot = ksm_scan.mm_slot; if (mm_slot != &ksm_mm_head) goto next_mm; advisor_stop_scan(); trace_ksm_stop_scan(ksm_scan.seqnr, ksm_rmap_items); ksm_scan.seqnr++; return NULL; } /** * ksm_do_scan - the ksm scanner main worker function. * @scan_npages: number of pages we want to scan before we return. */ static void ksm_do_scan(unsigned int scan_npages) { struct ksm_rmap_item *rmap_item; struct page *page; while (scan_npages-- && likely(!freezing(current))) { cond_resched(); rmap_item = scan_get_next_rmap_item(&page); if (!rmap_item) return; cmp_and_merge_page(page, rmap_item); put_page(page); ksm_pages_scanned++; } } static int ksmd_should_run(void) { return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.slot.mm_node); } static int ksm_scan_thread(void *nothing) { unsigned int sleep_ms; set_freezable(); set_user_nice(current, 5); while (!kthread_should_stop()) { mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksmd_should_run()) ksm_do_scan(ksm_thread_pages_to_scan); mutex_unlock(&ksm_thread_mutex); if (ksmd_should_run()) { sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs); wait_event_freezable_timeout(ksm_iter_wait, sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), msecs_to_jiffies(sleep_ms)); } else { wait_event_freezable(ksm_thread_wait, ksmd_should_run() || kthread_should_stop()); } } return 0; } static bool __ksm_should_add_vma(const struct file *file, vma_flags_t vma_flags) { if (vma_flags_test(&vma_flags, VMA_MERGEABLE_BIT)) return false; return ksm_compatible(file, vma_flags); } static void __ksm_add_vma(struct vm_area_struct *vma) { if (__ksm_should_add_vma(vma->vm_file, vma->flags)) vm_flags_set(vma, VM_MERGEABLE); } static int __ksm_del_vma(struct vm_area_struct *vma) { int err; if (!(vma->vm_flags & VM_MERGEABLE)) return 0; if (vma->anon_vma) { err = break_ksm(vma, vma->vm_start, vma->vm_end, true); if (err) return err; } vm_flags_clear(vma, VM_MERGEABLE); return 0; } /** * ksm_vma_flags - Update VMA flags to mark as mergeable if compatible * * @mm: Proposed VMA's mm_struct * @file: Proposed VMA's file-backed mapping, if any. * @vma_flags: Proposed VMA"s flags. * * Returns: @vma_flags possibly updated to mark mergeable. */ vma_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, vma_flags_t vma_flags) { if (mm_flags_test(MMF_VM_MERGE_ANY, mm) && __ksm_should_add_vma(file, vma_flags)) { vma_flags_set(&vma_flags, VMA_MERGEABLE_BIT); /* * Generally, the flags here always include MMF_VM_MERGEABLE. * However, in rare cases, this flag may be cleared by ksmd who * scans a cycle without finding any mergeable vma. 
*/ if (unlikely(!mm_flags_test(MMF_VM_MERGEABLE, mm))) __ksm_enter(mm); } return vma_flags; } static void ksm_add_vmas(struct mm_struct *mm) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) __ksm_add_vma(vma); } static int ksm_del_vmas(struct mm_struct *mm) { struct vm_area_struct *vma; int err; VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) { err = __ksm_del_vma(vma); if (err) return err; } return 0; } /** * ksm_enable_merge_any - Add mm to mm ksm list and enable merging on all * compatible VMA's * * @mm: Pointer to mm * * Returns 0 on success, otherwise error code */ int ksm_enable_merge_any(struct mm_struct *mm) { int err; if (mm_flags_test(MMF_VM_MERGE_ANY, mm)) return 0; if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) { err = __ksm_enter(mm); if (err) return err; } mm_flags_set(MMF_VM_MERGE_ANY, mm); ksm_add_vmas(mm); return 0; } /** * ksm_disable_merge_any - Disable merging on all compatible VMA's of the mm, * previously enabled via ksm_enable_merge_any(). * * Disabling merging implies unmerging any merged pages, like setting * MADV_UNMERGEABLE would. If unmerging fails, the whole operation fails and * merging on all compatible VMA's remains enabled. * * @mm: Pointer to mm * * Returns 0 on success, otherwise error code */ int ksm_disable_merge_any(struct mm_struct *mm) { int err; if (!mm_flags_test(MMF_VM_MERGE_ANY, mm)) return 0; err = ksm_del_vmas(mm); if (err) { ksm_add_vmas(mm); return err; } mm_flags_clear(MMF_VM_MERGE_ANY, mm); return 0; } int ksm_disable(struct mm_struct *mm) { mmap_assert_write_locked(mm); if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) return 0; if (mm_flags_test(MMF_VM_MERGE_ANY, mm)) return ksm_disable_merge_any(mm); return ksm_del_vmas(mm); } int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, vm_flags_t *vm_flags) { struct mm_struct *mm = vma->vm_mm; int err; switch (advice) { case MADV_MERGEABLE: if (vma->vm_flags & VM_MERGEABLE) return 0; if (!vma_ksm_compatible(vma)) return 0; if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) { err = __ksm_enter(mm); if (err) return err; } *vm_flags |= VM_MERGEABLE; break; case MADV_UNMERGEABLE: if (!(*vm_flags & VM_MERGEABLE)) return 0; /* just ignore the advice */ if (vma->anon_vma) { err = break_ksm(vma, start, end, true); if (err) return err; } *vm_flags &= ~VM_MERGEABLE; break; } return 0; } EXPORT_SYMBOL_GPL(ksm_madvise); int __ksm_enter(struct mm_struct *mm) { struct ksm_mm_slot *mm_slot; struct mm_slot *slot; int needs_wakeup; mm_slot = mm_slot_alloc(mm_slot_cache); if (!mm_slot) return -ENOMEM; slot = &mm_slot->slot; /* Check ksm_run too? Would need tighter locking */ needs_wakeup = list_empty(&ksm_mm_head.slot.mm_node); spin_lock(&ksm_mmlist_lock); mm_slot_insert(mm_slots_hash, mm, slot); /* * When KSM_RUN_MERGE (or KSM_RUN_STOP), * insert just behind the scanning cursor, to let the area settle * down a little; when fork is followed by immediate exec, we don't * want ksmd to waste time setting up and tearing down an rmap_list. * * But when KSM_RUN_UNMERGE, it's important to insert ahead of its * scanning cursor, otherwise KSM pages in newly forked mms will be * missed: then we might as well insert at the end of the list. 
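 * Example (illustrative list only): with HEAD -> mmA -> mmB -> mmC and the
 * cursor at mmB, a normal run inserts the new mm as HEAD -> mmA -> new -> mmB
 * -> mmC, so ksmd only reaches it on its next pass; under KSM_RUN_UNMERGE it
 * is inserted as HEAD -> mmA -> mmB -> mmC -> new, which the unmerge pass
 * still walking forward from mmB is guaranteed to reach.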
*/ if (ksm_run & KSM_RUN_UNMERGE) list_add_tail(&slot->mm_node, &ksm_mm_head.slot.mm_node); else list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); mm_flags_set(MMF_VM_MERGEABLE, mm); mmgrab(mm); if (needs_wakeup) wake_up_interruptible(&ksm_thread_wait); trace_ksm_enter(mm); return 0; } void __ksm_exit(struct mm_struct *mm) { struct ksm_mm_slot *mm_slot = NULL; struct mm_slot *slot; int easy_to_free = 0; /* * This process is exiting: if it's straightforward (as is the * case when ksmd was never running), free mm_slot immediately. * But if it's at the cursor or has rmap_items linked to it, use * mmap_lock to synchronize with any break_cows before pagetables * are freed, and leave the mm_slot on the list for ksmd to free. * Beware: ksm may already have noticed it exiting and freed the slot. */ spin_lock(&ksm_mmlist_lock); slot = mm_slot_lookup(mm_slots_hash, mm); if (!slot) goto unlock; mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (ksm_scan.mm_slot == mm_slot) goto unlock; if (!mm_slot->rmap_list) { hash_del(&slot->hash); list_del(&slot->mm_node); easy_to_free = 1; } else { list_move(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node); } unlock: spin_unlock(&ksm_mmlist_lock); if (easy_to_free) { mm_slot_free(mm_slot_cache, mm_slot); mm_flags_clear(MMF_VM_MERGE_ANY, mm); mm_flags_clear(MMF_VM_MERGEABLE, mm); mmdrop(mm); } else if (mm_slot) { mmap_write_lock(mm); mmap_write_unlock(mm); } trace_ksm_exit(mm); } struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { struct page *page = folio_page(folio, 0); struct anon_vma *anon_vma = folio_anon_vma(folio); struct folio *new_folio; if (folio_test_large(folio)) return folio; if (folio_test_ksm(folio)) { if (folio_stable_node(folio) && !(ksm_run & KSM_RUN_UNMERGE)) return folio; /* no need to copy it */ } else if (!anon_vma) { return folio; /* no need to copy it */ } else if (folio->index == linear_page_index(vma, addr) && anon_vma->root == vma->anon_vma->root) { return folio; /* still no need to copy it */ } if (PageHWPoison(page)) return ERR_PTR(-EHWPOISON); if (!folio_test_uptodate(folio)) return folio; /* let do_swap_page report the error */ new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr); if (new_folio && mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL)) { folio_put(new_folio); new_folio = NULL; } if (new_folio) { if (copy_mc_user_highpage(folio_page(new_folio, 0), page, addr, vma)) { folio_put(new_folio); return ERR_PTR(-EHWPOISON); } folio_set_dirty(new_folio); __folio_mark_uptodate(new_folio); __folio_set_locked(new_folio); #ifdef CONFIG_SWAP count_vm_event(KSM_SWPIN_COPY); #endif } return new_folio; } void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) { struct ksm_stable_node *stable_node; struct ksm_rmap_item *rmap_item; int search_new_forks = 0; VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio); /* * Rely on the page lock to protect against concurrent modifications * to that page's node of the stable tree. 
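 * (The writers honour that: remove_rmap_item_from_tree() takes the folio lock
 * via ksm_get_folio(KSM_GET_FOLIO_LOCK) before unhooking an rmap_item, and
 * folio_migrate_ksm() below asserts the folio is locked before moving the
 * stable node over to the new folio.)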
*/ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); stable_node = folio_stable_node(folio); if (!stable_node) return; again: hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { /* Ignore the stable/unstable/sqnr flags */ const unsigned long addr = rmap_item->address & PAGE_MASK; struct anon_vma *anon_vma = rmap_item->anon_vma; struct anon_vma_chain *vmac; struct vm_area_struct *vma; cond_resched(); if (!anon_vma_trylock_read(anon_vma)) { if (rwc->try_lock) { rwc->contended = true; return; } anon_vma_lock_read(anon_vma); } anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { cond_resched(); vma = vmac->vma; if (addr < vma->vm_start || addr >= vma->vm_end) continue; /* * Initially we examine only the vma which covers this * rmap_item; but later, if there is still work to do, * we examine covering vmas in other mms: in case they * were forked from the original since ksmd passed. */ if ((rmap_item->mm == vma->vm_mm) == search_new_forks) continue; if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; if (!rwc->rmap_one(folio, vma, addr, rwc->arg)) { anon_vma_unlock_read(anon_vma); return; } if (rwc->done && rwc->done(folio)) { anon_vma_unlock_read(anon_vma); return; } } anon_vma_unlock_read(anon_vma); } if (!search_new_forks++) goto again; } #ifdef CONFIG_MEMORY_FAILURE /* * Collect processes when the error hit an ksm page. */ void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early) { struct ksm_stable_node *stable_node; struct ksm_rmap_item *rmap_item; struct vm_area_struct *vma; struct task_struct *tsk; stable_node = folio_stable_node(folio); if (!stable_node) return; hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { struct anon_vma *av = rmap_item->anon_vma; anon_vma_lock_read(av); rcu_read_lock(); for_each_process(tsk) { struct anon_vma_chain *vmac; unsigned long addr; struct task_struct *t = task_early_kill(tsk, force_early); if (!t) continue; anon_vma_interval_tree_foreach(vmac, &av->rb_root, 0, ULONG_MAX) { vma = vmac->vma; if (vma->vm_mm == t->mm) { addr = rmap_item->address & PAGE_MASK; add_to_kill_ksm(t, page, vma, to_kill, addr); } } } rcu_read_unlock(); anon_vma_unlock_read(av); } } #endif #ifdef CONFIG_MIGRATION void folio_migrate_ksm(struct folio *newfolio, struct folio *folio) { struct ksm_stable_node *stable_node; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio); VM_BUG_ON_FOLIO(newfolio->mapping != folio->mapping, newfolio); stable_node = folio_stable_node(folio); if (stable_node) { VM_BUG_ON_FOLIO(stable_node->kpfn != folio_pfn(folio), folio); stable_node->kpfn = folio_pfn(newfolio); /* * newfolio->mapping was set in advance; now we need smp_wmb() * to make sure that the new stable_node->kpfn is visible * to ksm_get_folio() before it can see that folio->mapping * has gone stale (or that the swapcache flag has been cleared). 
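 * Pairing sketch (illustrative ordering only): the writer here does
 *   stable_node->kpfn = new pfn;  smp_wmb();  old folio->mapping = NULL;
 * while ksm_get_folio() does
 *   read kpfn;  see the old folio's mapping gone stale;  smp_rmb();
 *   re-check READ_ONCE(stable_node->kpfn);
 * so a reader that already observed the cleared mapping must also observe the
 * new kpfn on the re-check, and retries against the new folio instead of
 * treating the stable node as dead.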
*/ smp_wmb(); folio_set_stable_node(folio, NULL); } } #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_MEMORY_HOTREMOVE static void wait_while_offlining(void) { while (ksm_run & KSM_RUN_OFFLINE) { mutex_unlock(&ksm_thread_mutex); wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE), TASK_UNINTERRUPTIBLE); mutex_lock(&ksm_thread_mutex); } } static bool stable_node_dup_remove_range(struct ksm_stable_node *stable_node, unsigned long start_pfn, unsigned long end_pfn) { if (stable_node->kpfn >= start_pfn && stable_node->kpfn < end_pfn) { /* * Don't ksm_get_folio, page has already gone: * which is why we keep kpfn instead of page* */ remove_node_from_stable_tree(stable_node); return true; } return false; } static bool stable_node_chain_remove_range(struct ksm_stable_node *stable_node, unsigned long start_pfn, unsigned long end_pfn, struct rb_root *root) { struct ksm_stable_node *dup; struct hlist_node *hlist_safe; if (!is_stable_node_chain(stable_node)) { VM_BUG_ON(is_stable_node_dup(stable_node)); return stable_node_dup_remove_range(stable_node, start_pfn, end_pfn); } hlist_for_each_entry_safe(dup, hlist_safe, &stable_node->hlist, hlist_dup) { VM_BUG_ON(!is_stable_node_dup(dup)); stable_node_dup_remove_range(dup, start_pfn, end_pfn); } if (hlist_empty(&stable_node->hlist)) { free_stable_node_chain(stable_node, root); return true; /* notify caller that tree was rebalanced */ } else return false; } static void ksm_check_stable_tree(unsigned long start_pfn, unsigned long end_pfn) { struct ksm_stable_node *stable_node, *next; struct rb_node *node; int nid; for (nid = 0; nid < ksm_nr_node_ids; nid++) { node = rb_first(root_stable_tree + nid); while (node) { stable_node = rb_entry(node, struct ksm_stable_node, node); if (stable_node_chain_remove_range(stable_node, start_pfn, end_pfn, root_stable_tree + nid)) node = rb_first(root_stable_tree + nid); else node = rb_next(node); cond_resched(); } } list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { if (stable_node->kpfn >= start_pfn && stable_node->kpfn < end_pfn) remove_node_from_stable_tree(stable_node); cond_resched(); } } static int ksm_memory_callback(struct notifier_block *self, unsigned long action, void *arg) { struct memory_notify *mn = arg; switch (action) { case MEM_GOING_OFFLINE: /* * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items() * and remove_all_stable_nodes() while memory is going offline: * it is unsafe for them to touch the stable tree at this time. * But break_ksm(), rmap lookups and other entry points * which do not need the ksm_thread_mutex are all safe. */ mutex_lock(&ksm_thread_mutex); ksm_run |= KSM_RUN_OFFLINE; mutex_unlock(&ksm_thread_mutex); break; case MEM_OFFLINE: /* * Most of the work is done by page migration; but there might * be a few stable_nodes left over, still pointing to struct * pages which have been offlined: prune those from the tree, * otherwise ksm_get_folio() might later try to access a * non-existent struct page. */ ksm_check_stable_tree(mn->start_pfn, mn->start_pfn + mn->nr_pages); fallthrough; case MEM_CANCEL_OFFLINE: mutex_lock(&ksm_thread_mutex); ksm_run &= ~KSM_RUN_OFFLINE; mutex_unlock(&ksm_thread_mutex); smp_mb(); /* wake_up_bit advises this */ wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE)); break; } return NOTIFY_OK; } #else static void wait_while_offlining(void) { } #endif /* CONFIG_MEMORY_HOTREMOVE */ #ifdef CONFIG_PROC_FS /* * The process is mergeable only if any VMA is currently * applicable to KSM. * * The mmap lock must be held in read mode. 
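 * (The sibling helper ksm_process_profit() just below reports, per mm, merged
 * pages plus KSM-placed zero pages times PAGE_SIZE, minus the rmap_item
 * overhead.  Illustrative numbers, assuming 4K pages and a 64-byte
 * ksm_rmap_item: 1000 merged pages and 200 zero pages with 3000 rmap_items
 * give roughly 4.9 MB of sharing minus about 0.2 MB of metadata.)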
*/ bool ksm_process_mergeable(struct mm_struct *mm) { struct vm_area_struct *vma; mmap_assert_locked(mm); VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) if (vma->vm_flags & VM_MERGEABLE) return true; return false; } long ksm_process_profit(struct mm_struct *mm) { return (long)(mm->ksm_merging_pages + mm_ksm_zero_pages(mm)) * PAGE_SIZE - mm->ksm_rmap_items * sizeof(struct ksm_rmap_item); } #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SYSFS /* * This all compiles without CONFIG_SYSFS, but is a waste of space. */ #define KSM_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) #define KSM_ATTR(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RW(_name) static ssize_t sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_thread_sleep_millisecs); } static ssize_t sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int msecs; int err; err = kstrtouint(buf, 10, &msecs); if (err) return -EINVAL; ksm_thread_sleep_millisecs = msecs; wake_up_interruptible(&ksm_iter_wait); return count; } KSM_ATTR(sleep_millisecs); static ssize_t pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_thread_pages_to_scan); } static ssize_t pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int nr_pages; int err; if (ksm_advisor != KSM_ADVISOR_NONE) return -EINVAL; err = kstrtouint(buf, 10, &nr_pages); if (err) return -EINVAL; ksm_thread_pages_to_scan = nr_pages; return count; } KSM_ATTR(pages_to_scan); static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_run); } static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int flags; int err; err = kstrtouint(buf, 10, &flags); if (err) return -EINVAL; if (flags > KSM_RUN_UNMERGE) return -EINVAL; /* * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, * breaking COW to free the pages_shared (but leaves mm_slots * on the list for when ksmd may be set running again). */ mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksm_run != flags) { ksm_run = flags; if (flags & KSM_RUN_UNMERGE) { set_current_oom_origin(); err = unmerge_and_remove_all_rmap_items(); clear_current_oom_origin(); if (err) { ksm_run = KSM_RUN_STOP; count = err; } } } mutex_unlock(&ksm_thread_mutex); if (flags & KSM_RUN_MERGE) wake_up_interruptible(&ksm_thread_wait); return count; } KSM_ATTR(run); #ifdef CONFIG_NUMA static ssize_t merge_across_nodes_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_merge_across_nodes); } static ssize_t merge_across_nodes_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long knob; err = kstrtoul(buf, 10, &knob); if (err) return err; if (knob > 1) return -EINVAL; mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksm_merge_across_nodes != knob) { if (ksm_pages_shared || remove_all_stable_nodes()) err = -EBUSY; else if (root_stable_tree == one_stable_tree) { struct rb_root *buf; /* * This is the first time that we switch away from the * default of merging across nodes: must now allocate * a buffer to hold as many roots as may be needed. 
* Allocate stable and unstable together: * MAXSMP NODES_SHIFT 10 will use 16kB. */ buf = kzalloc_objs(*buf, nr_node_ids + nr_node_ids); /* Let us assume that RB_ROOT is NULL is zero */ if (!buf) err = -ENOMEM; else { root_stable_tree = buf; root_unstable_tree = buf + nr_node_ids; /* Stable tree is empty but not the unstable */ root_unstable_tree[0] = one_unstable_tree[0]; } } if (!err) { ksm_merge_across_nodes = knob; ksm_nr_node_ids = knob ? 1 : nr_node_ids; } } mutex_unlock(&ksm_thread_mutex); return err ? err : count; } KSM_ATTR(merge_across_nodes); #endif static ssize_t use_zero_pages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_use_zero_pages); } static ssize_t use_zero_pages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; bool value; err = kstrtobool(buf, &value); if (err) return -EINVAL; ksm_use_zero_pages = value; return count; } KSM_ATTR(use_zero_pages); static ssize_t max_page_sharing_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_max_page_sharing); } static ssize_t max_page_sharing_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; int knob; err = kstrtoint(buf, 10, &knob); if (err) return err; /* * When a KSM page is created it is shared by 2 mappings. This * being a signed comparison, it implicitly verifies it's not * negative. */ if (knob < 2) return -EINVAL; if (READ_ONCE(ksm_max_page_sharing) == knob) return count; mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksm_max_page_sharing != knob) { if (ksm_pages_shared || remove_all_stable_nodes()) err = -EBUSY; else ksm_max_page_sharing = knob; } mutex_unlock(&ksm_thread_mutex); return err ? err : count; } KSM_ATTR(max_page_sharing); static ssize_t pages_scanned_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_scanned); } KSM_ATTR_RO(pages_scanned); static ssize_t pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_shared); } KSM_ATTR_RO(pages_shared); static ssize_t pages_sharing_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_sharing); } KSM_ATTR_RO(pages_sharing); static ssize_t pages_unshared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_unshared); } KSM_ATTR_RO(pages_unshared); static ssize_t pages_volatile_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { long ksm_pages_volatile; ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared - ksm_pages_sharing - ksm_pages_unshared; /* * It was not worth any locking to calculate that statistic, * but it might therefore sometimes be negative: conceal that. 
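 *
 * A worked example of the unlocked arithmetic above, assuming counters
 * sampled as rmap_items = 1000, shared = 100, sharing = 850 and
 * unshared = 80: 1000 - 100 - 850 - 80 = -30, which is reported as 0.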
*/ if (ksm_pages_volatile < 0) ksm_pages_volatile = 0; return sysfs_emit(buf, "%ld\n", ksm_pages_volatile); } KSM_ATTR_RO(pages_volatile); static ssize_t pages_skipped_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_skipped); } KSM_ATTR_RO(pages_skipped); static ssize_t ksm_zero_pages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%ld\n", atomic_long_read(&ksm_zero_pages)); } KSM_ATTR_RO(ksm_zero_pages); static ssize_t general_profit_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { long general_profit; general_profit = (ksm_pages_sharing + atomic_long_read(&ksm_zero_pages)) * PAGE_SIZE - ksm_rmap_items * sizeof(struct ksm_rmap_item); return sysfs_emit(buf, "%ld\n", general_profit); } KSM_ATTR_RO(general_profit); static ssize_t stable_node_dups_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_stable_node_dups); } KSM_ATTR_RO(stable_node_dups); static ssize_t stable_node_chains_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_stable_node_chains); } KSM_ATTR_RO(stable_node_chains); static ssize_t stable_node_chains_prune_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_stable_node_chains_prune_millisecs); } static ssize_t stable_node_chains_prune_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int msecs; int err; err = kstrtouint(buf, 10, &msecs); if (err) return -EINVAL; ksm_stable_node_chains_prune_millisecs = msecs; return count; } KSM_ATTR(stable_node_chains_prune_millisecs); static ssize_t full_scans_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_scan.seqnr); } KSM_ATTR_RO(full_scans); static ssize_t smart_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_smart_scan); } static ssize_t smart_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; bool value; err = kstrtobool(buf, &value); if (err) return -EINVAL; ksm_smart_scan = value; return count; } KSM_ATTR(smart_scan); static ssize_t advisor_mode_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { const char *output; if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) output = "none [scan-time]"; else output = "[none] scan-time"; return sysfs_emit(buf, "%s\n", output); } static ssize_t advisor_mode_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { enum ksm_advisor_type curr_advisor = ksm_advisor; if (sysfs_streq("scan-time", buf)) ksm_advisor = KSM_ADVISOR_SCAN_TIME; else if (sysfs_streq("none", buf)) ksm_advisor = KSM_ADVISOR_NONE; else return -EINVAL; /* Set advisor default values */ if (curr_advisor != ksm_advisor) set_advisor_defaults(); return count; } KSM_ATTR(advisor_mode); static ssize_t advisor_max_cpu_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_advisor_max_cpu); } static ssize_t advisor_max_cpu_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; ksm_advisor_max_cpu = value; return count; } KSM_ATTR(advisor_max_cpu); static ssize_t advisor_min_pages_to_scan_show(struct 
kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_advisor_min_pages_to_scan); } static ssize_t advisor_min_pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; ksm_advisor_min_pages_to_scan = value; return count; } KSM_ATTR(advisor_min_pages_to_scan); static ssize_t advisor_max_pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_advisor_max_pages_to_scan); } static ssize_t advisor_max_pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; ksm_advisor_max_pages_to_scan = value; return count; } KSM_ATTR(advisor_max_pages_to_scan); static ssize_t advisor_target_scan_time_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_advisor_target_scan_time); } static ssize_t advisor_target_scan_time_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; if (value < 1) return -EINVAL; ksm_advisor_target_scan_time = value; return count; } KSM_ATTR(advisor_target_scan_time); static struct attribute *ksm_attrs[] = { &sleep_millisecs_attr.attr, &pages_to_scan_attr.attr, &run_attr.attr, &pages_scanned_attr.attr, &pages_shared_attr.attr, &pages_sharing_attr.attr, &pages_unshared_attr.attr, &pages_volatile_attr.attr, &pages_skipped_attr.attr, &ksm_zero_pages_attr.attr, &full_scans_attr.attr, #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, #endif &max_page_sharing_attr.attr, &stable_node_chains_attr.attr, &stable_node_dups_attr.attr, &stable_node_chains_prune_millisecs_attr.attr, &use_zero_pages_attr.attr, &general_profit_attr.attr, &smart_scan_attr.attr, &advisor_mode_attr.attr, &advisor_max_cpu_attr.attr, &advisor_min_pages_to_scan_attr.attr, &advisor_max_pages_to_scan_attr.attr, &advisor_target_scan_time_attr.attr, NULL, }; static const struct attribute_group ksm_attr_group = { .attrs = ksm_attrs, .name = "ksm", }; #endif /* CONFIG_SYSFS */ static int __init ksm_init(void) { struct task_struct *ksm_thread; int err; /* The correct value depends on page size and endianness */ zero_checksum = calc_checksum(ZERO_PAGE(0)); /* Default to false for backwards compatibility */ ksm_use_zero_pages = false; err = ksm_slab_init(); if (err) goto out; ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); if (IS_ERR(ksm_thread)) { pr_err("ksm: creating kthread failed\n"); err = PTR_ERR(ksm_thread); goto out_free; } #ifdef CONFIG_SYSFS err = sysfs_create_group(mm_kobj, &ksm_attr_group); if (err) { pr_err("ksm: register sysfs failed\n"); kthread_stop(ksm_thread); goto out_free; } #else ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ #endif /* CONFIG_SYSFS */ #ifdef CONFIG_MEMORY_HOTREMOVE /* There is no significance to this priority 100 */ hotplug_memory_notifier(ksm_memory_callback, KSM_CALLBACK_PRI); #endif return 0; out_free: ksm_slab_free(); out: return err; } subsys_initcall(ksm_init); |
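/*
 * A minimal userspace sketch, for illustration only, of how the sysfs
 * interface registered above is typically exercised: mark some anonymous
 * memory MADV_MERGEABLE, then start ksmd via /sys/kernel/mm/ksm/run.
 * The paths and the madvise flag are the documented KSM ABI; writing the
 * knobs normally requires root, and error handling is kept deliberately
 * small.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

static void ksm_set(const char *knob, const char *val)
{
	char path[128];
	int fd;

	snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", knob);
	fd = open(path, O_WRONLY);
	if (fd >= 0) {
		write(fd, val, strlen(val));
		close(fd);
	}
}

int main(void)
{
	size_t len = 64UL << 20;
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;

	memset(buf, 0x5a, len);			/* identical page contents */
	madvise(buf, len, MADV_MERGEABLE);

	ksm_set("sleep_millisecs", "20");
	ksm_set("pages_to_scan", "1000");
	ksm_set("run", "1");			/* KSM_RUN_MERGE */

	sleep(5);				/* let ksmd scan */
	/* pages_sharing, general_profit etc. can now be read back. */
	return 0;
}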
// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/kernel/exit.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 */
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/tty.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/cpu.h>
#include <linux/acct.h>
#include <linux/tsacct_kern.h>
#include <linux/file.h>
#include <linux/freezer.h>
#include <linux/binfmts.h>
#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>
#include <linux/ptrace.h>
#include <linux/profile.h>
#include <linux/mount.h>
#include <linux/proc_fs.h>
#include <linux/kthread.h>
#include <linux/mempolicy.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
#include <linux/cgroup.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
#include <linux/posix-timers.h>
#include <linux/cn_proc.h>
#include <linux/mutex.h>
#include <linux/futex.h>
#include <linux/pipe_fs_i.h>
#include <linux/audit.h>	/* for audit_free() */
#include <linux/resource.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/blkdev.h>
#include <linux/task_work.h>
#include <linux/fs_struct.h>
#include <linux/init_task.h>
#include <linux/perf_event.h>
#include <trace/events/sched.h>
#include <linux/hw_breakpoint.h>
#include <linux/oom.h>
#include <linux/writeback.h>
#include <linux/shm.h>
#include <linux/kcov.h>
#include <linux/kmsan.h>
#include <linux/random.h>
#include <linux/rcuwait.h>
#include
<linux/compat.h> #include <linux/io_uring.h> #include <linux/kprobes.h> #include <linux/rethook.h> #include <linux/sysfs.h> #include <linux/user_events.h> #include <linux/unwind_deferred.h> #include <linux/uaccess.h> #include <linux/pidfs.h> #include <uapi/linux/wait.h> #include <asm/unistd.h> #include <asm/mmu_context.h> #include "exit.h" /* * The default value should be high enough to not crash a system that randomly * crashes its kernel from time to time, but low enough to at least not permit * overflowing 32-bit refcounts or the ldsem writer count. */ static unsigned int oops_limit = 10000; #ifdef CONFIG_SYSCTL static const struct ctl_table kern_exit_table[] = { { .procname = "oops_limit", .data = &oops_limit, .maxlen = sizeof(oops_limit), .mode = 0644, .proc_handler = proc_douintvec, }, }; static __init int kernel_exit_sysctls_init(void) { register_sysctl_init("kernel", kern_exit_table); return 0; } late_initcall(kernel_exit_sysctls_init); #endif static atomic_t oops_count = ATOMIC_INIT(0); #ifdef CONFIG_SYSFS static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { return sysfs_emit(page, "%d\n", atomic_read(&oops_count)); } static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count); static __init int kernel_exit_sysfs_init(void) { sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL); return 0; } late_initcall(kernel_exit_sysfs_init); #endif /* * For things release_task() would like to do *after* tasklist_lock is released. */ struct release_task_post { struct pid *pids[PIDTYPE_MAX]; }; static void __unhash_process(struct release_task_post *post, struct task_struct *p, bool group_dead) { struct pid *pid = task_pid(p); nr_threads--; detach_pid(post->pids, p, PIDTYPE_PID); wake_up_all(&pid->wait_pidfd); if (group_dead) { detach_pid(post->pids, p, PIDTYPE_TGID); detach_pid(post->pids, p, PIDTYPE_PGID); detach_pid(post->pids, p, PIDTYPE_SID); list_del_rcu(&p->tasks); list_del_init(&p->sibling); __this_cpu_dec(process_counts); } list_del_rcu(&p->thread_node); } /* * This function expects the tasklist_lock write-locked. */ static void __exit_signal(struct release_task_post *post, struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; bool group_dead = thread_group_leader(tsk); struct sighand_struct *sighand; struct tty_struct *tty; u64 utime, stime; sighand = rcu_dereference_check(tsk->sighand, lockdep_tasklist_lock_is_held()); spin_lock(&sighand->siglock); #ifdef CONFIG_POSIX_TIMERS posix_cpu_timers_exit(tsk); if (group_dead) posix_cpu_timers_exit_group(tsk); #endif if (group_dead) { tty = sig->tty; sig->tty = NULL; } else { /* * If there is any task waiting for the group exit * then notify it: */ if (sig->notify_count > 0 && !--sig->notify_count) wake_up_process(sig->group_exec_task); if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); } /* * Accumulate here the counters for all threads as they die. We could * skip the group leader because it is the last user of signal_struct, * but we want to avoid the race with thread_group_cputime() which can * see the empty ->thread_head list. 
*/ task_cputime(tsk, &utime, &stime); write_seqlock(&sig->stats_lock); sig->utime += utime; sig->stime += stime; sig->gtime += task_gtime(tsk); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; sig->nivcsw += tsk->nivcsw; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; __unhash_process(post, tsk, group_dead); write_sequnlock(&sig->stats_lock); tsk->sighand = NULL; spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); if (group_dead) tty_kref_put(tty); } static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); kprobe_flush_task(tsk); rethook_flush_task(tsk); perf_event_delayed_put(tsk); trace_sched_process_free(tsk); put_task_struct(tsk); } void put_task_struct_rcu_user(struct task_struct *task) { if (refcount_dec_and_test(&task->rcu_users)) call_rcu(&task->rcu, delayed_put_task_struct); } void __weak release_thread(struct task_struct *dead_task) { } void release_task(struct task_struct *p) { struct release_task_post post; struct task_struct *leader; struct pid *thread_pid; int zap_leader; repeat: memset(&post, 0, sizeof(post)); /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. */ dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); pidfs_exit(p); cgroup_task_release(p); /* Retrieve @thread_pid before __unhash_process() may set it to NULL. */ thread_pid = task_pid(p); write_lock_irq(&tasklist_lock); ptrace_release_task(p); __exit_signal(&post, p); /* * If we are the last non-leader member of the thread * group, and the leader is zombie, then notify the * group leader's parent process. (if it wants notification.) */ zap_leader = 0; leader = p->group_leader; if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { /* for pidfs_exit() and do_notify_parent() */ if (leader->signal->flags & SIGNAL_GROUP_EXIT) leader->exit_code = leader->signal->group_exit_code; /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, * then we are the one who should release the leader. */ zap_leader = do_notify_parent(leader, leader->exit_signal); if (zap_leader) leader->exit_state = EXIT_DEAD; } write_unlock_irq(&tasklist_lock); /* @thread_pid can't go away until free_pids() below */ proc_flush_pid(thread_pid); exit_cred_namespaces(p); add_device_randomness(&p->se.sum_exec_runtime, sizeof(p->se.sum_exec_runtime)); free_pids(post.pids); release_thread(p); /* * This task was already removed from the process/thread/pid lists * and lock_task_sighand(p) can't succeed. Nobody else can touch * ->pending or, if group dead, signal->shared_pending. We can call * flush_sigqueue() lockless. */ flush_sigqueue(&p->pending); if (thread_group_leader(p)) flush_sigqueue(&p->signal->shared_pending); put_task_struct_rcu_user(p); p = leader; if (unlikely(zap_leader)) goto repeat; } int rcuwait_wake_up(struct rcuwait *w) { int ret = 0; struct task_struct *task; rcu_read_lock(); /* * Order condition vs @task, such that everything prior to the load * of @task is visible. This is the condition as to why the user called * rcuwait_wake() in the first place. Pairs with set_current_state() * barrier (A) in rcuwait_wait_event(). 
* * WAIT WAKE * [S] tsk = current [S] cond = true * MB (A) MB (B) * [L] cond [L] tsk */ smp_mb(); /* (B) */ task = rcu_dereference(w->task); if (task) ret = wake_up_process(task); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(rcuwait_wake_up); /* * Determine if a process group is "orphaned", according to the POSIX * definition in 2.2.2.52. Orphaned process groups are not to be affected * by terminal-generated stop signals. Newly orphaned process groups are * to receive a SIGHUP and a SIGCONT. * * "I ask you, have you ever known what it is to be an orphan?" */ static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) { struct task_struct *p; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if ((p == ignored_task) || (p->exit_state && thread_group_empty(p)) || is_global_init(p->real_parent)) continue; if (task_pgrp(p->real_parent) != pgrp && task_session(p->real_parent) == task_session(p)) return 0; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return 1; } int is_current_pgrp_orphaned(void) { int retval; read_lock(&tasklist_lock); retval = will_become_orphaned_pgrp(task_pgrp(current), NULL); read_unlock(&tasklist_lock); return retval; } static bool has_stopped_jobs(struct pid *pgrp) { struct task_struct *p; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if (p->signal->flags & SIGNAL_STOP_STOPPED) return true; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return false; } /* * Check to see if any process groups have become orphaned as * a result of our exiting, and if they have any stopped jobs, * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) */ static void kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) { struct pid *pgrp = task_pgrp(tsk); struct task_struct *ignored_task = tsk; if (!parent) /* exit: our father is in a different pgrp than * we are and we were the only connection outside. */ parent = tsk->real_parent; else /* reparent: our child is in a different pgrp than * we are, and it was the only connection outside. */ ignored_task = NULL; if (task_pgrp(parent) != pgrp && task_session(parent) == task_session(tsk) && will_become_orphaned_pgrp(pgrp, ignored_task) && has_stopped_jobs(pgrp)) { __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); } } static void coredump_task_exit(struct task_struct *tsk, struct core_state *core_state) { struct core_thread self; self.task = tsk; if (self.task->flags & PF_SIGNALED) self.next = xchg(&core_state->dumper.next, &self); else self.task = NULL; /* * Implies mb(), the result of xchg() must be visible * to core_state->dumper. */ if (atomic_dec_and_test(&core_state->nr_threads)) complete(&core_state->startup); for (;;) { set_current_state(TASK_IDLE|TASK_FREEZABLE); if (!self.task) /* see coredump_finish() */ break; schedule(); } __set_current_state(TASK_RUNNING); } #ifdef CONFIG_MEMCG /* drops tasklist_lock if succeeds */ static bool __try_to_set_owner(struct task_struct *tsk, struct mm_struct *mm) { bool ret = false; task_lock(tsk); if (likely(tsk->mm == mm)) { /* tsk can't pass exit_mm/exec_mmap and exit */ read_unlock(&tasklist_lock); WRITE_ONCE(mm->owner, tsk); lru_gen_migrate_mm(mm); ret = true; } task_unlock(tsk); return ret; } static bool try_to_set_owner(struct task_struct *g, struct mm_struct *mm) { struct task_struct *t; for_each_thread(g, t) { struct mm_struct *t_mm = READ_ONCE(t->mm); if (t_mm == mm) { if (__try_to_set_owner(t, mm)) return true; } else if (t_mm) break; } return false; } /* * A task is exiting. 
If it owned this mm, find a new owner for the mm. */ void mm_update_next_owner(struct mm_struct *mm) { struct task_struct *g, *p = current; /* * If the exiting or execing task is not the owner, it's * someone else's problem. */ if (mm->owner != p) return; /* * The current owner is exiting/execing and there are no other * candidates. Do not leave the mm pointing to a possibly * freed task structure. */ if (atomic_read(&mm->mm_users) <= 1) { WRITE_ONCE(mm->owner, NULL); return; } read_lock(&tasklist_lock); /* * Search in the children */ list_for_each_entry(g, &p->children, sibling) { if (try_to_set_owner(g, mm)) goto ret; } /* * Search in the siblings */ list_for_each_entry(g, &p->real_parent->children, sibling) { if (try_to_set_owner(g, mm)) goto ret; } /* * Search through everything else, we should not get here often. */ for_each_process(g) { if (atomic_read(&mm->mm_users) <= 1) break; if (g->flags & PF_KTHREAD) continue; if (try_to_set_owner(g, mm)) goto ret; } read_unlock(&tasklist_lock); /* * We found no owner yet mm_users > 1: this implies that we are * most likely racing with swapoff (try_to_unuse()) or /proc or * ptrace or page migration (get_task_mm()). Mark owner as NULL. */ WRITE_ONCE(mm->owner, NULL); ret: return; } #endif /* CONFIG_MEMCG */ /* * Turn us into a lazy TLB process if we * aren't already.. */ static void exit_mm(void) { struct mm_struct *mm = current->mm; exit_mm_release(current, mm); if (!mm) return; mmap_read_lock(mm); mmgrab_lazy_tlb(mm); BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ task_lock(current); /* * When a thread stops operating on an address space, the loop * in membarrier_private_expedited() may not observe that * tsk->mm, and the loop in membarrier_global_expedited() may * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED * rq->membarrier_state, so those would not issue an IPI. * Membarrier requires a memory barrier after accessing * user-space memory, before clearing tsk->mm or the * rq->membarrier_state. */ smp_mb__after_spinlock(); local_irq_disable(); current->mm = NULL; membarrier_update_current_mm(NULL); enter_lazy_tlb(mm, current); local_irq_enable(); task_unlock(current); mmap_read_unlock(mm); mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) exit_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) { struct task_struct *t; for_each_thread(p, t) { if (!(t->flags & PF_EXITING)) return t; } return NULL; } static struct task_struct *find_child_reaper(struct task_struct *father, struct list_head *dead) __releases(&tasklist_lock) __acquires(&tasklist_lock) { struct pid_namespace *pid_ns = task_active_pid_ns(father); struct task_struct *reaper = pid_ns->child_reaper; struct task_struct *p, *n; if (likely(reaper != father)) return reaper; reaper = find_alive_thread(father); if (reaper) { ASSERT_EXCLUSIVE_WRITER(pid_ns->child_reaper); WRITE_ONCE(pid_ns->child_reaper, reaper); return reaper; } write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, dead, ptrace_entry) { list_del_init(&p->ptrace_entry); release_task(p); } zap_pid_ns_processes(pid_ns); write_lock_irq(&tasklist_lock); return father; } /* * When we die, we re-parent all our children, and try to: * 1. give them to another thread in our thread group, if such a member exists * 2. give it to the first ancestor process which prctl'd itself as a * child_subreaper for its children (like a service manager) * 3. 
give it to the init process (PID 1) in our pid namespace */ static struct task_struct *find_new_reaper(struct task_struct *father, struct task_struct *child_reaper) { struct task_struct *thread, *reaper; thread = find_alive_thread(father); if (thread) return thread; if (father->signal->has_child_subreaper) { unsigned int ns_level = task_pid(father)->level; /* * Find the first ->is_child_subreaper ancestor in our pid_ns. * We can't check reaper != child_reaper to ensure we do not * cross the namespaces, the exiting parent could be injected * by setns() + fork(). * We check pid->level, this is slightly more efficient than * task_active_pid_ns(reaper) != task_active_pid_ns(father). */ for (reaper = father->real_parent; task_pid(reaper)->level == ns_level; reaper = reaper->real_parent) { if (reaper == &init_task) break; if (!reaper->signal->is_child_subreaper) continue; thread = find_alive_thread(reaper); if (thread) return thread; } } return child_reaper; } /* * Any that need to be release_task'd are put on the @dead list. */ static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { if (unlikely(p->exit_state == EXIT_DEAD)) return; /* We don't want people slaying init. */ p->exit_signal = SIGCHLD; /* If it has exited notify the new parent about this child's death. */ if (!p->ptrace && p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { if (do_notify_parent(p, p->exit_signal)) { p->exit_state = EXIT_DEAD; list_add(&p->ptrace_entry, dead); } } kill_orphaned_pgrp(p, father); } /* * Make init inherit all the child processes */ static void forget_original_parent(struct task_struct *father, struct list_head *dead) { struct task_struct *p, *t, *reaper; if (unlikely(!list_empty(&father->ptraced))) exit_ptrace(father, dead); /* Can drop and reacquire tasklist_lock */ reaper = find_child_reaper(father, dead); if (list_empty(&father->children)) return; reaper = find_new_reaper(father, reaper); list_for_each_entry(p, &father->children, sibling) { for_each_thread(p, t) { RCU_INIT_POINTER(t->real_parent, reaper); BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father)); if (likely(!t->ptrace)) t->parent = t->real_parent; if (t->pdeath_signal) group_send_sig_info(t->pdeath_signal, SEND_SIG_NOINFO, t, PIDTYPE_TGID); } /* * If this is a threaded reparent there is no need to * notify anyone anything has happened. */ if (!same_thread_group(reaper, father)) reparent_leader(father, p, dead); } list_splice_tail_init(&father->children, &reaper->children); } /* * Send signals to all our closest relatives so that they know * to properly mourn us.. */ static void exit_notify(struct task_struct *tsk, int group_dead) { bool autoreap; struct task_struct *p, *n; LIST_HEAD(dead); write_lock_irq(&tasklist_lock); forget_original_parent(tsk, &dead); if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); tsk->exit_state = EXIT_ZOMBIE; if (unlikely(tsk->ptrace)) { int sig = thread_group_empty(tsk) && !ptrace_reparented(tsk) ? 
tsk->exit_signal : SIGCHLD; autoreap = do_notify_parent(tsk, sig); } else if (thread_group_leader(tsk)) { autoreap = thread_group_empty(tsk) && do_notify_parent(tsk, tsk->exit_signal); } else { autoreap = true; /* untraced sub-thread */ do_notify_pidfd(tsk); } if (autoreap) { tsk->exit_state = EXIT_DEAD; list_add(&tsk->ptrace_entry, &dead); } /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) wake_up_process(tsk->signal->group_exec_task); write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, &dead, ptrace_entry) { list_del_init(&p->ptrace_entry); release_task(p); } } #ifdef CONFIG_DEBUG_STACK_USAGE #ifdef CONFIG_STACK_GROWSUP unsigned long stack_not_used(struct task_struct *p) { unsigned long *n = end_of_stack(p); do { /* Skip over canary */ n--; } while (!*n); return (unsigned long)end_of_stack(p) - (unsigned long)n; } #else /* !CONFIG_STACK_GROWSUP */ unsigned long stack_not_used(struct task_struct *p) { unsigned long *n = end_of_stack(p); do { /* Skip over canary */ n++; } while (!*n); return (unsigned long)n - (unsigned long)end_of_stack(p); } #endif /* CONFIG_STACK_GROWSUP */ /* Count the maximum pages reached in kernel stacks */ static inline void kstack_histogram(unsigned long used_stack) { #ifdef CONFIG_VM_EVENT_COUNTERS if (used_stack <= 1024) count_vm_event(KSTACK_1K); #if THREAD_SIZE > 1024 else if (used_stack <= 2048) count_vm_event(KSTACK_2K); #endif #if THREAD_SIZE > 2048 else if (used_stack <= 4096) count_vm_event(KSTACK_4K); #endif #if THREAD_SIZE > 4096 else if (used_stack <= 8192) count_vm_event(KSTACK_8K); #endif #if THREAD_SIZE > 8192 else if (used_stack <= 16384) count_vm_event(KSTACK_16K); #endif #if THREAD_SIZE > 16384 else if (used_stack <= 32768) count_vm_event(KSTACK_32K); #endif #if THREAD_SIZE > 32768 else if (used_stack <= 65536) count_vm_event(KSTACK_64K); #endif #if THREAD_SIZE > 65536 else count_vm_event(KSTACK_REST); #endif #endif /* CONFIG_VM_EVENT_COUNTERS */ } static void check_stack_usage(void) { static DEFINE_SPINLOCK(low_water_lock); static int lowest_to_date = THREAD_SIZE; unsigned long free; free = stack_not_used(current); kstack_histogram(THREAD_SIZE - free); if (free >= lowest_to_date) return; spin_lock(&low_water_lock); if (free < lowest_to_date) { pr_info("%s (%d) used greatest stack depth: %lu bytes left\n", current->comm, task_pid_nr(current), free); lowest_to_date = free; } spin_unlock(&low_water_lock); } #else /* !CONFIG_DEBUG_STACK_USAGE */ static inline void check_stack_usage(void) {} #endif /* CONFIG_DEBUG_STACK_USAGE */ static void synchronize_group_exit(struct task_struct *tsk, long code) { struct sighand_struct *sighand = tsk->sighand; struct signal_struct *signal = tsk->signal; struct core_state *core_state; spin_lock_irq(&sighand->siglock); signal->quick_threads--; if ((signal->quick_threads == 0) && !(signal->flags & SIGNAL_GROUP_EXIT)) { signal->flags = SIGNAL_GROUP_EXIT; signal->group_exit_code = code; signal->group_stop_count = 0; } /* * Serialize with any possible pending coredump. * We must hold siglock around checking core_state * and setting PF_POSTCOREDUMP. The core-inducing thread * will increment ->nr_threads for each thread in the * group without PF_POSTCOREDUMP set. 
*/ tsk->flags |= PF_POSTCOREDUMP; core_state = signal->core_state; spin_unlock_irq(&sighand->siglock); if (unlikely(core_state)) coredump_task_exit(tsk, core_state); } void __noreturn do_exit(long code) { struct task_struct *tsk = current; struct kthread *kthread; int group_dead; WARN_ON(irqs_disabled()); WARN_ON(tsk->plug); kthread = tsk_is_kthread(tsk); if (unlikely(kthread)) kthread_do_exit(kthread, code); kcov_task_exit(tsk); kmsan_task_exit(tsk); synchronize_group_exit(tsk, code); ptrace_event(PTRACE_EVENT_EXIT, code); user_events_exit(tsk); io_uring_files_cancel(); sched_mm_cid_exit(tsk); exit_signals(tsk); /* sets PF_EXITING */ seccomp_filter_release(tsk); acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { /* * If the last thread of global init has exited, panic * immediately to get a useable coredump. */ if (unlikely(is_global_init(tsk))) panic("Attempted to kill init! exitcode=0x%08x\n", tsk->signal->group_exit_code ?: (int)code); #ifdef CONFIG_POSIX_TIMERS hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk); #endif if (tsk->mm) setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); } acct_collect(code, group_dead); if (group_dead) tty_audit_exit(); audit_free(tsk); tsk->exit_code = code; taskstats_exit(tsk, group_dead); trace_sched_process_exit(tsk, group_dead); /* * Since sampling can touch ->mm, make sure to stop everything before we * tear it down. * * Also flushes inherited counters to the parent - before the parent * gets woken up by child-exit notifications. */ perf_event_exit_task(tsk); /* * PF_EXITING (above) ensures unwind_deferred_request() will no * longer add new unwinds. While exit_mm() (below) will destroy the * abaility to do unwinds. So flush any pending unwinds here. */ unwind_deferred_task_exit(tsk); exit_mm(); if (group_dead) acct_process(); exit_sem(tsk); exit_shm(tsk); exit_files(tsk); exit_fs(tsk); if (group_dead) disassociate_ctty(1); exit_nsproxy_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); sched_autogroup_exit_task(tsk); cgroup_task_exit(tsk); /* * FIXME: do that only when needed, using sched_exit tracepoint */ flush_ptrace_hw_breakpoint(tsk); exit_tasks_rcu_start(); exit_notify(tsk, group_dead); proc_exit_connector(tsk); mpol_put_task_policy(tsk); #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); #endif /* * Make sure we are holding no locks: */ debug_check_no_locks_held(); if (tsk->io_context) exit_io_context(tsk); if (tsk->splice_pipe) free_pipe_info(tsk->splice_pipe); if (tsk->task_frag.page) put_page(tsk->task_frag.page); exit_task_stack_account(tsk); check_stack_usage(); preempt_disable(); if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); exit_tasks_rcu_finish(); lockdep_free_task(tsk); do_task_dead(); } EXPORT_SYMBOL(do_exit); void __noreturn make_task_dead(int signr) { /* * Take the task off the cpu after something catastrophic has * happened. * * We can get here from a kernel oops, sometimes with preemption off. * Start by checking for critical errors. * Then fix up important state like USER_DS and preemption. * Then do everything else. 
*/ struct task_struct *tsk = current; unsigned int limit; if (unlikely(in_interrupt())) panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); if (unlikely(irqs_disabled())) { pr_info("note: %s[%d] exited with irqs disabled\n", current->comm, task_pid_nr(current)); local_irq_enable(); } if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), preempt_count()); preempt_count_set(PREEMPT_ENABLED); } /* * Every time the system oopses, if the oops happens while a reference * to an object was held, the reference leaks. * If the oops doesn't also leak memory, repeated oopsing can cause * reference counters to wrap around (if they're not using refcount_t). * This means that repeated oopsing can make unexploitable-looking bugs * exploitable through repeated oopsing. * To make sure this can't happen, place an upper bound on how often the * kernel may oops without panic(). */ limit = READ_ONCE(oops_limit); if (atomic_inc_return(&oops_count) >= limit && limit) panic("Oopsed too often (kernel.oops_limit is %d)", limit); /* * We're taking recursive faults here in make_task_dead. Safest is to just * leave this task alone and wait for reboot. */ if (unlikely(tsk->flags & PF_EXITING)) { pr_alert("Fixing recursive fault but reboot is needed!\n"); futex_exit_recursive(tsk); tsk->exit_state = EXIT_DEAD; refcount_inc(&tsk->rcu_users); do_task_dead(); } do_exit(signr); } SYSCALL_DEFINE1(exit, int, error_code) { do_exit((error_code&0xff)<<8); } /* * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). */ void __noreturn do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; if (sig->flags & SIGNAL_GROUP_EXIT) exit_code = sig->group_exit_code; else if (sig->group_exec_task) exit_code = 0; else { struct sighand_struct *const sighand = current->sighand; spin_lock_irq(&sighand->siglock); if (sig->flags & SIGNAL_GROUP_EXIT) /* Another thread got here before we took the lock. */ exit_code = sig->group_exit_code; else if (sig->group_exec_task) exit_code = 0; else { sig->group_exit_code = exit_code; sig->flags = SIGNAL_GROUP_EXIT; zap_other_threads(current); } spin_unlock_irq(&sighand->siglock); } do_exit(exit_code); /* NOTREACHED */ } /* * this kills every thread in the thread group. Note that any externally * wait4()-ing process will get the correct exit code - even if this * thread is not the thread group leader. */ SYSCALL_DEFINE1(exit_group, int, error_code) { do_group_exit((error_code & 0xff) << 8); /* NOTREACHED */ return 0; } static int eligible_pid(struct wait_opts *wo, struct task_struct *p) { return wo->wo_type == PIDTYPE_MAX || task_pid_type(p, wo->wo_type) == wo->wo_pid; } static int eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) { if (!eligible_pid(wo, p)) return 0; /* * Wait for all children (clone and not) if __WALL is set or * if it is traced by us. */ if (ptrace || (wo->wo_flags & __WALL)) return 1; /* * Otherwise, wait for clone children *only* if __WCLONE is set; * otherwise, wait for non-clone children *only*. * * Note: a "clone" child here is one that reports to its parent * using a signal other than SIGCHLD, or a non-leader thread which * we can only see if it is traced by us. */ if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) return 0; return 1; } /* * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold * read_lock(&tasklist_lock) on entry. 
If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) { int state, status; pid_t pid = task_pid_vnr(p); uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); struct waitid_info *infop; if (!likely(wo->wo_flags & WEXITED)) return 0; if (unlikely(wo->wo_flags & WNOWAIT)) { status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); goto out_info; } /* * Move the task's state to DEAD/TRACE, only one thread can do this. */ state = (ptrace_reparented(p) && thread_group_leader(p)) ? EXIT_TRACE : EXIT_DEAD; if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; /* * We own this thread, nobody else can reap it. */ read_unlock(&tasklist_lock); sched_annotate_sleep(); /* * Check thread_group_leader() to exclude the traced sub-threads. */ if (state == EXIT_DEAD && thread_group_leader(p)) { struct signal_struct *sig = p->signal; struct signal_struct *psig = current->signal; unsigned long maxrss; u64 tgutime, tgstime; /* * The resource counters for the group leader are in its * own task_struct. Those for dead threads in the group * are in its signal_struct, as are those for the child * processes it has previously reaped. All these * accumulate in the parent's signal_struct c* fields. * * We don't bother to take a lock here to protect these * p->signal fields because the whole thread group is dead * and nobody can change them. * * psig->stats_lock also protects us from our sub-threads * which can reap other children at the same time. * * We use thread_group_cputime_adjusted() to get times for * the thread group, which consolidates times for all threads * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); write_seqlock_irq(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; psig->cmin_flt += p->min_flt + sig->min_flt + sig->cmin_flt; psig->cmaj_flt += p->maj_flt + sig->maj_flt + sig->cmaj_flt; psig->cnvcsw += p->nvcsw + sig->nvcsw + sig->cnvcsw; psig->cnivcsw += p->nivcsw + sig->nivcsw + sig->cnivcsw; psig->cinblock += task_io_get_inblock(p) + sig->inblock + sig->cinblock; psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; maxrss = max(sig->maxrss, sig->cmaxrss); if (psig->cmaxrss < maxrss) psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); write_sequnlock_irq(&psig->stats_lock); } if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? 
p->signal->group_exit_code : p->exit_code; wo->wo_stat = status; if (state == EXIT_TRACE) { write_lock_irq(&tasklist_lock); /* We dropped tasklist, ptracer could die and untrace */ ptrace_unlink(p); /* If parent wants a zombie, don't release it now */ state = EXIT_ZOMBIE; if (do_notify_parent(p, p->exit_signal)) state = EXIT_DEAD; p->exit_state = state; write_unlock_irq(&tasklist_lock); } if (state == EXIT_DEAD) release_task(p); out_info: infop = wo->wo_info; if (infop) { if ((status & 0x7f) == 0) { infop->cause = CLD_EXITED; infop->status = status >> 8; } else { infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; infop->status = status & 0x7f; } infop->pid = pid; infop->uid = uid; } return pid; } static int *task_stopped_code(struct task_struct *p, bool ptrace) { if (ptrace) { if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING)) return &p->exit_code; } else { if (p->signal->flags & SIGNAL_STOP_STOPPED) return &p->signal->group_exit_code; } return NULL; } /** * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED * @wo: wait options * @ptrace: is the wait for ptrace * @p: task to wait for * * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED. * * CONTEXT: * read_lock(&tasklist_lock), which is released if return value is * non-zero. Also, grabs and releases @p->sighand->siglock. * * RETURNS: * 0 if wait condition didn't exist and search for other wait conditions * should continue. Non-zero return, -errno on failure and @p's pid on * success, implies that tasklist_lock is released and wait condition * search should terminate. */ static int wait_task_stopped(struct wait_opts *wo, int ptrace, struct task_struct *p) { struct waitid_info *infop; int exit_code, *p_code, why; uid_t uid = 0; /* unneeded, required by compiler */ pid_t pid; /* * Traditionally we see ptrace'd stopped tasks regardless of options. */ if (!ptrace && !(wo->wo_flags & WUNTRACED)) return 0; if (!task_stopped_code(p, ptrace)) return 0; exit_code = 0; spin_lock_irq(&p->sighand->siglock); p_code = task_stopped_code(p, ptrace); if (unlikely(!p_code)) goto unlock_sig; exit_code = *p_code; if (!exit_code) goto unlock_sig; if (!unlikely(wo->wo_flags & WNOWAIT)) *p_code = 0; uid = from_kuid_munged(current_user_ns(), task_uid(p)); unlock_sig: spin_unlock_irq(&p->sighand->siglock); if (!exit_code) return 0; /* * Now we are pretty sure this task is interesting. * Make sure it doesn't get reaped out from under us while we * give up the lock and then examine it below. We don't want to * keep holding onto the tasklist_lock while we call getrusage and * possibly take page faults for user memory. */ get_task_struct(p); pid = task_pid_vnr(p); why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); if (likely(!(wo->wo_flags & WNOWAIT))) wo->wo_stat = (exit_code << 8) | 0x7f; infop = wo->wo_info; if (infop) { infop->cause = why; infop->status = exit_code; infop->pid = pid; infop->uid = uid; } return pid; } /* * Handle do_wait work for one task in a live, non-stopped state. * read_lock(&tasklist_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. 
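 *
 * From userspace this is the WCONTINUED case; a sketch of the usual
 * consumer, assuming an already-known child pid:
 *
 *	if (waitpid(pid, &status, WCONTINUED) > 0 && WIFCONTINUED(status))
 *		the child was resumed by SIGCONT after a stop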
*/ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) { struct waitid_info *infop; pid_t pid; uid_t uid; if (!unlikely(wo->wo_flags & WCONTINUED)) return 0; if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) return 0; spin_lock_irq(&p->sighand->siglock); /* Re-check with the lock held. */ if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) { spin_unlock_irq(&p->sighand->siglock); return 0; } if (!unlikely(wo->wo_flags & WNOWAIT)) p->signal->flags &= ~SIGNAL_STOP_CONTINUED; uid = from_kuid_munged(current_user_ns(), task_uid(p)); spin_unlock_irq(&p->sighand->siglock); pid = task_pid_vnr(p); get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); infop = wo->wo_info; if (!infop) { wo->wo_stat = 0xffff; } else { infop->cause = CLD_CONTINUED; infop->pid = pid; infop->uid = uid; infop->status = SIGCONT; } return pid; } /* * Consider @p for a wait by @parent. * * -ECHILD should be in ->notask_error before the first call. * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; * then ->notask_error is 0 if @p is an eligible child, * or still -ECHILD. */ static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) { /* * We can race with wait_task_zombie() from another thread. * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition * can't confuse the checks below. */ int exit_state = READ_ONCE(p->exit_state); int ret; if (unlikely(exit_state == EXIT_DEAD)) return 0; ret = eligible_child(wo, ptrace, p); if (!ret) return ret; if (unlikely(exit_state == EXIT_TRACE)) { /* * ptrace == 0 means we are the natural parent. In this case * we should clear notask_error, debugger will notify us. */ if (likely(!ptrace)) wo->notask_error = 0; return 0; } if (likely(!ptrace) && unlikely(p->ptrace)) { /* * If it is traced by its real parent's group, just pretend * the caller is ptrace_do_wait() and reap this child if it * is zombie. * * This also hides group stop state from real parent; otherwise * a single stop can be reported twice as group and ptrace stop. * If a ptracer wants to distinguish these two events for its * own children it should create a separate process which takes * the role of real parent. */ if (!ptrace_reparented(p)) ptrace = 1; } /* slay zombie? */ if (exit_state == EXIT_ZOMBIE) { /* we don't reap group leaders with subthreads */ if (!delay_group_leader(p)) { /* * A zombie ptracee is only visible to its ptracer. * Notification and reaping will be cascaded to the * real parent when the ptracer detaches. */ if (unlikely(ptrace) || likely(!p->ptrace)) return wait_task_zombie(wo, p); } /* * Allow access to stopped/continued state via zombie by * falling through. Clearing of notask_error is complex. * * When !@ptrace: * * If WEXITED is set, notask_error should naturally be * cleared. If not, subset of WSTOPPED|WCONTINUED is set, * so, if there are live subthreads, there are events to * wait for. If all subthreads are dead, it's still safe * to clear - this function will be called again in finite * amount time once all the subthreads are released and * will then return without clearing. * * When @ptrace: * * Stopped state is per-task and thus can't change once the * target task dies. Only continued and exited can happen. * Clear notask_error if WCONTINUED | WEXITED. 
*/ if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) wo->notask_error = 0; } else { /* * @p is alive and it's gonna stop, continue or exit, so * there always is something to wait for. */ wo->notask_error = 0; } /* * Wait for stopped. Depending on @ptrace, different stopped state * is used and the two don't interact with each other. */ ret = wait_task_stopped(wo, ptrace, p); if (ret) return ret; /* * Wait for continued. There's only one continued state and the * ptracer can consume it which can confuse the real parent. Don't * use WCONTINUED from ptracer. You don't need or want it. */ return wait_task_continued(wo, p); } /* * Do the work of do_wait() for one thread in the group, @tsk. * * -ECHILD should be in ->notask_error before the first call. * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; then * ->notask_error is 0 if there were any eligible children, * or still -ECHILD. */ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) { struct task_struct *p; list_for_each_entry(p, &tsk->children, sibling) { int ret = wait_consider_task(wo, 0, p); if (ret) return ret; } return 0; } static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) { struct task_struct *p; list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { int ret = wait_consider_task(wo, 1, p); if (ret) return ret; } return 0; } bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p) { if (!eligible_pid(wo, p)) return false; if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent) return false; return true; } static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct wait_opts *wo = container_of(wait, struct wait_opts, child_wait); struct task_struct *p = key; if (pid_child_should_wake(wo, p)) return default_wake_function(wait, mode, sync, key); return 0; } void __wake_up_parent(struct task_struct *p, struct task_struct *parent) { __wake_up_sync_key(&parent->signal->wait_chldexit, TASK_INTERRUPTIBLE, p); } static bool is_effectively_child(struct wait_opts *wo, bool ptrace, struct task_struct *target) { struct task_struct *parent = !ptrace ? target->real_parent : target->parent; return current == parent || (!(wo->wo_flags & __WNOTHREAD) && same_thread_group(current, parent)); } /* * Optimization for waiting on PIDTYPE_PID. No need to iterate through child * and tracee lists to find the target task. */ static int do_wait_pid(struct wait_opts *wo) { bool ptrace; struct task_struct *target; int retval; ptrace = false; target = pid_task(wo->wo_pid, PIDTYPE_TGID); if (target && is_effectively_child(wo, ptrace, target)) { retval = wait_consider_task(wo, ptrace, target); if (retval) return retval; } ptrace = true; target = pid_task(wo->wo_pid, PIDTYPE_PID); if (target && target->ptrace && is_effectively_child(wo, ptrace, target)) { retval = wait_consider_task(wo, ptrace, target); if (retval) return retval; } return 0; } long __do_wait(struct wait_opts *wo) { long retval; /* * If there is nothing that can match our criteria, just get out. * We will clear ->notask_error to zero if we see any child that * might later match our criteria, even if we are not able to reap * it yet. 
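 *
 * This is what userspace ultimately observes as ECHILD, e.g.
 * (illustrative only):
 *
 *	if (waitpid(-1, &status, 0) < 0 && errno == ECHILD)
 *		there are no children left to wait for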
*/ wo->notask_error = -ECHILD; if ((wo->wo_type < PIDTYPE_MAX) && (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type))) goto notask; read_lock(&tasklist_lock); if (wo->wo_type == PIDTYPE_PID) { retval = do_wait_pid(wo); if (retval) return retval; } else { struct task_struct *tsk = current; do { retval = do_wait_thread(wo, tsk); if (retval) return retval; retval = ptrace_do_wait(wo, tsk); if (retval) return retval; if (wo->wo_flags & __WNOTHREAD) break; } while_each_thread(current, tsk); } read_unlock(&tasklist_lock); notask: retval = wo->notask_error; if (!retval && !(wo->wo_flags & WNOHANG)) return -ERESTARTSYS; return retval; } static long do_wait(struct wait_opts *wo) { int retval; trace_sched_process_wait(wo->wo_pid); init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); wo->child_wait.private = current; add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); do { set_current_state(TASK_INTERRUPTIBLE); retval = __do_wait(wo); if (retval != -ERESTARTSYS) break; if (signal_pending(current)) break; schedule(); } while (1); __set_current_state(TASK_RUNNING); remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); return retval; } int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { unsigned int f_flags = 0; struct pid *pid = NULL; enum pid_type type; if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) return -EINVAL; switch (which) { case P_ALL: type = PIDTYPE_MAX; break; case P_PID: type = PIDTYPE_PID; if (upid <= 0) return -EINVAL; pid = find_get_pid(upid); break; case P_PGID: type = PIDTYPE_PGID; if (upid < 0) return -EINVAL; if (upid) pid = find_get_pid(upid); else pid = get_task_pid(current, PIDTYPE_PGID); break; case P_PIDFD: type = PIDTYPE_PID; if (upid < 0) return -EINVAL; pid = pidfd_get_pid(upid, &f_flags); if (IS_ERR(pid)) return PTR_ERR(pid); break; default: return -EINVAL; } wo->wo_type = type; wo->wo_pid = pid; wo->wo_flags = options; wo->wo_info = infop; wo->wo_rusage = ru; if (f_flags & O_NONBLOCK) wo->wo_flags |= WNOHANG; return 0; } static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { struct wait_opts wo; long ret; ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru); if (ret) return ret; ret = do_wait(&wo); if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG)) ret = -EAGAIN; put_pid(wo.wo_pid); return ret; } SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, infop, int, options, struct rusage __user *, ru) { struct rusage r; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, upid, &info, options, ru ?
&r : NULL); int signo = 0; if (err > 0) { signo = SIGCHLD; err = 0; if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } if (!infop) return err; if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); user_write_access_end(); return err; Efault: user_write_access_end(); return -EFAULT; } long kernel_wait4(pid_t upid, int __user *stat_addr, int options, struct rusage *ru) { struct wait_opts wo; struct pid *pid = NULL; enum pid_type type; long ret; if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; /* -INT_MIN is not defined */ if (upid == INT_MIN) return -ESRCH; if (upid == -1) type = PIDTYPE_MAX; else if (upid < 0) { type = PIDTYPE_PGID; pid = find_get_pid(-upid); } else if (upid == 0) { type = PIDTYPE_PGID; pid = get_task_pid(current, PIDTYPE_PGID); } else /* upid > 0 */ { type = PIDTYPE_PID; pid = find_get_pid(upid); } wo.wo_type = type; wo.wo_pid = pid; wo.wo_flags = options | WEXITED; wo.wo_info = NULL; wo.wo_stat = 0; wo.wo_rusage = ru; ret = do_wait(&wo); put_pid(pid); if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr)) ret = -EFAULT; return ret; } int kernel_wait(pid_t pid, int *stat) { struct wait_opts wo = { .wo_type = PIDTYPE_PID, .wo_pid = find_get_pid(pid), .wo_flags = WEXITED, }; int ret; ret = do_wait(&wo); if (ret > 0 && wo.wo_stat) *stat = wo.wo_stat; put_pid(wo.wo_pid); return ret; } SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, int, options, struct rusage __user *, ru) { struct rusage r; long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL); if (err > 0) { if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } return err; } #ifdef __ARCH_WANT_SYS_WAITPID /* * sys_waitpid() remains for compatibility. waitpid() should be * implemented by calling sys_wait4() from libc.a. */ SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) { return kernel_wait4(pid, stat_addr, options, NULL); } #endif #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(wait4, compat_pid_t, pid, compat_uint_t __user *, stat_addr, int, options, struct compat_rusage __user *, ru) { struct rusage r; long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL); if (err > 0) { if (ru && put_compat_rusage(&r, ru)) return -EFAULT; } return err; } COMPAT_SYSCALL_DEFINE5(waitid, int, which, compat_pid_t, pid, struct compat_siginfo __user *, infop, int, options, struct compat_rusage __user *, uru) { struct rusage ru; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, pid, &info, options, uru ? 
&ru : NULL); int signo = 0; if (err > 0) { signo = SIGCHLD; err = 0; if (uru) { /* kernel_waitid() overwrites everything in ru */ if (COMPAT_USE_64BIT_TIME) err = copy_to_user(uru, &ru, sizeof(ru)); else err = put_compat_rusage(&ru, uru); if (err) return -EFAULT; } } if (!infop) return err; if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); user_write_access_end(); return err; Efault: user_write_access_end(); return -EFAULT; } #endif /* * This needs to be __function_aligned as GCC implicitly makes any * implementation of abort() cold and drops alignment specified by * -falign-functions=N. * * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c11 */ __weak __function_aligned void abort(void) { BUG(); /* if that doesn't kill us, halt */ panic("Oops failed to kill thread"); } EXPORT_SYMBOL(abort);
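/*
 * Purely illustrative userspace sketch (not part of kernel/exit.c): one common
 * way the waitid() entry point above is exercised. The siginfo fields printed
 * here are the ones the syscall fills via its unsafe_put_user() sequence, so
 * si_code/si_status correspond to info.cause/info.status on the kernel side.
 */
#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

int main(void)
{
	pid_t child = fork();

	if (child < 0) {
		perror("fork");
		return 1;
	}
	if (child == 0)
		_exit(42);	/* child: exit with a recognisable status */

	siginfo_t info = { 0 };

	/* Block until the child changes state; WEXITED selects exit events. */
	if (waitid(P_PID, child, &info, WEXITED) < 0) {
		perror("waitid");
		return 1;
	}

	/* Here si_code is CLD_EXITED and si_status is 42. */
	printf("pid %d: si_code=%d si_status=%d\n",
	       info.si_pid, info.si_code, info.si_status);
	return 0;
}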
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Definitions for the 'struct sk_buff' memory handlers. * * Authors: * Alan Cox, <gw4pts@gw4pts.ampr.org> * Florian La Roche, <rzsfl@rz.uni-sb.de> */ #ifndef _LINUX_SKBUFF_H #define _LINUX_SKBUFF_H #include <linux/kernel.h> #include <linux/compiler.h> #include <linux/time.h> #include <linux/bug.h> #include <linux/bvec.h> #include <linux/cache.h> #include <linux/rbtree.h> #include <linux/socket.h> #include <linux/refcount.h> #include <linux/atomic.h> #include <asm/types.h> #include <linux/spinlock.h> #include <net/checksum.h> #include <linux/rcupdate.h> #include <linux/dma-mapping.h> #include <linux/netdev_features.h> #include <net/flow_dissector.h> #include <linux/in6.h> #include <linux/if_packet.h> #include <linux/llist.h> #include <linux/page_frag_cache.h> #include <net/flow.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <linux/netfilter/nf_conntrack_common.h> #endif #include <net/net_debug.h> #include <net/dropreason-core.h> #include <net/netmem.h> /** * DOC: skb checksums * * The interface for checksum offload between the stack and networking drivers * is as follows... * * IP checksum related features * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * Drivers advertise checksum offload capabilities in the features of a device. * From the stack's point of view these are capabilities offered by the driver. * A driver typically only advertises features that it is capable of offloading * to its device. * * .. flat-table:: Checksum related device features * :widths: 1 10 * * * - %NETIF_F_HW_CSUM * - The driver (or its device) is able to compute one * IP (one's complement) checksum for any combination * of protocols or protocol layering. The checksum is * computed and set in a packet per the CHECKSUM_PARTIAL * interface (see below). * * * - %NETIF_F_IP_CSUM * - Driver (device) is only able to checksum plain * TCP or UDP packets over IPv4.
These are specifically * unencapsulated packets of the form IPv4|TCP or * IPv4|UDP where the Protocol field in the IPv4 header * is TCP or UDP. The IPv4 header may contain IP options. * This feature cannot be set in features for a device * with NETIF_F_HW_CSUM also set. This feature is being * DEPRECATED (see below). * * * - %NETIF_F_IPV6_CSUM * - Driver (device) is only able to checksum plain * TCP or UDP packets over IPv6. These are specifically * unencapsulated packets of the form IPv6|TCP or * IPv6|UDP where the Next Header field in the IPv6 * header is either TCP or UDP. IPv6 extension headers * are not supported with this feature. This feature * cannot be set in features for a device with * NETIF_F_HW_CSUM also set. This feature is being * DEPRECATED (see below). * * * - %NETIF_F_RXCSUM * - Driver (device) performs receive checksum offload. * This flag is only used to disable the RX checksum * feature for a device. The stack will accept receive * checksum indication in packets received on a device * regardless of whether NETIF_F_RXCSUM is set. * * Checksumming of received packets by device * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * Indication of checksum verification is set in &sk_buff.ip_summed. * Possible values are: * * - %CHECKSUM_NONE * * Device did not checksum this packet e.g. due to lack of capabilities. * The packet contains full (though not verified) checksum in packet but * not in skb->csum. Thus, skb->csum is undefined in this case. * * - %CHECKSUM_UNNECESSARY * * The hardware you're dealing with doesn't calculate the full checksum * (as in %CHECKSUM_COMPLETE), but it does parse headers and verify checksums * for specific protocols. For such packets it will set %CHECKSUM_UNNECESSARY * if their checksums are okay. &sk_buff.csum is still undefined in this case * though. A driver or device must never modify the checksum field in the * packet even if checksum is verified. * * %CHECKSUM_UNNECESSARY is applicable to following protocols: * * - TCP: IPv6 and IPv4. * - UDP: IPv4 and IPv6. A device may apply CHECKSUM_UNNECESSARY to a * zero UDP checksum for either IPv4 or IPv6, the networking stack * may perform further validation in this case. * - GRE: only if the checksum is present in the header. * - SCTP: indicates the CRC in SCTP header has been validated. * - FCOE: indicates the CRC in FC frame has been validated. * * &sk_buff.csum_level indicates the number of consecutive checksums found in * the packet minus one that have been verified as %CHECKSUM_UNNECESSARY. * For instance if a device receives an IPv6->UDP->GRE->IPv4->TCP packet * and a device is able to verify the checksums for UDP (possibly zero), * GRE (checksum flag is set) and TCP, &sk_buff.csum_level would be set to * two. If the device were only able to verify the UDP checksum and not * GRE, either because it doesn't support GRE checksum or because GRE * checksum is bad, skb->csum_level would be set to zero (TCP checksum is * not considered in this case). * * - %CHECKSUM_COMPLETE * * This is the most generic way. The device supplied checksum of the _whole_ * packet as seen by netif_rx() and fills in &sk_buff.csum. This means the * hardware doesn't need to parse L3/L4 headers to implement this. * * Notes: * * - Even if device supports only some protocols, but is able to produce * skb->csum, it MUST use CHECKSUM_COMPLETE, not CHECKSUM_UNNECESSARY. * - CHECKSUM_COMPLETE is not applicable to SCTP and FCoE protocols. 
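 *
 *   A purely illustrative sketch (rx_desc and its csum field are hypothetical,
 *   not an interface defined here) of a receive path reporting the whole-packet
 *   checksum described above for %CHECKSUM_COMPLETE::
 *
 *	skb->csum = csum_unfold(rx_desc->csum);
 *	skb->ip_summed = CHECKSUM_COMPLETE;
 *	netif_rx(skb);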
* * - %CHECKSUM_PARTIAL * * A checksum is set up to be offloaded to a device as described in the * output description for CHECKSUM_PARTIAL. This may occur on a packet * received directly from another Linux OS, e.g., a virtualized Linux kernel * on the same host, or it may be set in the input path in GRO or remote * checksum offload. For the purposes of checksum verification, the checksum * referred to by skb->csum_start + skb->csum_offset and any preceding * checksums in the packet are considered verified. Any checksums in the * packet that are after the checksum being offloaded are not considered to * be verified. * * Checksumming on transmit for non-GSO * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * The stack requests checksum offload in the &sk_buff.ip_summed for a packet. * Values are: * * - %CHECKSUM_PARTIAL * * The driver is required to checksum the packet as seen by hard_start_xmit() * from &sk_buff.csum_start up to the end, and to record/write the checksum at * offset &sk_buff.csum_start + &sk_buff.csum_offset. * A driver may verify that the * csum_start and csum_offset values are valid values given the length and * offset of the packet, but it should not attempt to validate that the * checksum refers to a legitimate transport layer checksum -- it is the * purview of the stack to validate that csum_start and csum_offset are set * correctly. * * When the stack requests checksum offload for a packet, the driver MUST * ensure that the checksum is set correctly. A driver can either offload the * checksum calculation to the device, or call skb_checksum_help (in the case * that the device does not support offload for a particular checksum). * * %NETIF_F_IP_CSUM and %NETIF_F_IPV6_CSUM are being deprecated in favor of * %NETIF_F_HW_CSUM. New devices should use %NETIF_F_HW_CSUM to indicate * checksum offload capability. * skb_csum_hwoffload_help() can be called to resolve %CHECKSUM_PARTIAL based * on network device checksumming capabilities: if a packet does not match * them, skb_checksum_help() or skb_crc32c_help() (depending on the value of * &sk_buff.csum_not_inet, see :ref:`crc`) * is called to resolve the checksum. * * - %CHECKSUM_NONE * * The skb was already checksummed by the protocol, or a checksum is not * required. * * - %CHECKSUM_UNNECESSARY * * This has the same meaning as CHECKSUM_NONE for checksum offload on * output. * * - %CHECKSUM_COMPLETE * * Not used in checksum output. If a driver observes a packet with this value * set in skbuff, it should treat the packet as if %CHECKSUM_NONE were set. * * .. _crc: * * Non-IP checksum (CRC) offloads * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * .. flat-table:: * :widths: 1 10 * * * - %NETIF_F_SCTP_CRC * - This feature indicates that a device is capable of * offloading the SCTP CRC in a packet. To perform this offload the stack * will set csum_start and csum_offset accordingly, set ip_summed to * %CHECKSUM_PARTIAL and set csum_not_inet to 1, to provide an indication * in the skbuff that the %CHECKSUM_PARTIAL refers to CRC32c. * A driver that supports both IP checksum offload and SCTP CRC32c offload * must verify which offload is configured for a packet by testing the * value of &sk_buff.csum_not_inet; skb_crc32c_csum_help() is provided to * resolve %CHECKSUM_PARTIAL on skbs where csum_not_inet is set to 1. * * * - %NETIF_F_FCOE_CRC * - This feature indicates that a device is capable of offloading the FCOE * CRC in a packet. To perform this offload the stack will set ip_summed * to %CHECKSUM_PARTIAL and set csum_start and csum_offset * accordingly. 
Note that there is no indication in the skbuff that the * %CHECKSUM_PARTIAL refers to an FCOE checksum, so a driver that supports * both IP checksum offload and FCOE CRC offload must verify which offload * is configured for a packet, presumably by inspecting packet headers. * * Checksumming on output with GSO * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * In the case of a GSO packet (skb_is_gso() is true), checksum offload * is implied by the SKB_GSO_* flags in gso_type. Most obviously, if the * gso_type is %SKB_GSO_TCPV4 or %SKB_GSO_TCPV6, TCP checksum offload as * part of the GSO operation is implied. If a checksum is being offloaded * with GSO then ip_summed is %CHECKSUM_PARTIAL, and both csum_start and * csum_offset are set to refer to the outermost checksum being offloaded * (two offloaded checksums are possible with UDP encapsulation). */ /* Don't change this without changing skb_csum_unnecessary! */ #define CHECKSUM_NONE 0 #define CHECKSUM_UNNECESSARY 1 #define CHECKSUM_COMPLETE 2 #define CHECKSUM_PARTIAL 3 /* Maximum value in skb->csum_level */ #define SKB_MAX_CSUM_LEVEL 3 #define SKB_DATA_ALIGN(X) ALIGN(X, SMP_CACHE_BYTES) #define SKB_WITH_OVERHEAD(X) \ ((X) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) /* For X bytes available in skb->head, what is the minimal * allocation needed, knowing struct skb_shared_info needs * to be aligned. */ #define SKB_HEAD_ALIGN(X) (SKB_DATA_ALIGN(X) + \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) #define SKB_MAX_ORDER(X, ORDER) \ SKB_WITH_OVERHEAD((PAGE_SIZE << (ORDER)) - (X)) #define SKB_MAX_HEAD(X) (SKB_MAX_ORDER((X), 0)) #define SKB_MAX_ALLOC (SKB_MAX_ORDER(0, 2)) /* return minimum truesize of one skb containing X bytes of data */ #define SKB_TRUESIZE(X) ((X) + \ SKB_DATA_ALIGN(sizeof(struct sk_buff)) + \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) struct net_device; struct scatterlist; struct pipe_inode_info; struct iov_iter; struct napi_struct; struct bpf_prog; union bpf_attr; struct skb_ext; struct ts_config; #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) struct nf_bridge_info { enum { BRNF_PROTO_UNCHANGED, BRNF_PROTO_8021Q, BRNF_PROTO_PPPOE } orig_proto:8; u8 pkt_otherhost:1; u8 in_prerouting:1; u8 bridged_dnat:1; u8 sabotage_in_done:1; __u16 frag_max_size; int physinif; /* always valid & non-NULL from FORWARD on, for physdev match */ struct net_device *physoutdev; union { /* prerouting: detect dnat in orig/reply direction */ __be32 ipv4_daddr; struct in6_addr ipv6_daddr; /* after prerouting + nat detected: store original source * mac since neigh resolution overwrites it, only used while * skb is out in neigh layer. */ char neigh_header[8]; }; }; #endif #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) /* Chain in tc_skb_ext will be used to share the tc chain with * ovs recirc_id. It will be set to the current chain by tc * and read by ovs to recirc_id. */ struct tc_skb_ext { union { u64 act_miss_cookie; __u32 chain; }; __u16 mru; __u16 zone; u8 post_ct:1; u8 post_ct_snat:1; u8 post_ct_dnat:1; u8 act_miss:1; /* Set if act_miss_cookie is used */ u8 l2_miss:1; /* Set by bridge upon FDB or MDB miss */ }; #endif struct sk_buff_head { /* These two members must be first to match sk_buff. */ struct_group_tagged(sk_buff_list, list, struct sk_buff *next; struct sk_buff *prev; ); __u32 qlen; spinlock_t lock; }; struct sk_buff; #ifndef CONFIG_MAX_SKB_FRAGS # define CONFIG_MAX_SKB_FRAGS 17 #endif #define MAX_SKB_FRAGS CONFIG_MAX_SKB_FRAGS /* Set skb_shinfo(skb)->gso_size to this in case you want skb_segment to * segment using its current segmentation instead. 
*/ #define GSO_BY_FRAGS 0xFFFF typedef struct skb_frag { netmem_ref netmem; unsigned int len; unsigned int offset; } skb_frag_t; /** * skb_frag_size() - Returns the size of a skb fragment * @frag: skb fragment */ static inline unsigned int skb_frag_size(const skb_frag_t *frag) { return frag->len; } /** * skb_frag_size_set() - Sets the size of a skb fragment * @frag: skb fragment * @size: size of fragment */ static inline void skb_frag_size_set(skb_frag_t *frag, unsigned int size) { frag->len = size; } /** * skb_frag_size_add() - Increments the size of a skb fragment by @delta * @frag: skb fragment * @delta: value to add */ static inline void skb_frag_size_add(skb_frag_t *frag, int delta) { frag->len += delta; } /** * skb_frag_size_sub() - Decrements the size of a skb fragment by @delta * @frag: skb fragment * @delta: value to subtract */ static inline void skb_frag_size_sub(skb_frag_t *frag, int delta) { frag->len -= delta; } /** * skb_frag_must_loop - Test if %p is a high memory page * @p: fragment's page */ static inline bool skb_frag_must_loop(struct page *p) { #if defined(CONFIG_HIGHMEM) if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP) || PageHighMem(p)) return true; #endif return false; } /** * skb_frag_foreach_page - loop over pages in a fragment * * @f: skb frag to operate on * @f_off: offset from start of f->netmem * @f_len: length from f_off to loop over * @p: (temp var) current page * @p_off: (temp var) offset from start of current page, * non-zero only on first page. * @p_len: (temp var) length in current page, * < PAGE_SIZE only on first and last page. * @copied: (temp var) length so far, excluding current p_len. * * A fragment can hold a compound page, in which case per-page * operations, notably kmap_atomic, must be called for each * regular page. */ #define skb_frag_foreach_page(f, f_off, f_len, p, p_off, p_len, copied) \ for (p = skb_frag_page(f) + ((f_off) >> PAGE_SHIFT), \ p_off = (f_off) & (PAGE_SIZE - 1), \ p_len = skb_frag_must_loop(p) ? \ min_t(u32, f_len, PAGE_SIZE - p_off) : f_len, \ copied = 0; \ copied < f_len; \ copied += p_len, p++, p_off = 0, \ p_len = min_t(u32, f_len - copied, PAGE_SIZE)) \ /** * struct skb_shared_hwtstamps - hardware time stamps * @hwtstamp: hardware time stamp transformed into duration * since arbitrary point in time * @netdev_data: address/cookie of network device driver used as * reference to actual hardware time stamp * * Software time stamps generated by ktime_get_real() are stored in * skb->tstamp. * * hwtstamps can only be compared against other hwtstamps from * the same device. * * This structure is attached to packets as part of the * &skb_shared_info. Use skb_hwtstamps() to get a pointer. 
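 *
 * A minimal illustrative sketch (dev_ns, the raw nanosecond value read from
 * the device, is hypothetical) of a driver recording a hardware receive time
 * stamp before passing the packet up::
 *
 *	skb_hwtstamps(skb)->hwtstamp = ns_to_ktime(dev_ns);
 *	netif_rx(skb);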
*/ struct skb_shared_hwtstamps { union { ktime_t hwtstamp; void *netdev_data; }; }; /* Definitions for tx_flags in struct skb_shared_info */ enum { /* generate hardware time stamp */ SKBTX_HW_TSTAMP_NOBPF = 1 << 0, /* generate software time stamp when queueing packet to NIC */ SKBTX_SW_TSTAMP = 1 << 1, /* device driver is going to provide hardware time stamp */ SKBTX_IN_PROGRESS = 1 << 2, /* generate software time stamp on packet tx completion */ SKBTX_COMPLETION_TSTAMP = 1 << 3, /* determine hardware time stamp based on time or cycles */ SKBTX_HW_TSTAMP_NETDEV = 1 << 5, /* generate software time stamp when entering packet scheduling */ SKBTX_SCHED_TSTAMP = 1 << 6, /* used for bpf extension when a bpf program is loaded */ SKBTX_BPF = 1 << 7, }; #define SKBTX_HW_TSTAMP (SKBTX_HW_TSTAMP_NOBPF | SKBTX_BPF) #define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \ SKBTX_SCHED_TSTAMP | \ SKBTX_BPF | \ SKBTX_COMPLETION_TSTAMP) #define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | \ SKBTX_ANY_SW_TSTAMP) /* Definitions for flags in struct skb_shared_info */ enum { /* use zcopy routines */ SKBFL_ZEROCOPY_ENABLE = BIT(0), /* This indicates at least one fragment might be overwritten * (as in vmsplice(), sendfile() ...) * If we need to compute a TX checksum, we'll need to copy * all frags to avoid possible bad checksum */ SKBFL_SHARED_FRAG = BIT(1), /* segment contains only zerocopy data and should not be * charged to the kernel memory. */ SKBFL_PURE_ZEROCOPY = BIT(2), SKBFL_DONT_ORPHAN = BIT(3), /* page references are managed by the ubuf_info, so it's safe to * use frags only up until ubuf_info is released */ SKBFL_MANAGED_FRAG_REFS = BIT(4), }; #define SKBFL_ZEROCOPY_FRAG (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG) #define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY | \ SKBFL_DONT_ORPHAN | SKBFL_MANAGED_FRAG_REFS) struct ubuf_info_ops { void (*complete)(struct sk_buff *, struct ubuf_info *, bool zerocopy_success); /* has to be compatible with skb_zcopy_set() */ int (*link_skb)(struct sk_buff *skb, struct ubuf_info *uarg); }; /* * The callback notifies userspace to release buffers when skb DMA is done in * lower device, the skb last reference should be 0 when calling this. * The zerocopy_success argument is true if zero copy transmit occurred, * false on data copy or out of memory error caused by data copy attempt. * The ctx field is used to track device context. * The desc field is used to track userspace buffer index. */ struct ubuf_info { const struct ubuf_info_ops *ops; refcount_t refcnt; u8 flags; }; struct ubuf_info_msgzc { struct ubuf_info ubuf; union { struct { unsigned long desc; void *ctx; }; struct { u32 id; u16 len; u16 zerocopy:1; u32 bytelen; }; }; struct mmpin { struct user_struct *user; unsigned int num_pg; } mmp; }; #define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg)) #define uarg_to_msgzc(ubuf_ptr) container_of((ubuf_ptr), struct ubuf_info_msgzc, \ ubuf) int mm_account_pinned_pages(struct mmpin *mmp, size_t size); void mm_unaccount_pinned_pages(struct mmpin *mmp); /* Preserve some data across TX submission and completion. * * Note, this state is stored in the driver. Extending the layout * might need some special care. */ struct xsk_tx_metadata_compl { __u64 *tx_timestamp; }; /* This data is invariant across clones and lives at * the end of the header data, ie. at skb->end. */ struct skb_shared_info { __u8 flags; __u8 meta_len; __u8 nr_frags; __u8 tx_flags; unsigned short gso_size; /* Warning: this field is not always filled in (UFO)! 
*/ unsigned short gso_segs; struct sk_buff *frag_list; union { struct skb_shared_hwtstamps hwtstamps; struct xsk_tx_metadata_compl xsk_meta; }; unsigned int gso_type; u32 tskey; /* * Warning : all fields before dataref are cleared in __alloc_skb() */ atomic_t dataref; union { struct { u32 xdp_frags_size; u32 xdp_frags_truesize; }; /* * Intermediate layers must ensure that destructor_arg * remains valid until skb destructor. */ void *destructor_arg; }; /* must be last field, see pskb_expand_head() */ skb_frag_t frags[MAX_SKB_FRAGS]; }; /** * DOC: dataref and headerless skbs * * Transport layers send out clones of payload skbs they hold for * retransmissions. To allow lower layers of the stack to prepend their headers * we split &skb_shared_info.dataref into two halves. * The lower 16 bits count the overall number of references. * The higher 16 bits indicate how many of the references are payload-only. * skb_header_cloned() checks if skb is allowed to add / write the headers. * * The creator of the skb (e.g. TCP) marks its skb as &sk_buff.nohdr * (via __skb_header_release()). Any clone created from marked skb will get * &sk_buff.hdr_len populated with the available headroom. * If there's the only clone in existence it's able to modify the headroom * at will. The sequence of calls inside the transport layer is:: * * <alloc skb> * skb_reserve() * __skb_header_release() * skb_clone() * // send the clone down the stack * * This is not a very generic construct and it depends on the transport layers * doing the right thing. In practice there's usually only one payload-only skb. * Having multiple payload-only skbs with different lengths of hdr_len is not * possible. The payload-only skbs should never leave their owner. */ #define SKB_DATAREF_SHIFT 16 #define SKB_DATAREF_MASK ((1 << SKB_DATAREF_SHIFT) - 1) enum { SKB_FCLONE_UNAVAILABLE, /* skb has no fclone (from head_cache) */ SKB_FCLONE_ORIG, /* orig skb (from fclone_cache) */ SKB_FCLONE_CLONE, /* companion fclone skb (from fclone_cache) */ }; enum { SKB_GSO_TCPV4 = 1 << 0, /* This indicates the skb is from an untrusted source. */ SKB_GSO_DODGY = 1 << 1, /* This indicates the tcp segment has CWR set. */ SKB_GSO_TCP_ECN = 1 << 2, __SKB_GSO_TCP_FIXEDID = 1 << 3, SKB_GSO_TCPV6 = 1 << 4, SKB_GSO_FCOE = 1 << 5, SKB_GSO_GRE = 1 << 6, SKB_GSO_GRE_CSUM = 1 << 7, SKB_GSO_IPXIP4 = 1 << 8, SKB_GSO_IPXIP6 = 1 << 9, SKB_GSO_UDP_TUNNEL = 1 << 10, SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11, SKB_GSO_PARTIAL = 1 << 12, SKB_GSO_TUNNEL_REMCSUM = 1 << 13, SKB_GSO_SCTP = 1 << 14, SKB_GSO_ESP = 1 << 15, SKB_GSO_UDP = 1 << 16, SKB_GSO_UDP_L4 = 1 << 17, SKB_GSO_FRAGLIST = 1 << 18, SKB_GSO_TCP_ACCECN = 1 << 19, /* These indirectly map onto the same netdev feature. * If NETIF_F_TSO_MANGLEID is set it may mangle both inner and outer IDs. */ SKB_GSO_TCP_FIXEDID = 1 << 30, SKB_GSO_TCP_FIXEDID_INNER = 1 << 31, }; #if BITS_PER_LONG > 32 #define NET_SKBUFF_DATA_USES_OFFSET 1 #endif #ifdef NET_SKBUFF_DATA_USES_OFFSET typedef unsigned int sk_buff_data_t; #else typedef unsigned char *sk_buff_data_t; #endif enum skb_tstamp_type { SKB_CLOCK_REALTIME, SKB_CLOCK_MONOTONIC, SKB_CLOCK_TAI, __SKB_CLOCK_MAX = SKB_CLOCK_TAI, }; /** * DOC: Basic sk_buff geometry * * struct sk_buff itself is a metadata structure and does not hold any packet * data. All the data is held in associated buffers. * * &sk_buff.head points to the main "head" buffer. 
The head buffer is divided * into two parts: * * - data buffer, containing headers and sometimes payload; * this is the part of the skb operated on by the common helpers * such as skb_put() or skb_pull(); * - shared info (struct skb_shared_info) which holds an array of pointers * to read-only data in the (page, offset, length) format. * * Optionally &skb_shared_info.frag_list may point to another skb. * * Basic diagram may look like this:: * * --------------- * | sk_buff | * --------------- * ,--------------------------- + head * / ,----------------- + data * / / ,----------- + tail * | | | , + end * | | | | * v v v v * ----------------------------------------------- * | headroom | data | tailroom | skb_shared_info | * ----------------------------------------------- * + [page frag] * + [page frag] * + [page frag] * + [page frag] --------- * + frag_list --> | sk_buff | * --------- * */ /** * struct sk_buff - socket buffer * @next: Next buffer in list * @prev: Previous buffer in list * @tstamp: Time we arrived/left * @skb_mstamp_ns: (aka @tstamp) earliest departure time; start point * for retransmit timer * @rbnode: RB tree node, alternative to next/prev for netem/tcp * @list: queue head * @ll_node: anchor in an llist (eg socket defer_list) * @sk: Socket we are owned by * @dev: Device we arrived on/are leaving by * @dev_scratch: (aka @dev) alternate use of @dev when @dev would be %NULL * @cb: Control buffer. Free for use by every layer. Put private vars here * @_skb_refdst: destination entry (with norefcount bit) * @len: Length of actual data * @data_len: Data length * @mac_len: Length of link layer header * @hdr_len: writable header length of cloned skb * @csum: Checksum (must include start/offset pair) * @csum_start: Offset from skb->head where checksumming should start * @csum_offset: Offset from csum_start where checksum should be stored * @priority: Packet queueing priority * @ignore_df: allow local fragmentation * @cloned: Head may be cloned (check refcnt to be sure) * @ip_summed: Driver fed us an IP checksum * @nohdr: Payload reference only, must not modify header * @pkt_type: Packet class * @fclone: skbuff clone status * @ipvs_property: skbuff is owned by ipvs * @inner_protocol_type: whether the inner protocol is * ENCAP_TYPE_ETHER or ENCAP_TYPE_IPPROTO * @remcsum_offload: remote checksum offload is enabled * @offload_fwd_mark: Packet was L2-forwarded in hardware * @offload_l3_fwd_mark: Packet was L3-forwarded in hardware * @tc_skip_classify: do not classify packet. set by IFB device * @tc_at_ingress: used within tc_classify to distinguish in/egress * @redirected: packet was redirected by packet classifier * @from_ingress: packet was redirected from the ingress path * @nf_skip_egress: packet shall skip nf egress - see netfilter_netdev.h * @peeked: this packet has been seen already, so stats have been * done for it, don't do them again * @nf_trace: netfilter packet trace flag * @protocol: Packet protocol from driver * @destructor: Destruct function * @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue) * @_sk_redir: socket redirection information for skmsg * @_nfct: Associated connection, if any (with nfctinfo bits) * @skb_iif: ifindex of device we arrived on * @tc_index: Traffic control index * @hash: the packet hash * @queue_mapping: Queue mapping for multiqueue devices * @head_frag: skb was allocated from page fragments, * not allocated by kmalloc() or vmalloc(). 
* @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves * @pp_recycle: mark the packet for recycling instead of freeing (implies * page_pool support on driver) * @active_extensions: active extensions (skb_ext_id types) * @ndisc_nodetype: router type (from link layer) * @ooo_okay: allow the mapping of a socket to a queue to be changed * @l4_hash: indicate hash is a canonical 4-tuple hash over transport * ports. * @sw_hash: indicates hash was computed in software stack * @wifi_acked_valid: wifi_acked was set * @wifi_acked: whether frame was acked on wifi or not * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS * @encapsulation: indicates the inner headers in the skbuff are valid * @encap_hdr_csum: software checksum is needed * @csum_valid: checksum is already valid * @csum_not_inet: use CRC32c to resolve CHECKSUM_PARTIAL * @csum_complete_sw: checksum was completed by software * @csum_level: indicates the number of consecutive checksums found in * the packet minus one that have been verified as * CHECKSUM_UNNECESSARY (max 3) * @unreadable: indicates that at least 1 of the fragments in this skb is * unreadable. * @dst_pending_confirm: need to confirm neighbour * @decrypted: Decrypted SKB * @slow_gro: state present at GRO time, slower prepare step required * @tstamp_type: When set, skb->tstamp has the * delivery_time clock base of skb->tstamp. * @napi_id: id of the NAPI struct this skb came from * @sender_cpu: (aka @napi_id) source CPU in XPS * @alloc_cpu: CPU which did the skb allocation. * @secmark: security marking * @mark: Generic packet mark * @reserved_tailroom: (aka @mark) number of bytes of free space available * at the tail of an sk_buff * @vlan_all: vlan fields (proto & tci) * @vlan_proto: vlan encapsulation protocol * @vlan_tci: vlan tag control information * @inner_protocol: Protocol (encapsulation) * @inner_ipproto: (aka @inner_protocol) stores ipproto when * skb->inner_protocol_type == ENCAP_TYPE_IPPROTO; * @inner_transport_header: Inner transport layer header (encapsulation) * @inner_network_header: Network layer header (encapsulation) * @inner_mac_header: Link layer header (encapsulation) * @transport_header: Transport layer header * @network_header: Network layer header * @mac_header: Link layer header * @kcov_handle: KCOV remote handle for remote coverage collection * @tail: Tail pointer * @end: End pointer * @head: Head of buffer * @data: Data head pointer * @truesize: Buffer size * @users: User count - see {datagram,tcp}.c * @extensions: allocated extensions, valid if active_extensions is nonzero */ struct sk_buff { union { struct { /* These two members must be first to match sk_buff_head. */ struct sk_buff *next; struct sk_buff *prev; union { struct net_device *dev; /* Some protocols might use this space to store information, * while device pointer would be NULL. * UDP receive path is one user. */ unsigned long dev_scratch; }; }; struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */ struct list_head list; struct llist_node ll_node; }; struct sock *sk; union { ktime_t tstamp; u64 skb_mstamp_ns; /* earliest departure time */ }; /* * This is the control buffer. It is free to use for every * layer. Please put your private variables there. If you * want to keep them across layers you have to do a skb_clone() * first. This is owned by whoever has the skb queued ATM. 
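 *
 * Layers conventionally reach this area through a small casting helper
 * rather than indexing cb[] directly. A purely illustrative sketch (the
 * struct and macro names below are hypothetical, not part of this
 * header):
 *
 *	struct my_layer_cb {
 *		u32 seq;
 *		u8  flags;
 *	};
 *	#define MY_LAYER_CB(skb) ((struct my_layer_cb *)((skb)->cb))
 *
 *	MY_LAYER_CB(skb)->seq = next_seq;
 *
 * The cast target must fit within sizeof(skb->cb) (48 bytes); protocols
 * typically enforce that with a BUILD_BUG_ON().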
*/ char cb[48] __aligned(8); union { struct { unsigned long _skb_refdst; void (*destructor)(struct sk_buff *skb); }; struct list_head tcp_tsorted_anchor; #ifdef CONFIG_NET_SOCK_MSG unsigned long _sk_redir; #endif }; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) unsigned long _nfct; #endif unsigned int len, data_len; __u16 mac_len, hdr_len; /* Following fields are _not_ copied in __copy_skb_header() * Note that queue_mapping is here mostly to fill a hole. */ __u16 queue_mapping; /* if you move cloned around you also must adapt those constants */ #ifdef __BIG_ENDIAN_BITFIELD #define CLONED_MASK (1 << 7) #else #define CLONED_MASK 1 #endif #define CLONED_OFFSET offsetof(struct sk_buff, __cloned_offset) /* private: */ __u8 __cloned_offset[0]; /* public: */ __u8 cloned:1, nohdr:1, fclone:2, peeked:1, head_frag:1, pfmemalloc:1, pp_recycle:1; /* page_pool recycle indicator */ #ifdef CONFIG_SKB_EXTENSIONS __u8 active_extensions; #endif /* Fields enclosed in headers group are copied * using a single memcpy() in __copy_skb_header() */ struct_group(headers, /* private: */ __u8 __pkt_type_offset[0]; /* public: */ __u8 pkt_type:3; /* see PKT_TYPE_MAX */ __u8 ignore_df:1; __u8 dst_pending_confirm:1; __u8 ip_summed:2; __u8 ooo_okay:1; /* private: */ __u8 __mono_tc_offset[0]; /* public: */ __u8 tstamp_type:2; /* See skb_tstamp_type */ #ifdef CONFIG_NET_XGRESS __u8 tc_at_ingress:1; /* See TC_AT_INGRESS_MASK */ __u8 tc_skip_classify:1; #endif __u8 remcsum_offload:1; __u8 csum_complete_sw:1; __u8 csum_level:2; __u8 inner_protocol_type:1; __u8 l4_hash:1; __u8 sw_hash:1; #ifdef CONFIG_WIRELESS __u8 wifi_acked_valid:1; __u8 wifi_acked:1; #endif __u8 no_fcs:1; /* Indicates the inner headers are valid in the skbuff. */ __u8 encapsulation:1; __u8 encap_hdr_csum:1; __u8 csum_valid:1; #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; #endif #if IS_ENABLED(CONFIG_IP_VS) __u8 ipvs_property:1; #endif #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES) __u8 nf_trace:1; #endif #ifdef CONFIG_NET_SWITCHDEV __u8 offload_fwd_mark:1; __u8 offload_l3_fwd_mark:1; #endif __u8 redirected:1; #ifdef CONFIG_NET_REDIRECT __u8 from_ingress:1; #endif #ifdef CONFIG_NETFILTER_SKIP_EGRESS __u8 nf_skip_egress:1; #endif #ifdef CONFIG_SKB_DECRYPTED __u8 decrypted:1; #endif __u8 slow_gro:1; #if IS_ENABLED(CONFIG_IP_SCTP) __u8 csum_not_inet:1; #endif __u8 unreadable:1; #if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS) __u16 tc_index; /* traffic control index */ #endif u16 alloc_cpu; union { __wsum csum; struct { __u16 csum_start; __u16 csum_offset; }; }; __u32 priority; int skb_iif; __u32 hash; union { u32 vlan_all; struct { __be16 vlan_proto; __u16 vlan_tci; }; }; #if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS) union { unsigned int napi_id; unsigned int sender_cpu; }; #endif #ifdef CONFIG_NETWORK_SECMARK __u32 secmark; #endif union { __u32 mark; __u32 reserved_tailroom; }; union { __be16 inner_protocol; __u8 inner_ipproto; }; __u16 inner_transport_header; __u16 inner_network_header; __u16 inner_mac_header; __be16 protocol; __u16 transport_header; __u16 network_header; __u16 mac_header; #ifdef CONFIG_KCOV u64 kcov_handle; #endif ); /* end headers group */ /* These elements must be at the end, see alloc_skb() for details. 
*/ sk_buff_data_t tail; sk_buff_data_t end; unsigned char *head, *data; unsigned int truesize; refcount_t users; #ifdef CONFIG_SKB_EXTENSIONS /* only usable after checking ->active_extensions != 0 */ struct skb_ext *extensions; #endif }; /* if you move pkt_type around you also must adapt those constants */ #ifdef __BIG_ENDIAN_BITFIELD #define PKT_TYPE_MAX (7 << 5) #else #define PKT_TYPE_MAX 7 #endif #define PKT_TYPE_OFFSET offsetof(struct sk_buff, __pkt_type_offset) /* if you move tc_at_ingress or tstamp_type * around, you also must adapt these constants. */ #ifdef __BIG_ENDIAN_BITFIELD #define SKB_TSTAMP_TYPE_MASK (3 << 6) #define SKB_TSTAMP_TYPE_RSHIFT (6) #define TC_AT_INGRESS_MASK (1 << 5) #else #define SKB_TSTAMP_TYPE_MASK (3) #define TC_AT_INGRESS_MASK (1 << 2) #endif #define SKB_BF_MONO_TC_OFFSET offsetof(struct sk_buff, __mono_tc_offset) #ifdef __KERNEL__ /* * Handling routines are only of interest to the kernel */ #define SKB_ALLOC_FCLONE 0x01 #define SKB_ALLOC_RX 0x02 #define SKB_ALLOC_NAPI 0x04 /** * skb_pfmemalloc - Test if the skb was allocated from PFMEMALLOC reserves * @skb: buffer */ static inline bool skb_pfmemalloc(const struct sk_buff *skb) { return unlikely(skb->pfmemalloc); } /* * skb might have a dst pointer attached, refcounted or not. * _skb_refdst low order bit is set if refcount was _not_ taken */ #define SKB_DST_NOREF 1UL #define SKB_DST_PTRMASK ~(SKB_DST_NOREF) /** * skb_dst - returns skb dst_entry * @skb: buffer * * Returns: skb dst_entry, regardless of reference taken or not. */ static inline struct dst_entry *skb_dst(const struct sk_buff *skb) { /* If refdst was not refcounted, check we still are in a * rcu_read_lock section */ WARN_ON((skb->_skb_refdst & SKB_DST_NOREF) && !rcu_read_lock_held() && !rcu_read_lock_bh_held()); return (struct dst_entry *)(skb->_skb_refdst & SKB_DST_PTRMASK); } static inline void skb_dst_check_unset(struct sk_buff *skb) { DEBUG_NET_WARN_ON_ONCE((skb->_skb_refdst & SKB_DST_PTRMASK) && !(skb->_skb_refdst & SKB_DST_NOREF)); } /** * skb_dstref_steal() - return current dst_entry value and clear it * @skb: buffer * * Resets skb dst_entry without adjusting its reference count. Useful in * cases where dst_entry needs to be temporarily reset and restored. * Note that the returned value cannot be used directly because it * might contain SKB_DST_NOREF bit. * * When in doubt, prefer skb_dst_drop() over skb_dstref_steal() to correctly * handle dst_entry reference counting. * * Returns: original skb dst_entry. */ static inline unsigned long skb_dstref_steal(struct sk_buff *skb) { unsigned long refdst = skb->_skb_refdst; skb->_skb_refdst = 0; return refdst; } /** * skb_dstref_restore() - restore skb dst_entry removed via skb_dstref_steal() * @skb: buffer * @refdst: dst entry from a call to skb_dstref_steal() */ static inline void skb_dstref_restore(struct sk_buff *skb, unsigned long refdst) { skb_dst_check_unset(skb); skb->_skb_refdst = refdst; } /** * skb_dst_set - sets skb dst * @skb: buffer * @dst: dst entry * * Sets skb dst, assuming a reference was taken on dst and should * be released by skb_dst_drop() */ static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) { skb_dst_check_unset(skb); skb->slow_gro |= !!dst; skb->_skb_refdst = (unsigned long)dst; } /** * skb_dst_set_noref - sets skb dst, hopefully, without taking reference * @skb: buffer * @dst: dst entry * * Sets skb dst, assuming a reference was not taken on dst. * If dst entry is cached, we do not take reference and dst_release * will be avoided by refdst_drop. 
If dst entry is not cached, we take * reference, so that last dst_release can destroy the dst immediately. */ static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) { skb_dst_check_unset(skb); WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); skb->slow_gro |= !!dst; skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; } /** * skb_dst_is_noref - Test if skb dst isn't refcounted * @skb: buffer */ static inline bool skb_dst_is_noref(const struct sk_buff *skb) { return (skb->_skb_refdst & SKB_DST_NOREF) && skb_dst(skb); } /* For mangling skb->pkt_type from user space side from applications * such as nft, tc, etc, we only allow a conservative subset of * possible pkt_types to be set. */ static inline bool skb_pkt_type_ok(u32 ptype) { return ptype <= PACKET_OTHERHOST; } /** * skb_napi_id - Returns the skb's NAPI id * @skb: buffer */ static inline unsigned int skb_napi_id(const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL return skb->napi_id; #else return 0; #endif } static inline bool skb_wifi_acked_valid(const struct sk_buff *skb) { #ifdef CONFIG_WIRELESS return skb->wifi_acked_valid; #else return 0; #endif } /** * skb_unref - decrement the skb's reference count * @skb: buffer * * Returns: true if we can free the skb. */ static inline bool skb_unref(struct sk_buff *skb) { if (unlikely(!skb)) return false; if (!IS_ENABLED(CONFIG_DEBUG_NET) && likely(refcount_read(&skb->users) == 1)) smp_rmb(); else if (likely(!refcount_dec_and_test(&skb->users))) return false; return true; } static inline bool skb_data_unref(const struct sk_buff *skb, struct skb_shared_info *shinfo) { int bias; if (!skb->cloned) return true; bias = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1; if (atomic_read(&shinfo->dataref) == bias) smp_rmb(); else if (atomic_sub_return(bias, &shinfo->dataref)) return false; return true; } void __fix_address sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason); static inline void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) { sk_skb_reason_drop(NULL, skb, reason); } /** * kfree_skb - free an sk_buff with 'NOT_SPECIFIED' reason * @skb: buffer to free */ static inline void kfree_skb(struct sk_buff *skb) { kfree_skb_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED); } void skb_release_head_state(struct sk_buff *skb); void kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason); void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt); void skb_tx_error(struct sk_buff *skb); static inline void kfree_skb_list(struct sk_buff *segs) { kfree_skb_list_reason(segs, SKB_DROP_REASON_NOT_SPECIFIED); } #ifdef CONFIG_TRACEPOINTS void consume_skb(struct sk_buff *skb); #else static inline void consume_skb(struct sk_buff *skb) { return kfree_skb(skb); } #endif void __consume_stateless_skb(struct sk_buff *skb); void __kfree_skb(struct sk_buff *skb); void kfree_skb_partial(struct sk_buff *skb, bool head_stolen); bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, bool *fragstolen, int *delta_truesize); struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, int node); struct sk_buff *__build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb_around(struct sk_buff *skb, void *data, unsigned int frag_size); void skb_attempt_defer_free(struct sk_buff *skb); u32 napi_skb_cache_get_bulk(void **skbs, u32 n); struct sk_buff *napi_build_skb(void *data, unsigned int frag_size); struct 
sk_buff *slab_build_skb(void *data); /** * alloc_skb - allocate a network buffer * @size: size to allocate * @priority: allocation mask * * This function is a convenient wrapper around __alloc_skb(). */ static inline struct sk_buff *alloc_skb(unsigned int size, gfp_t priority) { return __alloc_skb(size, priority, 0, NUMA_NO_NODE); } struct sk_buff *alloc_skb_with_frags(unsigned long header_len, unsigned long data_len, int max_page_order, int *errcode, gfp_t gfp_mask); struct sk_buff *alloc_skb_for_msg(struct sk_buff *first); /* Layout of fast clones : [skb1][skb2][fclone_ref] */ struct sk_buff_fclones { struct sk_buff skb1; struct sk_buff skb2; refcount_t fclone_ref; }; /** * skb_fclone_busy - check if fclone is busy * @sk: socket * @skb: buffer * * Returns: true if skb is a fast clone, and its clone is not freed. * Some drivers call skb_orphan() in their ndo_start_xmit(), * so we also check that didn't happen. */ static inline bool skb_fclone_busy(const struct sock *sk, const struct sk_buff *skb) { const struct sk_buff_fclones *fclones; fclones = container_of(skb, struct sk_buff_fclones, skb1); return skb->fclone == SKB_FCLONE_ORIG && refcount_read(&fclones->fclone_ref) > 1 && READ_ONCE(fclones->skb2.sk) == sk; } /** * alloc_skb_fclone - allocate a network buffer from fclone cache * @size: size to allocate * @priority: allocation mask * * This function is a convenient wrapper around __alloc_skb(). */ static inline struct sk_buff *alloc_skb_fclone(unsigned int size, gfp_t priority) { return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE); } struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src); void skb_headers_offset_update(struct sk_buff *skb, int off); int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask); struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority); void skb_copy_header(struct sk_buff *new, const struct sk_buff *old); struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t priority); struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, gfp_t gfp_mask, bool fclone); static inline struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) { return __pskb_copy_fclone(skb, headroom, gfp_mask, false); } int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask); struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom); struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom); struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom, int newtailroom, gfp_t priority); int __must_check skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, int offset, int len); int __must_check skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len); int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer); int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error); /** * skb_pad - zero pad the tail of an skb * @skb: buffer to pad * @pad: space to pad * * Ensure that a buffer is followed by a padding area that is zero * filled. Used by network drivers which may DMA or transfer data * beyond the buffer end onto the wire. * * May return error in out of memory cases. The skb is freed on error. 
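 *
 * A minimal usage sketch, assuming a driver that must hand at least
 * ETH_ZLEN bytes to its hardware (error handling abbreviated)::
 *
 *	if (skb->len < ETH_ZLEN &&
 *	    skb_pad(skb, ETH_ZLEN - skb->len))
 *		return NETDEV_TX_OK;	// skb has already been freed
 *
 * Note that skb_pad() only guarantees zeroed bytes beyond the tail; when
 * skb->len itself must grow, skb_put_padto() builds on this and also
 * extends the data area.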
*/ static inline int skb_pad(struct sk_buff *skb, int pad) { return __skb_pad(skb, pad, true); } #define dev_kfree_skb(a) consume_skb(a) int skb_append_pagefrags(struct sk_buff *skb, struct page *page, int offset, size_t size, size_t max_frags); struct skb_seq_state { __u32 lower_offset; __u32 upper_offset; __u32 frag_idx; __u32 stepped_offset; struct sk_buff *root_skb; struct sk_buff *cur_skb; __u8 *frag_data; __u32 frag_off; }; void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, unsigned int to, struct skb_seq_state *st); unsigned int skb_seq_read(unsigned int consumed, const u8 **data, struct skb_seq_state *st); void skb_abort_seq_read(struct skb_seq_state *st); int skb_copy_seq_read(struct skb_seq_state *st, int offset, void *to, int len); unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, unsigned int to, struct ts_config *config); /* * Packet hash types specify the type of hash in skb_set_hash. * * Hash types refer to the protocol layer addresses which are used to * construct a packet's hash. The hashes are used to differentiate or identify * flows of the protocol layer for the hash type. Hash types are either * layer-2 (L2), layer-3 (L3), or layer-4 (L4). * * Properties of hashes: * * 1) Two packets in different flows have different hash values * 2) Two packets in the same flow should have the same hash value * * A hash at a higher layer is considered to be more specific. A driver should * set the most specific hash possible. * * A driver cannot indicate a more specific hash than the layer at which a hash * was computed. For instance an L3 hash cannot be set as an L4 hash. * * A driver may indicate a hash level which is less specific than the * actual layer the hash was computed on. For instance, a hash computed * at L4 may be considered an L3 hash. This should only be done if the * driver can't unambiguously determine that the HW computed the hash at * the higher layer. Note that the "should" in the second property above * permits this. 
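 *
 * As an illustration (the RX descriptor layout below is hypothetical),
 * a driver whose hardware hashed over the full 4-tuple would report:
 *
 *	skb_set_hash(skb, le32_to_cpu(rx_desc->rss_hash), PKT_HASH_TYPE_L4);
 *
 * while a hash known to cover only the IP addresses would be reported
 * with PKT_HASH_TYPE_L3.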
*/ enum pkt_hash_types { PKT_HASH_TYPE_NONE, /* Undefined type */ PKT_HASH_TYPE_L2, /* Input: src_MAC, dest_MAC */ PKT_HASH_TYPE_L3, /* Input: src_IP, dst_IP */ PKT_HASH_TYPE_L4, /* Input: src_IP, dst_IP, src_port, dst_port */ }; static inline void skb_clear_hash(struct sk_buff *skb) { skb->hash = 0; skb->sw_hash = 0; skb->l4_hash = 0; } static inline void skb_clear_hash_if_not_l4(struct sk_buff *skb) { if (!skb->l4_hash) skb_clear_hash(skb); } static inline void __skb_set_hash(struct sk_buff *skb, __u32 hash, bool is_sw, bool is_l4) { skb->l4_hash = is_l4; skb->sw_hash = is_sw; skb->hash = hash; } static inline void skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type) { /* Used by drivers to set hash from HW */ __skb_set_hash(skb, hash, false, type == PKT_HASH_TYPE_L4); } static inline void __skb_set_sw_hash(struct sk_buff *skb, __u32 hash, bool is_l4) { __skb_set_hash(skb, hash, true, is_l4); } u32 __skb_get_hash_symmetric_net(const struct net *net, const struct sk_buff *skb); static inline u32 __skb_get_hash_symmetric(const struct sk_buff *skb) { return __skb_get_hash_symmetric_net(NULL, skb); } void __skb_get_hash_net(const struct net *net, struct sk_buff *skb); u32 skb_get_poff(const struct sk_buff *skb); u32 __skb_get_poff(const struct sk_buff *skb, const void *data, const struct flow_keys_basic *keys, int hlen); __be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, const void *data, int hlen_proto); void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count); struct bpf_flow_dissector; u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, __be16 proto, int nhoff, int hlen, unsigned int flags); bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, __be16 proto, int nhoff, int hlen, unsigned int flags); static inline bool skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, unsigned int flags) { return __skb_flow_dissect(NULL, skb, flow_dissector, target_container, NULL, 0, 0, 0, flags); } static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb, struct flow_keys *flow, unsigned int flags) { memset(flow, 0, sizeof(*flow)); return __skb_flow_dissect(NULL, skb, &flow_keys_dissector, flow, NULL, 0, 0, 0, flags); } static inline bool skb_flow_dissect_flow_keys_basic(const struct net *net, const struct sk_buff *skb, struct flow_keys_basic *flow, const void *data, __be16 proto, int nhoff, int hlen, unsigned int flags) { memset(flow, 0, sizeof(*flow)); return __skb_flow_dissect(net, skb, &flow_keys_basic_dissector, flow, data, proto, nhoff, hlen, flags); } void skb_flow_dissect_meta(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container); /* Gets a skb connection tracking info, ctinfo map should be a * map of mapsize to translate enum ip_conntrack_info states * to user states. 
*/ void skb_flow_dissect_ct(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, u16 *ctinfo_map, size_t mapsize, bool post_ct, u16 zone); void skb_flow_dissect_tunnel_info(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container); void skb_flow_dissect_hash(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container); static inline __u32 skb_get_hash_net(const struct net *net, struct sk_buff *skb) { if (!skb->l4_hash && !skb->sw_hash) __skb_get_hash_net(net, skb); return skb->hash; } static inline __u32 skb_get_hash(struct sk_buff *skb) { if (!skb->l4_hash && !skb->sw_hash) __skb_get_hash_net(NULL, skb); return skb->hash; } static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6) { if (!skb->l4_hash && !skb->sw_hash) { struct flow_keys keys; __u32 hash = __get_hash_from_flowi6(fl6, &keys); __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); } return skb->hash; } __u32 skb_get_hash_perturb(const struct sk_buff *skb, const siphash_key_t *perturb); static inline __u32 skb_get_hash_raw(const struct sk_buff *skb) { return skb->hash; } static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from) { to->hash = from->hash; to->sw_hash = from->sw_hash; to->l4_hash = from->l4_hash; }; static inline int skb_cmp_decrypted(const struct sk_buff *skb1, const struct sk_buff *skb2) { #ifdef CONFIG_SKB_DECRYPTED return skb2->decrypted - skb1->decrypted; #else return 0; #endif } static inline bool skb_is_decrypted(const struct sk_buff *skb) { #ifdef CONFIG_SKB_DECRYPTED return skb->decrypted; #else return false; #endif } static inline void skb_copy_decrypted(struct sk_buff *to, const struct sk_buff *from) { #ifdef CONFIG_SKB_DECRYPTED to->decrypted = from->decrypted; #endif } #ifdef NET_SKBUFF_DATA_USES_OFFSET static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) { return skb->head + skb->end; } static inline unsigned int skb_end_offset(const struct sk_buff *skb) { return skb->end; } static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) { skb->end = offset; } #else static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) { return skb->end; } static inline unsigned int skb_end_offset(const struct sk_buff *skb) { return skb->end - skb->head; } static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) { skb->end = skb->head + offset; } #endif extern const struct ubuf_info_ops msg_zerocopy_ubuf_ops; struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, struct ubuf_info *uarg, bool devmem); void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); struct net_devmem_dmabuf_binding; int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, size_t length, struct net_devmem_dmabuf_binding *binding); int zerocopy_fill_skb_from_iter(struct sk_buff *skb, struct iov_iter *from, size_t length); static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) { return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len, NULL); } int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg, struct net_devmem_dmabuf_binding *binding); /* Internal */ #define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB))) static inline struct skb_shared_hwtstamps *skb_hwtstamps(struct sk_buff *skb) { return 
&skb_shinfo(skb)->hwtstamps; } static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb) { bool is_zcopy = skb && skb_shinfo(skb)->flags & SKBFL_ZEROCOPY_ENABLE; return is_zcopy ? skb_uarg(skb) : NULL; } static inline bool skb_zcopy_pure(const struct sk_buff *skb) { return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY; } static inline bool skb_zcopy_managed(const struct sk_buff *skb) { return skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAG_REFS; } static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1, const struct sk_buff *skb2) { return skb_zcopy_pure(skb1) == skb_zcopy_pure(skb2); } static inline void net_zcopy_get(struct ubuf_info *uarg) { refcount_inc(&uarg->refcnt); } static inline void skb_zcopy_init(struct sk_buff *skb, struct ubuf_info *uarg) { skb_shinfo(skb)->destructor_arg = uarg; skb_shinfo(skb)->flags |= uarg->flags; } static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg, bool *have_ref) { if (skb && uarg && !skb_zcopy(skb)) { if (unlikely(have_ref && *have_ref)) *have_ref = false; else net_zcopy_get(uarg); skb_zcopy_init(skb, uarg); } } static inline void skb_zcopy_set_nouarg(struct sk_buff *skb, void *val) { skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t) val | 0x1UL); skb_shinfo(skb)->flags |= SKBFL_ZEROCOPY_FRAG; } static inline bool skb_zcopy_is_nouarg(struct sk_buff *skb) { return (uintptr_t) skb_shinfo(skb)->destructor_arg & 0x1UL; } static inline void *skb_zcopy_get_nouarg(struct sk_buff *skb) { return (void *)((uintptr_t) skb_shinfo(skb)->destructor_arg & ~0x1UL); } static inline void net_zcopy_put(struct ubuf_info *uarg) { if (uarg) uarg->ops->complete(NULL, uarg, true); } static inline void net_zcopy_put_abort(struct ubuf_info *uarg, bool have_uref) { if (uarg) { if (uarg->ops == &msg_zerocopy_ubuf_ops) msg_zerocopy_put_abort(uarg, have_uref); else if (have_uref) net_zcopy_put(uarg); } } /* Release a reference on a zerocopy structure */ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success) { struct ubuf_info *uarg = skb_zcopy(skb); if (uarg) { if (!skb_zcopy_is_nouarg(skb)) uarg->ops->complete(skb, uarg, zerocopy_success); skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY; } } void __skb_zcopy_downgrade_managed(struct sk_buff *skb); static inline void skb_zcopy_downgrade_managed(struct sk_buff *skb) { if (unlikely(skb_zcopy_managed(skb))) __skb_zcopy_downgrade_managed(skb); } /* Return true if frags in this skb are readable by the host. */ static inline bool skb_frags_readable(const struct sk_buff *skb) { return !skb->unreadable; } static inline void skb_mark_not_on_list(struct sk_buff *skb) { skb->next = NULL; } static inline void skb_poison_list(struct sk_buff *skb) { #ifdef CONFIG_DEBUG_NET skb->next = SKB_LIST_POISON_NEXT; #endif } /* Iterate through singly-linked GSO fragments of an skb. */ #define skb_list_walk_safe(first, skb, next_skb) \ for ((skb) = (first), (next_skb) = (skb) ? (skb)->next : NULL; (skb); \ (skb) = (next_skb), (next_skb) = (skb) ? (skb)->next : NULL) static inline void skb_list_del_init(struct sk_buff *skb) { __list_del_entry(&skb->list); skb_mark_not_on_list(skb); } /** * skb_queue_empty - check if a queue is empty * @list: queue head * * Returns true if the queue is empty, false otherwise. */ static inline int skb_queue_empty(const struct sk_buff_head *list) { return list->next == (const struct sk_buff *) list; } /** * skb_queue_empty_lockless - check if a queue is empty * @list: queue head * * Returns true if the queue is empty, false otherwise. 
* This variant can be used in lockless contexts. */ static inline bool skb_queue_empty_lockless(const struct sk_buff_head *list) { return READ_ONCE(list->next) == (const struct sk_buff *) list; } /** * skb_queue_is_last - check if skb is the last entry in the queue * @list: queue head * @skb: buffer * * Returns true if @skb is the last buffer on the list. */ static inline bool skb_queue_is_last(const struct sk_buff_head *list, const struct sk_buff *skb) { return skb->next == (const struct sk_buff *) list; } /** * skb_queue_is_first - check if skb is the first entry in the queue * @list: queue head * @skb: buffer * * Returns true if @skb is the first buffer on the list. */ static inline bool skb_queue_is_first(const struct sk_buff_head *list, const struct sk_buff *skb) { return skb->prev == (const struct sk_buff *) list; } /** * skb_queue_next - return the next packet in the queue * @list: queue head * @skb: current buffer * * Return the next packet in @list after @skb. It is only valid to * call this if skb_queue_is_last() evaluates to false. */ static inline struct sk_buff *skb_queue_next(const struct sk_buff_head *list, const struct sk_buff *skb) { /* This BUG_ON may seem severe, but if we just return then we * are going to dereference garbage. */ BUG_ON(skb_queue_is_last(list, skb)); return skb->next; } /** * skb_queue_prev - return the prev packet in the queue * @list: queue head * @skb: current buffer * * Return the prev packet in @list before @skb. It is only valid to * call this if skb_queue_is_first() evaluates to false. */ static inline struct sk_buff *skb_queue_prev(const struct sk_buff_head *list, const struct sk_buff *skb) { /* This BUG_ON may seem severe, but if we just return then we * are going to dereference garbage. */ BUG_ON(skb_queue_is_first(list, skb)); return skb->prev; } /** * skb_get - reference buffer * @skb: buffer to reference * * Makes another reference to a socket buffer and returns a pointer * to the buffer. */ static inline struct sk_buff *skb_get(struct sk_buff *skb) { refcount_inc(&skb->users); return skb; } /* * If users == 1, we are the only owner and can avoid redundant atomic changes. */ /** * skb_cloned - is the buffer a clone * @skb: buffer to check * * Returns true if the buffer was generated with skb_clone() and is * one of multiple shared copies of the buffer. Cloned buffers are * shared data so must not be written to under normal circumstances. */ static inline int skb_cloned(const struct sk_buff *skb) { return skb->cloned && (atomic_read(&skb_shinfo(skb)->dataref) & SKB_DATAREF_MASK) != 1; } static inline int skb_unclone(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_cloned(skb)) return pskb_expand_head(skb, 0, 0, pri); return 0; } /* This variant of skb_unclone() makes sure skb->truesize * and skb_end_offset() are not changed, whenever a new skb->head is needed. * * Indeed there is no guarantee that ksize(kmalloc(X)) == ksize(kmalloc(X)) * when various debugging features are in place. */ int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri); static inline int skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_cloned(skb)) return __skb_unclone_keeptruesize(skb, pri); return 0; } /** * skb_header_cloned - is the header a clone * @skb: buffer to check * * Returns true if modifying the header part of the buffer requires * the data to be copied. 
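 *
 * A sketch of the usual caller pattern before writing into the headers::
 *
 *	if (skb_header_cloned(skb) &&
 *	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
 *		goto drop;
 *
 * which is essentially what skb_header_unclone() wraps.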
*/ static inline int skb_header_cloned(const struct sk_buff *skb) { int dataref; if (!skb->cloned) return 0; dataref = atomic_read(&skb_shinfo(skb)->dataref); dataref = (dataref & SKB_DATAREF_MASK) - (dataref >> SKB_DATAREF_SHIFT); return dataref != 1; } static inline int skb_header_unclone(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_header_cloned(skb)) return pskb_expand_head(skb, 0, 0, pri); return 0; } /** * __skb_header_release() - allow clones to use the headroom * @skb: buffer to operate on * * See "DOC: dataref and headerless skbs". */ static inline void __skb_header_release(struct sk_buff *skb) { skb->nohdr = 1; atomic_set(&skb_shinfo(skb)->dataref, 1 + (1 << SKB_DATAREF_SHIFT)); } /** * skb_shared - is the buffer shared * @skb: buffer to check * * Returns true if more than one person has a reference to this * buffer. */ static inline int skb_shared(const struct sk_buff *skb) { return refcount_read(&skb->users) != 1; } /** * skb_share_check - check if buffer is shared and if so clone it * @skb: buffer to check * @pri: priority for memory allocation * * If the buffer is shared the buffer is cloned and the old copy * drops a reference. A new clone with a single reference is returned. * If the buffer is not shared the original buffer is returned. When * being called from interrupt status or with spinlocks held pri must * be GFP_ATOMIC. * * NULL is returned on a memory allocation failure. */ static inline struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_shared(skb)) { struct sk_buff *nskb = skb_clone(skb, pri); if (likely(nskb)) consume_skb(skb); else kfree_skb(skb); skb = nskb; } return skb; } /* * Copy shared buffers into a new sk_buff. We effectively do COW on * packets to handle cases where we have a local reader and forward * and a couple of other messy ones. The normal one is tcpdumping * a packet that's being forwarded. */ /** * skb_unshare - make a copy of a shared buffer * @skb: buffer to check * @pri: priority for memory allocation * * If the socket buffer is a clone then this function creates a new * copy of the data, drops a reference count on the old copy and returns * the new copy with the reference count at 1. If the buffer is not a clone * the original buffer is returned. When called with a spinlock held or * from interrupt state @pri must be %GFP_ATOMIC * * %NULL is returned on a memory allocation failure. */ static inline struct sk_buff *skb_unshare(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_cloned(skb)) { struct sk_buff *nskb = skb_copy(skb, pri); /* Free our shared copy */ if (likely(nskb)) consume_skb(skb); else kfree_skb(skb); skb = nskb; } return skb; } /** * skb_peek - peek at the head of an &sk_buff_head * @list_: list to peek at * * Peek an &sk_buff. Unlike most other operations you _MUST_ * be careful with this one. A peek leaves the buffer on the * list and someone else may run off with it. You must hold * the appropriate locks or have a private queue to do this. * * Returns %NULL for an empty list or a pointer to the head element. * The reference count is not incremented and the reference is therefore * volatile. Use with caution. 
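 *
 * A minimal sketch of the intended locking pattern (variables are
 * illustrative)::
 *
 *	spin_lock_irqsave(&list->lock, flags);
 *	skb = skb_peek(list);
 *	if (skb)
 *		len = skb->len;		// inspect only, do not unlink or free
 *	spin_unlock_irqrestore(&list->lock, flags);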
*/ static inline struct sk_buff *skb_peek(const struct sk_buff_head *list_) { struct sk_buff *skb = list_->next; if (skb == (struct sk_buff *)list_) skb = NULL; return skb; } /** * __skb_peek - peek at the head of a non-empty &sk_buff_head * @list_: list to peek at * * Like skb_peek(), but the caller knows that the list is not empty. */ static inline struct sk_buff *__skb_peek(const struct sk_buff_head *list_) { return list_->next; } /** * skb_peek_next - peek skb following the given one from a queue * @skb: skb to start from * @list_: list to peek at * * Returns %NULL when the end of the list is met or a pointer to the * next element. The reference count is not incremented and the * reference is therefore volatile. Use with caution. */ static inline struct sk_buff *skb_peek_next(struct sk_buff *skb, const struct sk_buff_head *list_) { struct sk_buff *next = skb->next; if (next == (struct sk_buff *)list_) next = NULL; return next; } /** * skb_peek_tail - peek at the tail of an &sk_buff_head * @list_: list to peek at * * Peek an &sk_buff. Unlike most other operations you _MUST_ * be careful with this one. A peek leaves the buffer on the * list and someone else may run off with it. You must hold * the appropriate locks or have a private queue to do this. * * Returns %NULL for an empty list or a pointer to the tail element. * The reference count is not incremented and the reference is therefore * volatile. Use with caution. */ static inline struct sk_buff *skb_peek_tail(const struct sk_buff_head *list_) { struct sk_buff *skb = READ_ONCE(list_->prev); if (skb == (struct sk_buff *)list_) skb = NULL; return skb; } /** * skb_queue_len - get queue length * @list_: list to measure * * Return the length of an &sk_buff queue. */ static inline __u32 skb_queue_len(const struct sk_buff_head *list_) { return list_->qlen; } /** * skb_queue_len_lockless - get queue length * @list_: list to measure * * Return the length of an &sk_buff queue. * This variant can be used in lockless contexts. */ static inline __u32 skb_queue_len_lockless(const struct sk_buff_head *list_) { return READ_ONCE(list_->qlen); } /** * __skb_queue_head_init - initialize non-spinlock portions of sk_buff_head * @list: queue to initialize * * This initializes only the list and queue length aspects of * an sk_buff_head object. This allows to initialize the list * aspects of an sk_buff_head without reinitializing things like * the spinlock. It can also be used for on-stack sk_buff_head * objects where the spinlock is known to not be used. */ static inline void __skb_queue_head_init(struct sk_buff_head *list) { list->prev = list->next = (struct sk_buff *)list; list->qlen = 0; } /* * This function creates a split out lock class for each invocation; * this is needed for now since a whole lot of users of the skb-queue * infrastructure in drivers have different locking usage (in hardirq) * than the networking core (in softirq only). In the long run either the * network layer or drivers should need annotation to consolidate the * main types of usage into 3 classes. */ static inline void skb_queue_head_init(struct sk_buff_head *list) { spin_lock_init(&list->lock); __skb_queue_head_init(list); } static inline void skb_queue_head_init_class(struct sk_buff_head *list, struct lock_class_key *class) { skb_queue_head_init(list); lockdep_set_class(&list->lock, class); } /* * Insert an sk_buff on a list. * * The "__skb_xxxx()" functions are the non-atomic ones that * can only be called with interrupts disabled. 
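 *
 * As a sketch: code that already holds the queue lock, or that works on a
 * strictly private (e.g. on-stack) queue, may use the __ variants
 * directly; everything else goes through the locking wrappers:
 *
 *	__skb_queue_head_init(&tmp);			// private queue, no locking
 *	__skb_queue_tail(&tmp, skb);
 *
 *	skb_queue_tail(&sk->sk_receive_queue, skb);	// takes the queue lock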
*/ static inline void __skb_insert(struct sk_buff *newsk, struct sk_buff *prev, struct sk_buff *next, struct sk_buff_head *list) { /* See skb_queue_empty_lockless() and skb_peek_tail() * for the opposite READ_ONCE() */ WRITE_ONCE(newsk->next, next); WRITE_ONCE(newsk->prev, prev); WRITE_ONCE(((struct sk_buff_list *)next)->prev, newsk); WRITE_ONCE(((struct sk_buff_list *)prev)->next, newsk); WRITE_ONCE(list->qlen, list->qlen + 1); } static inline void __skb_queue_splice(const struct sk_buff_head *list, struct sk_buff *prev, struct sk_buff *next) { struct sk_buff *first = list->next; struct sk_buff *last = list->prev; WRITE_ONCE(first->prev, prev); WRITE_ONCE(prev->next, first); WRITE_ONCE(last->next, next); WRITE_ONCE(next->prev, last); } /** * skb_queue_splice - join two skb lists, this is designed for stacks * @list: the new list to add * @head: the place to add it in the first list */ static inline void skb_queue_splice(const struct sk_buff_head *list, struct sk_buff_head *head) { if (!skb_queue_empty(list)) { __skb_queue_splice(list, (struct sk_buff *) head, head->next); head->qlen += list->qlen; } } /** * skb_queue_splice_init - join two skb lists and reinitialise the emptied list * @list: the new list to add * @head: the place to add it in the first list * * The list at @list is reinitialised */ static inline void skb_queue_splice_init(struct sk_buff_head *list, struct sk_buff_head *head) { if (!skb_queue_empty(list)) { __skb_queue_splice(list, (struct sk_buff *) head, head->next); head->qlen += list->qlen; __skb_queue_head_init(list); } } /** * skb_queue_splice_tail - join two skb lists, each list being a queue * @list: the new list to add * @head: the place to add it in the first list */ static inline void skb_queue_splice_tail(const struct sk_buff_head *list, struct sk_buff_head *head) { if (!skb_queue_empty(list)) { __skb_queue_splice(list, head->prev, (struct sk_buff *) head); head->qlen += list->qlen; } } /** * skb_queue_splice_tail_init - join two skb lists and reinitialise the emptied list * @list: the new list to add * @head: the place to add it in the first list * * Each of the lists is a queue. * The list at @list is reinitialised */ static inline void skb_queue_splice_tail_init(struct sk_buff_head *list, struct sk_buff_head *head) { if (!skb_queue_empty(list)) { __skb_queue_splice(list, head->prev, (struct sk_buff *) head); head->qlen += list->qlen; __skb_queue_head_init(list); } } /** * __skb_queue_after - queue a buffer at the list head * @list: list to use * @prev: place after this buffer * @newsk: buffer to queue * * Queue a buffer int the middle of a list. This function takes no locks * and you must therefore hold required locks before calling it. * * A buffer cannot be placed on two lists at the same time. */ static inline void __skb_queue_after(struct sk_buff_head *list, struct sk_buff *prev, struct sk_buff *newsk) { __skb_insert(newsk, prev, ((struct sk_buff_list *)prev)->next, list); } void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list); static inline void __skb_queue_before(struct sk_buff_head *list, struct sk_buff *next, struct sk_buff *newsk) { __skb_insert(newsk, ((struct sk_buff_list *)next)->prev, next, list); } /** * __skb_queue_head - queue a buffer at the list head * @list: list to use * @newsk: buffer to queue * * Queue a buffer at the start of a list. This function takes no locks * and you must therefore hold required locks before calling it. * * A buffer cannot be placed on two lists at the same time. 
*/ static inline void __skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) { __skb_queue_after(list, (struct sk_buff *)list, newsk); } void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk); /** * __skb_queue_tail - queue a buffer at the list tail * @list: list to use * @newsk: buffer to queue * * Queue a buffer at the end of a list. This function takes no locks * and you must therefore hold required locks before calling it. * * A buffer cannot be placed on two lists at the same time. */ static inline void __skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) { __skb_queue_before(list, (struct sk_buff *)list, newsk); } void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk); /* * remove sk_buff from list. _Must_ be called atomically, and with * the list known.. */ void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list); static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) { struct sk_buff *next, *prev; WRITE_ONCE(list->qlen, list->qlen - 1); next = skb->next; prev = skb->prev; skb->next = skb->prev = NULL; WRITE_ONCE(next->prev, prev); WRITE_ONCE(prev->next, next); } /** * __skb_dequeue - remove from the head of the queue * @list: list to dequeue from * * Remove the head of the list. This function does not take any locks * so must be used with appropriate locks held only. The head item is * returned or %NULL if the list is empty. */ static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list) { struct sk_buff *skb = skb_peek(list); if (skb) __skb_unlink(skb, list); return skb; } struct sk_buff *skb_dequeue(struct sk_buff_head *list); /** * __skb_dequeue_tail - remove from the tail of the queue * @list: list to dequeue from * * Remove the tail of the list. This function does not take any locks * so must be used with appropriate locks held only. The tail item is * returned or %NULL if the list is empty. 
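 *
 * A common drain pattern (sketch), with the queue lock held or on a
 * private queue::
 *
 *	while ((skb = __skb_dequeue_tail(list)) != NULL)
 *		kfree_skb(skb);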
*/ static inline struct sk_buff *__skb_dequeue_tail(struct sk_buff_head *list) { struct sk_buff *skb = skb_peek_tail(list); if (skb) __skb_unlink(skb, list); return skb; } struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list); static inline bool skb_is_nonlinear(const struct sk_buff *skb) { return skb->data_len; } static inline unsigned int skb_headlen(const struct sk_buff *skb) { return skb->len - skb->data_len; } static inline unsigned int __skb_pagelen(const struct sk_buff *skb) { unsigned int i, len = 0; for (i = skb_shinfo(skb)->nr_frags - 1; (int)i >= 0; i--) len += skb_frag_size(&skb_shinfo(skb)->frags[i]); return len; } static inline unsigned int skb_pagelen(const struct sk_buff *skb) { return skb_headlen(skb) + __skb_pagelen(skb); } static inline void skb_frag_fill_netmem_desc(skb_frag_t *frag, netmem_ref netmem, int off, int size) { frag->netmem = netmem; frag->offset = off; skb_frag_size_set(frag, size); } static inline void skb_frag_fill_page_desc(skb_frag_t *frag, struct page *page, int off, int size) { skb_frag_fill_netmem_desc(frag, page_to_netmem(page), off, size); } static inline void __skb_fill_netmem_desc_noacc(struct skb_shared_info *shinfo, int i, netmem_ref netmem, int off, int size) { skb_frag_t *frag = &shinfo->frags[i]; skb_frag_fill_netmem_desc(frag, netmem, off, size); } static inline void __skb_fill_page_desc_noacc(struct skb_shared_info *shinfo, int i, struct page *page, int off, int size) { __skb_fill_netmem_desc_noacc(shinfo, i, page_to_netmem(page), off, size); } /** * skb_len_add - adds a number to len fields of skb * @skb: buffer to add len to * @delta: number of bytes to add */ static inline void skb_len_add(struct sk_buff *skb, int delta) { skb->len += delta; skb->data_len += delta; skb->truesize += delta; } /** * __skb_fill_netmem_desc - initialise a fragment in an skb * @skb: buffer containing fragment to be initialised * @i: fragment index to initialise * @netmem: the netmem to use for this fragment * @off: the offset to the data with @page * @size: the length of the data * * Initialises the @i'th fragment of @skb to point to &size bytes at * offset @off within @page. * * Does not take any additional reference on the fragment. */ static __always_inline void __skb_fill_netmem_desc(struct sk_buff *skb, int i, netmem_ref netmem, int off, int size) { struct page *page; __skb_fill_netmem_desc_noacc(skb_shinfo(skb), i, netmem, off, size); if (netmem_is_net_iov(netmem)) { skb->unreadable = true; return; } page = netmem_to_page(netmem); /* Propagate page pfmemalloc to the skb if we can. The problem is * that not all callers have unique ownership of the page but rely * on page_is_pfmemalloc doing the right thing(tm). 
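 *
 * pfmemalloc is recorded on the head page of a compound allocation,
 * which is why compound_head() is taken below before the check.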
*/ page = compound_head(page); if (page_is_pfmemalloc(page)) skb->pfmemalloc = true; } static __always_inline void __skb_fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size) { __skb_fill_netmem_desc(skb, i, page_to_netmem(page), off, size); } static __always_inline void skb_fill_netmem_desc(struct sk_buff *skb, int i, netmem_ref netmem, int off, int size) { __skb_fill_netmem_desc(skb, i, netmem, off, size); skb_shinfo(skb)->nr_frags = i + 1; } /** * skb_fill_page_desc - initialise a paged fragment in an skb * @skb: buffer containing fragment to be initialised * @i: paged fragment index to initialise * @page: the page to use for this fragment * @off: the offset to the data with @page * @size: the length of the data * * As per __skb_fill_page_desc() -- initialises the @i'th fragment of * @skb to point to @size bytes at offset @off within @page. In * addition updates @skb such that @i is the last fragment. * * Does not take any additional reference on the fragment. */ static __always_inline void skb_fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size) { skb_fill_netmem_desc(skb, i, page_to_netmem(page), off, size); } /** * skb_fill_page_desc_noacc - initialise a paged fragment in an skb * @skb: buffer containing fragment to be initialised * @i: paged fragment index to initialise * @page: the page to use for this fragment * @off: the offset to the data with @page * @size: the length of the data * * Variant of skb_fill_page_desc() which does not deal with * pfmemalloc, if page is not owned by us. */ static inline void skb_fill_page_desc_noacc(struct sk_buff *skb, int i, struct page *page, int off, int size) { struct skb_shared_info *shinfo = skb_shinfo(skb); __skb_fill_page_desc_noacc(shinfo, i, page, off, size); shinfo->nr_frags = i + 1; } static inline void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem, int off, int size, unsigned int truesize) { DEBUG_NET_WARN_ON_ONCE(size > truesize); skb_fill_netmem_desc(skb, i, netmem, off, size); skb->len += size; skb->data_len += size; skb->truesize += truesize; } static inline void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, int size, unsigned int truesize) { skb_add_rx_frag_netmem(skb, i, page_to_netmem(page), off, size, truesize); } void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, unsigned int truesize); #define SKB_LINEAR_ASSERT(skb) BUG_ON(skb_is_nonlinear(skb)) #ifdef NET_SKBUFF_DATA_USES_OFFSET static inline unsigned char *skb_tail_pointer(const struct sk_buff *skb) { return skb->head + skb->tail; } static inline void skb_reset_tail_pointer(struct sk_buff *skb) { skb->tail = skb->data - skb->head; } static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset) { skb_reset_tail_pointer(skb); skb->tail += offset; } #else /* NET_SKBUFF_DATA_USES_OFFSET */ static inline unsigned char *skb_tail_pointer(const struct sk_buff *skb) { return skb->tail; } static inline void skb_reset_tail_pointer(struct sk_buff *skb) { skb->tail = skb->data; } static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset) { skb->tail = skb->data + offset; } #endif /* NET_SKBUFF_DATA_USES_OFFSET */ static inline void skb_assert_len(struct sk_buff *skb) { #ifdef CONFIG_DEBUG_NET if (WARN_ONCE(!skb->len, "%s\n", __func__)) DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); #endif /* CONFIG_DEBUG_NET */ } #if defined(CONFIG_FAIL_SKB_REALLOC) void skb_might_realloc(struct sk_buff *skb); #else static inline void 
skb_might_realloc(struct sk_buff *skb) {} #endif /* * Add data to an sk_buff */ void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len); void *skb_put(struct sk_buff *skb, unsigned int len); static inline void *__skb_put(struct sk_buff *skb, unsigned int len) { void *tmp = skb_tail_pointer(skb); SKB_LINEAR_ASSERT(skb); skb->tail += len; skb->len += len; return tmp; } static inline void *__skb_put_zero(struct sk_buff *skb, unsigned int len) { void *tmp = __skb_put(skb, len); memset(tmp, 0, len); return tmp; } static inline void *__skb_put_data(struct sk_buff *skb, const void *data, unsigned int len) { void *tmp = __skb_put(skb, len); memcpy(tmp, data, len); return tmp; } static inline void __skb_put_u8(struct sk_buff *skb, u8 val) { *(u8 *)__skb_put(skb, 1) = val; } static inline void *skb_put_zero(struct sk_buff *skb, unsigned int len) { void *tmp = skb_put(skb, len); memset(tmp, 0, len); return tmp; } static inline void *skb_put_data(struct sk_buff *skb, const void *data, unsigned int len) { void *tmp = skb_put(skb, len); memcpy(tmp, data, len); return tmp; } static inline void skb_put_u8(struct sk_buff *skb, u8 val) { *(u8 *)skb_put(skb, 1) = val; } void *skb_push(struct sk_buff *skb, unsigned int len); static inline void *__skb_push(struct sk_buff *skb, unsigned int len) { DEBUG_NET_WARN_ON_ONCE(len > INT_MAX); skb->data -= len; DEBUG_NET_WARN_ON_ONCE(skb->data < skb->head); skb->len += len; return skb->data; } void *skb_pull(struct sk_buff *skb, unsigned int len); static __always_inline void *__skb_pull(struct sk_buff *skb, unsigned int len) { DEBUG_NET_WARN_ON_ONCE(len > INT_MAX); skb->len -= len; if (unlikely(skb->len < skb->data_len)) { #if defined(CONFIG_DEBUG_NET) skb->len += len; pr_err("__skb_pull(len=%u)\n", len); skb_dump(KERN_ERR, skb, false); #endif BUG(); } return skb->data += len; } static inline void *skb_pull_inline(struct sk_buff *skb, unsigned int len) { return unlikely(len > skb->len) ? NULL : __skb_pull(skb, len); } void *skb_pull_data(struct sk_buff *skb, size_t len); void *__pskb_pull_tail(struct sk_buff *skb, int delta); static __always_inline enum skb_drop_reason pskb_may_pull_reason(struct sk_buff *skb, unsigned int len) { DEBUG_NET_WARN_ON_ONCE(len > INT_MAX); skb_might_realloc(skb); if (likely(len <= skb_headlen(skb))) return SKB_NOT_DROPPED_YET; if (unlikely(len > skb->len)) return SKB_DROP_REASON_PKT_TOO_SMALL; if (unlikely(!__pskb_pull_tail(skb, len - skb_headlen(skb)))) return SKB_DROP_REASON_NOMEM; return SKB_NOT_DROPPED_YET; } static __always_inline bool pskb_may_pull(struct sk_buff *skb, unsigned int len) { return pskb_may_pull_reason(skb, len) == SKB_NOT_DROPPED_YET; } static __always_inline void *pskb_pull(struct sk_buff *skb, unsigned int len) { if (!pskb_may_pull(skb, len)) return NULL; skb->len -= len; return skb->data += len; } void skb_condense(struct sk_buff *skb); /** * skb_headroom - bytes at buffer head * @skb: buffer to check * * Return the number of bytes of free space at the head of an &sk_buff. */ static inline unsigned int skb_headroom(const struct sk_buff *skb) { return skb->data - skb->head; } /** * skb_tailroom - bytes at buffer end * @skb: buffer to check * * Return the number of bytes of free space at the tail of an sk_buff */ static inline int skb_tailroom(const struct sk_buff *skb) { return skb_is_nonlinear(skb) ? 
0 : skb->end - skb->tail; } /** * skb_availroom - bytes at buffer end * @skb: buffer to check * * Return the number of bytes of free space at the tail of an sk_buff * allocated by sk_stream_alloc() */ static inline int skb_availroom(const struct sk_buff *skb) { if (skb_is_nonlinear(skb)) return 0; return skb->end - skb->tail - skb->reserved_tailroom; } /** * skb_reserve - adjust headroom * @skb: buffer to alter * @len: bytes to move * * Increase the headroom of an empty &sk_buff by reducing the tail * room. This is only allowed for an empty buffer. */ static inline void skb_reserve(struct sk_buff *skb, int len) { skb->data += len; skb->tail += len; } /** * skb_tailroom_reserve - adjust reserved_tailroom * @skb: buffer to alter * @mtu: maximum amount of headlen permitted * @needed_tailroom: minimum amount of reserved_tailroom * * Set reserved_tailroom so that headlen can be as large as possible but * not larger than mtu and tailroom cannot be smaller than * needed_tailroom. * The required headroom should already have been reserved before using * this function. */ static inline void skb_tailroom_reserve(struct sk_buff *skb, unsigned int mtu, unsigned int needed_tailroom) { SKB_LINEAR_ASSERT(skb); if (mtu < skb_tailroom(skb) - needed_tailroom) /* use at most mtu */ skb->reserved_tailroom = skb_tailroom(skb) - mtu; else /* use up to all available space */ skb->reserved_tailroom = needed_tailroom; } #define ENCAP_TYPE_ETHER 0 #define ENCAP_TYPE_IPPROTO 1 static inline void skb_set_inner_protocol(struct sk_buff *skb, __be16 protocol) { skb->inner_protocol = protocol; skb->inner_protocol_type = ENCAP_TYPE_ETHER; } static inline void skb_set_inner_ipproto(struct sk_buff *skb, __u8 ipproto) { skb->inner_ipproto = ipproto; skb->inner_protocol_type = ENCAP_TYPE_IPPROTO; } static inline void skb_reset_inner_headers(struct sk_buff *skb) { skb->inner_mac_header = skb->mac_header; skb->inner_network_header = skb->network_header; skb->inner_transport_header = skb->transport_header; } static inline int skb_mac_header_was_set(const struct sk_buff *skb) { return skb->mac_header != (typeof(skb->mac_header))~0U; } static inline void skb_reset_mac_len(struct sk_buff *skb) { if (!skb_mac_header_was_set(skb)) { DEBUG_NET_WARN_ON_ONCE(1); skb->mac_len = 0; } else { skb->mac_len = skb->network_header - skb->mac_header; } } static inline unsigned char *skb_inner_transport_header(const struct sk_buff *skb) { return skb->head + skb->inner_transport_header; } static inline int skb_inner_transport_offset(const struct sk_buff *skb) { return skb_inner_transport_header(skb) - skb->data; } static inline void skb_reset_inner_transport_header(struct sk_buff *skb) { long offset = skb->data - skb->head; DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->inner_transport_header))offset); skb->inner_transport_header = offset; } static inline void skb_set_inner_transport_header(struct sk_buff *skb, const int offset) { skb_reset_inner_transport_header(skb); skb->inner_transport_header += offset; } static inline unsigned char *skb_inner_network_header(const struct sk_buff *skb) { return skb->head + skb->inner_network_header; } static inline void skb_reset_inner_network_header(struct sk_buff *skb) { long offset = skb->data - skb->head; DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->inner_network_header))offset); skb->inner_network_header = offset; } static inline void skb_set_inner_network_header(struct sk_buff *skb, const int offset) { skb_reset_inner_network_header(skb); skb->inner_network_header += offset; } static inline bool 
skb_inner_network_header_was_set(const struct sk_buff *skb) { return skb->inner_network_header > 0; } static inline unsigned char *skb_inner_mac_header(const struct sk_buff *skb) { return skb->head + skb->inner_mac_header; } static inline void skb_reset_inner_mac_header(struct sk_buff *skb) { long offset = skb->data - skb->head; DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->inner_mac_header))offset); skb->inner_mac_header = offset; } static inline void skb_set_inner_mac_header(struct sk_buff *skb, const int offset) { skb_reset_inner_mac_header(skb); skb->inner_mac_header += offset; } static inline bool skb_transport_header_was_set(const struct sk_buff *skb) { return skb->transport_header != (typeof(skb->transport_header))~0U; } static inline unsigned char *skb_transport_header(const struct sk_buff *skb) { DEBUG_NET_WARN_ON_ONCE(!skb_transport_header_was_set(skb)); return skb->head + skb->transport_header; } static inline void skb_reset_transport_header(struct sk_buff *skb) { long offset = skb->data - skb->head; DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->transport_header))offset); skb->transport_header = offset; } /** * skb_reset_transport_header_careful - conditionally reset transport header * @skb: buffer to alter * * Hardened version of skb_reset_transport_header(). * * Returns: true if the operation was a success. */ static inline bool __must_check skb_reset_transport_header_careful(struct sk_buff *skb) { long offset = skb->data - skb->head; if (unlikely(offset != (typeof(skb->transport_header))offset)) return false; if (unlikely(offset == (typeof(skb->transport_header))~0U)) return false; skb->transport_header = offset; return true; } static inline void skb_set_transport_header(struct sk_buff *skb, const int offset) { skb_reset_transport_header(skb); skb->transport_header += offset; } static inline unsigned char *skb_network_header(const struct sk_buff *skb) { return skb->head + skb->network_header; } static inline void skb_reset_network_header(struct sk_buff *skb) { long offset = skb->data - skb->head; DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->network_header))offset); skb->network_header = offset; } static inline void skb_set_network_header(struct sk_buff *skb, const int offset) { skb_reset_network_header(skb); skb->network_header += offset; } static inline unsigned char *skb_mac_header(const struct sk_buff *skb) { DEBUG_NET_WARN_ON_ONCE(!skb_mac_header_was_set(skb)); return skb->head + skb->mac_header; } static inline int skb_mac_offset(const struct sk_buff *skb) { return skb_mac_header(skb) - skb->data; } static inline u32 skb_mac_header_len(const struct sk_buff *skb) { DEBUG_NET_WARN_ON_ONCE(!skb_mac_header_was_set(skb)); return skb->network_header - skb->mac_header; } static inline void skb_unset_mac_header(struct sk_buff *skb) { skb->mac_header = (typeof(skb->mac_header))~0U; } static inline void skb_reset_mac_header(struct sk_buff *skb) { long offset = skb->data - skb->head; DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->mac_header))offset); skb->mac_header = offset; } static inline void skb_set_mac_header(struct sk_buff *skb, const int offset) { skb_reset_mac_header(skb); skb->mac_header += offset; } static inline void skb_pop_mac_header(struct sk_buff *skb) { skb->mac_header = skb->network_header; } static inline void skb_probe_transport_header(struct sk_buff *skb) { struct flow_keys_basic keys; if (skb_transport_header_was_set(skb)) return; if (skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, NULL, 0, 0, 0, 0)) skb_set_transport_header(skb, keys.control.thoff); } 
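/* Example (illustrative only): on the receive path, drivers and the core
 * stack typically record header offsets with the helpers above rather than
 * keeping raw pointers. For an Ethernet frame whose link-layer header
 * starts at skb->data, this might look roughly like:
 *
 *	skb_reset_mac_header(skb);
 *	skb_pull(skb, ETH_HLEN);
 *	skb_reset_network_header(skb);
 *
 * The reset/set helpers only store offsets relative to skb->head; they do
 * not move any packet data.
 */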
static inline void skb_mac_header_rebuild(struct sk_buff *skb) { if (skb_mac_header_was_set(skb)) { const unsigned char *old_mac = skb_mac_header(skb); skb_set_mac_header(skb, -skb->mac_len); memmove(skb_mac_header(skb), old_mac, skb->mac_len); } } /* Move the full mac header up to current network_header. * Leaves skb->data pointing at offset skb->mac_len into the mac_header. * Must be provided the complete mac header length. */ static inline void skb_mac_header_rebuild_full(struct sk_buff *skb, u32 full_mac_len) { if (skb_mac_header_was_set(skb)) { const unsigned char *old_mac = skb_mac_header(skb); skb_set_mac_header(skb, -full_mac_len); memmove(skb_mac_header(skb), old_mac, full_mac_len); __skb_push(skb, full_mac_len - skb->mac_len); } } static inline int skb_checksum_start_offset(const struct sk_buff *skb) { return skb->csum_start - skb_headroom(skb); } static inline unsigned char *skb_checksum_start(const struct sk_buff *skb) { return skb->head + skb->csum_start; } static inline int skb_transport_offset(const struct sk_buff *skb) { return skb_transport_header(skb) - skb->data; } static inline u32 skb_network_header_len(const struct sk_buff *skb) { DEBUG_NET_WARN_ON_ONCE(!skb_transport_header_was_set(skb)); return skb->transport_header - skb->network_header; } static inline u32 skb_inner_network_header_len(const struct sk_buff *skb) { return skb->inner_transport_header - skb->inner_network_header; } static inline int skb_network_offset(const struct sk_buff *skb) { return skb_network_header(skb) - skb->data; } static inline int skb_inner_network_offset(const struct sk_buff *skb) { return skb_inner_network_header(skb) - skb->data; } static inline enum skb_drop_reason pskb_network_may_pull_reason(struct sk_buff *skb, unsigned int len) { return pskb_may_pull_reason(skb, skb_network_offset(skb) + len); } static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len) { return pskb_network_may_pull_reason(skb, len) == SKB_NOT_DROPPED_YET; } /* * CPUs often take a performance hit when accessing unaligned memory * locations. The actual performance hit varies, it can be small if the * hardware handles it or large if we have to take an exception and fix it * in software. * * Since an ethernet header is 14 bytes network drivers often end up with * the IP header at an unaligned offset. The IP header can be aligned by * shifting the start of the packet by 2 bytes. Drivers should do this * with: * * skb_reserve(skb, NET_IP_ALIGN); * * The downside to this alignment of the IP header is that the DMA is now * unaligned. On some architectures the cost of an unaligned DMA is high * and this cost outweighs the gains made by aligning the IP header. * * Since this trade off varies between architectures, we allow NET_IP_ALIGN * to be overridden. */ #ifndef NET_IP_ALIGN #define NET_IP_ALIGN 2 #endif /* * The networking layer reserves some headroom in skb data (via * dev_alloc_skb). This is used to avoid having to reallocate skb data when * the header has to grow. In the default case, if the header has to grow * 32 bytes or less we avoid the reallocation. * * Unfortunately this headroom changes the DMA alignment of the resulting * network packet. As for NET_IP_ALIGN, this unaligned DMA is expensive * on some architectures. An architecture can override this value, * perhaps setting it to a cacheline in size (since that will maintain * cacheline alignment of the DMA). It must be a power of 2. 
* * Various parts of the networking layer expect at least 32 bytes of * headroom, you should not reduce this. * * Using max(32, L1_CACHE_BYTES) makes sense (especially with RPS) * to reduce average number of cache lines per packet. * get_rps_cpu() for example only access one 64 bytes aligned block : * NET_IP_ALIGN(2) + ethernet_header(14) + IP_header(20/40) + ports(8) */ #ifndef NET_SKB_PAD #define NET_SKB_PAD max(32, L1_CACHE_BYTES) #endif int ___pskb_trim(struct sk_buff *skb, unsigned int len); static inline void __skb_set_length(struct sk_buff *skb, unsigned int len) { if (WARN_ON(skb_is_nonlinear(skb))) return; skb->len = len; skb_set_tail_pointer(skb, len); } static inline void __skb_trim(struct sk_buff *skb, unsigned int len) { __skb_set_length(skb, len); } void skb_trim(struct sk_buff *skb, unsigned int len); static inline int __pskb_trim(struct sk_buff *skb, unsigned int len) { if (skb->data_len) return ___pskb_trim(skb, len); __skb_trim(skb, len); return 0; } static __always_inline int pskb_trim(struct sk_buff *skb, unsigned int len) { skb_might_realloc(skb); return (len < skb->len) ? __pskb_trim(skb, len) : 0; } /** * pskb_trim_unique - remove end from a paged unique (not cloned) buffer * @skb: buffer to alter * @len: new length * * This is identical to pskb_trim except that the caller knows that * the skb is not cloned so we should never get an error due to out- * of-memory. */ static inline void pskb_trim_unique(struct sk_buff *skb, unsigned int len) { int err = pskb_trim(skb, len); BUG_ON(err); } static inline int __skb_grow(struct sk_buff *skb, unsigned int len) { unsigned int diff = len - skb->len; if (skb_tailroom(skb) < diff) { int ret = pskb_expand_head(skb, 0, diff - skb_tailroom(skb), GFP_ATOMIC); if (ret) return ret; } __skb_set_length(skb, len); return 0; } /** * skb_orphan - orphan a buffer * @skb: buffer to orphan * * If a buffer currently has an owner then we call the owner's * destructor function and make the @skb unowned. The buffer continues * to exist but is no longer charged to its former owner. */ static __always_inline void skb_orphan(struct sk_buff *skb) { if (skb->destructor) { skb->destructor(skb); skb->destructor = NULL; skb->sk = NULL; } else { BUG_ON(skb->sk); } } /** * skb_orphan_frags - orphan the frags contained in a buffer * @skb: buffer to orphan frags from * @gfp_mask: allocation mask for replacement pages * * For each frag in the SKB which needs a destructor (i.e. has an * owner) create a copy of that frag and release the original * page by calling the destructor. */ static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask) { if (likely(!skb_zcopy(skb))) return 0; if (skb_shinfo(skb)->flags & SKBFL_DONT_ORPHAN) return 0; return skb_copy_ubufs(skb, gfp_mask); } /* Frags must be orphaned, even if refcounted, if skb might loop to rx path */ static inline int skb_orphan_frags_rx(struct sk_buff *skb, gfp_t gfp_mask) { if (likely(!skb_zcopy(skb))) return 0; return skb_copy_ubufs(skb, gfp_mask); } /** * __skb_queue_purge_reason - empty a list * @list: list to empty * @reason: drop reason * * Delete all buffers on an &sk_buff list. Each buffer is removed from * the list and one reference dropped. This function does not take the * list lock and the caller must hold the relevant locks to use it. 
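 *
 * Example (illustrative only; "pending" is a hypothetical caller-private
 * list that no other context can touch):
 *
 *	__skb_queue_purge_reason(&pending, SKB_DROP_REASON_QUEUE_PURGE);
 *
 * The locked skb_queue_purge_reason() declared below should be used for
 * lists that may be accessed concurrently.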
*/ static inline void __skb_queue_purge_reason(struct sk_buff_head *list, enum skb_drop_reason reason) { struct sk_buff *skb; while ((skb = __skb_dequeue(list)) != NULL) kfree_skb_reason(skb, reason); } static inline void __skb_queue_purge(struct sk_buff_head *list) { __skb_queue_purge_reason(list, SKB_DROP_REASON_QUEUE_PURGE); } void skb_queue_purge_reason(struct sk_buff_head *list, enum skb_drop_reason reason); static inline void skb_queue_purge(struct sk_buff_head *list) { skb_queue_purge_reason(list, SKB_DROP_REASON_QUEUE_PURGE); } unsigned int skb_rbtree_purge(struct rb_root *root); void skb_errqueue_purge(struct sk_buff_head *list); void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask); /** * netdev_alloc_frag - allocate a page fragment * @fragsz: fragment size * * Allocates a frag from a page for receive buffer. * Uses GFP_ATOMIC allocations. */ static inline void *netdev_alloc_frag(unsigned int fragsz) { return __netdev_alloc_frag_align(fragsz, ~0u); } static inline void *netdev_alloc_frag_align(unsigned int fragsz, unsigned int align) { WARN_ON_ONCE(!is_power_of_2(align)); return __netdev_alloc_frag_align(fragsz, -align); } struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length, gfp_t gfp_mask); /** * netdev_alloc_skb - allocate an skbuff for rx on a specific device * @dev: network device to receive on * @length: length to allocate * * Allocate a new &sk_buff and assign it a usage count of one. The * buffer has unspecified headroom built in. Users should allocate * the headroom they think they need without accounting for the * built in space. The built in space is used for optimisations. * * %NULL is returned if there is no free memory. Although this function * allocates memory it can be called from an interrupt. */ static inline struct sk_buff *netdev_alloc_skb(struct net_device *dev, unsigned int length) { return __netdev_alloc_skb(dev, length, GFP_ATOMIC); } /* legacy helper around __netdev_alloc_skb() */ static inline struct sk_buff *__dev_alloc_skb(unsigned int length, gfp_t gfp_mask) { return __netdev_alloc_skb(NULL, length, gfp_mask); } /* legacy helper around netdev_alloc_skb() */ static inline struct sk_buff *dev_alloc_skb(unsigned int length) { return netdev_alloc_skb(NULL, length); } static inline struct sk_buff *__netdev_alloc_skb_ip_align(struct net_device *dev, unsigned int length, gfp_t gfp) { struct sk_buff *skb = __netdev_alloc_skb(dev, length + NET_IP_ALIGN, gfp); if (NET_IP_ALIGN && skb) skb_reserve(skb, NET_IP_ALIGN); return skb; } static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev, unsigned int length) { return __netdev_alloc_skb_ip_align(dev, length, GFP_ATOMIC); } static inline void skb_free_frag(void *addr) { page_frag_free(addr); } void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask); static inline void *napi_alloc_frag(unsigned int fragsz) { return __napi_alloc_frag_align(fragsz, ~0u); } static inline void *napi_alloc_frag_align(unsigned int fragsz, unsigned int align) { WARN_ON_ONCE(!is_power_of_2(align)); return __napi_alloc_frag_align(fragsz, -align); } struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int length); void napi_consume_skb(struct sk_buff *skb, int budget); void napi_skb_free_stolen_head(struct sk_buff *skb); void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason); /** * __dev_alloc_pages - allocate page for network Rx * @gfp_mask: allocation priority. 
Set __GFP_NOMEMALLOC if not for network Rx * @order: size of the allocation * * Allocate a new page. * * %NULL is returned if there is no free memory. */ static inline struct page *__dev_alloc_pages_noprof(gfp_t gfp_mask, unsigned int order) { /* This piece of code contains several assumptions. * 1. This is for device Rx, therefore a cold page is preferred. * 2. The expectation is the user wants a compound page. * 3. If requesting a order 0 page it will not be compound * due to the check to see if order has a value in prep_new_page * 4. __GFP_MEMALLOC is ignored if __GFP_NOMEMALLOC is set due to * code in gfp_to_alloc_flags that should be enforcing this. */ gfp_mask |= __GFP_COMP | __GFP_MEMALLOC; return alloc_pages_node_noprof(NUMA_NO_NODE, gfp_mask, order); } #define __dev_alloc_pages(...) alloc_hooks(__dev_alloc_pages_noprof(__VA_ARGS__)) /* * This specialized allocator has to be a macro for its allocations to be * accounted separately (to have a separate alloc_tag). */ #define dev_alloc_pages(_order) __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN, _order) /** * __dev_alloc_page - allocate a page for network Rx * @gfp_mask: allocation priority. Set __GFP_NOMEMALLOC if not for network Rx * * Allocate a new page. * * %NULL is returned if there is no free memory. */ static inline struct page *__dev_alloc_page_noprof(gfp_t gfp_mask) { return __dev_alloc_pages_noprof(gfp_mask, 0); } #define __dev_alloc_page(...) alloc_hooks(__dev_alloc_page_noprof(__VA_ARGS__)) /* * This specialized allocator has to be a macro for its allocations to be * accounted separately (to have a separate alloc_tag). */ #define dev_alloc_page() dev_alloc_pages(0) /** * dev_page_is_reusable - check whether a page can be reused for network Rx * @page: the page to test * * A page shouldn't be considered for reusing/recycling if it was allocated * under memory pressure or at a distant memory node. * * Returns: false if this page should be returned to page allocator, true * otherwise. */ static inline bool dev_page_is_reusable(const struct page *page) { return likely(page_to_nid(page) == numa_mem_id() && !page_is_pfmemalloc(page)); } /** * skb_propagate_pfmemalloc - Propagate pfmemalloc if skb is allocated after RX page * @page: The page that was allocated from skb_alloc_page * @skb: The skb that may need pfmemalloc set */ static inline void skb_propagate_pfmemalloc(const struct page *page, struct sk_buff *skb) { if (page_is_pfmemalloc(page)) skb->pfmemalloc = true; } /** * skb_frag_off() - Returns the offset of a skb fragment * @frag: the paged fragment */ static inline unsigned int skb_frag_off(const skb_frag_t *frag) { return frag->offset; } /** * skb_frag_off_add() - Increments the offset of a skb fragment by @delta * @frag: skb fragment * @delta: value to add */ static inline void skb_frag_off_add(skb_frag_t *frag, int delta) { frag->offset += delta; } /** * skb_frag_off_set() - Sets the offset of a skb fragment * @frag: skb fragment * @offset: offset of fragment */ static inline void skb_frag_off_set(skb_frag_t *frag, unsigned int offset) { frag->offset = offset; } /** * skb_frag_off_copy() - Sets the offset of a skb fragment from another fragment * @fragto: skb fragment where offset is set * @fragfrom: skb fragment offset is copied from */ static inline void skb_frag_off_copy(skb_frag_t *fragto, const skb_frag_t *fragfrom) { fragto->offset = fragfrom->offset; } /* Return: true if the skb_frag contains a net_iov. 
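 *
 * Note: when this returns true, the fragment is backed by a net_iov
 * (e.g. device memory) rather than a plain page, and skb_frag_page()
 * below returns NULL. Illustrative check before touching fragment memory
 * (the error handling is caller-specific):
 *
 *	struct page *page = skb_frag_page(frag);
 *
 *	if (!page)
 *		return -EFAULT;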
*/ static inline bool skb_frag_is_net_iov(const skb_frag_t *frag) { return netmem_is_net_iov(frag->netmem); } /** * skb_frag_net_iov - retrieve the net_iov referred to by fragment * @frag: the fragment * * Return: the &struct net_iov associated with @frag. Returns NULL if this * frag has no associated net_iov. */ static inline struct net_iov *skb_frag_net_iov(const skb_frag_t *frag) { if (!skb_frag_is_net_iov(frag)) return NULL; return netmem_to_net_iov(frag->netmem); } /** * skb_frag_page - retrieve the page referred to by a paged fragment * @frag: the paged fragment * * Return: the &struct page associated with @frag. Returns NULL if this frag * has no associated page. */ static inline struct page *skb_frag_page(const skb_frag_t *frag) { if (skb_frag_is_net_iov(frag)) return NULL; return netmem_to_page(frag->netmem); } /** * skb_frag_netmem - retrieve the netmem referred to by a fragment * @frag: the fragment * * Return: the &netmem_ref associated with @frag. */ static inline netmem_ref skb_frag_netmem(const skb_frag_t *frag) { return frag->netmem; } int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, unsigned int headroom); int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, const struct bpf_prog *prog); /** * skb_frag_address - gets the address of the data contained in a paged fragment * @frag: the paged fragment buffer * * Returns: the address of the data within @frag. The page must already * be mapped. */ static inline void *skb_frag_address(const skb_frag_t *frag) { if (!skb_frag_page(frag)) return NULL; return page_address(skb_frag_page(frag)) + skb_frag_off(frag); } /** * skb_frag_address_safe - gets the address of the data contained in a paged fragment * @frag: the paged fragment buffer * * Returns: the address of the data within @frag. Checks that the page * is mapped and returns %NULL otherwise. */ static inline void *skb_frag_address_safe(const skb_frag_t *frag) { struct page *page = skb_frag_page(frag); void *ptr; if (!page) return NULL; ptr = page_address(page); if (unlikely(!ptr)) return NULL; return ptr + skb_frag_off(frag); } /** * skb_frag_phys - gets the physical address of the data in a paged fragment * @frag: the paged fragment buffer * * Returns: the physical address of the data within @frag. */ static inline phys_addr_t skb_frag_phys(const skb_frag_t *frag) { return page_to_phys(skb_frag_page(frag)) + skb_frag_off(frag); } /** * skb_frag_page_copy() - sets the page in a fragment from another fragment * @fragto: skb fragment where page is set * @fragfrom: skb fragment page is copied from */ static inline void skb_frag_page_copy(skb_frag_t *fragto, const skb_frag_t *fragfrom) { fragto->netmem = fragfrom->netmem; } bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio); /** * __skb_frag_dma_map - maps a paged fragment via the DMA API * @dev: the device to map the fragment to * @frag: the paged fragment to map * @offset: the offset within the fragment (starting at the * fragment's own offset) * @size: the number of bytes to map * @dir: the direction of the mapping (``PCI_DMA_*``) * * Maps the page associated with @frag to @device. 
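 *
 * Example (illustrative only; the error label is hypothetical): a transmit
 * path would usually map a fragment and check the result with the normal
 * DMA API helpers:
 *
 *	dma_addr_t addr = skb_frag_dma_map(dev, frag, 0,
 *					   skb_frag_size(frag),
 *					   DMA_TO_DEVICE);
 *
 *	if (dma_mapping_error(dev, addr))
 *		goto unmap_and_drop;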
*/ static inline dma_addr_t __skb_frag_dma_map(struct device *dev, const skb_frag_t *frag, size_t offset, size_t size, enum dma_data_direction dir) { if (skb_frag_is_net_iov(frag)) { return netmem_to_net_iov(frag->netmem)->desc.dma_addr + offset + frag->offset; } return dma_map_page(dev, skb_frag_page(frag), skb_frag_off(frag) + offset, size, dir); } #define skb_frag_dma_map(dev, frag, ...) \ CONCATENATE(_skb_frag_dma_map, \ COUNT_ARGS(__VA_ARGS__))(dev, frag, ##__VA_ARGS__) #define __skb_frag_dma_map1(dev, frag, offset, uf, uo) ({ \ const skb_frag_t *uf = (frag); \ size_t uo = (offset); \ \ __skb_frag_dma_map(dev, uf, uo, skb_frag_size(uf) - uo, \ DMA_TO_DEVICE); \ }) #define _skb_frag_dma_map1(dev, frag, offset) \ __skb_frag_dma_map1(dev, frag, offset, __UNIQUE_ID(frag_), \ __UNIQUE_ID(offset_)) #define _skb_frag_dma_map0(dev, frag) \ _skb_frag_dma_map1(dev, frag, 0) #define _skb_frag_dma_map2(dev, frag, offset, size) \ __skb_frag_dma_map(dev, frag, offset, size, DMA_TO_DEVICE) #define _skb_frag_dma_map3(dev, frag, offset, size, dir) \ __skb_frag_dma_map(dev, frag, offset, size, dir) static inline struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) { return __pskb_copy(skb, skb_headroom(skb), gfp_mask); } static inline struct sk_buff *pskb_copy_for_clone(struct sk_buff *skb, gfp_t gfp_mask) { return __pskb_copy_fclone(skb, skb_headroom(skb), gfp_mask, true); } /** * skb_clone_writable - is the header of a clone writable * @skb: buffer to check * @len: length up to which to write * * Returns true if modifying the header part of the cloned buffer * does not requires the data to be copied. */ static inline int skb_clone_writable(const struct sk_buff *skb, unsigned int len) { return !skb_header_cloned(skb) && skb_headroom(skb) + len <= skb->hdr_len; } static inline int skb_try_make_writable(struct sk_buff *skb, unsigned int write_len) { return skb_cloned(skb) && !skb_clone_writable(skb, write_len) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC); } static inline int __skb_cow(struct sk_buff *skb, unsigned int headroom, int cloned) { int delta = 0; if (headroom > skb_headroom(skb)) delta = headroom - skb_headroom(skb); if (delta || cloned) return pskb_expand_head(skb, ALIGN(delta, NET_SKB_PAD), 0, GFP_ATOMIC); return 0; } /** * skb_cow - copy header of skb when it is required * @skb: buffer to cow * @headroom: needed headroom * * If the skb passed lacks sufficient headroom or its data part * is shared, data is reallocated. If reallocation fails, an error * is returned and original skb is not changed. * * The result is skb with writable area skb->head...skb->tail * and at least @headroom of space at head. */ static inline int skb_cow(struct sk_buff *skb, unsigned int headroom) { return __skb_cow(skb, headroom, skb_cloned(skb)); } /** * skb_cow_head - skb_cow but only making the head writable * @skb: buffer to cow * @headroom: needed headroom * * This function is identical to skb_cow except that we replace the * skb_cloned check by skb_header_cloned. It should be used when * you only need to push on some header and do not need to modify * the data. */ static inline int skb_cow_head(struct sk_buff *skb, unsigned int headroom) { return __skb_cow(skb, headroom, skb_header_cloned(skb)); } /** * skb_padto - pad an skbuff up to a minimal size * @skb: buffer to pad * @len: minimal length * * Pads up a buffer to ensure the trailing bytes exist and are * blanked. If the buffer already contains sufficient data it * is untouched. Otherwise it is extended. Returns zero on * success. 
 *	The skb is freed on error.
 */
static inline int skb_padto(struct sk_buff *skb, unsigned int len)
{
	unsigned int size = skb->len;

	if (likely(size >= len))
		return 0;
	return skb_pad(skb, len - size);
}

/**
 *	__skb_put_padto - increase size and pad an skbuff up to a minimal size
 *	@skb: buffer to pad
 *	@len: minimal length
 *	@free_on_error: free buffer on error
 *
 *	Pads up a buffer to ensure the trailing bytes exist and are
 *	blanked. If the buffer already contains sufficient data it
 *	is untouched. Otherwise it is extended. Returns zero on
 *	success. The skb is freed on error if @free_on_error is true.
 */
static inline int __must_check __skb_put_padto(struct sk_buff *skb,
					       unsigned int len,
					       bool free_on_error)
{
	unsigned int size = skb->len;

	if (unlikely(size < len)) {
		len -= size;
		if (__skb_pad(skb, len, free_on_error))
			return -ENOMEM;
		__skb_put(skb, len);
	}
	return 0;
}

/**
 *	skb_put_padto - increase size and pad an skbuff up to a minimal size
 *	@skb: buffer to pad
 *	@len: minimal length
 *
 *	Pads up a buffer to ensure the trailing bytes exist and are
 *	blanked. If the buffer already contains sufficient data it
 *	is untouched. Otherwise it is extended. Returns zero on
 *	success. The skb is freed on error.
 */
static inline int __must_check skb_put_padto(struct sk_buff *skb, unsigned int len)
{
	return __skb_put_padto(skb, len, true);
}

bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
				  __wsum *csum, struct iov_iter *i) __must_check;

static inline bool skb_can_coalesce_netmem(struct sk_buff *skb, int i,
					   netmem_ref netmem, int off)
{
	if (skb_zcopy(skb))
		return false;
	if (i) {
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];

		return netmem == skb_frag_netmem(frag) &&
		       off == skb_frag_off(frag) + skb_frag_size(frag);
	}
	return false;
}

static inline bool skb_can_coalesce(struct sk_buff *skb, int i,
				    const struct page *page, int off)
{
	return skb_can_coalesce_netmem(skb, i, page_to_netmem(page), off);
}

static inline int __skb_linearize(struct sk_buff *skb)
{
	return __pskb_pull_tail(skb, skb->data_len) ? 0 : -ENOMEM;
}

/**
 *	skb_linearize - convert paged skb to linear one
 *	@skb: buffer to linearize
 *
 *	If there is no free memory -ENOMEM is returned, otherwise zero
 *	is returned and the old skb data released.
 */
static inline int skb_linearize(struct sk_buff *skb)
{
	return skb_is_nonlinear(skb) ? __skb_linearize(skb) : 0;
}

/**
 * skb_has_shared_frag - can any frag be overwritten
 * @skb: buffer to test
 *
 * Return: true if the skb has at least one frag that might be modified
 * by an external entity (as in vmsplice()/sendfile())
 */
static inline bool skb_has_shared_frag(const struct sk_buff *skb)
{
	return skb_is_nonlinear(skb) &&
	       skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG;
}

/**
 *	skb_linearize_cow - make sure skb is linear and writable
 *	@skb: buffer to process
 *
 *	If there is no free memory -ENOMEM is returned, otherwise zero
 *	is returned and the old skb data released.
 */
static inline int skb_linearize_cow(struct sk_buff *skb)
{
	return skb_is_nonlinear(skb) || skb_cloned(skb) ?
	       __skb_linearize(skb) : 0;
}

static __always_inline void
__skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len,
		     unsigned int off)
{
	if (skb->ip_summed == CHECKSUM_COMPLETE)
		skb->csum = csum_block_sub(skb->csum,
					   csum_partial(start, len, 0), off);
	else if (skb->ip_summed == CHECKSUM_PARTIAL &&
		 skb_checksum_start_offset(skb) < 0)
		skb->ip_summed = CHECKSUM_NONE;
}

/**
 *	skb_postpull_rcsum - update checksum for received skb after pull
 *	@skb: buffer to update
 *	@start: start of data before pull
 *	@len: length of data pulled
 *
 *	After doing a pull on a received packet, you need to call this to
 *	update the CHECKSUM_COMPLETE checksum, or set ip_summed to
 *	CHECKSUM_NONE so that it can be recomputed from scratch.
 */
static __always_inline void skb_postpull_rcsum(struct sk_buff *skb,
					       const void *start,
					       unsigned int len)
{
	if (skb->ip_summed == CHECKSUM_COMPLETE)
		skb->csum = wsum_negate(csum_partial(start, len,
						     wsum_negate(skb->csum)));
	else if (skb->ip_summed == CHECKSUM_PARTIAL &&
		 skb_checksum_start_offset(skb) < 0)
		skb->ip_summed = CHECKSUM_NONE;
}

static __always_inline void
__skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len,
		     unsigned int off)
{
	if (skb->ip_summed == CHECKSUM_COMPLETE)
		skb->csum = csum_block_add(skb->csum,
					   csum_partial(start, len, 0), off);
}

/**
 *	skb_postpush_rcsum - update checksum for received skb after push
 *	@skb: buffer to update
 *	@start: start of data after push
 *	@len: length of data pushed
 *
 *	After doing a push on a received packet, you need to call this to
 *	update the CHECKSUM_COMPLETE checksum.
 */
static inline void skb_postpush_rcsum(struct sk_buff *skb,
				      const void *start, unsigned int len)
{
	__skb_postpush_rcsum(skb, start, len, 0);
}

void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len);

/**
 *	skb_push_rcsum - push skb and update receive checksum
 *	@skb: buffer to update
 *	@len: length of data pushed
 *
 *	This function performs an skb_push on the packet and updates
 *	the CHECKSUM_COMPLETE checksum. It should be used on
 *	receive path processing instead of skb_push unless you know
 *	that the checksum difference is zero (e.g., a valid IP header)
 *	or you are setting ip_summed to CHECKSUM_NONE.
 */
static inline void *skb_push_rcsum(struct sk_buff *skb, unsigned int len)
{
	skb_push(skb, len);
	skb_postpush_rcsum(skb, skb->data, len);
	return skb->data;
}

int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len);

/**
 *	pskb_trim_rcsum - trim received skb and update checksum
 *	@skb: buffer to trim
 *	@len: new length
 *
 *	This is exactly the same as pskb_trim except that it ensures the
 *	checksum of received packets is still valid after the operation.
 *	It can change skb pointers.
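 *
 *	Example (illustrative only; the drop label is hypothetical): trimming
 *	link-layer padding off an IPv4 packet while keeping a
 *	CHECKSUM_COMPLETE value usable:
 *
 *		if (pskb_trim_rcsum(skb, ntohs(ip_hdr(skb)->tot_len)))
 *			goto drop;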
*/ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) { skb_might_realloc(skb); if (likely(len >= skb->len)) return 0; return pskb_trim_rcsum_slow(skb, len); } static inline int __skb_trim_rcsum(struct sk_buff *skb, unsigned int len) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; __skb_trim(skb, len); return 0; } static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; return __skb_grow(skb, len); } #define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode) #define skb_rb_first(root) rb_to_skb(rb_first(root)) #define skb_rb_last(root) rb_to_skb(rb_last(root)) #define skb_rb_next(skb) rb_to_skb(rb_next(&(skb)->rbnode)) #define skb_rb_prev(skb) rb_to_skb(rb_prev(&(skb)->rbnode)) #define skb_queue_walk(queue, skb) \ for (skb = (queue)->next; \ skb != (struct sk_buff *)(queue); \ skb = skb->next) #define skb_queue_walk_safe(queue, skb, tmp) \ for (skb = (queue)->next, tmp = skb->next; \ skb != (struct sk_buff *)(queue); \ skb = tmp, tmp = skb->next) #define skb_queue_walk_from(queue, skb) \ for (; skb != (struct sk_buff *)(queue); \ skb = skb->next) #define skb_rbtree_walk(skb, root) \ for (skb = skb_rb_first(root); skb != NULL; \ skb = skb_rb_next(skb)) #define skb_rbtree_walk_from(skb) \ for (; skb != NULL; \ skb = skb_rb_next(skb)) #define skb_rbtree_walk_from_safe(skb, tmp) \ for (; tmp = skb ? skb_rb_next(skb) : NULL, (skb != NULL); \ skb = tmp) #define skb_queue_walk_from_safe(queue, skb, tmp) \ for (tmp = skb->next; \ skb != (struct sk_buff *)(queue); \ skb = tmp, tmp = skb->next) #define skb_queue_reverse_walk(queue, skb) \ for (skb = (queue)->prev; \ skb != (struct sk_buff *)(queue); \ skb = skb->prev) #define skb_queue_reverse_walk_safe(queue, skb, tmp) \ for (skb = (queue)->prev, tmp = skb->prev; \ skb != (struct sk_buff *)(queue); \ skb = tmp, tmp = skb->prev) #define skb_queue_reverse_walk_from_safe(queue, skb, tmp) \ for (tmp = skb->prev; \ skb != (struct sk_buff *)(queue); \ skb = tmp, tmp = skb->prev) static inline bool skb_has_frag_list(const struct sk_buff *skb) { return skb_shinfo(skb)->frag_list != NULL; } static inline void skb_frag_list_init(struct sk_buff *skb) { skb_shinfo(skb)->frag_list = NULL; } #define skb_walk_frags(skb, iter) \ for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next) int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue, int *err, long *timeo_p, const struct sk_buff *skb); struct sk_buff *__skb_try_recv_from_queue(struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last); struct sk_buff *__skb_try_recv_datagram(struct sock *sk, struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last); struct sk_buff *__skb_recv_datagram(struct sock *sk, struct sk_buff_head *sk_queue, unsigned int flags, int *off, int *err); struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, int *err); __poll_t datagram_poll_queue(struct file *file, struct socket *sock, struct poll_table_struct *wait, struct sk_buff_head *rcv_queue); __poll_t datagram_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int skb_copy_datagram_iter(const struct sk_buff *from, int offset, struct iov_iter *to, int size); static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset, struct msghdr *msg, int size) { return skb_copy_datagram_iter(from, offset, &msg->msg_iter, size); } int 
skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen, struct msghdr *msg); int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, u32 *crcp); int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, struct iov_iter *from, int len); int skb_copy_datagram_from_iter_full(struct sk_buff *skb, int offset, struct iov_iter *from, int len); int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm); void skb_free_datagram(struct sock *sk, struct sk_buff *skb); int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags); int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len); int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len); __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to, int len); int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int len, unsigned int flags); int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, int len); int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb, int offset, int len, int flags); int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len); void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); unsigned int skb_zerocopy_headlen(const struct sk_buff *from); int skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen); void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen); void skb_scrub_packet(struct sk_buff *skb, bool xnet); struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); struct sk_buff *skb_segment_list(struct sk_buff *skb, netdev_features_t features, unsigned int offset); struct sk_buff *skb_vlan_untag(struct sk_buff *skb); int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len); int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev); int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci); int skb_vlan_pop(struct sk_buff *skb); int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); int skb_eth_pop(struct sk_buff *skb); int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, const unsigned char *src); int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, int mac_len, bool ethernet); int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, bool ethernet); int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse); int skb_mpls_dec_ttl(struct sk_buff *skb); struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy, gfp_t gfp); static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len) { return copy_from_iter_full(data, len, &msg->msg_iter) ? 0 : -EFAULT; } static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len) { return copy_to_iter(data, len, &msg->msg_iter) == len ? 
0 : -EFAULT; } __wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum); u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc); static inline void * __must_check __skb_header_pointer(const struct sk_buff *skb, int offset, int len, const void *data, int hlen, void *buffer) { if (likely(hlen - offset >= len)) return (void *)data + offset; if (!skb || unlikely(skb_copy_bits(skb, offset, buffer, len) < 0)) return NULL; return buffer; } static __always_inline void * __must_check skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer) { return __skb_header_pointer(skb, offset, len, skb->data, skb_headlen(skb), buffer); } /* Variant of skb_header_pointer() where @offset is user-controlled * and potentially negative. */ static inline void * __must_check skb_header_pointer_careful(const struct sk_buff *skb, int offset, int len, void *buffer) { if (unlikely(offset < 0 && -offset > skb_headroom(skb))) return NULL; return skb_header_pointer(skb, offset, len, buffer); } static inline void * __must_check skb_pointer_if_linear(const struct sk_buff *skb, int offset, int len) { if (likely(skb_headlen(skb) - offset >= len)) return skb->data + offset; return NULL; } /** * skb_needs_linearize - check if we need to linearize a given skb * depending on the given device features. * @skb: socket buffer to check * @features: net device features * * Returns true if either: * 1. skb has frag_list and the device doesn't support FRAGLIST, or * 2. skb is fragmented and the device does not support SG. */ static inline bool skb_needs_linearize(struct sk_buff *skb, netdev_features_t features) { return skb_is_nonlinear(skb) && ((skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST)) || (skb_shinfo(skb)->nr_frags && !(features & NETIF_F_SG))); } static inline void skb_copy_from_linear_data(const struct sk_buff *skb, void *to, const unsigned int len) { memcpy(to, skb->data, len); } static inline void skb_copy_from_linear_data_offset(const struct sk_buff *skb, const int offset, void *to, const unsigned int len) { memcpy(to, skb->data + offset, len); } static inline void skb_copy_to_linear_data(struct sk_buff *skb, const void *from, const unsigned int len) { memcpy(skb->data, from, len); } static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb, const int offset, const void *from, const unsigned int len) { memcpy(skb->data + offset, from, len); } void skb_init(void); static inline ktime_t skb_get_ktime(const struct sk_buff *skb) { return skb->tstamp; } /** * skb_get_timestamp - get timestamp from a skb * @skb: skb to get stamp from * @stamp: pointer to struct __kernel_old_timeval to store stamp in * * Timestamps are stored in the skb as offsets to a base timestamp. * This function converts the offset back to a struct timeval and stores * it in stamp. 
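 *
 * Example (illustrative only), e.g. when reporting a receive timestamp to
 * user space:
 *
 *	struct __kernel_old_timeval tv;
 *
 *	skb_get_timestamp(skb, &tv);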
*/ static inline void skb_get_timestamp(const struct sk_buff *skb, struct __kernel_old_timeval *stamp) { *stamp = ns_to_kernel_old_timeval(skb->tstamp); } static inline void skb_get_new_timestamp(const struct sk_buff *skb, struct __kernel_sock_timeval *stamp) { struct timespec64 ts = ktime_to_timespec64(skb->tstamp); stamp->tv_sec = ts.tv_sec; stamp->tv_usec = ts.tv_nsec / 1000; } static inline void skb_get_timestampns(const struct sk_buff *skb, struct __kernel_old_timespec *stamp) { struct timespec64 ts = ktime_to_timespec64(skb->tstamp); stamp->tv_sec = ts.tv_sec; stamp->tv_nsec = ts.tv_nsec; } static inline void skb_get_new_timestampns(const struct sk_buff *skb, struct __kernel_timespec *stamp) { struct timespec64 ts = ktime_to_timespec64(skb->tstamp); stamp->tv_sec = ts.tv_sec; stamp->tv_nsec = ts.tv_nsec; } static inline void __net_timestamp(struct sk_buff *skb) { skb->tstamp = ktime_get_real(); skb->tstamp_type = SKB_CLOCK_REALTIME; } static inline ktime_t net_timedelta(ktime_t t) { return ktime_sub(ktime_get_real(), t); } static inline void skb_set_delivery_time(struct sk_buff *skb, ktime_t kt, u8 tstamp_type) { skb->tstamp = kt; if (kt) skb->tstamp_type = tstamp_type; else skb->tstamp_type = SKB_CLOCK_REALTIME; } static inline void skb_set_delivery_type_by_clockid(struct sk_buff *skb, ktime_t kt, clockid_t clockid) { u8 tstamp_type = SKB_CLOCK_REALTIME; switch (clockid) { case CLOCK_REALTIME: break; case CLOCK_MONOTONIC: tstamp_type = SKB_CLOCK_MONOTONIC; break; case CLOCK_TAI: tstamp_type = SKB_CLOCK_TAI; break; default: WARN_ON_ONCE(1); kt = 0; } skb_set_delivery_time(skb, kt, tstamp_type); } DECLARE_STATIC_KEY_FALSE(netstamp_needed_key); /* It is used in the ingress path to clear the delivery_time. * If needed, set the skb->tstamp to the (rcv) timestamp. */ static __always_inline void skb_clear_delivery_time(struct sk_buff *skb) { if (skb->tstamp_type) { skb->tstamp_type = SKB_CLOCK_REALTIME; if (static_branch_unlikely(&netstamp_needed_key)) skb->tstamp = ktime_get_real(); else skb->tstamp = 0; } } static inline void skb_clear_tstamp(struct sk_buff *skb) { if (skb->tstamp_type) return; skb->tstamp = 0; } static inline ktime_t skb_tstamp(const struct sk_buff *skb) { if (skb->tstamp_type) return 0; return skb->tstamp; } static __always_inline ktime_t skb_tstamp_cond(const struct sk_buff *skb, bool cond) { if (skb->tstamp_type != SKB_CLOCK_MONOTONIC && skb->tstamp) return skb->tstamp; if (static_branch_unlikely(&netstamp_needed_key) || cond) return ktime_get_real(); return 0; } static inline u8 skb_metadata_len(const struct sk_buff *skb) { return skb_shinfo(skb)->meta_len; } static inline void *skb_metadata_end(const struct sk_buff *skb) { return skb_mac_header(skb); } static inline bool __skb_metadata_differs(const struct sk_buff *skb_a, const struct sk_buff *skb_b, u8 meta_len) { const void *a = skb_metadata_end(skb_a); const void *b = skb_metadata_end(skb_b); u64 diffs = 0; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || BITS_PER_LONG != 64) goto slow; /* Using more efficient variant than plain call to memcmp(). 
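 * The switch below handles the common metadata sizes (multiples of 4, up
 * to 32 bytes) by XOR-ing the two areas in 8-byte steps, plus one final
 * 4-byte step for sizes that are not a multiple of 8, walking backwards
 * from skb_metadata_end(); any other size falls back to memcmp(). A
 * non-zero accumulated difference means the metadata differs.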
*/ switch (meta_len) { #define __it(x, op) (x -= sizeof(u##op)) #define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op)) case 32: diffs |= __it_diff(a, b, 64); fallthrough; case 24: diffs |= __it_diff(a, b, 64); fallthrough; case 16: diffs |= __it_diff(a, b, 64); fallthrough; case 8: diffs |= __it_diff(a, b, 64); break; case 28: diffs |= __it_diff(a, b, 64); fallthrough; case 20: diffs |= __it_diff(a, b, 64); fallthrough; case 12: diffs |= __it_diff(a, b, 64); fallthrough; case 4: diffs |= __it_diff(a, b, 32); break; default: slow: return memcmp(a - meta_len, b - meta_len, meta_len); } return diffs; } static inline bool skb_metadata_differs(const struct sk_buff *skb_a, const struct sk_buff *skb_b) { u8 len_a = skb_metadata_len(skb_a); u8 len_b = skb_metadata_len(skb_b); if (!(len_a | len_b)) return false; return len_a != len_b ? true : __skb_metadata_differs(skb_a, skb_b, len_a); } static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len) { skb_shinfo(skb)->meta_len = meta_len; } static inline void skb_metadata_clear(struct sk_buff *skb) { skb_metadata_set(skb, 0); } /** * skb_data_move - Move packet data and metadata after skb_push() or skb_pull(). * @skb: packet to operate on * @len: number of bytes pushed or pulled from &sk_buff->data * @n: number of bytes to memmove() from pre-push/pull &sk_buff->data * * Moves @n bytes of packet data, can be zero, and all bytes of skb metadata. * * Assumes metadata is located immediately before &sk_buff->data prior to the * push/pull, and that sufficient headroom exists to hold it after an * skb_push(). Otherwise, metadata is cleared and a one-time warning is issued. * * Prefer skb_postpull_data_move() or skb_postpush_data_move() to calling this * helper directly. */ static inline void skb_data_move(struct sk_buff *skb, const int len, const unsigned int n) { const u8 meta_len = skb_metadata_len(skb); u8 *meta, *meta_end; if (!len || (!n && !meta_len)) return; if (!meta_len) goto no_metadata; meta_end = skb_metadata_end(skb); meta = meta_end - meta_len; if (WARN_ON_ONCE(meta_end + len != skb->data || meta_len > skb_headroom(skb))) { skb_metadata_clear(skb); goto no_metadata; } memmove(meta + len, meta, meta_len + n); return; no_metadata: memmove(skb->data, skb->data - len, n); } /** * skb_postpull_data_move - Move packet data and metadata after skb_pull(). * @skb: packet to operate on * @len: number of bytes pulled from &sk_buff->data * @n: number of bytes to memmove() from pre-pull &sk_buff->data * * See skb_data_move() for details. */ static inline void skb_postpull_data_move(struct sk_buff *skb, const unsigned int len, const unsigned int n) { DEBUG_NET_WARN_ON_ONCE(len > INT_MAX); skb_data_move(skb, len, n); } /** * skb_postpush_data_move - Move packet data and metadata after skb_push(). * @skb: packet to operate on * @len: number of bytes pushed onto &sk_buff->data * @n: number of bytes to memmove() from pre-push &sk_buff->data * * See skb_data_move() for details. 
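 *
 * Example (illustrative only; struct hypothetical_hdr stands in for
 * whatever header the caller prepends): after opening room in front of the
 * packet, relocate the metadata so it stays just before skb->data:
 *
 *	__skb_push(skb, sizeof(struct hypothetical_hdr));
 *	skb_postpush_data_move(skb, sizeof(struct hypothetical_hdr), 0);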
*/ static inline void skb_postpush_data_move(struct sk_buff *skb, const unsigned int len, const unsigned int n) { DEBUG_NET_WARN_ON_ONCE(len > INT_MAX); skb_data_move(skb, -len, n); } struct sk_buff *skb_clone_sk(struct sk_buff *skb); #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING void skb_clone_tx_timestamp(struct sk_buff *skb); bool skb_defer_rx_timestamp(struct sk_buff *skb); #else /* CONFIG_NETWORK_PHY_TIMESTAMPING */ static inline void skb_clone_tx_timestamp(struct sk_buff *skb) { } static inline bool skb_defer_rx_timestamp(struct sk_buff *skb) { return false; } #endif /* !CONFIG_NETWORK_PHY_TIMESTAMPING */ /** * skb_complete_tx_timestamp() - deliver cloned skb with tx timestamps * * PHY drivers may accept clones of transmitted packets for * timestamping via their phy_driver.txtstamp method. These drivers * must call this function to return the skb back to the stack with a * timestamp. * * @skb: clone of the original outgoing packet * @hwtstamps: hardware time stamps * */ void skb_complete_tx_timestamp(struct sk_buff *skb, struct skb_shared_hwtstamps *hwtstamps); void __skb_tstamp_tx(struct sk_buff *orig_skb, const struct sk_buff *ack_skb, struct skb_shared_hwtstamps *hwtstamps, struct sock *sk, int tstype); /** * skb_tstamp_tx - queue clone of skb with send time stamps * @orig_skb: the original outgoing packet * @hwtstamps: hardware time stamps, may be NULL if not available * * If the skb has a socket associated, then this function clones the * skb (thus sharing the actual data and optional structures), stores * the optional hardware time stamping information (if non NULL) or * generates a software time stamp (otherwise), then queues the clone * to the error queue of the socket. Errors are silently ignored. */ void skb_tstamp_tx(struct sk_buff *orig_skb, struct skb_shared_hwtstamps *hwtstamps); /** * skb_tx_timestamp() - Driver hook for transmit timestamping * * Ethernet MAC Drivers should call this function in their hard_xmit() * function immediately before giving the sk_buff to the MAC hardware. * * Specifically, one should make absolutely sure that this function is * called before TX completion of this packet can trigger. Otherwise * the packet could potentially already be freed. * * @skb: A socket buffer. */ static inline void skb_tx_timestamp(struct sk_buff *skb) { skb_clone_tx_timestamp(skb); if (skb_shinfo(skb)->tx_flags & (SKBTX_SW_TSTAMP | SKBTX_BPF)) skb_tstamp_tx(skb, NULL); } /** * skb_complete_wifi_ack - deliver skb with wifi status * * @skb: the original outgoing packet * @acked: ack status * */ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked); __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len); __sum16 __skb_checksum_complete(struct sk_buff *skb); static inline int skb_csum_unnecessary(const struct sk_buff *skb) { return ((skb->ip_summed == CHECKSUM_UNNECESSARY) || skb->csum_valid || (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_start_offset(skb) >= 0)); } /** * skb_checksum_complete - Calculate checksum of an entire packet * @skb: packet to process * * This function calculates the checksum over the entire packet plus * the value of skb->csum. The latter can be used to supply the * checksum of a pseudo header as used by TCP/UDP. It returns the * checksum. * * For protocols that contain complete checksums such as ICMP/TCP/UDP, * this function can be used to verify that checksum on received * packets. In that case the function should return zero if the * checksum is correct. 
In particular, this function will return zero * if skb->ip_summed is CHECKSUM_UNNECESSARY which indicates that the * hardware has already verified the correctness of the checksum. */ static inline __sum16 skb_checksum_complete(struct sk_buff *skb) { return skb_csum_unnecessary(skb) ? 0 : __skb_checksum_complete(skb); } static inline void __skb_decr_checksum_unnecessary(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_UNNECESSARY) { if (skb->csum_level == 0) skb->ip_summed = CHECKSUM_NONE; else skb->csum_level--; } } static __always_inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_UNNECESSARY) { if (skb->csum_level < SKB_MAX_CSUM_LEVEL) skb->csum_level++; } else if (skb->ip_summed == CHECKSUM_NONE) { skb->ip_summed = CHECKSUM_UNNECESSARY; skb->csum_level = 0; } } static inline void __skb_reset_checksum_unnecessary(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_UNNECESSARY) { skb->ip_summed = CHECKSUM_NONE; skb->csum_level = 0; } } /* Check if we need to perform checksum complete validation. * * Returns: true if checksum complete is needed, false otherwise * (either checksum is unnecessary or zero checksum is allowed). */ static inline bool __skb_checksum_validate_needed(struct sk_buff *skb, bool zero_okay, __sum16 check) { if (skb_csum_unnecessary(skb) || (zero_okay && !check)) { skb->csum_valid = 1; __skb_decr_checksum_unnecessary(skb); return false; } return true; } /* For small packets <= CHECKSUM_BREAK perform checksum complete directly * in checksum_init. */ #define CHECKSUM_BREAK 76 /* Unset checksum-complete * * Unset checksum complete can be done when packet is being modified * (uncompressed for instance) and checksum-complete value is * invalidated. */ static inline void skb_checksum_complete_unset(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; } /* Validate (init) checksum based on checksum complete. * * Return values: * 0: checksum is validated or try to in skb_checksum_complete. In the latter * case the ip_summed will not be CHECKSUM_UNNECESSARY and the pseudo * checksum is stored in skb->csum for use in __skb_checksum_complete * non-zero: value of invalid checksum * */ static inline __sum16 __skb_checksum_validate_complete(struct sk_buff *skb, bool complete, __wsum psum) { if (skb->ip_summed == CHECKSUM_COMPLETE) { if (!csum_fold(csum_add(psum, skb->csum))) { skb->csum_valid = 1; return 0; } } skb->csum = psum; if (complete || skb->len <= CHECKSUM_BREAK) { __sum16 csum; csum = __skb_checksum_complete(skb); skb->csum_valid = !csum; return csum; } return 0; } static inline __wsum null_compute_pseudo(struct sk_buff *skb, int proto) { return 0; } /* Perform checksum validate (init). Note that this is a macro since we only * want to calculate the pseudo header which is an input function if necessary. * First we try to validate without any computation (checksum unnecessary) and * then calculate based on checksum complete calling the function to compute * pseudo header. 
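 *
 * Example (illustrative only; my_compute_pseudo() and the csum_error label
 * are hypothetical, with my_compute_pseudo() returning the pseudo-header
 * checksum for the packet):
 *
 *	if (skb_checksum_init(skb, IPPROTO_UDP, my_compute_pseudo))
 *		goto csum_error;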
* * Return values: * 0: checksum is validated or try to in skb_checksum_complete * non-zero: value of invalid checksum */ #define __skb_checksum_validate(skb, proto, complete, \ zero_okay, check, compute_pseudo) \ ({ \ __sum16 __ret = 0; \ skb->csum_valid = 0; \ if (__skb_checksum_validate_needed(skb, zero_okay, check)) \ __ret = __skb_checksum_validate_complete(skb, \ complete, compute_pseudo(skb, proto)); \ __ret; \ }) #define skb_checksum_init(skb, proto, compute_pseudo) \ __skb_checksum_validate(skb, proto, false, false, 0, compute_pseudo) #define skb_checksum_init_zero_check(skb, proto, check, compute_pseudo) \ __skb_checksum_validate(skb, proto, false, true, check, compute_pseudo) #define skb_checksum_validate(skb, proto, compute_pseudo) \ __skb_checksum_validate(skb, proto, true, false, 0, compute_pseudo) #define skb_checksum_validate_zero_check(skb, proto, check, \ compute_pseudo) \ __skb_checksum_validate(skb, proto, true, true, check, compute_pseudo) #define skb_checksum_simple_validate(skb) \ __skb_checksum_validate(skb, 0, true, false, 0, null_compute_pseudo) static inline bool __skb_checksum_convert_check(struct sk_buff *skb) { return (skb->ip_summed == CHECKSUM_NONE && skb->csum_valid); } static inline void __skb_checksum_convert(struct sk_buff *skb, __wsum pseudo) { skb->csum = ~pseudo; skb->ip_summed = CHECKSUM_COMPLETE; } #define skb_checksum_try_convert(skb, proto, compute_pseudo) \ do { \ if (__skb_checksum_convert_check(skb)) \ __skb_checksum_convert(skb, compute_pseudo(skb, proto)); \ } while (0) static inline void skb_remcsum_adjust_partial(struct sk_buff *skb, void *ptr, u16 start, u16 offset) { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = ((unsigned char *)ptr + start) - skb->head; skb->csum_offset = offset - start; } /* Update skbuf and packet to reflect the remote checksum offload operation. * When called, ptr indicates the starting point for skb->csum when * ip_summed is CHECKSUM_COMPLETE. If we need create checksum complete * here, skb_postpull_rcsum is done so skb->csum start is ptr. 
*/ static inline void skb_remcsum_process(struct sk_buff *skb, void *ptr, int start, int offset, bool nopartial) { __wsum delta; if (!nopartial) { skb_remcsum_adjust_partial(skb, ptr, start, offset); return; } if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) { __skb_checksum_complete(skb); skb_postpull_rcsum(skb, skb->data, ptr - (void *)skb->data); } delta = remcsum_adjust(ptr, skb->csum, start, offset); /* Adjust skb->csum since we changed the packet */ skb->csum = csum_add(skb->csum, delta); } static inline struct nf_conntrack *skb_nfct(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) return (void *)(skb->_nfct & NFCT_PTRMASK); #else return NULL; #endif } static inline unsigned long skb_get_nfct(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) return skb->_nfct; #else return 0UL; #endif } static inline void skb_set_nfct(struct sk_buff *skb, unsigned long nfct) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) skb->slow_gro |= !!nfct; skb->_nfct = nfct; #endif } #ifdef CONFIG_SKB_EXTENSIONS enum skb_ext_id { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) SKB_EXT_BRIDGE_NF, #endif #ifdef CONFIG_XFRM SKB_EXT_SEC_PATH, #endif #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) TC_SKB_EXT, #endif #if IS_ENABLED(CONFIG_MPTCP) SKB_EXT_MPTCP, #endif #if IS_ENABLED(CONFIG_MCTP_FLOWS) SKB_EXT_MCTP, #endif #if IS_ENABLED(CONFIG_INET_PSP) SKB_EXT_PSP, #endif #if IS_ENABLED(CONFIG_CAN) SKB_EXT_CAN, #endif SKB_EXT_NUM, /* must be last */ }; /** * struct skb_ext - sk_buff extensions * @refcnt: 1 on allocation, deallocated on 0 * @offset: offset to add to @data to obtain extension address * @chunks: size currently allocated, stored in SKB_EXT_ALIGN_SHIFT units * @data: start of extension data, variable sized * * Note: offsets/lengths are stored in chunks of 8 bytes, this allows * to use 'u8' types while allowing up to 2kb worth of extension data. 
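 *
 * Example (illustrative only): extension users allocate and look up their
 * slot through the id-based helpers declared below, roughly:
 *
 *	struct mptcp_ext *mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
 *
 *	if (!mpext)
 *		return -ENOMEM;
 *
 * and later retrieve it with skb_ext_find(skb, SKB_EXT_MPTCP), which
 * returns NULL when the extension is not present.
 */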
*/ struct skb_ext { refcount_t refcnt; u8 offset[SKB_EXT_NUM]; /* in chunks of 8 bytes */ u8 chunks; /* same */ char data[] __aligned(8); }; struct skb_ext *__skb_ext_alloc(gfp_t flags); void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, struct skb_ext *ext); void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id); void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id); void __skb_ext_put(struct skb_ext *ext); static inline void skb_ext_put(struct sk_buff *skb) { if (skb->active_extensions) __skb_ext_put(skb->extensions); } static inline void __skb_ext_copy(struct sk_buff *dst, const struct sk_buff *src) { dst->active_extensions = src->active_extensions; if (src->active_extensions) { struct skb_ext *ext = src->extensions; refcount_inc(&ext->refcnt); dst->extensions = ext; } } static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *src) { skb_ext_put(dst); __skb_ext_copy(dst, src); } static inline bool __skb_ext_exist(const struct skb_ext *ext, enum skb_ext_id i) { return !!ext->offset[i]; } static inline bool skb_ext_exist(const struct sk_buff *skb, enum skb_ext_id id) { return skb->active_extensions & (1 << id); } static inline void skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) { if (skb_ext_exist(skb, id)) __skb_ext_del(skb, id); } static inline void *skb_ext_find(const struct sk_buff *skb, enum skb_ext_id id) { if (skb_ext_exist(skb, id)) { struct skb_ext *ext = skb->extensions; return (void *)ext + (ext->offset[id] << 3); } return NULL; } static inline void skb_ext_reset(struct sk_buff *skb) { if (unlikely(skb->active_extensions)) { __skb_ext_put(skb->extensions); skb->active_extensions = 0; } } static inline bool skb_has_extensions(struct sk_buff *skb) { return unlikely(skb->active_extensions); } #else static inline void __skb_ext_put(struct skb_ext *ext) {} static inline void skb_ext_put(struct sk_buff *skb) {} static inline void skb_ext_reset(struct sk_buff *skb) {} static inline void skb_ext_del(struct sk_buff *skb, int unused) {} static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {} static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {} static inline bool skb_has_extensions(struct sk_buff *skb) { return false; } #endif /* CONFIG_SKB_EXTENSIONS */ static inline void nf_reset_ct(struct sk_buff *skb) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(skb_nfct(skb)); skb->_nfct = 0; #endif } static inline void nf_reset_trace(struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES) skb->nf_trace = 0; #endif } static inline void ipvs_reset(struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IP_VS) skb->ipvs_property = 0; #endif } /* Note: This doesn't put any conntrack info in dst. 
*/ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src, bool copy) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) dst->_nfct = src->_nfct; nf_conntrack_get(skb_nfct(src)); #endif #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES) if (copy) dst->nf_trace = src->nf_trace; #endif } static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(skb_nfct(dst)); #endif dst->slow_gro = src->slow_gro; __nf_copy(dst, src, true); } #ifdef CONFIG_NETWORK_SECMARK static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from) { to->secmark = from->secmark; } static inline void skb_init_secmark(struct sk_buff *skb) { skb->secmark = 0; } #else static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from) { } static inline void skb_init_secmark(struct sk_buff *skb) { } #endif static inline int secpath_exists(const struct sk_buff *skb) { #ifdef CONFIG_XFRM return skb_ext_exist(skb, SKB_EXT_SEC_PATH); #else return 0; #endif } static inline bool skb_irq_freeable(const struct sk_buff *skb) { return !skb->destructor && !secpath_exists(skb) && !skb_nfct(skb) && !skb->_skb_refdst && !skb_has_frag_list(skb); } static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping) { skb->queue_mapping = queue_mapping; } static inline u16 skb_get_queue_mapping(const struct sk_buff *skb) { return skb->queue_mapping; } static inline void skb_copy_queue_mapping(struct sk_buff *to, const struct sk_buff *from) { to->queue_mapping = from->queue_mapping; } static inline void skb_record_rx_queue(struct sk_buff *skb, u16 rx_queue) { skb->queue_mapping = rx_queue + 1; } static inline u16 skb_get_rx_queue(const struct sk_buff *skb) { return skb->queue_mapping - 1; } static inline bool skb_rx_queue_recorded(const struct sk_buff *skb) { return skb->queue_mapping != 0; } static inline void skb_set_dst_pending_confirm(struct sk_buff *skb, u32 val) { skb->dst_pending_confirm = val; } static inline bool skb_get_dst_pending_confirm(const struct sk_buff *skb) { return skb->dst_pending_confirm != 0; } static inline struct sec_path *skb_sec_path(const struct sk_buff *skb) { #ifdef CONFIG_XFRM return skb_ext_find(skb, SKB_EXT_SEC_PATH); #else return NULL; #endif } static inline bool skb_is_gso(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_size; } /* Note: Should be called only if skb_is_gso(skb) is true */ static inline bool skb_is_gso_v6(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6; } /* Note: Should be called only if skb_is_gso(skb) is true */ static inline bool skb_is_gso_sctp(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_type & SKB_GSO_SCTP; } /* Note: Should be called only if skb_is_gso(skb) is true */ static inline bool skb_is_gso_tcp(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6); } static inline void skb_gso_reset(struct sk_buff *skb) { skb_shinfo(skb)->gso_size = 0; skb_shinfo(skb)->gso_segs = 0; skb_shinfo(skb)->gso_type = 0; } static inline void skb_increase_gso_size(struct skb_shared_info *shinfo, u16 increment) { if (WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS)) return; shinfo->gso_size += increment; } static inline void skb_decrease_gso_size(struct skb_shared_info *shinfo, u16 decrement) { if (WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS)) return; shinfo->gso_size -= decrement; } void 
__skb_warn_lro_forwarding(const struct sk_buff *skb); static __always_inline bool skb_warn_if_lro(const struct sk_buff *skb) { /* LRO sets gso_size but not gso_type, whereas if GSO is really * wanted then gso_type will be set. */ const struct skb_shared_info *shinfo = skb_shinfo(skb); if (skb_is_nonlinear(skb) && shinfo->gso_size != 0 && unlikely(shinfo->gso_type == 0)) { __skb_warn_lro_forwarding(skb); return true; } return false; } static inline void skb_forward_csum(struct sk_buff *skb) { /* Unfortunately we don't support this one. Any brave souls? */ if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; } /** * skb_checksum_none_assert - make sure skb ip_summed is CHECKSUM_NONE * @skb: skb to check * * fresh skbs have their ip_summed set to CHECKSUM_NONE. * Instead of forcing ip_summed to CHECKSUM_NONE, we can * use this helper, to document places where we make this assertion. */ static inline void skb_checksum_none_assert(const struct sk_buff *skb) { DEBUG_NET_WARN_ON_ONCE(skb->ip_summed != CHECKSUM_NONE); } bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off); int skb_checksum_setup(struct sk_buff *skb, bool recalculate); struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, unsigned int transport_len, __sum16(*skb_chkf)(struct sk_buff *skb)); /** * skb_head_is_locked - Determine if the skb->head is locked down * @skb: skb to check * * The head on skbs build around a head frag can be removed if they are * not cloned. This function returns true if the skb head is locked down * due to either being allocated via kmalloc, or by being a clone with * multiple references to the head. */ static inline bool skb_head_is_locked(const struct sk_buff *skb) { return !skb->head_frag || skb_cloned(skb); } /* Local Checksum Offload. * Compute outer checksum based on the assumption that the * inner checksum will be offloaded later. * See Documentation/networking/checksum-offloads.rst for * explanation of how this works. * Fill in outer checksum adjustment (e.g. with sum of outer * pseudo-header) before calling. * Also ensure that inner checksum is in linear data area. */ static inline __wsum lco_csum(struct sk_buff *skb) { unsigned char *csum_start = skb_checksum_start(skb); unsigned char *l4_hdr = skb_transport_header(skb); __wsum partial; /* Start with complement of inner checksum adjustment */ partial = ~csum_unfold(*(__force __sum16 *)(csum_start + skb->csum_offset)); /* Add in checksum of our headers (incl. outer checksum * adjustment filled in by caller) and return result. 
*/ return csum_partial(l4_hdr, csum_start - l4_hdr, partial); } static inline bool skb_is_redirected(const struct sk_buff *skb) { return skb->redirected; } static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress) { skb->redirected = 1; #ifdef CONFIG_NET_REDIRECT skb->from_ingress = from_ingress; if (skb->from_ingress) skb_clear_tstamp(skb); #endif } static inline void skb_reset_redirect(struct sk_buff *skb) { skb->redirected = 0; } static inline void skb_set_redirected_noclear(struct sk_buff *skb, bool from_ingress) { skb->redirected = 1; #ifdef CONFIG_NET_REDIRECT skb->from_ingress = from_ingress; #endif } static inline bool skb_csum_is_sctp(struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IP_SCTP) return skb->csum_not_inet; #else return 0; #endif } static inline void skb_reset_csum_not_inet(struct sk_buff *skb) { skb->ip_summed = CHECKSUM_NONE; #if IS_ENABLED(CONFIG_IP_SCTP) skb->csum_not_inet = 0; #endif } static inline void skb_set_kcov_handle(struct sk_buff *skb, const u64 kcov_handle) { #ifdef CONFIG_KCOV skb->kcov_handle = kcov_handle; #endif } static inline u64 skb_get_kcov_handle(struct sk_buff *skb) { #ifdef CONFIG_KCOV return skb->kcov_handle; #else return 0; #endif } static inline void skb_mark_for_recycle(struct sk_buff *skb) { #ifdef CONFIG_PAGE_POOL skb->pp_recycle = 1; #endif } ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, ssize_t maxsize); #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ |
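/*
 * Editorial note: the snippet below is an illustrative sketch and is not part
 * of <linux/skbuff.h>.  It only shows how a caller might use the skb
 * extension helpers declared above (skb_ext_find() returns NULL when the
 * extension is absent).  The function name is hypothetical, and the example
 * assumes a kernel with CONFIG_MPTCP enabled so that SKB_EXT_MPTCP exists.
 */
#include <linux/skbuff.h>

#if IS_ENABLED(CONFIG_MPTCP)
static inline bool example_skb_has_mptcp_ext(const struct sk_buff *skb)
{
	/* skb_ext_find() looks up the per-skb extension area by id. */
	return skb_ext_find(skb, SKB_EXT_MPTCP) != NULL;
}
#endif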
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * This file provides wrappers with sanitizer instrumentation for bit
 * locking operations.
 *
 * To use this functionality, an arch's bitops.h file needs to define each of
 * the below bit operations with an arch_ prefix (e.g. arch_set_bit(),
 * arch___set_bit(), etc.).
 */
#ifndef _ASM_GENERIC_BITOPS_INSTRUMENTED_LOCK_H
#define _ASM_GENERIC_BITOPS_INSTRUMENTED_LOCK_H

#include <linux/instrumented.h>

/**
 * clear_bit_unlock - Clear a bit in memory, for unlock
 * @nr: the bit to clear
 * @addr: the address to start counting from
 *
 * This operation is atomic and provides release barrier semantics.
 */
static inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
{
	kcsan_release();
	instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long));
	arch_clear_bit_unlock(nr, addr);
}

/**
 * __clear_bit_unlock - Clears a bit in memory
 * @nr: Bit to clear
 * @addr: Address to start counting from
 *
 * This is a non-atomic operation but implies a release barrier before the
 * memory operation. It can be used for an unlock if no other CPUs can
 * concurrently modify other bits in the word.
 */
static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
{
	kcsan_release();
	instrument_write(addr + BIT_WORD(nr), sizeof(long));
	arch___clear_bit_unlock(nr, addr);
}

/**
 * test_and_set_bit_lock - Set a bit and return its old value, for lock
 * @nr: Bit to set
 * @addr: Address to count from
 *
 * This operation is atomic and provides acquire barrier semantics if
 * the returned value is 0.
 * It can be used to implement bit locks.
 */
static inline bool test_and_set_bit_lock(long nr, volatile unsigned long *addr)
{
	instrument_atomic_read_write(addr + BIT_WORD(nr), sizeof(long));
	return arch_test_and_set_bit_lock(nr, addr);
}

/**
 * xor_unlock_is_negative_byte - XOR a single byte in memory and test if
 * it is negative, for unlock.
 * @mask: Change the bits which are set in this mask.
 * @addr: The address of the word containing the byte to change.
 *
 * Changes some of bits 0-6 in the word pointed to by @addr.
 * This operation is atomic and provides release barrier semantics.
 * Used to optimise some folio operations which are commonly paired
 * with an unlock or end of writeback. Bit 7 is used as PG_waiters to
 * indicate whether anybody is waiting for the unlock.
 *
 * Return: Whether the top bit of the byte is set.
 */
static inline bool xor_unlock_is_negative_byte(unsigned long mask,
		volatile unsigned long *addr)
{
	kcsan_release();
	instrument_atomic_write(addr, sizeof(long));
	return arch_xor_unlock_is_negative_byte(mask, addr);
}

#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_LOCK_H */
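/*
 * Editorial note: illustrative sketch only, not part of the header above.
 * It demonstrates the intended pairing of test_and_set_bit_lock() (acquire
 * on success) with clear_bit_unlock() (release), i.e. a minimal bit lock.
 * The flag word, bit number and function names are hypothetical.
 */
#include <linux/bitops.h>

static unsigned long example_lock_word;
#define EXAMPLE_LOCK_BIT	0

static inline bool example_bit_trylock(void)
{
	/* Returns true if the bit was previously clear, i.e. we own the lock. */
	return !test_and_set_bit_lock(EXAMPLE_LOCK_BIT, &example_lock_word);
}

static inline void example_bit_unlock(void)
{
	/* Release semantics: pairs with the acquire in example_bit_trylock(). */
	clear_bit_unlock(EXAMPLE_LOCK_BIT, &example_lock_word);
}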
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NET_DST_OPS_H
#define _NET_DST_OPS_H

#include <linux/types.h>
#include <linux/percpu_counter.h>
#include <linux/cache.h>

struct dst_entry;
struct kmem_cachep;
struct net_device;
struct sk_buff;
struct sock;
struct net;

struct dst_ops {
	unsigned short		family;
	unsigned int		gc_thresh;

	void			(*gc)(struct dst_ops *ops);
	struct dst_entry *	(*check)(struct dst_entry *, __u32 cookie);
	unsigned int		(*default_advmss)(const struct dst_entry *);
	unsigned int		(*mtu)(const struct dst_entry *);
	u32 *			(*cow_metrics)(struct dst_entry *, unsigned long);
	void			(*destroy)(struct dst_entry *);
	void			(*ifdown)(struct dst_entry *, struct net_device *dev);
	void			(*negative_advice)(struct sock *sk, struct dst_entry *);
	void			(*link_failure)(struct sk_buff *);
	void			(*update_pmtu)(struct dst_entry *dst, struct sock *sk,
					       struct sk_buff *skb, u32 mtu,
					       bool confirm_neigh);
	void			(*redirect)(struct dst_entry *dst, struct sock *sk,
					    struct sk_buff *skb);
	int			(*local_out)(struct net *net, struct sock *sk, struct sk_buff *skb);
	struct neighbour *	(*neigh_lookup)(const struct dst_entry *dst,
						struct sk_buff *skb,
						const void *daddr);
	void			(*confirm_neigh)(const struct dst_entry *dst,
						 const void *daddr);

	struct kmem_cache	*kmem_cachep;

	struct percpu_counter	pcpuc_entries ____cacheline_aligned_in_smp;
};

static inline int dst_entries_get_fast(struct dst_ops *dst)
{
	return percpu_counter_read_positive(&dst->pcpuc_entries);
}

static inline int dst_entries_get_slow(struct dst_ops *dst)
{
	return percpu_counter_sum_positive(&dst->pcpuc_entries);
}

#define DST_PERCPU_COUNTER_BATCH 32
static inline void dst_entries_add(struct dst_ops *dst, int val)
{
	percpu_counter_add_batch(&dst->pcpuc_entries, val,
				 DST_PERCPU_COUNTER_BATCH);
}

static inline int dst_entries_init(struct dst_ops *dst)
{
	return percpu_counter_init(&dst->pcpuc_entries, 0, GFP_KERNEL);
}

static inline void dst_entries_destroy(struct dst_ops *dst)
{
	percpu_counter_destroy(&dst->pcpuc_entries);
}

#endif
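/*
 * Editorial note: illustrative sketch only, not part of dst_ops.h.  It walks
 * the lifecycle of the per-cpu entry counter helpers declared above.  The
 * ops instance, its family value and the function name are hypothetical.
 */
#include <linux/socket.h>	/* AF_INET */

static struct dst_ops example_dst_ops = {
	.family = AF_INET,
};

static int example_dst_accounting(void)
{
	int busy;
	int err = dst_entries_init(&example_dst_ops);	/* counter starts at 0 */

	if (err)
		return err;

	dst_entries_add(&example_dst_ops, 1);		/* one dst allocated */
	busy = dst_entries_get_fast(&example_dst_ops);	/* cheap, approximate read */
	(void)busy;
	dst_entries_add(&example_dst_ops, -1);		/* dst released */
	dst_entries_destroy(&example_dst_ops);
	return 0;
}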
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2016 Facebook * Copyright (C) 2013-2014 Jens Axboe */ #include <linux/sched.h> #include <linux/random.h> #include <linux/sbitmap.h> #include <linux/seq_file.h> static int init_alloc_hint(struct sbitmap *sb, gfp_t flags) { unsigned depth = sb->depth; sb->alloc_hint = alloc_percpu_gfp(unsigned int, flags); if (!sb->alloc_hint) return
-ENOMEM; if (depth && !sb->round_robin) { int i; for_each_possible_cpu(i) *per_cpu_ptr(sb->alloc_hint, i) = get_random_u32_below(depth); } return 0; } static inline unsigned update_alloc_hint_before_get(struct sbitmap *sb, unsigned int depth) { unsigned hint; hint = this_cpu_read(*sb->alloc_hint); if (unlikely(hint >= depth)) { hint = depth ? get_random_u32_below(depth) : 0; this_cpu_write(*sb->alloc_hint, hint); } return hint; } static inline void update_alloc_hint_after_get(struct sbitmap *sb, unsigned int depth, unsigned int hint, unsigned int nr) { if (nr == -1) { /* If the map is full, a hint won't do us much good. */ this_cpu_write(*sb->alloc_hint, 0); } else if (nr == hint || unlikely(sb->round_robin)) { /* Only update the hint if we used it. */ hint = nr + 1; if (hint >= depth - 1) hint = 0; this_cpu_write(*sb->alloc_hint, hint); } } /* * See if we have deferred clears that we can batch move */ static inline bool sbitmap_deferred_clear(struct sbitmap_word *map, unsigned int depth, unsigned int alloc_hint, bool wrap) { unsigned long mask, word_mask; guard(raw_spinlock_irqsave)(&map->swap_lock); if (!map->cleared) { if (depth == 0) return false; word_mask = (~0UL) >> (BITS_PER_LONG - depth); /* * The current behavior is to always retry after moving * ->cleared to word, and we change it to retry in case * of any free bits. To avoid an infinite loop, we need * to take wrap & alloc_hint into account, otherwise a * soft lockup may occur. */ if (!wrap && alloc_hint) word_mask &= ~((1UL << alloc_hint) - 1); return (READ_ONCE(map->word) & word_mask) != word_mask; } /* * First get a stable cleared mask, setting the old mask to 0. */ mask = xchg(&map->cleared, 0); /* * Now clear the masked bits in our free word */ atomic_long_andnot(mask, (atomic_long_t *)&map->word); BUILD_BUG_ON(sizeof(atomic_long_t) != sizeof(map->word)); return true; } int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, gfp_t flags, int node, bool round_robin, bool alloc_hint) { unsigned int bits_per_word; int i; if (shift < 0) shift = sbitmap_calculate_shift(depth); bits_per_word = 1U << shift; if (bits_per_word > BITS_PER_LONG) return -EINVAL; sb->shift = shift; sb->depth = depth; sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); sb->round_robin = round_robin; if (depth == 0) { sb->map = NULL; return 0; } if (alloc_hint) { if (init_alloc_hint(sb, flags)) return -ENOMEM; } else { sb->alloc_hint = NULL; } sb->map = kvzalloc_node(sb->map_nr * sizeof(*sb->map), flags, node); if (!sb->map) { free_percpu(sb->alloc_hint); return -ENOMEM; } for (i = 0; i < sb->map_nr; i++) raw_spin_lock_init(&sb->map[i].swap_lock); return 0; } EXPORT_SYMBOL_GPL(sbitmap_init_node); void sbitmap_resize(struct sbitmap *sb, unsigned int depth) { unsigned int bits_per_word = 1U << sb->shift; unsigned int i; for (i = 0; i < sb->map_nr; i++) sbitmap_deferred_clear(&sb->map[i], 0, 0, 0); sb->depth = depth; sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); } EXPORT_SYMBOL_GPL(sbitmap_resize); static int __sbitmap_get_word(unsigned long *word, unsigned long depth, unsigned int hint, bool wrap) { int nr; /* don't wrap if starting from 0 */ wrap = wrap && hint; while (1) { nr = find_next_zero_bit(word, depth, hint); if (unlikely(nr >= depth)) { /* * We started with an offset, and we didn't reset the * offset to 0 in a failure case, so start from 0 to * exhaust the map. 
*/ if (hint && wrap) { hint = 0; continue; } return -1; } if (!test_and_set_bit_lock(nr, word)) break; hint = nr + 1; if (hint >= depth - 1) hint = 0; } return nr; } static int sbitmap_find_bit_in_word(struct sbitmap_word *map, unsigned int depth, unsigned int alloc_hint, bool wrap) { int nr; do { nr = __sbitmap_get_word(&map->word, depth, alloc_hint, wrap); if (nr != -1) break; if (!sbitmap_deferred_clear(map, depth, alloc_hint, wrap)) break; } while (1); return nr; } static unsigned int __map_depth_with_shallow(const struct sbitmap *sb, int index, unsigned int shallow_depth) { u64 shallow_word_depth; unsigned int word_depth, reminder; word_depth = __map_depth(sb, index); if (shallow_depth >= sb->depth) return word_depth; shallow_word_depth = word_depth * shallow_depth; reminder = do_div(shallow_word_depth, sb->depth); if (reminder >= (index + 1) * word_depth) shallow_word_depth++; return (unsigned int)shallow_word_depth; } static int sbitmap_find_bit(struct sbitmap *sb, unsigned int shallow_depth, unsigned int index, unsigned int alloc_hint, bool wrap) { unsigned int i; int nr = -1; for (i = 0; i < sb->map_nr; i++) { unsigned int depth = __map_depth_with_shallow(sb, index, shallow_depth); if (depth) nr = sbitmap_find_bit_in_word(&sb->map[index], depth, alloc_hint, wrap); if (nr != -1) { nr += index << sb->shift; break; } /* Jump to next index. */ alloc_hint = 0; if (++index >= sb->map_nr) index = 0; } return nr; } static int __sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint) { unsigned int index; index = SB_NR_TO_INDEX(sb, alloc_hint); /* * Unless we're doing round robin tag allocation, just use the * alloc_hint to find the right word index. No point in looping * twice in find_next_zero_bit() for that case. */ if (sb->round_robin) alloc_hint = SB_NR_TO_BIT(sb, alloc_hint); else alloc_hint = 0; return sbitmap_find_bit(sb, UINT_MAX, index, alloc_hint, !sb->round_robin); } int sbitmap_get(struct sbitmap *sb) { int nr; unsigned int hint, depth; if (WARN_ON_ONCE(unlikely(!sb->alloc_hint))) return -1; depth = READ_ONCE(sb->depth); hint = update_alloc_hint_before_get(sb, depth); nr = __sbitmap_get(sb, hint); update_alloc_hint_after_get(sb, depth, hint, nr); return nr; } EXPORT_SYMBOL_GPL(sbitmap_get); static int __sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint, unsigned long shallow_depth) { unsigned int index; index = SB_NR_TO_INDEX(sb, alloc_hint); alloc_hint = SB_NR_TO_BIT(sb, alloc_hint); return sbitmap_find_bit(sb, shallow_depth, index, alloc_hint, true); } /** * sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap, * limiting the depth used from each word. * @sb: Bitmap to allocate from. * @shallow_depth: The maximum number of bits to allocate from the bitmap. * * This rather specific operation allows for having multiple users with * different allocation limits. E.g., there can be a high-priority class that * uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow() * with a @shallow_depth of (sb->depth >> 1). Then, the low-priority * class can only allocate half of the total bits in the bitmap, preventing it * from starving out the high-priority class. * * Return: Non-negative allocated bit number if successful, -1 otherwise. 
*/ static int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth) { int nr; unsigned int hint, depth; if (WARN_ON_ONCE(unlikely(!sb->alloc_hint))) return -1; depth = READ_ONCE(sb->depth); hint = update_alloc_hint_before_get(sb, depth); nr = __sbitmap_get_shallow(sb, hint, shallow_depth); update_alloc_hint_after_get(sb, depth, hint, nr); return nr; } bool sbitmap_any_bit_set(const struct sbitmap *sb) { unsigned int i; for (i = 0; i < sb->map_nr; i++) { if (sb->map[i].word & ~sb->map[i].cleared) return true; } return false; } EXPORT_SYMBOL_GPL(sbitmap_any_bit_set); static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set) { unsigned int i, weight = 0; for (i = 0; i < sb->map_nr; i++) { const struct sbitmap_word *word = &sb->map[i]; unsigned int word_depth = __map_depth(sb, i); if (set) weight += bitmap_weight(&word->word, word_depth); else weight += bitmap_weight(&word->cleared, word_depth); } return weight; } static unsigned int sbitmap_cleared(const struct sbitmap *sb) { return __sbitmap_weight(sb, false); } unsigned int sbitmap_weight(const struct sbitmap *sb) { return __sbitmap_weight(sb, true) - sbitmap_cleared(sb); } EXPORT_SYMBOL_GPL(sbitmap_weight); void sbitmap_show(struct sbitmap *sb, struct seq_file *m) { seq_printf(m, "depth=%u\n", sb->depth); seq_printf(m, "busy=%u\n", sbitmap_weight(sb)); seq_printf(m, "cleared=%u\n", sbitmap_cleared(sb)); seq_printf(m, "bits_per_word=%u\n", 1U << sb->shift); seq_printf(m, "map_nr=%u\n", sb->map_nr); } EXPORT_SYMBOL_GPL(sbitmap_show); static inline void emit_byte(struct seq_file *m, unsigned int offset, u8 byte) { if ((offset & 0xf) == 0) { if (offset != 0) seq_putc(m, '\n'); seq_printf(m, "%08x:", offset); } if ((offset & 0x1) == 0) seq_putc(m, ' '); seq_printf(m, "%02x", byte); } void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m) { u8 byte = 0; unsigned int byte_bits = 0; unsigned int offset = 0; int i; for (i = 0; i < sb->map_nr; i++) { unsigned long word = READ_ONCE(sb->map[i].word); unsigned long cleared = READ_ONCE(sb->map[i].cleared); unsigned int word_bits = __map_depth(sb, i); word &= ~cleared; while (word_bits > 0) { unsigned int bits = min(8 - byte_bits, word_bits); byte |= (word & (BIT(bits) - 1)) << byte_bits; byte_bits += bits; if (byte_bits == 8) { emit_byte(m, offset, byte); byte = 0; byte_bits = 0; offset++; } word >>= bits; word_bits -= bits; } } if (byte_bits) { emit_byte(m, offset, byte); offset++; } if (offset) seq_putc(m, '\n'); } EXPORT_SYMBOL_GPL(sbitmap_bitmap_show); static unsigned int sbq_calc_wake_batch(struct sbitmap_queue *sbq, unsigned int depth) { return clamp_t(unsigned int, min(depth, sbq->min_shallow_depth) / SBQ_WAIT_QUEUES, 1, SBQ_WAKE_BATCH); } int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, int shift, bool round_robin, gfp_t flags, int node) { int ret; int i; ret = sbitmap_init_node(&sbq->sb, depth, shift, flags, node, round_robin, true); if (ret) return ret; sbq->min_shallow_depth = UINT_MAX; sbq->wake_batch = sbq_calc_wake_batch(sbq, depth); atomic_set(&sbq->wake_index, 0); atomic_set(&sbq->ws_active, 0); atomic_set(&sbq->completion_cnt, 0); atomic_set(&sbq->wakeup_cnt, 0); sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node); if (!sbq->ws) { sbitmap_free(&sbq->sb); return -ENOMEM; } for (i = 0; i < SBQ_WAIT_QUEUES; i++) init_waitqueue_head(&sbq->ws[i].wait); return 0; } EXPORT_SYMBOL_GPL(sbitmap_queue_init_node); static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, unsigned int depth) { unsigned 
int wake_batch; wake_batch = sbq_calc_wake_batch(sbq, depth); if (sbq->wake_batch != wake_batch) WRITE_ONCE(sbq->wake_batch, wake_batch); } void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq, unsigned int users) { unsigned int wake_batch; unsigned int depth = (sbq->sb.depth + users - 1) / users; wake_batch = clamp_val(depth / SBQ_WAIT_QUEUES, 1, SBQ_WAKE_BATCH); WRITE_ONCE(sbq->wake_batch, wake_batch); } EXPORT_SYMBOL_GPL(sbitmap_queue_recalculate_wake_batch); void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth) { sbitmap_queue_update_wake_batch(sbq, depth); sbitmap_resize(&sbq->sb, depth); } EXPORT_SYMBOL_GPL(sbitmap_queue_resize); int __sbitmap_queue_get(struct sbitmap_queue *sbq) { return sbitmap_get(&sbq->sb); } EXPORT_SYMBOL_GPL(__sbitmap_queue_get); unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags, unsigned int *offset) { struct sbitmap *sb = &sbq->sb; unsigned int hint, depth; unsigned long index, nr; int i; if (unlikely(sb->round_robin)) return 0; depth = READ_ONCE(sb->depth); hint = update_alloc_hint_before_get(sb, depth); index = SB_NR_TO_INDEX(sb, hint); for (i = 0; i < sb->map_nr; i++) { struct sbitmap_word *map = &sb->map[index]; unsigned long get_mask; unsigned int map_depth = __map_depth(sb, index); unsigned long val; sbitmap_deferred_clear(map, 0, 0, 0); val = READ_ONCE(map->word); if (val == (1UL << (map_depth - 1)) - 1) goto next; nr = find_first_zero_bit(&val, map_depth); if (nr + nr_tags <= map_depth) { atomic_long_t *ptr = (atomic_long_t *) &map->word; get_mask = ((1UL << nr_tags) - 1) << nr; while (!atomic_long_try_cmpxchg(ptr, &val, get_mask | val)) ; get_mask = (get_mask & ~val) >> nr; if (get_mask) { *offset = nr + (index << sb->shift); update_alloc_hint_after_get(sb, depth, hint, *offset + nr_tags - 1); return get_mask; } } next: /* Jump to next index. */ if (++index >= sb->map_nr) index = 0; } return 0; } int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, unsigned int shallow_depth) { WARN_ON_ONCE(shallow_depth < sbq->min_shallow_depth); return sbitmap_get_shallow(&sbq->sb, shallow_depth); } EXPORT_SYMBOL_GPL(sbitmap_queue_get_shallow); void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq, unsigned int min_shallow_depth) { sbq->min_shallow_depth = min_shallow_depth; sbitmap_queue_update_wake_batch(sbq, sbq->sb.depth); } EXPORT_SYMBOL_GPL(sbitmap_queue_min_shallow_depth); static void __sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr) { int i, wake_index, woken; if (!atomic_read(&sbq->ws_active)) return; wake_index = atomic_read(&sbq->wake_index); for (i = 0; i < SBQ_WAIT_QUEUES; i++) { struct sbq_wait_state *ws = &sbq->ws[wake_index]; /* * Advance the index before checking the current queue. * It improves fairness, by ensuring the queue doesn't * need to be fully emptied before trying to wake up * from the next one. 
*/ wake_index = sbq_index_inc(wake_index); if (waitqueue_active(&ws->wait)) { woken = wake_up_nr(&ws->wait, nr); if (woken == nr) break; nr -= woken; } } if (wake_index != atomic_read(&sbq->wake_index)) atomic_set(&sbq->wake_index, wake_index); } void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr) { unsigned int wake_batch = READ_ONCE(sbq->wake_batch); unsigned int wakeups; if (!atomic_read(&sbq->ws_active)) return; atomic_add(nr, &sbq->completion_cnt); wakeups = atomic_read(&sbq->wakeup_cnt); do { if (atomic_read(&sbq->completion_cnt) - wakeups < wake_batch) return; } while (!atomic_try_cmpxchg(&sbq->wakeup_cnt, &wakeups, wakeups + wake_batch)); __sbitmap_queue_wake_up(sbq, wake_batch); } EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up); static inline void sbitmap_update_cpu_hint(struct sbitmap *sb, int cpu, int tag) { if (likely(!sb->round_robin && tag < sb->depth)) data_race(*per_cpu_ptr(sb->alloc_hint, cpu) = tag); } void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset, int *tags, int nr_tags) { struct sbitmap *sb = &sbq->sb; unsigned long *addr = NULL; unsigned long mask = 0; int i; smp_mb__before_atomic(); for (i = 0; i < nr_tags; i++) { const int tag = tags[i] - offset; unsigned long *this_addr; /* since we're clearing a batch, skip the deferred map */ this_addr = &sb->map[SB_NR_TO_INDEX(sb, tag)].word; if (!addr) { addr = this_addr; } else if (addr != this_addr) { atomic_long_andnot(mask, (atomic_long_t *) addr); mask = 0; addr = this_addr; } mask |= (1UL << SB_NR_TO_BIT(sb, tag)); } if (mask) atomic_long_andnot(mask, (atomic_long_t *) addr); smp_mb__after_atomic(); sbitmap_queue_wake_up(sbq, nr_tags); sbitmap_update_cpu_hint(&sbq->sb, raw_smp_processor_id(), tags[nr_tags - 1] - offset); } void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, unsigned int cpu) { /* * Once the clear bit is set, the bit may be allocated out. * * Orders READ/WRITE on the associated instance(such as request * of blk_mq) by this bit for avoiding race with re-allocation, * and its pair is the memory barrier implied in __sbitmap_get_word. * * One invariant is that the clear bit has to be zero when the bit * is in use. */ smp_mb__before_atomic(); sbitmap_deferred_clear_bit(&sbq->sb, nr); /* * Pairs with the memory barrier in set_current_state() to ensure the * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker * and test_and_set_bit_lock()/prepare_to_wait()/finish_wait() in the * waiter. See the comment on waitqueue_active(). */ smp_mb__after_atomic(); sbitmap_queue_wake_up(sbq, 1); sbitmap_update_cpu_hint(&sbq->sb, cpu, nr); } EXPORT_SYMBOL_GPL(sbitmap_queue_clear); void sbitmap_queue_wake_all(struct sbitmap_queue *sbq) { int i, wake_index; /* * Pairs with the memory barrier in set_current_state() like in * sbitmap_queue_wake_up(). 
*/ smp_mb(); wake_index = atomic_read(&sbq->wake_index); for (i = 0; i < SBQ_WAIT_QUEUES; i++) { struct sbq_wait_state *ws = &sbq->ws[wake_index]; if (waitqueue_active(&ws->wait)) wake_up(&ws->wait); wake_index = sbq_index_inc(wake_index); } } EXPORT_SYMBOL_GPL(sbitmap_queue_wake_all); void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m) { bool first; int i; sbitmap_show(&sbq->sb, m); seq_puts(m, "alloc_hint={"); first = true; for_each_possible_cpu(i) { if (!first) seq_puts(m, ", "); first = false; seq_printf(m, "%u", *per_cpu_ptr(sbq->sb.alloc_hint, i)); } seq_puts(m, "}\n"); seq_printf(m, "wake_batch=%u\n", sbq->wake_batch); seq_printf(m, "wake_index=%d\n", atomic_read(&sbq->wake_index)); seq_printf(m, "ws_active=%d\n", atomic_read(&sbq->ws_active)); seq_puts(m, "ws={\n"); for (i = 0; i < SBQ_WAIT_QUEUES; i++) { struct sbq_wait_state *ws = &sbq->ws[i]; seq_printf(m, "\t{.wait=%s},\n", waitqueue_active(&ws->wait) ? "active" : "inactive"); } seq_puts(m, "}\n"); seq_printf(m, "round_robin=%d\n", sbq->sb.round_robin); seq_printf(m, "min_shallow_depth=%u\n", sbq->min_shallow_depth); } EXPORT_SYMBOL_GPL(sbitmap_queue_show); void sbitmap_add_wait_queue(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait) { if (!sbq_wait->sbq) { sbq_wait->sbq = sbq; atomic_inc(&sbq->ws_active); add_wait_queue(&ws->wait, &sbq_wait->wait); } } EXPORT_SYMBOL_GPL(sbitmap_add_wait_queue); void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait) { list_del_init(&sbq_wait->wait.entry); if (sbq_wait->sbq) { atomic_dec(&sbq_wait->sbq->ws_active); sbq_wait->sbq = NULL; } } EXPORT_SYMBOL_GPL(sbitmap_del_wait_queue); void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait, int state) { if (!sbq_wait->sbq) { atomic_inc(&sbq->ws_active); sbq_wait->sbq = sbq; } prepare_to_wait_exclusive(&ws->wait, &sbq_wait->wait, state); } EXPORT_SYMBOL_GPL(sbitmap_prepare_to_wait); void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait) { finish_wait(&ws->wait, &sbq_wait->wait); if (sbq_wait->sbq) { atomic_dec(&sbq->ws_active); sbq_wait->sbq = NULL; } } EXPORT_SYMBOL_GPL(sbitmap_finish_wait); |
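/*
 * Editorial note: illustrative sketch only, not part of lib/sbitmap.c.  It
 * shows a typical tag-allocator round trip through the sbitmap_queue API
 * implemented above.  Depth, shift and GFP flags are arbitrary example
 * values; sbitmap_queue_free() is assumed to come from <linux/sbitmap.h>.
 */
#include <linux/sbitmap.h>
#include <linux/numa.h>
#include <linux/smp.h>

static int example_sbq_round_trip(void)
{
	struct sbitmap_queue sbq;
	int tag, err;

	/* 128 tags, default shift (-1), no round robin, per-cpu hints enabled. */
	err = sbitmap_queue_init_node(&sbq, 128, -1, false, GFP_KERNEL,
				      NUMA_NO_NODE);
	if (err)
		return err;

	tag = __sbitmap_queue_get(&sbq);	/* returns -1 if the map is full */
	if (tag >= 0)
		sbitmap_queue_clear(&sbq, tag, raw_smp_processor_id());

	sbitmap_queue_free(&sbq);
	return 0;
}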
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_XFRM_H #define _NET_XFRM_H #include <linux/compiler.h> #include
<linux/xfrm.h> #include <linux/spinlock.h> #include <linux/list.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/pfkeyv2.h> #include <linux/ipsec.h> #include <linux/in6.h> #include <linux/mutex.h> #include <linux/audit.h> #include <linux/slab.h> #include <linux/refcount.h> #include <linux/sockptr.h> #include <net/sock.h> #include <net/dst.h> #include <net/inet_dscp.h> #include <net/ip.h> #include <net/route.h> #include <net/ipv6.h> #include <net/ip6_fib.h> #include <net/flow.h> #include <net/gro_cells.h> #include <linux/interrupt.h> #ifdef CONFIG_XFRM_STATISTICS #include <net/snmp.h> #endif #define XFRM_PROTO_ESP 50 #define XFRM_PROTO_AH 51 #define XFRM_PROTO_COMP 108 #define XFRM_PROTO_IPIP 4 #define XFRM_PROTO_IPV6 41 #define XFRM_PROTO_IPTFS IPPROTO_AGGFRAG #define XFRM_PROTO_ROUTING IPPROTO_ROUTING #define XFRM_PROTO_DSTOPTS IPPROTO_DSTOPTS #define XFRM_ALIGN4(len) (((len) + 3) & ~3) #define XFRM_ALIGN8(len) (((len) + 7) & ~7) #define MODULE_ALIAS_XFRM_MODE(family, encap) \ MODULE_ALIAS("xfrm-mode-" __stringify(family) "-" __stringify(encap)) #define MODULE_ALIAS_XFRM_TYPE(family, proto) \ MODULE_ALIAS("xfrm-type-" __stringify(family) "-" __stringify(proto)) #define MODULE_ALIAS_XFRM_OFFLOAD_TYPE(family, proto) \ MODULE_ALIAS("xfrm-offload-" __stringify(family) "-" __stringify(proto)) #ifdef CONFIG_XFRM_STATISTICS #define XFRM_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.xfrm_statistics, field) #define XFRM_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.xfrm_statistics, field, val) #else #define XFRM_INC_STATS(net, field) ((void)(net)) #define XFRM_ADD_STATS(net, field, val) ((void)(net)) #endif /* Organization of SPD aka "XFRM rules" ------------------------------------ Basic objects: - policy rule, struct xfrm_policy (=SPD entry) - bundle of transformations, struct dst_entry == struct xfrm_dst (=SA bundle) - instance of a transformer, struct xfrm_state (=SA) - template to clone xfrm_state, struct xfrm_tmpl SPD is organized as hash table (for policies that meet minimum address prefix length setting, net->xfrm.policy_hthresh). Other policies are stored in lists, sorted into rbtree ordered by destination and source address networks. See net/xfrm/xfrm_policy.c for details. (To be compatible with existing pfkeyv2 implementations, many rules with priority of 0x7fffffff are allowed to exist and such rules are ordered in an unpredictable way, thanks to bsd folks.) If "action" is "block", then we prohibit the flow, otherwise: if "xfrms_nr" is zero, the flow passes untransformed. Otherwise, policy entry has list of up to XFRM_MAX_DEPTH transformations, described by templates xfrm_tmpl. Each template is resolved to a complete xfrm_state (see below) and we pack bundle of transformations to a dst_entry returned to requester. dst -. xfrm .-> xfrm_state #1 |---. child .-> dst -. xfrm .-> xfrm_state #2 |---. child .-> dst -. xfrm .-> xfrm_state #3 |---. child .-> NULL Resolution of xrfm_tmpl ----------------------- Template contains: 1. ->mode Mode: transport or tunnel 2. ->id.proto Protocol: AH/ESP/IPCOMP 3. ->id.daddr Remote tunnel endpoint, ignored for transport mode. Q: allow to resolve security gateway? 4. ->id.spi If not zero, static SPI. 5. ->saddr Local tunnel endpoint, ignored for transport mode. 6. ->algos List of allowed algos. Plain bitmask now. Q: ealgos, aalgos, calgos. What a mess... 7. ->share Sharing mode. Q: how to implement private sharing mode? To add struct sock* to flow id? 
Having this template we search through SAD searching for entries with appropriate mode/proto/algo, permitted by selector. If no appropriate entry found, it is requested from key manager. PROBLEMS: Q: How to find all the bundles referring to a physical path for PMTU discovery? Seems, dst should contain list of all parents... and enter to infinite locking hierarchy disaster. No! It is easier, we will not search for them, let them find us. We add genid to each dst plus pointer to genid of raw IP route, pmtu disc will update pmtu on raw IP route and increase its genid. dst_check() will see this for top level and trigger resyncing metrics. Plus, it will be made via sk->sk_dst_cache. Solved. */ struct xfrm_state_walk { struct list_head all; u8 state; u8 dying; u8 proto; u32 seq; struct xfrm_address_filter *filter; }; enum { XFRM_DEV_OFFLOAD_IN = 1, XFRM_DEV_OFFLOAD_OUT, XFRM_DEV_OFFLOAD_FWD, }; enum { XFRM_DEV_OFFLOAD_UNSPECIFIED, XFRM_DEV_OFFLOAD_CRYPTO, XFRM_DEV_OFFLOAD_PACKET, }; enum { XFRM_DEV_OFFLOAD_FLAG_ACQ = 1, }; struct xfrm_dev_offload { /* The device for this offload. * Device drivers should not use this directly, as that will prevent * them from working with bonding device. Instead, the device passed * to the add/delete callbacks should be used. */ struct net_device *dev; netdevice_tracker dev_tracker; /* This is a private pointer used by the bonding driver (and eventually * should be moved there). Device drivers should not use it. * Protected by xfrm_state.lock AND bond.ipsec_lock in most cases, * except in the .xdo_dev_state_del() flow, where only xfrm_state.lock * is held. */ struct net_device *real_dev; unsigned long offload_handle; u8 dir : 2; u8 type : 2; u8 flags : 2; }; struct xfrm_mode { u8 encap; u8 family; u8 flags; }; /* Flags for xfrm_mode. */ enum { XFRM_MODE_FLAG_TUNNEL = 1, }; enum xfrm_replay_mode { XFRM_REPLAY_MODE_LEGACY, XFRM_REPLAY_MODE_BMP, XFRM_REPLAY_MODE_ESN, }; /* Full description of state of transformer. */ struct xfrm_state { possible_net_t xs_net; union { struct hlist_node gclist; struct hlist_node bydst; }; union { struct hlist_node dev_gclist; struct hlist_node bysrc; }; struct hlist_node byspi; struct hlist_node byseq; struct hlist_node state_cache; struct hlist_node state_cache_input; refcount_t refcnt; spinlock_t lock; u32 pcpu_num; struct xfrm_id id; struct xfrm_selector sel; struct xfrm_mark mark; u32 if_id; u32 tfcpad; u32 genid; /* Key manager bits */ struct xfrm_state_walk km; /* Parameters of this state. 
*/ struct { u32 reqid; u8 mode; u8 replay_window; u8 aalgo, ealgo, calgo; u8 flags; u16 family; xfrm_address_t saddr; int header_len; int enc_hdr_len; int trailer_len; u32 extra_flags; struct xfrm_mark smark; } props; struct xfrm_lifetime_cfg lft; /* Data for transformer */ struct xfrm_algo_auth *aalg; struct xfrm_algo *ealg; struct xfrm_algo *calg; struct xfrm_algo_aead *aead; const char *geniv; /* mapping change rate limiting */ __be16 new_mapping_sport; u32 new_mapping; /* seconds */ u32 mapping_maxage; /* seconds for input SA */ /* Data for encapsulator */ struct xfrm_encap_tmpl *encap; /* NAT keepalive */ u32 nat_keepalive_interval; /* seconds */ time64_t nat_keepalive_expiration; /* Data for care-of address */ xfrm_address_t *coaddr; /* IPComp needs an IPIP tunnel for handling uncompressed packets */ struct xfrm_state *tunnel; /* If a tunnel, number of users + 1 */ atomic_t tunnel_users; /* State for replay detection */ struct xfrm_replay_state replay; struct xfrm_replay_state_esn *replay_esn; /* Replay detection state at the time we sent the last notification */ struct xfrm_replay_state preplay; struct xfrm_replay_state_esn *preplay_esn; /* replay detection mode */ enum xfrm_replay_mode repl_mode; /* internal flag that only holds state for delayed aevent at the * moment */ u32 xflags; /* Replay detection notification settings */ u32 replay_maxage; u32 replay_maxdiff; /* Replay detection notification timer */ struct timer_list rtimer; /* Statistics */ struct xfrm_stats stats; struct xfrm_lifetime_cur curlft; struct hrtimer mtimer; struct xfrm_dev_offload xso; /* used to fix curlft->add_time when changing date */ long saved_tmo; /* Last used time */ time64_t lastused; struct page_frag xfrag; /* Reference to data common to all the instances of this * transformer. */ const struct xfrm_type *type; struct xfrm_mode inner_mode; struct xfrm_mode inner_mode_iaf; struct xfrm_mode outer_mode; const struct xfrm_type_offload *type_offload; /* Security context */ struct xfrm_sec_ctx *security; /* Private data of this transformer, format is opaque, * interpreted by xfrm_type methods. 
*/ void *data; u8 dir; const struct xfrm_mode_cbs *mode_cbs; void *mode_data; }; static inline struct net *xs_net(struct xfrm_state *x) { return read_pnet(&x->xs_net); } /* xflags - make enum if more show up */ #define XFRM_TIME_DEFER 1 #define XFRM_SOFT_EXPIRE 2 enum { XFRM_STATE_VOID, XFRM_STATE_ACQ, XFRM_STATE_VALID, XFRM_STATE_ERROR, XFRM_STATE_EXPIRED, XFRM_STATE_DEAD }; /* callback structure passed from either netlink or pfkey */ struct km_event { union { u32 hard; u32 proto; u32 byid; u32 aevent; u32 type; } data; u32 seq; u32 portid; u32 event; struct net *net; }; struct xfrm_if_decode_session_result { struct net *net; u32 if_id; }; struct xfrm_if_cb { bool (*decode_session)(struct sk_buff *skb, unsigned short family, struct xfrm_if_decode_session_result *res); }; void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb); void xfrm_if_unregister_cb(void); struct xfrm_dst_lookup_params { struct net *net; dscp_t dscp; int oif; xfrm_address_t *saddr; xfrm_address_t *daddr; u32 mark; __u8 ipproto; union flowi_uli uli; }; struct net_device; struct xfrm_type; struct xfrm_dst; struct xfrm_policy_afinfo { struct dst_ops *dst_ops; struct dst_entry *(*dst_lookup)(const struct xfrm_dst_lookup_params *params); int (*get_saddr)(xfrm_address_t *saddr, const struct xfrm_dst_lookup_params *params); int (*fill_dst)(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl); struct dst_entry *(*blackhole_route)(struct net *net, struct dst_entry *orig); }; int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int family); void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo); void km_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c); void km_state_notify(struct xfrm_state *x, const struct km_event *c); struct xfrm_tmpl; int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol); void km_state_expired(struct xfrm_state *x, int hard, u32 portid); int __xfrm_state_delete(struct xfrm_state *x); struct xfrm_state_afinfo { u8 family; u8 proto; const struct xfrm_type_offload *type_offload_esp; const struct xfrm_type *type_esp; const struct xfrm_type *type_ipip; const struct xfrm_type *type_ipip6; const struct xfrm_type *type_comp; const struct xfrm_type *type_ah; const struct xfrm_type *type_routing; const struct xfrm_type *type_dstopts; int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb); int (*transport_finish)(struct sk_buff *skb, int async); void (*local_error)(struct sk_buff *skb, u32 mtu); }; int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo); int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo); struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family); struct xfrm_state_afinfo *xfrm_state_afinfo_get_rcu(unsigned int family); struct xfrm_input_afinfo { u8 family; bool is_ipip; int (*callback)(struct sk_buff *skb, u8 protocol, int err); }; int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo); int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo); void xfrm_flush_gc(void); struct xfrm_type { struct module *owner; u8 proto; u8 flags; #define XFRM_TYPE_NON_FRAGMENT 1 #define XFRM_TYPE_REPLAY_PROT 2 #define XFRM_TYPE_LOCAL_COADDR 4 #define XFRM_TYPE_REMOTE_COADDR 8 int (*init_state)(struct xfrm_state *x, struct netlink_ext_ack *extack); void (*destructor)(struct xfrm_state *); int (*input)(struct xfrm_state *, struct sk_buff *skb); int (*output)(struct xfrm_state *, struct sk_buff *pskb); int (*reject)(struct xfrm_state 
*, struct sk_buff *, const struct flowi *); }; int xfrm_register_type(const struct xfrm_type *type, unsigned short family); void xfrm_unregister_type(const struct xfrm_type *type, unsigned short family); struct xfrm_type_offload { struct module *owner; u8 proto; void (*encap)(struct xfrm_state *, struct sk_buff *pskb); int (*input_tail)(struct xfrm_state *x, struct sk_buff *skb); int (*xmit)(struct xfrm_state *, struct sk_buff *pskb, netdev_features_t features); }; int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family); void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family); void xfrm_set_type_offload(struct xfrm_state *x, bool try_load); static inline void xfrm_unset_type_offload(struct xfrm_state *x) { if (!x->type_offload) return; module_put(x->type_offload->owner); x->type_offload = NULL; } /** * struct xfrm_mode_cbs - XFRM mode callbacks * @owner: module owner or NULL * @init_state: Add/init mode specific state in `xfrm_state *x` * @clone_state: Copy mode specific values from `orig` to new state `x` * @destroy_state: Cleanup mode specific state from `xfrm_state *x` * @user_init: Process mode specific netlink attributes from user * @copy_to_user: Add netlink attributes to `attrs` based on state in `x` * @sa_len: Return space required to store mode specific netlink attributes * @get_inner_mtu: Return avail payload space after removing encap overhead * @input: Process received packet from SA using mode * @output: Output given packet using mode * @prepare_output: Add mode specific encapsulation to packet in skb. On return * `transport_header` should point at ESP header, `network_header` should * point at outer IP header and `mac_header` should opint at the * protocol/nexthdr field of the outer IP. * * One should examine and understand the specific uses of these callbacks in * xfrm for further detail on how and when these functions are called. RTSL. */ struct xfrm_mode_cbs { struct module *owner; int (*init_state)(struct xfrm_state *x); int (*clone_state)(struct xfrm_state *x, struct xfrm_state *orig); void (*destroy_state)(struct xfrm_state *x); int (*user_init)(struct net *net, struct xfrm_state *x, struct nlattr **attrs, struct netlink_ext_ack *extack); int (*copy_to_user)(struct xfrm_state *x, struct sk_buff *skb); unsigned int (*sa_len)(const struct xfrm_state *x); u32 (*get_inner_mtu)(struct xfrm_state *x, int outer_mtu); int (*input)(struct xfrm_state *x, struct sk_buff *skb); int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb); int (*prepare_output)(struct xfrm_state *x, struct sk_buff *skb); }; int xfrm_register_mode_cbs(u8 mode, const struct xfrm_mode_cbs *mode_cbs); void xfrm_unregister_mode_cbs(u8 mode); static inline int xfrm_af2proto(unsigned int family) { switch(family) { case AF_INET: return IPPROTO_IPIP; case AF_INET6: return IPPROTO_IPV6; default: return 0; } } static inline const struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, int ipproto) { if ((x->sel.family != AF_UNSPEC) || (ipproto == IPPROTO_IPIP && x->props.family == AF_INET) || (ipproto == IPPROTO_IPV6 && x->props.family == AF_INET6)) return &x->inner_mode; else return &x->inner_mode_iaf; } struct xfrm_tmpl { /* id in template is interpreted as: * daddr - destination of tunnel, may be zero for transport mode. * spi - zero to acquire spi. Not zero if spi is static, then * daddr must be fixed too. * proto - AH/ESP/IPCOMP */ struct xfrm_id id; /* Source address of tunnel. Ignored, if it is not a tunnel. 
*/ xfrm_address_t saddr; unsigned short encap_family; u32 reqid; /* Mode: transport, tunnel etc. */ u8 mode; /* Sharing mode: unique, this session only, this user only etc. */ u8 share; /* May skip this transformation if no SA is found */ u8 optional; /* Skip aalgos/ealgos/calgos checks. */ u8 allalgs; /* Bit mask of algos allowed for acquisition */ u32 aalgos; u32 ealgos; u32 calgos; }; #define XFRM_MAX_DEPTH 6 #define XFRM_MAX_OFFLOAD_DEPTH 1 struct xfrm_policy_walk_entry { struct list_head all; u8 dead; }; struct xfrm_policy_walk { struct xfrm_policy_walk_entry walk; u8 type; u32 seq; }; struct xfrm_policy_queue { struct sk_buff_head hold_queue; struct timer_list hold_timer; unsigned long timeout; }; /** * struct xfrm_policy - xfrm policy * @xp_net: network namespace the policy lives in * @bydst: hlist node for SPD hash table or rbtree list * @byidx: hlist node for index hash table * @state_cache_list: hlist head for policy cached xfrm states * @lock: serialize changes to policy structure members * @refcnt: reference count, freed once it reaches 0 * @pos: kernel internal tie-breaker to determine age of policy * @timer: timer * @genid: generation, used to invalidate old policies * @priority: priority, set by userspace * @index: policy index (autogenerated) * @if_id: virtual xfrm interface id * @mark: packet mark * @selector: selector * @lft: lifetime configuration data * @curlft: lifetime state * @walk: list head on pernet policy list * @polq: queue to hold packets while acquire operation in progress * @bydst_reinsert: policy tree node needs to be merged * @type: XFRM_POLICY_TYPE_MAIN or _SUB * @action: XFRM_POLICY_ALLOW or _BLOCK * @flags: XFRM_POLICY_LOCALOK, XFRM_POLICY_ICMP * @xfrm_nr: number of used templates in @xfrm_vec * @family: protocol family * @security: SELinux security label * @xfrm_vec: array of templates to resolve state * @rcu: rcu head, used to defer memory release * @xdo: hardware offload state */ struct xfrm_policy { possible_net_t xp_net; struct hlist_node bydst; struct hlist_node byidx; struct hlist_head state_cache_list; /* This lock only affects elements except for entry.
*/ rwlock_t lock; refcount_t refcnt; u32 pos; struct timer_list timer; atomic_t genid; u32 priority; u32 index; u32 if_id; struct xfrm_mark mark; struct xfrm_selector selector; struct xfrm_lifetime_cfg lft; struct xfrm_lifetime_cur curlft; struct xfrm_policy_walk_entry walk; struct xfrm_policy_queue polq; bool bydst_reinsert; u8 type; u8 action; u8 flags; u8 xfrm_nr; u16 family; struct xfrm_sec_ctx *security; struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH]; struct rcu_head rcu; struct xfrm_dev_offload xdo; }; static inline struct net *xp_net(const struct xfrm_policy *xp) { return read_pnet(&xp->xp_net); } struct xfrm_kmaddress { xfrm_address_t local; xfrm_address_t remote; u32 reserved; u16 family; }; struct xfrm_migrate { xfrm_address_t old_daddr; xfrm_address_t old_saddr; xfrm_address_t new_daddr; xfrm_address_t new_saddr; u8 proto; u8 mode; u16 reserved; u32 reqid; u16 old_family; u16 new_family; }; #define XFRM_KM_TIMEOUT 30 /* what happened */ #define XFRM_REPLAY_UPDATE XFRM_AE_CR #define XFRM_REPLAY_TIMEOUT XFRM_AE_CE /* default aevent timeout in units of 100ms */ #define XFRM_AE_ETIME 10 /* Async Event timer multiplier */ #define XFRM_AE_ETH_M 10 /* default seq threshold size */ #define XFRM_AE_SEQT_SIZE 2 struct xfrm_mgr { struct list_head list; int (*notify)(struct xfrm_state *x, const struct km_event *c); int (*acquire)(struct xfrm_state *x, struct xfrm_tmpl *, struct xfrm_policy *xp); struct xfrm_policy *(*compile_policy)(struct sock *sk, int opt, u8 *data, int len, int *dir); int (*new_mapping)(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport); int (*notify_policy)(struct xfrm_policy *x, int dir, const struct km_event *c); int (*report)(struct net *net, u8 proto, struct xfrm_selector *sel, xfrm_address_t *addr); int (*migrate)(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap); bool (*is_alive)(const struct km_event *c); }; void xfrm_register_km(struct xfrm_mgr *km); void xfrm_unregister_km(struct xfrm_mgr *km); struct xfrm_tunnel_skb_cb { union { struct inet_skb_parm h4; struct inet6_skb_parm h6; } header; union { struct ip_tunnel *ip4; struct ip6_tnl *ip6; } tunnel; }; #define XFRM_TUNNEL_SKB_CB(__skb) ((struct xfrm_tunnel_skb_cb *)&((__skb)->cb[0])) /* * This structure is used for the duration where packets are being * transformed by IPsec. As soon as the packet leaves IPsec the * area beyond the generic IP part may be overwritten. */ struct xfrm_skb_cb { struct xfrm_tunnel_skb_cb header; /* Sequence number for replay protection. */ union { struct { __u32 low; __u32 hi; } output; struct { __be32 low; __be32 hi; } input; } seq; }; #define XFRM_SKB_CB(__skb) ((struct xfrm_skb_cb *)&((__skb)->cb[0])) /* * This structure is used by the afinfo prepare_input/prepare_output functions * to transmit header information to the mode input/output functions. */ struct xfrm_mode_skb_cb { struct xfrm_tunnel_skb_cb header; /* Copied from header for IPv4, always set to zero and DF for IPv6. */ __be16 id; __be16 frag_off; /* IP header length (excluding options or extension headers). */ u8 ihl; /* TOS for IPv4, class for IPv6. */ u8 tos; /* TTL for IPv4, hop limit for IPv6. */ u8 ttl; /* Protocol for IPv4, NH for IPv6. */ u8 protocol; /* Option length for IPv4, zero for IPv6. */ u8 optlen; /* Used by IPv6 only, zero for IPv4.
*/ u8 flow_lbl[3]; }; #define XFRM_MODE_SKB_CB(__skb) ((struct xfrm_mode_skb_cb *)&((__skb)->cb[0])) /* * This structure is used by the input processing to locate the SPI and * related information. */ struct xfrm_spi_skb_cb { struct xfrm_tunnel_skb_cb header; unsigned int daddroff; unsigned int family; __be32 seq; }; #define XFRM_SPI_SKB_CB(__skb) ((struct xfrm_spi_skb_cb *)&((__skb)->cb[0])) #ifdef CONFIG_AUDITSYSCALL static inline struct audit_buffer *xfrm_audit_start(const char *op) { struct audit_buffer *audit_buf = NULL; if (audit_enabled == AUDIT_OFF) return NULL; audit_buf = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_MAC_IPSEC_EVENT); if (audit_buf == NULL) return NULL; audit_log_format(audit_buf, "op=%s", op); return audit_buf; } static inline void xfrm_audit_helper_usrinfo(bool task_valid, struct audit_buffer *audit_buf) { const unsigned int auid = from_kuid(&init_user_ns, task_valid ? audit_get_loginuid(current) : INVALID_UID); const unsigned int ses = task_valid ? audit_get_sessionid(current) : AUDIT_SID_UNSET; audit_log_format(audit_buf, " auid=%u ses=%u", auid, ses); audit_log_task_context(audit_buf); } void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, bool task_valid); void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result, bool task_valid); void xfrm_audit_state_add(struct xfrm_state *x, int result, bool task_valid); void xfrm_audit_state_delete(struct xfrm_state *x, int result, bool task_valid); void xfrm_audit_state_replay_overflow(struct xfrm_state *x, struct sk_buff *skb); void xfrm_audit_state_replay(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq); void xfrm_audit_state_notfound_simple(struct sk_buff *skb, u16 family); void xfrm_audit_state_notfound(struct sk_buff *skb, u16 family, __be32 net_spi, __be32 net_seq); void xfrm_audit_state_icvfail(struct xfrm_state *x, struct sk_buff *skb, u8 proto); #else static inline void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, bool task_valid) { } static inline void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result, bool task_valid) { } static inline void xfrm_audit_state_add(struct xfrm_state *x, int result, bool task_valid) { } static inline void xfrm_audit_state_delete(struct xfrm_state *x, int result, bool task_valid) { } static inline void xfrm_audit_state_replay_overflow(struct xfrm_state *x, struct sk_buff *skb) { } static inline void xfrm_audit_state_replay(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq) { } static inline void xfrm_audit_state_notfound_simple(struct sk_buff *skb, u16 family) { } static inline void xfrm_audit_state_notfound(struct sk_buff *skb, u16 family, __be32 net_spi, __be32 net_seq) { } static inline void xfrm_audit_state_icvfail(struct xfrm_state *x, struct sk_buff *skb, u8 proto) { } #endif /* CONFIG_AUDITSYSCALL */ static inline void xfrm_pol_hold(struct xfrm_policy *policy) { if (likely(policy != NULL)) refcount_inc(&policy->refcnt); } void xfrm_policy_destroy(struct xfrm_policy *policy); static inline void xfrm_pol_put(struct xfrm_policy *policy) { if (refcount_dec_and_test(&policy->refcnt)) xfrm_policy_destroy(policy); } static inline void xfrm_pols_put(struct xfrm_policy **pols, int npols) { int i; for (i = npols - 1; i >= 0; --i) xfrm_pol_put(pols[i]); } void __xfrm_state_destroy(struct xfrm_state *); static inline void __xfrm_state_put(struct xfrm_state *x) { refcount_dec(&x->refcnt); } static inline void xfrm_state_put(struct xfrm_state *x) { if (refcount_dec_and_test(&x->refcnt)) __xfrm_state_destroy(x); } static 
inline void xfrm_state_hold(struct xfrm_state *x) { refcount_inc(&x->refcnt); } static inline bool addr_match(const void *token1, const void *token2, unsigned int prefixlen) { const __be32 *a1 = token1; const __be32 *a2 = token2; unsigned int pdw; unsigned int pbi; pdw = prefixlen >> 5; /* num of whole u32 in prefix */ pbi = prefixlen & 0x1f; /* num of bits in incomplete u32 in prefix */ if (pdw) if (memcmp(a1, a2, pdw << 2)) return false; if (pbi) { __be32 mask; mask = htonl((0xffffffff) << (32 - pbi)); if ((a1[pdw] ^ a2[pdw]) & mask) return false; } return true; } static inline bool addr4_match(__be32 a1, __be32 a2, u8 prefixlen) { /* C99 6.5.7 (3): u32 << 32 is undefined behaviour */ if (sizeof(long) == 4 && prefixlen == 0) return true; return !((a1 ^ a2) & htonl(~0UL << (32 - prefixlen))); } static __inline__ __be16 xfrm_flowi_sport(const struct flowi *fl, const union flowi_uli *uli) { __be16 port; switch(fl->flowi_proto) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_SCTP: port = uli->ports.sport; break; case IPPROTO_ICMP: case IPPROTO_ICMPV6: port = htons(uli->icmpt.type); break; case IPPROTO_MH: port = htons(uli->mht.type); break; case IPPROTO_GRE: port = htons(ntohl(uli->gre_key) >> 16); break; default: port = 0; /*XXX*/ } return port; } static __inline__ __be16 xfrm_flowi_dport(const struct flowi *fl, const union flowi_uli *uli) { __be16 port; switch(fl->flowi_proto) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_SCTP: port = uli->ports.dport; break; case IPPROTO_ICMP: case IPPROTO_ICMPV6: port = htons(uli->icmpt.code); break; case IPPROTO_GRE: port = htons(ntohl(uli->gre_key) & 0xffff); break; default: port = 0; /*XXX*/ } return port; } bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl, unsigned short family); #ifdef CONFIG_SECURITY_NETWORK_XFRM /* If neither has a context --> match * Otherwise, both must have a context and the sids, doi, alg must match */ static inline bool xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_ctx *s2) { return ((!s1 && !s2) || (s1 && s2 && (s1->ctx_sid == s2->ctx_sid) && (s1->ctx_doi == s2->ctx_doi) && (s1->ctx_alg == s2->ctx_alg))); } #else static inline bool xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_ctx *s2) { return true; } #endif /* A struct encoding a bundle of transformations to apply to some set of flows. * * xdst->child points to the next element of the bundle. * dst->xfrm points to an instance of a transformer. * * Due to unfortunate limitations of the current routing cache, which we * have no time to fix, it mirrors struct rtable and is bound to the same * routing key, including saddr,daddr. However, we can have many * bundles differing by session id. All the bundles grow from a parent * policy rule.
*/ struct xfrm_dst { union { struct dst_entry dst; struct rtable rt; struct rt6_info rt6; } u; struct dst_entry *route; struct dst_entry *child; struct dst_entry *path; struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; int num_pols, num_xfrms; u32 xfrm_genid; u32 policy_genid; u32 route_mtu_cached; u32 child_mtu_cached; u32 route_cookie; u32 path_cookie; }; static inline struct dst_entry *xfrm_dst_path(const struct dst_entry *dst) { #ifdef CONFIG_XFRM if (dst->xfrm || (dst->flags & DST_XFRM_QUEUE)) { const struct xfrm_dst *xdst = (const struct xfrm_dst *) dst; return xdst->path; } #endif return (struct dst_entry *) dst; } static inline struct dst_entry *xfrm_dst_child(const struct dst_entry *dst) { #ifdef CONFIG_XFRM if (dst->xfrm || (dst->flags & DST_XFRM_QUEUE)) { struct xfrm_dst *xdst = (struct xfrm_dst *) dst; return xdst->child; } #endif return NULL; } #ifdef CONFIG_XFRM static inline void xfrm_dst_set_child(struct xfrm_dst *xdst, struct dst_entry *child) { xdst->child = child; } static inline void xfrm_dst_destroy(struct xfrm_dst *xdst) { xfrm_pols_put(xdst->pols, xdst->num_pols); dst_release(xdst->route); if (likely(xdst->u.dst.xfrm)) xfrm_state_put(xdst->u.dst.xfrm); } #endif void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev); struct xfrm_if_parms { int link; /* ifindex of underlying L2 interface */ u32 if_id; /* interface identifier */ bool collect_md; }; struct xfrm_if { struct xfrm_if __rcu *next; /* next interface in list */ struct net_device *dev; /* virtual device associated with interface */ struct net *net; /* netns for packet i/o */ struct xfrm_if_parms p; /* interface parms */ struct gro_cells gro_cells; }; struct xfrm_offload { /* Output sequence number for replay protection on offloading. */ struct { __u32 low; __u32 hi; } seq; __u32 flags; #define SA_DELETE_REQ 1 #define CRYPTO_DONE 2 #define CRYPTO_NEXT_DONE 4 #define CRYPTO_FALLBACK 8 #define XFRM_GSO_SEGMENT 16 #define XFRM_GRO 32 /* 64 is free */ #define XFRM_DEV_RESUME 128 #define XFRM_XMIT 256 __u32 status; #define CRYPTO_SUCCESS 1 #define CRYPTO_GENERIC_ERROR 2 #define CRYPTO_TRANSPORT_AH_AUTH_FAILED 4 #define CRYPTO_TRANSPORT_ESP_AUTH_FAILED 8 #define CRYPTO_TUNNEL_AH_AUTH_FAILED 16 #define CRYPTO_TUNNEL_ESP_AUTH_FAILED 32 #define CRYPTO_INVALID_PACKET_SYNTAX 64 #define CRYPTO_INVALID_PROTOCOL 128 /* Used to keep whole l2 header for transport mode GRO */ __u16 orig_mac_len; __u8 proto; __u8 inner_ipproto; }; struct sec_path { struct xfrm_state *xvec[XFRM_MAX_DEPTH]; struct xfrm_offload ovec[XFRM_MAX_OFFLOAD_DEPTH]; u8 len; u8 olen; u8 verified_cnt; }; struct sec_path *secpath_set(struct sk_buff *skb); static inline void secpath_reset(struct sk_buff *skb) { #ifdef CONFIG_XFRM skb_ext_del(skb, SKB_EXT_SEC_PATH); #endif } static inline int xfrm_addr_any(const xfrm_address_t *addr, unsigned short family) { switch (family) { case AF_INET: return addr->a4 == 0; case AF_INET6: return ipv6_addr_any(&addr->in6); } return 0; } static inline int __xfrm4_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x) { return (tmpl->saddr.a4 && tmpl->saddr.a4 != x->props.saddr.a4); } static inline int __xfrm6_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x) { return (!ipv6_addr_any((struct in6_addr*)&tmpl->saddr) && !ipv6_addr_equal((struct in6_addr *)&tmpl->saddr, (struct in6_addr*)&x->props.saddr)); } static inline int xfrm_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, unsigned short family) { switch (family) { case AF_INET: return 
__xfrm4_state_addr_cmp(tmpl, x); case AF_INET6: return __xfrm6_state_addr_cmp(tmpl, x); } return !0; } #ifdef CONFIG_XFRM static inline struct xfrm_state *xfrm_input_state(struct sk_buff *skb) { struct sec_path *sp = skb_sec_path(skb); return sp->xvec[sp->len - 1]; } #endif static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb) { #ifdef CONFIG_XFRM struct sec_path *sp = skb_sec_path(skb); if (!sp || !sp->olen || sp->len != sp->olen) return NULL; return &sp->ovec[sp->olen - 1]; #else return NULL; #endif } #ifdef CONFIG_XFRM int __xfrm_policy_check(struct sock *, int dir, struct sk_buff *skb, unsigned short family); static inline bool __xfrm_check_nopolicy(struct net *net, struct sk_buff *skb, int dir) { if (!net->xfrm.policy_count[dir] && !secpath_exists(skb)) return net->xfrm.policy_default[dir] == XFRM_USERPOLICY_ACCEPT; return false; } static inline bool __xfrm_check_dev_nopolicy(struct sk_buff *skb, int dir, unsigned short family) { if (dir != XFRM_POLICY_OUT && family == AF_INET) { /* same dst may be used for traffic originating from * devices with different policy settings. */ return IPCB(skb)->flags & IPSKB_NOPOLICY; } return skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY); } static inline int __xfrm_policy_check2(struct sock *sk, int dir, struct sk_buff *skb, unsigned int family, int reverse) { struct net *net = dev_net(skb->dev); int ndir = dir | (reverse ? XFRM_POLICY_MASK + 1 : 0); struct xfrm_offload *xo = xfrm_offload(skb); struct xfrm_state *x; if (sk && sk->sk_policy[XFRM_POLICY_IN]) return __xfrm_policy_check(sk, ndir, skb, family); if (xo) { x = xfrm_input_state(skb); if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) { bool check = (xo->flags & CRYPTO_DONE) && (xo->status & CRYPTO_SUCCESS); /* The packets here are plain ones and secpath was * needed to indicate that hardware already handled * them and there is no need to do anything in addition. * * Consume secpath which was set by drivers.
*/ secpath_reset(skb); return check; } } return __xfrm_check_nopolicy(net, skb, dir) || __xfrm_check_dev_nopolicy(skb, dir, family) || __xfrm_policy_check(sk, ndir, skb, family); } static inline int xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family) { return __xfrm_policy_check2(sk, dir, skb, family, 0); } static inline int xfrm4_policy_check(struct sock *sk, int dir, struct sk_buff *skb) { return xfrm_policy_check(sk, dir, skb, AF_INET); } static inline int xfrm6_policy_check(struct sock *sk, int dir, struct sk_buff *skb) { return xfrm_policy_check(sk, dir, skb, AF_INET6); } static inline int xfrm4_policy_check_reverse(struct sock *sk, int dir, struct sk_buff *skb) { return __xfrm_policy_check2(sk, dir, skb, AF_INET, 1); } static inline int xfrm6_policy_check_reverse(struct sock *sk, int dir, struct sk_buff *skb) { return __xfrm_policy_check2(sk, dir, skb, AF_INET6, 1); } int __xfrm_decode_session(struct net *net, struct sk_buff *skb, struct flowi *fl, unsigned int family, int reverse); static inline int xfrm_decode_session(struct net *net, struct sk_buff *skb, struct flowi *fl, unsigned int family) { return __xfrm_decode_session(net, skb, fl, family, 0); } static inline int xfrm_decode_session_reverse(struct net *net, struct sk_buff *skb, struct flowi *fl, unsigned int family) { return __xfrm_decode_session(net, skb, fl, family, 1); } int __xfrm_route_forward(struct sk_buff *skb, unsigned short family); static inline int xfrm_route_forward(struct sk_buff *skb, unsigned short family) { struct net *net = dev_net(skb->dev); if (!net->xfrm.policy_count[XFRM_POLICY_OUT] && net->xfrm.policy_default[XFRM_POLICY_OUT] == XFRM_USERPOLICY_ACCEPT) return true; return (skb_dst(skb)->flags & DST_NOXFRM) || __xfrm_route_forward(skb, family); } static inline int xfrm4_route_forward(struct sk_buff *skb) { return xfrm_route_forward(skb, AF_INET); } static inline int xfrm6_route_forward(struct sk_buff *skb) { return xfrm_route_forward(skb, AF_INET6); } int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk); static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { if (!sk_fullsock(osk)) return 0; sk->sk_policy[0] = NULL; sk->sk_policy[1] = NULL; if (unlikely(osk->sk_policy[0] || osk->sk_policy[1])) return __xfrm_sk_clone_policy(sk, osk); return 0; } int xfrm_policy_delete(struct xfrm_policy *pol, int dir); static inline void xfrm_sk_free_policy(struct sock *sk) { struct xfrm_policy *pol; pol = rcu_dereference_protected(sk->sk_policy[0], 1); if (unlikely(pol != NULL)) { xfrm_policy_delete(pol, XFRM_POLICY_MAX); sk->sk_policy[0] = NULL; } pol = rcu_dereference_protected(sk->sk_policy[1], 1); if (unlikely(pol != NULL)) { xfrm_policy_delete(pol, XFRM_POLICY_MAX+1); sk->sk_policy[1] = NULL; } } #else static inline void xfrm_sk_free_policy(struct sock *sk) {} static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { return 0; } static inline int xfrm6_route_forward(struct sk_buff *skb) { return 1; } static inline int xfrm4_route_forward(struct sk_buff *skb) { return 1; } static inline int xfrm6_policy_check(struct sock *sk, int dir, struct sk_buff *skb) { return 1; } static inline int xfrm4_policy_check(struct sock *sk, int dir, struct sk_buff *skb) { return 1; } static inline int xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family) { return 1; } static inline int xfrm_decode_session_reverse(struct net *net, struct sk_buff *skb, struct flowi *fl, unsigned int family) { return 
-ENOSYS; } static inline int xfrm4_policy_check_reverse(struct sock *sk, int dir, struct sk_buff *skb) { return 1; } static inline int xfrm6_policy_check_reverse(struct sock *sk, int dir, struct sk_buff *skb) { return 1; } #endif static __inline__ xfrm_address_t *xfrm_flowi_daddr(const struct flowi *fl, unsigned short family) { switch (family){ case AF_INET: return (xfrm_address_t *)&fl->u.ip4.daddr; case AF_INET6: return (xfrm_address_t *)&fl->u.ip6.daddr; } return NULL; } static __inline__ xfrm_address_t *xfrm_flowi_saddr(const struct flowi *fl, unsigned short family) { switch (family){ case AF_INET: return (xfrm_address_t *)&fl->u.ip4.saddr; case AF_INET6: return (xfrm_address_t *)&fl->u.ip6.saddr; } return NULL; } static __inline__ void xfrm_flowi_addr_get(const struct flowi *fl, xfrm_address_t *saddr, xfrm_address_t *daddr, unsigned short family) { switch(family) { case AF_INET: memcpy(&saddr->a4, &fl->u.ip4.saddr, sizeof(saddr->a4)); memcpy(&daddr->a4, &fl->u.ip4.daddr, sizeof(daddr->a4)); break; case AF_INET6: saddr->in6 = fl->u.ip6.saddr; daddr->in6 = fl->u.ip6.daddr; break; } } static __inline__ int __xfrm4_state_addr_check(const struct xfrm_state *x, const xfrm_address_t *daddr, const xfrm_address_t *saddr) { if (daddr->a4 == x->id.daddr.a4 && (saddr->a4 == x->props.saddr.a4 || !saddr->a4 || !x->props.saddr.a4)) return 1; return 0; } static __inline__ int __xfrm6_state_addr_check(const struct xfrm_state *x, const xfrm_address_t *daddr, const xfrm_address_t *saddr) { if (ipv6_addr_equal((struct in6_addr *)daddr, (struct in6_addr *)&x->id.daddr) && (ipv6_addr_equal((struct in6_addr *)saddr, (struct in6_addr *)&x->props.saddr) || ipv6_addr_any((struct in6_addr *)saddr) || ipv6_addr_any((struct in6_addr *)&x->props.saddr))) return 1; return 0; } static __inline__ int xfrm_state_addr_check(const struct xfrm_state *x, const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family) { switch (family) { case AF_INET: return __xfrm4_state_addr_check(x, daddr, saddr); case AF_INET6: return __xfrm6_state_addr_check(x, daddr, saddr); } return 0; } static __inline__ int xfrm_state_addr_flow_check(const struct xfrm_state *x, const struct flowi *fl, unsigned short family) { switch (family) { case AF_INET: return __xfrm4_state_addr_check(x, (const xfrm_address_t *)&fl->u.ip4.daddr, (const xfrm_address_t *)&fl->u.ip4.saddr); case AF_INET6: return __xfrm6_state_addr_check(x, (const xfrm_address_t *)&fl->u.ip6.daddr, (const xfrm_address_t *)&fl->u.ip6.saddr); } return 0; } static inline int xfrm_state_kern(const struct xfrm_state *x) { return atomic_read(&x->tunnel_users); } static inline bool xfrm_id_proto_valid(u8 proto) { switch (proto) { case IPPROTO_AH: case IPPROTO_ESP: case IPPROTO_COMP: #if IS_ENABLED(CONFIG_IPV6) case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: #endif return true; default: return false; } } /* IPSEC_PROTO_ANY only matches 3 IPsec protocols, 0 could match all. 
*/ static inline int xfrm_id_proto_match(u8 proto, u8 userproto) { return (!userproto || proto == userproto || (userproto == IPSEC_PROTO_ANY && (proto == IPPROTO_AH || proto == IPPROTO_ESP || proto == IPPROTO_COMP))); } /* * xfrm algorithm information */ struct xfrm_algo_aead_info { char *geniv; u16 icv_truncbits; }; struct xfrm_algo_auth_info { u16 icv_truncbits; u16 icv_fullbits; }; struct xfrm_algo_encr_info { char *geniv; u16 blockbits; u16 defkeybits; }; struct xfrm_algo_comp_info { u16 threshold; }; struct xfrm_algo_desc { char *name; char *compat; u8 available:1; u8 pfkey_supported:1; union { struct xfrm_algo_aead_info aead; struct xfrm_algo_auth_info auth; struct xfrm_algo_encr_info encr; struct xfrm_algo_comp_info comp; } uinfo; struct sadb_alg desc; }; /* XFRM protocol handlers. */ struct xfrm4_protocol { int (*handler)(struct sk_buff *skb); int (*input_handler)(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int (*cb_handler)(struct sk_buff *skb, int err); int (*err_handler)(struct sk_buff *skb, u32 info); struct xfrm4_protocol __rcu *next; int priority; }; struct xfrm6_protocol { int (*handler)(struct sk_buff *skb); int (*input_handler)(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int (*cb_handler)(struct sk_buff *skb, int err); int (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info); struct xfrm6_protocol __rcu *next; int priority; }; /* XFRM tunnel handlers. */ struct xfrm_tunnel { int (*handler)(struct sk_buff *skb); int (*cb_handler)(struct sk_buff *skb, int err); int (*err_handler)(struct sk_buff *skb, u32 info); struct xfrm_tunnel __rcu *next; int priority; }; struct xfrm6_tunnel { int (*handler)(struct sk_buff *skb); int (*cb_handler)(struct sk_buff *skb, int err); int (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info); struct xfrm6_tunnel __rcu *next; int priority; }; void xfrm_init(void); void xfrm4_init(void); int xfrm_state_init(struct net *net); void xfrm_state_fini(struct net *net); void xfrm4_state_init(void); void xfrm4_protocol_init(void); #ifdef CONFIG_XFRM int xfrm6_init(void); void xfrm6_fini(void); int xfrm6_state_init(void); void xfrm6_state_fini(void); int xfrm6_protocol_init(void); void xfrm6_protocol_fini(void); #else static inline int xfrm6_init(void) { return 0; } static inline void xfrm6_fini(void) { ; } #endif #ifdef CONFIG_XFRM_STATISTICS int xfrm_proc_init(struct net *net); void xfrm_proc_fini(struct net *net); #endif int xfrm_sysctl_init(struct net *net); #ifdef CONFIG_SYSCTL void xfrm_sysctl_fini(struct net *net); #else static inline void xfrm_sysctl_fini(struct net *net) { } #endif void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto, struct xfrm_address_filter *filter); int xfrm_state_walk(struct net *net, struct xfrm_state_walk *walk, int (*func)(struct xfrm_state *, int, void*), void *); void xfrm_state_walk_done(struct xfrm_state_walk *walk, struct net *net); struct xfrm_state *xfrm_state_alloc(struct net *net); void xfrm_state_free(struct xfrm_state *x); struct xfrm_state *xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, const struct flowi *fl, struct xfrm_tmpl *tmpl, struct xfrm_policy *pol, int *err, unsigned short family, u32 if_id); struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id, xfrm_address_t *daddr, xfrm_address_t *saddr, unsigned short family, u8 mode, u8 proto, u32 reqid); struct xfrm_state *xfrm_state_lookup_byspi(struct net 
*net, __be32 spi, unsigned short family); int xfrm_state_check_expire(struct xfrm_state *x); void xfrm_state_update_stats(struct net *net); #ifdef CONFIG_XFRM_OFFLOAD static inline void xfrm_dev_state_update_stats(struct xfrm_state *x) { struct xfrm_dev_offload *xdo = &x->xso; struct net_device *dev = READ_ONCE(xdo->dev); if (dev && dev->xfrmdev_ops && dev->xfrmdev_ops->xdo_dev_state_update_stats) dev->xfrmdev_ops->xdo_dev_state_update_stats(x); } #else static inline void xfrm_dev_state_update_stats(struct xfrm_state *x) {} #endif void xfrm_state_insert(struct xfrm_state *x); int xfrm_state_add(struct xfrm_state *x); int xfrm_state_update(struct xfrm_state *x); struct xfrm_state *xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family); struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family); struct xfrm_state *xfrm_state_lookup_byaddr(struct net *net, u32 mark, const xfrm_address_t *daddr, const xfrm_address_t *saddr, u8 proto, unsigned short family); #ifdef CONFIG_XFRM_SUB_POLICY void xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n, unsigned short family); void xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n, unsigned short family); #else static inline void xfrm_tmpl_sort(struct xfrm_tmpl **d, struct xfrm_tmpl **s, int n, unsigned short family) { } static inline void xfrm_state_sort(struct xfrm_state **d, struct xfrm_state **s, int n, unsigned short family) { } #endif struct xfrmk_sadinfo { u32 sadhcnt; /* current hash bkts */ u32 sadhmcnt; /* max allowed hash bkts */ u32 sadcnt; /* current running count */ }; struct xfrmk_spdinfo { u32 incnt; u32 outcnt; u32 fwdcnt; u32 inscnt; u32 outscnt; u32 fwdscnt; u32 spdhcnt; u32 spdhmcnt; }; struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num); int xfrm_state_delete(struct xfrm_state *x); int xfrm_state_flush(struct net *net, u8 proto, bool task_valid); int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid); int xfrm_dev_policy_flush(struct net *net, struct net_device *dev, bool task_valid); void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si); void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq); int xfrm_init_replay(struct xfrm_state *x, struct netlink_ext_ack *extack); u32 xfrm_state_mtu(struct xfrm_state *x, int mtu); int __xfrm_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack); int xfrm_init_state(struct xfrm_state *x); int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int xfrm_input_resume(struct sk_buff *skb, int nexthdr); int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb, int (*finish)(struct net *, struct sock *, struct sk_buff *)); int xfrm_trans_queue(struct sk_buff *skb, int (*finish)(struct net *, struct sock *, struct sk_buff *)); int xfrm_output_resume(struct sock *sk, struct sk_buff *skb, int err); int xfrm_output(struct sock *sk, struct sk_buff *skb); int xfrm4_tunnel_check_size(struct sk_buff *skb); #if IS_ENABLED(CONFIG_IPV6) int xfrm6_tunnel_check_size(struct sk_buff *skb); #else static inline int xfrm6_tunnel_check_size(struct sk_buff *skb) { return -EMSGSIZE; } #endif #if IS_ENABLED(CONFIG_NET_PKTGEN) int pktgen_xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb); #endif void xfrm_local_error(struct sk_buff *skb, int 
mtu); int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int xfrm4_transport_finish(struct sk_buff *skb, int async); int xfrm4_rcv(struct sk_buff *skb); static inline int xfrm4_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) { XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; XFRM_SPI_SKB_CB(skb)->family = AF_INET; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); return xfrm_input(skb, nexthdr, spi, 0); } int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb); int xfrm4_protocol_register(struct xfrm4_protocol *handler, unsigned char protocol); int xfrm4_protocol_deregister(struct xfrm4_protocol *handler, unsigned char protocol); int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family); int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family); void xfrm4_local_error(struct sk_buff *skb, u32 mtu); int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, struct ip6_tnl *t); int xfrm6_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int xfrm6_transport_finish(struct sk_buff *skb, int async); int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t); int xfrm6_rcv(struct sk_buff *skb); int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto); void xfrm6_local_error(struct sk_buff *skb, u32 mtu); int xfrm6_protocol_register(struct xfrm6_protocol *handler, unsigned char protocol); int xfrm6_protocol_deregister(struct xfrm6_protocol *handler, unsigned char protocol); int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family); int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family); __be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr); __be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr); int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb); #ifdef CONFIG_XFRM void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu); int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb); int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb); struct sk_buff *xfrm4_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, struct sk_buff *skb); struct sk_buff *xfrm6_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, struct sk_buff *skb); int xfrm_user_policy(struct sock *sk, int optname, sockptr_t optval, int optlen); #else static inline int xfrm_user_policy(struct sock *sk, int optname, sockptr_t optval, int optlen) { return -ENOPROTOOPT; } #endif struct dst_entry *__xfrm_dst_lookup(int family, const struct xfrm_dst_lookup_params *params); struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp); void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type); int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, int (*func)(struct xfrm_policy *, int, int, void*), void *); void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net); int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl); struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, const struct xfrm_mark *mark, u32 if_id, u8 type, int dir, struct xfrm_selector *sel, struct xfrm_sec_ctx *ctx, int delete, int *err); struct xfrm_policy *xfrm_policy_byid(struct net *net, const struct xfrm_mark *mark, u32 if_id, u8 type, int dir, u32 id, int delete, int *err); int xfrm_policy_flush(struct net *net, u8 type, bool task_valid); void xfrm_policy_hash_rebuild(struct net *net); u32 xfrm_get_acqseq(void); 
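/*
 * Illustrative sketch (not part of the original header): one plausible way the
 * policy walk API declared above could be used to visit every policy of a
 * given type.  Only the declarations themselves come from this header; the
 * callback parameter names (dir, count) and the surrounding helper are
 * assumptions made for the example.
 */
static int example_count_one_policy(struct xfrm_policy *pol, int dir, int count,
				    void *arg)
{
	(*(int *)arg)++;	/* visit the policy; a non-zero return aborts the walk */
	return 0;
}

static int example_count_policies(struct net *net, u8 type)
{
	struct xfrm_policy_walk walk;
	int nr = 0;

	xfrm_policy_walk_init(&walk, type);	/* must be paired with _done() */
	xfrm_policy_walk(net, &walk, example_count_one_policy, &nr);
	xfrm_policy_walk_done(&walk, net);	/* unlinks the walker from the pernet list */
	return nr;
}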
int verify_spi_info(u8 proto, u32 min, u32 max, struct netlink_ext_ack *extack); int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi, struct netlink_ext_ack *extack); struct xfrm_state *xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid, u32 if_id, u32 pcpu_num, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create, unsigned short family); int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol); #ifdef CONFIG_XFRM_MIGRATE int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap); struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net, u32 if_id); struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, struct xfrm_migrate *m, struct xfrm_encap_tmpl *encap, struct net *net, struct xfrm_user_offload *xuo, struct netlink_ext_ack *extack); int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_bundles, struct xfrm_kmaddress *k, struct net *net, struct xfrm_encap_tmpl *encap, u32 if_id, struct netlink_ext_ack *extack, struct xfrm_user_offload *xuo); #endif int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport); void km_policy_expired(struct xfrm_policy *pol, int dir, int hard, u32 portid); int km_report(struct net *net, u8 proto, struct xfrm_selector *sel, xfrm_address_t *addr); void xfrm_input_init(void); int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq); void xfrm_probe_algs(void); int xfrm_count_pfkey_auth_supported(void); int xfrm_count_pfkey_enc_supported(void); struct xfrm_algo_desc *xfrm_aalg_get_byidx(unsigned int idx); struct xfrm_algo_desc *xfrm_ealg_get_byidx(unsigned int idx); struct xfrm_algo_desc *xfrm_aalg_get_byid(int alg_id); struct xfrm_algo_desc *xfrm_ealg_get_byid(int alg_id); struct xfrm_algo_desc *xfrm_calg_get_byid(int alg_id); struct xfrm_algo_desc *xfrm_aalg_get_byname(const char *name, int probe); struct xfrm_algo_desc *xfrm_ealg_get_byname(const char *name, int probe); struct xfrm_algo_desc *xfrm_calg_get_byname(const char *name, int probe); struct xfrm_algo_desc *xfrm_aead_get_byname(const char *name, int icv_len, int probe); static inline bool xfrm6_addr_equal(const xfrm_address_t *a, const xfrm_address_t *b) { return ipv6_addr_equal((const struct in6_addr *)a, (const struct in6_addr *)b); } static inline bool xfrm_addr_equal(const xfrm_address_t *a, const xfrm_address_t *b, sa_family_t family) { switch (family) { default: case AF_INET: return ((__force u32)a->a4 ^ (__force u32)b->a4) == 0; case AF_INET6: return xfrm6_addr_equal(a, b); } } static inline int xfrm_policy_id2dir(u32 index) { return index & 7; } #ifdef CONFIG_XFRM void xfrm_replay_advance(struct xfrm_state *x, __be32 net_seq); int xfrm_replay_check(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq); void xfrm_replay_notify(struct xfrm_state *x, int event); int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb); int xfrm_replay_recheck(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq); static inline int xfrm_aevent_is_on(struct net *net) { struct sock *nlsk; int ret = 0; rcu_read_lock(); nlsk = rcu_dereference(net->xfrm.nlsk); if (nlsk) ret = netlink_has_listeners(nlsk, XFRMNLGRP_AEVENTS); rcu_read_unlock(); return ret; } static inline int xfrm_acquire_is_on(struct net *net) { struct sock *nlsk; int ret = 0; rcu_read_lock(); nlsk = 
rcu_dereference(net->xfrm.nlsk); if (nlsk) ret = netlink_has_listeners(nlsk, XFRMNLGRP_ACQUIRE); rcu_read_unlock(); return ret; } #endif static inline unsigned int aead_len(struct xfrm_algo_aead *alg) { return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); } static inline unsigned int xfrm_alg_len(const struct xfrm_algo *alg) { return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); } static inline unsigned int xfrm_alg_auth_len(const struct xfrm_algo_auth *alg) { return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); } static inline unsigned int xfrm_replay_state_esn_len(struct xfrm_replay_state_esn *replay_esn) { return sizeof(*replay_esn) + replay_esn->bmp_len * sizeof(__u32); } #ifdef CONFIG_XFRM_MIGRATE static inline int xfrm_replay_clone(struct xfrm_state *x, struct xfrm_state *orig) { x->replay_esn = kmemdup(orig->replay_esn, xfrm_replay_state_esn_len(orig->replay_esn), GFP_KERNEL); if (!x->replay_esn) return -ENOMEM; x->preplay_esn = kmemdup(orig->preplay_esn, xfrm_replay_state_esn_len(orig->preplay_esn), GFP_KERNEL); if (!x->preplay_esn) return -ENOMEM; return 0; } static inline struct xfrm_algo_aead *xfrm_algo_aead_clone(struct xfrm_algo_aead *orig) { return kmemdup(orig, aead_len(orig), GFP_KERNEL); } static inline struct xfrm_algo *xfrm_algo_clone(struct xfrm_algo *orig) { return kmemdup(orig, xfrm_alg_len(orig), GFP_KERNEL); } static inline struct xfrm_algo_auth *xfrm_algo_auth_clone(struct xfrm_algo_auth *orig) { return kmemdup(orig, xfrm_alg_auth_len(orig), GFP_KERNEL); } static inline void xfrm_states_put(struct xfrm_state **states, int n) { int i; for (i = 0; i < n; i++) xfrm_state_put(*(states + i)); } static inline void xfrm_states_delete(struct xfrm_state **states, int n) { int i; for (i = 0; i < n; i++) xfrm_state_delete(*(states + i)); } #endif void __init xfrm_dev_init(void); #ifdef CONFIG_XFRM_OFFLOAD void xfrm_dev_resume(struct sk_buff *skb); void xfrm_dev_backlog(struct softnet_data *sd); struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again); int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo, struct netlink_ext_ack *extack); int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, struct xfrm_user_offload *xuo, u8 dir, struct netlink_ext_ack *extack); bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x); void xfrm_dev_state_delete(struct xfrm_state *x); void xfrm_dev_state_free(struct xfrm_state *x); static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x) { struct xfrm_dev_offload *xso = &x->xso; struct net_device *dev = READ_ONCE(xso->dev); if (dev && dev->xfrmdev_ops->xdo_dev_state_advance_esn) dev->xfrmdev_ops->xdo_dev_state_advance_esn(x); } static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) { struct xfrm_state *x = dst->xfrm; struct xfrm_dst *xdst; if (!x || !x->type_offload) return false; xdst = (struct xfrm_dst *) dst; if (!x->xso.offload_handle && !xdst->child->xfrm) return true; if (x->xso.offload_handle && (x->xso.dev == xfrm_dst_path(dst)->dev) && !xdst->child->xfrm) return true; return false; } static inline void xfrm_dev_policy_delete(struct xfrm_policy *x) { struct xfrm_dev_offload *xdo = &x->xdo; struct net_device *dev = xdo->dev; if (dev && dev->xfrmdev_ops && dev->xfrmdev_ops->xdo_dev_policy_delete) dev->xfrmdev_ops->xdo_dev_policy_delete(x); } static inline void xfrm_dev_policy_free(struct xfrm_policy *x) { struct xfrm_dev_offload *xdo = &x->xdo; struct net_device *dev = xdo->dev; if (dev && dev->xfrmdev_ops) { if 
(dev->xfrmdev_ops->xdo_dev_policy_free) dev->xfrmdev_ops->xdo_dev_policy_free(x); xdo->dev = NULL; netdev_put(dev, &xdo->dev_tracker); } } #else static inline void xfrm_dev_resume(struct sk_buff *skb) { } static inline void xfrm_dev_backlog(struct softnet_data *sd) { } static inline struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again) { return skb; } static inline int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo, struct netlink_ext_ack *extack) { return 0; } static inline void xfrm_dev_state_delete(struct xfrm_state *x) { } static inline void xfrm_dev_state_free(struct xfrm_state *x) { } static inline int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, struct xfrm_user_offload *xuo, u8 dir, struct netlink_ext_ack *extack) { return 0; } static inline void xfrm_dev_policy_delete(struct xfrm_policy *x) { } static inline void xfrm_dev_policy_free(struct xfrm_policy *x) { } static inline bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) { return false; } static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x) { } static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) { return false; } #endif static inline int xfrm_mark_get(struct nlattr **attrs, struct xfrm_mark *m) { if (attrs[XFRMA_MARK]) memcpy(m, nla_data(attrs[XFRMA_MARK]), sizeof(struct xfrm_mark)); else m->v = m->m = 0; return m->v & m->m; } static inline int xfrm_mark_put(struct sk_buff *skb, const struct xfrm_mark *m) { int ret = 0; if (m->m | m->v) ret = nla_put(skb, XFRMA_MARK, sizeof(struct xfrm_mark), m); return ret; } static inline __u32 xfrm_smark_get(__u32 mark, struct xfrm_state *x) { struct xfrm_mark *m = &x->props.smark; return (m->v & m->m) | (mark & ~m->m); } static inline int xfrm_if_id_put(struct sk_buff *skb, __u32 if_id) { int ret = 0; if (if_id) ret = nla_put_u32(skb, XFRMA_IF_ID, if_id); return ret; } static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x, unsigned int family) { bool tunnel = false; switch(family) { case AF_INET: if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4) tunnel = true; break; case AF_INET6: if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6) tunnel = true; break; } if (tunnel && !(x->outer_mode.flags & XFRM_MODE_FLAG_TUNNEL)) return -EINVAL; return 0; } extern const int xfrm_msg_min[XFRM_NR_MSGTYPES]; extern const struct nla_policy xfrma_policy[XFRMA_MAX+1]; struct xfrm_translator { /* Allocate frag_list and put compat translation there */ int (*alloc_compat)(struct sk_buff *skb, const struct nlmsghdr *src); /* Allocate nlmsg with 64-bit translation of received 32-bit message */ struct nlmsghdr *(*rcv_msg_compat)(const struct nlmsghdr *nlh, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack); /* Translate 32-bit user_policy from sockptr */ int (*xlate_user_policy_sockptr)(u8 **pdata32, int optlen); struct module *owner; }; #if IS_ENABLED(CONFIG_XFRM_USER_COMPAT) extern int xfrm_register_translator(struct xfrm_translator *xtr); extern int xfrm_unregister_translator(struct xfrm_translator *xtr); extern struct xfrm_translator *xfrm_get_translator(void); extern void xfrm_put_translator(struct xfrm_translator *xtr); #else static inline struct xfrm_translator *xfrm_get_translator(void) { return NULL; } static inline void xfrm_put_translator(struct xfrm_translator *xtr) { } #endif #if IS_ENABLED(CONFIG_IPV6) static inline bool xfrm6_local_dontfrag(const struct sock *sk) { int proto; if (!sk || sk->sk_family != AF_INET6) return false;
proto = sk->sk_protocol; if (proto == IPPROTO_UDP || proto == IPPROTO_RAW) return inet6_test_bit(DONTFRAG, sk); return false; } #endif #if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) extern struct metadata_dst __percpu *xfrm_bpf_md_dst; int register_xfrm_interface_bpf(void); #else static inline int register_xfrm_interface_bpf(void) { return 0; } #endif #if IS_ENABLED(CONFIG_DEBUG_INFO_BTF) int register_xfrm_state_bpf(void); #else static inline int register_xfrm_state_bpf(void) { return 0; } #endif int xfrm_nat_keepalive_init(unsigned short family); void xfrm_nat_keepalive_fini(unsigned short family); int xfrm_nat_keepalive_net_init(struct net *net); int xfrm_nat_keepalive_net_fini(struct net *net); void xfrm_nat_keepalive_state_updated(struct xfrm_state *x); #endif /* _NET_XFRM_H */
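/*
 * Illustrative sketch (not part of the original sources): the general shape of
 * a transform type built on the xfrm_type callbacks declared in the header
 * above.  In the kernel proper this kind of code lives in net/ipv4/esp4.c,
 * net/ipv4/ah4.c and similar files; the callback bodies, names and the chosen
 * protocol number here are placeholders.
 */
static int example_type_init_state(struct xfrm_state *x,
				   struct netlink_ext_ack *extack)
{
	/* validate x->props and allocate any per-SA data here */
	return 0;
}

static int example_type_input(struct xfrm_state *x, struct sk_buff *skb)
{
	/* verify and strip the protocol header for this SA */
	return -EINVAL;
}

static int example_type_output(struct xfrm_state *x, struct sk_buff *skb)
{
	/* add the protocol header for this SA */
	return -EINVAL;
}

static const struct xfrm_type example_xfrm_type = {
	.owner		= THIS_MODULE,
	.proto		= IPPROTO_ESP,		/* placeholder protocol number */
	.flags		= XFRM_TYPE_REPLAY_PROT,
	.init_state	= example_type_init_state,
	.input		= example_type_input,
	.output		= example_type_output,
};

/*
 * A module would then register the type per address family on init and
 * unregister it on exit:
 *
 *	xfrm_register_type(&example_xfrm_type, AF_INET);
 *	...
 *	xfrm_unregister_type(&example_xfrm_type, AF_INET);
 */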
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002 Andi Kleen, SuSE Labs. * Thanks to Ben LaHaise for precious feedback.
*/ #include <linux/highmem.h> #include <linux/memblock.h> #include <linux/sched.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/debugfs.h> #include <linux/pfn.h> #include <linux/percpu.h> #include <linux/gfp.h> #include <linux/pci.h> #include <linux/vmalloc.h> #include <linux/libnvdimm.h> #include <linux/vmstat.h> #include <linux/kernel.h> #include <linux/cc_platform.h> #include <linux/set_memory.h> #include <linux/memregion.h> #include <asm/e820/api.h> #include <asm/processor.h> #include <asm/tlbflush.h> #include <asm/sections.h> #include <asm/setup.h> #include <linux/uaccess.h> #include <asm/pgalloc.h> #include <asm/proto.h> #include <asm/memtype.h> #include "../mm_internal.h" /* * The current flushing context - we pass it instead of 5 arguments: */ struct cpa_data { unsigned long *vaddr; pgd_t *pgd; pgprot_t mask_set; pgprot_t mask_clr; unsigned long numpages; unsigned long curpage; unsigned long pfn; unsigned int flags; unsigned int force_split : 1, force_static_prot : 1, force_flush_all : 1; struct page **pages; }; enum cpa_warn { CPA_CONFLICT, CPA_PROTECT, CPA_DETECT, }; static const int cpa_warn_level = CPA_PROTECT; /* * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) * using cpa_lock. So that we don't allow any other cpu, with stale large tlb * entries change the page attribute in parallel to some other cpu * splitting a large page entry along with changing the attribute. */ static DEFINE_SPINLOCK(cpa_lock); #define CPA_FLUSHTLB 1 #define CPA_ARRAY 2 #define CPA_PAGES_ARRAY 4 #define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */ #define CPA_COLLAPSE 16 /* try to collapse large pages */ static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) { return __pgprot(cachemode2protval(pcm)); } #ifdef CONFIG_PROC_FS static unsigned long direct_pages_count[PG_LEVEL_NUM]; void update_page_count(int level, unsigned long pages) { /* Protect against CPA */ spin_lock(&pgd_lock); direct_pages_count[level] += pages; spin_unlock(&pgd_lock); } static void split_page_count(int level) { if (direct_pages_count[level] == 0) return; direct_pages_count[level]--; if (system_state == SYSTEM_RUNNING) { if (level == PG_LEVEL_2M) count_vm_event(DIRECT_MAP_LEVEL2_SPLIT); else if (level == PG_LEVEL_1G) count_vm_event(DIRECT_MAP_LEVEL3_SPLIT); } direct_pages_count[level - 1] += PTRS_PER_PTE; } static void collapse_page_count(int level) { direct_pages_count[level]++; if (system_state == SYSTEM_RUNNING) { if (level == PG_LEVEL_2M) count_vm_event(DIRECT_MAP_LEVEL2_COLLAPSE); else if (level == PG_LEVEL_1G) count_vm_event(DIRECT_MAP_LEVEL3_COLLAPSE); } direct_pages_count[level - 1] -= PTRS_PER_PTE; } void arch_report_meminfo(struct seq_file *m) { seq_printf(m, "DirectMap4k: %8lu kB\n", direct_pages_count[PG_LEVEL_4K] << 2); #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) seq_printf(m, "DirectMap2M: %8lu kB\n", direct_pages_count[PG_LEVEL_2M] << 11); #else seq_printf(m, "DirectMap4M: %8lu kB\n", direct_pages_count[PG_LEVEL_2M] << 12); #endif if (direct_gbpages) seq_printf(m, "DirectMap1G: %8lu kB\n", direct_pages_count[PG_LEVEL_1G] << 20); } #else static inline void split_page_count(int level) { } static inline void collapse_page_count(int level) { } #endif #ifdef CONFIG_X86_CPA_STATISTICS static unsigned long cpa_1g_checked; static unsigned long cpa_1g_sameprot; static unsigned long cpa_1g_preserved; static unsigned long cpa_2m_checked; static unsigned long cpa_2m_sameprot; static unsigned long 
cpa_2m_preserved; static unsigned long cpa_4k_install; static inline void cpa_inc_1g_checked(void) { cpa_1g_checked++; } static inline void cpa_inc_2m_checked(void) { cpa_2m_checked++; } static inline void cpa_inc_4k_install(void) { data_race(cpa_4k_install++); } static inline void cpa_inc_lp_sameprot(int level) { if (level == PG_LEVEL_1G) cpa_1g_sameprot++; else cpa_2m_sameprot++; } static inline void cpa_inc_lp_preserved(int level) { if (level == PG_LEVEL_1G) cpa_1g_preserved++; else cpa_2m_preserved++; } static int cpastats_show(struct seq_file *m, void *p) { seq_printf(m, "1G pages checked: %16lu\n", cpa_1g_checked); seq_printf(m, "1G pages sameprot: %16lu\n", cpa_1g_sameprot); seq_printf(m, "1G pages preserved: %16lu\n", cpa_1g_preserved); seq_printf(m, "2M pages checked: %16lu\n", cpa_2m_checked); seq_printf(m, "2M pages sameprot: %16lu\n", cpa_2m_sameprot); seq_printf(m, "2M pages preserved: %16lu\n", cpa_2m_preserved); seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install); return 0; } static int cpastats_open(struct inode *inode, struct file *file) { return single_open(file, cpastats_show, NULL); } static const struct file_operations cpastats_fops = { .open = cpastats_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; static int __init cpa_stats_init(void) { debugfs_create_file("cpa_stats", S_IRUSR, arch_debugfs_dir, NULL, &cpastats_fops); return 0; } late_initcall(cpa_stats_init); #else static inline void cpa_inc_1g_checked(void) { } static inline void cpa_inc_2m_checked(void) { } static inline void cpa_inc_4k_install(void) { } static inline void cpa_inc_lp_sameprot(int level) { } static inline void cpa_inc_lp_preserved(int level) { } #endif static inline int within(unsigned long addr, unsigned long start, unsigned long end) { return addr >= start && addr < end; } #ifdef CONFIG_X86_64 static inline int within_inclusive(unsigned long addr, unsigned long start, unsigned long end) { return addr >= start && addr <= end; } /* * The kernel image is mapped into two places in the virtual address space * (addresses without KASLR, of course): * * 1. The kernel direct map (0xffff880000000000) * 2. The "high kernel map" (0xffffffff81000000) * * We actually execute out of #2. If we get the address of a kernel symbol, it * points to #2, but almost all physical-to-virtual translations point to #1. * * This is so that we can have both a directmap of all physical memory *and* * take full advantage of the limited (s32) immediate addressing range (2G) * of x86_64. * * See Documentation/arch/x86/x86_64/mm.rst for more detail. */ static inline unsigned long highmap_start_pfn(void) { return __pa_symbol(_text) >> PAGE_SHIFT; } static inline unsigned long highmap_end_pfn(void) { /* Do not reference physical address outside the kernel. */ return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT; } static bool __cpa_pfn_in_highmap(unsigned long pfn) { /* * Kernel text has an alias mapping at a high address, known * here as "highmap". */ return within_inclusive(pfn, highmap_start_pfn(), highmap_end_pfn()); } #else static bool __cpa_pfn_in_highmap(unsigned long pfn) { /* There is no highmap on 32-bit */ return false; } #endif /* * See set_mce_nospec(). * * Machine check recovery code needs to change cache mode of poisoned pages to * UC to avoid speculative access logging another error. But passing the * address of the 1:1 mapping to set_memory_uc() is a fine way to encourage a * speculative access. So we cheat and flip the top bit of the address. 
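 *
 * Editor's illustrative sketch (not part of the original file): how the
 * bit flip round-trips.  set_mce_nospec() further down builds a decoy
 * address with bit 63 cleared, and fix_addr() just below sign-extends
 * from bit 62 so the address is canonical again before INVLPG/CLFLUSH
 * see it.  Assumes x86-64 and a pfn inside the direct map; the helper
 * name is made up:
 *
 *	static inline bool decoy_addr_round_trips(unsigned long pfn)
 *	{
 *		unsigned long kaddr = (unsigned long)pfn_to_kaddr(pfn);
 *		// for a direct-map address this equals what set_mce_nospec() computes
 *		unsigned long decoy = kaddr ^ BIT(63);
 *
 *		// the same shift pair fix_addr() applies on x86-64
 *		return (unsigned long)((long)(decoy << 1) >> 1) == kaddr;
 *	}
 *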
This * works fine for the code that updates the page tables. But at the end of the * process we need to flush the TLB and cache and the non-canonical address * causes a #GP fault when used by the INVLPG and CLFLUSH instructions. * * But in the common case we already have a canonical address. This code * will fix the top bit if needed and is a no-op otherwise. */ static inline unsigned long fix_addr(unsigned long addr) { #ifdef CONFIG_X86_64 return (long)(addr << 1) >> 1; #else return addr; #endif } static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx) { if (cpa->flags & CPA_PAGES_ARRAY) { struct page *page = cpa->pages[idx]; if (unlikely(PageHighMem(page))) return 0; return (unsigned long)page_address(page); } if (cpa->flags & CPA_ARRAY) return cpa->vaddr[idx]; return *cpa->vaddr + idx * PAGE_SIZE; } /* * Flushing functions */ static void clflush_cache_range_opt(void *vaddr, unsigned int size) { const unsigned long clflush_size = boot_cpu_data.x86_clflush_size; void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1)); void *vend = vaddr + size; if (p >= vend) return; for (; p < vend; p += clflush_size) clflushopt(p); } /** * clflush_cache_range - flush a cache range with clflush * @vaddr: virtual start address * @size: number of bytes to flush * * CLFLUSHOPT is an unordered instruction which needs fencing with MFENCE or * SFENCE to avoid ordering issues. */ void clflush_cache_range(void *vaddr, unsigned int size) { mb(); clflush_cache_range_opt(vaddr, size); mb(); } EXPORT_SYMBOL_GPL(clflush_cache_range); #ifdef CONFIG_ARCH_HAS_PMEM_API void arch_invalidate_pmem(void *addr, size_t size) { clflush_cache_range(addr, size); } EXPORT_SYMBOL_GPL(arch_invalidate_pmem); #endif #ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION bool cpu_cache_has_invalidate_memregion(void) { return !cpu_feature_enabled(X86_FEATURE_HYPERVISOR); } EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, "DEVMEM"); int cpu_cache_invalidate_memregion(phys_addr_t start, size_t len) { if (WARN_ON_ONCE(!cpu_cache_has_invalidate_memregion())) return -ENXIO; wbinvd_on_all_cpus(); return 0; } EXPORT_SYMBOL_NS_GPL(cpu_cache_invalidate_memregion, "DEVMEM"); #endif static void __cpa_flush_all(void *arg) { unsigned long cache = (unsigned long)arg; /* * Flush all to work around Errata in early athlons regarding * large page flushing. 
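 *
 * Editor's illustrative usage sketch (not part of the original file):
 * clflush_cache_range() above is the exported helper for writing back and
 * invalidating an arbitrary byte range with properly fenced CLFLUSHOPT,
 * e.g. after CPU stores to write-back mapped persistent memory.  "buf"
 * and "len" are hypothetical:
 *
 *	static void example_flush_range(void *buf, unsigned int len)
 *	{
 *		// ... ordinary cached stores into buf ...
 *		clflush_cache_range(buf, len);
 *	}
 *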
*/ __flush_tlb_all(); if (cache && boot_cpu_data.x86 >= 4) wbinvd(); } static void cpa_flush_all(unsigned long cache) { BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); on_each_cpu(__cpa_flush_all, (void *) cache, 1); } static int collapse_large_pages(unsigned long addr, struct list_head *pgtables); static void cpa_collapse_large_pages(struct cpa_data *cpa) { unsigned long start, addr, end; struct ptdesc *ptdesc, *tmp; LIST_HEAD(pgtables); int collapsed = 0; int i; if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { for (i = 0; i < cpa->numpages; i++) collapsed += collapse_large_pages(__cpa_addr(cpa, i), &pgtables); } else { addr = __cpa_addr(cpa, 0); start = addr & PMD_MASK; end = addr + PAGE_SIZE * cpa->numpages; for (addr = start; within(addr, start, end); addr += PMD_SIZE) collapsed += collapse_large_pages(addr, &pgtables); } if (!collapsed) return; flush_tlb_all(); list_for_each_entry_safe(ptdesc, tmp, &pgtables, pt_list) { list_del(&ptdesc->pt_list); pagetable_free(ptdesc); } } static void cpa_flush(struct cpa_data *cpa, int cache) { unsigned long start, end; unsigned int i; BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { cpa_flush_all(cache); goto collapse_large_pages; } start = fix_addr(__cpa_addr(cpa, 0)); end = start + cpa->numpages * PAGE_SIZE; if (cpa->force_flush_all) end = TLB_FLUSH_ALL; flush_tlb_kernel_range(start, end); if (!cache) goto collapse_large_pages; mb(); for (i = 0; i < cpa->numpages; i++) { unsigned long addr = __cpa_addr(cpa, i); unsigned int level; pte_t *pte = lookup_address(addr, &level); /* * Only flush present addresses: */ if (pte && (pte_val(*pte) & _PAGE_PRESENT)) clflush_cache_range_opt((void *)fix_addr(addr), PAGE_SIZE); } mb(); collapse_large_pages: if (cpa->flags & CPA_COLLAPSE) cpa_collapse_large_pages(cpa); } static bool overlaps(unsigned long r1_start, unsigned long r1_end, unsigned long r2_start, unsigned long r2_end) { return (r1_start <= r2_end && r1_end >= r2_start) || (r2_start <= r1_end && r2_end >= r1_start); } #ifdef CONFIG_PCI_BIOS /* * The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS * based config access (CONFIG_PCI_GOBIOS) support. */ #define BIOS_PFN PFN_DOWN(BIOS_BEGIN) #define BIOS_PFN_END PFN_DOWN(BIOS_END - 1) static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn) { if (pcibios_enabled && overlaps(spfn, epfn, BIOS_PFN, BIOS_PFN_END)) return _PAGE_NX; return 0; } #else static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn) { return 0; } #endif /* * The .rodata section needs to be read-only. Using the pfn catches all * aliases. This also includes __ro_after_init, so do not enforce until * kernel_set_to_readonly is true. */ static pgprotval_t protect_rodata(unsigned long spfn, unsigned long epfn) { unsigned long epfn_ro, spfn_ro = PFN_DOWN(__pa_symbol(__start_rodata)); /* * Note: __end_rodata is at page aligned and not inclusive, so * subtract 1 to get the last enforced PFN in the rodata area. */ epfn_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1; if (kernel_set_to_readonly && overlaps(spfn, epfn, spfn_ro, epfn_ro)) return _PAGE_RW; return 0; } /* * Protect kernel text against becoming non executable by forbidding * _PAGE_NX. This protects only the high kernel mapping (_text -> _etext) * out of which the kernel actually executes. Do not protect the low * mapping. * * This does not cover __inittext since that is gone after boot. 
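 *
 * Editor's worked example (not part of the original file): the protect_*()
 * helpers pass inclusive bounds to overlaps() above, so ranges that merely
 * touch at an endpoint still count as overlapping:
 *
 *	overlaps(0x1000, 0x1fff, 0x1fff, 0x2fff) -> true  (shared last unit)
 *	overlaps(0x1000, 0x1fff, 0x2000, 0x2fff) -> false (adjacent only)
 *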
*/ static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end) { unsigned long t_end = (unsigned long)_etext - 1; unsigned long t_start = (unsigned long)_text; if (overlaps(start, end, t_start, t_end)) return _PAGE_NX; return 0; } #if defined(CONFIG_X86_64) /* * Once the kernel maps the text as RO (kernel_set_to_readonly is set), * kernel text mappings for the large page aligned text, rodata sections * will be always read-only. For the kernel identity mappings covering the * holes caused by this alignment can be anything that user asks. * * This will preserve the large page mappings for kernel text/data at no * extra cost. */ static pgprotval_t protect_kernel_text_ro(unsigned long start, unsigned long end) { unsigned long t_end = (unsigned long)__end_rodata_hpage_align - 1; unsigned long t_start = (unsigned long)_text; unsigned int level; if (!kernel_set_to_readonly || !overlaps(start, end, t_start, t_end)) return 0; /* * Don't enforce the !RW mapping for the kernel text mapping, if * the current mapping is already using small page mapping. No * need to work hard to preserve large page mappings in this case. * * This also fixes the Linux Xen paravirt guest boot failure caused * by unexpected read-only mappings for kernel identity * mappings. In this paravirt guest case, the kernel text mapping * and the kernel identity mapping share the same page-table pages, * so the protections for kernel text and identity mappings have to * be the same. */ if (lookup_address(start, &level) && (level != PG_LEVEL_4K)) return _PAGE_RW; return 0; } #else static pgprotval_t protect_kernel_text_ro(unsigned long start, unsigned long end) { return 0; } #endif static inline bool conflicts(pgprot_t prot, pgprotval_t val) { return (pgprot_val(prot) & ~val) != pgprot_val(prot); } static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val, unsigned long start, unsigned long end, unsigned long pfn, const char *txt) { static const char *lvltxt[] = { [CPA_CONFLICT] = "conflict", [CPA_PROTECT] = "protect", [CPA_DETECT] = "detect", }; if (warnlvl > cpa_warn_level || !conflicts(prot, val)) return; pr_warn("CPA %8s %10s: 0x%016lx - 0x%016lx PFN %lx req %016llx prevent %016llx\n", lvltxt[warnlvl], txt, start, end, pfn, (unsigned long long)pgprot_val(prot), (unsigned long long)val); } /* * Certain areas of memory on x86 require very specific protection flags, * for example the BIOS area or kernel text. Callers don't always get this * right (again, ioremap() on BIOS memory is not uncommon) so this function * checks and fixes these known static required protection bits. */ static inline pgprot_t static_protections(pgprot_t prot, unsigned long start, unsigned long pfn, unsigned long npg, unsigned long lpsize, int warnlvl) { pgprotval_t forbidden, res; unsigned long end; /* * There is no point in checking RW/NX conflicts when the requested * mapping is setting the page !PRESENT. */ if (!(pgprot_val(prot) & _PAGE_PRESENT)) return prot; /* Operate on the virtual address */ end = start + npg * PAGE_SIZE - 1; res = protect_kernel_text(start, end); check_conflict(warnlvl, prot, res, start, end, pfn, "Text NX"); forbidden = res; /* * Special case to preserve a large page. If the change spawns the * full large page mapping then there is no point to split it * up. Happens with ftrace and is going to be removed once ftrace * switched to text_poke(). 
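 *
 * Editor's worked example (not part of the original file): the net effect
 * of these checks is to strip forbidden bits from the requested pgprot.
 * A hypothetical request to make the first page of kernel text writable
 * and non-executable,
 *
 *	static_protections(PAGE_KERNEL, (unsigned long)_text,
 *			   PFN_DOWN(__pa_symbol(_text)), 1, 0, CPA_DETECT)
 *
 * comes back with _PAGE_NX stripped (kernel text must stay executable),
 * and may lose _PAGE_RW as well once kernel_set_to_readonly is set.
 * Likewise a request for _PAGE_RW over .rodata comes back without
 * _PAGE_RW once the kernel is read-only.
 *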
*/ if (lpsize != (npg * PAGE_SIZE) || (start & (lpsize - 1))) { res = protect_kernel_text_ro(start, end); check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO"); forbidden |= res; } /* Check the PFN directly */ res = protect_pci_bios(pfn, pfn + npg - 1); check_conflict(warnlvl, prot, res, start, end, pfn, "PCIBIOS NX"); forbidden |= res; res = protect_rodata(pfn, pfn + npg - 1); check_conflict(warnlvl, prot, res, start, end, pfn, "Rodata RO"); forbidden |= res; return __pgprot(pgprot_val(prot) & ~forbidden); } /* * Validate strict W^X semantics. */ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long start, unsigned long pfn, unsigned long npg, bool nx, bool rw) { unsigned long end; /* * 32-bit has some unfixable W+X issues, like EFI code * and writeable data being in the same page. Disable * detection and enforcement there. */ if (IS_ENABLED(CONFIG_X86_32)) return new; /* Only verify when NX is supported: */ if (!(__supported_pte_mask & _PAGE_NX)) return new; if (!((pgprot_val(old) ^ pgprot_val(new)) & (_PAGE_RW | _PAGE_NX))) return new; if ((pgprot_val(new) & (_PAGE_RW | _PAGE_NX)) != _PAGE_RW) return new; /* Non-leaf translation entries can disable writing or execution. */ if (!rw || nx) return new; end = start + npg * PAGE_SIZE - 1; WARN_ONCE(1, "CPA detected W^X violation: %016llx -> %016llx range: 0x%016lx - 0x%016lx PFN %lx\n", (unsigned long long)pgprot_val(old), (unsigned long long)pgprot_val(new), start, end, pfn); /* * For now, allow all permission change attempts by returning the * attempted permissions. This can 'return old' to actively * refuse the permission change at a later time. */ return new; } /* * Lookup the page table entry for a virtual address in a specific pgd. * Return a pointer to the entry (or NULL if the entry does not exist), * the level of the entry, and the effective NX and RW bits of all * page table levels. */ pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address, unsigned int *level, bool *nx, bool *rw) { p4d_t *p4d; pud_t *pud; pmd_t *pmd; *level = PG_LEVEL_256T; *nx = false; *rw = true; if (pgd_none(*pgd)) return NULL; *level = PG_LEVEL_512G; *nx |= pgd_flags(*pgd) & _PAGE_NX; *rw &= pgd_flags(*pgd) & _PAGE_RW; p4d = p4d_offset(pgd, address); if (p4d_none(*p4d)) return NULL; if (p4d_leaf(*p4d) || !p4d_present(*p4d)) return (pte_t *)p4d; *level = PG_LEVEL_1G; *nx |= p4d_flags(*p4d) & _PAGE_NX; *rw &= p4d_flags(*p4d) & _PAGE_RW; pud = pud_offset(p4d, address); if (pud_none(*pud)) return NULL; if (pud_leaf(*pud) || !pud_present(*pud)) return (pte_t *)pud; *level = PG_LEVEL_2M; *nx |= pud_flags(*pud) & _PAGE_NX; *rw &= pud_flags(*pud) & _PAGE_RW; pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) return NULL; if (pmd_leaf(*pmd) || !pmd_present(*pmd)) return (pte_t *)pmd; *level = PG_LEVEL_4K; *nx |= pmd_flags(*pmd) & _PAGE_NX; *rw &= pmd_flags(*pmd) & _PAGE_RW; return pte_offset_kernel(pmd, address); } /* * Lookup the page table entry for a virtual address in a specific pgd. * Return a pointer to the entry and the level of the mapping. */ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, unsigned int *level) { bool nx, rw; return lookup_address_in_pgd_attr(pgd, address, level, &nx, &rw); } /* * Lookup the page table entry for a virtual address. Return a pointer * to the entry and the level of the mapping. * * Note: the function returns p4d, pud or pmd either when the entry is marked * large or when the present bit is not set. Otherwise it returns NULL. 
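 *
 * Editor's illustrative usage sketch (not part of the original file): a
 * typical caller of lookup_address() (defined just below) checks the
 * returned level before treating the pointer as a 4K PTE, much like
 * cpa_flush() above does.  The helper name is made up:
 *
 *	static bool addr_is_mapped_4k(unsigned long addr)
 *	{
 *		unsigned int level;
 *		pte_t *pte = lookup_address(addr, &level);
 *
 *		return pte && (pte_val(*pte) & _PAGE_PRESENT) &&
 *		       level == PG_LEVEL_4K;
 *	}
 *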
*/ pte_t *lookup_address(unsigned long address, unsigned int *level) { return lookup_address_in_pgd(pgd_offset_k(address), address, level); } EXPORT_SYMBOL_GPL(lookup_address); static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, unsigned int *level, bool *nx, bool *rw) { pgd_t *pgd; if (!cpa->pgd) pgd = pgd_offset_k(address); else pgd = cpa->pgd + pgd_index(address); return lookup_address_in_pgd_attr(pgd, address, level, nx, rw); } /* * Lookup the PMD entry for a virtual address. Return a pointer to the entry * or NULL if not present. */ pmd_t *lookup_pmd_address(unsigned long address) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pgd = pgd_offset_k(address); if (pgd_none(*pgd)) return NULL; p4d = p4d_offset(pgd, address); if (p4d_none(*p4d) || p4d_leaf(*p4d) || !p4d_present(*p4d)) return NULL; pud = pud_offset(p4d, address); if (pud_none(*pud) || pud_leaf(*pud) || !pud_present(*pud)) return NULL; return pmd_offset(pud, address); } /* * This is necessary because __pa() does not work on some * kinds of memory, like vmalloc() or the alloc_remap() * areas on 32-bit NUMA systems. The percpu areas can * end up in this kind of memory, for instance. * * Note that as long as the PTEs are well-formed with correct PFNs, this * works without checking the PRESENT bit in the leaf PTE. This is unlike * the similar vmalloc_to_page() and derivatives. Callers may depend on * this behavior. * * This could be optimized, but it is only used in paths that are not perf * sensitive, and keeping it unoptimized should increase the testing coverage * for the more obscure platforms. */ phys_addr_t slow_virt_to_phys(void *__virt_addr) { unsigned long virt_addr = (unsigned long)__virt_addr; phys_addr_t phys_addr; unsigned long offset; enum pg_level level; pte_t *pte; pte = lookup_address(virt_addr, &level); BUG_ON(!pte); /* * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t * before being left-shifted PAGE_SHIFT bits -- this trick is to * make 32-PAE kernel work correctly. */ switch (level) { case PG_LEVEL_1G: phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT; offset = virt_addr & ~PUD_MASK; break; case PG_LEVEL_2M: phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT; offset = virt_addr & ~PMD_MASK; break; default: phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; offset = virt_addr & ~PAGE_MASK; } return (phys_addr_t)(phys_addr | offset); } EXPORT_SYMBOL_GPL(slow_virt_to_phys); /* * Set the new pmd in all the pgds we know about: */ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) { /* change init_mm */ set_pte_atomic(kpte, pte); #ifdef CONFIG_X86_32 { struct page *page; list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = (pgd_t *)page_address(page) + pgd_index(address); p4d = p4d_offset(pgd, address); pud = pud_offset(p4d, address); pmd = pmd_offset(pud, address); set_pte_atomic((pte_t *)pmd, pte); } } #endif } static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot) { /* * _PAGE_GLOBAL means "global page" for present PTEs. * But, it is also used to indicate _PAGE_PROTNONE * for non-present PTEs. * * This ensures that a _PAGE_GLOBAL PTE going from * present to non-present is not confused as * _PAGE_PROTNONE. 
*/ if (!(pgprot_val(prot) & _PAGE_PRESENT)) pgprot_val(prot) &= ~_PAGE_GLOBAL; return prot; } static int __should_split_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa) { unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn; pgprot_t old_prot, new_prot, req_prot, chk_prot; pte_t new_pte, *tmp; enum pg_level level; bool nx, rw; /* * Check for races, another CPU might have split this page * up already: */ tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw); if (tmp != kpte) return 1; switch (level) { case PG_LEVEL_2M: old_prot = pmd_pgprot(*(pmd_t *)kpte); old_pfn = pmd_pfn(*(pmd_t *)kpte); cpa_inc_2m_checked(); break; case PG_LEVEL_1G: old_prot = pud_pgprot(*(pud_t *)kpte); old_pfn = pud_pfn(*(pud_t *)kpte); cpa_inc_1g_checked(); break; default: return -EINVAL; } psize = page_level_size(level); pmask = page_level_mask(level); /* * Calculate the number of pages, which fit into this large * page starting at address: */ lpaddr = (address + psize) & pmask; numpages = (lpaddr - address) >> PAGE_SHIFT; if (numpages < cpa->numpages) cpa->numpages = numpages; /* * We are safe now. Check whether the new pgprot is the same: * Convert protection attributes to 4k-format, as cpa->mask* are set * up accordingly. */ /* Clear PSE (aka _PAGE_PAT) and move PAT bit to correct position */ req_prot = pgprot_large_2_4k(old_prot); pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); /* * req_prot is in format of 4k pages. It must be converted to large * page format: the caching mode includes the PAT bit located at * different bit positions in the two formats. */ req_prot = pgprot_4k_2_large(req_prot); req_prot = pgprot_clear_protnone_bits(req_prot); if (pgprot_val(req_prot) & _PAGE_PRESENT) pgprot_val(req_prot) |= _PAGE_PSE; /* * old_pfn points to the large page base pfn. So we need to add the * offset of the virtual address: */ pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT); cpa->pfn = pfn; /* * Calculate the large page base address and the number of 4K pages * in the large page */ lpaddr = address & pmask; numpages = psize >> PAGE_SHIFT; /* * Sanity check that the existing mapping is correct versus the static * protections. static_protections() guards against !PRESENT, so no * extra conditional required here. */ chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages, psize, CPA_CONFLICT); if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) { /* * Split the large page and tell the split code to * enforce static protections. */ cpa->force_static_prot = 1; return 1; } /* * Optimization: If the requested pgprot is the same as the current * pgprot, then the large page can be preserved and no updates are * required independent of alignment and length of the requested * range. The above already established that the current pgprot is * correct, which in consequence makes the requested pgprot correct * as well if it is the same. The static protection scan below will * not come to a different conclusion. */ if (pgprot_val(req_prot) == pgprot_val(old_prot)) { cpa_inc_lp_sameprot(level); return 0; } /* * If the requested range does not cover the full page, split it up */ if (address != lpaddr || cpa->numpages != numpages) return 1; /* * Check whether the requested pgprot is conflicting with a static * protection requirement in the large page. 
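 *
 * Editor's worked example (not part of the original file) for the clamp
 * computed near the top of this function, with hypothetical numbers: for
 * a 2M mapping and address 0xffff888001f80000,
 *
 *	lpaddr   = (address + psize) & pmask        = 0xffff888002000000
 *	numpages = (lpaddr - address) >> PAGE_SHIFT = 0x80000 >> 12 = 128
 *
 * so only 128 4K pages fit between the address and the end of the
 * enclosing 2M page, and cpa->numpages is clamped to at most 128.
 *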
*/ new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages, psize, CPA_DETECT); new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages, nx, rw); /* * If there is a conflict, split the large page. * * There used to be a 4k wise evaluation trying really hard to * preserve the large pages, but experimentation has shown, that this * does not help at all. There might be corner cases which would * preserve one large page occasionally, but it's really not worth the * extra code and cycles for the common case. */ if (pgprot_val(req_prot) != pgprot_val(new_prot)) return 1; /* All checks passed. Update the large page mapping. */ new_pte = pfn_pte(old_pfn, new_prot); __set_pmd_pte(kpte, address, new_pte); cpa->flags |= CPA_FLUSHTLB; cpa_inc_lp_preserved(level); return 0; } static int should_split_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa) { int do_split; if (cpa->force_split) return 1; spin_lock(&pgd_lock); do_split = __should_split_large_page(kpte, address, cpa); spin_unlock(&pgd_lock); return do_split; } static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn, pgprot_t ref_prot, unsigned long address, unsigned long size) { unsigned int npg = PFN_DOWN(size); pgprot_t prot; /* * If should_split_large_page() discovered an inconsistent mapping, * remove the invalid protection in the split mapping. */ if (!cpa->force_static_prot) goto set; /* Hand in lpsize = 0 to enforce the protection mechanism */ prot = static_protections(ref_prot, address, pfn, npg, 0, CPA_PROTECT); if (pgprot_val(prot) == pgprot_val(ref_prot)) goto set; /* * If this is splitting a PMD, fix it up. PUD splits cannot be * fixed trivially as that would require to rescan the newly * installed PMD mappings after returning from split_large_page() * so an eventual further split can allocate the necessary PTE * pages. Warn for now and revisit it in case this actually * happens. */ if (size == PAGE_SIZE) ref_prot = prot; else pr_warn_once("CPA: Cannot fixup static protections for PUD split\n"); set: set_pte(pte, pfn_pte(pfn, ref_prot)); } static int __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, struct ptdesc *ptdesc) { unsigned long lpaddr, lpinc, ref_pfn, pfn, pfninc = 1; struct page *base = ptdesc_page(ptdesc); pte_t *pbase = (pte_t *)page_address(base); unsigned int i, level; pgprot_t ref_prot; bool nx, rw; pte_t *tmp; spin_lock(&pgd_lock); /* * Check for races, another CPU might have split this page * up for us already: */ tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw); if (tmp != kpte) { spin_unlock(&pgd_lock); return 1; } paravirt_alloc_pte(&init_mm, page_to_pfn(base)); switch (level) { case PG_LEVEL_2M: ref_prot = pmd_pgprot(*(pmd_t *)kpte); /* * Clear PSE (aka _PAGE_PAT) and move * PAT bit to correct position. */ ref_prot = pgprot_large_2_4k(ref_prot); ref_pfn = pmd_pfn(*(pmd_t *)kpte); lpaddr = address & PMD_MASK; lpinc = PAGE_SIZE; break; case PG_LEVEL_1G: ref_prot = pud_pgprot(*(pud_t *)kpte); ref_pfn = pud_pfn(*(pud_t *)kpte); pfninc = PMD_SIZE >> PAGE_SHIFT; lpaddr = address & PUD_MASK; lpinc = PMD_SIZE; /* * Clear the PSE flags if the PRESENT flag is not set * otherwise pmd_present() will return true even on a non * present pmd. 
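 *
 * Editor's worked example (not part of the original file): the loop below
 * always writes PTRS_PER_PTE entries (512 with 64-bit/PAE paging).
 * Splitting a 2M mapping installs 512 4K PTEs with consecutive pfns
 * (pfninc = 1, lpinc = PAGE_SIZE); splitting a 1G mapping installs 512 2M
 * PMDs, so each entry advances by pfninc = PMD_SIZE >> PAGE_SHIFT = 512
 * pfns and lpinc = PMD_SIZE bytes of virtual address.
 *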
*/ if (!(pgprot_val(ref_prot) & _PAGE_PRESENT)) pgprot_val(ref_prot) &= ~_PAGE_PSE; break; default: spin_unlock(&pgd_lock); return 1; } ref_prot = pgprot_clear_protnone_bits(ref_prot); /* * Get the target pfn from the original entry: */ pfn = ref_pfn; for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc) split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc); if (virt_addr_valid(address)) { unsigned long pfn = PFN_DOWN(__pa(address)); if (pfn_range_is_mapped(pfn, pfn + 1)) split_page_count(level); } /* * Install the new, split up pagetable. * * We use the standard kernel pagetable protections for the new * pagetable protections, the actual ptes set above control the * primary protection behavior: */ __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); /* * Do a global flush tlb after splitting the large page * and before we do the actual change page attribute in the PTE. * * Without this, we violate the TLB application note, that says: * "The TLBs may contain both ordinary and large-page * translations for a 4-KByte range of linear addresses. This * may occur if software modifies the paging structures so that * the page size used for the address range changes. If the two * translations differ with respect to page frame or attributes * (e.g., permissions), processor behavior is undefined and may * be implementation-specific." * * We do this global tlb flush inside the cpa_lock, so that we * don't allow any other cpu, with stale tlb entries change the * page attribute in parallel, that also falls into the * just split large page entry. */ flush_tlb_all(); spin_unlock(&pgd_lock); return 0; } static int split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address) { struct ptdesc *ptdesc; if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); ptdesc = pagetable_alloc(GFP_KERNEL, 0); if (!debug_pagealloc_enabled()) spin_lock(&cpa_lock); if (!ptdesc) return -ENOMEM; if (__split_large_page(cpa, kpte, address, ptdesc)) pagetable_free(ptdesc); return 0; } static int collapse_pmd_page(pmd_t *pmd, unsigned long addr, struct list_head *pgtables) { pmd_t _pmd, old_pmd; pte_t *pte, first; unsigned long pfn; pgprot_t pgprot; int i = 0; if (!cpu_feature_enabled(X86_FEATURE_PSE)) return 0; addr &= PMD_MASK; pte = pte_offset_kernel(pmd, addr); first = *pte; pfn = pte_pfn(first); /* Make sure alignment is suitable */ if (PFN_PHYS(pfn) & ~PMD_MASK) return 0; /* The page is 4k intentionally */ if (pte_flags(first) & _PAGE_KERNEL_4K) return 0; /* Check that the rest of PTEs are compatible with the first one */ for (i = 1, pte++; i < PTRS_PER_PTE; i++, pte++) { pte_t entry = *pte; if (!pte_present(entry)) return 0; if (pte_flags(entry) != pte_flags(first)) return 0; if (pte_pfn(entry) != pte_pfn(first) + i) return 0; } old_pmd = *pmd; /* Success: set up a large page */ pgprot = pgprot_4k_2_large(pte_pgprot(first)); pgprot_val(pgprot) |= _PAGE_PSE; _pmd = pfn_pmd(pfn, pgprot); set_pmd(pmd, _pmd); /* Queue the page table to be freed after TLB flush */ list_add(&page_ptdesc(pmd_page(old_pmd))->pt_list, pgtables); if (IS_ENABLED(CONFIG_X86_32)) { struct page *page; /* Update all PGD tables to use the same large page */ list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd = (pgd_t *)page_address(page) + pgd_index(addr); p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); pmd_t *pmd = pmd_offset(pud, addr); /* Something is wrong if entries doesn't match */ if (WARN_ON(pmd_val(old_pmd) != pmd_val(*pmd))) continue; set_pmd(pmd, _pmd); } } if 
(virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1)) collapse_page_count(PG_LEVEL_2M); return 1; } static int collapse_pud_page(pud_t *pud, unsigned long addr, struct list_head *pgtables) { unsigned long pfn; pmd_t *pmd, first; int i; if (!direct_gbpages) return 0; addr &= PUD_MASK; pmd = pmd_offset(pud, addr); first = *pmd; /* * To restore PUD page all PMD entries must be large and * have suitable alignment */ pfn = pmd_pfn(first); if (!pmd_leaf(first) || (PFN_PHYS(pfn) & ~PUD_MASK)) return 0; /* * To restore PUD page, all following PMDs must be compatible with the * first one. */ for (i = 1, pmd++; i < PTRS_PER_PMD; i++, pmd++) { pmd_t entry = *pmd; if (!pmd_present(entry) || !pmd_leaf(entry)) return 0; if (pmd_flags(entry) != pmd_flags(first)) return 0; if (pmd_pfn(entry) != pmd_pfn(first) + i * PTRS_PER_PTE) return 0; } /* Restore PUD page and queue page table to be freed after TLB flush */ list_add(&page_ptdesc(pud_page(*pud))->pt_list, pgtables); set_pud(pud, pfn_pud(pfn, pmd_pgprot(first))); if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1)) collapse_page_count(PG_LEVEL_1G); return 1; } /* * Collapse PMD and PUD pages in the kernel mapping around the address where * possible. * * Caller must flush TLB and free page tables queued on the list before * touching the new entries. CPU must not see TLB entries of different size * with different attributes. */ static int collapse_large_pages(unsigned long addr, struct list_head *pgtables) { int collapsed = 0; pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; addr &= PMD_MASK; spin_lock(&pgd_lock); pgd = pgd_offset_k(addr); if (pgd_none(*pgd)) goto out; p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) goto out; pud = pud_offset(p4d, addr); if (!pud_present(*pud) || pud_leaf(*pud)) goto out; pmd = pmd_offset(pud, addr); if (!pmd_present(*pmd) || pmd_leaf(*pmd)) goto out; collapsed = collapse_pmd_page(pmd, addr, pgtables); if (collapsed) collapsed += collapse_pud_page(pud, addr, pgtables); out: spin_unlock(&pgd_lock); return collapsed; } static bool try_to_free_pte_page(pte_t *pte) { int i; for (i = 0; i < PTRS_PER_PTE; i++) if (!pte_none(pte[i])) return false; pte_free_kernel(&init_mm, pte); return true; } static bool try_to_free_pmd_page(pmd_t *pmd) { int i; for (i = 0; i < PTRS_PER_PMD; i++) if (!pmd_none(pmd[i])) return false; pmd_free(&init_mm, pmd); return true; } static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) { pte_t *pte = pte_offset_kernel(pmd, start); while (start < end) { set_pte(pte, __pte(0)); start += PAGE_SIZE; pte++; } if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { pmd_clear(pmd); return true; } return false; } static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, unsigned long start, unsigned long end) { if (unmap_pte_range(pmd, start, end)) if (try_to_free_pmd_page(pud_pgtable(*pud))) pud_clear(pud); } static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) { pmd_t *pmd = pmd_offset(pud, start); /* * Not on a 2MB page boundary? */ if (start & (PMD_SIZE - 1)) { unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; unsigned long pre_end = min_t(unsigned long, end, next_page); __unmap_pmd_range(pud, pmd, start, pre_end); start = pre_end; pmd++; } /* * Try to unmap in 2M chunks. */ while (end - start >= PMD_SIZE) { if (pmd_leaf(*pmd)) pmd_clear(pmd); else __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); start += PMD_SIZE; pmd++; } /* * 4K leftovers? 
*/ if (start < end) return __unmap_pmd_range(pud, pmd, start, end); /* * Try again to free the PMD page if haven't succeeded above. */ if (!pud_none(*pud)) if (try_to_free_pmd_page(pud_pgtable(*pud))) pud_clear(pud); } static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end) { pud_t *pud = pud_offset(p4d, start); /* * Not on a GB page boundary? */ if (start & (PUD_SIZE - 1)) { unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; unsigned long pre_end = min_t(unsigned long, end, next_page); unmap_pmd_range(pud, start, pre_end); start = pre_end; pud++; } /* * Try to unmap in 1G chunks? */ while (end - start >= PUD_SIZE) { if (pud_leaf(*pud)) pud_clear(pud); else unmap_pmd_range(pud, start, start + PUD_SIZE); start += PUD_SIZE; pud++; } /* * 2M leftovers? */ if (start < end) unmap_pmd_range(pud, start, end); /* * No need to try to free the PUD page because we'll free it in * populate_pgd's error path */ } static int alloc_pte_page(pmd_t *pmd) { pte_t *pte = pte_alloc_one_kernel(&init_mm); if (!pte) return -1; set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); return 0; } static int alloc_pmd_page(pud_t *pud) { /* * Pass 0 as a placeholder for the second argument, since the * generic implementation of pmd_alloc_one() does not use it. */ pmd_t *pmd = pmd_alloc_one(&init_mm, 0); if (!pmd) return -1; set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); return 0; } static void populate_pte(struct cpa_data *cpa, unsigned long start, unsigned long end, unsigned num_pages, pmd_t *pmd, pgprot_t pgprot) { pte_t *pte; pte = pte_offset_kernel(pmd, start); pgprot = pgprot_clear_protnone_bits(pgprot); while (num_pages-- && start < end) { set_pte(pte, pfn_pte(cpa->pfn, pgprot)); start += PAGE_SIZE; cpa->pfn++; pte++; } } static long populate_pmd(struct cpa_data *cpa, unsigned long start, unsigned long end, unsigned num_pages, pud_t *pud, pgprot_t pgprot) { long cur_pages = 0; pmd_t *pmd; pgprot_t pmd_pgprot; /* * Not on a 2M boundary? */ if (start & (PMD_SIZE - 1)) { unsigned long pre_end = start + (num_pages << PAGE_SHIFT); unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; pre_end = min_t(unsigned long, pre_end, next_page); cur_pages = (pre_end - start) >> PAGE_SHIFT; cur_pages = min_t(unsigned int, num_pages, cur_pages); /* * Need a PTE page? */ pmd = pmd_offset(pud, start); if (pmd_none(*pmd)) if (alloc_pte_page(pmd)) return -1; populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot); start = pre_end; } /* * We mapped them all? */ if (num_pages == cur_pages) return cur_pages; pmd_pgprot = pgprot_4k_2_large(pgprot); while (end - start >= PMD_SIZE) { /* * We cannot use a 1G page so allocate a PMD page if needed. */ if (pud_none(*pud)) if (alloc_pmd_page(pud)) return -1; pmd = pmd_offset(pud, start); set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn, canon_pgprot(pmd_pgprot)))); start += PMD_SIZE; cpa->pfn += PMD_SIZE >> PAGE_SHIFT; cur_pages += PMD_SIZE >> PAGE_SHIFT; } /* * Map trailing 4K pages. */ if (start < end) { pmd = pmd_offset(pud, start); if (pmd_none(*pmd)) if (alloc_pte_page(pmd)) return -1; populate_pte(cpa, start, end, num_pages - cur_pages, pmd, pgprot); } return num_pages; } static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d, pgprot_t pgprot) { pud_t *pud; unsigned long end; long cur_pages = 0; pgprot_t pud_pgprot; end = start + (cpa->numpages << PAGE_SHIFT); /* * Not on a Gb page boundary? => map everything up to it with * smaller pages. 
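 *
 * Editor's worked example (not part of the original file) for the same
 * head/body/tail pattern used by populate_pmd() above, with hypothetical
 * numbers: populating 513 pages starting 4K below a 2M boundary maps one
 * 4K head PTE up to the boundary and then a single 2M PMD for the
 * remaining 512 pages (assuming the physical address is co-aligned), with
 * no 4K tail.  populate_pud() below applies the identical pattern at 1G
 * granularity when X86_FEATURE_GBPAGES is available.
 *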
*/ if (start & (PUD_SIZE - 1)) { unsigned long pre_end; unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; pre_end = min_t(unsigned long, end, next_page); cur_pages = (pre_end - start) >> PAGE_SHIFT; cur_pages = min_t(int, (int)cpa->numpages, cur_pages); pud = pud_offset(p4d, start); /* * Need a PMD page? */ if (pud_none(*pud)) if (alloc_pmd_page(pud)) return -1; cur_pages = populate_pmd(cpa, start, pre_end, cur_pages, pud, pgprot); if (cur_pages < 0) return cur_pages; start = pre_end; } /* We mapped them all? */ if (cpa->numpages == cur_pages) return cur_pages; pud = pud_offset(p4d, start); pud_pgprot = pgprot_4k_2_large(pgprot); /* * Map everything starting from the Gb boundary, possibly with 1G pages */ while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) { set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn, canon_pgprot(pud_pgprot)))); start += PUD_SIZE; cpa->pfn += PUD_SIZE >> PAGE_SHIFT; cur_pages += PUD_SIZE >> PAGE_SHIFT; pud++; } /* Map trailing leftover */ if (start < end) { long tmp; pud = pud_offset(p4d, start); if (pud_none(*pud)) if (alloc_pmd_page(pud)) return -1; tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages, pud, pgprot); if (tmp < 0) return cur_pages; cur_pages += tmp; } return cur_pages; } /* * Restrictions for kernel page table do not necessarily apply when mapping in * an alternate PGD. */ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) { pgprot_t pgprot = __pgprot(_KERNPG_TABLE); pud_t *pud = NULL; /* shut up gcc */ p4d_t *p4d; pgd_t *pgd_entry; long ret; pgd_entry = cpa->pgd + pgd_index(addr); if (pgd_none(*pgd_entry)) { /* * Pass 0 as a placeholder for the second argument, since the * generic implementation of p4d_alloc_one() does not use it. */ p4d = p4d_alloc_one(&init_mm, 0); if (!p4d) return -1; set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE)); } /* * Allocate a PUD page and hand it down for mapping. */ p4d = p4d_offset(pgd_entry, addr); if (p4d_none(*p4d)) { /* * Pass 0 as a placeholder for the second argument, since the * generic implementation of pud_alloc_one() does not use it. */ pud = pud_alloc_one(&init_mm, 0); if (!pud) return -1; set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); } pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(pgprot) |= pgprot_val(cpa->mask_set); ret = populate_pud(cpa, addr, p4d, pgprot); if (ret < 0) { /* * Leave the PUD page in place in case some other CPU or thread * already found it, but remove any useless entries we just * added to it. */ unmap_pud_range(p4d, addr, addr + (cpa->numpages << PAGE_SHIFT)); return ret; } cpa->numpages = ret; return 0; } static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, int primary) { if (cpa->pgd) { /* * Right now, we only execute this code path when mapping * the EFI virtual memory map regions, no other users * provide a ->pgd value. This may change in the future. */ return populate_pgd(cpa, vaddr); } /* * Ignore all non primary paths. */ if (!primary) { cpa->numpages = 1; return 0; } /* * Ignore the NULL PTE for kernel identity mapping, as it is expected * to have holes. * Also set numpages to '1' indicating that we processed cpa req for * one virtual address page and its pfn. TBD: numpages can be set based * on the initial value and the level returned by lookup_address(). 
*/ if (within(vaddr, PAGE_OFFSET, PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { cpa->numpages = 1; cpa->pfn = __pa(vaddr) >> PAGE_SHIFT; return 0; } else if (__cpa_pfn_in_highmap(cpa->pfn)) { /* Faults in the highmap are OK, so do not warn: */ return -EFAULT; } else { WARN(1, KERN_WARNING "CPA: called for zero pte. " "vaddr = %lx cpa->vaddr = %lx\n", vaddr, *cpa->vaddr); return -EFAULT; } } static int __change_page_attr(struct cpa_data *cpa, int primary) { unsigned long address; int do_split, err; unsigned int level; pte_t *kpte, old_pte; bool nx, rw; address = __cpa_addr(cpa, cpa->curpage); repeat: kpte = _lookup_address_cpa(cpa, address, &level, &nx, &rw); if (!kpte) return __cpa_process_fault(cpa, address, primary); old_pte = *kpte; if (pte_none(old_pte)) return __cpa_process_fault(cpa, address, primary); if (level == PG_LEVEL_4K) { pte_t new_pte; pgprot_t old_prot = pte_pgprot(old_pte); pgprot_t new_prot = pte_pgprot(old_pte); unsigned long pfn = pte_pfn(old_pte); pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); cpa_inc_4k_install(); /* Hand in lpsize = 0 to enforce the protection mechanism */ new_prot = static_protections(new_prot, address, pfn, 1, 0, CPA_PROTECT); new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1, nx, rw); new_prot = pgprot_clear_protnone_bits(new_prot); /* * We need to keep the pfn from the existing PTE, * after all we're only going to change its attributes * not the memory it points to */ new_pte = pfn_pte(pfn, new_prot); cpa->pfn = pfn; /* * Do we really change anything ? */ if (pte_val(old_pte) != pte_val(new_pte)) { set_pte_atomic(kpte, new_pte); cpa->flags |= CPA_FLUSHTLB; } cpa->numpages = 1; return 0; } /* * Check, whether we can keep the large page intact * and just change the pte: */ do_split = should_split_large_page(kpte, address, cpa); /* * When the range fits into the existing large page, * return. cp->numpages and cpa->tlbflush have been updated in * try_large_page: */ if (do_split <= 0) return do_split; /* * We have to split the large page: */ err = split_large_page(cpa, kpte, address); if (!err) goto repeat; return err; } static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary); /* * Check the directmap and "high kernel map" 'aliases'. */ static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); unsigned long vaddr; int ret; if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1)) return 0; /* * No need to redo, when the primary call touched the direct * mapping already: */ vaddr = __cpa_addr(cpa, cpa->curpage); if (!(within(vaddr, PAGE_OFFSET, PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { alias_cpa = *cpa; alias_cpa.vaddr = &laddr; alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); alias_cpa.curpage = 0; /* Directmap always has NX set, do not modify. 
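 *
 * Editor's note (not part of the original file): the second alias handled
 * below (CONFIG_X86_64 only) is the "high kernel map"; for a pfn inside
 * the kernel image the alias address is reconstructed as
 *
 *	(pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base
 *
 * i.e. the inverse of __pa_symbol() for that mapping.
 *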
*/ if (__supported_pte_mask & _PAGE_NX) { alias_cpa.mask_clr.pgprot &= ~_PAGE_NX; alias_cpa.mask_set.pgprot &= ~_PAGE_NX; } cpa->force_flush_all = 1; ret = __change_page_attr_set_clr(&alias_cpa, 0); if (ret) return ret; } #ifdef CONFIG_X86_64 /* * If the primary call didn't touch the high mapping already * and the physical address is inside the kernel map, we need * to touch the high mapped kernel as well: */ if (!within(vaddr, (unsigned long)_text, _brk_end) && __cpa_pfn_in_highmap(cpa->pfn)) { unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; alias_cpa = *cpa; alias_cpa.vaddr = &temp_cpa_vaddr; alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); alias_cpa.curpage = 0; /* * [_text, _brk_end) also covers data, do not modify NX except * in cases where the highmap is the primary target. */ if (__supported_pte_mask & _PAGE_NX) { alias_cpa.mask_clr.pgprot &= ~_PAGE_NX; alias_cpa.mask_set.pgprot &= ~_PAGE_NX; } cpa->force_flush_all = 1; /* * The high mapping range is imprecise, so ignore the * return value. */ __change_page_attr_set_clr(&alias_cpa, 0); } #endif return 0; } static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary) { unsigned long numpages = cpa->numpages; unsigned long rempages = numpages; int ret = 0; /* * No changes, easy! */ if (!(pgprot_val(cpa->mask_set) | pgprot_val(cpa->mask_clr)) && !cpa->force_split) return ret; while (rempages) { /* * Store the remaining nr of pages for the large page * preservation check. */ cpa->numpages = rempages; /* for array changes, we can't use large page */ if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY)) cpa->numpages = 1; if (!debug_pagealloc_enabled()) spin_lock(&cpa_lock); ret = __change_page_attr(cpa, primary); if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); if (ret) goto out; if (primary && !(cpa->flags & CPA_NO_CHECK_ALIAS)) { ret = cpa_process_alias(cpa); if (ret) goto out; } /* * Adjust the number of pages with the result of the * CPA operation. Either a large page has been * preserved or a single page update happened. */ BUG_ON(cpa->numpages > rempages || !cpa->numpages); rempages -= cpa->numpages; cpa->curpage += cpa->numpages; } out: /* Restore the original numpages */ cpa->numpages = numpages; return ret; } static int change_page_attr_set_clr(unsigned long *addr, int numpages, pgprot_t mask_set, pgprot_t mask_clr, int force_split, int in_flag, struct page **pages) { struct cpa_data cpa; int ret, cache; memset(&cpa, 0, sizeof(cpa)); /* * Check, if we are requested to set a not supported * feature. Clearing non-supported features is OK. */ mask_set = canon_pgprot(mask_set); if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split) return 0; /* Ensure we are PAGE_SIZE aligned */ if (in_flag & CPA_ARRAY) { int i; for (i = 0; i < numpages; i++) { if (addr[i] & ~PAGE_MASK) { addr[i] &= PAGE_MASK; WARN_ON_ONCE(1); } } } else if (!(in_flag & CPA_PAGES_ARRAY)) { /* * in_flag of CPA_PAGES_ARRAY implies it is aligned. 
* No need to check in that case */ if (*addr & ~PAGE_MASK) { *addr &= PAGE_MASK; /* * People should not be passing in unaligned addresses: */ WARN_ON_ONCE(1); } } /* Must avoid aliasing mappings in the highmem code */ kmap_flush_unused(); vm_unmap_aliases(); cpa.vaddr = addr; cpa.pages = pages; cpa.numpages = numpages; cpa.mask_set = mask_set; cpa.mask_clr = mask_clr; cpa.flags = in_flag; cpa.curpage = 0; cpa.force_split = force_split; ret = __change_page_attr_set_clr(&cpa, 1); /* * Check whether we really changed something: */ if (!(cpa.flags & CPA_FLUSHTLB)) goto out; /* * No need to flush, when we did not set any of the caching * attributes: */ cache = !!pgprot2cachemode(mask_set); /* * On error; flush everything to be sure. */ if (ret) { cpa_flush_all(cache); goto out; } cpa_flush(&cpa, cache); out: return ret; } static inline int change_page_attr_set(unsigned long *addr, int numpages, pgprot_t mask, int array) { return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0, (array ? CPA_ARRAY : 0), NULL); } static inline int change_page_attr_clear(unsigned long *addr, int numpages, pgprot_t mask, int array) { return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0, (array ? CPA_ARRAY : 0), NULL); } static inline int cpa_set_pages_array(struct page **pages, int numpages, pgprot_t mask) { return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0, CPA_PAGES_ARRAY, pages); } static inline int cpa_clear_pages_array(struct page **pages, int numpages, pgprot_t mask) { return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0, CPA_PAGES_ARRAY, pages); } int _set_memory_uc(unsigned long addr, int numpages) { /* * for now UC MINUS. see comments in ioremap() * If you really need strong UC use ioremap_uc(), but note * that you cannot override IO areas with set_memory_*() as * these helpers cannot work with IO memory. */ return change_page_attr_set(&addr, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), 0); } int set_memory_uc(unsigned long addr, int numpages) { int ret; /* * for now UC MINUS. 
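 *
 * Editor's illustrative usage sketch (not part of the original file): a
 * driver wanting an uncached view of ordinary RAM pairs set_memory_uc()
 * with set_memory_wb() on teardown; both take a kernel virtual address
 * and a page count, and set_memory_uc() additionally reserves the range
 * in the memtype tracker as seen below.  "order" is hypothetical:
 *
 *	int order = 1;	// two pages, purely for illustration
 *	unsigned long addr = __get_free_pages(GFP_KERNEL, order);
 *
 *	if (addr && !set_memory_uc(addr, 1 << order)) {
 *		// ... use the buffer uncached ...
 *		set_memory_wb(addr, 1 << order);
 *	}
 *	free_pages(addr, order);
 *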
see comments in ioremap() */ ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, _PAGE_CACHE_MODE_UC_MINUS, NULL); if (ret) goto out_err; ret = _set_memory_uc(addr, numpages); if (ret) goto out_free; return 0; out_free: memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); out_err: return ret; } EXPORT_SYMBOL(set_memory_uc); int _set_memory_wc(unsigned long addr, int numpages) { int ret; ret = change_page_attr_set(&addr, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), 0); if (!ret) { ret = change_page_attr_set_clr(&addr, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_WC), __pgprot(_PAGE_CACHE_MASK), 0, 0, NULL); } return ret; } int set_memory_wc(unsigned long addr, int numpages) { int ret; ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, _PAGE_CACHE_MODE_WC, NULL); if (ret) return ret; ret = _set_memory_wc(addr, numpages); if (ret) memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); return ret; } EXPORT_SYMBOL(set_memory_wc); int _set_memory_wt(unsigned long addr, int numpages) { return change_page_attr_set(&addr, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0); } int _set_memory_wb(unsigned long addr, int numpages) { /* WB cache mode is hard wired to all cache attribute bits being 0 */ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_CACHE_MASK), 0); } int set_memory_wb(unsigned long addr, int numpages) { int ret; ret = _set_memory_wb(addr, numpages); if (ret) return ret; memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); return 0; } EXPORT_SYMBOL(set_memory_wb); /* Prevent speculative access to a page by marking it not-present */ #ifdef CONFIG_X86_64 int set_mce_nospec(unsigned long pfn) { unsigned long decoy_addr; int rc; /* SGX pages are not in the 1:1 map */ if (arch_is_platform_page(pfn << PAGE_SHIFT)) return 0; /* * We would like to just call: * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1); * but doing that would radically increase the odds of a * speculative access to the poison page because we'd have * the virtual address of the kernel 1:1 mapping sitting * around in registers. * Instead we get tricky. We create a non-canonical address * that looks just like the one we want, but has bit 63 flipped. * This relies on set_memory_XX() properly sanitizing any __pa() * results with __PHYSICAL_MASK or PTE_PFN_MASK. */ decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); rc = set_memory_np(decoy_addr, 1); if (rc) pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); return rc; } EXPORT_SYMBOL_GPL(set_mce_nospec); /* Restore full speculative operation to the pfn. 
*/ int clear_mce_nospec(unsigned long pfn) { unsigned long addr = (unsigned long) pfn_to_kaddr(pfn); return set_memory_p(addr, 1); } EXPORT_SYMBOL_GPL(clear_mce_nospec); #endif /* CONFIG_X86_64 */ int set_memory_x(unsigned long addr, int numpages) { if (!(__supported_pte_mask & _PAGE_NX)) return 0; return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); } int set_memory_nx(unsigned long addr, int numpages) { if (!(__supported_pte_mask & _PAGE_NX)) return 0; return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); } int set_memory_ro(unsigned long addr, int numpages) { return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW | _PAGE_DIRTY), 0); } int set_memory_rox(unsigned long addr, int numpages) { pgprot_t clr = __pgprot(_PAGE_RW | _PAGE_DIRTY); if (__supported_pte_mask & _PAGE_NX) clr.pgprot |= _PAGE_NX; return change_page_attr_set_clr(&addr, numpages, __pgprot(0), clr, 0, CPA_COLLAPSE, NULL); } int set_memory_rw(unsigned long addr, int numpages) { return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); } int set_memory_np(unsigned long addr, int numpages) { return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); } int set_memory_np_noalias(unsigned long addr, int numpages) { return change_page_attr_set_clr(&addr, numpages, __pgprot(0), __pgprot(_PAGE_PRESENT), 0, CPA_NO_CHECK_ALIAS, NULL); } int set_memory_p(unsigned long addr, int numpages) { return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); } int set_memory_4k(unsigned long addr, int numpages) { return change_page_attr_set_clr(&addr, numpages, __pgprot(_PAGE_KERNEL_4K), __pgprot(0), 1, 0, NULL); } int set_memory_nonglobal(unsigned long addr, int numpages) { return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_GLOBAL), 0); } int set_memory_global(unsigned long addr, int numpages) { return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_GLOBAL), 0); } /* * __set_memory_enc_pgtable() is used for the hypervisors that get * informed about "encryption" status via page tables. */ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc) { pgprot_t empty = __pgprot(0); struct cpa_data cpa; int ret; /* Should not be working on unaligned addresses */ if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr)) addr &= PAGE_MASK; memset(&cpa, 0, sizeof(cpa)); cpa.vaddr = &addr; cpa.numpages = numpages; cpa.mask_set = enc ? pgprot_encrypted(empty) : pgprot_decrypted(empty); cpa.mask_clr = enc ? pgprot_decrypted(empty) : pgprot_encrypted(empty); cpa.pgd = init_mm.pgd; /* Must avoid aliasing mappings in the highmem code */ kmap_flush_unused(); vm_unmap_aliases(); /* Flush the caches as needed before changing the encryption attribute. */ if (x86_platform.guest.enc_tlb_flush_required(enc)) cpa_flush(&cpa, x86_platform.guest.enc_cache_flush_required()); /* Notify hypervisor that we are about to set/clr encryption attribute. */ ret = x86_platform.guest.enc_status_change_prepare(addr, numpages, enc); if (ret) goto vmm_fail; ret = __change_page_attr_set_clr(&cpa, 1); /* * After changing the encryption attribute, we need to flush TLBs again * in case any speculative TLB caching occurred (but no need to flush * caches again). We could just use cpa_flush_all(), but in case TLB * flushing gets optimized in the cpa_flush() path use the same logic * as above. */ cpa_flush(&cpa, 0); if (ret) return ret; /* Notify hypervisor that we have successfully set/clr encryption attribute. 
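 *
 * Editor's illustrative usage sketch (not part of the original file): a
 * confidential-guest driver that must share a buffer with the host
 * converts it with set_memory_decrypted() (defined below) and converts it
 * back with set_memory_encrypted() before freeing.  On conversion failure
 * the page is typically leaked rather than handed back to the allocator
 * in an unknown state.  The helper name is made up:
 *
 *	static void *example_alloc_shared_page(void)
 *	{
 *		unsigned long addr = __get_free_page(GFP_KERNEL);
 *
 *		if (!addr)
 *			return NULL;
 *		if (set_memory_decrypted(addr, 1))
 *			return NULL;	// leak: page state is uncertain
 *		return (void *)addr;
 *	}
 *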
*/ ret = x86_platform.guest.enc_status_change_finish(addr, numpages, enc); if (ret) goto vmm_fail; return 0; vmm_fail: WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s: %d\n", (void *)addr, numpages, enc ? "private" : "shared", ret); return ret; } /* * The lock serializes conversions between private and shared memory. * * It is taken for read on conversion. A write lock guarantees that no * concurrent conversions are in progress. */ static DECLARE_RWSEM(mem_enc_lock); /* * Stop new private<->shared conversions. * * Taking the exclusive mem_enc_lock waits for in-flight conversions to complete. * The lock is not released to prevent new conversions from being started. */ bool set_memory_enc_stop_conversion(void) { /* * In a crash scenario, sleep is not allowed. Try to take the lock. * Failure indicates that there is a race with the conversion. */ if (oops_in_progress) return down_write_trylock(&mem_enc_lock); down_write(&mem_enc_lock); return true; } static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) { int ret = 0; if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) { if (!down_read_trylock(&mem_enc_lock)) return -EBUSY; ret = __set_memory_enc_pgtable(addr, numpages, enc); up_read(&mem_enc_lock); } return ret; } int set_memory_encrypted(unsigned long addr, int numpages) { return __set_memory_enc_dec(addr, numpages, true); } EXPORT_SYMBOL_GPL(set_memory_encrypted); int set_memory_decrypted(unsigned long addr, int numpages) { return __set_memory_enc_dec(addr, numpages, false); } EXPORT_SYMBOL_GPL(set_memory_decrypted); int set_pages_uc(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); return set_memory_uc(addr, numpages); } EXPORT_SYMBOL(set_pages_uc); static int _set_pages_array(struct page **pages, int numpages, enum page_cache_mode new_type) { unsigned long start; unsigned long end; enum page_cache_mode set_type; int i; int free_idx; int ret; for (i = 0; i < numpages; i++) { if (PageHighMem(pages[i])) continue; start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; if (memtype_reserve(start, end, new_type, NULL)) goto err_out; } /* If WC, set to UC- first and then WC */ set_type = (new_type == _PAGE_CACHE_MODE_WC) ? 
_PAGE_CACHE_MODE_UC_MINUS : new_type; ret = cpa_set_pages_array(pages, numpages, cachemode2pgprot(set_type)); if (!ret && new_type == _PAGE_CACHE_MODE_WC) ret = change_page_attr_set_clr(NULL, numpages, cachemode2pgprot( _PAGE_CACHE_MODE_WC), __pgprot(_PAGE_CACHE_MASK), 0, CPA_PAGES_ARRAY, pages); if (ret) goto err_out; return 0; /* Success */ err_out: free_idx = i; for (i = 0; i < free_idx; i++) { if (PageHighMem(pages[i])) continue; start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; memtype_free(start, end); } return -EINVAL; } int set_pages_array_uc(struct page **pages, int numpages) { return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_UC_MINUS); } EXPORT_SYMBOL(set_pages_array_uc); int set_pages_array_wc(struct page **pages, int numpages) { return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_WC); } EXPORT_SYMBOL(set_pages_array_wc); int set_pages_wb(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); return set_memory_wb(addr, numpages); } EXPORT_SYMBOL(set_pages_wb); int set_pages_array_wb(struct page **pages, int numpages) { int retval; unsigned long start; unsigned long end; int i; /* WB cache mode is hard wired to all cache attribute bits being 0 */ retval = cpa_clear_pages_array(pages, numpages, __pgprot(_PAGE_CACHE_MASK)); if (retval) return retval; for (i = 0; i < numpages; i++) { if (PageHighMem(pages[i])) continue; start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; memtype_free(start, end); } return 0; } EXPORT_SYMBOL(set_pages_array_wb); int set_pages_ro(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); return set_memory_ro(addr, numpages); } int set_pages_rw(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); return set_memory_rw(addr, numpages); } static int __set_pages_p(struct page *page, int numpages) { unsigned long tempaddr = (unsigned long) page_address(page); struct cpa_data cpa = { .vaddr = &tempaddr, .pgd = NULL, .numpages = numpages, .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), .mask_clr = __pgprot(0), .flags = CPA_NO_CHECK_ALIAS }; /* * No alias checking needed for setting present flag. otherwise, * we may need to break large pages for 64-bit kernel text * mappings (this adds to complexity if we want to do this from * atomic context especially). Let's keep it simple! */ return __change_page_attr_set_clr(&cpa, 1); } static int __set_pages_np(struct page *page, int numpages) { unsigned long tempaddr = (unsigned long) page_address(page); struct cpa_data cpa = { .vaddr = &tempaddr, .pgd = NULL, .numpages = numpages, .mask_set = __pgprot(0), .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY), .flags = CPA_NO_CHECK_ALIAS }; /* * No alias checking needed for setting not present flag. otherwise, * we may need to break large pages for 64-bit kernel text * mappings (this adds to complexity if we want to do this from * atomic context especially). Let's keep it simple! 
*/ return __change_page_attr_set_clr(&cpa, 1); } int set_direct_map_invalid_noflush(struct page *page) { return __set_pages_np(page, 1); } int set_direct_map_default_noflush(struct page *page) { return __set_pages_p(page, 1); } int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) { if (valid) return __set_pages_p(page, nr); return __set_pages_np(page, nr); } #ifdef CONFIG_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { if (PageHighMem(page)) return; if (!enable) { debug_check_no_locks_freed(page_address(page), numpages * PAGE_SIZE); } /* * The return value is ignored as the calls cannot fail. * Large pages for identity mappings are not used at boot time * and hence no memory allocations during large page split. */ if (enable) __set_pages_p(page, numpages); else __set_pages_np(page, numpages); /* * We should perform an IPI and flush all tlbs, * but that can deadlock->flush only current cpu. * Preemption needs to be disabled around __flush_tlb_all() due to * CR3 reload in __native_flush_tlb(). */ preempt_disable(); __flush_tlb_all(); preempt_enable(); arch_flush_lazy_mmu_mode(); } #endif /* CONFIG_DEBUG_PAGEALLOC */ bool kernel_page_present(struct page *page) { unsigned int level; pte_t *pte; if (PageHighMem(page)) return false; pte = lookup_address((unsigned long)page_address(page), &level); return (pte_val(*pte) & _PAGE_PRESENT); } int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, unsigned numpages, unsigned long page_flags) { int retval = -EINVAL; struct cpa_data cpa = { .vaddr = &address, .pfn = pfn, .pgd = pgd, .numpages = numpages, .mask_set = __pgprot(0), .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW|_PAGE_DIRTY)), .flags = CPA_NO_CHECK_ALIAS, }; WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP"); if (!(__supported_pte_mask & _PAGE_NX)) goto out; if (!(page_flags & _PAGE_ENC)) cpa.mask_clr = pgprot_encrypted(cpa.mask_clr); cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); retval = __change_page_attr_set_clr(&cpa, 1); __flush_tlb_all(); out: return retval; } /* * __flush_tlb_all() flushes mappings only on current CPU and hence this * function shouldn't be used in an SMP environment. Presently, it's used only * during boot (way before smp_init()) by EFI subsystem and hence is ok. */ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address, unsigned long numpages) { int retval; /* * The typical sequence for unmapping is to find a pte through * lookup_address_in_pgd() (ideally, it should never return NULL because * the address is already mapped) and change its protections. As pfn is * the *target* of a mapping, it's not useful while unmapping. */ struct cpa_data cpa = { .vaddr = &address, .pfn = 0, .pgd = pgd, .numpages = numpages, .mask_set = __pgprot(0), .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY), .flags = CPA_NO_CHECK_ALIAS, }; WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP"); retval = __change_page_attr_set_clr(&cpa, 1); __flush_tlb_all(); return retval; } /* * The testcases use internal knowledge of the implementation that shouldn't * be exposed to the rest of the kernel. Include these directly here. */ #ifdef CONFIG_CPA_DEBUG #include "cpa-test.c" #endif |
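/*
 * A minimal usage sketch (not part of the file above): how a driver might
 * pair set_memory_wc() with set_memory_wb() on a page taken from the kernel
 * direct map. The wc_example_ names and the single-page buffer are
 * hypothetical; only the ordering matters - reserve/convert on setup,
 * restore to WB before the page is freed.
 */
#include <linux/gfp.h>
#include <linux/set_memory.h>

static unsigned long wc_example_buf;

static int wc_example_setup(void)
{
	int ret;

	wc_example_buf = __get_free_page(GFP_KERNEL);
	if (!wc_example_buf)
		return -ENOMEM;

	/* Reserves the memtype, then transitions WB -> UC- -> WC. */
	ret = set_memory_wc(wc_example_buf, 1);
	if (ret) {
		free_page(wc_example_buf);
		return ret;
	}
	return 0;
}

static void wc_example_teardown(void)
{
	/* Drop back to WB (which also releases the memtype), then free. */
	set_memory_wb(wc_example_buf, 1);
	free_page(wc_example_buf);
}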
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BLK_CGROUP_PRIVATE_H #define _BLK_CGROUP_PRIVATE_H /* * block cgroup private header * * Based on ideas and code from CFQ, CFS and BFQ: * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> * * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> * Paolo Valente <paolo.valente@unimore.it> * * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> * Nauman Rafique <nauman@google.com> */ #include <linux/blk-cgroup.h> #include <linux/cgroup.h> #include <linux/kthread.h> #include <linux/blk-mq.h> #include <linux/llist.h> #include "blk.h" struct blkcg_gq; struct blkg_policy_data; /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ #define BLKG_STAT_CPU_BATCH (INT_MAX / 2) #ifdef CONFIG_BLK_CGROUP enum blkg_iostat_type { BLKG_IOSTAT_READ, BLKG_IOSTAT_WRITE, BLKG_IOSTAT_DISCARD, BLKG_IOSTAT_NR, }; struct blkg_iostat { u64 bytes[BLKG_IOSTAT_NR]; u64 ios[BLKG_IOSTAT_NR]; }; struct blkg_iostat_set { struct u64_stats_sync sync; struct blkcg_gq *blkg; struct llist_node lnode; int lqueued; /* queued in llist */ struct blkg_iostat cur; struct blkg_iostat last; }; /* association between a blk cgroup and a request queue */ struct blkcg_gq { /* Pointer to the associated request_queue */ struct request_queue *q; struct list_head q_node; struct hlist_node blkcg_node; struct blkcg *blkcg; /* all non-root blkcg_gq's are guaranteed to have access to parent */ struct blkcg_gq *parent; /* reference count */ struct percpu_ref refcnt; /* is this blkg online?
protected by both blkcg and q locks */ bool online; struct blkg_iostat_set __percpu *iostat_cpu; struct blkg_iostat_set iostat; struct blkg_policy_data *pd[BLKCG_MAX_POLS]; #ifdef CONFIG_BLK_CGROUP_PUNT_BIO spinlock_t async_bio_lock; struct bio_list async_bios; #endif union { struct work_struct async_bio_work; struct work_struct free_work; }; atomic_t use_delay; atomic64_t delay_nsec; atomic64_t delay_start; u64 last_delay; int last_use; struct rcu_head rcu_head; }; struct blkcg { struct cgroup_subsys_state css; spinlock_t lock; refcount_t online_pin; /* If there is block congestion on this cgroup. */ atomic_t congestion_count; struct radix_tree_root blkg_tree; struct blkcg_gq __rcu *blkg_hint; struct hlist_head blkg_list; struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; struct list_head all_blkcgs_node; /* * List of updated percpu blkg_iostat_set's since the last flush. */ struct llist_head __percpu *lhead; #ifdef CONFIG_BLK_CGROUP_FC_APPID char fc_app_id[FC_APPID_LEN]; #endif #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; #endif }; static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) { return css ? container_of(css, struct blkcg, css) : NULL; } /* * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a * request_queue (q). This is used by blkcg policies which need to track * information per blkcg - q pair. * * There can be multiple active blkcg policies and each blkg:policy pair is * represented by a blkg_policy_data which is allocated and freed by each * policy's pd_alloc/free_fn() methods. A policy can allocate private data * area by allocating larger data structure which embeds blkg_policy_data * at the beginning. */ struct blkg_policy_data { /* the blkg and policy id this per-policy data belongs to */ struct blkcg_gq *blkg; int plid; bool online; }; /* * Policies that need to keep per-blkcg data which is independent from any * request_queue associated to it should implement cpd_alloc/free_fn() * methods. A policy can allocate private data area by allocating larger * data structure which embeds blkcg_policy_data at the beginning. * cpd_init() is invoked to let each policy handle per-blkcg data. 
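 *
 * An illustrative (hypothetical) embedding, matching the description above;
 * the policy gets back to its own structure with container_of():
 *
 *	struct example_blkcg_data {
 *		struct blkcg_policy_data cpd;	(placed at the beginning)
 *		u64 per_blkcg_setting;
 *	};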
*/ struct blkcg_policy_data { /* the blkcg and policy id this per-policy data belongs to */ struct blkcg *blkcg; int plid; }; typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd); typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp); typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, struct seq_file *s); struct blkcg_policy { int plid; /* cgroup files for the policy */ struct cftype *dfl_cftypes; struct cftype *legacy_cftypes; /* operations */ blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; blkcg_pol_free_cpd_fn *cpd_free_fn; blkcg_pol_alloc_pd_fn *pd_alloc_fn; blkcg_pol_init_pd_fn *pd_init_fn; blkcg_pol_online_pd_fn *pd_online_fn; blkcg_pol_offline_pd_fn *pd_offline_fn; blkcg_pol_free_pd_fn *pd_free_fn; blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; blkcg_pol_stat_pd_fn *pd_stat_fn; }; extern struct blkcg blkcg_root; extern bool blkcg_debug_stats; void blkg_init_queue(struct request_queue *q); int blkcg_init_disk(struct gendisk *disk); void blkcg_exit_disk(struct gendisk *disk); /* Blkio controller policy registration */ int blkcg_policy_register(struct blkcg_policy *pol); void blkcg_policy_unregister(struct blkcg_policy *pol); int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol); void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol); const char *blkg_dev_name(struct blkcg_gq *blkg); void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int), const struct blkcg_policy *pol, int data, bool show_total); u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); struct blkg_conf_ctx { char *input; char *body; struct block_device *bdev; struct blkcg_gq *blkg; }; void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input); int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx); unsigned long blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx); int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkg_conf_ctx *ctx); void blkg_conf_exit(struct blkg_conf_ctx *ctx); void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags); /** * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg * @bio: the target &bio * * Return: true if this bio needs to be submitted with the root blkg context. * * In order to avoid priority inversions we sometimes need to issue a bio as if * it were attached to the root blkg, and then backcharge to the actual owning * blkg. The idea is we do bio_blkcg_css() to look up the actual context for * the bio and attach the appropriate blkg to the bio. Then we call this helper * and if it is true run with the root blkg for that queue and then do any * backcharging to the originating cgroup once the io is complete. 
*/ static inline bool bio_issue_as_root_blkg(struct bio *bio) { return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0; } /** * blkg_lookup - lookup blkg for the specified blkcg - q pair * @blkcg: blkcg of interest * @q: request_queue of interest * * Lookup blkg for the @blkcg - @q pair. * * Must be called in a RCU critical section. */ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) { struct blkcg_gq *blkg; if (blkcg == &blkcg_root) return q->root_blkg; blkg = rcu_dereference_check(blkcg->blkg_hint, lockdep_is_held(&q->queue_lock)); if (blkg && blkg->q == q) return blkg; blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); if (blkg && blkg->q != q) blkg = NULL; return blkg; } /** * blkg_to_pd - get policy private data * @blkg: blkg of interest * @pol: policy of interest * * Return pointer to private data associated with the @blkg-@pol pair. */ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return blkg ? blkg->pd[pol->plid] : NULL; } static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, struct blkcg_policy *pol) { return blkcg ? blkcg->cpd[pol->plid] : NULL; } /** * pd_to_blkg - get blkg associated with policy private data * @pd: policy private data of interest * * @pd is policy private data. Determine the blkg it's associated with. */ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return pd ? pd->blkg : NULL; } static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) { return cpd ? cpd->blkcg : NULL; } /** * blkg_get - get a blkg reference * @blkg: blkg to get * * The caller should be holding an existing reference. */ static inline void blkg_get(struct blkcg_gq *blkg) { percpu_ref_get(&blkg->refcnt); } /** * blkg_tryget - try and get a blkg reference * @blkg: blkg to get * * This is for use when doing an RCU lookup of the blkg. We may be in the midst * of freeing this blkg, so we can only use it if the refcnt is not zero. */ static inline bool blkg_tryget(struct blkcg_gq *blkg) { return blkg && percpu_ref_tryget(&blkg->refcnt); } /** * blkg_put - put a blkg reference * @blkg: blkg to put */ static inline void blkg_put(struct blkcg_gq *blkg) { percpu_ref_put(&blkg->refcnt); } /** * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU * read locked. If called under either blkcg or queue lock, the iteration * is guaranteed to include all and only online blkgs. The caller may * update @pos_css by calling css_rightmost_descendant() to skip subtree. * @p_blkg is included in the iteration and the first node to be visited. */ #define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q))) /** * blkg_for_each_descendant_post - post-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Similar to blkg_for_each_descendant_pre() but performs post-order * traversal instead. Synchronization rules are the same. @p_blkg is * included in the iteration and the last node to be visited. 
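 *
 * Illustrative use (hypothetical caller), walking descendants under RCU:
 *
 *	struct cgroup_subsys_state *pos_css;
 *	struct blkcg_gq *blkg;
 *
 *	rcu_read_lock();
 *	blkg_for_each_descendant_post(blkg, pos_css, parent_blkg)
 *		update_one_blkg(blkg);
 *	rcu_read_unlock();
 *
 * parent_blkg and update_one_blkg() stand in for the caller's own blkg and
 * per-blkg work.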
*/ #define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q))) static inline void blkcg_use_delay(struct blkcg_gq *blkg) { if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) return; if (atomic_add_return(1, &blkg->use_delay) == 1) atomic_inc(&blkg->blkcg->congestion_count); } static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) { int old = atomic_read(&blkg->use_delay); if (WARN_ON_ONCE(old < 0)) return 0; if (old == 0) return 0; /* * We do this song and dance because we can race with somebody else * adding or removing delay. If we just did an atomic_dec we'd end up * negative and we'd already be in trouble. We need to subtract 1 and * then check to see if we were the last delay so we can drop the * congestion count on the cgroup. */ while (old && !atomic_try_cmpxchg(&blkg->use_delay, &old, old - 1)) ; if (old == 0) return 0; if (old == 1) atomic_dec(&blkg->blkcg->congestion_count); return 1; } /** * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount * @blkg: target blkg * @delay: delay duration in nsecs * * When enabled with this function, the delay is not decayed and must be * explicitly cleared with blkcg_clear_delay(). Must not be mixed with * blkcg_[un]use_delay() and blkcg_add_delay() usages. */ static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay) { int old = atomic_read(&blkg->use_delay); /* We only want 1 person setting the congestion count for this blkg. */ if (!old && atomic_try_cmpxchg(&blkg->use_delay, &old, -1)) atomic_inc(&blkg->blkcg->congestion_count); atomic64_set(&blkg->delay_nsec, delay); } /** * blkcg_clear_delay - Disable allocator delay mechanism * @blkg: target blkg * * Disable use_delay mechanism. See blkcg_set_delay(). */ static inline void blkcg_clear_delay(struct blkcg_gq *blkg) { int old = atomic_read(&blkg->use_delay); /* We only want 1 person clearing the congestion count for this blkg. */ if (old && atomic_try_cmpxchg(&blkg->use_delay, &old, 0)) atomic_dec(&blkg->blkcg->congestion_count); } /** * blk_cgroup_mergeable - Determine whether to allow or disallow merges * @rq: request to merge into * @bio: bio to merge * * @bio and @rq should belong to the same cgroup and their issue_as_root should * match. The latter is necessary as we don't want to throttle e.g. a metadata * update because it happens to be next to a regular IO. 
*/ static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return rq->bio->bi_blkg == bio->bi_blkg && bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio); } static inline bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) { return pol && test_bit(pol->plid, q->blkcg_pols); } void blk_cgroup_bio_start(struct bio *bio); void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); #else /* CONFIG_BLK_CGROUP */ struct blkg_policy_data { }; struct blkcg_policy_data { }; struct blkcg_policy { }; struct blkcg { }; static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } static inline void blkg_init_queue(struct request_queue *q) { } static inline int blkcg_init_disk(struct gendisk *disk) { return 0; } static inline void blkcg_exit_disk(struct gendisk *disk) { } static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } static inline int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { return 0; } static inline void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { } static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return NULL; } static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } static inline void blkg_get(struct blkcg_gq *blkg) { } static inline void blkg_put(struct blkcg_gq *blkg) { } static inline void blk_cgroup_bio_start(struct bio *bio) { } static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; } #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) #endif /* CONFIG_BLK_CGROUP */ #endif /* _BLK_CGROUP_PRIVATE_H */ |
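/*
 * A minimal registration sketch (not part of the header above): the smallest
 * policy shape that fits the callbacks declared in blk-cgroup.h. The
 * example_ names are hypothetical; a real policy embeds
 * struct blkg_policy_data in a larger per-blkg structure and typically also
 * provides cftypes and the remaining pd_ and cpd_ callbacks.
 */
#include <linux/slab.h>
#include "blk-cgroup.h"

static struct blkg_policy_data *example_pd_alloc(struct gendisk *disk,
						 struct blkcg *blkcg, gfp_t gfp)
{
	return kzalloc_node(sizeof(struct blkg_policy_data), gfp,
			    disk->node_id);
}

static void example_pd_free(struct blkg_policy_data *pd)
{
	kfree(pd);
}

static struct blkcg_policy example_blkcg_policy = {
	.pd_alloc_fn	= example_pd_alloc,
	.pd_free_fn	= example_pd_free,
};

static int __init example_blkcg_init(void)
{
	/* Grabs a policy ID (plid) slot and wires up any cftypes. */
	return blkcg_policy_register(&example_blkcg_policy);
}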
// SPDX-License-Identifier: GPL-2.0-or-later /* * NET An implementation of the SOCKET network access protocol. * * Version: @(#)socket.c 1.1.93 18/02/95 * * Authors: Orest Zborowski, <obz@Kodak.COM> * Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * * Fixes: * Anonymous : NOTSOCK/BADF cleanup. Error fix in * shutdown() * Alan Cox : verify_area() fixes * Alan Cox : Removed DDI * Jonathan Kamens : SOCK_DGRAM reconnect bug * Alan Cox : Moved a load of checks to the very * top level. * Alan Cox : Move address structures to/from user * mode above the protocol layers. * Rob Janssen : Allow 0 length sends. * Alan Cox : Asynchronous I/O support (cribbed from the * tty drivers). * Niibe Yutaka : Asynchronous I/O for writes (4.4BSD style) * Jeff Uphoff : Made max number of sockets command-line * configurable. * Matti Aarnio : Made the number of sockets dynamic, * to be allocated when needed, and mr. * Uphoff's max is used as max to be * allowed to allocate. * Linus : Argh. removed all the socket allocation * altogether: it's in the inode now. * Alan Cox : Made sock_alloc()/sock_release() public * for NetROM and future kernel nfsd type * stuff. * Alan Cox : sendmsg/recvmsg basics. * Tom Dyas : Export net symbols. * Marcin Dalecki : Fixed problems with CONFIG_NET="n". * Alan Cox : Added thread locking to sys_* calls * for sockets. May have errors at the * moment. * Kevin Buhr : Fixed the dumb errors in the above. * Andi Kleen : Some small cleanups, optimizations, * and fixed a copy_from_user() bug. * Tigran Aivazian : sys_send(args) calls sys_sendto(args, NULL, 0) * Tigran Aivazian : Made listen(2) backlog sanity checks * protocol-independent * * This module is effectively the top level interface to the BSD socket * paradigm.
* * Based upon Swansea University Computer Society NET3.039 */ #include <linux/bpf-cgroup.h> #include <linux/ethtool.h> #include <linux/mm.h> #include <linux/socket.h> #include <linux/file.h> #include <linux/splice.h> #include <linux/net.h> #include <linux/interrupt.h> #include <linux/thread_info.h> #include <linux/rcupdate.h> #include <linux/netdevice.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/mutex.h> #include <linux/if_bridge.h> #include <linux/if_vlan.h> #include <linux/ptp_classify.h> #include <linux/init.h> #include <linux/poll.h> #include <linux/cache.h> #include <linux/module.h> #include <linux/highmem.h> #include <linux/mount.h> #include <linux/pseudo_fs.h> #include <linux/security.h> #include <linux/uio.h> #include <linux/syscalls.h> #include <linux/compat.h> #include <linux/kmod.h> #include <linux/audit.h> #include <linux/wireless.h> #include <linux/nsproxy.h> #include <linux/magic.h> #include <linux/slab.h> #include <linux/xattr.h> #include <linux/nospec.h> #include <linux/indirect_call_wrapper.h> #include <linux/io_uring/net.h> #include <linux/uaccess.h> #include <asm/unistd.h> #include <net/compat.h> #include <net/wext.h> #include <net/cls_cgroup.h> #include <net/sock.h> #include <linux/netfilter.h> #include <linux/if_tun.h> #include <linux/ipv6_route.h> #include <linux/route.h> #include <linux/termios.h> #include <linux/sockios.h> #include <net/busy_poll.h> #include <linux/errqueue.h> #include <linux/ptp_clock_kernel.h> #include <trace/events/sock.h> #include "core/dev.h" #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sysctl_net_busy_read __read_mostly; unsigned int sysctl_net_busy_poll __read_mostly; #endif static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to); static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from); static int sock_mmap(struct file *file, struct vm_area_struct *vma); static int sock_close(struct inode *inode, struct file *file); static __poll_t sock_poll(struct file *file, struct poll_table_struct *wait); static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #ifdef CONFIG_COMPAT static long compat_sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #endif static int sock_fasync(int fd, struct file *filp, int on); static ssize_t sock_splice_read(struct file *file, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); static void sock_splice_eof(struct file *file); #ifdef CONFIG_PROC_FS static void sock_show_fdinfo(struct seq_file *m, struct file *f) { struct socket *sock = f->private_data; const struct proto_ops *ops = READ_ONCE(sock->ops); if (ops->show_fdinfo) ops->show_fdinfo(m, sock); } #else #define sock_show_fdinfo NULL #endif /* * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear * in the operation structures but are done directly via the socketcall() multiplexor. 
*/ static const struct file_operations socket_file_ops = { .owner = THIS_MODULE, .read_iter = sock_read_iter, .write_iter = sock_write_iter, .poll = sock_poll, .unlocked_ioctl = sock_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_sock_ioctl, #endif .uring_cmd = io_uring_cmd_sock, .mmap = sock_mmap, .release = sock_close, .fasync = sock_fasync, .splice_write = splice_to_socket, .splice_read = sock_splice_read, .splice_eof = sock_splice_eof, .show_fdinfo = sock_show_fdinfo, }; static const char * const pf_family_names[] = { [PF_UNSPEC] = "PF_UNSPEC", [PF_UNIX] = "PF_UNIX/PF_LOCAL", [PF_INET] = "PF_INET", [PF_AX25] = "PF_AX25", [PF_IPX] = "PF_IPX", [PF_APPLETALK] = "PF_APPLETALK", [PF_NETROM] = "PF_NETROM", [PF_BRIDGE] = "PF_BRIDGE", [PF_ATMPVC] = "PF_ATMPVC", [PF_X25] = "PF_X25", [PF_INET6] = "PF_INET6", [PF_ROSE] = "PF_ROSE", [PF_DECnet] = "PF_DECnet", [PF_NETBEUI] = "PF_NETBEUI", [PF_SECURITY] = "PF_SECURITY", [PF_KEY] = "PF_KEY", [PF_NETLINK] = "PF_NETLINK/PF_ROUTE", [PF_PACKET] = "PF_PACKET", [PF_ASH] = "PF_ASH", [PF_ECONET] = "PF_ECONET", [PF_ATMSVC] = "PF_ATMSVC", [PF_RDS] = "PF_RDS", [PF_SNA] = "PF_SNA", [PF_IRDA] = "PF_IRDA", [PF_PPPOX] = "PF_PPPOX", [PF_WANPIPE] = "PF_WANPIPE", [PF_LLC] = "PF_LLC", [PF_IB] = "PF_IB", [PF_MPLS] = "PF_MPLS", [PF_CAN] = "PF_CAN", [PF_TIPC] = "PF_TIPC", [PF_BLUETOOTH] = "PF_BLUETOOTH", [PF_IUCV] = "PF_IUCV", [PF_RXRPC] = "PF_RXRPC", [PF_ISDN] = "PF_ISDN", [PF_PHONET] = "PF_PHONET", [PF_IEEE802154] = "PF_IEEE802154", [PF_CAIF] = "PF_CAIF", [PF_ALG] = "PF_ALG", [PF_NFC] = "PF_NFC", [PF_VSOCK] = "PF_VSOCK", [PF_KCM] = "PF_KCM", [PF_QIPCRTR] = "PF_QIPCRTR", [PF_SMC] = "PF_SMC", [PF_XDP] = "PF_XDP", [PF_MCTP] = "PF_MCTP", }; /* * The protocol list. Each protocol is registered in here. */ static DEFINE_SPINLOCK(net_family_lock); static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly; /* * Support routines. * Move socket addresses back and forth across the kernel/user * divide and look after the messy bits. */ /** * move_addr_to_kernel - copy a socket address into kernel space * @uaddr: Address in user space * @kaddr: Address in kernel space * @ulen: Length in user space * * The address is copied into kernel space. If the provided address is * too long an error code of -EINVAL is returned. If the copy gives * invalid addresses -EFAULT is returned. On a success 0 is returned. */ int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr) { if (ulen < 0 || ulen > sizeof(struct sockaddr_storage)) return -EINVAL; if (ulen == 0) return 0; if (copy_from_user(kaddr, uaddr, ulen)) return -EFAULT; return audit_sockaddr(ulen, kaddr); } /** * move_addr_to_user - copy an address to user space * @kaddr: kernel space address * @klen: length of address in kernel * @uaddr: user space address * @ulen: pointer to user length field * * The value pointed to by ulen on entry is the buffer length available. * This is overwritten with the buffer space used. -EINVAL is returned * if an overlong buffer is specified or a negative buffer size. -EFAULT * is returned if either the buffer or the length field are not * accessible. * After copying the data up to the limit the user specifies, the true * length of the data is written over the length limit the user * specified. Zero is returned for a success. 
*/ static int move_addr_to_user(struct sockaddr_storage *kaddr, int klen, void __user *uaddr, int __user *ulen) { int len; BUG_ON(klen > sizeof(struct sockaddr_storage)); scoped_user_rw_access_size(ulen, 4, efault_end) { unsafe_get_user(len, ulen, efault_end); if (len > klen) len = klen; /* * "fromlen shall refer to the value before truncation.." * 1003.1g */ if (len >= 0) unsafe_put_user(klen, ulen, efault_end); } if (len) { if (len < 0) return -EINVAL; if (audit_sockaddr(klen, kaddr)) return -ENOMEM; if (copy_to_user(uaddr, kaddr, len)) return -EFAULT; } return 0; efault_end: return -EFAULT; } static struct kmem_cache *sock_inode_cachep __ro_after_init; struct sockfs_inode { struct simple_xattrs *xattrs; struct simple_xattr_limits xattr_limits; struct socket_alloc; }; static struct sockfs_inode *SOCKFS_I(struct inode *inode) { return container_of(inode, struct sockfs_inode, vfs_inode); } static struct inode *sock_alloc_inode(struct super_block *sb) { struct sockfs_inode *si; si = alloc_inode_sb(sb, sock_inode_cachep, GFP_KERNEL); if (!si) return NULL; si->xattrs = NULL; simple_xattr_limits_init(&si->xattr_limits); init_waitqueue_head(&si->socket.wq.wait); si->socket.wq.fasync_list = NULL; si->socket.wq.flags = 0; si->socket.state = SS_UNCONNECTED; si->socket.flags = 0; si->socket.ops = NULL; si->socket.sk = NULL; si->socket.file = NULL; return &si->vfs_inode; } static void sock_evict_inode(struct inode *inode) { struct sockfs_inode *si = SOCKFS_I(inode); struct simple_xattrs *xattrs = si->xattrs; if (xattrs) { simple_xattrs_free(xattrs, NULL); kfree(xattrs); } clear_inode(inode); } static void sock_free_inode(struct inode *inode) { struct sockfs_inode *si = SOCKFS_I(inode); kmem_cache_free(sock_inode_cachep, si); } static void init_once(void *foo) { struct sockfs_inode *si = (struct sockfs_inode *)foo; inode_init_once(&si->vfs_inode); } static void init_inodecache(void) { sock_inode_cachep = kmem_cache_create("sock_inode_cache", sizeof(struct sockfs_inode), 0, (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT), init_once); BUG_ON(sock_inode_cachep == NULL); } static const struct super_operations sockfs_ops = { .alloc_inode = sock_alloc_inode, .free_inode = sock_free_inode, .evict_inode = sock_evict_inode, .statfs = simple_statfs, }; /* * sockfs_dname() is called from d_path(). */ static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen) { return dynamic_dname(buffer, buflen, "socket:[%llu]", d_inode(dentry)->i_ino); } static const struct dentry_operations sockfs_dentry_operations = { .d_dname = sockfs_dname, }; static int sockfs_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *suffix, void *value, size_t size) { if (value) { if (dentry->d_name.len + 1 > size) return -ERANGE; memcpy(value, dentry->d_name.name, dentry->d_name.len + 1); } return dentry->d_name.len + 1; } #define XATTR_SOCKPROTONAME_SUFFIX "sockprotoname" #define XATTR_NAME_SOCKPROTONAME (XATTR_SYSTEM_PREFIX XATTR_SOCKPROTONAME_SUFFIX) #define XATTR_NAME_SOCKPROTONAME_LEN (sizeof(XATTR_NAME_SOCKPROTONAME)-1) static const struct xattr_handler sockfs_xattr_handler = { .name = XATTR_NAME_SOCKPROTONAME, .get = sockfs_xattr_get, }; static int sockfs_security_xattr_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *suffix, const void *value, size_t size, int flags) { /* Handled by LSM. 
*/ return -EAGAIN; } static const struct xattr_handler sockfs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .set = sockfs_security_xattr_set, }; static int sockfs_user_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *suffix, void *value, size_t size) { const char *name = xattr_full_name(handler, suffix); struct simple_xattrs *xattrs; xattrs = READ_ONCE(SOCKFS_I(inode)->xattrs); if (!xattrs) return -ENODATA; return simple_xattr_get(xattrs, name, value, size); } static int sockfs_user_xattr_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *suffix, const void *value, size_t size, int flags) { const char *name = xattr_full_name(handler, suffix); struct sockfs_inode *si = SOCKFS_I(inode); struct simple_xattrs *xattrs; xattrs = simple_xattrs_lazy_alloc(&si->xattrs, value, flags); if (IS_ERR_OR_NULL(xattrs)) return PTR_ERR(xattrs); return simple_xattr_set_limited(xattrs, &si->xattr_limits, name, value, size, flags); } static const struct xattr_handler sockfs_user_xattr_handler = { .prefix = XATTR_USER_PREFIX, .get = sockfs_user_xattr_get, .set = sockfs_user_xattr_set, }; static const struct xattr_handler * const sockfs_xattr_handlers[] = { &sockfs_xattr_handler, &sockfs_security_xattr_handler, &sockfs_user_xattr_handler, NULL }; static int sockfs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, SOCKFS_MAGIC); if (!ctx) return -ENOMEM; ctx->ops = &sockfs_ops; ctx->dops = &sockfs_dentry_operations; ctx->xattr = sockfs_xattr_handlers; return 0; } static struct vfsmount *sock_mnt __read_mostly; static struct file_system_type sock_fs_type = { .name = "sockfs", .init_fs_context = sockfs_init_fs_context, .kill_sb = kill_anon_super, }; /* * Obtains the first available file descriptor and sets it up for use. * * These functions create file structures and maps them to fd space * of the current process. On success it returns file descriptor * and file struct implicitly stored in sock->file. * Note that another thread may close file descriptor before we return * from this function. We use the fact that now we do not refer * to socket after mapping. If one day we will need it, this * function will increment ref. count on file by 1. * * In any case returned fd MAY BE not valid! * This race condition is unavoidable * with shared fd spaces, we cannot solve it inside kernel, * but we take care of internal coherence yet. */ /** * sock_alloc_file - Bind a &socket to a &file * @sock: socket * @flags: file status flags * @dname: protocol name * * Returns the &file bound with @sock, implicitly storing it * in sock->file. If dname is %NULL, sets to "". * * On failure @sock is released, and an ERR pointer is returned. * * This function uses GFP_KERNEL internally. */ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname) { struct file *file; if (!dname) dname = sock->sk ? sock->sk->sk_prot_creator->name : ""; file = alloc_file_pseudo(SOCK_INODE(sock), sock_mnt, dname, O_RDWR | (flags & O_NONBLOCK), &socket_file_ops); if (IS_ERR(file)) { sock_release(sock); return file; } file->f_mode |= FMODE_NOWAIT; sock->file = file; file->private_data = sock; stream_open(SOCK_INODE(sock), file); /* * Disable permission and pre-content events, but enable legacy * inotify events for legacy users. 
*/ file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM); return file; } EXPORT_SYMBOL(sock_alloc_file); static int sock_map_fd(struct socket *sock, int flags) { struct file *newfile; int fd = get_unused_fd_flags(flags); if (unlikely(fd < 0)) { sock_release(sock); return fd; } newfile = sock_alloc_file(sock, flags, NULL); if (!IS_ERR(newfile)) { fd_install(fd, newfile); return fd; } put_unused_fd(fd); return PTR_ERR(newfile); } /** * sock_from_file - Return the &socket bounded to @file. * @file: file * * On failure returns %NULL. */ struct socket *sock_from_file(struct file *file) { if (likely(file->f_op == &socket_file_ops)) return file->private_data; /* set in sock_alloc_file */ return NULL; } EXPORT_SYMBOL(sock_from_file); /** * sockfd_lookup - Go from a file number to its socket slot * @fd: file handle * @err: pointer to an error code return * * The file handle passed in is locked and the socket it is bound * to is returned. If an error occurs the err pointer is overwritten * with a negative errno code and NULL is returned. The function checks * for both invalid handles and passing a handle which is not a socket. * * On a success the socket object pointer is returned. */ struct socket *sockfd_lookup(int fd, int *err) { struct file *file; struct socket *sock; file = fget(fd); if (!file) { *err = -EBADF; return NULL; } sock = sock_from_file(file); if (!sock) { *err = -ENOTSOCK; fput(file); } return sock; } EXPORT_SYMBOL(sockfd_lookup); static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer, size_t size) { struct sockfs_inode *si = SOCKFS_I(d_inode(dentry)); ssize_t len, used; len = simple_xattr_list(d_inode(dentry), READ_ONCE(si->xattrs), buffer, size); if (len < 0) return len; used = len; if (buffer) { buffer += len; size -= len; } len = XATTR_NAME_SOCKPROTONAME_LEN + 1; used += len; if (buffer) { if (size < len) return -ERANGE; memcpy(buffer, XATTR_NAME_SOCKPROTONAME, len); } return used; } static int sockfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { int err = simple_setattr(&nop_mnt_idmap, dentry, iattr); if (!err && (iattr->ia_valid & ATTR_UID)) { struct socket *sock = SOCKET_I(d_inode(dentry)); if (sock->sk) { /* Paired with READ_ONCE() in sk_uid() */ WRITE_ONCE(sock->sk->sk_uid, iattr->ia_uid); } else { err = -ENOENT; } } return err; } static const struct inode_operations sockfs_inode_ops = { .listxattr = sockfs_listxattr, .setattr = sockfs_setattr, }; /** * sock_alloc - allocate a socket * * Allocate a new inode and socket object. The two are bound together * and initialised. The socket is then returned. If we are out of inodes * NULL is returned. This functions uses GFP_KERNEL internally. 
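 * A minimal, illustrative sketch of a caller (not taken from this file;
 * error handling trimmed):
 *
 *	struct socket *sock = sock_alloc();
 *
 *	if (!sock)
 *		return -ENFILE;
 *	sock->type = SOCK_STREAM;
 *	... let the protocol family fill in sock->ops and sock->sk ...
 *	sock_release(sock);
 *
 * While no file has been attached yet, sock_release() simply drops the
 * backing inode again.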
*/ struct socket *sock_alloc(void) { struct inode *inode; struct socket *sock; inode = new_inode_pseudo(sock_mnt->mnt_sb); if (!inode) return NULL; sock = SOCKET_I(inode); inode->i_ino = get_next_ino(); inode->i_mode = S_IFSOCK | S_IRWXUGO; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_op = &sockfs_inode_ops; return sock; } EXPORT_SYMBOL(sock_alloc); static void __sock_release(struct socket *sock, struct inode *inode) { const struct proto_ops *ops = READ_ONCE(sock->ops); if (ops) { struct module *owner = ops->owner; if (inode) inode_lock(inode); ops->release(sock); sock->sk = NULL; if (inode) inode_unlock(inode); sock->ops = NULL; module_put(owner); } if (sock->wq.fasync_list) pr_err("%s: fasync list not empty!\n", __func__); if (!sock->file) { iput(SOCK_INODE(sock)); return; } WRITE_ONCE(sock->file, NULL); } /** * sock_release - close a socket * @sock: socket to close * * The socket is released from the protocol stack if it has a release * callback, and the inode is then released if the socket is bound to * an inode not a file. */ void sock_release(struct socket *sock) { __sock_release(sock, NULL); } EXPORT_SYMBOL(sock_release); void __sock_tx_timestamp(__u32 tsflags, __u8 *tx_flags) { u8 flags = *tx_flags; if (tsflags & SOF_TIMESTAMPING_TX_HARDWARE) flags |= SKBTX_HW_TSTAMP_NOBPF; if (tsflags & SOF_TIMESTAMPING_TX_SOFTWARE) flags |= SKBTX_SW_TSTAMP; if (tsflags & SOF_TIMESTAMPING_TX_SCHED) flags |= SKBTX_SCHED_TSTAMP; if (tsflags & SOF_TIMESTAMPING_TX_COMPLETION) flags |= SKBTX_COMPLETION_TSTAMP; *tx_flags = flags; } EXPORT_SYMBOL(__sock_tx_timestamp); INDIRECT_CALLABLE_DECLARE(int inet_sendmsg(struct socket *, struct msghdr *, size_t)); INDIRECT_CALLABLE_DECLARE(int inet6_sendmsg(struct socket *, struct msghdr *, size_t)); static noinline void call_trace_sock_send_length(struct sock *sk, int ret, int flags) { trace_sock_send_length(sk, ret, 0); } static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) { int ret = INDIRECT_CALL_INET(READ_ONCE(sock->ops)->sendmsg, inet6_sendmsg, inet_sendmsg, sock, msg, msg_data_left(msg)); BUG_ON(ret == -EIOCBQUEUED); if (trace_sock_send_length_enabled()) call_trace_sock_send_length(sock->sk, ret, 0); return ret; } static int __sock_sendmsg(struct socket *sock, struct msghdr *msg) { int err = security_socket_sendmsg(sock, msg, msg_data_left(msg)); return err ?: sock_sendmsg_nosec(sock, msg); } /** * sock_sendmsg - send a message through @sock * @sock: socket * @msg: message to send * * Sends @msg through @sock, passing through LSM. * Returns the number of bytes sent, or an error code. */ int sock_sendmsg(struct socket *sock, struct msghdr *msg) { struct sockaddr_storage *save_addr = (struct sockaddr_storage *)msg->msg_name; struct sockaddr_storage address; int save_len = msg->msg_namelen; int ret; if (msg->msg_name) { memcpy(&address, msg->msg_name, msg->msg_namelen); msg->msg_name = &address; } ret = __sock_sendmsg(sock, msg); msg->msg_name = save_addr; msg->msg_namelen = save_len; return ret; } EXPORT_SYMBOL(sock_sendmsg); /** * kernel_sendmsg - send a message through @sock (kernel-space) * @sock: socket * @msg: message header * @vec: kernel vec * @num: vec array length * @size: total message data size * * Builds the message data with @vec and sends it through @sock. * Returns the number of bytes sent, or an error code. 
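 * An illustrative sketch of a typical in-kernel caller (buf and len are
 * placeholders, not names from this file):
 *
 *	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
 *	struct kvec iov = { .iov_base = buf, .iov_len = len };
 *	int sent;
 *
 *	sent = kernel_sendmsg(sock, &msg, &iov, 1, len);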
*/ int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size) { iov_iter_kvec(&msg->msg_iter, ITER_SOURCE, vec, num, size); return sock_sendmsg(sock, msg); } EXPORT_SYMBOL(kernel_sendmsg); static bool skb_is_err_queue(const struct sk_buff *skb) { /* pkt_type of skbs enqueued on the error queue are set to * PACKET_OUTGOING in skb_set_err_queue(). This is only safe to do * in recvmsg, since skbs received on a local socket will never * have a pkt_type of PACKET_OUTGOING. */ return skb->pkt_type == PACKET_OUTGOING; } /* On transmit, software and hardware timestamps are returned independently. * As the two skb clones share the hardware timestamp, which may be updated * before the software timestamp is received, a hardware TX timestamp may be * returned only if there is no software TX timestamp. Ignore false software * timestamps, which may be made in the __sock_recv_timestamp() call when the * option SO_TIMESTAMP_OLD(NS) is enabled on the socket, even when the skb has a * hardware timestamp. */ static bool skb_is_swtx_tstamp(const struct sk_buff *skb, int false_tstamp) { return skb->tstamp && !false_tstamp && skb_is_err_queue(skb); } static ktime_t get_timestamp(struct sock *sk, struct sk_buff *skb, int *if_index) { bool cycles = READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_BIND_PHC; struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); struct net_device *orig_dev; ktime_t hwtstamp; rcu_read_lock(); orig_dev = dev_get_by_napi_id(skb_napi_id(skb)); if (orig_dev) { *if_index = orig_dev->ifindex; hwtstamp = netdev_get_tstamp(orig_dev, shhwtstamps, cycles); } else { hwtstamp = shhwtstamps->hwtstamp; } rcu_read_unlock(); return hwtstamp; } static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb, int if_index) { struct scm_ts_pktinfo ts_pktinfo; struct net_device *orig_dev; if (!skb_mac_header_was_set(skb)) return; memset(&ts_pktinfo, 0, sizeof(ts_pktinfo)); if (!if_index) { rcu_read_lock(); orig_dev = dev_get_by_napi_id(skb_napi_id(skb)); if (orig_dev) if_index = orig_dev->ifindex; rcu_read_unlock(); } ts_pktinfo.if_index = if_index; ts_pktinfo.pkt_length = skb->len - skb_mac_offset(skb); put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_PKTINFO, sizeof(ts_pktinfo), &ts_pktinfo); } bool skb_has_tx_timestamp(struct sk_buff *skb, const struct sock *sk) { const struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); u32 tsflags = READ_ONCE(sk->sk_tsflags); if (serr->ee.ee_errno != ENOMSG || serr->ee.ee_origin != SO_EE_ORIGIN_TIMESTAMPING) return false; /* software time stamp available and wanted */ if ((tsflags & SOF_TIMESTAMPING_SOFTWARE) && skb->tstamp) return true; /* hardware time stamps available and wanted */ return (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && skb_hwtstamps(skb)->hwtstamp; } int skb_get_tx_timestamp(struct sk_buff *skb, struct sock *sk, struct timespec64 *ts) { u32 tsflags = READ_ONCE(sk->sk_tsflags); ktime_t hwtstamp; int if_index = 0; if ((tsflags & SOF_TIMESTAMPING_SOFTWARE) && ktime_to_timespec64_cond(skb->tstamp, ts)) return SOF_TIMESTAMPING_TX_SOFTWARE; if (!(tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) || skb_is_swtx_tstamp(skb, false)) return -ENOENT; if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NETDEV) hwtstamp = get_timestamp(sk, skb, &if_index); else hwtstamp = skb_hwtstamps(skb)->hwtstamp; if (tsflags & SOF_TIMESTAMPING_BIND_PHC) hwtstamp = ptp_convert_timestamp(&hwtstamp, READ_ONCE(sk->sk_bind_phc)); if (!ktime_to_timespec64_cond(hwtstamp, ts)) return -ENOENT; return SOF_TIMESTAMPING_TX_HARDWARE; } /* * called from 
sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP) */ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP); int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); struct scm_timestamping_internal tss; int if_index, false_tstamp = 0; ktime_t hwtstamp; u32 tsflags; /* Race occurred between timestamp enabling and packet receiving. Fill in the current time for now. */ if (need_software_tstamp && skb->tstamp == 0) { __net_timestamp(skb); false_tstamp = 1; } if (need_software_tstamp) { if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) { if (new_tstamp) { struct __kernel_sock_timeval tv; skb_get_new_timestamp(skb, &tv); put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW, sizeof(tv), &tv); } else { struct __kernel_old_timeval tv; skb_get_timestamp(skb, &tv); put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, sizeof(tv), &tv); } } else { if (new_tstamp) { struct __kernel_timespec ts; skb_get_new_timestampns(skb, &ts); put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW, sizeof(ts), &ts); } else { struct __kernel_old_timespec ts; skb_get_timestampns(skb, &ts); put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD, sizeof(ts), &ts); } } } memset(&tss, 0, sizeof(tss)); tsflags = READ_ONCE(sk->sk_tsflags); if (tsflags & SOF_TIMESTAMPING_SOFTWARE && (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE || skb_is_err_queue(skb) || !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) tss.ts[0] = skb->tstamp; if (shhwtstamps && (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE && (tsflags & SOF_TIMESTAMPING_RX_HARDWARE || skb_is_err_queue(skb) || !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) && !skb_is_swtx_tstamp(skb, false_tstamp)) { if_index = 0; if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NETDEV) hwtstamp = get_timestamp(sk, skb, &if_index); else hwtstamp = shhwtstamps->hwtstamp; if (tsflags & SOF_TIMESTAMPING_BIND_PHC) hwtstamp = ptp_convert_timestamp(&hwtstamp, READ_ONCE(sk->sk_bind_phc)); if (hwtstamp) { tss.ts[2] = hwtstamp; if ((tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) && !skb_is_err_queue(skb)) put_ts_pktinfo(msg, skb, if_index); } } if (tss.ts[0] | tss.ts[2]) { if (sock_flag(sk, SOCK_TSTAMP_NEW)) put_cmsg_scm_timestamping64(msg, &tss); else put_cmsg_scm_timestamping(msg, &tss); if (skb_is_err_queue(skb) && skb->len && SKB_EXT_ERR(skb)->opt_stats) put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_OPT_STATS, skb->len, skb->data); } } EXPORT_SYMBOL_GPL(__sock_recv_timestamp); #ifdef CONFIG_WIRELESS void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { int ack; if (!sock_flag(sk, SOCK_WIFI_STATUS)) return; if (!skb->wifi_acked_valid) return; ack = skb->wifi_acked; put_cmsg(msg, SOL_SOCKET, SCM_WIFI_STATUS, sizeof(ack), &ack); } EXPORT_SYMBOL_GPL(__sock_recv_wifi_status); #endif static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { if (sock_flag(sk, SOCK_RXQ_OVFL) && skb && SOCK_SKB_CB(skb)->dropcount) put_cmsg(msg, SOL_SOCKET, SO_RXQ_OVFL, sizeof(__u32), &SOCK_SKB_CB(skb)->dropcount); } static void sock_recv_mark(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { if (sock_flag(sk, SOCK_RCVMARK) && skb) { /* We must use a bounce buffer for CONFIG_HARDENED_USERCOPY=y */ __u32 mark = skb->mark; put_cmsg(msg, SOL_SOCKET, SO_MARK, sizeof(__u32), &mark); } } static void sock_recv_priority(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { if (sock_flag(sk, SOCK_RCVPRIORITY) && skb) { __u32 priority = skb->priority; put_cmsg(msg, 
SOL_SOCKET, SO_PRIORITY, sizeof(__u32), &priority); } } void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { sock_recv_timestamp(msg, sk, skb); sock_recv_drops(msg, sk, skb); sock_recv_mark(msg, sk, skb); sock_recv_priority(msg, sk, skb); } EXPORT_SYMBOL_GPL(__sock_recv_cmsgs); INDIRECT_CALLABLE_DECLARE(int inet_recvmsg(struct socket *, struct msghdr *, size_t, int)); INDIRECT_CALLABLE_DECLARE(int inet6_recvmsg(struct socket *, struct msghdr *, size_t, int)); static noinline void call_trace_sock_recv_length(struct sock *sk, int ret, int flags) { trace_sock_recv_length(sk, ret, flags); } static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, int flags) { int ret = INDIRECT_CALL_INET(READ_ONCE(sock->ops)->recvmsg, inet6_recvmsg, inet_recvmsg, sock, msg, msg_data_left(msg), flags); if (trace_sock_recv_length_enabled()) call_trace_sock_recv_length(sock->sk, ret, flags); return ret; } /** * sock_recvmsg - receive a message from @sock * @sock: socket * @msg: message to receive * @flags: message flags * * Receives @msg from @sock, passing through LSM. Returns the total number * of bytes received, or an error. */ int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags) { int err = security_socket_recvmsg(sock, msg, msg_data_left(msg), flags); return err ?: sock_recvmsg_nosec(sock, msg, flags); } EXPORT_SYMBOL(sock_recvmsg); /** * kernel_recvmsg - Receive a message from a socket (kernel space) * @sock: The socket to receive the message from * @msg: Received message * @vec: Input s/g array for message data * @num: Size of input s/g array * @size: Number of bytes to read * @flags: Message flags (MSG_DONTWAIT, etc...) * * On return the msg structure contains the scatter/gather array passed in the * vec argument. The array is modified so that it consists of the unfilled * portion of the original array. * * The returned value is the total number of bytes received, or an error. 
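 * An illustrative sketch (buf and size are placeholders, not names from
 * this file):
 *
 *	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
 *	struct kvec iov = { .iov_base = buf, .iov_len = size };
 *	int len;
 *
 *	len = kernel_recvmsg(sock, &msg, &iov, 1, size, msg.msg_flags);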
*/ int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size, int flags) { msg->msg_control_is_user = false; iov_iter_kvec(&msg->msg_iter, ITER_DEST, vec, num, size); return sock_recvmsg(sock, msg, flags); } EXPORT_SYMBOL(kernel_recvmsg); static ssize_t sock_splice_read(struct file *file, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct socket *sock = file->private_data; const struct proto_ops *ops; ops = READ_ONCE(sock->ops); if (unlikely(!ops->splice_read)) return copy_splice_read(file, ppos, pipe, len, flags); return ops->splice_read(sock, ppos, pipe, len, flags); } static void sock_splice_eof(struct file *file) { struct socket *sock = file->private_data; const struct proto_ops *ops; ops = READ_ONCE(sock->ops); if (ops->splice_eof) ops->splice_eof(sock); } static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct socket *sock = file->private_data; struct msghdr msg = {.msg_iter = *to, .msg_iocb = iocb}; ssize_t res; if (file->f_flags & O_NONBLOCK || (iocb->ki_flags & IOCB_NOWAIT)) msg.msg_flags = MSG_DONTWAIT; if (iocb->ki_pos != 0) return -ESPIPE; if (!iov_iter_count(to)) /* Match SYS5 behaviour */ return 0; res = sock_recvmsg(sock, &msg, msg.msg_flags); *to = msg.msg_iter; return res; } static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct socket *sock = file->private_data; struct msghdr msg = {.msg_iter = *from, .msg_iocb = iocb}; ssize_t res; if (iocb->ki_pos != 0) return -ESPIPE; if (file->f_flags & O_NONBLOCK || (iocb->ki_flags & IOCB_NOWAIT)) msg.msg_flags = MSG_DONTWAIT; if (sock->type == SOCK_SEQPACKET) msg.msg_flags |= MSG_EOR; if (iocb->ki_flags & IOCB_NOSIGNAL) msg.msg_flags |= MSG_NOSIGNAL; res = __sock_sendmsg(sock, &msg); *from = msg.msg_iter; return res; } /* * Atomic setting of ioctl hooks to avoid race * with module unload. */ static DEFINE_MUTEX(br_ioctl_mutex); static int (*br_ioctl_hook)(struct net *net, unsigned int cmd, void __user *uarg); void brioctl_set(int (*hook)(struct net *net, unsigned int cmd, void __user *uarg)) { mutex_lock(&br_ioctl_mutex); br_ioctl_hook = hook; mutex_unlock(&br_ioctl_mutex); } EXPORT_SYMBOL(brioctl_set); int br_ioctl_call(struct net *net, unsigned int cmd, void __user *uarg) { int err = -ENOPKG; if (!br_ioctl_hook) request_module("bridge"); mutex_lock(&br_ioctl_mutex); if (br_ioctl_hook) err = br_ioctl_hook(net, cmd, uarg); mutex_unlock(&br_ioctl_mutex); return err; } static DEFINE_MUTEX(vlan_ioctl_mutex); static int (*vlan_ioctl_hook) (struct net *, void __user *arg); void vlan_ioctl_set(int (*hook) (struct net *, void __user *)) { mutex_lock(&vlan_ioctl_mutex); vlan_ioctl_hook = hook; mutex_unlock(&vlan_ioctl_mutex); } EXPORT_SYMBOL(vlan_ioctl_set); static long sock_do_ioctl(struct net *net, struct socket *sock, unsigned int cmd, unsigned long arg) { const struct proto_ops *ops = READ_ONCE(sock->ops); struct ifreq ifr; bool need_copyout; int err; void __user *argp = (void __user *)arg; void __user *data; err = ops->ioctl(sock, cmd, arg); /* * If this ioctl is unknown try to hand it down * to the NIC driver. 
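 * Only commands that look like socket/interface ioctls (see the
 * is_socket_ioctl_cmd() check below) are repackaged as an ifreq and
 * forwarded to dev_ioctl(); everything else fails with -ENOTTY.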
*/ if (err != -ENOIOCTLCMD) return err; if (!is_socket_ioctl_cmd(cmd)) return -ENOTTY; if (get_user_ifreq(&ifr, &data, argp)) return -EFAULT; err = dev_ioctl(net, cmd, &ifr, data, &need_copyout); if (!err && need_copyout) if (put_user_ifreq(&ifr, argp)) return -EFAULT; return err; } /* * With an ioctl, arg may well be a user mode pointer, but we don't know * what to do with it - that's up to the protocol still. */ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) { const struct proto_ops *ops; struct socket *sock; struct sock *sk; void __user *argp = (void __user *)arg; int pid, err; struct net *net; sock = file->private_data; ops = READ_ONCE(sock->ops); sk = sock->sk; net = sock_net(sk); if (unlikely(cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))) { struct ifreq ifr; void __user *data; bool need_copyout; if (get_user_ifreq(&ifr, &data, argp)) return -EFAULT; err = dev_ioctl(net, cmd, &ifr, data, &need_copyout); if (!err && need_copyout) if (put_user_ifreq(&ifr, argp)) return -EFAULT; } else #ifdef CONFIG_WEXT_CORE if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { err = wext_handle_ioctl(net, cmd, argp); } else #endif switch (cmd) { case FIOSETOWN: case SIOCSPGRP: err = -EFAULT; if (get_user(pid, (int __user *)argp)) break; err = f_setown(sock->file, pid, 1); break; case FIOGETOWN: case SIOCGPGRP: err = put_user(f_getown(sock->file), (int __user *)argp); break; case SIOCGIFBR: case SIOCSIFBR: case SIOCBRADDBR: case SIOCBRDELBR: case SIOCBRADDIF: case SIOCBRDELIF: err = br_ioctl_call(net, cmd, argp); break; case SIOCGIFVLAN: case SIOCSIFVLAN: err = -ENOPKG; if (!vlan_ioctl_hook) request_module("8021q"); mutex_lock(&vlan_ioctl_mutex); if (vlan_ioctl_hook) err = vlan_ioctl_hook(net, argp); mutex_unlock(&vlan_ioctl_mutex); break; case SIOCGSKNS: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = open_related_ns(&net->ns, get_net_ns); break; case SIOCGSTAMP_OLD: case SIOCGSTAMPNS_OLD: if (!ops->gettstamp) { err = -ENOIOCTLCMD; break; } err = ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD, !IS_ENABLED(CONFIG_64BIT)); break; case SIOCGSTAMP_NEW: case SIOCGSTAMPNS_NEW: if (!ops->gettstamp) { err = -ENOIOCTLCMD; break; } err = ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_NEW, false); break; case SIOCGIFCONF: err = dev_ifconf(net, argp); break; default: err = sock_do_ioctl(net, sock, cmd, arg); break; } return err; } /** * sock_create_lite - creates a socket * @family: protocol family (AF_INET, ...) * @type: communication type (SOCK_STREAM, ...) * @protocol: protocol (0, ...) * @res: new socket * * Creates a new socket and assigns it to @res, passing through LSM. * The new socket initialization is not complete, see kernel_accept(). * Returns 0 or an error. On failure @res is set to %NULL. * This function internally uses GFP_KERNEL. 
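 * An illustrative sketch, modelled on how kernel_accept() uses it
 * (sk is the listening socket's sock; error handling trimmed):
 *
 *	struct socket *newsock;
 *	int err = sock_create_lite(sk->sk_family, sk->sk_type,
 *				   sk->sk_protocol, &newsock);
 *
 *	if (err < 0)
 *		return err;
 *	... hand newsock to the protocol's accept path, then set
 *	    newsock->ops ...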
*/ int sock_create_lite(int family, int type, int protocol, struct socket **res) { int err; struct socket *sock = NULL; err = security_socket_create(family, type, protocol, 1); if (err) goto out; sock = sock_alloc(); if (!sock) { err = -ENOMEM; goto out; } sock->type = type; err = security_socket_post_create(sock, family, type, protocol, 1); if (err) goto out_release; out: *res = sock; return err; out_release: sock_release(sock); sock = NULL; goto out; } EXPORT_SYMBOL(sock_create_lite); /* No kernel lock held - perfect */ static __poll_t sock_poll(struct file *file, poll_table *wait) { struct socket *sock = file->private_data; const struct proto_ops *ops = READ_ONCE(sock->ops); __poll_t events = poll_requested_events(wait), flag = 0; if (!ops->poll) return 0; if (sk_can_busy_loop(sock->sk)) { /* poll once if requested by the syscall */ if (events & POLL_BUSY_LOOP) sk_busy_loop(sock->sk, 1); /* if this socket can poll_ll, tell the system call */ flag = POLL_BUSY_LOOP; } return ops->poll(file, sock, wait) | flag; } static int sock_mmap(struct file *file, struct vm_area_struct *vma) { struct socket *sock = file->private_data; return READ_ONCE(sock->ops)->mmap(file, sock, vma); } static int sock_close(struct inode *inode, struct file *filp) { __sock_release(SOCKET_I(inode), inode); return 0; } /* * Update the socket async list * * Fasync_list locking strategy. * * 1. fasync_list is modified only under process context socket lock * i.e. under semaphore. * 2. fasync_list is used under read_lock(&sk->sk_callback_lock) * or under socket lock */ static int sock_fasync(int fd, struct file *filp, int on) { struct socket *sock = filp->private_data; struct sock *sk = sock->sk; struct socket_wq *wq = &sock->wq; if (sk == NULL) return -EINVAL; lock_sock(sk); fasync_helper(fd, filp, on, &wq->fasync_list); if (!wq->fasync_list) sock_reset_flag(sk, SOCK_FASYNC); else sock_set_flag(sk, SOCK_FASYNC); release_sock(sk); return 0; } /* This function may be called only under rcu_lock */ int sock_wake_async(struct socket_wq *wq, int how, int band) { if (!wq || !wq->fasync_list) return -1; switch (how) { case SOCK_WAKE_WAITD: if (test_bit(SOCKWQ_ASYNC_WAITDATA, &wq->flags)) break; goto call_kill; case SOCK_WAKE_SPACE: if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags)) break; fallthrough; case SOCK_WAKE_IO: call_kill: kill_fasync(&wq->fasync_list, SIGIO, band); break; case SOCK_WAKE_URG: kill_fasync(&wq->fasync_list, SIGURG, band); } return 0; } EXPORT_SYMBOL(sock_wake_async); /** * __sock_create - creates a socket * @net: net namespace * @family: protocol family (AF_INET, ...) * @type: communication type (SOCK_STREAM, ...) * @protocol: protocol (0, ...) * @res: new socket * @kern: boolean for kernel space sockets * * Creates a new socket and assigns it to @res, passing through LSM. * Returns 0 or an error. On failure @res is set to %NULL. @kern must * be set to true if the socket resides in kernel space. * This function internally uses GFP_KERNEL. */ int __sock_create(struct net *net, int family, int type, int protocol, struct socket **res, int kern) { int err; struct socket *sock; const struct net_proto_family *pf; /* * Check protocol is in range */ if (family < 0 || family >= NPROTO) return -EAFNOSUPPORT; if (type < 0 || type >= SOCK_MAX) return -EINVAL; /* Compatibility. This uglymoron is moved from INET layer to here to avoid deadlock in module load. 
*/ if (family == PF_INET && type == SOCK_PACKET) { pr_info_once("%s uses obsolete (PF_INET,SOCK_PACKET)\n", current->comm); family = PF_PACKET; } err = security_socket_create(family, type, protocol, kern); if (err) return err; /* * Allocate the socket and allow the family to set things up. if * the protocol is 0, the family is instructed to select an appropriate * default. */ sock = sock_alloc(); if (!sock) { net_warn_ratelimited("socket: no more sockets\n"); return -ENFILE; /* Not exactly a match, but its the closest posix thing */ } sock->type = type; #ifdef CONFIG_MODULES /* Attempt to load a protocol module if the find failed. * * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user * requested real, full-featured networking support upon configuration. * Otherwise module support will break! */ if (rcu_access_pointer(net_families[family]) == NULL) request_module("net-pf-%d", family); #endif rcu_read_lock(); pf = rcu_dereference(net_families[family]); err = -EAFNOSUPPORT; if (!pf) goto out_release; /* * We will call the ->create function, that possibly is in a loadable * module, so we have to bump that loadable module refcnt first. */ if (!try_module_get(pf->owner)) goto out_release; /* Now protected by module ref count */ rcu_read_unlock(); err = pf->create(net, sock, protocol, kern); if (err < 0) { /* ->create should release the allocated sock->sk object on error * and make sure sock->sk is set to NULL to avoid use-after-free */ DEBUG_NET_WARN_ONCE(sock->sk, "%ps must clear sock->sk on failure, family: %d, type: %d, protocol: %d\n", pf->create, family, type, protocol); goto out_module_put; } /* * Now to bump the refcnt of the [loadable] module that owns this * socket at sock_release time we decrement its refcnt. */ if (!try_module_get(sock->ops->owner)) goto out_module_busy; /* * Now that we're done with the ->create function, the [loadable] * module can have its refcnt decremented */ module_put(pf->owner); err = security_socket_post_create(sock, family, type, protocol, kern); if (err) goto out_sock_release; *res = sock; return 0; out_module_busy: err = -EAFNOSUPPORT; out_module_put: sock->ops = NULL; module_put(pf->owner); out_sock_release: sock_release(sock); return err; out_release: rcu_read_unlock(); goto out_sock_release; } EXPORT_SYMBOL(__sock_create); /** * sock_create - creates a socket * @family: protocol family (AF_INET, ...) * @type: communication type (SOCK_STREAM, ...) * @protocol: protocol (0, ...) * @res: new socket * * A wrapper around __sock_create(). * Returns 0 or an error. This function internally uses GFP_KERNEL. */ int sock_create(int family, int type, int protocol, struct socket **res) { return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0); } EXPORT_SYMBOL(sock_create); /** * sock_create_kern - creates a socket (kernel space) * @net: net namespace * @family: protocol family (AF_INET, ...) * @type: communication type (SOCK_STREAM, ...) * @protocol: protocol (0, ...) * @res: new socket * * A wrapper around __sock_create(). * Returns 0 or an error. This function internally uses GFP_KERNEL. */ int sock_create_kern(struct net *net, int family, int type, int protocol, struct socket **res) { return __sock_create(net, family, type, protocol, res, 1); } EXPORT_SYMBOL(sock_create_kern); static struct socket *__sys_socket_create(int family, int type, int protocol) { struct socket *sock; int retval; /* Check the SOCK_* constants for consistency. 
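 * The low SOCK_TYPE_MASK bits of the userspace "type" argument carry
 * the socket type (SOCK_STREAM, SOCK_DGRAM, ...); the bits above it may
 * only be SOCK_CLOEXEC and SOCK_NONBLOCK, as in
 * socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, 0).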
*/ BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK); BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK); BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK); if ((type & ~SOCK_TYPE_MASK) & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return ERR_PTR(-EINVAL); type &= SOCK_TYPE_MASK; retval = sock_create(family, type, protocol, &sock); if (retval < 0) return ERR_PTR(retval); return sock; } struct file *__sys_socket_file(int family, int type, int protocol) { struct socket *sock; int flags; sock = __sys_socket_create(family, type, protocol); if (IS_ERR(sock)) return ERR_CAST(sock); flags = type & ~SOCK_TYPE_MASK; if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; return sock_alloc_file(sock, flags, NULL); } /* A hook for bpf progs to attach to and update socket protocol. * * A static noinline declaration here could cause the compiler to * optimize away the function. A global noinline declaration will * keep the definition, but may optimize away the callsite. * Therefore, __weak is needed to ensure that the call is still * emitted, by telling the compiler that we don't know what the * function might eventually be. */ __bpf_hook_start(); __weak noinline int update_socket_protocol(int family, int type, int protocol) { return protocol; } __bpf_hook_end(); int __sys_socket(int family, int type, int protocol) { struct socket *sock; int flags; sock = __sys_socket_create(family, type, update_socket_protocol(family, type, protocol)); if (IS_ERR(sock)) return PTR_ERR(sock); flags = type & ~SOCK_TYPE_MASK; if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; return sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); } SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol) { return __sys_socket(family, type, protocol); } /* * Create a pair of connected sockets. */ int __sys_socketpair(int family, int type, int protocol, int __user *usockvec) { struct socket *sock1, *sock2; int fd1, fd2, err; struct file *newfile1, *newfile2; int flags; flags = type & ~SOCK_TYPE_MASK; if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; type &= SOCK_TYPE_MASK; if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; /* * reserve descriptors and make sure we won't fail * to return them to userland. */ fd1 = get_unused_fd_flags(flags); if (unlikely(fd1 < 0)) return fd1; fd2 = get_unused_fd_flags(flags); if (unlikely(fd2 < 0)) { put_unused_fd(fd1); return fd2; } err = put_user(fd1, &usockvec[0]); if (err) goto out; err = put_user(fd2, &usockvec[1]); if (err) goto out; /* * Obtain the first socket and check if the underlying protocol * supports the socketpair call. 
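 * Both descriptors were reserved and copied out to userspace above, so
 * every failure from here on must unwind through the "out:" label and
 * hand both of them back with put_unused_fd().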
*/ err = sock_create(family, type, protocol, &sock1); if (unlikely(err < 0)) goto out; err = sock_create(family, type, protocol, &sock2); if (unlikely(err < 0)) { sock_release(sock1); goto out; } err = security_socket_socketpair(sock1, sock2); if (unlikely(err)) { sock_release(sock2); sock_release(sock1); goto out; } err = READ_ONCE(sock1->ops)->socketpair(sock1, sock2); if (unlikely(err < 0)) { sock_release(sock2); sock_release(sock1); goto out; } newfile1 = sock_alloc_file(sock1, flags, NULL); if (IS_ERR(newfile1)) { err = PTR_ERR(newfile1); sock_release(sock2); goto out; } newfile2 = sock_alloc_file(sock2, flags, NULL); if (IS_ERR(newfile2)) { err = PTR_ERR(newfile2); fput(newfile1); goto out; } audit_fd_pair(fd1, fd2); fd_install(fd1, newfile1); fd_install(fd2, newfile2); return 0; out: put_unused_fd(fd2); put_unused_fd(fd1); return err; } SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol, int __user *, usockvec) { return __sys_socketpair(family, type, protocol, usockvec); } int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address, int addrlen) { int err; err = security_socket_bind(sock, (struct sockaddr *)address, addrlen); if (!err) err = READ_ONCE(sock->ops)->bind(sock, (struct sockaddr_unsized *)address, addrlen); return err; } /* * Bind a name to a socket. Nothing much to do here since it's * the protocol's responsibility to handle the local address. * * We move the socket address to kernel space before we call * the protocol layer (having also checked the address is ok). */ int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen) { struct socket *sock; struct sockaddr_storage address; CLASS(fd, f)(fd); int err; if (fd_empty(f)) return -EBADF; sock = sock_from_file(fd_file(f)); if (unlikely(!sock)) return -ENOTSOCK; err = move_addr_to_kernel(umyaddr, addrlen, &address); if (unlikely(err)) return err; return __sys_bind_socket(sock, &address, addrlen); } SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen) { return __sys_bind(fd, umyaddr, addrlen); } /* * Perform a listen. Basically, we allow the protocol to do anything * necessary for a listen, and if that works, we mark the socket as * ready for listening. */ int __sys_listen_socket(struct socket *sock, int backlog) { int somaxconn, err; somaxconn = READ_ONCE(sock_net(sock->sk)->core.sysctl_somaxconn); if ((unsigned int)backlog > somaxconn) backlog = somaxconn; err = security_socket_listen(sock, backlog); if (!err) err = READ_ONCE(sock->ops)->listen(sock, backlog); return err; } int __sys_listen(int fd, int backlog) { CLASS(fd, f)(fd); struct socket *sock; if (fd_empty(f)) return -EBADF; sock = sock_from_file(fd_file(f)); if (unlikely(!sock)) return -ENOTSOCK; return __sys_listen_socket(sock, backlog); } SYSCALL_DEFINE2(listen, int, fd, int, backlog) { return __sys_listen(fd, backlog); } struct file *do_accept(struct file *file, struct proto_accept_arg *arg, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags) { struct socket *sock, *newsock; struct file *newfile; int err, len; struct sockaddr_storage address; const struct proto_ops *ops; sock = sock_from_file(file); if (!sock) return ERR_PTR(-ENOTSOCK); newsock = sock_alloc(); if (!newsock) return ERR_PTR(-ENFILE); ops = READ_ONCE(sock->ops); newsock->type = sock->type; newsock->ops = ops; /* * We don't need try_module_get here, as the listening socket (sock) * has the protocol module (sock->ops->owner) held. 
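 * The __module_get() below takes the reference that the new socket will
 * own; it is dropped again by module_put() in __sock_release() when the
 * accepted socket goes away.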
*/ __module_get(ops->owner); newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name); if (IS_ERR(newfile)) return newfile; err = security_socket_accept(sock, newsock); if (err) goto out_fd; arg->flags |= sock->file->f_flags; err = ops->accept(sock, newsock, arg); if (err < 0) goto out_fd; if (upeer_sockaddr) { len = ops->getname(newsock, (struct sockaddr *)&address, 2); if (len < 0) { err = -ECONNABORTED; goto out_fd; } err = move_addr_to_user(&address, len, upeer_sockaddr, upeer_addrlen); if (err < 0) goto out_fd; } /* File flags are not inherited via accept() unlike another OSes. */ return newfile; out_fd: fput(newfile); return ERR_PTR(err); } static int __sys_accept4_file(struct file *file, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags) { struct proto_accept_arg arg = { }; if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; return FD_ADD(flags, do_accept(file, &arg, upeer_sockaddr, upeer_addrlen, flags)); } /* * For accept, we attempt to create a new socket, set up the link * with the client, wake up the client, then return the new * connected fd. We collect the address of the connector in kernel * space and move it to user at the very end. This is unclean because * we open the socket then return an error. * * 1003.1g adds the ability to recvmsg() to query connection pending * status to recvmsg. We need to add that support in a way thats * clean when we restructure accept also. */ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags) { CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; return __sys_accept4_file(fd_file(f), upeer_sockaddr, upeer_addrlen, flags); } SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr, int __user *, upeer_addrlen, int, flags) { return __sys_accept4(fd, upeer_sockaddr, upeer_addrlen, flags); } SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr, int __user *, upeer_addrlen) { return __sys_accept4(fd, upeer_sockaddr, upeer_addrlen, 0); } /* * Attempt to connect to a socket with the server address. The address * is in user space so we verify it is OK and move it to kernel space. * * For 1003.1g we need to add clean support for a bind to AF_UNSPEC to * break bindings * * NOTE: 1003.1g draft 6.3 is broken with respect to AX.25/NetROM and * other SEQPACKET protocols that take time to connect() as it doesn't * include the -EINPROGRESS status for such sockets. 
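 * For a non-blocking socket the protocol's connect() typically returns
 * -EINPROGRESS; userspace is then expected to wait for writability and
 * collect the final result with getsockopt(SOL_SOCKET, SO_ERROR, ...).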
*/ int __sys_connect_file(struct file *file, struct sockaddr_storage *address, int addrlen, int file_flags) { struct socket *sock; int err; sock = sock_from_file(file); if (!sock) { err = -ENOTSOCK; goto out; } err = security_socket_connect(sock, (struct sockaddr *)address, addrlen); if (err) goto out; err = READ_ONCE(sock->ops)->connect(sock, (struct sockaddr_unsized *)address, addrlen, sock->file->f_flags | file_flags); out: return err; } int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen) { struct sockaddr_storage address; CLASS(fd, f)(fd); int ret; if (fd_empty(f)) return -EBADF; ret = move_addr_to_kernel(uservaddr, addrlen, &address); if (ret) return ret; return __sys_connect_file(fd_file(f), &address, addrlen, 0); } SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr, int, addrlen) { return __sys_connect(fd, uservaddr, addrlen); } int do_getsockname(struct socket *sock, int peer, struct sockaddr __user *usockaddr, int __user *usockaddr_len) { struct sockaddr_storage address; int err; if (peer) err = security_socket_getpeername(sock); else err = security_socket_getsockname(sock); if (err) return err; err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, peer); if (err < 0) return err; /* "err" is actually length in this case */ return move_addr_to_user(&address, err, usockaddr, usockaddr_len); } /* * Get the remote or local address ('name') of a socket object. Move the * obtained name to user space. */ int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, int __user *usockaddr_len, int peer) { struct socket *sock; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; sock = sock_from_file(fd_file(f)); if (unlikely(!sock)) return -ENOTSOCK; return do_getsockname(sock, peer, usockaddr, usockaddr_len); } SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr, int __user *, usockaddr_len) { return __sys_getsockname(fd, usockaddr, usockaddr_len, 0); } SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr, int __user *, usockaddr_len) { return __sys_getsockname(fd, usockaddr, usockaddr_len, 1); } /* * Send a datagram to a given address. We move the address into kernel * space and check the user space data area is readable before invoking * the protocol. */ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, struct sockaddr __user *addr, int addr_len) { struct socket *sock; struct sockaddr_storage address; int err; struct msghdr msg; err = import_ubuf(ITER_SOURCE, buff, len, &msg.msg_iter); if (unlikely(err)) return err; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; sock = sock_from_file(fd_file(f)); if (unlikely(!sock)) return -ENOTSOCK; msg.msg_name = NULL; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_namelen = 0; msg.msg_ubuf = NULL; if (addr) { err = move_addr_to_kernel(addr, addr_len, &address); if (err < 0) return err; msg.msg_name = (struct sockaddr *)&address; msg.msg_namelen = addr_len; } flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; msg.msg_flags = flags; return __sock_sendmsg(sock, &msg); } SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, unsigned int, flags, struct sockaddr __user *, addr, int, addr_len) { return __sys_sendto(fd, buff, len, flags, addr, addr_len); } /* * Send a datagram down a socket. 
*/ SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len, unsigned int, flags) { return __sys_sendto(fd, buff, len, flags, NULL, 0); } /* * Receive a frame from the socket and optionally record the address of the * sender. We verify the buffers are writable and if needed move the * sender address from kernel to user space. */ int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags, struct sockaddr __user *addr, int __user *addr_len) { struct sockaddr_storage address; struct msghdr msg = { /* Save some cycles and don't copy the address if not needed */ .msg_name = addr ? (struct sockaddr *)&address : NULL, }; struct socket *sock; int err, err2; err = import_ubuf(ITER_DEST, ubuf, size, &msg.msg_iter); if (unlikely(err)) return err; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; sock = sock_from_file(fd_file(f)); if (unlikely(!sock)) return -ENOTSOCK; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; err = sock_recvmsg(sock, &msg, flags); if (err >= 0 && addr != NULL) { err2 = move_addr_to_user(&address, msg.msg_namelen, addr, addr_len); if (err2 < 0) err = err2; } return err; } SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size, unsigned int, flags, struct sockaddr __user *, addr, int __user *, addr_len) { return __sys_recvfrom(fd, ubuf, size, flags, addr, addr_len); } /* * Receive a datagram from a socket. */ SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size, unsigned int, flags) { return __sys_recvfrom(fd, ubuf, size, flags, NULL, NULL); } static bool sock_use_custom_sol_socket(const struct socket *sock) { return test_bit(SOCK_CUSTOM_SOCKOPT, &sock->flags); } int do_sock_setsockopt(struct socket *sock, bool compat, int level, int optname, sockptr_t optval, int optlen) { const struct proto_ops *ops; char *kernel_optval = NULL; int err; if (optlen < 0) return -EINVAL; err = security_socket_setsockopt(sock, level, optname); if (err) goto out_put; if (!compat) err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, &optname, optval, &optlen, &kernel_optval); if (err < 0) goto out_put; if (err > 0) { err = 0; goto out_put; } if (kernel_optval) optval = KERNEL_SOCKPTR(kernel_optval); ops = READ_ONCE(sock->ops); if (level == SOL_SOCKET && !sock_use_custom_sol_socket(sock)) err = sock_setsockopt(sock, level, optname, optval, optlen); else if (unlikely(!ops->setsockopt)) err = -EOPNOTSUPP; else err = ops->setsockopt(sock, level, optname, optval, optlen); kfree(kernel_optval); out_put: return err; } EXPORT_SYMBOL(do_sock_setsockopt); /* Set a socket option. Because we don't know the option lengths we have * to pass the user mode parameter for the protocols to sort out. */ int __sys_setsockopt(int fd, int level, int optname, char __user *user_optval, int optlen) { sockptr_t optval = USER_SOCKPTR(user_optval); bool compat = in_compat_syscall(); struct socket *sock; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; sock = sock_from_file(fd_file(f)); if (unlikely(!sock)) return -ENOTSOCK; return do_sock_setsockopt(sock, compat, level, optname, optval, optlen); } SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, char __user *, optval, int, optlen) { return __sys_setsockopt(fd, level, optname, optval, optlen); } INDIRECT_CALLABLE_DECLARE(bool tcp_bpf_bypass_getsockopt(int level, int optname)); /* * Initialize a sockopt_t from sockptr optval/optlen, setting up iov_iter * for both input and output directions. 
* It is important to remember that both iov points to the same data, but, * .iter_in is read-only and .iter_out is write-only by the protocol callbacks */ static int sockptr_to_sockopt(sockopt_t *opt, sockptr_t optval, sockptr_t optlen, struct kvec *kvec) { int koptlen; if (copy_from_sockptr(&koptlen, optlen, sizeof(int))) return -EFAULT; if (koptlen < 0) return -EINVAL; if (optval.is_kernel) { kvec->iov_base = optval.kernel; kvec->iov_len = koptlen; iov_iter_kvec(&opt->iter_out, ITER_DEST, kvec, 1, koptlen); iov_iter_kvec(&opt->iter_in, ITER_SOURCE, kvec, 1, koptlen); } else { iov_iter_ubuf(&opt->iter_out, ITER_DEST, optval.user, koptlen); iov_iter_ubuf(&opt->iter_in, ITER_SOURCE, optval.user, koptlen); } opt->optlen = koptlen; return 0; } int do_sock_getsockopt(struct socket *sock, bool compat, int level, int optname, sockptr_t optval, sockptr_t optlen) { int max_optlen __maybe_unused = 0; const struct proto_ops *ops; struct kvec kvec; sockopt_t opt; int err; err = security_socket_getsockopt(sock, level, optname); if (err) return err; if (!compat) copy_from_sockptr(&max_optlen, optlen, sizeof(int)); ops = READ_ONCE(sock->ops); if (level == SOL_SOCKET) { err = sk_getsockopt(sock->sk, level, optname, optval, optlen); } else if (ops->getsockopt_iter) { err = sockptr_to_sockopt(&opt, optval, optlen, &kvec); if (err) return err; err = ops->getsockopt_iter(sock, level, optname, &opt); /* Always write back optlen, even on failure. Some protocols * (e.g. CAN raw) return -ERANGE and set optlen to the * required buffer size so userspace can discover it. */ if (copy_to_sockptr(optlen, &opt.optlen, sizeof(int))) return -EFAULT; } else if (ops->getsockopt) { if (WARN_ONCE(optval.is_kernel || optlen.is_kernel, "Invalid argument type")) return -EOPNOTSUPP; err = ops->getsockopt(sock, level, optname, optval.user, optlen.user); } else { err = -EOPNOTSUPP; } if (!compat) err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname, optval, optlen, max_optlen, err); return err; } EXPORT_SYMBOL(do_sock_getsockopt); /* * Get a socket option. Because we don't know the option lengths we have * to pass a user mode parameter for the protocols to sort out. */ int __sys_getsockopt(int fd, int level, int optname, char __user *optval, int __user *optlen) { struct socket *sock; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; sock = sock_from_file(fd_file(f)); if (unlikely(!sock)) return -ENOTSOCK; return do_sock_getsockopt(sock, in_compat_syscall(), level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); } SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, char __user *, optval, int __user *, optlen) { return __sys_getsockopt(fd, level, optname, optval, optlen); } /* * Shutdown a socket. */ int __sys_shutdown_sock(struct socket *sock, int how) { int err; err = security_socket_shutdown(sock, how); if (!err) err = READ_ONCE(sock->ops)->shutdown(sock, how); return err; } int __sys_shutdown(int fd, int how) { struct socket *sock; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; sock = sock_from_file(fd_file(f)); if (unlikely(!sock)) return -ENOTSOCK; return __sys_shutdown_sock(sock, how); } SYSCALL_DEFINE2(shutdown, int, fd, int, how) { return __sys_shutdown(fd, how); } /* A couple of helpful macros for getting the address of the 32/64 bit * fields which are the same type (int / unsigned) on our platforms. */ #define COMPAT_MSG(msg, member) ((MSG_CMSG_COMPAT & flags) ? 
&msg##_compat->member : &msg->member) #define COMPAT_NAMELEN(msg) COMPAT_MSG(msg, msg_namelen) #define COMPAT_FLAGS(msg) COMPAT_MSG(msg, msg_flags) struct used_address { struct sockaddr_storage name; unsigned int name_len; }; int __copy_msghdr(struct msghdr *kmsg, struct user_msghdr *msg, struct sockaddr __user **save_addr) { ssize_t err; kmsg->msg_control_is_user = true; kmsg->msg_get_inq = 0; kmsg->msg_control_user = msg->msg_control; kmsg->msg_controllen = msg->msg_controllen; kmsg->msg_flags = msg->msg_flags; kmsg->msg_namelen = msg->msg_namelen; if (!msg->msg_name) kmsg->msg_namelen = 0; if (kmsg->msg_namelen < 0) return -EINVAL; if (kmsg->msg_namelen > sizeof(struct sockaddr_storage)) kmsg->msg_namelen = sizeof(struct sockaddr_storage); if (save_addr) *save_addr = msg->msg_name; if (msg->msg_name && kmsg->msg_namelen) { if (!save_addr) { err = move_addr_to_kernel(msg->msg_name, kmsg->msg_namelen, kmsg->msg_name); if (err < 0) return err; } } else { kmsg->msg_name = NULL; kmsg->msg_namelen = 0; } if (msg->msg_iovlen > UIO_MAXIOV) return -EMSGSIZE; kmsg->msg_iocb = NULL; kmsg->msg_ubuf = NULL; return 0; } static int copy_msghdr_from_user(struct msghdr *kmsg, struct user_msghdr __user *umsg, struct sockaddr __user **save_addr, struct iovec **iov) { struct user_msghdr msg; ssize_t err; if (copy_from_user(&msg, umsg, sizeof(*umsg))) return -EFAULT; err = __copy_msghdr(kmsg, &msg, save_addr); if (err) return err; err = import_iovec(save_addr ? ITER_DEST : ITER_SOURCE, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV, iov, &kmsg->msg_iter); return err < 0 ? err : 0; } static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys, unsigned int flags, struct used_address *used_address, unsigned int allowed_msghdr_flags) { unsigned char ctl[sizeof(struct cmsghdr) + 20] __aligned(sizeof(__kernel_size_t)); /* 20 is size of ipv6_pktinfo */ unsigned char *ctl_buf = ctl; int ctl_len; ssize_t err; err = -ENOBUFS; if (msg_sys->msg_controllen > INT_MAX) goto out; flags |= (msg_sys->msg_flags & allowed_msghdr_flags); ctl_len = msg_sys->msg_controllen; if ((MSG_CMSG_COMPAT & flags) && ctl_len) { err = cmsghdr_from_user_compat_to_kern(msg_sys, sock->sk, ctl, sizeof(ctl)); if (err) goto out; ctl_buf = msg_sys->msg_control; ctl_len = msg_sys->msg_controllen; } else if (ctl_len) { BUILD_BUG_ON(sizeof(struct cmsghdr) != CMSG_ALIGN(sizeof(struct cmsghdr))); if (ctl_len > sizeof(ctl)) { ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL); if (ctl_buf == NULL) goto out; } err = -EFAULT; if (copy_from_user(ctl_buf, msg_sys->msg_control_user, ctl_len)) goto out_freectl; msg_sys->msg_control = ctl_buf; msg_sys->msg_control_is_user = false; } flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; msg_sys->msg_flags = flags; if (sock->file->f_flags & O_NONBLOCK) msg_sys->msg_flags |= MSG_DONTWAIT; /* * If this is sendmmsg() and current destination address is same as * previously succeeded address, omit asking LSM's decision. * used_address->name_len is initialized to UINT_MAX so that the first * destination address never matches. */ if (used_address && msg_sys->msg_name && used_address->name_len == msg_sys->msg_namelen && !memcmp(&used_address->name, msg_sys->msg_name, used_address->name_len)) { err = sock_sendmsg_nosec(sock, msg_sys); goto out_freectl; } err = __sock_sendmsg(sock, msg_sys); /* * If this is sendmmsg() and sending to current destination address was * successful, remember it. 
*/ if (used_address && err >= 0) { used_address->name_len = msg_sys->msg_namelen; if (msg_sys->msg_name) memcpy(&used_address->name, msg_sys->msg_name, used_address->name_len); } out_freectl: if (ctl_buf != ctl) sock_kfree_s(sock->sk, ctl_buf, ctl_len); out: return err; } static int sendmsg_copy_msghdr(struct msghdr *msg, struct user_msghdr __user *umsg, unsigned flags, struct iovec **iov) { int err; if (flags & MSG_CMSG_COMPAT) { struct compat_msghdr __user *msg_compat; msg_compat = (struct compat_msghdr __user *) umsg; err = get_compat_msghdr(msg, msg_compat, NULL, iov); } else { err = copy_msghdr_from_user(msg, umsg, NULL, iov); } if (err < 0) return err; return 0; } static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, struct msghdr *msg_sys, unsigned int flags, struct used_address *used_address, unsigned int allowed_msghdr_flags) { struct sockaddr_storage address; struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; ssize_t err; msg_sys->msg_name = &address; err = sendmsg_copy_msghdr(msg_sys, msg, flags, &iov); if (err < 0) return err; err = ____sys_sendmsg(sock, msg_sys, flags, used_address, allowed_msghdr_flags); kfree(iov); return err; } /* * BSD sendmsg interface */ long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg, unsigned int flags) { return ____sys_sendmsg(sock, msg, flags, NULL, 0); } long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, bool forbid_cmsg_compat) { struct msghdr msg_sys; struct socket *sock; if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT)) return -EINVAL; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; sock = sock_from_file(fd_file(f)); if (unlikely(!sock)) return -ENOTSOCK; return ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0); } SYSCALL_DEFINE3(sendmsg, int, fd, struct user_msghdr __user *, msg, unsigned int, flags) { return __sys_sendmsg(fd, msg, flags, true); } /* * Linux sendmmsg interface */ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, bool forbid_cmsg_compat) { int err, datagrams; struct socket *sock; struct mmsghdr __user *entry; struct compat_mmsghdr __user *compat_entry; struct msghdr msg_sys; struct used_address used_address; unsigned int oflags = flags; if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT)) return -EINVAL; if (vlen > UIO_MAXIOV) vlen = UIO_MAXIOV; datagrams = 0; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; sock = sock_from_file(fd_file(f)); if (unlikely(!sock)) return -ENOTSOCK; used_address.name_len = UINT_MAX; entry = mmsg; compat_entry = (struct compat_mmsghdr __user *)mmsg; err = 0; flags |= MSG_BATCH; while (datagrams < vlen) { if (datagrams == vlen - 1) flags = oflags; if (MSG_CMSG_COMPAT & flags) { err = ___sys_sendmsg(sock, (struct user_msghdr __user *)compat_entry, &msg_sys, flags, &used_address, MSG_EOR); if (err < 0) break; err = __put_user(err, &compat_entry->msg_len); ++compat_entry; } else { err = ___sys_sendmsg(sock, (struct user_msghdr __user *)entry, &msg_sys, flags, &used_address, MSG_EOR); if (err < 0) break; err = put_user(err, &entry->msg_len); ++entry; } if (err) break; ++datagrams; if (msg_data_left(&msg_sys)) break; cond_resched(); } /* We only return an error if no datagrams were able to be sent */ if (datagrams != 0) return datagrams; return err; } SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags) { return __sys_sendmmsg(fd, mmsg, vlen, flags, true); } static int recvmsg_copy_msghdr(struct msghdr *msg, struct user_msghdr 
__user *umsg, unsigned flags, struct sockaddr __user **uaddr, struct iovec **iov) { ssize_t err; if (MSG_CMSG_COMPAT & flags) { struct compat_msghdr __user *msg_compat; msg_compat = (struct compat_msghdr __user *) umsg; err = get_compat_msghdr(msg, msg_compat, uaddr, iov); } else { err = copy_msghdr_from_user(msg, umsg, uaddr, iov); } if (err < 0) return err; return 0; } static int ____sys_recvmsg(struct socket *sock, struct msghdr *msg_sys, struct user_msghdr __user *msg, struct sockaddr __user *uaddr, unsigned int flags, int nosec) { struct compat_msghdr __user *msg_compat = (struct compat_msghdr __user *) msg; int __user *uaddr_len = COMPAT_NAMELEN(msg); struct sockaddr_storage addr; unsigned long cmsg_ptr; int len; ssize_t err; msg_sys->msg_name = &addr; cmsg_ptr = (unsigned long)msg_sys->msg_control; msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT); /* We assume all kernel code knows the size of sockaddr_storage */ msg_sys->msg_namelen = 0; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; if (unlikely(nosec)) err = sock_recvmsg_nosec(sock, msg_sys, flags); else err = sock_recvmsg(sock, msg_sys, flags); if (err < 0) goto out; len = err; if (uaddr != NULL) { err = move_addr_to_user(&addr, msg_sys->msg_namelen, uaddr, uaddr_len); if (err < 0) goto out; } err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT), COMPAT_FLAGS(msg)); if (err) goto out; if (MSG_CMSG_COMPAT & flags) err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr, &msg_compat->msg_controllen); else err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr, &msg->msg_controllen); if (err) goto out; err = len; out: return err; } static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg, struct msghdr *msg_sys, unsigned int flags, int nosec) { struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; /* user mode address pointers */ struct sockaddr __user *uaddr; ssize_t err; err = recvmsg_copy_msghdr(msg_sys, msg, flags, &uaddr, &iov); if (err < 0) return err; err = ____sys_recvmsg(sock, msg_sys, msg, uaddr, flags, nosec); kfree(iov); return err; } /* * BSD recvmsg interface */ long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg, struct user_msghdr __user *umsg, struct sockaddr __user *uaddr, unsigned int flags) { return ____sys_recvmsg(sock, msg, umsg, uaddr, flags, 0); } long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, bool forbid_cmsg_compat) { struct msghdr msg_sys; struct socket *sock; if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT)) return -EINVAL; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; sock = sock_from_file(fd_file(f)); if (unlikely(!sock)) return -ENOTSOCK; return ___sys_recvmsg(sock, msg, &msg_sys, flags, 0); } SYSCALL_DEFINE3(recvmsg, int, fd, struct user_msghdr __user *, msg, unsigned int, flags) { return __sys_recvmsg(fd, msg, flags, true); } /* * Linux recvmmsg interface */ static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, struct timespec64 *timeout) { int err = 0, datagrams; struct socket *sock; struct mmsghdr __user *entry; struct compat_mmsghdr __user *compat_entry; struct msghdr msg_sys; struct timespec64 end_time; struct timespec64 timeout64; if (timeout && poll_select_set_timeout(&end_time, timeout->tv_sec, timeout->tv_nsec)) return -EINVAL; datagrams = 0; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; sock = sock_from_file(fd_file(f)); if (unlikely(!sock)) return -ENOTSOCK; if (likely(!(flags & MSG_ERRQUEUE))) { err = 
sock_error(sock->sk); if (err) return err; } entry = mmsg; compat_entry = (struct compat_mmsghdr __user *)mmsg; while (datagrams < vlen) { /* * No need to ask LSM for more than the first datagram. */ if (MSG_CMSG_COMPAT & flags) { err = ___sys_recvmsg(sock, (struct user_msghdr __user *)compat_entry, &msg_sys, flags & ~MSG_WAITFORONE, datagrams); if (err < 0) break; err = __put_user(err, &compat_entry->msg_len); ++compat_entry; } else { err = ___sys_recvmsg(sock, (struct user_msghdr __user *)entry, &msg_sys, flags & ~MSG_WAITFORONE, datagrams); if (err < 0) break; err = put_user(err, &entry->msg_len); ++entry; } if (err) break; ++datagrams; /* MSG_WAITFORONE turns on MSG_DONTWAIT after one packet */ if (flags & MSG_WAITFORONE) flags |= MSG_DONTWAIT; if (timeout) { ktime_get_ts64(&timeout64); *timeout = timespec64_sub(end_time, timeout64); if (timeout->tv_sec < 0) { timeout->tv_sec = timeout->tv_nsec = 0; break; } /* Timeout, return less than vlen datagrams */ if (timeout->tv_nsec == 0 && timeout->tv_sec == 0) break; } /* Out of band data, return right away */ if (msg_sys.msg_flags & MSG_OOB) break; cond_resched(); } if (err == 0) return datagrams; if (datagrams == 0) return err; /* * We may return less entries than requested (vlen) if the * sock is non block and there aren't enough datagrams... */ if (err != -EAGAIN) { /* * ... or if recvmsg returns an error after we * received some datagrams, where we record the * error to return on the next call or if the * app asks about it using getsockopt(SO_ERROR). */ WRITE_ONCE(sock->sk->sk_err, -err); } return datagrams; } int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, struct __kernel_timespec __user *timeout, struct old_timespec32 __user *timeout32) { int datagrams; struct timespec64 timeout_sys; if (timeout && get_timespec64(&timeout_sys, timeout)) return -EFAULT; if (timeout32 && get_old_timespec32(&timeout_sys, timeout32)) return -EFAULT; if (!timeout && !timeout32) return do_recvmmsg(fd, mmsg, vlen, flags, NULL); datagrams = do_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys); if (datagrams <= 0) return datagrams; if (timeout && put_timespec64(&timeout_sys, timeout)) datagrams = -EFAULT; if (timeout32 && put_old_timespec32(&timeout_sys, timeout32)) datagrams = -EFAULT; return datagrams; } SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags, struct __kernel_timespec __user *, timeout) { if (flags & MSG_CMSG_COMPAT) return -EINVAL; return __sys_recvmmsg(fd, mmsg, vlen, flags, timeout, NULL); } #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE5(recvmmsg_time32, int, fd, struct mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags, struct old_timespec32 __user *, timeout) { if (flags & MSG_CMSG_COMPAT) return -EINVAL; return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL, timeout); } #endif #ifdef __ARCH_WANT_SYS_SOCKETCALL /* Argument list sizes for sys_socketcall */ #define AL(x) ((x) * sizeof(unsigned long)) static const unsigned char nargs[21] = { AL(0), AL(3), AL(3), AL(3), AL(2), AL(3), AL(3), AL(3), AL(4), AL(4), AL(4), AL(6), AL(6), AL(2), AL(5), AL(5), AL(3), AL(3), AL(4), AL(5), AL(4) }; #undef AL /* * System call vectors. * * Argument checking cleaned up. Saved 20% in size. * This function doesn't need to set the kernel lock because * it is set by the callees. 
*/ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) { unsigned long a[AUDITSC_ARGS]; unsigned long a0, a1; int err; unsigned int len; if (call < 1 || call > SYS_SENDMMSG) return -EINVAL; call = array_index_nospec(call, SYS_SENDMMSG + 1); len = nargs[call]; if (len > sizeof(a)) return -EINVAL; /* copy_from_user should be SMP safe. */ if (copy_from_user(a, args, len)) return -EFAULT; err = audit_socketcall(nargs[call] / sizeof(unsigned long), a); if (err) return err; a0 = a[0]; a1 = a[1]; switch (call) { case SYS_SOCKET: err = __sys_socket(a0, a1, a[2]); break; case SYS_BIND: err = __sys_bind(a0, (struct sockaddr __user *)a1, a[2]); break; case SYS_CONNECT: err = __sys_connect(a0, (struct sockaddr __user *)a1, a[2]); break; case SYS_LISTEN: err = __sys_listen(a0, a1); break; case SYS_ACCEPT: err = __sys_accept4(a0, (struct sockaddr __user *)a1, (int __user *)a[2], 0); break; case SYS_GETSOCKNAME: err = __sys_getsockname(a0, (struct sockaddr __user *)a1, (int __user *)a[2], 0); break; case SYS_GETPEERNAME: err = __sys_getsockname(a0, (struct sockaddr __user *)a1, (int __user *)a[2], 1); break; case SYS_SOCKETPAIR: err = __sys_socketpair(a0, a1, a[2], (int __user *)a[3]); break; case SYS_SEND: err = __sys_sendto(a0, (void __user *)a1, a[2], a[3], NULL, 0); break; case SYS_SENDTO: err = __sys_sendto(a0, (void __user *)a1, a[2], a[3], (struct sockaddr __user *)a[4], a[5]); break; case SYS_RECV: err = __sys_recvfrom(a0, (void __user *)a1, a[2], a[3], NULL, NULL); break; case SYS_RECVFROM: err = __sys_recvfrom(a0, (void __user *)a1, a[2], a[3], (struct sockaddr __user *)a[4], (int __user *)a[5]); break; case SYS_SHUTDOWN: err = __sys_shutdown(a0, a1); break; case SYS_SETSOCKOPT: err = __sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]); break; case SYS_GETSOCKOPT: err = __sys_getsockopt(a0, a1, a[2], (char __user *)a[3], (int __user *)a[4]); break; case SYS_SENDMSG: err = __sys_sendmsg(a0, (struct user_msghdr __user *)a1, a[2], true); break; case SYS_SENDMMSG: err = __sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3], true); break; case SYS_RECVMSG: err = __sys_recvmsg(a0, (struct user_msghdr __user *)a1, a[2], true); break; case SYS_RECVMMSG: if (IS_ENABLED(CONFIG_64BIT)) err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3], (struct __kernel_timespec __user *)a[4], NULL); else err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3], NULL, (struct old_timespec32 __user *)a[4]); break; case SYS_ACCEPT4: err = __sys_accept4(a0, (struct sockaddr __user *)a1, (int __user *)a[2], a[3]); break; default: err = -EINVAL; break; } return err; } #endif /* __ARCH_WANT_SYS_SOCKETCALL */ /** * sock_register - add a socket protocol handler * @ops: description of protocol * * This function is called by a protocol handler that wants to * advertise its address family, and have it linked into the * socket interface. The value ops->family corresponds to the * socket system call protocol family. 
*/ int sock_register(const struct net_proto_family *ops) { int err; if (ops->family >= NPROTO) { pr_crit("protocol %d >= NPROTO(%d)\n", ops->family, NPROTO); return -ENOBUFS; } spin_lock(&net_family_lock); if (rcu_dereference_protected(net_families[ops->family], lockdep_is_held(&net_family_lock))) err = -EEXIST; else { rcu_assign_pointer(net_families[ops->family], ops); err = 0; } spin_unlock(&net_family_lock); pr_info("NET: Registered %s protocol family\n", pf_family_names[ops->family]); return err; } EXPORT_SYMBOL(sock_register); /** * sock_unregister - remove a protocol handler * @family: protocol family to remove * * This function is called by a protocol handler that wants to * remove its address family, and have it unlinked from the * new socket creation. * * If protocol handler is a module, then it can use module reference * counts to protect against new references. If protocol handler is not * a module then it needs to provide its own protection in * the ops->create routine. */ void sock_unregister(int family) { BUG_ON(family < 0 || family >= NPROTO); spin_lock(&net_family_lock); RCU_INIT_POINTER(net_families[family], NULL); spin_unlock(&net_family_lock); synchronize_rcu(); pr_info("NET: Unregistered %s protocol family\n", pf_family_names[family]); } EXPORT_SYMBOL(sock_unregister); bool sock_is_registered(int family) { return family < NPROTO && rcu_access_pointer(net_families[family]); } static int __init sock_init(void) { int err; /* * Initialize the network sysctl infrastructure. */ err = net_sysctl_init(); if (err) goto out; /* * Initialize skbuff SLAB cache */ skb_init(); /* * Initialize the protocols module. */ init_inodecache(); err = register_filesystem(&sock_fs_type); if (err) goto out; sock_mnt = kern_mount(&sock_fs_type); if (IS_ERR(sock_mnt)) { err = PTR_ERR(sock_mnt); goto out_mount; } /* The real protocol initialization is performed in later initcalls. */ #ifdef CONFIG_NETFILTER err = netfilter_init(); if (err) goto out; #endif ptp_classifier_init(); out: return err; out_mount: unregister_filesystem(&sock_fs_type); goto out; } core_initcall(sock_init); /* early initcall */ #ifdef CONFIG_PROC_FS void socket_seq_show(struct seq_file *seq) { seq_printf(seq, "sockets: used %d\n", sock_inuse_get(seq->private)); } #endif /* CONFIG_PROC_FS */ /* Handle the fact that while struct ifreq has the same *layout* on * 32/64 for everything but ifreq::ifru_ifmap and ifreq::ifru_data, * which are handled elsewhere, it still has different *size* due to * ifreq::ifru_ifmap (which is 16 bytes on 32 bit, 24 bytes on 64-bit, * resulting in struct ifreq being 32 and 40 bytes respectively). * As a result, if the struct happens to be at the end of a page and * the next page isn't readable/writable, we get a fault. To prevent * that, copy back and forth to the full size. 
*/ int get_user_ifreq(struct ifreq *ifr, void __user **ifrdata, void __user *arg) { if (in_compat_syscall()) { struct compat_ifreq *ifr32 = (struct compat_ifreq *)ifr; memset(ifr, 0, sizeof(*ifr)); if (copy_from_user(ifr32, arg, sizeof(*ifr32))) return -EFAULT; if (ifrdata) *ifrdata = compat_ptr(ifr32->ifr_data); return 0; } if (copy_from_user(ifr, arg, sizeof(*ifr))) return -EFAULT; if (ifrdata) *ifrdata = ifr->ifr_data; return 0; } EXPORT_SYMBOL(get_user_ifreq); int put_user_ifreq(struct ifreq *ifr, void __user *arg) { size_t size = sizeof(*ifr); if (in_compat_syscall()) size = sizeof(struct compat_ifreq); if (copy_to_user(arg, ifr, size)) return -EFAULT; return 0; } EXPORT_SYMBOL(put_user_ifreq); #ifdef CONFIG_COMPAT static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32) { compat_uptr_t uptr32; struct ifreq ifr; void __user *saved; int err; if (get_user_ifreq(&ifr, NULL, uifr32)) return -EFAULT; if (get_user(uptr32, &uifr32->ifr_settings.ifs_ifsu)) return -EFAULT; saved = ifr.ifr_settings.ifs_ifsu.raw_hdlc; ifr.ifr_settings.ifs_ifsu.raw_hdlc = compat_ptr(uptr32); err = dev_ioctl(net, SIOCWANDEV, &ifr, NULL, NULL); if (!err) { ifr.ifr_settings.ifs_ifsu.raw_hdlc = saved; if (put_user_ifreq(&ifr, uifr32)) err = -EFAULT; } return err; } /* Handle ioctls that use ifreq::ifr_data and just need struct ifreq converted */ static int compat_ifr_data_ioctl(struct net *net, unsigned int cmd, struct compat_ifreq __user *u_ifreq32) { struct ifreq ifreq; void __user *data; if (!is_socket_ioctl_cmd(cmd)) return -ENOTTY; if (get_user_ifreq(&ifreq, &data, u_ifreq32)) return -EFAULT; ifreq.ifr_data = data; return dev_ioctl(net, cmd, &ifreq, data, NULL); } static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = compat_ptr(arg); struct sock *sk = sock->sk; struct net *net = sock_net(sk); const struct proto_ops *ops; if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) return sock_ioctl(file, cmd, (unsigned long)argp); switch (cmd) { case SIOCWANDEV: return compat_siocwandev(net, argp); case SIOCGSTAMP_OLD: case SIOCGSTAMPNS_OLD: ops = READ_ONCE(sock->ops); if (!ops->gettstamp) return -ENOIOCTLCMD; return ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD, !COMPAT_USE_64BIT_TIME); case SIOCETHTOOL: case SIOCBONDSLAVEINFOQUERY: case SIOCBONDINFOQUERY: case SIOCSHWTSTAMP: case SIOCGHWTSTAMP: return compat_ifr_data_ioctl(net, cmd, argp); case FIOSETOWN: case SIOCSPGRP: case FIOGETOWN: case SIOCGPGRP: case SIOCBRADDBR: case SIOCBRDELBR: case SIOCBRADDIF: case SIOCBRDELIF: case SIOCGIFVLAN: case SIOCSIFVLAN: case SIOCGSKNS: case SIOCGSTAMP_NEW: case SIOCGSTAMPNS_NEW: case SIOCGIFCONF: case SIOCSIFBR: case SIOCGIFBR: return sock_ioctl(file, cmd, arg); case SIOCGIFFLAGS: case SIOCSIFFLAGS: case SIOCGIFMAP: case SIOCSIFMAP: case SIOCGIFMETRIC: case SIOCSIFMETRIC: case SIOCGIFMTU: case SIOCSIFMTU: case SIOCGIFMEM: case SIOCSIFMEM: case SIOCGIFHWADDR: case SIOCSIFHWADDR: case SIOCADDMULTI: case SIOCDELMULTI: case SIOCGIFINDEX: case SIOCGIFADDR: case SIOCSIFADDR: case SIOCSIFHWBROADCAST: case SIOCDIFADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCSIFPFLAGS: case SIOCGIFPFLAGS: case SIOCGIFTXQLEN: case SIOCSIFTXQLEN: case SIOCGIFNAME: case SIOCSIFNAME: case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSMIIREG: case SIOCBONDENSLAVE: case SIOCBONDRELEASE: case SIOCBONDSETHWADDR: case SIOCBONDCHANGEACTIVE: case SIOCSARP: case 
SIOCGARP: case SIOCDARP: case SIOCOUTQ: case SIOCOUTQNSD: case SIOCATMARK: return sock_do_ioctl(net, sock, cmd, arg); } return -ENOIOCTLCMD; } static long compat_sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct socket *sock = file->private_data; const struct proto_ops *ops = READ_ONCE(sock->ops); int ret = -ENOIOCTLCMD; struct sock *sk; struct net *net; sk = sock->sk; net = sock_net(sk); if (ops->compat_ioctl) ret = ops->compat_ioctl(sock, cmd, arg); if (ret == -ENOIOCTLCMD && (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)) ret = compat_wext_handle_ioctl(net, cmd, arg); if (ret == -ENOIOCTLCMD) ret = compat_sock_ioctl_trans(file, sock, cmd, arg); return ret; } #endif /** * kernel_bind - bind an address to a socket (kernel space) * @sock: socket * @addr: address * @addrlen: length of address * * Returns 0 or an error. */ int kernel_bind(struct socket *sock, struct sockaddr_unsized *addr, int addrlen) { struct sockaddr_storage address; memcpy(&address, addr, addrlen); return READ_ONCE(sock->ops)->bind(sock, (struct sockaddr_unsized *)&address, addrlen); } EXPORT_SYMBOL(kernel_bind); /** * kernel_listen - move socket to listening state (kernel space) * @sock: socket * @backlog: pending connections queue size * * Returns 0 or an error. */ int kernel_listen(struct socket *sock, int backlog) { return READ_ONCE(sock->ops)->listen(sock, backlog); } EXPORT_SYMBOL(kernel_listen); /** * kernel_accept - accept a connection (kernel space) * @sock: listening socket * @newsock: new connected socket * @flags: flags * * @flags must be SOCK_CLOEXEC, SOCK_NONBLOCK or 0. * If it fails, @newsock is guaranteed to be %NULL. * Returns 0 or an error. */ int kernel_accept(struct socket *sock, struct socket **newsock, int flags) { struct sock *sk = sock->sk; const struct proto_ops *ops = READ_ONCE(sock->ops); struct proto_accept_arg arg = { .flags = flags, .kern = true, }; int err; err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol, newsock); if (err < 0) goto done; err = ops->accept(sock, *newsock, &arg); if (err < 0) { sock_release(*newsock); *newsock = NULL; goto done; } (*newsock)->ops = ops; __module_get(ops->owner); done: return err; } EXPORT_SYMBOL(kernel_accept); /** * kernel_connect - connect a socket (kernel space) * @sock: socket * @addr: address * @addrlen: address length * @flags: flags (O_NONBLOCK, ...) * * For datagram sockets, @addr is the address to which datagrams are sent * by default, and the only address from which datagrams are received. * For stream sockets, attempts to connect to @addr. * Returns 0 or an error code. */ int kernel_connect(struct socket *sock, struct sockaddr_unsized *addr, int addrlen, int flags) { struct sockaddr_storage address; memcpy(&address, addr, addrlen); return READ_ONCE(sock->ops)->connect(sock, (struct sockaddr_unsized *)&address, addrlen, flags); } EXPORT_SYMBOL(kernel_connect); /** * kernel_getsockname - get the address which the socket is bound (kernel space) * @sock: socket * @addr: address holder * * Fills the @addr pointer with the address which the socket is bound. * Returns the length of the address in bytes or an error code. */ int kernel_getsockname(struct socket *sock, struct sockaddr *addr) { return READ_ONCE(sock->ops)->getname(sock, addr, 0); } EXPORT_SYMBOL(kernel_getsockname); /** * kernel_getpeername - get the address which the socket is connected (kernel space) * @sock: socket * @addr: address holder * * Fills the @addr pointer with the address which the socket is connected. 
 * Returns the length of the address in bytes or an error code.
 */
int kernel_getpeername(struct socket *sock, struct sockaddr *addr)
{
	return READ_ONCE(sock->ops)->getname(sock, addr, 1);
}
EXPORT_SYMBOL(kernel_getpeername);

/**
 * kernel_sock_shutdown - shut down part of a full-duplex connection (kernel space)
 * @sock: socket
 * @how: connection part
 *
 * Returns 0 or an error.
 */
int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how)
{
	return READ_ONCE(sock->ops)->shutdown(sock, how);
}
EXPORT_SYMBOL(kernel_sock_shutdown);

/**
 * kernel_sock_ip_overhead - returns the IP overhead imposed by a socket
 * @sk: socket
 *
 * This routine returns the IP overhead imposed by a socket i.e.
 * the length of the underlying IP header, depending on whether
 * this is an IPv4 or IPv6 socket and the length from IP options turned
 * on at the socket. Assumes that the caller has a lock on the socket.
 */
u32 kernel_sock_ip_overhead(struct sock *sk)
{
	struct inet_sock *inet;
	struct ip_options_rcu *opt;
	u32 overhead = 0;
#if IS_ENABLED(CONFIG_IPV6)
	struct ipv6_pinfo *np;
	struct ipv6_txoptions *optv6 = NULL;
#endif /* IS_ENABLED(CONFIG_IPV6) */

	if (!sk)
		return overhead;

	switch (sk->sk_family) {
	case AF_INET:
		inet = inet_sk(sk);
		overhead += sizeof(struct iphdr);
		opt = rcu_dereference_protected(inet->inet_opt,
						sock_owned_by_user(sk));
		if (opt)
			overhead += opt->opt.optlen;
		return overhead;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		np = inet6_sk(sk);
		overhead += sizeof(struct ipv6hdr);
		if (np)
			optv6 = rcu_dereference_protected(np->opt,
							  sock_owned_by_user(sk));
		if (optv6)
			overhead += (optv6->opt_flen + optv6->opt_nflen);
		return overhead;
#endif /* IS_ENABLED(CONFIG_IPV6) */
	default: /* Returns 0 overhead if the socket is not ipv4 or ipv6 */
		return overhead;
	}
}
EXPORT_SYMBOL(kernel_sock_ip_overhead);
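/*
 * Illustrative sketch, not part of net/socket.c: how an in-kernel user
 * might drive the kernel_* helpers above to send a single UDP datagram.
 * It assumes sock_create_kern() and kernel_sendmsg() as declared in
 * <linux/net.h> (plus <linux/in.h> for sockaddr_in); the function name
 * and the abbreviated error handling are made up for the example.
 */
static int example_kernel_udp_send(struct net *net, struct sockaddr_in *daddr,
				   void *buf, size_t len)
{
	struct socket *sock;
	struct msghdr msg = { };
	struct kvec vec = { .iov_base = buf, .iov_len = len };
	int err;

	err = sock_create_kern(net, AF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
	if (err < 0)
		return err;

	/* connect first so kernel_sendmsg() needs no destination address */
	err = kernel_connect(sock, (struct sockaddr_unsized *)daddr,
			     sizeof(*daddr), 0);
	if (err == 0)
		err = kernel_sendmsg(sock, &msg, &vec, 1, len);

	sock_release(sock);
	return err;
}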
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef LINUX_MM_INLINE_H
#define LINUX_MM_INLINE_H

#include <linux/atomic.h>
#include <linux/huge_mm.h>
#include <linux/mm_types.h>
#include <linux/swap.h>
#include <linux/string.h>
#include <linux/userfaultfd_k.h>
#include <linux/leafops.h>

/**
 * folio_is_file_lru - Should the folio be on a file LRU or anon LRU?
 * @folio: The folio to test.
 *
 * We would like to get this info without a page flag, but the state
 * needs to survive until the folio is last deleted from the LRU, which
 * could be as far down as __page_cache_release.
 *
 * Return: An integer (not a boolean!) used to sort a folio onto the
 * right LRU list and to account folios correctly.
 * 1 if @folio is a regular filesystem backed page cache folio
 * or a lazily freed anonymous folio (e.g. via MADV_FREE).
* 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise * ram or swap backed folio. */ static inline int folio_is_file_lru(const struct folio *folio) { return !folio_test_swapbacked(folio); } static __always_inline void __update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, long nr_pages) { struct pglist_data *pgdat = lruvec_pgdat(lruvec); lockdep_assert_held(&lruvec->lru_lock); WARN_ON_ONCE(nr_pages != (int)nr_pages); mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); } static __always_inline void update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, long nr_pages) { __update_lru_size(lruvec, lru, zid, nr_pages); #ifdef CONFIG_MEMCG mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); #endif } /** * __folio_clear_lru_flags - Clear page lru flags before releasing a page. * @folio: The folio that was on lru and now has a zero reference. */ static __always_inline void __folio_clear_lru_flags(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_lru(folio), folio); __folio_clear_lru(folio); /* this shouldn't happen, so leave the flags to bad_page() */ if (folio_test_active(folio) && folio_test_unevictable(folio)) return; __folio_clear_active(folio); __folio_clear_unevictable(folio); } /** * folio_lru_list - Which LRU list should a folio be on? * @folio: The folio to test. * * Return: The LRU list a folio should be on, as an index * into the array of LRU lists. */ static __always_inline enum lru_list folio_lru_list(const struct folio *folio) { enum lru_list lru; VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); if (folio_test_unevictable(folio)) return LRU_UNEVICTABLE; lru = folio_is_file_lru(folio) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON; if (folio_test_active(folio)) lru += LRU_ACTIVE; return lru; } #ifdef CONFIG_LRU_GEN static inline bool lru_gen_switching(void) { DECLARE_STATIC_KEY_FALSE(lru_switch); return static_branch_unlikely(&lru_switch); } #ifdef CONFIG_LRU_GEN_ENABLED static inline bool lru_gen_enabled(void) { DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]); return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]); } #else static inline bool lru_gen_enabled(void) { DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]); return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]); } #endif static inline bool lru_gen_in_fault(void) { return current->in_lru_fault; } static inline int lru_gen_from_seq(unsigned long seq) { return seq % MAX_NR_GENS; } static inline int lru_hist_from_seq(unsigned long seq) { return seq % NR_HIST_GENS; } static inline int lru_tier_from_refs(int refs, bool workingset) { VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH)); /* see the comment on MAX_NR_TIERS */ return workingset ? MAX_NR_TIERS - 1 : order_base_2(refs); } static inline int folio_lru_refs(const struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags.f); if (!(flags & BIT(PG_referenced))) return 0; /* * Return the total number of accesses including PG_referenced. Also see * the comment on LRU_REFS_FLAGS. 
*/ return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1; } static inline int folio_lru_gen(const struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags.f); return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } static inline bool lru_gen_is_active(const struct lruvec *lruvec, int gen) { unsigned long max_seq = lruvec->lrugen.max_seq; VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); /* see the comment on MIN_NR_GENS */ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); } static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *folio, int old_gen, int new_gen) { int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); enum lru_list lru = type * LRU_INACTIVE_FILE; struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1); if (old_gen >= 0) WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone], lrugen->nr_pages[old_gen][type][zone] - delta); if (new_gen >= 0) WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone], lrugen->nr_pages[new_gen][type][zone] + delta); /* addition */ if (old_gen < 0) { if (lru_gen_is_active(lruvec, new_gen)) lru += LRU_ACTIVE; __update_lru_size(lruvec, lru, zone, delta); return; } /* deletion */ if (new_gen < 0) { if (lru_gen_is_active(lruvec, old_gen)) lru += LRU_ACTIVE; __update_lru_size(lruvec, lru, zone, -delta); return; } /* promotion */ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { __update_lru_size(lruvec, lru, zone, -delta); __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); } /* demotion requires isolation, e.g., lru_deactivate_fn() */ VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); } static inline unsigned long lru_gen_folio_seq(const struct lruvec *lruvec, const struct folio *folio, bool reclaiming) { int gen; int type = folio_is_file_lru(folio); const struct lru_gen_folio *lrugen = &lruvec->lrugen; /* * +-----------------------------------+-----------------------------------+ * | Accessed through page tables and | Accessed through file descriptors | * | promoted by folio_update_gen() | and protected by folio_inc_gen() | * +-----------------------------------+-----------------------------------+ * | PG_active (set while isolated) | | * +-----------------+-----------------+-----------------+-----------------+ * | PG_workingset | PG_referenced | PG_workingset | LRU_REFS_FLAGS | * +-----------------------------------+-----------------------------------+ * |<---------- MIN_NR_GENS ---------->| | * |<---------------------------- MAX_NR_GENS ---------------------------->| */ if (folio_test_active(folio)) gen = MIN_NR_GENS - folio_test_workingset(folio); else if (reclaiming) gen = MAX_NR_GENS; else if ((!folio_is_file_lru(folio) && !folio_test_swapcache(folio)) || (folio_test_reclaim(folio) && (folio_test_dirty(folio) || folio_test_writeback(folio)))) gen = MIN_NR_GENS; else gen = MAX_NR_GENS - folio_test_workingset(folio); return max(READ_ONCE(lrugen->max_seq) - gen + 1, READ_ONCE(lrugen->min_seq[type])); } static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { unsigned long seq; unsigned long flags; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); if 
(folio_test_unevictable(folio) || !lrugen->enabled) return false; seq = lru_gen_folio_seq(lruvec, folio, reclaiming); gen = lru_gen_from_seq(seq); flags = (gen + 1UL) << LRU_GEN_PGOFF; /* see the comment on MIN_NR_GENS about PG_active */ set_mask_bits(&folio->flags.f, LRU_GEN_MASK | BIT(PG_active), flags); lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ if (reclaiming) list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]); else list_add(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { unsigned long flags; int gen = folio_lru_gen(folio); if (gen < 0) return false; VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); /* for folio_migrate_flags() */ flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0; flags = set_mask_bits(&folio->flags.f, LRU_GEN_MASK, flags); gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; lru_gen_update_size(lruvec, folio, gen, -1); list_del(&folio->lru); return true; } static inline void folio_migrate_refs(struct folio *new, const struct folio *old) { unsigned long refs = READ_ONCE(old->flags.f) & LRU_REFS_MASK; set_mask_bits(&new->flags.f, LRU_REFS_MASK, refs); } #else /* !CONFIG_LRU_GEN */ static inline bool lru_gen_enabled(void) { return false; } static inline bool lru_gen_switching(void) { return false; } static inline bool lru_gen_in_fault(void) { return false; } static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { return false; } static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { return false; } static inline void folio_migrate_refs(struct folio *new, const struct folio *old) { } #endif /* CONFIG_LRU_GEN */ static __always_inline void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); if (lru_gen_add_folio(lruvec, folio, false)) return; update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); if (lru != LRU_UNEVICTABLE) list_add(&folio->lru, &lruvec->lists[lru]); } static __always_inline void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); if (lru_gen_add_folio(lruvec, folio, true)) return; update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); /* This is not expected to be used on LRU_UNEVICTABLE */ list_add_tail(&folio->lru, &lruvec->lists[lru]); } static __always_inline void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); if (lru_gen_del_folio(lruvec, folio, false)) return; if (lru != LRU_UNEVICTABLE) list_del(&folio->lru); update_lru_size(lruvec, lru, folio_zonenum(folio), -folio_nr_pages(folio)); } #ifdef CONFIG_ANON_VMA_NAME /* mmap_lock should be read-locked */ static inline void anon_vma_name_get(struct anon_vma_name *anon_name) { if (anon_name) kref_get(&anon_name->kref); } static inline void anon_vma_name_put(struct anon_vma_name *anon_name) { if (anon_name) kref_put(&anon_name->kref, anon_vma_name_free); } static inline struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name) { /* Prevent 
anon_name refcount saturation early on */ if (kref_read(&anon_name->kref) < REFCOUNT_MAX) { anon_vma_name_get(anon_name); return anon_name; } return anon_vma_name_alloc(anon_name->name); } static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, struct vm_area_struct *new_vma) { struct anon_vma_name *anon_name = anon_vma_name(orig_vma); if (anon_name) new_vma->anon_name = anon_vma_name_reuse(anon_name); } static inline void free_anon_vma_name(struct vm_area_struct *vma) { /* * Not using anon_vma_name because it generates a warning if mmap_lock * is not held, which might be the case here. */ anon_vma_name_put(vma->anon_name); } static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, struct anon_vma_name *anon_name2) { if (anon_name1 == anon_name2) return true; return anon_name1 && anon_name2 && !strcmp(anon_name1->name, anon_name2->name); } #else /* CONFIG_ANON_VMA_NAME */ static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {} static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {} static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, struct vm_area_struct *new_vma) {} static inline void free_anon_vma_name(struct vm_area_struct *vma) {} static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, struct anon_vma_name *anon_name2) { return true; } #endif /* CONFIG_ANON_VMA_NAME */ void pfnmap_track_ctx_release(struct kref *ref); static inline void init_tlb_flush_pending(struct mm_struct *mm) { atomic_set(&mm->tlb_flush_pending, 0); } static inline void inc_tlb_flush_pending(struct mm_struct *mm) { atomic_inc(&mm->tlb_flush_pending); /* * The only time this value is relevant is when there are indeed pages * to flush. And we'll only flush pages after changing them, which * requires the PTL. * * So the ordering here is: * * atomic_inc(&mm->tlb_flush_pending); * spin_lock(&ptl); * ... * set_pte_at(); * spin_unlock(&ptl); * * spin_lock(&ptl) * mm_tlb_flush_pending(); * .... * spin_unlock(&ptl); * * flush_tlb_range(); * atomic_dec(&mm->tlb_flush_pending); * * Where the increment if constrained by the PTL unlock, it thus * ensures that the increment is visible if the PTE modification is * visible. After all, if there is no PTE modification, nobody cares * about TLB flushes either. * * This very much relies on users (mm_tlb_flush_pending() and * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc * locks (PPC) the unlock of one doesn't order against the lock of * another PTL. * * The decrement is ordered by the flush_tlb_range(), such that * mm_tlb_flush_pending() will not return false unless all flushes have * completed. */ } static inline void dec_tlb_flush_pending(struct mm_struct *mm) { /* * See inc_tlb_flush_pending(). * * This cannot be smp_mb__before_atomic() because smp_mb() simply does * not order against TLB invalidate completion, which is what we need. * * Therefore we must rely on tlb_flush_*() to guarantee order. */ atomic_dec(&mm->tlb_flush_pending); } static inline bool mm_tlb_flush_pending(const struct mm_struct *mm) { /* * Must be called after having acquired the PTL; orders against that * PTLs release and therefore ensures that if we observe the modified * PTE we must also observe the increment from inc_tlb_flush_pending(). * * That is, it only guarantees to return true if there is a flush * pending for _this_ PTL. 
*/ return atomic_read(&mm->tlb_flush_pending); } static inline bool mm_tlb_flush_nested(const struct mm_struct *mm) { /* * Similar to mm_tlb_flush_pending(), we must have acquired the PTL * for which there is a TLB flush pending in order to guarantee * we've seen both that PTE modification and the increment. * * (no requirement on actually still holding the PTL, that is irrelevant) */ return atomic_read(&mm->tlb_flush_pending) > 1; } #ifdef CONFIG_MMU /* * Computes the pte marker to copy from the given source entry into dst_vma. * If no marker should be copied, returns 0. * The caller should insert a new pte created with make_pte_marker(). */ static inline pte_marker copy_pte_marker( softleaf_t entry, struct vm_area_struct *dst_vma) { const pte_marker srcm = softleaf_to_marker(entry); /* Always copy error entries. */ pte_marker dstm = srcm & (PTE_MARKER_POISONED | PTE_MARKER_GUARD); /* Only copy PTE markers if UFFD register matches. */ if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma)) dstm |= PTE_MARKER_UFFD_WP; return dstm; } /* * If this pte is wr-protected by uffd-wp in any form, arm the special pte to * replace a none pte. NOTE! This should only be called when *pte is already * cleared so we will never accidentally replace something valuable. Meanwhile * none pte also means we are not demoting the pte so tlb flushed is not needed. * E.g., when pte cleared the caller should have taken care of the tlb flush. * * Must be called with pgtable lock held so that no thread will see the none * pte, and if they see it, they'll fault and serialize at the pgtable lock. * * Returns true if an uffd-wp pte was installed, false otherwise. */ static inline bool pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, pte_t pteval) { bool arm_uffd_pte = false; if (!uffd_supports_wp_marker()) return false; /* The current status of the pte should be "cleared" before calling */ WARN_ON_ONCE(!pte_none(ptep_get(pte))); /* * NOTE: userfaultfd_wp_unpopulated() doesn't need this whole * thing, because when zapping either it means it's dropping the * page, or in TTU where the present pte will be quickly replaced * with a swap pte. There's no way of leaking the bit. */ if (vma_is_anonymous(vma) || !userfaultfd_wp(vma)) return false; /* A uffd-wp wr-protected normal pte */ if (unlikely(pte_present(pteval) && pte_uffd_wp(pteval))) arm_uffd_pte = true; /* * A uffd-wp wr-protected swap pte. Note: this should even cover an * existing pte marker with uffd-wp bit set. */ if (unlikely(pte_swp_uffd_wp_any(pteval))) arm_uffd_pte = true; if (unlikely(arm_uffd_pte)) { set_pte_at(vma->vm_mm, addr, pte, make_pte_marker(PTE_MARKER_UFFD_WP)); return true; } return false; } static inline bool vma_has_recency(const struct vm_area_struct *vma) { if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)) return false; if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE)) return false; return true; } #endif /** * num_pages_contiguous() - determine the number of contiguous pages * that represent contiguous PFNs * @pages: an array of page pointers * @nr_pages: length of the array, at least 1 * * Determine the number of contiguous pages that represent contiguous PFNs * in @pages, starting from the first page. * * In some kernel configs contiguous PFNs will not have contiguous struct * pages. In these configurations num_pages_contiguous() will return a num * smaller than ideal number. The caller should continue to check for pfn * contiguity after each call to num_pages_contiguous(). 
 *
 * Returns the number of contiguous pages.
 */
static inline size_t num_pages_contiguous(struct page **pages, size_t nr_pages)
{
	struct page *cur_page = pages[0];
	unsigned long section = memdesc_section(cur_page->flags);
	size_t i;

	for (i = 1; i < nr_pages; i++) {
		if (++cur_page != pages[i])
			break;
		/*
		 * In unproblematic kernel configs, page_to_section() == 0 and
		 * the whole check will get optimized out.
		 */
		if (memdesc_section(cur_page->flags) != section)
			break;
	}

	return i;
}
#endif
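/*
 * Illustrative sketch, not part of this header: the calling pattern that
 * the comment above describes.  Because some kernel configs break PFN
 * contiguity at section boundaries, a caller keeps re-invoking
 * num_pages_contiguous() and advancing by the returned count until the
 * whole array is covered.  The function name is made up for the example.
 */
static inline size_t example_count_contig_runs(struct page **pages,
					       size_t nr_pages)
{
	size_t done = 0, runs = 0;

	while (done < nr_pages) {
		/* pages[done .. done + n - 1] map to contiguous PFNs */
		size_t n = num_pages_contiguous(&pages[done],
						nr_pages - done);

		runs++;
		done += n;
	}

	return runs;
}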
// SPDX-License-Identifier: GPL-2.0-only
/*
 *
 * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
 * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/pagemap.h>
#include <linux/mount.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/statfs.h>
#include <linux/magic.h>
#include <linux/fscache.h>
#include <linux/fs_context.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>

#include "v9fs.h"
#include "v9fs_vfs.h"
#include "fid.h"
#include "xattr.h"
#include "acl.h"

static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl;

static int v9fs_fill_super(struct super_block *sb)
{
	int ret;
	struct v9fs_session_info *v9ses = sb->s_fs_info;

	sb->s_maxbytes = MAX_LFS_FILESIZE;
	sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
	sb->s_blocksize = 1 << sb->s_blocksize_bits;
	sb->s_magic = V9FS_MAGIC;
	if (v9fs_proto_dotl(v9ses)) {
		sb->s_op = &v9fs_super_ops_dotl;
		if (!(v9ses->flags & V9FS_NO_XATTR))
			sb->s_xattr = v9fs_xattr_handlers;
	} else {
		sb->s_op = &v9fs_super_ops;
		sb->s_time_max = U32_MAX;
	}

	sb->s_time_min = 0;

	ret = super_setup_bdi(sb);
	if (ret)
		return ret;

	if (!v9ses->cache) {
		sb->s_bdi->ra_pages = 0;
		sb->s_bdi->io_pages = 0;
	} else {
		sb->s_bdi->ra_pages = v9ses->maxdata >> PAGE_SHIFT;
		sb->s_bdi->io_pages = v9ses->maxdata >> PAGE_SHIFT;
	}

	sb->s_flags |= SB_ACTIVE;

#ifdef CONFIG_9P_FS_POSIX_ACL
	if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL)
		sb->s_flags |= SB_POSIXACL;
#endif

	return 0;
}

/**
 * v9fs_get_tree - create the mountable root and superblock
 * @fc: the filesystem context
 *
 */
static int v9fs_get_tree(struct fs_context *fc)
{
	struct super_block *sb = NULL;
	struct inode *inode = NULL;
	struct dentry *root = NULL;
	struct v9fs_session_info *v9ses = NULL;
	struct p9_fid *fid;
	int retval = 0;

	p9_debug(P9_DEBUG_VFS, "\n");

	v9ses = kzalloc_obj(struct v9fs_session_info);
	if (!v9ses)
		return -ENOMEM;

	fid = v9fs_session_init(v9ses, fc);
	if (IS_ERR(fid)) {
		retval = PTR_ERR(fid);
		goto free_session;
	}

	fc->s_fs_info = v9ses;
sb = sget_fc(fc, NULL, set_anon_super_fc); if (IS_ERR(sb)) { retval = PTR_ERR(sb); goto clunk_fid; } retval = v9fs_fill_super(sb); if (retval) goto release_sb; if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { set_default_d_op(sb, &v9fs_cached_dentry_operations); } else { set_default_d_op(sb, &v9fs_dentry_operations); sb->s_d_flags |= DCACHE_DONTCACHE; } inode = v9fs_get_new_inode_from_fid(v9ses, fid, sb); if (IS_ERR(inode)) { retval = PTR_ERR(inode); goto release_sb; } root = d_make_root(inode); if (!root) { retval = -ENOMEM; goto release_sb; } sb->s_root = root; retval = v9fs_get_acl(inode, fid); if (retval) goto release_sb; v9fs_fid_add(root, &fid); p9_debug(P9_DEBUG_VFS, " simple set mount, return 0\n"); fc->root = dget(sb->s_root); return 0; clunk_fid: p9_fid_put(fid); v9fs_session_close(v9ses); free_session: kfree(v9ses); return retval; release_sb: /* * we will do the session_close and root dentry release * in the below call. But we need to clunk fid, because we haven't * attached the fid to dentry so it won't get clunked * automatically. */ p9_fid_put(fid); deactivate_locked_super(sb); return retval; } /** * v9fs_kill_super - Kill Superblock * @s: superblock * */ static void v9fs_kill_super(struct super_block *s) { struct v9fs_session_info *v9ses = s->s_fs_info; p9_debug(P9_DEBUG_VFS, " %p\n", s); kill_anon_super(s); v9fs_session_cancel(v9ses); v9fs_session_close(v9ses); kfree(v9ses); s->s_fs_info = NULL; p9_debug(P9_DEBUG_VFS, "exiting kill_super\n"); } static void v9fs_umount_begin(struct super_block *sb) { struct v9fs_session_info *v9ses; v9ses = sb->s_fs_info; v9fs_session_begin_cancel(v9ses); } static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct v9fs_session_info *v9ses; struct p9_fid *fid; struct p9_rstatfs rs; int res; fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) { res = PTR_ERR(fid); goto done; } v9ses = v9fs_dentry2v9ses(dentry); if (v9fs_proto_dotl(v9ses)) { res = p9_client_statfs(fid, &rs); if (res == 0) { buf->f_type = rs.type; buf->f_bsize = rs.bsize; buf->f_blocks = rs.blocks; buf->f_bfree = rs.bfree; buf->f_bavail = rs.bavail; buf->f_files = rs.files; buf->f_ffree = rs.ffree; buf->f_fsid = u64_to_fsid(rs.fsid); buf->f_namelen = rs.namelen; } if (res != -ENOSYS) goto done; } res = simple_statfs(dentry, buf); done: p9_fid_put(fid); return res; } static int v9fs_drop_inode(struct inode *inode) { struct v9fs_session_info *v9ses; v9ses = v9fs_inode2v9ses(inode); if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) return inode_generic_drop(inode); /* * in case of non cached mode always drop the * inode because we want the inode attribute * to always match that on the server. */ return 1; } static int v9fs_write_inode(struct inode *inode, struct writeback_control *wbc) { /* * send an fsync request to server irrespective of * wbc->sync_mode. 
*/ p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); return netfs_unpin_writeback(inode, wbc); } static int v9fs_write_inode_dotl(struct inode *inode, struct writeback_control *wbc) { p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); return netfs_unpin_writeback(inode, wbc); } static const struct super_operations v9fs_super_ops = { .alloc_inode = v9fs_alloc_inode, .free_inode = v9fs_free_inode, .statfs = simple_statfs, .drop_inode = v9fs_drop_inode, .evict_inode = v9fs_evict_inode, .show_options = v9fs_show_options, .umount_begin = v9fs_umount_begin, .write_inode = v9fs_write_inode, }; static const struct super_operations v9fs_super_ops_dotl = { .alloc_inode = v9fs_alloc_inode, .free_inode = v9fs_free_inode, .statfs = v9fs_statfs, .drop_inode = v9fs_drop_inode, .evict_inode = v9fs_evict_inode, .show_options = v9fs_show_options, .umount_begin = v9fs_umount_begin, .write_inode = v9fs_write_inode_dotl, }; static void v9fs_free_fc(struct fs_context *fc) { struct v9fs_context *ctx = fc->fs_private; if (!ctx) return; /* These should be NULL by now but guard against leaks */ kfree(ctx->session_opts.uname); kfree(ctx->session_opts.aname); #ifdef CONFIG_9P_FSCACHE kfree(ctx->session_opts.cachetag); #endif if (ctx->client_opts.trans_mod) v9fs_put_trans(ctx->client_opts.trans_mod); kfree(ctx); } static const struct fs_context_operations v9fs_context_ops = { .parse_param = v9fs_parse_param, .get_tree = v9fs_get_tree, .free = v9fs_free_fc, }; static int v9fs_init_fs_context(struct fs_context *fc) { struct v9fs_context *ctx; ctx = kzalloc_obj(*ctx); if (!ctx) return -ENOMEM; /* initialize core options */ ctx->session_opts.afid = ~0; ctx->session_opts.cache = CACHE_NONE; ctx->session_opts.session_lock_timeout = P9_LOCK_TIMEOUT; ctx->session_opts.uname = kstrdup(V9FS_DEFUSER, GFP_KERNEL); if (!ctx->session_opts.uname) goto error; ctx->session_opts.aname = kstrdup(V9FS_DEFANAME, GFP_KERNEL); if (!ctx->session_opts.aname) goto error; ctx->session_opts.uid = INVALID_UID; ctx->session_opts.dfltuid = V9FS_DEFUID; ctx->session_opts.dfltgid = V9FS_DEFGID; /* initialize client options */ ctx->client_opts.proto_version = p9_proto_2000L; ctx->client_opts.msize = DEFAULT_MSIZE; /* initialize fd transport options */ ctx->fd_opts.port = P9_FD_PORT; ctx->fd_opts.rfd = ~0; ctx->fd_opts.wfd = ~0; ctx->fd_opts.privport = false; /* initialize rdma transport options */ ctx->rdma_opts.port = P9_RDMA_PORT; ctx->rdma_opts.sq_depth = P9_RDMA_SQ_DEPTH; ctx->rdma_opts.rq_depth = P9_RDMA_RQ_DEPTH; ctx->rdma_opts.timeout = P9_RDMA_TIMEOUT; ctx->rdma_opts.privport = false; fc->ops = &v9fs_context_ops; fc->fs_private = ctx; return 0; error: fc->need_free = 1; return -ENOMEM; } struct file_system_type v9fs_fs_type = { .name = "9p", .kill_sb = v9fs_kill_super, .owner = THIS_MODULE, .fs_flags = FS_RENAME_DOES_D_MOVE, .init_fs_context = v9fs_init_fs_context, .parameters = v9fs_param_spec, }; MODULE_ALIAS_FS("9p"); |
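/*
 * Illustrative sketch, not kernel code: mounting a 9p share from user
 * space, which is what ends up exercising v9fs_init_fs_context() and
 * v9fs_get_tree() above.  The server address, mount point and option
 * values are made up for the example; see Documentation/filesystems/9p.rst
 * for the full option list.
 */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	const char *opts = "trans=tcp,port=564,version=9p2000.L,"
			   "uname=root,aname=/export,msize=512000";

	if (mount("192.0.2.1", "/mnt/9p", "9p", 0, opts) != 0) {
		perror("mount 9p");
		return 1;
	}

	return 0;
}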
// SPDX-License-Identifier: GPL-2.0
/*
 * fs/mpage.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains functions related to preparing and submitting BIOs which contain
 * multiple pagecache pages.
 *
 * 15May2002	Andrew Morton
 *		Initial version
 * 27Jun2002	axboe@suse.de
 *		use bio_add_page() to build bio's just the right size
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/kdev_t.h>
#include <linux/gfp.h>
#include <linux/bio.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/highmem.h>
#include <linux/prefetch.h>
#include <linux/mpage.h>
#include <linux/mm_inline.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include "internal.h"

/*
 * I/O completion handler for multipage BIOs.
* * The mpage code never puts partial pages into a BIO (except for end-of-file). * If a page does not map to a contiguous run of blocks then it simply falls * back to block_read_full_folio(). * * Why is this? If a page's completion depends on a number of different BIOs * which can complete in any order (or at the same time) then determining the * status of that page is hard. See end_buffer_async_read() for the details. * There is no point in duplicating all that complexity. */ static void mpage_read_end_io(struct bio *bio) { struct folio_iter fi; int err = blk_status_to_errno(bio->bi_status); bio_for_each_folio_all(fi, bio) folio_end_read(fi.folio, err == 0); bio_put(bio); } static void mpage_write_end_io(struct bio *bio) { struct folio_iter fi; int err = blk_status_to_errno(bio->bi_status); bio_for_each_folio_all(fi, bio) { if (err) mapping_set_error(fi.folio->mapping, err); folio_end_writeback(fi.folio); } bio_put(bio); } static struct bio *mpage_bio_submit_read(struct bio *bio) { bio->bi_end_io = mpage_read_end_io; guard_bio_eod(bio); submit_bio(bio); return NULL; } static struct bio *mpage_bio_submit_write(struct bio *bio) { bio->bi_end_io = mpage_write_end_io; guard_bio_eod(bio); submit_bio(bio); return NULL; } /* * support function for mpage_readahead. The fs supplied get_block might * return an up to date buffer. This is used to map that buffer into * the page, which allows read_folio to avoid triggering a duplicate call * to get_block. * * The idea is to avoid adding buffers to pages that don't already have * them. So when the buffer is up to date and the page size == block size, * this marks the page up to date instead of adding new buffers. */ static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh, int page_block) { struct inode *inode = folio->mapping->host; struct buffer_head *page_bh, *head; int block = 0; head = folio_buffers(folio); if (!head) { /* * don't make any buffers if there is only one buffer on * the folio and the folio just needs to be set up to date */ if (inode->i_blkbits == folio_shift(folio) && buffer_uptodate(bh)) { folio_mark_uptodate(folio); return; } head = create_empty_buffers(folio, i_blocksize(inode), 0); } page_bh = head; do { if (block == page_block) { page_bh->b_state = bh->b_state; page_bh->b_bdev = bh->b_bdev; page_bh->b_blocknr = bh->b_blocknr; break; } page_bh = page_bh->b_this_page; block++; } while (page_bh != head); } struct mpage_readpage_args { struct bio *bio; struct folio *folio; unsigned int nr_pages; bool is_readahead; sector_t last_block_in_bio; struct buffer_head map_bh; unsigned long first_logical_block; get_block_t *get_block; }; /* * This is the worker routine which does all the work of mapping the disk * blocks and constructs largest possible bios, submits them for IO if the * blocks are not contiguous on the disk. * * We pass a buffer_head back and forth and use its buffer_mapped() flag to * represent the validity of its disk mapping and to decide when to do the next * get_block() call. 
*/ static void do_mpage_readpage(struct mpage_readpage_args *args) { struct folio *folio = args->folio; struct inode *inode = folio->mapping->host; const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_folio = folio_size(folio) >> blkbits; const unsigned blocksize = 1 << blkbits; struct buffer_head *map_bh = &args->map_bh; sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; sector_t first_block; unsigned page_block; unsigned first_hole = blocks_per_folio; struct block_device *bdev = NULL; int length; int fully_mapped = 1; blk_opf_t opf = REQ_OP_READ; unsigned nblocks; unsigned relative_block; gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); if (args->is_readahead) { opf |= REQ_RAHEAD; gfp |= __GFP_NORETRY | __GFP_NOWARN; } if (folio_buffers(folio)) goto confused; block_in_file = folio_pos(folio) >> blkbits; last_block = block_in_file + ((args->nr_pages * PAGE_SIZE) >> blkbits); last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) last_block = last_block_in_file; page_block = 0; /* * Map blocks using the result from the previous get_blocks call first. */ nblocks = map_bh->b_size >> blkbits; if (buffer_mapped(map_bh) && block_in_file > args->first_logical_block && block_in_file < (args->first_logical_block + nblocks)) { unsigned map_offset = block_in_file - args->first_logical_block; unsigned last = nblocks - map_offset; first_block = map_bh->b_blocknr + map_offset; for (relative_block = 0; ; relative_block++) { if (relative_block == last) { clear_buffer_mapped(map_bh); break; } if (page_block == blocks_per_folio) break; page_block++; block_in_file++; } bdev = map_bh->b_bdev; } /* * Then do more get_blocks calls until we are done with this folio. */ map_bh->b_folio = folio; while (page_block < blocks_per_folio) { map_bh->b_state = 0; map_bh->b_size = 0; if (block_in_file < last_block) { map_bh->b_size = (last_block-block_in_file) << blkbits; if (args->get_block(inode, block_in_file, map_bh, 0)) goto confused; args->first_logical_block = block_in_file; } if (!buffer_mapped(map_bh)) { fully_mapped = 0; if (first_hole == blocks_per_folio) first_hole = page_block; page_block++; block_in_file++; continue; } /* some filesystems will copy data into the page during * the get_block call, in which case we don't want to * read it again. map_buffer_to_folio copies the data * we just collected from get_block into the folio's buffers * so read_folio doesn't have to repeat the get_block call */ if (buffer_uptodate(map_bh)) { map_buffer_to_folio(folio, map_bh, page_block); goto confused; } if (first_hole != blocks_per_folio) goto confused; /* hole -> non-hole */ /* Contiguous blocks? */ if (!page_block) first_block = map_bh->b_blocknr; else if (first_block + page_block != map_bh->b_blocknr) goto confused; nblocks = map_bh->b_size >> blkbits; for (relative_block = 0; ; relative_block++) { if (relative_block == nblocks) { clear_buffer_mapped(map_bh); break; } else if (page_block == blocks_per_folio) break; page_block++; block_in_file++; } bdev = map_bh->b_bdev; } if (first_hole != blocks_per_folio) { folio_zero_segment(folio, first_hole << blkbits, folio_size(folio)); if (first_hole == 0) { folio_mark_uptodate(folio); folio_unlock(folio); goto out; } } else if (fully_mapped) { folio_set_mappedtodisk(folio); } /* * This folio will go to BIO. Do we need to send this BIO off first? 
*/ if (args->bio && (args->last_block_in_bio != first_block - 1)) args->bio = mpage_bio_submit_read(args->bio); alloc_new: if (args->bio == NULL) { args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), opf, gfp); if (args->bio == NULL) goto confused; args->bio->bi_iter.bi_sector = first_block << (blkbits - 9); } length = first_hole << blkbits; if (!bio_add_folio(args->bio, folio, length, 0)) { args->bio = mpage_bio_submit_read(args->bio); goto alloc_new; } relative_block = block_in_file - args->first_logical_block; nblocks = map_bh->b_size >> blkbits; if ((buffer_boundary(map_bh) && relative_block == nblocks) || (first_hole != blocks_per_folio)) args->bio = mpage_bio_submit_read(args->bio); else args->last_block_in_bio = first_block + blocks_per_folio - 1; out: return; confused: if (args->bio) args->bio = mpage_bio_submit_read(args->bio); if (!folio_test_uptodate(folio)) block_read_full_folio(folio, args->get_block); else folio_unlock(folio); goto out; } /** * mpage_readahead - start reads against pages * @rac: Describes which pages to read. * @get_block: The filesystem's block mapper function. * * This function walks the pages and the blocks within each page, building and * emitting large BIOs. * * If anything unusual happens, such as: * * - encountering a page which has buffers * - encountering a page which has a non-hole after a hole * - encountering a page with non-contiguous blocks * * then this code just gives up and calls the buffer_head-based read function. * It does handle a page which has holes at the end - that is a common case: * the end-of-file on blocksize < PAGE_SIZE setups. * * BH_Boundary explanation: * * There is a problem. The mpage read code assembles several pages, gets all * their disk mappings, and then submits them all. That's fine, but obtaining * the disk mappings may require I/O. Reads of indirect blocks, for example. * * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be * submitted in the following order: * * 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16 * * because the indirect block has to be read to get the mappings of blocks * 13,14,15,16. Obviously, this impacts performance. * * So what we do it to allow the filesystem's get_block() function to set * BH_Boundary when it maps block 11. BH_Boundary says: mapping of the block * after this one will require I/O against a block which is probably close to * this one. So you should push what I/O you have currently accumulated. * * This all causes the disk requests to be issued in the correct order. */ void mpage_readahead(struct readahead_control *rac, get_block_t get_block) { struct folio *folio; struct mpage_readpage_args args = { .get_block = get_block, .is_readahead = true, }; while ((folio = readahead_folio(rac))) { prefetchw(&folio->flags); args.folio = folio; args.nr_pages = readahead_count(rac); do_mpage_readpage(&args); /* * If read ahead failed synchronously, it may cause by removed * device, or some filesystem metadata error. */ if (!folio_test_locked(folio) && !folio_test_uptodate(folio)) break; } if (args.bio) mpage_bio_submit_read(args.bio); } EXPORT_SYMBOL(mpage_readahead); /* * This isn't called much at all */ int mpage_read_folio(struct folio *folio, get_block_t get_block) { struct mpage_readpage_args args = { .folio = folio, .nr_pages = folio_nr_pages(folio), .get_block = get_block, }; do_mpage_readpage(&args); if (args.bio) mpage_bio_submit_read(args.bio); return 0; } EXPORT_SYMBOL(mpage_read_folio); /* * Writing is not so simple. 
* * If the page has buffers then they will be used for obtaining the disk * mapping. We only support pages which are fully mapped-and-dirty, with a * special case for pages which are unmapped at the end: end-of-file. * * If the page has no buffers (preferred) then the page is mapped here. * * If all blocks are found to be contiguous then the page can go into the * BIO. Otherwise fall back to the mapping's writepage(). * * FIXME: This code wants an estimate of how many pages are still to be * written, so it can intelligently allocate a suitably-sized BIO. For now, * just allocate full-size (16-page) BIOs. */ struct mpage_data { struct bio *bio; sector_t last_block_in_bio; get_block_t *get_block; }; /* * We have our BIO, so we can now mark the buffers clean. Make * sure to only clean buffers which we know we'll be writing. */ static void clean_buffers(struct folio *folio, unsigned first_unmapped) { unsigned buffer_counter = 0; struct buffer_head *bh, *head = folio_buffers(folio); if (!head) return; bh = head; do { if (buffer_counter++ == first_unmapped) break; clear_buffer_dirty(bh); bh = bh->b_this_page; } while (bh != head); /* * we cannot drop the bh if the page is not uptodate or a concurrent * read_folio would fail to serialize with the bh and it would read from * disk before we reach the platter. */ if (buffer_heads_over_limit && folio_test_uptodate(folio)) try_to_free_buffers(folio); } static int mpage_write_folio(struct writeback_control *wbc, struct folio *folio, struct mpage_data *mpd) { struct bio *bio = mpd->bio; struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_folio = folio_size(folio) >> blkbits; sector_t last_block; sector_t block_in_file; sector_t first_block; unsigned page_block; unsigned first_unmapped = blocks_per_folio; struct block_device *bdev = NULL; int boundary = 0; sector_t boundary_block = 0; struct block_device *boundary_bdev = NULL; size_t length; struct buffer_head map_bh; loff_t i_size = i_size_read(inode); int ret = 0; struct buffer_head *head = folio_buffers(folio); if (head) { struct buffer_head *bh = head; /* If they're all mapped and dirty, do it */ page_block = 0; do { BUG_ON(buffer_locked(bh)); if (!buffer_mapped(bh)) { /* * unmapped dirty buffers are created by * block_dirty_folio -> mmapped data */ if (buffer_dirty(bh)) goto confused; if (first_unmapped == blocks_per_folio) first_unmapped = page_block; continue; } if (first_unmapped != blocks_per_folio) goto confused; /* hole -> non-hole */ if (!buffer_dirty(bh) || !buffer_uptodate(bh)) goto confused; if (page_block) { if (bh->b_blocknr != first_block + page_block) goto confused; } else { first_block = bh->b_blocknr; } page_block++; boundary = buffer_boundary(bh); if (boundary) { boundary_block = bh->b_blocknr; boundary_bdev = bh->b_bdev; } bdev = bh->b_bdev; } while ((bh = bh->b_this_page) != head); if (first_unmapped) goto page_is_mapped; /* * Page has buffers, but they are all unmapped. The page was * created by pagein or read over a hole which was handled by * block_read_full_folio(). If this address_space is also * using mpage_readahead then this can rarely happen. */ goto confused; } /* * The page has no buffers: map it to disk */ BUG_ON(!folio_test_uptodate(folio)); block_in_file = folio_pos(folio) >> blkbits; /* * Whole page beyond EOF? Skip allocating blocks to avoid leaking * space. 
*/ if (block_in_file >= (i_size + (1 << blkbits) - 1) >> blkbits) goto page_is_mapped; last_block = (i_size - 1) >> blkbits; map_bh.b_folio = folio; for (page_block = 0; page_block < blocks_per_folio; ) { map_bh.b_state = 0; map_bh.b_size = 1 << blkbits; if (mpd->get_block(inode, block_in_file, &map_bh, 1)) goto confused; if (!buffer_mapped(&map_bh)) goto confused; if (buffer_new(&map_bh)) clean_bdev_bh_alias(&map_bh); if (buffer_boundary(&map_bh)) { boundary_block = map_bh.b_blocknr; boundary_bdev = map_bh.b_bdev; } if (page_block) { if (map_bh.b_blocknr != first_block + page_block) goto confused; } else { first_block = map_bh.b_blocknr; } page_block++; boundary = buffer_boundary(&map_bh); bdev = map_bh.b_bdev; if (block_in_file == last_block) break; block_in_file++; } BUG_ON(page_block == 0); first_unmapped = page_block; page_is_mapped: /* Don't bother writing beyond EOF, truncate will discard the folio */ if (folio_pos(folio) >= i_size) goto confused; length = folio_size(folio); if (folio_pos(folio) + length > i_size) { /* * The page straddles i_size. It must be zeroed out on each * and every writepage invocation because it may be mmapped. * "A file is mapped in multiples of the page size. For a file * that is not a multiple of the page size, the remaining memory * is zeroed when mapped, and writes to that region are not * written out to the file." */ length = i_size - folio_pos(folio); folio_zero_segment(folio, length, folio_size(folio)); } /* * This page will go to BIO. Do we need to send this BIO off first? */ if (bio && mpd->last_block_in_bio != first_block - 1) bio = mpage_bio_submit_write(bio); alloc_new: if (bio == NULL) { bio = bio_alloc(bdev, BIO_MAX_VECS, REQ_OP_WRITE | wbc_to_write_flags(wbc), GFP_NOFS); bio->bi_iter.bi_sector = first_block << (blkbits - 9); wbc_init_bio(wbc, bio); bio->bi_write_hint = inode->i_write_hint; } /* * Must try to add the page before marking the buffer clean or * the confused fail path above (OOM) will be very confused when * it finds all bh marked clean (i.e. it will not write anything) */ wbc_account_cgroup_owner(wbc, folio, folio_size(folio)); length = first_unmapped << blkbits; if (!bio_add_folio(bio, folio, length, 0)) { bio = mpage_bio_submit_write(bio); goto alloc_new; } clean_buffers(folio, first_unmapped); BUG_ON(folio_test_writeback(folio)); folio_start_writeback(folio); folio_unlock(folio); if (boundary || (first_unmapped != blocks_per_folio)) { bio = mpage_bio_submit_write(bio); if (boundary_block) { write_boundary_block(boundary_bdev, boundary_block, 1 << blkbits); } } else { mpd->last_block_in_bio = first_block + blocks_per_folio - 1; } goto out; confused: if (bio) bio = mpage_bio_submit_write(bio); /* * The caller has a ref on the inode, so *mapping is stable */ ret = block_write_full_folio(folio, wbc, mpd->get_block); mapping_set_error(mapping, ret); out: mpd->bio = bio; return ret; } /** * __mpage_writepages - walk the list of dirty pages of the given address space * & writepage() all of them * @mapping: address space structure to write * @wbc: subtract the number of written pages from *@wbc->nr_to_write * @get_block: the filesystem's block mapper function. * @write_folio: handler to call for each folio before calling * mpage_write_folio() * * This is a library function, which implements the writepages() * address_space_operation. It calls @write_folio handler for each folio. If * the handler returns value > 0, it calls mpage_write_folio() to do the * folio writeback. 
*/ int __mpage_writepages(struct address_space *mapping, struct writeback_control *wbc, get_block_t get_block, int (*write_folio)(struct folio *folio, struct writeback_control *wbc)) { struct mpage_data mpd = { .get_block = get_block, }; struct folio *folio = NULL; struct blk_plug plug; int error; blk_start_plug(&plug); while ((folio = writeback_iter(mapping, wbc, folio, &error))) { if (write_folio) { error = write_folio(folio, wbc); /* * == 0 means folio is handled, < 0 means error. In * both cases hand back control to writeback_iter() */ if (error <= 0) continue; /* Let mpage_write_folio() handle the folio. */ } error = mpage_write_folio(wbc, folio, &mpd); } if (mpd.bio) mpage_bio_submit_write(mpd.bio); blk_finish_plug(&plug); return error; } EXPORT_SYMBOL(__mpage_writepages); |
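The mpage helpers above are intended to be wired into a filesystem's address_space_operations together with its get_block routine. The sketch below shows that wiring under stated assumptions: myfs_get_block() is a hypothetical block mapper, and __mpage_writepages() is used as declared above with a NULL write_folio handler so every dirty folio goes straight to mpage_write_folio().

#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
#include <linux/writeback.h>

/* Hypothetical get_block_t implementation provided by the filesystem. */
static int myfs_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create);

static int myfs_read_folio(struct file *file, struct folio *folio)
{
	return mpage_read_folio(folio, myfs_get_block);
}

static void myfs_readahead(struct readahead_control *rac)
{
	mpage_readahead(rac, myfs_get_block);
}

static int myfs_writepages(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	/* No per-folio pre-write hook needed, so pass NULL for write_folio. */
	return __mpage_writepages(mapping, wbc, myfs_get_block, NULL);
}

static const struct address_space_operations myfs_aops = {
	.read_folio	= myfs_read_folio,
	.readahead	= myfs_readahead,
	.writepages	= myfs_writepages,
};

A filesystem that must do per-folio work (block allocation, for instance) before the generic path can instead pass a write_folio handler; __mpage_writepages() calls it first and only falls through to mpage_write_folio() when the handler returns a value greater than zero.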
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _MM_PERCPU_INTERNAL_H #define _MM_PERCPU_INTERNAL_H #include <linux/types.h> #include <linux/percpu.h> #include <linux/memcontrol.h> /* * pcpu_block_md is the metadata block struct. * Each chunk's bitmap is split into a number of full blocks. * All units are in terms of bits. * * The scan hint is the largest known contiguous area before the contig hint. * It is not necessarily the actual largest contig hint though. There is an * invariant that the scan_hint_start > contig_hint_start iff * scan_hint == contig_hint. This is necessary because when scanning forward, * we don't know if a new contig hint would be better than the current one. */ struct pcpu_block_md { int scan_hint; /* scan hint for block */ int scan_hint_start; /* block relative starting position of the scan hint */ int contig_hint; /* contig hint for block */ int contig_hint_start; /* block relative starting position of the contig hint */ int left_free; /* size of free space along the left side of the block */ int right_free; /* size of free space along the right side of the block */ int first_free; /* block position of first free */ int nr_bits; /* total bits responsible for */ }; struct pcpuobj_ext { #ifdef CONFIG_MEMCG struct obj_cgroup *cgroup; #endif #ifdef CONFIG_MEM_ALLOC_PROFILING union codetag_ref tag; #endif }; #if defined(CONFIG_MEMCG) || defined(CONFIG_MEM_ALLOC_PROFILING) #define NEED_PCPUOBJ_EXT #endif struct pcpu_chunk { #ifdef CONFIG_PERCPU_STATS int nr_alloc; /* # of allocations */ size_t max_alloc_size; /* largest allocation size */ #endif struct list_head list; /* linked to pcpu_slot lists */ int free_bytes; /* free bytes in the chunk */ struct pcpu_block_md chunk_md; unsigned long *bound_map; /* boundary map */ /* * base_addr is the base address of this chunk. * To reduce false sharing, the current layout is optimized to make sure * base_addr is located in a different cacheline from free_bytes and * chunk_md.
*/ void *base_addr ____cacheline_aligned_in_smp; unsigned long *alloc_map; /* allocation map */ struct pcpu_block_md *md_blocks; /* metadata blocks */ void *data; /* chunk data */ bool immutable; /* no [de]population allowed */ bool isolated; /* isolated from active chunk slots */ int start_offset; /* the overlap with the previous region to have a page aligned base_addr */ int end_offset; /* additional area required to have the region end page aligned */ #ifdef NEED_PCPUOBJ_EXT struct pcpuobj_ext *obj_exts; /* vector of object cgroups */ #endif int nr_pages; /* # of pages served by this chunk */ int nr_populated; /* # of populated pages */ int nr_empty_pop_pages; /* # of empty populated pages */ unsigned long populated[]; /* populated bitmap */ }; static inline bool need_pcpuobj_ext(void) { if (IS_ENABLED(CONFIG_MEM_ALLOC_PROFILING)) return true; if (!mem_cgroup_kmem_disabled()) return true; return false; } extern spinlock_t pcpu_lock; extern struct list_head *pcpu_chunk_lists; extern int pcpu_nr_slots; extern int pcpu_sidelined_slot; extern int pcpu_to_depopulate_slot; extern int pcpu_nr_empty_pop_pages; extern struct pcpu_chunk *pcpu_first_chunk; extern struct pcpu_chunk *pcpu_reserved_chunk; /** * pcpu_chunk_nr_blocks - converts nr_pages to # of md_blocks * @chunk: chunk of interest * * This conversion is from the number of physical pages that the chunk * serves to the number of bitmap blocks used. */ static inline int pcpu_chunk_nr_blocks(struct pcpu_chunk *chunk) { return chunk->nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE; } /** * pcpu_nr_pages_to_map_bits - converts the pages to size of bitmap * @pages: number of physical pages * * This conversion is from physical pages to the number of bits * required in the bitmap. */ static inline int pcpu_nr_pages_to_map_bits(int pages) { return pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; } /** * pcpu_chunk_map_bits - helper to convert nr_pages to size of bitmap * @chunk: chunk of interest * * This conversion is from the number of physical pages that the chunk * serves to the number of bits in the bitmap. */ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk) { return pcpu_nr_pages_to_map_bits(chunk->nr_pages); } /** * pcpu_obj_full_size - helper to calculate size of each accounted object * @size: size of area to allocate in bytes * * For each accounted object there is an extra space which is used to store * obj_cgroup membership if kmemcg is not disabled. Charge it too. */ static inline size_t pcpu_obj_full_size(size_t size) { size_t extra_size = 0; #ifdef CONFIG_MEMCG if (!mem_cgroup_kmem_disabled()) extra_size += size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *); #endif return size * num_possible_cpus() + extra_size; } #ifdef CONFIG_PERCPU_STATS #include <linux/spinlock.h> struct percpu_stats { u64 nr_alloc; /* lifetime # of allocations */ u64 nr_dealloc; /* lifetime # of deallocations */ u64 nr_cur_alloc; /* current # of allocations */ u64 nr_max_alloc; /* max # of live allocations */ u32 nr_chunks; /* current # of live chunks */ u32 nr_max_chunks; /* max # of live chunks */ size_t min_alloc_size; /* min allocation size */ size_t max_alloc_size; /* max allocation size */ }; extern struct percpu_stats pcpu_stats; extern struct pcpu_alloc_info pcpu_stats_ai; /* * For debug purposes. We don't care about the flexible array. 
*/ static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai) { memcpy(&pcpu_stats_ai, ai, sizeof(struct pcpu_alloc_info)); /* initialize min_alloc_size to unit_size */ pcpu_stats.min_alloc_size = pcpu_stats_ai.unit_size; } /* * pcpu_stats_area_alloc - increment area allocation stats * @chunk: the location of the area being allocated * @size: size of area to allocate in bytes * * CONTEXT: * pcpu_lock. */ static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size) { lockdep_assert_held(&pcpu_lock); pcpu_stats.nr_alloc++; pcpu_stats.nr_cur_alloc++; pcpu_stats.nr_max_alloc = max(pcpu_stats.nr_max_alloc, pcpu_stats.nr_cur_alloc); pcpu_stats.min_alloc_size = min(pcpu_stats.min_alloc_size, size); pcpu_stats.max_alloc_size = max(pcpu_stats.max_alloc_size, size); chunk->nr_alloc++; chunk->max_alloc_size = max(chunk->max_alloc_size, size); } /* * pcpu_stats_area_dealloc - decrement allocation stats * @chunk: the location of the area being deallocated * * CONTEXT: * pcpu_lock. */ static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk) { lockdep_assert_held(&pcpu_lock); pcpu_stats.nr_dealloc++; pcpu_stats.nr_cur_alloc--; chunk->nr_alloc--; } /* * pcpu_stats_chunk_alloc - increment chunk stats */ static inline void pcpu_stats_chunk_alloc(void) { unsigned long flags; spin_lock_irqsave(&pcpu_lock, flags); pcpu_stats.nr_chunks++; pcpu_stats.nr_max_chunks = max(pcpu_stats.nr_max_chunks, pcpu_stats.nr_chunks); spin_unlock_irqrestore(&pcpu_lock, flags); } /* * pcpu_stats_chunk_dealloc - decrement chunk stats */ static inline void pcpu_stats_chunk_dealloc(void) { unsigned long flags; spin_lock_irqsave(&pcpu_lock, flags); pcpu_stats.nr_chunks--; spin_unlock_irqrestore(&pcpu_lock, flags); } #else static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai) { } static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size) { } static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk) { } static inline void pcpu_stats_chunk_alloc(void) { } static inline void pcpu_stats_chunk_dealloc(void) { } #endif /* !CONFIG_PERCPU_STATS */ #endif |
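As a rough worked example of the conversion helpers above, assume the common 4 KiB page configuration with PCPU_MIN_ALLOC_SIZE of 4 bytes and PCPU_BITMAP_BLOCK_SIZE equal to PAGE_SIZE; these constants are assumptions for illustration, not values taken from this header.

/*
 * Illustration only, with assumed constants: one bitmap bit covers
 * PCPU_MIN_ALLOC_SIZE (4) bytes, so a chunk serving 4 pages is tracked by
 * 4 * 4096 / 4 = 4096 bits in alloc_map/bound_map, and with one bitmap
 * block per page it needs 4 md_blocks.
 */
static inline int example_pcpu_pages_to_map_bits(int pages)
{
	return pages * 4096 / 4;	/* mirrors pcpu_nr_pages_to_map_bits() */
}

Similarly, pcpu_obj_full_size() for a 64-byte accounted allocation on a 4-CPU system with kmemcg enabled works out to 64 * 4 + (64 / 4) * sizeof(struct obj_cgroup *) bytes, since one obj_cgroup pointer is reserved per minimum-size slot of the area.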
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2013 Politecnico di Torino, Italy * TORSEC group -- https://security.polito.it * * Author: Roberto Sassu <roberto.sassu@polito.it> * * File: ima_template.c * Helpers to manage template descriptors.
*/ #include <linux/rculist.h> #include "ima.h" #include "ima_template_lib.h" enum header_fields { HDR_PCR, HDR_DIGEST, HDR_TEMPLATE_NAME, HDR_TEMPLATE_DATA, HDR__LAST }; static struct ima_template_desc builtin_templates[] = { {.name = IMA_TEMPLATE_IMA_NAME, .fmt = IMA_TEMPLATE_IMA_FMT}, {.name = "ima-ng", .fmt = "d-ng|n-ng"}, {.name = "ima-sig", .fmt = "d-ng|n-ng|sig"}, {.name = "ima-ngv2", .fmt = "d-ngv2|n-ng"}, {.name = "ima-sigv2", .fmt = "d-ngv2|n-ng|sig"}, {.name = "ima-buf", .fmt = "d-ng|n-ng|buf"}, {.name = "ima-modsig", .fmt = "d-ng|n-ng|sig|d-modsig|modsig"}, {.name = "evm-sig", .fmt = "d-ng|n-ng|evmsig|xattrnames|xattrlengths|xattrvalues|iuid|igid|imode"}, {.name = "", .fmt = ""}, /* placeholder for a custom format */ }; static LIST_HEAD(defined_templates); static DEFINE_SPINLOCK(template_list); static int template_setup_done; static const struct ima_template_field supported_fields[] = { {.field_id = "d", .field_init = ima_eventdigest_init, .field_show = ima_show_template_digest}, {.field_id = "n", .field_init = ima_eventname_init, .field_show = ima_show_template_string}, {.field_id = "d-ng", .field_init = ima_eventdigest_ng_init, .field_show = ima_show_template_digest_ng}, {.field_id = "d-ngv2", .field_init = ima_eventdigest_ngv2_init, .field_show = ima_show_template_digest_ngv2}, {.field_id = "n-ng", .field_init = ima_eventname_ng_init, .field_show = ima_show_template_string}, {.field_id = "sig", .field_init = ima_eventsig_init, .field_show = ima_show_template_sig}, {.field_id = "buf", .field_init = ima_eventbuf_init, .field_show = ima_show_template_buf}, {.field_id = "d-modsig", .field_init = ima_eventdigest_modsig_init, .field_show = ima_show_template_digest_ng}, {.field_id = "modsig", .field_init = ima_eventmodsig_init, .field_show = ima_show_template_sig}, {.field_id = "evmsig", .field_init = ima_eventevmsig_init, .field_show = ima_show_template_sig}, {.field_id = "iuid", .field_init = ima_eventinodeuid_init, .field_show = ima_show_template_uint}, {.field_id = "igid", .field_init = ima_eventinodegid_init, .field_show = ima_show_template_uint}, {.field_id = "imode", .field_init = ima_eventinodemode_init, .field_show = ima_show_template_uint}, {.field_id = "xattrnames", .field_init = ima_eventinodexattrnames_init, .field_show = ima_show_template_string}, {.field_id = "xattrlengths", .field_init = ima_eventinodexattrlengths_init, .field_show = ima_show_template_sig}, {.field_id = "xattrvalues", .field_init = ima_eventinodexattrvalues_init, .field_show = ima_show_template_sig}, }; /* * Used when restoring measurements carried over from a kexec. 'd' and 'n' don't * need to be accounted for since they shouldn't be defined in the same template * description as 'd-ng' and 'n-ng' respectively. */ #define MAX_TEMPLATE_NAME_LEN \ sizeof("d-ng|n-ng|evmsig|xattrnames|xattrlengths|xattrvalues|iuid|igid|imode") static struct ima_template_desc *ima_template; static struct ima_template_desc *ima_buf_template; /** * ima_template_has_modsig - Check whether template has modsig-related fields. * @ima_template: IMA template to check. * * Tells whether the given template has fields referencing a file's appended * signature. 
*/ bool ima_template_has_modsig(const struct ima_template_desc *ima_template) { int i; for (i = 0; i < ima_template->num_fields; i++) if (!strcmp(ima_template->fields[i]->field_id, "modsig") || !strcmp(ima_template->fields[i]->field_id, "d-modsig")) return true; return false; } static int __init ima_template_setup(char *str) { struct ima_template_desc *template_desc; int template_len = strlen(str); if (template_setup_done) return 1; if (!ima_template) ima_init_template_list(); /* * Verify that a template with the supplied name exists. * If not, use CONFIG_IMA_DEFAULT_TEMPLATE. */ template_desc = lookup_template_desc(str); if (!template_desc) { pr_err("template %s not found, using %s\n", str, CONFIG_IMA_DEFAULT_TEMPLATE); return 1; } /* * Verify whether the current hash algorithm is supported * by the 'ima' template. */ if (template_len == 3 && strcmp(str, IMA_TEMPLATE_IMA_NAME) == 0 && ima_hash_algo != HASH_ALGO_SHA1 && ima_hash_algo != HASH_ALGO_MD5) { pr_err("template does not support hash alg\n"); return 1; } ima_template = template_desc; template_setup_done = 1; return 1; } __setup("ima_template=", ima_template_setup); static int __init ima_template_fmt_setup(char *str) { int num_templates = ARRAY_SIZE(builtin_templates); if (template_setup_done) return 1; if (template_desc_init_fields(str, NULL, NULL) < 0) { pr_err("format string '%s' not valid, using template %s\n", str, CONFIG_IMA_DEFAULT_TEMPLATE); return 1; } builtin_templates[num_templates - 1].fmt = str; ima_template = builtin_templates + num_templates - 1; template_setup_done = 1; return 1; } __setup("ima_template_fmt=", ima_template_fmt_setup); struct ima_template_desc *lookup_template_desc(const char *name) { struct ima_template_desc *template_desc; int found = 0; rcu_read_lock(); list_for_each_entry_rcu(template_desc, &defined_templates, list) { if ((strcmp(template_desc->name, name) == 0) || (strcmp(template_desc->fmt, name) == 0)) { found = 1; break; } } rcu_read_unlock(); return found ? template_desc : NULL; } static const struct ima_template_field * lookup_template_field(const char *field_id) { int i; for (i = 0; i < ARRAY_SIZE(supported_fields); i++) if (strncmp(supported_fields[i].field_id, field_id, IMA_TEMPLATE_FIELD_ID_MAX_LEN) == 0) return &supported_fields[i]; return NULL; } static int template_fmt_size(const char *template_fmt) { char c; int template_fmt_len = strlen(template_fmt); int i = 0, j = 0; while (i < template_fmt_len) { c = template_fmt[i]; if (c == '|') j++; i++; } return j + 1; } int template_desc_init_fields(const char *template_fmt, const struct ima_template_field ***fields, int *num_fields) { const char *template_fmt_ptr; const struct ima_template_field *found_fields[IMA_TEMPLATE_NUM_FIELDS_MAX]; int template_num_fields; int i, len; if (num_fields && *num_fields > 0) /* already initialized? 
*/ return 0; template_num_fields = template_fmt_size(template_fmt); if (template_num_fields > IMA_TEMPLATE_NUM_FIELDS_MAX) { pr_err("format string '%s' contains too many fields\n", template_fmt); return -EINVAL; } for (i = 0, template_fmt_ptr = template_fmt; i < template_num_fields; i++, template_fmt_ptr += len + 1) { char tmp_field_id[IMA_TEMPLATE_FIELD_ID_MAX_LEN + 1]; len = strchrnul(template_fmt_ptr, '|') - template_fmt_ptr; if (len == 0 || len > IMA_TEMPLATE_FIELD_ID_MAX_LEN) { pr_err("Invalid field with length %d\n", len); return -EINVAL; } memcpy(tmp_field_id, template_fmt_ptr, len); tmp_field_id[len] = '\0'; found_fields[i] = lookup_template_field(tmp_field_id); if (!found_fields[i]) { pr_err("field '%s' not found\n", tmp_field_id); return -ENOENT; } } if (fields && num_fields) { *fields = kmalloc_objs(**fields, i); if (*fields == NULL) return -ENOMEM; memcpy(*fields, found_fields, i * sizeof(**fields)); *num_fields = i; } return 0; } void ima_init_template_list(void) { int i; if (!list_empty(&defined_templates)) return; spin_lock(&template_list); for (i = 0; i < ARRAY_SIZE(builtin_templates); i++) { list_add_tail_rcu(&builtin_templates[i].list, &defined_templates); } spin_unlock(&template_list); } struct ima_template_desc *ima_template_desc_current(void) { if (!ima_template) { ima_init_template_list(); ima_template = lookup_template_desc(CONFIG_IMA_DEFAULT_TEMPLATE); } return ima_template; } struct ima_template_desc *ima_template_desc_buf(void) { if (!ima_buf_template) { ima_init_template_list(); ima_buf_template = lookup_template_desc("ima-buf"); } return ima_buf_template; } int __init ima_init_template(void) { struct ima_template_desc *template = ima_template_desc_current(); int result; result = template_desc_init_fields(template->fmt, &(template->fields), &(template->num_fields)); if (result < 0) { pr_err("template %s init failed, result: %d\n", (strlen(template->name) ? template->name : template->fmt), result); return result; } template = ima_template_desc_buf(); if (!template) { pr_err("Failed to get ima-buf template\n"); return -EINVAL; } result = template_desc_init_fields(template->fmt, &(template->fields), &(template->num_fields)); if (result < 0) pr_err("template %s init failed, result: %d\n", (strlen(template->name) ? 
template->name : template->fmt), result); return result; } static struct ima_template_desc *restore_template_fmt(char *template_name) { struct ima_template_desc *template_desc = NULL; int ret; ret = template_desc_init_fields(template_name, NULL, NULL); if (ret < 0) { pr_err("attempting to initialize the template \"%s\" failed\n", template_name); goto out; } template_desc = kzalloc_obj(*template_desc); if (!template_desc) goto out; template_desc->name = ""; template_desc->fmt = kstrdup(template_name, GFP_KERNEL); if (!template_desc->fmt) { kfree(template_desc); template_desc = NULL; goto out; } spin_lock(&template_list); list_add_tail_rcu(&template_desc->list, &defined_templates); spin_unlock(&template_list); out: return template_desc; } static int ima_restore_template_data(struct ima_template_desc *template_desc, void *template_data, int template_data_size, struct ima_template_entry **entry) { struct tpm_digest *digests; int ret = 0; int i; *entry = kzalloc_flex(**entry, template_data, template_desc->num_fields, GFP_NOFS); if (!*entry) return -ENOMEM; digests = kzalloc_objs(*digests, NR_BANKS(ima_tpm_chip) + ima_extra_slots, GFP_NOFS); if (!digests) { kfree(*entry); return -ENOMEM; } (*entry)->digests = digests; ret = ima_parse_buf(template_data, template_data + template_data_size, NULL, template_desc->num_fields, (*entry)->template_data, NULL, NULL, ENFORCE_FIELDS | ENFORCE_BUFEND, "template data"); if (ret < 0) { kfree((*entry)->digests); kfree(*entry); return ret; } (*entry)->template_desc = template_desc; for (i = 0; i < template_desc->num_fields; i++) { struct ima_field_data *field_data = &(*entry)->template_data[i]; u8 *data = field_data->data; (*entry)->template_data[i].data = kzalloc(field_data->len + 1, GFP_KERNEL); if (!(*entry)->template_data[i].data) { ret = -ENOMEM; break; } memcpy((*entry)->template_data[i].data, data, field_data->len); (*entry)->template_data_len += sizeof(field_data->len); (*entry)->template_data_len += field_data->len; } if (ret < 0) { ima_free_template_entry(*entry); *entry = NULL; } return ret; } /* Restore the serialized binary measurement list without extending PCRs. */ int ima_restore_measurement_list(loff_t size, void *buf) { char template_name[MAX_TEMPLATE_NAME_LEN]; unsigned char zero[TPM_DIGEST_SIZE] = { 0 }; struct ima_kexec_hdr *khdr = buf; struct ima_field_data hdr[HDR__LAST] = { [HDR_PCR] = {.len = sizeof(u32)}, [HDR_DIGEST] = {.len = TPM_DIGEST_SIZE}, }; void *bufp = buf + sizeof(*khdr); void *bufendp; struct ima_template_entry *entry; struct ima_template_desc *template_desc; DECLARE_BITMAP(hdr_mask, HDR__LAST); unsigned long count = 0; int ret = 0; if (!buf || size < sizeof(*khdr)) return 0; if (ima_canonical_fmt) { khdr->version = le16_to_cpu((__force __le16)khdr->version); khdr->count = le64_to_cpu((__force __le64)khdr->count); khdr->buffer_size = le64_to_cpu((__force __le64)khdr->buffer_size); } if (khdr->version != 1) { pr_err("attempting to restore a incompatible measurement list"); return -EINVAL; } if (khdr->count > ULONG_MAX - 1) { pr_err("attempting to restore too many measurements"); return -EINVAL; } bitmap_zero(hdr_mask, HDR__LAST); bitmap_set(hdr_mask, HDR_PCR, 1); bitmap_set(hdr_mask, HDR_DIGEST, 1); /* * ima kexec buffer prefix: version, buffer size, count * v1 format: pcr, digest, template-name-len, template-name, * template-data-size, template-data */ bufendp = buf + khdr->buffer_size; while ((bufp < bufendp) && (count++ < khdr->count)) { int enforce_mask = ENFORCE_FIELDS; enforce_mask |= (count == khdr->count) ? 
ENFORCE_BUFEND : 0; ret = ima_parse_buf(bufp, bufendp, &bufp, HDR__LAST, hdr, NULL, hdr_mask, enforce_mask, "entry header"); if (ret < 0) break; if (hdr[HDR_TEMPLATE_NAME].len >= MAX_TEMPLATE_NAME_LEN) { pr_err("attempting to restore a template name that is too long\n"); ret = -EINVAL; break; } /* template name is not null terminated */ memcpy(template_name, hdr[HDR_TEMPLATE_NAME].data, hdr[HDR_TEMPLATE_NAME].len); template_name[hdr[HDR_TEMPLATE_NAME].len] = 0; if (strcmp(template_name, "ima") == 0) { pr_err("attempting to restore an unsupported template \"%s\" failed\n", template_name); ret = -EINVAL; break; } template_desc = lookup_template_desc(template_name); if (!template_desc) { template_desc = restore_template_fmt(template_name); if (!template_desc) break; } /* * Only the running system's template format is initialized * on boot. As needed, initialize the other template formats. */ ret = template_desc_init_fields(template_desc->fmt, &(template_desc->fields), &(template_desc->num_fields)); if (ret < 0) { pr_err("attempting to restore the template fmt \"%s\" failed\n", template_desc->fmt); ret = -EINVAL; break; } ret = ima_restore_template_data(template_desc, hdr[HDR_TEMPLATE_DATA].data, hdr[HDR_TEMPLATE_DATA].len, &entry); if (ret < 0) break; if (memcmp(hdr[HDR_DIGEST].data, zero, sizeof(zero))) { ret = ima_calc_field_array_hash( &entry->template_data[0], entry); if (ret < 0) { pr_err("cannot calculate template digest\n"); ret = -EINVAL; break; } } entry->pcr = !ima_canonical_fmt ? *(u32 *)(hdr[HDR_PCR].data) : le32_to_cpu(*(__le32 *)(hdr[HDR_PCR].data)); ret = ima_restore_measurement_entry(entry); if (ret < 0) break; } return ret; } |
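The template formats handled above are plain '|'-separated field IDs (for example "d-ng|n-ng|sig" is the built-in ima-sig format); template_fmt_size() counts the separators and template_desc_init_fields() resolves each ID against supported_fields[]. A small standalone sketch of the counting step, for illustration only:

#include <stdio.h>

/* Counts '|'-separated field IDs the same way template_fmt_size() does. */
static int count_fields(const char *fmt)
{
	int nr = 1;

	for (; *fmt; fmt++)
		if (*fmt == '|')
			nr++;
	return nr;
}

int main(void)
{
	const char *fmt = "d-ng|n-ng|sig";

	printf("'%s' has %d fields\n", fmt, count_fields(fmt));	/* prints 3 */
	return 0;
}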
| 4 15 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_BACKING_DEV_DEFS_H #define __LINUX_BACKING_DEV_DEFS_H #include <linux/list.h> #include <linux/radix-tree.h> #include <linux/rbtree.h> #include <linux/spinlock.h> #include <linux/percpu_counter.h> #include <linux/percpu-refcount.h> #include <linux/flex_proportions.h> #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/kref.h> #include <linux/refcount.h> struct page; struct device; struct dentry; /* * Bits in bdi_writeback.state */ enum wb_state { WB_registered, /* bdi_register() was done */ WB_writeback_running, /* Writeback is in progress */ WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ WB_start_all, /* nr_pages == 0 (all) work pending */ }; enum wb_stat_item { WB_RECLAIMABLE, WB_WRITEBACK, WB_DIRTIED, WB_WRITTEN, NR_WB_STAT_ITEMS }; #define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) /* * why some writeback work was initiated */ enum wb_reason { WB_REASON_BACKGROUND, WB_REASON_VMSCAN, WB_REASON_SYNC, WB_REASON_PERIODIC, WB_REASON_FS_FREE_SPACE, /* * There is no bdi forker thread any more and works are done * by emergency worker, however, this is TPs userland visible * and we'll be exposing exactly the same information, * so it has a mismatch name. */ WB_REASON_FORKER_THREAD, WB_REASON_FOREIGN_FLUSH, WB_REASON_MAX, }; struct wb_completion { atomic_t cnt; wait_queue_head_t *waitq; unsigned long progress_stamp; /* The jiffies when slow progress is detected */ unsigned long wait_start; /* The jiffies when waiting for the writeback work to finish */ }; #define __WB_COMPLETION_INIT(_waitq) \ (struct wb_completion){ .cnt = ATOMIC_INIT(1), .waitq = (_waitq) } /* * If one wants to wait for one or more wb_writeback_works, each work's * ->done should be set to a wb_completion defined using the following * macro. Once all work items are issued with wb_queue_work(), the caller * can wait for the completion of all using wb_wait_for_completion(). Work * items which are waited upon aren't freed automatically on completion. */ #define WB_COMPLETION_INIT(bdi) __WB_COMPLETION_INIT(&(bdi)->wb_waitq) #define DEFINE_WB_COMPLETION(cmpl, bdi) \ struct wb_completion cmpl = WB_COMPLETION_INIT(bdi) /* * Each wb (bdi_writeback) can perform writeback operations, is measured * and throttled, independently. Without cgroup writeback, each bdi * (bdi_writeback) is served by its embedded bdi->wb. 
* * On the default hierarchy, blkcg implicitly enables memcg. This allows * using memcg's page ownership for attributing writeback IOs, and every * memcg - blkcg combination can be served by its own wb by assigning a * dedicated wb to each memcg, which enables isolation across different * cgroups and propagation of IO back pressure down from the IO layer upto * the tasks which are generating the dirty pages to be written back. * * A cgroup wb is indexed on its bdi by the ID of the associated memcg, * refcounted with the number of inodes attached to it, and pins the memcg * and the corresponding blkcg. As the corresponding blkcg for a memcg may * change as blkcg is disabled and enabled higher up in the hierarchy, a wb * is tested for blkcg after lookup and removed from index on mismatch so * that a new wb for the combination can be created. * * Each bdi_writeback that is not embedded into the backing_dev_info must hold * a reference to the parent backing_dev_info. See cgwb_create() for details. */ struct bdi_writeback { struct backing_dev_info *bdi; /* our parent bdi */ unsigned long state; /* Always use atomic bitops on this */ unsigned long last_old_flush; /* last old data flush */ struct list_head b_dirty; /* dirty inodes */ struct list_head b_io; /* parked for writeback */ struct list_head b_more_io; /* parked for more writeback */ struct list_head b_dirty_time; /* time stamps are dirty */ spinlock_t list_lock; /* protects the b_* lists */ atomic_t writeback_inodes; /* number of inodes under writeback */ struct percpu_counter stat[NR_WB_STAT_ITEMS]; unsigned long bw_time_stamp; /* last time write bw is updated */ unsigned long dirtied_stamp; unsigned long written_stamp; /* pages written at bw_time_stamp */ unsigned long write_bandwidth; /* the estimated write bandwidth */ unsigned long avg_write_bandwidth; /* further smoothed write bw, > 0 */ /* * The base dirty throttle rate, re-calculated on every 200ms. * All the bdi tasks' dirty rate will be curbed under it. * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit * in small steps and is much more smooth/stable than the latter. 
*/ unsigned long dirty_ratelimit; unsigned long balanced_dirty_ratelimit; struct fprop_local_percpu completions; int dirty_exceeded; enum wb_reason start_all_reason; spinlock_t work_lock; /* protects work_list & dwork scheduling */ struct list_head work_list; struct delayed_work dwork; /* work item used for writeback */ struct delayed_work bw_dwork; /* work item used for bandwidth estimate */ struct list_head bdi_node; /* anchored at bdi->wb_list */ #ifdef CONFIG_CGROUP_WRITEBACK struct percpu_ref refcnt; /* used only for !root wb's */ struct fprop_local_percpu memcg_completions; struct cgroup_subsys_state *memcg_css; /* the associated memcg */ struct cgroup_subsys_state *blkcg_css; /* and blkcg */ struct list_head memcg_node; /* anchored at memcg->cgwb_list */ struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */ struct list_head b_attached; /* attached inodes, protected by list_lock */ struct list_head offline_node; /* anchored at offline_cgwbs */ struct work_struct switch_work; /* work used to perform inode switching * to this wb */ struct llist_head switch_wbs_ctxs; /* queued contexts for * writeback switching */ union { struct work_struct release_work; struct rcu_head rcu; }; #endif }; struct backing_dev_info { u64 id; struct rb_node rb_node; /* keyed by ->id */ struct list_head bdi_list; /* max readahead in PAGE_SIZE units */ unsigned long __data_racy ra_pages; unsigned long io_pages; /* max allowed IO size */ struct kref refcnt; /* Reference counter for the structure */ unsigned int capabilities; /* Device capabilities */ unsigned int min_ratio; unsigned int max_ratio, max_prop_frac; /* * Sum of avg_write_bw of wbs with dirty inodes. > 0 if there are * any dirty wbs, which is depended upon by bdi_has_dirty(). */ atomic_long_t tot_write_bandwidth; /* * Jiffies when last process was dirty throttled on this bdi. Used by * blk-wbt. */ unsigned long last_bdp_sleep; struct bdi_writeback wb; /* the root writeback info for this bdi */ struct list_head wb_list; /* list of all wbs */ #ifdef CONFIG_CGROUP_WRITEBACK struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */ struct mutex cgwb_release_mutex; /* protect shutdown of wb structs */ struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */ #endif wait_queue_head_t wb_waitq; struct device *dev; char dev_name[64]; struct device *owner; #ifdef CONFIG_DEBUG_FS struct dentry *debug_dir; #endif }; struct wb_lock_cookie { bool locked; unsigned long flags; }; #ifdef CONFIG_CGROUP_WRITEBACK /** * wb_tryget - try to increment a wb's refcount * @wb: bdi_writeback to get */ static inline bool wb_tryget(struct bdi_writeback *wb) { if (wb != &wb->bdi->wb) return percpu_ref_tryget(&wb->refcnt); return true; } /** * wb_get - increment a wb's refcount * @wb: bdi_writeback to get */ static inline void wb_get(struct bdi_writeback *wb) { if (wb != &wb->bdi->wb) percpu_ref_get(&wb->refcnt); } /** * wb_put_many - decrement a wb's refcount * @wb: bdi_writeback to put * @nr: number of references to put */ static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr) { if (WARN_ON_ONCE(!wb->bdi)) { /* * A driver bug might cause a file to be removed before bdi was * initialized. */ return; } if (wb != &wb->bdi->wb) percpu_ref_put_many(&wb->refcnt, nr); } /** * wb_put - decrement a wb's refcount * @wb: bdi_writeback to put */ static inline void wb_put(struct bdi_writeback *wb) { wb_put_many(wb, 1); } /** * wb_dying - is a wb dying? 
* @wb: bdi_writeback of interest * * Returns whether @wb is unlinked and being drained. */ static inline bool wb_dying(struct bdi_writeback *wb) { return percpu_ref_is_dying(&wb->refcnt); } #else /* CONFIG_CGROUP_WRITEBACK */ static inline bool wb_tryget(struct bdi_writeback *wb) { return true; } static inline void wb_get(struct bdi_writeback *wb) { } static inline void wb_put(struct bdi_writeback *wb) { } static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr) { } static inline bool wb_dying(struct bdi_writeback *wb) { return false; } #endif /* CONFIG_CGROUP_WRITEBACK */ #endif /* __LINUX_BACKING_DEV_DEFS_H */ |
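The wb_completion machinery described above is used by the writeback core roughly as follows: declare a completion bound to the bdi, point each queued work item's ->done at it, then wait. This is only a sketch of that pattern; wb_queue_work() and struct wb_writeback_work are internal to fs/fs-writeback.c, so the names and field layout here are illustrative rather than taken from this header.

/* Sketch of the pattern the wb_completion comment above refers to. */
static void example_flush_and_wait(struct backing_dev_info *bdi,
				   struct bdi_writeback *wb,
				   struct wb_writeback_work *work)
{
	DEFINE_WB_COMPLETION(done, bdi);

	work->done = &done;		/* signalled when the work item finishes */
	wb_queue_work(wb, work);	/* issue the writeback work */
	wb_wait_for_completion(&done);	/* sleep until every queued item is done */
}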
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BSEARCH_H #define _LINUX_BSEARCH_H #include <linux/types.h> static __always_inline void *__inline_bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp) { const char *pivot; int result; while (num > 0) { pivot = base + (num >> 1) * size; result = cmp(key, pivot); if (result == 0) return (void *)pivot; if (result > 0) { base = pivot + size; num--; } num >>= 1; } return NULL; } extern void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp); #endif /* _LINUX_BSEARCH_H */
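A short usage example for the bsearch() interface above: the array must already be sorted by the same criterion the comparison callback implements, and the callback receives the search key as its first argument and the current array element as its second.

#include <linux/bsearch.h>
#include <linux/types.h>

/* cmp_func_t convention: negative/zero/positive like strcmp(), key first. */
static int cmp_u32(const void *key, const void *elt)
{
	u32 k = *(const u32 *)key;
	u32 e = *(const u32 *)elt;

	if (k < e)
		return -1;
	return k > e;
}

static bool example_contains(const u32 *sorted, size_t n, u32 needle)
{
	return bsearch(&needle, sorted, n, sizeof(*sorted), cmp_u32) != NULL;
}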
| 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Filesystem superblock creation and reconfiguration context. * * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _LINUX_FS_CONTEXT_H #define _LINUX_FS_CONTEXT_H #include <linux/kernel.h> #include <linux/refcount.h> #include <linux/errno.h> #include <linux/security.h> #include <linux/mutex.h> struct cred; struct dentry; struct file_operations; struct file_system_type; struct mnt_namespace; struct net; struct pid_namespace; struct super_block; struct user_namespace; struct vfsmount; struct path; enum fs_context_purpose { FS_CONTEXT_FOR_MOUNT, /* New superblock for explicit mount */ FS_CONTEXT_FOR_SUBMOUNT, /* New superblock for automatic submount */ FS_CONTEXT_FOR_RECONFIGURE, /* Superblock reconfiguration (remount) */ }; /* * Userspace usage phase for fsopen/fspick. */ enum fs_context_phase { FS_CONTEXT_CREATE_PARAMS, /* Loading params for sb creation */ FS_CONTEXT_CREATING, /* A superblock is being created */ FS_CONTEXT_AWAITING_MOUNT, /* Superblock created, awaiting fsmount() */ FS_CONTEXT_AWAITING_RECONF, /* Awaiting initialisation for reconfiguration */ FS_CONTEXT_RECONF_PARAMS, /* Loading params for reconfiguration */ FS_CONTEXT_RECONFIGURING, /* Reconfiguring the superblock */ FS_CONTEXT_FAILED, /* Failed to correctly transition a context */ }; /* * Type of parameter value. */ enum fs_value_type { fs_value_is_undefined, fs_value_is_flag, /* Value not given a value */ fs_value_is_string, /* Value is a string */ fs_value_is_blob, /* Value is a binary blob */ fs_value_is_filename, /* Value is a filename* + dirfd */ fs_value_is_file, /* Value is a file* */ }; /* * Configuration parameter. */ struct fs_parameter { const char *key; /* Parameter name */ enum fs_value_type type:8; /* The type of value here */ union { char *string; void *blob; struct filename *name; struct file *file; }; size_t size; int dirfd; }; struct p_log { const char *prefix; struct fc_log *log; }; /* * Filesystem context for holding the parameters used in the creation or * reconfiguration of a superblock. * * Superblock creation fills in ->root whereas reconfiguration begins with this * already set. 
* * See Documentation/filesystems/mount_api.rst */ struct fs_context { const struct fs_context_operations *ops; struct mutex uapi_mutex; /* Userspace access mutex */ struct file_system_type *fs_type; void *fs_private; /* The filesystem's context */ void *sget_key; struct dentry *root; /* The root and superblock */ struct user_namespace *user_ns; /* The user namespace for this mount */ struct net *net_ns; /* The network namespace for this mount */ const struct cred *cred; /* The mounter's credentials */ struct p_log log; /* Logging buffer */ const char *source; /* The source name (eg. dev path) */ void *security; /* LSM options */ void *s_fs_info; /* Proposed s_fs_info */ unsigned int sb_flags; /* Proposed superblock flags (SB_*) */ unsigned int sb_flags_mask; /* Superblock flags that were changed */ unsigned int s_iflags; /* OR'd with sb->s_iflags */ enum fs_context_purpose purpose:8; enum fs_context_phase phase:8; /* The phase the context is in */ bool need_free:1; /* Need to call ops->free() */ bool global:1; /* Goes into &init_user_ns */ bool oldapi:1; /* Coming from mount(2) */ bool exclusive:1; /* create new superblock, reject existing one */ }; struct fs_context_operations { void (*free)(struct fs_context *fc); int (*dup)(struct fs_context *fc, struct fs_context *src_fc); int (*parse_param)(struct fs_context *fc, struct fs_parameter *param); int (*parse_monolithic)(struct fs_context *fc, void *data); int (*get_tree)(struct fs_context *fc); int (*reconfigure)(struct fs_context *fc); }; /* * fs_context manipulation functions. */ extern struct fs_context *fs_context_for_mount(struct file_system_type *fs_type, unsigned int sb_flags); extern struct fs_context *fs_context_for_reconfigure(struct dentry *dentry, unsigned int sb_flags, unsigned int sb_flags_mask); extern struct fs_context *fs_context_for_submount(struct file_system_type *fs_type, struct dentry *reference); extern struct fs_context *vfs_dup_fs_context(struct fs_context *fc); extern int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param); extern int vfs_parse_fs_qstr(struct fs_context *fc, const char *key, const struct qstr *value); static inline int vfs_parse_fs_string(struct fs_context *fc, const char *key, const char *value) { return vfs_parse_fs_qstr(fc, key, value ? 
&QSTR(value) : NULL); } int vfs_parse_monolithic_sep(struct fs_context *fc, void *data, char *(*sep)(char **)); extern int generic_parse_monolithic(struct fs_context *fc, void *data); extern int vfs_get_tree(struct fs_context *fc); extern void put_fs_context(struct fs_context *fc); extern int vfs_parse_fs_param_source(struct fs_context *fc, struct fs_parameter *param); extern void fc_drop_locked(struct fs_context *fc); extern int get_tree_nodev(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)); extern int get_tree_single(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)); extern int get_tree_keyed(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc), void *key); int setup_bdev_super(struct super_block *sb, int sb_flags, struct fs_context *fc); #define GET_TREE_BDEV_QUIET_LOOKUP 0x0001 int get_tree_bdev_flags(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc), unsigned int flags); extern int get_tree_bdev(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)); extern const struct file_operations fscontext_fops; /* * Mount error, warning and informational message logging. This structure is * shareable between a mount and a subordinate mount. */ struct fc_log { refcount_t usage; u8 head; /* Insertion index in buffer[] */ u8 tail; /* Removal index in buffer[] */ u8 need_free; /* Mask of kfree'able items in buffer[] */ struct module *owner; /* Owner module for strings that don't then need freeing */ char *buffer[8]; }; extern __attribute__((format(printf, 4, 5))) void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, ...); #define __logfc(fc, l, fmt, ...) \ logfc((fc)->log.log, NULL, (l), (fmt), ## __VA_ARGS__) #define __plogp(p, prefix, l, fmt, ...) \ logfc((p)->log, (prefix), (l), (fmt), ## __VA_ARGS__) #define __plog(p, l, fmt, ...) __plogp(p, (p)->prefix, l, fmt, ## __VA_ARGS__) /** * infof - Store supplementary informational message * @fc: The context in which to log the informational message * @fmt: The format string * * Store the supplementary informational message for the process if the process * has enabled the facility. */ #define infof(fc, fmt, ...) __logfc(fc, 'i', fmt, ## __VA_ARGS__) #define info_plog(p, fmt, ...) __plog(p, 'i', fmt, ## __VA_ARGS__) #define infofc(fc, fmt, ...) __plog((&(fc)->log), 'i', fmt, ## __VA_ARGS__) #define infofcp(fc, prefix, fmt, ...) \ __plogp((&(fc)->log), prefix, 'i', fmt, ## __VA_ARGS__) /** * warnf - Store supplementary warning message * @fc: The context in which to log the error message * @fmt: The format string * * Store the supplementary warning message for the process if the process has * enabled the facility. */ #define warnf(fc, fmt, ...) __logfc(fc, 'w', fmt, ## __VA_ARGS__) #define warn_plog(p, fmt, ...) __plog(p, 'w', fmt, ## __VA_ARGS__) #define warnfc(fc, fmt, ...) __plog((&(fc)->log), 'w', fmt, ## __VA_ARGS__) #define warnfcp(fc, prefix, fmt, ...) \ __plogp((&(fc)->log), prefix, 'w', fmt, ## __VA_ARGS__) /** * errorf - Store supplementary error message * @fc: The context in which to log the error message * @fmt: The format string * * Store the supplementary error message for the process if the process has * enabled the facility. */ #define errorf(fc, fmt, ...) __logfc(fc, 'e', fmt, ## __VA_ARGS__) #define error_plog(p, fmt, ...) __plog(p, 'e', fmt, ## __VA_ARGS__) #define errorfc(fc, fmt, ...) 
__plog((&(fc)->log), 'e', fmt, ## __VA_ARGS__) #define errorfcp(fc, prefix, fmt, ...) \ __plogp((&(fc)->log), prefix, 'e', fmt, ## __VA_ARGS__) /** * invalf - Store supplementary invalid argument error message * @fc: The context in which to log the error message * @fmt: The format string * * Store the supplementary error message for the process if the process has * enabled the facility and return -EINVAL. */ #define invalf(fc, fmt, ...) (errorf(fc, fmt, ## __VA_ARGS__), -EINVAL) #define inval_plog(p, fmt, ...) (error_plog(p, fmt, ## __VA_ARGS__), -EINVAL) #define invalfc(fc, fmt, ...) (errorfc(fc, fmt, ## __VA_ARGS__), -EINVAL) #define invalfcp(fc, prefix, fmt, ...) \ (errorfcp(fc, prefix, fmt, ## __VA_ARGS__), -EINVAL) #endif /* _LINUX_FS_CONTEXT_H */ |
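In practice the errorfc()/invalfc() helpers above are what a filesystem's ->parse_param() hook uses to push mount-option errors into the fs_context log. The sketch below assumes the fs_parse()/fsparam_u32() helpers from linux/fs_parser.h and a hypothetical "timeout" option; it is illustrative, not a reference implementation.

#include <linux/fs_context.h>
#include <linux/fs_parser.h>

enum { Opt_timeout };

static const struct fs_parameter_spec myfs_param_specs[] = {
	fsparam_u32("timeout", Opt_timeout),
	{}
};

static int myfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
	struct fs_parse_result result;
	int opt;

	opt = fs_parse(fc, myfs_param_specs, param, &result);
	if (opt < 0) {
		/* Logs through fc->log and evaluates to -EINVAL in one step. */
		return invalfc(fc, "Unknown parameter '%s'", param->key);
	}

	switch (opt) {
	case Opt_timeout:
		if (result.uint_32 == 0)
			return invalfc(fc, "timeout must be non-zero");
		/* Store result.uint_32 in the fc->fs_private context here. */
		break;
	}
	return 0;
}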
| 11 13 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM filemap #if !defined(_TRACE_FILEMAP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_FILEMAP_H #include <linux/types.h> #include <linux/tracepoint.h> #include <linux/mm.h> #include <linux/memcontrol.h> #include <linux/device.h> #include <linux/kdev_t.h> #include <linux/errseq.h> DECLARE_EVENT_CLASS(mm_filemap_op_page_cache, TP_PROTO(struct folio *folio), TP_ARGS(folio), TP_STRUCT__entry( __field(u64, i_ino) __field(unsigned long, pfn) __field(unsigned long, index) __field(dev_t, s_dev) __field(unsigned char, order) ), TP_fast_assign( __entry->pfn = folio_pfn(folio); __entry->i_ino = folio->mapping->host->i_ino; __entry->index = folio->index; if (folio->mapping->host->i_sb) __entry->s_dev = folio->mapping->host->i_sb->s_dev; else __entry->s_dev = folio->mapping->host->i_rdev; __entry->order = folio_order(folio); ), TP_printk("dev %d:%d ino %llx pfn=0x%lx ofs=%lu order=%u", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->pfn, __entry->index << PAGE_SHIFT, __entry->order) ); DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_delete_from_page_cache, TP_PROTO(struct folio *folio), TP_ARGS(folio) ); DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_add_to_page_cache, TP_PROTO(struct folio *folio), TP_ARGS(folio) ); DECLARE_EVENT_CLASS(mm_filemap_op_page_cache_range, TP_PROTO( struct address_space *mapping, pgoff_t index, pgoff_t last_index ), TP_ARGS(mapping, index, last_index), TP_STRUCT__entry( __field(u64, i_ino) __field(dev_t, s_dev) __field(unsigned long, index) __field(unsigned long, last_index) ), TP_fast_assign( __entry->i_ino = mapping->host->i_ino; if (mapping->host->i_sb) __entry->s_dev = mapping->host->i_sb->s_dev; else __entry->s_dev = mapping->host->i_rdev; __entry->index = index; __entry->last_index = last_index; ), TP_printk( "dev=%d:%d ino=%llx ofs=%lld-%lld", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, ((loff_t)__entry->index) << PAGE_SHIFT, ((((loff_t)__entry->last_index + 1) << PAGE_SHIFT) - 1) ) ); DEFINE_EVENT(mm_filemap_op_page_cache_range, mm_filemap_get_pages, TP_PROTO( struct address_space *mapping, pgoff_t index, pgoff_t last_index ), TP_ARGS(mapping, index, last_index) ); DEFINE_EVENT(mm_filemap_op_page_cache_range, mm_filemap_map_pages, TP_PROTO( struct address_space *mapping, pgoff_t index, pgoff_t last_index ), TP_ARGS(mapping, index, last_index) ); TRACE_EVENT(mm_filemap_fault, TP_PROTO(struct address_space *mapping, pgoff_t index), TP_ARGS(mapping, index), TP_STRUCT__entry( __field(u64, i_ino) __field(dev_t, s_dev) __field(unsigned long, index) ), TP_fast_assign( __entry->i_ino = mapping->host->i_ino; if (mapping->host->i_sb) __entry->s_dev = mapping->host->i_sb->s_dev; else __entry->s_dev = mapping->host->i_rdev; __entry->index = index; ), 
	TP_printk(
		"dev=%d:%d ino=%llx ofs=%lld",
		MAJOR(__entry->s_dev),
		MINOR(__entry->s_dev),
		__entry->i_ino,
		((loff_t)__entry->index) << PAGE_SHIFT
	)
);

TRACE_EVENT(filemap_set_wb_err,
		TP_PROTO(struct address_space *mapping, errseq_t eseq),

		TP_ARGS(mapping, eseq),

		TP_STRUCT__entry(
			__field(u64, i_ino)
			__field(dev_t, s_dev)
			__field(errseq_t, errseq)
		),

		TP_fast_assign(
			__entry->i_ino = mapping->host->i_ino;
			__entry->errseq = eseq;
			if (mapping->host->i_sb)
				__entry->s_dev = mapping->host->i_sb->s_dev;
			else
				__entry->s_dev = mapping->host->i_rdev;
		),

		TP_printk("dev=%d:%d ino=0x%llx errseq=0x%x",
			MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
			__entry->i_ino, __entry->errseq)
);

TRACE_EVENT(file_check_and_advance_wb_err,
		TP_PROTO(struct file *file, errseq_t old),

		TP_ARGS(file, old),

		TP_STRUCT__entry(
			__field(u64, i_ino)
			__field(struct file *, file)
			__field(dev_t, s_dev)
			__field(errseq_t, old)
			__field(errseq_t, new)
		),

		TP_fast_assign(
			__entry->file = file;
			__entry->i_ino = file->f_mapping->host->i_ino;
			if (file->f_mapping->host->i_sb)
				__entry->s_dev =
					file->f_mapping->host->i_sb->s_dev;
			else
				__entry->s_dev =
					file->f_mapping->host->i_rdev;
			__entry->old = old;
			__entry->new = file->f_wb_err;
		),

		TP_printk("file=%p dev=%d:%d ino=0x%llx old=0x%x new=0x%x",
			__entry->file, MAJOR(__entry->s_dev),
			MINOR(__entry->s_dev), __entry->i_ino, __entry->old,
			__entry->new)
);
#endif /* _TRACE_FILEMAP_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
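A short illustrative aside, not part of the header above: each TRACE_EVENT()/DEFINE_EVENT() in this file expands to a trace_<event>() helper that the page-cache code calls at the matching operation. The sketch below is a hypothetical call site (the function name is made up for illustration and is not the actual mm/filemap.c code); it only shows how the mm_filemap_add_to_page_cache event defined above would be fired:

	/*
	 * Illustrative sketch only: fire the tracepoint for a folio that was
	 * just inserted into the page cache of its mapping.
	 */
	static void example_note_folio_added(struct folio *folio)
	{
		trace_mm_filemap_add_to_page_cache(folio);
	}

Once the event is enabled through tracefs (events/filemap/mm_filemap_add_to_page_cache/enable), each call emits one record rendered by the TP_printk() template above.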
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM fib

#if !defined(_TRACE_FIB_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_FIB_H

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/flow.h>
#include <net/inet_dscp.h>
#include <net/ip_fib.h>
#include <linux/tracepoint.h>

TRACE_EVENT(fib_table_lookup,

	TP_PROTO(u32 tb_id, const struct flowi4 *flp,
		 const struct fib_nh_common *nhc, int err),

	TP_ARGS(tb_id, flp, nhc, err),

	TP_STRUCT__entry(
		__field(u32,	tb_id)
		__field(int,	err)
		__field(int,	oif)
		__field(int,	iif)
		__field(u8,	proto)
		__field(__u8,	tos)
		__field(__u8,	scope)
		__field(__u8,	flags)
		__array(__u8,	src, 4)
		__array(__u8,	dst, 4)
		__array(__u8,	gw4, 4)
		__array(__u8,	gw6, 16)
		__field(u16,	sport)
		__field(u16,	dport)
		__array(char,	name, IFNAMSIZ)
	),

	TP_fast_assign(
		struct net_device *dev;
		struct in6_addr *in6;
		__be32 *p32;

		__entry->tb_id = tb_id;
		__entry->err = err;
		__entry->oif = flp->flowi4_oif;
		__entry->iif = flp->flowi4_iif;
		__entry->tos = inet_dscp_to_dsfield(flp->flowi4_dscp);
		__entry->scope = flp->flowi4_scope;
		__entry->flags = flp->flowi4_flags;

		p32 = (__be32 *) __entry->src;
		*p32 = flp->saddr;

		p32 = (__be32 *) __entry->dst;
		*p32 = flp->daddr;

		__entry->proto = flp->flowi4_proto;
		if (__entry->proto == IPPROTO_TCP ||
		    __entry->proto == IPPROTO_UDP) {
			__entry->sport = ntohs(flp->fl4_sport);
			__entry->dport = ntohs(flp->fl4_dport);
		} else {
			__entry->sport = 0;
			__entry->dport = 0;
		}

		dev = nhc ? nhc->nhc_dev : NULL;
		strscpy(__entry->name, dev ? dev->name : "-", IFNAMSIZ);

		if (nhc) {
			if (nhc->nhc_gw_family == AF_INET) {
				p32 = (__be32 *) __entry->gw4;
				*p32 = nhc->nhc_gw.ipv4;

				in6 = (struct in6_addr *)__entry->gw6;
				*in6 = in6addr_any;
			} else if (nhc->nhc_gw_family == AF_INET6) {
				p32 = (__be32 *) __entry->gw4;
				*p32 = 0;

				in6 = (struct in6_addr *)__entry->gw6;
				*in6 = nhc->nhc_gw.ipv6;
			}
		} else {
			p32 = (__be32 *) __entry->gw4;
			*p32 = 0;

			in6 = (struct in6_addr *)__entry->gw6;
			*in6 = in6addr_any;
		}
	),

	TP_printk("table %u oif %d iif %d proto %u %pI4/%u -> %pI4/%u tos %d scope %d flags %x ==> dev %s gw %pI4/%pI6c err %d",
		  __entry->tb_id, __entry->oif, __entry->iif, __entry->proto,
		  __entry->src, __entry->sport, __entry->dst, __entry->dport,
		  __entry->tos, __entry->scope, __entry->flags,
		  __entry->name, __entry->gw4, __entry->gw6, __entry->err)
);
#endif /* _TRACE_FIB_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
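Likewise, a hedged sketch for the event above: TRACE_EVENT(fib_table_lookup, ...) generates trace_fib_table_lookup() with the TP_PROTO() signature shown, which the IPv4 route lookup path calls with the table id, the flow key, the selected nexthop (or NULL) and the lookup result. The wrapper below is hypothetical and only demonstrates how the generated helper is used:

	/* Illustrative sketch only: report a completed FIB lookup. */
	static void example_report_fib_lookup(u32 tb_id, const struct flowi4 *flp,
					      const struct fib_nh_common *nhc,
					      int err)
	{
		trace_fib_table_lookup(tb_id, flp, nhc, err);
	}

With events/fib/fib_table_lookup enabled in tracefs, each record is formatted by the TP_printk() template above (table, oif/iif, addresses and ports, and the chosen device/gateway).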
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Add configfs and memory store: Kyungchan Koh <kkc6196@fb.com> and
 * Shaohua Li <shli@fb.com>
 */
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/init.h>
#include "null_blk.h"

#undef pr_fmt
#define pr_fmt(fmt)	"null_blk: " fmt

#define FREE_BATCH		16

#define TICKS_PER_SEC		50ULL
#define TIMER_INTERVAL		(NSEC_PER_SEC / TICKS_PER_SEC)

#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
static DECLARE_FAULT_ATTR(null_timeout_attr);
static DECLARE_FAULT_ATTR(null_requeue_attr);
static
DECLARE_FAULT_ATTR(null_init_hctx_attr); #endif static inline u64 mb_per_tick(int mbps) { return (1 << 20) / TICKS_PER_SEC * ((u64) mbps); } /* * Status flags for nullb_device. * * CONFIGURED: Device has been configured and turned on. Cannot reconfigure. * UP: Device is currently on and visible in userspace. * THROTTLED: Device is being throttled. * CACHE: Device is using a write-back cache. */ enum nullb_device_flags { NULLB_DEV_FL_CONFIGURED = 0, NULLB_DEV_FL_UP = 1, NULLB_DEV_FL_THROTTLED = 2, NULLB_DEV_FL_CACHE = 3, }; #define MAP_SZ ((PAGE_SIZE >> SECTOR_SHIFT) + 2) /* * nullb_page is a page in memory for nullb devices. * * @page: The page holding the data. * @bitmap: The bitmap represents which sector in the page has data. * Each bit represents one block size. For example, sector 8 * will use the 7th bit * The highest 2 bits of bitmap are for special purpose. LOCK means the cache * page is being flushing to storage. FREE means the cache page is freed and * should be skipped from flushing to storage. Please see * null_make_cache_space */ struct nullb_page { struct page *page; DECLARE_BITMAP(bitmap, MAP_SZ); }; #define NULLB_PAGE_LOCK (MAP_SZ - 1) #define NULLB_PAGE_FREE (MAP_SZ - 2) static LIST_HEAD(nullb_list); static struct mutex lock; static int null_major; static DEFINE_IDA(nullb_indexes); static struct blk_mq_tag_set tag_set; enum { NULL_IRQ_NONE = 0, NULL_IRQ_SOFTIRQ = 1, NULL_IRQ_TIMER = 2, }; static bool g_virt_boundary; module_param_named(virt_boundary, g_virt_boundary, bool, 0444); MODULE_PARM_DESC(virt_boundary, "Require a virtual boundary for the device. Default: False"); static int g_no_sched; module_param_named(no_sched, g_no_sched, int, 0444); MODULE_PARM_DESC(no_sched, "No io scheduler"); static int g_submit_queues = 1; module_param_named(submit_queues, g_submit_queues, int, 0444); MODULE_PARM_DESC(submit_queues, "Number of submission queues"); static int g_poll_queues = 1; module_param_named(poll_queues, g_poll_queues, int, 0444); MODULE_PARM_DESC(poll_queues, "Number of IOPOLL submission queues"); static int g_home_node = NUMA_NO_NODE; module_param_named(home_node, g_home_node, int, 0444); MODULE_PARM_DESC(home_node, "Home node for the device"); #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION /* * For more details about fault injection, please refer to * Documentation/fault-injection/fault-injection.rst. */ static char g_timeout_str[80]; module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), 0444); MODULE_PARM_DESC(timeout, "Fault injection. timeout=<interval>,<probability>,<space>,<times>"); static char g_requeue_str[80]; module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), 0444); MODULE_PARM_DESC(requeue, "Fault injection. requeue=<interval>,<probability>,<space>,<times>"); static char g_init_hctx_str[80]; module_param_string(init_hctx, g_init_hctx_str, sizeof(g_init_hctx_str), 0444); MODULE_PARM_DESC(init_hctx, "Fault injection to fail hctx init. init_hctx=<interval>,<probability>,<space>,<times>"); #endif /* * Historic queue modes. * * These days nothing but NULL_Q_MQ is actually supported, but we keep it the * enum for error reporting. 
*/ enum { NULL_Q_BIO = 0, NULL_Q_RQ = 1, NULL_Q_MQ = 2, }; static int g_queue_mode = NULL_Q_MQ; static int null_param_store_val(const char *str, int *val, int min, int max) { int ret, new_val; ret = kstrtoint(str, 10, &new_val); if (ret) return -EINVAL; if (new_val < min || new_val > max) return -EINVAL; *val = new_val; return 0; } static int null_set_queue_mode(const char *str, const struct kernel_param *kp) { return null_param_store_val(str, &g_queue_mode, NULL_Q_BIO, NULL_Q_MQ); } static const struct kernel_param_ops null_queue_mode_param_ops = { .set = null_set_queue_mode, .get = param_get_int, }; device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, 0444); MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)"); static int g_gb = 250; module_param_named(gb, g_gb, int, 0444); MODULE_PARM_DESC(gb, "Size in GB"); static int g_bs = 512; module_param_named(bs, g_bs, int, 0444); MODULE_PARM_DESC(bs, "Block size (in bytes)"); static int g_max_sectors; module_param_named(max_sectors, g_max_sectors, int, 0444); MODULE_PARM_DESC(max_sectors, "Maximum size of a command (in 512B sectors)"); static unsigned int nr_devices = 1; module_param(nr_devices, uint, 0444); MODULE_PARM_DESC(nr_devices, "Number of devices to register"); static bool g_blocking; module_param_named(blocking, g_blocking, bool, 0444); MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); static bool g_shared_tags; module_param_named(shared_tags, g_shared_tags, bool, 0444); MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq"); static bool g_shared_tag_bitmap; module_param_named(shared_tag_bitmap, g_shared_tag_bitmap, bool, 0444); MODULE_PARM_DESC(shared_tag_bitmap, "Use shared tag bitmap for all submission queues for blk-mq"); static int g_irqmode = NULL_IRQ_SOFTIRQ; static int null_set_irqmode(const char *str, const struct kernel_param *kp) { return null_param_store_val(str, &g_irqmode, NULL_IRQ_NONE, NULL_IRQ_TIMER); } static const struct kernel_param_ops null_irqmode_param_ops = { .set = null_set_irqmode, .get = param_get_int, }; device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, 0444); MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer"); static unsigned long g_completion_nsec = 10000; module_param_named(completion_nsec, g_completion_nsec, ulong, 0444); MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns"); static int g_hw_queue_depth = 64; module_param_named(hw_queue_depth, g_hw_queue_depth, int, 0444); MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64"); static bool g_use_per_node_hctx; module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444); MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false"); static bool g_memory_backed; module_param_named(memory_backed, g_memory_backed, bool, 0444); MODULE_PARM_DESC(memory_backed, "Create a memory-backed block device. Default: false"); static bool g_discard; module_param_named(discard, g_discard, bool, 0444); MODULE_PARM_DESC(discard, "Support discard operations (requires memory-backed null_blk device). Default: false"); static unsigned long g_cache_size; module_param_named(cache_size, g_cache_size, ulong, 0444); MODULE_PARM_DESC(cache_size, "Cache size in MiB for memory-backed device. 
Default: 0 (none)"); static bool g_fua = true; module_param_named(fua, g_fua, bool, 0444); MODULE_PARM_DESC(fua, "Enable/disable FUA support when cache_size is used. Default: true"); static unsigned int g_mbps; module_param_named(mbps, g_mbps, uint, 0444); MODULE_PARM_DESC(mbps, "Limit maximum bandwidth (in MiB/s). Default: 0 (no limit)"); static bool g_zoned; module_param_named(zoned, g_zoned, bool, S_IRUGO); MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false"); static unsigned long g_zone_size = 256; module_param_named(zone_size, g_zone_size, ulong, S_IRUGO); MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256"); static unsigned long g_zone_capacity; module_param_named(zone_capacity, g_zone_capacity, ulong, 0444); MODULE_PARM_DESC(zone_capacity, "Zone capacity in MB when block device is zoned. Can be less than or equal to zone size. Default: Zone size"); static unsigned int g_zone_nr_conv; module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444); MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0"); static unsigned int g_zone_max_open; module_param_named(zone_max_open, g_zone_max_open, uint, 0444); MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones when block device is zoned. Default: 0 (no limit)"); static unsigned int g_zone_max_active; module_param_named(zone_max_active, g_zone_max_active, uint, 0444); MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)"); static int g_zone_append_max_sectors = INT_MAX; module_param_named(zone_append_max_sectors, g_zone_append_max_sectors, int, 0444); MODULE_PARM_DESC(zone_append_max_sectors, "Maximum size of a zone append command (in 512B sectors). Specify 0 for zone append emulation"); static bool g_zone_full; module_param_named(zone_full, g_zone_full, bool, S_IRUGO); MODULE_PARM_DESC(zone_full, "Initialize the sequential write required zones of a zoned device to be full. Default: false"); static bool g_rotational; module_param_named(rotational, g_rotational, bool, S_IRUGO); MODULE_PARM_DESC(rotational, "Set the rotational feature for the device. Default: false"); static struct nullb_device *null_alloc_dev(void); static void null_free_dev(struct nullb_device *dev); static void null_del_dev(struct nullb *nullb); static int null_add_dev(struct nullb_device *dev); static struct nullb *null_find_dev_by_name(const char *name); static void null_free_device_storage(struct nullb_device *dev, bool is_cache); static inline struct nullb_device *to_nullb_device(struct config_item *item) { return item ? 
container_of(to_config_group(item), struct nullb_device, group) : NULL; } static inline ssize_t nullb_device_uint_attr_show(unsigned int val, char *page) { return snprintf(page, PAGE_SIZE, "%u\n", val); } static inline ssize_t nullb_device_ulong_attr_show(unsigned long val, char *page) { return snprintf(page, PAGE_SIZE, "%lu\n", val); } static inline ssize_t nullb_device_bool_attr_show(bool val, char *page) { return snprintf(page, PAGE_SIZE, "%u\n", val); } static ssize_t nullb_device_uint_attr_store(unsigned int *val, const char *page, size_t count) { unsigned int tmp; int result; result = kstrtouint(page, 0, &tmp); if (result < 0) return result; *val = tmp; return count; } static ssize_t nullb_device_ulong_attr_store(unsigned long *val, const char *page, size_t count) { int result; unsigned long tmp; result = kstrtoul(page, 0, &tmp); if (result < 0) return result; *val = tmp; return count; } static ssize_t nullb_device_bool_attr_store(bool *val, const char *page, size_t count) { bool tmp; int result; result = kstrtobool(page, &tmp); if (result < 0) return result; *val = tmp; return count; } /* The following macro should only be used with TYPE = {uint, ulong, bool}. */ #define NULLB_DEVICE_ATTR(NAME, TYPE, APPLY) \ static ssize_t \ nullb_device_##NAME##_show(struct config_item *item, char *page) \ { \ return nullb_device_##TYPE##_attr_show( \ to_nullb_device(item)->NAME, page); \ } \ static ssize_t \ nullb_device_##NAME##_store(struct config_item *item, const char *page, \ size_t count) \ { \ int (*apply_fn)(struct nullb_device *dev, TYPE new_value) = APPLY;\ struct nullb_device *dev = to_nullb_device(item); \ TYPE new_value = 0; \ int ret; \ \ ret = nullb_device_##TYPE##_attr_store(&new_value, page, count);\ if (ret < 0) \ return ret; \ if (apply_fn) \ ret = apply_fn(dev, new_value); \ else if (test_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags)) \ ret = -EBUSY; \ if (ret < 0) \ return ret; \ dev->NAME = new_value; \ return count; \ } \ CONFIGFS_ATTR(nullb_device_, NAME); static int nullb_update_nr_hw_queues(struct nullb_device *dev, unsigned int submit_queues, unsigned int poll_queues) { struct blk_mq_tag_set *set; int ret, nr_hw_queues; if (!dev->nullb) return 0; /* * Make sure at least one submit queue exists. */ if (!submit_queues) return -EINVAL; /* * Make sure that null_init_hctx() does not access nullb->queues[] past * the end of that array. */ if (submit_queues > nr_cpu_ids || poll_queues > g_poll_queues) return -EINVAL; /* * Keep previous and new queue numbers in nullb_device for reference in * the call back function null_map_queues(). */ dev->prev_submit_queues = dev->submit_queues; dev->prev_poll_queues = dev->poll_queues; dev->submit_queues = submit_queues; dev->poll_queues = poll_queues; set = dev->nullb->tag_set; nr_hw_queues = submit_queues + poll_queues; blk_mq_update_nr_hw_queues(set, nr_hw_queues); ret = set->nr_hw_queues == nr_hw_queues ? 
0 : -ENOMEM; if (ret) { /* on error, revert the queue numbers */ dev->submit_queues = dev->prev_submit_queues; dev->poll_queues = dev->prev_poll_queues; } return ret; } static int nullb_apply_submit_queues(struct nullb_device *dev, unsigned int submit_queues) { int ret; mutex_lock(&lock); ret = nullb_update_nr_hw_queues(dev, submit_queues, dev->poll_queues); mutex_unlock(&lock); return ret; } static int nullb_apply_poll_queues(struct nullb_device *dev, unsigned int poll_queues) { int ret; mutex_lock(&lock); ret = nullb_update_nr_hw_queues(dev, dev->submit_queues, poll_queues); mutex_unlock(&lock); return ret; } NULLB_DEVICE_ATTR(size, ulong, NULL); NULLB_DEVICE_ATTR(completion_nsec, ulong, NULL); NULLB_DEVICE_ATTR(submit_queues, uint, nullb_apply_submit_queues); NULLB_DEVICE_ATTR(poll_queues, uint, nullb_apply_poll_queues); NULLB_DEVICE_ATTR(home_node, uint, NULL); NULLB_DEVICE_ATTR(queue_mode, uint, NULL); NULLB_DEVICE_ATTR(blocksize, uint, NULL); NULLB_DEVICE_ATTR(max_sectors, uint, NULL); NULLB_DEVICE_ATTR(irqmode, uint, NULL); NULLB_DEVICE_ATTR(hw_queue_depth, uint, NULL); NULLB_DEVICE_ATTR(index, uint, NULL); NULLB_DEVICE_ATTR(blocking, bool, NULL); NULLB_DEVICE_ATTR(use_per_node_hctx, bool, NULL); NULLB_DEVICE_ATTR(memory_backed, bool, NULL); NULLB_DEVICE_ATTR(discard, bool, NULL); NULLB_DEVICE_ATTR(mbps, uint, NULL); NULLB_DEVICE_ATTR(cache_size, ulong, NULL); NULLB_DEVICE_ATTR(zoned, bool, NULL); NULLB_DEVICE_ATTR(zone_size, ulong, NULL); NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL); NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); NULLB_DEVICE_ATTR(zone_max_open, uint, NULL); NULLB_DEVICE_ATTR(zone_max_active, uint, NULL); NULLB_DEVICE_ATTR(zone_append_max_sectors, uint, NULL); NULLB_DEVICE_ATTR(zone_full, bool, NULL); NULLB_DEVICE_ATTR(virt_boundary, bool, NULL); NULLB_DEVICE_ATTR(no_sched, bool, NULL); NULLB_DEVICE_ATTR(shared_tags, bool, NULL); NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL); NULLB_DEVICE_ATTR(fua, bool, NULL); NULLB_DEVICE_ATTR(rotational, bool, NULL); NULLB_DEVICE_ATTR(badblocks_once, bool, NULL); NULLB_DEVICE_ATTR(badblocks_partial_io, bool, NULL); static ssize_t nullb_device_power_show(struct config_item *item, char *page) { return nullb_device_bool_attr_show(to_nullb_device(item)->power, page); } static ssize_t nullb_device_power_store(struct config_item *item, const char *page, size_t count) { struct nullb_device *dev = to_nullb_device(item); bool newp = false; ssize_t ret; ret = nullb_device_bool_attr_store(&newp, page, count); if (ret < 0) return ret; ret = count; mutex_lock(&lock); if (!dev->power && newp) { if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags)) goto out; ret = null_add_dev(dev); if (ret) { clear_bit(NULLB_DEV_FL_UP, &dev->flags); goto out; } set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); dev->power = newp; ret = count; } else if (dev->power && !newp) { if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { dev->power = newp; null_del_dev(dev->nullb); } clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); } out: mutex_unlock(&lock); return ret; } CONFIGFS_ATTR(nullb_device_, power); static ssize_t nullb_device_badblocks_show(struct config_item *item, char *page) { struct nullb_device *t_dev = to_nullb_device(item); return badblocks_show(&t_dev->badblocks, page, 0); } static ssize_t nullb_device_badblocks_store(struct config_item *item, const char *page, size_t count) { struct nullb_device *t_dev = to_nullb_device(item); char *orig, *buf, *tmp; u64 start, end; int ret; orig = kstrndup(page, count, GFP_KERNEL); if (!orig) return -ENOMEM; buf = 
strstrip(orig); ret = -EINVAL; if (buf[0] != '+' && buf[0] != '-') goto out; tmp = strchr(&buf[1], '-'); if (!tmp) goto out; *tmp = '\0'; ret = kstrtoull(buf + 1, 0, &start); if (ret) goto out; ret = kstrtoull(tmp + 1, 0, &end); if (ret) goto out; ret = -EINVAL; if (start > end) goto out; /* enable badblocks */ cmpxchg(&t_dev->badblocks.shift, -1, 0); if (buf[0] == '+') { if (badblocks_set(&t_dev->badblocks, start, end - start + 1, 1)) ret = count; } else if (badblocks_clear(&t_dev->badblocks, start, end - start + 1)) { ret = count; } out: kfree(orig); return ret; } CONFIGFS_ATTR(nullb_device_, badblocks); static ssize_t nullb_device_zone_readonly_store(struct config_item *item, const char *page, size_t count) { struct nullb_device *dev = to_nullb_device(item); return zone_cond_store(dev, page, count, BLK_ZONE_COND_READONLY); } CONFIGFS_ATTR_WO(nullb_device_, zone_readonly); static ssize_t nullb_device_zone_offline_store(struct config_item *item, const char *page, size_t count) { struct nullb_device *dev = to_nullb_device(item); return zone_cond_store(dev, page, count, BLK_ZONE_COND_OFFLINE); } CONFIGFS_ATTR_WO(nullb_device_, zone_offline); static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_badblocks, &nullb_device_attr_badblocks_once, &nullb_device_attr_badblocks_partial_io, &nullb_device_attr_blocking, &nullb_device_attr_blocksize, &nullb_device_attr_cache_size, &nullb_device_attr_completion_nsec, &nullb_device_attr_discard, &nullb_device_attr_fua, &nullb_device_attr_home_node, &nullb_device_attr_hw_queue_depth, &nullb_device_attr_index, &nullb_device_attr_irqmode, &nullb_device_attr_max_sectors, &nullb_device_attr_mbps, &nullb_device_attr_memory_backed, &nullb_device_attr_no_sched, &nullb_device_attr_poll_queues, &nullb_device_attr_power, &nullb_device_attr_queue_mode, &nullb_device_attr_rotational, &nullb_device_attr_shared_tag_bitmap, &nullb_device_attr_shared_tags, &nullb_device_attr_size, &nullb_device_attr_submit_queues, &nullb_device_attr_use_per_node_hctx, &nullb_device_attr_virt_boundary, &nullb_device_attr_zone_append_max_sectors, &nullb_device_attr_zone_capacity, &nullb_device_attr_zone_full, &nullb_device_attr_zone_max_active, &nullb_device_attr_zone_max_open, &nullb_device_attr_zone_nr_conv, &nullb_device_attr_zone_offline, &nullb_device_attr_zone_readonly, &nullb_device_attr_zone_size, &nullb_device_attr_zoned, NULL, }; static void nullb_device_release(struct config_item *item) { struct nullb_device *dev = to_nullb_device(item); null_free_device_storage(dev, false); null_free_dev(dev); } static const struct configfs_item_operations nullb_device_ops = { .release = nullb_device_release, }; static const struct config_item_type nullb_device_type = { .ct_item_ops = &nullb_device_ops, .ct_attrs = nullb_device_attrs, .ct_owner = THIS_MODULE, }; #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION static void nullb_add_fault_config(struct nullb_device *dev) { fault_config_init(&dev->timeout_config, "timeout_inject"); fault_config_init(&dev->requeue_config, "requeue_inject"); fault_config_init(&dev->init_hctx_fault_config, "init_hctx_fault_inject"); configfs_add_default_group(&dev->timeout_config.group, &dev->group); configfs_add_default_group(&dev->requeue_config.group, &dev->group); configfs_add_default_group(&dev->init_hctx_fault_config.group, &dev->group); } static void nullb_del_fault_config(struct nullb_device *dev) { config_item_put(&dev->init_hctx_fault_config.group.cg_item); config_item_put(&dev->requeue_config.group.cg_item); 
config_item_put(&dev->timeout_config.group.cg_item); } #else static void nullb_add_fault_config(struct nullb_device *dev) { } static void nullb_del_fault_config(struct nullb_device *dev) { } #endif static struct config_group *nullb_group_make_group(struct config_group *group, const char *name) { struct nullb_device *dev; if (null_find_dev_by_name(name)) return ERR_PTR(-EEXIST); dev = null_alloc_dev(); if (!dev) return ERR_PTR(-ENOMEM); config_group_init_type_name(&dev->group, name, &nullb_device_type); nullb_add_fault_config(dev); return &dev->group; } static void nullb_group_drop_item(struct config_group *group, struct config_item *item) { struct nullb_device *dev = to_nullb_device(item); if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { mutex_lock(&lock); dev->power = false; null_del_dev(dev->nullb); mutex_unlock(&lock); } nullb_del_fault_config(dev); config_item_put(item); } static ssize_t memb_group_features_show(struct config_item *item, char *page) { struct configfs_attribute **entry; char delimiter = ','; size_t left = PAGE_SIZE; size_t written = 0; int ret; for (entry = &nullb_device_attrs[0]; *entry && left > 0; entry++) { if (!*(entry + 1)) delimiter = '\n'; ret = snprintf(page + written, left, "%s%c", (*entry)->ca_name, delimiter); if (ret >= left) { WARN_ONCE(1, "Too many null_blk features to print\n"); memzero_explicit(page, PAGE_SIZE); return -ENOBUFS; } left -= ret; written += ret; } return written; } CONFIGFS_ATTR_RO(memb_group_, features); static struct configfs_attribute *nullb_group_attrs[] = { &memb_group_attr_features, NULL, }; static const struct configfs_group_operations nullb_group_ops = { .make_group = nullb_group_make_group, .drop_item = nullb_group_drop_item, }; static const struct config_item_type nullb_group_type = { .ct_group_ops = &nullb_group_ops, .ct_attrs = nullb_group_attrs, .ct_owner = THIS_MODULE, }; static struct configfs_subsystem nullb_subsys = { .su_group = { .cg_item = { .ci_namebuf = "nullb", .ci_type = &nullb_group_type, }, }, }; static inline int null_cache_active(struct nullb *nullb) { return test_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); } static struct nullb_device *null_alloc_dev(void) { struct nullb_device *dev; dev = kzalloc_obj(*dev); if (!dev) return NULL; #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION dev->timeout_config.attr = null_timeout_attr; dev->requeue_config.attr = null_requeue_attr; dev->init_hctx_fault_config.attr = null_init_hctx_attr; #endif INIT_RADIX_TREE(&dev->data, GFP_ATOMIC); INIT_RADIX_TREE(&dev->cache, GFP_ATOMIC); if (badblocks_init(&dev->badblocks, 0)) { kfree(dev); return NULL; } dev->size = g_gb * 1024; dev->completion_nsec = g_completion_nsec; dev->submit_queues = g_submit_queues; dev->prev_submit_queues = g_submit_queues; dev->poll_queues = g_poll_queues; dev->prev_poll_queues = g_poll_queues; dev->home_node = g_home_node; dev->queue_mode = g_queue_mode; dev->blocksize = g_bs; dev->max_sectors = g_max_sectors; dev->irqmode = g_irqmode; dev->hw_queue_depth = g_hw_queue_depth; dev->blocking = g_blocking; dev->memory_backed = g_memory_backed; dev->discard = g_discard; dev->cache_size = g_cache_size; dev->mbps = g_mbps; dev->use_per_node_hctx = g_use_per_node_hctx; dev->zoned = g_zoned; dev->zone_size = g_zone_size; dev->zone_capacity = g_zone_capacity; dev->zone_nr_conv = g_zone_nr_conv; dev->zone_max_open = g_zone_max_open; dev->zone_max_active = g_zone_max_active; dev->zone_append_max_sectors = g_zone_append_max_sectors; dev->zone_full = g_zone_full; dev->virt_boundary = g_virt_boundary; dev->no_sched = 
g_no_sched; dev->shared_tags = g_shared_tags; dev->shared_tag_bitmap = g_shared_tag_bitmap; dev->fua = g_fua; dev->rotational = g_rotational; return dev; } static void null_free_dev(struct nullb_device *dev) { if (!dev) return; null_free_zoned_dev(dev); badblocks_exit(&dev->badblocks); kfree(dev); } static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) { struct nullb_cmd *cmd = container_of(timer, struct nullb_cmd, timer); blk_mq_end_request(blk_mq_rq_from_pdu(cmd), cmd->error); return HRTIMER_NORESTART; } static void null_cmd_end_timer(struct nullb_cmd *cmd) { ktime_t kt = cmd->nq->dev->completion_nsec; hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL); } static void null_complete_rq(struct request *rq) { struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); blk_mq_end_request(rq, cmd->error); } static struct nullb_page *null_alloc_page(void) { struct nullb_page *t_page; t_page = kmalloc_obj(struct nullb_page, GFP_NOIO); if (!t_page) return NULL; t_page->page = alloc_pages(GFP_NOIO, 0); if (!t_page->page) { kfree(t_page); return NULL; } memset(t_page->bitmap, 0, sizeof(t_page->bitmap)); return t_page; } static void null_free_page(struct nullb_page *t_page) { __set_bit(NULLB_PAGE_FREE, t_page->bitmap); if (test_bit(NULLB_PAGE_LOCK, t_page->bitmap)) return; __free_page(t_page->page); kfree(t_page); } static bool null_page_empty(struct nullb_page *page) { int size = MAP_SZ - 2; return find_first_bit(page->bitmap, size) == size; } static void null_free_sector(struct nullb *nullb, sector_t sector, bool is_cache) { unsigned int sector_bit; u64 idx; struct nullb_page *t_page, *ret; struct radix_tree_root *root; root = is_cache ? &nullb->dev->cache : &nullb->dev->data; idx = sector >> PAGE_SECTORS_SHIFT; sector_bit = (sector & SECTOR_MASK); t_page = radix_tree_lookup(root, idx); if (t_page) { __clear_bit(sector_bit, t_page->bitmap); if (null_page_empty(t_page)) { ret = radix_tree_delete_item(root, idx, t_page); WARN_ON(ret != t_page); null_free_page(ret); if (is_cache) nullb->dev->curr_cache -= PAGE_SIZE; } } } static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx, struct nullb_page *t_page, bool is_cache) { struct radix_tree_root *root; root = is_cache ? &nullb->dev->cache : &nullb->dev->data; if (radix_tree_insert(root, idx, t_page)) { null_free_page(t_page); t_page = radix_tree_lookup(root, idx); WARN_ON(!t_page || t_page->page->private != idx); } else if (is_cache) nullb->dev->curr_cache += PAGE_SIZE; return t_page; } static void null_free_device_storage(struct nullb_device *dev, bool is_cache) { unsigned long pos = 0; int nr_pages; struct nullb_page *ret, *t_pages[FREE_BATCH]; struct radix_tree_root *root; root = is_cache ? &dev->cache : &dev->data; do { int i; nr_pages = radix_tree_gang_lookup(root, (void **)t_pages, pos, FREE_BATCH); for (i = 0; i < nr_pages; i++) { pos = t_pages[i]->page->private; ret = radix_tree_delete_item(root, pos, t_pages[i]); WARN_ON(ret != t_pages[i]); null_free_page(ret); } pos++; } while (nr_pages == FREE_BATCH); if (is_cache) dev->curr_cache = 0; } static struct nullb_page *__null_lookup_page(struct nullb *nullb, sector_t sector, bool for_write, bool is_cache) { unsigned int sector_bit; u64 idx; struct nullb_page *t_page; struct radix_tree_root *root; idx = sector >> PAGE_SECTORS_SHIFT; sector_bit = (sector & SECTOR_MASK); root = is_cache ? 
&nullb->dev->cache : &nullb->dev->data; t_page = radix_tree_lookup(root, idx); WARN_ON(t_page && t_page->page->private != idx); if (t_page && (for_write || test_bit(sector_bit, t_page->bitmap))) return t_page; return NULL; } static struct nullb_page *null_lookup_page(struct nullb *nullb, sector_t sector, bool for_write, bool ignore_cache) { struct nullb_page *page = NULL; if (!ignore_cache) page = __null_lookup_page(nullb, sector, for_write, true); if (page) return page; return __null_lookup_page(nullb, sector, for_write, false); } static struct nullb_page *null_insert_page(struct nullb *nullb, sector_t sector, bool ignore_cache) __releases(&nullb->lock) __acquires(&nullb->lock) { u64 idx; struct nullb_page *t_page; t_page = null_lookup_page(nullb, sector, true, ignore_cache); if (t_page) return t_page; spin_unlock_irq(&nullb->lock); t_page = null_alloc_page(); if (!t_page) goto out_lock; if (radix_tree_preload(GFP_NOIO)) goto out_freepage; spin_lock_irq(&nullb->lock); idx = sector >> PAGE_SECTORS_SHIFT; t_page->page->private = idx; t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache); radix_tree_preload_end(); return t_page; out_freepage: null_free_page(t_page); out_lock: spin_lock_irq(&nullb->lock); return null_lookup_page(nullb, sector, true, ignore_cache); } static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page) { int i; unsigned int offset; u64 idx; struct nullb_page *t_page, *ret; void *dst, *src; idx = c_page->page->private; t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true); __clear_bit(NULLB_PAGE_LOCK, c_page->bitmap); if (test_bit(NULLB_PAGE_FREE, c_page->bitmap)) { null_free_page(c_page); if (t_page && null_page_empty(t_page)) { ret = radix_tree_delete_item(&nullb->dev->data, idx, t_page); null_free_page(t_page); } return 0; } if (!t_page) return -ENOMEM; src = kmap_local_page(c_page->page); dst = kmap_local_page(t_page->page); for (i = 0; i < PAGE_SECTORS; i += (nullb->dev->blocksize >> SECTOR_SHIFT)) { if (test_bit(i, c_page->bitmap)) { offset = (i << SECTOR_SHIFT); memcpy(dst + offset, src + offset, nullb->dev->blocksize); __set_bit(i, t_page->bitmap); } } kunmap_local(dst); kunmap_local(src); ret = radix_tree_delete_item(&nullb->dev->cache, idx, c_page); null_free_page(ret); nullb->dev->curr_cache -= PAGE_SIZE; return 0; } static int null_make_cache_space(struct nullb *nullb, unsigned long n) { int i, err, nr_pages; struct nullb_page *c_pages[FREE_BATCH]; unsigned long flushed = 0, one_round; again: if ((nullb->dev->cache_size * 1024 * 1024) > nullb->dev->curr_cache + n || nullb->dev->curr_cache == 0) return 0; nr_pages = radix_tree_gang_lookup(&nullb->dev->cache, (void **)c_pages, nullb->cache_flush_pos, FREE_BATCH); /* * nullb_flush_cache_page could unlock before using the c_pages. 
To * avoid race, we don't allow page free */ for (i = 0; i < nr_pages; i++) { nullb->cache_flush_pos = c_pages[i]->page->private; /* * We found the page which is being flushed to disk by other * threads */ if (test_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap)) c_pages[i] = NULL; else __set_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap); } one_round = 0; for (i = 0; i < nr_pages; i++) { if (c_pages[i] == NULL) continue; err = null_flush_cache_page(nullb, c_pages[i]); if (err) return err; one_round++; } flushed += one_round << PAGE_SHIFT; if (n > flushed) { if (nr_pages == 0) nullb->cache_flush_pos = 0; if (one_round == 0) { /* give other threads a chance */ spin_unlock_irq(&nullb->lock); spin_lock_irq(&nullb->lock); } goto again; } return 0; } static blk_status_t copy_to_nullb(struct nullb *nullb, void *source, loff_t pos, size_t n, bool is_fua) { size_t temp, count = 0; struct nullb_page *t_page; sector_t sector; while (count < n) { temp = min3(nullb->dev->blocksize, n - count, PAGE_SIZE - offset_in_page(pos)); sector = pos >> SECTOR_SHIFT; if (null_cache_active(nullb) && !is_fua) null_make_cache_space(nullb, PAGE_SIZE); t_page = null_insert_page(nullb, sector, !null_cache_active(nullb) || is_fua); if (!t_page) return BLK_STS_NOSPC; memcpy_to_page(t_page->page, offset_in_page(pos), source + count, temp); __set_bit(sector & SECTOR_MASK, t_page->bitmap); if (is_fua) null_free_sector(nullb, sector, true); count += temp; pos += temp; } return BLK_STS_OK; } static void copy_from_nullb(struct nullb *nullb, void *dest, loff_t pos, size_t n) { size_t temp, count = 0; struct nullb_page *t_page; sector_t sector; while (count < n) { temp = min3(nullb->dev->blocksize, n - count, PAGE_SIZE - offset_in_page(pos)); sector = pos >> SECTOR_SHIFT; t_page = null_lookup_page(nullb, sector, false, !null_cache_active(nullb)); if (t_page) memcpy_from_page(dest + count, t_page->page, offset_in_page(pos), temp); else memset(dest + count, 0, temp); count += temp; pos += temp; } } blk_status_t null_handle_discard(struct nullb_device *dev, sector_t sector, sector_t nr_sectors) { struct nullb *nullb = dev->nullb; size_t n = nr_sectors << SECTOR_SHIFT; size_t temp; spin_lock_irq(&nullb->lock); while (n > 0) { temp = min_t(size_t, n, dev->blocksize); null_free_sector(nullb, sector, false); if (null_cache_active(nullb)) null_free_sector(nullb, sector, true); sector += temp >> SECTOR_SHIFT; n -= temp; } spin_unlock_irq(&nullb->lock); return BLK_STS_OK; } static blk_status_t null_handle_flush(struct nullb *nullb) { int err; if (!null_cache_active(nullb)) return 0; spin_lock_irq(&nullb->lock); while (true) { err = null_make_cache_space(nullb, nullb->dev->cache_size * 1024 * 1024); if (err || nullb->dev->curr_cache == 0) break; } WARN_ON(!radix_tree_empty(&nullb->dev->cache)); spin_unlock_irq(&nullb->lock); return errno_to_blk_status(err); } static blk_status_t null_transfer(struct nullb *nullb, struct page *page, unsigned int len, unsigned int off, bool is_write, loff_t pos, bool is_fua) { struct nullb_device *dev = nullb->dev; blk_status_t err = BLK_STS_OK; unsigned int valid_len = len; void *p; p = kmap_local_page(page) + off; if (!is_write) { if (dev->zoned) { valid_len = null_zone_valid_read_len(nullb, pos >> SECTOR_SHIFT, len); if (valid_len && valid_len != len) valid_len -= pos & (SECTOR_SIZE - 1); } if (valid_len) { copy_from_nullb(nullb, p, pos, valid_len); off += valid_len; len -= valid_len; } if (len) memset(p + valid_len, 0xff, len); flush_dcache_page(page); } else { flush_dcache_page(page); err = copy_to_nullb(nullb, p, 
pos, len, is_fua); } kunmap_local(p); return err; } /* * Transfer data for the given request. The transfer size is capped with the * nr_sectors argument. */ static blk_status_t null_handle_data_transfer(struct nullb_cmd *cmd, sector_t nr_sectors) { struct request *rq = blk_mq_rq_from_pdu(cmd); struct nullb *nullb = cmd->nq->dev->nullb; blk_status_t err = BLK_STS_OK; unsigned int len; loff_t pos = blk_rq_pos(rq) << SECTOR_SHIFT; unsigned int max_bytes = nr_sectors << SECTOR_SHIFT; unsigned int transferred_bytes = 0; struct req_iterator iter; struct bio_vec bvec; spin_lock_irq(&nullb->lock); rq_for_each_segment(bvec, rq, iter) { len = bvec.bv_len; if (transferred_bytes + len > max_bytes) len = max_bytes - transferred_bytes; err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, op_is_write(req_op(rq)), pos, rq->cmd_flags & REQ_FUA); if (err) break; pos += len; transferred_bytes += len; if (transferred_bytes >= max_bytes) break; } spin_unlock_irq(&nullb->lock); return err; } static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd) { struct nullb_device *dev = cmd->nq->dev; struct nullb *nullb = dev->nullb; blk_status_t sts = BLK_STS_OK; struct request *rq = blk_mq_rq_from_pdu(cmd); if (!hrtimer_active(&nullb->bw_timer)) hrtimer_restart(&nullb->bw_timer); if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) { blk_mq_stop_hw_queues(nullb->q); /* race with timer */ if (atomic_long_read(&nullb->cur_bytes) > 0) blk_mq_start_stopped_hw_queues(nullb->q, true); /* requeue request */ sts = BLK_STS_DEV_RESOURCE; } return sts; } /* * Check if the command should fail for the badblocks. If so, return * BLK_STS_IOERR and return number of partial I/O sectors to be written or read, * which may be less than the requested number of sectors. * * @cmd: The command to handle. * @sector: The start sector for I/O. * @nr_sectors: Specifies number of sectors to write or read, and returns the * number of sectors to be written or read. 
*/ blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, sector_t sector, unsigned int *nr_sectors) { struct badblocks *bb = &cmd->nq->dev->badblocks; struct nullb_device *dev = cmd->nq->dev; unsigned int block_sectors = dev->blocksize >> SECTOR_SHIFT; sector_t first_bad, bad_sectors; unsigned int partial_io_sectors = 0; if (!badblocks_check(bb, sector, *nr_sectors, &first_bad, &bad_sectors)) return BLK_STS_OK; if (cmd->nq->dev->badblocks_once) badblocks_clear(bb, first_bad, bad_sectors); if (cmd->nq->dev->badblocks_partial_io) { if (!IS_ALIGNED(first_bad, block_sectors)) first_bad = ALIGN_DOWN(first_bad, block_sectors); if (sector < first_bad) partial_io_sectors = first_bad - sector; } *nr_sectors = partial_io_sectors; return BLK_STS_IOERR; } blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, enum req_op op, sector_t sector, sector_t nr_sectors) { struct nullb_device *dev = cmd->nq->dev; if (op == REQ_OP_DISCARD) return null_handle_discard(dev, sector, nr_sectors); return null_handle_data_transfer(cmd, nr_sectors); } static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd) { struct request *rq = blk_mq_rq_from_pdu(cmd); struct nullb_device *dev = cmd->nq->dev; struct bio *bio; if (!dev->memory_backed && req_op(rq) == REQ_OP_READ) { __rq_for_each_bio(bio, rq) zero_fill_bio(bio); } } static inline void nullb_complete_cmd(struct nullb_cmd *cmd) { struct request *rq = blk_mq_rq_from_pdu(cmd); /* * Since root privileges are required to configure the null_blk * driver, it is fine that this driver does not initialize the * data buffers of read commands. Zero-initialize these buffers * anyway if KMSAN is enabled to prevent that KMSAN complains * about null_blk not initializing read data buffers. */ if (IS_ENABLED(CONFIG_KMSAN)) nullb_zero_read_cmd_buffer(cmd); /* Complete IO by inline, softirq or timer */ switch (cmd->nq->dev->irqmode) { case NULL_IRQ_SOFTIRQ: blk_mq_complete_request(rq); break; case NULL_IRQ_NONE: blk_mq_end_request(rq, cmd->error); break; case NULL_IRQ_TIMER: null_cmd_end_timer(cmd); break; } } blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op, sector_t sector, unsigned int nr_sectors) { struct nullb_device *dev = cmd->nq->dev; blk_status_t badblocks_ret = BLK_STS_OK; blk_status_t ret; if (dev->badblocks.shift != -1) badblocks_ret = null_handle_badblocks(cmd, sector, &nr_sectors); if (dev->memory_backed && nr_sectors) { ret = null_handle_memory_backed(cmd, op, sector, nr_sectors); if (ret != BLK_STS_OK) return ret; } return badblocks_ret; } static void null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, sector_t nr_sectors, enum req_op op) { struct nullb_device *dev = cmd->nq->dev; struct nullb *nullb = dev->nullb; blk_status_t sts; if (op == REQ_OP_FLUSH) { cmd->error = null_handle_flush(nullb); goto out; } if (dev->zoned) sts = null_process_zoned_cmd(cmd, op, sector, nr_sectors); else sts = null_process_cmd(cmd, op, sector, nr_sectors); /* Do not overwrite errors (e.g. 
timeout errors) */ if (cmd->error == BLK_STS_OK) cmd->error = sts; out: nullb_complete_cmd(cmd); } static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer) { struct nullb *nullb = container_of(timer, struct nullb, bw_timer); ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); unsigned int mbps = nullb->dev->mbps; if (atomic_long_read(&nullb->cur_bytes) == mb_per_tick(mbps)) return HRTIMER_NORESTART; atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps)); blk_mq_start_stopped_hw_queues(nullb->q, true); hrtimer_forward_now(&nullb->bw_timer, timer_interval); return HRTIMER_RESTART; } static void nullb_setup_bwtimer(struct nullb *nullb) { ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); hrtimer_setup(&nullb->bw_timer, nullb_bwtimer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL); atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps)); hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL); } #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION static bool should_timeout_request(struct request *rq) { struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); struct nullb_device *dev = cmd->nq->dev; return should_fail(&dev->timeout_config.attr, 1); } static bool should_requeue_request(struct request *rq) { struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); struct nullb_device *dev = cmd->nq->dev; return should_fail(&dev->requeue_config.attr, 1); } static bool should_init_hctx_fail(struct nullb_device *dev) { return should_fail(&dev->init_hctx_fault_config.attr, 1); } #else static bool should_timeout_request(struct request *rq) { return false; } static bool should_requeue_request(struct request *rq) { return false; } static bool should_init_hctx_fail(struct nullb_device *dev) { return false; } #endif static void null_map_queues(struct blk_mq_tag_set *set) { struct nullb *nullb = set->driver_data; int i, qoff; unsigned int submit_queues = g_submit_queues; unsigned int poll_queues = g_poll_queues; if (nullb) { struct nullb_device *dev = nullb->dev; /* * Refer nr_hw_queues of the tag set to check if the expected * number of hardware queues are prepared. If block layer failed * to prepare them, use previous numbers of submit queues and * poll queues to map queues. 
*/ if (set->nr_hw_queues == dev->submit_queues + dev->poll_queues) { submit_queues = dev->submit_queues; poll_queues = dev->poll_queues; } else if (set->nr_hw_queues == dev->prev_submit_queues + dev->prev_poll_queues) { submit_queues = dev->prev_submit_queues; poll_queues = dev->prev_poll_queues; } else { pr_warn("tag set has unexpected nr_hw_queues: %d\n", set->nr_hw_queues); WARN_ON_ONCE(true); submit_queues = 1; poll_queues = 0; } } for (i = 0, qoff = 0; i < set->nr_maps; i++) { struct blk_mq_queue_map *map = &set->map[i]; switch (i) { case HCTX_TYPE_DEFAULT: map->nr_queues = submit_queues; break; case HCTX_TYPE_READ: map->nr_queues = 0; continue; case HCTX_TYPE_POLL: map->nr_queues = poll_queues; break; } map->queue_offset = qoff; qoff += map->nr_queues; blk_mq_map_queues(map); } } static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) { struct nullb_queue *nq = hctx->driver_data; LIST_HEAD(list); int nr = 0; struct request *rq; spin_lock(&nq->poll_lock); list_splice_init(&nq->poll_list, &list); list_for_each_entry(rq, &list, queuelist) blk_mq_set_request_complete(rq); spin_unlock(&nq->poll_lock); while (!list_empty(&list)) { struct nullb_cmd *cmd; struct request *req; req = list_first_entry(&list, struct request, queuelist); list_del_init(&req->queuelist); cmd = blk_mq_rq_to_pdu(req); cmd->error = null_process_cmd(cmd, req_op(req), blk_rq_pos(req), blk_rq_sectors(req)); if (!blk_mq_add_to_batch(req, iob, cmd->error != BLK_STS_OK, blk_mq_end_request_batch)) blk_mq_end_request(req, cmd->error); nr++; } return nr; } static enum blk_eh_timer_return null_timeout_rq(struct request *rq) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); if (hctx->type == HCTX_TYPE_POLL) { struct nullb_queue *nq = hctx->driver_data; spin_lock(&nq->poll_lock); /* The request may have completed meanwhile. */ if (blk_mq_request_completed(rq)) { spin_unlock(&nq->poll_lock); return BLK_EH_DONE; } list_del_init(&rq->queuelist); spin_unlock(&nq->poll_lock); } pr_info("rq %p timed out\n", rq); /* * If the device is marked as blocking (i.e. memory backed or zoned * device), the submission path may be blocked waiting for resources * and cause real timeouts. For these real timeouts, the submission * path will complete the request using blk_mq_complete_request(). * Only fake timeouts need to execute blk_mq_complete_request() here. 
*/ cmd->error = BLK_STS_TIMEOUT; if (cmd->fake_timeout || hctx->type == HCTX_TYPE_POLL) blk_mq_complete_request(rq); return BLK_EH_DONE; } static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *rq = bd->rq; struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); struct nullb_queue *nq = hctx->driver_data; sector_t nr_sectors = blk_rq_sectors(rq); sector_t sector = blk_rq_pos(rq); const bool is_poll = hctx->type == HCTX_TYPE_POLL; might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); if (!is_poll && nq->dev->irqmode == NULL_IRQ_TIMER) { hrtimer_setup(&cmd->timer, null_cmd_timer_expired, CLOCK_MONOTONIC, HRTIMER_MODE_REL); } cmd->error = BLK_STS_OK; cmd->nq = nq; cmd->fake_timeout = should_timeout_request(rq) || blk_should_fake_timeout(rq->q); if (should_requeue_request(rq)) { /* * Alternate between hitting the core BUSY path, and the * driver driven requeue path */ nq->requeue_selection++; if (nq->requeue_selection & 1) return BLK_STS_RESOURCE; blk_mq_requeue_request(rq, true); return BLK_STS_OK; } if (test_bit(NULLB_DEV_FL_THROTTLED, &nq->dev->flags)) { blk_status_t sts = null_handle_throttled(cmd); if (sts != BLK_STS_OK) return sts; } blk_mq_start_request(rq); if (is_poll) { spin_lock(&nq->poll_lock); list_add_tail(&rq->queuelist, &nq->poll_list); spin_unlock(&nq->poll_lock); return BLK_STS_OK; } if (cmd->fake_timeout) return BLK_STS_OK; null_handle_cmd(cmd, sector, nr_sectors, req_op(rq)); return BLK_STS_OK; } static void null_queue_rqs(struct rq_list *rqlist) { struct rq_list requeue_list = {}; struct blk_mq_queue_data bd = { }; blk_status_t ret; do { struct request *rq = rq_list_pop(rqlist); bd.rq = rq; ret = null_queue_rq(rq->mq_hctx, &bd); if (ret != BLK_STS_OK) rq_list_add_tail(&requeue_list, rq); } while (!rq_list_empty(rqlist)); *rqlist = requeue_list; } static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) { nq->dev = nullb->dev; INIT_LIST_HEAD(&nq->poll_list); spin_lock_init(&nq->poll_lock); } static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, unsigned int hctx_idx) { struct nullb *nullb = hctx->queue->queuedata; struct nullb_queue *nq; if (should_init_hctx_fail(nullb->dev)) return -EFAULT; nq = &nullb->queues[hctx_idx]; hctx->driver_data = nq; null_init_queue(nullb, nq); return 0; } static const struct blk_mq_ops null_mq_ops = { .queue_rq = null_queue_rq, .queue_rqs = null_queue_rqs, .complete = null_complete_rq, .timeout = null_timeout_rq, .poll = null_poll, .map_queues = null_map_queues, .init_hctx = null_init_hctx, }; static void null_del_dev(struct nullb *nullb) { struct nullb_device *dev; if (!nullb) return; dev = nullb->dev; ida_free(&nullb_indexes, nullb->index); list_del_init(&nullb->list); del_gendisk(nullb->disk); if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) { hrtimer_cancel(&nullb->bw_timer); atomic_long_set(&nullb->cur_bytes, LONG_MAX); blk_mq_start_stopped_hw_queues(nullb->q, true); } put_disk(nullb->disk); if (nullb->tag_set == &nullb->__tag_set) blk_mq_free_tag_set(nullb->tag_set); kfree(nullb->queues); if (null_cache_active(nullb)) null_free_device_storage(nullb->dev, true); kfree(nullb); dev->nullb = NULL; } static void null_config_discard(struct nullb *nullb, struct queue_limits *lim) { if (nullb->dev->discard == false) return; if (!nullb->dev->memory_backed) { nullb->dev->discard = false; pr_info("discard option is ignored without memory backing\n"); return; } if (nullb->dev->zoned) { nullb->dev->discard = false; pr_info("discard option is ignored in zoned 
mode\n"); return; } lim->max_hw_discard_sectors = UINT_MAX >> 9; } static const struct block_device_operations null_ops = { .owner = THIS_MODULE, .report_zones = null_report_zones, }; static int setup_queues(struct nullb *nullb) { int nqueues = nr_cpu_ids; if (g_poll_queues) nqueues += g_poll_queues; nullb->queues = kzalloc_objs(struct nullb_queue, nqueues); if (!nullb->queues) return -ENOMEM; return 0; } static int null_init_tag_set(struct blk_mq_tag_set *set, int poll_queues) { set->ops = &null_mq_ops; set->cmd_size = sizeof(struct nullb_cmd); set->timeout = 5 * HZ; set->nr_maps = 1; if (poll_queues) { set->nr_hw_queues += poll_queues; set->nr_maps += 2; } return blk_mq_alloc_tag_set(set); } static int null_init_global_tag_set(void) { int error; if (tag_set.ops) return 0; tag_set.nr_hw_queues = g_submit_queues; tag_set.queue_depth = g_hw_queue_depth; tag_set.numa_node = g_home_node; if (g_no_sched) tag_set.flags |= BLK_MQ_F_NO_SCHED_BY_DEFAULT; if (g_shared_tag_bitmap) tag_set.flags |= BLK_MQ_F_TAG_HCTX_SHARED; if (g_blocking) tag_set.flags |= BLK_MQ_F_BLOCKING; error = null_init_tag_set(&tag_set, g_poll_queues); if (error) tag_set.ops = NULL; return error; } static int null_setup_tagset(struct nullb *nullb) { if (nullb->dev->shared_tags) { nullb->tag_set = &tag_set; return null_init_global_tag_set(); } nullb->tag_set = &nullb->__tag_set; nullb->tag_set->driver_data = nullb; nullb->tag_set->nr_hw_queues = nullb->dev->submit_queues; nullb->tag_set->queue_depth = nullb->dev->hw_queue_depth; nullb->tag_set->numa_node = nullb->dev->home_node; if (nullb->dev->no_sched) nullb->tag_set->flags |= BLK_MQ_F_NO_SCHED_BY_DEFAULT; if (nullb->dev->shared_tag_bitmap) nullb->tag_set->flags |= BLK_MQ_F_TAG_HCTX_SHARED; if (nullb->dev->blocking) nullb->tag_set->flags |= BLK_MQ_F_BLOCKING; return null_init_tag_set(nullb->tag_set, nullb->dev->poll_queues); } static int null_validate_conf(struct nullb_device *dev) { if (dev->queue_mode == NULL_Q_RQ) { pr_err("legacy IO path is no longer available\n"); return -EINVAL; } if (dev->queue_mode == NULL_Q_BIO) { pr_err("BIO-based IO path is no longer available, using blk-mq instead.\n"); dev->queue_mode = NULL_Q_MQ; } if (dev->use_per_node_hctx) { if (dev->submit_queues != nr_online_nodes) dev->submit_queues = nr_online_nodes; } else if (dev->submit_queues > nr_cpu_ids) dev->submit_queues = nr_cpu_ids; else if (dev->submit_queues == 0) dev->submit_queues = 1; dev->prev_submit_queues = dev->submit_queues; if (dev->poll_queues > g_poll_queues) dev->poll_queues = g_poll_queues; dev->prev_poll_queues = dev->poll_queues; dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER); /* Do memory allocation, so set blocking */ if (dev->memory_backed) dev->blocking = true; else /* cache is meaningless */ dev->cache_size = 0; dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024, dev->cache_size); dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps); if (dev->zoned && (!dev->zone_size || !is_power_of_2(dev->zone_size))) { pr_err("zone_size must be power-of-two\n"); return -EINVAL; } return 0; } #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION static bool __null_setup_fault(struct fault_attr *attr, char *str) { if (!str[0]) return true; if (!setup_fault_attr(attr, str)) return false; attr->verbose = 0; return true; } #endif static bool null_setup_fault(void) { #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION if (!__null_setup_fault(&null_timeout_attr, g_timeout_str)) return false; if (!__null_setup_fault(&null_requeue_attr, g_requeue_str)) return false; if 
(!__null_setup_fault(&null_init_hctx_attr, g_init_hctx_str)) return false; #endif return true; } static int null_add_dev(struct nullb_device *dev) { struct queue_limits lim = { .logical_block_size = dev->blocksize, .physical_block_size = dev->blocksize, .max_hw_sectors = dev->max_sectors, .dma_alignment = 1, }; struct nullb *nullb; int rv; rv = null_validate_conf(dev); if (rv) return rv; nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node); if (!nullb) { rv = -ENOMEM; goto out; } nullb->dev = dev; dev->nullb = nullb; spin_lock_init(&nullb->lock); rv = setup_queues(nullb); if (rv) goto out_free_nullb; rv = null_setup_tagset(nullb); if (rv) goto out_cleanup_queues; if (dev->virt_boundary) lim.virt_boundary_mask = PAGE_SIZE - 1; null_config_discard(nullb, &lim); if (dev->zoned) { rv = null_init_zoned_dev(dev, &lim); if (rv) goto out_cleanup_tags; } if (dev->cache_size > 0) { set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); lim.features |= BLK_FEAT_WRITE_CACHE; if (dev->fua) lim.features |= BLK_FEAT_FUA; } if (dev->rotational) lim.features |= BLK_FEAT_ROTATIONAL; nullb->disk = blk_mq_alloc_disk(nullb->tag_set, &lim, nullb); if (IS_ERR(nullb->disk)) { rv = PTR_ERR(nullb->disk); goto out_cleanup_zone; } nullb->q = nullb->disk->queue; if (dev->mbps) { set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags); nullb_setup_bwtimer(nullb); } nullb->q->queuedata = nullb; rv = ida_alloc(&nullb_indexes, GFP_KERNEL); if (rv < 0) goto out_cleanup_disk; nullb->index = rv; dev->index = rv; if (config_item_name(&dev->group.cg_item)) { /* Use configfs dir name as the device name */ snprintf(nullb->disk_name, sizeof(nullb->disk_name), "%s", config_item_name(&dev->group.cg_item)); } else { sprintf(nullb->disk_name, "nullb%d", nullb->index); } set_capacity(nullb->disk, ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT); nullb->disk->major = null_major; nullb->disk->first_minor = nullb->index; nullb->disk->minors = 1; nullb->disk->fops = &null_ops; nullb->disk->private_data = nullb; strscpy(nullb->disk->disk_name, nullb->disk_name); if (nullb->dev->zoned) { rv = null_register_zoned_dev(nullb); if (rv) goto out_ida_free; } rv = add_disk(nullb->disk); if (rv) goto out_ida_free; list_add_tail(&nullb->list, &nullb_list); pr_info("disk %s created\n", nullb->disk_name); return 0; out_ida_free: ida_free(&nullb_indexes, nullb->index); out_cleanup_disk: put_disk(nullb->disk); out_cleanup_zone: null_free_zoned_dev(dev); out_cleanup_tags: if (nullb->tag_set == &nullb->__tag_set) blk_mq_free_tag_set(nullb->tag_set); out_cleanup_queues: kfree(nullb->queues); out_free_nullb: kfree(nullb); dev->nullb = NULL; out: return rv; } static struct nullb *null_find_dev_by_name(const char *name) { struct nullb *nullb = NULL, *nb; mutex_lock(&lock); list_for_each_entry(nb, &nullb_list, list) { if (strcmp(nb->disk_name, name) == 0) { nullb = nb; break; } } mutex_unlock(&lock); return nullb; } static int null_create_dev(void) { struct nullb_device *dev; int ret; dev = null_alloc_dev(); if (!dev) return -ENOMEM; mutex_lock(&lock); ret = null_add_dev(dev); mutex_unlock(&lock); if (ret) { null_free_dev(dev); return ret; } return 0; } static void null_destroy_dev(struct nullb *nullb) { struct nullb_device *dev = nullb->dev; null_del_dev(nullb); null_free_device_storage(dev, false); null_free_dev(dev); } static int __init null_init(void) { int ret = 0; unsigned int i; struct nullb *nullb; if (g_bs > PAGE_SIZE) { pr_warn("invalid block size\n"); pr_warn("defaults block size to %lu\n", PAGE_SIZE); g_bs = PAGE_SIZE; } if (g_home_node != 
NUMA_NO_NODE && g_home_node >= nr_online_nodes) { pr_err("invalid home_node value\n"); g_home_node = NUMA_NO_NODE; } if (!null_setup_fault()) return -EINVAL; if (g_queue_mode == NULL_Q_RQ) { pr_err("legacy IO path is no longer available\n"); return -EINVAL; } if (g_use_per_node_hctx) { if (g_submit_queues != nr_online_nodes) { pr_warn("submit_queues param is set to %u.\n", nr_online_nodes); g_submit_queues = nr_online_nodes; } } else if (g_submit_queues > nr_cpu_ids) { g_submit_queues = nr_cpu_ids; } else if (g_submit_queues <= 0) { g_submit_queues = 1; } config_group_init(&nullb_subsys.su_group); mutex_init(&nullb_subsys.su_mutex); ret = configfs_register_subsystem(&nullb_subsys); if (ret) return ret; mutex_init(&lock); null_major = register_blkdev(0, "nullb"); if (null_major < 0) { ret = null_major; goto err_conf; } for (i = 0; i < nr_devices; i++) { ret = null_create_dev(); if (ret) goto err_dev; } pr_info("module loaded\n"); return 0; err_dev: while (!list_empty(&nullb_list)) { nullb = list_entry(nullb_list.next, struct nullb, list); null_destroy_dev(nullb); } unregister_blkdev(null_major, "nullb"); err_conf: configfs_unregister_subsystem(&nullb_subsys); return ret; } static void __exit null_exit(void) { struct nullb *nullb; configfs_unregister_subsystem(&nullb_subsys); unregister_blkdev(null_major, "nullb"); mutex_lock(&lock); while (!list_empty(&nullb_list)) { nullb = list_entry(nullb_list.next, struct nullb, list); null_destroy_dev(nullb); } mutex_unlock(&lock); if (tag_set.ops) blk_mq_free_tag_set(&tag_set); mutex_destroy(&lock); } module_init(null_init); module_exit(null_exit); MODULE_AUTHOR("Jens Axboe <axboe@kernel.dk>"); MODULE_DESCRIPTION("multi queue aware block test driver"); MODULE_LICENSE("GPL"); |
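/*
 * Usage sketch (illustrative, not part of the driver source): with the
 * module loaded and configfs mounted at /sys/kernel/config, a device can
 * be created from the shell roughly as follows, using the configfs
 * attributes this driver registers:
 *
 *	mkdir /sys/kernel/config/nullb/mydisk
 *	echo 4096 > /sys/kernel/config/nullb/mydisk/size	# size in MB
 *	echo 1 > /sys/kernel/config/nullb/mydisk/memory_backed
 *	echo 1 > /sys/kernel/config/nullb/mydisk/power	# instantiates /dev/mydisk
 *
 * Writing 0 to "power" and removing the directory tears the device down.
 */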
// SPDX-License-Identifier: GPL-2.0+ /* * Procedures for creating, accessing and interpreting the device tree. * * Paul Mackerras August 1996. * Copyright (C) 1996-2005 Paul Mackerras. * * Adapted for 64bit PowerPC by Dave Engebretsen and Peter Bergner. * {engebret|bergner}@us.ibm.com * * Adapted for sparc and sparc64 by David S. Miller davem@davemloft.net * * Reconsolidated from arch/x/kernel/prom.c by Stephen Rothwell and * Grant Likely.
*/ #define pr_fmt(fmt) "OF: " fmt #include <linux/cleanup.h> #include <linux/console.h> #include <linux/ctype.h> #include <linux/cpu.h> #include <linux/module.h> #include <linux/of.h> #include <linux/of_device.h> #include <linux/of_graph.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/proc_fs.h> #include "of_private.h" LIST_HEAD(aliases_lookup); struct device_node *of_root; EXPORT_SYMBOL(of_root); struct device_node *of_chosen; EXPORT_SYMBOL(of_chosen); struct device_node *of_aliases; struct device_node *of_stdout; static const char *of_stdout_options; struct kset *of_kset; /* * Used to protect the of_aliases, to hold off addition of nodes to sysfs. * This mutex must be held whenever modifications are being made to the * device tree. The of_{attach,detach}_node() and * of_{add,remove,update}_property() helpers make sure this happens. */ DEFINE_MUTEX(of_mutex); /* use when traversing tree through the child, sibling, * or parent members of struct device_node. */ DEFINE_RAW_SPINLOCK(devtree_lock); bool of_node_name_eq(const struct device_node *np, const char *name) { const char *node_name; size_t len; if (!np) return false; node_name = kbasename(np->full_name); len = strchrnul(node_name, '@') - node_name; return (strlen(name) == len) && (strncmp(node_name, name, len) == 0); } EXPORT_SYMBOL(of_node_name_eq); bool of_node_name_prefix(const struct device_node *np, const char *prefix) { if (!np) return false; return strncmp(kbasename(np->full_name), prefix, strlen(prefix)) == 0; } EXPORT_SYMBOL(of_node_name_prefix); static bool __of_node_is_type(const struct device_node *np, const char *type) { const char *match = __of_get_property(np, "device_type", NULL); return np && match && type && !strcmp(match, type); } #define EXCLUDED_DEFAULT_CELLS_PLATFORMS ( \ IS_ENABLED(CONFIG_SPARC) || \ of_find_compatible_node(NULL, NULL, "coreboot") \ ) int of_bus_n_addr_cells(struct device_node *np) { u32 cells; for (; np; np = np->parent) { if (!of_property_read_u32(np, "#address-cells", &cells)) return cells; /* * Default root value and walking parent nodes for "#address-cells" * is deprecated. Any platforms which hit this warning should * be added to the excluded list. */ WARN_ONCE(!EXCLUDED_DEFAULT_CELLS_PLATFORMS, "Missing '#address-cells' in %pOF\n", np); } return OF_ROOT_NODE_ADDR_CELLS_DEFAULT; } int of_n_addr_cells(struct device_node *np) { if (np->parent) np = np->parent; return of_bus_n_addr_cells(np); } EXPORT_SYMBOL(of_n_addr_cells); int of_bus_n_size_cells(struct device_node *np) { u32 cells; for (; np; np = np->parent) { if (!of_property_read_u32(np, "#size-cells", &cells)) return cells; /* * Default root value and walking parent nodes for "#size-cells" * is deprecated. Any platforms which hit this warning should * be added to the excluded list. */ WARN_ONCE(!EXCLUDED_DEFAULT_CELLS_PLATFORMS, "Missing '#size-cells' in %pOF\n", np); } return OF_ROOT_NODE_SIZE_CELLS_DEFAULT; } int of_n_size_cells(struct device_node *np) { if (np->parent) np = np->parent; return of_bus_n_size_cells(np); } EXPORT_SYMBOL(of_n_size_cells); #ifdef CONFIG_NUMA int __weak of_node_to_nid(struct device_node *np) { return NUMA_NO_NODE; } #endif #define OF_PHANDLE_CACHE_BITS 7 #define OF_PHANDLE_CACHE_SZ BIT(OF_PHANDLE_CACHE_BITS) static struct device_node *phandle_cache[OF_PHANDLE_CACHE_SZ]; static u32 of_phandle_cache_hash(phandle handle) { return hash_32(handle, OF_PHANDLE_CACHE_BITS); } /* * Caller must hold devtree_lock. 
*/ void __of_phandle_cache_inv_entry(phandle handle) { u32 handle_hash; struct device_node *np; if (!handle) return; handle_hash = of_phandle_cache_hash(handle); np = phandle_cache[handle_hash]; if (np && handle == np->phandle) phandle_cache[handle_hash] = NULL; } void __init of_core_init(void) { struct device_node *np; of_platform_register_reconfig_notifier(); /* Create the kset, and register existing nodes */ mutex_lock(&of_mutex); of_kset = kset_create_and_add("devicetree", NULL, firmware_kobj); if (!of_kset) { mutex_unlock(&of_mutex); pr_err("failed to register existing nodes\n"); return; } for_each_of_allnodes(np) { __of_attach_node_sysfs(np); if (np->phandle && !phandle_cache[of_phandle_cache_hash(np->phandle)]) phandle_cache[of_phandle_cache_hash(np->phandle)] = np; } mutex_unlock(&of_mutex); /* Symlink in /proc as required by userspace ABI */ if (of_root) proc_symlink("device-tree", NULL, "/sys/firmware/devicetree/base"); } static struct property *__of_find_property(const struct device_node *np, const char *name, int *lenp) { struct property *pp; if (!np) return NULL; for (pp = np->properties; pp; pp = pp->next) { if (of_prop_cmp(pp->name, name) == 0) { if (lenp) *lenp = pp->length; break; } } return pp; } struct property *of_find_property(const struct device_node *np, const char *name, int *lenp) { struct property *pp; unsigned long flags; raw_spin_lock_irqsave(&devtree_lock, flags); pp = __of_find_property(np, name, lenp); raw_spin_unlock_irqrestore(&devtree_lock, flags); return pp; } EXPORT_SYMBOL(of_find_property); struct device_node *__of_find_all_nodes(struct device_node *prev) { struct device_node *np; if (!prev) { np = of_root; } else if (prev->child) { np = prev->child; } else { /* Walk back up looking for a sibling, or the end of the structure */ np = prev; while (np->parent && !np->sibling) np = np->parent; np = np->sibling; /* Might be null at the end of the tree */ } return np; } /** * of_find_all_nodes - Get next node in global list * @prev: Previous node or NULL to start iteration * of_node_put() will be called on it * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_find_all_nodes(struct device_node *prev) { struct device_node *np; unsigned long flags; raw_spin_lock_irqsave(&devtree_lock, flags); np = __of_find_all_nodes(prev); of_node_get(np); of_node_put(prev); raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_all_nodes); /* * Find a property with a given name for a given node * and return the value. */ const void *__of_get_property(const struct device_node *np, const char *name, int *lenp) { const struct property *pp = __of_find_property(np, name, lenp); return pp ? pp->value : NULL; } /* * Find a property with a given name for a given node * and return the value. */ const void *of_get_property(const struct device_node *np, const char *name, int *lenp) { const struct property *pp = of_find_property(np, name, lenp); return pp ? pp->value : NULL; } EXPORT_SYMBOL(of_get_property); /** * __of_device_is_compatible() - Check if the node matches given constraints * @device: pointer to node * @compat: required compatible string, NULL or "" for any match * @type: required device_type value, NULL or "" for any match * @name: required node name, NULL or "" for any match * * Checks if the given @compat, @type and @name strings match the * properties of the given @device. A constraints can be skipped by * passing NULL or an empty string as the constraint. 
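 *
 * For instance, passing a hypothetical "acme,uart" as @compat with NULL
 * @type and @name scores the node on its compatible list alone, while
 * passing NULL for @compat and "serial" for @type matches any node whose
 * device_type is "serial".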
* * Returns 0 for no match, and a positive integer on match. The return * value is a relative score with larger values indicating better * matches. The score is weighted for the most specific compatible value * to get the highest score. Matching type is next, followed by matching * name. Practically speaking, this results in the following priority * order for matches: * * 1. specific compatible && type && name * 2. specific compatible && type * 3. specific compatible && name * 4. specific compatible * 5. general compatible && type && name * 6. general compatible && type * 7. general compatible && name * 8. general compatible * 9. type && name * 10. type * 11. name */ static int __of_device_is_compatible(const struct device_node *device, const char *compat, const char *type, const char *name) { const struct property *prop; const char *cp; int index = 0, score = 0; /* Compatible match has highest priority */ if (compat && compat[0]) { prop = __of_find_property(device, "compatible", NULL); for (cp = of_prop_next_string(prop, NULL); cp; cp = of_prop_next_string(prop, cp), index++) { if (of_compat_cmp(cp, compat, strlen(compat)) == 0) { score = INT_MAX/2 - (index << 2); break; } } if (!score) return 0; } /* Matching type is better than matching name */ if (type && type[0]) { if (!__of_node_is_type(device, type)) return 0; score += 2; } /* Matching name is a bit better than not */ if (name && name[0]) { if (!of_node_name_eq(device, name)) return 0; score++; } return score; } /** Checks if the given "compat" string matches one of the strings in * the device's "compatible" property */ int of_device_is_compatible(const struct device_node *device, const char *compat) { unsigned long flags; int res; raw_spin_lock_irqsave(&devtree_lock, flags); res = __of_device_is_compatible(device, compat, NULL, NULL); raw_spin_unlock_irqrestore(&devtree_lock, flags); return res; } EXPORT_SYMBOL(of_device_is_compatible); /** Checks if the device is compatible with any of the entries in * a NULL terminated array of strings. Returns the best match * score or 0. */ int of_device_compatible_match(const struct device_node *device, const char *const *compat) { unsigned int tmp, score = 0; if (!compat) return 0; while (*compat) { tmp = of_device_is_compatible(device, *compat); if (tmp > score) score = tmp; compat++; } return score; } EXPORT_SYMBOL_GPL(of_device_compatible_match); /** * of_machine_compatible_match - Test root of device tree against a compatible array * @compats: NULL terminated array of compatible strings to look for in root node's compatible property. * * Returns true if the root node has any of the given compatible values in its * compatible property. */ bool of_machine_compatible_match(const char *const *compats) { struct device_node *root; int rc = 0; root = of_find_node_by_path("/"); if (root) { rc = of_device_compatible_match(root, compats); of_node_put(root); } return rc != 0; } EXPORT_SYMBOL(of_machine_compatible_match); /** * of_machine_read_compatible - Get the compatible string of this machine * @compatible: address at which the address of the compatible string will be * stored * @index: index of the compatible entry in the list * * Returns: * 0 on success, negative error number on failure. 
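 *
 * Example (illustrative) of reading the first compatible entry of the
 * root node:
 *
 *	const char *compat;
 *
 *	if (!of_machine_read_compatible(&compat, 0))
 *		pr_info("machine compatible: %s\n", compat);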
*/ int of_machine_read_compatible(const char **compatible, unsigned int index) { return of_property_read_string_index(of_root, "compatible", index, compatible); } EXPORT_SYMBOL_GPL(of_machine_read_compatible); /** * of_machine_read_model - Get the model string of this machine * @model: address at which the address of the model string will be stored * * Returns: * 0 on success, negative error number on failure. */ int of_machine_read_model(const char **model) { return of_property_read_string(of_root, "model", model); } EXPORT_SYMBOL_GPL(of_machine_read_model); /** * of_machine_get_match - Test root of device tree against an of_device_id array * @matches: NULL terminated array of of_device_id match structures to search in * * Returns matched entry or NULL */ const struct of_device_id *of_machine_get_match(const struct of_device_id *matches) { struct device_node *root; const struct of_device_id *match = NULL; root = of_find_node_by_path("/"); if (root) { match = of_match_node(matches, root); of_node_put(root); } return match; } EXPORT_SYMBOL(of_machine_get_match); /** * of_machine_get_match_data - Tell if root of device tree has a matching of_match structure * @matches: NULL terminated array of of_device_id match structures to search in * * Returns data associated with matched entry or NULL */ const void *of_machine_get_match_data(const struct of_device_id *matches) { const struct of_device_id *match; match = of_machine_get_match(matches); if (!match) return NULL; return match->data; } EXPORT_SYMBOL(of_machine_get_match_data); static bool __of_device_is_status(const struct device_node *device, const char * const*strings) { const char *status; int statlen; if (!device) return false; status = __of_get_property(device, "status", &statlen); if (status == NULL) return false; if (statlen > 0) { while (*strings) { unsigned int len = strlen(*strings); if ((*strings)[len - 1] == '-') { if (!strncmp(status, *strings, len)) return true; } else { if (!strcmp(status, *strings)) return true; } strings++; } } return false; } /** * __of_device_is_available - check if a device is available for use * * @device: Node to check for availability, with locks already held * * Return: True if the status property is absent or set to "okay" or "ok", * false otherwise */ static bool __of_device_is_available(const struct device_node *device) { static const char * const ok[] = {"okay", "ok", NULL}; if (!device) return false; return !__of_get_property(device, "status", NULL) || __of_device_is_status(device, ok); } /** * __of_device_is_reserved - check if a device is reserved * * @device: Node to check for availability, with locks already held * * Return: True if the status property is set to "reserved", false otherwise */ static bool __of_device_is_reserved(const struct device_node *device) { static const char * const reserved[] = {"reserved", NULL}; return __of_device_is_status(device, reserved); } /** * of_device_is_available - check if a device is available for use * * @device: Node to check for availability * * Return: True if the status property is absent or set to "okay" or "ok", * false otherwise */ bool of_device_is_available(const struct device_node *device) { unsigned long flags; bool res; raw_spin_lock_irqsave(&devtree_lock, flags); res = __of_device_is_available(device); raw_spin_unlock_irqrestore(&devtree_lock, flags); return res; } EXPORT_SYMBOL(of_device_is_available); /** * __of_device_is_fail - check if a device has status "fail" or "fail-..." 
* * @device: Node to check status for, with locks already held * * Return: True if the status property is set to "fail" or "fail-..." (for any * error code suffix), false otherwise */ static bool __of_device_is_fail(const struct device_node *device) { static const char * const fail[] = {"fail", "fail-", NULL}; return __of_device_is_status(device, fail); } /** * of_device_is_big_endian - check if a device has BE registers * * @device: Node to check for endianness * * Return: True if the device has a "big-endian" property, or if the kernel * was compiled for BE *and* the device has a "native-endian" property. * Returns false otherwise. * * Callers would nominally use ioread32be/iowrite32be if * of_device_is_big_endian() == true, or readl/writel otherwise. */ bool of_device_is_big_endian(const struct device_node *device) { if (of_property_read_bool(device, "big-endian")) return true; if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) && of_property_read_bool(device, "native-endian")) return true; return false; } EXPORT_SYMBOL(of_device_is_big_endian); /** * of_get_parent - Get a node's parent if any * @node: Node to get parent * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_get_parent(const struct device_node *node) { struct device_node *np; unsigned long flags; if (!node) return NULL; raw_spin_lock_irqsave(&devtree_lock, flags); np = of_node_get(node->parent); raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_get_parent); /** * of_get_next_parent - Iterate to a node's parent * @node: Node to get parent of * * This is like of_get_parent() except that it drops the * refcount on the passed node, making it suitable for iterating * through a node's parents. * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_get_next_parent(struct device_node *node) { struct device_node *parent; unsigned long flags; if (!node) return NULL; raw_spin_lock_irqsave(&devtree_lock, flags); parent = of_node_get(node->parent); of_node_put(node); raw_spin_unlock_irqrestore(&devtree_lock, flags); return parent; } EXPORT_SYMBOL(of_get_next_parent); static struct device_node *__of_get_next_child(const struct device_node *node, struct device_node *prev) { struct device_node *next; if (!node) return NULL; next = prev ? prev->sibling : node->child; of_node_get(next); of_node_put(prev); return next; } #define __for_each_child_of_node(parent, child) \ for (child = __of_get_next_child(parent, NULL); child != NULL; \ child = __of_get_next_child(parent, child)) /** * of_get_next_child - Iterate a node childs * @node: parent node * @prev: previous child of the parent node, or NULL to get first * * Return: A node pointer with refcount incremented, use of_node_put() on * it when done. Returns NULL when prev is the last child. Decrements the * refcount of prev. 
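 *
 * Example (illustrative; do_something_with() is a placeholder):
 *
 *	struct device_node *child = NULL;
 *
 *	while ((child = of_get_next_child(parent, child)))
 *		do_something_with(child);
 *
 * This is the pattern that the for_each_child_of_node() helper wraps.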
*/ struct device_node *of_get_next_child(const struct device_node *node, struct device_node *prev) { struct device_node *next; unsigned long flags; raw_spin_lock_irqsave(&devtree_lock, flags); next = __of_get_next_child(node, prev); raw_spin_unlock_irqrestore(&devtree_lock, flags); return next; } EXPORT_SYMBOL(of_get_next_child); /** * of_get_next_child_with_prefix - Find the next child node with prefix * @node: parent node * @prev: previous child of the parent node, or NULL to get first * @prefix: prefix that the node name should have * * This function is like of_get_next_child(), except that it automatically * skips any nodes whose name doesn't have the given prefix. * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_get_next_child_with_prefix(const struct device_node *node, struct device_node *prev, const char *prefix) { struct device_node *next; unsigned long flags; if (!node) return NULL; raw_spin_lock_irqsave(&devtree_lock, flags); next = prev ? prev->sibling : node->child; for (; next; next = next->sibling) { if (!of_node_name_prefix(next, prefix)) continue; if (of_node_get(next)) break; } of_node_put(prev); raw_spin_unlock_irqrestore(&devtree_lock, flags); return next; } EXPORT_SYMBOL(of_get_next_child_with_prefix); static struct device_node *of_get_next_status_child(const struct device_node *node, struct device_node *prev, bool (*checker)(const struct device_node *)) { struct device_node *next; unsigned long flags; if (!node) return NULL; raw_spin_lock_irqsave(&devtree_lock, flags); next = prev ? prev->sibling : node->child; for (; next; next = next->sibling) { if (!checker(next)) continue; if (of_node_get(next)) break; } of_node_put(prev); raw_spin_unlock_irqrestore(&devtree_lock, flags); return next; } /** * of_get_next_available_child - Find the next available child node * @node: parent node * @prev: previous child of the parent node, or NULL to get first * * This function is like of_get_next_child(), except that it * automatically skips any disabled nodes (i.e. status = "disabled"). */ struct device_node *of_get_next_available_child(const struct device_node *node, struct device_node *prev) { return of_get_next_status_child(node, prev, __of_device_is_available); } EXPORT_SYMBOL(of_get_next_available_child); /** * of_get_next_reserved_child - Find the next reserved child node * @node: parent node * @prev: previous child of the parent node, or NULL to get first * * This function is like of_get_next_child(), except that it * automatically skips any disabled nodes (i.e. status = "disabled"). */ struct device_node *of_get_next_reserved_child(const struct device_node *node, struct device_node *prev) { return of_get_next_status_child(node, prev, __of_device_is_reserved); } EXPORT_SYMBOL(of_get_next_reserved_child); /** * of_get_next_cpu_node - Iterate on cpu nodes * @prev: previous child of the /cpus node, or NULL to get first * * Unusable CPUs (those with the status property set to "fail" or "fail-...") * will be skipped. * * Return: A cpu node pointer with refcount incremented, use of_node_put() * on it when done. Returns NULL when prev is the last child. Decrements * the refcount of prev. 
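 *
 * Example (illustrative):
 *
 *	struct device_node *cpu = NULL;
 *
 *	while ((cpu = of_get_next_cpu_node(cpu)))
 *		pr_info("usable cpu node: %pOF\n", cpu);
 *
 * The for_each_of_cpu_node() helper expresses the same loop.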
*/ struct device_node *of_get_next_cpu_node(struct device_node *prev) { struct device_node *next = NULL; unsigned long flags; struct device_node *node; if (!prev) node = of_find_node_by_path("/cpus"); raw_spin_lock_irqsave(&devtree_lock, flags); if (prev) next = prev->sibling; else if (node) { next = node->child; of_node_put(node); } for (; next; next = next->sibling) { if (__of_device_is_fail(next)) continue; if (!(of_node_name_eq(next, "cpu") || __of_node_is_type(next, "cpu"))) continue; if (of_node_get(next)) break; } of_node_put(prev); raw_spin_unlock_irqrestore(&devtree_lock, flags); return next; } EXPORT_SYMBOL(of_get_next_cpu_node); /** * of_get_compatible_child - Find compatible child node * @parent: parent node * @compatible: compatible string * * Lookup child node whose compatible property contains the given compatible * string. * * Return: a node pointer with refcount incremented, use of_node_put() on it * when done; or NULL if not found. */ struct device_node *of_get_compatible_child(const struct device_node *parent, const char *compatible) { struct device_node *child; for_each_child_of_node(parent, child) { if (of_device_is_compatible(child, compatible)) break; } return child; } EXPORT_SYMBOL(of_get_compatible_child); /** * of_get_child_by_name - Find the child node by name for a given parent * @node: parent node * @name: child name to look for. * * This function looks for child node for given matching name * * Return: A node pointer if found, with refcount incremented, use * of_node_put() on it when done. * Returns NULL if node is not found. */ struct device_node *of_get_child_by_name(const struct device_node *node, const char *name) { struct device_node *child; for_each_child_of_node(node, child) if (of_node_name_eq(child, name)) break; return child; } EXPORT_SYMBOL(of_get_child_by_name); /** * of_get_available_child_by_name - Find the available child node by name for a given parent * @node: parent node * @name: child name to look for. * * This function looks for child node for given matching name and checks the * device's availability for use. * * Return: A node pointer if found, with refcount incremented, use * of_node_put() on it when done. * Returns NULL if node is not found. */ struct device_node *of_get_available_child_by_name(const struct device_node *node, const char *name) { struct device_node *child; child = of_get_child_by_name(node, name); if (child && !of_device_is_available(child)) { of_node_put(child); return NULL; } return child; } EXPORT_SYMBOL(of_get_available_child_by_name); struct device_node *__of_find_node_by_path(const struct device_node *parent, const char *path) { struct device_node *child; int len; len = strcspn(path, "/:"); if (!len) return NULL; __for_each_child_of_node(parent, child) { const char *name = kbasename(child->full_name); if (strncmp(path, name, len) == 0 && (strlen(name) == len)) return child; } return NULL; } struct device_node *__of_find_node_by_full_path(struct device_node *node, const char *path) { const char *separator = strchr(path, ':'); while (node && *path == '/') { struct device_node *tmp = node; path++; /* Increment past '/' delimiter */ node = __of_find_node_by_path(node, path); of_node_put(tmp); path = strchrnul(path, '/'); if (separator && separator < path) break; } return node; } /** * of_find_node_opts_by_path - Find a node matching a full OF path * @path: Either the full path to match, or if the path does not * start with '/', the name of a property of the /aliases * node (an alias). 
In the case of an alias, the node * matching the alias' value will be returned. * @opts: Address of a pointer into which to store the start of * an options string appended to the end of the path with * a ':' separator. * * Valid paths: * * /foo/bar Full path * * foo Valid alias * * foo/bar Valid alias + relative path * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_find_node_opts_by_path(const char *path, const char **opts) { struct device_node *np = NULL; const struct property *pp; unsigned long flags; const char *separator = strchr(path, ':'); if (opts) *opts = separator ? separator + 1 : NULL; if (strcmp(path, "/") == 0) return of_node_get(of_root); /* The path could begin with an alias */ if (*path != '/') { int len; const char *p = strchrnul(path, '/'); if (separator && separator < p) p = separator; len = p - path; /* of_aliases must not be NULL */ if (!of_aliases) return NULL; for_each_property_of_node(of_aliases, pp) { if (strlen(pp->name) == len && !strncmp(pp->name, path, len)) { np = of_find_node_by_path(pp->value); break; } } if (!np) return NULL; path = p; } /* Step down the tree matching path components */ raw_spin_lock_irqsave(&devtree_lock, flags); if (!np) np = of_node_get(of_root); np = __of_find_node_by_full_path(np, path); raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_opts_by_path); /** * of_find_node_by_name - Find a node by its "name" property * @from: The node to start searching from or NULL; the node * you pass will not be searched, only the next one * will. Typically, you pass what the previous call * returned. of_node_put() will be called on @from. * @name: The name string to match against * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_find_node_by_name(struct device_node *from, const char *name) { struct device_node *np; unsigned long flags; raw_spin_lock_irqsave(&devtree_lock, flags); for_each_of_allnodes_from(from, np) if (of_node_name_eq(np, name) && of_node_get(np)) break; of_node_put(from); raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_by_name); /** * of_find_node_by_type - Find a node by its "device_type" property * @from: The node to start searching from, or NULL to start searching * the entire device tree. The node you pass will not be * searched, only the next one will; typically, you pass * what the previous call returned. of_node_put() will be * called on from for you. * @type: The type string to match against * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_find_node_by_type(struct device_node *from, const char *type) { struct device_node *np; unsigned long flags; raw_spin_lock_irqsave(&devtree_lock, flags); for_each_of_allnodes_from(from, np) if (__of_node_is_type(np, type) && of_node_get(np)) break; of_node_put(from); raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_by_type); /** * of_find_compatible_node - Find a node based on type and one of the * tokens in its "compatible" property * @from: The node to start searching from or NULL, the node * you pass will not be searched, only the next one * will; typically, you pass what the previous call * returned. 
of_node_put() will be called on it * @type: The type string to match "device_type" or NULL to ignore * @compatible: The string to match to one of the tokens in the device * "compatible" list. * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_find_compatible_node(struct device_node *from, const char *type, const char *compatible) { struct device_node *np; unsigned long flags; raw_spin_lock_irqsave(&devtree_lock, flags); for_each_of_allnodes_from(from, np) if (__of_device_is_compatible(np, compatible, type, NULL) && of_node_get(np)) break; of_node_put(from); raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_compatible_node); /** * of_find_node_with_property - Find a node which has a property with * the given name. * @from: The node to start searching from or NULL, the node * you pass will not be searched, only the next one * will; typically, you pass what the previous call * returned. of_node_put() will be called on it * @prop_name: The name of the property to look for. * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_find_node_with_property(struct device_node *from, const char *prop_name) { struct device_node *np; unsigned long flags; raw_spin_lock_irqsave(&devtree_lock, flags); for_each_of_allnodes_from(from, np) { if (__of_find_property(np, prop_name, NULL)) { of_node_get(np); break; } } of_node_put(from); raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_with_property); static const struct of_device_id *__of_match_node(const struct of_device_id *matches, const struct device_node *node) { const struct of_device_id *best_match = NULL; int score, best_score = 0; if (!matches) return NULL; for (; matches->name[0] || matches->type[0] || matches->compatible[0]; matches++) { score = __of_device_is_compatible(node, matches->compatible, matches->type, matches->name); if (score > best_score) { best_match = matches; best_score = score; } } return best_match; } /** * of_match_node - Tell if a device_node has a matching of_match structure * @matches: array of of device match structures to search in * @node: the of device structure to match against * * Low level utility function used by device matching. */ const struct of_device_id *of_match_node(const struct of_device_id *matches, const struct device_node *node) { const struct of_device_id *match; unsigned long flags; raw_spin_lock_irqsave(&devtree_lock, flags); match = __of_match_node(matches, node); raw_spin_unlock_irqrestore(&devtree_lock, flags); return match; } EXPORT_SYMBOL(of_match_node); /** * of_find_matching_node_and_match - Find a node based on an of_device_id * match table. * @from: The node to start searching from or NULL, the node * you pass will not be searched, only the next one * will; typically, you pass what the previous call * returned. of_node_put() will be called on it * @matches: array of of device match structures to search in * @match: Updated to point at the matches entry which matched * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. 
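 *
 * Example (illustrative; my_ids is a hypothetical of_device_id table):
 *
 *	struct device_node *np = NULL;
 *	const struct of_device_id *id;
 *
 *	while ((np = of_find_matching_node_and_match(np, my_ids, &id)))
 *		pr_info("%pOF matched %s\n", np, id->compatible);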
*/ struct device_node *of_find_matching_node_and_match(struct device_node *from, const struct of_device_id *matches, const struct of_device_id **match) { struct device_node *np; const struct of_device_id *m; unsigned long flags; if (match) *match = NULL; raw_spin_lock_irqsave(&devtree_lock, flags); for_each_of_allnodes_from(from, np) { m = __of_match_node(matches, np); if (m && of_node_get(np)) { if (match) *match = m; break; } } of_node_put(from); raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_matching_node_and_match); /** * of_alias_from_compatible - Lookup appropriate alias for a device node * depending on compatible * @node: pointer to a device tree node * @alias: Pointer to buffer that alias value will be copied into * @len: Length of alias value * * Based on the value of the compatible property, this routine will attempt * to choose an appropriate alias value for a particular device tree node. * It does this by stripping the manufacturer prefix (as delimited by a ',') * from the first entry in the compatible list property. * * Note: The matching on just the "product" side of the compatible is a relic * from I2C and SPI. Please do not add any new user. * * Return: This routine returns 0 on success, <0 on failure. */ int of_alias_from_compatible(const struct device_node *node, char *alias, int len) { const char *compatible, *p; int cplen; compatible = of_get_property(node, "compatible", &cplen); if (!compatible || strlen(compatible) > cplen) return -ENODEV; p = strchr(compatible, ','); strscpy(alias, p ? p + 1 : compatible, len); return 0; } EXPORT_SYMBOL_GPL(of_alias_from_compatible); /** * of_find_node_by_phandle - Find a node given a phandle * @handle: phandle of the node to find * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_find_node_by_phandle(phandle handle) { struct device_node *np = NULL; unsigned long flags; u32 handle_hash; if (!handle) return NULL; handle_hash = of_phandle_cache_hash(handle); raw_spin_lock_irqsave(&devtree_lock, flags); if (phandle_cache[handle_hash] && handle == phandle_cache[handle_hash]->phandle) np = phandle_cache[handle_hash]; if (!np) { for_each_of_allnodes(np) if (np->phandle == handle && !of_node_check_flag(np, OF_DETACHED)) { phandle_cache[handle_hash] = np; break; } } of_node_get(np); raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_by_phandle); void of_print_phandle_args(const char *msg, const struct of_phandle_args *args) { int i; printk("%s %pOF", msg, args->np); for (i = 0; i < args->args_count; i++) { const char delim = i ? ',' : ':'; pr_cont("%c%08x", delim, args->args[i]); } pr_cont("\n"); } int of_phandle_iterator_init(struct of_phandle_iterator *it, const struct device_node *np, const char *list_name, const char *cells_name, int cell_count) { const __be32 *list; int size; memset(it, 0, sizeof(*it)); /* * one of cell_count or cells_name must be provided to determine the * argument length. 
*/ if (cell_count < 0 && !cells_name) return -EINVAL; list = of_get_property(np, list_name, &size); if (!list) return -ENOENT; it->cells_name = cells_name; it->cell_count = cell_count; it->parent = np; it->list_end = list + size / sizeof(*list); it->phandle_end = list; it->cur = list; return 0; } EXPORT_SYMBOL_GPL(of_phandle_iterator_init); int of_phandle_iterator_next(struct of_phandle_iterator *it) { uint32_t count = 0; if (it->node) { of_node_put(it->node); it->node = NULL; } if (!it->cur || it->phandle_end >= it->list_end) return -ENOENT; it->cur = it->phandle_end; /* If phandle is 0, then it is an empty entry with no arguments. */ it->phandle = be32_to_cpup(it->cur++); if (it->phandle) { /* * Find the provider node and parse the #*-cells property to * determine the argument length. */ it->node = of_find_node_by_phandle(it->phandle); if (it->cells_name) { if (!it->node) { pr_err("%pOF: could not find phandle %d\n", it->parent, it->phandle); goto err; } if (of_property_read_u32(it->node, it->cells_name, &count)) { /* * If both cell_count and cells_name is given, * fall back to cell_count in absence * of the cells_name property */ if (it->cell_count >= 0) { count = it->cell_count; } else { pr_err("%pOF: could not get %s for %pOF\n", it->parent, it->cells_name, it->node); goto err; } } } else { count = it->cell_count; } /* * Make sure that the arguments actually fit in the remaining * property data length */ if (it->cur + count > it->list_end) { if (it->cells_name) pr_err("%pOF: %s = %d found %td\n", it->parent, it->cells_name, count, it->list_end - it->cur); else pr_err("%pOF: phandle %s needs %d, found %td\n", it->parent, of_node_full_name(it->node), count, it->list_end - it->cur); goto err; } } it->phandle_end = it->cur + count; it->cur_count = count; return 0; err: if (it->node) { of_node_put(it->node); it->node = NULL; } return -EINVAL; } EXPORT_SYMBOL_GPL(of_phandle_iterator_next); int of_phandle_iterator_args(struct of_phandle_iterator *it, uint32_t *args, int size) { int i, count; count = it->cur_count; if (WARN_ON(size < count)) count = size; for (i = 0; i < count; i++) args[i] = be32_to_cpup(it->cur++); return count; } int __of_parse_phandle_with_args(const struct device_node *np, const char *list_name, const char *cells_name, int cell_count, int index, struct of_phandle_args *out_args) { struct of_phandle_iterator it; int rc, cur_index = 0; if (index < 0) return -EINVAL; /* Loop over the phandles until all the requested entry is found */ of_for_each_phandle(&it, rc, np, list_name, cells_name, cell_count) { /* * All of the error cases bail out of the loop, so at * this point, the parsing is successful. If the requested * index matches, then fill the out_args structure and return, * or return -ENOENT for an empty entry. */ rc = -ENOENT; if (cur_index == index) { if (!it.phandle) goto err; if (out_args) { int c; c = of_phandle_iterator_args(&it, out_args->args, MAX_PHANDLE_ARGS); out_args->np = it.node; out_args->args_count = c; } else { of_node_put(it.node); } /* Found it! 
return success */ return 0; } cur_index++; } /* * Unlock node before returning result; will be one of: * -ENOENT : index is for empty phandle * -EINVAL : parsing error on data */ err: of_node_put(it.node); return rc; } EXPORT_SYMBOL(__of_parse_phandle_with_args); /** * of_parse_phandle_with_args_map() - Find a node pointed by phandle in a list and remap it * @np: pointer to a device tree node containing a list * @list_name: property name that contains a list * @stem_name: stem of property names that specify phandles' arguments count * @index: index of a phandle to parse out * @out_args: optional pointer to output arguments structure (will be filled) * * This function is useful to parse lists of phandles and their arguments. * Returns 0 on success and fills out_args, on error returns appropriate errno * value. The difference between this function and of_parse_phandle_with_args() * is that this API remaps a phandle if the node the phandle points to has * a <@stem_name>-map property. * * Caller is responsible to call of_node_put() on the returned out_args->np * pointer. * * Example:: * * phandle1: node1 { * #list-cells = <2>; * }; * * phandle2: node2 { * #list-cells = <1>; * }; * * phandle3: node3 { * #list-cells = <1>; * list-map = <0 &phandle2 3>, * <1 &phandle2 2>, * <2 &phandle1 5 1>; * list-map-mask = <0x3>; * }; * * node4 { * list = <&phandle1 1 2 &phandle3 0>; * }; * * To get a device_node of the ``node2`` node you may call this: * of_parse_phandle_with_args(node4, "list", "list", 1, &args); */ int of_parse_phandle_with_args_map(const struct device_node *np, const char *list_name, const char *stem_name, int index, struct of_phandle_args *out_args) { char *cells_name __free(kfree) = kasprintf(GFP_KERNEL, "#%s-cells", stem_name); char *map_name __free(kfree) = kasprintf(GFP_KERNEL, "%s-map", stem_name); char *mask_name __free(kfree) = kasprintf(GFP_KERNEL, "%s-map-mask", stem_name); char *pass_name __free(kfree) = kasprintf(GFP_KERNEL, "%s-map-pass-thru", stem_name); struct device_node *cur, *new = NULL; const __be32 *map, *mask, *pass; static const __be32 dummy_mask[] = { [0 ... (MAX_PHANDLE_ARGS - 1)] = cpu_to_be32(~0) }; static const __be32 dummy_pass[] = { [0 ... 
(MAX_PHANDLE_ARGS - 1)] = cpu_to_be32(0) }; __be32 initial_match_array[MAX_PHANDLE_ARGS]; const __be32 *match_array = initial_match_array; int i, ret, map_len, match; u32 list_size, new_size; if (index < 0) return -EINVAL; if (!cells_name || !map_name || !mask_name || !pass_name) return -ENOMEM; ret = __of_parse_phandle_with_args(np, list_name, cells_name, -1, index, out_args); if (ret) return ret; /* Get the #<list>-cells property */ cur = out_args->np; ret = of_property_read_u32(cur, cells_name, &list_size); if (ret < 0) goto put; /* Precalculate the match array - this simplifies match loop */ for (i = 0; i < list_size; i++) initial_match_array[i] = cpu_to_be32(out_args->args[i]); ret = -EINVAL; while (cur) { /* Get the <list>-map property */ map = of_get_property(cur, map_name, &map_len); if (!map) { return 0; } map_len /= sizeof(u32); /* Get the <list>-map-mask property (optional) */ mask = of_get_property(cur, mask_name, NULL); if (!mask) mask = dummy_mask; /* Iterate through <list>-map property */ match = 0; while (map_len > (list_size + 1) && !match) { /* Compare specifiers */ match = 1; for (i = 0; i < list_size; i++, map_len--) match &= !((match_array[i] ^ *map++) & mask[i]); of_node_put(new); new = of_find_node_by_phandle(be32_to_cpup(map)); map++; map_len--; /* Check if not found */ if (!new) { ret = -EINVAL; goto put; } if (!of_device_is_available(new)) match = 0; ret = of_property_read_u32(new, cells_name, &new_size); if (ret) goto put; /* Check for malformed properties */ if (WARN_ON(new_size > MAX_PHANDLE_ARGS) || map_len < new_size) { ret = -EINVAL; goto put; } /* Move forward by new node's #<list>-cells amount */ map += new_size; map_len -= new_size; } if (!match) { ret = -ENOENT; goto put; } /* Get the <list>-map-pass-thru property (optional) */ pass = of_get_property(cur, pass_name, NULL); if (!pass) pass = dummy_pass; /* * Successfully parsed a <list>-map translation; copy new * specifier into the out_args structure, keeping the * bits specified in <list>-map-pass-thru. */ for (i = 0; i < new_size; i++) { __be32 val = *(map - new_size + i); if (i < list_size) { val &= ~pass[i]; val |= cpu_to_be32(out_args->args[i]) & pass[i]; } initial_match_array[i] = val; out_args->args[i] = be32_to_cpu(val); } out_args->args_count = list_size = new_size; /* Iterate again with new provider */ out_args->np = new; of_node_put(cur); cur = new; new = NULL; } put: of_node_put(cur); of_node_put(new); return ret; } EXPORT_SYMBOL(of_parse_phandle_with_args_map); /** * of_count_phandle_with_args() - Find the number of phandles references in a property * @np: pointer to a device tree node containing a list * @list_name: property name that contains a list * @cells_name: property name that specifies phandles' arguments count * * Return: The number of phandle + argument tuples within a property. It * is a typical pattern to encode a list of phandle and variable * arguments into a single property. The number of arguments is encoded * by a property in the phandle-target node. For example, a gpios * property would contain a list of GPIO specifies consisting of a * phandle and 1 or more arguments. The number of arguments are * determined by the #gpio-cells property in the node pointed to by the * phandle. */ int of_count_phandle_with_args(const struct device_node *np, const char *list_name, const char *cells_name) { struct of_phandle_iterator it; int rc, cur_index = 0; /* * If cells_name is NULL we assume a cell count of 0. 
This makes * counting the phandles trivial as each 32bit word in the list is a * phandle and no arguments are to consider. So we don't iterate through * the list but just use the length to determine the phandle count. */ if (!cells_name) { const __be32 *list; int size; list = of_get_property(np, list_name, &size); if (!list) return -ENOENT; return size / sizeof(*list); } rc = of_phandle_iterator_init(&it, np, list_name, cells_name, -1); if (rc) return rc; while ((rc = of_phandle_iterator_next(&it)) == 0) cur_index += 1; if (rc != -ENOENT) return rc; return cur_index; } EXPORT_SYMBOL(of_count_phandle_with_args); static struct property *__of_remove_property_from_list(struct property **list, struct property *prop) { struct property **next; for (next = list; *next; next = &(*next)->next) { if (*next == prop) { *next = prop->next; prop->next = NULL; return prop; } } return NULL; } /** * __of_add_property - Add a property to a node without lock operations * @np: Caller's Device Node * @prop: Property to add */ int __of_add_property(struct device_node *np, struct property *prop) { int rc = 0; unsigned long flags; struct property **next; raw_spin_lock_irqsave(&devtree_lock, flags); __of_remove_property_from_list(&np->deadprops, prop); prop->next = NULL; next = &np->properties; while (*next) { if (of_prop_cmp(prop->name, (*next)->name) == 0) { /* duplicate ! don't insert it */ rc = -EEXIST; goto out_unlock; } next = &(*next)->next; } *next = prop; out_unlock: raw_spin_unlock_irqrestore(&devtree_lock, flags); if (rc) return rc; __of_add_property_sysfs(np, prop); return 0; } /** * of_add_property - Add a property to a node * @np: Caller's Device Node * @prop: Property to add */ int of_add_property(struct device_node *np, struct property *prop) { int rc; mutex_lock(&of_mutex); rc = __of_add_property(np, prop); mutex_unlock(&of_mutex); if (!rc) of_property_notify(OF_RECONFIG_ADD_PROPERTY, np, prop, NULL); return rc; } EXPORT_SYMBOL_GPL(of_add_property); int __of_remove_property(struct device_node *np, struct property *prop) { unsigned long flags; int rc = -ENODEV; raw_spin_lock_irqsave(&devtree_lock, flags); if (__of_remove_property_from_list(&np->properties, prop)) { /* Found the property, add it to deadprops list */ prop->next = np->deadprops; np->deadprops = prop; rc = 0; } raw_spin_unlock_irqrestore(&devtree_lock, flags); if (rc) return rc; __of_remove_property_sysfs(np, prop); return 0; } /** * of_remove_property - Remove a property from a node. * @np: Caller's Device Node * @prop: Property to remove * * Note that we don't actually remove it, since we have given out * who-knows-how-many pointers to the data using get-property. * Instead we just move the property to the "dead properties" * list, so it won't be found any more. 
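 *
 * Example (editor's sketch, not from the original source; assumes @np
 * carries a "status" property)::
 *
 *	struct property *prop = of_find_property(np, "status", NULL);
 *
 *	if (prop)
 *		of_remove_property(np, prop);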
*/ int of_remove_property(struct device_node *np, struct property *prop) { int rc; if (!prop) return -ENODEV; mutex_lock(&of_mutex); rc = __of_remove_property(np, prop); mutex_unlock(&of_mutex); if (!rc) of_property_notify(OF_RECONFIG_REMOVE_PROPERTY, np, prop, NULL); return rc; } EXPORT_SYMBOL_GPL(of_remove_property); int __of_update_property(struct device_node *np, struct property *newprop, struct property **oldpropp) { struct property **next, *oldprop; unsigned long flags; raw_spin_lock_irqsave(&devtree_lock, flags); __of_remove_property_from_list(&np->deadprops, newprop); for (next = &np->properties; *next; next = &(*next)->next) { if (of_prop_cmp((*next)->name, newprop->name) == 0) break; } *oldpropp = oldprop = *next; if (oldprop) { /* replace the node */ newprop->next = oldprop->next; *next = newprop; oldprop->next = np->deadprops; np->deadprops = oldprop; } else { /* new node */ newprop->next = NULL; *next = newprop; } raw_spin_unlock_irqrestore(&devtree_lock, flags); __of_update_property_sysfs(np, newprop, oldprop); return 0; } /* * of_update_property - Update a property in a node, if the property does * not exist, add it. * * Note that we don't actually remove it, since we have given out * who-knows-how-many pointers to the data using get-property. * Instead we just move the property to the "dead properties" list, * and add the new property to the property list */ int of_update_property(struct device_node *np, struct property *newprop) { struct property *oldprop; int rc; if (!newprop->name) return -EINVAL; mutex_lock(&of_mutex); rc = __of_update_property(np, newprop, &oldprop); mutex_unlock(&of_mutex); if (!rc) of_property_notify(OF_RECONFIG_UPDATE_PROPERTY, np, newprop, oldprop); return rc; } static void of_alias_add(struct alias_prop *ap, struct device_node *np, int id, const char *stem, int stem_len) { ap->np = np; ap->id = id; strscpy(ap->stem, stem, stem_len + 1); list_add_tail(&ap->link, &aliases_lookup); pr_debug("adding DT alias:%s: stem=%s id=%i node=%pOF\n", ap->alias, ap->stem, ap->id, np); } /** * of_alias_scan - Scan all properties of the 'aliases' node * @dt_alloc: An allocator that provides a virtual address to memory * for storing the resulting tree * * The function scans all the properties of the 'aliases' node and populates * the global lookup table with the properties. 
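 *
 * Example (editor's sketch of a typical 'aliases' node; labels and paths are
 * illustrative)::
 *
 *	aliases {
 *		serial0 = &uart0;
 *		serial1 = "/soc/serial@12340000";
 *	};
 *
 * Each property name is split into a stem ("serial") and a trailing numeric
 * id (0, 1); the value must resolve to the full path of the target node.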
*/ void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align)) { const struct property *pp; of_aliases = of_find_node_by_path("/aliases"); of_chosen = of_find_node_by_path("/chosen"); if (of_chosen == NULL) of_chosen = of_find_node_by_path("/chosen@0"); if (of_chosen) { /* linux,stdout-path and /aliases/stdout are for legacy compatibility */ const char *name = NULL; if (of_property_read_string(of_chosen, "stdout-path", &name)) of_property_read_string(of_chosen, "linux,stdout-path", &name); if (IS_ENABLED(CONFIG_PPC) && !name) of_property_read_string(of_aliases, "stdout", &name); if (name) of_stdout = of_find_node_opts_by_path(name, &of_stdout_options); if (of_stdout) fwnode_set_flag(&of_stdout->fwnode, FWNODE_FLAG_BEST_EFFORT); } if (!of_aliases) return; for_each_property_of_node(of_aliases, pp) { const char *start = pp->name; const char *end = start + strlen(start); struct device_node *np; struct alias_prop *ap; int id, len; /* Skip those we do not want to proceed */ if (is_pseudo_property(pp->name)) continue; np = of_find_node_by_path(pp->value); if (!np) continue; /* walk the alias backwards to extract the id and work out * the 'stem' string */ while (isdigit(*(end-1)) && end > start) end--; len = end - start; if (kstrtoint(end, 10, &id) < 0) { of_node_put(np); continue; } /* Allocate an alias_prop with enough space for the stem */ ap = dt_alloc(sizeof(*ap) + len + 1, __alignof__(*ap)); if (!ap) { of_node_put(np); continue; } memset(ap, 0, sizeof(*ap) + len + 1); ap->alias = start; of_alias_add(ap, np, id, start, len); } } /** * of_alias_get_id - Get alias id for the given device_node * @np: Pointer to the given device_node * @stem: Alias stem of the given device_node * * The function travels the lookup table to get the alias id for the given * device_node and alias stem. * * Return: The alias id if found. */ int of_alias_get_id(const struct device_node *np, const char *stem) { struct alias_prop *app; int id = -ENODEV; mutex_lock(&of_mutex); list_for_each_entry(app, &aliases_lookup, link) { if (strcmp(app->stem, stem) != 0) continue; if (np == app->np) { id = app->id; break; } } mutex_unlock(&of_mutex); return id; } EXPORT_SYMBOL_GPL(of_alias_get_id); /** * of_alias_get_highest_id - Get highest alias id for the given stem * @stem: Alias stem to be examined * * The function travels the lookup table to get the highest alias id for the * given alias stem. It returns the alias id if found. */ int of_alias_get_highest_id(const char *stem) { struct alias_prop *app; int id = -ENODEV; mutex_lock(&of_mutex); list_for_each_entry(app, &aliases_lookup, link) { if (strcmp(app->stem, stem) != 0) continue; if (app->id > id) id = app->id; } mutex_unlock(&of_mutex); return id; } EXPORT_SYMBOL_GPL(of_alias_get_highest_id); /** * of_console_check() - Test and setup console for DT setup * @dn: Pointer to device node * @name: Name to use for preferred console without index. ex. "ttyS" * @index: Index to use for preferred console. * * Check if the given device node matches the stdout-path property in the * /chosen node. If it does then register it as the preferred console. * * Return: TRUE if console successfully setup. Otherwise return FALSE. */ bool of_console_check(const struct device_node *dn, char *name, int index) { if (!dn || dn != of_stdout || console_set_on_cmdline) return false; /* * XXX: cast `options' to char pointer to suppress complication * warnings: printk, UART and console drivers expect char pointer. 
*/ return !add_preferred_console(name, index, (char *)of_stdout_options); } EXPORT_SYMBOL_GPL(of_console_check); /** * of_find_next_cache_node - Find a node's subsidiary cache * @np: node of type "cpu" or "cache" * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. Caller should hold a reference * to np. */ struct device_node *of_find_next_cache_node(const struct device_node *np) { struct device_node *child, *cache_node; cache_node = of_parse_phandle(np, "l2-cache", 0); if (!cache_node) cache_node = of_parse_phandle(np, "next-level-cache", 0); if (cache_node) return cache_node; /* OF on pmac has nodes instead of properties named "l2-cache" * beneath CPU nodes. */ if (IS_ENABLED(CONFIG_PPC_PMAC) && of_node_is_type(np, "cpu")) for_each_child_of_node(np, child) if (of_node_is_type(child, "cache")) return child; return NULL; } /** * of_find_last_cache_level - Find the level at which the last cache is * present for the given logical cpu * * @cpu: cpu number(logical index) for which the last cache level is needed * * Return: The level at which the last cache is present. It is exactly * same as the total number of cache levels for the given logical cpu. */ int of_find_last_cache_level(unsigned int cpu) { u32 cache_level = 0; struct device_node *prev = NULL, *np = of_cpu_device_node_get(cpu); while (np) { of_node_put(prev); prev = np; np = of_find_next_cache_node(np); } of_property_read_u32(prev, "cache-level", &cache_level); of_node_put(prev); return cache_level; } /** * of_map_id - Translate an ID through a downstream mapping. * @np: root complex device node. * @id: device ID to map. * @map_name: property name of the map to use. * @map_mask_name: optional property name of the mask to use. * @target: optional pointer to a target device node. * @id_out: optional pointer to receive the translated ID. * * Given a device ID, look up the appropriate implementation-defined * platform ID and/or the target device which receives transactions on that * ID, as per the "iommu-map" and "msi-map" bindings. Either of @target or * @id_out may be NULL if only the other is required. If @target points to * a non-NULL device node pointer, only entries targeting that node will be * matched; if it points to a NULL value, it will receive the device node of * the first matching target phandle, with a reference held. * * Return: 0 on success or a standard error code on failure. */ int of_map_id(const struct device_node *np, u32 id, const char *map_name, const char *map_mask_name, struct device_node **target, u32 *id_out) { u32 map_mask, masked_id; int map_len; const __be32 *map = NULL; if (!np || !map_name || (!target && !id_out)) return -EINVAL; map = of_get_property(np, map_name, &map_len); if (!map) { if (target) return -ENODEV; /* Otherwise, no map implies no translation */ *id_out = id; return 0; } if (!map_len || map_len % (4 * sizeof(*map))) { pr_err("%pOF: Error: Bad %s length: %d\n", np, map_name, map_len); return -EINVAL; } /* The default is to select all bits. */ map_mask = 0xffffffff; /* * Can be overridden by "{iommu,msi}-map-mask" property. * If of_property_read_u32() fails, the default is used. 
 */
	if (map_mask_name)
		of_property_read_u32(np, map_mask_name, &map_mask);

	masked_id = map_mask & id;
	for ( ; map_len > 0; map_len -= 4 * sizeof(*map), map += 4) {
		struct device_node *phandle_node;
		u32 id_base = be32_to_cpup(map + 0);
		u32 phandle = be32_to_cpup(map + 1);
		u32 out_base = be32_to_cpup(map + 2);
		u32 id_len = be32_to_cpup(map + 3);

		if (id_base & ~map_mask) {
			pr_err("%pOF: Invalid %s translation - %s-mask (0x%x) ignores id-base (0x%x)\n",
				np, map_name, map_name, map_mask, id_base);
			return -EFAULT;
		}

		if (masked_id < id_base || masked_id >= id_base + id_len)
			continue;

		phandle_node = of_find_node_by_phandle(phandle);
		if (!phandle_node)
			return -ENODEV;

		if (target) {
			if (*target)
				of_node_put(phandle_node);
			else
				*target = phandle_node;

			if (*target != phandle_node)
				continue;
		}

		if (id_out)
			*id_out = masked_id - id_base + out_base;

		pr_debug("%pOF: %s, using mask %08x, id-base: %08x, out-base: %08x, length: %08x, id: %08x -> %08x\n",
			np, map_name, map_mask, id_base, out_base, id_len,
			id, masked_id - id_base + out_base);
		return 0;
	}

	pr_info("%pOF: no %s translation for id 0x%x on %pOF\n", np, map_name,
		id, target && *target ? *target : NULL);

	/* Bypasses translation */
	if (id_out)
		*id_out = id;

	return 0;
}
EXPORT_SYMBOL_GPL(of_map_id);
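/*
 * Editor's note: a minimal usage sketch for the phandle helpers above; it is
 * not part of the original source. The "clocks"/"#clock-cells" property names
 * and the example function are illustrative assumptions.
 */
#include <linux/of.h>

static int example_list_clock_providers(const struct device_node *np)
{
	struct of_phandle_args args;
	int i, count;

	/* Number of <phandle + argument> tuples in the "clocks" property */
	count = of_count_phandle_with_args(np, "clocks", "#clock-cells");
	if (count < 0)
		return count;

	for (i = 0; i < count; i++) {
		/* args.np comes back with its refcount incremented */
		if (of_parse_phandle_with_args(np, "clocks", "#clock-cells",
					       i, &args))
			continue;

		pr_info("entry %d: provider %pOF, %d argument cells\n",
			i, args.np, args.args_count);
		of_node_put(args.np);
	}

	return count;
}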
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Statically sized hash table implementation
 * (C) 2012 Sasha Levin <levinsasha928@gmail.com>
 */

#ifndef _LINUX_HASHTABLE_H
#define _LINUX_HASHTABLE_H

#include <linux/list.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/rculist.h>

#define DEFINE_HASHTABLE(name, bits)						\
	struct hlist_head name[1 << (bits)] =					\
			{ [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT }

#define DEFINE_READ_MOSTLY_HASHTABLE(name, bits)				\
	struct hlist_head name[1 << (bits)] __read_mostly =			\
			{ [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT }

#define DECLARE_HASHTABLE(name, bits)						\
	struct hlist_head name[1 << (bits)]

#define HASH_SIZE(name) (ARRAY_SIZE(name))
#define HASH_BITS(name) ilog2(HASH_SIZE(name))

/* Use hash_32 when possible to allow for fast 32bit hashing in 64bit kernels. */
#define hash_min(val, bits)							\
	(sizeof(val) <= 4 ? hash_32(val, bits) : hash_long(val, bits))

static inline void __hash_init(struct hlist_head *ht, unsigned int sz)
{
	unsigned int i;

	for (i = 0; i < sz; i++)
		INIT_HLIST_HEAD(&ht[i]);
}

/**
 * hash_init - initialize a hash table
 * @hashtable: hashtable to be initialized
 *
 * Calculates the size of the hashtable from the given parameter, otherwise
 * same as hash_init_size.
 *
 * This has to be a macro since HASH_BITS() will not work on pointers since
 * it calculates the size during preprocessing.
 */
#define hash_init(hashtable) __hash_init(hashtable, HASH_SIZE(hashtable))

/**
 * hash_add - add an object to a hashtable
 * @hashtable: hashtable to add to
 * @node: the &struct hlist_node of the object to be added
 * @key: the key of the object to be added
 */
#define hash_add(hashtable, node, key)						\
	hlist_add_head(node, &hashtable[hash_min(key, HASH_BITS(hashtable))])

/**
 * hash_add_rcu - add an object to a rcu enabled hashtable
 * @hashtable: hashtable to add to
 * @node: the &struct hlist_node of the object to be added
 * @key: the key of the object to be added
 */
#define hash_add_rcu(hashtable, node, key)					\
	hlist_add_head_rcu(node, &hashtable[hash_min(key, HASH_BITS(hashtable))])

/**
 * hash_hashed - check whether an object is in any hashtable
 * @node: the &struct hlist_node of the object to be checked
 */
static inline bool hash_hashed(struct hlist_node *node)
{
	return !hlist_unhashed(node);
}

static inline bool __hash_empty(struct hlist_head *ht, unsigned int sz)
{
	unsigned int i;

	for (i = 0; i < sz; i++)
		if (!hlist_empty(&ht[i]))
			return false;

	return true;
}

/**
 * hash_empty - check whether a hashtable is empty
 * @hashtable: hashtable to check
 *
 * This has to be a macro since HASH_BITS() will not work on pointers since
 * it calculates the size during preprocessing.
*/ #define hash_empty(hashtable) __hash_empty(hashtable, HASH_SIZE(hashtable)) /** * hash_del - remove an object from a hashtable * @node: &struct hlist_node of the object to remove */ static inline void hash_del(struct hlist_node *node) { hlist_del_init(node); } /** * hash_del_rcu - remove an object from a rcu enabled hashtable * @node: &struct hlist_node of the object to remove */ static inline void hash_del_rcu(struct hlist_node *node) { hlist_del_init_rcu(node); } /** * hash_for_each - iterate over a hashtable * @name: hashtable to iterate * @bkt: integer to use as bucket loop cursor * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct */ #define hash_for_each(name, bkt, obj, member) \ for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ (bkt)++)\ hlist_for_each_entry(obj, &name[bkt], member) /** * hash_for_each_rcu - iterate over a rcu enabled hashtable * @name: hashtable to iterate * @bkt: integer to use as bucket loop cursor * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct */ #define hash_for_each_rcu(name, bkt, obj, member) \ for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ (bkt)++)\ hlist_for_each_entry_rcu(obj, &name[bkt], member) /** * hash_for_each_safe - iterate over a hashtable safe against removal of * hash entry * @name: hashtable to iterate * @bkt: integer to use as bucket loop cursor * @tmp: a &struct hlist_node used for temporary storage * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct */ #define hash_for_each_safe(name, bkt, tmp, obj, member) \ for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ (bkt)++)\ hlist_for_each_entry_safe(obj, tmp, &name[bkt], member) /** * hash_for_each_possible - iterate over all possible objects hashing to the * same bucket * @name: hashtable to iterate * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct * @key: the key of the objects to iterate over */ #define hash_for_each_possible(name, obj, member, key) \ hlist_for_each_entry(obj, &name[hash_min(key, HASH_BITS(name))], member) /** * hash_for_each_possible_rcu - iterate over all possible objects hashing to the * same bucket in an rcu enabled hashtable * @name: hashtable to iterate * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct * @key: the key of the objects to iterate over */ #define hash_for_each_possible_rcu(name, obj, member, key, cond...) \ hlist_for_each_entry_rcu(obj, &name[hash_min(key, HASH_BITS(name))],\ member, ## cond) /** * hash_for_each_possible_rcu_notrace - iterate over all possible objects hashing * to the same bucket in an rcu enabled hashtable in a rcu enabled hashtable * @name: hashtable to iterate * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct * @key: the key of the objects to iterate over * * This is the same as hash_for_each_possible_rcu() except that it does * not do any RCU debugging or tracing. 
 */
#define hash_for_each_possible_rcu_notrace(name, obj, member, key) \
	hlist_for_each_entry_rcu_notrace(obj, \
		&name[hash_min(key, HASH_BITS(name))], member)

/**
 * hash_for_each_possible_safe - iterate over all possible objects hashing to the
 * same bucket safe against removals
 * @name: hashtable to iterate
 * @obj: the type * to use as a loop cursor for each entry
 * @tmp: a &struct hlist_node used for temporary storage
 * @member: the name of the hlist_node within the struct
 * @key: the key of the objects to iterate over
 */
#define hash_for_each_possible_safe(name, obj, tmp, member, key)	\
	hlist_for_each_entry_safe(obj, tmp,\
		&name[hash_min(key, HASH_BITS(name))], member)

#endif
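/*
 * Editor's note: a minimal, self-contained usage sketch for the hashtable API
 * above; it is not part of the original header. The object type, table size
 * and key values are illustrative assumptions.
 */
#include <linux/hashtable.h>
#include <linux/slab.h>

struct example_obj {
	int key;
	struct hlist_node node;	/* links the object into one bucket */
};

/* 2^4 = 16 buckets, sized statically at compile time */
static DEFINE_HASHTABLE(example_table, 4);

static void example_usage(void)
{
	struct example_obj *obj, *found = NULL;
	int bkt;

	obj = kzalloc(sizeof(*obj), GFP_KERNEL);
	if (!obj)
		return;
	obj->key = 42;

	/* Insert: the key is hashed with hash_min() to pick a bucket */
	hash_add(example_table, &obj->node, obj->key);

	/* Lookup: walk only the bucket that key 42 hashes to */
	hash_for_each_possible(example_table, found, node, 42)
		if (found->key == 42)
			break;

	/* Full-table iteration across all buckets */
	hash_for_each(example_table, bkt, obj, node)
		pr_info("bucket %d holds key %d\n", bkt, obj->key);

	/* Removal */
	if (found) {
		hash_del(&found->node);
		kfree(found);
	}
}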
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 * Copyright (C) 2016 - 2020 Christoph Hellwig
 */

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/device_cgroup.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/magic.h>
#include <linux/buffer_head.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/part_stat.h>
#include <linux/uaccess.h>
#include <linux/stat.h>
#include "../fs/internal.h"
#include "blk.h"

/* Should we allow writing to mounted block devices?
*/ static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED); struct bdev_inode { struct block_device bdev; struct inode vfs_inode; }; static inline struct bdev_inode *BDEV_I(struct inode *inode) { return container_of(inode, struct bdev_inode, vfs_inode); } static inline struct inode *BD_INODE(struct block_device *bdev) { return &container_of(bdev, struct bdev_inode, bdev)->vfs_inode; } struct block_device *I_BDEV(struct inode *inode) { return &BDEV_I(inode)->bdev; } EXPORT_SYMBOL(I_BDEV); struct block_device *file_bdev(struct file *bdev_file) { return I_BDEV(bdev_file->f_mapping->host); } EXPORT_SYMBOL(file_bdev); static void bdev_write_inode(struct block_device *bdev) { struct inode *inode = BD_INODE(bdev); int ret; spin_lock(&inode->i_lock); while (inode_state_read(inode) & I_DIRTY) { spin_unlock(&inode->i_lock); ret = write_inode_now(inode, true); if (ret) pr_warn_ratelimited( "VFS: Dirty inode writeback failed for block device %pg (err=%d).\n", bdev, ret); spin_lock(&inode->i_lock); } spin_unlock(&inode->i_lock); } /* Kill _all_ buffers and pagecache , dirty or not.. */ static void kill_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_mapping; if (mapping_empty(mapping)) return; invalidate_bh_lrus(); truncate_inode_pages(mapping, 0); } /* Invalidate clean unused buffers and pagecache. */ void invalidate_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_mapping; if (mapping->nrpages) { invalidate_bh_lrus(); lru_add_drain_all(); /* make sure all lru add caches are flushed */ invalidate_mapping_pages(mapping, 0, -1); } } EXPORT_SYMBOL(invalidate_bdev); /* * Drop all buffers & page cache for given bdev range. This function bails * with error if bdev has other exclusive owner (such as filesystem). */ int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode, loff_t lstart, loff_t lend) { /* * If we don't hold exclusive handle for the device, upgrade to it * while we discard the buffer cache to avoid discarding buffers * under live filesystem. */ if (!(mode & BLK_OPEN_EXCL)) { int err = bd_prepare_to_claim(bdev, truncate_bdev_range, NULL); if (err) goto invalidate; } truncate_inode_pages_range(bdev->bd_mapping, lstart, lend); if (!(mode & BLK_OPEN_EXCL)) bd_abort_claiming(bdev, truncate_bdev_range); return 0; invalidate: /* * Someone else has handle exclusively open. Try invalidating instead. * The 'end' argument is inclusive so the rounding is safe. */ return invalidate_inode_pages2_range(bdev->bd_mapping, lstart >> PAGE_SHIFT, lend >> PAGE_SHIFT); } static void set_init_blocksize(struct block_device *bdev) { unsigned int bsize = bdev_logical_block_size(bdev); loff_t size = i_size_read(BD_INODE(bdev)); while (bsize < PAGE_SIZE) { if (size & bsize) break; bsize <<= 1; } BD_INODE(bdev)->i_blkbits = blksize_bits(bsize); mapping_set_folio_min_order(BD_INODE(bdev)->i_mapping, get_order(bsize)); } /** * bdev_validate_blocksize - check that this block size is acceptable * @bdev: blockdevice to check * @block_size: block size to check * * For block device users that do not use buffer heads or the block device * page cache, make sure that this block size can be used with the device. * * Return: On success zero is returned, negative error code on failure. 
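 *
 * Example (editor's sketch of a caller; 'new_size' is an illustrative,
 * driver-chosen value)::
 *
 *	ret = bdev_validate_blocksize(bdev, new_size);
 *	if (ret)
 *		return ret;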
*/ int bdev_validate_blocksize(struct block_device *bdev, int block_size) { if (blk_validate_block_size(block_size)) return -EINVAL; /* Size cannot be smaller than the size supported by the device */ if (block_size < bdev_logical_block_size(bdev)) return -EINVAL; return 0; } EXPORT_SYMBOL_GPL(bdev_validate_blocksize); int set_blocksize(struct file *file, int size) { struct inode *inode = file->f_mapping->host; struct block_device *bdev = I_BDEV(inode); int ret; ret = bdev_validate_blocksize(bdev, size); if (ret) return ret; if (!file->private_data) return -EINVAL; /* Don't change the size if it is same as current */ if (inode->i_blkbits != blksize_bits(size)) { /* * Flush and truncate the pagecache before we reconfigure the * mapping geometry because folio sizes are variable now. If a * reader has already allocated a folio whose size is smaller * than the new min_order but invokes readahead after the new * min_order becomes visible, readahead will think there are * "zero" blocks per folio and crash. Take the inode and * invalidation locks to avoid racing with * read/write/fallocate. */ inode_lock(inode); filemap_invalidate_lock(inode->i_mapping); sync_blockdev(bdev); kill_bdev(bdev); inode->i_blkbits = blksize_bits(size); mapping_set_folio_min_order(inode->i_mapping, get_order(size)); filemap_invalidate_unlock(inode->i_mapping); inode_unlock(inode); } return 0; } EXPORT_SYMBOL(set_blocksize); static int sb_validate_large_blocksize(struct super_block *sb, int size) { const char *err_str = NULL; if (!(sb->s_type->fs_flags & FS_LBS)) err_str = "not supported by filesystem"; else if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) err_str = "is only supported with CONFIG_TRANSPARENT_HUGEPAGE"; if (!err_str) return 0; pr_warn_ratelimited("%s: block size(%d) > page size(%lu) %s\n", sb->s_type->name, size, PAGE_SIZE, err_str); return -EINVAL; } int sb_set_blocksize(struct super_block *sb, int size) { if (size > PAGE_SIZE && sb_validate_large_blocksize(sb, size)) return 0; if (set_blocksize(sb->s_bdev_file, size)) return 0; /* If we get here, we know size is validated */ sb->s_blocksize = size; sb->s_blocksize_bits = blksize_bits(size); return sb->s_blocksize; } EXPORT_SYMBOL(sb_set_blocksize); int __must_check sb_min_blocksize(struct super_block *sb, int size) { int minsize = bdev_logical_block_size(sb->s_bdev); if (size < minsize) size = minsize; return sb_set_blocksize(sb, size); } EXPORT_SYMBOL(sb_min_blocksize); int sync_blockdev_nowait(struct block_device *bdev) { if (!bdev) return 0; return filemap_flush(bdev->bd_mapping); } EXPORT_SYMBOL_GPL(sync_blockdev_nowait); /* * Write out and wait upon all the dirty data associated with a block * device via its mapping. Does not take the superblock lock. */ int sync_blockdev(struct block_device *bdev) { if (!bdev) return 0; return filemap_write_and_wait(bdev->bd_mapping); } EXPORT_SYMBOL(sync_blockdev); int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend) { return filemap_write_and_wait_range(bdev->bd_mapping, lstart, lend); } EXPORT_SYMBOL(sync_blockdev_range); /** * bdev_freeze - lock a filesystem and force it into a consistent state * @bdev: blockdevice to lock * * If a superblock is found on this device, we take the s_umount semaphore * on it to make sure nobody unmounts until the snapshot creation is done. * The reference counter (bd_fsfreeze_count) guarantees that only the last * unfreeze process can unfreeze the frozen filesystem actually when multiple * freeze requests arrive simultaneously. 
It counts up in bdev_freeze() and * count down in bdev_thaw(). When it becomes 0, thaw_bdev() will unfreeze * actually. * * Return: On success zero is returned, negative error code on failure. */ int bdev_freeze(struct block_device *bdev) { int error = 0; mutex_lock(&bdev->bd_fsfreeze_mutex); if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) { mutex_unlock(&bdev->bd_fsfreeze_mutex); return 0; } mutex_lock(&bdev->bd_holder_lock); if (bdev->bd_holder_ops && bdev->bd_holder_ops->freeze) { error = bdev->bd_holder_ops->freeze(bdev); lockdep_assert_not_held(&bdev->bd_holder_lock); } else { mutex_unlock(&bdev->bd_holder_lock); error = sync_blockdev(bdev); } if (error) atomic_dec(&bdev->bd_fsfreeze_count); mutex_unlock(&bdev->bd_fsfreeze_mutex); return error; } EXPORT_SYMBOL(bdev_freeze); /** * bdev_thaw - unlock filesystem * @bdev: blockdevice to unlock * * Unlocks the filesystem and marks it writeable again after bdev_freeze(). * * Return: On success zero is returned, negative error code on failure. */ int bdev_thaw(struct block_device *bdev) { int error = -EINVAL, nr_freeze; mutex_lock(&bdev->bd_fsfreeze_mutex); /* * If this returns < 0 it means that @bd_fsfreeze_count was * already 0 and no decrement was performed. */ nr_freeze = atomic_dec_if_positive(&bdev->bd_fsfreeze_count); if (nr_freeze < 0) goto out; error = 0; if (nr_freeze > 0) goto out; mutex_lock(&bdev->bd_holder_lock); if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) { error = bdev->bd_holder_ops->thaw(bdev); lockdep_assert_not_held(&bdev->bd_holder_lock); } else { mutex_unlock(&bdev->bd_holder_lock); } if (error) atomic_inc(&bdev->bd_fsfreeze_count); out: mutex_unlock(&bdev->bd_fsfreeze_mutex); return error; } EXPORT_SYMBOL(bdev_thaw); /* * pseudo-fs */ static __cacheline_aligned_in_smp DEFINE_MUTEX(bdev_lock); static struct kmem_cache *bdev_cachep __ro_after_init; static struct inode *bdev_alloc_inode(struct super_block *sb) { struct bdev_inode *ei = alloc_inode_sb(sb, bdev_cachep, GFP_KERNEL); if (!ei) return NULL; memset(&ei->bdev, 0, sizeof(ei->bdev)); if (security_bdev_alloc(&ei->bdev)) { kmem_cache_free(bdev_cachep, ei); return NULL; } return &ei->vfs_inode; } static void bdev_free_inode(struct inode *inode) { struct block_device *bdev = I_BDEV(inode); free_percpu(bdev->bd_stats); kfree(bdev->bd_meta_info); security_bdev_free(bdev); if (!bdev_is_partition(bdev)) { if (bdev->bd_disk && bdev->bd_disk->bdi) bdi_put(bdev->bd_disk->bdi); kfree(bdev->bd_disk); } if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR) blk_free_ext_minor(MINOR(bdev->bd_dev)); kmem_cache_free(bdev_cachep, BDEV_I(inode)); } static void init_once(void *data) { struct bdev_inode *ei = data; inode_init_once(&ei->vfs_inode); } static const struct super_operations bdev_sops = { .statfs = simple_statfs, .alloc_inode = bdev_alloc_inode, .free_inode = bdev_free_inode, .drop_inode = inode_just_drop, }; static int bd_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC); if (!ctx) return -ENOMEM; fc->s_iflags |= SB_I_CGROUPWB; ctx->ops = &bdev_sops; return 0; } static struct file_system_type bd_type = { .name = "bdev", .init_fs_context = bd_init_fs_context, .kill_sb = kill_anon_super, }; struct super_block *blockdev_superblock __ro_after_init; static struct vfsmount *blockdev_mnt __ro_after_init; EXPORT_SYMBOL_GPL(blockdev_superblock); void __init bdev_cache_init(void) { int err; bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 
SLAB_ACCOUNT|SLAB_PANIC), init_once); err = register_filesystem(&bd_type); if (err) panic("Cannot register bdev pseudo-fs"); blockdev_mnt = kern_mount(&bd_type); if (IS_ERR(blockdev_mnt)) panic("Cannot create bdev pseudo-fs"); blockdev_superblock = blockdev_mnt->mnt_sb; /* For writeback */ } struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) { struct block_device *bdev; struct inode *inode; inode = new_inode(blockdev_superblock); if (!inode) return NULL; inode->i_mode = S_IFBLK; inode->i_rdev = 0; inode->i_data.a_ops = &def_blk_aops; mapping_set_gfp_mask(&inode->i_data, GFP_USER); bdev = I_BDEV(inode); mutex_init(&bdev->bd_fsfreeze_mutex); spin_lock_init(&bdev->bd_size_lock); mutex_init(&bdev->bd_holder_lock); atomic_set(&bdev->__bd_flags, partno); bdev->bd_mapping = &inode->i_data; bdev->bd_queue = disk->queue; if (partno && bdev_test_flag(disk->part0, BD_HAS_SUBMIT_BIO)) bdev_set_flag(bdev, BD_HAS_SUBMIT_BIO); bdev->bd_stats = alloc_percpu(struct disk_stats); if (!bdev->bd_stats) { iput(inode); return NULL; } bdev->bd_disk = disk; return bdev; } void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors) { spin_lock(&bdev->bd_size_lock); i_size_write(BD_INODE(bdev), (loff_t)sectors << SECTOR_SHIFT); bdev->bd_nr_sectors = sectors; spin_unlock(&bdev->bd_size_lock); } void bdev_add(struct block_device *bdev, dev_t dev) { struct inode *inode = BD_INODE(bdev); if (bdev_stable_writes(bdev)) mapping_set_stable_writes(bdev->bd_mapping); bdev->bd_dev = dev; inode->i_rdev = dev; inode->i_ino = dev; insert_inode_hash(inode); } void bdev_unhash(struct block_device *bdev) { remove_inode_hash(BD_INODE(bdev)); } void bdev_drop(struct block_device *bdev) { iput(BD_INODE(bdev)); } long nr_blockdev_pages(void) { struct inode *inode; long ret = 0; spin_lock(&blockdev_superblock->s_inode_list_lock); list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) ret += inode->i_mapping->nrpages; spin_unlock(&blockdev_superblock->s_inode_list_lock); return ret; } /** * bd_may_claim - test whether a block device can be claimed * @bdev: block device of interest * @holder: holder trying to claim @bdev * @hops: holder ops * * Test whether @bdev can be claimed by @holder. * * RETURNS: * %true if @bdev can be claimed, %false otherwise. */ static bool bd_may_claim(struct block_device *bdev, void *holder, const struct blk_holder_ops *hops) { struct block_device *whole = bdev_whole(bdev); lockdep_assert_held(&bdev_lock); if (bdev->bd_holder) { /* * The same holder can always re-claim. */ if (bdev->bd_holder == holder) { if (WARN_ON_ONCE(bdev->bd_holder_ops != hops)) return false; return true; } return false; } /* * If the whole devices holder is set to bd_may_claim, a partition on * the device is claimed, but not the whole device. */ if (whole != bdev && whole->bd_holder && whole->bd_holder != bd_may_claim) return false; return true; } /** * bd_prepare_to_claim - claim a block device * @bdev: block device of interest * @holder: holder trying to claim @bdev * @hops: holder ops. * * Claim @bdev. This function fails if @bdev is already claimed by another * holder and waits if another claiming is in progress. return, the caller * has ownership of bd_claiming and bd_holder[s]. * * RETURNS: * 0 if @bdev can be claimed, -EBUSY otherwise. 
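 *
 * Example (editor's sketch of the claim/abort pattern, as used by
 * truncate_bdev_range() above; 'holder' is whatever token identifies the
 * claimer)::
 *
 *	err = bd_prepare_to_claim(bdev, holder, NULL);
 *	if (err)
 *		return err;
 *	... exclusive work against bdev ...
 *	bd_abort_claiming(bdev, holder);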
*/ int bd_prepare_to_claim(struct block_device *bdev, void *holder, const struct blk_holder_ops *hops) { struct block_device *whole = bdev_whole(bdev); if (WARN_ON_ONCE(!holder)) return -EINVAL; retry: mutex_lock(&bdev_lock); /* if someone else claimed, fail */ if (!bd_may_claim(bdev, holder, hops)) { mutex_unlock(&bdev_lock); return -EBUSY; } /* if claiming is already in progress, wait for it to finish */ if (whole->bd_claiming) { wait_queue_head_t *wq = __var_waitqueue(&whole->bd_claiming); DEFINE_WAIT(wait); prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&bdev_lock); schedule(); finish_wait(wq, &wait); goto retry; } /* yay, all mine */ whole->bd_claiming = holder; mutex_unlock(&bdev_lock); return 0; } EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */ static void bd_clear_claiming(struct block_device *whole, void *holder) { lockdep_assert_held(&bdev_lock); /* tell others that we're done */ BUG_ON(whole->bd_claiming != holder); whole->bd_claiming = NULL; wake_up_var(&whole->bd_claiming); } /** * bd_finish_claiming - finish claiming of a block device * @bdev: block device of interest * @holder: holder that has claimed @bdev * @hops: block device holder operations * * Finish exclusive open of a block device. Mark the device as exlusively * open by the holder and wake up all waiters for exclusive open to finish. */ static void bd_finish_claiming(struct block_device *bdev, void *holder, const struct blk_holder_ops *hops) { struct block_device *whole = bdev_whole(bdev); mutex_lock(&bdev_lock); BUG_ON(!bd_may_claim(bdev, holder, hops)); /* * Note that for a whole device bd_holders will be incremented twice, * and bd_holder will be set to bd_may_claim before being set to holder */ whole->bd_holders++; whole->bd_holder = bd_may_claim; bdev->bd_holders++; mutex_lock(&bdev->bd_holder_lock); bdev->bd_holder = holder; bdev->bd_holder_ops = hops; mutex_unlock(&bdev->bd_holder_lock); bd_clear_claiming(whole, holder); mutex_unlock(&bdev_lock); } /** * bd_abort_claiming - abort claiming of a block device * @bdev: block device of interest * @holder: holder that has claimed @bdev * * Abort claiming of a block device when the exclusive open failed. This can be * also used when exclusive open is not actually desired and we just needed * to block other exclusive openers for a while. */ void bd_abort_claiming(struct block_device *bdev, void *holder) { mutex_lock(&bdev_lock); bd_clear_claiming(bdev_whole(bdev), holder); mutex_unlock(&bdev_lock); } EXPORT_SYMBOL(bd_abort_claiming); static void bd_end_claim(struct block_device *bdev, void *holder) { struct block_device *whole = bdev_whole(bdev); bool unblock = false; /* * Release a claim on the device. The holder fields are protected with * bdev_lock. open_mutex is used to synchronize disk_holder unlinking. */ mutex_lock(&bdev_lock); WARN_ON_ONCE(bdev->bd_holder != holder); WARN_ON_ONCE(--bdev->bd_holders < 0); WARN_ON_ONCE(--whole->bd_holders < 0); if (!bdev->bd_holders) { mutex_lock(&bdev->bd_holder_lock); bdev->bd_holder = NULL; bdev->bd_holder_ops = NULL; mutex_unlock(&bdev->bd_holder_lock); if (bdev_test_flag(bdev, BD_WRITE_HOLDER)) unblock = true; } if (!whole->bd_holders) whole->bd_holder = NULL; mutex_unlock(&bdev_lock); /* * If this was the last claim, remove holder link and unblock evpoll if * it was a write holder. 
*/ if (unblock) { disk_unblock_events(bdev->bd_disk); bdev_clear_flag(bdev, BD_WRITE_HOLDER); } } static void blkdev_flush_mapping(struct block_device *bdev) { WARN_ON_ONCE(bdev->bd_holders); sync_blockdev(bdev); kill_bdev(bdev); bdev_write_inode(bdev); } static void blkdev_put_whole(struct block_device *bdev) { if (atomic_dec_and_test(&bdev->bd_openers)) blkdev_flush_mapping(bdev); if (bdev->bd_disk->fops->release) bdev->bd_disk->fops->release(bdev->bd_disk); } static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode) { struct gendisk *disk = bdev->bd_disk; int ret; if (disk->fops->open) { ret = disk->fops->open(disk, mode); if (ret) { /* avoid ghost partitions on a removed medium */ if (ret == -ENOMEDIUM && test_bit(GD_NEED_PART_SCAN, &disk->state)) bdev_disk_changed(disk, true); return ret; } } if (!atomic_read(&bdev->bd_openers)) set_init_blocksize(bdev); atomic_inc(&bdev->bd_openers); if (test_bit(GD_NEED_PART_SCAN, &disk->state)) { /* * Only return scanning errors if we are called from contexts * that explicitly want them, e.g. the BLKRRPART ioctl. */ ret = bdev_disk_changed(disk, false); if (ret && (mode & BLK_OPEN_STRICT_SCAN)) { blkdev_put_whole(bdev); return ret; } } return 0; } static int blkdev_get_part(struct block_device *part, blk_mode_t mode) { struct gendisk *disk = part->bd_disk; int ret; ret = blkdev_get_whole(bdev_whole(part), mode); if (ret) return ret; ret = -ENXIO; if (!bdev_nr_sectors(part)) goto out_blkdev_put; if (!atomic_read(&part->bd_openers)) { disk->open_partitions++; set_init_blocksize(part); } atomic_inc(&part->bd_openers); return 0; out_blkdev_put: blkdev_put_whole(bdev_whole(part)); return ret; } int bdev_permission(dev_t dev, blk_mode_t mode, void *holder) { int ret; ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, MAJOR(dev), MINOR(dev), ((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) | ((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0)); if (ret) return ret; /* Blocking writes requires exclusive opener */ if (mode & BLK_OPEN_RESTRICT_WRITES && !holder) return -EINVAL; /* * We're using error pointers to indicate to ->release() when we * failed to open that block device. Also this doesn't make sense. 
*/ if (WARN_ON_ONCE(IS_ERR(holder))) return -EINVAL; return 0; } static void blkdev_put_part(struct block_device *part) { struct block_device *whole = bdev_whole(part); if (atomic_dec_and_test(&part->bd_openers)) { blkdev_flush_mapping(part); whole->bd_disk->open_partitions--; } blkdev_put_whole(whole); } struct block_device *blkdev_get_no_open(dev_t dev, bool autoload) { struct block_device *bdev; struct inode *inode; inode = ilookup(blockdev_superblock, dev); if (!inode && autoload && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) { blk_request_module(dev); inode = ilookup(blockdev_superblock, dev); if (inode) pr_warn_ratelimited( "block device autoloading is deprecated and will be removed.\n"); } if (!inode) return NULL; /* switch from the inode reference to a device mode one: */ bdev = &BDEV_I(inode)->bdev; if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) bdev = NULL; iput(inode); return bdev; } void blkdev_put_no_open(struct block_device *bdev) { put_device(&bdev->bd_device); } static bool bdev_writes_blocked(struct block_device *bdev) { return bdev->bd_writers < 0; } static void bdev_block_writes(struct block_device *bdev) { bdev->bd_writers--; } static void bdev_unblock_writes(struct block_device *bdev) { bdev->bd_writers++; } static bool bdev_may_open(struct block_device *bdev, blk_mode_t mode) { if (bdev_allow_write_mounted) return true; /* Writes blocked? */ if (mode & BLK_OPEN_WRITE && bdev_writes_blocked(bdev)) return false; if (mode & BLK_OPEN_RESTRICT_WRITES && bdev->bd_writers > 0) return false; return true; } static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode) { if (bdev_allow_write_mounted) return; /* Claim exclusive or shared write access. */ if (mode & BLK_OPEN_RESTRICT_WRITES) bdev_block_writes(bdev); else if (mode & BLK_OPEN_WRITE) bdev->bd_writers++; } static inline bool bdev_unclaimed(const struct file *bdev_file) { return bdev_file->private_data == BDEV_I(bdev_file->f_mapping->host); } static void bdev_yield_write_access(struct file *bdev_file) { struct block_device *bdev; if (bdev_allow_write_mounted) return; if (bdev_unclaimed(bdev_file)) return; bdev = file_bdev(bdev_file); if (bdev_file->f_mode & FMODE_WRITE_RESTRICTED) bdev_unblock_writes(bdev); else if (bdev_file->f_mode & FMODE_WRITE) bdev->bd_writers--; } /** * bdev_open - open a block device * @bdev: block device to open * @mode: open mode (BLK_OPEN_*) * @holder: exclusive holder identifier * @hops: holder operations * @bdev_file: file for the block device * * Open the block device. If @holder is not %NULL, the block device is opened * with exclusive access. Exclusive opens may nest for the same @holder. * * CONTEXT: * Might sleep. * * RETURNS: * zero on success, -errno on failure. 
*/ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops, struct file *bdev_file) { bool unblock_events = true; struct gendisk *disk = bdev->bd_disk; int ret; if (holder) { mode |= BLK_OPEN_EXCL; ret = bd_prepare_to_claim(bdev, holder, hops); if (ret) return ret; } else { if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL)) return -EIO; } disk_block_events(disk); mutex_lock(&disk->open_mutex); ret = -ENXIO; if (!disk_live(disk)) goto abort_claiming; if (!try_module_get(disk->fops->owner)) goto abort_claiming; ret = -EBUSY; if (!bdev_may_open(bdev, mode)) goto put_module; if (bdev_is_partition(bdev)) ret = blkdev_get_part(bdev, mode); else ret = blkdev_get_whole(bdev, mode); if (ret) goto put_module; bdev_claim_write_access(bdev, mode); if (holder) { bd_finish_claiming(bdev, holder, hops); /* * Block event polling for write claims if requested. Any write * holder makes the write_holder state stick until all are * released. This is good enough and tracking individual * writeable reference is too fragile given the way @mode is * used in blkdev_get/put(). */ if ((mode & BLK_OPEN_WRITE) && !bdev_test_flag(bdev, BD_WRITE_HOLDER) && (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) { bdev_set_flag(bdev, BD_WRITE_HOLDER); unblock_events = false; } } mutex_unlock(&disk->open_mutex); if (unblock_events) disk_unblock_events(disk); bdev_file->f_flags |= O_LARGEFILE; bdev_file->f_mode |= FMODE_CAN_ODIRECT; if (bdev_nowait(bdev)) bdev_file->f_mode |= FMODE_NOWAIT; if (mode & BLK_OPEN_RESTRICT_WRITES) bdev_file->f_mode |= FMODE_WRITE_RESTRICTED; bdev_file->f_mapping = bdev->bd_mapping; bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping); bdev_file->private_data = holder; return 0; put_module: module_put(disk->fops->owner); abort_claiming: if (holder) bd_abort_claiming(bdev, holder); mutex_unlock(&disk->open_mutex); disk_unblock_events(disk); return ret; } /* * If BLK_OPEN_WRITE_IOCTL is set then this is a historical quirk * associated with the floppy driver where it has allowed ioctls if the * file was opened for writing, but does not allow reads or writes. * Make sure that this quirk is reflected in @f_flags. * * It can also happen if a block device is opened as O_RDWR | O_WRONLY. */ static unsigned blk_to_file_flags(blk_mode_t mode) { unsigned int flags = 0; if ((mode & (BLK_OPEN_READ | BLK_OPEN_WRITE)) == (BLK_OPEN_READ | BLK_OPEN_WRITE)) flags |= O_RDWR; else if (mode & BLK_OPEN_WRITE_IOCTL) flags |= O_RDWR | O_WRONLY; else if (mode & BLK_OPEN_WRITE) flags |= O_WRONLY; else if (mode & BLK_OPEN_READ) flags |= O_RDONLY; /* homeopathic, because O_RDONLY is 0 */ else WARN_ON_ONCE(true); if (mode & BLK_OPEN_NDELAY) flags |= O_NDELAY; return flags; } struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops) { struct file *bdev_file; struct block_device *bdev; unsigned int flags; int ret; ret = bdev_permission(dev, mode, holder); if (ret) return ERR_PTR(ret); bdev = blkdev_get_no_open(dev, true); if (!bdev) return ERR_PTR(-ENXIO); flags = blk_to_file_flags(mode); bdev_file = alloc_file_pseudo_noaccount(BD_INODE(bdev), blockdev_mnt, "", flags | O_LARGEFILE, &def_blk_fops); if (IS_ERR(bdev_file)) { blkdev_put_no_open(bdev); return bdev_file; } ihold(BD_INODE(bdev)); ret = bdev_open(bdev, mode, holder, hops, bdev_file); if (ret) { /* We failed to open the block device. Let ->release() know. 
*/ bdev_file->private_data = ERR_PTR(ret); fput(bdev_file); return ERR_PTR(ret); } return bdev_file; } EXPORT_SYMBOL(bdev_file_open_by_dev); struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops) { struct file *file; dev_t dev; int error; error = lookup_bdev(path, &dev); if (error) return ERR_PTR(error); file = bdev_file_open_by_dev(dev, mode, holder, hops); if (!IS_ERR(file) && (mode & BLK_OPEN_WRITE)) { if (bdev_read_only(file_bdev(file))) { fput(file); file = ERR_PTR(-EACCES); } } return file; } EXPORT_SYMBOL(bdev_file_open_by_path); static inline void bd_yield_claim(struct file *bdev_file) { struct block_device *bdev = file_bdev(bdev_file); void *holder = bdev_file->private_data; lockdep_assert_held(&bdev->bd_disk->open_mutex); if (WARN_ON_ONCE(IS_ERR_OR_NULL(holder))) return; if (!bdev_unclaimed(bdev_file)) bd_end_claim(bdev, holder); } void bdev_release(struct file *bdev_file) { struct block_device *bdev = file_bdev(bdev_file); void *holder = bdev_file->private_data; struct gendisk *disk = bdev->bd_disk; /* We failed to open that block device. */ if (IS_ERR(holder)) goto put_no_open; /* * Sync early if it looks like we're the last one. If someone else * opens the block device between now and the decrement of bd_openers * then we did a sync that we didn't need to, but that's not the end * of the world and we want to avoid long (could be several minute) * syncs while holding the mutex. */ if (atomic_read(&bdev->bd_openers) == 1) sync_blockdev(bdev); mutex_lock(&disk->open_mutex); bdev_yield_write_access(bdev_file); if (holder) bd_yield_claim(bdev_file); /* * Trigger event checking and tell drivers to flush MEDIA_CHANGE * event. This is to ensure detection of media removal commanded * from userland - e.g. eject(1). */ disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE); if (bdev_is_partition(bdev)) blkdev_put_part(bdev); else blkdev_put_whole(bdev); mutex_unlock(&disk->open_mutex); module_put(disk->fops->owner); put_no_open: blkdev_put_no_open(bdev); } /** * bdev_fput - yield claim to the block device and put the file * @bdev_file: open block device * * Yield claim on the block device and put the file. Ensure that the * block device can be reclaimed before the file is closed which is a * deferred operation. */ void bdev_fput(struct file *bdev_file) { if (WARN_ON_ONCE(bdev_file->f_op != &def_blk_fops)) return; if (bdev_file->private_data) { struct block_device *bdev = file_bdev(bdev_file); struct gendisk *disk = bdev->bd_disk; mutex_lock(&disk->open_mutex); bdev_yield_write_access(bdev_file); bd_yield_claim(bdev_file); /* * Tell release we already gave up our hold on the * device and if write restrictions are available that * we already gave up write access to the device. */ bdev_file->private_data = BDEV_I(bdev_file->f_mapping->host); mutex_unlock(&disk->open_mutex); } fput(bdev_file); } EXPORT_SYMBOL(bdev_fput); /** * lookup_bdev() - Look up a struct block_device by name. * @pathname: Name of the block device in the filesystem. * @dev: Pointer to the block device's dev_t, if found. * * Lookup the block device's dev_t at @pathname in the current * namespace if possible and return it in @dev. * * Context: May sleep. * Return: 0 if succeeded, negative errno otherwise. 
*/ int lookup_bdev(const char *pathname, dev_t *dev) { struct inode *inode; struct path path; int error; if (!pathname || !*pathname) return -EINVAL; error = kern_path(pathname, LOOKUP_FOLLOW, &path); if (error) return error; inode = d_backing_inode(path.dentry); error = -ENOTBLK; if (!S_ISBLK(inode->i_mode)) goto out_path_put; error = -EACCES; if (!may_open_dev(&path)) goto out_path_put; *dev = inode->i_rdev; error = 0; out_path_put: path_put(&path); return error; } EXPORT_SYMBOL(lookup_bdev); /** * bdev_mark_dead - mark a block device as dead * @bdev: block device to operate on * @surprise: indicate a surprise removal * * Tell the file system that this device or media is dead. If @surprise is set * to %true the device or media is already gone, if not we are preparing for an * orderly removal. * * This calls into the file system, which then typically syncs out all dirty data * and writes back inodes and then invalidates any cached data in the inodes on * the file system. In addition we also invalidate the block device mapping. */ void bdev_mark_dead(struct block_device *bdev, bool surprise) { mutex_lock(&bdev->bd_holder_lock); if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead) bdev->bd_holder_ops->mark_dead(bdev, surprise); else { mutex_unlock(&bdev->bd_holder_lock); sync_blockdev(bdev); } invalidate_bdev(bdev); } /* * New drivers should not use this directly. There are some drivers however * that need this for historical reasons. For example, the DASD driver has * historically had a shutdown to offline mode that doesn't actually remove the * gendisk that otherwise looks a lot like a safe device removal. */ EXPORT_SYMBOL_GPL(bdev_mark_dead); void sync_bdevs(bool wait) { struct inode *inode, *old_inode = NULL; spin_lock(&blockdev_superblock->s_inode_list_lock); list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { struct address_space *mapping = inode->i_mapping; struct block_device *bdev; spin_lock(&inode->i_lock); if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW) || mapping->nrpages == 0) { spin_unlock(&inode->i_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&blockdev_superblock->s_inode_list_lock); /* * We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the * s_inode_list_lock. We cannot iput the inode now as we can * be holding the last reference and we cannot iput it under * s_inode_list_lock. So we keep the reference and iput it * later. */ iput(old_inode); old_inode = inode; bdev = I_BDEV(inode); mutex_lock(&bdev->bd_disk->open_mutex); if (!atomic_read(&bdev->bd_openers)) { ; /* skip */ } else if (wait) { /* * We keep the error status of individual mapping so * that applications can catch the writeback error using * fsync(2). See filemap_fdatawait_keep_errors() for * details. */ filemap_fdatawait_keep_errors(inode->i_mapping); } else { filemap_fdatawrite(inode->i_mapping); } mutex_unlock(&bdev->bd_disk->open_mutex); spin_lock(&blockdev_superblock->s_inode_list_lock); } spin_unlock(&blockdev_superblock->s_inode_list_lock); iput(old_inode); } /* * Handle STATX_{DIOALIGN, WRITE_ATOMIC} for block devices. */ void bdev_statx(const struct path *path, struct kstat *stat, u32 request_mask) { struct block_device *bdev; /* * Note that d_backing_inode() returns the block device node inode, not * the block device's internal inode. Therefore it is *not* valid to * use I_BDEV() here; the block device has to be looked up by i_rdev * instead.
*/ bdev = blkdev_get_no_open(d_backing_inode(path->dentry)->i_rdev, false); if (!bdev) return; if (request_mask & STATX_DIOALIGN) { stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; stat->dio_offset_align = bdev_logical_block_size(bdev); stat->result_mask |= STATX_DIOALIGN; } if (request_mask & STATX_WRITE_ATOMIC && bdev_can_atomic_write(bdev)) { struct request_queue *bd_queue = bdev->bd_queue; generic_fill_statx_atomic_writes(stat, queue_atomic_write_unit_min_bytes(bd_queue), queue_atomic_write_unit_max_bytes(bd_queue), 0); } stat->blksize = bdev_io_min(bdev); blkdev_put_no_open(bdev); } bool disk_live(struct gendisk *disk) { return !inode_unhashed(BD_INODE(disk->part0)); } EXPORT_SYMBOL_GPL(disk_live); unsigned int block_size(struct block_device *bdev) { return 1 << BD_INODE(bdev)->i_blkbits; } EXPORT_SYMBOL_GPL(block_size); static int __init setup_bdev_allow_write_mounted(char *str) { if (kstrtobool(str, &bdev_allow_write_mounted)) pr_warn("Invalid option string for bdev_allow_write_mounted:" " '%s'\n", str); return 1; } __setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted);
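/*
 * Editor's sketch (not part of the original file): a minimal example of how a
 * consumer might use the file-based open API exported above. The function
 * name example_open_bdev and its holder argument are illustrative only; real
 * callers such as filesystems pass a long-lived holder (e.g. their
 * superblock), usually together with blk_holder_ops. Assumes the usual
 * headers (linux/blkdev.h, linux/fs.h, linux/err.h, linux/printk.h).
 */
static int example_open_bdev(const char *path, void *holder)
{
	struct file *bdev_file;
	struct block_device *bdev;

	/* Exclusive (holder != NULL) read/write open by path. */
	bdev_file = bdev_file_open_by_path(path, BLK_OPEN_READ | BLK_OPEN_WRITE,
					   holder, NULL);
	if (IS_ERR(bdev_file))
		return PTR_ERR(bdev_file);

	bdev = file_bdev(bdev_file);
	pr_info("%s: logical block size %u\n", path,
		bdev_logical_block_size(bdev));

	/* Yield the exclusive claim and drop the file reference. */
	bdev_fput(bdev_file);
	return 0;
}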
// SPDX-License-Identifier: GPL-2.0-only /* * Packet matching code. * * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org> * Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/capability.h> #include <linux/in.h> #include <linux/skbuff.h> #include <linux/kmod.h> #include <linux/vmalloc.h> #include <linux/netdevice.h> #include <linux/module.h> #include <linux/poison.h> #include <net/ipv6.h> #include <net/compat.h> #include <linux/uaccess.h> #include <linux/mutex.h> #include <linux/proc_fs.h> #include <linux/err.h> #include <linux/cpumask.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_log.h> #include "../../netfilter/xt_repldata.h" MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("IPv6 packet filter"); void *ip6t_alloc_initial_table(const struct xt_table *info) { return xt_alloc_initial_table(ip6t, IP6T); } EXPORT_SYMBOL_GPL(ip6t_alloc_initial_table); /* Returns whether matches rule or not. */ /* Performance critical - called for every packet */ static inline bool ip6_packet_match(const struct sk_buff *skb, const char *indev, const char *outdev, const struct ip6t_ip6 *ip6info, unsigned int *protoff, u16 *fragoff, bool *hotdrop) { unsigned long ret; const struct ipv6hdr *ipv6 = ipv6_hdr(skb); if (NF_INVF(ip6info, IP6T_INV_SRCIP, ipv6_masked_addr_cmp(&ipv6->saddr, &ip6info->smsk, &ip6info->src)) || NF_INVF(ip6info, IP6T_INV_DSTIP, ipv6_masked_addr_cmp(&ipv6->daddr, &ip6info->dmsk, &ip6info->dst))) return false; ret = ifname_compare_aligned(indev, ip6info->iniface, ip6info->iniface_mask); if (NF_INVF(ip6info, IP6T_INV_VIA_IN, ret != 0)) return false; ret = ifname_compare_aligned(outdev, ip6info->outiface, ip6info->outiface_mask); if (NF_INVF(ip6info, IP6T_INV_VIA_OUT, ret != 0)) return false; /* ... might want to do something with class and flowlabel here ...
*/ /* look for the desired protocol header */ if (ip6info->flags & IP6T_F_PROTO) { int protohdr; unsigned short _frag_off; protohdr = ipv6_find_hdr(skb, protoff, -1, &_frag_off, NULL); if (protohdr < 0) { if (_frag_off == 0) *hotdrop = true; return false; } *fragoff = _frag_off; if (ip6info->proto == protohdr) { if (ip6info->invflags & IP6T_INV_PROTO) return false; return true; } /* We need match for the '-p all', too! */ if ((ip6info->proto != 0) && !(ip6info->invflags & IP6T_INV_PROTO)) return false; } return true; } /* should be ip6 safe */ static bool ip6_checkentry(const struct ip6t_ip6 *ipv6) { if (ipv6->flags & ~IP6T_F_MASK) return false; if (ipv6->invflags & ~IP6T_INV_MASK) return false; return true; } static unsigned int ip6t_error(struct sk_buff *skb, const struct xt_action_param *par) { net_info_ratelimited("error: `%s'\n", (const char *)par->targinfo); return NF_DROP; } static inline struct ip6t_entry * get_entry(const void *base, unsigned int offset) { return (struct ip6t_entry *)(base + offset); } /* All zeroes == unconditional rule. */ /* Mildly perf critical (only if packet tracing is on) */ static inline bool unconditional(const struct ip6t_entry *e) { static const struct ip6t_ip6 uncond; return e->target_offset == sizeof(struct ip6t_entry) && memcmp(&e->ipv6, &uncond, sizeof(uncond)) == 0; } static inline const struct xt_entry_target * ip6t_get_target_c(const struct ip6t_entry *e) { return ip6t_get_target((struct ip6t_entry *)e); } #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) /* This cries for unification! */ static const char *const hooknames[] = { [NF_INET_PRE_ROUTING] = "PREROUTING", [NF_INET_LOCAL_IN] = "INPUT", [NF_INET_FORWARD] = "FORWARD", [NF_INET_LOCAL_OUT] = "OUTPUT", [NF_INET_POST_ROUTING] = "POSTROUTING", }; enum nf_ip_trace_comments { NF_IP6_TRACE_COMMENT_RULE, NF_IP6_TRACE_COMMENT_RETURN, NF_IP6_TRACE_COMMENT_POLICY, }; static const char *const comments[] = { [NF_IP6_TRACE_COMMENT_RULE] = "rule", [NF_IP6_TRACE_COMMENT_RETURN] = "return", [NF_IP6_TRACE_COMMENT_POLICY] = "policy", }; static const struct nf_loginfo trace_loginfo = { .type = NF_LOG_TYPE_LOG, .u = { .log = { .level = LOGLEVEL_WARNING, .logflags = NF_LOG_DEFAULT_MASK, }, }, }; /* Mildly perf critical (only if packet tracing is on) */ static inline int get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e, const char *hookname, const char **chainname, const char **comment, unsigned int *rulenum) { const struct xt_standard_target *t = (void *)ip6t_get_target_c(s); if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) { /* Head of user chain: ERROR target with chainname */ *chainname = t->target.data; (*rulenum) = 0; } else if (s == e) { (*rulenum)++; if (unconditional(s) && strcmp(t->target.u.kernel.target->name, XT_STANDARD_TARGET) == 0 && t->verdict < 0) { /* Tail of chains: STANDARD target (return/policy) */ *comment = *chainname == hookname ? 
comments[NF_IP6_TRACE_COMMENT_POLICY] : comments[NF_IP6_TRACE_COMMENT_RETURN]; } return 1; } else (*rulenum)++; return 0; } static void trace_packet(struct net *net, const struct sk_buff *skb, unsigned int hook, const struct net_device *in, const struct net_device *out, const char *tablename, const struct xt_table_info *private, const struct ip6t_entry *e) { const struct ip6t_entry *root; const char *hookname, *chainname, *comment; const struct ip6t_entry *iter; unsigned int rulenum = 0; root = get_entry(private->entries, private->hook_entry[hook]); hookname = chainname = hooknames[hook]; comment = comments[NF_IP6_TRACE_COMMENT_RULE]; xt_entry_foreach(iter, root, private->size - private->hook_entry[hook]) if (get_chainname_rulenum(iter, e, hookname, &chainname, &comment, &rulenum) != 0) break; nf_log_trace(net, AF_INET6, hook, skb, in, out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", tablename, chainname, comment, rulenum); } #endif static inline struct ip6t_entry * ip6t_next_entry(const struct ip6t_entry *entry) { return (void *)entry + entry->next_offset; } /* Returns one of the generic firewall policies, like NF_ACCEPT. */ unsigned int ip6t_do_table(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { const struct xt_table *table = priv; unsigned int hook = state->hook; static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); /* Initializing verdict to NF_DROP keeps gcc happy. */ unsigned int verdict = NF_DROP; const char *indev, *outdev; const void *table_base; struct ip6t_entry *e, **jumpstack; unsigned int stackidx, cpu; const struct xt_table_info *private; struct xt_action_param acpar; unsigned int addend; /* Initialization */ stackidx = 0; indev = state->in ? state->in->name : nulldevname; outdev = state->out ? state->out->name : nulldevname; /* We handle fragments by dealing with the first fragment as * if it was a normal packet. All other fragments are treated * normally, except that they will NEVER match rules that ask * things we don't know, ie. tcp syn flag or ports). If the * rule is also a fragment-specific rule, non-fragments won't * match it. */ acpar.fragoff = 0; acpar.hotdrop = false; acpar.state = state; WARN_ON(!(table->valid_hooks & (1 << hook))); local_bh_disable(); addend = xt_write_recseq_begin(); private = READ_ONCE(table->private); /* Address dependency. */ cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct ip6t_entry **)private->jumpstack[cpu]; /* Switch to alternate jumpstack if we're being invoked via TEE. * TEE issues XT_CONTINUE verdict on original skb so we must not * clobber the jumpstack. * * For recursion via REJECT or SYNPROXY the stack will be clobbered * but it is no problem since absolute verdict is issued by these. 
*/ if (static_key_false(&xt_tee_enabled)) jumpstack += private->stacksize * current->in_nf_duplicate; e = get_entry(table_base, private->hook_entry[hook]); do { const struct xt_entry_target *t; const struct xt_entry_match *ematch; struct xt_counters *counter; WARN_ON(!e); acpar.thoff = 0; if (!ip6_packet_match(skb, indev, outdev, &e->ipv6, &acpar.thoff, &acpar.fragoff, &acpar.hotdrop)) { no_match: e = ip6t_next_entry(e); continue; } xt_ematch_foreach(ematch, e) { acpar.match = ematch->u.kernel.match; acpar.matchinfo = ematch->data; if (!acpar.match->match(skb, &acpar)) goto no_match; } counter = xt_get_this_cpu_counter(&e->counters); ADD_COUNTER(*counter, skb->len, 1); t = ip6t_get_target_c(e); WARN_ON(!t->u.kernel.target); #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) /* The packet is traced: log it */ if (unlikely(skb->nf_trace)) trace_packet(state->net, skb, hook, state->in, state->out, table->name, private, e); #endif /* Standard target? */ if (!t->u.kernel.target->target) { int v; v = ((struct xt_standard_target *)t)->verdict; if (v < 0) { /* Pop from stack? */ if (v != XT_RETURN) { verdict = (unsigned int)(-v) - 1; break; } if (stackidx == 0) e = get_entry(table_base, private->underflow[hook]); else e = ip6t_next_entry(jumpstack[--stackidx]); continue; } if (table_base + v != ip6t_next_entry(e) && !(e->ipv6.flags & IP6T_F_GOTO)) { if (unlikely(stackidx >= private->stacksize)) { verdict = NF_DROP; break; } jumpstack[stackidx++] = e; } e = get_entry(table_base, v); continue; } acpar.target = t->u.kernel.target; acpar.targinfo = t->data; verdict = t->u.kernel.target->target(skb, &acpar); if (verdict == XT_CONTINUE) e = ip6t_next_entry(e); else /* Verdict */ break; } while (!acpar.hotdrop); xt_write_recseq_end(addend); local_bh_enable(); if (acpar.hotdrop) return NF_DROP; else return verdict; } /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. */ static int mark_source_chains(const struct xt_table_info *newinfo, unsigned int valid_hooks, void *entry0, unsigned int *offsets) { unsigned int hook; /* No recursion; use packet counter to save back ptrs (reset to 0 as we leave), and comefrom to save source hook bitmask */ for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct ip6t_entry *e = entry0 + pos; if (!(valid_hooks & (1 << hook))) continue; /* Set initial back pointer. */ e->counters.pcnt = pos; for (;;) { const struct xt_standard_target *t = (void *)ip6t_get_target_c(e); int visited = e->comefrom & (1 << hook); if (e->comefrom & (1 << NF_INET_NUMHOOKS)) return 0; e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. */ if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && t->verdict < 0) || visited) { unsigned int oldpos, size; /* Return: backtrack through the last big jump. */ do { e->comefrom ^= (1<<NF_INET_NUMHOOKS); oldpos = pos; pos = e->counters.pcnt; e->counters.pcnt = 0; /* We're at the start. */ if (pos == oldpos) goto next; e = entry0 + pos; } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; e = entry0 + pos + size; if (pos + size >= newinfo->size) return 0; e->counters.pcnt = pos; pos += size; } else { int newpos = t->verdict; if (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0 && newpos >= 0) { /* This a jump; chase it. */ if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; } else { /* ... 
this is a fallthru */ newpos = pos + e->next_offset; if (newpos >= newinfo->size) return 0; } e = entry0 + newpos; e->counters.pcnt = pos; pos = newpos; } } next: ; } return 1; } static void cleanup_match(struct xt_entry_match *m, struct net *net) { struct xt_mtdtor_param par; par.net = net; par.match = m->u.kernel.match; par.matchinfo = m->data; par.family = NFPROTO_IPV6; if (par.match->destroy != NULL) par.match->destroy(&par); module_put(par.match->me); } static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { const struct ip6t_ip6 *ipv6 = par->entryinfo; par->match = m->u.kernel.match; par->matchinfo = m->data; return xt_check_match(par, m->u.match_size - sizeof(*m), ipv6->proto, ipv6->invflags & IP6T_INV_PROTO); } static int find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { struct xt_match *match; int ret; match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name, m->u.user.revision); if (IS_ERR(match)) return PTR_ERR(match); m->u.kernel.match = match; ret = check_match(m, par); if (ret) goto err; return 0; err: module_put(m->u.kernel.match->me); return ret; } static int check_target(struct ip6t_entry *e, struct net *net, const char *name) { struct xt_entry_target *t = ip6t_get_target(e); struct xt_tgchk_param par = { .net = net, .table = name, .entryinfo = e, .target = t->u.kernel.target, .targinfo = t->data, .hook_mask = e->comefrom, .family = NFPROTO_IPV6, }; return xt_check_target(&par, t->u.target_size - sizeof(*t), e->ipv6.proto, e->ipv6.invflags & IP6T_INV_PROTO); } static int find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, unsigned int size, struct xt_percpu_counter_alloc_state *alloc_state) { struct xt_entry_target *t; struct xt_target *target; int ret; unsigned int j; struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; if (!xt_percpu_counter_alloc(alloc_state, &e->counters)) return -ENOMEM; j = 0; memset(&mtpar, 0, sizeof(mtpar)); mtpar.net = net; mtpar.table = name; mtpar.entryinfo = &e->ipv6; mtpar.hook_mask = e->comefrom; mtpar.family = NFPROTO_IPV6; xt_ematch_foreach(ematch, e) { ret = find_check_match(ematch, &mtpar); if (ret != 0) goto cleanup_matches; ++j; } t = ip6t_get_target(e); target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { ret = PTR_ERR(target); goto cleanup_matches; } t->u.kernel.target = target; ret = check_target(e, net, name); if (ret) goto err; return 0; err: module_put(t->u.kernel.target->me); cleanup_matches: xt_ematch_foreach(ematch, e) { if (j-- == 0) break; cleanup_match(ematch, net); } xt_percpu_counter_free(&e->counters); return ret; } static bool check_underflow(const struct ip6t_entry *e) { const struct xt_entry_target *t; unsigned int verdict; if (!unconditional(e)) return false; t = ip6t_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) return false; verdict = ((struct xt_standard_target *)t)->verdict; verdict = -verdict - 1; return verdict == NF_DROP || verdict == NF_ACCEPT; } static int check_entry_size_and_hooks(struct ip6t_entry *e, struct xt_table_info *newinfo, const unsigned char *base, const unsigned char *limit, const unsigned int *hook_entries, const unsigned int *underflows, unsigned int valid_hooks) { unsigned int h; int err; if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 || (unsigned char *)e + sizeof(struct ip6t_entry) >= limit || (unsigned char *)e + e->next_offset > limit) return -EINVAL; if (e->next_offset < sizeof(struct ip6t_entry) + sizeof(struct 
xt_entry_target)) return -EINVAL; if (!ip6_checkentry(&e->ipv6)) return -EINVAL; err = xt_check_entry_offsets(e, e->elems, e->target_offset, e->next_offset); if (err) return err; /* Check hooks & underflows */ for (h = 0; h < NF_INET_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) continue; if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) return -EINVAL; newinfo->underflow[h] = underflows[h]; } } /* Clear counters and comefrom */ e->counters = ((struct xt_counters) { 0, 0 }); e->comefrom = 0; return 0; } static void cleanup_entry(struct ip6t_entry *e, struct net *net) { struct xt_tgdtor_param par; struct xt_entry_target *t; struct xt_entry_match *ematch; /* Cleanup all matches */ xt_ematch_foreach(ematch, e) cleanup_match(ematch, net); t = ip6t_get_target(e); par.net = net; par.target = t->u.kernel.target; par.targinfo = t->data; par.family = NFPROTO_IPV6; if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); xt_percpu_counter_free(&e->counters); } /* Checks and translates the user-supplied table segment (held in newinfo) */ static int translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, const struct ip6t_replace *repl) { struct xt_percpu_counter_alloc_state alloc_state = { 0 }; struct ip6t_entry *iter; unsigned int *offsets; unsigned int i; int ret = 0; newinfo->size = repl->size; newinfo->number = repl->num_entries; /* Init all hooks to impossible value. */ for (i = 0; i < NF_INET_NUMHOOKS; i++) { newinfo->hook_entry[i] = 0xFFFFFFFF; newinfo->underflow[i] = 0xFFFFFFFF; } offsets = xt_alloc_entry_offsets(newinfo->number); if (!offsets) return -ENOMEM; i = 0; /* Walk through entries, checking offsets. 
*/ xt_entry_foreach(iter, entry0, newinfo->size) { ret = check_entry_size_and_hooks(iter, newinfo, entry0, entry0 + repl->size, repl->hook_entry, repl->underflow, repl->valid_hooks); if (ret != 0) goto out_free; if (i < repl->num_entries) offsets[i] = (void *)iter - entry0; ++i; if (strcmp(ip6t_get_target(iter)->u.user.name, XT_ERROR_TARGET) == 0) ++newinfo->stacksize; } ret = -EINVAL; if (i != repl->num_entries) goto out_free; ret = xt_check_table_hooks(newinfo, repl->valid_hooks); if (ret) goto out_free; if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { ret = -ELOOP; goto out_free; } kvfree(offsets); /* Finally, each sanity check must pass */ i = 0; xt_entry_foreach(iter, entry0, newinfo->size) { ret = find_check_entry(iter, net, repl->name, repl->size, &alloc_state); if (ret != 0) break; ++i; } if (ret != 0) { xt_entry_foreach(iter, entry0, newinfo->size) { if (i-- == 0) break; cleanup_entry(iter, net); } return ret; } return ret; out_free: kvfree(offsets); return ret; } static void get_counters(const struct xt_table_info *t, struct xt_counters counters[]) { struct ip6t_entry *iter; unsigned int cpu; unsigned int i; for_each_possible_cpu(cpu) { seqcount_t *s = &per_cpu(xt_recseq, cpu); i = 0; xt_entry_foreach(iter, t->entries, t->size) { struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; tmp = xt_get_per_cpu_counter(&iter->counters, cpu); do { start = read_seqcount_begin(s); bcnt = tmp->bcnt; pcnt = tmp->pcnt; } while (read_seqcount_retry(s, start)); ADD_COUNTER(counters[i], bcnt, pcnt); ++i; cond_resched(); } } } static void get_old_counters(const struct xt_table_info *t, struct xt_counters counters[]) { struct ip6t_entry *iter; unsigned int cpu, i; for_each_possible_cpu(cpu) { i = 0; xt_entry_foreach(iter, t->entries, t->size) { const struct xt_counters *tmp; tmp = xt_get_per_cpu_counter(&iter->counters, cpu); ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt); ++i; } cond_resched(); } } static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; const struct xt_table_info *private = table->private; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care about). */ countersize = sizeof(struct xt_counters) * private->number; counters = vzalloc(countersize); if (counters == NULL) return ERR_PTR(-ENOMEM); get_counters(private, counters); return counters; } static int copy_entries_to_user(unsigned int total_size, const struct xt_table *table, void __user *userptr) { unsigned int off, num; const struct ip6t_entry *e; struct xt_counters *counters; const struct xt_table_info *private = table->private; int ret = 0; const void *loc_cpu_entry; counters = alloc_counters(table); if (IS_ERR(counters)) return PTR_ERR(counters); loc_cpu_entry = private->entries; /* FIXME: use iterator macros --RR */ /* ... 
then go back and fix counters and names */ for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ unsigned int i; const struct xt_entry_match *m; const struct xt_entry_target *t; e = loc_cpu_entry + off; if (copy_to_user(userptr + off, e, sizeof(*e))) { ret = -EFAULT; goto free_counters; } if (copy_to_user(userptr + off + offsetof(struct ip6t_entry, counters), &counters[num], sizeof(counters[num])) != 0) { ret = -EFAULT; goto free_counters; } for (i = sizeof(struct ip6t_entry); i < e->target_offset; i += m->u.match_size) { m = (void *)e + i; if (xt_match_to_user(m, userptr + off + i)) { ret = -EFAULT; goto free_counters; } } t = ip6t_get_target_c(e); if (xt_target_to_user(t, userptr + off + e->target_offset)) { ret = -EFAULT; goto free_counters; } } free_counters: vfree(counters); return ret; } #ifdef CONFIG_NETFILTER_XTABLES_COMPAT static void compat_standard_from_user(void *dst, const void *src) { int v = *(compat_int_t *)src; if (v > 0) v += xt_compat_calc_jump(AF_INET6, v); memcpy(dst, &v, sizeof(v)); } static int compat_standard_to_user(void __user *dst, const void *src) { compat_int_t cv = *(int *)src; if (cv > 0) cv -= xt_compat_calc_jump(AF_INET6, cv); return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; } static int compat_calc_entry(const struct ip6t_entry *e, const struct xt_table_info *info, const void *base, struct xt_table_info *newinfo) { const struct xt_entry_match *ematch; const struct xt_entry_target *t; unsigned int entry_offset; int off, i, ret; off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); entry_offset = (void *)e - base; xt_ematch_foreach(ematch, e) off += xt_compat_match_offset(ematch->u.kernel.match); t = ip6t_get_target_c(e); off += xt_compat_target_offset(t->u.kernel.target); newinfo->size -= off; ret = xt_compat_add_offset(AF_INET6, entry_offset, off); if (ret) return ret; for (i = 0; i < NF_INET_NUMHOOKS; i++) { if (info->hook_entry[i] && (e < (struct ip6t_entry *)(base + info->hook_entry[i]))) newinfo->hook_entry[i] -= off; if (info->underflow[i] && (e < (struct ip6t_entry *)(base + info->underflow[i]))) newinfo->underflow[i] -= off; } return 0; } static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { struct ip6t_entry *iter; const void *loc_cpu_entry; int ret; if (!newinfo || !info) return -EINVAL; /* we dont care about newinfo->entries */ memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries; ret = xt_compat_init_offsets(AF_INET6, info->number); if (ret) return ret; xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) return ret; } return 0; } #endif static int get_info(struct net *net, void __user *user, const int *len) { char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; int ret; if (*len != sizeof(struct ip6t_getinfo)) return -EINVAL; if (copy_from_user(name, user, sizeof(name)) != 0) return -EFAULT; name[XT_TABLE_MAXNAMELEN-1] = '\0'; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_lock(AF_INET6); #endif t = xt_request_find_table_lock(net, AF_INET6, name); if (!IS_ERR(t)) { struct ip6t_getinfo info; const struct xt_table_info *private = t->private; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct xt_table_info tmp; if (in_compat_syscall()) { ret = compat_table_info(private, &tmp); xt_compat_flush_offsets(AF_INET6); private = &tmp; } #endif memset(&info, 0, sizeof(info)); info.valid_hooks = t->valid_hooks; 
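/* Fill the remaining ip6t_getinfo fields from the (possibly compat-adjusted) table info, then copy the whole structure back to userspace. */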
memcpy(info.hook_entry, private->hook_entry, sizeof(info.hook_entry)); memcpy(info.underflow, private->underflow, sizeof(info.underflow)); info.num_entries = private->number; info.size = private->size; strcpy(info.name, name); if (copy_to_user(user, &info, *len) != 0) ret = -EFAULT; else ret = 0; xt_table_unlock(t); module_put(t->me); } else ret = PTR_ERR(t); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_unlock(AF_INET6); #endif return ret; } static int get_entries(struct net *net, struct ip6t_get_entries __user *uptr, const int *len) { int ret; struct ip6t_get_entries get; struct xt_table *t; if (*len < sizeof(get)) return -EINVAL; if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; if (*len != sizeof(struct ip6t_get_entries) + get.size) return -EINVAL; get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, AF_INET6, get.name); if (!IS_ERR(t)) { struct xt_table_info *private = t->private; if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); else ret = -EAGAIN; module_put(t->me); xt_table_unlock(t); } else ret = PTR_ERR(t); return ret; } static int __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct xt_table_info *newinfo, unsigned int num_counters, void __user *counters_ptr) { int ret; struct xt_table *t; struct xt_table_info *oldinfo; struct xt_counters *counters; struct ip6t_entry *iter; counters = xt_counters_alloc(num_counters); if (!counters) { ret = -ENOMEM; goto out; } t = xt_request_find_table_lock(net, AF_INET6, name); if (IS_ERR(t)) { ret = PTR_ERR(t); goto free_newinfo_counters_untrans; } /* You lied! */ if (valid_hooks != t->valid_hooks) { ret = -EINVAL; goto put_module; } oldinfo = xt_replace_table(t, num_counters, newinfo, &ret); if (!oldinfo) goto put_module; /* Update module usage count based on number of rules */ if ((oldinfo->number > oldinfo->initial_entries) || (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); if ((oldinfo->number > oldinfo->initial_entries) && (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); xt_table_unlock(t); get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ xt_entry_foreach(iter, oldinfo->entries, oldinfo->size) cleanup_entry(iter, net); xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, sizeof(struct xt_counters) * num_counters) != 0) { /* Silent error, can't fail, new table is already in place */ net_warn_ratelimited("ip6tables: counters copy to user failed while replacing table\n"); } vfree(counters); return 0; put_module: module_put(t->me); xt_table_unlock(t); free_newinfo_counters_untrans: vfree(counters); out: return ret; } static int do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct ip6t_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; struct ip6t_entry *iter; if (len < sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; if ((u64)len < (u64)tmp.size + sizeof(tmp)) return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; loc_cpu_entry = newinfo->entries; if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } ret = translate_table(net, newinfo, loc_cpu_entry, &tmp); if (ret != 0) 
goto free_newinfo; ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, tmp.counters); if (ret) goto free_newinfo_untrans; return 0; free_newinfo_untrans: xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; } static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len) { unsigned int i; struct xt_counters_info tmp; struct xt_counters *paddc; struct xt_table *t; const struct xt_table_info *private; int ret = 0; struct ip6t_entry *iter; unsigned int addend; paddc = xt_copy_counters(arg, len, &tmp); if (IS_ERR(paddc)) return PTR_ERR(paddc); t = xt_find_table_lock(net, AF_INET6, tmp.name); if (IS_ERR(t)) { ret = PTR_ERR(t); goto free; } local_bh_disable(); private = t->private; if (private->number != tmp.num_counters) { ret = -EINVAL; goto unlock_up_free; } i = 0; addend = xt_write_recseq_begin(); xt_entry_foreach(iter, private->entries, private->size) { struct xt_counters *tmp; tmp = xt_get_this_cpu_counter(&iter->counters); ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt); ++i; } xt_write_recseq_end(addend); unlock_up_free: local_bh_enable(); xt_table_unlock(t); module_put(t->me); free: vfree(paddc); return ret; } #ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_ip6t_replace { char name[XT_TABLE_MAXNAMELEN]; u32 valid_hooks; u32 num_entries; u32 size; u32 hook_entry[NF_INET_NUMHOOKS]; u32 underflow[NF_INET_NUMHOOKS]; u32 num_counters; compat_uptr_t counters; /* struct xt_counters * */ struct compat_ip6t_entry entries[]; }; static int compat_copy_entry_to_user(struct ip6t_entry *e, void __user **dstptr, unsigned int *size, struct xt_counters *counters, unsigned int i) { struct xt_entry_target *t; struct compat_ip6t_entry __user *ce; u_int16_t target_offset, next_offset; compat_uint_t origsize; const struct xt_entry_match *ematch; int ret = 0; origsize = *size; ce = *dstptr; if (copy_to_user(ce, e, sizeof(struct ip6t_entry)) != 0 || copy_to_user(&ce->counters, &counters[i], sizeof(counters[i])) != 0) return -EFAULT; *dstptr += sizeof(struct compat_ip6t_entry); *size -= sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); xt_ematch_foreach(ematch, e) { ret = xt_compat_match_to_user(ematch, dstptr, size); if (ret != 0) return ret; } target_offset = e->target_offset - (origsize - *size); t = ip6t_get_target(e); ret = xt_compat_target_to_user(t, dstptr, size); if (ret) return ret; next_offset = e->next_offset - (origsize - *size); if (put_user(target_offset, &ce->target_offset) != 0 || put_user(next_offset, &ce->next_offset) != 0) return -EFAULT; return 0; } static int compat_find_calc_match(struct xt_entry_match *m, const struct ip6t_ip6 *ipv6, int *size) { struct xt_match *match; match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name, m->u.user.revision); if (IS_ERR(match)) return PTR_ERR(match); m->u.kernel.match = match; *size += xt_compat_match_offset(match); return 0; } static void compat_release_entry(struct compat_ip6t_entry *e) { struct xt_entry_target *t; struct xt_entry_match *ematch; /* Cleanup all matches */ xt_ematch_foreach(ematch, e) module_put(ematch->u.kernel.match->me); t = compat_ip6t_get_target(e); module_put(t->u.kernel.target->me); } static int check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, struct xt_table_info *newinfo, unsigned int *size, const unsigned char *base, const unsigned char *limit) { struct xt_entry_match *ematch; struct xt_entry_target *t; struct xt_target *target; unsigned int entry_offset; unsigned int j; 
int ret, off; if ((unsigned long)e % __alignof__(struct compat_ip6t_entry) != 0 || (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit || (unsigned char *)e + e->next_offset > limit) return -EINVAL; if (e->next_offset < sizeof(struct compat_ip6t_entry) + sizeof(struct compat_xt_entry_target)) return -EINVAL; if (!ip6_checkentry(&e->ipv6)) return -EINVAL; ret = xt_compat_check_entry_offsets(e, e->elems, e->target_offset, e->next_offset); if (ret) return ret; off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); entry_offset = (void *)e - (void *)base; j = 0; xt_ematch_foreach(ematch, e) { ret = compat_find_calc_match(ematch, &e->ipv6, &off); if (ret != 0) goto release_matches; ++j; } t = compat_ip6t_get_target(e); target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { ret = PTR_ERR(target); goto release_matches; } t->u.kernel.target = target; off += xt_compat_target_offset(target); *size += off; ret = xt_compat_add_offset(AF_INET6, entry_offset, off); if (ret) goto out; return 0; out: module_put(t->u.kernel.target->me); release_matches: xt_ematch_foreach(ematch, e) { if (j-- == 0) break; module_put(ematch->u.kernel.match->me); } return ret; } static void compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr, unsigned int *size, struct xt_table_info *newinfo, unsigned char *base) { struct xt_entry_target *t; struct ip6t_entry *de; unsigned int origsize; int h; struct xt_entry_match *ematch; origsize = *size; de = *dstptr; memcpy(de, e, sizeof(struct ip6t_entry)); memcpy(&de->counters, &e->counters, sizeof(e->counters)); *dstptr += sizeof(struct ip6t_entry); *size += sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); xt_ematch_foreach(ematch, e) xt_compat_match_from_user(ematch, dstptr, size); de->target_offset = e->target_offset - (origsize - *size); t = compat_ip6t_get_target(e); xt_compat_target_from_user(t, dstptr, size); de->next_offset = e->next_offset - (origsize - *size); for (h = 0; h < NF_INET_NUMHOOKS; h++) { if ((unsigned char *)de - base < newinfo->hook_entry[h]) newinfo->hook_entry[h] -= origsize - *size; if ((unsigned char *)de - base < newinfo->underflow[h]) newinfo->underflow[h] -= origsize - *size; } } static int translate_compat_table(struct net *net, struct xt_table_info **pinfo, void **pentry0, const struct compat_ip6t_replace *compatr) { unsigned int i, j; struct xt_table_info *newinfo, *info; void *pos, *entry0, *entry1; struct compat_ip6t_entry *iter0; struct ip6t_replace repl; unsigned int size; int ret; info = *pinfo; entry0 = *pentry0; size = compatr->size; info->number = compatr->num_entries; j = 0; xt_compat_lock(AF_INET6); ret = xt_compat_init_offsets(AF_INET6, compatr->num_entries); if (ret) goto out_unlock; /* Walk through entries, checking offsets. 
*/ xt_entry_foreach(iter0, entry0, compatr->size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, entry0, entry0 + compatr->size); if (ret != 0) goto out_unlock; ++j; } ret = -EINVAL; if (j != compatr->num_entries) goto out_unlock; ret = -ENOMEM; newinfo = xt_alloc_table_info(size); if (!newinfo) goto out_unlock; memset(newinfo->entries, 0, size); newinfo->number = compatr->num_entries; for (i = 0; i < NF_INET_NUMHOOKS; i++) { newinfo->hook_entry[i] = compatr->hook_entry[i]; newinfo->underflow[i] = compatr->underflow[i]; } entry1 = newinfo->entries; pos = entry1; size = compatr->size; xt_entry_foreach(iter0, entry0, compatr->size) compat_copy_entry_from_user(iter0, &pos, &size, newinfo, entry1); /* all module references in entry0 are now gone. */ xt_compat_flush_offsets(AF_INET6); xt_compat_unlock(AF_INET6); memcpy(&repl, compatr, sizeof(*compatr)); for (i = 0; i < NF_INET_NUMHOOKS; i++) { repl.hook_entry[i] = newinfo->hook_entry[i]; repl.underflow[i] = newinfo->underflow[i]; } repl.num_counters = 0; repl.counters = NULL; repl.size = newinfo->size; ret = translate_table(net, newinfo, entry1, &repl); if (ret) goto free_newinfo; *pinfo = newinfo; *pentry0 = entry1; xt_free_table_info(info); return 0; free_newinfo: xt_free_table_info(newinfo); return ret; out_unlock: xt_compat_flush_offsets(AF_INET6); xt_compat_unlock(AF_INET6); xt_entry_foreach(iter0, entry0, compatr->size) { if (j-- == 0) break; compat_release_entry(iter0); } return ret; } static int compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct compat_ip6t_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; struct ip6t_entry *iter; if (len < sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; if ((u64)len < (u64)tmp.size + sizeof(tmp)) return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; loc_cpu_entry = newinfo->entries; if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } ret = translate_compat_table(net, &newinfo, &loc_cpu_entry, &tmp); if (ret != 0) goto free_newinfo; ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, compat_ptr(tmp.counters)); if (ret) goto free_newinfo_untrans; return 0; free_newinfo_untrans: xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; } struct compat_ip6t_get_entries { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; struct compat_ip6t_entry entrytable[]; }; static int compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *userptr) { struct xt_counters *counters; const struct xt_table_info *private = table->private; void __user *pos; unsigned int size; int ret = 0; unsigned int i = 0; struct ip6t_entry *iter; counters = alloc_counters(table); if (IS_ERR(counters)) return PTR_ERR(counters); pos = userptr; size = total_size; xt_entry_foreach(iter, private->entries, total_size) { ret = compat_copy_entry_to_user(iter, &pos, &size, counters, i++); if (ret != 0) break; } vfree(counters); return ret; } static int compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, int *len) { int ret; struct compat_ip6t_get_entries get; struct xt_table *t; if (*len < sizeof(get)) 
return -EINVAL; if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; if (*len != sizeof(struct compat_ip6t_get_entries) + get.size) return -EINVAL; get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(AF_INET6); t = xt_find_table_lock(net, AF_INET6, get.name); if (!IS_ERR(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; ret = compat_table_info(private, &info); if (!ret && get.size == info.size) ret = compat_copy_entries_to_user(private->size, t, uptr->entrytable); else if (!ret) ret = -EAGAIN; xt_compat_flush_offsets(AF_INET6); module_put(t->me); xt_table_unlock(t); } else ret = PTR_ERR(t); xt_compat_unlock(AF_INET6); return ret; } #endif static int do_ip6t_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len) { int ret; if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { case IP6T_SO_SET_REPLACE: #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_do_replace(sock_net(sk), arg, len); else #endif ret = do_replace(sock_net(sk), arg, len); break; case IP6T_SO_SET_ADD_COUNTERS: ret = do_add_counters(sock_net(sk), arg, len); break; default: ret = -EINVAL; } return ret; } static int do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { int ret; if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { case IP6T_SO_GET_INFO: ret = get_info(sock_net(sk), user, len); break; case IP6T_SO_GET_ENTRIES: #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_get_entries(sock_net(sk), user, len); else #endif ret = get_entries(sock_net(sk), user, len); break; case IP6T_SO_GET_REVISION_MATCH: case IP6T_SO_GET_REVISION_TARGET: { struct xt_get_revision rev; int target; if (*len != sizeof(rev)) { ret = -EINVAL; break; } if (copy_from_user(&rev, user, sizeof(rev)) != 0) { ret = -EFAULT; break; } rev.name[sizeof(rev.name)-1] = 0; if (cmd == IP6T_SO_GET_REVISION_TARGET) target = 1; else target = 0; try_then_request_module(xt_find_revision(AF_INET6, rev.name, rev.revision, target, &ret), "ip6t_%s", rev.name); break; } default: ret = -EINVAL; } return ret; } static void __ip6t_unregister_table(struct net *net, struct xt_table *table) { struct xt_table_info *private; void *loc_cpu_entry; struct module *table_owner = table->me; struct ip6t_entry *iter; private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) cleanup_entry(iter, net); if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); } int ip6t_register_table(struct net *net, const struct xt_table *table, const struct ip6t_replace *repl, const struct nf_hook_ops *template_ops) { struct nf_hook_ops *ops; unsigned int num_ops; int ret, i; struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; void *loc_cpu_entry; struct xt_table *new_table; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) return -ENOMEM; loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(net, newinfo, loc_cpu_entry, repl); if (ret != 0) { xt_free_table_info(newinfo); return ret; } new_table = xt_register_table(net, table, &bootstrap, newinfo); if (IS_ERR(new_table)) { struct ip6t_entry *iter; xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) cleanup_entry(iter, net); xt_free_table_info(newinfo); return PTR_ERR(new_table); } if (!template_ops) return 0; num_ops = 
hweight32(table->valid_hooks); if (num_ops == 0) { ret = -EINVAL; goto out_free; } ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); if (!ops) { ret = -ENOMEM; goto out_free; } for (i = 0; i < num_ops; i++) ops[i].priv = new_table; new_table->ops = ops; ret = nf_register_net_hooks(net, ops, num_ops); if (ret != 0) goto out_free; return ret; out_free: __ip6t_unregister_table(net, new_table); return ret; } void ip6t_unregister_table_pre_exit(struct net *net, const char *name) { struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name); if (table) nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); } void ip6t_unregister_table_exit(struct net *net, const char *name) { struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name); if (table) __ip6t_unregister_table(net, table); } /* The built-in targets: standard (NULL) and error. */ static struct xt_target ip6t_builtin_tg[] __read_mostly = { { .name = XT_STANDARD_TARGET, .targetsize = sizeof(int), .family = NFPROTO_IPV6, #ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(compat_int_t), .compat_from_user = compat_standard_from_user, .compat_to_user = compat_standard_to_user, #endif }, { .name = XT_ERROR_TARGET, .target = ip6t_error, .targetsize = XT_FUNCTION_MAXNAMELEN, .family = NFPROTO_IPV6, }, }; static struct nf_sockopt_ops ip6t_sockopts = { .pf = PF_INET6, .set_optmin = IP6T_BASE_CTL, .set_optmax = IP6T_SO_SET_MAX+1, .set = do_ip6t_set_ctl, .get_optmin = IP6T_BASE_CTL, .get_optmax = IP6T_SO_GET_MAX+1, .get = do_ip6t_get_ctl, .owner = THIS_MODULE, }; static int __net_init ip6_tables_net_init(struct net *net) { return xt_proto_init(net, NFPROTO_IPV6); } static void __net_exit ip6_tables_net_exit(struct net *net) { xt_proto_fini(net, NFPROTO_IPV6); } static struct pernet_operations ip6_tables_net_ops = { .init = ip6_tables_net_init, .exit = ip6_tables_net_exit, }; static int __init ip6_tables_init(void) { int ret; ret = register_pernet_subsys(&ip6_tables_net_ops); if (ret < 0) goto err1; /* No one else will be downing sem now, so we won't sleep */ ret = xt_register_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg)); if (ret < 0) goto err2; /* Register setsockopt */ ret = nf_register_sockopt(&ip6t_sockopts); if (ret < 0) goto err4; return 0; err4: xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg)); err2: unregister_pernet_subsys(&ip6_tables_net_ops); err1: return ret; } static void __exit ip6_tables_fini(void) { nf_unregister_sockopt(&ip6t_sockopts); xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg)); unregister_pernet_subsys(&ip6_tables_net_ops); } EXPORT_SYMBOL(ip6t_register_table); EXPORT_SYMBOL(ip6t_unregister_table_pre_exit); EXPORT_SYMBOL(ip6t_unregister_table_exit); EXPORT_SYMBOL(ip6t_do_table); module_init(ip6_tables_init); module_exit(ip6_tables_fini); |
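/*
 * Editor's sketch (not part of the original file): how a table module is
 * expected to use the registration interface exported above, loosely modelled
 * on ip6table_filter. All example_* names are illustrative; assumes the usual
 * table-module headers (linux/slab.h, linux/netfilter_ipv6/ip6_tables.h).
 */
#define EXAMPLE_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | (1 << NF_INET_LOCAL_OUT))

static const struct xt_table example_table = {
	.name		= "example",
	.valid_hooks	= EXAMPLE_VALID_HOOKS,
	.me		= THIS_MODULE,
	.af		= NFPROTO_IPV6,
	.priority	= NF_IP6_PRI_FILTER,
};

/* One template op per bit set in valid_hooks; ip6t_register_table() copies
 * this array and points each op's priv at the freshly registered table. */
static const struct nf_hook_ops example_hook_ops[] = {
	{
		.hook		= ip6t_do_table,
		.pf		= NFPROTO_IPV6,
		.hooknum	= NF_INET_LOCAL_IN,
		.priority	= NF_IP6_PRI_FILTER,
	},
	{
		.hook		= ip6t_do_table,
		.pf		= NFPROTO_IPV6,
		.hooknum	= NF_INET_LOCAL_OUT,
		.priority	= NF_IP6_PRI_FILTER,
	},
};

static int __net_init example_table_init(struct net *net)
{
	struct ip6t_replace *repl;
	int err;

	/* Build an initial ruleset with one unconditional policy per hooked chain. */
	repl = ip6t_alloc_initial_table(&example_table);
	if (!repl)
		return -ENOMEM;
	err = ip6t_register_table(net, &example_table, repl, example_hook_ops);
	kfree(repl);
	return err;
}

/*
 * Teardown would call ip6t_unregister_table_pre_exit() from the pernet
 * .pre_exit callback and ip6t_unregister_table_exit() from .exit, mirroring
 * the two-stage unregistration exported above.
 */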
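/*
 * Illustrative userspace sketch (not part of the kernel sources): querying a
 * table through the nf_sockopt interface registered above (ip6t_sockopts /
 * do_ip6t_get_ctl).  The AF_INET6 raw socket and the IPPROTO_IPV6 sockopt
 * level mirror what the ip6tables userspace tool uses, which is an assumption
 * here rather than something stated in the kernel code itself.  Needs
 * CAP_NET_ADMIN, as enforced by do_ip6t_get_ctl().
 */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/netfilter_ipv6/ip6_tables.h>

int main(void)
{
        struct ip6t_getinfo info;
        socklen_t len = sizeof(info);
        int fd = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);

        if (fd < 0) {
                perror("socket");
                return 1;
        }

        memset(&info, 0, sizeof(info));
        strncpy(info.name, "filter", sizeof(info.name) - 1);

        /* Handled in the kernel by get_info() via do_ip6t_get_ctl(). */
        if (getsockopt(fd, IPPROTO_IPV6, IP6T_SO_GET_INFO, &info, &len) < 0) {
                perror("getsockopt(IP6T_SO_GET_INFO)");
                return 1;
        }

        printf("table %s: %u entries, %u bytes, valid_hooks 0x%x\n",
               info.name, info.num_entries, info.size, info.valid_hooks);
        return 0;
}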
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FS_H #define _LINUX_FS_H #include <linux/fs/super.h> #include <linux/vfsdebug.h> #include <linux/linkage.h> #include <linux/wait_bit.h> #include <linux/kdev_t.h> #include <linux/dcache.h> #include <linux/path.h> #include <linux/stat.h> #include <linux/cache.h> #include <linux/list.h> #include <linux/llist.h> #include <linux/radix-tree.h> #include <linux/xarray.h> #include <linux/rbtree.h> #include <linux/init.h> #include 
<linux/pid.h> #include <linux/bug.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/mm_types.h> #include <linux/capability.h> #include <linux/semaphore.h> #include <linux/fcntl.h> #include <linux/rculist_bl.h> #include <linux/atomic.h> #include <linux/shrinker.h> #include <linux/migrate_mode.h> #include <linux/uidgid.h> #include <linux/lockdep.h> #include <linux/percpu-rwsem.h> #include <linux/workqueue.h> #include <linux/delayed_call.h> #include <linux/uuid.h> #include <linux/errseq.h> #include <linux/ioprio.h> #include <linux/build_bug.h> #include <linux/stddef.h> #include <linux/mount.h> #include <linux/cred.h> #include <linux/mnt_idmapping.h> #include <linux/slab.h> #include <linux/maple_tree.h> #include <linux/rw_hint.h> #include <linux/file_ref.h> #include <linux/unicode.h> #include <asm/byteorder.h> #include <uapi/linux/fs.h> struct bdi_writeback; struct bio; struct io_comp_batch; struct fiemap_extent_info; struct kiocb; struct kobject; struct pipe_inode_info; struct poll_table_struct; struct kstatfs; struct vm_area_struct; struct vfsmount; struct cred; struct swap_info_struct; struct seq_file; struct iov_iter; struct fsnotify_mark_connector; struct fs_context; struct fs_parameter_spec; struct file_kattr; struct iomap_ops; struct delegated_inode; extern void __init inode_init(void); extern void __init inode_init_early(void); extern void __init files_init(void); extern void __init files_maxfiles_init(void); extern unsigned long get_max_files(void); extern unsigned int sysctl_nr_open; typedef __kernel_rwf_t rwf_t; struct buffer_head; typedef int (get_block_t)(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, ssize_t bytes, void *private); #define MAY_EXEC 0x00000001 #define MAY_WRITE 0x00000002 #define MAY_READ 0x00000004 #define MAY_APPEND 0x00000008 #define MAY_ACCESS 0x00000010 #define MAY_OPEN 0x00000020 #define MAY_CHDIR 0x00000040 /* called from RCU mode, don't block */ #define MAY_NOT_BLOCK 0x00000080 /* * flags in file.f_mode. Note that FMODE_READ and FMODE_WRITE must correspond * to O_WRONLY and O_RDWR via the strange trick in do_dentry_open() */ /* file is open for reading */ #define FMODE_READ ((__force fmode_t)(1 << 0)) /* file is open for writing */ #define FMODE_WRITE ((__force fmode_t)(1 << 1)) /* file is seekable */ #define FMODE_LSEEK ((__force fmode_t)(1 << 2)) /* file can be accessed using pread */ #define FMODE_PREAD ((__force fmode_t)(1 << 3)) /* file can be accessed using pwrite */ #define FMODE_PWRITE ((__force fmode_t)(1 << 4)) /* File is opened for execution with sys_execve / sys_uselib */ #define FMODE_EXEC ((__force fmode_t)(1 << 5)) /* File writes are restricted (block device specific) */ #define FMODE_WRITE_RESTRICTED ((__force fmode_t)(1 << 6)) /* File supports atomic writes */ #define FMODE_CAN_ATOMIC_WRITE ((__force fmode_t)(1 << 7)) /* FMODE_* bit 8 */ /* 32bit hashes as llseek() offset (for directories) */ #define FMODE_32BITHASH ((__force fmode_t)(1 << 9)) /* 64bit hashes as llseek() offset (for directories) */ #define FMODE_64BITHASH ((__force fmode_t)(1 << 10)) /* * Don't update ctime and mtime. * * Currently a special hack for the XFS open_by_handle ioctl, but we'll * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon. 
*/ #define FMODE_NOCMTIME ((__force fmode_t)(1 << 11)) /* Expect random access pattern */ #define FMODE_RANDOM ((__force fmode_t)(1 << 12)) /* Supports IOCB_HAS_METADATA */ #define FMODE_HAS_METADATA ((__force fmode_t)(1 << 13)) /* File is opened with O_PATH; almost nothing can be done with it */ #define FMODE_PATH ((__force fmode_t)(1 << 14)) /* File needs atomic accesses to f_pos */ #define FMODE_ATOMIC_POS ((__force fmode_t)(1 << 15)) /* Write access to underlying fs */ #define FMODE_WRITER ((__force fmode_t)(1 << 16)) /* Has read method(s) */ #define FMODE_CAN_READ ((__force fmode_t)(1 << 17)) /* Has write method(s) */ #define FMODE_CAN_WRITE ((__force fmode_t)(1 << 18)) #define FMODE_OPENED ((__force fmode_t)(1 << 19)) #define FMODE_CREATED ((__force fmode_t)(1 << 20)) /* File is stream-like */ #define FMODE_STREAM ((__force fmode_t)(1 << 21)) /* File supports DIRECT IO */ #define FMODE_CAN_ODIRECT ((__force fmode_t)(1 << 22)) #define FMODE_NOREUSE ((__force fmode_t)(1 << 23)) /* File is embedded in backing_file object */ #define FMODE_BACKING ((__force fmode_t)(1 << 24)) /* * Together with FMODE_NONOTIFY_PERM defines which fsnotify events shouldn't be * generated (see below) */ #define FMODE_NONOTIFY ((__force fmode_t)(1 << 25)) /* * Together with FMODE_NONOTIFY defines which fsnotify events shouldn't be * generated (see below) */ #define FMODE_NONOTIFY_PERM ((__force fmode_t)(1 << 26)) /* File is capable of returning -EAGAIN if I/O will block */ #define FMODE_NOWAIT ((__force fmode_t)(1 << 27)) /* File represents mount that needs unmounting */ #define FMODE_NEED_UNMOUNT ((__force fmode_t)(1 << 28)) /* File does not contribute to nr_files count */ #define FMODE_NOACCOUNT ((__force fmode_t)(1 << 29)) /* * The two FMODE_NONOTIFY* define which fsnotify events should not be generated * for an open file. These are the possible values of * (f->f_mode & FMODE_FSNOTIFY_MASK) and their meaning: * * FMODE_NONOTIFY - suppress all (incl. non-permission) events. * FMODE_NONOTIFY_PERM - suppress permission (incl. pre-content) events. * FMODE_NONOTIFY | FMODE_NONOTIFY_PERM - suppress only FAN_ACCESS_PERM. */ #define FMODE_FSNOTIFY_MASK \ (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM) #define FMODE_FSNOTIFY_NONE(mode) \ ((mode & FMODE_FSNOTIFY_MASK) == FMODE_NONOTIFY) #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS #define FMODE_FSNOTIFY_HSM(mode) \ ((mode & FMODE_FSNOTIFY_MASK) == 0 || \ (mode & FMODE_FSNOTIFY_MASK) == (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM)) #define FMODE_FSNOTIFY_ACCESS_PERM(mode) \ ((mode & FMODE_FSNOTIFY_MASK) == 0) #else #define FMODE_FSNOTIFY_ACCESS_PERM(mode) 0 #define FMODE_FSNOTIFY_HSM(mode) 0 #endif /* * Attribute flags. These should be or-ed together to figure out what * has been changed! */ #define ATTR_MODE (1 << 0) #define ATTR_UID (1 << 1) #define ATTR_GID (1 << 2) #define ATTR_SIZE (1 << 3) #define ATTR_ATIME (1 << 4) #define ATTR_MTIME (1 << 5) #define ATTR_CTIME (1 << 6) #define ATTR_ATIME_SET (1 << 7) #define ATTR_MTIME_SET (1 << 8) #define ATTR_FORCE (1 << 9) /* Not a change, but a change it */ #define ATTR_CTIME_SET (1 << 10) #define ATTR_KILL_SUID (1 << 11) #define ATTR_KILL_SGID (1 << 12) #define ATTR_FILE (1 << 13) #define ATTR_KILL_PRIV (1 << 14) #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ #define ATTR_TIMES_SET (1 << 16) #define ATTR_TOUCH (1 << 17) #define ATTR_DELEG (1 << 18) /* Delegated attrs. Don't break write delegations */ /* * Whiteout is represented by a char device. The following constants define the * mode and device number to use. 
*/ #define WHITEOUT_MODE 0 #define WHITEOUT_DEV 0 /* * This is the Inode Attributes structure, used for notify_change(). It * uses the above definitions as flags, to know which values have changed. * Also, in this manner, a Filesystem can look at only the values it cares * about. Basically, these are the attributes that the VFS layer can * request to change from the FS layer. * * Derek Atkins <warlord@MIT.EDU> 94-10-20 */ struct iattr { unsigned int ia_valid; umode_t ia_mode; /* * The two anonymous unions wrap structures with the same member. * * Filesystems raising FS_ALLOW_IDMAP need to use ia_vfs{g,u}id which * are a dedicated type requiring the filesystem to use the dedicated * helpers. Other filesystem can continue to use ia_{g,u}id until they * have been ported. * * They always contain the same value. In other words FS_ALLOW_IDMAP * pass down the same value on idmapped mounts as they would on regular * mounts. */ union { kuid_t ia_uid; vfsuid_t ia_vfsuid; }; union { kgid_t ia_gid; vfsgid_t ia_vfsgid; }; loff_t ia_size; struct timespec64 ia_atime; struct timespec64 ia_mtime; struct timespec64 ia_ctime; /* * Not an attribute, but an auxiliary info for filesystems wanting to * implement an ftruncate() like method. NOTE: filesystem should * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL). */ struct file *ia_file; }; /* * Maximum number of layers of fs stack. Needs to be limited to * prevent kernel stack overflow */ #define FILESYSTEM_MAX_STACK_DEPTH 2 /** * enum positive_aop_returns - aop return codes with specific semantics * * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has * completed, that the page is still locked, and * should be considered active. The VM uses this hint * to return the page to the active list -- it won't * be a candidate for writeback again in the near * future. Other callers must be careful to unlock * the page if they get this return. Returned by * writepage(); * * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has * unlocked it and the page might have been truncated. * The caller should back up to acquiring a new page and * trying again. The aop will be taking reasonable * precautions not to livelock. If the caller held a page * reference, it should drop it before retrying. Returned * by read_folio(). * * address_space_operation functions return these large constants to indicate * special semantics to the caller. These are much larger than the bytes in a * page to allow for functions that return the number of bytes operated on in a * given page. */ enum positive_aop_returns { AOP_WRITEPAGE_ACTIVATE = 0x80000, AOP_TRUNCATED_PAGE = 0x80001, }; /* * oh the beauties of C type declarations. */ struct page; struct address_space; struct writeback_control; struct readahead_control; /* Match RWF_* bits to IOCB bits */ #define IOCB_HIPRI (__force int) RWF_HIPRI #define IOCB_DSYNC (__force int) RWF_DSYNC #define IOCB_SYNC (__force int) RWF_SYNC #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND #define IOCB_ATOMIC (__force int) RWF_ATOMIC #define IOCB_DONTCACHE (__force int) RWF_DONTCACHE #define IOCB_NOSIGNAL (__force int) RWF_NOSIGNAL /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) #define IOCB_DIRECT (1 << 17) #define IOCB_WRITE (1 << 18) /* iocb->ki_waitq is valid */ #define IOCB_WAITQ (1 << 19) #define IOCB_NOIO (1 << 20) /* can use bio alloc cache */ #define IOCB_ALLOC_CACHE (1 << 21) /* kiocb is a read or write operation submitted by fs/aio.c. 
*/ #define IOCB_AIO_RW (1 << 22) #define IOCB_HAS_METADATA (1 << 23) /* for use in trace events */ #define TRACE_IOCB_STRINGS \ { IOCB_HIPRI, "HIPRI" }, \ { IOCB_DSYNC, "DSYNC" }, \ { IOCB_SYNC, "SYNC" }, \ { IOCB_NOWAIT, "NOWAIT" }, \ { IOCB_APPEND, "APPEND" }, \ { IOCB_ATOMIC, "ATOMIC" }, \ { IOCB_DONTCACHE, "DONTCACHE" }, \ { IOCB_EVENTFD, "EVENTFD"}, \ { IOCB_DIRECT, "DIRECT" }, \ { IOCB_WRITE, "WRITE" }, \ { IOCB_WAITQ, "WAITQ" }, \ { IOCB_NOIO, "NOIO" }, \ { IOCB_ALLOC_CACHE, "ALLOC_CACHE" }, \ { IOCB_AIO_RW, "AIO_RW" }, \ { IOCB_HAS_METADATA, "AIO_HAS_METADATA" } struct kiocb { struct file *ki_filp; loff_t ki_pos; void (*ki_complete)(struct kiocb *iocb, long ret); void *private; int ki_flags; u16 ki_ioprio; /* See linux/ioprio.h */ u8 ki_write_stream; /* * Only used for async buffered reads, where it denotes the page * waitqueue associated with completing the read. * Valid IFF IOCB_WAITQ is set. */ struct wait_page_queue *ki_waitq; }; static inline bool is_sync_kiocb(struct kiocb *kiocb) { return kiocb->ki_complete == NULL; } struct address_space_operations { int (*read_folio)(struct file *, struct folio *); /* Write back some dirty pages from this mapping. */ int (*writepages)(struct address_space *, struct writeback_control *); /* Mark a folio dirty. Return true if this dirtied it */ bool (*dirty_folio)(struct address_space *, struct folio *); void (*readahead)(struct readahead_control *); int (*write_begin)(const struct kiocb *, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata); int (*write_end)(const struct kiocb *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata); /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ sector_t (*bmap)(struct address_space *, sector_t); void (*invalidate_folio) (struct folio *, size_t offset, size_t len); bool (*release_folio)(struct folio *, gfp_t); void (*free_folio)(struct folio *folio); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); /* * migrate the contents of a folio to the specified target. If * migrate_mode is MIGRATE_ASYNC, it must not block. */ int (*migrate_folio)(struct address_space *, struct folio *dst, struct folio *src, enum migrate_mode); int (*launder_folio)(struct folio *); bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); void (*is_dirty_writeback) (struct folio *, bool *dirty, bool *wb); int (*error_remove_folio)(struct address_space *, struct folio *); /* swapfile support */ int (*swap_activate)(struct swap_info_struct *sis, struct file *file, sector_t *span); void (*swap_deactivate)(struct file *file); int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); }; extern const struct address_space_operations empty_aops; /* Structure for tracking metadata buffer heads associated with the mapping */ struct mapping_metadata_bhs { struct address_space *mapping; /* Mapping bhs are associated with */ spinlock_t lock; /* Lock protecting bh list */ struct list_head list; /* The list of bhs (b_assoc_buffers) */ }; /** * struct address_space - Contents of a cacheable, mappable object. * @host: Owner, either the inode or the block_device. * @i_pages: Cached pages. * @invalidate_lock: Guards coherency between page cache contents and * file offset->disk block mappings in the filesystem during invalidates. * It is also used to block modification of page cache contents through * memory mappings. * @gfp_mask: Memory allocation flags to use for allocating pages. 
* @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings. * @nr_thps: Number of THPs in the pagecache (non-shmem only). * @i_mmap: Tree of private and shared mappings. * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. * @nrpages: Number of page entries, protected by the i_pages lock. * @writeback_index: Writeback starts here. * @a_ops: Methods. * @flags: Error bits and flags (AS_*). * @wb_err: The most recent error which has occurred. * @i_private_lock: For use by the owner of the address_space. */ struct address_space { struct inode *host; struct xarray i_pages; struct rw_semaphore invalidate_lock; gfp_t gfp_mask; atomic_t i_mmap_writable; #ifdef CONFIG_READ_ONLY_THP_FOR_FS /* number of thp, only for non-shmem files */ atomic_t nr_thps; #endif struct rb_root_cached i_mmap; unsigned long nrpages; pgoff_t writeback_index; const struct address_space_operations *a_ops; unsigned long flags; errseq_t wb_err; spinlock_t i_private_lock; struct rw_semaphore i_mmap_rwsem; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but * must be enforced here for CRIS, to let the least significant bit * of struct folio's "mapping" pointer be used for FOLIO_MAPPING_ANON. */ /* XArray tags, for tagging dirty and writeback pages in the pagecache. */ #define PAGECACHE_TAG_DIRTY XA_MARK_0 #define PAGECACHE_TAG_WRITEBACK XA_MARK_1 #define PAGECACHE_TAG_TOWRITE XA_MARK_2 /* * Returns true if any of the pages in the mapping are marked with the tag. */ static inline bool mapping_tagged(const struct address_space *mapping, xa_mark_t tag) { return xa_marked(&mapping->i_pages, tag); } static inline void i_mmap_lock_write(struct address_space *mapping) { down_write(&mapping->i_mmap_rwsem); } static inline int i_mmap_trylock_write(struct address_space *mapping) { return down_write_trylock(&mapping->i_mmap_rwsem); } static inline void i_mmap_unlock_write(struct address_space *mapping) { up_write(&mapping->i_mmap_rwsem); } static inline int i_mmap_trylock_read(struct address_space *mapping) { return down_read_trylock(&mapping->i_mmap_rwsem); } static inline void i_mmap_lock_read(struct address_space *mapping) { down_read(&mapping->i_mmap_rwsem); } static inline void i_mmap_unlock_read(struct address_space *mapping) { up_read(&mapping->i_mmap_rwsem); } static inline void i_mmap_assert_locked(struct address_space *mapping) { lockdep_assert_held(&mapping->i_mmap_rwsem); } static inline void i_mmap_assert_write_locked(struct address_space *mapping) { lockdep_assert_held_write(&mapping->i_mmap_rwsem); } /* * Might pages of this file be mapped into userspace? */ static inline int mapping_mapped(const struct address_space *mapping) { return !RB_EMPTY_ROOT(&mapping->i_mmap.rb_root); } /* * Might pages of this file have been modified in userspace? * Note that i_mmap_writable counts all VM_SHARED, VM_MAYWRITE vmas: do_mmap * marks vma as VM_SHARED if it is shared, and the file was opened for * writing i.e. vma may be mprotected writable even if now readonly. * * If i_mmap_writable is negative, no new writable mappings are allowed. You * can only deny writable mappings, if none exists right now. */ static inline int mapping_writably_mapped(const struct address_space *mapping) { return atomic_read(&mapping->i_mmap_writable) > 0; } static inline int mapping_map_writable(struct address_space *mapping) { return atomic_inc_unless_negative(&mapping->i_mmap_writable) ? 
0 : -EPERM; } static inline void mapping_unmap_writable(struct address_space *mapping) { atomic_dec(&mapping->i_mmap_writable); } static inline int mapping_deny_writable(struct address_space *mapping) { return atomic_dec_unless_positive(&mapping->i_mmap_writable) ? 0 : -EBUSY; } static inline void mapping_allow_writable(struct address_space *mapping) { atomic_inc(&mapping->i_mmap_writable); } /* * Use sequence counter to get consistent i_size on 32-bit processors. */ #if BITS_PER_LONG==32 && defined(CONFIG_SMP) #include <linux/seqlock.h> #define __NEED_I_SIZE_ORDERED #define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount) #else #define i_size_ordered_init(inode) do { } while (0) #endif struct posix_acl; #define ACL_NOT_CACHED ((void *)(-1)) /* * ACL_DONT_CACHE is for stacked filesystems, that rely on underlying fs to * cache the ACL. This also means that ->get_inode_acl() can be called in RCU * mode with the LOOKUP_RCU flag. */ #define ACL_DONT_CACHE ((void *)(-3)) static inline struct posix_acl * uncached_acl_sentinel(struct task_struct *task) { return (void *)task + 1; } static inline bool is_uncached_acl(struct posix_acl *acl) { return (long)acl & 1; } #define IOP_FASTPERM 0x0001 #define IOP_LOOKUP 0x0002 #define IOP_NOFOLLOW 0x0004 #define IOP_XATTR 0x0008 #define IOP_DEFAULT_READLINK 0x0010 #define IOP_MGTIME 0x0020 #define IOP_CACHED_LINK 0x0040 #define IOP_FASTPERM_MAY_EXEC 0x0080 #define IOP_FLCTX 0x0100 /* * Inode state bits. Protected by inode->i_lock * * Four bits determine the dirty state of the inode: I_DIRTY_SYNC, * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME. * * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at * various stages of removing an inode. * * Two bits are used for locking and completion notification, I_NEW and I_SYNC. * * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on * fdatasync() (unless I_DIRTY_DATASYNC is also set). * Timestamp updates are the usual cause. * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of * these changes separately from I_DIRTY_SYNC so that we * don't have to write inode on fdatasync() when only * e.g. the timestamps have changed. * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. * I_DIRTY_TIME The inode itself has dirty timestamps, and the * lazytime mount option is enabled. We keep track of this * separately from I_DIRTY_SYNC in order to implement * lazytime. This gets cleared if I_DIRTY_INODE * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But * I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already * in place because writeback might already be in progress * and we don't want to lose the time update * I_NEW Serves as both a mutex and completion notification. * New inodes set I_NEW. If two processes both create * the same inode, one of them will release its inode and * wait for I_NEW to be released before returning. * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can * also cause waiting on I_NEW, without I_NEW actually * being set. find_inode() uses this to prevent returning * nearly-dead inodes. * I_WILL_FREE Must be set when calling write_inode_now() if i_count * is zero. I_FREEING must be set when I_WILL_FREE is * cleared. * I_FREEING Set when inode is about to be freed but still has dirty * pages or buffers attached or the inode itself is still * dirty. * I_CLEAR Added by clear_inode(). In this state the inode is * clean and can be destroyed. 
Inode keeps I_FREEING. * * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are * prohibited for many purposes. iget() must wait for * the inode to be completely released, then create it * anew. Other functions will just ignore such inodes, * if appropriate. I_NEW is used for waiting. * * I_SYNC Writeback of inode is running. The bit is set during * data writeback, and cleared with a wakeup on the bit * address once it is done. The bit is also used to pin * the inode in memory for flusher thread. * * I_REFERENCED Marks the inode as recently references on the LRU list. * * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to * synchronize competing switching instances and to tell * wb stat updates to grab the i_pages lock. See * inode_switch_wbs_work_fn() for details. * * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper * and work dirs among overlayfs mounts. * * I_CREATING New object's inode in the middle of setting up. * * I_DONTCACHE Evict inode as soon as it is not used anymore. * * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists. * Used to detect that mark_inode_dirty() should not move * inode between dirty lists. * * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback. * * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding * i_count. * * Q: What is the difference between I_WILL_FREE and I_FREEING? * * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait * upon. There's one free address left. */ enum inode_state_bits { __I_NEW = 0U, __I_SYNC = 1U, __I_LRU_ISOLATING = 2U /* reserved wait address bit 3 */ }; enum inode_state_flags_enum { I_NEW = (1U << __I_NEW), I_SYNC = (1U << __I_SYNC), I_LRU_ISOLATING = (1U << __I_LRU_ISOLATING), /* reserved flag bit 3 */ I_DIRTY_SYNC = (1U << 4), I_DIRTY_DATASYNC = (1U << 5), I_DIRTY_PAGES = (1U << 6), I_WILL_FREE = (1U << 7), I_FREEING = (1U << 8), I_CLEAR = (1U << 9), I_REFERENCED = (1U << 10), I_LINKABLE = (1U << 11), I_DIRTY_TIME = (1U << 12), I_WB_SWITCH = (1U << 13), I_OVL_INUSE = (1U << 14), I_CREATING = (1U << 15), I_DONTCACHE = (1U << 16), I_SYNC_QUEUED = (1U << 17), I_PINNING_NETFS_WB = (1U << 18) }; #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) #define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) /* * Use inode_state_read() & friends to access. */ struct inode_state_flags { enum inode_state_flags_enum __state; }; /* * Keep mostly read-only and often accessed (especially for * the RCU path lookup and 'stat' data) fields at the beginning * of the 'struct inode' */ struct inode { umode_t i_mode; unsigned short i_opflags; unsigned int i_flags; #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *i_acl; struct posix_acl *i_default_acl; #endif kuid_t i_uid; kgid_t i_gid; const struct inode_operations *i_op; struct super_block *i_sb; struct address_space *i_mapping; #ifdef CONFIG_SECURITY void *i_security; #endif /* Stat data, not accessed from path walking */ u64 i_ino; /* * Filesystems may only read i_nlink directly. 
They shall use the * following functions for modification: * * (set|clear|inc|drop)_nlink * inode_(inc|dec)_link_count */ union { const unsigned int i_nlink; unsigned int __i_nlink; }; dev_t i_rdev; loff_t i_size; time64_t i_atime_sec; time64_t i_mtime_sec; time64_t i_ctime_sec; u32 i_atime_nsec; u32 i_mtime_nsec; u32 i_ctime_nsec; u32 i_generation; spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ unsigned short i_bytes; u8 i_blkbits; enum rw_hint i_write_hint; blkcnt_t i_blocks; #ifdef __NEED_I_SIZE_ORDERED seqcount_t i_size_seqcount; #endif /* Misc */ struct inode_state_flags i_state; /* 32-bit hole */ struct rw_semaphore i_rwsem; unsigned long dirtied_when; /* jiffies of first dirtying */ unsigned long dirtied_time_when; struct hlist_node i_hash; struct list_head i_io_list; /* backing dev IO list */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *i_wb; /* the associated cgroup wb */ /* foreign inode detection, see wbc_detach_inode() */ int i_wb_frn_winner; u16 i_wb_frn_avg_time; u16 i_wb_frn_history; #endif struct list_head i_lru; /* inode LRU list */ struct list_head i_sb_list; struct list_head i_wb_list; /* backing dev writeback list */ union { struct hlist_head i_dentry; struct rcu_head i_rcu; }; atomic64_t i_version; atomic64_t i_sequence; /* see futex */ atomic_t i_count; atomic_t i_dio_count; atomic_t i_writecount; #if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING) atomic_t i_readcount; /* struct files open RO */ #endif union { const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ void (*free_inode)(struct inode *); }; struct file_lock_context *i_flctx; struct address_space i_data; union { struct list_head i_devices; int i_linklen; }; union { struct pipe_inode_info *i_pipe; struct cdev *i_cdev; char *i_link; unsigned i_dir_seq; }; #ifdef CONFIG_FSNOTIFY __u32 i_fsnotify_mask; /* all events this inode cares about */ /* 32-bit hole reserved for expanding i_fsnotify_mask */ struct fsnotify_mark_connector __rcu *i_fsnotify_marks; #endif void *i_private; /* fs or device private pointer */ } __randomize_layout; /* * i_state handling * * We hide all of it behind helpers so that we can validate consumers. 
*/ static inline enum inode_state_flags_enum inode_state_read_once(struct inode *inode) { return READ_ONCE(inode->i_state.__state); } static inline enum inode_state_flags_enum inode_state_read(struct inode *inode) { lockdep_assert_held(&inode->i_lock); return inode->i_state.__state; } static inline void inode_state_set_raw(struct inode *inode, enum inode_state_flags_enum flags) { WRITE_ONCE(inode->i_state.__state, inode->i_state.__state | flags); } static inline void inode_state_set(struct inode *inode, enum inode_state_flags_enum flags) { lockdep_assert_held(&inode->i_lock); inode_state_set_raw(inode, flags); } static inline void inode_state_clear_raw(struct inode *inode, enum inode_state_flags_enum flags) { WRITE_ONCE(inode->i_state.__state, inode->i_state.__state & ~flags); } static inline void inode_state_clear(struct inode *inode, enum inode_state_flags_enum flags) { lockdep_assert_held(&inode->i_lock); inode_state_clear_raw(inode, flags); } static inline void inode_state_assign_raw(struct inode *inode, enum inode_state_flags_enum flags) { WRITE_ONCE(inode->i_state.__state, flags); } static inline void inode_state_assign(struct inode *inode, enum inode_state_flags_enum flags) { lockdep_assert_held(&inode->i_lock); inode_state_assign_raw(inode, flags); } static inline void inode_state_replace_raw(struct inode *inode, enum inode_state_flags_enum clearflags, enum inode_state_flags_enum setflags) { enum inode_state_flags_enum flags; flags = inode->i_state.__state; flags &= ~clearflags; flags |= setflags; inode_state_assign_raw(inode, flags); } static inline void inode_state_replace(struct inode *inode, enum inode_state_flags_enum clearflags, enum inode_state_flags_enum setflags) { lockdep_assert_held(&inode->i_lock); inode_state_replace_raw(inode, clearflags, setflags); } static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen) { VFS_WARN_ON_INODE(strlen(link) != linklen, inode); VFS_WARN_ON_INODE(inode->i_opflags & IOP_CACHED_LINK, inode); inode->i_link = link; inode->i_linklen = linklen; inode->i_opflags |= IOP_CACHED_LINK; } /* * Get bit address from inode->i_state to use with wait_var_event() * infrastructre. */ #define inode_state_wait_address(inode, bit) ((char *)&(inode)->i_state + (bit)) struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, struct inode *inode, u32 bit); static inline void inode_wake_up_bit(struct inode *inode, u32 bit) { /* Caller is responsible for correct memory barriers. */ wake_up_var(inode_state_wait_address(inode, bit)); } struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode); static inline unsigned int i_blocksize(const struct inode *node) { return (1 << node->i_blkbits); } static inline int inode_unhashed(struct inode *inode) { return hlist_unhashed(&inode->i_hash); } /* * __mark_inode_dirty expects inodes to be hashed. Since we don't * want special inodes in the fileset inode space, we make them * appear hashed, but do not put on any lists. hlist_del() * will work fine and require no locking. */ static inline void inode_fake_hash(struct inode *inode) { hlist_add_fake(&inode->i_hash); } void wait_on_new_inode(struct inode *inode); /* * inode->i_rwsem nesting subclasses for the lock validator: * * 0: the object of the current VFS operation * 1: parent * 2: child/target * 3: xattr * 4: second non-directory * 5: second parent (when locking independent directories in rename) * * I_MUTEX_NONDIR2 is for certain operations (such as rename) which lock two * non-directories at once. 
* * The locking order between these classes is * parent[2] -> child -> grandchild -> normal -> xattr -> second non-directory */ enum inode_i_mutex_lock_class { I_MUTEX_NORMAL, I_MUTEX_PARENT, I_MUTEX_CHILD, I_MUTEX_XATTR, I_MUTEX_NONDIR2, I_MUTEX_PARENT2, }; static inline void inode_lock(struct inode *inode) { down_write(&inode->i_rwsem); } static inline __must_check int inode_lock_killable(struct inode *inode) { return down_write_killable(&inode->i_rwsem); } static inline void inode_unlock(struct inode *inode) { up_write(&inode->i_rwsem); } static inline void inode_lock_shared(struct inode *inode) { down_read(&inode->i_rwsem); } static inline __must_check int inode_lock_shared_killable(struct inode *inode) { return down_read_killable(&inode->i_rwsem); } static inline void inode_unlock_shared(struct inode *inode) { up_read(&inode->i_rwsem); } static inline int inode_trylock(struct inode *inode) { return down_write_trylock(&inode->i_rwsem); } static inline int inode_trylock_shared(struct inode *inode) { return down_read_trylock(&inode->i_rwsem); } static inline int inode_is_locked(struct inode *inode) { return rwsem_is_locked(&inode->i_rwsem); } static inline void inode_lock_nested(struct inode *inode, unsigned subclass) { down_write_nested(&inode->i_rwsem, subclass); } static inline void inode_lock_shared_nested(struct inode *inode, unsigned subclass) { down_read_nested(&inode->i_rwsem, subclass); } static inline void filemap_invalidate_lock(struct address_space *mapping) { down_write(&mapping->invalidate_lock); } static inline void filemap_invalidate_unlock(struct address_space *mapping) { up_write(&mapping->invalidate_lock); } static inline void filemap_invalidate_lock_shared(struct address_space *mapping) { down_read(&mapping->invalidate_lock); } static inline int filemap_invalidate_trylock_shared( struct address_space *mapping) { return down_read_trylock(&mapping->invalidate_lock); } static inline void filemap_invalidate_unlock_shared( struct address_space *mapping) { up_read(&mapping->invalidate_lock); } void lock_two_nondirectories(struct inode *, struct inode*); void unlock_two_nondirectories(struct inode *, struct inode*); void filemap_invalidate_lock_two(struct address_space *mapping1, struct address_space *mapping2); void filemap_invalidate_unlock_two(struct address_space *mapping1, struct address_space *mapping2); /* * NOTE: in a 32bit arch with a preemptable kernel and * an UP compile the i_size_read/write must be atomic * with respect to the local cpu (unlike with preempt disabled), * but they don't need to be atomic with respect to other cpus like in * true SMP (so they need either to either locally disable irq around * the read or for example on x86 they can be still implemented as a * cmpxchg8b without the need of the lock prefix). For SMP compiles * and 64bit archs it makes no difference if preempt is enabled or not. 
*/ static inline loff_t i_size_read(const struct inode *inode) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) loff_t i_size; unsigned int seq; do { seq = read_seqcount_begin(&inode->i_size_seqcount); i_size = inode->i_size; } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); return i_size; #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) loff_t i_size; preempt_disable(); i_size = inode->i_size; preempt_enable(); return i_size; #else /* Pairs with smp_store_release() in i_size_write() */ return smp_load_acquire(&inode->i_size); #endif } /* * NOTE: unlike i_size_read(), i_size_write() does need locking around it * (normally i_rwsem), otherwise on 32bit/SMP an update of i_size_seqcount * can be lost, resulting in subsequent i_size_read() calls spinning forever. */ static inline void i_size_write(struct inode *inode, loff_t i_size) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) preempt_disable(); write_seqcount_begin(&inode->i_size_seqcount); inode->i_size = i_size; write_seqcount_end(&inode->i_size_seqcount); preempt_enable(); #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) preempt_disable(); inode->i_size = i_size; preempt_enable(); #else /* * Pairs with smp_load_acquire() in i_size_read() to ensure * changes related to inode size (such as page contents) are * visible before we see the changed inode size. */ smp_store_release(&inode->i_size, i_size); #endif } static inline unsigned iminor(const struct inode *inode) { return MINOR(inode->i_rdev); } static inline unsigned imajor(const struct inode *inode) { return MAJOR(inode->i_rdev); } struct fown_struct { struct file *file; /* backpointer for security modules */ rwlock_t lock; /* protects pid, uid, euid fields */ struct pid *pid; /* pid or -pgrp where SIGIO should be sent */ enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */ kuid_t uid, euid; /* uid/euid of process setting the owner */ int signum; /* posix.1b rt signal to be delivered on IO */ }; /** * struct file_ra_state - Track a file's readahead state. * @start: Where the most recent readahead started. * @size: Number of pages read in the most recent readahead. * @async_size: Numer of pages that were/are not needed immediately * and so were/are genuinely "ahead". Start next readahead when * the first of these pages is accessed. * @ra_pages: Maximum size of a readahead request, copied from the bdi. * @order: Preferred folio order used for most recent readahead. * @mmap_miss: How many mmap accesses missed in the page cache. * @prev_pos: The last byte in the most recent read request. * * When this structure is passed to ->readahead(), the "most recent" * readahead means the current readahead. */ struct file_ra_state { pgoff_t start; unsigned int size; unsigned int async_size; unsigned int ra_pages; unsigned short order; unsigned short mmap_miss; loff_t prev_pos; }; /* * Check if @index falls in the readahead windows. */ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) { return (index >= ra->start && index < ra->start + ra->size); } /** * struct file - Represents a file * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context. * @f_mode: FMODE_* flags often used in hotpaths * @f_op: file operations * @f_mapping: Contents of a cacheable, mappable object. 
* @private_data: filesystem or driver specific data * @f_inode: cached inode * @f_flags: file flags * @f_iocb_flags: iocb flags * @f_cred: stashed credentials of creator/opener * @f_owner: file owner * @f_path: path of the file * @__f_path: writable alias for @f_path; *ONLY* for core VFS and only before * the file gets open * @f_pos_lock: lock protecting file position * @f_pipe: specific to pipes * @f_pos: file position * @f_security: LSM security context of this file * @f_wb_err: writeback error * @f_sb_err: per sb writeback errors * @f_ep: link of all epoll hooks for this file * @f_task_work: task work entry point * @f_llist: work queue entrypoint * @f_ra: file's readahead state * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.) * @f_ref: reference count */ struct file { spinlock_t f_lock; fmode_t f_mode; const struct file_operations *f_op; struct address_space *f_mapping; void *private_data; struct inode *f_inode; unsigned int f_flags; unsigned int f_iocb_flags; const struct cred *f_cred; struct fown_struct *f_owner; /* --- cacheline 1 boundary (64 bytes) --- */ union { const struct path f_path; struct path __f_path; }; union { /* regular files (with FMODE_ATOMIC_POS) and directories */ struct mutex f_pos_lock; /* pipes */ u64 f_pipe; }; loff_t f_pos; #ifdef CONFIG_SECURITY void *f_security; #endif /* --- cacheline 2 boundary (128 bytes) --- */ errseq_t f_wb_err; errseq_t f_sb_err; #ifdef CONFIG_EPOLL struct hlist_head *f_ep; #endif union { struct callback_head f_task_work; struct llist_node f_llist; struct file_ra_state f_ra; freeptr_t f_freeptr; }; file_ref_t f_ref; /* --- cacheline 3 boundary (192 bytes) --- */ } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ struct file_handle { __u32 handle_bytes; int handle_type; /* file identifier */ unsigned char f_handle[] __counted_by(handle_bytes); }; static inline struct file *get_file(struct file *f) { file_ref_inc(&f->f_ref); return f; } struct file *get_file_rcu(struct file __rcu **f); struct file *get_file_active(struct file **f); #define file_count(f) file_ref_read(&(f)->f_ref) #define MAX_NON_LFS ((1UL<<31) - 1) /* Page cache limit. The filesystems should put that into their s_maxbytes limits, otherwise bad things can happen in VM. */ #if BITS_PER_LONG==32 #define MAX_LFS_FILESIZE ((loff_t)ULONG_MAX << PAGE_SHIFT) #elif BITS_PER_LONG==64 #define MAX_LFS_FILESIZE ((loff_t)LLONG_MAX) #endif /* legacy typedef, should eventually be removed */ typedef void *fl_owner_t; struct file_lock; struct file_lease; /* The following constant reflects the upper bound of the file/locking space */ #ifndef OFFSET_MAX #define OFFSET_MAX type_max(loff_t) #define OFFT_OFFSET_MAX type_max(off_t) #endif int file_f_owner_allocate(struct file *file); static inline struct fown_struct *file_f_owner(const struct file *file) { return READ_ONCE(file->f_owner); } extern void send_sigio(struct fown_struct *fown, int fd, int band); static inline struct inode *file_inode(const struct file *f) { return f->f_inode; } /* * file_dentry() is a relic from the days that overlayfs was using files with a * "fake" path, meaning, f_path on overlayfs and f_inode on underlying fs. * In those days, file_dentry() was needed to get the underlying fs dentry that * matches f_inode. * Files with "fake" path should not exist nowadays, so use an assertion to make * sure that file_dentry() was not papering over filesystem bugs. 
*/ static inline struct dentry *file_dentry(const struct file *file) { struct dentry *dentry = file->f_path.dentry; WARN_ON_ONCE(d_inode(dentry) != file_inode(file)); return dentry; } struct fasync_struct { rwlock_t fa_lock; int magic; int fa_fd; struct fasync_struct *fa_next; /* singly linked list */ struct file *fa_file; struct rcu_head fa_rcu; }; #define FASYNC_MAGIC 0x4601 /* SMP safe fasync helpers: */ extern int fasync_helper(int, struct file *, int, struct fasync_struct **); extern struct fasync_struct *fasync_insert_entry(int, struct file *, struct fasync_struct **, struct fasync_struct *); extern int fasync_remove_entry(struct file *, struct fasync_struct **); extern struct fasync_struct *fasync_alloc(void); extern void fasync_free(struct fasync_struct *); /* can be called from interrupts */ extern void kill_fasync(struct fasync_struct **, int, int); extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force); extern int f_setown(struct file *filp, int who, int force); extern void f_delown(struct file *filp); extern pid_t f_getown(struct file *filp); extern int send_sigurg(struct file *file); /* * Umount options */ #define MNT_FORCE 0x00000001 /* Attempt to forcibly umount */ #define MNT_DETACH 0x00000002 /* Just detach from the tree */ #define MNT_EXPIRE 0x00000004 /* Mark for expiry */ #define UMOUNT_NOFOLLOW 0x00000008 /* Don't follow symlink on umount */ #define UMOUNT_UNUSED 0x80000000 /* Flag guaranteed to be unused */ static inline struct user_namespace *i_user_ns(const struct inode *inode) { return inode->i_sb->s_user_ns; } /* Helper functions so that in most cases filesystems will * not need to deal directly with kuid_t and kgid_t and can * instead deal with the raw numeric values that are stored * in the filesystem. */ static inline uid_t i_uid_read(const struct inode *inode) { return from_kuid(i_user_ns(inode), inode->i_uid); } static inline gid_t i_gid_read(const struct inode *inode) { return from_kgid(i_user_ns(inode), inode->i_gid); } static inline void i_uid_write(struct inode *inode, uid_t uid) { inode->i_uid = make_kuid(i_user_ns(inode), uid); } static inline void i_gid_write(struct inode *inode, gid_t gid) { inode->i_gid = make_kgid(i_user_ns(inode), gid); } /** * i_uid_into_vfsuid - map an inode's i_uid down according to an idmapping * @idmap: idmap of the mount the inode was found from * @inode: inode to map * * Return: the inode's i_uid mapped down according to @idmap. * If the inode's i_uid has no mapping INVALID_VFSUID is returned. */ static inline vfsuid_t i_uid_into_vfsuid(struct mnt_idmap *idmap, const struct inode *inode) { return make_vfsuid(idmap, i_user_ns(inode), inode->i_uid); } /** * i_uid_needs_update - check whether inode's i_uid needs to be updated * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Check whether @inode's i_uid field needs to be updated taking idmapped * mounts into account if the filesystem supports it. * * Return: true if @inode's i_uid field needs to be updated, false if not.
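 *
 * A minimal usage sketch (illustrative only; the surrounding ->setattr()
 * boilerplate is assumed and not shown here):
 *
 *	if (i_uid_needs_update(idmap, attr, inode))
 *		i_uid_update(idmap, attr, inode);
 *	if (i_gid_needs_update(idmap, attr, inode))
 *		i_gid_update(idmap, attr, inode);
 *
 * Note that i_uid_update() and i_gid_update() already check ATTR_UID and
 * ATTR_GID themselves, so the needs_update checks matter mainly when the
 * caller wants to know whether an update would actually happen.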
*/ static inline bool i_uid_needs_update(struct mnt_idmap *idmap, const struct iattr *attr, const struct inode *inode) { return ((attr->ia_valid & ATTR_UID) && !vfsuid_eq(attr->ia_vfsuid, i_uid_into_vfsuid(idmap, inode))); } /** * i_uid_update - update @inode's i_uid field * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Safely update @inode's i_uid field translating the vfsuid of any idmapped * mount into the filesystem kuid. */ static inline void i_uid_update(struct mnt_idmap *idmap, const struct iattr *attr, struct inode *inode) { if (attr->ia_valid & ATTR_UID) inode->i_uid = from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid); } /** * i_gid_into_vfsgid - map an inode's i_gid down according to an idmapping * @idmap: idmap of the mount the inode was found from * @inode: inode to map * * Return: the inode's i_gid mapped down according to @idmap. * If the inode's i_gid has no mapping INVALID_VFSGID is returned. */ static inline vfsgid_t i_gid_into_vfsgid(struct mnt_idmap *idmap, const struct inode *inode) { return make_vfsgid(idmap, i_user_ns(inode), inode->i_gid); } /** * i_gid_needs_update - check whether inode's i_gid needs to be updated * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Check whether @inode's i_gid field needs to be updated taking idmapped * mounts into account if the filesystem supports it. * * Return: true if @inode's i_gid field needs to be updated, false if not. */ static inline bool i_gid_needs_update(struct mnt_idmap *idmap, const struct iattr *attr, const struct inode *inode) { return ((attr->ia_valid & ATTR_GID) && !vfsgid_eq(attr->ia_vfsgid, i_gid_into_vfsgid(idmap, inode))); } /** * i_gid_update - update @inode's i_gid field * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Safely update @inode's i_gid field translating the vfsgid of any idmapped * mount into the filesystem kgid. */ static inline void i_gid_update(struct mnt_idmap *idmap, const struct iattr *attr, struct inode *inode) { if (attr->ia_valid & ATTR_GID) inode->i_gid = from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid); } /** * inode_fsuid_set - initialize inode's i_uid field with caller's fsuid * @inode: inode to initialize * @idmap: idmap of the mount the inode was found from * * Initialize the i_uid field of @inode. If the inode was found/created via * an idmapped mount map the caller's fsuid according to @idmap. */ static inline void inode_fsuid_set(struct inode *inode, struct mnt_idmap *idmap) { inode->i_uid = mapped_fsuid(idmap, i_user_ns(inode)); } /** * inode_fsgid_set - initialize inode's i_gid field with caller's fsgid * @inode: inode to initialize * @idmap: idmap of the mount the inode was found from * * Initialize the i_gid field of @inode. If the inode was found/created via * an idmapped mount map the caller's fsgid according to @idmap. */ static inline void inode_fsgid_set(struct inode *inode, struct mnt_idmap *idmap) { inode->i_gid = mapped_fsgid(idmap, i_user_ns(inode)); } /** * fsuidgid_has_mapping() - check whether caller's fsuid/fsgid is mapped * @sb: the superblock we want a mapping in * @idmap: idmap of the relevant mount * * Check whether the caller's fsuid and fsgid have a valid mapping in the * s_user_ns of the superblock @sb. If the caller is on an idmapped mount map * the caller's fsuid and fsgid according to the @idmap first.
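 *
 * Typical use is an early check in a create-style path, roughly (sketch
 * only; "dir" is the hypothetical parent inode supplied by the caller):
 *
 *	if (!fsuidgid_has_mapping(dir->i_sb, idmap))
 *		return -EOVERFLOW;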
* * Return: true if fsuid and fsgid are mapped, false if not. */ static inline bool fsuidgid_has_mapping(struct super_block *sb, struct mnt_idmap *idmap) { struct user_namespace *fs_userns = sb->s_user_ns; kuid_t kuid; kgid_t kgid; kuid = mapped_fsuid(idmap, fs_userns); if (!uid_valid(kuid)) return false; kgid = mapped_fsgid(idmap, fs_userns); if (!gid_valid(kgid)) return false; return kuid_has_mapping(fs_userns, kuid) && kgid_has_mapping(fs_userns, kgid); } struct timespec64 current_time(struct inode *inode); struct timespec64 inode_set_ctime_current(struct inode *inode); struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 update); static inline time64_t inode_get_atime_sec(const struct inode *inode) { return inode->i_atime_sec; } static inline long inode_get_atime_nsec(const struct inode *inode) { return inode->i_atime_nsec; } static inline struct timespec64 inode_get_atime(const struct inode *inode) { struct timespec64 ts = { .tv_sec = inode_get_atime_sec(inode), .tv_nsec = inode_get_atime_nsec(inode) }; return ts; } static inline struct timespec64 inode_set_atime_to_ts(struct inode *inode, struct timespec64 ts) { inode->i_atime_sec = ts.tv_sec; inode->i_atime_nsec = ts.tv_nsec; return ts; } static inline struct timespec64 inode_set_atime(struct inode *inode, time64_t sec, long nsec) { struct timespec64 ts = { .tv_sec = sec, .tv_nsec = nsec }; return inode_set_atime_to_ts(inode, ts); } static inline time64_t inode_get_mtime_sec(const struct inode *inode) { return inode->i_mtime_sec; } static inline long inode_get_mtime_nsec(const struct inode *inode) { return inode->i_mtime_nsec; } static inline struct timespec64 inode_get_mtime(const struct inode *inode) { struct timespec64 ts = { .tv_sec = inode_get_mtime_sec(inode), .tv_nsec = inode_get_mtime_nsec(inode) }; return ts; } static inline struct timespec64 inode_set_mtime_to_ts(struct inode *inode, struct timespec64 ts) { inode->i_mtime_sec = ts.tv_sec; inode->i_mtime_nsec = ts.tv_nsec; return ts; } static inline struct timespec64 inode_set_mtime(struct inode *inode, time64_t sec, long nsec) { struct timespec64 ts = { .tv_sec = sec, .tv_nsec = nsec }; return inode_set_mtime_to_ts(inode, ts); } /* * Multigrain timestamps * * Conditionally use fine-grained ctime and mtime timestamps when there * are users actively observing them via getattr. The primary use-case * for this is NFS clients that use the ctime to distinguish between * different states of the file, and that are often fooled by multiple * operations that occur in the same coarse-grained timer tick.
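 *
 * For reference, the usual way a filesystem stamps ctime/mtime after a
 * data-modifying operation is (illustrative sketch):
 *
 *	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 *	mark_inode_dirty(inode);
 *
 * With multigrain timestamps, inode_set_ctime_current() only resorts to a
 * fine-grained value when the current ctime has been queried (see
 * I_CTIME_QUERIED below).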
*/ #define I_CTIME_QUERIED ((u32)BIT(31)) static inline time64_t inode_get_ctime_sec(const struct inode *inode) { return inode->i_ctime_sec; } static inline long inode_get_ctime_nsec(const struct inode *inode) { return inode->i_ctime_nsec & ~I_CTIME_QUERIED; } static inline struct timespec64 inode_get_ctime(const struct inode *inode) { struct timespec64 ts = { .tv_sec = inode_get_ctime_sec(inode), .tv_nsec = inode_get_ctime_nsec(inode) }; return ts; } struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts); /** * inode_set_ctime - set the ctime in the inode * @inode: inode in which to set the ctime * @sec: tv_sec value to set * @nsec: tv_nsec value to set * * Set the ctime in @inode to { @sec, @nsec } */ static inline struct timespec64 inode_set_ctime(struct inode *inode, time64_t sec, long nsec) { struct timespec64 ts = { .tv_sec = sec, .tv_nsec = nsec }; return inode_set_ctime_to_ts(inode, ts); } struct timespec64 simple_inode_init_ts(struct inode *inode); static inline int inode_time_dirty_flag(struct inode *inode) { if (inode->i_sb->s_flags & SB_LAZYTIME) return I_DIRTY_TIME; return I_DIRTY_SYNC; } /* * Snapshotting support. */ /** * file_write_started - check if SB_FREEZE_WRITE is held * @file: the file we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. * May be false positive with !S_ISREG, because file_start_write() has * no effect on !S_ISREG. */ static inline bool file_write_started(const struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return true; return sb_write_started(file_inode(file)->i_sb); } /** * file_write_not_started - check if SB_FREEZE_WRITE is not held * @file: the file we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. * May be false positive with !S_ISREG, because file_start_write() has * no effect on !S_ISREG. */ static inline bool file_write_not_started(const struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return true; return sb_write_not_started(file_inode(file)->i_sb); } bool inode_owner_or_capable(struct mnt_idmap *idmap, const struct inode *inode); /* * VFS helper functions.. */ int vfs_create(struct mnt_idmap *, struct dentry *, umode_t, struct delegated_inode *); struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, struct delegated_inode *); int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t, struct delegated_inode *); int vfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *, const char *, struct delegated_inode *); int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *, struct dentry *, struct delegated_inode *); int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *, struct delegated_inode *); int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *, struct delegated_inode *); /** * struct renamedata - contains all information required for renaming * @mnt_idmap: idmap of the mount in which the rename is happening. 
* @old_parent: parent of source * @old_dentry: source * @new_parent: parent of destination * @new_dentry: destination * @delegated_inode: returns an inode needing a delegation break * @flags: rename flags */ struct renamedata { struct mnt_idmap *mnt_idmap; struct dentry *old_parent; struct dentry *old_dentry; struct dentry *new_parent; struct dentry *new_dentry; struct delegated_inode *delegated_inode; unsigned int flags; } __randomize_layout; int vfs_rename(struct renamedata *); static inline int vfs_whiteout(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry) { return vfs_mknod(idmap, dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV, NULL); } struct file *kernel_tmpfile_open(struct mnt_idmap *idmap, const struct path *parentpath, umode_t mode, int open_flag, const struct cred *cred); struct file *kernel_file_open(const struct path *path, int flags, const struct cred *cred); int vfs_mkobj(struct dentry *, umode_t, int (*f)(struct dentry *, umode_t, void *), void *); int vfs_fchown(struct file *file, uid_t user, gid_t group); int vfs_fchmod(struct file *file, umode_t mode); int vfs_utimes(const struct path *path, struct timespec64 *times); #ifdef CONFIG_COMPAT extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #else #define compat_ptr_ioctl NULL #endif /* * VFS file helper functions. */ void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode, const struct inode *dir, umode_t mode); extern bool may_open_dev(const struct path *path); umode_t mode_strip_sgid(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode); bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid); /* * This is the "filldir" function type, used by readdir() to let * the kernel specify what kind of dirent layout it wants to have. * This allows the kernel to read directories into kernel space or * to have different dirent layouts depending on the binary type. * Return 'true' to keep going and 'false' if there are no more entries. */ struct dir_context; typedef bool (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64, unsigned); struct dir_context { filldir_t actor; loff_t pos; /* * Filesystems MUST NOT MODIFY count, but may use as a hint: * 0 unknown * > 0 space in buffer (assume at least one entry) * INT_MAX unlimited */ int count; /* @actor supports these flags in d_type high bits */ unsigned int dt_flags_mask; }; /* If OR-ed with d_type, pending signals are not checked */ #define FILLDIR_FLAG_NOINTR 0x1000 /* * These flags let !MMU mmap() govern direct device mapping vs immediate * copying more easily for MAP_PRIVATE, especially for ROM filesystems. * * NOMMU_MAP_COPY: Copy can be mapped (MAP_PRIVATE) * NOMMU_MAP_DIRECT: Can be mapped directly (MAP_SHARED) * NOMMU_MAP_READ: Can be mapped for reading * NOMMU_MAP_WRITE: Can be mapped for writing * NOMMU_MAP_EXEC: Can be mapped for execution */ #define NOMMU_MAP_COPY 0x00000001 #define NOMMU_MAP_DIRECT 0x00000008 #define NOMMU_MAP_READ VM_MAYREAD #define NOMMU_MAP_WRITE VM_MAYWRITE #define NOMMU_MAP_EXEC VM_MAYEXEC #define NOMMU_VMFLAGS \ (NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC) /* * These flags control the behavior of the remap_file_range function pointer. * If it is called with len == 0 that means "remap to end of source file". * See Documentation/filesystems/vfs.rst for more details about this call. * * REMAP_FILE_DEDUP: only remap if contents identical (i.e. 
deduplicate) * REMAP_FILE_CAN_SHORTEN: caller can handle a shortened request */ #define REMAP_FILE_DEDUP (1 << 0) #define REMAP_FILE_CAN_SHORTEN (1 << 1) /* * These flags signal that the caller is ok with altering various aspects of * the behavior of the remap operation. The changes must be made by the * implementation; the vfs remap helper functions can take advantage of them. * Flags in this category exist to preserve the quirky behavior of the hoisted * btrfs clone/dedupe ioctls. */ #define REMAP_FILE_ADVISORY (REMAP_FILE_CAN_SHORTEN) /* * These flags control the behavior of vfs_copy_file_range(). * They are not available to the user via syscall. * * COPY_FILE_SPLICE: call splice direct instead of fs clone/copy ops */ #define COPY_FILE_SPLICE (1 << 0) struct io_uring_cmd; struct offset_ctx; typedef unsigned int __bitwise fop_flags_t; struct file_operations { struct module *owner; fop_flags_t fop_flags; loff_t (*llseek) (struct file *, loff_t, int); ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); int (*iopoll)(struct kiocb *kiocb, struct io_comp_batch *, unsigned int flags); int (*iterate_shared) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); int (*fsync) (struct file *, loff_t, loff_t, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); int (*flock) (struct file *, int, struct file_lock *); ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); void (*splice_eof)(struct file *file); int (*setlease)(struct file *, int, struct file_lease **, void **); long (*fallocate)(struct file *file, int mode, loff_t offset, loff_t len); void (*show_fdinfo)(struct seq_file *m, struct file *f); #ifndef CONFIG_MMU unsigned (*mmap_capabilities)(struct file *); #endif ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags); int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *, unsigned int poll_flags); int (*mmap_prepare)(struct vm_area_desc *); } __randomize_layout; /* Supports async buffered reads */ #define FOP_BUFFER_RASYNC ((__force fop_flags_t)(1 << 0)) /* Supports async buffered writes */ #define FOP_BUFFER_WASYNC ((__force fop_flags_t)(1 << 1)) /* Supports synchronous page faults for mappings */ #define FOP_MMAP_SYNC ((__force fop_flags_t)(1 << 2)) /* Supports non-exclusive O_DIRECT writes from multiple threads */ #define FOP_DIO_PARALLEL_WRITE ((__force fop_flags_t)(1 << 
3)) /* Contains huge pages */ #define FOP_HUGE_PAGES ((__force fop_flags_t)(1 << 4)) /* Treat loff_t as unsigned (e.g., /dev/mem) */ #define FOP_UNSIGNED_OFFSET ((__force fop_flags_t)(1 << 5)) /* Supports asynchronous lock callbacks */ #define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6)) /* File system supports uncached read/write buffered IO */ #define FOP_DONTCACHE ((__force fop_flags_t)(1 << 7)) /* Wrap a directory iterator that needs exclusive inode access */ int wrap_directory_iterator(struct file *, struct dir_context *, int (*) (struct file *, struct dir_context *)); #define WRAP_DIR_ITER(x) \ static int shared_##x(struct file *file , struct dir_context *ctx) \ { return wrap_directory_iterator(file, ctx, x); } enum fs_update_time { FS_UPD_ATIME, FS_UPD_CMTIME, }; struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *); int (*permission) (struct mnt_idmap *, struct inode *, int); struct posix_acl * (*get_inode_acl)(struct inode *, int, bool); int (*readlink) (struct dentry *, char __user *,int); int (*create) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t, bool); int (*link) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); int (*symlink) (struct mnt_idmap *, struct inode *,struct dentry *, const char *); struct dentry *(*mkdir) (struct mnt_idmap *, struct inode *, struct dentry *, umode_t); int (*rmdir) (struct inode *,struct dentry *); int (*mknod) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t,dev_t); int (*rename) (struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); int (*setattr) (struct mnt_idmap *, struct dentry *, struct iattr *); int (*getattr) (struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); int (*update_time)(struct inode *inode, enum fs_update_time type, unsigned int flags); void (*sync_lazytime)(struct inode *inode); int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode); int (*tmpfile) (struct mnt_idmap *, struct inode *, struct file *, umode_t); struct posix_acl *(*get_acl)(struct mnt_idmap *, struct dentry *, int); int (*set_acl)(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); int (*fileattr_set)(struct mnt_idmap *idmap, struct dentry *dentry, struct file_kattr *fa); int (*fileattr_get)(struct dentry *dentry, struct file_kattr *fa); struct offset_ctx *(*get_offset_ctx)(struct inode *inode); } ____cacheline_aligned; /* Did the driver provide valid mmap hook configuration? */ static inline bool can_mmap_file(struct file *file) { bool has_mmap = file->f_op->mmap; bool has_mmap_prepare = file->f_op->mmap_prepare; /* Hooks are mutually exclusive. 
*/ if (WARN_ON_ONCE(has_mmap && has_mmap_prepare)) return false; if (!has_mmap && !has_mmap_prepare) return false; return true; } void compat_set_desc_from_vma(struct vm_area_desc *desc, const struct file *file, const struct vm_area_struct *vma); int __compat_vma_mmap(struct vm_area_desc *desc, struct vm_area_struct *vma); int compat_vma_mmap(struct file *file, struct vm_area_struct *vma); static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) { if (file->f_op->mmap_prepare) return compat_vma_mmap(file, vma); return file->f_op->mmap(file, vma); } static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc) { return file->f_op->mmap_prepare(desc); } extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, loff_t, size_t, unsigned int); int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write); int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags, const struct iomap_ops *dax_read_ops); int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *count, unsigned int remap_flags); extern loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); extern int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same); extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, loff_t len, unsigned int remap_flags); /* * Inode flags - they have no relation to superblock flags now */ #define S_SYNC (1 << 0) /* Writes are synced at once */ #define S_NOATIME (1 << 1) /* Do not update access times */ #define S_APPEND (1 << 2) /* Append-only file */ #define S_IMMUTABLE (1 << 3) /* Immutable file */ #define S_DEAD (1 << 4) /* removed, but still open directory */ #define S_NOQUOTA (1 << 5) /* Inode is not counted to quota */ #define S_DIRSYNC (1 << 6) /* Directory modifications are synchronous */ #define S_NOCMTIME (1 << 7) /* Do not update file c/mtime */ #define S_SWAPFILE (1 << 8) /* Do not truncate: swapon got its bmaps */ #define S_PRIVATE (1 << 9) /* Inode is fs-internal */ #define S_IMA (1 << 10) /* Inode has an associated IMA struct */ #define S_AUTOMOUNT (1 << 11) /* Automount/referral quasi-directory */ #define S_NOSEC (1 << 12) /* no suid or xattr security attributes */ #ifdef CONFIG_FS_DAX #define S_DAX (1 << 13) /* Direct Access, avoiding the page cache */ #else #define S_DAX 0 /* Make all the DAX code disappear */ #endif #define S_ENCRYPTED (1 << 14) /* Encrypted file (using fs/crypto/) */ #define S_CASEFOLD (1 << 15) /* Casefolded file */ #define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */ #define S_KERNEL_FILE (1 << 17) /* File is in use by the kernel (eg. fs/cachefiles) */ #define S_ANON_INODE (1 << 19) /* Inode is an anonymous inode */ /* * Note that nosuid etc flags are inode-specific: setting some file-system * flags just means all the inodes inherit those flags by default. It might be * possible to override it selectively if you really wanted to with some * ioctl() that is not currently implemented. * * Exception: SB_RDONLY is always applied to the entire file system. 
* * Unfortunately, it is possible to change a filesystem's flags while it is mounted * and has files in use. This means that not all of the inodes will have their * i_flags updated. Hence, i_flags no longer inherit the superblock mount * flags, so these have to be checked separately. -- rmk@arm.uk.linux.org */ #define __IS_FLG(inode, flg) ((inode)->i_sb->s_flags & (flg)) #define IS_RDONLY(inode) sb_rdonly((inode)->i_sb) #define IS_SYNC(inode) (__IS_FLG(inode, SB_SYNCHRONOUS) || \ ((inode)->i_flags & S_SYNC)) #define IS_DIRSYNC(inode) (__IS_FLG(inode, SB_SYNCHRONOUS|SB_DIRSYNC) || \ ((inode)->i_flags & (S_SYNC|S_DIRSYNC))) #define IS_MANDLOCK(inode) __IS_FLG(inode, SB_MANDLOCK) #define IS_NOATIME(inode) __IS_FLG(inode, SB_RDONLY|SB_NOATIME) #define IS_I_VERSION(inode) __IS_FLG(inode, SB_I_VERSION) #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) #define IS_APPEND(inode) ((inode)->i_flags & S_APPEND) #define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) #ifdef CONFIG_FS_POSIX_ACL #define IS_POSIXACL(inode) __IS_FLG(inode, SB_POSIXACL) #else #define IS_POSIXACL(inode) 0 #endif #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) #define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME) #ifdef CONFIG_SWAP #define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE) #else #define IS_SWAPFILE(inode) ((void)(inode), 0U) #endif #define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE) #define IS_IMA(inode) ((inode)->i_flags & S_IMA) #define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT) #define IS_NOSEC(inode) ((inode)->i_flags & S_NOSEC) #define IS_DAX(inode) ((inode)->i_flags & S_DAX) #define IS_ENCRYPTED(inode) ((inode)->i_flags & S_ENCRYPTED) #define IS_CASEFOLDED(inode) ((inode)->i_flags & S_CASEFOLD) #define IS_VERITY(inode) ((inode)->i_flags & S_VERITY) #define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \ (inode)->i_rdev == WHITEOUT_DEV) #define IS_ANON_FILE(inode) ((inode)->i_flags & S_ANON_INODE) static inline bool HAS_UNMAPPED_ID(struct mnt_idmap *idmap, struct inode *inode) { return !vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || !vfsgid_valid(i_gid_into_vfsgid(idmap, inode)); } static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) { *kiocb = (struct kiocb) { .ki_filp = filp, .ki_flags = filp->f_iocb_flags, .ki_ioprio = get_current_ioprio(), }; } static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, struct file *filp) { *kiocb = (struct kiocb) { .ki_filp = filp, .ki_flags = kiocb_src->ki_flags, .ki_ioprio = kiocb_src->ki_ioprio, .ki_pos = kiocb_src->ki_pos, }; } extern void __mark_inode_dirty(struct inode *, int); static inline void mark_inode_dirty(struct inode *inode) { __mark_inode_dirty(inode, I_DIRTY); } static inline void mark_inode_dirty_sync(struct inode *inode) { __mark_inode_dirty(inode, I_DIRTY_SYNC); } static inline int icount_read(const struct inode *inode) { return atomic_read(&inode->i_count); } /* * Returns true if the given inode itself only has dirty timestamps (its pages * may still be dirty) and isn't currently being allocated or freed. * Filesystems should call this when writing an inode, if lazytime is * enabled and they want to opportunistically write the timestamps of other inodes * located very nearby on-disk, e.g. in the same inode block. This returns true * if the given inode is in need of such an opportunistic update. Requires * i_lock, or at least later re-checking under i_lock.
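 *
 * A rough sketch of the intended calling pattern, where "other" is a nearby
 * inode the filesystem is considering writing opportunistically and
 * "include_in_write" is a hypothetical local flag of the caller:
 *
 *	spin_lock(&other->i_lock);
 *	if (inode_is_dirtytime_only(other))
 *		include_in_write = true;
 *	spin_unlock(&other->i_lock);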
*/ static inline bool inode_is_dirtytime_only(struct inode *inode) { return (inode_state_read_once(inode) & (I_DIRTY_TIME | I_NEW | I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME; } extern void inc_nlink(struct inode *inode); extern void drop_nlink(struct inode *inode); extern void clear_nlink(struct inode *inode); extern void set_nlink(struct inode *inode, unsigned int nlink); static inline void inode_inc_link_count(struct inode *inode) { inc_nlink(inode); mark_inode_dirty(inode); } static inline void inode_dec_link_count(struct inode *inode) { drop_nlink(inode); mark_inode_dirty(inode); } extern bool atime_needs_update(const struct path *, struct inode *); extern void touch_atime(const struct path *); static inline void file_accessed(struct file *file) { if (!(file->f_flags & O_NOATIME)) touch_atime(&file->f_path); } extern int file_modified(struct file *file); int kiocb_modified(struct kiocb *iocb); int sync_inode_metadata(struct inode *inode, int wait); struct file_system_type { const char *name; int fs_flags; #define FS_REQUIRES_DEV 1 #define FS_BINARY_MOUNTDATA 2 #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ #define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */ #define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */ #define FS_MGTIME 64 /* FS uses multigrain timestamps */ #define FS_LBS 128 /* FS supports LBS */ #define FS_POWER_FREEZE 256 /* Always freeze on suspend/hibernate */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; void (*kill_sb) (struct super_block *); struct module *owner; struct file_system_type * next; struct hlist_head fs_supers; struct lock_class_key s_lock_key; struct lock_class_key s_umount_key; struct lock_class_key s_vfs_rename_key; struct lock_class_key s_writers_key[SB_FREEZE_LEVELS]; struct lock_class_key i_lock_key; struct lock_class_key i_mutex_key; struct lock_class_key invalidate_lock_key; struct lock_class_key i_mutex_dir_key; }; #define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME) /** * is_mgtime: is this inode using multigrain timestamps * @inode: inode to test for multigrain timestamps * * Return true if the inode uses multigrain timestamps, false otherwise. */ static inline bool is_mgtime(const struct inode *inode) { return inode->i_opflags & IOP_MGTIME; } extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path); void retire_super(struct super_block *sb); void generic_shutdown_super(struct super_block *sb); void kill_block_super(struct super_block *sb); void kill_anon_super(struct super_block *sb); void deactivate_super(struct super_block *sb); void deactivate_locked_super(struct super_block *sb); int set_anon_super(struct super_block *s, void *data); int set_anon_super_fc(struct super_block *s, struct fs_context *fc); int get_anon_bdev(dev_t *); void free_anon_bdev(dev_t); struct super_block *sget_fc(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*set)(struct super_block *, struct fs_context *)); struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), int flags, void *data); struct super_block *sget_dev(struct fs_context *fc, dev_t dev); /* Alas, no aliases. 
Too much hassle with bringing module.h everywhere */ #define fops_get(fops) ({ \ const struct file_operations *_fops = (fops); \ (((_fops) && try_module_get((_fops)->owner) ? (_fops) : NULL)); \ }) #define fops_put(fops) ({ \ const struct file_operations *_fops = (fops); \ if (_fops) \ module_put((_fops)->owner); \ }) /* * This one is to be used *ONLY* from ->open() instances. * fops must be non-NULL, pinned down *and* module dependencies * should be sufficient to pin the caller down as well. */ #define replace_fops(f, fops) \ do { \ struct file *__file = (f); \ fops_put(__file->f_op); \ BUG_ON(!(__file->f_op = (fops))); \ } while(0) extern int register_filesystem(struct file_system_type *); extern int unregister_filesystem(struct file_system_type *); extern int vfs_statfs(const struct path *, struct kstatfs *); extern int user_statfs(const char __user *, struct kstatfs *); extern int fd_statfs(int, struct kstatfs *); extern __printf(2, 3) int super_setup_bdi_name(struct super_block *sb, char *fmt, ...); extern int super_setup_bdi(struct super_block *sb); static inline void super_set_uuid(struct super_block *sb, const u8 *uuid, unsigned len) { if (WARN_ON(len > sizeof(sb->s_uuid))) len = sizeof(sb->s_uuid); sb->s_uuid_len = len; memcpy(&sb->s_uuid, uuid, len); } /* set sb sysfs name based on sb->s_bdev */ static inline void super_set_sysfs_name_bdev(struct super_block *sb) { snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pg", sb->s_bdev); } /* set sb sysfs name based on sb->s_uuid */ static inline void super_set_sysfs_name_uuid(struct super_block *sb) { WARN_ON(sb->s_uuid_len != sizeof(sb->s_uuid)); snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pU", sb->s_uuid.b); } /* set sb sysfs name based on sb->s_id */ static inline void super_set_sysfs_name_id(struct super_block *sb) { strscpy(sb->s_sysfs_name, sb->s_id, sizeof(sb->s_sysfs_name)); } /* try to use something standard before you use this */ __printf(2, 3) static inline void super_set_sysfs_name_generic(struct super_block *sb, const char *fmt, ...) { va_list args; va_start(args, fmt); vsnprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), fmt, args); va_end(args); } extern void ihold(struct inode * inode); extern void iput(struct inode *); void iput_not_last(struct inode *); int inode_update_time(struct inode *inode, enum fs_update_time type, unsigned int flags); int generic_update_time(struct inode *inode, enum fs_update_time type, unsigned int flags); /* /sys/fs */ extern struct kobject *fs_kobj; #define MAX_RW_COUNT (INT_MAX & PAGE_MASK) /* fs/open.c */ struct audit_names; struct __filename_head { const char *name; /* pointer to actual string */ int refcnt; struct audit_names *aname; }; #define EMBEDDED_NAME_MAX (192 - sizeof(struct __filename_head)) struct filename { struct __filename_head; const char iname[EMBEDDED_NAME_MAX]; }; static_assert(offsetof(struct filename, iname) % sizeof(long) == 0); static_assert(sizeof(struct filename) % 64 == 0); static inline struct mnt_idmap *file_mnt_idmap(const struct file *file) { return mnt_idmap(file->f_path.mnt); } /** * is_idmapped_mnt - check whether a mount is mapped * @mnt: the mount to check * * If @mnt has an non @nop_mnt_idmap attached to it then @mnt is mapped. * * Return: true if mount is mapped, false if not. 
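 *
 * Callers rarely need to test this directly; passing the mount's idmap to
 * the vfsuid/vfsgid helpers covers both the mapped and the unmapped case.
 * Illustrative sketch:
 *
 *	struct mnt_idmap *idmap = file_mnt_idmap(file);
 *	vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, file_inode(file));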
*/ static inline bool is_idmapped_mnt(const struct vfsmount *mnt) { return mnt_idmap(mnt) != &nop_mnt_idmap; } int vfs_truncate(const struct path *, loff_t); int do_truncate(struct mnt_idmap *, struct dentry *, loff_t start, unsigned int time_attrs, struct file *filp); extern int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len); int do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode); extern struct file *file_open_name(struct filename *, int, umode_t); extern struct file *filp_open(const char *, int, umode_t); extern struct file *file_open_root(const struct path *, const char *, int, umode_t); static inline struct file *file_open_root_mnt(struct vfsmount *mnt, const char *name, int flags, umode_t mode) { return file_open_root(&(struct path){.mnt = mnt, .dentry = mnt->mnt_root}, name, flags, mode); } struct file *dentry_open(const struct path *path, int flags, const struct cred *creds); struct file *dentry_open_nonotify(const struct path *path, int flags, const struct cred *cred); struct file *dentry_create(struct path *path, int flags, umode_t mode, const struct cred *cred); const struct path *backing_file_user_path(const struct file *f); #ifdef CONFIG_SECURITY void *backing_file_security(const struct file *f); void backing_file_set_security(struct file *f, void *security); #else static inline void *backing_file_security(const struct file *f) { return NULL; } static inline void backing_file_set_security(struct file *f, void *security) { } #endif /* CONFIG_SECURITY */ /* * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file * stored in ->vm_file is a backing file whose f_inode is on the underlying * filesystem. When the mapped file path and inode number are displayed to * user (e.g. via /proc/<pid>/maps), these helpers should be used to get the * path and inode number to display to the user, which is the path of the fd * that user has requested to map and the inode number that would be returned * by fstat() on that same fd. 
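 *
 * Illustrative sketch ("vma" is assumed to come from the caller's context,
 * it is not provided by this header):
 *
 *	struct file *f = vma->vm_file;
 *	const struct path *path = file_user_path(f);
 *	ino_t ino = file_user_inode(f)->i_ino;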
*/ /* Get the path to display in /proc/<pid>/maps */ static inline const struct path *file_user_path(const struct file *f) { if (unlikely(f->f_mode & FMODE_BACKING)) return backing_file_user_path(f); return &f->f_path; } /* Get the inode whose inode number to display in /proc/<pid>/maps */ static inline const struct inode *file_user_inode(const struct file *f) { if (unlikely(f->f_mode & FMODE_BACKING)) return d_inode(backing_file_user_path(f)->dentry); return file_inode(f); } static inline struct file *file_clone_open(struct file *file) { return dentry_open(&file->f_path, file->f_flags, file->f_cred); } extern int filp_close(struct file *, fl_owner_t id); extern struct filename *getname_flags(const char __user *, int); extern struct filename *getname_uflags(const char __user *, int); static inline struct filename *getname(const char __user *name) { return getname_flags(name, 0); } extern struct filename *getname_kernel(const char *); extern struct filename *__getname_maybe_null(const char __user *); static inline struct filename *getname_maybe_null(const char __user *name, int flags) { if (!(flags & AT_EMPTY_PATH)) return getname(name); if (!name) return NULL; return __getname_maybe_null(name); } extern void putname(struct filename *name); DEFINE_FREE(putname, struct filename *, if (!IS_ERR_OR_NULL(_T)) putname(_T)) struct delayed_filename { struct filename *__incomplete_filename; // don't touch }; #define INIT_DELAYED_FILENAME(ptr) \ ((void)(*(ptr) = (struct delayed_filename){})) int delayed_getname(struct delayed_filename *, const char __user *); int delayed_getname_uflags(struct delayed_filename *v, const char __user *, int); void dismiss_delayed_filename(struct delayed_filename *); int putname_to_delayed(struct delayed_filename *, struct filename *); struct filename *complete_getname(struct delayed_filename *); DEFINE_CLASS(filename, struct filename *, putname(_T), getname(p), const char __user *p) EXTEND_CLASS(filename, _kernel, getname_kernel(p), const char *p) EXTEND_CLASS(filename, _flags, getname_flags(p, f), const char __user *p, unsigned int f) EXTEND_CLASS(filename, _uflags, getname_uflags(p, f), const char __user *p, unsigned int f) EXTEND_CLASS(filename, _maybe_null, getname_maybe_null(p, f), const char __user *p, unsigned int f) EXTEND_CLASS(filename, _complete_delayed, complete_getname(p), struct delayed_filename *p) extern int finish_open(struct file *file, struct dentry *dentry, int (*open)(struct inode *, struct file *)); extern int finish_no_open(struct file *file, struct dentry *dentry); /* Helper for the simple case when original dentry is used */ static inline int finish_open_simple(struct file *file, int error) { if (error) return error; return finish_open(file, file->f_path.dentry, NULL); } /* fs/dcache.c */ extern void __init vfs_caches_init_early(void); extern void __init vfs_caches_init(void); #define __getname() kmalloc(PATH_MAX, GFP_KERNEL) #define __putname(name) kfree(name) void emergency_thaw_all(void); extern int sync_filesystem(struct super_block *); extern const struct file_operations def_blk_fops; extern const struct file_operations def_chr_fops; /* fs/char_dev.c */ #define CHRDEV_MAJOR_MAX 512 /* Marks the bottom of the first segment of free char majors */ #define CHRDEV_MAJOR_DYN_END 234 /* Marks the top and bottom of the second segment of free char majors */ #define CHRDEV_MAJOR_DYN_EXT_START 511 #define CHRDEV_MAJOR_DYN_EXT_END 384 extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *); extern int register_chrdev_region(dev_t, 
unsigned, const char *); extern int __register_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name, const struct file_operations *fops); extern void __unregister_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name); extern void unregister_chrdev_region(dev_t, unsigned); extern void chrdev_show(struct seq_file *,off_t); static inline int register_chrdev(unsigned int major, const char *name, const struct file_operations *fops) { return __register_chrdev(major, 0, 256, name, fops); } static inline void unregister_chrdev(unsigned int major, const char *name) { __unregister_chrdev(major, 0, 256, name); } extern void init_special_inode(struct inode *, umode_t, dev_t); /* Invalid inode operations -- fs/bad_inode.c */ extern void make_bad_inode(struct inode *); extern bool is_bad_inode(struct inode *); extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, loff_t lend); extern int __must_check file_check_and_advance_wb_err(struct file *file); extern int __must_check file_write_and_wait_range(struct file *file, loff_t start, loff_t end); int filemap_flush_range(struct address_space *mapping, loff_t start, loff_t end); static inline int file_write_and_wait(struct file *file) { return file_write_and_wait_range(file, 0, LLONG_MAX); } extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, int datasync); extern int sync_file_range(struct file *file, loff_t offset, loff_t nbytes, unsigned int flags); static inline bool iocb_is_dsync(const struct kiocb *iocb) { return (iocb->ki_flags & IOCB_DSYNC) || IS_SYNC(iocb->ki_filp->f_mapping->host); } /* * Sync the bytes written if this was a synchronous write. Expect ki_pos * to already be updated for the write, and will return either the amount * of bytes passed in, or an error if syncing the file failed. */ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) { if (iocb_is_dsync(iocb)) { int ret = vfs_fsync_range(iocb->ki_filp, iocb->ki_pos - count, iocb->ki_pos - 1, (iocb->ki_flags & IOCB_SYNC) ? 
0 : 1); if (ret) return ret; } else if (iocb->ki_flags & IOCB_DONTCACHE) { struct address_space *mapping = iocb->ki_filp->f_mapping; filemap_flush_range(mapping, iocb->ki_pos - count, iocb->ki_pos - 1); } return count; } extern void emergency_sync(void); extern void emergency_remount(void); #ifdef CONFIG_BLOCK extern int bmap(struct inode *inode, sector_t *block); #else static inline int bmap(struct inode *inode, sector_t *block) { return -EINVAL; } #endif int notify_change(struct mnt_idmap *, struct dentry *, struct iattr *, struct delegated_inode *); int inode_permission(struct mnt_idmap *, struct inode *, int); int generic_permission(struct mnt_idmap *, struct inode *, int); static inline int file_permission(struct file *file, int mask) { return inode_permission(file_mnt_idmap(file), file_inode(file), mask); } static inline int path_permission(const struct path *path, int mask) { return inode_permission(mnt_idmap(path->mnt), d_inode(path->dentry), mask); } int __check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode); int may_delete_dentry(struct mnt_idmap *idmap, struct inode *dir, struct dentry *victim, bool isdir); int may_create_dentry(struct mnt_idmap *idmap, struct inode *dir, struct dentry *child); static inline bool execute_ok(struct inode *inode) { return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode); } static inline bool inode_wrong_type(const struct inode *inode, umode_t mode) { return (inode->i_mode ^ mode) & S_IFMT; } /** * file_start_write - get write access to a superblock for regular file io * @file: the file we want to write to * * This is a variant of sb_start_write() which is a noop on non-regular files. * Should be matched with a call to file_end_write(). */ static inline void file_start_write(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return; sb_start_write(file_inode(file)->i_sb); } static inline bool file_start_write_trylock(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return true; return sb_start_write_trylock(file_inode(file)->i_sb); } /** * file_end_write - drop write access to a superblock of a regular file * @file: the file we wrote to * * Should be matched with a call to file_start_write(). */ static inline void file_end_write(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return; sb_end_write(file_inode(file)->i_sb); } /** * kiocb_start_write - get write access to a superblock for async file io * @iocb: the io context we want to submit the write with * * This is a variant of sb_start_write() for async io submission. * Should be matched with a call to kiocb_end_write(). */ static inline void kiocb_start_write(struct kiocb *iocb) { struct inode *inode = file_inode(iocb->ki_filp); sb_start_write(inode->i_sb); /* * Fool lockdep by telling it the lock got released so that it * doesn't complain about the held lock when we return to userspace. */ __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE); } /** * kiocb_end_write - drop write access to a superblock after async file io * @iocb: the io context we submitted the write with * * Should be matched with a call to kiocb_start_write(). */ static inline void kiocb_end_write(struct kiocb *iocb) { struct inode *inode = file_inode(iocb->ki_filp); /* * Tell lockdep we inherited freeze protection from submission thread.
*/ __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); sb_end_write(inode->i_sb); } /* * This is used for regular files where some users -- especially the * currently executed binary in a process, previously handled via * VM_DENYWRITE -- cannot handle concurrent write (and maybe mmap * read-write shared) accesses. * * get_write_access() gets write permission for a file. * put_write_access() releases this write permission. * deny_write_access() denies write access to a file. * allow_write_access() re-enables write access to a file. * * The i_writecount field of an inode can have the following values: * 0: no write access, no denied write access * < 0: (-i_writecount) users that denied write access to the file. * > 0: (i_writecount) users that have write access to the file. * * Normally we operate on that counter with atomic_{inc,dec} and it's safe * except for the cases where we don't hold i_writecount yet. Then we need to * use {get,deny}_write_access() - these functions check the sign and refuse * to do the change if sign is wrong. */ static inline int get_write_access(struct inode *inode) { return atomic_inc_unless_negative(&inode->i_writecount) ? 0 : -ETXTBSY; } static inline int deny_write_access(struct file *file) { struct inode *inode = file_inode(file); return atomic_dec_unless_positive(&inode->i_writecount) ? 0 : -ETXTBSY; } static inline void put_write_access(struct inode * inode) { atomic_dec(&inode->i_writecount); } static inline void allow_write_access(struct file *file) { if (file) atomic_inc(&file_inode(file)->i_writecount); } /* * Do not prevent write to executable file when watched by pre-content events. * * Note that FMODE_FSNOTIFY_HSM mode is set depending on pre-content watches at * the time of file open and remains constant for entire lifetime of the file, * so if pre-content watches are added post execution or removed before the end * of the execution, it will not cause i_writecount reference leak. 
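 *
 * The pairing is the usual deny/allow pattern; a sketch of an exec-like
 * caller (not a definitive sequence):
 *
 *	err = exe_file_deny_write_access(exe_file);
 *	if (err)
 *		return err;
 *	...
 *	exe_file_allow_write_access(exe_file);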
*/ static inline int exe_file_deny_write_access(struct file *exe_file) { if (unlikely(FMODE_FSNOTIFY_HSM(exe_file->f_mode))) return 0; return deny_write_access(exe_file); } static inline void exe_file_allow_write_access(struct file *exe_file) { if (unlikely(!exe_file || FMODE_FSNOTIFY_HSM(exe_file->f_mode))) return; allow_write_access(exe_file); } static inline void file_set_fsnotify_mode(struct file *file, fmode_t mode) { file->f_mode &= ~FMODE_FSNOTIFY_MASK; file->f_mode |= mode; } static inline bool inode_is_open_for_write(const struct inode *inode) { return atomic_read(&inode->i_writecount) > 0; } #if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING) static inline void i_readcount_dec(struct inode *inode) { BUG_ON(atomic_dec_return(&inode->i_readcount) < 0); } static inline void i_readcount_inc(struct inode *inode) { atomic_inc(&inode->i_readcount); } #else static inline void i_readcount_dec(struct inode *inode) { return; } static inline void i_readcount_inc(struct inode *inode) { return; } #endif extern int do_pipe_flags(int *, int); extern ssize_t kernel_read(struct file *, void *, size_t, loff_t *); ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos); extern ssize_t kernel_write(struct file *, const void *, size_t, loff_t *); extern ssize_t __kernel_write(struct file *, const void *, size_t, loff_t *); extern struct file * open_exec(const char *); /* fs/dcache.c -- generic fs support functions */ extern bool is_subdir(struct dentry *, struct dentry *); extern bool path_is_under(const struct path *, const struct path *); u64 vfsmount_to_propagation_flags(struct vfsmount *mnt); extern char *file_path(struct file *, char *, int); static inline bool name_is_dot(const char *name, size_t len) { return unlikely(len == 1 && name[0] == '.'); } static inline bool name_is_dotdot(const char *name, size_t len) { return unlikely(len == 2 && name[0] == '.' && name[1] == '.'); } /** * name_is_dot_dotdot - returns true only if @name is "." or ".." * @name: file name to check * @len: length of file name, in bytes */ static inline bool name_is_dot_dotdot(const char *name, size_t len) { return len && unlikely(name[0] == '.') && (len == 1 || (len == 2 && name[1] == '.')); } /** * name_contains_dotdot - check if a file name contains ".." path components * @name: File path string to check * Search for ".." surrounded by either '/' or start/end of string. 
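 *
 * For example (behaviour follows directly from the checks in the function
 * body):
 *
 *	name_contains_dotdot("..")      - true
 *	name_contains_dotdot("../x")    - true
 *	name_contains_dotdot("a/../b")  - true
 *	name_contains_dotdot("a/..")    - true
 *	name_contains_dotdot("a..b/c")  - false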
*/ static inline bool name_contains_dotdot(const char *name) { size_t name_len; name_len = strlen(name); return strcmp(name, "..") == 0 || strncmp(name, "../", 3) == 0 || strstr(name, "/../") != NULL || (name_len >= 3 && strcmp(name + name_len - 3, "/..") == 0); } #include <linux/err.h> /* needed for stackable file system support */ loff_t default_llseek(struct file *file, loff_t offset, int whence); loff_t vfs_llseek(struct file *file, loff_t offset, int whence); int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp); static inline int inode_init_always(struct super_block *sb, struct inode *inode) { return inode_init_always_gfp(sb, inode, GFP_NOFS); } void inode_init_once(struct inode *inode); void address_space_init_once(struct address_space *mapping); struct inode *igrab(struct inode *inode); ino_t iunique(struct super_block *sb, ino_t max_reserved); int inode_needs_sync(struct inode *inode); int inode_just_drop(struct inode *inode); static inline int inode_generic_drop(struct inode *inode) { return !inode->i_nlink || inode_unhashed(inode); } void d_mark_dontcache(struct inode *inode); struct inode *ilookup5_nowait(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), void *data, bool *isnew); struct inode *ilookup5(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), void *data); struct inode *ilookup(struct super_block *sb, u64 ino); struct inode *inode_insert5(struct inode *inode, u64 hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data); struct inode *iget5_locked(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data); struct inode *iget5_locked_rcu(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data); struct inode *iget_locked(struct super_block *sb, u64 ino); struct inode *find_inode_nowait(struct super_block *sb, u64 hashval, int (*match)(struct inode *, u64, void *), void *data); struct inode *find_inode_rcu(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), void *data); struct inode *find_inode_by_ino_rcu(struct super_block *sb, u64 ino); int insert_inode_locked4(struct inode *inode, u64 hashval, int (*test)(struct inode *, void *), void *data); int insert_inode_locked(struct inode *inode); #ifdef CONFIG_DEBUG_LOCK_ALLOC void lockdep_annotate_inode_mutex_key(struct inode *inode); #else static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }; #endif void unlock_new_inode(struct inode *inode); void discard_new_inode(struct inode *inode); unsigned int get_next_ino(void); void evict_inodes(struct super_block *sb); void dump_mapping(const struct address_space *); /* * Userspace may rely on the inode number being non-zero. For example, glibc * simply ignores files with zero i_ino in unlink() and other places. * * As an additional complication, if userspace was compiled with * _FILE_OFFSET_BITS=32 on a 64-bit kernel we'll only end up reading out the * lower 32 bits, so we need to check that those aren't zero explicitly. With * _FILE_OFFSET_BITS=64, this may cause some harmless false-negatives, but * better safe than sorry. 
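 *
 * Illustrative sketch of the check this helper encapsulates (get_next_ino()
 * wrapping to zero is extremely unlikely, but a cautious caller could
 * simply ask again):
 *
 *	ino_t ino = get_next_ino();
 *
 *	if (unlikely(is_zero_ino(ino)))
 *		ino = get_next_ino();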
*/ static inline bool is_zero_ino(ino_t ino) { return (u32)ino == 0; } static inline void __iget(struct inode *inode) { lockdep_assert_held(&inode->i_lock); atomic_inc(&inode->i_count); } extern void iget_failed(struct inode *); extern void clear_inode(struct inode *); extern void __destroy_inode(struct inode *); struct inode *alloc_inode(struct super_block *sb); static inline struct inode *new_inode_pseudo(struct super_block *sb) { return alloc_inode(sb); } extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *); extern int file_remove_privs(struct file *); int setattr_should_drop_sgid(struct mnt_idmap *idmap, const struct inode *inode); /* * This must be used for allocating filesystems specific inodes to set * up the inode reclaim context correctly. */ #define alloc_inode_sb(_sb, _cache, _gfp) kmem_cache_alloc_lru(_cache, &_sb->s_inode_lru, _gfp) void __insert_inode_hash(struct inode *inode, u64 hashval); static inline void insert_inode_hash(struct inode *inode) { __insert_inode_hash(inode, inode->i_ino); } void __remove_inode_hash(struct inode *inode); static inline void remove_inode_hash(struct inode *inode) { if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash)) __remove_inode_hash(inode); } void inode_sb_list_add(struct inode *inode); void inode_lru_list_add(struct inode *inode); int generic_file_mmap(struct file *, struct vm_area_struct *); int generic_file_mmap_prepare(struct vm_area_desc *desc); int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc); extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *); int generic_write_checks_count(struct kiocb *iocb, loff_t *count); extern int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count); extern int generic_file_rw_checks(struct file *file_in, struct file *file_out); ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *to, ssize_t already_read); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *); ssize_t generic_perform_write(struct kiocb *, struct iov_iter *); ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter, ssize_t direct_written, ssize_t buffered_written); ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags); ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags); ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, struct iov_iter *iter); ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, struct iov_iter *iter); /* fs/splice.c */ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); ssize_t copy_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); extern ssize_t iter_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); extern void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); extern loff_t noop_llseek(struct file *file, loff_t offset, int whence); extern loff_t vfs_setpos(struct file *file, loff_t offset, loff_t 
maxsize);
extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence);
extern loff_t generic_file_llseek_size(struct file *file, loff_t offset,
		int whence, loff_t maxsize, loff_t eof);
loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence,
			     u64 *cookie);
extern loff_t fixed_size_llseek(struct file *file, loff_t offset,
		int whence, loff_t size);
extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t);
extern loff_t no_seek_end_llseek(struct file *, loff_t, int);
int rw_verify_area(int, struct file *, const loff_t *, size_t);
extern int generic_file_open(struct inode * inode, struct file * filp);
extern int nonseekable_open(struct inode * inode, struct file * filp);
extern int stream_open(struct inode * inode, struct file * filp);

#ifdef CONFIG_BLOCK
typedef void (dio_submit_t)(struct bio *bio, struct inode *inode,
			    loff_t file_offset);

enum {
	/* need locking between buffered and direct access */
	DIO_LOCKING	= 0x01,

	/* filesystem does not support filling holes */
	DIO_SKIP_HOLES	= 0x02,
};

ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
			     struct block_device *bdev, struct iov_iter *iter,
			     get_block_t get_block,
			     dio_iodone_t end_io, int flags);

static inline ssize_t blockdev_direct_IO(struct kiocb *iocb,
					  struct inode *inode,
					  struct iov_iter *iter,
					  get_block_t get_block)
{
	return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
			get_block, NULL, DIO_LOCKING | DIO_SKIP_HOLES);
}
#endif

bool inode_dio_finished(const struct inode *inode);
void inode_dio_wait(struct inode *inode);
void inode_dio_wait_interruptible(struct inode *inode);

/**
 * inode_dio_begin - signal start of a direct I/O request
 * @inode: inode the direct I/O happens on
 *
 * This is called at the start of a direct I/O request to account for it,
 * so that operations such as truncate can use inode_dio_wait() to wait
 * for all outstanding direct I/O against the inode to complete.
 */
static inline void inode_dio_begin(struct inode *inode)
{
	atomic_inc(&inode->i_dio_count);
}

/**
 * inode_dio_end - signal finish of a direct I/O request
 * @inode: inode the direct I/O happens on
 *
 * This is called once we've finished processing a direct I/O request,
 * and is used to wake up callers waiting for direct I/O to be quiesced.
*/ static inline void inode_dio_end(struct inode *inode) { if (atomic_dec_and_test(&inode->i_dio_count)) wake_up_var(&inode->i_dio_count); } extern void inode_set_flags(struct inode *inode, unsigned int flags, unsigned int mask); extern const struct file_operations generic_ro_fops; #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) extern int readlink_copy(char __user *, int, const char *, int); extern int page_readlink(struct dentry *, char __user *, int); extern const char *page_get_link_raw(struct dentry *, struct inode *, struct delayed_call *); extern const char *page_get_link(struct dentry *, struct inode *, struct delayed_call *); extern void page_put_link(void *); extern int page_symlink(struct inode *inode, const char *symname, int len); extern const struct inode_operations page_symlink_inode_operations; extern void kfree_link(void *); void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode); void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *); void generic_fill_statx_attr(struct inode *inode, struct kstat *stat); void generic_fill_statx_atomic_writes(struct kstat *stat, unsigned int unit_min, unsigned int unit_max, unsigned int unit_max_opt); extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int); extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int); void __inode_add_bytes(struct inode *inode, loff_t bytes); void inode_add_bytes(struct inode *inode, loff_t bytes); void __inode_sub_bytes(struct inode *inode, loff_t bytes); void inode_sub_bytes(struct inode *inode, loff_t bytes); static inline loff_t __inode_get_bytes(struct inode *inode) { return (((loff_t)inode->i_blocks) << 9) + inode->i_bytes; } loff_t inode_get_bytes(struct inode *inode); void inode_set_bytes(struct inode *inode, loff_t bytes); const char *simple_get_link(struct dentry *, struct inode *, struct delayed_call *); extern const struct inode_operations simple_symlink_inode_operations; extern int iterate_dir(struct file *, struct dir_context *); int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int flags); int vfs_fstat(int fd, struct kstat *stat); static inline int vfs_stat(const char __user *filename, struct kstat *stat) { return vfs_fstatat(AT_FDCWD, filename, stat, 0); } static inline int vfs_lstat(const char __user *name, struct kstat *stat) { return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW); } extern const char *vfs_get_link(struct dentry *, struct delayed_call *); extern int vfs_readlink(struct dentry *, char __user *, int); extern struct file_system_type *get_filesystem(struct file_system_type *fs); extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); extern void drop_super(struct super_block *sb); extern void drop_super_exclusive(struct super_block *sb); extern void iterate_supers(void (*f)(struct super_block *, void *), void *arg); extern void iterate_supers_type(struct file_system_type *, void (*)(struct super_block *, void *), void *); void filesystems_freeze(bool freeze_all); void filesystems_thaw(void); void end_dirop(struct dentry *de); extern int dcache_dir_open(struct inode *, struct file *); extern int dcache_dir_close(struct inode *, struct file *); extern loff_t dcache_dir_lseek(struct file *, loff_t, int); extern int dcache_readdir(struct file *, struct dir_context *); extern int simple_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern int 
simple_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern int simple_statfs(struct dentry *, struct kstatfs *); extern int simple_open(struct inode *inode, struct file *file); extern int simple_link(struct dentry *, struct inode *, struct dentry *); extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); extern void __simple_unlink(struct inode *, struct dentry *); extern void __simple_rmdir(struct inode *, struct dentry *); void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); extern int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); extern int simple_rename(struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); extern void simple_recursive_removal(struct dentry *, void (*callback)(struct dentry *)); extern void simple_remove_by_name(struct dentry *, const char *, void (*callback)(struct dentry *)); extern void locked_recursive_removal(struct dentry *, void (*callback)(struct dentry *)); extern int noop_fsync(struct file *, loff_t, loff_t, int); extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter); extern int simple_empty(struct dentry *); extern int simple_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata); extern const struct address_space_operations ram_aops; extern int always_delete_dentry(const struct dentry *); extern struct inode *alloc_anon_inode(struct super_block *); struct inode *anon_inode_make_secure_inode(struct super_block *sb, const char *name, const struct inode *context_inode); extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags); extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); extern const struct file_operations simple_dir_operations; extern const struct inode_operations simple_dir_inode_operations; extern void make_empty_dir_inode(struct inode *inode); extern bool is_empty_dir_inode(struct inode *inode); struct tree_descr { const char *name; const struct file_operations *ops; int mode; }; struct dentry *d_alloc_name(struct dentry *, const char *); extern int simple_fill_super(struct super_block *, unsigned long, const struct tree_descr *); extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count); extern void simple_release_fs(struct vfsmount **mount, int *count); struct dentry *simple_start_creating(struct dentry *, const char *); void simple_done_creating(struct dentry *); extern ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, const void *from, size_t available); extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count); struct offset_ctx { struct maple_tree mt; unsigned long next_offset; }; void simple_offset_init(struct offset_ctx *octx); int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry); void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry); void simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); int simple_offset_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); void simple_offset_destroy(struct offset_ctx 
*octx); extern const struct file_operations simple_offset_dir_operations; extern int simple_fsync_noflush(struct file *, loff_t, loff_t, int); extern int simple_fsync(struct file *, loff_t, loff_t, int); extern int generic_check_addressable(unsigned, u64); extern void generic_set_sb_d_ops(struct super_block *sb); extern int generic_ci_match(const struct inode *parent, const struct qstr *name, const struct qstr *folded_name, const u8 *de_name, u32 de_name_len); #if IS_ENABLED(CONFIG_UNICODE) int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str); int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); /** * generic_ci_validate_strict_name - Check if a given name is suitable * for a directory * * This functions checks if the proposed filename is valid for the * parent directory. That means that only valid UTF-8 filenames will be * accepted for casefold directories from filesystems created with the * strict encoding flag. That also means that any name will be * accepted for directories that doesn't have casefold enabled, or * aren't being strict with the encoding. * * @dir: inode of the directory where the new file will be created * @name: name of the new file * * Return: * * True: if the filename is suitable for this directory. It can be * true if a given name is not suitable for a strict encoding * directory, but the directory being used isn't strict * * False if the filename isn't suitable for this directory. This only * happens when a directory is casefolded and the filesystem is strict * about its encoding. */ static inline bool generic_ci_validate_strict_name(struct inode *dir, const struct qstr *name) { if (!IS_CASEFOLDED(dir) || !sb_has_strict_encoding(dir->i_sb)) return true; /* * A casefold dir must have a encoding set, unless the filesystem * is corrupted */ if (WARN_ON_ONCE(!dir->i_sb->s_encoding)) return true; return !utf8_validate(dir->i_sb->s_encoding, name); } #else static inline bool generic_ci_validate_strict_name(struct inode *dir, const struct qstr *name) { return true; } #endif int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid); int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *); extern int inode_newsize_ok(const struct inode *, loff_t offset); void setattr_copy(struct mnt_idmap *, struct inode *inode, const struct iattr *attr); extern int file_update_time(struct file *file); static inline bool file_is_dax(const struct file *file) { return file && IS_DAX(file->f_mapping->host); } static inline bool vma_is_dax(const struct vm_area_struct *vma) { return file_is_dax(vma->vm_file); } static inline bool vma_is_fsdax(struct vm_area_struct *vma) { struct inode *inode; if (!IS_ENABLED(CONFIG_FS_DAX) || !vma->vm_file) return false; if (!vma_is_dax(vma)) return false; inode = file_inode(vma->vm_file); if (S_ISCHR(inode->i_mode)) return false; /* device-dax */ return true; } static inline int iocb_flags(struct file *file) { int res = 0; if (file->f_flags & O_APPEND) res |= IOCB_APPEND; if (file->f_flags & O_DIRECT) res |= IOCB_DIRECT; if (file->f_flags & O_DSYNC) res |= IOCB_DSYNC; if (file->f_flags & __O_SYNC) res |= IOCB_SYNC; return res; } static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, int rw_type) { int kiocb_flags = 0; /* make sure there's no overlap between RWF and private IOCB flags */ BUILD_BUG_ON((__force int) RWF_SUPPORTED & IOCB_EVENTFD); if (!flags) return 0; if (unlikely(flags & ~RWF_SUPPORTED)) return -EOPNOTSUPP; if 
(unlikely((flags & RWF_APPEND) && (flags & RWF_NOAPPEND))) return -EINVAL; if (flags & RWF_NOWAIT) { if (!(ki->ki_filp->f_mode & FMODE_NOWAIT)) return -EOPNOTSUPP; } if (flags & RWF_ATOMIC) { if (rw_type != WRITE) return -EOPNOTSUPP; if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE)) return -EOPNOTSUPP; } if (flags & RWF_DONTCACHE) { /* file system must support it */ if (!(ki->ki_filp->f_op->fop_flags & FOP_DONTCACHE)) return -EOPNOTSUPP; /* DAX mappings not supported */ if (IS_DAX(ki->ki_filp->f_mapping->host)) return -EOPNOTSUPP; } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; if ((flags & RWF_NOAPPEND) && (ki->ki_flags & IOCB_APPEND)) { if (IS_APPEND(file_inode(ki->ki_filp))) return -EPERM; ki->ki_flags &= ~IOCB_APPEND; } ki->ki_flags |= kiocb_flags; return 0; } /* Transaction based IO helpers */ /* * An argresp is stored in an allocated page and holds the * size of the argument or response, along with its content */ struct simple_transaction_argresp { ssize_t size; char data[]; }; #define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp)) char *simple_transaction_get(struct file *file, const char __user *buf, size_t size); ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos); int simple_transaction_release(struct inode *inode, struct file *file); void simple_transaction_set(struct file *file, size_t n); /* * simple attribute files * * These attributes behave similar to those in sysfs: * * Writing to an attribute immediately sets a value, an open file can be * written to multiple times. * * Reading from an attribute creates a buffer from the value that might get * read with multiple read calls. When the attribute has been read * completely, no further read calls are possible until the file is opened * again. * * All attributes contain a text representation of a numeric value * that are accessed with the get() and set() functions. */ #define DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, __is_signed) \ static int __fops ## _open(struct inode *inode, struct file *file) \ { \ __simple_attr_check_format(__fmt, 0ull); \ return simple_attr_open(inode, file, __get, __set, __fmt); \ } \ static const struct file_operations __fops = { \ .owner = THIS_MODULE, \ .open = __fops ## _open, \ .release = simple_attr_release, \ .read = simple_attr_read, \ .write = (__is_signed) ? simple_attr_write_signed : simple_attr_write, \ .llseek = generic_file_llseek, \ } #define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \ DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, false) #define DEFINE_SIMPLE_ATTRIBUTE_SIGNED(__fops, __get, __set, __fmt) \ DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, true) static inline __printf(1, 2) void __simple_attr_check_format(const char *fmt, ...) 
{ /* don't do anything, just let the compiler check the arguments; */ } int simple_attr_open(struct inode *inode, struct file *file, int (*get)(void *, u64 *), int (*set)(void *, u64), const char *fmt); int simple_attr_release(struct inode *inode, struct file *file); ssize_t simple_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos); ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, size_t len, loff_t *ppos); int __init list_bdev_fs_names(char *buf, size_t size); #define __FMODE_EXEC ((__force int) FMODE_EXEC) #define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) #define OPEN_FMODE(flag) ((__force fmode_t)((flag + 1) & O_ACCMODE)) static inline bool is_sxid(umode_t mode) { return mode & (S_ISUID | S_ISGID); } static inline int check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode) { if (!(dir->i_mode & S_ISVTX)) return 0; return __check_sticky(idmap, dir, inode); } static inline void inode_has_no_xattr(struct inode *inode) { if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & SB_NOSEC)) inode->i_flags |= S_NOSEC; } static inline bool is_root_inode(struct inode *inode) { return inode == inode->i_sb->s_root->d_inode; } static inline bool dir_emit(struct dir_context *ctx, const char *name, int namelen, u64 ino, unsigned type) { unsigned int dt_mask = S_DT_MASK | ctx->dt_flags_mask; return ctx->actor(ctx, name, namelen, ctx->pos, ino, type & dt_mask); } static inline bool dir_emit_dot(struct file *file, struct dir_context *ctx) { return ctx->actor(ctx, ".", 1, ctx->pos, file->f_path.dentry->d_inode->i_ino, DT_DIR); } static inline bool dir_emit_dotdot(struct file *file, struct dir_context *ctx) { return ctx->actor(ctx, "..", 2, ctx->pos, d_parent_ino(file->f_path.dentry), DT_DIR); } static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx) { if (ctx->pos == 0) { if (!dir_emit_dot(file, ctx)) return false; ctx->pos = 1; } if (ctx->pos == 1) { if (!dir_emit_dotdot(file, ctx)) return false; ctx->pos = 2; } return true; } static inline bool dir_relax(struct inode *inode) { inode_unlock(inode); inode_lock(inode); return !IS_DEADDIR(inode); } static inline bool dir_relax_shared(struct inode *inode) { inode_unlock_shared(inode); inode_lock_shared(inode); return !IS_DEADDIR(inode); } extern bool path_noexec(const struct path *path); extern void inode_nohighmem(struct inode *inode); /* mm/fadvise.c */ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice); extern int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice); static inline bool vfs_empty_path(int dfd, const char __user *path) { char c; if (dfd < 0) return false; /* We now allow NULL to be used for empty path. */ if (!path) return true; if (unlikely(get_user(c, path))) return false; return !c; } int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter); static inline bool extensible_ioctl_valid(unsigned int cmd_a, unsigned int cmd_b, size_t min_size) { if (_IOC_DIR(cmd_a) != _IOC_DIR(cmd_b)) return false; if (_IOC_TYPE(cmd_a) != _IOC_TYPE(cmd_b)) return false; if (_IOC_NR(cmd_a) != _IOC_NR(cmd_b)) return false; if (_IOC_SIZE(cmd_a) < min_size) return false; return true; } #endif /* _LINUX_FS_H */ |
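/*
 * Illustrative sketch (an addition, not part of <linux/fs.h>): typical use of
 * the simple attribute helpers declared above.  The names my_value,
 * my_attr_get(), my_attr_set() and my_attr_fops are hypothetical.  The value
 * reached through the void *data argument is whatever simple_attr_open()
 * finds in inode->i_private, so such a file is usually created with
 * something like debugfs_create_file("my_attr", 0644, parent, &my_value,
 * &my_attr_fops).
 */
static u64 my_value;

static int my_attr_get(void *data, u64 *val)
{
	*val = *(u64 *)data;		/* read back the current value */
	return 0;
}

static int my_attr_set(void *data, u64 val)
{
	*(u64 *)data = val;		/* e.g. writing "42\n" stores 42 */
	return 0;
}

/* Expands to my_attr_fops_open() plus a const struct file_operations
 * my_attr_fops wired to simple_attr_open/read/write/release. */
DEFINE_SIMPLE_ATTRIBUTE(my_attr_fops, my_attr_get, my_attr_set, "%llu\n");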
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/hfsplus/wrapper.c
 *
 * Copyright (C) 2001
 * Brad Boyer (flar@allandria.com)
 * (C) 2003 Ardis Technologies <roman@ardistech.com>
 *
 * Handling of HFS wrappers around HFS+ volumes
 */

#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/cdrom.h>
#include <linux/unaligned.h>

#include "hfsplus_fs.h"
#include "hfsplus_raw.h"

struct hfsplus_wd {
	u32 ablk_size;
	u16 ablk_start;
	u16 embed_start;
	u16 embed_count;
};

/**
 * hfsplus_submit_bio - Perform block I/O
 * @sb: super block of volume for I/O
 * @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes
 * @buf: buffer for I/O
 * @data: output pointer for location of requested data
 * @opf: I/O operation type and flags
 *
 * The unit of I/O is hfsplus_min_io_size(sb), which may be bigger than
 * HFSPLUS_SECTOR_SIZE, and @buf must be sized accordingly. On reads
 * @data will return a pointer to the start of the requested sector,
 * which may not be the same location as @buf.
 *
 * If @sector is not aligned to the bdev logical block size it will
 * be rounded down. For writes this means that @buf should contain data
 * that starts at the rounded-down address. As long as the data was
 * read using hfsplus_submit_bio() and the same buffer is used things
 * will work correctly.
 *
 * Returns: %0 on success else -errno code
 */
int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
		       void *buf, void **data, blk_opf_t opf)
{
	u64 io_size = hfsplus_min_io_size(sb);
	loff_t start = (loff_t)sector << HFSPLUS_SECTOR_SHIFT;
	int offset = start & (io_size - 1);

	if ((opf & REQ_OP_MASK) != REQ_OP_WRITE && data)
		*data = (u8 *)buf + offset;

	/*
	 * Align sector to hardware sector size and find offset. We assume that
	 * io_size is a power of two, which _should_ be true.
	 */
	sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1);

	return bdev_rw_virt(sb->s_bdev, sector, buf, io_size, opf);
}

static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd)
{
	u32 extent;
	u16 attrib;
	__be16 sig;

	sig = *(__be16 *)(bufptr + HFSP_WRAPOFF_EMBEDSIG);
	if (sig != cpu_to_be16(HFSPLUS_VOLHEAD_SIG) &&
	    sig != cpu_to_be16(HFSPLUS_VOLHEAD_SIGX))
		return 0;

	attrib = be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ATTRIB));
	if (!(attrib & HFSP_WRAP_ATTRIB_SLOCK) ||
	    !(attrib & HFSP_WRAP_ATTRIB_SPARED))
		return 0;

	wd->ablk_size =
		be32_to_cpu(*(__be32 *)(bufptr + HFSP_WRAPOFF_ABLKSIZE));
	if (wd->ablk_size < HFSPLUS_SECTOR_SIZE)
		return 0;
	if (wd->ablk_size % HFSPLUS_SECTOR_SIZE)
		return 0;
	wd->ablk_start =
		be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ABLKSTART));

	extent = get_unaligned_be32(bufptr + HFSP_WRAPOFF_EMBEDEXT);
	wd->embed_start = (extent >> 16) & 0xFFFF;
	wd->embed_count = extent & 0xFFFF;

	return 1;
}

static int hfsplus_get_last_session(struct super_block *sb,
				    sector_t *start, sector_t *size)
{
	struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk);

	/* default values */
	*start = 0;
	*size = bdev_nr_sectors(sb->s_bdev);

	if (HFSPLUS_SB(sb)->session >= 0) {
		struct cdrom_tocentry te;

		if (!cdi)
			return -EINVAL;

		te.cdte_track = HFSPLUS_SB(sb)->session;
		te.cdte_format = CDROM_LBA;
		if (cdrom_read_tocentry(cdi, &te) ||
		    (te.cdte_ctrl & CDROM_DATA_TRACK) != 4) {
			pr_err("invalid session number or type of track\n");
			return -EINVAL;
		}
		*start = (sector_t)te.cdte_addr.lba << 2;
	} else if (cdi) {
		struct cdrom_multisession ms_info;

		ms_info.addr_format = CDROM_LBA;
		if (cdrom_multisession(cdi, &ms_info) == 0 && ms_info.xa_flag)
			*start = (sector_t)ms_info.addr.lba << 2;
	}

	return 0;
}

/* Find the volume header and fill in some minimum bits in superblock */
/* Takes in super block, returns true if good data read */
int hfsplus_read_wrapper(struct super_block *sb)
{
	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
	struct hfsplus_wd wd;
	sector_t part_start, part_size;
	u32 blocksize;
	int error = 0;

	error = -EINVAL;
	blocksize = sb_min_blocksize(sb, HFSPLUS_SECTOR_SIZE);
	if (!blocksize)
		goto out;
	sbi->min_io_size = blocksize;

	if (hfsplus_get_last_session(sb, &part_start, &part_size))
		goto out;

	error = -ENOMEM;
	sbi->s_vhdr_buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL);
	if (!sbi->s_vhdr_buf)
		goto out;
	sbi->s_backup_vhdr_buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL);
	if (!sbi->s_backup_vhdr_buf)
		goto out_free_vhdr;

reread:
	error = hfsplus_submit_bio(sb, part_start + HFSPLUS_VOLHEAD_SECTOR,
				   sbi->s_vhdr_buf, (void **)&sbi->s_vhdr,
				   REQ_OP_READ);
	if (error)
		goto out_free_backup_vhdr;

	error = -EINVAL;
	switch (sbi->s_vhdr->signature) {
	case cpu_to_be16(HFSPLUS_VOLHEAD_SIGX):
		set_bit(HFSPLUS_SB_HFSX, &sbi->flags);
		fallthrough;
	case cpu_to_be16(HFSPLUS_VOLHEAD_SIG):
		break;
	case cpu_to_be16(HFSP_WRAP_MAGIC):
		if (!hfsplus_read_mdb(sbi->s_vhdr, &wd))
			goto out_free_backup_vhdr;
		wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT;
		part_start += (sector_t)wd.ablk_start +
			      (sector_t)wd.embed_start * wd.ablk_size;
		part_size = (sector_t)wd.embed_count * wd.ablk_size;
		goto reread;
	default:
		/*
		 * Check for a partition block.
		 *
		 * (should do this only for cdrom/loop though)
		 */
		if (hfs_part_find(sb, &part_start, &part_size))
			goto out_free_backup_vhdr;
		goto reread;
	}

	error = hfsplus_submit_bio(sb, part_start + part_size - 2,
				   sbi->s_backup_vhdr_buf,
				   (void **)&sbi->s_backup_vhdr, REQ_OP_READ);
	if (error)
		goto out_free_backup_vhdr;

	error = -EINVAL;
	if (sbi->s_backup_vhdr->signature != sbi->s_vhdr->signature) {
		pr_warn("invalid secondary volume header\n");
		goto out_free_backup_vhdr;
	}

	blocksize = be32_to_cpu(sbi->s_vhdr->blocksize);

	/*
	 * Block size must be at least as large as a sector and a power of 2.
	 */
	if (blocksize < HFSPLUS_SECTOR_SIZE || ((blocksize - 1) & blocksize))
		goto out_free_backup_vhdr;
	sbi->alloc_blksz = blocksize;
	sbi->alloc_blksz_shift = ilog2(blocksize);
	blocksize = min_t(u32, sbi->alloc_blksz, PAGE_SIZE);

	/*
	 * Align block size to block offset.
	 */
	while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1))
		blocksize >>= 1;

	if (sb_set_blocksize(sb, blocksize) != blocksize) {
		pr_err("unable to set blocksize to %u!\n", blocksize);
		goto out_free_backup_vhdr;
	}

	sbi->blockoffset =
		part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
	sbi->part_start = part_start;
	sbi->sect_count = part_size;
	sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
	return 0;

out_free_backup_vhdr:
	kfree(sbi->s_backup_vhdr_buf);
out_free_vhdr:
	kfree(sbi->s_vhdr_buf);
out:
	return error;
}
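/*
 * Illustrative sketch (an addition, not part of wrapper.c): the embedded
 * HFS+ volume arithmetic performed above for the HFSP_WRAP_MAGIC case,
 * pulled into a hypothetical helper.  hfsplus_embedded_start() and its
 * parameters are made-up names; the units follow the code above, where
 * everything is expressed in 512-byte HFSPLUS_SECTOR_SIZE sectors once
 * ablk_size has been shifted down by HFSPLUS_SECTOR_SHIFT.
 */
static sector_t hfsplus_embedded_start(sector_t wrapper_start,
				       const struct hfsplus_wd *wd,
				       sector_t *embedded_size)
{
	/* allocation block size in 512-byte sectors */
	sector_t ablk_sectors = wd->ablk_size >> HFSPLUS_SECTOR_SHIFT;

	/* length of the embedded HFS+ volume, in sectors */
	*embedded_size = (sector_t)wd->embed_count * ablk_sectors;

	/*
	 * The embedded volume begins ablk_start sectors into the wrapper
	 * plus embed_start whole allocation blocks.
	 */
	return wrapper_start + (sector_t)wd->ablk_start +
	       (sector_t)wd->embed_start * ablk_sectors;
}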
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel Connection Multiplexor
 *
 * Copyright (c) 2016 Tom Herbert <tom@herbertland.com>
 */

#include <linux/bpf.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/file.h>
#include <linux/filter.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/poll.h>
#include <linux/rculist.h>
#include <linux/skbuff.h>
#include <linux/socket.h>
#include <linux/splice.h>
#include <linux/uaccess.h>
#include <linux/workqueue.h>
#include <linux/syscalls.h>
#include <linux/sched/signal.h>
#include <net/kcm.h>
#include <net/netns/generic.h>
#include <net/sock.h>
#include <uapi/linux/kcm.h>
#include <trace/events/sock.h>

unsigned int kcm_net_id;

static struct kmem_cache *kcm_psockp __read_mostly;
static struct kmem_cache *kcm_muxp __read_mostly;
static struct workqueue_struct *kcm_wq;

static inline struct kcm_sock *kcm_sk(const struct sock *sk)
{
	return (struct kcm_sock *)sk;
}

static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb)
{
	return (struct kcm_tx_msg *)skb->cb;
}

static void report_csk_error(struct sock *csk, int err)
{
	csk->sk_err = EPIPE;
	sk_error_report(csk);
}

static void kcm_abort_tx_psock(struct kcm_psock *psock, int err,
			       bool wakeup_kcm)
{
	struct sock *csk = psock->sk;
	struct kcm_mux *mux = psock->mux;

	/* Unrecoverable error in transmit */

	spin_lock_bh(&mux->lock);

	if (psock->tx_stopped) {
		spin_unlock_bh(&mux->lock);
		return;
	}

	psock->tx_stopped = 1;
	KCM_STATS_INCR(psock->stats.tx_aborts);

	if (!psock->tx_kcm) {
		/* Take off psocks_avail list */
		list_del(&psock->psock_avail_list);
	} else if (wakeup_kcm) {
		/* In this case psock is being aborted while outside of
		 * write_msgs and psock is reserved. Schedule tx_work
		 * to handle the failure there. Need to commit tx_stopped
		 * before queuing work.
*/ smp_mb(); queue_work(kcm_wq, &psock->tx_kcm->tx_work); } spin_unlock_bh(&mux->lock); /* Report error on lower socket */ report_csk_error(csk, err); } /* RX mux lock held. */ static void kcm_update_rx_mux_stats(struct kcm_mux *mux, struct kcm_psock *psock) { STRP_STATS_ADD(mux->stats.rx_bytes, psock->strp.stats.bytes - psock->saved_rx_bytes); mux->stats.rx_msgs += psock->strp.stats.msgs - psock->saved_rx_msgs; psock->saved_rx_msgs = psock->strp.stats.msgs; psock->saved_rx_bytes = psock->strp.stats.bytes; } static void kcm_update_tx_mux_stats(struct kcm_mux *mux, struct kcm_psock *psock) { KCM_STATS_ADD(mux->stats.tx_bytes, psock->stats.tx_bytes - psock->saved_tx_bytes); mux->stats.tx_msgs += psock->stats.tx_msgs - psock->saved_tx_msgs; psock->saved_tx_msgs = psock->stats.tx_msgs; psock->saved_tx_bytes = psock->stats.tx_bytes; } static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); /* KCM is ready to receive messages on its queue-- either the KCM is new or * has become unblocked after being blocked on full socket buffer. Queue any * pending ready messages on a psock. RX mux lock held. */ static void kcm_rcv_ready(struct kcm_sock *kcm) { struct kcm_mux *mux = kcm->mux; struct kcm_psock *psock; struct sk_buff *skb; if (unlikely(kcm->rx_wait || kcm->rx_psock || kcm->rx_disabled)) return; while (unlikely((skb = __skb_dequeue(&mux->rx_hold_queue)))) { if (kcm_queue_rcv_skb(&kcm->sk, skb)) { /* Assuming buffer limit has been reached */ skb_queue_head(&mux->rx_hold_queue, skb); WARN_ON(!sk_rmem_alloc_get(&kcm->sk)); return; } } while (!list_empty(&mux->psocks_ready)) { psock = list_first_entry(&mux->psocks_ready, struct kcm_psock, psock_ready_list); if (kcm_queue_rcv_skb(&kcm->sk, psock->ready_rx_msg)) { /* Assuming buffer limit has been reached */ WARN_ON(!sk_rmem_alloc_get(&kcm->sk)); return; } /* Consumed the ready message on the psock. Schedule rx_work to * get more messages. */ list_del(&psock->psock_ready_list); psock->ready_rx_msg = NULL; /* Commit clearing of ready_rx_msg for queuing work */ smp_mb(); strp_unpause(&psock->strp); strp_check_rcv(&psock->strp); } /* Buffer limit is okay now, add to ready list */ list_add_tail(&kcm->wait_rx_list, &kcm->mux->kcm_rx_waiters); /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_wait, true); } static void kcm_rfree(struct sk_buff *skb) { struct sock *sk = skb->sk; struct kcm_sock *kcm = kcm_sk(sk); struct kcm_mux *mux = kcm->mux; unsigned int len = skb->truesize; sk_mem_uncharge(sk, len); atomic_sub(len, &sk->sk_rmem_alloc); /* For reading rx_wait and rx_psock without holding lock */ smp_mb__after_atomic(); if (!READ_ONCE(kcm->rx_wait) && !READ_ONCE(kcm->rx_psock) && sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) { spin_lock_bh(&mux->rx_lock); kcm_rcv_ready(kcm); spin_unlock_bh(&mux->rx_lock); } } static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct sk_buff_head *list = &sk->sk_receive_queue; if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) return -ENOMEM; if (!sk_rmem_schedule(sk, skb, skb->truesize)) return -ENOBUFS; skb->dev = NULL; skb_orphan(skb); skb->sk = sk; skb->destructor = kcm_rfree; atomic_add(skb->truesize, &sk->sk_rmem_alloc); sk_mem_charge(sk, skb->truesize); skb_queue_tail(list, skb); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); return 0; } /* Requeue received messages for a kcm socket to other kcm sockets. This is * called with a kcm socket is receive disabled. * RX mux lock held. 
*/ static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head) { struct sk_buff *skb; struct kcm_sock *kcm; while ((skb = skb_dequeue(head))) { /* Reset destructor to avoid calling kcm_rcv_ready */ skb->destructor = sock_rfree; skb_orphan(skb); try_again: if (list_empty(&mux->kcm_rx_waiters)) { skb_queue_tail(&mux->rx_hold_queue, skb); continue; } kcm = list_first_entry(&mux->kcm_rx_waiters, struct kcm_sock, wait_rx_list); if (kcm_queue_rcv_skb(&kcm->sk, skb)) { /* Should mean socket buffer full */ list_del(&kcm->wait_rx_list); /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_wait, false); /* Commit rx_wait to read in kcm_free */ smp_wmb(); goto try_again; } } } /* Lower sock lock held */ static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock, struct sk_buff *head) { struct kcm_mux *mux = psock->mux; struct kcm_sock *kcm; WARN_ON(psock->ready_rx_msg); if (psock->rx_kcm) return psock->rx_kcm; spin_lock_bh(&mux->rx_lock); if (psock->rx_kcm) { spin_unlock_bh(&mux->rx_lock); return psock->rx_kcm; } kcm_update_rx_mux_stats(mux, psock); if (list_empty(&mux->kcm_rx_waiters)) { psock->ready_rx_msg = head; strp_pause(&psock->strp); list_add_tail(&psock->psock_ready_list, &mux->psocks_ready); spin_unlock_bh(&mux->rx_lock); return NULL; } kcm = list_first_entry(&mux->kcm_rx_waiters, struct kcm_sock, wait_rx_list); list_del(&kcm->wait_rx_list); /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_wait, false); psock->rx_kcm = kcm; /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_psock, psock); spin_unlock_bh(&mux->rx_lock); return kcm; } static void kcm_done(struct kcm_sock *kcm); static void kcm_done_work(struct work_struct *w) { kcm_done(container_of(w, struct kcm_sock, done_work)); } /* Lower sock held */ static void unreserve_rx_kcm(struct kcm_psock *psock, bool rcv_ready) { struct kcm_sock *kcm = psock->rx_kcm; struct kcm_mux *mux = psock->mux; if (!kcm) return; spin_lock_bh(&mux->rx_lock); psock->rx_kcm = NULL; /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_psock, NULL); /* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with * kcm_rfree */ smp_mb(); if (unlikely(kcm->done)) { spin_unlock_bh(&mux->rx_lock); /* Need to run kcm_done in a task since we need to qcquire * callback locks which may already be held here. */ INIT_WORK(&kcm->done_work, kcm_done_work); schedule_work(&kcm->done_work); return; } if (unlikely(kcm->rx_disabled)) { requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue); } else if (rcv_ready || unlikely(!sk_rmem_alloc_get(&kcm->sk))) { /* Check for degenerative race with rx_wait that all * data was dequeued (accounted for in kcm_rfree). */ kcm_rcv_ready(kcm); } spin_unlock_bh(&mux->rx_lock); } /* Lower sock lock held */ static void psock_data_ready(struct sock *sk) { struct kcm_psock *psock; trace_sk_data_ready(sk); read_lock_bh(&sk->sk_callback_lock); psock = (struct kcm_psock *)sk->sk_user_data; if (likely(psock)) strp_data_ready(&psock->strp); read_unlock_bh(&sk->sk_callback_lock); } /* Called with lower sock held */ static void kcm_rcv_strparser(struct strparser *strp, struct sk_buff *skb) { struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); struct kcm_sock *kcm; try_queue: kcm = reserve_rx_kcm(psock, skb); if (!kcm) { /* Unable to reserve a KCM, message is held in psock and strp * is paused. 
*/ return; } if (kcm_queue_rcv_skb(&kcm->sk, skb)) { /* Should mean socket buffer full */ unreserve_rx_kcm(psock, false); goto try_queue; } } static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb) { struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); struct bpf_prog *prog = psock->bpf_prog; int res; res = bpf_prog_run_pin_on_cpu(prog, skb); return res; } static int kcm_read_sock_done(struct strparser *strp, int err) { struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); unreserve_rx_kcm(psock, true); return err; } static void psock_state_change(struct sock *sk) { /* TCP only does a EPOLLIN for a half close. Do a EPOLLHUP here * since application will normally not poll with EPOLLIN * on the TCP sockets. */ report_csk_error(sk, EPIPE); } static void psock_write_space(struct sock *sk) { struct kcm_psock *psock; struct kcm_mux *mux; struct kcm_sock *kcm; read_lock_bh(&sk->sk_callback_lock); psock = (struct kcm_psock *)sk->sk_user_data; if (unlikely(!psock)) goto out; mux = psock->mux; spin_lock_bh(&mux->lock); /* Check if the socket is reserved so someone is waiting for sending. */ kcm = psock->tx_kcm; if (kcm) queue_work(kcm_wq, &kcm->tx_work); spin_unlock_bh(&mux->lock); out: read_unlock_bh(&sk->sk_callback_lock); } static void unreserve_psock(struct kcm_sock *kcm); /* kcm sock is locked. */ static struct kcm_psock *reserve_psock(struct kcm_sock *kcm) { struct kcm_mux *mux = kcm->mux; struct kcm_psock *psock; psock = kcm->tx_psock; smp_rmb(); /* Must read tx_psock before tx_wait */ if (psock) { WARN_ON(kcm->tx_wait); if (unlikely(psock->tx_stopped)) unreserve_psock(kcm); else return kcm->tx_psock; } spin_lock_bh(&mux->lock); /* Check again under lock to see if psock was reserved for this * psock via psock_unreserve. */ psock = kcm->tx_psock; if (unlikely(psock)) { WARN_ON(kcm->tx_wait); spin_unlock_bh(&mux->lock); return kcm->tx_psock; } if (!list_empty(&mux->psocks_avail)) { psock = list_first_entry(&mux->psocks_avail, struct kcm_psock, psock_avail_list); list_del(&psock->psock_avail_list); if (kcm->tx_wait) { list_del(&kcm->wait_psock_list); kcm->tx_wait = false; } kcm->tx_psock = psock; psock->tx_kcm = kcm; KCM_STATS_INCR(psock->stats.reserved); } else if (!kcm->tx_wait) { list_add_tail(&kcm->wait_psock_list, &mux->kcm_tx_waiters); kcm->tx_wait = true; } spin_unlock_bh(&mux->lock); return psock; } /* mux lock held */ static void psock_now_avail(struct kcm_psock *psock) { struct kcm_mux *mux = psock->mux; struct kcm_sock *kcm; if (list_empty(&mux->kcm_tx_waiters)) { list_add_tail(&psock->psock_avail_list, &mux->psocks_avail); } else { kcm = list_first_entry(&mux->kcm_tx_waiters, struct kcm_sock, wait_psock_list); list_del(&kcm->wait_psock_list); kcm->tx_wait = false; psock->tx_kcm = kcm; /* Commit before changing tx_psock since that is read in * reserve_psock before queuing work. */ smp_mb(); kcm->tx_psock = psock; KCM_STATS_INCR(psock->stats.reserved); queue_work(kcm_wq, &kcm->tx_work); } } /* kcm sock is locked. 
*/ static void unreserve_psock(struct kcm_sock *kcm) { struct kcm_psock *psock; struct kcm_mux *mux = kcm->mux; spin_lock_bh(&mux->lock); psock = kcm->tx_psock; if (WARN_ON(!psock)) { spin_unlock_bh(&mux->lock); return; } smp_rmb(); /* Read tx_psock before tx_wait */ kcm_update_tx_mux_stats(mux, psock); WARN_ON(kcm->tx_wait); kcm->tx_psock = NULL; psock->tx_kcm = NULL; KCM_STATS_INCR(psock->stats.unreserved); if (unlikely(psock->tx_stopped)) { if (psock->done) { /* Deferred free */ list_del(&psock->psock_list); mux->psocks_cnt--; sock_put(psock->sk); fput(psock->sk->sk_socket->file); kmem_cache_free(kcm_psockp, psock); } /* Don't put back on available list */ spin_unlock_bh(&mux->lock); return; } psock_now_avail(psock); spin_unlock_bh(&mux->lock); } static void kcm_report_tx_retry(struct kcm_sock *kcm) { struct kcm_mux *mux = kcm->mux; spin_lock_bh(&mux->lock); KCM_STATS_INCR(mux->stats.tx_retries); spin_unlock_bh(&mux->lock); } /* Write any messages ready on the kcm socket. Called with kcm sock lock * held. Return bytes actually sent or error. */ static int kcm_write_msgs(struct kcm_sock *kcm) { unsigned int total_sent = 0; struct sock *sk = &kcm->sk; struct kcm_psock *psock; struct sk_buff *head; int ret = 0; kcm->tx_wait_more = false; psock = kcm->tx_psock; if (unlikely(psock && psock->tx_stopped)) { /* A reserved psock was aborted asynchronously. Unreserve * it and we'll retry the message. */ unreserve_psock(kcm); kcm_report_tx_retry(kcm); if (skb_queue_empty(&sk->sk_write_queue)) return 0; kcm_tx_msg(skb_peek(&sk->sk_write_queue))->started_tx = false; } retry: while ((head = skb_peek(&sk->sk_write_queue))) { struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, }; struct kcm_tx_msg *txm = kcm_tx_msg(head); struct sk_buff *skb; unsigned int msize; int i; if (!txm->started_tx) { psock = reserve_psock(kcm); if (!psock) goto out; skb = head; txm->frag_offset = 0; txm->sent = 0; txm->started_tx = true; } else { if (WARN_ON(!psock)) { ret = -EINVAL; goto out; } skb = txm->frag_skb; } if (WARN_ON_ONCE(!skb_shinfo(skb)->nr_frags) || WARN_ON_ONCE(!skb_frag_page(&skb_shinfo(skb)->frags[0]))) { ret = -EINVAL; goto out; } msize = 0; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) msize += skb_frag_size(&skb_shinfo(skb)->frags[i]); iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, (const struct bio_vec *)skb_shinfo(skb)->frags, skb_shinfo(skb)->nr_frags, msize); iov_iter_advance(&msg.msg_iter, txm->frag_offset); do { ret = sock_sendmsg(psock->sk->sk_socket, &msg); if (ret <= 0) { if (ret == -EAGAIN) { /* Save state to try again when there's * write space on the socket */ txm->frag_skb = skb; ret = 0; goto out; } /* Hard failure in sending message, abort this * psock since it has lost framing * synchronization and retry sending the * message from the beginning. */ kcm_abort_tx_psock(psock, ret ? -ret : EPIPE, true); unreserve_psock(kcm); psock = NULL; txm->started_tx = false; kcm_report_tx_retry(kcm); ret = 0; goto retry; } txm->sent += ret; txm->frag_offset += ret; KCM_STATS_ADD(psock->stats.tx_bytes, ret); } while (msg.msg_iter.count > 0); if (skb == head) { if (skb_has_frag_list(skb)) { txm->frag_skb = skb_shinfo(skb)->frag_list; txm->frag_offset = 0; continue; } } else if (skb->next) { txm->frag_skb = skb->next; txm->frag_offset = 0; continue; } /* Successfully sent the whole packet, account for it. 
*/ sk->sk_wmem_queued -= txm->sent; total_sent += txm->sent; skb_dequeue(&sk->sk_write_queue); kfree_skb(head); KCM_STATS_INCR(psock->stats.tx_msgs); } out: if (!head) { /* Done with all queued messages. */ WARN_ON(!skb_queue_empty(&sk->sk_write_queue)); if (psock) unreserve_psock(kcm); } /* Check if write space is available */ sk->sk_write_space(sk); return total_sent ? : ret; } static void kcm_tx_work(struct work_struct *w) { struct kcm_sock *kcm = container_of(w, struct kcm_sock, tx_work); struct sock *sk = &kcm->sk; int err; lock_sock(sk); /* Primarily for SOCK_DGRAM sockets, also handle asynchronous tx * aborts */ err = kcm_write_msgs(kcm); if (err < 0) { /* Hard failure in write, report error on KCM socket */ pr_warn("KCM: Hard failure on kcm_write_msgs %d\n", err); report_csk_error(&kcm->sk, -err); goto out; } /* Primarily for SOCK_SEQPACKET sockets */ if (likely(sk->sk_socket) && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_space(sk); } out: release_sock(sk); } static void kcm_push(struct kcm_sock *kcm) { if (kcm->tx_wait_more) kcm_write_msgs(kcm); } static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); struct sk_buff *skb = NULL, *head = NULL, *frag_prev = NULL; size_t copy, copied = 0; long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); int eor = (sock->type == SOCK_DGRAM) ? !(msg->msg_flags & MSG_MORE) : !!(msg->msg_flags & MSG_EOR); int err = -EPIPE; mutex_lock(&kcm->tx_mutex); lock_sock(sk); /* Per tcp_sendmsg this should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk->sk_err) goto out_error; if (kcm->seq_skb) { /* Previously opened message */ head = kcm->seq_skb; skb = kcm_tx_msg(head)->last_skb; goto start; } /* Call the sk_stream functions to manage the sndbuf mem. */ if (!sk_stream_memory_free(sk)) { kcm_push(kcm); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = sk_stream_wait_memory(sk, &timeo); if (err) goto out_error; } if (msg_data_left(msg)) { /* New message, alloc head skb */ head = alloc_skb(0, sk->sk_allocation); while (!head) { kcm_push(kcm); err = sk_stream_wait_memory(sk, &timeo); if (err) goto out_error; head = alloc_skb(0, sk->sk_allocation); } skb = head; /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling * csum_and_copy_from_iter from skb_do_copy_data_nocache. 
*/ skb->ip_summed = CHECKSUM_UNNECESSARY; } start: while (msg_data_left(msg)) { bool merge = true; int i = skb_shinfo(skb)->nr_frags; struct page_frag *pfrag = sk_page_frag(sk); if (!sk_page_frag_refill(sk, pfrag)) goto wait_for_memory; if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { if (i == MAX_SKB_FRAGS) { struct sk_buff *tskb; tskb = alloc_skb(0, sk->sk_allocation); if (!tskb) goto wait_for_memory; if (head == skb) skb_shinfo(head)->frag_list = tskb; else skb->next = tskb; frag_prev = skb; skb = tskb; skb->ip_summed = CHECKSUM_UNNECESSARY; continue; } merge = false; } if (msg->msg_flags & MSG_SPLICE_PAGES) { copy = msg_data_left(msg); if (!sk_wmem_schedule(sk, copy)) goto wait_for_memory; err = skb_splice_from_iter(skb, &msg->msg_iter, copy); if (err < 0) { if (err == -EMSGSIZE) goto wait_for_memory; goto out_error; } copy = err; skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; sk_wmem_queued_add(sk, copy); sk_mem_charge(sk, copy); if (head != skb) head->truesize += copy; } else { copy = min_t(int, msg_data_left(msg), pfrag->size - pfrag->offset); if (!sk_wmem_schedule(sk, copy)) goto wait_for_memory; err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, pfrag->page, pfrag->offset, copy); if (err) goto out_error; /* Update the skb. */ if (merge) { skb_frag_size_add( &skb_shinfo(skb)->frags[i - 1], copy); } else { skb_fill_page_desc(skb, i, pfrag->page, pfrag->offset, copy); get_page(pfrag->page); } pfrag->offset += copy; } copied += copy; if (head != skb) { head->len += copy; head->data_len += copy; } continue; wait_for_memory: kcm_push(kcm); err = sk_stream_wait_memory(sk, &timeo); if (err) goto out_error; } if (eor) { bool not_busy = skb_queue_empty(&sk->sk_write_queue); if (head) { /* Message complete, queue it on send buffer */ __skb_queue_tail(&sk->sk_write_queue, head); kcm->seq_skb = NULL; KCM_STATS_INCR(kcm->stats.tx_msgs); } if (msg->msg_flags & MSG_BATCH) { kcm->tx_wait_more = true; } else if (kcm->tx_wait_more || not_busy) { err = kcm_write_msgs(kcm); if (err < 0) { /* We got a hard error in write_msgs but have * already queued this message. Report an error * in the socket, but don't affect return value * from sendmsg */ pr_warn("KCM: Hard failure on kcm_write_msgs\n"); report_csk_error(&kcm->sk, -err); } } } else { /* Message not complete, save state */ partial_message: if (head) { kcm->seq_skb = head; kcm_tx_msg(head)->last_skb = skb; } } KCM_STATS_ADD(kcm->stats.tx_bytes, copied); release_sock(sk); mutex_unlock(&kcm->tx_mutex); return copied; out_error: kcm_push(kcm); /* When MAX_SKB_FRAGS was reached, a new skb was allocated and * linked into the frag_list before data copy. If the copy * subsequently failed, this skb has zero frags. Remove it from * the frag_list to prevent kcm_write_msgs from later hitting * WARN_ON(!skb_shinfo(skb)->nr_frags). */ if (frag_prev && !skb_shinfo(skb)->nr_frags) { if (head == frag_prev) skb_shinfo(head)->frag_list = NULL; else frag_prev->next = NULL; kfree_skb(skb); /* Update skb as it may be saved in partial_message via goto */ skb = frag_prev; } if (sock->type == SOCK_SEQPACKET) { /* Wrote some bytes before encountering an * error, return partial success. 
*/ if (copied) goto partial_message; if (head != kcm->seq_skb) kfree_skb(head); } else { kfree_skb(head); kcm->seq_skb = NULL; } err = sk_stream_error(sk, msg->msg_flags, err); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) sk->sk_write_space(sk); release_sock(sk); mutex_unlock(&kcm->tx_mutex); return err; } static void kcm_splice_eof(struct socket *sock) { struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); if (skb_queue_empty_lockless(&sk->sk_write_queue)) return; lock_sock(sk); kcm_write_msgs(kcm); release_sock(sk); } static int kcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); int err = 0; struct strp_msg *stm; int copied = 0; struct sk_buff *skb; skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; /* Okay, have a message on the receive queue */ stm = strp_msg(skb); if (len > stm->full_len) len = stm->full_len; err = skb_copy_datagram_msg(skb, stm->offset, msg, len); if (err < 0) goto out; copied = len; if (likely(!(flags & MSG_PEEK))) { KCM_STATS_ADD(kcm->stats.rx_bytes, copied); if (copied < stm->full_len) { if (sock->type == SOCK_DGRAM) { /* Truncated message */ msg->msg_flags |= MSG_TRUNC; goto msg_finished; } stm->offset += copied; stm->full_len -= copied; } else { msg_finished: /* Finished with message */ msg->msg_flags |= MSG_EOR; KCM_STATS_INCR(kcm->stats.rx_msgs); } } out: skb_free_datagram(sk, skb); return copied ? : err; } static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); struct strp_msg *stm; int err = 0; ssize_t copied; struct sk_buff *skb; if (sock->file->f_flags & O_NONBLOCK || flags & SPLICE_F_NONBLOCK) flags = MSG_DONTWAIT; else flags = 0; /* Only support splice for SOCKSEQPACKET */ skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto err_out; /* Okay, have a message on the receive queue */ stm = strp_msg(skb); if (len > stm->full_len) len = stm->full_len; copied = skb_splice_bits(skb, sk, stm->offset, pipe, len, flags); if (copied < 0) { err = copied; goto err_out; } KCM_STATS_ADD(kcm->stats.rx_bytes, copied); stm->offset += copied; stm->full_len -= copied; /* We have no way to return MSG_EOR. If all the bytes have been * read we still leave the message in the receive socket buffer. * A subsequent recvmsg needs to be done to return MSG_EOR and * finish reading the message. 
*/ skb_free_datagram(sk, skb); return copied; err_out: skb_free_datagram(sk, skb); return err; } /* kcm sock lock held */ static void kcm_recv_disable(struct kcm_sock *kcm) { struct kcm_mux *mux = kcm->mux; if (kcm->rx_disabled) return; spin_lock_bh(&mux->rx_lock); kcm->rx_disabled = 1; /* If a psock is reserved we'll do cleanup in unreserve */ if (!kcm->rx_psock) { if (kcm->rx_wait) { list_del(&kcm->wait_rx_list); /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_wait, false); } requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue); } spin_unlock_bh(&mux->rx_lock); } /* kcm sock lock held */ static void kcm_recv_enable(struct kcm_sock *kcm) { struct kcm_mux *mux = kcm->mux; if (!kcm->rx_disabled) return; spin_lock_bh(&mux->rx_lock); kcm->rx_disabled = 0; kcm_rcv_ready(kcm); spin_unlock_bh(&mux->rx_lock); } static int kcm_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct kcm_sock *kcm = kcm_sk(sock->sk); int val, valbool; int err = 0; if (level != SOL_KCM) return -ENOPROTOOPT; if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; valbool = val ? 1 : 0; switch (optname) { case KCM_RECV_DISABLE: lock_sock(&kcm->sk); if (valbool) kcm_recv_disable(kcm); else kcm_recv_enable(kcm); release_sock(&kcm->sk); break; default: err = -ENOPROTOOPT; } return err; } static int kcm_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct kcm_sock *kcm = kcm_sk(sock->sk); int val, len; if (level != SOL_KCM) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; len = min_t(unsigned int, len, sizeof(int)); switch (optname) { case KCM_RECV_DISABLE: val = kcm->rx_disabled; break; default: return -ENOPROTOOPT; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux) { struct kcm_sock *tkcm; struct list_head *head; int index = 0; /* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so * we set sk_state, otherwise epoll_wait always returns right away with * EPOLLHUP */ kcm->sk.sk_state = TCP_ESTABLISHED; /* Add to mux's kcm sockets list */ kcm->mux = mux; spin_lock_bh(&mux->lock); head = &mux->kcm_socks; list_for_each_entry(tkcm, &mux->kcm_socks, kcm_sock_list) { if (tkcm->index != index) break; head = &tkcm->kcm_sock_list; index++; } list_add(&kcm->kcm_sock_list, head); kcm->index = index; mux->kcm_socks_cnt++; spin_unlock_bh(&mux->lock); INIT_WORK(&kcm->tx_work, kcm_tx_work); mutex_init(&kcm->tx_mutex); spin_lock_bh(&mux->rx_lock); kcm_rcv_ready(kcm); spin_unlock_bh(&mux->rx_lock); } static int kcm_attach(struct socket *sock, struct socket *csock, struct bpf_prog *prog) { struct kcm_sock *kcm = kcm_sk(sock->sk); struct kcm_mux *mux = kcm->mux; struct sock *csk; struct kcm_psock *psock = NULL, *tpsock; struct list_head *head; int index = 0; static const struct strp_callbacks cb = { .rcv_msg = kcm_rcv_strparser, .parse_msg = kcm_parse_func_strparser, .read_sock_done = kcm_read_sock_done, }; int err = 0; csk = csock->sk; if (!csk) return -EINVAL; lock_sock(csk); /* Only allow TCP sockets to be attached for now */ if ((csk->sk_family != AF_INET && csk->sk_family != AF_INET6) || csk->sk_protocol != IPPROTO_TCP) { err = -EOPNOTSUPP; goto out; } /* Don't allow listeners or closed sockets */ if (csk->sk_state == TCP_LISTEN || csk->sk_state == TCP_CLOSE) { err = -EOPNOTSUPP; goto 
out; } psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL); if (!psock) { err = -ENOMEM; goto out; } psock->mux = mux; psock->sk = csk; psock->bpf_prog = prog; write_lock_bh(&csk->sk_callback_lock); /* Check if sk_user_data is already by KCM or someone else. * Must be done under lock to prevent race conditions. */ if (csk->sk_user_data) { write_unlock_bh(&csk->sk_callback_lock); kmem_cache_free(kcm_psockp, psock); err = -EALREADY; goto out; } err = strp_init(&psock->strp, csk, &cb); if (err) { write_unlock_bh(&csk->sk_callback_lock); kmem_cache_free(kcm_psockp, psock); goto out; } psock->save_data_ready = csk->sk_data_ready; psock->save_write_space = csk->sk_write_space; psock->save_state_change = csk->sk_state_change; csk->sk_user_data = psock; csk->sk_data_ready = psock_data_ready; csk->sk_write_space = psock_write_space; csk->sk_state_change = psock_state_change; write_unlock_bh(&csk->sk_callback_lock); sock_hold(csk); /* Finished initialization, now add the psock to the MUX. */ spin_lock_bh(&mux->lock); head = &mux->psocks; list_for_each_entry(tpsock, &mux->psocks, psock_list) { if (tpsock->index != index) break; head = &tpsock->psock_list; index++; } list_add(&psock->psock_list, head); psock->index = index; KCM_STATS_INCR(mux->stats.psock_attach); mux->psocks_cnt++; psock_now_avail(psock); spin_unlock_bh(&mux->lock); /* Schedule RX work in case there are already bytes queued */ strp_check_rcv(&psock->strp); out: release_sock(csk); return err; } static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info) { struct socket *csock; struct bpf_prog *prog; int err; csock = sockfd_lookup(info->fd, &err); if (!csock) return -ENOENT; prog = bpf_prog_get_type(info->bpf_fd, BPF_PROG_TYPE_SOCKET_FILTER); if (IS_ERR(prog)) { err = PTR_ERR(prog); goto out; } err = kcm_attach(sock, csock, prog); if (err) { bpf_prog_put(prog); goto out; } /* Keep reference on file also */ return 0; out: sockfd_put(csock); return err; } static void kcm_unattach(struct kcm_psock *psock) { struct sock *csk = psock->sk; struct kcm_mux *mux = psock->mux; lock_sock(csk); /* Stop getting callbacks from TCP socket. After this there should * be no way to reserve a kcm for this psock. */ write_lock_bh(&csk->sk_callback_lock); csk->sk_user_data = NULL; csk->sk_data_ready = psock->save_data_ready; csk->sk_write_space = psock->save_write_space; csk->sk_state_change = psock->save_state_change; strp_stop(&psock->strp); if (WARN_ON(psock->rx_kcm)) { write_unlock_bh(&csk->sk_callback_lock); release_sock(csk); return; } spin_lock_bh(&mux->rx_lock); /* Stop receiver activities. After this point psock should not be * able to get onto ready list either through callbacks or work. */ if (psock->ready_rx_msg) { list_del(&psock->psock_ready_list); kfree_skb(psock->ready_rx_msg); psock->ready_rx_msg = NULL; KCM_STATS_INCR(mux->stats.rx_ready_drops); } spin_unlock_bh(&mux->rx_lock); write_unlock_bh(&csk->sk_callback_lock); /* Call strp_done without sock lock */ release_sock(csk); strp_done(&psock->strp); lock_sock(csk); bpf_prog_put(psock->bpf_prog); spin_lock_bh(&mux->lock); aggregate_psock_stats(&psock->stats, &mux->aggregate_psock_stats); save_strp_stats(&psock->strp, &mux->aggregate_strp_stats); KCM_STATS_INCR(mux->stats.psock_unattach); if (psock->tx_kcm) { /* psock was reserved. Just mark it finished and we will clean * up in the kcm paths, we need kcm lock which can not be * acquired here. */ KCM_STATS_INCR(mux->stats.psock_unattach_rsvd); spin_unlock_bh(&mux->lock); /* We are unattaching a socket that is reserved. 
Abort the * socket since we may be out of sync in sending on it. We need * to do this without the mux lock. */ kcm_abort_tx_psock(psock, EPIPE, false); spin_lock_bh(&mux->lock); if (!psock->tx_kcm) { /* psock now unreserved in window mux was unlocked */ goto no_reserved; } psock->done = 1; /* Commit done before queuing work to process it */ smp_mb(); /* Queue tx work to make sure psock->done is handled */ queue_work(kcm_wq, &psock->tx_kcm->tx_work); spin_unlock_bh(&mux->lock); } else { no_reserved: if (!psock->tx_stopped) list_del(&psock->psock_avail_list); list_del(&psock->psock_list); mux->psocks_cnt--; spin_unlock_bh(&mux->lock); sock_put(csk); fput(csk->sk_socket->file); kmem_cache_free(kcm_psockp, psock); } release_sock(csk); } static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info) { struct kcm_sock *kcm = kcm_sk(sock->sk); struct kcm_mux *mux = kcm->mux; struct kcm_psock *psock; struct socket *csock; struct sock *csk; int err; csock = sockfd_lookup(info->fd, &err); if (!csock) return -ENOENT; csk = csock->sk; if (!csk) { err = -EINVAL; goto out; } err = -ENOENT; spin_lock_bh(&mux->lock); list_for_each_entry(psock, &mux->psocks, psock_list) { if (psock->sk != csk) continue; /* Found the matching psock */ if (psock->unattaching || WARN_ON(psock->done)) { err = -EALREADY; break; } psock->unattaching = 1; spin_unlock_bh(&mux->lock); /* Lower socket lock should already be held */ kcm_unattach(psock); err = 0; goto out; } spin_unlock_bh(&mux->lock); out: sockfd_put(csock); return err; } static struct proto kcm_proto = { .name = "KCM", .owner = THIS_MODULE, .obj_size = sizeof(struct kcm_sock), }; /* Clone a kcm socket. */ static struct file *kcm_clone(struct socket *osock) { struct socket *newsock; struct sock *newsk; newsock = sock_alloc(); if (!newsock) return ERR_PTR(-ENFILE); newsock->type = osock->type; newsock->ops = osock->ops; __module_get(newsock->ops->owner); newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL, &kcm_proto, false); if (!newsk) { sock_release(newsock); return ERR_PTR(-ENOMEM); } sock_init_data(newsock, newsk); init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux); return sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name); } static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { int err; switch (cmd) { case SIOCKCMATTACH: { struct kcm_attach info; if (copy_from_user(&info, (void __user *)arg, sizeof(info))) return -EFAULT; err = kcm_attach_ioctl(sock, &info); break; } case SIOCKCMUNATTACH: { struct kcm_unattach info; if (copy_from_user(&info, (void __user *)arg, sizeof(info))) return -EFAULT; err = kcm_unattach_ioctl(sock, &info); break; } case SIOCKCMCLONE: { struct kcm_clone info; FD_PREPARE(fdf, 0, kcm_clone(sock)); if (fdf.err) return fdf.err; info.fd = fd_prepare_fd(fdf); if (copy_to_user((void __user *)arg, &info, sizeof(info))) return -EFAULT; fd_publish(fdf); err = 0; break; } default: err = -ENOIOCTLCMD; break; } return err; } static void release_mux(struct kcm_mux *mux) { struct kcm_net *knet = mux->knet; struct kcm_psock *psock, *tmp_psock; /* Release psocks */ list_for_each_entry_safe(psock, tmp_psock, &mux->psocks, psock_list) { if (!WARN_ON(psock->unattaching)) kcm_unattach(psock); } if (WARN_ON(mux->psocks_cnt)) return; __skb_queue_purge(&mux->rx_hold_queue); mutex_lock(&knet->mutex); aggregate_mux_stats(&mux->stats, &knet->aggregate_mux_stats); aggregate_psock_stats(&mux->aggregate_psock_stats, &knet->aggregate_psock_stats); aggregate_strp_stats(&mux->aggregate_strp_stats, 
&knet->aggregate_strp_stats); list_del_rcu(&mux->kcm_mux_list); knet->count--; mutex_unlock(&knet->mutex); kfree_rcu(mux, rcu); } static void kcm_done(struct kcm_sock *kcm) { struct kcm_mux *mux = kcm->mux; struct sock *sk = &kcm->sk; int socks_cnt; spin_lock_bh(&mux->rx_lock); if (kcm->rx_psock) { /* Cleanup in unreserve_rx_kcm */ WARN_ON(kcm->done); kcm->rx_disabled = 1; kcm->done = 1; spin_unlock_bh(&mux->rx_lock); return; } if (kcm->rx_wait) { list_del(&kcm->wait_rx_list); /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_wait, false); } /* Move any pending receive messages to other kcm sockets */ requeue_rx_msgs(mux, &sk->sk_receive_queue); spin_unlock_bh(&mux->rx_lock); if (WARN_ON(sk_rmem_alloc_get(sk))) return; /* Detach from MUX */ spin_lock_bh(&mux->lock); list_del(&kcm->kcm_sock_list); mux->kcm_socks_cnt--; socks_cnt = mux->kcm_socks_cnt; spin_unlock_bh(&mux->lock); if (!socks_cnt) { /* We are done with the mux now. */ release_mux(mux); } WARN_ON(kcm->rx_wait); sock_put(&kcm->sk); } /* Called by kcm_release to close a KCM socket. * If this is the last KCM socket on the MUX, destroy the MUX. */ static int kcm_release(struct socket *sock) { struct sock *sk = sock->sk; struct kcm_sock *kcm; struct kcm_mux *mux; struct kcm_psock *psock; if (!sk) return 0; kcm = kcm_sk(sk); mux = kcm->mux; lock_sock(sk); sock_orphan(sk); kfree_skb(kcm->seq_skb); /* Purge queue under lock to avoid race condition with tx_work trying * to act when queue is nonempty. If tx_work runs after this point * it will just return. */ __skb_queue_purge(&sk->sk_write_queue); release_sock(sk); spin_lock_bh(&mux->lock); if (kcm->tx_wait) { /* Take of tx_wait list, after this point there should be no way * that a psock will be assigned to this kcm. */ list_del(&kcm->wait_psock_list); kcm->tx_wait = false; } spin_unlock_bh(&mux->lock); /* Cancel work. After this point there should be no outside references * to the kcm socket. */ disable_work_sync(&kcm->tx_work); lock_sock(sk); psock = kcm->tx_psock; if (psock) { /* A psock was reserved, so we need to kill it since it * may already have some bytes queued from a message. We * need to do this after removing kcm from tx_wait list. 
*/ kcm_abort_tx_psock(psock, EPIPE, false); unreserve_psock(kcm); } release_sock(sk); WARN_ON(kcm->tx_wait); WARN_ON(kcm->tx_psock); sock->sk = NULL; kcm_done(kcm); return 0; } static const struct proto_ops kcm_dgram_ops = { .family = PF_KCM, .owner = THIS_MODULE, .release = kcm_release, .bind = sock_no_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, .ioctl = kcm_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = kcm_setsockopt, .getsockopt = kcm_getsockopt, .sendmsg = kcm_sendmsg, .recvmsg = kcm_recvmsg, .mmap = sock_no_mmap, .splice_eof = kcm_splice_eof, }; static const struct proto_ops kcm_seqpacket_ops = { .family = PF_KCM, .owner = THIS_MODULE, .release = kcm_release, .bind = sock_no_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, .ioctl = kcm_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = kcm_setsockopt, .getsockopt = kcm_getsockopt, .sendmsg = kcm_sendmsg, .recvmsg = kcm_recvmsg, .mmap = sock_no_mmap, .splice_eof = kcm_splice_eof, .splice_read = kcm_splice_read, }; /* Create proto operation for kcm sockets */ static int kcm_create(struct net *net, struct socket *sock, int protocol, int kern) { struct kcm_net *knet = net_generic(net, kcm_net_id); struct sock *sk; struct kcm_mux *mux; switch (sock->type) { case SOCK_DGRAM: sock->ops = &kcm_dgram_ops; break; case SOCK_SEQPACKET: sock->ops = &kcm_seqpacket_ops; break; default: return -ESOCKTNOSUPPORT; } if (protocol != KCMPROTO_CONNECTED) return -EPROTONOSUPPORT; sk = sk_alloc(net, PF_KCM, GFP_KERNEL, &kcm_proto, kern); if (!sk) return -ENOMEM; /* Allocate a kcm mux, shared between KCM sockets */ mux = kmem_cache_zalloc(kcm_muxp, GFP_KERNEL); if (!mux) { sk_free(sk); return -ENOMEM; } spin_lock_init(&mux->lock); spin_lock_init(&mux->rx_lock); INIT_LIST_HEAD(&mux->kcm_socks); INIT_LIST_HEAD(&mux->kcm_rx_waiters); INIT_LIST_HEAD(&mux->kcm_tx_waiters); INIT_LIST_HEAD(&mux->psocks); INIT_LIST_HEAD(&mux->psocks_ready); INIT_LIST_HEAD(&mux->psocks_avail); mux->knet = knet; /* Add new MUX to list */ mutex_lock(&knet->mutex); list_add_rcu(&mux->kcm_mux_list, &knet->mux_list); knet->count++; mutex_unlock(&knet->mutex); skb_queue_head_init(&mux->rx_hold_queue); /* Init KCM socket */ sock_init_data(sock, sk); init_kcm_sock(kcm_sk(sk), mux); return 0; } static const struct net_proto_family kcm_family_ops = { .family = PF_KCM, .create = kcm_create, .owner = THIS_MODULE, }; static __net_init int kcm_init_net(struct net *net) { struct kcm_net *knet = net_generic(net, kcm_net_id); INIT_LIST_HEAD_RCU(&knet->mux_list); mutex_init(&knet->mutex); return 0; } static __net_exit void kcm_exit_net(struct net *net) { struct kcm_net *knet = net_generic(net, kcm_net_id); /* All KCM sockets should be closed at this point, which should mean * that all multiplexors and psocks have been destroyed. 
*/ WARN_ON(!list_empty(&knet->mux_list)); mutex_destroy(&knet->mutex); } static struct pernet_operations kcm_net_ops = { .init = kcm_init_net, .exit = kcm_exit_net, .id = &kcm_net_id, .size = sizeof(struct kcm_net), }; static int __init kcm_init(void) { int err = -ENOMEM; kcm_muxp = KMEM_CACHE(kcm_mux, SLAB_HWCACHE_ALIGN); if (!kcm_muxp) goto fail; kcm_psockp = KMEM_CACHE(kcm_psock, SLAB_HWCACHE_ALIGN); if (!kcm_psockp) goto fail; kcm_wq = create_singlethread_workqueue("kkcmd"); if (!kcm_wq) goto fail; err = proto_register(&kcm_proto, 1); if (err) goto fail; err = register_pernet_device(&kcm_net_ops); if (err) goto net_ops_fail; err = sock_register(&kcm_family_ops); if (err) goto sock_register_fail; err = kcm_proc_init(); if (err) goto proc_init_fail; return 0; proc_init_fail: sock_unregister(PF_KCM); sock_register_fail: unregister_pernet_device(&kcm_net_ops); net_ops_fail: proto_unregister(&kcm_proto); fail: kmem_cache_destroy(kcm_muxp); kmem_cache_destroy(kcm_psockp); if (kcm_wq) destroy_workqueue(kcm_wq); return err; } static void __exit kcm_exit(void) { kcm_proc_exit(); sock_unregister(PF_KCM); unregister_pernet_device(&kcm_net_ops); proto_unregister(&kcm_proto); destroy_workqueue(kcm_wq); kmem_cache_destroy(kcm_muxp); kmem_cache_destroy(kcm_psockp); } module_init(kcm_init); module_exit(kcm_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("KCM (Kernel Connection Multiplexor) sockets"); MODULE_ALIAS_NETPROTO(PF_KCM); |
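That is the end of the KCM socket implementation. As a hedged illustration of the user-facing side of the attach path handled by kcm_ioctl()/kcm_attach_ioctl() above, the following minimal userspace sketch creates a KCM socket and attaches an already-connected TCP socket to its multiplexor. It is not part of the kernel sources; "tcp_fd" and "bpf_prog_fd" are assumed to exist already (a connected TCP socket and a loaded BPF_PROG_TYPE_SOCKET_FILTER program that returns the length of each message), and the header set may vary by libc version.

/*
 * Userspace sketch (assumptions: tcp_fd is a connected TCP socket,
 * bpf_prog_fd is a loaded socket-filter BPF program that performs the
 * message framing; AF_KCM needs reasonably recent kernel/libc headers).
 */
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/kcm.h>
#include <linux/sockios.h>

static int kcm_attach_tcp(int tcp_fd, int bpf_prog_fd)
{
	struct kcm_attach attach = {
		.fd = tcp_fd,		/* lower TCP socket, see kcm_attach_ioctl() */
		.bpf_fd = bpf_prog_fd,	/* framing program, see bpf_prog_get_type() */
	};
	int kcm_fd;

	kcm_fd = socket(AF_KCM, SOCK_SEQPACKET, KCMPROTO_CONNECTED);
	if (kcm_fd < 0) {
		perror("socket(AF_KCM)");
		return -1;
	}

	/* Lands in kcm_ioctl() -> kcm_attach_ioctl() -> kcm_attach(). */
	if (ioctl(kcm_fd, SIOCKCMATTACH, &attach) < 0) {
		perror("ioctl(SIOCKCMATTACH)");
		return -1;
	}

	return kcm_fd;	/* send()/recv() now operate on whole messages */
}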
| 12 11 1 14 6 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 | /** * css_get - obtain a reference on the specified css * @css: target css * * The caller must already have a reference. */ CGROUP_REF_FN_ATTRS void css_get(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) percpu_ref_get(&css->refcnt); } CGROUP_REF_EXPORT(css_get) /** * css_get_many - obtain references on the specified css * @css: target css * @n: number of references to get * * The caller must already have a reference. */ CGROUP_REF_FN_ATTRS void css_get_many(struct cgroup_subsys_state *css, unsigned int n) { if (!(css->flags & CSS_NO_REF)) percpu_ref_get_many(&css->refcnt, n); } CGROUP_REF_EXPORT(css_get_many) /** * css_tryget - try to obtain a reference on the specified css * @css: target css * * Obtain a reference on @css unless it already has reached zero and is * being released. This function doesn't care whether @css is on or * offline. The caller naturally needs to ensure that @css is accessible * but doesn't have to be holding a reference on it - IOW, RCU protected * access is good enough for this function. Returns %true if a reference * count was successfully obtained; %false otherwise. */ CGROUP_REF_FN_ATTRS bool css_tryget(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) return percpu_ref_tryget(&css->refcnt); return true; } CGROUP_REF_EXPORT(css_tryget) /** * css_tryget_online - try to obtain a reference on the specified css if online * @css: target css * * Obtain a reference on @css if it's online. The caller naturally needs * to ensure that @css is accessible but doesn't have to be holding a * reference on it - IOW, RCU protected access is good enough for this * function. Returns %true if a reference count was successfully obtained; * %false otherwise. */ CGROUP_REF_FN_ATTRS bool css_tryget_online(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) return percpu_ref_tryget_live(&css->refcnt); return true; } CGROUP_REF_EXPORT(css_tryget_online) /** * css_put - put a css reference * @css: target css * * Put a reference obtained via css_get() and css_tryget_online(). */ CGROUP_REF_FN_ATTRS void css_put(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) percpu_ref_put(&css->refcnt); } CGROUP_REF_EXPORT(css_put) /** * css_put_many - put css references * @css: target css * @n: number of references to put * * Put references obtained via css_get() and css_tryget_online(). */ CGROUP_REF_FN_ATTRS void css_put_many(struct cgroup_subsys_state *css, unsigned int n) { if (!(css->flags & CSS_NO_REF)) percpu_ref_put_many(&css->refcnt, n); } CGROUP_REF_EXPORT(css_put_many) |
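As a hedged sketch of how these reference helpers are typically consumed, the fragment below shows the usual RCU pattern around css_tryget_online()/css_put(): the css pointer is only stable under RCU until a real reference is taken. The function name is hypothetical and cpu_cgrp_id is just one example controller id, available when that controller is built in.

/*
 * Hypothetical caller sketch (not from the file above): take a
 * reference on a task's css under RCU, then drop RCU once the
 * reference pins the css.
 */
#include <linux/cgroup.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

static void example_poke_css(struct task_struct *task)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = task_css(task, cpu_cgrp_id);
	if (css_tryget_online(css)) {
		/* css cannot be released while we hold the reference */
		rcu_read_unlock();
		/* ... use css ... */
		css_put(css);
		return;
	}
	rcu_read_unlock();
}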
| 28 26 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM workqueue #if !defined(_TRACE_WORKQUEUE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_WORKQUEUE_H #include <linux/tracepoint.h> #include <linux/workqueue.h> struct pool_workqueue; /** * workqueue_queue_work - called when a work gets queued * @req_cpu: the requested cpu * @pwq: pointer to struct pool_workqueue * @work: pointer to struct work_struct * * This event occurs when a work is queued immediately or once a * delayed work is actually queued on a workqueue (ie: once the delay * has been reached). */ TRACE_EVENT(workqueue_queue_work, TP_PROTO(int req_cpu, struct pool_workqueue *pwq, struct work_struct *work), TP_ARGS(req_cpu, pwq, work), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) __string( workqueue, pwq->wq->name) __field( int, req_cpu ) __field( int, cpu ) ), TP_fast_assign( __entry->work = work; __entry->function = work->func; __assign_str(workqueue); __entry->req_cpu = req_cpu; __entry->cpu = pwq->pool->cpu; ), TP_printk("work struct=%p function=%ps workqueue=%s req_cpu=%d cpu=%d", __entry->work, __entry->function, __get_str(workqueue), __entry->req_cpu, __entry->cpu) ); /** * workqueue_activate_work - called when a work gets activated * @work: pointer to struct work_struct * * This event occurs when a queued work is put on the active queue, * which happens immediately after queueing unless @max_active limit * is reached. */ TRACE_EVENT(workqueue_activate_work, TP_PROTO(struct work_struct *work), TP_ARGS(work), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) ), TP_fast_assign( __entry->work = work; __entry->function = work->func; ), TP_printk("work struct %p function=%ps ", __entry->work, __entry->function) ); /** * workqueue_execute_start - called immediately before the workqueue callback * @work: pointer to struct work_struct * * Allows to track workqueue execution. */ TRACE_EVENT(workqueue_execute_start, TP_PROTO(struct work_struct *work), TP_ARGS(work), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) ), TP_fast_assign( __entry->work = work; __entry->function = work->func; ), TP_printk("work struct %p: function %ps", __entry->work, __entry->function) ); /** * workqueue_execute_end - called immediately after the workqueue callback * @work: pointer to struct work_struct * @function: pointer to worker function * * Allows to track workqueue execution. */ TRACE_EVENT(workqueue_execute_end, TP_PROTO(struct work_struct *work, work_func_t function), TP_ARGS(work, function), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) ), TP_fast_assign( __entry->work = work; __entry->function = function; ), TP_printk("work struct %p: function %ps", __entry->work, __entry->function) ); #endif /* _TRACE_WORKQUEUE_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
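These events can be enabled from tracefs without any code, but as a hedged sketch of consuming one of them programmatically, the module fragment below registers a probe on workqueue_execute_start. It assumes the tracepoint is exported to modules and is illustrative only; the probe signature simply prepends a data pointer to the TP_PROTO arguments shown above.

/*
 * Sketch, assuming the tracepoint is exported for module use:
 * log every work item as it starts executing.
 */
#include <linux/module.h>
#include <linux/workqueue.h>
#include <trace/events/workqueue.h>

static void probe_exec_start(void *data, struct work_struct *work)
{
	pr_info("executing work %ps\n", work->func);
}

static int __init wq_probe_init(void)
{
	return register_trace_workqueue_execute_start(probe_exec_start, NULL);
}

static void __exit wq_probe_exit(void)
{
	unregister_trace_workqueue_execute_start(probe_exec_start, NULL);
	tracepoint_synchronize_unregister();
}

module_init(wq_probe_init);
module_exit(wq_probe_exit);
MODULE_LICENSE("GPL");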
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/extents.c * * Copyright (C) 2001 * Brad Boyer (flar@allandria.com) * (C) 2003 Ardis Technologies <roman@ardistech.com> * * Handling of Extents both in catalog and extents overflow trees */ #include <linux/errno.h> #include <linux/fs.h> #include <linux/pagemap.h> #include "hfsplus_fs.h" #include "hfsplus_raw.h" /* Compare two extents keys, returns 0 on same, pos/neg for difference */ int hfsplus_ext_cmp_key(const hfsplus_btree_key *k1, const hfsplus_btree_key *k2) { __be32 k1id, k2id; __be32 k1s, k2s; k1id = k1->ext.cnid; k2id = k2->ext.cnid; if (k1id != k2id) return be32_to_cpu(k1id) < be32_to_cpu(k2id) ? -1 : 1; if (k1->ext.fork_type != k2->ext.fork_type) return k1->ext.fork_type < k2->ext.fork_type ? -1 : 1; k1s = k1->ext.start_block; k2s = k2->ext.start_block; if (k1s == k2s) return 0; return be32_to_cpu(k1s) < be32_to_cpu(k2s) ?
-1 : 1; } static void hfsplus_ext_build_key(hfsplus_btree_key *key, u32 cnid, u32 block, u8 type) { key->key_len = cpu_to_be16(HFSPLUS_EXT_KEYLEN - 2); key->ext.cnid = cpu_to_be32(cnid); key->ext.start_block = cpu_to_be32(block); key->ext.fork_type = type; key->ext.pad = 0; } static u32 hfsplus_ext_find_block(struct hfsplus_extent *ext, u32 off) { int i; u32 count; for (i = 0; i < 8; ext++, i++) { count = be32_to_cpu(ext->block_count); if (off < count) return be32_to_cpu(ext->start_block) + off; off -= count; } /* panic? */ return 0; } static int hfsplus_ext_block_count(struct hfsplus_extent *ext) { int i; u32 count = 0; for (i = 0; i < 8; ext++, i++) count += be32_to_cpu(ext->block_count); return count; } static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext) { int i; ext += 7; for (i = 0; i < 7; ext--, i++) if (ext->block_count) break; return be32_to_cpu(ext->start_block) + be32_to_cpu(ext->block_count); } static int __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) { struct hfsplus_inode_info *hip = HFSPLUS_I(inode); int res; WARN_ON(!mutex_is_locked(&hip->extents_lock)); hfsplus_ext_build_key(fd->search_key, inode->i_ino, hip->cached_start, HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); res = hfs_brec_find(fd, hfs_find_rec_by_key); if (hip->extent_state & HFSPLUS_EXT_NEW) { if (res != -ENOENT) return res; /* Fail early and avoid ENOSPC during the btree operation */ res = hfs_bmap_reserve(fd->tree, fd->tree->depth + 1); if (res) return res; hfs_brec_insert(fd, hip->cached_extents, sizeof(hfsplus_extent_rec)); hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW); } else { if (res) return res; hfs_bnode_write(fd->bnode, hip->cached_extents, fd->entryoffset, fd->entrylength); hip->extent_state &= ~HFSPLUS_EXT_DIRTY; } /* * We can't just use hfsplus_mark_inode_dirty here, because we * also get called from hfsplus_write_inode, which should not * redirty the inode. Instead the callers have to be careful * to explicily mark the inode dirty, too. 
*/ set_bit(HFSPLUS_I_EXT_DIRTY, &HFSPLUS_I(HFSPLUS_EXT_TREE_I(inode->i_sb))->flags); set_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags); return 0; } static int hfsplus_ext_write_extent_locked(struct inode *inode) { int res = 0; if (HFSPLUS_I(inode)->extent_state & HFSPLUS_EXT_DIRTY) { struct hfs_find_data fd; res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd); if (res) return res; res = __hfsplus_ext_write_extent(inode, &fd); hfs_find_exit(&fd); } return res; } int hfsplus_ext_write_extent(struct inode *inode) { int res; mutex_lock(&HFSPLUS_I(inode)->extents_lock); res = hfsplus_ext_write_extent_locked(inode); mutex_unlock(&HFSPLUS_I(inode)->extents_lock); return res; } static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd, struct hfsplus_extent *extent, u32 cnid, u32 block, u8 type) { int res; hfsplus_ext_build_key(fd->search_key, cnid, block, type); fd->key->ext.cnid = 0; res = hfs_brec_find(fd, hfs_find_rec_by_key); if (res && res != -ENOENT) return res; if (fd->key->ext.cnid != fd->search_key->ext.cnid || fd->key->ext.fork_type != fd->search_key->ext.fork_type) return -ENOENT; if (fd->entrylength != sizeof(hfsplus_extent_rec)) return -EIO; hfs_bnode_read(fd->bnode, extent, fd->entryoffset, sizeof(hfsplus_extent_rec)); return 0; } static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block) { struct hfsplus_inode_info *hip = HFSPLUS_I(inode); int res; WARN_ON(!mutex_is_locked(&hip->extents_lock)); if (hip->extent_state & HFSPLUS_EXT_DIRTY) { res = __hfsplus_ext_write_extent(inode, fd); if (res) return res; } res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino, block, HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); if (!res) { hip->cached_start = be32_to_cpu(fd->key->ext.start_block); hip->cached_blocks = hfsplus_ext_block_count(hip->cached_extents); } else { hip->cached_start = hip->cached_blocks = 0; hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW); } return res; } static int hfsplus_ext_read_extent(struct inode *inode, u32 block) { struct hfsplus_inode_info *hip = HFSPLUS_I(inode); struct hfs_find_data fd; int res; if (block >= hip->cached_start && block < hip->cached_start + hip->cached_blocks) return 0; res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd); if (!res) { res = __hfsplus_ext_cache_extent(&fd, inode, block); hfs_find_exit(&fd); } return res; } /* Get a block at iblock for inode, possibly allocating if create */ int hfsplus_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { struct super_block *sb = inode->i_sb; struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct hfsplus_inode_info *hip = HFSPLUS_I(inode); int res = -EIO; u32 ablock, dblock, mask; sector_t sector; int was_dirty = 0; /* Convert inode block to disk allocation block */ ablock = iblock >> sbi->fs_shift; if (iblock >= hip->fs_blocks) { if (!create) return 0; if (iblock > hip->fs_blocks) return -EIO; if (ablock >= hip->alloc_blocks) { res = hfsplus_file_extend(inode, false); if (res) return res; } } else create = 0; if (ablock < hip->first_blocks) { dblock = hfsplus_ext_find_block(hip->first_extents, ablock); goto done; } if (inode->i_ino == HFSPLUS_EXT_CNID) return -EIO; mutex_lock(&hip->extents_lock); /* * hfsplus_ext_read_extent will write out a cached extent into * the extents btree. In that case we may have to mark the inode * dirty even for a pure read of an extent here. 
*/ was_dirty = (hip->extent_state & HFSPLUS_EXT_DIRTY); res = hfsplus_ext_read_extent(inode, ablock); if (res) { mutex_unlock(&hip->extents_lock); return -EIO; } dblock = hfsplus_ext_find_block(hip->cached_extents, ablock - hip->cached_start); mutex_unlock(&hip->extents_lock); done: hfs_dbg("ino %llu, iblock %llu - dblock %u\n", inode->i_ino, (long long)iblock, dblock); mask = (1 << sbi->fs_shift) - 1; sector = ((sector_t)dblock << sbi->fs_shift) + sbi->blockoffset + (iblock & mask); map_bh(bh_result, sb, sector); if (create) { set_buffer_new(bh_result); hip->phys_size += sb->s_blocksize; hip->fs_blocks++; inode_add_bytes(inode, sb->s_blocksize); } if (create || was_dirty) mark_inode_dirty(inode); return 0; } static void hfsplus_dump_extent(struct hfsplus_extent *extent) { int i; hfs_dbg("extent "); for (i = 0; i < 8; i++) hfs_dbg(" start_block %u, block_count %u", be32_to_cpu(extent[i].start_block), be32_to_cpu(extent[i].block_count)); hfs_dbg("\n"); } static int hfsplus_add_extent(struct hfsplus_extent *extent, u32 offset, u32 alloc_block, u32 block_count) { u32 count, start; int i; hfsplus_dump_extent(extent); for (i = 0; i < 8; extent++, i++) { count = be32_to_cpu(extent->block_count); if (offset == count) { start = be32_to_cpu(extent->start_block); if (alloc_block != start + count) { if (++i >= 8) return -ENOSPC; extent++; extent->start_block = cpu_to_be32(alloc_block); } else block_count += count; extent->block_count = cpu_to_be32(block_count); return 0; } else if (offset < count) break; offset -= count; } /* panic? */ return -EIO; } static int hfsplus_free_extents(struct super_block *sb, struct hfsplus_extent *extent, u32 offset, u32 block_nr) { u32 count, start; int i; int err = 0; hfsplus_dump_extent(extent); for (i = 0; i < 8; extent++, i++) { count = be32_to_cpu(extent->block_count); if (offset == count) goto found; else if (offset < count) break; offset -= count; } /* panic? 
*/ return -EIO; found: for (;;) { start = be32_to_cpu(extent->start_block); if (count <= block_nr) { err = hfsplus_block_free(sb, start, count); if (err) { pr_err("can't free extent: start %u, count %u\n", start, count); } extent->block_count = 0; extent->start_block = 0; block_nr -= count; } else { count -= block_nr; err = hfsplus_block_free(sb, start + count, block_nr); if (err) { pr_err("can't free extent: start %u, count %u\n", start, count); } extent->block_count = cpu_to_be32(count); block_nr = 0; } if (!block_nr || !i) { /* * Try to free all extents and * return only last error */ return err; } i--; extent--; count = be32_to_cpu(extent->block_count); } } int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw *fork, int type) { struct hfs_find_data fd; hfsplus_extent_rec ext_entry; u32 total_blocks, blocks, start; int res, i; total_blocks = be32_to_cpu(fork->total_blocks); if (!total_blocks) return 0; blocks = 0; for (i = 0; i < 8; i++) blocks += be32_to_cpu(fork->extents[i].block_count); res = hfsplus_free_extents(sb, fork->extents, blocks, blocks); if (res) return res; if (total_blocks == blocks) return 0; res = hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); if (res) return res; do { res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid, total_blocks, type); if (res) break; start = be32_to_cpu(fd.key->ext.start_block); hfs_brec_remove(&fd); mutex_unlock(&fd.tree->tree_lock); hfsplus_free_extents(sb, ext_entry, total_blocks - start, total_blocks); total_blocks = start; mutex_lock_nested(&fd.tree->tree_lock, hfsplus_btree_lock_class(fd.tree)); } while (total_blocks > blocks); hfs_find_exit(&fd); return res; } int hfsplus_file_extend(struct inode *inode, bool zeroout) { struct super_block *sb = inode->i_sb; struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct hfsplus_inode_info *hip = HFSPLUS_I(inode); u32 start, len, goal; int res; if (sbi->alloc_file->i_size * 8 < sbi->total_blocks - sbi->free_blocks + 8) { /* extend alloc file */ pr_err_ratelimited("extend alloc file! 
(%llu,%u,%u)\n", sbi->alloc_file->i_size * 8, sbi->total_blocks, sbi->free_blocks); return -ENOSPC; } mutex_lock(&hip->extents_lock); if (hip->alloc_blocks == hip->first_blocks) goal = hfsplus_ext_lastblock(hip->first_extents); else { res = hfsplus_ext_read_extent(inode, hip->alloc_blocks); if (res) goto out; goal = hfsplus_ext_lastblock(hip->cached_extents); } len = hip->clump_blocks; start = hfsplus_block_allocate(sb, sbi->total_blocks, goal, &len); if (start >= sbi->total_blocks) { start = hfsplus_block_allocate(sb, goal, 0, &len); if (start >= goal) { res = -ENOSPC; goto out; } } if (zeroout) { res = sb_issue_zeroout(sb, start, len, GFP_NOFS); if (res) goto out; } hfs_dbg("ino %llu, start %u, len %u\n", inode->i_ino, start, len); if (hip->alloc_blocks <= hip->first_blocks) { if (!hip->first_blocks) { hfs_dbg("first_extent: start %u, len %u\n", start, len); /* no extents yet */ hip->first_extents[0].start_block = cpu_to_be32(start); hip->first_extents[0].block_count = cpu_to_be32(len); res = 0; } else { /* try to append to extents in inode */ res = hfsplus_add_extent(hip->first_extents, hip->alloc_blocks, start, len); if (res == -ENOSPC) goto insert_extent; } if (!res) { hfsplus_dump_extent(hip->first_extents); hip->first_blocks += len; } } else { res = hfsplus_add_extent(hip->cached_extents, hip->alloc_blocks - hip->cached_start, start, len); if (!res) { hfsplus_dump_extent(hip->cached_extents); hip->extent_state |= HFSPLUS_EXT_DIRTY; hip->cached_blocks += len; } else if (res == -ENOSPC) goto insert_extent; } out: if (!res) { hip->alloc_blocks += len; mutex_unlock(&hip->extents_lock); hfsplus_mark_inode_dirty(HFSPLUS_SB(sb)->alloc_file, HFSPLUS_I_ALLOC_DIRTY); hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY); return 0; } mutex_unlock(&hip->extents_lock); return res; insert_extent: hfs_dbg("insert new extent\n"); res = hfsplus_ext_write_extent_locked(inode); if (res) goto out; memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); hip->cached_extents[0].start_block = cpu_to_be32(start); hip->cached_extents[0].block_count = cpu_to_be32(len); hfsplus_dump_extent(hip->cached_extents); hip->extent_state |= HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW; hip->cached_start = hip->alloc_blocks; hip->cached_blocks = len; res = 0; goto out; } void hfsplus_file_truncate(struct inode *inode) { struct super_block *sb = inode->i_sb; struct hfsplus_inode_info *hip = HFSPLUS_I(inode); struct hfs_find_data fd; u32 alloc_cnt, blk_cnt, start; int res; hfs_dbg("ino %llu, phys_size %llu -> i_size %llu\n", inode->i_ino, (long long)hip->phys_size, inode->i_size); if (inode->i_size > hip->phys_size) { struct address_space *mapping = inode->i_mapping; struct folio *folio; void *fsdata = NULL; loff_t size = inode->i_size; res = hfsplus_write_begin(NULL, mapping, size, 0, &folio, &fsdata); if (res) return; res = generic_write_end(NULL, mapping, size, 0, 0, folio, fsdata); if (res < 0) return; mark_inode_dirty(inode); return; } else if (inode->i_size == hip->phys_size) return; blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >> HFSPLUS_SB(sb)->alloc_blksz_shift; mutex_lock(&hip->extents_lock); alloc_cnt = hip->alloc_blocks; if (blk_cnt == alloc_cnt) goto out_unlock; res = hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); if (res) { mutex_unlock(&hip->extents_lock); /* XXX: We lack error handling of hfsplus_file_truncate() */ return; } while (1) { if (alloc_cnt == hip->first_blocks) { mutex_unlock(&fd.tree->tree_lock); hfsplus_free_extents(sb, hip->first_extents, alloc_cnt, alloc_cnt - blk_cnt); 
hfsplus_dump_extent(hip->first_extents); hip->first_blocks = blk_cnt; mutex_lock_nested(&fd.tree->tree_lock, hfsplus_btree_lock_class(fd.tree)); break; } res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt); if (res) break; start = hip->cached_start; if (blk_cnt <= start) hfs_brec_remove(&fd); mutex_unlock(&fd.tree->tree_lock); hfsplus_free_extents(sb, hip->cached_extents, alloc_cnt - start, alloc_cnt - blk_cnt); hfsplus_dump_extent(hip->cached_extents); mutex_lock_nested(&fd.tree->tree_lock, hfsplus_btree_lock_class(fd.tree)); if (blk_cnt > start) { hip->extent_state |= HFSPLUS_EXT_DIRTY; break; } alloc_cnt = start; hip->cached_start = hip->cached_blocks = 0; hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW); } hfs_find_exit(&fd); hip->alloc_blocks = blk_cnt; out_unlock: mutex_unlock(&hip->extents_lock); hip->phys_size = inode->i_size; hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits); hfsplus_mark_inode_dirty(HFSPLUS_SB(sb)->alloc_file, HFSPLUS_I_ALLOC_DIRTY); hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY); } |
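To make the block arithmetic in hfsplus_get_block() above concrete, here is a small standalone worked example with illustrative values only: a file block "iblock" is shifted down by fs_shift to get an allocation block, the extent records translate that to an on-disk allocation block "dblock", and the final sector adds back the sub-allocation-block offset.

/*
 * Worked example of the hfsplus_get_block() mapping; all values are
 * assumed purely for illustration (fs_shift = 2 corresponds to 2 KB
 * allocation blocks on a 512-byte block device).
 */
#include <stdio.h>

int main(void)
{
	unsigned int fs_shift = 2;                   /* assumed */
	unsigned long long iblock = 11;              /* file block being mapped */
	unsigned long long ablock = iblock >> fs_shift;   /* allocation block: 2 */
	unsigned long long dblock = 100;             /* pretend extent lookup result */
	unsigned long long blockoffset = 0;          /* partition start, assumed 0 */
	unsigned long long mask = (1 << fs_shift) - 1;
	unsigned long long sector = (dblock << fs_shift) + blockoffset + (iblock & mask);

	printf("iblock %llu -> ablock %llu -> sector %llu\n", iblock, ablock, sector);
	return 0;	/* prints: iblock 11 -> ablock 2 -> sector 403 */
}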
| 3 1 3 3 1 1 1 1 3 1 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 | // SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/inode.c - kernfs inode implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/pagemap.h> #include <linux/backing-dev.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/xattr.h> #include <linux/security.h> #include "kernfs-internal.h" static const struct inode_operations kernfs_iops = { .permission = kernfs_iop_permission, .setattr = kernfs_iop_setattr, .getattr = kernfs_iop_getattr, .listxattr = kernfs_iop_listxattr, }; static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, bool alloc) { struct kernfs_iattrs *ret __free(kfree) = NULL; struct kernfs_iattrs *attr; attr = READ_ONCE(kn->iattr); if (attr || !alloc) return attr; ret = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL); if (!ret) return NULL; /* assign default attributes */ ret->ia_uid = GLOBAL_ROOT_UID; ret->ia_gid = GLOBAL_ROOT_GID; ktime_get_real_ts64(&ret->ia_atime); ret->ia_mtime = ret->ia_atime; ret->ia_ctime = ret->ia_atime; simple_xattr_limits_init(&ret->xattr_limits); /* If someone raced us, recognize it. 
*/ if (!try_cmpxchg(&kn->iattr, &attr, ret)) return READ_ONCE(kn->iattr); return no_free_ptr(ret); } static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) { return __kernfs_iattrs(kn, true); } static struct kernfs_iattrs *kernfs_iattrs_noalloc(struct kernfs_node *kn) { return __kernfs_iattrs(kn, false); } int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { struct kernfs_iattrs *attrs; unsigned int ia_valid = iattr->ia_valid; attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; if (ia_valid & ATTR_UID) attrs->ia_uid = iattr->ia_uid; if (ia_valid & ATTR_GID) attrs->ia_gid = iattr->ia_gid; if (ia_valid & ATTR_ATIME) attrs->ia_atime = iattr->ia_atime; if (ia_valid & ATTR_MTIME) attrs->ia_mtime = iattr->ia_mtime; if (ia_valid & ATTR_CTIME) attrs->ia_ctime = iattr->ia_ctime; if (ia_valid & ATTR_MODE) kn->mode = iattr->ia_mode; return 0; } /** * kernfs_setattr - set iattr on a node * @kn: target node * @iattr: iattr to set * * Return: %0 on success, -errno on failure. */ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { int ret; struct kernfs_root *root = kernfs_root(kn); down_write(&root->kernfs_iattr_rwsem); ret = __kernfs_setattr(kn, iattr); up_write(&root->kernfs_iattr_rwsem); return ret; } int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); struct kernfs_node *kn = inode->i_private; struct kernfs_root *root; int error; if (!kn) return -EINVAL; root = kernfs_root(kn); down_write(&root->kernfs_iattr_rwsem); error = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (error) goto out; error = __kernfs_setattr(kn, iattr); if (error) goto out; /* this ignores size changes */ setattr_copy(&nop_mnt_idmap, inode, iattr); out: up_write(&root->kernfs_iattr_rwsem); return error; } ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) { struct kernfs_node *kn = kernfs_dentry_node(dentry); struct kernfs_iattrs *attrs; attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; return simple_xattr_list(d_inode(dentry), READ_ONCE(attrs->xattrs), buf, size); } static inline void set_default_inode_attr(struct inode *inode, umode_t mode) { inode->i_mode = mode; simple_inode_init_ts(inode); } static inline void set_inode_attr(struct inode *inode, struct kernfs_iattrs *attrs) { inode->i_uid = attrs->ia_uid; inode->i_gid = attrs->ia_gid; inode_set_atime_to_ts(inode, attrs->ia_atime); inode_set_mtime_to_ts(inode, attrs->ia_mtime); inode_set_ctime_to_ts(inode, attrs->ia_ctime); } static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) { struct kernfs_iattrs *attrs; inode->i_mode = kn->mode; attrs = kernfs_iattrs_noalloc(kn); if (attrs) /* * kernfs_node has non-default attributes get them from * persistent copy in kernfs_node. 
*/ set_inode_attr(inode, attrs); if (kernfs_type(kn) == KERNFS_DIR && !(kn->flags & KERNFS_REMOVING)) set_nlink(inode, kn->dir.subdirs + 2); } int kernfs_iop_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct kernfs_node *kn = inode->i_private; struct kernfs_root *root = kernfs_root(kn); down_read(&root->kernfs_iattr_rwsem); kernfs_refresh_inode(kn, inode); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); up_read(&root->kernfs_iattr_rwsem); return 0; } static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) { kernfs_get(kn); inode->i_private = kn; inode->i_mapping->a_ops = &ram_aops; inode->i_op = &kernfs_iops; inode->i_generation = kernfs_gen(kn); set_default_inode_attr(inode, kn->mode); kernfs_refresh_inode(kn, inode); /* initialize inode according to type */ switch (kernfs_type(kn)) { case KERNFS_DIR: inode->i_op = &kernfs_dir_iops; inode->i_fop = &kernfs_dir_fops; if (kn->flags & KERNFS_EMPTY_DIR) make_empty_dir_inode(inode); break; case KERNFS_FILE: inode->i_size = kn->attr.size; inode->i_fop = &kernfs_file_fops; break; case KERNFS_LINK: inode->i_op = &kernfs_symlink_iops; break; default: BUG(); } unlock_new_inode(inode); } /** * kernfs_get_inode - get inode for kernfs_node * @sb: super block * @kn: kernfs_node to allocate inode for * * Get inode for @kn. If such inode doesn't exist, a new inode is * allocated and basics are initialized. New inode is returned * locked. * * Locking: * Kernel thread context (may sleep). * * Return: * Pointer to allocated inode on success, %NULL on failure. */ struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) { struct inode *inode; inode = iget_locked(sb, kernfs_ino(kn)); if (inode && (inode_state_read_once(inode) & I_NEW)) kernfs_init_inode(kn, inode); return inode; } /* * The kernfs_node serves as both an inode and a directory entry for * kernfs. To prevent the kernfs inode numbers from being freed * prematurely we take a reference to kernfs_node from the kernfs inode. A * super_operations.evict_inode() implementation is needed to drop that * reference upon inode destruction. 
*/ void kernfs_evict_inode(struct inode *inode) { struct kernfs_node *kn = inode->i_private; truncate_inode_pages_final(&inode->i_data); clear_inode(inode); kernfs_put(kn); } int kernfs_iop_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct kernfs_node *kn; struct kernfs_root *root; int ret; if (mask & MAY_NOT_BLOCK) return -ECHILD; kn = inode->i_private; root = kernfs_root(kn); down_read(&root->kernfs_iattr_rwsem); kernfs_refresh_inode(kn, inode); ret = generic_permission(&nop_mnt_idmap, inode, mask); up_read(&root->kernfs_iattr_rwsem); return ret; } int kernfs_xattr_get(struct kernfs_node *kn, const char *name, void *value, size_t size) { struct kernfs_iattrs *attrs = kernfs_iattrs_noalloc(kn); struct simple_xattrs *xattrs; if (!attrs) return -ENODATA; xattrs = READ_ONCE(attrs->xattrs); if (!xattrs) return -ENODATA; return simple_xattr_get(xattrs, name, value, size); } int kernfs_xattr_set(struct kernfs_node *kn, const char *name, const void *value, size_t size, int flags) { struct simple_xattr *old_xattr; struct simple_xattrs *xattrs; struct kernfs_iattrs *attrs; attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; xattrs = simple_xattrs_lazy_alloc(&attrs->xattrs, value, flags); if (IS_ERR_OR_NULL(xattrs)) return PTR_ERR(xattrs); old_xattr = simple_xattr_set(xattrs, name, value, size, flags); if (IS_ERR(old_xattr)) return PTR_ERR(old_xattr); simple_xattr_free_rcu(old_xattr); return 0; } static int kernfs_vfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *suffix, void *value, size_t size) { const char *name = xattr_full_name(handler, suffix); struct kernfs_node *kn = inode->i_private; return kernfs_xattr_get(kn, name, value, size); } static int kernfs_vfs_xattr_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *suffix, const void *value, size_t size, int flags) { const char *name = xattr_full_name(handler, suffix); struct kernfs_node *kn = inode->i_private; return kernfs_xattr_set(kn, name, value, size, flags); } static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *suffix, const void *value, size_t size, int flags) { const char *full_name = xattr_full_name(handler, suffix); struct kernfs_node *kn = inode->i_private; struct simple_xattrs *xattrs; struct kernfs_iattrs *attrs; if (!(kernfs_root(kn)->flags & KERNFS_ROOT_SUPPORT_USER_XATTR)) return -EOPNOTSUPP; attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; xattrs = simple_xattrs_lazy_alloc(&attrs->xattrs, value, flags); if (IS_ERR_OR_NULL(xattrs)) return PTR_ERR(xattrs); return simple_xattr_set_limited(xattrs, &attrs->xattr_limits, full_name, value, size, flags); } static const struct xattr_handler kernfs_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = kernfs_vfs_xattr_get, .set = kernfs_vfs_xattr_set, }; static const struct xattr_handler kernfs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = kernfs_vfs_xattr_get, .set = kernfs_vfs_xattr_set, }; static const struct xattr_handler kernfs_user_xattr_handler = { .prefix = XATTR_USER_PREFIX, .get = kernfs_vfs_xattr_get, .set = kernfs_vfs_user_xattr_set, }; const struct xattr_handler * const kernfs_xattr_handlers[] = { &kernfs_trusted_xattr_handler, &kernfs_security_xattr_handler, &kernfs_user_xattr_handler, NULL }; |
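As a brief, hedged illustration of how the attribute path above is used by kernfs consumers, the fragment below changes the mode stored in a kernfs_node through kernfs_setattr(), following the same pattern sysfs uses when a file's permissions change. The function name is hypothetical.

/*
 * Caller sketch (assumed example): update the persistent mode of a
 * kernfs node; kernfs_setattr() takes kernfs_iattr_rwsem internally.
 */
#include <linux/fs.h>
#include <linux/kernfs.h>

static int example_chmod_node(struct kernfs_node *kn, umode_t mode)
{
	struct iattr newattrs = {
		.ia_valid = ATTR_MODE,
		.ia_mode  = (kn->mode & S_IFMT) | (mode & 0777),
	};

	return kernfs_setattr(kn, &newattrs);
}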
| 18 19 20 23 1 1 1 12 12 19 21 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 | // SPDX-License-Identifier: GPL-2.0 #include <linux/export.h> #include <linux/lockref.h> #if USE_CMPXCHG_LOCKREF /* * Note that the "cmpxchg()" reloads the "old" value for the * failure case. */ #define CMPXCHG_LOOP(CODE, SUCCESS) do { \ int retry = 100; \ struct lockref old; \ BUILD_BUG_ON(sizeof(old) != 8); \ old.lock_count = READ_ONCE(lockref->lock_count); \ while (likely(arch_spin_value_unlocked(old.lock.rlock.raw_lock))) { \ struct lockref new = old; \ CODE \ if (likely(try_cmpxchg64_relaxed(&lockref->lock_count, \ &old.lock_count, \ new.lock_count))) { \ SUCCESS; \ } \ if (!--retry) \ break; \ } \ } while (0) #else #define CMPXCHG_LOOP(CODE, SUCCESS) do { } while (0) #endif /** * lockref_get - Increments reference count unconditionally * @lockref: pointer to lockref structure * * This operation is only valid if you already hold a reference * to the object, so you know the count cannot be zero. */ void lockref_get(struct lockref *lockref) { CMPXCHG_LOOP( new.count++; , return; ); spin_lock(&lockref->lock); lockref->count++; spin_unlock(&lockref->lock); } EXPORT_SYMBOL(lockref_get); /** * lockref_get_not_zero - Increments count unless the count is 0 or dead * @lockref: pointer to lockref structure * Return: 1 if count updated successfully or 0 if count was zero */ bool lockref_get_not_zero(struct lockref *lockref) { bool retval = false; CMPXCHG_LOOP( new.count++; if (old.count <= 0) return false; , return true; ); spin_lock(&lockref->lock); if (lockref->count > 0) { lockref->count++; retval = true; } spin_unlock(&lockref->lock); return retval; } EXPORT_SYMBOL(lockref_get_not_zero); /** * lockref_put_return - Decrement reference count if possible * @lockref: pointer to lockref structure * * Decrement the reference count and return the new value. * If the lockref was dead or locked, return -1. 
*/ int lockref_put_return(struct lockref *lockref) { CMPXCHG_LOOP( new.count--; if (old.count <= 0) return -1; , return new.count; ); return -1; } EXPORT_SYMBOL(lockref_put_return); /** * lockref_put_or_lock - decrements count unless count <= 1 before decrement * @lockref: pointer to lockref structure * Return: 1 if count updated successfully or 0 if count <= 1 and lock taken */ bool lockref_put_or_lock(struct lockref *lockref) { CMPXCHG_LOOP( new.count--; if (old.count <= 1) break; , return true; ); spin_lock(&lockref->lock); if (lockref->count <= 1) return false; lockref->count--; spin_unlock(&lockref->lock); return true; } EXPORT_SYMBOL(lockref_put_or_lock); /** * lockref_mark_dead - mark lockref dead * @lockref: pointer to lockref structure */ void lockref_mark_dead(struct lockref *lockref) { assert_spin_locked(&lockref->lock); lockref->count = -128; } EXPORT_SYMBOL(lockref_mark_dead); /** * lockref_get_not_dead - Increments count unless the ref is dead * @lockref: pointer to lockref structure * Return: 1 if count updated successfully or 0 if lockref was dead */ bool lockref_get_not_dead(struct lockref *lockref) { bool retval = false; CMPXCHG_LOOP( new.count++; if (old.count < 0) return false; , return true; ); spin_lock(&lockref->lock); if (lockref->count >= 0) { lockref->count++; retval = true; } spin_unlock(&lockref->lock); return retval; } EXPORT_SYMBOL(lockref_get_not_dead); |
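To close, here is a minimal usage sketch of the lockref API defined above, mirroring the way the dcache pairs a spinlock-protected object with a lockless fast path for get/put. The example structure and function names are assumptions for illustration.

/*
 * Usage sketch (assumed example structure): the spinlock protects the
 * object, and the count shares the same word so the common get/put
 * paths can avoid taking the lock.
 */
#include <linux/lockref.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct my_obj {
	struct lockref ref;
	/* ... payload protected by ref.lock ... */
};

static void my_obj_init(struct my_obj *o)
{
	spin_lock_init(&o->ref.lock);
	o->ref.count = 1;		/* creator holds the first reference */
}

static void my_obj_put(struct my_obj *o)
{
	/* Fast path: lockless decrement while the count stays above 1. */
	if (lockref_put_or_lock(&o->ref))
		return;

	/* Slow path: ref.lock is held and ours was the last reference. */
	lockref_mark_dead(&o->ref);
	spin_unlock(&o->ref.lock);
	kfree(o);
}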
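/*
 * Editorial sketch, not taken from the kernel tree: the file that follows
 * (security/apparmor/lsm.c) plugs AppArmor into the LSM framework.  The
 * registration pattern it relies on looks roughly like this; every
 * "example_*" name is hypothetical, and it assumes a kernel recent enough
 * that security_add_hooks() takes a struct lsm_id, as the AppArmor code
 * below does.
 */
#include <linux/lsm_hooks.h>
#include <uapi/linux/lsm.h>

/* a hook implementation: return 0 to allow, -errno to deny */
static int example_file_open(struct file *file)
{
	return 0;
}

static const struct lsm_id example_lsmid = {
	.name = "example",
	.id = LSM_ID_UNDEF,	/* real LSMs use an allocated LSM_ID_* value */
};

static struct security_hook_list example_hooks[] __ro_after_init = {
	LSM_HOOK_INIT(file_open, example_file_open),
};

static int __init example_lsm_init(void)
{
	security_add_hooks(example_hooks, ARRAY_SIZE(example_hooks),
			   &example_lsmid);
	return 0;
}

DEFINE_LSM(example) = {
	.name = "example",
	.init = example_lsm_init,
};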
// SPDX-License-Identifier: GPL-2.0-only /* * AppArmor security module * * This file contains AppArmor LSM hooks. * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2010 Canonical Ltd. */ #include <linux/lsm_hooks.h> #include <linux/moduleparam.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/ptrace.h> #include <linux/ctype.h> #include <linux/sysctl.h> #include <linux/audit.h> #include <linux/user_namespace.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/zstd.h> #include <net/sock.h> #include <uapi/linux/mount.h> #include <uapi/linux/lsm.h> #include "include/af_unix.h" #include "include/apparmor.h" #include "include/apparmorfs.h" #include "include/audit.h" #include "include/capability.h" #include "include/cred.h" #include "include/crypto.h" #include "include/file.h" #include "include/ipc.h" #include "include/net.h" #include "include/path.h" #include "include/label.h" #include "include/policy.h" #include "include/policy_ns.h" #include "include/procattr.h" #include "include/mount.h" #include "include/secid.h" /* Flag indicating whether initialization completed */ int apparmor_initialized; union aa_buffer { struct list_head list; DECLARE_FLEX_ARRAY(char, buffer); }; struct aa_local_cache { unsigned int hold; unsigned int count; struct list_head head; }; #define RESERVE_COUNT 2 static int reserve_count = RESERVE_COUNT; static int buffer_count; static LIST_HEAD(aa_global_buffers); static DEFINE_SPINLOCK(aa_buffers_lock); static DEFINE_PER_CPU(struct aa_local_cache, aa_local_buffers); /* * LSM hook functions */ /* * put the associated labels */ static void apparmor_cred_free(struct cred *cred) { aa_put_label(cred_label(cred)); set_cred_label(cred, NULL); } /* * allocate the apparmor part of blank credentials */ static int apparmor_cred_alloc_blank(struct cred *cred, gfp_t gfp) { set_cred_label(cred, NULL); return 0; } /* * prepare new cred label for modification by prepare_cred block */ static int apparmor_cred_prepare(struct cred *new, const struct cred *old, gfp_t gfp) { set_cred_label(new, aa_get_newest_label(cred_label(old))); return 0; } /* * transfer the apparmor data to a blank set of creds */ static void apparmor_cred_transfer(struct
cred *new, const struct cred *old) { set_cred_label(new, aa_get_newest_label(cred_label(old))); } static void apparmor_task_free(struct task_struct *task) { aa_free_task_ctx(task_ctx(task)); } static int apparmor_task_alloc(struct task_struct *task, u64 clone_flags) { struct aa_task_ctx *new = task_ctx(task); aa_dup_task_ctx(new, task_ctx(current)); return 0; } static int apparmor_ptrace_access_check(struct task_struct *child, unsigned int mode) { struct aa_label *tracer, *tracee; const struct cred *cred; int error; bool needput; cred = get_task_cred(child); tracee = cred_label(cred); /* ref count on cred */ tracer = __begin_current_label_crit_section(&needput); error = aa_may_ptrace(current_cred(), tracer, cred, tracee, (mode & PTRACE_MODE_READ) ? AA_PTRACE_READ : AA_PTRACE_TRACE); __end_current_label_crit_section(tracer, needput); put_cred(cred); return error; } static int apparmor_ptrace_traceme(struct task_struct *parent) { struct aa_label *tracer, *tracee; const struct cred *cred; int error; bool needput; tracee = __begin_current_label_crit_section(&needput); cred = get_task_cred(parent); tracer = cred_label(cred); /* ref count on cred */ error = aa_may_ptrace(cred, tracer, current_cred(), tracee, AA_PTRACE_TRACE); put_cred(cred); __end_current_label_crit_section(tracee, needput); return error; } /* Derived from security/commoncap.c:cap_capget */ static int apparmor_capget(const struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { struct aa_label *label; const struct cred *cred; rcu_read_lock(); cred = __task_cred(target); label = aa_get_newest_cred_label(cred); /* * cap_capget is stacked ahead of this and will * initialize effective and permitted. */ if (!unconfined(label)) { struct aa_profile *profile; struct label_it i; label_for_each_confined(i, label, profile) { kernel_cap_t allowed; allowed = aa_profile_capget(profile); *effective = cap_intersect(*effective, allowed); *permitted = cap_intersect(*permitted, allowed); } } rcu_read_unlock(); aa_put_label(label); return 0; } static int apparmor_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) { struct aa_label *label; int error = 0; label = aa_get_newest_cred_label(cred); if (!unconfined(label)) error = aa_capable(cred, label, cap, opts); aa_put_label(label); return error; } /** * common_perm - basic common permission check wrapper fn for paths * @op: operation being checked * @path: path to check permission of (NOT NULL) * @mask: requested permissions mask * @cond: conditional info for the permission request (NOT NULL) * * Returns: %0 else error code if error or permission denied */ static int common_perm(const char *op, const struct path *path, u32 mask, struct path_cond *cond) { struct aa_label *label; int error = 0; bool needput; label = __begin_current_label_crit_section(&needput); if (!unconfined(label)) error = aa_path_perm(op, current_cred(), label, path, 0, mask, cond); __end_current_label_crit_section(label, needput); return error; } /** * common_perm_cond - common permission wrapper around inode cond * @op: operation being checked * @path: location to check (NOT NULL) * @mask: requested permissions mask * * Returns: %0 else error code if error or permission denied */ static int common_perm_cond(const char *op, const struct path *path, u32 mask) { vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_idmap(path->mnt), d_backing_inode(path->dentry)); struct path_cond cond = { vfsuid_into_kuid(vfsuid), d_backing_inode(path->dentry)->i_mode }; if 
(!path_mediated_fs(path->dentry)) return 0; return common_perm(op, path, mask, &cond); } /** * common_perm_dir_dentry - common permission wrapper when path is dir, dentry * @op: operation being checked * @dir: directory of the dentry (NOT NULL) * @dentry: dentry to check (NOT NULL) * @mask: requested permissions mask * @cond: conditional info for the permission request (NOT NULL) * * Returns: %0 else error code if error or permission denied */ static int common_perm_dir_dentry(const char *op, const struct path *dir, struct dentry *dentry, u32 mask, struct path_cond *cond) { struct path path = { .mnt = dir->mnt, .dentry = dentry }; return common_perm(op, &path, mask, cond); } /** * common_perm_rm - common permission wrapper for operations doing rm * @op: operation being checked * @dir: directory that the dentry is in (NOT NULL) * @dentry: dentry being rm'd (NOT NULL) * @mask: requested permission mask * * Returns: %0 else error code if error or permission denied */ static int common_perm_rm(const char *op, const struct path *dir, struct dentry *dentry, u32 mask) { struct inode *inode = d_backing_inode(dentry); struct path_cond cond = { }; vfsuid_t vfsuid; if (!inode || !path_mediated_fs(dentry)) return 0; vfsuid = i_uid_into_vfsuid(mnt_idmap(dir->mnt), inode); cond.uid = vfsuid_into_kuid(vfsuid); cond.mode = inode->i_mode; return common_perm_dir_dentry(op, dir, dentry, mask, &cond); } /** * common_perm_create - common permission wrapper for operations doing create * @op: operation being checked * @dir: directory that dentry will be created in (NOT NULL) * @dentry: dentry to create (NOT NULL) * @mask: request permission mask * @mode: created file mode * * Returns: %0 else error code if error or permission denied */ static int common_perm_create(const char *op, const struct path *dir, struct dentry *dentry, u32 mask, umode_t mode) { struct path_cond cond = { current_fsuid(), mode }; if (!path_mediated_fs(dir->dentry)) return 0; return common_perm_dir_dentry(op, dir, dentry, mask, &cond); } static int apparmor_path_unlink(const struct path *dir, struct dentry *dentry) { return common_perm_rm(OP_UNLINK, dir, dentry, AA_MAY_DELETE); } static int apparmor_path_mkdir(const struct path *dir, struct dentry *dentry, umode_t mode) { return common_perm_create(OP_MKDIR, dir, dentry, AA_MAY_CREATE, S_IFDIR); } static int apparmor_path_rmdir(const struct path *dir, struct dentry *dentry) { return common_perm_rm(OP_RMDIR, dir, dentry, AA_MAY_DELETE); } static int apparmor_path_mknod(const struct path *dir, struct dentry *dentry, umode_t mode, unsigned int dev) { return common_perm_create(OP_MKNOD, dir, dentry, AA_MAY_CREATE, mode); } static int apparmor_path_truncate(const struct path *path) { return common_perm_cond(OP_TRUNC, path, MAY_WRITE | AA_MAY_SETATTR); } static int apparmor_file_truncate(struct file *file) { return apparmor_path_truncate(&file->f_path); } static int apparmor_path_symlink(const struct path *dir, struct dentry *dentry, const char *old_name) { return common_perm_create(OP_SYMLINK, dir, dentry, AA_MAY_CREATE, S_IFLNK); } static int apparmor_path_link(struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) { struct aa_label *label; int error = 0; if (!path_mediated_fs(old_dentry)) return 0; label = begin_current_label_crit_section(); if (!unconfined(label)) error = aa_path_link(current_cred(), label, old_dentry, new_dir, new_dentry); end_current_label_crit_section(label); return error; } static int apparmor_path_rename(const struct path *old_dir, struct dentry 
*old_dentry, const struct path *new_dir, struct dentry *new_dentry, const unsigned int flags) { struct aa_label *label; int error = 0; if (!path_mediated_fs(old_dentry)) return 0; if ((flags & RENAME_EXCHANGE) && !path_mediated_fs(new_dentry)) return 0; label = begin_current_label_crit_section(); if (!unconfined(label)) { struct mnt_idmap *idmap = mnt_idmap(old_dir->mnt); vfsuid_t vfsuid; struct path old_path = { .mnt = old_dir->mnt, .dentry = old_dentry }; struct path new_path = { .mnt = new_dir->mnt, .dentry = new_dentry }; struct path_cond cond = { .mode = d_backing_inode(old_dentry)->i_mode }; vfsuid = i_uid_into_vfsuid(idmap, d_backing_inode(old_dentry)); cond.uid = vfsuid_into_kuid(vfsuid); if (flags & RENAME_EXCHANGE) { struct path_cond cond_exchange = { .mode = d_backing_inode(new_dentry)->i_mode, }; vfsuid = i_uid_into_vfsuid(idmap, d_backing_inode(old_dentry)); cond_exchange.uid = vfsuid_into_kuid(vfsuid); error = aa_path_perm(OP_RENAME_SRC, current_cred(), label, &new_path, 0, MAY_READ | AA_MAY_GETATTR | MAY_WRITE | AA_MAY_SETATTR | AA_MAY_DELETE, &cond_exchange); if (!error) error = aa_path_perm(OP_RENAME_DEST, current_cred(), label, &old_path, 0, MAY_WRITE | AA_MAY_SETATTR | AA_MAY_CREATE, &cond_exchange); } if (!error) error = aa_path_perm(OP_RENAME_SRC, current_cred(), label, &old_path, 0, MAY_READ | AA_MAY_GETATTR | MAY_WRITE | AA_MAY_SETATTR | AA_MAY_DELETE, &cond); if (!error) error = aa_path_perm(OP_RENAME_DEST, current_cred(), label, &new_path, 0, MAY_WRITE | AA_MAY_SETATTR | AA_MAY_CREATE, &cond); } end_current_label_crit_section(label); return error; } static int apparmor_path_chmod(const struct path *path, umode_t mode) { return common_perm_cond(OP_CHMOD, path, AA_MAY_CHMOD); } static int apparmor_path_chown(const struct path *path, kuid_t uid, kgid_t gid) { return common_perm_cond(OP_CHOWN, path, AA_MAY_CHOWN); } static int apparmor_inode_getattr(const struct path *path) { return common_perm_cond(OP_GETATTR, path, AA_MAY_GETATTR); } static int apparmor_file_open(struct file *file) { struct aa_file_ctx *fctx = file_ctx(file); struct aa_label *label; int error = 0; bool needput; if (!path_mediated_fs(file->f_path.dentry)) return 0; /* If in exec, permission is handled by bprm hooks. * Cache permissions granted by the previous exec check, with * implicit read and executable mmap which are required to * actually execute the image. * * Illogically, FMODE_EXEC is in f_flags, not f_mode. 
*/ if (file->f_flags & __FMODE_EXEC) { fctx->allow = MAY_EXEC | MAY_READ | AA_EXEC_MMAP; return 0; } label = aa_get_newest_cred_label_condref(file->f_cred, &needput); if (!unconfined(label)) { struct mnt_idmap *idmap = file_mnt_idmap(file); struct inode *inode = file_inode(file); vfsuid_t vfsuid; struct path_cond cond = { .mode = inode->i_mode, }; vfsuid = i_uid_into_vfsuid(idmap, inode); cond.uid = vfsuid_into_kuid(vfsuid); error = aa_path_perm(OP_OPEN, file->f_cred, label, &file->f_path, 0, aa_map_file_to_perms(file), &cond); /* todo cache full allowed permissions set and state */ fctx->allow = aa_map_file_to_perms(file); } aa_put_label_condref(label, needput); return error; } static int apparmor_file_alloc_security(struct file *file) { struct aa_file_ctx *ctx = file_ctx(file); struct aa_label *label = begin_current_label_crit_section(); spin_lock_init(&ctx->lock); rcu_assign_pointer(ctx->label, aa_get_label(label)); end_current_label_crit_section(label); return 0; } static void apparmor_file_free_security(struct file *file) { struct aa_file_ctx *ctx = file_ctx(file); if (ctx) aa_put_label(rcu_access_pointer(ctx->label)); } static int common_file_perm(const char *op, struct file *file, u32 mask) { struct aa_label *label; int error = 0; label = begin_current_label_crit_section(); error = aa_file_perm(op, current_cred(), label, file, mask, false); end_current_label_crit_section(label); return error; } static int apparmor_file_receive(struct file *file) { return common_file_perm(OP_FRECEIVE, file, aa_map_file_to_perms(file)); } static int apparmor_file_permission(struct file *file, int mask) { return common_file_perm(OP_FPERM, file, mask); } static int apparmor_file_lock(struct file *file, unsigned int cmd) { u32 mask = AA_MAY_LOCK; if (cmd == F_WRLCK) mask |= MAY_WRITE; return common_file_perm(OP_FLOCK, file, mask); } static int common_mmap(const char *op, struct file *file, unsigned long prot, unsigned long flags) { int mask = 0; if (!file || !file_ctx(file)) return 0; if (prot & PROT_READ) mask |= MAY_READ; /* * Private mappings don't require write perms since they don't * write back to the files */ if ((prot & PROT_WRITE) && !(flags & MAP_PRIVATE)) mask |= MAY_WRITE; if (prot & PROT_EXEC) mask |= AA_EXEC_MMAP; return common_file_perm(op, file, mask); } static int apparmor_mmap_file(struct file *file, unsigned long reqprot, unsigned long prot, unsigned long flags) { return common_mmap(OP_FMMAP, file, prot, flags); } static int apparmor_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot) { return common_mmap(OP_FMPROT, vma->vm_file, prot, !(vma->vm_flags & VM_SHARED) ? 
MAP_PRIVATE : 0); } #ifdef CONFIG_IO_URING static const char *audit_uring_mask(u32 mask) { if (mask & AA_MAY_CREATE_SQPOLL) return "sqpoll"; if (mask & AA_MAY_OVERRIDE_CRED) return "override_creds"; return ""; } static void audit_uring_cb(struct audit_buffer *ab, void *va) { struct apparmor_audit_data *ad = aad_of_va(va); if (ad->request & AA_URING_PERM_MASK) { audit_log_format(ab, " requested=\"%s\"", audit_uring_mask(ad->request)); if (ad->denied & AA_URING_PERM_MASK) { audit_log_format(ab, " denied=\"%s\"", audit_uring_mask(ad->denied)); } } if (ad->uring.target) { audit_log_format(ab, " tcontext="); aa_label_xaudit(ab, labels_ns(ad->subj_label), ad->uring.target, FLAGS_NONE, GFP_ATOMIC); } } static int profile_uring(struct aa_profile *profile, u32 request, struct aa_label *new, int cap, struct apparmor_audit_data *ad) { unsigned int state; struct aa_ruleset *rules; int error = 0; AA_BUG(!profile); rules = profile->label.rules[0]; state = RULE_MEDIATES(rules, AA_CLASS_IO_URING); if (state) { struct aa_perms perms = { }; if (new) { aa_label_match(profile, rules, new, state, false, request, &perms); } else { perms = *aa_lookup_perms(rules->policy, state); } aa_apply_modes_to_perms(profile, &perms); error = aa_check_perms(profile, &perms, request, ad, audit_uring_cb); } return error; } /** * apparmor_uring_override_creds - check the requested cred override * @new: the target creds * * Check to see if the current task is allowed to override it's credentials * to service an io_uring operation. */ static int apparmor_uring_override_creds(const struct cred *new) { struct aa_profile *profile; struct aa_label *label; int error; bool needput; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_IO_URING, OP_URING_OVERRIDE); ad.uring.target = cred_label(new); label = __begin_current_label_crit_section(&needput); error = fn_for_each(label, profile, profile_uring(profile, AA_MAY_OVERRIDE_CRED, cred_label(new), CAP_SYS_ADMIN, &ad)); __end_current_label_crit_section(label, needput); return error; } /** * apparmor_uring_sqpoll - check if a io_uring polling thread can be created * * Check to see if the current task is allowed to create a new io_uring * kernel polling thread. 
*/ static int apparmor_uring_sqpoll(void) { struct aa_profile *profile; struct aa_label *label; int error; bool needput; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_IO_URING, OP_URING_SQPOLL); label = __begin_current_label_crit_section(&needput); error = fn_for_each(label, profile, profile_uring(profile, AA_MAY_CREATE_SQPOLL, NULL, CAP_SYS_ADMIN, &ad)); __end_current_label_crit_section(label, needput); return error; } #endif /* CONFIG_IO_URING */ static int apparmor_sb_mount(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data) { struct aa_label *label; int error = 0; bool needput; /* Discard magic */ if ((flags & MS_MGC_MSK) == MS_MGC_VAL) flags &= ~MS_MGC_MSK; flags &= ~AA_MS_IGNORE_MASK; label = __begin_current_label_crit_section(&needput); if (!unconfined(label)) { if (flags & MS_REMOUNT) error = aa_remount(current_cred(), label, path, flags, data); else if (flags & MS_BIND) error = aa_bind_mount(current_cred(), label, path, dev_name, flags); else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) error = aa_mount_change_type(current_cred(), label, path, flags); else if (flags & MS_MOVE) error = aa_move_mount_old(current_cred(), label, path, dev_name); else error = aa_new_mount(current_cred(), label, dev_name, path, type, flags, data); } __end_current_label_crit_section(label, needput); return error; } static int apparmor_move_mount(const struct path *from_path, const struct path *to_path) { struct aa_label *label; int error = 0; bool needput; label = __begin_current_label_crit_section(&needput); if (!unconfined(label)) error = aa_move_mount(current_cred(), label, from_path, to_path); __end_current_label_crit_section(label, needput); return error; } static int apparmor_sb_umount(struct vfsmount *mnt, int flags) { struct aa_label *label; int error = 0; bool needput; label = __begin_current_label_crit_section(&needput); if (!unconfined(label)) error = aa_umount(current_cred(), label, mnt, flags); __end_current_label_crit_section(label, needput); return error; } static int apparmor_sb_pivotroot(const struct path *old_path, const struct path *new_path) { struct aa_label *label; int error = 0; label = aa_get_current_label(); if (!unconfined(label)) error = aa_pivotroot(current_cred(), label, old_path, new_path); aa_put_label(label); return error; } static int apparmor_getselfattr(unsigned int attr, struct lsm_ctx __user *lx, u32 *size, u32 flags) { int error = -ENOENT; struct aa_task_ctx *ctx = task_ctx(current); struct aa_label *label = NULL; char *value = NULL; switch (attr) { case LSM_ATTR_CURRENT: label = aa_get_newest_label(cred_label(current_cred())); break; case LSM_ATTR_PREV: if (ctx->previous) label = aa_get_newest_label(ctx->previous); break; case LSM_ATTR_EXEC: if (ctx->onexec) label = aa_get_newest_label(ctx->onexec); break; default: error = -EOPNOTSUPP; break; } if (label) { error = aa_getprocattr(label, &value, false); if (error > 0) error = lsm_fill_user_ctx(lx, size, value, error, LSM_ID_APPARMOR, 0); kfree(value); } aa_put_label(label); if (error < 0) return error; return 1; } static int apparmor_getprocattr(struct task_struct *task, const char *name, char **value) { int error = -ENOENT; /* released below */ const struct cred *cred = get_task_cred(task); struct aa_task_ctx *ctx = task_ctx(current); struct aa_label *label = NULL; if (strcmp(name, "current") == 0) label = aa_get_newest_label(cred_label(cred)); else if (strcmp(name, "prev") == 0 && ctx->previous) label = aa_get_newest_label(ctx->previous); else 
if (strcmp(name, "exec") == 0 && ctx->onexec) label = aa_get_newest_label(ctx->onexec); else error = -EINVAL; if (label) error = aa_getprocattr(label, value, true); aa_put_label(label); put_cred(cred); return error; } static int do_setattr(u64 attr, void *value, size_t size) { char *command, *largs = NULL, *args = value; size_t arg_size; int error; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_NONE, OP_SETPROCATTR); if (size == 0) return -EINVAL; /* AppArmor requires that the buffer must be null terminated atm */ if (args[size - 1] != '\0') { /* null terminate */ largs = args = kmalloc(size + 1, GFP_KERNEL); if (!args) return -ENOMEM; memcpy(args, value, size); args[size] = '\0'; } error = -EINVAL; args = strim(args); command = strsep(&args, " "); if (!args) goto out; args = skip_spaces(args); if (!*args) goto out; arg_size = size - (args - (largs ? largs : (char *) value)); if (attr == LSM_ATTR_CURRENT) { if (strcmp(command, "changehat") == 0) { error = aa_setprocattr_changehat(args, arg_size, AA_CHANGE_NOFLAGS); } else if (strcmp(command, "permhat") == 0) { error = aa_setprocattr_changehat(args, arg_size, AA_CHANGE_TEST); } else if (strcmp(command, "changeprofile") == 0) { error = aa_change_profile(args, AA_CHANGE_NOFLAGS); } else if (strcmp(command, "permprofile") == 0) { error = aa_change_profile(args, AA_CHANGE_TEST); } else if (strcmp(command, "stack") == 0) { error = aa_change_profile(args, AA_CHANGE_STACK); } else goto fail; } else if (attr == LSM_ATTR_EXEC) { if (strcmp(command, "exec") == 0) error = aa_change_profile(args, AA_CHANGE_ONEXEC); else if (strcmp(command, "stack") == 0) error = aa_change_profile(args, (AA_CHANGE_ONEXEC | AA_CHANGE_STACK)); else goto fail; } else /* only support the "current" and "exec" process attributes */ goto fail; if (!error) error = size; out: kfree(largs); return error; fail: ad.subj_label = begin_current_label_crit_section(); if (attr == LSM_ATTR_CURRENT) ad.info = "current"; else if (attr == LSM_ATTR_EXEC) ad.info = "exec"; else ad.info = "invalid"; ad.error = error = -EINVAL; aa_audit_msg(AUDIT_APPARMOR_DENIED, &ad, NULL); end_current_label_crit_section(ad.subj_label); goto out; } static int apparmor_setselfattr(unsigned int attr, struct lsm_ctx *ctx, u32 size, u32 flags) { int rc; if (attr != LSM_ATTR_CURRENT && attr != LSM_ATTR_EXEC) return -EOPNOTSUPP; rc = do_setattr(attr, ctx->ctx, ctx->ctx_len); if (rc > 0) return 0; return rc; } static int apparmor_setprocattr(const char *name, void *value, size_t size) { int attr = lsm_name_to_attr(name); if (attr) return do_setattr(attr, value, size); return -EINVAL; } /** * apparmor_bprm_committing_creds - do task cleanup on committing new creds * @bprm: binprm for the exec (NOT NULL) */ static void apparmor_bprm_committing_creds(const struct linux_binprm *bprm) { struct aa_label *label = aa_current_raw_label(); struct aa_label *new_label = cred_label(bprm->cred); /* bail out if unconfined or not changing profile */ if ((new_label->proxy == label->proxy) || (unconfined(new_label))) return; aa_inherit_files(bprm->cred, current->files); current->pdeath_signal = 0; /* reset soft limits and set hard limits for the new label */ __aa_transition_rlimits(label, new_label); } /** * apparmor_bprm_committed_creds() - do cleanup after new creds committed * @bprm: binprm for the exec (NOT NULL) */ static void apparmor_bprm_committed_creds(const struct linux_binprm *bprm) { /* clear out temporary/transitional state from the context */ aa_clear_task_ctx_trans(task_ctx(current)); return; } static void 
apparmor_current_getlsmprop_subj(struct lsm_prop *prop) { struct aa_label *label; bool needput; label = __begin_current_label_crit_section(&needput); prop->apparmor.label = label; __end_current_label_crit_section(label, needput); } static void apparmor_task_getlsmprop_obj(struct task_struct *p, struct lsm_prop *prop) { struct aa_label *label = aa_get_task_label(p); prop->apparmor.label = label; aa_put_label(label); } static int apparmor_task_setrlimit(struct task_struct *task, unsigned int resource, struct rlimit *new_rlim) { struct aa_label *label; int error = 0; bool needput; label = __begin_current_label_crit_section(&needput); if (!unconfined(label)) error = aa_task_setrlimit(current_cred(), label, task, resource, new_rlim); __end_current_label_crit_section(label, needput); return error; } static int apparmor_task_kill(struct task_struct *target, struct kernel_siginfo *info, int sig, const struct cred *cred) { const struct cred *tc; struct aa_label *cl, *tl; int error; bool needput; tc = get_task_cred(target); tl = aa_get_newest_cred_label(tc); if (cred) { /* * Dealing with USB IO specific behavior */ cl = aa_get_newest_cred_label(cred); error = aa_may_signal(cred, cl, tc, tl, sig); aa_put_label(cl); } else { cl = __begin_current_label_crit_section(&needput); error = aa_may_signal(current_cred(), cl, tc, tl, sig); __end_current_label_crit_section(cl, needput); } aa_put_label(tl); put_cred(tc); return error; } static int apparmor_userns_create(const struct cred *cred) { struct aa_label *label; struct aa_profile *profile; int error = 0; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_TASK, AA_CLASS_NS, OP_USERNS_CREATE); ad.subj_cred = current_cred(); label = begin_current_label_crit_section(); if (!unconfined(label)) { error = fn_for_each(label, profile, aa_profile_ns_perm(profile, &ad, AA_USERNS_CREATE)); } end_current_label_crit_section(label); return error; } static int apparmor_sk_alloc_security(struct sock *sk, int family, gfp_t gfp) { struct aa_sk_ctx *ctx = aa_sock(sk); struct aa_label *label; bool needput; label = __begin_current_label_crit_section(&needput); //spin_lock_init(&ctx->lock); rcu_assign_pointer(ctx->label, aa_get_label(label)); rcu_assign_pointer(ctx->peer, NULL); rcu_assign_pointer(ctx->peer_lastupdate, NULL); __end_current_label_crit_section(label, needput); return 0; } static void apparmor_sk_free_security(struct sock *sk) { struct aa_sk_ctx *ctx = aa_sock(sk); /* dead these won't be updated any more */ aa_put_label(rcu_dereference_protected(ctx->label, true)); aa_put_label(rcu_dereference_protected(ctx->peer, true)); aa_put_label(rcu_dereference_protected(ctx->peer_lastupdate, true)); } /** * apparmor_sk_clone_security - clone the sk_security field * @sk: sock to have security cloned * @newsk: sock getting clone */ static void apparmor_sk_clone_security(const struct sock *sk, struct sock *newsk) { struct aa_sk_ctx *ctx = aa_sock(sk); struct aa_sk_ctx *new = aa_sock(newsk); /* not actually in use yet */ if (rcu_access_pointer(ctx->label) != rcu_access_pointer(new->label)) { aa_put_label(rcu_dereference_protected(new->label, true)); rcu_assign_pointer(new->label, aa_get_label_rcu(&ctx->label)); } if (rcu_access_pointer(ctx->peer) != rcu_access_pointer(new->peer)) { aa_put_label(rcu_dereference_protected(new->peer, true)); rcu_assign_pointer(new->peer, aa_get_label_rcu(&ctx->peer)); } if (rcu_access_pointer(ctx->peer_lastupdate) != rcu_access_pointer(new->peer_lastupdate)) { aa_put_label(rcu_dereference_protected(new->peer_lastupdate, true)); 
rcu_assign_pointer(new->peer_lastupdate, aa_get_label_rcu(&ctx->peer_lastupdate)); } } static int unix_connect_perm(const struct cred *cred, struct aa_label *label, struct sock *sk, struct sock *peer_sk) { struct aa_sk_ctx *peer_ctx = aa_sock(peer_sk); int error; error = aa_unix_peer_perm(cred, label, OP_CONNECT, (AA_MAY_CONNECT | AA_MAY_SEND | AA_MAY_RECEIVE), sk, peer_sk, rcu_dereference_protected(peer_ctx->label, lockdep_is_held(&unix_sk(peer_sk)->lock))); if (!is_unix_fs(peer_sk)) { last_error(error, aa_unix_peer_perm(cred, rcu_dereference_protected(peer_ctx->label, lockdep_is_held(&unix_sk(peer_sk)->lock)), OP_CONNECT, (AA_MAY_ACCEPT | AA_MAY_SEND | AA_MAY_RECEIVE), peer_sk, sk, label)); } return error; } /* lockdep check in unix_connect_perm - push sks here to check */ static void unix_connect_peers(struct aa_sk_ctx *sk_ctx, struct aa_sk_ctx *peer_ctx) { /* Cross reference the peer labels for SO_PEERSEC */ struct aa_label *label = rcu_dereference_protected(sk_ctx->label, true); aa_get_label(label); aa_put_label(rcu_dereference_protected(peer_ctx->peer, true)); rcu_assign_pointer(peer_ctx->peer, label); /* transfer cnt */ label = aa_get_label(rcu_dereference_protected(peer_ctx->label, true)); //spin_unlock(&peer_ctx->lock); //spin_lock(&sk_ctx->lock); aa_put_label(rcu_dereference_protected(sk_ctx->peer, true)); aa_put_label(rcu_dereference_protected(sk_ctx->peer_lastupdate, true)); rcu_assign_pointer(sk_ctx->peer, aa_get_label(label)); rcu_assign_pointer(sk_ctx->peer_lastupdate, label); /* transfer cnt */ //spin_unlock(&sk_ctx->lock); } /** * apparmor_unix_stream_connect - check perms before making unix domain conn * @sk: sk attempting to connect * @peer_sk: sk that is accepting the connection * @newsk: new sk created for this connection * peer is locked when this hook is called * * Return: * 0 if connection is permitted * error code on denial or failure */ static int apparmor_unix_stream_connect(struct sock *sk, struct sock *peer_sk, struct sock *newsk) { struct aa_sk_ctx *sk_ctx = aa_sock(sk); struct aa_sk_ctx *peer_ctx = aa_sock(peer_sk); struct aa_sk_ctx *new_ctx = aa_sock(newsk); struct aa_label *label; int error; bool needput; label = __begin_current_label_crit_section(&needput); error = unix_connect_perm(current_cred(), label, sk, peer_sk); __end_current_label_crit_section(label, needput); if (error) return error; /* newsk doesn't go through post_create, but does go through * security_sk_alloc() */ rcu_assign_pointer(new_ctx->label, aa_get_label(rcu_dereference_protected(peer_ctx->label, true))); /* Cross reference the peer labels for SO_PEERSEC */ unix_connect_peers(sk_ctx, new_ctx); return 0; } /** * apparmor_unix_may_send - check perms before conn or sending unix dgrams * @sock: socket sending the message * @peer: socket message is being send to * * Performs bidirectional permission checks for Unix domain socket communication: * 1. Verifies sender has AA_MAY_SEND to target socket * 2. 
Verifies receiver has AA_MAY_RECEIVE from source socket * * sock and peer are locked when this hook is called * called by: dgram_connect peer setup but path not copied to newsk * * Return: * 0 if transmission is permitted * error code on denial or failure */ static int apparmor_unix_may_send(struct socket *sock, struct socket *peer) { struct aa_sk_ctx *peer_ctx = aa_sock(peer->sk); struct aa_label *label; int error; bool needput; label = __begin_current_label_crit_section(&needput); error = xcheck(aa_unix_peer_perm(current_cred(), label, OP_SENDMSG, AA_MAY_SEND, sock->sk, peer->sk, rcu_dereference_protected(peer_ctx->label, true)), aa_unix_peer_perm(peer->file ? peer->file->f_cred : NULL, rcu_dereference_protected(peer_ctx->label, true), OP_SENDMSG, AA_MAY_RECEIVE, peer->sk, sock->sk, label)); __end_current_label_crit_section(label, needput); return error; } static int apparmor_socket_create(int family, int type, int protocol, int kern) { struct aa_label *label; int error = 0; AA_BUG(in_interrupt()); if (kern) return 0; label = begin_current_label_crit_section(); if (!unconfined(label)) { if (family == PF_UNIX) error = aa_unix_create_perm(label, family, type, protocol); else error = aa_af_perm(current_cred(), label, OP_CREATE, AA_MAY_CREATE, family, type, protocol); } end_current_label_crit_section(label); return error; } /** * apparmor_socket_post_create - setup the per-socket security struct * @sock: socket that is being setup * @family: family of socket being created * @type: type of the socket * @protocol: protocol of the socket * @kern: socket is a special kernel socket * * Note: * - kernel sockets labeled kernel_t used to use unconfined * - socket may not have sk here if created with sock_create_lite or * sock_alloc. These should be accept cases which will be handled in * sock_graft. */ static int apparmor_socket_post_create(struct socket *sock, int family, int type, int protocol, int kern) { struct aa_label *label; if (kern) { label = aa_get_label(kernel_t); } else label = aa_get_current_label(); if (sock->sk) { struct aa_sk_ctx *ctx = aa_sock(sock->sk); /* still not live */ aa_put_label(rcu_dereference_protected(ctx->label, true)); rcu_assign_pointer(ctx->label, aa_get_label(label)); } aa_put_label(label); return 0; } static int apparmor_socket_socketpair(struct socket *socka, struct socket *sockb) { struct aa_sk_ctx *a_ctx = aa_sock(socka->sk); struct aa_sk_ctx *b_ctx = aa_sock(sockb->sk); struct aa_label *label; /* socks not live yet - initial values set in sk_alloc */ label = begin_current_label_crit_section(); if (rcu_access_pointer(a_ctx->label) != label) { AA_BUG("a_ctx != label"); aa_put_label(rcu_dereference_protected(a_ctx->label, true)); rcu_assign_pointer(a_ctx->label, aa_get_label(label)); } if (rcu_access_pointer(b_ctx->label) != label) { AA_BUG("b_ctx != label"); aa_put_label(rcu_dereference_protected(b_ctx->label, true)); rcu_assign_pointer(b_ctx->label, aa_get_label(label)); } if (socka->sk->sk_family == PF_UNIX) { /* unix socket pairs by-pass unix_stream_connect */ unix_connect_peers(a_ctx, b_ctx); } end_current_label_crit_section(label); return 0; } /** * apparmor_socket_bind - check perms before bind addr to socket * @sock: socket to bind the address to (must be non-NULL) * @address: address that is being bound (must be non-NULL) * @addrlen: length of @address * * Performs security checks before allowing a socket to bind to an address. * Handles Unix domain sockets specially through aa_unix_bind_perm(). 
* For other socket families, uses generic permission check via aa_sk_perm(). * * Return: * 0 if binding is permitted * error code on denial or invalid parameters */ static int apparmor_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(!address); AA_BUG(in_interrupt()); if (sock->sk->sk_family == PF_UNIX) return aa_unix_bind_perm(sock, address, addrlen); return aa_sk_perm(OP_BIND, AA_MAY_BIND, sock->sk); } static int apparmor_socket_connect(struct socket *sock, struct sockaddr *address, int addrlen) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(!address); AA_BUG(in_interrupt()); /* PF_UNIX goes through unix_stream_connect && unix_may_send */ if (sock->sk->sk_family == PF_UNIX) return 0; return aa_sk_perm(OP_CONNECT, AA_MAY_CONNECT, sock->sk); } static int apparmor_socket_listen(struct socket *sock, int backlog) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(in_interrupt()); if (sock->sk->sk_family == PF_UNIX) return aa_unix_listen_perm(sock, backlog); return aa_sk_perm(OP_LISTEN, AA_MAY_LISTEN, sock->sk); } /* * Note: while @newsock is created and has some information, the accept * has not been done. */ static int apparmor_socket_accept(struct socket *sock, struct socket *newsock) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(!newsock); AA_BUG(in_interrupt()); if (sock->sk->sk_family == PF_UNIX) return aa_unix_accept_perm(sock, newsock); return aa_sk_perm(OP_ACCEPT, AA_MAY_ACCEPT, sock->sk); } static int aa_sock_msg_perm(const char *op, u32 request, struct socket *sock, struct msghdr *msg, int size) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(!msg); AA_BUG(in_interrupt()); /* PF_UNIX goes through unix_may_send */ if (sock->sk->sk_family == PF_UNIX) return 0; return aa_sk_perm(op, request, sock->sk); } static int apparmor_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size) { return aa_sock_msg_perm(OP_SENDMSG, AA_MAY_SEND, sock, msg, size); } static int apparmor_socket_recvmsg(struct socket *sock, struct msghdr *msg, int size, int flags) { return aa_sock_msg_perm(OP_RECVMSG, AA_MAY_RECEIVE, sock, msg, size); } /* revaliation, get/set attr, shutdown */ static int aa_sock_perm(const char *op, u32 request, struct socket *sock) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(in_interrupt()); if (sock->sk->sk_family == PF_UNIX) return aa_unix_sock_perm(op, request, sock); return aa_sk_perm(op, request, sock->sk); } static int apparmor_socket_getsockname(struct socket *sock) { return aa_sock_perm(OP_GETSOCKNAME, AA_MAY_GETATTR, sock); } static int apparmor_socket_getpeername(struct socket *sock) { return aa_sock_perm(OP_GETPEERNAME, AA_MAY_GETATTR, sock); } /* revaliation, get/set attr, opt */ static int aa_sock_opt_perm(const char *op, u32 request, struct socket *sock, int level, int optname) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(in_interrupt()); if (sock->sk->sk_family == PF_UNIX) return aa_unix_opt_perm(op, request, sock, level, optname); return aa_sk_perm(op, request, sock->sk); } static int apparmor_socket_getsockopt(struct socket *sock, int level, int optname) { return aa_sock_opt_perm(OP_GETSOCKOPT, AA_MAY_GETOPT, sock, level, optname); } static int apparmor_socket_setsockopt(struct socket *sock, int level, int optname) { return aa_sock_opt_perm(OP_SETSOCKOPT, AA_MAY_SETOPT, sock, level, optname); } static int apparmor_socket_shutdown(struct socket *sock, int how) { return aa_sock_perm(OP_SHUTDOWN, AA_MAY_SHUTDOWN, sock); } #ifdef CONFIG_NETWORK_SECMARK /** * apparmor_socket_sock_rcv_skb - check perms before associating skb 
to sk * @sk: sk to associate @skb with * @skb: skb to check for perms * * Note: can not sleep may be called with locks held * * dont want protocol specific in __skb_recv_datagram() * to deny an incoming connection socket_sock_rcv_skb() */ static int apparmor_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct aa_sk_ctx *ctx = aa_sock(sk); int error; if (!skb->secmark) return 0; /* * If reach here before socket_post_create hook is called, in which * case label is null, drop the packet. */ if (!rcu_access_pointer(ctx->label)) return -EACCES; rcu_read_lock(); error = apparmor_secmark_check(rcu_dereference(ctx->label), OP_RECVMSG, AA_MAY_RECEIVE, skb->secmark, sk); rcu_read_unlock(); return error; } #endif static struct aa_label *sk_peer_get_label(struct sock *sk) { struct aa_sk_ctx *ctx = aa_sock(sk); struct aa_label *label = ERR_PTR(-ENOPROTOOPT); if (rcu_access_pointer(ctx->peer)) return aa_get_label_rcu(&ctx->peer); if (sk->sk_family != PF_UNIX) return ERR_PTR(-ENOPROTOOPT); return label; } /** * apparmor_socket_getpeersec_stream - get security context of peer * @sock: socket that we are trying to get the peer context of * @optval: output - buffer to copy peer name to * @optlen: output - size of copied name in @optval * @len: size of @optval buffer * Returns: 0 on success, -errno of failure * * Note: for tcp only valid if using ipsec or cipso on lan */ static int apparmor_socket_getpeersec_stream(struct socket *sock, sockptr_t optval, sockptr_t optlen, unsigned int len) { char *name = NULL; int slen, error = 0; struct aa_label *label; struct aa_label *peer; peer = sk_peer_get_label(sock->sk); if (IS_ERR(peer)) { error = PTR_ERR(peer); goto done; } label = begin_current_label_crit_section(); slen = aa_label_asxprint(&name, labels_ns(label), peer, FLAG_SHOW_MODE | FLAG_VIEW_SUBNS | FLAG_HIDDEN_UNCONFINED, GFP_KERNEL); /* don't include terminating \0 in slen, it breaks some apps */ if (slen < 0) { error = -ENOMEM; goto done_put; } if (slen > len) { error = -ERANGE; goto done_len; } if (copy_to_sockptr(optval, name, slen)) error = -EFAULT; done_len: if (copy_to_sockptr(optlen, &slen, sizeof(slen))) error = -EFAULT; done_put: end_current_label_crit_section(label); aa_put_label(peer); done: kfree(name); return error; } /** * apparmor_socket_getpeersec_dgram - get security label of packet * @sock: the peer socket * @skb: packet data * @secid: pointer to where to put the secid of the packet * * Sets the netlabel socket state on sk from parent */ static int apparmor_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) { /* TODO: requires secid support */ return -ENOPROTOOPT; } /** * apparmor_sock_graft - Initialize newly created socket * @sk: child sock * @parent: parent socket * * Note: could set off of SOCK_CTX(parent) but need to track inode and we can * just set sk security information off of current creating process label * Labeling of sk for accept case - probably should be sock based * instead of task, because of the case where an implicitly labeled * socket is shared by different tasks. 
*/ static void apparmor_sock_graft(struct sock *sk, struct socket *parent) { struct aa_sk_ctx *ctx = aa_sock(sk); /* setup - not live */ if (!rcu_access_pointer(ctx->label)) rcu_assign_pointer(ctx->label, aa_get_current_label()); } #ifdef CONFIG_NETWORK_SECMARK static int apparmor_inet_conn_request(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { struct aa_sk_ctx *ctx = aa_sock(sk); int error; if (!skb->secmark) return 0; rcu_read_lock(); error = apparmor_secmark_check(rcu_dereference(ctx->label), OP_CONNECT, AA_MAY_CONNECT, skb->secmark, sk); rcu_read_unlock(); return error; } #endif /* * The cred blob is a pointer to, not an instance of, an aa_label. */ struct lsm_blob_sizes apparmor_blob_sizes __ro_after_init = { .lbs_cred = sizeof(struct aa_label *), .lbs_file = sizeof(struct aa_file_ctx), .lbs_task = sizeof(struct aa_task_ctx), .lbs_sock = sizeof(struct aa_sk_ctx), }; static const struct lsm_id apparmor_lsmid = { .name = "apparmor", .id = LSM_ID_APPARMOR, }; static struct security_hook_list apparmor_hooks[] __ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, apparmor_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, apparmor_ptrace_traceme), LSM_HOOK_INIT(capget, apparmor_capget), LSM_HOOK_INIT(capable, apparmor_capable), LSM_HOOK_INIT(move_mount, apparmor_move_mount), LSM_HOOK_INIT(sb_mount, apparmor_sb_mount), LSM_HOOK_INIT(sb_umount, apparmor_sb_umount), LSM_HOOK_INIT(sb_pivotroot, apparmor_sb_pivotroot), LSM_HOOK_INIT(path_link, apparmor_path_link), LSM_HOOK_INIT(path_unlink, apparmor_path_unlink), LSM_HOOK_INIT(path_symlink, apparmor_path_symlink), LSM_HOOK_INIT(path_mkdir, apparmor_path_mkdir), LSM_HOOK_INIT(path_rmdir, apparmor_path_rmdir), LSM_HOOK_INIT(path_mknod, apparmor_path_mknod), LSM_HOOK_INIT(path_rename, apparmor_path_rename), LSM_HOOK_INIT(path_chmod, apparmor_path_chmod), LSM_HOOK_INIT(path_chown, apparmor_path_chown), LSM_HOOK_INIT(path_truncate, apparmor_path_truncate), LSM_HOOK_INIT(inode_getattr, apparmor_inode_getattr), LSM_HOOK_INIT(file_open, apparmor_file_open), LSM_HOOK_INIT(file_receive, apparmor_file_receive), LSM_HOOK_INIT(file_permission, apparmor_file_permission), LSM_HOOK_INIT(file_alloc_security, apparmor_file_alloc_security), LSM_HOOK_INIT(file_free_security, apparmor_file_free_security), LSM_HOOK_INIT(mmap_file, apparmor_mmap_file), LSM_HOOK_INIT(file_mprotect, apparmor_file_mprotect), LSM_HOOK_INIT(file_lock, apparmor_file_lock), LSM_HOOK_INIT(file_truncate, apparmor_file_truncate), LSM_HOOK_INIT(getselfattr, apparmor_getselfattr), LSM_HOOK_INIT(setselfattr, apparmor_setselfattr), LSM_HOOK_INIT(getprocattr, apparmor_getprocattr), LSM_HOOK_INIT(setprocattr, apparmor_setprocattr), LSM_HOOK_INIT(sk_alloc_security, apparmor_sk_alloc_security), LSM_HOOK_INIT(sk_free_security, apparmor_sk_free_security), LSM_HOOK_INIT(sk_clone_security, apparmor_sk_clone_security), LSM_HOOK_INIT(unix_stream_connect, apparmor_unix_stream_connect), LSM_HOOK_INIT(unix_may_send, apparmor_unix_may_send), LSM_HOOK_INIT(socket_create, apparmor_socket_create), LSM_HOOK_INIT(socket_post_create, apparmor_socket_post_create), LSM_HOOK_INIT(socket_socketpair, apparmor_socket_socketpair), LSM_HOOK_INIT(socket_bind, apparmor_socket_bind), LSM_HOOK_INIT(socket_connect, apparmor_socket_connect), LSM_HOOK_INIT(socket_listen, apparmor_socket_listen), LSM_HOOK_INIT(socket_accept, apparmor_socket_accept), LSM_HOOK_INIT(socket_sendmsg, apparmor_socket_sendmsg), LSM_HOOK_INIT(socket_recvmsg, apparmor_socket_recvmsg), LSM_HOOK_INIT(socket_getsockname, 
apparmor_socket_getsockname), LSM_HOOK_INIT(socket_getpeername, apparmor_socket_getpeername), LSM_HOOK_INIT(socket_getsockopt, apparmor_socket_getsockopt), LSM_HOOK_INIT(socket_setsockopt, apparmor_socket_setsockopt), LSM_HOOK_INIT(socket_shutdown, apparmor_socket_shutdown), #ifdef CONFIG_NETWORK_SECMARK LSM_HOOK_INIT(socket_sock_rcv_skb, apparmor_socket_sock_rcv_skb), #endif LSM_HOOK_INIT(socket_getpeersec_stream, apparmor_socket_getpeersec_stream), LSM_HOOK_INIT(socket_getpeersec_dgram, apparmor_socket_getpeersec_dgram), LSM_HOOK_INIT(sock_graft, apparmor_sock_graft), #ifdef CONFIG_NETWORK_SECMARK LSM_HOOK_INIT(inet_conn_request, apparmor_inet_conn_request), #endif LSM_HOOK_INIT(cred_alloc_blank, apparmor_cred_alloc_blank), LSM_HOOK_INIT(cred_free, apparmor_cred_free), LSM_HOOK_INIT(cred_prepare, apparmor_cred_prepare), LSM_HOOK_INIT(cred_transfer, apparmor_cred_transfer), LSM_HOOK_INIT(bprm_creds_for_exec, apparmor_bprm_creds_for_exec), LSM_HOOK_INIT(bprm_committing_creds, apparmor_bprm_committing_creds), LSM_HOOK_INIT(bprm_committed_creds, apparmor_bprm_committed_creds), LSM_HOOK_INIT(task_free, apparmor_task_free), LSM_HOOK_INIT(task_alloc, apparmor_task_alloc), LSM_HOOK_INIT(current_getlsmprop_subj, apparmor_current_getlsmprop_subj), LSM_HOOK_INIT(task_getlsmprop_obj, apparmor_task_getlsmprop_obj), LSM_HOOK_INIT(task_setrlimit, apparmor_task_setrlimit), LSM_HOOK_INIT(task_kill, apparmor_task_kill), LSM_HOOK_INIT(userns_create, apparmor_userns_create), #ifdef CONFIG_AUDIT LSM_HOOK_INIT(audit_rule_init, aa_audit_rule_init), LSM_HOOK_INIT(audit_rule_known, aa_audit_rule_known), LSM_HOOK_INIT(audit_rule_match, aa_audit_rule_match), LSM_HOOK_INIT(audit_rule_free, aa_audit_rule_free), #endif LSM_HOOK_INIT(secid_to_secctx, apparmor_secid_to_secctx), LSM_HOOK_INIT(lsmprop_to_secctx, apparmor_lsmprop_to_secctx), LSM_HOOK_INIT(secctx_to_secid, apparmor_secctx_to_secid), LSM_HOOK_INIT(release_secctx, apparmor_release_secctx), #ifdef CONFIG_IO_URING LSM_HOOK_INIT(uring_override_creds, apparmor_uring_override_creds), LSM_HOOK_INIT(uring_sqpoll, apparmor_uring_sqpoll), #endif }; /* * AppArmor sysfs module parameters */ static int param_set_aabool(const char *val, const struct kernel_param *kp); static int param_get_aabool(char *buffer, const struct kernel_param *kp); #define param_check_aabool param_check_bool static const struct kernel_param_ops param_ops_aabool = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_aabool, .get = param_get_aabool }; static int param_set_aauint(const char *val, const struct kernel_param *kp); static int param_get_aauint(char *buffer, const struct kernel_param *kp); #define param_check_aauint param_check_uint static const struct kernel_param_ops param_ops_aauint = { .set = param_set_aauint, .get = param_get_aauint }; static int param_set_aacompressionlevel(const char *val, const struct kernel_param *kp); static int param_get_aacompressionlevel(char *buffer, const struct kernel_param *kp); #define param_check_aacompressionlevel param_check_int static const struct kernel_param_ops param_ops_aacompressionlevel = { .set = param_set_aacompressionlevel, .get = param_get_aacompressionlevel }; static int param_set_aalockpolicy(const char *val, const struct kernel_param *kp); static int param_get_aalockpolicy(char *buffer, const struct kernel_param *kp); #define param_check_aalockpolicy param_check_bool static const struct kernel_param_ops param_ops_aalockpolicy = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_aalockpolicy, .get = param_get_aalockpolicy }; static 
int param_set_debug(const char *val, const struct kernel_param *kp); static int param_get_debug(char *buffer, const struct kernel_param *kp); static int param_set_audit(const char *val, const struct kernel_param *kp); static int param_get_audit(char *buffer, const struct kernel_param *kp); static int param_set_mode(const char *val, const struct kernel_param *kp); static int param_get_mode(char *buffer, const struct kernel_param *kp); /* Flag values, also controllable via /sys/module/apparmor/parameters * We define special types as we want to do additional mediation. */ /* AppArmor global enforcement switch - complain, enforce, kill */ enum profile_mode aa_g_profile_mode = APPARMOR_ENFORCE; module_param_call(mode, param_set_mode, param_get_mode, &aa_g_profile_mode, S_IRUSR | S_IWUSR); /* whether policy verification hashing is enabled */ bool aa_g_hash_policy = IS_ENABLED(CONFIG_SECURITY_APPARMOR_HASH_DEFAULT); #ifdef CONFIG_SECURITY_APPARMOR_HASH module_param_named(hash_policy, aa_g_hash_policy, aabool, S_IRUSR | S_IWUSR); #endif /* whether policy exactly as loaded is retained for debug and checkpointing */ bool aa_g_export_binary = IS_ENABLED(CONFIG_SECURITY_APPARMOR_EXPORT_BINARY); #ifdef CONFIG_SECURITY_APPARMOR_EXPORT_BINARY module_param_named(export_binary, aa_g_export_binary, aabool, 0600); #endif /* policy loaddata compression level */ int aa_g_rawdata_compression_level = AA_DEFAULT_CLEVEL; module_param_named(rawdata_compression_level, aa_g_rawdata_compression_level, aacompressionlevel, 0400); /* Debug mode */ int aa_g_debug; module_param_call(debug, param_set_debug, param_get_debug, &aa_g_debug, 0600); /* Audit mode */ enum audit_mode aa_g_audit; module_param_call(audit, param_set_audit, param_get_audit, &aa_g_audit, S_IRUSR | S_IWUSR); /* Determines if audit header is included in audited messages. This * provides more context if the audit daemon is not running */ bool aa_g_audit_header = true; module_param_named(audit_header, aa_g_audit_header, aabool, S_IRUSR | S_IWUSR); /* lock out loading/removal of policy * TODO: add in at boot loading of policy, which is the only way to * load policy, if lock_policy is set */ bool aa_g_lock_policy; module_param_named(lock_policy, aa_g_lock_policy, aalockpolicy, S_IRUSR | S_IWUSR); /* Syscall logging mode */ bool aa_g_logsyscall; module_param_named(logsyscall, aa_g_logsyscall, aabool, S_IRUSR | S_IWUSR); /* Maximum pathname length before accesses will start getting rejected */ unsigned int aa_g_path_max = 2 * PATH_MAX; module_param_named(path_max, aa_g_path_max, aauint, S_IRUSR); /* Determines how paranoid loading of policy is and how much verification * on the loaded policy is done. * DEPRECATED: read only as strict checking of load is always done now * that none root users (user namespaces) can load policy. 
*/ bool aa_g_paranoid_load = IS_ENABLED(CONFIG_SECURITY_APPARMOR_PARANOID_LOAD); module_param_named(paranoid_load, aa_g_paranoid_load, aabool, S_IRUGO); static int param_get_aaintbool(char *buffer, const struct kernel_param *kp); static int param_set_aaintbool(const char *val, const struct kernel_param *kp); #define param_check_aaintbool param_check_int static const struct kernel_param_ops param_ops_aaintbool = { .set = param_set_aaintbool, .get = param_get_aaintbool }; /* Boot time disable flag */ static int apparmor_enabled __ro_after_init = 1; module_param_named(enabled, apparmor_enabled, aaintbool, 0444); static int __init apparmor_enabled_setup(char *str) { unsigned long enabled; int error = kstrtoul(str, 0, &enabled); if (!error) apparmor_enabled = enabled ? 1 : 0; return 1; } __setup("apparmor=", apparmor_enabled_setup); /* set global flag turning off the ability to load policy */ static int param_set_aalockpolicy(const char *val, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; return param_set_bool(val, kp); } static int param_get_aalockpolicy(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return param_get_bool(buffer, kp); } static int param_set_aabool(const char *val, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; return param_set_bool(val, kp); } static int param_get_aabool(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return param_get_bool(buffer, kp); } static int param_set_aauint(const char *val, const struct kernel_param *kp) { int error; if (!apparmor_enabled) return -EINVAL; /* file is ro but enforce 2nd line check */ if (apparmor_initialized) return -EPERM; error = param_set_uint(val, kp); aa_g_path_max = max_t(uint32_t, aa_g_path_max, sizeof(union aa_buffer)); pr_info("AppArmor: buffer size set to %d bytes\n", aa_g_path_max); return error; } static int param_get_aauint(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return param_get_uint(buffer, kp); } /* Can only be set before AppArmor is initialized (i.e. on boot cmdline). */ static int param_set_aaintbool(const char *val, const struct kernel_param *kp) { struct kernel_param kp_local; bool value; int error; if (apparmor_initialized) return -EPERM; /* Create local copy, with arg pointing to bool type. */ value = !!*((int *)kp->arg); memcpy(&kp_local, kp, sizeof(kp_local)); kp_local.arg = &value; error = param_set_bool(val, &kp_local); if (!error) *((int *)kp->arg) = *((bool *)kp_local.arg); return error; } /* * To avoid changing /sys/module/apparmor/parameters/enabled from Y/N to * 1/0, this converts the "int that is actually bool" back to bool for * display in the /sys filesystem, while keeping it "int" for the LSM * infrastructure. */ static int param_get_aaintbool(char *buffer, const struct kernel_param *kp) { struct kernel_param kp_local; bool value; /* Create local copy, with arg pointing to bool type. 
*/ value = !!*((int *)kp->arg); memcpy(&kp_local, kp, sizeof(kp_local)); kp_local.arg = &value; return param_get_bool(buffer, &kp_local); } static int param_set_aacompressionlevel(const char *val, const struct kernel_param *kp) { int error; if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized) return -EPERM; error = param_set_int(val, kp); aa_g_rawdata_compression_level = clamp(aa_g_rawdata_compression_level, AA_MIN_CLEVEL, AA_MAX_CLEVEL); pr_info("AppArmor: policy rawdata compression level set to %d\n", aa_g_rawdata_compression_level); return error; } static int param_get_aacompressionlevel(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return param_get_int(buffer, kp); } static int param_get_debug(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return aa_print_debug_params(buffer); } static int param_set_debug(const char *val, const struct kernel_param *kp) { int i; if (!apparmor_enabled) return -EINVAL; if (!val) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; i = aa_parse_debug_params(val); if (i == DEBUG_PARSE_ERROR) return -EINVAL; aa_g_debug = i; return 0; } static int param_get_audit(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return sprintf(buffer, "%s", audit_mode_names[aa_g_audit]); } static int param_set_audit(const char *val, const struct kernel_param *kp) { int i; if (!apparmor_enabled) return -EINVAL; if (!val) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; i = match_string(audit_mode_names, AUDIT_MAX_INDEX, val); if (i < 0) return -EINVAL; aa_g_audit = i; return 0; } static int param_get_mode(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return sprintf(buffer, "%s", aa_profile_mode_names[aa_g_profile_mode]); } static int param_set_mode(const char *val, const struct kernel_param *kp) { int i; if (!apparmor_enabled) return -EINVAL; if (!val) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; i = match_string(aa_profile_mode_names, APPARMOR_MODE_NAMES_MAX_INDEX, val); if (i < 0) return -EINVAL; aa_g_profile_mode = i; return 0; } /* arbitrary cap on how long to hold buffer because contention was * encountered before trying to put it back into the global pool */ #define MAX_HOLD_COUNT 64 /* the hold count is a heuristic for lock contention, and can be * incremented async to actual buffer alloc/free. Because buffers * may be put back onto a percpu cache different than the ->hold was * added to the counts can be out of sync. 
Guard against underflow * and overflow */ static void cache_hold_inc(unsigned int *hold) { if (*hold < MAX_HOLD_COUNT) (*hold)++; } char *aa_get_buffer(bool in_atomic) { union aa_buffer *aa_buf; struct aa_local_cache *cache; bool try_again = true; gfp_t flags = (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN); /* use per cpu cached buffers first */ cache = get_cpu_ptr(&aa_local_buffers); if (!list_empty(&cache->head)) { aa_buf = list_first_entry(&cache->head, union aa_buffer, list); list_del(&aa_buf->list); if (cache->hold) cache->hold--; cache->count--; put_cpu_ptr(&aa_local_buffers); return &aa_buf->buffer[0]; } /* exit percpu as spinlocks may sleep on realtime kernels */ put_cpu_ptr(&aa_local_buffers); if (!spin_trylock(&aa_buffers_lock)) { /* had contention on lock so increase hold count. Doesn't * really matter if recorded before or after the spin lock * as there is no way to guarantee the buffer will be put * back on the same percpu cache. Instead rely on holds * roughly averaging out over time. */ cache = get_cpu_ptr(&aa_local_buffers); cache_hold_inc(&cache->hold); put_cpu_ptr(&aa_local_buffers); spin_lock(&aa_buffers_lock); } retry: if (buffer_count > reserve_count || (in_atomic && !list_empty(&aa_global_buffers))) { aa_buf = list_first_entry(&aa_global_buffers, union aa_buffer, list); list_del(&aa_buf->list); buffer_count--; spin_unlock(&aa_buffers_lock); return aa_buf->buffer; } if (in_atomic) { /* * out of reserve buffers and in atomic context so increase * how many buffers to keep in reserve */ reserve_count++; flags = GFP_ATOMIC; } spin_unlock(&aa_buffers_lock); if (!in_atomic) might_sleep(); aa_buf = kmalloc(aa_g_path_max, flags); if (!aa_buf) { if (try_again) { try_again = false; spin_lock(&aa_buffers_lock); goto retry; } pr_warn_once("AppArmor: Failed to allocate a memory buffer.\n"); return NULL; } return aa_buf->buffer; } void aa_put_buffer(char *buf) { union aa_buffer *aa_buf; struct aa_local_cache *cache; if (!buf) return; aa_buf = container_of(buf, union aa_buffer, buffer[0]); cache = get_cpu_ptr(&aa_local_buffers); if (!cache->hold) { put_cpu_ptr(&aa_local_buffers); if (spin_trylock(&aa_buffers_lock)) { /* put back on global list */ list_add(&aa_buf->list, &aa_global_buffers); buffer_count++; spin_unlock(&aa_buffers_lock); return; } /* contention on global list, fallback to percpu */ cache = get_cpu_ptr(&aa_local_buffers); cache_hold_inc(&cache->hold); } /* cache in percpu list */ list_add(&aa_buf->list, &cache->head); cache->count++; put_cpu_ptr(&aa_local_buffers); } /* * AppArmor init functions */ /** * set_init_ctx - set a task context and profile on the first task.
* * TODO: allow setting an alternate profile than unconfined */ static int __init set_init_ctx(void) { struct cred *cred = (__force struct cred *)current->real_cred; set_cred_label(cred, aa_get_label(ns_unconfined(root_ns))); return 0; } static void destroy_buffers(void) { union aa_buffer *aa_buf; spin_lock(&aa_buffers_lock); while (!list_empty(&aa_global_buffers)) { aa_buf = list_first_entry(&aa_global_buffers, union aa_buffer, list); list_del(&aa_buf->list); spin_unlock(&aa_buffers_lock); kfree(aa_buf); spin_lock(&aa_buffers_lock); } spin_unlock(&aa_buffers_lock); } static int __init alloc_buffers(void) { union aa_buffer *aa_buf; int i, num; /* * per cpu set of cached allocated buffers used to help reduce * lock contention */ for_each_possible_cpu(i) { per_cpu(aa_local_buffers, i).hold = 0; per_cpu(aa_local_buffers, i).count = 0; INIT_LIST_HEAD(&per_cpu(aa_local_buffers, i).head); } /* * A function may require two buffers at once. Usually the buffers are * used for a short period of time and are shared. On UP kernel buffers * two should be enough, with more CPUs it is possible that more * buffers will be used simultaneously. The preallocated pool may grow. * This preallocation has also the side-effect that AppArmor will be * disabled early at boot if aa_g_path_max is extremely high. */ if (num_online_cpus() > 1) num = 4 + RESERVE_COUNT; else num = 2 + RESERVE_COUNT; for (i = 0; i < num; i++) { aa_buf = kmalloc(aa_g_path_max, GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN); if (!aa_buf) { destroy_buffers(); return -ENOMEM; } aa_put_buffer(aa_buf->buffer); } return 0; } #ifdef CONFIG_SYSCTL static int apparmor_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!aa_current_policy_admin_capable(NULL)) return -EPERM; if (!apparmor_enabled) return -EINVAL; return proc_dointvec(table, write, buffer, lenp, ppos); } static const struct ctl_table apparmor_sysctl_table[] = { #ifdef CONFIG_USER_NS { .procname = "unprivileged_userns_apparmor_policy", .data = &unprivileged_userns_apparmor_policy, .maxlen = sizeof(int), .mode = 0600, .proc_handler = apparmor_dointvec, }, #endif /* CONFIG_USER_NS */ { .procname = "apparmor_display_secid_mode", .data = &apparmor_display_secid_mode, .maxlen = sizeof(int), .mode = 0600, .proc_handler = apparmor_dointvec, }, { .procname = "apparmor_restrict_unprivileged_unconfined", .data = &aa_unprivileged_unconfined_restricted, .maxlen = sizeof(int), .mode = 0600, .proc_handler = apparmor_dointvec, }, }; static int __init apparmor_init_sysctl(void) { return register_sysctl("kernel", apparmor_sysctl_table) ? 
0 : -ENOMEM; } #else static inline int apparmor_init_sysctl(void) { return 0; } #endif /* CONFIG_SYSCTL */ #if defined(CONFIG_NETFILTER) && defined(CONFIG_NETWORK_SECMARK) static unsigned int apparmor_ip_postroute(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct aa_sk_ctx *ctx; struct sock *sk; int error; if (!skb->secmark) return NF_ACCEPT; sk = skb_to_full_sk(skb); if (sk == NULL) return NF_ACCEPT; ctx = aa_sock(sk); rcu_read_lock(); error = apparmor_secmark_check(rcu_dereference(ctx->label), OP_SENDMSG, AA_MAY_SEND, skb->secmark, sk); rcu_read_unlock(); if (!error) return NF_ACCEPT; return NF_DROP_ERR(-ECONNREFUSED); } static const struct nf_hook_ops apparmor_nf_ops[] = { { .hook = apparmor_ip_postroute, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_SELINUX_FIRST, }, #if IS_ENABLED(CONFIG_IPV6) { .hook = apparmor_ip_postroute, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_SELINUX_FIRST, }, #endif }; static int __net_init apparmor_nf_register(struct net *net) { return nf_register_net_hooks(net, apparmor_nf_ops, ARRAY_SIZE(apparmor_nf_ops)); } static void __net_exit apparmor_nf_unregister(struct net *net) { nf_unregister_net_hooks(net, apparmor_nf_ops, ARRAY_SIZE(apparmor_nf_ops)); } static struct pernet_operations apparmor_net_ops = { .init = apparmor_nf_register, .exit = apparmor_nf_unregister, }; static int __init apparmor_nf_ip_init(void) { int err; if (!apparmor_enabled) return 0; err = register_pernet_subsys(&apparmor_net_ops); if (err) panic("Apparmor: register_pernet_subsys: error %d\n", err); return 0; } #endif static char nulldfa_src[] __aligned(8) = { #include "nulldfa.in" }; static struct aa_dfa *nulldfa; static char stacksplitdfa_src[] __aligned(8) = { #include "stacksplitdfa.in" }; struct aa_dfa *stacksplitdfa; struct aa_policydb *nullpdb; static int __init aa_setup_dfa_engine(void) { int error = -ENOMEM; nullpdb = aa_alloc_pdb(GFP_KERNEL); if (!nullpdb) return -ENOMEM; nulldfa = aa_dfa_unpack(nulldfa_src, sizeof(nulldfa_src), TO_ACCEPT1_FLAG(YYTD_DATA32) | TO_ACCEPT2_FLAG(YYTD_DATA32)); if (IS_ERR(nulldfa)) { error = PTR_ERR(nulldfa); goto fail; } nullpdb->dfa = aa_get_dfa(nulldfa); nullpdb->perms = kzalloc_objs(struct aa_perms, 2); if (!nullpdb->perms) goto fail; nullpdb->size = 2; stacksplitdfa = aa_dfa_unpack(stacksplitdfa_src, sizeof(stacksplitdfa_src), TO_ACCEPT1_FLAG(YYTD_DATA32) | TO_ACCEPT2_FLAG(YYTD_DATA32)); if (IS_ERR(stacksplitdfa)) { error = PTR_ERR(stacksplitdfa); goto fail; } return 0; fail: aa_put_pdb(nullpdb); aa_put_dfa(nulldfa); nullpdb = NULL; nulldfa = NULL; stacksplitdfa = NULL; return error; } static void __init aa_teardown_dfa_engine(void) { aa_put_dfa(stacksplitdfa); aa_put_dfa(nulldfa); aa_put_pdb(nullpdb); nullpdb = NULL; stacksplitdfa = NULL; nulldfa = NULL; } static int __init apparmor_init(void) { int error; error = aa_setup_dfa_engine(); if (error) { AA_ERROR("Unable to setup dfa engine\n"); goto alloc_out; } error = aa_alloc_root_ns(); if (error) { AA_ERROR("Unable to allocate default profile namespace\n"); goto alloc_out; } error = apparmor_init_sysctl(); if (error) { AA_ERROR("Unable to register sysctls\n"); goto alloc_out; } error = alloc_buffers(); if (error) { AA_ERROR("Unable to allocate work buffers\n"); goto alloc_out; } error = set_init_ctx(); if (error) { AA_ERROR("Failed to set context on init task\n"); aa_free_root_ns(); goto buffers_out; } security_add_hooks(apparmor_hooks, ARRAY_SIZE(apparmor_hooks), &apparmor_lsmid); /* Inform the audit 
system that secctx is used */ audit_cfg_lsm(&apparmor_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); /* Report that AppArmor successfully initialized */ apparmor_initialized = 1; if (aa_g_profile_mode == APPARMOR_COMPLAIN) aa_info_message("AppArmor initialized: complain mode enabled"); else if (aa_g_profile_mode == APPARMOR_KILL) aa_info_message("AppArmor initialized: kill mode enabled"); else aa_info_message("AppArmor initialized"); return error; buffers_out: destroy_buffers(); alloc_out: aa_destroy_aafs(); aa_teardown_dfa_engine(); apparmor_enabled = false; return error; } DEFINE_LSM(apparmor) = { .id = &apparmor_lsmid, .flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE, .enabled = &apparmor_enabled, .blobs = &apparmor_blob_sizes, .init = apparmor_init, .initcall_fs = aa_create_aafs, #if defined(CONFIG_NETFILTER) && defined(CONFIG_NETWORK_SECMARK) .initcall_device = apparmor_nf_ip_init, #endif #ifdef CONFIG_SECURITY_APPARMOR_HASH .initcall_late = init_profile_hash, #endif }; |
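The aabool/aauint/aalockpolicy handlers earlier in this file all follow the same pattern: wrap param_set_bool()/param_get_bool() (or the uint/int variants) and refuse the operation unless AppArmor is enabled and, once initialized, the caller passes a policy capability check. Below is a minimal out-of-tree sketch of that pattern; the module parameter name demo_flag and the plain capable(CAP_MAC_ADMIN) gate are illustrative assumptions standing in for AppArmor's aa_current_policy_admin_capable() check, not part of the code above.

/* Hypothetical sketch: a bool module parameter whose writes are gated on
 * CAP_MAC_ADMIN, mirroring the param_set_aabool()/param_get_aabool() pattern. */
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/capability.h>

static bool demo_flag;

static int demo_param_set(const char *val, const struct kernel_param *kp)
{
	/* only privileged callers may flip the flag at runtime */
	if (!capable(CAP_MAC_ADMIN))
		return -EPERM;
	return param_set_bool(val, kp);
}

static int demo_param_get(char *buffer, const struct kernel_param *kp)
{
	return param_get_bool(buffer, kp);
}

static const struct kernel_param_ops demo_param_ops = {
	.flags = KERNEL_PARAM_OPS_FL_NOARG,	/* "demo_flag" alone means true */
	.set = demo_param_set,
	.get = demo_param_get,
};

module_param_cb(demo_flag, &demo_param_ops, &demo_flag, 0600);
MODULE_LICENSE("GPL");

With this in place, writes to /sys/module/<module>/parameters/demo_flag fail with -EPERM for unprivileged callers, which is the same behaviour the AppArmor parameters show after apparmor_initialized is set.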
/* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _LINUX_RCUREF_H #define _LINUX_RCUREF_H #include <linux/atomic.h> #include <linux/bug.h> #include <linux/limits.h> #include <linux/lockdep.h> #include <linux/preempt.h> #include <linux/rcupdate.h> #define RCUREF_ONEREF 0x00000000U #define RCUREF_MAXREF 0x7FFFFFFFU #define RCUREF_SATURATED 0xA0000000U #define RCUREF_RELEASED 0xC0000000U #define RCUREF_DEAD 0xE0000000U #define RCUREF_NOREF 0xFFFFFFFFU /** * rcuref_init - Initialize a rcuref reference count with the given reference count * @ref: Pointer to the reference count * @cnt: The initial reference count typically '1' */ static inline void rcuref_init(rcuref_t *ref, unsigned int cnt) { atomic_set(&ref->refcnt, cnt - 1); } /** * rcuref_read - Read the number of held reference counts of a rcuref * @ref: Pointer to the reference count * * Return: The number of held references (0 ... N). The value 0 does not * indicate that it is safe to schedule the object, protected by this reference * counter, for deconstruction. * If you want to know if the reference counter has been marked DEAD (as * signaled by rcuref_put()) please use rcuref_is_dead(). */ static inline unsigned int rcuref_read(rcuref_t *ref) { unsigned int c = atomic_read(&ref->refcnt); /* Return 0 if within the DEAD zone. */ return c >= RCUREF_RELEASED ? 0 : c + 1; } /** * rcuref_is_dead - Check if the rcuref has been already marked dead * @ref: Pointer to the reference count * * Return: True if the object has been marked DEAD. This signals that a previous * invocation of rcuref_put() returned true on this reference counter meaning * the protected object can safely be scheduled for deconstruction. * Otherwise, returns false. */ static inline bool rcuref_is_dead(rcuref_t *ref) { unsigned int c = atomic_read(&ref->refcnt); return (c >= RCUREF_RELEASED) && (c < RCUREF_NOREF); } extern __must_check bool rcuref_get_slowpath(rcuref_t *ref); /** * rcuref_get - Acquire one reference on a rcuref reference count * @ref: Pointer to the reference count * * Similar to atomic_inc_not_zero() but saturates at RCUREF_MAXREF. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See documentation in lib/rcuref.c * * Return: * False if the attempt to acquire a reference failed. This happens * when the last reference has been put already * * True if a reference was successfully acquired */ static inline __must_check bool rcuref_get(rcuref_t *ref) { /* * Unconditionally increase the reference count. The saturation and * dead zones provide enough tolerance for this.
*/ if (likely(!atomic_add_negative_relaxed(1, &ref->refcnt))) return true; /* Handle the cases inside the saturation and dead zones */ return rcuref_get_slowpath(ref); } extern __must_check bool rcuref_put_slowpath(rcuref_t *ref, unsigned int cnt); /* * Internal helper. Do not invoke directly. */ static __always_inline __must_check bool __rcuref_put(rcuref_t *ref) { int cnt; RCU_LOCKDEP_WARN(!rcu_read_lock_held() && preemptible(), "suspicious rcuref_put_rcusafe() usage"); /* * Unconditionally decrease the reference count. The saturation and * dead zones provide enough tolerance for this. */ cnt = atomic_sub_return_release(1, &ref->refcnt); if (likely(cnt >= 0)) return false; /* * Handle the last reference drop and cases inside the saturation * and dead zones. */ return rcuref_put_slowpath(ref, cnt); } /** * rcuref_put_rcusafe -- Release one reference for a rcuref reference count RCU safe * @ref: Pointer to the reference count * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. * * Can be invoked from contexts, which guarantee that no grace period can * happen which would free the object concurrently if the decrement drops * the last reference and the slowpath races against a concurrent get() and * put() pair. rcu_read_lock()'ed and atomic contexts qualify. * * Return: * True if this was the last reference with no future references * possible. This signals the caller that it can safely release the * object which is protected by the reference counter. * * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * release the protected object. */ static inline __must_check bool rcuref_put_rcusafe(rcuref_t *ref) { return __rcuref_put(ref); } /** * rcuref_put -- Release one reference for a rcuref reference count * @ref: Pointer to the reference count * * Can be invoked from any context. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. * * Return: * * True if this was the last reference with no future references * possible. This signals the caller that it can safely schedule the * object, which is protected by the reference counter, for * deconstruction. * * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * deconstruct the protected object. */ static inline __must_check bool rcuref_put(rcuref_t *ref) { bool released; preempt_disable(); released = __rcuref_put(ref); preempt_enable(); return released; } #endif |
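As the comments above state, rcuref_get()/rcuref_put() only guarantee the count; the object's memory lifetime must already be covered by RCU. The sketch below shows the usual shape of such a user; struct demo_obj, demo_list and the helper names are hypothetical and only illustrate the get-under-rcu_read_lock / put-then-kfree_rcu pattern.

/* Hypothetical rcuref_t user: objects are found under rcu_read_lock(), a
 * reference is taken with rcuref_get(), and the final rcuref_put() frees the
 * object after a grace period. List insert/removal is serialized elsewhere. */
#include <linux/rcuref.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>
#include <linux/slab.h>

struct demo_obj {
	struct list_head node;
	struct rcu_head rcu;
	rcuref_t ref;
	int key;
};

static LIST_HEAD(demo_list);

static struct demo_obj *demo_create(int key)
{
	struct demo_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

	if (!obj)
		return NULL;
	obj->key = key;
	rcuref_init(&obj->ref, 1);	/* initial reference held by the list */
	return obj;
}

static struct demo_obj *demo_lookup(int key)
{
	struct demo_obj *obj;

	rcu_read_lock();
	list_for_each_entry_rcu(obj, &demo_list, node) {
		/* rcuref_get() fails only if the object is already dying */
		if (obj->key == key && rcuref_get(&obj->ref)) {
			rcu_read_unlock();
			return obj;
		}
	}
	rcu_read_unlock();
	return NULL;
}

static void demo_put(struct demo_obj *obj)
{
	/* rcuref_put() returns true only for the final reference */
	if (rcuref_put(&obj->ref))
		kfree_rcu(obj, rcu);
}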
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* Authentication token and access key management internal defs * * Copyright (C) 2003-5, 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _INTERNAL_H #define _INTERNAL_H #include <linux/sched.h> #include <linux/wait_bit.h> #include <linux/cred.h> #include <linux/key-type.h> #include <linux/task_work.h> #include <linux/keyctl.h> #include <linux/refcount.h> #include <linux/watch_queue.h> #include <linux/compat.h> #include <linux/mm.h> #include <linux/vmalloc.h> struct iovec; #ifdef __KDEBUG #define kenter(FMT, ...) \ printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) #define kleave(FMT, ...) \ printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) #define kdebug(FMT, ...) \ printk(KERN_DEBUG " "FMT"\n", ##__VA_ARGS__) #else #define kenter(FMT, ...) \ no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) #define kleave(FMT, ...) \ no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) #define kdebug(FMT, ...) \ no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) #endif extern struct key_type key_type_dead; extern struct key_type key_type_user; extern struct key_type key_type_logon; /*****************************************************************************/ /* * Keep track of keys for a user. * * This needs to be separate to user_struct to avoid a refcount-loop * (user_struct pins some keyrings which pin this struct). * * We also keep track of keys under request from userspace for this UID here.
*/ struct key_user { struct rb_node node; struct mutex cons_lock; /* construction initiation lock */ spinlock_t lock; refcount_t usage; /* for accessing qnkeys & qnbytes */ atomic_t nkeys; /* number of keys */ atomic_t nikeys; /* number of instantiated keys */ kuid_t uid; int qnkeys; /* number of keys allocated to this user */ int qnbytes; /* number of bytes allocated to this user */ }; extern struct rb_root key_user_tree; extern spinlock_t key_user_lock; extern struct key_user root_key_user; extern struct key_user *key_user_lookup(kuid_t uid); extern void key_user_put(struct key_user *user); /* * Key quota limits. * - root has its own separate limits to everyone else */ extern unsigned key_quota_root_maxkeys; extern unsigned key_quota_root_maxbytes; extern unsigned key_quota_maxkeys; extern unsigned key_quota_maxbytes; #define KEYQUOTA_LINK_BYTES 4 /* a link in a keyring is worth 4 bytes */ extern struct kmem_cache *key_jar; extern struct rb_root key_serial_tree; extern spinlock_t key_serial_lock; extern struct mutex key_construction_mutex; extern wait_queue_head_t request_key_conswq; extern void key_set_index_key(struct keyring_index_key *index_key); extern struct key_type *key_type_lookup(const char *type); extern void key_type_put(struct key_type *ktype); extern int __key_link_lock(struct key *keyring, const struct keyring_index_key *index_key); extern int __key_move_lock(struct key *l_keyring, struct key *u_keyring, const struct keyring_index_key *index_key); extern int __key_link_begin(struct key *keyring, const struct keyring_index_key *index_key, struct assoc_array_edit **_edit); extern int __key_link_check_live_key(struct key *keyring, struct key *key); extern void __key_link(struct key *keyring, struct key *key, struct assoc_array_edit **_edit); extern void __key_link_end(struct key *keyring, const struct keyring_index_key *index_key, struct assoc_array_edit *edit); extern key_ref_t find_key_to_update(key_ref_t keyring_ref, const struct keyring_index_key *index_key); struct keyring_search_context { struct keyring_index_key index_key; const struct cred *cred; struct key_match_data match_data; unsigned flags; #define KEYRING_SEARCH_NO_STATE_CHECK 0x0001 /* Skip state checks */ #define KEYRING_SEARCH_DO_STATE_CHECK 0x0002 /* Override NO_STATE_CHECK */ #define KEYRING_SEARCH_NO_UPDATE_TIME 0x0004 /* Don't update times */ #define KEYRING_SEARCH_NO_CHECK_PERM 0x0008 /* Don't check permissions */ #define KEYRING_SEARCH_DETECT_TOO_DEEP 0x0010 /* Give an error on excessive depth */ #define KEYRING_SEARCH_SKIP_EXPIRED 0x0020 /* Ignore expired keys (intention to replace) */ #define KEYRING_SEARCH_RECURSE 0x0040 /* Search child keyrings also */ int (*iterator)(const void *object, void *iterator_data); /* Internal stuff */ int skipped_ret; bool possessed; key_ref_t result; time64_t now; }; extern bool key_default_cmp(const struct key *key, const struct key_match_data *match_data); extern key_ref_t keyring_search_rcu(key_ref_t keyring_ref, struct keyring_search_context *ctx); extern key_ref_t search_cred_keyrings_rcu(struct keyring_search_context *ctx); extern key_ref_t search_process_keyrings_rcu(struct keyring_search_context *ctx); extern struct key *find_keyring_by_name(const char *name, bool uid_keyring); extern int look_up_user_keyrings(struct key **, struct key **); extern struct key *get_user_session_keyring_rcu(const struct cred *); extern int install_thread_keyring_to_cred(struct cred *); extern int install_process_keyring_to_cred(struct cred *); extern int 
install_session_keyring_to_cred(struct cred *, struct key *); extern struct key *request_key_and_link(struct key_type *type, const char *description, struct key_tag *domain_tag, const void *callout_info, size_t callout_len, void *aux, struct key *dest_keyring, unsigned long flags); extern bool lookup_user_key_possessed(const struct key *key, const struct key_match_data *match_data); extern long join_session_keyring(const char *name); extern void key_change_session_keyring(struct callback_head *twork); extern struct work_struct key_gc_work; extern unsigned key_gc_delay; extern void keyring_gc(struct key *keyring, time64_t limit); extern void keyring_restriction_gc(struct key *keyring, struct key_type *dead_type); void key_set_expiry(struct key *key, time64_t expiry); extern void key_schedule_gc(time64_t gc_at); extern void key_schedule_gc_links(void); extern void key_gc_keytype(struct key_type *ktype); extern int key_task_permission(const key_ref_t key_ref, const struct cred *cred, enum key_need_perm need_perm); static inline void notify_key(struct key *key, enum key_notification_subtype subtype, u32 aux) { #ifdef CONFIG_KEY_NOTIFICATIONS struct key_notification n = { .watch.type = WATCH_TYPE_KEY_NOTIFY, .watch.subtype = subtype, .watch.info = watch_sizeof(n), .key_id = key_serial(key), .aux = aux, }; post_watch_notification(key->watchers, &n.watch, current_cred(), n.key_id); #endif } /* * Check to see whether permission is granted to use a key in the desired way. */ static inline int key_permission(const key_ref_t key_ref, enum key_need_perm need_perm) { return key_task_permission(key_ref, current_cred(), need_perm); } extern struct key_type key_type_request_key_auth; extern struct key *request_key_auth_new(struct key *target, const char *op, const void *callout_info, size_t callout_len, struct key *dest_keyring); extern struct key *key_get_instantiation_authkey(key_serial_t target_id); /* * Determine whether a key is dead. 
*/ static inline bool key_is_dead(const struct key *key, time64_t limit) { time64_t expiry = key->expiry; if (expiry != TIME64_MAX) { if (!(key->type->flags & KEY_TYPE_INSTANT_REAP)) expiry += key_gc_delay; if (expiry <= limit) return true; } return key->flags & ((1 << KEY_FLAG_DEAD) | (1 << KEY_FLAG_INVALIDATED)) || key->domain_tag->removed; } /* * keyctl() functions */ extern long keyctl_get_keyring_ID(key_serial_t, int); extern long keyctl_join_session_keyring(const char __user *); extern long keyctl_update_key(key_serial_t, const void __user *, size_t); extern long keyctl_revoke_key(key_serial_t); extern long keyctl_keyring_clear(key_serial_t); extern long keyctl_keyring_link(key_serial_t, key_serial_t); extern long keyctl_keyring_move(key_serial_t, key_serial_t, key_serial_t, unsigned int); extern long keyctl_keyring_unlink(key_serial_t, key_serial_t); extern long keyctl_describe_key(key_serial_t, char __user *, size_t); extern long keyctl_keyring_search(key_serial_t, const char __user *, const char __user *, key_serial_t); extern long keyctl_read_key(key_serial_t, char __user *, size_t); extern long keyctl_chown_key(key_serial_t, uid_t, gid_t); extern long keyctl_setperm_key(key_serial_t, key_perm_t); extern long keyctl_instantiate_key(key_serial_t, const void __user *, size_t, key_serial_t); extern long keyctl_negate_key(key_serial_t, unsigned, key_serial_t); extern long keyctl_set_reqkey_keyring(int); extern long keyctl_set_timeout(key_serial_t, unsigned); extern long keyctl_assume_authority(key_serial_t); extern long keyctl_get_security(key_serial_t keyid, char __user *buffer, size_t buflen); extern long keyctl_session_to_parent(void); extern long keyctl_reject_key(key_serial_t, unsigned, unsigned, key_serial_t); extern long keyctl_instantiate_key_iov(key_serial_t, const struct iovec __user *, unsigned, key_serial_t); extern long keyctl_invalidate_key(key_serial_t); extern long keyctl_restrict_keyring(key_serial_t id, const char __user *_type, const char __user *_restriction); #ifdef CONFIG_PERSISTENT_KEYRINGS extern long keyctl_get_persistent(uid_t, key_serial_t); extern unsigned persistent_keyring_expiry; #else static inline long keyctl_get_persistent(uid_t uid, key_serial_t destring) { return -EOPNOTSUPP; } #endif #ifdef CONFIG_KEY_DH_OPERATIONS extern long keyctl_dh_compute(struct keyctl_dh_params __user *, char __user *, size_t, struct keyctl_kdf_params __user *); extern long __keyctl_dh_compute(struct keyctl_dh_params __user *, char __user *, size_t, struct keyctl_kdf_params *); #ifdef CONFIG_COMPAT extern long compat_keyctl_dh_compute(struct keyctl_dh_params __user *params, char __user *buffer, size_t buflen, struct compat_keyctl_kdf_params __user *kdf); #endif #define KEYCTL_KDF_MAX_OUTPUT_LEN 1024 /* max length of KDF output */ #define KEYCTL_KDF_MAX_OI_LEN 64 /* max length of otherinfo */ #else static inline long keyctl_dh_compute(struct keyctl_dh_params __user *params, char __user *buffer, size_t buflen, struct keyctl_kdf_params __user *kdf) { return -EOPNOTSUPP; } #ifdef CONFIG_COMPAT static inline long compat_keyctl_dh_compute( struct keyctl_dh_params __user *params, char __user *buffer, size_t buflen, struct keyctl_kdf_params __user *kdf) { return -EOPNOTSUPP; } #endif #endif #ifdef CONFIG_ASYMMETRIC_KEY_TYPE extern long keyctl_pkey_query(key_serial_t, const char __user *, struct keyctl_pkey_query __user *); extern long keyctl_pkey_verify(const struct keyctl_pkey_params __user *, const char __user *, const void __user *, const void __user *); extern long 
keyctl_pkey_e_d_s(int, const struct keyctl_pkey_params __user *, const char __user *, const void __user *, void __user *); #else static inline long keyctl_pkey_query(key_serial_t id, const char __user *_info, struct keyctl_pkey_query __user *_res) { return -EOPNOTSUPP; } static inline long keyctl_pkey_verify(const struct keyctl_pkey_params __user *params, const char __user *_info, const void __user *_in, const void __user *_in2) { return -EOPNOTSUPP; } static inline long keyctl_pkey_e_d_s(int op, const struct keyctl_pkey_params __user *params, const char __user *_info, const void __user *_in, void __user *_out) { return -EOPNOTSUPP; } #endif extern long keyctl_capabilities(unsigned char __user *_buffer, size_t buflen); #ifdef CONFIG_KEY_NOTIFICATIONS extern long keyctl_watch_key(key_serial_t, int, int); #else static inline long keyctl_watch_key(key_serial_t key_id, int watch_fd, int watch_id) { return -EOPNOTSUPP; } #endif /* * Debugging key validation */ #ifdef KEY_DEBUGGING extern void __key_check(const struct key *); static inline void key_check(const struct key *key) { if (key && (IS_ERR(key) || key->magic != KEY_DEBUG_MAGIC)) __key_check(key); } #else #define key_check(key) do {} while(0) #endif #endif /* _INTERNAL_H */ |
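The keyctl_*() prototypes above are the kernel-side handlers behind the add_key(2)/keyctl(2) system calls. A small userspace sketch of that interface follows, using the raw syscalls rather than libkeyutils; error handling is minimal, and the "user" key type plus the session-keyring target are illustrative choices only.

/* Userspace sketch: add a "user" key to the session keyring and read its
 * payload back through the raw add_key(2)/keyctl(2) syscalls. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/keyctl.h>

int main(void)
{
	long id, len;
	char buf[64];

	id = syscall(__NR_add_key, "user", "demo:token", "s3cret",
		     strlen("s3cret"), KEY_SPEC_SESSION_KEYRING);
	if (id < 0) {
		perror("add_key");
		return 1;
	}

	len = syscall(__NR_keyctl, KEYCTL_READ, id, buf, sizeof(buf), 0);
	if (len < 0) {
		perror("keyctl(KEYCTL_READ)");
		return 1;
	}

	printf("key %ld holds %ld bytes: %.*s\n", id, len, (int)len, buf);
	return 0;
}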
// SPDX-License-Identifier: GPL-2.0-only /* * umh - the kernel usermode helper */ #include <linux/module.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/binfmts.h> #include <linux/syscalls.h> #include <linux/unistd.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/completion.h> #include <linux/cred.h> #include <linux/file.h> #include <linux/fs_struct.h> #include <linux/workqueue.h> #include <linux/security.h> #include <linux/mount.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/resource.h> #include <linux/notifier.h> #include <linux/suspend.h> #include <linux/rwsem.h> #include <linux/ptrace.h> #include <linux/async.h> #include <linux/uaccess.h> #include <linux/initrd.h> #include <linux/freezer.h> #include <trace/events/module.h> static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; static DEFINE_SPINLOCK(umh_sysctl_lock); static DECLARE_RWSEM(umhelper_sem); static void call_usermodehelper_freeinfo(struct subprocess_info *info) { if (info->cleanup) (*info->cleanup)(info); kfree(info); } static void umh_complete(struct subprocess_info *sub_info) { struct completion *comp = xchg(&sub_info->complete, NULL); /* * See call_usermodehelper_exec().
If xchg() returns NULL * we own sub_info, the UMH_KILLABLE caller has gone away * or the caller used UMH_NO_WAIT. */ if (comp) complete(comp); else call_usermodehelper_freeinfo(sub_info); } /* * This is the task which runs the usermode application */ static int call_usermodehelper_exec_async(void *data) { struct subprocess_info *sub_info = data; struct cred *new; int retval; spin_lock_irq(&current->sighand->siglock); flush_signal_handlers(current, 1); spin_unlock_irq(&current->sighand->siglock); /* * Initial kernel threads share their FS with init, in order to * get the init root directory. But we've now created a new * thread that is going to execve a user process and has its own * 'struct fs_struct'. Reset umask to the default. */ current->fs->umask = 0022; /* * Our parent (unbound workqueue) runs with elevated scheduling * priority. Avoid propagating that into the userspace child. */ set_user_nice(current, 0); retval = -ENOMEM; new = prepare_kernel_cred(current); if (!new) goto out; spin_lock(&umh_sysctl_lock); new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); new->cap_inheritable = cap_intersect(usermodehelper_inheritable, new->cap_inheritable); spin_unlock(&umh_sysctl_lock); if (sub_info->init) { retval = sub_info->init(sub_info, new); if (retval) { abort_creds(new); goto out; } } commit_creds(new); wait_for_initramfs(); retval = kernel_execve(sub_info->path, (const char *const *)sub_info->argv, (const char *const *)sub_info->envp); out: sub_info->retval = retval; /* * call_usermodehelper_exec_sync() will call umh_complete * if UMH_WAIT_PROC. */ if (!(sub_info->wait & UMH_WAIT_PROC)) umh_complete(sub_info); if (!retval) return 0; do_exit(0); } /* Handles UMH_WAIT_PROC. */ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) { pid_t pid; /* If SIGCLD is ignored do_wait won't populate the status. */ kernel_sigaction(SIGCHLD, SIG_DFL); pid = user_mode_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); if (pid < 0) sub_info->retval = pid; else kernel_wait(pid, &sub_info->retval); /* Restore default kernel sig handler */ kernel_sigaction(SIGCHLD, SIG_IGN); umh_complete(sub_info); } /* * We need to create the usermodehelper kernel thread from a task that is affine * to an optimized set of CPUs (or nohz housekeeping ones) such that they * inherit the widest affinity irrespective of call_usermodehelper() callers with * possibly reduced affinity (eg: per-cpu workqueues). We don't want * usermodehelper targets to contend a busy CPU. * * Unbound workqueues provide such wide affinity and allow blocking on * UMH_WAIT_PROC requests without blocking pending requests (up to some limit). * * Besides, workqueues provide the privilege level that the caller might not have * to perform the usermodehelper request. * */ static void call_usermodehelper_exec_work(struct work_struct *work) { struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); if (sub_info->wait & UMH_WAIT_PROC) { call_usermodehelper_exec_sync(sub_info); } else { pid_t pid; /* * Use CLONE_PARENT to reparent it to kthreadd; we do not * want to pollute current->children, and we need a parent * that always ignores SIGCHLD to ensure auto-reaping.
*/ pid = user_mode_thread(call_usermodehelper_exec_async, sub_info, CLONE_PARENT | SIGCHLD); if (pid < 0) { sub_info->retval = pid; umh_complete(sub_info); } } } /* * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY * (used for preventing user land processes from being created after the user * land has been frozen during a system-wide hibernation or suspend operation). * Should always be manipulated under umhelper_sem acquired for write. */ static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED; /* Number of helpers running */ static atomic_t running_helpers = ATOMIC_INIT(0); /* * Wait queue head used by usermodehelper_disable() to wait for all running * helpers to finish. */ static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); /* * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled * to become 'false'. */ static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq); /* * Time to wait for running_helpers to become zero before the setting of * usermodehelper_disabled in usermodehelper_disable() fails */ #define RUNNING_HELPERS_TIMEOUT (5 * HZ) int usermodehelper_read_trylock(void) { DEFINE_WAIT(wait); int ret = 0; down_read(&umhelper_sem); for (;;) { prepare_to_wait(&usermodehelper_disabled_waitq, &wait, TASK_INTERRUPTIBLE); if (!usermodehelper_disabled) break; if (usermodehelper_disabled == UMH_DISABLED) ret = -EAGAIN; up_read(&umhelper_sem); if (ret) break; schedule(); try_to_freeze(); down_read(&umhelper_sem); } finish_wait(&usermodehelper_disabled_waitq, &wait); return ret; } EXPORT_SYMBOL_GPL(usermodehelper_read_trylock); long usermodehelper_read_lock_wait(long timeout) { DEFINE_WAIT(wait); if (timeout < 0) return -EINVAL; down_read(&umhelper_sem); for (;;) { prepare_to_wait(&usermodehelper_disabled_waitq, &wait, TASK_UNINTERRUPTIBLE); if (!usermodehelper_disabled) break; up_read(&umhelper_sem); timeout = schedule_timeout(timeout); if (!timeout) break; down_read(&umhelper_sem); } finish_wait(&usermodehelper_disabled_waitq, &wait); return timeout; } EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait); void usermodehelper_read_unlock(void) { up_read(&umhelper_sem); } EXPORT_SYMBOL_GPL(usermodehelper_read_unlock); /** * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. * @depth: New value to assign to usermodehelper_disabled. * * Change the value of usermodehelper_disabled (under umhelper_sem locked for * writing) and wakeup tasks waiting for it to change. */ void __usermodehelper_set_disable_depth(enum umh_disable_depth depth) { down_write(&umhelper_sem); usermodehelper_disabled = depth; wake_up(&usermodehelper_disabled_waitq); up_write(&umhelper_sem); } /** * __usermodehelper_disable - Prevent new helpers from being started. * @depth: New value to assign to usermodehelper_disabled. * * Set usermodehelper_disabled to @depth and wait for running helpers to exit. */ int __usermodehelper_disable(enum umh_disable_depth depth) { long retval; if (!depth) return -EINVAL; down_write(&umhelper_sem); usermodehelper_disabled = depth; up_write(&umhelper_sem); /* * From now on call_usermodehelper_exec() won't start any new * helpers, so it is sufficient if running_helpers turns out to * be zero at one point (it may be increased later, but that * doesn't matter). 
*/ retval = wait_event_timeout(running_helpers_waitq, atomic_read(&running_helpers) == 0, RUNNING_HELPERS_TIMEOUT); if (retval) return 0; __usermodehelper_set_disable_depth(UMH_ENABLED); return -EAGAIN; } static void helper_lock(void) { atomic_inc(&running_helpers); smp_mb__after_atomic(); } static void helper_unlock(void) { if (atomic_dec_and_test(&running_helpers)) wake_up(&running_helpers_waitq); } /** * call_usermodehelper_setup - prepare to call a usermode helper * @path: path to usermode executable * @argv: arg vector for process * @envp: environment for process * @gfp_mask: gfp mask for memory allocation * @init: an init function * @cleanup: a cleanup function * @data: arbitrary context sensitive data * * Returns either %NULL on allocation failure, or a subprocess_info * structure. This should be passed to call_usermodehelper_exec to * exec the process and free the structure. * * The init function is used to customize the helper process prior to * exec. A non-zero return code causes the process to error out, exit, * and return the failure to the calling process * * The cleanup function is just before the subprocess_info is about to * be freed. This can be used for freeing the argv and envp. The * Function must be runnable in either a process context or the * context in which call_usermodehelper_exec is called. */ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, char **envp, gfp_t gfp_mask, int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *info), void *data) { struct subprocess_info *sub_info; sub_info = kzalloc_obj(struct subprocess_info, gfp_mask); if (!sub_info) goto out; INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); #ifdef CONFIG_STATIC_USERMODEHELPER sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH; #else sub_info->path = path; #endif sub_info->argv = argv; sub_info->envp = envp; sub_info->cleanup = cleanup; sub_info->init = init; sub_info->data = data; out: return sub_info; } EXPORT_SYMBOL(call_usermodehelper_setup); /** * call_usermodehelper_exec - start a usermode application * @sub_info: information about the subprocess * @wait: wait for the application to finish and return status. * when UMH_NO_WAIT don't wait at all, but you get no useful error back * when the program couldn't be exec'ed. This makes it safe to call * from interrupt context. * * Runs a user-space application. The application is started * asynchronously if wait is not set, and runs as a child of system workqueues. * (ie. it runs with full root capabilities and optimized affinity). * * Note: successful return value does not guarantee the helper was called at * all. You can't rely on sub_info->{init,cleanup} being called even for * UMH_WAIT_* wait modes as STATIC_USERMODEHELPER_PATH="" turns all helpers * into a successful no-op. */ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) { unsigned int state = TASK_UNINTERRUPTIBLE; DECLARE_COMPLETION_ONSTACK(done); int retval = 0; if (!sub_info->path) { call_usermodehelper_freeinfo(sub_info); return -EINVAL; } helper_lock(); if (usermodehelper_disabled) { retval = -EBUSY; goto out; } /* * If there is no binary for us to call, then just return and get out of * here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and * disable all call_usermodehelper() calls. */ if (strlen(sub_info->path) == 0) goto out; /* * Set the completion pointer only if there is a waiter. 
* This makes it possible to use umh_complete to free * the data structure in case of UMH_NO_WAIT. */ sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; sub_info->wait = wait; queue_work(system_unbound_wq, &sub_info->work); if (wait == UMH_NO_WAIT) /* task has freed sub_info */ goto unlock; if (wait & UMH_FREEZABLE) state |= TASK_FREEZABLE; if (wait & UMH_KILLABLE) { retval = wait_for_completion_state(&done, state | TASK_KILLABLE); if (!retval) goto wait_done; /* umh_complete() will see NULL and free sub_info */ if (xchg(&sub_info->complete, NULL)) goto unlock; /* * fallthrough; in case of -ERESTARTSYS now do uninterruptible * wait_for_completion_state(). Since umh_complete() shall call * complete() in a moment if xchg() above returned NULL, this * uninterruptible wait_for_completion_state() will not block * SIGKILL'ed processes for long. */ } wait_for_completion_state(&done, state); wait_done: retval = sub_info->retval; out: call_usermodehelper_freeinfo(sub_info); unlock: helper_unlock(); return retval; } EXPORT_SYMBOL(call_usermodehelper_exec); /** * call_usermodehelper() - prepare and start a usermode application * @path: path to usermode executable * @argv: arg vector for process * @envp: environment for process * @wait: wait for the application to finish and return status. * when UMH_NO_WAIT don't wait at all, but you get no useful error back * when the program couldn't be exec'ed. This makes it safe to call * from interrupt context. * * This function is the equivalent to use call_usermodehelper_setup() and * call_usermodehelper_exec(). */ int call_usermodehelper(const char *path, char **argv, char **envp, int wait) { struct subprocess_info *info; gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; info = call_usermodehelper_setup(path, argv, envp, gfp_mask, NULL, NULL, NULL); if (info == NULL) return -ENOMEM; return call_usermodehelper_exec(info, wait); } EXPORT_SYMBOL(call_usermodehelper); #if defined(CONFIG_SYSCTL) static int proc_cap_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; unsigned long cap_array[2]; kernel_cap_t new_cap, *cap; int err; if (write && (!capable(CAP_SETPCAP) || !capable(CAP_SYS_MODULE))) return -EPERM; /* * convert from the global kernel_cap_t to the ulong array to print to * userspace if this is a read. * * Legacy format: capabilities are exposed as two 32-bit values */ cap = table->data; spin_lock(&umh_sysctl_lock); cap_array[0] = (u32) cap->val; cap_array[1] = cap->val >> 32; spin_unlock(&umh_sysctl_lock); t = *table; t.data = &cap_array; /* * actually read or write and array of ulongs from userspace. 
Remember * these are least significant 32 bits first */ err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); if (err < 0) return err; new_cap.val = (u32)cap_array[0]; new_cap.val += (u64)cap_array[1] << 32; /* * Drop everything not in the new_cap (but don't add things) */ if (write) { spin_lock(&umh_sysctl_lock); *cap = cap_intersect(*cap, new_cap); spin_unlock(&umh_sysctl_lock); } return 0; } static const struct ctl_table usermodehelper_table[] = { { .procname = "bset", .data = &usermodehelper_bset, .maxlen = 2 * sizeof(unsigned long), .mode = 0600, .proc_handler = proc_cap_handler, }, { .procname = "inheritable", .data = &usermodehelper_inheritable, .maxlen = 2 * sizeof(unsigned long), .mode = 0600, .proc_handler = proc_cap_handler, }, }; static int __init init_umh_sysctls(void) { register_sysctl_init("kernel/usermodehelper", usermodehelper_table); return 0; } early_initcall(init_umh_sysctls); #endif /* CONFIG_SYSCTL */ |
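For most in-kernel users, call_usermodehelper() above is the whole surface they need: build argv/envp, pick a wait mode, and let the unbound workqueue do the exec. The fragment below is a minimal sketch of such a caller; the helper path /sbin/demo-helper and the function name are assumptions for illustration only.

/* Hypothetical in-kernel caller: run /sbin/demo-helper and wait for it to exit. */
#include <linux/umh.h>
#include <linux/printk.h>

static int demo_run_helper(const char *arg)
{
	char *argv[] = { "/sbin/demo-helper", (char *)arg, NULL };
	char *envp[] = {
		"HOME=/",
		"PATH=/sbin:/bin:/usr/sbin:/usr/bin",
		NULL
	};
	int ret;

	/* UMH_WAIT_PROC: sleep until the helper process has exited */
	ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
	if (ret)
		pr_warn("demo-helper failed: %d\n", ret);
	return ret;
}

Note that, per the kerneldoc above, a zero return does not guarantee the helper really ran if STATIC_USERMODEHELPER_PATH is configured to the empty string.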
// SPDX-License-Identifier: GPL-2.0-or-later /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2010 * Phillip Lougher <phillip@squashfs.org.uk> * * xattr_id.c */ /* * This file implements code to map the 32-bit xattr id stored in the inode * into the on disk location of the xattr data. */ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs.h" #include "xattr.h" /* * Map xattr id using the xattr id lookup table */ int squashfs_xattr_lookup(struct super_block *sb, unsigned int index, int *count, unsigned int *size, unsigned long long *xattr) { struct squashfs_sb_info *msblk = sb->s_fs_info; int block = SQUASHFS_XATTR_BLOCK(index); int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index); u64 start_block; struct squashfs_xattr_id id; int err; if (index >= msblk->xattr_ids) return -EINVAL; start_block = le64_to_cpu(msblk->xattr_id_table[block]); err = squashfs_read_metadata(sb, &id, &start_block, &offset, sizeof(id)); if (err < 0) return err; *xattr = le64_to_cpu(id.xattr); *size = le32_to_cpu(id.size); *count = le32_to_cpu(id.count); return 0; } /* * Read uncompressed xattr id lookup table indexes from disk into memory */ __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start, u64 *xattr_table_start, unsigned int *xattr_ids) { struct squashfs_sb_info *msblk = sb->s_fs_info; unsigned int len, indexes; struct squashfs_xattr_id_table *id_table; __le64 *table; u64 start, end; int n; id_table = squashfs_read_table(sb, table_start, sizeof(*id_table)); if (IS_ERR(id_table)) return (__le64 *) id_table; *xattr_table_start = le64_to_cpu(id_table->xattr_table_start); *xattr_ids = le32_to_cpu(id_table->xattr_ids); kfree(id_table); /* Sanity check values */ /* there is always at least one xattr id */ if (*xattr_ids == 0) return ERR_PTR(-EINVAL); len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids); indexes = SQUASHFS_XATTR_BLOCKS(*xattr_ids); /* * The computed size of the index table (len bytes) should exactly * match the table start and end points */ start = table_start + sizeof(*id_table); end = msblk->bytes_used; if (len != (end - start)) return ERR_PTR(-EINVAL); table = squashfs_read_table(sb, start, len); if (IS_ERR(table)) return table; /* table[0], table[1], ... table[indexes - 1] store the locations * of the compressed xattr id blocks. Each entry should be less than * the next (i.e. table[0] < table[1]), and the difference between them * should be SQUASHFS_METADATA_SIZE or less. table[indexes - 1] * should be less than table_start, and again the difference * should be SQUASHFS_METADATA_SIZE or less. * * Finally xattr_table_start should be less than table[0].
*/ for (n = 0; n < (indexes - 1); n++) { start = le64_to_cpu(table[n]); end = le64_to_cpu(table[n + 1]); if (start >= end || (end - start) > (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) { kfree(table); return ERR_PTR(-EINVAL); } } start = le64_to_cpu(table[indexes - 1]); if (start >= table_start || (table_start - start) > (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) { kfree(table); return ERR_PTR(-EINVAL); } if (*xattr_table_start >= le64_to_cpu(table[0])) { kfree(table); return ERR_PTR(-EINVAL); } return table; }
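The sanity checks above follow directly from the on-disk layout: xattr ids are packed a fixed number per metadata block, the uncompressed index table stores one 64-bit location per such block, and that table must exactly fill the space between the id-table header and the end of the filesystem. The userspace sketch below re-derives those numbers; the constants are assumptions that mirror SQUASHFS_METADATA_SIZE (8 KiB) and sizeof(struct squashfs_xattr_id) (16 bytes) rather than values taken from the kernel headers.

/* Standalone sketch of the xattr id -> (index block, offset) mapping and the
 * expected index-table length used by squashfs_read_xattr_id_table(). */
#include <stdio.h>
#include <stdint.h>

#define METADATA_SIZE   8192u	/* assumed SQUASHFS_METADATA_SIZE */
#define XATTR_ID_BYTES  16u	/* assumed sizeof(struct squashfs_xattr_id) */
#define IDS_PER_BLOCK   (METADATA_SIZE / XATTR_ID_BYTES)

int main(void)
{
	unsigned int xattr_ids = 1300;	/* example id count */
	unsigned int index = 777;	/* example xattr id to look up */

	/* one index-table entry (u64) per metadata block of packed xattr ids */
	unsigned int blocks = (xattr_ids + IDS_PER_BLOCK - 1) / IDS_PER_BLOCK;
	uint64_t table_len = (uint64_t)blocks * sizeof(uint64_t);

	printf("index table: %u blocks, %llu bytes\n",
	       blocks, (unsigned long long)table_len);
	printf("id %u lives in index block %u at offset %u\n",
	       index, index / IDS_PER_BLOCK,
	       (index % IDS_PER_BLOCK) * XATTR_ID_BYTES);
	return 0;
}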
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/super.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  super.c contains code to handle: - mount structures
 *                                   - super-block tables
 *                                   - filesystem drivers list
 *                                   - mount system call
 *                                   - umount system call
 *                                   - 
ustat system call * * GK 2/5/95 - Changed to support mounting the root fs via NFS * * Added kerneld support: Jacques Gelinas and Bjorn Ekwall * Added change_root: Werner Almesberger & Hans Lermen, Feb '96 * Added options to /proc/mounts: * Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996. * Added devfs support: Richard Gooch <rgooch@atnf.csiro.au>, 13-JAN-1998 * Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000 */ #include <linux/export.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/writeback.h> /* for the emergency remount stuff */ #include <linux/idr.h> #include <linux/mutex.h> #include <linux/backing-dev.h> #include <linux/rculist_bl.h> #include <linux/fscrypt.h> #include <linux/fsnotify.h> #include <linux/lockdep.h> #include <linux/user_namespace.h> #include <linux/fs_context.h> #include <linux/fserror.h> #include <uapi/linux/mount.h> #include "internal.h" static int thaw_super_locked(struct super_block *sb, enum freeze_holder who, const void *freeze_owner); static LIST_HEAD(super_blocks); static DEFINE_SPINLOCK(sb_lock); static char *sb_writers_name[SB_FREEZE_LEVELS] = { "sb_writers", "sb_pagefaults", "sb_internal", }; static inline void __super_lock(struct super_block *sb, bool excl) { if (excl) down_write(&sb->s_umount); else down_read(&sb->s_umount); } static inline void super_unlock(struct super_block *sb, bool excl) { if (excl) up_write(&sb->s_umount); else up_read(&sb->s_umount); } static inline void __super_lock_excl(struct super_block *sb) { __super_lock(sb, true); } static inline void super_unlock_excl(struct super_block *sb) { super_unlock(sb, true); } static inline void super_unlock_shared(struct super_block *sb) { super_unlock(sb, false); } static bool super_flags(const struct super_block *sb, unsigned int flags) { /* * Pairs with smp_store_release() in super_wake() and ensures * that we see @flags after we're woken. */ return smp_load_acquire(&sb->s_flags) & flags; } /** * super_lock - wait for superblock to become ready and lock it * @sb: superblock to wait for * @excl: whether exclusive access is required * * If the superblock has neither passed through vfs_get_tree() or * generic_shutdown_super() yet wait for it to happen. Either superblock * creation will succeed and SB_BORN is set by vfs_get_tree() or we're * woken and we'll see SB_DYING. * * The caller must have acquired a temporary reference on @sb->s_count. * * Return: The function returns true if SB_BORN was set and with * s_umount held. The function returns false if SB_DYING was * set and without s_umount held. */ static __must_check bool super_lock(struct super_block *sb, bool excl) { lockdep_assert_not_held(&sb->s_umount); /* wait until the superblock is ready or dying */ wait_var_event(&sb->s_flags, super_flags(sb, SB_BORN | SB_DYING)); /* Don't pointlessly acquire s_umount. */ if (super_flags(sb, SB_DYING)) return false; __super_lock(sb, excl); /* * Has gone through generic_shutdown_super() in the meantime. * @sb->s_root is NULL and @sb->s_active is 0. No one needs to * grab a reference to this. Tell them so. 
*/ if (sb->s_flags & SB_DYING) { super_unlock(sb, excl); return false; } WARN_ON_ONCE(!(sb->s_flags & SB_BORN)); return true; } /* wait and try to acquire read-side of @sb->s_umount */ static inline bool super_lock_shared(struct super_block *sb) { return super_lock(sb, false); } /* wait and try to acquire write-side of @sb->s_umount */ static inline bool super_lock_excl(struct super_block *sb) { return super_lock(sb, true); } /* wake waiters */ #define SUPER_WAKE_FLAGS (SB_BORN | SB_DYING | SB_DEAD) static void super_wake(struct super_block *sb, unsigned int flag) { WARN_ON_ONCE((flag & ~SUPER_WAKE_FLAGS)); WARN_ON_ONCE(hweight32(flag & SUPER_WAKE_FLAGS) > 1); /* * Pairs with smp_load_acquire() in super_lock() to make sure * all initializations in the superblock are seen by the user * seeing SB_BORN sent. */ smp_store_release(&sb->s_flags, sb->s_flags | flag); /* * Pairs with the barrier in prepare_to_wait_event() to make sure * ___wait_var_event() either sees SB_BORN set or * waitqueue_active() check in wake_up_var() sees the waiter. */ smp_mb(); wake_up_var(&sb->s_flags); } /* * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. * If that happens we could trigger unregistering the shrinker from within the * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we * take a passive reference to the superblock to avoid this from occurring. */ static unsigned long super_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { struct super_block *sb; long fs_objects = 0; long total_objects; long freed = 0; long dentries; long inodes; sb = shrink->private_data; /* * Deadlock avoidance. We may hold various FS locks, and we don't want * to recurse into the FS that called us in clear_inode() and friends.. */ if (!(sc->gfp_mask & __GFP_FS)) return SHRINK_STOP; if (!super_trylock_shared(sb)) return SHRINK_STOP; if (sb->s_op->nr_cached_objects) fs_objects = sb->s_op->nr_cached_objects(sb, sc); inodes = list_lru_shrink_count(&sb->s_inode_lru, sc); dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects = dentries + inodes + fs_objects; if (!total_objects) total_objects = 1; /* proportion the scan between the caches */ dentries = mult_frac(sc->nr_to_scan, dentries, total_objects); inodes = mult_frac(sc->nr_to_scan, inodes, total_objects); fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects); /* * prune the dcache first as the icache is pinned by it, then * prune the icache, followed by the filesystem specific caches * * Ensure that we always scan at least one object - memcg kmem * accounting uses this to fully empty the caches. */ sc->nr_to_scan = dentries + 1; freed = prune_dcache_sb(sb, sc); sc->nr_to_scan = inodes + 1; freed += prune_icache_sb(sb, sc); if (fs_objects) { sc->nr_to_scan = fs_objects + 1; freed += sb->s_op->free_cached_objects(sb, sc); } super_unlock_shared(sb); return freed; } static unsigned long super_cache_count(struct shrinker *shrink, struct shrink_control *sc) { struct super_block *sb; long total_objects = 0; sb = shrink->private_data; /* * We don't call super_trylock_shared() here as it is a scalability * bottleneck, so we're exposed to partial setup state. The shrinker * rwsem does not protect filesystem operations backing * list_lru_shrink_count() or s_op->nr_cached_objects(). Counts can * change between super_cache_count and super_cache_scan, so we really * don't need locks here. 
* * However, if we are currently mounting the superblock, the underlying * filesystem might be in a state of partial construction and hence it * is dangerous to access it. super_trylock_shared() uses a SB_BORN check * to avoid this situation, so do the same here. The memory barrier is * matched with the one in mount_fs() as we don't hold locks here. */ if (!(sb->s_flags & SB_BORN)) return 0; smp_rmb(); if (sb->s_op && sb->s_op->nr_cached_objects) total_objects = sb->s_op->nr_cached_objects(sb, sc); total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc); if (!total_objects) return SHRINK_EMPTY; total_objects = vfs_pressure_ratio(total_objects); return total_objects; } static void destroy_super_work(struct work_struct *work) { struct super_block *s = container_of(work, struct super_block, destroy_work); fsnotify_sb_free(s); security_sb_free(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); for (int i = 0; i < SB_FREEZE_LEVELS; i++) percpu_free_rwsem(&s->s_writers.rw_sem[i]); kfree(s); } static void destroy_super_rcu(struct rcu_head *head) { struct super_block *s = container_of(head, struct super_block, rcu); INIT_WORK(&s->destroy_work, destroy_super_work); schedule_work(&s->destroy_work); } /* Free a superblock that has never been seen by anyone */ static void destroy_unused_super(struct super_block *s) { if (!s) return; super_unlock_excl(s); list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); shrinker_free(s->s_shrink); /* no delays needed */ destroy_super_work(&s->destroy_work); } /** * alloc_super - create new superblock * @type: filesystem type superblock should belong to * @flags: the mount flags * @user_ns: User namespace for the super_block * * Allocates and initializes a new &struct super_block. alloc_super() * returns a pointer new superblock or %NULL if allocation had failed. */ static struct super_block *alloc_super(struct file_system_type *type, int flags, struct user_namespace *user_ns) { struct super_block *s = kzalloc_obj(struct super_block); static const struct super_operations default_op; int i; if (!s) return NULL; s->s_user_ns = get_user_ns(user_ns); init_rwsem(&s->s_umount); lockdep_set_class(&s->s_umount, &type->s_umount_key); /* * sget() can have s_umount recursion. * * When it cannot find a suitable sb, it allocates a new * one (this one), and tries again to find a suitable old * one. * * In case that succeeds, it will acquire the s_umount * lock of the old one. Since these are clearly distrinct * locks, and this object isn't exposed yet, there's no * risk of deadlocks. * * Annotate this by putting this lock in a different * subclass. 
*/ down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); if (security_sb_alloc(s)) goto fail; for (i = 0; i < SB_FREEZE_LEVELS; i++) { if (__percpu_init_rwsem(&s->s_writers.rw_sem[i], sb_writers_name[i], &type->s_writers_key[i])) goto fail; } s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; if (s->s_user_ns != &init_user_ns) s->s_iflags |= SB_I_NODEV; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_roots); mutex_init(&s->s_sync_lock); INIT_LIST_HEAD(&s->s_inodes); spin_lock_init(&s->s_inode_list_lock); INIT_LIST_HEAD(&s->s_inodes_wb); spin_lock_init(&s->s_inode_wblist_lock); fserror_mount(s); s->s_count = 1; atomic_set(&s->s_active, 1); mutex_init(&s->s_vfs_rename_mutex); lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); init_rwsem(&s->s_dquot.dqio_sem); s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; s->s_time_min = TIME64_MIN; s->s_time_max = TIME64_MAX; s->s_shrink = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "sb-%s", type->name); if (!s->s_shrink) goto fail; s->s_shrink->scan_objects = super_cache_scan; s->s_shrink->count_objects = super_cache_count; s->s_shrink->batch = 1024; s->s_shrink->private_data = s; if (list_lru_init_memcg(&s->s_dentry_lru, s->s_shrink)) goto fail; if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink)) goto fail; s->s_min_writeback_pages = MIN_WRITEBACK_PAGES; return s; fail: destroy_unused_super(s); return NULL; } /* Superblock refcounting */ /* * Drop a superblock's refcount. The caller must hold sb_lock. */ static void __put_super(struct super_block *s) { if (!--s->s_count) { list_del_init(&s->s_list); WARN_ON(s->s_dentry_lru.node); WARN_ON(s->s_inode_lru.node); WARN_ON(s->s_mounts); call_rcu(&s->rcu, destroy_super_rcu); } } /** * put_super - drop a temporary reference to superblock * @sb: superblock in question * * Drops a temporary reference, frees superblock if there's no * references left. */ void put_super(struct super_block *sb) { spin_lock(&sb_lock); __put_super(sb); spin_unlock(&sb_lock); } static void kill_super_notify(struct super_block *sb) { lockdep_assert_not_held(&sb->s_umount); /* already notified earlier */ if (sb->s_flags & SB_DEAD) return; /* * Remove it from @fs_supers so it isn't found by new * sget{_fc}() walkers anymore. Any concurrent mounter still * managing to grab a temporary reference is guaranteed to * already see SB_DYING and will wait until we notify them about * SB_DEAD. */ spin_lock(&sb_lock); hlist_del_init(&sb->s_instances); spin_unlock(&sb_lock); /* * Let concurrent mounts know that this thing is really dead. * We don't need @sb->s_umount here as every concurrent caller * will see SB_DYING and either discard the superblock or wait * for SB_DEAD. */ super_wake(sb, SB_DEAD); } /** * deactivate_locked_super - drop an active reference to superblock * @s: superblock to deactivate * * Drops an active reference to superblock, converting it into a temporary * one if there is no other active references left. In that case we * tell fs driver to shut it down and drop the temporary reference we * had just acquired. * * Caller holds exclusive lock on superblock; that lock is released. */ void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { shrinker_free(s->s_shrink); fs->kill_sb(s); kill_super_notify(s); /* * Since list_lru_destroy() may sleep, we cannot call it from * put_super(), where we hold the sb_lock. Therefore we destroy * the lru lists right now. 
*/ list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); put_filesystem(fs); put_super(s); } else { super_unlock_excl(s); } } EXPORT_SYMBOL(deactivate_locked_super); /** * deactivate_super - drop an active reference to superblock * @s: superblock to deactivate * * Variant of deactivate_locked_super(), except that superblock is *not* * locked by caller. If we are going to drop the final active reference, * lock will be acquired prior to that. */ void deactivate_super(struct super_block *s) { if (!atomic_add_unless(&s->s_active, -1, 1)) { __super_lock_excl(s); deactivate_locked_super(s); } } EXPORT_SYMBOL(deactivate_super); /** * grab_super - acquire an active reference to a superblock * @sb: superblock to acquire * * Acquire a temporary reference on a superblock and try to trade it for * an active reference. This is used in sget{_fc}() to wait for a * superblock to either become SB_BORN or for it to pass through * sb->kill() and be marked as SB_DEAD. * * Return: This returns true if an active reference could be acquired, * false if not. */ static bool grab_super(struct super_block *sb) { bool locked; sb->s_count++; spin_unlock(&sb_lock); locked = super_lock_excl(sb); if (locked) { if (atomic_inc_not_zero(&sb->s_active)) { put_super(sb); return true; } super_unlock_excl(sb); } wait_var_event(&sb->s_flags, super_flags(sb, SB_DEAD)); put_super(sb); return false; } /* * super_trylock_shared - try to grab ->s_umount shared * @sb: reference we are trying to grab * * Try to prevent fs shutdown. This is used in places where we * cannot take an active reference but we need to ensure that the * filesystem is not shut down while we are working on it. It returns * false if we cannot acquire s_umount or if we lose the race and * filesystem already got into shutdown, and returns true with the s_umount * lock held in read mode in case of success. On successful return, * the caller must drop the s_umount lock when done. * * Note that unlike get_super() et.al. this one does *not* bump ->s_count. * The reason why it's safe is that we are OK with doing trylock instead * of down_read(). There's a couple of places that are OK with that, but * it's very much not a general-purpose interface. */ bool super_trylock_shared(struct super_block *sb) { if (down_read_trylock(&sb->s_umount)) { if (!(sb->s_flags & SB_DYING) && sb->s_root && (sb->s_flags & SB_BORN)) return true; super_unlock_shared(sb); } return false; } /** * retire_super - prevents superblock from being reused * @sb: superblock to retire * * The function marks superblock to be ignored in superblock test, which * prevents it from being reused for any new mounts. If the superblock has * a private bdi, it also unregisters it, but doesn't reduce the refcount * of the superblock to prevent potential races. The refcount is reduced * by generic_shutdown_super(). The function can not be called * concurrently with generic_shutdown_super(). It is safe to call the * function multiple times, subsequent calls have no effect. * * The marker will affect the re-use only for block-device-based * superblocks. Other superblocks will still get marked if this function * is used, but that will not affect their reusability. 
*/ void retire_super(struct super_block *sb) { WARN_ON(!sb->s_bdev); __super_lock_excl(sb); if (sb->s_iflags & SB_I_PERSB_BDI) { bdi_unregister(sb->s_bdi); sb->s_iflags &= ~SB_I_PERSB_BDI; } sb->s_iflags |= SB_I_RETIRED; super_unlock_excl(sb); } EXPORT_SYMBOL(retire_super); /** * generic_shutdown_super - common helper for ->kill_sb() * @sb: superblock to kill * * generic_shutdown_super() does all fs-independent work on superblock * shutdown. Typical ->kill_sb() should pick all fs-specific objects * that need destruction out of superblock, call generic_shutdown_super() * and release aforementioned objects. Note: dentries and inodes _are_ * taken care of and do not need specific handling. * * Upon calling this function, the filesystem may no longer alter or * rearrange the set of dentries belonging to this super_block, nor may it * change the attachments of dentries to inodes. */ void generic_shutdown_super(struct super_block *sb) { const struct super_operations *sop = sb->s_op; if (sb->s_root) { fsnotify_sb_delete(sb); shrink_dcache_for_umount(sb); sync_filesystem(sb); sb->s_flags &= ~SB_ACTIVE; fserror_unmount(sb); cgroup_writeback_umount(sb); /* Evict all inodes with zero refcount. */ evict_inodes(sb); /* * Clean up and evict any inodes that still have references due * to the security policy. */ security_sb_delete(sb); if (sb->s_dio_done_wq) { destroy_workqueue(sb->s_dio_done_wq); sb->s_dio_done_wq = NULL; } if (sop->put_super) sop->put_super(sb); /* * Now that all potentially-encrypted inodes have been evicted, * the fscrypt keyring can be destroyed. */ fscrypt_destroy_keyring(sb); if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes), NULL, "VFS: Busy inodes after unmount of %s (%s)", sb->s_id, sb->s_type->name)) { /* * Adding a proper bailout path here would be hard, but * we can at least make it more likely that a later * iput_final() or such crashes cleanly. */ struct inode *inode; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { inode->i_op = VFS_PTR_POISON; inode->i_sb = VFS_PTR_POISON; inode->i_mapping = VFS_PTR_POISON; } spin_unlock(&sb->s_inode_list_lock); } } /* * Broadcast to everyone that grabbed a temporary reference to this * superblock before we removed it from @fs_supers that the superblock * is dying. Every walker of @fs_supers outside of sget{_fc}() will now * discard this superblock and treat it as dead. * * We leave the superblock on @fs_supers so it can be found by * sget{_fc}() until we passed sb->kill_sb(). */ super_wake(sb, SB_DYING); super_unlock_excl(sb); if (sb->s_bdi != &noop_backing_dev_info) { if (sb->s_iflags & SB_I_PERSB_BDI) bdi_unregister(sb->s_bdi); bdi_put(sb->s_bdi); sb->s_bdi = &noop_backing_dev_info; } } EXPORT_SYMBOL(generic_shutdown_super); bool mount_capable(struct fs_context *fc) { if (!(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) return capable(CAP_SYS_ADMIN); else return ns_capable(fc->user_ns, CAP_SYS_ADMIN); } /** * sget_fc - Find or create a superblock * @fc: Filesystem context. * @test: Comparison callback * @set: Setup callback * * Create a new superblock or find an existing one. * * The @test callback is used to find a matching existing superblock. * Whether or not the requested parameters in @fc are taken into account * is specific to the @test callback that is used. They may even be * completely ignored. 
* * If an extant superblock is matched, it will be returned unless: * * (1) the namespace the filesystem context @fc and the extant * superblock's namespace differ * * (2) the filesystem context @fc has requested that reusing an extant * superblock is not allowed * * In both cases EBUSY will be returned. * * If no match is made, a new superblock will be allocated and basic * initialisation will be performed (s_type, s_fs_info and s_id will be * set and the @set callback will be invoked), the superblock will be * published and it will be returned in a partially constructed state * with SB_BORN and SB_ACTIVE as yet unset. * * Return: On success, an extant or newly created superblock is * returned. On failure an error pointer is returned. */ struct super_block *sget_fc(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*set)(struct super_block *, struct fs_context *)) { struct super_block *s = NULL; struct super_block *old; struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns; int err; /* * Never allow s_user_ns != &init_user_ns when FS_USERNS_MOUNT is * not set, as the filesystem is likely unprepared to handle it. * This can happen when fsconfig() is called from init_user_ns with * an fs_fd opened in another user namespace. */ if (user_ns != &init_user_ns && !(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) { errorfc(fc, "VFS: Mounting from non-initial user namespace is not allowed"); return ERR_PTR(-EPERM); } retry: spin_lock(&sb_lock); if (test) { hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) { if (test(old, fc)) goto share_extant_sb; } } if (!s) { spin_unlock(&sb_lock); s = alloc_super(fc->fs_type, fc->sb_flags, user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; } s->s_fs_info = fc->s_fs_info; err = set(s, fc); if (err) { s->s_fs_info = NULL; spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(err); } fc->s_fs_info = NULL; s->s_type = fc->fs_type; s->s_iflags |= fc->s_iflags; strscpy(s->s_id, s->s_type->name, sizeof(s->s_id)); /* * Make the superblock visible on @super_blocks and @fs_supers. * It's in a nascent state and users should wait on SB_BORN or * SB_DYING to be set. 
*/ list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &s->s_type->fs_supers); spin_unlock(&sb_lock); get_filesystem(s->s_type); shrinker_register(s->s_shrink); return s; share_extant_sb: if (user_ns != old->s_user_ns || fc->exclusive) { spin_unlock(&sb_lock); destroy_unused_super(s); if (fc->exclusive) warnfc(fc, "reusing existing filesystem not allowed"); else warnfc(fc, "reusing existing filesystem in another namespace not allowed"); return ERR_PTR(-EBUSY); } if (!grab_super(old)) goto retry; destroy_unused_super(s); return old; } EXPORT_SYMBOL(sget_fc); /** * sget - find or create a superblock * @type: filesystem type superblock should belong to * @test: comparison callback * @set: setup callback * @flags: mount flags * @data: argument to each of them */ struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), int flags, void *data) { struct user_namespace *user_ns = current_user_ns(); struct super_block *s = NULL; struct super_block *old; int err; retry: spin_lock(&sb_lock); if (test) { hlist_for_each_entry(old, &type->fs_supers, s_instances) { if (!test(old, data)) continue; if (user_ns != old->s_user_ns) { spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(-EBUSY); } if (!grab_super(old)) goto retry; destroy_unused_super(s); return old; } } if (!s) { spin_unlock(&sb_lock); s = alloc_super(type, flags, user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; } err = set(s, data); if (err) { spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(err); } s->s_type = type; strscpy(s->s_id, type->name, sizeof(s->s_id)); list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &type->fs_supers); spin_unlock(&sb_lock); get_filesystem(type); shrinker_register(s->s_shrink); return s; } EXPORT_SYMBOL(sget); void drop_super(struct super_block *sb) { super_unlock_shared(sb); put_super(sb); } EXPORT_SYMBOL(drop_super); void drop_super_exclusive(struct super_block *sb) { super_unlock_excl(sb); put_super(sb); } EXPORT_SYMBOL(drop_super_exclusive); enum super_iter_flags_t { SUPER_ITER_EXCL = (1U << 0), SUPER_ITER_UNLOCKED = (1U << 1), SUPER_ITER_REVERSE = (1U << 2), }; static inline struct super_block *first_super(enum super_iter_flags_t flags) { if (flags & SUPER_ITER_REVERSE) return list_last_entry(&super_blocks, struct super_block, s_list); return list_first_entry(&super_blocks, struct super_block, s_list); } static inline struct super_block *next_super(struct super_block *sb, enum super_iter_flags_t flags) { if (flags & SUPER_ITER_REVERSE) return list_prev_entry(sb, s_list); return list_next_entry(sb, s_list); } static void __iterate_supers(void (*f)(struct super_block *, void *), void *arg, enum super_iter_flags_t flags) { struct super_block *sb, *p = NULL; bool excl = flags & SUPER_ITER_EXCL; guard(spinlock)(&sb_lock); for (sb = first_super(flags); !list_entry_is_head(sb, &super_blocks, s_list); sb = next_super(sb, flags)) { if (super_flags(sb, SB_DYING)) continue; sb->s_count++; spin_unlock(&sb_lock); if (flags & SUPER_ITER_UNLOCKED) { f(sb, arg); } else if (super_lock(sb, excl)) { f(sb, arg); super_unlock(sb, excl); } spin_lock(&sb_lock); if (p) __put_super(p); p = sb; } if (p) __put_super(p); } void iterate_supers(void (*f)(struct super_block *, void *), void *arg) { __iterate_supers(f, arg, 0); } /** * iterate_supers_type - call function for superblocks of given type * @type: fs type * @f: function to call * @arg: argument to pass to it * * Scans the 
superblock list and calls given function, passing it * locked superblock and given argument. */ void iterate_supers_type(struct file_system_type *type, void (*f)(struct super_block *, void *), void *arg) { struct super_block *sb, *p = NULL; spin_lock(&sb_lock); hlist_for_each_entry(sb, &type->fs_supers, s_instances) { bool locked; if (super_flags(sb, SB_DYING)) continue; sb->s_count++; spin_unlock(&sb_lock); locked = super_lock_shared(sb); if (locked) { f(sb, arg); super_unlock_shared(sb); } spin_lock(&sb_lock); if (p) __put_super(p); p = sb; } if (p) __put_super(p); spin_unlock(&sb_lock); } EXPORT_SYMBOL(iterate_supers_type); struct super_block *user_get_super(dev_t dev, bool excl) { struct super_block *sb; spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { bool locked; if (sb->s_dev != dev) continue; sb->s_count++; spin_unlock(&sb_lock); locked = super_lock(sb, excl); if (locked) return sb; spin_lock(&sb_lock); __put_super(sb); break; } spin_unlock(&sb_lock); return NULL; } /** * reconfigure_super - asks filesystem to change superblock parameters * @fc: The superblock and configuration * * Alters the configuration parameters of a live superblock. */ int reconfigure_super(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; int retval; bool remount_ro = false; bool remount_rw = false; bool force = fc->sb_flags & SB_FORCE; if (fc->sb_flags_mask & ~MS_RMT_MASK) return -EINVAL; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; retval = security_sb_remount(sb, fc->security); if (retval) return retval; if (fc->sb_flags_mask & SB_RDONLY) { #ifdef CONFIG_BLOCK if (!(fc->sb_flags & SB_RDONLY) && sb->s_bdev && bdev_read_only(sb->s_bdev)) return -EACCES; #endif remount_rw = !(fc->sb_flags & SB_RDONLY) && sb_rdonly(sb); remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb); } if (remount_ro) { if (!hlist_empty(&sb->s_pins)) { super_unlock_excl(sb); group_pin_kill(&sb->s_pins); __super_lock_excl(sb); if (!sb->s_root) return 0; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; remount_ro = !sb_rdonly(sb); } } shrink_dcache_sb(sb); /* If we are reconfiguring to RDONLY and current sb is read/write, * make sure there are no files open for writing. */ if (remount_ro) { if (force) { sb_start_ro_state_change(sb); } else { retval = sb_prepare_remount_readonly(sb); if (retval) return retval; } } else if (remount_rw) { /* * Protect filesystem's reconfigure code from writes from * userspace until reconfigure finishes. */ sb_start_ro_state_change(sb); } if (fc->ops->reconfigure) { retval = fc->ops->reconfigure(fc); if (retval) { if (!force) goto cancel_readonly; /* If forced remount, go ahead despite any errors */ WARN(1, "forced remount of a %s fs returned %i\n", sb->s_type->name, retval); } } WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) | (fc->sb_flags & fc->sb_flags_mask))); sb_end_ro_state_change(sb); /* * Some filesystems modify their metadata via some other path than the * bdev buffer cache (eg. use a private mapping, or directories in * pagecache, etc). Also file data modifications go via their own * mappings. So If we try to mount readonly then copy the filesystem * from bdev, we could get stale data, so invalidate it to give a best * effort at coherency. 
*/ if (remount_ro && sb->s_bdev) invalidate_bdev(sb->s_bdev); return 0; cancel_readonly: sb_end_ro_state_change(sb); return retval; } static void do_emergency_remount_callback(struct super_block *sb, void *unused) { if (sb->s_bdev && !sb_rdonly(sb)) { struct fs_context *fc; fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY | SB_FORCE, SB_RDONLY); if (!IS_ERR(fc)) { if (parse_monolithic_mount_data(fc, NULL) == 0) (void)reconfigure_super(fc); put_fs_context(fc); } } } static void do_emergency_remount(struct work_struct *work) { __iterate_supers(do_emergency_remount_callback, NULL, SUPER_ITER_EXCL | SUPER_ITER_REVERSE); kfree(work); printk("Emergency Remount complete\n"); } void emergency_remount(void) { struct work_struct *work; work = kmalloc_obj(*work, GFP_ATOMIC); if (work) { INIT_WORK(work, do_emergency_remount); schedule_work(work); } } static void do_thaw_all_callback(struct super_block *sb, void *unused) { if (IS_ENABLED(CONFIG_BLOCK)) while (sb->s_bdev && !bdev_thaw(sb->s_bdev)) pr_warn("Emergency Thaw on %pg\n", sb->s_bdev); thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE, NULL); return; } static void do_thaw_all(struct work_struct *work) { __iterate_supers(do_thaw_all_callback, NULL, SUPER_ITER_EXCL); kfree(work); printk(KERN_WARNING "Emergency Thaw complete\n"); } /** * emergency_thaw_all -- forcibly thaw every frozen filesystem * * Used for emergency unfreeze of all filesystems via SysRq */ void emergency_thaw_all(void) { struct work_struct *work; work = kmalloc_obj(*work, GFP_ATOMIC); if (work) { INIT_WORK(work, do_thaw_all); schedule_work(work); } } static inline bool get_active_super(struct super_block *sb) { bool active = false; if (super_lock_excl(sb)) { active = atomic_inc_not_zero(&sb->s_active); super_unlock_excl(sb); } return active; } static const char *filesystems_freeze_ptr = "filesystems_freeze"; static void filesystems_freeze_callback(struct super_block *sb, void *freeze_all_ptr) { if (!sb->s_op->freeze_fs && !sb->s_op->freeze_super) return; if (!freeze_all_ptr && !(sb->s_type->fs_flags & FS_POWER_FREEZE)) return; if (!get_active_super(sb)) return; if (sb->s_op->freeze_super) sb->s_op->freeze_super(sb, FREEZE_EXCL | FREEZE_HOLDER_KERNEL, filesystems_freeze_ptr); else freeze_super(sb, FREEZE_EXCL | FREEZE_HOLDER_KERNEL, filesystems_freeze_ptr); deactivate_super(sb); } void filesystems_freeze(bool freeze_all) { void *freeze_all_ptr = NULL; if (freeze_all) freeze_all_ptr = &freeze_all; __iterate_supers(filesystems_freeze_callback, freeze_all_ptr, SUPER_ITER_UNLOCKED | SUPER_ITER_REVERSE); } static void filesystems_thaw_callback(struct super_block *sb, void *unused) { if (!sb->s_op->freeze_fs && !sb->s_op->freeze_super) return; if (!get_active_super(sb)) return; if (sb->s_op->thaw_super) sb->s_op->thaw_super(sb, FREEZE_EXCL | FREEZE_HOLDER_KERNEL, filesystems_freeze_ptr); else thaw_super(sb, FREEZE_EXCL | FREEZE_HOLDER_KERNEL, filesystems_freeze_ptr); deactivate_super(sb); } void filesystems_thaw(void) { __iterate_supers(filesystems_thaw_callback, NULL, SUPER_ITER_UNLOCKED); } static DEFINE_IDA(unnamed_dev_ida); /** * get_anon_bdev - Allocate a block device for filesystems which don't have one. * @p: Pointer to a dev_t. * * Filesystems which don't use real block devices can call this function * to allocate a virtual block device. * * Context: Any context. Frequently called while holding sb_lock. * Return: 0 on success, -EMFILE if there are no anonymous bdevs left * or -ENOMEM if memory allocation failed. 
*/ int get_anon_bdev(dev_t *p) { int dev; /* * Many userspace utilities consider an FSID of 0 invalid. * Always return at least 1 from get_anon_bdev. */ dev = ida_alloc_range(&unnamed_dev_ida, 1, (1 << MINORBITS) - 1, GFP_ATOMIC); if (dev == -ENOSPC) dev = -EMFILE; if (dev < 0) return dev; *p = MKDEV(0, dev); return 0; } EXPORT_SYMBOL(get_anon_bdev); void free_anon_bdev(dev_t dev) { ida_free(&unnamed_dev_ida, MINOR(dev)); } EXPORT_SYMBOL(free_anon_bdev); int set_anon_super(struct super_block *s, void *data) { return get_anon_bdev(&s->s_dev); } EXPORT_SYMBOL(set_anon_super); void kill_anon_super(struct super_block *sb) { dev_t dev = sb->s_dev; generic_shutdown_super(sb); kill_super_notify(sb); free_anon_bdev(dev); } EXPORT_SYMBOL(kill_anon_super); int set_anon_super_fc(struct super_block *sb, struct fs_context *fc) { return set_anon_super(sb, NULL); } EXPORT_SYMBOL(set_anon_super_fc); static int test_keyed_super(struct super_block *sb, struct fs_context *fc) { return sb->s_fs_info == fc->s_fs_info; } static int test_single_super(struct super_block *s, struct fs_context *fc) { return 1; } static int vfs_get_super(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { struct super_block *sb; int err; sb = sget_fc(fc, test, set_anon_super_fc); if (IS_ERR(sb)) return PTR_ERR(sb); if (!sb->s_root) { err = fill_super(sb, fc); if (err) goto error; sb->s_flags |= SB_ACTIVE; } fc->root = dget(sb->s_root); return 0; error: deactivate_locked_super(sb); return err; } int get_tree_nodev(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { return vfs_get_super(fc, NULL, fill_super); } EXPORT_SYMBOL(get_tree_nodev); int get_tree_single(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { return vfs_get_super(fc, test_single_super, fill_super); } EXPORT_SYMBOL(get_tree_single); int get_tree_keyed(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc), void *key) { fc->s_fs_info = key; return vfs_get_super(fc, test_keyed_super, fill_super); } EXPORT_SYMBOL(get_tree_keyed); static int set_bdev_super(struct super_block *s, void *data) { s->s_dev = *(dev_t *)data; return 0; } static int super_s_dev_set(struct super_block *s, struct fs_context *fc) { return set_bdev_super(s, fc->sget_key); } static int super_s_dev_test(struct super_block *s, struct fs_context *fc) { return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)fc->sget_key; } /** * sget_dev - Find or create a superblock by device number * @fc: Filesystem context. * @dev: device number * * Find or create a superblock using the provided device number that * will be stored in fc->sget_key. * * If an extant superblock is matched, then that will be returned with * an elevated reference count that the caller must transfer or discard. * * If no match is made, a new superblock will be allocated and basic * initialisation will be performed (s_type, s_fs_info, s_id, s_dev will * be set). The superblock will be published and it will be returned in * a partially constructed state with SB_BORN and SB_ACTIVE as yet * unset. * * Return: an existing or newly created superblock on success, an error * pointer on failure. */ struct super_block *sget_dev(struct fs_context *fc, dev_t dev) { fc->sget_key = &dev; return sget_fc(fc, super_s_dev_test, super_s_dev_set); } EXPORT_SYMBOL(sget_dev); #ifdef CONFIG_BLOCK /* * Lock the superblock that is holder of the bdev. 
Returns the superblock * pointer if we successfully locked the superblock and it is alive. Otherwise * we return NULL and just unlock bdev->bd_holder_lock. * * The function must be called with bdev->bd_holder_lock and releases it. */ static struct super_block *bdev_super_lock(struct block_device *bdev, bool excl) __releases(&bdev->bd_holder_lock) { struct super_block *sb = bdev->bd_holder; bool locked; lockdep_assert_held(&bdev->bd_holder_lock); lockdep_assert_not_held(&sb->s_umount); lockdep_assert_not_held(&bdev->bd_disk->open_mutex); /* Make sure sb doesn't go away from under us */ spin_lock(&sb_lock); sb->s_count++; spin_unlock(&sb_lock); mutex_unlock(&bdev->bd_holder_lock); locked = super_lock(sb, excl); /* * If the superblock wasn't already SB_DYING then we hold * s_umount and can safely drop our temporary reference. */ put_super(sb); if (!locked) return NULL; if (!sb->s_root || !(sb->s_flags & SB_ACTIVE)) { super_unlock(sb, excl); return NULL; } return sb; } static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise) { struct super_block *sb; sb = bdev_super_lock(bdev, false); if (!sb) return; if (sb->s_op->remove_bdev) { int ret; ret = sb->s_op->remove_bdev(sb, bdev); if (!ret) { super_unlock_shared(sb); return; } /* Fallback to shutdown. */ } if (!surprise) sync_filesystem(sb); shrink_dcache_sb(sb); evict_inodes(sb); if (sb->s_op->shutdown) sb->s_op->shutdown(sb); super_unlock_shared(sb); } static void fs_bdev_sync(struct block_device *bdev) { struct super_block *sb; sb = bdev_super_lock(bdev, false); if (!sb) return; sync_filesystem(sb); super_unlock_shared(sb); } static struct super_block *get_bdev_super(struct block_device *bdev) { bool active = false; struct super_block *sb; sb = bdev_super_lock(bdev, true); if (sb) { active = atomic_inc_not_zero(&sb->s_active); super_unlock_excl(sb); } if (!active) return NULL; return sb; } /** * fs_bdev_freeze - freeze owning filesystem of block device * @bdev: block device * * Freeze the filesystem that owns this block device if it is still * active. * * A filesystem that owns multiple block devices may be frozen from each * block device and won't be unfrozen until all block devices are * unfrozen. Each block device can only freeze the filesystem once as we * nest freezes for block devices in the block layer. * * Return: If the freeze was successful zero is returned. If the freeze * failed a negative error code is returned. */ static int fs_bdev_freeze(struct block_device *bdev) { struct super_block *sb; int error = 0; lockdep_assert_held(&bdev->bd_fsfreeze_mutex); sb = get_bdev_super(bdev); if (!sb) return -EINVAL; if (sb->s_op->freeze_super) error = sb->s_op->freeze_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL); else error = freeze_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL); if (!error) error = sync_blockdev(bdev); deactivate_super(sb); return error; } /** * fs_bdev_thaw - thaw owning filesystem of block device * @bdev: block device * * Thaw the filesystem that owns this block device. * * A filesystem that owns multiple block devices may be frozen from each * block device and won't be unfrozen until all block devices are * unfrozen. Each block device can only freeze the filesystem once as we * nest freezes for block devices in the block layer. * * Return: If the thaw was successful zero is returned. If the thaw * failed a negative error code is returned. 
If this function * returns zero it doesn't mean that the filesystem is unfrozen * as it may have been frozen multiple times (kernel may hold a * freeze or might be frozen from other block devices). */ static int fs_bdev_thaw(struct block_device *bdev) { struct super_block *sb; int error; lockdep_assert_held(&bdev->bd_fsfreeze_mutex); /* * The block device may have been frozen before it was claimed by a * filesystem. Concurrently another process might try to mount that * frozen block device and has temporarily claimed the block device for * that purpose causing a concurrent fs_bdev_thaw() to end up here. The * mounter is already about to abort mounting because they still saw an * elevanted bdev->bd_fsfreeze_count so get_bdev_super() will return * NULL in that case. */ sb = get_bdev_super(bdev); if (!sb) return -EINVAL; if (sb->s_op->thaw_super) error = sb->s_op->thaw_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL); else error = thaw_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL); deactivate_super(sb); return error; } const struct blk_holder_ops fs_holder_ops = { .mark_dead = fs_bdev_mark_dead, .sync = fs_bdev_sync, .freeze = fs_bdev_freeze, .thaw = fs_bdev_thaw, }; EXPORT_SYMBOL_GPL(fs_holder_ops); int setup_bdev_super(struct super_block *sb, int sb_flags, struct fs_context *fc) { blk_mode_t mode = sb_open_mode(sb_flags); struct file *bdev_file; struct block_device *bdev; bdev_file = bdev_file_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops); if (IS_ERR(bdev_file)) { if (fc) errorf(fc, "%s: Can't open blockdev", fc->source); return PTR_ERR(bdev_file); } bdev = file_bdev(bdev_file); /* * This really should be in blkdev_get_by_dev, but right now can't due * to legacy issues that require us to allow opening a block device node * writable from userspace even for a read-only block device. */ if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { bdev_fput(bdev_file); return -EACCES; } /* * It is enough to check bdev was not frozen before we set * s_bdev as freezing will wait until SB_BORN is set. */ if (atomic_read(&bdev->bd_fsfreeze_count) > 0) { if (fc) warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); bdev_fput(bdev_file); return -EBUSY; } spin_lock(&sb_lock); sb->s_bdev_file = bdev_file; sb->s_bdev = bdev; sb->s_bdi = bdi_get(bdev->bd_disk->bdi); if (bdev_stable_writes(bdev)) sb->s_iflags |= SB_I_STABLE_WRITES; spin_unlock(&sb_lock); snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(sb->s_shrink, "sb-%s:%s", sb->s_type->name, sb->s_id); sb_set_blocksize(sb, block_size(bdev)); return 0; } EXPORT_SYMBOL_GPL(setup_bdev_super); /** * get_tree_bdev_flags - Get a superblock based on a single block device * @fc: The filesystem context holding the parameters * @fill_super: Helper to initialise a new superblock * @flags: GET_TREE_BDEV_* flags */ int get_tree_bdev_flags(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc), unsigned int flags) { struct super_block *s; int error = 0; dev_t dev; if (!fc->source) return invalf(fc, "No source specified"); error = lookup_bdev(fc->source, &dev); if (error) { if (!(flags & GET_TREE_BDEV_QUIET_LOOKUP)) errorf(fc, "%s: Can't lookup blockdev", fc->source); return error; } fc->sb_flags |= SB_NOSEC; s = sget_dev(fc, dev); if (IS_ERR(s)) return PTR_ERR(s); if (s->s_root) { /* Don't summarily change the RO/RW state. 
*/ if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) { warnf(fc, "%pg: Can't mount, would change RO state", s->s_bdev); deactivate_locked_super(s); return -EBUSY; } } else { error = setup_bdev_super(s, fc->sb_flags, fc); if (!error) error = fill_super(s, fc); if (error) { deactivate_locked_super(s); return error; } s->s_flags |= SB_ACTIVE; } BUG_ON(fc->root); fc->root = dget(s->s_root); return 0; } EXPORT_SYMBOL_GPL(get_tree_bdev_flags); /** * get_tree_bdev - Get a superblock based on a single block device * @fc: The filesystem context holding the parameters * @fill_super: Helper to initialise a new superblock */ int get_tree_bdev(struct fs_context *fc, int (*fill_super)(struct super_block *, struct fs_context *)) { return get_tree_bdev_flags(fc, fill_super, 0); } EXPORT_SYMBOL(get_tree_bdev); void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; generic_shutdown_super(sb); if (bdev) { sync_blockdev(bdev); bdev_fput(sb->s_bdev_file); } } EXPORT_SYMBOL(kill_block_super); #endif /** * vfs_get_tree - Get the mountable root * @fc: The superblock configuration context. * * The filesystem is invoked to get or create a superblock which can then later * be used for mounting. The filesystem places a pointer to the root to be * used for mounting in @fc->root. */ int vfs_get_tree(struct fs_context *fc) { struct super_block *sb; int error; if (fc->root) return -EBUSY; /* Get the mountable root in fc->root, with a ref on the root and a ref * on the superblock. */ error = fc->ops->get_tree(fc); if (error < 0) return error; if (!fc->root) { pr_err("Filesystem %s get_tree() didn't set fc->root, returned %i\n", fc->fs_type->name, error); /* We don't know what the locking state of the superblock is - * if there is a superblock. */ BUG(); } sb = fc->root->d_sb; WARN_ON(!sb->s_bdi); /* * super_wake() contains a memory barrier which also care of * ordering for super_cache_count(). We place it before setting * SB_BORN as the data dependency between the two functions is * the superblock structure contents that we just set up, not * the SB_BORN flag. */ super_wake(sb, SB_BORN); error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL); if (unlikely(error)) { fc_drop_locked(fc); return error; } /* * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE * but s_maxbytes was an unsigned long long for many releases. Throw * this warning for a little while to try and catch filesystems that * violate this rule. */ WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " "negative value (%lld)\n", fc->fs_type->name, sb->s_maxbytes); return 0; } EXPORT_SYMBOL(vfs_get_tree); /* * Setup private BDI for given superblock. It gets automatically cleaned up * in generic_shutdown_super(). */ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) { struct backing_dev_info *bdi; int err; va_list args; bdi = bdi_alloc(NUMA_NO_NODE); if (!bdi) return -ENOMEM; va_start(args, fmt); err = bdi_register_va(bdi, fmt, args); va_end(args); if (err) { bdi_put(bdi); return err; } WARN_ON(sb->s_bdi != &noop_backing_dev_info); sb->s_bdi = bdi; sb->s_iflags |= SB_I_PERSB_BDI; return 0; } EXPORT_SYMBOL(super_setup_bdi_name); /* * Setup private BDI for given superblock. I gets automatically cleaned up * in generic_shutdown_super(). 
*/ int super_setup_bdi(struct super_block *sb) { static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name, atomic_long_inc_return(&bdi_seq)); } EXPORT_SYMBOL(super_setup_bdi); /** * sb_wait_write - wait until all writers to given file system finish * @sb: the super for which we wait * @level: type of writers we wait for (normal vs page fault) * * This function waits until there are no writers of given type to given file * system. */ static void sb_wait_write(struct super_block *sb, int level) { percpu_down_write(sb->s_writers.rw_sem + level-1); } /* * We are going to return to userspace and forget about these locks, the * ownership goes to the caller of thaw_super() which does unlock(). */ static void lockdep_sb_freeze_release(struct super_block *sb) { int level; for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) percpu_rwsem_release(sb->s_writers.rw_sem + level, _THIS_IP_); } /* * Tell lockdep we are holding these locks before we call ->unfreeze_fs(sb). */ static void lockdep_sb_freeze_acquire(struct super_block *sb) { int level; for (level = 0; level < SB_FREEZE_LEVELS; ++level) percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); } static void sb_freeze_unlock(struct super_block *sb, int level) { for (level--; level >= 0; level--) percpu_up_write(sb->s_writers.rw_sem + level); } static int wait_for_partially_frozen(struct super_block *sb) { int ret = 0; do { unsigned short old = sb->s_writers.frozen; up_write(&sb->s_umount); ret = wait_var_event_killable(&sb->s_writers.frozen, sb->s_writers.frozen != old); down_write(&sb->s_umount); } while (ret == 0 && sb->s_writers.frozen != SB_UNFROZEN && sb->s_writers.frozen != SB_FREEZE_COMPLETE); return ret; } #define FREEZE_HOLDERS (FREEZE_HOLDER_KERNEL | FREEZE_HOLDER_USERSPACE) #define FREEZE_FLAGS (FREEZE_HOLDERS | FREEZE_MAY_NEST | FREEZE_EXCL) static inline int freeze_inc(struct super_block *sb, enum freeze_holder who) { WARN_ON_ONCE((who & ~FREEZE_FLAGS)); WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); if (who & FREEZE_HOLDER_KERNEL) ++sb->s_writers.freeze_kcount; if (who & FREEZE_HOLDER_USERSPACE) ++sb->s_writers.freeze_ucount; return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount; } static inline int freeze_dec(struct super_block *sb, enum freeze_holder who) { WARN_ON_ONCE((who & ~FREEZE_FLAGS)); WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); if ((who & FREEZE_HOLDER_KERNEL) && sb->s_writers.freeze_kcount) --sb->s_writers.freeze_kcount; if ((who & FREEZE_HOLDER_USERSPACE) && sb->s_writers.freeze_ucount) --sb->s_writers.freeze_ucount; return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount; } static inline bool may_freeze(struct super_block *sb, enum freeze_holder who, const void *freeze_owner) { lockdep_assert_held(&sb->s_umount); WARN_ON_ONCE((who & ~FREEZE_FLAGS)); WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); if (who & FREEZE_EXCL) { if (WARN_ON_ONCE(!(who & FREEZE_HOLDER_KERNEL))) return false; if (WARN_ON_ONCE(who & ~(FREEZE_EXCL | FREEZE_HOLDER_KERNEL))) return false; if (WARN_ON_ONCE(!freeze_owner)) return false; /* This freeze already has a specific owner. */ if (sb->s_writers.freeze_owner) return false; /* * This is already frozen multiple times so we're just * going to take a reference count and mark the freeze as * being owned by the caller. 
*/ if (sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount) sb->s_writers.freeze_owner = freeze_owner; return true; } if (who & FREEZE_HOLDER_KERNEL) return (who & FREEZE_MAY_NEST) || sb->s_writers.freeze_kcount == 0; if (who & FREEZE_HOLDER_USERSPACE) return (who & FREEZE_MAY_NEST) || sb->s_writers.freeze_ucount == 0; return false; } static inline bool may_unfreeze(struct super_block *sb, enum freeze_holder who, const void *freeze_owner) { lockdep_assert_held(&sb->s_umount); WARN_ON_ONCE((who & ~FREEZE_FLAGS)); WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); if (who & FREEZE_EXCL) { if (WARN_ON_ONCE(!(who & FREEZE_HOLDER_KERNEL))) return false; if (WARN_ON_ONCE(who & ~(FREEZE_EXCL | FREEZE_HOLDER_KERNEL))) return false; if (WARN_ON_ONCE(!freeze_owner)) return false; if (WARN_ON_ONCE(sb->s_writers.freeze_kcount == 0)) return false; /* This isn't exclusively frozen. */ if (!sb->s_writers.freeze_owner) return false; /* This isn't exclusively frozen by us. */ if (sb->s_writers.freeze_owner != freeze_owner) return false; /* * This is still frozen multiple times so we're just * going to drop our reference count and undo our * exclusive freeze. */ if ((sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount) > 1) sb->s_writers.freeze_owner = NULL; return true; } if (who & FREEZE_HOLDER_KERNEL) { /* * Someone's trying to steal the reference belonging to * @sb->s_writers.freeze_owner. */ if (sb->s_writers.freeze_kcount == 1 && sb->s_writers.freeze_owner) return false; return sb->s_writers.freeze_kcount > 0; } if (who & FREEZE_HOLDER_USERSPACE) return sb->s_writers.freeze_ucount > 0; return false; } /** * freeze_super - lock the filesystem and force it into a consistent state * @sb: the super to lock * @who: context that wants to freeze * @freeze_owner: owner of the freeze * * Syncs the super to make sure the filesystem is consistent and calls the fs's * freeze_fs. Subsequent calls to this without first thawing the fs may return * -EBUSY. * * @who should be: * * %FREEZE_HOLDER_USERSPACE if userspace wants to freeze the fs; * * %FREEZE_HOLDER_KERNEL if the kernel wants to freeze the fs. * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed. * * The @who argument distinguishes between the kernel and userspace trying to * freeze the filesystem. Although there cannot be multiple kernel freezes or * multiple userspace freezes in effect at any given time, the kernel and * userspace can both hold a filesystem frozen. The filesystem remains frozen * until there are no kernel or userspace freezes in effect. * * A filesystem may hold multiple devices and thus a filesystems may be * frozen through the block layer via multiple block devices. In this * case the request is marked as being allowed to nest by passing * FREEZE_MAY_NEST. The filesystem remains frozen until all block * devices are unfrozen. If multiple freezes are attempted without * FREEZE_MAY_NEST -EBUSY will be returned. * * During this function, sb->s_writers.frozen goes through these values: * * SB_UNFROZEN: File system is normal, all writes progress as usual. * * SB_FREEZE_WRITE: The file system is in the process of being frozen. New * writes should be blocked, though page faults are still allowed. We wait for * all writes to complete and then proceed to the next stage. * * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked * but internal fs threads can still modify the filesystem (although they * should not dirty new pages or inodes), writeback can run etc. 
After waiting * for all running page faults we sync the filesystem which will clean all * dirty pages and inodes (no new dirty pages or inodes can be created when * sync is running). * * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs * modification are blocked (e.g. XFS preallocation truncation on inode * reclaim). This is usually implemented by blocking new transactions for * filesystems that have them and need this additional guard. After all * internal writers are finished we call ->freeze_fs() to finish filesystem * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is * mostly auxiliary for filesystems to verify they do not modify frozen fs. * * sb->s_writers.frozen is protected by sb->s_umount. * * Return: If the freeze was successful zero is returned. If the freeze * failed a negative error code is returned. */ int freeze_super(struct super_block *sb, enum freeze_holder who, const void *freeze_owner) { int ret; if (!super_lock_excl(sb)) { WARN_ON_ONCE("Dying superblock while freezing!"); return -EINVAL; } atomic_inc(&sb->s_active); retry: if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) { if (may_freeze(sb, who, freeze_owner)) ret = !!WARN_ON_ONCE(freeze_inc(sb, who) == 1); else ret = -EBUSY; /* All freezers share a single active reference. */ deactivate_locked_super(sb); return ret; } if (sb->s_writers.frozen != SB_UNFROZEN) { ret = wait_for_partially_frozen(sb); if (ret) { deactivate_locked_super(sb); return ret; } goto retry; } if (sb_rdonly(sb)) { /* Nothing to do really... */ WARN_ON_ONCE(freeze_inc(sb, who) > 1); sb->s_writers.freeze_owner = freeze_owner; sb->s_writers.frozen = SB_FREEZE_COMPLETE; wake_up_var(&sb->s_writers.frozen); super_unlock_excl(sb); return 0; } sb->s_writers.frozen = SB_FREEZE_WRITE; /* Release s_umount to preserve sb_start_write -> s_umount ordering */ super_unlock_excl(sb); sb_wait_write(sb, SB_FREEZE_WRITE); __super_lock_excl(sb); /* Now we go and block page faults... */ sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; sb_wait_write(sb, SB_FREEZE_PAGEFAULT); /* All writers are done so after syncing there won't be dirty data */ ret = sync_filesystem(sb); if (ret) { sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT); wake_up_var(&sb->s_writers.frozen); deactivate_locked_super(sb); return ret; } /* Now wait for internal filesystem counter */ sb->s_writers.frozen = SB_FREEZE_FS; sb_wait_write(sb, SB_FREEZE_FS); if (sb->s_op->freeze_fs) { ret = sb->s_op->freeze_fs(sb); if (ret) { printk(KERN_ERR "VFS:Filesystem freeze failed\n"); sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_FS); wake_up_var(&sb->s_writers.frozen); deactivate_locked_super(sb); return ret; } } /* * For debugging purposes so that fs can warn if it sees write activity * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super(). */ WARN_ON_ONCE(freeze_inc(sb, who) > 1); sb->s_writers.freeze_owner = freeze_owner; sb->s_writers.frozen = SB_FREEZE_COMPLETE; wake_up_var(&sb->s_writers.frozen); lockdep_sb_freeze_release(sb); super_unlock_excl(sb); return 0; } EXPORT_SYMBOL(freeze_super); /* * Undoes the effect of a freeze_super_locked call. If the filesystem is * frozen both by userspace and the kernel, a thaw call from either source * removes that state without releasing the other state or unlocking the * filesystem. 
*/ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who, const void *freeze_owner) { int error = -EINVAL; if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) goto out_unlock; if (!may_unfreeze(sb, who, freeze_owner)) goto out_unlock; /* * All freezers share a single active reference. * So just unlock in case there are any left. */ if (freeze_dec(sb, who)) goto out_unlock; if (sb_rdonly(sb)) { sb->s_writers.frozen = SB_UNFROZEN; sb->s_writers.freeze_owner = NULL; wake_up_var(&sb->s_writers.frozen); goto out_deactivate; } lockdep_sb_freeze_acquire(sb); if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); if (error) { pr_err("VFS: Filesystem thaw failed\n"); freeze_inc(sb, who); lockdep_sb_freeze_release(sb); goto out_unlock; } } sb->s_writers.frozen = SB_UNFROZEN; sb->s_writers.freeze_owner = NULL; wake_up_var(&sb->s_writers.frozen); sb_freeze_unlock(sb, SB_FREEZE_FS); out_deactivate: deactivate_locked_super(sb); return 0; out_unlock: super_unlock_excl(sb); return error; } /** * thaw_super -- unlock filesystem * @sb: the super to thaw * @who: context that wants to thaw * @freeze_owner: owner of the freeze * * Unlocks the filesystem and marks it writeable again after freeze_super() * if there are no remaining freezes on the filesystem. * * @who should be: * * %FREEZE_HOLDER_USERSPACE if userspace wants to thaw the fs; * * %FREEZE_HOLDER_KERNEL if the kernel wants to thaw the fs. * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed * * A filesystem may hold multiple devices and thus a filesystem may * have been frozen through the block layer via multiple block devices. * The filesystem remains frozen until all block devices are unfrozen. */ int thaw_super(struct super_block *sb, enum freeze_holder who, const void *freeze_owner) { if (!super_lock_excl(sb)) { WARN_ON_ONCE("Dying superblock while thawing!"); return -EINVAL; } return thaw_super_locked(sb, who, freeze_owner); } EXPORT_SYMBOL(thaw_super); /* * Create workqueue for deferred direct IO completions. We allocate the * workqueue when it's first needed. This avoids creating workqueue for * filesystems that don't need it and also allows us to create the workqueue * late enough so that we can include s_id in the name of the workqueue. */ int sb_init_dio_done_wq(struct super_block *sb) { struct workqueue_struct *old; struct workqueue_struct *wq = alloc_workqueue("dio/%s", WQ_MEM_RECLAIM | WQ_PERCPU, 0, sb->s_id); if (!wq) return -ENOMEM; old = NULL; /* * This has to be atomic as more DIOs can race to create the workqueue */ if (!try_cmpxchg(&sb->s_dio_done_wq, &old, wq)) { /* Someone created workqueue before us? Free ours... */ destroy_workqueue(wq); } return 0; } EXPORT_SYMBOL_GPL(sb_init_dio_done_wq);
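/*
 * Editor's example (not part of fs/super.c): a minimal sketch of how an
 * in-kernel caller might pair freeze_super() and thaw_super() using
 * FREEZE_HOLDER_KERNEL, based only on the semantics documented above.
 * The function name and the way @sb is obtained are hypothetical; real
 * callers add their own reference counting and locking.
 */
static int example_freeze_for_snapshot(struct super_block *sb)
{
	int ret;

	/* Block writers, sync, and call ->freeze_fs(); may return -EBUSY. */
	ret = freeze_super(sb, FREEZE_HOLDER_KERNEL, NULL);
	if (ret)
		return ret;

	/* ... do work that requires a consistent, quiesced filesystem ... */

	/* Drop our freeze; the fs thaws once no other holders remain. */
	return thaw_super(sb, FREEZE_HOLDER_KERNEL, NULL);
}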
/* SPDX-License-Identifier: GPL-2.0 */ /* * usb hub driver header file * * Copyright (C) 1999 Linus Torvalds * Copyright (C) 1999 Johannes Erdfelt * Copyright (C) 1999 Gregory P. Smith * Copyright (C) 2001 Brad Hards (bhards@bigpond.net.au) * Copyright (C) 2012 Intel Corp (tianyu.lan@intel.com) * * move struct usb_hub to this file. */ #include <linux/usb.h> #include <linux/usb/ch11.h> #include <linux/usb/hcd.h> #include <linux/usb/typec.h> #include "usb.h" struct usb_hub { struct device *intfdev; /* the "interface" device */ struct usb_device *hdev; struct kref kref; struct urb *urb; /* for interrupt polling pipe */ /* buffer for urb ... with extra space in case of babble */ u8 (*buffer)[8]; union { struct usb_hub_status hub; struct usb_port_status port; } *status; /* buffer for status reports */ struct mutex status_mutex; /* for the status buffer */ int error; /* last reported error */ int nerrors; /* track consecutive errors */ unsigned long event_bits[1]; /* status change bitmask */ unsigned long change_bits[1]; /* ports with logical connect status change */ unsigned long removed_bits[1]; /* ports with a "removed" device present */ unsigned long wakeup_bits[1]; /* ports that have signaled remote wakeup */ unsigned long power_bits[1]; /* ports that are powered */ unsigned long child_usage_bits[1]; /* ports powered on for children */ unsigned long warm_reset_bits[1]; /* ports requesting warm reset recovery */ #if USB_MAXCHILDREN > 31 /* 8*sizeof(unsigned long) - 1 */ #error event_bits[] is too short!
#endif struct usb_hub_descriptor *descriptor; /* class descriptor */ struct usb_tt tt; /* Transaction Translator */ unsigned mA_per_port; /* current for each child */ #ifdef CONFIG_PM unsigned wakeup_enabled_descendants; #endif unsigned limited_power:1; unsigned quiescing:1; unsigned disconnected:1; unsigned in_reset:1; unsigned quirk_disable_autosuspend:1; unsigned quirk_check_port_auto_suspend:1; unsigned has_indicators:1; u8 indicator[USB_MAXCHILDREN]; struct delayed_work leds; struct delayed_work init_work; struct delayed_work post_resume_work; struct work_struct events; spinlock_t irq_urb_lock; struct timer_list irq_urb_retry; struct usb_port **ports; struct list_head onboard_devs; }; /** * struct usb port - kernel's representation of a usb port * @child: usb device attached to the port * @dev: generic device interface * @port_owner: port's owner * @peer: related usb2 and usb3 ports (share the same connector) * @connector: USB Type-C connector * @req: default pm qos request for hubs without port power control * @connect_type: port's connect type * @state: device state of the usb device attached to the port * @state_kn: kernfs_node of the sysfs attribute that accesses @state * @location: opaque representation of platform connector location * @status_lock: synchronize port_event() vs usb_port_{suspend|resume} * @portnum: port index num based one * @is_superspeed cache super-speed status * @usb3_lpm_u1_permit: whether USB3 U1 LPM is permitted. * @usb3_lpm_u2_permit: whether USB3 U2 LPM is permitted. * @early_stop: whether port initialization will be stopped earlier. * @ignore_event: whether events of the port are ignored. */ struct usb_port { struct usb_device *child; struct device dev; struct usb_dev_state *port_owner; struct usb_port *peer; struct typec_connector *connector; struct dev_pm_qos_request *req; enum usb_port_connect_type connect_type; enum usb_device_state state; struct kernfs_node *state_kn; usb_port_location_t location; struct mutex status_lock; u32 over_current_count; u8 portnum; u32 quirks; unsigned int early_stop:1; unsigned int ignore_event:1; unsigned int is_superspeed:1; unsigned int usb3_lpm_u1_permit:1; unsigned int usb3_lpm_u2_permit:1; }; #define to_usb_port(_dev) \ container_of(_dev, struct usb_port, dev) extern int usb_hub_create_port_device(struct usb_hub *hub, int port1); extern void usb_hub_remove_port_device(struct usb_hub *hub, int port1); extern int usb_hub_set_port_power(struct usb_device *hdev, struct usb_hub *hub, int port1, bool set); extern struct usb_hub *usb_hub_to_struct_hub(struct usb_device *hdev); extern void hub_get(struct usb_hub *hub); extern void hub_put(struct usb_hub *hub); extern int hub_port_debounce(struct usb_hub *hub, int port1, bool must_be_connected); extern int usb_clear_port_feature(struct usb_device *hdev, int port1, int feature); extern int usb_hub_port_status(struct usb_hub *hub, int port1, u16 *status, u16 *change); extern int usb_port_is_power_on(struct usb_hub *hub, unsigned int portstatus); static inline bool hub_is_port_power_switchable(struct usb_hub *hub) { __le16 hcs; if (!hub) return false; hcs = hub->descriptor->wHubCharacteristics; return (le16_to_cpu(hcs) & HUB_CHAR_LPSM) < HUB_CHAR_NO_LPSM; } static inline int hub_is_superspeed(struct usb_device *hdev) { return hdev->descriptor.bDeviceProtocol == USB_HUB_PR_SS; } static inline int hub_is_superspeedplus(struct usb_device *hdev) { return (hdev->descriptor.bDeviceProtocol == USB_HUB_PR_SS && le16_to_cpu(hdev->descriptor.bcdUSB) >= 0x0310 && hdev->bos && 
hdev->bos->ssp_cap); } static inline unsigned hub_power_on_good_delay(struct usb_hub *hub) { unsigned delay = hub->descriptor->bPwrOn2PwrGood * 2; if (!hub->hdev->parent) /* root hub */ return delay; else /* Wait at least 100 msec for power to become stable */ return max(delay, 100U); } static inline int hub_port_debounce_be_connected(struct usb_hub *hub, int port1) { return hub_port_debounce(hub, port1, true); } static inline int hub_port_debounce_be_stable(struct usb_hub *hub, int port1) { return hub_port_debounce(hub, port1, false); }
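/*
 * Editor's example (not part of hub.h): a minimal sketch of how the helpers
 * declared above are typically combined when powering on a port. The
 * function name and the exact sequence are illustrative only (the real
 * logic lives in drivers/usb/core/hub.c), and msleep() is assumed to be
 * available via <linux/delay.h>.
 */
static inline int example_power_on_and_debounce(struct usb_hub *hub, int port1)
{
	int ret;

	if (!hub_is_port_power_switchable(hub))
		return 0;	/* this hub keeps port power always on */

	ret = usb_hub_set_port_power(hub->hdev, hub, port1, true);
	if (ret)
		return ret;

	/* bPwrOn2PwrGood is in 2 ms units; the helper converts and clamps it. */
	msleep(hub_power_on_good_delay(hub));

	/* Wait for a stable connection before continuing with enumeration. */
	return hub_port_debounce_be_connected(hub, port1);
}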
// SPDX-License-Identifier: GPL-2.0 /* * linux/lib/kasprintf.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/stdarg.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/string.h> /* Simplified asprintf. */ char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap) { unsigned int first, second; char *p; va_list aq; va_copy(aq, ap); first = vsnprintf(NULL, 0, fmt, aq); va_end(aq); p = kmalloc_track_caller(first+1, gfp); if (!p) return NULL; second = vsnprintf(p, first+1, fmt, ap); WARN(first != second, "different return values (%u and %u) from vsnprintf(\"%s\", ...)", first, second, fmt); return p; } EXPORT_SYMBOL(kvasprintf); /* * If fmt contains no % (or is exactly %s), use kstrdup_const. If fmt * (or the sole vararg) points to rodata, we will then save a memory * allocation and string copy. In any case, the return value should be * freed using kfree_const(). */ const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list ap) { if (!strchr(fmt, '%')) return kstrdup_const(fmt, gfp); if (!strcmp(fmt, "%s")) return kstrdup_const(va_arg(ap, const char*), gfp); return kvasprintf(gfp, fmt, ap); } EXPORT_SYMBOL(kvasprintf_const); char *kasprintf(gfp_t gfp, const char *fmt, ...) { va_list ap; char *p; va_start(ap, fmt); p = kvasprintf(gfp, fmt, ap); va_end(ap); return p; } EXPORT_SYMBOL(kasprintf);
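/*
 * Editor's example (not part of kasprintf.c): a minimal sketch of the
 * allocation/free pairing for these helpers, relying only on the includes
 * already present in this file. kasprintf()/kvasprintf() results are freed
 * with kfree(); the *_const() variants may return rodata and must be freed
 * with kfree_const(). The function name and format string are made up.
 */
static int example_kasprintf_usage(void)
{
	char *name;
	const char *label;

	/* Heap-allocated, printf-formatted string; NULL on allocation failure. */
	name = kasprintf(GFP_KERNEL, "queue-%d", 3);
	if (!name)
		return -ENOMEM;
	kfree(name);

	/*
	 * No '%' in the format: kstrdup_const() (and kvasprintf_const() for
	 * the va_list case) may reference the rodata literal instead of
	 * copying it, which is why kfree_const() is the matching free.
	 */
	label = kstrdup_const("default", GFP_KERNEL);
	if (!label)
		return -ENOMEM;
	kfree_const(label);
	return 0;
}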
// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook */ #include <linux/kernel.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/bpf.h> #include <linux/bpf_verifier.h> #include <linux/bpf_perf_event.h> #include <linux/btf.h> #include <linux/filter.h> #include <linux/uaccess.h> #include <linux/ctype.h> #include <linux/kprobes.h> #include <linux/spinlock.h> #include <linux/syscalls.h> #include <linux/error-injection.h> #include <linux/btf_ids.h> #include <linux/bpf_lsm.h> #include <linux/fprobe.h> #include <linux/bsearch.h> #include <linux/sort.h> #include <linux/key.h> #include <linux/namei.h> #include <net/bpf_sk_storage.h> #include <uapi/linux/bpf.h> #include <uapi/linux/btf.h> #include <asm/tlb.h> #include "trace_probe.h" #include "trace.h" #define
CREATE_TRACE_POINTS #include "bpf_trace.h" #define bpf_event_rcu_dereference(p) \ rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex)) #define MAX_UPROBE_MULTI_CNT (1U << 20) #define MAX_KPROBE_MULTI_CNT (1U << 20) #ifdef CONFIG_MODULES struct bpf_trace_module { struct module *module; struct list_head list; }; static LIST_HEAD(bpf_trace_modules); static DEFINE_MUTEX(bpf_module_mutex); static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name) { struct bpf_raw_event_map *btp, *ret = NULL; struct bpf_trace_module *btm; unsigned int i; mutex_lock(&bpf_module_mutex); list_for_each_entry(btm, &bpf_trace_modules, list) { for (i = 0; i < btm->module->num_bpf_raw_events; ++i) { btp = &btm->module->bpf_raw_events[i]; if (!strcmp(btp->tp->name, name)) { if (try_module_get(btm->module)) ret = btp; goto out; } } } out: mutex_unlock(&bpf_module_mutex); return ret; } #else static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name) { return NULL; } #endif /* CONFIG_MODULES */ u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags, const struct btf **btf, s32 *btf_id); static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx); static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx); static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx); static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx); /** * trace_call_bpf - invoke BPF program * @call: tracepoint event * @ctx: opaque context pointer * * kprobe handlers execute BPF programs via this helper. * Can be used from static tracepoints in the future. * * Return: BPF programs always return an integer which is interpreted by * kprobe handler as: * 0 - return from kprobe (event is filtered out) * 1 - store kprobe event into ring buffer * Other values are reserved and currently alias to 1 */ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { unsigned int ret; cant_sleep(); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { /* * since some bpf program is already running on this cpu, * don't call into another bpf program (same or different) * and don't send kprobe event into ring-buffer, * so return zero here */ rcu_read_lock(); bpf_prog_inc_misses_counters(rcu_dereference(call->prog_array)); rcu_read_unlock(); ret = 0; goto out; } /* * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock * to all call sites, we did a bpf_prog_array_valid() there to check * whether call->prog_array is empty or not, which is * a heuristic to speed up execution. * * If bpf_prog_array_valid() fetched prog_array was * non-NULL, we go into trace_call_bpf() and do the actual * proper rcu_dereference() under RCU lock. * If it turns out that prog_array is NULL then, we bail out. * For the opposite, if the bpf_prog_array_valid() fetched pointer * was NULL, you'll skip the prog_array with the risk of missing * out of events when it was updated in between this and the * rcu_dereference() which is accepted risk. 
*/ rcu_read_lock(); ret = bpf_prog_run_array(rcu_dereference(call->prog_array), ctx, bpf_prog_run); rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); return ret; } #ifdef CONFIG_BPF_KPROBE_OVERRIDE BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) { regs_set_return_value(regs, rc); override_function_with_return(regs); return 0; } static const struct bpf_func_proto bpf_override_return_proto = { .func = bpf_override_return, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; #endif static __always_inline int bpf_probe_read_user_common(void *dst, u32 size, const void __user *unsafe_ptr) { int ret; ret = copy_from_user_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); return ret; } BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size, const void __user *, unsafe_ptr) { return bpf_probe_read_user_common(dst, size, unsafe_ptr); } const struct bpf_func_proto bpf_probe_read_user_proto = { .func = bpf_probe_read_user, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; static __always_inline int bpf_probe_read_user_str_common(void *dst, u32 size, const void __user *unsafe_ptr) { int ret; /* * NB: We rely on strncpy_from_user() not copying junk past the NUL * terminator into `dst`. * * strncpy_from_user() does long-sized strides in the fast path. If the * strncpy does not mask out the bytes after the NUL in `unsafe_ptr`, * then there could be junk after the NUL in `dst`. If user takes `dst` * and keys a hash map with it, then semantically identical strings can * occupy multiple entries in the map. */ ret = strncpy_from_user_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); return ret; } BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size, const void __user *, unsafe_ptr) { return bpf_probe_read_user_str_common(dst, size, unsafe_ptr); } const struct bpf_func_proto bpf_probe_read_user_str_proto = { .func = bpf_probe_read_user_str, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size, const void *, unsafe_ptr) { return bpf_probe_read_kernel_common(dst, size, unsafe_ptr); } const struct bpf_func_proto bpf_probe_read_kernel_proto = { .func = bpf_probe_read_kernel, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; static __always_inline int bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr) { int ret; /* * The strncpy_from_kernel_nofault() call will likely not fill the * entire buffer, but that's okay in this circumstance as we're probing * arbitrary memory anyway similar to bpf_probe_read_*() and might * as well probe the stack. Thus, memory is explicitly cleared * only in error case, so that improper users ignoring return * code altogether don't copy garbage; otherwise length of string * is returned that can be used for bpf_perf_event_output() et al. 
*/ ret = strncpy_from_kernel_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); return ret; } BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size, const void *, unsafe_ptr) { return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr); } const struct bpf_func_proto bpf_probe_read_kernel_str_proto = { .func = bpf_probe_read_kernel_str, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size, const void *, unsafe_ptr) { if ((unsigned long)unsafe_ptr < TASK_SIZE) { return bpf_probe_read_user_common(dst, size, (__force void __user *)unsafe_ptr); } return bpf_probe_read_kernel_common(dst, size, unsafe_ptr); } static const struct bpf_func_proto bpf_probe_read_compat_proto = { .func = bpf_probe_read_compat, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_probe_read_compat_str, void *, dst, u32, size, const void *, unsafe_ptr) { if ((unsigned long)unsafe_ptr < TASK_SIZE) { return bpf_probe_read_user_str_common(dst, size, (__force void __user *)unsafe_ptr); } return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr); } static const struct bpf_func_proto bpf_probe_read_compat_str_proto = { .func = bpf_probe_read_compat_str, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; #endif /* CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE */ BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src, u32, size) { /* * Ensure we're in user context which is safe for the helper to * run. This helper has no business in a kthread. * * access_ok() should prevent writing to non-user memory, but in * some situations (nommu, temporary switch, etc) access_ok() does * not provide enough validation, hence the check on KERNEL_DS. * * nmi_uaccess_okay() ensures the probe is not run in an interim * state, when the task or mm are switched. This is specifically * required to prevent the use of temporary mm. 
*/ if (unlikely(in_interrupt() || current->flags & (PF_KTHREAD | PF_EXITING))) return -EPERM; if (unlikely(!nmi_uaccess_okay())) return -EPERM; return copy_to_user_nofault(unsafe_ptr, src, size); } static const struct bpf_func_proto bpf_probe_write_user_proto = { .func = bpf_probe_write_user, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; #define MAX_TRACE_PRINTK_VARARGS 3 #define BPF_TRACE_PRINTK_SIZE 1024 BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, u64, arg2, u64, arg3) { u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 }; struct bpf_bprintf_data data = { .get_bin_args = true, .get_buf = true, }; int ret; ret = bpf_bprintf_prepare(fmt, fmt_size, args, MAX_TRACE_PRINTK_VARARGS, &data); if (ret < 0) return ret; ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt, data.bin_args); trace_bpf_trace_printk(data.buf); bpf_bprintf_cleanup(&data); return ret; } static const struct bpf_func_proto bpf_trace_printk_proto = { .func = bpf_trace_printk, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, }; static void __set_printk_clr_event(struct work_struct *work) { /* * This program might be calling bpf_trace_printk, * so enable the associated bpf_trace/bpf_trace_printk event. * Repeat this each time as it is possible a user has * disabled bpf_trace_printk events. By loading a program * calling bpf_trace_printk() however the user has expressed * the intent to see such events. */ if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1)) pr_warn_ratelimited("could not enable bpf_trace_printk events"); } static DECLARE_WORK(set_printk_work, __set_printk_clr_event); const struct bpf_func_proto *bpf_get_trace_printk_proto(void) { schedule_work(&set_printk_work); return &bpf_trace_printk_proto; } BPF_CALL_4(bpf_trace_vprintk, char *, fmt, u32, fmt_size, const void *, args, u32, data_len) { struct bpf_bprintf_data data = { .get_bin_args = true, .get_buf = true, }; int ret, num_args; if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 || (data_len && !args)) return -EINVAL; num_args = data_len / 8; ret = bpf_bprintf_prepare(fmt, fmt_size, args, num_args, &data); if (ret < 0) return ret; ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt, data.bin_args); trace_bpf_trace_printk(data.buf); bpf_bprintf_cleanup(&data); return ret; } static const struct bpf_func_proto bpf_trace_vprintk_proto = { .func = bpf_trace_vprintk, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, }; const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void) { schedule_work(&set_printk_work); return &bpf_trace_vprintk_proto; } BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, const void *, args, u32, data_len) { struct bpf_bprintf_data data = { .get_bin_args = true, }; int err, num_args; if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 || (data_len && !args)) return -EINVAL; num_args = data_len / 8; err = bpf_bprintf_prepare(fmt, fmt_size, args, num_args, &data); if (err < 0) return err; seq_bprintf(m, fmt, data.bin_args); bpf_bprintf_cleanup(&data); return seq_has_overflowed(m) ? 
-EOVERFLOW : 0; } BTF_ID_LIST_SINGLE(btf_seq_file_ids, struct, seq_file) static const struct bpf_func_proto bpf_seq_printf_proto = { .func = bpf_seq_printf, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_seq_file_ids[0], .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len) { return seq_write(m, data, len) ? -EOVERFLOW : 0; } static const struct bpf_func_proto bpf_seq_write_proto = { .func = bpf_seq_write, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_seq_file_ids[0], .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_4(bpf_seq_printf_btf, struct seq_file *, m, struct btf_ptr *, ptr, u32, btf_ptr_size, u64, flags) { const struct btf *btf; s32 btf_id; int ret; ret = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags, &btf, &btf_id); if (ret) return ret; return btf_type_seq_show_flags(btf, btf_id, ptr->ptr, m, flags); } static const struct bpf_func_proto bpf_seq_printf_btf_proto = { .func = bpf_seq_printf_btf, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_seq_file_ids[0], .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; static __always_inline int get_map_perf_counter(struct bpf_map *map, u64 flags, u64 *value, u64 *enabled, u64 *running) { struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; struct bpf_event_entry *ee; if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; if (index == BPF_F_CURRENT_CPU) index = cpu; if (unlikely(index >= array->map.max_entries)) return -E2BIG; ee = READ_ONCE(array->ptrs[index]); if (!ee) return -ENOENT; return perf_event_read_local(ee->event, value, enabled, running); } BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) { u64 value = 0; int err; err = get_map_perf_counter(map, flags, &value, NULL, NULL); /* * this api is ugly since we miss [-22..-2] range of valid * counter values, but that's uapi */ if (err) return err; return value; } const struct bpf_func_proto bpf_perf_event_read_proto = { .func = bpf_perf_event_read, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags, struct bpf_perf_event_value *, buf, u32, size) { int err = -EINVAL; if (unlikely(size != sizeof(struct bpf_perf_event_value))) goto clear; err = get_map_perf_counter(map, flags, &buf->counter, &buf->enabled, &buf->running); if (unlikely(err)) goto clear; return 0; clear: memset(buf, 0, size); return err; } static const struct bpf_func_proto bpf_perf_event_read_value_proto = { .func = bpf_perf_event_read_value, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; const struct bpf_func_proto *bpf_get_perf_event_read_value_proto(void) { return &bpf_perf_event_read_value_proto; } static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, u64 flags, struct perf_raw_record *raw, struct perf_sample_data *sd) { struct bpf_array *array = container_of(map, struct bpf_array, map); 
unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; struct bpf_event_entry *ee; struct perf_event *event; if (index == BPF_F_CURRENT_CPU) index = cpu; if (unlikely(index >= array->map.max_entries)) return -E2BIG; ee = READ_ONCE(array->ptrs[index]); if (!ee) return -ENOENT; event = ee->event; if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) return -EINVAL; if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; perf_sample_save_raw_data(sd, event, raw); return perf_event_output(event, sd, regs); } /* * Support executing tracepoints in normal, irq, and nmi context that each call * bpf_perf_event_output */ struct bpf_trace_sample_data { struct perf_sample_data sds[3]; }; static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds); static DEFINE_PER_CPU(int, bpf_trace_nest_level); BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, u64, flags, void *, data, u64, size) { struct bpf_trace_sample_data *sds; struct perf_raw_record raw = { .frag = { .size = size, .data = data, }, }; struct perf_sample_data *sd; int nest_level, err; preempt_disable(); sds = this_cpu_ptr(&bpf_trace_sds); nest_level = this_cpu_inc_return(bpf_trace_nest_level); if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) { err = -EBUSY; goto out; } sd = &sds->sds[nest_level - 1]; if (unlikely(flags & ~(BPF_F_INDEX_MASK))) { err = -EINVAL; goto out; } perf_sample_data_init(sd, 0, 0); err = __bpf_perf_event_output(regs, map, flags, &raw, sd); out: this_cpu_dec(bpf_trace_nest_level); preempt_enable(); return err; } static const struct bpf_func_proto bpf_perf_event_output_proto = { .func = bpf_perf_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; static DEFINE_PER_CPU(int, bpf_event_output_nest_level); struct bpf_nested_pt_regs { struct pt_regs regs[3]; }; static DEFINE_PER_CPU(struct bpf_nested_pt_regs, bpf_pt_regs); static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds); u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { struct perf_raw_frag frag = { .copy = ctx_copy, .size = ctx_size, .data = ctx, }; struct perf_raw_record raw = { .frag = { { .next = ctx_size ? 
&frag : NULL, }, .size = meta_size, .data = meta, }, }; struct perf_sample_data *sd; struct pt_regs *regs; int nest_level; u64 ret; preempt_disable(); nest_level = this_cpu_inc_return(bpf_event_output_nest_level); if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) { ret = -EBUSY; goto out; } sd = this_cpu_ptr(&bpf_misc_sds.sds[nest_level - 1]); regs = this_cpu_ptr(&bpf_pt_regs.regs[nest_level - 1]); perf_fetch_caller_regs(regs); perf_sample_data_init(sd, 0, 0); ret = __bpf_perf_event_output(regs, map, flags, &raw, sd); out: this_cpu_dec(bpf_event_output_nest_level); preempt_enable(); return ret; } BPF_CALL_0(bpf_get_current_task) { return (long) current; } const struct bpf_func_proto bpf_get_current_task_proto = { .func = bpf_get_current_task, .gpl_only = true, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_get_current_task_btf) { return (unsigned long) current; } const struct bpf_func_proto bpf_get_current_task_btf_proto = { .func = bpf_get_current_task_btf, .gpl_only = true, .ret_type = RET_PTR_TO_BTF_ID_TRUSTED, .ret_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], }; BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task) { return (unsigned long) task_pt_regs(task); } BTF_ID_LIST_SINGLE(bpf_task_pt_regs_ids, struct, pt_regs) const struct bpf_func_proto bpf_task_pt_regs_proto = { .func = bpf_task_pt_regs, .gpl_only = true, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .ret_type = RET_PTR_TO_BTF_ID, .ret_btf_id = &bpf_task_pt_regs_ids[0], }; struct send_signal_irq_work { struct irq_work irq_work; struct task_struct *task; u32 sig; enum pid_type type; bool has_siginfo; struct kernel_siginfo info; }; static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work); static void do_bpf_send_signal(struct irq_work *entry) { struct send_signal_irq_work *work; struct kernel_siginfo *siginfo; work = container_of(entry, struct send_signal_irq_work, irq_work); siginfo = work->has_siginfo ? &work->info : SEND_SIG_PRIV; group_send_sig_info(work->sig, siginfo, work->task, work->type); put_task_struct(work->task); } static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struct *task, u64 value) { struct send_signal_irq_work *work = NULL; struct kernel_siginfo info; struct kernel_siginfo *siginfo; if (!task) { task = current; siginfo = SEND_SIG_PRIV; } else { clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = SI_KERNEL; info.si_pid = 0; info.si_uid = 0; info.si_value.sival_ptr = (void __user __force *)(unsigned long)value; siginfo = &info; } /* Similar to bpf_probe_write_user, task needs to be * in a sound condition and kernel memory access be * permitted in order to send signal to the current * task. */ if (unlikely(task->flags & (PF_KTHREAD | PF_EXITING))) return -EPERM; if (unlikely(!nmi_uaccess_okay())) return -EPERM; /* Task should not be pid=1 to avoid kernel panic. */ if (unlikely(is_global_init(task))) return -EPERM; if (preempt_count() != 0 || irqs_disabled()) { /* Do an early check on signal validity. Otherwise, * the error is lost in deferred irq_work. */ if (unlikely(!valid_signal(sig))) return -EINVAL; work = this_cpu_ptr(&send_signal_work); if (irq_work_is_busy(&work->irq_work)) return -EBUSY; /* Add the current task, which is the target of sending signal, * to the irq_work. The current task may change when queued * irq works get executed. 
*/ work->task = get_task_struct(task); work->has_siginfo = siginfo == &info; if (work->has_siginfo) copy_siginfo(&work->info, &info); work->sig = sig; work->type = type; irq_work_queue(&work->irq_work); return 0; } return group_send_sig_info(sig, siginfo, task, type); } BPF_CALL_1(bpf_send_signal, u32, sig) { return bpf_send_signal_common(sig, PIDTYPE_TGID, NULL, 0); } const struct bpf_func_proto bpf_send_signal_proto = { .func = bpf_send_signal, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_send_signal_thread, u32, sig) { return bpf_send_signal_common(sig, PIDTYPE_PID, NULL, 0); } const struct bpf_func_proto bpf_send_signal_thread_proto = { .func = bpf_send_signal_thread, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_d_path, const struct path *, path, char *, buf, u32, sz) { struct path copy; long len; char *p; if (!sz) return 0; /* * The path pointer is verified as trusted and safe to use, * but let's double check it's valid anyway to workaround * potentially broken verifier. */ len = copy_from_kernel_nofault(&copy, path, sizeof(*path)); if (len < 0) return len; p = d_path(&copy, buf, sz); if (IS_ERR(p)) { len = PTR_ERR(p); } else { len = buf + sz - p; memmove(buf, p, len); } return len; } BTF_SET_START(btf_allowlist_d_path) #ifdef CONFIG_SECURITY BTF_ID(func, security_file_permission) BTF_ID(func, security_inode_getattr) BTF_ID(func, security_file_open) #endif #ifdef CONFIG_SECURITY_PATH BTF_ID(func, security_path_truncate) #endif BTF_ID(func, vfs_truncate) BTF_ID(func, vfs_fallocate) BTF_ID(func, dentry_open) BTF_ID(func, vfs_getattr) BTF_ID(func, filp_close) BTF_SET_END(btf_allowlist_d_path) static bool bpf_d_path_allowed(const struct bpf_prog *prog) { if (prog->type == BPF_PROG_TYPE_TRACING && prog->expected_attach_type == BPF_TRACE_ITER) return true; if (prog->type == BPF_PROG_TYPE_LSM) return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id); return btf_id_set_contains(&btf_allowlist_d_path, prog->aux->attach_btf_id); } BTF_ID_LIST_SINGLE(bpf_d_path_btf_ids, struct, path) static const struct bpf_func_proto bpf_d_path_proto = { .func = bpf_d_path, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_d_path_btf_ids[0], .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .allowed = bpf_d_path_allowed, }; #define BTF_F_ALL (BTF_F_COMPACT | BTF_F_NONAME | \ BTF_F_PTR_RAW | BTF_F_ZERO) static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags, const struct btf **btf, s32 *btf_id) { const struct btf_type *t; if (unlikely(flags & ~(BTF_F_ALL))) return -EINVAL; if (btf_ptr_size != sizeof(struct btf_ptr)) return -EINVAL; *btf = bpf_get_btf_vmlinux(); if (IS_ERR_OR_NULL(*btf)) return IS_ERR(*btf) ?
PTR_ERR(*btf) : -EINVAL; if (ptr->type_id > 0) *btf_id = ptr->type_id; else return -EINVAL; if (*btf_id > 0) t = btf_type_by_id(*btf, *btf_id); if (*btf_id <= 0 || !t) return -ENOENT; return 0; } BPF_CALL_5(bpf_snprintf_btf, char *, str, u32, str_size, struct btf_ptr *, ptr, u32, btf_ptr_size, u64, flags) { const struct btf *btf; s32 btf_id; int ret; ret = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags, &btf, &btf_id); if (ret) return ret; return btf_type_snprintf_show(btf, btf_id, ptr->ptr, str, str_size, flags); } const struct bpf_func_proto bpf_snprintf_btf_proto = { .func = bpf_snprintf_btf, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_get_func_ip_tracing, void *, ctx) { /* This helper call is inlined by verifier. */ return ((u64 *)ctx)[-2]; } static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = { .func = bpf_get_func_ip_tracing, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static inline unsigned long get_entry_ip(unsigned long fentry_ip) { #ifdef CONFIG_X86_KERNEL_IBT if (is_endbr((void *)(fentry_ip - ENDBR_INSN_SIZE))) fentry_ip -= ENDBR_INSN_SIZE; #endif return fentry_ip; } BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs) { struct bpf_trace_run_ctx *run_ctx __maybe_unused; struct kprobe *kp; #ifdef CONFIG_UPROBES run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); if (run_ctx->is_uprobe) return ((struct uprobe_dispatch_data *)current->utask->vaddr)->bp_addr; #endif kp = kprobe_running(); if (!kp || !(kp->flags & KPROBE_FLAG_ON_FUNC_ENTRY)) return 0; return get_entry_ip((uintptr_t)kp->addr); } static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = { .func = bpf_get_func_ip_kprobe, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_func_ip_kprobe_multi, struct pt_regs *, regs) { return bpf_kprobe_multi_entry_ip(current->bpf_ctx); } static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe_multi = { .func = bpf_get_func_ip_kprobe_multi, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_attach_cookie_kprobe_multi, struct pt_regs *, regs) { return bpf_kprobe_multi_cookie(current->bpf_ctx); } static const struct bpf_func_proto bpf_get_attach_cookie_proto_kmulti = { .func = bpf_get_attach_cookie_kprobe_multi, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_func_ip_uprobe_multi, struct pt_regs *, regs) { return bpf_uprobe_multi_entry_ip(current->bpf_ctx); } static const struct bpf_func_proto bpf_get_func_ip_proto_uprobe_multi = { .func = bpf_get_func_ip_uprobe_multi, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_attach_cookie_uprobe_multi, struct pt_regs *, regs) { return bpf_uprobe_multi_cookie(current->bpf_ctx); } static const struct bpf_func_proto bpf_get_attach_cookie_proto_umulti = { .func = bpf_get_attach_cookie_uprobe_multi, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx) { struct bpf_trace_run_ctx *run_ctx; run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); return run_ctx->bpf_cookie; } static const struct bpf_func_proto bpf_get_attach_cookie_proto_trace = { .func = bpf_get_attach_cookie_trace, .gpl_only = false, 
.ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_attach_cookie_pe, struct bpf_perf_event_data_kern *, ctx) { return ctx->event->bpf_cookie; } static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = { .func = bpf_get_attach_cookie_pe, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_attach_cookie_tracing, void *, ctx) { struct bpf_trace_run_ctx *run_ctx; run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); return run_ctx->bpf_cookie; } static const struct bpf_func_proto bpf_get_attach_cookie_proto_tracing = { .func = bpf_get_attach_cookie_tracing, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags) { static const u32 br_entry_size = sizeof(struct perf_branch_entry); u32 entry_cnt = size / br_entry_size; entry_cnt = static_call(perf_snapshot_branch_stack)(buf, entry_cnt); if (unlikely(flags)) return -EINVAL; if (!entry_cnt) return -ENOENT; return entry_cnt * br_entry_size; } const struct bpf_func_proto bpf_get_branch_snapshot_proto = { .func = bpf_get_branch_snapshot, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_3(get_func_arg, void *, ctx, u32, n, u64 *, value) { /* This helper call is inlined by verifier. */ u64 nr_args = ((u64 *)ctx)[-1] & 0xFF; if ((u64) n >= nr_args) return -EINVAL; *value = ((u64 *)ctx)[n]; return 0; } static const struct bpf_func_proto bpf_get_func_arg_proto = { .func = get_func_arg, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg3_size = sizeof(u64), }; BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value) { /* This helper call is inlined by verifier. */ u64 nr_args = ((u64 *)ctx)[-1] & 0xFF; *value = ((u64 *)ctx)[nr_args]; return 0; } static const struct bpf_func_proto bpf_get_func_ret_proto = { .func = get_func_ret, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg2_size = sizeof(u64), }; BPF_CALL_1(get_func_arg_cnt, void *, ctx) { /* This helper call is inlined by verifier. */ return ((u64 *)ctx)[-1] & 0xFF; } static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = { .func = get_func_arg_cnt, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; switch (func_id) { case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE case BPF_FUNC_probe_read: return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_compat_proto; case BPF_FUNC_probe_read_str: return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_compat_str_proto; #endif case BPF_FUNC_get_func_ip: return &bpf_get_func_ip_proto_tracing; default: break; } func_proto = bpf_base_func_proto(func_id, prog); if (func_proto) return func_proto; if (!bpf_token_capable(prog->aux->token, CAP_SYS_ADMIN)) return NULL; switch (func_id) { case BPF_FUNC_probe_write_user: return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ? 
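/*
 * Illustrative sketch (not part of this file): the get_func_arg/ret/arg_cnt
 * helpers above from a trampoline-based fexit program.  Assumes libbpf's
 * bpf_tracing.h; do_unlinkat(int dfd, struct filename *name) is only an
 * example attach point.
 *
 *	SEC("fexit/do_unlinkat")
 *	int BPF_PROG(after_unlinkat)
 *	{
 *		u64 dfd = 0, ret = 0;
 *
 *		bpf_get_func_arg(ctx, 0, &dfd);		// first argument
 *		bpf_get_func_ret(ctx, &ret);		// return value
 *		bpf_printk("nargs=%llu ret=%ld",
 *			   bpf_get_func_arg_cnt(ctx), (long)ret);
 *		return 0;
 *	}
 */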
NULL : &bpf_probe_write_user_proto; default: return NULL; } } static bool is_kprobe_multi(const struct bpf_prog *prog) { return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI || prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; } static inline bool is_kprobe_session(const struct bpf_prog *prog) { return prog->type == BPF_PROG_TYPE_KPROBE && prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; } static inline bool is_uprobe_multi(const struct bpf_prog *prog) { return prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI || prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION; } static inline bool is_uprobe_session(const struct bpf_prog *prog) { return prog->type == BPF_PROG_TYPE_KPROBE && prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION; } static inline bool is_trace_fsession(const struct bpf_prog *prog) { return prog->type == BPF_PROG_TYPE_TRACING && prog->expected_attach_type == BPF_TRACE_FSESSION; } static const struct bpf_func_proto * kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto; case BPF_FUNC_get_stack: return prog->sleepable ? &bpf_get_stack_sleepable_proto : &bpf_get_stack_proto; #ifdef CONFIG_BPF_KPROBE_OVERRIDE case BPF_FUNC_override_return: return &bpf_override_return_proto; #endif case BPF_FUNC_get_func_ip: if (is_kprobe_multi(prog)) return &bpf_get_func_ip_proto_kprobe_multi; if (is_uprobe_multi(prog)) return &bpf_get_func_ip_proto_uprobe_multi; return &bpf_get_func_ip_proto_kprobe; case BPF_FUNC_get_attach_cookie: if (is_kprobe_multi(prog)) return &bpf_get_attach_cookie_proto_kmulti; if (is_uprobe_multi(prog)) return &bpf_get_attach_cookie_proto_umulti; return &bpf_get_attach_cookie_proto_trace; default: return bpf_tracing_func_proto(func_id, prog); } } /* bpf+kprobe programs can access fields of 'struct pt_regs' */ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct pt_regs)) return false; if (off % size != 0) return false; /* * Assertion for 32 bit to make sure last 8 byte access * (BPF_DW) to the last 4 byte member is disallowed. */ if (off + size > sizeof(struct pt_regs)) return false; if (type == BPF_WRITE) prog->aux->kprobe_write_ctx = true; return true; } const struct bpf_verifier_ops kprobe_verifier_ops = { .get_func_proto = kprobe_prog_func_proto, .is_valid_access = kprobe_prog_is_valid_access, }; const struct bpf_prog_ops kprobe_prog_ops = { }; BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, u64, flags, void *, data, u64, size) { struct pt_regs *regs = *(struct pt_regs **)tp_buff; /* * r1 points to perf tracepoint buffer where first 8 bytes are hidden * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it * from there and call the same bpf_perf_event_output() helper inline. 
*/ return ____bpf_perf_event_output(regs, map, flags, data, size); } static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { .func = bpf_perf_event_output_tp, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map, u64, flags) { struct pt_regs *regs = *(struct pt_regs **)tp_buff; /* * Same comment as in bpf_perf_event_output_tp(), only that this time * the other helper's function body cannot be inlined due to being * external, thus we need to call raw helper function. */ return bpf_get_stackid((unsigned long) regs, (unsigned long) map, flags, 0, 0); } static const struct bpf_func_proto bpf_get_stackid_proto_tp = { .func = bpf_get_stackid_tp, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size, u64, flags) { struct pt_regs *regs = *(struct pt_regs **)tp_buff; return bpf_get_stack((unsigned long) regs, (unsigned long) buf, (unsigned long) size, flags, 0); } static const struct bpf_func_proto bpf_get_stack_proto_tp = { .func = bpf_get_stack_tp, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; static const struct bpf_func_proto * tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; case BPF_FUNC_get_stack: return &bpf_get_stack_proto_tp; case BPF_FUNC_get_attach_cookie: return &bpf_get_attach_cookie_proto_trace; default: return bpf_tracing_func_proto(func_id, prog); } } static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) return false; if (type != BPF_READ) return false; if (off % size != 0) return false; BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64)); return true; } const struct bpf_verifier_ops tracepoint_verifier_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = tp_prog_is_valid_access, }; const struct bpf_prog_ops tracepoint_prog_ops = { }; BPF_CALL_3(bpf_perf_prog_read_value, struct bpf_perf_event_data_kern *, ctx, struct bpf_perf_event_value *, buf, u32, size) { int err = -EINVAL; if (unlikely(size != sizeof(struct bpf_perf_event_value))) goto clear; err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled, &buf->running); if (unlikely(err)) goto clear; return 0; clear: memset(buf, 0, size); return err; } static const struct bpf_func_proto bpf_perf_prog_read_value_proto = { .func = bpf_perf_prog_read_value, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE, }; BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx, void *, buf, u32, size, u64, flags) { static const u32 br_entry_size = sizeof(struct perf_branch_entry); struct perf_branch_stack *br_stack = ctx->data->br_stack; u32 to_copy; if (unlikely(flags & ~BPF_F_GET_BRANCH_RECORDS_SIZE)) return -EINVAL; if (unlikely(!(ctx->data->sample_flags & PERF_SAMPLE_BRANCH_STACK))) 
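/*
 * Illustrative sketch (not part of this file): the two modes of
 * bpf_read_branch_records() from a perf_event program.  Assumes the perf
 * event was opened with PERF_SAMPLE_BRANCH_STACK and the usual libbpf
 * BPF-side headers.
 *
 *	SEC("perf_event")
 *	int on_sample(struct bpf_perf_event_data *ctx)
 *	{
 *		struct perf_branch_entry lbr[16];
 *		long avail, copied;
 *
 *		// size query: NULL buffer, zero size, only the flag
 *		avail = bpf_read_branch_records(ctx, NULL, 0,
 *						BPF_F_GET_BRANCH_RECORDS_SIZE);
 *		// copy: size must be a multiple of sizeof(*lbr)
 *		copied = bpf_read_branch_records(ctx, lbr, sizeof(lbr), 0);
 *		if (copied > 0)
 *			bpf_printk("lbr: %ld of %ld bytes", copied, avail);
 *		return 0;
 *	}
 */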
return -ENOENT; if (unlikely(!br_stack)) return -ENOENT; if (flags & BPF_F_GET_BRANCH_RECORDS_SIZE) return br_stack->nr * br_entry_size; if (!buf || (size % br_entry_size != 0)) return -EINVAL; to_copy = min_t(u32, br_stack->nr * br_entry_size, size); memcpy(buf, br_stack->entries, to_copy); return to_copy; } static const struct bpf_func_proto bpf_read_branch_records_proto = { .func = bpf_read_branch_records, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM_OR_NULL | MEM_WRITE, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; static const struct bpf_func_proto * pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_pe; case BPF_FUNC_get_stack: return &bpf_get_stack_proto_pe; case BPF_FUNC_perf_prog_read_value: return &bpf_perf_prog_read_value_proto; case BPF_FUNC_read_branch_records: return &bpf_read_branch_records_proto; case BPF_FUNC_get_attach_cookie: return &bpf_get_attach_cookie_proto_pe; default: return bpf_tracing_func_proto(func_id, prog); } } /* * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp * to avoid potential recursive reuse issue when/if tracepoints are added * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack. * * Since raw tracepoints run despite bpf_prog_active, support concurrent usage * in normal, irq, and nmi context. */ struct bpf_raw_tp_regs { struct pt_regs regs[3]; }; static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs); static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level); static struct pt_regs *get_bpf_raw_tp_regs(void) { struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs); int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level); if (nest_level > ARRAY_SIZE(tp_regs->regs)) { this_cpu_dec(bpf_raw_tp_nest_level); return ERR_PTR(-EBUSY); } return &tp_regs->regs[nest_level - 1]; } static void put_bpf_raw_tp_regs(void) { this_cpu_dec(bpf_raw_tp_nest_level); } BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags, void *, data, u64, size) { struct pt_regs *regs = get_bpf_raw_tp_regs(); int ret; if (IS_ERR(regs)) return PTR_ERR(regs); perf_fetch_caller_regs(regs); ret = ____bpf_perf_event_output(regs, map, flags, data, size); put_bpf_raw_tp_regs(); return ret; } static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { .func = bpf_perf_event_output_raw_tp, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; extern const struct bpf_func_proto bpf_skb_output_proto; extern const struct bpf_func_proto bpf_xdp_output_proto; extern const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto; BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags) { struct pt_regs *regs = get_bpf_raw_tp_regs(); int ret; if (IS_ERR(regs)) return PTR_ERR(regs); perf_fetch_caller_regs(regs); /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */ ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map, flags, 0, 0); put_bpf_raw_tp_regs(); return ret; } static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { .func = bpf_get_stackid_raw_tp, .gpl_only = true, .ret_type = RET_INTEGER, 
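/*
 * Illustrative sketch (not part of this file): a raw_tracepoint program whose
 * ctx->args[] mirror the tracepoint's raw arguments.  Its call to
 * bpf_perf_event_output() lands in bpf_perf_event_output_raw_tp() above and
 * borrows one of the per-CPU bpf_raw_tp_regs slots.  Assumes libbpf headers
 * and the sched_switch argument order (preempt, prev, next).
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
 *		__uint(key_size, sizeof(int));
 *		__uint(value_size, sizeof(u32));
 *	} events SEC(".maps");
 *
 *	SEC("raw_tracepoint/sched_switch")
 *	int on_switch(struct bpf_raw_tracepoint_args *ctx)
 *	{
 *		u64 prev = ctx->args[1];	// struct task_struct * of prev task
 *
 *		bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
 *				      &prev, sizeof(prev));
 *		return 0;
 *	}
 */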
.arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args, void *, buf, u32, size, u64, flags) { struct pt_regs *regs = get_bpf_raw_tp_regs(); int ret; if (IS_ERR(regs)) return PTR_ERR(regs); perf_fetch_caller_regs(regs); ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf, (unsigned long) size, flags, 0); put_bpf_raw_tp_regs(); return ret; } static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { .func = bpf_get_stack_raw_tp, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; static const struct bpf_func_proto * raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto_raw_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_raw_tp; case BPF_FUNC_get_stack: return &bpf_get_stack_proto_raw_tp; case BPF_FUNC_get_attach_cookie: return &bpf_get_attach_cookie_proto_tracing; default: return bpf_tracing_func_proto(func_id, prog); } } const struct bpf_func_proto * tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *fn; switch (func_id) { #ifdef CONFIG_NET case BPF_FUNC_skb_output: return &bpf_skb_output_proto; case BPF_FUNC_xdp_output: return &bpf_xdp_output_proto; case BPF_FUNC_skc_to_tcp6_sock: return &bpf_skc_to_tcp6_sock_proto; case BPF_FUNC_skc_to_tcp_sock: return &bpf_skc_to_tcp_sock_proto; case BPF_FUNC_skc_to_tcp_timewait_sock: return &bpf_skc_to_tcp_timewait_sock_proto; case BPF_FUNC_skc_to_tcp_request_sock: return &bpf_skc_to_tcp_request_sock_proto; case BPF_FUNC_skc_to_udp6_sock: return &bpf_skc_to_udp6_sock_proto; case BPF_FUNC_skc_to_unix_sock: return &bpf_skc_to_unix_sock_proto; case BPF_FUNC_skc_to_mptcp_sock: return &bpf_skc_to_mptcp_sock_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_tracing_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_tracing_proto; case BPF_FUNC_sock_from_file: return &bpf_sock_from_file_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_ptr_cookie_proto; case BPF_FUNC_xdp_get_buff_len: return &bpf_xdp_get_buff_len_trace_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? &bpf_seq_printf_proto : NULL; case BPF_FUNC_seq_write: return prog->expected_attach_type == BPF_TRACE_ITER ? &bpf_seq_write_proto : NULL; case BPF_FUNC_seq_printf_btf: return prog->expected_attach_type == BPF_TRACE_ITER ? &bpf_seq_printf_btf_proto : NULL; case BPF_FUNC_d_path: return &bpf_d_path_proto; case BPF_FUNC_get_func_arg: if (bpf_prog_has_trampoline(prog) || prog->expected_attach_type == BPF_TRACE_RAW_TP) return &bpf_get_func_arg_proto; return NULL; case BPF_FUNC_get_func_ret: return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL; case BPF_FUNC_get_func_arg_cnt: if (bpf_prog_has_trampoline(prog) || prog->expected_attach_type == BPF_TRACE_RAW_TP) return &bpf_get_func_arg_cnt_proto; return NULL; case BPF_FUNC_get_attach_cookie: if (prog->type == BPF_PROG_TYPE_TRACING && prog->expected_attach_type == BPF_TRACE_RAW_TP) return &bpf_get_attach_cookie_proto_tracing; return bpf_prog_has_trampoline(prog) ? 
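/*
 * Illustrative sketch (not part of this file): bpf_seq_printf()/seq_write()
 * are only handed out by tracing_prog_func_proto() when expected_attach_type
 * is BPF_TRACE_ITER, i.e. to BPF iterator programs such as this one.  Assumes
 * libbpf's vmlinux.h and the BPF_SEQ_PRINTF convenience macro.
 *
 *	SEC("iter/task")
 *	int dump_tasks(struct bpf_iter__task *ctx)
 *	{
 *		struct seq_file *seq = ctx->meta->seq;
 *		struct task_struct *task = ctx->task;
 *
 *		if (!task)			// NULL marks the end of iteration
 *			return 0;
 *		BPF_SEQ_PRINTF(seq, "%d\t%s\n", task->pid, task->comm);
 *		return 0;
 *	}
 */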
&bpf_get_attach_cookie_proto_tracing : NULL; default: fn = raw_tp_prog_func_proto(func_id, prog); if (!fn && prog->expected_attach_type == BPF_TRACE_ITER) fn = bpf_iter_get_func_proto(func_id, prog); return fn; } } static bool raw_tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { return bpf_tracing_ctx_access(off, size, type); } static bool tracing_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { return bpf_tracing_btf_ctx_access(off, size, type, prog, info); } int __weak bpf_prog_test_run_tracing(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { return -ENOTSUPP; } const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { .get_func_proto = raw_tp_prog_func_proto, .is_valid_access = raw_tp_prog_is_valid_access, }; const struct bpf_prog_ops raw_tracepoint_prog_ops = { #ifdef CONFIG_NET .test_run = bpf_prog_test_run_raw_tp, #endif }; const struct bpf_verifier_ops tracing_verifier_ops = { .get_func_proto = tracing_prog_func_proto, .is_valid_access = tracing_prog_is_valid_access, }; const struct bpf_prog_ops tracing_prog_ops = { .test_run = bpf_prog_test_run_tracing, }; static bool raw_tp_writable_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off == 0) { if (size != sizeof(u64) || type != BPF_READ) return false; info->reg_type = PTR_TO_TP_BUFFER; } return raw_tp_prog_is_valid_access(off, size, type, prog, info); } const struct bpf_verifier_ops raw_tracepoint_writable_verifier_ops = { .get_func_proto = raw_tp_prog_func_proto, .is_valid_access = raw_tp_writable_prog_is_valid_access, }; const struct bpf_prog_ops raw_tracepoint_writable_prog_ops = { }; static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_u64 = sizeof(u64); if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) return false; if (type != BPF_READ) return false; if (off % size != 0) { if (sizeof(unsigned long) != 4) return false; if (size != 8) return false; if (off % size != 4) return false; } switch (off) { case bpf_ctx_range(struct bpf_perf_event_data, sample_period): bpf_ctx_record_field_size(info, size_u64); if (!bpf_ctx_narrow_access_ok(off, size, size_u64)) return false; break; case bpf_ctx_range(struct bpf_perf_event_data, addr): bpf_ctx_record_field_size(info, size_u64); if (!bpf_ctx_narrow_access_ok(off, size, size_u64)) return false; break; default: if (size != sizeof(long)) return false; } return true; } static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct bpf_perf_event_data, sample_period): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, data), si->dst_reg, si->src_reg, offsetof(struct bpf_perf_event_data_kern, data)); *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, bpf_target_off(struct perf_sample_data, period, 8, target_size)); break; case offsetof(struct bpf_perf_event_data, addr): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, data), si->dst_reg, si->src_reg, offsetof(struct bpf_perf_event_data_kern, data)); *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, 
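/*
 * Illustrative sketch (not part of this file): what pe_prog_convert_ctx_access()
 * enables.  A perf_event program reads struct bpf_perf_event_data fields
 * directly; at load time the marked accesses are rewritten into loads through
 * bpf_perf_event_data_kern->data and ->regs.  Userspace typically attaches
 * such a program with ioctl(perf_fd, PERF_EVENT_IOC_SET_BPF, prog_fd), which
 * ends up in perf_event_attach_bpf_prog() below.  Assumes libbpf's
 * bpf_tracing.h for PT_REGS_IP.
 *
 *	SEC("perf_event")
 *	int on_event(struct bpf_perf_event_data *ctx)
 *	{
 *		u64 period = ctx->sample_period;	// rewritten ctx access
 *		u64 ip = PT_REGS_IP(&ctx->regs);	// rewritten ctx access
 *
 *		bpf_printk("period=%llu ip=%llx", period, ip);
 *		return 0;
 *	}
 */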
bpf_target_off(struct perf_sample_data, addr, 8, target_size)); break; default: *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, regs), si->dst_reg, si->src_reg, offsetof(struct bpf_perf_event_data_kern, regs)); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), si->dst_reg, si->dst_reg, si->off); break; } return insn - insn_buf; } const struct bpf_verifier_ops perf_event_verifier_ops = { .get_func_proto = pe_prog_func_proto, .is_valid_access = pe_prog_is_valid_access, .convert_ctx_access = pe_prog_convert_ctx_access, }; const struct bpf_prog_ops perf_event_prog_ops = { }; static DEFINE_MUTEX(bpf_event_mutex); #define BPF_TRACE_MAX_PROGS 64 int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { struct bpf_prog_array *old_array; struct bpf_prog_array *new_array; int ret = -EEXIST; /* * Kprobe override only works if they are on the function entry, * and only if they are on the opt-in list. */ if (prog->kprobe_override && (!trace_kprobe_on_func_entry(event->tp_event) || !trace_kprobe_error_injectable(event->tp_event))) return -EINVAL; mutex_lock(&bpf_event_mutex); if (event->prog) goto unlock; old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); if (old_array && bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) { ret = -E2BIG; goto unlock; } ret = bpf_prog_array_copy(old_array, NULL, prog, bpf_cookie, &new_array); if (ret < 0) goto unlock; /* set the new array to event->tp_event and set event->prog */ event->prog = prog; event->bpf_cookie = bpf_cookie; rcu_assign_pointer(event->tp_event->prog_array, new_array); bpf_prog_array_free_sleepable(old_array); unlock: mutex_unlock(&bpf_event_mutex); return ret; } void perf_event_detach_bpf_prog(struct perf_event *event) { struct bpf_prog_array *old_array; struct bpf_prog_array *new_array; struct bpf_prog *prog = NULL; int ret; mutex_lock(&bpf_event_mutex); if (!event->prog) goto unlock; old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); if (!old_array) goto put; ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array); if (ret < 0) { bpf_prog_array_delete_safe(old_array, event->prog); } else { rcu_assign_pointer(event->tp_event->prog_array, new_array); bpf_prog_array_free_sleepable(old_array); } put: prog = event->prog; event->prog = NULL; unlock: mutex_unlock(&bpf_event_mutex); if (prog) { /* * It could be that the bpf_prog is not sleepable (and will be freed * via normal RCU), but is called from a point that supports sleepable * programs and uses tasks-trace-RCU. */ synchronize_rcu_tasks_trace(); bpf_prog_put(prog); } } int perf_event_query_prog_array(struct perf_event *event, void __user *info) { struct perf_event_query_bpf __user *uquery = info; struct perf_event_query_bpf query = {}; struct bpf_prog_array *progs; u32 *ids, prog_cnt, ids_len; int ret; if (!perfmon_capable()) return -EPERM; if (event->attr.type != PERF_TYPE_TRACEPOINT) return -EINVAL; if (copy_from_user(&query, uquery, sizeof(query))) return -EFAULT; ids_len = query.ids_len; if (ids_len > BPF_TRACE_MAX_PROGS) return -E2BIG; ids = kcalloc(ids_len, sizeof(u32), GFP_USER | __GFP_NOWARN); if (!ids) return -ENOMEM; /* * The above kcalloc returns ZERO_SIZE_PTR when ids_len = 0, which * is required when user only wants to check for uquery->prog_cnt. * There is no need to check for it since the case is handled * gracefully in bpf_prog_array_copy_info. 
*/ mutex_lock(&bpf_event_mutex); progs = bpf_event_rcu_dereference(event->tp_event->prog_array); ret = bpf_prog_array_copy_info(progs, ids, ids_len, &prog_cnt); mutex_unlock(&bpf_event_mutex); if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) || copy_to_user(uquery->ids, ids, ids_len * sizeof(u32))) ret = -EFAULT; kfree(ids); return ret; } extern struct bpf_raw_event_map __start__bpf_raw_tp[]; extern struct bpf_raw_event_map __stop__bpf_raw_tp[]; struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name) { struct bpf_raw_event_map *btp = __start__bpf_raw_tp; for (; btp < __stop__bpf_raw_tp; btp++) { if (!strcmp(btp->tp->name, name)) return btp; } return bpf_get_raw_tracepoint_module(name); } void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) { struct module *mod; guard(rcu)(); mod = __module_address((unsigned long)btp); module_put(mod); } static __always_inline void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) { struct bpf_prog *prog = link->link.prog; struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; rcu_read_lock_dont_migrate(); if (unlikely(!bpf_prog_get_recursion_context(prog))) { bpf_prog_inc_misses_counter(prog); goto out; } run_ctx.bpf_cookie = link->cookie; old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); (void) bpf_prog_run(prog, args); bpf_reset_run_ctx(old_run_ctx); out: bpf_prog_put_recursion_context(prog); rcu_read_unlock_migrate(); } #define UNPACK(...) __VA_ARGS__ #define REPEAT_1(FN, DL, X, ...) FN(X) #define REPEAT_2(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_1(FN, DL, __VA_ARGS__) #define REPEAT_3(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_2(FN, DL, __VA_ARGS__) #define REPEAT_4(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_3(FN, DL, __VA_ARGS__) #define REPEAT_5(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_4(FN, DL, __VA_ARGS__) #define REPEAT_6(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_5(FN, DL, __VA_ARGS__) #define REPEAT_7(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_6(FN, DL, __VA_ARGS__) #define REPEAT_8(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_7(FN, DL, __VA_ARGS__) #define REPEAT_9(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_8(FN, DL, __VA_ARGS__) #define REPEAT_10(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_9(FN, DL, __VA_ARGS__) #define REPEAT_11(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_10(FN, DL, __VA_ARGS__) #define REPEAT_12(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_11(FN, DL, __VA_ARGS__) #define REPEAT(X, FN, DL, ...) 
REPEAT_##X(FN, DL, __VA_ARGS__) #define SARG(X) u64 arg##X #define COPY(X) args[X] = arg##X #define __DL_COM (,) #define __DL_SEM (;) #define __SEQ_0_11 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 #define BPF_TRACE_DEFN_x(x) \ void bpf_trace_run##x(struct bpf_raw_tp_link *link, \ REPEAT(x, SARG, __DL_COM, __SEQ_0_11)) \ { \ u64 args[x]; \ REPEAT(x, COPY, __DL_SEM, __SEQ_0_11); \ __bpf_trace_run(link, args); \ } \ EXPORT_SYMBOL_GPL(bpf_trace_run##x) BPF_TRACE_DEFN_x(1); BPF_TRACE_DEFN_x(2); BPF_TRACE_DEFN_x(3); BPF_TRACE_DEFN_x(4); BPF_TRACE_DEFN_x(5); BPF_TRACE_DEFN_x(6); BPF_TRACE_DEFN_x(7); BPF_TRACE_DEFN_x(8); BPF_TRACE_DEFN_x(9); BPF_TRACE_DEFN_x(10); BPF_TRACE_DEFN_x(11); BPF_TRACE_DEFN_x(12); int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link) { struct tracepoint *tp = btp->tp; struct bpf_prog *prog = link->link.prog; /* * check that program doesn't access arguments beyond what's * available in this tracepoint */ if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64)) return -EINVAL; if (prog->aux->max_tp_access > btp->writable_size) return -EINVAL; return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func, link); } int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link) { return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, link); } int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, u64 *probe_offset, u64 *probe_addr, unsigned long *missed) { bool is_tracepoint, is_syscall_tp; struct bpf_prog *prog; int flags, err = 0; prog = event->prog; if (!prog) return -ENOENT; /* not supporting BPF_PROG_TYPE_PERF_EVENT yet */ if (prog->type == BPF_PROG_TYPE_PERF_EVENT) return -EOPNOTSUPP; *prog_id = prog->aux->id; flags = event->tp_event->flags; is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT; is_syscall_tp = is_syscall_trace_event(event->tp_event); if (is_tracepoint || is_syscall_tp) { *buf = is_tracepoint ? 
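/*
 * For readability: roughly what the REPEAT()/BPF_TRACE_DEFN_x() machinery
 * above expands to for x == 2 (illustrative expansion only, no extra code is
 * being added here):
 *
 *	void bpf_trace_run2(struct bpf_raw_tp_link *link, u64 arg0, u64 arg1)
 *	{
 *		u64 args[2];
 *
 *		args[0] = arg0;
 *		args[1] = arg1;
 *		__bpf_trace_run(link, args);
 *	}
 *	EXPORT_SYMBOL_GPL(bpf_trace_run2);
 *
 * The per-tracepoint __bpf_trace_##call() stubs generated from
 * include/trace/bpf_probe.h call the bpf_trace_runN() variant that matches
 * the tracepoint's argument count.
 */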
event->tp_event->tp->name : event->tp_event->name; /* We allow NULL pointer for tracepoint */ if (fd_type) *fd_type = BPF_FD_TYPE_TRACEPOINT; if (probe_offset) *probe_offset = 0x0; if (probe_addr) *probe_addr = 0x0; } else { /* kprobe/uprobe */ err = -EOPNOTSUPP; #ifdef CONFIG_KPROBE_EVENTS if (flags & TRACE_EVENT_FL_KPROBE) err = bpf_get_kprobe_info(event, fd_type, buf, probe_offset, probe_addr, missed, event->attr.type == PERF_TYPE_TRACEPOINT); #endif #ifdef CONFIG_UPROBE_EVENTS if (flags & TRACE_EVENT_FL_UPROBE) err = bpf_get_uprobe_info(event, fd_type, buf, probe_offset, probe_addr, event->attr.type == PERF_TYPE_TRACEPOINT); #endif } return err; } static int __init send_signal_irq_work_init(void) { int cpu; struct send_signal_irq_work *work; for_each_possible_cpu(cpu) { work = per_cpu_ptr(&send_signal_work, cpu); init_irq_work(&work->irq_work, do_bpf_send_signal); } return 0; } subsys_initcall(send_signal_irq_work_init); #ifdef CONFIG_MODULES static int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module) { struct bpf_trace_module *btm, *tmp; struct module *mod = module; int ret = 0; if (mod->num_bpf_raw_events == 0 || (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING)) goto out; mutex_lock(&bpf_module_mutex); switch (op) { case MODULE_STATE_COMING: btm = kzalloc_obj(*btm); if (btm) { btm->module = module; list_add(&btm->list, &bpf_trace_modules); } else { ret = -ENOMEM; } break; case MODULE_STATE_GOING: list_for_each_entry_safe(btm, tmp, &bpf_trace_modules, list) { if (btm->module == module) { list_del(&btm->list); kfree(btm); break; } } break; } mutex_unlock(&bpf_module_mutex); out: return notifier_from_errno(ret); } static struct notifier_block bpf_module_nb = { .notifier_call = bpf_event_notify, }; static int __init bpf_event_init(void) { register_module_notifier(&bpf_module_nb); return 0; } fs_initcall(bpf_event_init); #endif /* CONFIG_MODULES */ struct bpf_session_run_ctx { struct bpf_run_ctx run_ctx; bool is_return; void *data; }; #ifdef CONFIG_FPROBE struct bpf_kprobe_multi_link { struct bpf_link link; struct fprobe fp; unsigned long *addrs; u64 *cookies; u32 cnt; u32 mods_cnt; struct module **mods; }; struct bpf_kprobe_multi_run_ctx { struct bpf_session_run_ctx session_ctx; struct bpf_kprobe_multi_link *link; unsigned long entry_ip; }; struct user_syms { const char **syms; char *buf; }; #ifndef CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS static DEFINE_PER_CPU(struct pt_regs, bpf_kprobe_multi_pt_regs); #define bpf_kprobe_multi_pt_regs_ptr() this_cpu_ptr(&bpf_kprobe_multi_pt_regs) #else #define bpf_kprobe_multi_pt_regs_ptr() (NULL) #endif static unsigned long ftrace_get_entry_ip(unsigned long fentry_ip) { unsigned long ip = ftrace_get_symaddr(fentry_ip); return ip ? 
: fentry_ip; } static int copy_user_syms(struct user_syms *us, unsigned long __user *usyms, u32 cnt) { unsigned long __user usymbol; const char **syms = NULL; char *buf = NULL, *p; int err = -ENOMEM; unsigned int i; syms = kvmalloc_array(cnt, sizeof(*syms), GFP_KERNEL); if (!syms) goto error; buf = kvmalloc_array(cnt, KSYM_NAME_LEN, GFP_KERNEL); if (!buf) goto error; for (p = buf, i = 0; i < cnt; i++) { if (__get_user(usymbol, usyms + i)) { err = -EFAULT; goto error; } err = strncpy_from_user(p, (const char __user *) usymbol, KSYM_NAME_LEN); if (err == KSYM_NAME_LEN) err = -E2BIG; if (err < 0) goto error; syms[i] = p; p += err + 1; } us->syms = syms; us->buf = buf; return 0; error: if (err) { kvfree(syms); kvfree(buf); } return err; } static void kprobe_multi_put_modules(struct module **mods, u32 cnt) { u32 i; for (i = 0; i < cnt; i++) module_put(mods[i]); } static void free_user_syms(struct user_syms *us) { kvfree(us->syms); kvfree(us->buf); } static void bpf_kprobe_multi_link_release(struct bpf_link *link) { struct bpf_kprobe_multi_link *kmulti_link; kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); unregister_fprobe(&kmulti_link->fp); kprobe_multi_put_modules(kmulti_link->mods, kmulti_link->mods_cnt); } static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link) { struct bpf_kprobe_multi_link *kmulti_link; kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); kvfree(kmulti_link->addrs); kvfree(kmulti_link->cookies); kfree(kmulti_link->mods); kfree(kmulti_link); } static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { u64 __user *ucookies = u64_to_user_ptr(info->kprobe_multi.cookies); u64 __user *uaddrs = u64_to_user_ptr(info->kprobe_multi.addrs); struct bpf_kprobe_multi_link *kmulti_link; u32 ucount = info->kprobe_multi.count; int err = 0, i; if (!uaddrs ^ !ucount) return -EINVAL; if (ucookies && !ucount) return -EINVAL; kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); info->kprobe_multi.count = kmulti_link->cnt; info->kprobe_multi.flags = kmulti_link->link.flags; info->kprobe_multi.missed = kmulti_link->fp.nmissed; if (!uaddrs) return 0; if (ucount < kmulti_link->cnt) err = -ENOSPC; else ucount = kmulti_link->cnt; if (ucookies) { if (kmulti_link->cookies) { if (copy_to_user(ucookies, kmulti_link->cookies, ucount * sizeof(u64))) return -EFAULT; } else { for (i = 0; i < ucount; i++) { if (put_user(0, ucookies + i)) return -EFAULT; } } } if (kallsyms_show_value(current_cred())) { if (copy_to_user(uaddrs, kmulti_link->addrs, ucount * sizeof(u64))) return -EFAULT; } else { for (i = 0; i < ucount; i++) { if (put_user(0, uaddrs + i)) return -EFAULT; } } return err; } #ifdef CONFIG_PROC_FS static void bpf_kprobe_multi_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_kprobe_multi_link *kmulti_link; bool has_cookies; kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); has_cookies = !!kmulti_link->cookies; seq_printf(seq, "kprobe_cnt:\t%u\n" "missed:\t%lu\n", kmulti_link->cnt, kmulti_link->fp.nmissed); seq_printf(seq, "%s\t %s\n", "cookie", "func"); for (int i = 0; i < kmulti_link->cnt; i++) { seq_printf(seq, "%llu\t %pS\n", has_cookies ? 
kmulti_link->cookies[i] : 0, (void *)kmulti_link->addrs[i]); } } #endif static const struct bpf_link_ops bpf_kprobe_multi_link_lops = { .release = bpf_kprobe_multi_link_release, .dealloc_deferred = bpf_kprobe_multi_link_dealloc, .fill_link_info = bpf_kprobe_multi_link_fill_link_info, #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_kprobe_multi_show_fdinfo, #endif }; static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void *priv) { const struct bpf_kprobe_multi_link *link = priv; unsigned long *addr_a = a, *addr_b = b; u64 *cookie_a, *cookie_b; cookie_a = link->cookies + (addr_a - link->addrs); cookie_b = link->cookies + (addr_b - link->addrs); /* swap addr_a/addr_b and cookie_a/cookie_b values */ swap(*addr_a, *addr_b); swap(*cookie_a, *cookie_b); } static int bpf_kprobe_multi_addrs_cmp(const void *a, const void *b) { const unsigned long *addr_a = a, *addr_b = b; if (*addr_a == *addr_b) return 0; return *addr_a < *addr_b ? -1 : 1; } static int bpf_kprobe_multi_cookie_cmp(const void *a, const void *b, const void *priv) { return bpf_kprobe_multi_addrs_cmp(a, b); } static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx) { struct bpf_kprobe_multi_run_ctx *run_ctx; struct bpf_kprobe_multi_link *link; u64 *cookie, entry_ip; unsigned long *addr; if (WARN_ON_ONCE(!ctx)) return 0; run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, session_ctx.run_ctx); link = run_ctx->link; if (!link->cookies) return 0; entry_ip = run_ctx->entry_ip; addr = bsearch(&entry_ip, link->addrs, link->cnt, sizeof(entry_ip), bpf_kprobe_multi_addrs_cmp); if (!addr) return 0; cookie = link->cookies + (addr - link->addrs); return *cookie; } static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) { struct bpf_kprobe_multi_run_ctx *run_ctx; run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, session_ctx.run_ctx); return run_ctx->entry_ip; } static __always_inline int kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, unsigned long entry_ip, struct ftrace_regs *fregs, bool is_return, void *data) { struct bpf_kprobe_multi_run_ctx run_ctx = { .session_ctx = { .is_return = is_return, .data = data, }, .link = link, .entry_ip = entry_ip, }; struct bpf_run_ctx *old_run_ctx; struct pt_regs *regs; int err; /* * graph tracer framework ensures we won't migrate, so there is no need * to use migrate_disable for bpf_prog_run again. The check here just for * __this_cpu_inc_return. */ cant_sleep(); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { bpf_prog_inc_misses_counter(link->link.prog); err = 1; goto out; } rcu_read_lock(); regs = ftrace_partial_regs(fregs, bpf_kprobe_multi_pt_regs_ptr()); old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx); err = bpf_prog_run(link->link.prog, regs); bpf_reset_run_ctx(old_run_ctx); ftrace_partial_regs_update(fregs, bpf_kprobe_multi_pt_regs_ptr()); rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); return err; } static int kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, unsigned long ret_ip, struct ftrace_regs *fregs, void *data) { struct bpf_kprobe_multi_link *link; int err; link = container_of(fp, struct bpf_kprobe_multi_link, fp); err = kprobe_multi_link_prog_run(link, ftrace_get_entry_ip(fentry_ip), fregs, false, data); return is_kprobe_session(link->link.prog) ? 
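/*
 * Illustrative sketch (not part of this file): attaching a kprobe.multi
 * program from userspace with per-symbol cookies, using libbpf's
 * bpf_program__attach_kprobe_multi_opts().  The skeleton handle and symbol
 * names are placeholders.  The addresses and cookies end up in
 * bpf_kprobe_multi_link above, sorted together so that
 * bpf_get_attach_cookie() can bsearch() the cookie by entry address.
 *
 *	const char *syms[] = { "vfs_read", "vfs_write" };
 *	__u64 cookies[] = { 1, 2 };
 *	LIBBPF_OPTS(bpf_kprobe_multi_opts, opts,
 *		.syms = syms,
 *		.cookies = cookies,
 *		.cnt = 2,
 *	);
 *	struct bpf_link *link =
 *		bpf_program__attach_kprobe_multi_opts(skel->progs.on_entry,
 *						      NULL, &opts);	// no glob pattern
 */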
err : 0; } static void kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip, unsigned long ret_ip, struct ftrace_regs *fregs, void *data) { struct bpf_kprobe_multi_link *link; link = container_of(fp, struct bpf_kprobe_multi_link, fp); kprobe_multi_link_prog_run(link, ftrace_get_entry_ip(fentry_ip), fregs, true, data); } static int symbols_cmp_r(const void *a, const void *b, const void *priv) { const char **str_a = (const char **) a; const char **str_b = (const char **) b; return strcmp(*str_a, *str_b); } struct multi_symbols_sort { const char **funcs; u64 *cookies; }; static void symbols_swap_r(void *a, void *b, int size, const void *priv) { const struct multi_symbols_sort *data = priv; const char **name_a = a, **name_b = b; swap(*name_a, *name_b); /* If defined, swap also related cookies. */ if (data->cookies) { u64 *cookie_a, *cookie_b; cookie_a = data->cookies + (name_a - data->funcs); cookie_b = data->cookies + (name_b - data->funcs); swap(*cookie_a, *cookie_b); } } struct modules_array { struct module **mods; int mods_cnt; int mods_cap; }; static int add_module(struct modules_array *arr, struct module *mod) { struct module **mods; if (arr->mods_cnt == arr->mods_cap) { arr->mods_cap = max(16, arr->mods_cap * 3 / 2); mods = krealloc_array(arr->mods, arr->mods_cap, sizeof(*mods), GFP_KERNEL); if (!mods) return -ENOMEM; arr->mods = mods; } arr->mods[arr->mods_cnt] = mod; arr->mods_cnt++; return 0; } static bool has_module(struct modules_array *arr, struct module *mod) { int i; for (i = arr->mods_cnt - 1; i >= 0; i--) { if (arr->mods[i] == mod) return true; } return false; } static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u32 addrs_cnt) { struct modules_array arr = {}; u32 i, err = 0; for (i = 0; i < addrs_cnt; i++) { bool skip_add = false; struct module *mod; scoped_guard(rcu) { mod = __module_address(addrs[i]); /* Either no module or it's already stored */ if (!mod || has_module(&arr, mod)) { skip_add = true; break; /* scoped_guard */ } if (!try_module_get(mod)) err = -EINVAL; } if (skip_add) continue; if (err) break; err = add_module(&arr, mod); if (err) { module_put(mod); break; } } /* We return either err < 0 in case of error, ... */ if (err) { kprobe_multi_put_modules(arr.mods, arr.mods_cnt); kfree(arr.mods); return err; } /* or number of modules found if everything is ok. */ *mods = arr.mods; return arr.mods_cnt; } static int addrs_check_error_injection_list(unsigned long *addrs, u32 cnt) { u32 i; for (i = 0; i < cnt; i++) { if (!within_error_injection_list(addrs[i])) return -EINVAL; } return 0; } int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_kprobe_multi_link *link = NULL; struct bpf_link_primer link_primer; void __user *ucookies; unsigned long *addrs; u32 flags, cnt, size; void __user *uaddrs; u64 *cookies = NULL; void __user *usyms; int err; /* no support for 32bit archs yet */ if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; if (attr->link_create.flags) return -EINVAL; if (!is_kprobe_multi(prog)) return -EINVAL; /* kprobe_multi is not allowed to be sleepable. */ if (prog->sleepable) return -EINVAL; /* Writing to context is not allowed for kprobes. 
*/ if (prog->aux->kprobe_write_ctx) return -EINVAL; flags = attr->link_create.kprobe_multi.flags; if (flags & ~BPF_F_KPROBE_MULTI_RETURN) return -EINVAL; uaddrs = u64_to_user_ptr(attr->link_create.kprobe_multi.addrs); usyms = u64_to_user_ptr(attr->link_create.kprobe_multi.syms); if (!!uaddrs == !!usyms) return -EINVAL; cnt = attr->link_create.kprobe_multi.cnt; if (!cnt) return -EINVAL; if (cnt > MAX_KPROBE_MULTI_CNT) return -E2BIG; size = cnt * sizeof(*addrs); addrs = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL); if (!addrs) return -ENOMEM; ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies); if (ucookies) { cookies = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL); if (!cookies) { err = -ENOMEM; goto error; } if (copy_from_user(cookies, ucookies, size)) { err = -EFAULT; goto error; } } if (uaddrs) { if (copy_from_user(addrs, uaddrs, size)) { err = -EFAULT; goto error; } } else { struct multi_symbols_sort data = { .cookies = cookies, }; struct user_syms us; err = copy_user_syms(&us, usyms, cnt); if (err) goto error; if (cookies) data.funcs = us.syms; sort_r(us.syms, cnt, sizeof(*us.syms), symbols_cmp_r, symbols_swap_r, &data); err = ftrace_lookup_symbols(us.syms, cnt, addrs); free_user_syms(&us); if (err) goto error; } if (prog->kprobe_override && addrs_check_error_injection_list(addrs, cnt)) { err = -EINVAL; goto error; } link = kzalloc_obj(*link); if (!link) { err = -ENOMEM; goto error; } bpf_link_init(&link->link, BPF_LINK_TYPE_KPROBE_MULTI, &bpf_kprobe_multi_link_lops, prog, attr->link_create.attach_type); err = bpf_link_prime(&link->link, &link_primer); if (err) goto error; if (!(flags & BPF_F_KPROBE_MULTI_RETURN)) link->fp.entry_handler = kprobe_multi_link_handler; if ((flags & BPF_F_KPROBE_MULTI_RETURN) || is_kprobe_session(prog)) link->fp.exit_handler = kprobe_multi_link_exit_handler; if (is_kprobe_session(prog)) link->fp.entry_data_size = sizeof(u64); link->addrs = addrs; link->cookies = cookies; link->cnt = cnt; link->link.flags = flags; if (cookies) { /* * Sorting addresses will trigger sorting cookies as well * (check bpf_kprobe_multi_cookie_swap). This way we can * find cookie based on the address in bpf_get_attach_cookie * helper. 
*/ sort_r(addrs, cnt, sizeof(*addrs), bpf_kprobe_multi_cookie_cmp, bpf_kprobe_multi_cookie_swap, link); } err = get_modules_for_addrs(&link->mods, addrs, cnt); if (err < 0) { bpf_link_cleanup(&link_primer); return err; } link->mods_cnt = err; err = register_fprobe_ips(&link->fp, addrs, cnt); if (err) { kprobe_multi_put_modules(link->mods, link->mods_cnt); bpf_link_cleanup(&link_primer); return err; } return bpf_link_settle(&link_primer); error: kfree(link); kvfree(addrs); kvfree(cookies); return err; } #else /* !CONFIG_FPROBE */ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EOPNOTSUPP; } static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx) { return 0; } static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) { return 0; } #endif #ifdef CONFIG_UPROBES struct bpf_uprobe_multi_link; struct bpf_uprobe { struct bpf_uprobe_multi_link *link; loff_t offset; unsigned long ref_ctr_offset; u64 cookie; struct uprobe *uprobe; struct uprobe_consumer consumer; bool session; }; struct bpf_uprobe_multi_link { struct path path; struct bpf_link link; u32 cnt; struct bpf_uprobe *uprobes; struct task_struct *task; }; struct bpf_uprobe_multi_run_ctx { struct bpf_session_run_ctx session_ctx; unsigned long entry_ip; struct bpf_uprobe *uprobe; }; static void bpf_uprobe_unregister(struct bpf_uprobe *uprobes, u32 cnt) { u32 i; for (i = 0; i < cnt; i++) uprobe_unregister_nosync(uprobes[i].uprobe, &uprobes[i].consumer); if (cnt) uprobe_unregister_sync(); } static void bpf_uprobe_multi_link_release(struct bpf_link *link) { struct bpf_uprobe_multi_link *umulti_link; umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); bpf_uprobe_unregister(umulti_link->uprobes, umulti_link->cnt); if (umulti_link->task) put_task_struct(umulti_link->task); path_put(&umulti_link->path); } static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link) { struct bpf_uprobe_multi_link *umulti_link; umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); kvfree(umulti_link->uprobes); kfree(umulti_link); } static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { u64 __user *uref_ctr_offsets = u64_to_user_ptr(info->uprobe_multi.ref_ctr_offsets); u64 __user *ucookies = u64_to_user_ptr(info->uprobe_multi.cookies); u64 __user *uoffsets = u64_to_user_ptr(info->uprobe_multi.offsets); u64 __user *upath = u64_to_user_ptr(info->uprobe_multi.path); u32 upath_size = info->uprobe_multi.path_size; struct bpf_uprobe_multi_link *umulti_link; u32 ucount = info->uprobe_multi.count; int err = 0, i; char *p, *buf; long left = 0; if (!upath ^ !upath_size) return -EINVAL; if ((uoffsets || uref_ctr_offsets || ucookies) && !ucount) return -EINVAL; umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); info->uprobe_multi.count = umulti_link->cnt; info->uprobe_multi.flags = umulti_link->link.flags; info->uprobe_multi.pid = umulti_link->task ? task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0; upath_size = upath_size ? 
min_t(u32, upath_size, PATH_MAX) : PATH_MAX; buf = kmalloc(upath_size, GFP_KERNEL); if (!buf) return -ENOMEM; p = d_path(&umulti_link->path, buf, upath_size); if (IS_ERR(p)) { kfree(buf); return PTR_ERR(p); } upath_size = buf + upath_size - p; if (upath) left = copy_to_user(upath, p, upath_size); kfree(buf); if (left) return -EFAULT; info->uprobe_multi.path_size = upath_size; if (!uoffsets && !ucookies && !uref_ctr_offsets) return 0; if (ucount < umulti_link->cnt) err = -ENOSPC; else ucount = umulti_link->cnt; for (i = 0; i < ucount; i++) { if (uoffsets && put_user(umulti_link->uprobes[i].offset, uoffsets + i)) return -EFAULT; if (uref_ctr_offsets && put_user(umulti_link->uprobes[i].ref_ctr_offset, uref_ctr_offsets + i)) return -EFAULT; if (ucookies && put_user(umulti_link->uprobes[i].cookie, ucookies + i)) return -EFAULT; } return err; } #ifdef CONFIG_PROC_FS static void bpf_uprobe_multi_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_uprobe_multi_link *umulti_link; char *p, *buf; pid_t pid; umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) return; p = d_path(&umulti_link->path, buf, PATH_MAX); if (IS_ERR(p)) { kfree(buf); return; } pid = umulti_link->task ? task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0; seq_printf(seq, "uprobe_cnt:\t%u\n" "pid:\t%u\n" "path:\t%s\n", umulti_link->cnt, pid, p); seq_printf(seq, "%s\t %s\t %s\n", "cookie", "offset", "ref_ctr_offset"); for (int i = 0; i < umulti_link->cnt; i++) { seq_printf(seq, "%llu\t %#llx\t %#lx\n", umulti_link->uprobes[i].cookie, umulti_link->uprobes[i].offset, umulti_link->uprobes[i].ref_ctr_offset); } kfree(buf); } #endif static const struct bpf_link_ops bpf_uprobe_multi_link_lops = { .release = bpf_uprobe_multi_link_release, .dealloc_deferred = bpf_uprobe_multi_link_dealloc, .fill_link_info = bpf_uprobe_multi_link_fill_link_info, #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_uprobe_multi_show_fdinfo, #endif }; static int uprobe_prog_run(struct bpf_uprobe *uprobe, unsigned long entry_ip, struct pt_regs *regs, bool is_return, void *data) { struct bpf_uprobe_multi_link *link = uprobe->link; struct bpf_uprobe_multi_run_ctx run_ctx = { .session_ctx = { .is_return = is_return, .data = data, }, .entry_ip = entry_ip, .uprobe = uprobe, }; struct bpf_prog *prog = link->link.prog; bool sleepable = prog->sleepable; struct bpf_run_ctx *old_run_ctx; int err; if (link->task && !same_thread_group(current, link->task)) return 0; if (sleepable) rcu_read_lock_trace(); else rcu_read_lock(); migrate_disable(); old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx); err = bpf_prog_run(link->link.prog, regs); bpf_reset_run_ctx(old_run_ctx); migrate_enable(); if (sleepable) rcu_read_unlock_trace(); else rcu_read_unlock(); return err; } static bool uprobe_multi_link_filter(struct uprobe_consumer *con, struct mm_struct *mm) { struct bpf_uprobe *uprobe; uprobe = container_of(con, struct bpf_uprobe, consumer); return uprobe->link->task->mm == mm; } static int uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs, __u64 *data) { struct bpf_uprobe *uprobe; int ret; uprobe = container_of(con, struct bpf_uprobe, consumer); ret = uprobe_prog_run(uprobe, instruction_pointer(regs), regs, false, data); if (uprobe->session) return ret ? 
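/*
 * Illustrative sketch (not part of this file): attaching a uprobe.multi
 * program from userspace via libbpf's pattern-based
 * bpf_program__attach_uprobe_multi().  The library path and skeleton handle
 * are placeholders.
 *
 *	struct bpf_link *link =
 *		bpf_program__attach_uprobe_multi(skel->progs.on_malloc,
 *						 0,			// pid filter (0: none)
 *						 "/usr/lib/libc.so.6",	// binary path
 *						 "malloc",		// function pattern
 *						 NULL);			// default opts
 */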
UPROBE_HANDLER_IGNORE : 0; return 0; } static int uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs, __u64 *data) { struct bpf_uprobe *uprobe; uprobe = container_of(con, struct bpf_uprobe, consumer); uprobe_prog_run(uprobe, func, regs, true, data); return 0; } static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) { struct bpf_uprobe_multi_run_ctx *run_ctx; run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, session_ctx.run_ctx); return run_ctx->entry_ip; } static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx) { struct bpf_uprobe_multi_run_ctx *run_ctx; run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, session_ctx.run_ctx); return run_ctx->uprobe->cookie; } int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_uprobe_multi_link *link = NULL; unsigned long __user *uref_ctr_offsets; struct bpf_link_primer link_primer; struct bpf_uprobe *uprobes = NULL; struct task_struct *task = NULL; unsigned long __user *uoffsets; u64 __user *ucookies; void __user *upath; u32 flags, cnt, i; struct path path; char *name; pid_t pid; int err; /* no support for 32bit archs yet */ if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; if (attr->link_create.flags) return -EINVAL; if (!is_uprobe_multi(prog)) return -EINVAL; flags = attr->link_create.uprobe_multi.flags; if (flags & ~BPF_F_UPROBE_MULTI_RETURN) return -EINVAL; /* * path, offsets and cnt are mandatory, * ref_ctr_offsets and cookies are optional */ upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path); uoffsets = u64_to_user_ptr(attr->link_create.uprobe_multi.offsets); cnt = attr->link_create.uprobe_multi.cnt; pid = attr->link_create.uprobe_multi.pid; if (!upath || !uoffsets || !cnt || pid < 0) return -EINVAL; if (cnt > MAX_UPROBE_MULTI_CNT) return -E2BIG; uref_ctr_offsets = u64_to_user_ptr(attr->link_create.uprobe_multi.ref_ctr_offsets); ucookies = u64_to_user_ptr(attr->link_create.uprobe_multi.cookies); name = strndup_user(upath, PATH_MAX); if (IS_ERR(name)) { err = PTR_ERR(name); return err; } err = kern_path(name, LOOKUP_FOLLOW, &path); kfree(name); if (err) return err; if (!d_is_reg(path.dentry)) { err = -EBADF; goto error_path_put; } if (pid) { rcu_read_lock(); task = get_pid_task(find_vpid(pid), PIDTYPE_TGID); rcu_read_unlock(); if (!task) { err = -ESRCH; goto error_path_put; } } err = -ENOMEM; link = kzalloc_obj(*link); uprobes = kvzalloc_objs(*uprobes, cnt); if (!uprobes || !link) goto error_free; for (i = 0; i < cnt; i++) { if (__get_user(uprobes[i].offset, uoffsets + i)) { err = -EFAULT; goto error_free; } if (uprobes[i].offset < 0) { err = -EINVAL; goto error_free; } if (uref_ctr_offsets && __get_user(uprobes[i].ref_ctr_offset, uref_ctr_offsets + i)) { err = -EFAULT; goto error_free; } if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) { err = -EFAULT; goto error_free; } uprobes[i].link = link; if (!(flags & BPF_F_UPROBE_MULTI_RETURN)) uprobes[i].consumer.handler = uprobe_multi_link_handler; if (flags & BPF_F_UPROBE_MULTI_RETURN || is_uprobe_session(prog)) uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler; if (is_uprobe_session(prog)) uprobes[i].session = true; if (pid) uprobes[i].consumer.filter = uprobe_multi_link_filter; } link->cnt = cnt; link->uprobes = uprobes; link->path = path; link->task = task; link->link.flags = flags; bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI, &bpf_uprobe_multi_link_lops, prog, attr->link_create.attach_type); for (i = 
0; i < cnt; i++) { uprobes[i].uprobe = uprobe_register(d_real_inode(link->path.dentry), uprobes[i].offset, uprobes[i].ref_ctr_offset, &uprobes[i].consumer); if (IS_ERR(uprobes[i].uprobe)) { err = PTR_ERR(uprobes[i].uprobe); link->cnt = i; goto error_unregister; } } err = bpf_link_prime(&link->link, &link_primer); if (err) goto error_unregister; return bpf_link_settle(&link_primer); error_unregister: bpf_uprobe_unregister(uprobes, link->cnt); error_free: kvfree(uprobes); kfree(link); if (task) put_task_struct(task); error_path_put: path_put(&path); return err; } #else /* !CONFIG_UPROBES */ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EOPNOTSUPP; } static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx) { return 0; } static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) { return 0; } #endif /* CONFIG_UPROBES */ __bpf_kfunc_start_defs(); __bpf_kfunc bool bpf_session_is_return(void *ctx) { struct bpf_session_run_ctx *session_ctx; session_ctx = container_of(current->bpf_ctx, struct bpf_session_run_ctx, run_ctx); return session_ctx->is_return; } __bpf_kfunc __u64 *bpf_session_cookie(void *ctx) { struct bpf_session_run_ctx *session_ctx; session_ctx = container_of(current->bpf_ctx, struct bpf_session_run_ctx, run_ctx); return session_ctx->data; } __bpf_kfunc_end_defs(); BTF_KFUNCS_START(session_kfunc_set_ids) BTF_ID_FLAGS(func, bpf_session_is_return) BTF_ID_FLAGS(func, bpf_session_cookie) BTF_KFUNCS_END(session_kfunc_set_ids) static int bpf_session_filter(const struct bpf_prog *prog, u32 kfunc_id) { if (!btf_id_set8_contains(&session_kfunc_set_ids, kfunc_id)) return 0; if (!is_kprobe_session(prog) && !is_uprobe_session(prog) && !is_trace_fsession(prog)) return -EACCES; return 0; } static const struct btf_kfunc_id_set bpf_session_kfunc_set = { .owner = THIS_MODULE, .set = &session_kfunc_set_ids, .filter = bpf_session_filter, }; static int __init bpf_trace_kfuncs_init(void) { int err = 0; err = err ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_session_kfunc_set); err = err ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_session_kfunc_set); return err; } late_initcall(bpf_trace_kfuncs_init); typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struct *tsk); /* * The __always_inline is to make sure the compiler doesn't * generate indirect calls into callbacks, which is expensive, * on some kernel configurations. This allows compiler to put * direct calls into all the specific callback implementations * (copy_user_data_sleepable, copy_user_data_nofault, and so on) */ static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u64 doff, u64 size, const void *unsafe_src, copy_fn_t str_copy_fn, struct task_struct *tsk) { struct bpf_dynptr_kern *dst; u64 chunk_sz, off; void *dst_slice; int cnt, err; char buf[256]; dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size); if (likely(dst_slice)) return str_copy_fn(dst_slice, unsafe_src, size, tsk); dst = (struct bpf_dynptr_kern *)dptr; if (bpf_dynptr_check_off_len(dst, doff, size)) return -E2BIG; for (off = 0; off < size; off += chunk_sz - 1) { chunk_sz = min_t(u64, sizeof(buf), size - off); /* Expect str_copy_fn to return count of copied bytes, including * zero terminator. Next iteration increment off by chunk_sz - 1 to * overwrite NUL. 
*/ cnt = str_copy_fn(buf, unsafe_src + off, chunk_sz, tsk); if (cnt < 0) return cnt; err = __bpf_dynptr_write(dst, doff + off, buf, cnt, 0); if (err) return err; if (cnt < chunk_sz || chunk_sz == 1) /* we are done */ return off + cnt; } return off; } static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u64 doff, u64 size, const void *unsafe_src, copy_fn_t copy_fn, struct task_struct *tsk) { struct bpf_dynptr_kern *dst; void *dst_slice; char buf[256]; u64 off, chunk_sz; int err; dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size); if (likely(dst_slice)) return copy_fn(dst_slice, unsafe_src, size, tsk); dst = (struct bpf_dynptr_kern *)dptr; if (bpf_dynptr_check_off_len(dst, doff, size)) return -E2BIG; for (off = 0; off < size; off += chunk_sz) { chunk_sz = min_t(u64, sizeof(buf), size - off); err = copy_fn(buf, unsafe_src + off, chunk_sz, tsk); if (err) return err; err = __bpf_dynptr_write(dst, doff + off, buf, chunk_sz, 0); if (err) return err; } return 0; } static __always_inline int copy_user_data_nofault(void *dst, const void *unsafe_src, u32 size, struct task_struct *tsk) { return copy_from_user_nofault(dst, (const void __user *)unsafe_src, size); } static __always_inline int copy_user_data_sleepable(void *dst, const void *unsafe_src, u32 size, struct task_struct *tsk) { int ret; if (!tsk) { /* Read from the current task */ ret = copy_from_user(dst, (const void __user *)unsafe_src, size); if (ret) return -EFAULT; return 0; } ret = access_process_vm(tsk, (unsigned long)unsafe_src, dst, size, 0); if (ret != size) return -EFAULT; return 0; } static __always_inline int copy_kernel_data_nofault(void *dst, const void *unsafe_src, u32 size, struct task_struct *tsk) { return copy_from_kernel_nofault(dst, unsafe_src, size); } static __always_inline int copy_user_str_nofault(void *dst, const void *unsafe_src, u32 size, struct task_struct *tsk) { return strncpy_from_user_nofault(dst, (const void __user *)unsafe_src, size); } static __always_inline int copy_user_str_sleepable(void *dst, const void *unsafe_src, u32 size, struct task_struct *tsk) { int ret; if (unlikely(size == 0)) return 0; if (tsk) { ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_src, dst, size, 0); } else { ret = strncpy_from_user(dst, (const void __user *)unsafe_src, size - 1); /* strncpy_from_user does not guarantee NUL termination */ if (ret >= 0) ((char *)dst)[ret] = '\0'; } if (ret < 0) return ret; return ret + 1; } static __always_inline int copy_kernel_str_nofault(void *dst, const void *unsafe_src, u32 size, struct task_struct *tsk) { return strncpy_from_kernel_nofault(dst, unsafe_src, size); } __bpf_kfunc_start_defs(); __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid_type type, u64 value) { if (type != PIDTYPE_PID && type != PIDTYPE_TGID) return -EINVAL; return bpf_send_signal_common(sig, type, task, value); } __bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_data_nofault, NULL); } __bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign, copy_kernel_data_nofault, NULL); } __bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, (const void __force 
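/*
 * Illustrative sketch (not part of this file): the bpf_session_is_return()
 * and bpf_session_cookie() kfuncs from a kprobe session program, which runs
 * once on entry and once on return of the target.  The extern declarations
 * below are assumptions made to match the kfunc signatures above; assumes
 * libbpf headers on the BPF side.
 *
 *	extern bool bpf_session_is_return(void *ctx) __ksym;
 *	extern __u64 *bpf_session_cookie(void *ctx) __ksym;
 *
 *	SEC("kprobe.session/do_sys_openat2")
 *	int handle_openat2(struct pt_regs *ctx)
 *	{
 *		__u64 *cookie = bpf_session_cookie(ctx);	// 8 bytes shared entry/return
 *
 *		if (!bpf_session_is_return(ctx)) {
 *			*cookie = bpf_ktime_get_ns();	// stash entry timestamp
 *			return 0;			// 0: do run the return probe
 *		}
 *		bpf_printk("latency=%llu ns", bpf_ktime_get_ns() - *cookie);
 *		return 0;
 *	}
 */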
*)unsafe_ptr__ign, copy_user_str_nofault, NULL); } __bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign, copy_kernel_str_nofault, NULL); } __bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_data_sleepable, NULL); } __bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_str_sleepable, NULL); } __bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_data_sleepable, tsk); } __bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_str_sleepable, tsk); } __bpf_kfunc_end_defs(); |
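Below is a hedged, illustrative BPF-side sketch (not part of the kernel source above) showing how a program might call the bpf_probe_read_user_dynptr() kfunc defined in this file, copying bytes from a traced user pointer into a ring-buffer-backed dynptr. The map name, sizes, and the uprobe attach point are assumptions; the kfunc signature mirrors the kernel definition above, and the SEC()/BPF_UPROBE macros come from libbpf.

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* kfunc defined in kernel/trace/bpf_trace.c (see above) */
extern int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, __u64 off,
				       __u64 size, const void *unsafe_ptr__ign) __ksym;

struct {
	__uint(type, BPF_MAP_TYPE_RINGBUF);
	__uint(max_entries, 1 << 16);
} rb SEC(".maps");

SEC("uprobe")
int BPF_UPROBE(copy_first_arg, const char *user_buf)
{
	struct bpf_dynptr dptr;

	/* Reserve 64 bytes in the ring buffer, exposed as a dynptr. */
	if (bpf_ringbuf_reserve_dynptr(&rb, 64, 0, &dptr)) {
		/* Reservation failed; the dynptr must still be released. */
		bpf_ringbuf_discard_dynptr(&dptr, 0);
		return 0;
	}

	/* Copy 64 bytes of user memory into the reserved record. */
	bpf_probe_read_user_dynptr(&dptr, 0, 64, user_buf);

	bpf_ringbuf_submit_dynptr(&dptr, 0);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";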
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SWAP_H #define _LINUX_SWAP_H #include <linux/spinlock.h> #include <linux/linkage.h> #include <linux/mmzone.h> #include <linux/list.h> #include <linux/memcontrol.h> #include <linux/sched.h> #include <linux/node.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/atomic.h> #include <linux/page-flags.h> #include <uapi/linux/mempolicy.h> #include <asm/page.h> struct notifier_block; struct bio; #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff #define SWAP_FLAG_DISCARD 0x10000 /* enable discard for swap */ #define SWAP_FLAG_DISCARD_ONCE 0x20000 /* discard swap area at swapon-time */ #define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */ #define SWAP_FLAGS_VALID (SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \ SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE | \ SWAP_FLAG_DISCARD_PAGES) #define SWAP_BATCH 64 static inline int current_is_kswapd(void) { return current->flags & PF_KSWAPD; } /* * MAX_SWAPFILES defines the maximum number of swaptypes: things which can *
be swapped to. The swap type and the offset into that swap type are * encoded into pte's and into pgoff_t's in the swapcache. Using five bits * for the type means that the maximum number of swapcache pages is 27 bits * on 32-bit-pgoff_t architectures. And that assumes that the architecture packs * the type/offset into the pte as 5/27 as well. */ #define MAX_SWAPFILES_SHIFT 5 /* * Use some of the swap files numbers for other purposes. This * is a convenient way to hook into the VM to trigger special * actions on faults. */ /* * PTE markers are used to persist information onto PTEs that otherwise * should be a none pte. As its name "PTE" hints, it should only be * applied to the leaves of pgtables. */ #define SWP_PTE_MARKER_NUM 1 #define SWP_PTE_MARKER (MAX_SWAPFILES + SWP_HWPOISON_NUM + \ SWP_MIGRATION_NUM + SWP_DEVICE_NUM) /* * Unaddressable device memory support. See include/linux/hmm.h and * Documentation/mm/hmm.rst. Short description is we need struct pages for * device memory that is unaddressable (inaccessible) by CPU, so that we can * migrate part of a process memory to device memory. * * When a page is migrated from CPU to device, we set the CPU page table entry * to a special SWP_DEVICE_{READ|WRITE} entry. * * When a page is mapped by the device for exclusive access we set the CPU page * table entries to a special SWP_DEVICE_EXCLUSIVE entry. */ #ifdef CONFIG_DEVICE_PRIVATE #define SWP_DEVICE_NUM 3 #define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM) #define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1) #define SWP_DEVICE_EXCLUSIVE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2) #else #define SWP_DEVICE_NUM 0 #endif /* * Page migration support. * * SWP_MIGRATION_READ_EXCLUSIVE is only applicable to anonymous pages and * indicates that the referenced (part of) an anonymous page is exclusive to * a single process. For SWP_MIGRATION_WRITE, that information is implicit: * (part of) an anonymous page that are mapped writable are exclusive to a * single process. */ #ifdef CONFIG_MIGRATION #define SWP_MIGRATION_NUM 3 #define SWP_MIGRATION_READ (MAX_SWAPFILES + SWP_HWPOISON_NUM) #define SWP_MIGRATION_READ_EXCLUSIVE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 1) #define SWP_MIGRATION_WRITE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 2) #else #define SWP_MIGRATION_NUM 0 #endif /* * Handling of hardware poisoned pages with memory corruption. */ #ifdef CONFIG_MEMORY_FAILURE #define SWP_HWPOISON_NUM 1 #define SWP_HWPOISON MAX_SWAPFILES #else #define SWP_HWPOISON_NUM 0 #endif #define MAX_SWAPFILES \ ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \ SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \ SWP_PTE_MARKER_NUM) /* * Magic header for a swap area. The first part of the union is * what the swap magic looks like for the old (limited to 128MB) * swap area format, the second part of the union adds - in the * old reserved area - some extra information. Note that the first * kilobyte is reserved for boot loader or disk label stuff... * * Having the magic at the end of the PAGE_SIZE makes detecting swap * areas somewhat tricky on machines that support multiple page sizes. * For 2.5 we'll probably want to move the magic to just beyond the * bootbits... */ union swap_header { struct { char reserved[PAGE_SIZE - 10]; char magic[10]; /* SWAP-SPACE or SWAPSPACE2 */ } magic; struct { char bootbits[1024]; /* Space for disklabel etc. 
*/ __u32 version; __u32 last_page; __u32 nr_badpages; unsigned char sws_uuid[16]; unsigned char sws_volume[16]; __u32 padding[117]; __u32 badpages[1]; } info; }; /* * current->reclaim_state points to one of these when a task is running * memory reclaim */ struct reclaim_state { /* pages reclaimed outside of LRU-based reclaim */ unsigned long reclaimed; #ifdef CONFIG_LRU_GEN /* per-thread mm walk data */ struct lru_gen_mm_walk *mm_walk; #endif }; /* * mm_account_reclaimed_pages(): account reclaimed pages outside of LRU-based * reclaim * @pages: number of pages reclaimed * * If the current process is undergoing a reclaim operation, increment the * number of reclaimed pages by @pages. */ static inline void mm_account_reclaimed_pages(unsigned long pages) { if (current->reclaim_state) current->reclaim_state->reclaimed += pages; } #ifdef __KERNEL__ struct address_space; struct sysinfo; struct writeback_control; struct zone; /* * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of * disk blocks. A rbtree of swap extents maps the entire swapfile (Where the * term `swapfile' refers to either a blockdevice or an IS_REG file). Apart * from setup, they're handled identically. * * We always assume that blocks are of size PAGE_SIZE. */ struct swap_extent { struct rb_node rb_node; pgoff_t start_page; pgoff_t nr_pages; sector_t start_block; }; /* * Max bad pages in the new format.. */ #define MAX_SWAP_BADPAGES \ ((offsetof(union swap_header, magic.magic) - \ offsetof(union swap_header, info.badpages)) / sizeof(int)) enum { SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ SWP_DISCARDABLE = (1 << 2), /* blkdev support discard */ SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ SWP_BLKDEV = (1 << 6), /* its a block device */ SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */ SWP_FS_OPS = (1 << 8), /* swapfile operations go through fs */ SWP_AREA_DISCARD = (1 << 9), /* single-time swap area discards */ SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */ SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ /* add others here before... */ }; #define SWAP_CLUSTER_MAX 32UL #define SWAP_CLUSTER_MAX_SKIPPED (SWAP_CLUSTER_MAX << 10) #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX /* * The first page in the swap file is the swap header, which is always marked * bad to prevent it from being allocated as an entry. This also prevents the * cluster to which it belongs being marked free. Therefore 0 is safe to use as * a sentinel to indicate an entry is not valid. */ #define SWAP_ENTRY_INVALID 0 #ifdef CONFIG_THP_SWAP #define SWAP_NR_ORDERS (PMD_ORDER + 1) #else #define SWAP_NR_ORDERS 1 #endif /* * We keep using same cluster for rotational device so IO will be sequential. * The purpose is to optimize SWAP throughput on these device. */ struct swap_sequential_cluster { unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ }; /* * The in-memory structure used to track swap areas. */ struct swap_info_struct { struct percpu_ref users; /* indicate and keep swap device valid. 
*/ unsigned long flags; /* SWP_USED etc: see above */ signed short prio; /* swap priority of this type */ struct plist_node list; /* entry in swap_active_head */ signed char type; /* strange name for an index */ unsigned int max; /* size of this swap device */ unsigned long *zeromap; /* kvmalloc'ed bitmap to track zero pages */ struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ struct list_head free_clusters; /* free clusters list */ struct list_head full_clusters; /* full clusters list */ struct list_head nonfull_clusters[SWAP_NR_ORDERS]; /* list of cluster that contains at least one free slot */ struct list_head frag_clusters[SWAP_NR_ORDERS]; /* list of cluster that are fragmented or contented */ unsigned int pages; /* total of usable pages of swap */ atomic_long_t inuse_pages; /* number of those currently in use */ struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */ spinlock_t global_cluster_lock; /* Serialize usage of global cluster */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ struct file *swap_file; /* seldom referenced */ struct completion comp; /* seldom referenced */ spinlock_t lock; /* * protect map scan related fields like * inuse_pages and all cluster lists. * Other fields are only changed * at swapon/swapoff, so are protected * by swap_lock. changing flags need * hold this lock and swap_lock. If * both locks need hold, hold swap_lock * first. */ struct work_struct discard_work; /* discard worker */ struct work_struct reclaim_work; /* reclaim worker */ struct list_head discard_clusters; /* discard clusters list */ struct plist_node avail_list; /* entry in swap_avail_head */ }; static inline swp_entry_t page_swap_entry(struct page *page) { struct folio *folio = page_folio(page); swp_entry_t entry = folio->swap; entry.val += folio_page_idx(folio, page); return entry; } /* linux/mm/workingset.c */ bool workingset_test_recent(void *shadow, bool file, bool *workingset, bool flush); void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg); void workingset_refault(struct folio *folio, void *shadow); void workingset_activation(struct folio *folio); /* linux/mm/page_alloc.c */ extern unsigned long totalreserve_pages; /* Definition of global_zone_page_state not available yet */ #define nr_free_pages() global_zone_page_state(NR_FREE_PAGES) /* linux/mm/swap.c */ void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, unsigned int nr_io, unsigned int nr_rotated); void lru_note_cost_refault(struct folio *); void folio_add_lru(struct folio *); void folio_add_lru_vma(struct folio *, struct vm_area_struct *); void mark_page_accessed(struct page *); void folio_mark_accessed(struct folio *); static inline bool folio_may_be_lru_cached(struct folio *folio) { /* * Holding PMD-sized folios in per-CPU LRU cache unbalances accounting. * Holding small numbers of low-order mTHP folios in per-CPU LRU cache * will be sensible, but nobody has implemented and tested that yet. 
*/ return !folio_test_large(folio); } extern atomic_t lru_disable_count; static inline bool lru_cache_disabled(void) { return atomic_read(&lru_disable_count); } static inline void lru_cache_enable(void) { atomic_dec(&lru_disable_count); } extern void lru_cache_disable(void); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_cpu_zone(struct zone *zone); extern void lru_add_drain_all(void); void folio_deactivate(struct folio *folio); void folio_mark_lazyfree(struct folio *folio); extern void swap_setup(void); /* linux/mm/vmscan.c */ extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx); #define MEMCG_RECLAIM_MAY_SWAP (1 << 1) #define MEMCG_RECLAIM_PROACTIVE (1 << 2) #define MIN_SWAPPINESS 0 #define MAX_SWAPPINESS 200 /* Just reclaim from anon folios in proactive memory reclaim */ #define SWAPPINESS_ANON_ONLY (MAX_SWAPPINESS + 1) extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, unsigned int reclaim_options, int *swappiness); extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, unsigned long *nr_scanned); extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; long remove_mapping(struct address_space *mapping, struct folio *folio); #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) extern int reclaim_register_node(struct node *node); extern void reclaim_unregister_node(struct node *node); #else static inline int reclaim_register_node(struct node *node) { return 0; } static inline void reclaim_unregister_node(struct node *node) { } #endif /* CONFIG_SYSFS && CONFIG_NUMA */ #ifdef CONFIG_NUMA extern int sysctl_min_unmapped_ratio; extern int sysctl_min_slab_ratio; #endif void check_move_unevictable_folios(struct folio_batch *fbatch); extern void __meminit kswapd_run(int nid); extern void __meminit kswapd_stop(int nid); #ifdef CONFIG_SWAP int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block); int generic_swapfile_activate(struct swap_info_struct *, struct file *, sector_t *); static inline unsigned long total_swapcache_pages(void) { return global_node_page_state(NR_SWAPCACHE); } void free_swap_cache(struct folio *folio); void free_folio_and_swap_cache(struct folio *folio); void free_pages_and_swap_cache(struct encoded_page **, int); /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; extern atomic_t nr_rotate_swap; /* Swap 50% full? Release swapcache more aggressively.. 
*/ static inline bool vm_swap_full(void) { return atomic_long_read(&nr_swap_pages) * 2 < total_swap_pages; } static inline long get_nr_swap_pages(void) { return atomic_long_read(&nr_swap_pages); } extern void si_swapinfo(struct sysinfo *); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); extern sector_t swapdev_block(int, pgoff_t); extern int __swap_count(swp_entry_t entry); extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); struct backing_dev_info; extern struct swap_info_struct *get_swap_device(swp_entry_t entry); sector_t swap_folio_sector(struct folio *folio); /* * If there is an existing swap slot reference (swap entry) and the caller * guarantees that there is no race modification of it (e.g., PTL * protecting the swap entry in page table; shmem's cmpxchg protects t * he swap entry in shmem mapping), these two helpers below can be used * to put/dup the entries directly. * * All entries must be allocated by folio_alloc_swap(). And they must have * a swap count > 1. See comments of folio_*_swap helpers for more info. */ int swap_dup_entry_direct(swp_entry_t entry); void swap_put_entries_direct(swp_entry_t entry, int nr); /* * folio_free_swap tries to free the swap entries pinned by a swap cache * folio, it has to be here to be called by other components. */ bool folio_free_swap(struct folio *folio); /* Allocate / free (hibernation) exclusive entries */ swp_entry_t swap_alloc_hibernation_slot(int type); void swap_free_hibernation_slot(swp_entry_t entry); static inline void put_swap_device(struct swap_info_struct *si) { percpu_ref_put(&si->users); } #else /* CONFIG_SWAP */ static inline struct swap_info_struct *get_swap_device(swp_entry_t entry) { return NULL; } static inline void put_swap_device(struct swap_info_struct *si) { } #define get_nr_swap_pages() 0L #define total_swap_pages 0L #define total_swapcache_pages() 0UL #define vm_swap_full() 0 #define si_swapinfo(val) \ do { (val)->freeswap = (val)->totalswap = 0; } while (0) #define free_folio_and_swap_cache(folio) \ folio_put(folio) #define free_pages_and_swap_cache(pages, nr) \ release_pages((pages), (nr)); static inline void free_swap_cache(struct folio *folio) { } static inline int swap_dup_entry_direct(swp_entry_t ent) { return 0; } static inline void swap_put_entries_direct(swp_entry_t ent, int nr) { } static inline int __swap_count(swp_entry_t entry) { return 0; } static inline bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) { return false; } static inline int swp_swapcount(swp_entry_t entry) { return 0; } static inline bool folio_free_swap(struct folio *folio) { return false; } static inline int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block) { return -EINVAL; } #endif /* CONFIG_SWAP */ #ifdef CONFIG_MEMCG static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) { /* Cgroup2 doesn't have per-cgroup swappiness */ if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return READ_ONCE(vm_swappiness); /* root ? 
*/ if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg)) return READ_ONCE(vm_swappiness); return READ_ONCE(memcg->swappiness); } void lru_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid); #else static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) { return READ_ONCE(vm_swappiness); } #endif #if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp); static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { if (mem_cgroup_disabled()) return; __folio_throttle_swaprate(folio, gfp); } #else static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { } #endif #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry); static inline int mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) { if (mem_cgroup_disabled()) return 0; return __mem_cgroup_try_charge_swap(folio, entry); } extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) { if (mem_cgroup_disabled()) return; __mem_cgroup_uncharge_swap(entry, nr_pages); } extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); extern bool mem_cgroup_swap_full(struct folio *folio); #else static inline int mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) { return 0; } static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) { } static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) { return get_nr_swap_pages(); } static inline bool mem_cgroup_swap_full(struct folio *folio) { return vm_swap_full(); } #endif /* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to * and including the specified highidx * @zone: The current zone in the iterator * @pgdat: The pgdat which node_zones are being iterated * @idx: The index variable * @highidx: The index of the highest zone to return * * This macro iterates through all managed zones up to and including the specified highidx. * The zone iterator enters an invalid state after macro call and must be reinitialized * before it can be used again. */ #define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx) \ for ((idx) = 0, (zone) = (pgdat)->node_zones; \ (idx) <= (highidx); \ (idx)++, (zone)++) \ if (!managed_zone(zone)) \ continue; \ else #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ |
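A hedged usage sketch for the for_each_managed_zone_pgdat() iterator documented just above; the helper function name and the quantity it tallies are illustrative assumptions, not taken from the header.

static unsigned long managed_pages_up_to(pg_data_t *pgdat, int highidx)
{
	unsigned long pages = 0;
	struct zone *zone;
	int idx;

	/* Unmanaged zones are skipped by the macro itself (see above). */
	for_each_managed_zone_pgdat(zone, pgdat, idx, highidx)
		pages += zone_managed_pages(zone);

	return pages;
}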
#undef TRACE_SYSTEM #define TRACE_SYSTEM neigh #if !defined(_TRACE_NEIGH_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_NEIGH_H #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/tracepoint.h> #include <net/neighbour.h> #define neigh_state_str(state) \ __print_symbolic(state, \ { NUD_INCOMPLETE, "incomplete" }, \ { NUD_REACHABLE, "reachable" }, \ { NUD_STALE, "stale" }, \ { NUD_DELAY, "delay" }, \ { NUD_PROBE, "probe" }, \ { NUD_FAILED, "failed" }, \ { NUD_NOARP, "noarp" }, \ { NUD_PERMANENT, "permanent"}) TRACE_EVENT(neigh_create, TP_PROTO(struct neigh_table *tbl, struct net_device *dev, const void *pkey, const struct neighbour *n, bool exempt_from_gc), TP_ARGS(tbl, dev, pkey, n, exempt_from_gc), TP_STRUCT__entry( __field(u32, family) __string(dev, dev ? dev->name : "NULL") __field(int, entries) __field(u8, created) __field(u8, gc_exempt) __array(u8, primary_key4, 4) __array(u8, primary_key6, 16) ), TP_fast_assign( __be32 *p32; __entry->family = tbl->family; __assign_str(dev); __entry->entries = atomic_read(&tbl->gc_entries); __entry->created = n != NULL; __entry->gc_exempt = exempt_from_gc; p32 = (__be32 *)__entry->primary_key4; if (tbl->family == AF_INET) *p32 = *(__be32 *)pkey; else *p32 = 0; #if IS_ENABLED(CONFIG_IPV6) if (tbl->family == AF_INET6) { struct in6_addr *pin6; pin6 = (struct in6_addr *)__entry->primary_key6; *pin6 = *(struct in6_addr *)pkey; } #endif ), TP_printk("family %d dev %s entries %d primary_key4 %pI4 primary_key6 %pI6c created %d gc_exempt %d", __entry->family, __get_str(dev), __entry->entries, __entry->primary_key4, __entry->primary_key6, __entry->created, __entry->gc_exempt) ); TRACE_EVENT(neigh_update, TP_PROTO(struct neighbour *n, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid), TP_ARGS(n, lladdr, new, flags, nlmsg_pid), TP_STRUCT__entry( __field(u32, family) __string(dev, (n->dev ? n->dev->name : "NULL")) __array(u8, lladdr, MAX_ADDR_LEN) __field(u8, lladdr_len) __field(u8, flags) __field(u8, nud_state) __field(u8, type) __field(u8, dead) __field(int, refcnt) __array(__u8, primary_key4, 4) __array(__u8, primary_key6, 16) __field(unsigned long, confirmed) __field(unsigned long, updated) __field(unsigned long, used) __array(u8, new_lladdr, MAX_ADDR_LEN) __field(u8, new_state) __field(u32, update_flags) __field(u32, pid) ), TP_fast_assign( int lladdr_len = (n->dev ?
n->dev->addr_len : MAX_ADDR_LEN); struct in6_addr *pin6; __be32 *p32; __entry->family = n->tbl->family; __assign_str(dev); __entry->lladdr_len = lladdr_len; memcpy(__entry->lladdr, n->ha, lladdr_len); __entry->flags = n->flags; __entry->nud_state = n->nud_state; __entry->type = n->type; __entry->dead = n->dead; __entry->refcnt = refcount_read(&n->refcnt); pin6 = (struct in6_addr *)__entry->primary_key6; p32 = (__be32 *)__entry->primary_key4; if (n->tbl->family == AF_INET) *p32 = *(__be32 *)n->primary_key; else *p32 = 0; #if IS_ENABLED(CONFIG_IPV6) if (n->tbl->family == AF_INET6) { pin6 = (struct in6_addr *)__entry->primary_key6; *pin6 = *(struct in6_addr *)n->primary_key; } else #endif { ipv6_addr_set_v4mapped(*p32, pin6); } __entry->confirmed = n->confirmed; __entry->updated = n->updated; __entry->used = n->used; if (lladdr) memcpy(__entry->new_lladdr, lladdr, lladdr_len); __entry->new_state = new; __entry->update_flags = flags; __entry->pid = nlmsg_pid; ), TP_printk("family %d dev %s lladdr %s flags %02x nud_state %s type %02x " "dead %d refcnt %d primary_key4 %pI4 primary_key6 %pI6c " "confirmed %lu updated %lu used %lu new_lladdr %s " "new_state %s update_flags %02x pid %d", __entry->family, __get_str(dev), __print_hex_str(__entry->lladdr, __entry->lladdr_len), __entry->flags, neigh_state_str(__entry->nud_state), __entry->type, __entry->dead, __entry->refcnt, __entry->primary_key4, __entry->primary_key6, __entry->confirmed, __entry->updated, __entry->used, __print_hex_str(__entry->new_lladdr, __entry->lladdr_len), neigh_state_str(__entry->new_state), __entry->update_flags, __entry->pid) ); DECLARE_EVENT_CLASS(neigh__update, TP_PROTO(struct neighbour *n, int err), TP_ARGS(n, err), TP_STRUCT__entry( __field(u32, family) __string(dev, (n->dev ? n->dev->name : "NULL")) __array(u8, lladdr, MAX_ADDR_LEN) __field(u8, lladdr_len) __field(u8, flags) __field(u8, nud_state) __field(u8, type) __field(u8, dead) __field(int, refcnt) __array(__u8, primary_key4, 4) __array(__u8, primary_key6, 16) __field(unsigned long, confirmed) __field(unsigned long, updated) __field(unsigned long, used) __field(u32, err) ), TP_fast_assign( int lladdr_len = (n->dev ? 
n->dev->addr_len : MAX_ADDR_LEN); struct in6_addr *pin6; __be32 *p32; __entry->family = n->tbl->family; __assign_str(dev); __entry->lladdr_len = lladdr_len; memcpy(__entry->lladdr, n->ha, lladdr_len); __entry->flags = n->flags; __entry->nud_state = n->nud_state; __entry->type = n->type; __entry->dead = n->dead; __entry->refcnt = refcount_read(&n->refcnt); pin6 = (struct in6_addr *)__entry->primary_key6; p32 = (__be32 *)__entry->primary_key4; if (n->tbl->family == AF_INET) *p32 = *(__be32 *)n->primary_key; else *p32 = 0; #if IS_ENABLED(CONFIG_IPV6) if (n->tbl->family == AF_INET6) { pin6 = (struct in6_addr *)__entry->primary_key6; *pin6 = *(struct in6_addr *)n->primary_key; } else #endif { ipv6_addr_set_v4mapped(*p32, pin6); } __entry->confirmed = n->confirmed; __entry->updated = n->updated; __entry->used = n->used; __entry->err = err; ), TP_printk("family %d dev %s lladdr %s flags %02x nud_state %s type %02x " "dead %d refcnt %d primary_key4 %pI4 primary_key6 %pI6c " "confirmed %lu updated %lu used %lu err %d", __entry->family, __get_str(dev), __print_hex_str(__entry->lladdr, __entry->lladdr_len), __entry->flags, neigh_state_str(__entry->nud_state), __entry->type, __entry->dead, __entry->refcnt, __entry->primary_key4, __entry->primary_key6, __entry->confirmed, __entry->updated, __entry->used, __entry->err) ); DEFINE_EVENT(neigh__update, neigh_update_done, TP_PROTO(struct neighbour *neigh, int err), TP_ARGS(neigh, err) ); DEFINE_EVENT(neigh__update, neigh_timer_handler, TP_PROTO(struct neighbour *neigh, int err), TP_ARGS(neigh, err) ); DEFINE_EVENT(neigh__update, neigh_event_send_done, TP_PROTO(struct neighbour *neigh, int err), TP_ARGS(neigh, err) ); DEFINE_EVENT(neigh__update, neigh_event_send_dead, TP_PROTO(struct neighbour *neigh, int err), TP_ARGS(neigh, err) ); DEFINE_EVENT(neigh__update, neigh_cleanup_and_release, TP_PROTO(struct neighbour *neigh, int rc), TP_ARGS(neigh, rc) ); #endif /* _TRACE_NEIGH_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
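A hedged call-site sketch: the TRACE_EVENT()/DEFINE_EVENT() definitions above generate trace_neigh_create(), trace_neigh_update() and friends, which the real callers in net/core/neighbour.c invoke roughly as below. The argument values here are placeholders, not the actual call sites.

/* Exactly one .c file defines CREATE_TRACE_POINTS before including the header. */
#define CREATE_TRACE_POINTS
#include <trace/events/neigh.h>

static void neigh_trace_example(struct neigh_table *tbl, struct net_device *dev,
				const void *pkey, struct neighbour *n,
				const u8 *lladdr)
{
	/* Fired when a neighbour entry is created (see TRACE_EVENT above). */
	trace_neigh_create(tbl, dev, pkey, n, false);

	/* Fired on a state/lladdr update; 0 is a placeholder nlmsg_pid. */
	trace_neigh_update(n, lladdr, NUD_REACHABLE, 0, 0);
}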
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfs/btree.c * * Copyright (C) 2001 * Brad Boyer (flar@allandria.com) * (C) 2003 Ardis Technologies <roman@ardistech.com> * * Handle opening/closing btree */ #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/log2.h> #include "btree.h" /* Get a reference to a B*Tree and do some initial checks */ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp keycmp) { struct hfs_btree *tree; struct hfs_btree_header_rec *head; struct address_space *mapping; struct folio *folio; struct buffer_head *bh; unsigned int size; u16 dblock; sector_t start_block; loff_t offset; tree = kzalloc_obj(*tree); if (!tree) return NULL; mutex_init(&tree->tree_lock); spin_lock_init(&tree->hash_lock); /* Set the correct compare function */ tree->sb = sb; tree->cnid = id; tree->keycmp = keycmp; tree->inode = iget_locked(sb, id); if (!tree->inode) goto free_tree; BUG_ON(!(inode_state_read_once(tree->inode) & I_NEW)); { struct hfs_mdb *mdb = HFS_SB(sb)->mdb; HFS_I(tree->inode)->flags = 0; mutex_init(&HFS_I(tree->inode)->extents_lock); switch (id) { case HFS_EXT_CNID: hfs_inode_read_fork(tree->inode, mdb->drXTExtRec, mdb->drXTFlSize, mdb->drXTFlSize, be32_to_cpu(mdb->drXTClpSiz)); if (HFS_I(tree->inode)->alloc_blocks > HFS_I(tree->inode)->first_blocks) { pr_err("invalid btree extent records\n"); unlock_new_inode(tree->inode); goto free_inode; } tree->inode->i_mapping->a_ops = &hfs_btree_aops; break; case HFS_CAT_CNID: hfs_inode_read_fork(tree->inode, mdb->drCTExtRec, mdb->drCTFlSize, mdb->drCTFlSize, be32_to_cpu(mdb->drCTClpSiz)); if (!HFS_I(tree->inode)->first_blocks) { pr_err("invalid btree extent records (0 size)\n"); unlock_new_inode(tree->inode); goto free_inode; } tree->inode->i_mapping->a_ops = &hfs_btree_aops; break; default: BUG(); } } unlock_new_inode(tree->inode); mapping = tree->inode->i_mapping; folio = filemap_grab_folio(mapping, 0); if (IS_ERR(folio))
goto free_inode; folio_zero_range(folio, 0, folio_size(folio)); dblock = hfs_ext_find_block(HFS_I(tree->inode)->first_extents, 0); start_block = HFS_SB(sb)->fs_start + (dblock * HFS_SB(sb)->fs_div); size = folio_size(folio); offset = 0; while (size > 0) { size_t len; bh = sb_bread(sb, start_block); if (!bh) { pr_err("unable to read tree header\n"); goto put_folio; } len = min_t(size_t, folio_size(folio), sb->s_blocksize); memcpy_to_folio(folio, offset, bh->b_data, sb->s_blocksize); brelse(bh); start_block++; offset += len; size -= len; } folio_mark_uptodate(folio); /* Load the header */ head = (struct hfs_btree_header_rec *)(kmap_local_folio(folio, 0) + sizeof(struct hfs_bnode_desc)); tree->root = be32_to_cpu(head->root); tree->leaf_count = be32_to_cpu(head->leaf_count); tree->leaf_head = be32_to_cpu(head->leaf_head); tree->leaf_tail = be32_to_cpu(head->leaf_tail); tree->node_count = be32_to_cpu(head->node_count); tree->free_nodes = be32_to_cpu(head->free_nodes); tree->attributes = be32_to_cpu(head->attributes); tree->node_size = be16_to_cpu(head->node_size); tree->max_key_len = be16_to_cpu(head->max_key_len); tree->depth = be16_to_cpu(head->depth); size = tree->node_size; if (!is_power_of_2(size)) goto fail_folio; if (!tree->node_count) goto fail_folio; switch (id) { case HFS_EXT_CNID: if (tree->max_key_len != HFS_MAX_EXT_KEYLEN) { pr_err("invalid extent max_key_len %d\n", tree->max_key_len); goto fail_folio; } break; case HFS_CAT_CNID: if (tree->max_key_len != HFS_MAX_CAT_KEYLEN) { pr_err("invalid catalog max_key_len %d\n", tree->max_key_len); goto fail_folio; } break; default: BUG(); } tree->node_size_shift = ffs(size) - 1; tree->pages_per_bnode = (tree->node_size + PAGE_SIZE - 1) >> PAGE_SHIFT; kunmap_local(head); folio_unlock(folio); folio_put(folio); return tree; fail_folio: kunmap_local(head); put_folio: folio_unlock(folio); folio_put(folio); free_inode: tree->inode->i_mapping->a_ops = &hfs_aops; iput(tree->inode); free_tree: kfree(tree); return NULL; } /* Release resources used by a btree */ void hfs_btree_close(struct hfs_btree *tree) { struct hfs_bnode *node; int i; if (!tree) return; for (i = 0; i < NODE_HASH_SIZE; i++) { while ((node = tree->node_hash[i])) { tree->node_hash[i] = node->next_hash; if (atomic_read(&node->refcnt)) pr_err("node %d:%d still has %d user(s)!\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); hfs_bnode_free(node); tree->node_hash_cnt--; } } iput(tree->inode); kfree(tree); } void hfs_btree_write(struct hfs_btree *tree) { struct hfs_btree_header_rec *head; struct hfs_bnode *node; struct page *page; node = hfs_bnode_find(tree, 0); if (IS_ERR(node)) /* panic? 
*/ return; /* Load the header */ page = node->page[0]; head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + sizeof(struct hfs_bnode_desc)); head->root = cpu_to_be32(tree->root); head->leaf_count = cpu_to_be32(tree->leaf_count); head->leaf_head = cpu_to_be32(tree->leaf_head); head->leaf_tail = cpu_to_be32(tree->leaf_tail); head->node_count = cpu_to_be32(tree->node_count); head->free_nodes = cpu_to_be32(tree->free_nodes); head->attributes = cpu_to_be32(tree->attributes); head->depth = cpu_to_be16(tree->depth); kunmap_local(head); set_page_dirty(page); hfs_bnode_put(node); } static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx) { struct hfs_btree *tree = prev->tree; struct hfs_bnode *node; struct hfs_bnode_desc desc; __be32 cnid; node = hfs_bnode_create(tree, idx); if (IS_ERR(node)) return node; if (!tree->free_nodes) panic("FIXME!!!"); tree->free_nodes--; prev->next = idx; cnid = cpu_to_be32(idx); hfs_bnode_write(prev, &cnid, offsetof(struct hfs_bnode_desc, next), 4); node->type = HFS_NODE_MAP; node->num_recs = 1; hfs_bnode_clear(node, 0, tree->node_size); desc.next = 0; desc.prev = 0; desc.type = HFS_NODE_MAP; desc.height = 0; desc.num_recs = cpu_to_be16(1); desc.reserved = 0; hfs_bnode_write(node, &desc, 0, sizeof(desc)); hfs_bnode_write_u16(node, 14, 0x8000); hfs_bnode_write_u16(node, tree->node_size - 2, 14); hfs_bnode_write_u16(node, tree->node_size - 4, tree->node_size - 6); return node; } /* Make sure @tree has enough space for the @rsvd_nodes */ int hfs_bmap_reserve(struct hfs_btree *tree, u32 rsvd_nodes) { struct inode *inode = tree->inode; u32 count; int res; while (tree->free_nodes < rsvd_nodes) { res = hfs_extend_file(inode); if (res) return res; HFS_I(inode)->phys_size = inode->i_size = (loff_t)HFS_I(inode)->alloc_blocks * HFS_SB(tree->sb)->alloc_blksz; HFS_I(inode)->fs_blocks = inode->i_size >> tree->sb->s_blocksize_bits; inode_set_bytes(inode, inode->i_size); count = inode->i_size >> tree->node_size_shift; tree->free_nodes += count - tree->node_count; tree->node_count = count; } return 0; } struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) { struct hfs_bnode *node, *next_node; struct page **pagep; u32 nidx, idx; unsigned off; u16 off16; u16 len; u8 *data, byte, m; int i, res; res = hfs_bmap_reserve(tree, 1); if (res) return ERR_PTR(res); nidx = 0; node = hfs_bnode_find(tree, nidx); if (IS_ERR(node)) return node; len = hfs_brec_lenoff(node, 2, &off16); off = off16; off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); data = kmap_local_page(*pagep); off &= ~PAGE_MASK; idx = 0; for (;;) { while (len) { byte = data[off]; if (byte != 0xff) { for (m = 0x80, i = 0; i < 8; m >>= 1, i++) { if (!(byte & m)) { idx += i; data[off] |= m; set_page_dirty(*pagep); kunmap_local(data); tree->free_nodes--; mark_inode_dirty(tree->inode); hfs_bnode_put(node); return hfs_bnode_create(tree, idx); } } } if (++off >= PAGE_SIZE) { kunmap_local(data); data = kmap_local_page(*++pagep); off = 0; } idx += 8; len--; } kunmap_local(data); nidx = node->next; if (!nidx) { printk(KERN_DEBUG "create new bmap node...\n"); next_node = hfs_bmap_new_bmap(node, idx); } else next_node = hfs_bnode_find(tree, nidx); hfs_bnode_put(node); if (IS_ERR(next_node)) return next_node; node = next_node; len = hfs_brec_lenoff(node, 0, &off16); off = off16; off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); data = kmap_local_page(*pagep); off &= ~PAGE_MASK; } } void hfs_bmap_free(struct hfs_bnode *node) { struct hfs_btree *tree; struct page *page; u16 off, len; 
u32 nidx; u8 *data, byte, m; hfs_dbg("node %u\n", node->this); tree = node->tree; nidx = node->this; node = hfs_bnode_find(tree, 0); if (IS_ERR(node)) return; len = hfs_brec_lenoff(node, 2, &off); while (nidx >= len * 8) { u32 i; nidx -= len * 8; i = node->next; if (!i) { /* panic */; pr_crit("unable to free bnode %u. bmap not found!\n", node->this); hfs_bnode_put(node); return; } hfs_bnode_put(node); node = hfs_bnode_find(tree, i); if (IS_ERR(node)) return; if (node->type != HFS_NODE_MAP) { /* panic */; pr_crit("invalid bmap found! (%u,%d)\n", node->this, node->type); hfs_bnode_put(node); return; } len = hfs_brec_lenoff(node, 0, &off); } off += node->page_offset + nidx / 8; page = node->page[off >> PAGE_SHIFT]; data = kmap_local_page(page); off &= ~PAGE_MASK; m = 1 << (~nidx & 7); byte = data[off]; if (!(byte & m)) { pr_crit("trying to free free bnode %u(%d)\n", node->this, node->type); kunmap_local(data); hfs_bnode_put(node); return; } data[off] = byte & ~m; set_page_dirty(page); kunmap_local(data); hfs_bnode_put(node); tree->free_nodes++; mark_inode_dirty(tree->inode); } |
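A small, hedged standalone sketch of the MSB-first bitmap scan that hfs_bmap_alloc() above performs inline: given one bitmap byte, return the index of the first clear bit (0..7), or -1 if the byte is full. The helper name is illustrative and does not exist in the HFS code.

static int hfs_first_free_bit(u8 byte)
{
	u8 m;
	int i;

	/* Bit 7 (0x80) represents the lowest-numbered node in the byte. */
	for (m = 0x80, i = 0; i < 8; m >>= 1, i++) {
		if (!(byte & m))
			return i;
	}
	return -1;
}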
// SPDX-License-Identifier: GPL-2.0-or-later #include <linux/syscalls.h> #include <linux/time_namespace.h> #include "futex.h" /* * Support for robust futexes: the kernel cleans up held futexes at * thread exit time. * * Implementation: user-space maintains a per-thread list of locks it * is holding. Upon do_exit(), the kernel carefully walks this list, * and marks all locks that are owned by this thread with the * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is * always manipulated with the lock held, so the list is private and * per-thread. Userspace also maintains a per-thread 'list_op_pending' * field, to allow the kernel to clean up if the thread dies after * acquiring the lock, but just before it could have added itself to * the list. There can only be one such pending lock.
*/ /** * sys_set_robust_list() - Set the robust-futex list head of a task * @head: pointer to the list-head * @len: length of the list-head, as userspace expects */ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, size_t, len) { /* * The kernel knows only one size for now: */ if (unlikely(len != sizeof(*head))) return -EINVAL; current->robust_list = head; return 0; } static inline void __user *futex_task_robust_list(struct task_struct *p, bool compat) { #ifdef CONFIG_COMPAT if (compat) return p->compat_robust_list; #endif return p->robust_list; } static void __user *futex_get_robust_list_common(int pid, bool compat) { struct task_struct *p = current; void __user *head; int ret; scoped_guard(rcu) { if (pid) { p = find_task_by_vpid(pid); if (!p) return (void __user *)ERR_PTR(-ESRCH); } get_task_struct(p); } /* * Hold exec_update_lock to serialize with concurrent exec() * so ptrace_may_access() is checked against stable credentials */ ret = down_read_killable(&p->signal->exec_update_lock); if (ret) goto err_put; ret = -EPERM; if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = futex_task_robust_list(p, compat); up_read(&p->signal->exec_update_lock); put_task_struct(p); return head; err_unlock: up_read(&p->signal->exec_update_lock); err_put: put_task_struct(p); return (void __user *)ERR_PTR(ret); } /** * sys_get_robust_list() - Get the robust-futex list head of a task * @pid: pid of the process [zero for current task] * @head_ptr: pointer to a list-head pointer, the kernel fills it in * @len_ptr: pointer to a length field, the kernel fills in the header size */ SYSCALL_DEFINE3(get_robust_list, int, pid, struct robust_list_head __user * __user *, head_ptr, size_t __user *, len_ptr) { struct robust_list_head __user *head = futex_get_robust_list_common(pid, false); if (IS_ERR(head)) return PTR_ERR(head); if (put_user(sizeof(*head), len_ptr)) return -EFAULT; return put_user(head, head_ptr); } long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { unsigned int flags = futex_to_flags(op); int cmd = op & FUTEX_CMD_MASK; if (flags & FLAGS_CLOCKRT) { if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI && cmd != FUTEX_LOCK_PI2) return -ENOSYS; } switch (cmd) { case FUTEX_WAIT: val3 = FUTEX_BITSET_MATCH_ANY; fallthrough; case FUTEX_WAIT_BITSET: return futex_wait(uaddr, flags, val, timeout, val3); case FUTEX_WAKE: val3 = FUTEX_BITSET_MATCH_ANY; fallthrough; case FUTEX_WAKE_BITSET: return futex_wake(uaddr, flags, val, val3); case FUTEX_REQUEUE: return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, NULL, 0); case FUTEX_CMP_REQUEUE: return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 0); case FUTEX_WAKE_OP: return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); case FUTEX_LOCK_PI: flags |= FLAGS_CLOCKRT; fallthrough; case FUTEX_LOCK_PI2: return futex_lock_pi(uaddr, flags, timeout, 0); case FUTEX_UNLOCK_PI: return futex_unlock_pi(uaddr, flags); case FUTEX_TRYLOCK_PI: return futex_lock_pi(uaddr, flags, NULL, 1); case FUTEX_WAIT_REQUEUE_PI: val3 = FUTEX_BITSET_MATCH_ANY; return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, uaddr2); case FUTEX_CMP_REQUEUE_PI: return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 1); } return -ENOSYS; } static __always_inline bool futex_cmd_has_timeout(u32 cmd) { switch (cmd) { case FUTEX_WAIT: case FUTEX_LOCK_PI: case FUTEX_LOCK_PI2: case FUTEX_WAIT_BITSET: case FUTEX_WAIT_REQUEUE_PI: return true; } return false; 
} static __always_inline int futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t) { if (!timespec64_valid(ts)) return -EINVAL; *t = timespec64_to_ktime(*ts); if (cmd == FUTEX_WAIT) *t = ktime_add_safe(ktime_get(), *t); else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t); return 0; } SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, const struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, u32, val3) { int ret, cmd = op & FUTEX_CMD_MASK; ktime_t t, *tp = NULL; struct timespec64 ts; if (utime && futex_cmd_has_timeout(cmd)) { if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) return -EFAULT; if (get_timespec64(&ts, utime)) return -EFAULT; ret = futex_init_timeout(cmd, op, &ts, &t); if (ret) return ret; tp = &t; } return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } /** * futex_parse_waitv - Parse a waitv array from userspace * @futexv: Kernel side list of waiters to be filled * @uwaitv: Userspace list to be parsed * @nr_futexes: Length of futexv * @wake: Wake to call when futex is woken * @wake_data: Data for the wake handler * * Return: Error code on failure, 0 on success */ int futex_parse_waitv(struct futex_vector *futexv, struct futex_waitv __user *uwaitv, unsigned int nr_futexes, futex_wake_fn *wake, void *wake_data) { struct futex_waitv aux; unsigned int i; for (i = 0; i < nr_futexes; i++) { unsigned int flags; if (copy_from_user(&aux, &uwaitv[i], sizeof(aux))) return -EFAULT; if ((aux.flags & ~FUTEX2_VALID_MASK) || aux.__reserved) return -EINVAL; flags = futex2_to_flags(aux.flags); if (!futex_flags_valid(flags)) return -EINVAL; if (!futex_validate_input(flags, aux.val)) return -EINVAL; futexv[i].w.flags = flags; futexv[i].w.val = aux.val; futexv[i].w.uaddr = aux.uaddr; futexv[i].q = futex_q_init; futexv[i].q.wake = wake; futexv[i].q.wake_data = wake_data; } return 0; } static int futex2_setup_timeout(struct __kernel_timespec __user *timeout, clockid_t clockid, struct hrtimer_sleeper *to) { int flag_clkid = 0, flag_init = 0; struct timespec64 ts; ktime_t time; int ret; if (!timeout) return 0; if (clockid == CLOCK_REALTIME) { flag_clkid = FLAGS_CLOCKRT; flag_init = FUTEX_CLOCK_REALTIME; } if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) return -EINVAL; if (get_timespec64(&ts, timeout)) return -EFAULT; /* * Since there's no opcode for futex_waitv, use * FUTEX_WAIT_BITSET that uses absolute timeout as well */ ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time); if (ret) return ret; futex_setup_timer(&time, to, flag_clkid, 0); return 0; } static inline void futex2_destroy_timeout(struct hrtimer_sleeper *to) { hrtimer_cancel(&to->timer); destroy_hrtimer_on_stack(&to->timer); } /** * sys_futex_waitv - Wait on a list of futexes * @waiters: List of futexes to wait on * @nr_futexes: Length of futexv * @flags: Flag for timeout (monotonic/realtime) * @timeout: Optional absolute timeout. * @clockid: Clock to be used for the timeout, realtime or monotonic. * * Given an array of `struct futex_waitv`, wait on each uaddr. The thread wakes * if a futex_wake() is performed at any uaddr. The syscall returns immediately * if any waiter has *uaddr != val. *timeout is an optional timeout value for * the operation. Each waiter has individual flags. The `flags` argument for * the syscall should be used solely for specifying the timeout as realtime, if * needed. Flags for private futexes, sizes, etc. 
should be used on the * individual flags of each waiter. * * Returns the array index of one of the woken futexes. No further information * is provided: any number of other futexes may also have been woken by the * same event, and if more than one futex was woken, the returned index may * refer to any one of them. (It is not necessarily the futex with the * smallest index, nor the one most recently woken, nor...) */ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters, unsigned int, nr_futexes, unsigned int, flags, struct __kernel_timespec __user *, timeout, clockid_t, clockid) { struct hrtimer_sleeper to; struct futex_vector *futexv; int ret; /* This syscall supports no flags for now */ if (flags) return -EINVAL; if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters) return -EINVAL; if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to))) return ret; futexv = kzalloc_objs(*futexv, nr_futexes); if (!futexv) { ret = -ENOMEM; goto destroy_timer; } ret = futex_parse_waitv(futexv, waiters, nr_futexes, futex_wake_mark, NULL); if (!ret) ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL); kfree(futexv); destroy_timer: if (timeout) futex2_destroy_timeout(&to); return ret; } /* * sys_futex_wake - Wake a number of futexes * @uaddr: Address of the futex(es) to wake * @mask: bitmask * @nr: Number of the futexes to wake * @flags: FUTEX2 flags * * Identical to the traditional FUTEX_WAKE_BITSET op, except it is part of the * futex2 family of calls. */ SYSCALL_DEFINE4(futex_wake, void __user *, uaddr, unsigned long, mask, int, nr, unsigned int, flags) { if (flags & ~FUTEX2_VALID_MASK) return -EINVAL; flags = futex2_to_flags(flags); if (!futex_flags_valid(flags)) return -EINVAL; if (!futex_validate_input(flags, mask)) return -EINVAL; return futex_wake(uaddr, FLAGS_STRICT | flags, nr, mask); } /* * sys_futex_wait - Wait on a futex * @uaddr: Address of the futex to wait on * @val: Value of @uaddr * @mask: bitmask * @flags: FUTEX2 flags * @timeout: Optional absolute timeout * @clockid: Clock to be used for the timeout, realtime or monotonic * * Identical to the traditional FUTEX_WAIT_BITSET op, except it is part of the * futex2 family of calls. */ SYSCALL_DEFINE6(futex_wait, void __user *, uaddr, unsigned long, val, unsigned long, mask, unsigned int, flags, struct __kernel_timespec __user *, timeout, clockid_t, clockid) { struct hrtimer_sleeper to; int ret; if (flags & ~FUTEX2_VALID_MASK) return -EINVAL; flags = futex2_to_flags(flags); if (!futex_flags_valid(flags)) return -EINVAL; if (!futex_validate_input(flags, val) || !futex_validate_input(flags, mask)) return -EINVAL; if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to))) return ret; ret = __futex_wait(uaddr, flags, val, timeout ? &to : NULL, mask); if (timeout) futex2_destroy_timeout(&to); return ret; } /* * sys_futex_requeue - Requeue a waiter from one futex to another * @waiters: array describing the source and destination futex * @flags: unused * @nr_wake: number of futexes to wake * @nr_requeue: number of futexes to requeue * * Identical to the traditional FUTEX_CMP_REQUEUE op, except it is part of the * futex2 family of calls.
*/ SYSCALL_DEFINE4(futex_requeue, struct futex_waitv __user *, waiters, unsigned int, flags, int, nr_wake, int, nr_requeue) { struct futex_vector futexes[2]; u32 cmpval; int ret; if (flags) return -EINVAL; if (!waiters) return -EINVAL; ret = futex_parse_waitv(futexes, waiters, 2, futex_wake_mark, NULL); if (ret) return ret; /* * For now mandate both flags are identical, like the sys_futex() * interface has. If/when we merge the variable sized futex support, * that patch can modify this test to allow a difference in size. */ if (futexes[0].w.flags != futexes[1].w.flags) return -EINVAL; cmpval = futexes[0].w.val; return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags, u64_to_user_ptr(futexes[1].w.uaddr), futexes[1].w.flags, nr_wake, nr_requeue, &cmpval, 0); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(set_robust_list, struct compat_robust_list_head __user *, head, compat_size_t, len) { if (unlikely(len != sizeof(*head))) return -EINVAL; current->compat_robust_list = head; return 0; } COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, compat_uptr_t __user *, head_ptr, compat_size_t __user *, len_ptr) { struct compat_robust_list_head __user *head = futex_get_robust_list_common(pid, true); if (IS_ERR(head)) return PTR_ERR(head); if (put_user(sizeof(*head), len_ptr)) return -EFAULT; return put_user(ptr_to_compat(head), head_ptr); } #endif /* CONFIG_COMPAT */ #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, const struct old_timespec32 __user *, utime, u32 __user *, uaddr2, u32, val3) { int ret, cmd = op & FUTEX_CMD_MASK; ktime_t t, *tp = NULL; struct timespec64 ts; if (utime && futex_cmd_has_timeout(cmd)) { if (get_old_timespec32(&ts, utime)) return -EFAULT; ret = futex_init_timeout(cmd, op, &ts, &t); if (ret) return ret; tp = &t; } return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } #endif /* CONFIG_COMPAT_32BIT_TIME */ |
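A hedged userspace sketch of the futex_waitv(2) entry point defined above: wait on two 32-bit futex words with an absolute one-second CLOCK_MONOTONIC timeout. Glibc provides no wrapper, so the raw syscall is used; the words, sizes, and timeout are illustrative, and SYS_futex_waitv / FUTEX2_SIZE_U32 require reasonably recent kernel headers.

#include <linux/futex.h>
#include <sys/syscall.h>
#include <stdint.h>
#include <time.h>
#include <unistd.h>

static uint32_t word_a, word_b;

static long wait_on_two_words(void)
{
	struct futex_waitv waiters[2] = {
		{ .val = 0, .uaddr = (uintptr_t)&word_a,
		  .flags = FUTEX2_SIZE_U32, .__reserved = 0 },
		{ .val = 0, .uaddr = (uintptr_t)&word_b,
		  .flags = FUTEX2_SIZE_U32, .__reserved = 0 },
	};
	struct timespec ts;

	/* The timeout is absolute; the flags argument (4th) must be 0. */
	clock_gettime(CLOCK_MONOTONIC, &ts);
	ts.tv_sec += 1;

	/* Returns the index of a woken waiter, or -1 with errno set. */
	return syscall(SYS_futex_waitv, waiters, 2, 0, &ts, CLOCK_MONOTONIC);
}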
// SPDX-License-Identifier: GPL-2.0 /* * fs/ioprio.c * * Copyright (C) 2004 Jens Axboe <axboe@kernel.dk> * * Helper functions for setting/querying io priorities of processes. The * system calls closely mimic getpriority/setpriority, see the man page for * those. The prio argument is a composite of prio class and prio data, where * the data argument has meaning within that class. The standard scheduling * classes have 8 distinct prio levels, with 0 being the highest prio and 7 * being the lowest. * * IOW, setting BE scheduling class with prio 2 is done ala: * * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2; * * ioprio_set(PRIO_PROCESS, pid, prio); * * See also Documentation/block/ioprio.rst * */ #include <linux/gfp.h> #include <linux/kernel.h> #include <linux/ioprio.h> #include <linux/cred.h> #include <linux/blkdev.h> #include <linux/capability.h> #include <linux/syscalls.h> #include <linux/security.h> #include <linux/pid_namespace.h> int ioprio_check_cap(int ioprio) { int class = IOPRIO_PRIO_CLASS(ioprio); int level = IOPRIO_PRIO_LEVEL(ioprio); switch (class) { case IOPRIO_CLASS_RT: /* * Originally this only checked for CAP_SYS_ADMIN, * which was implicitly allowed for pid 0 by security * modules such as SELinux. Make sure we check * CAP_SYS_ADMIN first to avoid a denial/avc for * possibly missing CAP_SYS_NICE permission.
*/ if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE)) return -EPERM; break; case IOPRIO_CLASS_BE: case IOPRIO_CLASS_IDLE: break; case IOPRIO_CLASS_NONE: if (level) return -EINVAL; break; case IOPRIO_CLASS_INVALID: default: return -EINVAL; } return 0; } SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) { struct task_struct *p, *g; struct user_struct *user; struct pid *pgrp; kuid_t uid; int ret; ret = ioprio_check_cap(ioprio); if (ret) return ret; ret = -ESRCH; rcu_read_lock(); switch (which) { case IOPRIO_WHO_PROCESS: if (!who) p = current; else p = find_task_by_vpid(who); if (p) ret = set_task_ioprio(p, ioprio); break; case IOPRIO_WHO_PGRP: if (!who) pgrp = task_pgrp(current); else pgrp = find_vpid(who); read_lock(&tasklist_lock); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { ret = set_task_ioprio(p, ioprio); if (ret) { read_unlock(&tasklist_lock); goto out; } } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); read_unlock(&tasklist_lock); break; case IOPRIO_WHO_USER: uid = make_kuid(current_user_ns(), who); if (!uid_valid(uid)) break; if (!who) user = current_user(); else user = find_user(uid); if (!user) break; for_each_process_thread(g, p) { if (!uid_eq(task_uid(p), uid) || !task_pid_vnr(p)) continue; ret = set_task_ioprio(p, ioprio); if (ret) goto free_uid; } free_uid: if (who) free_uid(user); break; default: ret = -EINVAL; } out: rcu_read_unlock(); return ret; } static int get_task_ioprio(struct task_struct *p) { int ret; ret = security_task_getioprio(p); if (ret) goto out; task_lock(p); ret = __get_task_ioprio(p); task_unlock(p); out: return ret; } /* * Return raw IO priority value as set by userspace. We use this for * ioprio_get(pid, IOPRIO_WHO_PROCESS) so that we keep historical behavior and * also so that userspace can distinguish unset IO priority (which just gets * overriden based on task's nice value) from IO priority set to some value. */ static int get_task_raw_ioprio(struct task_struct *p) { int ret; ret = security_task_getioprio(p); if (ret) goto out; task_lock(p); if (p->io_context) ret = p->io_context->ioprio; else ret = IOPRIO_DEFAULT; task_unlock(p); out: return ret; } static int ioprio_best(unsigned short aprio, unsigned short bprio) { return min(aprio, bprio); } SYSCALL_DEFINE2(ioprio_get, int, which, int, who) { struct task_struct *g, *p; struct user_struct *user; struct pid *pgrp; kuid_t uid; int ret = -ESRCH; int tmpio; rcu_read_lock(); switch (which) { case IOPRIO_WHO_PROCESS: if (!who) p = current; else p = find_task_by_vpid(who); if (p) ret = get_task_raw_ioprio(p); break; case IOPRIO_WHO_PGRP: if (!who) pgrp = task_pgrp(current); else pgrp = find_vpid(who); read_lock(&tasklist_lock); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { tmpio = get_task_ioprio(p); if (tmpio < 0) continue; if (ret == -ESRCH) ret = tmpio; else ret = ioprio_best(ret, tmpio); } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); read_unlock(&tasklist_lock); break; case IOPRIO_WHO_USER: uid = make_kuid(current_user_ns(), who); if (!who) user = current_user(); else user = find_user(uid); if (!user) break; for_each_process_thread(g, p) { if (!uid_eq(task_uid(p), user->uid) || !task_pid_vnr(p)) continue; tmpio = get_task_ioprio(p); if (tmpio < 0) continue; if (ret == -ESRCH) ret = tmpio; else ret = ioprio_best(ret, tmpio); } if (who) free_uid(user); break; default: ret = -EINVAL; } rcu_read_unlock(); return ret; } |
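Since glibc does not wrap these syscalls, here is a hedged userspace sketch of the composite priority value in action; the constants are copied from the uapi ioprio header and the program itself is purely illustrative.

#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_CLASS_BE		2
#define IOPRIO_WHO_PROCESS	1
#define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | (data))

int main(void)
{
	int prio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 2);

	/* who == 0 means the calling process, mirroring getpriority(). */
	if (syscall(SYS_ioprio_set, IOPRIO_WHO_PROCESS, 0, prio) < 0)
		perror("ioprio_set");

	prio = syscall(SYS_ioprio_get, IOPRIO_WHO_PROCESS, 0);
	printf("class %d, data %d\n", prio >> IOPRIO_CLASS_SHIFT,
	       prio & ((1 << IOPRIO_CLASS_SHIFT) - 1));
	return 0;
}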
| 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 | /* SPDX-License-Identifier: GPL-2.0 OR MIT */ /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include <asm/cpufeature.h> #include <asm/fpu/api.h> #include <asm/processor.h> #include <asm/simd.h> #include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/sizes.h> asmlinkage void blake2s_compress_ssse3(struct blake2s_ctx *ctx, const u8 *data, size_t nblocks, u32 inc); asmlinkage void blake2s_compress_avx512(struct blake2s_ctx *ctx, const u8 *data, size_t nblocks, u32 inc); static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); static void blake2s_compress(struct blake2s_ctx *ctx, const u8 *data, size_t nblocks, u32 inc) { /* SIMD disables preemption, so relax after processing each page. */ BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8); if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) { blake2s_compress_generic(ctx, data, nblocks, inc); return; } do { const size_t blocks = min_t(size_t, nblocks, SZ_4K / BLAKE2S_BLOCK_SIZE); kernel_fpu_begin(); if (static_branch_likely(&blake2s_use_avx512)) blake2s_compress_avx512(ctx, data, blocks, inc); else blake2s_compress_ssse3(ctx, data, blocks, inc); kernel_fpu_end(); data += blocks * BLAKE2S_BLOCK_SIZE; nblocks -= blocks; } while (nblocks); } #define blake2s_mod_init_arch blake2s_mod_init_arch static void blake2s_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_SSSE3)) static_branch_enable(&blake2s_use_ssse3); if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && boot_cpu_has(X86_FEATURE_AVX512VL) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL)) static_branch_enable(&blake2s_use_avx512); } |
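A hedged kernel-side usage sketch: it assumes the one-shot blake2s() helper from <crypto/blake2s.h> keeps its historical (out, in, key, outlen, inlen, keylen) argument order; the accelerated compression routine above is what that helper ultimately dispatches to on x86.

#include <crypto/blake2s.h>

/* Illustrative keyed hash of a buffer; not a definitive API reference. */
static void example_blake2s_mac(u8 out[BLAKE2S_HASH_SIZE],
				const u8 key[BLAKE2S_KEY_SIZE],
				const void *data, size_t len)
{
	blake2s(out, data, key, BLAKE2S_HASH_SIZE, len, BLAKE2S_KEY_SIZE);
}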
| 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright 2019 Google LLC */ /* * Refer to Documentation/block/inline-encryption.rst for detailed explanation. */ #define pr_fmt(fmt) "blk-crypto: " fmt #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/blk-crypto-profile.h> #include <linux/module.h> #include <linux/ratelimit.h> #include <linux/slab.h> #include "blk-crypto-internal.h" const struct blk_crypto_mode blk_crypto_modes[] = { [BLK_ENCRYPTION_MODE_AES_256_XTS] = { .name = "AES-256-XTS", .cipher_str = "xts(aes)", .keysize = 64, .security_strength = 32, .ivsize = 16, }, [BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = { .name = "AES-128-CBC-ESSIV", .cipher_str = "essiv(cbc(aes),sha256)", .keysize = 16, .security_strength = 16, .ivsize = 16, }, [BLK_ENCRYPTION_MODE_ADIANTUM] = { .name = "Adiantum", .cipher_str = "adiantum(xchacha12,aes)", .keysize = 32, .security_strength = 32, .ivsize = 32, }, [BLK_ENCRYPTION_MODE_SM4_XTS] = { .name = "SM4-XTS", .cipher_str = "xts(sm4)", .keysize = 32, .security_strength = 16, .ivsize = 16, }, }; /* * This number needs to be at least (the number of threads doing IO * concurrently) * (maximum recursive depth of a bio), so that we don't * deadlock on crypt_ctx allocations. 
The default is chosen to be the same * as the default number of post read contexts in both EXT4 and F2FS. */ static int num_prealloc_crypt_ctxs = 128; module_param(num_prealloc_crypt_ctxs, int, 0444); MODULE_PARM_DESC(num_prealloc_crypt_ctxs, "Number of bio crypto contexts to preallocate"); static struct kmem_cache *bio_crypt_ctx_cache; static mempool_t *bio_crypt_ctx_pool; static int __init bio_crypt_ctx_init(void) { size_t i; bio_crypt_ctx_cache = KMEM_CACHE(bio_crypt_ctx, 0); if (!bio_crypt_ctx_cache) goto out_no_mem; bio_crypt_ctx_pool = mempool_create_slab_pool(num_prealloc_crypt_ctxs, bio_crypt_ctx_cache); if (!bio_crypt_ctx_pool) goto out_no_mem; /* This is assumed in various places. */ BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0); /* * Validate the crypto mode properties. This ideally would be done with * static assertions, but boot-time checks are the next best thing. */ for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) { BUG_ON(blk_crypto_modes[i].keysize > BLK_CRYPTO_MAX_RAW_KEY_SIZE); BUG_ON(blk_crypto_modes[i].security_strength > blk_crypto_modes[i].keysize); BUG_ON(blk_crypto_modes[i].ivsize > BLK_CRYPTO_MAX_IV_SIZE); } return 0; out_no_mem: panic("Failed to allocate mem for bio crypt ctxs\n"); } subsys_initcall(bio_crypt_ctx_init); void bio_crypt_set_ctx(struct bio *bio, const struct blk_crypto_key *key, const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], gfp_t gfp_mask) { struct bio_crypt_ctx *bc; /* * The caller must use a gfp_mask that contains __GFP_DIRECT_RECLAIM so * that the mempool_alloc() can't fail. */ WARN_ON_ONCE(!(gfp_mask & __GFP_DIRECT_RECLAIM)); bc = mempool_alloc(bio_crypt_ctx_pool, gfp_mask); bc->bc_key = key; memcpy(bc->bc_dun, dun, sizeof(bc->bc_dun)); bio->bi_crypt_context = bc; } void __bio_crypt_free_ctx(struct bio *bio) { mempool_free(bio->bi_crypt_context, bio_crypt_ctx_pool); bio->bi_crypt_context = NULL; } int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask) { dst->bi_crypt_context = mempool_alloc(bio_crypt_ctx_pool, gfp_mask); if (!dst->bi_crypt_context) return -ENOMEM; *dst->bi_crypt_context = *src->bi_crypt_context; return 0; } /* Increments @dun by @inc, treating @dun as a multi-limb integer. */ void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], unsigned int inc) { int i; for (i = 0; inc && i < BLK_CRYPTO_DUN_ARRAY_SIZE; i++) { dun[i] += inc; /* * If the addition in this limb overflowed, then we need to * carry 1 into the next limb. Else the carry is 0. */ if (dun[i] < inc) inc = 1; else inc = 0; } } void __bio_crypt_advance(struct bio *bio, unsigned int bytes) { struct bio_crypt_ctx *bc = bio->bi_crypt_context; bio_crypt_dun_increment(bc->bc_dun, bytes >> bc->bc_key->data_unit_size_bits); } /* * Returns true if @bc->bc_dun plus @bytes converted to data units is equal to * @next_dun, treating the DUNs as multi-limb integers. */ bool bio_crypt_dun_is_contiguous(const struct bio_crypt_ctx *bc, unsigned int bytes, const u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]) { int i; unsigned int carry = bytes >> bc->bc_key->data_unit_size_bits; for (i = 0; i < BLK_CRYPTO_DUN_ARRAY_SIZE; i++) { if (bc->bc_dun[i] + carry != next_dun[i]) return false; /* * If the addition in this limb overflowed, then we need to * carry 1 into the next limb. Else the carry is 0. */ if ((bc->bc_dun[i] + carry) < carry) carry = 1; else carry = 0; } /* If the DUN wrapped through 0, don't treat it as contiguous. */ return carry == 0; } /* * Checks that two bio crypt contexts are compatible - i.e. that * they are mergeable except for data_unit_num continuity. 
*/ static bool bio_crypt_ctx_compatible(struct bio_crypt_ctx *bc1, struct bio_crypt_ctx *bc2) { if (!bc1) return !bc2; return bc2 && bc1->bc_key == bc2->bc_key; } bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio) { return bio_crypt_ctx_compatible(rq->crypt_ctx, bio->bi_crypt_context); } /* * Checks that two bio crypt contexts are compatible, and also * that their data_unit_nums are continuous (and can hence be merged) * in the order @bc1 followed by @bc2. */ bool bio_crypt_ctx_mergeable(struct bio_crypt_ctx *bc1, unsigned int bc1_bytes, struct bio_crypt_ctx *bc2) { if (!bio_crypt_ctx_compatible(bc1, bc2)) return false; return !bc1 || bio_crypt_dun_is_contiguous(bc1, bc1_bytes, bc2->bc_dun); } blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq) { return blk_crypto_get_keyslot(rq->q->crypto_profile, rq->crypt_ctx->bc_key, &rq->crypt_keyslot); } void __blk_crypto_rq_put_keyslot(struct request *rq) { blk_crypto_put_keyslot(rq->crypt_keyslot); rq->crypt_keyslot = NULL; } void __blk_crypto_free_request(struct request *rq) { /* The keyslot, if one was needed, should have been released earlier. */ if (WARN_ON_ONCE(rq->crypt_keyslot)) __blk_crypto_rq_put_keyslot(rq); mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool); rq->crypt_ctx = NULL; } /* * Process a bio with a crypto context. Returns true if the caller should * submit the passed in bio, false if the bio is consumed. * * See the kerneldoc comment for blk_crypto_submit_bio for further details. */ bool __blk_crypto_submit_bio(struct bio *bio) { const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key; struct block_device *bdev = bio->bi_bdev; /* Error if bio has no data. */ if (WARN_ON_ONCE(!bio_has_data(bio))) { bio_io_error(bio); return false; } /* * If the device does not natively support the encryption context, try to use * the fallback if available. */ if (!blk_crypto_config_supported_natively(bdev, &bc_key->crypto_cfg)) { if (!IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK)) { pr_warn_once("%pg: crypto API fallback disabled; failing request.\n", bdev); bio->bi_status = BLK_STS_NOTSUPP; bio_endio(bio); return false; } return blk_crypto_fallback_bio_prep(bio); } return true; } EXPORT_SYMBOL_GPL(__blk_crypto_submit_bio); int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, gfp_t gfp_mask) { if (!rq->crypt_ctx) { rq->crypt_ctx = mempool_alloc(bio_crypt_ctx_pool, gfp_mask); if (!rq->crypt_ctx) return -ENOMEM; } *rq->crypt_ctx = *bio->bi_crypt_context; return 0; } /** * blk_crypto_init_key() - Prepare a key for use with blk-crypto * @blk_key: Pointer to the blk_crypto_key to initialize. * @key_bytes: the bytes of the key * @key_size: size of the key in bytes * @key_type: type of the key -- either raw or hardware-wrapped * @crypto_mode: identifier for the encryption algorithm to use * @dun_bytes: number of bytes that will be used to specify the DUN when this * key is used * @data_unit_size: the data unit size to use for en/decryption * * Return: 0 on success, -errno on failure. The caller is responsible for * zeroizing both blk_key and key_bytes when done with them. 
*/ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *key_bytes, size_t key_size, enum blk_crypto_key_type key_type, enum blk_crypto_mode_num crypto_mode, unsigned int dun_bytes, unsigned int data_unit_size) { const struct blk_crypto_mode *mode; memset(blk_key, 0, sizeof(*blk_key)); if (crypto_mode >= ARRAY_SIZE(blk_crypto_modes)) return -EINVAL; mode = &blk_crypto_modes[crypto_mode]; switch (key_type) { case BLK_CRYPTO_KEY_TYPE_RAW: if (key_size != mode->keysize) return -EINVAL; break; case BLK_CRYPTO_KEY_TYPE_HW_WRAPPED: if (key_size < mode->security_strength || key_size > BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE) return -EINVAL; break; default: return -EINVAL; } if (dun_bytes == 0 || dun_bytes > mode->ivsize) return -EINVAL; if (!is_power_of_2(data_unit_size)) return -EINVAL; blk_key->crypto_cfg.crypto_mode = crypto_mode; blk_key->crypto_cfg.dun_bytes = dun_bytes; blk_key->crypto_cfg.data_unit_size = data_unit_size; blk_key->crypto_cfg.key_type = key_type; blk_key->data_unit_size_bits = ilog2(data_unit_size); blk_key->size = key_size; memcpy(blk_key->bytes, key_bytes, key_size); return 0; } bool blk_crypto_config_supported_natively(struct block_device *bdev, const struct blk_crypto_config *cfg) { return __blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile, cfg); } /* * Check if bios with @cfg can be en/decrypted by blk-crypto (i.e. either the * block_device it's submitted to supports inline crypto, or the * blk-crypto-fallback is enabled and supports the cfg). */ bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg) { if (IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) && cfg->key_type == BLK_CRYPTO_KEY_TYPE_RAW) return true; return blk_crypto_config_supported_natively(bdev, cfg); } /** * blk_crypto_start_using_key() - Start using a blk_crypto_key on a device * @bdev: block device to operate on * @key: A key to use on the device * * Upper layers must call this function to ensure that either the hardware * supports the key's crypto settings, or the crypto API fallback has transforms * for the needed mode allocated and ready to go. This function may allocate * an skcipher, and *should not* be called from the data path, since that might * cause a deadlock * * Return: 0 on success; -EOPNOTSUPP if the key is wrapped but the hardware does * not support wrapped keys; -ENOPKG if the key is a raw key but the * hardware does not support raw keys and blk-crypto-fallback is either * disabled or the needed algorithm is disabled in the crypto API; or * another -errno code if something else went wrong. */ int blk_crypto_start_using_key(struct block_device *bdev, const struct blk_crypto_key *key) { if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg)) return 0; if (key->crypto_cfg.key_type != BLK_CRYPTO_KEY_TYPE_RAW) { pr_warn_ratelimited("%pg: no support for wrapped keys\n", bdev); return -EOPNOTSUPP; } return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode); } /** * blk_crypto_evict_key() - Evict a blk_crypto_key from a block_device * @bdev: a block_device on which I/O using the key may have been done * @key: the key to evict * * For a given block_device, this function removes the given blk_crypto_key from * the keyslot management structures and evicts it from any underlying hardware * keyslot(s) or blk-crypto-fallback keyslot it may have been programmed into. * * Upper layers must call this before freeing the blk_crypto_key. It must be * called for every block_device the key may have been used on. 
The key must no * longer be in use by any I/O when this function is called. * * Context: May sleep. */ void blk_crypto_evict_key(struct block_device *bdev, const struct blk_crypto_key *key) { struct request_queue *q = bdev_get_queue(bdev); int err; if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg)) err = __blk_crypto_evict_key(q->crypto_profile, key); else err = blk_crypto_fallback_evict_key(key); /* * An error can only occur here if the key failed to be evicted from a * keyslot (due to a hardware or driver issue) or is allegedly still in * use by I/O (due to a kernel bug). Even in these cases, the key is * still unlinked from the keyslot management structures, and the caller * is allowed and expected to free it right away. There's nothing * callers can do to handle errors, so just log them and return void. */ if (err) pr_warn_ratelimited("%pg: error %d evicting key\n", bdev, err); } EXPORT_SYMBOL_GPL(blk_crypto_evict_key); static int blk_crypto_ioctl_import_key(struct blk_crypto_profile *profile, void __user *argp) { struct blk_crypto_import_key_arg arg; u8 raw_key[BLK_CRYPTO_MAX_RAW_KEY_SIZE]; u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; int ret; if (copy_from_user(&arg, argp, sizeof(arg))) return -EFAULT; if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved))) return -EINVAL; if (arg.raw_key_size < 16 || arg.raw_key_size > sizeof(raw_key)) return -EINVAL; if (copy_from_user(raw_key, u64_to_user_ptr(arg.raw_key_ptr), arg.raw_key_size)) { ret = -EFAULT; goto out; } ret = blk_crypto_import_key(profile, raw_key, arg.raw_key_size, lt_key); if (ret < 0) goto out; if (ret > arg.lt_key_size) { ret = -EOVERFLOW; goto out; } arg.lt_key_size = ret; if (copy_to_user(u64_to_user_ptr(arg.lt_key_ptr), lt_key, arg.lt_key_size) || copy_to_user(argp, &arg, sizeof(arg))) { ret = -EFAULT; goto out; } ret = 0; out: memzero_explicit(raw_key, sizeof(raw_key)); memzero_explicit(lt_key, sizeof(lt_key)); return ret; } static int blk_crypto_ioctl_generate_key(struct blk_crypto_profile *profile, void __user *argp) { struct blk_crypto_generate_key_arg arg; u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; int ret; if (copy_from_user(&arg, argp, sizeof(arg))) return -EFAULT; if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved))) return -EINVAL; ret = blk_crypto_generate_key(profile, lt_key); if (ret < 0) goto out; if (ret > arg.lt_key_size) { ret = -EOVERFLOW; goto out; } arg.lt_key_size = ret; if (copy_to_user(u64_to_user_ptr(arg.lt_key_ptr), lt_key, arg.lt_key_size) || copy_to_user(argp, &arg, sizeof(arg))) { ret = -EFAULT; goto out; } ret = 0; out: memzero_explicit(lt_key, sizeof(lt_key)); return ret; } static int blk_crypto_ioctl_prepare_key(struct blk_crypto_profile *profile, void __user *argp) { struct blk_crypto_prepare_key_arg arg; u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; int ret; if (copy_from_user(&arg, argp, sizeof(arg))) return -EFAULT; if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved))) return -EINVAL; if (arg.lt_key_size > sizeof(lt_key)) return -EINVAL; if (copy_from_user(lt_key, u64_to_user_ptr(arg.lt_key_ptr), arg.lt_key_size)) { ret = -EFAULT; goto out; } ret = blk_crypto_prepare_key(profile, lt_key, arg.lt_key_size, eph_key); if (ret < 0) goto out; if (ret > arg.eph_key_size) { ret = -EOVERFLOW; goto out; } arg.eph_key_size = ret; if (copy_to_user(u64_to_user_ptr(arg.eph_key_ptr), eph_key, arg.eph_key_size) || copy_to_user(argp, &arg, sizeof(arg))) { ret = -EFAULT; goto out; } ret = 0; out: memzero_explicit(lt_key, 
sizeof(lt_key)); memzero_explicit(eph_key, sizeof(eph_key)); return ret; } int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd, void __user *argp) { struct blk_crypto_profile *profile = bdev_get_queue(bdev)->crypto_profile; if (!profile) return -EOPNOTSUPP; switch (cmd) { case BLKCRYPTOIMPORTKEY: return blk_crypto_ioctl_import_key(profile, argp); case BLKCRYPTOGENERATEKEY: return blk_crypto_ioctl_generate_key(profile, argp); case BLKCRYPTOPREPAREKEY: return blk_crypto_ioctl_prepare_key(profile, argp); default: return -ENOTTY; } } |
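A hedged sketch of the key lifecycle an upper layer (for example a filesystem) would follow, using only the functions defined above; the helper name, the 4K data unit size and the stack-allocated key are illustrative, and error unwinding is trimmed.

#include <linux/bio.h>
#include <linux/blk-crypto.h>

static int example_encrypted_io(struct block_device *bdev, struct bio *bio,
				const u8 raw_key[64], u64 data_unit_index)
{
	struct blk_crypto_key key;
	u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE] = { data_unit_index };
	int err;

	/* AES-256-XTS, 8-byte DUNs, 4096-byte data units. */
	err = blk_crypto_init_key(&key, raw_key, 64, BLK_CRYPTO_KEY_TYPE_RAW,
				  BLK_ENCRYPTION_MODE_AES_256_XTS,
				  sizeof(u64), 4096);
	if (err)
		return err;

	/* Make sure hardware or the fallback can actually handle the key. */
	err = blk_crypto_start_using_key(bdev, &key);
	if (err)
		return err;

	/* GFP_KERNEL includes __GFP_DIRECT_RECLAIM, so this cannot fail. */
	bio_crypt_set_ctx(bio, &key, dun, GFP_KERNEL);

	/* ... submit_bio(bio) and wait for completion ... */

	blk_crypto_evict_key(bdev, &key);
	return 0;
}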
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PKEYS_H #define _ASM_X86_PKEYS_H /* * If more than 16 keys are ever supported, a thorough audit * will be necessary to ensure that the types that store key * numbers and masks have sufficient capacity. */ #define arch_max_pkey() (cpu_feature_enabled(X86_FEATURE_OSPKE) ? 16 : 1) extern int arch_set_user_pkey_access(int pkey, unsigned long init_val); static inline bool arch_pkeys_enabled(void) { return cpu_feature_enabled(X86_FEATURE_OSPKE); } /* * Try to dedicate one of the protection keys to be used as an * execute-only protection key. */ extern int __execute_only_pkey(struct mm_struct *mm); static inline int execute_only_pkey(struct mm_struct *mm) { if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return ARCH_DEFAULT_PKEY; return __execute_only_pkey(mm); } extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey); static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey) { if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return 0; return __arch_override_mprotect_pkey(vma, prot, pkey); } #define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3) #define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map) #define mm_set_pkey_allocated(mm, pkey) do { \ mm_pkey_allocation_map(mm) |= (1U << pkey); \ } while (0) #define mm_set_pkey_free(mm, pkey) do { \ mm_pkey_allocation_map(mm) &= ~(1U << pkey); \ } while (0) static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) { /* * "Allocated" pkeys are those that have been returned * from pkey_alloc() or pkey 0 which is allocated * implicitly when the mm is created. */ if (pkey < 0) return false; if (pkey >= arch_max_pkey()) return false; /* * The exec-only pkey is set in the allocation map, but * is not available to any of the user interfaces like * mprotect_pkey(). */ if (pkey == mm->context.execute_only_pkey) return false; return mm_pkey_allocation_map(mm) & (1U << pkey); } /* * Returns a positive, 4-bit key on success, or -1 on failure. */ static inline int mm_pkey_alloc(struct mm_struct *mm) { /* * Note: this is the one and only place we make sure * that the pkey is valid as far as the hardware is * concerned. The rest of the kernel trusts that * only good, valid pkeys come out of here. */ u16 all_pkeys_mask = ((1U << arch_max_pkey()) - 1); int ret; /* * Are we out of pkeys? We must handle this specially * because ffz() behavior is undefined if there are no * zeros. */ if (mm_pkey_allocation_map(mm) == all_pkeys_mask) return -1; ret = ffz(mm_pkey_allocation_map(mm)); mm_set_pkey_allocated(mm, ret); return ret; } static inline int mm_pkey_free(struct mm_struct *mm, int pkey) { if (!mm_pkey_is_allocated(mm, pkey)) return -EINVAL; mm_set_pkey_free(mm, pkey); return 0; } static inline int vma_pkey(struct vm_area_struct *vma) { unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3; return (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT; } #endif /*_ASM_X86_PKEYS_H */ |
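A hedged userspace illustration of the allocation model this header implements (key 0 implicit, up to 16 keys with OSPKE, the execute-only key hidden from userspace); it assumes glibc's pkey_* wrappers are available.

#define _GNU_SOURCE
#include <sys/mman.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	long pagesz = sysconf(_SC_PAGESIZE);
	void *p = mmap(NULL, pagesz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	int pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);

	if (p == MAP_FAILED || pkey < 0) {
		perror("setup");
		return 1;
	}

	/* Tag the mapping; writes through it now fault until the key's
	 * access rights are relaxed again (e.g. via pkey_set()). */
	if (pkey_mprotect(p, pagesz, PROT_READ | PROT_WRITE, pkey) < 0)
		perror("pkey_mprotect");

	pkey_free(pkey);
	return 0;
}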
| 73 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net> * * Based on the original implementation which is: * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE * Copyright 2003 Andi Kleen, SuSE Labs. * * Parts of the original code have been moved to arch/x86/vdso/vma.c * * This file implements vsyscall emulation. vsyscalls are a legacy ABI: * Userspace can request certain kernel services by calling fixed * addresses. This concept is problematic: * * - It interferes with ASLR. * - It's awkward to write code that lives in kernel addresses but is * callable by userspace at fixed addresses. * - The whole concept is impossible for 32-bit compat userspace. * - UML cannot easily virtualize a vsyscall. * * As of mid-2014, I believe that there is no new userspace code that * will use a vsyscall if the vDSO is present. I hope that there will * soon be no new userspace code that will ever use a vsyscall. * * The code in this file emulates vsyscalls when notified of a page * fault or a general protection fault to a vsyscall address. 
*/ #include <linux/kernel.h> #include <linux/timer.h> #include <linux/sched/signal.h> #include <linux/mm_types.h> #include <linux/syscalls.h> #include <linux/ratelimit.h> #include <asm/vsyscall.h> #include <asm/unistd.h> #include <asm/fixmap.h> #include <asm/traps.h> #define CREATE_TRACE_POINTS #include "vsyscall_trace.h" static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init = #ifdef CONFIG_LEGACY_VSYSCALL_NONE NONE; #elif defined(CONFIG_LEGACY_VSYSCALL_XONLY) XONLY; #else #error VSYSCALL config is broken #endif static int __init vsyscall_setup(char *str) { if (str) { if (!strcmp("emulate", str)) vsyscall_mode = EMULATE; else if (!strcmp("xonly", str)) vsyscall_mode = XONLY; else if (!strcmp("none", str)) vsyscall_mode = NONE; else return -EINVAL; if (cpu_feature_enabled(X86_FEATURE_LASS) && vsyscall_mode == EMULATE) { setup_clear_cpu_cap(X86_FEATURE_LASS); pr_warn_once("x86/cpu: Disabling LASS due to vsyscall=emulate\n"); } return 0; } return -EINVAL; } early_param("vsyscall", vsyscall_setup); static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, const char *message) { if (!show_unhandled_signals) return; printk_ratelimited("%s%s[%d] %s ip:%lx cs:%x sp:%lx ax:%lx si:%lx di:%lx\n", level, current->comm, task_pid_nr(current), message, regs->ip, regs->cs, regs->sp, regs->ax, regs->si, regs->di); } static int addr_to_vsyscall_nr(unsigned long addr) { int nr; if ((addr & ~0xC00UL) != VSYSCALL_ADDR) return -EINVAL; nr = (addr & 0xC00UL) >> 10; if (nr >= 3) return -EINVAL; return nr; } static bool write_ok_or_segv(unsigned long ptr, size_t size) { if (!access_ok((void __user *)ptr, size)) { struct thread_struct *thread = ¤t->thread; thread->error_code = X86_PF_USER | X86_PF_WRITE; thread->cr2 = ptr; thread->trap_nr = X86_TRAP_PF; force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)ptr); return false; } else { return true; } } static bool __emulate_vsyscall(struct pt_regs *regs, unsigned long address) { unsigned long caller; int vsyscall_nr, syscall_nr, tmp; long ret; unsigned long orig_dx; /* Confirm that the fault happened in 64-bit user mode */ if (!user_64bit_mode(regs)) return false; if (vsyscall_mode == NONE) { warn_bad_vsyscall(KERN_INFO, regs, "vsyscall attempted with vsyscall=none"); return false; } vsyscall_nr = addr_to_vsyscall_nr(address); trace_emulate_vsyscall(vsyscall_nr); if (vsyscall_nr < 0) { warn_bad_vsyscall(KERN_WARNING, regs, "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround"); goto sigsegv; } if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { warn_bad_vsyscall(KERN_WARNING, regs, "vsyscall with bad stack (exploit attempt?)"); goto sigsegv; } /* * Check for access_ok violations and find the syscall nr. * * NULL is a valid user pointer (in the access_ok sense) on 32-bit and * 64-bit, so we don't need to special-case it here. For all the * vsyscalls, NULL means "don't write anything" not "write it at * address 0". 
*/ switch (vsyscall_nr) { case 0: if (!write_ok_or_segv(regs->di, sizeof(struct __kernel_old_timeval)) || !write_ok_or_segv(regs->si, sizeof(struct timezone))) { ret = -EFAULT; goto check_fault; } syscall_nr = __NR_gettimeofday; break; case 1: if (!write_ok_or_segv(regs->di, sizeof(__kernel_old_time_t))) { ret = -EFAULT; goto check_fault; } syscall_nr = __NR_time; break; case 2: if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || !write_ok_or_segv(regs->si, sizeof(unsigned))) { ret = -EFAULT; goto check_fault; } syscall_nr = __NR_getcpu; break; } /* * Handle seccomp. regs->ip must be the original value. * See seccomp_send_sigsys and Documentation/userspace-api/seccomp_filter.rst. * * We could optimize the seccomp disabled case, but performance * here doesn't matter. */ regs->orig_ax = syscall_nr; regs->ax = -ENOSYS; tmp = secure_computing(); if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { warn_bad_vsyscall(KERN_DEBUG, regs, "seccomp tried to change syscall nr or ip"); force_exit_sig(SIGSYS); return true; } regs->orig_ax = -1; if (tmp) goto do_ret; /* skip requested */ /* * With a real vsyscall, page faults cause SIGSEGV. */ ret = -EFAULT; switch (vsyscall_nr) { case 0: /* this decodes regs->di and regs->si on its own */ ret = __x64_sys_gettimeofday(regs); break; case 1: /* this decodes regs->di on its own */ ret = __x64_sys_time(regs); break; case 2: /* while we could clobber regs->dx, we didn't in the past... */ orig_dx = regs->dx; regs->dx = 0; /* this decodes regs->di, regs->si and regs->dx on its own */ ret = __x64_sys_getcpu(regs); regs->dx = orig_dx; break; } check_fault: if (ret == -EFAULT) { /* Bad news -- userspace fed a bad pointer to a vsyscall. */ warn_bad_vsyscall(KERN_INFO, regs, "vsyscall fault (exploit attempt?)"); goto sigsegv; } regs->ax = ret; do_ret: /* Emulate a ret instruction. */ regs->ip = caller; regs->sp += 8; return true; sigsegv: force_sig(SIGSEGV); return true; } bool emulate_vsyscall_pf(unsigned long error_code, struct pt_regs *regs, unsigned long address) { /* Write faults or kernel-privilege faults never get fixed up. */ if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER) return false; /* * Assume that faults at regs->ip are because of an instruction * fetch. Return early and avoid emulation for faults during * data accesses: */ if (address != regs->ip) { /* Failed vsyscall read */ if (vsyscall_mode == EMULATE) return false; /* User code tried and failed to read the vsyscall page. */ warn_bad_vsyscall(KERN_INFO, regs, "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround"); return false; } /* * X86_PF_INSTR is only set when NX is supported. When * available, use it to double-check that the emulation code * is only being used for instruction fetches: */ if (cpu_feature_enabled(X86_FEATURE_NX)) WARN_ON_ONCE(!(error_code & X86_PF_INSTR)); return __emulate_vsyscall(regs, address); } bool emulate_vsyscall_gp(struct pt_regs *regs) { /* Without LASS, vsyscall accesses are expected to generate a #PF */ if (!cpu_feature_enabled(X86_FEATURE_LASS)) return false; /* Emulate only if the RIP points to the vsyscall address */ if (!is_vsyscall_vaddr(regs->ip)) return false; return __emulate_vsyscall(regs, regs->ip); } /* * A pseudo VMA to allow ptrace access for the vsyscall page. This only * covers the 64bit vsyscall page now. 
32bit has a real VMA now and does * not need special handling anymore: */ static const char *gate_vma_name(struct vm_area_struct *vma) { return "[vsyscall]"; } static const struct vm_operations_struct gate_vma_ops = { .name = gate_vma_name, }; static struct vm_area_struct gate_vma __ro_after_init = { .vm_start = VSYSCALL_ADDR, .vm_end = VSYSCALL_ADDR + PAGE_SIZE, .vm_page_prot = PAGE_READONLY_EXEC, .vm_flags = VM_READ | VM_EXEC, .vm_ops = &gate_vma_ops, }; struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { #ifdef CONFIG_COMPAT if (!mm || !test_bit(MM_CONTEXT_HAS_VSYSCALL, &mm->context.flags)) return NULL; #endif if (vsyscall_mode == NONE) return NULL; return &gate_vma; } int in_gate_area(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma = get_gate_vma(mm); if (!vma) return 0; return (addr >= vma->vm_start) && (addr < vma->vm_end); } /* * Use this when you have no reliable mm, typically from interrupt * context. It is less reliable than using a task's mm and may give * false positives. */ int in_gate_area_no_mm(unsigned long addr) { return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; } /* * The VSYSCALL page is the only user-accessible page in the kernel address * range. Normally, the kernel page tables can have _PAGE_USER clear, but * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls * are enabled. * * Some day we may create a "minimal" vsyscall mode in which we emulate * vsyscalls but leave the page not present. If so, we skip calling * this. */ void __init set_vsyscall_pgtable_user_bits(pgd_t *root) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = pgd_offset_pgd(root, VSYSCALL_ADDR); set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); p4d = p4d_offset(pgd, VSYSCALL_ADDR); set_p4d(p4d, __p4d(p4d_val(*p4d) | _PAGE_USER)); pud = pud_offset(p4d, VSYSCALL_ADDR); set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER)); pmd = pmd_offset(pud, VSYSCALL_ADDR); set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER)); } void __init map_vsyscall(void) { extern char __vsyscall_page; unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); /* * For full emulation, the page needs to exist for real. In * execute-only mode, there is no PTE at all backing the vsyscall * page. */ if (vsyscall_mode == EMULATE) { __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, PAGE_KERNEL_VVAR); set_vsyscall_pgtable_user_bits(swapper_pg_dir); } if (vsyscall_mode == XONLY) vm_flags_init(&gate_vma, VM_EXEC); BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != (unsigned long)VSYSCALL_ADDR); } |
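For reference, the legacy calling pattern that the code above emulates: old binaries call fixed addresses in the vsyscall page (gettimeofday at 0xffffffffff600000, time at +0x400, getcpu at +0x800). The snippet is illustrative, only meaningful on x86-64, and only succeeds when vsyscall=emulate or vsyscall=xonly is in effect; new code should use the vDSO instead.

#include <sys/time.h>
#include <stdio.h>

typedef int (*vgettimeofday_t)(struct timeval *, struct timezone *);

int main(void)
{
	/* Hard-coded legacy vsyscall entry point for gettimeofday. */
	vgettimeofday_t vgtod = (vgettimeofday_t)0xffffffffff600000UL;
	struct timeval tv;

	if (vgtod(&tv, NULL) == 0)
		printf("%lld.%06ld\n", (long long)tv.tv_sec, (long)tv.tv_usec);
	return 0;
}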
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_PKT_SCHED_H #define __NET_PKT_SCHED_H #include <linux/jiffies.h> #include <linux/ktime.h> #include <linux/if_vlan.h> #include <linux/netdevice.h> #include <net/sch_generic.h> #include <net/net_namespace.h> #include <uapi/linux/pkt_sched.h> #define DEFAULT_TX_QUEUE_LEN 1000 #define STAB_SIZE_LOG_MAX 30 struct qdisc_walker { int stop; int skip; int count; int (*fn)(struct Qdisc *, unsigned long cl, struct qdisc_walker *); }; #define qdisc_priv(q) \ _Generic(q, \ const struct Qdisc * : (const void *)&q->privdata, \ struct Qdisc * : (void *)&q->privdata) /* Timer resolution MUST BE < 10% of min_schedulable_packet_size/bandwidth Normal IP packet size ~ 512byte, hence: 0.5Kbyte/1Mbyte/sec = 0.5msec, so that we need 50usec timer for 10Mbit ethernet. 10msec resolution -> <50Kbit/sec. The result: [34]86 is not good choice for QoS router :-( The things are not so bad, because we may use artificial clock evaluated by integration of network data flow in the most critical places. 
*/ typedef u64 psched_time_t; /* Avoid doing 64 bit divide */ #define PSCHED_SHIFT 6 #define PSCHED_TICKS2NS(x) ((s64)(x) << PSCHED_SHIFT) #define PSCHED_NS2TICKS(x) ((x) >> PSCHED_SHIFT) #define PSCHED_TICKS_PER_SEC PSCHED_NS2TICKS(NSEC_PER_SEC) #define PSCHED_PASTPERFECT 0 static inline psched_time_t psched_get_time(void) { return PSCHED_NS2TICKS(ktime_get_ns()); } struct qdisc_watchdog { struct hrtimer timer; struct Qdisc *qdisc; }; void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, clockid_t clockid); void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc); void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires, u64 delta_ns); static inline void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) { return qdisc_watchdog_schedule_range_ns(wd, expires, 0ULL); } static inline void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires) { qdisc_watchdog_schedule_ns(wd, PSCHED_TICKS2NS(expires)); } void qdisc_watchdog_cancel(struct qdisc_watchdog *wd); extern struct Qdisc_ops pfifo_qdisc_ops; extern struct Qdisc_ops bfifo_qdisc_ops; extern struct Qdisc_ops pfifo_head_drop_qdisc_ops; int fifo_set_limit(struct Qdisc *q, unsigned int limit); struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops, unsigned int limit, struct netlink_ext_ack *extack); int register_qdisc(struct Qdisc_ops *qops); void unregister_qdisc(struct Qdisc_ops *qops); #define NET_SCH_ALIAS_PREFIX "net-sch-" #define MODULE_ALIAS_NET_SCH(id) MODULE_ALIAS(NET_SCH_ALIAS_PREFIX id) void qdisc_get_default(char *id, size_t len); int qdisc_set_default(const char *id); void qdisc_hash_add(struct Qdisc *q, bool invisible); void qdisc_hash_del(struct Qdisc *q); struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle); struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle); struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab, struct netlink_ext_ack *extack); void qdisc_put_rtab(struct qdisc_rate_table *tab); void qdisc_put_stab(struct qdisc_size_table *tab); bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq, spinlock_t *root_lock, bool validate); void __qdisc_run(struct Qdisc *q); static inline struct sk_buff *qdisc_run(struct Qdisc *q) { if (qdisc_run_begin(q)) { __qdisc_run(q); return qdisc_run_end(q); } return NULL; } extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; /* Calculate maximal size of packet seen by hard_start_xmit routine of this device. 
*/ static inline unsigned int psched_mtu(const struct net_device *dev) { return READ_ONCE(dev->mtu) + dev->hard_header_len; } static inline struct net *qdisc_net(struct Qdisc *q) { return dev_net(q->dev_queue->dev); } struct tc_query_caps_base { enum tc_setup_type type; void *caps; }; struct tc_cbs_qopt_offload { u8 enable; s32 queue; s32 hicredit; s32 locredit; s32 idleslope; s32 sendslope; }; struct tc_etf_qopt_offload { u8 enable; s32 queue; }; struct tc_mqprio_caps { bool validate_queue_counts:1; }; struct tc_mqprio_qopt_offload { /* struct tc_mqprio_qopt must always be the first element */ struct tc_mqprio_qopt qopt; struct netlink_ext_ack *extack; u16 mode; u16 shaper; u32 flags; u64 min_rate[TC_QOPT_MAX_QUEUE]; u64 max_rate[TC_QOPT_MAX_QUEUE]; unsigned long preemptible_tcs; }; struct tc_taprio_caps { bool supports_queue_max_sdu:1; bool gate_mask_per_txq:1; /* Device expects lower TXQ numbers to have higher priority over higher * TXQs, regardless of their TC mapping. DO NOT USE FOR NEW DRIVERS, * INSTEAD ENFORCE A PROPER TC:TXQ MAPPING COMING FROM USER SPACE. */ bool broken_mqprio:1; }; enum tc_taprio_qopt_cmd { TAPRIO_CMD_REPLACE, TAPRIO_CMD_DESTROY, TAPRIO_CMD_STATS, TAPRIO_CMD_QUEUE_STATS, }; /** * struct tc_taprio_qopt_stats - IEEE 802.1Qbv statistics * @window_drops: Frames that were dropped because they were too large to be * transmitted in any of the allotted time windows (open gates) for their * traffic class. * @tx_overruns: Frames still being transmitted by the MAC after the * transmission gate associated with their traffic class has closed. * Equivalent to `12.29.1.1.2 TransmissionOverrun` from 802.1Q-2018. */ struct tc_taprio_qopt_stats { u64 window_drops; u64 tx_overruns; }; struct tc_taprio_qopt_queue_stats { int queue; struct tc_taprio_qopt_stats stats; }; struct tc_taprio_sched_entry { u8 command; /* TC_TAPRIO_CMD_* */ /* The gate_mask in the offloading side refers to traffic classes */ u32 gate_mask; u32 interval; }; struct tc_taprio_qopt_offload { enum tc_taprio_qopt_cmd cmd; union { /* TAPRIO_CMD_STATS */ struct tc_taprio_qopt_stats stats; /* TAPRIO_CMD_QUEUE_STATS */ struct tc_taprio_qopt_queue_stats queue_stats; /* TAPRIO_CMD_REPLACE */ struct { struct tc_mqprio_qopt_offload mqprio; struct netlink_ext_ack *extack; ktime_t base_time; u64 cycle_time; u64 cycle_time_extension; u32 max_sdu[TC_MAX_QUEUE]; size_t num_entries; struct tc_taprio_sched_entry entries[]; }; }; }; #if IS_ENABLED(CONFIG_NET_SCH_TAPRIO) /* Reference counting */ struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload *offload); void taprio_offload_free(struct tc_taprio_qopt_offload *offload); #else /* Reference counting */ static inline struct tc_taprio_qopt_offload * taprio_offload_get(struct tc_taprio_qopt_offload *offload) { return NULL; } static inline void taprio_offload_free(struct tc_taprio_qopt_offload *offload) { } #endif /* Ensure skb_mstamp_ns, which might have been populated with the txtime, is * not mistaken for a software timestamp, because this will otherwise prevent * the dispatch of hardware timestamps to the socket. 
*/ static inline void skb_txtime_consumed(struct sk_buff *skb) { skb->tstamp = ktime_set(0, 0); } static inline bool tc_qdisc_stats_dump(struct Qdisc *sch, unsigned long cl, struct qdisc_walker *arg) { if (arg->count >= arg->skip && arg->fn(sch, cl, arg) < 0) { arg->stop = 1; return false; } arg->count++; return true; } static inline void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc) { if (!(qdisc->flags & TCQ_F_WARN_NONWC)) { pr_warn("%s: %s qdisc %X: is non-work-conserving?\n", txt, qdisc->ops->id, qdisc->handle >> 16); qdisc->flags |= TCQ_F_WARN_NONWC; } } static inline unsigned int qdisc_peek_len(struct Qdisc *sch) { struct sk_buff *skb; unsigned int len; skb = sch->ops->peek(sch); if (unlikely(skb == NULL)) { qdisc_warn_nonwc("qdisc_peek_len", sch); return 0; } len = qdisc_pkt_len(skb); return len; } static inline void qdisc_lock_init(struct Qdisc *sch, const struct Qdisc_ops *ops) { spin_lock_init(&sch->q.lock); /* Skip dynamic keys if nesting is not possible */ if (ops->static_flags & TCQ_F_INGRESS || ops == &noqueue_qdisc_ops) return; lockdep_register_key(&sch->root_lock_key); lockdep_set_class(&sch->q.lock, &sch->root_lock_key); } static inline void qdisc_lock_uninit(struct Qdisc *sch, const struct Qdisc_ops *ops) { if (ops->static_flags & TCQ_F_INGRESS || ops == &noqueue_qdisc_ops) return; lockdep_unregister_key(&sch->root_lock_key); } #endif |
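A hedged sketch of the tick arithmetic and watchdog helpers declared above: with PSCHED_SHIFT of 6, one psched tick is 64 ns, so a qdisc that wants to fire again some number of ticks in the future can do the following (the function name is illustrative).

#include <net/pkt_sched.h>

/* Arm the watchdog "delay" psched ticks (64 ns each) from now. */
static void example_throttle(struct qdisc_watchdog *wd, psched_time_t delay)
{
	psched_time_t expires = psched_get_time() + delay;

	/* qdisc_watchdog_schedule() converts ticks back to nanoseconds. */
	qdisc_watchdog_schedule(wd, expires);
}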
1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Portions Copyright (C) 1992 Drew Eckhardt */ #ifndef _LINUX_BLKDEV_H #define _LINUX_BLKDEV_H #include <linux/types.h> #include <linux/blk_types.h> #include <linux/device.h> #include <linux/list.h> #include <linux/llist.h> #include <linux/minmax.h> #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/completion.h> #include <linux/wait.h> #include <linux/bio.h> #include <linux/gfp.h> #include <linux/kdev_t.h> #include <linux/rcupdate.h> #include <linux/percpu-refcount.h> #include <linux/blkzoned.h> #include <linux/sched.h> #include <linux/sbitmap.h> #include <linux/uuid.h> #include <linux/xarray.h> #include <linux/file.h> #include <linux/lockdep.h> struct module; struct request_queue; struct elevator_queue; struct blk_trace; struct request; struct sg_io_hdr; struct blkcg_gq; struct blk_flush_queue; struct kiocb; struct pr_ops; struct rq_qos; struct hd_geometry; struct blk_report_zones_args; struct blk_queue_stats; struct blk_stat_callback; struct blk_crypto_profile; extern const struct device_type disk_type; extern const struct device_type part_type; extern const struct class block_class; /* * Maximum number of blkcg policies allowed to be registered concurrently. * Defined here to simplify include dependency. */ #define BLKCG_MAX_POLS 6 #define DISK_MAX_PARTS 256 #define DISK_NAME_LEN 32 #define PARTITION_META_INFO_VOLNAMELTH 64 /* * Enough for the string representation of any kind of UUID plus NULL. * EFI UUID is 36 characters. MSDOS UUID is 11 characters. */ #define PARTITION_META_INFO_UUIDLTH (UUID_STRING_LEN + 1) struct partition_meta_info { char uuid[PARTITION_META_INFO_UUIDLTH]; u8 volname[PARTITION_META_INFO_VOLNAMELTH]; }; /** * DOC: genhd capability flags * * ``GENHD_FL_REMOVABLE``: indicates that the block device gives access to * removable media. When set, the device remains present even when media is not * inserted. Shall not be set for devices which are removed entirely when the * media is removed. * * ``GENHD_FL_HIDDEN``: the block device is hidden; it doesn't produce events, * doesn't appear in sysfs, and can't be opened from userspace or using * blkdev_get*. 
Used for the underlying components of multipath devices. * * ``GENHD_FL_NO_PART``: partition support is disabled. The kernel will not * scan for partitions from add_disk, and users can't add partitions manually. * */ enum { GENHD_FL_REMOVABLE = 1 << 0, GENHD_FL_HIDDEN = 1 << 1, GENHD_FL_NO_PART = 1 << 2, }; enum { DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ DISK_EVENT_EJECT_REQUEST = 1 << 1, /* eject requested */ }; enum { /* Poll even if events_poll_msecs is unset */ DISK_EVENT_FLAG_POLL = 1 << 0, /* Forward events to udev */ DISK_EVENT_FLAG_UEVENT = 1 << 1, /* Block event polling when open for exclusive write */ DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE = 1 << 2, }; struct disk_events; struct badblocks; enum blk_integrity_checksum { BLK_INTEGRITY_CSUM_NONE = 0, BLK_INTEGRITY_CSUM_IP = 1, BLK_INTEGRITY_CSUM_CRC = 2, BLK_INTEGRITY_CSUM_CRC64 = 3, } __packed ; struct blk_integrity { unsigned char flags; enum blk_integrity_checksum csum_type; unsigned char metadata_size; unsigned char pi_offset; unsigned char interval_exp; unsigned char tag_size; unsigned char pi_tuple_size; }; typedef unsigned int __bitwise blk_mode_t; /* open for reading */ #define BLK_OPEN_READ ((__force blk_mode_t)(1 << 0)) /* open for writing */ #define BLK_OPEN_WRITE ((__force blk_mode_t)(1 << 1)) /* open exclusively (vs other exclusive openers */ #define BLK_OPEN_EXCL ((__force blk_mode_t)(1 << 2)) /* opened with O_NDELAY */ #define BLK_OPEN_NDELAY ((__force blk_mode_t)(1 << 3)) /* open for "writes" only for ioctls (specialy hack for floppy.c) */ #define BLK_OPEN_WRITE_IOCTL ((__force blk_mode_t)(1 << 4)) /* open is exclusive wrt all other BLK_OPEN_WRITE opens to the device */ #define BLK_OPEN_RESTRICT_WRITES ((__force blk_mode_t)(1 << 5)) /* return partition scanning errors */ #define BLK_OPEN_STRICT_SCAN ((__force blk_mode_t)(1 << 6)) struct gendisk { /* * major/first_minor/minors should not be set by any new driver, the * block core will take care of allocating them automatically. */ int major; int first_minor; int minors; char disk_name[DISK_NAME_LEN]; /* name of major driver */ unsigned short events; /* supported events */ unsigned short event_flags; /* flags related to event processing */ struct xarray part_tbl; struct block_device *part0; const struct block_device_operations *fops; struct request_queue *queue; void *private_data; struct bio_set bio_split; int flags; unsigned long state; #define GD_NEED_PART_SCAN 0 #define GD_READ_ONLY 1 #define GD_DEAD 2 #define GD_NATIVE_CAPACITY 3 #define GD_ADDED 4 #define GD_SUPPRESS_PART_SCAN 5 #define GD_OWNS_QUEUE 6 #define GD_ZONE_APPEND_USED 7 struct mutex open_mutex; /* open/close mutex */ unsigned open_partitions; /* number of open partitions */ struct backing_dev_info *bdi; struct kobject queue_kobj; /* the queue/ directory */ struct kobject *slave_dir; #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED struct list_head slave_bdevs; #endif struct timer_rand_state *random; struct disk_events *ev; #ifdef CONFIG_BLK_DEV_ZONED /* * Zoned block device information. Reads of this information must be * protected with blk_queue_enter() / blk_queue_exit(). Modifying this * information is only allowed while no requests are being processed. * See also blk_mq_freeze_queue() and blk_mq_unfreeze_queue(). 
*/ unsigned int nr_zones; unsigned int zone_capacity; unsigned int last_zone_capacity; u8 __rcu *zones_cond; unsigned int zone_wplugs_hash_bits; atomic_t nr_zone_wplugs; spinlock_t zone_wplugs_hash_lock; struct mempool *zone_wplugs_pool; struct hlist_head *zone_wplugs_hash; struct workqueue_struct *zone_wplugs_wq; spinlock_t zone_wplugs_list_lock; struct list_head zone_wplugs_list; struct task_struct *zone_wplugs_worker; struct completion zone_wplugs_worker_bio_done; #endif /* CONFIG_BLK_DEV_ZONED */ #if IS_ENABLED(CONFIG_CDROM) struct cdrom_device_info *cdi; #endif int node_id; struct badblocks *bb; struct lockdep_map lockdep_map; u64 diskseq; blk_mode_t open_mode; /* * Independent sector access ranges. This is always NULL for * devices that do not have multiple independent access ranges. */ struct blk_independent_access_ranges *ia_ranges; struct mutex rqos_state_mutex; /* rqos state change mutex */ }; /** * disk_openers - returns how many openers are there for a disk * @disk: disk to check * * This returns the number of openers for a disk. Note that this value is only * stable if disk->open_mutex is held. * * Note: Due to a quirk in the block layer open code, each open partition is * only counted once even if there are multiple openers. */ static inline unsigned int disk_openers(struct gendisk *disk) { return atomic_read(&disk->part0->bd_openers); } /** * disk_has_partscan - return %true if partition scanning is enabled on a disk * @disk: disk to check * * Returns %true if partitions scanning is enabled for @disk, or %false if * partition scanning is disabled either permanently or temporarily. */ static inline bool disk_has_partscan(struct gendisk *disk) { return !(disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) && !test_bit(GD_SUPPRESS_PART_SCAN, &disk->state); } /* * The gendisk is refcounted by the part0 block_device, and the bd_device * therein is also used for device model presentation in sysfs. */ #define dev_to_disk(device) \ (dev_to_bdev(device)->bd_disk) #define disk_to_dev(disk) \ (&((disk)->part0->bd_device)) #if IS_REACHABLE(CONFIG_CDROM) #define disk_to_cdi(disk) ((disk)->cdi) #else #define disk_to_cdi(disk) NULL #endif static inline dev_t disk_devt(struct gendisk *disk) { return MKDEV(disk->major, disk->first_minor); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * We should strive for 1 << (PAGE_SHIFT + MAX_PAGECACHE_ORDER) * however we constrain this to what we can validate and test. 
*/ #define BLK_MAX_BLOCK_SIZE SZ_64K #else #define BLK_MAX_BLOCK_SIZE PAGE_SIZE #endif /* blk_validate_limits() validates bsize, so drivers don't usually need to */ static inline int blk_validate_block_size(unsigned long bsize) { if (bsize < 512 || bsize > BLK_MAX_BLOCK_SIZE || !is_power_of_2(bsize)) return -EINVAL; return 0; } static inline bool blk_op_is_passthrough(blk_opf_t op) { op &= REQ_OP_MASK; return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT; } /* flags set by the driver in queue_limits.features */ typedef unsigned int __bitwise blk_features_t; /* supports a volatile write cache */ #define BLK_FEAT_WRITE_CACHE ((__force blk_features_t)(1u << 0)) /* supports passing on the FUA bit */ #define BLK_FEAT_FUA ((__force blk_features_t)(1u << 1)) /* rotational device (hard drive or floppy) */ #define BLK_FEAT_ROTATIONAL ((__force blk_features_t)(1u << 2)) /* contributes to the random number pool */ #define BLK_FEAT_ADD_RANDOM ((__force blk_features_t)(1u << 3)) /* do disk/partitions IO accounting */ #define BLK_FEAT_IO_STAT ((__force blk_features_t)(1u << 4)) /* don't modify data until writeback is done */ #define BLK_FEAT_STABLE_WRITES ((__force blk_features_t)(1u << 5)) /* always completes in submit context */ #define BLK_FEAT_SYNCHRONOUS ((__force blk_features_t)(1u << 6)) /* supports REQ_NOWAIT */ #define BLK_FEAT_NOWAIT ((__force blk_features_t)(1u << 7)) /* supports DAX */ #define BLK_FEAT_DAX ((__force blk_features_t)(1u << 8)) /* supports I/O polling */ #define BLK_FEAT_POLL ((__force blk_features_t)(1u << 9)) /* is a zoned device */ #define BLK_FEAT_ZONED ((__force blk_features_t)(1u << 10)) /* supports PCI(e) p2p requests */ #define BLK_FEAT_PCI_P2PDMA ((__force blk_features_t)(1u << 12)) /* skip this queue in blk_mq_(un)quiesce_tagset */ #define BLK_FEAT_SKIP_TAGSET_QUIESCE ((__force blk_features_t)(1u << 13)) /* atomic writes enabled */ #define BLK_FEAT_ATOMIC_WRITES ((__force blk_features_t)(1u << 14)) /* undocumented magic for bcache */ #define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \ ((__force blk_features_t)(1u << 15)) /* * Flags automatically inherited when stacking limits. 
*/ #define BLK_FEAT_INHERIT_MASK \ (BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_ROTATIONAL | \ BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED | \ BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE) /* internal flags in queue_limits.flags */ typedef unsigned int __bitwise blk_flags_t; /* do not send FLUSH/FUA commands despite advertising a write cache */ #define BLK_FLAG_WRITE_CACHE_DISABLED ((__force blk_flags_t)(1u << 0)) /* I/O topology is misaligned */ #define BLK_FLAG_MISALIGNED ((__force blk_flags_t)(1u << 1)) /* passthrough command IO accounting */ #define BLK_FLAG_IOSTATS_PASSTHROUGH ((__force blk_flags_t)(1u << 2)) struct queue_limits { blk_features_t features; blk_flags_t flags; unsigned long seg_boundary_mask; unsigned long virt_boundary_mask; unsigned int max_hw_sectors; unsigned int max_dev_sectors; unsigned int chunk_sectors; unsigned int max_sectors; unsigned int max_user_sectors; unsigned int max_segment_size; unsigned int max_fast_segment_size; unsigned int physical_block_size; unsigned int logical_block_size; unsigned int alignment_offset; unsigned int io_min; unsigned int io_opt; unsigned int max_discard_sectors; unsigned int max_hw_discard_sectors; unsigned int max_user_discard_sectors; unsigned int max_secure_erase_sectors; unsigned int max_write_zeroes_sectors; unsigned int max_wzeroes_unmap_sectors; unsigned int max_hw_wzeroes_unmap_sectors; unsigned int max_user_wzeroes_unmap_sectors; unsigned int max_hw_zone_append_sectors; unsigned int max_zone_append_sectors; unsigned int discard_granularity; unsigned int discard_alignment; unsigned int zone_write_granularity; /* atomic write limits */ unsigned int atomic_write_hw_max; unsigned int atomic_write_max_sectors; unsigned int atomic_write_hw_boundary; unsigned int atomic_write_boundary_sectors; unsigned int atomic_write_hw_unit_min; unsigned int atomic_write_unit_min; unsigned int atomic_write_hw_unit_max; unsigned int atomic_write_unit_max; unsigned short max_segments; unsigned short max_integrity_segments; unsigned short max_discard_segments; unsigned short max_write_streams; unsigned int write_stream_granularity; unsigned int max_open_zones; unsigned int max_active_zones; /* * Drivers that set dma_alignment to less than 511 must be prepared to * handle individual bvec's that are not a multiple of a SECTOR_SIZE * due to possible offsets. */ unsigned int dma_alignment; unsigned int dma_pad_mask; struct blk_integrity integrity; }; typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, void *data); int disk_report_zone(struct gendisk *disk, struct blk_zone *zone, unsigned int idx, struct blk_report_zones_args *args); int blkdev_get_zone_info(struct block_device *bdev, sector_t sector, struct blk_zone *zone); #define BLK_ALL_ZONES ((unsigned int)-1) int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, sector_t sectors, sector_t nr_sectors); int blk_revalidate_disk_zones(struct gendisk *disk); /* * Independent access ranges: struct blk_independent_access_range describes * a range of contiguous sectors that can be accessed using device command * execution resources that are independent from the resources used for * other access ranges. This is typically found with single-LUN multi-actuator * HDDs where each access range is served by a different set of heads. 
* The set of independent ranges supported by the device is defined using * struct blk_independent_access_ranges. The independent ranges must not overlap * and must include all sectors within the disk capacity (no sector holes * allowed). * For a device with multiple ranges, requests targeting sectors in different * ranges can be executed in parallel. A request can straddle an access range * boundary. */ struct blk_independent_access_range { struct kobject kobj; sector_t sector; sector_t nr_sectors; }; struct blk_independent_access_ranges { struct kobject kobj; bool sysfs_registered; unsigned int nr_ia_ranges; struct blk_independent_access_range ia_range[]; }; struct request_queue { /* * The queue owner gets to use this for whatever they like. * ll_rw_blk doesn't touch it. */ void *queuedata; struct elevator_queue *elevator; const struct blk_mq_ops *mq_ops; /* sw queues */ struct blk_mq_ctx __percpu *queue_ctx; /* * various queue flags, see QUEUE_* below */ unsigned long queue_flags; unsigned int __data_racy rq_timeout; unsigned int queue_depth; refcount_t refs; /* hw dispatch queues */ unsigned int nr_hw_queues; struct blk_mq_hw_ctx * __rcu *queue_hw_ctx __counted_by_ptr(nr_hw_queues); struct percpu_ref q_usage_counter; struct lock_class_key io_lock_cls_key; struct lockdep_map io_lockdep_map; struct lock_class_key q_lock_cls_key; struct lockdep_map q_lockdep_map; struct request *last_merge; spinlock_t queue_lock; int quiesce_depth; struct gendisk *disk; /* * mq queue kobject */ struct kobject *mq_kobj; struct queue_limits limits; #ifdef CONFIG_PM struct device *dev; enum rpm_status rpm_status; #endif /* * Number of contexts that have called blk_set_pm_only(). If this * counter is above zero then only RQF_PM requests are processed. */ atomic_t pm_only; struct blk_queue_stats *stats; struct rq_qos *rq_qos; struct mutex rq_qos_mutex; /* * ida allocated id for this queue. Used to index queues from * ioctx. */ int id; /* * queue settings */ unsigned int nr_requests; /* Max # of requests */ unsigned int async_depth; /* Max # of async requests */ #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct blk_crypto_profile *crypto_profile; struct kobject *crypto_kobject; #endif struct timer_list timeout; struct work_struct timeout_work; atomic_t nr_active_requests_shared_tags; struct blk_mq_tags *sched_shared_tags; struct list_head icq_list; #ifdef CONFIG_BLK_CGROUP DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS); struct blkcg_gq *root_blkg; struct list_head blkg_list; struct mutex blkcg_mutex; #endif int node; spinlock_t requeue_lock; struct list_head requeue_list; struct delayed_work requeue_work; #ifdef CONFIG_BLK_DEV_IO_TRACE struct blk_trace __rcu *blk_trace; #endif /* * for flush operations */ struct blk_flush_queue *fq; struct list_head flush_list; /* * Protects against I/O scheduler switching, particularly when updating * q->elevator. Since the elevator update code path may also modify q-> * nr_requests and wbt latency, this lock also protects the sysfs attrs * nr_requests and wbt_lat_usec. Additionally the nr_hw_queues update * may modify hctx tags, reserved-tags and cpumask, so this lock also * helps protect the hctx sysfs/debugfs attrs. To ensure proper locking * order during an elevator or nr_hw_queue update, first freeze the * queue, then acquire ->elevator_lock. */ struct mutex elevator_lock; struct mutex sysfs_lock; /* * Protects queue limits and also sysfs attribute read_ahead_kb. 
*/ struct mutex limits_lock; /* * for reusing dead hctx instance in case of updating * nr_hw_queues */ struct list_head unused_hctx_list; spinlock_t unused_hctx_lock; int mq_freeze_depth; #ifdef CONFIG_BLK_DEV_THROTTLING /* Throttle data */ struct throtl_data *td; #endif struct rcu_head rcu_head; #ifdef CONFIG_LOCKDEP struct task_struct *mq_freeze_owner; int mq_freeze_owner_depth; /* * Records disk & queue state in current context, used in unfreeze * queue */ bool mq_freeze_disk_dead; bool mq_freeze_queue_dying; #endif wait_queue_head_t mq_freeze_wq; /* * Protect concurrent access to q_usage_counter by * percpu_ref_kill() and percpu_ref_reinit(). */ struct mutex mq_freeze_lock; struct blk_mq_tag_set *tag_set; struct list_head tag_set_list; struct dentry *debugfs_dir; struct dentry *sched_debugfs_dir; struct dentry *rqos_debugfs_dir; /* * Serializes all debugfs metadata operations using the above dentries. */ struct mutex debugfs_mutex; }; /* Keep blk_queue_flag_name[] in sync with the definitions below */ enum { QUEUE_FLAG_DYING, /* queue being torn down */ QUEUE_FLAG_NOMERGES, /* disable merge attempts */ QUEUE_FLAG_SAME_COMP, /* complete on same CPU-group */ QUEUE_FLAG_FAIL_IO, /* fake timeout */ QUEUE_FLAG_NOXMERGES, /* No extended merges */ QUEUE_FLAG_SAME_FORCE, /* force complete on same CPU */ QUEUE_FLAG_INIT_DONE, /* queue is initialized */ QUEUE_FLAG_STATS, /* track IO start and completion times */ QUEUE_FLAG_REGISTERED, /* queue has been registered to a disk */ QUEUE_FLAG_QUIESCED, /* queue has been quiesced */ QUEUE_FLAG_RQ_ALLOC_TIME, /* record rq->alloc_time_ns */ QUEUE_FLAG_HCTX_ACTIVE, /* at least one blk-mq hctx is active */ QUEUE_FLAG_SQ_SCHED, /* single queue style io dispatch */ QUEUE_FLAG_DISABLE_WBT_DEF, /* for sched to disable/enable wbt */ QUEUE_FLAG_NO_ELV_SWITCH, /* can't switch elevator any more */ QUEUE_FLAG_QOS_ENABLED, /* qos is enabled */ QUEUE_FLAG_BIO_ISSUE_TIME, /* record bio->issue_time_ns */ QUEUE_FLAG_ZONED_QD1_WRITES, /* Limit zoned devices writes to QD=1 */ QUEUE_FLAG_MAX }; #define QUEUE_FLAG_MQ_DEFAULT (1UL << QUEUE_FLAG_SAME_COMP) void blk_queue_flag_set(unsigned int flag, struct request_queue *q); void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); #define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags) #define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) #define blk_queue_noxmerges(q) \ test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) #define blk_queue_rot(q) ((q)->limits.features & BLK_FEAT_ROTATIONAL) #define blk_queue_io_stat(q) ((q)->limits.features & BLK_FEAT_IO_STAT) #define blk_queue_passthrough_stat(q) \ ((q)->limits.flags & BLK_FLAG_IOSTATS_PASSTHROUGH) #define blk_queue_dax(q) ((q)->limits.features & BLK_FEAT_DAX) #define blk_queue_pci_p2pdma(q) ((q)->limits.features & BLK_FEAT_PCI_P2PDMA) #ifdef CONFIG_BLK_RQ_ALLOC_TIME #define blk_queue_rq_alloc_time(q) \ test_bit(QUEUE_FLAG_RQ_ALLOC_TIME, &(q)->queue_flags) #else #define blk_queue_rq_alloc_time(q) false #endif #define blk_noretry_request(rq) \ ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ REQ_FAILFAST_DRIVER)) #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) #define blk_queue_pm_only(q) atomic_read(&(q)->pm_only) #define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags) #define blk_queue_sq_sched(q) test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags) #define 
blk_queue_skip_tagset_quiesce(q) \ ((q)->limits.features & BLK_FEAT_SKIP_TAGSET_QUIESCE) #define blk_queue_disable_wbt(q) \ test_bit(QUEUE_FLAG_DISABLE_WBT_DEF, &(q)->queue_flags) #define blk_queue_no_elv_switch(q) \ test_bit(QUEUE_FLAG_NO_ELV_SWITCH, &(q)->queue_flags) #define blk_queue_zoned_qd1_writes(q) \ test_bit(QUEUE_FLAG_ZONED_QD1_WRITES, &(q)->queue_flags) extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) #define dma_map_bvec(dev, bv, dir, attrs) \ dma_map_page_attrs(dev, (bv)->bv_page, (bv)->bv_offset, (bv)->bv_len, \ (dir), (attrs)) static inline bool queue_is_mq(struct request_queue *q) { return q->mq_ops; } #ifdef CONFIG_PM static inline enum rpm_status queue_rpm_status(struct request_queue *q) { return q->rpm_status; } #else static inline enum rpm_status queue_rpm_status(struct request_queue *q) { return RPM_ACTIVE; } #endif static inline bool blk_queue_is_zoned(struct request_queue *q) { return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && (q->limits.features & BLK_FEAT_ZONED); } static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) { if (!blk_queue_is_zoned(disk->queue)) return 0; return sector >> ilog2(disk->queue->limits.chunk_sectors); } static inline unsigned int bdev_max_open_zones(struct block_device *bdev) { return bdev->bd_disk->queue->limits.max_open_zones; } static inline unsigned int bdev_max_active_zones(struct block_device *bdev) { return bdev->bd_disk->queue->limits.max_active_zones; } static inline unsigned int blk_queue_depth(struct request_queue *q) { if (q->queue_depth) return q->queue_depth; return q->nr_requests; } /* * default timeout for SG_IO if none specified */ #define BLK_DEFAULT_SG_TIMEOUT (60 * HZ) #define BLK_MIN_SG_TIMEOUT (7 * HZ) /* This should not be used directly - use rq_for_each_segment */ #define for_each_bio(_bio) \ for (; _bio; _bio = _bio->bi_next) int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, const struct attribute_group **groups, struct fwnode_handle *fwnode); int __must_check device_add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups); static inline int __must_check add_disk(struct gendisk *disk) { return device_add_disk(NULL, disk, NULL); } void del_gendisk(struct gendisk *gp); void invalidate_disk(struct gendisk *disk); void set_disk_ro(struct gendisk *disk, bool read_only); void disk_uevent(struct gendisk *disk, enum kobject_action action); static inline u8 bdev_partno(const struct block_device *bdev) { return atomic_read(&bdev->__bd_flags) & BD_PARTNO; } static inline bool bdev_test_flag(const struct block_device *bdev, unsigned flag) { return atomic_read(&bdev->__bd_flags) & flag; } static inline void bdev_set_flag(struct block_device *bdev, unsigned flag) { atomic_or(flag, &bdev->__bd_flags); } static inline void bdev_clear_flag(struct block_device *bdev, unsigned flag) { atomic_andnot(flag, &bdev->__bd_flags); } static inline bool get_disk_ro(struct gendisk *disk) { return bdev_test_flag(disk->part0, BD_READ_ONLY) || test_bit(GD_READ_ONLY, &disk->state); } static inline bool bdev_read_only(struct block_device *bdev) { return bdev_test_flag(bdev, BD_READ_ONLY) || get_disk_ro(bdev->bd_disk); } bool set_capacity_and_notify(struct gendisk *disk, sector_t size); void disk_force_media_change(struct gendisk *disk); void bdev_mark_dead(struct block_device *bdev, bool surprise); void add_disk_randomness(struct gendisk 
*disk) __latent_entropy; void rand_initialize_disk(struct gendisk *disk); static inline sector_t get_start_sect(struct block_device *bdev) { return bdev->bd_start_sect; } static inline sector_t bdev_nr_sectors(struct block_device *bdev) { return bdev->bd_nr_sectors; } static inline loff_t bdev_nr_bytes(struct block_device *bdev) { return (loff_t)bdev_nr_sectors(bdev) << SECTOR_SHIFT; } static inline sector_t get_capacity(struct gendisk *disk) { return bdev_nr_sectors(disk->part0); } static inline u64 sb_bdev_nr_blocks(struct super_block *sb) { return bdev_nr_sectors(sb->s_bdev) >> (sb->s_blocksize_bits - SECTOR_SHIFT); } #ifdef CONFIG_BLK_DEV_ZONED static inline unsigned int disk_nr_zones(struct gendisk *disk) { return disk->nr_zones; } /** * bio_needs_zone_write_plugging - Check if a BIO needs to be handled with zone * write plugging * @bio: The BIO being submitted * * Return true whenever @bio execution needs to be handled through zone * write plugging (using blk_zone_plug_bio()). Return false otherwise. */ static inline bool bio_needs_zone_write_plugging(struct bio *bio) { enum req_op op = bio_op(bio); /* * Only zoned block devices have a zone write plug hash table. But not * all of them have one (e.g. DM devices may not need one). */ if (!bio->bi_bdev->bd_disk->zone_wplugs_hash) return false; /* Only write operations need zone write plugging. */ if (!op_is_write(op)) return false; /* Ignore empty flush */ if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) return false; /* Ignore BIOs that already have been handled by zone write plugging. */ if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING)) return false; /* * All zone write operations must be handled through zone write plugging * using blk_zone_plug_bio(). */ switch (op) { case REQ_OP_ZONE_APPEND: case REQ_OP_WRITE: case REQ_OP_WRITE_ZEROES: case REQ_OP_ZONE_FINISH: case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_RESET_ALL: return true; default: return false; } } bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs); /** * disk_zone_capacity - returns the zone capacity of zone containing @sector * @disk: disk to work with * @sector: sector number within the querying zone * * Returns the zone capacity of a zone containing @sector. @sector can be any * sector in the zone. 
*/ static inline unsigned int disk_zone_capacity(struct gendisk *disk, sector_t sector) { sector_t zone_sectors = disk->queue->limits.chunk_sectors; if (sector + zone_sectors >= get_capacity(disk)) return disk->last_zone_capacity; return disk->zone_capacity; } static inline unsigned int bdev_zone_capacity(struct block_device *bdev, sector_t pos) { return disk_zone_capacity(bdev->bd_disk, pos); } bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector); #else /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int disk_nr_zones(struct gendisk *disk) { return 0; } static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector) { return false; } static inline bool bio_needs_zone_write_plugging(struct bio *bio) { return false; } static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) { return false; } #endif /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int bdev_nr_zones(struct block_device *bdev) { return disk_nr_zones(bdev->bd_disk); } int bdev_disk_changed(struct gendisk *disk, bool invalidate); void put_disk(struct gendisk *disk); struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node, struct lock_class_key *lkclass); /** * blk_alloc_disk - allocate a gendisk structure * @lim: queue limits to be used for this disk. * @node_id: numa node to allocate on * * Allocate and pre-initialize a gendisk structure for use with BIO based * drivers. * * Returns an ERR_PTR on error, else the allocated disk. * * Context: can sleep */ #define blk_alloc_disk(lim, node_id) \ ({ \ static struct lock_class_key __key; \ \ __blk_alloc_disk(lim, node_id, &__key); \ }) int __register_blkdev(unsigned int major, const char *name, void (*probe)(dev_t devt)); #define register_blkdev(major, name) \ __register_blkdev(major, name, NULL) void unregister_blkdev(unsigned int major, const char *name); bool disk_check_media_change(struct gendisk *disk); void set_capacity(struct gendisk *disk, sector_t size); #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk); #else static inline int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) { return 0; } static inline void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { } #endif /* CONFIG_BLOCK_HOLDER_DEPRECATED */ dev_t part_devt(struct gendisk *disk, u8 partno); void inc_diskseq(struct gendisk *disk); void blk_request_module(dev_t devt); extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); void submit_bio_noacct(struct bio *bio); struct bio *bio_split_to_limits(struct bio *bio); struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors, struct bio_set *bs); extern int blk_lld_busy(struct request_queue *q); extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags); extern void blk_queue_exit(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); /* Convert a request operation REQ_OP_name into the string "name" */ extern const char *blk_op_str(enum req_op op); int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); const char *blk_status_to_str(blk_status_t status); /* only poll the hardware once, don't continue until a completion was found */ #define BLK_POLL_ONESHOT (1 << 0) int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags); int iocb_bio_iopoll(struct kiocb *kiocb, 
struct io_comp_batch *iob, unsigned int flags); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { return bdev->bd_queue; /* this is never NULL */ } /* Convert a zone condition BLK_ZONE_COND_name into the string "name" */ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond); static inline unsigned int bio_zone_no(struct bio *bio) { return disk_zone_no(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector); } static inline bool bio_straddles_zones(struct bio *bio) { return bio_sectors(bio) && bio_zone_no(bio) != disk_zone_no(bio->bi_bdev->bd_disk, bio_end_sector(bio) - 1); } /* * Return how much within the boundary is left to be used for I/O at a given * offset. */ static inline unsigned int blk_boundary_sectors_left(sector_t offset, unsigned int boundary_sectors) { if (unlikely(!is_power_of_2(boundary_sectors))) return boundary_sectors - sector_div(offset, boundary_sectors); return boundary_sectors - (offset & (boundary_sectors - 1)); } /** * queue_limits_start_update - start an atomic update of queue limits * @q: queue to update * * This functions starts an atomic update of the queue limits. It takes a lock * to prevent other updates and returns a snapshot of the current limits that * the caller can modify. The caller must call queue_limits_commit_update() * to finish the update. * * Context: process context. */ static inline struct queue_limits queue_limits_start_update(struct request_queue *q) { mutex_lock(&q->limits_lock); return q->limits; } int queue_limits_commit_update_frozen(struct request_queue *q, struct queue_limits *lim); int queue_limits_commit_update(struct request_queue *q, struct queue_limits *lim); int queue_limits_set(struct request_queue *q, struct queue_limits *lim); int blk_validate_limits(struct queue_limits *lim); /** * queue_limits_cancel_update - cancel an atomic update of queue limits * @q: queue to update * * This functions cancels an atomic update of the queue limits started by * queue_limits_start_update() and should be used when an error occurs after * starting update. */ static inline void queue_limits_cancel_update(struct request_queue *q) { mutex_unlock(&q->limits_lock); } /* * These helpers are for drivers that have sloppy feature negotiation and might * have to disable DISCARD, WRITE_ZEROES or SECURE_DISCARD from the I/O * completion handler when the device returned an indicator that the respective * feature is not actually supported. They are racy and the driver needs to * cope with that. Try to avoid this scheme if you can. 
*/ static inline void blk_queue_disable_discard(struct request_queue *q) { q->limits.max_discard_sectors = 0; } static inline void blk_queue_disable_secure_erase(struct request_queue *q) { q->limits.max_secure_erase_sectors = 0; } static inline void blk_queue_disable_write_zeroes(struct request_queue *q) { q->limits.max_write_zeroes_sectors = 0; q->limits.max_wzeroes_unmap_sectors = 0; } /* * Access functions for manipulating queue properties */ extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); extern void blk_set_stacking_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset); void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, sector_t offset, const char *pfx); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); struct blk_independent_access_ranges * disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges); void disk_set_independent_access_ranges(struct gendisk *disk, struct blk_independent_access_ranges *iars); bool __must_check blk_get_queue(struct request_queue *); extern void blk_put_queue(struct request_queue *); void blk_mark_disk_dead(struct gendisk *disk); struct rq_list { struct request *head; struct request *tail; }; #ifdef CONFIG_BLOCK /* * blk_plug permits building a queue of related requests by holding the I/O * fragments for a short period. This allows merging of sequential requests * into single larger request. As the requests are moved from a per-task list to * the device's request_queue in a batch, this results in improved scalability * as the lock contention for request_queue lock is reduced. * * It is ok not to disable preemption when adding the request to the plug list * or when attempting a merge. For details, please see schedule() where * blk_flush_plug() is called. 
*/ struct blk_plug { struct rq_list mq_list; /* blk-mq requests */ /* if ios_left is > 1, we can batch tag/rq allocations */ struct rq_list cached_rqs; u64 cur_ktime; unsigned short nr_ios; unsigned short rq_count; bool multiple_queues; bool has_elevator; struct list_head cb_list; /* md requires an unplug callback */ }; struct blk_plug_cb; typedef void (*blk_plug_cb_fn)(struct blk_plug_cb *, bool); struct blk_plug_cb { struct list_head list; blk_plug_cb_fn callback; void *data; }; extern struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, int size); extern void blk_start_plug(struct blk_plug *); extern void blk_start_plug_nr_ios(struct blk_plug *, unsigned short); extern void blk_finish_plug(struct blk_plug *); void __blk_flush_plug(struct blk_plug *plug, bool from_schedule); static inline void blk_flush_plug(struct blk_plug *plug, bool async) { if (plug) __blk_flush_plug(plug, async); } /* * tsk == current here */ static inline void blk_plug_invalidate_ts(struct task_struct *tsk) { struct blk_plug *plug = tsk->plug; if (plug) plug->cur_ktime = 0; current->flags &= ~PF_BLOCK_TS; } int blkdev_issue_flush(struct block_device *bdev); long nr_blockdev_pages(void); #else /* CONFIG_BLOCK */ struct blk_plug { }; static inline void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios) { } static inline void blk_start_plug(struct blk_plug *plug) { } static inline void blk_finish_plug(struct blk_plug *plug) { } static inline void blk_flush_plug(struct blk_plug *plug, bool async) { } static inline void blk_plug_invalidate_ts(struct task_struct *tsk) { } static inline int blkdev_issue_flush(struct block_device *bdev) { return 0; } static inline long nr_blockdev_pages(void) { return 0; } #endif /* CONFIG_BLOCK */ extern void blk_io_schedule(void); int blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask); void __blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop); int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp); #define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */ #define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */ #define BLKDEV_ZERO_KILLABLE (1 << 2) /* interruptible by fatal signals */ extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, unsigned flags); extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned flags); static inline int sb_issue_discard(struct super_block *sb, sector_t block, sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags) { return blkdev_issue_discard(sb->s_bdev, block << (sb->s_blocksize_bits - SECTOR_SHIFT), nr_blocks << (sb->s_blocksize_bits - SECTOR_SHIFT), gfp_mask); } static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, sector_t nr_blocks, gfp_t gfp_mask) { return blkdev_issue_zeroout(sb->s_bdev, block << (sb->s_blocksize_bits - SECTOR_SHIFT), nr_blocks << (sb->s_blocksize_bits - SECTOR_SHIFT), gfp_mask, 0); } static inline bool bdev_is_partition(struct block_device *bdev) { return bdev_partno(bdev) != 0; } enum blk_default_limits { BLK_MAX_SEGMENTS = 128, BLK_SAFE_MAX_SECTORS = 255, BLK_MAX_SEGMENT_SIZE = 65536, BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL, }; static inline struct queue_limits *bdev_limits(struct block_device *bdev) { return &bdev_get_queue(bdev)->limits; } static 
inline unsigned long queue_segment_boundary(const struct request_queue *q) { return q->limits.seg_boundary_mask; } static inline unsigned long queue_virt_boundary(const struct request_queue *q) { return q->limits.virt_boundary_mask; } static inline unsigned int queue_max_sectors(const struct request_queue *q) { return q->limits.max_sectors; } static inline unsigned int queue_max_bytes(struct request_queue *q) { return min_t(unsigned int, queue_max_sectors(q), INT_MAX >> 9) << 9; } static inline unsigned int queue_max_hw_sectors(const struct request_queue *q) { return q->limits.max_hw_sectors; } static inline unsigned short queue_max_segments(const struct request_queue *q) { return q->limits.max_segments; } static inline unsigned short queue_max_discard_segments(const struct request_queue *q) { return q->limits.max_discard_segments; } static inline unsigned int queue_max_segment_size(const struct request_queue *q) { return q->limits.max_segment_size; } static inline bool queue_emulates_zone_append(struct request_queue *q) { return blk_queue_is_zoned(q) && !q->limits.max_hw_zone_append_sectors; } static inline bool bdev_emulates_zone_append(struct block_device *bdev) { return queue_emulates_zone_append(bdev_get_queue(bdev)); } static inline unsigned int bdev_max_zone_append_sectors(struct block_device *bdev) { return bdev_limits(bdev)->max_zone_append_sectors; } static inline unsigned int bdev_max_segments(struct block_device *bdev) { return queue_max_segments(bdev_get_queue(bdev)); } static inline unsigned short bdev_max_write_streams(struct block_device *bdev) { if (bdev_is_partition(bdev)) return 0; return bdev_limits(bdev)->max_write_streams; } static inline unsigned queue_logical_block_size(const struct request_queue *q) { return q->limits.logical_block_size; } static inline unsigned int bdev_logical_block_size(struct block_device *bdev) { return queue_logical_block_size(bdev_get_queue(bdev)); } static inline unsigned int queue_physical_block_size(const struct request_queue *q) { return q->limits.physical_block_size; } static inline unsigned int bdev_physical_block_size(struct block_device *bdev) { return queue_physical_block_size(bdev_get_queue(bdev)); } static inline unsigned int queue_io_min(const struct request_queue *q) { return q->limits.io_min; } static inline unsigned int bdev_io_min(struct block_device *bdev) { return queue_io_min(bdev_get_queue(bdev)); } static inline unsigned int queue_io_opt(const struct request_queue *q) { return q->limits.io_opt; } static inline unsigned int bdev_io_opt(struct block_device *bdev) { return queue_io_opt(bdev_get_queue(bdev)); } static inline unsigned int queue_zone_write_granularity(const struct request_queue *q) { return q->limits.zone_write_granularity; } static inline unsigned int bdev_zone_write_granularity(struct block_device *bdev) { return queue_zone_write_granularity(bdev_get_queue(bdev)); } int bdev_alignment_offset(struct block_device *bdev); unsigned int bdev_discard_alignment(struct block_device *bdev); static inline unsigned int bdev_max_discard_sectors(struct block_device *bdev) { return bdev_limits(bdev)->max_discard_sectors; } static inline unsigned int bdev_discard_granularity(struct block_device *bdev) { return bdev_limits(bdev)->discard_granularity; } static inline unsigned int bdev_max_secure_erase_sectors(struct block_device *bdev) { return bdev_limits(bdev)->max_secure_erase_sectors; } static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev) { return bdev_limits(bdev)->max_write_zeroes_sectors; } 
static inline unsigned int bdev_write_zeroes_unmap_sectors(struct block_device *bdev) { return bdev_limits(bdev)->max_wzeroes_unmap_sectors; } static inline bool bdev_rot(struct block_device *bdev) { return blk_queue_rot(bdev_get_queue(bdev)); } static inline bool bdev_synchronous(struct block_device *bdev) { return bdev->bd_disk->queue->limits.features & BLK_FEAT_SYNCHRONOUS; } static inline bool bdev_has_integrity_csum(struct block_device *bdev) { struct queue_limits *lim = bdev_limits(bdev); return IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && lim->integrity.csum_type != BLK_INTEGRITY_CSUM_NONE; } static inline bool bdev_stable_writes(struct block_device *bdev) { return bdev_has_integrity_csum(bdev) || (bdev_limits(bdev)->features & BLK_FEAT_STABLE_WRITES); } static inline bool blk_queue_write_cache(struct request_queue *q) { return (q->limits.features & BLK_FEAT_WRITE_CACHE) && !(q->limits.flags & BLK_FLAG_WRITE_CACHE_DISABLED); } static inline bool bdev_write_cache(struct block_device *bdev) { return blk_queue_write_cache(bdev_get_queue(bdev)); } static inline bool bdev_fua(struct block_device *bdev) { return bdev_limits(bdev)->features & BLK_FEAT_FUA; } static inline bool bdev_nowait(struct block_device *bdev) { return bdev->bd_disk->queue->limits.features & BLK_FEAT_NOWAIT; } static inline bool bdev_is_zoned(struct block_device *bdev) { return blk_queue_is_zoned(bdev_get_queue(bdev)); } static inline unsigned int bdev_zone_no(struct block_device *bdev, sector_t sec) { return disk_zone_no(bdev->bd_disk, sec); } static inline sector_t bdev_zone_sectors(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); if (!blk_queue_is_zoned(q)) return 0; return q->limits.chunk_sectors; } static inline sector_t bdev_zone_start(struct block_device *bdev, sector_t sector) { return sector & ~(bdev_zone_sectors(bdev) - 1); } static inline sector_t bdev_offset_from_zone_start(struct block_device *bdev, sector_t sector) { return sector & (bdev_zone_sectors(bdev) - 1); } static inline sector_t bio_offset_from_zone_start(struct bio *bio) { return bdev_offset_from_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector); } static inline bool bdev_is_zone_start(struct block_device *bdev, sector_t sector) { return bdev_offset_from_zone_start(bdev, sector) == 0; } /* Check whether @sector is a multiple of the zone size. 
*/ static inline bool bdev_is_zone_aligned(struct block_device *bdev, sector_t sector) { return bdev_is_zone_start(bdev, sector); } int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask); static inline unsigned int queue_dma_alignment(const struct request_queue *q) { return q->limits.dma_alignment; } static inline unsigned int queue_atomic_write_unit_max_bytes(const struct request_queue *q) { return q->limits.atomic_write_unit_max; } static inline unsigned int queue_atomic_write_unit_min_bytes(const struct request_queue *q) { return q->limits.atomic_write_unit_min; } static inline unsigned int queue_atomic_write_boundary_bytes(const struct request_queue *q) { return q->limits.atomic_write_boundary_sectors << SECTOR_SHIFT; } static inline unsigned int queue_atomic_write_max_bytes(const struct request_queue *q) { return q->limits.atomic_write_max_sectors << SECTOR_SHIFT; } static inline unsigned int bdev_dma_alignment(struct block_device *bdev) { return queue_dma_alignment(bdev_get_queue(bdev)); } static inline unsigned int blk_lim_dma_alignment_and_pad(struct queue_limits *lim) { return lim->dma_alignment | lim->dma_pad_mask; } static inline bool blk_rq_aligned(struct request_queue *q, unsigned long addr, unsigned int len) { unsigned int alignment = blk_lim_dma_alignment_and_pad(&q->limits); return !(addr & alignment) && !(len & alignment); } /* assumes size > 256 */ static inline unsigned int blksize_bits(unsigned int size) { return order_base_2(size >> SECTOR_SHIFT) + SECTOR_SHIFT; } int kblockd_schedule_work(struct work_struct *work); int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); #define MODULE_ALIAS_BLOCKDEV(major,minor) \ MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ MODULE_ALIAS("block-major-" __stringify(major) "-*") #ifdef CONFIG_BLK_INLINE_ENCRYPTION bool blk_crypto_register(struct blk_crypto_profile *profile, struct request_queue *q); #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline bool blk_crypto_register(struct blk_crypto_profile *profile, struct request_queue *q) { return true; } #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ enum blk_unique_id { /* these match the Designator Types specified in SPC */ BLK_UID_T10 = 1, BLK_UID_EUI64 = 2, BLK_UID_NAA = 3, }; struct block_device_operations { void (*submit_bio)(struct bio *bio); int (*poll_bio)(struct bio *bio, struct io_comp_batch *iob, unsigned int flags); int (*open)(struct gendisk *disk, blk_mode_t mode); void (*release)(struct gendisk *disk); int (*ioctl)(struct block_device *bdev, blk_mode_t mode, unsigned cmd, unsigned long arg); int (*compat_ioctl)(struct block_device *bdev, blk_mode_t mode, unsigned cmd, unsigned long arg); unsigned int (*check_events) (struct gendisk *disk, unsigned int clearing); void (*unlock_native_capacity) (struct gendisk *); int (*getgeo)(struct gendisk *, struct hd_geometry *); int (*set_read_only)(struct block_device *bdev, bool ro); void (*free_disk)(struct gendisk *disk); /* this callback is with swap_lock and sometimes page table lock held */ void (*swap_slot_free_notify) (struct block_device *, unsigned long); int (*report_zones)(struct gendisk *, sector_t sector, unsigned int nr_zones, struct blk_report_zones_args *args); char *(*devnode)(struct gendisk *disk, umode_t *mode); /* returns the length of the identifier or a negative errno: */ int (*get_unique_id)(struct gendisk *disk, u8 id[16], enum blk_unique_id id_type); struct 
module *owner; const struct pr_ops *pr_ops; /* * Special callback for probing GPT entry at a given sector. * Needed by Android devices, used by GPT scanner and MMC blk * driver. */ int (*alternative_gpt_sector)(struct gendisk *disk, sector_t *sector); }; #ifdef CONFIG_COMPAT extern int blkdev_compat_ptr_ioctl(struct block_device *, blk_mode_t, unsigned int, unsigned long); #else #define blkdev_compat_ptr_ioctl NULL #endif static inline void blk_wake_io_task(struct task_struct *waiter) { /* * If we're polling, the task itself is doing the completions. For * that case, we don't need to signal a wakeup, it's enough to just * mark us as RUNNING. */ if (waiter == current) __set_current_state(TASK_RUNNING); else wake_up_process(waiter); } unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op, unsigned long start_time); void bdev_end_io_acct(struct block_device *bdev, enum req_op op, unsigned int sectors, unsigned long start_time); unsigned long bio_start_io_acct(struct bio *bio); void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, struct block_device *orig_bdev); /** * bio_end_io_acct - end I/O accounting for bio based drivers * @bio: bio to end account for * @start_time: start time returned by bio_start_io_acct() */ static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time) { return bio_end_io_acct_remapped(bio, start_time, bio->bi_bdev); } int bdev_validate_blocksize(struct block_device *bdev, int block_size); int set_blocksize(struct file *file, int size); int lookup_bdev(const char *pathname, dev_t *dev); void blkdev_show(struct seq_file *seqf, off_t offset); #define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */ #define BDEVT_SIZE 10 /* Largest string for MAJ:MIN for blkdev */ #ifdef CONFIG_BLOCK #define BLKDEV_MAJOR_MAX 512 #else #define BLKDEV_MAJOR_MAX 0 #endif struct blk_holder_ops { void (*mark_dead)(struct block_device *bdev, bool surprise); /* * Sync the file system mounted on the block device. */ void (*sync)(struct block_device *bdev); /* * Freeze the file system mounted on the block device. */ int (*freeze)(struct block_device *bdev); /* * Thaw the file system mounted on the block device. */ int (*thaw)(struct block_device *bdev); }; /* * For filesystems using @fs_holder_ops, the @holder argument passed to * helpers used to open and claim block devices via * bd_prepare_to_claim() must point to a superblock. */ extern const struct blk_holder_ops fs_holder_ops; /* * Return the correct open flags for blkdev_get_by_* for super block flags * as stored in sb->s_flags. */ #define sb_open_mode(flags) \ (BLK_OPEN_READ | BLK_OPEN_RESTRICT_WRITES | \ (((flags) & SB_RDONLY) ? 
0 : BLK_OPEN_WRITE)) struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); int bd_prepare_to_claim(struct block_device *bdev, void *holder, const struct blk_holder_ops *hops); void bd_abort_claiming(struct block_device *bdev, void *holder); struct block_device *I_BDEV(struct inode *inode); struct block_device *file_bdev(struct file *bdev_file); bool disk_live(struct gendisk *disk); unsigned int block_size(struct block_device *bdev); #ifdef CONFIG_BLOCK void invalidate_bdev(struct block_device *bdev); int sync_blockdev(struct block_device *bdev); int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend); int sync_blockdev_nowait(struct block_device *bdev); void sync_bdevs(bool wait); void bdev_statx(const struct path *path, struct kstat *stat, u32 request_mask); void printk_all_partitions(void); int __init early_lookup_bdev(const char *pathname, dev_t *dev); #else static inline void invalidate_bdev(struct block_device *bdev) { } static inline int sync_blockdev(struct block_device *bdev) { return 0; } static inline int sync_blockdev_nowait(struct block_device *bdev) { return 0; } static inline void sync_bdevs(bool wait) { } static inline void bdev_statx(const struct path *path, struct kstat *stat, u32 request_mask) { } static inline void printk_all_partitions(void) { } static inline int early_lookup_bdev(const char *pathname, dev_t *dev) { return -EINVAL; } #endif /* CONFIG_BLOCK */ int bdev_freeze(struct block_device *bdev); int bdev_thaw(struct block_device *bdev); void bdev_fput(struct file *bdev_file); struct io_comp_batch { struct rq_list req_list; bool need_ts; void (*complete)(struct io_comp_batch *); void *poll_ctx; }; static inline bool blk_atomic_write_start_sect_aligned(sector_t sector, struct queue_limits *limits) { unsigned int alignment = max(limits->atomic_write_hw_unit_min, limits->atomic_write_hw_boundary); return IS_ALIGNED(sector, alignment >> SECTOR_SHIFT); } static inline bool bdev_can_atomic_write(struct block_device *bdev) { struct request_queue *bd_queue = bdev->bd_queue; struct queue_limits *limits = &bd_queue->limits; if (!limits->atomic_write_unit_min) return false; if (bdev_is_partition(bdev)) return blk_atomic_write_start_sect_aligned(bdev->bd_start_sect, limits); return true; } static inline unsigned int bdev_atomic_write_unit_min_bytes(struct block_device *bdev) { if (!bdev_can_atomic_write(bdev)) return 0; return queue_atomic_write_unit_min_bytes(bdev_get_queue(bdev)); } static inline unsigned int bdev_atomic_write_unit_max_bytes(struct block_device *bdev) { if (!bdev_can_atomic_write(bdev)) return 0; return queue_atomic_write_unit_max_bytes(bdev_get_queue(bdev)); } static inline int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim, unsigned *segs, unsigned max_bytes) { return bio_split_io_at(bio, lim, segs, max_bytes, lim->dma_alignment); } /* * Maximum contiguous integrity buffer allocation. */ #define BLK_INTEGRITY_MAX_SIZE SZ_2M /* * Maximum size of I/O that needs a block layer integrity buffer. Limited * by the number of intervals for which we can fit the integrity buffer into * the buffer size. Because the buffer is a single segment it is also limited * by the maximum segment size. 
 */
static inline unsigned int max_integrity_io_size(struct queue_limits *lim)
{
	return min_t(unsigned int, lim->max_segment_size,
		     (BLK_INTEGRITY_MAX_SIZE / lim->integrity.metadata_size) <<
			lim->integrity.interval_exp);
}

#define DEFINE_IO_COMP_BATCH(name)	struct io_comp_batch name = { }

#endif /* _LINUX_BLKDEV_H */
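/*
 * Illustrative usage sketch (not part of the header above): a minimal
 * bio-based driver registering a disk through the interfaces declared in
 * <linux/blkdev.h>.  The names sketch_dev, sketch_submit_bio, sketch_fops
 * and the capacity/block-size values are hypothetical placeholders chosen
 * for illustration; a real driver would supply its own state, limits and
 * error handling.
 */
#include <linux/blkdev.h>
#include <linux/err.h>
#include <linux/module.h>

struct sketch_dev {
	struct gendisk *disk;
};

/* Complete every bio immediately; a real driver would perform I/O here. */
static void sketch_submit_bio(struct bio *bio)
{
	bio_endio(bio);
}

static const struct block_device_operations sketch_fops = {
	.owner		= THIS_MODULE,
	.submit_bio	= sketch_submit_bio,
};

static int sketch_add_disk(struct sketch_dev *dev, int idx, sector_t nr_sectors)
{
	/* Queue limits are passed at allocation time and validated by the core. */
	struct queue_limits lim = {
		.logical_block_size	= 512,
		.physical_block_size	= 4096,
	};
	struct gendisk *disk;
	int err;

	disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
	if (IS_ERR(disk))
		return PTR_ERR(disk);

	/* major/first_minor/minors are left to the block core, as documented. */
	disk->fops = &sketch_fops;
	disk->private_data = dev;
	snprintf(disk->disk_name, DISK_NAME_LEN, "sketch%d", idx);
	set_capacity(disk, nr_sectors);

	err = add_disk(disk);
	if (err) {
		put_disk(disk);
		return err;
	}
	dev->disk = disk;
	return 0;
}

static void sketch_remove_disk(struct sketch_dev *dev)
{
	/* Unregister first, then drop the final reference. */
	del_gendisk(dev->disk);
	put_disk(dev->disk);
}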
// SPDX-License-Identifier: GPL-2.0
/*
 * USB Serial Converter driver
 *
 * Copyright (C) 2009 - 2013 Johan Hovold (jhovold@gmail.com)
 * Copyright (C) 1999 - 2012 Greg Kroah-Hartman (greg@kroah.com)
 * Copyright (C) 2000 Peter Berger (pberger@brimson.com)
 * Copyright (C) 2000 Al Borchers (borchers@steinerpoint.com)
 *
 * This driver was originally based on the ACM driver by
Armin Fuerst (which was * based on a driver by Brad Keryan) * * See Documentation/usb/usb-serial.rst for more information on using this * driver */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/seq_file.h> #include <linux/spinlock.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/uaccess.h> #include <linux/serial.h> #include <linux/usb.h> #include <linux/usb/serial.h> #include <linux/kfifo.h> #include <linux/idr.h> #define DRIVER_AUTHOR "Greg Kroah-Hartman <gregkh@linuxfoundation.org>" #define DRIVER_DESC "USB Serial Driver core" #define USB_SERIAL_TTY_MAJOR 188 #define USB_SERIAL_TTY_MINORS 512 /* should be enough for a while */ /* There is no MODULE_DEVICE_TABLE for usbserial.c. Instead the MODULE_DEVICE_TABLE declarations in each serial driver cause the "hotplug" program to pull in whatever module is necessary via modprobe, and modprobe will load usbserial because the serial drivers depend on it. */ static DEFINE_IDR(serial_minors); static DEFINE_MUTEX(table_lock); static LIST_HEAD(usb_serial_driver_list); /* * Look up the serial port structure. If it is found and it hasn't been * disconnected, return with the parent usb_serial structure's disc_mutex held * and its refcount incremented. Otherwise return NULL. */ struct usb_serial_port *usb_serial_port_get_by_minor(unsigned minor) { struct usb_serial *serial; struct usb_serial_port *port; mutex_lock(&table_lock); port = idr_find(&serial_minors, minor); if (!port) goto exit; serial = port->serial; mutex_lock(&serial->disc_mutex); if (serial->disconnected) { mutex_unlock(&serial->disc_mutex); port = NULL; } else { kref_get(&serial->kref); } exit: mutex_unlock(&table_lock); return port; } static int allocate_minors(struct usb_serial *serial, int num_ports) { struct usb_serial_port *port; unsigned int i, j; int minor; dev_dbg(&serial->interface->dev, "%s %d\n", __func__, num_ports); mutex_lock(&table_lock); for (i = 0; i < num_ports; ++i) { port = serial->port[i]; minor = idr_alloc(&serial_minors, port, 0, USB_SERIAL_TTY_MINORS, GFP_KERNEL); if (minor < 0) goto error; port->minor = minor; port->port_number = i; } serial->minors_reserved = 1; mutex_unlock(&table_lock); return 0; error: /* unwind the already allocated minors */ for (j = 0; j < i; ++j) idr_remove(&serial_minors, serial->port[j]->minor); mutex_unlock(&table_lock); return minor; } static void release_minors(struct usb_serial *serial) { int i; mutex_lock(&table_lock); for (i = 0; i < serial->num_ports; ++i) idr_remove(&serial_minors, serial->port[i]->minor); mutex_unlock(&table_lock); serial->minors_reserved = 0; } int usb_serial_claim_interface(struct usb_serial *serial, struct usb_interface *intf) { struct usb_driver *driver = serial->type->usb_driver; int ret; if (serial->sibling) return -EBUSY; ret = usb_driver_claim_interface(driver, intf, serial); if (ret) { dev_err(&serial->interface->dev, "failed to claim sibling interface: %d\n", ret); return ret; } serial->sibling = intf; return 0; } EXPORT_SYMBOL_GPL(usb_serial_claim_interface); static void release_sibling(struct usb_serial *serial, struct usb_interface *intf) { struct usb_driver *driver = serial->type->usb_driver; struct usb_interface *sibling; if (!serial->sibling) return; if (intf == serial->sibling) sibling = serial->interface; else sibling = 
serial->sibling; usb_set_intfdata(sibling, NULL); usb_driver_release_interface(driver, sibling); } static void destroy_serial(struct kref *kref) { struct usb_serial *serial; struct usb_serial_port *port; int i; serial = to_usb_serial(kref); /* return the minor range that this device had */ if (serial->minors_reserved) release_minors(serial); if (serial->attached && serial->type->release) serial->type->release(serial); /* Now that nothing is using the ports, they can be freed */ for (i = 0; i < serial->num_port_pointers; ++i) { port = serial->port[i]; if (port) { port->serial = NULL; put_device(&port->dev); } } usb_put_intf(serial->interface); usb_put_dev(serial->dev); kfree(serial); } void usb_serial_put(struct usb_serial *serial) { kref_put(&serial->kref, destroy_serial); } /***************************************************************************** * Driver tty interface functions *****************************************************************************/ /** * serial_install - install tty * @driver: the driver (USB in our case) * @tty: the tty being created * * Initialise the termios structure for this tty. We use the default * USB serial settings but permit them to be overridden by * serial->type->init_termios on first open. * * This is the first place a new tty gets used. Hence this is where we * acquire references to the usb_serial structure and the driver module, * where we store a pointer to the port. All these actions are reversed * in serial_cleanup(). */ static int serial_install(struct tty_driver *driver, struct tty_struct *tty) { int idx = tty->index; struct usb_serial *serial; struct usb_serial_port *port; bool init_termios; int retval = -ENODEV; port = usb_serial_port_get_by_minor(idx); if (!port) return retval; serial = port->serial; if (!try_module_get(serial->type->driver.owner)) goto err_put_serial; init_termios = (driver->termios[idx] == NULL); retval = tty_standard_install(driver, tty); if (retval) goto err_put_module; mutex_unlock(&serial->disc_mutex); /* allow the driver to update the initial settings */ if (init_termios && serial->type->init_termios) serial->type->init_termios(tty); tty->driver_data = port; return retval; err_put_module: module_put(serial->type->driver.owner); err_put_serial: usb_serial_put(serial); mutex_unlock(&serial->disc_mutex); return retval; } static int serial_port_activate(struct tty_port *tport, struct tty_struct *tty) { struct usb_serial_port *port = container_of(tport, struct usb_serial_port, port); struct usb_serial *serial = port->serial; int retval; mutex_lock(&serial->disc_mutex); if (serial->disconnected) { retval = -ENODEV; goto out_unlock; } retval = usb_autopm_get_interface(serial->interface); if (retval) goto out_unlock; retval = port->serial->type->open(tty, port); if (retval) usb_autopm_put_interface(serial->interface); out_unlock: mutex_unlock(&serial->disc_mutex); if (retval < 0) retval = usb_translate_errors(retval); return retval; } static int serial_open(struct tty_struct *tty, struct file *filp) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); return tty_port_open(&port->port, tty, filp); } /** * serial_port_shutdown - shut down hardware * @tport: tty port to shut down * * Shut down a USB serial port. Serialized against activate by the * tport mutex and kept to matching open/close pairs * of calls by the tty-port initialized flag. * * Not called if tty is console. 
*/ static void serial_port_shutdown(struct tty_port *tport) { struct usb_serial_port *port = container_of(tport, struct usb_serial_port, port); struct usb_serial_driver *drv = port->serial->type; if (drv->close) drv->close(port); usb_autopm_put_interface(port->serial->interface); } static void serial_hangup(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); tty_port_hangup(&port->port); } static void serial_close(struct tty_struct *tty, struct file *filp) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); tty_port_close(&port->port, tty, filp); } /** * serial_cleanup - free resources post close/hangup * @tty: tty to clean up * * Do the resource freeing and refcount dropping for the port. * Avoid freeing the console. * * Called asynchronously after the last tty kref is dropped. */ static void serial_cleanup(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct usb_serial *serial; struct module *owner; dev_dbg(&port->dev, "%s\n", __func__); /* The console is magical. Do not hang up the console hardware * or there will be tears. */ if (port->port.console) return; tty->driver_data = NULL; serial = port->serial; owner = serial->type->driver.owner; usb_serial_put(serial); module_put(owner); } static ssize_t serial_write(struct tty_struct *tty, const u8 *buf, size_t count) { struct usb_serial_port *port = tty->driver_data; int retval = -ENODEV; if (port->serial->dev->state == USB_STATE_NOTATTACHED) goto exit; dev_dbg(&port->dev, "%s - %zu byte(s)\n", __func__, count); retval = port->serial->type->write(tty, port, buf, count); if (retval < 0) retval = usb_translate_errors(retval); exit: return retval; } static unsigned int serial_write_room(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); return port->serial->type->write_room(tty); } static unsigned int serial_chars_in_buffer(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct usb_serial *serial = port->serial; dev_dbg(&port->dev, "%s\n", __func__); if (serial->disconnected) return 0; return serial->type->chars_in_buffer(tty); } static void serial_wait_until_sent(struct tty_struct *tty, int timeout) { struct usb_serial_port *port = tty->driver_data; struct usb_serial *serial = port->serial; dev_dbg(&port->dev, "%s\n", __func__); if (!port->serial->type->wait_until_sent) return; mutex_lock(&serial->disc_mutex); if (!serial->disconnected) port->serial->type->wait_until_sent(tty, timeout); mutex_unlock(&serial->disc_mutex); } static void serial_throttle(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->throttle) port->serial->type->throttle(tty); } static void serial_unthrottle(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->unthrottle) port->serial->type->unthrottle(tty); } static int serial_get_serial(struct tty_struct *tty, struct serial_struct *ss) { struct usb_serial_port *port = tty->driver_data; struct tty_port *tport = &port->port; unsigned int close_delay, closing_wait; mutex_lock(&tport->mutex); close_delay = jiffies_to_msecs(tport->close_delay) / 10; closing_wait = tport->closing_wait; if (closing_wait != ASYNC_CLOSING_WAIT_NONE) closing_wait = jiffies_to_msecs(closing_wait) / 10; ss->line = port->minor; ss->close_delay = close_delay; 
ss->closing_wait = closing_wait; if (port->serial->type->get_serial) port->serial->type->get_serial(tty, ss); mutex_unlock(&tport->mutex); return 0; } static int serial_set_serial(struct tty_struct *tty, struct serial_struct *ss) { struct usb_serial_port *port = tty->driver_data; struct tty_port *tport = &port->port; unsigned int close_delay, closing_wait; int ret = 0; close_delay = msecs_to_jiffies(ss->close_delay * 10); closing_wait = ss->closing_wait; if (closing_wait != ASYNC_CLOSING_WAIT_NONE) closing_wait = msecs_to_jiffies(closing_wait * 10); mutex_lock(&tport->mutex); if (!capable(CAP_SYS_ADMIN)) { if (close_delay != tport->close_delay || closing_wait != tport->closing_wait) { ret = -EPERM; goto out_unlock; } } if (port->serial->type->set_serial) { ret = port->serial->type->set_serial(tty, ss); if (ret) goto out_unlock; } tport->close_delay = close_delay; tport->closing_wait = closing_wait; out_unlock: mutex_unlock(&tport->mutex); return ret; } static int serial_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { struct usb_serial_port *port = tty->driver_data; int retval = -ENOIOCTLCMD; dev_dbg(&port->dev, "%s - cmd 0x%04x\n", __func__, cmd); switch (cmd) { case TIOCMIWAIT: if (port->serial->type->tiocmiwait) retval = port->serial->type->tiocmiwait(tty, arg); break; default: if (port->serial->type->ioctl) retval = port->serial->type->ioctl(tty, cmd, arg); } return retval; } static void serial_set_termios(struct tty_struct *tty, const struct ktermios *old) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->set_termios) port->serial->type->set_termios(tty, port, old); else tty_termios_copy_hw(&tty->termios, old); } static int serial_break(struct tty_struct *tty, int break_state) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->break_ctl) return port->serial->type->break_ctl(tty, break_state); return -ENOTTY; } static int serial_proc_show(struct seq_file *m, void *v) { struct usb_serial *serial; struct usb_serial_port *port; int i; char tmp[40]; seq_puts(m, "usbserinfo:1.0 driver:2.0\n"); for (i = 0; i < USB_SERIAL_TTY_MINORS; ++i) { port = usb_serial_port_get_by_minor(i); if (port == NULL) continue; serial = port->serial; seq_printf(m, "%d:", i); if (serial->type->driver.owner) seq_printf(m, " module:%s", module_name(serial->type->driver.owner)); seq_printf(m, " name:\"%s\"", serial->type->description); seq_printf(m, " vendor:%04x product:%04x", le16_to_cpu(serial->dev->descriptor.idVendor), le16_to_cpu(serial->dev->descriptor.idProduct)); seq_printf(m, " num_ports:%d", serial->num_ports); seq_printf(m, " port:%d", port->port_number); usb_make_path(serial->dev, tmp, sizeof(tmp)); seq_printf(m, " path:%s", tmp); seq_putc(m, '\n'); usb_serial_put(serial); mutex_unlock(&serial->disc_mutex); } return 0; } static int serial_tiocmget(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->tiocmget) return port->serial->type->tiocmget(tty); return -ENOTTY; } static int serial_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->tiocmset) return port->serial->type->tiocmset(tty, set, clear); return -ENOTTY; } static int serial_get_icount(struct tty_struct *tty, struct serial_icounter_struct *icount) { struct usb_serial_port *port = 
tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->get_icount) return port->serial->type->get_icount(tty, icount); return -ENOTTY; } /* * We would be calling tty_wakeup here, but unfortunately some line * disciplines have an annoying habit of calling tty->write from * the write wakeup callback (e.g. n_hdlc.c). */ void usb_serial_port_softint(struct usb_serial_port *port) { schedule_work(&port->work); } EXPORT_SYMBOL_GPL(usb_serial_port_softint); static void usb_serial_port_work(struct work_struct *work) { struct usb_serial_port *port = container_of(work, struct usb_serial_port, work); tty_port_tty_wakeup(&port->port); } static void usb_serial_port_poison_urbs(struct usb_serial_port *port) { int i; for (i = 0; i < ARRAY_SIZE(port->read_urbs); ++i) usb_poison_urb(port->read_urbs[i]); for (i = 0; i < ARRAY_SIZE(port->write_urbs); ++i) usb_poison_urb(port->write_urbs[i]); usb_poison_urb(port->interrupt_in_urb); usb_poison_urb(port->interrupt_out_urb); } static void usb_serial_port_unpoison_urbs(struct usb_serial_port *port) { int i; for (i = 0; i < ARRAY_SIZE(port->read_urbs); ++i) usb_unpoison_urb(port->read_urbs[i]); for (i = 0; i < ARRAY_SIZE(port->write_urbs); ++i) usb_unpoison_urb(port->write_urbs[i]); usb_unpoison_urb(port->interrupt_in_urb); usb_unpoison_urb(port->interrupt_out_urb); } static void usb_serial_port_release(struct device *dev) { struct usb_serial_port *port = to_usb_serial_port(dev); int i; dev_dbg(dev, "%s\n", __func__); usb_free_urb(port->interrupt_in_urb); usb_free_urb(port->interrupt_out_urb); for (i = 0; i < ARRAY_SIZE(port->read_urbs); ++i) { usb_free_urb(port->read_urbs[i]); kfree(port->bulk_in_buffers[i]); } for (i = 0; i < ARRAY_SIZE(port->write_urbs); ++i) { usb_free_urb(port->write_urbs[i]); kfree(port->bulk_out_buffers[i]); } kfifo_free(&port->write_fifo); kfree(port->interrupt_in_buffer); kfree(port->interrupt_out_buffer); tty_port_destroy(&port->port); kfree(port); } static struct usb_serial *create_serial(struct usb_device *dev, struct usb_interface *interface, struct usb_serial_driver *driver) { struct usb_serial *serial; serial = kzalloc_obj(*serial); if (!serial) return NULL; serial->dev = usb_get_dev(dev); serial->type = driver; serial->interface = usb_get_intf(interface); kref_init(&serial->kref); mutex_init(&serial->disc_mutex); serial->minors_reserved = 0; return serial; } static const struct usb_device_id *match_dynamic_id(struct usb_interface *intf, struct usb_serial_driver *drv) { struct usb_dynid *dynid; guard(mutex)(&usb_dynids_lock); list_for_each_entry(dynid, &drv->dynids.list, node) { if (usb_match_one_id(intf, &dynid->id)) { return &dynid->id; } } return NULL; } static const struct usb_device_id *get_iface_id(struct usb_serial_driver *drv, struct usb_interface *intf) { const struct usb_device_id *id; id = usb_match_id(intf, drv->id_table); if (id) { dev_dbg(&intf->dev, "static descriptor matches\n"); goto exit; } id = match_dynamic_id(intf, drv); if (id) dev_dbg(&intf->dev, "dynamic descriptor matches\n"); exit: return id; } /* Caller must hold table_lock */ static struct usb_serial_driver *search_serial_device( struct usb_interface *iface) { const struct usb_device_id *id = NULL; struct usb_serial_driver *drv; struct usb_driver *driver = to_usb_driver(iface->dev.driver); /* Check if the usb id matches a known device */ list_for_each_entry(drv, &usb_serial_driver_list, driver_list) { if (drv->usb_driver == driver) id = get_iface_id(drv, iface); if (id) return drv; } return NULL; } static bool 
serial_port_carrier_raised(struct tty_port *port) { struct usb_serial_port *p = container_of(port, struct usb_serial_port, port); struct usb_serial_driver *drv = p->serial->type; if (drv->carrier_raised) return drv->carrier_raised(p); /* No carrier control - don't block */ return true; } static void serial_port_dtr_rts(struct tty_port *port, bool on) { struct usb_serial_port *p = container_of(port, struct usb_serial_port, port); struct usb_serial_driver *drv = p->serial->type; if (drv->dtr_rts) drv->dtr_rts(p, on); } static ssize_t port_number_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_serial_port *port = to_usb_serial_port(dev); return sprintf(buf, "%u\n", port->port_number); } static DEVICE_ATTR_RO(port_number); static struct attribute *usb_serial_port_attrs[] = { &dev_attr_port_number.attr, NULL }; ATTRIBUTE_GROUPS(usb_serial_port); static const struct tty_port_operations serial_port_ops = { .carrier_raised = serial_port_carrier_raised, .dtr_rts = serial_port_dtr_rts, .activate = serial_port_activate, .shutdown = serial_port_shutdown, }; static void store_endpoint(struct usb_serial *serial, struct usb_serial_endpoints *epds, struct usb_endpoint_descriptor *epd) { struct device *dev = &serial->interface->dev; u8 addr = epd->bEndpointAddress; if (usb_endpoint_is_bulk_in(epd)) { if (epds->num_bulk_in == ARRAY_SIZE(epds->bulk_in)) return; dev_dbg(dev, "found bulk in endpoint %02x\n", addr); epds->bulk_in[epds->num_bulk_in++] = epd; } else if (usb_endpoint_is_bulk_out(epd)) { if (epds->num_bulk_out == ARRAY_SIZE(epds->bulk_out)) return; dev_dbg(dev, "found bulk out endpoint %02x\n", addr); epds->bulk_out[epds->num_bulk_out++] = epd; } else if (usb_endpoint_is_int_in(epd)) { if (epds->num_interrupt_in == ARRAY_SIZE(epds->interrupt_in)) return; dev_dbg(dev, "found interrupt in endpoint %02x\n", addr); epds->interrupt_in[epds->num_interrupt_in++] = epd; } else if (usb_endpoint_is_int_out(epd)) { if (epds->num_interrupt_out == ARRAY_SIZE(epds->interrupt_out)) return; dev_dbg(dev, "found interrupt out endpoint %02x\n", addr); epds->interrupt_out[epds->num_interrupt_out++] = epd; } } static void find_endpoints(struct usb_serial *serial, struct usb_serial_endpoints *epds, struct usb_interface *intf) { struct usb_host_interface *iface_desc; struct usb_endpoint_descriptor *epd; unsigned int i; iface_desc = intf->cur_altsetting; for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { epd = &iface_desc->endpoint[i].desc; store_endpoint(serial, epds, epd); } } static int setup_port_bulk_in(struct usb_serial_port *port, struct usb_endpoint_descriptor *epd) { struct usb_serial_driver *type = port->serial->type; struct usb_device *udev = port->serial->dev; int buffer_size; int i; buffer_size = max_t(int, type->bulk_in_size, usb_endpoint_maxp(epd)); port->bulk_in_size = buffer_size; port->bulk_in_endpointAddress = epd->bEndpointAddress; for (i = 0; i < ARRAY_SIZE(port->read_urbs); ++i) { set_bit(i, &port->read_urbs_free); port->read_urbs[i] = usb_alloc_urb(0, GFP_KERNEL); if (!port->read_urbs[i]) return -ENOMEM; port->bulk_in_buffers[i] = kmalloc(buffer_size, GFP_KERNEL); if (!port->bulk_in_buffers[i]) return -ENOMEM; usb_fill_bulk_urb(port->read_urbs[i], udev, usb_rcvbulkpipe(udev, epd->bEndpointAddress), port->bulk_in_buffers[i], buffer_size, type->read_bulk_callback, port); } port->read_urb = port->read_urbs[0]; port->bulk_in_buffer = port->bulk_in_buffers[0]; return 0; } static int setup_port_bulk_out(struct usb_serial_port *port, struct usb_endpoint_descriptor *epd) 
{ struct usb_serial_driver *type = port->serial->type; struct usb_device *udev = port->serial->dev; int buffer_size; int i; if (kfifo_alloc(&port->write_fifo, PAGE_SIZE, GFP_KERNEL)) return -ENOMEM; if (type->bulk_out_size) buffer_size = type->bulk_out_size; else buffer_size = usb_endpoint_maxp(epd); port->bulk_out_size = buffer_size; port->bulk_out_endpointAddress = epd->bEndpointAddress; for (i = 0; i < ARRAY_SIZE(port->write_urbs); ++i) { set_bit(i, &port->write_urbs_free); port->write_urbs[i] = usb_alloc_urb(0, GFP_KERNEL); if (!port->write_urbs[i]) return -ENOMEM; port->bulk_out_buffers[i] = kmalloc(buffer_size, GFP_KERNEL); if (!port->bulk_out_buffers[i]) return -ENOMEM; usb_fill_bulk_urb(port->write_urbs[i], udev, usb_sndbulkpipe(udev, epd->bEndpointAddress), port->bulk_out_buffers[i], buffer_size, type->write_bulk_callback, port); } port->write_urb = port->write_urbs[0]; port->bulk_out_buffer = port->bulk_out_buffers[0]; return 0; } static int setup_port_interrupt_in(struct usb_serial_port *port, struct usb_endpoint_descriptor *epd) { struct usb_serial_driver *type = port->serial->type; struct usb_device *udev = port->serial->dev; int buffer_size; port->interrupt_in_urb = usb_alloc_urb(0, GFP_KERNEL); if (!port->interrupt_in_urb) return -ENOMEM; buffer_size = usb_endpoint_maxp(epd); port->interrupt_in_endpointAddress = epd->bEndpointAddress; port->interrupt_in_buffer = kmalloc(buffer_size, GFP_KERNEL); if (!port->interrupt_in_buffer) return -ENOMEM; usb_fill_int_urb(port->interrupt_in_urb, udev, usb_rcvintpipe(udev, epd->bEndpointAddress), port->interrupt_in_buffer, buffer_size, type->read_int_callback, port, epd->bInterval); return 0; } static int setup_port_interrupt_out(struct usb_serial_port *port, struct usb_endpoint_descriptor *epd) { struct usb_serial_driver *type = port->serial->type; struct usb_device *udev = port->serial->dev; int buffer_size; port->interrupt_out_urb = usb_alloc_urb(0, GFP_KERNEL); if (!port->interrupt_out_urb) return -ENOMEM; buffer_size = usb_endpoint_maxp(epd); port->interrupt_out_size = buffer_size; port->interrupt_out_endpointAddress = epd->bEndpointAddress; port->interrupt_out_buffer = kmalloc(buffer_size, GFP_KERNEL); if (!port->interrupt_out_buffer) return -ENOMEM; usb_fill_int_urb(port->interrupt_out_urb, udev, usb_sndintpipe(udev, epd->bEndpointAddress), port->interrupt_out_buffer, buffer_size, type->write_int_callback, port, epd->bInterval); return 0; } static int usb_serial_probe(struct usb_interface *interface, const struct usb_device_id *id) { struct device *ddev = &interface->dev; struct usb_device *dev = interface_to_usbdev(interface); struct usb_serial *serial = NULL; struct usb_serial_port *port; struct usb_serial_endpoints *epds; struct usb_serial_driver *type = NULL; int retval; int i; int num_ports = 0; unsigned char max_endpoints; mutex_lock(&table_lock); type = search_serial_device(interface); if (!type) { mutex_unlock(&table_lock); dev_dbg(ddev, "none matched\n"); return -ENODEV; } if (!try_module_get(type->driver.owner)) { mutex_unlock(&table_lock); dev_err(ddev, "module get failed, exiting\n"); return -EIO; } mutex_unlock(&table_lock); serial = create_serial(dev, interface, type); if (!serial) { retval = -ENOMEM; goto err_put_module; } /* if this device type has a probe function, call it */ if (type->probe) { const struct usb_device_id *id; id = get_iface_id(type, interface); retval = type->probe(serial, id); if (retval) { dev_dbg(ddev, "sub driver rejected device\n"); goto err_release_sibling; } } /* descriptor matches, let's 
find the endpoints needed */ epds = kzalloc_obj(*epds); if (!epds) { retval = -ENOMEM; goto err_release_sibling; } find_endpoints(serial, epds, interface); if (serial->sibling) find_endpoints(serial, epds, serial->sibling); if (epds->num_bulk_in < type->num_bulk_in || epds->num_bulk_out < type->num_bulk_out || epds->num_interrupt_in < type->num_interrupt_in || epds->num_interrupt_out < type->num_interrupt_out) { dev_err(ddev, "required endpoints missing\n"); retval = -ENODEV; goto err_free_epds; } if (type->calc_num_ports) { retval = type->calc_num_ports(serial, epds); if (retval < 0) goto err_free_epds; num_ports = retval; } if (!num_ports) num_ports = type->num_ports; if (num_ports > MAX_NUM_PORTS) { dev_warn(ddev, "too many ports requested: %d\n", num_ports); num_ports = MAX_NUM_PORTS; } serial->num_ports = (unsigned char)num_ports; serial->num_bulk_in = epds->num_bulk_in; serial->num_bulk_out = epds->num_bulk_out; serial->num_interrupt_in = epds->num_interrupt_in; serial->num_interrupt_out = epds->num_interrupt_out; /* found all that we need */ dev_info(ddev, "%s converter detected\n", type->description); /* create our ports, we need as many as the max endpoints */ /* we don't use num_ports here because some devices have more endpoint pairs than ports */ max_endpoints = max(epds->num_bulk_in, epds->num_bulk_out); max_endpoints = max(max_endpoints, epds->num_interrupt_in); max_endpoints = max(max_endpoints, epds->num_interrupt_out); max_endpoints = max(max_endpoints, serial->num_ports); serial->num_port_pointers = max_endpoints; dev_dbg(ddev, "setting up %d port structure(s)\n", max_endpoints); for (i = 0; i < max_endpoints; ++i) { port = kzalloc_obj(struct usb_serial_port); if (!port) { retval = -ENOMEM; goto err_free_epds; } tty_port_init(&port->port); port->port.ops = &serial_port_ops; port->serial = serial; spin_lock_init(&port->lock); /* Keep this for private driver use for the moment but should probably go away */ INIT_WORK(&port->work, usb_serial_port_work); serial->port[i] = port; port->dev.parent = &interface->dev; port->dev.driver = NULL; port->dev.bus = &usb_serial_bus_type; port->dev.release = &usb_serial_port_release; port->dev.groups = usb_serial_port_groups; device_initialize(&port->dev); } /* set up the endpoint information */ for (i = 0; i < epds->num_bulk_in; ++i) { retval = setup_port_bulk_in(serial->port[i], epds->bulk_in[i]); if (retval) goto err_free_epds; } for (i = 0; i < epds->num_bulk_out; ++i) { retval = setup_port_bulk_out(serial->port[i], epds->bulk_out[i]); if (retval) goto err_free_epds; } if (serial->type->read_int_callback) { for (i = 0; i < epds->num_interrupt_in; ++i) { retval = setup_port_interrupt_in(serial->port[i], epds->interrupt_in[i]); if (retval) goto err_free_epds; } } else if (epds->num_interrupt_in) { dev_dbg(ddev, "The device claims to support interrupt in transfers, but read_int_callback is not defined\n"); } if (serial->type->write_int_callback) { for (i = 0; i < epds->num_interrupt_out; ++i) { retval = setup_port_interrupt_out(serial->port[i], epds->interrupt_out[i]); if (retval) goto err_free_epds; } } else if (epds->num_interrupt_out) { dev_dbg(ddev, "The device claims to support interrupt out transfers, but write_int_callback is not defined\n"); } usb_set_intfdata(interface, serial); /* if this device type has an attach function, call it */ if (type->attach) { retval = type->attach(serial); if (retval < 0) goto err_free_epds; serial->attached = 1; if (retval > 0) { /* quietly accept this device, but don't bind to a serial port as it's 
about to disappear */ serial->num_ports = 0; goto exit; } } else { serial->attached = 1; } retval = allocate_minors(serial, num_ports); if (retval) { dev_err(ddev, "No more free serial minor numbers\n"); goto err_free_epds; } /* register all of the individual ports with the driver core */ for (i = 0; i < num_ports; ++i) { port = serial->port[i]; dev_set_name(&port->dev, "ttyUSB%d", port->minor); dev_dbg(ddev, "registering %s\n", dev_name(&port->dev)); device_enable_async_suspend(&port->dev); retval = device_add(&port->dev); if (retval) dev_err(ddev, "Error registering port device, continuing\n"); } if (num_ports > 0) usb_serial_console_init(serial->port[0]->minor); exit: kfree(epds); module_put(type->driver.owner); return 0; err_free_epds: kfree(epds); err_release_sibling: release_sibling(serial, interface); usb_serial_put(serial); err_put_module: module_put(type->driver.owner); return retval; } static void usb_serial_disconnect(struct usb_interface *interface) { int i; struct usb_serial *serial = usb_get_intfdata(interface); struct device *dev = &interface->dev; struct usb_serial_port *port; /* sibling interface is cleaning up */ if (!serial) return; usb_serial_console_disconnect(serial); mutex_lock(&serial->disc_mutex); /* must set a flag, to signal subdrivers */ serial->disconnected = 1; mutex_unlock(&serial->disc_mutex); for (i = 0; i < serial->num_ports; ++i) { port = serial->port[i]; tty_port_tty_vhangup(&port->port); usb_serial_port_poison_urbs(port); wake_up_interruptible(&port->port.delta_msr_wait); cancel_work_sync(&port->work); if (device_is_registered(&port->dev)) device_del(&port->dev); } if (serial->type->disconnect) serial->type->disconnect(serial); release_sibling(serial, interface); /* let the last holder of this object cause it to be cleaned up */ usb_serial_put(serial); dev_info(dev, "device disconnected\n"); } int usb_serial_suspend(struct usb_interface *intf, pm_message_t message) { struct usb_serial *serial = usb_get_intfdata(intf); int i, r; /* suspend when called for first sibling interface */ if (serial->suspend_count++) return 0; /* * serial->type->suspend() MUST return 0 in system sleep context, * otherwise, the resume callback has to recover device from * previous suspend failure. 
*/ if (serial->type->suspend) { r = serial->type->suspend(serial, message); if (r < 0) { serial->suspend_count--; return r; } } for (i = 0; i < serial->num_ports; ++i) usb_serial_port_poison_urbs(serial->port[i]); return 0; } EXPORT_SYMBOL(usb_serial_suspend); static void usb_serial_unpoison_port_urbs(struct usb_serial *serial) { int i; for (i = 0; i < serial->num_ports; ++i) usb_serial_port_unpoison_urbs(serial->port[i]); } int usb_serial_resume(struct usb_interface *intf) { struct usb_serial *serial = usb_get_intfdata(intf); int rv; /* resume when called for last sibling interface */ if (--serial->suspend_count) return 0; usb_serial_unpoison_port_urbs(serial); if (serial->type->resume) rv = serial->type->resume(serial); else rv = usb_serial_generic_resume(serial); return rv; } EXPORT_SYMBOL(usb_serial_resume); static int usb_serial_reset_resume(struct usb_interface *intf) { struct usb_serial *serial = usb_get_intfdata(intf); int rv; /* resume when called for last sibling interface */ if (--serial->suspend_count) return 0; usb_serial_unpoison_port_urbs(serial); if (serial->type->reset_resume) { rv = serial->type->reset_resume(serial); } else { rv = -EOPNOTSUPP; intf->needs_binding = 1; } return rv; } static const struct tty_operations serial_ops = { .open = serial_open, .close = serial_close, .write = serial_write, .hangup = serial_hangup, .write_room = serial_write_room, .ioctl = serial_ioctl, .set_termios = serial_set_termios, .throttle = serial_throttle, .unthrottle = serial_unthrottle, .break_ctl = serial_break, .chars_in_buffer = serial_chars_in_buffer, .wait_until_sent = serial_wait_until_sent, .tiocmget = serial_tiocmget, .tiocmset = serial_tiocmset, .get_icount = serial_get_icount, .set_serial = serial_set_serial, .get_serial = serial_get_serial, .cleanup = serial_cleanup, .install = serial_install, .proc_show = serial_proc_show, }; struct tty_driver *usb_serial_tty_driver; static int __init usb_serial_init(void) { int result; usb_serial_tty_driver = tty_alloc_driver(USB_SERIAL_TTY_MINORS, TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV); if (IS_ERR(usb_serial_tty_driver)) return PTR_ERR(usb_serial_tty_driver); /* Initialize our global data */ result = bus_register(&usb_serial_bus_type); if (result) { pr_err("%s - registering bus driver failed\n", __func__); goto err_put_driver; } usb_serial_tty_driver->driver_name = "usbserial"; usb_serial_tty_driver->name = "ttyUSB"; usb_serial_tty_driver->major = USB_SERIAL_TTY_MAJOR; usb_serial_tty_driver->minor_start = 0; usb_serial_tty_driver->type = TTY_DRIVER_TYPE_SERIAL; usb_serial_tty_driver->subtype = SERIAL_TYPE_NORMAL; usb_serial_tty_driver->init_termios = tty_std_termios; usb_serial_tty_driver->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL | CLOCAL; usb_serial_tty_driver->init_termios.c_ispeed = 9600; usb_serial_tty_driver->init_termios.c_ospeed = 9600; tty_set_operations(usb_serial_tty_driver, &serial_ops); result = tty_register_driver(usb_serial_tty_driver); if (result) { pr_err("%s - tty_register_driver failed\n", __func__); goto err_unregister_bus; } /* register the generic driver, if we should */ result = usb_serial_generic_register(); if (result < 0) { pr_err("%s - registering generic driver failed\n", __func__); goto err_unregister_driver; } return result; err_unregister_driver: tty_unregister_driver(usb_serial_tty_driver); err_unregister_bus: bus_unregister(&usb_serial_bus_type); err_put_driver: pr_err("%s - returning with error %d\n", __func__, result); tty_driver_kref_put(usb_serial_tty_driver); return result; } static 
void __exit usb_serial_exit(void) { usb_serial_console_exit(); usb_serial_generic_deregister(); tty_unregister_driver(usb_serial_tty_driver); tty_driver_kref_put(usb_serial_tty_driver); bus_unregister(&usb_serial_bus_type); idr_destroy(&serial_minors); } module_init(usb_serial_init); module_exit(usb_serial_exit); #define set_to_generic_if_null(type, function) \ do { \ if (!type->function) { \ type->function = usb_serial_generic_##function; \ pr_debug("%s: using generic " #function "\n", \ type->driver.name); \ } \ } while (0) static void usb_serial_operations_init(struct usb_serial_driver *device) { set_to_generic_if_null(device, open); set_to_generic_if_null(device, write); set_to_generic_if_null(device, close); set_to_generic_if_null(device, write_room); set_to_generic_if_null(device, chars_in_buffer); if (device->tx_empty) set_to_generic_if_null(device, wait_until_sent); set_to_generic_if_null(device, read_bulk_callback); set_to_generic_if_null(device, write_bulk_callback); set_to_generic_if_null(device, process_read_urb); set_to_generic_if_null(device, prepare_write_buffer); } static int usb_serial_register(struct usb_serial_driver *driver) { int retval; if (usb_disabled()) return -ENODEV; if (!driver->description) driver->description = driver->driver.name; if (!driver->usb_driver) { WARN(1, "Serial driver %s has no usb_driver\n", driver->description); return -EINVAL; } /* Prevent individual ports from being unbound. */ driver->driver.suppress_bind_attrs = true; usb_serial_operations_init(driver); /* Add this device to our list of devices */ mutex_lock(&table_lock); list_add(&driver->driver_list, &usb_serial_driver_list); retval = usb_serial_bus_register(driver); if (retval) { pr_err("problem %d when registering driver %s\n", retval, driver->description); list_del(&driver->driver_list); } else { pr_info("USB Serial support registered for %s\n", driver->description); } mutex_unlock(&table_lock); return retval; } static void usb_serial_deregister(struct usb_serial_driver *device) { pr_info("USB Serial deregistering driver %s\n", device->description); mutex_lock(&table_lock); list_del(&device->driver_list); mutex_unlock(&table_lock); usb_serial_bus_deregister(device); } /** * __usb_serial_register_drivers - register drivers for a usb-serial module * @serial_drivers: NULL-terminated array of pointers to drivers to be registered * @owner: owning module * @name: name of the usb_driver for this set of @serial_drivers * @id_table: list of all devices this @serial_drivers set binds to * * Registers all the drivers in the @serial_drivers array, and dynamically * creates a struct usb_driver with the name @name and id_table of @id_table. */ int __usb_serial_register_drivers(struct usb_serial_driver *const serial_drivers[], struct module *owner, const char *name, const struct usb_device_id *id_table) { int rc; struct usb_driver *udriver; struct usb_serial_driver * const *sd; /* * udriver must be registered before any of the serial drivers, * because the store_new_id() routine for the serial drivers (in * bus.c) probes udriver. * * Performance hack: We don't want udriver to be probed until * the serial drivers are registered, because the probe would * simply fail for lack of a matching serial driver. * So we leave udriver's id_table set to NULL until we are all set. * * Suspend/resume support is implemented in the usb-serial core, * so fill in the PM-related fields in udriver. 
*/ udriver = kzalloc_obj(*udriver); if (!udriver) return -ENOMEM; udriver->name = name; udriver->no_dynamic_id = 1; udriver->supports_autosuspend = 1; udriver->suspend = usb_serial_suspend; udriver->resume = usb_serial_resume; udriver->probe = usb_serial_probe; udriver->disconnect = usb_serial_disconnect; /* we only set the reset_resume field if the serial_driver has one */ for (sd = serial_drivers; *sd; ++sd) { if ((*sd)->reset_resume) { udriver->reset_resume = usb_serial_reset_resume; break; } } rc = usb_register(udriver); if (rc) goto err_free_driver; for (sd = serial_drivers; *sd; ++sd) { (*sd)->usb_driver = udriver; (*sd)->driver.owner = owner; rc = usb_serial_register(*sd); if (rc) goto err_deregister_drivers; } /* Now set udriver's id_table and look for matches */ udriver->id_table = id_table; rc = driver_attach(&udriver->driver); return 0; err_deregister_drivers: while (sd-- > serial_drivers) usb_serial_deregister(*sd); usb_deregister(udriver); err_free_driver: kfree(udriver); return rc; } EXPORT_SYMBOL_GPL(__usb_serial_register_drivers); /** * usb_serial_deregister_drivers - deregister drivers for a usb-serial module * @serial_drivers: NULL-terminated array of pointers to drivers to be deregistered * * Deregisters all the drivers in the @serial_drivers array and deregisters and * frees the struct usb_driver that was created by the call to * usb_serial_register_drivers(). */ void usb_serial_deregister_drivers(struct usb_serial_driver *const serial_drivers[]) { struct usb_driver *udriver = (*serial_drivers)->usb_driver; for (; *serial_drivers; ++serial_drivers) usb_serial_deregister(*serial_drivers); usb_deregister(udriver); kfree(udriver); } EXPORT_SYMBOL_GPL(usb_serial_deregister_drivers); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL v2"); |
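The kernel-doc for __usb_serial_register_drivers() above describes how a usb-serial sub-driver hands its driver array to this core. Below is a minimal sketch of such a sub-driver, modeled on the simplest in-tree drivers; the "example" name and the vendor/product IDs are placeholders rather than a real device, and module_usb_serial_driver() (from <linux/usb/serial.h>) expands to the usb_serial_register_drivers()/usb_serial_deregister_drivers() calls that reach the registration code above.

#include <linux/module.h>
#include <linux/usb.h>
#include <linux/usb/serial.h>

/* Placeholder IDs, for illustration only. */
static const struct usb_device_id example_id_table[] = {
	{ USB_DEVICE(0x1234, 0x5678) },
	{ }					/* terminating entry */
};
MODULE_DEVICE_TABLE(usb, example_id_table);

/* The core fills in generic open/write/close etc. for anything left NULL. */
static struct usb_serial_driver example_device = {
	.driver = {
		.name	= "example",
	},
	.id_table	= example_id_table,
	.num_ports	= 1,
};

static struct usb_serial_driver * const example_serial_drivers[] = {
	&example_device, NULL
};

module_usb_serial_driver(example_serial_drivers, example_id_table);

MODULE_LICENSE("GPL");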
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * GRE over IPv4 demultiplexer driver
 *
 * Authors: Dmitry Kozlov (xeb@mail.ru)
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/if.h>
#include <linux/icmp.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/netdevice.h>
#include <linux/if_tunnel.h>
#include <linux/spinlock.h>
#include <net/protocol.h>
#include <net/gre.h>
#include <net/erspan.h>
#include <net/icmp.h>
#include <net/route.h>
#include <net/xfrm.h>

static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;

int gre_add_protocol(const struct gre_protocol *proto, u8 version)
{
	if (version >= GREPROTO_MAX)
		return -EINVAL;

	return (cmpxchg((const struct gre_protocol **)&gre_proto[version], NULL, proto) == NULL) ?
		0 : -EBUSY;
}
EXPORT_SYMBOL_GPL(gre_add_protocol);

int gre_del_protocol(const struct gre_protocol *proto, u8 version)
{
	int ret;

	if (version >= GREPROTO_MAX)
		return -EINVAL;

	ret = (cmpxchg((const struct gre_protocol **)&gre_proto[version], proto, NULL) == proto) ?
		0 : -EBUSY;

	if (ret)
		return ret;

	synchronize_rcu();
	return 0;
}
EXPORT_SYMBOL_GPL(gre_del_protocol);

/* Fills in tpi and returns header length to be pulled.
 * Note that caller must use pskb_may_pull() before pulling GRE header.
 */
int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
		     bool *csum_err, __be16 proto, int nhs)
{
	const struct gre_base_hdr *greh;
	__be32 *options;
	int hdr_len;

	if (unlikely(!pskb_may_pull(skb, nhs + sizeof(struct gre_base_hdr))))
		return -EINVAL;

	greh = (struct gre_base_hdr *)(skb->data + nhs);
	if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
		return -EINVAL;

	gre_flags_to_tnl_flags(tpi->flags, greh->flags);
	hdr_len = gre_calc_hlen(tpi->flags);

	if (!pskb_may_pull(skb, nhs + hdr_len))
		return -EINVAL;

	greh = (struct gre_base_hdr *)(skb->data + nhs);
	tpi->proto = greh->protocol;

	options = (__be32 *)(greh + 1);
	if (greh->flags & GRE_CSUM) {
		if (!skb_checksum_simple_validate(skb)) {
			skb_checksum_try_convert(skb, IPPROTO_GRE,
						 null_compute_pseudo);
		} else if (csum_err) {
			*csum_err = true;
			return -EINVAL;
		}

		options++;
	}

	if (greh->flags & GRE_KEY) {
		tpi->key = *options;
		options++;
	} else {
		tpi->key = 0;
	}
	if (unlikely(greh->flags & GRE_SEQ)) {
		tpi->seq = *options;
		options++;
	} else {
		tpi->seq = 0;
	}
	/* WCCP version 1 and 2 protocol decoding.
	 * - Change protocol to IPv4/IPv6
	 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
	 */
	if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
		u8 _val, *val;

		val = skb_header_pointer(skb, nhs + hdr_len,
					 sizeof(_val), &_val);
		if (!val)
			return -EINVAL;
		tpi->proto = proto;
		if ((*val & 0xF0) != 0x40)
			hdr_len += 4;
	}
	tpi->hdr_len = hdr_len;

	/* ERSPAN ver 1 and 2 protocol sets GRE key field
	 * to 0 and sets the configured key in the
	 * inner erspan header field
	 */
	if ((greh->protocol == htons(ETH_P_ERSPAN) && hdr_len != 4) ||
	    greh->protocol == htons(ETH_P_ERSPAN2)) {
		struct erspan_base_hdr *ershdr;

		if (!pskb_may_pull(skb, nhs + hdr_len + sizeof(*ershdr)))
			return -EINVAL;

		ershdr = (struct erspan_base_hdr *)(skb->data + nhs + hdr_len);
		tpi->key = cpu_to_be32(get_session_id(ershdr));
	}

	return hdr_len;
}
EXPORT_SYMBOL(gre_parse_header);

static int gre_rcv(struct sk_buff *skb)
{
	const struct gre_protocol *proto;
	u8 ver;
	int ret;

	if (!pskb_may_pull(skb, 12))
		goto drop;

	ver = skb->data[1]&0x7f;
	if (ver >= GREPROTO_MAX)
		goto drop;

	rcu_read_lock();
	proto = rcu_dereference(gre_proto[ver]);
	if (!proto || !proto->handler)
		goto drop_nohandler;
	ret = proto->handler(skb);
	rcu_read_unlock();
	return ret;

drop_nohandler:
	rcu_read_unlock();
	dev_core_stats_rx_nohandler_inc(skb->dev);
	kfree_skb(skb);
	return NET_RX_DROP;

drop:
	dev_core_stats_rx_dropped_inc(skb->dev);
	kfree_skb(skb);
	return NET_RX_DROP;
}

static int gre_err(struct sk_buff *skb, u32 info)
{
	const struct gre_protocol *proto;
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f;
	int err = 0;

	if (ver >= GREPROTO_MAX)
		return -EINVAL;

	rcu_read_lock();
	proto = rcu_dereference(gre_proto[ver]);
	if (proto && proto->err_handler)
		proto->err_handler(skb, info);
	else
		err = -EPROTONOSUPPORT;
	rcu_read_unlock();

	return err;
}

static const struct net_protocol net_gre_protocol = {
	.handler     = gre_rcv,
	.err_handler = gre_err,
};

static int __init gre_init(void)
{
	pr_info("GRE over IPv4 demultiplexer driver\n");

	if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
		pr_err("can't add protocol\n");
		return -EAGAIN;
	}
	return 0;
}

static void __exit gre_exit(void)
{
	inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
}

module_init(gre_init);
module_exit(gre_exit);

MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver");
MODULE_AUTHOR("D. Kozlov <xeb@mail.ru>");
MODULE_LICENSE("GPL");
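gre_add_protocol() and gre_del_protocol() above are the whole registration interface: a higher-level module claims one GRE version slot with its struct gre_protocol, and the demultiplexer then forwards matching packets to it. Below is a hedged sketch of such a client, loosely modeled on how the in-tree PPTP driver claims GREPROTO_PPTP; the example_* names are placeholders and a real handler would do far more than drop the packet.

#include <linux/module.h>
#include <linux/skbuff.h>
#include <net/gre.h>

/* Called from gre_rcv() under rcu_read_lock(); must return a NET_RX_* value. */
static int example_gre_handler(struct sk_buff *skb)
{
	/* A real protocol would parse the GRE header and queue the payload. */
	kfree_skb(skb);
	return NET_RX_DROP;
}

static const struct gre_protocol example_gre_proto = {
	.handler = example_gre_handler,
};

static int __init example_gre_init(void)
{
	/* Returns -EBUSY if another module already owns this version slot. */
	return gre_add_protocol(&example_gre_proto, GREPROTO_PPTP);
}

static void __exit example_gre_exit(void)
{
	gre_del_protocol(&example_gre_proto, GREPROTO_PPTP);
}

module_init(example_gre_init);
module_exit(example_gre_exit);
MODULE_LICENSE("GPL");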
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Internal procfs definitions
 *
 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/refcount.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/binfmts.h>
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
#include <linux/mm.h>

struct ctl_table_header;
struct mempolicy;

/*
 * This is not completely implemented yet. The idea is to
 * create an in-memory tree (like the actual /proc filesystem
 * tree) of these proc_dir_entries, so that we can dynamically
 * add new files to /proc.
 *
 * parent/subdir are used for the directory structure (every /proc file has a
 * parent, but "subdir" is empty for all non-directory entries).
 * subdir_node is used to build the rb tree "subdir" of the parent.
 */
struct proc_dir_entry {
	/*
	 * number of callers into module in progress;
	 * negative -> it's going away RSN
	 */
	atomic_t in_use;
	refcount_t refcnt;
	struct list_head pde_openers;	/* who did ->open, but not ->release */
	/* protects ->pde_openers and all struct pde_opener instances */
	spinlock_t pde_unload_lock;
	struct completion *pde_unload_completion;
	const struct inode_operations *proc_iops;
	union {
		const struct proc_ops *proc_ops;
		const struct file_operations *proc_dir_ops;
	};
	union {
		const struct seq_operations *seq_ops;
		int (*single_show)(struct seq_file *, void *);
	};
	proc_write_t write;
	void *data;
	unsigned int state_size;
	unsigned int low_ino;
	nlink_t nlink;
	kuid_t uid;
	kgid_t gid;
	loff_t size;
	struct proc_dir_entry *parent;
	struct rb_root subdir;
	struct rb_node subdir_node;
	char *name;
	umode_t mode;
	u8 flags;
	u8 namelen;
	char inline_name[];
} __randomize_layout;

#define SIZEOF_PDE ( \
	sizeof(struct proc_dir_entry) < 128 ? 128 : \
	sizeof(struct proc_dir_entry) < 192 ?
192 : \ sizeof(struct proc_dir_entry) < 256 ? 256 : \ sizeof(struct proc_dir_entry) < 512 ? 512 : \ 0) #define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry)) static inline bool pde_is_permanent(const struct proc_dir_entry *pde) { return pde->flags & PROC_ENTRY_PERMANENT; } static inline void pde_make_permanent(struct proc_dir_entry *pde) { pde->flags |= PROC_ENTRY_PERMANENT; } static inline bool pde_has_proc_read_iter(const struct proc_dir_entry *pde) { return pde->flags & PROC_ENTRY_proc_read_iter; } static inline bool pde_has_proc_compat_ioctl(const struct proc_dir_entry *pde) { #ifdef CONFIG_COMPAT return pde->flags & PROC_ENTRY_proc_compat_ioctl; #else return false; #endif } static inline bool pde_has_proc_lseek(const struct proc_dir_entry *pde) { return pde->flags & PROC_ENTRY_proc_lseek; } extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); union proc_op { int (*proc_get_link)(struct dentry *, struct path *); int (*proc_show)(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); int lsmid; }; struct proc_inode { struct pid *pid; unsigned int fd; union proc_op op; struct proc_dir_entry *pde; struct ctl_table_header *sysctl; const struct ctl_table *sysctl_entry; struct hlist_node sibling_inodes; const struct proc_ns_operations *ns_ops; struct inode vfs_inode; } __randomize_layout; /* * General functions */ static inline struct proc_inode *PROC_I(const struct inode *inode) { return container_of(inode, struct proc_inode, vfs_inode); } static inline struct proc_dir_entry *PDE(const struct inode *inode) { return PROC_I(inode)->pde; } static inline struct pid *proc_pid(const struct inode *inode) { return PROC_I(inode)->pid; } static inline struct task_struct *get_proc_task(const struct inode *inode) { return get_pid_task(proc_pid(inode), PIDTYPE_PID); } void task_dump_owner(struct task_struct *task, umode_t mode, kuid_t *ruid, kgid_t *rgid); unsigned name_to_int(const struct qstr *qstr); /* * Offset of the first process in the /proc root directory.. */ #define FIRST_PROCESS_ENTRY 256 /* Worst case buffer size needed for holding an integer. */ #define PROC_NUMBUF 13 #ifdef CONFIG_PAGE_MAPCOUNT /** * folio_precise_page_mapcount() - Number of mappings of this folio page. * @folio: The folio. * @page: The page. * * The number of present user page table entries that reference this page * as tracked via the RMAP: either referenced directly (PTE) or as part of * a larger area that covers this page (e.g., PMD). * * Use this function only for the calculation of existing statistics * (USS, PSS, mapcount_max) and for debugging purposes (/proc/kpagecount). * * Do not add new users. * * Returns: The number of mappings of this folio page. 0 for * folios that are not mapped to user space or are not tracked via the RMAP * (e.g., shared zeropage). */ static inline int folio_precise_page_mapcount(struct folio *folio, struct page *page) { int mapcount = atomic_read(&page->_mapcount) + 1; if (page_mapcount_is_type(mapcount)) mapcount = 0; if (folio_test_large(folio)) mapcount += folio_entire_mapcount(folio); return mapcount; } #else /* !CONFIG_PAGE_MAPCOUNT */ static inline int folio_precise_page_mapcount(struct folio *folio, struct page *page) { BUILD_BUG(); } #endif /* CONFIG_PAGE_MAPCOUNT */ /** * folio_average_page_mapcount() - Average number of mappings per page in this * folio * @folio: The folio. 
* * The average number of user page table entries that reference each page in * this folio as tracked via the RMAP: either referenced directly (PTE) or * as part of a larger area that covers this page (e.g., PMD). * * The average is calculated by rounding to the nearest integer; however, * to avoid duplicated code in current callers, the average is at least * 1 if any page of the folio is mapped. * * Returns: The average number of mappings per page in this folio. */ static inline int folio_average_page_mapcount(struct folio *folio) { int mapcount, entire_mapcount, avg; if (!folio_test_large(folio)) return atomic_read(&folio->_mapcount) + 1; mapcount = folio_large_mapcount(folio); if (unlikely(mapcount <= 0)) return 0; entire_mapcount = folio_entire_mapcount(folio); if (mapcount <= entire_mapcount) return entire_mapcount; mapcount -= entire_mapcount; /* Round to closest integer ... */ avg = ((unsigned int)mapcount + folio_large_nr_pages(folio) / 2) >> folio_large_order(folio); /* ... but return at least 1. */ return max_t(int, avg + entire_mapcount, 1); } /* * array.c */ extern const struct file_operations proc_tid_children_operations; extern void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape); extern int proc_tid_stat(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); extern int proc_pid_status(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); /* * base.c */ extern const struct dentry_operations pid_dentry_operations; extern int pid_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); int proc_nochmod_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); extern void proc_pid_evict_inode(struct proc_inode *); extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); extern void pid_update_inode(struct task_struct *, struct inode *); extern int pid_delete_dentry(const struct dentry *); extern int proc_pid_readdir(struct file *, struct dir_context *); struct dentry *proc_pid_lookup(struct dentry *, unsigned int); extern loff_t mem_lseek(struct file *, loff_t, int); /* Lookups */ typedef struct dentry *instantiate_t(struct dentry *, struct task_struct *, const void *); bool proc_fill_cache(struct file *, struct dir_context *, const char *, unsigned int, instantiate_t, struct task_struct *, const void *); /* * generic.c */ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, struct proc_dir_entry **parent, void *data); struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, struct proc_dir_entry *dp); extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *); extern int proc_readdir(struct file *, struct dir_context *); int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *); static inline void pde_get(struct proc_dir_entry *pde) { refcount_inc(&pde->refcnt); } extern void pde_put(struct proc_dir_entry *); static inline bool is_empty_pde(const struct proc_dir_entry *pde) { return S_ISDIR(pde->mode) && !pde->proc_iops; } extern ssize_t proc_simple_write(struct file *, const char __user *, size_t, loff_t *); /* * inode.c */ struct pde_opener { struct 
list_head lh; struct file *file; bool closing; struct completion *c; } __randomize_layout; extern const struct inode_operations proc_link_inode_operations; extern const struct inode_operations proc_pid_link_inode_operations; extern const struct super_operations proc_sops; void proc_init_kmemcache(void); void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock); void set_proc_pid_nlink(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); extern void proc_entry_rundown(struct proc_dir_entry *); /* * proc_namespaces.c */ extern const struct inode_operations proc_ns_dir_inode_operations; extern const struct file_operations proc_ns_dir_operations; /* * proc_net.c */ extern const struct file_operations proc_net_operations; extern const struct inode_operations proc_net_inode_operations; #ifdef CONFIG_NET extern int proc_net_init(void); #else static inline int proc_net_init(void) { return 0; } #endif /* * proc_self.c */ extern int proc_setup_self(struct super_block *); /* * proc_thread_self.c */ extern int proc_setup_thread_self(struct super_block *); extern void proc_thread_self_init(void); /* * proc_sysctl.c */ #ifdef CONFIG_PROC_SYSCTL extern int proc_sys_init(void); extern void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head); #else static inline void proc_sys_init(void) { } static inline void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head) { } #endif /* * proc_tty.c */ #ifdef CONFIG_TTY extern void proc_tty_init(void); #else static inline void proc_tty_init(void) {} #endif /* * root.c */ extern struct proc_dir_entry proc_root; extern void proc_self_init(void); extern unsigned self_inum, thread_self_inum; /* * task_[no]mmu.c */ struct mem_size_stats; struct proc_maps_locking_ctx { struct mm_struct *mm; #ifdef CONFIG_PER_VMA_LOCK bool mmap_locked; struct vm_area_struct *locked_vma; #endif }; struct proc_maps_private { struct inode *inode; struct task_struct *task; struct vma_iterator iter; loff_t last_pos; struct proc_maps_locking_ctx lock_ctx; #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; #endif } __randomize_layout; struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode); extern const struct file_operations proc_pid_maps_operations; extern const struct file_operations proc_pid_numa_maps_operations; extern const struct file_operations proc_pid_smaps_operations; extern const struct file_operations proc_pid_smaps_rollup_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; extern unsigned long task_vsize(struct mm_struct *); extern unsigned long task_statm(struct mm_struct *, unsigned long *, unsigned long *, unsigned long *, unsigned long *); extern void task_mem(struct seq_file *, struct mm_struct *); extern const struct dentry_operations proc_net_dentry_ops; static inline void pde_force_lookup(struct proc_dir_entry *pde) { /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */ pde->flags |= PROC_ENTRY_FORCE_LOOKUP; } /* * Add a new procfs dentry that can't serve as a mountpoint. That should * encompass anything that is ephemeral and can just disappear while the * process is still around. */ static inline struct dentry *proc_splice_unmountable(struct inode *inode, struct dentry *dentry, const struct dentry_operations *d_ops) { dont_mount(dentry); return d_splice_alias_ops(inode, dentry, d_ops); } |
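Much of this header is glue between VFS objects and procfs' own bookkeeping: struct proc_inode embeds the struct inode, and PROC_I()/PDE()/proc_pid() walk back from an inode to that state. The fragment below is only an illustrative sketch (not part of the header) of how a procfs file implementation typically uses those accessors; example_show() and the use of pde->data as the payload printed here are hypothetical.

/* Hypothetical ->single_show() style callback, for illustration only. */
static int example_show(struct seq_file *m, void *v)
{
	struct inode *inode = file_inode(m->file);
	struct proc_dir_entry *pde = PDE(inode);	/* PROC_I(inode)->pde */

	/* ->data was supplied when the entry was created (e.g. proc_create_data()). */
	seq_printf(m, "%s: %p\n", pde->name, pde->data);
	return 0;
}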
// SPDX-License-Identifier: GPL-2.0 /* * Implementation of the diskquota system for the LINUX operating system. QUOTA * is implemented using the BSD system call interface as the means of * communication with the user level. This file contains the generic routines * called by the different filesystems on allocation of an inode or block. * These routines take care of the administration needed to have a consistent * diskquota tracking system. The ideas of both user and group quotas are based * on the Melbourne quota system as used on BSD derived systems. The internal * implementation is based on one of the several variants of the LINUX * inode-subsystem with added complexity of the diskquota system. * * Author: Marco van Wieringen <mvw@planets.elm.net> * * Fixes: Dmitry Gorodchanin <pgmdsg@ibi.com>, 11 Feb 96 * * Revised list management to avoid races * -- Bill Hawes, <whawes@star.net>, 9/98 * * Fixed races in dquot_transfer(), dqget() and dquot_alloc_...(). * As the consequence the locking was moved from dquot_decr_...(), * dquot_incr_...() to calling functions. * invalidate_dquots() now writes modified dquots. * Serialized quota_off() and quota_on() for mount point. * Fixed a few bugs in grow_dquots(). * Fixed deadlock in write_dquot() - we no longer account quotas on * quota files * remove_dquot_ref() moved to inode.c - it now traverses through inodes * add_dquot_ref() restarts after blocking * Added check for bogus uid and fixed check for group in quotactl. * Jan Kara, <jack@suse.cz>, sponsored by SuSE CR, 10-11/99 * * Used struct list_head instead of own list struct * Invalidation of referenced dquots is no longer possible * Improved free_dquots list management * Quota and i_blocks are now updated in one place to avoid races * Warnings are now delayed so we won't block in critical section * Write updated not to require dquot lock * Jan Kara, <jack@suse.cz>, 9/2000 * * Added dynamic quota structure allocation * Jan Kara <jack@suse.cz> 12/2000 * * Rewritten quota interface. Implemented new quota format and * formats registering. * Jan Kara, <jack@suse.cz>, 2001,2002 * * New SMP locking. * Jan Kara, <jack@suse.cz>, 10/2002 * * Added journalled quota support, fix lock inversion problems * Jan Kara, <jack@suse.cz>, 2003,2004 * * (C) Copyright 1994 - 1997 Marco van Wieringen */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/mm.h> #include <linux/time.h> #include <linux/types.h> #include <linux/string.h> #include <linux/fcntl.h> #include <linux/stat.h> #include <linux/tty.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/sysctl.h> #include <linux/init.h> #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/kmod.h> #include <linux/namei.h> #include <linux/capability.h> #include <linux/quotaops.h> #include <linux/blkdev.h> #include <linux/sched/mm.h> #include <linux/uaccess.h> /* * There are five quota SMP locks: * * dq_list_lock protects all lists with quotas and quota formats.
* * dquot->dq_dqb_lock protects data from dq_dqb * * inode->i_lock protects inode->i_blocks, i_bytes and also guards * consistency of dquot->dq_dqb with inode->i_blocks, i_bytes so that * dquot_transfer() can stabilize the amount it transfers * * dq_data_lock protects mem_dqinfo structures and modifications of dquot * pointers in the inode * * dq_state_lock protects modifications of quota state (on quotaon and * quotaoff) and readers who care about latest values take it as well. * * The spinlock ordering is hence: * dq_data_lock > dq_list_lock > i_lock > dquot->dq_dqb_lock, * dq_list_lock > dq_state_lock * * Note that some things (e.g. sb pointer, type, id) don't change during * the life of the dquot structure and so needn't be protected by a lock * * Operations accessing dquots via inode pointers are protected by dquot_srcu. * Reading such a pointer needs srcu_read_lock(&dquot_srcu), and * synchronize_srcu(&dquot_srcu) is called after clearing pointers from * inode and before dropping dquot references to avoid use of dquots after * they are freed. dq_data_lock is used to serialize the pointer setting and * clearing operations. * Special care needs to be taken about the S_NOQUOTA inode flag (marking that * inode is a quota file). Functions adding pointers from inode to dquots have * to check this flag under dq_data_lock and then (if S_NOQUOTA is not set) they * have to do all pointer modifications before dropping dq_data_lock. This makes * sure they cannot race with quotaon which first sets the S_NOQUOTA flag and * then drops all pointers to dquots from an inode. * * Each dquot has its dq_lock mutex. A dquot is locked when it is being read into * memory (or space for it is being allocated) on the first dqget(), when it is * being written out, and when it is being released on the last dqput(). The * allocation and release operations are serialized by the dq_lock and by * checking the use count in dquot_release(). * * Lock ordering (including related VFS locks) is the following: * s_umount > i_mutex > journal_lock > dquot->dq_lock > dqio_sem */ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock); static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock); __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock); EXPORT_SYMBOL(dq_data_lock); DEFINE_STATIC_SRCU(dquot_srcu); static DECLARE_WAIT_QUEUE_HEAD(dquot_ref_wq); void __quota_error(struct super_block *sb, const char *func, const char *fmt, ...)
{ if (printk_ratelimit()) { va_list args; struct va_format vaf; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_ERR "Quota error (device %s): %s: %pV\n", sb->s_id, func, &vaf); va_end(args); } } EXPORT_SYMBOL(__quota_error); #if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING) static char *quotatypes[] = INITQFNAMES; #endif static struct quota_format_type *quota_formats; /* List of registered formats */ static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES; /* SLAB cache for dquot structures */ static struct kmem_cache *dquot_cachep; /* Workqueue for quota_release_work */ static struct workqueue_struct *quota_unbound_wq; void register_quota_format(struct quota_format_type *fmt) { spin_lock(&dq_list_lock); fmt->qf_next = quota_formats; quota_formats = fmt; spin_unlock(&dq_list_lock); } EXPORT_SYMBOL(register_quota_format); void unregister_quota_format(struct quota_format_type *fmt) { struct quota_format_type **actqf; spin_lock(&dq_list_lock); for (actqf = &quota_formats; *actqf && *actqf != fmt; actqf = &(*actqf)->qf_next) ; if (*actqf) *actqf = (*actqf)->qf_next; spin_unlock(&dq_list_lock); } EXPORT_SYMBOL(unregister_quota_format); static struct quota_format_type *find_quota_format(int id) { struct quota_format_type *actqf; spin_lock(&dq_list_lock); for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; actqf = actqf->qf_next) ; if (!actqf || !try_module_get(actqf->qf_owner)) { int qm; spin_unlock(&dq_list_lock); for (qm = 0; module_names[qm].qm_fmt_id && module_names[qm].qm_fmt_id != id; qm++) ; if (!module_names[qm].qm_fmt_id || request_module(module_names[qm].qm_mod_name)) return NULL; spin_lock(&dq_list_lock); for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; actqf = actqf->qf_next) ; if (actqf && !try_module_get(actqf->qf_owner)) actqf = NULL; } spin_unlock(&dq_list_lock); return actqf; } static void put_quota_format(struct quota_format_type *fmt) { module_put(fmt->qf_owner); } /* * Dquot List Management: * The quota code uses five lists for dquot management: the inuse_list, * releasing_dquots, free_dquots, dqi_dirty_list, and dquot_hash[] array. * A single dquot structure may be on some of those lists, depending on * its current state. * * All dquots are placed at the end of inuse_list when first created, and this * list is used for the invalidate operation, which must look at every dquot. * * When the last reference of a dquot is dropped, the dquot is added to * releasing_dquots. We'll then queue a work item which will call * synchronize_srcu() and after that perform the final cleanup of all the * dquots on the list. Each cleaned up dquot is moved to the free_dquots list. * Both releasing_dquots and free_dquots use the dq_free list_head in the dquot * struct. * * Unused and cleaned up dquots are in the free_dquots list and this list is * searched whenever we need an available dquot. Dquots are removed from the * list as soon as they are used again and dqstats.free_dquots gives the number * of dquots on the list. When a dquot is invalidated, it's completely released * from memory. * * Dirty dquots are added to the dqi_dirty_list of quota_info when marked * dirty, and this list is searched when writing dirty dquots back to the * quota file. Note that some filesystems do dirty dquot tracking on their * own (e.g. in a journal) and thus don't use dqi_dirty_list. * * Dquots with a specific identity (device, type and id) are placed on * one of the dquot_hash[] hash chains.
The provides an efficient search * mechanism to locate a specific dquot. */ static LIST_HEAD(inuse_list); static LIST_HEAD(free_dquots); static LIST_HEAD(releasing_dquots); static unsigned int dq_hash_bits, dq_hash_mask; static struct hlist_head *dquot_hash; struct dqstats dqstats; EXPORT_SYMBOL(dqstats); static qsize_t inode_get_rsv_space(struct inode *inode); static qsize_t __inode_get_rsv_space(struct inode *inode); static int __dquot_initialize(struct inode *inode, int type); static void quota_release_workfn(struct work_struct *work); static DECLARE_DELAYED_WORK(quota_release_work, quota_release_workfn); static inline unsigned int hashfn(const struct super_block *sb, struct kqid qid) { unsigned int id = from_kqid(&init_user_ns, qid); int type = qid.type; unsigned long tmp; tmp = (((unsigned long)sb>>L1_CACHE_SHIFT) ^ id) * (MAXQUOTAS - type); return (tmp + (tmp >> dq_hash_bits)) & dq_hash_mask; } /* * Following list functions expect dq_list_lock to be held */ static inline void insert_dquot_hash(struct dquot *dquot) { struct hlist_head *head; head = dquot_hash + hashfn(dquot->dq_sb, dquot->dq_id); hlist_add_head(&dquot->dq_hash, head); } static inline void remove_dquot_hash(struct dquot *dquot) { hlist_del_init(&dquot->dq_hash); } static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb, struct kqid qid) { struct dquot *dquot; hlist_for_each_entry(dquot, dquot_hash+hashent, dq_hash) if (dquot->dq_sb == sb && qid_eq(dquot->dq_id, qid)) return dquot; return NULL; } /* Add a dquot to the tail of the free list */ static inline void put_dquot_last(struct dquot *dquot) { list_add_tail(&dquot->dq_free, &free_dquots); dqstats_inc(DQST_FREE_DQUOTS); } static inline void put_releasing_dquots(struct dquot *dquot) { list_add_tail(&dquot->dq_free, &releasing_dquots); set_bit(DQ_RELEASING_B, &dquot->dq_flags); } static inline void remove_free_dquot(struct dquot *dquot) { if (list_empty(&dquot->dq_free)) return; list_del_init(&dquot->dq_free); if (!test_bit(DQ_RELEASING_B, &dquot->dq_flags)) dqstats_dec(DQST_FREE_DQUOTS); else clear_bit(DQ_RELEASING_B, &dquot->dq_flags); } static inline void put_inuse(struct dquot *dquot) { /* We add to the back of inuse list so we don't have to restart * when traversing this list and we block */ list_add_tail(&dquot->dq_inuse, &inuse_list); dqstats_inc(DQST_ALLOC_DQUOTS); } static inline void remove_inuse(struct dquot *dquot) { dqstats_dec(DQST_ALLOC_DQUOTS); list_del(&dquot->dq_inuse); } /* * End of list functions needing dq_list_lock */ static void wait_on_dquot(struct dquot *dquot) { mutex_lock(&dquot->dq_lock); mutex_unlock(&dquot->dq_lock); } static inline int dquot_active(struct dquot *dquot) { return test_bit(DQ_ACTIVE_B, &dquot->dq_flags); } static struct dquot *__dqgrab(struct dquot *dquot) { lockdep_assert_held(&dq_list_lock); if (!atomic_read(&dquot->dq_count)) remove_free_dquot(dquot); atomic_inc(&dquot->dq_count); return dquot; } /* * Get reference to dquot when we got pointer to it by some other means. The * dquot has to be active and the caller has to make sure it cannot get * deactivated under our hands. 
*/ struct dquot *dqgrab(struct dquot *dquot) { spin_lock(&dq_list_lock); WARN_ON_ONCE(!dquot_active(dquot)); dquot = __dqgrab(dquot); spin_unlock(&dq_list_lock); return dquot; } EXPORT_SYMBOL_GPL(dqgrab); static inline int dquot_dirty(struct dquot *dquot) { return test_bit(DQ_MOD_B, &dquot->dq_flags); } static inline int mark_dquot_dirty(struct dquot *dquot) { return dquot->dq_sb->dq_op->mark_dirty(dquot); } /* Mark dquot dirty in atomic manner, and return it's old dirty flag state */ int dquot_mark_dquot_dirty(struct dquot *dquot) { int ret = 1; if (!dquot_active(dquot)) return 0; if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY) return test_and_set_bit(DQ_MOD_B, &dquot->dq_flags); /* If quota is dirty already, we don't have to acquire dq_list_lock */ if (dquot_dirty(dquot)) return 1; spin_lock(&dq_list_lock); if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) { list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)-> info[dquot->dq_id.type].dqi_dirty_list); ret = 0; } spin_unlock(&dq_list_lock); return ret; } EXPORT_SYMBOL(dquot_mark_dquot_dirty); /* Dirtify all the dquots - this can block when journalling */ static inline int mark_all_dquot_dirty(struct dquot __rcu * const *dquots) { int ret, err, cnt; struct dquot *dquot; ret = err = 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (dquot) /* Even in case of error we have to continue */ ret = mark_dquot_dirty(dquot); if (!err && ret < 0) err = ret; } return err; } static inline void dqput_all(struct dquot **dquot) { unsigned int cnt; for (cnt = 0; cnt < MAXQUOTAS; cnt++) dqput(dquot[cnt]); } static inline int clear_dquot_dirty(struct dquot *dquot) { if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY) return test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags); spin_lock(&dq_list_lock); if (!test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags)) { spin_unlock(&dq_list_lock); return 0; } list_del_init(&dquot->dq_dirty); spin_unlock(&dq_list_lock); return 1; } void mark_info_dirty(struct super_block *sb, int type) { spin_lock(&dq_data_lock); sb_dqopt(sb)->info[type].dqi_flags |= DQF_INFO_DIRTY; spin_unlock(&dq_data_lock); } EXPORT_SYMBOL(mark_info_dirty); /* * Read dquot from disk and alloc space for it */ int dquot_acquire(struct dquot *dquot) { int ret = 0, ret2 = 0; unsigned int memalloc; struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); mutex_lock(&dquot->dq_lock); memalloc = memalloc_nofs_save(); if (!test_bit(DQ_READ_B, &dquot->dq_flags)) { ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot); if (ret < 0) goto out_iolock; } /* Make sure flags update is visible after dquot has been filled */ smp_mb__before_atomic(); set_bit(DQ_READ_B, &dquot->dq_flags); /* Instantiate dquot if needed */ if (!dquot_active(dquot) && !dquot->dq_off) { ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot); /* Write the info if needed */ if (info_dirty(&dqopt->info[dquot->dq_id.type])) { ret2 = dqopt->ops[dquot->dq_id.type]->write_file_info( dquot->dq_sb, dquot->dq_id.type); } if (ret < 0) goto out_iolock; if (ret2 < 0) { ret = ret2; goto out_iolock; } } /* * Make sure flags update is visible after on-disk struct has been * allocated. Paired with smp_rmb() in dqget(). 
*/ smp_mb__before_atomic(); set_bit(DQ_ACTIVE_B, &dquot->dq_flags); out_iolock: memalloc_nofs_restore(memalloc); mutex_unlock(&dquot->dq_lock); return ret; } EXPORT_SYMBOL(dquot_acquire); /* * Write dquot to disk */ int dquot_commit(struct dquot *dquot) { int ret = 0; unsigned int memalloc; struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); mutex_lock(&dquot->dq_lock); memalloc = memalloc_nofs_save(); if (!clear_dquot_dirty(dquot)) goto out_lock; /* Inactive dquot can be only if there was error during read/init * => we have better not writing it */ if (dquot_active(dquot)) ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot); else ret = -EIO; out_lock: memalloc_nofs_restore(memalloc); mutex_unlock(&dquot->dq_lock); return ret; } EXPORT_SYMBOL(dquot_commit); /* * Release dquot */ int dquot_release(struct dquot *dquot) { int ret = 0, ret2 = 0; unsigned int memalloc; struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); mutex_lock(&dquot->dq_lock); memalloc = memalloc_nofs_save(); /* Check whether we are not racing with some other dqget() */ if (dquot_is_busy(dquot)) goto out_dqlock; if (dqopt->ops[dquot->dq_id.type]->release_dqblk) { ret = dqopt->ops[dquot->dq_id.type]->release_dqblk(dquot); /* Write the info */ if (info_dirty(&dqopt->info[dquot->dq_id.type])) { ret2 = dqopt->ops[dquot->dq_id.type]->write_file_info( dquot->dq_sb, dquot->dq_id.type); } if (ret >= 0) ret = ret2; } clear_bit(DQ_ACTIVE_B, &dquot->dq_flags); out_dqlock: memalloc_nofs_restore(memalloc); mutex_unlock(&dquot->dq_lock); return ret; } EXPORT_SYMBOL(dquot_release); void dquot_destroy(struct dquot *dquot) { kmem_cache_free(dquot_cachep, dquot); } EXPORT_SYMBOL(dquot_destroy); static inline void do_destroy_dquot(struct dquot *dquot) { dquot->dq_sb->dq_op->destroy_dquot(dquot); } /* Invalidate all dquots on the list. Note that this function is called after * quota is disabled and pointers from inodes removed so there cannot be new * quota users. There can still be some users of quotas due to inodes being * just deleted or pruned by prune_icache() (those are not attached to any * list) or parallel quotactl call. We have to wait for such users. */ static void invalidate_dquots(struct super_block *sb, int type) { struct dquot *dquot, *tmp; restart: flush_delayed_work("a_release_work); spin_lock(&dq_list_lock); list_for_each_entry_safe(dquot, tmp, &inuse_list, dq_inuse) { if (dquot->dq_sb != sb) continue; if (dquot->dq_id.type != type) continue; /* Wait for dquot users */ if (atomic_read(&dquot->dq_count)) { atomic_inc(&dquot->dq_count); spin_unlock(&dq_list_lock); /* * Once dqput() wakes us up, we know it's time to free * the dquot. * IMPORTANT: we rely on the fact that there is always * at most one process waiting for dquot to free. * Otherwise dq_count would be > 1 and we would never * wake up. */ wait_event(dquot_ref_wq, atomic_read(&dquot->dq_count) == 1); dqput(dquot); /* At this moment dquot() need not exist (it could be * reclaimed by prune_dqcache(). Hence we must * restart. */ goto restart; } /* * The last user already dropped its reference but dquot didn't * get fully cleaned up yet. Restart the scan which flushes the * work cleaning up released dquots. 
*/ if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) { spin_unlock(&dq_list_lock); goto restart; } /* * Quota now has no users and it has been written on last * dqput() */ remove_dquot_hash(dquot); remove_free_dquot(dquot); remove_inuse(dquot); do_destroy_dquot(dquot); } spin_unlock(&dq_list_lock); } /* Call callback for every active dquot on given filesystem */ int dquot_scan_active(struct super_block *sb, int (*fn)(struct dquot *dquot, unsigned long priv), unsigned long priv) { struct dquot *dquot, *old_dquot = NULL; int ret = 0; WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount)); spin_lock(&dq_list_lock); list_for_each_entry(dquot, &inuse_list, dq_inuse) { if (!dquot_active(dquot)) continue; if (dquot->dq_sb != sb) continue; __dqgrab(dquot); spin_unlock(&dq_list_lock); dqput(old_dquot); old_dquot = dquot; /* * ->release_dquot() can be racing with us. Our reference * protects us from dquot_release() proceeding so just wait for * any outstanding call and recheck the DQ_ACTIVE_B after that. */ wait_on_dquot(dquot); if (dquot_active(dquot)) { ret = fn(dquot, priv); if (ret < 0) goto out; } spin_lock(&dq_list_lock); /* We are safe to continue now because our dquot could not * be moved out of the inuse list while we hold the reference */ } spin_unlock(&dq_list_lock); out: dqput(old_dquot); return ret; } EXPORT_SYMBOL(dquot_scan_active); static inline int dquot_write_dquot(struct dquot *dquot) { int ret = dquot->dq_sb->dq_op->write_dquot(dquot); if (ret < 0) { quota_error(dquot->dq_sb, "Can't write quota structure " "(error %d). Quota may get out of sync!", ret); /* Clear dirty bit anyway to avoid infinite loop. */ clear_dquot_dirty(dquot); } return ret; } /* Write all dquot structures to quota files */ int dquot_writeback_dquots(struct super_block *sb, int type) { struct list_head dirty; struct dquot *dquot; struct quota_info *dqopt = sb_dqopt(sb); int cnt; int err, ret = 0; WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount)); flush_delayed_work("a_release_work); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; if (!sb_has_quota_active(sb, cnt)) continue; spin_lock(&dq_list_lock); /* Move list away to avoid livelock. 
*/ list_replace_init(&dqopt->info[cnt].dqi_dirty_list, &dirty); while (!list_empty(&dirty)) { dquot = list_first_entry(&dirty, struct dquot, dq_dirty); WARN_ON(!dquot_active(dquot)); /* If the dquot is releasing we should not touch it */ if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) { spin_unlock(&dq_list_lock); flush_delayed_work("a_release_work); spin_lock(&dq_list_lock); continue; } /* Now we have active dquot from which someone is * holding reference so we can safely just increase * use count */ __dqgrab(dquot); spin_unlock(&dq_list_lock); err = dquot_write_dquot(dquot); if (err && !ret) ret = err; dqput(dquot); spin_lock(&dq_list_lock); } spin_unlock(&dq_list_lock); } for (cnt = 0; cnt < MAXQUOTAS; cnt++) if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt) && info_dirty(&dqopt->info[cnt])) sb->dq_op->write_info(sb, cnt); dqstats_inc(DQST_SYNCS); return ret; } EXPORT_SYMBOL(dquot_writeback_dquots); /* Write all dquot structures to disk and make them visible from userspace */ int dquot_quota_sync(struct super_block *sb, int type) { struct quota_info *dqopt = sb_dqopt(sb); int cnt; int ret; ret = dquot_writeback_dquots(sb, type); if (ret) return ret; if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) return 0; /* This is not very clever (and fast) but currently I don't know about * any other simple way of getting quota data to disk and we must get * them there for userspace to be visible... */ if (sb->s_op->sync_fs) { ret = sb->s_op->sync_fs(sb, 1); if (ret) return ret; } ret = sync_blockdev(sb->s_bdev); if (ret) return ret; /* * Now when everything is written we can discard the pagecache so * that userspace sees the changes. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; if (!sb_has_quota_active(sb, cnt)) continue; inode_lock(dqopt->files[cnt]); truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); inode_unlock(dqopt->files[cnt]); } return 0; } EXPORT_SYMBOL(dquot_quota_sync); static unsigned long dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { struct dquot *dquot; unsigned long freed = 0; spin_lock(&dq_list_lock); while (!list_empty(&free_dquots) && sc->nr_to_scan) { dquot = list_first_entry(&free_dquots, struct dquot, dq_free); remove_dquot_hash(dquot); remove_free_dquot(dquot); remove_inuse(dquot); do_destroy_dquot(dquot); sc->nr_to_scan--; freed++; } spin_unlock(&dq_list_lock); return freed; } static unsigned long dqcache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { return vfs_pressure_ratio( percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS])); } /* * Safely release dquot and put reference to dquot. */ static void quota_release_workfn(struct work_struct *work) { struct dquot *dquot; struct list_head rls_head; spin_lock(&dq_list_lock); /* Exchange the list head to avoid livelock. */ list_replace_init(&releasing_dquots, &rls_head); spin_unlock(&dq_list_lock); synchronize_srcu(&dquot_srcu); restart: spin_lock(&dq_list_lock); while (!list_empty(&rls_head)) { dquot = list_first_entry(&rls_head, struct dquot, dq_free); WARN_ON_ONCE(atomic_read(&dquot->dq_count)); /* * Note that DQ_RELEASING_B protects us from racing with * invalidate_dquots() calls so we are safe to work with the * dquot even after we drop dq_list_lock. 
*/ if (dquot_dirty(dquot)) { spin_unlock(&dq_list_lock); /* Commit dquot before releasing */ dquot_write_dquot(dquot); goto restart; } if (dquot_active(dquot)) { spin_unlock(&dq_list_lock); dquot->dq_sb->dq_op->release_dquot(dquot); goto restart; } /* Dquot is inactive and clean, now move it to free list */ remove_free_dquot(dquot); put_dquot_last(dquot); } spin_unlock(&dq_list_lock); } /* * Put reference to dquot */ void dqput(struct dquot *dquot) { if (!dquot) return; #ifdef CONFIG_QUOTA_DEBUG if (!atomic_read(&dquot->dq_count)) { quota_error(dquot->dq_sb, "trying to free free dquot of %s %d", quotatypes[dquot->dq_id.type], from_kqid(&init_user_ns, dquot->dq_id)); BUG(); } #endif dqstats_inc(DQST_DROPS); spin_lock(&dq_list_lock); if (atomic_read(&dquot->dq_count) > 1) { /* We have more than one user... nothing to do */ atomic_dec(&dquot->dq_count); /* Releasing dquot during quotaoff phase? */ if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_id.type) && atomic_read(&dquot->dq_count) == 1) wake_up(&dquot_ref_wq); spin_unlock(&dq_list_lock); return; } /* Need to release dquot? */ WARN_ON_ONCE(!list_empty(&dquot->dq_free)); put_releasing_dquots(dquot); atomic_dec(&dquot->dq_count); spin_unlock(&dq_list_lock); queue_delayed_work(quota_unbound_wq, "a_release_work, 1); } EXPORT_SYMBOL(dqput); struct dquot *dquot_alloc(struct super_block *sb, int type) { return kmem_cache_zalloc(dquot_cachep, GFP_NOFS); } EXPORT_SYMBOL(dquot_alloc); static struct dquot *get_empty_dquot(struct super_block *sb, int type) { struct dquot *dquot; dquot = sb->dq_op->alloc_dquot(sb, type); if(!dquot) return NULL; mutex_init(&dquot->dq_lock); INIT_LIST_HEAD(&dquot->dq_free); INIT_LIST_HEAD(&dquot->dq_inuse); INIT_HLIST_NODE(&dquot->dq_hash); INIT_LIST_HEAD(&dquot->dq_dirty); dquot->dq_sb = sb; dquot->dq_id = make_kqid_invalid(type); atomic_set(&dquot->dq_count, 1); spin_lock_init(&dquot->dq_dqb_lock); return dquot; } /* * Get reference to dquot * * Locking is slightly tricky here. We are guarded from parallel quotaoff() * destroying our dquot by: * a) checking for quota flags under dq_list_lock and * b) getting a reference to dquot before we release dq_list_lock */ struct dquot *dqget(struct super_block *sb, struct kqid qid) { unsigned int hashent = hashfn(sb, qid); struct dquot *dquot, *empty = NULL; if (!qid_has_mapping(sb->s_user_ns, qid)) return ERR_PTR(-EINVAL); if (!sb_has_quota_active(sb, qid.type)) return ERR_PTR(-ESRCH); we_slept: spin_lock(&dq_list_lock); spin_lock(&dq_state_lock); if (!sb_has_quota_active(sb, qid.type)) { spin_unlock(&dq_state_lock); spin_unlock(&dq_list_lock); dquot = ERR_PTR(-ESRCH); goto out; } spin_unlock(&dq_state_lock); dquot = find_dquot(hashent, sb, qid); if (!dquot) { if (!empty) { spin_unlock(&dq_list_lock); empty = get_empty_dquot(sb, qid.type); if (!empty) schedule(); /* Try to wait for a moment... 
*/ goto we_slept; } dquot = empty; empty = NULL; dquot->dq_id = qid; /* all dquots go on the inuse_list */ put_inuse(dquot); /* hash it first so it can be found */ insert_dquot_hash(dquot); spin_unlock(&dq_list_lock); dqstats_inc(DQST_LOOKUPS); } else { __dqgrab(dquot); spin_unlock(&dq_list_lock); dqstats_inc(DQST_CACHE_HITS); dqstats_inc(DQST_LOOKUPS); } /* Wait for dq_lock - after this we know that either dquot_release() is * already finished or it will be canceled due to dq_count > 0 test */ wait_on_dquot(dquot); /* Read the dquot / allocate space in quota file */ if (!dquot_active(dquot)) { int err; err = sb->dq_op->acquire_dquot(dquot); if (err < 0) { dqput(dquot); dquot = ERR_PTR(err); goto out; } } /* * Make sure following reads see filled structure - paired with * smp_mb__before_atomic() in dquot_acquire(). */ smp_rmb(); /* Has somebody invalidated entry under us? */ WARN_ON_ONCE(hlist_unhashed(&dquot->dq_hash)); out: if (empty) do_destroy_dquot(empty); return dquot; } EXPORT_SYMBOL(dqget); static inline struct dquot __rcu **i_dquot(struct inode *inode) { return inode->i_sb->s_op->get_dquots(inode); } static int dqinit_needed(struct inode *inode, int type) { struct dquot __rcu * const *dquots; int cnt; if (IS_NOQUOTA(inode)) return 0; dquots = i_dquot(inode); if (type != -1) return !dquots[type]; for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (!dquots[cnt]) return 1; return 0; } /* This routine is guarded by s_umount semaphore */ static int add_dquot_ref(struct super_block *sb, int type) { struct inode *inode, *old_inode = NULL; #ifdef CONFIG_QUOTA_DEBUG int reserved = 0; #endif int err = 0; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); if ((inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) || !atomic_read(&inode->i_writecount) || !dqinit_needed(inode, type)) { spin_unlock(&inode->i_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); #ifdef CONFIG_QUOTA_DEBUG if (unlikely(inode_get_rsv_space(inode) > 0)) reserved = 1; #endif iput(old_inode); err = __dquot_initialize(inode, type); if (err) { iput(inode); goto out; } /* * We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the * s_inode_list_lock. We cannot iput the inode now as we can be * holding the last reference and we cannot iput it under * s_inode_list_lock. So we keep the reference and iput it * later. */ old_inode = inode; cond_resched(); spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); iput(old_inode); out: #ifdef CONFIG_QUOTA_DEBUG if (reserved) { quota_error(sb, "Writes happened before quota was turned on " "thus quota information is probably inconsistent. " "Please run quotacheck(8)"); } #endif return err; } static void remove_dquot_ref(struct super_block *sb, int type) { struct inode *inode; #ifdef CONFIG_QUOTA_DEBUG int reserved = 0; #endif spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { /* * We have to scan also I_NEW inodes because they can already * have quota pointer initialized. Luckily, we need to touch * only quota pointers and these have separate locking * (dq_data_lock). 
*/ spin_lock(&dq_data_lock); if (!IS_NOQUOTA(inode)) { struct dquot __rcu **dquots = i_dquot(inode); struct dquot *dquot = srcu_dereference_check( dquots[type], &dquot_srcu, lockdep_is_held(&dq_data_lock)); #ifdef CONFIG_QUOTA_DEBUG if (unlikely(inode_get_rsv_space(inode) > 0)) reserved = 1; #endif rcu_assign_pointer(dquots[type], NULL); if (dquot) dqput(dquot); } spin_unlock(&dq_data_lock); } spin_unlock(&sb->s_inode_list_lock); #ifdef CONFIG_QUOTA_DEBUG if (reserved) { printk(KERN_WARNING "VFS (%s): Writes happened after quota" " was disabled thus quota information is probably " "inconsistent. Please run quotacheck(8).\n", sb->s_id); } #endif } /* Gather all references from inodes and drop them */ static void drop_dquot_ref(struct super_block *sb, int type) { if (sb->dq_op) remove_dquot_ref(sb, type); } static inline void dquot_free_reserved_space(struct dquot *dquot, qsize_t number) { if (dquot->dq_dqb.dqb_rsvspace >= number) dquot->dq_dqb.dqb_rsvspace -= number; else { WARN_ON_ONCE(1); dquot->dq_dqb.dqb_rsvspace = 0; } if (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace <= dquot->dq_dqb.dqb_bsoftlimit) dquot->dq_dqb.dqb_btime = (time64_t) 0; clear_bit(DQ_BLKS_B, &dquot->dq_flags); } static void dquot_decr_inodes(struct dquot *dquot, qsize_t number) { if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE || dquot->dq_dqb.dqb_curinodes >= number) dquot->dq_dqb.dqb_curinodes -= number; else dquot->dq_dqb.dqb_curinodes = 0; if (dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit) dquot->dq_dqb.dqb_itime = (time64_t) 0; clear_bit(DQ_INODES_B, &dquot->dq_flags); } static void dquot_decr_space(struct dquot *dquot, qsize_t number) { if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE || dquot->dq_dqb.dqb_curspace >= number) dquot->dq_dqb.dqb_curspace -= number; else dquot->dq_dqb.dqb_curspace = 0; if (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace <= dquot->dq_dqb.dqb_bsoftlimit) dquot->dq_dqb.dqb_btime = (time64_t) 0; clear_bit(DQ_BLKS_B, &dquot->dq_flags); } struct dquot_warn { struct super_block *w_sb; struct kqid w_dq_id; short w_type; }; static int warning_issued(struct dquot *dquot, const int warntype) { int flag = (warntype == QUOTA_NL_BHARDWARN || warntype == QUOTA_NL_BSOFTLONGWARN) ? DQ_BLKS_B : ((warntype == QUOTA_NL_IHARDWARN || warntype == QUOTA_NL_ISOFTLONGWARN) ? 
DQ_INODES_B : 0); if (!flag) return 0; return test_and_set_bit(flag, &dquot->dq_flags); } #ifdef CONFIG_PRINT_QUOTA_WARNING static int flag_print_warnings = 1; static int need_print_warning(struct dquot_warn *warn) { if (!flag_print_warnings) return 0; switch (warn->w_dq_id.type) { case USRQUOTA: return uid_eq(current_fsuid(), warn->w_dq_id.uid); case GRPQUOTA: return in_group_p(warn->w_dq_id.gid); case PRJQUOTA: return 1; } return 0; } /* Print warning to user which exceeded quota */ static void print_warning(struct dquot_warn *warn) { char *msg = NULL; struct tty_struct *tty; int warntype = warn->w_type; if (warntype == QUOTA_NL_IHARDBELOW || warntype == QUOTA_NL_ISOFTBELOW || warntype == QUOTA_NL_BHARDBELOW || warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(warn)) return; tty = get_current_tty(); if (!tty) return; tty_write_message(tty, warn->w_sb->s_id); if (warntype == QUOTA_NL_ISOFTWARN || warntype == QUOTA_NL_BSOFTWARN) tty_write_message(tty, ": warning, "); else tty_write_message(tty, ": write failed, "); tty_write_message(tty, quotatypes[warn->w_dq_id.type]); switch (warntype) { case QUOTA_NL_IHARDWARN: msg = " file limit reached.\r\n"; break; case QUOTA_NL_ISOFTLONGWARN: msg = " file quota exceeded too long.\r\n"; break; case QUOTA_NL_ISOFTWARN: msg = " file quota exceeded.\r\n"; break; case QUOTA_NL_BHARDWARN: msg = " block limit reached.\r\n"; break; case QUOTA_NL_BSOFTLONGWARN: msg = " block quota exceeded too long.\r\n"; break; case QUOTA_NL_BSOFTWARN: msg = " block quota exceeded.\r\n"; break; } tty_write_message(tty, msg); tty_kref_put(tty); } #endif static void prepare_warning(struct dquot_warn *warn, struct dquot *dquot, int warntype) { if (warning_issued(dquot, warntype)) return; warn->w_type = warntype; warn->w_sb = dquot->dq_sb; warn->w_dq_id = dquot->dq_id; } /* * Write warnings to the console and send warning messages over netlink. * * Note that this function can call into tty and networking code. 
*/ static void flush_warnings(struct dquot_warn *warn) { int i; for (i = 0; i < MAXQUOTAS; i++) { if (warn[i].w_type == QUOTA_NL_NOWARN) continue; #ifdef CONFIG_PRINT_QUOTA_WARNING print_warning(&warn[i]); #endif quota_send_warning(warn[i].w_dq_id, warn[i].w_sb->s_dev, warn[i].w_type); } } static int ignore_hardlimit(struct dquot *dquot) { struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; return capable(CAP_SYS_RESOURCE) && (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || !(info->dqi_flags & DQF_ROOT_SQUASH)); } static int dquot_add_inodes(struct dquot *dquot, qsize_t inodes, struct dquot_warn *warn) { qsize_t newinodes; int ret = 0; spin_lock(&dquot->dq_dqb_lock); newinodes = dquot->dq_dqb.dqb_curinodes + inodes; if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) goto add; if (dquot->dq_dqb.dqb_ihardlimit && newinodes > dquot->dq_dqb.dqb_ihardlimit && !ignore_hardlimit(dquot)) { prepare_warning(warn, dquot, QUOTA_NL_IHARDWARN); ret = -EDQUOT; goto out; } if (dquot->dq_dqb.dqb_isoftlimit && newinodes > dquot->dq_dqb.dqb_isoftlimit && dquot->dq_dqb.dqb_itime && ktime_get_real_seconds() >= dquot->dq_dqb.dqb_itime && !ignore_hardlimit(dquot)) { prepare_warning(warn, dquot, QUOTA_NL_ISOFTLONGWARN); ret = -EDQUOT; goto out; } if (dquot->dq_dqb.dqb_isoftlimit && newinodes > dquot->dq_dqb.dqb_isoftlimit && dquot->dq_dqb.dqb_itime == 0) { prepare_warning(warn, dquot, QUOTA_NL_ISOFTWARN); dquot->dq_dqb.dqb_itime = ktime_get_real_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type].dqi_igrace; } add: dquot->dq_dqb.dqb_curinodes = newinodes; out: spin_unlock(&dquot->dq_dqb_lock); return ret; } static int dquot_add_space(struct dquot *dquot, qsize_t space, qsize_t rsv_space, unsigned int flags, struct dquot_warn *warn) { qsize_t tspace; struct super_block *sb = dquot->dq_sb; int ret = 0; spin_lock(&dquot->dq_dqb_lock); if (!sb_has_quota_limits_enabled(sb, dquot->dq_id.type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) goto finish; tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace + space + rsv_space; if (dquot->dq_dqb.dqb_bhardlimit && tspace > dquot->dq_dqb.dqb_bhardlimit && !ignore_hardlimit(dquot)) { if (flags & DQUOT_SPACE_WARN) prepare_warning(warn, dquot, QUOTA_NL_BHARDWARN); ret = -EDQUOT; goto finish; } if (dquot->dq_dqb.dqb_bsoftlimit && tspace > dquot->dq_dqb.dqb_bsoftlimit && dquot->dq_dqb.dqb_btime && ktime_get_real_seconds() >= dquot->dq_dqb.dqb_btime && !ignore_hardlimit(dquot)) { if (flags & DQUOT_SPACE_WARN) prepare_warning(warn, dquot, QUOTA_NL_BSOFTLONGWARN); ret = -EDQUOT; goto finish; } if (dquot->dq_dqb.dqb_bsoftlimit && tspace > dquot->dq_dqb.dqb_bsoftlimit && dquot->dq_dqb.dqb_btime == 0) { if (flags & DQUOT_SPACE_WARN) { prepare_warning(warn, dquot, QUOTA_NL_BSOFTWARN); dquot->dq_dqb.dqb_btime = ktime_get_real_seconds() + sb_dqopt(sb)->info[dquot->dq_id.type].dqi_bgrace; } else { /* * We don't allow preallocation to exceed softlimit so exceeding will * be always printed */ ret = -EDQUOT; goto finish; } } finish: /* * We have to be careful and go through warning generation & grace time * setting even if DQUOT_SPACE_NOFAIL is set. That's why we check it * only here... 
*/ if (flags & DQUOT_SPACE_NOFAIL) ret = 0; if (!ret) { dquot->dq_dqb.dqb_rsvspace += rsv_space; dquot->dq_dqb.dqb_curspace += space; } spin_unlock(&dquot->dq_dqb_lock); return ret; } static int info_idq_free(struct dquot *dquot, qsize_t inodes) { qsize_t newinodes; if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit || !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type)) return QUOTA_NL_NOWARN; newinodes = dquot->dq_dqb.dqb_curinodes - inodes; if (newinodes <= dquot->dq_dqb.dqb_isoftlimit) return QUOTA_NL_ISOFTBELOW; if (dquot->dq_dqb.dqb_curinodes >= dquot->dq_dqb.dqb_ihardlimit && newinodes < dquot->dq_dqb.dqb_ihardlimit) return QUOTA_NL_IHARDBELOW; return QUOTA_NL_NOWARN; } static int info_bdq_free(struct dquot *dquot, qsize_t space) { qsize_t tspace; tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace; if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || tspace <= dquot->dq_dqb.dqb_bsoftlimit) return QUOTA_NL_NOWARN; if (tspace - space <= dquot->dq_dqb.dqb_bsoftlimit) return QUOTA_NL_BSOFTBELOW; if (tspace >= dquot->dq_dqb.dqb_bhardlimit && tspace - space < dquot->dq_dqb.dqb_bhardlimit) return QUOTA_NL_BHARDBELOW; return QUOTA_NL_NOWARN; } static int inode_quota_active(const struct inode *inode) { struct super_block *sb = inode->i_sb; if (IS_NOQUOTA(inode)) return 0; return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb); } /* * Initialize quota pointers in inode * * It is better to call this function outside of any transaction as it * might need a lot of space in journal for dquot structure allocation. */ static int __dquot_initialize(struct inode *inode, int type) { int cnt, init_needed = 0; struct dquot __rcu **dquots; struct dquot *got[MAXQUOTAS] = {}; struct super_block *sb = inode->i_sb; qsize_t rsv; int ret = 0; if (!inode_quota_active(inode)) return 0; dquots = i_dquot(inode); /* First get references to structures we might need. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { struct kqid qid; kprojid_t projid; int rc; struct dquot *dquot; if (type != -1 && cnt != type) continue; /* * The i_dquot should have been initialized in most cases, * we check it without locking here to avoid unnecessary * dqget()/dqput() calls. */ if (dquots[cnt]) continue; if (!sb_has_quota_active(sb, cnt)) continue; init_needed = 1; switch (cnt) { case USRQUOTA: qid = make_kqid_uid(inode->i_uid); break; case GRPQUOTA: qid = make_kqid_gid(inode->i_gid); break; case PRJQUOTA: rc = inode->i_sb->dq_op->get_projid(inode, &projid); if (rc) continue; qid = make_kqid_projid(projid); break; } dquot = dqget(sb, qid); if (IS_ERR(dquot)) { /* We raced with somebody turning quotas off... 
*/ if (PTR_ERR(dquot) != -ESRCH) { ret = PTR_ERR(dquot); goto out_put; } dquot = NULL; } got[cnt] = dquot; } /* All required i_dquot has been initialized */ if (!init_needed) return 0; spin_lock(&dq_data_lock); if (IS_NOQUOTA(inode)) goto out_lock; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; /* Avoid races with quotaoff() */ if (!sb_has_quota_active(sb, cnt)) continue; /* We could race with quotaon or dqget() could have failed */ if (!got[cnt]) continue; if (!dquots[cnt]) { rcu_assign_pointer(dquots[cnt], got[cnt]); got[cnt] = NULL; /* * Make quota reservation system happy if someone * did a write before quota was turned on */ rsv = inode_get_rsv_space(inode); if (unlikely(rsv)) { struct dquot *dquot = srcu_dereference_check( dquots[cnt], &dquot_srcu, lockdep_is_held(&dq_data_lock)); spin_lock(&inode->i_lock); /* Get reservation again under proper lock */ rsv = __inode_get_rsv_space(inode); spin_lock(&dquot->dq_dqb_lock); dquot->dq_dqb.dqb_rsvspace += rsv; spin_unlock(&dquot->dq_dqb_lock); spin_unlock(&inode->i_lock); } } } out_lock: spin_unlock(&dq_data_lock); out_put: /* Drop unused references */ dqput_all(got); return ret; } int dquot_initialize(struct inode *inode) { return __dquot_initialize(inode, -1); } EXPORT_SYMBOL(dquot_initialize); bool dquot_initialize_needed(struct inode *inode) { struct dquot __rcu **dquots; int i; if (!inode_quota_active(inode)) return false; dquots = i_dquot(inode); for (i = 0; i < MAXQUOTAS; i++) if (!dquots[i] && sb_has_quota_active(inode->i_sb, i)) return true; return false; } EXPORT_SYMBOL(dquot_initialize_needed); /* * Release all quotas referenced by inode. * * This function only be called on inode free or converting * a file to quota file, no other users for the i_dquot in * both cases, so we needn't call synchronize_srcu() after * clearing i_dquot. */ static void __dquot_drop(struct inode *inode) { int cnt; struct dquot __rcu **dquots = i_dquot(inode); struct dquot *put[MAXQUOTAS]; spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { put[cnt] = srcu_dereference_check(dquots[cnt], &dquot_srcu, lockdep_is_held(&dq_data_lock)); rcu_assign_pointer(dquots[cnt], NULL); } spin_unlock(&dq_data_lock); dqput_all(put); } void dquot_drop(struct inode *inode) { struct dquot __rcu * const *dquots; int cnt; if (IS_NOQUOTA(inode)) return; /* * Test before calling to rule out calls from proc and such * where we are not allowed to block. Note that this is * actually reliable test even without the lock - the caller * must assure that nobody can come after the DQUOT_DROP and * add quota pointers back anyway. */ dquots = i_dquot(inode); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (dquots[cnt]) break; } if (cnt < MAXQUOTAS) __dquot_drop(inode); } EXPORT_SYMBOL(dquot_drop); /* * inode_reserved_space is managed internally by quota, and protected by * i_lock similar to i_blocks+i_bytes. 
*/ static qsize_t *inode_reserved_space(struct inode * inode) { /* Filesystem must explicitly define it's own method in order to use * quota reservation interface */ BUG_ON(!inode->i_sb->dq_op->get_reserved_space); return inode->i_sb->dq_op->get_reserved_space(inode); } static qsize_t __inode_get_rsv_space(struct inode *inode) { if (!inode->i_sb->dq_op->get_reserved_space) return 0; return *inode_reserved_space(inode); } static qsize_t inode_get_rsv_space(struct inode *inode) { qsize_t ret; if (!inode->i_sb->dq_op->get_reserved_space) return 0; spin_lock(&inode->i_lock); ret = __inode_get_rsv_space(inode); spin_unlock(&inode->i_lock); return ret; } /* * This functions updates i_blocks+i_bytes fields and quota information * (together with appropriate checks). * * NOTE: We absolutely rely on the fact that caller dirties the inode * (usually helpers in quotaops.h care about this) and holds a handle for * the current transaction so that dquot write and inode write go into the * same transaction. */ /* * This operation can block, but only after everything is updated */ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) { int cnt, ret = 0, index; struct dquot_warn warn[MAXQUOTAS]; int reserve = flags & DQUOT_SPACE_RESERVE; struct dquot __rcu **dquots; struct dquot *dquot; if (!inode_quota_active(inode)) { if (reserve) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) += number; spin_unlock(&inode->i_lock); } else { inode_add_bytes(inode, number); } goto out; } for (cnt = 0; cnt < MAXQUOTAS; cnt++) warn[cnt].w_type = QUOTA_NL_NOWARN; dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (!dquot) continue; if (reserve) { ret = dquot_add_space(dquot, 0, number, flags, &warn[cnt]); } else { ret = dquot_add_space(dquot, number, 0, flags, &warn[cnt]); } if (ret) { /* Back out changes we already did */ for (cnt--; cnt >= 0; cnt--) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (!dquot) continue; spin_lock(&dquot->dq_dqb_lock); if (reserve) dquot_free_reserved_space(dquot, number); else dquot_decr_space(dquot, number); spin_unlock(&dquot->dq_dqb_lock); } spin_unlock(&inode->i_lock); goto out_flush_warn; } } if (reserve) *inode_reserved_space(inode) += number; else __inode_add_bytes(inode, number); spin_unlock(&inode->i_lock); if (reserve) goto out_flush_warn; ret = mark_all_dquot_dirty(dquots); out_flush_warn: srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); out: return ret; } EXPORT_SYMBOL(__dquot_alloc_space); /* * This operation can block, but only after everything is updated */ int dquot_alloc_inode(struct inode *inode) { int cnt, ret = 0, index; struct dquot_warn warn[MAXQUOTAS]; struct dquot __rcu * const *dquots; struct dquot *dquot; if (!inode_quota_active(inode)) return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) warn[cnt].w_type = QUOTA_NL_NOWARN; dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (!dquot) continue; ret = dquot_add_inodes(dquot, 1, &warn[cnt]); if (ret) { for (cnt--; cnt >= 0; cnt--) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (!dquot) continue; /* Back out changes we already did */ spin_lock(&dquot->dq_dqb_lock); dquot_decr_inodes(dquot, 1); spin_unlock(&dquot->dq_dqb_lock); } goto warn_put_all; } } warn_put_all: spin_unlock(&inode->i_lock); if (ret == 0) ret 
= mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); return ret; } EXPORT_SYMBOL(dquot_alloc_inode); /* * Convert in-memory reserved quotas to real consumed quotas */ void dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { struct dquot __rcu **dquots; struct dquot *dquot; int cnt, index; if (!inode_quota_active(inode)) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) -= number; __inode_add_bytes(inode, number); spin_unlock(&inode->i_lock); return; } dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&inode->i_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (dquot) { spin_lock(&dquot->dq_dqb_lock); if (WARN_ON_ONCE(dquot->dq_dqb.dqb_rsvspace < number)) number = dquot->dq_dqb.dqb_rsvspace; dquot->dq_dqb.dqb_curspace += number; dquot->dq_dqb.dqb_rsvspace -= number; spin_unlock(&dquot->dq_dqb_lock); } } /* Update inode bytes */ *inode_reserved_space(inode) -= number; __inode_add_bytes(inode, number); spin_unlock(&inode->i_lock); mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); } EXPORT_SYMBOL(dquot_claim_space_nodirty); /* * Convert allocated space back to in-memory reserved quotas */ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) { struct dquot __rcu **dquots; struct dquot *dquot; int cnt, index; if (!inode_quota_active(inode)) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) += number; __inode_sub_bytes(inode, number); spin_unlock(&inode->i_lock); return; } dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&inode->i_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (dquot) { spin_lock(&dquot->dq_dqb_lock); if (WARN_ON_ONCE(dquot->dq_dqb.dqb_curspace < number)) number = dquot->dq_dqb.dqb_curspace; dquot->dq_dqb.dqb_rsvspace += number; dquot->dq_dqb.dqb_curspace -= number; spin_unlock(&dquot->dq_dqb_lock); } } /* Update inode bytes */ *inode_reserved_space(inode) += number; __inode_sub_bytes(inode, number); spin_unlock(&inode->i_lock); mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); } EXPORT_SYMBOL(dquot_reclaim_space_nodirty); /* * This operation can block, but only after everything is updated */ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) { unsigned int cnt; struct dquot_warn warn[MAXQUOTAS]; struct dquot __rcu **dquots; struct dquot *dquot; int reserve = flags & DQUOT_SPACE_RESERVE, index; if (!inode_quota_active(inode)) { if (reserve) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) -= number; spin_unlock(&inode->i_lock); } else { inode_sub_bytes(inode, number); } return; } dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { int wtype; warn[cnt].w_type = QUOTA_NL_NOWARN; dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (!dquot) continue; spin_lock(&dquot->dq_dqb_lock); wtype = info_bdq_free(dquot, number); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn[cnt], dquot, wtype); if (reserve) dquot_free_reserved_space(dquot, number); else dquot_decr_space(dquot, number); spin_unlock(&dquot->dq_dqb_lock); } if (reserve) *inode_reserved_space(inode) -= number; else __inode_sub_bytes(inode, number); spin_unlock(&inode->i_lock); if (reserve) goto out_unlock; mark_all_dquot_dirty(dquots); out_unlock: 
srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); } EXPORT_SYMBOL(__dquot_free_space); /* * This operation can block, but only after everything is updated */ void dquot_free_inode(struct inode *inode) { unsigned int cnt; struct dquot_warn warn[MAXQUOTAS]; struct dquot __rcu * const *dquots; struct dquot *dquot; int index; if (!inode_quota_active(inode)) return; dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { int wtype; warn[cnt].w_type = QUOTA_NL_NOWARN; dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (!dquot) continue; spin_lock(&dquot->dq_dqb_lock); wtype = info_idq_free(dquot, 1); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn[cnt], dquot, wtype); dquot_decr_inodes(dquot, 1); spin_unlock(&dquot->dq_dqb_lock); } spin_unlock(&inode->i_lock); mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); } EXPORT_SYMBOL(dquot_free_inode); /* * Transfer the number of inode and blocks from one diskquota to an other. * On success, dquot references in transfer_to are consumed and references * to original dquots that need to be released are placed there. On failure, * references are kept untouched. * * This operation can block, but only after everything is updated * A transaction must be started when entering this function. * * We are holding reference on transfer_from & transfer_to, no need to * protect them by srcu_read_lock(). */ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) { qsize_t cur_space; qsize_t rsv_space = 0; qsize_t inode_usage = 1; struct dquot __rcu **dquots; struct dquot *transfer_from[MAXQUOTAS] = {}; int cnt, index, ret = 0, err; char is_valid[MAXQUOTAS] = {}; struct dquot_warn warn_to[MAXQUOTAS]; struct dquot_warn warn_from_inodes[MAXQUOTAS]; struct dquot_warn warn_from_space[MAXQUOTAS]; if (IS_NOQUOTA(inode)) return 0; if (inode->i_sb->dq_op->get_inode_usage) { ret = inode->i_sb->dq_op->get_inode_usage(inode, &inode_usage); if (ret) return ret; } /* Initialize the arrays */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { warn_to[cnt].w_type = QUOTA_NL_NOWARN; warn_from_inodes[cnt].w_type = QUOTA_NL_NOWARN; warn_from_space[cnt].w_type = QUOTA_NL_NOWARN; } spin_lock(&dq_data_lock); spin_lock(&inode->i_lock); if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ spin_unlock(&inode->i_lock); spin_unlock(&dq_data_lock); return 0; } cur_space = __inode_get_bytes(inode); rsv_space = __inode_get_rsv_space(inode); dquots = i_dquot(inode); /* * Build the transfer_from list, check limits, and update usage in * the target structures. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { /* * Skip changes for same uid or gid or for turned off quota-type. 
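* Callers leave transfer_to[cnt] NULL for those types (see dquot_transfer() below), so they are simply skipped here.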
*/ if (!transfer_to[cnt]) continue; /* Avoid races with quotaoff() */ if (!sb_has_quota_active(inode->i_sb, cnt)) continue; is_valid[cnt] = 1; transfer_from[cnt] = srcu_dereference_check(dquots[cnt], &dquot_srcu, lockdep_is_held(&dq_data_lock)); ret = dquot_add_inodes(transfer_to[cnt], inode_usage, &warn_to[cnt]); if (ret) goto over_quota; ret = dquot_add_space(transfer_to[cnt], cur_space, rsv_space, DQUOT_SPACE_WARN, &warn_to[cnt]); if (ret) { spin_lock(&transfer_to[cnt]->dq_dqb_lock); dquot_decr_inodes(transfer_to[cnt], inode_usage); spin_unlock(&transfer_to[cnt]->dq_dqb_lock); goto over_quota; } } /* Decrease usage for source structures and update quota pointers */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!is_valid[cnt]) continue; /* Due to IO error we might not have transfer_from[] structure */ if (transfer_from[cnt]) { int wtype; spin_lock(&transfer_from[cnt]->dq_dqb_lock); wtype = info_idq_free(transfer_from[cnt], inode_usage); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn_from_inodes[cnt], transfer_from[cnt], wtype); wtype = info_bdq_free(transfer_from[cnt], cur_space + rsv_space); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn_from_space[cnt], transfer_from[cnt], wtype); dquot_decr_inodes(transfer_from[cnt], inode_usage); dquot_decr_space(transfer_from[cnt], cur_space); dquot_free_reserved_space(transfer_from[cnt], rsv_space); spin_unlock(&transfer_from[cnt]->dq_dqb_lock); } rcu_assign_pointer(dquots[cnt], transfer_to[cnt]); } spin_unlock(&inode->i_lock); spin_unlock(&dq_data_lock); /* * These arrays are local and we hold dquot references so we don't need * the srcu protection but still take dquot_srcu to avoid warning in * mark_all_dquot_dirty(). */ index = srcu_read_lock(&dquot_srcu); err = mark_all_dquot_dirty((struct dquot __rcu **)transfer_from); if (err < 0) ret = err; err = mark_all_dquot_dirty((struct dquot __rcu **)transfer_to); if (err < 0) ret = err; srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn_to); flush_warnings(warn_from_inodes); flush_warnings(warn_from_space); /* Pass back references to put */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (is_valid[cnt]) transfer_to[cnt] = transfer_from[cnt]; return ret; over_quota: /* Back out changes we already did */ for (cnt--; cnt >= 0; cnt--) { if (!is_valid[cnt]) continue; spin_lock(&transfer_to[cnt]->dq_dqb_lock); dquot_decr_inodes(transfer_to[cnt], inode_usage); dquot_decr_space(transfer_to[cnt], cur_space); dquot_free_reserved_space(transfer_to[cnt], rsv_space); spin_unlock(&transfer_to[cnt]->dq_dqb_lock); } spin_unlock(&inode->i_lock); spin_unlock(&dq_data_lock); flush_warnings(warn_to); return ret; } EXPORT_SYMBOL(__dquot_transfer); /* Wrapper for transferring ownership of an inode for uid/gid only * Called from FSXXX_setattr() */ int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode, struct iattr *iattr) { struct dquot *transfer_to[MAXQUOTAS] = {}; struct dquot *dquot; struct super_block *sb = inode->i_sb; int ret; if (!inode_quota_active(inode)) return 0; if (i_uid_needs_update(idmap, iattr, inode)) { kuid_t kuid = from_vfsuid(idmap, i_user_ns(inode), iattr->ia_vfsuid); dquot = dqget(sb, make_kqid_uid(kuid)); if (IS_ERR(dquot)) { if (PTR_ERR(dquot) != -ESRCH) { ret = PTR_ERR(dquot); goto out_put; } dquot = NULL; } transfer_to[USRQUOTA] = dquot; } if (i_gid_needs_update(idmap, iattr, inode)) { kgid_t kgid = from_vfsgid(idmap, i_user_ns(inode), iattr->ia_vfsgid); dquot = dqget(sb, make_kqid_gid(kgid)); if (IS_ERR(dquot)) { if (PTR_ERR(dquot) != -ESRCH) { ret = PTR_ERR(dquot); goto out_put; 
} dquot = NULL; } transfer_to[GRPQUOTA] = dquot; } ret = __dquot_transfer(inode, transfer_to); out_put: dqput_all(transfer_to); return ret; } EXPORT_SYMBOL(dquot_transfer); /* * Write info of quota file to disk */ int dquot_commit_info(struct super_block *sb, int type) { struct quota_info *dqopt = sb_dqopt(sb); return dqopt->ops[type]->write_file_info(sb, type); } EXPORT_SYMBOL(dquot_commit_info); int dquot_get_next_id(struct super_block *sb, struct kqid *qid) { struct quota_info *dqopt = sb_dqopt(sb); if (!sb_has_quota_active(sb, qid->type)) return -ESRCH; if (!dqopt->ops[qid->type]->get_next_id) return -ENOSYS; return dqopt->ops[qid->type]->get_next_id(sb, qid); } EXPORT_SYMBOL(dquot_get_next_id); /* * Definitions of diskquota operations. */ const struct dquot_operations dquot_operations = { .write_dquot = dquot_commit, .acquire_dquot = dquot_acquire, .release_dquot = dquot_release, .mark_dirty = dquot_mark_dquot_dirty, .write_info = dquot_commit_info, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, .get_next_id = dquot_get_next_id, }; EXPORT_SYMBOL(dquot_operations); /* * Generic helper for ->open on filesystems supporting disk quotas. */ int dquot_file_open(struct inode *inode, struct file *file) { int error; error = generic_file_open(inode, file); if (!error && (file->f_mode & FMODE_WRITE)) error = dquot_initialize(inode); return error; } EXPORT_SYMBOL(dquot_file_open); static void vfs_cleanup_quota_inode(struct super_block *sb, int type) { struct quota_info *dqopt = sb_dqopt(sb); struct inode *inode = dqopt->files[type]; if (!inode) return; if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { inode_lock(inode); inode->i_flags &= ~S_NOQUOTA; inode_unlock(inode); } dqopt->files[type] = NULL; iput(inode); } /* * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) */ int dquot_disable(struct super_block *sb, int type, unsigned int flags) { int cnt; struct quota_info *dqopt = sb_dqopt(sb); rwsem_assert_held_write(&sb->s_umount); /* Cannot turn off usage accounting without turning off limits, or * suspend quotas and simultaneously turn quotas off. */ if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED)) || (flags & DQUOT_SUSPENDED && flags & (DQUOT_LIMITS_ENABLED | DQUOT_USAGE_ENABLED))) return -EINVAL; /* * Skip everything if there's nothing to do. We have to do this because * sometimes we are called when fill_super() failed and calling * sync_fs() in such cases does no good. */ if (!sb_any_quota_loaded(sb)) return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; if (!sb_has_quota_loaded(sb, cnt)) continue; if (flags & DQUOT_SUSPENDED) { spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(DQUOT_SUSPENDED, cnt); spin_unlock(&dq_state_lock); } else { spin_lock(&dq_state_lock); dqopt->flags &= ~dquot_state_flag(flags, cnt); /* Turning off suspended quotas? */ if (!sb_has_quota_loaded(sb, cnt) && sb_has_quota_suspended(sb, cnt)) { dqopt->flags &= ~dquot_state_flag( DQUOT_SUSPENDED, cnt); spin_unlock(&dq_state_lock); vfs_cleanup_quota_inode(sb, cnt); continue; } spin_unlock(&dq_state_lock); } /* We still have to keep quota loaded? */ if (sb_has_quota_loaded(sb, cnt) && !(flags & DQUOT_SUSPENDED)) continue; /* Note: these are blocking operations */ drop_dquot_ref(sb, cnt); invalidate_dquots(sb, cnt); /* * Now all dquots should be invalidated, all writes done so we * should be only users of the info. No locks needed. 
*/ if (info_dirty(&dqopt->info[cnt])) sb->dq_op->write_info(sb, cnt); if (dqopt->ops[cnt]->free_file_info) dqopt->ops[cnt]->free_file_info(sb, cnt); put_quota_format(dqopt->info[cnt].dqi_format); dqopt->info[cnt].dqi_flags = 0; dqopt->info[cnt].dqi_igrace = 0; dqopt->info[cnt].dqi_bgrace = 0; dqopt->ops[cnt] = NULL; } /* Skip syncing and setting flags if quota files are hidden */ if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) goto put_inodes; /* Sync the superblock so that buffers with quota data are written to * disk (and so userspace sees correct data afterwards). */ if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, 1); sync_blockdev(sb->s_bdev); /* Now the quota files are just ordinary files and we can set the * inode flags back. Moreover we discard the pagecache so that * userspace sees the writes we did bypassing the pagecache. We * must also discard the blockdev buffers so that we see the * changes done by userspace on the next quotaon() */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (!sb_has_quota_loaded(sb, cnt) && dqopt->files[cnt]) { inode_lock(dqopt->files[cnt]); truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); inode_unlock(dqopt->files[cnt]); } if (sb->s_bdev) invalidate_bdev(sb->s_bdev); put_inodes: /* We are done when suspending quotas */ if (flags & DQUOT_SUSPENDED) return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (!sb_has_quota_loaded(sb, cnt)) vfs_cleanup_quota_inode(sb, cnt); return 0; } EXPORT_SYMBOL(dquot_disable); int dquot_quota_off(struct super_block *sb, int type) { return dquot_disable(sb, type, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); } EXPORT_SYMBOL(dquot_quota_off); /* * Turn quotas on on a device */ static int vfs_setup_quota_inode(struct inode *inode, int type) { struct super_block *sb = inode->i_sb; struct quota_info *dqopt = sb_dqopt(sb); if (is_bad_inode(inode)) return -EUCLEAN; if (!S_ISREG(inode->i_mode)) return -EACCES; if (IS_RDONLY(inode)) return -EROFS; if (sb_has_quota_loaded(sb, type)) return -EBUSY; /* * Quota files should never be encrypted. They should be thought of as * filesystem metadata, not user data. New-style internal quota files * cannot be encrypted by users anyway, but old-style external quota * files could potentially be incorrectly created in an encrypted * directory, hence this explicit check. Some reasons why encrypted * quota files don't work include: (1) some filesystems that support * encryption don't handle it in their quota_read and quota_write, and * (2) cleaning up encrypted quota files at unmount would need special * consideration, as quota files are cleaned up later than user files. */ if (IS_ENCRYPTED(inode)) return -EINVAL; dqopt->files[type] = igrab(inode); if (!dqopt->files[type]) return -EIO; if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { /* We don't want quota and atime on quota files (deadlocks * possible) Also nobody should write to the file - we use * special IO operations which ignore the immutable bit. */ inode_lock(inode); inode->i_flags |= S_NOQUOTA; inode_unlock(inode); /* * When S_NOQUOTA is set, remove dquot references as no more * references can be added */ __dquot_drop(inode); } return 0; } int dquot_load_quota_sb(struct super_block *sb, int type, int format_id, unsigned int flags) { struct quota_format_type *fmt; struct quota_info *dqopt = sb_dqopt(sb); int error; lockdep_assert_held_write(&sb->s_umount); /* Just unsuspend quotas? 
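Resuming suspended quotas is dquot_resume()'s job: it clears the suspended state and then calls back into this function, so callers must never pass DQUOT_SUSPENDED directly (hence the WARN_ON_ONCE below).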
*/ if (WARN_ON_ONCE(flags & DQUOT_SUSPENDED)) return -EINVAL; fmt = find_quota_format(format_id); if (!fmt) return -ESRCH; if (!sb->dq_op || !sb->s_qcop || (type == PRJQUOTA && sb->dq_op->get_projid == NULL)) { error = -EINVAL; goto out_fmt; } /* Filesystems outside of init_user_ns not yet supported */ if (sb->s_user_ns != &init_user_ns) { error = -EINVAL; goto out_fmt; } /* Usage always has to be set... */ if (!(flags & DQUOT_USAGE_ENABLED)) { error = -EINVAL; goto out_fmt; } if (sb_has_quota_loaded(sb, type)) { error = -EBUSY; goto out_fmt; } if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { /* As we bypass the pagecache we must now flush all the * dirty data and invalidate caches so that kernel sees * changes from userspace. It is not enough to just flush * the quota file since if blocksize < pagesize, invalidation * of the cache could fail because of other unrelated dirty * data */ sync_filesystem(sb); invalidate_bdev(sb->s_bdev); } error = -EINVAL; if (!fmt->qf_ops->check_quota_file(sb, type)) goto out_fmt; dqopt->ops[type] = fmt->qf_ops; dqopt->info[type].dqi_format = fmt; dqopt->info[type].dqi_fmt_id = format_id; INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list); error = dqopt->ops[type]->read_file_info(sb, type); if (error < 0) goto out_fmt; if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) { spin_lock(&dq_data_lock); dqopt->info[type].dqi_flags |= DQF_SYS_FILE; spin_unlock(&dq_data_lock); } spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(flags, type); spin_unlock(&dq_state_lock); error = add_dquot_ref(sb, type); if (error) dquot_disable(sb, type, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); return error; out_fmt: put_quota_format(fmt); return error; } EXPORT_SYMBOL(dquot_load_quota_sb); /* * More powerful function for turning on quotas on given quota inode allowing * setting of individual quota flags */ int dquot_load_quota_inode(struct inode *inode, int type, int format_id, unsigned int flags) { int err; err = vfs_setup_quota_inode(inode, type); if (err < 0) return err; err = dquot_load_quota_sb(inode->i_sb, type, format_id, flags); if (err < 0) vfs_cleanup_quota_inode(inode->i_sb, type); return err; } EXPORT_SYMBOL(dquot_load_quota_inode); /* Reenable quotas on remount RW */ int dquot_resume(struct super_block *sb, int type) { struct quota_info *dqopt = sb_dqopt(sb); int ret = 0, cnt; unsigned int flags; rwsem_assert_held_write(&sb->s_umount); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; if (!sb_has_quota_suspended(sb, cnt)) continue; spin_lock(&dq_state_lock); flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED, cnt); dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, cnt); spin_unlock(&dq_state_lock); flags = dquot_generic_flag(flags, cnt); ret = dquot_load_quota_sb(sb, cnt, dqopt->info[cnt].dqi_fmt_id, flags); if (ret < 0) vfs_cleanup_quota_inode(sb, cnt); } return ret; } EXPORT_SYMBOL(dquot_resume); int dquot_quota_on(struct super_block *sb, int type, int format_id, const struct path *path) { int error = security_quota_on(path->dentry); if (error) return error; /* Quota file not on the same filesystem? */ if (path->dentry->d_sb != sb) error = -EXDEV; else error = dquot_load_quota_inode(d_inode(path->dentry), type, format_id, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); return error; } EXPORT_SYMBOL(dquot_quota_on); /* * This function is used when filesystem needs to initialize quotas * during mount time. 
*/ int dquot_quota_on_mount(struct super_block *sb, char *qf_name, int format_id, int type) { struct dentry *dentry; int error; dentry = lookup_noperm_positive_unlocked(&QSTR(qf_name), sb->s_root); if (IS_ERR(dentry)) return PTR_ERR(dentry); error = security_quota_on(dentry); if (!error) error = dquot_load_quota_inode(d_inode(dentry), type, format_id, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); dput(dentry); return error; } EXPORT_SYMBOL(dquot_quota_on_mount); static int dquot_quota_enable(struct super_block *sb, unsigned int flags) { int ret; int type; struct quota_info *dqopt = sb_dqopt(sb); if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) return -ENOSYS; /* Accounting cannot be turned on while fs is mounted */ flags &= ~(FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT); if (!flags) return -EINVAL; for (type = 0; type < MAXQUOTAS; type++) { if (!(flags & qtype_enforce_flag(type))) continue; /* Can't enforce without accounting */ if (!sb_has_quota_usage_enabled(sb, type)) { ret = -EINVAL; goto out_err; } if (sb_has_quota_limits_enabled(sb, type)) { /* compatible with XFS */ ret = -EEXIST; goto out_err; } spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(DQUOT_LIMITS_ENABLED, type); spin_unlock(&dq_state_lock); } return 0; out_err: /* Backout enforcement enablement we already did */ for (type--; type >= 0; type--) { if (flags & qtype_enforce_flag(type)) dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); } return ret; } static int dquot_quota_disable(struct super_block *sb, unsigned int flags) { int ret; int type; struct quota_info *dqopt = sb_dqopt(sb); if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) return -ENOSYS; /* * We don't support turning off accounting via quotactl. In principle * quota infrastructure can do this but filesystems don't expect * userspace to be able to do it. */ if (flags & (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT)) return -EOPNOTSUPP; /* Filter out limits not enabled */ for (type = 0; type < MAXQUOTAS; type++) if (!sb_has_quota_limits_enabled(sb, type)) flags &= ~qtype_enforce_flag(type); /* Nothing left? 
*/ if (!flags) return -EEXIST; for (type = 0; type < MAXQUOTAS; type++) { if (flags & qtype_enforce_flag(type)) { ret = dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); if (ret < 0) goto out_err; } } return 0; out_err: /* Backout enforcement disabling we already did */ for (type--; type >= 0; type--) { if (flags & qtype_enforce_flag(type)) { spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(DQUOT_LIMITS_ENABLED, type); spin_unlock(&dq_state_lock); } } return ret; } /* Generic routine for getting common part of quota structure */ static void do_get_dqblk(struct dquot *dquot, struct qc_dqblk *di) { struct mem_dqblk *dm = &dquot->dq_dqb; memset(di, 0, sizeof(*di)); spin_lock(&dquot->dq_dqb_lock); di->d_spc_hardlimit = dm->dqb_bhardlimit; di->d_spc_softlimit = dm->dqb_bsoftlimit; di->d_ino_hardlimit = dm->dqb_ihardlimit; di->d_ino_softlimit = dm->dqb_isoftlimit; di->d_space = dm->dqb_curspace + dm->dqb_rsvspace; di->d_ino_count = dm->dqb_curinodes; di->d_spc_timer = dm->dqb_btime; di->d_ino_timer = dm->dqb_itime; spin_unlock(&dquot->dq_dqb_lock); } int dquot_get_dqblk(struct super_block *sb, struct kqid qid, struct qc_dqblk *di) { struct dquot *dquot; dquot = dqget(sb, qid); if (IS_ERR(dquot)) return PTR_ERR(dquot); do_get_dqblk(dquot, di); dqput(dquot); return 0; } EXPORT_SYMBOL(dquot_get_dqblk); int dquot_get_next_dqblk(struct super_block *sb, struct kqid *qid, struct qc_dqblk *di) { struct dquot *dquot; int err; if (!sb->dq_op->get_next_id) return -ENOSYS; err = sb->dq_op->get_next_id(sb, qid); if (err < 0) return err; dquot = dqget(sb, *qid); if (IS_ERR(dquot)) return PTR_ERR(dquot); do_get_dqblk(dquot, di); dqput(dquot); return 0; } EXPORT_SYMBOL(dquot_get_next_dqblk); #define VFS_QC_MASK \ (QC_SPACE | QC_SPC_SOFT | QC_SPC_HARD | \ QC_INO_COUNT | QC_INO_SOFT | QC_INO_HARD | \ QC_SPC_TIMER | QC_INO_TIMER) /* Generic routine for setting common part of quota structure */ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di) { struct mem_dqblk *dm = &dquot->dq_dqb; int check_blim = 0, check_ilim = 0; struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; int ret; if (di->d_fieldmask & ~VFS_QC_MASK) return -EINVAL; if (((di->d_fieldmask & QC_SPC_SOFT) && di->d_spc_softlimit > dqi->dqi_max_spc_limit) || ((di->d_fieldmask & QC_SPC_HARD) && di->d_spc_hardlimit > dqi->dqi_max_spc_limit) || ((di->d_fieldmask & QC_INO_SOFT) && (di->d_ino_softlimit > dqi->dqi_max_ino_limit)) || ((di->d_fieldmask & QC_INO_HARD) && (di->d_ino_hardlimit > dqi->dqi_max_ino_limit))) return -ERANGE; spin_lock(&dquot->dq_dqb_lock); if (di->d_fieldmask & QC_SPACE) { dm->dqb_curspace = di->d_space - dm->dqb_rsvspace; check_blim = 1; set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); } if (di->d_fieldmask & QC_SPC_SOFT) dm->dqb_bsoftlimit = di->d_spc_softlimit; if (di->d_fieldmask & QC_SPC_HARD) dm->dqb_bhardlimit = di->d_spc_hardlimit; if (di->d_fieldmask & (QC_SPC_SOFT | QC_SPC_HARD)) { check_blim = 1; set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); } if (di->d_fieldmask & QC_INO_COUNT) { dm->dqb_curinodes = di->d_ino_count; check_ilim = 1; set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); } if (di->d_fieldmask & QC_INO_SOFT) dm->dqb_isoftlimit = di->d_ino_softlimit; if (di->d_fieldmask & QC_INO_HARD) dm->dqb_ihardlimit = di->d_ino_hardlimit; if (di->d_fieldmask & (QC_INO_SOFT | QC_INO_HARD)) { check_ilim = 1; set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); } if (di->d_fieldmask & QC_SPC_TIMER) { dm->dqb_btime = di->d_spc_timer; check_blim = 1; 
set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); } if (di->d_fieldmask & QC_INO_TIMER) { dm->dqb_itime = di->d_ino_timer; check_ilim = 1; set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); } if (check_blim) { if (!dm->dqb_bsoftlimit || dm->dqb_curspace + dm->dqb_rsvspace <= dm->dqb_bsoftlimit) { dm->dqb_btime = 0; clear_bit(DQ_BLKS_B, &dquot->dq_flags); } else if (!(di->d_fieldmask & QC_SPC_TIMER)) /* Set grace only if user hasn't provided his own... */ dm->dqb_btime = ktime_get_real_seconds() + dqi->dqi_bgrace; } if (check_ilim) { if (!dm->dqb_isoftlimit || dm->dqb_curinodes <= dm->dqb_isoftlimit) { dm->dqb_itime = 0; clear_bit(DQ_INODES_B, &dquot->dq_flags); } else if (!(di->d_fieldmask & QC_INO_TIMER)) /* Set grace only if user hasn't provided his own... */ dm->dqb_itime = ktime_get_real_seconds() + dqi->dqi_igrace; } if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit || dm->dqb_isoftlimit) clear_bit(DQ_FAKE_B, &dquot->dq_flags); else set_bit(DQ_FAKE_B, &dquot->dq_flags); spin_unlock(&dquot->dq_dqb_lock); ret = mark_dquot_dirty(dquot); if (ret < 0) return ret; return 0; } int dquot_set_dqblk(struct super_block *sb, struct kqid qid, struct qc_dqblk *di) { struct dquot *dquot; int rc; dquot = dqget(sb, qid); if (IS_ERR(dquot)) { rc = PTR_ERR(dquot); goto out; } rc = do_set_dqblk(dquot, di); dqput(dquot); out: return rc; } EXPORT_SYMBOL(dquot_set_dqblk); /* Generic routine for getting common part of quota file information */ int dquot_get_state(struct super_block *sb, struct qc_state *state) { struct mem_dqinfo *mi; struct qc_type_state *tstate; struct quota_info *dqopt = sb_dqopt(sb); int type; memset(state, 0, sizeof(*state)); for (type = 0; type < MAXQUOTAS; type++) { if (!sb_has_quota_active(sb, type)) continue; tstate = state->s_state + type; mi = sb_dqopt(sb)->info + type; tstate->flags = QCI_ACCT_ENABLED; spin_lock(&dq_data_lock); if (mi->dqi_flags & DQF_SYS_FILE) tstate->flags |= QCI_SYSFILE; if (mi->dqi_flags & DQF_ROOT_SQUASH) tstate->flags |= QCI_ROOT_SQUASH; if (sb_has_quota_limits_enabled(sb, type)) tstate->flags |= QCI_LIMITS_ENFORCED; tstate->spc_timelimit = mi->dqi_bgrace; tstate->ino_timelimit = mi->dqi_igrace; if (dqopt->files[type]) { tstate->ino = dqopt->files[type]->i_ino; tstate->blocks = dqopt->files[type]->i_blocks; } tstate->nextents = 1; /* We don't know... 
*/ spin_unlock(&dq_data_lock); } return 0; } EXPORT_SYMBOL(dquot_get_state); /* Generic routine for setting common part of quota file information */ int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii) { struct mem_dqinfo *mi; if ((ii->i_fieldmask & QC_WARNS_MASK) || (ii->i_fieldmask & QC_RT_SPC_TIMER)) return -EINVAL; if (!sb_has_quota_active(sb, type)) return -ESRCH; mi = sb_dqopt(sb)->info + type; if (ii->i_fieldmask & QC_FLAGS) { if ((ii->i_flags & QCI_ROOT_SQUASH && mi->dqi_format->qf_fmt_id != QFMT_VFS_OLD)) return -EINVAL; } spin_lock(&dq_data_lock); if (ii->i_fieldmask & QC_SPC_TIMER) mi->dqi_bgrace = ii->i_spc_timelimit; if (ii->i_fieldmask & QC_INO_TIMER) mi->dqi_igrace = ii->i_ino_timelimit; if (ii->i_fieldmask & QC_FLAGS) { if (ii->i_flags & QCI_ROOT_SQUASH) mi->dqi_flags |= DQF_ROOT_SQUASH; else mi->dqi_flags &= ~DQF_ROOT_SQUASH; } spin_unlock(&dq_data_lock); mark_info_dirty(sb, type); /* Force write to disk */ return sb->dq_op->write_info(sb, type); } EXPORT_SYMBOL(dquot_set_dqinfo); const struct quotactl_ops dquot_quotactl_sysfile_ops = { .quota_enable = dquot_quota_enable, .quota_disable = dquot_quota_disable, .quota_sync = dquot_quota_sync, .get_state = dquot_get_state, .set_info = dquot_set_dqinfo, .get_dqblk = dquot_get_dqblk, .get_nextdqblk = dquot_get_next_dqblk, .set_dqblk = dquot_set_dqblk }; EXPORT_SYMBOL(dquot_quotactl_sysfile_ops); static int do_proc_dqstats(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned int type = (unsigned long *)table->data - dqstats.stat; s64 value = percpu_counter_sum(&dqstats.counter[type]); /* Filter negative values for non-monotonic counters */ if (value < 0 && (type == DQST_ALLOC_DQUOTS || type == DQST_FREE_DQUOTS)) value = 0; /* Update global table */ dqstats.stat[type] = value; return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } static const struct ctl_table fs_dqstats_table[] = { { .procname = "lookups", .data = &dqstats.stat[DQST_LOOKUPS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "drops", .data = &dqstats.stat[DQST_DROPS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "reads", .data = &dqstats.stat[DQST_READS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "writes", .data = &dqstats.stat[DQST_WRITES], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "cache_hits", .data = &dqstats.stat[DQST_CACHE_HITS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "allocated_dquots", .data = &dqstats.stat[DQST_ALLOC_DQUOTS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "free_dquots", .data = &dqstats.stat[DQST_FREE_DQUOTS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "syncs", .data = &dqstats.stat[DQST_SYNCS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, #ifdef CONFIG_PRINT_QUOTA_WARNING { .procname = "warnings", .data = &flag_print_warnings, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, #endif }; static int __init dquot_init(void) { int i, ret; unsigned long nr_hash, order; struct shrinker *dqcache_shrinker; printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__); register_sysctl_init("fs/quota", fs_dqstats_table); dquot_cachep = 
kmem_cache_create("dquot", sizeof(struct dquot), sizeof(unsigned long) * 4, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| SLAB_PANIC), NULL); order = 0; dquot_hash = (struct hlist_head *)__get_free_pages(GFP_KERNEL, order); if (!dquot_hash) panic("Cannot create dquot hash table"); ret = percpu_counter_init_many(dqstats.counter, 0, GFP_KERNEL, _DQST_DQSTAT_LAST); if (ret) panic("Cannot create dquot stat counters"); /* Find power-of-two hlist_heads which can fit into allocation */ nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head); dq_hash_bits = ilog2(nr_hash); nr_hash = 1UL << dq_hash_bits; dq_hash_mask = nr_hash - 1; for (i = 0; i < nr_hash; i++) INIT_HLIST_HEAD(dquot_hash + i); pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld," " %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order)); dqcache_shrinker = shrinker_alloc(0, "dquota-cache"); if (!dqcache_shrinker) panic("Cannot allocate dquot shrinker"); dqcache_shrinker->count_objects = dqcache_shrink_count; dqcache_shrinker->scan_objects = dqcache_shrink_scan; shrinker_register(dqcache_shrinker); quota_unbound_wq = alloc_workqueue("quota_events_unbound", WQ_UNBOUND | WQ_MEM_RECLAIM, WQ_MAX_ACTIVE); if (!quota_unbound_wq) panic("Cannot create quota_unbound_wq\n"); return 0; } fs_initcall(dquot_init); |
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MMZONE_H #define _LINUX_MMZONE_H #ifndef __ASSEMBLY__ #ifndef __GENERATING_BOUNDS_H #include <linux/spinlock.h> #include <linux/list.h> #include <linux/list_nulls.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/cache.h> #include <linux/threads.h> #include <linux/numa.h> #include <linux/init.h> #include <linux/seqlock.h> #include <linux/nodemask.h> #include <linux/pageblock-flags.h> #include <linux/page-flags-layout.h> #include <linux/atomic.h> #include <linux/mm_types.h> #include <linux/page-flags.h> #include <linux/local_lock.h> #include <linux/zswap.h> #include <linux/sizes.h> #include <asm/page.h> /* Free memory management - zoned buddy allocator. */ #ifndef CONFIG_ARCH_FORCE_MAX_ORDER #define MAX_PAGE_ORDER 10 #else #define MAX_PAGE_ORDER CONFIG_ARCH_FORCE_MAX_ORDER #endif #define MAX_ORDER_NR_PAGES (1 << MAX_PAGE_ORDER) #define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES) #define NR_PAGE_ORDERS (MAX_PAGE_ORDER + 1) /* Defines the order for the number of pages that have a migrate type. */ #ifndef CONFIG_PAGE_BLOCK_MAX_ORDER #define PAGE_BLOCK_MAX_ORDER MAX_PAGE_ORDER #else #define PAGE_BLOCK_MAX_ORDER CONFIG_PAGE_BLOCK_MAX_ORDER #endif /* CONFIG_PAGE_BLOCK_MAX_ORDER */ /* * The MAX_PAGE_ORDER, which defines the max order of pages to be allocated * by the buddy allocator, has to be larger than or equal to the PAGE_BLOCK_MAX_ORDER, * which defines the order for the number of pages that can have a migrate type */ #if (PAGE_BLOCK_MAX_ORDER > MAX_PAGE_ORDER) #error MAX_PAGE_ORDER must be >= PAGE_BLOCK_MAX_ORDER #endif /* * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed * costly to service. That is between allocation orders which should * coalesce naturally under reasonable reclaim pressure and those which * will not. */ #define PAGE_ALLOC_COSTLY_ORDER 3 #if !defined(CONFIG_HAVE_GIGANTIC_FOLIOS) /* * We don't expect any folios that exceed buddy sizes (and consequently * memory sections). */ #define MAX_FOLIO_ORDER MAX_PAGE_ORDER #elif defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) /* * Only pages within a single memory section are guaranteed to be * contiguous. By limiting folios to a single memory section, all folio * pages are guaranteed to be contiguous. */ #define MAX_FOLIO_ORDER PFN_SECTION_SHIFT #elif defined(CONFIG_HUGETLB_PAGE) /* * There is no real limit on the folio size. We limit them to the maximum we * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit. */ #ifdef CONFIG_64BIT #define MAX_FOLIO_ORDER (ilog2(SZ_16G) - PAGE_SHIFT) #else #define MAX_FOLIO_ORDER (ilog2(SZ_1G) - PAGE_SHIFT) #endif #else /* * Without hugetlb, gigantic folios that are bigger than a single PUD are * currently impossible. */ #define MAX_FOLIO_ORDER (PUD_SHIFT - PAGE_SHIFT) #endif #define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER) /* * HugeTLB Vmemmap Optimization (HVO) requires struct pages of the head page to * be naturally aligned with regard to the folio size. * * HVO is only active if the size of struct page is a power of 2.
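* As an illustration (assuming 4 KiB pages, a 64-byte struct page and the 16 GiB 64-bit hugetlb limit above, i.e. MAX_FOLIO_ORDER == 22), the alignment below evaluates to (1UL << 22) * 64 bytes == 256 MiB.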
*/ #define MAX_FOLIO_VMEMMAP_ALIGN \ (IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) && \ is_power_of_2(sizeof(struct page)) ? \ MAX_FOLIO_NR_PAGES * sizeof(struct page) : 0) /* * vmemmap optimization (like HVO) is only possible for page orders that fill * two or more pages with struct pages. */ #define VMEMMAP_TAIL_MIN_ORDER (ilog2(2 * PAGE_SIZE / sizeof(struct page))) #define __NR_VMEMMAP_TAILS (MAX_FOLIO_ORDER - VMEMMAP_TAIL_MIN_ORDER + 1) #define NR_VMEMMAP_TAILS (__NR_VMEMMAP_TAILS > 0 ? __NR_VMEMMAP_TAILS : 0) enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES, #ifdef CONFIG_CMA /* * MIGRATE_CMA migration type is designed to mimic the way * ZONE_MOVABLE works. Only movable pages can be allocated * from MIGRATE_CMA pageblocks and page allocator never * implicitly change migration type of MIGRATE_CMA pageblock. * * The way to use it is to change migratetype of a range of * pageblocks to MIGRATE_CMA which can be done by * __free_pageblock_cma() function. */ MIGRATE_CMA, __MIGRATE_TYPE_END = MIGRATE_CMA, #else __MIGRATE_TYPE_END = MIGRATE_HIGHATOMIC, #endif #ifdef CONFIG_MEMORY_ISOLATION MIGRATE_ISOLATE, /* can't allocate from here */ #endif MIGRATE_TYPES }; /* In mm/page_alloc.c; keep in sync also with show_migration_types() there */ extern const char * const migratetype_names[MIGRATE_TYPES]; #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) # define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA) /* * __dump_folio() in mm/debug.c passes a folio pointer to on-stack struct folio, * so folio_pfn() cannot be used and pfn is needed. */ # define is_migrate_cma_folio(folio, pfn) \ (get_pfnblock_migratetype(&folio->page, pfn) == MIGRATE_CMA) #else # define is_migrate_cma(migratetype) false # define is_migrate_cma_page(_page) false # define is_migrate_cma_folio(folio, pfn) false #endif static inline bool is_migrate_movable(int mt) { return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE; } /* * Check whether a migratetype can be merged with another migratetype. * * It is only mergeable when it can fall back to other migratetypes for * allocation. See fallbacks[MIGRATE_TYPES][3] in page_alloc.c. 
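* In practice that means the types below MIGRATE_PCPTYPES: MIGRATE_UNMOVABLE, MIGRATE_MOVABLE and MIGRATE_RECLAIMABLE are mergeable, while MIGRATE_HIGHATOMIC, MIGRATE_CMA and MIGRATE_ISOLATE are not.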
*/ static inline bool migratetype_is_mergeable(int mt) { return mt < MIGRATE_PCPTYPES; } #define for_each_migratetype_order(order, type) \ for (order = 0; order < NR_PAGE_ORDERS; order++) \ for (type = 0; type < MIGRATE_TYPES; type++) extern int page_group_by_mobility_disabled; #define get_pageblock_migratetype(page) \ get_pfnblock_migratetype(page, page_to_pfn(page)) #define folio_migratetype(folio) \ get_pageblock_migratetype(&folio->page) struct free_area { struct list_head free_list[MIGRATE_TYPES]; unsigned long nr_free; }; struct pglist_data; #ifdef CONFIG_NUMA enum numa_stat_item { NUMA_HIT, /* allocated in intended node */ NUMA_MISS, /* allocated in non intended node */ NUMA_FOREIGN, /* was intended here, hit elsewhere */ NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */ NUMA_LOCAL, /* allocation from local node */ NUMA_OTHER, /* allocation from other node */ NR_VM_NUMA_EVENT_ITEMS }; #else #define NR_VM_NUMA_EVENT_ITEMS 0 #endif enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, NR_FREE_PAGES_BLOCKS, NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */ NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE, NR_ZONE_ACTIVE_ANON, NR_ZONE_INACTIVE_FILE, NR_ZONE_ACTIVE_FILE, NR_ZONE_UNEVICTABLE, NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ /* Second 128 byte cacheline */ #if IS_ENABLED(CONFIG_ZSMALLOC) NR_ZSPAGES, /* allocated in zsmalloc */ #endif NR_FREE_CMA_PAGES, #ifdef CONFIG_UNACCEPTED_MEMORY NR_UNACCEPTED, #endif NR_VM_ZONE_STAT_ITEMS }; enum node_stat_item { NR_LRU_BASE, NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ NR_ACTIVE_ANON, /* " " " " " */ NR_INACTIVE_FILE, /* " " " " " */ NR_ACTIVE_FILE, /* " " " " " */ NR_UNEVICTABLE, /* " " " " " */ NR_SLAB_RECLAIMABLE_B, NR_SLAB_UNRECLAIMABLE_B, NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ WORKINGSET_NODES, WORKINGSET_REFAULT_BASE, WORKINGSET_REFAULT_ANON = WORKINGSET_REFAULT_BASE, WORKINGSET_REFAULT_FILE, WORKINGSET_ACTIVATE_BASE, WORKINGSET_ACTIVATE_ANON = WORKINGSET_ACTIVATE_BASE, WORKINGSET_ACTIVATE_FILE, WORKINGSET_RESTORE_BASE, WORKINGSET_RESTORE_ANON = WORKINGSET_RESTORE_BASE, WORKINGSET_RESTORE_FILE, WORKINGSET_NODERECLAIM, NR_ANON_MAPPED, /* Mapped anonymous pages */ NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. 
only modified from process context */ NR_FILE_PAGES, NR_FILE_DIRTY, NR_WRITEBACK, NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ NR_SHMEM_THPS, NR_SHMEM_PMDMAPPED, NR_FILE_THPS, NR_FILE_PMDMAPPED, NR_ANON_THPS, NR_VMSCAN_WRITE, NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ NR_DIRTIED, /* page dirtyings since bootup */ NR_WRITTEN, /* page writings since bootup */ NR_THROTTLED_WRITTEN, /* NR_WRITTEN while reclaim throttled */ NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */ NR_FOLL_PIN_ACQUIRED, /* via: pin_user_page(), gup flag: FOLL_PIN */ NR_FOLL_PIN_RELEASED, /* pages returned via unpin_user_page() */ NR_VMALLOC, NR_KERNEL_STACK_KB, /* measured in KiB */ #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) NR_KERNEL_SCS_KB, /* measured in KiB */ #endif NR_PAGETABLE, /* used for pagetables */ NR_SECONDARY_PAGETABLE, /* secondary pagetables, KVM & IOMMU */ #ifdef CONFIG_IOMMU_SUPPORT NR_IOMMU_PAGES, /* # of pages allocated by IOMMU */ #endif #ifdef CONFIG_SWAP NR_SWAPCACHE, #endif #ifdef CONFIG_NUMA_BALANCING PGPROMOTE_SUCCESS, /* promote successfully */ /** * Candidate pages for promotion based on hint fault latency. This * counter is used to control the promotion rate and adjust the hot * threshold. */ PGPROMOTE_CANDIDATE, /** * Not rate-limited (NRL) candidate pages for those can be promoted * without considering hot threshold because of enough free pages in * fast-tier node. These promotions bypass the regular hotness checks * and do NOT influence the promotion rate-limiter or * threshold-adjustment logic. * This is for statistics/monitoring purposes. */ PGPROMOTE_CANDIDATE_NRL, #endif /* PGDEMOTE_*: pages demoted */ PGDEMOTE_KSWAPD, PGDEMOTE_DIRECT, PGDEMOTE_KHUGEPAGED, PGDEMOTE_PROACTIVE, PGSTEAL_KSWAPD, PGSTEAL_DIRECT, PGSTEAL_KHUGEPAGED, PGSTEAL_PROACTIVE, PGSTEAL_ANON, PGSTEAL_FILE, PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSCAN_KHUGEPAGED, PGSCAN_PROACTIVE, PGSCAN_ANON, PGSCAN_FILE, PGREFILL, #ifdef CONFIG_HUGETLB_PAGE NR_HUGETLB, #endif NR_BALLOON_PAGES, NR_KERNEL_FILE_PAGES, NR_GPU_ACTIVE, /* Pages assigned to GPU objects */ NR_GPU_RECLAIM, /* Pages in shrinkable GPU pools */ NR_VM_NODE_STAT_ITEMS }; /* * Returns true if the item should be printed in THPs (/proc/vmstat * currently prints number of anon, file and shmem THPs. But the item * is charged in pages). */ static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item) { if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return false; return item == NR_ANON_THPS || item == NR_FILE_THPS || item == NR_SHMEM_THPS || item == NR_SHMEM_PMDMAPPED || item == NR_FILE_PMDMAPPED; } /* * Returns true if the value is measured in bytes (most vmstat values are * measured in pages). This defines the API part, the internal representation * might be different. */ static __always_inline bool vmstat_item_in_bytes(int idx) { /* * Global and per-node slab counters track slab pages. * It's expected that changes are multiples of PAGE_SIZE. * Internally values are stored in pages. * * Per-memcg and per-lruvec counters track memory, consumed * by individual slab objects. These counters are actually * byte-precise. */ return (idx == NR_SLAB_RECLAIMABLE_B || idx == NR_SLAB_UNRECLAIMABLE_B); } /* * We do arithmetic on the LRU lists in various places in the code, * so it is important to keep the active lists LRU_ACTIVE higher in * the array than the corresponding inactive lists, and to keep * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists. 
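* For example, with the values below LRU_ACTIVE_FILE (3) == LRU_INACTIVE_FILE (2) + LRU_ACTIVE, and each _FILE list is exactly LRU_FILE above its _ANON counterpart.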
* * This has to be kept in sync with the statistics in zone_stat_item * above and the descriptions in vmstat_text in mm/vmstat.c */ #define LRU_BASE 0 #define LRU_ACTIVE 1 #define LRU_FILE 2 enum lru_list { LRU_INACTIVE_ANON = LRU_BASE, LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE, LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE, LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE, LRU_UNEVICTABLE, NR_LRU_LISTS }; enum vmscan_throttle_state { VMSCAN_THROTTLE_WRITEBACK, VMSCAN_THROTTLE_ISOLATED, VMSCAN_THROTTLE_NOPROGRESS, VMSCAN_THROTTLE_CONGESTED, NR_VMSCAN_THROTTLE, }; #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++) #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++) static inline bool is_file_lru(enum lru_list lru) { return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE); } static inline bool is_active_lru(enum lru_list lru) { return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } #define WORKINGSET_ANON 0 #define WORKINGSET_FILE 1 #define ANON_AND_FILE 2 enum lruvec_flags { /* * An lruvec has many dirty pages backed by a congested BDI: * 1. LRUVEC_CGROUP_CONGESTED is set by cgroup-level reclaim. * It can be cleared by cgroup reclaim or kswapd. * 2. LRUVEC_NODE_CONGESTED is set by kswapd node-level reclaim. * It can only be cleared by kswapd. * * Essentially, kswapd can unthrottle an lruvec throttled by cgroup * reclaim, but not vice versa. This only applies to the root cgroup. * The goal is to prevent cgroup reclaim on the root cgroup (e.g. * memory.reclaim) to unthrottle an unbalanced node (that was throttled * by kswapd). */ LRUVEC_CGROUP_CONGESTED, LRUVEC_NODE_CONGESTED, }; #endif /* !__GENERATING_BOUNDS_H */ /* * Evictable folios are divided into multiple generations. The youngest and the * oldest generation numbers, max_seq and min_seq, are monotonically increasing. * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the * corresponding generation. The gen counter in folio->flags stores gen+1 while * a folio is on one of lrugen->folios[]. Otherwise it stores 0. * * After a folio is faulted in, the aging needs to check the accessed bit at * least twice before handing this folio over to the eviction. The first check * clears the accessed bit from the initial fault; the second check makes sure * this folio hasn't been used since then. This process, AKA second chance, * requires a minimum of two generations, hence MIN_NR_GENS. And to maintain ABI * compatibility with the active/inactive LRU, e.g., /proc/vmstat, these two * generations are considered active; the rest of generations, if they exist, * are considered inactive. See lru_gen_is_active(). * * PG_active is always cleared while a folio is on one of lrugen->folios[] so * that the sliding window needs not to worry about it. And it's set again when * a folio considered active is isolated for non-reclaiming purposes, e.g., * migration. See lru_gen_add_folio() and lru_gen_del_folio(). * * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the * number of categories of the active/inactive LRU when keeping track of * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits * in folio->flags, masked by LRU_GEN_MASK. */ #define MIN_NR_GENS 2U #define MAX_NR_GENS 4U /* * Each generation is divided into multiple tiers. A folio accessed N times * through file descriptors is in tier order_base_2(N). 
A folio in the first * tier (N=0,1) is marked by PG_referenced unless it was faulted in through page * tables or read ahead. A folio in the last tier (MAX_NR_TIERS-1) is marked by * PG_workingset. A folio in any other tier (1<N<5) between the first and last * is marked by additional bits of LRU_REFS_WIDTH in folio->flags. * * In contrast to moving across generations which requires the LRU lock, moving * across tiers only involves atomic operations on folio->flags and therefore * has a negligible cost in the buffered access path. In the eviction path, * comparisons of refaulted/(evicted+protected) from the first tier and the rest * infer whether folios accessed multiple times through file descriptors are * statistically hot and thus worth protecting. * * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the * number of categories of the active/inactive LRU when keeping track of * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in * folio->flags, masked by LRU_REFS_MASK. */ #define MAX_NR_TIERS 4U #ifndef __GENERATING_BOUNDS_H #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) /* * For folios accessed multiple times through file descriptors, * lru_gen_inc_refs() sets additional bits of LRU_REFS_WIDTH in folio->flags * after PG_referenced, then PG_workingset after LRU_REFS_WIDTH. After all its * bits are set, i.e., LRU_REFS_FLAGS|BIT(PG_workingset), a folio is lazily * promoted into the second oldest generation in the eviction path. And when * folio_inc_gen() does that, it clears LRU_REFS_FLAGS so that * lru_gen_inc_refs() can start over. Note that for this case, LRU_REFS_MASK is * only valid when PG_referenced is set. * * For folios accessed multiple times through page tables, folio_update_gen() * from a page table walk or lru_gen_set_refs() from a rmap walk sets * PG_referenced after the accessed bit is cleared for the first time. * Thereafter, those two paths set PG_workingset and promote folios to the * youngest generation. Like folio_inc_gen(), folio_update_gen() also clears * PG_referenced. Note that for this case, LRU_REFS_MASK is not used. * * For both cases above, after PG_workingset is set on a folio, it remains until * this folio is either reclaimed, or "deactivated" by lru_gen_clear_refs(). It * can be set again if lru_gen_test_recent() returns true upon a refault. */ #define LRU_REFS_FLAGS (LRU_REFS_MASK | BIT(PG_referenced)) struct lruvec; struct page_vma_mapped_walk; #ifdef CONFIG_LRU_GEN enum { LRU_GEN_ANON, LRU_GEN_FILE, }; enum { LRU_GEN_CORE, LRU_GEN_MM_WALK, LRU_GEN_NONLEAF_YOUNG, NR_LRU_GEN_CAPS }; #define MIN_LRU_BATCH BITS_PER_LONG #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) /* whether to keep historical stats from evicted generations */ #ifdef CONFIG_LRU_GEN_STATS #define NR_HIST_GENS MAX_NR_GENS #else #define NR_HIST_GENS 1U #endif /* * The youngest generation number is stored in max_seq for both anon and file * types as they are aged on an equal footing. The oldest generation numbers are * stored in min_seq[] separately for anon and file types so that they can be * incremented independently. Ideally min_seq[] are kept in sync when both anon * and file types are evictable. However, to adapt to situations like extreme * swappiness, they are allowed to be out of sync by at most * MAX_NR_GENS-MIN_NR_GENS-1. 
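 * With the current MIN_NR_GENS=2 and MAX_NR_GENS=4, this allows a skew of at
 * most one generation between the anon and file min_seq[].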
* * The number of pages in each generation is eventually consistent and therefore * can be transiently negative when reset_batch_size() is pending. */ struct lru_gen_folio { /* the aging increments the youngest generation number */ unsigned long max_seq; /* the eviction increments the oldest generation numbers */ unsigned long min_seq[ANON_AND_FILE]; /* the birth time of each generation in jiffies */ unsigned long timestamps[MAX_NR_GENS]; /* the multi-gen LRU lists, lazily sorted on eviction */ struct list_head folios[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the multi-gen LRU sizes, eventually consistent */ long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the exponential moving average of refaulted */ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; /* the exponential moving average of evicted+protected */ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; /* can only be modified under the LRU lock */ unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; /* can be modified without holding the LRU lock */ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; /* whether the multi-gen LRU is enabled */ bool enabled; /* the memcg generation this lru_gen_folio belongs to */ u8 gen; /* the list segment this lru_gen_folio belongs to */ u8 seg; /* per-node lru_gen_folio list for global reclaim */ struct hlist_nulls_node list; }; enum { MM_LEAF_TOTAL, /* total leaf entries */ MM_LEAF_YOUNG, /* young leaf entries */ MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */ MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */ NR_MM_STATS }; /* double-buffering Bloom filters */ #define NR_BLOOM_FILTERS 2 struct lru_gen_mm_state { /* synced with max_seq after each iteration */ unsigned long seq; /* where the current iteration continues after */ struct list_head *head; /* where the last iteration ended before */ struct list_head *tail; /* Bloom filters flip after each iteration */ unsigned long *filters[NR_BLOOM_FILTERS]; /* the mm stats for debugging */ unsigned long stats[NR_HIST_GENS][NR_MM_STATS]; }; struct lru_gen_mm_walk { /* the lruvec under reclaim */ struct lruvec *lruvec; /* max_seq from lru_gen_folio: can be out of date */ unsigned long seq; /* the next address within an mm to scan */ unsigned long next_addr; /* to batch promoted pages */ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* to batch the mm stats */ int mm_stats[NR_MM_STATS]; /* total batched items */ int batched; int swappiness; bool force_scan; }; /* * For each node, memcgs are divided into two generations: the old and the * young. For each generation, memcgs are randomly sharded into multiple bins * to improve scalability. For each bin, the hlist_nulls is virtually divided * into three segments: the head, the tail and the default. * * An onlining memcg is added to the tail of a random bin in the old generation. * The eviction starts at the head of a random bin in the old generation. The * per-node memcg generation counter, whose reminder (mod MEMCG_NR_GENS) indexes * the old generation, is incremented when all its bins become empty. * * There are four operations: * 1. MEMCG_LRU_HEAD, which moves a memcg to the head of a random bin in its * current generation (old or young) and updates its "seg" to "head"; * 2. MEMCG_LRU_TAIL, which moves a memcg to the tail of a random bin in its * current generation (old or young) and updates its "seg" to "tail"; * 3. 
MEMCG_LRU_OLD, which moves a memcg to the head of a random bin in the old * generation, updates its "gen" to "old" and resets its "seg" to "default"; * 4. MEMCG_LRU_YOUNG, which moves a memcg to the tail of a random bin in the * young generation, updates its "gen" to "young" and resets its "seg" to * "default". * * The events that trigger the above operations are: * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD; * 2. The first attempt to reclaim a memcg below low, which triggers * MEMCG_LRU_TAIL; * 3. The first attempt to reclaim a memcg offlined or below reclaimable size * threshold, which triggers MEMCG_LRU_TAIL; * 4. The second attempt to reclaim a memcg offlined or below reclaimable size * threshold, which triggers MEMCG_LRU_YOUNG; * 5. Attempting to reclaim a memcg below min, which triggers MEMCG_LRU_YOUNG; * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG; * 7. Offlining a memcg, which triggers MEMCG_LRU_OLD. * * Notes: * 1. Memcg LRU only applies to global reclaim, and the round-robin incrementing * of their max_seq counters ensures the eventual fairness to all eligible * memcgs. For memcg reclaim, it still relies on mem_cgroup_iter(). * 2. There are only two valid generations: old (seq) and young (seq+1). * MEMCG_NR_GENS is set to three so that when reading the generation counter * locklessly, a stale value (seq-1) does not wraparound to young. */ #define MEMCG_NR_GENS 3 #define MEMCG_NR_BINS 8 struct lru_gen_memcg { /* the per-node memcg generation counter */ unsigned long seq; /* each memcg has one lru_gen_folio per node */ unsigned long nr_memcgs[MEMCG_NR_GENS]; /* per-node lru_gen_folio list for global reclaim */ struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS]; /* protects the above */ spinlock_t lock; }; void lru_gen_init_pgdat(struct pglist_data *pgdat); void lru_gen_init_lruvec(struct lruvec *lruvec); bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr); void lru_gen_init_memcg(struct mem_cgroup *memcg); void lru_gen_exit_memcg(struct mem_cgroup *memcg); void lru_gen_online_memcg(struct mem_cgroup *memcg); void lru_gen_offline_memcg(struct mem_cgroup *memcg); void lru_gen_release_memcg(struct mem_cgroup *memcg); void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid); void max_lru_gen_memcg(struct mem_cgroup *memcg, int nid); bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg, int nid); void lru_gen_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid); #else /* !CONFIG_LRU_GEN */ static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) { } static inline void lru_gen_init_lruvec(struct lruvec *lruvec) { } static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr) { return false; } static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_online_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_release_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) { } static inline void max_lru_gen_memcg(struct mem_cgroup *memcg, int nid) { } static inline bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg, int nid) { return true; } static inline void lru_gen_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid) { } #endif /* CONFIG_LRU_GEN */ struct lruvec { struct 
list_head lists[NR_LRU_LISTS]; /* per lruvec lru_lock for memcg */ spinlock_t lru_lock; /* * These track the cost of reclaiming one LRU - file or anon - * over the other. As the observed cost of reclaiming one LRU * increases, the reclaim scan balance tips toward the other. */ unsigned long anon_cost; unsigned long file_cost; /* Non-resident age, driven by LRU movement */ atomic_long_t nonresident_age; /* Refaults at the time of last reclaim cycle */ unsigned long refaults[ANON_AND_FILE]; /* Various lruvec state flags (enum lruvec_flags) */ unsigned long flags; #ifdef CONFIG_LRU_GEN /* evictable pages divided into generations */ struct lru_gen_folio lrugen; #ifdef CONFIG_LRU_GEN_WALKS_MMU /* to concurrently iterate lru_gen_mm_list */ struct lru_gen_mm_state mm_state; #endif #endif /* CONFIG_LRU_GEN */ #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif struct zswap_lruvec_state zswap_lruvec_state; }; /* Isolate for asynchronous migration */ #define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4) /* Isolate unevictable pages */ #define ISOLATE_UNEVICTABLE ((__force isolate_mode_t)0x8) /* LRU Isolation modes. */ typedef unsigned __bitwise isolate_mode_t; enum zone_watermarks { WMARK_MIN, WMARK_LOW, WMARK_HIGH, WMARK_PROMO, NR_WMARK }; /* * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. Two additional lists * are added for THP. One PCP list is used by GPF_MOVABLE, and the other PCP list * is used by GFP_UNMOVABLE and GFP_RECLAIMABLE. */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define NR_PCP_THP 2 #else #define NR_PCP_THP 0 #endif #define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1)) #define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP) /* * Flags used in pcp->flags field. * * PCPF_PREV_FREE_HIGH_ORDER: a high-order page is freed in the * previous page freeing. To avoid to drain PCP for an accident * high-order page freeing. * * PCPF_FREE_HIGH_BATCH: preserve "pcp->batch" pages in PCP before * draining PCP for consecutive high-order pages freeing without * allocation if data cache slice of CPU is large enough. To reduce * zone lock contention and keep cache-hot pages reusing. */ #define PCPF_PREV_FREE_HIGH_ORDER BIT(0) #define PCPF_FREE_HIGH_BATCH BIT(1) struct per_cpu_pages { spinlock_t lock; /* Protects lists field */ int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ int high_min; /* min high watermark */ int high_max; /* max high watermark */ int batch; /* chunk size for buddy add/remove */ u8 flags; /* protected by pcp->lock */ u8 alloc_factor; /* batch scaling factor during allocate */ #ifdef CONFIG_NUMA u8 expire; /* When 0, remote pagesets are drained */ #endif short free_count; /* consecutive free count */ /* Lists of pages, one per migrate type stored on the pcp-lists */ struct list_head lists[NR_PCP_LISTS]; } ____cacheline_aligned_in_smp; struct per_cpu_zonestat { #ifdef CONFIG_SMP s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; s8 stat_threshold; #endif #ifdef CONFIG_NUMA /* * Low priority inaccurate counters that are only folded * on demand. Use a large type to avoid the overhead of * folding during refresh_cpu_vm_stats. */ unsigned long vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; #endif }; struct per_cpu_nodestat { s8 stat_threshold; s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS]; }; #endif /* !__GENERATING_BOUNDS.H */ enum zone_type { /* * ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able * to DMA to all of the addressable memory (ZONE_NORMAL). 
* On architectures where this area covers the whole 32 bit address * space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller * DMA addressing constraints. This distinction is important as a 32bit * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit * platforms may need both zones as they support peripherals with * different DMA addressing limitations. */ #ifdef CONFIG_ZONE_DMA ZONE_DMA, #endif #ifdef CONFIG_ZONE_DMA32 ZONE_DMA32, #endif /* * Normal addressable memory is in ZONE_NORMAL. DMA operations can be * performed on pages in ZONE_NORMAL if the DMA devices support * transfers to all addressable memory. */ ZONE_NORMAL, #ifdef CONFIG_HIGHMEM /* * A memory area that is only addressable by the kernel through * mapping portions into its own address space. This is for example * used by i386 to allow the kernel to address the memory beyond * 900MB. The kernel will set up special mappings (page * table entries on i386) for each page that the kernel needs to * access. */ ZONE_HIGHMEM, #endif /* * ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains * movable pages with few exceptional cases described below. Main use * cases for ZONE_MOVABLE are to make memory offlining/unplug more * likely to succeed, and to locally limit unmovable allocations - e.g., * to increase the number of THP/huge pages. Notable special cases are: * * 1. Pinned pages: (long-term) pinning of movable pages might * essentially turn such pages unmovable. Therefore, we do not allow * pinning long-term pages in ZONE_MOVABLE. When pages are pinned and * faulted, they come from the right zone right away. However, it is * still possible that address space already has pages in * ZONE_MOVABLE at the time when pages are pinned (i.e. user has * touches that memory before pinning). In such case we migrate them * to a different zone. When migration fails - pinning fails. * 2. memblock allocations: kernelcore/movablecore setups might create * situations where ZONE_MOVABLE contains unmovable allocations * after boot. Memory offlining and allocations fail early. * 3. Memory holes: kernelcore/movablecore setups might create very rare * situations where ZONE_MOVABLE contains memory holes after boot, * for example, if we have sections that are only partially * populated. Memory offlining and allocations fail early. * 4. PG_hwpoison pages: while poisoned pages can be skipped during * memory offlining, such pages cannot be allocated. * 5. Unmovable PG_offline pages: in paravirtualized environments, * hotplugged memory blocks might only partially be managed by the * buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The * parts not manged by the buddy are unmovable PG_offline pages. In * some cases (virtio-mem), such pages can be skipped during * memory offlining, however, cannot be moved/allocated. These * techniques might use alloc_contig_range() to hide previously * exposed pages from the buddy again (e.g., to implement some sort * of memory unplug in virtio-mem). * 6. ZERO_PAGE(0), kernelcore/movablecore setups might create * situations where ZERO_PAGE(0) which is allocated differently * on different platforms may end up in a movable zone. ZERO_PAGE(0) * cannot be migrated. * 7. Memory-hotplug: when using memmap_on_memory and onlining the * memory to the MOVABLE zone, the vmemmap pages are also placed in * such zone. Such pages cannot be really moved around as they are * self-stored in the range, but they are treated as movable when * the range they describe is about to be offlined. 
* * In general, no unmovable allocations that degrade memory offlining * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range()) * have to expect that migrating pages in ZONE_MOVABLE can fail (even * if has_unmovable_pages() states that there are no unmovable pages, * there can be false negatives). */ ZONE_MOVABLE, #ifdef CONFIG_ZONE_DEVICE ZONE_DEVICE, #endif __MAX_NR_ZONES }; #ifndef __GENERATING_BOUNDS_H #define ASYNC_AND_SYNC 2 struct zone { /* Read-mostly fields */ /* zone watermarks, access with *_wmark_pages(zone) macros */ unsigned long _watermark[NR_WMARK]; unsigned long watermark_boost; unsigned long nr_reserved_highatomic; unsigned long nr_free_highatomic; /* * We don't know if the memory that we're going to allocate will be * freeable or/and it will be released eventually, so to avoid totally * wasting several GB of ram we must reserve some of the lower zone * memory (otherwise we risk to run OOM on the lower zones despite * there being tons of freeable ram on the higher zones). This array is * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl * changes. */ long lowmem_reserve[MAX_NR_ZONES]; #ifdef CONFIG_NUMA int node; #endif struct pglist_data *zone_pgdat; struct per_cpu_pages __percpu *per_cpu_pageset; struct per_cpu_zonestat __percpu *per_cpu_zonestats; /* * the high and batch values are copied to individual pagesets for * faster access */ int pageset_high_min; int pageset_high_max; int pageset_batch; #ifndef CONFIG_SPARSEMEM /* * Flags for a pageblock_nr_pages block. See pageblock-flags.h. * In SPARSEMEM, this map is stored in struct mem_section */ unsigned long *pageblock_flags; #endif /* CONFIG_SPARSEMEM */ /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; /* * spanned_pages is the total pages spanned by the zone, including * holes, which is calculated as: * spanned_pages = zone_end_pfn - zone_start_pfn; * * present_pages is physical pages existing within the zone, which * is calculated as: * present_pages = spanned_pages - absent_pages(pages in holes); * * present_early_pages is present pages existing within the zone * located on memory available since early boot, excluding hotplugged * memory. * * managed_pages is present pages managed by the buddy system, which * is calculated as (reserved_pages includes pages allocated by the * bootmem allocator): * managed_pages = present_pages - reserved_pages; * * cma pages is present pages that are assigned for CMA use * (MIGRATE_CMA). * * So present_pages may be used by memory hotplug or memory power * management logic to figure out unmanaged pages by checking * (present_pages - managed_pages). And managed_pages should be used * by page allocator and vm scanner to calculate all kinds of watermarks * and thresholds. * * Locking rules: * * zone_start_pfn and spanned_pages are protected by span_seqlock. * It is a seqlock because it has to be read outside of zone->lock, * and it is done in the main allocator path. But, it is written * quite infrequently. * * The span_seq lock is declared along with zone->lock because it is * frequently read in proximity to zone->lock. It's good to * give them a chance of being in the same cacheline. * * Write access to present_pages at runtime should be protected by * mem_hotplug_begin/done(). Any reader who can't tolerant drift of * present_pages should use get_online_mems() to get a stable value. 
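 * In any case, the definitions above imply
 * managed_pages <= present_pages <= spanned_pages for every zone.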
*/ atomic_long_t managed_pages; unsigned long spanned_pages; unsigned long present_pages; #if defined(CONFIG_MEMORY_HOTPLUG) unsigned long present_early_pages; #endif #ifdef CONFIG_CMA unsigned long cma_pages; #endif const char *name; #ifdef CONFIG_MEMORY_ISOLATION /* * Number of isolated pageblock. It is used to solve incorrect * freepage counting problem due to racy retrieving migratetype * of pageblock. Protected by zone->lock. */ unsigned long nr_isolate_pageblock; #endif #ifdef CONFIG_MEMORY_HOTPLUG /* see spanned/present_pages for more description */ seqlock_t span_seqlock; #endif int initialized; /* Write-intensive fields used from the page allocator */ CACHELINE_PADDING(_pad1_); /* free areas of different sizes */ struct free_area free_area[NR_PAGE_ORDERS]; #ifdef CONFIG_UNACCEPTED_MEMORY /* Pages to be accepted. All pages on the list are MAX_PAGE_ORDER */ struct list_head unaccepted_pages; /* To be called once the last page in the zone is accepted */ struct work_struct unaccepted_cleanup; #endif /* zone flags, see below */ unsigned long flags; /* Primarily protects free_area */ spinlock_t lock; /* Pages to be freed when next trylock succeeds */ struct llist_head trylock_free_pages; /* Write-intensive fields used by compaction and vmstats. */ CACHELINE_PADDING(_pad2_); /* * When free pages are below this point, additional steps are taken * when reading the number of free pages to avoid per-cpu counter * drift allowing watermarks to be breached */ unsigned long percpu_drift_mark; #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* pfn where compaction free scanner should start */ unsigned long compact_cached_free_pfn; /* pfn where compaction migration scanner should start */ unsigned long compact_cached_migrate_pfn[ASYNC_AND_SYNC]; unsigned long compact_init_migrate_pfn; unsigned long compact_init_free_pfn; #endif #ifdef CONFIG_COMPACTION /* * On compaction failure, 1<<compact_defer_shift compactions * are skipped before trying again. The number attempted since * last failure is tracked with compact_considered. * compact_order_failed is the minimum compaction failed order. */ unsigned int compact_considered; unsigned int compact_defer_shift; int compact_order_failed; #endif #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* Set to true when the PG_migrate_skip bits should be cleared */ bool compact_blockskip_flush; #endif bool contiguous; CACHELINE_PADDING(_pad3_); /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP struct page *vmemmap_tails[NR_VMEMMAP_TAILS]; #endif } ____cacheline_internodealigned_in_smp; enum pgdat_flags { PGDAT_WRITEBACK, /* reclaim scanning has recently found * many pages under writeback */ PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */ }; enum zone_flags { ZONE_BOOSTED_WATERMARK, /* zone recently boosted watermarks. * Cleared when kswapd is woken. */ ZONE_RECLAIM_ACTIVE, /* kswapd may be scanning the zone. */ ZONE_BELOW_HIGH, /* zone is below high watermark. 
*/ }; static inline unsigned long wmark_pages(const struct zone *z, enum zone_watermarks w) { return z->_watermark[w] + z->watermark_boost; } static inline unsigned long min_wmark_pages(const struct zone *z) { return wmark_pages(z, WMARK_MIN); } static inline unsigned long low_wmark_pages(const struct zone *z) { return wmark_pages(z, WMARK_LOW); } static inline unsigned long high_wmark_pages(const struct zone *z) { return wmark_pages(z, WMARK_HIGH); } static inline unsigned long promo_wmark_pages(const struct zone *z) { return wmark_pages(z, WMARK_PROMO); } static inline unsigned long zone_managed_pages(const struct zone *zone) { return (unsigned long)atomic_long_read(&zone->managed_pages); } static inline unsigned long zone_cma_pages(struct zone *zone) { #ifdef CONFIG_CMA return zone->cma_pages; #else return 0; #endif } static inline unsigned long zone_end_pfn(const struct zone *zone) { return zone->zone_start_pfn + zone->spanned_pages; } static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn) { return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone); } static inline bool zone_is_initialized(const struct zone *zone) { return zone->initialized; } static inline bool zone_is_empty(const struct zone *zone) { return zone->spanned_pages == 0; } #ifndef BUILD_VDSO32_64 /* * The zone field is never updated after free_area_init_core() * sets it, so none of the operations on it need to be atomic. */ /* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */ #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) #define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH) #define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH) /* * Define the bit shifts to access each section. For non-existent * sections we define the shift as 0; that plus a 0 mask ensures * the compiler will optimise away reference to them. */ #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) #define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0)) #define KASAN_TAG_PGSHIFT (KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0)) /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ #ifdef NODE_NOT_IN_PAGE_FLAGS #define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) #define ZONEID_PGOFF ((SECTIONS_PGOFF < ZONES_PGOFF) ? \ SECTIONS_PGOFF : ZONES_PGOFF) #else #define ZONEID_SHIFT (NODES_SHIFT + ZONES_SHIFT) #define ZONEID_PGOFF ((NODES_PGOFF < ZONES_PGOFF) ? 
\ NODES_PGOFF : ZONES_PGOFF) #endif #define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0)) #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) #define NODES_MASK ((1UL << NODES_WIDTH) - 1) #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) #define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_SHIFT) - 1) #define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1) #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) static inline enum zone_type memdesc_zonenum(memdesc_flags_t flags) { ASSERT_EXCLUSIVE_BITS(flags.f, ZONES_MASK << ZONES_PGSHIFT); return (flags.f >> ZONES_PGSHIFT) & ZONES_MASK; } static inline enum zone_type page_zonenum(const struct page *page) { return memdesc_zonenum(page->flags); } static inline enum zone_type folio_zonenum(const struct folio *folio) { return memdesc_zonenum(folio->flags); } #ifdef CONFIG_ZONE_DEVICE static inline bool memdesc_is_zone_device(memdesc_flags_t mdf) { return memdesc_zonenum(mdf) == ZONE_DEVICE; } static inline struct dev_pagemap *page_pgmap(const struct page *page) { VM_WARN_ON_ONCE_PAGE(!memdesc_is_zone_device(page->flags), page); return page_folio(page)->pgmap; } /* * Consecutive zone device pages should not be merged into the same sgl * or bvec segment with other types of pages or if they belong to different * pgmaps. Otherwise getting the pgmap of a given segment is not possible * without scanning the entire segment. This helper returns true either if * both pages are not zone device pages or both pages are zone device pages * with the same pgmap. */ static inline bool zone_device_pages_have_same_pgmap(const struct page *a, const struct page *b) { if (memdesc_is_zone_device(a->flags) != memdesc_is_zone_device(b->flags)) return false; if (!memdesc_is_zone_device(a->flags)) return true; return page_pgmap(a) == page_pgmap(b); } extern void memmap_init_zone_device(struct zone *, unsigned long, unsigned long, struct dev_pagemap *); #else static inline bool memdesc_is_zone_device(memdesc_flags_t mdf) { return false; } static inline bool zone_device_pages_have_same_pgmap(const struct page *a, const struct page *b) { return true; } static inline struct dev_pagemap *page_pgmap(const struct page *page) { return NULL; } #endif static inline bool is_zone_device_page(const struct page *page) { return memdesc_is_zone_device(page->flags); } static inline bool folio_is_zone_device(const struct folio *folio) { return memdesc_is_zone_device(folio->flags); } static inline bool is_zone_movable_page(const struct page *page) { return page_zonenum(page) == ZONE_MOVABLE; } static inline bool folio_is_zone_movable(const struct folio *folio) { return folio_zonenum(folio) == ZONE_MOVABLE; } #endif /* * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty * intersection with the given zone */ static inline bool zone_intersects(const struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { if (zone_is_empty(zone)) return false; if (start_pfn >= zone_end_pfn(zone) || start_pfn + nr_pages <= zone->zone_start_pfn) return false; return true; } /* * The "priority" of VM scanning is how much of the queues we will scan in one * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the * queues ("queue_length >> 12") during an aging round. 
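 * Each step down in priority doubles that fraction, so a priority of 0 makes
 * the entire queue length eligible in a single pass.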
*/ #define DEF_PRIORITY 12 /* Maximum number of zones on a zonelist */ #define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES) enum { ZONELIST_FALLBACK, /* zonelist with fallback */ #ifdef CONFIG_NUMA /* * The NUMA zonelists are doubled because we need zonelists that * restrict the allocations to a single node for __GFP_THISNODE. */ ZONELIST_NOFALLBACK, /* zonelist without fallback (__GFP_THISNODE) */ #endif MAX_ZONELISTS }; /* * This struct contains information about a zone in a zonelist. It is stored * here to avoid dereferences into large structures and lookups of tables */ struct zoneref { struct zone *zone; /* Pointer to actual zone */ int zone_idx; /* zone_idx(zoneref->zone) */ }; /* * One allocation request operates on a zonelist. A zonelist * is a list of zones, the first one is the 'goal' of the * allocation, the other zones are fallback zones, in decreasing * priority. * * To speed the reading of the zonelist, the zonerefs contain the zone index * of the entry being read. Helper functions to access information given * a struct zoneref are * * zonelist_zone() - Return the struct zone * for an entry in _zonerefs * zonelist_zone_idx() - Return the index of the zone for an entry * zonelist_node_idx() - Return the index of the node for an entry */ struct zonelist { struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1]; }; /* * The array of struct pages for flatmem. * It must be declared for SPARSEMEM as well because there are configurations * that rely on that. */ extern struct page *mem_map; #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split { spinlock_t split_queue_lock; struct list_head split_queue; unsigned long split_queue_len; }; #endif #ifdef CONFIG_MEMORY_FAILURE /* * Per NUMA node memory failure handling statistics. */ struct memory_failure_stats { /* * Number of raw pages poisoned. * Cases not accounted: memory outside kernel control, offline page, * arch-specific memory_failure (SGX), hwpoison_filter() filtered * error events, and unpoison actions from hwpoison_unpoison. */ unsigned long total; /* * Recovery results of poisoned raw pages handled by memory_failure, * in sync with mf_result. * total = ignored + failed + delayed + recovered. * total * PAGE_SIZE * #nodes = /proc/meminfo/HardwareCorrupted. */ unsigned long ignored; unsigned long failed; unsigned long delayed; unsigned long recovered; }; #endif /* * On NUMA machines, each NUMA node would have a pg_data_t to describe * it's memory layout. On UMA machines there is a single pglist_data which * describes the whole memory. * * Memory statistics and page replacement data structures are maintained on a * per-zone basis. */ typedef struct pglist_data { /* * node_zones contains just the zones for THIS node. Not all of the * zones may be populated, but it is the full list. It is referenced by * this node's node_zonelists as well as other node's node_zonelists. */ struct zone node_zones[MAX_NR_ZONES]; /* * node_zonelists contains references to all zones in all nodes. * Generally the first zones will be references to this node's * node_zones. */ struct zonelist node_zonelists[MAX_ZONELISTS]; int nr_zones; /* number of populated zones in this node */ #ifdef CONFIG_FLATMEM /* means !SPARSEMEM */ struct page *node_mem_map; #ifdef CONFIG_PAGE_EXTENSION struct page_ext *node_page_ext; #endif #endif #if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT) /* * Must be held any time you expect node_start_pfn, * node_present_pages, node_spanned_pages or nr_zones to stay constant. 
* Also synchronizes pgdat->first_deferred_pfn during deferred page * init. * * pgdat_resize_lock() and pgdat_resize_unlock() are provided to * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG * or CONFIG_DEFERRED_STRUCT_PAGE_INIT. * * Nests above zone->lock and zone->span_seqlock */ spinlock_t node_size_lock; #endif unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ unsigned long node_spanned_pages; /* total size of physical page range, including holes */ int node_id; wait_queue_head_t kswapd_wait; wait_queue_head_t pfmemalloc_wait; /* workqueues for throttling reclaim for different reasons. */ wait_queue_head_t reclaim_wait[NR_VMSCAN_THROTTLE]; atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */ unsigned long nr_reclaim_start; /* nr pages written while throttled * when throttling started. */ #ifdef CONFIG_MEMORY_HOTPLUG struct mutex kswapd_lock; #endif struct task_struct *kswapd; /* Protected by kswapd_lock */ int kswapd_order; enum zone_type kswapd_highest_zoneidx; atomic_t kswapd_failures; /* Number of 'reclaimed == 0' runs */ #ifdef CONFIG_COMPACTION int kcompactd_max_order; enum zone_type kcompactd_highest_zoneidx; wait_queue_head_t kcompactd_wait; struct task_struct *kcompactd; bool proactive_compact_trigger; #endif /* * This is a per-node reserve of pages that are not available * to userspace allocations. */ unsigned long totalreserve_pages; #ifdef CONFIG_NUMA /* * node reclaim becomes active if more unmapped pages exist. */ unsigned long min_unmapped_pages; unsigned long min_slab_pages; #endif /* CONFIG_NUMA */ /* Write-intensive fields used by page reclaim */ CACHELINE_PADDING(_pad1_); #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * If memory initialisation on large machines is deferred then this * is the first PFN that needs to be initialised. */ unsigned long first_deferred_pfn; #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split deferred_split_queue; #endif #ifdef CONFIG_NUMA_BALANCING /* start time in ms of current promote rate limit period */ unsigned int nbp_rl_start; /* number of promote candidate pages at start time of current rate limit period */ unsigned long nbp_rl_nr_cand; /* promote threshold in ms */ unsigned int nbp_threshold; /* start time in ms of current promote threshold adjustment period */ unsigned int nbp_th_start; /* * number of promote candidate pages at start time of current promote * threshold adjustment period */ unsigned long nbp_th_nr_cand; #endif /* Fields commonly accessed by the page reclaim scanner */ /* * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED. * * Use mem_cgroup_lruvec() to look up lruvecs. 
*/ struct lruvec __lruvec; unsigned long flags; #ifdef CONFIG_LRU_GEN /* kswap mm walk data */ struct lru_gen_mm_walk mm_walk; /* lru_gen_folio list */ struct lru_gen_memcg memcg_lru; #endif CACHELINE_PADDING(_pad2_); /* Per-node vmstats */ struct per_cpu_nodestat __percpu *per_cpu_nodestats; atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS]; #ifdef CONFIG_NUMA struct memory_tier __rcu *memtier; #endif #ifdef CONFIG_MEMORY_FAILURE struct memory_failure_stats mf_stats; #endif } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) { return pgdat->node_start_pfn + pgdat->node_spanned_pages; } #include <linux/memory_hotplug.h> void build_all_zonelists(pg_data_t *pgdat); bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags, long free_pages); bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags); enum kswapd_clear_hopeless_reason { KSWAPD_CLEAR_HOPELESS_OTHER = 0, KSWAPD_CLEAR_HOPELESS_KSWAPD, KSWAPD_CLEAR_HOPELESS_DIRECT, KSWAPD_CLEAR_HOPELESS_PCP, }; void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order, enum zone_type highest_zoneidx); void kswapd_try_clear_hopeless(struct pglist_data *pgdat, unsigned int order, int highest_zoneidx); void kswapd_clear_hopeless(pg_data_t *pgdat, enum kswapd_clear_hopeless_reason reason); bool kswapd_test_hopeless(pg_data_t *pgdat); /* * Memory initialization context, use to differentiate memory added by * the platform statically or via memory hotplug interface. */ enum meminit_context { MEMINIT_EARLY, MEMINIT_HOTPLUG, }; extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, unsigned long size); extern void lruvec_init(struct lruvec *lruvec); static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec) { #ifdef CONFIG_MEMCG return lruvec->pgdat; #else return container_of(lruvec, struct pglist_data, __lruvec); #endif } #ifdef CONFIG_HAVE_MEMORYLESS_NODES int local_memory_node(int node_id); #else static inline int local_memory_node(int node_id) { return node_id; }; #endif /* * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. */ #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) #ifdef CONFIG_ZONE_DEVICE static inline bool zone_is_zone_device(const struct zone *zone) { return zone_idx(zone) == ZONE_DEVICE; } #else static inline bool zone_is_zone_device(const struct zone *zone) { return false; } #endif /* * Returns true if a zone has pages managed by the buddy allocator. * All the reclaim decisions have to use this function rather than * populated_zone(). If the whole zone is reserved then we can easily * end up with populated_zone() && !managed_zone(). 
*/ static inline bool managed_zone(const struct zone *zone) { return zone_managed_pages(zone); } /* Returns true if a zone has memory */ static inline bool populated_zone(const struct zone *zone) { return zone->present_pages; } #ifdef CONFIG_NUMA static inline int zone_to_nid(const struct zone *zone) { return zone->node; } static inline void zone_set_nid(struct zone *zone, int nid) { zone->node = nid; } #else static inline int zone_to_nid(const struct zone *zone) { return 0; } static inline void zone_set_nid(struct zone *zone, int nid) {} #endif extern int movable_zone; static inline int is_highmem_idx(enum zone_type idx) { #ifdef CONFIG_HIGHMEM return (idx == ZONE_HIGHMEM || (idx == ZONE_MOVABLE && movable_zone == ZONE_HIGHMEM)); #else return 0; #endif } /** * is_highmem - helper function to quickly check if a struct zone is a * highmem zone or not. This is an attempt to keep references * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. * @zone: pointer to struct zone variable * Return: 1 for a highmem zone, 0 otherwise */ static inline int is_highmem(const struct zone *zone) { return is_highmem_idx(zone_idx(zone)); } bool has_managed_zone(enum zone_type zone); static inline bool has_managed_dma(void) { #ifdef CONFIG_ZONE_DMA return has_managed_zone(ZONE_DMA); #else return false; #endif } #ifndef CONFIG_NUMA extern struct pglist_data contig_page_data; static inline struct pglist_data *NODE_DATA(int nid) { return &contig_page_data; } #else /* CONFIG_NUMA */ #include <asm/mmzone.h> #endif /* !CONFIG_NUMA */ extern struct pglist_data *first_online_pgdat(void); extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); extern struct zone *next_zone(struct zone *zone); /** * for_each_online_pgdat - helper macro to iterate over all online nodes * @pgdat: pointer to a pg_data_t variable */ #define for_each_online_pgdat(pgdat) \ for (pgdat = first_online_pgdat(); \ pgdat; \ pgdat = next_online_pgdat(pgdat)) /** * for_each_zone - helper macro to iterate over all memory zones * @zone: pointer to struct zone variable * * The user only needs to declare the zone variable, for_each_zone * fills it in. */ #define for_each_zone(zone) \ for (zone = (first_online_pgdat())->node_zones; \ zone; \ zone = next_zone(zone)) #define for_each_populated_zone(zone) \ for (zone = (first_online_pgdat())->node_zones; \ zone; \ zone = next_zone(zone)) \ if (!populated_zone(zone)) \ ; /* do nothing */ \ else static inline struct zone *zonelist_zone(struct zoneref *zoneref) { return zoneref->zone; } static inline int zonelist_zone_idx(const struct zoneref *zoneref) { return zoneref->zone_idx; } static inline int zonelist_node_idx(const struct zoneref *zoneref) { return zone_to_nid(zoneref->zone); } struct zoneref *__next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, nodemask_t *nodes); /** * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point * @z: The cursor used as a starting point for the search * @highest_zoneidx: The zone index of the highest zone to return * @nodes: An optional nodemask to filter the zonelist with * * This function returns the next zone at or below a given zone index that is * within the allowed nodemask using a cursor as the starting point for the * search. The zoneref returned is a cursor that represents the current zone * being examined. It should be advanced by one before calling * next_zones_zonelist again. 
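 * (for_each_zone_zonelist_nodemask() below does exactly that, advancing the
 * cursor with ++z between calls.)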
* * Return: the next zone at or below highest_zoneidx within the allowed * nodemask using a cursor within a zonelist as a starting point */ static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, nodemask_t *nodes) { if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx)) return z; return __next_zones_zonelist(z, highest_zoneidx, nodes); } /** * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist * @zonelist: The zonelist to search for a suitable zone * @highest_zoneidx: The zone index of the highest zone to return * @nodes: An optional nodemask to filter the zonelist with * * This function returns the first zone at or below a given zone index that is * within the allowed nodemask. The zoneref returned is a cursor that can be * used to iterate the zonelist with next_zones_zonelist by advancing it by * one before calling. * * When no eligible zone is found, zoneref->zone is NULL (zoneref itself is * never NULL). This may happen either genuinely, or due to concurrent nodemask * update due to cpuset modification. * * Return: Zoneref pointer for the first suitable zone found */ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, enum zone_type highest_zoneidx, nodemask_t *nodes) { return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes); } /** * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask * @zone: The current zone in the iterator * @z: The current pointer within zonelist->_zonerefs being iterated * @zlist: The zonelist being iterated * @highidx: The zone index of the highest zone to return * @nodemask: Nodemask allowed by the allocator * * This iterator iterates though all zones at or below a given zone index and * within a given nodemask */ #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z); \ zone; \ z = next_zones_zonelist(++z, highidx, nodemask), \ zone = zonelist_zone(z)) #define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \ for (zone = zonelist_zone(z); \ zone; \ z = next_zones_zonelist(++z, highidx, nodemask), \ zone = zonelist_zone(z)) /** * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index * @zone: The current zone in the iterator * @z: The current pointer within zonelist->zones being iterated * @zlist: The zonelist being iterated * @highidx: The zone index of the highest zone to return * * This iterator iterates though all zones at or below a given zone index. */ #define for_each_zone_zonelist(zone, z, zlist, highidx) \ for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL) /* Whether the 'nodes' are all movable nodes */ static inline bool movable_only_nodes(nodemask_t *nodes) { struct zonelist *zonelist; struct zoneref *z; int nid; if (nodes_empty(*nodes)) return false; /* * We can chose arbitrary node from the nodemask to get a * zonelist as they are interlinked. We just need to find * at least one zone that can satisfy kernel allocations. */ nid = first_node(*nodes); zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, ZONE_NORMAL, nodes); return (!zonelist_zone(z)) ? 
true : false; } #ifdef CONFIG_SPARSEMEM #include <asm/sparsemem.h> #endif #ifdef CONFIG_FLATMEM #define pfn_to_nid(pfn) (0) #endif #ifdef CONFIG_SPARSEMEM /* * PA_SECTION_SHIFT physical address to/from section number * PFN_SECTION_SHIFT pfn to/from section number */ #define PA_SECTION_SHIFT (SECTION_SIZE_BITS) #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) #define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT) #define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT) #define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1)) #define SECTION_BLOCKFLAGS_BITS \ ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS) #if (MAX_PAGE_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS #error Allocator MAX_PAGE_ORDER exceeds SECTION_SIZE #endif static inline unsigned long pfn_to_section_nr(unsigned long pfn) { return pfn >> PFN_SECTION_SHIFT; } static inline unsigned long section_nr_to_pfn(unsigned long sec) { return sec << PFN_SECTION_SHIFT; } #define SECTION_ALIGN_UP(pfn) (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK) #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) #define SUBSECTION_SHIFT 21 #define SUBSECTION_SIZE (1UL << SUBSECTION_SHIFT) #define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT) #define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT) #define PAGE_SUBSECTION_MASK (~(PAGES_PER_SUBSECTION-1)) #if SUBSECTION_SHIFT > SECTION_SIZE_BITS #error Subsection size exceeds section size #else #define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT)) #endif #define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION) #define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK) struct mem_section_usage { struct rcu_head rcu; #ifdef CONFIG_SPARSEMEM_VMEMMAP DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION); #endif /* See declaration of similar field in struct zone */ unsigned long pageblock_flags[0]; }; struct page; struct page_ext; struct mem_section { /* * This is, logically, a pointer to an array of struct * pages. However, it is stored with some other magic. * (see sparse_init_one_section()) * * Additionally during early boot we encode node id of * the location of the section here to guide allocation. * (see sparse.c::memory_present()) * * Making it a UL at least makes someone do a cast * before using it wrong. */ unsigned long section_mem_map; struct mem_section_usage *usage; #ifdef CONFIG_PAGE_EXTENSION /* * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use * section. (see page_ext.h about this.) */ struct page_ext *page_ext; unsigned long pad; #endif /* * WARNING: mem_section must be a power-of-2 in size for the * calculation and use of SECTION_ROOT_MASK to make sense. 
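 * In particular, the "nr & SECTION_ROOT_MASK" lookup in __nr_to_section() is
 * only equivalent to a modulo by SECTIONS_PER_ROOT when SECTIONS_PER_ROOT is
 * a power of two.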
*/ }; #ifdef CONFIG_SPARSEMEM_EXTREME #define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section)) #else #define SECTIONS_PER_ROOT 1 #endif #define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT) #define NR_SECTION_ROOTS DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT) #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) #ifdef CONFIG_SPARSEMEM_EXTREME extern struct mem_section **mem_section; #else extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; #endif static inline unsigned long *section_to_usemap(struct mem_section *ms) { return ms->usage->pageblock_flags; } static inline struct mem_section *__nr_to_section(unsigned long nr) { unsigned long root = SECTION_NR_TO_ROOT(nr); if (unlikely(root >= NR_SECTION_ROOTS)) return NULL; #ifdef CONFIG_SPARSEMEM_EXTREME if (!mem_section || !mem_section[root]) return NULL; #endif return &mem_section[root][nr & SECTION_ROOT_MASK]; } extern size_t mem_section_usage_size(void); /* * We use the lower bits of the mem_map pointer to store a little bit of * information. The pointer is calculated as mem_map - section_nr_to_pfn(). * The result is aligned to the minimum alignment of the two values: * * 1. All mem_map arrays are page-aligned. * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT lowest bits. * * We always expect a single section to cover full pages. Therefore, * we can safely assume that PFN_SECTION_SHIFT is large enough to * accommodate SECTION_MAP_LAST_BIT. We use BUILD_BUG_ON() to ensure this. */ enum { SECTION_MARKED_PRESENT_BIT, SECTION_HAS_MEM_MAP_BIT, SECTION_IS_ONLINE_BIT, SECTION_IS_EARLY_BIT, #ifdef CONFIG_ZONE_DEVICE SECTION_TAINT_ZONE_DEVICE_BIT, #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT SECTION_IS_VMEMMAP_PREINIT_BIT, #endif SECTION_MAP_LAST_BIT, }; #define SECTION_MARKED_PRESENT BIT(SECTION_MARKED_PRESENT_BIT) #define SECTION_HAS_MEM_MAP BIT(SECTION_HAS_MEM_MAP_BIT) #define SECTION_IS_ONLINE BIT(SECTION_IS_ONLINE_BIT) #define SECTION_IS_EARLY BIT(SECTION_IS_EARLY_BIT) #ifdef CONFIG_ZONE_DEVICE #define SECTION_TAINT_ZONE_DEVICE BIT(SECTION_TAINT_ZONE_DEVICE_BIT) #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT #define SECTION_IS_VMEMMAP_PREINIT BIT(SECTION_IS_VMEMMAP_PREINIT_BIT) #endif #define SECTION_MAP_MASK (~(BIT(SECTION_MAP_LAST_BIT) - 1)) #define SECTION_NID_SHIFT SECTION_MAP_LAST_BIT static inline struct page *__section_mem_map_addr(struct mem_section *section) { unsigned long map = section->section_mem_map; map &= SECTION_MAP_MASK; return (struct page *)map; } static inline int present_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_MARKED_PRESENT)); } static inline int present_section_nr(unsigned long nr) { return present_section(__nr_to_section(nr)); } static inline int valid_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP)); } static inline int early_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_EARLY)); } static inline int valid_section_nr(unsigned long nr) { return valid_section(__nr_to_section(nr)); } static inline int online_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_ONLINE)); } #ifdef CONFIG_ZONE_DEVICE static inline int online_device_section(const struct mem_section *section) { unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE; return section && ((section->section_mem_map & flags) == flags); } #else static inline int 
online_device_section(const struct mem_section *section) { return 0; } #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT static inline int preinited_vmemmap_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_VMEMMAP_PREINIT)); } void sparse_vmemmap_init_nid_early(int nid); void sparse_vmemmap_init_nid_late(int nid); #else static inline int preinited_vmemmap_section(const struct mem_section *section) { return 0; } static inline void sparse_vmemmap_init_nid_early(int nid) { } static inline void sparse_vmemmap_init_nid_late(int nid) { } #endif static inline int online_section_nr(unsigned long nr) { return online_section(__nr_to_section(nr)); } #ifdef CONFIG_MEMORY_HOTPLUG void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn); void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn); #endif static inline struct mem_section *__pfn_to_section(unsigned long pfn) { return __nr_to_section(pfn_to_section_nr(pfn)); } extern unsigned long __highest_present_section_nr; static inline int subsection_map_index(unsigned long pfn) { return (pfn & ~(PAGE_SECTION_MASK)) / PAGES_PER_SUBSECTION; } #ifdef CONFIG_SPARSEMEM_VMEMMAP static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) { int idx = subsection_map_index(pfn); struct mem_section_usage *usage = READ_ONCE(ms->usage); return usage ? test_bit(idx, usage->subsection_map) : 0; } static inline bool pfn_section_first_valid(struct mem_section *ms, unsigned long *pfn) { struct mem_section_usage *usage = READ_ONCE(ms->usage); int idx = subsection_map_index(*pfn); unsigned long bit; if (!usage) return false; if (test_bit(idx, usage->subsection_map)) return true; /* Find the next subsection that exists */ bit = find_next_bit(usage->subsection_map, SUBSECTIONS_PER_SECTION, idx); if (bit == SUBSECTIONS_PER_SECTION) return false; *pfn = (*pfn & PAGE_SECTION_MASK) + (bit * PAGES_PER_SUBSECTION); return true; } #else static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) { return 1; } static inline bool pfn_section_first_valid(struct mem_section *ms, unsigned long *pfn) { return true; } #endif void sparse_init_early_section(int nid, struct page *map, unsigned long pnum, unsigned long flags); #ifndef CONFIG_HAVE_ARCH_PFN_VALID /** * pfn_valid - check if there is a valid memory map entry for a PFN * @pfn: the page frame number to check * * Check if there is a valid memory map entry aka struct page for the @pfn. * Note, that availability of the memory map entry does not imply that * there is actual usable memory at that @pfn. The struct page may * represent a hole or an unusable page frame. * * Return: 1 for PFNs that have memory map entries and 0 otherwise */ static inline int pfn_valid(unsigned long pfn) { struct mem_section *ms; int ret; /* * Ensure the upper PAGE_SHIFT bits are clear in the * pfn. Else it might lead to false positives when * some of the upper bits are set, but the lower bits * match a valid pfn. */ if (PHYS_PFN(PFN_PHYS(pfn)) != pfn) return 0; if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; ms = __pfn_to_section(pfn); rcu_read_lock_sched(); if (!valid_section(ms)) { rcu_read_unlock_sched(); return 0; } /* * Traditionally early sections always returned pfn_valid() for * the entire section-sized span. 
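 * Sections that are not early fall back to pfn_section_valid(), which checks
 * the subsection map on SPARSEMEM_VMEMMAP configurations, as the expression
 * below does.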
*/ ret = early_section(ms) || pfn_section_valid(ms, pfn); rcu_read_unlock_sched(); return ret; } /* Returns end_pfn or higher if no valid PFN remaining in range */ static inline unsigned long first_valid_pfn(unsigned long pfn, unsigned long end_pfn) { unsigned long nr = pfn_to_section_nr(pfn); rcu_read_lock_sched(); while (nr <= __highest_present_section_nr && pfn < end_pfn) { struct mem_section *ms = __pfn_to_section(pfn); if (valid_section(ms) && (early_section(ms) || pfn_section_first_valid(ms, &pfn))) { rcu_read_unlock_sched(); return pfn; } /* Nothing left in this section? Skip to next section */ nr++; pfn = section_nr_to_pfn(nr); } rcu_read_unlock_sched(); return end_pfn; } static inline unsigned long next_valid_pfn(unsigned long pfn, unsigned long end_pfn) { pfn++; if (pfn >= end_pfn) return end_pfn; /* * Either every PFN within the section (or subsection for VMEMMAP) is * valid, or none of them are. So there's no point repeating the check * for every PFN; only call first_valid_pfn() again when crossing a * (sub)section boundary (i.e. !(pfn & ~PAGE_{SUB,}SECTION_MASK)). */ if (pfn & ~(IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP) ? PAGE_SUBSECTION_MASK : PAGE_SECTION_MASK)) return pfn; return first_valid_pfn(pfn, end_pfn); } #define for_each_valid_pfn(_pfn, _start_pfn, _end_pfn) \ for ((_pfn) = first_valid_pfn((_start_pfn), (_end_pfn)); \ (_pfn) < (_end_pfn); \ (_pfn) = next_valid_pfn((_pfn), (_end_pfn))) #endif static inline int pfn_in_present_section(unsigned long pfn) { if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; return present_section(__pfn_to_section(pfn)); } static inline unsigned long next_present_section_nr(unsigned long section_nr) { while (++section_nr <= __highest_present_section_nr) { if (present_section_nr(section_nr)) return section_nr; } return -1; } #define for_each_present_section_nr(start, section_nr) \ for (section_nr = next_present_section_nr(start - 1); \ section_nr != -1; \ section_nr = next_present_section_nr(section_nr)) /* * These are _only_ used during initialisation, therefore they * can use __initdata ... They could have names to indicate * this restriction. */ #ifdef CONFIG_NUMA #define pfn_to_nid(pfn) \ ({ \ unsigned long __pfn_to_nid_pfn = (pfn); \ page_to_nid(pfn_to_page(__pfn_to_nid_pfn)); \ }) #else #define pfn_to_nid(pfn) (0) #endif #else #define sparse_vmemmap_init_nid_early(_nid) do {} while (0) #define sparse_vmemmap_init_nid_late(_nid) do {} while (0) #define pfn_in_present_section pfn_valid #endif /* CONFIG_SPARSEMEM */ /* * Fallback case for when the architecture provides its own pfn_valid() but * not a corresponding for_each_valid_pfn(). */ #ifndef for_each_valid_pfn #define for_each_valid_pfn(_pfn, _start_pfn, _end_pfn) \ for ((_pfn) = (_start_pfn); (_pfn) < (_end_pfn); (_pfn)++) \ if (pfn_valid(_pfn)) #endif #endif /* !__GENERATING_BOUNDS.H */ #endif /* !__ASSEMBLY__ */ #endif /* _LINUX_MMZONE_H */ |
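/*
 * Illustrative sketch (not part of the header above): one way a caller
 * could use for_each_valid_pfn() to visit only those PFNs in a range
 * that have a memory map entry. The helper name and its purpose are
 * invented for the example; real callers typically derive the range
 * from a zone, a memory block, or a hot(un)plug request.
 */
static inline unsigned long count_valid_pfns(unsigned long start_pfn,
					     unsigned long end_pfn)
{
	unsigned long pfn, nr = 0;

	for_each_valid_pfn(pfn, start_pfn, end_pfn) {
		/*
		 * A struct page exists for @pfn here, although it may
		 * describe a hole rather than usable memory.
		 */
		nr++;
	}

	return nr;
}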
// SPDX-License-Identifier: GPL-2.0

#include <kunit/visibility.h>

#include <linux/kernel.h>
#include <linux/irqflags.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/bug.h>

#include "printk_ringbuffer.h"
#include "internal.h"

/**
 * DOC: printk_ringbuffer overview
 *
 * Data Structure
 * --------------
 * The printk_ringbuffer is made up of 2 internal ringbuffers:
 *
 *   desc_ring
 *     A ring of descriptors and their meta data (such as sequence number,
 *     timestamp, loglevel, etc.) as well as internal state information about
 *     the record and logical positions specifying where in the other
 *     ringbuffer the text strings are located.
 *
 *   text_data_ring
 *     A ring of data blocks. A data block consists of an unsigned long
 *     integer (ID) that maps to a desc_ring index followed by the text
 *     string of the record.
 *
 * The internal state information of a descriptor is the key element to allow
 * readers and writers to locklessly synchronize access to the data.
 *
 * Implementation
 * --------------
 *
 * Descriptor Ring
 * ~~~~~~~~~~~~~~~
 * The descriptor ring is an array of descriptors. A descriptor contains
 * essential meta data to track the data of a printk record using
 * blk_lpos structs pointing to associated text data blocks (see
 * "Data Ring" below). Each descriptor is assigned an ID that maps
 * directly to index values of the descriptor array and has a state. The ID
 * and the state are bitwise combined into a single descriptor field named
 * @state_var, allowing ID and state to be synchronously and atomically
 * updated.
 *
 * Descriptors have four states:
 *
 *   reserved
 *     A writer is modifying the record.
 *
 *   committed
 *     The record and all its data are written. A writer can reopen the
 *     descriptor (transitioning it back to reserved), but in the committed
 *     state the data is consistent.
 *
 *   finalized
 *     The record and all its data are complete and available for reading. A
 *     writer cannot reopen the descriptor.
 *
 *   reusable
 *     The record exists, but its text and/or meta data may no longer be
 *     available.
 *
 * Querying the @state_var of a record requires providing the ID of the
 * descriptor to query. This can yield a possible fifth (pseudo) state:
 *
 *   miss
 *     The descriptor being queried has an unexpected ID.
 *
 * The descriptor ring has a @tail_id that contains the ID of the oldest
 * descriptor and @head_id that contains the ID of the newest descriptor.
 *
 * When a new descriptor should be created (and the ring is full), the tail
 * descriptor is invalidated by first transitioning to the reusable state and
 * then invalidating all tail data blocks up to and including the data blocks
 * associated with the tail descriptor (for the text ring). Then
 * @tail_id is advanced, followed by advancing @head_id. And finally the
 * @state_var of the new descriptor is initialized to the new ID and reserved
 * state.
 *
 * The @tail_id can only be advanced if the new @tail_id would be in the
 * committed or reusable queried state. This makes it possible that a valid
 * sequence number of the tail is always available.
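 *
 * As a sketch of the @state_var encoding (illustrative only; the
 * authoritative definitions are the DESC_SV()/DESC_ID()/DESC_STATE()
 * helpers in printk_ringbuffer.h), the state lives in the topmost bits
 * of the unsigned long and the ID in the remaining lower bits::
 *
 *	state_val = DESC_SV(id, desc_reserved);	// combine ID and state
 *	id        = DESC_ID(state_val);		// extract the ID
 *	state     = DESC_STATE(state_val);	// extract the state
 *
 * Packing both values into one atomic variable is what allows a single
 * cmpxchg() to change a descriptor's state while simultaneously
 * verifying that the descriptor still has the expected ID.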
* * Descriptor Finalization * ~~~~~~~~~~~~~~~~~~~~~~~ * When a writer calls the commit function prb_commit(), record data is * fully stored and is consistent within the ringbuffer. However, a writer can * reopen that record, claiming exclusive access (as with prb_reserve()), and * modify that record. When finished, the writer must again commit the record. * * In order for a record to be made available to readers (and also become * recyclable for writers), it must be finalized. A finalized record cannot be * reopened and can never become "unfinalized". Record finalization can occur * in three different scenarios: * * 1) A writer can simultaneously commit and finalize its record by calling * prb_final_commit() instead of prb_commit(). * * 2) When a new record is reserved and the previous record has been * committed via prb_commit(), that previous record is automatically * finalized. * * 3) When a record is committed via prb_commit() and a newer record * already exists, the record being committed is automatically finalized. * * Data Ring * ~~~~~~~~~ * The text data ring is a byte array composed of data blocks. Data blocks are * referenced by blk_lpos structs that point to the logical position of the * beginning of a data block and the beginning of the next adjacent data * block. Logical positions are mapped directly to index values of the byte * array ringbuffer. * * Each data block consists of an ID followed by the writer data. The ID is * the identifier of a descriptor that is associated with the data block. A * given data block is considered valid if all of the following conditions * are met: * * 1) The descriptor associated with the data block is in the committed * or finalized queried state. * * 2) The blk_lpos struct within the descriptor associated with the data * block references back to the same data block. * * 3) The data block is within the head/tail logical position range. * * If the writer data of a data block would extend beyond the end of the * byte array, only the ID of the data block is stored at the logical * position and the full data block (ID and writer data) is stored at the * beginning of the byte array. The referencing blk_lpos will point to the * ID before the wrap and the next data block will be at the logical * position adjacent the full data block after the wrap. * * Data rings have a @tail_lpos that points to the beginning of the oldest * data block and a @head_lpos that points to the logical position of the * next (not yet existing) data block. * * When a new data block should be created (and the ring is full), tail data * blocks will first be invalidated by putting their associated descriptors * into the reusable state and then pushing the @tail_lpos forward beyond * them. Then the @head_lpos is pushed forward and is associated with a new * descriptor. If a data block is not valid, the @tail_lpos cannot be * advanced beyond it. * * Info Array * ~~~~~~~~~~ * The general meta data of printk records are stored in printk_info structs, * stored in an array with the same number of elements as the descriptor ring. * Each info corresponds to the descriptor of the same index in the * descriptor ring. Info validity is confirmed by evaluating the corresponding * descriptor before and after loading the info. * * Usage * ----- * Here are some simple examples demonstrating writers and readers. 
For the * examples a global ringbuffer (test_rb) is available (which is not the * actual ringbuffer used by printk):: * * DEFINE_PRINTKRB(test_rb, 15, 5); * * This ringbuffer allows up to 32768 records (2 ^ 15) and has a size of * 1 MiB (2 ^ (15 + 5)) for text data. * * Sample writer code:: * * const char *textstr = "message text"; * struct prb_reserved_entry e; * struct printk_record r; * * // specify how much to allocate * prb_rec_init_wr(&r, strlen(textstr) + 1); * * if (prb_reserve(&e, &test_rb, &r)) { * snprintf(r.text_buf, r.text_buf_size, "%s", textstr); * * r.info->text_len = strlen(textstr); * r.info->ts_nsec = local_clock(); * r.info->caller_id = printk_caller_id(); * * // commit and finalize the record * prb_final_commit(&e); * } * * Note that additional writer functions are available to extend a record * after it has been committed but not yet finalized. This can be done as * long as no new records have been reserved and the caller is the same. * * Sample writer code (record extending):: * * // alternate rest of previous example * * r.info->text_len = strlen(textstr); * r.info->ts_nsec = local_clock(); * r.info->caller_id = printk_caller_id(); * * // commit the record (but do not finalize yet) * prb_commit(&e); * } * * ... * * // specify additional 5 bytes text space to extend * prb_rec_init_wr(&r, 5); * * // try to extend, but only if it does not exceed 32 bytes * if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id(), 32)) { * snprintf(&r.text_buf[r.info->text_len], * r.text_buf_size - r.info->text_len, "hello"); * * r.info->text_len += 5; * * // commit and finalize the record * prb_final_commit(&e); * } * * Sample reader code:: * * struct printk_info info; * struct printk_record r; * char text_buf[32]; * u64 seq; * * prb_rec_init_rd(&r, &info, &text_buf[0], sizeof(text_buf)); * * prb_for_each_record(0, &test_rb, &seq, &r) { * if (info.seq != seq) * pr_warn("lost %llu records\n", info.seq - seq); * * if (info.text_len > r.text_buf_size) { * pr_warn("record %llu text truncated\n", info.seq); * text_buf[r.text_buf_size - 1] = 0; * } * * pr_info("%llu: %llu: %s\n", info.seq, info.ts_nsec, * &text_buf[0]); * } * * Note that additional less convenient reader functions are available to * allow complex record access. * * ABA Issues * ~~~~~~~~~~ * To help avoid ABA issues, descriptors are referenced by IDs (array index * values combined with tagged bits counting array wraps) and data blocks are * referenced by logical positions (array index values combined with tagged * bits counting array wraps). However, on 32-bit systems the number of * tagged bits is relatively small such that an ABA incident is (at least * theoretically) possible. For example, if 4 million maximally sized (1KiB) * printk messages were to occur in NMI context on a 32-bit system, the * interrupted context would not be able to recognize that the 32-bit integer * completely wrapped and thus represents a different data block than the one * the interrupted context expects. * * To help combat this possibility, additional state checking is performed * (such as using cmpxchg() even though set() would suffice). These extra * checks are commented as such and will hopefully catch any ABA issue that * a 32-bit system might experience. * * Memory Barriers * ~~~~~~~~~~~~~~~ * Multiple memory barriers are used. 
To simplify proving correctness and * generating litmus tests, lines of code related to memory barriers * (loads, stores, and the associated memory barriers) are labeled:: * * LMM(function:letter) * * Comments reference the labels using only the "function:letter" part. * * The memory barrier pairs and their ordering are: * * desc_reserve:D / desc_reserve:B * push descriptor tail (id), then push descriptor head (id) * * desc_reserve:D / data_push_tail:B * push data tail (lpos), then set new descriptor reserved (state) * * desc_reserve:D / desc_push_tail:C * push descriptor tail (id), then set new descriptor reserved (state) * * desc_reserve:D / prb_first_seq:C * push descriptor tail (id), then set new descriptor reserved (state) * * desc_reserve:F / desc_read:D * set new descriptor id and reserved (state), then allow writer changes * * data_alloc:A (or data_realloc:A) / desc_read:D * set old descriptor reusable (state), then modify new data block area * * data_alloc:A (or data_realloc:A) / data_push_tail:B * push data tail (lpos), then modify new data block area * * _prb_commit:B / desc_read:B * store writer changes, then set new descriptor committed (state) * * desc_reopen_last:A / _prb_commit:B * set descriptor reserved (state), then read descriptor data * * _prb_commit:B / desc_reserve:D * set new descriptor committed (state), then check descriptor head (id) * * data_push_tail:D / data_push_tail:A * set descriptor reusable (state), then push data tail (lpos) * * desc_push_tail:B / desc_reserve:D * set descriptor reusable (state), then push descriptor tail (id) * * desc_update_last_finalized:A / desc_last_finalized_seq:A * store finalized record, then set new highest finalized sequence number */ #define DATA_SIZE(data_ring) _DATA_SIZE((data_ring)->size_bits) #define DATA_SIZE_MASK(data_ring) (DATA_SIZE(data_ring) - 1) #define DESCS_COUNT(desc_ring) _DESCS_COUNT((desc_ring)->count_bits) #define DESCS_COUNT_MASK(desc_ring) (DESCS_COUNT(desc_ring) - 1) /* Determine the data array index from a logical position. */ #define DATA_INDEX(data_ring, lpos) ((lpos) & DATA_SIZE_MASK(data_ring)) /* Determine the desc array index from an ID or sequence number. */ #define DESC_INDEX(desc_ring, n) ((n) & DESCS_COUNT_MASK(desc_ring)) /* Determine how many times the data array has wrapped. */ #define DATA_WRAPS(data_ring, lpos) ((lpos) >> (data_ring)->size_bits) /* Determine if a logical position refers to a data-less block. */ #define LPOS_DATALESS(lpos) ((lpos) & 1UL) #define BLK_DATALESS(blk) (LPOS_DATALESS((blk)->begin) && \ LPOS_DATALESS((blk)->next)) /* Get the logical position at index 0 of the current wrap. */ #define DATA_THIS_WRAP_START_LPOS(data_ring, lpos) \ ((lpos) & ~DATA_SIZE_MASK(data_ring)) /* Get the ID for the same index of the previous wrap as the given ID. */ #define DESC_ID_PREV_WRAP(desc_ring, id) \ DESC_ID((id) - DESCS_COUNT(desc_ring)) /* * A data block: mapped directly to the beginning of the data block area * specified as a logical position within the data ring. * * @id: the ID of the associated descriptor * @data: the writer data * * Note that the size of a data block is only known by its associated * descriptor. */ struct prb_data_block { unsigned long id; char data[]; }; /* * Return the descriptor associated with @n. @n can be either a * descriptor ID or a sequence number. */ static struct prb_desc *to_desc(struct prb_desc_ring *desc_ring, u64 n) { return &desc_ring->descs[DESC_INDEX(desc_ring, n)]; } /* * Return the printk_info associated with @n. 
@n can be either a * descriptor ID or a sequence number. */ static struct printk_info *to_info(struct prb_desc_ring *desc_ring, u64 n) { return &desc_ring->infos[DESC_INDEX(desc_ring, n)]; } static struct prb_data_block *to_block(struct prb_data_ring *data_ring, unsigned long begin_lpos) { return (void *)&data_ring->data[DATA_INDEX(data_ring, begin_lpos)]; } /* * Increase the data size to account for data block meta data plus any * padding so that the adjacent data block is aligned on the ID size. */ static unsigned int to_blk_size(unsigned int size) { struct prb_data_block *db = NULL; size += sizeof(*db); size = ALIGN(size, sizeof(db->id)); return size; } /* * Sanity checker for reserve size. The ringbuffer code assumes that a data * block does not exceed the maximum possible size that could fit within the * ringbuffer. This function provides that basic size check so that the * assumption is safe. In particular, it guarantees that data_push_tail() will * never attempt to push the tail beyond the head. */ static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size) { /* Data-less blocks take no space. */ if (size == 0) return true; /* * If data blocks were allowed to be larger than half the data ring * size, a wrapping data block could require more space than the full * ringbuffer. */ return to_blk_size(size) <= DATA_SIZE(data_ring) / 2; } /* * Compare the current and requested logical position and decide * whether more space is needed. * * Return false when @lpos_current is already at or beyond @lpos_target. * * Also return false when the difference between the positions is bigger * than the size of the data buffer. It might happen only when the caller * raced with another CPU(s) which already made and used the space. */ static bool need_more_space(struct prb_data_ring *data_ring, unsigned long lpos_current, unsigned long lpos_target) { return lpos_target - lpos_current - 1 < DATA_SIZE(data_ring); } /* Query the state of a descriptor. */ static enum desc_state get_desc_state(unsigned long id, unsigned long state_val) { if (id != DESC_ID(state_val)) return desc_miss; return DESC_STATE(state_val); } /* * Get a copy of a specified descriptor and return its queried state. If the * descriptor is in an inconsistent state (miss or reserved), the caller can * only expect the descriptor's @state_var field to be valid. * * The sequence number and caller_id can be optionally retrieved. Like all * non-state_var data, they are only valid if the descriptor is in a * consistent state. */ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, unsigned long id, struct prb_desc *desc_out, u64 *seq_out, u32 *caller_id_out) { struct printk_info *info = to_info(desc_ring, id); struct prb_desc *desc = to_desc(desc_ring, id); atomic_long_t *state_var = &desc->state_var; enum desc_state d_state; unsigned long state_val; /* Check the descriptor state. */ state_val = atomic_long_read(state_var); /* LMM(desc_read:A) */ d_state = get_desc_state(id, state_val); if (d_state == desc_miss || d_state == desc_reserved) { /* * The descriptor is in an inconsistent state. Set at least * @state_var so that the caller can see the details of * the inconsistent state. */ goto out; } /* * Guarantee the state is loaded before copying the descriptor * content. This avoids copying obsolete descriptor content that might * not apply to the descriptor state. This pairs with _prb_commit:B. 
* * Memory barrier involvement: * * If desc_read:A reads from _prb_commit:B, then desc_read:C reads * from _prb_commit:A. * * Relies on: * * WMB from _prb_commit:A to _prb_commit:B * matching * RMB from desc_read:A to desc_read:C */ smp_rmb(); /* LMM(desc_read:B) */ /* * Copy the descriptor data. The data is not valid until the * state has been re-checked. A memcpy() for all of @desc * cannot be used because of the atomic_t @state_var field. */ if (desc_out) { memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos, sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */ } if (seq_out) *seq_out = info->seq; /* also part of desc_read:C */ if (caller_id_out) *caller_id_out = info->caller_id; /* also part of desc_read:C */ /* * 1. Guarantee the descriptor content is loaded before re-checking * the state. This avoids reading an obsolete descriptor state * that may not apply to the copied content. This pairs with * desc_reserve:F. * * Memory barrier involvement: * * If desc_read:C reads from desc_reserve:G, then desc_read:E * reads from desc_reserve:F. * * Relies on: * * WMB from desc_reserve:F to desc_reserve:G * matching * RMB from desc_read:C to desc_read:E * * 2. Guarantee the record data is loaded before re-checking the * state. This avoids reading an obsolete descriptor state that may * not apply to the copied data. This pairs with data_alloc:A and * data_realloc:A. * * Memory barrier involvement: * * If copy_data:A reads from data_alloc:B, then desc_read:E * reads from desc_make_reusable:A. * * Relies on: * * MB from desc_make_reusable:A to data_alloc:B * matching * RMB from desc_read:C to desc_read:E * * Note: desc_make_reusable:A and data_alloc:B can be different * CPUs. However, the data_alloc:B CPU (which performs the * full memory barrier) must have previously seen * desc_make_reusable:A. */ smp_rmb(); /* LMM(desc_read:D) */ /* * The data has been copied. Return the current descriptor state, * which may have changed since the load above. */ state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */ d_state = get_desc_state(id, state_val); out: if (desc_out) atomic_long_set(&desc_out->state_var, state_val); return d_state; } /* * Take a specified descriptor out of the finalized state by attempting * the transition from finalized to reusable. Either this context or some * other context will have been successful. */ static void desc_make_reusable(struct prb_desc_ring *desc_ring, unsigned long id) { unsigned long val_finalized = DESC_SV(id, desc_finalized); unsigned long val_reusable = DESC_SV(id, desc_reusable); struct prb_desc *desc = to_desc(desc_ring, id); atomic_long_t *state_var = &desc->state_var; atomic_long_cmpxchg_relaxed(state_var, val_finalized, val_reusable); /* LMM(desc_make_reusable:A) */ } /* * Given the text data ring, put the associated descriptor of each * data block from @lpos_begin until @lpos_end into the reusable state. * * If there is any problem making the associated descriptor reusable, either * the descriptor has not yet been finalized or another writer context has * already pushed the tail lpos past the problematic data block. Regardless, * on error the caller can re-load the tail lpos to determine the situation. 
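 *
 * A sketch of the walk (purely descriptive): starting at @lpos_begin,
 * each iteration reads the block ID stored at that position, looks up
 * the owning descriptor and, if that descriptor is finalized (or
 * already reusable) and still points back at @lpos_begin, makes it
 * reusable and advances @lpos_begin to the descriptor's
 * text_blk_lpos.next; anything else fails the call. The loop ends once
 * @lpos_begin has reached (or passed) @lpos_end.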
*/ static bool data_make_reusable(struct printk_ringbuffer *rb, unsigned long lpos_begin, unsigned long lpos_end, unsigned long *lpos_out) { struct prb_data_ring *data_ring = &rb->text_data_ring; struct prb_desc_ring *desc_ring = &rb->desc_ring; struct prb_data_block *blk; enum desc_state d_state; struct prb_desc desc; struct prb_data_blk_lpos *blk_lpos = &desc.text_blk_lpos; unsigned long id; /* Loop until @lpos_begin has advanced to or beyond @lpos_end. */ while (need_more_space(data_ring, lpos_begin, lpos_end)) { blk = to_block(data_ring, lpos_begin); /* * Load the block ID from the data block. This is a data race * against a writer that may have newly reserved this data * area. If the loaded value matches a valid descriptor ID, * the blk_lpos of that descriptor will be checked to make * sure it points back to this data block. If the check fails, * the data area has been recycled by another writer. */ id = blk->id; /* LMM(data_make_reusable:A) */ d_state = desc_read(desc_ring, id, &desc, NULL, NULL); /* LMM(data_make_reusable:B) */ switch (d_state) { case desc_miss: case desc_reserved: case desc_committed: return false; case desc_finalized: /* * This data block is invalid if the descriptor * does not point back to it. */ if (blk_lpos->begin != lpos_begin) return false; desc_make_reusable(desc_ring, id); break; case desc_reusable: /* * This data block is invalid if the descriptor * does not point back to it. */ if (blk_lpos->begin != lpos_begin) return false; break; } /* Advance @lpos_begin to the next data block. */ lpos_begin = blk_lpos->next; } *lpos_out = lpos_begin; return true; } /* * Advance the data ring tail to at least @lpos. This function puts * descriptors into the reusable state if the tail is pushed beyond * their associated data block. */ static bool data_push_tail(struct printk_ringbuffer *rb, unsigned long lpos) { struct prb_data_ring *data_ring = &rb->text_data_ring; unsigned long tail_lpos_new; unsigned long tail_lpos; unsigned long next_lpos; /* If @lpos is from a data-less block, there is nothing to do. */ if (LPOS_DATALESS(lpos)) return true; /* * Any descriptor states that have transitioned to reusable due to the * data tail being pushed to this loaded value will be visible to this * CPU. This pairs with data_push_tail:D. * * Memory barrier involvement: * * If data_push_tail:A reads from data_push_tail:D, then this CPU can * see desc_make_reusable:A. * * Relies on: * * MB from desc_make_reusable:A to data_push_tail:D * matches * READFROM from data_push_tail:D to data_push_tail:A * thus * READFROM from desc_make_reusable:A to this CPU */ tail_lpos = atomic_long_read(&data_ring->tail_lpos); /* LMM(data_push_tail:A) */ /* * Loop until the tail lpos is at or beyond @lpos. This condition * may already be satisfied, resulting in no full memory barrier * from data_push_tail:D being performed. However, since this CPU * sees the new tail lpos, any descriptor states that transitioned to * the reusable state must already be visible. */ while (need_more_space(data_ring, tail_lpos, lpos)) { /* * Make all descriptors reusable that are associated with * data blocks before @lpos. */ if (!data_make_reusable(rb, tail_lpos, lpos, &next_lpos)) { /* * 1. Guarantee the block ID loaded in * data_make_reusable() is performed before * reloading the tail lpos. The failed * data_make_reusable() may be due to a newly * recycled data area causing the tail lpos to * have been previously pushed. This pairs with * data_alloc:A and data_realloc:A. 
* * Memory barrier involvement: * * If data_make_reusable:A reads from data_alloc:B, * then data_push_tail:C reads from * data_push_tail:D. * * Relies on: * * MB from data_push_tail:D to data_alloc:B * matching * RMB from data_make_reusable:A to * data_push_tail:C * * Note: data_push_tail:D and data_alloc:B can be * different CPUs. However, the data_alloc:B * CPU (which performs the full memory * barrier) must have previously seen * data_push_tail:D. * * 2. Guarantee the descriptor state loaded in * data_make_reusable() is performed before * reloading the tail lpos. The failed * data_make_reusable() may be due to a newly * recycled descriptor causing the tail lpos to * have been previously pushed. This pairs with * desc_reserve:D. * * Memory barrier involvement: * * If data_make_reusable:B reads from * desc_reserve:F, then data_push_tail:C reads * from data_push_tail:D. * * Relies on: * * MB from data_push_tail:D to desc_reserve:F * matching * RMB from data_make_reusable:B to * data_push_tail:C * * Note: data_push_tail:D and desc_reserve:F can * be different CPUs. However, the * desc_reserve:F CPU (which performs the * full memory barrier) must have previously * seen data_push_tail:D. */ smp_rmb(); /* LMM(data_push_tail:B) */ tail_lpos_new = atomic_long_read(&data_ring->tail_lpos ); /* LMM(data_push_tail:C) */ if (tail_lpos_new == tail_lpos) return false; /* Another CPU pushed the tail. Try again. */ tail_lpos = tail_lpos_new; continue; } /* * Guarantee any descriptor states that have transitioned to * reusable are stored before pushing the tail lpos. A full * memory barrier is needed since other CPUs may have made * the descriptor states reusable. This pairs with * data_push_tail:A. */ if (atomic_long_try_cmpxchg(&data_ring->tail_lpos, &tail_lpos, next_lpos)) { /* LMM(data_push_tail:D) */ break; } } return true; } /* * Advance the desc ring tail. This function advances the tail by one * descriptor, thus invalidating the oldest descriptor. Before advancing * the tail, the tail descriptor is made reusable and all data blocks up to * and including the descriptor's data block are invalidated (i.e. the data * ring tail is pushed past the data block of the descriptor being made * reusable). */ static bool desc_push_tail(struct printk_ringbuffer *rb, unsigned long tail_id) { struct prb_desc_ring *desc_ring = &rb->desc_ring; enum desc_state d_state; struct prb_desc desc; d_state = desc_read(desc_ring, tail_id, &desc, NULL, NULL); switch (d_state) { case desc_miss: /* * If the ID is exactly 1 wrap behind the expected, it is * in the process of being reserved by another writer and * must be considered reserved. */ if (DESC_ID(atomic_long_read(&desc.state_var)) == DESC_ID_PREV_WRAP(desc_ring, tail_id)) { return false; } /* * The ID has changed. Another writer must have pushed the * tail and recycled the descriptor already. Success is * returned because the caller is only interested in the * specified tail being pushed, which it was. */ return true; case desc_reserved: case desc_committed: return false; case desc_finalized: desc_make_reusable(desc_ring, tail_id); break; case desc_reusable: break; } /* * Data blocks must be invalidated before their associated * descriptor can be made available for recycling. Invalidating * them later is not possible because there is no way to trust * data blocks once their associated descriptor is gone. 
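 *
 * In other words, the invalidation order within this function is:
 * transition the tail descriptor to reusable, push the data tail past
 * its data block, and only then try to advance @tail_id below.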
*/ if (!data_push_tail(rb, desc.text_blk_lpos.next)) return false; /* * Check the next descriptor after @tail_id before pushing the tail * to it because the tail must always be in a finalized or reusable * state. The implementation of prb_first_seq() relies on this. * * A successful read implies that the next descriptor is less than or * equal to @head_id so there is no risk of pushing the tail past the * head. */ d_state = desc_read(desc_ring, DESC_ID(tail_id + 1), &desc, NULL, NULL); /* LMM(desc_push_tail:A) */ if (d_state == desc_finalized || d_state == desc_reusable) { /* * Guarantee any descriptor states that have transitioned to * reusable are stored before pushing the tail ID. This allows * verifying the recycled descriptor state. A full memory * barrier is needed since other CPUs may have made the * descriptor states reusable. This pairs with desc_reserve:D. */ atomic_long_cmpxchg(&desc_ring->tail_id, tail_id, DESC_ID(tail_id + 1)); /* LMM(desc_push_tail:B) */ } else { /* * Guarantee the last state load from desc_read() is before * reloading @tail_id in order to see a new tail ID in the * case that the descriptor has been recycled. This pairs * with desc_reserve:D. * * Memory barrier involvement: * * If desc_push_tail:A reads from desc_reserve:F, then * desc_push_tail:D reads from desc_push_tail:B. * * Relies on: * * MB from desc_push_tail:B to desc_reserve:F * matching * RMB from desc_push_tail:A to desc_push_tail:D * * Note: desc_push_tail:B and desc_reserve:F can be different * CPUs. However, the desc_reserve:F CPU (which performs * the full memory barrier) must have previously seen * desc_push_tail:B. */ smp_rmb(); /* LMM(desc_push_tail:C) */ /* * Re-check the tail ID. The descriptor following @tail_id is * not in an allowed tail state. But if the tail has since * been moved by another CPU, then it does not matter. */ if (atomic_long_read(&desc_ring->tail_id) == tail_id) /* LMM(desc_push_tail:D) */ return false; } return true; } /* Reserve a new descriptor, invalidating the oldest if necessary. */ static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out) { struct prb_desc_ring *desc_ring = &rb->desc_ring; unsigned long prev_state_val; unsigned long id_prev_wrap; struct prb_desc *desc; unsigned long head_id; unsigned long id; head_id = atomic_long_read(&desc_ring->head_id); /* LMM(desc_reserve:A) */ do { id = DESC_ID(head_id + 1); id_prev_wrap = DESC_ID_PREV_WRAP(desc_ring, id); /* * Guarantee the head ID is read before reading the tail ID. * Since the tail ID is updated before the head ID, this * guarantees that @id_prev_wrap is never ahead of the tail * ID. This pairs with desc_reserve:D. * * Memory barrier involvement: * * If desc_reserve:A reads from desc_reserve:D, then * desc_reserve:C reads from desc_push_tail:B. * * Relies on: * * MB from desc_push_tail:B to desc_reserve:D * matching * RMB from desc_reserve:A to desc_reserve:C * * Note: desc_push_tail:B and desc_reserve:D can be different * CPUs. However, the desc_reserve:D CPU (which performs * the full memory barrier) must have previously seen * desc_push_tail:B. */ smp_rmb(); /* LMM(desc_reserve:B) */ if (id_prev_wrap == atomic_long_read(&desc_ring->tail_id )) { /* LMM(desc_reserve:C) */ /* * Make space for the new descriptor by * advancing the tail. */ if (!desc_push_tail(rb, id_prev_wrap)) return false; } /* * 1. Guarantee the tail ID is read before validating the * recycled descriptor state. A read memory barrier is * sufficient for this. This pairs with desc_push_tail:B. 
* * Memory barrier involvement: * * If desc_reserve:C reads from desc_push_tail:B, then * desc_reserve:E reads from desc_make_reusable:A. * * Relies on: * * MB from desc_make_reusable:A to desc_push_tail:B * matching * RMB from desc_reserve:C to desc_reserve:E * * Note: desc_make_reusable:A and desc_push_tail:B can be * different CPUs. However, the desc_push_tail:B CPU * (which performs the full memory barrier) must have * previously seen desc_make_reusable:A. * * 2. Guarantee the tail ID is stored before storing the head * ID. This pairs with desc_reserve:B. * * 3. Guarantee any data ring tail changes are stored before * recycling the descriptor. Data ring tail changes can * happen via desc_push_tail()->data_push_tail(). A full * memory barrier is needed since another CPU may have * pushed the data ring tails. This pairs with * data_push_tail:B. * * 4. Guarantee a new tail ID is stored before recycling the * descriptor. A full memory barrier is needed since * another CPU may have pushed the tail ID. This pairs * with desc_push_tail:C and this also pairs with * prb_first_seq:C. * * 5. Guarantee the head ID is stored before trying to * finalize the previous descriptor. This pairs with * _prb_commit:B. */ } while (!atomic_long_try_cmpxchg(&desc_ring->head_id, &head_id, id)); /* LMM(desc_reserve:D) */ desc = to_desc(desc_ring, id); /* * If the descriptor has been recycled, verify the old state val. * See "ABA Issues" about why this verification is performed. */ prev_state_val = atomic_long_read(&desc->state_var); /* LMM(desc_reserve:E) */ if (prev_state_val && get_desc_state(id_prev_wrap, prev_state_val) != desc_reusable) { WARN_ON_ONCE(1); return false; } /* * Assign the descriptor a new ID and set its state to reserved. * See "ABA Issues" about why cmpxchg() instead of set() is used. * * Guarantee the new descriptor ID and state is stored before making * any other changes. A write memory barrier is sufficient for this. * This pairs with desc_read:D. */ if (!atomic_long_try_cmpxchg(&desc->state_var, &prev_state_val, DESC_SV(id, desc_reserved))) { /* LMM(desc_reserve:F) */ WARN_ON_ONCE(1); return false; } /* Now data in @desc can be modified: LMM(desc_reserve:G) */ *id_out = id; return true; } static bool is_blk_wrapped(struct prb_data_ring *data_ring, unsigned long begin_lpos, unsigned long next_lpos) { /* * Subtract one from next_lpos since it's not actually part of this data * block. This allows perfectly fitting records to not wrap. */ return DATA_WRAPS(data_ring, begin_lpos) != DATA_WRAPS(data_ring, next_lpos - 1); } /* Determine the end of a data block. */ static unsigned long get_next_lpos(struct prb_data_ring *data_ring, unsigned long lpos, unsigned int size) { unsigned long begin_lpos; unsigned long next_lpos; begin_lpos = lpos; next_lpos = lpos + size; /* First check if the data block does not wrap. */ if (!is_blk_wrapped(data_ring, begin_lpos, next_lpos)) return next_lpos; /* Wrapping data blocks store their data at the beginning. */ return (DATA_THIS_WRAP_START_LPOS(data_ring, next_lpos) + size); } /* * Allocate a new data block, invalidating the oldest data block(s) * if necessary. This function also associates the data block with * a specified descriptor. 
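 *
 * A worked example of the lpos arithmetic involved (sizes made up for
 * illustration): with DATA_SIZE() == 1024, a head lpos of 1000 and a
 * 100-byte block, the block cannot fit before the wrap, so
 * get_next_lpos() returns 1124 (the start of the next wrap, 1024, plus
 * the block size). The block ID is then written at index 1000 and the
 * full data block at index 0; the recorded blk_lpos has begin == 1000
 * and next == 1124, whose wrap counts differ by one, which is how
 * readers later recognize the wrapped layout.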
*/ static char *data_alloc(struct printk_ringbuffer *rb, unsigned int size, struct prb_data_blk_lpos *blk_lpos, unsigned long id) { struct prb_data_ring *data_ring = &rb->text_data_ring; struct prb_data_block *blk; unsigned long begin_lpos; unsigned long next_lpos; if (size == 0) { /* * Data blocks are not created for empty lines. Instead, the * reader will recognize these special lpos values and handle * it appropriately. */ blk_lpos->begin = EMPTY_LINE_LPOS; blk_lpos->next = EMPTY_LINE_LPOS; return NULL; } size = to_blk_size(size); begin_lpos = atomic_long_read(&data_ring->head_lpos); do { next_lpos = get_next_lpos(data_ring, begin_lpos, size); /* * data_check_size() prevents data block allocation that could * cause illegal ringbuffer states. But double check that the * used space will not be bigger than the ring buffer. Wrapped * messages need to reserve more space, see get_next_lpos(). * * Specify a data-less block when the check or the allocation * fails. */ if (WARN_ON_ONCE(next_lpos - begin_lpos > DATA_SIZE(data_ring)) || !data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) { blk_lpos->begin = FAILED_LPOS; blk_lpos->next = FAILED_LPOS; return NULL; } /* * 1. Guarantee any descriptor states that have transitioned * to reusable are stored before modifying the newly * allocated data area. A full memory barrier is needed * since other CPUs may have made the descriptor states * reusable. See data_push_tail:A about why the reusable * states are visible. This pairs with desc_read:D. * * 2. Guarantee any updated tail lpos is stored before * modifying the newly allocated data area. Another CPU may * be in data_make_reusable() and is reading a block ID * from this area. data_make_reusable() can handle reading * a garbage block ID value, but then it must be able to * load a new tail lpos. A full memory barrier is needed * since other CPUs may have updated the tail lpos. This * pairs with data_push_tail:B. */ } while (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &begin_lpos, next_lpos)); /* LMM(data_alloc:A) */ blk = to_block(data_ring, begin_lpos); blk->id = id; /* LMM(data_alloc:B) */ if (is_blk_wrapped(data_ring, begin_lpos, next_lpos)) { /* Wrapping data blocks store their data at the beginning. */ blk = to_block(data_ring, 0); /* * Store the ID on the wrapped block for consistency. * The printk_ringbuffer does not actually use it. */ blk->id = id; } blk_lpos->begin = begin_lpos; blk_lpos->next = next_lpos; return &blk->data[0]; } /* * Try to resize an existing data block associated with the descriptor * specified by @id. If the resized data block should become wrapped, it * copies the old data to the new data block. If @size yields a data block * with the same or less size, the data block is left as is. * * Fail if this is not the last allocated data block or if there is not * enough space or it is not possible make enough space. * * Return a pointer to the beginning of the entire data buffer or NULL on * failure. */ static char *data_realloc(struct printk_ringbuffer *rb, unsigned int size, struct prb_data_blk_lpos *blk_lpos, unsigned long id) { struct prb_data_ring *data_ring = &rb->text_data_ring; struct prb_data_block *blk; unsigned long head_lpos; unsigned long next_lpos; bool wrapped; /* Reallocation only works if @blk_lpos is the newest data block. */ head_lpos = atomic_long_read(&data_ring->head_lpos); if (head_lpos != blk_lpos->next) return NULL; /* Keep track if @blk_lpos was a wrapping data block. 
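	 * This matters further below: if the reallocation turns a
	 * previously non-wrapping block into a wrapping one, the bytes
	 * already written must be copied from their old location to the
	 * beginning of the data array.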
*/ wrapped = is_blk_wrapped(data_ring, blk_lpos->begin, blk_lpos->next); size = to_blk_size(size); next_lpos = get_next_lpos(data_ring, blk_lpos->begin, size); /* * Use the current data block when the size does not increase, i.e. * when @head_lpos is already able to accommodate the new @next_lpos. * * Note that need_more_space() could never return false here because * the difference between the positions was bigger than the data * buffer size. The data block is reopened and can't get reused. */ if (!need_more_space(data_ring, head_lpos, next_lpos)) { if (wrapped) blk = to_block(data_ring, 0); else blk = to_block(data_ring, blk_lpos->begin); return &blk->data[0]; } /* * data_check_size() prevents data block reallocation that could * cause illegal ringbuffer states. But double check that the * new used space will not be bigger than the ring buffer. Wrapped * messages need to reserve more space, see get_next_lpos(). * * Specify failure when the check or the allocation fails. */ if (WARN_ON_ONCE(next_lpos - blk_lpos->begin > DATA_SIZE(data_ring)) || !data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) { return NULL; } /* The memory barrier involvement is the same as data_alloc:A. */ if (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &head_lpos, next_lpos)) { /* LMM(data_realloc:A) */ return NULL; } blk = to_block(data_ring, blk_lpos->begin); if (is_blk_wrapped(data_ring, blk_lpos->begin, next_lpos)) { struct prb_data_block *old_blk = blk; /* Wrapping data blocks store their data at the beginning. */ blk = to_block(data_ring, 0); /* * Store the ID on the wrapped block for consistency. * The printk_ringbuffer does not actually use it. */ blk->id = id; if (!wrapped) { /* * Since the allocated space is now in the newly * created wrapping data block, copy the content * from the old data block. */ memcpy(&blk->data[0], &old_blk->data[0], (blk_lpos->next - blk_lpos->begin) - sizeof(blk->id)); } } blk_lpos->next = next_lpos; return &blk->data[0]; } /* Return the number of bytes used by a data block. */ static unsigned int space_used(struct prb_data_ring *data_ring, struct prb_data_blk_lpos *blk_lpos) { /* Data-less blocks take no space. */ if (BLK_DATALESS(blk_lpos)) return 0; if (!is_blk_wrapped(data_ring, blk_lpos->begin, blk_lpos->next)) { /* Data block does not wrap. */ return (DATA_INDEX(data_ring, blk_lpos->next) - DATA_INDEX(data_ring, blk_lpos->begin)); } /* * For wrapping data blocks, the trailing (wasted) space is * also counted. */ return (DATA_INDEX(data_ring, blk_lpos->next) + DATA_SIZE(data_ring) - DATA_INDEX(data_ring, blk_lpos->begin)); } /* * Given @blk_lpos, return a pointer to the writer data from the data block * and calculate the size of the data part. A NULL pointer is returned if * @blk_lpos specifies values that could never be legal. * * This function (used by readers) performs strict validation on the lpos * values to possibly detect bugs in the writer code. A WARN_ON_ONCE() is * triggered if an internal error is detected. */ static const char *get_data(struct prb_data_ring *data_ring, struct prb_data_blk_lpos *blk_lpos, unsigned int *data_size) { struct prb_data_block *db; /* Data-less data block description. */ if (BLK_DATALESS(blk_lpos)) { /* * Records that are just empty lines are also valid, even * though they do not have a data block. For such records * explicitly return empty string data to signify success. */ if (blk_lpos->begin == EMPTY_LINE_LPOS && blk_lpos->next == EMPTY_LINE_LPOS) { *data_size = 0; return ""; } /* Data lost, invalid, or otherwise unavailable. 
*/ return NULL; } /* Regular data block: @begin and @next in the same wrap. */ if (!is_blk_wrapped(data_ring, blk_lpos->begin, blk_lpos->next)) { db = to_block(data_ring, blk_lpos->begin); *data_size = blk_lpos->next - blk_lpos->begin; /* Wrapping data block: @begin is one wrap behind @next. */ } else if (!is_blk_wrapped(data_ring, blk_lpos->begin + DATA_SIZE(data_ring), blk_lpos->next)) { db = to_block(data_ring, 0); *data_size = DATA_INDEX(data_ring, blk_lpos->next); /* Illegal block description. */ } else { WARN_ON_ONCE(1); return NULL; } /* Sanity check. Data-less blocks were handled earlier. */ if (WARN_ON_ONCE(!data_check_size(data_ring, *data_size) || !*data_size)) return NULL; /* A valid data block will always be aligned to the ID size. */ if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))) || WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id)))) { return NULL; } /* A valid data block will always have at least an ID. */ if (WARN_ON_ONCE(*data_size < sizeof(db->id))) return NULL; /* Subtract block ID space from size to reflect data size. */ *data_size -= sizeof(db->id); return &db->data[0]; } /* * Attempt to transition the newest descriptor from committed back to reserved * so that the record can be modified by a writer again. This is only possible * if the descriptor is not yet finalized and the provided @caller_id matches. */ static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, u32 caller_id, unsigned long *id_out) { unsigned long prev_state_val; enum desc_state d_state; struct prb_desc desc; struct prb_desc *d; unsigned long id; u32 cid; id = atomic_long_read(&desc_ring->head_id); /* * To reduce unnecessarily reopening, first check if the descriptor * state and caller ID are correct. */ d_state = desc_read(desc_ring, id, &desc, NULL, &cid); if (d_state != desc_committed || cid != caller_id) return NULL; d = to_desc(desc_ring, id); prev_state_val = DESC_SV(id, desc_committed); /* * Guarantee the reserved state is stored before reading any * record data. A full memory barrier is needed because @state_var * modification is followed by reading. This pairs with _prb_commit:B. * * Memory barrier involvement: * * If desc_reopen_last:A reads from _prb_commit:B, then * prb_reserve_in_last:A reads from _prb_commit:A. * * Relies on: * * WMB from _prb_commit:A to _prb_commit:B * matching * MB If desc_reopen_last:A to prb_reserve_in_last:A */ if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, DESC_SV(id, desc_reserved))) { /* LMM(desc_reopen_last:A) */ return NULL; } *id_out = id; return d; } /** * prb_reserve_in_last() - Re-reserve and extend the space in the ringbuffer * used by the newest record. * * @e: The entry structure to setup. * @rb: The ringbuffer to re-reserve and extend data in. * @r: The record structure to allocate buffers for. * @caller_id: The caller ID of the caller (reserving writer). * @max_size: Fail if the extended size would be greater than this. * * This is the public function available to writers to re-reserve and extend * data. * * The writer specifies the text size to extend (not the new total size) by * setting the @text_buf_size field of @r. To ensure proper initialization * of @r, prb_rec_init_wr() should be used. * * This function will fail if @caller_id does not match the caller ID of the * newest record. In that case the caller must reserve new data using * prb_reserve(). * * Context: Any context. Disables local interrupts on success. 
* Return: true if text data could be extended, otherwise false. * * On success: * * - @r->text_buf points to the beginning of the entire text buffer. * * - @r->text_buf_size is set to the new total size of the buffer. * * - @r->info is not touched so that @r->info->text_len could be used * to append the text. * * - prb_record_text_space() can be used on @e to query the new * actually used space. * * Important: All @r->info fields will already be set with the current values * for the record. I.e. @r->info->text_len will be less than * @text_buf_size. Writers can use @r->info->text_len to know * where concatenation begins and writers should update * @r->info->text_len after concatenating. */ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct printk_record *r, u32 caller_id, unsigned int max_size) { struct prb_desc_ring *desc_ring = &rb->desc_ring; struct printk_info *info; unsigned int data_size; struct prb_desc *d; unsigned long id; local_irq_save(e->irqflags); /* Transition the newest descriptor back to the reserved state. */ d = desc_reopen_last(desc_ring, caller_id, &id); if (!d) { local_irq_restore(e->irqflags); goto fail_reopen; } /* Now the writer has exclusive access: LMM(prb_reserve_in_last:A) */ info = to_info(desc_ring, id); /* * Set the @e fields here so that prb_commit() can be used if * anything fails from now on. */ e->rb = rb; e->id = id; /* * desc_reopen_last() checked the caller_id, but there was no * exclusive access at that point. The descriptor may have * changed since then. */ if (caller_id != info->caller_id) goto fail; if (BLK_DATALESS(&d->text_blk_lpos)) { if (WARN_ON_ONCE(info->text_len != 0)) { pr_warn_once("wrong text_len value (%hu, expecting 0)\n", info->text_len); info->text_len = 0; } if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) goto fail; if (r->text_buf_size > max_size) goto fail; r->text_buf = data_alloc(rb, r->text_buf_size, &d->text_blk_lpos, id); } else { if (!get_data(&rb->text_data_ring, &d->text_blk_lpos, &data_size)) goto fail; /* * Increase the buffer size to include the original size. If * the meta data (@text_len) is not sane, use the full data * block size. */ if (WARN_ON_ONCE(info->text_len > data_size)) { pr_warn_once("wrong text_len value (%hu, expecting <=%u)\n", info->text_len, data_size); info->text_len = data_size; } r->text_buf_size += info->text_len; if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) goto fail; if (r->text_buf_size > max_size) goto fail; r->text_buf = data_realloc(rb, r->text_buf_size, &d->text_blk_lpos, id); } if (r->text_buf_size && !r->text_buf) goto fail; r->info = info; e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); return true; fail: prb_commit(e); /* prb_commit() re-enabled interrupts. */ fail_reopen: /* Make it clear to the caller that the re-reserve failed. */ memset(r, 0, sizeof(*r)); return false; } /* * @last_finalized_seq value guarantees that all records up to and including * this sequence number are finalized and can be read. The only exception are * too old records which have already been overwritten. * * It is also guaranteed that @last_finalized_seq only increases. * * Be aware that finalized records following non-finalized records are not * reported because they are not yet available to the reader. For example, * a new record stored via printk() will not be available to a printer if * it follows a record that has not been finalized yet. 
However, once that * non-finalized record becomes finalized, @last_finalized_seq will be * appropriately updated and the full set of finalized records will be * available to the printer. And since each printk() caller will either * directly print or trigger deferred printing of all available unprinted * records, all printk() messages will get printed. */ static u64 desc_last_finalized_seq(struct printk_ringbuffer *rb) { struct prb_desc_ring *desc_ring = &rb->desc_ring; unsigned long ulseq; /* * Guarantee the sequence number is loaded before loading the * associated record in order to guarantee that the record can be * seen by this CPU. This pairs with desc_update_last_finalized:A. */ ulseq = atomic_long_read_acquire(&desc_ring->last_finalized_seq ); /* LMM(desc_last_finalized_seq:A) */ return __ulseq_to_u64seq(rb, ulseq); } static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, struct printk_record *r, unsigned int *line_count); /* * Check if there are records directly following @last_finalized_seq that are * finalized. If so, update @last_finalized_seq to the latest of these * records. It is not allowed to skip over records that are not yet finalized. */ static void desc_update_last_finalized(struct printk_ringbuffer *rb) { struct prb_desc_ring *desc_ring = &rb->desc_ring; u64 old_seq = desc_last_finalized_seq(rb); unsigned long oldval; unsigned long newval; u64 finalized_seq; u64 try_seq; try_again: finalized_seq = old_seq; try_seq = finalized_seq + 1; /* Try to find later finalized records. */ while (_prb_read_valid(rb, &try_seq, NULL, NULL)) { finalized_seq = try_seq; try_seq++; } /* No update needed if no later finalized record was found. */ if (finalized_seq == old_seq) return; oldval = __u64seq_to_ulseq(old_seq); newval = __u64seq_to_ulseq(finalized_seq); /* * Set the sequence number of a later finalized record that has been * seen. * * Guarantee the record data is visible to other CPUs before storing * its sequence number. This pairs with desc_last_finalized_seq:A. * * Memory barrier involvement: * * If desc_last_finalized_seq:A reads from * desc_update_last_finalized:A, then desc_read:A reads from * _prb_commit:B. * * Relies on: * * RELEASE from _prb_commit:B to desc_update_last_finalized:A * matching * ACQUIRE from desc_last_finalized_seq:A to desc_read:A * * Note: _prb_commit:B and desc_update_last_finalized:A can be * different CPUs. However, the desc_update_last_finalized:A * CPU (which performs the release) must have previously seen * _prb_commit:B. */ if (!atomic_long_try_cmpxchg_release(&desc_ring->last_finalized_seq, &oldval, newval)) { /* LMM(desc_update_last_finalized:A) */ old_seq = __ulseq_to_u64seq(rb, oldval); goto try_again; } } /* * Attempt to finalize a specified descriptor. If this fails, the descriptor * is either already final or it will finalize itself when the writer commits. */ static void desc_make_final(struct printk_ringbuffer *rb, unsigned long id) { struct prb_desc_ring *desc_ring = &rb->desc_ring; unsigned long prev_state_val = DESC_SV(id, desc_committed); struct prb_desc *d = to_desc(desc_ring, id); if (atomic_long_try_cmpxchg_relaxed(&d->state_var, &prev_state_val, DESC_SV(id, desc_finalized))) { /* LMM(desc_make_final:A) */ desc_update_last_finalized(rb); } } /** * prb_reserve() - Reserve space in the ringbuffer. * * @e: The entry structure to setup. * @rb: The ringbuffer to reserve data in. * @r: The record structure to allocate buffers for. * * This is the public function available to writers to reserve data. 
* * The writer specifies the text size to reserve by setting the * @text_buf_size field of @r. To ensure proper initialization of @r, * prb_rec_init_wr() should be used. * * Context: Any context. Disables local interrupts on success. * Return: true if at least text data could be allocated, otherwise false. * * On success, the fields @info and @text_buf of @r will be set by this * function and should be filled in by the writer before committing. Also * on success, prb_record_text_space() can be used on @e to query the actual * space used for the text data block. * * Important: @info->text_len needs to be set correctly by the writer in * order for data to be readable and/or extended. Its value * is initialized to 0. */ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct printk_record *r) { struct prb_desc_ring *desc_ring = &rb->desc_ring; struct printk_info *info; struct prb_desc *d; unsigned long id; u64 seq; if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) goto fail; /* * Descriptors in the reserved state act as blockers to all further * reservations once the desc_ring has fully wrapped. Disable * interrupts during the reserve/commit window in order to minimize * the likelihood of this happening. */ local_irq_save(e->irqflags); if (!desc_reserve(rb, &id)) { /* Descriptor reservation failures are tracked. */ atomic_long_inc(&rb->fail); local_irq_restore(e->irqflags); goto fail; } d = to_desc(desc_ring, id); info = to_info(desc_ring, id); /* * All @info fields (except @seq) are cleared and must be filled in * by the writer. Save @seq before clearing because it is used to * determine the new sequence number. */ seq = info->seq; memset(info, 0, sizeof(*info)); /* * Set the @e fields here so that prb_commit() can be used if * text data allocation fails. */ e->rb = rb; e->id = id; /* * Initialize the sequence number if it has "never been set". * Otherwise just increment it by a full wrap. * * @seq is considered "never been set" if it has a value of 0, * _except_ for @infos[0], which was specially setup by the ringbuffer * initializer and therefore is always considered as set. * * See the "Bootstrap" comment block in printk_ringbuffer.h for * details about how the initializer bootstraps the descriptors. */ if (seq == 0 && DESC_INDEX(desc_ring, id) != 0) info->seq = DESC_INDEX(desc_ring, id); else info->seq = seq + DESCS_COUNT(desc_ring); /* * New data is about to be reserved. Once that happens, previous * descriptors are no longer able to be extended. Finalize the * previous descriptor now so that it can be made available to * readers. (For seq==0 there is no previous descriptor.) */ if (info->seq > 0) desc_make_final(rb, DESC_ID(id - 1)); r->text_buf = data_alloc(rb, r->text_buf_size, &d->text_blk_lpos, id); /* If text data allocation fails, a data-less record is committed. */ if (r->text_buf_size && !r->text_buf) { prb_commit(e); /* prb_commit() re-enabled interrupts. */ goto fail; } r->info = info; /* Record full text space used by record. */ e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); return true; fail: /* Make it clear to the caller that the reserve failed. */ memset(r, 0, sizeof(*r)); return false; } EXPORT_SYMBOL_IF_KUNIT(prb_reserve); /* Commit the data (possibly finalizing it) and restore interrupts. 
*/ static void _prb_commit(struct prb_reserved_entry *e, unsigned long state_val) { struct prb_desc_ring *desc_ring = &e->rb->desc_ring; struct prb_desc *d = to_desc(desc_ring, e->id); unsigned long prev_state_val = DESC_SV(e->id, desc_reserved); /* Now the writer has finished all writing: LMM(_prb_commit:A) */ /* * Set the descriptor as committed. See "ABA Issues" about why * cmpxchg() instead of set() is used. * * 1 Guarantee all record data is stored before the descriptor state * is stored as committed. A write memory barrier is sufficient * for this. This pairs with desc_read:B and desc_reopen_last:A. * * 2. Guarantee the descriptor state is stored as committed before * re-checking the head ID in order to possibly finalize this * descriptor. This pairs with desc_reserve:D. * * Memory barrier involvement: * * If prb_commit:A reads from desc_reserve:D, then * desc_make_final:A reads from _prb_commit:B. * * Relies on: * * MB _prb_commit:B to prb_commit:A * matching * MB desc_reserve:D to desc_make_final:A */ if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, DESC_SV(e->id, state_val))) { /* LMM(_prb_commit:B) */ WARN_ON_ONCE(1); } /* Restore interrupts, the reserve/commit window is finished. */ local_irq_restore(e->irqflags); } /** * prb_commit() - Commit (previously reserved) data to the ringbuffer. * * @e: The entry containing the reserved data information. * * This is the public function available to writers to commit data. * * Note that the data is not yet available to readers until it is finalized. * Finalizing happens automatically when space for the next record is * reserved. * * See prb_final_commit() for a version of this function that finalizes * immediately. * * Context: Any context. Enables local interrupts. */ void prb_commit(struct prb_reserved_entry *e) { struct prb_desc_ring *desc_ring = &e->rb->desc_ring; unsigned long head_id; _prb_commit(e, desc_committed); /* * If this descriptor is no longer the head (i.e. a new record has * been allocated), extending the data for this record is no longer * allowed and therefore it must be finalized. */ head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_commit:A) */ if (head_id != e->id) desc_make_final(e->rb, e->id); } EXPORT_SYMBOL_IF_KUNIT(prb_commit); /** * prb_final_commit() - Commit and finalize (previously reserved) data to * the ringbuffer. * * @e: The entry containing the reserved data information. * * This is the public function available to writers to commit+finalize data. * * By finalizing, the data is made immediately available to readers. * * This function should only be used if there are no intentions of extending * this data using prb_reserve_in_last(). * * Context: Any context. Enables local interrupts. */ void prb_final_commit(struct prb_reserved_entry *e) { _prb_commit(e, desc_finalized); desc_update_last_finalized(e->rb); } /* * Count the number of lines in provided text. All text has at least 1 line * (even if @text_size is 0). Each '\n' processed is counted as an additional * line. */ static unsigned int count_lines(const char *text, unsigned int text_size) { unsigned int next_size = text_size; unsigned int line_count = 1; const char *next = text; while (next_size) { next = memchr(next, '\n', next_size); if (!next) break; line_count++; next++; next_size = text_size - (next - text); } return line_count; } /* * Given @blk_lpos, copy an expected @len of data into the provided buffer. * If @line_count is provided, count the number of lines in the data. 
* * This function (used by readers) performs strict validation on the data * size to possibly detect bugs in the writer code. A WARN_ON_ONCE() is * triggered if an internal error is detected. */ static bool copy_data(struct prb_data_ring *data_ring, struct prb_data_blk_lpos *blk_lpos, u16 len, char *buf, unsigned int buf_size, unsigned int *line_count) { unsigned int data_size; const char *data; /* Caller might not want any data. */ if ((!buf || !buf_size) && !line_count) return true; data = get_data(data_ring, blk_lpos, &data_size); if (!data) return false; /* * Actual cannot be less than expected. It can be more than expected * because of the trailing alignment padding. * * Note that invalid @len values can occur because the caller loads * the value during an allowed data race. */ if (data_size < (unsigned int)len) return false; /* Caller interested in the line count? */ if (line_count) *line_count = count_lines(data, len); /* Caller interested in the data content? */ if (!buf || !buf_size) return true; data_size = min_t(unsigned int, buf_size, len); memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */ return true; } /* * This is an extended version of desc_read(). It gets a copy of a specified * descriptor. However, it also verifies that the record is finalized and has * the sequence number @seq. On success, 0 is returned. * * Error return values: * -EINVAL: A finalized record with sequence number @seq does not exist. * -ENOENT: A finalized record with sequence number @seq exists, but its data * is not available. This is a valid record, so readers should * continue with the next record. */ static int desc_read_finalized_seq(struct prb_desc_ring *desc_ring, unsigned long id, u64 seq, struct prb_desc *desc_out) { struct prb_data_blk_lpos *blk_lpos = &desc_out->text_blk_lpos; enum desc_state d_state; u64 s; d_state = desc_read(desc_ring, id, desc_out, &s, NULL); /* * An unexpected @id (desc_miss) or @seq mismatch means the record * does not exist. A descriptor in the reserved or committed state * means the record does not yet exist for the reader. */ if (d_state == desc_miss || d_state == desc_reserved || d_state == desc_committed || s != seq) { return -EINVAL; } /* * A descriptor in the reusable state may no longer have its data * available; report it as existing but with lost data. Or the record * may actually be a record with lost data. */ if (d_state == desc_reusable || (blk_lpos->begin == FAILED_LPOS && blk_lpos->next == FAILED_LPOS)) { return -ENOENT; } return 0; } /* * Copy the ringbuffer data from the record with @seq to the provided * @r buffer. On success, 0 is returned. * * See desc_read_finalized_seq() for error return values. */ static int prb_read(struct printk_ringbuffer *rb, u64 seq, struct printk_record *r, unsigned int *line_count) { struct prb_desc_ring *desc_ring = &rb->desc_ring; struct printk_info *info = to_info(desc_ring, seq); struct prb_desc *rdesc = to_desc(desc_ring, seq); atomic_long_t *state_var = &rdesc->state_var; struct prb_desc desc; unsigned long id; int err; /* Extract the ID, used to specify the descriptor to read. */ id = DESC_ID(atomic_long_read(state_var)); /* Get a local copy of the correct descriptor (if available). */ err = desc_read_finalized_seq(desc_ring, id, seq, &desc); /* * If @r is NULL, the caller is only interested in the availability * of the record. */ if (err || !r) return err; /* If requested, copy meta data. */ if (r->info) memcpy(r->info, info, sizeof(*(r->info))); /* Copy text data. If it fails, this is a data-less record. 
*/ if (!copy_data(&rb->text_data_ring, &desc.text_blk_lpos, info->text_len, r->text_buf, r->text_buf_size, line_count)) { return -ENOENT; } /* Ensure the record is still finalized and has the same @seq. */ return desc_read_finalized_seq(desc_ring, id, seq, &desc); } /* Get the sequence number of the tail descriptor. */ u64 prb_first_seq(struct printk_ringbuffer *rb) { struct prb_desc_ring *desc_ring = &rb->desc_ring; enum desc_state d_state; struct prb_desc desc; unsigned long id; u64 seq; for (;;) { id = atomic_long_read(&rb->desc_ring.tail_id); /* LMM(prb_first_seq:A) */ d_state = desc_read(desc_ring, id, &desc, &seq, NULL); /* LMM(prb_first_seq:B) */ /* * This loop will not be infinite because the tail is * _always_ in the finalized or reusable state. */ if (d_state == desc_finalized || d_state == desc_reusable) break; /* * Guarantee the last state load from desc_read() is before * reloading @tail_id in order to see a new tail in the case * that the descriptor has been recycled. This pairs with * desc_reserve:D. * * Memory barrier involvement: * * If prb_first_seq:B reads from desc_reserve:F, then * prb_first_seq:A reads from desc_push_tail:B. * * Relies on: * * MB from desc_push_tail:B to desc_reserve:F * matching * RMB prb_first_seq:B to prb_first_seq:A */ smp_rmb(); /* LMM(prb_first_seq:C) */ } return seq; } /** * prb_next_reserve_seq() - Get the sequence number after the most recently * reserved record. * * @rb: The ringbuffer to get the sequence number from. * * This is the public function available to readers to see what sequence * number will be assigned to the next reserved record. * * Note that depending on the situation, this value can be equal to or * higher than the sequence number returned by prb_next_seq(). * * Context: Any context. * Return: The sequence number that will be assigned to the next record * reserved. */ u64 prb_next_reserve_seq(struct printk_ringbuffer *rb) { struct prb_desc_ring *desc_ring = &rb->desc_ring; unsigned long last_finalized_id; atomic_long_t *state_var; u64 last_finalized_seq; unsigned long head_id; struct prb_desc desc; unsigned long diff; struct prb_desc *d; int err; /* * It may not be possible to read a sequence number for @head_id. * So the ID of @last_finailzed_seq is used to calculate what the * sequence number of @head_id will be. */ try_again: last_finalized_seq = desc_last_finalized_seq(rb); /* * @head_id is loaded after @last_finalized_seq to ensure that * it points to the record with @last_finalized_seq or newer. * * Memory barrier involvement: * * If desc_last_finalized_seq:A reads from * desc_update_last_finalized:A, then * prb_next_reserve_seq:A reads from desc_reserve:D. * * Relies on: * * RELEASE from desc_reserve:D to desc_update_last_finalized:A * matching * ACQUIRE from desc_last_finalized_seq:A to prb_next_reserve_seq:A * * Note: desc_reserve:D and desc_update_last_finalized:A can be * different CPUs. However, the desc_update_last_finalized:A CPU * (which performs the release) must have previously seen * desc_read:C, which implies desc_reserve:D can be seen. */ head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_next_reserve_seq:A) */ d = to_desc(desc_ring, last_finalized_seq); state_var = &d->state_var; /* Extract the ID, used to specify the descriptor to read. */ last_finalized_id = DESC_ID(atomic_long_read(state_var)); /* Ensure @last_finalized_id is correct. 
*/ err = desc_read_finalized_seq(desc_ring, last_finalized_id, last_finalized_seq, &desc); if (err == -EINVAL) { if (last_finalized_seq == 0) { /* * No record has been finalized or even reserved yet. * * The @head_id is initialized such that the first * increment will yield the first record (seq=0). * Handle it separately to avoid a negative @diff * below. */ if (head_id == DESC0_ID(desc_ring->count_bits)) return 0; /* * One or more descriptors are already reserved. Use * the descriptor ID of the first one (@seq=0) for * the @diff below. */ last_finalized_id = DESC0_ID(desc_ring->count_bits) + 1; } else { /* Record must have been overwritten. Try again. */ goto try_again; } } /* Diff of known descriptor IDs to compute related sequence numbers. */ diff = head_id - last_finalized_id; /* * @head_id points to the most recently reserved record, but this * function returns the sequence number that will be assigned to the * next (not yet reserved) record. Thus +1 is needed. */ return (last_finalized_seq + diff + 1); } /* * Non-blocking read of a record. * * On success @seq is updated to the record that was read and (if provided) * @r and @line_count will contain the read/calculated data. * * On failure @seq is updated to a record that is not yet available to the * reader, but it will be the next record available to the reader. * * Note: When the current CPU is in panic, this function will skip over any * non-existent/non-finalized records in order to allow the panic CPU * to print any and all records that have been finalized. */ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, struct printk_record *r, unsigned int *line_count) { u64 tail_seq; int err; while ((err = prb_read(rb, *seq, r, line_count))) { tail_seq = prb_first_seq(rb); if (*seq < tail_seq) { /* * Behind the tail. Catch up and try again. This * can happen for -ENOENT and -EINVAL cases. */ *seq = tail_seq; } else if (err == -ENOENT) { /* Record exists, but the data was lost. Skip. */ (*seq)++; } else { /* * Non-existent/non-finalized record. Must stop. * * For panic situations it cannot be expected that * non-finalized records will become finalized. But * there may be other finalized records beyond that * need to be printed for a panic situation. If this * is the panic CPU, skip this * non-existent/non-finalized record unless non-panic * CPUs are still running and their debugging is * explicitly enabled. * * Note that new messages printed on panic CPU are * finalized when we are here. The only exception * might be the last message without trailing newline. * But it would have the sequence number returned * by "prb_next_reserve_seq() - 1". */ if (panic_on_this_cpu() && (!debug_non_panic_cpus || legacy_allow_panic_sync) && ((*seq + 1) < prb_next_reserve_seq(rb))) { (*seq)++; } else { return false; } } } return true; } /** * prb_read_valid() - Non-blocking read of a requested record or (if gone) * the next available record. * * @rb: The ringbuffer to read from. * @seq: The sequence number of the record to read. * @r: A record data buffer to store the read record to. * * This is the public function available to readers to read a record. * * The reader provides the @info and @text_buf buffers of @r to be * filled in. Any of the buffer pointers can be set to NULL if the reader * is not interested in that data. To ensure proper initialization of @r, * prb_rec_init_rd() should be used. * * Context: Any context. * Return: true if a record was read, otherwise false. 
* * On success, the reader must check r->info.seq to see which record was * actually read. This allows the reader to detect dropped records. * * Failure means @seq refers to a record not yet available to the reader. */ bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, struct printk_record *r) { return _prb_read_valid(rb, &seq, r, NULL); } EXPORT_SYMBOL_IF_KUNIT(prb_read_valid); /** * prb_read_valid_info() - Non-blocking read of meta data for a requested * record or (if gone) the next available record. * * @rb: The ringbuffer to read from. * @seq: The sequence number of the record to read. * @info: A buffer to store the read record meta data to. * @line_count: A buffer to store the number of lines in the record text. * * This is the public function available to readers to read only the * meta data of a record. * * The reader provides the @info, @line_count buffers to be filled in. * Either of the buffer pointers can be set to NULL if the reader is not * interested in that data. * * Context: Any context. * Return: true if a record's meta data was read, otherwise false. * * On success, the reader must check info->seq to see which record meta data * was actually read. This allows the reader to detect dropped records. * * Failure means @seq refers to a record not yet available to the reader. */ bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, struct printk_info *info, unsigned int *line_count) { struct printk_record r; prb_rec_init_rd(&r, info, NULL, 0); return _prb_read_valid(rb, &seq, &r, line_count); } /** * prb_first_valid_seq() - Get the sequence number of the oldest available * record. * * @rb: The ringbuffer to get the sequence number from. * * This is the public function available to readers to see what the * first/oldest valid sequence number is. * * This provides readers a starting point to begin iterating the ringbuffer. * * Context: Any context. * Return: The sequence number of the first/oldest record or, if the * ringbuffer is empty, 0 is returned. */ u64 prb_first_valid_seq(struct printk_ringbuffer *rb) { u64 seq = 0; if (!_prb_read_valid(rb, &seq, NULL, NULL)) return 0; return seq; } /** * prb_next_seq() - Get the sequence number after the last available record. * * @rb: The ringbuffer to get the sequence number from. * * This is the public function available to readers to see what the next * newest sequence number available to readers will be. * * This provides readers a sequence number to jump to if all currently * available records should be skipped. It is guaranteed that all records * previous to the returned value have been finalized and are (or were) * available to the reader. * * Context: Any context. * Return: The sequence number of the next newest (not yet available) record * for readers. */ u64 prb_next_seq(struct printk_ringbuffer *rb) { u64 seq; seq = desc_last_finalized_seq(rb); /* * Begin searching after the last finalized record. * * On 0, the search must begin at 0 because of hack#2 * of the bootstrapping phase it is not known if a * record at index 0 exists. */ if (seq != 0) seq++; /* * The information about the last finalized @seq might be inaccurate. * Search forward to find the current one. */ while (_prb_read_valid(rb, &seq, NULL, NULL)) seq++; return seq; } /** * prb_init() - Initialize a ringbuffer to use provided external buffers. * * @rb: The ringbuffer to initialize. * @text_buf: The data buffer for text data. * @textbits: The size of @text_buf as a power-of-2 value. * @descs: The descriptor buffer for ringbuffer records. 
* @descbits: The count of @descs items as a power-of-2 value. * @infos: The printk_info buffer for ringbuffer records. * * This is the public function available to writers to setup a ringbuffer * during runtime using provided buffers. * * This must match the initialization of DEFINE_PRINTKRB(). * * Context: Any context. */ void prb_init(struct printk_ringbuffer *rb, char *text_buf, unsigned int textbits, struct prb_desc *descs, unsigned int descbits, struct printk_info *infos) { memset(descs, 0, _DESCS_COUNT(descbits) * sizeof(descs[0])); memset(infos, 0, _DESCS_COUNT(descbits) * sizeof(infos[0])); rb->desc_ring.count_bits = descbits; rb->desc_ring.descs = descs; rb->desc_ring.infos = infos; atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits)); atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits)); atomic_long_set(&rb->desc_ring.last_finalized_seq, 0); rb->text_data_ring.size_bits = textbits; rb->text_data_ring.data = text_buf; atomic_long_set(&rb->text_data_ring.head_lpos, BLK0_LPOS(textbits)); atomic_long_set(&rb->text_data_ring.tail_lpos, BLK0_LPOS(textbits)); atomic_long_set(&rb->fail, 0); atomic_long_set(&(descs[_DESCS_COUNT(descbits) - 1].state_var), DESC0_SV(descbits)); descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.begin = FAILED_LPOS; descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.next = FAILED_LPOS; infos[0].seq = -(u64)_DESCS_COUNT(descbits); infos[_DESCS_COUNT(descbits) - 1].seq = 0; } EXPORT_SYMBOL_IF_KUNIT(prb_init); /** * prb_record_text_space() - Query the full actual used ringbuffer space for * the text data of a reserved entry. * * @e: The successfully reserved entry to query. * * This is the public function available to writers to see how much actual * space is used in the ringbuffer to store the text data of the specified * entry. * * This function is only valid if @e has been successfully reserved using * prb_reserve(). * * Context: Any context. * Return: The size in bytes used by the text data of the associated record. */ unsigned int prb_record_text_space(struct prb_reserved_entry *e) { return e->text_space; } |
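/*
 * Editorial illustration (not part of the ringbuffer sources): a minimal
 * sketch of how the writer and reader interfaces above fit together. It
 * loosely follows the sample usage documented in printk_ringbuffer.h and
 * assumes a ringbuffer instance created elsewhere, for example with
 * DEFINE_PRINTKRB(test_rb, 15, 5). The function names, the @textstr
 * parameter and the 64-byte read buffer below are hypothetical.
 */
static void example_prb_write(struct printk_ringbuffer *rb, const char *textstr)
{
	struct prb_reserved_entry e;
	struct printk_record r;

	/* Specify how much text space to reserve. */
	prb_rec_init_wr(&r, strlen(textstr) + 1);

	if (!prb_reserve(&e, rb, &r))
		return;

	/* Fill in the text, record its length, then commit and finalize. */
	snprintf(r.text_buf, r.text_buf_size, "%s", textstr);
	r.info->text_len = strlen(textstr);
	prb_final_commit(&e);
}

static void example_prb_read_all(struct printk_ringbuffer *rb)
{
	struct printk_info info;
	struct printk_record r;
	char text[64];
	u64 seq = 0;

	prb_rec_init_rd(&r, &info, &text[0], sizeof(text));

	/* Iterate over all finalized records, tolerating dropped ones. */
	while (prb_read_valid(rb, seq, &r)) {
		/* @info.seq may be newer than @seq if records were dropped. */
		seq = info.seq + 1;
		/* @text now holds up to @info.text_len bytes of record text. */
	}
}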
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	INET		An implementation of the TCP/IP protocol suite for the LINUX
 *			operating system. INET is implemented using the BSD Socket
 *			interface as the means of communication with the user level.
 *
 *		Generic INET6 transport hashtables
 *
 * Authors:	Lotsa people, from code originally in tcp, generalised here
 *		by Arnaldo Carvalho de Melo <acme@mandriva.com>
 */

#include <linux/module.h>
#include <linux/random.h>

#include <net/addrconf.h>
#include <net/hotdata.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/inet6_hashtables.h>
#include <net/secure_seq.h>
#include <net/ip.h>
#include <net/sock_reuseport.h>
#include <net/tcp.h>

void inet6_init_ehash_secret(void)
{
	net_get_random_sleepable_once(&inet6_ehash_secret,
				      sizeof(inet6_ehash_secret));
	net_get_random_sleepable_once(&tcp_ipv6_hash_secret,
				      sizeof(tcp_ipv6_hash_secret));
}

u32 inet6_ehashfn(const struct net *net,
		  const struct in6_addr *laddr, const u16 lport,
		  const struct in6_addr *faddr, const __be16 fport)
{
	u32 a, b, c;

	/*
	 * Please look at jhash() implementation for reference.
	 * Hash laddr + faddr + lport/fport + net_hash_mix.
	 * Notes:
	 * We combine laddr[0] (high order 32 bits of local address)
	 * with net_hash_mix() to hash a multiple of 3 words.
	 *
	 * We do not include JHASH_INITVAL + 36 contribution
	 * to initial values of a, b, c.
	 */
	a = b = c = tcp_ipv6_hash_secret;

	a += (__force u32)laddr->s6_addr32[0] ^ net_hash_mix(net);
	b += (__force u32)laddr->s6_addr32[1];
	c += (__force u32)laddr->s6_addr32[2];
	__jhash_mix(a, b, c);

	a += (__force u32)laddr->s6_addr32[3];
	b += (__force u32)faddr->s6_addr32[0];
	c += (__force u32)faddr->s6_addr32[1];
	__jhash_mix(a, b, c);

	a += (__force u32)faddr->s6_addr32[2];
	b += (__force u32)faddr->s6_addr32[3];
	c += (__force u32)fport;
	__jhash_final(a, b, c);

	/* Note: We need to add @lport instead of fully hashing it.
* See commits 9544d60a2605 ("inet: change lport contribution * to inet_ehashfn() and inet6_ehashfn()") and d4438ce68bf1 * ("inet: call inet6_ehashfn() once from inet6_hash_connect()") * for references. */ return lport + c; } EXPORT_SYMBOL_GPL(inet6_ehashfn); /* * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM * * The sockhash lock must be held as a reader here. */ struct sock *__inet6_lookup_established(const struct net *net, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const u16 hnum, const int dif, const int sdif) { const __portpair ports = INET_COMBINED_PORTS(sport, hnum); const struct hlist_nulls_node *node; struct inet_ehash_bucket *head; struct inet_hashinfo *hashinfo; unsigned int hash, slot; struct sock *sk; hashinfo = net->ipv4.tcp_death_row.hashinfo; hash = inet6_ehashfn(net, daddr, hnum, saddr, sport); slot = hash & hashinfo->ehash_mask; head = &hashinfo->ehash[slot]; begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) continue; if (!inet6_match(net, sk, saddr, daddr, ports, dif, sdif)) continue; if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) goto out; if (unlikely(!inet6_match(net, sk, saddr, daddr, ports, dif, sdif))) { sock_gen_put(sk); goto begin; } goto found; } if (get_nulls_value(node) != slot) goto begin; out: sk = NULL; found: return sk; } EXPORT_SYMBOL(__inet6_lookup_established); static inline int compute_score(struct sock *sk, const struct net *net, const unsigned short hnum, const struct in6_addr *daddr, const int dif, const int sdif) { int score = -1; if (net_eq(sock_net(sk), net) && READ_ONCE(inet_sk(sk)->inet_num) == hnum && sk->sk_family == PF_INET6) { if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) return -1; if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return -1; score = sk->sk_bound_dev_if ? 2 : 1; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; } return score; } /** * inet6_lookup_reuseport() - execute reuseport logic on AF_INET6 socket if necessary. * @net: network namespace. * @sk: AF_INET6 socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP. * @skb: context for a potential SK_REUSEPORT program. * @doff: header offset. * @saddr: source address. * @sport: source port. * @daddr: destination address. * @hnum: destination port in host byte order. * @ehashfn: hash function used to generate the fallback hash. * * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to * the selected sock or an error. 
*/ struct sock *inet6_lookup_reuseport(const struct net *net, struct sock *sk, struct sk_buff *skb, int doff, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned short hnum, inet6_ehashfn_t *ehashfn) { struct sock *reuse_sk = NULL; u32 phash; if (sk->sk_reuseport) { phash = INDIRECT_CALL_INET(ehashfn, udp6_ehashfn, inet6_ehashfn, net, daddr, hnum, saddr, sport); reuse_sk = reuseport_select_sock(sk, phash, skb, doff); } return reuse_sk; } EXPORT_SYMBOL_GPL(inet6_lookup_reuseport); /* called with rcu_read_lock() */ static struct sock *inet6_lhash2_lookup(const struct net *net, struct inet_listen_hashbucket *ilb2, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif, const int sdif) { struct sock *sk, *result = NULL; struct hlist_nulls_node *node; int score, hiscore = 0; sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { result = inet6_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, inet6_ehashfn); if (result) return result; result = sk; hiscore = score; } } return result; } struct sock *inet6_lookup_run_sk_lookup(const struct net *net, int protocol, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const u16 hnum, const int dif, inet6_ehashfn_t *ehashfn) { struct sock *sk, *reuse_sk; bool no_reuseport; no_reuseport = bpf_sk_lookup_run_v6(net, protocol, saddr, sport, daddr, hnum, dif, &sk); if (no_reuseport || IS_ERR_OR_NULL(sk)) return sk; reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, ehashfn); if (reuse_sk) sk = reuse_sk; return sk; } EXPORT_SYMBOL_GPL(inet6_lookup_run_sk_lookup); struct sock *inet6_lookup_listener(const struct net *net, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif, const int sdif) { struct inet_listen_hashbucket *ilb2; struct inet_hashinfo *hashinfo; struct sock *result = NULL; unsigned int hash2; /* Lookup redirect from BPF */ if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { result = inet6_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff, saddr, sport, daddr, hnum, dif, inet6_ehashfn); if (result) goto done; } hashinfo = net->ipv4.tcp_death_row.hashinfo; hash2 = ipv6_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); result = inet6_lhash2_lookup(net, ilb2, skb, doff, saddr, sport, daddr, hnum, dif, sdif); if (result) goto done; /* Lookup lhash2 with in6addr_any */ hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); result = inet6_lhash2_lookup(net, ilb2, skb, doff, saddr, sport, &in6addr_any, hnum, dif, sdif); done: if (IS_ERR(result)) return NULL; return result; } EXPORT_SYMBOL_GPL(inet6_lookup_listener); struct sock *inet6_lookup(const struct net *net, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, const int dif) { struct sock *sk; bool refcounted; sk = __inet6_lookup(net, skb, doff, saddr, sport, daddr, ntohs(dport), dif, 0, &refcounted); if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; } EXPORT_SYMBOL_GPL(inet6_lookup); static int __inet6_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, const __u16 lport, struct 
inet_timewait_sock **twp, bool rcu_lookup, u32 hash) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); const struct in6_addr *daddr = &sk->sk_v6_rcv_saddr; const struct in6_addr *saddr = &sk->sk_v6_daddr; const int dif = sk->sk_bound_dev_if; struct net *net = sock_net(sk); const int sdif = l3mdev_master_ifindex_by_index(net, dif); const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); struct inet_timewait_sock *tw = NULL; const struct hlist_nulls_node *node; struct sock *sk2; spinlock_t *lock; if (rcu_lookup) { sk_nulls_for_each(sk2, node, &head->chain) { if (sk2->sk_hash != hash || !inet6_match(net, sk2, saddr, daddr, ports, dif, sdif)) continue; if (sk2->sk_state == TCP_TIME_WAIT) break; return -EADDRNOTAVAIL; } return 0; } lock = inet_ehash_lockp(hinfo, hash); spin_lock(lock); sk_nulls_for_each(sk2, node, &head->chain) { if (sk2->sk_hash != hash) continue; if (likely(inet6_match(net, sk2, saddr, daddr, ports, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); if (tcp_twsk_unique(sk, sk2, twp)) break; } goto not_unique; } } /* Must record num and sport now. Otherwise we will see * in hash table socket with a funny identity. */ inet->inet_num = lport; inet->inet_sport = htons(lport); sk->sk_hash = hash; WARN_ON(!sk_unhashed(sk)); __sk_nulls_add_node_rcu(sk, &head->chain); if (tw) { sk_nulls_del_node_init_rcu((struct sock *)tw); __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); } spin_unlock(lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); if (twp) { *twp = tw; } else if (tw) { /* Silly. Should hash-dance instead... */ inet_twsk_deschedule_put(tw); } return 0; not_unique: spin_unlock(lock); return -EADDRNOTAVAIL; } static u64 inet6_sk_port_offset(const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); return secure_ipv6_port_ephemeral(sk->sk_v6_rcv_saddr.s6_addr32, sk->sk_v6_daddr.s6_addr32, inet->inet_dport); } int inet6_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { const struct in6_addr *daddr = &sk->sk_v6_rcv_saddr; const struct in6_addr *saddr = &sk->sk_v6_daddr; const struct inet_sock *inet = inet_sk(sk); const struct net *net = sock_net(sk); u64 port_offset = 0; u32 hash_port0; if (!inet_sk(sk)->inet_num) port_offset = inet6_sk_port_offset(sk); inet6_init_ehash_secret(); hash_port0 = inet6_ehashfn(net, daddr, 0, saddr, inet->inet_dport); return __inet_hash_connect(death_row, sk, port_offset, hash_port0, __inet6_check_established); } EXPORT_SYMBOL_GPL(inet6_hash_connect); |
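/*
 * Editorial illustration (not part of this file): callers such as
 * __inet6_lookup() in include/net/inet6_hashtables.h combine the two lookup
 * paths above by trying the established hash first and only falling back to
 * the listener lookup on a miss. The helper below is a simplified paraphrase
 * of that pattern; its name and exact parameter list are assumptions, not
 * the real header signature.
 */
static inline struct sock *
example_inet6_lookup(struct net *net, struct sk_buff *skb, int doff,
		     const struct in6_addr *saddr, const __be16 sport,
		     const struct in6_addr *daddr, const u16 hnum,
		     const int dif, const int sdif, bool *refcounted)
{
	struct sock *sk;

	/* Established sockets are returned with a reference already taken. */
	sk = __inet6_lookup_established(net, saddr, sport, daddr, hnum,
					dif, sdif);
	*refcounted = true;
	if (sk)
		return sk;

	/* Listener lookup runs under RCU; no reference is taken. */
	*refcounted = false;
	return inet6_lookup_listener(net, skb, doff, saddr, sport,
				     daddr, hnum, dif, sdif);
}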
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/swap.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/admin-guide/sysctl/vm.rst.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/folio_batch.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/mm_inline.h>
#include <linux/percpu_counter.h>
#include <linux/memremap.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>
#include <linux/page_idle.h>
#include <linux/local_lock.h>
#include <linux/buffer_head.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/pagemap.h>

/* How many pages do we try to swap or page in/out together? As a power of 2 */
int page_cluster;
static const int page_cluster_max = 31;

struct cpu_fbatches {
	/*
	 * The following folio batches are grouped together because they are protected
	 * by disabling preemption (and interrupts remain enabled).
*/ local_lock_t lock; struct folio_batch lru_add; struct folio_batch lru_deactivate_file; struct folio_batch lru_deactivate; struct folio_batch lru_lazyfree; #ifdef CONFIG_SMP struct folio_batch lru_activate; #endif /* Protecting the following batches which require disabling interrupts */ local_lock_t lock_irq; struct folio_batch lru_move_tail; }; static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = { .lock = INIT_LOCAL_LOCK(lock), .lock_irq = INIT_LOCAL_LOCK(lock_irq), }; static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp, unsigned long *flagsp) { if (folio_test_lru(folio)) { folio_lruvec_relock_irqsave(folio, lruvecp, flagsp); lruvec_del_folio(*lruvecp, folio); __folio_clear_lru_flags(folio); } } /* * This path almost never happens for VM activity - pages are normally freed * in batches. But it gets used by networking - and for compound pages. */ static void page_cache_release(struct folio *folio) { struct lruvec *lruvec = NULL; unsigned long flags; __page_cache_release(folio, &lruvec, &flags); if (lruvec) lruvec_unlock_irqrestore(lruvec, flags); } void __folio_put(struct folio *folio) { if (unlikely(folio_is_zone_device(folio))) { free_zone_device_folio(folio); return; } if (folio_test_hugetlb(folio)) { free_huge_folio(folio); return; } page_cache_release(folio); folio_unqueue_deferred_split(folio); mem_cgroup_uncharge(folio); free_frozen_pages(&folio->page, folio_order(folio)); } EXPORT_SYMBOL(__folio_put); typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio); static void lru_add(struct lruvec *lruvec, struct folio *folio) { int was_unevictable = folio_test_clear_unevictable(folio); long nr_pages = folio_nr_pages(folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* * Is an smp_mb__after_atomic() still required here, before * folio_evictable() tests the mlocked flag, to rule out the possibility * of stranding an evictable folio on an unevictable LRU? I think * not, because __munlock_folio() only clears the mlocked flag * while the LRU lock is held. * * (That is not true of __page_cache_release(), and not necessarily * true of folios_put(): but those only clear the mlocked flag after * folio_put_testzero() has excluded any other users of the folio.) */ if (folio_evictable(folio)) { if (was_unevictable) __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } else { folio_clear_active(folio); folio_set_unevictable(folio); /* * folio->mlock_count = !!folio_test_mlocked(folio)? * But that leaves __mlock_folio() in doubt whether another * actor has already counted the mlock or not. Err on the * safe side, underestimate, let page reclaim fix it, rather * than leaving a page on the unevictable LRU indefinitely. 
*/ folio->mlock_count = 0; if (!was_unevictable) __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); } lruvec_add_folio(lruvec, folio); trace_mm_lru_insertion(folio); } static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) { int i; struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; /* block memcg migration while the folio moves between lru */ if (move_fn != lru_add && !folio_test_clear_lru(folio)) continue; folio_lruvec_relock_irqsave(folio, &lruvec, &flags); move_fn(lruvec, folio); folio_set_lru(folio); } if (lruvec) lruvec_unlock_irqrestore(lruvec, flags); folios_put(fbatch); } static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch, struct folio *folio, move_fn_t move_fn, bool disable_irq) { unsigned long flags; folio_get(folio); if (disable_irq) local_lock_irqsave(&cpu_fbatches.lock_irq, flags); else local_lock(&cpu_fbatches.lock); if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || !folio_may_be_lru_cached(folio) || lru_cache_disabled()) folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn); if (disable_irq) local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags); else local_unlock(&cpu_fbatches.lock); } #define folio_batch_add_and_move(folio, op) \ __folio_batch_add_and_move( \ &cpu_fbatches.op, \ folio, \ op, \ offsetof(struct cpu_fbatches, op) >= \ offsetof(struct cpu_fbatches, lock_irq) \ ) static void lru_move_tail(struct lruvec *lruvec, struct folio *folio) { if (folio_test_unevictable(folio)) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); lruvec_add_folio_tail(lruvec, folio); __count_vm_events(PGROTATED, folio_nr_pages(folio)); } /* * Writeback is about to end against a folio which has been marked for * immediate reclaim. If it still appears to be reclaimable, move it * to the tail of the inactive list. * * folio_rotate_reclaimable() must disable IRQs, to prevent nasty races. */ void folio_rotate_reclaimable(struct folio *folio) { if (folio_test_locked(folio) || folio_test_dirty(folio) || folio_test_unevictable(folio) || !folio_test_lru(folio)) return; folio_batch_add_and_move(folio, lru_move_tail); } void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, unsigned int nr_io, unsigned int nr_rotated) __releases(lruvec->lru_lock) __releases(rcu) { unsigned long cost; /* * Reflect the relative cost of incurring IO and spending CPU * time on rotations. This doesn't attempt to make a precise * comparison, it just says: if reloads are about comparable * between the LRU lists, or rotations are overwhelmingly * different between them, adjust scan balance for CPU work. */ cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated; if (!cost) { spin_unlock_irq(&lruvec->lru_lock); rcu_read_unlock(); return; } for (;;) { unsigned long lrusize; /* Record cost event */ if (file) lruvec->file_cost += cost; else lruvec->anon_cost += cost; /* * Decay previous events * * Because workloads change over time (and to avoid * overflow) we keep these statistics as a floating * average, which ends up weighing recent refaults * more than old ones. 
*/ lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) + lruvec_page_state(lruvec, NR_ACTIVE_ANON) + lruvec_page_state(lruvec, NR_INACTIVE_FILE) + lruvec_page_state(lruvec, NR_ACTIVE_FILE); if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) { lruvec->file_cost /= 2; lruvec->anon_cost /= 2; } spin_unlock_irq(&lruvec->lru_lock); lruvec = parent_lruvec(lruvec); if (!lruvec) { rcu_read_unlock(); break; } spin_lock_irq(&lruvec->lru_lock); } } void lru_note_cost_refault(struct folio *folio) { struct lruvec *lruvec; lruvec = folio_lruvec_lock_irq(folio); lru_note_cost_unlock_irq(lruvec, folio_is_file_lru(folio), folio_nr_pages(folio), 0); } static void lru_activate(struct lruvec *lruvec, struct folio *folio) { long nr_pages = folio_nr_pages(folio); if (folio_test_active(folio) || folio_test_unevictable(folio)) return; lruvec_del_folio(lruvec, folio); folio_set_active(folio); lruvec_add_folio(lruvec, folio); trace_mm_lru_activate(folio); __count_vm_events(PGACTIVATE, nr_pages); count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages); } #ifdef CONFIG_SMP static void folio_activate_drain(int cpu) { struct folio_batch *fbatch = &per_cpu(cpu_fbatches.lru_activate, cpu); if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_activate); } void folio_activate(struct folio *folio) { if (folio_test_active(folio) || folio_test_unevictable(folio) || !folio_test_lru(folio)) return; folio_batch_add_and_move(folio, lru_activate); } #else static inline void folio_activate_drain(int cpu) { } void folio_activate(struct folio *folio) { struct lruvec *lruvec; if (!folio_test_clear_lru(folio)) return; lruvec = folio_lruvec_lock_irq(folio); lru_activate(lruvec, folio); lruvec_unlock_irq(lruvec); folio_set_lru(folio); } #endif static void __lru_cache_activate_folio(struct folio *folio) { struct folio_batch *fbatch; int i; local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_add); /* * Search backwards on the optimistic assumption that the folio being * activated has just been added to this batch. Note that only * the local batch is examined as a !LRU folio could be in the * process of being released, reclaimed, migrated or on a remote * batch that is currently being drained. Furthermore, marking * a remote batch's folio active potentially hits a race where * a folio is marked active just after it is added to the inactive * list causing accounting errors and BUG_ON checks to trigger. 
*/ for (i = folio_batch_count(fbatch) - 1; i >= 0; i--) { struct folio *batch_folio = fbatch->folios[i]; if (batch_folio == folio) { folio_set_active(folio); break; } } local_unlock(&cpu_fbatches.lock); } #ifdef CONFIG_LRU_GEN static void lru_gen_inc_refs(struct folio *folio) { unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f); if (folio_test_unevictable(folio)) return; /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) { set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced)); return; } do { if ((old_flags & LRU_REFS_MASK) == LRU_REFS_MASK) { if (!folio_test_workingset(folio)) folio_set_workingset(folio); return; } new_flags = old_flags + BIT(LRU_REFS_PGOFF); } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags)); } static bool lru_gen_clear_refs(struct folio *folio) { int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); unsigned long seq; if (gen < 0) return true; set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS | BIT(PG_workingset), 0); rcu_read_lock(); seq = READ_ONCE(folio_lruvec(folio)->lrugen.min_seq[type]); rcu_read_unlock(); /* whether can do without shuffling under the LRU lock */ return gen == lru_gen_from_seq(seq); } #else /* !CONFIG_LRU_GEN */ static void lru_gen_inc_refs(struct folio *folio) { } static bool lru_gen_clear_refs(struct folio *folio) { return false; } #endif /* CONFIG_LRU_GEN */ /** * folio_mark_accessed - Mark a folio as having seen activity. * @folio: The folio to mark. * * This function will perform one of the following transitions: * * * inactive,unreferenced -> inactive,referenced * * inactive,referenced -> active,unreferenced * * active,unreferenced -> active,referenced * * When a newly allocated folio is not yet visible, so safe for non-atomic ops, * __folio_set_referenced() may be substituted for folio_mark_accessed(). */ void folio_mark_accessed(struct folio *folio) { if (folio_test_dropbehind(folio)) return; if (lru_gen_enabled()) { lru_gen_inc_refs(folio); return; } if (!folio_test_referenced(folio)) { folio_set_referenced(folio); } else if (folio_test_unevictable(folio)) { /* * Unevictable pages are on the "LRU_UNEVICTABLE" list. But, * this list is never rotated or maintained, so marking an * unevictable page accessed has no effect. */ } else if (!folio_test_active(folio)) { /* * If the folio is on the LRU, queue it for activation via * cpu_fbatches.lru_activate. Otherwise, assume the folio is in a * folio_batch, mark it active and it'll be moved to the active * LRU on the next drain. */ if (folio_test_lru(folio)) folio_activate(folio); else __lru_cache_activate_folio(folio); folio_clear_referenced(folio); workingset_activation(folio); } if (folio_test_idle(folio)) folio_clear_idle(folio); } EXPORT_SYMBOL(folio_mark_accessed); /** * folio_add_lru - Add a folio to an LRU list. * @folio: The folio to be added to the LRU. * * Queue the folio for addition to the LRU. The decision on whether * to add the page to the [in]active [file|anon] list is deferred until the * folio_batch is drained. This gives a chance for the caller of folio_add_lru() * have the folio added to the active list using folio_mark_accessed(). 
*/ void folio_add_lru(struct folio *folio) { VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* see the comment in lru_gen_folio_seq() */ if (lru_gen_enabled() && !folio_test_unevictable(folio) && lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) folio_set_active(folio); folio_batch_add_and_move(folio, lru_add); } EXPORT_SYMBOL(folio_add_lru); /** * folio_add_lru_vma() - Add a folio to the appropriate LRU list for this VMA. * @folio: The folio to be added to the LRU. * @vma: VMA in which the folio is mapped. * * If the VMA is mlocked, @folio is added to the unevictable list. * Otherwise, it is treated the same way as folio_add_lru(). */ void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma) { VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED)) mlock_new_folio(folio); else folio_add_lru(folio); } /* * If the folio cannot be invalidated, it is moved to the * inactive list to speed up its reclaim. It is moved to the * head of the list, rather than the tail, to give the flusher * threads some time to write it out, as this is much more * effective than the single-page writeout from reclaim. * * If the folio isn't mapped and dirty/writeback, the folio * could be reclaimed asap using the reclaim flag. * * 1. active, mapped folio -> none * 2. active, dirty/writeback folio -> inactive, head, reclaim * 3. inactive, mapped folio -> none * 4. inactive, dirty/writeback folio -> inactive, head, reclaim * 5. inactive, clean -> inactive, tail * 6. Others -> none * * In 4, it moves to the head of the inactive list so the folio is * written out by flusher threads as this is much more efficient * than the single-page writeout from reclaim. */ static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio) { bool active = folio_test_active(folio) || lru_gen_enabled(); long nr_pages = folio_nr_pages(folio); if (folio_test_unevictable(folio)) return; /* Some processes are using the folio */ if (folio_mapped(folio)) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_clear_referenced(folio); if (folio_test_writeback(folio) || folio_test_dirty(folio)) { /* * Setting the reclaim flag could race with * folio_end_writeback() and confuse readahead. But the * race window is _really_ small and it's not a critical * problem. */ lruvec_add_folio(lruvec, folio); folio_set_reclaim(folio); } else { /* * The folio's writeback ended while it was in the batch. * We move that folio to the tail of the inactive list. 
*/ lruvec_add_folio_tail(lruvec, folio); __count_vm_events(PGROTATED, nr_pages); } if (active) { __count_vm_events(PGDEACTIVATE, nr_pages); count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } } static void lru_deactivate(struct lruvec *lruvec, struct folio *folio) { long nr_pages = folio_nr_pages(folio); if (folio_test_unevictable(folio) || !(folio_test_active(folio) || lru_gen_enabled())) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_clear_referenced(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(PGDEACTIVATE, nr_pages); count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio) { long nr_pages = folio_nr_pages(folio); if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) || folio_test_swapcache(folio) || folio_test_unevictable(folio)) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); if (lru_gen_enabled()) lru_gen_clear_refs(folio); else folio_clear_referenced(folio); /* * Lazyfree folios are clean anonymous folios. They have * the swapbacked flag cleared, to distinguish them from normal * anonymous folios */ folio_clear_swapbacked(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(PGLAZYFREE, nr_pages); count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages); } /* * Drain pages out of the cpu's folio_batch. * Either "cpu" is the current CPU, and preemption has already been * disabled; or "cpu" is being hot-unplugged, and is already dead. */ void lru_add_drain_cpu(int cpu) { struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu); struct folio_batch *fbatch = &fbatches->lru_add; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_add); fbatch = &fbatches->lru_move_tail; /* Disabling interrupts below acts as a compiler barrier. */ if (data_race(folio_batch_count(fbatch))) { unsigned long flags; /* No harm done if a racing interrupt already did this */ local_lock_irqsave(&cpu_fbatches.lock_irq, flags); folio_batch_move_lru(fbatch, lru_move_tail); local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags); } fbatch = &fbatches->lru_deactivate_file; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_deactivate_file); fbatch = &fbatches->lru_deactivate; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_deactivate); fbatch = &fbatches->lru_lazyfree; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_lazyfree); folio_activate_drain(cpu); } /** * deactivate_file_folio() - Deactivate a file folio. * @folio: Folio to deactivate. * * This function hints to the VM that @folio is a good reclaim candidate, * for example if its invalidation fails due to the folio being dirty * or under writeback. * * Context: Caller holds a reference on the folio. */ void deactivate_file_folio(struct folio *folio) { /* Deactivating an unevictable folio will not accelerate reclaim */ if (folio_test_unevictable(folio) || !folio_test_lru(folio)) return; if (lru_gen_enabled() && lru_gen_clear_refs(folio)) return; folio_batch_add_and_move(folio, lru_deactivate_file); } /* * folio_deactivate - deactivate a folio * @folio: folio to deactivate * * folio_deactivate() moves @folio to the inactive list if @folio was on the * active list and was not unevictable. This is done to accelerate the * reclaim of @folio. */ void folio_deactivate(struct folio *folio) { if (folio_test_unevictable(folio) || !folio_test_lru(folio)) return; if (lru_gen_enabled() ? 
lru_gen_clear_refs(folio) : !folio_test_active(folio)) return; folio_batch_add_and_move(folio, lru_deactivate); } /** * folio_mark_lazyfree - make an anon folio lazyfree * @folio: folio to deactivate * * folio_mark_lazyfree() moves @folio to the inactive file list. * This is done to accelerate the reclaim of @folio. */ void folio_mark_lazyfree(struct folio *folio) { if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) || !folio_test_lru(folio) || folio_test_swapcache(folio) || folio_test_unevictable(folio)) return; folio_batch_add_and_move(folio, lru_lazyfree); } void lru_add_drain(void) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); mlock_drain_local(); } /* * It's called from per-cpu workqueue context in SMP case so * lru_add_drain_cpu and invalidate_bh_lrus_cpu should run on * the same cpu. It shouldn't be a problem in !SMP case since * the core is only one and the locks will disable preemption. */ static void lru_add_and_bh_lrus_drain(void) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); invalidate_bh_lrus_cpu(); mlock_drain_local(); } void lru_add_drain_cpu_zone(struct zone *zone) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); drain_local_pages(zone); local_unlock(&cpu_fbatches.lock); mlock_drain_local(); } #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); static void lru_add_drain_per_cpu(struct work_struct *dummy) { lru_add_and_bh_lrus_drain(); } static bool cpu_needs_drain(unsigned int cpu) { struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu); /* Check these in order of likelihood that they're not zero */ return folio_batch_count(&fbatches->lru_add) || folio_batch_count(&fbatches->lru_move_tail) || folio_batch_count(&fbatches->lru_deactivate_file) || folio_batch_count(&fbatches->lru_deactivate) || folio_batch_count(&fbatches->lru_lazyfree) || folio_batch_count(&fbatches->lru_activate) || need_mlock_drain(cpu) || has_bh_in_lru(cpu, NULL); } /* * Doesn't need any cpu hotplug locking because we do rely on per-cpu * kworkers being shut down before our page_alloc_cpu_dead callback is * executed on the offlined cpu. * Calling this function with cpu hotplug locks held can actually lead * to obscure indirect dependencies via WQ context. */ static inline void __lru_add_drain_all(bool force_all_cpus) { /* * lru_drain_gen - Global pages generation number * * (A) Definition: global lru_drain_gen = x implies that all generations * 0 < n <= x are already *scheduled* for draining. * * This is an optimization for the highly-contended use case where a * user space workload keeps constantly generating a flow of pages for * each CPU. */ static unsigned int lru_drain_gen; static struct cpumask has_work; static DEFINE_MUTEX(lock); unsigned cpu, this_gen; /* * Make sure nobody triggers this path before mm_percpu_wq is fully * initialized. */ if (WARN_ON(!mm_percpu_wq)) return; /* * Guarantee folio_batch counter stores visible by this CPU * are visible to other CPUs before loading the current drain * generation. */ smp_mb(); /* * (B) Locally cache global LRU draining generation number * * The read barrier ensures that the counter is loaded before the mutex * is taken. It pairs with smp_mb() inside the mutex critical section * at (D). */ this_gen = smp_load_acquire(&lru_drain_gen); /* It helps everyone if we do our own local drain immediately. 
*/ lru_add_drain(); mutex_lock(&lock); /* * (C) Exit the draining operation if a newer generation, from another * lru_add_drain_all(), was already scheduled for draining. Check (A). */ if (unlikely(this_gen != lru_drain_gen && !force_all_cpus)) goto done; /* * (D) Increment global generation number * * Pairs with smp_load_acquire() at (B), outside of the critical * section. Use a full memory barrier to guarantee that the * new global drain generation number is stored before loading * folio_batch counters. * * This pairing must be done here, before the for_each_online_cpu loop * below which drains the page vectors. * * Let x, y, and z represent some system CPU numbers, where x < y < z. * Assume CPU #z is in the middle of the for_each_online_cpu loop * below and has already reached CPU #y's per-cpu data. CPU #x comes * along, adds some pages to its per-cpu vectors, then calls * lru_add_drain_all(). * * If the paired barrier is done at any later step, e.g. after the * loop, CPU #x will just exit at (C) and miss flushing out all of its * added pages. */ WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1); smp_mb(); cpumask_clear(&has_work); for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); if (cpu_needs_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); __cpumask_set_cpu(cpu, &has_work); } } for_each_cpu(cpu, &has_work) flush_work(&per_cpu(lru_add_drain_work, cpu)); done: mutex_unlock(&lock); } void lru_add_drain_all(void) { __lru_add_drain_all(false); } #else void lru_add_drain_all(void) { lru_add_drain(); } #endif /* CONFIG_SMP */ atomic_t lru_disable_count = ATOMIC_INIT(0); /* * lru_cache_disable() needs to be called before we start compiling * a list of folios to be migrated using folio_isolate_lru(). * It drains folios on LRU cache and then disable on all cpus until * lru_cache_enable is called. * * Must be paired with a call to lru_cache_enable(). */ void lru_cache_disable(void) { atomic_inc(&lru_disable_count); /* * Readers of lru_disable_count are protected by either disabling * preemption or rcu_read_lock: * * preempt_disable, local_irq_disable [bh_lru_lock()] * rcu_read_lock [rt_spin_lock CONFIG_PREEMPT_RT] * preempt_disable [local_lock !CONFIG_PREEMPT_RT] * * Since v5.1 kernel, synchronize_rcu() is guaranteed to wait on * preempt_disable() regions of code. So any CPU which sees * lru_disable_count = 0 will have exited the critical * section when synchronize_rcu() returns. */ synchronize_rcu_expedited(); #ifdef CONFIG_SMP __lru_add_drain_all(true); #else lru_add_and_bh_lrus_drain(); #endif } /** * folios_put_refs - Reduce the reference count on a batch of folios. * @folios: The folios. * @refs: The number of refs to subtract from each folio. * * Like folio_put(), but for a batch of folios. This is more efficient * than writing the loop yourself as it will optimise the locks which need * to be taken if the folios are freed. The folios batch is returned * empty and ready to be reused for another batch; there is no need * to reinitialise it. If @refs is NULL, we subtract one from each * folio refcount. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) { int i, j; struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0, j = 0; i < folios->nr; i++) { struct folio *folio = folios->folios[i]; unsigned int nr_refs = refs ? 
refs[i] : 1; if (is_huge_zero_folio(folio)) continue; if (folio_is_zone_device(folio)) { if (lruvec) { lruvec_unlock_irqrestore(lruvec, flags); lruvec = NULL; } if (folio_ref_sub_and_test(folio, nr_refs)) free_zone_device_folio(folio); continue; } if (!folio_ref_sub_and_test(folio, nr_refs)) continue; /* hugetlb has its own memcg */ if (folio_test_hugetlb(folio)) { if (lruvec) { lruvec_unlock_irqrestore(lruvec, flags); lruvec = NULL; } free_huge_folio(folio); continue; } folio_unqueue_deferred_split(folio); __page_cache_release(folio, &lruvec, &flags); if (j != i) folios->folios[j] = folio; j++; } if (lruvec) lruvec_unlock_irqrestore(lruvec, flags); if (!j) { folio_batch_reinit(folios); return; } folios->nr = j; mem_cgroup_uncharge_folios(folios); free_unref_folios(folios); } EXPORT_SYMBOL(folios_put_refs); /** * release_pages - batched put_page() * @arg: array of pages to release * @nr: number of pages * * Decrement the reference count on all the pages in @arg. If it * fell to zero, remove the page from the LRU and free it. * * Note that the argument can be an array of pages, encoded pages, * or folio pointers. We ignore any encoded bits, and turn any of * them into just a folio that gets free'd. */ void release_pages(release_pages_arg arg, int nr) { struct folio_batch fbatch; int refs[FOLIO_BATCH_SIZE]; struct encoded_page **encoded = arg.encoded_pages; int i; folio_batch_init(&fbatch); for (i = 0; i < nr; i++) { /* Turn any of the argument types into a folio */ struct folio *folio = page_folio(encoded_page_ptr(encoded[i])); /* Is our next entry actually "nr_pages" -> "nr_refs" ? */ refs[fbatch.nr] = 1; if (unlikely(encoded_page_flags(encoded[i]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) refs[fbatch.nr] = encoded_nr_pages(encoded[++i]); if (folio_batch_add(&fbatch, folio) > 0) continue; folios_put_refs(&fbatch, refs); } if (fbatch.nr) folios_put_refs(&fbatch, refs); } EXPORT_SYMBOL(release_pages); /* * The folios which we're about to release may be in the deferred lru-addition * queues. That would prevent them from really being freed right now. That's * OK from a correctness point of view but is inefficient - those folios may be * cache-warm and we want to give them back to the page allocator ASAP. * * So __folio_batch_release() will drain those queues here. * folio_batch_move_lru() calls folios_put() directly to avoid * mutual recursion. */ void __folio_batch_release(struct folio_batch *fbatch) { if (!fbatch->percpu_pvec_drained) { lru_add_drain(); fbatch->percpu_pvec_drained = true; } folios_put(fbatch); } EXPORT_SYMBOL(__folio_batch_release); /** * folio_batch_remove_exceptionals() - Prune non-folios from a batch. * @fbatch: The batch to prune * * find_get_entries() fills a batch with both folios and shadow/swap/DAX * entries. This function prunes all the non-folio entries from @fbatch * without leaving holes, so that it can be passed on to folio-only batch * operations. 
 */
void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
{
        unsigned int i, j;

        for (i = 0, j = 0; i < folio_batch_count(fbatch); i++) {
                struct folio *folio = fbatch->folios[i];

                if (!xa_is_value(folio))
                        fbatch->folios[j++] = folio;
        }
        fbatch->nr = j;
}

#ifdef CONFIG_MEMCG
static void lruvec_reparent_lru(struct lruvec *child_lruvec,
                                struct lruvec *parent_lruvec,
                                enum lru_list lru, int nid)
{
        int zid;
        struct zone *zone;

        if (lru != LRU_UNEVICTABLE)
                list_splice_tail_init(&child_lruvec->lists[lru],
                                      &parent_lruvec->lists[lru]);

        for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1) {
                unsigned long size = mem_cgroup_get_zone_lru_size(child_lruvec, lru, zid);

                mem_cgroup_update_lru_size(parent_lruvec, lru, zid, size);
        }
}

void lru_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid)
{
        enum lru_list lru;
        struct lruvec *child_lruvec, *parent_lruvec;

        child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
        parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));

        parent_lruvec->anon_cost += child_lruvec->anon_cost;
        parent_lruvec->file_cost += child_lruvec->file_cost;

        for_each_lru(lru)
                lruvec_reparent_lru(child_lruvec, parent_lruvec, lru, nid);
}
#endif

static const struct ctl_table swap_sysctl_table[] = {
        {
                .procname       = "page-cluster",
                .data           = &page_cluster,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = SYSCTL_ZERO,
                .extra2         = (void *)&page_cluster_max,
        }
};

/*
 * Perform any setup for the swap system
 */
void __init swap_setup(void)
{
        unsigned long megs = PAGES_TO_MB(totalram_pages());

        /* Use a smaller cluster for small-memory machines */
        if (megs < 16)
                page_cluster = 2;
        else
                page_cluster = 3;
        /*
         * Right now other parts of the system means that we
         * _really_ don't want to cluster much more
         */

        register_sysctl_init("vm", swap_sysctl_table);
}
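The mm/swap.c listing ends here. As an illustration of the lru_cache_disable()/lru_cache_enable() pairing documented in the comments above, here is a minimal, hypothetical sketch; the function name example_isolate_for_migration is invented, the caller is assumed to already hold a reference on the folio, and error handling plus the actual migration call are omitted.

/* Hypothetical usage sketch, not part of mm/swap.c. */
static void example_isolate_for_migration(struct folio *folio,
                                          struct list_head *pagelist)
{
        lru_cache_disable();                    /* drain and disable per-CPU LRU batches */

        if (folio_isolate_lru(folio))           /* take the folio off its LRU list */
                list_add_tail(&folio->lru, pagelist);

        /* ... hand pagelist to the migration code ... */

        lru_cache_enable();                     /* must pair with lru_cache_disable() */
}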
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NUMA memory policies for Linux.
 * Copyright 2003,2004 Andi Kleen SuSE Labs
 */
#ifndef _LINUX_MEMPOLICY_H
#define _LINUX_MEMPOLICY_H 1

#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/node.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <uapi/linux/mempolicy.h>

struct mm_struct;

#define NO_INTERLEAVE_INDEX (-1UL)      /* use task il_prev for interleaving */

#ifdef CONFIG_NUMA

/*
 * Describe a memory policy.
 *
 * A mempolicy can be either associated with a process or with a VMA.
 * For VMA related allocations the VMA policy is preferred, otherwise
 * the process policy is used. Interrupts ignore the memory policy
 * of the current process.
 *
 * Locking policy for interleave:
 * In process context there is no locking because only the process accesses
 * its own state. All vma manipulation is somewhat protected by a down_read on
 * mmap_lock.
 *
 * Freeing policy:
 * Mempolicy objects are reference counted. A mempolicy will be freed when
 * mpol_put() decrements the reference count to zero.
 *
 * Duplicating policy objects:
 * mpol_dup() allocates a new mempolicy and copies the specified mempolicy
 * to the new storage. The reference count of the new object is initialized
 * to 1, representing the caller of mpol_dup().
 */
struct mempolicy {
        atomic_t refcnt;
        unsigned short mode;    /* See MPOL_* above */
        unsigned short flags;   /* See set_mempolicy() MPOL_F_* above */
        nodemask_t nodes;       /* interleave/bind/preferred/etc */
        int home_node;          /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */

        union {
                nodemask_t cpuset_mems_allowed; /* relative to these nodes */
                nodemask_t user_nodemask;       /* nodemask passed by user */
        } w;

        struct rcu_head rcu;
};

/*
 * Support for managing mempolicy data objects (clone, copy, destroy)
 * The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
 */

extern void __mpol_put(struct mempolicy *pol);
static inline void mpol_put(struct mempolicy *pol)
{
        if (pol)
                __mpol_put(pol);
}

/*
 * Does mempolicy pol need explicit unref after use?
 * Currently only needed for shared policies.
*/ static inline int mpol_needs_cond_ref(struct mempolicy *pol) { return (pol && (pol->flags & MPOL_F_SHARED)); } static inline void mpol_cond_put(struct mempolicy *pol) { if (mpol_needs_cond_ref(pol)) __mpol_put(pol); } extern struct mempolicy *__mpol_dup(struct mempolicy *pol); static inline struct mempolicy *mpol_dup(struct mempolicy *pol) { if (pol) pol = __mpol_dup(pol); return pol; } static inline void mpol_get(struct mempolicy *pol) { if (pol) atomic_inc(&pol->refcnt); } extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b); static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (a == b) return true; return __mpol_equal(a, b); } /* * Tree of shared policies for a shared memory region. */ struct shared_policy { struct rb_root root; rwlock_t lock; }; struct sp_node { struct rb_node nd; pgoff_t start, end; struct mempolicy *policy; }; int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst); void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); int mpol_set_shared_policy(struct shared_policy *sp, struct vm_area_struct *vma, struct mempolicy *mpol); void mpol_free_shared_policy(struct shared_policy *sp); struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx); struct mempolicy *get_task_policy(struct task_struct *p); struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx); struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr, int order, pgoff_t *ilx); bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); extern void numa_policy_init(void); extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new); extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); extern int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask); extern bool init_nodemask_of_mempolicy(nodemask_t *mask); extern bool mempolicy_in_oom_domain(struct task_struct *tsk, const nodemask_t *mask); extern unsigned int mempolicy_slab_node(void); extern enum zone_type policy_zone; static inline void check_highest_zone(enum zone_type k) { if (k > policy_zone && k != ZONE_MOVABLE) policy_zone = k; } int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags); #ifdef CONFIG_TMPFS extern int mpol_parse_str(char *str, struct mempolicy **mpol); #endif extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol); /* Check if a vma is migratable */ extern bool vma_migratable(struct vm_area_struct *vma); int mpol_misplaced(struct folio *folio, struct vm_fault *vmf, unsigned long addr); extern void mpol_put_task_policy(struct task_struct *); static inline bool mpol_is_preferred_many(struct mempolicy *pol) { return (pol->mode == MPOL_PREFERRED_MANY); } extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone); extern int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords); #else struct mempolicy {}; static inline struct mempolicy *get_task_policy(struct task_struct *p) { return NULL; } static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { return true; } static inline void mpol_put(struct mempolicy *pol) { } static inline void mpol_cond_put(struct mempolicy *pol) { } static inline void mpol_get(struct mempolicy *pol) { } struct shared_policy {}; static inline void mpol_shared_policy_init(struct shared_policy *sp, struct 
                                            mempolicy *mpol) { }

static inline void mpol_free_shared_policy(struct shared_policy *sp) { }

static inline struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx)
{
        return NULL;
}

static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
                unsigned long addr, int order, pgoff_t *ilx)
{
        *ilx = 0;
        return NULL;
}

static inline int vma_dup_policy(struct vm_area_struct *src,
                                 struct vm_area_struct *dst)
{
        return 0;
}

static inline void numa_policy_init(void) { }

static inline void numa_default_policy(void) { }

static inline void mpol_rebind_task(struct task_struct *tsk,
                                    const nodemask_t *new) { }

static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { }

static inline int huge_node(struct vm_area_struct *vma, unsigned long addr,
                            gfp_t gfp_flags, struct mempolicy **mpol,
                            nodemask_t **nodemask)
{
        *mpol = NULL;
        *nodemask = NULL;
        return 0;
}

static inline bool init_nodemask_of_mempolicy(nodemask_t *m)
{
        return false;
}

static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                                   const nodemask_t *to, int flags)
{
        return 0;
}

static inline void check_highest_zone(int k) { }

#ifdef CONFIG_TMPFS
static inline int mpol_parse_str(char *str, struct mempolicy **mpol)
{
        return 1;       /* error */
}
#endif

static inline int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
                                 unsigned long address)
{
        return -1; /* no node preference */
}

static inline void mpol_put_task_policy(struct task_struct *task) { }

static inline bool mpol_is_preferred_many(struct mempolicy *pol)
{
        return false;
}

#endif /* CONFIG_NUMA */
#endif
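Before the next header, a minimal, hypothetical sketch of the reference-counting rules spelled out in the comments above: a policy obtained via mpol_dup() is owned by the caller and released with mpol_put(), while a policy that may carry MPOL_F_SHARED is dropped with mpol_cond_put(). The function name example_mempolicy_refs is invented and error handling is omitted.

/* Hypothetical usage sketch, not part of mempolicy.h. */
static void example_mempolicy_refs(struct task_struct *task,
                                   struct shared_policy *sp, pgoff_t idx)
{
        struct mempolicy *copy, *pol;

        copy = mpol_dup(get_task_policy(task)); /* new object, refcount 1, owned by us */
        /* ... use copy ... */
        mpol_put(copy);                         /* drop our reference */

        pol = mpol_shared_policy_lookup(sp, idx); /* may be a shared policy */
        /* ... use pol ... */
        mpol_cond_put(pol);                     /* unref only if MPOL_F_SHARED */
}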
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_DCACHE_H
#define __LINUX_DCACHE_H

#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/math.h>
#include <linux/rculist.h>
#include <linux/rculist_bl.h>
#include <linux/spinlock.h>
#include <linux/seqlock.h>
#include <linux/cache.h>
#include <linux/rcupdate.h>
#include <linux/lockref.h>
#include <linux/stringhash.h>
#include <linux/wait.h>

struct path;
struct file;
struct vfsmount;

/*
 * linux/include/linux/dcache.h
 *
 * Dirent cache data structures
 *
 * (C) Copyright 1997 Thomas Schoebel-Theuer,
 * with heavy changes by Linus Torvalds
 */

#define IS_ROOT(x) ((x) == (x)->d_parent)

/* The hash is always the low bits of hash_len */
#ifdef __LITTLE_ENDIAN
 #define HASH_LEN_DECLARE u32 hash; u32 len
 #define bytemask_from_count(cnt)       (~(~0ul << (cnt)*8))
#else
 #define HASH_LEN_DECLARE u32 len; u32 hash
 #define bytemask_from_count(cnt)       (~(~0ul >> (cnt)*8))
#endif

/*
 * "quick string" -- eases parameter passing, but more importantly
 * saves "metadata" about the string (ie length and the hash).
 *
 * hash comes first so it snuggles against d_parent in the
 * dentry.
*/ struct qstr { union { struct { HASH_LEN_DECLARE; }; u64 hash_len; }; const unsigned char *name; }; #define QSTR_INIT(n,l) { { { .len = l } }, .name = n } #define QSTR_LEN(n,l) (struct qstr)QSTR_INIT(n,l) #define QSTR(n) QSTR_LEN(n, strlen(n)) extern const struct qstr empty_name; extern const struct qstr slash_name; extern const struct qstr dotdot_name; /* * Try to keep struct dentry aligned on 64 byte cachelines (this will * give reasonable cacheline footprint with larger lines without the * large memory footprint increase). */ #ifdef CONFIG_64BIT # define DNAME_INLINE_WORDS 5 /* 192 bytes */ #else # ifdef CONFIG_SMP # define DNAME_INLINE_WORDS 9 /* 128 bytes */ # else # define DNAME_INLINE_WORDS 11 /* 128 bytes */ # endif #endif #define DNAME_INLINE_LEN (DNAME_INLINE_WORDS*sizeof(unsigned long)) union shortname_store { unsigned char string[DNAME_INLINE_LEN]; unsigned long words[DNAME_INLINE_WORDS]; }; #define d_lock d_lockref.lock #define d_iname d_shortname.string struct dentry { /* RCU lookup touched fields */ unsigned int d_flags; /* protected by d_lock */ seqcount_spinlock_t d_seq; /* per dentry seqlock */ struct hlist_bl_node d_hash; /* lookup hash list */ struct dentry *d_parent; /* parent directory */ union { struct qstr __d_name; /* for use ONLY in fs/dcache.c */ const struct qstr d_name; }; struct inode *d_inode; /* Where the name belongs to - NULL is * negative */ union shortname_store d_shortname; /* --- cacheline 1 boundary (64 bytes) was 32 bytes ago --- */ /* Ref lookup also touches following */ const struct dentry_operations *d_op; struct super_block *d_sb; /* The root of the dentry tree */ unsigned long d_time; /* used by d_revalidate */ void *d_fsdata; /* fs-specific data */ /* --- cacheline 2 boundary (128 bytes) --- */ struct lockref d_lockref; /* per-dentry lock and refcount * keep separate from RCU lookup area if * possible! */ union { struct list_head d_lru; /* LRU list */ wait_queue_head_t *d_wait; /* in-lookup ones only */ }; struct hlist_node d_sib; /* child of parent list */ struct hlist_head d_children; /* our children */ /* * d_alias and d_rcu can share memory */ union { struct hlist_node d_alias; /* inode alias list */ struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */ struct rcu_head d_rcu; } d_u; }; /* * dentry->d_lock spinlock nesting subclasses: * * 0: normal * 1: nested */ enum dentry_d_lock_class { DENTRY_D_LOCK_NORMAL, /* implicitly used by plain spin_lock() APIs. */ DENTRY_D_LOCK_NESTED }; enum d_real_type { D_REAL_DATA, D_REAL_METADATA, }; struct dentry_operations { int (*d_revalidate)(struct inode *, const struct qstr *, struct dentry *, unsigned int); int (*d_weak_revalidate)(struct dentry *, unsigned int); int (*d_hash)(const struct dentry *, struct qstr *); int (*d_compare)(const struct dentry *, unsigned int, const char *, const struct qstr *); int (*d_delete)(const struct dentry *); int (*d_init)(struct dentry *); void (*d_release)(struct dentry *); void (*d_prune)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, enum d_real_type type); bool (*d_unalias_trylock)(const struct dentry *); void (*d_unalias_unlock)(const struct dentry *); } ____cacheline_aligned; /* * Locking rules for dentry_operations callbacks are to be found in * Documentation/filesystems/locking.rst. Keep it updated! 
* * FUrther descriptions are found in Documentation/filesystems/vfs.rst. * Keep it updated too! */ /* d_flags entries */ enum dentry_flags { DCACHE_OP_HASH = BIT(0), DCACHE_OP_COMPARE = BIT(1), DCACHE_OP_REVALIDATE = BIT(2), DCACHE_OP_DELETE = BIT(3), DCACHE_OP_PRUNE = BIT(4), /* * This dentry is possibly not currently connected to the dcache tree, * in which case its parent will either be itself, or will have this * flag as well. nfsd will not use a dentry with this bit set, but will * first endeavour to clear the bit either by discovering that it is * connected, or by performing lookup operations. Any filesystem which * supports nfsd_operations MUST have a lookup function which, if it * finds a directory inode with a DCACHE_DISCONNECTED dentry, will * d_move that dentry into place and return that dentry rather than the * passed one, typically using d_splice_alias. */ DCACHE_DISCONNECTED = BIT(5), DCACHE_REFERENCED = BIT(6), /* Recently used, don't discard. */ DCACHE_DONTCACHE = BIT(7), /* Purge from memory on final dput() */ DCACHE_CANT_MOUNT = BIT(8), DCACHE_SHRINK_LIST = BIT(10), DCACHE_OP_WEAK_REVALIDATE = BIT(11), /* * this dentry has been "silly renamed" and has to be deleted on the * last dput() */ DCACHE_NFSFS_RENAMED = BIT(12), DCACHE_FSNOTIFY_PARENT_WATCHED = BIT(13), /* Parent inode is watched by some fsnotify listener */ DCACHE_DENTRY_KILLED = BIT(14), DCACHE_MOUNTED = BIT(15), /* is a mountpoint */ DCACHE_NEED_AUTOMOUNT = BIT(16), /* handle automount on this dir */ DCACHE_MANAGE_TRANSIT = BIT(17), /* manage transit from this dirent */ DCACHE_LRU_LIST = BIT(18), DCACHE_ENTRY_TYPE = (7 << 19), /* bits 19..21 are for storing type: */ DCACHE_MISS_TYPE = (0 << 19), /* Negative dentry */ DCACHE_WHITEOUT_TYPE = (1 << 19), /* Whiteout dentry (stop pathwalk) */ DCACHE_DIRECTORY_TYPE = (2 << 19), /* Normal directory */ DCACHE_AUTODIR_TYPE = (3 << 19), /* Lookupless directory (presumed automount) */ DCACHE_REGULAR_TYPE = (4 << 19), /* Regular file type */ DCACHE_SPECIAL_TYPE = (5 << 19), /* Other file type */ DCACHE_SYMLINK_TYPE = (6 << 19), /* Symlink */ DCACHE_NOKEY_NAME = BIT(22), /* Encrypted name encoded without key */ DCACHE_OP_REAL = BIT(23), DCACHE_PAR_LOOKUP = BIT(24), /* being looked up (with parent locked shared) */ DCACHE_DENTRY_CURSOR = BIT(25), DCACHE_NORCU = BIT(26), /* No RCU delay for freeing */ DCACHE_PERSISTENT = BIT(27) }; #define DCACHE_MANAGED_DENTRY \ (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT) extern seqlock_t rename_lock; /* * These are the low-level FS interfaces to the dcache.. 
*/ extern void d_instantiate(struct dentry *, struct inode *); extern void d_instantiate_new(struct dentry *, struct inode *); extern void __d_drop(struct dentry *dentry); extern void d_drop(struct dentry *dentry); extern void d_delete(struct dentry *); /* allocate/de-allocate */ extern struct dentry * d_alloc(struct dentry *, const struct qstr *); extern struct dentry * d_alloc_anon(struct super_block *); extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *, wait_queue_head_t *); extern struct dentry * d_splice_alias(struct inode *, struct dentry *); /* weird procfs mess; *NOT* exported */ extern struct dentry * d_splice_alias_ops(struct inode *, struct dentry *, const struct dentry_operations *); extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); extern bool d_same_name(const struct dentry *dentry, const struct dentry *parent, const struct qstr *name); extern struct dentry *d_find_any_alias(struct inode *inode); extern struct dentry * d_obtain_alias(struct inode *); extern struct dentry * d_obtain_root(struct inode *); extern void shrink_dcache_sb(struct super_block *); extern void shrink_dcache_parent(struct dentry *); extern void d_invalidate(struct dentry *); /* only used at mount-time */ extern struct dentry * d_make_root(struct inode *); extern void d_mark_tmpfile(struct file *, struct inode *); int d_mark_tmpfile_name(struct file *file, const struct qstr *name); extern void d_tmpfile(struct file *, struct inode *); extern struct dentry *d_find_alias(struct inode *); extern void d_prune_aliases(struct inode *); extern void d_dispose_if_unused(struct dentry *, struct list_head *); extern void shrink_dentry_list(struct list_head *); extern struct dentry *d_find_alias_rcu(struct inode *); /* test whether we have any submounts in a subdir tree */ extern int path_has_submounts(const struct path *); /* * This adds the entry to the hash queues. */ extern void d_rehash(struct dentry *); extern void d_add(struct dentry *, struct inode *); /* used for rename() and baskets */ extern void d_move(struct dentry *, struct dentry *); extern void d_exchange(struct dentry *, struct dentry *); extern struct dentry *d_ancestor(struct dentry *, struct dentry *); extern struct dentry *d_lookup(const struct dentry *, const struct qstr *); static inline unsigned d_count(const struct dentry *dentry) { return dentry->d_lockref.count; } ino_t d_parent_ino(struct dentry *dentry); /* * helper function for dentry_operations.d_dname() members */ extern __printf(3, 4) char *dynamic_dname(char *, int, const char *, ...); extern char *__d_path(const struct path *, const struct path *, char *, int); extern char *d_absolute_path(const struct path *, char *, int); extern char *d_path(const struct path *, char *, int); extern char *dentry_path_raw(const struct dentry *, char *, int); extern char *dentry_path(const struct dentry *, char *, int); /* Allocation counts.. */ /** * dget_dlock - get a reference to a dentry * @dentry: dentry to get a reference to * * Given a live dentry, increment the reference count and return the dentry. * Caller must hold @dentry->d_lock. Making sure that dentry is alive is * caller's resonsibility. There are many conditions sufficient to guarantee * that; e.g. anything with non-negative refcount is alive, so's anything * hashed, anything positive, anyone's parent, etc. 
*/ static inline struct dentry *dget_dlock(struct dentry *dentry) { dentry->d_lockref.count++; return dentry; } /** * dget - get a reference to a dentry * @dentry: dentry to get a reference to * * Given a dentry or %NULL pointer increment the reference count * if appropriate and return the dentry. A dentry will not be * destroyed when it has references. Conversely, a dentry with * no references can disappear for any number of reasons, starting * with memory pressure. In other words, that primitive is * used to clone an existing reference; using it on something with * zero refcount is a bug. * * NOTE: it will spin if @dentry->d_lock is held. From the deadlock * avoidance point of view it is equivalent to spin_lock()/increment * refcount/spin_unlock(), so calling it under @dentry->d_lock is * always a bug; so's calling it under ->d_lock on any of its descendents. * */ static inline struct dentry *dget(struct dentry *dentry) { if (dentry) lockref_get(&dentry->d_lockref); return dentry; } extern struct dentry *dget_parent(struct dentry *dentry); /** * d_unhashed - is dentry hashed * @dentry: entry to check * * Returns true if the dentry passed is not currently hashed. */ static inline int d_unhashed(const struct dentry *dentry) { return hlist_bl_unhashed(&dentry->d_hash); } static inline int d_unlinked(const struct dentry *dentry) { return d_unhashed(dentry) && !IS_ROOT(dentry); } static inline int cant_mount(const struct dentry *dentry) { return (dentry->d_flags & DCACHE_CANT_MOUNT); } static inline void dont_mount(struct dentry *dentry) { spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_CANT_MOUNT; spin_unlock(&dentry->d_lock); } extern void __d_lookup_unhash_wake(struct dentry *dentry); static inline int d_in_lookup(const struct dentry *dentry) { return dentry->d_flags & DCACHE_PAR_LOOKUP; } static inline void d_lookup_done(struct dentry *dentry) { if (unlikely(d_in_lookup(dentry))) __d_lookup_unhash_wake(dentry); } extern void dput(struct dentry *); static inline bool d_managed(const struct dentry *dentry) { return dentry->d_flags & DCACHE_MANAGED_DENTRY; } static inline bool d_mountpoint(const struct dentry *dentry) { return dentry->d_flags & DCACHE_MOUNTED; } /* * Directory cache entry type accessor functions. */ static inline unsigned __d_entry_type(const struct dentry *dentry) { return dentry->d_flags & DCACHE_ENTRY_TYPE; } static inline bool d_is_miss(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_MISS_TYPE; } static inline bool d_is_whiteout(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_WHITEOUT_TYPE; } static inline bool d_can_lookup(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_DIRECTORY_TYPE; } static inline bool d_is_autodir(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_AUTODIR_TYPE; } static inline bool d_is_dir(const struct dentry *dentry) { return d_can_lookup(dentry) || d_is_autodir(dentry); } static inline bool d_is_symlink(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_SYMLINK_TYPE; } static inline bool d_is_reg(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_REGULAR_TYPE; } static inline bool d_is_special(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_SPECIAL_TYPE; } static inline bool d_is_file(const struct dentry *dentry) { return d_is_reg(dentry) || d_is_special(dentry); } static inline bool d_is_negative(const struct dentry *dentry) { // TODO: check d_is_whiteout(dentry) also. 
return d_is_miss(dentry); } static inline bool d_flags_negative(unsigned flags) { return (flags & DCACHE_ENTRY_TYPE) == DCACHE_MISS_TYPE; } static inline bool d_is_positive(const struct dentry *dentry) { return !d_is_negative(dentry); } /** * d_really_is_negative - Determine if a dentry is really negative (ignoring fallthroughs) * @dentry: The dentry in question * * Returns true if the dentry represents either an absent name or a name that * doesn't map to an inode (ie. ->d_inode is NULL). The dentry could represent * a true miss, a whiteout that isn't represented by a 0,0 chardev or a * fallthrough marker in an opaque directory. * * Note! (1) This should be used *only* by a filesystem to examine its own * dentries. It should not be used to look at some other filesystem's * dentries. (2) It should also be used in combination with d_inode() to get * the inode. (3) The dentry may have something attached to ->d_lower and the * type field of the flags may be set to something other than miss or whiteout. */ static inline bool d_really_is_negative(const struct dentry *dentry) { return dentry->d_inode == NULL; } /** * d_really_is_positive - Determine if a dentry is really positive (ignoring fallthroughs) * @dentry: The dentry in question * * Returns true if the dentry represents a name that maps to an inode * (ie. ->d_inode is not NULL). The dentry might still represent a whiteout if * that is represented on medium as a 0,0 chardev. * * Note! (1) This should be used *only* by a filesystem to examine its own * dentries. It should not be used to look at some other filesystem's * dentries. (2) It should also be used in combination with d_inode() to get * the inode. */ static inline bool d_really_is_positive(const struct dentry *dentry) { return dentry->d_inode != NULL; } static inline int simple_positive(const struct dentry *dentry) { return d_really_is_positive(dentry) && !d_unhashed(dentry); } unsigned long vfs_pressure_ratio(unsigned long val); /** * d_inode - Get the actual inode of this dentry * @dentry: The dentry to query * * This is the helper normal filesystems should use to get at their own inodes * in their own dentries and ignore the layering superimposed upon them. */ static inline struct inode *d_inode(const struct dentry *dentry) { return dentry->d_inode; } /** * d_inode_rcu - Get the actual inode of this dentry with READ_ONCE() * @dentry: The dentry to query * * This is the helper normal filesystems should use to get at their own inodes * in their own dentries and ignore the layering superimposed upon them. */ static inline struct inode *d_inode_rcu(const struct dentry *dentry) { return READ_ONCE(dentry->d_inode); } /** * d_backing_inode - Get upper or lower inode we should be using * @upper: The upper layer * * This is the helper that should be used to get at the inode that will be used * if this dentry were to be opened as a file. The inode may be on the upper * dentry or it may be on a lower dentry pinned by the upper. * * Normal filesystems should not use this to access their own inodes. */ static inline struct inode *d_backing_inode(const struct dentry *upper) { struct inode *inode = upper->d_inode; return inode; } /** * d_real - Return the real dentry * @dentry: the dentry to query * @type: the type of real dentry (data or metadata) * * If dentry is on a union/overlay, then return the underlying, real dentry. * Otherwise return the dentry itself. 
 *
 * See also: Documentation/filesystems/vfs.rst
 */
static inline struct dentry *d_real(struct dentry *dentry, enum d_real_type type)
{
        if (unlikely(dentry->d_flags & DCACHE_OP_REAL))
                return dentry->d_op->d_real(dentry, type);
        else
                return dentry;
}

/**
 * d_real_inode - Return the real inode hosting the data
 * @dentry: The dentry to query
 *
 * If dentry is on a union/overlay, then return the underlying, real inode.
 * Otherwise return d_inode().
 */
static inline struct inode *d_real_inode(const struct dentry *dentry)
{
        /* This usage of d_real() results in const dentry */
        return d_inode(d_real((struct dentry *) dentry, D_REAL_DATA));
}

struct name_snapshot {
        struct qstr name;
        union shortname_store inline_name;
};
void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *);
void release_dentry_name_snapshot(struct name_snapshot *);

static inline struct dentry *d_first_child(const struct dentry *dentry)
{
        return hlist_entry_safe(dentry->d_children.first, struct dentry, d_sib);
}

static inline struct dentry *d_next_sibling(const struct dentry *dentry)
{
        return hlist_entry_safe(dentry->d_sib.next, struct dentry, d_sib);
}

void set_default_d_op(struct super_block *, const struct dentry_operations *);

struct dentry *d_make_persistent(struct dentry *, struct inode *);
void d_make_discardable(struct dentry *dentry);

#endif /* __LINUX_DCACHE_H */
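To close out the dcache.h listing, a minimal, hypothetical sketch of the reference and layering helpers described above: dget() clones a reference the caller already holds, dput() releases it, and d_real_inode() resolves overlay/union layering where d_inode() would not. The function name example_dentry_helpers is invented.

/* Hypothetical usage sketch, not part of dcache.h. */
static bool example_dentry_helpers(struct dentry *dentry)
{
        struct dentry *ref = dget(dentry);              /* clone an existing reference */
        struct inode *inode = d_real_inode(ref);        /* real (possibly lower) inode */
        bool is_dir = inode && d_is_dir(ref);           /* type checks read d_flags, not ->d_inode */

        dput(ref);                                      /* drop the cloned reference */
        return is_dir;
}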
// SPDX-License-Identifier: GPL-2.0-only
/*
 * (C) 1997 Linus Torvalds
 * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
 */
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/mm.h>
#include <linux/backing-dev.h>
#include <linux/hash.h>
#include <linux/swap.h>
#include <linux/security.h>
#include <linux/cdev.h>
#include <linux/memblock.h>
#include <linux/fsnotify.h>
#include <linux/fsverity.h>
#include <linux/mount.h>
#include <linux/posix_acl.h>
#include <linux/ratelimit.h>
#include <linux/list_lru.h>
#include <linux/iversion.h>
#include <linux/rw_hint.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <trace/events/writeback.h>
#define CREATE_TRACE_POINTS
#include <trace/events/timestamp.h>

#include "internal.h"

/*
 * Inode locking rules:
 *
 * inode->i_lock protects:
 *   inode->i_state, inode->i_hash, __iget(), inode->i_io_list
 * Inode LRU list locks protect:
 *   inode->i_sb->s_inode_lru, inode->i_lru
 * inode->i_sb->s_inode_list_lock protects:
 *   inode->i_sb->s_inodes, inode->i_sb_list
 * bdi->wb.list_lock protects:
 *   bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list
 * inode_hash_lock protects:
 *   inode_hashtable, inode->i_hash
 *
 * Lock ordering:
 *
 * inode->i_sb->s_inode_list_lock
 *   inode->i_lock
 *     Inode LRU list locks
 *
 * bdi->wb.list_lock
 *   inode->i_lock
 *
 * inode_hash_lock
 *   inode->i_sb->s_inode_list_lock
 *   inode->i_lock
 *
 * iunique_lock
 *   inode_hash_lock
 */

static unsigned int i_hash_mask __ro_after_init;
static unsigned int i_hash_shift __ro_after_init;
static struct hlist_head *inode_hashtable __ro_after_init;
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);

/*
 * Empty aops. Can be used for the cases where the user does not
 * define any of the address_space operations.
 */
const struct address_space_operations empty_aops = {
};
EXPORT_SYMBOL(empty_aops);

static DEFINE_PER_CPU(unsigned long, nr_inodes);
static DEFINE_PER_CPU(unsigned long, nr_unused);

static struct kmem_cache *inode_cachep __ro_after_init;

static long get_nr_inodes(void)
{
        int i;
        long sum = 0;

        for_each_possible_cpu(i)
                sum += per_cpu(nr_inodes, i);
        return sum < 0 ? 0 : sum;
}

static inline long get_nr_inodes_unused(void)
{
        int i;
        long sum = 0;

        for_each_possible_cpu(i)
                sum += per_cpu(nr_unused, i);
        return sum < 0 ? 0 : sum;
}

long get_nr_dirty_inodes(void)
{
        /* not actually dirty inodes, but a wild approximation */
        long nr_dirty = get_nr_inodes() - get_nr_inodes_unused();

        return nr_dirty > 0 ?
nr_dirty : 0; } #ifdef CONFIG_DEBUG_FS static DEFINE_PER_CPU(long, mg_ctime_updates); static DEFINE_PER_CPU(long, mg_fine_stamps); static DEFINE_PER_CPU(long, mg_ctime_swaps); static unsigned long get_mg_ctime_updates(void) { unsigned long sum = 0; int i; for_each_possible_cpu(i) sum += data_race(per_cpu(mg_ctime_updates, i)); return sum; } static unsigned long get_mg_fine_stamps(void) { unsigned long sum = 0; int i; for_each_possible_cpu(i) sum += data_race(per_cpu(mg_fine_stamps, i)); return sum; } static unsigned long get_mg_ctime_swaps(void) { unsigned long sum = 0; int i; for_each_possible_cpu(i) sum += data_race(per_cpu(mg_ctime_swaps, i)); return sum; } #define mgtime_counter_inc(__var) this_cpu_inc(__var) static int mgts_show(struct seq_file *s, void *p) { unsigned long ctime_updates = get_mg_ctime_updates(); unsigned long ctime_swaps = get_mg_ctime_swaps(); unsigned long fine_stamps = get_mg_fine_stamps(); unsigned long floor_swaps = timekeeping_get_mg_floor_swaps(); seq_printf(s, "%lu %lu %lu %lu\n", ctime_updates, ctime_swaps, fine_stamps, floor_swaps); return 0; } DEFINE_SHOW_ATTRIBUTE(mgts); static int __init mg_debugfs_init(void) { debugfs_create_file("multigrain_timestamps", S_IFREG | S_IRUGO, NULL, NULL, &mgts_fops); return 0; } late_initcall(mg_debugfs_init); #else /* ! CONFIG_DEBUG_FS */ #define mgtime_counter_inc(__var) do { } while (0) #endif /* CONFIG_DEBUG_FS */ /* * Handle nr_inode sysctl */ #ifdef CONFIG_SYSCTL /* * Statistics gathering.. */ static struct inodes_stat_t inodes_stat; static int proc_nr_inodes(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { inodes_stat.nr_inodes = get_nr_inodes(); inodes_stat.nr_unused = get_nr_inodes_unused(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } static const struct ctl_table inodes_sysctls[] = { { .procname = "inode-nr", .data = &inodes_stat, .maxlen = 2*sizeof(long), .mode = 0444, .proc_handler = proc_nr_inodes, }, { .procname = "inode-state", .data = &inodes_stat, .maxlen = 7*sizeof(long), .mode = 0444, .proc_handler = proc_nr_inodes, }, }; static int __init init_fs_inode_sysctls(void) { register_sysctl_init("fs", inodes_sysctls); return 0; } early_initcall(init_fs_inode_sysctls); #endif static int no_open(struct inode *inode, struct file *file) { return -ENXIO; } /** * inode_init_always_gfp - perform inode structure initialisation * @sb: superblock inode belongs to * @inode: inode to initialise * @gfp: allocation flags * * These are initializations that need to be done on every inode * allocation as the fields are not initialised by slab allocation. * If there are additional allocations required @gfp is used. 
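 *
 * Example (editor's sketch, not part of the original file): a filesystem
 * that keeps its own cache of in-core inodes, in the style of XFS, can
 * re-initialise the VFS portion of a recycled inode with
 * inode_init_always() before reusing it; filesystem-private fields are
 * reset separately. The "foofs" type and field names are hypothetical.
 *
 *      static int foofs_reinit_inode(struct super_block *sb,
 *                                    struct foofs_inode *fi)
 *      {
 *              struct inode *inode = &fi->vfs_inode;
 *              int error;
 *
 *              error = inode_init_always(sb, inode);
 *              if (error)
 *                      return error;
 *
 *              fi->private_flags = 0;
 *              return 0;
 *      }
 *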
*/ int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp) { static const struct inode_operations empty_iops; static const struct file_operations no_open_fops = {.open = no_open}; struct address_space *const mapping = &inode->i_data; inode->i_sb = sb; inode->i_blkbits = sb->s_blocksize_bits; inode->i_flags = 0; inode_state_assign_raw(inode, 0); atomic64_set(&inode->i_sequence, 0); atomic_set(&inode->i_count, 1); inode->i_op = &empty_iops; inode->i_fop = &no_open_fops; inode->i_ino = 0; inode->__i_nlink = 1; inode->i_opflags = 0; if (sb->s_xattr) inode->i_opflags |= IOP_XATTR; if (sb->s_type->fs_flags & FS_MGTIME) inode->i_opflags |= IOP_MGTIME; i_uid_write(inode, 0); i_gid_write(inode, 0); atomic_set(&inode->i_writecount, 0); inode->i_size = 0; inode->i_write_hint = WRITE_LIFE_NOT_SET; inode->i_blocks = 0; inode->i_bytes = 0; inode->i_generation = 0; inode->i_pipe = NULL; inode->i_cdev = NULL; inode->i_link = NULL; inode->i_dir_seq = 0; inode->i_rdev = 0; inode->dirtied_when = 0; #ifdef CONFIG_CGROUP_WRITEBACK inode->i_wb_frn_winner = 0; inode->i_wb_frn_avg_time = 0; inode->i_wb_frn_history = 0; #endif spin_lock_init(&inode->i_lock); lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); init_rwsem(&inode->i_rwsem); lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key); atomic_set(&inode->i_dio_count, 0); mapping->a_ops = &empty_aops; mapping->host = inode; mapping->flags = 0; mapping->wb_err = 0; atomic_set(&mapping->i_mmap_writable, 0); #ifdef CONFIG_READ_ONLY_THP_FOR_FS atomic_set(&mapping->nr_thps, 0); #endif mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->writeback_index = 0; init_rwsem(&mapping->invalidate_lock); lockdep_set_class_and_name(&mapping->invalidate_lock, &sb->s_type->invalidate_lock_key, "mapping.invalidate_lock"); if (sb->s_iflags & SB_I_STABLE_WRITES) mapping_set_stable_writes(mapping); inode->i_private = NULL; inode->i_mapping = mapping; INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ #ifdef CONFIG_FS_POSIX_ACL inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; #endif #ifdef CONFIG_FSNOTIFY inode->i_fsnotify_mask = 0; #endif inode->i_flctx = NULL; if (unlikely(security_inode_alloc(inode, gfp))) return -ENOMEM; this_cpu_inc(nr_inodes); return 0; } EXPORT_SYMBOL(inode_init_always_gfp); void free_inode_nonrcu(struct inode *inode) { kmem_cache_free(inode_cachep, inode); } EXPORT_SYMBOL(free_inode_nonrcu); static void i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); if (inode->free_inode) inode->free_inode(inode); else free_inode_nonrcu(inode); } /** * alloc_inode - obtain an inode * @sb: superblock * * Allocates a new inode for given superblock. 
* Inode wont be chained in superblock s_inodes list * This means : * - fs can't be unmount * - quotas, fsnotify, writeback can't work */ struct inode *alloc_inode(struct super_block *sb) { const struct super_operations *ops = sb->s_op; struct inode *inode; if (ops->alloc_inode) inode = ops->alloc_inode(sb); else inode = alloc_inode_sb(sb, inode_cachep, GFP_KERNEL); if (!inode) return NULL; if (unlikely(inode_init_always(sb, inode))) { if (ops->destroy_inode) { ops->destroy_inode(inode); if (!ops->free_inode) return NULL; } inode->free_inode = ops->free_inode; i_callback(&inode->i_rcu); return NULL; } return inode; } void __destroy_inode(struct inode *inode) { inode_detach_wb(inode); security_inode_free(inode); fsnotify_inode_delete(inode); locks_free_lock_context(inode); if (!inode->i_nlink) { WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0); atomic_long_dec(&inode->i_sb->s_remove_count); } #ifdef CONFIG_FS_POSIX_ACL if (inode->i_acl && !is_uncached_acl(inode->i_acl)) posix_acl_release(inode->i_acl); if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl)) posix_acl_release(inode->i_default_acl); #endif this_cpu_dec(nr_inodes); } EXPORT_SYMBOL(__destroy_inode); static void destroy_inode(struct inode *inode) { const struct super_operations *ops = inode->i_sb->s_op; BUG_ON(!list_empty(&inode->i_lru)); __destroy_inode(inode); if (ops->destroy_inode) { ops->destroy_inode(inode); if (!ops->free_inode) return; } inode->free_inode = ops->free_inode; call_rcu(&inode->i_rcu, i_callback); } /** * drop_nlink - directly drop an inode's link count * @inode: inode * * This is a low-level filesystem helper to replace any * direct filesystem manipulation of i_nlink. In cases * where we are attempting to track writes to the * filesystem, a decrement to zero means an imminent * write when the file is truncated and actually unlinked * on the filesystem. */ void drop_nlink(struct inode *inode) { WARN_ON(inode->i_nlink == 0); inode->__i_nlink--; if (!inode->i_nlink) atomic_long_inc(&inode->i_sb->s_remove_count); } EXPORT_SYMBOL(drop_nlink); /** * clear_nlink - directly zero an inode's link count * @inode: inode * * This is a low-level filesystem helper to replace any * direct filesystem manipulation of i_nlink. See * drop_nlink() for why we care about i_nlink hitting zero. */ void clear_nlink(struct inode *inode) { if (inode->i_nlink) { inode->__i_nlink = 0; atomic_long_inc(&inode->i_sb->s_remove_count); } } EXPORT_SYMBOL(clear_nlink); /** * set_nlink - directly set an inode's link count * @inode: inode * @nlink: new nlink (should be non-zero) * * This is a low-level filesystem helper to replace any * direct filesystem manipulation of i_nlink. */ void set_nlink(struct inode *inode, unsigned int nlink) { if (!nlink) { clear_nlink(inode); } else { /* Yes, some filesystems do change nlink from zero to one */ if (inode->i_nlink == 0) atomic_long_dec(&inode->i_sb->s_remove_count); inode->__i_nlink = nlink; } } EXPORT_SYMBOL(set_nlink); /** * inc_nlink - directly increment an inode's link count * @inode: inode * * This is a low-level filesystem helper to replace any * direct filesystem manipulation of i_nlink. Currently, * it is only here for parity with dec_nlink(). 
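 *
 * Example (editor's sketch, not part of the original file): a simple
 * filesystem adjusting link counts from its ->link() and ->unlink()
 * methods, roughly what simple_link() and simple_unlink() in fs/libfs.c
 * do once timestamp updates are left out. "foofs" is hypothetical.
 *
 *      static int foofs_link(struct dentry *old_dentry, struct inode *dir,
 *                            struct dentry *dentry)
 *      {
 *              struct inode *inode = d_inode(old_dentry);
 *
 *              inc_nlink(inode);
 *              ihold(inode);
 *              dget(dentry);
 *              d_instantiate(dentry, inode);
 *              return 0;
 *      }
 *
 *      static int foofs_unlink(struct inode *dir, struct dentry *dentry)
 *      {
 *              drop_nlink(d_inode(dentry));
 *              dput(dentry);
 *              return 0;
 *      }
 *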
*/ void inc_nlink(struct inode *inode) { if (unlikely(inode->i_nlink == 0)) { WARN_ON(!(inode_state_read_once(inode) & I_LINKABLE)); atomic_long_dec(&inode->i_sb->s_remove_count); } inode->__i_nlink++; } EXPORT_SYMBOL(inc_nlink); static void __address_space_init_once(struct address_space *mapping) { xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT); init_rwsem(&mapping->i_mmap_rwsem); spin_lock_init(&mapping->i_private_lock); mapping->i_mmap = RB_ROOT_CACHED; } void address_space_init_once(struct address_space *mapping) { memset(mapping, 0, sizeof(*mapping)); __address_space_init_once(mapping); } EXPORT_SYMBOL(address_space_init_once); /* * These are initializations that only need to be done * once, because the fields are idempotent across use * of the inode, so let the slab aware of that. */ void inode_init_once(struct inode *inode) { memset(inode, 0, sizeof(*inode)); INIT_HLIST_NODE(&inode->i_hash); INIT_LIST_HEAD(&inode->i_devices); INIT_LIST_HEAD(&inode->i_io_list); INIT_LIST_HEAD(&inode->i_wb_list); INIT_LIST_HEAD(&inode->i_lru); INIT_LIST_HEAD(&inode->i_sb_list); __address_space_init_once(&inode->i_data); i_size_ordered_init(inode); } EXPORT_SYMBOL(inode_init_once); static void init_once(void *foo) { struct inode *inode = (struct inode *) foo; inode_init_once(inode); } /* * get additional reference to inode; caller must already hold one. */ void ihold(struct inode *inode) { WARN_ON(atomic_inc_return(&inode->i_count) < 2); } EXPORT_SYMBOL(ihold); struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, struct inode *inode, u32 bit) { void *bit_address; bit_address = inode_state_wait_address(inode, bit); init_wait_var_entry(wqe, bit_address, 0); return __var_waitqueue(bit_address); } EXPORT_SYMBOL(inode_bit_waitqueue); void wait_on_new_inode(struct inode *inode) { struct wait_bit_queue_entry wqe; struct wait_queue_head *wq_head; spin_lock(&inode->i_lock); if (!(inode_state_read(inode) & I_NEW)) { spin_unlock(&inode->i_lock); return; } wq_head = inode_bit_waitqueue(&wqe, inode, __I_NEW); for (;;) { prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); if (!(inode_state_read(inode) & I_NEW)) break; spin_unlock(&inode->i_lock); schedule(); spin_lock(&inode->i_lock); } finish_wait(wq_head, &wqe.wq_entry); WARN_ON(inode_state_read(inode) & I_NEW); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(wait_on_new_inode); static void __inode_lru_list_add(struct inode *inode, bool rotate) { lockdep_assert_held(&inode->i_lock); if (inode_state_read(inode) & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) return; if (icount_read(inode)) return; if (!(inode->i_sb->s_flags & SB_ACTIVE)) return; if (!mapping_shrinkable(&inode->i_data)) return; if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_inc(nr_unused); else if (rotate) inode_state_set(inode, I_REFERENCED); } /* * Add inode to LRU if needed (inode is unused and clean). 
*/ void inode_lru_list_add(struct inode *inode) { __inode_lru_list_add(inode, false); } static void inode_lru_list_del(struct inode *inode) { if (list_empty(&inode->i_lru)) return; if (list_lru_del_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_dec(nr_unused); } static void inode_pin_lru_isolating(struct inode *inode) { lockdep_assert_held(&inode->i_lock); WARN_ON(inode_state_read(inode) & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE)); inode_state_set(inode, I_LRU_ISOLATING); } static void inode_unpin_lru_isolating(struct inode *inode) { spin_lock(&inode->i_lock); WARN_ON(!(inode_state_read(inode) & I_LRU_ISOLATING)); inode_state_clear(inode, I_LRU_ISOLATING); /* Called with inode->i_lock which ensures memory ordering. */ inode_wake_up_bit(inode, __I_LRU_ISOLATING); spin_unlock(&inode->i_lock); } static void inode_wait_for_lru_isolating(struct inode *inode) { struct wait_bit_queue_entry wqe; struct wait_queue_head *wq_head; lockdep_assert_held(&inode->i_lock); if (!(inode_state_read(inode) & I_LRU_ISOLATING)) return; wq_head = inode_bit_waitqueue(&wqe, inode, __I_LRU_ISOLATING); for (;;) { prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); /* * Checking I_LRU_ISOLATING with inode->i_lock guarantees * memory ordering. */ if (!(inode_state_read(inode) & I_LRU_ISOLATING)) break; spin_unlock(&inode->i_lock); schedule(); spin_lock(&inode->i_lock); } finish_wait(wq_head, &wqe.wq_entry); WARN_ON(inode_state_read(inode) & I_LRU_ISOLATING); } /** * inode_sb_list_add - add inode to the superblock list of inodes * @inode: inode to add */ void inode_sb_list_add(struct inode *inode) { struct super_block *sb = inode->i_sb; spin_lock(&sb->s_inode_list_lock); list_add(&inode->i_sb_list, &sb->s_inodes); spin_unlock(&sb->s_inode_list_lock); } EXPORT_SYMBOL_GPL(inode_sb_list_add); static inline void inode_sb_list_del(struct inode *inode) { struct super_block *sb = inode->i_sb; if (!list_empty(&inode->i_sb_list)) { spin_lock(&sb->s_inode_list_lock); list_del_init(&inode->i_sb_list); spin_unlock(&sb->s_inode_list_lock); } } static unsigned long hash(struct super_block *sb, u64 hashval) { unsigned long tmp; tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / L1_CACHE_BYTES; tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift); return tmp & i_hash_mask; } /** * __insert_inode_hash - hash an inode * @inode: unhashed inode * @hashval: u64 value used to locate this object in the * inode_hashtable. * * Add an inode to the inode hash for this superblock. */ void __insert_inode_hash(struct inode *inode, u64 hashval) { struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); spin_lock(&inode_hash_lock); spin_lock(&inode->i_lock); hlist_add_head_rcu(&inode->i_hash, b); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); } EXPORT_SYMBOL(__insert_inode_hash); /** * __remove_inode_hash - remove an inode from the hash * @inode: inode to unhash * * Remove an inode from the superblock. 
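 *
 * Example (editor's sketch, not part of the original file): a filesystem
 * that builds inodes by hand rather than through iget_locked() can
 * publish them with insert_inode_hash() (the fs.h wrapper around
 * __insert_inode_hash() keyed on i_ino) and, once the backing object is
 * gone, unhash them so later lookups cannot find them. Error handling is
 * omitted and "on_disk_ino" is an illustrative placeholder.
 *
 *      struct inode *inode = new_inode(sb);
 *
 *      inode->i_ino = on_disk_ino;
 *      insert_inode_hash(inode);
 *
 * and later, when the backing object is deleted:
 *
 *      remove_inode_hash(inode);
 *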
*/ void __remove_inode_hash(struct inode *inode) { spin_lock(&inode_hash_lock); spin_lock(&inode->i_lock); hlist_del_init_rcu(&inode->i_hash); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); } EXPORT_SYMBOL(__remove_inode_hash); void dump_mapping(const struct address_space *mapping) { struct inode *host; const struct address_space_operations *a_ops; struct hlist_node *dentry_first; struct dentry *dentry_ptr; struct dentry dentry; char fname[64] = {}; u64 ino; /* * If mapping is an invalid pointer, we don't want to crash * accessing it, so probe everything depending on it carefully. */ if (get_kernel_nofault(host, &mapping->host) || get_kernel_nofault(a_ops, &mapping->a_ops)) { pr_warn("invalid mapping:%px\n", mapping); return; } if (!host) { pr_warn("aops:%ps\n", a_ops); return; } if (get_kernel_nofault(dentry_first, &host->i_dentry.first) || get_kernel_nofault(ino, &host->i_ino)) { pr_warn("aops:%ps invalid inode:%px\n", a_ops, host); return; } if (!dentry_first) { pr_warn("aops:%ps ino:%llx\n", a_ops, ino); return; } dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); if (get_kernel_nofault(dentry, dentry_ptr) || !dentry.d_parent || !dentry.d_name.name) { pr_warn("aops:%ps ino:%llx invalid dentry:%px\n", a_ops, ino, dentry_ptr); return; } if (strncpy_from_kernel_nofault(fname, dentry.d_name.name, 63) < 0) strscpy(fname, "<invalid>"); /* * Even if strncpy_from_kernel_nofault() succeeded, * the fname could be unreliable */ pr_warn("aops:%ps ino:%llx dentry name(?):\"%s\"\n", a_ops, ino, fname); } void clear_inode(struct inode *inode) { /* * Only IS_VERITY() inodes can have verity info, so start by checking * for IS_VERITY() (which is faster than retrieving the pointer to the * verity info). This minimizes overhead for non-verity inodes. */ if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) fsverity_cleanup_inode(inode); /* * We have to cycle the i_pages lock here because reclaim can be in the * process of removing the last page (in __filemap_remove_folio()) * and we must not free the mapping under it. */ xa_lock_irq(&inode->i_data.i_pages); BUG_ON(inode->i_data.nrpages); /* * Almost always, mapping_empty(&inode->i_data) here; but there are * two known and long-standing ways in which nodes may get left behind * (when deep radix-tree node allocation failed partway; or when THP * collapse_file() failed). Until those two known cases are cleaned up, * or a cleanup function is called here, do not BUG_ON(!mapping_empty), * nor even WARN_ON(!mapping_empty). */ xa_unlock_irq(&inode->i_data.i_pages); BUG_ON(!(inode_state_read_once(inode) & I_FREEING)); BUG_ON(inode_state_read_once(inode) & I_CLEAR); BUG_ON(!list_empty(&inode->i_wb_list)); /* don't need i_lock here, no concurrent mods to i_state */ inode_state_assign_raw(inode, I_FREEING | I_CLEAR); } EXPORT_SYMBOL(clear_inode); /* * Free the inode passed in, removing it from the lists it is still connected * to. We remove any pages still attached to the inode and wait for any IO that * is still in progress before finally destroying the inode. * * An inode must already be marked I_FREEING so that we avoid the inode being * moved back onto lists if we race with other code that manipulates the lists * (e.g. writeback_single_inode). The caller is responsible for setting this. * * An inode must already be removed from the LRU list before being evicted from * the cache. This should occur atomically with setting the I_FREEING state * flag, so no inodes here should ever be on the LRU when being evicted. 
*/ static void evict(struct inode *inode) { const struct super_operations *op = inode->i_sb->s_op; BUG_ON(!(inode_state_read_once(inode) & I_FREEING)); BUG_ON(!list_empty(&inode->i_lru)); inode_io_list_del(inode); inode_sb_list_del(inode); spin_lock(&inode->i_lock); inode_wait_for_lru_isolating(inode); /* * Wait for flusher thread to be done with the inode so that filesystem * does not start destroying it while writeback is still running. Since * the inode has I_FREEING set, flusher thread won't start new work on * the inode. We just have to wait for running writeback to finish. */ inode_wait_for_writeback(inode); spin_unlock(&inode->i_lock); if (op->evict_inode) { op->evict_inode(inode); } else { truncate_inode_pages_final(&inode->i_data); clear_inode(inode); } if (S_ISCHR(inode->i_mode) && inode->i_cdev) cd_forget(inode); remove_inode_hash(inode); /* * Wake up waiters in __wait_on_freeing_inode(). * * It is an invariant that any thread we need to wake up is already * accounted for before remove_inode_hash() acquires ->i_lock -- both * sides take the lock and sleep is aborted if the inode is found * unhashed. Thus either the sleeper wins and goes off CPU, or removal * wins and the sleeper aborts after testing with the lock. * * This also means we don't need any fences for the call below. */ inode_wake_up_bit(inode, __I_NEW); BUG_ON(inode_state_read_once(inode) != (I_FREEING | I_CLEAR)); destroy_inode(inode); } /* * dispose_list - dispose of the contents of a local list * @head: the head of the list to free * * Dispose-list gets a local list with local inodes in it, so it doesn't * need to worry about list corruption and SMP locks. */ static void dispose_list(struct list_head *head) { while (!list_empty(head)) { struct inode *inode; inode = list_first_entry(head, struct inode, i_lru); list_del_init(&inode->i_lru); evict(inode); cond_resched(); } } /** * evict_inodes - evict all evictable inodes for a superblock * @sb: superblock to operate on * * Make sure that no inodes with zero refcount are retained. This is * called by superblock shutdown after having SB_ACTIVE flag removed, * so any inode reaching zero refcount during or after that call will * be immediately evicted. */ void evict_inodes(struct super_block *sb) { struct inode *inode; LIST_HEAD(dispose); again: spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { if (icount_read(inode)) continue; spin_lock(&inode->i_lock); if (icount_read(inode)) { spin_unlock(&inode->i_lock); continue; } if (inode_state_read(inode) & (I_NEW | I_FREEING | I_WILL_FREE)) { spin_unlock(&inode->i_lock); continue; } inode_state_set(inode, I_FREEING); inode_lru_list_del(inode); spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); /* * We can have a ton of inodes to evict at unmount time given * enough memory, check to see if we need to go to sleep for a * bit so we don't livelock. */ if (need_resched()) { spin_unlock(&sb->s_inode_list_lock); cond_resched(); dispose_list(&dispose); goto again; } } spin_unlock(&sb->s_inode_list_lock); dispose_list(&dispose); } EXPORT_SYMBOL_GPL(evict_inodes); /* * Isolate the inode from the LRU in preparation for freeing it. * * If the inode has the I_REFERENCED flag set, then it means that it has been * used recently - the flag is set in iput_final(). When we encounter such an * inode, clear the flag and move it to the back of the LRU so it gets another * pass through the LRU before it gets reclaimed. 
This is necessary because of * the fact we are doing lazy LRU updates to minimise lock contention so the * LRU does not have strict ordering. Hence we don't want to reclaim inodes * with this flag set because they are the inodes that are out of order. */ static enum lru_status inode_lru_isolate(struct list_head *item, struct list_lru_one *lru, void *arg) { struct list_head *freeable = arg; struct inode *inode = container_of(item, struct inode, i_lru); /* * We are inverting the lru lock/inode->i_lock here, so use a * trylock. If we fail to get the lock, just skip it. */ if (!spin_trylock(&inode->i_lock)) return LRU_SKIP; /* * Inodes can get referenced, redirtied, or repopulated while * they're already on the LRU, and this can make them * unreclaimable for a while. Remove them lazily here; iput, * sync, or the last page cache deletion will requeue them. */ if (icount_read(inode) || (inode_state_read(inode) & ~I_REFERENCED) || !mapping_shrinkable(&inode->i_data)) { list_lru_isolate(lru, &inode->i_lru); spin_unlock(&inode->i_lock); this_cpu_dec(nr_unused); return LRU_REMOVED; } /* Recently referenced inodes get one more pass */ if (inode_state_read(inode) & I_REFERENCED) { inode_state_clear(inode, I_REFERENCED); spin_unlock(&inode->i_lock); return LRU_ROTATE; } /* * On highmem systems, mapping_shrinkable() permits dropping * page cache in order to free up struct inodes: lowmem might * be under pressure before the cache inside the highmem zone. */ if (!mapping_empty(&inode->i_data)) { unsigned long reap; inode_pin_lru_isolating(inode); spin_unlock(&inode->i_lock); spin_unlock(&lru->lock); reap = invalidate_mapping_pages(&inode->i_data, 0, -1); if (current_is_kswapd()) __count_vm_events(KSWAPD_INODESTEAL, reap); else __count_vm_events(PGINODESTEAL, reap); mm_account_reclaimed_pages(reap); inode_unpin_lru_isolating(inode); return LRU_RETRY; } WARN_ON(inode_state_read(inode) & I_NEW); inode_state_set(inode, I_FREEING); list_lru_isolate_move(lru, &inode->i_lru, freeable); spin_unlock(&inode->i_lock); this_cpu_dec(nr_unused); return LRU_REMOVED; } /* * Walk the superblock inode LRU for freeable inodes and attempt to free them. * This is called from the superblock shrinker function with a number of inodes * to trim from the LRU. Inodes to be freed are moved to a temporary list and * then are freed outside inode_lock by dispose_list(). */ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) { LIST_HEAD(freeable); long freed; freed = list_lru_shrink_walk(&sb->s_inode_lru, sc, inode_lru_isolate, &freeable); dispose_list(&freeable); return freed; } static void __wait_on_freeing_inode(struct inode *inode, bool hash_locked, bool rcu_locked); /* * Called with the inode lock held. 
*/ static struct inode *find_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data, bool hash_locked, bool *isnew) { struct inode *inode = NULL; if (hash_locked) lockdep_assert_held(&inode_hash_lock); else lockdep_assert_not_held(&inode_hash_lock); rcu_read_lock(); repeat: hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_sb != sb) continue; if (!test(inode, data)) continue; spin_lock(&inode->i_lock); if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) { __wait_on_freeing_inode(inode, hash_locked, true); goto repeat; } if (unlikely(inode_state_read(inode) & I_CREATING)) { spin_unlock(&inode->i_lock); rcu_read_unlock(); return ERR_PTR(-ESTALE); } __iget(inode); *isnew = !!(inode_state_read(inode) & I_NEW); spin_unlock(&inode->i_lock); rcu_read_unlock(); return inode; } rcu_read_unlock(); return NULL; } /* * find_inode_fast is the fast path version of find_inode, see the comment at * iget_locked for details. */ static struct inode *find_inode_fast(struct super_block *sb, struct hlist_head *head, u64 ino, bool hash_locked, bool *isnew) { struct inode *inode = NULL; if (hash_locked) lockdep_assert_held(&inode_hash_lock); else lockdep_assert_not_held(&inode_hash_lock); rcu_read_lock(); repeat: hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_ino != ino) continue; if (inode->i_sb != sb) continue; spin_lock(&inode->i_lock); if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) { __wait_on_freeing_inode(inode, hash_locked, true); goto repeat; } if (unlikely(inode_state_read(inode) & I_CREATING)) { spin_unlock(&inode->i_lock); rcu_read_unlock(); return ERR_PTR(-ESTALE); } __iget(inode); *isnew = !!(inode_state_read(inode) & I_NEW); spin_unlock(&inode->i_lock); rcu_read_unlock(); return inode; } rcu_read_unlock(); return NULL; } /* * Each cpu owns a range of LAST_INO_BATCH numbers. * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations, * to renew the exhausted range. * * This does not significantly increase overflow rate because every CPU can * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the * 2^32 range, and is a worst-case. Even a 50% wastage would only increase * overflow rate by 2x, which does not seem too significant. * * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW * error if st_ino won't fit in target struct field. Use 32bit counter * here to attempt to avoid that. */ #define LAST_INO_BATCH 1024 static DEFINE_PER_CPU(unsigned int, last_ino); unsigned int get_next_ino(void) { unsigned int *p = &get_cpu_var(last_ino); unsigned int res = *p; #ifdef CONFIG_SMP if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) { static atomic_t shared_last_ino; int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino); res = next - LAST_INO_BATCH; } #endif res++; /* get_next_ino should not provide a 0 inode number */ if (unlikely(!res)) res++; *p = res; put_cpu_var(last_ino); return res; } EXPORT_SYMBOL(get_next_ino); /** * new_inode - obtain an inode * @sb: superblock * * Allocates a new inode for given superblock. The default gfp_mask * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE. 
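 *
 * Example (editor's sketch, not part of the original file): an in-memory
 * filesystem creating a fresh inode, modelled on ramfs_get_inode().
 * "foofs" is hypothetical and most initialisation is omitted.
 *
 *      struct inode *foofs_get_inode(struct super_block *sb, umode_t mode)
 *      {
 *              struct inode *inode = new_inode(sb);
 *
 *              if (inode) {
 *                      inode->i_ino = get_next_ino();
 *                      inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
 *                      simple_inode_init_ts(inode);
 *              }
 *              return inode;
 *      }
 *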
* If HIGHMEM pages are unsuitable or it is known that pages allocated * for the page cache are not reclaimable or migratable, * mapping_set_gfp_mask() must be called with suitable flags on the * newly created inode's mapping * */ struct inode *new_inode(struct super_block *sb) { struct inode *inode; inode = alloc_inode(sb); if (inode) inode_sb_list_add(inode); return inode; } EXPORT_SYMBOL(new_inode); #ifdef CONFIG_DEBUG_LOCK_ALLOC void lockdep_annotate_inode_mutex_key(struct inode *inode) { if (S_ISDIR(inode->i_mode)) { struct file_system_type *type = inode->i_sb->s_type; /* Set new key only if filesystem hasn't already changed it */ if (lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key)) { /* * ensure nobody is actually holding i_rwsem */ init_rwsem(&inode->i_rwsem); lockdep_set_class(&inode->i_rwsem, &type->i_mutex_dir_key); } } } EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key); #endif /** * unlock_new_inode - clear the I_NEW state and wake up any waiters * @inode: new inode to unlock * * Called when the inode is fully initialised to clear the new state of the * inode and wake up anyone waiting for the inode to finish initialisation. */ void unlock_new_inode(struct inode *inode) { lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); WARN_ON(!(inode_state_read(inode) & I_NEW)); inode_state_clear(inode, I_NEW | I_CREATING); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(unlock_new_inode); void discard_new_inode(struct inode *inode) { lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); WARN_ON(!(inode_state_read(inode) & I_NEW)); inode_state_clear(inode, I_NEW); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); iput(inode); } EXPORT_SYMBOL(discard_new_inode); /** * lock_two_nondirectories - take two i_mutexes on non-directory objects * * Lock any non-NULL argument. Passed objects must not be directories. * Zero, one or two objects may be locked by this function. * * @inode1: first inode to lock * @inode2: second inode to lock */ void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) { if (inode1) WARN_ON_ONCE(S_ISDIR(inode1->i_mode)); if (inode2) WARN_ON_ONCE(S_ISDIR(inode2->i_mode)); if (inode1 > inode2) swap(inode1, inode2); if (inode1) inode_lock(inode1); if (inode2 && inode2 != inode1) inode_lock_nested(inode2, I_MUTEX_NONDIR2); } EXPORT_SYMBOL(lock_two_nondirectories); /** * unlock_two_nondirectories - release locks from lock_two_nondirectories() * @inode1: first inode to unlock * @inode2: second inode to unlock */ void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2) { if (inode1) { WARN_ON_ONCE(S_ISDIR(inode1->i_mode)); inode_unlock(inode1); } if (inode2 && inode2 != inode1) { WARN_ON_ONCE(S_ISDIR(inode2->i_mode)); inode_unlock(inode2); } } EXPORT_SYMBOL(unlock_two_nondirectories); /** * inode_insert5 - obtain an inode from a mounted file system * @inode: pre-allocated inode to use for insert to cache * @hashval: hash value (usually inode number) to get * @test: callback used for comparisons between inodes * @set: callback used to initialize a new struct inode * @data: opaque data pointer to pass to @test and @set * @isnew: pointer to a bool which will indicate whether I_NEW is set * * Search for the inode specified by @hashval and @data in the inode cache, * and if present return it with an increased reference count. This is a * variant of iget5_locked() that doesn't allocate an inode. 
* * If the inode is not present in the cache, insert the pre-allocated inode and * return it locked, hashed, and with the I_NEW flag set. The file system gets * to fill it in before unlocking it via unlock_new_inode(). * * Note that both @test and @set are called with the inode_hash_lock held, so * they can't sleep. */ struct inode *inode_insert5(struct inode *inode, u64 hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); struct inode *old; bool isnew; might_sleep(); again: spin_lock(&inode_hash_lock); old = find_inode(inode->i_sb, head, test, data, true, &isnew); if (unlikely(old)) { /* * Uhhuh, somebody else created the same inode under us. * Use the old inode instead of the preallocated one. */ spin_unlock(&inode_hash_lock); if (IS_ERR(old)) return NULL; if (unlikely(isnew)) wait_on_new_inode(old); if (unlikely(inode_unhashed(old))) { iput(old); goto again; } return old; } if (set && unlikely(set(inode, data))) { spin_unlock(&inode_hash_lock); return NULL; } /* * Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents */ spin_lock(&inode->i_lock); inode_state_set(inode, I_NEW); hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); /* * Add inode to the sb list if it's not already. It has I_NEW at this * point, so it should be safe to test i_sb_list locklessly. */ if (list_empty(&inode->i_sb_list)) inode_sb_list_add(inode); return inode; } EXPORT_SYMBOL(inode_insert5); /** * iget5_locked - obtain an inode from a mounted file system * @sb: super block of file system * @hashval: hash value (usually inode number) to get * @test: callback used for comparisons between inodes * @set: callback used to initialize a new struct inode * @data: opaque data pointer to pass to @test and @set * * Search for the inode specified by @hashval and @data in the inode cache, * and if present return it with an increased reference count. This is a * generalized version of iget_locked() for file systems where the inode * number is not sufficient for unique identification of an inode. * * If the inode is not present in the cache, allocate and insert a new inode * and return it locked, hashed, and with the I_NEW flag set. The file system * gets to fill it in before unlocking it via unlock_new_inode(). * * Note that both @test and @set are called with the inode_hash_lock held, so * they can't sleep. */ struct inode *iget5_locked(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { struct inode *inode = ilookup5(sb, hashval, test, data); if (!inode) { struct inode *new = alloc_inode(sb); if (new) { inode = inode_insert5(new, hashval, test, set, data); if (unlikely(inode != new)) destroy_inode(new); } } return inode; } EXPORT_SYMBOL(iget5_locked); /** * iget5_locked_rcu - obtain an inode from a mounted file system * @sb: super block of file system * @hashval: hash value (usually inode number) to get * @test: callback used for comparisons between inodes * @set: callback used to initialize a new struct inode * @data: opaque data pointer to pass to @test and @set * * This is equivalent to iget5_locked, except the @test callback must * tolerate the inode not being stable, including being mid-teardown. 
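 *
 * Example (editor's sketch, not part of the original file): the usual
 * calling pattern for the iget5_locked() family, keyed on a 64-bit
 * object id rather than i_ino. "foofs" and FOOFS_I() are hypothetical;
 * the I_NEW handling follows the pattern commonly used by filesystems.
 *
 *      static int foofs_iget_test(struct inode *inode, void *data)
 *      {
 *              u64 *object_id = data;
 *
 *              return FOOFS_I(inode)->object_id == *object_id;
 *      }
 *
 *      static int foofs_iget_set(struct inode *inode, void *data)
 *      {
 *              u64 *object_id = data;
 *
 *              FOOFS_I(inode)->object_id = *object_id;
 *              inode->i_ino = *object_id;
 *              return 0;
 *      }
 *
 *      struct inode *foofs_iget(struct super_block *sb, u64 object_id)
 *      {
 *              struct inode *inode;
 *
 *              inode = iget5_locked(sb, object_id, foofs_iget_test,
 *                                   foofs_iget_set, &object_id);
 *              if (!inode)
 *                      return ERR_PTR(-ENOMEM);
 *              if (!(inode->i_state & I_NEW))
 *                      return inode;
 *
 *              (read the on-disk object and fill in the new inode here)
 *
 *              unlock_new_inode(inode);
 *              return inode;
 *      }
 *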
*/ struct inode *iget5_locked_rcu(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode, *new; bool isnew; might_sleep(); again: inode = find_inode(sb, head, test, data, false, &isnew); if (inode) { if (IS_ERR(inode)) return NULL; if (unlikely(isnew)) wait_on_new_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; } return inode; } new = alloc_inode(sb); if (new) { inode = inode_insert5(new, hashval, test, set, data); if (unlikely(inode != new)) destroy_inode(new); } return inode; } EXPORT_SYMBOL_GPL(iget5_locked_rcu); /** * iget_locked - obtain an inode from a mounted file system * @sb: super block of file system * @ino: inode number to get * * Search for the inode specified by @ino in the inode cache and if present * return it with an increased reference count. This is for file systems * where the inode number is sufficient for unique identification of an inode. * * If the inode is not in cache, allocate a new inode and return it locked, * hashed, and with the I_NEW flag set. The file system gets to fill it in * before unlocking it via unlock_new_inode(). */ struct inode *iget_locked(struct super_block *sb, u64 ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; bool isnew; might_sleep(); again: inode = find_inode_fast(sb, head, ino, false, &isnew); if (inode) { if (IS_ERR(inode)) return NULL; if (unlikely(isnew)) wait_on_new_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; } return inode; } inode = alloc_inode(sb); if (inode) { struct inode *old; spin_lock(&inode_hash_lock); /* We released the lock, so.. */ old = find_inode_fast(sb, head, ino, true, &isnew); if (!old) { inode->i_ino = ino; spin_lock(&inode->i_lock); inode_state_assign(inode, I_NEW); hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); inode_sb_list_add(inode); /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents */ return inode; } /* * Uhhuh, somebody else created the same inode under * us. Use the old inode instead of the one we just * allocated. */ spin_unlock(&inode_hash_lock); destroy_inode(inode); if (IS_ERR(old)) return NULL; inode = old; if (unlikely(isnew)) wait_on_new_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; } } return inode; } EXPORT_SYMBOL(iget_locked); /* * search the inode cache for a matching inode number. * If we find one, then the inode number we are trying to * allocate is not unique and so we should not use it. * * Returns 1 if the inode number is unique, 0 if it is not. */ static int test_inode_iunique(struct super_block *sb, u64 ino) { struct hlist_head *b = inode_hashtable + hash(sb, ino); struct inode *inode; hlist_for_each_entry_rcu(inode, b, i_hash) { if (inode->i_ino == ino && inode->i_sb == sb) return 0; } return 1; } /** * iunique - get a unique inode number * @sb: superblock * @max_reserved: highest reserved inode number * * Obtain an inode number that is unique on the system for a given * superblock. This is used by file systems that have no natural * permanent inode numbering system. An inode number is returned that * is higher than the reserved limit but unique. * * BUGS: * With a large number of inodes live on the file system this function * currently becomes quite slow. 
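 *
 * Example (editor's sketch, not part of the original file): a filesystem
 * that fabricates objects at runtime can pick inode numbers above its
 * reserved range when creating them. FOOFS_MAX_RESERVED_INO is
 * hypothetical.
 *
 *      struct inode *inode = new_inode(sb);
 *
 *      if (inode)
 *              inode->i_ino = iunique(sb, FOOFS_MAX_RESERVED_INO);
 *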
*/ ino_t iunique(struct super_block *sb, ino_t max_reserved) { /* * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW * error if st_ino won't fit in target struct field. Use 32bit counter * here to attempt to avoid that. */ static DEFINE_SPINLOCK(iunique_lock); static unsigned int counter; ino_t res; rcu_read_lock(); spin_lock(&iunique_lock); do { if (counter <= max_reserved) counter = max_reserved + 1; res = counter++; } while (!test_inode_iunique(sb, res)); spin_unlock(&iunique_lock); rcu_read_unlock(); return res; } EXPORT_SYMBOL(iunique); struct inode *igrab(struct inode *inode) { spin_lock(&inode->i_lock); if (!(inode_state_read(inode) & (I_FREEING | I_WILL_FREE))) { __iget(inode); spin_unlock(&inode->i_lock); } else { spin_unlock(&inode->i_lock); /* * Handle the case where s_op->clear_inode is not been * called yet, and somebody is calling igrab * while the inode is getting freed. */ inode = NULL; } return inode; } EXPORT_SYMBOL(igrab); /** * ilookup5_nowait - search for an inode in the inode cache * @sb: super block of file system to search * @hashval: hash value (usually inode number) to search for * @test: callback used for comparisons between inodes * @data: opaque data pointer to pass to @test * @isnew: return argument telling whether I_NEW was set when * the inode was found in hash (the caller needs to * wait for I_NEW to clear) * * Search for the inode specified by @hashval and @data in the inode cache. * If the inode is in the cache, the inode is returned with an incremented * reference count. * * Note: I_NEW is not waited upon so you have to be very careful what you do * with the returned inode. You probably should be using ilookup5() instead. * * Note2: @test is called with the inode_hash_lock held, so can't sleep. */ struct inode *ilookup5_nowait(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), void *data, bool *isnew) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode; spin_lock(&inode_hash_lock); inode = find_inode(sb, head, test, data, true, isnew); spin_unlock(&inode_hash_lock); return IS_ERR(inode) ? NULL : inode; } EXPORT_SYMBOL(ilookup5_nowait); /** * ilookup5 - search for an inode in the inode cache * @sb: super block of file system to search * @hashval: hash value (usually inode number) to search for * @test: callback used for comparisons between inodes * @data: opaque data pointer to pass to @test * * Search for the inode specified by @hashval and @data in the inode cache, * and if the inode is in the cache, return the inode with an incremented * reference count. Waits on I_NEW before returning the inode. * returned with an incremented reference count. * * This is a generalized version of ilookup() for file systems where the * inode number is not sufficient for unique identification of an inode. * * Note: @test is called with the inode_hash_lock held, so can't sleep. 
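 *
 * Example (editor's sketch, not part of the original file): probing the
 * inode cache for an already-instantiated object without creating one,
 * reusing the hypothetical foofs_iget_test() callback sketched earlier;
 * foofs_invalidate_object() stands in for whatever the caller wants to
 * do with the cached inode.
 *
 *      struct inode *inode;
 *
 *      inode = ilookup5(sb, object_id, foofs_iget_test, &object_id);
 *      if (inode) {
 *              foofs_invalidate_object(inode);
 *              iput(inode);
 *      }
 *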
*/ struct inode *ilookup5(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), void *data) { struct inode *inode; bool isnew; might_sleep(); again: inode = ilookup5_nowait(sb, hashval, test, data, &isnew); if (inode) { if (unlikely(isnew)) wait_on_new_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; } } return inode; } EXPORT_SYMBOL(ilookup5); /** * ilookup - search for an inode in the inode cache * @sb: super block of file system to search * @ino: inode number to search for * * Search for the inode @ino in the inode cache, and if the inode is in the * cache, the inode is returned with an incremented reference count. */ struct inode *ilookup(struct super_block *sb, u64 ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; bool isnew; might_sleep(); again: inode = find_inode_fast(sb, head, ino, false, &isnew); if (inode) { if (IS_ERR(inode)) return NULL; if (unlikely(isnew)) wait_on_new_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; } } return inode; } EXPORT_SYMBOL(ilookup); /** * find_inode_nowait - find an inode in the inode cache * @sb: super block of file system to search * @hashval: hash value (usually inode number) to search for * @match: callback used for comparisons between inodes * @data: opaque data pointer to pass to @match * * Search for the inode specified by @hashval and @data in the inode * cache, where the helper function @match will return 0 if the inode * does not match, 1 if the inode does match, and -1 if the search * should be stopped. The @match function must be responsible for * taking the i_lock spin_lock and checking i_state for an inode being * freed or being initialized, and incrementing the reference count * before returning 1. It also must not sleep, since it is called with * the inode_hash_lock spinlock held. * * This is a even more generalized version of ilookup5() when the * function must never block --- find_inode() can block in * __wait_on_freeing_inode() --- or when the caller can not increment * the reference count because the resulting iput() might cause an * inode eviction. The tradeoff is that the @match funtion must be * very carefully implemented. */ struct inode *find_inode_nowait(struct super_block *sb, u64 hashval, int (*match)(struct inode *, u64, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode, *ret_inode = NULL; int mval; spin_lock(&inode_hash_lock); hlist_for_each_entry(inode, head, i_hash) { if (inode->i_sb != sb) continue; mval = match(inode, hashval, data); if (mval == 0) continue; if (mval == 1) ret_inode = inode; goto out; } out: spin_unlock(&inode_hash_lock); return ret_inode; } EXPORT_SYMBOL(find_inode_nowait); /** * find_inode_rcu - find an inode in the inode cache * @sb: Super block of file system to search * @hashval: Key to hash * @test: Function to test match on an inode * @data: Data for test function * * Search for the inode specified by @hashval and @data in the inode cache, * where the helper function @test will return 0 if the inode does not match * and 1 if it does. The @test function must be responsible for taking the * i_lock spin_lock and checking i_state for an inode being freed or being * initialized. * * If successful, this will return the inode for which the @test function * returned 1 and NULL otherwise. * * The @test function is not permitted to take a ref on any inode presented. * It is also not permitted to sleep. 
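 *
 * Example (editor's sketch, not part of the original file): an RCU-side
 * lookup that takes a proper reference with igrab() before the result is
 * used outside the read-side critical section; foofs_test() is a
 * hypothetical non-sleeping @test callback, and igrab() returns NULL if
 * the inode is already being freed.
 *
 *      rcu_read_lock();
 *      inode = find_inode_rcu(sb, hashval, foofs_test, &key);
 *      if (inode)
 *              inode = igrab(inode);
 *      rcu_read_unlock();
 *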
* * The caller must hold the RCU read lock. */ struct inode *find_inode_rcu(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious find_inode_rcu() usage"); hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_sb == sb && !(inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE)) && test(inode, data)) return inode; } return NULL; } EXPORT_SYMBOL(find_inode_rcu); /** * find_inode_by_ino_rcu - Find an inode in the inode cache * @sb: Super block of file system to search * @ino: The inode number to match * * Search for the inode specified by @hashval and @data in the inode cache, * where the helper function @test will return 0 if the inode does not match * and 1 if it does. The @test function must be responsible for taking the * i_lock spin_lock and checking i_state for an inode being freed or being * initialized. * * If successful, this will return the inode for which the @test function * returned 1 and NULL otherwise. * * The @test function is not permitted to take a ref on any inode presented. * It is also not permitted to sleep. * * The caller must hold the RCU read lock. */ struct inode *find_inode_by_ino_rcu(struct super_block *sb, u64 ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious find_inode_by_ino_rcu() usage"); hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_ino == ino && inode->i_sb == sb && !(inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE))) return inode; } return NULL; } EXPORT_SYMBOL(find_inode_by_ino_rcu); int insert_inode_locked(struct inode *inode) { struct super_block *sb = inode->i_sb; u64 ino = inode->i_ino; struct hlist_head *head = inode_hashtable + hash(sb, ino); bool isnew; might_sleep(); while (1) { struct inode *old = NULL; spin_lock(&inode_hash_lock); repeat: hlist_for_each_entry(old, head, i_hash) { if (old->i_ino != ino) continue; if (old->i_sb != sb) continue; spin_lock(&old->i_lock); break; } if (likely(!old)) { spin_lock(&inode->i_lock); inode_state_set(inode, I_NEW | I_CREATING); hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); return 0; } if (inode_state_read(old) & (I_FREEING | I_WILL_FREE)) { __wait_on_freeing_inode(old, true, false); old = NULL; goto repeat; } if (unlikely(inode_state_read(old) & I_CREATING)) { spin_unlock(&old->i_lock); spin_unlock(&inode_hash_lock); return -EBUSY; } __iget(old); isnew = !!(inode_state_read(old) & I_NEW); spin_unlock(&old->i_lock); spin_unlock(&inode_hash_lock); if (isnew) wait_on_new_inode(old); if (unlikely(!inode_unhashed(old))) { iput(old); return -EBUSY; } iput(old); } } EXPORT_SYMBOL(insert_inode_locked); int insert_inode_locked4(struct inode *inode, u64 hashval, int (*test)(struct inode *, void *), void *data) { struct inode *old; might_sleep(); inode_state_set_raw(inode, I_CREATING); old = inode_insert5(inode, hashval, test, NULL, data); if (old != inode) { iput(old); return -EBUSY; } return 0; } EXPORT_SYMBOL(insert_inode_locked4); int inode_just_drop(struct inode *inode) { return 1; } EXPORT_SYMBOL(inode_just_drop); /* * Called when we're dropping the last reference * to an inode. * * Call the FS "drop_inode()" function, defaulting to * the legacy UNIX filesystem behaviour. If it tells * us to evict inode, do so. 
Otherwise, retain inode * in cache if fs is alive, sync and evict if fs is * shutting down. */ static void iput_final(struct inode *inode) { struct super_block *sb = inode->i_sb; const struct super_operations *op = inode->i_sb->s_op; int drop; WARN_ON(inode_state_read(inode) & I_NEW); VFS_BUG_ON_INODE(atomic_read(&inode->i_count) != 0, inode); if (op->drop_inode) drop = op->drop_inode(inode); else drop = inode_generic_drop(inode); if (!drop && !(inode_state_read(inode) & I_DONTCACHE) && (sb->s_flags & SB_ACTIVE)) { __inode_lru_list_add(inode, true); spin_unlock(&inode->i_lock); return; } /* * Re-check ->i_count in case the ->drop_inode() hooks played games. * Note we only execute this if the verdict was to drop the inode. */ VFS_BUG_ON_INODE(atomic_read(&inode->i_count) != 0, inode); if (drop) { inode_state_set(inode, I_FREEING); } else { inode_state_set(inode, I_WILL_FREE); spin_unlock(&inode->i_lock); write_inode_now(inode, 1); spin_lock(&inode->i_lock); WARN_ON(inode_state_read(inode) & I_NEW); inode_state_replace(inode, I_WILL_FREE, I_FREEING); } inode_lru_list_del(inode); spin_unlock(&inode->i_lock); evict(inode); } /** * iput - put an inode * @inode: inode to put * * Puts an inode, dropping its usage count. If the inode use count hits * zero, the inode is then freed and may also be destroyed. * * Consequently, iput() can sleep. */ void iput(struct inode *inode) { might_sleep(); if (unlikely(!inode)) return; retry: lockdep_assert_not_held(&inode->i_lock); VFS_BUG_ON_INODE(inode_state_read_once(inode) & (I_FREEING | I_CLEAR), inode); /* * Note this assert is technically racy as if the count is bogusly * equal to one, then two CPUs racing to further drop it can both * conclude it's fine. */ VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 1, inode); if (atomic_add_unless(&inode->i_count, -1, 1)) return; if (inode->i_nlink && sync_lazytime(inode)) goto retry; spin_lock(&inode->i_lock); if (unlikely((inode_state_read(inode) & I_DIRTY_TIME) && inode->i_nlink)) { spin_unlock(&inode->i_lock); goto retry; } if (!atomic_dec_and_test(&inode->i_count)) { spin_unlock(&inode->i_lock); return; } /* * iput_final() drops ->i_lock, we can't assert on it as the inode may * be deallocated by the time the call returns. */ iput_final(inode); } EXPORT_SYMBOL(iput); /** * iput_not_last - put an inode assuming this is not the last reference * @inode: inode to put */ void iput_not_last(struct inode *inode) { VFS_BUG_ON_INODE(inode_state_read_once(inode) & (I_FREEING | I_CLEAR), inode); VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 2, inode); WARN_ON(atomic_sub_return(1, &inode->i_count) == 0); } EXPORT_SYMBOL(iput_not_last); #ifdef CONFIG_BLOCK /** * bmap - find a block number in a file * @inode: inode owning the block number being requested * @block: pointer containing the block to find * * Replaces the value in ``*block`` with the block number on the device holding * corresponding to the requested block number in the file. * That is, asked for block 4 of inode 1 the function will replace the * 4 in ``*block``, with disk block relative to the disk start that holds that * block of the file. * * Returns -EINVAL in case of error, 0 otherwise. If mapping falls into a * hole, returns 0 and ``*block`` is also set to 0. 
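 *
 * Example (editor's sketch, not part of the original file): mapping the
 * first block of a regular file to its on-disk block number, in the
 * style of the FIBMAP ioctl.
 *
 *      sector_t block = 0;
 *      int error;
 *
 *      error = bmap(inode, &block);
 *
 * On success, "block" holds the block number relative to the start of
 * the device; a value of 0 means the file block sits in a hole.
 *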
*/ int bmap(struct inode *inode, sector_t *block) { if (!inode->i_mapping->a_ops->bmap) return -EINVAL; *block = inode->i_mapping->a_ops->bmap(inode->i_mapping, *block); return 0; } EXPORT_SYMBOL(bmap); #endif /* * With relative atime, only update atime if the previous atime is * earlier than or equal to either the ctime or mtime, * or if at least a day has passed since the last atime update. */ static bool relatime_need_update(struct vfsmount *mnt, struct inode *inode, struct timespec64 now) { struct timespec64 atime, mtime, ctime; if (!(mnt->mnt_flags & MNT_RELATIME)) return true; /* * Is mtime younger than or equal to atime? If yes, update atime: */ atime = inode_get_atime(inode); mtime = inode_get_mtime(inode); if (timespec64_compare(&mtime, &atime) >= 0) return true; /* * Is ctime younger than or equal to atime? If yes, update atime: */ ctime = inode_get_ctime(inode); if (timespec64_compare(&ctime, &atime) >= 0) return true; /* * Is the previous atime value older than a day? If yes, * update atime: */ if ((long)(now.tv_sec - atime.tv_sec) >= 24*60*60) return true; /* * Good, we can skip the atime update: */ return false; } static int inode_update_atime(struct inode *inode) { struct timespec64 atime = inode_get_atime(inode); struct timespec64 now = current_time(inode); if (timespec64_equal(&now, &atime)) return 0; inode_set_atime_to_ts(inode, now); return inode_time_dirty_flag(inode); } static int inode_update_cmtime(struct inode *inode, unsigned int flags) { struct timespec64 ctime = inode_get_ctime(inode); struct timespec64 mtime = inode_get_mtime(inode); struct timespec64 now = inode_set_ctime_current(inode); unsigned int dirty = 0; bool mtime_changed; mtime_changed = !timespec64_equal(&now, &mtime); if (mtime_changed || !timespec64_equal(&now, &ctime)) dirty = inode_time_dirty_flag(inode); /* * Pure timestamp updates can be recorded in the inode without blocking * by not dirtying the inode. But when the file system requires * i_version updates, the update of i_version can still block. * Error out if we'd actually have to update i_version or don't support * lazytime. */ if (IS_I_VERSION(inode)) { if (flags & IOCB_NOWAIT) { if (!(inode->i_sb->s_flags & SB_LAZYTIME) || inode_iversion_need_inc(inode)) return -EAGAIN; } else { if (inode_maybe_inc_iversion(inode, !!dirty)) dirty |= I_DIRTY_SYNC; } } if (mtime_changed) inode_set_mtime_to_ts(inode, now); return dirty; } /** * inode_update_time - update either atime or c/mtime and i_version on the inode * @inode: inode to be updated * @type: timestamp to be updated * @flags: flags for the update * * Update either atime or c/mtime and version in a inode if needed for a file * access or modification. It is up to the caller to mark the inode dirty * appropriately. * * Returns the positive I_DIRTY_* flags for __mark_inode_dirty() if the inode * needs to be marked dirty, 0 if it did not, or a negative errno if an error * happened. */ int inode_update_time(struct inode *inode, enum fs_update_time type, unsigned int flags) { switch (type) { case FS_UPD_ATIME: return inode_update_atime(inode); case FS_UPD_CMTIME: return inode_update_cmtime(inode, flags); default: WARN_ON_ONCE(1); return -EIO; } } EXPORT_SYMBOL(inode_update_time); /** * generic_update_time - update the timestamps on the inode * @inode: inode to be updated * @type: timestamp to be updated * @flags: flags for the update * * Returns a negative error value on error, else 0. 
*/ int generic_update_time(struct inode *inode, enum fs_update_time type, unsigned int flags) { int dirty; /* * ->dirty_inode is what could make generic timestamp updates block. * Don't support non-blocking timestamp updates here if it is set. * File systems that implement ->dirty_inode but want to support * non-blocking timestamp updates should call inode_update_time * directly. */ if ((flags & IOCB_NOWAIT) && inode->i_sb->s_op->dirty_inode) return -EAGAIN; dirty = inode_update_time(inode, type, flags); if (dirty <= 0) return dirty; __mark_inode_dirty(inode, dirty); return 0; } EXPORT_SYMBOL(generic_update_time); /** * atime_needs_update - update the access time * @path: the &struct path to update * @inode: inode to update * * Update the accessed time on an inode and mark it for writeback. * This function automatically handles read only file systems and media, * as well as the "noatime" flag and inode specific "noatime" markers. */ bool atime_needs_update(const struct path *path, struct inode *inode) { struct vfsmount *mnt = path->mnt; struct timespec64 now, atime; if (inode->i_flags & S_NOATIME) return false; /* Atime updates will likely cause i_uid and i_gid to be written * back improprely if their true value is unknown to the vfs. */ if (HAS_UNMAPPED_ID(mnt_idmap(mnt), inode)) return false; if (IS_NOATIME(inode)) return false; if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)) return false; if (mnt->mnt_flags & MNT_NOATIME) return false; if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) return false; now = current_time(inode); if (!relatime_need_update(mnt, inode, now)) return false; atime = inode_get_atime(inode); if (timespec64_equal(&atime, &now)) return false; return true; } void touch_atime(const struct path *path) { struct vfsmount *mnt = path->mnt; struct inode *inode = d_inode(path->dentry); if (!atime_needs_update(path, inode)) return; if (!sb_start_write_trylock(inode->i_sb)) return; if (mnt_get_write_access(mnt) != 0) goto skip_update; /* * File systems can error out when updating inodes if they need to * allocate new space to modify an inode (such is the case for * Btrfs), but since we touch atime while walking down the path we * really don't care if we failed to update the atime of the file, * so just ignore the return value. * We may also fail on filesystems that have the ability to make parts * of the fs read only, e.g. subvolumes in Btrfs. */ if (inode->i_op->update_time) inode->i_op->update_time(inode, FS_UPD_ATIME, 0); else generic_update_time(inode, FS_UPD_ATIME, 0); mnt_put_write_access(mnt); skip_update: sb_end_write(inode->i_sb); } EXPORT_SYMBOL(touch_atime); /* * Return mask of changes for notify_change() that need to be done as a * response to write or truncate. Return 0 if nothing has to be changed. * Negative value on error (change should be denied). 
*/ int dentry_needs_remove_privs(struct mnt_idmap *idmap, struct dentry *dentry) { struct inode *inode = d_inode(dentry); int mask = 0; int ret; if (IS_NOSEC(inode)) return 0; mask = setattr_should_drop_suidgid(idmap, inode); ret = security_inode_need_killpriv(dentry); if (ret < 0) return ret; if (ret) mask |= ATTR_KILL_PRIV; return mask; } static int __remove_privs(struct mnt_idmap *idmap, struct dentry *dentry, int kill) { struct iattr newattrs; newattrs.ia_valid = ATTR_FORCE | kill; /* * Note we call this on write, so notify_change will not * encounter any conflicting delegations: */ return notify_change(idmap, dentry, &newattrs, NULL); } static int file_remove_privs_flags(struct file *file, unsigned int flags) { struct dentry *dentry = file_dentry(file); struct inode *inode = file_inode(file); int error = 0; int kill; if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode)) return 0; kill = dentry_needs_remove_privs(file_mnt_idmap(file), dentry); if (kill < 0) return kill; if (kill) { if (flags & IOCB_NOWAIT) return -EAGAIN; error = __remove_privs(file_mnt_idmap(file), dentry, kill); } if (!error) inode_has_no_xattr(inode); return error; } /** * file_remove_privs - remove special file privileges (suid, capabilities) * @file: file to remove privileges from * * When file is modified by a write or truncation ensure that special * file privileges are removed. * * Return: 0 on success, negative errno on failure. */ int file_remove_privs(struct file *file) { return file_remove_privs_flags(file, 0); } EXPORT_SYMBOL(file_remove_privs); /** * current_time - Return FS time (possibly fine-grained) * @inode: inode. * * Return the current time truncated to the time granularity supported by * the fs, as suitable for a ctime/mtime change. If the ctime is flagged * as having been QUERIED, get a fine-grained timestamp, but don't update * the floor. * * For a multigrain inode, this is effectively an estimate of the timestamp * that a file would receive. An actual update must go through * inode_set_ctime_current(). */ struct timespec64 current_time(struct inode *inode) { struct timespec64 now; u32 cns; ktime_get_coarse_real_ts64_mg(&now); if (!is_mgtime(inode)) goto out; /* If nothing has queried it, then coarse time is fine */ cns = smp_load_acquire(&inode->i_ctime_nsec); if (cns & I_CTIME_QUERIED) { /* * If there is no apparent change, then get a fine-grained * timestamp. 
*/ if (now.tv_nsec == (cns & ~I_CTIME_QUERIED)) ktime_get_real_ts64(&now); } out: return timestamp_truncate(now, inode); } EXPORT_SYMBOL(current_time); static inline bool need_cmtime_update(struct inode *inode) { struct timespec64 now = current_time(inode), ts; ts = inode_get_mtime(inode); if (!timespec64_equal(&ts, &now)) return true; ts = inode_get_ctime(inode); if (!timespec64_equal(&ts, &now)) return true; return IS_I_VERSION(inode) && inode_iversion_need_inc(inode); } static int file_update_time_flags(struct file *file, unsigned int flags) { struct inode *inode = file_inode(file); int ret; /* First try to exhaust all avenues to not sync */ if (IS_NOCMTIME(inode)) return 0; if (unlikely(file->f_mode & FMODE_NOCMTIME)) return 0; if (!need_cmtime_update(inode)) return 0; flags &= IOCB_NOWAIT; if (mnt_get_write_access_file(file)) return 0; if (inode->i_op->update_time) ret = inode->i_op->update_time(inode, FS_UPD_CMTIME, flags); else ret = generic_update_time(inode, FS_UPD_CMTIME, flags); mnt_put_write_access_file(file); return ret; } /** * file_update_time - update mtime and ctime time * @file: file accessed * * Update the mtime and ctime members of an inode and mark the inode for * writeback. Note that this function is meant exclusively for usage in * the file write path of filesystems, and filesystems may choose to * explicitly ignore updates via this function with the _NOCMTIME inode * flag, e.g. for network filesystem where these imestamps are handled * by the server. This can return an error for file systems who need to * allocate space in order to update an inode. * * Return: 0 on success, negative errno on failure. */ int file_update_time(struct file *file) { return file_update_time_flags(file, 0); } EXPORT_SYMBOL(file_update_time); /** * file_modified_flags - handle mandated vfs changes when modifying a file * @file: file that was modified * @flags: kiocb flags * * When file has been modified ensure that special * file privileges are removed and time settings are updated. * * If IOCB_NOWAIT is set, special file privileges will not be removed and * time settings will not be updated. It will return -EAGAIN. * * Context: Caller must hold the file's inode lock. * * Return: 0 on success, negative errno on failure. */ static int file_modified_flags(struct file *file, int flags) { int ret; /* * Clear the security bits if the process is not being run by root. * This keeps people from modifying setuid and setgid binaries. */ ret = file_remove_privs_flags(file, flags); if (ret) return ret; return file_update_time_flags(file, flags); } /** * file_modified - handle mandated vfs changes when modifying a file * @file: file that was modified * * When file has been modified ensure that special * file privileges are removed and time settings are updated. * * Context: Caller must hold the file's inode lock. * * Return: 0 on success, negative errno on failure. */ int file_modified(struct file *file) { return file_modified_flags(file, 0); } EXPORT_SYMBOL(file_modified); /** * kiocb_modified - handle mandated vfs changes when modifying a file * @iocb: iocb that was modified * * When file has been modified ensure that special * file privileges are removed and time settings are updated. * * Context: Caller must hold the file's inode lock. * * Return: 0 on success, negative errno on failure. 
*/ int kiocb_modified(struct kiocb *iocb) { return file_modified_flags(iocb->ki_filp, iocb->ki_flags); } EXPORT_SYMBOL_GPL(kiocb_modified); int inode_needs_sync(struct inode *inode) { if (IS_SYNC(inode)) return 1; if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) return 1; return 0; } EXPORT_SYMBOL(inode_needs_sync); /* * If we try to find an inode in the inode hash while it is being * deleted, we have to wait until the filesystem completes its * deletion before reporting that it isn't found. This function waits * until the deletion _might_ have completed. Callers are responsible * to recheck inode state. * * It doesn't matter if I_NEW is not set initially, a call to * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list * will DTRT. */ static void __wait_on_freeing_inode(struct inode *inode, bool hash_locked, bool rcu_locked) { struct wait_bit_queue_entry wqe; struct wait_queue_head *wq_head; VFS_BUG_ON(!hash_locked && !rcu_locked); /* * Handle racing against evict(), see that routine for more details. */ if (unlikely(inode_unhashed(inode))) { WARN_ON(hash_locked); spin_unlock(&inode->i_lock); return; } wq_head = inode_bit_waitqueue(&wqe, inode, __I_NEW); prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); if (rcu_locked) rcu_read_unlock(); if (hash_locked) spin_unlock(&inode_hash_lock); schedule(); finish_wait(wq_head, &wqe.wq_entry); if (hash_locked) spin_lock(&inode_hash_lock); if (rcu_locked) rcu_read_lock(); } static __initdata unsigned long ihash_entries; static int __init set_ihash_entries(char *str) { return kstrtoul(str, 0, &ihash_entries) == 0; } __setup("ihash_entries=", set_ihash_entries); /* * Initialize the waitqueues and inode hash table. */ void __init inode_init_early(void) { /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. */ if (hashdist) return; inode_hashtable = alloc_large_system_hash("Inode-cache", sizeof(struct hlist_head), ihash_entries, 14, HASH_EARLY | HASH_ZERO, &i_hash_shift, &i_hash_mask, 0, 0); } void __init inode_init(void) { /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), 0, (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| SLAB_ACCOUNT), init_once); /* Hash may have been set up in inode_init_early */ if (!hashdist) return; inode_hashtable = alloc_large_system_hash("Inode-cache", sizeof(struct hlist_head), ihash_entries, 14, HASH_ZERO, &i_hash_shift, &i_hash_mask, 0, 0); } void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) { inode->i_mode = mode; switch (inode->i_mode & S_IFMT) { case S_IFCHR: inode->i_fop = &def_chr_fops; inode->i_rdev = rdev; break; case S_IFBLK: if (IS_ENABLED(CONFIG_BLOCK)) inode->i_fop = &def_blk_fops; inode->i_rdev = rdev; break; case S_IFIFO: inode->i_fop = &pipefifo_fops; break; case S_IFSOCK: /* leave it no_open_fops */ break; default: pr_debug("init_special_inode: bogus i_mode (%o) for inode %s:%llu\n", mode, inode->i_sb->s_id, inode->i_ino); break; } } EXPORT_SYMBOL(init_special_inode); /** * inode_init_owner - Init uid,gid,mode for new inode according to posix standards * @idmap: idmap of the mount the inode was created from * @inode: New inode * @dir: Directory inode * @mode: mode of the new inode * * If the inode has been created through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions * and initializing i_uid and i_gid. 
On non-idmapped mounts or if permission * checking is to be performed on the raw inode simply pass @nop_mnt_idmap. */ void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode, const struct inode *dir, umode_t mode) { inode_fsuid_set(inode, idmap); if (dir && dir->i_mode & S_ISGID) { inode->i_gid = dir->i_gid; /* Directories are special, and always inherit S_ISGID */ if (S_ISDIR(mode)) mode |= S_ISGID; } else inode_fsgid_set(inode, idmap); inode->i_mode = mode; } EXPORT_SYMBOL(inode_init_owner); /** * inode_owner_or_capable - check current task permissions to inode * @idmap: idmap of the mount the inode was found from * @inode: inode being checked * * Return true if current either has CAP_FOWNER in a namespace with the * inode owner uid mapped, or owns the file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ bool inode_owner_or_capable(struct mnt_idmap *idmap, const struct inode *inode) { vfsuid_t vfsuid; struct user_namespace *ns; vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return true; ns = current_user_ns(); if (vfsuid_has_mapping(ns, vfsuid) && ns_capable(ns, CAP_FOWNER)) return true; return false; } EXPORT_SYMBOL(inode_owner_or_capable); /* * Direct i/o helper functions */ bool inode_dio_finished(const struct inode *inode) { return atomic_read(&inode->i_dio_count) == 0; } EXPORT_SYMBOL(inode_dio_finished); /** * inode_dio_wait - wait for outstanding DIO requests to finish * @inode: inode to wait for * * Waits for all pending direct I/O requests to finish so that we can * proceed with a truncate or equivalent operation. * * Must be called under a lock that serializes taking new references * to i_dio_count, usually by inode->i_rwsem. */ void inode_dio_wait(struct inode *inode) { wait_var_event(&inode->i_dio_count, inode_dio_finished(inode)); } EXPORT_SYMBOL(inode_dio_wait); void inode_dio_wait_interruptible(struct inode *inode) { wait_var_event_interruptible(&inode->i_dio_count, inode_dio_finished(inode)); } EXPORT_SYMBOL(inode_dio_wait_interruptible); /* * inode_set_flags - atomically set some inode flags * * Note: the caller should be holding i_rwsem exclusively, or else be sure that * they have exclusive access to the inode structure (i.e., while the * inode is being instantiated). The reason for the cmpxchg() loop * --- which wouldn't be necessary if all code paths which modify * i_flags actually followed this rule, is that there is at least one * code path which doesn't today so we use cmpxchg() out of an abundance * of caution. * * In the long run, i_rwsem is overkill, and we should probably look * at using the i_lock spinlock to protect i_flags, and then make sure * it is so documented in include/linux/fs.h and that all code follows * the locking convention!! 
*/ void inode_set_flags(struct inode *inode, unsigned int flags, unsigned int mask) { WARN_ON_ONCE(flags & ~mask); set_mask_bits(&inode->i_flags, mask, flags); } EXPORT_SYMBOL(inode_set_flags); void inode_nohighmem(struct inode *inode) { mapping_set_gfp_mask(inode->i_mapping, GFP_USER); } EXPORT_SYMBOL(inode_nohighmem); struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts) { trace_inode_set_ctime_to_ts(inode, &ts); set_normalized_timespec64(&ts, ts.tv_sec, ts.tv_nsec); inode->i_ctime_sec = ts.tv_sec; inode->i_ctime_nsec = ts.tv_nsec; return ts; } EXPORT_SYMBOL(inode_set_ctime_to_ts); /** * timestamp_truncate - Truncate timespec to a granularity * @t: Timespec * @inode: inode being updated * * Truncate a timespec to the granularity supported by the fs * containing the inode. Always rounds down. gran must * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns). */ struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode) { struct super_block *sb = inode->i_sb; unsigned int gran = sb->s_time_gran; t.tv_sec = clamp(t.tv_sec, sb->s_time_min, sb->s_time_max); if (unlikely(t.tv_sec == sb->s_time_max || t.tv_sec == sb->s_time_min)) t.tv_nsec = 0; /* Avoid division in the common cases 1 ns and 1 s. */ if (gran == 1) ; /* nothing */ else if (gran == NSEC_PER_SEC) t.tv_nsec = 0; else if (gran > 1 && gran < NSEC_PER_SEC) t.tv_nsec -= t.tv_nsec % gran; else WARN(1, "invalid file time granularity: %u", gran); return t; } EXPORT_SYMBOL(timestamp_truncate); /** * inode_set_ctime_current - set the ctime to current_time * @inode: inode * * Set the inode's ctime to the current value for the inode. Returns the * current value that was assigned. If this is not a multigrain inode, then we * set it to the later of the coarse time and floor value. * * If it is multigrain, then we first see if the coarse-grained timestamp is * distinct from what is already there. If so, then use that. Otherwise, get a * fine-grained timestamp. * * After that, try to swap the new value into i_ctime_nsec. Accept the * resulting ctime, regardless of the outcome of the swap. If it has * already been replaced, then that timestamp is later than the earlier * unacceptable one, and is thus acceptable. */ struct timespec64 inode_set_ctime_current(struct inode *inode) { struct timespec64 now; u32 cns, cur; ktime_get_coarse_real_ts64_mg(&now); now = timestamp_truncate(now, inode); /* Just return that if this is not a multigrain fs */ if (!is_mgtime(inode)) { inode_set_ctime_to_ts(inode, now); goto out; } /* * A fine-grained time is only needed if someone has queried * for timestamps, and the current coarse grained time isn't * later than what's already there. */ cns = smp_load_acquire(&inode->i_ctime_nsec); if (cns & I_CTIME_QUERIED) { struct timespec64 ctime = { .tv_sec = inode->i_ctime_sec, .tv_nsec = cns & ~I_CTIME_QUERIED }; if (timespec64_compare(&now, &ctime) <= 0) { ktime_get_real_ts64_mg(&now); now = timestamp_truncate(now, inode); mgtime_counter_inc(mg_fine_stamps); } } mgtime_counter_inc(mg_ctime_updates); /* No need to cmpxchg if it's exactly the same */ if (cns == now.tv_nsec && inode->i_ctime_sec == now.tv_sec) { trace_ctime_xchg_skip(inode, &now); goto out; } cur = cns; retry: /* Try to swap the nsec value into place. 
*/ if (try_cmpxchg(&inode->i_ctime_nsec, &cur, now.tv_nsec)) { /* If swap occurred, then we're (mostly) done */ inode->i_ctime_sec = now.tv_sec; trace_ctime_ns_xchg(inode, cns, now.tv_nsec, cur); mgtime_counter_inc(mg_ctime_swaps); } else { /* * Was the change due to someone marking the old ctime QUERIED? * If so then retry the swap. This can only happen once since * the only way to clear I_CTIME_QUERIED is to stamp the inode * with a new ctime. */ if (!(cns & I_CTIME_QUERIED) && (cns | I_CTIME_QUERIED) == cur) { cns = cur; goto retry; } /* Otherwise, keep the existing ctime */ now.tv_sec = inode->i_ctime_sec; now.tv_nsec = cur & ~I_CTIME_QUERIED; } out: return now; } EXPORT_SYMBOL(inode_set_ctime_current); /** * inode_set_ctime_deleg - try to update the ctime on a delegated inode * @inode: inode to update * @update: timespec64 to set the ctime * * Attempt to atomically update the ctime on behalf of a delegation holder. * * The nfs server can call back the holder of a delegation to get updated * inode attributes, including the mtime. When updating the mtime, update * the ctime to a value at least equal to that. * * This can race with concurrent updates to the inode, in which * case the update is skipped. * * Note that this works even when multigrain timestamps are not enabled, * so it is used in either case. */ struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 update) { struct timespec64 now, cur_ts; u32 cur, old; /* pairs with try_cmpxchg below */ cur = smp_load_acquire(&inode->i_ctime_nsec); cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED; cur_ts.tv_sec = inode->i_ctime_sec; /* If the update is older than the existing value, skip it. */ if (timespec64_compare(&update, &cur_ts) <= 0) return cur_ts; ktime_get_coarse_real_ts64_mg(&now); /* Clamp the update to "now" if it's in the future */ if (timespec64_compare(&update, &now) > 0) update = now; update = timestamp_truncate(update, inode); /* No need to update if the values are already the same */ if (timespec64_equal(&update, &cur_ts)) return cur_ts; /* * Try to swap the nsec value into place. If it fails, that means * it raced with an update due to a write or similar activity. That * stamp takes precedence, so just skip the update. */ retry: old = cur; if (try_cmpxchg(&inode->i_ctime_nsec, &cur, update.tv_nsec)) { inode->i_ctime_sec = update.tv_sec; mgtime_counter_inc(mg_ctime_swaps); return update; } /* * Was the change due to another task marking the old ctime QUERIED? * * If so, then retry the swap. This can only happen once since * the only way to clear I_CTIME_QUERIED is to stamp the inode * with a new ctime. */ if (!(old & I_CTIME_QUERIED) && (cur == (old | I_CTIME_QUERIED))) goto retry; /* Otherwise, it was a new timestamp. */ cur_ts.tv_sec = inode->i_ctime_sec; cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED; return cur_ts; } EXPORT_SYMBOL(inode_set_ctime_deleg); /** * in_group_or_capable - check whether caller is CAP_FSETID privileged * @idmap: idmap of the mount @inode was found from * @inode: inode to check * @vfsgid: the new/current vfsgid of @inode * * Check whether @vfsgid is in the caller's group list or if the caller is * privileged with CAP_FSETID over @inode. This can be used to determine * whether the setgid bit can be kept or must be dropped. * * Return: true if the caller is sufficiently privileged, false if not. 
*/ bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid) { if (vfsgid_in_group_p(vfsgid)) return true; if (capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID)) return true; return false; } EXPORT_SYMBOL(in_group_or_capable); /** * mode_strip_sgid - handle the sgid bit for non-directories * @idmap: idmap of the mount the inode was created from * @dir: parent directory inode * @mode: mode of the file to be created in @dir * * If the @mode of the new file has both the S_ISGID and S_IXGRP bit * raised and @dir has the S_ISGID bit raised ensure that the caller is * either in the group of the parent directory or they have CAP_FSETID * in their user namespace and are privileged over the parent directory. * In all other cases, strip the S_ISGID bit from @mode. * * Return: the new mode to use for the file */ umode_t mode_strip_sgid(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode) { if ((mode & (S_ISGID | S_IXGRP)) != (S_ISGID | S_IXGRP)) return mode; if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID)) return mode; if (in_group_or_capable(idmap, dir, i_gid_into_vfsgid(idmap, dir))) return mode; return mode & ~S_ISGID; } EXPORT_SYMBOL(mode_strip_sgid); #ifdef CONFIG_DEBUG_VFS /** * dump_inode - dump an inode. * @inode: inode to dump * @reason: reason for dumping * * If inode is an invalid pointer, we don't want to crash accessing it, * so probe everything depending on it carefully with get_kernel_nofault(). */ void dump_inode(struct inode *inode, const char *reason) { struct super_block *sb; struct file_system_type *s_type; const char *fs_name_ptr; char fs_name[32] = {}; umode_t mode; unsigned short opflags; unsigned int flags; unsigned int state; int count; if (get_kernel_nofault(sb, &inode->i_sb) || get_kernel_nofault(mode, &inode->i_mode) || get_kernel_nofault(opflags, &inode->i_opflags) || get_kernel_nofault(flags, &inode->i_flags)) { pr_warn("%s: unreadable inode:%px\n", reason, inode); return; } state = inode_state_read_once(inode); count = atomic_read(&inode->i_count); if (!sb || get_kernel_nofault(s_type, &sb->s_type) || !s_type || get_kernel_nofault(fs_name_ptr, &s_type->name) || !fs_name_ptr || strncpy_from_kernel_nofault(fs_name, fs_name_ptr, sizeof(fs_name) - 1) < 0) strscpy(fs_name, "<unknown, sb unreadable>"); pr_warn("%s: inode:%px fs:%s mode:%ho opflags:%#x flags:%#x state:%#x count:%d\n", reason, inode, fs_name, mode, opflags, flags, state, count); } EXPORT_SYMBOL(dump_inode); #endif |
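/*
 * Illustrative sketch, not part of the original file: how the ownership and
 * setgid helpers above typically combine when a new inode is created. The
 * function name example_prepare_new_inode() is hypothetical; in the kernel
 * the VFS normally strips S_ISGID (mode_strip_sgid()) before the filesystem
 * initializes ownership with inode_init_owner().
 */
static void example_prepare_new_inode(struct mnt_idmap *idmap,
				      struct inode *inode,
				      const struct inode *dir, umode_t mode)
{
	/*
	 * Drop S_ISGID from a group-executable non-directory created in a
	 * setgid directory unless the creator is in the parent's group or
	 * holds CAP_FSETID (see in_group_or_capable() above).
	 */
	mode = mode_strip_sgid(idmap, dir, mode);

	/*
	 * Map i_uid/i_gid through the mount's idmapping; directories under
	 * a setgid parent inherit the gid and keep S_ISGID.
	 */
	inode_init_owner(idmap, inode, dir, mode);
}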
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MATH64_H #define _LINUX_MATH64_H #include <linux/types.h> #include <linux/math.h> #include <asm/div64.h> #include <vdso/math64.h> #if BITS_PER_LONG == 64 #define div64_long(x, y) div64_s64((x), (y)) #define div64_ul(x, y) div64_u64((x), (y)) /** * div_u64_rem - unsigned 64bit divide with 32bit divisor with remainder * @dividend: unsigned 64bit dividend * @divisor: unsigned 32bit divisor * @remainder: pointer to unsigned 32bit remainder * * Return: sets ``*remainder``, then returns dividend / divisor * * This is commonly provided by 32bit archs to provide an optimized 64bit * divide.
*/ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder) { *remainder = dividend % divisor; return dividend / divisor; } /** * div_s64_rem - signed 64bit divide with 32bit divisor with remainder * @dividend: signed 64bit dividend * @divisor: signed 32bit divisor * @remainder: pointer to signed 32bit remainder * * Return: sets ``*remainder``, then returns dividend / divisor */ static inline s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder) { *remainder = dividend % divisor; return dividend / divisor; } /** * div64_u64_rem - unsigned 64bit divide with 64bit divisor and remainder * @dividend: unsigned 64bit dividend * @divisor: unsigned 64bit divisor * @remainder: pointer to unsigned 64bit remainder * * Return: sets ``*remainder``, then returns dividend / divisor */ static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder) { *remainder = dividend % divisor; return dividend / divisor; } /** * div64_u64 - unsigned 64bit divide with 64bit divisor * @dividend: unsigned 64bit dividend * @divisor: unsigned 64bit divisor * * Return: dividend / divisor */ static inline u64 div64_u64(u64 dividend, u64 divisor) { return dividend / divisor; } /** * div64_s64 - signed 64bit divide with 64bit divisor * @dividend: signed 64bit dividend * @divisor: signed 64bit divisor * * Return: dividend / divisor */ static inline s64 div64_s64(s64 dividend, s64 divisor) { return dividend / divisor; } #elif BITS_PER_LONG == 32 #define div64_long(x, y) div_s64((x), (y)) #define div64_ul(x, y) div_u64((x), (y)) #ifndef div_u64_rem static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder) { *remainder = do_div(dividend, divisor); return dividend; } #endif #ifndef div_s64_rem extern s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder); #endif #ifndef div64_u64_rem extern u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder); #endif #ifndef div64_u64 extern u64 div64_u64(u64 dividend, u64 divisor); #endif #ifndef div64_s64 extern s64 div64_s64(s64 dividend, s64 divisor); #endif #endif /* BITS_PER_LONG */ /** * div_u64 - unsigned 64bit divide with 32bit divisor * @dividend: unsigned 64bit dividend * @divisor: unsigned 32bit divisor * * This is the most common 64bit divide and should be used if possible, * as many 32bit archs can optimize this variant better than a full 64bit * divide. * * Return: dividend / divisor */ #ifndef div_u64 static inline u64 div_u64(u64 dividend, u32 divisor) { u32 remainder; return div_u64_rem(dividend, divisor, &remainder); } #endif /** * div_s64 - signed 64bit divide with 32bit divisor * @dividend: signed 64bit dividend * @divisor: signed 32bit divisor * * Return: dividend / divisor */ #ifndef div_s64 static inline s64 div_s64(s64 dividend, s32 divisor) { s32 remainder; return div_s64_rem(dividend, divisor, &remainder); } #endif u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder); #ifndef mul_u32_u32 /* * Many a GCC version messes this up and generates a 64x64 mult :-( */ static inline u64 mul_u32_u32(u32 a, u32 b) { return (u64)a * b; } #endif #ifndef add_u64_u32 /* * Many a GCC version also messes this up. * Zero extending b and then spilling everything to stack. 
*/ static inline u64 add_u64_u32(u64 a, u32 b) { return a + b; } #endif #if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__) #ifndef mul_u64_u32_shr static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) { return (u64)(((unsigned __int128)a * mul) >> shift); } #endif /* mul_u64_u32_shr */ #ifndef mul_u64_u64_shr static __always_inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift) { return (u64)(((unsigned __int128)a * mul) >> shift); } #endif /* mul_u64_u64_shr */ #else #ifndef mul_u64_u32_shr static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) { u32 ah = a >> 32, al = a; u64 ret; ret = mul_u32_u32(al, mul) >> shift; if (ah) ret += mul_u32_u32(ah, mul) << (32 - shift); return ret; } #endif /* mul_u64_u32_shr */ #ifndef mul_u64_u64_shr static inline u64 mul_u64_u64_shr(u64 a, u64 b, unsigned int shift) { union { u64 ll; struct { #ifdef __BIG_ENDIAN u32 high, low; #else u32 low, high; #endif } l; } rl, rm, rn, rh, a0, b0; u64 c; a0.ll = a; b0.ll = b; rl.ll = mul_u32_u32(a0.l.low, b0.l.low); rm.ll = mul_u32_u32(a0.l.low, b0.l.high); rn.ll = mul_u32_u32(a0.l.high, b0.l.low); rh.ll = mul_u32_u32(a0.l.high, b0.l.high); /* * Each of these lines computes a 64-bit intermediate result into "c", * starting at bits 32-95. The low 32-bits go into the result of the * multiplication, the high 32-bits are carried into the next step. */ rl.l.high = c = (u64)rl.l.high + rm.l.low + rn.l.low; rh.l.low = c = (c >> 32) + rm.l.high + rn.l.high + rh.l.low; rh.l.high = (c >> 32) + rh.l.high; /* * The 128-bit result of the multiplication is in rl.ll and rh.ll, * shift it right and throw away the high part of the result. */ if (shift == 0) return rl.ll; if (shift < 64) return (rl.ll >> shift) | (rh.ll << (64 - shift)); return rh.ll >> (shift & 63); } #endif /* mul_u64_u64_shr */ #endif #ifndef mul_s64_u64_shr static inline u64 mul_s64_u64_shr(s64 a, u64 b, unsigned int shift) { u64 ret; /* * Extract the sign before the multiplication and put it back * afterwards if needed. */ ret = mul_u64_u64_shr(abs(a), b, shift); if (a < 0) ret = -((s64) ret); return ret; } #endif /* mul_s64_u64_shr */ #ifndef mul_u64_u32_div static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor) { union { u64 ll; struct { #ifdef __BIG_ENDIAN u32 high, low; #else u32 low, high; #endif } l; } u, rl, rh; u.ll = a; rl.ll = mul_u32_u32(u.l.low, mul); rh.ll = mul_u32_u32(u.l.high, mul) + rl.l.high; /* Bits 32-63 of the result will be in rh.l.low. */ rl.l.high = do_div(rh.ll, divisor); /* Bits 0-31 of the result will be in rl.l.low. */ do_div(rl.ll, divisor); rl.l.high = rh.l.low; return rl.ll; } #endif /* mul_u64_u32_div */ /** * mul_u64_add_u64_div_u64 - unsigned 64bit multiply, add, and divide * @a: first unsigned 64bit multiplicand * @b: second unsigned 64bit multiplicand * @c: unsigned 64bit addend * @d: unsigned 64bit divisor * * Multiply two 64bit values together to generate a 128bit product * add a third value and then divide by a fourth. * The Generic code divides by 0 if @d is zero and returns ~0 on overflow. * Architecture specific code may trap on zero or overflow. * * Return: (@a * @b + @c) / @d */ u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d); /** * mul_u64_u64_div_u64 - unsigned 64bit multiply and divide * @a: first unsigned 64bit multiplicand * @b: second unsigned 64bit multiplicand * @d: unsigned 64bit divisor * * Multiply two 64bit values together to generate a 128bit product * and then divide by a third value. 
* The Generic code divides by 0 if @d is zero and returns ~0 on overflow. * Architecture specific code may trap on zero or overflow. * * Return: @a * @b / @d */ #define mul_u64_u64_div_u64(a, b, d) mul_u64_add_u64_div_u64(a, b, 0, d) /** * mul_u64_u64_div_u64_roundup - unsigned 64bit multiply and divide rounded up * @a: first unsigned 64bit multiplicand * @b: second unsigned 64bit multiplicand * @d: unsigned 64bit divisor * * Multiply two 64bit values together to generate a 128bit product * and then divide and round up. * The Generic code divides by 0 if @d is zero and returns ~0 on overflow. * Architecture specific code may trap on zero or overflow. * * Return: (@a * @b + @d - 1) / @d */ #define mul_u64_u64_div_u64_roundup(a, b, d) \ ({ u64 _tmp = (d); mul_u64_add_u64_div_u64(a, b, _tmp - 1, _tmp); }) /** * DIV64_U64_ROUND_UP - unsigned 64bit divide with 64bit divisor rounded up * @ll: unsigned 64bit dividend * @d: unsigned 64bit divisor * * Divide unsigned 64bit dividend by unsigned 64bit divisor * and round up. * * Return: dividend / divisor rounded up */ #define DIV64_U64_ROUND_UP(ll, d) \ ({ u64 _tmp = (d); div64_u64((ll) + _tmp - 1, _tmp); }) /** * DIV_U64_ROUND_UP - unsigned 64bit divide with 32bit divisor rounded up * @ll: unsigned 64bit dividend * @d: unsigned 32bit divisor * * Divide unsigned 64bit dividend by unsigned 32bit divisor * and round up. * * Return: dividend / divisor rounded up */ #define DIV_U64_ROUND_UP(ll, d) \ ({ u32 _tmp = (d); div_u64((ll) + _tmp - 1, _tmp); }) /** * DIV64_U64_ROUND_CLOSEST - unsigned 64bit divide with 64bit divisor rounded to nearest integer * @dividend: unsigned 64bit dividend * @divisor: unsigned 64bit divisor * * Divide unsigned 64bit dividend by unsigned 64bit divisor * and round to closest integer. * * Return: dividend / divisor rounded to nearest integer */ #define DIV64_U64_ROUND_CLOSEST(dividend, divisor) \ ({ u64 _tmp = (divisor); div64_u64((dividend) + _tmp / 2, _tmp); }) /** * DIV_U64_ROUND_CLOSEST - unsigned 64bit divide with 32bit divisor rounded to nearest integer * @dividend: unsigned 64bit dividend * @divisor: unsigned 32bit divisor * * Divide unsigned 64bit dividend by unsigned 32bit divisor * and round to closest integer. * * Return: dividend / divisor rounded to nearest integer */ #define DIV_U64_ROUND_CLOSEST(dividend, divisor) \ ({ u32 _tmp = (divisor); div_u64((u64)(dividend) + _tmp / 2, _tmp); }) /** * DIV_S64_ROUND_CLOSEST - signed 64bit divide with 32bit divisor rounded to nearest integer * @dividend: signed 64bit dividend * @divisor: signed 32bit divisor * * Divide signed 64bit dividend by signed 32bit divisor * and round to closest integer. * * Return: dividend / divisor rounded to nearest integer */ #define DIV_S64_ROUND_CLOSEST(dividend, divisor)( \ { \ s64 __x = (dividend); \ s32 __d = (divisor); \ ((__x > 0) == (__d > 0)) ? \ div_s64((__x + (__d / 2)), __d) : \ div_s64((__x - (__d / 2)), __d); \ } \ ) /** * roundup_u64 - Round up a 64bit value to the next specified 32bit multiple * @x: the value to up * @y: 32bit multiple to round up to * * Rounds @x to the next multiple of @y. For 32bit @x values, see roundup and * the faster round_up() for powers of 2. * * Return: rounded up value. */ static inline u64 roundup_u64(u64 x, u32 y) { return DIV_U64_ROUND_UP(x, y) * y; } #endif /* _LINUX_MATH64_H */ |
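/*
 * Illustrative sketch, not part of the original header: typical uses of the
 * helpers above. The function names and the bytes/nanoseconds parameters are
 * hypothetical; NSEC_PER_SEC comes from <linux/time64.h>.
 */
static inline u64 example_bytes_per_second(u64 bytes, u64 elapsed_ns)
{
	if (!elapsed_ns)
		return 0;
	/*
	 * bytes * NSEC_PER_SEC can exceed 64 bits, so go through the 128-bit
	 * intermediate; as noted above, the generic code returns ~0 if the
	 * quotient itself overflows 64 bits.
	 */
	return mul_u64_u64_div_u64(bytes, NSEC_PER_SEC, elapsed_ns);
}

/* Round a 64-bit byte count up to whole 4 KiB pages (32-bit divisor). */
static inline u64 example_pages_needed(u64 bytes)
{
	return DIV_U64_ROUND_UP(bytes, 4096);
}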
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_WORD_AT_A_TIME_H #define _ASM_WORD_AT_A_TIME_H #include <linux/bitops.h> #include <linux/wordpart.h> struct word_at_a_time { const unsigned long one_bits, high_bits; }; #define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) } /* Return nonzero if it has a zero */ static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c) { unsigned long mask = ((a - c->one_bits) & ~a) & c->high_bits; *bits = mask; return mask; } static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, const struct word_at_a_time *c) { return bits; } #ifdef CONFIG_64BIT /* Keep the initial has_zero() value for both bitmask and size calc */ #define create_zero_mask(bits) (bits) static inline unsigned long zero_bytemask(unsigned long bits) { bits = (bits - 1) & ~bits; return bits >> 7; } #define find_zero(bits) (__ffs(bits) >> 3) #else /* Create the final mask for both bytemask and size */ static inline unsigned long create_zero_mask(unsigned long bits) { bits = (bits - 1) & ~bits; return bits >> 7; } /* The mask we created is directly usable as a bytemask */ #define zero_bytemask(mask) (mask) /* Carl Chatfield / Jan Achrenius G+ version for 32-bit */ static inline unsigned long find_zero(unsigned long mask) { /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */ long a = (0x0ff0001+mask) >> 23; /* Fix the 1 for 00 case */ return a & mask; } #endif /* * Load an unaligned word from kernel space. * * In the (very unlikely) case of the word being a page-crosser * and the next page not being mapped, take the exception and * return zeroes in the non-existing part. */ static inline unsigned long load_unaligned_zeropad(const void *addr) { unsigned long ret; asm volatile( "1: mov %[mem], %[ret]\n" "2:\n" _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_ZEROPAD) : [ret] "=r" (ret) : [mem] "m" (*(unsigned long *)addr)); return ret; } #endif /* _ASM_WORD_AT_A_TIME_H */
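/*
 * Illustrative sketch, not part of the original header: a strlen()-style
 * scan built from the helpers above, in the spirit of the in-kernel
 * word-at-a-time string routines. It assumes @s is word-aligned, so an
 * aligned whole-word load can never cross into an unmapped page beyond the
 * terminating NUL; unaligned callers would need load_unaligned_zeropad().
 */
static inline size_t example_wordwise_strlen(const char *s)
{
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
	const unsigned long *p = (const unsigned long *)s;
	unsigned long val, bits;
	size_t len = 0;

	for (;;) {
		val = *p++;
		/* has_zero() reports whether any byte in this word is 0. */
		if (has_zero(val, &bits, &constants)) {
			bits = prep_zero_mask(val, bits, &constants);
			bits = create_zero_mask(bits);
			/* find_zero() gives the byte index of the first NUL. */
			return len + find_zero(bits);
		}
		len += sizeof(unsigned long);
	}
}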
// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * "Ping" sockets * * Based on ipv4/udp.c code. * * Authors: Vasiliy Kulikov / Openwall (for Linux 2.6), * Pavel Kankovsky (for Linux 2.4.32) * * Pavel gave all rights to bugs to Vasiliy, * none of the bugs are Pavel's now. */ #include <linux/uaccess.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/errno.h> #include <linux/timer.h> #include <linux/mm.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <net/snmp.h> #include <net/ip.h> #include <net/icmp.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/export.h> #include <linux/bpf-cgroup.h> #include <net/sock.h> #include <net/ping.h> #include <net/udp.h> #include <net/route.h> #include <net/inet_common.h> #include <net/checksum.h> #if IS_ENABLED(CONFIG_IPV6) #include <linux/in6.h> #include <linux/icmpv6.h> #include <net/addrconf.h> #include <net/ipv6.h> #include <net/transp_v6.h> #endif struct ping_table { struct hlist_head hash[PING_HTABLE_SIZE]; spinlock_t lock; }; static struct ping_table ping_table; struct pingv6_ops pingv6_ops; static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask) { u32 res = (num + net_hash_mix(net)) & mask; pr_debug("hash(%u) = %u\n", num, res); return res; } static inline struct hlist_head *ping_hashslot(struct ping_table *table, struct net *net, unsigned int num) { return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)]; } int ping_get_port(struct sock *sk, unsigned short ident) { struct net *net = sock_net(sk); struct inet_sock *isk, *isk2; struct hlist_head *hlist; struct sock *sk2 = NULL; isk = inet_sk(sk); spin_lock(&ping_table.lock); if (ident == 0) { u16 result = net->ipv4.ping_port_rover + 1; u32 i; for (i = 0; i < (1L << 16); i++, result++) { if (!result) continue; /* avoid zero */ hlist = ping_hashslot(&ping_table, net, result); sk_for_each(sk2, hlist) { if (!net_eq(sock_net(sk2), net)) continue; isk2 = inet_sk(sk2); if (isk2->inet_num == result) goto next_port; } /* found */
net->ipv4.ping_port_rover = ident = result; break; next_port: ; } if (i >= (1L << 16)) goto fail; } else { hlist = ping_hashslot(&ping_table, net, ident); sk_for_each(sk2, hlist) { if (!net_eq(sock_net(sk2), net)) continue; isk2 = inet_sk(sk2); /* BUG? Why is this reuse and not reuseaddr? ping.c * doesn't turn off SO_REUSEADDR, and it doesn't expect * that other ping processes can steal its packets. */ if ((isk2->inet_num == ident) && (sk2 != sk) && (!sk2->sk_reuse || !sk->sk_reuse)) goto fail; } } pr_debug("found port/ident = %d\n", ident); isk->inet_num = ident; if (sk_unhashed(sk)) { pr_debug("was not hashed\n"); sk_add_node_rcu(sk, hlist); sock_set_flag(sk, SOCK_RCU_FREE); sock_prot_inuse_add(net, sk->sk_prot, 1); } spin_unlock(&ping_table.lock); return 0; fail: spin_unlock(&ping_table.lock); return -EADDRINUSE; } void ping_unhash(struct sock *sk) { struct inet_sock *isk = inet_sk(sk); pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num); spin_lock(&ping_table.lock); if (sk_del_node_init_rcu(sk)) { WRITE_ONCE(isk->inet_num, 0); isk->inet_sport = 0; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); } spin_unlock(&ping_table.lock); } /* Called under rcu_read_lock() */ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) { struct hlist_head *hslot = ping_hashslot(&ping_table, net, ident); struct sock *sk = NULL; struct inet_sock *isk; int dif, sdif; if (skb->protocol == htons(ETH_P_IP)) { dif = inet_iif(skb); sdif = inet_sdif(skb); pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n", (int)ident, &ip_hdr(skb)->daddr, dif); #if IS_ENABLED(CONFIG_IPV6) } else if (skb->protocol == htons(ETH_P_IPV6)) { dif = inet6_iif(skb); sdif = inet6_sdif(skb); pr_debug("try to find: num = %d, daddr = %pI6c, dif = %d\n", (int)ident, &ipv6_hdr(skb)->daddr, dif); #endif } else { return NULL; } sk_for_each_rcu(sk, hslot) { int bound_dev_if; if (!net_eq(sock_net(sk), net)) continue; isk = inet_sk(sk); pr_debug("iterate\n"); if (READ_ONCE(isk->inet_num) != ident) continue; bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); if (skb->protocol == htons(ETH_P_IP) && sk->sk_family == AF_INET) { __be32 rcv_saddr = READ_ONCE(isk->inet_rcv_saddr); pr_debug("found: %p: num=%d, daddr=%pI4, dif=%d\n", sk, ident, &rcv_saddr, bound_dev_if); if (rcv_saddr && rcv_saddr != ip_hdr(skb)->daddr) continue; #if IS_ENABLED(CONFIG_IPV6) } else if (skb->protocol == htons(ETH_P_IPV6) && sk->sk_family == AF_INET6) { pr_debug("found: %p: num=%d, daddr=%pI6c, dif=%d\n", sk, ident, &sk->sk_v6_rcv_saddr, bound_dev_if); if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) && !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, &ipv6_hdr(skb)->daddr)) continue; #endif } else { continue; } if (bound_dev_if && bound_dev_if != dif && bound_dev_if != sdif) continue; goto exit; } sk = NULL; exit: return sk; } static void inet_get_ping_group_range_net(struct net *net, kgid_t *low, kgid_t *high) { kgid_t *data = net->ipv4.ping_group_range.range; unsigned int seq; do { seq = read_seqbegin(&net->ipv4.ping_group_range.lock); *low = data[0]; *high = data[1]; } while (read_seqretry(&net->ipv4.ping_group_range.lock, seq)); } int ping_init_sock(struct sock *sk) { struct net *net = sock_net(sk); kgid_t group = current_egid(); struct group_info *group_info; int i; kgid_t low, high; int ret = 0; if (sk->sk_family == AF_INET6) sk->sk_ipv6only = 1; inet_get_ping_group_range_net(net, &low, &high); if (gid_lte(low, group) && gid_lte(group, high)) return 0; group_info = get_current_groups(); for (i = 0; i < group_info->ngroups; i++) { kgid_t gid 
= group_info->gid[i]; if (gid_lte(low, gid) && gid_lte(gid, high)) goto out_release_group; } ret = -EACCES; out_release_group: put_group_info(group_info); return ret; } void ping_close(struct sock *sk, long timeout) { pr_debug("ping_close(sk=%p,sk->num=%u)\n", inet_sk(sk), inet_sk(sk)->inet_num); pr_debug("isk->refcnt = %d\n", refcount_read(&sk->sk_refcnt)); sk_common_release(sk); } static int ping_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { /* This check is replicated from __ip4_datagram_connect() and * intended to prevent BPF program called below from accessing bytes * that are out of the bound specified by user in addr_len. */ if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len); } /* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, struct sockaddr_unsized *uaddr, int addr_len) { struct net *net = sock_net(sk); if (sk->sk_family == AF_INET) { struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; u32 tb_id = RT_TABLE_LOCAL; int chk_addr_ret; if (addr_len < sizeof(*addr)) return -EINVAL; if (addr->sin_family != AF_INET && !(addr->sin_family == AF_UNSPEC && addr->sin_addr.s_addr == htonl(INADDR_ANY))) return -EAFNOSUPPORT; pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n", sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port)); if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) return 0; tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST || (chk_addr_ret != RTN_LOCAL && !inet_can_nonlocal_bind(net, isk))) return -EADDRNOTAVAIL; #if IS_ENABLED(CONFIG_IPV6) } else if (sk->sk_family == AF_INET6) { struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr; int addr_type, scoped, has_addr; struct net_device *dev = NULL; if (addr_len < sizeof(*addr)) return -EINVAL; if (addr->sin6_family != AF_INET6) return -EAFNOSUPPORT; pr_debug("ping_check_bind_addr(sk=%p,addr=%pI6c,port=%d)\n", sk, addr->sin6_addr.s6_addr, ntohs(addr->sin6_port)); addr_type = ipv6_addr_type(&addr->sin6_addr); scoped = __ipv6_addr_needs_scope_id(addr_type); if ((addr_type != IPV6_ADDR_ANY && !(addr_type & IPV6_ADDR_UNICAST)) || (scoped && !addr->sin6_scope_id)) return -EINVAL; rcu_read_lock(); if (addr->sin6_scope_id) { dev = dev_get_by_index_rcu(net, addr->sin6_scope_id); if (!dev) { rcu_read_unlock(); return -ENODEV; } } if (!dev && sk->sk_bound_dev_if) { dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); if (!dev) { rcu_read_unlock(); return -ENODEV; } } has_addr = pingv6_ops.ipv6_chk_addr(net, &addr->sin6_addr, dev, scoped); rcu_read_unlock(); if (!(ipv6_can_nonlocal_bind(net, isk) || has_addr || addr_type == IPV6_ADDR_ANY)) return -EADDRNOTAVAIL; if (scoped) sk->sk_bound_dev_if = addr->sin6_scope_id; #endif } else { return -EAFNOSUPPORT; } return 0; } static void ping_set_saddr(struct sock *sk, struct sockaddr_unsized *saddr) { if (saddr->sa_family == AF_INET) { struct inet_sock *isk = inet_sk(sk); struct sockaddr_in *addr = (struct sockaddr_in *) saddr; isk->inet_saddr = addr->sin_addr.s_addr; WRITE_ONCE(isk->inet_rcv_saddr, addr->sin_addr.s_addr); #if IS_ENABLED(CONFIG_IPV6) } else if (saddr->sa_family == AF_INET6) { struct sockaddr_in6 *addr = (struct sockaddr_in6 *) saddr; struct ipv6_pinfo *np = inet6_sk(sk); sk->sk_v6_rcv_saddr = np->saddr = 
addr->sin6_addr; #endif } } /* * We need our own bind because there are no privileged id's == local ports. * Moreover, we don't allow binding to multi- and broadcast addresses. */ int ping_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct inet_sock *isk = inet_sk(sk); unsigned short snum; int err; int dif = sk->sk_bound_dev_if; err = ping_check_bind_addr(sk, isk, uaddr, addr_len); if (err) return err; lock_sock(sk); err = -EINVAL; if (isk->inet_num != 0) goto out; err = -EADDRINUSE; snum = ntohs(((struct sockaddr_in *)uaddr)->sin_port); if (ping_get_port(sk, snum) != 0) { /* Restore possibly modified sk->sk_bound_dev_if by ping_check_bind_addr(). */ sk->sk_bound_dev_if = dif; goto out; } ping_set_saddr(sk, uaddr); pr_debug("after bind(): num = %hu, dif = %d\n", isk->inet_num, sk->sk_bound_dev_if); err = 0; if (sk->sk_family == AF_INET && isk->inet_rcv_saddr) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6 && !ipv6_addr_any(&sk->sk_v6_rcv_saddr)) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; #endif if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; isk->inet_sport = htons(isk->inet_num); isk->inet_daddr = 0; isk->inet_dport = 0; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) memset(&sk->sk_v6_daddr, 0, sizeof(sk->sk_v6_daddr)); #endif sk_dst_reset(sk); out: release_sock(sk); pr_debug("ping_v4_bind -> %d\n", err); return err; } /* * Is this a supported type of ICMP message? */ static inline int ping_supported(int family, int type, int code) { return (family == AF_INET && type == ICMP_ECHO && code == 0) || (family == AF_INET && type == ICMP_EXT_ECHO && code == 0) || (family == AF_INET6 && type == ICMPV6_ECHO_REQUEST && code == 0) || (family == AF_INET6 && type == ICMPV6_EXT_ECHO_REQUEST && code == 0); } /* * This routine is called by the ICMP module when it gets some * sort of error condition. */ void ping_err(struct sk_buff *skb, int offset, u32 info) { int family; struct icmphdr *icmph; struct inet_sock *inet_sock; int type; int code; struct net *net = dev_net(skb->dev); struct sock *sk; int harderr; int err; if (skb->protocol == htons(ETH_P_IP)) { family = AF_INET; type = icmp_hdr(skb)->type; code = icmp_hdr(skb)->code; icmph = (struct icmphdr *)(skb->data + offset); } else if (skb->protocol == htons(ETH_P_IPV6)) { family = AF_INET6; type = icmp6_hdr(skb)->icmp6_type; code = icmp6_hdr(skb)->icmp6_code; icmph = (struct icmphdr *) (skb->data + offset); } else { BUG(); } /* We assume the packet has already been checked by icmp_unreach */ if (!ping_supported(family, icmph->type, icmph->code)) return; pr_debug("ping_err(proto=0x%x,type=%d,code=%d,id=%04x,seq=%04x)\n", skb->protocol, type, code, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); if (!sk) { pr_debug("no socket, dropping\n"); return; /* No socket for error */ } pr_debug("err on socket %p\n", sk); err = 0; harderr = 0; inet_sock = inet_sk(sk); if (skb->protocol == htons(ETH_P_IP)) { switch (type) { default: case ICMP_TIME_EXCEEDED: err = EHOSTUNREACH; break; case ICMP_SOURCE_QUENCH: /* This is not a real error but ping wants to see it. * Report it with some fake errno. 
*/ err = EREMOTEIO; break; case ICMP_PARAMETERPROB: err = EPROTO; harderr = 1; break; case ICMP_DEST_UNREACH: if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ ipv4_sk_update_pmtu(skb, sk, info); if (READ_ONCE(inet_sock->pmtudisc) != IP_PMTUDISC_DONT) { err = EMSGSIZE; harderr = 1; break; } goto out; } err = EHOSTUNREACH; if (code <= NR_ICMP_UNREACH) { harderr = icmp_err_convert[code].fatal; err = icmp_err_convert[code].errno; } break; case ICMP_REDIRECT: /* See ICMP_SOURCE_QUENCH */ ipv4_sk_redirect(skb, sk); err = EREMOTEIO; break; } #if IS_ENABLED(CONFIG_IPV6) } else if (skb->protocol == htons(ETH_P_IPV6)) { harderr = pingv6_ops.icmpv6_err_convert(type, code, &err); #endif } /* * RFC1122: OK. Passes ICMP errors back to application, as per * 4.1.3.3. */ if ((family == AF_INET && !inet_test_bit(RECVERR, sk)) || (family == AF_INET6 && !inet6_test_bit(RECVERR6, sk))) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; } else { if (family == AF_INET) { ip_icmp_error(sk, skb, err, 0 /* no remote port */, info, (u8 *)icmph); #if IS_ENABLED(CONFIG_IPV6) } else if (family == AF_INET6) { pingv6_ops.ipv6_icmp_error(sk, skb, err, 0, info, (u8 *)icmph); #endif } } sk->sk_err = err; sk_error_report(sk); out: return; } /* * Copy and checksum an ICMP Echo packet from user space into a buffer * starting from the payload. */ int ping_getfrag(void *from, char *to, int offset, int fraglen, int odd, struct sk_buff *skb) { struct pingfakehdr *pfh = from; if (!csum_and_copy_from_iter_full(to, fraglen, &pfh->wcheck, &pfh->msg->msg_iter)) return -EFAULT; #if IS_ENABLED(CONFIG_IPV6) /* For IPv6, checksum each skb as we go along, as expected by * icmpv6_push_pending_frames. For IPv4, accumulate the checksum in * wcheck, it will be finalized in ping_v4_push_pending_frames. */ if (pfh->family == AF_INET6) { skb->csum = csum_block_add(skb->csum, pfh->wcheck, odd); skb->ip_summed = CHECKSUM_NONE; pfh->wcheck = 0; } #endif return 0; } static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh, struct flowi4 *fl4) { struct sk_buff *skb = skb_peek(&sk->sk_write_queue); if (!skb) return 0; pfh->wcheck = csum_partial((char *)&pfh->icmph, sizeof(struct icmphdr), pfh->wcheck); pfh->icmph.checksum = csum_fold(pfh->wcheck); memcpy(icmp_hdr(skb), &pfh->icmph, sizeof(struct icmphdr)); skb->ip_summed = CHECKSUM_NONE; return ip_push_pending_frames(sk, fl4); } int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, void *user_icmph, size_t icmph_len) { u8 type, code; if (len > 0xFFFF) return -EMSGSIZE; /* Must have at least a full ICMP header. */ if (len < icmph_len) return -EINVAL; /* * Check the flags. */ /* Mirror BSD error message compatibility */ if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; /* * Fetch the ICMP header provided by the userland. * iovec is modified! The ICMP header is consumed. 
*/ if (memcpy_from_msg(user_icmph, msg, icmph_len)) return -EFAULT; if (family == AF_INET) { type = ((struct icmphdr *) user_icmph)->type; code = ((struct icmphdr *) user_icmph)->code; #if IS_ENABLED(CONFIG_IPV6) } else if (family == AF_INET6) { type = ((struct icmp6hdr *) user_icmph)->icmp6_type; code = ((struct icmp6hdr *) user_icmph)->icmp6_code; #endif } else { BUG(); } if (!ping_supported(family, type, code)) return -EINVAL; return 0; } static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { DEFINE_RAW_FLEX(struct ip_options_rcu, opt_copy, opt.__data, IP_OPTIONS_DATA_FIXED_SIZE); struct net *net = sock_net(sk); struct flowi4 fl4; struct inet_sock *inet = inet_sk(sk); struct ipcm_cookie ipc; struct icmphdr user_icmph; struct pingfakehdr pfh; struct rtable *rt = NULL; int free = 0; __be32 saddr, daddr, faddr; u8 scope; int err; pr_debug("ping_v4_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); err = ping_common_sendmsg(AF_INET, msg, len, &user_icmph, sizeof(user_icmph)); if (err) return err; /* * Get and verify the address. */ if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) return -EAFNOSUPPORT; daddr = usin->sin_addr.s_addr; /* no remote port */ } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; daddr = inet->inet_daddr; /* no remote port */ } ipcm_init_sk(&ipc, inet); if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); if (unlikely(err)) { kfree(ipc.opt); return err; } if (ipc.opt) free = 1; } if (!ipc.opt) { struct ip_options_rcu *inet_opt; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { memcpy(opt_copy, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); ipc.opt = opt_copy; } rcu_read_unlock(); } saddr = ipc.addr; ipc.addr = faddr = daddr; if (ipc.opt && ipc.opt->opt.srr) { if (!daddr) { err = -EINVAL; goto out_free; } faddr = ipc.opt->opt.faddr; } scope = ip_sendmsg_scope(inet, &ipc, msg); if (ipv4_is_multicast(daddr)) { if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = READ_ONCE(inet->mc_index); if (!saddr) saddr = READ_ONCE(inet->mc_addr); } else if (!ipc.oif) ipc.oif = READ_ONCE(inet->uc_index); flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, ipc.tos & INET_DSCP_MASK, scope, sk->sk_protocol, inet_sk_flowi_flags(sk), faddr, saddr, 0, 0, sk_uid(sk)); fl4.fl4_icmp_type = user_icmph.type; fl4.fl4_icmp_code = user_icmph.code; security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; if (err == -ENETUNREACH) IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); goto out; } err = -EACCES; if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) goto out; if (msg->msg_flags & MSG_CONFIRM) goto do_confirm; back_from_confirm: if (!ipc.addr) ipc.addr = fl4.daddr; lock_sock(sk); pfh.icmph.type = user_icmph.type; /* already checked */ pfh.icmph.code = user_icmph.code; /* ditto */ pfh.icmph.checksum = 0; pfh.icmph.un.echo.id = inet->inet_sport; pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence; pfh.msg = msg; pfh.wcheck = 0; pfh.family = AF_INET; err = ip_append_data(sk, &fl4, ping_getfrag, &pfh, len, sizeof(struct icmphdr), &ipc, &rt, msg->msg_flags); if (err) ip_flush_pending_frames(sk); else err = ping_v4_push_pending_frames(sk, &pfh, &fl4); release_sock(sk); out: ip_rt_put(rt); out_free: if (free) kfree(ipc.opt); if (!err) return len; return err; 
do_confirm: if (msg->msg_flags & MSG_PROBE) dst_confirm_neigh(&rt->dst, &fl4.daddr); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; goto out; } int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { struct inet_sock *isk = inet_sk(sk); int family = sk->sk_family; struct sk_buff *skb; int copied, err; pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, READ_ONCE(isk->inet_num)); err = -EOPNOTSUPP; if (flags & MSG_OOB) goto out; if (flags & MSG_ERRQUEUE) return inet_recv_error(sk, msg, len); skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } /* Don't bother checking the checksum */ err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; sock_recv_timestamp(msg, sk, skb); /* Copy the address and add cmsg data. */ if (family == AF_INET) { DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); if (sin) { sin->sin_family = AF_INET; sin->sin_port = 0 /* skb->h.uh->source */; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); msg->msg_namelen = sizeof(*sin); } if (inet_cmsg_flags(isk)) ip_cmsg_recv(msg, skb); #if IS_ENABLED(CONFIG_IPV6) } else if (family == AF_INET6) { struct ipv6hdr *ip6 = ipv6_hdr(skb); DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); if (sin6) { sin6->sin6_family = AF_INET6; sin6->sin6_port = 0; sin6->sin6_addr = ip6->saddr; sin6->sin6_flowinfo = 0; if (inet6_test_bit(SNDFLOW, sk)) sin6->sin6_flowinfo = ip6_flowinfo(ip6); sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, inet6_iif(skb)); msg->msg_namelen = sizeof(*sin6); } if (inet6_sk(sk)->rxopt.all) pingv6_ops.ip6_datagram_recv_common_ctl(sk, msg, skb); if (skb->protocol == htons(ETH_P_IPV6) && inet6_sk(sk)->rxopt.all) pingv6_ops.ip6_datagram_recv_specific_ctl(sk, msg, skb); else if (skb->protocol == htons(ETH_P_IP) && inet_cmsg_flags(isk)) ip_cmsg_recv(msg, skb); #endif } else { BUG(); } err = copied; done: skb_free_datagram(sk, skb); out: pr_debug("ping_recvmsg -> %d\n", err); return err; } static enum skb_drop_reason __ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason reason; pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n", inet_sk(sk), inet_sk(sk)->inet_num, skb); reason = sock_queue_rcv_skb_reason(sk, skb); if (reason) { sk_skb_reason_drop(sk, skb, reason); pr_debug("ping_queue_rcv_skb -> failed\n"); return reason; } return SKB_NOT_DROPPED_YET; } int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { return __ping_queue_rcv_skb(sk, skb) ? -1 : 0; } /* * All we need to do is get the socket. 
*/ enum skb_drop_reason ping_rcv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct icmphdr *icmph = icmp_hdr(skb); struct sock *sk; /* We assume the packet has already been checked by icmp_rcv */ pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n", skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); /* Push ICMP header back */ skb_push(skb, skb->data - (u8 *)icmph); sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); if (sk) return __ping_queue_rcv_skb(sk, skb); kfree_skb_reason(skb, SKB_DROP_REASON_NO_SOCKET); return SKB_DROP_REASON_NO_SOCKET; } struct proto ping_prot = { .name = "PING", .owner = THIS_MODULE, .init = ping_init_sock, .close = ping_close, .pre_connect = ping_pre_connect, .connect = ip4_datagram_connect, .disconnect = __udp_disconnect, .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, .sendmsg = ping_v4_sendmsg, .recvmsg = ping_recvmsg, .bind = ping_bind, .backlog_rcv = ping_queue_rcv_skb, .release_cb = ip4_datagram_release_cb, .unhash = ping_unhash, .get_port = ping_get_port, .put_port = ping_unhash, .obj_size = sizeof(struct inet_sock), }; #ifdef CONFIG_PROC_FS static struct sock *ping_get_first(struct seq_file *seq, int start) { struct sock *sk; struct ping_iter_state *state = seq->private; struct net *net = seq_file_net(seq); for (state->bucket = start; state->bucket < PING_HTABLE_SIZE; ++state->bucket) { struct hlist_head *hslot; hslot = &ping_table.hash[state->bucket]; if (hlist_empty(hslot)) continue; sk_for_each(sk, hslot) { if (net_eq(sock_net(sk), net) && sk->sk_family == state->family) goto found; } } sk = NULL; found: return sk; } static struct sock *ping_get_next(struct seq_file *seq, struct sock *sk) { struct ping_iter_state *state = seq->private; struct net *net = seq_file_net(seq); do { sk = sk_next(sk); } while (sk && (!net_eq(sock_net(sk), net))); if (!sk) return ping_get_first(seq, state->bucket + 1); return sk; } static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos) { struct sock *sk = ping_get_first(seq, 0); if (sk) while (pos && (sk = ping_get_next(seq, sk)) != NULL) --pos; return pos ? NULL : sk; } void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family) __acquires(ping_table.lock) { struct ping_iter_state *state = seq->private; state->bucket = 0; state->family = family; spin_lock(&ping_table.lock); return *pos ? 
ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN; } static void *ping_v4_seq_start(struct seq_file *seq, loff_t *pos) { return ping_seq_start(seq, pos, AF_INET); } void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock *sk; if (v == SEQ_START_TOKEN) sk = ping_get_idx(seq, 0); else sk = ping_get_next(seq, v); ++*pos; return sk; } void ping_seq_stop(struct seq_file *seq, void *v) __releases(ping_table.lock) { spin_unlock(&ping_table.lock); } static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, int bucket) { struct inet_sock *inet = inet_sk(sp); __be32 dest = inet->inet_daddr; __be32 src = inet->inet_rcv_saddr; __u16 destp = ntohs(inet->inet_dport); __u16 srcp = ntohs(inet->inet_sport); seq_printf(f, "%5d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5u %8d %llu %d %pK %u", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), 0, 0L, 0, from_kuid_munged(seq_user_ns(f), sk_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, sk_drops_read(sp)); } static int ping_v4_seq_show(struct seq_file *seq, void *v) { seq_setwidth(seq, 127); if (v == SEQ_START_TOKEN) seq_puts(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode ref pointer drops"); else { struct ping_iter_state *state = seq->private; ping_v4_format_sock(v, seq, state->bucket); } seq_pad(seq, '\n'); return 0; } static const struct seq_operations ping_v4_seq_ops = { .start = ping_v4_seq_start, .show = ping_v4_seq_show, .next = ping_seq_next, .stop = ping_seq_stop, }; static int __net_init ping_v4_proc_init_net(struct net *net) { if (!proc_create_net("icmp", 0444, net->proc_net, &ping_v4_seq_ops, sizeof(struct ping_iter_state))) return -ENOMEM; net->ipv4.ping_port_rover = get_random_u16(); return 0; } static void __net_exit ping_v4_proc_exit_net(struct net *net) { remove_proc_entry("icmp", net->proc_net); } static struct pernet_operations ping_v4_net_ops = { .init = ping_v4_proc_init_net, .exit = ping_v4_proc_exit_net, }; int __init ping_proc_init(void) { return register_pernet_subsys(&ping_v4_net_ops); } void ping_proc_exit(void) { unregister_pernet_subsys(&ping_v4_net_ops); } #endif void __init ping_init(void) { int i; for (i = 0; i < PING_HTABLE_SIZE; i++) INIT_HLIST_HEAD(&ping_table.hash[i]); spin_lock_init(&ping_table.lock); } |
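Everything above is driven from userspace through the ordinary socket API. The following is a minimal, hedged sketch of that flow, not taken from the kernel tree: it assumes the net.ipv4.ping_group_range sysctl has been widened to include the caller's group (otherwise socket() fails with EACCES via the GID range check above), and it relies on the behaviour shown in ping_v4_sendmsg()/ping_v4_push_pending_frames(), which overwrite the echo identifier with the socket's local "port" and finalize the checksum, so both can be left zero. The destination and buffer size are arbitrary and error handling is abbreviated.

/*
 * Userspace sketch of an unprivileged ICMP Echo ("ping") socket.
 */
#include <stdio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/ip_icmp.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
	};
	struct icmphdr req = {
		.type = ICMP_ECHO,		/* only echo requests pass ping_supported() */
		.un.echo.sequence = htons(1),	/* id and checksum are filled in by the kernel */
	};
	char reply[192];
	int fd;

	fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP);
	if (fd < 0) {
		perror("socket");		/* EACCES if ping_group_range excludes us */
		return 1;
	}
	if (sendto(fd, &req, sizeof(req), 0,
		   (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("sendto");
	else if (recv(fd, reply, sizeof(reply), 0) < 0)
		perror("recv");
	else
		printf("received echo reply\n");
	close(fd);
	return 0;
}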
// SPDX-License-Identifier: GPL-2.0-only /* * jump label support * * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> * Copyright (C) 2011 Peter Zijlstra * */ #include <linux/memory.h> #include <linux/uaccess.h> #include <linux/module.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/sort.h> #include <linux/err.h> #include <linux/static_key.h> #include <linux/jump_label_ratelimit.h> #include <linux/bug.h> #include <linux/cpu.h> #include <asm/sections.h> /* mutex to protect coming/going of the jump_label table */ static DEFINE_MUTEX(jump_label_mutex); void jump_label_lock(void) { mutex_lock(&jump_label_mutex); } void jump_label_unlock(void) { mutex_unlock(&jump_label_mutex); } static int jump_label_cmp(const void *a, const void *b) { const struct jump_entry *jea = a; const struct jump_entry *jeb = b; /* * Entries are sorted by key. */ if (jump_entry_key(jea) < jump_entry_key(jeb)) return -1; if (jump_entry_key(jea) > jump_entry_key(jeb)) return 1; /* * In the batching mode, entries should also be sorted by the code * inside the already sorted list of entries, enabling a bsearch in * the vector. */ if (jump_entry_code(jea) < jump_entry_code(jeb)) return -1; if (jump_entry_code(jea) > jump_entry_code(jeb)) return 1; return 0; } static void jump_label_swap(void *a, void *b, int size) { long delta = (unsigned long)a - (unsigned long)b; struct jump_entry *jea = a; struct jump_entry *jeb = b; struct jump_entry tmp = *jea; jea->code = jeb->code - delta; jea->target = jeb->target - delta; jea->key = jeb->key - delta; jeb->code = tmp.code + delta; jeb->target = tmp.target + delta; jeb->key = tmp.key + delta; } static void jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) { unsigned long size; void *swapfn = NULL; if (IS_ENABLED(CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE)) swapfn = jump_label_swap; size = (((unsigned long)stop - (unsigned long)start) / sizeof(struct jump_entry)); sort(start, size, sizeof(struct jump_entry), jump_label_cmp, swapfn); } static void jump_label_update(struct static_key *key); /* * There are similar definitions for the !CONFIG_JUMP_LABEL case in jump_label.h. * The use of 'atomic_read()' requires atomic.h and it's problematic for some * kernel headers such as kernel.h and others. Since static_key_count() is not * used in the branch statements as it is for the !CONFIG_JUMP_LABEL case it's ok * to have it be a function here. Similarly, for 'static_key_enable()' and * 'static_key_disable()', which require bug.h. This should allow jump_label.h * to be included from most/all places for CONFIG_JUMP_LABEL. */ int static_key_count(struct static_key *key) { /* * -1 means the first static_key_slow_inc() is in progress. * static_key_enabled() must return true, so return 1 here. */ int n = atomic_read(&key->enabled); return n >= 0 ? n : 1; } EXPORT_SYMBOL_GPL(static_key_count); /* * static_key_fast_inc_not_disabled - adds a user for a static key * @key: static key that must be already enabled * * The caller must make sure that the static key can't get disabled while * in this function. It doesn't patch jump labels, only adds a user to * an already enabled static key. * * Returns true if the increment was done. Unlike refcount_t the ref counter * is not saturated, but will fail to increment on overflow.
*/ bool static_key_fast_inc_not_disabled(struct static_key *key) { int v; STATIC_KEY_CHECK_USE(key); /* * Negative key->enabled has a special meaning: it sends * static_key_slow_inc/dec() down the slow path, and it is non-zero * so it counts as "enabled" in jump_label_update(). * * The INT_MAX overflow condition is either used by the networking * code to reset or detected in the slow path of * static_key_slow_inc_cpuslocked(). */ v = atomic_read(&key->enabled); do { if (v <= 0 || v == INT_MAX) return false; } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1))); return true; } EXPORT_SYMBOL_GPL(static_key_fast_inc_not_disabled); bool static_key_slow_inc_cpuslocked(struct static_key *key) { lockdep_assert_cpus_held(); /* * Careful if we get concurrent static_key_slow_inc/dec() calls; * later calls must wait for the first one to _finish_ the * jump_label_update() process. At the same time, however, * the jump_label_update() call below wants to see * static_key_enabled(&key) for jumps to be updated properly. */ if (static_key_fast_inc_not_disabled(key)) return true; guard(mutex)(&jump_label_mutex); /* Try to mark it as 'enabling in progress. */ if (!atomic_cmpxchg(&key->enabled, 0, -1)) { jump_label_update(key); /* * Ensure that when static_key_fast_inc_not_disabled() or * static_key_dec_not_one() observe the positive value, * they must also observe all the text changes. */ atomic_set_release(&key->enabled, 1); } else { /* * While holding the mutex this should never observe * anything else than a value >= 1 and succeed */ if (WARN_ON_ONCE(!static_key_fast_inc_not_disabled(key))) return false; } return true; } bool static_key_slow_inc(struct static_key *key) { bool ret; cpus_read_lock(); ret = static_key_slow_inc_cpuslocked(key); cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(static_key_slow_inc); void static_key_enable_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(key); lockdep_assert_cpus_held(); if (atomic_read(&key->enabled) > 0) { WARN_ON_ONCE(atomic_read(&key->enabled) != 1); return; } jump_label_lock(); if (atomic_read(&key->enabled) == 0) { atomic_set(&key->enabled, -1); jump_label_update(key); /* * See static_key_slow_inc(). */ atomic_set_release(&key->enabled, 1); } jump_label_unlock(); } EXPORT_SYMBOL_GPL(static_key_enable_cpuslocked); void static_key_enable(struct static_key *key) { cpus_read_lock(); static_key_enable_cpuslocked(key); cpus_read_unlock(); } EXPORT_SYMBOL_GPL(static_key_enable); void static_key_disable_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(key); lockdep_assert_cpus_held(); if (atomic_read(&key->enabled) != 1) { WARN_ON_ONCE(atomic_read(&key->enabled) != 0); return; } jump_label_lock(); if (atomic_cmpxchg(&key->enabled, 1, 0) == 1) jump_label_update(key); jump_label_unlock(); } EXPORT_SYMBOL_GPL(static_key_disable_cpuslocked); void static_key_disable(struct static_key *key) { cpus_read_lock(); static_key_disable_cpuslocked(key); cpus_read_unlock(); } EXPORT_SYMBOL_GPL(static_key_disable); static bool static_key_dec_not_one(struct static_key *key) { int v; /* * Go into the slow path if key::enabled is less than or equal than * one. One is valid to shut down the key, anything less than one * is an imbalance, which is handled at the call site. * * That includes the special case of '-1' which is set in * static_key_slow_inc_cpuslocked(), but that's harmless as it is * fully serialized in the slow path below. 
By the time this task * acquires the jump label lock the value is back to one and the * retry under the lock must succeed. */ v = atomic_read(&key->enabled); do { /* * Warn about the '-1' case though; since that means a * decrement is concurrent with a first (0->1) increment. IOW * people are trying to disable something that wasn't yet fully * enabled. This suggests an ordering problem on the user side. */ WARN_ON_ONCE(v < 0); /* * Warn about underflow, and lie about success in an attempt to * not make things worse. */ if (WARN_ON_ONCE(v == 0)) return true; if (v <= 1) return false; } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v - 1))); return true; } static void __static_key_slow_dec_cpuslocked(struct static_key *key) { lockdep_assert_cpus_held(); int val; if (static_key_dec_not_one(key)) return; guard(mutex)(&jump_label_mutex); val = atomic_read(&key->enabled); /* * It should be impossible to observe -1 with jump_label_mutex held, * see static_key_slow_inc_cpuslocked(). */ if (WARN_ON_ONCE(val == -1)) return; /* * Cannot already be 0, something went sideways. */ if (WARN_ON_ONCE(val == 0)) return; if (atomic_dec_and_test(&key->enabled)) jump_label_update(key); } static void __static_key_slow_dec(struct static_key *key) { cpus_read_lock(); __static_key_slow_dec_cpuslocked(key); cpus_read_unlock(); } void jump_label_update_timeout(struct work_struct *work) { struct static_key_deferred *key = container_of(work, struct static_key_deferred, work.work); __static_key_slow_dec(&key->key); } EXPORT_SYMBOL_GPL(jump_label_update_timeout); void static_key_slow_dec(struct static_key *key) { STATIC_KEY_CHECK_USE(key); __static_key_slow_dec(key); } EXPORT_SYMBOL_GPL(static_key_slow_dec); void static_key_slow_dec_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(key); __static_key_slow_dec_cpuslocked(key); } void __static_key_slow_dec_deferred(struct static_key *key, struct delayed_work *work, unsigned long timeout) { STATIC_KEY_CHECK_USE(key); if (static_key_dec_not_one(key)) return; schedule_delayed_work(work, timeout); } EXPORT_SYMBOL_GPL(__static_key_slow_dec_deferred); void __static_key_deferred_flush(void *key, struct delayed_work *work) { STATIC_KEY_CHECK_USE(key); flush_delayed_work(work); } EXPORT_SYMBOL_GPL(__static_key_deferred_flush); void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { STATIC_KEY_CHECK_USE(key); key->timeout = rl; INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); } EXPORT_SYMBOL_GPL(jump_label_rate_limit); static int addr_conflict(struct jump_entry *entry, void *start, void *end) { if (jump_entry_code(entry) <= (unsigned long)end && jump_entry_code(entry) + jump_entry_size(entry) > (unsigned long)start) return 1; return 0; } static int __jump_label_text_reserved(struct jump_entry *iter_start, struct jump_entry *iter_stop, void *start, void *end, bool init) { struct jump_entry *iter; iter = iter_start; while (iter < iter_stop) { if (init || !jump_entry_is_init(iter)) { if (addr_conflict(iter, start, end)) return 1; } iter++; } return 0; } #ifndef arch_jump_label_transform_static static void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type) { /* nothing to do on most architectures */ } #endif static inline struct jump_entry *static_key_entries(struct static_key *key) { WARN_ON_ONCE(key->type & JUMP_TYPE_LINKED); return (struct jump_entry *)(key->type & ~JUMP_TYPE_MASK); } static inline bool static_key_type(struct static_key *key) { return key->type & JUMP_TYPE_TRUE; } static inline bool 
static_key_linked(struct static_key *key) { return key->type & JUMP_TYPE_LINKED; } static inline void static_key_clear_linked(struct static_key *key) { key->type &= ~JUMP_TYPE_LINKED; } static inline void static_key_set_linked(struct static_key *key) { key->type |= JUMP_TYPE_LINKED; } /*** * A 'struct static_key' uses a union such that it either points directly * to a table of 'struct jump_entry' or to a linked list of modules which in * turn point to 'struct jump_entry' tables. * * The two lower bits of the pointer are used to keep track of which pointer * type is in use and to store the initial branch direction, we use an access * function which preserves these bits. */ static void static_key_set_entries(struct static_key *key, struct jump_entry *entries) { unsigned long type; WARN_ON_ONCE((unsigned long)entries & JUMP_TYPE_MASK); type = key->type & JUMP_TYPE_MASK; key->entries = entries; key->type |= type; } static enum jump_label_type jump_label_type(struct jump_entry *entry) { struct static_key *key = jump_entry_key(entry); bool enabled = static_key_enabled(key); bool branch = jump_entry_is_branch(entry); /* See the comment in linux/jump_label.h */ return enabled ^ branch; } static bool jump_label_can_update(struct jump_entry *entry, bool init) { /* * Cannot update code that was in an init text area. */ if (!init && jump_entry_is_init(entry)) return false; if (!kernel_text_address(jump_entry_code(entry))) { /* * This skips patching built-in __exit, which * is part of init_section_contains() but is * not part of kernel_text_address(). * * Skipping built-in __exit is fine since it * will never be executed. */ WARN_ONCE(!jump_entry_is_init(entry), "can't patch jump_label at %pS", (void *)jump_entry_code(entry)); return false; } return true; } #ifndef HAVE_JUMP_LABEL_BATCH static void __jump_label_update(struct static_key *key, struct jump_entry *entry, struct jump_entry *stop, bool init) { for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { if (jump_label_can_update(entry, init)) arch_jump_label_transform(entry, jump_label_type(entry)); } } #else static void __jump_label_update(struct static_key *key, struct jump_entry *entry, struct jump_entry *stop, bool init) { for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { if (!jump_label_can_update(entry, init)) continue; if (!arch_jump_label_transform_queue(entry, jump_label_type(entry))) { /* * Queue is full: Apply the current queue and try again. 
*/ arch_jump_label_transform_apply(); BUG_ON(!arch_jump_label_transform_queue(entry, jump_label_type(entry))); } } arch_jump_label_transform_apply(); } #endif void __init jump_label_init(void) { struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; struct static_key *key = NULL; struct jump_entry *iter; if (static_key_initialized) return; cpus_read_lock(); jump_label_lock(); jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; bool in_init; /* rewrite NOPs */ if (jump_label_type(iter) == JUMP_LABEL_NOP) arch_jump_label_transform_static(iter, JUMP_LABEL_NOP); in_init = init_section_contains((void *)jump_entry_code(iter), 1); jump_entry_set_init(iter, in_init); iterk = jump_entry_key(iter); if (iterk == key) continue; key = iterk; static_key_set_entries(key, iter); } static_key_initialized = true; jump_label_unlock(); cpus_read_unlock(); } static inline bool static_key_sealed(struct static_key *key) { return (key->type & JUMP_TYPE_LINKED) && !(key->type & ~JUMP_TYPE_MASK); } static inline void static_key_seal(struct static_key *key) { unsigned long type = key->type & JUMP_TYPE_TRUE; key->type = JUMP_TYPE_LINKED | type; } void jump_label_init_ro(void) { struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; struct jump_entry *iter; if (WARN_ON_ONCE(!static_key_initialized)) return; cpus_read_lock(); jump_label_lock(); for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk = jump_entry_key(iter); if (!is_kernel_ro_after_init((unsigned long)iterk)) continue; if (static_key_sealed(iterk)) continue; static_key_seal(iterk); } jump_label_unlock(); cpus_read_unlock(); } #ifdef CONFIG_MODULES enum jump_label_type jump_label_init_type(struct jump_entry *entry) { struct static_key *key = jump_entry_key(entry); bool type = static_key_type(key); bool branch = jump_entry_is_branch(entry); /* See the comment in linux/jump_label.h */ return type ^ branch; } struct static_key_mod { struct static_key_mod *next; struct jump_entry *entries; struct module *mod; }; static inline struct static_key_mod *static_key_mod(struct static_key *key) { WARN_ON_ONCE(!static_key_linked(key)); return (struct static_key_mod *)(key->type & ~JUMP_TYPE_MASK); } /*** * key->type and key->next are the same via union. * This sets key->next and preserves the type bits. * * See additional comments above static_key_set_entries(). 
*/ static void static_key_set_mod(struct static_key *key, struct static_key_mod *mod) { unsigned long type; WARN_ON_ONCE((unsigned long)mod & JUMP_TYPE_MASK); type = key->type & JUMP_TYPE_MASK; key->next = mod; key->type |= type; } static int __jump_label_mod_text_reserved(void *start, void *end) { struct module *mod; int ret; scoped_guard(rcu) { mod = __module_text_address((unsigned long)start); WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); if (!try_module_get(mod)) mod = NULL; } if (!mod) return 0; ret = __jump_label_text_reserved(mod->jump_entries, mod->jump_entries + mod->num_jump_entries, start, end, mod->state == MODULE_STATE_COMING); module_put(mod); return ret; } static void __jump_label_mod_update(struct static_key *key) { struct static_key_mod *mod; for (mod = static_key_mod(key); mod; mod = mod->next) { struct jump_entry *stop; struct module *m; /* * NULL if the static_key is defined in a module * that does not use it */ if (!mod->entries) continue; m = mod->mod; if (!m) stop = __stop___jump_table; else stop = m->jump_entries + m->num_jump_entries; __jump_label_update(key, mod->entries, stop, m && m->state == MODULE_STATE_COMING); } } static int jump_label_add_module(struct module *mod) { struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; struct static_key *key = NULL; struct static_key_mod *jlm, *jlm2; /* if the module doesn't have jump label entries, just return */ if (iter_start == iter_stop) return 0; jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; bool in_init; in_init = within_module_init(jump_entry_code(iter), mod); jump_entry_set_init(iter, in_init); iterk = jump_entry_key(iter); if (iterk == key) continue; key = iterk; if (within_module((unsigned long)key, mod)) { static_key_set_entries(key, iter); continue; } /* * If the key was sealed at init, then there's no need to keep a * reference to its module entries - just patch them now and be * done with it. */ if (static_key_sealed(key)) goto do_poke; jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); if (!jlm) return -ENOMEM; if (!static_key_linked(key)) { jlm2 = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); if (!jlm2) { kfree(jlm); return -ENOMEM; } scoped_guard(rcu) jlm2->mod = __module_address((unsigned long)key); jlm2->entries = static_key_entries(key); jlm2->next = NULL; static_key_set_mod(key, jlm2); static_key_set_linked(key); } jlm->mod = mod; jlm->entries = iter; jlm->next = static_key_mod(key); static_key_set_mod(key, jlm); static_key_set_linked(key); /* Only update if we've changed from our initial state */ do_poke: if (jump_label_type(iter) != jump_label_init_type(iter)) __jump_label_update(key, iter, iter_stop, true); } return 0; } static void jump_label_del_module(struct module *mod) { struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; struct static_key *key = NULL; struct static_key_mod *jlm, **prev; for (iter = iter_start; iter < iter_stop; iter++) { if (jump_entry_key(iter) == key) continue; key = jump_entry_key(iter); if (within_module((unsigned long)key, mod)) continue; /* No @jlm allocated because key was sealed at init. 
*/ if (static_key_sealed(key)) continue; /* No memory during module load */ if (WARN_ON(!static_key_linked(key))) continue; prev = &key->next; jlm = static_key_mod(key); while (jlm && jlm->mod != mod) { prev = &jlm->next; jlm = jlm->next; } /* No memory during module load */ if (WARN_ON(!jlm)) continue; if (prev == &key->next) static_key_set_mod(key, jlm->next); else *prev = jlm->next; kfree(jlm); jlm = static_key_mod(key); /* if only one entry is left, fold it back into the static_key */ if (jlm->next == NULL) { static_key_set_entries(key, jlm->entries); static_key_clear_linked(key); kfree(jlm); } } } static int jump_label_module_notify(struct notifier_block *self, unsigned long val, void *data) { struct module *mod = data; int ret = 0; cpus_read_lock(); jump_label_lock(); switch (val) { case MODULE_STATE_COMING: ret = jump_label_add_module(mod); if (ret) { WARN(1, "Failed to allocate memory: jump_label may not work properly.\n"); jump_label_del_module(mod); } break; case MODULE_STATE_GOING: jump_label_del_module(mod); break; } jump_label_unlock(); cpus_read_unlock(); return notifier_from_errno(ret); } static struct notifier_block jump_label_module_nb = { .notifier_call = jump_label_module_notify, .priority = 1, /* higher than tracepoints */ }; static __init int jump_label_init_module(void) { return register_module_notifier(&jump_label_module_nb); } early_initcall(jump_label_init_module); #endif /* CONFIG_MODULES */ /*** * jump_label_text_reserved - check if addr range is reserved * @start: start text addr * @end: end text addr * * checks if the text addr located between @start and @end * overlaps with any of the jump label patch addresses. Code * that wants to modify kernel text should first verify that * it does not overlap with any of the jump label addresses. * Caller must hold jump_label_mutex.
* * returns 1 if there is an overlap, 0 otherwise */ int jump_label_text_reserved(void *start, void *end) { bool init = system_state < SYSTEM_RUNNING; int ret = __jump_label_text_reserved(__start___jump_table, __stop___jump_table, start, end, init); if (ret) return ret; #ifdef CONFIG_MODULES ret = __jump_label_mod_text_reserved(start, end); #endif return ret; } static void jump_label_update(struct static_key *key) { struct jump_entry *stop = __stop___jump_table; bool init = system_state < SYSTEM_RUNNING; struct jump_entry *entry; #ifdef CONFIG_MODULES struct module *mod; if (static_key_linked(key)) { __jump_label_mod_update(key); return; } scoped_guard(rcu) { mod = __module_address((unsigned long)key); if (mod) { stop = mod->jump_entries + mod->num_jump_entries; init = mod->state == MODULE_STATE_COMING; } } #endif entry = static_key_entries(key); /* if there are no users, entry can be NULL */ if (entry) __jump_label_update(key, entry, stop, init); } #ifdef CONFIG_STATIC_KEYS_SELFTEST static DEFINE_STATIC_KEY_TRUE(sk_true); static DEFINE_STATIC_KEY_FALSE(sk_false); static __init int jump_label_test(void) { int i; for (i = 0; i < 2; i++) { WARN_ON(static_key_enabled(&sk_true.key) != true); WARN_ON(static_key_enabled(&sk_false.key) != false); WARN_ON(!static_branch_likely(&sk_true)); WARN_ON(!static_branch_unlikely(&sk_true)); WARN_ON(static_branch_likely(&sk_false)); WARN_ON(static_branch_unlikely(&sk_false)); static_branch_disable(&sk_true); static_branch_enable(&sk_false); WARN_ON(static_key_enabled(&sk_true.key) == true); WARN_ON(static_key_enabled(&sk_false.key) == false); WARN_ON(static_branch_likely(&sk_true)); WARN_ON(static_branch_unlikely(&sk_true)); WARN_ON(!static_branch_likely(&sk_false)); WARN_ON(!static_branch_unlikely(&sk_false)); static_branch_enable(&sk_true); static_branch_disable(&sk_false); } return 0; } early_initcall(jump_label_test); #endif /* STATIC_KEYS_SELFTEST */ |
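For orientation, here is a hedged sketch of a typical consumer of the interface implemented above; it mirrors what the selftest exercises. The identifiers sample_feature_key, sample_hot_path() and sample_feature_set() are invented for illustration, while DEFINE_STATIC_KEY_FALSE(), static_branch_unlikely() and static_branch_enable()/static_branch_disable() are the static-key API whose enable/disable slow paths end up in static_key_enable()/static_key_slow_inc() and, ultimately, jump_label_update() above.

/*
 * Illustrative consumer (not part of this file): a default-off key tested
 * in a hot path and flipped from sleepable context. The branch site
 * compiles to a NOP until the key is enabled and the site is patched.
 */
#include <linux/jump_label.h>

static DEFINE_STATIC_KEY_FALSE(sample_feature_key);

static void sample_hot_path(void)
{
	if (static_branch_unlikely(&sample_feature_key)) {
		/* rarely enabled extra work goes here */
	}
}

static void sample_feature_set(bool on)
{
	/* may sleep: takes cpus_read_lock() and jump_label_mutex */
	if (on)
		static_branch_enable(&sample_feature_key);
	else
		static_branch_disable(&sample_feature_key);
}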
/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2025 Google LLC */ #undef TRACE_SYSTEM #define TRACE_SYSTEM usbcore #if !defined(_USB_CORE_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) #define _USB_CORE_TRACE_H #include <linux/types.h> #include <linux/tracepoint.h> #include <linux/usb.h> DECLARE_EVENT_CLASS(usb_core_log_usb_device, TP_PROTO(struct usb_device *udev), TP_ARGS(udev), TP_STRUCT__entry( __string(name, dev_name(&udev->dev)) __field(enum usb_device_speed, speed) __field(enum usb_device_state, state) __field(unsigned short, bus_mA) __field(unsigned, authorized) ), TP_fast_assign( __assign_str(name); __entry->speed = udev->speed; __entry->state = udev->state; __entry->bus_mA = udev->bus_mA; __entry->authorized = udev->authorized; ), TP_printk("usb %s speed %s state %s %dmA [%s]", __get_str(name), usb_speed_string(__entry->speed), usb_state_string(__entry->state), __entry->bus_mA, __entry->authorized ? "authorized" : "unauthorized") ); DEFINE_EVENT(usb_core_log_usb_device, usb_set_device_state, TP_PROTO(struct usb_device *udev), TP_ARGS(udev) ); DEFINE_EVENT(usb_core_log_usb_device, usb_alloc_dev, TP_PROTO(struct usb_device *udev), TP_ARGS(udev) ); #endif /* _USB_CORE_TRACE_H */ /* this part has to be here */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h>
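Nothing in this header emits the events: DEFINE_EVENT() only generates trace_usb_alloc_dev() and trace_usb_set_device_state() wrappers for call sites elsewhere in usbcore. A hedged sketch of such a call site follows; the function name is hypothetical, and the standard tracepoint convention applies: exactly one .c file defines CREATE_TRACE_POINTS before including the header so the tracepoints are instantiated.

/* Hypothetical usbcore compilation unit; the real emitters live elsewhere. */
#define CREATE_TRACE_POINTS
#include "trace.h"

static void usb_core_trace_example(struct usb_device *udev)
{
	/*
	 * Wrapper generated from DEFINE_EVENT(usb_core_log_usb_device,
	 * usb_alloc_dev, ...): records the device name, speed, state,
	 * bus_mA and authorized flag via the TP_fast_assign() block above.
	 */
	trace_usb_alloc_dev(udev);
}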
// SPDX-License-Identifier: GPL-2.0-or-later /* * Linux INET6 implementation * Forwarding Information Database * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Changes: * Yuji SEKIYA @USAGI: Support default route on router node; * remove ip6_null_entry from the top of * routing table. * Ville Nuorvala: Fixed routing subtrees.
*/ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/bpf.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/net.h> #include <linux/route.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/lwtunnel.h> #include <net/fib_notifier.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> static struct kmem_cache *fib6_node_kmem __read_mostly; struct fib6_cleaner { struct fib6_walker w; struct net *net; int (*func)(struct fib6_info *, void *arg); int sernum; void *arg; bool skip_notify; }; #ifdef CONFIG_IPV6_SUBTREES #define FWS_INIT FWS_S #else #define FWS_INIT FWS_L #endif static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_table *table, struct fib6_node *fn); static int fib6_walk(struct net *net, struct fib6_walker *w); static int fib6_walk_continue(struct fib6_walker *w); /* * A routing update causes an increase of the serial number on the * affected subtree. This allows for cached routes to be asynchronously * tested when modifications are made to the destination cache as a * result of redirects, path MTU changes, etc. */ static void fib6_gc_timer_cb(struct timer_list *t); #define FOR_WALKERS(net, w) \ list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh) static void fib6_walker_link(struct net *net, struct fib6_walker *w) { write_lock_bh(&net->ipv6.fib6_walker_lock); list_add(&w->lh, &net->ipv6.fib6_walkers); write_unlock_bh(&net->ipv6.fib6_walker_lock); } static void fib6_walker_unlink(struct net *net, struct fib6_walker *w) { write_lock_bh(&net->ipv6.fib6_walker_lock); list_del(&w->lh); write_unlock_bh(&net->ipv6.fib6_walker_lock); } static int fib6_new_sernum(struct net *net) { int new, old = atomic_read(&net->ipv6.fib6_sernum); do { new = old < INT_MAX ? old + 1 : 1; } while (!atomic_try_cmpxchg(&net->ipv6.fib6_sernum, &old, new)); return new; } enum { FIB6_NO_SERNUM_CHANGE = 0, }; void fib6_update_sernum(struct net *net, struct fib6_info *f6i) { struct fib6_node *fn; fn = rcu_dereference_protected(f6i->fib6_node, lockdep_is_held(&f6i->fib6_table->tb6_lock)); if (fn) WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net)); } /* * Auxiliary address test functions for the radix tree. * * These assume a 32bit processor (although it will work on * 64bit processors) */ /* * test bit */ #if defined(__LITTLE_ENDIAN) # define BITOP_BE32_SWIZZLE (0x1F & ~7) #else # define BITOP_BE32_SWIZZLE 0 #endif static __be32 addr_bit_set(const void *token, int fn_bit) { const __be32 *addr = token; /* * Here, * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f) * is optimized version of * htonl(1 << ((~fn_bit)&0x1F)) * See include/asm-generic/bitops/le.h. 
*/ return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) & addr[fn_bit >> 5]; } struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh) { struct fib6_info *f6i; size_t sz = sizeof(*f6i); if (with_fib6_nh) sz += sizeof(struct fib6_nh); f6i = kzalloc(sz, gfp_flags); if (!f6i) return NULL; /* fib6_siblings is a union with nh_list, so this initializes both */ INIT_LIST_HEAD(&f6i->fib6_siblings); refcount_set(&f6i->fib6_ref, 1); INIT_HLIST_NODE(&f6i->gc_link); return f6i; } void fib6_info_destroy_rcu(struct rcu_head *head) { struct fib6_info *f6i = container_of(head, struct fib6_info, rcu); WARN_ON(f6i->fib6_node); if (f6i->nh) nexthop_put(f6i->nh); else fib6_nh_release(f6i->fib6_nh); ip_fib_metrics_put(f6i->fib6_metrics); kfree(f6i); } EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu); static struct fib6_node *node_alloc(struct net *net) { struct fib6_node *fn; fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC); if (fn) net->ipv6.rt6_stats->fib_nodes++; return fn; } static void node_free_immediate(struct net *net, struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); net->ipv6.rt6_stats->fib_nodes--; } static void node_free(struct net *net, struct fib6_node *fn) { kfree_rcu(fn, rcu); net->ipv6.rt6_stats->fib_nodes--; } static void fib6_free_table(struct fib6_table *table) { inetpeer_invalidate_tree(&table->tb6_peers); kfree(table); } static void fib6_link_table(struct net *net, struct fib6_table *tb) { unsigned int h; /* * Initialize table lock at a single place to give lockdep a key, * tables aren't visible prior to being linked to the list. */ spin_lock_init(&tb->tb6_lock); h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1); /* * No protection necessary, this is the only list mutatation * operation, tables never disappear once they exist. */ hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]); } #ifdef CONFIG_IPV6_MULTIPLE_TABLES static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) { struct fib6_table *table; table = kzalloc_obj(*table, GFP_ATOMIC); if (table) { table->tb6_id = id; rcu_assign_pointer(table->tb6_root.leaf, net->ipv6.fib6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); INIT_HLIST_HEAD(&table->tb6_gc_hlist); } return table; } struct fib6_table *fib6_new_table(struct net *net, u32 id) { struct fib6_table *tb, *new_tb; if (id == 0) id = RT6_TABLE_MAIN; tb = fib6_get_table(net, id); if (tb) return tb; new_tb = fib6_alloc_table(net, id); if (!new_tb) return NULL; spin_lock_bh(&net->ipv6.fib_table_hash_lock); tb = fib6_get_table(net, id); if (unlikely(tb)) { spin_unlock_bh(&net->ipv6.fib_table_hash_lock); kfree(new_tb); return tb; } fib6_link_table(net, new_tb); spin_unlock_bh(&net->ipv6.fib_table_hash_lock); return new_tb; } EXPORT_SYMBOL_GPL(fib6_new_table); struct fib6_table *fib6_get_table(struct net *net, u32 id) { struct hlist_head *head; struct fib6_table *tb; if (!id) id = RT6_TABLE_MAIN; head = &net->ipv6.fib_table_hash[id & (FIB6_TABLE_HASHSZ - 1)]; /* See comment in fib6_link_table(). RCU is not required, * but rcu_dereference_raw() is used to avoid data-race. 
*/ hlist_for_each_entry_rcu(tb, head, tb6_hlist, true) if (tb->tb6_id == id) return tb; return NULL; } EXPORT_SYMBOL_GPL(fib6_get_table); static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); fib6_link_table(net, net->ipv6.fib6_local_tbl); } #else struct fib6_table *fib6_new_table(struct net *net, u32 id) { return fib6_get_table(net, id); } struct fib6_table *fib6_get_table(struct net *net, u32 id) { return net->ipv6.fib6_main_tbl; } struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup) { struct rt6_info *rt; rt = pol_lookup_func(lookup, net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error == -EAGAIN) { ip6_rt_put_flags(rt, flags); rt = net->ipv6.ip6_null_entry; if (!(flags & RT6_LOOKUP_F_DST_NOREF)) dst_hold(&rt->dst); } return &rt->dst; } /* called with rcu lock held; no reference taken on fib6_info */ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, struct fib6_result *res, int flags) { return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, res, flags); } #if IS_MODULE(CONFIG_NFT_FIB_IPV6) EXPORT_SYMBOL_GPL(fib6_lookup); #endif static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); } #endif unsigned int fib6_tables_seq_read(const struct net *net) { unsigned int h, fib_seq = 0; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { const struct hlist_head *head = &net->ipv6.fib_table_hash[h]; const struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) fib_seq += READ_ONCE(tb->fib_seq); } rcu_read_unlock(); return fib_seq; } static int call_fib6_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, }; return call_fib6_notifier(nb, event_type, &info.info); } static int call_fib6_multipath_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, .nsiblings = nsiblings, }; return call_fib6_notifier(nb, event_type, &info.info); } int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, }; WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, event_type, &info.info); } int call_fib6_multipath_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, .nsiblings = nsiblings, }; WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, event_type, &info.info); } int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt) { struct fib6_entry_notifier_info info = { .rt = rt, .nsiblings = rt->fib6_nsiblings, }; WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info); } struct fib6_dump_arg { struct net *net; struct notifier_block *nb; struct netlink_ext_ack *extack; }; static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) { enum fib_event_type 
fib_event = FIB_EVENT_ENTRY_REPLACE; unsigned int nsiblings; int err; if (!rt || rt == arg->net->ipv6.fib6_null_entry) return 0; nsiblings = READ_ONCE(rt->fib6_nsiblings); if (nsiblings) err = call_fib6_multipath_entry_notifier(arg->nb, fib_event, rt, nsiblings, arg->extack); else err = call_fib6_entry_notifier(arg->nb, fib_event, rt, arg->extack); return err; } static int fib6_node_dump(struct fib6_walker *w) { int err; err = fib6_rt_dump(w->leaf, w->args); w->leaf = NULL; return err; } static int fib6_table_dump(struct net *net, struct fib6_table *tb, struct fib6_walker *w) { int err; w->root = &tb->tb6_root; spin_lock_bh(&tb->tb6_lock); err = fib6_walk(net, w); spin_unlock_bh(&tb->tb6_lock); return err; } /* Called with rcu_read_lock() */ int fib6_tables_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct fib6_dump_arg arg; struct fib6_walker *w; unsigned int h; int err = 0; w = kzalloc_obj(*w, GFP_ATOMIC); if (!w) return -ENOMEM; w->func = fib6_node_dump; arg.net = net; arg.nb = nb; arg.extack = extack; w->args = &arg; for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { err = fib6_table_dump(net, tb, w); if (err) goto out; } } out: kfree(w); /* The tree traversal function should never return a positive value. */ return err > 0 ? -EINVAL : err; } static int fib6_dump_node(struct fib6_walker *w) { int res; struct fib6_info *rt; for_each_fib6_walker_rt(w) { res = rt6_dump_route(rt, w->args, w->skip_in_node); if (res >= 0) { /* Frame is full, suspend walking */ w->leaf = rt; /* We'll restart from this node, so if some routes were * already dumped, skip them next time. */ w->skip_in_node += res; return 1; } w->skip_in_node = 0; /* Multipath routes are dumped in one route with the * RTA_MULTIPATH attribute. Jump 'rt' to point to the * last sibling of this route (no need to dump the * sibling routes again) */ if (rt->fib6_nsiblings) rt = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); } w->leaf = NULL; return 0; } static void fib6_dump_end(struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct fib6_walker *w = (void *)cb->args[2]; if (w) { if (cb->args[4]) { cb->args[4] = 0; fib6_walker_unlink(net, w); } cb->args[2] = 0; kfree(w); } cb->done = (void *)cb->args[3]; cb->args[1] = 3; } static int fib6_dump_done(struct netlink_callback *cb) { fib6_dump_end(cb); return cb->done ? 
cb->done(cb) : 0; } static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct fib6_walker *w; int res; w = (void *)cb->args[2]; w->root = &table->tb6_root; if (cb->args[4] == 0) { w->count = 0; w->skip = 0; w->skip_in_node = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk(net, w); spin_unlock_bh(&table->tb6_lock); if (res > 0) { cb->args[4] = 1; cb->args[5] = READ_ONCE(w->root->fn_sernum); } } else { int sernum = READ_ONCE(w->root->fn_sernum); if (cb->args[5] != sernum) { /* Begin at the root if the tree changed */ cb->args[5] = sernum; w->state = FWS_INIT; w->node = w->root; w->skip = w->count; w->skip_in_node = 0; } else w->skip = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk_continue(w); spin_unlock_bh(&table->tb6_lock); if (res <= 0) { fib6_walker_unlink(net, w); cb->args[4] = 0; } } return res; } static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true, .filter.dump_routes = true, .filter.rtnl_held = false, }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int e = 0, s_e; struct hlist_head *head; struct fib6_walker *w; struct fib6_table *tb; unsigned int h, s_h; int err = 0; rcu_read_lock(); if (cb->strict_check) { err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb); if (err < 0) goto unlock; } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); if (rtm->rtm_flags & RTM_F_PREFIX) arg.filter.flags = RTM_F_PREFIX; } w = (void *)cb->args[2]; if (!w) { /* New dump: * * 1. allocate and initialize walker. */ w = kzalloc_obj(*w, GFP_ATOMIC); if (!w) { err = -ENOMEM; goto unlock; } w->func = fib6_dump_node; cb->args[2] = (long)w; /* 2. hook callback destructor. */ cb->args[3] = (long)cb->done; cb->done = fib6_dump_done; } arg.skb = skb; arg.cb = cb; arg.net = net; w->args = &arg; if (arg.filter.table_id) { tb = fib6_get_table(net, arg.filter.table_id); if (!tb) { if (rtnl_msg_family(cb->nlh) != PF_INET6) goto unlock; NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist"); err = -ENOENT; goto unlock; } if (!cb->args[0]) { err = fib6_dump_table(tb, skb, cb); if (!err) cb->args[0] = 1; } goto unlock; } s_h = cb->args[0]; s_e = cb->args[1]; for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (e < s_e) goto next; err = fib6_dump_table(tb, skb, cb); if (err != 0) goto out; next: e++; } } out: cb->args[1] = e; cb->args[0] = h; unlock: rcu_read_unlock(); if (err <= 0) fib6_dump_end(cb); return err; } void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val) { struct dst_metrics *m; if (!f6i) return; if (READ_ONCE(f6i->fib6_metrics) == &dst_default_metrics) { struct dst_metrics *dflt = (struct dst_metrics *)&dst_default_metrics; struct dst_metrics *p = kzalloc_obj(*p, GFP_ATOMIC); if (!p) return; p->metrics[metric - 1] = val; refcount_set(&p->refcnt, 1); if (cmpxchg(&f6i->fib6_metrics, dflt, p) != dflt) kfree(p); else return; } m = READ_ONCE(f6i->fib6_metrics); WRITE_ONCE(m->metrics[metric - 1], val); } /* * Routing Table * * return the appropriate node for a routing tree "add" operation * by either creating and inserting or by returning an existing * node. 
*/ static struct fib6_node *fib6_add_1(struct net *net, struct fib6_table *table, struct fib6_node *root, struct in6_addr *addr, int plen, int offset, int allow_create, int replace_required, struct netlink_ext_ack *extack) { struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; struct rt6key *key; int bit; __be32 dir = 0; /* insert node in tree */ fn = root; do { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) { if (!allow_create) { if (replace_required) { NL_SET_ERR_MSG(extack, "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } pr_warn("NLM_F_CREATE should be set when creating new route\n"); } goto insert_above; } /* * Exact match ? */ if (plen == fn->fn_bit) { /* clean up an intermediate node */ if (!(fn->fn_flags & RTN_RTINFO)) { RCU_INIT_POINTER(fn->leaf, NULL); fib6_info_release(leaf); /* remove null_entry in the root node */ } else if (fn->fn_flags & RTN_TL_ROOT && rcu_access_pointer(fn->leaf) == net->ipv6.fib6_null_entry) { RCU_INIT_POINTER(fn->leaf, NULL); } return fn; } /* * We have more bits to go */ /* Try to walk down on tree. */ dir = addr_bit_set(addr, fn->fn_bit); pn = fn; fn = dir ? rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)) : rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); } while (fn); if (!allow_create) { /* We should not create new node because * NLM_F_REPLACE was specified without NLM_F_CREATE * I assume it is safe to require NLM_F_CREATE when * REPLACE flag is used! Later we may want to remove the * check for replace_required, because according * to netlink specification, NLM_F_CREATE * MUST be specified if new route is created. * That would keep IPv6 consistent with IPv4 */ if (replace_required) { NL_SET_ERR_MSG(extack, "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } pr_warn("NLM_F_CREATE should be set when creating new route\n"); } /* * We walked to the bottom of tree. * Create new leaf node without children. */ ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, pn); if (dir) rcu_assign_pointer(pn->right, ln); else rcu_assign_pointer(pn->left, ln); return ln; insert_above: /* * split since we don't have a common prefix anymore or * we have a less significant route. * we've to insert an intermediate node on the list * this new node will point to the one we need to create * and the current */ pn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); /* find 1st bit in difference between the 2 addrs. See comment in __ipv6_addr_diff: bit may be an invalid value, but if it is >= plen, the value is ignored in any case. */ bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr)); /* * (intermediate)[in] * / \ * (new leaf node)[ln] (old node)[fn] */ if (plen > bit) { in = node_alloc(net); ln = node_alloc(net); if (!in || !ln) { if (in) node_free_immediate(net, in); if (ln) node_free_immediate(net, ln); return ERR_PTR(-ENOMEM); } /* * new intermediate node. 
* RTN_RTINFO will * be off since that an address that chooses one of * the branches would not match less specific routes * in the other branch */ in->fn_bit = bit; RCU_INIT_POINTER(in->parent, pn); in->leaf = fn->leaf; fib6_info_hold(rcu_dereference_protected(in->leaf, lockdep_is_held(&table->tb6_lock))); /* update parent pointer */ if (dir) rcu_assign_pointer(pn->right, in); else rcu_assign_pointer(pn->left, in); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, in); rcu_assign_pointer(fn->parent, in); if (addr_bit_set(addr, bit)) { rcu_assign_pointer(in->right, ln); rcu_assign_pointer(in->left, fn); } else { rcu_assign_pointer(in->left, ln); rcu_assign_pointer(in->right, fn); } } else { /* plen <= bit */ /* * (new leaf node)[ln] * / \ * (old node)[fn] NULL */ ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, pn); if (addr_bit_set(&key->addr, plen)) RCU_INIT_POINTER(ln->right, fn); else RCU_INIT_POINTER(ln->left, fn); rcu_assign_pointer(fn->parent, ln); if (dir) rcu_assign_pointer(pn->right, ln); else rcu_assign_pointer(pn->left, ln); } return ln; } static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, const struct fib6_info *match) { int cpu; if (!fib6_nh->rt6i_pcpu) return; rcu_read_lock(); /* release the reference to this fib entry from * all of its cached pcpu routes */ for_each_possible_cpu(cpu) { struct rt6_info **ppcpu_rt; struct rt6_info *pcpu_rt; ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); /* Paired with xchg() in rt6_get_pcpu_route() */ pcpu_rt = READ_ONCE(*ppcpu_rt); /* only dropping the 'from' reference if the cached route * is using 'match'. The cached pcpu_rt->from only changes * from a fib6_info to NULL (ip6_dst_destroy); it can never * change from one fib6_info reference to another */ if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) { struct fib6_info *from; from = unrcu_pointer(xchg(&pcpu_rt->from, NULL)); fib6_info_release(from); } } rcu_read_unlock(); } static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg) { struct fib6_info *arg = _arg; __fib6_drop_pcpu_from(nh, arg); return 0; } static void fib6_drop_pcpu_from(struct fib6_info *f6i) { /* Make sure rt6_make_pcpu_route() wont add other percpu routes * while we are cleaning them here. */ f6i->fib6_destroying = 1; mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ if (f6i->nh) { rcu_read_lock(); nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, f6i); rcu_read_unlock(); } else { struct fib6_nh *fib6_nh; fib6_nh = f6i->fib6_nh; __fib6_drop_pcpu_from(fib6_nh, f6i); } } static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net) { struct fib6_table *table = rt->fib6_table; /* Flush all cached dst in exception table */ rt6_flush_exceptions(rt); fib6_drop_pcpu_from(rt); if (rt->nh) { spin_lock(&rt->nh->lock); if (!list_empty(&rt->nh_list)) list_del_init(&rt->nh_list); spin_unlock(&rt->nh->lock); } if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split * nodes. It is not leaked, but it still holds other resources, * which must be released in time. So, scan ascendant nodes * and replace dummy references to this route with references * to still alive ones. 
*/ while (fn) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_leaf; if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { new_leaf = fib6_find_prefix(net, table, fn); fib6_info_hold(new_leaf); rcu_assign_pointer(fn->leaf, new_leaf); fib6_info_release(rt); } fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); } } fib6_clean_expires(rt); fib6_remove_gc_list(rt); } /* * Insert routing information in a node. */ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack, struct list_head *purge_list) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); struct fib6_info *iter = NULL; struct fib6_info __rcu **ins; struct fib6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); int add = (!info->nlh || (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); bool notify_sibling_rt = false; u16 nlflags = NLM_F_EXCL; int err; if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND)) nlflags |= NLM_F_APPEND; ins = &fn->leaf; for (iter = leaf; iter; iter = rcu_dereference_protected(iter->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock))) { /* * Search for duplicates */ if (iter->fib6_metric == rt->fib6_metric) { /* * Same priority level */ if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_EXCL)) return -EEXIST; nlflags &= ~NLM_F_EXCL; if (replace) { if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { found++; break; } fallback_ins = fallback_ins ?: ins; goto next_iter; } if (rt6_duplicate_nexthop(iter, rt)) { if (rt->fib6_nsiblings) WRITE_ONCE(rt->fib6_nsiblings, 0); if (!(iter->fib6_flags & RTF_EXPIRES)) return -EEXIST; if (!(rt->fib6_flags & RTF_EXPIRES)) { fib6_clean_expires(iter); fib6_may_remove_gc_list(info->nl_net, iter); } else { fib6_set_expires(iter, rt->expires); fib6_add_gc_list(iter); } if (!(rt->fib6_flags & (RTF_ADDRCONF | RTF_PREFIX_RT)) && (iter->nh || !iter->fib6_nh->fib_nh_gw_family)) { iter->fib6_flags &= ~RTF_ADDRCONF; iter->fib6_flags &= ~RTF_PREFIX_RT; } if (rt->fib6_pmtu) fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); return -EEXIST; } /* If we have the same destination and the same metric, * but not the same gateway, then the route we try to * add is sibling to this route, increment our counter * of siblings, and later we will add our route to the * list. * Only static routes (which don't have flag * RTF_EXPIRES) are used for ECMPv6. * * To avoid long list, we only had siblings if the * route have a gateway. */ if (rt_can_ecmp && rt6_qualify_for_ecmp(iter)) WRITE_ONCE(rt->fib6_nsiblings, rt->fib6_nsiblings + 1); } if (iter->fib6_metric > rt->fib6_metric) break; next_iter: ins = &iter->fib6_next; } if (fallback_ins && !found) { /* No matching route with same ecmp-able-ness found, replace * first matching route */ ins = fallback_ins; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); found++; } /* Reset round-robin state, if necessary */ if (ins == &fn->leaf) fn->rr_ptr = NULL; /* Link this route to others same route. 
*/ if (rt->fib6_nsiblings) { unsigned int fib6_nsiblings; struct fib6_info *sibling, *temp_sibling; /* Find the first route that have the same metric */ sibling = leaf; notify_sibling_rt = true; while (sibling) { if (sibling->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(sibling)) { list_add_tail_rcu(&rt->fib6_siblings, &sibling->fib6_siblings); break; } sibling = rcu_dereference_protected(sibling->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); notify_sibling_rt = false; } /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings * is broken! */ fib6_nsiblings = 0; list_for_each_entry_safe(sibling, temp_sibling, &rt->fib6_siblings, fib6_siblings) { WRITE_ONCE(sibling->fib6_nsiblings, sibling->fib6_nsiblings + 1); BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings); fib6_nsiblings++; } BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); rcu_read_lock(); rt6_multipath_rebalance(temp_sibling); rcu_read_unlock(); } /* * insert node */ if (!replace) { if (!add) pr_warn("NLM_F_CREATE should be set when creating new route\n"); add: nlflags |= NLM_F_CREATE; /* The route should only be notified if it is the first * route in the node or if it is added as a sibling * route to the first route in the node. */ if (!info->skip_notify_kernel && (notify_sibling_rt || ins == &fn->leaf)) { enum fib_event_type fib_event; if (notify_sibling_rt) fib_event = FIB_EVENT_ENTRY_APPEND; else fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib6_entry_notifiers(info->nl_net, fib_event, rt, extack); if (err) { struct fib6_info *sibling, *next_sibling; /* If the route has siblings, then it first * needs to be unlinked from them. */ if (!rt->fib6_nsiblings) return err; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) WRITE_ONCE(sibling->fib6_nsiblings, sibling->fib6_nsiblings - 1); WRITE_ONCE(rt->fib6_nsiblings, 0); list_del_rcu(&rt->fib6_siblings); rcu_read_lock(); rt6_multipath_rebalance(next_sibling); rcu_read_unlock(); return err; } } rcu_assign_pointer(rt->fib6_next, iter); fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } } else { int nsiblings; if (!found) { if (add) goto add; pr_warn("NLM_F_REPLACE set, but no existing node found!\n"); return -ENOENT; } if (!info->skip_notify_kernel && ins == &fn->leaf) { err = call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt, extack); if (err) return err; } fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rt->fib6_next = iter->fib6_next; rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } nsiblings = iter->fib6_nsiblings; iter->fib6_node = NULL; list_add(&iter->purge_link, purge_list); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ ins = &rt->fib6_next; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { if (iter->fib6_metric > rt->fib6_metric) break; if (rt6_qualify_for_ecmp(iter)) { *ins = iter->fib6_next; iter->fib6_node = NULL; list_add(&iter->purge_link, purge_list); 
if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; nsiblings--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { ins = &iter->fib6_next; } iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); } WARN_ON(nsiblings != 0); } } return 0; } static int fib6_add_rt2node_nh(struct fib6_node *fn, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack, struct list_head *purge_list) { int err; spin_lock(&rt->nh->lock); if (rt->nh->dead) { NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); err = -EINVAL; } else { err = fib6_add_rt2node(fn, rt, info, extack, purge_list); if (!err) list_add(&rt->nh_list, &rt->nh->f6i_list); } spin_unlock(&rt->nh->lock); return err; } static void fib6_start_gc(struct net *net, struct fib6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && (rt->fib6_flags & RTF_EXPIRES)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_interval)); } void fib6_force_start_gc(struct net *net) { if (!timer_pending(&net->ipv6.ip6_fib_timer)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_interval)); } static void __fib6_update_sernum_upto_root(struct fib6_info *rt, int sernum) { struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&rt->fib6_table->tb6_lock)); /* paired with smp_rmb() in fib6_get_cookie_safe() */ smp_wmb(); while (fn) { WRITE_ONCE(fn->fn_sernum, sernum); fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&rt->fib6_table->tb6_lock)); } } void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt) { __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); } /* * Add routing information to the routing tree. * <destination addr>/<source addr> * with source addr info in sub-trees * Need to own table->tb6_lock */ int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->fib6_table; LIST_HEAD(purge_list); struct fib6_node *fn; #ifdef CONFIG_IPV6_SUBTREES struct fib6_node *pn = NULL; #endif int err = -ENOMEM; int allow_create = 1; int replace_required = 0; if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) allow_create = 0; if (info->nlh->nlmsg_flags & NLM_F_REPLACE) replace_required = 1; } if (!allow_create && !replace_required) pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); fn = fib6_add_1(info->nl_net, table, root, &rt->fib6_dst.addr, rt->fib6_dst.plen, offsetof(struct fib6_info, fib6_dst), allow_create, replace_required, extack); if (IS_ERR(fn)) { err = PTR_ERR(fn); fn = NULL; goto out; } #ifdef CONFIG_IPV6_SUBTREES pn = fn; if (rt->fib6_src.plen) { struct fib6_node *sn; if (!rcu_access_pointer(fn->subtree)) { struct fib6_node *sfn; /* * Create subtree. * * fn[main tree] * | * sfn[subtree root] * \ * sn[new leaf node] */ /* Create subtree root node */ sfn = node_alloc(info->nl_net); if (!sfn) goto failure; fib6_info_hold(info->nl_net->ipv6.fib6_null_entry); rcu_assign_pointer(sfn->leaf, info->nl_net->ipv6.fib6_null_entry); sfn->fn_flags = RTN_ROOT; /* Now add the first leaf node to new subtree */ sn = fib6_add_1(info->nl_net, table, sfn, &rt->fib6_src.addr, rt->fib6_src.plen, offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { /* If it is failed, discard just allocated root, and then (in failure) stale node in main tree. 
*/ node_free_immediate(info->nl_net, sfn); err = PTR_ERR(sn); goto failure; } /* Now link new subtree to main tree */ rcu_assign_pointer(sfn->parent, fn); rcu_assign_pointer(fn->subtree, sfn); } else { sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn), &rt->fib6_src.addr, rt->fib6_src.plen, offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { err = PTR_ERR(sn); goto failure; } } if (!rcu_access_pointer(fn->leaf)) { if (fn->fn_flags & RTN_TL_ROOT) { /* put back null_entry for root node */ rcu_assign_pointer(fn->leaf, info->nl_net->ipv6.fib6_null_entry); } else { fib6_info_hold(rt); rcu_assign_pointer(fn->leaf, rt); } } fn = sn; } #endif if (rt->nh) err = fib6_add_rt2node_nh(fn, rt, info, extack, &purge_list); else err = fib6_add_rt2node(fn, rt, info, extack, &purge_list); if (!err) { struct fib6_info *iter, *next; list_for_each_entry_safe(iter, next, &purge_list, purge_link) { list_del(&iter->purge_link); fib6_purge_rt(iter, fn, info->nl_net); fib6_info_release(iter); } __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); if (rt->fib6_flags & RTF_EXPIRES) fib6_add_gc_list(rt); fib6_start_gc(info->nl_net, rt); } out: if (err) { #ifdef CONFIG_IPV6_SUBTREES /* * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node we have to find a new one for it. */ if (pn != fn) { struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); if (pn_leaf == rt) { pn_leaf = NULL; RCU_INIT_POINTER(pn->leaf, NULL); fib6_info_release(rt); } if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { pn_leaf = fib6_find_prefix(info->nl_net, table, pn); if (!pn_leaf) pn_leaf = info->nl_net->ipv6.fib6_null_entry; fib6_info_hold(pn_leaf); rcu_assign_pointer(pn->leaf, pn_leaf); } } #endif goto failure; } else if (fib6_requires_src(rt)) { fib6_routes_require_src_inc(info->nl_net); } return err; failure: /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if: * 1. fn is an intermediate node and we failed to add the new * route to it in both subtree creation failure and fib6_add_rt2node() * failure case. * 2. fn is the root node in the table and we fail to add the first * default route to it. */ if (fn && (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) || (fn->fn_flags & RTN_TL_ROOT && !rcu_access_pointer(fn->leaf)))) fib6_repair_tree(info->nl_net, table, fn); return err; } /* * Routing tree lookup * */ struct lookup_args { int offset; /* key offset on fib6_info */ const struct in6_addr *addr; /* search key */ }; static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root, struct lookup_args *args) { struct fib6_node *fn; __be32 dir; if (unlikely(args->offset == 0)) return NULL; /* * Descend on a tree */ fn = root; for (;;) { struct fib6_node *next; dir = addr_bit_set(args->addr, fn->fn_bit); next = dir ? 
rcu_dereference(fn->right) : rcu_dereference(fn->left); if (next) { fn = next; continue; } break; } while (fn) { struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree || fn->fn_flags & RTN_RTINFO) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; if (!leaf) goto backtrack; key = (struct rt6key *) ((u8 *)leaf + args->offset); if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { #ifdef CONFIG_IPV6_SUBTREES if (subtree) { struct fib6_node *sfn; sfn = fib6_node_lookup_1(subtree, args + 1); if (!sfn) goto backtrack; fn = sfn; } #endif if (fn->fn_flags & RTN_RTINFO) return fn; } } backtrack: if (fn->fn_flags & RTN_ROOT) break; fn = rcu_dereference(fn->parent); } return NULL; } /* called with rcu_read_lock() held */ struct fib6_node *fib6_node_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct fib6_node *fn; struct lookup_args args[] = { { .offset = offsetof(struct fib6_info, fib6_dst), .addr = daddr, }, #ifdef CONFIG_IPV6_SUBTREES { .offset = offsetof(struct fib6_info, fib6_src), .addr = saddr, }, #endif { .offset = 0, /* sentinel */ } }; fn = fib6_node_lookup_1(root, daddr ? args : args + 1); if (!fn || fn->fn_flags & RTN_TL_ROOT) fn = root; return fn; } /* * Get node with specified destination prefix (and source prefix, * if subtrees are used) * exact_match == true means we try to find fn with exact match of * the passed in prefix addr * exact_match == false means we try to find fn with longest prefix * match of the passed in prefix addr. This is useful for finding fn * for cached route as it will be stored in the exception table under * the node with longest prefix length. */ static struct fib6_node *fib6_locate_1(struct fib6_node *root, const struct in6_addr *addr, int plen, int offset, bool exact_match) { struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; /* This node is being deleted */ if (!leaf) { if (plen <= fn->fn_bit) goto out; else goto next; } key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) goto out; if (plen == fn->fn_bit) return fn; if (fn->fn_flags & RTN_RTINFO) prev = fn; next: /* * We have more bits to go */ if (addr_bit_set(addr, fn->fn_bit)) fn = rcu_dereference(fn->right); else fn = rcu_dereference(fn->left); } out: if (exact_match) return NULL; else return prev; } struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, const struct in6_addr *saddr, int src_len, bool exact_match) { struct fib6_node *fn; fn = fib6_locate_1(root, daddr, dst_len, offsetof(struct fib6_info, fib6_dst), exact_match); #ifdef CONFIG_IPV6_SUBTREES if (src_len) { WARN_ON(saddr == NULL); if (fn) { struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree) { fn = fib6_locate_1(subtree, saddr, src_len, offsetof(struct fib6_info, fib6_src), exact_match); } } } #endif if (fn && fn->fn_flags & RTN_RTINFO) return fn; return NULL; } /* * Deletion * */ static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn) { struct fib6_node *child_left, *child_right; if (fn->fn_flags & RTN_ROOT) return net->ipv6.fib6_null_entry; while (fn) { child_left = rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); child_right = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); if (child_left) return rcu_dereference_protected(child_left->leaf, 
lockdep_is_held(&table->tb6_lock)); if (child_right) return rcu_dereference_protected(child_right->leaf, lockdep_is_held(&table->tb6_lock)); fn = FIB6_SUBTREE(fn); } return NULL; } /* * Called to trim the tree of intermediate nodes when possible. "fn" * is the node we want to try and remove. * Need to own table->tb6_lock */ static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_table *table, struct fib6_node *fn) { int children; int nstate; struct fib6_node *child; struct fib6_walker *w; int iter = 0; /* Set fn->leaf to null_entry for root node. */ if (fn->fn_flags & RTN_TL_ROOT) { rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry); return fn; } for (;;) { struct fib6_node *fn_r = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); struct fib6_node *fn_l = rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_r = rcu_dereference_protected(pn->right, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_l = rcu_dereference_protected(pn->left, lockdep_is_held(&table->tb6_lock)); struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_fn_leaf; pr_debug("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; WARN_ON(fn->fn_flags & RTN_RTINFO); WARN_ON(fn->fn_flags & RTN_TL_ROOT); WARN_ON(fn_leaf); children = 0; child = NULL; if (fn_r) { child = fn_r; children |= 1; } if (fn_l) { child = fn_l; children |= 2; } if (children == 3 || FIB6_SUBTREE(fn) #ifdef CONFIG_IPV6_SUBTREES /* Subtree root (i.e. fn) may have one child */ || (children && fn->fn_flags & RTN_ROOT) #endif ) { new_fn_leaf = fib6_find_prefix(net, table, fn); #if RT6_DEBUG >= 2 if (!new_fn_leaf) { WARN_ON(!new_fn_leaf); new_fn_leaf = net->ipv6.fib6_null_entry; } #endif fib6_info_hold(new_fn_leaf); rcu_assign_pointer(fn->leaf, new_fn_leaf); return pn; } #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); RCU_INIT_POINTER(pn->subtree, NULL); nstate = FWS_L; } else { WARN_ON(fn->fn_flags & RTN_ROOT); #endif if (pn_r == fn) rcu_assign_pointer(pn->right, child); else if (pn_l == fn) rcu_assign_pointer(pn->left, child); #if RT6_DEBUG >= 2 else WARN_ON(1); #endif if (child) rcu_assign_pointer(child->parent, pn); nstate = FWS_R; #ifdef CONFIG_IPV6_SUBTREES } #endif read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (!child) { if (w->node == fn) { pr_debug("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); w->node = pn; w->state = nstate; } } else { if (w->node == fn) { w->node = child; if (children&2) { pr_debug("W %p adjusted by delnode 2, s=%d\n", w, w->state); w->state = w->state >= FWS_R ? FWS_U : FWS_INIT; } else { pr_debug("W %p adjusted by delnode 2, s=%d\n", w, w->state); w->state = w->state >= FWS_C ? 
FWS_U : FWS_INIT; } } } } read_unlock(&net->ipv6.fib6_walker_lock); node_free(net, fn); if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) return pn; RCU_INIT_POINTER(pn->leaf, NULL); fib6_info_release(pn_leaf); fn = pn; } } static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, struct fib6_info __rcu **rtp, struct nl_info *info) { struct fib6_info *leaf, *replace_rt = NULL; struct fib6_walker *w; struct fib6_info *rt = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); struct net *net = info->nl_net; bool notify_del = false; /* If the deleted route is the first in the node and it is not part of * a multipath route, then we need to replace it with the next route * in the node, if exists. */ leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); if (leaf == rt && !rt->fib6_nsiblings) { if (rcu_access_pointer(rt->fib6_next)) replace_rt = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); else notify_del = true; } /* Unlink it */ *rtp = rt->fib6_next; rt->fib6_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; /* Reset round-robin state, if necessary */ if (rcu_access_pointer(fn->rr_ptr) == rt) fn->rr_ptr = NULL; /* Remove this entry from other siblings */ if (rt->fib6_nsiblings) { struct fib6_info *sibling, *next_sibling; /* The route is deleted from a multipath route. If this * multipath route is the first route in the node, then we need * to emit a delete notification. Otherwise, we need to skip * the notification. */ if (rt->fib6_metric == leaf->fib6_metric && rt6_qualify_for_ecmp(leaf)) notify_del = true; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) WRITE_ONCE(sibling->fib6_nsiblings, sibling->fib6_nsiblings - 1); WRITE_ONCE(rt->fib6_nsiblings, 0); list_del_rcu(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); } /* Adjust walkers */ read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { pr_debug("walker %p adjusted by delroute\n", w); w->leaf = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); if (!w->leaf) w->state = FWS_U; } } read_unlock(&net->ipv6.fib6_walker_lock); /* If it was last route, call fib6_repair_tree() to: * 1. For root node, put back null_entry as how the table was created. * 2. For other nodes, expunge its radix tree node. 
*/ if (!rcu_access_pointer(fn->leaf)) { if (!(fn->fn_flags & RTN_TL_ROOT)) { fn->fn_flags &= ~RTN_RTINFO; net->ipv6.rt6_stats->fib_route_nodes--; } fn = fib6_repair_tree(net, table, fn); } fib6_purge_rt(rt, fn, net); if (!info->skip_notify_kernel) { if (notify_del) call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); else if (replace_rt) call_fib6_entry_notifiers_replace(net, replace_rt); } if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); fib6_info_release(rt); } /* Need to own table->tb6_lock */ int fib6_del(struct fib6_info *rt, struct nl_info *info) { struct net *net = info->nl_net; struct fib6_info __rcu **rtp; struct fib6_info __rcu **rtp_next; struct fib6_table *table; struct fib6_node *fn; if (rt == net->ipv6.fib6_null_entry) return -ENOENT; table = rt->fib6_table; fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&table->tb6_lock)); if (!fn) return -ENOENT; WARN_ON(!(fn->fn_flags & RTN_RTINFO)); /* * Walk the leaf entries looking for ourself */ for (rtp = &fn->leaf; *rtp; rtp = rtp_next) { struct fib6_info *cur = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); if (rt == cur) { if (fib6_requires_src(cur)) fib6_routes_require_src_dec(info->nl_net); fib6_del_route(table, fn, rtp, info); return 0; } rtp_next = &cur->fib6_next; } return -ENOENT; } /* * Tree traversal function. * * Certainly, it is not interrupt safe. * However, it is internally reenterable wrt itself and fib6_add/fib6_del. * It means, that we can modify tree during walking * and use this function for garbage collection, clone pruning, * cleaning tree when a device goes down etc. etc. * * It guarantees that every node will be traversed, * and that it will be traversed only once. * * Callback function w->func may return: * 0 -> continue walking. * positive value -> walking is suspended (used by tree dumps, * and probably by gc, if it will be split to several slices) * negative value -> terminate walking. * * The function itself returns: * 0 -> walk is complete. * >0 -> walk is incomplete (i.e. suspended) * <0 -> walk is terminated by an error. * * This function is called with tb6_lock held. 
*/ static int fib6_walk_continue(struct fib6_walker *w) { struct fib6_node *fn, *pn, *left, *right; /* w->root should always be table->tb6_root */ WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT)); for (;;) { fn = w->node; if (!fn) return 0; switch (w->state) { #ifdef CONFIG_IPV6_SUBTREES case FWS_S: if (FIB6_SUBTREE(fn)) { w->node = FIB6_SUBTREE(fn); continue; } w->state = FWS_L; fallthrough; #endif case FWS_L: left = rcu_dereference_protected(fn->left, 1); if (left) { w->node = left; w->state = FWS_INIT; continue; } w->state = FWS_R; fallthrough; case FWS_R: right = rcu_dereference_protected(fn->right, 1); if (right) { w->node = right; w->state = FWS_INIT; continue; } w->state = FWS_C; w->leaf = rcu_dereference_protected(fn->leaf, 1); fallthrough; case FWS_C: if (w->leaf && fn->fn_flags & RTN_RTINFO) { int err; if (w->skip) { w->skip--; goto skip; } err = w->func(w); if (err) return err; w->count++; continue; } skip: w->state = FWS_U; fallthrough; case FWS_U: if (fn == w->root) return 0; pn = rcu_dereference_protected(fn->parent, 1); left = rcu_dereference_protected(pn->left, 1); right = rcu_dereference_protected(pn->right, 1); w->node = pn; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); w->state = FWS_L; continue; } #endif if (left == fn) { w->state = FWS_R; continue; } if (right == fn) { w->state = FWS_C; w->leaf = rcu_dereference_protected(w->node->leaf, 1); continue; } #if RT6_DEBUG >= 2 WARN_ON(1); #endif } } } static int fib6_walk(struct net *net, struct fib6_walker *w) { int res; w->state = FWS_INIT; w->node = w->root; fib6_walker_link(net, w); res = fib6_walk_continue(w); if (res <= 0) fib6_walker_unlink(net, w); return res; } static int fib6_clean_node(struct fib6_walker *w) { int res; struct fib6_info *rt; struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w); struct nl_info info = { .nl_net = c->net, .skip_notify = c->skip_notify, }; if (c->sernum != FIB6_NO_SERNUM_CHANGE && READ_ONCE(w->node->fn_sernum) != c->sernum) WRITE_ONCE(w->node->fn_sernum, c->sernum); if (!c->func) { WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE); w->leaf = NULL; return 0; } for_each_fib6_walker_rt(w) { res = c->func(rt, c->arg); if (res == -1) { w->leaf = rt; res = fib6_del(rt, &info); if (res) { #if RT6_DEBUG >= 2 pr_debug("%s: del failed: rt=%p@%p err=%d\n", __func__, rt, rcu_access_pointer(rt->fib6_node), res); #endif continue; } return 0; } else if (res == -2) { if (WARN_ON(!rt->fib6_nsiblings)) continue; rt = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); continue; } WARN_ON(res != 0); } w->leaf = rt; return 0; } /* * Convenient frontend to tree walker. * * func is called on each route. * It may return -2 -> skip multipath route. * -1 -> delete this route. 
* 0 -> continue walking */ static void fib6_clean_tree(struct net *net, struct fib6_node *root, int (*func)(struct fib6_info *, void *arg), int sernum, void *arg, bool skip_notify) { struct fib6_cleaner c; c.w.root = root; c.w.func = fib6_clean_node; c.w.count = 0; c.w.skip = 0; c.w.skip_in_node = 0; c.func = func; c.sernum = sernum; c.arg = arg; c.net = net; c.skip_notify = skip_notify; fib6_walk(net, &c.w); } static void __fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), int sernum, void *arg, bool skip_notify) { struct fib6_table *table; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { spin_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, func, sernum, arg, skip_notify); spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); } void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false); } void fib6_clean_all_skip_notify(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true); } static void fib6_flush_trees(struct net *net) { int new_sernum = fib6_new_sernum(net); __fib6_clean_all(net, NULL, new_sernum, NULL, false); } /* * Garbage collection */ void fib6_age_exceptions(struct fib6_info *rt, struct fib6_gc_args *gc_args, unsigned long now) { bool may_expire = rt->fib6_flags & RTF_EXPIRES && rt->expires; int old_more = gc_args->more; rt6_age_exceptions(rt, gc_args, now); if (!may_expire && old_more == gc_args->more) fib6_remove_gc_list(rt); } static int fib6_age(struct fib6_info *rt, struct fib6_gc_args *gc_args) { unsigned long now = jiffies; /* * check addrconf expiration here. * Routes are expired even if they are in use. */ if (rt->fib6_flags & RTF_EXPIRES && rt->expires) { if (time_after(now, rt->expires)) { pr_debug("expiring %p\n", rt); return -1; } gc_args->more++; } /* Also age clones in the exception table. * Note, that clones are aged out * only if they are not in use now. */ fib6_age_exceptions(rt, gc_args, now); return 0; } static void fib6_gc_table(struct net *net, struct fib6_table *tb6, struct fib6_gc_args *gc_args) { struct fib6_info *rt; struct hlist_node *n; struct nl_info info = { .nl_net = net, .skip_notify = false, }; hlist_for_each_entry_safe(rt, n, &tb6->tb6_gc_hlist, gc_link) if (fib6_age(rt, gc_args) == -1) fib6_del(rt, &info); } static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args) { struct fib6_table *table; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { spin_lock_bh(&table->tb6_lock); fib6_gc_table(net, table, gc_args); spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); } void fib6_run_gc(unsigned long expires, struct net *net, bool force) { struct fib6_gc_args gc_args; int ip6_rt_gc_interval; unsigned long now; if (force) { spin_lock_bh(&net->ipv6.fib6_gc_lock); } else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) { mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ); return; } ip6_rt_gc_interval = READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_interval); gc_args.timeout = expires ? 
(int)expires : ip6_rt_gc_interval; gc_args.more = 0; fib6_gc_all(net, &gc_args); now = jiffies; net->ipv6.ip6_rt_last_gc = now; if (gc_args.more) mod_timer(&net->ipv6.ip6_fib_timer, round_jiffies(now + ip6_rt_gc_interval)); else timer_delete(&net->ipv6.ip6_fib_timer); spin_unlock_bh(&net->ipv6.fib6_gc_lock); } static void fib6_gc_timer_cb(struct timer_list *t) { struct net *arg = timer_container_of(arg, t, ipv6.ip6_fib_timer); fib6_run_gc(0, arg, true); } static int __net_init fib6_net_init(struct net *net) { size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ; int err; err = fib6_notifier_init(net); if (err) return err; /* Default to 3-tuple */ net->ipv6.sysctl.multipath_hash_fields = FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; spin_lock_init(&net->ipv6.fib6_gc_lock); rwlock_init(&net->ipv6.fib6_walker_lock); INIT_LIST_HEAD(&net->ipv6.fib6_walkers); timer_setup(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, 0); net->ipv6.rt6_stats = kzalloc_obj(*net->ipv6.rt6_stats); if (!net->ipv6.rt6_stats) goto out_notifier; /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL); if (!net->ipv6.fib_table_hash) goto out_rt6_stats; spin_lock_init(&net->ipv6.fib_table_hash_lock); net->ipv6.fib6_main_tbl = kzalloc_obj(*net->ipv6.fib6_main_tbl); if (!net->ipv6.fib6_main_tbl) goto out_fib_table_hash; net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf, net->ipv6.fib6_null_entry); net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); INIT_HLIST_HEAD(&net->ipv6.fib6_main_tbl->tb6_gc_hlist); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_local_tbl = kzalloc_obj(*net->ipv6.fib6_local_tbl); if (!net->ipv6.fib6_local_tbl) goto out_fib6_main_tbl; net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf, net->ipv6.fib6_null_entry); net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); INIT_HLIST_HEAD(&net->ipv6.fib6_local_tbl->tb6_gc_hlist); #endif fib6_tables_init(net); return 0; #ifdef CONFIG_IPV6_MULTIPLE_TABLES out_fib6_main_tbl: kfree(net->ipv6.fib6_main_tbl); #endif out_fib_table_hash: kfree(net->ipv6.fib_table_hash); out_rt6_stats: kfree(net->ipv6.rt6_stats); out_notifier: fib6_notifier_exit(net); return -ENOMEM; } static void fib6_net_exit(struct net *net) { unsigned int i; timer_delete_sync(&net->ipv6.ip6_fib_timer); for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { struct hlist_head *head = &net->ipv6.fib_table_hash[i]; struct hlist_node *tmp; struct fib6_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) { hlist_del(&tb->tb6_hlist); fib6_free_table(tb); } } kfree(net->ipv6.fib_table_hash); kfree(net->ipv6.rt6_stats); fib6_notifier_exit(net); } static struct pernet_operations fib6_net_ops = { .init = fib6_net_init, .exit = fib6_net_exit, }; static const struct rtnl_msg_handler fib6_rtnl_msg_handlers[] __initconst_or_module = { {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE, .dumpit = inet6_dump_fib, .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, }; int __init fib6_init(void) { int ret = -ENOMEM; fib6_node_kmem = KMEM_CACHE(fib6_node, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT); if (!fib6_node_kmem) goto out; ret = register_pernet_subsys(&fib6_net_ops); if (ret) goto out_kmem_cache_create; 
ret = rtnl_register_many(fib6_rtnl_msg_handlers); if (ret) goto out_unregister_subsys; __fib6_flush_trees = fib6_flush_trees; out: return ret; out_unregister_subsys: unregister_pernet_subsys(&fib6_net_ops); out_kmem_cache_create: kmem_cache_destroy(fib6_node_kmem); goto out; } void fib6_gc_cleanup(void) { unregister_pernet_subsys(&fib6_net_ops); kmem_cache_destroy(fib6_node_kmem); } #ifdef CONFIG_PROC_FS static int ipv6_route_native_seq_show(struct seq_file *seq, void *v) { struct fib6_info *rt = v; struct ipv6_route_iter *iter = seq->private; struct fib6_nh *fib6_nh = rt->fib6_nh; unsigned int flags = rt->fib6_flags; const struct net_device *dev; if (rt->nh) fib6_nh = nexthop_fib6_nh(rt->nh); seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); #ifdef CONFIG_IPV6_SUBTREES seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen); #else seq_puts(seq, "00000000000000000000000000000000 00 "); #endif if (fib6_nh->fib_nh_gw_family) { flags |= RTF_GATEWAY; seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6); } else { seq_puts(seq, "00000000000000000000000000000000"); } dev = fib6_nh->fib_nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", rt->fib6_metric, refcount_read(&rt->fib6_ref), 0, flags, dev ? dev->name : ""); iter->w.leaf = NULL; return 0; } static int ipv6_route_yield(struct fib6_walker *w) { struct ipv6_route_iter *iter = w->args; if (!iter->skip) return 1; do { iter->w.leaf = rcu_dereference_protected( iter->w.leaf->fib6_next, lockdep_is_held(&iter->tbl->tb6_lock)); iter->skip--; if (!iter->skip && iter->w.leaf) return 1; } while (iter->w.leaf); return 0; } static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter, struct net *net) { memset(&iter->w, 0, sizeof(iter->w)); iter->w.func = ipv6_route_yield; iter->w.root = &iter->tbl->tb6_root; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; iter->w.args = iter; iter->sernum = READ_ONCE(iter->w.root->fn_sernum); INIT_LIST_HEAD(&iter->w.lh); fib6_walker_link(net, &iter->w); } static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl, struct net *net) { unsigned int h; struct hlist_node *node; if (tbl) { h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1; node = rcu_dereference(hlist_next_rcu(&tbl->tb6_hlist)); } else { h = 0; node = NULL; } while (!node && h < FIB6_TABLE_HASHSZ) { node = rcu_dereference( hlist_first_rcu(&net->ipv6.fib_table_hash[h++])); } return hlist_entry_safe(node, struct fib6_table, tb6_hlist); } static void ipv6_route_check_sernum(struct ipv6_route_iter *iter) { int sernum = READ_ONCE(iter->w.root->fn_sernum); if (iter->sernum != sernum) { iter->sernum = sernum; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; WARN_ON(iter->w.skip); iter->w.skip = iter->w.count; } } static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int r; struct fib6_info *n; struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; ++(*pos); if (!v) goto iter_table; n = rcu_dereference(((struct fib6_info *)v)->fib6_next); if (n) return n; iter_table: ipv6_route_check_sernum(iter); spin_lock_bh(&iter->tbl->tb6_lock); r = fib6_walk_continue(&iter->w); spin_unlock_bh(&iter->tbl->tb6_lock); if (r > 0) { return iter->w.leaf; } else if (r < 0) { fib6_walker_unlink(net, &iter->w); return NULL; } fib6_walker_unlink(net, &iter->w); iter->tbl = ipv6_route_seq_next_table(iter->tbl, net); if (!iter->tbl) return NULL; ipv6_route_seq_setup_walk(iter, net); goto iter_table; } static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) 
__acquires(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; rcu_read_lock(); iter->tbl = ipv6_route_seq_next_table(NULL, net); iter->skip = *pos; if (iter->tbl) { loff_t p = 0; ipv6_route_seq_setup_walk(iter, net); return ipv6_route_seq_next(seq, NULL, &p); } else { return NULL; } } static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) { struct fib6_walker *w = &iter->w; return w->node && !(w->state == FWS_U && w->node == w->root); } static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; if (ipv6_route_iter_active(iter)) fib6_walker_unlink(net, &iter->w); rcu_read_unlock(); } #if defined(CONFIG_BPF_SYSCALL) static int ipv6_route_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, void *v) { struct bpf_iter__ipv6_route ctx; ctx.meta = meta; ctx.rt = v; return bpf_iter_run_prog(prog, &ctx); } static int ipv6_route_seq_show(struct seq_file *seq, void *v) { struct ipv6_route_iter *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; int ret; meta.seq = seq; prog = bpf_iter_get_info(&meta, false); if (!prog) return ipv6_route_native_seq_show(seq, v); ret = ipv6_route_prog_seq_show(prog, &meta, v); iter->w.leaf = NULL; return ret; } static void ipv6_route_seq_stop(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)ipv6_route_prog_seq_show(prog, &meta, v); } ipv6_route_native_seq_stop(seq, v); } #else static int ipv6_route_seq_show(struct seq_file *seq, void *v) { return ipv6_route_native_seq_show(seq, v); } static void ipv6_route_seq_stop(struct seq_file *seq, void *v) { ipv6_route_native_seq_stop(seq, v); } #endif const struct seq_operations ipv6_route_seq_ops = { .start = ipv6_route_seq_start, .next = ipv6_route_seq_next, .stop = ipv6_route_seq_stop, .show = ipv6_route_seq_show }; #endif /* CONFIG_PROC_FS */ |
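/*
 * Editor's illustrative sketch -- an assumption, not part of ip6_fib.c above.
 * It shows a portable, unoptimized equivalent of the addr_bit_set() test the
 * fib6 radix tree uses: prefix bit 0 is the most significant bit of the first
 * address byte, so bit fn_bit lives in byte fn_bit / 8, at position
 * 7 - (fn_bit % 8). The helper name addr_bit_set_portable() is hypothetical
 * and exists only for this example; struct in6_addr comes from <linux/in6.h>,
 * which ip6_fib.c already includes.
 */
static inline int addr_bit_set_portable(const struct in6_addr *addr, int fn_bit)
{
	/* Same truth value as addr_bit_set(), without the __be32 word trick. */
	return (addr->s6_addr[fn_bit >> 3] >> (7 - (fn_bit & 7))) & 1;
}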
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef __LICENSE_H
#define __LICENSE_H

static inline int license_is_gpl_compatible(const char *license)
{
	return (strcmp(license, "GPL") == 0
		|| strcmp(license, "GPL v2") == 0
		|| strcmp(license, "GPL and additional rights") == 0
		|| strcmp(license, "Dual BSD/GPL") == 0
		|| strcmp(license, "Dual MIT/GPL") == 0
		|| strcmp(license, "Dual MPL/GPL") == 0);
}

#endif
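/*
 * Illustrative usage sketch, not part of this header: the main caller is
 * set_license() in kernel/module/main.c, which warns and taints the kernel
 * (via add_taint_module()) when a module's license is not GPL-compatible.
 * A simplified version of that check might look like this:
 */
static inline void example_warn_if_proprietary(const char *modname,
					       const char *license)
{
	if (!license)
		license = "unspecified";
	if (!license_is_gpl_compatible(license))
		pr_warn("%s: module license '%s' taints kernel.\n",
			modname, license);
}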
| 1 1 10 1 10 10 11 12 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 | // SPDX-License-Identifier: GPL-2.0 /* * mm/pgtable-generic.c * * Generic pgtable methods declared in linux/pgtable.h * * Copyright (C) 2010 Linus Torvalds */ #include <linux/pagemap.h> #include <linux/hugetlb.h> #include <linux/pgtable.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/mm_inline.h> #include <linux/iommu.h> #include <linux/pgalloc.h> #include <asm/tlb.h> /* * If a p?d_bad entry is found while walking page tables, report * the error, before resetting entry to p?d_none. Usually (but * very seldom) called out from the p?d_none_or_clear_bad macros. */ void pgd_clear_bad(pgd_t *pgd) { pgd_ERROR(*pgd); pgd_clear(pgd); } #ifndef __PAGETABLE_P4D_FOLDED void p4d_clear_bad(p4d_t *p4d) { p4d_ERROR(*p4d); p4d_clear(p4d); } #endif #ifndef __PAGETABLE_PUD_FOLDED void pud_clear_bad(pud_t *pud) { pud_ERROR(*pud); pud_clear(pud); } #endif /* * Note that the pmd variant below can't be stub'ed out just as for p4d/pud * above. pmd folding is special and typically pmd_* macros refer to upper * level even when folded */ void pmd_clear_bad(pmd_t *pmd) { pmd_ERROR(*pmd); pmd_clear(pmd); } #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS /* * Only sets the access flags (dirty, accessed), as well as write * permission. Furthermore, we know it always gets set to a "more * permissive" setting, which allows most architectures to optimize * this. We return whether the PTE actually changed, which in turn * instructs the caller to do things like update__mmu_cache. 
This * used to be done in the caller, but sparc needs minor faults to * force that call on sun4c so we changed this macro slightly */ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) { int changed = !pte_same(ptep_get(ptep), entry); if (changed) { set_pte_at(vma->vm_mm, address, ptep, entry); flush_tlb_fix_spurious_fault(vma, address, ptep); } return changed; } #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH bool ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { bool young; young = ptep_test_and_clear_young(vma, address, ptep); if (young) flush_tlb_page(vma, address); return young; } #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { struct mm_struct *mm = (vma)->vm_mm; pte_t pte; pte = ptep_get_and_clear(mm, address, ptep); if (pte_accessible(mm, pte)) flush_tlb_page(vma, address); return pte; } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { int changed = !pmd_same(*pmdp, entry); VM_BUG_ON(address & ~HPAGE_PMD_MASK); if (changed) { set_pmd_at(vma->vm_mm, address, pmdp, entry); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); } return changed; } #endif #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH bool pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { bool young; VM_BUG_ON(address & ~HPAGE_PMD_MASK); young = pmdp_test_and_clear_young(vma, address, pmdp); if (young) flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return young; } #endif #ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp)); pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pud_t *pudp) { pud_t pud; VM_BUG_ON(address & ~HPAGE_PUD_MASK); VM_BUG_ON(!pud_trans_huge(*pudp)); pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp); flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); return pud; } #endif #endif #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) { assert_spin_locked(pmd_lockptr(mm, pmdp)); /* FIFO */ if (!pmd_huge_pte(mm, pmdp)) INIT_LIST_HEAD(&pgtable->lru); else list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru); pmd_huge_pte(mm, pmdp) = pgtable; } #endif #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW /* no "address" argument so destroys page coloring of some arch */ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { pgtable_t pgtable; assert_spin_locked(pmd_lockptr(mm, pmdp)); /* FIFO */ pgtable = pmd_huge_pte(mm, pmdp); pmd_huge_pte(mm, pmdp) = list_first_entry_or_null(&pgtable->lru, struct page, lru); if (pmd_huge_pte(mm, pmdp)) list_del(&pgtable->lru); return pgtable; } #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { VM_WARN_ON_ONCE(!pmd_present(*pmdp)); pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp)); flush_pmd_tlb_range(vma, 
address, address + HPAGE_PMD_SIZE); return old; } #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { VM_WARN_ON_ONCE(!pmd_present(*pmdp)); return pmdp_invalidate(vma, address, pmdp); } #endif #ifndef pmdp_collapse_flush pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { /* * pmd and hugepage pte format are same. So we could * use the same function. */ pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); VM_BUG_ON(pmd_trans_huge(*pmdp)); pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); /* collapse entails shooting down ptes not pmd */ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; } #endif /* arch define pte_free_defer in asm/pgalloc.h for its own implementation */ #ifndef pte_free_defer static void pte_free_now(struct rcu_head *head) { struct page *page; page = container_of(head, struct page, rcu_head); pte_free(NULL /* mm not passed and not used */, (pgtable_t)page); } void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) { struct page *page; page = pgtable; call_rcu(&page->rcu_head, pte_free_now); } #endif /* pte_free_defer */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #if defined(CONFIG_GUP_GET_PXX_LOW_HIGH) && \ (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RCU)) /* * See the comment above ptep_get_lockless() in include/linux/pgtable.h: * the barriers in pmdp_get_lockless() cannot guarantee that the value in * pmd_high actually belongs with the value in pmd_low; but holding interrupts * off blocks the TLB flush between present updates, which guarantees that a * successful __pte_offset_map() points to a page from matched halves. */ static unsigned long pmdp_get_lockless_start(void) { unsigned long irqflags; local_irq_save(irqflags); return irqflags; } static void pmdp_get_lockless_end(unsigned long irqflags) { local_irq_restore(irqflags); } #else static unsigned long pmdp_get_lockless_start(void) { return 0; } static void pmdp_get_lockless_end(unsigned long irqflags) { } #endif pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) { unsigned long irqflags; pmd_t pmdval; rcu_read_lock(); irqflags = pmdp_get_lockless_start(); pmdval = pmdp_get_lockless(pmd); pmdp_get_lockless_end(irqflags); if (pmdvalp) *pmdvalp = pmdval; if (unlikely(pmd_none(pmdval) || !pmd_present(pmdval))) goto nomap; if (unlikely(pmd_trans_huge(pmdval))) goto nomap; if (unlikely(pmd_bad(pmdval))) { pmd_clear_bad(pmd); goto nomap; } return __pte_map(&pmdval, addr); nomap: rcu_read_unlock(); return NULL; } pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp) { pmd_t pmdval; pte_t *pte; pte = __pte_offset_map(pmd, addr, &pmdval); if (likely(pte)) *ptlp = pte_lockptr(mm, &pmdval); return pte; } pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp, spinlock_t **ptlp) { pte_t *pte; VM_WARN_ON_ONCE(!pmdvalp); pte = __pte_offset_map(pmd, addr, pmdvalp); if (likely(pte)) *ptlp = pte_lockptr(mm, pmdvalp); return pte; } /* * pte_offset_map_lock(mm, pmd, addr, ptlp) is usually called with the pmd * pointer for addr, reached by walking down the mm's pgd, p4d, pud for addr: * either while holding mmap_lock or vma lock for read or for write; or in * truncate or rmap context, while holding file's i_mmap_lock or anon_vma lock * for read (or for write). In a few cases, it may be used with pmd pointing to * a pmd_t already copied to or constructed on the stack. 
* * When successful, it returns the pte pointer for addr, with its page table * kmapped if necessary (when CONFIG_HIGHPTE), and locked against concurrent * modification by software, with a pointer to that spinlock in ptlp (in some * configs mm->page_table_lock, in SPLIT_PTLOCK configs a spinlock in table's * struct page). pte_unmap_unlock(pte, ptl) to unlock and unmap afterwards. * * But it is unsuccessful, returning NULL with *ptlp unchanged, if there is no * page table at *pmd: if, for example, the page table has just been removed, * or replaced by the huge pmd of a THP. (When successful, *pmd is rechecked * after acquiring the ptlock, and retried internally if it changed: so that a * page table can be safely removed or replaced by THP while holding its lock.) * * pte_offset_map(pmd, addr), and its internal helper __pte_offset_map() above, * just returns the pte pointer for addr, its page table kmapped if necessary; * or NULL if there is no page table at *pmd. It does not attempt to lock the * page table, so cannot normally be used when the page table is to be updated, * or when entries read must be stable. But it does take rcu_read_lock(): so * that even when page table is racily removed, it remains a valid though empty * and disconnected table. Until pte_unmap(pte) unmaps and rcu_read_unlock()s * afterwards. * * pte_offset_map_ro_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map(); * but when successful, it also outputs a pointer to the spinlock in ptlp - as * pte_offset_map_lock() does, but in this case without locking it. This helps * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time * act on a changed *pmd: pte_offset_map_ro_nolock() provides the correct spinlock * pointer for the page table that it returns. Even after grabbing the spinlock, * we might be looking either at a page table that is still mapped or one that * was unmapped and is about to get freed. But for R/O access this is sufficient. * So it is only applicable for read-only cases where any modification operations * to the page table are not allowed even if the corresponding spinlock is held * afterwards. * * pte_offset_map_rw_nolock(mm, pmd, addr, pmdvalp, ptlp), above, is like * pte_offset_map_ro_nolock(); but when successful, it also outputs the pdmval. * It is applicable for may-write cases where any modification operations to the * page table may happen after the corresponding spinlock is held afterwards. * But the users should make sure the page table is stable like checking pte_same() * or checking pmd_same() by using the output pmdval before performing the write * operations. * * Note: "RO" / "RW" expresses the intended semantics, not that the *kmap* will * be read-only/read-write protected. * * Note that free_pgtables(), used after unmapping detached vmas, or when * exiting the whole mm, does not take page table lock before freeing a page * table, and may not use RCU at all: "outsiders" like khugepaged should avoid * pte_offset_map() and co once the vma is detached from mm or mm_users is zero. 
*/ pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp) { spinlock_t *ptl; pmd_t pmdval; pte_t *pte; again: pte = __pte_offset_map(pmd, addr, &pmdval); if (unlikely(!pte)) return pte; ptl = pte_lockptr(mm, &pmdval); spin_lock(ptl); if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd)))) { *ptlp = ptl; return pte; } pte_unmap_unlock(pte, ptl); goto again; } #ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE static void kernel_pgtable_work_func(struct work_struct *work); static struct { struct list_head list; /* protect above ptdesc lists */ spinlock_t lock; struct work_struct work; } kernel_pgtable_work = { .list = LIST_HEAD_INIT(kernel_pgtable_work.list), .lock = __SPIN_LOCK_UNLOCKED(kernel_pgtable_work.lock), .work = __WORK_INITIALIZER(kernel_pgtable_work.work, kernel_pgtable_work_func), }; static void kernel_pgtable_work_func(struct work_struct *work) { struct ptdesc *pt, *next; LIST_HEAD(page_list); spin_lock(&kernel_pgtable_work.lock); list_splice_tail_init(&kernel_pgtable_work.list, &page_list); spin_unlock(&kernel_pgtable_work.lock); iommu_sva_invalidate_kva_range(PAGE_OFFSET, TLB_FLUSH_ALL); list_for_each_entry_safe(pt, next, &page_list, pt_list) __pagetable_free(pt); } void pagetable_free_kernel(struct ptdesc *pt) { spin_lock(&kernel_pgtable_work.lock); list_add(&pt->pt_list, &kernel_pgtable_work.list); spin_unlock(&kernel_pgtable_work.lock); schedule_work(&kernel_pgtable_work.work); } #endif |
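/*
 * Illustrative sketch, not part of mm/pgtable-generic.c: the typical calling
 * pattern described in the pte_offset_map_lock() comment above.  Assumes the
 * caller holds mmap_lock (or another lock that keeps the VMA stable) and has
 * already walked pgd/p4d/pud down to the pmd entry for @addr.
 */
static int example_touch_pte(struct mm_struct *mm, pmd_t *pmd,
			     unsigned long addr)
{
	spinlock_t *ptl;
	pte_t *pte;

	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -EAGAIN;	/* no page table here: removed, or a huge pmd */

	/* ptep_get(pte) may be examined or modified while ptl is held */

	pte_unmap_unlock(pte, ptl);
	return 0;
}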
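/*
 * Orientation sketch for the swiotlb code below, not part of
 * kernel/dma/swiotlb.c: drivers never call swiotlb directly.  They map a
 * buffer through the DMA API, and dma_direct_map_page() bounces the mapping
 * through swiotlb_tbl_map_single() when the buffer's physical address does
 * not satisfy the device's DMA mask, or when bouncing is forced (e.g. for
 * confidential-computing guests).
 */
static dma_addr_t example_map_for_device(struct device *dev, void *buf,
					 size_t len)
{
	dma_addr_t dma = dma_map_single(dev, buf, len, DMA_TO_DEVICE);

	/* A full swiotlb shows up here as a mapping error, not a crash. */
	if (dma_mapping_error(dev, dma))
		pr_warn("bounce buffer mapping failed\n");
	return dma;
}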
1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 | // SPDX-License-Identifier: GPL-2.0-only /* * Dynamic DMA mapping support. * * This implementation is a fallback for platforms that do not support * I/O TLBs (aka DMA address translation hardware). * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com> * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com> * Copyright (C) 2000, 2003 Hewlett-Packard Co * David Mosberger-Tang <davidm@hpl.hp.com> * * 03/05/07 davidm Switch from PCI-DMA to generic device DMA API. * 00/12/13 davidm Rename to swiotlb.c and add mark_clean() to avoid * unnecessary i-cache flushing. * 04/07/.. ak Better overflow handling. Assorted fixes. * 05/09/10 linville Add support for syncing ranges, support syncing for * DMA_BIDIRECTIONAL mappings, miscellaneous cleanup. * 08/12/11 beckyb Add highmem support */ #define pr_fmt(fmt) "software IO TLB: " fmt #include <linux/cache.h> #include <linux/cc_platform.h> #include <linux/ctype.h> #include <linux/debugfs.h> #include <linux/dma-direct.h> #include <linux/dma-map-ops.h> #include <linux/export.h> #include <linux/gfp.h> #include <linux/highmem.h> #include <linux/io.h> #include <linux/kmsan-checks.h> #include <linux/iommu-helper.h> #include <linux/init.h> #include <linux/memblock.h> #include <linux/mm.h> #include <linux/pfn.h> #include <linux/rculist.h> #include <linux/scatterlist.h> #include <linux/set_memory.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/swiotlb.h> #include <linux/types.h> #ifdef CONFIG_DMA_RESTRICTED_POOL #include <linux/of.h> #include <linux/of_fdt.h> #include <linux/of_reserved_mem.h> #include <linux/slab.h> #endif #define CREATE_TRACE_POINTS #include <trace/events/swiotlb.h> #define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) /* * Minimum IO TLB size to bother booting with. Systems with mainly * 64bit capable cards will only lightly use the swiotlb. If we can't * allocate a contiguous 1MB, we're probably in trouble anyway. */ #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) /** * struct io_tlb_slot - IO TLB slot descriptor * @orig_addr: The original address corresponding to a mapped entry. * @alloc_size: Size of the allocated buffer. * @list: The free list describing the number of free entries available * from each index. 
* @pad_slots: Number of preceding padding slots. Valid only in the first * allocated non-padding slot. */ struct io_tlb_slot { phys_addr_t orig_addr; size_t alloc_size; unsigned short list; unsigned short pad_slots; }; static bool swiotlb_force_bounce; static bool swiotlb_force_disable; #ifdef CONFIG_SWIOTLB_DYNAMIC static void swiotlb_dyn_alloc(struct work_struct *work); static struct io_tlb_mem io_tlb_default_mem = { .lock = __SPIN_LOCK_UNLOCKED(io_tlb_default_mem.lock), .pools = LIST_HEAD_INIT(io_tlb_default_mem.pools), .dyn_alloc = __WORK_INITIALIZER(io_tlb_default_mem.dyn_alloc, swiotlb_dyn_alloc), }; #else /* !CONFIG_SWIOTLB_DYNAMIC */ static struct io_tlb_mem io_tlb_default_mem; #endif /* CONFIG_SWIOTLB_DYNAMIC */ static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT; static unsigned long default_nareas; /** * struct io_tlb_area - IO TLB memory area descriptor * * This is a single area with a single lock. * * @used: The number of used IO TLB block. * @index: The slot index to start searching in this area for next round. * @lock: The lock to protect the above data structures in the map and * unmap calls. */ struct io_tlb_area { unsigned long used; unsigned int index; spinlock_t lock; }; /* * Round up number of slabs to the next power of 2. The last area is going * be smaller than the rest if default_nslabs is not power of two. * The number of slot in an area should be a multiple of IO_TLB_SEGSIZE, * otherwise a segment may span two or more areas. It conflicts with free * contiguous slots tracking: free slots are treated contiguous no matter * whether they cross an area boundary. * * Return true if default_nslabs is rounded up. */ static bool round_up_default_nslabs(void) { if (!default_nareas) return false; if (default_nslabs < IO_TLB_SEGSIZE * default_nareas) default_nslabs = IO_TLB_SEGSIZE * default_nareas; else if (is_power_of_2(default_nslabs)) return false; default_nslabs = roundup_pow_of_two(default_nslabs); return true; } /** * swiotlb_adjust_nareas() - adjust the number of areas and slots * @nareas: Desired number of areas. Zero is treated as 1. * * Adjust the default number of areas in a memory pool. * The default size of the memory pool may also change to meet minimum area * size requirements. */ static void swiotlb_adjust_nareas(unsigned int nareas) { if (!nareas) nareas = 1; else if (!is_power_of_2(nareas)) nareas = roundup_pow_of_two(nareas); default_nareas = nareas; pr_info("area num %d.\n", nareas); if (round_up_default_nslabs()) pr_info("SWIOTLB bounce buffer size roundup to %luMB", (default_nslabs << IO_TLB_SHIFT) >> 20); } /** * limit_nareas() - get the maximum number of areas for a given memory pool size * @nareas: Desired number of areas. * @nslots: Total number of slots in the memory pool. * * Limit the number of areas to the maximum possible number of areas in * a memory pool of the given size. * * Return: Maximum possible number of areas. 
*/ static unsigned int limit_nareas(unsigned int nareas, unsigned long nslots) { if (nslots < nareas * IO_TLB_SEGSIZE) return nslots / IO_TLB_SEGSIZE; return nareas; } static int __init setup_io_tlb_npages(char *str) { if (isdigit(*str)) { /* avoid tail segment of size < IO_TLB_SEGSIZE */ default_nslabs = ALIGN(simple_strtoul(str, &str, 0), IO_TLB_SEGSIZE); } if (*str == ',') ++str; if (isdigit(*str)) swiotlb_adjust_nareas(simple_strtoul(str, &str, 0)); if (*str == ',') ++str; if (!strcmp(str, "force")) swiotlb_force_bounce = true; else if (!strcmp(str, "noforce")) swiotlb_force_disable = true; return 0; } early_param("swiotlb", setup_io_tlb_npages); unsigned long swiotlb_size_or_default(void) { return default_nslabs << IO_TLB_SHIFT; } void __init swiotlb_adjust_size(unsigned long size) { /* * If swiotlb parameter has not been specified, give a chance to * architectures such as those supporting memory encryption to * adjust/expand SWIOTLB size for their use. */ if (default_nslabs != IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT) return; size = ALIGN(size, IO_TLB_SIZE); default_nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); if (round_up_default_nslabs()) size = default_nslabs << IO_TLB_SHIFT; pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20); } void swiotlb_print_info(void) { struct io_tlb_pool *mem = &io_tlb_default_mem.defpool; if (!mem->nslabs) { pr_warn("No low mem\n"); return; } pr_info("mapped [mem %pa-%pa] (%luMB)\n", &mem->start, &mem->end, (mem->nslabs << IO_TLB_SHIFT) >> 20); } static inline unsigned long io_tlb_offset(unsigned long val) { return val & (IO_TLB_SEGSIZE - 1); } static inline unsigned long nr_slots(u64 val) { return DIV_ROUND_UP(val, IO_TLB_SIZE); } /* * Early SWIOTLB allocation may be too early to allow an architecture to * perform the desired operations. This function allows the architecture to * call SWIOTLB when the operations are possible. It needs to be called * before the SWIOTLB memory is used. */ void __init swiotlb_update_mem_attributes(void) { struct io_tlb_pool *mem = &io_tlb_default_mem.defpool; unsigned long bytes; if (!mem->nslabs || mem->late_alloc) return; bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT); set_memory_decrypted((unsigned long)mem->vaddr, bytes >> PAGE_SHIFT); } static void swiotlb_init_io_tlb_pool(struct io_tlb_pool *mem, phys_addr_t start, unsigned long nslabs, bool late_alloc, unsigned int nareas) { void *vaddr = phys_to_virt(start); unsigned long bytes = nslabs << IO_TLB_SHIFT, i; mem->nslabs = nslabs; mem->start = start; mem->end = mem->start + bytes; mem->late_alloc = late_alloc; mem->nareas = nareas; mem->area_nslabs = nslabs / mem->nareas; for (i = 0; i < mem->nareas; i++) { spin_lock_init(&mem->areas[i].lock); mem->areas[i].index = 0; mem->areas[i].used = 0; } for (i = 0; i < mem->nslabs; i++) { mem->slots[i].list = min(IO_TLB_SEGSIZE - io_tlb_offset(i), mem->nslabs - i); mem->slots[i].orig_addr = INVALID_PHYS_ADDR; mem->slots[i].alloc_size = 0; mem->slots[i].pad_slots = 0; } memset(vaddr, 0, bytes); mem->vaddr = vaddr; return; } /** * add_mem_pool() - add a memory pool to the allocator * @mem: Software IO TLB allocator. * @pool: Memory pool to be added. 
*/ static void add_mem_pool(struct io_tlb_mem *mem, struct io_tlb_pool *pool) { #ifdef CONFIG_SWIOTLB_DYNAMIC spin_lock(&mem->lock); list_add_rcu(&pool->node, &mem->pools); mem->nslabs += pool->nslabs; spin_unlock(&mem->lock); #else mem->nslabs = pool->nslabs; #endif } static void __init *swiotlb_memblock_alloc(unsigned long nslabs, unsigned int flags, int (*remap)(void *tlb, unsigned long nslabs)) { size_t bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT); void *tlb; /* * By default allocate the bounce buffer memory from low memory, but * allow to pick a location everywhere for hypervisors with guest * memory encryption. */ if (flags & SWIOTLB_ANY) tlb = memblock_alloc(bytes, PAGE_SIZE); else tlb = memblock_alloc_low(bytes, PAGE_SIZE); if (!tlb) { pr_warn("%s: Failed to allocate %zu bytes tlb structure\n", __func__, bytes); return NULL; } if (remap && remap(tlb, nslabs) < 0) { memblock_free(tlb, PAGE_ALIGN(bytes)); pr_warn("%s: Failed to remap %zu bytes\n", __func__, bytes); return NULL; } return tlb; } /* * Statically reserve bounce buffer space and initialize bounce buffer data * structures for the software IO TLB used to implement the DMA API. */ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, int (*remap)(void *tlb, unsigned long nslabs)) { struct io_tlb_pool *mem = &io_tlb_default_mem.defpool; unsigned long nslabs; unsigned int nareas; size_t alloc_size; void *tlb; if (!addressing_limit && !swiotlb_force_bounce) return; if (swiotlb_force_disable) return; io_tlb_default_mem.force_bounce = swiotlb_force_bounce || (flags & SWIOTLB_FORCE); #ifdef CONFIG_SWIOTLB_DYNAMIC if (!remap) io_tlb_default_mem.can_grow = true; if (flags & SWIOTLB_ANY) io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1); else io_tlb_default_mem.phys_limit = ARCH_LOW_ADDRESS_LIMIT; #endif if (!default_nareas) swiotlb_adjust_nareas(num_possible_cpus()); nslabs = default_nslabs; nareas = limit_nareas(default_nareas, nslabs); while ((tlb = swiotlb_memblock_alloc(nslabs, flags, remap)) == NULL) { if (nslabs <= IO_TLB_MIN_SLABS) return; nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE); nareas = limit_nareas(nareas, nslabs); } if (default_nslabs != nslabs) { pr_info("SWIOTLB bounce buffer size adjusted %lu -> %lu slabs", default_nslabs, nslabs); default_nslabs = nslabs; } alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs)); mem->slots = memblock_alloc(alloc_size, PAGE_SIZE); if (!mem->slots) { pr_warn("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); return; } mem->areas = memblock_alloc(array_size(sizeof(struct io_tlb_area), nareas), SMP_CACHE_BYTES); if (!mem->areas) { pr_warn("%s: Failed to allocate mem->areas.\n", __func__); return; } swiotlb_init_io_tlb_pool(mem, __pa(tlb), nslabs, false, nareas); add_mem_pool(&io_tlb_default_mem, mem); if (flags & SWIOTLB_VERBOSE) swiotlb_print_info(); } void __init swiotlb_init(bool addressing_limit, unsigned int flags) { swiotlb_init_remap(addressing_limit, flags, NULL); } /* * Systems with larger DMA zones (those that don't support ISA) can * initialize the swiotlb later using the slab allocator if needed. * This should be just like above, but with some error catching. 
*/ int swiotlb_init_late(size_t size, gfp_t gfp_mask, int (*remap)(void *tlb, unsigned long nslabs)) { struct io_tlb_pool *mem = &io_tlb_default_mem.defpool; unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); unsigned int nareas; unsigned char *vstart = NULL; unsigned int order, area_order; bool retried = false; int rc = 0; if (io_tlb_default_mem.nslabs) return 0; if (swiotlb_force_disable) return 0; io_tlb_default_mem.force_bounce = swiotlb_force_bounce; #ifdef CONFIG_SWIOTLB_DYNAMIC if (!remap) io_tlb_default_mem.can_grow = true; if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA)) io_tlb_default_mem.phys_limit = zone_dma_limit; else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32)) io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit); else io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1); #endif if (!default_nareas) swiotlb_adjust_nareas(num_possible_cpus()); retry: order = get_order(nslabs << IO_TLB_SHIFT); nslabs = SLABS_PER_PAGE << order; while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { vstart = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN, order); if (vstart) break; order--; nslabs = SLABS_PER_PAGE << order; retried = true; } if (!vstart) return -ENOMEM; if (remap) rc = remap(vstart, nslabs); if (rc) { free_pages((unsigned long)vstart, order); nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE); if (nslabs < IO_TLB_MIN_SLABS) return rc; retried = true; goto retry; } if (retried) { pr_warn("only able to allocate %ld MB\n", (PAGE_SIZE << order) >> 20); } nareas = limit_nareas(default_nareas, nslabs); area_order = get_order(array_size(sizeof(*mem->areas), nareas)); mem->areas = (struct io_tlb_area *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, area_order); if (!mem->areas) goto error_area; mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(array_size(sizeof(*mem->slots), nslabs))); if (!mem->slots) goto error_slots; set_memory_decrypted((unsigned long)vstart, (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT); swiotlb_init_io_tlb_pool(mem, virt_to_phys(vstart), nslabs, true, nareas); add_mem_pool(&io_tlb_default_mem, mem); swiotlb_print_info(); return 0; error_slots: free_pages((unsigned long)mem->areas, area_order); error_area: free_pages((unsigned long)vstart, order); return -ENOMEM; } void __init swiotlb_exit(void) { struct io_tlb_pool *mem = &io_tlb_default_mem.defpool; unsigned long tbl_vaddr; size_t tbl_size, slots_size; unsigned int area_order; if (swiotlb_force_bounce) return; if (!mem->nslabs) return; pr_info("tearing down default memory pool\n"); tbl_vaddr = (unsigned long)phys_to_virt(mem->start); tbl_size = PAGE_ALIGN(mem->end - mem->start); slots_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), mem->nslabs)); set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT); if (mem->late_alloc) { area_order = get_order(array_size(sizeof(*mem->areas), mem->nareas)); free_pages((unsigned long)mem->areas, area_order); free_pages(tbl_vaddr, get_order(tbl_size)); free_pages((unsigned long)mem->slots, get_order(slots_size)); } else { memblock_free(mem->areas, array_size(sizeof(*mem->areas), mem->nareas)); memblock_phys_free(mem->start, tbl_size); memblock_free(mem->slots, slots_size); } memset(mem, 0, sizeof(*mem)); } #ifdef CONFIG_SWIOTLB_DYNAMIC /** * alloc_dma_pages() - allocate pages to be used for DMA * @gfp: GFP flags for the allocation. * @bytes: Size of the buffer. * @phys_limit: Maximum allowed physical address of the buffer. * * Allocate pages from the buddy allocator. 
If successful, make the allocated * pages decrypted that they can be used for DMA. * * Return: Decrypted pages, %NULL on allocation failure, or ERR_PTR(-EAGAIN) * if the allocated physical address was above @phys_limit. */ static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes, u64 phys_limit) { unsigned int order = get_order(bytes); struct page *page; phys_addr_t paddr; void *vaddr; page = alloc_pages(gfp, order); if (!page) return NULL; paddr = page_to_phys(page); if (paddr + bytes - 1 > phys_limit) { __free_pages(page, order); return ERR_PTR(-EAGAIN); } vaddr = phys_to_virt(paddr); if (set_memory_decrypted((unsigned long)vaddr, PFN_UP(bytes))) goto error; return page; error: /* Intentional leak if pages cannot be encrypted again. */ if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes))) __free_pages(page, order); return NULL; } /** * swiotlb_alloc_tlb() - allocate a dynamic IO TLB buffer * @dev: Device for which a memory pool is allocated. * @bytes: Size of the buffer. * @phys_limit: Maximum allowed physical address of the buffer. * @gfp: GFP flags for the allocation. * * Return: Allocated pages, or %NULL on allocation failure. */ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes, u64 phys_limit, gfp_t gfp) { struct page *page; /* * Allocate from the atomic pools if memory is encrypted and * the allocation is atomic, because decrypting may block. */ if (!gfpflags_allow_blocking(gfp) && dev && force_dma_unencrypted(dev)) { void *vaddr; if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL)) return NULL; return dma_alloc_from_pool(dev, bytes, &vaddr, gfp, dma_coherent_ok); } gfp &= ~GFP_ZONEMASK; if (phys_limit <= zone_dma_limit) gfp |= __GFP_DMA; else if (phys_limit <= DMA_BIT_MASK(32)) gfp |= __GFP_DMA32; while (IS_ERR(page = alloc_dma_pages(gfp, bytes, phys_limit))) { if (IS_ENABLED(CONFIG_ZONE_DMA32) && phys_limit < DMA_BIT_MASK(64) && !(gfp & (__GFP_DMA32 | __GFP_DMA))) gfp |= __GFP_DMA32; else if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & __GFP_DMA)) gfp = (gfp & ~__GFP_DMA32) | __GFP_DMA; else return NULL; } return page; } /** * swiotlb_free_tlb() - free a dynamically allocated IO TLB buffer * @vaddr: Virtual address of the buffer. * @bytes: Size of the buffer. */ static void swiotlb_free_tlb(void *vaddr, size_t bytes) { if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && dma_free_from_pool(NULL, vaddr, bytes)) return; /* Intentional leak if pages cannot be encrypted again. */ if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes))) __free_pages(virt_to_page(vaddr), get_order(bytes)); } /** * swiotlb_alloc_pool() - allocate a new IO TLB memory pool * @dev: Device for which a memory pool is allocated. * @minslabs: Minimum number of slabs. * @nslabs: Desired (maximum) number of slabs. * @nareas: Number of areas. * @phys_limit: Maximum DMA buffer physical address. * @gfp: GFP flags for the allocations. * * Allocate and initialize a new IO TLB memory pool. The actual number of * slabs may be reduced if allocation of @nslabs fails. If even * @minslabs cannot be allocated, this function fails. * * Return: New memory pool, or %NULL on allocation failure. 
*/ static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev, unsigned long minslabs, unsigned long nslabs, unsigned int nareas, u64 phys_limit, gfp_t gfp) { struct io_tlb_pool *pool; unsigned int slot_order; struct page *tlb; size_t pool_size; size_t tlb_size; if (nslabs > SLABS_PER_PAGE << MAX_PAGE_ORDER) { nslabs = SLABS_PER_PAGE << MAX_PAGE_ORDER; nareas = limit_nareas(nareas, nslabs); } pool_size = sizeof(*pool) + array_size(sizeof(*pool->areas), nareas); pool = kzalloc(pool_size, gfp); if (!pool) goto error; pool->areas = (void *)pool + sizeof(*pool); tlb_size = nslabs << IO_TLB_SHIFT; while (!(tlb = swiotlb_alloc_tlb(dev, tlb_size, phys_limit, gfp))) { if (nslabs <= minslabs) goto error_tlb; nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE); nareas = limit_nareas(nareas, nslabs); tlb_size = nslabs << IO_TLB_SHIFT; } slot_order = get_order(array_size(sizeof(*pool->slots), nslabs)); pool->slots = (struct io_tlb_slot *) __get_free_pages(gfp, slot_order); if (!pool->slots) goto error_slots; swiotlb_init_io_tlb_pool(pool, page_to_phys(tlb), nslabs, true, nareas); return pool; error_slots: swiotlb_free_tlb(page_address(tlb), tlb_size); error_tlb: kfree(pool); error: return NULL; } /** * swiotlb_dyn_alloc() - dynamic memory pool allocation worker * @work: Pointer to dyn_alloc in struct io_tlb_mem. */ static void swiotlb_dyn_alloc(struct work_struct *work) { struct io_tlb_mem *mem = container_of(work, struct io_tlb_mem, dyn_alloc); struct io_tlb_pool *pool; pool = swiotlb_alloc_pool(NULL, IO_TLB_MIN_SLABS, default_nslabs, default_nareas, mem->phys_limit, GFP_KERNEL); if (!pool) { pr_warn_ratelimited("Failed to allocate new pool"); return; } add_mem_pool(mem, pool); } /** * swiotlb_dyn_free() - RCU callback to free a memory pool * @rcu: RCU head in the corresponding struct io_tlb_pool. */ static void swiotlb_dyn_free(struct rcu_head *rcu) { struct io_tlb_pool *pool = container_of(rcu, struct io_tlb_pool, rcu); size_t slots_size = array_size(sizeof(*pool->slots), pool->nslabs); size_t tlb_size = pool->end - pool->start; free_pages((unsigned long)pool->slots, get_order(slots_size)); swiotlb_free_tlb(pool->vaddr, tlb_size); kfree(pool); } /** * __swiotlb_find_pool() - find the IO TLB pool for a physical address * @dev: Device which has mapped the DMA buffer. * @paddr: Physical address within the DMA buffer. * * Find the IO TLB memory pool descriptor which contains the given physical * address, if any. This function is for use only when the dev is known to * be using swiotlb. Use swiotlb_find_pool() for the more general case * when this condition is not met. * * Return: Memory pool which contains @paddr, or %NULL if none. */ struct io_tlb_pool *__swiotlb_find_pool(struct device *dev, phys_addr_t paddr) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; struct io_tlb_pool *pool; rcu_read_lock(); list_for_each_entry_rcu(pool, &mem->pools, node) { if (paddr >= pool->start && paddr < pool->end) goto out; } list_for_each_entry_rcu(pool, &dev->dma_io_tlb_pools, node) { if (paddr >= pool->start && paddr < pool->end) goto out; } pool = NULL; out: rcu_read_unlock(); return pool; } /** * swiotlb_del_pool() - remove an IO TLB pool from a device * @dev: Owning device. * @pool: Memory pool to be removed. 
*/ static void swiotlb_del_pool(struct device *dev, struct io_tlb_pool *pool) { unsigned long flags; spin_lock_irqsave(&dev->dma_io_tlb_lock, flags); list_del_rcu(&pool->node); spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags); call_rcu(&pool->rcu, swiotlb_dyn_free); } #endif /* CONFIG_SWIOTLB_DYNAMIC */ /** * swiotlb_dev_init() - initialize swiotlb fields in &struct device * @dev: Device to be initialized. */ void swiotlb_dev_init(struct device *dev) { dev->dma_io_tlb_mem = &io_tlb_default_mem; #ifdef CONFIG_SWIOTLB_DYNAMIC INIT_LIST_HEAD(&dev->dma_io_tlb_pools); spin_lock_init(&dev->dma_io_tlb_lock); dev->dma_uses_io_tlb = false; #endif } /** * swiotlb_align_offset() - Get required offset into an IO TLB allocation. * @dev: Owning device. * @align_mask: Allocation alignment mask. * @addr: DMA address. * * Return the minimum offset from the start of an IO TLB allocation which is * required for a given buffer address and allocation alignment to keep the * device happy. * * First, the address bits covered by min_align_mask must be identical in the * original address and the bounce buffer address. High bits are preserved by * choosing a suitable IO TLB slot, but bits below IO_TLB_SHIFT require extra * padding bytes before the bounce buffer. * * Second, @align_mask specifies which bits of the first allocated slot must * be zero. This may require allocating additional padding slots, and then the * offset (in bytes) from the first such padding slot is returned. */ static unsigned int swiotlb_align_offset(struct device *dev, unsigned int align_mask, u64 addr) { return addr & dma_get_min_align_mask(dev) & (align_mask | (IO_TLB_SIZE - 1)); } /* * Bounce: copy the swiotlb buffer from or back to the original dma location */ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir, struct io_tlb_pool *mem) { int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = mem->slots[index].orig_addr; size_t alloc_size = mem->slots[index].alloc_size; unsigned long pfn = PFN_DOWN(orig_addr); unsigned char *vaddr = mem->vaddr + tlb_addr - mem->start; int tlb_offset; if (orig_addr == INVALID_PHYS_ADDR) return; if (dir == DMA_FROM_DEVICE && !dev_is_dma_coherent(dev)) arch_sync_dma_flush(); /* * It's valid for tlb_offset to be negative. This can happen when the * "offset" returned by swiotlb_align_offset() is non-zero, and the * tlb_addr is pointing within the first "offset" bytes of the second * or subsequent slots of the allocated swiotlb area. While it's not * valid for tlb_addr to be pointing within the first "offset" bytes * of the first slot, there's no way to check for such an error since * this function can't distinguish the first slot from the second and * subsequent slots. */ tlb_offset = (tlb_addr & (IO_TLB_SIZE - 1)) - swiotlb_align_offset(dev, 0, orig_addr); orig_addr += tlb_offset; alloc_size -= tlb_offset; if (size > alloc_size) { dev_WARN_ONCE(dev, 1, "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu.\n", alloc_size, size); size = alloc_size; } if (PageHighMem(pfn_to_page(pfn))) { unsigned int offset = orig_addr & ~PAGE_MASK; struct page *page; unsigned int sz = 0; unsigned long flags; while (size) { sz = min_t(size_t, PAGE_SIZE - offset, size); local_irq_save(flags); page = pfn_to_page(pfn); if (dir == DMA_TO_DEVICE) { /* * Ideally, kmsan_check_highmem_page() * could be used here to detect infoleaks, * but callers may map uninitialized buffers * that will be written by the device, * causing false positives. 
*/ memcpy_from_page(vaddr, page, offset, sz); } else { kmsan_unpoison_memory(vaddr, sz); memcpy_to_page(page, offset, vaddr, sz); } local_irq_restore(flags); size -= sz; pfn++; vaddr += sz; offset = 0; } } else if (dir == DMA_TO_DEVICE) { /* * Ideally, kmsan_check_memory() could be used here to detect * infoleaks (uninitialized data being sent to device), but * callers may map uninitialized buffers that will be written * by the device, causing false positives. */ memcpy(vaddr, phys_to_virt(orig_addr), size); } else { kmsan_unpoison_memory(vaddr, size); memcpy(phys_to_virt(orig_addr), vaddr, size); } } static inline phys_addr_t slot_addr(phys_addr_t start, phys_addr_t idx) { return start + (idx << IO_TLB_SHIFT); } /* * Carefully handle integer overflow which can occur when boundary_mask == ~0UL. */ static inline unsigned long get_max_slots(unsigned long boundary_mask) { return (boundary_mask >> IO_TLB_SHIFT) + 1; } static unsigned int wrap_area_index(struct io_tlb_pool *mem, unsigned int index) { if (index >= mem->area_nslabs) return 0; return index; } /* * Track the total used slots with a global atomic value in order to have * correct information to determine the high water mark. The mem_used() * function gives imprecise results because there's no locking across * multiple areas. */ #ifdef CONFIG_DEBUG_FS static void inc_used_and_hiwater(struct io_tlb_mem *mem, unsigned int nslots) { unsigned long old_hiwater, new_used; new_used = atomic_long_add_return(nslots, &mem->total_used); old_hiwater = atomic_long_read(&mem->used_hiwater); do { if (new_used <= old_hiwater) break; } while (!atomic_long_try_cmpxchg(&mem->used_hiwater, &old_hiwater, new_used)); } static void dec_used(struct io_tlb_mem *mem, unsigned int nslots) { atomic_long_sub(nslots, &mem->total_used); } #else /* !CONFIG_DEBUG_FS */ static void inc_used_and_hiwater(struct io_tlb_mem *mem, unsigned int nslots) { } static void dec_used(struct io_tlb_mem *mem, unsigned int nslots) { } #endif /* CONFIG_DEBUG_FS */ #ifdef CONFIG_SWIOTLB_DYNAMIC #ifdef CONFIG_DEBUG_FS static void inc_transient_used(struct io_tlb_mem *mem, unsigned int nslots) { atomic_long_add(nslots, &mem->transient_nslabs); } static void dec_transient_used(struct io_tlb_mem *mem, unsigned int nslots) { atomic_long_sub(nslots, &mem->transient_nslabs); } #else /* !CONFIG_DEBUG_FS */ static void inc_transient_used(struct io_tlb_mem *mem, unsigned int nslots) { } static void dec_transient_used(struct io_tlb_mem *mem, unsigned int nslots) { } #endif /* CONFIG_DEBUG_FS */ #endif /* CONFIG_SWIOTLB_DYNAMIC */ /** * swiotlb_search_pool_area() - search one memory area in one pool * @dev: Device which maps the buffer. * @pool: Memory pool to be searched. * @area_index: Index of the IO TLB memory area to be searched. * @orig_addr: Original (non-bounced) IO buffer address. * @alloc_size: Total requested size of the bounce buffer, * including initial alignment padding. * @alloc_align_mask: Required alignment of the allocated buffer. * * Find a suitable sequence of IO TLB entries for the request and allocate * a buffer from the given IO TLB memory area. * This function takes care of locking. * * Return: Index of the first allocated slot, or -1 on error. 
*/ static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool, int area_index, phys_addr_t orig_addr, size_t alloc_size, unsigned int alloc_align_mask) { struct io_tlb_area *area = pool->areas + area_index; unsigned long boundary_mask = dma_get_seg_boundary(dev); dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(dev, pool->start) & boundary_mask; unsigned long max_slots = get_max_slots(boundary_mask); unsigned int iotlb_align_mask = dma_get_min_align_mask(dev); unsigned int nslots = nr_slots(alloc_size), stride; unsigned int offset = swiotlb_align_offset(dev, 0, orig_addr); unsigned int index, slots_checked, count = 0, i; unsigned long flags; unsigned int slot_base; unsigned int slot_index; BUG_ON(!nslots); BUG_ON(area_index >= pool->nareas); /* * Historically, swiotlb allocations >= PAGE_SIZE were guaranteed to be * page-aligned in the absence of any other alignment requirements. * 'alloc_align_mask' was later introduced to specify the alignment * explicitly, however this is passed as zero for streaming mappings * and so we preserve the old behaviour there in case any drivers are * relying on it. */ if (!alloc_align_mask && !iotlb_align_mask && alloc_size >= PAGE_SIZE) alloc_align_mask = PAGE_SIZE - 1; /* * Ensure that the allocation is at least slot-aligned and update * 'iotlb_align_mask' to ignore bits that will be preserved when * offsetting into the allocation. */ alloc_align_mask |= (IO_TLB_SIZE - 1); iotlb_align_mask &= ~alloc_align_mask; /* * For mappings with an alignment requirement don't bother looping to * unaligned slots once we found an aligned one. */ stride = get_max_slots(max(alloc_align_mask, iotlb_align_mask)); spin_lock_irqsave(&area->lock, flags); if (unlikely(nslots > pool->area_nslabs - area->used)) goto not_found; slot_base = area_index * pool->area_nslabs; index = area->index; for (slots_checked = 0; slots_checked < pool->area_nslabs; ) { phys_addr_t tlb_addr; slot_index = slot_base + index; tlb_addr = slot_addr(tbl_dma_addr, slot_index); if ((tlb_addr & alloc_align_mask) || (orig_addr && (tlb_addr & iotlb_align_mask) != (orig_addr & iotlb_align_mask))) { index = wrap_area_index(pool, index + 1); slots_checked++; continue; } if (!iommu_is_span_boundary(slot_index, nslots, nr_slots(tbl_dma_addr), max_slots)) { if (pool->slots[slot_index].list >= nslots) goto found; } index = wrap_area_index(pool, index + stride); slots_checked += stride; } not_found: spin_unlock_irqrestore(&area->lock, flags); return -1; found: /* * If we find a slot that indicates we have 'nslots' number of * contiguous buffers, we allocate the buffers from that slot onwards * and set the list of free entries to '0' indicating unavailable. */ for (i = slot_index; i < slot_index + nslots; i++) { pool->slots[i].list = 0; pool->slots[i].alloc_size = alloc_size - (offset + ((i - slot_index) << IO_TLB_SHIFT)); } for (i = slot_index - 1; io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && pool->slots[i].list; i--) pool->slots[i].list = ++count; /* * Update the indices to avoid searching in the next round. */ area->index = wrap_area_index(pool, index + nslots); area->used += nslots; spin_unlock_irqrestore(&area->lock, flags); inc_used_and_hiwater(dev->dma_io_tlb_mem, nslots); return slot_index; } #ifdef CONFIG_SWIOTLB_DYNAMIC /** * swiotlb_search_area() - search one memory area in all pools * @dev: Device which maps the buffer. * @start_cpu: Start CPU number. * @cpu_offset: Offset from @start_cpu. * @orig_addr: Original (non-bounced) IO buffer address. 
* @alloc_size: Total requested size of the bounce buffer, * including initial alignment padding. * @alloc_align_mask: Required alignment of the allocated buffer. * @retpool: Used memory pool, updated on return. * * Search one memory area in all pools for a sequence of slots that match the * allocation constraints. * * Return: Index of the first allocated slot, or -1 on error. */ static int swiotlb_search_area(struct device *dev, int start_cpu, int cpu_offset, phys_addr_t orig_addr, size_t alloc_size, unsigned int alloc_align_mask, struct io_tlb_pool **retpool) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; struct io_tlb_pool *pool; int area_index; int index = -1; rcu_read_lock(); list_for_each_entry_rcu(pool, &mem->pools, node) { if (cpu_offset >= pool->nareas) continue; area_index = (start_cpu + cpu_offset) & (pool->nareas - 1); index = swiotlb_search_pool_area(dev, pool, area_index, orig_addr, alloc_size, alloc_align_mask); if (index >= 0) { *retpool = pool; break; } } rcu_read_unlock(); return index; } /** * swiotlb_find_slots() - search for slots in the whole swiotlb * @dev: Device which maps the buffer. * @orig_addr: Original (non-bounced) IO buffer address. * @alloc_size: Total requested size of the bounce buffer, * including initial alignment padding. * @alloc_align_mask: Required alignment of the allocated buffer. * @retpool: Used memory pool, updated on return. * * Search through the whole software IO TLB to find a sequence of slots that * match the allocation constraints. * * Return: Index of the first allocated slot, or -1 on error. */ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, size_t alloc_size, unsigned int alloc_align_mask, struct io_tlb_pool **retpool) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; struct io_tlb_pool *pool; unsigned long nslabs; unsigned long flags; u64 phys_limit; int cpu, i; int index; if (alloc_size > IO_TLB_SEGSIZE * IO_TLB_SIZE) return -1; cpu = raw_smp_processor_id(); for (i = 0; i < default_nareas; ++i) { index = swiotlb_search_area(dev, cpu, i, orig_addr, alloc_size, alloc_align_mask, &pool); if (index >= 0) goto found; } if (!mem->can_grow) return -1; schedule_work(&mem->dyn_alloc); nslabs = nr_slots(alloc_size); phys_limit = min_not_zero(*dev->dma_mask, dev->bus_dma_limit); pool = swiotlb_alloc_pool(dev, nslabs, nslabs, 1, phys_limit, GFP_NOWAIT); if (!pool) return -1; index = swiotlb_search_pool_area(dev, pool, 0, orig_addr, alloc_size, alloc_align_mask); if (index < 0) { swiotlb_dyn_free(&pool->rcu); return -1; } pool->transient = true; spin_lock_irqsave(&dev->dma_io_tlb_lock, flags); list_add_rcu(&pool->node, &dev->dma_io_tlb_pools); spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags); inc_transient_used(mem, pool->nslabs); found: WRITE_ONCE(dev->dma_uses_io_tlb, true); /* * The general barrier orders reads and writes against a presumed store * of the SWIOTLB buffer address by a device driver (to a driver private * data structure). It serves two purposes. * * First, the store to dev->dma_uses_io_tlb must be ordered before the * presumed store. This guarantees that the returned buffer address * cannot be passed to another CPU before updating dev->dma_uses_io_tlb. * * Second, the load from mem->pools must be ordered before the same * presumed store. This guarantees that the returned buffer address * cannot be observed by another CPU before an update of the RCU list * that was made by swiotlb_dyn_alloc() on a third CPU (cf. multicopy * atomicity). * * See also the comment in swiotlb_find_pool(). 
*/ smp_mb(); *retpool = pool; return index; } #else /* !CONFIG_SWIOTLB_DYNAMIC */ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, size_t alloc_size, unsigned int alloc_align_mask, struct io_tlb_pool **retpool) { struct io_tlb_pool *pool; int start, i; int index; *retpool = pool = &dev->dma_io_tlb_mem->defpool; i = start = raw_smp_processor_id() & (pool->nareas - 1); do { index = swiotlb_search_pool_area(dev, pool, i, orig_addr, alloc_size, alloc_align_mask); if (index >= 0) return index; if (++i >= pool->nareas) i = 0; } while (i != start); return -1; } #endif /* CONFIG_SWIOTLB_DYNAMIC */ #ifdef CONFIG_DEBUG_FS /** * mem_used() - get number of used slots in an allocator * @mem: Software IO TLB allocator. * * The result is accurate in this version of the function, because an atomic * counter is available if CONFIG_DEBUG_FS is set. * * Return: Number of used slots. */ static unsigned long mem_used(struct io_tlb_mem *mem) { return atomic_long_read(&mem->total_used); } #else /* !CONFIG_DEBUG_FS */ /** * mem_pool_used() - get number of used slots in a memory pool * @pool: Software IO TLB memory pool. * * The result is not accurate, see mem_used(). * * Return: Approximate number of used slots. */ static unsigned long mem_pool_used(struct io_tlb_pool *pool) { int i; unsigned long used = 0; for (i = 0; i < pool->nareas; i++) used += pool->areas[i].used; return used; } /** * mem_used() - get number of used slots in an allocator * @mem: Software IO TLB allocator. * * The result is not accurate, because there is no locking of individual * areas. * * Return: Approximate number of used slots. */ static unsigned long mem_used(struct io_tlb_mem *mem) { #ifdef CONFIG_SWIOTLB_DYNAMIC struct io_tlb_pool *pool; unsigned long used = 0; rcu_read_lock(); list_for_each_entry_rcu(pool, &mem->pools, node) used += mem_pool_used(pool); rcu_read_unlock(); return used; #else return mem_pool_used(&mem->defpool); #endif } #endif /* CONFIG_DEBUG_FS */ /** * swiotlb_tbl_map_single() - bounce buffer map a single contiguous physical area * @dev: Device which maps the buffer. * @orig_addr: Original (non-bounced) physical IO buffer address * @mapping_size: Requested size of the actual bounce buffer, excluding * any pre- or post-padding for alignment * @alloc_align_mask: Required start and end alignment of the allocated buffer * @dir: DMA direction * @attrs: Optional DMA attributes for the map operation * * Find and allocate a suitable sequence of IO TLB slots for the request. * The allocated space starts at an alignment specified by alloc_align_mask, * and the size of the allocated space is rounded up so that the total amount * of allocated space is a multiple of (alloc_align_mask + 1). If * alloc_align_mask is zero, the allocated space may be at any alignment and * the size is not rounded up. * * The returned address is within the allocated space and matches the bits * of orig_addr that are specified in the DMA min_align_mask for the device. As * such, this returned address may be offset from the beginning of the allocated * space. The bounce buffer space starting at the returned address for * mapping_size bytes is initialized to the contents of the original IO buffer * area. Any pre-padding (due to an offset) and any post-padding (due to * rounding-up the size) is not initialized. 
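 *
 * Worked example (illustrative numbers only): assume a device min_align_mask
 * of 0xfff, alloc_align_mask of 0, orig_addr of 0x12345678 and mapping_size
 * of 0x2000. The low 12 bits of orig_addr (0x678) must reappear in the
 * returned address, so the slot search allocates room for 0x678 bytes of
 * pre-padding plus the 0x2000-byte mapping (rounded up to whole IO_TLB_SIZE
 * slots) and returns a tlb_addr ending in 0x678. Only the 0x2000 bytes
 * starting at that address are copied from the original buffer; the 0x678
 * padding bytes in front of it stay uninitialized.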
*/ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, size_t mapping_size, unsigned int alloc_align_mask, enum dma_data_direction dir, unsigned long attrs) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; unsigned int offset; struct io_tlb_pool *pool; unsigned int i; size_t size; int index; phys_addr_t tlb_addr; unsigned short pad_slots; if (!mem || !mem->nslabs) { dev_warn_ratelimited(dev, "Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); return (phys_addr_t)DMA_MAPPING_ERROR; } if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n"); /* * The default swiotlb memory pool is allocated with PAGE_SIZE * alignment. If a mapping is requested with larger alignment, * the mapping may be unable to use the initial slot(s) in all * sets of IO_TLB_SEGSIZE slots. In such case, a mapping request * of or near the maximum mapping size would always fail. */ dev_WARN_ONCE(dev, alloc_align_mask > ~PAGE_MASK, "Alloc alignment may prevent fulfilling requests with max mapping_size\n"); offset = swiotlb_align_offset(dev, alloc_align_mask, orig_addr); size = ALIGN(mapping_size + offset, alloc_align_mask + 1); index = swiotlb_find_slots(dev, orig_addr, size, alloc_align_mask, &pool); if (index == -1) { if (!(attrs & DMA_ATTR_NO_WARN)) dev_warn_ratelimited(dev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", size, mem->nslabs, mem_used(mem)); return (phys_addr_t)DMA_MAPPING_ERROR; } /* * If dma_skip_sync was set, reset it on first SWIOTLB buffer * mapping to always sync SWIOTLB buffers. */ dma_reset_need_sync(dev); /* * Save away the mapping from the original address to the DMA address. * This is needed when we sync the memory. Then we sync the buffer if * needed. */ pad_slots = offset >> IO_TLB_SHIFT; offset &= (IO_TLB_SIZE - 1); index += pad_slots; pool->slots[index].pad_slots = pad_slots; for (i = 0; i < (nr_slots(size) - pad_slots); i++) pool->slots[index + i].orig_addr = slot_addr(orig_addr, i); tlb_addr = slot_addr(pool->start, index) + offset; /* * When the device is writing memory, i.e. dir == DMA_FROM_DEVICE, copy * the original buffer to the TLB buffer before initiating DMA in order * to preserve the original's data if the device does a partial write, * i.e. if the device doesn't overwrite the entire buffer. Preserving * the original data, even if it's garbage, is necessary to match * hardware behavior. Use of swiotlb is supposed to be transparent, * i.e. swiotlb must not corrupt memory by clobbering unwritten bytes. */ swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE, pool); return tlb_addr; } static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr, struct io_tlb_pool *mem) { unsigned long flags; unsigned int offset = swiotlb_align_offset(dev, 0, tlb_addr); int index, nslots, aindex; struct io_tlb_area *area; int count, i; index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT; index -= mem->slots[index].pad_slots; nslots = nr_slots(mem->slots[index].alloc_size + offset); aindex = index / mem->area_nslabs; area = &mem->areas[aindex]; /* * Return the buffer to the free list by setting the corresponding * entries to indicate the number of contiguous entries available. * While returning the entries to the free list, we merge the entries * with slots below and above the pool being returned. 
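 *
 * Worked example (illustrative indices only): say a 3-slot buffer starting at
 * slot 4 is being freed and slots 7 and 8 of the same IO_TLB_SEGSIZE segment
 * are already free with list values 2 and 1. Step 1 below walks slots 6, 5, 4
 * assigning list values 3, 4, 5; step 2 then walks any free slots below slot
 * 4, continuing the count. Each free slot thus records how many contiguous
 * free slots remain from it to the end of the free run, which is exactly what
 * swiotlb_search_pool_area() compares against nslots.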
*/ BUG_ON(aindex >= mem->nareas); spin_lock_irqsave(&area->lock, flags); if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE)) count = mem->slots[index + nslots].list; else count = 0; /* * Step 1: return the slots to the free list, merging the slots with * succeeding slots */ for (i = index + nslots - 1; i >= index; i--) { mem->slots[i].list = ++count; mem->slots[i].orig_addr = INVALID_PHYS_ADDR; mem->slots[i].alloc_size = 0; mem->slots[i].pad_slots = 0; } /* * Step 2: merge the returned slots with the preceding slots, if * available (non-zero) */ for (i = index - 1; io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->slots[i].list; i--) mem->slots[i].list = ++count; area->used -= nslots; spin_unlock_irqrestore(&area->lock, flags); dec_used(dev->dma_io_tlb_mem, nslots); } #ifdef CONFIG_SWIOTLB_DYNAMIC /** * swiotlb_del_transient() - delete a transient memory pool * @dev: Device which mapped the buffer. * @tlb_addr: Physical address within a bounce buffer. * @pool: Pointer to the transient memory pool to be checked and deleted. * * Check whether the address belongs to a transient SWIOTLB memory pool. * If yes, then delete the pool. * * Return: %true if @tlb_addr belonged to a transient pool that was released. */ static bool swiotlb_del_transient(struct device *dev, phys_addr_t tlb_addr, struct io_tlb_pool *pool) { if (!pool->transient) return false; dec_used(dev->dma_io_tlb_mem, pool->nslabs); swiotlb_del_pool(dev, pool); dec_transient_used(dev->dma_io_tlb_mem, pool->nslabs); return true; } #else /* !CONFIG_SWIOTLB_DYNAMIC */ static inline bool swiotlb_del_transient(struct device *dev, phys_addr_t tlb_addr, struct io_tlb_pool *pool) { return false; } #endif /* CONFIG_SWIOTLB_DYNAMIC */ /* * tlb_addr is the physical address of the bounce buffer to unmap. */ void __swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr, size_t mapping_size, enum dma_data_direction dir, unsigned long attrs, struct io_tlb_pool *pool) { /* * First, sync the memory before unmapping the entry */ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_FROM_DEVICE, pool); if (swiotlb_del_transient(dev, tlb_addr, pool)) return; swiotlb_release_slots(dev, tlb_addr, pool); } void __swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir, struct io_tlb_pool *pool) { if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE, pool); else BUG_ON(dir != DMA_FROM_DEVICE); } void __swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir, struct io_tlb_pool *pool) { if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) swiotlb_bounce(dev, tlb_addr, size, DMA_FROM_DEVICE, pool); else BUG_ON(dir != DMA_TO_DEVICE); } /* * Create a swiotlb mapping for the buffer at @paddr, and in case of DMAing * to the device copy the data into it as well.
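 *
 * Illustrative caller sketch (simplified, not verbatim dma-direct code): the
 * direct-mapping path falls back to this function when the device cannot
 * address the original buffer, or bouncing is forced, roughly:
 *
 *	if (is_swiotlb_force_bounce(dev) ||
 *	    !dma_capable(dev, phys_to_dma(dev, phys), size, true))
 *		return swiotlb_map(dev, phys, size, dir, attrs);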
*/ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, enum dma_data_direction dir, unsigned long attrs) { phys_addr_t swiotlb_addr; dma_addr_t dma_addr; trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size); swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, 0, dir, attrs); if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; /* Ensure that the address returned is DMA'ble */ dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr); if (unlikely(!dma_capable(dev, dma_addr, size, true))) { __swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC, swiotlb_find_pool(dev, swiotlb_addr)); dev_WARN_ONCE(dev, 1, "swiotlb addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); return DMA_MAPPING_ERROR; } if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { arch_sync_dma_for_device(swiotlb_addr, size, dir); arch_sync_dma_flush(); } return dma_addr; } size_t swiotlb_max_mapping_size(struct device *dev) { int min_align_mask = dma_get_min_align_mask(dev); int min_align = 0; /* * swiotlb_find_slots() skips slots according to * min align mask. This affects max mapping size. * Take it into account here. */ if (min_align_mask) min_align = roundup(min_align_mask, IO_TLB_SIZE); return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE - min_align; } /** * is_swiotlb_allocated() - check if the default software IO TLB is initialized */ bool is_swiotlb_allocated(void) { return io_tlb_default_mem.nslabs; } bool is_swiotlb_active(struct device *dev) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; return mem && mem->nslabs; } /** * default_swiotlb_base() - get the base address of the default SWIOTLB * * Get the lowest physical address used by the default software IO TLB pool. */ phys_addr_t default_swiotlb_base(void) { #ifdef CONFIG_SWIOTLB_DYNAMIC io_tlb_default_mem.can_grow = false; #endif return io_tlb_default_mem.defpool.start; } /** * default_swiotlb_limit() - get the address limit of the default SWIOTLB * * Get the highest physical address used by the default software IO TLB pool.
*/ phys_addr_t default_swiotlb_limit(void) { #ifdef CONFIG_SWIOTLB_DYNAMIC return io_tlb_default_mem.phys_limit; #else return io_tlb_default_mem.defpool.end - 1; #endif } #ifdef CONFIG_DEBUG_FS #ifdef CONFIG_SWIOTLB_DYNAMIC static unsigned long mem_transient_used(struct io_tlb_mem *mem) { return atomic_long_read(&mem->transient_nslabs); } static int io_tlb_transient_used_get(void *data, u64 *val) { struct io_tlb_mem *mem = data; *val = mem_transient_used(mem); return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_transient_used, io_tlb_transient_used_get, NULL, "%llu\n"); #endif /* CONFIG_SWIOTLB_DYNAMIC */ static int io_tlb_used_get(void *data, u64 *val) { struct io_tlb_mem *mem = data; *val = mem_used(mem); return 0; } static int io_tlb_hiwater_get(void *data, u64 *val) { struct io_tlb_mem *mem = data; *val = atomic_long_read(&mem->used_hiwater); return 0; } static int io_tlb_hiwater_set(void *data, u64 val) { struct io_tlb_mem *mem = data; /* Only allow setting to zero */ if (val != 0) return -EINVAL; atomic_long_set(&mem->used_hiwater, val); return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_used, io_tlb_used_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_hiwater, io_tlb_hiwater_get, io_tlb_hiwater_set, "%llu\n"); static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem, const char *dirname) { mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs); if (!mem->nslabs) return; debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs); debugfs_create_file("io_tlb_used", 0400, mem->debugfs, mem, &fops_io_tlb_used); debugfs_create_file("io_tlb_used_hiwater", 0600, mem->debugfs, mem, &fops_io_tlb_hiwater); #ifdef CONFIG_SWIOTLB_DYNAMIC debugfs_create_file("io_tlb_transient_nslabs", 0400, mem->debugfs, mem, &fops_io_tlb_transient_used); #endif } static int __init swiotlb_create_default_debugfs(void) { swiotlb_create_debugfs_files(&io_tlb_default_mem, "swiotlb"); return 0; } late_initcall(swiotlb_create_default_debugfs); #else /* !CONFIG_DEBUG_FS */ static inline void swiotlb_create_debugfs_files(struct io_tlb_mem *mem, const char *dirname) { } #endif /* CONFIG_DEBUG_FS */ #ifdef CONFIG_DMA_RESTRICTED_POOL struct page *swiotlb_alloc(struct device *dev, size_t size) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; struct io_tlb_pool *pool; phys_addr_t tlb_addr; unsigned int align; int index; if (!mem) return NULL; align = (1 << (get_order(size) + PAGE_SHIFT)) - 1; index = swiotlb_find_slots(dev, 0, size, align, &pool); if (index == -1) return NULL; tlb_addr = slot_addr(pool->start, index); if (unlikely(!PAGE_ALIGNED(tlb_addr))) { dev_WARN_ONCE(dev, 1, "Cannot allocate pages from non page-aligned swiotlb addr 0x%pa.\n", &tlb_addr); swiotlb_release_slots(dev, tlb_addr, pool); return NULL; } return pfn_to_page(PFN_DOWN(tlb_addr)); } bool swiotlb_free(struct device *dev, struct page *page, size_t size) { phys_addr_t tlb_addr = page_to_phys(page); struct io_tlb_pool *pool; pool = swiotlb_find_pool(dev, tlb_addr); if (!pool) return false; swiotlb_release_slots(dev, tlb_addr, pool); return true; } static int rmem_swiotlb_device_init(struct reserved_mem *rmem, struct device *dev) { struct io_tlb_mem *mem = rmem->priv; unsigned long nslabs = rmem->size >> IO_TLB_SHIFT; /* Set Per-device io tlb area to one */ unsigned int nareas = 1; if (PageHighMem(pfn_to_page(PHYS_PFN(rmem->base)))) { dev_err(dev, "Restricted DMA pool must be accessible within the linear mapping."); return -EINVAL; } /* * Since multiple devices can share the same pool, the private data, * 
io_tlb_mem struct, will be initialized by the first device attached * to it. */ if (!mem) { struct io_tlb_pool *pool; mem = kzalloc_obj(*mem); if (!mem) return -ENOMEM; pool = &mem->defpool; pool->slots = kzalloc_objs(*pool->slots, nslabs); if (!pool->slots) { kfree(mem); return -ENOMEM; } pool->areas = kzalloc_objs(*pool->areas, nareas); if (!pool->areas) { kfree(pool->slots); kfree(mem); return -ENOMEM; } set_memory_decrypted((unsigned long)phys_to_virt(rmem->base), rmem->size >> PAGE_SHIFT); swiotlb_init_io_tlb_pool(pool, rmem->base, nslabs, false, nareas); mem->force_bounce = true; mem->for_alloc = true; #ifdef CONFIG_SWIOTLB_DYNAMIC spin_lock_init(&mem->lock); INIT_LIST_HEAD_RCU(&mem->pools); #endif add_mem_pool(mem, pool); rmem->priv = mem; swiotlb_create_debugfs_files(mem, rmem->name); } dev->dma_io_tlb_mem = mem; return 0; } static void rmem_swiotlb_device_release(struct reserved_mem *rmem, struct device *dev) { dev->dma_io_tlb_mem = &io_tlb_default_mem; } static int __init rmem_swiotlb_setup(unsigned long node, struct reserved_mem *rmem) { if (of_get_flat_dt_prop(node, "reusable", NULL) || of_get_flat_dt_prop(node, "linux,cma-default", NULL) || of_get_flat_dt_prop(node, "linux,dma-default", NULL) || of_get_flat_dt_prop(node, "no-map", NULL)) return -EINVAL; pr_info("Reserved memory: created restricted DMA pool at %pa, size %ld MiB\n", &rmem->base, (unsigned long)rmem->size / SZ_1M); return 0; } static const struct reserved_mem_ops rmem_swiotlb_ops = { .node_init = rmem_swiotlb_setup, .device_init = rmem_swiotlb_device_init, .device_release = rmem_swiotlb_device_release, }; RESERVEDMEM_OF_DECLARE(dma, "restricted-dma-pool", &rmem_swiotlb_ops); #endif /* CONFIG_DMA_RESTRICTED_POOL */
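/*
 * Illustrative usage sketch (hypothetical driver code, not part of swiotlb.c,
 * assuming <linux/dma-mapping.h> and <linux/io.h>): a streaming DMA transmit.
 * Only dma_map_single(), dma_mapping_error(), dma_unmap_single(), writel() and
 * lower_32_bits() are real kernel APIs; struct my_dev, my_dev_tx() and the
 * MY_DEV_* registers are made up. When the device cannot reach 'buf' directly
 * (or bouncing is forced), these calls go through the swiotlb map/unmap paths
 * above and the data is copied via a bounce buffer transparently.
 */
static int my_dev_tx(struct my_dev *md, void *buf, size_t len)
{
	dma_addr_t dma;

	dma = dma_map_single(md->dev, buf, len, DMA_TO_DEVICE);
	if (dma_mapping_error(md->dev, dma))
		return -ENOMEM;

	/* Hand the (possibly bounced) bus address to the hardware. */
	writel(lower_32_bits(dma), md->regs + MY_DEV_TX_ADDR);
	writel(len, md->regs + MY_DEV_TX_LEN);

	/* ... wait for the transmit to complete ... */

	dma_unmap_single(md->dev, dma, len, DMA_TO_DEVICE);
	return 0;
}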
// SPDX-License-Identifier: GPL-2.0 #include <linux/gfp.h> #include <linux/highmem.h> #include <linux/kernel.h> #include <linux/mmdebug.h> #include <linux/mm_types.h> #include <linux/mm_inline.h> #include <linux/pagemap.h> #include <linux/rcupdate.h> #include <linux/smp.h> #include <linux/swap.h> #include <linux/rmap.h> #include <linux/pgalloc.h> #include <linux/hugetlb.h> #include <asm/tlb.h> #ifndef CONFIG_MMU_GATHER_NO_GATHER static bool tlb_next_batch(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; /* Limit batching if we have delayed rmaps pending */ if (tlb->delayed_rmap && tlb->active != &tlb->local) return false; batch = tlb->active; if (batch->next) { tlb->active = batch->next; return true; } if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) return false; batch = (void *)__get_free_page(GFP_NOWAIT); if (!batch) return false; tlb->batch_count++; batch->next = NULL; batch->nr = 0; batch->max = MAX_GATHER_BATCH; tlb->active->next = batch; tlb->active = batch; return true; } #ifdef CONFIG_SMP static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma) { struct encoded_page **pages = batch->encoded_pages; for (int i = 0; i < batch->nr; i++) { struct encoded_page *enc = pages[i]; if (encoded_page_flags(enc) & ENCODED_PAGE_BIT_DELAY_RMAP) { struct page *page = encoded_page_ptr(enc); unsigned int nr_pages = 1; if
(unlikely(encoded_page_flags(enc) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) nr_pages = encoded_nr_pages(pages[++i]); folio_remove_rmap_ptes(page_folio(page), page, nr_pages, vma); } } } /** * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB * @tlb: the current mmu_gather * @vma: The memory area from which the pages are being removed. * * Note that because of how tlb_next_batch() above works, we will * never start multiple new batches with pending delayed rmaps, so * we only need to walk through the current active batch and the * original local one. */ void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) { if (!tlb->delayed_rmap) return; tlb_flush_rmap_batch(&tlb->local, vma); if (tlb->active != &tlb->local) tlb_flush_rmap_batch(tlb->active, vma); tlb->delayed_rmap = 0; } #endif /* * We might end up freeing a lot of pages. Reschedule on a regular * basis to avoid soft lockups in configurations without full * preemption enabled. The magic number of 512 folios seems to work. */ #define MAX_NR_FOLIOS_PER_FREE 512 static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch) { struct encoded_page **pages = batch->encoded_pages; unsigned int nr, nr_pages; while (batch->nr) { if (!page_poisoning_enabled_static() && !want_init_on_free()) { nr = min(MAX_NR_FOLIOS_PER_FREE, batch->nr); /* * Make sure we cover page + nr_pages, and don't leave * nr_pages behind when capping the number of entries. */ if (unlikely(encoded_page_flags(pages[nr - 1]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) nr++; } else { /* * With page poisoning and init_on_free, the time it * takes to free memory grows proportionally with the * actual memory size. Therefore, limit based on the * actual memory size and not the number of involved * folios. */ for (nr = 0, nr_pages = 0; nr < batch->nr && nr_pages < MAX_NR_FOLIOS_PER_FREE; nr++) { if (unlikely(encoded_page_flags(pages[nr]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) nr_pages += encoded_nr_pages(pages[++nr]); else nr_pages++; } } free_pages_and_swap_cache(pages, nr); pages += nr; batch->nr -= nr; cond_resched(); } } static void tlb_batch_pages_flush(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; for (batch = &tlb->local; batch && batch->nr; batch = batch->next) __tlb_batch_free_encoded_pages(batch); tlb->active = &tlb->local; } static void tlb_batch_list_free(struct mmu_gather *tlb) { struct mmu_gather_batch *batch, *next; for (batch = tlb->local.next; batch; batch = next) { next = batch->next; free_pages((unsigned long)batch, 0); } tlb->local.next = NULL; } static bool __tlb_remove_folio_pages_size(struct mmu_gather *tlb, struct page *page, unsigned int nr_pages, bool delay_rmap, int page_size) { int flags = delay_rmap ? ENCODED_PAGE_BIT_DELAY_RMAP : 0; struct mmu_gather_batch *batch; VM_BUG_ON(!tlb->end); #ifdef CONFIG_MMU_GATHER_PAGE_SIZE VM_WARN_ON(tlb->page_size != page_size); VM_WARN_ON_ONCE(nr_pages != 1 && page_size != PAGE_SIZE); VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1)); #endif batch = tlb->active; /* * Add the page and check if we are full. If so * force a flush. */ if (likely(nr_pages == 1)) { batch->encoded_pages[batch->nr++] = encode_page(page, flags); } else { flags |= ENCODED_PAGE_BIT_NR_PAGES_NEXT; batch->encoded_pages[batch->nr++] = encode_page(page, flags); batch->encoded_pages[batch->nr++] = encode_nr_pages(nr_pages); } /* * Make sure that we can always add another "page" + "nr_pages", * requiring two entries instead of only a single one. 
*/ if (batch->nr >= batch->max - 1) { if (!tlb_next_batch(tlb)) return true; batch = tlb->active; } VM_BUG_ON_PAGE(batch->nr > batch->max - 1, page); return false; } bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, unsigned int nr_pages, bool delay_rmap) { return __tlb_remove_folio_pages_size(tlb, page, nr_pages, delay_rmap, PAGE_SIZE); } bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { return __tlb_remove_folio_pages_size(tlb, page, 1, false, page_size); } #endif /* MMU_GATHER_NO_GATHER */ #ifdef CONFIG_MMU_GATHER_TABLE_FREE static void __tlb_remove_table_free(struct mmu_table_batch *batch) { int i; for (i = 0; i < batch->nr; i++) __tlb_remove_table(batch->tables[i]); free_page((unsigned long)batch); } #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE /* * Semi RCU freeing of the page directories. * * This is needed by some architectures to implement software pagetable walkers. * * gup_fast() and other software pagetable walkers do a lockless page-table * walk and therefore needs some synchronization with the freeing of the page * directories. The chosen means to accomplish that is by disabling IRQs over * the walk. * * Architectures that use IPIs to flush TLBs will then automagically DTRT, * since we unlink the page, flush TLBs, free the page. Since the disabling of * IRQs delays the completion of the TLB flush we can never observe an already * freed page. * * Not all systems IPI every CPU for this purpose: * * - Some architectures have HW support for cross-CPU synchronisation of TLB * flushes, so there's no IPI at all. * * - Paravirt guests can do this TLB flushing in the hypervisor, or coordinate * with the hypervisor to defer flushing on preempted vCPUs. * * Such systems need to delay the freeing by some other means, this is that * means. * * What we do is batch the freed directory pages (tables) and RCU free them. * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling * holds off grace periods. * * However, in order to batch these pages we need to allocate storage, this * allocation is deep inside the MM code and can thus easily fail on memory * pressure. To guarantee progress we fall back to single table freeing, see * the implementation of tlb_remove_table_one(). * */ static void tlb_remove_table_smp_sync(void *arg) { /* Simply deliver the interrupt */ } void tlb_remove_table_sync_one(void) { /* * This isn't an RCU grace period and hence the page-tables cannot be * assumed to be actually RCU-freed. * * It is however sufficient for software page-table walkers that rely on * IRQ disabling. */ smp_call_function(tlb_remove_table_smp_sync, NULL, 1); } static void tlb_remove_table_rcu(struct rcu_head *head) { __tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu)); } static void tlb_remove_table_free(struct mmu_table_batch *batch) { call_rcu(&batch->rcu, tlb_remove_table_rcu); } /** * tlb_remove_table_sync_rcu - synchronize with software page-table walkers * * Like tlb_remove_table_sync_one() but uses RCU grace period instead of IPI * broadcast. Use in slow paths where sleeping is acceptable. * * Software/Lockless page-table walkers use local_irq_disable(), which is also * an RCU read-side critical section. synchronize_rcu() waits for all such * sections, providing the same guarantee as tlb_remove_table_sync_one() but * without disrupting all CPUs with IPIs. * * Do not use for freeing memory. Use RCU callbacks instead to avoid latency * spikes. 
*/ void tlb_remove_table_sync_rcu(void) { synchronize_rcu(); } #else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */ static void tlb_remove_table_free(struct mmu_table_batch *batch) { __tlb_remove_table_free(batch); } #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */ /* * If we want tlb_remove_table() to imply TLB invalidates. */ static inline void tlb_table_invalidate(struct mmu_gather *tlb) { if (tlb_needs_table_invalidate()) { /* * Invalidate page-table caches used by hardware walkers. Then * we still need to RCU-sched wait while freeing the pages * because software walkers can still be in-flight. */ tlb_flush_mmu_tlbonly(tlb); } } #ifdef CONFIG_PT_RECLAIM static inline void __tlb_remove_table_one_rcu(struct rcu_head *head) { struct ptdesc *ptdesc; ptdesc = container_of(head, struct ptdesc, pt_rcu_head); __tlb_remove_table(ptdesc); } static inline void __tlb_remove_table_one(void *table) { struct ptdesc *ptdesc; ptdesc = table; call_rcu(&ptdesc->pt_rcu_head, __tlb_remove_table_one_rcu); } #else static inline void __tlb_remove_table_one(void *table) { tlb_remove_table_sync_rcu(); __tlb_remove_table(table); } #endif /* CONFIG_PT_RECLAIM */ static void tlb_remove_table_one(void *table) { __tlb_remove_table_one(table); } static void tlb_table_flush(struct mmu_gather *tlb) { struct mmu_table_batch **batch = &tlb->batch; if (*batch) { tlb_table_invalidate(tlb); tlb_remove_table_free(*batch); *batch = NULL; } } void tlb_remove_table(struct mmu_gather *tlb, void *table) { struct mmu_table_batch **batch = &tlb->batch; if (*batch == NULL) { *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT); if (*batch == NULL) { tlb_table_invalidate(tlb); tlb_remove_table_one(table); return; } (*batch)->nr = 0; } (*batch)->tables[(*batch)->nr++] = table; if ((*batch)->nr == MAX_TABLE_BATCH) tlb_table_flush(tlb); } static inline void tlb_table_init(struct mmu_gather *tlb) { tlb->batch = NULL; } #else /* !CONFIG_MMU_GATHER_TABLE_FREE */ static inline void tlb_table_flush(struct mmu_gather *tlb) { } static inline void tlb_table_init(struct mmu_gather *tlb) { } #endif /* CONFIG_MMU_GATHER_TABLE_FREE */ static void tlb_flush_mmu_free(struct mmu_gather *tlb) { tlb_table_flush(tlb); #ifndef CONFIG_MMU_GATHER_NO_GATHER tlb_batch_pages_flush(tlb); #endif } void tlb_flush_mmu(struct mmu_gather *tlb) { tlb_flush_mmu_tlbonly(tlb); tlb_flush_mmu_free(tlb); } static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) { tlb->mm = mm; tlb->fullmm = fullmm; #ifndef CONFIG_MMU_GATHER_NO_GATHER tlb->need_flush_all = 0; tlb->local.next = NULL; tlb->local.nr = 0; tlb->local.max = ARRAY_SIZE(tlb->__pages); tlb->active = &tlb->local; tlb->batch_count = 0; #endif tlb->delayed_rmap = 0; tlb_table_init(tlb); #ifdef CONFIG_MMU_GATHER_PAGE_SIZE tlb->page_size = 0; #endif tlb->vma_pfn = 0; tlb->fully_unshared_tables = 0; __tlb_reset_range(tlb); inc_tlb_flush_pending(tlb->mm); } /** * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down * @tlb: the mmu_gather structure to initialize * @mm: the mm_struct of the target address space * * Called to initialize an (on-stack) mmu_gather structure for page-table * tear-down from @mm. 
*/ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm) { __tlb_gather_mmu(tlb, mm, false); } /** * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down * @tlb: the mmu_gather structure to initialize * @mm: the mm_struct of the target address space * * In this case, @mm is without users and we're going to destroy the * full address space (exit/execve). * * Called to initialize an (on-stack) mmu_gather structure for page-table * tear-down from @mm. */ void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm) { __tlb_gather_mmu(tlb, mm, true); } /** * tlb_gather_mmu_vma - initialize an mmu_gather structure for operating on a * single VMA * @tlb: the mmu_gather structure to initialize * @vma: the vm_area_struct * * Called to initialize an (on-stack) mmu_gather structure for operating on * a single VMA. In contrast to tlb_gather_mmu(), calling this function will * not require another call to tlb_start_vma(). In contrast to tlb_start_vma(), * this function will *not* call flush_cache_range(). * * For hugetlb VMAs, this function will also initialize the mmu_gather * page_size accordingly, not requiring a separate call to * tlb_change_page_size(). * */ void tlb_gather_mmu_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { tlb_gather_mmu(tlb, vma->vm_mm); tlb_update_vma_flags(tlb, vma); if (is_vm_hugetlb_page(vma)) /* All entries have the same size. */ tlb_change_page_size(tlb, huge_page_size(hstate_vma(vma))); } /** * tlb_finish_mmu - finish an mmu_gather structure * @tlb: the mmu_gather structure to finish * * Called at the end of the shootdown operation to free up any resources that * were required. */ void tlb_finish_mmu(struct mmu_gather *tlb) { /* * We expect an earlier huge_pmd_unshare_flush() call to sort this out, * due to complicated locking requirements with page table unsharing. */ VM_WARN_ON_ONCE(tlb->fully_unshared_tables); /* * If parallel threads are doing PTE changes on the same range under a * non-exclusive lock (e.g., mmap_lock read-side) but defer the TLB * flush by batching, one thread may end up seeing inconsistent PTEs * and be left with stale TLB entries. So flush the TLB forcefully * if we detect parallel PTE batching threads. * * However, some syscalls, e.g. munmap(), may free page tables; this * needs a forced flush of everything in the given range. Otherwise * this may result in stale TLB entries on architectures, e.g. aarch64, * that can specify which TLB level to flush. */ if (mm_tlb_flush_nested(tlb->mm)) { /* * aarch64 yields better performance with fullmm by * avoiding multiple CPUs spamming TLBI messages at the * same time. * * On x86, non-fullmm doesn't make a significant difference * compared to fullmm. */ tlb->fullmm = 1; __tlb_reset_range(tlb); tlb->freed_tables = 1; } tlb_flush_mmu(tlb); #ifndef CONFIG_MMU_GATHER_NO_GATHER tlb_batch_list_free(tlb); #endif dec_tlb_flush_pending(tlb->mm); }
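/*
 * Illustrative usage sketch (hypothetical code, not part of this file): the
 * typical mmu_gather life cycle as seen from a made-up my_zap_page() helper.
 * Only tlb_gather_mmu(), tlb_remove_page() and tlb_finish_mmu() are the real
 * API implemented above and in <asm-generic/tlb.h>; the PTE tear-down in the
 * middle is elided.
 */
static void my_zap_page(struct mm_struct *mm, struct page *page)
{
	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, mm);	/* start batching for this mm */

	/*
	 * ... clear the PTE(s) mapping 'page' here, then queue the page so
	 * that it is only freed after the TLBs have been flushed ...
	 */
	tlb_remove_page(&tlb, page);

	tlb_finish_mmu(&tlb);		/* flush TLBs, free batched pages */
}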
// SPDX-License-Identifier: GPL-2.0 /* * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner <tglx@kernel.org> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * * High-resolution kernel timers * * In contrast to the low-resolution timeout API, aka timer wheel, * hrtimers provide finer resolution and accuracy depending on system * configuration and capabilities. * * Started by: Thomas Gleixner and Ingo Molnar * * Credits: * Based on the original timer wheel code * * Help, testing, suggestions, bugfixes, improvements were * provided by: * * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel * et al. */ #include <linux/cpu.h> #include <linux/export.h> #include <linux/percpu.h> #include <linux/hrtimer.h> #include <linux/notifier.h> #include <linux/syscalls.h> #include <linux/interrupt.h> #include <linux/tick.h> #include <linux/err.h> #include <linux/debugobjects.h> #include <linux/sched/signal.h> #include <linux/sched/sysctl.h> #include <linux/sched/rt.h> #include <linux/sched/deadline.h> #include <linux/sched/nohz.h> #include <linux/sched/debug.h> #include <linux/sched/isolation.h> #include <linux/timer.h> #include <linux/freezer.h> #include <linux/compat.h> #include <linux/uaccess.h> #include <trace/events/timer.h> #include "tick-internal.h" /* * Constants to set the queued state of the timer (INACTIVE, ENQUEUED) * * The callback state is kept separate in the CPU base because having it in * the timer would require touching the timer after the callback, which * makes it impossible to free the timer from the callback function. * * Therefore we track the callback state in: * * timer->base->cpu_base->running == timer * * On SMP it is possible to have a "callback function running and enqueued" * status. It happens for example when a posix timer expired and the callback * queued a signal. Between dropping the lock which protects the posix timer * and reacquiring the base lock of the hrtimer, another CPU can deliver the * signal and rearm the timer. * * All state transitions are protected by cpu_base->lock. */ #define HRTIMER_STATE_INACTIVE false #define HRTIMER_STATE_ENQUEUED true /* * The resolution of the clocks. The resolution value is returned in * the clock_getres() system call to give application programmers an * idea of the (in)accuracy of timers. Timer values are rounded up to * this resolution.
*/ #define HIGH_RES_NSEC 1 /* * Masks for selecting the soft and hard context timers from * cpu_base->active */ #define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT) #define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1) #define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) static void retrigger_next_event(void *arg); static ktime_t __hrtimer_cb_get_time(clockid_t clock_id); /* * The timer bases: * * There are more clockids than hrtimer bases. Thus, we index * into the timer bases by the hrtimer_base_type enum. When trying * to reach a base using a clockid, hrtimer_clockid_to_base() * is used to convert from clockid to the proper hrtimer_base_type. */ #define BASE_INIT(idx, cid) \ [idx] = { .index = idx, .clockid = cid } DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), .clock_base = { BASE_INIT(HRTIMER_BASE_MONOTONIC, CLOCK_MONOTONIC), BASE_INIT(HRTIMER_BASE_REALTIME, CLOCK_REALTIME), BASE_INIT(HRTIMER_BASE_BOOTTIME, CLOCK_BOOTTIME), BASE_INIT(HRTIMER_BASE_TAI, CLOCK_TAI), BASE_INIT(HRTIMER_BASE_MONOTONIC_SOFT, CLOCK_MONOTONIC), BASE_INIT(HRTIMER_BASE_REALTIME_SOFT, CLOCK_REALTIME), BASE_INIT(HRTIMER_BASE_BOOTTIME_SOFT, CLOCK_BOOTTIME), BASE_INIT(HRTIMER_BASE_TAI_SOFT, CLOCK_TAI), }, .csd = CSD_INIT(retrigger_next_event, NULL) }; static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) { if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) return true; else return likely(base->online); } #ifdef CONFIG_HIGH_RES_TIMERS DEFINE_STATIC_KEY_FALSE(hrtimer_highres_enabled_key); static void hrtimer_hres_workfn(struct work_struct *work) { static_branch_enable(&hrtimer_highres_enabled_key); } static DECLARE_WORK(hrtimer_hres_work, hrtimer_hres_workfn); static inline void hrtimer_schedule_hres_work(void) { if (!hrtimer_highres_enabled()) schedule_work(&hrtimer_hres_work); } #else static inline void hrtimer_schedule_hres_work(void) { } #endif /* * Functions and macros which are different for UP/SMP systems are kept in a * single place */ #ifdef CONFIG_SMP /* * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base() * such that hrtimer_callback_running() can unconditionally dereference * timer->base->cpu_base */ static struct hrtimer_cpu_base migration_cpu_base = { .clock_base = { [0] = { .cpu_base = &migration_cpu_base, .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, &migration_cpu_base.lock), }, }, }; #define migration_base migration_cpu_base.clock_base[0] /* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock * means that all timers which are tied to this base via timer->base are * locked, and the base itself is locked too. * * So __run_timers/migrate_timers can safely modify all timers which could * be found on the lists/queues. * * When the timer's base is locked, and the timer removed from list, it is * possible to set timer->base = &migration_base and drop the lock: the timer * remains locked. 
*/ static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) __acquires(&timer->base->lock) { for (;;) { struct hrtimer_clock_base *base = READ_ONCE(timer->base); if (likely(base != &migration_base)) { raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) return base; /* The timer has migrated to another CPU: */ raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags); } cpu_relax(); } } /* * Check if the elected target is suitable considering its next * event and the hotplug state of the current CPU. * * If the elected target is remote and its next event is after the timer * to queue, then a remote reprogram is necessary. However there is no * guarantee the IPI handling the operation would arrive in time to meet * the high resolution deadline. In this case the local CPU becomes a * preferred target, unless it is offline. * * High and low resolution modes are handled the same way for simplicity. * * Called with cpu_base->lock of target cpu held. */ static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base, struct hrtimer_cpu_base *new_cpu_base, struct hrtimer_cpu_base *this_cpu_base) { ktime_t expires; /* * The local CPU clockevent can be reprogrammed. Also get_target_base() * guarantees it is online. */ if (new_cpu_base == this_cpu_base) return true; /* * The offline local CPU can't be the default target if the * next remote target event is after this timer. Keep the * elected new base. An IPI will be issued to reprogram * it as a last resort. */ if (!hrtimer_base_is_online(this_cpu_base)) return true; expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); return expires >= new_base->cpu_base->expires_next; } static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, bool pinned) { if (!hrtimer_base_is_online(base)) { int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER)); return &per_cpu(hrtimer_bases, cpu); } #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) if (static_branch_likely(&timers_migration_enabled) && !pinned) return &per_cpu(hrtimer_bases, get_nohz_timer_target()); #endif return base; } /* * We switch the timer base to a power-optimized selected CPU target, * if: * - NO_HZ_COMMON is enabled * - timer migration is enabled * - the timer callback is not running * - the timer is not the first expiring timer on the new target * * If one of the above requirements is not fulfilled we move the timer * to the current CPU or leave it on the previously assigned CPU if * the timer callback is currently running. */ static inline struct hrtimer_clock_base * switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, bool pinned) { struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base; struct hrtimer_clock_base *new_base; int basenum = base->index; this_cpu_base = this_cpu_ptr(&hrtimer_bases); new_cpu_base = get_target_base(this_cpu_base, pinned); again: new_base = &new_cpu_base->clock_base[basenum]; if (base != new_base) { /* * We are trying to move timer to new_base. However we can't * change timer's base while it is running, so we keep it on * the same CPU. No hassle vs. reprogramming the event source * in the high resolution case. The remote CPU will take care * of this when the timer function has completed. There is no * conflict as we hold the lock until the timer is enqueued. 
*/ if (unlikely(hrtimer_callback_running(timer))) return base; /* See the comment in lock_hrtimer_base() */ WRITE_ONCE(timer->base, &migration_base); raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; WRITE_ONCE(timer->base, base); goto again; } WRITE_ONCE(timer->base, new_base); } else { if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) { new_cpu_base = this_cpu_base; goto again; } } return new_base; } #else /* CONFIG_SMP */ static inline struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) __acquires(&timer->base->cpu_base->lock) { struct hrtimer_clock_base *base = timer->base; raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); return base; } # define switch_hrtimer_base(t, b, p) (b) #endif /* !CONFIG_SMP */ /* * Functions for the union type storage format of ktime_t which are * too large for inlining: */ #if BITS_PER_LONG < 64 /* * Divide a ktime value by a nanosecond value */ s64 __ktime_divns(const ktime_t kt, s64 div) { int sft = 0; s64 dclc; u64 tmp; dclc = ktime_to_ns(kt); tmp = dclc < 0 ? -dclc : dclc; /* Make sure the divisor is less than 2^32: */ while (div >> 32) { sft++; div >>= 1; } tmp >>= sft; do_div(tmp, (u32) div); return dclc < 0 ? -tmp : tmp; } EXPORT_SYMBOL_GPL(__ktime_divns); #endif /* BITS_PER_LONG < 64 */ /* * Add two ktime values and do a safety check for overflow: */ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) { ktime_t res = ktime_add_unsafe(lhs, rhs); /* * We use KTIME_SEC_MAX here, the maximum timeout which we can * return to user space in a timespec: */ if (res < 0 || res < lhs || res < rhs) res = ktime_set(KTIME_SEC_MAX, 0); return res; } EXPORT_SYMBOL_GPL(ktime_add_safe); #ifdef CONFIG_DEBUG_OBJECTS_TIMERS static const struct debug_obj_descr hrtimer_debug_descr; static void *hrtimer_debug_hint(void *addr) { return ACCESS_PRIVATE((struct hrtimer *)addr, function); } /* * fixup_init is called when: * - an active object is initialized */ static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_init(timer, &hrtimer_debug_descr); return true; default: return false; } } /* * fixup_activate is called when: * - an active object is activated * - an unknown non-static object is activated */ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) { switch (state) { case ODEBUG_STATE_ACTIVE: WARN_ON(1); fallthrough; default: return false; } } /* * fixup_free is called when: * - an active object is freed */ static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_free(timer, &hrtimer_debug_descr); return true; default: return false; } } /* Stub timer callback for improperly used timers. 
*/ static enum hrtimer_restart stub_timer(struct hrtimer *unused) { WARN_ON_ONCE(1); return HRTIMER_NORESTART; } /* * hrtimer_fixup_assert_init is called when: * - an untracked/uninit-ed object is found */ static bool hrtimer_fixup_assert_init(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; switch (state) { case ODEBUG_STATE_NOTAVAILABLE: hrtimer_setup(timer, stub_timer, CLOCK_MONOTONIC, 0); return true; default: return false; } } static const struct debug_obj_descr hrtimer_debug_descr = { .name = "hrtimer", .debug_hint = hrtimer_debug_hint, .fixup_init = hrtimer_fixup_init, .fixup_activate = hrtimer_fixup_activate, .fixup_free = hrtimer_fixup_free, .fixup_assert_init = hrtimer_fixup_assert_init, }; static inline void debug_hrtimer_init(struct hrtimer *timer) { debug_object_init(timer, &hrtimer_debug_descr); } static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { debug_object_init_on_stack(timer, &hrtimer_debug_descr); } static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { debug_object_activate(timer, &hrtimer_debug_descr); } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { debug_object_deactivate(timer, &hrtimer_debug_descr); } static inline void debug_hrtimer_assert_init(struct hrtimer *timer) { debug_object_assert_init(timer, &hrtimer_debug_descr); } void destroy_hrtimer_on_stack(struct hrtimer *timer) { debug_object_free(timer, &hrtimer_debug_descr); } EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); #else static inline void debug_hrtimer_init(struct hrtimer *timer) { } static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { } static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } static inline void debug_hrtimer_assert_init(struct hrtimer *timer) { } #endif static inline void debug_setup(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) { debug_hrtimer_init(timer); trace_hrtimer_setup(timer, clockid, mode); } static inline void debug_setup_on_stack(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) { debug_hrtimer_init_on_stack(timer); trace_hrtimer_setup(timer, clockid, mode); } static inline void debug_activate(struct hrtimer *timer, enum hrtimer_mode mode, bool was_armed) { debug_hrtimer_activate(timer, mode); trace_hrtimer_start(timer, mode, was_armed); } #define for_each_active_base(base, cpu_base, active) \ for (unsigned int idx = ffs(active); idx--; idx = ffs((active))) \ for (bool done = false; !done; active &= ~(1U << idx)) \ for (base = &cpu_base->clock_base[idx]; !done; done = true) #define hrtimer_from_timerqueue_node(_n) container_of_const(_n, struct hrtimer, node) #if defined(CONFIG_NO_HZ_COMMON) /* * Same as hrtimer_bases_next_event() below, but skips the excluded timer and * does not update cpu_base->next_timer/expires. */ static ktime_t hrtimer_bases_next_event_without(struct hrtimer_cpu_base *cpu_base, const struct hrtimer *exclude, unsigned int active, ktime_t expires_next) { struct hrtimer_clock_base *base; ktime_t expires; lockdep_assert_held(&cpu_base->lock); for_each_active_base(base, cpu_base, active) { expires = ktime_sub(base->expires_next, base->offset); if (expires >= expires_next) continue; /* * If the excluded timer is the first on this base evaluate the * next timer. 
*/ struct timerqueue_linked_node *node = timerqueue_linked_first(&base->active); if (unlikely(&exclude->node == node)) { node = timerqueue_linked_next(node); if (!node) continue; expires = ktime_sub(node->expires, base->offset); if (expires >= expires_next) continue; } expires_next = expires; } /* If base->offset changed, the result might be negative */ return max(expires_next, 0); } #endif static __always_inline struct hrtimer *clock_base_next_timer(struct hrtimer_clock_base *base) { struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active); return hrtimer_from_timerqueue_node(next); } /* Find the base with the earliest expiry */ static void hrtimer_bases_first(struct hrtimer_cpu_base *cpu_base,unsigned int active, ktime_t *expires_next, struct hrtimer **next_timer) { struct hrtimer_clock_base *base; ktime_t expires; for_each_active_base(base, cpu_base, active) { expires = ktime_sub(base->expires_next, base->offset); if (expires < *expires_next) { *expires_next = expires; *next_timer = clock_base_next_timer(base); } } } /* * Recomputes cpu_base::*next_timer and returns the earliest expires_next * but does not set cpu_base::*expires_next, that is done by * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating * cpu_base::*expires_next right away, reprogramming logic would no longer * work. * * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases, * those timers will get run whenever the softirq gets handled, at the end of * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases. * * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases. * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD. * * @active_mask must be one of: * - HRTIMER_ACTIVE_ALL, * - HRTIMER_ACTIVE_SOFT, or * - HRTIMER_ACTIVE_HARD. */ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask) { struct hrtimer *next_timer = NULL; ktime_t expires_next = KTIME_MAX; unsigned int active; lockdep_assert_held(&cpu_base->lock); if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; if (active) hrtimer_bases_first(cpu_base, active, &expires_next, &next_timer); cpu_base->softirq_next_timer = next_timer; } if (active_mask & HRTIMER_ACTIVE_HARD) { active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; if (active) hrtimer_bases_first(cpu_base, active, &expires_next, &next_timer); cpu_base->next_timer = next_timer; } return max(expires_next, 0); } static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base) { ktime_t expires_next, soft = KTIME_MAX; /* * If the soft interrupt has already been activated, ignore the * soft bases. They will be handled in the already raised soft * interrupt. */ if (!cpu_base->softirq_activated) { soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); /* * Update the soft expiry time. clock_settime() might have * affected it. */ cpu_base->softirq_expires_next = soft; } expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD); /* * If a softirq timer is expiring first, update cpu_base->next_timer * and program the hardware with the soft expiry time. 
*/ if (expires_next > soft) { cpu_base->next_timer = cpu_base->softirq_next_timer; expires_next = soft; } return expires_next; } static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, offs_real, offs_boot, offs_tai); base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai; return now; } /* * Is the high resolution mode active in the CPU base. This cannot use the * static key as the CPUs are switched to high resolution mode * asynchronously. */ static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) { return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? cpu_base->hres_active : 0; } static inline void hrtimer_rearm_event(ktime_t expires_next, bool deferred) { trace_hrtimer_rearm(expires_next, deferred); tick_program_event(expires_next, 1); } static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, struct hrtimer *next_timer, ktime_t expires_next) { cpu_base->expires_next = expires_next; /* * If hres is not active, hardware does not have to be * reprogrammed yet. * * If a hang was detected in the last timer interrupt then we * leave the hang delay active in the hardware. We want the * system to make progress. That also prevents the following * scenario: * T1 expires 50ms from now * T2 expires 5s from now * * T1 is removed, so this code is called and would reprogram * the hardware to 5s from now. Any hrtimer_start after that * will not reprogram the hardware due to hang_detected being * set. So we'd effectively block all timers until the T2 event * fires. */ if (!hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) return; hrtimer_rearm_event(expires_next, false); } /* Reprogram the event source with a evaluation of all clock bases */ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, bool skip_equal) { ktime_t expires_next = hrtimer_update_next_event(cpu_base); if (skip_equal && expires_next == cpu_base->expires_next) return; __hrtimer_reprogram(cpu_base, cpu_base->next_timer, expires_next); } /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS /* High resolution timer enabled ? 
*/ static bool hrtimer_hres_enabled __read_mostly = true; unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; EXPORT_SYMBOL_GPL(hrtimer_resolution); /* Enable / Disable high resolution mode */ static int __init setup_hrtimer_hres(char *str) { return (kstrtobool(str, &hrtimer_hres_enabled) == 0); } __setup("highres=", setup_hrtimer_hres); /* hrtimer_high_res_enabled - query, if the highres mode is enabled */ static inline bool hrtimer_is_hres_enabled(void) { return hrtimer_hres_enabled; } /* Switch to high resolution mode */ static void hrtimer_switch_to_hres(void) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); if (tick_init_highres()) { pr_warn("Could not switch to high resolution mode on CPU %u\n", base->cpu); return; } base->hres_active = true; hrtimer_resolution = HIGH_RES_NSEC; tick_setup_sched_timer(true); /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); hrtimer_schedule_hres_work(); } #else static inline bool hrtimer_is_hres_enabled(void) { return 0; } static inline void hrtimer_switch_to_hres(void) { } #endif /* CONFIG_HIGH_RES_TIMERS */ /* * Retrigger next event is called after clock was set with interrupts * disabled through an SMP function call or directly from low level * resume code. * * This is only invoked when: * - CONFIG_HIGH_RES_TIMERS is enabled. * - CONFIG_NOHZ_COMMON is enabled * * For the other cases this function is empty and because the call sites * are optimized out it vanishes as well, i.e. no need for lots of * #ifdeffery. */ static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); /* * When high resolution mode or nohz is active, then the offsets of * CLOCK_REALTIME/TAI/BOOTTIME have to be updated. Otherwise the * next tick will take care of that. * * If high resolution mode is active then the next expiring timer * must be reevaluated and the clock event device reprogrammed if * necessary. * * In the NOHZ case the update of the offset and the reevaluation * of the next expiring timer is enough. The return from the SMP * function call will take care of the reprogramming in case the * CPU was in a NOHZ idle sleep. * * In periodic low resolution mode, the next softirq expiration * must also be updated. */ guard(raw_spinlock)(&base->lock); hrtimer_update_base(base); if (hrtimer_hres_active(base)) hrtimer_force_reprogram(base, /* skip_equal */ false); else hrtimer_update_next_event(base); } /* * When a timer is enqueued and expires earlier than the already enqueued * timers, we have to check, whether it expires earlier than the timer for * which the clock event device was armed. * * Called with interrupts disabled and base->cpu_base.lock held */ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *base = timer->base; ktime_t expires = hrtimer_get_expires(timer); WARN_ON_ONCE(expires < 0); expires = ktime_sub(expires, base->offset); /* * CLOCK_REALTIME timer might be requested with an absolute * expiry time which is less than base->offset. Set it to 0. */ if (expires < 0) expires = 0; if (timer->is_soft) { /* * soft hrtimer could be started on a remote CPU. In this * case softirq_expires_next needs to be updated on the * remote CPU. The soft hrtimer will not expire before the * first hard hrtimer on the remote CPU - * hrtimer_check_target() prevents this case. 
*/ struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base; if (timer_cpu_base->softirq_activated) return; if (!ktime_before(expires, timer_cpu_base->softirq_expires_next)) return; timer_cpu_base->softirq_next_timer = timer; timer_cpu_base->softirq_expires_next = expires; if (!ktime_before(expires, timer_cpu_base->expires_next) || !reprogram) return; } /* * If the timer is not on the current cpu, we cannot reprogram * the other cpus clock event device. */ if (base->cpu_base != cpu_base) return; if (expires >= cpu_base->expires_next) return; /* If a deferred rearm is pending skip reprogramming the device */ if (cpu_base->deferred_rearm) return; cpu_base->next_timer = timer; __hrtimer_reprogram(cpu_base, timer, expires); } static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int active) { struct hrtimer_clock_base *base; unsigned int seq; ktime_t expires; /* * Update the base offsets unconditionally so the following * checks whether the SMP function call is required works. * * The update is safe even when the remote CPU is in the hrtimer * interrupt or the hrtimer soft interrupt and expiring affected * bases. Either it will see the update before handling a base or * it will see it when it finishes the processing and reevaluates * the next expiring timer. */ seq = cpu_base->clock_was_set_seq; hrtimer_update_base(cpu_base); /* * If the sequence did not change over the update then the * remote CPU already handled it. */ if (seq == cpu_base->clock_was_set_seq) return false; /* If a deferred rearm is pending the remote CPU will take care of it */ if (cpu_base->deferred_rearm) { cpu_base->deferred_needs_update = true; return false; } /* * Walk the affected clock bases and check whether the first expiring * timer in a clock base is moving ahead of the first expiring timer of * @cpu_base. If so, the IPI must be invoked because per CPU clock * event devices cannot be remotely reprogrammed. */ active &= cpu_base->active_bases; for_each_active_base(base, cpu_base, active) { struct timerqueue_linked_node *next; next = timerqueue_linked_first(&base->active); expires = ktime_sub(next->expires, base->offset); if (expires < cpu_base->expires_next) return true; /* Extra check for softirq clock bases */ if (base->index < HRTIMER_BASE_MONOTONIC_SOFT) continue; if (cpu_base->softirq_activated) continue; if (expires < cpu_base->softirq_expires_next) return true; } return false; } /* * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and * CLOCK_BOOTTIME (for late sleep time injection). * * This requires to update the offsets for these clocks * vs. CLOCK_MONOTONIC. When high resolution timers are enabled, then this * also requires to eventually reprogram the per CPU clock event devices * when the change moves an affected timer ahead of the first expiring * timer on that CPU. Obviously remote per CPU clock event devices cannot * be reprogrammed. The other reason why an IPI has to be sent is when the * system is in !HIGH_RES and NOHZ mode. The NOHZ mode updates the offsets * in the tick, which obviously might be stopped, so this has to bring out * the remote CPU which might sleep in idle to get this sorted. 
*/ void clock_was_set(unsigned int bases) { cpumask_var_t mask; if (!hrtimer_highres_enabled() && !tick_nohz_is_active()) goto out_timerfd; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { on_each_cpu(retrigger_next_event, NULL, 1); goto out_timerfd; } /* Avoid interrupting CPUs if possible */ scoped_guard(cpus_read_lock) { int cpu; for_each_online_cpu(cpu) { struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); guard(raw_spinlock_irqsave)(&cpu_base->lock); if (update_needs_ipi(cpu_base, bases)) cpumask_set_cpu(cpu, mask); } scoped_guard(preempt) smp_call_function_many(mask, retrigger_next_event, NULL, 1); } free_cpumask_var(mask); out_timerfd: timerfd_clock_was_set(); } static void clock_was_set_work(struct work_struct *work) { clock_was_set(CLOCK_SET_WALL); } static DECLARE_WORK(hrtimer_work, clock_was_set_work); /* * Called from timekeeping code to reprogram the hrtimer interrupt device * on all cpus and to notify timerfd. */ void clock_was_set_delayed(void) { schedule_work(&hrtimer_work); } /* * Called during resume either directly from via timekeeping_resume() * or in the case of s2idle from tick_unfreeze() to ensure that the * hrtimers are up to date. */ void hrtimers_resume_local(void) { lockdep_assert_irqs_disabled(); /* Retrigger on the local CPU */ retrigger_next_event(NULL); } /* Counterpart to lock_hrtimer_base above */ static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) __releases(&timer->base->cpu_base->lock) { raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); } /** * hrtimer_forward() - forward the timer expiry * @timer: hrtimer to forward * @now: forward past this time * @interval: the interval to forward * * Forward the timer expiry so it will expire in the future. * * .. note:: * This only updates the timer expiry value and does not requeue the timer. * * There is also a variant of this function: hrtimer_forward_now(). * * Context: Can be safely called from the callback function of @timer. If called * from other contexts @timer must neither be enqueued nor running the * callback and the caller needs to take care of serialization. * * Return: The number of overruns are returned. */ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) { ktime_t delta; u64 orun = 1; delta = ktime_sub(now, hrtimer_get_expires(timer)); if (delta < 0) return 0; if (WARN_ON(timer->is_queued)) return 0; if (interval < hrtimer_resolution) interval = hrtimer_resolution; if (unlikely(delta >= interval)) { s64 incr = ktime_to_ns(interval); orun = ktime_divns(delta, incr); hrtimer_add_expires_ns(timer, incr * orun); if (hrtimer_get_expires(timer) > now) return orun; /* * This (and the ktime_add() below) is the * correction for exact: */ orun++; } hrtimer_add_expires(timer, interval); return orun; } EXPORT_SYMBOL_GPL(hrtimer_forward); /* * enqueue_hrtimer - internal function to (re)start a timer * * The timer is inserted in expiry order. Insertion into the * red black tree is O(log(n)). * * Returns true when the new timer is the leftmost timer in the tree. 
*/ static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, enum hrtimer_mode mode, bool was_armed) { lockdep_assert_held(&base->cpu_base->lock); debug_activate(timer, mode, was_armed); WARN_ON_ONCE(!base->cpu_base->online); base->cpu_base->active_bases |= 1 << base->index; /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED); if (!timerqueue_linked_add(&base->active, &timer->node)) return false; base->expires_next = hrtimer_get_expires(timer); return true; } static inline void base_update_next_timer(struct hrtimer_clock_base *base) { struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active); base->expires_next = next ? next->expires : KTIME_MAX; } /* * __remove_hrtimer - internal function to remove a timer * * High resolution timer mode reprograms the clock event device when the * timer is the one which expires next. The caller can disable this by setting * reprogram to zero. This is useful, when the context does a reprogramming * anyway (e.g. timer interrupt) */ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool newstate, bool reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; bool was_first; lockdep_assert_held(&cpu_base->lock); if (!timer->is_queued) return; /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->is_queued, newstate); was_first = !timerqueue_linked_prev(&timer->node); if (!timerqueue_linked_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); /* Nothing to update if this was not the first timer in the base */ if (!was_first) return; base_update_next_timer(base); /* * If reprogram is false don't update cpu_base->next_timer and do not * touch the clock event device. * * This happens when removing the first timer on a remote CPU, which * will be handled by the remote CPU's interrupt. It also happens when * a local timer is removed to be immediately restarted. That's handled * at the call site. */ if (!reprogram || timer != cpu_base->next_timer || timer->is_lazy) return; if (cpu_base->deferred_rearm) cpu_base->deferred_needs_update = true; else hrtimer_force_reprogram(cpu_base, /* skip_equal */ true); } static inline bool remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool newstate) { lockdep_assert_held(&base->cpu_base->lock); if (timer->is_queued) { bool reprogram; debug_hrtimer_deactivate(timer); /* * Remove the timer and force reprogramming when high * resolution mode is active and the timer is on the current * CPU. If we remove a timer on another CPU, reprogramming is * skipped. The interrupt event on this CPU is fired and * reprogramming happens in the interrupt handler. This is a * rare case and less expensive than a smp call. */ reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); __remove_hrtimer(timer, base, newstate, reprogram); return true; } return false; } /* * Update in place has to retrieve the expiry times of the neighbour nodes * if they exist. That is cache line neutral because the dequeue/enqueue * operation is going to need the same cache lines. But there is a big win * when the dequeue/enqueue can be avoided because the RB tree does not * have to be rebalanced twice. 
*/ static inline bool hrtimer_can_update_in_place(struct hrtimer *timer, struct hrtimer_clock_base *base, ktime_t expires) { struct timerqueue_linked_node *next = timerqueue_linked_next(&timer->node); struct timerqueue_linked_node *prev = timerqueue_linked_prev(&timer->node); /* If the new expiry goes behind the next timer, requeue is required */ if (next && expires > next->expires) return false; /* If this is the first timer, update in place */ if (!prev) return true; /* Update in place when it does not go ahead of the previous one */ return expires >= prev->expires; } static inline bool remove_and_enqueue_same_base(struct hrtimer *timer, struct hrtimer_clock_base *base, const enum hrtimer_mode mode, ktime_t expires, u64 delta_ns) { bool was_first = false; /* Remove it from the timer queue if active */ if (timer->is_queued) { was_first = !timerqueue_linked_prev(&timer->node); /* Try to update in place to avoid the de/enqueue dance */ if (hrtimer_can_update_in_place(timer, base, expires)) { hrtimer_set_expires_range_ns(timer, expires, delta_ns); trace_hrtimer_start(timer, mode, true); if (was_first) base->expires_next = expires; return was_first; } debug_hrtimer_deactivate(timer); timerqueue_linked_del(&base->active, &timer->node); } /* Set the new expiry time */ hrtimer_set_expires_range_ns(timer, expires, delta_ns); debug_activate(timer, mode, timer->is_queued); base->cpu_base->active_bases |= 1 << base->index; /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED); /* If it's the first expiring timer now or again, update base */ if (timerqueue_linked_add(&base->active, &timer->node)) { base->expires_next = expires; return true; } if (was_first) base_update_next_timer(base); return false; } static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { #ifdef CONFIG_TIME_LOW_RES /* * CONFIG_TIME_LOW_RES indicates that the system has no way to return * granular time values. For relative timers we add hrtimer_resolution * (i.e. one jiffy) to prevent short timeouts. */ timer->is_rel = mode & HRTIMER_MODE_REL; if (timer->is_rel) tim = ktime_add_safe(tim, hrtimer_resolution); #endif return tim; } static void hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram) { ktime_t expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); /* * Reprogramming needs to be triggered, even if the next soft * hrtimer expires at the same time as the next hard * hrtimer. cpu_base->softirq_expires_next needs to be updated! */ if (expires == KTIME_MAX) return; /* * cpu_base->next_timer is recomputed by __hrtimer_get_next_event() * cpu_base->expires_next is only set by hrtimer_reprogram() */ hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram); } #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) static __always_inline bool hrtimer_prefer_local(bool is_local, bool is_first, bool is_pinned) { if (static_branch_likely(&timers_migration_enabled)) { /* * If it is local and the first expiring timer keep it on the local * CPU to optimize reprogramming of the clockevent device. Also * avoid switch_hrtimer_base() overhead when local and pinned. */ if (!is_local) return false; if (is_first || is_pinned) return true; /* Honour the NOHZ full restrictions */ if (!housekeeping_cpu(smp_processor_id(), HK_TYPE_KERNEL_NOISE)) return false; /* * If the tick is not stopped or need_resched() is set, then * there is no point in moving the timer somewhere else. 
*/ return !tick_nohz_tick_stopped() || need_resched(); } return is_local; } #else static __always_inline bool hrtimer_prefer_local(bool is_local, bool is_first, bool is_pinned) { return is_local; } #endif static inline bool hrtimer_keep_base(struct hrtimer *timer, bool is_local, bool is_first, bool is_pinned) { /* If the timer is running the callback it has to stay on its CPU base. */ if (unlikely(timer->base->running == timer)) return true; return hrtimer_prefer_local(is_local, is_first, is_pinned); } static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode, struct hrtimer_clock_base *base) { struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases); bool is_pinned, first, was_first, keep_base = false; struct hrtimer_cpu_base *cpu_base = base->cpu_base; was_first = cpu_base->next_timer == timer; is_pinned = !!(mode & HRTIMER_MODE_PINNED); /* * Don't keep it local if this enqueue happens on a unplugged CPU * after hrtimer_cpu_dying() has been invoked. */ if (likely(this_cpu_base->online)) { bool is_local = cpu_base == this_cpu_base; keep_base = hrtimer_keep_base(timer, is_local, was_first, is_pinned); } /* Calculate absolute expiry time for relative timers */ if (mode & HRTIMER_MODE_REL) tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid)); /* Compensate for low resolution granularity */ tim = hrtimer_update_lowres(timer, tim, mode); /* * Remove an active timer from the queue. In case it is not queued * on the current CPU, make sure that remove_hrtimer() updates the * remote data correctly. * * If it's on the current CPU and the first expiring timer, then * skip reprogramming, keep the timer local and enforce * reprogramming later if it was the first expiring timer. This * avoids programming the underlying clock event twice (once at * removal and once after enqueue). * * @keep_base is also true if the timer callback is running on a * remote CPU and for local pinned timers. */ if (likely(keep_base)) { first = remove_and_enqueue_same_base(timer, base, mode, tim, delta_ns); } else { /* Keep the ENQUEUED state in case it is queued */ bool was_armed = remove_hrtimer(timer, base, HRTIMER_STATE_ENQUEUED); hrtimer_set_expires_range_ns(timer, tim, delta_ns); /* Switch the timer base, if necessary: */ base = switch_hrtimer_base(timer, base, is_pinned); cpu_base = base->cpu_base; first = enqueue_hrtimer(timer, base, mode, was_armed); } /* If a deferred rearm is pending skip reprogramming the device */ if (cpu_base->deferred_rearm) { cpu_base->deferred_needs_update = true; return false; } if (!was_first || cpu_base != this_cpu_base) { /* * If the current CPU base is online, then the timer is never * queued on a remote CPU if it would be the first expiring * timer there unless the timer callback is currently executed * on the remote CPU. In the latter case the remote CPU will * re-evaluate the first expiring timer after completing the * callbacks. */ if (likely(hrtimer_base_is_online(this_cpu_base))) return first; /* * Timer was enqueued remote because the current base is * already offline. If the timer is the first to expire, * kick the remote CPU to reprogram the clock event. */ if (first) smp_call_function_single_async(cpu_base->cpu, &cpu_base->csd); return false; } /* * Special case for the HRTICK timer. It is frequently rearmed and most * of the time moves the expiry into the future. 
That's expensive in * virtual machines and it's better to take the pointless already armed * interrupt than reprogramming the hardware on every context switch. * * If the new expiry is before the armed time, then reprogramming is * required. */ if (timer->is_lazy) { if (cpu_base->expires_next <= hrtimer_get_expires(timer)) return false; } /* * Timer was the first expiring timer and forced to stay on the * current CPU to avoid reprogramming on removal and enqueue. Force * reprogram the hardware by evaluating the new first expiring * timer. */ hrtimer_force_reprogram(cpu_base, /* skip_equal */ true); return false; } /** * hrtimer_start_range_ns - (re)start an hrtimer * @timer: the timer to be added * @tim: expiry time * @delta_ns: "slack" range for the timer * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); * softirq based mode is considered for debug purpose only! */ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base; unsigned long flags; debug_hrtimer_assert_init(timer); /* * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard * expiry mode because unmarked timers are moved to softirq expiry. */ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); else WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard); base = lock_hrtimer_base(timer, &flags); if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base)) hrtimer_reprogram(timer, true); unlock_hrtimer_base(timer, &flags); } EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); /** * hrtimer_try_to_cancel - try to deactivate a timer * @timer: hrtimer to stop * * Returns: * * * 0 when the timer was not active * * 1 when the timer was active * * -1 when the timer is currently executing the callback function and * cannot be stopped */ int hrtimer_try_to_cancel(struct hrtimer *timer) { struct hrtimer_clock_base *base; unsigned long flags; int ret = -1; /* * Check lockless first. If the timer is not active (neither * enqueued nor running the callback, nothing to do here. The * base lock does not serialize against a concurrent enqueue, * so we can avoid taking it. */ if (!hrtimer_active(timer)) return 0; base = lock_hrtimer_base(timer, &flags); if (!hrtimer_callback_running(timer)) { ret = remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE); if (ret) trace_hrtimer_cancel(timer); } unlock_hrtimer_base(timer, &flags); return ret; } EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); #ifdef CONFIG_PREEMPT_RT static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { spin_lock_init(&base->softirq_expiry_lock); } static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) __acquires(&base->softirq_expiry_lock) { spin_lock(&base->softirq_expiry_lock); } static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) __releases(&base->softirq_expiry_lock) { spin_unlock(&base->softirq_expiry_lock); } /* * The counterpart to hrtimer_cancel_wait_running(). * * If there is a waiter for cpu_base->expiry_lock, then it was waiting for * the timer callback to finish. Drop expiry_lock and reacquire it. That * allows the waiter to acquire the lock and make progress. 
*/ static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, unsigned long flags) { if (atomic_read(&cpu_base->timer_waiters)) { raw_spin_unlock_irqrestore(&cpu_base->lock, flags); spin_unlock(&cpu_base->softirq_expiry_lock); spin_lock(&cpu_base->softirq_expiry_lock); raw_spin_lock_irq(&cpu_base->lock); } } #ifdef CONFIG_SMP static __always_inline bool is_migration_base(struct hrtimer_clock_base *base) { return base == &migration_base; } #else static __always_inline bool is_migration_base(struct hrtimer_clock_base *base) { return false; } #endif /* * This function is called on PREEMPT_RT kernels when the fast path * deletion of a timer failed because the timer callback function was * running. * * This prevents priority inversion: if the soft irq thread is preempted * in the middle of a timer callback, then calling hrtimer_cancel() can * lead to two issues: * * - If the caller is on a remote CPU then it has to spin wait for the timer * handler to complete. This can result in unbound priority inversion. * * - If the caller originates from the task which preempted the timer * handler on the same CPU, then spin waiting for the timer handler to * complete is never going to end. */ void hrtimer_cancel_wait_running(const struct hrtimer *timer) { /* Lockless read. Prevent the compiler from reloading it below */ struct hrtimer_clock_base *base = READ_ONCE(timer->base); /* * Just relax if the timer expires in hard interrupt context or if * it is currently on the migration base. */ if (!timer->is_soft || is_migration_base(base)) { cpu_relax(); return; } /* * Mark the base as contended and grab the expiry lock, which is * held by the softirq across the timer callback. Drop the lock * immediately so the softirq can expire the next timer. In theory * the timer could already be running again, but that's more than * unlikely and just causes another wait loop. */ atomic_inc(&base->cpu_base->timer_waiters); spin_lock_bh(&base->cpu_base->softirq_expiry_lock); atomic_dec(&base->cpu_base->timer_waiters); spin_unlock_bh(&base->cpu_base->softirq_expiry_lock); } #else static inline void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { } static inline void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { } static inline void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { } static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, unsigned long fl) { } #endif /** * hrtimer_cancel - cancel a timer and wait for the handler to finish. 
* @timer: the timer to be cancelled * * Returns: * 0 when the timer was not active * 1 when the timer was active */ int hrtimer_cancel(struct hrtimer *timer) { int ret; do { ret = hrtimer_try_to_cancel(timer); if (ret < 0) hrtimer_cancel_wait_running(timer); } while (ret < 0); return ret; } EXPORT_SYMBOL_GPL(hrtimer_cancel); /** * __hrtimer_get_remaining - get remaining time for the timer * @timer: the timer to read * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y */ ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust) { unsigned long flags; ktime_t rem; lock_hrtimer_base(timer, &flags); if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust) rem = hrtimer_expires_remaining_adjusted(timer); else rem = hrtimer_expires_remaining(timer); unlock_hrtimer_base(timer, &flags); return rem; } EXPORT_SYMBOL_GPL(__hrtimer_get_remaining); #ifdef CONFIG_NO_HZ_COMMON /** * hrtimer_get_next_event - get the time until next expiry event * * Returns the next expiry time or KTIME_MAX if no timer is pending. */ u64 hrtimer_get_next_event(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); u64 expires = KTIME_MAX; guard(raw_spinlock_irqsave)(&cpu_base->lock); if (!hrtimer_hres_active(cpu_base)) expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); return expires; } /** * hrtimer_next_event_without - time until next expiry event w/o one timer * @exclude: timer to exclude * * Returns the next expiry time over all timers except for the @exclude one or * KTIME_MAX if none of them is pending. */ u64 hrtimer_next_event_without(const struct hrtimer *exclude) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); u64 expires = KTIME_MAX; unsigned int active; guard(raw_spinlock_irqsave)(&cpu_base->lock); if (!hrtimer_hres_active(cpu_base)) return expires; active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; if (active && !cpu_base->softirq_activated) expires = hrtimer_bases_next_event_without(cpu_base, exclude, active, KTIME_MAX); active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; if (!active) return expires; return hrtimer_bases_next_event_without(cpu_base, exclude, active, expires); } #endif static inline int hrtimer_clockid_to_base(clockid_t clock_id) { switch (clock_id) { case CLOCK_MONOTONIC: return HRTIMER_BASE_MONOTONIC; case CLOCK_REALTIME: return HRTIMER_BASE_REALTIME; case CLOCK_BOOTTIME: return HRTIMER_BASE_BOOTTIME; case CLOCK_TAI: return HRTIMER_BASE_TAI; default: WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); return HRTIMER_BASE_MONOTONIC; } } static ktime_t __hrtimer_cb_get_time(clockid_t clock_id) { switch (clock_id) { case CLOCK_MONOTONIC: return ktime_get(); case CLOCK_REALTIME: return ktime_get_real(); case CLOCK_BOOTTIME: return ktime_get_boottime(); case CLOCK_TAI: return ktime_get_clocktai(); default: WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); return ktime_get(); } } ktime_t hrtimer_cb_get_time(const struct hrtimer *timer) { return __hrtimer_cb_get_time(timer->base->clockid); } EXPORT_SYMBOL_GPL(hrtimer_cb_get_time); static void __hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*fn)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { bool softtimer = !!(mode & HRTIMER_MODE_SOFT); struct hrtimer_cpu_base *cpu_base; int base; /* * On PREEMPT_RT enabled kernels hrtimers which are not explicitly * marked for hard interrupt expiry mode are moved into soft * interrupt context for latency reasons and because the callbacks * can invoke functions which might sleep on RT, e.g. 
spin_lock(). */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD)) softtimer = true; memset(timer, 0, sizeof(struct hrtimer)); cpu_base = raw_cpu_ptr(&hrtimer_bases); /* * POSIX magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they needs to become CLOCK_MONOTONIC to * ensure POSIX compliance. */ if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL) clock_id = CLOCK_MONOTONIC; base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; base += hrtimer_clockid_to_base(clock_id); timer->is_soft = softtimer; timer->is_hard = !!(mode & HRTIMER_MODE_HARD); timer->is_lazy = !!(mode & HRTIMER_MODE_LAZY_REARM); timer->base = &cpu_base->clock_base[base]; timerqueue_linked_init(&timer->node); if (WARN_ON_ONCE(!fn)) ACCESS_PRIVATE(timer, function) = hrtimer_dummy_timeout; else ACCESS_PRIVATE(timer, function) = fn; } /** * hrtimer_setup - initialize a timer to the given clock * @timer: the timer to be initialized * @function: the callback function * @clock_id: the clock to be used * @mode: The modes which are relevant for initialization: * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, * HRTIMER_MODE_REL_SOFT * * The PINNED variants of the above can be handed in, * but the PINNED bit is ignored as pinning happens * when the hrtimer is started */ void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { debug_setup(timer, clock_id, mode); __hrtimer_setup(timer, function, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup); /** * hrtimer_setup_on_stack - initialize a timer on stack memory * @timer: The timer to be initialized * @function: the callback function * @clock_id: The clock to be used * @mode: The timer mode * * Similar to hrtimer_setup(), except that this one must be used if struct hrtimer is in stack * memory. */ void hrtimer_setup_on_stack(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { debug_setup_on_stack(timer, clock_id, mode); __hrtimer_setup(timer, function, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup_on_stack); /* * A timer is active, when it is enqueued into the rbtree or the * callback function is running or it's in the state of being migrated * to another cpu. * * It is important for this function to not return a false negative. */ bool hrtimer_active(const struct hrtimer *timer) { struct hrtimer_clock_base *base; unsigned int seq; do { base = READ_ONCE(timer->base); seq = raw_read_seqcount_begin(&base->seq); if (timer->is_queued || base->running == timer) return true; } while (read_seqcount_retry(&base->seq, seq) || base != READ_ONCE(timer->base)); return false; } EXPORT_SYMBOL_GPL(hrtimer_active); /* * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3 * distinct sections: * * - queued: the timer is queued * - callback: the timer is being ran * - post: the timer is inactive or (re)queued * * On the read side we ensure we observe timer->is_queued and cpu_base->running * from the same section, if anything changed while we looked at it, we retry. * This includes timer->base changing because sequence numbers alone are * insufficient for that. * * The sequence numbers are required because otherwise we could still observe * a false negative if the read side got smeared over multiple consecutive * __run_hrtimer() invocations. 
*/ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base, struct hrtimer *timer, ktime_t now, unsigned long flags) __must_hold(&cpu_base->lock) { enum hrtimer_restart (*fn)(struct hrtimer *); bool expires_in_hardirq; int restart; lockdep_assert_held(&cpu_base->lock); debug_hrtimer_deactivate(timer); base->running = timer; /* * Separate the ->running assignment from the ->is_queued assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running == NULL && * timer->is_queued == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, false); fn = ACCESS_PRIVATE(timer, function); /* * Clear the 'is relative' flag for the TIME_LOW_RES case. If the * timer is restarted with a period then it becomes an absolute * timer. If its not restarted it does not matter. */ if (IS_ENABLED(CONFIG_TIME_LOW_RES)) timer->is_rel = false; /* * The timer is marked as running in the CPU base, so it is * protected against migration to a different CPU even if the lock * is dropped. */ raw_spin_unlock_irqrestore(&cpu_base->lock, flags); trace_hrtimer_expire_entry(timer, now); expires_in_hardirq = lockdep_hrtimer_enter(timer); restart = fn(timer); lockdep_hrtimer_exit(expires_in_hardirq); trace_hrtimer_expire_exit(timer); raw_spin_lock_irq(&cpu_base->lock); /* * Note: We clear the running state after enqueue_hrtimer and * we do not reprogram the event hardware. Happens either in * hrtimer_start_range_ns() or in hrtimer_interrupt() * * Note: Because we dropped the cpu_base->lock above, * hrtimer_start_range_ns() can have popped in and enqueued the timer * for us already. */ if (restart == HRTIMER_RESTART && !timer->is_queued) enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS, false); /* * Separate the ->running assignment from the ->is_queued assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running.timer == NULL && * timer->is_queued == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); WARN_ON_ONCE(base->running != timer); base->running = NULL; } static __always_inline struct hrtimer *clock_base_next_timer_safe(struct hrtimer_clock_base *base) { struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active); return next ? hrtimer_from_timerqueue_node(next) : NULL; } static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, unsigned long flags, unsigned int active_mask) { unsigned int active = cpu_base->active_bases & active_mask; struct hrtimer_clock_base *base; for_each_active_base(base, cpu_base, active) { ktime_t basenow = ktime_add(now, base->offset); struct hrtimer *timer; while ((timer = clock_base_next_timer(base))) { /* * The immediate goal for using the softexpires is * minimizing wakeups, not running timers at the * earliest interrupt after their soft expiration. * This allows us to avoid using a Priority Search * Tree, which can answer a stabbing query for * overlapping intervals and instead use the simple * BST we already have. * We don't add extra wakeups by delaying timers that * are right-of a not yet expired timer, because that * timer will have to trigger a wakeup anyway. 
*/ if (basenow < hrtimer_get_softexpires(timer)) break; __run_hrtimer(cpu_base, base, timer, basenow, flags); if (active_mask == HRTIMER_ACTIVE_SOFT) hrtimer_sync_wait_running(cpu_base, flags); } } } static __latent_entropy void hrtimer_run_softirq(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); unsigned long flags; ktime_t now; hrtimer_cpu_base_lock_expiry(cpu_base); raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT); cpu_base->softirq_activated = false; hrtimer_update_softirq_timer(cpu_base, true); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); hrtimer_cpu_base_unlock_expiry(cpu_base); } #ifdef CONFIG_HIGH_RES_TIMERS /* * Very similar to hrtimer_force_reprogram(), except it deals with * deferred_rearm and hang_detected. */ static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next, bool deferred) { cpu_base->expires_next = expires_next; cpu_base->deferred_rearm = false; if (unlikely(cpu_base->hang_detected)) { /* * Give the system a chance to do something else than looping * on hrtimer interrupts. */ expires_next = ktime_add_ns(ktime_get(), min(100 * NSEC_PER_MSEC, cpu_base->max_hang_time)); } hrtimer_rearm_event(expires_next, deferred); } #ifdef CONFIG_HRTIMER_REARM_DEFERRED void __hrtimer_rearm_deferred(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires_next; if (!cpu_base->deferred_rearm) return; guard(raw_spinlock)(&cpu_base->lock); if (cpu_base->deferred_needs_update) { hrtimer_update_base(cpu_base); expires_next = hrtimer_update_next_event(cpu_base); } else { /* No timer added/removed. Use the cached value */ expires_next = cpu_base->deferred_expires_next; } hrtimer_rearm(cpu_base, expires_next, true); } static __always_inline void hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next) { /* hrtimer_interrupt() just re-evaluated the first expiring timer */ cpu_base->deferred_needs_update = false; /* Cache the expiry time */ cpu_base->deferred_expires_next = expires_next; set_thread_flag(TIF_HRTIMER_REARM); } #else /* CONFIG_HRTIMER_REARM_DEFERRED */ static __always_inline void hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next) { hrtimer_rearm(cpu_base, expires_next, false); } #endif /* !CONFIG_HRTIMER_REARM_DEFERRED */ /* * High resolution timer interrupt * Called with interrupts disabled */ void hrtimer_interrupt(struct clock_event_device *dev) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires_next, now, entry_time, delta; unsigned long flags; int retries = 0; BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; dev->next_event = KTIME_MAX; dev->next_event_forced = 0; raw_spin_lock_irqsave(&cpu_base->lock, flags); entry_time = now = hrtimer_update_base(cpu_base); retry: cpu_base->deferred_rearm = true; /* * Set expires_next to KTIME_MAX, which prevents that remote CPUs queue * timers while __hrtimer_run_queues() is expiring the clock bases. * Timers which are re/enqueued on the local CPU are not affected by * this. 
 */
	cpu_base->expires_next = KTIME_MAX;

	if (!ktime_before(now, cpu_base->softirq_expires_next)) {
		cpu_base->softirq_expires_next = KTIME_MAX;
		cpu_base->softirq_activated = true;
		raise_timer_softirq(HRTIMER_SOFTIRQ);
	}

	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);

	/*
	 * The next timer was already expired due to:
	 * - tracing
	 * - long lasting callbacks
	 * - being scheduled away when running in a VM
	 *
	 * We must prevent looping forever in the hrtimer interrupt
	 * routine. We give it 3 attempts to avoid overreacting to some
	 * spurious event.
	 */
	now = hrtimer_update_base(cpu_base);
	expires_next = hrtimer_update_next_event(cpu_base);
	cpu_base->hang_detected = false;
	if (expires_next < now) {
		if (++retries < 3)
			goto retry;
		delta = ktime_sub(now, entry_time);
		cpu_base->max_hang_time = max_t(unsigned int, cpu_base->max_hang_time, delta);
		cpu_base->nr_hangs++;
		cpu_base->hang_detected = true;
	}
	hrtimer_interrupt_rearm(cpu_base, expires_next);
	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
}
#endif /* CONFIG_HIGH_RES_TIMERS */

/*
 * Called from run_local_timers in hardirq context every jiffy
 */
void hrtimer_run_queues(void)
{
	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
	unsigned long flags;
	ktime_t now;

	if (hrtimer_hres_active(cpu_base))
		return;

	/*
	 * This _is_ ugly: We have to check periodically whether we
	 * can switch to highres and / or nohz mode. The clocksource
	 * switch happens with xtime_lock held. Notification from
	 * there only sets the check bit in the tick_oneshot code,
	 * otherwise we might deadlock vs. xtime_lock.
	 */
	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) {
		hrtimer_switch_to_hres();
		return;
	}

	raw_spin_lock_irqsave(&cpu_base->lock, flags);
	now = hrtimer_update_base(cpu_base);

	if (!ktime_before(now, cpu_base->softirq_expires_next)) {
		cpu_base->softirq_expires_next = KTIME_MAX;
		cpu_base->softirq_activated = true;
		raise_timer_softirq(HRTIMER_SOFTIRQ);
	}

	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
}

/*
 * Sleep related functions:
 */
static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
{
	struct hrtimer_sleeper *t =
		container_of(timer, struct hrtimer_sleeper, timer);
	struct task_struct *task = t->task;

	t->task = NULL;
	if (task)
		wake_up_process(task);

	return HRTIMER_NORESTART;
}

/**
 * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer
 * @sl:		sleeper to be started
 * @mode:	timer mode abs/rel
 *
 * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers
 * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context)
 */
void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
				   enum hrtimer_mode mode)
{
	/*
	 * Make the enqueue delivery mode check work on RT. If the sleeper
	 * was initialized for hard interrupt delivery, force the mode bit.
	 * This is a special case for hrtimer_sleepers because
	 * __hrtimer_setup_sleeper() determines the delivery mode on RT so the
	 * fiddling with this decision is avoided at the call sites.
	 */
	if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard)
		mode |= HRTIMER_MODE_HARD;

	hrtimer_start_expires(&sl->timer, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires);
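/*
 * Illustrative sketch, not used by this file: how a caller might combine
 * the sleeper API above with schedule() to block until an absolute
 * CLOCK_MONOTONIC expiry. The name example_sleep_until() is hypothetical;
 * the pattern mirrors do_nanosleep() further down.
 */
static int __maybe_unused example_sleep_until(ktime_t abs_expiry)
{
	struct hrtimer_sleeper sl;

	hrtimer_setup_sleeper_on_stack(&sl, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	hrtimer_set_expires(&sl.timer, abs_expiry);

	set_current_state(TASK_INTERRUPTIBLE);
	hrtimer_sleeper_start_expires(&sl, HRTIMER_MODE_ABS);
	if (sl.task)
		schedule();

	hrtimer_cancel(&sl.timer);
	__set_current_state(TASK_RUNNING);
	destroy_hrtimer_on_stack(&sl.timer);

	/* sl.task is cleared by hrtimer_wakeup() once the timer has fired */
	return sl.task ? -EINTR : 0;
}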
static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl,
				    clockid_t clock_id, enum hrtimer_mode mode)
{
	/*
	 * On PREEMPT_RT enabled kernels hrtimers which are not explicitly
	 * marked for hard interrupt expiry mode are moved into soft
	 * interrupt context either for latency reasons or because the
	 * hrtimer callback takes regular spinlocks or invokes other
	 * functions which are not suitable for hard interrupt context on
	 * PREEMPT_RT.
	 *
	 * The hrtimer_sleeper callback is RT compatible in hard interrupt
	 * context, but there is a latency concern: Untrusted userspace can
	 * spawn many threads which arm timers for the same expiry time on
	 * the same CPU. That causes a latency spike due to the wakeup of
	 * a gazillion threads.
	 *
	 * OTOH, privileged real-time user space applications rely on the
	 * low latency of hard interrupt wakeups. If the current task is in
	 * a real-time scheduling class, mark the mode for hard interrupt
	 * expiry.
	 */
	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
		if (rt_or_dl_task_policy(current) && !(mode & HRTIMER_MODE_SOFT))
			mode |= HRTIMER_MODE_HARD;
	}

	__hrtimer_setup(&sl->timer, hrtimer_wakeup, clock_id, mode);
	sl->task = current;
}

/**
 * hrtimer_setup_sleeper_on_stack - initialize a sleeper in stack memory
 * @sl:		sleeper to be initialized
 * @clock_id:	the clock to be used
 * @mode:	timer mode abs/rel
 */
void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl,
				    clockid_t clock_id, enum hrtimer_mode mode)
{
	debug_setup_on_stack(&sl->timer, clock_id, mode);
	__hrtimer_setup_sleeper(sl, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_setup_sleeper_on_stack);

int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
{
	switch (restart->nanosleep.type) {
#ifdef CONFIG_COMPAT_32BIT_TIME
	case TT_COMPAT:
		if (put_old_timespec32(ts, restart->nanosleep.compat_rmtp))
			return -EFAULT;
		break;
#endif
	case TT_NATIVE:
		if (put_timespec64(ts, restart->nanosleep.rmtp))
			return -EFAULT;
		break;
	default:
		BUG();
	}
	return -ERESTART_RESTARTBLOCK;
}

static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
{
	struct restart_block *restart;

	do {
		set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
		hrtimer_sleeper_start_expires(t, mode);

		if (likely(t->task))
			schedule();

		hrtimer_cancel(&t->timer);
		mode = HRTIMER_MODE_ABS;

	} while (t->task && !signal_pending(current));

	__set_current_state(TASK_RUNNING);

	if (!t->task)
		return 0;

	restart = &current->restart_block;
	if (restart->nanosleep.type != TT_NONE) {
		ktime_t rem = hrtimer_expires_remaining(&t->timer);
		struct timespec64 rmt;

		if (rem <= 0)
			return 0;
		rmt = ktime_to_timespec64(rem);

		return nanosleep_copyout(restart, &rmt);
	}
	return -ERESTART_RESTARTBLOCK;
}

static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
{
	struct hrtimer_sleeper t;
	int ret;

	hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS);
	hrtimer_set_expires(&t.timer, restart->nanosleep.expires);
	ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
	destroy_hrtimer_on_stack(&t.timer);
	return ret;
}

long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
		       const clockid_t clockid)
{
	struct restart_block *restart;
	struct hrtimer_sleeper t;
	int ret;

	hrtimer_setup_sleeper_on_stack(&t, clockid, mode);
	hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns);
	ret = do_nanosleep(&t, mode);
	if (ret != -ERESTART_RESTARTBLOCK)
		goto out;

	/* Absolute timers do not update the rmtp value and restart: */
	if (mode == HRTIMER_MODE_ABS) {
		ret = -ERESTARTNOHAND;
		goto out;
	}

	restart = &current->restart_block;
	restart->nanosleep.clockid = t.timer.base->clockid;
	restart->nanosleep.expires = hrtimer_get_expires(&t.timer);
	set_restart_fn(restart, hrtimer_nanosleep_restart);
out:
	destroy_hrtimer_on_stack(&t.timer);
	return ret;
}
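/*
 * Illustrative sketch: in-kernel callers normally do not call
 * hrtimer_nanosleep() directly. They use the schedule_hrtimeout*() family,
 * which is built on the same hrtimer_sleeper mechanism. The name
 * example_wait_quiet() and the 10us slack value are hypothetical.
 */
static int __maybe_unused example_wait_quiet(unsigned int usecs)
{
	ktime_t timeout = ktime_set(0, usecs * NSEC_PER_USEC);

	set_current_state(TASK_INTERRUPTIBLE);
	/* Allow some slack so nearby wakeups can be batched */
	if (!schedule_hrtimeout_range(&timeout, 10 * NSEC_PER_USEC,
				      HRTIMER_MODE_REL))
		return 0;	/* the full wait elapsed */

	return -EINTR;		/* woken early, e.g. by a signal */
}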
#ifdef CONFIG_64BIT

SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
		struct __kernel_timespec __user *, rmtp)
{
	struct timespec64 tu;

	if (get_timespec64(&tu, rqtp))
		return -EFAULT;

	if (!timespec64_valid(&tu))
		return -EINVAL;

	current->restart_block.fn = do_no_restart_syscall;
	current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
	current->restart_block.nanosleep.rmtp = rmtp;
	return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
				 CLOCK_MONOTONIC);
}

#endif

#ifdef CONFIG_COMPAT_32BIT_TIME

SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
		struct old_timespec32 __user *, rmtp)
{
	struct timespec64 tu;

	if (get_old_timespec32(&tu, rqtp))
		return -EFAULT;

	if (!timespec64_valid(&tu))
		return -EINVAL;

	current->restart_block.fn = do_no_restart_syscall;
	current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
	current->restart_block.nanosleep.compat_rmtp = rmtp;
	return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
				 CLOCK_MONOTONIC);
}
#endif

/*
 * Functions related to boot-time initialization:
 */
int hrtimers_prepare_cpu(unsigned int cpu)
{
	struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);

	for (int i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
		struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i];

		clock_b->cpu_base = cpu_base;
		seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock);
		timerqueue_linked_init_head(&clock_b->active);
	}

	cpu_base->cpu = cpu;
	hrtimer_cpu_base_init_expiry_lock(cpu_base);
	return 0;
}

int hrtimers_cpu_starting(unsigned int cpu)
{
	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);

	/* Clear out any left over state from a CPU down operation */
	cpu_base->active_bases = 0;
	cpu_base->hres_active = false;
	cpu_base->hang_detected = false;
	cpu_base->next_timer = NULL;
	cpu_base->softirq_next_timer = NULL;
	cpu_base->expires_next = KTIME_MAX;
	cpu_base->softirq_expires_next = KTIME_MAX;
	cpu_base->softirq_activated = false;
	cpu_base->online = true;
	return 0;
}
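/*
 * Illustrative sketch: per-CPU prepare/online callbacks with the signature
 * used above are commonly wired into the CPU hotplug state machine as shown
 * here in other subsystems. The state name string and the example_cpu_*()
 * callbacks are hypothetical; hrtimers itself is hooked up through the
 * static hotplug tables in the hotplug core, not through this call.
 */
static int example_cpu_online(unsigned int cpu) { return 0; }
static int example_cpu_offline(unsigned int cpu) { return 0; }

static int __maybe_unused example_register_hotplug(void)
{
	int ret;

	/* Startup handler runs on each CPU coming up, teardown on the way down */
	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "example/timers:online",
				example_cpu_online, example_cpu_offline);
	return ret < 0 ? ret : 0;
}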
*/ enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS, true); } } int hrtimers_cpu_dying(unsigned int dying_cpu) { int ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER)); struct hrtimer_cpu_base *old_base, *new_base; old_base = this_cpu_ptr(&hrtimer_bases); new_base = &per_cpu(hrtimer_bases, ncpu); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ raw_spin_lock(&old_base->lock); raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING); for (int i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]); /* Tell the other CPU to retrigger the next event */ smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); raw_spin_unlock(&new_base->lock); old_base->online = false; raw_spin_unlock(&old_base->lock); return 0; } #endif /* CONFIG_HOTPLUG_CPU */ void __init hrtimers_init(void) { hrtimers_prepare_cpu(smp_processor_id()); hrtimers_cpu_starting(smp_processor_id()); open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq); } |
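/*
 * Editor's illustrative sketch -- not part of the kernel sources above or
 * below. It shows how the hrtimer_sleeper helpers defined above are meant to
 * be combined from process context: set up a sleeper on the stack, arm it,
 * schedule, then cancel and destroy it, mirroring the sequence used by
 * do_nanosleep(). The function name example_sleep_until() and its return
 * convention (0 if the timer expired, -EINTR otherwise) are hypothetical.
 */
static int example_sleep_until(ktime_t expires)
{
	struct hrtimer_sleeper sl;
	bool expired;

	hrtimer_setup_sleeper_on_stack(&sl, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	hrtimer_set_expires(&sl.timer, expires);

	set_current_state(TASK_INTERRUPTIBLE);
	hrtimer_sleeper_start_expires(&sl, HRTIMER_MODE_ABS);

	/* hrtimer_wakeup() clears sl.task once the timer has fired */
	if (likely(sl.task))
		schedule();

	hrtimer_cancel(&sl.timer);
	__set_current_state(TASK_RUNNING);

	expired = !sl.task;
	destroy_hrtimer_on_stack(&sl.timer);

	return expired ? 0 : -EINTR;
}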
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright 2019 Google LLC
 */
#ifndef __LINUX_BLK_CRYPTO_H
#define __LINUX_BLK_CRYPTO_H

#include <linux/minmax.h>
#include <linux/types.h>
#include <uapi/linux/blk-crypto.h>

enum blk_crypto_mode_num {
	BLK_ENCRYPTION_MODE_INVALID,
	BLK_ENCRYPTION_MODE_AES_256_XTS,
	BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV,
	BLK_ENCRYPTION_MODE_ADIANTUM,
	BLK_ENCRYPTION_MODE_SM4_XTS,
	BLK_ENCRYPTION_MODE_MAX,
};

/*
 * Supported types of keys. Must be bitflags due to their use in
 * blk_crypto_profile::key_types_supported.
 */
enum blk_crypto_key_type {
	/*
	 * Raw keys (i.e. "software keys"). These keys are simply kept in raw,
	 * plaintext form in kernel memory.
	 */
	BLK_CRYPTO_KEY_TYPE_RAW = 0x1,

	/*
	 * Hardware-wrapped keys. These keys are only present in kernel memory
	 * in ephemerally-wrapped form, and they can only be unwrapped by
	 * dedicated hardware. For details, see the "Hardware-wrapped keys"
	 * section of Documentation/block/inline-encryption.rst.
	 */
	BLK_CRYPTO_KEY_TYPE_HW_WRAPPED = 0x2,
};

/*
 * Currently the maximum raw key size is 64 bytes, as that is the key size of
 * BLK_ENCRYPTION_MODE_AES_256_XTS which takes the longest key.
 *
 * The maximum hardware-wrapped key size depends on the hardware's key wrapping
 * algorithm, which is a hardware implementation detail, so it isn't precisely
 * specified. But currently 128 bytes is plenty in practice. Implementations
 * are recommended to wrap a 32-byte key for the hardware KDF with AES-256-GCM,
 * which should result in a size closer to 64 bytes than 128.
 *
 * Both of these values can trivially be increased if ever needed.
 */
#define BLK_CRYPTO_MAX_RAW_KEY_SIZE		64
#define BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE	128

#define BLK_CRYPTO_MAX_ANY_KEY_SIZE \
	MAX(BLK_CRYPTO_MAX_RAW_KEY_SIZE, BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE)

/*
 * Size of the "software secret" which can be derived from a hardware-wrapped
 * key. This is currently always 32 bytes. Note, the choice of 32 bytes
 * assumes that the software secret is only used directly for algorithms that
 * don't require more than a 256-bit key to get the desired security strength.
 * If it were to be used e.g. directly as an AES-256-XTS key, then this would
 * need to be increased (which is possible if hardware supports it, but care
 * would need to be taken to avoid breaking users who need exactly 32 bytes).
 */
#define BLK_CRYPTO_SW_SECRET_SIZE	32

/**
 * struct blk_crypto_config - an inline encryption key's crypto configuration
 * @crypto_mode: encryption algorithm this key is for
 * @data_unit_size: the data unit size for all encryption/decryptions with this
 *	key.  This is the size in bytes of each individual plaintext and
 *	ciphertext.  This is always a power of 2.  It might be e.g.
the * filesystem block size or the disk sector size. * @dun_bytes: the maximum number of bytes of DUN used when using this key * @key_type: the type of this key -- either raw or hardware-wrapped */ struct blk_crypto_config { enum blk_crypto_mode_num crypto_mode; unsigned int data_unit_size; unsigned int dun_bytes; enum blk_crypto_key_type key_type; }; /** * struct blk_crypto_key - an inline encryption key * @crypto_cfg: the crypto mode, data unit size, key type, and other * characteristics of this key and how it will be used * @data_unit_size_bits: log2 of data_unit_size * @size: size of this key in bytes. The size of a raw key is fixed for a given * crypto mode, but the size of a hardware-wrapped key can vary. * @bytes: the bytes of this key. Only the first @size bytes are significant. * * A blk_crypto_key is immutable once created, and many bios can reference it at * the same time. It must not be freed until all bios using it have completed * and it has been evicted from all devices on which it may have been used. */ struct blk_crypto_key { struct blk_crypto_config crypto_cfg; unsigned int data_unit_size_bits; unsigned int size; u8 bytes[BLK_CRYPTO_MAX_ANY_KEY_SIZE]; }; #define BLK_CRYPTO_MAX_IV_SIZE 32 #define BLK_CRYPTO_DUN_ARRAY_SIZE (BLK_CRYPTO_MAX_IV_SIZE / sizeof(u64)) /** * struct bio_crypt_ctx - an inline encryption context * @bc_key: the key, algorithm, and data unit size to use * @bc_dun: the data unit number (starting IV) to use * * A bio_crypt_ctx specifies that the contents of the bio will be encrypted (for * write requests) or decrypted (for read requests) inline by the storage device * or controller, or by the crypto API fallback. */ struct bio_crypt_ctx { const struct blk_crypto_key *bc_key; u64 bc_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; }; #include <linux/blk_types.h> #include <linux/blkdev.h> #ifdef CONFIG_BLK_INLINE_ENCRYPTION static inline bool bio_has_crypt_ctx(struct bio *bio) { return bio->bi_crypt_context; } static inline struct bio_crypt_ctx *bio_crypt_ctx(struct bio *bio) { return bio->bi_crypt_context; } void bio_crypt_set_ctx(struct bio *bio, const struct blk_crypto_key *key, const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], gfp_t gfp_mask); bool bio_crypt_dun_is_contiguous(const struct bio_crypt_ctx *bc, unsigned int bytes, const u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]); int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *key_bytes, size_t key_size, enum blk_crypto_key_type key_type, enum blk_crypto_mode_num crypto_mode, unsigned int dun_bytes, unsigned int data_unit_size); int blk_crypto_start_using_key(struct block_device *bdev, const struct blk_crypto_key *key); void blk_crypto_evict_key(struct block_device *bdev, const struct blk_crypto_key *key); bool blk_crypto_config_supported_natively(struct block_device *bdev, const struct blk_crypto_config *cfg); bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg); int blk_crypto_derive_sw_secret(struct block_device *bdev, const u8 *eph_key, size_t eph_key_size, u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]); #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline bool bio_has_crypt_ctx(struct bio *bio) { return false; } static inline struct bio_crypt_ctx *bio_crypt_ctx(struct bio *bio) { return NULL; } #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ bool __blk_crypto_submit_bio(struct bio *bio); /** * blk_crypto_submit_bio - Submit a bio that may have a crypto context * @bio: bio to submit * * If @bio has no crypto context, or the crypt context attached to @bio is * supported by the 
underlying device's inline encryption hardware, just submit * @bio. * * Otherwise, try to perform en/decryption for this bio by falling back to the * kernel crypto API. For encryption this means submitting newly allocated * bios for the encrypted payload while keeping back the source bio until they * complete, while for reads the decryption happens in-place by a hooked in * completion handler. */ static inline void blk_crypto_submit_bio(struct bio *bio) { if (!bio_has_crypt_ctx(bio) || __blk_crypto_submit_bio(bio)) submit_bio(bio); } int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask); /** * bio_crypt_clone - clone bio encryption context * @dst: destination bio * @src: source bio * @gfp_mask: memory allocation flags * * If @src has an encryption context, clone it to @dst. * * Return: 0 on success, -ENOMEM if out of memory. -ENOMEM is only possible if * @gfp_mask doesn't include %__GFP_DIRECT_RECLAIM. */ static inline int bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask) { if (bio_has_crypt_ctx(src)) return __bio_crypt_clone(dst, src, gfp_mask); return 0; } #endif /* __LINUX_BLK_CRYPTO_H */ |
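/*
 * Editor's illustrative sketch -- not part of the header above or the file
 * below. It walks the typical call sequence a user of this inline encryption
 * API follows: initialize a raw key, declare intent to use it on a device,
 * then attach an encryption context to a bio before submission. It assumes
 * CONFIG_BLK_INLINE_ENCRYPTION; the function names prefixed with "example_",
 * the 4096-byte data unit size and 8-byte DUN width are hypothetical choices.
 * The caller owns @key, must keep it alive until all bios using it complete,
 * and should eventually call blk_crypto_evict_key().
 */
static int example_start_inline_crypt(struct block_device *bdev,
				      struct blk_crypto_key *key,
				      const u8 *raw_key, size_t raw_key_size)
{
	int err;

	/* 4096-byte data units, 8-byte DUNs, AES-256-XTS, raw (software) key */
	err = blk_crypto_init_key(key, raw_key, raw_key_size,
				  BLK_CRYPTO_KEY_TYPE_RAW,
				  BLK_ENCRYPTION_MODE_AES_256_XTS, 8, 4096);
	if (err)
		return err;

	/* Ensure either the hardware or the crypto API fallback can serve it */
	return blk_crypto_start_using_key(bdev, key);
}

static void example_encrypt_bio(struct bio *bio,
				const struct blk_crypto_key *key, u64 dun)
{
	u64 dun_array[BLK_CRYPTO_DUN_ARRAY_SIZE] = { dun };

	/* Attach the context; the bio is then submitted through the usual path */
	bio_crypt_set_ctx(bio, key, dun_array, GFP_NOIO);
}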
// SPDX-License-Identifier: GPL-2.0
/*
 * xfrm_input.c
 *
 * Changes:
 *	YOSHIFUJI Hideaki @USAGI
 *		Split up af-specific portion
 *
 */

#include <linux/bottom_half.h>
#include <linux/cache.h>
#include <linux/interrupt.h>
#include <linux/slab.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/percpu.h> #include <net/dst.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/ip_tunnels.h> #include <net/ip6_tunnel.h> #include <net/dst_metadata.h> #include <net/hotdata.h> #include "xfrm_inout.h" struct xfrm_trans_tasklet { struct work_struct work; spinlock_t queue_lock; struct sk_buff_head queue; }; struct xfrm_trans_cb { union { struct inet_skb_parm h4; #if IS_ENABLED(CONFIG_IPV6) struct inet6_skb_parm h6; #endif } header; int (*finish)(struct net *net, struct sock *sk, struct sk_buff *skb); struct net *net; }; #define XFRM_TRANS_SKB_CB(__skb) ((struct xfrm_trans_cb *)&((__skb)->cb[0])) static DEFINE_SPINLOCK(xfrm_input_afinfo_lock); static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[2][AF_INET6 + 1]; static struct gro_cells gro_cells; static struct net_device *xfrm_napi_dev; static DEFINE_PER_CPU(struct xfrm_trans_tasklet, xfrm_trans_tasklet); int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo) { int err = 0; if (WARN_ON(afinfo->family > AF_INET6)) return -EAFNOSUPPORT; spin_lock_bh(&xfrm_input_afinfo_lock); if (unlikely(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family])) err = -EEXIST; else rcu_assign_pointer(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family], afinfo); spin_unlock_bh(&xfrm_input_afinfo_lock); return err; } EXPORT_SYMBOL(xfrm_input_register_afinfo); int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo) { int err = 0; spin_lock_bh(&xfrm_input_afinfo_lock); if (likely(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family])) { const struct xfrm_input_afinfo *cur; cur = rcu_access_pointer(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family]); if (unlikely(cur != afinfo)) err = -EINVAL; else RCU_INIT_POINTER(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family], NULL); } spin_unlock_bh(&xfrm_input_afinfo_lock); synchronize_rcu(); return err; } EXPORT_SYMBOL(xfrm_input_unregister_afinfo); static const struct xfrm_input_afinfo *xfrm_input_get_afinfo(u8 family, bool is_ipip) { const struct xfrm_input_afinfo *afinfo; if (WARN_ON_ONCE(family > AF_INET6)) return NULL; rcu_read_lock(); afinfo = rcu_dereference(xfrm_input_afinfo[is_ipip][family]); if (unlikely(!afinfo)) rcu_read_unlock(); return afinfo; } static int xfrm_rcv_cb(struct sk_buff *skb, unsigned int family, u8 protocol, int err) { bool is_ipip = (protocol == IPPROTO_IPIP || protocol == IPPROTO_IPV6); const struct xfrm_input_afinfo *afinfo; int ret; afinfo = xfrm_input_get_afinfo(family, is_ipip); if (!afinfo) return -EAFNOSUPPORT; ret = afinfo->callback(skb, protocol, err); rcu_read_unlock(); return ret; } struct sec_path *secpath_set(struct sk_buff *skb) { struct sec_path *sp, *tmp = skb_ext_find(skb, SKB_EXT_SEC_PATH); sp = skb_ext_add(skb, SKB_EXT_SEC_PATH); if (!sp) return NULL; if (tmp) /* reused existing one (was COW'd if needed) */ return sp; /* allocated new secpath */ memset(sp->ovec, 0, sizeof(sp->ovec)); sp->olen = 0; sp->len = 0; sp->verified_cnt = 0; return sp; } EXPORT_SYMBOL(secpath_set); /* Fetch spi and seq from ipsec header */ int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq) { int offset, offset_seq; int hlen; switch (nexthdr) { case IPPROTO_AH: hlen = sizeof(struct ip_auth_hdr); offset = offsetof(struct ip_auth_hdr, spi); offset_seq = offsetof(struct ip_auth_hdr, seq_no); break; case IPPROTO_ESP: hlen = sizeof(struct ip_esp_hdr); offset = offsetof(struct ip_esp_hdr, spi); offset_seq = offsetof(struct ip_esp_hdr, seq_no); break; 
case IPPROTO_COMP: if (!pskb_may_pull(skb, sizeof(struct ip_comp_hdr))) return -EINVAL; *spi = htonl(ntohs(*(__be16 *)(skb_transport_header(skb) + 2))); *seq = 0; return 0; default: return 1; } if (!pskb_may_pull(skb, hlen)) return -EINVAL; *spi = *(__be32 *)(skb_transport_header(skb) + offset); *seq = *(__be32 *)(skb_transport_header(skb) + offset_seq); return 0; } EXPORT_SYMBOL(xfrm_parse_spi); static int xfrm4_remove_beet_encap(struct xfrm_state *x, struct sk_buff *skb) { struct iphdr *iph; int optlen = 0; int err = -EINVAL; skb->protocol = htons(ETH_P_IP); if (unlikely(XFRM_MODE_SKB_CB(skb)->protocol == IPPROTO_BEETPH)) { struct ip_beet_phdr *ph; int phlen; if (!pskb_may_pull(skb, sizeof(*ph))) goto out; ph = (struct ip_beet_phdr *)skb->data; phlen = sizeof(*ph) + ph->padlen; optlen = ph->hdrlen * 8 + (IPV4_BEET_PHMAXLEN - phlen); if (optlen < 0 || optlen & 3 || optlen > 250) goto out; XFRM_MODE_SKB_CB(skb)->protocol = ph->nexthdr; if (!pskb_may_pull(skb, phlen)) goto out; __skb_pull(skb, phlen); } skb_push(skb, sizeof(*iph)); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); xfrm4_beet_make_header(skb); iph = ip_hdr(skb); iph->ihl += optlen / 4; iph->tot_len = htons(skb->len); iph->daddr = x->sel.daddr.a4; iph->saddr = x->sel.saddr.a4; iph->check = 0; iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); err = 0; out: return err; } static void ipip_ecn_decapsulate(struct sk_buff *skb) { struct iphdr *inner_iph = ipip_hdr(skb); if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos)) IP_ECN_set_ce(inner_iph); } static int xfrm4_remove_tunnel_encap(struct xfrm_state *x, struct sk_buff *skb) { int err = -EINVAL; skb->protocol = htons(ETH_P_IP); if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto out; err = skb_unclone(skb, GFP_ATOMIC); if (err) goto out; if (x->props.flags & XFRM_STATE_DECAP_DSCP) ipv4_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, ipip_hdr(skb)); if (!(x->props.flags & XFRM_STATE_NOECN)) ipip_ecn_decapsulate(skb); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); if (skb->mac_len) eth_hdr(skb)->h_proto = skb->protocol; err = 0; out: return err; } static void ipip6_ecn_decapsulate(struct sk_buff *skb) { struct ipv6hdr *inner_iph = ipipv6_hdr(skb); if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos)) IP6_ECN_set_ce(skb, inner_iph); } static int xfrm6_remove_tunnel_encap(struct xfrm_state *x, struct sk_buff *skb) { int err = -EINVAL; skb->protocol = htons(ETH_P_IPV6); if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto out; err = skb_unclone(skb, GFP_ATOMIC); if (err) goto out; if (x->props.flags & XFRM_STATE_DECAP_DSCP) ipv6_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, ipipv6_hdr(skb)); if (!(x->props.flags & XFRM_STATE_NOECN)) ipip6_ecn_decapsulate(skb); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); if (skb->mac_len) eth_hdr(skb)->h_proto = skb->protocol; err = 0; out: return err; } static int xfrm6_remove_beet_encap(struct xfrm_state *x, struct sk_buff *skb) { struct ipv6hdr *ip6h; int size = sizeof(struct ipv6hdr); int err; skb->protocol = htons(ETH_P_IPV6); err = skb_cow_head(skb, size + skb->mac_len); if (err) goto out; __skb_push(skb, size); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); xfrm6_beet_make_header(skb); ip6h = ipv6_hdr(skb); ip6h->payload_len = htons(skb->len - size); ip6h->daddr = x->sel.daddr.in6; ip6h->saddr = x->sel.saddr.in6; err = 0; out: return err; } /* Remove encapsulation header. * * The IP header will be moved over the top of the encapsulation * header. 
* * On entry, the transport header shall point to where the IP header * should be and the network header shall be set to where the IP * header currently is. skb->data shall point to the start of the * payload. */ static int xfrm_inner_mode_encap_remove(struct xfrm_state *x, struct sk_buff *skb) { switch (x->props.mode) { case XFRM_MODE_BEET: switch (x->sel.family) { case AF_INET: return xfrm4_remove_beet_encap(x, skb); case AF_INET6: return xfrm6_remove_beet_encap(x, skb); } break; case XFRM_MODE_TUNNEL: switch (XFRM_MODE_SKB_CB(skb)->protocol) { case IPPROTO_IPIP: return xfrm4_remove_tunnel_encap(x, skb); case IPPROTO_IPV6: return xfrm6_remove_tunnel_encap(x, skb); break; } return -EINVAL; } WARN_ON_ONCE(1); return -EOPNOTSUPP; } static int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb) { switch (x->props.family) { case AF_INET: xfrm4_extract_header(skb); break; case AF_INET6: xfrm6_extract_header(skb); break; default: WARN_ON_ONCE(1); return -EAFNOSUPPORT; } return xfrm_inner_mode_encap_remove(x, skb); } /* Remove encapsulation header. * * The IP header will be moved over the top of the encapsulation header. * * On entry, skb_transport_header() shall point to where the IP header * should be and skb_network_header() shall be set to where the IP header * currently is. skb->data shall point to the start of the payload. */ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) { struct xfrm_offload *xo = xfrm_offload(skb); int ihl = skb->data - skb_transport_header(skb); if (skb->transport_header != skb->network_header) { memmove(skb_transport_header(skb), skb_network_header(skb), ihl); if (xo) xo->orig_mac_len = skb_mac_header_was_set(skb) ? skb_mac_header_len(skb) : 0; skb->network_header = skb->transport_header; } ip_hdr(skb)->tot_len = htons(skb->len + ihl); skb_reset_transport_header(skb); return 0; } static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IPV6) struct xfrm_offload *xo = xfrm_offload(skb); int ihl = skb->data - skb_transport_header(skb); if (skb->transport_header != skb->network_header) { memmove(skb_transport_header(skb), skb_network_header(skb), ihl); if (xo) xo->orig_mac_len = skb_mac_header_was_set(skb) ? skb_mac_header_len(skb) : 0; skb->network_header = skb->transport_header; } ipv6_hdr(skb)->payload_len = htons(skb->len + ihl - sizeof(struct ipv6hdr)); skb_reset_transport_header(skb); return 0; #else WARN_ON_ONCE(1); return -EAFNOSUPPORT; #endif } static int xfrm_inner_mode_input(struct xfrm_state *x, struct sk_buff *skb) { switch (x->props.mode) { case XFRM_MODE_BEET: case XFRM_MODE_TUNNEL: return xfrm_prepare_input(x, skb); case XFRM_MODE_TRANSPORT: if (x->props.family == AF_INET) return xfrm4_transport_input(x, skb); if (x->props.family == AF_INET6) return xfrm6_transport_input(x, skb); break; case XFRM_MODE_ROUTEOPTIMIZATION: WARN_ON_ONCE(1); break; default: if (x->mode_cbs && x->mode_cbs->input) return x->mode_cbs->input(x, skb); WARN_ON_ONCE(1); break; } return -EOPNOTSUPP; } /* NOTE: encap_type - In addition to the normal (non-negative) values for * encap_type, a negative value of -1 or -2 can be used to resume/restart this * function after a previous invocation early terminated for async operation. 
*/ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { const struct xfrm_state_afinfo *afinfo; struct net *net = dev_net(skb->dev); int err; __be32 seq; __be32 seq_hi; struct xfrm_state *x = NULL; xfrm_address_t *daddr; u32 mark = skb->mark; unsigned int family = AF_UNSPEC; int decaps = 0; int async = 0; bool xfrm_gro = false; bool crypto_done = false; struct xfrm_offload *xo = xfrm_offload(skb); struct sec_path *sp; if (encap_type < 0 || (xo && (xo->flags & XFRM_GRO || encap_type == 0 || encap_type == UDP_ENCAP_ESPINUDP))) { x = xfrm_input_state(skb); if (unlikely(x->km.state != XFRM_STATE_VALID)) { if (x->km.state == XFRM_STATE_ACQ) XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR); else XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEINVALID); if (encap_type == -1) dev_put(skb->dev); goto drop; } family = x->props.family; /* An encap_type of -2 indicates reconstructed inner packet */ if (encap_type == -2) goto resume_decapped; /* An encap_type of -1 indicates async resumption. */ if (encap_type == -1) { async = 1; seq = XFRM_SKB_CB(skb)->seq.input.low; spin_lock(&x->lock); goto resume; } /* GRO call */ seq = XFRM_SPI_SKB_CB(skb)->seq; if (xo && (xo->flags & CRYPTO_DONE)) { crypto_done = true; family = XFRM_SPI_SKB_CB(skb)->family; if (!(xo->status & CRYPTO_SUCCESS)) { if (xo->status & (CRYPTO_TRANSPORT_AH_AUTH_FAILED | CRYPTO_TRANSPORT_ESP_AUTH_FAILED | CRYPTO_TUNNEL_AH_AUTH_FAILED | CRYPTO_TUNNEL_ESP_AUTH_FAILED)) { xfrm_audit_state_icvfail(x, skb, x->type->proto); x->stats.integrity_failed++; XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEPROTOERROR); goto drop; } if (xo->status & CRYPTO_INVALID_PROTOCOL) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEPROTOERROR); goto drop; } XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); goto drop; } if (xfrm_parse_spi(skb, nexthdr, &spi, &seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); goto drop; } nexthdr = x->type_offload->input_tail(x, skb); } goto process; } family = XFRM_SPI_SKB_CB(skb)->family; /* if tunnel is present override skb->mark value with tunnel i_key */ switch (family) { case AF_INET: if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4) mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4->parms.i_key); break; case AF_INET6: if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6) mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6->parms.i_key); break; } sp = secpath_set(skb); if (!sp) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR); goto drop; } seq = 0; if (!spi && xfrm_parse_spi(skb, nexthdr, &spi, &seq)) { secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); goto drop; } daddr = (xfrm_address_t *)(skb_network_header(skb) + XFRM_SPI_SKB_CB(skb)->daddroff); do { sp = skb_sec_path(skb); if (sp->len == XFRM_MAX_DEPTH) { secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); goto drop; } x = xfrm_input_state_lookup(net, mark, daddr, spi, nexthdr, family); if (x == NULL) { secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES); xfrm_audit_state_notfound(skb, family, spi, seq); goto drop; } if (unlikely(x->dir && x->dir != XFRM_SA_DIR_IN)) { secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEDIRERROR); xfrm_audit_state_notfound(skb, family, spi, seq); xfrm_state_put(x); x = NULL; goto drop; } skb->mark = xfrm_smark_get(skb->mark, x); sp->xvec[sp->len++] = x; skb_dst_force(skb); if (!skb_dst(skb)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR); goto drop; } process: seq_hi = htonl(xfrm_replay_seqhi(x, seq)); XFRM_SKB_CB(skb)->seq.input.low = seq; XFRM_SKB_CB(skb)->seq.input.hi = seq_hi; 
spin_lock(&x->lock); if (unlikely(x->km.state != XFRM_STATE_VALID)) { if (x->km.state == XFRM_STATE_ACQ) XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR); else XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEINVALID); goto drop_unlock; } if ((x->encap ? x->encap->encap_type : 0) != encap_type) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH); goto drop_unlock; } if (xfrm_replay_check(x, skb, seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR); goto drop_unlock; } if (xfrm_state_check_expire(x)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEEXPIRED); goto drop_unlock; } if (xfrm_tunnel_check(skb, x, family)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR); goto drop_unlock; } if (!crypto_done) { spin_unlock(&x->lock); dev_hold(skb->dev); nexthdr = x->type->input(x, skb); if (nexthdr == -EINPROGRESS) { if (async) dev_put(skb->dev); return 0; } dev_put(skb->dev); spin_lock(&x->lock); } resume: if (nexthdr < 0) { if (nexthdr == -EBADMSG) { xfrm_audit_state_icvfail(x, skb, x->type->proto); x->stats.integrity_failed++; } XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEPROTOERROR); goto drop_unlock; } /* only the first xfrm gets the encap type */ encap_type = 0; if (!crypto_done && xfrm_replay_recheck(x, skb, seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR); goto drop_unlock; } xfrm_replay_advance(x, seq); x->curlft.bytes += skb->len; x->curlft.packets++; x->lastused = ktime_get_real_seconds(); spin_unlock(&x->lock); XFRM_MODE_SKB_CB(skb)->protocol = nexthdr; err = xfrm_inner_mode_input(x, skb); if (err == -EINPROGRESS) { if (async) dev_put(skb->dev); return 0; } else if (err) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR); goto drop; } resume_decapped: if (x->outer_mode.flags & XFRM_MODE_FLAG_TUNNEL) { decaps = 1; break; } /* * We need the inner address. However, we only get here for * transport mode so the outer address is identical. */ daddr = &x->id.daddr; family = x->props.family; err = xfrm_parse_spi(skb, nexthdr, &spi, &seq); if (err < 0) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); goto drop; } crypto_done = false; } while (!err); err = xfrm_rcv_cb(skb, family, x->type->proto, 0); if (err) goto drop; nf_reset_ct(skb); if (decaps) { sp = skb_sec_path(skb); if (sp) sp->olen = 0; if (skb_valid_dst(skb)) skb_dst_drop(skb); if (async) dev_put(skb->dev); gro_cells_receive(&gro_cells, skb); return 0; } else { xo = xfrm_offload(skb); if (xo) xfrm_gro = xo->flags & XFRM_GRO; err = -EAFNOSUPPORT; rcu_read_lock(); afinfo = xfrm_state_afinfo_get_rcu(x->props.family); if (likely(afinfo)) err = afinfo->transport_finish(skb, xfrm_gro || async); rcu_read_unlock(); if (xfrm_gro) { sp = skb_sec_path(skb); if (sp) sp->olen = 0; if (skb_valid_dst(skb)) skb_dst_drop(skb); if (async) dev_put(skb->dev); gro_cells_receive(&gro_cells, skb); return err; } return err; } drop_unlock: spin_unlock(&x->lock); drop: if (async) dev_put(skb->dev); xfrm_rcv_cb(skb, family, x && x->type ? 
x->type->proto : nexthdr, -1); kfree_skb(skb); return 0; } EXPORT_SYMBOL(xfrm_input); int xfrm_input_resume(struct sk_buff *skb, int nexthdr) { return xfrm_input(skb, nexthdr, 0, -1); } EXPORT_SYMBOL(xfrm_input_resume); static void xfrm_trans_reinject(struct work_struct *work) { struct xfrm_trans_tasklet *trans = container_of(work, struct xfrm_trans_tasklet, work); struct sk_buff_head queue; struct sk_buff *skb; __skb_queue_head_init(&queue); spin_lock_bh(&trans->queue_lock); skb_queue_splice_init(&trans->queue, &queue); spin_unlock_bh(&trans->queue_lock); local_bh_disable(); while ((skb = __skb_dequeue(&queue))) XFRM_TRANS_SKB_CB(skb)->finish(XFRM_TRANS_SKB_CB(skb)->net, NULL, skb); local_bh_enable(); } int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb, int (*finish)(struct net *, struct sock *, struct sk_buff *)) { struct xfrm_trans_tasklet *trans; trans = this_cpu_ptr(&xfrm_trans_tasklet); if (skb_queue_len(&trans->queue) >= READ_ONCE(net_hotdata.max_backlog)) return -ENOBUFS; BUILD_BUG_ON(sizeof(struct xfrm_trans_cb) > sizeof(skb->cb)); XFRM_TRANS_SKB_CB(skb)->finish = finish; XFRM_TRANS_SKB_CB(skb)->net = net; spin_lock_bh(&trans->queue_lock); __skb_queue_tail(&trans->queue, skb); spin_unlock_bh(&trans->queue_lock); schedule_work(&trans->work); return 0; } EXPORT_SYMBOL(xfrm_trans_queue_net); int xfrm_trans_queue(struct sk_buff *skb, int (*finish)(struct net *, struct sock *, struct sk_buff *)) { return xfrm_trans_queue_net(dev_net(skb->dev), skb, finish); } EXPORT_SYMBOL(xfrm_trans_queue); void __init xfrm_input_init(void) { int err; int i; xfrm_napi_dev = alloc_netdev_dummy(0); if (!xfrm_napi_dev) panic("Failed to allocate XFRM dummy netdev\n"); err = gro_cells_init(&gro_cells, xfrm_napi_dev); if (err) gro_cells.cells = NULL; for_each_possible_cpu(i) { struct xfrm_trans_tasklet *trans; trans = &per_cpu(xfrm_trans_tasklet, i); spin_lock_init(&trans->queue_lock); __skb_queue_head_init(&trans->queue); INIT_WORK(&trans->work, xfrm_trans_reinject); } } |
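/*
 * Editor's illustrative sketch -- not part of the file above or the header
 * below. It shows how a receive path can defer the tail of its processing
 * through the per-CPU xfrm trans queue defined above, so the finish step runs
 * later from the work item instead of the current context. The names
 * prefixed with "example_" are hypothetical, and netif_rx() is only one
 * possible final delivery step.
 */
static int example_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	/* Invoked later by xfrm_trans_reinject() with BHs disabled */
	return netif_rx(skb);
}

static int example_defer_rx(struct sk_buff *skb)
{
	/* Returns -ENOBUFS if the per-CPU queue already holds max_backlog skbs */
	return xfrm_trans_queue(skb, example_finish);
}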
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_VIRTIO_NET_H
#define _LINUX_VIRTIO_NET_H

#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/udp.h>
#include <uapi/linux/tcp.h>
#include <uapi/linux/virtio_net.h>

static inline bool virtio_net_hdr_match_proto(__be16 protocol, __u8 gso_type)
{
	switch (gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
	case VIRTIO_NET_HDR_GSO_TCPV4:
		return protocol == cpu_to_be16(ETH_P_IP);
	case VIRTIO_NET_HDR_GSO_TCPV6:
		return protocol == cpu_to_be16(ETH_P_IPV6);
	case VIRTIO_NET_HDR_GSO_UDP:
	case VIRTIO_NET_HDR_GSO_UDP_L4:
		return protocol == cpu_to_be16(ETH_P_IP) ||
		       protocol == cpu_to_be16(ETH_P_IPV6);
	default:
		return false;
	}
}

static inline int virtio_net_hdr_set_proto(struct sk_buff *skb,
					   const struct virtio_net_hdr *hdr)
{
	if (skb->protocol)
		return 0;

	switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
	case VIRTIO_NET_HDR_GSO_TCPV4:
	case VIRTIO_NET_HDR_GSO_UDP:
	case VIRTIO_NET_HDR_GSO_UDP_L4:
		skb->protocol = cpu_to_be16(ETH_P_IP);
		break;
	case VIRTIO_NET_HDR_GSO_TCPV6:
		skb->protocol = cpu_to_be16(ETH_P_IPV6);
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

static inline int __virtio_net_hdr_to_skb(struct sk_buff *skb,
					  const struct virtio_net_hdr *hdr,
					  bool little_endian, u8 hdr_gso_type)
{
	unsigned int nh_min_len = sizeof(struct iphdr);
	unsigned int gso_type = 0;
	unsigned int thlen = 0;
	unsigned int p_off = 0;
	unsigned int ip_proto;

	if (hdr_gso_type != VIRTIO_NET_HDR_GSO_NONE) {
		switch (hdr_gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
		case VIRTIO_NET_HDR_GSO_TCPV4:
			gso_type = SKB_GSO_TCPV4;
			ip_proto = IPPROTO_TCP;
			thlen = sizeof(struct tcphdr);
			break;
		case VIRTIO_NET_HDR_GSO_TCPV6:
			gso_type = SKB_GSO_TCPV6;
			ip_proto = IPPROTO_TCP;
			thlen =
sizeof(struct tcphdr); nh_min_len = sizeof(struct ipv6hdr); break; case VIRTIO_NET_HDR_GSO_UDP: gso_type = SKB_GSO_UDP; ip_proto = IPPROTO_UDP; thlen = sizeof(struct udphdr); break; case VIRTIO_NET_HDR_GSO_UDP_L4: gso_type = SKB_GSO_UDP_L4; ip_proto = IPPROTO_UDP; thlen = sizeof(struct udphdr); break; default: return -EINVAL; } if (hdr_gso_type & VIRTIO_NET_HDR_GSO_ECN) gso_type |= SKB_GSO_TCP_ECN; if (hdr->gso_size == 0) return -EINVAL; } skb_reset_mac_header(skb); if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { u32 start = __virtio16_to_cpu(little_endian, hdr->csum_start); u32 off = __virtio16_to_cpu(little_endian, hdr->csum_offset); u32 needed = start + max_t(u32, thlen, off + sizeof(__sum16)); if (!pskb_may_pull(skb, needed)) return -EINVAL; if (!skb_partial_csum_set(skb, start, off)) return -EINVAL; if (skb_transport_offset(skb) < nh_min_len) return -EINVAL; nh_min_len = skb_transport_offset(skb); p_off = nh_min_len + thlen; if (!pskb_may_pull(skb, p_off)) return -EINVAL; } else { /* gso packets without NEEDS_CSUM do not set transport_offset. * probe and drop if does not match one of the above types. */ if (gso_type && skb->network_header) { struct flow_keys_basic keys; if (!skb->protocol) { __be16 protocol = dev_parse_header_protocol(skb); if (!protocol) virtio_net_hdr_set_proto(skb, hdr); else if (!virtio_net_hdr_match_proto(protocol, hdr_gso_type)) return -EINVAL; else skb->protocol = protocol; } retry: if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, NULL, 0, 0, 0, 0)) { /* UFO does not specify ipv4 or 6: try both */ if (gso_type & SKB_GSO_UDP && skb->protocol == htons(ETH_P_IP)) { skb->protocol = htons(ETH_P_IPV6); goto retry; } return -EINVAL; } p_off = keys.control.thoff + thlen; if (!pskb_may_pull(skb, p_off) || keys.basic.ip_proto != ip_proto) return -EINVAL; skb_set_transport_header(skb, keys.control.thoff); } else if (gso_type) { p_off = nh_min_len + thlen; if (!pskb_may_pull(skb, p_off)) return -EINVAL; } } if (hdr_gso_type != VIRTIO_NET_HDR_GSO_NONE) { u16 gso_size = __virtio16_to_cpu(little_endian, hdr->gso_size); unsigned int nh_off = p_off; struct skb_shared_info *shinfo = skb_shinfo(skb); switch (gso_type & ~SKB_GSO_TCP_ECN) { case SKB_GSO_UDP: /* UFO may not include transport header in gso_size. */ nh_off -= thlen; break; case SKB_GSO_UDP_L4: if (!(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) return -EINVAL; if (skb->csum_offset != offsetof(struct udphdr, check)) return -EINVAL; if (skb->len - p_off > gso_size * UDP_MAX_SEGMENTS) return -EINVAL; if (gso_type != SKB_GSO_UDP_L4) return -EINVAL; break; case SKB_GSO_TCPV4: case SKB_GSO_TCPV6: if (skb->ip_summed == CHECKSUM_PARTIAL && skb->csum_offset != offsetof(struct tcphdr, check)) return -EINVAL; break; } /* Kernel has a special handling for GSO_BY_FRAGS. */ if (gso_size == GSO_BY_FRAGS) return -EINVAL; /* Too small packets are not really GSO ones. */ if (skb->len - nh_off > gso_size) { shinfo->gso_size = gso_size; shinfo->gso_type = gso_type; /* Header must be checked, and gso_segs computed. */ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; } } return 0; } static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, const struct virtio_net_hdr *hdr, bool little_endian) { return __virtio_net_hdr_to_skb(skb, hdr, little_endian, hdr->gso_type); } /* This function must be called after virtio_net_hdr_from_skb(). 
*/ static inline void __virtio_net_set_hdrlen(const struct sk_buff *skb, struct virtio_net_hdr *hdr, bool little_endian) { u16 hdr_len; hdr_len = skb_transport_offset(skb); if (hdr->gso_type == VIRTIO_NET_HDR_GSO_UDP_L4) hdr_len += sizeof(struct udphdr); else hdr_len += tcp_hdrlen(skb); hdr->hdr_len = __cpu_to_virtio16(little_endian, hdr_len); } /* This function must be called after virtio_net_hdr_from_skb(). */ static inline void __virtio_net_set_tnl_hdrlen(const struct sk_buff *skb, struct virtio_net_hdr *hdr) { u16 hdr_len; hdr_len = skb_inner_transport_offset(skb); if (hdr->gso_type == VIRTIO_NET_HDR_GSO_UDP_L4) hdr_len += sizeof(struct udphdr); else hdr_len += inner_tcp_hdrlen(skb); hdr->hdr_len = __cpu_to_virtio16(true, hdr_len); } static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, struct virtio_net_hdr *hdr, bool little_endian, bool has_data_valid, int vlan_hlen) { memset(hdr, 0, sizeof(*hdr)); /* no info leak */ if (skb_is_gso(skb)) { struct skb_shared_info *sinfo = skb_shinfo(skb); /* This is a hint as to how much should be linear. */ hdr->hdr_len = __cpu_to_virtio16(little_endian, skb_headlen(skb)); hdr->gso_size = __cpu_to_virtio16(little_endian, sinfo->gso_size); if (sinfo->gso_type & SKB_GSO_TCPV4) hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; else if (sinfo->gso_type & SKB_GSO_TCPV6) hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; else if (sinfo->gso_type & SKB_GSO_UDP_L4) hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP_L4; else return -EINVAL; if (sinfo->gso_type & SKB_GSO_TCP_ECN) hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN; } else hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; if (skb->ip_summed == CHECKSUM_PARTIAL) { hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; hdr->csum_start = __cpu_to_virtio16(little_endian, skb_checksum_start_offset(skb) + vlan_hlen); hdr->csum_offset = __cpu_to_virtio16(little_endian, skb->csum_offset); } else if (has_data_valid && skb->ip_summed == CHECKSUM_UNNECESSARY) { hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID; } /* else everything is zero */ return 0; } static inline unsigned int virtio_l3min(bool is_ipv6) { return is_ipv6 ? sizeof(struct ipv6hdr) : sizeof(struct iphdr); } static inline int virtio_net_hdr_tnl_to_skb(struct sk_buff *skb, const struct virtio_net_hdr_v1_hash_tunnel *vhdr, bool tnl_hdr_negotiated, bool tnl_csum_negotiated, bool little_endian) { const struct virtio_net_hdr *hdr = (const struct virtio_net_hdr *)vhdr; unsigned int inner_nh, outer_th, inner_th; unsigned int inner_l3min, outer_l3min; u8 gso_inner_type, gso_tunnel_type; bool outer_isv6, inner_isv6; int ret; gso_tunnel_type = hdr->gso_type & VIRTIO_NET_HDR_GSO_UDP_TUNNEL; if (!gso_tunnel_type) return virtio_net_hdr_to_skb(skb, hdr, little_endian); /* Tunnel not supported/negotiated, but the hdr asks for it. */ if (!tnl_hdr_negotiated) return -EINVAL; /* Either ipv4 or ipv6. */ if (gso_tunnel_type == VIRTIO_NET_HDR_GSO_UDP_TUNNEL) return -EINVAL; /* The UDP tunnel must carry a GSO packet, but no UFO. */ gso_inner_type = hdr->gso_type & ~(VIRTIO_NET_HDR_GSO_ECN | VIRTIO_NET_HDR_GSO_UDP_TUNNEL); if (!gso_inner_type || gso_inner_type == VIRTIO_NET_HDR_GSO_UDP) return -EINVAL; /* Rely on csum being present. */ if (!(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) return -EINVAL; /* Validate offsets. 
*/ outer_isv6 = gso_tunnel_type & VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6; inner_isv6 = gso_inner_type == VIRTIO_NET_HDR_GSO_TCPV6; inner_l3min = virtio_l3min(inner_isv6); outer_l3min = ETH_HLEN + virtio_l3min(outer_isv6); inner_th = __virtio16_to_cpu(little_endian, hdr->csum_start); inner_nh = le16_to_cpu(vhdr->inner_nh_offset); outer_th = le16_to_cpu(vhdr->outer_th_offset); if (outer_th < outer_l3min || inner_nh < outer_th + sizeof(struct udphdr) || inner_th < inner_nh + inner_l3min) return -EINVAL; /* Let the basic parsing deal with plain GSO features. */ ret = __virtio_net_hdr_to_skb(skb, hdr, true, hdr->gso_type & ~gso_tunnel_type); if (ret) return ret; /* In case of USO, the inner protocol is still unknown and * `inner_isv6` is just a guess, additional parsing is needed. * The previous validation ensures that accessing an ipv4 inner * network header is safe. */ if (gso_inner_type == VIRTIO_NET_HDR_GSO_UDP_L4) { struct iphdr *iphdr = (struct iphdr *)(skb->data + inner_nh); inner_isv6 = iphdr->version == 6; inner_l3min = virtio_l3min(inner_isv6); if (inner_th < inner_nh + inner_l3min) return -EINVAL; } skb_set_inner_protocol(skb, inner_isv6 ? htons(ETH_P_IPV6) : htons(ETH_P_IP)); if (hdr->flags & VIRTIO_NET_HDR_F_UDP_TUNNEL_CSUM) { if (!tnl_csum_negotiated) return -EINVAL; skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; } else { skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL; } skb->inner_transport_header = inner_th + skb_headroom(skb); skb->inner_network_header = inner_nh + skb_headroom(skb); skb->inner_mac_header = inner_nh + skb_headroom(skb); skb->transport_header = outer_th + skb_headroom(skb); skb->encapsulation = 1; return 0; } /* Checksum-related fields validation for the driver */ static inline int virtio_net_handle_csum_offload(struct sk_buff *skb, struct virtio_net_hdr *hdr, bool tnl_csum_negotiated) { if (!(hdr->gso_type & VIRTIO_NET_HDR_GSO_UDP_TUNNEL)) { if (!(hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID)) return 0; skb->ip_summed = CHECKSUM_UNNECESSARY; if (!(hdr->flags & VIRTIO_NET_HDR_F_UDP_TUNNEL_CSUM)) return 0; /* tunnel csum packets are invalid when the related * feature has not been negotiated */ if (!tnl_csum_negotiated) return -EINVAL; skb->csum_level = 1; return 0; } /* DATA_VALID is mutually exclusive with NEEDS_CSUM, and GSO * over UDP tunnel requires the latter */ if (hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID) return -EINVAL; return 0; } /* * vlan_hlen always refers to the outermost MAC header. That also * means it refers to the only MAC header, if the packet does not carry * any encapsulation. */ static inline int virtio_net_hdr_tnl_from_skb(const struct sk_buff *skb, struct virtio_net_hdr_v1_hash_tunnel *vhdr, bool tnl_hdr_negotiated, bool little_endian, int vlan_hlen, bool has_data_valid, bool feature_hdrlen) { struct virtio_net_hdr *hdr = (struct virtio_net_hdr *)vhdr; unsigned int inner_nh, outer_th; int tnl_gso_type; int ret; tnl_gso_type = skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM); if (!tnl_gso_type) { ret = virtio_net_hdr_from_skb(skb, hdr, little_endian, has_data_valid, vlan_hlen); if (ret) return ret; if (feature_hdrlen && hdr->hdr_len) __virtio_net_set_hdrlen(skb, hdr, little_endian); return ret; } /* Tunnel support not negotiated but skb ask for it. */ if (!tnl_hdr_negotiated) return -EINVAL; vhdr->hash_hdr.hash_value_lo = 0; vhdr->hash_hdr.hash_value_hi = 0; vhdr->hash_hdr.hash_report = 0; vhdr->hash_hdr.padding = 0; /* Let the basic parsing deal with plain GSO features. 
*/ skb_shinfo(skb)->gso_type &= ~tnl_gso_type; ret = virtio_net_hdr_from_skb(skb, hdr, true, false, vlan_hlen); skb_shinfo(skb)->gso_type |= tnl_gso_type; if (ret) return ret; if (feature_hdrlen && hdr->hdr_len) __virtio_net_set_tnl_hdrlen(skb, hdr); if (skb->protocol == htons(ETH_P_IPV6)) hdr->gso_type |= VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6; else hdr->gso_type |= VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV4; if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM) hdr->flags |= VIRTIO_NET_HDR_F_UDP_TUNNEL_CSUM; inner_nh = skb->inner_network_header - skb_headroom(skb); outer_th = skb->transport_header - skb_headroom(skb); vhdr->inner_nh_offset = cpu_to_le16(inner_nh); vhdr->outer_th_offset = cpu_to_le16(outer_th); return 0; } #endif /* _LINUX_VIRTIO_NET_H */ |
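/*
 * Editor's illustrative sketch -- not part of the header above or the file
 * below. It shows the two basic directions of the virtio_net_hdr helpers
 * above: applying a received header to an skb, and building a header for an
 * outgoing skb. The "example_" function names are hypothetical; the device
 * is assumed to use little-endian headers (VIRTIO_F_VERSION_1 negotiated)
 * and no VLAN acceleration, hence vlan_hlen == 0.
 */
static int example_rx_apply_hdr(struct sk_buff *skb,
				const struct virtio_net_hdr *hdr)
{
	/* Validates GSO type/size and checksum offsets, fills skb_shinfo() */
	return virtio_net_hdr_to_skb(skb, hdr, true);
}

static int example_tx_build_hdr(const struct sk_buff *skb,
				struct virtio_net_hdr *hdr)
{
	/* No DATA_VALID reporting, no VLAN header accounted for */
	return virtio_net_hdr_from_skb(skb, hdr, true, false, 0);
}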
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/mmap.c
 *
 * Written by obz.
 *
 * Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/capability.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <linux/profile.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/mmdebug.h>
#include <linux/perf_event.h>
#include <linux/audit.h>
#include <linux/khugepaged.h>
#include <linux/uprobes.h>
#include <linux/notifier.h>
#include <linux/memory.h>
#include <linux/printk.h>
#include <linux/userfaultfd_k.h>
#include <linux/moduleparam.h>
#include <linux/pkeys.h>
#include <linux/oom.h>
#include <linux/sched/mm.h>
#include <linux/ksm.h>
#include <linux/memfd.h>

#include <linux/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>

#define CREATE_TRACE_POINTS
#include <trace/events/mmap.h>

#include "internal.h"

#ifndef arch_mmap_check
#define arch_mmap_check(addr, len, flags)	(0)
#endif

#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
int mmap_rnd_bits_max __ro_after_init = CONFIG_ARCH_MMAP_RND_BITS_MAX;
int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS;
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN;
const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
#endif

static bool ignore_rlimit_data;
core_param(ignore_rlimit_data,
ignore_rlimit_data, bool, 0644); /* Update vma->vm_page_prot to reflect vma->vm_flags. */ void vma_set_page_prot(struct vm_area_struct *vma) { vm_flags_t vm_flags = vma->vm_flags; pgprot_t vm_page_prot; vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags); if (vma_wants_writenotify(vma, vm_page_prot)) { vm_flags &= ~VM_SHARED; vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags); } /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */ WRITE_ONCE(vma->vm_page_prot, vm_page_prot); } /* * check_brk_limits() - Use platform specific check of range & verify mlock * limits. * @addr: The address to check * @len: The size of increase. * * Return: 0 on success. */ static int check_brk_limits(unsigned long addr, unsigned long len) { unsigned long mapped_addr; mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); if (IS_ERR_VALUE(mapped_addr)) return mapped_addr; return mlock_future_ok(current->mm, current->mm->def_flags & VM_LOCKED, len) ? 0 : -EAGAIN; } SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long newbrk, oldbrk, origbrk; struct mm_struct *mm = current->mm; struct vm_area_struct *brkvma, *next = NULL; unsigned long min_brk; bool populate = false; LIST_HEAD(uf); struct vma_iterator vmi; if (mmap_write_lock_killable(mm)) return -EINTR; origbrk = mm->brk; min_brk = mm->start_brk; #ifdef CONFIG_COMPAT_BRK /* * CONFIG_COMPAT_BRK can still be overridden by setting * randomize_va_space to 2, which will still cause mm->start_brk * to be arbitrarily shifted */ if (!current->brk_randomized) min_brk = mm->end_data; #endif if (brk < min_brk) goto out; /* * Check against rlimit here. If this check is done later after the test * of oldbrk with newbrk then it can escape the test and let the data * segment grow beyond its set limit the in case where the limit is * not page aligned -Ram Gupta */ if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk, mm->end_data, mm->start_data)) goto out; newbrk = PAGE_ALIGN(brk); oldbrk = PAGE_ALIGN(mm->brk); if (oldbrk == newbrk) { mm->brk = brk; goto success; } /* Always allow shrinking brk. */ if (brk <= mm->brk) { /* Search one past newbrk */ vma_iter_init(&vmi, mm, newbrk); brkvma = vma_find(&vmi, oldbrk); if (!brkvma || brkvma->vm_start >= oldbrk) goto out; /* mapping intersects with an existing non-brk vma. */ /* * mm->brk must be protected by write mmap_lock. * do_vmi_align_munmap() will drop the lock on success, so * update it before calling do_vma_munmap(). */ mm->brk = brk; if (do_vmi_align_munmap(&vmi, brkvma, mm, newbrk, oldbrk, &uf, /* unlock = */ true)) goto out; goto success_unlocked; } if (check_brk_limits(oldbrk, newbrk - oldbrk)) goto out; /* * Only check if the next VMA is within the stack_guard_gap of the * expansion area */ vma_iter_init(&vmi, mm, oldbrk); next = vma_find(&vmi, newbrk + PAGE_SIZE + stack_guard_gap); if (next && newbrk + PAGE_SIZE > vm_start_gap(next)) goto out; brkvma = vma_prev_limit(&vmi, mm->start_brk); /* Ok, looks good - let it rip. 
*/ if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, EMPTY_VMA_FLAGS) < 0) goto out; mm->brk = brk; if (mm->def_flags & VM_LOCKED) populate = true; success: mmap_write_unlock(mm); success_unlocked: userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(oldbrk, newbrk - oldbrk); return brk; out: mm->brk = origbrk; mmap_write_unlock(mm); return origbrk; } /* * If a hint addr is less than mmap_min_addr change hint to be as * low as possible but still greater than mmap_min_addr */ static inline unsigned long round_hint_to_min(unsigned long hint) { hint &= PAGE_MASK; if (((void *)hint != NULL) && (hint < mmap_min_addr)) return PAGE_ALIGN(mmap_min_addr); return hint; } bool mlock_future_ok(const struct mm_struct *mm, bool is_vma_locked, unsigned long bytes) { unsigned long locked_pages, limit_pages; if (!is_vma_locked || capable(CAP_IPC_LOCK)) return true; locked_pages = bytes >> PAGE_SHIFT; locked_pages += mm->locked_vm; limit_pages = rlimit(RLIMIT_MEMLOCK); limit_pages >>= PAGE_SHIFT; return locked_pages <= limit_pages; } static inline u64 file_mmap_size_max(struct file *file, struct inode *inode) { if (S_ISREG(inode->i_mode)) return MAX_LFS_FILESIZE; if (S_ISBLK(inode->i_mode)) return MAX_LFS_FILESIZE; if (S_ISSOCK(inode->i_mode)) return MAX_LFS_FILESIZE; /* Special "we do even unsigned file positions" case */ if (file->f_op->fop_flags & FOP_UNSIGNED_OFFSET) return 0; /* Yes, random drivers might want more. But I'm tired of buggy drivers */ return ULONG_MAX; } static inline bool file_mmap_ok(struct file *file, struct inode *inode, unsigned long pgoff, unsigned long len) { u64 maxsize = file_mmap_size_max(file, inode); if (maxsize && len > maxsize) return false; maxsize -= len; if (pgoff > maxsize >> PAGE_SHIFT) return false; return true; } /** * do_mmap() - Perform a userland memory mapping into the current process * address space of length @len with protection bits @prot, mmap flags @flags * (from which VMA flags will be inferred), and any additional VMA flags to * apply @vm_flags. If this is a file-backed mapping then the file is specified * in @file and page offset into the file via @pgoff. * * This function does not perform security checks on the file and assumes, if * @uf is non-NULL, the caller has provided a list head to track unmap events * for userfaultfd @uf. * * It also simply indicates whether memory population is required by setting * @populate, which must be non-NULL, expecting the caller to actually perform * this task itself if appropriate. * * This function will invoke architecture-specific (and if provided and * relevant, file system-specific) logic to determine the most appropriate * unmapped area in which to place the mapping if not MAP_FIXED. * * Callers which require userland mmap() behaviour should invoke vm_mmap(), * which is also exported for module use. * * Those which require this behaviour less security checks, userfaultfd and * populate behaviour, and who handle the mmap write lock themselves, should * call this function. * * Note that the returned address may reside within a merged VMA if an * appropriate merge were to take place, so it doesn't necessarily specify the * start of a VMA, rather only the start of a valid mapped range of length * @len bytes, rounded down to the nearest page size. * * The caller must write-lock current->mm->mmap_lock. * * @file: An optional struct file pointer describing the file which is to be * mapped, if a file-backed mapping. 
* @addr: If non-zero, hints at (or if @flags has MAP_FIXED set, specifies) the * address at which to perform this mapping. See mmap (2) for details. Must be * page-aligned. * @len: The length of the mapping. Will be page-aligned and must be at least 1 * page in size. * @prot: Protection bits describing access required to the mapping. See mmap * (2) for details. * @flags: Flags specifying how the mapping should be performed, see mmap (2) * for details. * @vm_flags: VMA flags which should be set by default, or 0 otherwise. * @pgoff: Page offset into the @file if file-backed, should be 0 otherwise. * @populate: A pointer to a value which will be set to 0 if no population of * the range is required, or the number of bytes to populate if it is. Must be * non-NULL. See mmap (2) for details as to under what circumstances population * of the range occurs. * @uf: An optional pointer to a list head to track userfaultfd unmap events * should unmapping events arise. If provided, it is up to the caller to manage * this. * * Returns: Either an error, or the address at which the requested mapping has * been performed. */ unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf) { struct mm_struct *mm = current->mm; int pkey = 0; *populate = 0; mmap_assert_write_locked(mm); if (!len) return -EINVAL; /* * Does the application expect PROT_READ to imply PROT_EXEC? * * (the exception is when the underlying filesystem is noexec * mounted, in which case we don't add PROT_EXEC.) */ if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) if (!(file && path_noexec(&file->f_path))) prot |= PROT_EXEC; /* force arch specific MAP_FIXED handling in get_unmapped_area */ if (flags & MAP_FIXED_NOREPLACE) flags |= MAP_FIXED; if (!(flags & MAP_FIXED)) addr = round_hint_to_min(addr); /* Careful about overflows.. */ len = PAGE_ALIGN(len); if (!len) return -ENOMEM; /* offset overflow? */ if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) return -EOVERFLOW; /* Too many mappings? */ if (mm->map_count > get_sysctl_max_map_count()) return -ENOMEM; /* * addr is returned from get_unmapped_area, * There are two cases: * 1> MAP_FIXED == false * unallocated memory, no need to check sealing. * 1> MAP_FIXED == true * sealing is checked inside mmap_region when * do_vmi_munmap is called. */ if (prot == PROT_EXEC) { pkey = execute_only_pkey(mm); if (pkey < 0) pkey = 0; } /* Do simple checking here so the lower-level routines won't have * to. we assume access permissions have been handled by the open * of the memory object, so we don't do any here. */ vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(file, flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; /* Obtain the address to map to. we verify (or select) it and ensure * that it represents a valid section of the address space. 
*/ addr = __get_unmapped_area(file, addr, len, pgoff, flags, vm_flags); if (IS_ERR_VALUE(addr)) return addr; if (flags & MAP_FIXED_NOREPLACE) { if (find_vma_intersection(mm, addr, addr + len)) return -EEXIST; } if (flags & MAP_LOCKED) if (!can_do_mlock()) return -EPERM; if (!mlock_future_ok(mm, vm_flags & VM_LOCKED, len)) return -EAGAIN; if (file) { struct inode *inode = file_inode(file); unsigned long flags_mask; int err; if (!file_mmap_ok(file, inode, pgoff, len)) return -EOVERFLOW; flags_mask = LEGACY_MAP_MASK; if (file->f_op->fop_flags & FOP_MMAP_SYNC) flags_mask |= MAP_SYNC; switch (flags & MAP_TYPE) { case MAP_SHARED: /* * Force use of MAP_SHARED_VALIDATE with non-legacy * flags. E.g. MAP_SYNC is dangerous to use with * MAP_SHARED as you don't know which consistency model * you will get. We silently ignore unsupported flags * with MAP_SHARED to preserve backward compatibility. */ flags &= LEGACY_MAP_MASK; fallthrough; case MAP_SHARED_VALIDATE: if (flags & ~flags_mask) return -EOPNOTSUPP; if (prot & PROT_WRITE) { if (!(file->f_mode & FMODE_WRITE)) return -EACCES; if (IS_SWAPFILE(file->f_mapping->host)) return -ETXTBSY; } /* * Make sure we don't allow writing to an append-only * file.. */ if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) return -EACCES; vm_flags |= VM_SHARED | VM_MAYSHARE; if (!(file->f_mode & FMODE_WRITE)) vm_flags &= ~(VM_MAYWRITE | VM_SHARED); fallthrough; case MAP_PRIVATE: if (!(file->f_mode & FMODE_READ)) return -EACCES; if (path_noexec(&file->f_path)) { if (vm_flags & VM_EXEC) return -EPERM; vm_flags &= ~VM_MAYEXEC; } if (!can_mmap_file(file)) return -ENODEV; if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) return -EINVAL; break; default: return -EINVAL; } /* * Check to see if we are violating any seals and update VMA * flags if necessary to avoid future seal violations. */ err = memfd_check_seals_mmap(file, &vm_flags); if (err) return (unsigned long)err; } else { switch (flags & MAP_TYPE) { case MAP_SHARED: if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) return -EINVAL; /* * Ignore pgoff. */ pgoff = 0; vm_flags |= VM_SHARED | VM_MAYSHARE; break; case MAP_DROPPABLE: if (VM_DROPPABLE == VM_NONE) return -ENOTSUPP; /* * A locked or stack area makes no sense to be droppable. * * Also, since droppable pages can just go away at any time * it makes no sense to copy them on fork or dump them. * * And don't attempt to combine with hugetlb for now. */ if (flags & (MAP_LOCKED | MAP_HUGETLB)) return -EINVAL; if (vm_flags & (VM_GROWSDOWN | VM_GROWSUP)) return -EINVAL; vm_flags |= VM_DROPPABLE; /* * If the pages can be dropped, then it doesn't make * sense to reserve them. */ vm_flags |= VM_NORESERVE; /* * Likewise, they're volatile enough that they * shouldn't survive forks or coredumps. */ vm_flags |= VM_WIPEONFORK | VM_DONTDUMP; fallthrough; case MAP_PRIVATE: /* * Set pgoff according to addr for anon_vma. */ pgoff = addr >> PAGE_SHIFT; break; default: return -EINVAL; } } /* * Set 'VM_NORESERVE' if we should not account for the * memory use of this mapping. 
*/ if (flags & MAP_NORESERVE) { /* We honor MAP_NORESERVE if allowed to overcommit */ if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) vm_flags |= VM_NORESERVE; /* hugetlb applies strict overcommit unless MAP_NORESERVE */ if (file && is_file_hugepages(file)) vm_flags |= VM_NORESERVE; } addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && ((vm_flags & VM_LOCKED) || (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) *populate = len; return addr; } unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) { struct file *file = NULL; unsigned long retval; if (!(flags & MAP_ANONYMOUS)) { audit_mmap_fd(fd, flags); file = fget(fd); if (!file) return -EBADF; if (is_file_hugepages(file)) { len = ALIGN(len, huge_page_size(hstate_file(file))); } else if (unlikely(flags & MAP_HUGETLB)) { retval = -EINVAL; goto out_fput; } } else if (flags & MAP_HUGETLB) { struct hstate *hs; hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); if (!hs) return -EINVAL; len = ALIGN(len, huge_page_size(hs)); /* * VM_NORESERVE is used because the reservations will be * taken when vm_ops->mmap() is called */ file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, mk_vma_flags(VMA_NORESERVE_BIT), HUGETLB_ANONHUGE_INODE, (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); if (IS_ERR(file)) return PTR_ERR(file); } retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); out_fput: if (file) fput(file); return retval; } SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, pgoff) { return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); } #ifdef __ARCH_WANT_SYS_OLD_MMAP struct mmap_arg_struct { unsigned long addr; unsigned long len; unsigned long prot; unsigned long flags; unsigned long fd; unsigned long offset; }; SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) { struct mmap_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; if (offset_in_page(a.offset)) return -EINVAL; return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); } #endif /* __ARCH_WANT_SYS_OLD_MMAP */ /* * Determine if the allocation needs to ensure that there is no * existing mapping within it's guard gaps, for use as start_gap. */ static inline unsigned long stack_guard_placement(vm_flags_t vm_flags) { if (vm_flags & VM_SHADOW_STACK) return PAGE_SIZE; return 0; } /* * Search for an unmapped address range. * * We are looking for a range that: * - does not intersect with any VMA; * - is contained within the [low_limit, high_limit) interval; * - is at least the desired size. * - satisfies (begin_addr & align_mask) == (align_offset & align_mask) */ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) { unsigned long addr; if (info->flags & VM_UNMAPPED_AREA_TOPDOWN) addr = unmapped_area_topdown(info); else addr = unmapped_area(info); trace_vm_unmapped_area(addr, info); return addr; } /* Get an address range which is currently unmapped. * For shmat() with addr=0. * * Ugly calling convention alert: * Return value with the low bits set means error value, * ie * if (ret & ~PAGE_MASK) * error = ret; * * This function "knows" that -ENOMEM has the bits set. 
*/ unsigned long generic_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct vm_unmapped_area_info info = {}; const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); if (len > mmap_end - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) return addr; if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma_prev(mm, addr, &prev); if (mmap_end - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vm_start_gap(vma)) && (!prev || addr >= vm_end_gap(prev))) return addr; } info.length = len; info.low_limit = mm->mmap_base; info.high_limit = mmap_end; info.start_gap = stack_guard_placement(vm_flags); if (filp && is_file_hugepages(filp)) info.align_mask = huge_page_mask_align(filp); return vm_unmapped_area(&info); } #ifndef HAVE_ARCH_UNMAPPED_AREA unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { return generic_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); } #endif /* * This mmap-allocator allocates new areas top-down from below the * stack's low limit (the base): */ unsigned long generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct vm_area_struct *vma, *prev; struct mm_struct *mm = current->mm; struct vm_unmapped_area_info info = {}; const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); /* requested length too big for entire address space */ if (len > mmap_end - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) return addr; /* requesting a specific address */ if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma_prev(mm, addr, &prev); if (mmap_end - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vm_start_gap(vma)) && (!prev || addr >= vm_end_gap(prev))) return addr; } info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = arch_get_mmap_base(addr, mm->mmap_base); info.start_gap = stack_guard_placement(vm_flags); if (filp && is_file_hugepages(filp)) info.align_mask = huge_page_mask_align(filp); addr = vm_unmapped_area(&info); /* * A failed mmap() very likely causes application failure, * so fall back to the bottom-up function here. This scenario * can happen with large stack limits and large mmap() * allocations. 
*/ if (offset_in_page(addr)) { VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; info.high_limit = mmap_end; addr = vm_unmapped_area(&info); } return addr; } #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { return generic_get_unmapped_area_topdown(filp, addr, len, pgoff, flags, vm_flags); } #endif unsigned long mm_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { if (mm_flags_test(MMF_TOPDOWN, current->mm)) return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags, vm_flags); return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); } unsigned long __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { unsigned long (*get_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long) = NULL; unsigned long error = arch_mmap_check(addr, len, flags); if (error) return error; /* Careful about overflows.. */ if (len > TASK_SIZE) return -ENOMEM; if (file) { if (file->f_op->get_unmapped_area) get_area = file->f_op->get_unmapped_area; } else if (flags & MAP_SHARED) { /* * mmap_region() will call shmem_zero_setup() to create a file, * so use shmem's get_unmapped_area in case it can be huge. */ get_area = shmem_get_unmapped_area; } /* Always treat pgoff as zero for anonymous memory. */ if (!file) pgoff = 0; if (get_area) { addr = get_area(file, addr, len, pgoff, flags); } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && !file && !addr /* no hint */ && IS_ALIGNED(len, PMD_SIZE)) { /* Ensures that larger anonymous mappings are THP aligned. */ addr = thp_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, vm_flags); } else { addr = mm_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, vm_flags); } if (IS_ERR_VALUE(addr)) return addr; if (addr > TASK_SIZE - len) return -ENOMEM; if (offset_in_page(addr)) return -EINVAL; error = security_mmap_addr(addr); return error ? error : addr; } unsigned long mm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { return mm_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, 0); } EXPORT_SYMBOL(mm_get_unmapped_area); /** * find_vma_intersection() - Look up the first VMA which intersects the interval * @mm: The process address space. * @start_addr: The inclusive start user address. * @end_addr: The exclusive end user address. * * Returns: The first VMA within the provided range, %NULL otherwise. Assumes * start_addr < end_addr. */ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, unsigned long start_addr, unsigned long end_addr) { unsigned long index = start_addr; mmap_assert_locked(mm); return mt_find(&mm->mm_mt, &index, end_addr - 1); } EXPORT_SYMBOL(find_vma_intersection); /** * find_vma() - Find the VMA for a given address, or the next VMA. * @mm: The mm_struct to check * @addr: The address * * Returns: The VMA associated with addr, or the next VMA. * May return %NULL in the case of no VMA at addr or above. 
*/ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { unsigned long index = addr; mmap_assert_locked(mm); return mt_find(&mm->mm_mt, &index, ULONG_MAX); } EXPORT_SYMBOL(find_vma); /** * find_vma_prev() - Find the VMA for a given address, or the next vma and * set %pprev to the previous VMA, if any. * @mm: The mm_struct to check * @addr: The address * @pprev: The pointer to set to the previous VMA * * Note that RCU lock is missing here since the external mmap_lock() is used * instead. * * Returns: The VMA associated with @addr, or the next vma. * May return %NULL in the case of no vma at addr or above. */ struct vm_area_struct * find_vma_prev(struct mm_struct *mm, unsigned long addr, struct vm_area_struct **pprev) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, addr); vma = vma_iter_load(&vmi); *pprev = vma_prev(&vmi); if (!vma) vma = vma_next(&vmi); return vma; } /* enforced gap between the expanding stack and other mappings. */ unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT; static int __init cmdline_parse_stack_guard_gap(char *p) { unsigned long val; char *endptr; val = simple_strtoul(p, &endptr, 10); if (!*endptr) stack_guard_gap = val << PAGE_SHIFT; return 1; } __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap); #ifdef CONFIG_STACK_GROWSUP int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) { return expand_upwards(vma, address); } struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma, *prev; addr &= PAGE_MASK; vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; if (!prev) return NULL; if (expand_stack_locked(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) populate_vma_page_range(prev, addr, prev->vm_end, NULL); return prev; } #else int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) { return expand_downwards(vma, address); } struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma; unsigned long start; addr &= PAGE_MASK; vma = find_vma(mm, addr); if (!vma) return NULL; if (vma->vm_start <= addr) return vma; start = vma->vm_start; if (expand_stack_locked(vma, addr)) return NULL; if (vma->vm_flags & VM_LOCKED) populate_vma_page_range(vma, addr, start, NULL); return vma; } #endif #if defined(CONFIG_STACK_GROWSUP) #define vma_expand_up(vma,addr) expand_upwards(vma, addr) #define vma_expand_down(vma, addr) (-EFAULT) #else #define vma_expand_up(vma,addr) (-EFAULT) #define vma_expand_down(vma, addr) expand_downwards(vma, addr) #endif /* * expand_stack(): legacy interface for page faulting. Don't use unless * you have to. * * This is called with the mm locked for reading, drops the lock, takes * the lock for writing, tries to look up a vma again, expands it if * necessary, and downgrades the lock to reading again. * * If no vma is found or it can't be expanded, it returns NULL and has * dropped the lock. 
*/ struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma, *prev; mmap_read_unlock(mm); if (mmap_write_lock_killable(mm)) return NULL; vma = find_vma_prev(mm, addr, &prev); if (vma && vma->vm_start <= addr) goto success; if (prev && !vma_expand_up(prev, addr)) { vma = prev; goto success; } if (vma && !vma_expand_down(vma, addr)) goto success; mmap_write_unlock(mm); return NULL; success: mmap_write_downgrade(mm); return vma; } /* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls. * @mm: The mm_struct * @start: The start address to munmap * @len: The length to be munmapped. * @uf: The userfaultfd list_head * * Return: 0 on success, error otherwise. */ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { VMA_ITERATOR(vmi, mm, start); return do_vmi_munmap(&vmi, mm, start, len, uf, false); } int vm_munmap(unsigned long start, size_t len) { return __vm_munmap(start, len, false); } EXPORT_SYMBOL(vm_munmap); SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { addr = untagged_addr(addr); return __vm_munmap(addr, len, true); } /* * Emulation of deprecated remap_file_pages() syscall. */ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, unsigned long, prot, unsigned long, pgoff, unsigned long, flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long populate = 0; unsigned long ret = -EINVAL; struct file *file; vm_flags_t vm_flags; pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n", current->comm, current->pid); if (prot) return ret; start = start & PAGE_MASK; size = size & PAGE_MASK; if (start + size <= start) return ret; /* Does pgoff wrap? */ if (pgoff + (size >> PAGE_SHIFT) < pgoff) return ret; if (mmap_read_lock_killable(mm)) return -EINTR; /* * Look up VMA under read lock first so we can perform the security * without holding locks (which can be problematic). We reacquire a * write lock later and check nothing changed underneath us. */ vma = vma_lookup(mm, start); if (!vma || !(vma->vm_flags & VM_SHARED)) { mmap_read_unlock(mm); return -EINVAL; } prot |= vma->vm_flags & VM_READ ? PROT_READ : 0; prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0; prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0; flags &= MAP_NONBLOCK; flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; if (vma->vm_flags & VM_LOCKED) flags |= MAP_LOCKED; /* Save vm_flags used to calculate prot and flags, and recheck later. */ vm_flags = vma->vm_flags; file = get_file(vma->vm_file); mmap_read_unlock(mm); /* Call outside mmap_lock to be consistent with other callers. */ ret = security_mmap_file(file, prot, flags); if (ret) { fput(file); return ret; } ret = -EINVAL; /* OK security check passed, take write lock + let it rip. */ if (mmap_write_lock_killable(mm)) { fput(file); return -EINTR; } vma = vma_lookup(mm, start); if (!vma) goto out; /* Make sure things didn't change under us. */ if (vma->vm_flags != vm_flags) goto out; if (vma->vm_file != file) goto out; if (start + size > vma->vm_end) { VMA_ITERATOR(vmi, mm, vma->vm_end); struct vm_area_struct *next, *prev = vma; for_each_vma_range(vmi, next, start + size) { /* hole between vmas ? 
*/ if (next->vm_start != prev->vm_end) goto out; if (next->vm_file != vma->vm_file) goto out; if (next->vm_flags != vma->vm_flags) goto out; if (start + size <= next->vm_end) break; prev = next; } if (!next) goto out; } ret = do_mmap(vma->vm_file, start, size, prot, flags, 0, pgoff, &populate, NULL); out: mmap_write_unlock(mm); fput(file); if (populate) mm_populate(ret, populate); if (!IS_ERR_VALUE(ret)) ret = 0; return ret; } int vm_brk_flags(unsigned long addr, unsigned long request, bool is_exec) { const vma_flags_t vma_flags = is_exec ? mk_vma_flags(VMA_EXEC_BIT) : EMPTY_VMA_FLAGS; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; unsigned long len; int ret; bool populate; LIST_HEAD(uf); VMA_ITERATOR(vmi, mm, addr); len = PAGE_ALIGN(request); if (len < request) return -ENOMEM; if (!len) return 0; if (mmap_write_lock_killable(mm)) return -EINTR; ret = check_brk_limits(addr, len); if (ret) goto limits_failed; ret = do_vmi_munmap(&vmi, mm, addr, len, &uf, 0); if (ret) goto munmap_failed; vma = vma_prev(&vmi); ret = do_brk_flags(&vmi, vma, addr, len, vma_flags); populate = ((mm->def_flags & VM_LOCKED) != 0); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); if (populate && !ret) mm_populate(addr, len); return ret; munmap_failed: limits_failed: mmap_write_unlock(mm); return ret; } static unsigned long tear_down_vmas(struct mm_struct *mm, struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long end) { unsigned long nr_accounted = 0; int count = 0; mmap_assert_write_locked(mm); vma_iter_set(vmi, vma->vm_end); do { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); vma_mark_detached(vma); remove_vma(vma); count++; cond_resched(); vma = vma_next(vmi); } while (vma && vma->vm_end <= end); VM_WARN_ON_ONCE(count != mm->map_count); return nr_accounted; } /* Release all mmaps. */ void exit_mmap(struct mm_struct *mm) { struct mmu_gather tlb; struct vm_area_struct *vma; unsigned long nr_accounted = 0; VMA_ITERATOR(vmi, mm, 0); struct unmap_desc unmap; /* mm's last user has gone, and its about to be pulled down */ mmu_notifier_release(mm); mmap_read_lock(mm); arch_exit_mmap(mm); vma = vma_next(&vmi); if (!vma) { /* Can happen if dup_mmap() received an OOM */ mmap_read_unlock(mm); mmap_write_lock(mm); goto destroy; } unmap_all_init(&unmap, &vmi, vma); flush_cache_mm(mm); tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, &unmap); mmap_read_unlock(mm); /* * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper * because the memory has been already freed. */ mm_flags_set(MMF_OOM_SKIP, mm); mmap_write_lock(mm); unmap.mm_wr_locked = true; mt_clear_in_rcu(&mm->mm_mt); unmap_pgtable_init(&unmap, &vmi); free_pgtables(&tlb, &unmap); tlb_finish_mmu(&tlb); /* * Walk the list again, actually closing and freeing it, with preemption * enabled, without holding any MM locks besides the unreachable * mmap_write_lock. 
*/ nr_accounted = tear_down_vmas(mm, &vmi, vma, ULONG_MAX); destroy: __mt_destroy(&mm->mm_mt); trace_exit_mmap(mm); mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); } /* * Return true if the calling process may expand its vm space by the passed * number of pages */ bool may_expand_vm(struct mm_struct *mm, const vma_flags_t *vma_flags, unsigned long npages) { if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT) return false; if (is_data_mapping_vma_flags(vma_flags) && mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) { /* Workaround for Valgrind */ if (rlimit(RLIMIT_DATA) == 0 && mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT) return true; pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits%s.\n", current->comm, current->pid, (mm->data_vm + npages) << PAGE_SHIFT, rlimit(RLIMIT_DATA), ignore_rlimit_data ? "" : " or use boot option ignore_rlimit_data"); if (!ignore_rlimit_data) return false; } return true; } void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) { WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages); if (is_exec_mapping(flags)) mm->exec_vm += npages; else if (is_stack_mapping(flags)) mm->stack_vm += npages; else if (is_data_mapping(flags)) mm->data_vm += npages; } static vm_fault_t special_mapping_fault(struct vm_fault *vmf); /* * Close hook, called for unmap() and on the old vma for mremap(). * * Having a close hook prevents vma merging regardless of flags. */ static void special_mapping_close(struct vm_area_struct *vma) { const struct vm_special_mapping *sm = vma->vm_private_data; if (sm->close) sm->close(sm, vma); } static const char *special_mapping_name(struct vm_area_struct *vma) { return ((struct vm_special_mapping *)vma->vm_private_data)->name; } static int special_mapping_mremap(struct vm_area_struct *new_vma) { struct vm_special_mapping *sm = new_vma->vm_private_data; if (WARN_ON_ONCE(current->mm != new_vma->vm_mm)) return -EFAULT; if (sm->mremap) return sm->mremap(sm, new_vma); return 0; } static int special_mapping_split(struct vm_area_struct *vma, unsigned long addr) { /* * Forbid splitting special mappings - kernel has expectations over * the number of pages in mapping. Together with VM_DONTEXPAND * the size of vma should stay the same over the special mapping's * lifetime. 
*/ return -EINVAL; } static const struct vm_operations_struct special_mapping_vmops = { .close = special_mapping_close, .fault = special_mapping_fault, .mremap = special_mapping_mremap, .name = special_mapping_name, /* vDSO code relies that VVAR can't be accessed remotely */ .access = NULL, .may_split = special_mapping_split, }; static vm_fault_t special_mapping_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; pgoff_t pgoff; struct page **pages; struct vm_special_mapping *sm = vma->vm_private_data; if (sm->fault) return sm->fault(sm, vmf->vma, vmf); pages = sm->pages; for (pgoff = vmf->pgoff; pgoff && *pages; ++pages) pgoff--; if (*pages) { struct page *page = *pages; get_page(page); vmf->page = page; return 0; } return VM_FAULT_SIGBUS; } static struct vm_area_struct *__install_special_mapping( struct mm_struct *mm, unsigned long addr, unsigned long len, vm_flags_t vm_flags, void *priv, const struct vm_operations_struct *ops) { int ret; struct vm_area_struct *vma; vma = vm_area_alloc(mm); if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); vma_set_range(vma, addr, addr + len, 0); vm_flags |= mm->def_flags | VM_DONTEXPAND; if (pgtable_supports_soft_dirty()) vm_flags |= VM_SOFTDIRTY; vm_flags_init(vma, vm_flags & ~VM_LOCKED_MASK); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); vma->vm_ops = ops; vma->vm_private_data = priv; ret = insert_vm_struct(mm, vma); if (ret) goto out; vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT); perf_event_mmap(vma); return vma; out: vm_area_free(vma); return ERR_PTR(ret); } bool vma_is_special_mapping(const struct vm_area_struct *vma, const struct vm_special_mapping *sm) { return vma->vm_private_data == sm && vma->vm_ops == &special_mapping_vmops; } /* * Called with mm->mmap_lock held for writing. * Insert a new vma covering the given region, with the given flags. * Its pages are supplied by the given array of struct page *. * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. * The region past the last page supplied will always produce SIGBUS. * The array pointer and the pages it points to are assumed to stay alive * for as long as this mapping might exist. 
*/ struct vm_area_struct *_install_special_mapping( struct mm_struct *mm, unsigned long addr, unsigned long len, vm_flags_t vm_flags, const struct vm_special_mapping *spec) { return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec, &special_mapping_vmops); } #ifdef CONFIG_SYSCTL #if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) int sysctl_legacy_va_layout; #endif static const struct ctl_table mmap_table[] = { { .procname = "max_map_count", .data = &sysctl_max_map_count, .maxlen = sizeof(sysctl_max_map_count), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, #if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) { .procname = "legacy_va_layout", .data = &sysctl_legacy_va_layout, .maxlen = sizeof(sysctl_legacy_va_layout), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS { .procname = "mmap_rnd_bits", .data = &mmap_rnd_bits, .maxlen = sizeof(mmap_rnd_bits), .mode = 0600, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)&mmap_rnd_bits_min, .extra2 = (void *)&mmap_rnd_bits_max, }, #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS { .procname = "mmap_rnd_compat_bits", .data = &mmap_rnd_compat_bits, .maxlen = sizeof(mmap_rnd_compat_bits), .mode = 0600, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)&mmap_rnd_compat_bits_min, .extra2 = (void *)&mmap_rnd_compat_bits_max, }, #endif }; #endif /* CONFIG_SYSCTL */ /* * initialise the percpu counter for VM, initialise VMA state. */ void __init mmap_init(void) { int ret; ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); #ifdef CONFIG_SYSCTL register_sysctl_init("vm", mmap_table); #endif vma_state_init(); } /* * Initialise sysctl_user_reserve_kbytes. * * This is intended to prevent a user from starting a single memory hogging * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER * mode. * * The default value is min(3% of free memory, 128MB) * 128MB is enough to recover with sshd/login, bash, and top/kill. */ static int init_user_reserve(void) { unsigned long free_kbytes; free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); sysctl_user_reserve_kbytes = min(free_kbytes / 32, SZ_128K); return 0; } subsys_initcall(init_user_reserve); /* * Initialise sysctl_admin_reserve_kbytes. * * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin * to log in and kill a memory hogging process. * * Systems with more than 256MB will reserve 8MB, enough to recover * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will * only reserve 3% of free pages by default. */ static int init_admin_reserve(void) { unsigned long free_kbytes; free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); sysctl_admin_reserve_kbytes = min(free_kbytes / 32, SZ_8K); return 0; } subsys_initcall(init_admin_reserve); /* * Reinititalise user and admin reserves if memory is added or removed. * * The default user reserve max is 128MB, and the default max for the * admin reserve is 8MB. These are usually, but not always, enough to * enable recovery from a memory hogging process using login/sshd, a shell, * and tools like top. It may make sense to increase or even disable the * reserve depending on the existence of swap or variations in the recovery * tools. So, the admin may have changed them. 
* * If memory is added and the reserves have been eliminated or increased above * the default max, then we'll trust the admin. * * If memory is removed and there isn't enough free memory, then we * need to reset the reserves. * * Otherwise keep the reserve set by the admin. */ static int reserve_mem_notifier(struct notifier_block *nb, unsigned long action, void *data) { unsigned long tmp, free_kbytes; switch (action) { case MEM_ONLINE: /* Default max is 128MB. Leave alone if modified by operator. */ tmp = sysctl_user_reserve_kbytes; if (tmp > 0 && tmp < SZ_128K) init_user_reserve(); /* Default max is 8MB. Leave alone if modified by operator. */ tmp = sysctl_admin_reserve_kbytes; if (tmp > 0 && tmp < SZ_8K) init_admin_reserve(); break; case MEM_OFFLINE: free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); if (sysctl_user_reserve_kbytes > free_kbytes) { init_user_reserve(); pr_info("vm.user_reserve_kbytes reset to %lu\n", sysctl_user_reserve_kbytes); } if (sysctl_admin_reserve_kbytes > free_kbytes) { init_admin_reserve(); pr_info("vm.admin_reserve_kbytes reset to %lu\n", sysctl_admin_reserve_kbytes); } break; default: break; } return NOTIFY_OK; } static int __meminit init_reserve_notifier(void) { if (hotplug_memory_notifier(reserve_mem_notifier, DEFAULT_CALLBACK_PRI)) pr_err("Failed registering memory add/remove notifier for admin reserve\n"); return 0; } subsys_initcall(init_reserve_notifier); /* * Obtain a read lock on mm->mmap_lock, if the specified address is below the * start of the VMA, the intent is to perform a write, and it is a * downward-growing stack, then attempt to expand the stack to contain it. * * This function is intended only for obtaining an argument page from an ELF * image, and is almost certainly NOT what you want to use for any other * purpose. * * IMPORTANT - VMA fields are accessed without an mmap lock being held, so the * VMA referenced must not be linked in any user-visible tree, i.e. it must be a * new VMA being mapped. * * The function assumes that addr is either contained within the VMA or below * it, and makes no attempt to validate this value beyond that. * * Returns true if the read lock was obtained and a stack was perhaps expanded, * false if the stack expansion failed. * * On stack expansion the function temporarily acquires an mmap write lock * before downgrading it. */ bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *new_vma, unsigned long addr, bool write) { if (!write || addr >= new_vma->vm_start) { mmap_read_lock(mm); return true; } if (!(new_vma->vm_flags & VM_GROWSDOWN)) return false; mmap_write_lock(mm); if (expand_downwards(new_vma, addr)) { mmap_write_unlock(mm); return false; } mmap_write_downgrade(mm); return true; } __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { struct vm_area_struct *mpnt, *tmp; int retval; unsigned long charge = 0; LIST_HEAD(uf); VMA_ITERATOR(vmi, mm, 0); if (mmap_write_lock_killable(oldmm)) return -EINTR; flush_cache_dup_mm(oldmm); uprobe_dup_mmap(oldmm, mm); /* * Not linked in yet - no deadlock potential: */ mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); /* No ordering required: file already has been exposed. */ dup_mm_exe_file(mm, oldmm); mm->total_vm = oldmm->total_vm; mm->data_vm = oldmm->data_vm; mm->exec_vm = oldmm->exec_vm; mm->stack_vm = oldmm->stack_vm; /* Use __mt_dup() to efficiently build an identical maple tree. 
*/ retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); if (unlikely(retval)) goto out; mt_clear_in_rcu(vmi.mas.tree); for_each_vma(vmi, mpnt) { struct file *file; retval = vma_start_write_killable(mpnt); if (retval < 0) goto loop_out; if (mpnt->vm_flags & VM_DONTCOPY) { retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start, mpnt->vm_end, GFP_KERNEL); if (retval) goto loop_out; vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); continue; } charge = 0; if (mpnt->vm_flags & VM_ACCOUNT) { unsigned long len = vma_pages(mpnt); if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ goto fail_nomem; charge = len; } tmp = vm_area_dup(mpnt); if (!tmp) goto fail_nomem; retval = vma_dup_policy(mpnt, tmp); if (retval) goto fail_nomem_policy; tmp->vm_mm = mm; retval = dup_userfaultfd(tmp, &uf); if (retval) goto fail_nomem_anon_vma_fork; if (tmp->vm_flags & VM_WIPEONFORK) { /* * VM_WIPEONFORK gets a clean slate in the child. * Don't prepare anon_vma until fault since we don't * copy page for current vma. */ tmp->anon_vma = NULL; } else if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; vm_flags_clear(tmp, VM_LOCKED_MASK); /* * Copy/update hugetlb private vma information. */ if (is_vm_hugetlb_page(tmp)) hugetlb_dup_vma_private(tmp); /* * Link the vma into the MT. After using __mt_dup(), memory * allocation is not necessary here, so it cannot fail. */ vma_iter_bulk_store(&vmi, tmp); mm->map_count++; if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); file = tmp->vm_file; if (file) { struct address_space *mapping = file->f_mapping; get_file(file); i_mmap_lock_write(mapping); if (vma_is_shared_maywrite(tmp)) mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ vma_interval_tree_insert_after(tmp, mpnt, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); i_mmap_unlock_write(mapping); } if (!(tmp->vm_flags & VM_WIPEONFORK)) retval = copy_page_range(tmp, mpnt); if (retval) { mpnt = vma_next(&vmi); goto loop_out; } } /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); loop_out: vma_iter_free(&vmi); if (!retval) { mt_set_in_rcu(vmi.mas.tree); ksm_fork(mm, oldmm); khugepaged_fork(mm, oldmm); } else { unsigned long end; /* * The entire maple tree has already been duplicated, but * replacing the vmas failed at mpnt (which could be NULL if * all were allocated but the last vma was not fully set up). * Use the start address of the failure point to clean up the * partially initialized tree. */ if (!mm->map_count) { /* zero vmas were written to the new tree. */ end = 0; } else if (mpnt) { /* partial tree failure */ end = mpnt->vm_start; } else { /* All vmas were written to the new tree */ end = ULONG_MAX; } /* Hide mm from oom killer because the memory is being freed */ mm_flags_set(MMF_OOM_SKIP, mm); if (end) { vma_iter_set(&vmi, 0); tmp = vma_next(&vmi); UNMAP_STATE(unmap, &vmi, /* first = */ tmp, /* vma_start = */ 0, /* vma_end = */ end, /* prev = */ NULL, /* next = */ NULL); /* * Don't iterate over vmas beyond the failure point for * both unmap_vma() and free_pgtables(). */ unmap.tree_end = end; flush_cache_mm(mm); unmap_region(&unmap); charge = tear_down_vmas(mm, &vmi, tmp, end); vm_unacct_memory(charge); } __mt_destroy(&mm->mm_mt); /* * The mm_struct is going to exit, but the locks will be dropped * first. Set the mm_struct as unstable is advisable as it is * not fully initialised. 
*/ mm_flags_set(MMF_UNSTABLE, mm); } out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); mmap_write_unlock(oldmm); if (!retval) dup_userfaultfd_complete(&uf); else dup_userfaultfd_fail(&uf); return retval; fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); fail_nomem_policy: vm_area_free(tmp); fail_nomem: retval = -ENOMEM; vm_unacct_memory(charge); goto loop_out; } |
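The kernel-doc for do_mmap() above notes that callers wanting ordinary userland mmap() behaviour should call vm_mmap(), which is exported for module use and takes the mmap write lock itself. The fragment below is an illustrative sketch only, not part of mmap.c; the helper name is invented for the example. It maps an anonymous, readable and writable region into the current process.

#include <linux/err.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/printk.h>

/* Illustrative helper: map @len bytes of anonymous memory into current->mm. */
static unsigned long example_map_anon(unsigned long len)
{
        unsigned long addr;

        /* No backing file, no hint address; vm_mmap() handles the locking. */
        addr = vm_mmap(NULL, 0, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, 0);

        /* On failure the return value encodes a negative errno. */
        if (IS_ERR_VALUE(addr))
                pr_err("anonymous mapping failed: %ld\n", (long)addr);

        return addr;
}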
// SPDX-License-Identifier: GPL-2.0+
/*
 * Fast-charge control for Apple "MFi" devices
 *
 * Copyright (C) 2019 Bastien Nocera <hadess@hadess.net>
 */

/* Standard include files */
#include <linux/module.h>
#include <linux/power_supply.h>
#include <linux/slab.h>
#include <linux/usb.h>

MODULE_AUTHOR("Bastien Nocera <hadess@hadess.net>");
MODULE_DESCRIPTION("Fast-charge control for Apple \"MFi\" devices");
MODULE_LICENSE("GPL");

#define TRICKLE_CURRENT_MA      0
#define FAST_CURRENT_MA         2500

#define APPLE_VENDOR_ID         0x05ac  /* Apple */

/* The product ID is defined as starting with 0x12nn, as per the
 * "Choosing an Apple Device USB Configuration" section in
 * release R9 (2012) of the "MFi Accessory Hardware Specification"
 *
 * To distinguish an Apple device, a USB host can check the device
 * descriptor of attached USB devices for the following fields:
 * ■ Vendor ID: 0x05AC
 * ■ Product ID: 0x12nn
 *
 * Those checks will be done in .match() and .probe().
 */
static const struct usb_device_id mfi_fc_id_table[] = {
        { .idVendor = APPLE_VENDOR_ID,
          .match_flags = USB_DEVICE_ID_MATCH_VENDOR },
        {},
};

MODULE_DEVICE_TABLE(usb, mfi_fc_id_table);

/* Driver-local specific stuff */
struct mfi_device {
        struct usb_device *udev;
        struct power_supply *battery;
        struct power_supply_desc battery_desc;
        int charge_type;
};

static int apple_mfi_fc_set_charge_type(struct mfi_device *mfi,
                                        const union power_supply_propval *val)
{
        int current_ma;
        int retval;
        __u8 request_type;

        if (mfi->charge_type == val->intval) {
                dev_dbg(&mfi->udev->dev, "charge type %d already set\n",
                        mfi->charge_type);
                return 0;
        }

        switch (val->intval) {
        case POWER_SUPPLY_CHARGE_TYPE_TRICKLE:
                current_ma = TRICKLE_CURRENT_MA;
                break;
        case POWER_SUPPLY_CHARGE_TYPE_FAST:
                current_ma = FAST_CURRENT_MA;
                break;
        default:
                return -EINVAL;
        }

        request_type = USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE;
        retval = usb_control_msg(mfi->udev,
                                 usb_sndctrlpipe(mfi->udev, 0),
                                 0x40, /* Vendor‐defined power request */
                                 request_type,
                                 current_ma, /* wValue, current offset */
                                 current_ma, /* wIndex, current offset */
                                 NULL, 0, USB_CTRL_GET_TIMEOUT);
        if (retval) {
                dev_dbg(&mfi->udev->dev, "retval = %d\n", retval);
                return retval;
        }

        mfi->charge_type = val->intval;

        return 0;
}

static int apple_mfi_fc_get_property(struct power_supply *psy,
                enum power_supply_property psp,
                union power_supply_propval *val)
{
        struct mfi_device *mfi = power_supply_get_drvdata(psy);

        dev_dbg(&mfi->udev->dev, "prop: %d\n", psp);

        switch (psp) {
        case POWER_SUPPLY_PROP_CHARGE_TYPE:
                val->intval = mfi->charge_type;
                break;
        case POWER_SUPPLY_PROP_SCOPE:
                val->intval = POWER_SUPPLY_SCOPE_DEVICE;
                break;
        default:
                return -ENODATA;
        }

        return 0;
}

static int apple_mfi_fc_set_property(struct power_supply *psy,
                enum power_supply_property psp,
                const union power_supply_propval *val)
{
        struct mfi_device *mfi = power_supply_get_drvdata(psy);
        int ret;

        dev_dbg(&mfi->udev->dev, "prop: %d\n", psp);

        ret = pm_runtime_get_sync(&mfi->udev->dev);
        if (ret < 0) {
                pm_runtime_put_noidle(&mfi->udev->dev);
                return ret;
        }

        switch (psp) {
        case POWER_SUPPLY_PROP_CHARGE_TYPE:
                ret = apple_mfi_fc_set_charge_type(mfi, val);
                break;
        default:
                ret = -EINVAL;
        }

        pm_runtime_put_autosuspend(&mfi->udev->dev);

        return ret;
}

static int apple_mfi_fc_property_is_writeable(struct power_supply *psy,
                                              enum power_supply_property psp)
{
        switch (psp) {
        case POWER_SUPPLY_PROP_CHARGE_TYPE:
                return 1;
        default:
                return 0;
        }
}

static enum power_supply_property apple_mfi_fc_properties[] = {
        POWER_SUPPLY_PROP_CHARGE_TYPE,
        POWER_SUPPLY_PROP_SCOPE
};

static const struct power_supply_desc apple_mfi_fc_desc = {
        .name                   = "apple_mfi_fastcharge",
        .type                   = POWER_SUPPLY_TYPE_BATTERY,
        .properties             = apple_mfi_fc_properties,
        .num_properties         = ARRAY_SIZE(apple_mfi_fc_properties),
        .get_property           = apple_mfi_fc_get_property,
        .set_property           = apple_mfi_fc_set_property,
        .property_is_writeable  = apple_mfi_fc_property_is_writeable
};

static bool mfi_fc_match(struct usb_device *udev)
{
        int idProduct;

        idProduct = le16_to_cpu(udev->descriptor.idProduct);
        /* See comment above mfi_fc_id_table[] */
        return (idProduct >= 0x1200 && idProduct <= 0x12ff);
}

static int mfi_fc_probe(struct usb_device *udev)
{
        struct power_supply_config battery_cfg = {};
        struct mfi_device *mfi = NULL;
        char *battery_name;
        int err;

        if (!mfi_fc_match(udev))
                return -ENODEV;

        mfi = kzalloc_obj(struct mfi_device);
        if (!mfi)
                return -ENOMEM;

        battery_name = kasprintf(GFP_KERNEL, "apple_mfi_fastcharge_%d-%d",
                                 udev->bus->busnum, udev->devnum);
        if (!battery_name) {
                err = -ENOMEM;
                goto err_free_mfi;
        }

        mfi->battery_desc = apple_mfi_fc_desc;
        mfi->battery_desc.name = battery_name;

        battery_cfg.drv_data = mfi;

        mfi->charge_type = POWER_SUPPLY_CHARGE_TYPE_TRICKLE;
        mfi->battery = power_supply_register(&udev->dev,
                                             &mfi->battery_desc,
                                             &battery_cfg);
        if (IS_ERR(mfi->battery)) {
                dev_err(&udev->dev, "Can't register battery\n");
                err = PTR_ERR(mfi->battery);
                goto err_free_name;
        }

        mfi->udev = udev;
        dev_set_drvdata(&udev->dev, mfi);

        return 0;

err_free_name:
        kfree(battery_name);
err_free_mfi:
        kfree(mfi);

        return err;
}

static void mfi_fc_disconnect(struct usb_device *udev)
{
        struct mfi_device *mfi;

        mfi = dev_get_drvdata(&udev->dev);
        if (mfi->battery)
                power_supply_unregister(mfi->battery);
        kfree(mfi->battery_desc.name);
        dev_set_drvdata(&udev->dev, NULL);
        kfree(mfi);
}

static struct usb_device_driver mfi_fc_driver = {
        .name = "apple-mfi-fastcharge",
        .probe = mfi_fc_probe,
        .disconnect = mfi_fc_disconnect,
        .id_table = mfi_fc_id_table,
        .match = mfi_fc_match,
        .generic_subclass = 1,
};

static int __init mfi_fc_driver_init(void)
{
        return usb_register_device_driver(&mfi_fc_driver, THIS_MODULE);
}

static void __exit mfi_fc_driver_exit(void)
{
        usb_deregister_device_driver(&mfi_fc_driver);
}

module_init(mfi_fc_driver_init);
module_exit(mfi_fc_driver_exit);
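In practice this driver is operated from user space: the power-supply core exposes the writable charge_type property of the registered "apple_mfi_fastcharge_%d-%d" supply under /sys/class/power_supply/, so writing "Fast" or "Trickle" to that attribute ends up in apple_mfi_fc_set_property(). The fragment below is an illustrative sketch of the equivalent in-kernel path, not part of the driver; the helper name is invented and it assumes the caller already holds a reference to the power supply.

#include <linux/power_supply.h>

/* Illustrative only: request fast charge on a previously looked-up supply. */
static int example_request_fast_charge(struct power_supply *psy)
{
        const union power_supply_propval val = {
                .intval = POWER_SUPPLY_CHARGE_TYPE_FAST,
        };

        /* Routed by the power-supply core to apple_mfi_fc_set_property(). */
        return power_supply_set_property(psy, POWER_SUPPLY_PROP_CHARGE_TYPE, &val);
}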
904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ * Copyright (C) 2021, Alibaba Cloud */ #include <linux/statfs.h> #include <linux/seq_file.h> #include <linux/crc32c.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/exportfs.h> #include <linux/backing-dev.h> #include <linux/pseudo_fs.h> #include "xattr.h" #define CREATE_TRACE_POINTS #include <trace/events/erofs.h> static struct kmem_cache *erofs_inode_cachep __read_mostly; void _erofs_printk(struct super_block *sb, const char *fmt, ...) { struct va_format vaf; va_list args; int level; va_start(args, fmt); level = printk_get_level(fmt); vaf.fmt = printk_skip_level(fmt); vaf.va = &args; if (sb) printk("%c%cerofs (device %s): %pV", KERN_SOH_ASCII, level, sb->s_id, &vaf); else printk("%c%cerofs: %pV", KERN_SOH_ASCII, level, &vaf); va_end(args); } static int erofs_superblock_csum_verify(struct super_block *sb, void *sbdata) { struct erofs_super_block *dsb = sbdata + EROFS_SUPER_OFFSET; u32 len = 1 << EROFS_SB(sb)->blkszbits, crc; if (len > EROFS_SUPER_OFFSET) len -= EROFS_SUPER_OFFSET; len -= offsetof(struct erofs_super_block, checksum) + sizeof(dsb->checksum); /* skip .magic(pre-verified) and .checksum(0) fields */ crc = crc32c(0x5045B54A, (&dsb->checksum) + 1, len); if (crc == le32_to_cpu(dsb->checksum)) return 0; erofs_err(sb, "invalid checksum 0x%08x, 0x%08x expected", crc, le32_to_cpu(dsb->checksum)); return -EBADMSG; } static void erofs_inode_init_once(void *ptr) { struct erofs_inode *vi = ptr; inode_init_once(&vi->vfs_inode); } static struct inode *erofs_alloc_inode(struct super_block *sb) { struct erofs_inode *vi = alloc_inode_sb(sb, erofs_inode_cachep, GFP_KERNEL); if (!vi) return NULL; /* zero out everything except vfs_inode */ memset(vi, 0, offsetof(struct erofs_inode, vfs_inode)); return &vi->vfs_inode; } static void erofs_free_inode(struct inode *inode) { struct erofs_inode *vi = EROFS_I(inode); if (inode->i_op == &erofs_fast_symlink_iops) kfree(inode->i_link); kfree(vi->xattr_shared_xattrs); kmem_cache_free(erofs_inode_cachep, vi); } /* read variable-sized metadata, offset will be aligned by 4-byte */ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, erofs_off_t *offset, int *lengthp) { u8 *buffer, *ptr; int len, i, cnt; *offset = round_up(*offset, 4); ptr = erofs_bread(buf, *offset, true); if (IS_ERR(ptr)) return ptr; len = le16_to_cpu(*(__le16 *)ptr); if (!len) 
len = U16_MAX + 1; buffer = kmalloc(len, GFP_KERNEL); if (!buffer) return ERR_PTR(-ENOMEM); *offset += sizeof(__le16); *lengthp = len; for (i = 0; i < len; i += cnt) { cnt = min_t(int, sb->s_blocksize - erofs_blkoff(sb, *offset), len - i); ptr = erofs_bread(buf, *offset, true); if (IS_ERR(ptr)) { kfree(buffer); return ptr; } memcpy(buffer + i, ptr, cnt); *offset += cnt; } return buffer; } static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, struct erofs_device_info *dif, erofs_off_t *pos) { struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_fscache *fscache; struct erofs_deviceslot *dis; struct file *file; bool _48bit; dis = erofs_read_metabuf(buf, sb, *pos, false); if (IS_ERR(dis)) return PTR_ERR(dis); if (!sbi->devs->flatdev && !dif->path) { if (!dis->tag[0]) { erofs_err(sb, "empty device tag @ pos %llu", *pos); return -EINVAL; } dif->path = kmemdup_nul(dis->tag, sizeof(dis->tag), GFP_KERNEL); if (!dif->path) return -ENOMEM; } if (erofs_is_fscache_mode(sb)) { fscache = erofs_fscache_register_cookie(sb, dif->path, 0); if (IS_ERR(fscache)) return PTR_ERR(fscache); dif->fscache = fscache; } else if (!sbi->devs->flatdev) { file = erofs_is_fileio_mode(sbi) ? filp_open(dif->path, O_RDONLY | O_LARGEFILE, 0) : bdev_file_open_by_path(dif->path, BLK_OPEN_READ, sb->s_type, NULL); if (IS_ERR(file)) { if (file == ERR_PTR(-ENOTBLK)) return -EINVAL; return PTR_ERR(file); } if (!erofs_is_fileio_mode(sbi)) { dif->dax_dev = fs_dax_get_by_bdev(file_bdev(file), &dif->dax_part_off, NULL, NULL); } else if (!S_ISREG(file_inode(file)->i_mode)) { fput(file); return -EINVAL; } if (!dif->dax_dev && test_opt(&sbi->opt, DAX_ALWAYS)) { erofs_info(sb, "DAX unsupported by %s. Turning off DAX.", dif->path); clear_opt(&sbi->opt, DAX_ALWAYS); } dif->file = file; } _48bit = erofs_sb_has_48bit(sbi); dif->blocks = le32_to_cpu(dis->blocks_lo) | (_48bit ? (u64)le16_to_cpu(dis->blocks_hi) << 32 : 0); dif->uniaddr = le32_to_cpu(dis->uniaddr_lo) | (_48bit ? (u64)le16_to_cpu(dis->uniaddr_hi) << 32 : 0); sbi->total_blocks += dif->blocks; *pos += EROFS_DEVT_SLOT_SIZE; return 0; } static int erofs_scan_devices(struct super_block *sb, struct erofs_super_block *dsb) { struct erofs_sb_info *sbi = EROFS_SB(sb); unsigned int ondisk_extradevs; erofs_off_t pos; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_device_info *dif; int id, err = 0; sbi->total_blocks = sbi->dif0.blocks; if (!erofs_sb_has_device_table(sbi)) ondisk_extradevs = 0; else ondisk_extradevs = le16_to_cpu(dsb->extra_devices); if (sbi->devs->extra_devices && ondisk_extradevs != sbi->devs->extra_devices) { erofs_err(sb, "extra devices don't match (ondisk %u, given %u)", ondisk_extradevs, sbi->devs->extra_devices); return -EINVAL; } if (test_opt(&sbi->opt, DAX_ALWAYS) && !sbi->dif0.dax_dev) { erofs_info(sb, "DAX unsupported by block device. 
Turning off DAX."); clear_opt(&sbi->opt, DAX_ALWAYS); } if (!ondisk_extradevs) return 0; if (!sbi->devs->extra_devices && !erofs_is_fscache_mode(sb)) sbi->devs->flatdev = true; sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1; pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE; down_read(&sbi->devs->rwsem); if (sbi->devs->extra_devices) { idr_for_each_entry(&sbi->devs->tree, dif, id) { err = erofs_init_device(&buf, sb, dif, &pos); if (err) break; } } else { for (id = 0; id < ondisk_extradevs; id++) { dif = kzalloc_obj(*dif); if (!dif) { err = -ENOMEM; break; } err = idr_alloc(&sbi->devs->tree, dif, 0, 0, GFP_KERNEL); if (err < 0) { kfree(dif); break; } ++sbi->devs->extra_devices; err = erofs_init_device(&buf, sb, dif, &pos); if (err) break; } } up_read(&sbi->devs->rwsem); erofs_put_metabuf(&buf); return err; } static int erofs_read_superblock(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_super_block *dsb; void *data; int ret; data = erofs_read_metabuf(&buf, sb, 0, false); if (IS_ERR(data)) { erofs_err(sb, "cannot read erofs superblock"); return PTR_ERR(data); } dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET); ret = -EINVAL; if (le32_to_cpu(dsb->magic) != EROFS_SUPER_MAGIC_V1) { erofs_err(sb, "cannot find valid erofs superblock"); goto out; } sbi->blkszbits = dsb->blkszbits; if (sbi->blkszbits < 9 || sbi->blkszbits > PAGE_SHIFT) { erofs_err(sb, "blkszbits %u isn't supported", sbi->blkszbits); goto out; } if (dsb->dirblkbits) { erofs_err(sb, "dirblkbits %u isn't supported", dsb->dirblkbits); goto out; } sbi->feature_compat = le32_to_cpu(dsb->feature_compat); if (erofs_sb_has_sb_chksum(sbi)) { ret = erofs_superblock_csum_verify(sb, data); if (ret) goto out; } ret = -EINVAL; sbi->feature_incompat = le32_to_cpu(dsb->feature_incompat); if (sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT) { erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel", sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT); goto out; } sbi->sb_size = 128 + dsb->sb_extslots * EROFS_SB_EXTSLOT_SIZE; if (sbi->sb_size > PAGE_SIZE - EROFS_SUPER_OFFSET) { erofs_err(sb, "invalid sb_extslots %u (more than a fs block)", sbi->sb_size); goto out; } sbi->dif0.blocks = le32_to_cpu(dsb->blocks_lo); sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); #ifdef CONFIG_EROFS_FS_XATTR sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); sbi->xattr_prefix_start = le32_to_cpu(dsb->xattr_prefix_start); sbi->xattr_prefix_count = dsb->xattr_prefix_count; sbi->xattr_filter_reserved = dsb->xattr_filter_reserved; if (erofs_sb_has_ishare_xattrs(sbi)) { if (dsb->ishare_xattr_prefix_id >= sbi->xattr_prefix_count) { erofs_err(sb, "invalid ishare xattr prefix id %u", dsb->ishare_xattr_prefix_id); ret = -EFSCORRUPTED; goto out; } sbi->ishare_xattr_prefix_id = dsb->ishare_xattr_prefix_id; } #endif sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact)); if (erofs_sb_has_48bit(sbi) && dsb->rootnid_8b) { sbi->root_nid = le64_to_cpu(dsb->rootnid_8b); sbi->dif0.blocks = sbi->dif0.blocks | ((u64)le16_to_cpu(dsb->rb.blocks_hi) << 32); } else { sbi->root_nid = le16_to_cpu(dsb->rb.rootnid_2b); } sbi->packed_nid = le64_to_cpu(dsb->packed_nid); if (erofs_sb_has_metabox(sbi)) { ret = -EFSCORRUPTED; if (sbi->sb_size <= offsetof(struct erofs_super_block, metabox_nid)) goto out; sbi->metabox_nid = le64_to_cpu(dsb->metabox_nid); if (sbi->metabox_nid & BIT_ULL(EROFS_DIRENT_NID_METABOX_BIT)) goto out; /* self-loop detection 
*/ } sbi->inos = le64_to_cpu(dsb->inos); sbi->epoch = (s64)le64_to_cpu(dsb->epoch); sbi->fixed_nsec = le32_to_cpu(dsb->fixed_nsec); super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid)); if (dsb->volume_name[0]) { sbi->volume_name = kstrndup(dsb->volume_name, sizeof(dsb->volume_name), GFP_KERNEL); if (!sbi->volume_name) { ret = -ENOMEM; goto out; } } if (IS_ENABLED(CONFIG_EROFS_FS_ZIP)) { ret = z_erofs_parse_cfgs(sb, dsb); if (ret < 0) goto out; } else if (dsb->u1.available_compr_algs || erofs_sb_has_lz4_0padding(sbi)) { erofs_err(sb, "compression disabled, unable to mount compressed EROFS"); ret = -EOPNOTSUPP; goto out; } ret = erofs_scan_devices(sb, dsb); if (erofs_sb_has_48bit(sbi)) erofs_info(sb, "EXPERIMENTAL 48-bit layout support in use. Use at your own risk!"); if (erofs_sb_has_metabox(sbi)) erofs_info(sb, "EXPERIMENTAL metadata compression support in use. Use at your own risk!"); if (erofs_is_fscache_mode(sb)) erofs_info(sb, "[deprecated] fscache-based on-demand read feature in use. Use at your own risk!"); out: erofs_put_metabuf(&buf); return ret; } static void erofs_default_options(struct erofs_sb_info *sbi) { #ifdef CONFIG_EROFS_FS_ZIP sbi->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND; sbi->sync_decompress = EROFS_SYNC_DECOMPRESS_AUTO; #endif if (IS_ENABLED(CONFIG_EROFS_FS_XATTR)) set_opt(&sbi->opt, XATTR_USER); if (IS_ENABLED(CONFIG_EROFS_FS_POSIX_ACL)) set_opt(&sbi->opt, POSIX_ACL); } enum { Opt_user_xattr, Opt_acl, Opt_cache_strategy, Opt_dax, Opt_dax_enum, Opt_device, Opt_fsid, Opt_domain_id, Opt_directio, Opt_fsoffset, Opt_inode_share, }; static const struct constant_table erofs_param_cache_strategy[] = { {"disabled", EROFS_ZIP_CACHE_DISABLED}, {"readahead", EROFS_ZIP_CACHE_READAHEAD}, {"readaround", EROFS_ZIP_CACHE_READAROUND}, {} }; static const struct constant_table erofs_dax_param_enums[] = { {"always", EROFS_MOUNT_DAX_ALWAYS}, {"never", EROFS_MOUNT_DAX_NEVER}, {} }; static const struct fs_parameter_spec erofs_fs_parameters[] = { fsparam_flag_no("user_xattr", Opt_user_xattr), fsparam_flag_no("acl", Opt_acl), fsparam_enum("cache_strategy", Opt_cache_strategy, erofs_param_cache_strategy), fsparam_flag("dax", Opt_dax), fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums), fsparam_string("device", Opt_device), fsparam_string("fsid", Opt_fsid), fsparam_string("domain_id", Opt_domain_id), fsparam_flag_no("directio", Opt_directio), fsparam_u64("fsoffset", Opt_fsoffset), fsparam_flag("inode_share", Opt_inode_share), {} }; static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode) { if (IS_ENABLED(CONFIG_FS_DAX)) { struct erofs_sb_info *sbi = fc->s_fs_info; if (mode == EROFS_MOUNT_DAX_ALWAYS) { set_opt(&sbi->opt, DAX_ALWAYS); clear_opt(&sbi->opt, DAX_NEVER); return true; } else if (mode == EROFS_MOUNT_DAX_NEVER) { set_opt(&sbi->opt, DAX_NEVER); clear_opt(&sbi->opt, DAX_ALWAYS); return true; } DBG_BUGON(1); return false; } errorfc(fc, "dax options not supported"); return false; } static int erofs_fc_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct erofs_sb_info *sbi = fc->s_fs_info; struct fs_parse_result result; struct erofs_device_info *dif; int opt, ret; opt = fs_parse(fc, erofs_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_user_xattr: if (!IS_ENABLED(CONFIG_EROFS_FS_XATTR)) errorfc(fc, "{,no}user_xattr options not supported"); else if (result.boolean) set_opt(&sbi->opt, XATTR_USER); else clear_opt(&sbi->opt, XATTR_USER); break; case Opt_acl: if (!IS_ENABLED(CONFIG_EROFS_FS_POSIX_ACL)) 
errorfc(fc, "{,no}acl options not supported"); else if (result.boolean) set_opt(&sbi->opt, POSIX_ACL); else clear_opt(&sbi->opt, POSIX_ACL); break; case Opt_cache_strategy: if (!IS_ENABLED(CONFIG_EROFS_FS_ZIP)) errorfc(fc, "compression not supported, cache_strategy ignored"); else sbi->opt.cache_strategy = result.uint_32; break; case Opt_dax: if (!erofs_fc_set_dax_mode(fc, EROFS_MOUNT_DAX_ALWAYS)) return -EINVAL; break; case Opt_dax_enum: if (!erofs_fc_set_dax_mode(fc, result.uint_32)) return -EINVAL; break; case Opt_device: dif = kzalloc_obj(*dif); if (!dif) return -ENOMEM; dif->path = kstrdup(param->string, GFP_KERNEL); if (!dif->path) { kfree(dif); return -ENOMEM; } down_write(&sbi->devs->rwsem); ret = idr_alloc(&sbi->devs->tree, dif, 0, 0, GFP_KERNEL); up_write(&sbi->devs->rwsem); if (ret < 0) { kfree(dif->path); kfree(dif); return ret; } ++sbi->devs->extra_devices; break; #ifdef CONFIG_EROFS_FS_ONDEMAND case Opt_fsid: kfree(sbi->fsid); sbi->fsid = kstrdup(param->string, GFP_KERNEL); if (!sbi->fsid) return -ENOMEM; break; #endif #if defined(CONFIG_EROFS_FS_ONDEMAND) || defined(CONFIG_EROFS_FS_PAGE_CACHE_SHARE) case Opt_domain_id: kfree_sensitive(sbi->domain_id); sbi->domain_id = no_free_ptr(param->string); break; #else case Opt_fsid: case Opt_domain_id: errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name); break; #endif case Opt_directio: if (!IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE)) errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name); else if (result.boolean) set_opt(&sbi->opt, DIRECT_IO); else clear_opt(&sbi->opt, DIRECT_IO); break; case Opt_fsoffset: sbi->dif0.fsoff = result.uint_64; break; case Opt_inode_share: if (!IS_ENABLED(CONFIG_EROFS_FS_PAGE_CACHE_SHARE)) errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name); else set_opt(&sbi->opt, INODE_SHARE); break; } return 0; } static int erofs_encode_fh(struct inode *inode, u32 *fh, int *max_len, struct inode *parent) { erofs_nid_t nid = EROFS_I(inode)->nid; int len = parent ? 6 : 3; if (*max_len < len) { *max_len = len; return FILEID_INVALID; } fh[0] = (u32)(nid >> 32); fh[1] = (u32)(nid & 0xffffffff); fh[2] = inode->i_generation; if (parent) { nid = EROFS_I(parent)->nid; fh[3] = (u32)(nid >> 32); fh[4] = (u32)(nid & 0xffffffff); fh[5] = parent->i_generation; } *max_len = len; return parent ? 
FILEID_INO64_GEN_PARENT : FILEID_INO64_GEN; } static struct dentry *erofs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { if ((fh_type != FILEID_INO64_GEN && fh_type != FILEID_INO64_GEN_PARENT) || fh_len < 3) return NULL; return d_obtain_alias(erofs_iget(sb, ((u64)fid->raw[0] << 32) | fid->raw[1])); } static struct dentry *erofs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { if (fh_type != FILEID_INO64_GEN_PARENT || fh_len < 6) return NULL; return d_obtain_alias(erofs_iget(sb, ((u64)fid->raw[3] << 32) | fid->raw[4])); } static struct dentry *erofs_get_parent(struct dentry *child) { erofs_nid_t nid; unsigned int d_type; int err; err = erofs_namei(d_inode(child), &dotdot_name, &nid, &d_type); if (err) return ERR_PTR(err); return d_obtain_alias(erofs_iget(child->d_sb, nid)); } static const struct export_operations erofs_export_ops = { .encode_fh = erofs_encode_fh, .fh_to_dentry = erofs_fh_to_dentry, .fh_to_parent = erofs_fh_to_parent, .get_parent = erofs_get_parent, }; static void erofs_set_sysfs_name(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); if (sbi->domain_id && sbi->fsid) super_set_sysfs_name_generic(sb, "%s,%s", sbi->domain_id, sbi->fsid); else if (sbi->fsid) super_set_sysfs_name_generic(sb, "%s", sbi->fsid); else if (erofs_is_fileio_mode(sbi)) super_set_sysfs_name_generic(sb, "%s", bdi_dev_name(sb->s_bdi)); else super_set_sysfs_name_id(sb); } static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode; struct erofs_sb_info *sbi = EROFS_SB(sb); int err; sb->s_magic = EROFS_SUPER_MAGIC; sb->s_flags |= SB_RDONLY | SB_NOATIME; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_op = &erofs_sops; if (!sbi->domain_id && test_opt(&sbi->opt, INODE_SHARE)) { errorfc(fc, "domain_id is needed when inode_ishare is on"); return -EINVAL; } if (test_opt(&sbi->opt, DAX_ALWAYS) && test_opt(&sbi->opt, INODE_SHARE)) { errorfc(fc, "FSDAX is not allowed when inode_ishare is on"); return -EINVAL; } sbi->blkszbits = PAGE_SHIFT; if (!sb->s_bdev) { /* * (File-backed mounts) EROFS claims it's safe to nest other * fs contexts (including its own) due to self-controlled RO * accesses/contexts and no side-effect changes that need to * context save & restore so it can reuse the current thread * context. * However, we still need to prevent kernel stack overflow due * to filesystem nesting: just ensure that s_stack_depth is 0 * to disallow mounting EROFS on stacked filesystems. * Note: s_stack_depth is not incremented here for now, since * EROFS is the only fs supporting file-backed mounts for now. * It MUST change if another fs plans to support them, which * may also require adjusting FILESYSTEM_MAX_STACK_DEPTH. 
*/ if (erofs_is_fileio_mode(sbi)) { inode = file_inode(sbi->dif0.file); if ((inode->i_sb->s_op == &erofs_sops && !inode->i_sb->s_bdev) || inode->i_sb->s_stack_depth) { erofs_err(sb, "file-backed mounts cannot be applied to stacked fses"); return -ENOTBLK; } } sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; if (erofs_is_fscache_mode(sb)) { err = erofs_fscache_register_fs(sb); if (err) return err; } err = super_setup_bdi(sb); if (err) return err; } else { if (!sb_set_blocksize(sb, PAGE_SIZE)) { errorfc(fc, "failed to set initial blksize"); return -EINVAL; } sbi->dif0.dax_dev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->dif0.dax_part_off, NULL, NULL); } err = erofs_read_superblock(sb); if (err) return err; if (sb->s_blocksize_bits != sbi->blkszbits) { if (erofs_is_fscache_mode(sb)) { errorfc(fc, "unsupported blksize for fscache mode"); return -EINVAL; } if (erofs_is_fileio_mode(sbi)) { sb->s_blocksize = 1 << sbi->blkszbits; sb->s_blocksize_bits = sbi->blkszbits; } else if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) { errorfc(fc, "failed to set erofs blksize"); return -EINVAL; } } if (sbi->dif0.fsoff) { if (sbi->dif0.fsoff & (sb->s_blocksize - 1)) return invalfc(fc, "fsoffset %llu is not aligned to block size %lu", sbi->dif0.fsoff, sb->s_blocksize); if (erofs_is_fscache_mode(sb)) return invalfc(fc, "cannot use fsoffset in fscache mode"); } if (test_opt(&sbi->opt, DAX_ALWAYS) && sbi->blkszbits != PAGE_SHIFT) { erofs_info(sb, "unsupported blocksize for DAX"); clear_opt(&sbi->opt, DAX_ALWAYS); } if (test_opt(&sbi->opt, INODE_SHARE) && !erofs_sb_has_ishare_xattrs(sbi)) { erofs_info(sb, "on-disk ishare xattrs not found. Turning off inode_share."); clear_opt(&sbi->opt, INODE_SHARE); } if (test_opt(&sbi->opt, INODE_SHARE)) erofs_info(sb, "EXPERIMENTAL EROFS page cache share support in use. Use at your own risk!"); sb->s_time_gran = 1; sb->s_xattr = erofs_xattr_handlers; sb->s_export_op = &erofs_export_ops; if (test_opt(&sbi->opt, POSIX_ACL)) sb->s_flags |= SB_POSIXACL; else sb->s_flags &= ~SB_POSIXACL; err = z_erofs_init_super(sb); if (err) return err; if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) { inode = erofs_iget(sb, sbi->packed_nid); if (IS_ERR(inode)) return PTR_ERR(inode); sbi->packed_inode = inode; } if (erofs_sb_has_metabox(sbi)) { inode = erofs_iget(sb, sbi->metabox_nid); if (IS_ERR(inode)) return PTR_ERR(inode); sbi->metabox_inode = inode; } inode = erofs_iget(sb, sbi->root_nid); if (IS_ERR(inode)) return PTR_ERR(inode); if (!S_ISDIR(inode->i_mode)) { erofs_err(sb, "rootino(nid %llu) is not a directory(i_mode %o)", sbi->root_nid, inode->i_mode); iput(inode); return -EINVAL; } sb->s_root = d_make_root(inode); if (!sb->s_root) return -ENOMEM; erofs_shrinker_register(sb); err = erofs_xattr_prefixes_init(sb); if (err) return err; erofs_set_sysfs_name(sb); err = erofs_register_sysfs(sb); if (err) return err; sbi->dir_ra_bytes = EROFS_DIR_RA_BYTES; erofs_info(sb, "mounted with root inode @ nid %llu.", sbi->root_nid); return 0; } static int erofs_fc_get_tree(struct fs_context *fc) { struct erofs_sb_info *sbi = fc->s_fs_info; int ret; if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) return get_tree_nodev(fc, erofs_fc_fill_super); ret = get_tree_bdev_flags(fc, erofs_fc_fill_super, IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) ? 
GET_TREE_BDEV_QUIET_LOOKUP : 0); if (IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) && ret == -ENOTBLK) { struct file *file; if (!fc->source) return invalf(fc, "No source specified"); file = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0); if (IS_ERR(file)) return PTR_ERR(file); sbi->dif0.file = file; if (S_ISREG(file_inode(sbi->dif0.file)->i_mode) && sbi->dif0.file->f_mapping->a_ops->read_folio) return get_tree_nodev(fc, erofs_fc_fill_super); } return ret; } static int erofs_fc_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_sb_info *new_sbi = fc->s_fs_info; DBG_BUGON(!sb_rdonly(sb)); if (new_sbi->fsid || new_sbi->domain_id) erofs_info(sb, "ignoring reconfiguration for fsid|domain_id."); if (test_opt(&new_sbi->opt, POSIX_ACL)) fc->sb_flags |= SB_POSIXACL; else fc->sb_flags &= ~SB_POSIXACL; sbi->opt = new_sbi->opt; fc->sb_flags |= SB_RDONLY; return 0; } static int erofs_release_device_info(int id, void *ptr, void *data) { struct erofs_device_info *dif = ptr; fs_put_dax(dif->dax_dev, NULL); if (dif->file) fput(dif->file); erofs_fscache_unregister_cookie(dif->fscache); dif->fscache = NULL; kfree(dif->path); kfree(dif); return 0; } static void erofs_free_dev_context(struct erofs_dev_context *devs) { if (!devs) return; idr_for_each(&devs->tree, &erofs_release_device_info, NULL); idr_destroy(&devs->tree); kfree(devs); } static void erofs_sb_free(struct erofs_sb_info *sbi) { erofs_free_dev_context(sbi->devs); kfree(sbi->fsid); kfree_sensitive(sbi->domain_id); if (sbi->dif0.file) fput(sbi->dif0.file); kfree(sbi->volume_name); kfree(sbi); } static void erofs_fc_free(struct fs_context *fc) { struct erofs_sb_info *sbi = fc->s_fs_info; if (sbi) /* free here if an error occurs before transferring to sb */ erofs_sb_free(sbi); } static const struct fs_context_operations erofs_context_ops = { .parse_param = erofs_fc_parse_param, .get_tree = erofs_fc_get_tree, .reconfigure = erofs_fc_reconfigure, .free = erofs_fc_free, }; static int erofs_init_fs_context(struct fs_context *fc) { struct erofs_sb_info *sbi; sbi = kzalloc_obj(*sbi); if (!sbi) return -ENOMEM; sbi->devs = kzalloc_obj(struct erofs_dev_context); if (!sbi->devs) { kfree(sbi); return -ENOMEM; } fc->s_fs_info = sbi; idr_init(&sbi->devs->tree); init_rwsem(&sbi->devs->rwsem); erofs_default_options(sbi); fc->ops = &erofs_context_ops; return 0; } static void erofs_drop_internal_inodes(struct erofs_sb_info *sbi) { iput(sbi->packed_inode); sbi->packed_inode = NULL; iput(sbi->metabox_inode); sbi->metabox_inode = NULL; #ifdef CONFIG_EROFS_FS_ZIP iput(sbi->managed_cache); sbi->managed_cache = NULL; #endif } static void erofs_kill_sb(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); if ((IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) || sbi->dif0.file) kill_anon_super(sb); else kill_block_super(sb); erofs_drop_internal_inodes(sbi); fs_put_dax(sbi->dif0.dax_dev, NULL); erofs_fscache_unregister_fs(sb); erofs_sb_free(sbi); sb->s_fs_info = NULL; } static void erofs_put_super(struct super_block *sb) { struct erofs_sb_info *const sbi = EROFS_SB(sb); erofs_unregister_sysfs(sb); erofs_shrinker_unregister(sb); erofs_xattr_prefixes_cleanup(sb); erofs_drop_internal_inodes(sbi); erofs_free_dev_context(sbi->devs); sbi->devs = NULL; erofs_fscache_unregister_fs(sb); } static struct file_system_type erofs_fs_type = { .owner = THIS_MODULE, .name = "erofs", .init_fs_context = erofs_init_fs_context, .kill_sb = erofs_kill_sb, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; 
MODULE_ALIAS_FS("erofs"); #if defined(CONFIG_EROFS_FS_ONDEMAND) || defined(CONFIG_EROFS_FS_PAGE_CACHE_SHARE) static void erofs_free_anon_inode(struct inode *inode) { struct erofs_inode *vi = EROFS_I(inode); #ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE kfree(vi->fingerprint.opaque); #endif kmem_cache_free(erofs_inode_cachep, vi); } static const struct super_operations erofs_anon_sops = { .alloc_inode = erofs_alloc_inode, .drop_inode = inode_just_drop, .free_inode = erofs_free_anon_inode, }; static int erofs_anon_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx; ctx = init_pseudo(fc, EROFS_SUPER_MAGIC); if (!ctx) return -ENOMEM; ctx->ops = &erofs_anon_sops; return 0; } struct file_system_type erofs_anon_fs_type = { .name = "pseudo_erofs", .init_fs_context = erofs_anon_init_fs_context, .kill_sb = kill_anon_super, }; #endif static int __init erofs_module_init(void) { int err; erofs_check_ondisk_layout_definitions(); erofs_inode_cachep = kmem_cache_create("erofs_inode", sizeof(struct erofs_inode), 0, SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, erofs_inode_init_once); if (!erofs_inode_cachep) return -ENOMEM; err = erofs_init_shrinker(); if (err) goto shrinker_err; err = z_erofs_init_subsystem(); if (err) goto zip_err; err = erofs_init_sysfs(); if (err) goto sysfs_err; err = erofs_init_ishare(); if (err) goto ishare_err; err = register_filesystem(&erofs_fs_type); if (err) goto fs_err; return 0; fs_err: erofs_exit_ishare(); ishare_err: erofs_exit_sysfs(); sysfs_err: z_erofs_exit_subsystem(); zip_err: erofs_exit_shrinker(); shrinker_err: kmem_cache_destroy(erofs_inode_cachep); return err; } static void __exit erofs_module_exit(void) { unregister_filesystem(&erofs_fs_type); /* Ensure all RCU free inodes / pclusters are safe to be destroyed. */ rcu_barrier(); erofs_exit_ishare(); erofs_exit_sysfs(); z_erofs_exit_subsystem(); erofs_exit_shrinker(); kmem_cache_destroy(erofs_inode_cachep); } static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); buf->f_type = sb->s_magic; buf->f_bsize = sb->s_blocksize; buf->f_blocks = sbi->total_blocks; buf->f_bfree = buf->f_bavail = 0; buf->f_files = ULLONG_MAX; buf->f_ffree = ULLONG_MAX - sbi->inos; buf->f_namelen = EROFS_NAME_LEN; if (uuid_is_null(&sb->s_uuid)) buf->f_fsid = u64_to_fsid(!sb->s_bdev ? 0 : huge_encode_dev(sb->s_bdev->bd_dev)); else buf->f_fsid = uuid_to_fsid(sb->s_uuid.b); return 0; } static int erofs_show_options(struct seq_file *seq, struct dentry *root) { struct erofs_sb_info *sbi = EROFS_SB(root->d_sb); struct erofs_mount_opts *opt = &sbi->opt; if (IS_ENABLED(CONFIG_EROFS_FS_XATTR)) seq_puts(seq, test_opt(opt, XATTR_USER) ? ",user_xattr" : ",nouser_xattr"); if (IS_ENABLED(CONFIG_EROFS_FS_POSIX_ACL)) seq_puts(seq, test_opt(opt, POSIX_ACL) ? 
",acl" : ",noacl"); if (IS_ENABLED(CONFIG_EROFS_FS_ZIP)) seq_printf(seq, ",cache_strategy=%s", erofs_param_cache_strategy[opt->cache_strategy].name); if (test_opt(opt, DAX_ALWAYS)) seq_puts(seq, ",dax=always"); if (test_opt(opt, DAX_NEVER)) seq_puts(seq, ",dax=never"); if (erofs_is_fileio_mode(sbi) && test_opt(opt, DIRECT_IO)) seq_puts(seq, ",directio"); if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND)) { if (sbi->fsid) seq_printf(seq, ",fsid=%s", sbi->fsid); if (sbi->domain_id) seq_printf(seq, ",domain_id=%s", sbi->domain_id); } if (sbi->dif0.fsoff) seq_printf(seq, ",fsoffset=%llu", sbi->dif0.fsoff); if (test_opt(opt, INODE_SHARE)) seq_puts(seq, ",inode_share"); return 0; } static void erofs_evict_inode(struct inode *inode) { if (IS_DAX(inode)) dax_break_layout_final(inode); erofs_ishare_free_inode(inode); truncate_inode_pages_final(&inode->i_data); clear_inode(inode); } const struct super_operations erofs_sops = { .put_super = erofs_put_super, .alloc_inode = erofs_alloc_inode, .free_inode = erofs_free_inode, .evict_inode = erofs_evict_inode, .statfs = erofs_statfs, .show_options = erofs_show_options, }; module_init(erofs_module_init); module_exit(erofs_module_exit); MODULE_DESCRIPTION("Enhanced ROM File System"); MODULE_AUTHOR("Gao Xiang, Chao Yu, Miao Xie, CONSUMER BG, HUAWEI Inc."); MODULE_LICENSE("GPL"); |
// SPDX-License-Identifier: GPL-2.0-only /* * crc16.c */ #include <linux/crc16.h> #include <linux/export.h> #include <linux/module.h> #include <linux/types.h> /** CRC table for the CRC-16. The poly is 0x8005 (x^16 + x^15 + x^2 + 1) */ static const u16 crc16_table[256] = { 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 }; /** * crc16 - compute the CRC-16 for the data buffer * @crc: previous CRC value * @p: data pointer * @len: number of bytes in the buffer * * Returns the updated CRC value. */ u16 crc16(u16 crc, const u8 *p, size_t len) { while (len--) crc = (crc >> 8) ^ crc16_table[(crc & 0xff) ^ *p++]; return crc; } EXPORT_SYMBOL(crc16); MODULE_DESCRIPTION("CRC16 calculations"); MODULE_LICENSE("GPL");
// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * ROUTE - implementation of the IP router. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Linus Torvalds, <Linus.Torvalds@helsinki.fi> * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Fixes: * Alan Cox : Verify area fixes. * Alan Cox : cli() protects routing changes * Rui Oliveira : ICMP routing table updates * (rco@di.uminho.pt) Routing table insertion and update * Linus Torvalds : Rewrote bits to be sensible * Alan Cox : Added BSD route gw semantics * Alan Cox : Super /proc >4K * Alan Cox : MTU in route table * Alan Cox : MSS actually. Also added the window * clamper. * Sam Lantinga : Fixed route matching in rt_del() * Alan Cox : Routing cache support. * Alan Cox : Removed compatibility cruft. * Alan Cox : RTF_REJECT support. * Alan Cox : TCP irtt support. * Jonathan Naylor : Added Metric support. * Miquel van Smoorenburg : BSD API fixes. * Miquel van Smoorenburg : Metrics. * Alan Cox : Use __u32 properly * Alan Cox : Aligned routing errors more closely with BSD * our system is still very different. * Alan Cox : Faster /proc handling * Alexey Kuznetsov : Massive rework to support tree based routing, * routing caches and better behaviour. * * Olaf Erb : irtt wasn't being copied right. * Bjorn Ekwall : Kerneld route support. * Alan Cox : Multicast fixed (I hope) * Pavel Krauz : Limited broadcast fixed * Mike McLagan : Routing by source * Alexey Kuznetsov : End of old history. Split to fib.c and * route.c and rewritten from scratch. * Andi Kleen : Load-limit warning messages. * Vitaly E. Lavrov : Transparent proxy revived after year coma. * Vitaly E. Lavrov : Race condition in ip_route_input_slow. * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow. * Vladimir V. Ivanov : IP rule info (flowid) is really useful. * Marc Boucher : routing by fwmark * Robert Olsson : Added rt_cache statistics * Arnaldo C. Melo : Convert proc stuff to seq_file * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
* Ilia Sotnikov : Ignore TOS on PMTUD and Redirect * Ilia Sotnikov : Removed TOS from hash calculations */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/module.h> #include <linux/bitops.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/memblock.h> #include <linux/socket.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/proc_fs.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/pkt_sched.h> #include <linux/mroute.h> #include <linux/netfilter_ipv4.h> #include <linux/random.h> #include <linux/rcupdate.h> #include <linux/slab.h> #include <linux/jhash.h> #include <net/dst.h> #include <net/dst_metadata.h> #include <net/flow.h> #include <net/inet_dscp.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/route.h> #include <net/inetpeer.h> #include <net/sock.h> #include <net/ip_fib.h> #include <net/nexthop.h> #include <net/tcp.h> #include <net/icmp.h> #include <net/xfrm.h> #include <net/lwtunnel.h> #include <net/netevent.h> #include <net/rtnetlink.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <net/secure_seq.h> #include <net/ip_tunnels.h> #include "fib_lookup.h" #define RT_GC_TIMEOUT (300*HZ) #define DEFAULT_MIN_PMTU (512 + 20 + 20) #define DEFAULT_MTU_EXPIRES (10 * 60 * HZ) #define DEFAULT_MIN_ADVMSS 256 static int ip_rt_max_size; static int ip_rt_redirect_number __read_mostly = 9; static int ip_rt_redirect_load __read_mostly = HZ / 50; static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1)); static int ip_rt_error_cost __read_mostly = HZ; static int ip_rt_error_burst __read_mostly = 5 * HZ; static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; /* * Interface to generic destination cache. 
*/ INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ipv4_default_advmss(const struct dst_entry *dst); INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst); static void ipv4_negative_advice(struct sock *sk, struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh); static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static void ipv4_dst_destroy(struct dst_entry *dst); static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) { WARN_ON(1); return NULL; } static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr); static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr); static struct dst_ops ipv4_dst_ops = { .family = AF_INET, .check = ipv4_dst_check, .default_advmss = ipv4_default_advmss, .mtu = ipv4_mtu, .cow_metrics = ipv4_cow_metrics, .destroy = ipv4_dst_destroy, .negative_advice = ipv4_negative_advice, .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, .redirect = ip_do_redirect, .local_out = __ip_local_out, .neigh_lookup = ipv4_neigh_lookup, .confirm_neigh = ipv4_confirm_neigh, }; #define ECN_OR_COST(class) TC_PRIO_##class const __u8 ip_tos2prio[16] = { TC_PRIO_BESTEFFORT, ECN_OR_COST(BESTEFFORT), TC_PRIO_BESTEFFORT, ECN_OR_COST(BESTEFFORT), TC_PRIO_BULK, ECN_OR_COST(BULK), TC_PRIO_BULK, ECN_OR_COST(BULK), TC_PRIO_INTERACTIVE, ECN_OR_COST(INTERACTIVE), TC_PRIO_INTERACTIVE, ECN_OR_COST(INTERACTIVE), TC_PRIO_INTERACTIVE_BULK, ECN_OR_COST(INTERACTIVE_BULK), TC_PRIO_INTERACTIVE_BULK, ECN_OR_COST(INTERACTIVE_BULK) }; EXPORT_SYMBOL(ip_tos2prio); static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); #ifndef CONFIG_PREEMPT_RT #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field) #else #define RT_CACHE_STAT_INC(field) this_cpu_inc(rt_cache_stat.field) #endif #ifdef CONFIG_PROC_FS static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) { if (*pos) return NULL; return SEQ_START_TOKEN; } static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; return NULL; } static void rt_cache_seq_stop(struct seq_file *seq, void *v) { } static int rt_cache_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" "HHUptod\tSpecDst"); return 0; } static const struct seq_operations rt_cache_seq_ops = { .start = rt_cache_seq_start, .next = rt_cache_seq_next, .stop = rt_cache_seq_stop, .show = rt_cache_seq_show, }; static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos) { int cpu; if (*pos == 0) return SEQ_START_TOKEN; for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return &per_cpu(rt_cache_stat, cpu); } return NULL; } static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int cpu; for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return &per_cpu(rt_cache_stat, cpu); } (*pos)++; return NULL; } static void rt_cpu_seq_stop(struct seq_file *seq, void *v) { } static int rt_cpu_seq_show(struct seq_file *seq, void *v) { struct rt_cache_stat *st = v; if (v == SEQ_START_TOKEN) { seq_puts(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd 
in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n"); return 0; } seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x " "%08x %08x %08x %08x %08x %08x " "%08x %08x %08x %08x\n", dst_entries_get_slow(&ipv4_dst_ops), 0, /* st->in_hit */ st->in_slow_tot, st->in_slow_mc, st->in_no_route, st->in_brd, st->in_martian_dst, st->in_martian_src, 0, /* st->out_hit */ st->out_slow_tot, st->out_slow_mc, 0, /* st->gc_total */ 0, /* st->gc_ignored */ 0, /* st->gc_goal_miss */ 0, /* st->gc_dst_overflow */ 0, /* st->in_hlist_search */ 0 /* st->out_hlist_search */ ); return 0; } static const struct seq_operations rt_cpu_seq_ops = { .start = rt_cpu_seq_start, .next = rt_cpu_seq_next, .stop = rt_cpu_seq_stop, .show = rt_cpu_seq_show, }; #ifdef CONFIG_IP_ROUTE_CLASSID static int rt_acct_proc_show(struct seq_file *m, void *v) { struct ip_rt_acct *dst, *src; unsigned int i, j; dst = kzalloc_objs(struct ip_rt_acct, 256); if (!dst) return -ENOMEM; for_each_possible_cpu(i) { src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i); for (j = 0; j < 256; j++) { dst[j].o_bytes += src[j].o_bytes; dst[j].o_packets += src[j].o_packets; dst[j].i_bytes += src[j].i_bytes; dst[j].i_packets += src[j].i_packets; } } seq_write(m, dst, 256 * sizeof(struct ip_rt_acct)); kfree(dst); return 0; } #endif static int __net_init ip_rt_do_proc_init(struct net *net) { struct proc_dir_entry *pde; pde = proc_create_seq("rt_cache", 0444, net->proc_net, &rt_cache_seq_ops); if (!pde) goto err1; pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat, &rt_cpu_seq_ops); if (!pde) goto err2; #ifdef CONFIG_IP_ROUTE_CLASSID pde = proc_create_single("rt_acct", 0, net->proc_net, rt_acct_proc_show); if (!pde) goto err3; #endif return 0; #ifdef CONFIG_IP_ROUTE_CLASSID err3: remove_proc_entry("rt_cache", net->proc_net_stat); #endif err2: remove_proc_entry("rt_cache", net->proc_net); err1: return -ENOMEM; } static void __net_exit ip_rt_do_proc_exit(struct net *net) { remove_proc_entry("rt_cache", net->proc_net_stat); remove_proc_entry("rt_cache", net->proc_net); #ifdef CONFIG_IP_ROUTE_CLASSID remove_proc_entry("rt_acct", net->proc_net); #endif } static struct pernet_operations ip_rt_proc_ops __net_initdata = { .init = ip_rt_do_proc_init, .exit = ip_rt_do_proc_exit, }; static int __init ip_rt_proc_init(void) { return register_pernet_subsys(&ip_rt_proc_ops); } #else static inline int ip_rt_proc_init(void) { return 0; } #endif /* CONFIG_PROC_FS */ static inline bool rt_is_expired(const struct rtable *rth) { bool res; rcu_read_lock(); res = rth->rt_genid != rt_genid_ipv4(dev_net_rcu(rth->dst.dev)); rcu_read_unlock(); return res; } void rt_cache_flush(struct net *net) { rt_genid_bump_ipv4(net); } static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { const struct rtable *rt = container_of(dst, struct rtable, dst); struct net_device *dev; struct neighbour *n; rcu_read_lock(); dev = dst_dev_rcu(dst); if (likely(rt->rt_gw_family == AF_INET)) { n = ip_neigh_gw4(dev, rt->rt_gw4); } else if (rt->rt_gw_family == AF_INET6) { n = ip_neigh_gw6(dev, &rt->rt_gw6); } else { __be32 pkey; pkey = skb ? 
ip_hdr(skb)->daddr : *((__be32 *) daddr); n = ip_neigh_gw4(dev, pkey); } if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt)) n = NULL; rcu_read_unlock(); return n; } static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr) { const struct rtable *rt = container_of(dst, struct rtable, dst); struct net_device *dev = dst_dev(dst); const __be32 *pkey = daddr; if (rt->rt_gw_family == AF_INET) { pkey = (const __be32 *)&rt->rt_gw4; } else if (IS_ENABLED(CONFIG_IPV6) && rt->rt_gw_family == AF_INET6) { return __ipv6_confirm_neigh(dev, &rt->rt_gw6); } else if (!daddr || (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) { return; } __ipv4_confirm_neigh(dev, *(__force u32 *)pkey); } /* Hash tables of size 2048..262144 depending on RAM size. * Each bucket uses 8 bytes. */ static u32 ip_idents_mask __read_mostly; static atomic_t *ip_idents __read_mostly; static u32 *ip_tstamps __read_mostly; /* In order to protect privacy, we add a perturbation to identifiers * if one generator is seldom used. This makes hard for an attacker * to infer how many packets were sent between two points in time. */ static u32 ip_idents_reserve(u32 hash, int segs) { u32 bucket, old, now = (u32)jiffies; atomic_t *p_id; u32 *p_tstamp; u32 delta = 0; bucket = hash & ip_idents_mask; p_tstamp = ip_tstamps + bucket; p_id = ip_idents + bucket; old = READ_ONCE(*p_tstamp); if (old != now && cmpxchg(p_tstamp, old, now) == old) delta = get_random_u32_below(now - old); /* If UBSAN reports an error there, please make sure your compiler * supports -fno-strict-overflow before reporting it that was a bug * in UBSAN, and it has been fixed in GCC-8. */ return atomic_add_return(segs + delta, p_id) - segs; } void __ip_select_ident(struct net *net, struct iphdr *iph, int segs) { u32 hash, id; /* Note the following code is not safe, but this is okay. */ if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key))) get_random_bytes(&net->ipv4.ip_id_key, sizeof(net->ipv4.ip_id_key)); hash = siphash_3u32((__force u32)iph->daddr, (__force u32)iph->saddr, iph->protocol, &net->ipv4.ip_id_key); id = ip_idents_reserve(hash, segs); iph->id = htons(id); } EXPORT_SYMBOL(__ip_select_ident); static void __build_flow_key(const struct net *net, struct flowi4 *fl4, const struct sock *sk, const struct iphdr *iph, int oif, __u8 tos, u8 prot, u32 mark, int flow_flags) { __u8 scope = RT_SCOPE_UNIVERSE; if (sk) { oif = sk->sk_bound_dev_if; mark = READ_ONCE(sk->sk_mark); tos = ip_sock_rt_tos(sk); scope = ip_sock_rt_scope(sk); prot = inet_test_bit(HDRINCL, sk) ? IPPROTO_RAW : sk->sk_protocol; } flowi4_init_output(fl4, oif, mark, tos & INET_DSCP_MASK, scope, prot, flow_flags, iph->daddr, iph->saddr, 0, 0, sock_net_uid(net, sk)); } static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, const struct sock *sk) { const struct net *net = dev_net(skb->dev); const struct iphdr *iph = ip_hdr(skb); int oif = skb->dev->ifindex; u8 prot = iph->protocol; u32 mark = skb->mark; __u8 tos = iph->tos; __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0); } static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); const struct ip_options_rcu *inet_opt; __be32 daddr = inet->inet_daddr; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), inet_test_bit(HDRINCL, sk) ? 
IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk), daddr, inet->inet_saddr, 0, 0, sk_uid(sk)); rcu_read_unlock(); } static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk, const struct sk_buff *skb) { if (skb) build_skb_flow_key(fl4, skb, sk); else build_sk_flow_key(fl4, sk); } static DEFINE_SPINLOCK(fnhe_lock); static void fnhe_flush_routes(struct fib_nh_exception *fnhe) { struct rtable *rt; rt = rcu_dereference(fnhe->fnhe_rth_input); if (rt) { RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL); dst_dev_put(&rt->dst); dst_release(&rt->dst); } rt = rcu_dereference(fnhe->fnhe_rth_output); if (rt) { RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL); dst_dev_put(&rt->dst); dst_release(&rt->dst); } } static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash) { struct fib_nh_exception __rcu **fnhe_p, **oldest_p; struct fib_nh_exception *fnhe, *oldest = NULL; for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) { fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock)); if (!fnhe) break; if (!oldest || time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) { oldest = fnhe; oldest_p = fnhe_p; } } /* Clear oldest->fnhe_daddr to prevent this fnhe from being * rebound with new dsts in rt_bind_exception(). */ oldest->fnhe_daddr = 0; fnhe_flush_routes(oldest); *oldest_p = oldest->fnhe_next; kfree_rcu(oldest, rcu); } static u32 fnhe_hashfun(__be32 daddr) { static siphash_aligned_key_t fnhe_hash_key; u64 hval; net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key)); hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key); return hash_64(hval, FNHE_HASH_SHIFT); } static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe) { rt->rt_pmtu = fnhe->fnhe_pmtu; rt->rt_mtu_locked = fnhe->fnhe_mtu_locked; rt->dst.expires = fnhe->fnhe_expires; if (fnhe->fnhe_gw) { rt->rt_flags |= RTCF_REDIRECTED; rt->rt_uses_gateway = 1; rt->rt_gw_family = AF_INET; rt->rt_gw4 = fnhe->fnhe_gw; } } static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr, __be32 gw, u32 pmtu, bool lock, unsigned long expires) { struct fnhe_hash_bucket *hash; struct fib_nh_exception *fnhe; struct rtable *rt; u32 genid, hval; unsigned int i; int depth; genid = fnhe_genid(dev_net(nhc->nhc_dev)); hval = fnhe_hashfun(daddr); spin_lock_bh(&fnhe_lock); hash = rcu_dereference(nhc->nhc_exceptions); if (!hash) { hash = kzalloc_objs(*hash, FNHE_HASH_SIZE, GFP_ATOMIC); if (!hash) goto out_unlock; rcu_assign_pointer(nhc->nhc_exceptions, hash); } hash += hval; depth = 0; for (fnhe = rcu_dereference(hash->chain); fnhe; fnhe = rcu_dereference(fnhe->fnhe_next)) { if (fnhe->fnhe_daddr == daddr) break; depth++; } if (fnhe) { if (fnhe->fnhe_genid != genid) fnhe->fnhe_genid = genid; if (gw) fnhe->fnhe_gw = gw; if (pmtu) { fnhe->fnhe_pmtu = pmtu; fnhe->fnhe_mtu_locked = lock; } fnhe->fnhe_expires = max(1UL, expires); /* Update all cached dsts too */ rt = rcu_dereference(fnhe->fnhe_rth_input); if (rt) fill_route_from_fnhe(rt, fnhe); rt = rcu_dereference(fnhe->fnhe_rth_output); if (rt) fill_route_from_fnhe(rt, fnhe); } else { /* Randomize max depth to avoid some side channels attacks. 
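 * A fixed reclaim threshold would make it predictable exactly when the
 * oldest exception is evicted by a peer that can force new exceptions
 * to be created; the random offset added below removes that determinism.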
*/ int max_depth = FNHE_RECLAIM_DEPTH + get_random_u32_below(FNHE_RECLAIM_DEPTH); while (depth > max_depth) { fnhe_remove_oldest(hash); depth--; } fnhe = kzalloc_obj(*fnhe, GFP_ATOMIC); if (!fnhe) goto out_unlock; fnhe->fnhe_next = hash->chain; fnhe->fnhe_genid = genid; fnhe->fnhe_daddr = daddr; fnhe->fnhe_gw = gw; fnhe->fnhe_pmtu = pmtu; fnhe->fnhe_mtu_locked = lock; fnhe->fnhe_expires = max(1UL, expires); rcu_assign_pointer(hash->chain, fnhe); /* Exception created; mark the cached routes for the nexthop * stale, so anyone caching it rechecks if this exception * applies to them. */ rt = rcu_dereference(nhc->nhc_rth_input); if (rt) WRITE_ONCE(rt->dst.obsolete, DST_OBSOLETE_KILL); for_each_possible_cpu(i) { struct rtable __rcu **prt; prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i); rt = rcu_dereference(*prt); if (rt) WRITE_ONCE(rt->dst.obsolete, DST_OBSOLETE_KILL); } } fnhe->fnhe_stamp = jiffies; out_unlock: spin_unlock_bh(&fnhe_lock); } static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4, bool kill_route) { __be32 new_gw = icmp_hdr(skb)->un.gateway; __be32 old_gw = ip_hdr(skb)->saddr; struct net_device *dev = skb->dev; struct in_device *in_dev; struct fib_result res; struct neighbour *n; struct net *net; switch (icmp_hdr(skb)->code & 7) { case ICMP_REDIR_NET: case ICMP_REDIR_NETTOS: case ICMP_REDIR_HOST: case ICMP_REDIR_HOSTTOS: break; default: return; } if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw) return; in_dev = __in_dev_get_rcu(dev); if (!in_dev) return; net = dev_net(dev); if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) || ipv4_is_zeronet(new_gw)) goto reject_redirect; if (!IN_DEV_SHARED_MEDIA(in_dev)) { if (!inet_addr_onlink(in_dev, new_gw, old_gw)) goto reject_redirect; if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev)) goto reject_redirect; } else { if (inet_addr_type(net, new_gw) != RTN_UNICAST) goto reject_redirect; } n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw); if (!n) n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev); if (!IS_ERR(n)) { if (!(READ_ONCE(n->nud_state) & NUD_VALID)) { neigh_event_send(n, NULL); } else { if (fib_lookup(net, fl4, &res, 0) == 0) { struct fib_nh_common *nhc; fib_select_path(net, &res, fl4, skb); nhc = FIB_RES_NHC(res); update_or_create_fnhe(nhc, fl4->daddr, new_gw, 0, false, jiffies + ip_rt_gc_timeout); } if (kill_route) WRITE_ONCE(rt->dst.obsolete, DST_OBSOLETE_KILL); call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); } neigh_release(n); } return; reject_redirect: #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev)) { const struct iphdr *iph = (const struct iphdr *) skb->data; __be32 daddr = iph->daddr; __be32 saddr = iph->saddr; net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n" " Advised path = %pI4 -> %pI4\n", &old_gw, dev->name, &new_gw, &saddr, &daddr); } #endif ; } static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { struct rtable *rt; struct flowi4 fl4; const struct iphdr *iph = (const struct iphdr *) skb->data; struct net *net = dev_net(skb->dev); int oif = skb->dev->ifindex; u8 prot = iph->protocol; u32 mark = skb->mark; __u8 tos = iph->tos; rt = dst_rtable(dst); __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0); __ip_do_redirect(rt, skb, &fl4, true); } static void ipv4_negative_advice(struct sock *sk, struct dst_entry *dst) { struct rtable *rt = dst_rtable(dst); if ((READ_ONCE(dst->obsolete) > 0) || (rt->rt_flags & 
RTCF_REDIRECTED) || READ_ONCE(rt->dst.expires)) sk_dst_reset(sk); } /* * Algorithm: * 1. The first ip_rt_redirect_number redirects are sent * with exponential backoff, then we stop sending them at all, * assuming that the host ignores our redirects. * 2. If we did not see packets requiring redirects * during ip_rt_redirect_silence, we assume that the host * forgot redirected route and start to send redirects again. * * This algorithm is much cheaper and more intelligent than dumb load limiting * in icmp.c. * * NOTE. Do not forget to inhibit load limiting for redirects (redundant) * and "frag. need" (breaks PMTU discovery) in icmp.c. */ void ip_rt_send_redirect(struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct in_device *in_dev; struct inet_peer *peer; struct net *net; int log_martians; int vif; rcu_read_lock(); in_dev = __in_dev_get_rcu(rt->dst.dev); if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) { rcu_read_unlock(); return; } log_martians = IN_DEV_LOG_MARTIANS(in_dev); vif = l3mdev_master_ifindex_rcu(rt->dst.dev); net = dev_net(rt->dst.dev); peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif); if (!peer) { rcu_read_unlock(); icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt_nexthop(rt, ip_hdr(skb)->daddr)); return; } /* No redirected packets during ip_rt_redirect_silence; * reset the algorithm. */ if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) { peer->rate_tokens = 0; peer->n_redirects = 0; } /* Too many ignored redirects; do not send anything * set dst.rate_last to the last seen redirected packet. */ if (peer->n_redirects >= ip_rt_redirect_number) { peer->rate_last = jiffies; goto out_unlock; } /* Check for load limit; set rate_last to the latest sent * redirect. */ if (peer->n_redirects == 0 || time_after(jiffies, (peer->rate_last + (ip_rt_redirect_load << peer->n_redirects)))) { __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr); icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw); peer->rate_last = jiffies; ++peer->n_redirects; if (IS_ENABLED(CONFIG_IP_ROUTE_VERBOSE) && log_martians && peer->n_redirects == ip_rt_redirect_number) net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", &ip_hdr(skb)->saddr, inet_iif(skb), &ip_hdr(skb)->daddr, &gw); } out_unlock: rcu_read_unlock(); } static int ip_error(struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct net_device *dev = skb->dev; struct in_device *in_dev; struct inet_peer *peer; unsigned long now; struct net *net; SKB_DR(reason); bool send; int code; if (netif_is_l3_master(skb->dev)) { dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif); if (!dev) goto out; } in_dev = __in_dev_get_rcu(dev); /* IP on this device is disabled. 
*/ if (!in_dev) goto out; net = dev_net(rt->dst.dev); if (!IN_DEV_FORWARD(in_dev)) { switch (rt->dst.error) { case EHOSTUNREACH: SKB_DR_SET(reason, IP_INADDRERRORS); __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS); break; case ENETUNREACH: SKB_DR_SET(reason, IP_INNOROUTES); __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES); break; } goto out; } switch (rt->dst.error) { case EINVAL: default: goto out; case EHOSTUNREACH: code = ICMP_HOST_UNREACH; break; case ENETUNREACH: code = ICMP_NET_UNREACH; SKB_DR_SET(reason, IP_INNOROUTES); __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES); break; case EACCES: code = ICMP_PKT_FILTERED; break; } rcu_read_lock(); peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, l3mdev_master_ifindex_rcu(skb->dev)); send = true; if (peer) { now = jiffies; peer->rate_tokens += now - peer->rate_last; if (peer->rate_tokens > ip_rt_error_burst) peer->rate_tokens = ip_rt_error_burst; peer->rate_last = now; if (peer->rate_tokens >= ip_rt_error_cost) peer->rate_tokens -= ip_rt_error_cost; else send = false; } rcu_read_unlock(); if (send) icmp_send(skb, ICMP_DEST_UNREACH, code, 0); out: kfree_skb_reason(skb, reason); return 0; } static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { struct dst_entry *dst = &rt->dst; struct fib_result res; bool lock = false; struct net *net; u32 old_mtu; if (ip_mtu_locked(dst)) return; old_mtu = ipv4_mtu(dst); if (old_mtu < mtu) return; rcu_read_lock(); net = dst_dev_net_rcu(dst); if (mtu < net->ipv4.ip_rt_min_pmtu) { lock = true; mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu); } if (rt->rt_pmtu == mtu && !lock && time_before(jiffies, READ_ONCE(dst->expires) - net->ipv4.ip_rt_mtu_expires / 2)) goto out; if (fib_lookup(net, fl4, &res, 0) == 0) { struct fib_nh_common *nhc; fib_select_path(net, &res, fl4, NULL); #ifdef CONFIG_IP_ROUTE_MULTIPATH if (fib_info_num_path(res.fi) > 1) { int nhsel; for (nhsel = 0; nhsel < fib_info_num_path(res.fi); nhsel++) { nhc = fib_info_nhc(res.fi, nhsel); update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, jiffies + net->ipv4.ip_rt_mtu_expires); } goto out; } #endif /* CONFIG_IP_ROUTE_MULTIPATH */ nhc = FIB_RES_NHC(res); update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, jiffies + net->ipv4.ip_rt_mtu_expires); } out: rcu_read_unlock(); } static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { struct rtable *rt = dst_rtable(dst); struct flowi4 fl4; ip_rt_build_flow_key(&fl4, sk, skb); /* Don't make lookup fail for bridged encapsulations */ if (skb && netif_is_any_bridge_port(skb->dev)) fl4.flowi4_oif = 0; __ip_rt_update_pmtu(rt, &fl4, mtu); } void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, u8 protocol) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; u32 mark = IP4_REPLY_MARK(net, skb->mark); __build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, mark, 0); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_rt_update_pmtu(rt, &fl4, mtu); ip_rt_put(rt); } } EXPORT_SYMBOL_GPL(ipv4_update_pmtu); static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0); if (!fl4.flowi4_mark) fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark); rt = __ip_route_output_key(sock_net(sk), &fl4); if (!IS_ERR(rt)) { __ip_rt_update_pmtu(rt, &fl4, mtu); ip_rt_put(rt); } } 
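/* Socket-aware PMTU update.
 *
 * With the socket bottom-half locked: sockets that do not accept PMTU
 * information are left alone; if the socket is owned by user context or
 * has no cached dst, fall back to the flow-based __ipv4_sk_update_pmtu()
 * above.  Otherwise the cached route is updated in place, re-resolving it
 * (and caching the fresh dst on the socket) when the old one no longer
 * validates.
 */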
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; struct dst_entry *odst = NULL; bool new = false; struct net *net = sock_net(sk); bh_lock_sock(sk); if (!ip_sk_accept_pmtu(sk)) goto out; odst = sk_dst_get(sk); if (sock_owned_by_user(sk) || !odst) { __ipv4_sk_update_pmtu(skb, sk, mtu); goto out; } __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0); rt = dst_rtable(odst); if (READ_ONCE(odst->obsolete) && !odst->ops->check(odst, 0)) { rt = ip_route_output_flow(sock_net(sk), &fl4, sk); if (IS_ERR(rt)) goto out; new = true; } __ip_rt_update_pmtu(dst_rtable(xfrm_dst_path(&rt->dst)), &fl4, mtu); if (!dst_check(&rt->dst, 0)) { if (new) dst_release(&rt->dst); rt = ip_route_output_flow(sock_net(sk), &fl4, sk); if (IS_ERR(rt)) goto out; new = true; } if (new) sk_dst_set(sk, &rt->dst); out: bh_unlock_sock(sk); dst_release(odst); } EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); void ipv4_redirect(struct sk_buff *skb, struct net *net, int oif, u8 protocol) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; __build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, 0, 0); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_do_redirect(rt, skb, &fl4, false); ip_rt_put(rt); } } EXPORT_SYMBOL_GPL(ipv4_redirect); void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; struct net *net = sock_net(sk); __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_do_redirect(rt, skb, &fl4, false); ip_rt_put(rt); } } EXPORT_SYMBOL_GPL(ipv4_sk_redirect); INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { struct rtable *rt = dst_rtable(dst); /* All IPV4 dsts are created with ->obsolete set to the value * DST_OBSOLETE_FORCE_CHK which forces validation calls down * into this function always. * * When a PMTU/redirect information update invalidates a route, * this is indicated by setting obsolete to DST_OBSOLETE_KILL or * DST_OBSOLETE_DEAD. */ if (READ_ONCE(dst->obsolete) != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt)) return NULL; return dst; } EXPORT_INDIRECT_CALLABLE(ipv4_dst_check); static void ipv4_send_dest_unreach(struct sk_buff *skb) { struct inet_skb_parm parm; struct net_device *dev; int res; /* Recompile ip options since IPCB may not be valid anymore. * Also check we have a reasonable ipv4 header. */ if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) || ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5) return; memset(&parm, 0, sizeof(parm)); if (ip_hdr(skb)->ihl > 5) { if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4)) return; parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); rcu_read_lock(); dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev; res = __ip_options_compile(dev_net(dev), &parm.opt, skb, NULL); rcu_read_unlock(); if (res) return; } __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &parm); } static void ipv4_link_failure(struct sk_buff *skb) { struct rtable *rt; ipv4_send_dest_unreach(skb); rt = skb_rtable(skb); if (rt) dst_set_expires(&rt->dst, 0); } static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb) { pr_debug("%s: %pI4 -> %pI4, %s\n", __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, skb->dev ? 
skb->dev->name : "?"); kfree_skb(skb); WARN_ON(1); return 0; } /* * We do not cache source address of outgoing interface, * because it is used only by IP RR, TS and SRR options, * so that it out of fast path. * * BTW remember: "addr" is allowed to be not aligned * in IP options! */ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) { __be32 src; if (rt_is_output_route(rt)) src = ip_hdr(skb)->saddr; else { struct fib_result res; struct iphdr *iph = ip_hdr(skb); struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, .flowi4_dscp = ip4h_dscp(iph), .flowi4_oif = rt->dst.dev->ifindex, .flowi4_iif = skb->dev->ifindex, .flowi4_mark = skb->mark, }; rcu_read_lock(); if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0) src = fib_result_prefsrc(dev_net(rt->dst.dev), &res); else src = inet_select_addr(rt->dst.dev, rt_nexthop(rt, iph->daddr), RT_SCOPE_UNIVERSE); rcu_read_unlock(); } memcpy(addr, &src, 4); } #ifdef CONFIG_IP_ROUTE_CLASSID static void set_class_tag(struct rtable *rt, u32 tag) { if (!(rt->dst.tclassid & 0xFFFF)) rt->dst.tclassid |= tag & 0xFFFF; if (!(rt->dst.tclassid & 0xFFFF0000)) rt->dst.tclassid |= tag & 0xFFFF0000; } #endif static unsigned int ipv4_default_advmss(const struct dst_entry *dst) { unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr); unsigned int advmss; struct net *net; rcu_read_lock(); net = dst_dev_net_rcu(dst); advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, net->ipv4.ip_rt_min_advmss); rcu_read_unlock(); return min(advmss, IPV4_MAX_PMTU - header_size); } INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst) { return ip_dst_mtu_maybe_forward(dst, false); } EXPORT_INDIRECT_CALLABLE(ipv4_mtu); static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr) { struct fnhe_hash_bucket *hash; struct fib_nh_exception *fnhe, __rcu **fnhe_p; u32 hval = fnhe_hashfun(daddr); spin_lock_bh(&fnhe_lock); hash = rcu_dereference_protected(nhc->nhc_exceptions, lockdep_is_held(&fnhe_lock)); hash += hval; fnhe_p = &hash->chain; fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock)); while (fnhe) { if (fnhe->fnhe_daddr == daddr) { rcu_assign_pointer(*fnhe_p, rcu_dereference_protected( fnhe->fnhe_next, lockdep_is_held(&fnhe_lock))); /* set fnhe_daddr to 0 to ensure it won't bind with * new dsts in rt_bind_exception(). */ fnhe->fnhe_daddr = 0; fnhe_flush_routes(fnhe); kfree_rcu(fnhe, rcu); break; } fnhe_p = &fnhe->fnhe_next; fnhe = rcu_dereference_protected(fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)); } spin_unlock_bh(&fnhe_lock); } static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc, __be32 daddr) { struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions); struct fib_nh_exception *fnhe; u32 hval; if (!hash) return NULL; hval = fnhe_hashfun(daddr); for (fnhe = rcu_dereference(hash[hval].chain); fnhe; fnhe = rcu_dereference(fnhe->fnhe_next)) { if (fnhe->fnhe_daddr == daddr) { if (fnhe->fnhe_expires && time_after(jiffies, fnhe->fnhe_expires)) { ip_del_fnhe(nhc, daddr); break; } return fnhe; } } return NULL; } /* MTU selection: * 1. mtu on route is locked - use it * 2. mtu from nexthop exception * 3. 
mtu from egress device */ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr) { struct fib_nh_common *nhc = res->nhc; struct net_device *dev = nhc->nhc_dev; struct fib_info *fi = res->fi; u32 mtu = 0; if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) || fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU)) mtu = fi->fib_mtu; if (likely(!mtu)) { struct fib_nh_exception *fnhe; fnhe = find_exception(nhc, daddr); if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires)) mtu = fnhe->fnhe_pmtu; } if (likely(!mtu)) mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU); return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu); } static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, __be32 daddr, const bool do_cache) { bool ret = false; spin_lock_bh(&fnhe_lock); if (daddr == fnhe->fnhe_daddr) { struct rtable __rcu **porig; struct rtable *orig; int genid = fnhe_genid(dev_net(rt->dst.dev)); if (rt_is_input_route(rt)) porig = &fnhe->fnhe_rth_input; else porig = &fnhe->fnhe_rth_output; orig = rcu_dereference(*porig); if (fnhe->fnhe_genid != genid) { fnhe->fnhe_genid = genid; fnhe->fnhe_gw = 0; fnhe->fnhe_pmtu = 0; fnhe->fnhe_expires = 0; fnhe->fnhe_mtu_locked = false; fnhe_flush_routes(fnhe); orig = NULL; } fill_route_from_fnhe(rt, fnhe); if (!rt->rt_gw4) { rt->rt_gw4 = daddr; rt->rt_gw_family = AF_INET; } if (do_cache) { dst_hold(&rt->dst); rcu_assign_pointer(*porig, rt); if (orig) { dst_dev_put(&orig->dst); dst_release(&orig->dst); } ret = true; } fnhe->fnhe_stamp = jiffies; } spin_unlock_bh(&fnhe_lock); return ret; } static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt) { struct rtable *orig, *prev, **p; bool ret = true; if (rt_is_input_route(rt)) { p = (struct rtable **)&nhc->nhc_rth_input; } else { p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output); } orig = *p; /* hold dst before doing cmpxchg() to avoid race condition * on this dst */ dst_hold(&rt->dst); prev = cmpxchg(p, orig, rt); if (prev == orig) { if (orig) { rt_add_uncached_list(orig); dst_release(&orig->dst); } } else { dst_release(&rt->dst); ret = false; } return ret; } struct uncached_list { spinlock_t lock; struct list_head head; }; static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list); void rt_add_uncached_list(struct rtable *rt) { struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list); rt->dst.rt_uncached_list = ul; spin_lock_bh(&ul->lock); list_add_tail(&rt->dst.rt_uncached, &ul->head); spin_unlock_bh(&ul->lock); } void rt_del_uncached_list(struct rtable *rt) { struct uncached_list *ul = rt->dst.rt_uncached_list; if (ul) { spin_lock_bh(&ul->lock); list_del_init(&rt->dst.rt_uncached); spin_unlock_bh(&ul->lock); } } static void ipv4_dst_destroy(struct dst_entry *dst) { ip_dst_metrics_put(dst); rt_del_uncached_list(dst_rtable(dst)); } void rt_flush_dev(struct net_device *dev) { struct rtable *rt, *safe; int cpu; for_each_possible_cpu(cpu) { struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); if (list_empty(&ul->head)) continue; spin_lock_bh(&ul->lock); list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) { if (rt->dst.dev != dev) continue; rt->dst.dev = blackhole_netdev; netdev_ref_replace(dev, blackhole_netdev, &rt->dst.dev_tracker, GFP_ATOMIC); list_del_init(&rt->dst.rt_uncached); } spin_unlock_bh(&ul->lock); } } static bool rt_cache_valid(const struct rtable *rt) { return rt && READ_ONCE(rt->dst.obsolete) == DST_OBSOLETE_FORCE_CHK && !rt_is_expired(rt); } static void rt_set_nexthop(struct rtable *rt, __be32 daddr, const struct 
fib_result *res, struct fib_nh_exception *fnhe, struct fib_info *fi, u16 type, u32 itag, const bool do_cache) { bool cached = false; if (fi) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) { rt->rt_uses_gateway = 1; rt->rt_gw_family = nhc->nhc_gw_family; /* only INET and INET6 are supported */ if (likely(nhc->nhc_gw_family == AF_INET)) rt->rt_gw4 = nhc->nhc_gw.ipv4; else rt->rt_gw6 = nhc->nhc_gw.ipv6; } ip_dst_init_metrics(&rt->dst, fi->fib_metrics); #ifdef CONFIG_IP_ROUTE_CLASSID if (nhc->nhc_family == AF_INET) { struct fib_nh *nh; nh = container_of(nhc, struct fib_nh, nh_common); rt->dst.tclassid = nh->nh_tclassid; } #endif rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate); if (unlikely(fnhe)) cached = rt_bind_exception(rt, fnhe, daddr, do_cache); else if (do_cache) cached = rt_cache_route(nhc, rt); if (unlikely(!cached)) { /* Routes we intend to cache in nexthop exception or * FIB nexthop have the DST_NOCACHE bit clear. * However, if we are unsuccessful at storing this * route into the cache we really need to set it. */ if (!rt->rt_gw4) { rt->rt_gw_family = AF_INET; rt->rt_gw4 = daddr; } rt_add_uncached_list(rt); } } else rt_add_uncached_list(rt); #ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_MULTIPLE_TABLES set_class_tag(rt, res->tclassid); #endif set_class_tag(rt, itag); #endif } struct rtable *rt_dst_alloc(struct net_device *dev, unsigned int flags, u16 type, bool noxfrm) { struct rtable *rt; rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK, (noxfrm ? DST_NOXFRM : 0)); if (rt) { rt->rt_genid = rt_genid_ipv4(dev_net(dev)); rt->rt_flags = flags; rt->rt_type = type; rt->rt_is_input = 0; rt->rt_iif = 0; rt->rt_pmtu = 0; rt->rt_mtu_locked = 0; rt->rt_uses_gateway = 0; rt->rt_gw_family = 0; rt->rt_gw4 = 0; rt->dst.output = ip_output; if (flags & RTCF_LOCAL) rt->dst.input = ip_local_deliver; } return rt; } EXPORT_SYMBOL(rt_dst_alloc); struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt) { struct rtable *new_rt; new_rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK, rt->dst.flags); if (new_rt) { new_rt->rt_genid = rt_genid_ipv4(dev_net(dev)); new_rt->rt_flags = rt->rt_flags; new_rt->rt_type = rt->rt_type; new_rt->rt_is_input = rt->rt_is_input; new_rt->rt_iif = rt->rt_iif; new_rt->rt_pmtu = rt->rt_pmtu; new_rt->rt_mtu_locked = rt->rt_mtu_locked; new_rt->rt_gw_family = rt->rt_gw_family; if (rt->rt_gw_family == AF_INET) new_rt->rt_gw4 = rt->rt_gw4; else if (rt->rt_gw_family == AF_INET6) new_rt->rt_gw6 = rt->rt_gw6; new_rt->dst.input = READ_ONCE(rt->dst.input); new_rt->dst.output = READ_ONCE(rt->dst.output); new_rt->dst.error = rt->dst.error; new_rt->dst.lastuse = jiffies; new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate); } return new_rt; } EXPORT_SYMBOL(rt_dst_clone); /* called in rcu_read_lock() section */ enum skb_drop_reason ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, struct in_device *in_dev, u32 *itag) { enum skb_drop_reason reason; /* Primary sanity checks. 
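 * Reject packets with no in_device, multicast or limited-broadcast
 * sources, non-IP frames, loopback sources without route_localnet, and
 * zeronet sources unless they are IGMP or link-local multicast;
 * remaining (non-zeronet) sources go through fib_validate_source_reason().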
*/ if (!in_dev) return SKB_DROP_REASON_NOT_SPECIFIED; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) return SKB_DROP_REASON_IP_INVALID_SOURCE; if (skb->protocol != htons(ETH_P_IP)) return SKB_DROP_REASON_INVALID_PROTO; if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev)) return SKB_DROP_REASON_IP_LOCALNET; if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr) && ip_hdr(skb)->protocol != IPPROTO_IGMP) return SKB_DROP_REASON_IP_INVALID_SOURCE; } else { reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0, dev, in_dev, itag); if (reason) return reason; } return SKB_NOT_DROPPED_YET; } /* called in rcu_read_lock() section */ static enum skb_drop_reason ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, int our) { struct in_device *in_dev = __in_dev_get_rcu(dev); unsigned int flags = RTCF_MULTICAST; enum skb_drop_reason reason; struct rtable *rth; u32 itag = 0; reason = ip_mc_validate_source(skb, daddr, saddr, dscp, dev, in_dev, &itag); if (reason) return reason; if (our) flags |= RTCF_LOCAL; if (IN_DEV_ORCONF(in_dev, NOPOLICY)) IPCB(skb)->flags |= IPSKB_NOPOLICY; rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, false); if (!rth) return SKB_DROP_REASON_NOMEM; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif rth->dst.output = ip_rt_bug; rth->rt_is_input= 1; #ifdef CONFIG_IP_MROUTE if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) rth->dst.input = ip_mr_input; #endif RT_CACHE_STAT_INC(in_slow_mc); skb_dst_drop(skb); skb_dst_set(skb, &rth->dst); return SKB_NOT_DROPPED_YET; } static void ip_handle_martian_source(struct net_device *dev, struct in_device *in_dev, struct sk_buff *skb, __be32 daddr, __be32 saddr) { RT_CACHE_STAT_INC(in_martian_src); #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) { /* * RFC1812 recommendation, if source is martian, * the only hint is MAC header. */ pr_warn("martian source (src=%pI4, dst=%pI4, dev=%s)\n", &saddr, &daddr, dev->name); if (dev->hard_header_len && skb_mac_header_was_set(skb)) { print_hex_dump(KERN_WARNING, "ll header: ", DUMP_PREFIX_OFFSET, 16, 1, skb_mac_header(skb), dev->hard_header_len, false); } } #endif } /* called in rcu_read_lock() section */ static enum skb_drop_reason __mkroute_input(struct sk_buff *skb, const struct fib_result *res, struct in_device *in_dev, __be32 daddr, __be32 saddr, dscp_t dscp) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct fib_nh_common *nhc = FIB_RES_NHC(*res); struct net_device *dev = nhc->nhc_dev; struct fib_nh_exception *fnhe; struct rtable *rth; int err; struct in_device *out_dev; bool do_cache; u32 itag = 0; /* get a working reference to the output device */ out_dev = __in_dev_get_rcu(dev); if (!out_dev) { net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n"); return reason; } err = fib_validate_source(skb, saddr, daddr, dscp, FIB_RES_OIF(*res), in_dev->dev, in_dev, &itag); if (err < 0) { reason = -err; ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); goto cleanup; } do_cache = res->fi && !itag; if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && skb->protocol == htons(ETH_P_IP)) { __be32 gw; gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0; if (IN_DEV_SHARED_MEDIA(out_dev) || inet_addr_onlink(out_dev, saddr, gw)) IPCB(skb)->flags |= IPSKB_DOREDIRECT; } if (skb->protocol != htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not create route, if it is * invalid for proxy arp. 
DNAT routes are always valid. * * Proxy arp feature have been extended to allow, ARP * replies back to the same interface, to support * Private VLAN switch technologies. See arp.c. */ if (out_dev == in_dev && IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) { reason = SKB_DROP_REASON_ARP_PVLAN_DISABLE; goto cleanup; } } if (IN_DEV_ORCONF(in_dev, NOPOLICY)) IPCB(skb)->flags |= IPSKB_NOPOLICY; fnhe = find_exception(nhc, daddr); if (do_cache) { if (fnhe) rth = rcu_dereference(fnhe->fnhe_rth_input); else rth = rcu_dereference(nhc->nhc_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); goto out; } } rth = rt_dst_alloc(out_dev->dev, 0, res->type, IN_DEV_ORCONF(out_dev, NOXFRM)); if (!rth) { reason = SKB_DROP_REASON_NOMEM; goto cleanup; } rth->rt_is_input = 1; RT_CACHE_STAT_INC(in_slow_tot); rth->dst.input = ip_forward; rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag, do_cache); lwtunnel_set_redirect(&rth->dst); skb_dst_set(skb, &rth->dst); out: reason = SKB_NOT_DROPPED_YET; cleanup: return reason; } #ifdef CONFIG_IP_ROUTE_MULTIPATH /* To make ICMP packets follow the right flow, the multipath hash is * calculated from the inner IP addresses. */ static void ip_multipath_l3_keys(const struct sk_buff *skb, struct flow_keys *hash_keys) { const struct iphdr *outer_iph = ip_hdr(skb); const struct iphdr *key_iph = outer_iph; const struct iphdr *inner_iph; const struct icmphdr *icmph; struct iphdr _inner_iph; struct icmphdr _icmph; if (likely(outer_iph->protocol != IPPROTO_ICMP)) goto out; if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0)) goto out; icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph), &_icmph); if (!icmph) goto out; if (!icmp_is_err(icmph->type)) goto out; inner_iph = skb_header_pointer(skb, outer_iph->ihl * 4 + sizeof(_icmph), sizeof(_inner_iph), &_inner_iph); if (!inner_iph) goto out; key_iph = inner_iph; out: hash_keys->addrs.v4addrs.src = key_iph->saddr; hash_keys->addrs.v4addrs.dst = key_iph->daddr; } static u32 fib_multipath_custom_hash_outer(const struct net *net, const struct sk_buff *skb, bool *p_has_inner) { u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields); struct flow_keys keys, hash_keys; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) hash_keys.basic.ip_proto = keys.basic.ip_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) hash_keys.ports.src = keys.ports.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = keys.ports.dst; *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION); return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 fib_multipath_custom_hash_inner(const struct net *net, const struct sk_buff *skb, bool has_inner) { u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields); struct flow_keys keys, hash_keys; /* We assume the packet carries an encapsulation, but if none was * encountered during dissection of the outer flow, then there is no * point in calling the flow dissector again. 
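 * (has_inner is the FLOW_DIS_ENCAPSULATION result recorded by the outer
 * hash pass, and defaults to true when no outer fields are configured.)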
*/ if (!has_inner) return 0; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); skb_flow_dissect_flow_keys(skb, &keys, 0); if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION)) return 0; if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL) hash_keys.tags.flow_label = keys.tags.flow_label; } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO) hash_keys.basic.ip_proto = keys.basic.ip_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT) hash_keys.ports.src = keys.ports.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) hash_keys.ports.dst = keys.ports.dst; return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 fib_multipath_custom_hash_skb(const struct net *net, const struct sk_buff *skb) { u32 mhash, mhash_inner; bool has_inner = true; mhash = fib_multipath_custom_hash_outer(net, skb, &has_inner); mhash_inner = fib_multipath_custom_hash_inner(net, skb, has_inner); return jhash_2words(mhash, mhash_inner, 0); } static u32 fib_multipath_custom_hash_fl4(const struct net *net, const struct flowi4 *fl4) { u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields); struct flow_keys hash_keys; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) hash_keys.addrs.v4addrs.src = fl4->saddr; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) hash_keys.addrs.v4addrs.dst = fl4->daddr; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) hash_keys.basic.ip_proto = fl4->flowi4_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) { if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT) hash_keys.ports.src = (__force __be16)get_random_u16(); else hash_keys.ports.src = fl4->fl4_sport; } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = fl4->fl4_dport; return fib_multipath_hash_from_keys(net, &hash_keys); } /* if skb is set it will be used and fl4 can be NULL */ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, const struct sk_buff *skb, struct flow_keys *flkeys) { u32 multipath_hash = fl4 ? 
fl4->flowi4_multipath_hash : 0; struct flow_keys hash_keys; u32 mhash = 0; switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) { case 0: memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (skb) { ip_multipath_l3_keys(skb, &hash_keys); } else { hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; } mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 1: /* skb is currently provided only when forwarding */ if (skb) { unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; struct flow_keys keys; /* short-circuit if we already have L4 hash present */ if (skb->l4_hash) return skb_get_hash_raw(skb) >> 1; memset(&hash_keys, 0, sizeof(hash_keys)); if (!flkeys) { skb_flow_dissect_flow_keys(skb, &keys, flag); flkeys = &keys; } hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src; hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst; hash_keys.ports.src = flkeys->ports.src; hash_keys.ports.dst = flkeys->ports.dst; hash_keys.basic.ip_proto = flkeys->basic.ip_proto; } else { memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT) hash_keys.ports.src = (__force __be16)get_random_u16(); else hash_keys.ports.src = fl4->fl4_sport; hash_keys.ports.dst = fl4->fl4_dport; hash_keys.basic.ip_proto = fl4->flowi4_proto; } mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 2: memset(&hash_keys, 0, sizeof(hash_keys)); /* skb is currently provided only when forwarding */ if (skb) { struct flow_keys keys; skb_flow_dissect_flow_keys(skb, &keys, 0); /* Inner can be v4 or v6 */ if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; hash_keys.tags.flow_label = keys.tags.flow_label; hash_keys.basic.ip_proto = keys.basic.ip_proto; } else { /* Same as case 0 */ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; ip_multipath_l3_keys(skb, &hash_keys); } } else { /* Same as case 0 */ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; } mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 3: if (skb) mhash = fib_multipath_custom_hash_skb(net, skb); else mhash = fib_multipath_custom_hash_fl4(net, fl4); break; } if (multipath_hash) mhash = jhash_2words(mhash, multipath_hash, 0); return mhash >> 1; } #endif /* CONFIG_IP_ROUTE_MULTIPATH */ static enum skb_drop_reason ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, struct in_device *in_dev, __be32 daddr, __be32 saddr, dscp_t dscp, struct flow_keys *hkeys) { #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi && fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys); fib_select_multipath(res, h, NULL); IPCB(skb)->flags |= IPSKB_MULTIPATH; } #endif /* create a routing cache entry */ return __mkroute_input(skb, res, in_dev, daddr, saddr, dscp); } /* 
Implements all the saddr-related checks as ip_route_input_slow(), * assuming daddr is valid and the destination is not a local broadcast one. * Uses the provided hint instead of performing a route lookup. */ enum skb_drop_reason ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, const struct sk_buff *hint) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); struct rtable *rt = skb_rtable(hint); struct net *net = dev_net(dev); u32 tag = 0; if (!in_dev) return reason; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) { reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; } if (ipv4_is_zeronet(saddr)) { reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; } if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) { reason = SKB_DROP_REASON_IP_LOCALNET; goto martian_source; } if (!(rt->rt_flags & RTCF_LOCAL)) goto skip_validate_source; reason = fib_validate_source_reason(skb, saddr, daddr, dscp, 0, dev, in_dev, &tag); if (reason) goto martian_source; skip_validate_source: skb_dst_copy(skb, hint); return SKB_NOT_DROPPED_YET; martian_source: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); return reason; } /* get device for dst_alloc with local routes */ static struct net_device *ip_rt_get_dev(struct net *net, const struct fib_result *res) { struct fib_nh_common *nhc = res->fi ? res->nhc : NULL; struct net_device *dev = NULL; if (nhc) dev = l3mdev_master_dev_rcu(nhc->nhc_dev); return dev ? : net->loopback_dev; } /* * NOTE. We drop all the packets that has local source * addresses, because every properly looped back packet * must have correct destination already attached by output routine. * Changes in the enforced policies must be applied also to * ip_route_use_hint(). * * Such approach solves two big problems: * 1. Not simplex devices are handled properly. * 2. IP spoofing attempts are filtered with 100% of guarantee. * called with rcu_read_lock() */ static enum skb_drop_reason ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, struct fib_result *res) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); struct flow_keys *flkeys = NULL, _flkeys; struct net *net = dev_net(dev); struct ip_tunnel_info *tun_info; int err = -EINVAL; unsigned int flags = 0; u32 itag = 0; struct rtable *rth; struct flowi4 fl4; bool do_cache = true; /* IP on this device is disabled. */ if (!in_dev) goto out; /* Check for the most weird martians, which can be not detected * by fib_lookup. */ tun_info = skb_tunnel_info(skb); if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id; else fl4.flowi4_tun_key.tun_id = 0; skb_dst_drop(skb); if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) { reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; } res->fi = NULL; res->table = NULL; if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) goto brd_input; /* Accept zero addresses only to limited broadcast; * I even do not know to fix it or not. 
Waiting for complains :-) */ if (ipv4_is_zeronet(saddr)) { reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; } if (ipv4_is_zeronet(daddr)) { reason = SKB_DROP_REASON_IP_INVALID_DEST; goto martian_destination; } /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(), * and call it once if daddr or/and saddr are loopback addresses */ if (ipv4_is_loopback(daddr)) { if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) { reason = SKB_DROP_REASON_IP_LOCALNET; goto martian_destination; } } else if (ipv4_is_loopback(saddr)) { if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) { reason = SKB_DROP_REASON_IP_LOCALNET; goto martian_source; } } /* * Now we are ready to route packet. */ fl4.flowi4_l3mdev = 0; fl4.flowi4_oif = 0; fl4.flowi4_iif = dev->ifindex; fl4.flowi4_mark = skb->mark; fl4.flowi4_dscp = dscp; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = 0; fl4.daddr = daddr; fl4.saddr = saddr; fl4.flowi4_uid = sock_net_uid(net, NULL); fl4.flowi4_multipath_hash = 0; if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) { flkeys = &_flkeys; } else { fl4.flowi4_proto = 0; fl4.fl4_sport = 0; fl4.fl4_dport = 0; } err = fib_lookup(net, &fl4, res, 0); if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) err = -EHOSTUNREACH; goto no_route; } if (res->type == RTN_BROADCAST) { if (IN_DEV_BFORWARD(in_dev)) goto make_route; /* not do cache if bc_forwarding is enabled */ if (IPV4_DEVCONF_ALL_RO(net, BC_FORWARDING)) do_cache = false; goto brd_input; } err = -EINVAL; if (res->type == RTN_LOCAL) { reason = fib_validate_source_reason(skb, saddr, daddr, dscp, 0, dev, in_dev, &itag); if (reason) goto martian_source; goto local_input; } if (!IN_DEV_FORWARD(in_dev)) { err = -EHOSTUNREACH; goto no_route; } if (res->type != RTN_UNICAST) { reason = SKB_DROP_REASON_IP_INVALID_DEST; goto martian_destination; } make_route: reason = ip_mkroute_input(skb, res, in_dev, daddr, saddr, dscp, flkeys); out: return reason; brd_input: if (skb->protocol != htons(ETH_P_IP)) { reason = SKB_DROP_REASON_INVALID_PROTO; goto out; } if (!ipv4_is_zeronet(saddr)) { reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0, dev, in_dev, &itag); if (reason) goto martian_source; } flags |= RTCF_BROADCAST; res->type = RTN_BROADCAST; RT_CACHE_STAT_INC(in_brd); local_input: if (IN_DEV_ORCONF(in_dev, NOPOLICY)) IPCB(skb)->flags |= IPSKB_NOPOLICY; do_cache &= res->fi && !itag; if (do_cache) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); rth = rcu_dereference(nhc->nhc_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); reason = SKB_NOT_DROPPED_YET; goto out; } } rth = rt_dst_alloc(ip_rt_get_dev(net, res), flags | RTCF_LOCAL, res->type, false); if (!rth) goto e_nobufs; rth->dst.output= ip_rt_bug; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif rth->rt_is_input = 1; RT_CACHE_STAT_INC(in_slow_tot); if (res->type == RTN_UNREACHABLE) { rth->dst.input= ip_error; rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } if (do_cache) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate); if (lwtunnel_input_redirect(rth->dst.lwtstate)) { WARN_ON(rth->dst.input == lwtunnel_input); rth->dst.lwtstate->orig_input = rth->dst.input; rth->dst.input = lwtunnel_input; } if (unlikely(!rt_cache_route(nhc, rth))) rt_add_uncached_list(rth); } skb_dst_set(skb, &rth->dst); reason = SKB_NOT_DROPPED_YET; goto out; no_route: RT_CACHE_STAT_INC(in_no_route); res->type = RTN_UNREACHABLE; res->fi = NULL; res->table = NULL; goto local_input; /* * Do not cache martian addresses: 
they should be logged (RFC1812) */ martian_destination: RT_CACHE_STAT_INC(in_martian_dst); #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev)) net_warn_ratelimited("martian destination (src=%pI4, dst=%pI4, dev=%s)\n", &saddr, &daddr, dev->name); #endif goto out; e_nobufs: reason = SKB_DROP_REASON_NOMEM; goto out; martian_source: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); goto out; } /* called with rcu_read_lock held */ static enum skb_drop_reason ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, struct fib_result *res) { /* Multicast recognition logic is moved from route cache to here. * The problem was that too many Ethernet cards have broken/missing * hardware multicast filters :-( As result the host on multicasting * network acquires a lot of useless route cache entries, sort of * SDR messages from all the world. Now we try to get rid of them. * Really, provided software IP multicast filter is organized * reasonably (at least, hashed), it does not result in a slowdown * comparing with route cache reject entries. * Note, that multicast routers are not affected, because * route cache entry is created eventually. */ if (ipv4_is_multicast(daddr)) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); int our = 0; if (!in_dev) return reason; our = ip_check_mc_rcu(in_dev, daddr, saddr, ip_hdr(skb)->protocol); /* check l3 master if no match yet */ if (!our && netif_is_l3_slave(dev)) { struct in_device *l3_in_dev; l3_in_dev = __in_dev_get_rcu(skb->dev); if (l3_in_dev) our = ip_check_mc_rcu(l3_in_dev, daddr, saddr, ip_hdr(skb)->protocol); } if (our #ifdef CONFIG_IP_MROUTE || (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) #endif ) { reason = ip_route_input_mc(skb, daddr, saddr, dscp, dev, our); } return reason; } return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res); } enum skb_drop_reason ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev) { enum skb_drop_reason reason; struct fib_result res; rcu_read_lock(); reason = ip_route_input_rcu(skb, daddr, saddr, dscp, dev, &res); rcu_read_unlock(); return reason; } EXPORT_SYMBOL(ip_route_input_noref); /* called with rcu_read_lock() */ static struct rtable *__mkroute_output(const struct fib_result *res, const struct flowi4 *fl4, int orig_oif, struct net_device *dev_out, unsigned int flags) { struct fib_info *fi = res->fi; struct fib_nh_exception *fnhe; struct in_device *in_dev; u16 type = res->type; struct rtable *rth; bool do_cache; in_dev = __in_dev_get_rcu(dev_out); if (!in_dev) return ERR_PTR(-EINVAL); if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK) && !netif_is_l3_master(dev_out)) return ERR_PTR(-EINVAL); if (ipv4_is_lbcast(fl4->daddr)) { type = RTN_BROADCAST; /* reset fi to prevent gateway resolution */ fi = NULL; } else if (ipv4_is_multicast(fl4->daddr)) { type = RTN_MULTICAST; } else if (ipv4_is_zeronet(fl4->daddr)) { return ERR_PTR(-EINVAL); } if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; do_cache = true; if (type == RTN_BROADCAST) { flags |= RTCF_BROADCAST | RTCF_LOCAL; } else if (type == RTN_MULTICAST) { flags |= RTCF_MULTICAST | RTCF_LOCAL; if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, fl4->flowi4_proto)) flags &= ~RTCF_LOCAL; else do_cache = false; /* If multicast route do not exist use * default one, but do not gateway in this case. 
* Yes, it is hack. */ if (fi && res->prefixlen < 4) fi = NULL; } else if ((type == RTN_LOCAL) && (orig_oif != 0) && (orig_oif != dev_out->ifindex)) { /* For local routes that require a particular output interface * we do not want to cache the result. Caching the result * causes incorrect behaviour when there are multiple source * addresses on the interface, the end result being that if the * intended recipient is waiting on that interface for the * packet he won't receive it because it will be delivered on * the loopback interface and the IP_PKTINFO ipi_ifindex will * be set to the loopback interface as well. */ do_cache = false; } fnhe = NULL; do_cache &= fi != NULL; if (fi) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); struct rtable __rcu **prth; fnhe = find_exception(nhc, fl4->daddr); if (!do_cache) goto add; if (fnhe) { prth = &fnhe->fnhe_rth_output; } else { if (unlikely(fl4->flowi4_flags & FLOWI_FLAG_KNOWN_NH && !(nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK))) { do_cache = false; goto add; } prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output); } rth = rcu_dereference(*prth); if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst)) return rth; } add: rth = rt_dst_alloc(dev_out, flags, type, IN_DEV_ORCONF(in_dev, NOXFRM)); if (!rth) return ERR_PTR(-ENOBUFS); rth->rt_iif = orig_oif; RT_CACHE_STAT_INC(out_slow_tot); if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { rth->dst.output = ip_mc_output; RT_CACHE_STAT_INC(out_slow_mc); } #ifdef CONFIG_IP_MROUTE if (type == RTN_MULTICAST) { if (IN_DEV_MFORWARD(in_dev) && !ipv4_is_local_multicast(fl4->daddr)) { rth->dst.input = ip_mr_input; rth->dst.output = ip_mr_output; } } #endif } rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache); lwtunnel_set_redirect(&rth->dst); return rth; } /* * Major route resolver routine. */ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, const struct sk_buff *skb) { struct fib_result res = { .type = RTN_UNSPEC, .fi = NULL, .table = NULL, .tclassid = 0, }; struct rtable *rth; fl4->flowi4_iif = LOOPBACK_IFINDEX; rcu_read_lock(); rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb); rcu_read_unlock(); return rth; } EXPORT_SYMBOL_GPL(ip_route_output_key_hash); struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, struct fib_result *res, const struct sk_buff *skb) { struct net_device *dev_out = NULL; int orig_oif = fl4->flowi4_oif; unsigned int flags = 0; struct rtable *rth; int err; if (fl4->saddr) { if (ipv4_is_multicast(fl4->saddr) || ipv4_is_lbcast(fl4->saddr)) { rth = ERR_PTR(-EINVAL); goto out; } rth = ERR_PTR(-ENETUNREACH); /* I removed check for oif == dev_out->oif here. * It was wrong for two reasons: * 1. ip_dev_find(net, saddr) can return wrong iface, if saddr * is assigned to multiple interfaces. * 2. Moreover, we are allowed to send packets with saddr * of another iface. --ANK */ if (fl4->flowi4_oif == 0 && (ipv4_is_multicast(fl4->daddr) || ipv4_is_lbcast(fl4->daddr))) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ dev_out = __ip_dev_find(net, fl4->saddr, false); if (!dev_out) goto out; /* Special hack: user can direct multicasts * and limited broadcast via necessary interface * without fiddling with IP_MULTICAST_IF or IP_PKTINFO. * This hack is not just for fun, it allows * vic,vat and friends to work. * They bind socket to loopback, set ttl to zero * and expect that it will work. 
* From the viewpoint of routing cache they are broken, * because we are not allowed to build multicast path * with loopback source addr (look, routing cache * cannot know, that ttl is zero, so that packet * will not leave this host and route is valid). * Luckily, this hack is good workaround. */ fl4->flowi4_oif = dev_out->ifindex; goto make_route; } if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ if (!__ip_dev_find(net, fl4->saddr, false)) goto out; } } if (fl4->flowi4_oif) { dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif); rth = ERR_PTR(-ENODEV); if (!dev_out) goto out; /* RACE: Check return value of inet_select_addr instead. */ if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { rth = ERR_PTR(-ENETUNREACH); goto out; } if (ipv4_is_local_multicast(fl4->daddr) || ipv4_is_lbcast(fl4->daddr) || fl4->flowi4_proto == IPPROTO_IGMP) { if (!fl4->saddr) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); goto make_route; } if (!fl4->saddr) { if (ipv4_is_multicast(fl4->daddr)) fl4->saddr = inet_select_addr(dev_out, 0, fl4->flowi4_scope); else if (!fl4->daddr) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } } if (!fl4->daddr) { fl4->daddr = fl4->saddr; if (!fl4->daddr) fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); dev_out = net->loopback_dev; fl4->flowi4_oif = LOOPBACK_IFINDEX; res->type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; } err = fib_lookup(net, fl4, res, 0); if (err) { res->fi = NULL; res->table = NULL; if (fl4->flowi4_oif && (ipv4_is_multicast(fl4->daddr) || !fl4->flowi4_l3mdev)) { /* Apparently, routing tables are wrong. Assume, * that the destination is on link. * * WHY? DW. * Because we are allowed to send to iface * even if it has NO routes and NO assigned * addresses. When oif is specified, routing * tables are looked up with only one purpose: * to catch if destination is gatewayed, rather than * direct. Moreover, if MSG_DONTROUTE is set, * we send packet, ignoring both routing tables * and ifaddr state. --ANK * * * We could make it even if oif is unknown, * likely IPv6, but we do not. */ if (fl4->saddr == 0) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); res->type = RTN_UNICAST; goto make_route; } rth = ERR_PTR(err); goto out; } if (res->type == RTN_LOCAL) { if (!fl4->saddr) { if (res->fi->fib_prefsrc) fl4->saddr = res->fi->fib_prefsrc; else fl4->saddr = fl4->daddr; } /* L3 master device is the loopback for that domain */ dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? 
: net->loopback_dev; /* make sure orig_oif points to fib result device even * though packet rx/tx happens over loopback or l3mdev */ orig_oif = FIB_RES_OIF(*res); fl4->flowi4_oif = dev_out->ifindex; flags |= RTCF_LOCAL; goto make_route; } fib_select_path(net, res, fl4, skb); dev_out = FIB_RES_DEV(*res); make_route: rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags); out: return rth; } static struct dst_ops ipv4_dst_blackhole_ops = { .family = AF_INET, .default_advmss = ipv4_default_advmss, .neigh_lookup = ipv4_neigh_lookup, .check = dst_blackhole_check, .cow_metrics = dst_blackhole_cow_metrics, .update_pmtu = dst_blackhole_update_pmtu, .redirect = dst_blackhole_redirect, .mtu = dst_blackhole_mtu, }; struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) { struct rtable *ort = dst_rtable(dst_orig); struct rtable *rt; rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, DST_OBSOLETE_DEAD, 0); if (rt) { struct dst_entry *new = &rt->dst; new->__use = 1; new->input = dst_discard; new->output = dst_discard_out; new->dev = net->loopback_dev; netdev_hold(new->dev, &new->dev_tracker, GFP_ATOMIC); rt->rt_is_input = ort->rt_is_input; rt->rt_iif = ort->rt_iif; rt->rt_pmtu = ort->rt_pmtu; rt->rt_mtu_locked = ort->rt_mtu_locked; rt->rt_genid = rt_genid_ipv4(net); rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; rt->rt_uses_gateway = ort->rt_uses_gateway; rt->rt_gw_family = ort->rt_gw_family; if (rt->rt_gw_family == AF_INET) rt->rt_gw4 = ort->rt_gw4; else if (rt->rt_gw_family == AF_INET6) rt->rt_gw6 = ort->rt_gw6; } dst_release(dst_orig); return rt ? &rt->dst : ERR_PTR(-ENOMEM); } struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, const struct sock *sk) { struct rtable *rt = __ip_route_output_key(net, flp4); if (IS_ERR(rt)) return rt; if (flp4->flowi4_proto) { flp4->flowi4_oif = rt->dst.dev->ifindex; rt = dst_rtable(xfrm_lookup_route(net, &rt->dst, flowi4_to_flowi(flp4), sk, 0)); } return rt; } EXPORT_SYMBOL_GPL(ip_route_output_flow); /* called with rcu_read_lock held */ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, struct rtable *rt, u32 table_id, dscp_t dscp, struct flowi4 *fl4, struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags) { struct rtmsg *r; struct nlmsghdr *nlh; unsigned long expires = 0; u32 error; u32 metrics[RTAX_MAX]; nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags); if (!nlh) return -EMSGSIZE; r = nlmsg_data(nlh); r->rtm_family = AF_INET; r->rtm_dst_len = 32; r->rtm_src_len = 0; r->rtm_tos = inet_dscp_to_dsfield(dscp); r->rtm_table = table_id < 256 ? 
table_id : RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, table_id)) goto nla_put_failure; r->rtm_type = rt->rt_type; r->rtm_scope = RT_SCOPE_UNIVERSE; r->rtm_protocol = RTPROT_UNSPEC; r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; if (rt->rt_flags & RTCF_NOTIFY) r->rtm_flags |= RTM_F_NOTIFY; if (IPCB(skb)->flags & IPSKB_DOREDIRECT) r->rtm_flags |= RTCF_DOREDIRECT; if (nla_put_in_addr(skb, RTA_DST, dst)) goto nla_put_failure; if (src) { r->rtm_src_len = 32; if (nla_put_in_addr(skb, RTA_SRC, src)) goto nla_put_failure; } if (rt->dst.dev && nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) goto nla_put_failure; if (lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) goto nla_put_failure; #ifdef CONFIG_IP_ROUTE_CLASSID if (rt->dst.tclassid && nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) goto nla_put_failure; #endif if (fl4 && !rt_is_input_route(rt) && fl4->saddr != src) { if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) goto nla_put_failure; } if (rt->rt_uses_gateway) { if (rt->rt_gw_family == AF_INET && nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) { goto nla_put_failure; } else if (rt->rt_gw_family == AF_INET6) { int alen = sizeof(struct in6_addr); struct nlattr *nla; struct rtvia *via; nla = nla_reserve(skb, RTA_VIA, alen + 2); if (!nla) goto nla_put_failure; via = nla_data(nla); via->rtvia_family = AF_INET6; memcpy(via->rtvia_addr, &rt->rt_gw6, alen); } } expires = READ_ONCE(rt->dst.expires); if (expires) { unsigned long now = jiffies; if (time_before(now, expires)) expires -= now; else expires = 0; } memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); if (rt->rt_pmtu && expires) metrics[RTAX_MTU - 1] = rt->rt_pmtu; if (rt->rt_mtu_locked && expires) metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU); if (rtnetlink_put_metrics(skb, metrics) < 0) goto nla_put_failure; if (fl4) { if (fl4->flowi4_mark && nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) goto nla_put_failure; if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && nla_put_u32(skb, RTA_UID, from_kuid_munged(current_user_ns(), fl4->flowi4_uid))) goto nla_put_failure; if (rt_is_input_route(rt)) { #ifdef CONFIG_IP_MROUTE if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && IPV4_DEVCONF_ALL_RO(net, MC_FORWARDING)) { int err = ipmr_get_route(net, skb, fl4->saddr, fl4->daddr, r, portid); if (err <= 0) { if (err == 0) return 0; goto nla_put_failure; } } else #endif if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif)) goto nla_put_failure; } } error = rt->dst.error; if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, u32 table_id, struct fnhe_hash_bucket *bucket, int genid, int *fa_index, int fa_start, unsigned int flags) { int i; for (i = 0; i < FNHE_HASH_SIZE; i++) { struct fib_nh_exception *fnhe; for (fnhe = rcu_dereference(bucket[i].chain); fnhe; fnhe = rcu_dereference(fnhe->fnhe_next)) { struct rtable *rt; int err; if (*fa_index < fa_start) goto next; if (fnhe->fnhe_genid != genid) goto next; if (fnhe->fnhe_expires && time_after(jiffies, fnhe->fnhe_expires)) goto next; rt = rcu_dereference(fnhe->fnhe_rth_input); if (!rt) rt = rcu_dereference(fnhe->fnhe_rth_output); if (!rt) goto next; err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt, table_id, 0, NULL, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err) return err; next: (*fa_index)++; } } return 0; } int 
fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb, u32 table_id, struct fib_info *fi, int *fa_index, int fa_start, unsigned int flags) { struct net *net = sock_net(cb->skb->sk); int nhsel, genid = fnhe_genid(net); for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) { struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel); struct fnhe_hash_bucket *bucket; int err; if (nhc->nhc_flags & RTNH_F_DEAD) continue; rcu_read_lock(); bucket = rcu_dereference(nhc->nhc_exceptions); err = 0; if (bucket) err = fnhe_dump_bucket(net, skb, cb, table_id, bucket, genid, fa_index, fa_start, flags); rcu_read_unlock(); if (err) return err; } return 0; } static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst, u8 ip_proto, __be16 sport, __be16 dport) { struct sk_buff *skb; struct iphdr *iph; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return NULL; /* Reserve room for dummy headers, this skb can pass * through good chunk of routing engine. */ skb_reset_mac_header(skb); skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IP); iph = skb_put(skb, sizeof(struct iphdr)); iph->protocol = ip_proto; iph->saddr = src; iph->daddr = dst; iph->version = 0x4; iph->frag_off = 0; iph->ihl = 0x5; skb_set_transport_header(skb, skb->len); switch (iph->protocol) { case IPPROTO_UDP: { struct udphdr *udph; udph = skb_put_zero(skb, sizeof(struct udphdr)); udph->source = sport; udph->dest = dport; udph->len = htons(sizeof(struct udphdr)); udph->check = 0; break; } case IPPROTO_TCP: { struct tcphdr *tcph; tcph = skb_put_zero(skb, sizeof(struct tcphdr)); tcph->source = sport; tcph->dest = dport; tcph->doff = sizeof(struct tcphdr) / 4; tcph->rst = 1; tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), src, dst, 0); break; } case IPPROTO_ICMP: { struct icmphdr *icmph; icmph = skb_put_zero(skb, sizeof(struct icmphdr)); icmph->type = ICMP_ECHO; icmph->code = 0; } } return skb; } static int inet_rtm_valid_getroute_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct rtmsg *rtm; int i, err; rtm = nlmsg_payload(nlh, sizeof(*rtm)); if (!rtm) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for route get request"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || rtm->rtm_type) { NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request"); return -EINVAL; } if (rtm->rtm_flags & ~(RTM_F_NOTIFY | RTM_F_LOOKUP_TABLE | RTM_F_FIB_MATCH)) { NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); if (err) return err; if ((tb[RTA_SRC] && !rtm->rtm_src_len) || (tb[RTA_DST] && !rtm->rtm_dst_len)) { NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4"); return -EINVAL; } for (i = 0; i <= RTA_MAX; i++) { if (!tb[i]) continue; switch (i) { case RTA_IIF: case RTA_OIF: case RTA_SRC: case RTA_DST: case RTA_IP_PROTO: case RTA_SPORT: case RTA_DPORT: case RTA_MARK: case RTA_UID: break; default: NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request"); return -EINVAL; } } return 0; } static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = 
sock_net(in_skb->sk); struct nlattr *tb[RTA_MAX+1]; u32 table_id = RT_TABLE_MAIN; __be16 sport = 0, dport = 0; struct fib_result res = {}; u8 ip_proto = IPPROTO_UDP; struct rtable *rt = NULL; struct sk_buff *skb; struct rtmsg *rtm; struct flowi4 fl4 = {}; __be32 dst = 0; __be32 src = 0; dscp_t dscp; kuid_t uid; u32 iif; int err; int mark; err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack); if (err < 0) return err; rtm = nlmsg_data(nlh); src = nla_get_in_addr_default(tb[RTA_SRC], 0); dst = nla_get_in_addr_default(tb[RTA_DST], 0); iif = nla_get_u32_default(tb[RTA_IIF], 0); mark = nla_get_u32_default(tb[RTA_MARK], 0); dscp = inet_dsfield_to_dscp(rtm->rtm_tos); if (tb[RTA_UID]) uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID])); else uid = (iif ? INVALID_UID : current_uid()); if (tb[RTA_IP_PROTO]) { err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], &ip_proto, AF_INET, extack); if (err) return err; } if (tb[RTA_SPORT]) sport = nla_get_be16(tb[RTA_SPORT]); if (tb[RTA_DPORT]) dport = nla_get_be16(tb[RTA_DPORT]); skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport); if (!skb) return -ENOBUFS; fl4.daddr = dst; fl4.saddr = src; fl4.flowi4_dscp = dscp; fl4.flowi4_oif = nla_get_u32_default(tb[RTA_OIF], 0); fl4.flowi4_mark = mark; fl4.flowi4_uid = uid; if (sport) fl4.fl4_sport = sport; if (dport) fl4.fl4_dport = dport; fl4.flowi4_proto = ip_proto; rcu_read_lock(); if (iif) { struct net_device *dev; dev = dev_get_by_index_rcu(net, iif); if (!dev) { err = -ENODEV; goto errout_rcu; } fl4.flowi4_iif = iif; /* for rt_fill_info */ skb->dev = dev; skb->mark = mark; err = ip_route_input_rcu(skb, dst, src, dscp, dev, &res) ? -EINVAL : 0; rt = skb_rtable(skb); if (err == 0 && rt->dst.error) err = -rt->dst.error; } else { fl4.flowi4_iif = LOOPBACK_IFINDEX; skb->dev = net->loopback_dev; rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb); err = 0; if (IS_ERR(rt)) err = PTR_ERR(rt); else skb_dst_set(skb, &rt->dst); } if (err) goto errout_rcu; if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) table_id = res.table ? 
res.table->tb_id : 0; /* reset skb for netlink reply msg */ skb_trim(skb, 0); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb_reset_mac_header(skb); if (rtm->rtm_flags & RTM_F_FIB_MATCH) { struct fib_rt_info fri; if (!res.fi) { err = fib_props[res.type].error; if (!err) err = -EHOSTUNREACH; goto errout_rcu; } fri.fi = res.fi; fri.tb_id = table_id; fri.dst = res.prefix; fri.dst_len = res.prefixlen; fri.dscp = res.dscp; fri.type = rt->rt_type; fri.offload = 0; fri.trap = 0; fri.offload_failed = 0; if (res.fa_head) { struct fib_alias *fa; hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) { u8 slen = 32 - fri.dst_len; if (fa->fa_slen == slen && fa->tb_id == fri.tb_id && fa->fa_dscp == fri.dscp && fa->fa_info == res.fi && fa->fa_type == fri.type) { fri.offload = READ_ONCE(fa->offload); fri.trap = READ_ONCE(fa->trap); fri.offload_failed = READ_ONCE(fa->offload_failed); break; } } } err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0); } else { err = rt_fill_info(net, dst, src, rt, table_id, res.dscp, &fl4, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); } if (err < 0) goto errout_rcu; rcu_read_unlock(); err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout_free: return err; errout_rcu: rcu_read_unlock(); kfree_skb(skb); goto errout_free; } void ip_rt_multicast_event(struct in_device *in_dev) { rt_cache_flush(dev_net(in_dev->dev)); } #ifdef CONFIG_SYSCTL static int ip_rt_gc_interval __read_mostly = 60 * HZ; static int ip_rt_gc_min_interval __read_mostly = HZ / 2; static int ip_rt_gc_elasticity __read_mostly = 8; static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU; static int ipv4_sysctl_rtcache_flush(const struct ctl_table *__ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = (struct net *)__ctl->extra1; if (write) { rt_cache_flush(net); fnhe_genid_bump(net); return 0; } return -EINVAL; } static struct ctl_table ipv4_route_table[] = { { .procname = "gc_thresh", .data = &ipv4_dst_ops.gc_thresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "max_size", .data = &ip_rt_max_size, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { /* Deprecated. 
Use gc_min_interval_ms */ .procname = "gc_min_interval", .data = &ip_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "gc_min_interval_ms", .data = &ip_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, { .procname = "gc_timeout", .data = &ip_rt_gc_timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "gc_interval", .data = &ip_rt_gc_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "redirect_load", .data = &ip_rt_redirect_load, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "redirect_number", .data = &ip_rt_redirect_number, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "redirect_silence", .data = &ip_rt_redirect_silence, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "error_cost", .data = &ip_rt_error_cost, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "error_burst", .data = &ip_rt_error_burst, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "gc_elasticity", .data = &ip_rt_gc_elasticity, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, }; static const char ipv4_route_flush_procname[] = "flush"; static struct ctl_table ipv4_route_netns_table[] = { { .procname = ipv4_route_flush_procname, .maxlen = sizeof(int), .mode = 0200, .proc_handler = ipv4_sysctl_rtcache_flush, }, { .procname = "min_pmtu", .data = &init_net.ipv4.ip_rt_min_pmtu, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &ip_min_valid_pmtu, }, { .procname = "mtu_expires", .data = &init_net.ipv4.ip_rt_mtu_expires, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "min_adv_mss", .data = &init_net.ipv4.ip_rt_min_advmss, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, }; static __net_init int sysctl_route_net_init(struct net *net) { struct ctl_table *tbl; size_t table_size = ARRAY_SIZE(ipv4_route_netns_table); tbl = ipv4_route_netns_table; if (!net_eq(net, &init_net)) { int i; tbl = kmemdup(tbl, sizeof(ipv4_route_netns_table), GFP_KERNEL); if (!tbl) goto err_dup; /* Don't export non-whitelisted sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) { if (tbl[0].procname != ipv4_route_flush_procname) table_size = 0; } /* Update the variables to point into the current struct net * except for the first element flush */ for (i = 1; i < table_size; i++) tbl[i].data += (void *)net - (void *)&init_net; } tbl[0].extra1 = net; net->ipv4.route_hdr = register_net_sysctl_sz(net, "net/ipv4/route", tbl, table_size); if (!net->ipv4.route_hdr) goto err_reg; return 0; err_reg: if (tbl != ipv4_route_netns_table) kfree(tbl); err_dup: return -ENOMEM; } static __net_exit void sysctl_route_net_exit(struct net *net) { const struct ctl_table *tbl; tbl = net->ipv4.route_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.route_hdr); BUG_ON(tbl == ipv4_route_netns_table); kfree(tbl); } static __net_initdata struct pernet_operations sysctl_route_ops = { .init = sysctl_route_net_init, .exit = sysctl_route_net_exit, }; #endif static __net_init int netns_ip_rt_init(struct net *net) { /* Set default value for namespaceified sysctls */ net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU; net->ipv4.ip_rt_mtu_expires = 
DEFAULT_MTU_EXPIRES; net->ipv4.ip_rt_min_advmss = DEFAULT_MIN_ADVMSS; return 0; } static struct pernet_operations __net_initdata ip_rt_ops = { .init = netns_ip_rt_init, }; static __net_init int rt_genid_init(struct net *net) { atomic_set(&net->ipv4.rt_genid, 0); atomic_set(&net->fnhe_genid, 0); atomic_set(&net->ipv4.dev_addr_genid, get_random_u32()); return 0; } static __net_initdata struct pernet_operations rt_genid_ops = { .init = rt_genid_init, }; static int __net_init ipv4_inetpeer_init(struct net *net) { struct inet_peer_base *bp = kmalloc_obj(*bp); if (!bp) return -ENOMEM; inet_peer_base_init(bp); net->ipv4.peers = bp; return 0; } static void __net_exit ipv4_inetpeer_exit(struct net *net) { struct inet_peer_base *bp = net->ipv4.peers; net->ipv4.peers = NULL; inetpeer_invalidate_tree(bp); kfree(bp); } static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { .init = ipv4_inetpeer_init, .exit = ipv4_inetpeer_exit, }; #ifdef CONFIG_IP_ROUTE_CLASSID struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; #endif /* CONFIG_IP_ROUTE_CLASSID */ static const struct rtnl_msg_handler ip_rt_rtnl_msg_handlers[] __initconst = { {.protocol = PF_INET, .msgtype = RTM_GETROUTE, .doit = inet_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, }; int __init ip_rt_init(void) { void *idents_hash; int cpu; /* For modern hosts, this will use 2 MB of memory */ idents_hash = alloc_large_system_hash("IP idents", sizeof(*ip_idents) + sizeof(*ip_tstamps), 0, 16, /* one bucket per 64 KB */ HASH_ZERO, NULL, &ip_idents_mask, 2048, 256*1024); ip_idents = idents_hash; get_random_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents)); ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents); for_each_possible_cpu(cpu) { struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); INIT_LIST_HEAD(&ul->head); spin_lock_init(&ul->lock); } #ifdef CONFIG_IP_ROUTE_CLASSID ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); #endif ipv4_dst_ops.kmem_cachep = KMEM_CACHE(rtable, SLAB_HWCACHE_ALIGN | SLAB_PANIC); ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; if (dst_entries_init(&ipv4_dst_ops) < 0) panic("IP: failed to allocate ipv4_dst_ops counter\n"); if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); ipv4_dst_ops.gc_thresh = ~0; ip_rt_max_size = INT_MAX; devinet_init(); ip_fib_init(); if (ip_rt_proc_init()) pr_err("Unable to create route proc files\n"); #ifdef CONFIG_XFRM xfrm_init(); xfrm4_init(); #endif rtnl_register_many(ip_rt_rtnl_msg_handlers); #ifdef CONFIG_SYSCTL register_pernet_subsys(&sysctl_route_ops); #endif register_pernet_subsys(&ip_rt_ops); register_pernet_subsys(&rt_genid_ops); register_pernet_subsys(&ipv4_inetpeer_ops); return 0; } #ifdef CONFIG_SYSCTL /* * We really need to sanitize the damn ipv4 init order, then all * this nonsense will go away. */ void __init ip_static_sysctl_init(void) { register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table); } #endif |
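The route sysctls registered above appear under /proc/sys/net/ipv4/route; per ipv4_sysctl_rtcache_flush() the "flush" entry only accepts writes (mode 0200) and triggers rt_cache_flush() plus fnhe_genid_bump(), while min_pmtu is an ordinary read/write integer whose writes are bounded below by ip_min_valid_pmtu. A minimal userspace sketch (editorial illustration, not part of route.c; run as root, error handling kept simple) that drives these two entries:

/* route_sysctl_demo.c - illustrative only */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* Writing any value makes ipv4_sysctl_rtcache_flush() flush the
	 * route cache; the file is write-only (mode 0200). */
	FILE *f = fopen("/proc/sys/net/ipv4/route/flush", "w");

	if (!f) {
		perror("open flush");
		return EXIT_FAILURE;
	}
	fputs("1\n", f);
	fclose(f);

	/* min_pmtu is a normal integer sysctl handled by
	 * proc_dointvec_minmax() with ip_min_valid_pmtu as the floor. */
	f = fopen("/proc/sys/net/ipv4/route/min_pmtu", "r");
	if (f) {
		int min_pmtu;

		if (fscanf(f, "%d", &min_pmtu) == 1)
			printf("min_pmtu = %d\n", min_pmtu);
		fclose(f);
	}
	return 0;
}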
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Events for filesystem locks
 *
 * Copyright 2013 Jeff Layton <jlayton@poochiereds.net>
 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM filelock

#if !defined(_TRACE_FILELOCK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_FILELOCK_H

#include <linux/tracepoint.h>
#include <linux/fs.h>
#include <linux/device.h>
#include <linux/kdev_t.h>

#define show_fl_flags(val)						\
	__print_flags(val, "|",						\
		{ FL_POSIX,		"FL_POSIX" },			\
		{ FL_FLOCK,		"FL_FLOCK" },			\
		{ FL_DELEG,		"FL_DELEG" },			\
		{ FL_ACCESS,		"FL_ACCESS" },			\
		{ FL_EXISTS,		"FL_EXISTS" },			\
		{ FL_LEASE,		"FL_LEASE" },			\
		{ FL_CLOSE,		"FL_CLOSE" },			\
		{ FL_SLEEP,		"FL_SLEEP" },			\
		{ FL_DOWNGRADE_PENDING,	"FL_DOWNGRADE_PENDING" },	\
		{ FL_UNLOCK_PENDING,	"FL_UNLOCK_PENDING" },		\
		{ FL_OFDLCK,		"FL_OFDLCK" },			\
		{ FL_RECLAIM,		"FL_RECLAIM"})

#define show_fl_type(val)			\
	__print_symbolic(val,			\
		{ F_RDLCK, "F_RDLCK" },		\
		{ F_WRLCK, "F_WRLCK" },		\
		{ F_UNLCK, "F_UNLCK" })

TRACE_EVENT(locks_get_lock_context,
	TP_PROTO(struct inode *inode, int type, struct file_lock_context *ctx),

	TP_ARGS(inode, type, ctx),

	TP_STRUCT__entry(
		__field(u64, i_ino)
		__field(struct file_lock_context *, ctx)
		__field(dev_t, s_dev)
		__field(unsigned char, type)
	),

	TP_fast_assign(
		__entry->s_dev = inode->i_sb->s_dev;
		__entry->i_ino = inode->i_ino;
		__entry->type = type;
		__entry->ctx = ctx;
	),

	TP_printk("dev=0x%x:0x%x ino=0x%llx type=%s ctx=%p",
		MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
		__entry->i_ino, show_fl_type(__entry->type), __entry->ctx)
);

DECLARE_EVENT_CLASS(filelock_lock,
	TP_PROTO(struct inode *inode, struct file_lock *fl, int ret),

	TP_ARGS(inode, fl, ret),

	TP_STRUCT__entry(
		__field(u64, i_ino)
		__field(loff_t, fl_start)
		__field(loff_t, fl_end)
		__field(struct file_lock *, fl)
		__field(struct file_lock_core *, blocker)
		__field(fl_owner_t, owner)
		__field(dev_t, s_dev)
		__field(unsigned int, pid)
		__field(unsigned int, flags)
		__field(unsigned char, type)
		__field(int, ret)
	),

	TP_fast_assign(
		__entry->fl = fl ? fl : NULL;
		__entry->s_dev = inode->i_sb->s_dev;
		__entry->i_ino = inode->i_ino;
		__entry->blocker = fl ? fl->c.flc_blocker : NULL;
		__entry->owner = fl ? fl->c.flc_owner : NULL;
		__entry->pid = fl ? fl->c.flc_pid : 0;
		__entry->flags = fl ? fl->c.flc_flags : 0;
		__entry->type = fl ? fl->c.flc_type : 0;
		__entry->fl_start = fl ? fl->fl_start : 0;
		__entry->fl_end = fl ?
fl->fl_end : 0; __entry->ret = ret; ), TP_printk("fl=%p dev=0x%x:0x%x ino=0x%llx fl_blocker=%p fl_owner=%p fl_pid=%u fl_flags=%s fl_type=%s fl_start=%lld fl_end=%lld ret=%d", __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->blocker, __entry->owner, __entry->pid, show_fl_flags(__entry->flags), show_fl_type(__entry->type), __entry->fl_start, __entry->fl_end, __entry->ret) ); DEFINE_EVENT(filelock_lock, posix_lock_inode, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, fcntl_setlk, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, locks_remove_posix, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, flock_lock_inode, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DECLARE_EVENT_CLASS(filelock_lease, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl), TP_STRUCT__entry( __field(u64, i_ino) __field(struct file_lease *, fl) __field(struct file_lock_core *, blocker) __field(fl_owner_t, owner) __field(unsigned long, break_time) __field(unsigned long, downgrade_time) __field(dev_t, s_dev) __field(unsigned int, flags) __field(unsigned char, type) ), TP_fast_assign( __entry->fl = fl ? fl : NULL; __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->blocker = fl ? fl->c.flc_blocker : NULL; __entry->owner = fl ? fl->c.flc_owner : NULL; __entry->flags = fl ? fl->c.flc_flags : 0; __entry->type = fl ? fl->c.flc_type : 0; __entry->break_time = fl ? fl->fl_break_time : 0; __entry->downgrade_time = fl ? fl->fl_downgrade_time : 0; ), TP_printk("fl=%p dev=0x%x:0x%x ino=0x%llx fl_blocker=%p fl_owner=%p fl_flags=%s fl_type=%s fl_break_time=%lu fl_downgrade_time=%lu", __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->blocker, __entry->owner, show_fl_flags(__entry->flags), show_fl_type(__entry->type), __entry->break_time, __entry->downgrade_time) ); DEFINE_EVENT(filelock_lease, break_lease_noblock, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, break_lease_block, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, break_lease_unblock, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, generic_delete_lease, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, time_out_leases, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); TRACE_EVENT(generic_add_lease, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl), TP_STRUCT__entry( __field(u64, i_ino) __field(fl_owner_t, owner) __field(dev_t, s_dev) __field(int, wcount) __field(int, rcount) __field(int, icount) __field(unsigned int, flags) __field(unsigned char, type) ), TP_fast_assign( __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->wcount = atomic_read(&inode->i_writecount); __entry->rcount = atomic_read(&inode->i_readcount); __entry->icount = icount_read(inode); __entry->owner = fl->c.flc_owner; __entry->flags = fl->c.flc_flags; __entry->type = fl->c.flc_type; ), TP_printk("dev=0x%x:0x%x ino=0x%llx wcount=%d rcount=%d icount=%d fl_owner=%p fl_flags=%s fl_type=%s", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->wcount, 
__entry->rcount, __entry->icount, __entry->owner, show_fl_flags(__entry->flags), show_fl_type(__entry->type)) ); TRACE_EVENT(leases_conflict, TP_PROTO(bool conflict, struct file_lease *lease, struct file_lease *breaker), TP_ARGS(conflict, lease, breaker), TP_STRUCT__entry( __field(void *, lease) __field(void *, breaker) __field(unsigned int, l_fl_flags) __field(unsigned int, b_fl_flags) __field(unsigned char, l_fl_type) __field(unsigned char, b_fl_type) __field(bool, conflict) ), TP_fast_assign( __entry->lease = lease; __entry->l_fl_flags = lease->c.flc_flags; __entry->l_fl_type = lease->c.flc_type; __entry->breaker = breaker; __entry->b_fl_flags = breaker->c.flc_flags; __entry->b_fl_type = breaker->c.flc_type; __entry->conflict = conflict; ), TP_printk("conflict %d: lease=%p fl_flags=%s fl_type=%s; breaker=%p fl_flags=%s fl_type=%s", __entry->conflict, __entry->lease, show_fl_flags(__entry->l_fl_flags), show_fl_type(__entry->l_fl_type), __entry->breaker, show_fl_flags(__entry->b_fl_flags), show_fl_type(__entry->b_fl_type)) ); #endif /* _TRACE_FILELOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
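The tracepoints declared above are fired from the kernel's file-locking code; assuming the usual tracefs layout they show up under the "filelock" event group once enabled. A small userspace sketch (illustrative only; the temp-file path is arbitrary) that exercises the fcntl_setlk and posix_lock_inode events by taking and dropping a whole-file POSIX write lock:

/* posix_lock_demo.c - illustrative only */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	struct flock fl = {
		.l_type   = F_WRLCK,	/* reported as fl_type=F_WRLCK */
		.l_whence = SEEK_SET,
		.l_start  = 0,
		.l_len    = 0,		/* 0 = lock the whole file */
	};
	int fd = open("/tmp/filelock-demo", O_RDWR | O_CREAT, 0600);

	if (fd < 0) {
		perror("open");
		return EXIT_FAILURE;
	}
	if (fcntl(fd, F_SETLK, &fl) < 0)	/* lock request */
		perror("F_SETLK");

	fl.l_type = F_UNLCK;			/* unlock request */
	fcntl(fd, F_SETLK, &fl);
	close(fd);
	return 0;
}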
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* I/O iterator iteration building functions.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _LINUX_IOV_ITER_H
#define _LINUX_IOV_ITER_H

#include <linux/uio.h>
#include <linux/bvec.h>
#include <linux/folio_queue.h>

typedef size_t (*iov_step_f)(void *iter_base, size_t progress, size_t len,
			     void *priv, void *priv2);
typedef size_t (*iov_ustep_f)(void __user *iter_base, size_t progress, size_t len,
			      void *priv, void *priv2);

/*
 * Handle ITER_UBUF.
 */
static __always_inline
size_t iterate_ubuf(struct iov_iter *iter, size_t len, void *priv, void *priv2,
		    iov_ustep_f step)
{
	void __user *base = iter->ubuf;
	size_t progress = 0, remain;

	remain = step(base + iter->iov_offset, 0, len, priv, priv2);
	progress = len - remain;
	iter->iov_offset += progress;
	iter->count -= progress;
	return progress;
}

/*
 * Handle ITER_IOVEC.
 */
static __always_inline
size_t iterate_iovec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
		     iov_ustep_f step)
{
	const struct iovec *p = iter->__iov;
	size_t progress = 0, skip = iter->iov_offset;

	do {
		size_t remain, consumed;
		size_t part = min(len, p->iov_len - skip);

		if (likely(part)) {
			remain = step(p->iov_base + skip, progress, part, priv, priv2);
			consumed = part - remain;
			progress += consumed;
			skip += consumed;
			len -= consumed;
			if (skip < p->iov_len)
				break;
		}
		p++;
		skip = 0;
	} while (len);

	iter->nr_segs -= p - iter->__iov;
	iter->__iov = p;
	iter->iov_offset = skip;
	iter->count -= progress;
	return progress;
}

/*
 * Handle ITER_KVEC.
*/ static __always_inline size_t iterate_kvec(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { const struct kvec *p = iter->kvec; size_t progress = 0, skip = iter->iov_offset; do { size_t remain, consumed; size_t part = min(len, p->iov_len - skip); if (likely(part)) { remain = step(p->iov_base + skip, progress, part, priv, priv2); consumed = part - remain; progress += consumed; skip += consumed; len -= consumed; if (skip < p->iov_len) break; } p++; skip = 0; } while (len); iter->nr_segs -= p - iter->kvec; iter->kvec = p; iter->iov_offset = skip; iter->count -= progress; return progress; } /* * Handle ITER_BVEC. */ static __always_inline size_t iterate_bvec(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { const struct bio_vec *p = iter->bvec; size_t progress = 0, skip = iter->iov_offset; do { size_t remain, consumed; size_t offset = p->bv_offset + skip, part; void *kaddr = kmap_local_page(p->bv_page + offset / PAGE_SIZE); part = min3(len, (size_t)(p->bv_len - skip), (size_t)(PAGE_SIZE - offset % PAGE_SIZE)); remain = step(kaddr + offset % PAGE_SIZE, progress, part, priv, priv2); kunmap_local(kaddr); consumed = part - remain; len -= consumed; progress += consumed; skip += consumed; if (skip >= p->bv_len) { skip = 0; p++; } if (remain) break; } while (len); iter->nr_segs -= p - iter->bvec; iter->bvec = p; iter->iov_offset = skip; iter->count -= progress; return progress; } /* * Handle ITER_FOLIOQ. */ static __always_inline size_t iterate_folioq(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { const struct folio_queue *folioq = iter->folioq; unsigned int slot = iter->folioq_slot; size_t progress = 0, skip = iter->iov_offset; if (slot == folioq_nr_slots(folioq)) { /* The iterator may have been extended. */ folioq = folioq->next; slot = 0; } do { struct folio *folio = folioq_folio(folioq, slot); size_t part, remain = 0, consumed; size_t fsize; void *base; if (!folio) break; fsize = folioq_folio_size(folioq, slot); if (skip < fsize) { base = kmap_local_folio(folio, skip); part = umin(len, PAGE_SIZE - skip % PAGE_SIZE); remain = step(base, progress, part, priv, priv2); kunmap_local(base); consumed = part - remain; len -= consumed; progress += consumed; skip += consumed; } if (skip >= fsize) { skip = 0; slot++; if (slot == folioq_nr_slots(folioq) && folioq->next) { folioq = folioq->next; slot = 0; } } if (remain) break; } while (len); iter->folioq_slot = slot; iter->folioq = folioq; iter->iov_offset = skip; iter->count -= progress; return progress; } /* * Handle ITER_XARRAY. 
*/ static __always_inline size_t iterate_xarray(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { struct folio *folio; size_t progress = 0; loff_t start = iter->xarray_start + iter->iov_offset; pgoff_t index = start / PAGE_SIZE; XA_STATE(xas, iter->xarray, index); rcu_read_lock(); xas_for_each(&xas, folio, ULONG_MAX) { size_t remain, consumed, offset, part, flen; if (xas_retry(&xas, folio)) continue; if (WARN_ON(xa_is_value(folio))) break; if (WARN_ON(folio_test_hugetlb(folio))) break; offset = offset_in_folio(folio, start + progress); flen = min(folio_size(folio) - offset, len); while (flen) { void *base = kmap_local_folio(folio, offset); part = min_t(size_t, flen, PAGE_SIZE - offset_in_page(offset)); remain = step(base, progress, part, priv, priv2); kunmap_local(base); consumed = part - remain; progress += consumed; len -= consumed; if (remain || len == 0) goto out; flen -= consumed; offset += consumed; } } out: rcu_read_unlock(); iter->iov_offset += progress; iter->count -= progress; return progress; } /* * Handle ITER_DISCARD. */ static __always_inline size_t iterate_discard(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { size_t progress = len; iter->count -= progress; return progress; } /** * iterate_and_advance2 - Iterate over an iterator * @iter: The iterator to iterate over. * @len: The amount to iterate over. * @priv: Data for the step functions. * @priv2: More data for the step functions. * @ustep: Function for UBUF/IOVEC iterators; given __user addresses. * @step: Function for other iterators; given kernel addresses. * * Iterate over the next part of an iterator, up to the specified length. The * buffer is presented in segments, which for kernel iteration are broken up by * physical pages and mapped, with the mapped address being presented. * * Two step functions, @step and @ustep, must be provided, one for handling * mapped kernel addresses and the other is given user addresses which have the * potential to fault since no pinning is performed. * * The step functions are passed the address and length of the segment, @priv, * @priv2 and the amount of data so far iterated over (which can, for example, * be added to @priv to point to the right part of a second buffer). The step * functions should return the amount of the segment they didn't process (ie. 0 * indicates complete processsing). * * This function returns the amount of data processed (ie. 0 means nothing was * processed and the value of @len means processes to completion). */ static __always_inline size_t iterate_and_advance2(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_ustep_f ustep, iov_step_f step) { if (unlikely(iter->count < len)) len = iter->count; if (unlikely(!len)) return 0; if (likely(iter_is_ubuf(iter))) return iterate_ubuf(iter, len, priv, priv2, ustep); if (likely(iter_is_iovec(iter))) return iterate_iovec(iter, len, priv, priv2, ustep); if (iov_iter_is_bvec(iter)) return iterate_bvec(iter, len, priv, priv2, step); if (iov_iter_is_kvec(iter)) return iterate_kvec(iter, len, priv, priv2, step); if (iov_iter_is_folioq(iter)) return iterate_folioq(iter, len, priv, priv2, step); if (iov_iter_is_xarray(iter)) return iterate_xarray(iter, len, priv, priv2, step); return iterate_discard(iter, len, priv, priv2, step); } /** * iterate_and_advance - Iterate over an iterator * @iter: The iterator to iterate over. * @len: The amount to iterate over. * @priv: Data for the step functions. 
 * @ustep: Function for UBUF/IOVEC iterators; given __user addresses.
 * @step: Function for other iterators; given kernel addresses.
 *
 * As iterate_and_advance2(), but priv2 is always NULL.
 */
static __always_inline
size_t iterate_and_advance(struct iov_iter *iter, size_t len, void *priv,
			   iov_ustep_f ustep, iov_step_f step)
{
	return iterate_and_advance2(iter, len, priv, NULL, ustep, step);
}

/**
 * iterate_and_advance_kernel - Iterate over a kernel-internal iterator
 * @iter: The iterator to iterate over.
 * @len: The amount to iterate over.
 * @priv: Data for the step function.
 * @priv2: More data for the step function.
 * @step: Processing function; given kernel addresses.
 *
 * Iterate over the next part of an iterator, up to the specified length. The
 * buffer is presented in segments, which for kernel iteration are broken up by
 * physical pages and mapped, with the mapped address being presented.
 *
 * [!] Note: This will only handle BVEC, KVEC, FOLIOQ, XARRAY and DISCARD-type
 * iterators; it will not handle UBUF or IOVEC-type iterators.
 *
 * A single step function, @step, must be provided; it is only ever given
 * mapped kernel addresses, so it cannot fault and no pinning is performed.
 *
 * The step function is passed the address and length of the segment, @priv,
 * @priv2 and the amount of data so far iterated over (which can, for example,
 * be added to @priv to point to the right part of a second buffer). The step
 * function should return the amount of the segment it didn't process (ie. 0
 * indicates complete processing).
 *
 * This function returns the amount of data processed (ie. 0 means nothing was
 * processed and the value of @len means it was processed to completion).
 */
static __always_inline
size_t iterate_and_advance_kernel(struct iov_iter *iter, size_t len, void *priv,
				  void *priv2, iov_step_f step)
{
	if (unlikely(iter->count < len))
		len = iter->count;
	if (unlikely(!len))
		return 0;
	if (iov_iter_is_bvec(iter))
		return iterate_bvec(iter, len, priv, priv2, step);
	if (iov_iter_is_kvec(iter))
		return iterate_kvec(iter, len, priv, priv2, step);
	if (iov_iter_is_folioq(iter))
		return iterate_folioq(iter, len, priv, priv2, step);
	if (iov_iter_is_xarray(iter))
		return iterate_xarray(iter, len, priv, priv2, step);
	return iterate_discard(iter, len, priv, priv2, step);
}

#endif /* _LINUX_IOV_ITER_H */
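As a concrete illustration of the step-function contract documented above, here is a hedged kernel-side sketch built on iterate_and_advance(); the demo_* names are hypothetical (the real equivalents live in lib/iov_iter.c), and <linux/uaccess.h> and <linux/string.h> are assumed to be included. Each step callback returns the number of bytes it did not consume, so 0 means the whole segment was processed.

/* Hypothetical example, modelled on lib/iov_iter.c -- not part of this header. */
static size_t demo_step_user(void __user *iter_base, size_t progress,
			     size_t len, void *priv, void *priv2)
{
	/* copy_from_user() returns the number of bytes NOT copied,
	 * which is exactly what the step contract asks for. */
	return copy_from_user(priv + progress, iter_base, len);
}

static size_t demo_step_kernel(void *iter_base, size_t progress,
			       size_t len, void *priv, void *priv2)
{
	memcpy(priv + progress, iter_base, len);
	return 0;		/* whole segment processed */
}

/* Copy the next @bytes of any iterator type into the linear buffer @buf;
 * returns the number of bytes actually copied and advances the iterator. */
static size_t demo_copy_from_iter(void *buf, size_t bytes, struct iov_iter *i)
{
	return iterate_and_advance(i, bytes, buf,
				   demo_step_user, demo_step_kernel);
}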
// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/hugetlb.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
#include <asm/mtrr.h>

#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; EXPORT_SYMBOL(physical_mask); SYM_PIC_ALIAS(physical_mask); #endif pgtable_t pte_alloc_one(struct mm_struct *mm) { return __pte_alloc_one(mm, GFP_PGTABLE_USER); } void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { paravirt_release_pte(page_to_pfn(pte)); tlb_remove_ptdesc(tlb, page_ptdesc(pte)); } #if CONFIG_PGTABLE_LEVELS > 2 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); /* * NOTE! For PAE, any changes to the top page-directory-pointer-table * entries need a full cr3 reload to flush. */ #ifdef CONFIG_X86_PAE tlb->need_flush_all = 1; #endif tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd)); } #if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud)); } #if CONFIG_PGTABLE_LEVELS > 4 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) { paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d)); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #endif /* CONFIG_PGTABLE_LEVELS > 2 */ static inline void pgd_list_add(pgd_t *pgd) { struct ptdesc *ptdesc = virt_to_ptdesc(pgd); list_add(&ptdesc->pt_list, &pgd_list); } static inline void pgd_list_del(pgd_t *pgd) { struct ptdesc *ptdesc = virt_to_ptdesc(pgd); list_del(&ptdesc->pt_list); } static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) { virt_to_ptdesc(pgd)->pt_mm = mm; } struct mm_struct *pgd_page_get_mm(struct page *page) { return page_ptdesc(page)->pt_mm; } static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) { /* PAE preallocates all its PMDs. No cloning needed. */ if (!IS_ENABLED(CONFIG_X86_PAE)) clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); /* List used to sync kernel mapping updates */ pgd_set_mm(pgd, mm); pgd_list_add(pgd); } static void pgd_dtor(pgd_t *pgd) { spin_lock(&pgd_lock); pgd_list_del(pgd); spin_unlock(&pgd_lock); } /* * List of all pgd's needed for non-PAE so it can invalidate entries * in both cached and uncached pgd's; not needed for PAE since the * kernel pmd is shared. If PAE were not to share the pmd a similar * tactic would be needed. This is essentially codepath-based locking * against pageattr.c; it is the unique case in which a valid change * of kernel pagetables can't be lazily synchronized by vmalloc faults. * vmalloc faults work because attached pagetables are never freed. * -- nyc */ #ifdef CONFIG_X86_PAE /* * In PAE mode, we need to do a cr3 reload (=tlb flush) when * updating the top-level pagetable entries to guarantee the * processor notices the update. Since this is expensive, and * all 4 top-level entries are used almost immediately in a * new process's life, we just pre-populate them here. */ #define PREALLOCATED_PMDS PTRS_PER_PGD /* * "USER_PMDS" are the PMDs for the user copy of the page tables when * PTI is enabled. They do not exist when PTI is disabled. Note that * this is distinct from the user _portion_ of the kernel page tables * which always exists. * * We allocate separate PMDs for the kernel part of the user page-table * when PTI is enabled. We need them to map the per-process LDT into the * user-space page-table. */ #define PREALLOCATED_USER_PMDS (boot_cpu_has(X86_FEATURE_PTI) ? 
\ KERNEL_PGD_PTRS : 0) #define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) { paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); /* Note: almost everything apart from _PAGE_PRESENT is reserved at the pmd (PDPT) level. */ set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); /* * According to Intel App note "TLBs, Paging-Structure Caches, * and Their Invalidation", April 2007, document 317080-001, * section 8.1: in PAE mode we explicitly have to flush the * TLB via cr3 if the top-level pgd is changed... */ flush_tlb_mm(mm); } #else /* !CONFIG_X86_PAE */ /* No need to prepopulate any pagetable entries in non-PAE modes. */ #define PREALLOCATED_PMDS 0 #define PREALLOCATED_USER_PMDS 0 #define MAX_PREALLOCATED_USER_PMDS 0 #endif /* CONFIG_X86_PAE */ static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; struct ptdesc *ptdesc; for (i = 0; i < count; i++) if (pmds[i]) { ptdesc = virt_to_ptdesc(pmds[i]); pagetable_dtor(ptdesc); pagetable_free(ptdesc); mm_dec_nr_pmds(mm); } } static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; bool failed = false; gfp_t gfp = GFP_PGTABLE_USER; if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; gfp &= ~__GFP_HIGHMEM; for (i = 0; i < count; i++) { pmd_t *pmd = NULL; struct ptdesc *ptdesc = pagetable_alloc(gfp, 0); if (!ptdesc) failed = true; if (ptdesc && !pagetable_pmd_ctor(mm, ptdesc)) { pagetable_free(ptdesc); ptdesc = NULL; failed = true; } if (ptdesc) { mm_inc_nr_pmds(mm); pmd = ptdesc_address(ptdesc); } pmds[i] = pmd; } if (failed) { free_pmds(mm, pmds, count); return -ENOMEM; } return 0; } /* * Mop up any pmd pages which may still be attached to the pgd. * Normally they will be freed by munmap/exit_mmap, but any pmd we * preallocate which never got a corresponding vma will need to be * freed manually. 
*/ static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp) { pgd_t pgd = *pgdp; if (pgd_val(pgd) != 0) { pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); pgd_clear(pgdp); paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); pmd_free(mm, pmd); mm_dec_nr_pmds(mm); } } static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) { int i; for (i = 0; i < PREALLOCATED_PMDS; i++) mop_up_one_pmd(mm, &pgdp[i]); #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION if (!boot_cpu_has(X86_FEATURE_PTI)) return; pgdp = kernel_to_user_pgdp(pgdp); for (i = 0; i < PREALLOCATED_USER_PMDS; i++) mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]); #endif } static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) { p4d_t *p4d; pud_t *pud; int i; p4d = p4d_offset(pgd, 0); pud = pud_offset(p4d, 0); for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { pmd_t *pmd = pmds[i]; if (i >= KERNEL_PGD_BOUNDARY) memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), sizeof(pmd_t) * PTRS_PER_PMD); pud_populate(mm, pud, pmd); } } #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION static void pgd_prepopulate_user_pmd(struct mm_struct *mm, pgd_t *k_pgd, pmd_t *pmds[]) { pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir); pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); p4d_t *u_p4d; pud_t *u_pud; int i; u_p4d = p4d_offset(u_pgd, 0); u_pud = pud_offset(u_p4d, 0); s_pgd += KERNEL_PGD_BOUNDARY; u_pud += KERNEL_PGD_BOUNDARY; for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) { pmd_t *pmd = pmds[i]; memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd), sizeof(pmd_t) * PTRS_PER_PMD); pud_populate(mm, u_pud, pmd); } } #else static void pgd_prepopulate_user_pmd(struct mm_struct *mm, pgd_t *k_pgd, pmd_t *pmds[]) { } #endif static inline pgd_t *_pgd_alloc(struct mm_struct *mm) { /* * PTI and Xen need a whole page for the PAE PGD * even though the hardware only needs 32 bytes. * * For simplicity, allocate a page for all users. */ return __pgd_alloc(mm, pgd_allocation_order()); } static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd) { __pgd_free(mm, pgd); } pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS]; pmd_t *pmds[PREALLOCATED_PMDS]; pgd = _pgd_alloc(mm); if (pgd == NULL) goto out; mm->pgd = pgd; if (sizeof(pmds) != 0 && preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0) goto out_free_pgd; if (sizeof(u_pmds) != 0 && preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0) goto out_free_pmds; if (paravirt_pgd_alloc(mm) != 0) goto out_free_user_pmds; /* * Make sure that pre-populating the pmds is atomic with * respect to anything walking the pgd_list, so that they * never see a partially populated pgd. */ spin_lock(&pgd_lock); pgd_ctor(mm, pgd); if (sizeof(pmds) != 0) pgd_prepopulate_pmd(mm, pgd, pmds); if (sizeof(u_pmds) != 0) pgd_prepopulate_user_pmd(mm, pgd, u_pmds); spin_unlock(&pgd_lock); return pgd; out_free_user_pmds: if (sizeof(u_pmds) != 0) free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS); out_free_pmds: if (sizeof(pmds) != 0) free_pmds(mm, pmds, PREALLOCATED_PMDS); out_free_pgd: _pgd_free(mm, pgd); out: return NULL; } void pgd_free(struct mm_struct *mm, pgd_t *pgd) { pgd_mop_up_pmds(mm, pgd); pgd_dtor(pgd); paravirt_pgd_free(mm, pgd); _pgd_free(mm, pgd); } /* * Used to set accessed or dirty bits in the page table entries * on other architectures. On x86, the accessed and dirty bits * are tracked by hardware. However, do_wp_page calls this function * to also make the pte writeable at the same time the dirty bit is * set. 
In that case we do actually need to write the PTE. */ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) { int changed = !pte_same(*ptep, entry); if (changed && dirty) set_pte(ptep, entry); return changed; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { int changed = !pmd_same(*pmdp, entry); VM_BUG_ON(address & ~HPAGE_PMD_MASK); if (changed && dirty) { set_pmd(pmdp, entry); /* * We had a write-protection fault here and changed the pmd * to to more permissive. No need to flush the TLB for that, * #PF is architecturally guaranteed to do that and in the * worst-case we'll generate a spurious fault. */ } return changed; } int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty) { int changed = !pud_same(*pudp, entry); VM_BUG_ON(address & ~HPAGE_PUD_MASK); if (changed && dirty) { set_pud(pudp, entry); /* * We had a write-protection fault here and changed the pud * to to more permissive. No need to flush the TLB for that, * #PF is architecturally guaranteed to do that and in the * worst-case we'll generate a spurious fault. */ } return changed; } #endif bool ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { bool ret = false; if (pte_young(*ptep)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *) &ptep->pte); return ret; } #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) bool pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { bool ret = false; if (pmd_young(*pmdp)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pmdp); return ret; } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE bool pudp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp) { bool ret = false; if (pud_young(*pudp)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pudp); return ret; } #endif bool ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { /* * On x86 CPUs, clearing the accessed bit without a TLB flush * doesn't cause data corruption. [ It could cause incorrect * page aging and the (mistaken) reclaim of hot pages, but the * chance of that should be relatively low. ] * * So as a performance optimization don't flush the TLB when * clearing the accessed bit, it will eventually be flushed by * a context switch or a VM operation anyway. [ In the rare * event of it not getting flushed for a long time the delay * shouldn't really matter because there's no real memory * pressure for swapout to react to. ] */ return ptep_test_and_clear_young(vma, address, ptep); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE bool pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { bool young; VM_BUG_ON(address & ~HPAGE_PMD_MASK); young = pmdp_test_and_clear_young(vma, address, pmdp); if (young) flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return young; } pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { VM_WARN_ON_ONCE(!pmd_present(*pmdp)); /* * No flush is necessary. Once an invalid PTE is established, the PTE's * access and dirty bits cannot be updated. 
*/ return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp)); } #endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, pud_t *pudp) { VM_WARN_ON_ONCE(!pud_present(*pudp)); pud_t old = pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp)); flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); return old; } #endif /** * reserve_top_address - Reserve a hole in the top of the kernel address space * @reserve: Size of hole to reserve * * Can be used to relocate the fixmap area and poke a hole in the top * of the kernel address space to make room for a hypervisor. */ void __init reserve_top_address(unsigned long reserve) { #ifdef CONFIG_X86_32 BUG_ON(fixmaps_set > 0); __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE; printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n", -reserve, __FIXADDR_TOP + PAGE_SIZE); #endif } int fixmaps_set; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) { unsigned long address = __fix_to_virt(idx); #ifdef CONFIG_X86_64 /* * Ensure that the static initial page tables are covering the * fixmap completely. */ BUILD_BUG_ON(__end_of_permanent_fixed_addresses > (FIXMAP_PMD_NUM * PTRS_PER_PTE)); #endif if (idx >= __end_of_fixed_addresses) { BUG(); return; } set_pte_vaddr(address, pte); fixmaps_set++; } void native_set_fixmap(unsigned /* enum fixed_addresses */ idx, phys_addr_t phys, pgprot_t flags) { /* Sanitize 'prot' against any unsupported bits: */ pgprot_val(flags) &= __default_kernel_pte_mask; __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); } #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP #if CONFIG_PGTABLE_LEVELS > 4 /** * p4d_set_huge - Set up kernel P4D mapping * @p4d: Pointer to the P4D entry * @addr: Virtual address associated with the P4D entry * @prot: Protection bits to use * * No 512GB pages yet -- always return 0 */ int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } /** * p4d_clear_huge - Clear kernel P4D mapping when it is set * @p4d: Pointer to the P4D entry to clear * * No 512GB pages yet -- do nothing */ void p4d_clear_huge(p4d_t *p4d) { } #endif /** * pud_set_huge - Set up kernel PUD mapping * @pud: Pointer to the PUD entry * @addr: Virtual address associated with the PUD entry * @prot: Protection bits to use * * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this * function sets up a huge page only if the complete range has the same MTRR * caching mode. * * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger * page mapping attempt fails. * * Returns 1 on success and 0 on failure. */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { u8 uniform; mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform); if (!uniform) return 0; /* Bail out if we are we on a populated non-leaf entry: */ if (pud_present(*pud) && !pud_leaf(*pud)) return 0; set_pte((pte_t *)pud, pfn_pte( (u64)addr >> PAGE_SHIFT, __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); return 1; } /** * pmd_set_huge - Set up kernel PMD mapping * @pmd: Pointer to the PMD entry * @addr: Virtual address associated with the PMD entry * @prot: Protection bits to use * * See text over pud_set_huge() above. * * Returns 1 on success and 0 on failure. 
*/ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { u8 uniform; mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform); if (!uniform) { pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n", __func__, addr, addr + PMD_SIZE); return 0; } /* Bail out if we are we on a populated non-leaf entry: */ if (pmd_present(*pmd) && !pmd_leaf(*pmd)) return 0; set_pte((pte_t *)pmd, pfn_pte( (u64)addr >> PAGE_SHIFT, __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); return 1; } /** * pud_clear_huge - Clear kernel PUD mapping when it is set * @pud: Pointer to the PUD entry to clear. * * Returns 1 on success and 0 on failure (no PUD map is found). */ int pud_clear_huge(pud_t *pud) { if (pud_leaf(*pud)) { pud_clear(pud); return 1; } return 0; } /** * pmd_clear_huge - Clear kernel PMD mapping when it is set * @pmd: Pointer to the PMD entry to clear. * * Returns 1 on success and 0 on failure (no PMD map is found). */ int pmd_clear_huge(pmd_t *pmd) { if (pmd_leaf(*pmd)) { pmd_clear(pmd); return 1; } return 0; } #ifdef CONFIG_X86_64 /** * pud_free_pmd_page - Clear PUD entry and free PMD page * @pud: Pointer to a PUD * @addr: Virtual address associated with PUD * * Context: The PUD range has been unmapped and TLB purged. * Return: 1 if clearing the entry succeeded. 0 otherwise. * * NOTE: Callers must allow a single page allocation. */ int pud_free_pmd_page(pud_t *pud, unsigned long addr) { pmd_t *pmd, *pmd_sv; struct ptdesc *pt; int i; pmd = pud_pgtable(*pud); pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL); if (!pmd_sv) return 0; for (i = 0; i < PTRS_PER_PMD; i++) { pmd_sv[i] = pmd[i]; if (!pmd_none(pmd[i])) pmd_clear(&pmd[i]); } pud_clear(pud); /* INVLPG to clear all paging-structure caches */ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); for (i = 0; i < PTRS_PER_PMD; i++) { if (!pmd_none(pmd_sv[i])) { pt = page_ptdesc(pmd_page(pmd_sv[i])); pagetable_dtor_free(pt); } } free_page((unsigned long)pmd_sv); pmd_free(&init_mm, pmd); return 1; } /** * pmd_free_pte_page - Clear PMD entry and free PTE page. * @pmd: Pointer to the PMD * @addr: Virtual address associated with PMD * * Context: The PMD range has been unmapped and TLB purged. * Return: 1 if clearing the entry succeeded. 0 otherwise. */ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { struct ptdesc *pt; pt = page_ptdesc(pmd_page(*pmd)); pmd_clear(pmd); /* INVLPG to clear all paging-structure caches */ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); pagetable_dtor_free(pt); return 1; } #else /* !CONFIG_X86_64 */ /* * Disable free page handling on x86-PAE. This assures that ioremap() * does not update sync'd PMD entries. See vmalloc_sync_one(). */ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { return pmd_none(*pmd); } #endif /* CONFIG_X86_64 */ #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHADOW_STACK) return pte_mkwrite_shstk(pte); pte = pte_mkwrite_novma(pte); return pte_clear_saveddirty(pte); } pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHADOW_STACK) return pmd_mkwrite_shstk(pmd); pmd = pmd_mkwrite_novma(pmd); return pmd_clear_saveddirty(pmd); } void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte) { /* * Hardware before shadow stack can (rarely) set Dirty=1 * on a Write=0 PTE. So the below condition * only indicates a software bug when shadow stack is * supported by the HW. This checking is covered in * pte_shstk(). 
*/ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pte_shstk(pte)); } void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd) { /* See note in arch_check_zapped_pte() */ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pmd_shstk(pmd)); } void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud) { /* See note in arch_check_zapped_pte() */ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud)); } |
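The accessed-bit helpers above clear _PAGE_BIT_ACCESSED with an atomic test-and-clear and deliberately skip the TLB flush. Below is a minimal standalone sketch of that bit-level operation, assuming bit 5 for the Accessed flag as on x86 and using C11 atomics in place of the kernel's test_and_clear_bit(); it is an illustration for userspace, not kernel code.

/* Userspace model of clearing the Accessed ("young") bit in a PTE word. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_BIT_ACCESSED 5     /* mirrors x86 _PAGE_BIT_ACCESSED (assumption for the example) */

static bool test_and_clear_young(_Atomic uint64_t *pte)
{
        /* Atomically clear the bit and report whether it was set before. */
        uint64_t old = atomic_fetch_and(pte, ~(1ULL << PAGE_BIT_ACCESSED));

        return old & (1ULL << PAGE_BIT_ACCESSED);
}

int main(void)
{
        _Atomic uint64_t pte = (1ULL << PAGE_BIT_ACCESSED) | 0x1; /* Accessed + Present */

        printf("first clear:  young=%d\n", test_and_clear_young(&pte));  /* 1 */
        printf("second clear: young=%d\n", test_and_clear_young(&pte));  /* 0 */
        return 0;
}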
| 4 4 4 1 2 2 13 14 12 11 14 14 14 13 12 1 1 1 2 1 2 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 | // SPDX-License-Identifier: GPL-2.0 #include "blk-rq-qos.h" /* * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, * false if 'v' + 1 would be bigger than 'below'. */ static bool atomic_inc_below(atomic_t *v, unsigned int below) { unsigned int cur = atomic_read(v); do { if (cur >= below) return false; } while (!atomic_try_cmpxchg(v, &cur, cur + 1)); return true; } bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit) { return atomic_inc_below(&rq_wait->inflight, limit); } void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio) { do { if (rqos->ops->cleanup) rqos->ops->cleanup(rqos, bio); rqos = rqos->next; } while (rqos); } void __rq_qos_done(struct rq_qos *rqos, struct request *rq) { do { if (rqos->ops->done) rqos->ops->done(rqos, rq); rqos = rqos->next; } while (rqos); } void __rq_qos_issue(struct rq_qos *rqos, struct request *rq) { do { if (rqos->ops->issue) rqos->ops->issue(rqos, rq); rqos = rqos->next; } while (rqos); } void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq) { do { if (rqos->ops->requeue) rqos->ops->requeue(rqos, rq); rqos = rqos->next; } while (rqos); } void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio) { do { if (rqos->ops->throttle) rqos->ops->throttle(rqos, bio); rqos = rqos->next; } while (rqos); } void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio) { do { if (rqos->ops->track) rqos->ops->track(rqos, rq, bio); rqos = rqos->next; } while (rqos); } void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio) { do { if (rqos->ops->merge) rqos->ops->merge(rqos, rq, bio); rqos = rqos->next; } while (rqos); } void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio) { do { if (rqos->ops->done_bio) rqos->ops->done_bio(rqos, bio); rqos = rqos->next; } while (rqos); } void __rq_qos_queue_depth_changed(struct rq_qos *rqos) { do { if (rqos->ops->queue_depth_changed) rqos->ops->queue_depth_changed(rqos); rqos = rqos->next; } while (rqos); } /* * Return true, if we can't increase the depth further by scaling */ bool rq_depth_calc_max_depth(struct rq_depth 
*rqd) { unsigned int depth; bool ret = false; /* * For QD=1 devices, this is a special case. It's important for those * to have one request ready when one completes, so force a depth of * 2 for those devices. On the backend, it'll be a depth of 1 anyway, * since the device can't have more than that in flight. If we're * scaling down, then keep a setting of 1/1/1. */ if (rqd->queue_depth == 1) { if (rqd->scale_step > 0) rqd->max_depth = 1; else { rqd->max_depth = 2; ret = true; } } else { /* * scale_step == 0 is our default state. If we have suffered * latency spikes, step will be > 0, and we shrink the * allowed write depths. If step is < 0, we're only doing * writes, and we allow a temporarily higher depth to * increase performance. */ depth = min_t(unsigned int, rqd->default_depth, rqd->queue_depth); if (rqd->scale_step > 0) depth = 1 + ((depth - 1) >> min(31, rqd->scale_step)); else if (rqd->scale_step < 0) { unsigned int maxd = 3 * rqd->queue_depth / 4; depth = 1 + ((depth - 1) << -rqd->scale_step); if (depth > maxd) { depth = maxd; ret = true; } } rqd->max_depth = depth; } return ret; } /* Returns true on success and false if scaling up wasn't possible */ bool rq_depth_scale_up(struct rq_depth *rqd) { /* * Hit max in previous round, stop here */ if (rqd->scaled_max) return false; rqd->scale_step--; rqd->scaled_max = rq_depth_calc_max_depth(rqd); return true; } /* * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we * had a latency violation. Returns true on success and returns false if * scaling down wasn't possible. */ bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) { /* * Stop scaling down when we've hit the limit. This also prevents * ->scale_step from going to crazy values, if the device can't * keep up. */ if (rqd->max_depth == 1) return false; if (rqd->scale_step < 0 && hard_throttle) rqd->scale_step = 0; else rqd->scale_step++; rqd->scaled_max = false; rq_depth_calc_max_depth(rqd); return true; } struct rq_qos_wait_data { struct wait_queue_entry wq; struct rq_wait *rqw; acquire_inflight_cb_t *cb; void *private_data; bool got_token; }; static int rq_qos_wake_function(struct wait_queue_entry *curr, unsigned int mode, int wake_flags, void *key) { struct rq_qos_wait_data *data = container_of(curr, struct rq_qos_wait_data, wq); /* * If we fail to get a budget, return -1 to interrupt the wake up loop * in __wake_up_common. */ if (!data->cb(data->rqw, data->private_data)) return -1; data->got_token = true; /* * autoremove_wake_function() removes the wait entry only when it * actually changed the task state. We want the wait always removed. * Remove explicitly and use default_wake_function(). */ default_wake_function(curr, mode, wake_flags, key); /* * Note that the order of operations is important as finish_wait() * tests whether @curr is removed without grabbing the lock. This * should be the last thing to do to make sure we will not have a * UAF access to @data. And the semantics of memory barrier in it * also make sure the waiter will see the latest @data->got_token * once list_empty_careful() in finish_wait() returns true. */ list_del_init_careful(&curr->entry); return 1; } /** * rq_qos_wait - throttle on a rqw if we need to * @rqw: rqw to throttle on * @private_data: caller provided specific data * @acquire_inflight_cb: inc the rqw->inflight counter if we can * @cleanup_cb: the callback to cleanup in case we race with a waker * * This provides a uniform place for the rq_qos users to do their throttling. 
* Since you can end up with a lot of things sleeping at once, this manages the * waking up based on the resources available. The acquire_inflight_cb should * inc the rqw->inflight if we have the ability to do so, or return false if not * and then we will sleep until the room becomes available. * * cleanup_cb is in case that we race with a waker and need to cleanup the * inflight count accordingly. */ void rq_qos_wait(struct rq_wait *rqw, void *private_data, acquire_inflight_cb_t *acquire_inflight_cb, cleanup_cb_t *cleanup_cb) { struct rq_qos_wait_data data = { .rqw = rqw, .cb = acquire_inflight_cb, .private_data = private_data, .got_token = false, }; bool first_waiter; /* * If there are no waiters in the waiting queue, try to increase the * inflight counter if we can. Otherwise, prepare for adding ourselves * to the waiting queue. */ if (!waitqueue_active(&rqw->wait) && acquire_inflight_cb(rqw, private_data)) return; init_wait_func(&data.wq, rq_qos_wake_function); first_waiter = prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); /* * Make sure there is at least one inflight process; otherwise, waiters * will never be woken up. Since there may be no inflight process before * adding ourselves to the waiting queue above, we need to try to * increase the inflight counter for ourselves. And it is sufficient to * guarantee that at least the first waiter to enter the waiting queue * will re-check the waiting condition before going to sleep, thus * ensuring forward progress. */ if (!data.got_token && first_waiter && acquire_inflight_cb(rqw, private_data)) { finish_wait(&rqw->wait, &data.wq); /* * We raced with rq_qos_wake_function() getting a token, * which means we now have two. Put our local token * and wake anyone else potentially waiting for one. * * Enough memory barrier in list_empty_careful() in * finish_wait() is paired with list_del_init_careful() * in rq_qos_wake_function() to make sure we will see * the latest @data->got_token. */ if (data.got_token) cleanup_cb(rqw, private_data); return; } /* we are now relying on the waker to increase our inflight counter. */ do { if (data.got_token) break; io_schedule(); set_current_state(TASK_UNINTERRUPTIBLE); } while (1); finish_wait(&rqw->wait, &data.wq); } void rq_qos_exit(struct request_queue *q) { mutex_lock(&q->rq_qos_mutex); while (q->rq_qos) { struct rq_qos *rqos = q->rq_qos; q->rq_qos = rqos->next; rqos->ops->exit(rqos); } blk_queue_flag_clear(QUEUE_FLAG_QOS_ENABLED, q); mutex_unlock(&q->rq_qos_mutex); } int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, const struct rq_qos_ops *ops) { struct request_queue *q = disk->queue; unsigned int memflags; lockdep_assert_held(&q->rq_qos_mutex); rqos->disk = disk; rqos->id = id; rqos->ops = ops; /* * No IO can be in-flight when adding rqos, so freeze queue, which * is fine since we only support rq_qos for blk-mq queue. 
*/ memflags = blk_mq_freeze_queue(q); if (rq_qos_id(q, rqos->id)) goto ebusy; rqos->next = q->rq_qos; q->rq_qos = rqos; blk_queue_flag_set(QUEUE_FLAG_QOS_ENABLED, q); blk_mq_unfreeze_queue(q, memflags); return 0; ebusy: blk_mq_unfreeze_queue(q, memflags); return -EBUSY; } void rq_qos_del(struct rq_qos *rqos) { struct request_queue *q = rqos->disk->queue; struct rq_qos **cur; unsigned int memflags; lockdep_assert_held(&q->rq_qos_mutex); memflags = blk_mq_freeze_queue(q); for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) { if (*cur == rqos) { *cur = rqos->next; break; } } if (!q->rq_qos) blk_queue_flag_clear(QUEUE_FLAG_QOS_ENABLED, q); blk_mq_unfreeze_queue(q, memflags); } |
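rq_wait_inc_below() builds on the atomic_inc_below() compare-exchange loop at the top of this file: increment the inflight counter only while it is under the limit. The following standalone C11 sketch models the same pattern in userspace; atomic_compare_exchange_weak() stands in for the kernel's atomic_try_cmpxchg(), and the names are made up for the example.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Increment *v only if the current value is below 'below'. */
static bool inc_below(atomic_uint *v, unsigned int below)
{
        unsigned int cur = atomic_load(v);

        do {
                if (cur >= below)
                        return false;
                /* On failure, cur is reloaded with the current value. */
        } while (!atomic_compare_exchange_weak(v, &cur, cur + 1));

        return true;
}

int main(void)
{
        atomic_uint inflight = 0;

        /* With a limit of 2, only the first two attempts succeed. */
        for (int i = 0; i < 3; i++)
                printf("attempt %d: %s\n", i,
                       inc_below(&inflight, 2) ? "ok" : "over limit");
        return 0;
}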
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_ICMPV6_H
#define _LINUX_ICMPV6_H

#include <linux/skbuff.h>
#include <linux/ipv6.h>
#include <uapi/linux/icmpv6.h>

static inline struct icmp6hdr *icmp6_hdr(const struct sk_buff *skb)
{
        return (struct icmp6hdr *)skb_transport_header(skb);
}

#include <linux/netdevice.h>

#if IS_ENABLED(CONFIG_IPV6)

void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
                const struct in6_addr *force_saddr,
                const struct inet6_skb_parm *parm);
static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
{
        icmp6_send(skb, type, code, info, NULL, IP6CB(skb));
}

int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type,
                               unsigned int data_len);

#if IS_ENABLED(CONFIG_NF_NAT)
void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info);
#else
static inline void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info)
{
        struct inet6_skb_parm parm = { 0 };

        icmp6_send(skb_in, type, code, info, NULL, &parm);
}
#endif

#else
static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
{
}

static inline void icmpv6_ndo_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
{
}
#endif

extern int  icmpv6_init(void);
extern int  icmpv6_err_convert(u8 type, u8 code, int *err);
extern void icmpv6_cleanup(void);
extern void icmpv6_param_prob_reason(struct sk_buff *skb, u8 code, int pos,
                                     enum skb_drop_reason reason);

struct flowi6;
struct in6_addr;

void icmpv6_flow_init(const struct sock *sk, struct flowi6 *fl6, u8 type,
                      const struct in6_addr *saddr,
                      const struct in6_addr *daddr, int oif);

static inline void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos)
{
        icmpv6_param_prob_reason(skb, code, pos,
                                 SKB_DROP_REASON_NOT_SPECIFIED);
}

static inline bool icmpv6_is_err(int type)
{
        switch (type) {
        case ICMPV6_DEST_UNREACH:
        case ICMPV6_PKT_TOOBIG:
        case ICMPV6_TIME_EXCEED:
        case ICMPV6_PARAMPROB:
                return true;
        }

        return false;
}

#endif
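icmpv6_is_err() classifies the four ICMPv6 error types defined by RFC 4443. Here is a standalone sketch of the same classification with the numeric type values written out as local defines purely for the example; in kernel code these constants come from <uapi/linux/icmpv6.h>.

#include <stdbool.h>
#include <stdio.h>

#define ICMPV6_DEST_UNREACH 1
#define ICMPV6_PKT_TOOBIG   2
#define ICMPV6_TIME_EXCEED  3
#define ICMPV6_PARAMPROB    4
#define ICMPV6_ECHO_REQUEST 128     /* an informational type, for contrast */

static bool icmpv6_type_is_err(int type)
{
        switch (type) {
        case ICMPV6_DEST_UNREACH:
        case ICMPV6_PKT_TOOBIG:
        case ICMPV6_TIME_EXCEED:
        case ICMPV6_PARAMPROB:
                return true;
        }
        return false;
}

int main(void)
{
        printf("type 2 (Packet Too Big) is error: %d\n",
               icmpv6_type_is_err(ICMPV6_PKT_TOOBIG));   /* 1 */
        printf("type 128 (Echo Request) is error: %d\n",
               icmpv6_type_is_err(ICMPV6_ECHO_REQUEST)); /* 0 */
        return 0;
}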
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Linux NET3: IP/IP protocol decoder modified to support
 * virtual tunnel interface
 *
 * Authors:
 *      Saurabh Mohan (saurabh.mohan@vyatta.com) 05/07/2012
 */
/*
 * This version of net/ipv4/ip_vti.c is cloned of net/ipv4/ipip.c
 * For comments look at net/ipv4/ip_gre.c --ANK
 */

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/netfilter_ipv4.h>
#include
<linux/if_ether.h> #include <linux/icmpv6.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/ip_tunnels.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> static struct rtnl_link_ops vti_link_ops __read_mostly; static unsigned int vti_net_id __read_mostly; static int vti_tunnel_init(struct net_device *dev); static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type, bool update_skb_dev) { struct ip_tunnel *tunnel; const struct iphdr *iph = ip_hdr(skb); struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, vti_net_id); IP_TUNNEL_DECLARE_FLAGS(flags) = { }; __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr, iph->daddr, 0); if (tunnel) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; if (update_skb_dev) skb->dev = tunnel->dev; return xfrm_input(skb, nexthdr, spi, encap_type); } return -EINVAL; drop: kfree_skb(skb); return 0; } static int vti_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { return vti_input(skb, nexthdr, spi, encap_type, false); } static int vti_rcv(struct sk_buff *skb, __be32 spi, bool update_skb_dev) { XFRM_SPI_SKB_CB(skb)->family = AF_INET; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); return vti_input(skb, ip_hdr(skb)->protocol, spi, 0, update_skb_dev); } static int vti_rcv_proto(struct sk_buff *skb) { return vti_rcv(skb, 0, false); } static int vti_rcv_cb(struct sk_buff *skb, int err) { unsigned short family; struct net_device *dev; struct xfrm_state *x; const struct xfrm_mode *inner_mode; struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4; u32 orig_mark = skb->mark; int ret; if (!tunnel) return 1; dev = tunnel->dev; if (err) { DEV_STATS_INC(dev, rx_errors); DEV_STATS_INC(dev, rx_dropped); return 0; } x = xfrm_input_state(skb); inner_mode = &x->inner_mode; if (x->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); if (inner_mode == NULL) { XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINSTATEMODEERROR); return -EINVAL; } } family = inner_mode->family; skb->mark = be32_to_cpu(tunnel->parms.i_key); ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); skb->mark = orig_mark; if (!ret) return -EPERM; skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev))); skb->dev = dev; dev_sw_netstats_rx_add(dev, skb->len); return 0; } static bool vti_state_check(const struct xfrm_state *x, __be32 dst, __be32 src) { xfrm_address_t *daddr = (xfrm_address_t *)&dst; xfrm_address_t *saddr = (xfrm_address_t *)&src; /* if there is no transform then this tunnel is not functional. * Or if the xfrm is not mode tunnel. 
*/ if (!x || x->props.mode != XFRM_MODE_TUNNEL || x->props.family != AF_INET) return false; if (!dst) return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET); if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET)) return false; return true; } static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) { struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_parm_kern *parms = &tunnel->parms; struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; /* Device to other host */ int pkt_len = skb->len; int err; int mtu; if (!dst) { switch (skb->protocol) { case htons(ETH_P_IP): { struct rtable *rt; fl->u.ip4.flowi4_oif = dev->ifindex; fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4); if (IS_ERR(rt)) { DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } dst = &rt->dst; skb_dst_set(skb, dst); break; } #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): fl->u.ip6.flowi6_oif = dev->ifindex; fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6); if (dst->error) { dst_release(dst); dst = NULL; DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } skb_dst_set(skb, dst); break; #endif default: DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } } dst_hold(dst); dst = xfrm_lookup_route(tunnel->net, dst, fl, NULL, 0); if (IS_ERR(dst)) { DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } if (dst->flags & DST_XFRM_QUEUE) goto xmit; if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) { DEV_STATS_INC(dev, tx_carrier_errors); dst_release(dst); goto tx_error_icmp; } tdev = dst_dev(dst); if (tdev == dev) { dst_release(dst); DEV_STATS_INC(dev, collisions); goto tx_error; } mtu = dst_mtu(dst); if (skb->len > mtu) { skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IP)) { if (!(ip_hdr(skb)->frag_off & htons(IP_DF))) goto xmit; icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); } else { if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); } dst_release(dst); goto tx_error; } xmit: skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev))); skb_dst_set(skb, dst); skb->dev = skb_dst_dev(skb); err = dst_output(tunnel->net, skb->sk, skb); if (net_xmit_eval(err) == 0) err = pkt_len; iptunnel_xmit_stats(dev, err); return NETDEV_TX_OK; tx_error_icmp: dst_link_failure(skb); tx_error: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return NETDEV_TX_OK; } /* This function assumes it is being called from dev_queue_xmit() * and that skb is filled properly by that function. 
*/ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct flowi fl; if (!pskb_inet_may_pull(skb)) goto tx_err; memset(&fl, 0, sizeof(fl)); switch (skb->protocol) { case htons(ETH_P_IP): memset(IPCB(skb), 0, sizeof(*IPCB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET); break; case htons(ETH_P_IPV6): memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET6); break; default: goto tx_err; } /* override mark with tunnel output key */ fl.flowi_mark = be32_to_cpu(tunnel->parms.o_key); return vti_xmit(skb, dev, &fl); tx_err: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return NETDEV_TX_OK; } static int vti4_err(struct sk_buff *skb, u32 info) { __be32 spi; __u32 mark; struct xfrm_state *x; struct ip_tunnel *tunnel; struct ip_esp_hdr *esph; struct ip_auth_hdr *ah ; struct ip_comp_hdr *ipch; struct net *net = dev_net(skb->dev); const struct iphdr *iph = (const struct iphdr *)skb->data; int protocol = iph->protocol; struct ip_tunnel_net *itn = net_generic(net, vti_net_id); IP_TUNNEL_DECLARE_FLAGS(flags) = { }; __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->daddr, iph->saddr, 0); if (!tunnel) return -1; mark = be32_to_cpu(tunnel->parms.o_key); switch (protocol) { case IPPROTO_ESP: esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); spi = esph->spi; break; case IPPROTO_AH: ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); spi = ah->spi; break; case IPPROTO_COMP: ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); spi = htonl(ntohs(ipch->cpi)); break; default: return 0; } switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return 0; break; case ICMP_REDIRECT: break; default: return 0; } x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr, spi, protocol, AF_INET); if (!x) return 0; if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, protocol); else ipv4_redirect(skb, net, 0, protocol); xfrm_state_put(x); return 0; } static int vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) { IP_TUNNEL_DECLARE_FLAGS(flags) = { }; int err = 0; if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { if (p->iph.version != 4 || p->iph.protocol != IPPROTO_IPIP || p->iph.ihl != 5) return -EINVAL; } if (!ip_tunnel_flags_is_be16_compat(p->i_flags) || !ip_tunnel_flags_is_be16_compat(p->o_flags)) return -EOVERFLOW; if (!(ip_tunnel_flags_to_be16(p->i_flags) & GRE_KEY)) p->i_key = 0; if (!(ip_tunnel_flags_to_be16(p->o_flags) & GRE_KEY)) p->o_key = 0; __set_bit(IP_TUNNEL_VTI_BIT, flags); ip_tunnel_flags_copy(p->i_flags, flags); err = ip_tunnel_ctl(dev, p, cmd); if (err) return err; if (cmd != SIOCDELTUNNEL) { ip_tunnel_flags_from_be16(flags, GRE_KEY); ip_tunnel_flags_or(p->i_flags, p->i_flags, flags); ip_tunnel_flags_or(p->o_flags, p->o_flags, flags); } return 0; } static const struct net_device_ops vti_netdev_ops = { .ndo_init = vti_tunnel_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = vti_tunnel_xmit, .ndo_siocdevprivate = ip_tunnel_siocdevprivate, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = vti_tunnel_ctl, }; static void vti_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &vti_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->type = ARPHRD_TUNNEL; ip_tunnel_setup(dev, vti_net_id); } static int 
vti_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; __dev_addr_set(dev, &iph->saddr, 4); memcpy(dev->broadcast, &iph->daddr, 4); dev->flags = IFF_NOARP; dev->addr_len = 4; dev->lltx = true; netif_keep_dst(dev); return ip_tunnel_init(dev); } static void __net_init vti_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; iph->version = 4; iph->protocol = IPPROTO_IPIP; iph->ihl = 5; } static struct xfrm4_protocol vti_esp4_protocol __read_mostly = { .handler = vti_rcv_proto, .input_handler = vti_input_proto, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 100, }; static struct xfrm4_protocol vti_ah4_protocol __read_mostly = { .handler = vti_rcv_proto, .input_handler = vti_input_proto, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 100, }; static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = { .handler = vti_rcv_proto, .input_handler = vti_input_proto, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 100, }; #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) static int vti_rcv_tunnel(struct sk_buff *skb) { XFRM_SPI_SKB_CB(skb)->family = AF_INET; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); return vti_input(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr, 0, false); } static struct xfrm_tunnel vti_ipip_handler __read_mostly = { .handler = vti_rcv_tunnel, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 0, }; #if IS_ENABLED(CONFIG_IPV6) static struct xfrm_tunnel vti_ipip6_handler __read_mostly = { .handler = vti_rcv_tunnel, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 0, }; #endif #endif static int __net_init vti_init_net(struct net *net) { int err; struct ip_tunnel_net *itn; err = ip_tunnel_init_net(net, vti_net_id, &vti_link_ops, "ip_vti0"); if (err) return err; itn = net_generic(net, vti_net_id); if (itn->fb_tunnel_dev) vti_fb_tunnel_init(itn->fb_tunnel_dev); return 0; } static void __net_exit vti_exit_rtnl(struct net *net, struct list_head *dev_to_kill) { ip_tunnel_delete_net(net, vti_net_id, &vti_link_ops, dev_to_kill); } static struct pernet_operations vti_net_ops = { .init = vti_init_net, .exit_rtnl = vti_exit_rtnl, .id = &vti_net_id, .size = sizeof(struct ip_tunnel_net), }; static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { return 0; } static void vti_netlink_parms(struct nlattr *data[], struct ip_tunnel_parm_kern *parms, __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); parms->iph.protocol = IPPROTO_IPIP; if (!data) return; __set_bit(IP_TUNNEL_VTI_BIT, parms->i_flags); if (data[IFLA_VTI_LINK]) parms->link = nla_get_u32(data[IFLA_VTI_LINK]); if (data[IFLA_VTI_IKEY]) parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]); if (data[IFLA_VTI_OKEY]) parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]); if (data[IFLA_VTI_LOCAL]) parms->iph.saddr = nla_get_in_addr(data[IFLA_VTI_LOCAL]); if (data[IFLA_VTI_REMOTE]) parms->iph.daddr = nla_get_in_addr(data[IFLA_VTI_REMOTE]); if (data[IFLA_VTI_FWMARK]) *fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]); } static int vti_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct nlattr **data = params->data; struct ip_tunnel_parm_kern parms; struct nlattr **tb = params->tb; __u32 fwmark = 0; vti_netlink_parms(data, &parms, &fwmark); return ip_tunnel_newlink(params->link_net ? 
: dev_net(dev), dev, tb, &parms, fwmark); } static int vti_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm_kern p; __u32 fwmark = t->fwmark; vti_netlink_parms(data, &p, &fwmark); return ip_tunnel_changelink(dev, tb, &p, fwmark); } static size_t vti_get_size(const struct net_device *dev) { return /* IFLA_VTI_LINK */ nla_total_size(4) + /* IFLA_VTI_IKEY */ nla_total_size(4) + /* IFLA_VTI_OKEY */ nla_total_size(4) + /* IFLA_VTI_LOCAL */ nla_total_size(4) + /* IFLA_VTI_REMOTE */ nla_total_size(4) + /* IFLA_VTI_FWMARK */ nla_total_size(4) + 0; } static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm_kern *p = &t->parms; if (nla_put_u32(skb, IFLA_VTI_LINK, p->link) || nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key) || nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key) || nla_put_in_addr(skb, IFLA_VTI_LOCAL, p->iph.saddr) || nla_put_in_addr(skb, IFLA_VTI_REMOTE, p->iph.daddr) || nla_put_u32(skb, IFLA_VTI_FWMARK, t->fwmark)) return -EMSGSIZE; return 0; } static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = { [IFLA_VTI_LINK] = { .type = NLA_U32 }, [IFLA_VTI_IKEY] = { .type = NLA_U32 }, [IFLA_VTI_OKEY] = { .type = NLA_U32 }, [IFLA_VTI_LOCAL] = { .len = sizeof_field(struct iphdr, saddr) }, [IFLA_VTI_REMOTE] = { .len = sizeof_field(struct iphdr, daddr) }, [IFLA_VTI_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops vti_link_ops __read_mostly = { .kind = "vti", .maxtype = IFLA_VTI_MAX, .policy = vti_policy, .priv_size = sizeof(struct ip_tunnel), .setup = vti_tunnel_setup, .validate = vti_tunnel_validate, .newlink = vti_newlink, .changelink = vti_changelink, .dellink = ip_tunnel_dellink, .get_size = vti_get_size, .fill_info = vti_fill_info, .get_link_net = ip_tunnel_get_link_net, }; static int __init vti_init(void) { const char *msg; int err; pr_info("IPv4 over IPsec tunneling driver\n"); msg = "tunnel device"; err = register_pernet_device(&vti_net_ops); if (err < 0) goto pernet_dev_failed; msg = "tunnel protocols"; err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP); if (err < 0) goto xfrm_proto_esp_failed; err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH); if (err < 0) goto xfrm_proto_ah_failed; err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) msg = "ipip tunnel"; err = xfrm4_tunnel_register(&vti_ipip_handler, AF_INET); if (err < 0) goto xfrm_tunnel_ipip_failed; #if IS_ENABLED(CONFIG_IPV6) err = xfrm4_tunnel_register(&vti_ipip6_handler, AF_INET6); if (err < 0) goto xfrm_tunnel_ipip6_failed; #endif #endif msg = "netlink interface"; err = rtnl_link_register(&vti_link_ops); if (err < 0) goto rtnl_link_failed; return err; rtnl_link_failed: #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) #if IS_ENABLED(CONFIG_IPV6) xfrm4_tunnel_deregister(&vti_ipip6_handler, AF_INET6); xfrm_tunnel_ipip6_failed: #endif xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET); xfrm_tunnel_ipip_failed: #endif xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); xfrm_proto_comp_failed: xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); xfrm_proto_ah_failed: xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); xfrm_proto_esp_failed: unregister_pernet_device(&vti_net_ops); pernet_dev_failed: pr_err("vti init: failed to register %s\n", msg); return err; } static void __exit 
vti_fini(void)
{
        rtnl_link_unregister(&vti_link_ops);
#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
#if IS_ENABLED(CONFIG_IPV6)
        xfrm4_tunnel_deregister(&vti_ipip6_handler, AF_INET6);
#endif
        xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET);
#endif
        xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
        xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
        xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
        unregister_pernet_device(&vti_net_ops);
}

module_init(vti_init);
module_exit(vti_fini);
MODULE_DESCRIPTION("Virtual (secure) IP tunneling library");
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("vti");
MODULE_ALIAS_NETDEV("ip_vti0");
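vti_init() registers the pernet device, the ESP/AH/IPCOMP protocol handlers and the rtnl link ops, and on any failure unwinds the earlier registrations in reverse order through a goto ladder, as vti_fini() does unconditionally. Below is a minimal standalone sketch of that unwind pattern; register_a(), register_b() and unregister_a() are hypothetical stand-ins for the real registration calls.

#include <stdio.h>

static int register_a(void)   { puts("register A");   return 0;  }
static int register_b(void)   { puts("register B");   return -1; } /* simulate failure */
static void unregister_a(void) { puts("unregister A"); }

static int fake_init(void)
{
        int err;

        err = register_a();
        if (err)
                goto a_failed;
        err = register_b();
        if (err)
                goto b_failed;
        return 0;

b_failed:
        unregister_a();         /* undo earlier steps in reverse order */
a_failed:
        return err;
}

int main(void)
{
        printf("init returned %d\n", fake_init());
        return 0;
}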
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved.
* Authors: David Chinner and Glauber Costa * * Generic LRU infrastructure */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/list_lru.h> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/memcontrol.h> #include "slab.h" #include "internal.h" #ifdef CONFIG_MEMCG static LIST_HEAD(memcg_list_lrus); static DEFINE_MUTEX(list_lrus_mutex); static inline bool list_lru_memcg_aware(struct list_lru *lru) { return lru->memcg_aware; } static void list_lru_register(struct list_lru *lru) { if (!list_lru_memcg_aware(lru)) return; mutex_lock(&list_lrus_mutex); list_add(&lru->list, &memcg_list_lrus); mutex_unlock(&list_lrus_mutex); } static void list_lru_unregister(struct list_lru *lru) { if (!list_lru_memcg_aware(lru)) return; mutex_lock(&list_lrus_mutex); list_del(&lru->list); mutex_unlock(&list_lrus_mutex); } static int lru_shrinker_id(struct list_lru *lru) { return lru->shrinker_id; } static inline struct list_lru_one * list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) { if (list_lru_memcg_aware(lru) && idx >= 0) { struct list_lru_memcg *mlru = xa_load(&lru->xa, idx); return mlru ? &mlru->node[nid] : NULL; } return &lru->node[nid].lru; } static inline bool lock_list_lru(struct list_lru_one *l, bool irq) { if (irq) spin_lock_irq(&l->lock); else spin_lock(&l->lock); if (unlikely(READ_ONCE(l->nr_items) == LONG_MIN)) { if (irq) spin_unlock_irq(&l->lock); else spin_unlock(&l->lock); return false; } return true; } static inline struct list_lru_one * lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, bool irq, bool skip_empty) { struct list_lru_one *l; rcu_read_lock(); again: l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); if (likely(l) && lock_list_lru(l, irq)) { rcu_read_unlock(); return l; } /* * Caller may simply bail out if raced with reparenting or * may iterate through the list_lru and expect empty slots. */ if (skip_empty) { rcu_read_unlock(); return NULL; } VM_WARN_ON(!css_is_dying(&memcg->css)); memcg = parent_mem_cgroup(memcg); goto again; } static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) { if (irq_off) spin_unlock_irq(&l->lock); else spin_unlock(&l->lock); } #else static void list_lru_register(struct list_lru *lru) { } static void list_lru_unregister(struct list_lru *lru) { } static int lru_shrinker_id(struct list_lru *lru) { return -1; } static inline bool list_lru_memcg_aware(struct list_lru *lru) { return false; } static inline struct list_lru_one * list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) { return &lru->node[nid].lru; } static inline struct list_lru_one * lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, bool irq, bool skip_empty) { struct list_lru_one *l = &lru->node[nid].lru; if (irq) spin_lock_irq(&l->lock); else spin_lock(&l->lock); return l; } static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) { if (irq_off) spin_unlock_irq(&l->lock); else spin_unlock(&l->lock); } #endif /* CONFIG_MEMCG */ /* The caller must ensure the memcg lifetime. 
*/ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; l = lock_list_lru_of_memcg(lru, nid, memcg, false, false); if (!l) return false; if (list_empty(item)) { list_add_tail(item, &l->list); /* Set shrinker bit if the first element was added */ if (!l->nr_items++) set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); unlock_list_lru(l, false); atomic_long_inc(&nlru->nr_items); return true; } unlock_list_lru(l, false); return false; } EXPORT_SYMBOL_GPL(list_lru_add); bool list_lru_add_obj(struct list_lru *lru, struct list_head *item) { bool ret; int nid = page_to_nid(virt_to_page(item)); if (list_lru_memcg_aware(lru)) { rcu_read_lock(); ret = list_lru_add(lru, item, nid, mem_cgroup_from_virt(item)); rcu_read_unlock(); } else { ret = list_lru_add(lru, item, nid, NULL); } return ret; } EXPORT_SYMBOL_GPL(list_lru_add_obj); /* The caller must ensure the memcg lifetime. */ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; l = lock_list_lru_of_memcg(lru, nid, memcg, false, false); if (!l) return false; if (!list_empty(item)) { list_del_init(item); l->nr_items--; unlock_list_lru(l, false); atomic_long_dec(&nlru->nr_items); return true; } unlock_list_lru(l, false); return false; } bool list_lru_del_obj(struct list_lru *lru, struct list_head *item) { bool ret; int nid = page_to_nid(virt_to_page(item)); if (list_lru_memcg_aware(lru)) { rcu_read_lock(); ret = list_lru_del(lru, item, nid, mem_cgroup_from_virt(item)); rcu_read_unlock(); } else { ret = list_lru_del(lru, item, nid, NULL); } return ret; } EXPORT_SYMBOL_GPL(list_lru_del_obj); void list_lru_isolate(struct list_lru_one *list, struct list_head *item) { list_del_init(item); list->nr_items--; } EXPORT_SYMBOL_GPL(list_lru_isolate); void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, struct list_head *head) { list_move(item, head); list->nr_items--; } EXPORT_SYMBOL_GPL(list_lru_isolate_move); unsigned long list_lru_count_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg) { struct list_lru_one *l; long count; rcu_read_lock(); l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); count = l ? READ_ONCE(l->nr_items) : 0; rcu_read_unlock(); if (unlikely(count < 0)) count = 0; return count; } EXPORT_SYMBOL_GPL(list_lru_count_one); unsigned long list_lru_count_node(struct list_lru *lru, int nid) { struct list_lru_node *nlru; nlru = &lru->node[nid]; return atomic_long_read(&nlru->nr_items); } EXPORT_SYMBOL_GPL(list_lru_count_node); static unsigned long __list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk, bool irq_off) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l = NULL; struct list_head *item, *n; unsigned long isolated = 0; restart: l = lock_list_lru_of_memcg(lru, nid, memcg, irq_off, true); if (!l) return isolated; list_for_each_safe(item, n, &l->list) { enum lru_status ret; /* * decrement nr_to_walk first so that we don't livelock if we * get stuck on large numbers of LRU_RETRY items */ if (!*nr_to_walk) break; --*nr_to_walk; ret = isolate(item, l, cb_arg); switch (ret) { /* * LRU_RETRY, LRU_REMOVED_RETRY and LRU_STOP will drop the lru * lock. List traversal will have to restart from scratch. 
*/ case LRU_RETRY: goto restart; case LRU_REMOVED_RETRY: fallthrough; case LRU_REMOVED: isolated++; atomic_long_dec(&nlru->nr_items); if (ret == LRU_REMOVED_RETRY) goto restart; break; case LRU_ROTATE: list_move_tail(item, &l->list); break; case LRU_SKIP: break; case LRU_STOP: goto out; default: BUG(); } } unlock_list_lru(l, irq_off); out: return isolated; } unsigned long list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { return __list_lru_walk_one(lru, nid, memcg, isolate, cb_arg, nr_to_walk, false); } EXPORT_SYMBOL_GPL(list_lru_walk_one); unsigned long list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { return __list_lru_walk_one(lru, nid, memcg, isolate, cb_arg, nr_to_walk, true); } unsigned long list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { long isolated = 0; isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg, nr_to_walk); #ifdef CONFIG_MEMCG if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { struct list_lru_memcg *mlru; struct mem_cgroup *memcg; unsigned long index; xa_for_each(&lru->xa, index, mlru) { rcu_read_lock(); memcg = mem_cgroup_from_private_id(index); if (!mem_cgroup_tryget(memcg)) { rcu_read_unlock(); continue; } rcu_read_unlock(); isolated += __list_lru_walk_one(lru, nid, memcg, isolate, cb_arg, nr_to_walk, false); mem_cgroup_put(memcg); if (*nr_to_walk <= 0) break; } } #endif return isolated; } EXPORT_SYMBOL_GPL(list_lru_walk_node); static void init_one_lru(struct list_lru *lru, struct list_lru_one *l) { INIT_LIST_HEAD(&l->list); spin_lock_init(&l->lock); l->nr_items = 0; #ifdef CONFIG_LOCKDEP if (lru->key) lockdep_set_class(&l->lock, lru->key); #endif } #ifdef CONFIG_MEMCG static struct list_lru_memcg *memcg_init_list_lru_one(struct list_lru *lru, gfp_t gfp) { int nid; struct list_lru_memcg *mlru; mlru = kmalloc_flex(*mlru, node, nr_node_ids, gfp); if (!mlru) return NULL; for_each_node(nid) init_one_lru(lru, &mlru->node[nid]); return mlru; } static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { if (memcg_aware) xa_init_flags(&lru->xa, XA_FLAGS_LOCK_IRQ); lru->memcg_aware = memcg_aware; } static void memcg_destroy_list_lru(struct list_lru *lru) { XA_STATE(xas, &lru->xa, 0); struct list_lru_memcg *mlru; if (!list_lru_memcg_aware(lru)) return; xas_lock_irq(&xas); xas_for_each(&xas, mlru, ULONG_MAX) { kfree(mlru); xas_store(&xas, NULL); } xas_unlock_irq(&xas); } static void memcg_reparent_list_lru_one(struct list_lru *lru, int nid, struct list_lru_one *src, struct mem_cgroup *dst_memcg) { int dst_idx = dst_memcg->kmemcg_id; struct list_lru_one *dst; spin_lock_irq(&src->lock); dst = list_lru_from_memcg_idx(lru, nid, dst_idx); spin_lock_nested(&dst->lock, SINGLE_DEPTH_NESTING); list_splice_init(&src->list, &dst->list); if (src->nr_items) { WARN_ON(src->nr_items < 0); dst->nr_items += src->nr_items; set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); } /* Mark the list_lru_one dead */ src->nr_items = LONG_MIN; spin_unlock(&dst->lock); spin_unlock_irq(&src->lock); } void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent) { struct list_lru *lru; int i; mutex_lock(&list_lrus_mutex); list_for_each_entry(lru, &memcg_list_lrus, list) { struct list_lru_memcg *mlru; XA_STATE(xas, &lru->xa, memcg->kmemcg_id); /* * Lock the Xarray to ensure no on going 
list_lru_memcg * allocation and further allocation will see css_is_dying(). */ xas_lock_irq(&xas); mlru = xas_store(&xas, NULL); xas_unlock_irq(&xas); if (!mlru) continue; /* * With Xarray value set to NULL, holding the lru lock below * prevents list_lru_{add,del,isolate} from touching the lru, * safe to reparent. */ for_each_node(i) memcg_reparent_list_lru_one(lru, i, &mlru->node[i], parent); /* * Here all list_lrus corresponding to the cgroup are guaranteed * to remain empty, we can safely free this lru, any further * memcg_list_lru_alloc() call will simply bail out. */ kvfree_rcu(mlru, rcu); } mutex_unlock(&list_lrus_mutex); } static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg, struct list_lru *lru) { int idx = memcg->kmemcg_id; return idx < 0 || xa_load(&lru->xa, idx); } int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, gfp_t gfp) { unsigned long flags; struct list_lru_memcg *mlru = NULL; struct mem_cgroup *pos, *parent; XA_STATE(xas, &lru->xa, 0); if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru)) return 0; gfp &= GFP_RECLAIM_MASK; /* * Because the list_lru can be reparented to the parent cgroup's * list_lru, we should make sure that this cgroup and all its * ancestors have allocated list_lru_memcg. */ do { /* * Keep finding the farest parent that wasn't populated * until found memcg itself. */ pos = memcg; parent = parent_mem_cgroup(pos); while (!memcg_list_lru_allocated(parent, lru)) { pos = parent; parent = parent_mem_cgroup(pos); } if (!mlru) { mlru = memcg_init_list_lru_one(lru, gfp); if (!mlru) return -ENOMEM; } xas_set(&xas, pos->kmemcg_id); do { xas_lock_irqsave(&xas, flags); if (!xas_load(&xas) && !css_is_dying(&pos->css)) { xas_store(&xas, mlru); if (!xas_error(&xas)) mlru = NULL; } xas_unlock_irqrestore(&xas, flags); } while (xas_nomem(&xas, gfp)); } while (pos != memcg && !css_is_dying(&pos->css)); if (unlikely(mlru)) kfree(mlru); return xas_error(&xas); } #else static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { } static void memcg_destroy_list_lru(struct list_lru *lru) { } #endif /* CONFIG_MEMCG */ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct shrinker *shrinker) { int i; #ifdef CONFIG_MEMCG if (shrinker) lru->shrinker_id = shrinker->id; else lru->shrinker_id = -1; if (mem_cgroup_kmem_disabled()) memcg_aware = false; #endif lru->node = kzalloc_objs(*lru->node, nr_node_ids); if (!lru->node) return -ENOMEM; for_each_node(i) init_one_lru(lru, &lru->node[i].lru); memcg_init_list_lru(lru, memcg_aware); list_lru_register(lru); return 0; } EXPORT_SYMBOL_GPL(__list_lru_init); void list_lru_destroy(struct list_lru *lru) { /* Already destroyed or not yet initialized? */ if (!lru->node) return; list_lru_unregister(lru); memcg_destroy_list_lru(lru); kfree(lru->node); lru->node = NULL; #ifdef CONFIG_MEMCG lru->shrinker_id = -1; #endif } EXPORT_SYMBOL_GPL(list_lru_destroy); |
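list_lru_add() and list_lru_del() key off whether the item's list_head is currently linked (list_empty()), which makes both calls idempotent and keeps nr_items consistent. The following standalone userspace sketch shows that contract with a minimal re-implementation of the circular list_head primitives; it is not the kernel's list.h, and the per-node/memcg lookup and locking are omitted.

#include <stdbool.h>
#include <stdio.h>

struct list_head { struct list_head *prev, *next; };

static void INIT_LIST_HEAD(struct list_head *h) { h->prev = h->next = h; }
static bool list_empty(const struct list_head *h) { return h->next == h; }

static void list_add_tail(struct list_head *item, struct list_head *head)
{
        item->prev = head->prev;
        item->next = head;
        head->prev->next = item;
        head->prev = item;
}

static void list_del_init(struct list_head *item)
{
        item->prev->next = item->next;
        item->next->prev = item->prev;
        INIT_LIST_HEAD(item);
}

struct tiny_lru { struct list_head list; long nr_items; };

static bool tiny_lru_add(struct tiny_lru *lru, struct list_head *item)
{
        if (!list_empty(item))
                return false;           /* already on a list: no-op */
        list_add_tail(item, &lru->list);
        lru->nr_items++;
        return true;
}

static bool tiny_lru_del(struct tiny_lru *lru, struct list_head *item)
{
        if (list_empty(item))
                return false;           /* not on a list: no-op */
        list_del_init(item);
        lru->nr_items--;
        return true;
}

int main(void)
{
        struct tiny_lru lru;
        struct list_head obj;

        INIT_LIST_HEAD(&lru.list);
        lru.nr_items = 0;
        INIT_LIST_HEAD(&obj);

        printf("add: %d (nr_items=%ld)\n", tiny_lru_add(&lru, &obj), lru.nr_items); /* 1, 1 */
        printf("add: %d (nr_items=%ld)\n", tiny_lru_add(&lru, &obj), lru.nr_items); /* 0, 1 */
        printf("del: %d (nr_items=%ld)\n", tiny_lru_del(&lru, &obj), lru.nr_items); /* 1, 0 */
        return 0;
}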
| 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Integer base 2 logarithm calculation * * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _LINUX_LOG2_H #define _LINUX_LOG2_H #include <linux/types.h> #include <linux/bitops.h> /* * non-constant log of base 2 calculators * - the arch may override these in asm/bitops.h if they can be implemented * more efficiently than using fls() and fls64() * - the arch is not required to handle n==0 if implementing the fallback */ #ifndef CONFIG_ARCH_HAS_ILOG2_U32 static __always_inline __attribute__((const)) int __ilog2_u32(u32 n) { return fls(n) - 1; } #endif #ifndef CONFIG_ARCH_HAS_ILOG2_U64 static __always_inline __attribute__((const)) int __ilog2_u64(u64 n) { return fls64(n) - 1; } #endif /** * is_power_of_2() - check if a value is a power of two * @n: the value to check * * Determine whether some value is a power of two, where zero is * *not* considered a power of two. * Return: true if @n is a power of 2, otherwise false. */ static __always_inline __attribute__((const)) bool is_power_of_2(unsigned long n) { return n - 1 < (n ^ (n - 1)); } /** * __roundup_pow_of_two() - round up to nearest power of two * @n: value to round up */ static inline __attribute__((const)) unsigned long __roundup_pow_of_two(unsigned long n) { return 1UL << fls_long(n - 1); } /** * __rounddown_pow_of_two() - round down to nearest power of two * @n: value to round down */ static inline __attribute__((const)) unsigned long __rounddown_pow_of_two(unsigned long n) { return 1UL << (fls_long(n) - 1); } /** * const_ilog2 - log base 2 of 32-bit or a 64-bit constant unsigned value * @n: parameter * * Use this where sparse expects a true constant expression, e.g. for array * indices. */ #define const_ilog2(n) \ ( \ __builtin_constant_p(n) ? ( \ (n) < 2 ? 0 : \ (n) & (1ULL << 63) ? 63 : \ (n) & (1ULL << 62) ? 62 : \ (n) & (1ULL << 61) ? 61 : \ (n) & (1ULL << 60) ? 60 : \ (n) & (1ULL << 59) ? 59 : \ (n) & (1ULL << 58) ? 58 : \ (n) & (1ULL << 57) ? 57 : \ (n) & (1ULL << 56) ? 56 : \ (n) & (1ULL << 55) ? 55 : \ (n) & (1ULL << 54) ? 54 : \ (n) & (1ULL << 53) ? 53 : \ (n) & (1ULL << 52) ? 52 : \ (n) & (1ULL << 51) ? 51 : \ (n) & (1ULL << 50) ? 50 : \ (n) & (1ULL << 49) ? 49 : \ (n) & (1ULL << 48) ? 48 : \ (n) & (1ULL << 47) ? 47 : \ (n) & (1ULL << 46) ? 46 : \ (n) & (1ULL << 45) ? 45 : \ (n) & (1ULL << 44) ? 44 : \ (n) & (1ULL << 43) ? 43 : \ (n) & (1ULL << 42) ? 42 : \ (n) & (1ULL << 41) ? 41 : \ (n) & (1ULL << 40) ? 40 : \ (n) & (1ULL << 39) ? 
39 : \ (n) & (1ULL << 38) ? 38 : \ (n) & (1ULL << 37) ? 37 : \ (n) & (1ULL << 36) ? 36 : \ (n) & (1ULL << 35) ? 35 : \ (n) & (1ULL << 34) ? 34 : \ (n) & (1ULL << 33) ? 33 : \ (n) & (1ULL << 32) ? 32 : \ (n) & (1ULL << 31) ? 31 : \ (n) & (1ULL << 30) ? 30 : \ (n) & (1ULL << 29) ? 29 : \ (n) & (1ULL << 28) ? 28 : \ (n) & (1ULL << 27) ? 27 : \ (n) & (1ULL << 26) ? 26 : \ (n) & (1ULL << 25) ? 25 : \ (n) & (1ULL << 24) ? 24 : \ (n) & (1ULL << 23) ? 23 : \ (n) & (1ULL << 22) ? 22 : \ (n) & (1ULL << 21) ? 21 : \ (n) & (1ULL << 20) ? 20 : \ (n) & (1ULL << 19) ? 19 : \ (n) & (1ULL << 18) ? 18 : \ (n) & (1ULL << 17) ? 17 : \ (n) & (1ULL << 16) ? 16 : \ (n) & (1ULL << 15) ? 15 : \ (n) & (1ULL << 14) ? 14 : \ (n) & (1ULL << 13) ? 13 : \ (n) & (1ULL << 12) ? 12 : \ (n) & (1ULL << 11) ? 11 : \ (n) & (1ULL << 10) ? 10 : \ (n) & (1ULL << 9) ? 9 : \ (n) & (1ULL << 8) ? 8 : \ (n) & (1ULL << 7) ? 7 : \ (n) & (1ULL << 6) ? 6 : \ (n) & (1ULL << 5) ? 5 : \ (n) & (1ULL << 4) ? 4 : \ (n) & (1ULL << 3) ? 3 : \ (n) & (1ULL << 2) ? 2 : \ 1) : \ -1) /** * ilog2 - log base 2 of 32-bit or a 64-bit unsigned value * @n: parameter * * constant-capable log of base 2 calculation * - this can be used to initialise global variables from constant data, hence * the massive ternary operator construction * * selects the appropriately-sized optimised version depending on sizeof(n) */ #define ilog2(n) \ ( \ __builtin_constant_p(n) ? \ ((n) < 2 ? 0 : \ 63 - __builtin_clzll(n)) : \ (sizeof(n) <= 4) ? \ __ilog2_u32(n) : \ __ilog2_u64(n) \ ) /** * roundup_pow_of_two - round the given value up to nearest power of two * @n: parameter * * round the given value up to the nearest power of two * - the result is undefined when n == 0 * - this can be used to initialise global variables from constant data */ #define roundup_pow_of_two(n) \ ( \ __builtin_constant_p(n) ? ( \ ((n) == 1) ? 1 : \ (1UL << (ilog2((n) - 1) + 1)) \ ) : \ __roundup_pow_of_two(n) \ ) /** * rounddown_pow_of_two - round the given value down to nearest power of two * @n: parameter * * round the given value down to the nearest power of two * - the result is undefined when n == 0 * - this can be used to initialise global variables from constant data */ #define rounddown_pow_of_two(n) \ ( \ __builtin_constant_p(n) ? ( \ (1UL << ilog2(n))) : \ __rounddown_pow_of_two(n) \ ) static inline __attribute_const__ int __order_base_2(unsigned long n) { return n > 1 ? ilog2(n - 1) + 1 : 0; } /** * order_base_2 - calculate the (rounded up) base 2 order of the argument * @n: parameter * * The first few values calculated by this routine: * ob2(0) = 0 * ob2(1) = 0 * ob2(2) = 1 * ob2(3) = 2 * ob2(4) = 2 * ob2(5) = 3 * ... and so on. */ #define order_base_2(n) \ ( \ __builtin_constant_p(n) ? ( \ ((n) == 0 || (n) == 1) ? 0 : \ ilog2((n) - 1) + 1) : \ __order_base_2(n) \ ) static inline __attribute__((const)) int __bits_per(unsigned long n) { if (n < 2) return 1; if (is_power_of_2(n)) return order_base_2(n) + 1; return order_base_2(n); } /** * bits_per - calculate the number of bits required for the argument * @n: parameter * * This is constant-capable and can be used for compile time * initializations, e.g bitfields. * * The first few values calculated by this routine: * bf(0) = 1 * bf(1) = 1 * bf(2) = 2 * bf(3) = 2 * bf(4) = 3 * ... and so on. */ #define bits_per(n) \ ( \ __builtin_constant_p(n) ? ( \ ((n) == 0 || (n) == 1) \ ? 
		1 : ilog2(n) + 1	\
	) :				\
	__bits_per(n)			\
)

/**
 * max_pow_of_two_factor - return highest power-of-2 factor
 * @n: parameter
 *
 * find highest power-of-2 which is evenly divisible into n
 * (i.e. the lowest set bit of @n). 0 is returned for n == 0;
 * for n == 1 the result is 1.
 */
static inline __attribute__((const))
unsigned int max_pow_of_two_factor(unsigned int n)
{
	return n & -n;
}

#endif /* _LINUX_LOG2_H */
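/*
 * Editor's illustrative sketch (not part of the original header): the
 * helpers above are constant-capable, so their documented results can be
 * checked at compile time.  Values follow the tables in the comments above.
 */
#include <linux/build_bug.h>

static inline void log2_examples(void)
{
	BUILD_BUG_ON(ilog2(4096) != 12);
	BUILD_BUG_ON(roundup_pow_of_two(100) != 128);
	BUILD_BUG_ON(rounddown_pow_of_two(100) != 64);
	BUILD_BUG_ON(order_base_2(5) != 3);		/* ob2(5) = 3 */
	BUILD_BUG_ON(bits_per(4) != 3);			/* bf(4) = 3 */
	BUILD_BUG_ON(max_pow_of_two_factor(24) != 8);	/* 24 = 8 * 3 */
	BUILD_BUG_ON(!is_power_of_2(64));
}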
/*
 * linux/fs/hfs/mdb.c
 *
 * Copyright (C) 1995-1997 Paul H. Hargrove
 * (C) 2003 Ardis Technologies <roman@ardistech.com>
 * This file may be distributed under the terms of the GNU General Public License.
 *
 * This file contains functions for reading/writing the MDB.
 */

#include <linux/cdrom.h>
#include <linux/blkdev.h>
#include <linux/nls.h>
#include <linux/slab.h>

#include "hfs_fs.h"
#include "btree.h"

/*================ File-local data types ================*/

/*
 * The HFS Master Directory Block (MDB).
 *
 * Also known as the Volume Information Block (VIB), this structure is
 * the HFS equivalent of a superblock.
* * Reference: _Inside Macintosh: Files_ pages 2-59 through 2-62 * * modified for HFS Extended */ static int hfs_get_last_session(struct super_block *sb, sector_t *start, sector_t *size) { struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk); /* default values */ *start = 0; *size = bdev_nr_sectors(sb->s_bdev); if (HFS_SB(sb)->session >= 0) { struct cdrom_tocentry te; if (!cdi) return -EINVAL; te.cdte_track = HFS_SB(sb)->session; te.cdte_format = CDROM_LBA; if (cdrom_read_tocentry(cdi, &te) || (te.cdte_ctrl & CDROM_DATA_TRACK) != 4) { pr_err("invalid session number or type of track\n"); return -EINVAL; } *start = (sector_t)te.cdte_addr.lba << 2; } else if (cdi) { struct cdrom_multisession ms_info; ms_info.addr_format = CDROM_LBA; if (cdrom_multisession(cdi, &ms_info) == 0 && ms_info.xa_flag) *start = (sector_t)ms_info.addr.lba << 2; } return 0; } bool is_hfs_cnid_counts_valid(struct super_block *sb) { struct hfs_sb_info *sbi = HFS_SB(sb); bool corrupted = false; if (unlikely(atomic64_read(&sbi->next_id) > U32_MAX)) { pr_warn("next CNID exceeds limit\n"); corrupted = true; } if (unlikely(atomic64_read(&sbi->file_count) > U32_MAX)) { pr_warn("file count exceeds limit\n"); corrupted = true; } if (unlikely(atomic64_read(&sbi->folder_count) > U32_MAX)) { pr_warn("folder count exceeds limit\n"); corrupted = true; } return !corrupted; } /* * hfs_mdb_get() * * Build the in-core MDB for a filesystem, including * the B-trees and the volume bitmap. */ int hfs_mdb_get(struct super_block *sb) { struct buffer_head *bh; struct hfs_mdb *mdb, *mdb2; unsigned int block; char *ptr; int off2, len, size, sect; sector_t part_start, part_size; loff_t off; __be16 attrib; /* set the device driver to 512-byte blocks */ size = sb_min_blocksize(sb, HFS_SECTOR_SIZE); if (!size) return -EINVAL; if (hfs_get_last_session(sb, &part_start, &part_size)) return -EINVAL; while (1) { /* See if this is an HFS filesystem */ bh = sb_bread512(sb, part_start + HFS_MDB_BLK, mdb); if (!bh) return -EIO; if (mdb->drSigWord == cpu_to_be16(HFS_SUPER_MAGIC)) break; brelse(bh); /* check for a partition block * (should do this only for cdrom/loop though) */ if (hfs_part_find(sb, &part_start, &part_size)) return -EIO; } HFS_SB(sb)->alloc_blksz = size = be32_to_cpu(mdb->drAlBlkSiz); if (!size || (size & (HFS_SECTOR_SIZE - 1))) { pr_err("bad allocation block size %d\n", size); brelse(bh); return -EIO; } size = min(HFS_SB(sb)->alloc_blksz, (u32)PAGE_SIZE); /* size must be a multiple of 512 */ while (size & (size - 1)) size -= HFS_SECTOR_SIZE; sect = be16_to_cpu(mdb->drAlBlSt) + part_start; /* align block size to first sector */ while (sect & ((size - 1) >> HFS_SECTOR_SIZE_BITS)) size >>= 1; /* align block size to weird alloc size */ while (HFS_SB(sb)->alloc_blksz & (size - 1)) size >>= 1; brelse(bh); if (!sb_set_blocksize(sb, size)) { pr_err("unable to set blocksize to %u\n", size); return -EIO; } bh = sb_bread512(sb, part_start + HFS_MDB_BLK, mdb); if (!bh) return -EIO; if (mdb->drSigWord != cpu_to_be16(HFS_SUPER_MAGIC)) { brelse(bh); return -EIO; } HFS_SB(sb)->mdb_bh = bh; HFS_SB(sb)->mdb = mdb; /* These parameters are read from the MDB, and never written */ HFS_SB(sb)->part_start = part_start; HFS_SB(sb)->fs_ablocks = be16_to_cpu(mdb->drNmAlBlks); HFS_SB(sb)->fs_div = HFS_SB(sb)->alloc_blksz >> sb->s_blocksize_bits; HFS_SB(sb)->clumpablks = be32_to_cpu(mdb->drClpSiz) / HFS_SB(sb)->alloc_blksz; if (!HFS_SB(sb)->clumpablks) HFS_SB(sb)->clumpablks = 1; HFS_SB(sb)->fs_start = (be16_to_cpu(mdb->drAlBlSt) + part_start) >> 
(sb->s_blocksize_bits - HFS_SECTOR_SIZE_BITS); /* These parameters are read from and written to the MDB */ HFS_SB(sb)->free_ablocks = be16_to_cpu(mdb->drFreeBks); atomic64_set(&HFS_SB(sb)->next_id, be32_to_cpu(mdb->drNxtCNID)); HFS_SB(sb)->root_files = be16_to_cpu(mdb->drNmFls); HFS_SB(sb)->root_dirs = be16_to_cpu(mdb->drNmRtDirs); atomic64_set(&HFS_SB(sb)->file_count, be32_to_cpu(mdb->drFilCnt)); atomic64_set(&HFS_SB(sb)->folder_count, be32_to_cpu(mdb->drDirCnt)); if (!is_hfs_cnid_counts_valid(sb)) { pr_warn("filesystem possibly corrupted, running fsck.hfs is recommended. Mounting read-only.\n"); sb->s_flags |= SB_RDONLY; } /* TRY to get the alternate (backup) MDB. */ sect = part_start + part_size - 2; bh = sb_bread512(sb, sect, mdb2); if (bh) { if (mdb2->drSigWord == cpu_to_be16(HFS_SUPER_MAGIC)) { HFS_SB(sb)->alt_mdb_bh = bh; HFS_SB(sb)->alt_mdb = mdb2; } else brelse(bh); } if (!HFS_SB(sb)->alt_mdb) { pr_warn("unable to locate alternate MDB\n"); pr_warn("continuing without an alternate MDB\n"); } HFS_SB(sb)->bitmap = kzalloc(8192, GFP_KERNEL); if (!HFS_SB(sb)->bitmap) return -EIO; /* read in the bitmap */ block = be16_to_cpu(mdb->drVBMSt) + part_start; off = (loff_t)block << HFS_SECTOR_SIZE_BITS; size = (HFS_SB(sb)->fs_ablocks + 8) / 8; ptr = (u8 *)HFS_SB(sb)->bitmap; while (size) { bh = sb_bread(sb, off >> sb->s_blocksize_bits); if (!bh) { pr_err("unable to read volume bitmap\n"); return -EIO; } off2 = off & (sb->s_blocksize - 1); len = min((int)sb->s_blocksize - off2, size); memcpy(ptr, bh->b_data + off2, len); brelse(bh); ptr += len; off += len; size -= len; } HFS_SB(sb)->ext_tree = hfs_btree_open(sb, HFS_EXT_CNID, hfs_ext_keycmp); if (!HFS_SB(sb)->ext_tree) { pr_err("unable to open extent tree\n"); return -EIO; } HFS_SB(sb)->cat_tree = hfs_btree_open(sb, HFS_CAT_CNID, hfs_cat_keycmp); if (!HFS_SB(sb)->cat_tree) { pr_err("unable to open catalog tree\n"); return -EIO; } attrib = mdb->drAtrb; if (!(attrib & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. Mounting read-only.\n"); sb->s_flags |= SB_RDONLY; } if ((attrib & cpu_to_be16(HFS_SB_ATTRIB_SLOCK))) { pr_warn("filesystem is marked locked, mounting read-only.\n"); sb->s_flags |= SB_RDONLY; } if (!sb_rdonly(sb)) { /* Mark the volume uncleanly unmounted in case we crash */ attrib &= cpu_to_be16(~HFS_SB_ATTRIB_UNMNT); attrib |= cpu_to_be16(HFS_SB_ATTRIB_INCNSTNT); mdb->drAtrb = attrib; be32_add_cpu(&mdb->drWrCnt, 1); mdb->drLsMod = hfs_mtime(); mark_buffer_dirty(HFS_SB(sb)->mdb_bh); sync_dirty_buffer(HFS_SB(sb)->mdb_bh); } return 0; } /* * hfs_mdb_commit() * * Description: * This updates the MDB on disk. * It does not check, if the superblock has been modified, or * if the filesystem has been mounted read-only. It is mainly * called by hfs_sync_fs() and flush_mdb(). * Input Variable(s): * struct hfs_mdb *mdb: Pointer to the hfs MDB * int backup; * Output Variable(s): * NONE * Returns: * void * Preconditions: * 'mdb' points to a "valid" (struct hfs_mdb). * Postconditions: * The HFS MDB and on disk will be updated, by copying the possibly * modified fields from the in memory MDB (in native byte order) to * the disk block buffer. * If 'backup' is non-zero then the alternate MDB is also written * and the function doesn't return until it is actually on disk. 
*/ void hfs_mdb_commit(struct super_block *sb) { struct hfs_mdb *mdb = HFS_SB(sb)->mdb; if (sb_rdonly(sb)) return; lock_buffer(HFS_SB(sb)->mdb_bh); if (test_and_clear_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags)) { /* These parameters may have been modified, so write them back */ mdb->drLsMod = hfs_mtime(); mdb->drFreeBks = cpu_to_be16(HFS_SB(sb)->free_ablocks); mdb->drNxtCNID = cpu_to_be32((u32)atomic64_read(&HFS_SB(sb)->next_id)); mdb->drNmFls = cpu_to_be16(HFS_SB(sb)->root_files); mdb->drNmRtDirs = cpu_to_be16(HFS_SB(sb)->root_dirs); mdb->drFilCnt = cpu_to_be32((u32)atomic64_read(&HFS_SB(sb)->file_count)); mdb->drDirCnt = cpu_to_be32((u32)atomic64_read(&HFS_SB(sb)->folder_count)); /* write MDB to disk */ mark_buffer_dirty(HFS_SB(sb)->mdb_bh); } /* write the backup MDB, not returning until it is written. * we only do this when either the catalog or extents overflow * files grow. */ if (test_and_clear_bit(HFS_FLG_ALT_MDB_DIRTY, &HFS_SB(sb)->flags) && HFS_SB(sb)->alt_mdb) { hfs_inode_write_fork(HFS_SB(sb)->ext_tree->inode, mdb->drXTExtRec, &mdb->drXTFlSize, NULL); hfs_inode_write_fork(HFS_SB(sb)->cat_tree->inode, mdb->drCTExtRec, &mdb->drCTFlSize, NULL); lock_buffer(HFS_SB(sb)->alt_mdb_bh); memcpy(HFS_SB(sb)->alt_mdb, HFS_SB(sb)->mdb, HFS_SECTOR_SIZE); HFS_SB(sb)->alt_mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT); HFS_SB(sb)->alt_mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT); unlock_buffer(HFS_SB(sb)->alt_mdb_bh); mark_buffer_dirty(HFS_SB(sb)->alt_mdb_bh); sync_dirty_buffer(HFS_SB(sb)->alt_mdb_bh); } if (test_and_clear_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags)) { struct buffer_head *bh; sector_t block; char *ptr; int off, size, len; block = be16_to_cpu(HFS_SB(sb)->mdb->drVBMSt) + HFS_SB(sb)->part_start; off = (block << HFS_SECTOR_SIZE_BITS) & (sb->s_blocksize - 1); block >>= sb->s_blocksize_bits - HFS_SECTOR_SIZE_BITS; size = (HFS_SB(sb)->fs_ablocks + 7) / 8; ptr = (u8 *)HFS_SB(sb)->bitmap; while (size) { bh = sb_bread(sb, block); if (!bh) { pr_err("unable to read volume bitmap\n"); break; } len = min((int)sb->s_blocksize - off, size); lock_buffer(bh); memcpy(bh->b_data + off, ptr, len); unlock_buffer(bh); mark_buffer_dirty(bh); brelse(bh); block++; off = 0; ptr += len; size -= len; } } unlock_buffer(HFS_SB(sb)->mdb_bh); } void hfs_mdb_close(struct super_block *sb) { /* update volume attributes */ if (sb_rdonly(sb)) return; HFS_SB(sb)->mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT); HFS_SB(sb)->mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT); mark_buffer_dirty(HFS_SB(sb)->mdb_bh); } /* * hfs_mdb_put() * * Release the resources associated with the in-core MDB. */ void hfs_mdb_put(struct super_block *sb) { /* free the B-trees */ hfs_btree_close(HFS_SB(sb)->ext_tree); hfs_btree_close(HFS_SB(sb)->cat_tree); /* free the buffers holding the primary and alternate MDBs */ brelse(HFS_SB(sb)->mdb_bh); brelse(HFS_SB(sb)->alt_mdb_bh); unload_nls(HFS_SB(sb)->nls_io); unload_nls(HFS_SB(sb)->nls_disk); kfree(HFS_SB(sb)->bitmap); } |
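/*
 * Editor's worked example (not part of the original file, numbers are made
 * up): the block-size selection loops in hfs_mdb_get() above, traced for
 * drAlBlkSiz = 6144 bytes with the allocation area starting at sector 3
 * relative to the partition and PAGE_SIZE = 4096.
 */
static unsigned int example_hfs_pick_blocksize(void)
{
	unsigned int alloc_blksz = 6144;	/* drAlBlkSiz */
	unsigned int sect = 3;			/* drAlBlSt + part_start */
	unsigned int size = 4096;		/* min(alloc_blksz, PAGE_SIZE) */

	/* already a power of two, so the first loop does not run */
	while (size & (size - 1))
		size -= 512;
	/* align to the first sector: 4096 -> 2048 -> 1024 -> 512 */
	while (sect & ((size - 1) >> 9))
		size >>= 1;
	/* 6144 is a multiple of 512, so the last loop leaves size alone */
	while (alloc_blksz & (size - 1))
		size >>= 1;
	return size;	/* 512: the value passed to sb_set_blocksize() */
}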
// SPDX-License-Identifier: GPL-2.0+
/*
 * NILFS direct block pointer.
 *
 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
 *
 * Written by Koji Sato.
 */

#include <linux/errno.h>
#include "nilfs.h"
#include "page.h"
#include "direct.h"
#include "alloc.h"
#include "dat.h"

static inline __le64 *nilfs_direct_dptrs(const struct nilfs_bmap *direct)
{
	return (__le64 *)
		((struct nilfs_direct_node *)direct->b_u.u_data + 1);
}

static inline __u64
nilfs_direct_get_ptr(const struct nilfs_bmap *direct, __u64 key)
{
	return le64_to_cpu(*(nilfs_direct_dptrs(direct) + key));
}

static inline void nilfs_direct_set_ptr(struct nilfs_bmap *direct,
					__u64 key, __u64 ptr)
{
	*(nilfs_direct_dptrs(direct) + key) = cpu_to_le64(ptr);
}

static int nilfs_direct_lookup(const struct nilfs_bmap *direct,
			       __u64 key, int level, __u64 *ptrp)
{
	__u64 ptr;

	if (key > NILFS_DIRECT_KEY_MAX || level != 1)
		return -ENOENT;
	ptr = nilfs_direct_get_ptr(direct, key);
	if (ptr == NILFS_BMAP_INVALID_PTR)
		return -ENOENT;

	*ptrp = ptr;
	return 0;
}

static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
				      __u64 key, __u64 *ptrp,
				      unsigned int maxblocks)
{
	struct inode *dat = NULL;
	__u64 ptr, ptr2;
	sector_t blocknr;
	int ret, cnt;

	if (key > NILFS_DIRECT_KEY_MAX)
		return -ENOENT;
	ptr = nilfs_direct_get_ptr(direct, key);
	if (ptr == NILFS_BMAP_INVALID_PTR)
		return -ENOENT;

	if (NILFS_BMAP_USE_VBN(direct)) {
		dat = nilfs_bmap_get_dat(direct);
		ret = nilfs_dat_translate(dat, ptr, &blocknr);
		if (ret < 0)
			goto dat_error;
		ptr = blocknr;
	}

	maxblocks = min_t(unsigned int, maxblocks,
			  NILFS_DIRECT_KEY_MAX - key + 1);
	for (cnt = 1; cnt < maxblocks &&
		     (ptr2 = nilfs_direct_get_ptr(direct, key + cnt)) !=
		     NILFS_BMAP_INVALID_PTR;
	     cnt++) {
		if (dat) {
			ret = nilfs_dat_translate(dat, ptr2, &blocknr);
			if (ret < 0)
				goto dat_error;
			ptr2 = blocknr;
		}
		if (ptr2 != ptr + cnt)
			break;
	}
	*ptrp = ptr;
	return cnt;

 dat_error:
	if (ret == -ENOENT)
		ret = -EINVAL; /* Notify bmap layer of metadata corruption */
	return ret;
}

static __u64
nilfs_direct_find_target_v(const struct nilfs_bmap *direct, __u64 key)
{
	__u64 ptr;

	ptr =
nilfs_bmap_find_target_seq(direct, key); if (ptr != NILFS_BMAP_INVALID_PTR) /* sequential access */ return ptr; /* block group */ return nilfs_bmap_find_target_in_group(direct); } static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) { union nilfs_bmap_ptr_req req; struct inode *dat = NULL; struct buffer_head *bh; int ret; if (key > NILFS_DIRECT_KEY_MAX) return -ENOENT; if (nilfs_direct_get_ptr(bmap, key) != NILFS_BMAP_INVALID_PTR) return -EEXIST; if (NILFS_BMAP_USE_VBN(bmap)) { req.bpr_ptr = nilfs_direct_find_target_v(bmap, key); dat = nilfs_bmap_get_dat(bmap); } ret = nilfs_bmap_prepare_alloc_ptr(bmap, &req, dat); if (!ret) { /* ptr must be a pointer to a buffer head. */ bh = (struct buffer_head *)((unsigned long)ptr); set_buffer_nilfs_volatile(bh); nilfs_bmap_commit_alloc_ptr(bmap, &req, dat); nilfs_direct_set_ptr(bmap, key, req.bpr_ptr); if (!nilfs_bmap_dirty(bmap)) nilfs_bmap_set_dirty(bmap); if (NILFS_BMAP_USE_VBN(bmap)) nilfs_bmap_set_target_v(bmap, key, req.bpr_ptr); nilfs_inode_add_blocks(bmap->b_inode, 1); } return ret; } static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key) { union nilfs_bmap_ptr_req req; struct inode *dat; int ret; if (key > NILFS_DIRECT_KEY_MAX || nilfs_direct_get_ptr(bmap, key) == NILFS_BMAP_INVALID_PTR) return -ENOENT; dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL; req.bpr_ptr = nilfs_direct_get_ptr(bmap, key); ret = nilfs_bmap_prepare_end_ptr(bmap, &req, dat); if (!ret) { nilfs_bmap_commit_end_ptr(bmap, &req, dat); nilfs_direct_set_ptr(bmap, key, NILFS_BMAP_INVALID_PTR); nilfs_inode_sub_blocks(bmap->b_inode, 1); } return ret; } static int nilfs_direct_seek_key(const struct nilfs_bmap *direct, __u64 start, __u64 *keyp) { __u64 key; for (key = start; key <= NILFS_DIRECT_KEY_MAX; key++) { if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR) { *keyp = key; return 0; } } return -ENOENT; } static int nilfs_direct_last_key(const struct nilfs_bmap *direct, __u64 *keyp) { __u64 key, lastkey; lastkey = NILFS_DIRECT_KEY_MAX + 1; for (key = NILFS_DIRECT_KEY_MIN; key <= NILFS_DIRECT_KEY_MAX; key++) if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR) lastkey = key; if (lastkey == NILFS_DIRECT_KEY_MAX + 1) return -ENOENT; *keyp = lastkey; return 0; } static int nilfs_direct_check_insert(const struct nilfs_bmap *bmap, __u64 key) { return key > NILFS_DIRECT_KEY_MAX; } static int nilfs_direct_gather_data(struct nilfs_bmap *direct, __u64 *keys, __u64 *ptrs, int nitems) { __u64 key; __u64 ptr; int n; if (nitems > NILFS_DIRECT_NBLOCKS) nitems = NILFS_DIRECT_NBLOCKS; n = 0; for (key = 0; key < nitems; key++) { ptr = nilfs_direct_get_ptr(direct, key); if (ptr != NILFS_BMAP_INVALID_PTR) { keys[n] = key; ptrs[n] = ptr; n++; } } return n; } int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap, __u64 key, __u64 *keys, __u64 *ptrs, int n) { __le64 *dptrs; int ret, i, j; /* no need to allocate any resource for conversion */ /* delete */ ret = bmap->b_ops->bop_delete(bmap, key); if (ret < 0) return ret; /* free resources */ if (bmap->b_ops->bop_clear != NULL) bmap->b_ops->bop_clear(bmap); /* convert */ dptrs = nilfs_direct_dptrs(bmap); for (i = 0, j = 0; i < NILFS_DIRECT_NBLOCKS; i++) { if ((j < n) && (i == keys[j])) { dptrs[i] = (i != key) ? 
cpu_to_le64(ptrs[j]) : NILFS_BMAP_INVALID_PTR; j++; } else dptrs[i] = NILFS_BMAP_INVALID_PTR; } nilfs_direct_init(bmap); return 0; } static int nilfs_direct_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh) { struct nilfs_palloc_req oldreq, newreq; struct inode *dat; __u64 key; __u64 ptr; int ret; if (!NILFS_BMAP_USE_VBN(bmap)) return 0; dat = nilfs_bmap_get_dat(bmap); key = nilfs_bmap_data_get_key(bmap, bh); ptr = nilfs_direct_get_ptr(bmap, key); if (ptr == NILFS_BMAP_INVALID_PTR) return -EINVAL; if (!buffer_nilfs_volatile(bh)) { oldreq.pr_entry_nr = ptr; newreq.pr_entry_nr = ptr; ret = nilfs_dat_prepare_update(dat, &oldreq, &newreq); if (ret < 0) return ret; nilfs_dat_commit_update(dat, &oldreq, &newreq, bmap->b_ptr_type == NILFS_BMAP_PTR_VS); set_buffer_nilfs_volatile(bh); nilfs_direct_set_ptr(bmap, key, newreq.pr_entry_nr); } else ret = nilfs_dat_mark_dirty(dat, ptr); return ret; } static int nilfs_direct_assign_v(struct nilfs_bmap *direct, __u64 key, __u64 ptr, struct buffer_head **bh, sector_t blocknr, union nilfs_binfo *binfo) { struct inode *dat = nilfs_bmap_get_dat(direct); union nilfs_bmap_ptr_req req; int ret; req.bpr_ptr = ptr; ret = nilfs_dat_prepare_start(dat, &req.bpr_req); if (!ret) { nilfs_dat_commit_start(dat, &req.bpr_req, blocknr); binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr); binfo->bi_v.bi_blkoff = cpu_to_le64(key); } return ret; } static int nilfs_direct_assign_p(struct nilfs_bmap *direct, __u64 key, __u64 ptr, struct buffer_head **bh, sector_t blocknr, union nilfs_binfo *binfo) { nilfs_direct_set_ptr(direct, key, blocknr); binfo->bi_dat.bi_blkoff = cpu_to_le64(key); binfo->bi_dat.bi_level = 0; memset(binfo->bi_dat.bi_pad, 0, sizeof(binfo->bi_dat.bi_pad)); return 0; } static int nilfs_direct_assign(struct nilfs_bmap *bmap, struct buffer_head **bh, sector_t blocknr, union nilfs_binfo *binfo) { __u64 key; __u64 ptr; key = nilfs_bmap_data_get_key(bmap, *bh); if (unlikely(key > NILFS_DIRECT_KEY_MAX)) { nilfs_crit(bmap->b_inode->i_sb, "%s (ino=%llu): invalid key: %llu", __func__, bmap->b_inode->i_ino, (unsigned long long)key); return -EINVAL; } ptr = nilfs_direct_get_ptr(bmap, key); if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) { nilfs_crit(bmap->b_inode->i_sb, "%s (ino=%llu): invalid pointer: %llu", __func__, bmap->b_inode->i_ino, (unsigned long long)ptr); return -EINVAL; } return NILFS_BMAP_USE_VBN(bmap) ? nilfs_direct_assign_v(bmap, key, ptr, bh, blocknr, binfo) : nilfs_direct_assign_p(bmap, key, ptr, bh, blocknr, binfo); } static const struct nilfs_bmap_operations nilfs_direct_ops = { .bop_lookup = nilfs_direct_lookup, .bop_lookup_contig = nilfs_direct_lookup_contig, .bop_insert = nilfs_direct_insert, .bop_delete = nilfs_direct_delete, .bop_clear = NULL, .bop_propagate = nilfs_direct_propagate, .bop_lookup_dirty_buffers = NULL, .bop_assign = nilfs_direct_assign, .bop_mark = NULL, .bop_seek_key = nilfs_direct_seek_key, .bop_last_key = nilfs_direct_last_key, .bop_check_insert = nilfs_direct_check_insert, .bop_check_delete = NULL, .bop_gather_data = nilfs_direct_gather_data, }; int nilfs_direct_init(struct nilfs_bmap *bmap) { bmap->b_ops = &nilfs_direct_ops; return 0; } |
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_FS_NOTIFY_H
#define _LINUX_FS_NOTIFY_H

/*
 * include/linux/fsnotify.h - generic hooks for filesystem notification, to
 * reduce in-source duplication from both dnotify and inotify.
 *
 * We don't compile any of this away in some complicated menagerie of ifdefs.
 * Instead, we rely on the code inside to optimize away as needed.
 *
 * (C) Copyright 2005 Robert Love
 */

#include <linux/fsnotify_backend.h>
#include <linux/audit.h>
#include <linux/slab.h>
#include <linux/bug.h>

/* Are there any inode/mount/sb objects watched with priority prio or above? */
static inline bool fsnotify_sb_has_priority_watchers(struct super_block *sb,
						     int prio)
{
	struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);

	/* Were any marks ever added to any object on this sb? */
	if (!sbinfo)
		return false;

	return atomic_long_read(&sbinfo->watched_objects[prio]);
}

/* Are there any inode/mount/sb objects that are being watched at all? */
static inline bool fsnotify_sb_has_watchers(struct super_block *sb)
{
	return fsnotify_sb_has_priority_watchers(sb, 0);
}

/*
 * Notify this @dir inode about a change in a child directory entry.
 * The directory entry may have turned positive or negative or its inode may
 * have changed (i.e. renamed over).
 *
 * Unlike fsnotify_parent(), the event will be reported regardless of the
 * FS_EVENT_ON_CHILD mask on the parent inode and will not be reported if only
 * the child is interested and not the parent.
*/ static inline int fsnotify_name(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, u32 cookie) { if (!fsnotify_sb_has_watchers(dir->i_sb)) return 0; return fsnotify(mask, data, data_type, dir, name, NULL, cookie); } static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry, __u32 mask) { fsnotify_name(mask, dentry, FSNOTIFY_EVENT_DENTRY, dir, &dentry->d_name, 0); } static inline void fsnotify_inode(struct inode *inode, __u32 mask) { if (!fsnotify_sb_has_watchers(inode->i_sb)) return; if (S_ISDIR(inode->i_mode)) mask |= FS_ISDIR; fsnotify(mask, inode, FSNOTIFY_EVENT_INODE, NULL, NULL, inode, 0); } /* Notify this dentry's parent about a child's events. */ static inline int fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type) { struct inode *inode = d_inode(dentry); if (!fsnotify_sb_has_watchers(inode->i_sb)) return 0; if (S_ISDIR(inode->i_mode)) { mask |= FS_ISDIR; /* sb/mount marks are not interested in name of directory */ if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) goto notify_child; } /* disconnected dentry cannot notify parent */ if (IS_ROOT(dentry)) goto notify_child; return __fsnotify_parent(dentry, mask, data, data_type); notify_child: return fsnotify(mask, data, data_type, NULL, NULL, inode, 0); } /* * Simple wrappers to consolidate calls to fsnotify_parent() when an event * is on a file/dentry. */ static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask) { fsnotify_parent(dentry, mask, dentry, FSNOTIFY_EVENT_DENTRY); } static inline int fsnotify_path(const struct path *path, __u32 mask) { return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); } static inline int fsnotify_file(struct file *file, __u32 mask) { /* * FMODE_NONOTIFY are fds generated by fanotify itself which should not * generate new events. We also don't want to generate events for * FMODE_PATH fds (involves open & close events) as they are just * handle creation / destruction events and not "real" file events. */ if (FMODE_FSNOTIFY_NONE(file->f_mode)) return 0; return fsnotify_path(&file->f_path, mask); } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS int fsnotify_open_perm_and_set_mode(struct file *file); /* * fsnotify_file_area_perm - permission hook before access to file range */ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, const loff_t *ppos, size_t count) { /* * filesystem may be modified in the context of permission events * (e.g. by HSM filling a file on access), so sb freeze protection * must not be held. */ lockdep_assert_once(file_write_not_started(file)); if (!(perm_mask & (MAY_READ | MAY_WRITE | MAY_ACCESS))) return 0; /* * read()/write() and other types of access generate pre-content events. */ if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) { int ret = fsnotify_pre_content(&file->f_path, ppos, count); if (ret) return ret; } if (!(perm_mask & MAY_READ) || likely(!FMODE_FSNOTIFY_ACCESS_PERM(file->f_mode))) return 0; /* * read() also generates the legacy FS_ACCESS_PERM event, so content * scanners can inspect the content filled by pre-content event. */ return fsnotify_path(&file->f_path, FS_ACCESS_PERM); } /* * fsnotify_mmap_perm - permission hook before mmap of file range */ static inline int fsnotify_mmap_perm(struct file *file, int prot, const loff_t off, size_t len) { /* * mmap() generates only pre-content events. 
*/ if (!file || likely(!FMODE_FSNOTIFY_HSM(file->f_mode))) return 0; return fsnotify_pre_content(&file->f_path, &off, len); } /* * fsnotify_truncate_perm - permission hook before file truncate */ static inline int fsnotify_truncate_perm(const struct path *path, loff_t length) { struct inode *inode = d_inode(path->dentry); if (!(inode->i_sb->s_iflags & SB_I_ALLOW_HSM) || !fsnotify_sb_has_priority_watchers(inode->i_sb, FSNOTIFY_PRIO_PRE_CONTENT)) return 0; return fsnotify_pre_content(path, &length, 0); } /* * fsnotify_file_perm - permission hook before file access (unknown range) */ static inline int fsnotify_file_perm(struct file *file, int perm_mask) { return fsnotify_file_area_perm(file, perm_mask, NULL, 0); } #else static inline int fsnotify_open_perm_and_set_mode(struct file *file) { return 0; } static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, const loff_t *ppos, size_t count) { return 0; } static inline int fsnotify_mmap_perm(struct file *file, int prot, const loff_t off, size_t len) { return 0; } static inline int fsnotify_truncate_perm(const struct path *path, loff_t length) { return 0; } static inline int fsnotify_file_perm(struct file *file, int perm_mask) { return 0; } #endif /* * fsnotify_link_count - inode's link count changed */ static inline void fsnotify_link_count(struct inode *inode) { fsnotify_inode(inode, FS_ATTRIB); } /* * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir */ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, const struct qstr *old_name, int isdir, struct inode *target, struct dentry *moved) { struct inode *source = moved->d_inode; u32 fs_cookie = fsnotify_get_cookie(); __u32 old_dir_mask = FS_MOVED_FROM; __u32 new_dir_mask = FS_MOVED_TO; __u32 rename_mask = FS_RENAME; const struct qstr *new_name = &moved->d_name; if (isdir) { old_dir_mask |= FS_ISDIR; new_dir_mask |= FS_ISDIR; rename_mask |= FS_ISDIR; } /* Event with information about both old and new parent+name */ fsnotify_name(rename_mask, moved, FSNOTIFY_EVENT_DENTRY, old_dir, old_name, 0); fsnotify_name(old_dir_mask, source, FSNOTIFY_EVENT_INODE, old_dir, old_name, fs_cookie); fsnotify_name(new_dir_mask, source, FSNOTIFY_EVENT_INODE, new_dir, new_name, fs_cookie); if (target) fsnotify_link_count(target); fsnotify_inode(source, FS_MOVE_SELF); audit_inode_child(new_dir, moved, AUDIT_TYPE_CHILD_CREATE); } /* * fsnotify_inode_delete - and inode is being evicted from cache, clean up is needed */ static inline void fsnotify_inode_delete(struct inode *inode) { __fsnotify_inode_delete(inode); } /* * fsnotify_vfsmount_delete - a vfsmount is being destroyed, clean up is needed */ static inline void fsnotify_vfsmount_delete(struct vfsmount *mnt) { __fsnotify_vfsmount_delete(mnt); } static inline void fsnotify_mntns_delete(struct mnt_namespace *mntns) { __fsnotify_mntns_delete(mntns); } /* * fsnotify_inoderemove - an inode is going away */ static inline void fsnotify_inoderemove(struct inode *inode) { fsnotify_inode(inode, FS_DELETE_SELF); __fsnotify_inode_delete(inode); } /* * fsnotify_create - 'name' was linked in * * Caller must make sure that dentry->d_name is stable. * Note: some filesystems (e.g. 
kernfs) leave @dentry negative and instantiate * ->d_inode later */ static inline void fsnotify_create(struct inode *dir, struct dentry *dentry) { audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE); fsnotify_dirent(dir, dentry, FS_CREATE); } /* * fsnotify_link - new hardlink in 'inode' directory * * Caller must make sure that new_dentry->d_name is stable. * Note: We have to pass also the linked inode ptr as some filesystems leave * new_dentry->d_inode NULL and instantiate inode pointer later */ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct dentry *new_dentry) { fsnotify_link_count(inode); audit_inode_child(dir, new_dentry, AUDIT_TYPE_CHILD_CREATE); fsnotify_name(FS_CREATE, inode, FSNOTIFY_EVENT_INODE, dir, &new_dentry->d_name, 0); } /* * fsnotify_delete - @dentry was unlinked and unhashed * * Caller must make sure that dentry->d_name is stable. * * Note: unlike fsnotify_unlink(), we have to pass also the unlinked inode * as this may be called after d_delete() and old_dentry may be negative. */ static inline void fsnotify_delete(struct inode *dir, struct inode *inode, struct dentry *dentry) { __u32 mask = FS_DELETE; if (S_ISDIR(inode->i_mode)) mask |= FS_ISDIR; fsnotify_name(mask, inode, FSNOTIFY_EVENT_INODE, dir, &dentry->d_name, 0); } /** * d_delete_notify - delete a dentry and call fsnotify_delete() * @dentry: The dentry to delete * * This helper is used to guaranty that the unlinked inode cannot be found * by lookup of this name after fsnotify_delete() event has been delivered. */ static inline void d_delete_notify(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); ihold(inode); d_delete(dentry); fsnotify_delete(dir, inode, dentry); iput(inode); } /* * fsnotify_unlink - 'name' was unlinked * * Caller must make sure that dentry->d_name is stable. */ static inline void fsnotify_unlink(struct inode *dir, struct dentry *dentry) { if (WARN_ON_ONCE(d_is_negative(dentry))) return; fsnotify_delete(dir, d_inode(dentry), dentry); } /* * fsnotify_mkdir - directory 'name' was created * * Caller must make sure that dentry->d_name is stable. * Note: some filesystems (e.g. kernfs) leave @dentry negative and instantiate * ->d_inode later */ static inline void fsnotify_mkdir(struct inode *dir, struct dentry *dentry) { audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE); fsnotify_dirent(dir, dentry, FS_CREATE | FS_ISDIR); } /* * fsnotify_rmdir - directory 'name' was removed * * Caller must make sure that dentry->d_name is stable. */ static inline void fsnotify_rmdir(struct inode *dir, struct dentry *dentry) { if (WARN_ON_ONCE(d_is_negative(dentry))) return; fsnotify_delete(dir, d_inode(dentry), dentry); } /* * fsnotify_access - file was read */ static inline void fsnotify_access(struct file *file) { fsnotify_file(file, FS_ACCESS); } /* * fsnotify_modify - file was modified */ static inline void fsnotify_modify(struct file *file) { fsnotify_file(file, FS_MODIFY); } /* * fsnotify_open - file was opened */ static inline void fsnotify_open(struct file *file) { __u32 mask = FS_OPEN; if (file->f_flags & __FMODE_EXEC) mask |= FS_OPEN_EXEC; fsnotify_file(file, mask); } /* * fsnotify_close - file was closed */ static inline void fsnotify_close(struct file *file) { __u32 mask = (file->f_mode & FMODE_WRITE) ? 
FS_CLOSE_WRITE : FS_CLOSE_NOWRITE; fsnotify_file(file, mask); } /* * fsnotify_xattr - extended attributes were changed */ static inline void fsnotify_xattr(struct dentry *dentry) { fsnotify_dentry(dentry, FS_ATTRIB); } /* * fsnotify_change - notify_change event. file was modified and/or metadata * was changed. */ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid) { __u32 mask = 0; if (ia_valid & ATTR_UID) mask |= FS_ATTRIB; if (ia_valid & ATTR_GID) mask |= FS_ATTRIB; if (ia_valid & ATTR_SIZE) mask |= FS_MODIFY; /* both times implies a utime(s) call */ if ((ia_valid & (ATTR_ATIME | ATTR_MTIME)) == (ATTR_ATIME | ATTR_MTIME)) mask |= FS_ATTRIB; else if (ia_valid & ATTR_ATIME) mask |= FS_ACCESS; else if (ia_valid & ATTR_MTIME) mask |= FS_MODIFY; if (ia_valid & ATTR_MODE) mask |= FS_ATTRIB; if (mask) fsnotify_dentry(dentry, mask); } static inline void fsnotify_mnt_attach(struct mnt_namespace *ns, struct vfsmount *mnt) { fsnotify_mnt(FS_MNT_ATTACH, ns, mnt); } static inline void fsnotify_mnt_detach(struct mnt_namespace *ns, struct vfsmount *mnt) { fsnotify_mnt(FS_MNT_DETACH, ns, mnt); } static inline void fsnotify_mnt_move(struct mnt_namespace *ns, struct vfsmount *mnt) { fsnotify_mnt(FS_MNT_MOVE, ns, mnt); } #endif /* _LINUX_FS_NOTIFY_H */ |
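/*
 * Editor's illustrative sketch (not part of the original header): how a
 * vfs_write()-style caller pairs the permission hook with the async
 * notification, using only the helpers declared above.  Assumes <linux/fs.h>
 * for MAY_WRITE and struct file; example_do_write() is hypothetical.
 */
static ssize_t example_notifying_write(struct file *file,
				       const char __user *buf,
				       size_t count, loff_t *pos)
{
	ssize_t ret;

	/* may generate pre-content (HSM) permission events before the write */
	ret = fsnotify_file_area_perm(file, MAY_WRITE, pos, count);
	if (ret)
		return ret;

	ret = example_do_write(file, buf, count, pos);	/* hypothetical */
	if (ret > 0)
		fsnotify_modify(file);	/* async FS_MODIFY to watchers */
	return ret;
}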
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Global definitions for the Ethernet IEEE 802.3 interface.
 *
 * Version:	@(#)if_ether.h	1.0.1a	02/08/94
 *
 * Author:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *		Steve Whitehouse, <gw7rrm@eeshack3.swan.ac.uk>
 */
#ifndef _LINUX_IF_ETHER_H
#define _LINUX_IF_ETHER_H

#include <linux/skbuff.h>
#include <uapi/linux/if_ether.h>

/* XX:XX:XX:XX:XX:XX */
#define MAC_ADDR_STR_LEN	(3 * ETH_ALEN - 1)

static inline struct ethhdr *eth_hdr(const struct sk_buff *skb)
{
	return (struct ethhdr *)skb_mac_header(skb);
}

/* Prefer this version in TX path, instead of
 * skb_reset_mac_header() + eth_hdr()
 */
static inline struct ethhdr *skb_eth_hdr(const struct sk_buff *skb)
{
	return (struct ethhdr *)skb->data;
}

static inline struct ethhdr *inner_eth_hdr(const struct sk_buff *skb)
{
	return (struct ethhdr *)skb_inner_mac_header(skb);
}

int eth_header_parse(const struct sk_buff *skb, const struct net_device *dev,
		     unsigned char *haddr);

extern ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len);

#endif	/* _LINUX_IF_ETHER_H */
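/*
 * Editor's illustrative sketch (not part of the original header): formatting
 * a received frame's source MAC with the helpers above.  Assumes the caller
 * has already set the skb's mac header, as eth_hdr() requires, and that the
 * usual printk/byteorder headers are available.
 */
static void example_log_src_mac(const struct sk_buff *skb)
{
	const struct ethhdr *eth = eth_hdr(skb);
	char addr[MAC_ADDR_STR_LEN + 1];	/* "xx:xx:xx:xx:xx:xx" + NUL */

	snprintf(addr, sizeof(addr), "%pM", eth->h_source);
	pr_info("frame from %s, proto 0x%04x\n", addr, ntohs(eth->h_proto));
}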
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/ns_common.h>
#include <linux/fs_pin.h>

extern struct file_system_type nullfs_fs_type;
extern struct list_head notify_list;

struct mnt_namespace {
	struct ns_common	ns;
	struct mount		*root;
	struct {
		struct rb_root	mounts;		 /* Protected by namespace_sem */
		struct rb_node	*mnt_last_node;	 /* last (rightmost) mount in the rbtree */
		struct rb_node	*mnt_first_node; /* first (leftmost) mount in the rbtree */
	};
	struct user_namespace	*user_ns;
	struct ucounts		*ucounts;
	wait_queue_head_t	poll;
	u64			seq_origin; /* Sequence number of origin mount namespace */
	u64			event;
#ifdef CONFIG_FSNOTIFY
	__u32			n_fsnotify_mask;
	struct fsnotify_mark_connector __rcu *n_fsnotify_marks;
#endif
	unsigned int		nr_mounts; /* # of mounts in the namespace */
	unsigned int		pending_mounts;
	refcount_t		passive; /* number references not pinning @mounts */
	bool			is_anon;
} __randomize_layout;

struct mnt_pcp {
	int mnt_count;
	int mnt_writers;
};

struct mountpoint {
	struct hlist_node m_hash;
	struct dentry *m_dentry;
	struct hlist_head m_list;
};

struct mount {
	struct hlist_node mnt_hash;
	struct mount *mnt_parent;
	struct dentry *mnt_mountpoint;
	struct vfsmount mnt;
	union {
		struct rb_node mnt_node; /* node in the ns->mounts rbtree */
		struct rcu_head mnt_rcu;
		struct llist_node mnt_llist;
	};
#ifdef CONFIG_SMP
	struct mnt_pcp __percpu *mnt_pcp;
#else
	int mnt_count;
	int mnt_writers;
#endif
	struct list_head mnt_mounts;	/* list of children, anchored here */
	struct list_head mnt_child;	/* and going through their mnt_child */
	struct mount *mnt_next_for_sb;	/* the next two fields are hlist_node, */
	struct mount * __aligned(1) *mnt_pprev_for_sb; /* except that LSB of pprev is stolen */
#define WRITE_HOLD 1			/* ... for use by mnt_hold_writers() */
	const char *mnt_devname;	/* Name of device e.g.
/dev/dsk/hda1 */ struct list_head mnt_list; struct list_head mnt_expire; /* link in fs-specific expiry list */ struct list_head mnt_share; /* circular list of shared mounts */ struct hlist_head mnt_slave_list;/* list of slave mounts */ struct hlist_node mnt_slave; /* slave list entry */ struct mount *mnt_master; /* slave is on master->mnt_slave_list */ struct mnt_namespace *mnt_ns; /* containing namespace */ struct mountpoint *mnt_mp; /* where is it mounted */ union { struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ struct hlist_node mnt_umount; }; #ifdef CONFIG_FSNOTIFY struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; struct list_head to_notify; /* need to queue notification */ struct mnt_namespace *prev_ns; /* previous namespace (NULL if none) */ #endif int mnt_t_flags; /* namespace_sem-protected flags */ int mnt_id; /* mount identifier, reused */ u64 mnt_id_unique; /* mount ID unique until reboot */ int mnt_group_id; /* peer group identifier */ int mnt_expiry_mark; /* true if marked for expiry */ struct hlist_head mnt_pins; struct hlist_head mnt_stuck_children; struct mount *overmount; /* mounted on ->mnt_root */ } __randomize_layout; enum { T_SHARED = 1, /* mount is shared */ T_UNBINDABLE = 2, /* mount is unbindable */ T_MARKED = 4, /* internal mark for propagate_... */ T_UMOUNT_CANDIDATE = 8, /* for propagate_umount */ /* * T_SHARED_MASK is the set of flags that should be cleared when a * mount becomes shared. Currently, this is only the flag that says a * mount cannot be bind mounted, since this is how we create a mount * that shares events with another mount. If you add a new T_* * flag, consider how it interacts with shared mounts. */ T_SHARED_MASK = T_UNBINDABLE, }; #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */ static inline struct mount *real_mount(struct vfsmount *mnt) { return container_of(mnt, struct mount, mnt); } static inline int mnt_has_parent(const struct mount *mnt) { return mnt != mnt->mnt_parent; } static inline int is_mounted(struct vfsmount *mnt) { /* neither detached nor internal? 
*/ return !IS_ERR_OR_NULL(real_mount(mnt)->mnt_ns); } extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); extern int __legitimize_mnt(struct vfsmount *, unsigned); static inline bool __path_is_mountpoint(const struct path *path) { struct mount *m = __lookup_mnt(path->mnt, path->dentry); return m && likely(!(m->mnt.mnt_flags & MNT_SYNC_UMOUNT)); } extern void __detach_mounts(struct dentry *dentry); static inline void detach_mounts(struct dentry *dentry) { if (!d_mountpoint(dentry)) return; __detach_mounts(dentry); } static inline void get_mnt_ns(struct mnt_namespace *ns) { ns_ref_inc(ns); } extern seqlock_t mount_lock; DEFINE_LOCK_GUARD_0(mount_writer, write_seqlock(&mount_lock), write_sequnlock(&mount_lock)) DEFINE_LOCK_GUARD_0(mount_locked_reader, read_seqlock_excl(&mount_lock), read_sequnlock_excl(&mount_lock)) struct proc_mounts { struct mnt_namespace *ns; struct path root; int (*show)(struct seq_file *, struct vfsmount *); }; extern const struct seq_operations mounts_op; extern bool __is_local_mountpoint(const struct dentry *dentry); static inline bool is_local_mountpoint(const struct dentry *dentry) { if (!d_mountpoint(dentry)) return false; return __is_local_mountpoint(dentry); } static inline bool is_anon_ns(struct mnt_namespace *ns) { return ns->is_anon; } static inline bool anon_ns_root(const struct mount *m) { struct mnt_namespace *ns = READ_ONCE(m->mnt_ns); return !IS_ERR_OR_NULL(ns) && is_anon_ns(ns) && m == ns->root; } static inline bool mnt_ns_attached(const struct mount *mnt) { return !RB_EMPTY_NODE(&mnt->mnt_node); } static inline bool mnt_ns_empty(const struct mnt_namespace *ns) { return RB_EMPTY_ROOT(&ns->mounts); } static inline void move_from_ns(struct mount *mnt) { struct mnt_namespace *ns = mnt->mnt_ns; WARN_ON(!mnt_ns_attached(mnt)); if (ns->mnt_last_node == &mnt->mnt_node) ns->mnt_last_node = rb_prev(&mnt->mnt_node); if (ns->mnt_first_node == &mnt->mnt_node) ns->mnt_first_node = rb_next(&mnt->mnt_node); rb_erase(&mnt->mnt_node, &ns->mounts); RB_CLEAR_NODE(&mnt->mnt_node); } bool has_locked_children(struct mount *mnt, struct dentry *dentry); struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mnt_ns, bool previous); static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns) { return container_of(ns, struct mnt_namespace, ns); } #ifdef CONFIG_FSNOTIFY static inline void mnt_notify_add(struct mount *m) { /* Optimize the case where there are no watches */ if ((m->mnt_ns && m->mnt_ns->n_fsnotify_marks) || (m->prev_ns && m->prev_ns->n_fsnotify_marks)) list_add_tail(&m->to_notify, ¬ify_list); else m->prev_ns = m->mnt_ns; } #else static inline void mnt_notify_add(struct mount *m) { } #endif static inline struct mount *topmost_overmount(struct mount *m) { while (m->overmount) m = m->overmount; return m; } static inline bool __test_write_hold(struct mount * __aligned(1) *val) { return (unsigned long)val & WRITE_HOLD; } static inline bool test_write_hold(const struct mount *m) { return __test_write_hold(m->mnt_pprev_for_sb); } static inline void set_write_hold(struct mount *m) { m->mnt_pprev_for_sb = (void *)((unsigned long)m->mnt_pprev_for_sb | WRITE_HOLD); } static inline void clear_write_hold(struct mount *m) { m->mnt_pprev_for_sb = (void *)((unsigned long)m->mnt_pprev_for_sb & ~WRITE_HOLD); } struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry); |
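/*
 * Editor's illustrative sketch (not part of the original header): walking
 * from a vfsmount up to the root of its mount tree with the accessors above.
 * Real callers hold the appropriate locks (e.g. mount_lock); locking is
 * omitted here.
 */
static inline struct mount *example_mount_tree_root(struct vfsmount *v)
{
	struct mount *m = real_mount(v);

	while (mnt_has_parent(m))
		m = m->mnt_parent;
	return m;	/* a root mount is its own parent */
}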
// SPDX-License-Identifier: GPL-2.0
/*
 * fs/sysfs/file.c - sysfs regular (text) file implementation
 *
 * Copyright (c) 2001-3 Patrick Mochel
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
 *
 * Please see Documentation/filesystems/sysfs.rst for more information.
*/ #include <linux/module.h> #include <linux/kobject.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/seq_file.h> #include <linux/mm.h> #include "sysfs.h" static struct kobject *sysfs_file_kobj(struct kernfs_node *kn) { guard(rcu)(); return rcu_dereference(kn->__parent)->priv; } /* * Determine ktype->sysfs_ops for the given kernfs_node. This function * must be called while holding an active reference. */ static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn) { struct kobject *kobj = sysfs_file_kobj(kn); if (kn->flags & KERNFS_LOCKDEP) lockdep_assert_held(kn); return kobj->ktype ? kobj->ktype->sysfs_ops : NULL; } /* * Reads on sysfs are handled through seq_file, which takes care of hairy * details like buffering and seeking. The following function pipes * sysfs_ops->show() result through seq_file. */ static int sysfs_kf_seq_show(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; struct kobject *kobj = sysfs_file_kobj(of->kn); const struct sysfs_ops *ops = sysfs_file_ops(of->kn); ssize_t count; char *buf; if (WARN_ON_ONCE(!ops->show)) return -EINVAL; /* acquire buffer and ensure that it's >= PAGE_SIZE and clear */ count = seq_get_buf(sf, &buf); if (count < PAGE_SIZE) { seq_commit(sf, -1); return 0; } memset(buf, 0, PAGE_SIZE); count = ops->show(kobj, of->kn->priv, buf); if (count < 0) return count; /* * The code works fine with PAGE_SIZE return but it's likely to * indicate truncated result or overflow in normal use cases. */ if (count >= (ssize_t)PAGE_SIZE) { printk("fill_read_buffer: %pS returned bad count\n", ops->show); /* Try to struggle along */ count = PAGE_SIZE - 1; } seq_commit(sf, count); return 0; } static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) { const struct bin_attribute *battr = of->kn->priv; struct kobject *kobj = sysfs_file_kobj(of->kn); loff_t size = file_inode(of->file)->i_size; if (!count) return 0; if (size) { if (pos >= size) return 0; if (pos + count > size) count = size - pos; } if (!battr->read) return -EIO; return battr->read(of->file, kobj, battr, buf, pos, count); } /* kernfs read callback for regular sysfs files with pre-alloc */ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) { const struct sysfs_ops *ops = sysfs_file_ops(of->kn); struct kobject *kobj = sysfs_file_kobj(of->kn); ssize_t len; /* * If buf != of->prealloc_buf, we don't know how * large it is, so cannot safely pass it to ->show */ if (WARN_ON_ONCE(buf != of->prealloc_buf)) return 0; len = ops->show(kobj, of->kn->priv, buf); if (len < 0) return len; if (pos) { if (len <= pos) return 0; len -= pos; memmove(buf, buf + pos, len); } return min_t(ssize_t, count, len); } /* kernfs write callback for regular sysfs files */ static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) { const struct sysfs_ops *ops = sysfs_file_ops(of->kn); struct kobject *kobj = sysfs_file_kobj(of->kn); if (!count) return 0; return ops->store(kobj, of->kn->priv, buf, count); } /* kernfs write callback for bin sysfs files */ static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) { const struct bin_attribute *battr = of->kn->priv; struct kobject *kobj = sysfs_file_kobj(of->kn); loff_t size = file_inode(of->file)->i_size; if (size) { if (size <= pos) return -EFBIG; count = min_t(ssize_t, count, size - pos); } if (!count) return 0; if (!battr->write) return -EIO; return 
battr->write(of->file, kobj, battr, buf, pos, count); } static int sysfs_kf_bin_mmap(struct kernfs_open_file *of, struct vm_area_struct *vma) { const struct bin_attribute *battr = of->kn->priv; struct kobject *kobj = sysfs_file_kobj(of->kn); return battr->mmap(of->file, kobj, battr, vma); } static loff_t sysfs_kf_bin_llseek(struct kernfs_open_file *of, loff_t offset, int whence) { const struct bin_attribute *battr = of->kn->priv; struct kobject *kobj = sysfs_file_kobj(of->kn); if (battr->llseek) return battr->llseek(of->file, kobj, battr, offset, whence); else return generic_file_llseek(of->file, offset, whence); } static int sysfs_kf_bin_open(struct kernfs_open_file *of) { const struct bin_attribute *battr = of->kn->priv; if (battr->f_mapping) of->file->f_mapping = battr->f_mapping(); return 0; } void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr) { struct kernfs_node *kn = kobj->sd, *tmp; if (kn && dir) kn = kernfs_find_and_get(kn, dir); else kernfs_get(kn); if (kn && attr) { tmp = kernfs_find_and_get(kn, attr); kernfs_put(kn); kn = tmp; } if (kn) { kernfs_notify(kn); kernfs_put(kn); } } EXPORT_SYMBOL_GPL(sysfs_notify); static const struct kernfs_ops sysfs_file_kfops_empty = { }; static const struct kernfs_ops sysfs_file_kfops_ro = { .seq_show = sysfs_kf_seq_show, }; static const struct kernfs_ops sysfs_file_kfops_wo = { .write = sysfs_kf_write, }; static const struct kernfs_ops sysfs_file_kfops_rw = { .seq_show = sysfs_kf_seq_show, .write = sysfs_kf_write, }; static const struct kernfs_ops sysfs_prealloc_kfops_ro = { .read = sysfs_kf_read, .prealloc = true, }; static const struct kernfs_ops sysfs_prealloc_kfops_wo = { .write = sysfs_kf_write, .prealloc = true, }; static const struct kernfs_ops sysfs_prealloc_kfops_rw = { .read = sysfs_kf_read, .write = sysfs_kf_write, .prealloc = true, }; static const struct kernfs_ops sysfs_bin_kfops_ro = { .read = sysfs_kf_bin_read, }; static const struct kernfs_ops sysfs_bin_kfops_wo = { .write = sysfs_kf_bin_write, }; static const struct kernfs_ops sysfs_bin_kfops_rw = { .read = sysfs_kf_bin_read, .write = sysfs_kf_bin_write, }; static const struct kernfs_ops sysfs_bin_kfops_mmap = { .read = sysfs_kf_bin_read, .write = sysfs_kf_bin_write, .mmap = sysfs_kf_bin_mmap, .open = sysfs_kf_bin_open, .llseek = sysfs_kf_bin_llseek, }; int sysfs_add_file_mode_ns(struct kernfs_node *parent, const struct attribute *attr, umode_t mode, kuid_t uid, kgid_t gid, const struct ns_common *ns) { struct kobject *kobj = parent->priv; const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops; struct lock_class_key *key = NULL; const struct kernfs_ops *ops = NULL; struct kernfs_node *kn; /* every kobject with an attribute needs a ktype assigned */ if (WARN(!sysfs_ops, KERN_ERR "missing sysfs attribute operations for kobject: %s\n", kobject_name(kobj))) return -EINVAL; if (mode & SYSFS_PREALLOC) { if (sysfs_ops->show && sysfs_ops->store) ops = &sysfs_prealloc_kfops_rw; else if (sysfs_ops->show) ops = &sysfs_prealloc_kfops_ro; else if (sysfs_ops->store) ops = &sysfs_prealloc_kfops_wo; } else { if (sysfs_ops->show && sysfs_ops->store) ops = &sysfs_file_kfops_rw; else if (sysfs_ops->show) ops = &sysfs_file_kfops_ro; else if (sysfs_ops->store) ops = &sysfs_file_kfops_wo; } if (!ops) ops = &sysfs_file_kfops_empty; #ifdef CONFIG_DEBUG_LOCK_ALLOC if (!attr->ignore_lockdep) key = attr->key ?: (struct lock_class_key *)&attr->skey; #endif kn = __kernfs_create_file(parent, attr->name, mode & 0777, uid, gid, PAGE_SIZE, ops, (void *)attr, ns, key); if (IS_ERR(kn)) 
{ if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, attr->name); return PTR_ERR(kn); } return 0; } int sysfs_add_bin_file_mode_ns(struct kernfs_node *parent, const struct bin_attribute *battr, umode_t mode, size_t size, kuid_t uid, kgid_t gid, const struct ns_common *ns) { const struct attribute *attr = &battr->attr; struct lock_class_key *key = NULL; const struct kernfs_ops *ops; struct kernfs_node *kn; if (battr->mmap) ops = &sysfs_bin_kfops_mmap; else if (battr->read && battr->write) ops = &sysfs_bin_kfops_rw; else if (battr->read) ops = &sysfs_bin_kfops_ro; else if (battr->write) ops = &sysfs_bin_kfops_wo; else ops = &sysfs_file_kfops_empty; #ifdef CONFIG_DEBUG_LOCK_ALLOC if (!attr->ignore_lockdep) key = attr->key ?: (struct lock_class_key *)&attr->skey; #endif kn = __kernfs_create_file(parent, attr->name, mode & 0777, uid, gid, size, ops, (void *)attr, ns, key); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, attr->name); return PTR_ERR(kn); } return 0; } /** * sysfs_create_file_ns - create an attribute file for an object with custom ns * @kobj: object we're creating for * @attr: attribute descriptor * @ns: namespace the new file should belong to */ int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, const struct ns_common *ns) { kuid_t uid; kgid_t gid; if (WARN_ON(!kobj || !kobj->sd || !attr)) return -EINVAL; kobject_get_ownership(kobj, &uid, &gid); return sysfs_add_file_mode_ns(kobj->sd, attr, attr->mode, uid, gid, ns); } EXPORT_SYMBOL_GPL(sysfs_create_file_ns); int sysfs_create_files(struct kobject *kobj, const struct attribute * const *ptr) { int err = 0; int i; for (i = 0; ptr[i] && !err; i++) err = sysfs_create_file(kobj, ptr[i]); if (err) while (--i >= 0) sysfs_remove_file(kobj, ptr[i]); return err; } EXPORT_SYMBOL_GPL(sysfs_create_files); /** * sysfs_add_file_to_group - add an attribute file to a pre-existing group. * @kobj: object we're acting for. * @attr: attribute descriptor. * @group: group name. */ int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group) { struct kernfs_node *parent; kuid_t uid; kgid_t gid; int error; if (group) { parent = kernfs_find_and_get(kobj->sd, group); } else { parent = kobj->sd; kernfs_get(parent); } if (!parent) return -ENOENT; kobject_get_ownership(kobj, &uid, &gid); error = sysfs_add_file_mode_ns(parent, attr, attr->mode, uid, gid, NULL); kernfs_put(parent); return error; } EXPORT_SYMBOL_GPL(sysfs_add_file_to_group); /** * sysfs_chmod_file - update the modified mode value on an object attribute. * @kobj: object we're acting for. * @attr: attribute descriptor. * @mode: file permissions. * */ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode) { struct kernfs_node *kn; struct iattr newattrs; int rc; kn = kernfs_find_and_get(kobj->sd, attr->name); if (!kn) return -ENOENT; newattrs.ia_mode = (mode & S_IALLUGO) | (kn->mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE; rc = kernfs_setattr(kn, &newattrs); kernfs_put(kn); return rc; } EXPORT_SYMBOL_GPL(sysfs_chmod_file); /** * sysfs_break_active_protection - break "active" protection * @kobj: The kernel object @attr is associated with. * @attr: The attribute to break the "active" protection for. * * With sysfs, just like kernfs, deletion of an attribute is postponed until * all active .show() and .store() callbacks have finished unless this function * is called. Hence this function is useful in methods that implement self * deletion. 
*/ struct kernfs_node *sysfs_break_active_protection(struct kobject *kobj, const struct attribute *attr) { struct kernfs_node *kn; kobject_get(kobj); kn = kernfs_find_and_get(kobj->sd, attr->name); if (kn) kernfs_break_active_protection(kn); else kobject_put(kobj); return kn; } EXPORT_SYMBOL_GPL(sysfs_break_active_protection); /** * sysfs_unbreak_active_protection - restore "active" protection * @kn: Pointer returned by sysfs_break_active_protection(). * * Undo the effects of sysfs_break_active_protection(). Since this function * calls kernfs_put() on the kernfs node that corresponds to the 'attr' * argument passed to sysfs_break_active_protection() that attribute may have * been removed between the sysfs_break_active_protection() and * sysfs_unbreak_active_protection() calls, it is not safe to access @kn after * this function has returned. */ void sysfs_unbreak_active_protection(struct kernfs_node *kn) { struct kobject *kobj = sysfs_file_kobj(kn); kernfs_unbreak_active_protection(kn); kernfs_put(kn); kobject_put(kobj); } EXPORT_SYMBOL_GPL(sysfs_unbreak_active_protection); /** * sysfs_remove_file_ns - remove an object attribute with a custom ns tag * @kobj: object we're acting for * @attr: attribute descriptor * @ns: namespace tag of the file to remove * * Hash the attribute name and namespace tag and kill the victim. */ void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const struct ns_common *ns) { struct kernfs_node *parent = kobj->sd; kernfs_remove_by_name_ns(parent, attr->name, ns); } EXPORT_SYMBOL_GPL(sysfs_remove_file_ns); /** * sysfs_remove_file_self - remove an object attribute from its own method * @kobj: object we're acting for * @attr: attribute descriptor * * See kernfs_remove_self() for details. */ bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr) { struct kernfs_node *parent = kobj->sd; struct kernfs_node *kn; bool ret; kn = kernfs_find_and_get(parent, attr->name); if (WARN_ON_ONCE(!kn)) return false; ret = kernfs_remove_self(kn); kernfs_put(kn); return ret; } EXPORT_SYMBOL_GPL(sysfs_remove_file_self); void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *ptr) { int i; for (i = 0; ptr[i]; i++) sysfs_remove_file(kobj, ptr[i]); } EXPORT_SYMBOL_GPL(sysfs_remove_files); /** * sysfs_remove_file_from_group - remove an attribute file from a group. * @kobj: object we're acting for. * @attr: attribute descriptor. * @group: group name. */ void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group) { struct kernfs_node *parent; if (group) { parent = kernfs_find_and_get(kobj->sd, group); } else { parent = kobj->sd; kernfs_get(parent); } if (parent) { kernfs_remove_by_name(parent, attr->name); kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group); /** * sysfs_create_bin_file - create binary file for object. * @kobj: object. * @attr: attribute descriptor. */ int sysfs_create_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { kuid_t uid; kgid_t gid; if (WARN_ON(!kobj || !kobj->sd || !attr)) return -EINVAL; kobject_get_ownership(kobj, &uid, &gid); return sysfs_add_bin_file_mode_ns(kobj->sd, attr, attr->attr.mode, attr->size, uid, gid, NULL); } EXPORT_SYMBOL_GPL(sysfs_create_bin_file); /** * sysfs_remove_bin_file - remove binary file for object. * @kobj: object. * @attr: attribute descriptor. 
*/ void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { kernfs_remove_by_name(kobj->sd, attr->attr.name); } EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); static int internal_change_owner(struct kernfs_node *kn, kuid_t kuid, kgid_t kgid) { struct iattr newattrs = { .ia_valid = ATTR_UID | ATTR_GID, .ia_uid = kuid, .ia_gid = kgid, }; return kernfs_setattr(kn, &newattrs); } /** * sysfs_link_change_owner - change owner of a sysfs file. * @kobj: object of the kernfs_node the symlink is located in. * @targ: object of the kernfs_node the symlink points to. * @name: name of the link. * @kuid: new owner's kuid * @kgid: new owner's kgid * * This function looks up the sysfs symlink entry @name under @kobj and changes * the ownership to @kuid/@kgid. The symlink is looked up in the namespace of * @targ. * * Returns 0 on success or error code on failure. */ int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid) { struct kernfs_node *kn = NULL; int error; if (!name || !kobj->state_in_sysfs || !targ->state_in_sysfs) return -EINVAL; error = -ENOENT; kn = kernfs_find_and_get_ns(kobj->sd, name, targ->sd->ns); if (!kn) goto out; error = -EINVAL; if (kernfs_type(kn) != KERNFS_LINK) goto out; if (kn->symlink.target_kn->priv != targ) goto out; error = internal_change_owner(kn, kuid, kgid); out: kernfs_put(kn); return error; } /** * sysfs_file_change_owner - change owner of a sysfs file. * @kobj: object. * @name: name of the file to change. * @kuid: new owner's kuid * @kgid: new owner's kgid * * This function looks up the sysfs entry @name under @kobj and changes the * ownership to @kuid/@kgid. * * Returns 0 on success or error code on failure. */ int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, kgid_t kgid) { struct kernfs_node *kn; int error; if (!name) return -EINVAL; if (!kobj->state_in_sysfs) return -EINVAL; kn = kernfs_find_and_get(kobj->sd, name); if (!kn) return -ENOENT; error = internal_change_owner(kn, kuid, kgid); kernfs_put(kn); return error; } /** * sysfs_change_owner - change owner of the given object. * @kobj: object. * @kuid: new owner's kuid * @kgid: new owner's kgid * * Change the owner of the default directory, files, groups, and attributes of * @kobj to @kuid/@kgid. Note that sysfs_change_owner mirrors how the sysfs * entries for a kobject are added by driver core. In summary, * sysfs_change_owner() takes care of the default directory entry for @kobj, * the default attributes associated with the ktype of @kobj and the default * attributes associated with the ktype of @kobj. * Additional properties not added by driver core have to be changed by the * driver or subsystem which created them. This is similar to how * driver/subsystem specific entries are removed. * * Returns 0 on success or error code on failure. */ int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid) { int error; const struct kobj_type *ktype; if (!kobj->state_in_sysfs) return -EINVAL; /* Change the owner of the kobject itself. */ error = internal_change_owner(kobj->sd, kuid, kgid); if (error) return error; ktype = get_ktype(kobj); if (ktype) { /* * Change owner of the default groups associated with the * ktype of @kobj. */ error = sysfs_groups_change_owner(kobj, ktype->default_groups, kuid, kgid); if (error) return error; } return 0; } /** * sysfs_emit - scnprintf equivalent, aware of PAGE_SIZE buffer. * @buf: start of PAGE_SIZE buffer. 
 * @fmt: format
 * @...: optional arguments to @fmt
 *
 * Returns number of characters written to @buf.
 */
int sysfs_emit(char *buf, const char *fmt, ...)
{
	va_list args;
	int len;

	if (WARN(!buf || offset_in_page(buf),
		 "invalid sysfs_emit: buf:%p\n", buf))
		return 0;

	va_start(args, fmt);
	len = vscnprintf(buf, PAGE_SIZE, fmt, args);
	va_end(args);

	return len;
}
EXPORT_SYMBOL_GPL(sysfs_emit);

/**
 * sysfs_emit_at - scnprintf equivalent, aware of PAGE_SIZE buffer.
 * @buf: start of PAGE_SIZE buffer.
 * @at: offset in @buf to start write in bytes
 *	@at must be >= 0 && < PAGE_SIZE
 * @fmt: format
 * @...: optional arguments to @fmt
 *
 * Returns number of characters written starting at &@buf[@at].
 */
int sysfs_emit_at(char *buf, int at, const char *fmt, ...)
{
	va_list args;
	int len;

	if (WARN(!buf || offset_in_page(buf) || at < 0 || at >= PAGE_SIZE,
		 "invalid sysfs_emit_at: buf:%p at:%d\n", buf, at))
		return 0;

	va_start(args, fmt);
	len = vscnprintf(buf + at, PAGE_SIZE - at, fmt, args);
	va_end(args);

	return len;
}
EXPORT_SYMBOL_GPL(sysfs_emit_at);

/**
 * sysfs_bin_attr_simple_read - read callback to simply copy from memory.
 * @file: attribute file which is being read.
 * @kobj: object to which the attribute belongs.
 * @attr: attribute descriptor.
 * @buf: destination buffer.
 * @off: offset in bytes from which to read.
 * @count: maximum number of bytes to read.
 *
 * Simple ->read() callback for bin_attributes backed by a buffer in memory.
 * The @private and @size members in struct bin_attribute must be set to the
 * buffer's location and size before the bin_attribute is created in sysfs.
 *
 * Bounds check for @off and @count is done in sysfs_kf_bin_read().
 * Negative value check for @off is done in vfs_setpos() and default_llseek().
 *
 * Returns number of bytes written to @buf.
 */
ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj,
				   const struct bin_attribute *attr, char *buf,
				   loff_t off, size_t count)
{
	memcpy(buf, attr->private + off, count);
	return count;
}
EXPORT_SYMBOL_GPL(sysfs_bin_attr_simple_read);
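
/*
 * Editor's illustrative sketch -- not part of fs/sysfs/file.c. It shows how
 * a typical kobj_attribute ends up in the show()/store() paths implemented
 * above: kobj_sysfs_ops forwards ->show()/->store() to the per-attribute
 * callbacks, the read side is limited to one page (hence sysfs_emit()), and
 * the file itself is created with sysfs_create_file(). The attribute name
 * "example" and its backing variable are hypothetical.
 */
static int example_value;

static ssize_t example_show(struct kobject *kobj, struct kobj_attribute *attr,
			    char *buf)
{
	/* ->show() writes into a single PAGE_SIZE buffer */
	return sysfs_emit(buf, "%d\n", example_value);
}

static ssize_t example_store(struct kobject *kobj, struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	int ret = kstrtoint(buf, 10, &example_value);

	/* returning @count tells the caller the whole write was consumed */
	return ret ? ret : count;
}

static struct kobj_attribute example_attr = __ATTR_RW(example);

/* typically called after the kobject has been added to sysfs */
static int example_add_file(struct kobject *kobj)
{
	return sysfs_create_file(kobj, &example_attr.attr);
}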
// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/kernel/panic.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 */

/*
 * This function is used throughout the kernel (including mm and fs)
 * to indicate a major problem.
 */
#include <linux/debug_locks.h>
#include <linux/sched/debug.h>
#include <linux/interrupt.h>
#include <linux/kgdb.h>
#include <linux/kmsg_dump.h>
#include <linux/kallsyms.h>
#include <linux/notifier.h>
#include <linux/vt_kern.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/ftrace.h>
#include <linux/reboot.h>
#include <linux/delay.h>
#include <linux/kexec.h>
#include <linux/panic_notifier.h>
#include <linux/sched.h>
#include <linux/string_helpers.h>
#include <linux/sysrq.h>
#include <linux/init.h>
#include <linux/nmi.h>
#include <linux/console.h>
#include <linux/bug.h>
#include <linux/ratelimit.h>
#include <linux/debugfs.h>
#include <linux/sysfs.h>
#include <linux/context_tracking.h>
#include <linux/seq_buf.h>
#include <linux/sys_info.h>
#include <trace/events/error_report.h>
#include <asm/sections.h>

#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
#define PANIC_MSG_BUFSZ 1024

#ifdef CONFIG_SMP
/*
 * Should we dump all CPUs backtraces in an oops event?
 * Defaults to 0, can be changed via sysctl.
 */
static unsigned int __read_mostly sysctl_oops_all_cpu_backtrace;
#else
#define sysctl_oops_all_cpu_backtrace 0
#endif /* CONFIG_SMP */

int panic_on_oops = IS_ENABLED(CONFIG_PANIC_ON_OOPS);
static unsigned long tainted_mask = IS_ENABLED(CONFIG_RANDSTRUCT) ?
(1 << TAINT_RANDSTRUCT) : 0; static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); bool crash_kexec_post_notifiers; int panic_on_warn __read_mostly; unsigned long panic_on_taint; bool panic_on_taint_nousertaint = false; static unsigned int warn_limit __read_mostly; static bool panic_console_replay; bool panic_triggering_all_cpu_backtrace; static bool panic_this_cpu_backtrace_printed; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); unsigned long panic_print; static int panic_force_cpu = -1; ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); static void panic_print_deprecated(void) { pr_info_once("Kernel: The 'panic_print' parameter is now deprecated. Please use 'panic_sys_info' and 'panic_console_replay' instead.\n"); } #ifdef CONFIG_SYSCTL /* * Taint values can only be increased * This means we can safely use a temporary. */ static int proc_taint(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; unsigned long tmptaint = get_taint(); int err; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; t = *table; t.data = &tmptaint; err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); if (err < 0) return err; if (write) { int i; /* * If we are relying on panic_on_taint not producing * false positives due to userspace input, bail out * before setting the requested taint flags. */ if (panic_on_taint_nousertaint && (tmptaint & panic_on_taint)) return -EINVAL; /* * Poor man's atomic or. Not worth adding a primitive * to everyone's atomic.h for this */ for (i = 0; i < TAINT_FLAGS_COUNT; i++) if ((1UL << i) & tmptaint) add_taint(i, LOCKDEP_STILL_OK); } return err; } static int sysctl_panic_print_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (write) panic_print_deprecated(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } static const struct ctl_table kern_panic_table[] = { #ifdef CONFIG_SMP { .procname = "oops_all_cpu_backtrace", .data = &sysctl_oops_all_cpu_backtrace, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif { .procname = "tainted", .maxlen = sizeof(long), .mode = 0644, .proc_handler = proc_taint, }, { .procname = "panic", .data = &panic_timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "panic_on_oops", .data = &panic_on_oops, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "panic_print", .data = &panic_print, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = sysctl_panic_print_handler, }, { .procname = "panic_on_warn", .data = &panic_on_warn, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "warn_limit", .data = &warn_limit, .maxlen = sizeof(warn_limit), .mode = 0644, .proc_handler = proc_douintvec, }, #if (defined(CONFIG_X86_32) || defined(CONFIG_PARISC)) && \ defined(CONFIG_DEBUG_STACKOVERFLOW) { .procname = "panic_on_stackoverflow", .data = &sysctl_panic_on_stackoverflow, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, #endif { .procname = "panic_sys_info", .data = &panic_print, .maxlen = sizeof(panic_print), .mode = 0644, .proc_handler = sysctl_sys_info_handler, }, }; static __init int kernel_panic_sysctls_init(void) { register_sysctl_init("kernel", kern_panic_table); return 
0; } late_initcall(kernel_panic_sysctls_init); #endif /* The format is "panic_sys_info=tasks,mem,locks,ftrace,..." */ static int __init setup_panic_sys_info(char *buf) { /* There is no risk of race in kernel boot phase */ panic_print = sys_info_parse_param(buf); return 1; } __setup("panic_sys_info=", setup_panic_sys_info); static atomic_t warn_count = ATOMIC_INIT(0); #ifdef CONFIG_SYSFS static ssize_t warn_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { return sysfs_emit(page, "%d\n", atomic_read(&warn_count)); } static struct kobj_attribute warn_count_attr = __ATTR_RO(warn_count); static __init int kernel_panic_sysfs_init(void) { sysfs_add_file_to_group(kernel_kobj, &warn_count_attr.attr, NULL); return 0; } late_initcall(kernel_panic_sysfs_init); #endif static long no_blink(int state) { return 0; } /* Returns how long it waited in ms */ long (*panic_blink)(int state); EXPORT_SYMBOL(panic_blink); /* * Stop ourself in panic -- architecture code may override this */ void __weak __noreturn panic_smp_self_stop(void) { while (1) cpu_relax(); } /* * Stop ourselves in NMI context if another CPU has already panicked. Arch code * may override this to prepare for crash dumping, e.g. save regs info. */ void __weak __noreturn nmi_panic_self_stop(struct pt_regs *regs) { panic_smp_self_stop(); } /* * Stop other CPUs in panic. Architecture dependent code may override this * with more suitable version. For example, if the architecture supports * crash dump, it should save registers of each stopped CPU and disable * per-CPU features such as virtualization extensions. */ void __weak crash_smp_send_stop(void) { static int cpus_stopped; /* * This function can be called twice in panic path, but obviously * we execute this only once. */ if (cpus_stopped) return; /* * Note smp_send_stop is the usual smp shutdown function, which * unfortunately means it may not be hardened to work in a panic * situation. */ smp_send_stop(); cpus_stopped = 1; } atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); atomic_t panic_redirect_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); #if defined(CONFIG_SMP) && defined(CONFIG_CRASH_DUMP) static char *panic_force_buf; static int __init panic_force_cpu_setup(char *str) { int cpu; if (!str) return -EINVAL; if (kstrtoint(str, 0, &cpu) || cpu < 0 || cpu >= nr_cpu_ids) { pr_warn("panic_force_cpu: invalid value '%s'\n", str); return -EINVAL; } panic_force_cpu = cpu; return 0; } early_param("panic_force_cpu", panic_force_cpu_setup); static int __init panic_force_cpu_late_init(void) { if (panic_force_cpu < 0) return 0; panic_force_buf = kmalloc(PANIC_MSG_BUFSZ, GFP_KERNEL); return 0; } late_initcall(panic_force_cpu_late_init); static void do_panic_on_target_cpu(void *info) { panic("%s", (char *)info); } /** * panic_smp_redirect_cpu - Redirect panic to target CPU * @target_cpu: CPU that should handle the panic * @msg: formatted panic message * * Default implementation uses IPI. Architectures with NMI support * can override this for more reliable delivery. 
* * Return: 0 on success, negative errno on failure */ int __weak panic_smp_redirect_cpu(int target_cpu, void *msg) { static call_single_data_t panic_csd; panic_csd.func = do_panic_on_target_cpu; panic_csd.info = msg; return smp_call_function_single_async(target_cpu, &panic_csd); } /** * panic_try_force_cpu - Redirect panic to a specific CPU for crash kernel * @fmt: panic message format string * @args: arguments for format string * * Some platforms require panic handling to occur on a specific CPU * for the crash kernel to function correctly. This function redirects * panic handling to the CPU specified via the panic_force_cpu= boot parameter. * * Returns false if panic should proceed on current CPU. * Returns true if panic was redirected. */ __printf(1, 0) static bool panic_try_force_cpu(const char *fmt, va_list args) { int this_cpu = raw_smp_processor_id(); int old_cpu = PANIC_CPU_INVALID; const char *msg; /* Feature not enabled via boot parameter */ if (panic_force_cpu < 0) return false; /* Already on target CPU - proceed normally */ if (this_cpu == panic_force_cpu) return false; /* Target CPU is offline, can't redirect */ if (!cpu_online(panic_force_cpu)) { pr_warn("panic: target CPU %d is offline, continuing on CPU %d\n", panic_force_cpu, this_cpu); return false; } /* Another panic already in progress */ if (panic_in_progress()) return false; /* * Only one CPU can do the redirect. Use atomic cmpxchg to ensure * we don't race with another CPU also trying to redirect. */ if (!atomic_try_cmpxchg(&panic_redirect_cpu, &old_cpu, this_cpu)) return false; /* * Use dynamically allocated buffer if available, otherwise * fall back to static message for early boot panics or allocation failure. */ if (panic_force_buf) { vsnprintf(panic_force_buf, PANIC_MSG_BUFSZ, fmt, args); msg = panic_force_buf; } else { msg = "Redirected panic (buffer unavailable)"; } console_verbose(); bust_spinlocks(1); pr_emerg("panic: Redirecting from CPU %d to CPU %d for crash kernel.\n", this_cpu, panic_force_cpu); /* Dump original CPU before redirecting */ if (!test_taint(TAINT_DIE) && oops_in_progress <= 1 && IS_ENABLED(CONFIG_DEBUG_BUGVERBOSE)) { dump_stack(); } if (panic_smp_redirect_cpu(panic_force_cpu, (void *)msg) != 0) { atomic_set(&panic_redirect_cpu, PANIC_CPU_INVALID); pr_warn("panic: failed to redirect to CPU %d, continuing on CPU %d\n", panic_force_cpu, this_cpu); return false; } /* IPI/NMI sent, this CPU should stop */ return true; } #else __printf(1, 0) static inline bool panic_try_force_cpu(const char *fmt, va_list args) { return false; } #endif /* CONFIG_SMP && CONFIG_CRASH_DUMP */ bool panic_try_start(void) { int old_cpu, this_cpu; /* * Only one CPU is allowed to execute the crash_kexec() code as with * panic(). Otherwise parallel calls of panic() and crash_kexec() * may stop each other. To exclude them, we use panic_cpu here too. */ old_cpu = PANIC_CPU_INVALID; this_cpu = raw_smp_processor_id(); return atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu); } EXPORT_SYMBOL(panic_try_start); void panic_reset(void) { atomic_set(&panic_cpu, PANIC_CPU_INVALID); } EXPORT_SYMBOL(panic_reset); bool panic_in_progress(void) { return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); } EXPORT_SYMBOL(panic_in_progress); /* Return true if a panic is in progress on the current CPU. */ bool panic_on_this_cpu(void) { /* * We can use raw_smp_processor_id() here because it is impossible for * the task to be migrated to the panic_cpu, or away from it. 
If * panic_cpu has already been set, and we're not currently executing on * that CPU, then we never will be. */ return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id()); } EXPORT_SYMBOL(panic_on_this_cpu); /* * Return true if a panic is in progress on a remote CPU. * * On true, the local CPU should immediately release any printing resources * that may be needed by the panic CPU. */ bool panic_on_other_cpu(void) { return (panic_in_progress() && !panic_on_this_cpu()); } EXPORT_SYMBOL(panic_on_other_cpu); /* * A variant of panic() called from NMI context. We return if we've already * panicked on this CPU. If another CPU already panicked, loop in * nmi_panic_self_stop() which can provide architecture dependent code such * as saving register state for crash dump. */ void nmi_panic(struct pt_regs *regs, const char *msg) { if (panic_try_start()) panic("%s", msg); else if (panic_on_other_cpu()) nmi_panic_self_stop(regs); } EXPORT_SYMBOL(nmi_panic); void check_panic_on_warn(const char *origin) { unsigned int limit; if (panic_on_warn) panic("%s: panic_on_warn set ...\n", origin); limit = READ_ONCE(warn_limit); if (atomic_inc_return(&warn_count) >= limit && limit) panic("%s: system warned too often (kernel.warn_limit is %d)", origin, limit); } static void panic_trigger_all_cpu_backtrace(void) { /* Temporary allow non-panic CPUs to write their backtraces. */ panic_triggering_all_cpu_backtrace = true; if (panic_this_cpu_backtrace_printed) trigger_allbutcpu_cpu_backtrace(raw_smp_processor_id()); else trigger_all_cpu_backtrace(); panic_triggering_all_cpu_backtrace = false; } /* * Helper that triggers the NMI backtrace (if set in panic_print) * and then performs the secondary CPUs shutdown - we cannot have * the NMI backtrace after the CPUs are off! */ static void panic_other_cpus_shutdown(bool crash_kexec) { if (panic_print & SYS_INFO_ALL_BT) panic_trigger_all_cpu_backtrace(); /* * Note that smp_send_stop() is the usual SMP shutdown function, * which unfortunately may not be hardened to work in a panic * situation. If we want to do crash dump after notifier calls * and kmsg_dump, we will need architecture dependent extra * bits in addition to stopping other CPUs, hence we rely on * crash_smp_send_stop() for that. */ if (!crash_kexec) smp_send_stop(); else crash_smp_send_stop(); } /** * vpanic - halt the system * @fmt: The text string to print * @args: Arguments for the format string * * Display a message, then perform cleanups. This function never returns. */ void vpanic(const char *fmt, va_list args) { static char buf[PANIC_MSG_BUFSZ]; long i, i_next = 0, len; int state = 0; bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; if (panic_on_warn) { /* * This thread may hit another WARN() in the panic path. * Resetting this prevents additional WARN() from panicking the * system on this thread. Other threads are blocked by the * panic_mutex in panic(). */ panic_on_warn = 0; } /* * Disable local interrupts. This will prevent panic_smp_self_stop * from deadlocking the first cpu that invokes the panic, since * there is nothing to prevent an interrupt handler (that runs * after setting panic_cpu) from invoking panic() again. */ local_irq_disable(); preempt_disable_notrace(); /* Redirect panic to target CPU if configured via panic_force_cpu=. */ if (panic_try_force_cpu(fmt, args)) { /* * Mark ourselves offline so panic_other_cpus_shutdown() won't wait * for us on architectures that check num_online_cpus(). 
*/ set_cpu_online(smp_processor_id(), false); panic_smp_self_stop(); } /* * It's possible to come here directly from a panic-assertion and * not have preempt disabled. Some functions called from here want * preempt to be disabled. No point enabling it later though... * * Only one CPU is allowed to execute the panic code from here. For * multiple parallel invocations of panic, all other CPUs either * stop themself or will wait until they are stopped by the 1st CPU * with smp_send_stop(). * * cmpxchg success means this is the 1st CPU which comes here, * so go ahead. * `old_cpu == this_cpu' means we came from nmi_panic() which sets * panic_cpu to this CPU. In this case, this is also the 1st CPU. */ /* atomic_try_cmpxchg updates old_cpu on failure */ if (panic_try_start()) { /* go ahead */ } else if (panic_on_other_cpu()) panic_smp_self_stop(); console_verbose(); bust_spinlocks(1); len = vscnprintf(buf, sizeof(buf), fmt, args); if (len && buf[len - 1] == '\n') buf[len - 1] = '\0'; pr_emerg("Kernel panic - not syncing: %s\n", buf); /* * Avoid nested stack-dumping if a panic occurs during oops processing */ if (atomic_read(&panic_redirect_cpu) != PANIC_CPU_INVALID && panic_force_cpu == raw_smp_processor_id()) { pr_emerg("panic: Redirected from CPU %d, skipping stack dump.\n", atomic_read(&panic_redirect_cpu)); } else if (test_taint(TAINT_DIE) || oops_in_progress > 1) { panic_this_cpu_backtrace_printed = true; } else if (IS_ENABLED(CONFIG_DEBUG_BUGVERBOSE)) { dump_stack(); panic_this_cpu_backtrace_printed = true; } /* * If kgdb is enabled, give it a chance to run before we stop all * the other CPUs or else we won't be able to debug processes left * running on them. */ kgdb_panic(buf); /* * If we have crashed and we have a crash kernel loaded let it handle * everything else. * If we want to run this after calling panic_notifiers, pass * the "crash_kexec_post_notifiers" option to the kernel. * * Bypass the panic_cpu check and call __crash_kexec directly. */ if (!_crash_kexec_post_notifiers) __crash_kexec(NULL); panic_other_cpus_shutdown(_crash_kexec_post_notifiers); printk_legacy_allow_panic_sync(); /* * Run any panic handlers, including those that might need to * add information to the kmsg dump output. */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); sys_info(panic_print); kmsg_dump_desc(KMSG_DUMP_PANIC, buf); /* * If you doubt kdump always works fine in any situation, * "crash_kexec_post_notifiers" offers you a chance to run * panic_notifiers and dumping kmsg before kdump. * Note: since some panic_notifiers can make crashed kernel * more unstable, it can increase risks of the kdump failure too. * * Bypass the panic_cpu check and call __crash_kexec directly. */ if (_crash_kexec_post_notifiers) __crash_kexec(NULL); console_unblank(); /* * We may have ended up stopping the CPU holding the lock (in * smp_send_stop()) while still having some valuable data in the console * buffer. Try to acquire the lock then release it regardless of the * result. The release will also print the buffers out. Locks debug * should be disabled to avoid reporting bad unlock balance when * panic() is not being callled from OOPS. */ debug_locks_off(); console_flush_on_panic(CONSOLE_FLUSH_PENDING); if ((panic_print & SYS_INFO_PANIC_CONSOLE_REPLAY) || panic_console_replay) console_flush_on_panic(CONSOLE_REPLAY_ALL); if (!panic_blink) panic_blink = no_blink; if (panic_timeout > 0) { /* * Delay timeout seconds before rebooting the machine. * We can't use the "normal" timers since we just panicked. 
*/ pr_emerg("Rebooting in %d seconds..\n", panic_timeout); for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) { touch_nmi_watchdog(); if (i >= i_next) { i += panic_blink(state ^= 1); i_next = i + 3600 / PANIC_BLINK_SPD; } mdelay(PANIC_TIMER_STEP); } } if (panic_timeout != 0) { /* * This will not be a clean reboot, with everything * shutting down. But if there is a chance of * rebooting the system it will be rebooted. */ if (panic_reboot_mode != REBOOT_UNDEFINED) reboot_mode = panic_reboot_mode; emergency_restart(); } #ifdef __sparc__ { extern int stop_a_enabled; /* Make sure the user can actually press Stop-A (L1-A) */ stop_a_enabled = 1; pr_emerg("Press Stop-A (L1-A) from sun keyboard or send break\n" "twice on console to return to the boot prom\n"); } #endif #if defined(CONFIG_S390) disabled_wait(); #endif pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); /* Do not scroll important messages printed above */ suppress_printk = 1; /* * The final messages may not have been printed if in a context that * defers printing (such as NMI) and irq_work is not available. * Explicitly flush the kernel log buffer one last time. */ console_flush_on_panic(CONSOLE_FLUSH_PENDING); nbcon_atomic_flush_unsafe(); local_irq_enable(); for (i = 0; ; i += PANIC_TIMER_STEP) { touch_softlockup_watchdog(); if (i >= i_next) { i += panic_blink(state ^= 1); i_next = i + 3600 / PANIC_BLINK_SPD; } mdelay(PANIC_TIMER_STEP); } } EXPORT_SYMBOL(vpanic); /* Identical to vpanic(), except it takes variadic arguments instead of va_list */ void panic(const char *fmt, ...) { va_list args; va_start(args, fmt); vpanic(fmt, args); va_end(args); } EXPORT_SYMBOL(panic); #define TAINT_FLAG(taint, _c_true, _c_false) \ [ TAINT_##taint ] = { \ .c_true = _c_true, .c_false = _c_false, \ .desc = #taint, \ } /* * NOTE: if you modify the taint_flags or TAINT_FLAGS_COUNT, * please also modify tools/debugging/kernel-chktaint and * Documentation/admin-guide/tainted-kernels.rst, including its * small shell script that prints the TAINT_FLAGS_COUNT bits of * /proc/sys/kernel/tainted. * * Also, update INIT_TAINT_BUF_MAX below. */ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { TAINT_FLAG(PROPRIETARY_MODULE, 'P', 'G'), TAINT_FLAG(FORCED_MODULE, 'F', ' '), TAINT_FLAG(CPU_OUT_OF_SPEC, 'S', ' '), TAINT_FLAG(FORCED_RMMOD, 'R', ' '), TAINT_FLAG(MACHINE_CHECK, 'M', ' '), TAINT_FLAG(BAD_PAGE, 'B', ' '), TAINT_FLAG(USER, 'U', ' '), TAINT_FLAG(DIE, 'D', ' '), TAINT_FLAG(OVERRIDDEN_ACPI_TABLE, 'A', ' '), TAINT_FLAG(WARN, 'W', ' '), TAINT_FLAG(CRAP, 'C', ' '), TAINT_FLAG(FIRMWARE_WORKAROUND, 'I', ' '), TAINT_FLAG(OOT_MODULE, 'O', ' '), TAINT_FLAG(UNSIGNED_MODULE, 'E', ' '), TAINT_FLAG(SOFTLOCKUP, 'L', ' '), TAINT_FLAG(LIVEPATCH, 'K', ' '), TAINT_FLAG(AUX, 'X', ' '), TAINT_FLAG(RANDSTRUCT, 'T', ' '), TAINT_FLAG(TEST, 'N', ' '), TAINT_FLAG(FWCTL, 'J', ' '), }; #undef TAINT_FLAG static void print_tainted_seq(struct seq_buf *s, bool verbose) { const char *sep = ""; int i; if (!tainted_mask) { seq_buf_puts(s, "Not tainted"); return; } seq_buf_printf(s, "Tainted: "); for (i = 0; i < TAINT_FLAGS_COUNT; i++) { const struct taint_flag *t = &taint_flags[i]; bool is_set = test_bit(i, &tainted_mask); char c = is_set ? t->c_true : t->c_false; if (verbose) { if (is_set) { seq_buf_printf(s, "%s[%c]=%s", sep, c, t->desc); sep = ", "; } } else { seq_buf_putc(s, c); } } } /* The initial buffer can accommodate all taint flags in verbose * mode, with some headroom. 
Once the allocator is available, the * exact size is allocated dynamically; the initial buffer remains * as a fallback if allocation fails. * * The verbose taint string currently requires up to 327 characters. */ #define INIT_TAINT_BUF_MAX 350 static char init_taint_buf[INIT_TAINT_BUF_MAX] __initdata; static char *taint_buf __refdata = init_taint_buf; static size_t taint_buf_size = INIT_TAINT_BUF_MAX; static __init int alloc_taint_buf(void) { int i; char *buf; size_t size = 0; size += sizeof("Tainted: ") - 1; for (i = 0; i < TAINT_FLAGS_COUNT; i++) { size += 2; /* For ", " */ size += 4; /* For "[%c]=" */ size += strlen(taint_flags[i].desc); } size += 1; /* For NULL terminator */ buf = kmalloc(size, GFP_KERNEL); if (!buf) { panic("Failed to allocate taint string buffer"); } taint_buf = buf; taint_buf_size = size; return 0; } postcore_initcall(alloc_taint_buf); static const char *_print_tainted(bool verbose) { struct seq_buf s; BUILD_BUG_ON(ARRAY_SIZE(taint_flags) != TAINT_FLAGS_COUNT); seq_buf_init(&s, taint_buf, taint_buf_size); print_tainted_seq(&s, verbose); return seq_buf_str(&s); } /** * print_tainted - return a string to represent the kernel taint state. * * For individual taint flag meanings, see Documentation/admin-guide/sysctl/kernel.rst * * The string is overwritten by the next call to print_tainted(), * but is always NULL terminated. */ const char *print_tainted(void) { return _print_tainted(false); } /** * print_tainted_verbose - A more verbose version of print_tainted() */ const char *print_tainted_verbose(void) { return _print_tainted(true); } int test_taint(unsigned flag) { return test_bit(flag, &tainted_mask); } EXPORT_SYMBOL(test_taint); unsigned long get_taint(void) { return tainted_mask; } /** * add_taint: add a taint flag if not already set. * @flag: one of the TAINT_* constants. * @lockdep_ok: whether lock debugging is still OK. * * If something bad has gone wrong, you'll want @lockdebug_ok = false, but for * some notewortht-but-not-corrupting cases, it can be set to true. */ void add_taint(unsigned flag, enum lockdep_ok lockdep_ok) { if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off()) pr_warn("Disabling lock debugging due to kernel taint\n"); set_bit(flag, &tainted_mask); if (tainted_mask & panic_on_taint) { panic_on_taint = 0; panic("panic_on_taint set ..."); } } EXPORT_SYMBOL(add_taint); static void spin_msec(int msecs) { int i; for (i = 0; i < msecs; i++) { touch_nmi_watchdog(); mdelay(1); } } /* * It just happens that oops_enter() and oops_exit() are identically * implemented... */ static void do_oops_enter_exit(void) { unsigned long flags; static int spin_counter; if (!pause_on_oops) return; spin_lock_irqsave(&pause_on_oops_lock, flags); if (pause_on_oops_flag == 0) { /* This CPU may now print the oops message */ pause_on_oops_flag = 1; } else { /* We need to stall this CPU */ if (!spin_counter) { /* This CPU gets to do the counting */ spin_counter = pause_on_oops; do { spin_unlock(&pause_on_oops_lock); spin_msec(MSEC_PER_SEC); spin_lock(&pause_on_oops_lock); } while (--spin_counter); pause_on_oops_flag = 0; } else { /* This CPU waits for a different one */ while (spin_counter) { spin_unlock(&pause_on_oops_lock); spin_msec(1); spin_lock(&pause_on_oops_lock); } } } spin_unlock_irqrestore(&pause_on_oops_lock, flags); } /* * Return true if the calling CPU is allowed to print oops-related info. * This is a bit racy.. 
*/ bool oops_may_print(void) { return pause_on_oops_flag == 0; } /* * Called when the architecture enters its oops handler, before it prints * anything. If this is the first CPU to oops, and it's oopsing the first * time then let it proceed. * * This is all enabled by the pause_on_oops kernel boot option. We do all * this to ensure that oopses don't scroll off the screen. It has the * side-effect of preventing later-oopsing CPUs from mucking up the display, * too. * * It turns out that the CPU which is allowed to print ends up pausing for * the right duration, whereas all the other CPUs pause for twice as long: * once in oops_enter(), once in oops_exit(). */ void oops_enter(void) { nbcon_cpu_emergency_enter(); tracing_off(); /* can't trust the integrity of the kernel anymore: */ debug_locks_off(); do_oops_enter_exit(); if (sysctl_oops_all_cpu_backtrace) trigger_all_cpu_backtrace(); } static void print_oops_end_marker(void) { pr_warn("---[ end trace %016llx ]---\n", 0ULL); } /* * Called when the architecture exits its oops handler, after printing * everything. */ void oops_exit(void) { do_oops_enter_exit(); print_oops_end_marker(); nbcon_cpu_emergency_exit(); kmsg_dump(KMSG_DUMP_OOPS); } struct warn_args { const char *fmt; va_list args; }; void __warn(const char *file, int line, void *caller, unsigned taint, struct pt_regs *regs, struct warn_args *args) { nbcon_cpu_emergency_enter(); disable_trace_on_warning(); if (file) { pr_warn("WARNING: %s:%d at %pS, CPU#%d: %s/%d\n", file, line, caller, raw_smp_processor_id(), current->comm, current->pid); } else { pr_warn("WARNING: at %pS, CPU#%d: %s/%d\n", caller, raw_smp_processor_id(), current->comm, current->pid); } #pragma GCC diagnostic push #ifndef __clang__ #pragma GCC diagnostic ignored "-Wsuggest-attribute=format" #endif if (args) vprintk(args->fmt, args->args); #pragma GCC diagnostic pop print_modules(); if (regs) show_regs(regs); check_panic_on_warn("kernel"); if (!regs) dump_stack(); print_irqtrace_events(current); print_oops_end_marker(); trace_error_report_end(ERROR_DETECTOR_WARN, (unsigned long)caller); /* Just a warning, don't kill lockdep. */ add_taint(taint, LOCKDEP_STILL_OK); nbcon_cpu_emergency_exit(); } #ifdef CONFIG_BUG #ifndef __WARN_FLAGS void warn_slowpath_fmt(const char *file, int line, unsigned taint, const char *fmt, ...) { bool rcu = warn_rcu_enter(); struct warn_args args; pr_warn(CUT_HERE); if (!fmt) { __warn(file, line, __builtin_return_address(0), taint, NULL, NULL); warn_rcu_exit(rcu); return; } args.fmt = fmt; va_start(args.args, fmt); __warn(file, line, __builtin_return_address(0), taint, NULL, &args); va_end(args.args); warn_rcu_exit(rcu); } EXPORT_SYMBOL(warn_slowpath_fmt); #else void __warn_printk(const char *fmt, ...) 
{ bool rcu = warn_rcu_enter(); va_list args; pr_warn(CUT_HERE); va_start(args, fmt); vprintk(fmt, args); va_end(args); warn_rcu_exit(rcu); } EXPORT_SYMBOL(__warn_printk); #endif /* Support resetting WARN*_ONCE state */ static int clear_warn_once_set(void *data, u64 val) { generic_bug_clear_once(); memset(__start_once, 0, __end_once - __start_once); return 0; } DEFINE_DEBUGFS_ATTRIBUTE(clear_warn_once_fops, NULL, clear_warn_once_set, "%lld\n"); static __init int register_warn_debugfs(void) { /* Don't care about failure */ debugfs_create_file_unsafe("clear_warn_once", 0200, NULL, NULL, &clear_warn_once_fops); return 0; } device_initcall(register_warn_debugfs); #endif #ifdef CONFIG_STACKPROTECTOR /* * Called when gcc's -fstack-protector feature is used, and * gcc detects corruption of the on-stack canary value */ __visible noinstr void __stack_chk_fail(void) { unsigned long flags; instrumentation_begin(); flags = user_access_save(); panic("stack-protector: Kernel stack is corrupted in: %pB", __builtin_return_address(0)); user_access_restore(flags); instrumentation_end(); } EXPORT_SYMBOL(__stack_chk_fail); #endif core_param(panic, panic_timeout, int, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); core_param(panic_on_warn, panic_on_warn, int, 0644); core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644); core_param(panic_console_replay, panic_console_replay, bool, 0644); static int panic_print_set(const char *val, const struct kernel_param *kp) { panic_print_deprecated(); return param_set_ulong(val, kp); } static int panic_print_get(char *val, const struct kernel_param *kp) { return param_get_ulong(val, kp); } static const struct kernel_param_ops panic_print_ops = { .set = panic_print_set, .get = panic_print_get, }; __core_param_cb(panic_print, &panic_print_ops, &panic_print, 0644); static int __init oops_setup(char *s) { if (!s) return -EINVAL; if (!strcmp(s, "panic")) panic_on_oops = 1; return 0; } early_param("oops", oops_setup); static int __init panic_on_taint_setup(char *s) { char *taint_str; if (!s) return -EINVAL; taint_str = strsep(&s, ","); if (kstrtoul(taint_str, 16, &panic_on_taint)) return -EINVAL; /* make sure panic_on_taint doesn't hold out-of-range TAINT flags */ panic_on_taint &= TAINT_FLAGS_MAX; if (!panic_on_taint) return -EINVAL; if (s && !strcmp(s, "nousertaint")) panic_on_taint_nousertaint = true; pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%s\n", panic_on_taint, str_enabled_disabled(panic_on_taint_nousertaint)); return 0; } early_param("panic_on_taint", panic_on_taint_setup); |
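
/*
 * Editor's illustrative sketch -- not part of kernel/panic.c. It shows the
 * usual way a driver or module hooks into panic_notifier_list, which
 * vpanic() walks via atomic_notifier_call_chain() after the secondary CPUs
 * have been stopped. The names example_panic_cb/example_panic_nb and the
 * init hook are hypothetical; the notifier API itself is the regular one
 * from <linux/notifier.h> and <linux/panic_notifier.h>.
 */
static int example_panic_cb(struct notifier_block *nb, unsigned long action,
			    void *data)
{
	/* @data is the formatted panic message ("buf" in vpanic()) */
	pr_emerg("example: panic observed: %s\n", (const char *)data);

	/* keep it simple: no sleeping, no locks that may be held elsewhere */
	return NOTIFY_DONE;
}

static struct notifier_block example_panic_nb = {
	.notifier_call = example_panic_cb,
};

static int __init example_panic_notifier_init(void)
{
	atomic_notifier_chain_register(&panic_notifier_list, &example_panic_nb);
	return 0;
}
late_initcall(example_panic_notifier_init);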
2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM ext4

#if !defined(_TRACE_EXT4_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_EXT4_H

#include <linux/writeback.h>
#include <linux/tracepoint.h>

struct ext4_allocation_context;
struct ext4_allocation_request;
struct ext4_extent;
struct ext4_prealloc_space;
struct ext4_inode_info;
struct mpage_da_data;
struct ext4_map_blocks;
struct extent_status;
struct ext4_fsmap;
struct partial_cluster;

#define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode))

#define show_mballoc_flags(flags) __print_flags(flags, "|",	\
	{ EXT4_MB_HINT_MERGE,		"HINT_MERGE" },		\
	{ EXT4_MB_HINT_FIRST,		"HINT_FIRST" },		\
	{ EXT4_MB_HINT_DATA,		"HINT_DATA" },		\
	{ EXT4_MB_HINT_NOPREALLOC,	"HINT_NOPREALLOC" },	\
	{ EXT4_MB_HINT_GROUP_ALLOC,	"HINT_GRP_ALLOC" },	\
	{ EXT4_MB_HINT_GOAL_ONLY,	"HINT_GOAL_ONLY" },	\
	{ EXT4_MB_HINT_TRY_GOAL,	"HINT_TRY_GOAL" },	\
	{ EXT4_MB_DELALLOC_RESERVED,	"DELALLOC_RESV" },	\
	{ EXT4_MB_STREAM_ALLOC,		"STREAM_ALLOC" },	\
	{ EXT4_MB_USE_ROOT_BLOCKS,	"USE_ROOT_BLKS" },	\
	{ EXT4_MB_USE_RESERVED,		"USE_RESV" },		\
	{ EXT4_MB_STRICT_CHECK,		"STRICT_CHECK" })

#define show_map_flags(flags) __print_flags(flags, "|",			\
	{ EXT4_GET_BLOCKS_CREATE,		"CREATE" },		\
	{ EXT4_GET_BLOCKS_UNWRIT_EXT,		"UNWRIT" },		\
	{ EXT4_GET_BLOCKS_DELALLOC_RESERVE,	"DELALLOC" },		\
	{ EXT4_GET_BLOCKS_SPLIT_NOMERGE,	"SPLIT_NOMERGE" },	\
	{ EXT4_GET_BLOCKS_CONVERT,		"CONVERT" },		\
	{ EXT4_GET_BLOCKS_METADATA_NOFAIL,	"METADATA_NOFAIL" },	\
	{ EXT4_GET_BLOCKS_NO_NORMALIZE,		"NO_NORMALIZE" },	\
	{ EXT4_GET_BLOCKS_CONVERT_UNWRITTEN,	"CONVERT_UNWRITTEN" },	\
	{ EXT4_GET_BLOCKS_ZERO,			"ZERO" },		\
	{ EXT4_GET_BLOCKS_IO_SUBMIT,		"IO_SUBMIT" },		\
	{ EXT4_EX_NOCACHE,			"EX_NOCACHE" })

/*
 * __print_flags() requires that all enum values be wrapped in the
 * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace
 * ring buffer.
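 *
 * For example (illustrative note): the buffer-head state bits declared
 * below (BH_New, BH_Mapped, BH_Unwritten, BH_Boundary) back the
 * EXT4_MAP_* flags that show_mflags() decodes, so a map with the "new"
 * and "mapped" bits set is rendered as "NM" in the trace output.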
 */
TRACE_DEFINE_ENUM(BH_New);
TRACE_DEFINE_ENUM(BH_Mapped);
TRACE_DEFINE_ENUM(BH_Unwritten);
TRACE_DEFINE_ENUM(BH_Boundary);

#define show_mflags(flags) __print_flags(flags, "",	\
	{ EXT4_MAP_NEW,		"N" },			\
	{ EXT4_MAP_MAPPED,	"M" },			\
	{ EXT4_MAP_UNWRITTEN,	"U" },			\
	{ EXT4_MAP_BOUNDARY,	"B" })

#define show_free_flags(flags) __print_flags(flags, "|",		\
	{ EXT4_FREE_BLOCKS_METADATA,		"METADATA" },		\
	{ EXT4_FREE_BLOCKS_FORGET,		"FORGET" },		\
	{ EXT4_FREE_BLOCKS_VALIDATED,		"VALIDATED" },		\
	{ EXT4_FREE_BLOCKS_NO_QUOT_UPDATE,	"NO_QUOTA" },		\
	{ EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER, "1ST_CLUSTER" },	\
	{ EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER,	"LAST_CLUSTER" })

TRACE_DEFINE_ENUM(ES_WRITTEN_B);
TRACE_DEFINE_ENUM(ES_UNWRITTEN_B);
TRACE_DEFINE_ENUM(ES_DELAYED_B);
TRACE_DEFINE_ENUM(ES_HOLE_B);
TRACE_DEFINE_ENUM(ES_REFERENCED_B);

#define show_extent_status(status) __print_flags(status, "",	\
	{ EXTENT_STATUS_WRITTEN,	"W" },			\
	{ EXTENT_STATUS_UNWRITTEN,	"U" },			\
	{ EXTENT_STATUS_DELAYED,	"D" },			\
	{ EXTENT_STATUS_HOLE,		"H" },			\
	{ EXTENT_STATUS_REFERENCED,	"R" })

#define show_falloc_mode(mode) __print_flags(mode, "|",		\
	{ FALLOC_FL_KEEP_SIZE,		"KEEP_SIZE"},		\
	{ FALLOC_FL_PUNCH_HOLE,		"PUNCH_HOLE"},		\
	{ FALLOC_FL_COLLAPSE_RANGE,	"COLLAPSE_RANGE"},	\
	{ FALLOC_FL_ZERO_RANGE,		"ZERO_RANGE"},		\
	{ FALLOC_FL_WRITE_ZEROES,	"WRITE_ZEROES"})

TRACE_DEFINE_ENUM(EXT4_FC_REASON_XATTR);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_CROSS_RENAME);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_JOURNAL_FLAG_CHANGE);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_NOMEM);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_SWAP_BOOT);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_RESIZE);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_RENAME_DIR);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_FALLOC_RANGE);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_INODE_JOURNAL_DATA);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_ENCRYPTED_FILENAME);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_MIGRATE);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_VERITY);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_MOVE_EXT);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX);

#define show_fc_reason(reason)						\
	__print_symbolic(reason,					\
		{ EXT4_FC_REASON_XATTR,		"XATTR"},		\
		{ EXT4_FC_REASON_CROSS_RENAME,	"CROSS_RENAME"},	\
		{ EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, "JOURNAL_FLAG_CHANGE"}, \
		{ EXT4_FC_REASON_NOMEM,		"NO_MEM"},		\
		{ EXT4_FC_REASON_SWAP_BOOT,	"SWAP_BOOT"},		\
		{ EXT4_FC_REASON_RESIZE,	"RESIZE"},		\
		{ EXT4_FC_REASON_RENAME_DIR,	"RENAME_DIR"},		\
		{ EXT4_FC_REASON_FALLOC_RANGE,	"FALLOC_RANGE"},	\
		{ EXT4_FC_REASON_INODE_JOURNAL_DATA, "INODE_JOURNAL_DATA"}, \
		{ EXT4_FC_REASON_ENCRYPTED_FILENAME, "ENCRYPTED_FILENAME"}, \
		{ EXT4_FC_REASON_MIGRATE,	"MIGRATE"},		\
		{ EXT4_FC_REASON_VERITY,	"VERITY"},		\
		{ EXT4_FC_REASON_MOVE_EXT,	"MOVE_EXT"})

TRACE_DEFINE_ENUM(CR_POWER2_ALIGNED);
TRACE_DEFINE_ENUM(CR_GOAL_LEN_FAST);
TRACE_DEFINE_ENUM(CR_BEST_AVAIL_LEN);
TRACE_DEFINE_ENUM(CR_GOAL_LEN_SLOW);
TRACE_DEFINE_ENUM(CR_ANY_FREE);

#define show_criteria(cr)						\
	__print_symbolic(cr,						\
		{ CR_POWER2_ALIGNED,	"CR_POWER2_ALIGNED" },		\
		{ CR_GOAL_LEN_FAST,	"CR_GOAL_LEN_FAST" },		\
		{ CR_BEST_AVAIL_LEN,	"CR_BEST_AVAIL_LEN" },		\
		{ CR_GOAL_LEN_SLOW,	"CR_GOAL_LEN_SLOW" },		\
		{ CR_ANY_FREE,		"CR_ANY_FREE" })

TRACE_EVENT(ext4_other_inode_update_time,
	TP_PROTO(struct inode *inode, u64 orig_ino),

	TP_ARGS(inode, orig_ino),

	TP_STRUCT__entry(
		__field(	u64,	ino		)
		__field(	u64,	orig_ino	)
		__field(	dev_t,	dev		)
		__field(	uid_t,	uid		)
		__field(	gid_t,	gid		)
		__field(	__u16,	mode		)
	),

	TP_fast_assign(
		__entry->orig_ino = orig_ino;
		__entry->dev	= inode->i_sb->s_dev;
		__entry->ino	= inode->i_ino;
		__entry->uid	= i_uid_read(inode);
		__entry->gid	= i_gid_read(inode);
		__entry->mode	= inode->i_mode;
	),
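	/*
	 * Anatomy of every event in this file: TP_STRUCT__entry() declares
	 * the fields recorded in the ring buffer, TP_fast_assign() fills
	 * them in at the instrumentation point, and TP_printk() formats the
	 * saved fields only when the trace buffer is read back.
	 */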
TP_printk("dev %d,%d orig_ino %llu ino %llu mode 0%o uid %u gid %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->orig_ino, __entry->ino, __entry->mode, __entry->uid, __entry->gid) ); TRACE_EVENT(ext4_free_inode, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( u64, ino ) __field( __u64, blocks ) __field( dev_t, dev ) __field( uid_t, uid ) __field( gid_t, gid ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->uid = i_uid_read(inode); __entry->gid = i_gid_read(inode); __entry->blocks = inode->i_blocks; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %llu mode 0%o uid %u gid %u blocks %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->mode, __entry->uid, __entry->gid, __entry->blocks) ); TRACE_EVENT(ext4_request_inode, TP_PROTO(struct inode *dir, int mode), TP_ARGS(dir, mode), TP_STRUCT__entry( __field( u64, dir ) __field( dev_t, dev ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->dir = dir->i_ino; __entry->mode = mode; ), TP_printk("dev %d,%d dir %llu mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->dir, __entry->mode) ); TRACE_EVENT(ext4_allocate_inode, TP_PROTO(struct inode *inode, struct inode *dir, int mode), TP_ARGS(inode, dir, mode), TP_STRUCT__entry( __field( u64, ino ) __field( u64, dir ) __field( dev_t, dev ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->dir = dir->i_ino; __entry->mode = mode; ), TP_printk("dev %d,%d ino %llu dir %llu mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->dir, __entry->mode) ); TRACE_EVENT(ext4_evict_inode, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( u64, ino ) __field( dev_t, dev ) __field( int, nlink ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->nlink = inode->i_nlink; ), TP_printk("dev %d,%d ino %llu nlink %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->nlink) ); TRACE_EVENT(ext4_drop_inode, TP_PROTO(struct inode *inode, int drop), TP_ARGS(inode, drop), TP_STRUCT__entry( __field( u64, ino ) __field( dev_t, dev ) __field( int, drop ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->drop = drop; ), TP_printk("dev %d,%d ino %llu drop %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->drop) ); TRACE_EVENT(ext4_nfs_commit_metadata, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( u64, ino ) __field( dev_t, dev ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; ), TP_printk("dev %d,%d ino %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino) ); TRACE_EVENT(ext4_mark_inode_dirty, TP_PROTO(struct inode *inode, unsigned long IP), TP_ARGS(inode, IP), TP_STRUCT__entry( __field( u64, ino ) __field(unsigned long, ip ) __field( dev_t, dev ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->ip = IP; ), TP_printk("dev %d,%d ino %llu caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, (void *)__entry->ip) ); TRACE_EVENT(ext4_begin_ordered_truncate, TP_PROTO(struct inode *inode, loff_t new_size), TP_ARGS(inode, new_size), TP_STRUCT__entry( __field( u64, ino ) __field( loff_t, new_size ) __field( dev_t, dev ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->new_size = 
new_size; ), TP_printk("dev %d,%d ino %llu new_size %lld", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->new_size) ); DECLARE_EVENT_CLASS(ext4__write_begin, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len), TP_ARGS(inode, pos, len), TP_STRUCT__entry( __field( u64, ino ) __field( loff_t, pos ) __field( dev_t, dev ) __field( unsigned int, len ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = pos; __entry->len = len; ), TP_printk("dev %d,%d ino %llu pos %lld len %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pos, __entry->len) ); DEFINE_EVENT(ext4__write_begin, ext4_write_begin, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len), TP_ARGS(inode, pos, len) ); DEFINE_EVENT(ext4__write_begin, ext4_da_write_begin, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len), TP_ARGS(inode, pos, len) ); DECLARE_EVENT_CLASS(ext4__write_end, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, unsigned int copied), TP_ARGS(inode, pos, len, copied), TP_STRUCT__entry( __field( u64, ino ) __field( loff_t, pos ) __field( dev_t, dev ) __field( unsigned int, len ) __field( unsigned int, copied ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = pos; __entry->len = len; __entry->copied = copied; ), TP_printk("dev %d,%d ino %llu pos %lld len %u copied %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pos, __entry->len, __entry->copied) ); DEFINE_EVENT(ext4__write_end, ext4_write_end, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, unsigned int copied), TP_ARGS(inode, pos, len, copied) ); DEFINE_EVENT(ext4__write_end, ext4_journalled_write_end, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, unsigned int copied), TP_ARGS(inode, pos, len, copied) ); DEFINE_EVENT(ext4__write_end, ext4_da_write_end, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, unsigned int copied), TP_ARGS(inode, pos, len, copied) ); TRACE_EVENT(ext4_writepages, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc), TP_STRUCT__entry( __field( u64, ino ) __field( loff_t, range_start ) __field( loff_t, range_end ) __field( long, nr_to_write ) __field( long, pages_skipped ) __field( pgoff_t, writeback_index ) __field( dev_t, dev ) __field( int, sync_mode ) __field( char, for_kupdate ) __field( char, range_cyclic ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; __entry->range_start = wbc->range_start; __entry->range_end = wbc->range_end; __entry->writeback_index = inode->i_mapping->writeback_index; __entry->sync_mode = wbc->sync_mode; __entry->for_kupdate = wbc->for_kupdate; __entry->range_cyclic = wbc->range_cyclic; ), TP_printk("dev %d,%d ino %llu nr_to_write %ld pages_skipped %ld " "range_start %lld range_end %lld sync_mode %d " "for_kupdate %d range_cyclic %d writeback_index %lu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->nr_to_write, __entry->pages_skipped, __entry->range_start, __entry->range_end, __entry->sync_mode, __entry->for_kupdate, __entry->range_cyclic, (unsigned long) __entry->writeback_index) ); TRACE_EVENT(ext4_da_write_folios_start, TP_PROTO(struct inode *inode, loff_t start_pos, loff_t next_pos, struct writeback_control *wbc), TP_ARGS(inode, start_pos, next_pos, wbc), TP_STRUCT__entry( __field( u64, ino ) __field( loff_t, start_pos ) 
__field( loff_t, next_pos ) __field( long, nr_to_write ) __field( dev_t, dev ) __field( int, sync_mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->start_pos = start_pos; __entry->next_pos = next_pos; __entry->nr_to_write = wbc->nr_to_write; __entry->sync_mode = wbc->sync_mode; ), TP_printk("dev %d,%d ino %llu start_pos 0x%llx next_pos 0x%llx nr_to_write %ld sync_mode %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->start_pos, __entry->next_pos, __entry->nr_to_write, __entry->sync_mode) ); TRACE_EVENT(ext4_da_write_folios_end, TP_PROTO(struct inode *inode, loff_t start_pos, loff_t next_pos, struct writeback_control *wbc, int ret), TP_ARGS(inode, start_pos, next_pos, wbc, ret), TP_STRUCT__entry( __field( u64, ino ) __field( loff_t, start_pos ) __field( loff_t, next_pos ) __field( long, nr_to_write ) __field( dev_t, dev ) __field( int, ret ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->start_pos = start_pos; __entry->next_pos = next_pos; __entry->nr_to_write = wbc->nr_to_write; __entry->ret = ret; ), TP_printk("dev %d,%d ino %llu start_pos 0x%llx next_pos 0x%llx nr_to_write %ld ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->start_pos, __entry->next_pos, __entry->nr_to_write, __entry->ret) ); TRACE_EVENT(ext4_da_write_pages_extent, TP_PROTO(struct inode *inode, struct ext4_map_blocks *map), TP_ARGS(inode, map), TP_STRUCT__entry( __field( u64, ino ) __field( __u64, lblk ) __field( dev_t, dev ) __field( __u32, len ) __field( __u32, flags ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = map->m_lblk; __entry->len = map->m_len; __entry->flags = map->m_flags; ), TP_printk("dev %d,%d ino %llu lblk %llu len %u flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->lblk, __entry->len, show_mflags(__entry->flags)) ); TRACE_EVENT(ext4_writepages_result, TP_PROTO(struct inode *inode, struct writeback_control *wbc, int ret, int pages_written), TP_ARGS(inode, wbc, ret, pages_written), TP_STRUCT__entry( __field( u64, ino ) __field( long, pages_skipped ) __field( pgoff_t, writeback_index ) __field( dev_t, dev ) __field( int, ret ) __field( int, pages_written ) __field( int, sync_mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->ret = ret; __entry->pages_written = pages_written; __entry->pages_skipped = wbc->pages_skipped; __entry->writeback_index = inode->i_mapping->writeback_index; __entry->sync_mode = wbc->sync_mode; ), TP_printk("dev %d,%d ino %llu ret %d pages_written %d pages_skipped %ld " "sync_mode %d writeback_index %lu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->ret, __entry->pages_written, __entry->pages_skipped, __entry->sync_mode, (unsigned long) __entry->writeback_index) ); DECLARE_EVENT_CLASS(ext4__folio_op, TP_PROTO(struct inode *inode, struct folio *folio), TP_ARGS(inode, folio), TP_STRUCT__entry( __field( u64, ino ) __field( pgoff_t, index ) __field( dev_t, dev ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->index = folio->index; ), TP_printk("dev %d,%d ino %llu folio_index %lu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, (unsigned long) __entry->index) ); DEFINE_EVENT(ext4__folio_op, ext4_read_folio, TP_PROTO(struct inode *inode, struct folio *folio), TP_ARGS(inode, folio) ); DEFINE_EVENT(ext4__folio_op, ext4_release_folio, TP_PROTO(struct inode 
*inode, struct folio *folio), TP_ARGS(inode, folio) ); DECLARE_EVENT_CLASS(ext4_invalidate_folio_op, TP_PROTO(struct folio *folio, size_t offset, size_t length), TP_ARGS(folio, offset, length), TP_STRUCT__entry( __field( u64, ino ) __field( pgoff_t, index ) __field( size_t, offset ) __field( size_t, length ) __field( dev_t, dev ) ), TP_fast_assign( __entry->dev = folio->mapping->host->i_sb->s_dev; __entry->ino = folio->mapping->host->i_ino; __entry->index = folio->index; __entry->offset = offset; __entry->length = length; ), TP_printk("dev %d,%d ino %llu folio_index %lu offset %zu length %zu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, (unsigned long) __entry->index, __entry->offset, __entry->length) ); DEFINE_EVENT(ext4_invalidate_folio_op, ext4_invalidate_folio, TP_PROTO(struct folio *folio, size_t offset, size_t length), TP_ARGS(folio, offset, length) ); DEFINE_EVENT(ext4_invalidate_folio_op, ext4_journalled_invalidate_folio, TP_PROTO(struct folio *folio, size_t offset, size_t length), TP_ARGS(folio, offset, length) ); TRACE_EVENT(ext4_discard_blocks, TP_PROTO(struct super_block *sb, unsigned long long blk, unsigned long long count), TP_ARGS(sb, blk, count), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u64, blk ) __field( __u64, count ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->blk = blk; __entry->count = count; ), TP_printk("dev %d,%d blk %llu count %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blk, __entry->count) ); DECLARE_EVENT_CLASS(ext4__mb_new_pa, TP_PROTO(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa), TP_ARGS(ac, pa), TP_STRUCT__entry( __field( u64, ino ) __field( __u64, pa_pstart ) __field( __u64, pa_lstart ) __field( dev_t, dev ) __field( __u32, pa_len ) ), TP_fast_assign( __entry->dev = ac->ac_sb->s_dev; __entry->ino = ac->ac_inode->i_ino; __entry->pa_pstart = pa->pa_pstart; __entry->pa_lstart = pa->pa_lstart; __entry->pa_len = pa->pa_len; ), TP_printk("dev %d,%d ino %llu pstart %llu len %u lstart %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart) ); DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_inode_pa, TP_PROTO(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa), TP_ARGS(ac, pa) ); DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_group_pa, TP_PROTO(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa), TP_ARGS(ac, pa) ); TRACE_EVENT(ext4_mb_release_inode_pa, TP_PROTO(struct ext4_prealloc_space *pa, unsigned long long block, unsigned int count), TP_ARGS(pa, block, count), TP_STRUCT__entry( __field( u64, ino ) __field( __u64, block ) __field( dev_t, dev ) __field( __u32, count ) ), TP_fast_assign( __entry->dev = pa->pa_inode->i_sb->s_dev; __entry->ino = pa->pa_inode->i_ino; __entry->block = block; __entry->count = count; ), TP_printk("dev %d,%d ino %llu block %llu count %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->block, __entry->count) ); TRACE_EVENT(ext4_mb_release_group_pa, TP_PROTO(struct super_block *sb, struct ext4_prealloc_space *pa), TP_ARGS(sb, pa), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u64, pa_pstart ) __field( __u32, pa_len ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->pa_pstart = pa->pa_pstart; __entry->pa_len = pa->pa_len; ), TP_printk("dev %d,%d pstart %llu len %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->pa_pstart, __entry->pa_len) ); TRACE_EVENT(ext4_discard_preallocations, TP_PROTO(struct inode *inode, unsigned int len), TP_ARGS(inode, len), 
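	/*
	 * Note: mballoc maintains per-inode and per-group preallocated
	 * ranges; this event fires when an inode's preallocations are
	 * released (the related allocation hints are decoded by
	 * show_mballoc_flags() above).
	 */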
TP_STRUCT__entry( __field( u64, ino ) __field( dev_t, dev ) __field( unsigned int, len ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->len = len; ), TP_printk("dev %d,%d ino %llu len: %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->len) ); TRACE_EVENT(ext4_mb_discard_preallocations, TP_PROTO(struct super_block *sb, int needed), TP_ARGS(sb, needed), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, needed ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->needed = needed; ), TP_printk("dev %d,%d needed %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->needed) ); TRACE_EVENT(ext4_request_blocks, TP_PROTO(struct ext4_allocation_request *ar), TP_ARGS(ar), TP_STRUCT__entry( __field( u64, ino ) __field( __u64, goal ) __field( __u64, pleft ) __field( __u64, pright ) __field( dev_t, dev ) __field( unsigned int, len ) __field( __u32, logical ) __field( __u32, lleft ) __field( __u32, lright ) __field( unsigned int, flags ) ), TP_fast_assign( __entry->dev = ar->inode->i_sb->s_dev; __entry->ino = ar->inode->i_ino; __entry->len = ar->len; __entry->logical = ar->logical; __entry->goal = ar->goal; __entry->lleft = ar->lleft; __entry->lright = ar->lright; __entry->pleft = ar->pleft; __entry->pright = ar->pright; __entry->flags = ar->flags; ), TP_printk("dev %d,%d ino %llu flags %s len %u lblk %u goal %llu " "lleft %u lright %u pleft %llu pright %llu ", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, show_mballoc_flags(__entry->flags), __entry->len, __entry->logical, __entry->goal, __entry->lleft, __entry->lright, __entry->pleft, __entry->pright) ); TRACE_EVENT(ext4_allocate_blocks, TP_PROTO(struct ext4_allocation_request *ar, unsigned long long block), TP_ARGS(ar, block), TP_STRUCT__entry( __field( u64, ino ) __field( __u64, block ) __field( __u64, goal ) __field( __u64, pleft ) __field( __u64, pright ) __field( dev_t, dev ) __field( unsigned int, len ) __field( __u32, logical ) __field( __u32, lleft ) __field( __u32, lright ) __field( unsigned int, flags ) ), TP_fast_assign( __entry->dev = ar->inode->i_sb->s_dev; __entry->ino = ar->inode->i_ino; __entry->block = block; __entry->len = ar->len; __entry->logical = ar->logical; __entry->goal = ar->goal; __entry->lleft = ar->lleft; __entry->lright = ar->lright; __entry->pleft = ar->pleft; __entry->pright = ar->pright; __entry->flags = ar->flags; ), TP_printk("dev %d,%d ino %llu flags %s len %u block %llu lblk %u " "goal %llu lleft %u lright %u pleft %llu pright %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, show_mballoc_flags(__entry->flags), __entry->len, __entry->block, __entry->logical, __entry->goal, __entry->lleft, __entry->lright, __entry->pleft, __entry->pright) ); TRACE_EVENT(ext4_free_blocks, TP_PROTO(struct inode *inode, __u64 block, unsigned long count, int flags), TP_ARGS(inode, block, count, flags), TP_STRUCT__entry( __field( u64, ino ) __field( __u64, block ) __field( unsigned long, count ) __field( dev_t, dev ) __field( int, flags ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->block = block; __entry->count = count; __entry->flags = flags; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %llu mode 0%o block %llu count %lu flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->mode, __entry->block, __entry->count, show_free_flags(__entry->flags)) ); TRACE_EVENT(ext4_sync_file_enter, TP_PROTO(struct file *file, int datasync), TP_ARGS(file, 
datasync), TP_STRUCT__entry( __field( u64, ino ) __field( u64, parent ) __field( dev_t, dev ) __field( int, datasync ) ), TP_fast_assign( struct dentry *dentry = file->f_path.dentry; __entry->dev = dentry->d_sb->s_dev; __entry->ino = d_inode(dentry)->i_ino; __entry->datasync = datasync; __entry->parent = d_inode(dentry->d_parent)->i_ino; ), TP_printk("dev %d,%d ino %llu parent %llu datasync %d ", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->parent, __entry->datasync) ); TRACE_EVENT(ext4_sync_file_exit, TP_PROTO(struct inode *inode, int ret), TP_ARGS(inode, ret), TP_STRUCT__entry( __field( u64, ino ) __field( dev_t, dev ) __field( int, ret ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->ret = ret; ), TP_printk("dev %d,%d ino %llu ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->ret) ); TRACE_EVENT(ext4_sync_fs, TP_PROTO(struct super_block *sb, int wait), TP_ARGS(sb, wait), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, wait ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->wait = wait; ), TP_printk("dev %d,%d wait %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->wait) ); TRACE_EVENT(ext4_alloc_da_blocks, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( u64, ino ) __field( dev_t, dev ) __field( unsigned int, data_blocks ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->data_blocks = EXT4_I(inode)->i_reserved_data_blocks; ), TP_printk("dev %d,%d ino %llu reserved_data_blocks %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->data_blocks) ); TRACE_EVENT(ext4_mballoc_alloc, TP_PROTO(struct ext4_allocation_context *ac), TP_ARGS(ac), TP_STRUCT__entry( __field( u64, ino ) __field( dev_t, dev ) __field( __u32, orig_logical ) __field( int, orig_start ) __field( __u32, orig_group ) __field( int, orig_len ) __field( __u32, goal_logical ) __field( int, goal_start ) __field( __u32, goal_group ) __field( int, goal_len ) __field( __u32, result_logical ) __field( int, result_start ) __field( __u32, result_group ) __field( int, result_len ) __field( __u16, found ) __field( __u16, groups ) __field( __u16, buddy ) __field( __u16, flags ) __field( __u16, tail ) __field( __u8, cr ) ), TP_fast_assign( __entry->dev = ac->ac_inode->i_sb->s_dev; __entry->ino = ac->ac_inode->i_ino; __entry->orig_logical = ac->ac_o_ex.fe_logical; __entry->orig_start = ac->ac_o_ex.fe_start; __entry->orig_group = ac->ac_o_ex.fe_group; __entry->orig_len = ac->ac_o_ex.fe_len; __entry->goal_logical = ac->ac_g_ex.fe_logical; __entry->goal_start = ac->ac_g_ex.fe_start; __entry->goal_group = ac->ac_g_ex.fe_group; __entry->goal_len = ac->ac_g_ex.fe_len; __entry->result_logical = ac->ac_f_ex.fe_logical; __entry->result_start = ac->ac_f_ex.fe_start; __entry->result_group = ac->ac_f_ex.fe_group; __entry->result_len = ac->ac_f_ex.fe_len; __entry->found = ac->ac_found; __entry->flags = ac->ac_flags; __entry->groups = ac->ac_groups_scanned; __entry->buddy = ac->ac_buddy; __entry->tail = ac->ac_tail; __entry->cr = ac->ac_criteria; ), TP_printk("dev %d,%d inode %llu orig %u/%d/%u@%u goal %u/%d/%u@%u " "result %u/%d/%u@%u blks %u grps %u cr %s flags %s " "tail %u broken %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->orig_group, __entry->orig_start, __entry->orig_len, __entry->orig_logical, __entry->goal_group, __entry->goal_start, __entry->goal_len, __entry->goal_logical, __entry->result_group, __entry->result_start, 
__entry->result_len, __entry->result_logical, __entry->found, __entry->groups, show_criteria(__entry->cr), show_mballoc_flags(__entry->flags), __entry->tail, __entry->buddy ? 1 << __entry->buddy : 0) ); TRACE_EVENT(ext4_mballoc_prealloc, TP_PROTO(struct ext4_allocation_context *ac), TP_ARGS(ac), TP_STRUCT__entry( __field( u64, ino ) __field( dev_t, dev ) __field( __u32, orig_logical ) __field( int, orig_start ) __field( __u32, orig_group ) __field( int, orig_len ) __field( __u32, result_logical ) __field( int, result_start ) __field( __u32, result_group ) __field( int, result_len ) ), TP_fast_assign( __entry->dev = ac->ac_inode->i_sb->s_dev; __entry->ino = ac->ac_inode->i_ino; __entry->orig_logical = ac->ac_o_ex.fe_logical; __entry->orig_start = ac->ac_o_ex.fe_start; __entry->orig_group = ac->ac_o_ex.fe_group; __entry->orig_len = ac->ac_o_ex.fe_len; __entry->result_logical = ac->ac_b_ex.fe_logical; __entry->result_start = ac->ac_b_ex.fe_start; __entry->result_group = ac->ac_b_ex.fe_group; __entry->result_len = ac->ac_b_ex.fe_len; ), TP_printk("dev %d,%d inode %llu orig %u/%d/%u@%u result %u/%d/%u@%u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->orig_group, __entry->orig_start, __entry->orig_len, __entry->orig_logical, __entry->result_group, __entry->result_start, __entry->result_len, __entry->result_logical) ); DECLARE_EVENT_CLASS(ext4__mballoc, TP_PROTO(struct super_block *sb, struct inode *inode, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, inode, group, start, len), TP_STRUCT__entry( __field( u64, ino ) __field( dev_t, dev ) __field( int, result_start ) __field( __u32, result_group ) __field( int, result_len ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->ino = inode ? inode->i_ino : 0; __entry->result_start = start; __entry->result_group = group; __entry->result_len = len; ), TP_printk("dev %d,%d inode %llu extent %u/%d/%d ", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->result_group, __entry->result_start, __entry->result_len) ); DEFINE_EVENT(ext4__mballoc, ext4_mballoc_discard, TP_PROTO(struct super_block *sb, struct inode *inode, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, inode, group, start, len) ); DEFINE_EVENT(ext4__mballoc, ext4_mballoc_free, TP_PROTO(struct super_block *sb, struct inode *inode, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, inode, group, start, len) ); TRACE_EVENT(ext4_forget, TP_PROTO(struct inode *inode, int is_metadata, __u64 block), TP_ARGS(inode, is_metadata, block), TP_STRUCT__entry( __field( u64, ino ) __field( __u64, block ) __field( dev_t, dev ) __field( int, is_metadata ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->block = block; __entry->is_metadata = is_metadata; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %llu mode 0%o is_metadata %d block %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->mode, __entry->is_metadata, __entry->block) ); TRACE_EVENT(ext4_da_update_reserve_space, TP_PROTO(struct inode *inode, int used_blocks, int quota_claim), TP_ARGS(inode, used_blocks, quota_claim), TP_STRUCT__entry( __field( u64, ino ) __field( __u64, i_blocks ) __field( dev_t, dev ) __field( int, used_blocks ) __field( int, reserved_data_blocks ) __field( int, quota_claim ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->i_blocks = inode->i_blocks; 
__entry->used_blocks = used_blocks; __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; __entry->quota_claim = quota_claim; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %llu mode 0%o i_blocks %llu used_blocks %d " "reserved_data_blocks %d quota_claim %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->mode, __entry->i_blocks, __entry->used_blocks, __entry->reserved_data_blocks, __entry->quota_claim) ); TRACE_EVENT(ext4_da_reserve_space, TP_PROTO(struct inode *inode, int nr_resv), TP_ARGS(inode, nr_resv), TP_STRUCT__entry( __field( u64, ino ) __field( __u64, i_blocks ) __field( dev_t, dev ) __field( int, reserve_blocks ) __field( int, reserved_data_blocks ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->i_blocks = inode->i_blocks; __entry->reserve_blocks = nr_resv; __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %llu mode 0%o i_blocks %llu reserve_blocks %d" "reserved_data_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->mode, __entry->i_blocks, __entry->reserve_blocks, __entry->reserved_data_blocks) ); TRACE_EVENT(ext4_da_release_space, TP_PROTO(struct inode *inode, int freed_blocks), TP_ARGS(inode, freed_blocks), TP_STRUCT__entry( __field( u64, ino ) __field( __u64, i_blocks ) __field( dev_t, dev ) __field( int, freed_blocks ) __field( int, reserved_data_blocks ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->i_blocks = inode->i_blocks; __entry->freed_blocks = freed_blocks; __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %llu mode 0%o i_blocks %llu freed_blocks %d " "reserved_data_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->mode, __entry->i_blocks, __entry->freed_blocks, __entry->reserved_data_blocks) ); DECLARE_EVENT_CLASS(ext4__bitmap_load, TP_PROTO(struct super_block *sb, unsigned long group), TP_ARGS(sb, group), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u32, group ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->group = group; ), TP_printk("dev %d,%d group %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group) ); DEFINE_EVENT(ext4__bitmap_load, ext4_mb_bitmap_load, TP_PROTO(struct super_block *sb, unsigned long group), TP_ARGS(sb, group) ); DEFINE_EVENT(ext4__bitmap_load, ext4_mb_buddy_bitmap_load, TP_PROTO(struct super_block *sb, unsigned long group), TP_ARGS(sb, group) ); DEFINE_EVENT(ext4__bitmap_load, ext4_load_inode_bitmap, TP_PROTO(struct super_block *sb, unsigned long group), TP_ARGS(sb, group) ); TRACE_EVENT(ext4_read_block_bitmap_load, TP_PROTO(struct super_block *sb, unsigned long group, bool prefetch), TP_ARGS(sb, group, prefetch), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u32, group ) __field( bool, prefetch ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->group = group; __entry->prefetch = prefetch; ), TP_printk("dev %d,%d group %u prefetch %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group, __entry->prefetch) ); DECLARE_EVENT_CLASS(ext4__fallocate_mode, TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), TP_ARGS(inode, offset, len, mode), TP_STRUCT__entry( __field( u64, ino ) __field( loff_t, offset ) __field( loff_t, len ) __field( dev_t, dev ) __field( int, mode ) ), TP_fast_assign( __entry->dev = 
inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->offset = offset; __entry->len = len; __entry->mode = mode; ), TP_printk("dev %d,%d ino %llu offset %lld len %lld mode %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->offset, __entry->len, show_falloc_mode(__entry->mode)) ); DEFINE_EVENT(ext4__fallocate_mode, ext4_fallocate_enter, TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), TP_ARGS(inode, offset, len, mode) ); DEFINE_EVENT(ext4__fallocate_mode, ext4_punch_hole, TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), TP_ARGS(inode, offset, len, mode) ); DEFINE_EVENT(ext4__fallocate_mode, ext4_zero_range, TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), TP_ARGS(inode, offset, len, mode) ); TRACE_EVENT(ext4_fallocate_exit, TP_PROTO(struct inode *inode, loff_t offset, unsigned int max_blocks, int ret), TP_ARGS(inode, offset, max_blocks, ret), TP_STRUCT__entry( __field( u64, ino ) __field( loff_t, pos ) __field( dev_t, dev ) __field( unsigned int, blocks ) __field( int, ret ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = offset; __entry->blocks = max_blocks; __entry->ret = ret; ), TP_printk("dev %d,%d ino %llu pos %lld blocks %u ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pos, __entry->blocks, __entry->ret) ); TRACE_EVENT(ext4_unlink_enter, TP_PROTO(struct inode *parent, struct dentry *dentry), TP_ARGS(parent, dentry), TP_STRUCT__entry( __field( u64, ino ) __field( u64, parent ) __field( loff_t, size ) __field( dev_t, dev ) ), TP_fast_assign( __entry->dev = dentry->d_sb->s_dev; __entry->ino = d_inode(dentry)->i_ino; __entry->parent = parent->i_ino; __entry->size = d_inode(dentry)->i_size; ), TP_printk("dev %d,%d ino %llu size %lld parent %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, __entry->parent) ); TRACE_EVENT(ext4_unlink_exit, TP_PROTO(struct dentry *dentry, int ret), TP_ARGS(dentry, ret), TP_STRUCT__entry( __field( u64, ino ) __field( dev_t, dev ) __field( int, ret ) ), TP_fast_assign( __entry->dev = dentry->d_sb->s_dev; __entry->ino = d_inode(dentry)->i_ino; __entry->ret = ret; ), TP_printk("dev %d,%d ino %llu ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->ret) ); DECLARE_EVENT_CLASS(ext4__truncate, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( u64, ino ) __field( __u64, blocks ) __field( dev_t, dev ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->blocks = inode->i_blocks; ), TP_printk("dev %d,%d ino %llu blocks %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->blocks) ); DEFINE_EVENT(ext4__truncate, ext4_truncate_enter, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(ext4__truncate, ext4_truncate_exit, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); /* 'ux' is the unwritten extent. 
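 * An unwritten extent has blocks allocated on disk (for example via
 * fallocate()) that have not yet been written; reads of that range return
 * zeroes until the extent is converted to an initialized one.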
*/ TRACE_EVENT(ext4_ext_convert_to_initialized_enter, TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, struct ext4_extent *ux), TP_ARGS(inode, map, ux), TP_STRUCT__entry( __field( u64, ino ) __field( ext4_fsblk_t, u_pblk ) __field( dev_t, dev ) __field( ext4_lblk_t, m_lblk ) __field( unsigned, m_len ) __field( ext4_lblk_t, u_lblk ) __field( unsigned, u_len ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->m_lblk = map->m_lblk; __entry->m_len = map->m_len; __entry->u_lblk = le32_to_cpu(ux->ee_block); __entry->u_len = ext4_ext_get_actual_len(ux); __entry->u_pblk = ext4_ext_pblock(ux); ), TP_printk("dev %d,%d ino %llu m_lblk %u m_len %u u_lblk %u u_len %u " "u_pblk %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->m_lblk, __entry->m_len, __entry->u_lblk, __entry->u_len, __entry->u_pblk) ); /* * 'ux' is the unwritten extent. * 'ix' is the initialized extent to which blocks are transferred. */ TRACE_EVENT(ext4_ext_convert_to_initialized_fastpath, TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, struct ext4_extent *ux, struct ext4_extent *ix), TP_ARGS(inode, map, ux, ix), TP_STRUCT__entry( __field( u64, ino ) __field( ext4_fsblk_t, u_pblk ) __field( ext4_fsblk_t, i_pblk ) __field( dev_t, dev ) __field( ext4_lblk_t, m_lblk ) __field( unsigned, m_len ) __field( ext4_lblk_t, u_lblk ) __field( unsigned, u_len ) __field( ext4_lblk_t, i_lblk ) __field( unsigned, i_len ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->m_lblk = map->m_lblk; __entry->m_len = map->m_len; __entry->u_lblk = le32_to_cpu(ux->ee_block); __entry->u_len = ext4_ext_get_actual_len(ux); __entry->u_pblk = ext4_ext_pblock(ux); __entry->i_lblk = le32_to_cpu(ix->ee_block); __entry->i_len = ext4_ext_get_actual_len(ix); __entry->i_pblk = ext4_ext_pblock(ix); ), TP_printk("dev %d,%d ino %llu m_lblk %u m_len %u " "u_lblk %u u_len %u u_pblk %llu " "i_lblk %u i_len %u i_pblk %llu ", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->m_lblk, __entry->m_len, __entry->u_lblk, __entry->u_len, __entry->u_pblk, __entry->i_lblk, __entry->i_len, __entry->i_pblk) ); DECLARE_EVENT_CLASS(ext4__map_blocks_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len, unsigned int flags), TP_ARGS(inode, lblk, len, flags), TP_STRUCT__entry( __field( u64, ino ) __field( dev_t, dev ) __field( ext4_lblk_t, lblk ) __field( unsigned int, len ) __field( unsigned int, flags ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = lblk; __entry->len = len; __entry->flags = flags; ), TP_printk("dev %d,%d ino %llu lblk %u len %u flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->lblk, __entry->len, show_map_flags(__entry->flags)) ); DEFINE_EVENT(ext4__map_blocks_enter, ext4_ext_map_blocks_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned len, unsigned flags), TP_ARGS(inode, lblk, len, flags) ); DEFINE_EVENT(ext4__map_blocks_enter, ext4_ind_map_blocks_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned len, unsigned flags), TP_ARGS(inode, lblk, len, flags) ); DECLARE_EVENT_CLASS(ext4__map_blocks_exit, TP_PROTO(struct inode *inode, unsigned flags, struct ext4_map_blocks *map, int ret), TP_ARGS(inode, flags, map, ret), TP_STRUCT__entry( __field( u64, ino ) __field( ext4_fsblk_t, pblk ) __field( dev_t, dev ) __field( unsigned int, flags ) __field( ext4_lblk_t, lblk ) __field( unsigned int, len ) __field( unsigned int, mflags 
) __field( int, ret ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->flags = flags; __entry->pblk = map->m_pblk; __entry->lblk = map->m_lblk; __entry->len = map->m_len; __entry->mflags = map->m_flags; __entry->ret = ret; ), TP_printk("dev %d,%d ino %llu flags %s lblk %u pblk %llu len %u " "mflags %s ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, show_map_flags(__entry->flags), __entry->lblk, __entry->pblk, __entry->len, show_mflags(__entry->mflags), __entry->ret) ); DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit, TP_PROTO(struct inode *inode, unsigned flags, struct ext4_map_blocks *map, int ret), TP_ARGS(inode, flags, map, ret) ); DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit, TP_PROTO(struct inode *inode, unsigned flags, struct ext4_map_blocks *map, int ret), TP_ARGS(inode, flags, map, ret) ); TRACE_EVENT(ext4_ext_load_extent, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk), TP_ARGS(inode, lblk, pblk), TP_STRUCT__entry( __field( u64, ino ) __field( ext4_fsblk_t, pblk ) __field( dev_t, dev ) __field( ext4_lblk_t, lblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pblk = pblk; __entry->lblk = lblk; ), TP_printk("dev %d,%d ino %llu lblk %u pblk %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->lblk, __entry->pblk) ); TRACE_EVENT(ext4_load_inode, TP_PROTO(struct super_block *sb, unsigned long ino), TP_ARGS(sb, ino), TP_STRUCT__entry( __field( u64, ino ) __field( dev_t, dev ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->ino = ino; ), TP_printk("dev %d,%d ino %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino) ); TRACE_EVENT(ext4_journal_start_sb, TP_PROTO(struct super_block *sb, int blocks, int rsv_blocks, int revoke_creds, int type, unsigned long IP), TP_ARGS(sb, blocks, rsv_blocks, revoke_creds, type, IP), TP_STRUCT__entry( __field( dev_t, dev ) __field( unsigned long, ip ) __field( int, blocks ) __field( int, rsv_blocks ) __field( int, revoke_creds ) __field( int, type ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->ip = IP; __entry->blocks = blocks; __entry->rsv_blocks = rsv_blocks; __entry->revoke_creds = revoke_creds; __entry->type = type; ), TP_printk("dev %d,%d blocks %d, rsv_blocks %d, revoke_creds %d," " type %d, caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blocks, __entry->rsv_blocks, __entry->revoke_creds, __entry->type, (void *)__entry->ip) ); TRACE_EVENT(ext4_journal_start_inode, TP_PROTO(struct inode *inode, int blocks, int rsv_blocks, int revoke_creds, int type, unsigned long IP), TP_ARGS(inode, blocks, rsv_blocks, revoke_creds, type, IP), TP_STRUCT__entry( __field( u64, ino ) __field( dev_t, dev ) __field( unsigned long, ip ) __field( int, blocks ) __field( int, rsv_blocks ) __field( int, revoke_creds ) __field( int, type ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ip = IP; __entry->blocks = blocks; __entry->rsv_blocks = rsv_blocks; __entry->revoke_creds = revoke_creds; __entry->type = type; __entry->ino = inode->i_ino; ), TP_printk("dev %d,%d blocks %d, rsv_blocks %d, revoke_creds %d," " type %d, ino %llu, caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blocks, __entry->rsv_blocks, __entry->revoke_creds, __entry->type, __entry->ino, (void *)__entry->ip) ); TRACE_EVENT(ext4_journal_start_reserved, TP_PROTO(struct super_block *sb, int blocks, unsigned long IP), TP_ARGS(sb, blocks, IP), TP_STRUCT__entry( __field( 
dev_t, dev ) __field(unsigned long, ip ) __field( int, blocks ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->ip = IP; __entry->blocks = blocks; ), TP_printk("dev %d,%d blocks, %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blocks, (void *)__entry->ip) ); DECLARE_EVENT_CLASS(ext4__trim, TP_PROTO(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, group, start, len), TP_STRUCT__entry( __field( int, dev_major ) __field( int, dev_minor ) __field( __u32, group ) __field( int, start ) __field( int, len ) ), TP_fast_assign( __entry->dev_major = MAJOR(sb->s_dev); __entry->dev_minor = MINOR(sb->s_dev); __entry->group = group; __entry->start = start; __entry->len = len; ), TP_printk("dev %d,%d group %u, start %d, len %d", __entry->dev_major, __entry->dev_minor, __entry->group, __entry->start, __entry->len) ); DEFINE_EVENT(ext4__trim, ext4_trim_extent, TP_PROTO(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, group, start, len) ); DEFINE_EVENT(ext4__trim, ext4_trim_all_free, TP_PROTO(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, group, start, len) ); TRACE_EVENT(ext4_ext_handle_unwritten_extents, TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int flags, unsigned int allocated, ext4_fsblk_t newblock), TP_ARGS(inode, map, flags, allocated, newblock), TP_STRUCT__entry( __field( u64, ino ) __field( ext4_fsblk_t, pblk ) __field( ext4_fsblk_t, newblk ) __field( dev_t, dev ) __field( int, flags ) __field( ext4_lblk_t, lblk ) __field( unsigned int, len ) __field( unsigned int, allocated ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->flags = flags; __entry->lblk = map->m_lblk; __entry->pblk = map->m_pblk; __entry->len = map->m_len; __entry->allocated = allocated; __entry->newblk = newblock; ), TP_printk("dev %d,%d ino %llu m_lblk %u m_pblk %llu m_len %u flags %s " "allocated %d newblock %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, (unsigned) __entry->lblk, (unsigned long long) __entry->pblk, __entry->len, show_map_flags(__entry->flags), (unsigned int) __entry->allocated, (unsigned long long) __entry->newblk) ); TRACE_EVENT(ext4_get_implied_cluster_alloc_exit, TP_PROTO(struct super_block *sb, struct ext4_map_blocks *map, int ret), TP_ARGS(sb, map, ret), TP_STRUCT__entry( __field( dev_t, dev ) __field( unsigned int, flags ) __field( ext4_lblk_t, lblk ) __field( ext4_fsblk_t, pblk ) __field( unsigned int, len ) __field( int, ret ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->flags = map->m_flags; __entry->lblk = map->m_lblk; __entry->pblk = map->m_pblk; __entry->len = map->m_len; __entry->ret = ret; ), TP_printk("dev %d,%d m_lblk %u m_pblk %llu m_len %u m_flags %s ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->lblk, (unsigned long long) __entry->pblk, __entry->len, show_mflags(__entry->flags), __entry->ret) ); TRACE_EVENT(ext4_ext_show_extent, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, unsigned short len), TP_ARGS(inode, lblk, pblk, len), TP_STRUCT__entry( __field( u64, ino ) __field( ext4_fsblk_t, pblk ) __field( dev_t, dev ) __field( ext4_lblk_t, lblk ) __field( unsigned short, len ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pblk = pblk; __entry->lblk = lblk; __entry->len = len; ), TP_printk("dev %d,%d ino %llu lblk %u pblk %llu len %u", MAJOR(__entry->dev), 
MINOR(__entry->dev), __entry->ino, (unsigned) __entry->lblk, (unsigned long long) __entry->pblk, (unsigned short) __entry->len) ); TRACE_EVENT(ext4_remove_blocks, TP_PROTO(struct inode *inode, struct ext4_extent *ex, ext4_lblk_t from, ext4_fsblk_t to, struct partial_cluster *pc), TP_ARGS(inode, ex, from, to, pc), TP_STRUCT__entry( __field( u64, ino ) __field( ext4_fsblk_t, ee_pblk ) __field( ext4_fsblk_t, pc_pclu ) __field( dev_t, dev ) __field( ext4_lblk_t, from ) __field( ext4_lblk_t, to ) __field( ext4_lblk_t, ee_lblk ) __field( unsigned short, ee_len ) __field( ext4_lblk_t, pc_lblk ) __field( int, pc_state) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->from = from; __entry->to = to; __entry->ee_pblk = ext4_ext_pblock(ex); __entry->ee_lblk = le32_to_cpu(ex->ee_block); __entry->ee_len = ext4_ext_get_actual_len(ex); __entry->pc_pclu = pc->pclu; __entry->pc_lblk = pc->lblk; __entry->pc_state = pc->state; ), TP_printk("dev %d,%d ino %llu extent [%u(%llu), %u]" "from %u to %u partial [pclu %lld lblk %u state %d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, (unsigned) __entry->ee_lblk, (unsigned long long) __entry->ee_pblk, (unsigned short) __entry->ee_len, (unsigned) __entry->from, (unsigned) __entry->to, (long long) __entry->pc_pclu, (unsigned int) __entry->pc_lblk, (int) __entry->pc_state) ); TRACE_EVENT(ext4_ext_rm_leaf, TP_PROTO(struct inode *inode, ext4_lblk_t start, struct ext4_extent *ex, struct partial_cluster *pc), TP_ARGS(inode, start, ex, pc), TP_STRUCT__entry( __field( u64, ino ) __field( ext4_fsblk_t, ee_pblk ) __field( ext4_fsblk_t, pc_pclu ) __field( dev_t, dev ) __field( ext4_lblk_t, start ) __field( ext4_lblk_t, ee_lblk ) __field( short, ee_len ) __field( ext4_lblk_t, pc_lblk ) __field( int, pc_state) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->start = start; __entry->ee_lblk = le32_to_cpu(ex->ee_block); __entry->ee_pblk = ext4_ext_pblock(ex); __entry->ee_len = ext4_ext_get_actual_len(ex); __entry->pc_pclu = pc->pclu; __entry->pc_lblk = pc->lblk; __entry->pc_state = pc->state; ), TP_printk("dev %d,%d ino %llu start_lblk %u last_extent [%u(%llu), %u]" "partial [pclu %lld lblk %u state %d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, (unsigned) __entry->start, (unsigned) __entry->ee_lblk, (unsigned long long) __entry->ee_pblk, (unsigned short) __entry->ee_len, (long long) __entry->pc_pclu, (unsigned int) __entry->pc_lblk, (int) __entry->pc_state) ); TRACE_EVENT(ext4_ext_rm_idx, TP_PROTO(struct inode *inode, ext4_fsblk_t pblk), TP_ARGS(inode, pblk), TP_STRUCT__entry( __field( u64, ino ) __field( ext4_fsblk_t, pblk ) __field( dev_t, dev ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pblk = pblk; ), TP_printk("dev %d,%d ino %llu index_pblk %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, (unsigned long long) __entry->pblk) ); TRACE_EVENT(ext4_ext_remove_space, TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end, int depth), TP_ARGS(inode, start, end, depth), TP_STRUCT__entry( __field( u64, ino ) __field( dev_t, dev ) __field( ext4_lblk_t, start ) __field( ext4_lblk_t, end ) __field( int, depth ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->start = start; __entry->end = end; __entry->depth = depth; ), TP_printk("dev %d,%d ino %llu since %u end %u depth %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, (unsigned) __entry->start, 
(unsigned) __entry->end, __entry->depth) ); TRACE_EVENT(ext4_ext_remove_space_done, TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end, int depth, struct partial_cluster *pc, __le16 eh_entries), TP_ARGS(inode, start, end, depth, pc, eh_entries), TP_STRUCT__entry( __field( u64, ino ) __field( ext4_fsblk_t, pc_pclu ) __field( dev_t, dev ) __field( ext4_lblk_t, start ) __field( ext4_lblk_t, end ) __field( int, depth ) __field( ext4_lblk_t, pc_lblk ) __field( int, pc_state ) __field( unsigned short, eh_entries ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->start = start; __entry->end = end; __entry->depth = depth; __entry->pc_pclu = pc->pclu; __entry->pc_lblk = pc->lblk; __entry->pc_state = pc->state; __entry->eh_entries = le16_to_cpu(eh_entries); ), TP_printk("dev %d,%d ino %llu since %u end %u depth %d " "partial [pclu %lld lblk %u state %d] " "remaining_entries %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, (unsigned) __entry->start, (unsigned) __entry->end, __entry->depth, (long long) __entry->pc_pclu, (unsigned int) __entry->pc_lblk, (int) __entry->pc_state, (unsigned short) __entry->eh_entries) ); DECLARE_EVENT_CLASS(ext4__es_extent, TP_PROTO(struct inode *inode, struct extent_status *es), TP_ARGS(inode, es), TP_STRUCT__entry( __field( u64, ino ) __field( u64, seq ) __field( ext4_fsblk_t, pblk ) __field( dev_t, dev ) __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) __field( char, status ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = es->es_lblk; __entry->len = es->es_len; __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); __entry->seq = EXT4_I(inode)->i_es_seq; ), TP_printk("dev %d,%d ino %llu es [%u/%u) mapped %llu status %s seq %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->lblk, __entry->len, __entry->pblk, show_extent_status(__entry->status), __entry->seq) ); DEFINE_EVENT(ext4__es_extent, ext4_es_insert_extent, TP_PROTO(struct inode *inode, struct extent_status *es), TP_ARGS(inode, es) ); DEFINE_EVENT(ext4__es_extent, ext4_es_cache_extent, TP_PROTO(struct inode *inode, struct extent_status *es), TP_ARGS(inode, es) ); TRACE_EVENT(ext4_es_remove_extent, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len), TP_ARGS(inode, lblk, len), TP_STRUCT__entry( __field( u64, ino ) __field( loff_t, lblk ) __field( loff_t, len ) __field( u64, seq ) __field( dev_t, dev ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = lblk; __entry->len = len; __entry->seq = EXT4_I(inode)->i_es_seq; ), TP_printk("dev %d,%d ino %llu es [%lld/%lld) seq %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->lblk, __entry->len, __entry->seq) ); TRACE_EVENT(ext4_es_find_extent_range_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk), TP_ARGS(inode, lblk), TP_STRUCT__entry( __field( u64, ino ) __field( dev_t, dev ) __field( ext4_lblk_t, lblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = lblk; ), TP_printk("dev %d,%d ino %llu lblk %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->lblk) ); TRACE_EVENT(ext4_es_find_extent_range_exit, TP_PROTO(struct inode *inode, struct extent_status *es), TP_ARGS(inode, es), TP_STRUCT__entry( __field( u64, ino ) __field( ext4_fsblk_t, pblk ) __field( dev_t, dev ) __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) __field( char, status ) ), 
TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = es->es_lblk; __entry->len = es->es_len; __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); ), TP_printk("dev %d,%d ino %llu es [%u/%u) mapped %llu status %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->lblk, __entry->len, __entry->pblk, show_extent_status(__entry->status)) ); TRACE_EVENT(ext4_es_lookup_extent_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk), TP_ARGS(inode, lblk), TP_STRUCT__entry( __field( u64, ino ) __field( dev_t, dev ) __field( ext4_lblk_t, lblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = lblk; ), TP_printk("dev %d,%d ino %llu lblk %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->lblk) ); TRACE_EVENT(ext4_es_lookup_extent_exit, TP_PROTO(struct inode *inode, struct extent_status *es, int found), TP_ARGS(inode, es, found), TP_STRUCT__entry( __field( u64, ino ) __field( ext4_fsblk_t, pblk ) __field( dev_t, dev ) __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) __field( int, found ) __field( char, status ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = es->es_lblk; __entry->len = es->es_len; __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); __entry->found = found; ), TP_printk("dev %d,%d ino %llu found %d [%u/%u) %llu %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->found, __entry->lblk, __entry->len, __entry->found ? __entry->pblk : 0, show_extent_status(__entry->found ? __entry->status : 0)) ); DECLARE_EVENT_CLASS(ext4__es_shrink_enter, TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt), TP_ARGS(sb, nr_to_scan, cache_cnt), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, nr_to_scan ) __field( int, cache_cnt ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->nr_to_scan = nr_to_scan; __entry->cache_cnt = cache_cnt; ), TP_printk("dev %d,%d nr_to_scan %d cache_cnt %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_to_scan, __entry->cache_cnt) ); DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_count, TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt), TP_ARGS(sb, nr_to_scan, cache_cnt) ); DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_scan_enter, TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt), TP_ARGS(sb, nr_to_scan, cache_cnt) ); TRACE_EVENT(ext4_es_shrink_scan_exit, TP_PROTO(struct super_block *sb, int nr_shrunk, int cache_cnt), TP_ARGS(sb, nr_shrunk, cache_cnt), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, nr_shrunk ) __field( int, cache_cnt ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->nr_shrunk = nr_shrunk; __entry->cache_cnt = cache_cnt; ), TP_printk("dev %d,%d nr_shrunk %d cache_cnt %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_shrunk, __entry->cache_cnt) ); TRACE_EVENT(ext4_collapse_range, TP_PROTO(struct inode *inode, loff_t offset, loff_t len), TP_ARGS(inode, offset, len), TP_STRUCT__entry( __field(u64, ino) __field(loff_t, offset) __field(loff_t, len) __field(dev_t, dev) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->offset = offset; __entry->len = len; ), TP_printk("dev %d,%d ino %llu offset %lld len %lld", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->offset, __entry->len) ); TRACE_EVENT(ext4_insert_range, TP_PROTO(struct inode *inode, loff_t offset, loff_t len), 
TP_ARGS(inode, offset, len), TP_STRUCT__entry( __field(u64, ino) __field(loff_t, offset) __field(loff_t, len) __field(dev_t, dev) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->offset = offset; __entry->len = len; ), TP_printk("dev %d,%d ino %llu offset %lld len %lld", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->offset, __entry->len) ); TRACE_EVENT(ext4_es_shrink, TP_PROTO(struct super_block *sb, int nr_shrunk, u64 scan_time, int nr_skipped, int retried), TP_ARGS(sb, nr_shrunk, scan_time, nr_skipped, retried), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, nr_shrunk ) __field( unsigned long long, scan_time ) __field( int, nr_skipped ) __field( int, retried ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->nr_shrunk = nr_shrunk; __entry->scan_time = div_u64(scan_time, 1000); __entry->nr_skipped = nr_skipped; __entry->retried = retried; ), TP_printk("dev %d,%d nr_shrunk %d, scan_time %llu " "nr_skipped %d retried %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_shrunk, __entry->scan_time, __entry->nr_skipped, __entry->retried) ); TRACE_EVENT(ext4_es_insert_delayed_extent, TP_PROTO(struct inode *inode, struct extent_status *es, bool lclu_allocated, bool end_allocated), TP_ARGS(inode, es, lclu_allocated, end_allocated), TP_STRUCT__entry( __field( u64, ino ) __field( u64, seq ) __field( ext4_fsblk_t, pblk ) __field( dev_t, dev ) __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) __field( char, status ) __field( bool, lclu_allocated ) __field( bool, end_allocated ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = es->es_lblk; __entry->len = es->es_len; __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); __entry->lclu_allocated = lclu_allocated; __entry->end_allocated = end_allocated; __entry->seq = EXT4_I(inode)->i_es_seq; ), TP_printk("dev %d,%d ino %llu es [%u/%u) mapped %llu status %s allocated %d %d seq %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->lblk, __entry->len, __entry->pblk, show_extent_status(__entry->status), __entry->lclu_allocated, __entry->end_allocated, __entry->seq) ); /* fsmap traces */ DECLARE_EVENT_CLASS(ext4_fsmap_class, TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, u64 owner), TP_ARGS(sb, keydev, agno, bno, len, owner), TP_STRUCT__entry( __field(dev_t, dev) __field(dev_t, keydev) __field(u32, agno) __field(u64, bno) __field(u64, len) __field(u64, owner) ), TP_fast_assign( __entry->dev = sb->s_bdev->bd_dev; __entry->keydev = new_decode_dev(keydev); __entry->agno = agno; __entry->bno = bno; __entry->len = len; __entry->owner = owner; ), TP_printk("dev %d:%d keydev %d:%d agno %u bno %llu len %llu owner %lld\n", MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->keydev), MINOR(__entry->keydev), __entry->agno, __entry->bno, __entry->len, __entry->owner) ) #define DEFINE_FSMAP_EVENT(name) \ DEFINE_EVENT(ext4_fsmap_class, name, \ TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, \ u64 owner), \ TP_ARGS(sb, keydev, agno, bno, len, owner)) DEFINE_FSMAP_EVENT(ext4_fsmap_low_key); DEFINE_FSMAP_EVENT(ext4_fsmap_high_key); DEFINE_FSMAP_EVENT(ext4_fsmap_mapping); DECLARE_EVENT_CLASS(ext4_getfsmap_class, TP_PROTO(struct super_block *sb, struct ext4_fsmap *fsmap), TP_ARGS(sb, fsmap), TP_STRUCT__entry( __field(dev_t, dev) __field(dev_t, keydev) __field(u64, block) __field(u64, len) __field(u64, owner) __field(u64, flags) ), TP_fast_assign( 
__entry->dev = sb->s_bdev->bd_dev; __entry->keydev = new_decode_dev(fsmap->fmr_device); __entry->block = fsmap->fmr_physical; __entry->len = fsmap->fmr_length; __entry->owner = fsmap->fmr_owner; __entry->flags = fsmap->fmr_flags; ), TP_printk("dev %d:%d keydev %d:%d block %llu len %llu owner %lld flags 0x%llx\n", MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->keydev), MINOR(__entry->keydev), __entry->block, __entry->len, __entry->owner, __entry->flags) ) #define DEFINE_GETFSMAP_EVENT(name) \ DEFINE_EVENT(ext4_getfsmap_class, name, \ TP_PROTO(struct super_block *sb, struct ext4_fsmap *fsmap), \ TP_ARGS(sb, fsmap)) DEFINE_GETFSMAP_EVENT(ext4_getfsmap_low_key); DEFINE_GETFSMAP_EVENT(ext4_getfsmap_high_key); DEFINE_GETFSMAP_EVENT(ext4_getfsmap_mapping); TRACE_EVENT(ext4_shutdown, TP_PROTO(struct super_block *sb, unsigned long flags), TP_ARGS(sb, flags), TP_STRUCT__entry( __field( dev_t, dev ) __field( unsigned, flags ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->flags = flags; ), TP_printk("dev %d,%d flags %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->flags) ); TRACE_EVENT(ext4_error, TP_PROTO(struct super_block *sb, const char *function, unsigned int line), TP_ARGS(sb, function, line), TP_STRUCT__entry( __field( dev_t, dev ) __field( const char *, function ) __field( unsigned, line ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->function = function; __entry->line = line; ), TP_printk("dev %d,%d function %s line %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->function, __entry->line) ); TRACE_EVENT(ext4_prefetch_bitmaps, TP_PROTO(struct super_block *sb, ext4_group_t group, ext4_group_t next, unsigned int prefetch_ios), TP_ARGS(sb, group, next, prefetch_ios), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u32, group ) __field( __u32, next ) __field( __u32, ios ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->group = group; __entry->next = next; __entry->ios = prefetch_ios; ), TP_printk("dev %d,%d group %u next %u ios %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group, __entry->next, __entry->ios) ); TRACE_EVENT(ext4_lazy_itable_init, TP_PROTO(struct super_block *sb, ext4_group_t group), TP_ARGS(sb, group), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u32, group ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->group = group; ), TP_printk("dev %d,%d group %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group) ); TRACE_EVENT(ext4_fc_replay_scan, TP_PROTO(struct super_block *sb, int error, int off), TP_ARGS(sb, error, off), TP_STRUCT__entry( __field(dev_t, dev) __field(int, error) __field(int, off) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->error = error; __entry->off = off; ), TP_printk("dev %d,%d error %d, off %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->error, __entry->off) ); TRACE_EVENT(ext4_fc_replay, TP_PROTO(struct super_block *sb, int tag, int ino, int priv1, int priv2), TP_ARGS(sb, tag, ino, priv1, priv2), TP_STRUCT__entry( __field(dev_t, dev) __field(int, tag) __field(int, ino) __field(int, priv1) __field(int, priv2) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->tag = tag; __entry->ino = ino; __entry->priv1 = priv1; __entry->priv2 = priv2; ), TP_printk("dev %d,%d: tag %d, ino %d, data1 %d, data2 %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tag, __entry->ino, __entry->priv1, __entry->priv2) ); TRACE_EVENT(ext4_fc_commit_start, TP_PROTO(struct super_block *sb, tid_t commit_tid), TP_ARGS(sb, commit_tid), TP_STRUCT__entry( __field(dev_t, dev) __field(tid_t, tid) 
), TP_fast_assign( __entry->dev = sb->s_dev; __entry->tid = commit_tid; ), TP_printk("dev %d,%d tid %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid) ); TRACE_EVENT(ext4_fc_commit_stop, TP_PROTO(struct super_block *sb, int nblks, int reason, tid_t commit_tid), TP_ARGS(sb, nblks, reason, commit_tid), TP_STRUCT__entry( __field(dev_t, dev) __field(int, nblks) __field(int, reason) __field(int, num_fc) __field(int, num_fc_ineligible) __field(int, nblks_agg) __field(tid_t, tid) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->nblks = nblks; __entry->reason = reason; __entry->num_fc = EXT4_SB(sb)->s_fc_stats.fc_num_commits; __entry->num_fc_ineligible = EXT4_SB(sb)->s_fc_stats.fc_ineligible_commits; __entry->nblks_agg = EXT4_SB(sb)->s_fc_stats.fc_numblks; __entry->tid = commit_tid; ), TP_printk("dev %d,%d nblks %d, reason %d, fc = %d, ineligible = %d, agg_nblks %d, tid %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nblks, __entry->reason, __entry->num_fc, __entry->num_fc_ineligible, __entry->nblks_agg, __entry->tid) ); #define FC_REASON_NAME_STAT(reason) \ show_fc_reason(reason), \ __entry->fc_ineligible_rc[reason] TRACE_EVENT(ext4_fc_stats, TP_PROTO(struct super_block *sb), TP_ARGS(sb), TP_STRUCT__entry( __field(dev_t, dev) __array(unsigned int, fc_ineligible_rc, EXT4_FC_REASON_MAX) __field(unsigned long, fc_commits) __field(unsigned long, fc_ineligible_commits) __field(unsigned long, fc_numblks) ), TP_fast_assign( int i; __entry->dev = sb->s_dev; for (i = 0; i < EXT4_FC_REASON_MAX; i++) { __entry->fc_ineligible_rc[i] = EXT4_SB(sb)->s_fc_stats.fc_ineligible_reason_count[i]; } __entry->fc_commits = EXT4_SB(sb)->s_fc_stats.fc_num_commits; __entry->fc_ineligible_commits = EXT4_SB(sb)->s_fc_stats.fc_ineligible_commits; __entry->fc_numblks = EXT4_SB(sb)->s_fc_stats.fc_numblks; ), TP_printk("dev %d,%d fc ineligible reasons:\n" "%s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u" "num_commits:%lu, ineligible: %lu, numblks: %lu", MAJOR(__entry->dev), MINOR(__entry->dev), FC_REASON_NAME_STAT(EXT4_FC_REASON_XATTR), FC_REASON_NAME_STAT(EXT4_FC_REASON_CROSS_RENAME), FC_REASON_NAME_STAT(EXT4_FC_REASON_JOURNAL_FLAG_CHANGE), FC_REASON_NAME_STAT(EXT4_FC_REASON_NOMEM), FC_REASON_NAME_STAT(EXT4_FC_REASON_SWAP_BOOT), FC_REASON_NAME_STAT(EXT4_FC_REASON_RESIZE), FC_REASON_NAME_STAT(EXT4_FC_REASON_RENAME_DIR), FC_REASON_NAME_STAT(EXT4_FC_REASON_FALLOC_RANGE), FC_REASON_NAME_STAT(EXT4_FC_REASON_INODE_JOURNAL_DATA), FC_REASON_NAME_STAT(EXT4_FC_REASON_ENCRYPTED_FILENAME), __entry->fc_commits, __entry->fc_ineligible_commits, __entry->fc_numblks) ); DECLARE_EVENT_CLASS(ext4_fc_track_dentry, TP_PROTO(handle_t *handle, struct inode *inode, struct dentry *dentry, int ret), TP_ARGS(handle, inode, dentry, ret), TP_STRUCT__entry( __field(u64, i_ino) __field(dev_t, dev) __field(tid_t, t_tid) __field(tid_t, i_sync_tid) __field(int, error) ), TP_fast_assign( struct ext4_inode_info *ei = EXT4_I(inode); __entry->dev = inode->i_sb->s_dev; __entry->t_tid = handle->h_transaction->t_tid; __entry->i_ino = inode->i_ino; __entry->i_sync_tid = ei->i_sync_tid; __entry->error = ret; ), TP_printk("dev %d,%d, t_tid %u, ino %llu, i_sync_tid %u, error %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->t_tid, __entry->i_ino, __entry->i_sync_tid, __entry->error ) ); #define DEFINE_EVENT_CLASS_DENTRY(__type) \ DEFINE_EVENT(ext4_fc_track_dentry, ext4_fc_track_##__type, \ TP_PROTO(handle_t *handle, struct inode *inode, \ struct dentry *dentry, int ret), \ TP_ARGS(handle, inode, dentry, ret) \ ) 
DEFINE_EVENT_CLASS_DENTRY(create); DEFINE_EVENT_CLASS_DENTRY(link); DEFINE_EVENT_CLASS_DENTRY(unlink); TRACE_EVENT(ext4_fc_track_inode, TP_PROTO(handle_t *handle, struct inode *inode, int ret), TP_ARGS(handle, inode, ret), TP_STRUCT__entry( __field(u64, i_ino) __field(dev_t, dev) __field(tid_t, t_tid) __field(tid_t, i_sync_tid) __field(int, error) ), TP_fast_assign( struct ext4_inode_info *ei = EXT4_I(inode); __entry->dev = inode->i_sb->s_dev; __entry->t_tid = handle->h_transaction->t_tid; __entry->i_ino = inode->i_ino; __entry->i_sync_tid = ei->i_sync_tid; __entry->error = ret; ), TP_printk("dev %d:%d, t_tid %u, inode %llu, i_sync_tid %u, error %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->t_tid, __entry->i_ino, __entry->i_sync_tid, __entry->error) ); TRACE_EVENT(ext4_fc_track_range, TP_PROTO(handle_t *handle, struct inode *inode, long start, long end, int ret), TP_ARGS(handle, inode, start, end, ret), TP_STRUCT__entry( __field(u64, i_ino) __field(long, start) __field(long, end) __field(dev_t, dev) __field(tid_t, t_tid) __field(tid_t, i_sync_tid) __field(int, error) ), TP_fast_assign( struct ext4_inode_info *ei = EXT4_I(inode); __entry->dev = inode->i_sb->s_dev; __entry->t_tid = handle->h_transaction->t_tid; __entry->i_ino = inode->i_ino; __entry->i_sync_tid = ei->i_sync_tid; __entry->start = start; __entry->end = end; __entry->error = ret; ), TP_printk("dev %d:%d, t_tid %u, inode %llu, i_sync_tid %u, error %d, start %ld, end %ld", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->t_tid, __entry->i_ino, __entry->i_sync_tid, __entry->error, __entry->start, __entry->end) ); TRACE_EVENT(ext4_fc_cleanup, TP_PROTO(journal_t *journal, int full, tid_t tid), TP_ARGS(journal, full, tid), TP_STRUCT__entry( __field(dev_t, dev) __field(int, j_fc_off) __field(int, full) __field(tid_t, tid) ), TP_fast_assign( struct super_block *sb = journal->j_private; __entry->dev = sb->s_dev; __entry->j_fc_off = journal->j_fc_off; __entry->full = full; __entry->tid = tid; ), TP_printk("dev %d,%d, j_fc_off %d, full %d, tid %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->j_fc_off, __entry->full, __entry->tid) ); TRACE_EVENT(ext4_update_sb, TP_PROTO(struct super_block *sb, ext4_fsblk_t fsblk, unsigned int flags), TP_ARGS(sb, fsblk, flags), TP_STRUCT__entry( __field(dev_t, dev) __field(ext4_fsblk_t, fsblk) __field(unsigned int, flags) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->fsblk = fsblk; __entry->flags = flags; ), TP_printk("dev %d,%d fsblk %llu flags %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->fsblk, __entry->flags) ); TRACE_EVENT(ext4_move_extent_enter, TP_PROTO(struct inode *orig_inode, struct ext4_map_blocks *orig_map, struct inode *donor_inode, ext4_lblk_t donor_lblk), TP_ARGS(orig_inode, orig_map, donor_inode, donor_lblk), TP_STRUCT__entry( __field(u64, orig_ino) __field(u64, donor_ino) __field(dev_t, dev) __field(ext4_lblk_t, orig_lblk) __field(unsigned int, orig_flags) __field(ext4_lblk_t, donor_lblk) __field(unsigned int, len) ), TP_fast_assign( __entry->dev = orig_inode->i_sb->s_dev; __entry->orig_ino = orig_inode->i_ino; __entry->orig_lblk = orig_map->m_lblk; __entry->orig_flags = orig_map->m_flags; __entry->donor_ino = donor_inode->i_ino; __entry->donor_lblk = donor_lblk; __entry->len = orig_map->m_len; ), TP_printk("dev %d,%d origin ino %llu lblk %u flags %s donor ino %llu lblk %u len %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->orig_ino, __entry->orig_lblk, show_mflags(__entry->orig_flags), __entry->donor_ino, __entry->donor_lblk, __entry->len) ); 
TRACE_EVENT(ext4_move_extent_exit, TP_PROTO(struct inode *orig_inode, ext4_lblk_t orig_lblk, struct inode *donor_inode, ext4_lblk_t donor_lblk, unsigned int m_len, u64 move_len, int move_type, int ret), TP_ARGS(orig_inode, orig_lblk, donor_inode, donor_lblk, m_len, move_len, move_type, ret), TP_STRUCT__entry( __field(u64, orig_ino) __field(u64, donor_ino) __field(u64, move_len) __field(dev_t, dev) __field(ext4_lblk_t, orig_lblk) __field(ext4_lblk_t, donor_lblk) __field(unsigned int, m_len) __field(int, move_type) __field(int, ret) ), TP_fast_assign( __entry->dev = orig_inode->i_sb->s_dev; __entry->orig_ino = orig_inode->i_ino; __entry->orig_lblk = orig_lblk; __entry->donor_ino = donor_inode->i_ino; __entry->donor_lblk = donor_lblk; __entry->m_len = m_len; __entry->move_len = move_len; __entry->move_type = move_type; __entry->ret = ret; ), TP_printk("dev %d,%d origin ino %llu lblk %u donor ino %llu lblk %u m_len %u, move_len %llu type %d ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->orig_ino, __entry->orig_lblk, __entry->donor_ino, __entry->donor_lblk, __entry->m_len, __entry->move_len, __entry->move_type, __entry->ret) ); #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_CONNTRACK_COMMON_H #define _NF_CONNTRACK_COMMON_H #include <linux/refcount.h> #include <uapi/linux/netfilter/nf_conntrack_common.h> struct ip_conntrack_stat { unsigned int found; unsigned int invalid; unsigned int insert; unsigned int insert_failed; unsigned int clash_resolve; unsigned int drop; unsigned int early_drop; unsigned int error; unsigned int expect_new; unsigned int expect_create; unsigned int expect_delete; unsigned int search_restart; unsigned int chaintoolong; }; #define NFCT_INFOMASK 7UL #define NFCT_PTRMASK ~(NFCT_INFOMASK) struct nf_conntrack { refcount_t use; }; void nf_conntrack_destroy(struct nf_conntrack *nfct); /* like nf_ct_put, but without module dependency on nf_conntrack */ static inline void nf_conntrack_put(struct nf_conntrack *nfct) { if (nfct && refcount_dec_and_test(&nfct->use)) nf_conntrack_destroy(nfct); } static inline void nf_conntrack_get(struct nf_conntrack *nfct) { if (nfct) refcount_inc(&nfct->use); } #endif /* _NF_CONNTRACK_COMMON_H */
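The two inline helpers above are the whole lifetime API at this layer. A minimal usage sketch follows; the holder structure and function names are hypothetical, only nf_conntrack_get()/nf_conntrack_put() come from the header above.

/* Hypothetical example: share one conntrack reference between two owners. */
struct my_ct_holder {
	struct nf_conntrack *ct;
};

/* Take an extra reference before publishing the pointer (NULL-safe). */
static void my_ct_holder_adopt(struct my_ct_holder *h, struct nf_conntrack *ct)
{
	nf_conntrack_get(ct);
	h->ct = ct;
}

/* Drop the reference; the final put ends in nf_conntrack_destroy(). */
static void my_ct_holder_release(struct my_ct_holder *h)
{
	nf_conntrack_put(h->ct);
	h->ct = NULL;
}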
/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM nilfs2 #if !defined(_TRACE_NILFS2_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_NILFS2_H #include <linux/tracepoint.h> struct nilfs_sc_info; #define show_collection_stage(type) \ __print_symbolic(type, \ { NILFS_ST_INIT, "ST_INIT" }, \ { NILFS_ST_GC, "ST_GC" }, \ { NILFS_ST_FILE, "ST_FILE" }, \ { NILFS_ST_IFILE, "ST_IFILE" }, \ { NILFS_ST_CPFILE, "ST_CPFILE" }, \ { NILFS_ST_SUFILE, "ST_SUFILE" }, \ { NILFS_ST_DAT, "ST_DAT" }, \ { NILFS_ST_SR, "ST_SR" }, \ { NILFS_ST_DSYNC, "ST_DSYNC" }, \ { NILFS_ST_DONE, "ST_DONE"}) TRACE_EVENT(nilfs2_collection_stage_transition, TP_PROTO(struct nilfs_sc_info *sci), TP_ARGS(sci), TP_STRUCT__entry( __field(void *, sci) __field(int, stage) ), TP_fast_assign( __entry->sci = sci; __entry->stage = sci->sc_stage.scnt; ), TP_printk("sci = %p stage = %s", __entry->sci, show_collection_stage(__entry->stage)) ); #ifndef TRACE_HEADER_MULTI_READ enum nilfs2_transaction_transition_state { TRACE_NILFS2_TRANSACTION_BEGIN, TRACE_NILFS2_TRANSACTION_COMMIT, TRACE_NILFS2_TRANSACTION_ABORT, TRACE_NILFS2_TRANSACTION_TRYLOCK, TRACE_NILFS2_TRANSACTION_LOCK, TRACE_NILFS2_TRANSACTION_UNLOCK, }; #endif #define show_transaction_state(type) \ __print_symbolic(type, \ { TRACE_NILFS2_TRANSACTION_BEGIN, "BEGIN" }, \ { TRACE_NILFS2_TRANSACTION_COMMIT, "COMMIT" }, \ { TRACE_NILFS2_TRANSACTION_ABORT, "ABORT" }, \ { TRACE_NILFS2_TRANSACTION_TRYLOCK, "TRYLOCK" }, \ { TRACE_NILFS2_TRANSACTION_LOCK, "LOCK" }, \ { TRACE_NILFS2_TRANSACTION_UNLOCK, "UNLOCK" }) TRACE_EVENT(nilfs2_transaction_transition, TP_PROTO(struct super_block *sb, struct nilfs_transaction_info *ti, int count, unsigned int flags, enum nilfs2_transaction_transition_state state), TP_ARGS(sb, ti, count, flags, state), TP_STRUCT__entry( __field(void *, sb) __field(void *, ti) __field(int, count) __field(unsigned int, flags) __field(int, state) ), TP_fast_assign( __entry->sb = sb; __entry->ti = ti; __entry->count = count; __entry->flags = flags; __entry->state = state; ), TP_printk("sb = %p ti = %p count = %d flags = %x state = %s", __entry->sb, __entry->ti, __entry->count, __entry->flags, show_transaction_state(__entry->state)) ); TRACE_EVENT(nilfs2_segment_usage_check, TP_PROTO(struct inode *sufile, __u64 segnum, unsigned long cnt), TP_ARGS(sufile, segnum, cnt), TP_STRUCT__entry( __field(struct inode *, sufile) __field(__u64, segnum) __field(unsigned long, cnt) ), TP_fast_assign( __entry->sufile = sufile; __entry->segnum = segnum; __entry->cnt = cnt; ), TP_printk("sufile = %p segnum = %llu cnt = %lu", __entry->sufile, __entry->segnum, __entry->cnt) ); TRACE_EVENT(nilfs2_segment_usage_allocated, TP_PROTO(struct inode *sufile, __u64 segnum),
TP_ARGS(sufile, segnum), TP_STRUCT__entry( __field(struct inode *, sufile) __field(__u64, segnum) ), TP_fast_assign( __entry->sufile = sufile; __entry->segnum = segnum; ), TP_printk("sufile = %p segnum = %llu", __entry->sufile, __entry->segnum) ); TRACE_EVENT(nilfs2_segment_usage_freed, TP_PROTO(struct inode *sufile, __u64 segnum), TP_ARGS(sufile, segnum), TP_STRUCT__entry( __field(struct inode *, sufile) __field(__u64, segnum) ), TP_fast_assign( __entry->sufile = sufile; __entry->segnum = segnum; ), TP_printk("sufile = %p segnum = %llu", __entry->sufile, __entry->segnum) ); TRACE_EVENT(nilfs2_mdt_insert_new_block, TP_PROTO(struct inode *inode, u64 ino, unsigned long block), TP_ARGS(inode, ino, block), TP_STRUCT__entry( __field(u64, ino) __field(struct inode *, inode) __field(unsigned long, block) ), TP_fast_assign( __entry->inode = inode; __entry->ino = ino; __entry->block = block; ), TP_printk("inode = %p ino = %llu block = %lu", __entry->inode, __entry->ino, __entry->block) ); TRACE_EVENT(nilfs2_mdt_submit_block, TP_PROTO(struct inode *inode, u64 ino, unsigned long blkoff, enum req_op mode), TP_ARGS(inode, ino, blkoff, mode), TP_STRUCT__entry( __field(u64, ino) __field(struct inode *, inode) __field(unsigned long, blkoff) /* * Use field_struct() to avoid is_signed_type() on the * bitwise type enum req_op. */ __field_struct(enum req_op, mode) ), TP_fast_assign( __entry->inode = inode; __entry->ino = ino; __entry->blkoff = blkoff; __entry->mode = mode; ), TP_printk("inode = %p ino = %llu blkoff = %lu mode = %x", __entry->inode, __entry->ino, __entry->blkoff, __entry->mode) ); #endif /* _TRACE_NILFS2_H */ /* This part must be outside protection */ #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE nilfs2 #include <trace/define_trace.h> |
// SPDX-License-Identifier: GPL-2.0-only /* Page fragment allocator * * Page Fragment: * An arbitrary-length arbitrary-offset area of memory which resides within a * 0 or higher order page. Multiple fragments within that page are * individually refcounted, in the page's reference counter. * * The page_frag functions provide a simple allocation framework for page * fragments. This is used by the network stack and network device drivers to * provide a backing region of memory for use as either an sk_buff->head, or to * be used in the "frags" portion of skb_shared_info. */ #include <linux/build_bug.h> #include <linux/export.h> #include <linux/gfp_types.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/page_frag_cache.h> #include "internal.h" static unsigned long encoded_page_create(struct page *page, unsigned int order, bool pfmemalloc) { BUILD_BUG_ON(PAGE_FRAG_CACHE_MAX_ORDER > PAGE_FRAG_CACHE_ORDER_MASK); BUILD_BUG_ON(PAGE_FRAG_CACHE_PFMEMALLOC_BIT >= PAGE_SIZE); return (unsigned long)page_address(page) | (order & PAGE_FRAG_CACHE_ORDER_MASK) | ((unsigned long)pfmemalloc * PAGE_FRAG_CACHE_PFMEMALLOC_BIT); } static unsigned long encoded_page_decode_order(unsigned long encoded_page) { return encoded_page & PAGE_FRAG_CACHE_ORDER_MASK; } static void *encoded_page_decode_virt(unsigned long encoded_page) { return (void *)(encoded_page & PAGE_MASK); } static struct page *encoded_page_decode_page(unsigned long encoded_page) { return virt_to_page((void *)encoded_page); } static struct page *__page_frag_cache_refill(struct page_frag_cache *nc, gfp_t gfp_mask) { unsigned long order = PAGE_FRAG_CACHE_MAX_ORDER; struct page *page = NULL; gfp_t gfp = gfp_mask; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC; page = __alloc_pages(gfp_mask, PAGE_FRAG_CACHE_MAX_ORDER, numa_mem_id(), NULL); #endif if (unlikely(!page)) { page = __alloc_pages(gfp, 0, numa_mem_id(), NULL); order = 0; } nc->encoded_page = page ?
encoded_page_create(page, order, page_is_pfmemalloc(page)) : 0; return page; } void page_frag_cache_drain(struct page_frag_cache *nc) { if (!nc->encoded_page) return; __page_frag_cache_drain(encoded_page_decode_page(nc->encoded_page), nc->pagecnt_bias); nc->encoded_page = 0; } EXPORT_SYMBOL(page_frag_cache_drain); void __page_frag_cache_drain(struct page *page, unsigned int count) { VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); if (page_ref_sub_and_test(page, count)) free_frozen_pages(page, compound_order(page)); } EXPORT_SYMBOL(__page_frag_cache_drain); void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask, unsigned int align_mask) { unsigned long encoded_page = nc->encoded_page; unsigned int size, offset; struct page *page; if (unlikely(!encoded_page)) { refill: page = __page_frag_cache_refill(nc, gfp_mask); if (!page) return NULL; encoded_page = nc->encoded_page; /* Even if we own the page, we do not use atomic_set(). * This would break get_page_unless_zero() users. */ page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE); /* reset page count bias and offset to start of new frag */ nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; nc->offset = 0; } size = PAGE_SIZE << encoded_page_decode_order(encoded_page); offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask); if (unlikely(offset + fragsz > size)) { if (unlikely(fragsz > PAGE_SIZE)) { /* * The caller is trying to allocate a fragment * with fragsz > PAGE_SIZE but the cache isn't big * enough to satisfy the request, this may * happen in low memory conditions. * We don't release the cache page because * it could make memory pressure worse * so we simply return NULL here. */ return NULL; } page = encoded_page_decode_page(encoded_page); if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) goto refill; if (unlikely(encoded_page_decode_pfmemalloc(encoded_page))) { free_frozen_pages(page, encoded_page_decode_order(encoded_page)); goto refill; } /* OK, page count is 0, we can safely set it */ set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1); /* reset page count bias and offset to start of new frag */ nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; offset = 0; } nc->pagecnt_bias--; nc->offset = offset + fragsz; return encoded_page_decode_virt(encoded_page) + offset; } EXPORT_SYMBOL(__page_frag_alloc_align); /* * Frees a page fragment allocated out of either a compound or order 0 page. */ void page_frag_free(void *addr) { struct page *page = virt_to_head_page(addr); if (unlikely(put_page_testzero(page))) free_frozen_pages(page, compound_order(page)); } EXPORT_SYMBOL(page_frag_free); |
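A minimal usage sketch of the allocator above, assuming the page_frag_cache_init() and page_frag_alloc() inline wrappers from <linux/page_frag_cache.h> in this kernel version; the caller function and fragment size are illustrative only.

#include <linux/gfp.h>
#include <linux/page_frag_cache.h>

static int demo_frag_usage(void)
{
	struct page_frag_cache cache;
	void *buf;

	/* Assumed helper: zeroes the cache so the first alloc refills it. */
	page_frag_cache_init(&cache);

	/* Typical skb-head sized fragment; falls back to an order-0 page
	 * when the large compound page cannot be allocated.
	 */
	buf = page_frag_alloc(&cache, 2048, GFP_ATOMIC);
	if (!buf)
		return -ENOMEM;

	/* ... use buf ... */

	page_frag_free(buf);		/* drop this fragment's reference */
	page_frag_cache_drain(&cache);	/* return the cache's bias references */
	return 0;
}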
// SPDX-License-Identifier: GPL-2.0 /* * This file contains functions which manage high resolution tick * related events. * * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner <tglx@kernel.org> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> #include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> #include "tick-internal.h" /** * tick_program_event - program the CPU local timer device for the next event * @expires: the time at which the next timer event should occur * @force: flag to force reprogramming even if the event time hasn't changed * * Return: 0 on success, negative error code on failure */ int tick_program_event(ktime_t expires, int force) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); if (unlikely(expires == KTIME_MAX)) { /* * We don't need the clock event device any more, stop it. */ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED); dev->next_event = KTIME_MAX; return 0; } if (unlikely(clockevent_state_oneshot_stopped(dev))) { /* * We need the clock event again, configure it in ONESHOT mode * before using it. */ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); } return clockevents_program_event(dev, expires, force); } /** * tick_resume_oneshot - resume oneshot mode */ void tick_resume_oneshot(void) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); clockevents_program_event(dev, ktime_get(), true); } /** * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz) * @newdev: Pointer to the clock event device to configure * @handler: Function to be called when the event device triggers an interrupt * @next_event: Initial expiry time for the next event (in ktime) * * Configures the specified clock event device for oneshot mode, * assigns the given handler as its event callback, and programs * the device to trigger at the specified next event time. */ void tick_setup_oneshot(struct clock_event_device *newdev, void (*handler)(struct clock_event_device *), ktime_t next_event) { newdev->event_handler = handler; clockevents_switch_state(newdev, CLOCK_EVT_STATE_ONESHOT); clockevents_program_event(newdev, next_event, true); } /** * tick_switch_to_oneshot - switch to oneshot mode * @handler: function to call when an event occurs on the tick device * * Return: 0 on success, -EINVAL if the tick device is not present, * not functional, or does not support oneshot mode.
*/ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); struct clock_event_device *dev = td->evtdev; if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || !tick_device_is_functional(dev)) { pr_info("Clockevents: could not switch to one-shot mode:"); if (!dev) { pr_cont(" no tick device\n"); } else { if (!tick_device_is_functional(dev)) pr_cont(" %s is not functional.\n", dev->name); else pr_cont(" %s does not support one-shot mode.\n", dev->name); } return -EINVAL; } td->mode = TICKDEV_MODE_ONESHOT; dev->event_handler = handler; clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); tick_broadcast_switch_to_oneshot(); return 0; } /** * tick_oneshot_mode_active - check whether the system is in oneshot mode * * Return: 1 when either nohz or highres are enabled, otherwise 0. */ int tick_oneshot_mode_active(void) { unsigned long flags; int ret; local_irq_save(flags); ret = __this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT; local_irq_restore(flags); return ret; } #ifdef CONFIG_HIGH_RES_TIMERS /** * tick_init_highres - switch to high resolution mode * * Called with interrupts disabled. * * Return: 0 on success, -EINVAL if the tick device cannot switch * to oneshot/high-resolution mode. */ int tick_init_highres(void) { return tick_switch_to_oneshot(hrtimer_interrupt); } #endif |
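An illustrative sketch of the semantics documented for tick_program_event() above; the caller below is hypothetical, while real callers such as hrtimer_interrupt() run on the owning CPU with interrupts disabled.

static void demo_program_next_tick(ktime_t next_event)
{
	/*
	 * Program the per-CPU tick device for the given absolute expiry.
	 * Passing KTIME_MAX instead would switch the device to
	 * ONESHOT_STOPPED as described above; force = 0 lets the core
	 * skip reprogramming when the expiry time is unchanged.
	 */
	tick_program_event(next_event, 0);
}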
// SPDX-License-Identifier: GPL-2.0-only /* tnum: tracked (or tristate) numbers * * A tnum tracks knowledge about the bits of a value. Each bit can be either * known (0 or 1), or unknown (x). Arithmetic operations on tnums will * propagate the unknown bits such that the tnum result represents all the * possible results for possible values of the operands. */ #include <linux/kernel.h> #include <linux/tnum.h> #include <linux/swab.h> #define TNUM(_v, _m) (struct tnum){.value = _v, .mask = _m} /* A completely unknown value */ const struct tnum tnum_unknown = { .value = 0, .mask = -1 }; struct tnum tnum_const(u64 value) { return TNUM(value, 0); } struct tnum tnum_range(u64 min, u64 max) { u64 chi = min ^ max, delta; u8 bits = fls64(chi); /* special case, needed because 1ULL << 64 is undefined */ if (bits > 63) return tnum_unknown; /* e.g. if chi = 4, bits = 3, delta = (1<<3) - 1 = 7. * if chi = 0, bits = 0, delta = (1<<0) - 1 = 0, so we return * constant min (since min == max). */ delta = (1ULL << bits) - 1; return TNUM(min & ~delta, delta); } struct tnum tnum_lshift(struct tnum a, u8 shift) { return TNUM(a.value << shift, a.mask << shift); } struct tnum tnum_rshift(struct tnum a, u8 shift) { return TNUM(a.value >> shift, a.mask >> shift); } struct tnum tnum_arshift(struct tnum a, u8 min_shift, u8 insn_bitness) { /* if a.value is negative, arithmetic shifting by minimum shift * will have larger negative offset compared to more shifting. * If a.value is nonnegative, arithmetic shifting by minimum shift * will have larger positive offset compared to more shifting.
*/ if (insn_bitness == 32) return TNUM((u32)(((s32)a.value) >> min_shift), (u32)(((s32)a.mask) >> min_shift)); else return TNUM((s64)a.value >> min_shift, (s64)a.mask >> min_shift); } struct tnum tnum_add(struct tnum a, struct tnum b) { u64 sm, sv, sigma, chi, mu; sm = a.mask + b.mask; sv = a.value + b.value; sigma = sm + sv; chi = sigma ^ sv; mu = chi | a.mask | b.mask; return TNUM(sv & ~mu, mu); } struct tnum tnum_sub(struct tnum a, struct tnum b) { u64 dv, alpha, beta, chi, mu; dv = a.value - b.value; alpha = dv + a.mask; beta = dv - b.mask; chi = alpha ^ beta; mu = chi | a.mask | b.mask; return TNUM(dv & ~mu, mu); } struct tnum tnum_neg(struct tnum a) { return tnum_sub(TNUM(0, 0), a); } struct tnum tnum_and(struct tnum a, struct tnum b) { u64 alpha, beta, v; alpha = a.value | a.mask; beta = b.value | b.mask; v = a.value & b.value; return TNUM(v, alpha & beta & ~v); } struct tnum tnum_or(struct tnum a, struct tnum b) { u64 v, mu; v = a.value | b.value; mu = a.mask | b.mask; return TNUM(v, mu & ~v); } struct tnum tnum_xor(struct tnum a, struct tnum b) { u64 v, mu; v = a.value ^ b.value; mu = a.mask | b.mask; return TNUM(v & ~mu, mu); } /* Perform long multiplication, iterating through the bits in a using rshift: * - if LSB(a) is a known 0, keep current accumulator * - if LSB(a) is a known 1, add b to current accumulator * - if LSB(a) is unknown, take a union of the above cases. * * For example: * * acc_0: acc_1: * * 11 * -> 11 * -> 11 * -> union(0011, 1001) == x0x1 * x1 01 11 * ------ ------ ------ * 11 11 11 * xx 00 11 * ------ ------ ------ * ???? 0011 1001 */ struct tnum tnum_mul(struct tnum a, struct tnum b) { struct tnum acc = TNUM(0, 0); while (a.value || a.mask) { /* LSB of tnum a is a certain 1 */ if (a.value & 1) acc = tnum_add(acc, b); /* LSB of tnum a is uncertain */ else if (a.mask & 1) { /* acc = tnum_union(acc_0, acc_1), where acc_0 and * acc_1 are partial accumulators for cases * LSB(a) = certain 0 and LSB(a) = certain 1. * acc_0 = acc + 0 * b = acc. * acc_1 = acc + 1 * b = tnum_add(acc, b). */ acc = tnum_union(acc, tnum_add(acc, b)); } /* Note: no case for LSB is certain 0 */ a = tnum_rshift(a, 1); b = tnum_lshift(b, 1); } return acc; } bool tnum_overlap(struct tnum a, struct tnum b) { u64 mu; mu = ~a.mask & ~b.mask; return (a.value & mu) == (b.value & mu); } /* Note that if a and b disagree - i.e. one has a 'known 1' where the other has * a 'known 0' - this will return a 'known 1' for that bit. */ struct tnum tnum_intersect(struct tnum a, struct tnum b) { u64 v, mu; v = a.value | b.value; mu = a.mask & b.mask; return TNUM(v & ~mu, mu); } /* Returns a tnum with the uncertainty from both a and b, and in addition, new * uncertainty at any position that a and b disagree. This represents a * superset of the union of the concrete sets of both a and b. Despite the * overapproximation, it is optimal. 
*/ struct tnum tnum_union(struct tnum a, struct tnum b) { u64 v = a.value & b.value; u64 mu = (a.value ^ b.value) | a.mask | b.mask; return TNUM(v & ~mu, mu); } struct tnum tnum_cast(struct tnum a, u8 size) { a.value &= (1ULL << (size * 8)) - 1; a.mask &= (1ULL << (size * 8)) - 1; return a; } bool tnum_is_aligned(struct tnum a, u64 size) { if (!size) return true; return !((a.value | a.mask) & (size - 1)); } bool tnum_in(struct tnum a, struct tnum b) { if (b.mask & ~a.mask) return false; b.value &= ~a.mask; return a.value == b.value; } int tnum_sbin(char *str, size_t size, struct tnum a) { size_t n; for (n = 64; n; n--) { if (n < size) { if (a.mask & 1) str[n - 1] = 'x'; else if (a.value & 1) str[n - 1] = '1'; else str[n - 1] = '0'; } a.mask >>= 1; a.value >>= 1; } str[min(size - 1, (size_t)64)] = 0; return 64; } struct tnum tnum_subreg(struct tnum a) { return tnum_cast(a, 4); } struct tnum tnum_clear_subreg(struct tnum a) { return tnum_lshift(tnum_rshift(a, 32), 32); } struct tnum tnum_with_subreg(struct tnum reg, struct tnum subreg) { return tnum_or(tnum_clear_subreg(reg), tnum_subreg(subreg)); } struct tnum tnum_const_subreg(struct tnum a, u32 value) { return tnum_with_subreg(a, tnum_const(value)); } struct tnum tnum_bswap16(struct tnum a) { return TNUM(swab16(a.value & 0xFFFF), swab16(a.mask & 0xFFFF)); } struct tnum tnum_bswap32(struct tnum a) { return TNUM(swab32(a.value & 0xFFFFFFFF), swab32(a.mask & 0xFFFFFFFF)); } struct tnum tnum_bswap64(struct tnum a) { return TNUM(swab64(a.value), swab64(a.mask)); } /* Given tnum t, and a number z such that tmin <= z < tmax, where tmin * is the smallest member of the t (= t.value) and tmax is the largest * member of t (= t.value | t.mask), returns the smallest member of t * larger than z. * * For example, * t = x11100x0 * z = 11110001 (241) * result = 11110010 (242) * * Note: if this function is called with z >= tmax, it just returns * early with tmax; if this function is called with z < tmin, the * algorithm already returns tmin. */ u64 tnum_step(struct tnum t, u64 z) { u64 tmax, d, carry_mask, filled, inc; tmax = t.value | t.mask; /* if z >= largest member of t, return largest member of t */ if (z >= tmax) return tmax; /* if z < smallest member of t, return smallest member of t */ if (z < t.value) return t.value; /* * Let r be the result tnum member, z = t.value + d. * Every tnum member is t.value | s for some submask s of t.mask, * and since t.value & t.mask == 0, t.value | s == t.value + s. * So r > z becomes s > d where d = z - t.value. * * Find the smallest submask s of t.mask greater than d by * "incrementing d within the mask": fill every non-mask * position with 1 (`filled`) so +1 ripples through the gaps, * then keep only mask bits. `carry_mask` additionally fills * positions below the highest non-mask 1 in d, preventing * it from trapping the carry. */ d = z - t.value; carry_mask = (1ULL << fls64(d & ~t.mask)) - 1; filled = d | carry_mask | ~t.mask; inc = (filled + 1) & t.mask; return t.value | inc; } |
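A small worked example of the arithmetic described above; the helper below is illustrative and not part of tnum.c, but it only calls functions defined in this file.

/*
 * Combining a bounded range with a constant:
 *   tnum_range(0, 7)  -> value 0,  mask 0x7  ("00000xxx")
 *   tnum_const(16)    -> value 16, mask 0    ("00010000")
 *   tnum_add() result -> value 16, mask 0x7  ("00010xxx"),
 * i.e. the sum is known to lie in [16, 23], exactly the set of sums of
 * the operands' possible values.
 */
static struct tnum demo_tnum_add_range(void)
{
	struct tnum a = tnum_range(0, 7);	/* unknown low three bits */
	struct tnum b = tnum_const(16);		/* fully known constant */

	return tnum_add(a, b);			/* value = 16, mask = 0x7 */
}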
// SPDX-License-Identifier: GPL-2.0-only /* * rcuref - A scalable reference count implementation for RCU managed objects * * rcuref is provided to replace open coded reference count implementations * based on atomic_t. It protects explicitly RCU managed objects which can * be visible even after the last reference has been dropped and the object * is heading towards destruction. * * A common usage pattern is: * * get() * rcu_read_lock(); * p = get_ptr(); * if (p && !atomic_inc_not_zero(&p->refcnt)) * p = NULL; * rcu_read_unlock(); * return p; * * put() * if (!atomic_dec_return(&p->refcnt)) { * remove_ptr(p); * kfree_rcu(p, rcu); * } * * atomic_inc_not_zero() is implemented with a try_cmpxchg() loop which has * O(N^2) behaviour under contention with N concurrent operations. * * rcuref uses atomic_add_negative_relaxed() for the fast path, which scales * better under contention. * * Why not refcount? * ================= * * In principle it should be possible to make refcount use the rcuref * scheme, but the destruction race described below cannot be prevented * unless the protected object is RCU managed. * * Theory of operation * =================== * * rcuref uses an unsigned integer reference counter. As long as the * counter value is greater than or equal to RCUREF_ONEREF and not larger * than RCUREF_MAXREF the reference is alive: * * ONEREF MAXREF SATURATED RELEASED DEAD NOREF * 0 0x7FFFFFFF 0x80000000 0xA0000000 0xBFFFFFFF 0xC0000000 0xE0000000 0xFFFFFFFF * <---valid --------> <-------saturation zone-------> <-----dead zone-----> * * The get() and put() operations do unconditional increments and * decrements. The result is checked after the operation. This optimizes * for the fast path. * * If the reference count is saturated or dead, then the increments and * decrements are not harmful as the reference count still stays in the * respective zones and is always set back to SATURATED resp. DEAD. The * zones have room for 2^28 racing operations in each direction, which * makes it practically impossible to escape the zones. * * Once the last reference is dropped the reference count becomes * RCUREF_NOREF which forces rcuref_put() into the slowpath operation. The * slowpath then tries to set the reference count from RCUREF_NOREF to * RCUREF_DEAD via a cmpxchg(). This opens a small window where a * concurrent rcuref_get() can acquire the reference count and bring it * back to RCUREF_ONEREF or even drop the reference again and mark it DEAD.
* * If the cmpxchg() succeeds then a concurrent rcuref_get() will result in * DEAD + 1, which is inside the dead zone. If that happens the reference * count is put back to DEAD. * * The actual race is possible due to the unconditional increment and * decrements in rcuref_get() and rcuref_put(): * * T1 T2 * get() put() * if (atomic_add_negative(-1, &ref->refcnt)) * succeeds-> atomic_cmpxchg(&ref->refcnt, NOREF, DEAD); * * atomic_add_negative(1, &ref->refcnt); <- Elevates refcount to DEAD + 1 * * As the result of T1's add is negative, the get() goes into the slow path * and observes refcnt being in the dead zone which makes the operation fail. * * Possible critical states: * * Context Counter References Operation * T1 0 1 init() * T2 1 2 get() * T1 0 1 put() * T2 -1 0 put() tries to mark dead * T1 0 1 get() * T2 0 1 put() mark dead fails * T1 -1 0 put() tries to mark dead * T1 DEAD 0 put() mark dead succeeds * T2 DEAD+1 0 get() fails and puts it back to DEAD * * Of course there are more complex scenarios, but the above illustrates * the working principle. The rest is left to the imagination of the * reader. * * Deconstruction race * =================== * * The release operation must be protected by prohibiting a grace period in * order to prevent a possible use after free: * * T1 T2 * put() get() * // ref->refcnt = ONEREF * if (!atomic_add_negative(-1, &ref->refcnt)) * return false; <- Not taken * * // ref->refcnt == NOREF * --> preemption * // Elevates ref->refcnt to ONEREF * if (!atomic_add_negative(1, &ref->refcnt)) * return true; <- taken * * if (put(&p->ref)) { <-- Succeeds * remove_pointer(p); * kfree_rcu(p, rcu); * } * * RCU grace period ends, object is freed * * atomic_cmpxchg(&ref->refcnt, NOREF, DEAD); <- UAF * * This is prevented by disabling preemption around the put() operation as * that's in most kernel configurations cheaper than a rcu_read_lock() / * rcu_read_unlock() pair and in many cases even a NOOP. In any case it * prevents the grace period which keeps the object alive until all put() * operations complete. * * Saturation protection * ===================== * * The reference count has a saturation limit RCUREF_MAXREF (INT_MAX). * Once this is exceeded the reference count becomes stale by setting it * to RCUREF_SATURATED, which will cause a memory leak, but it prevents * wrap arounds which obviously cause worse problems than a memory * leak. When saturation is reached a warning is emitted. * * Race conditions * =============== * * All reference count increment/decrement operations are unconditional and * only verified after the fact. This optimizes for the good case and takes * the occasional race vs. a dead or already saturated refcount into * account. The saturation and dead zones are large enough to accommodate * for that. * * Memory ordering * =============== * * Memory ordering rules are slightly relaxed wrt regular atomic_t functions * and provide only what is strictly required for refcounts. * * The increments are fully relaxed; these will not provide ordering. The * rationale is that whatever is used to obtain the object to increase the * reference count on will provide the ordering. For locked data * structures, it's the lock acquire, for RCU/lockless data structures it's * the dependent load. * * rcuref_get() provides a control dependency ordering future stores which * ensures that the object is not modified when acquiring a reference * fails. * * rcuref_put() provides release order, i.e. all prior loads and stores * will be issued before.
It also provides a control dependency ordering * against the subsequent destruction of the object. * * If rcuref_put() successfully dropped the last reference and marked the * object DEAD it also provides acquire ordering. */ #include <linux/export.h> #include <linux/rcuref.h> /** * rcuref_get_slowpath - Slowpath of rcuref_get() * @ref: Pointer to the reference count * * Invoked when the reference count is outside of the valid zone. * * Return: * False if the reference count was already marked dead * * True if the reference count is saturated, which prevents the * object from being deconstructed ever. */ bool rcuref_get_slowpath(rcuref_t *ref) { unsigned int cnt = atomic_read(&ref->refcnt); /* * If the reference count was already marked dead, undo the * increment so it stays in the middle of the dead zone and return * fail. */ if (cnt >= RCUREF_RELEASED) { atomic_set(&ref->refcnt, RCUREF_DEAD); return false; } /* * If it was saturated, warn and mark it so. In case the increment * was already on a saturated value restore the saturation * marker. This keeps it in the middle of the saturation zone and * prevents the reference count from overflowing. This leaks the * object memory, but prevents the obvious reference count overflow * damage. */ if (WARN_ONCE(cnt > RCUREF_MAXREF, "rcuref saturated - leaking memory")) atomic_set(&ref->refcnt, RCUREF_SATURATED); return true; } EXPORT_SYMBOL_GPL(rcuref_get_slowpath); /** * rcuref_put_slowpath - Slowpath of __rcuref_put() * @ref: Pointer to the reference count * @cnt: The resulting value of the fastpath decrement * * Invoked when the reference count is outside of the valid zone. * * Return: * True if this was the last reference with no future references * possible. This signals the caller that it can safely schedule the * object, which is protected by the reference counter, for * deconstruction. * * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * deconstruct the protected object. */ bool rcuref_put_slowpath(rcuref_t *ref, unsigned int cnt) { /* Did this drop the last reference? */ if (likely(cnt == RCUREF_NOREF)) { /* * Carefully try to set the reference count to RCUREF_DEAD. * * This can fail if a concurrent get() operation has * elevated it again or the corresponding put() even marked * it dead already. Both are valid situations and do not * require a retry. If this fails the caller is not * allowed to deconstruct the object. */ if (!atomic_try_cmpxchg_release(&ref->refcnt, &cnt, RCUREF_DEAD)) return false; /* * The caller can safely schedule the object for * deconstruction. Provide acquire ordering. */ smp_acquire__after_ctrl_dep(); return true; } /* * If the reference count was already in the dead zone, then this * put() operation is imbalanced. Warn, put the reference count back to * DEAD and tell the caller to not deconstruct the object. */ if (WARN_ONCE(cnt >= RCUREF_RELEASED, "rcuref - imbalanced put()")) { atomic_set(&ref->refcnt, RCUREF_DEAD); return false; } /* * This is a put() operation on a saturated refcount. Restore the * mean saturation value and tell the caller to not deconstruct the * object. */ if (cnt > RCUREF_MAXREF) atomic_set(&ref->refcnt, RCUREF_SATURATED); return false; } EXPORT_SYMBOL_GPL(rcuref_put_slowpath); |
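A usage sketch mirroring the "common usage pattern" from the header comment above, using rcuref_init()/rcuref_get()/rcuref_put() from <linux/rcuref.h>; the demo_obj type and its global pointer are hypothetical.

#include <linux/rcupdate.h>
#include <linux/rcuref.h>
#include <linux/slab.h>

struct demo_obj {
	rcuref_t	ref;
	struct rcu_head	rcu;
};

static struct demo_obj __rcu *demo_ptr;

static int demo_create(void)
{
	struct demo_obj *p = kzalloc(sizeof(*p), GFP_KERNEL);

	if (!p)
		return -ENOMEM;
	rcuref_init(&p->ref, 1);		/* start with one reference */
	rcu_assign_pointer(demo_ptr, p);
	return 0;
}

static struct demo_obj *demo_get(void)
{
	struct demo_obj *p;

	rcu_read_lock();
	p = rcu_dereference(demo_ptr);
	if (p && !rcuref_get(&p->ref))		/* fails once marked DEAD */
		p = NULL;
	rcu_read_unlock();
	return p;
}

static void demo_put(struct demo_obj *p)
{
	/* True only for the final reference; then free via an RCU grace period. */
	if (rcuref_put(&p->ref)) {
		rcu_assign_pointer(demo_ptr, NULL);
		kfree_rcu(p, rcu);
	}
}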
// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io */ /* Devmap's primary use is as a backend map for the XDP BPF helper call * bpf_redirect_map(). Because XDP is mostly concerned with performance we * spent some effort to ensure the datapath with redirect maps does not use * any locking. This is a quick note on the details. * * We have three possible paths to get into the devmap control plane: bpf * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall * will invoke an update, delete, or lookup operation. To ensure updates and * deletes appear atomic from the datapath side xchg() is used to modify the * netdev_map array. Then because the datapath does a lookup into the netdev_map * array (read-only) from an RCU critical section we use call_rcu() to wait for * an rcu grace period before freeing the old data structures. This ensures the * datapath always has a valid copy. However, the datapath does a "flush" * operation that pushes any pending packets in the driver outside the RCU * critical section. Each bpf_dtab_netdev tracks these pending operations using * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until * this list is empty, indicating outstanding flush operations have completed. * * BPF syscalls may race with BPF program calls on any of the update, delete * or lookup operations. As noted above the xchg() operation also keeps the * netdev_map consistent in this case. From the devmap side BPF programs * calling into these operations are the same as multiple user space threads * making system calls. * * Finally, any of the above may race with a netdev_unregister notifier. The * unregister notifier must search for net devices in the map structure that * contain a reference to the net device and remove them. This is a two-step * process: (a) dereference the bpf_dtab_netdev object in netdev_map and (b) * check to see if the ifindex is the same as the net_device being removed. 
* When removing the dev a cmpxchg() is used to ensure the correct dev is * removed, in the case of a concurrent update or delete operation it is * possible that the initially referenced dev is no longer in the map. As the * notifier hook walks the map we know that new dev references can not be * added by the user because core infrastructure ensures dev_get_by_index() * calls will fail at this point. * * The devmap_hash type is a map type which interprets keys as ifindexes and * indexes these using a hashmap. This allows maps that use ifindex as key to be * densely packed instead of having holes in the lookup array for unused * ifindexes. The setup and packet enqueue/send code is shared between the two * types of devmap; only the lookup and insertion is different. */ #include <linux/bpf.h> #include <linux/local_lock.h> #include <net/xdp.h> #include <linux/filter.h> #include <trace/events/xdp.h> #include <linux/btf_ids.h> #define DEV_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) struct xdp_dev_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; struct list_head flush_node; struct net_device *dev; struct net_device *dev_rx; struct bpf_prog *xdp_prog; unsigned int count; local_lock_t bq_lock; }; struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; struct bpf_prog *xdp_prog; struct rcu_head rcu; unsigned int idx; struct bpf_devmap_val val; }; struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev __rcu **netdev_map; /* DEVMAP type only */ struct list_head list; /* these are only used for DEVMAP_HASH type maps */ struct hlist_head *dev_index_head; spinlock_t index_lock; unsigned int items; u32 n_buckets; }; static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); static struct hlist_head *dev_map_create_hash(unsigned int entries, int numa_node) { int i; struct hlist_head *hash; hash = bpf_map_area_alloc((u64) entries * sizeof(*hash), numa_node); if (hash != NULL) for (i = 0; i < entries; i++) INIT_HLIST_HEAD(&hash[i]); return hash; } static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, int idx) { return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; } static int dev_map_alloc_check(union bpf_attr *attr) { u32 valsize = attr->value_size; /* check sanity of attributes. 
2 value sizes supported: * 4 bytes: ifindex * 8 bytes: ifindex + prog fd */ if (attr->max_entries == 0 || attr->key_size != 4 || (valsize != offsetofend(struct bpf_devmap_val, ifindex) && valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) || attr->map_flags & ~DEV_CREATE_FLAG_MASK) return -EINVAL; if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { /* Hash table size must be power of 2; roundup_pow_of_two() * can overflow into UB on 32-bit arches */ if (attr->max_entries > 1UL << 31) return -EINVAL; } return 0; } static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { /* Lookup returns a pointer straight to dev->ifindex, so make sure the * verifier prevents writes from the BPF side */ attr->map_flags |= BPF_F_RDONLY_PROG; bpf_map_init_from_attr(&dtab->map, attr); if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { /* Hash table size must be power of 2 */ dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets, dtab->map.numa_node); if (!dtab->dev_index_head) return -ENOMEM; spin_lock_init(&dtab->index_lock); } else { dtab->netdev_map = bpf_map_area_alloc((u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) return -ENOMEM; } return 0; } static struct bpf_map *dev_map_alloc(union bpf_attr *attr) { struct bpf_dtab *dtab; int err; dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE); if (!dtab) return ERR_PTR(-ENOMEM); err = dev_map_init_map(dtab, attr); if (err) { bpf_map_area_free(dtab); return ERR_PTR(err); } spin_lock(&dev_map_lock); list_add_tail_rcu(&dtab->list, &dev_map_list); spin_unlock(&dev_map_lock); return &dtab->map; } static void dev_map_free(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); u32 i; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the programs (can be more than one that used this map) were * disconnected from events. The following synchronize_rcu() guarantees * both rcu read critical sections complete and waits for * preempt-disable regions (NAPI being the relevant context here) so we * are certain there will be no further reads against the netdev_map and * all flush operations are complete. Flush operations can only be done * from NAPI context for this reason. */ spin_lock(&dev_map_lock); list_del_rcu(&dtab->list); spin_unlock(&dev_map_lock); /* bpf_redirect_info->map is assigned in __bpf_xdp_redirect_map() * during NAPI callback and cleared after the XDP redirect. There is no * explicit RCU read section which protects bpf_redirect_info->map but * local_bh_disable() also marks the beginning an RCU section. This * makes the complete softirq callback RCU protected. Thus after * following synchronize_rcu() there no bpf_redirect_info->map == map * assignment. */ synchronize_rcu(); /* Make sure prior __dev_map_entry_free() have completed. 
*/ rcu_barrier(); if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { for (i = 0; i < dtab->n_buckets; i++) { struct bpf_dtab_netdev *dev; struct hlist_head *head; struct hlist_node *next; head = dev_map_index_hash(dtab, i); hlist_for_each_entry_safe(dev, next, head, index_hlist) { hlist_del_rcu(&dev->index_hlist); if (dev->xdp_prog) bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } } bpf_map_area_free(dtab->dev_index_head); } else { for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev; dev = rcu_dereference_raw(dtab->netdev_map[i]); if (!dev) continue; if (dev->xdp_prog) bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } bpf_map_area_free(dtab->netdev_map); } bpf_map_area_free(dtab); } static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); u32 index = key ? *(u32 *)key : U32_MAX; u32 *next = next_key; if (index >= dtab->map.max_entries) { *next = 0; return 0; } if (index == dtab->map.max_entries - 1) return -ENOENT; *next = index + 1; return 0; } /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or * by local_bh_disable() (from XDP calls inside NAPI). The * rcu_read_lock_bh_held() below makes lockdep accept both. */ static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct hlist_head *head = dev_map_index_hash(dtab, key); struct bpf_dtab_netdev *dev; hlist_for_each_entry_rcu(dev, head, index_hlist, lockdep_is_held(&dtab->index_lock)) if (dev->idx == key) return dev; return NULL; } static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); u32 idx, *next = next_key; struct bpf_dtab_netdev *dev, *next_dev; struct hlist_head *head; int i = 0; if (!key) goto find_first; idx = *(u32 *)key; dev = __dev_map_hash_lookup_elem(map, idx); if (!dev) goto find_first; next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)), struct bpf_dtab_netdev, index_hlist); if (next_dev) { *next = next_dev->idx; return 0; } i = idx & (dtab->n_buckets - 1); i++; find_first: for (; i < dtab->n_buckets; i++) { head = dev_map_index_hash(dtab, i); next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), struct bpf_dtab_netdev, index_hlist); if (next_dev) { *next = next_dev->idx; return 0; } } return -ENOENT; } static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, struct xdp_frame **frames, int n, struct net_device *tx_dev, struct net_device *rx_dev) { struct xdp_txq_info txq = { .dev = tx_dev }; struct xdp_rxq_info rxq = { .dev = rx_dev }; struct xdp_buff xdp; int i, nframes = 0; for (i = 0; i < n; i++) { struct xdp_frame *xdpf = frames[i]; u32 act; int err; xdp_convert_frame_to_buff(xdpf, &xdp); xdp.txq = &txq; xdp.rxq = &rxq; act = bpf_prog_run_xdp(xdp_prog, &xdp); switch (act) { case XDP_PASS: err = xdp_update_frame_from_buff(&xdp, xdpf); if (unlikely(err < 0)) xdp_return_frame_rx_napi(xdpf); else frames[nframes++] = xdpf; break; default: bpf_warn_invalid_xdp_action(NULL, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(tx_dev, xdp_prog, act); fallthrough; case XDP_DROP: xdp_return_frame_rx_napi(xdpf); break; } } return nframes; /* sent frames count */ } static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { struct net_device *dev = bq->dev; unsigned int cnt = bq->count; int sent = 0, err = 0; int 
to_send = cnt; int i; lockdep_assert_held(&bq->bq_lock); if (unlikely(!cnt)) return; for (i = 0; i < cnt; i++) { struct xdp_frame *xdpf = bq->q[i]; prefetch(xdpf); } if (bq->xdp_prog) { to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx); if (!to_send) goto out; } sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags); if (sent < 0) { /* If ndo_xdp_xmit fails with an errno, no frames have * been xmit'ed. */ err = sent; sent = 0; } /* If not all frames have been transmitted, it is our * responsibility to free them */ for (i = sent; unlikely(i < to_send); i++) xdp_return_frame_rx_napi(bq->q[i]); out: bq->count = 0; trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err); } /* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the * driver before returning from its napi->poll() routine. See the comment above * xdp_do_flush() in filter.c. */ void __dev_flush(struct list_head *flush_list) { struct xdp_dev_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { local_lock_nested_bh(&bq->dev->xdp_bulkq->bq_lock); bq_xmit_all(bq, XDP_XMIT_FLUSH); bq->dev_rx = NULL; bq->xdp_prog = NULL; __list_del_clearprev(&bq->flush_node); local_unlock_nested_bh(&bq->dev->xdp_bulkq->bq_lock); } } /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or * by local_bh_disable() (from XDP calls inside NAPI). The * rcu_read_lock_bh_held() below makes lockdep accept both. */ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *obj; if (key >= map->max_entries) return NULL; obj = rcu_dereference_check(dtab->netdev_map[key], rcu_read_lock_bh_held()); return obj; } /* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu * variable access, and map elements stick around. See comment above * xdp_do_flush() in filter.c. PREEMPT_RT relies on local_lock_nested_bh() * to serialise access to the per-CPU bq. */ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_prog *xdp_prog) { struct xdp_dev_bulk_queue *bq; local_lock_nested_bh(&dev->xdp_bulkq->bq_lock); bq = this_cpu_ptr(dev->xdp_bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) bq_xmit_all(bq, 0); /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed * from net_device drivers NAPI func end. * * Do the same with xdp_prog and flush_list since these fields * are only ever modified together. 
*/ if (!bq->dev_rx) { struct list_head *flush_list = bpf_net_ctx_get_dev_flush_list(); bq->dev_rx = dev_rx; bq->xdp_prog = xdp_prog; list_add(&bq->flush_node, flush_list); } bq->q[bq->count++] = xdpf; local_unlock_nested_bh(&dev->xdp_bulkq->bq_lock); } static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_prog *xdp_prog) { int err; if (!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT)) return -EOPNOTSUPP; if (unlikely(!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) && xdp_frame_has_frags(xdpf))) return -EOPNOTSUPP; err = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf)); if (unlikely(err)) return err; bq_enqueue(dev, xdpf, dev_rx, xdp_prog); return 0; } static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev *dst) { struct xdp_txq_info txq = { .dev = dst->dev }; struct xdp_buff xdp; u32 act; if (!dst->xdp_prog) return XDP_PASS; __skb_pull(skb, skb->mac_len); xdp.txq = &txq; act = bpf_prog_run_generic_xdp(skb, &xdp, dst->xdp_prog); switch (act) { case XDP_PASS: __skb_push(skb, skb->mac_len); break; default: bpf_warn_invalid_xdp_action(NULL, dst->xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(dst->dev, dst->xdp_prog, act); fallthrough; case XDP_DROP: kfree_skb(skb); break; } return act; } int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) { return __xdp_enqueue(dev, xdpf, dev_rx, NULL); } int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx) { struct net_device *dev = dst->dev; return __xdp_enqueue(dev, xdpf, dev_rx, dst->xdp_prog); } static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf) { if (!obj) return false; if (!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT)) return false; if (unlikely(!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) && xdp_frame_has_frags(xdpf))) return false; if (xdp_ok_fwd_dev(obj->dev, xdp_get_frame_len(xdpf))) return false; return true; } static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj, struct net_device *dev_rx, struct xdp_frame *xdpf) { struct xdp_frame *nxdpf; nxdpf = xdpf_clone(xdpf); if (!nxdpf) return -ENOMEM; bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog); return 0; } static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifindex) { while (num_excluded--) { if (ifindex == excluded[num_excluded]) return true; } return false; } /* Get ifindex of each upper device. 'indexes' must be able to hold at * least 'max' elements. * Returns the number of ifindexes added, or -EOVERFLOW if there are too * many upper devices. 
*/ static int get_upper_ifindexes(struct net_device *dev, int *indexes, int max) { struct net_device *upper; struct list_head *iter; int n = 0; netdev_for_each_upper_dev_rcu(dev, upper, iter) { if (n >= max) return -EOVERFLOW; indexes[n++] = upper->ifindex; } return n; } int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dst, *last_dst = NULL; int excluded_devices[1+MAX_NEST_DEV]; struct hlist_head *head; int num_excluded = 0; unsigned int i; int err; if (exclude_ingress) { num_excluded = get_upper_ifindexes(dev_rx, excluded_devices, ARRAY_SIZE(excluded_devices) - 1); if (num_excluded < 0) return num_excluded; excluded_devices[num_excluded++] = dev_rx->ifindex; } if (map->map_type == BPF_MAP_TYPE_DEVMAP) { for (i = 0; i < map->max_entries; i++) { dst = rcu_dereference_check(dtab->netdev_map[i], rcu_read_lock_bh_held()); if (!is_valid_dst(dst, xdpf)) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ if (!last_dst) { last_dst = dst; continue; } err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf); if (err) return err; last_dst = dst; } } else { /* BPF_MAP_TYPE_DEVMAP_HASH */ for (i = 0; i < dtab->n_buckets; i++) { head = dev_map_index_hash(dtab, i); hlist_for_each_entry_rcu(dst, head, index_hlist, rcu_read_lock_bh_held()) { if (!is_valid_dst(dst, xdpf)) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ if (!last_dst) { last_dst = dst; continue; } err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf); if (err) return err; last_dst = dst; } } } /* consume the last copy of the frame */ if (last_dst) bq_enqueue(last_dst->dev, xdpf, dev_rx, last_dst->xdp_prog); else xdp_return_frame_rx_napi(xdpf); /* dtab is empty */ return 0; } int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, const struct bpf_prog *xdp_prog) { int err; err = xdp_ok_fwd_dev(dst->dev, skb->len); if (unlikely(err)) return err; /* Redirect has already succeeded semantically at this point, so we just * return 0 even if packet is dropped. Helper below takes care of * freeing skb. 
*/ if (dev_map_bpf_prog_run_skb(skb, dst) != XDP_PASS) return 0; skb->dev = dst->dev; generic_xdp_tx(skb, xdp_prog); return 0; } static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst, struct sk_buff *skb, const struct bpf_prog *xdp_prog) { struct sk_buff *nskb; int err; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return -ENOMEM; err = dev_map_generic_redirect(dst, nskb, xdp_prog); if (unlikely(err)) { consume_skb(nskb); return err; } return 0; } int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, const struct bpf_prog *xdp_prog, struct bpf_map *map, bool exclude_ingress) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dst, *last_dst = NULL; int excluded_devices[1+MAX_NEST_DEV]; struct hlist_head *head; int num_excluded = 0; unsigned int i; int err; if (exclude_ingress) { num_excluded = get_upper_ifindexes(dev, excluded_devices, ARRAY_SIZE(excluded_devices) - 1); if (num_excluded < 0) return num_excluded; excluded_devices[num_excluded++] = dev->ifindex; } if (map->map_type == BPF_MAP_TYPE_DEVMAP) { for (i = 0; i < map->max_entries; i++) { dst = rcu_dereference_check(dtab->netdev_map[i], rcu_read_lock_bh_held()); if (!dst) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ if (!last_dst) { last_dst = dst; continue; } err = dev_map_redirect_clone(last_dst, skb, xdp_prog); if (err) return err; last_dst = dst; } } else { /* BPF_MAP_TYPE_DEVMAP_HASH */ for (i = 0; i < dtab->n_buckets; i++) { head = dev_map_index_hash(dtab, i); hlist_for_each_entry_rcu(dst, head, index_hlist, rcu_read_lock_bh_held()) { if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ if (!last_dst) { last_dst = dst; continue; } err = dev_map_redirect_clone(last_dst, skb, xdp_prog); if (err) return err; last_dst = dst; } } } /* consume the first skb and return */ if (last_dst) return dev_map_generic_redirect(last_dst, skb, xdp_prog); /* dtab is empty */ consume_skb(skb); return 0; } static void *dev_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); return obj ? &obj->val : NULL; } static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map, *(u32 *)key); return obj ? 
&obj->val : NULL; } static void __dev_map_entry_free(struct rcu_head *rcu) { struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); if (dev->xdp_prog) bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } static long dev_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *old_dev; u32 k = *(u32 *)key; if (k >= map->max_entries) return -EINVAL; old_dev = unrcu_pointer(xchg(&dtab->netdev_map[k], NULL)); if (old_dev) { call_rcu(&old_dev->rcu, __dev_map_entry_free); atomic_dec((atomic_t *)&dtab->items); } return 0; } static long dev_map_hash_delete_elem(struct bpf_map *map, void *key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *old_dev; u32 k = *(u32 *)key; unsigned long flags; int ret = -ENOENT; spin_lock_irqsave(&dtab->index_lock, flags); old_dev = __dev_map_hash_lookup_elem(map, k); if (old_dev) { dtab->items--; hlist_del_init_rcu(&old_dev->index_hlist); call_rcu(&old_dev->rcu, __dev_map_entry_free); ret = 0; } spin_unlock_irqrestore(&dtab->index_lock, flags); return ret; } static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_dtab *dtab, struct bpf_devmap_val *val, unsigned int idx) { struct bpf_prog *prog = NULL; struct bpf_dtab_netdev *dev; dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev), GFP_NOWAIT, dtab->map.numa_node); if (!dev) return ERR_PTR(-ENOMEM); dev->dev = dev_get_by_index(net, val->ifindex); if (!dev->dev) goto err_out; if (val->bpf_prog.fd > 0) { prog = bpf_prog_get_type_dev(val->bpf_prog.fd, BPF_PROG_TYPE_XDP, false); if (IS_ERR(prog)) goto err_put_dev; if (prog->expected_attach_type != BPF_XDP_DEVMAP || !bpf_prog_map_compatible(&dtab->map, prog)) goto err_put_prog; } dev->idx = idx; if (prog) { dev->xdp_prog = prog; dev->val.bpf_prog.id = prog->aux->id; } else { dev->xdp_prog = NULL; dev->val.bpf_prog.id = 0; } dev->val.ifindex = val->ifindex; return dev; err_put_prog: bpf_prog_put(prog); err_put_dev: dev_put(dev->dev); err_out: kfree(dev); return ERR_PTR(-EINVAL); } static long __dev_map_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; struct bpf_devmap_val val = {}; u32 i = *(u32 *)key; if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; if (unlikely(i >= dtab->map.max_entries)) return -E2BIG; if (unlikely(map_flags == BPF_NOEXIST)) return -EEXIST; /* already verified value_size <= sizeof val */ memcpy(&val, value, map->value_size); if (!val.ifindex) { dev = NULL; /* can not specify fd if ifindex is 0 */ if (val.bpf_prog.fd > 0) return -EINVAL; } else { dev = __dev_map_alloc_node(net, dtab, &val, i); if (IS_ERR(dev)) return PTR_ERR(dev); } /* Use call_rcu() here to ensure rcu critical sections have completed * Remembering the driver side flush operation will happen before the * net device is removed. 
*/ old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev))); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); else atomic_inc((atomic_t *)&dtab->items); return 0; } static long dev_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return __dev_map_update_elem(current->nsproxy->net_ns, map, key, value, map_flags); } static long __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; struct bpf_devmap_val val = {}; u32 idx = *(u32 *)key; unsigned long flags; int err = -EEXIST; /* already verified value_size <= sizeof val */ memcpy(&val, value, map->value_size); if (unlikely(map_flags > BPF_EXIST || !val.ifindex)) return -EINVAL; spin_lock_irqsave(&dtab->index_lock, flags); old_dev = __dev_map_hash_lookup_elem(map, idx); if (old_dev && (map_flags & BPF_NOEXIST)) goto out_err; dev = __dev_map_alloc_node(net, dtab, &val, idx); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out_err; } if (old_dev) { hlist_del_rcu(&old_dev->index_hlist); } else { if (dtab->items >= dtab->map.max_entries) { spin_unlock_irqrestore(&dtab->index_lock, flags); call_rcu(&dev->rcu, __dev_map_entry_free); return -E2BIG; } dtab->items++; } hlist_add_head_rcu(&dev->index_hlist, dev_map_index_hash(dtab, idx)); spin_unlock_irqrestore(&dtab->index_lock, flags); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); return 0; out_err: spin_unlock_irqrestore(&dtab->index_lock, flags); return err; } static long dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return __dev_map_hash_update_elem(current->nsproxy->net_ns, map, key, value, map_flags); } static long dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, __dev_map_lookup_elem); } static long dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, __dev_map_hash_lookup_elem); } static u64 dev_map_mem_usage(const struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); u64 usage = sizeof(struct bpf_dtab); if (map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) usage += (u64)dtab->n_buckets * sizeof(struct hlist_head); else usage += (u64)map->max_entries * sizeof(struct bpf_dtab_netdev *); usage += atomic_read((atomic_t *)&dtab->items) * (u64)sizeof(struct bpf_dtab_netdev); return usage; } BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab) const struct bpf_map_ops dev_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = dev_map_alloc_check, .map_alloc = dev_map_alloc, .map_free = dev_map_free, .map_get_next_key = dev_map_get_next_key, .map_lookup_elem = dev_map_lookup_elem, .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, .map_check_btf = map_check_no_btf, .map_mem_usage = dev_map_mem_usage, .map_btf_id = &dev_map_btf_ids[0], .map_redirect = dev_map_redirect, }; const struct bpf_map_ops dev_map_hash_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = dev_map_alloc_check, .map_alloc = dev_map_alloc, .map_free = dev_map_free, .map_get_next_key = dev_map_hash_get_next_key, .map_lookup_elem = dev_map_hash_lookup_elem, .map_update_elem = dev_map_hash_update_elem, .map_delete_elem = dev_map_hash_delete_elem, .map_check_btf = map_check_no_btf, 
.map_mem_usage = dev_map_mem_usage, .map_btf_id = &dev_map_btf_ids[0], .map_redirect = dev_hash_map_redirect, }; static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab, struct net_device *netdev) { unsigned long flags; u32 i; spin_lock_irqsave(&dtab->index_lock, flags); for (i = 0; i < dtab->n_buckets; i++) { struct bpf_dtab_netdev *dev; struct hlist_head *head; struct hlist_node *next; head = dev_map_index_hash(dtab, i); hlist_for_each_entry_safe(dev, next, head, index_hlist) { if (netdev != dev->dev) continue; dtab->items--; hlist_del_rcu(&dev->index_hlist); call_rcu(&dev->rcu, __dev_map_entry_free); } } spin_unlock_irqrestore(&dtab->index_lock, flags); } static int dev_map_notification(struct notifier_block *notifier, ulong event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct bpf_dtab *dtab; int i, cpu; switch (event) { case NETDEV_REGISTER: if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq) break; /* will be freed in free_netdev() */ netdev->xdp_bulkq = alloc_percpu(struct xdp_dev_bulk_queue); if (!netdev->xdp_bulkq) return NOTIFY_BAD; for_each_possible_cpu(cpu) { struct xdp_dev_bulk_queue *bq; bq = per_cpu_ptr(netdev->xdp_bulkq, cpu); bq->dev = netdev; local_lock_init(&bq->bq_lock); } break; case NETDEV_UNREGISTER: /* This rcu_read_lock/unlock pair is needed because * dev_map_list is an RCU list AND to ensure a delete * operation does not free a netdev_map entry while we * are comparing it against the netdev being unregistered. */ rcu_read_lock(); list_for_each_entry_rcu(dtab, &dev_map_list, list) { if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dev_map_hash_remove_netdev(dtab, netdev); continue; } for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev, *odev; dev = rcu_dereference(dtab->netdev_map[i]); if (!dev || netdev != dev->dev) continue; odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev), NULL)); if (dev == odev) { call_rcu(&dev->rcu, __dev_map_entry_free); atomic_dec((atomic_t *)&dtab->items); } } } rcu_read_unlock(); break; default: break; } return NOTIFY_OK; } static struct notifier_block dev_map_notifier = { .notifier_call = dev_map_notification, }; static int __init dev_map_init(void) { /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */ BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) != offsetof(struct _bpf_dtab_netdev, dev)); register_netdevice_notifier(&dev_map_notifier); return 0; } subsys_initcall(dev_map_init); |
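/*
 * Illustrative sketch (not part of devmap.c): a minimal XDP program using a
 * BPF_MAP_TYPE_DEVMAP as described in the comment at the top of this file.
 * The map name, section names and the fixed key 0 are made up; the
 * bpf_redirect_map() helper and the struct bpf_devmap_val value layout
 * (ifindex plus optional BPF_XDP_DEVMAP program fd) are the interface this
 * file implements. Assumes a libbpf-style build with <bpf/bpf_helpers.h>.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_DEVMAP);
	__uint(key_size, sizeof(__u32));
	/* 8 byte value: ifindex + optional egress program fd */
	__uint(value_size, sizeof(struct bpf_devmap_val));
	__uint(max_entries, 64);
} tx_ports SEC(".maps");

SEC("xdp")
int xdp_redirect_devmap(struct xdp_md *ctx)
{
	/*
	 * Key 0 is expected to have been populated from user space with the
	 * egress ifindex. On lookup failure fall back to XDP_PASS, which is
	 * encoded in the lower bits of the flags argument.
	 */
	return bpf_redirect_map(&tx_ports, 0, XDP_PASS);
}

char _license[] SEC("license") = "GPL";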
| 20 19 17 16 17 20 20 19 19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_NS_COMMON_H #define _LINUX_NS_COMMON_H #include <linux/ns/ns_common_types.h> #include <linux/refcount.h> #include <linux/vfsdebug.h> #include <uapi/linux/sched.h> #include <uapi/linux/nsfs.h> bool is_current_namespace(struct ns_common *ns); int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum); void __ns_common_free(struct ns_common *ns); struct ns_common *__must_check ns_owner(struct ns_common *ns); static __always_inline bool is_ns_init_inum(const struct ns_common *ns) { VFS_WARN_ON_ONCE(ns->inum == 0); return unlikely(in_range(ns->inum, MNT_NS_INIT_INO, IPC_NS_INIT_INO - MNT_NS_INIT_INO + 1)); } static __always_inline bool is_ns_init_id(const struct ns_common *ns) { VFS_WARN_ON_ONCE(ns->ns_id == 0); return ns->ns_id <= NS_LAST_INIT_ID; } #define NS_COMMON_INIT(nsname) \ { \ .ns_type = ns_common_type(&nsname), \ .ns_id = ns_init_id(&nsname), \ .inum = ns_init_inum(&nsname), \ .ops = to_ns_operations(&nsname), \ .stashed = NULL, \ .__ns_ref = REFCOUNT_INIT(1), \ .__ns_ref_active = ATOMIC_INIT(1), \ .ns_unified_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_unified_node.ns_list_entry), \ .ns_tree_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_tree_node.ns_list_entry), \ .ns_owner_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_owner_node.ns_list_entry), \ .ns_owner_root.ns_list_head = LIST_HEAD_INIT(nsname.ns.ns_owner_root.ns_list_head), \ } #define ns_common_init(__ns) \ __ns_common_init(to_ns_common(__ns), \ ns_common_type(__ns), \ to_ns_operations(__ns), \ (((__ns) == ns_init_ns(__ns)) ? 
ns_init_inum(__ns) : 0)) #define ns_common_init_inum(__ns, __inum) \ __ns_common_init(to_ns_common(__ns), \ ns_common_type(__ns), \ to_ns_operations(__ns), \ __inum) #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) bool may_see_all_namespaces(void); static __always_inline __must_check int __ns_ref_active_read(const struct ns_common *ns) { return atomic_read(&ns->__ns_ref_active); } static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns) { return refcount_read(&ns->__ns_ref); } static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) { if (is_ns_init_id(ns)) { VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); return false; } if (refcount_dec_and_test(&ns->__ns_ref)) { VFS_WARN_ON_ONCE(__ns_ref_active_read(ns)); return true; } return false; } static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns) { if (is_ns_init_id(ns)) { VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); return true; } if (refcount_inc_not_zero(&ns->__ns_ref)) return true; VFS_WARN_ON_ONCE(__ns_ref_active_read(ns)); return false; } static __always_inline void __ns_ref_inc(struct ns_common *ns) { if (is_ns_init_id(ns)) { VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); return; } refcount_inc(&ns->__ns_ref); } static __always_inline __must_check bool __ns_ref_dec_and_lock(struct ns_common *ns, spinlock_t *ns_lock) { if (is_ns_init_id(ns)) { VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); return false; } return refcount_dec_and_lock(&ns->__ns_ref, ns_lock); } #define ns_ref_read(__ns) __ns_ref_read(to_ns_common((__ns))) #define ns_ref_inc(__ns) \ do { if (__ns) __ns_ref_inc(to_ns_common((__ns))); } while (0) #define ns_ref_get(__ns) \ ((__ns) ? __ns_ref_get(to_ns_common((__ns))) : false) #define ns_ref_put(__ns) \ ((__ns) ? __ns_ref_put(to_ns_common((__ns))) : false) #define ns_ref_put_and_lock(__ns, __ns_lock) \ ((__ns) ? __ns_ref_dec_and_lock(to_ns_common((__ns)), __ns_lock) : false) #define ns_ref_active_read(__ns) \ ((__ns) ? __ns_ref_active_read(to_ns_common(__ns)) : 0) void __ns_ref_active_put(struct ns_common *ns); #define ns_ref_active_put(__ns) \ do { if (__ns) __ns_ref_active_put(to_ns_common(__ns)); } while (0) static __always_inline struct ns_common *__must_check ns_get_unless_inactive(struct ns_common *ns) { if (!__ns_ref_active_read(ns)) { VFS_WARN_ON_ONCE(is_ns_init_id(ns)); return NULL; } if (!__ns_ref_get(ns)) return NULL; return ns; } void __ns_ref_active_get(struct ns_common *ns); #define ns_ref_active_get(__ns) \ do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0) #endif |
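/*
 * Illustrative sketch (not part of ns_common.h): how a namespace
 * implementation would use the reference helpers above. "struct foo_ns",
 * free_foo_ns() and a to_ns_common() mapping for it are hypothetical;
 * ns_ref_get()/ns_ref_put() and the init-namespace special case (pinned at
 * a reference count of 1, never freed) are what this header provides.
 */
struct foo_ns {
	struct ns_common	ns;	/* assumed: to_ns_common() resolves to &foo->ns */
	/* ... namespace private state ... */
};

static void free_foo_ns(struct foo_ns *foo);	/* hypothetical deconstruction */

static inline struct foo_ns *get_foo_ns(struct foo_ns *foo)
{
	/* Always succeeds for an init namespace without touching the count. */
	if (!ns_ref_get(foo))
		return NULL;	/* count already hit zero, namespace is going away */
	return foo;
}

static inline void put_foo_ns(struct foo_ns *foo)
{
	/* True only when the last reference is dropped; never for init namespaces. */
	if (ns_ref_put(foo))
		free_foo_ns(foo);
}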
1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs_platform.h" #include <linux/backing-dev.h> #include <linux/dax.h> #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_log_recover.h" #include "xfs_log_priv.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_ag.h" #include "xfs_buf_mem.h" #include "xfs_notify_failure.h" struct kmem_cache *xfs_buf_cache; /* * Locking orders * * xfs_buf_stale: * b_sema (caller holds) * b_lockref.lock * lru_lock * * xfs_buf_rele: * b_lockref.lock * lru_lock * * xfs_buftarg_drain_rele * lru_lock * b_lockref.lock (trylock due to inversion) * * xfs_buftarg_isolate * lru_lock * b_lockref.lock (trylock due to inversion) */ static void xfs_buf_submit(struct xfs_buf *bp); static int xfs_buf_iowait(struct xfs_buf *bp); static inline bool xfs_buf_is_uncached(struct xfs_buf *bp) { return bp->b_rhash_key == XFS_BUF_DADDR_NULL; } /* * When we mark a buffer stale, we remove the buffer from the LRU and clear the * b_lru_ref count so that the buffer is freed immediately when the buffer * reference count falls to zero. 
If the buffer is already on the LRU, we need * to remove the reference that LRU holds on the buffer. * * This prevents build-up of stale buffers on the LRU. */ void xfs_buf_stale( struct xfs_buf *bp) { ASSERT(xfs_buf_islocked(bp)); bp->b_flags |= XBF_STALE; /* * Clear the delwri status so that a delwri queue walker will not * flush this buffer to disk now that it is stale. The delwri queue has * a reference to the buffer, so this is safe to do. */ bp->b_flags &= ~_XBF_DELWRI_Q; spin_lock(&bp->b_lockref.lock); atomic_set(&bp->b_lru_ref, 0); if (!__lockref_is_dead(&bp->b_lockref)) list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru); spin_unlock(&bp->b_lockref.lock); } static void xfs_buf_free_callback( struct callback_head *cb) { struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu); if (bp->b_maps != &bp->__b_map) kfree(bp->b_maps); kmem_cache_free(xfs_buf_cache, bp); } static void xfs_buf_free( struct xfs_buf *bp) { unsigned int size = BBTOB(bp->b_length); might_sleep(); trace_xfs_buf_free(bp, _RET_IP_); ASSERT(list_empty(&bp->b_lru)); if (!xfs_buftarg_is_mem(bp->b_target) && size >= PAGE_SIZE) mm_account_reclaimed_pages(howmany(size, PAGE_SHIFT)); if (is_vmalloc_addr(bp->b_addr)) vfree(bp->b_addr); else if (bp->b_flags & _XBF_KMEM) kfree(bp->b_addr); else folio_put(virt_to_folio(bp->b_addr)); call_rcu(&bp->b_rcu, xfs_buf_free_callback); } static int xfs_buf_alloc_kmem( struct xfs_buf *bp, size_t size, gfp_t gfp_mask) { ASSERT(is_power_of_2(size)); ASSERT(size < PAGE_SIZE); bp->b_addr = kmalloc(size, gfp_mask | __GFP_NOFAIL); if (!bp->b_addr) return -ENOMEM; /* * Slab guarantees that we get back naturally aligned allocations for * power of two sizes. Keep this check as the canary in the coal mine * if anything changes in slab. */ if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)bp->b_addr, size))) { kfree(bp->b_addr); bp->b_addr = NULL; return -ENOMEM; } bp->b_flags |= _XBF_KMEM; trace_xfs_buf_backing_kmem(bp, _RET_IP_); return 0; } /* * Allocate backing memory for a buffer. * * For tmpfs-backed buffers used by in-memory btrees this directly maps the * tmpfs page cache folios. * * For real file system buffers there are three different kinds backing memory: * * The first type backs the buffer by a kmalloc allocation. This is done for * less than PAGE_SIZE allocations to avoid wasting memory. * * The second type is a single folio buffer - this may be a high order folio or * just a single page sized folio, but either way they get treated the same way * by the rest of the code - the buffer memory spans a single contiguous memory * region that we don't have to map and unmap to access the data directly. * * The third type of buffer is the vmalloc()d buffer. This provides the buffer * with the required contiguous memory region but backed by discontiguous * physical pages. */ static int xfs_buf_alloc_backing_mem( struct xfs_buf *bp, xfs_buf_flags_t flags) { size_t size = BBTOB(bp->b_length); gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN; struct folio *folio; if (xfs_buftarg_is_mem(bp->b_target)) return xmbuf_map_backing_mem(bp); /* Assure zeroed buffer for non-read cases. */ if (!(flags & XBF_READ)) gfp_mask |= __GFP_ZERO; if (flags & XBF_READ_AHEAD) gfp_mask |= __GFP_NORETRY; /* * For buffers smaller than PAGE_SIZE use a kmalloc allocation if that * is properly aligned. The slab allocator now guarantees an aligned * allocation for all power of two sizes, which matches most of the * smaller than PAGE_SIZE buffers used by XFS. 
*/ if (size < PAGE_SIZE && is_power_of_2(size)) return xfs_buf_alloc_kmem(bp, size, gfp_mask); /* * Don't bother with the retry loop for single PAGE allocations: vmalloc * won't do any better. */ if (size <= PAGE_SIZE) gfp_mask |= __GFP_NOFAIL; /* * Optimistically attempt a single high order folio allocation for * larger than PAGE_SIZE buffers. * * Allocating a high order folio makes the assumption that buffers are a * power-of-2 size, matching the power-of-2 folios sizes available. * * The exception here are user xattr data buffers, which can be arbitrarily * sized up to 64kB plus structure metadata, skip straight to the vmalloc * path for them instead of wasting memory here. */ if (size > PAGE_SIZE) { if (!is_power_of_2(size)) goto fallback; gfp_mask &= ~__GFP_DIRECT_RECLAIM; gfp_mask |= __GFP_NORETRY; } folio = folio_alloc(gfp_mask, get_order(size)); if (!folio) { if (size <= PAGE_SIZE) return -ENOMEM; trace_xfs_buf_backing_fallback(bp, _RET_IP_); goto fallback; } bp->b_addr = folio_address(folio); trace_xfs_buf_backing_folio(bp, _RET_IP_); return 0; fallback: for (;;) { bp->b_addr = __vmalloc(size, gfp_mask); if (bp->b_addr) break; if (flags & XBF_READ_AHEAD) return -ENOMEM; XFS_STATS_INC(bp->b_mount, xb_page_retries); memalloc_retry_wait(gfp_mask); } trace_xfs_buf_backing_vmalloc(bp, _RET_IP_); return 0; } static int xfs_buf_alloc( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp) { struct xfs_buf *bp; int error; int i; *bpp = NULL; bp = kmem_cache_zalloc(xfs_buf_cache, GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); /* * We don't want certain flags to appear in b_flags unless they are * specifically set by later operations on the buffer. */ flags &= ~(XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); /* * A new buffer is held and locked by the owner. This ensures that the * buffer is owned by the caller and racing RCU lookups right after * inserting into the hash table are safe (and will have to wait for * the unlock to do anything non-trivial). */ lockref_init(&bp->b_lockref); sema_init(&bp->b_sema, 0); /* held, no waiters */ atomic_set(&bp->b_lru_ref, 1); init_completion(&bp->b_iowait); INIT_LIST_HEAD(&bp->b_lru); INIT_LIST_HEAD(&bp->b_list); INIT_LIST_HEAD(&bp->b_li_list); bp->b_target = target; bp->b_mount = target->bt_mount; bp->b_flags = flags; bp->b_rhash_key = map[0].bm_bn; bp->b_length = 0; bp->b_map_count = nmaps; if (nmaps == 1) bp->b_maps = &bp->__b_map; else bp->b_maps = kzalloc_objs(struct xfs_buf_map, nmaps, GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); for (i = 0; i < nmaps; i++) { bp->b_maps[i].bm_bn = map[i].bm_bn; bp->b_maps[i].bm_len = map[i].bm_len; bp->b_length += map[i].bm_len; } atomic_set(&bp->b_pin_count, 0); init_waitqueue_head(&bp->b_waiters); XFS_STATS_INC(bp->b_mount, xb_create); trace_xfs_buf_init(bp, _RET_IP_); error = xfs_buf_alloc_backing_mem(bp, flags); if (error) { xfs_buf_free(bp); return error; } *bpp = bp; return 0; } /* * Finding and Reading Buffers */ static int _xfs_buf_obj_cmp( struct rhashtable_compare_arg *arg, const void *obj) { const struct xfs_buf_map *map = arg->key; const struct xfs_buf *bp = obj; /* * The key hashing in the lookup path depends on the key being the * first element of the compare_arg, make sure to assert this. */ BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0); if (bp->b_rhash_key != map->bm_bn) return 1; if (unlikely(bp->b_length != map->bm_len)) { /* * found a block number match. 
If the range doesn't * match, the only way this is allowed is if the buffer * in the cache is stale and the transaction that made * it stale has not yet committed. i.e. we are * reallocating a busy extent. Skip this buffer and * continue searching for an exact match. * * Note: If we're scanning for incore buffers to stale, don't * complain if we find non-stale buffers. */ if (!(map->bm_flags & XBM_LIVESCAN)) ASSERT(bp->b_flags & XBF_STALE); return 1; } return 0; } static const struct rhashtable_params xfs_buf_hash_params = { .min_size = 32, /* empty AGs have minimal footprint */ .nelem_hint = 16, .key_len = sizeof(xfs_daddr_t), .key_offset = offsetof(struct xfs_buf, b_rhash_key), .head_offset = offsetof(struct xfs_buf, b_rhash_head), .automatic_shrinking = true, .obj_cmpfn = _xfs_buf_obj_cmp, }; static int xfs_buf_map_verify( struct xfs_buftarg *btp, struct xfs_buf_map *map) { /* Check for IOs smaller than the sector size / not sector aligned */ ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize)); ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask)); /* * Corrupted block numbers can get through to here, unfortunately, so we * have to check that the buffer falls within the filesystem bounds. */ if (map->bm_bn < 0 || map->bm_bn >= btp->bt_nr_sectors) { xfs_alert(btp->bt_mount, "%s: daddr 0x%llx out of range, EOFS 0x%llx", __func__, map->bm_bn, btp->bt_nr_sectors); WARN_ON(1); return -EFSCORRUPTED; } return 0; } static int xfs_buf_find_lock( struct xfs_buf *bp, xfs_buf_flags_t flags) { if (flags & XBF_TRYLOCK) { if (!xfs_buf_trylock(bp)) { XFS_STATS_INC(bp->b_mount, xb_busy_locked); return -EAGAIN; } } else { xfs_buf_lock(bp); XFS_STATS_INC(bp->b_mount, xb_get_locked_waited); } /* * if the buffer is stale, clear all the external state associated with * it. We need to keep flags such as how we allocated the buffer memory * intact here. */ if (bp->b_flags & XBF_STALE) { if (flags & XBF_LIVESCAN) { xfs_buf_unlock(bp); return -ENOENT; } ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); bp->b_flags &= _XBF_KMEM; bp->b_ops = NULL; } return 0; } static inline int xfs_buf_lookup( struct xfs_buftarg *btp, struct xfs_buf_map *map, xfs_buf_flags_t flags, struct xfs_buf **bpp) { struct xfs_buf *bp; int error; rcu_read_lock(); bp = rhashtable_lookup(&btp->bt_hash, map, xfs_buf_hash_params); if (!bp || !lockref_get_not_dead(&bp->b_lockref)) { rcu_read_unlock(); return -ENOENT; } rcu_read_unlock(); error = xfs_buf_find_lock(bp, flags); if (error) { xfs_buf_rele(bp); return error; } trace_xfs_buf_find(bp, flags, _RET_IP_); *bpp = bp; return 0; } /* * Insert the new_bp into the hash table. This consumes the perag reference * taken for the lookup regardless of the result of the insert. */ static int xfs_buf_find_insert( struct xfs_buftarg *btp, struct xfs_perag *pag, struct xfs_buf_map *cmap, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp) { struct xfs_buf *new_bp; struct xfs_buf *bp; int error; error = xfs_buf_alloc(btp, map, nmaps, flags, &new_bp); if (error) goto out_drop_pag; /* The new buffer keeps the perag reference until it is freed. 
*/ new_bp->b_pag = pag; rcu_read_lock(); bp = rhashtable_lookup_get_insert_fast(&btp->bt_hash, &new_bp->b_rhash_head, xfs_buf_hash_params); if (IS_ERR(bp)) { rcu_read_unlock(); error = PTR_ERR(bp); goto out_free_buf; } if (bp && lockref_get_not_dead(&bp->b_lockref)) { /* found an existing buffer */ rcu_read_unlock(); error = xfs_buf_find_lock(bp, flags); if (error) xfs_buf_rele(bp); else *bpp = bp; goto out_free_buf; } rcu_read_unlock(); *bpp = new_bp; return 0; out_free_buf: xfs_buf_free(new_bp); out_drop_pag: if (pag) xfs_perag_put(pag); return error; } static inline struct xfs_perag * xfs_buftarg_get_pag( struct xfs_buftarg *btp, const struct xfs_buf_map *map) { struct xfs_mount *mp = btp->bt_mount; if (xfs_buftarg_is_mem(btp)) return NULL; return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn)); } /* * Assembles a buffer covering the specified range. The code is optimised for * cache hits, as metadata intensive workloads will see 3 orders of magnitude * more hits than misses. */ int xfs_buf_get_map( struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp) { struct xfs_perag *pag; struct xfs_buf *bp = NULL; struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; int error; int i; if (flags & XBF_LIVESCAN) cmap.bm_flags |= XBM_LIVESCAN; for (i = 0; i < nmaps; i++) cmap.bm_len += map[i].bm_len; error = xfs_buf_map_verify(btp, &cmap); if (error) return error; pag = xfs_buftarg_get_pag(btp, &cmap); error = xfs_buf_lookup(btp, &cmap, flags, &bp); if (error && error != -ENOENT) goto out_put_perag; /* cache hits always outnumber misses by at least 10:1 */ if (unlikely(!bp)) { XFS_STATS_INC(btp->bt_mount, xb_miss_locked); if (flags & XBF_INCORE) goto out_put_perag; /* xfs_buf_find_insert() consumes the perag reference. */ error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps, flags, &bp); if (error) return error; } else { XFS_STATS_INC(btp->bt_mount, xb_get_locked); if (pag) xfs_perag_put(pag); } /* * Clear b_error if this is a lookup from a caller that doesn't expect * valid data to be found in the buffer. */ if (!(flags & XBF_READ)) xfs_buf_ioerror(bp, 0); XFS_STATS_INC(btp->bt_mount, xb_get); trace_xfs_buf_get(bp, flags, _RET_IP_); *bpp = bp; return 0; out_put_perag: if (pag) xfs_perag_put(pag); return error; } int _xfs_buf_read( struct xfs_buf *bp) { ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL); bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE); bp->b_flags |= XBF_READ; xfs_buf_submit(bp); return xfs_buf_iowait(bp); } /* * Reverify a buffer found in cache without an attached ->b_ops. * * If the caller passed an ops structure and the buffer doesn't have ops * assigned, set the ops and use it to verify the contents. If verification * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is * already in XBF_DONE state on entry. * * Under normal operations, every in-core buffer is verified on read I/O * completion. There are two scenarios that can lead to in-core buffers without * an assigned ->b_ops. The first is during log recovery of buffers on a V4 * filesystem, though these buffers are purged at the end of recovery. The * other is online repair, which intentionally reads with a NULL buffer ops to * run several verifiers across an in-core buffer in order to establish buffer * type. If repair can't establish that, the buffer will be left in memory * with NULL buffer ops. 
*/ int xfs_buf_reverify( struct xfs_buf *bp, const struct xfs_buf_ops *ops) { ASSERT(bp->b_flags & XBF_DONE); ASSERT(bp->b_error == 0); if (!ops || bp->b_ops) return 0; bp->b_ops = ops; bp->b_ops->verify_read(bp); if (bp->b_error) bp->b_flags &= ~XBF_DONE; return bp->b_error; } int xfs_buf_read_map( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp, const struct xfs_buf_ops *ops, xfs_failaddr_t fa) { struct xfs_buf *bp; int error; ASSERT(!(flags & (XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD))); flags |= XBF_READ; *bpp = NULL; error = xfs_buf_get_map(target, map, nmaps, flags, &bp); if (error) return error; trace_xfs_buf_read(bp, flags, _RET_IP_); if (!(bp->b_flags & XBF_DONE)) { /* Initiate the buffer read and wait. */ XFS_STATS_INC(target->bt_mount, xb_get_read); bp->b_ops = ops; error = _xfs_buf_read(bp); } else { /* Buffer already read; all we need to do is check it. */ error = xfs_buf_reverify(bp, ops); /* We do not want read in the flags */ bp->b_flags &= ~XBF_READ; ASSERT(bp->b_ops != NULL || ops == NULL); } /* * If we've had a read error, then the contents of the buffer are * invalid and should not be used. To ensure that a followup read tries * to pull the buffer from disk again, we clear the XBF_DONE flag and * mark the buffer stale. This ensures that anyone who has a current * reference to the buffer will interpret it's contents correctly and * future cache lookups will also treat it as an empty, uninitialised * buffer. */ if (error) { /* * Check against log shutdown for error reporting because * metadata writeback may require a read first and we need to * report errors in metadata writeback until the log is shut * down. High level transaction read functions already check * against mount shutdown, anyway, so we only need to be * concerned about low level IO interactions here. */ if (!xlog_is_shutdown(target->bt_mount->m_log)) xfs_buf_ioerror_alert(bp, fa); bp->b_flags &= ~XBF_DONE; xfs_buf_stale(bp); xfs_buf_relse(bp); /* bad CRC means corrupted metadata */ if (error == -EFSBADCRC) error = -EFSCORRUPTED; return error; } *bpp = bp; return 0; } /* * If we are not low on memory then do the readahead in a deadlock * safe manner. */ void xfs_buf_readahead_map( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, const struct xfs_buf_ops *ops) { const xfs_buf_flags_t flags = XBF_READ | XBF_ASYNC | XBF_READ_AHEAD; struct xfs_buf *bp; /* * Currently we don't have a good means or justification for performing * xmbuf_map_page asynchronously, so we don't do readahead. */ if (xfs_buftarg_is_mem(target)) return; if (xfs_buf_get_map(target, map, nmaps, flags | XBF_TRYLOCK, &bp)) return; trace_xfs_buf_readahead(bp, 0, _RET_IP_); if (bp->b_flags & XBF_DONE) { xfs_buf_reverify(bp, ops); xfs_buf_relse(bp); return; } XFS_STATS_INC(target->bt_mount, xb_get_read); bp->b_ops = ops; bp->b_flags &= ~(XBF_WRITE | XBF_DONE); bp->b_flags |= flags; percpu_counter_inc(&target->bt_readahead_count); xfs_buf_submit(bp); } /* * Read an uncached buffer from disk. Allocates and returns a locked * buffer containing the disk contents or nothing. Uncached buffers always have * a cache index of XFS_BUF_DADDR_NULL so we can easily determine if the buffer * is cached or uncached during fault diagnosis. 
*/ int xfs_buf_read_uncached( struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { struct xfs_buf *bp; int error; *bpp = NULL; error = xfs_buf_get_uncached(target, numblks, &bp); if (error) return error; /* set up the buffer for a read IO */ ASSERT(bp->b_map_count == 1); bp->b_rhash_key = XFS_BUF_DADDR_NULL; bp->b_maps[0].bm_bn = daddr; bp->b_flags |= XBF_READ; bp->b_ops = ops; xfs_buf_submit(bp); error = xfs_buf_iowait(bp); if (error) { xfs_buf_relse(bp); return error; } *bpp = bp; return 0; } int xfs_buf_get_uncached( struct xfs_buftarg *target, size_t numblks, struct xfs_buf **bpp) { int error; DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); error = xfs_buf_alloc(target, &map, 1, 0, bpp); if (!error) trace_xfs_buf_get_uncached(*bpp, _RET_IP_); return error; } /* * Increment reference count on buffer, to hold the buffer concurrently * with another thread which may release (free) the buffer asynchronously. * Must hold the buffer already to call this function. */ void xfs_buf_hold( struct xfs_buf *bp) { trace_xfs_buf_hold(bp, _RET_IP_); lockref_get(&bp->b_lockref); } static void xfs_buf_destroy( struct xfs_buf *bp) { ASSERT(__lockref_is_dead(&bp->b_lockref)); ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); if (!xfs_buf_is_uncached(bp)) { rhashtable_remove_fast(&bp->b_target->bt_hash, &bp->b_rhash_head, xfs_buf_hash_params); if (bp->b_pag) xfs_perag_put(bp->b_pag); } xfs_buf_free(bp); } /* * Release a hold on the specified buffer. */ void xfs_buf_rele( struct xfs_buf *bp) { trace_xfs_buf_rele(bp, _RET_IP_); if (lockref_put_or_lock(&bp->b_lockref)) return; if (!--bp->b_lockref.count) { if (xfs_buf_is_uncached(bp) || !atomic_read(&bp->b_lru_ref)) goto kill; list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru); } spin_unlock(&bp->b_lockref.lock); return; kill: lockref_mark_dead(&bp->b_lockref); list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru); spin_unlock(&bp->b_lockref.lock); xfs_buf_destroy(bp); } /* * Lock a buffer object, if it is not already locked. * * If we come across a stale, pinned, locked buffer, we know that we are * being asked to lock a buffer that has been reallocated. Because it is * pinned, we know that the log has not been pushed to disk and hence it * will still be locked. Rather than continuing to have trylock attempts * fail until someone else pushes the log, push it ourselves before * returning. This means that the xfsaild will not get stuck trying * to push on stale inode buffers. */ int xfs_buf_trylock( struct xfs_buf *bp) { int locked; locked = down_trylock(&bp->b_sema) == 0; if (locked) trace_xfs_buf_trylock(bp, _RET_IP_); else trace_xfs_buf_trylock_fail(bp, _RET_IP_); return locked; } /* * Lock a buffer object. * * If we come across a stale, pinned, locked buffer, we know that we * are being asked to lock a buffer that has been reallocated. Because * it is pinned, we know that the log has not been pushed to disk and * hence it will still be locked. Rather than sleeping until someone * else pushes the log, push it ourselves before trying to get the lock. 
*/ void xfs_buf_lock( struct xfs_buf *bp) { trace_xfs_buf_lock(bp, _RET_IP_); if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) xfs_log_force(bp->b_mount, 0); down(&bp->b_sema); trace_xfs_buf_lock_done(bp, _RET_IP_); } void xfs_buf_unlock( struct xfs_buf *bp) { ASSERT(xfs_buf_islocked(bp)); up(&bp->b_sema); trace_xfs_buf_unlock(bp, _RET_IP_); } STATIC void xfs_buf_wait_unpin( struct xfs_buf *bp) { DECLARE_WAITQUEUE (wait, current); if (atomic_read(&bp->b_pin_count) == 0) return; add_wait_queue(&bp->b_waiters, &wait); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (atomic_read(&bp->b_pin_count) == 0) break; io_schedule(); } remove_wait_queue(&bp->b_waiters, &wait); set_current_state(TASK_RUNNING); } static void xfs_buf_ioerror_alert_ratelimited( struct xfs_buf *bp) { static unsigned long lasttime; static struct xfs_buftarg *lasttarg; if (bp->b_target != lasttarg || time_after(jiffies, (lasttime + 5*HZ))) { lasttime = jiffies; xfs_buf_ioerror_alert(bp, __this_address); } lasttarg = bp->b_target; } /* * Account for this latest trip around the retry handler, and decide if * we've failed enough times to constitute a permanent failure. */ static bool xfs_buf_ioerror_permanent( struct xfs_buf *bp, struct xfs_error_cfg *cfg) { struct xfs_mount *mp = bp->b_mount; if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && ++bp->b_retries > cfg->max_retries) return true; if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time)) return true; /* At unmount we may treat errors differently */ if (xfs_is_unmounting(mp) && mp->m_fail_unmount) return true; return false; } /* * On a sync write or shutdown we just want to stale the buffer and let the * caller handle the error in bp->b_error appropriately. * * If the write was asynchronous then no one will be looking for the error. If * this is the first failure of this type, clear the error state and write the * buffer out again. This means we always retry an async write failure at least * once, but we also need to set the buffer up to behave correctly now for * repeated failures. * * If we get repeated async write failures, then we take action according to the * error configuration we have been set up to use. * * Returns true if this function took care of error handling and the caller must * not touch the buffer again. Return false if the caller should proceed with * normal I/O completion handling. */ static bool xfs_buf_ioend_handle_error( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; struct xfs_error_cfg *cfg; struct xfs_log_item *lip; /* * If we've already shutdown the journal because of I/O errors, there's * no point in giving this a retry. */ if (xlog_is_shutdown(mp->m_log)) goto out_stale; xfs_buf_ioerror_alert_ratelimited(bp); /* * We're not going to bother about retrying this during recovery. * One strike! */ if (bp->b_flags & _XBF_LOGRECOVERY) { xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); return false; } /* * Synchronous writes will have callers process the error. 
*/ if (!(bp->b_flags & XBF_ASYNC)) goto out_stale; trace_xfs_buf_iodone_async(bp, _RET_IP_); cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); if (bp->b_last_error != bp->b_error || !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) { bp->b_last_error = bp->b_error; if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && !bp->b_first_retry_time) bp->b_first_retry_time = jiffies; goto resubmit; } /* * Permanent error - we need to trigger a shutdown if we haven't already * to indicate that inconsistency will result from this action. */ if (xfs_buf_ioerror_permanent(bp, cfg)) { xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); goto out_stale; } /* Still considered a transient error. Caller will schedule retries. */ list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { set_bit(XFS_LI_FAILED, &lip->li_flags); clear_bit(XFS_LI_FLUSHING, &lip->li_flags); } xfs_buf_ioerror(bp, 0); xfs_buf_relse(bp); return true; resubmit: xfs_buf_ioerror(bp, 0); bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL); reinit_completion(&bp->b_iowait); xfs_buf_submit(bp); return true; out_stale: xfs_buf_stale(bp); bp->b_flags |= XBF_DONE; bp->b_flags &= ~XBF_WRITE; trace_xfs_buf_error_relse(bp, _RET_IP_); return false; } /* returns false if the caller needs to resubmit the I/O, else true */ static bool __xfs_buf_ioend( struct xfs_buf *bp) { trace_xfs_buf_iodone(bp, _RET_IP_); if (bp->b_flags & XBF_READ) { if (!bp->b_error && is_vmalloc_addr(bp->b_addr)) invalidate_kernel_vmap_range(bp->b_addr, roundup(BBTOB(bp->b_length), PAGE_SIZE)); if (!bp->b_error && bp->b_ops) bp->b_ops->verify_read(bp); if (!bp->b_error) bp->b_flags |= XBF_DONE; if (bp->b_flags & XBF_READ_AHEAD) percpu_counter_dec(&bp->b_target->bt_readahead_count); } else { if (!bp->b_error) { bp->b_flags &= ~XBF_WRITE_FAIL; bp->b_flags |= XBF_DONE; } if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp)) return false; /* clear the retry state */ bp->b_last_error = 0; bp->b_retries = 0; bp->b_first_retry_time = 0; /* * Note that for things like remote attribute buffers, there may * not be a buffer log item here, so processing the buffer log * item must remain optional. */ if (bp->b_log_item) xfs_buf_item_done(bp); if (bp->b_iodone) bp->b_iodone(bp); } bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD | _XBF_LOGRECOVERY); return true; } static void xfs_buf_ioend( struct xfs_buf *bp) { if (!__xfs_buf_ioend(bp)) return; if (bp->b_flags & XBF_ASYNC) xfs_buf_relse(bp); else complete(&bp->b_iowait); } static void xfs_buf_ioend_work( struct work_struct *work) { struct xfs_buf *bp = container_of(work, struct xfs_buf, b_ioend_work); if (__xfs_buf_ioend(bp)) xfs_buf_relse(bp); } void __xfs_buf_ioerror( struct xfs_buf *bp, int error, xfs_failaddr_t failaddr) { ASSERT(error <= 0 && error >= -1000); bp->b_error = error; trace_xfs_buf_ioerror(bp, error, failaddr); } void xfs_buf_ioerror_alert( struct xfs_buf *bp, xfs_failaddr_t func) { xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error", "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d", func, (uint64_t)xfs_buf_daddr(bp), bp->b_length, -bp->b_error); } /* * To simulate an I/O failure, the buffer must be locked and held with at least * two references. * * The buf item reference is dropped via ioend processing. The second reference * is owned by the caller and is dropped on I/O completion if the buffer is * XBF_ASYNC. 
*/ void xfs_buf_ioend_fail( struct xfs_buf *bp) { bp->b_flags &= ~XBF_DONE; xfs_buf_stale(bp); xfs_buf_ioerror(bp, -EIO); xfs_buf_ioend(bp); } int xfs_bwrite( struct xfs_buf *bp) { int error; ASSERT(xfs_buf_islocked(bp)); bp->b_flags |= XBF_WRITE; bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_DONE); xfs_buf_submit(bp); error = xfs_buf_iowait(bp); if (error) xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); return error; } static void xfs_buf_bio_end_io( struct bio *bio) { struct xfs_buf *bp = bio->bi_private; if (bio->bi_status) xfs_buf_ioerror(bp, blk_status_to_errno(bio->bi_status)); else if ((bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) && XFS_TEST_ERROR(bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) xfs_buf_ioerror(bp, -EIO); if (bp->b_flags & XBF_ASYNC) { INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work); queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work); } else { complete(&bp->b_iowait); } bio_put(bio); } static inline blk_opf_t xfs_buf_bio_op( struct xfs_buf *bp) { blk_opf_t op; if (bp->b_flags & XBF_WRITE) { op = REQ_OP_WRITE; } else { op = REQ_OP_READ; if (bp->b_flags & XBF_READ_AHEAD) op |= REQ_RAHEAD; } return op | REQ_META; } static void xfs_buf_submit_bio( struct xfs_buf *bp) { unsigned int len = BBTOB(bp->b_length); unsigned int nr_vecs = bio_add_max_vecs(bp->b_addr, len); unsigned int map = 0; struct blk_plug plug; struct bio *bio; bio = bio_alloc(bp->b_target->bt_bdev, nr_vecs, xfs_buf_bio_op(bp), GFP_NOIO); if (is_vmalloc_addr(bp->b_addr)) bio_add_vmalloc(bio, bp->b_addr, len); else bio_add_virt_nofail(bio, bp->b_addr, len); bio->bi_private = bp; bio->bi_end_io = xfs_buf_bio_end_io; /* * If there is more than one map segment, split out a new bio for each * map except of the last one. The last map is handled by the * remainder of the original bio outside the loop. */ blk_start_plug(&plug); for (map = 0; map < bp->b_map_count - 1; map++) { struct bio *split; split = bio_split(bio, bp->b_maps[map].bm_len, GFP_NOFS, &fs_bio_set); split->bi_iter.bi_sector = bp->b_maps[map].bm_bn; bio_chain(split, bio); submit_bio(split); } bio->bi_iter.bi_sector = bp->b_maps[map].bm_bn; submit_bio(bio); blk_finish_plug(&plug); } /* * Wait for I/O completion of a sync buffer and return the I/O error code. */ static int xfs_buf_iowait( struct xfs_buf *bp) { ASSERT(!(bp->b_flags & XBF_ASYNC)); do { trace_xfs_buf_iowait(bp, _RET_IP_); wait_for_completion(&bp->b_iowait); trace_xfs_buf_iowait_done(bp, _RET_IP_); } while (!__xfs_buf_ioend(bp)); return bp->b_error; } /* * Run the write verifier callback function if it exists. If this fails, mark * the buffer with an error and do not dispatch the I/O. */ static bool xfs_buf_verify_write( struct xfs_buf *bp) { if (bp->b_ops) { bp->b_ops->verify_write(bp); if (bp->b_error) return false; } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) { /* * Non-crc filesystems don't attach verifiers during log * recovery, so don't warn for such filesystems. */ if (xfs_has_crc(bp->b_mount)) { xfs_warn(bp->b_mount, "%s: no buf ops on daddr 0x%llx len %d", __func__, xfs_buf_daddr(bp), bp->b_length); xfs_hex_dump(bp->b_addr, XFS_CORRUPTION_DUMP_LEN); dump_stack(); } } return true; } /* * Buffer I/O submission path, read or write. Asynchronous submission transfers * the buffer lock ownership and the current reference to the IO. It is not * safe to reference the buffer after a call to this function unless the caller * holds an additional reference itself. 
*/ static void xfs_buf_submit( struct xfs_buf *bp) { trace_xfs_buf_submit(bp, _RET_IP_); ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); /* * On log shutdown we stale and complete the buffer immediately. We can * be called to read the superblock before the log has been set up, so * be careful checking the log state. * * Checking the mount shutdown state here can result in the log tail * moving inappropriately on disk as the log may not yet be shut down. * i.e. failing this buffer on mount shutdown can remove it from the AIL * and move the tail of the log forwards without having written this * buffer to disk. This corrupts the log tail state in memory, and * because the log may not be shut down yet, it can then be propagated * to disk before the log is shutdown. Hence we check log shutdown * state here rather than mount state to avoid corrupting the log tail * on shutdown. */ if (bp->b_mount->m_log && xlog_is_shutdown(bp->b_mount->m_log)) { xfs_buf_ioend_fail(bp); return; } if (bp->b_flags & XBF_WRITE) xfs_buf_wait_unpin(bp); /* * Make sure we capture only current IO errors rather than stale errors * left over from previous use of the buffer (e.g. failed readahead). */ bp->b_error = 0; if ((bp->b_flags & XBF_WRITE) && !xfs_buf_verify_write(bp)) { xfs_force_shutdown(bp->b_mount, SHUTDOWN_CORRUPT_INCORE); xfs_buf_ioend(bp); return; } /* In-memory targets are directly mapped, no I/O required. */ if (xfs_buftarg_is_mem(bp->b_target)) { xfs_buf_ioend(bp); return; } xfs_buf_submit_bio(bp); } /* * Log a message about and stale a buffer that a caller has decided is corrupt. * * This function should be called for the kinds of metadata corruption that * cannot be detect from a verifier, such as incorrect inter-block relationship * data. Do /not/ call this function from a verifier function. * * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will * be marked stale, but b_error will not be set. The caller is responsible for * releasing the buffer or fixing it. */ void __xfs_buf_mark_corrupt( struct xfs_buf *bp, xfs_failaddr_t fa) { ASSERT(bp->b_flags & XBF_DONE); xfs_buf_corruption_error(bp, fa); xfs_buf_stale(bp); } /* * Handling of buffer targets (buftargs). */ /* * Wait for any bufs with callbacks that have been submitted but have not yet * returned. These buffers will have an elevated hold count, so wait on those * while freeing all the buffers only held by the LRU. */ static enum lru_status xfs_buftarg_drain_rele( struct list_head *item, struct list_lru_one *lru, void *arg) { struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); struct list_head *dispose = arg; if (!spin_trylock(&bp->b_lockref.lock)) return LRU_SKIP; if (bp->b_lockref.count > 0) { /* need to wait, so skip it this pass */ spin_unlock(&bp->b_lockref.lock); trace_xfs_buf_drain_buftarg(bp, _RET_IP_); return LRU_SKIP; } lockref_mark_dead(&bp->b_lockref); list_lru_isolate_move(lru, item, dispose); spin_unlock(&bp->b_lockref.lock); return LRU_REMOVED; } /* * Wait for outstanding I/O on the buftarg to complete. */ void xfs_buftarg_wait( struct xfs_buftarg *btp) { /* * First wait for all in-flight readahead buffers to be released. This is * critical as new buffers do not make the LRU until they are released. * * Next, flush the buffer workqueue to ensure all completion processing * has finished. Just waiting on buffer locks is not sufficient for * async IO as the reference count held over IO is not released until * after the buffer lock is dropped. 
Hence we need to ensure here that * all reference counts have been dropped before we start walking the * LRU list. */ while (percpu_counter_sum(&btp->bt_readahead_count)) delay(100); flush_workqueue(btp->bt_mount->m_buf_workqueue); } void xfs_buftarg_drain( struct xfs_buftarg *btp) { LIST_HEAD(dispose); int loop = 0; bool write_fail = false; xfs_buftarg_wait(btp); /* loop until there is nothing left on the lru list. */ while (list_lru_count(&btp->bt_lru)) { list_lru_walk(&btp->bt_lru, xfs_buftarg_drain_rele, &dispose, LONG_MAX); while (!list_empty(&dispose)) { struct xfs_buf *bp; bp = list_first_entry(&dispose, struct xfs_buf, b_lru); list_del_init(&bp->b_lru); if (bp->b_flags & XBF_WRITE_FAIL) { write_fail = true; xfs_buf_alert_ratelimited(bp, "XFS: Corruption Alert", "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!", (long long)xfs_buf_daddr(bp)); } xfs_buf_destroy(bp); } if (loop++ != 0) delay(100); } /* * If one or more failed buffers were freed, that means dirty metadata * was thrown away. This should only ever happen after I/O completion * handling has elevated I/O error(s) to permanent failures and shuts * down the journal. */ if (write_fail) { ASSERT(xlog_is_shutdown(btp->bt_mount->m_log)); xfs_alert(btp->bt_mount, "Please run xfs_repair to determine the extent of the problem."); } } static enum lru_status xfs_buftarg_isolate( struct list_head *item, struct list_lru_one *lru, void *arg) { struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); struct list_head *dispose = arg; /* * We are inverting the lru lock vs bp->b_lockref.lock order here, so * use a trylock. If we fail to get the lock, just skip the buffer. */ if (!spin_trylock(&bp->b_lockref.lock)) return LRU_SKIP; /* * If the buffer is in use, remove it from the LRU for now. We can't * free it while someone is using it, and we should also not count * eviction passed for it, just as if it hadn't been added to the LRU * yet. */ if (bp->b_lockref.count > 0) { list_lru_isolate(lru, &bp->b_lru); spin_unlock(&bp->b_lockref.lock); return LRU_REMOVED; } /* * Decrement the b_lru_ref count unless the value is already * zero. If the value is already zero, we need to reclaim the * buffer, otherwise it gets another trip through the LRU. 
*/ if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) { spin_unlock(&bp->b_lockref.lock); return LRU_ROTATE; } lockref_mark_dead(&bp->b_lockref); list_lru_isolate_move(lru, item, dispose); spin_unlock(&bp->b_lockref.lock); return LRU_REMOVED; } static unsigned long xfs_buftarg_shrink_scan( struct shrinker *shrink, struct shrink_control *sc) { struct xfs_buftarg *btp = shrink->private_data; LIST_HEAD(dispose); unsigned long freed; freed = list_lru_shrink_walk(&btp->bt_lru, sc, xfs_buftarg_isolate, &dispose); while (!list_empty(&dispose)) { struct xfs_buf *bp; bp = list_first_entry(&dispose, struct xfs_buf, b_lru); list_del_init(&bp->b_lru); xfs_buf_destroy(bp); } return freed; } static unsigned long xfs_buftarg_shrink_count( struct shrinker *shrink, struct shrink_control *sc) { struct xfs_buftarg *btp = shrink->private_data; return list_lru_shrink_count(&btp->bt_lru, sc); } void xfs_destroy_buftarg( struct xfs_buftarg *btp) { shrinker_free(btp->bt_shrinker); ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0); percpu_counter_destroy(&btp->bt_readahead_count); list_lru_destroy(&btp->bt_lru); rhashtable_destroy(&btp->bt_hash); } void xfs_free_buftarg( struct xfs_buftarg *btp) { xfs_destroy_buftarg(btp); fs_put_dax(btp->bt_daxdev, btp->bt_mount); /* the main block device is closed by kill_block_super */ if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev) bdev_fput(btp->bt_file); kfree(btp); } /* * Configure this buffer target for hardware-assisted atomic writes if the * underlying block device supports is congruent with the filesystem geometry. */ static inline void xfs_configure_buftarg_atomic_writes( struct xfs_buftarg *btp) { struct xfs_mount *mp = btp->bt_mount; unsigned int min_bytes, max_bytes; min_bytes = bdev_atomic_write_unit_min_bytes(btp->bt_bdev); max_bytes = bdev_atomic_write_unit_max_bytes(btp->bt_bdev); /* * Ignore atomic write geometry that is nonsense or doesn't even cover * a single fsblock. */ if (min_bytes > max_bytes || min_bytes > mp->m_sb.sb_blocksize || max_bytes < mp->m_sb.sb_blocksize) { min_bytes = 0; max_bytes = 0; } btp->bt_awu_min = min_bytes; btp->bt_awu_max = max_bytes; } /* Configure a buffer target that abstracts a block device. */ int xfs_configure_buftarg( struct xfs_buftarg *btp, unsigned int sectorsize, xfs_rfsblock_t nr_blocks) { struct xfs_mount *mp = btp->bt_mount; if (btp->bt_bdev) { int error; error = bdev_validate_blocksize(btp->bt_bdev, sectorsize); if (error) { xfs_warn(mp, "Cannot use blocksize %u on device %pg, err %d", sectorsize, btp->bt_bdev, error); return -EINVAL; } if (bdev_can_atomic_write(btp->bt_bdev)) xfs_configure_buftarg_atomic_writes(btp); } btp->bt_meta_sectorsize = sectorsize; btp->bt_meta_sectormask = sectorsize - 1; /* m_blkbb_log is not set up yet */ btp->bt_nr_sectors = nr_blocks << (mp->m_sb.sb_blocklog - BBSHIFT); return 0; } int xfs_init_buftarg( struct xfs_buftarg *btp, size_t logical_sectorsize, const char *descr) { /* The maximum size of the buftarg is only known once the sb is read. */ btp->bt_nr_sectors = XFS_BUF_DADDR_MAX; /* Set up device logical sector size mask */ btp->bt_logical_sectorsize = logical_sectorsize; btp->bt_logical_sectormask = logical_sectorsize - 1; /* * Buffer IO error rate limiting. Limit it to no more than 10 messages * per 30 seconds so as to not spam logs too much on repeated errors. 
*/ ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ, DEFAULT_RATELIMIT_BURST); if (rhashtable_init(&btp->bt_hash, &xfs_buf_hash_params)) return -ENOMEM; if (list_lru_init(&btp->bt_lru)) goto out_destroy_hash; if (percpu_counter_init(&btp->bt_readahead_count, 0, GFP_KERNEL)) goto out_destroy_lru; btp->bt_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", descr); if (!btp->bt_shrinker) goto out_destroy_io_count; btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count; btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan; btp->bt_shrinker->private_data = btp; shrinker_register(btp->bt_shrinker); return 0; out_destroy_io_count: percpu_counter_destroy(&btp->bt_readahead_count); out_destroy_lru: list_lru_destroy(&btp->bt_lru); out_destroy_hash: rhashtable_destroy(&btp->bt_hash); return -ENOMEM; } struct xfs_buftarg * xfs_alloc_buftarg( struct xfs_mount *mp, struct file *bdev_file) { struct xfs_buftarg *btp; const struct dax_holder_operations *ops = NULL; int error; #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE) ops = &xfs_dax_holder_operations; #endif btp = kzalloc_obj(*btp, GFP_KERNEL | __GFP_NOFAIL); btp->bt_mount = mp; btp->bt_file = bdev_file; btp->bt_bdev = file_bdev(bdev_file); btp->bt_dev = btp->bt_bdev->bd_dev; btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off, mp, ops); /* * Flush and invalidate all devices' pagecaches before reading any * metadata because XFS doesn't use the bdev pagecache. */ error = sync_blockdev(btp->bt_bdev); if (error) goto error_free; /* * When allocating the buftargs we have not yet read the super block and * thus don't know the file system sector size yet. */ btp->bt_meta_sectorsize = bdev_logical_block_size(btp->bt_bdev); btp->bt_meta_sectormask = btp->bt_meta_sectorsize - 1; error = xfs_init_buftarg(btp, btp->bt_meta_sectorsize, mp->m_super->s_id); if (error) goto error_free; return btp; error_free: fs_put_dax(btp->bt_daxdev, mp); kfree(btp); return ERR_PTR(error); } static inline void xfs_buf_list_del( struct xfs_buf *bp) { list_del_init(&bp->b_list); wake_up_var(&bp->b_list); } /* * Cancel a delayed write list. * * Remove each buffer from the list, clear the delwri queue flag and drop the * associated buffer reference. */ void xfs_buf_delwri_cancel( struct list_head *list) { struct xfs_buf *bp; while (!list_empty(list)) { bp = list_first_entry(list, struct xfs_buf, b_list); xfs_buf_lock(bp); bp->b_flags &= ~_XBF_DELWRI_Q; xfs_buf_list_del(bp); xfs_buf_relse(bp); } } /* * Add a buffer to the delayed write list. * * This queues a buffer for writeout if it hasn't already been. Note that * neither this routine nor the buffer list submission functions perform * any internal synchronization. It is expected that the lists are thread-local * to the callers. * * Returns true if we queued up the buffer, or false if it already had * been on the buffer list. */ bool xfs_buf_delwri_queue( struct xfs_buf *bp, struct list_head *list) { ASSERT(xfs_buf_islocked(bp)); ASSERT(!(bp->b_flags & XBF_READ)); /* * If the buffer is already marked delwri it already is queued up * by someone else for imediate writeout. Just ignore it in that * case. */ if (bp->b_flags & _XBF_DELWRI_Q) { trace_xfs_buf_delwri_queued(bp, _RET_IP_); return false; } trace_xfs_buf_delwri_queue(bp, _RET_IP_); /* * If a buffer gets written out synchronously or marked stale while it * is on a delwri list we lazily remove it. To do this, the other party * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone. 
* It remains referenced and on the list. In a rare corner case it * might get readded to a delwri list after the synchronous writeout, in * which case we need just need to re-add the flag here. */ bp->b_flags |= _XBF_DELWRI_Q; if (list_empty(&bp->b_list)) { xfs_buf_hold(bp); list_add_tail(&bp->b_list, list); } return true; } /* * Queue a buffer to this delwri list as part of a data integrity operation. * If the buffer is on any other delwri list, we'll wait for that to clear * so that the caller can submit the buffer for IO and wait for the result. * Callers must ensure the buffer is not already on the list. */ void xfs_buf_delwri_queue_here( struct xfs_buf *bp, struct list_head *buffer_list) { /* * We need this buffer to end up on the /caller's/ delwri list, not any * old list. This can happen if the buffer is marked stale (which * clears DELWRI_Q) after the AIL queues the buffer to its list but * before the AIL has a chance to submit the list. */ while (!list_empty(&bp->b_list)) { xfs_buf_unlock(bp); wait_var_event(&bp->b_list, list_empty(&bp->b_list)); xfs_buf_lock(bp); } ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); xfs_buf_delwri_queue(bp, buffer_list); } /* * Compare function is more complex than it needs to be because * the return value is only 32 bits and we are doing comparisons * on 64 bit values */ static int xfs_buf_cmp( void *priv, const struct list_head *a, const struct list_head *b) { struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list); struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); xfs_daddr_t diff; diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn; if (diff < 0) return -1; if (diff > 0) return 1; return 0; } static bool xfs_buf_delwri_submit_prep( struct xfs_buf *bp) { /* * Someone else might have written the buffer synchronously or marked it * stale in the meantime. In that case only the _XBF_DELWRI_Q flag got * cleared, and we have to drop the reference and remove it from the * list here. */ if (!(bp->b_flags & _XBF_DELWRI_Q)) { xfs_buf_list_del(bp); xfs_buf_relse(bp); return false; } trace_xfs_buf_delwri_split(bp, _RET_IP_); bp->b_flags &= ~_XBF_DELWRI_Q; bp->b_flags |= XBF_WRITE; return true; } /* * Write out a buffer list asynchronously. * * This will take the @buffer_list, write all non-locked and non-pinned buffers * out and not wait for I/O completion on any of the buffers. This interface * is only safely useable for callers that can track I/O completion by higher * level means, e.g. AIL pushing as the @buffer_list is consumed in this * function. * * Note: this function will skip buffers it would block on, and in doing so * leaves them on @buffer_list so they can be retried on a later pass. As such, * it is up to the caller to ensure that the buffer list is fully submitted or * cancelled appropriately when they are finished with the list. Failure to * cancel or resubmit the list until it is empty will result in leaked buffers * at unmount time. */ int xfs_buf_delwri_submit_nowait( struct list_head *buffer_list) { struct xfs_buf *bp, *n; int pinned = 0; struct blk_plug plug; list_sort(NULL, buffer_list, xfs_buf_cmp); blk_start_plug(&plug); list_for_each_entry_safe(bp, n, buffer_list, b_list) { if (!xfs_buf_trylock(bp)) continue; if (xfs_buf_ispinned(bp)) { xfs_buf_unlock(bp); pinned++; continue; } if (!xfs_buf_delwri_submit_prep(bp)) continue; bp->b_flags |= XBF_ASYNC; xfs_buf_list_del(bp); xfs_buf_submit(bp); } blk_finish_plug(&plug); return pinned; } /* * Write out a buffer list synchronously. 
* * This will take the @buffer_list, write all buffers out and wait for I/O * completion on all of the buffers. @buffer_list is consumed by the function, * so callers must have some other way of tracking buffers if they require such * functionality. */ int xfs_buf_delwri_submit( struct list_head *buffer_list) { LIST_HEAD (wait_list); int error = 0, error2; struct xfs_buf *bp, *n; struct blk_plug plug; list_sort(NULL, buffer_list, xfs_buf_cmp); blk_start_plug(&plug); list_for_each_entry_safe(bp, n, buffer_list, b_list) { xfs_buf_lock(bp); if (!xfs_buf_delwri_submit_prep(bp)) continue; bp->b_flags &= ~XBF_ASYNC; list_move_tail(&bp->b_list, &wait_list); xfs_buf_submit(bp); } blk_finish_plug(&plug); /* Wait for IO to complete. */ while (!list_empty(&wait_list)) { bp = list_first_entry(&wait_list, struct xfs_buf, b_list); xfs_buf_list_del(bp); /* * Wait on the locked buffer, check for errors and unlock and * release the delwri queue reference. */ error2 = xfs_buf_iowait(bp); xfs_buf_relse(bp); if (!error) error = error2; } return error; } void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) { /* * Set the lru reference count to 0 based on the error injection tag. * This allows userspace to disrupt buffer caching for debug/testing * purposes. */ if (XFS_TEST_ERROR(bp->b_mount, XFS_ERRTAG_BUF_LRU_REF)) lru_ref = 0; atomic_set(&bp->b_lru_ref, lru_ref); } /* * Verify an on-disk magic value against the magic value specified in the * verifier structure. The verifier magic is in disk byte order so the caller is * expected to pass the value directly from disk. */ bool xfs_verify_magic( struct xfs_buf *bp, __be32 dmagic) { struct xfs_mount *mp = bp->b_mount; int idx; idx = xfs_has_crc(mp); if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])) return false; return dmagic == bp->b_ops->magic[idx]; } /* * Verify an on-disk magic value against the magic value specified in the * verifier structure. The verifier magic is in disk byte order so the caller is * expected to pass the value directly from disk. */ bool xfs_verify_magic16( struct xfs_buf *bp, __be16 dmagic) { struct xfs_mount *mp = bp->b_mount; int idx; idx = xfs_has_crc(mp); if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])) return false; return dmagic == bp->b_ops->magic16[idx]; } |
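The long comment above xfs_buf_alloc_backing_mem() describes a three-way size-class choice for buffer memory: a kmalloc allocation for power-of-two sizes below a page, an optimistic single high-order folio for larger power-of-two sizes, and vmalloc as the discontiguous fallback. The following is a minimal userspace sketch of that decision only, assuming a 4 KiB page size for illustration; the types and names are stand-ins, not the kernel's.

/*
 * Userspace model of the size-class decision made by
 * xfs_buf_alloc_backing_mem(): kmalloc for power-of-two sizes below the
 * page size, a single (possibly high-order) folio otherwise, and vmalloc
 * directly for larger non-power-of-two sizes (e.g. oddly sized xattr
 * buffers). MODEL_PAGE_SIZE is an assumption for illustration only.
 */
#include <stdbool.h>
#include <stdio.h>

#define MODEL_PAGE_SIZE 4096u

enum backing { BACKING_KMEM, BACKING_FOLIO, BACKING_VMALLOC };

static bool is_pow2(unsigned int v)
{
	return v && !(v & (v - 1));
}

static enum backing pick_backing(unsigned int size)
{
	if (size < MODEL_PAGE_SIZE && is_pow2(size))
		return BACKING_KMEM;	/* slab gives naturally aligned memory */
	if (size > MODEL_PAGE_SIZE && !is_pow2(size))
		return BACKING_VMALLOC;	/* skip the folio attempt entirely */
	return BACKING_FOLIO;		/* contiguous folio; large sizes may still
					 * fall back to vmalloc on failure */
}

int main(void)
{
	static const char *names[] = { "kmalloc", "folio", "vmalloc" };
	unsigned int sizes[] = { 512, 2048, 4096, 8192, 65536, 68000 };

	for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("%6u bytes -> %s\n", sizes[i], names[pick_backing(sizes[i])]);
	return 0;
}

In the real function the folio attempt for sizes above a page additionally drops direct reclaim and retries, so a failed high-order allocation degrades to the vmalloc loop rather than failing the buffer outright.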
| 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Pseudo-driver for the loopback interface. * * Version: @(#)loopback.c 1.0.4b 08/16/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Donald Becker, <becker@scyld.com> * * Alan Cox : Fixed oddments for NET3.014 * Alan Cox : Rejig for NET3.029 snap #3 * Alan Cox : Fixed NET3.029 bugs and sped up * Larry McVoy : Tiny tweak to double performance * Alan Cox : Backed out LMV's tweak - the linux mm * can't take it... * Michael Griffith: Don't bother computing the checksums * on packets received on the loopback * interface. * Alexey Kuznetsov: Potential hang under some extreme * cases removed. */ #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/fs.h> #include <linux/types.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/errno.h> #include <linux/fcntl.h> #include <linux/in.h> #include <linux/uaccess.h> #include <linux/io.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/ethtool.h> #include <net/sch_generic.h> #include <net/sock.h> #include <net/checksum.h> #include <linux/if_ether.h> /* For the statistics structure. */ #include <linux/if_arp.h> /* For ARPHRD_ETHER */ #include <linux/ip.h> #include <linux/tcp.h> #include <linux/percpu.h> #include <linux/net_tstamp.h> #include <net/net_namespace.h> #include <net/netdev_lock.h> #include <linux/u64_stats_sync.h> /* blackhole_netdev - a device used for dsts that are marked expired! * This is global device (instead of per-net-ns) since it's not needed * to be per-ns and gets initialized at boot time. */ struct net_device *blackhole_netdev; EXPORT_SYMBOL(blackhole_netdev); /* The higher levels take care of making this non-reentrant (it's * called with bh's disabled). */ static netdev_tx_t loopback_xmit(struct sk_buff *skb, struct net_device *dev) { int len; skb_tx_timestamp(skb); /* do not fool net_timestamp_check() with various clock bases */ skb_clear_tstamp(skb); skb_orphan(skb); /* Before queueing this packet to __netif_rx(), * make sure dst is refcounted. 
*/ skb_dst_force(skb); skb->protocol = eth_type_trans(skb, dev); len = skb->len; if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) dev_lstats_add(dev, len); return NETDEV_TX_OK; } void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes) { int i; *packets = 0; *bytes = 0; for_each_possible_cpu(i) { const struct pcpu_lstats *lb_stats; u64 tbytes, tpackets; unsigned int start; lb_stats = per_cpu_ptr(dev->lstats, i); do { start = u64_stats_fetch_begin(&lb_stats->syncp); tpackets = u64_stats_read(&lb_stats->packets); tbytes = u64_stats_read(&lb_stats->bytes); } while (u64_stats_fetch_retry(&lb_stats->syncp, start)); *bytes += tbytes; *packets += tpackets; } } EXPORT_SYMBOL(dev_lstats_read); static void loopback_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { u64 packets, bytes; dev_lstats_read(dev, &packets, &bytes); stats->rx_packets = packets; stats->tx_packets = packets; stats->rx_bytes = bytes; stats->tx_bytes = bytes; } static u32 always_on(struct net_device *dev) { return 1; } static const struct ethtool_ops loopback_ethtool_ops = { .get_link = always_on, .get_ts_info = ethtool_op_get_ts_info, }; static int loopback_dev_init(struct net_device *dev) { netdev_lockdep_set_classes(dev); return 0; } static void loopback_dev_free(struct net_device *dev) { dev_net(dev)->loopback_dev = NULL; } static const struct net_device_ops loopback_ops = { .ndo_init = loopback_dev_init, .ndo_start_xmit = loopback_xmit, .ndo_get_stats64 = loopback_get_stats64, .ndo_set_mac_address = eth_mac_addr, }; static void gen_lo_setup(struct net_device *dev, unsigned int mtu, const struct ethtool_ops *eth_ops, const struct header_ops *hdr_ops, const struct net_device_ops *dev_ops, void (*dev_destructor)(struct net_device *dev)) { dev->mtu = mtu; dev->hard_header_len = ETH_HLEN; /* 14 */ dev->min_header_len = ETH_HLEN; /* 14 */ dev->addr_len = ETH_ALEN; /* 6 */ dev->type = ARPHRD_LOOPBACK; /* 0x0001*/ dev->flags = IFF_LOOPBACK; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; dev->lltx = true; dev->netns_immutable = true; netif_keep_dst(dev); dev->hw_features = NETIF_F_GSO_SOFTWARE; dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE | NETIF_F_HW_CSUM | NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | NETIF_F_VLAN_CHALLENGED | NETIF_F_LOOPBACK; dev->ethtool_ops = eth_ops; dev->header_ops = hdr_ops; dev->netdev_ops = dev_ops; dev->needs_free_netdev = true; dev->pcpu_stat_type = NETDEV_PCPU_STAT_LSTATS; dev->priv_destructor = dev_destructor; netif_set_tso_max_size(dev, GSO_MAX_SIZE); } /* The loopback device is special. There is only one instance * per network namespace. */ static void loopback_setup(struct net_device *dev) { gen_lo_setup(dev, (64 * 1024), &loopback_ethtool_ops, ð_header_ops, &loopback_ops, loopback_dev_free); } /* Setup and register the loopback device. 
*/ static __net_init int loopback_net_init(struct net *net) { struct net_device *dev; int err; err = -ENOMEM; dev = alloc_netdev(0, "lo", NET_NAME_PREDICTABLE, loopback_setup); if (!dev) goto out; dev_net_set(dev, net); err = register_netdev(dev); if (err) goto out_free_netdev; BUG_ON(dev->ifindex != LOOPBACK_IFINDEX); net->loopback_dev = dev; return 0; out_free_netdev: free_netdev(dev); out: if (net_eq(net, &init_net)) panic("loopback: Failed to register netdevice: %d\n", err); return err; } /* Registered in net/core/dev.c */ struct pernet_operations __net_initdata loopback_net_ops = { .init = loopback_net_init, }; /* blackhole netdevice */ static netdev_tx_t blackhole_netdev_xmit(struct sk_buff *skb, struct net_device *dev) { kfree_skb(skb); net_warn_ratelimited("%s(): Dropping skb.\n", __func__); return NETDEV_TX_OK; } static int blackhole_neigh_output(struct neighbour *n, struct sk_buff *skb) { kfree_skb(skb); return 0; } static int blackhole_neigh_construct(struct net_device *dev, struct neighbour *n) { n->output = blackhole_neigh_output; return 0; } static const struct net_device_ops blackhole_netdev_ops = { .ndo_start_xmit = blackhole_netdev_xmit, .ndo_neigh_construct = blackhole_neigh_construct, }; /* This is a dst-dummy device used specifically for invalidated * DSTs and unlike loopback, this is not per-ns. */ static void blackhole_netdev_setup(struct net_device *dev) { gen_lo_setup(dev, ETH_MIN_MTU, NULL, NULL, &blackhole_netdev_ops, NULL); } /* Setup and register the blackhole_netdev. */ static int __init blackhole_netdev_init(void) { blackhole_netdev = alloc_netdev(0, "blackhole_dev", NET_NAME_UNKNOWN, blackhole_netdev_setup); if (!blackhole_netdev) return -ENOMEM; rtnl_net_lock(&init_net); dev_init_scheduler(blackhole_netdev); dev_activate(blackhole_netdev); rtnl_net_unlock(&init_net); blackhole_netdev->flags |= IFF_UP | IFF_RUNNING; return 0; } device_initcall(blackhole_netdev_init); |
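dev_lstats_read() above aggregates per-CPU packet and byte counters using the u64_stats_fetch_begin()/retry() sequence-count protocol so a reader never sees a torn update. Below is a minimal single-threaded sketch of that pattern, assuming a fixed CPU count and a hand-rolled sequence counter purely for illustration; the struct layout and names are not the kernel's.

/*
 * Model of the dev_lstats_read() pattern: each CPU owns its counters
 * behind a sequence count (odd while a writer is mid-update), and the
 * reader retries its snapshot if the sequence changed or was odd.
 */
#include <stdint.h>
#include <stdio.h>

#define NR_MODEL_CPUS 4

struct model_lstats {
	uint64_t packets;
	uint64_t bytes;
	unsigned int seq;	/* even = stable, odd = write in progress */
};

static struct model_lstats cpu_stats[NR_MODEL_CPUS];

static void model_lstats_add(unsigned int cpu, uint64_t len)
{
	struct model_lstats *s = &cpu_stats[cpu];

	s->seq++;		/* begin write (sequence becomes odd) */
	s->packets++;
	s->bytes += len;
	s->seq++;		/* end write (sequence even again) */
}

static void model_lstats_read(uint64_t *packets, uint64_t *bytes)
{
	*packets = 0;
	*bytes = 0;
	for (unsigned int cpu = 0; cpu < NR_MODEL_CPUS; cpu++) {
		const struct model_lstats *s = &cpu_stats[cpu];
		uint64_t tp, tb;
		unsigned int start;

		do {
			start = s->seq;
			tp = s->packets;
			tb = s->bytes;
		} while (start != s->seq || (start & 1));

		*packets += tp;
		*bytes += tb;
	}
}

int main(void)
{
	uint64_t packets, bytes;

	model_lstats_add(0, 1500);
	model_lstats_add(2, 60);
	model_lstats_read(&packets, &bytes);
	printf("packets=%llu bytes=%llu\n",
	       (unsigned long long)packets, (unsigned long long)bytes);
	return 0;
}

Because transmit and receive are the same event on loopback, loopback_get_stats64() simply reports the one set of counters for both directions.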
| 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 | // SPDX-License-Identifier: GPL-2.0 /* * Compatibility functions which bloat the callers too much to make inline. * All of the callers of these functions should be converted to use folios * eventually. */ #include <linux/migrate.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/swap.h> #include "internal.h" void unlock_page(struct page *page) { return folio_unlock(page_folio(page)); } EXPORT_SYMBOL(unlock_page); void end_page_writeback(struct page *page) { return folio_end_writeback(page_folio(page)); } EXPORT_SYMBOL(end_page_writeback); void wait_on_page_writeback(struct page *page) { return folio_wait_writeback(page_folio(page)); } EXPORT_SYMBOL_GPL(wait_on_page_writeback); void mark_page_accessed(struct page *page) { folio_mark_accessed(page_folio(page)); } EXPORT_SYMBOL(mark_page_accessed); void set_page_writeback(struct page *page) { folio_start_writeback(page_folio(page)); } EXPORT_SYMBOL(set_page_writeback); bool set_page_dirty(struct page *page) { return folio_mark_dirty(page_folio(page)); } EXPORT_SYMBOL(set_page_dirty); int set_page_dirty_lock(struct page *page) { return folio_mark_dirty_lock(page_folio(page)); } EXPORT_SYMBOL(set_page_dirty_lock); bool clear_page_dirty_for_io(struct page *page) { return folio_clear_dirty_for_io(page_folio(page)); } EXPORT_SYMBOL(clear_page_dirty_for_io); bool redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) { return folio_redirty_for_writepage(wbc, page_folio(page)); } EXPORT_SYMBOL(redirty_page_for_writepage); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp) { return filemap_add_folio(mapping, page_folio(page), index, gfp); } EXPORT_SYMBOL(add_to_page_cache_lru); noinline struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, fgf_t fgp_flags, gfp_t gfp) { struct folio *folio; folio = __filemap_get_folio(mapping, index, fgp_flags, gfp); if (IS_ERR(folio)) return NULL; return folio_file_page(folio, index); } EXPORT_SYMBOL(pagecache_get_page); |
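pagecache_get_page() above shows the shape of every wrapper in this compatibility file: call the folio-based implementation and translate its convention (an error-encoded pointer) back to the legacy one (NULL on failure). Here is a small userspace sketch of that shim pattern under stated assumptions; the lookup function, struct, and ERR_PTR/IS_ERR stand-ins are illustrative only, not the kernel definitions.

/*
 * Model of the page/folio compatibility shim: the legacy entry point is
 * a thin wrapper over the new API, converting error pointers to NULL.
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct model_folio { int id; };

static inline void *ERR_PTR(long err) { return (void *)err; }
static inline int IS_ERR(const void *p)
{
	return (uintptr_t)p >= (uintptr_t)-4095;
}

/* New-style lookup: returns a folio or an error pointer. */
static struct model_folio *model_get_folio(int index)
{
	static struct model_folio f = { .id = 42 };

	if (index < 0)
		return ERR_PTR(-ENOENT);
	return &f;
}

/* Legacy-style wrapper: NULL on failure, mirroring pagecache_get_page(). */
static struct model_folio *model_get_page(int index)
{
	struct model_folio *folio = model_get_folio(index);

	if (IS_ERR(folio))
		return NULL;
	return folio;
}

int main(void)
{
	printf("hit:  %p\n", (void *)model_get_page(3));
	printf("miss: %p\n", (void *)model_get_page(-1));
	return 0;
}

Callers converted to the folio API can drop the wrapper and handle the error pointer directly, which is why the file notes that these helpers should eventually disappear.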
| 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/options.c * * Copyright (C) 2001 * Brad Boyer (flar@allandria.com) * (C) 2003 Ardis Technologies <roman@ardistech.com> * * Option parsing */ #include <linux/string.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/fs_struct.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/nls.h> #include <linux/mount.h> #include <linux/seq_file.h> #include <linux/slab.h> #include "hfsplus_fs.h" enum { opt_creator, opt_type, opt_umask, opt_uid, opt_gid, opt_part, opt_session, opt_nls, opt_decompose, opt_barrier, opt_force, }; static const struct fs_parameter_spec hfs_param_spec[] = { fsparam_string ("creator", opt_creator), fsparam_string ("type", opt_type), fsparam_u32oct ("umask", opt_umask), fsparam_u32 ("uid", opt_uid), fsparam_u32 ("gid", opt_gid), fsparam_u32 ("part", opt_part), fsparam_u32 ("session", opt_session), fsparam_string ("nls", opt_nls), fsparam_flag_no ("decompose", opt_decompose), fsparam_flag_no ("barrier", opt_barrier), fsparam_flag ("force", opt_force), {} }; /* Initialize an options object to reasonable defaults */ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts) { if (!opts) return; opts->creator = HFSPLUS_DEF_CR_TYPE; opts->type = HFSPLUS_DEF_CR_TYPE; opts->umask = current_umask(); opts->uid = current_uid(); opts->gid = current_gid(); opts->part = -1; opts->session = -1; } /* Parse options from mount. Returns nonzero errno on failure */ int hfsplus_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct hfsplus_sb_info *sbi = fc->s_fs_info; struct fs_parse_result result; int opt; /* * Only the force option is examined during remount, all others * are ignored. 
*/ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE && strncmp(param->key, "force", 5)) return 0; opt = fs_parse(fc, hfs_param_spec, param, &result); if (opt < 0) return opt; switch (opt) { case opt_creator: if (strlen(param->string) != 4) { pr_err("creator requires a 4 character value\n"); return -EINVAL; } memcpy(&sbi->creator, param->string, 4); break; case opt_type: if (strlen(param->string) != 4) { pr_err("type requires a 4 character value\n"); return -EINVAL; } memcpy(&sbi->type, param->string, 4); break; case opt_umask: sbi->umask = (umode_t)result.uint_32; break; case opt_uid: sbi->uid = result.uid; set_bit(HFSPLUS_SB_UID, &sbi->flags); break; case opt_gid: sbi->gid = result.gid; set_bit(HFSPLUS_SB_GID, &sbi->flags); break; case opt_part: sbi->part = result.uint_32; break; case opt_session: sbi->session = result.uint_32; break; case opt_nls: if (sbi->nls) { pr_err("unable to change nls mapping\n"); return -EINVAL; } sbi->nls = load_nls(param->string); if (!sbi->nls) { pr_err("unable to load nls mapping \"%s\"\n", param->string); return -EINVAL; } break; case opt_decompose: if (result.negated) set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); else clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); break; case opt_barrier: if (result.negated) set_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags); else clear_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags); break; case opt_force: set_bit(HFSPLUS_SB_FORCE, &sbi->flags); break; default: return -EINVAL; } return 0; } int hfsplus_show_options(struct seq_file *seq, struct dentry *root) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(root->d_sb); if (sbi->creator != HFSPLUS_DEF_CR_TYPE) seq_show_option_n(seq, "creator", (char *)&sbi->creator, 4); if (sbi->type != HFSPLUS_DEF_CR_TYPE) seq_show_option_n(seq, "type", (char *)&sbi->type, 4); seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask, from_kuid_munged(&init_user_ns, sbi->uid), from_kgid_munged(&init_user_ns, sbi->gid)); if (sbi->part >= 0) seq_printf(seq, ",part=%u", sbi->part); if (sbi->session >= 0) seq_printf(seq, ",session=%u", sbi->session); if (sbi->nls) seq_printf(seq, ",nls=%s", sbi->nls->charset); if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags)) seq_puts(seq, ",nodecompose"); if (test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) seq_puts(seq, ",nobarrier"); return 0; } |
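hfsplus_parse_param() above is table-driven: fs_parse() matches the key against hfs_param_spec[] and returns an enum, and a switch applies the setting to the superblock info. The sketch below models that structure in plain userspace C under stated assumptions; the option table, struct, and helpers are illustrative stand-ins, not the fs_context API.

/*
 * Model of table-driven mount-option parsing: look the key up in a spec
 * table, then dispatch on the resulting enum, rejecting unknown keys.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

enum { OPT_UMASK, OPT_PART, OPT_FORCE, OPT_UNKNOWN };

static const struct { const char *key; int opt; } spec[] = {
	{ "umask", OPT_UMASK },
	{ "part",  OPT_PART  },
	{ "force", OPT_FORCE },
};

struct opts { unsigned int umask; int part; int force; };

static int lookup(const char *key)
{
	for (size_t i = 0; i < sizeof(spec) / sizeof(spec[0]); i++)
		if (!strcmp(spec[i].key, key))
			return spec[i].opt;
	return OPT_UNKNOWN;
}

static int parse_param(struct opts *o, const char *key, const char *val)
{
	switch (lookup(key)) {
	case OPT_UMASK:
		/* octal value, as with fsparam_u32oct in the real table */
		o->umask = (unsigned int)strtoul(val, NULL, 8);
		return 0;
	case OPT_PART:
		o->part = atoi(val);
		return 0;
	case OPT_FORCE:
		o->force = 1;	/* flag option, no value */
		return 0;
	default:
		return -1;	/* unknown key rejected, like -EINVAL */
	}
}

int main(void)
{
	struct opts o = { .part = -1 };

	parse_param(&o, "umask", "022");
	parse_param(&o, "part", "2");
	parse_param(&o, "force", "");
	printf("umask=%o part=%d force=%d\n", o.umask, o.part, o.force);
	return 0;
}

As in the real parser, only "force" is honoured on remount; every other key is accepted but ignored when the context is a reconfiguration.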
| 1 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Generic nexthop implementation * * Copyright (c) 2017-19 Cumulus Networks * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com> */ #ifndef __LINUX_NEXTHOP_H #define __LINUX_NEXTHOP_H #include <linux/netdevice.h> #include <linux/notifier.h> #include <linux/route.h> #include <linux/types.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/netlink.h> #define NEXTHOP_VALID_USER_FLAGS RTNH_F_ONLINK struct nexthop; struct nh_config { u32 nh_id; u8 nh_family; u8 nh_protocol; u8 nh_blackhole; u8 nh_fdb; u32 nh_flags; int nh_ifindex; struct net_device *dev; union { __be32 ipv4; struct in6_addr ipv6; } gw; struct nlattr *nh_grp; u16 nh_grp_type; u16 nh_grp_res_num_buckets; unsigned long nh_grp_res_idle_timer; unsigned long nh_grp_res_unbalanced_timer; bool nh_grp_res_has_num_buckets; bool nh_grp_res_has_idle_timer; bool nh_grp_res_has_unbalanced_timer; bool nh_hw_stats; struct nlattr *nh_encap; u16 nh_encap_type; u32 nlflags; struct nl_info nlinfo; }; struct nh_info { struct hlist_node dev_hash; /* entry on netns devhash */ struct nexthop *nh_parent; u8 family; bool reject_nh; bool fdb_nh; union { struct fib_nh_common fib_nhc; struct fib_nh fib_nh; struct fib6_nh fib6_nh; }; }; struct nh_res_bucket { struct nh_grp_entry 
__rcu *nh_entry; atomic_long_t used_time; unsigned long migrated_time; bool occupied; u8 nh_flags; }; struct nh_res_table { struct net *net; u32 nhg_id; struct delayed_work upkeep_dw; /* List of NHGEs that have too few buckets ("uw" for underweight). * Reclaimed buckets will be given to entries in this list. */ struct list_head uw_nh_entries; unsigned long unbalanced_since; u32 idle_timer; u32 unbalanced_timer; u16 num_nh_buckets; struct nh_res_bucket nh_buckets[] __counted_by(num_nh_buckets); }; struct nh_grp_entry_stats { u64_stats_t packets; struct u64_stats_sync syncp; }; struct nh_grp_entry { struct nexthop *nh; struct nh_grp_entry_stats __percpu *stats; u16 weight; union { struct { atomic_t upper_bound; } hthr; struct { /* Member on uw_nh_entries. */ struct list_head uw_nh_entry; u16 count_buckets; u16 wants_buckets; } res; }; struct list_head nh_list; struct nexthop *nh_parent; /* nexthop of group with this entry */ u64 packets_hw; }; struct nh_group { struct nh_group *spare; /* spare group for removals */ u16 num_nh; bool is_multipath; bool hash_threshold; bool resilient; bool fdb_nh; bool has_v4; bool hw_stats; struct nh_res_table __rcu *res_table; struct nh_grp_entry nh_entries[] __counted_by(num_nh); }; struct nexthop { struct rb_node rb_node; /* entry on netns rbtree */ struct list_head fi_list; /* v4 entries using nh */ struct list_head f6i_list; /* v6 entries using nh */ struct list_head fdb_list; /* fdb entries using this nh */ struct list_head grp_list; /* nh group entries using this nh */ struct net *net; u32 id; u8 protocol; /* app managing this nh */ u8 nh_flags; bool is_group; bool dead; spinlock_t lock; /* protect dead and f6i_list */ refcount_t refcnt; struct rcu_head rcu; union { struct nh_info __rcu *nh_info; struct nh_group __rcu *nh_grp; }; }; enum nexthop_event_type { NEXTHOP_EVENT_DEL, NEXTHOP_EVENT_REPLACE, NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE, NEXTHOP_EVENT_BUCKET_REPLACE, NEXTHOP_EVENT_HW_STATS_REPORT_DELTA, }; enum nh_notifier_info_type { NH_NOTIFIER_INFO_TYPE_SINGLE, NH_NOTIFIER_INFO_TYPE_GRP, NH_NOTIFIER_INFO_TYPE_RES_TABLE, NH_NOTIFIER_INFO_TYPE_RES_BUCKET, NH_NOTIFIER_INFO_TYPE_GRP_HW_STATS, }; struct nh_notifier_single_info { struct net_device *dev; u8 gw_family; union { __be32 ipv4; struct in6_addr ipv6; }; u32 id; u8 is_reject:1, is_fdb:1, has_encap:1; }; struct nh_notifier_grp_entry_info { u16 weight; struct nh_notifier_single_info nh; }; struct nh_notifier_grp_info { u16 num_nh; bool is_fdb; bool hw_stats; struct nh_notifier_grp_entry_info nh_entries[] __counted_by(num_nh); }; struct nh_notifier_res_bucket_info { u16 bucket_index; unsigned int idle_timer_ms; bool force; struct nh_notifier_single_info old_nh; struct nh_notifier_single_info new_nh; }; struct nh_notifier_res_table_info { u16 num_nh_buckets; bool hw_stats; struct nh_notifier_single_info nhs[] __counted_by(num_nh_buckets); }; struct nh_notifier_grp_hw_stats_entry_info { u32 id; u64 packets; }; struct nh_notifier_grp_hw_stats_info { u16 num_nh; bool hw_stats_used; struct nh_notifier_grp_hw_stats_entry_info stats[] __counted_by(num_nh); }; struct nh_notifier_info { struct net *net; struct netlink_ext_ack *extack; u32 id; enum nh_notifier_info_type type; union { struct nh_notifier_single_info *nh; struct nh_notifier_grp_info *nh_grp; struct nh_notifier_res_table_info *nh_res_table; struct nh_notifier_res_bucket_info *nh_res_bucket; struct nh_notifier_grp_hw_stats_info *nh_grp_hw_stats; }; }; int register_nexthop_notifier(struct net *net, struct notifier_block *nb, struct netlink_ext_ack 
*extack); int __unregister_nexthop_notifier(struct net *net, struct notifier_block *nb); int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb); void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap); void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index, bool offload, bool trap); void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets, unsigned long *activity); void nh_grp_hw_stats_report_delta(struct nh_notifier_grp_hw_stats_info *info, unsigned int nh_idx, u64 delta_packets); /* caller is holding rcu or rtnl; no reference taken to nexthop */ struct nexthop *nexthop_find_by_id(struct net *net, u32 id); void nexthop_free_rcu(struct rcu_head *head); static inline bool nexthop_get(struct nexthop *nh) { return refcount_inc_not_zero(&nh->refcnt); } static inline void nexthop_put(struct nexthop *nh) { if (refcount_dec_and_test(&nh->refcnt)) call_rcu_hurry(&nh->rcu, nexthop_free_rcu); } static inline bool nexthop_cmp(const struct nexthop *nh1, const struct nexthop *nh2) { return nh1 == nh2; } static inline bool nexthop_is_fdb(const struct nexthop *nh) { if (nh->is_group) { const struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); return nh_grp->fdb_nh; } else { const struct nh_info *nhi; nhi = rcu_dereference_rtnl(nh->nh_info); return nhi->fdb_nh; } } static inline bool nexthop_has_v4(const struct nexthop *nh) { if (nh->is_group) { struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); return nh_grp->has_v4; } return false; } static inline bool nexthop_is_multipath(const struct nexthop *nh) { if (nh->is_group) { struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); return nh_grp->is_multipath; } return false; } struct nexthop *nexthop_select_path(struct nexthop *nh, int hash); static inline unsigned int nexthop_num_path(const struct nexthop *nh) { unsigned int rc = 1; if (nh->is_group) { struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); if (nh_grp->is_multipath) rc = nh_grp->num_nh; } return rc; } static inline struct nexthop *nexthop_mpath_select(const struct nh_group *nhg, int nhsel) { /* for_nexthops macros in fib_semantics.c grabs a pointer to * the nexthop before checking nhsel */ if (nhsel >= nhg->num_nh) return NULL; return nhg->nh_entries[nhsel].nh; } static inline int nexthop_mpath_fill_node(struct sk_buff *skb, struct nexthop *nh, u8 rt_family) { struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp); int i; for (i = 0; i < nhg->num_nh; i++) { struct nexthop *nhe = nhg->nh_entries[i].nh; struct nh_info *nhi = rcu_dereference_rtnl(nhe->nh_info); struct fib_nh_common *nhc = &nhi->fib_nhc; int weight = nhg->nh_entries[i].weight; if (fib_add_nexthop(skb, nhc, weight, rt_family, 0) < 0) return -EMSGSIZE; } return 0; } /* called with rcu lock */ static inline bool nexthop_is_blackhole(const struct nexthop *nh) { const struct nh_info *nhi; if (nh->is_group) { struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); if (nh_grp->num_nh > 1) return false; nh = nh_grp->nh_entries[0].nh; } nhi = rcu_dereference_rtnl(nh->nh_info); return nhi->reject_nh; } static inline void nexthop_path_fib_result(struct fib_result *res, int hash) { struct nh_info *nhi; struct nexthop *nh; nh = nexthop_select_path(res->fi->nh, hash); nhi = rcu_dereference(nh->nh_info); res->nhc = &nhi->fib_nhc; } /* called with rcu read lock or rtnl held */ static inline struct fib_nh_common *nexthop_fib_nhc(struct nexthop *nh, int nhsel) { struct nh_info *nhi; 
BUILD_BUG_ON(offsetof(struct fib_nh, nh_common) != 0); BUILD_BUG_ON(offsetof(struct fib6_nh, nh_common) != 0); if (nh->is_group) { struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); if (nh_grp->is_multipath) { nh = nexthop_mpath_select(nh_grp, nhsel); if (!nh) return NULL; } } nhi = rcu_dereference_rtnl(nh->nh_info); return &nhi->fib_nhc; } /* called from fib_table_lookup with rcu_lock */ static inline struct fib_nh_common *nexthop_get_nhc_lookup(const struct nexthop *nh, int fib_flags, const struct flowi4 *flp, int *nhsel) { struct nh_info *nhi; if (nh->is_group) { struct nh_group *nhg = rcu_dereference(nh->nh_grp); int i; for (i = 0; i < nhg->num_nh; i++) { struct nexthop *nhe = nhg->nh_entries[i].nh; nhi = rcu_dereference(nhe->nh_info); if (fib_lookup_good_nhc(&nhi->fib_nhc, fib_flags, flp)) { *nhsel = i; return &nhi->fib_nhc; } } } else { nhi = rcu_dereference(nh->nh_info); if (fib_lookup_good_nhc(&nhi->fib_nhc, fib_flags, flp)) { *nhsel = 0; return &nhi->fib_nhc; } } return NULL; } static inline bool nexthop_uses_dev(const struct nexthop *nh, const struct net_device *dev) { struct nh_info *nhi; if (nh->is_group) { struct nh_group *nhg = rcu_dereference(nh->nh_grp); int i; for (i = 0; i < nhg->num_nh; i++) { struct nexthop *nhe = nhg->nh_entries[i].nh; nhi = rcu_dereference(nhe->nh_info); if (nhc_l3mdev_matches_dev(&nhi->fib_nhc, dev)) return true; } } else { nhi = rcu_dereference(nh->nh_info); if (nhc_l3mdev_matches_dev(&nhi->fib_nhc, dev)) return true; } return false; } static inline unsigned int fib_info_num_path(const struct fib_info *fi) { if (unlikely(fi->nh)) return nexthop_num_path(fi->nh); return fi->fib_nhs; } int fib_check_nexthop(struct nexthop *nh, u8 scope, struct netlink_ext_ack *extack); static inline struct fib_nh_common *fib_info_nhc(struct fib_info *fi, int nhsel) { if (unlikely(fi->nh)) return nexthop_fib_nhc(fi->nh, nhsel); return &fi->fib_nh[nhsel].nh_common; } /* only used when fib_nh is built into fib_info */ static inline struct fib_nh *fib_info_nh(struct fib_info *fi, int nhsel) { WARN_ON(fi->nh); return &fi->fib_nh[nhsel]; } /* * IPv6 variants */ int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, struct netlink_ext_ack *extack); /* Caller should either hold rcu_read_lock(), or RTNL. */ static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh) { struct nh_info *nhi; if (nh->is_group) { struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); nh = nexthop_mpath_select(nh_grp, 0); if (!nh) return NULL; } nhi = rcu_dereference_rtnl(nh->nh_info); if (nhi->family == AF_INET6) return &nhi->fib6_nh; return NULL; } static inline struct net_device *fib6_info_nh_dev(struct fib6_info *f6i) { struct fib6_nh *fib6_nh; fib6_nh = f6i->nh ? 
nexthop_fib6_nh(f6i->nh) : f6i->fib6_nh; return fib6_nh->fib_nh_dev; } static inline void nexthop_path_fib6_result(struct fib6_result *res, int hash) { struct nexthop *nh = res->f6i->nh; struct nh_info *nhi; nh = nexthop_select_path(nh, hash); nhi = rcu_dereference_rtnl(nh->nh_info); if (nhi->reject_nh) { res->fib6_type = RTN_BLACKHOLE; res->fib6_flags |= RTF_REJECT; res->nh = nexthop_fib6_nh(nh); } else { res->nh = &nhi->fib6_nh; } } int nexthop_for_each_fib6_nh(struct nexthop *nh, int (*cb)(struct fib6_nh *nh, void *arg), void *arg); static inline int nexthop_get_family(struct nexthop *nh) { struct nh_info *nhi = rcu_dereference_rtnl(nh->nh_info); return nhi->family; } static inline struct fib_nh_common *nexthop_fdb_nhc(struct nexthop *nh) { struct nh_info *nhi = rcu_dereference_rtnl(nh->nh_info); return &nhi->fib_nhc; } static inline struct fib_nh_common *nexthop_path_fdb_result(struct nexthop *nh, int hash) { struct nh_info *nhi; struct nexthop *nhp; nhp = nexthop_select_path(nh, hash); if (unlikely(!nhp)) return NULL; nhi = rcu_dereference(nhp->nh_info); return &nhi->fib_nhc; } #endif |
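/*
 * Illustrative sketch, not part of the header above: a hypothetical consumer
 * of the nexthop API.  example_log_nexthop() is an invented name; everything
 * it calls (nexthop_find_by_id, nexthop_is_blackhole, nexthop_num_path,
 * nexthop_fib_nhc) is declared in the header.  As the comments there note,
 * the lookup takes no reference, so the caller must hold rcu_read_lock() (or
 * RTNL) for the duration of the walk.
 */
#include <net/nexthop.h>

static void example_log_nexthop(struct net *net, u32 id)
{
	struct nexthop *nh;
	unsigned int i, npaths;

	rcu_read_lock();
	nh = nexthop_find_by_id(net, id);
	if (!nh) {
		rcu_read_unlock();
		return;
	}

	if (nexthop_is_blackhole(nh)) {
		pr_info("nexthop %u: blackhole\n", id);
	} else {
		/* 1 for a single nexthop, group size for a multipath group */
		npaths = nexthop_num_path(nh);
		for (i = 0; i < npaths; i++) {
			struct fib_nh_common *nhc = nexthop_fib_nhc(nh, i);

			if (nhc && nhc->nhc_dev)
				pr_info("nexthop %u path %u: dev %s\n",
					id, i, nhc->nhc_dev->name);
		}
	}
	rcu_read_unlock();
}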
3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/swapfile.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * Swap reorganised 29.12.95, Stephen Tweedie */ #include <linux/blkdev.h> #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/slab.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/vmalloc.h> #include <linux/pagemap.h> #include <linux/namei.h> #include <linux/shmem_fs.h> #include <linux/blk-cgroup.h> #include <linux/random.h> #include <linux/writeback.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/init.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/security.h> #include <linux/backing-dev.h> #include <linux/mutex.h> #include <linux/capability.h> #include <linux/syscalls.h> #include <linux/memcontrol.h> #include <linux/poll.h> #include <linux/oom.h> #include <linux/swapfile.h> #include <linux/export.h> #include <linux/sort.h> #include <linux/completion.h> #include <linux/suspend.h> #include <linux/zswap.h> #include <linux/plist.h> #include <asm/tlbflush.h> #include <linux/leafops.h> #include <linux/swap_cgroup.h> #include "swap_table.h" #include "internal.h" #include "swap.h" static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); static bool folio_swapcache_freeable(struct folio *folio); static void move_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, struct list_head *list, enum swap_cluster_flags new_flags); /* * Protects the swap_info array, and the SWP_USED flag. swap_info contains * lazily allocated & freed swap device info struts, and SWP_USED indicates * which device is used, ~SWP_USED devices and can be reused. * * Also protects swap_active_head total_swap_pages, and the SWP_WRITEOK flag. */ static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; atomic_long_t nr_swap_pages; /* * Some modules use swappable objects and may try to swap them out under * memory pressure (via the shrinker). Before doing so, they may wish to * check to see if any swap space is available. */ EXPORT_SYMBOL_GPL(nr_swap_pages); /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; #define DEF_SWAP_PRIO -1 unsigned long swapfile_maximum_size; #ifdef CONFIG_MIGRATION bool swap_migration_ad_supported; #endif /* CONFIG_MIGRATION */ static const char Bad_file[] = "Bad swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; /* * all active swap_info_structs * protected with swap_lock, and ordered by priority. */ static PLIST_HEAD(swap_active_head); /* * all available (active, not full) swap_info_structs * protected with swap_avail_lock, ordered by priority. * This is used by folio_alloc_swap() instead of swap_active_head * because swap_active_head includes all swap_info_structs, * but folio_alloc_swap() doesn't need to look at full ones. * This uses its own lock instead of swap_lock because when a * swap_info_struct changes between not-full/full, it needs to * add/remove itself to/from this list, but the swap_info_struct->lock * is held and the locking order requires swap_lock to be taken * before any swap_info_struct->lock. 
*/ static PLIST_HEAD(swap_avail_head); static DEFINE_SPINLOCK(swap_avail_lock); struct swap_info_struct *swap_info[MAX_SWAPFILES]; static struct kmem_cache *swap_table_cachep; /* Protects si->swap_file for /proc/swaps usage */ static DEFINE_MUTEX(swapon_mutex); static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); /* Activity counter to indicate that a swapon or swapoff has occurred */ static atomic_t proc_poll_event = ATOMIC_INIT(0); atomic_t nr_rotate_swap = ATOMIC_INIT(0); struct percpu_swap_cluster { struct swap_info_struct *si[SWAP_NR_ORDERS]; unsigned long offset[SWAP_NR_ORDERS]; local_lock_t lock; }; static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = { .si = { NULL }, .offset = { SWAP_ENTRY_INVALID }, .lock = INIT_LOCAL_LOCK(), }; /* May return NULL on invalid type, caller must check for NULL return */ static struct swap_info_struct *swap_type_to_info(int type) { if (type >= MAX_SWAPFILES) return NULL; return READ_ONCE(swap_info[type]); /* rcu_dereference() */ } /* May return NULL on invalid entry, caller must check for NULL return */ static struct swap_info_struct *swap_entry_to_info(swp_entry_t entry) { return swap_type_to_info(swp_type(entry)); } /* * Use the second highest bit of inuse_pages counter as the indicator * if one swap device is on the available plist, so the atomic can * still be updated arithmetically while having special data embedded. * * inuse_pages counter is the only thing indicating if a device should * be on avail_lists or not (except swapon / swapoff). By embedding the * off-list bit in the atomic counter, updates no longer need any lock * to check the list status. * * This bit will be set if the device is not on the plist and not * usable, will be cleared if the device is on the plist. */ #define SWAP_USAGE_OFFLIST_BIT (1UL << (BITS_PER_TYPE(atomic_t) - 2)) #define SWAP_USAGE_COUNTER_MASK (~SWAP_USAGE_OFFLIST_BIT) static long swap_usage_in_pages(struct swap_info_struct *si) { return atomic_long_read(&si->inuse_pages) & SWAP_USAGE_COUNTER_MASK; } /* Reclaim the swap entry anyway if possible */ #define TTRS_ANYWAY 0x1 /* * Reclaim the swap entry if there are no more mappings of the * corresponding page */ #define TTRS_UNMAPPED 0x2 /* Reclaim the swap entry if swap is getting full */ #define TTRS_FULL 0x4 static bool swap_only_has_cache(struct swap_cluster_info *ci, unsigned long offset, int nr_pages) { unsigned int ci_off = offset % SWAPFILE_CLUSTER; unsigned int ci_end = ci_off + nr_pages; unsigned long swp_tb; do { swp_tb = __swap_table_get(ci, ci_off); VM_WARN_ON_ONCE(!swp_tb_is_folio(swp_tb)); if (swp_tb_get_count(swp_tb)) return false; } while (++ci_off < ci_end); return true; } /* * returns number of pages in the folio that backs the swap entry. If positive, * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no * folio was associated with the swap entry. */ static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset, unsigned long flags) { const swp_entry_t entry = swp_entry(si->type, offset); struct swap_cluster_info *ci; struct folio *folio; int ret, nr_pages; bool need_reclaim; again: folio = swap_cache_get_folio(entry); if (!folio) return 0; nr_pages = folio_nr_pages(folio); ret = -nr_pages; /* * We hold a folio lock here. We have to use trylock for * avoiding deadlock. This is a special case and you should * use folio_free_swap() with explicit folio_lock() in usual * operations. 
*/ if (!folio_trylock(folio)) goto out; /* * Offset could point to the middle of a large folio, or folio * may no longer point to the expected offset before it's locked. */ if (!folio_matches_swap_entry(folio, entry)) { folio_unlock(folio); folio_put(folio); goto again; } offset = swp_offset(folio->swap); need_reclaim = ((flags & TTRS_ANYWAY) || ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))); if (!need_reclaim || !folio_swapcache_freeable(folio)) goto out_unlock; /* * It's safe to delete the folio from swap cache only if the folio * is in swap cache with swap count == 0. The slots have no page table * reference or pending writeback, and can't be allocated to others. */ ci = swap_cluster_lock(si, offset); need_reclaim = swap_only_has_cache(ci, offset, nr_pages); swap_cluster_unlock(ci); if (!need_reclaim) goto out_unlock; swap_cache_del_folio(folio); folio_set_dirty(folio); ret = nr_pages; out_unlock: folio_unlock(folio); out: folio_put(folio); return ret; } static inline struct swap_extent *first_se(struct swap_info_struct *sis) { struct rb_node *rb = rb_first(&sis->swap_extent_root); return rb_entry(rb, struct swap_extent, rb_node); } static inline struct swap_extent *next_se(struct swap_extent *se) { struct rb_node *rb = rb_next(&se->rb_node); return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL; } /* * swapon tell device that all the old swap contents can be discarded, * to allow the swap device to optimize its wear-levelling. */ static int discard_swap(struct swap_info_struct *si) { struct swap_extent *se; sector_t start_block; sector_t nr_blocks; int err = 0; /* Do not discard the swap header page! */ se = first_se(si); start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); if (nr_blocks) { err = blkdev_issue_discard(si->bdev, start_block, nr_blocks, GFP_KERNEL); if (err) return err; cond_resched(); } for (se = next_se(se); se; se = next_se(se)) { start_block = se->start_block << (PAGE_SHIFT - 9); nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); err = blkdev_issue_discard(si->bdev, start_block, nr_blocks, GFP_KERNEL); if (err) break; cond_resched(); } return err; /* That will often be -EOPNOTSUPP */ } static struct swap_extent * offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset) { struct swap_extent *se; struct rb_node *rb; rb = sis->swap_extent_root.rb_node; while (rb) { se = rb_entry(rb, struct swap_extent, rb_node); if (offset < se->start_page) rb = rb->rb_left; else if (offset >= se->start_page + se->nr_pages) rb = rb->rb_right; else return se; } /* It *must* be present */ BUG(); } sector_t swap_folio_sector(struct folio *folio) { struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); struct swap_extent *se; sector_t sector; pgoff_t offset; offset = swp_offset(folio->swap); se = offset_to_swap_extent(sis, offset); sector = se->start_block + (offset - se->start_page); return sector << (PAGE_SHIFT - 9); } /* * swap allocation tell device that a cluster of swap can now be discarded, * to allow the swap device to optimize its wear-levelling. 
*/ static void discard_swap_cluster(struct swap_info_struct *si, pgoff_t start_page, pgoff_t nr_pages) { struct swap_extent *se = offset_to_swap_extent(si, start_page); while (nr_pages) { pgoff_t offset = start_page - se->start_page; sector_t start_block = se->start_block + offset; sector_t nr_blocks = se->nr_pages - offset; if (nr_blocks > nr_pages) nr_blocks = nr_pages; start_page += nr_blocks; nr_pages -= nr_blocks; start_block <<= PAGE_SHIFT - 9; nr_blocks <<= PAGE_SHIFT - 9; if (blkdev_issue_discard(si->bdev, start_block, nr_blocks, GFP_NOIO)) break; se = next_se(se); } } #define LATENCY_LIMIT 256 static inline bool cluster_is_empty(struct swap_cluster_info *info) { return info->count == 0; } static inline bool cluster_is_discard(struct swap_cluster_info *info) { return info->flags == CLUSTER_FLAG_DISCARD; } static inline bool cluster_table_is_alloced(struct swap_cluster_info *ci) { return rcu_dereference_protected(ci->table, lockdep_is_held(&ci->lock)); } static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order) { if (unlikely(ci->flags > CLUSTER_FLAG_USABLE)) return false; if (!cluster_table_is_alloced(ci)) return false; if (!order) return true; return cluster_is_empty(ci) || order == ci->order; } static inline unsigned int cluster_index(struct swap_info_struct *si, struct swap_cluster_info *ci) { return ci - si->cluster_info; } static inline unsigned int cluster_offset(struct swap_info_struct *si, struct swap_cluster_info *ci) { return cluster_index(si, ci) * SWAPFILE_CLUSTER; } static struct swap_table *swap_table_alloc(gfp_t gfp) { struct folio *folio; if (!SWP_TABLE_USE_PAGE) return kmem_cache_zalloc(swap_table_cachep, gfp); folio = folio_alloc(gfp | __GFP_ZERO, 0); if (folio) return folio_address(folio); return NULL; } static void swap_table_free_folio_rcu_cb(struct rcu_head *head) { struct folio *folio; folio = page_folio(container_of(head, struct page, rcu_head)); folio_put(folio); } static void swap_table_free(struct swap_table *table) { if (!SWP_TABLE_USE_PAGE) { kmem_cache_free(swap_table_cachep, table); return; } call_rcu(&(folio_page(virt_to_folio(table), 0)->rcu_head), swap_table_free_folio_rcu_cb); } /* * Sanity check to ensure nothing leaked, and the specified range is empty. * One special case is that bad slots can't be freed, so check the number of * bad slots for swapoff, and non-swapoff path must never free bad slots. */ static void swap_cluster_assert_empty(struct swap_cluster_info *ci, unsigned int ci_off, unsigned int nr, bool swapoff) { unsigned int ci_end = ci_off + nr; unsigned long swp_tb; int bad_slots = 0; if (!IS_ENABLED(CONFIG_DEBUG_VM) && !swapoff) return; do { swp_tb = __swap_table_get(ci, ci_off); if (swp_tb_is_bad(swp_tb)) bad_slots++; else WARN_ON_ONCE(!swp_tb_is_null(swp_tb)); } while (++ci_off < ci_end); WARN_ON_ONCE(bad_slots != (swapoff ? ci->count : 0)); WARN_ON_ONCE(nr == SWAPFILE_CLUSTER && ci->extend_table); } static void swap_cluster_free_table(struct swap_cluster_info *ci) { struct swap_table *table; /* Only empty cluster's table is allow to be freed */ lockdep_assert_held(&ci->lock); table = (void *)rcu_dereference_protected(ci->table, true); rcu_assign_pointer(ci->table, NULL); swap_table_free(table); } /* * Allocate swap table for one cluster. Attempt an atomic allocation first, * then fallback to sleeping allocation. 
*/ static struct swap_cluster_info * swap_cluster_alloc_table(struct swap_info_struct *si, struct swap_cluster_info *ci) { struct swap_table *table; /* * Only cluster isolation from the allocator does table allocation. * Swap allocator uses percpu clusters and holds the local lock. */ lockdep_assert_held(&this_cpu_ptr(&percpu_swap_cluster)->lock); if (!(si->flags & SWP_SOLIDSTATE)) lockdep_assert_held(&si->global_cluster_lock); lockdep_assert_held(&ci->lock); /* The cluster must be free and was just isolated from the free list. */ VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci)); table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); if (table) { rcu_assign_pointer(ci->table, table); return ci; } /* * Try a sleep allocation. Each isolated free cluster may cause * a sleep allocation, but there is a limited number of them, so * the potential recursive allocation is limited. */ spin_unlock(&ci->lock); if (!(si->flags & SWP_SOLIDSTATE)) spin_unlock(&si->global_cluster_lock); local_unlock(&percpu_swap_cluster.lock); table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | GFP_KERNEL); /* * Back to atomic context. We might have migrated to a new CPU with a * usable percpu cluster. But just keep using the isolated cluster to * make things easier. Migration indicates a slight change of workload * so using a new free cluster might not be a bad idea, and the worst * could happen with ignoring the percpu cluster is fragmentation, * which is acceptable since this fallback and race is rare. */ local_lock(&percpu_swap_cluster.lock); if (!(si->flags & SWP_SOLIDSTATE)) spin_lock(&si->global_cluster_lock); spin_lock(&ci->lock); /* Nothing except this helper should touch a dangling empty cluster. */ if (WARN_ON_ONCE(cluster_table_is_alloced(ci))) { if (table) swap_table_free(table); return ci; } if (!table) { move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE); spin_unlock(&ci->lock); return NULL; } rcu_assign_pointer(ci->table, table); return ci; } static void move_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, struct list_head *list, enum swap_cluster_flags new_flags) { VM_WARN_ON(ci->flags == new_flags); BUILD_BUG_ON(1 << sizeof(ci->flags) * BITS_PER_BYTE < CLUSTER_FLAG_MAX); lockdep_assert_held(&ci->lock); spin_lock(&si->lock); if (ci->flags == CLUSTER_FLAG_NONE) list_add_tail(&ci->list, list); else list_move_tail(&ci->list, list); spin_unlock(&si->lock); ci->flags = new_flags; } /* Add a cluster to discard list and schedule it to do discard */ static void swap_cluster_schedule_discard(struct swap_info_struct *si, struct swap_cluster_info *ci) { VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE); move_cluster(si, ci, &si->discard_clusters, CLUSTER_FLAG_DISCARD); schedule_work(&si->discard_work); } static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { swap_cluster_assert_empty(ci, 0, SWAPFILE_CLUSTER, false); swap_cluster_free_table(ci); move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE); ci->order = 0; } /* * Isolate and lock the first cluster that is not contented on a list, * clean its flag before taken off-list. Cluster flag must be in sync * with list status, so cluster updaters can always know the cluster * list status without touching si lock. * * Note it's possible that all clusters on a list are contented so * this returns NULL for an non-empty list. 
*/ static struct swap_cluster_info *isolate_lock_cluster( struct swap_info_struct *si, struct list_head *list) { struct swap_cluster_info *ci, *found = NULL; u8 flags = CLUSTER_FLAG_NONE; spin_lock(&si->lock); list_for_each_entry(ci, list, list) { if (!spin_trylock(&ci->lock)) continue; /* We may only isolate and clear flags of following lists */ VM_BUG_ON(!ci->flags); VM_BUG_ON(ci->flags > CLUSTER_FLAG_USABLE && ci->flags != CLUSTER_FLAG_FULL); list_del(&ci->list); flags = ci->flags; ci->flags = CLUSTER_FLAG_NONE; found = ci; break; } spin_unlock(&si->lock); if (found && !cluster_table_is_alloced(found)) { /* Only an empty free cluster's swap table can be freed. */ VM_WARN_ON_ONCE(flags != CLUSTER_FLAG_FREE); VM_WARN_ON_ONCE(list != &si->free_clusters); VM_WARN_ON_ONCE(!cluster_is_empty(found)); return swap_cluster_alloc_table(si, found); } return found; } /* * Doing discard actually. After a cluster discard is finished, the cluster * will be added to free cluster list. Discard cluster is a bit special as * they don't participate in allocation or reclaim, so clusters marked as * CLUSTER_FLAG_DISCARD must remain off-list or on discard list. */ static bool swap_do_scheduled_discard(struct swap_info_struct *si) { struct swap_cluster_info *ci; bool ret = false; unsigned int idx; spin_lock(&si->lock); while (!list_empty(&si->discard_clusters)) { ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list); /* * Delete the cluster from list to prepare for discard, but keep * the CLUSTER_FLAG_DISCARD flag, percpu_swap_cluster could be * pointing to it, or ran into by relocate_cluster. */ list_del(&ci->list); idx = cluster_index(si, ci); spin_unlock(&si->lock); discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, SWAPFILE_CLUSTER); spin_lock(&ci->lock); /* * Discard is done, clear its flags as it's off-list, then * return the cluster to allocation list. */ ci->flags = CLUSTER_FLAG_NONE; __free_cluster(si, ci); spin_unlock(&ci->lock); ret = true; spin_lock(&si->lock); } spin_unlock(&si->lock); return ret; } static void swap_discard_work(struct work_struct *work) { struct swap_info_struct *si; si = container_of(work, struct swap_info_struct, discard_work); swap_do_scheduled_discard(si); } static void swap_users_ref_free(struct percpu_ref *ref) { struct swap_info_struct *si; si = container_of(ref, struct swap_info_struct, users); complete(&si->comp); } /* * Must be called after freeing if ci->count == 0, moves the cluster to free * or discard list. */ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { VM_BUG_ON(ci->count != 0); VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE); lockdep_assert_held(&ci->lock); /* * If the swap is discardable, prepare discard the cluster * instead of free it immediately. The cluster will be freed * after discard. */ if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == (SWP_WRITEOK | SWP_PAGE_DISCARD)) { swap_cluster_schedule_discard(si, ci); return; } __free_cluster(si, ci); } /* * Must be called after freeing if ci->count != 0, moves the cluster to * nonfull list. */ static void partial_free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { VM_BUG_ON(!ci->count || ci->count == SWAPFILE_CLUSTER); lockdep_assert_held(&ci->lock); if (ci->flags != CLUSTER_FLAG_NONFULL) move_cluster(si, ci, &si->nonfull_clusters[ci->order], CLUSTER_FLAG_NONFULL); } /* * Must be called after allocation, moves the cluster to full or frag list. 
* Note: allocation doesn't acquire si lock, and may drop the ci lock for * reclaim, so the cluster could be any where when called. */ static void relocate_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { lockdep_assert_held(&ci->lock); /* Discard cluster must remain off-list or on discard list */ if (cluster_is_discard(ci)) return; if (!ci->count) { if (ci->flags != CLUSTER_FLAG_FREE) free_cluster(si, ci); } else if (ci->count != SWAPFILE_CLUSTER) { if (ci->flags != CLUSTER_FLAG_FRAG) move_cluster(si, ci, &si->frag_clusters[ci->order], CLUSTER_FLAG_FRAG); } else { if (ci->flags != CLUSTER_FLAG_FULL) move_cluster(si, ci, &si->full_clusters, CLUSTER_FLAG_FULL); } } /* * The cluster corresponding to @offset will be accounted as having one bad * slot. The cluster will not be added to the free cluster list, and its * usage counter will be increased by 1. Only used for initialization. */ static int swap_cluster_setup_bad_slot(struct swap_info_struct *si, struct swap_cluster_info *cluster_info, unsigned int offset, bool mask) { unsigned int ci_off = offset % SWAPFILE_CLUSTER; unsigned long idx = offset / SWAPFILE_CLUSTER; struct swap_cluster_info *ci; struct swap_table *table; int ret = 0; /* si->max may got shrunk by swap swap_activate() */ if (offset >= si->max && !mask) { pr_debug("Ignoring bad slot %u (max: %u)\n", offset, si->max); return 0; } /* * Account it, skip header slot: si->pages is initiated as * si->max - 1. Also skip the masking of last cluster, * si->pages doesn't include that part. */ if (offset && !mask) si->pages -= 1; if (!si->pages) { pr_warn("Empty swap-file\n"); return -EINVAL; } ci = cluster_info + idx; if (!ci->table) { table = swap_table_alloc(GFP_KERNEL); if (!table) return -ENOMEM; rcu_assign_pointer(ci->table, table); } spin_lock(&ci->lock); /* Check for duplicated bad swap slots. */ if (__swap_table_xchg(ci, ci_off, SWP_TB_BAD) != SWP_TB_NULL) { pr_warn("Duplicated bad slot offset %d\n", offset); ret = -EINVAL; } else { ci->count++; } spin_unlock(&ci->lock); WARN_ON(ci->count > SWAPFILE_CLUSTER); WARN_ON(ci->flags); return ret; } /* * Reclaim drops the ci lock, so the cluster may become unusable (freed or * stolen by a lower order). @usable will be set to false if that happens. */ static bool cluster_reclaim_range(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned long start, unsigned int order, bool *usable) { unsigned int nr_pages = 1 << order; unsigned long offset = start, end = start + nr_pages; unsigned long swp_tb; spin_unlock(&ci->lock); do { swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER); if (swp_tb_get_count(swp_tb)) break; if (swp_tb_is_folio(swp_tb)) if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY) < 0) break; } while (++offset < end); spin_lock(&ci->lock); /* * We just dropped ci->lock so cluster could be used by another * order or got freed, check if it's still usable or empty. */ if (!cluster_is_usable(ci, order)) { *usable = false; return false; } *usable = true; /* Fast path, no need to scan if the whole cluster is empty */ if (cluster_is_empty(ci)) return true; /* * Recheck the range no matter reclaim succeeded or not, the slot * could have been be freed while we are not holding the lock. 
*/ for (offset = start; offset < end; offset++) { swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); if (!swp_tb_is_null(swp_tb)) return false; } return true; } static bool cluster_scan_range(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned long offset, unsigned int nr_pages, bool *need_reclaim) { unsigned int ci_off = offset % SWAPFILE_CLUSTER; unsigned int ci_end = ci_off + nr_pages; unsigned long swp_tb; do { swp_tb = __swap_table_get(ci, ci_off); if (swp_tb_is_null(swp_tb)) continue; if (swp_tb_is_folio(swp_tb) && !__swp_tb_get_count(swp_tb)) { if (!vm_swap_full()) return false; *need_reclaim = true; continue; } /* Slot with zero count can only be NULL or folio */ VM_WARN_ON(!swp_tb_get_count(swp_tb)); return false; } while (++ci_off < ci_end); return true; } static bool __swap_cluster_alloc_entries(struct swap_info_struct *si, struct swap_cluster_info *ci, struct folio *folio, unsigned int ci_off) { unsigned int order; unsigned long nr_pages; lockdep_assert_held(&ci->lock); if (!(si->flags & SWP_WRITEOK)) return false; /* * All mm swap allocation starts with a folio (folio_alloc_swap), * it's also the only allocation path for large orders allocation. * Such swap slots starts with count == 0 and will be increased * upon folio unmap. * * Else, it's a exclusive order 0 allocation for hibernation. * The slot starts with count == 1 and never increases. */ if (likely(folio)) { order = folio_order(folio); nr_pages = 1 << order; swap_cluster_assert_empty(ci, ci_off, nr_pages, false); __swap_cache_add_folio(ci, folio, swp_entry(si->type, ci_off + cluster_offset(si, ci))); } else if (IS_ENABLED(CONFIG_HIBERNATION)) { order = 0; nr_pages = 1; swap_cluster_assert_empty(ci, ci_off, 1, false); /* Sets a fake shadow as placeholder */ __swap_table_set(ci, ci_off, shadow_to_swp_tb(NULL, 1)); } else { /* Allocation without folio is only possible with hibernation */ WARN_ON_ONCE(1); return false; } /* * The first allocation in a cluster makes the * cluster exclusive to this order */ if (cluster_is_empty(ci)) ci->order = order; ci->count += nr_pages; swap_range_alloc(si, nr_pages); return true; } /* Try use a new cluster for current CPU and allocate from it. */ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, struct folio *folio, unsigned long offset) { unsigned int next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID; unsigned long start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER); unsigned int order = likely(folio) ? 
folio_order(folio) : 0; unsigned long end = start + SWAPFILE_CLUSTER; unsigned int nr_pages = 1 << order; bool need_reclaim, ret, usable; lockdep_assert_held(&ci->lock); VM_WARN_ON(!cluster_is_usable(ci, order)); if (end < nr_pages || ci->count + nr_pages > SWAPFILE_CLUSTER) goto out; for (end -= nr_pages; offset <= end; offset += nr_pages) { need_reclaim = false; if (!cluster_scan_range(si, ci, offset, nr_pages, &need_reclaim)) continue; if (need_reclaim) { ret = cluster_reclaim_range(si, ci, offset, order, &usable); if (!usable) goto out; if (cluster_is_empty(ci)) offset = start; /* Reclaim failed but cluster is usable, try next */ if (!ret) continue; } if (!__swap_cluster_alloc_entries(si, ci, folio, offset % SWAPFILE_CLUSTER)) break; found = offset; offset += nr_pages; if (ci->count < SWAPFILE_CLUSTER && offset <= end) next = offset; break; } out: relocate_cluster(si, ci); swap_cluster_unlock(ci); if (si->flags & SWP_SOLIDSTATE) { this_cpu_write(percpu_swap_cluster.offset[order], next); this_cpu_write(percpu_swap_cluster.si[order], si); } else { si->global_cluster->next[order] = next; } return found; } static unsigned int alloc_swap_scan_list(struct swap_info_struct *si, struct list_head *list, struct folio *folio, bool scan_all) { unsigned int found = SWAP_ENTRY_INVALID; do { struct swap_cluster_info *ci = isolate_lock_cluster(si, list); unsigned long offset; if (!ci) break; offset = cluster_offset(si, ci); found = alloc_swap_scan_cluster(si, ci, folio, offset); if (found) break; } while (scan_all); return found; } static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) { long to_scan = 1; unsigned long offset, end; struct swap_cluster_info *ci; unsigned long swp_tb; int nr_reclaim; if (force) to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER; while ((ci = isolate_lock_cluster(si, &si->full_clusters))) { offset = cluster_offset(si, ci); end = min(si->max, offset + SWAPFILE_CLUSTER); to_scan--; while (offset < end) { swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER); if (swp_tb_is_folio(swp_tb) && !__swp_tb_get_count(swp_tb)) { spin_unlock(&ci->lock); nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); spin_lock(&ci->lock); if (nr_reclaim) { offset += abs(nr_reclaim); continue; } } offset++; } /* in case no swap cache is reclaimed */ if (ci->flags == CLUSTER_FLAG_NONE) relocate_cluster(si, ci); swap_cluster_unlock(ci); if (to_scan <= 0) break; } } static void swap_reclaim_work(struct work_struct *work) { struct swap_info_struct *si; si = container_of(work, struct swap_info_struct, reclaim_work); swap_reclaim_full_clusters(si, true); } /* * Try to allocate swap entries with specified order and try set a new * cluster for current CPU too. */ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, struct folio *folio) { struct swap_cluster_info *ci; unsigned int order = likely(folio) ? folio_order(folio) : 0; unsigned int offset = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID; /* * Swapfile is not block device so unable * to allocate large entries. */ if (order && !(si->flags & SWP_BLKDEV)) return 0; if (!(si->flags & SWP_SOLIDSTATE)) { /* Serialize HDD SWAP allocation for each device. 
*/ spin_lock(&si->global_cluster_lock); offset = si->global_cluster->next[order]; if (offset == SWAP_ENTRY_INVALID) goto new_cluster; ci = swap_cluster_lock(si, offset); /* Cluster could have been used by another order */ if (cluster_is_usable(ci, order)) { if (cluster_is_empty(ci)) offset = cluster_offset(si, ci); found = alloc_swap_scan_cluster(si, ci, folio, offset); } else { swap_cluster_unlock(ci); } if (found) goto done; } new_cluster: /* * If the device needs discard, prefer a new cluster over nonfull * clusters to spread out the writes. */ if (si->flags & SWP_PAGE_DISCARD) { found = alloc_swap_scan_list(si, &si->free_clusters, folio, false); if (found) goto done; } if (order < PMD_ORDER) { found = alloc_swap_scan_list(si, &si->nonfull_clusters[order], folio, true); if (found) goto done; } if (!(si->flags & SWP_PAGE_DISCARD)) { found = alloc_swap_scan_list(si, &si->free_clusters, folio, false); if (found) goto done; } /* Try to reclaim full clusters if the free and nonfull lists are drained */ if (vm_swap_full()) swap_reclaim_full_clusters(si, false); if (order < PMD_ORDER) { /* * Scanning only one fragment cluster is good enough: order 0 * allocation will surely succeed, and large allocation * failure is not critical. Scanning one cluster still * keeps the list rotated and reclaimed (for clean swap cache). */ found = alloc_swap_scan_list(si, &si->frag_clusters[order], folio, false); if (found) goto done; } if (order) goto done; /* Order 0 stealing from higher order */ for (int o = 1; o < SWAP_NR_ORDERS; o++) { /* * Clusters here have at least one usable slot and can't fail order 0 * allocation, but reclaim may drop si->lock and race with another user. */ found = alloc_swap_scan_list(si, &si->frag_clusters[o], folio, true); if (found) goto done; found = alloc_swap_scan_list(si, &si->nonfull_clusters[o], folio, true); if (found) goto done; } done: if (!(si->flags & SWP_SOLIDSTATE)) spin_unlock(&si->global_cluster_lock); return found; } /* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff) { unsigned long pages; spin_lock(&swap_avail_lock); if (swapoff) { /* * Forcefully remove it. Clear the SWP_WRITEOK flag for * swapoff here so it's synchronized by both si->lock and * swap_avail_lock, to ensure the result can be seen by * add_to_avail_list. */ lockdep_assert_held(&si->lock); si->flags &= ~SWP_WRITEOK; atomic_long_or(SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages); } else { /* * If not called by swapoff, take it off-list only if it's * full and SWAP_USAGE_OFFLIST_BIT is not set (strictly * si->inuse_pages == pages). Any concurrent slot freeing, or the * device having already been removed from the plist by someone * else, will make the cmpxchg below fail. */ pages = si->pages; if (!atomic_long_try_cmpxchg(&si->inuse_pages, &pages, pages | SWAP_USAGE_OFFLIST_BIT)) goto skip; } plist_del(&si->avail_list, &swap_avail_head); skip: spin_unlock(&swap_avail_lock); } /* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper.
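* In other words, only this helper can put a device back onto the avail
* plist once it has been taken off.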
*/ static void add_to_avail_list(struct swap_info_struct *si, bool swapon) { long val; unsigned long pages; spin_lock(&swap_avail_lock); /* Corresponding to SWP_WRITEOK clearing in del_from_avail_list */ if (swapon) { lockdep_assert_held(&si->lock); si->flags |= SWP_WRITEOK; } else { if (!(READ_ONCE(si->flags) & SWP_WRITEOK)) goto skip; } if (!(atomic_long_read(&si->inuse_pages) & SWAP_USAGE_OFFLIST_BIT)) goto skip; val = atomic_long_fetch_and_relaxed(~SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages); /* * When device is full and device is on the plist, only one updater will * see (inuse_pages == si->pages) and will call del_from_avail_list. If * that updater happen to be here, just skip adding. */ pages = si->pages; if (val == pages) { /* Just like the cmpxchg in del_from_avail_list */ if (atomic_long_try_cmpxchg(&si->inuse_pages, &pages, pages | SWAP_USAGE_OFFLIST_BIT)) goto skip; } plist_add(&si->avail_list, &swap_avail_head); skip: spin_unlock(&swap_avail_lock); } /* * swap_usage_add / swap_usage_sub of each slot are serialized by ci->lock * within each cluster, so the total contribution to the global counter should * always be positive and cannot exceed the total number of usable slots. */ static bool swap_usage_add(struct swap_info_struct *si, unsigned int nr_entries) { long val = atomic_long_add_return_relaxed(nr_entries, &si->inuse_pages); /* * If device is full, and SWAP_USAGE_OFFLIST_BIT is not set, * remove it from the plist. */ if (unlikely(val == si->pages)) { del_from_avail_list(si, false); return true; } return false; } static void swap_usage_sub(struct swap_info_struct *si, unsigned int nr_entries) { long val = atomic_long_sub_return_relaxed(nr_entries, &si->inuse_pages); /* * If device is not full, and SWAP_USAGE_OFFLIST_BIT is set, * add it to the plist. */ if (unlikely(val & SWAP_USAGE_OFFLIST_BIT)) add_to_avail_list(si, false); } static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries) { if (swap_usage_add(si, nr_entries)) { if (vm_swap_full()) schedule_work(&si->reclaim_work); } atomic_long_sub(nr_entries, &nr_swap_pages); } static void swap_range_free(struct swap_info_struct *si, unsigned long offset, unsigned int nr_entries) { unsigned long end = offset + nr_entries - 1; void (*swap_slot_free_notify)(struct block_device *, unsigned long); unsigned int i; /* * Use atomic clear_bit operations only on zeromap instead of non-atomic * bitmap_clear to prevent adjacent bits corruption due to simultaneous writes. */ for (i = 0; i < nr_entries; i++) { clear_bit(offset + i, si->zeromap); zswap_invalidate(swp_entry(si->type, offset + i)); } if (si->flags & SWP_BLKDEV) swap_slot_free_notify = si->bdev->bd_disk->fops->swap_slot_free_notify; else swap_slot_free_notify = NULL; while (offset <= end) { arch_swap_invalidate_page(si->type, offset); if (swap_slot_free_notify) swap_slot_free_notify(si->bdev, offset); offset++; } /* * Make sure that try_to_unuse() observes si->inuse_pages reaching 0 * only after the above cleanups are done. */ smp_wmb(); atomic_long_add(nr_entries, &nr_swap_pages); swap_usage_sub(si, nr_entries); } static bool get_swap_device_info(struct swap_info_struct *si) { if (!percpu_ref_tryget_live(&si->users)) return false; /* * Guarantee the si->users are checked before accessing other * fields of swap_info_struct, and si->flags (SWP_WRITEOK) is * up to dated. * * Paired with the spin_unlock() after setup_swap_info() in * enable_swap_info(), and smp_wmb() in swapoff. 
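* See the comment above get_swap_device() for the full protocol between
* swapoff and swap entry users.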
*/ smp_rmb(); return true; } /* * Fast path try to get swap entries with specified order from current * CPU's swap entry pool (a cluster). */ static bool swap_alloc_fast(struct folio *folio) { unsigned int order = folio_order(folio); struct swap_cluster_info *ci; struct swap_info_struct *si; unsigned int offset; /* * Once allocated, swap_info_struct will never be completely freed, * so checking it's liveness by get_swap_device_info is enough. */ si = this_cpu_read(percpu_swap_cluster.si[order]); offset = this_cpu_read(percpu_swap_cluster.offset[order]); if (!si || !offset || !get_swap_device_info(si)) return false; ci = swap_cluster_lock(si, offset); if (cluster_is_usable(ci, order)) { if (cluster_is_empty(ci)) offset = cluster_offset(si, ci); alloc_swap_scan_cluster(si, ci, folio, offset); } else { swap_cluster_unlock(ci); } put_swap_device(si); return folio_test_swapcache(folio); } /* Rotate the device and switch to a new cluster */ static void swap_alloc_slow(struct folio *folio) { struct swap_info_struct *si, *next; spin_lock(&swap_avail_lock); start_over: plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { /* Rotate the device and switch to a new cluster */ plist_requeue(&si->avail_list, &swap_avail_head); spin_unlock(&swap_avail_lock); if (get_swap_device_info(si)) { cluster_alloc_swap_entry(si, folio); put_swap_device(si); if (folio_test_swapcache(folio)) return; if (folio_test_large(folio)) return; } spin_lock(&swap_avail_lock); /* * if we got here, it's likely that si was almost full before, * multiple callers probably all tried to get a page from the * same si and it filled up before we could get one; or, the si * filled up between us dropping swap_avail_lock. * Since we dropped the swap_avail_lock, the swap_avail_list * may have been modified; so if next is still in the * swap_avail_head list then try it, otherwise start over if we * have not gotten any slots. */ if (plist_node_empty(&next->avail_list)) goto start_over; } spin_unlock(&swap_avail_lock); } /* * Discard pending clusters in a synchronized way when under high pressure. * Return: true if any cluster is discarded. 
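* Context: Called from folio_alloc_swap() when an order 0 allocation finds
* no free slot, so pending discards can be flushed before retrying.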
*/ static bool swap_sync_discard(void) { bool ret = false; struct swap_info_struct *si, *next; spin_lock(&swap_lock); start_over: plist_for_each_entry_safe(si, next, &swap_active_head, list) { spin_unlock(&swap_lock); if (get_swap_device_info(si)) { if (si->flags & SWP_PAGE_DISCARD) ret = swap_do_scheduled_discard(si); put_swap_device(si); } if (ret) return true; spin_lock(&swap_lock); if (plist_node_empty(&next->list)) goto start_over; } spin_unlock(&swap_lock); return false; } static int swap_extend_table_alloc(struct swap_info_struct *si, struct swap_cluster_info *ci, gfp_t gfp) { void *table; table = kzalloc(sizeof(ci->extend_table[0]) * SWAPFILE_CLUSTER, gfp); if (!table) return -ENOMEM; spin_lock(&ci->lock); if (!ci->extend_table) ci->extend_table = table; else kfree(table); spin_unlock(&ci->lock); return 0; } int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp) { int ret; struct swap_info_struct *si; struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); si = get_swap_device(entry); if (!si) return 0; ci = __swap_offset_to_cluster(si, offset); ret = swap_extend_table_alloc(si, ci, gfp); put_swap_device(si); return ret; } static void swap_extend_table_try_free(struct swap_cluster_info *ci) { unsigned long i; bool can_free = true; if (!ci->extend_table) return; for (i = 0; i < SWAPFILE_CLUSTER; i++) { if (ci->extend_table[i]) can_free = false; } if (can_free) { kfree(ci->extend_table); ci->extend_table = NULL; } } /* Decrease the swap count of one slot, without freeing it */ static void __swap_cluster_put_entry(struct swap_cluster_info *ci, unsigned int ci_off) { int count; unsigned long swp_tb; lockdep_assert_held(&ci->lock); swp_tb = __swap_table_get(ci, ci_off); count = __swp_tb_get_count(swp_tb); VM_WARN_ON_ONCE(count <= 0); VM_WARN_ON_ONCE(count > SWP_TB_COUNT_MAX); if (count == SWP_TB_COUNT_MAX) { count = ci->extend_table[ci_off]; /* Overflow starts with SWP_TB_COUNT_MAX */ VM_WARN_ON_ONCE(count < SWP_TB_COUNT_MAX); count--; if (count == (SWP_TB_COUNT_MAX - 1)) { ci->extend_table[ci_off] = 0; __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, count)); swap_extend_table_try_free(ci); } else { ci->extend_table[ci_off] = count; } } else { __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, --count)); } } /** * swap_put_entries_cluster - Decrease the swap count of slots within one cluster * @si: The swap device. * @offset: start offset of slots. * @nr: number of slots. * @reclaim_cache: if true, also reclaim the swap cache if slots are freed. * * This helper decreases the swap count of a set of slots and tries to * batch free them. Also reclaims the swap cache if @reclaim_cache is true. * * Context: The specified slots must be pinned by existing swap count or swap * cache reference, so they won't be released until this helper returns. */ static void swap_put_entries_cluster(struct swap_info_struct *si, pgoff_t offset, int nr, bool reclaim_cache) { struct swap_cluster_info *ci; unsigned int ci_off, ci_end; pgoff_t end = offset + nr; bool need_reclaim = false; unsigned int nr_reclaimed; unsigned long swp_tb; int ci_batch = -1; ci = swap_cluster_lock(si, offset); ci_off = offset % SWAPFILE_CLUSTER; ci_end = ci_off + nr; do { swp_tb = __swap_table_get(ci, ci_off); if (swp_tb_get_count(swp_tb) == 1) { /* count == 1 and non-cached slots will be batch freed. 
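* ci_batch records where the current batch starts; the batch is flushed via
* __swap_cluster_free_entries() when it is interrupted or the scan ends.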
*/ if (!swp_tb_is_folio(swp_tb)) { if (ci_batch == -1) ci_batch = ci_off; continue; } /* count will be 0 after put, slot can be reclaimed */ need_reclaim = true; } /* * A count != 1 or cached slot can't be freed. Put its swap * count and then free the interrupted pending batch. Cached * slots will be freed when folio is removed from swap cache * (__swap_cache_del_folio). */ __swap_cluster_put_entry(ci, ci_off); if (ci_batch != -1) { __swap_cluster_free_entries(si, ci, ci_batch, ci_off - ci_batch); ci_batch = -1; } } while (++ci_off < ci_end); if (ci_batch != -1) __swap_cluster_free_entries(si, ci, ci_batch, ci_off - ci_batch); swap_cluster_unlock(ci); if (!need_reclaim || !reclaim_cache) return; do { nr_reclaimed = __try_to_reclaim_swap(si, offset, TTRS_UNMAPPED | TTRS_FULL); offset++; if (nr_reclaimed) offset = round_up(offset, abs(nr_reclaimed)); } while (offset < end); } /* Increase the swap count of one slot. */ static int __swap_cluster_dup_entry(struct swap_cluster_info *ci, unsigned int ci_off) { int count; unsigned long swp_tb; lockdep_assert_held(&ci->lock); swp_tb = __swap_table_get(ci, ci_off); /* Bad or special slots can't be handled */ if (WARN_ON_ONCE(swp_tb_is_bad(swp_tb))) return -EINVAL; count = __swp_tb_get_count(swp_tb); /* Must be either cached or have a count already */ if (WARN_ON_ONCE(!count && !swp_tb_is_folio(swp_tb))) return -ENOENT; if (likely(count < (SWP_TB_COUNT_MAX - 1))) { __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, count + 1)); VM_WARN_ON_ONCE(ci->extend_table && ci->extend_table[ci_off]); } else if (count == (SWP_TB_COUNT_MAX - 1)) { if (ci->extend_table) { VM_WARN_ON_ONCE(ci->extend_table[ci_off]); ci->extend_table[ci_off] = SWP_TB_COUNT_MAX; __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, SWP_TB_COUNT_MAX)); } else { return -ENOMEM; } } else if (count == SWP_TB_COUNT_MAX) { VM_WARN_ON_ONCE(ci->extend_table[ci_off] >= type_max(typeof(ci->extend_table[0]))); ++ci->extend_table[ci_off]; } else { /* Never happens unless counting went wrong */ WARN_ON_ONCE(1); } return 0; } /** * swap_dup_entries_cluster: Increase the swap count of slots within one cluster. * @si: The swap device. * @offset: start offset of slots. * @nr: number of slots. * * Context: The specified slots must be pinned by existing swap count or swap * cache reference, so they won't be released until this helper returns. * Return: 0 on success. -ENOMEM if the swap count maxed out (SWP_TB_COUNT_MAX) * and failed to allocate an extended table, -EINVAL if any entry is bad entry. */ static int swap_dup_entries_cluster(struct swap_info_struct *si, pgoff_t offset, int nr) { int err; struct swap_cluster_info *ci; unsigned int ci_start, ci_off, ci_end; ci_start = offset % SWAPFILE_CLUSTER; ci_end = ci_start + nr; ci_off = ci_start; ci = swap_cluster_lock(si, offset); restart: do { err = __swap_cluster_dup_entry(ci, ci_off); if (unlikely(err)) { if (err == -ENOMEM) { spin_unlock(&ci->lock); err = swap_extend_table_alloc(si, ci, GFP_ATOMIC); spin_lock(&ci->lock); if (!err) goto restart; } goto failed; } } while (++ci_off < ci_end); swap_cluster_unlock(ci); return 0; failed: while (ci_off-- > ci_start) __swap_cluster_put_entry(ci, ci_off); swap_extend_table_try_free(ci); swap_cluster_unlock(ci); return err; } /** * folio_alloc_swap - allocate swap space for a folio * @folio: folio we want to move to swap * * Allocate swap space for the folio and add the folio to the * swap cache. * * Context: Caller needs to hold the folio lock. * Return: Whether the folio was added to the swap cache. 
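* 0 on success, or a negative error code (-EAGAIN, -EINVAL or -ENOMEM)
* otherwise.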
*/ int folio_alloc_swap(struct folio *folio) { unsigned int order = folio_order(folio); unsigned int size = 1 << order; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio); if (order) { /* * Reject large allocations when THP_SWAP is disabled; * the caller should split the folio and try again. */ if (!IS_ENABLED(CONFIG_THP_SWAP)) return -EAGAIN; /* * Allocation size should never exceed cluster size * (HPAGE_PMD_SIZE). */ if (size > SWAPFILE_CLUSTER) { VM_WARN_ON_ONCE(1); return -EINVAL; } } again: local_lock(&percpu_swap_cluster.lock); if (!swap_alloc_fast(folio)) swap_alloc_slow(folio); local_unlock(&percpu_swap_cluster.lock); if (!order && unlikely(!folio_test_swapcache(folio))) { if (swap_sync_discard()) goto again; } /* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */ if (unlikely(mem_cgroup_try_charge_swap(folio, folio->swap))) swap_cache_del_folio(folio); if (unlikely(!folio_test_swapcache(folio))) return -ENOMEM; return 0; } /** * folio_dup_swap() - Increase swap count of swap entries of a folio. * @folio: folio with swap entries bound to it. * @subpage: if not NULL, only increase the swap count of this subpage. * * Typically called when the folio is being unmapped and its swap entries * take its place: swap entries allocated to a folio start with count == 0 * and are pinned by the swap cache. The swap cache pin doesn't increase the * swap count. This helper sets the initial count == 1 and increases the * count as the folio is unmapped and swap entries referencing the slots are * generated to replace the folio. * * Context: Caller must ensure the folio is locked and in the swap cache. * NOTE: The caller also has to ensure there is no racing call to * swap_put_entries_direct on its swap entries before this helper returns, or * the swap count may underflow. */ int folio_dup_swap(struct folio *folio, struct page *subpage) { swp_entry_t entry = folio->swap; unsigned long nr_pages = folio_nr_pages(folio); VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); VM_WARN_ON_FOLIO(!folio_test_swapcache(folio), folio); if (subpage) { entry.val += folio_page_idx(folio, subpage); nr_pages = 1; } return swap_dup_entries_cluster(swap_entry_to_info(entry), swp_offset(entry), nr_pages); } /** * folio_put_swap() - Decrease swap count of swap entries of a folio. * @folio: folio with swap entries bound to it, must be in the swap cache and locked. * @subpage: if not NULL, only decrease the swap count of this subpage. * * This won't free the swap slots even if the swap count drops to zero; they * are still pinned by the swap cache. The caller may call folio_free_swap() * to free them. * Context: Caller must ensure the folio is locked and in the swap cache. */ void folio_put_swap(struct folio *folio, struct page *subpage) { swp_entry_t entry = folio->swap; unsigned long nr_pages = folio_nr_pages(folio); struct swap_info_struct *si = __swap_entry_to_info(entry); VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); VM_WARN_ON_FOLIO(!folio_test_swapcache(folio), folio); if (subpage) { entry.val += folio_page_idx(folio, subpage); nr_pages = 1; } swap_put_entries_cluster(si, swp_offset(entry), nr_pages, false); } /* * When we get a swap entry, if there aren't some other ways to * prevent swapoff, such as the folio in the swap cache being locked, or the * RCU read lock being held, etc., the swap entry may become invalid * because of swapoff.
Then, we need to enclose all swap related * functions with get_swap_device() and put_swap_device(), unless the * swap functions call get/put_swap_device() by themselves. * * RCU reader side lock (including any spinlock) is sufficient to * prevent swapoff, because synchronize_rcu() is called in swapoff() * before freeing data structures. * * Check whether swap entry is valid in the swap device. If so, * return pointer to swap_info_struct, and keep the swap entry valid * via preventing the swap device from being swapoff, until * put_swap_device() is called. Otherwise return NULL. * * Notice that swapoff or swapoff+swapon can still happen before the * percpu_ref_tryget_live() in get_swap_device() or after the * percpu_ref_put() in put_swap_device() if there isn't any other way * to prevent swapoff. The caller must be prepared for that. For * example, the following situation is possible. * * CPU1 CPU2 * do_swap_page() * ... swapoff+swapon * swap_cache_alloc_folio() * swap_cache_add_folio() * // check swap_map * // verify PTE not changed * * In __swap_duplicate(), the swap_map need to be checked before * changing partly because the specified swap entry may be for another * swap device which has been swapoff. And in do_swap_page(), after * the page is read from the swap device, the PTE is verified not * changed with the page table locked to check whether the swap device * has been swapoff or swapoff+swapon. */ struct swap_info_struct *get_swap_device(swp_entry_t entry) { struct swap_info_struct *si; unsigned long offset; if (!entry.val) goto out; si = swap_entry_to_info(entry); if (!si) goto bad_nofile; if (!get_swap_device_info(si)) goto out; offset = swp_offset(entry); if (offset >= si->max) goto put_out; return si; bad_nofile: pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); out: return NULL; put_out: pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val); percpu_ref_put(&si->users); return NULL; } /* * Free a set of swap slots after their swap count dropped to zero, or will be * zero after putting the last ref (saves one __swap_cluster_put_entry call). */ void __swap_cluster_free_entries(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned int ci_start, unsigned int nr_pages) { unsigned long old_tb; unsigned int ci_off = ci_start, ci_end = ci_start + nr_pages; unsigned long offset = cluster_offset(si, ci) + ci_start; VM_WARN_ON(ci->count < nr_pages); ci->count -= nr_pages; do { old_tb = __swap_table_get(ci, ci_off); /* Release the last ref, or after swap cache is dropped */ VM_WARN_ON(!swp_tb_is_shadow(old_tb) || __swp_tb_get_count(old_tb) > 1); __swap_table_set(ci, ci_off, null_to_swp_tb()); } while (++ci_off < ci_end); mem_cgroup_uncharge_swap(swp_entry(si->type, offset), nr_pages); swap_range_free(si, offset, nr_pages); swap_cluster_assert_empty(ci, ci_start, nr_pages, false); if (!ci->count) free_cluster(si, ci); else partial_free_cluster(si, ci); } int __swap_count(swp_entry_t entry) { struct swap_cluster_info *ci = __swap_entry_to_cluster(entry); unsigned int ci_off = swp_cluster_offset(entry); return swp_tb_get_count(__swap_table_get(ci, ci_off)); } /** * swap_entry_swapped - Check if the swap entry is swapped. * @si: the swap device. * @entry: the swap entry. 
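* * Return: true if the slot's swap count is greater than zero.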
*/ bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) { pgoff_t offset = swp_offset(entry); struct swap_cluster_info *ci; unsigned long swp_tb; ci = swap_cluster_lock(si, offset); swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER); swap_cluster_unlock(ci); return swp_tb_get_count(swp_tb) > 0; } /* * How many references to @entry are currently swapped out? * This returns exact answer. */ int swp_swapcount(swp_entry_t entry) { struct swap_info_struct *si; struct swap_cluster_info *ci; unsigned long swp_tb; int count; si = get_swap_device(entry); if (!si) return 0; ci = swap_cluster_lock(si, swp_offset(entry)); swp_tb = __swap_table_get(ci, swp_cluster_offset(entry)); count = swp_tb_get_count(swp_tb); if (count == SWP_TB_COUNT_MAX) count = ci->extend_table[swp_cluster_offset(entry)]; swap_cluster_unlock(ci); put_swap_device(si); return count < 0 ? 0 : count; } /* * folio_maybe_swapped - Test if a folio covers any swap slot with count > 0. * * Check if a folio is swapped. Holding the folio lock ensures the folio won't * go from not-swapped to swapped because the initial swap count increment can * only be done by folio_dup_swap, which also locks the folio. But a concurrent * decrease of swap count is possible through swap_put_entries_direct, so this * may return a false positive. * * Context: Caller must ensure the folio is locked and in the swap cache. */ static bool folio_maybe_swapped(struct folio *folio) { swp_entry_t entry = folio->swap; struct swap_cluster_info *ci; unsigned int ci_off, ci_end; bool ret = false; VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); ci = __swap_entry_to_cluster(entry); ci_off = swp_cluster_offset(entry); ci_end = ci_off + folio_nr_pages(folio); /* * Extra locking not needed, folio lock ensures its swap entries * won't be released, the backing data won't be gone either. */ rcu_read_lock(); do { if (__swp_tb_get_count(__swap_table_get(ci, ci_off))) { ret = true; break; } } while (++ci_off < ci_end); rcu_read_unlock(); return ret; } static bool folio_swapcache_freeable(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (!folio_test_swapcache(folio)) return false; if (folio_test_writeback(folio)) return false; /* * Once hibernation has begun to create its image of memory, * there's a danger that one of the calls to folio_free_swap() * - most probably a call from __try_to_reclaim_swap() while * hibernation is allocating its own swap pages for the image, * but conceivably even a call from memory reclaim - will free * the swap from a folio which has already been recorded in the * image as a clean swapcache folio, and then reuse its swap for * another page of the image. On waking from hibernation, the * original folio might be freed under memory pressure, then * later read back in from swap, now with the wrong data. * * Hibernation suspends storage while it is writing the image * to disk so check that here. */ if (pm_suspended_storage()) return false; return true; } /** * folio_free_swap() - Free the swap space used for this folio. * @folio: The folio to remove. * * If swap is getting full, or if there are no more mappings of this folio, * then call folio_free_swap to free its swap space. * * Return: true if we were able to release the swap space. 
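* * Context: Caller must hold the folio lock.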
*/ bool folio_free_swap(struct folio *folio) { if (!folio_swapcache_freeable(folio)) return false; if (folio_maybe_swapped(folio)) return false; swap_cache_del_folio(folio); folio_set_dirty(folio); return true; } /** * swap_put_entries_direct() - Release reference on range of swap entries and * reclaim their cache if no more references remain. * @entry: First entry of range. * @nr: Number of entries in range. * * For each swap entry in the contiguous range, release a reference. If any swap * entries become free, try to reclaim their underlying folios, if present. The * offset range is defined by [entry.offset, entry.offset + nr). * * Context: Caller must ensure there is no race condition on the reference * owner. e.g., locking the PTL of a PTE containing the entry being released. */ void swap_put_entries_direct(swp_entry_t entry, int nr) { const unsigned long start_offset = swp_offset(entry); const unsigned long end_offset = start_offset + nr; unsigned long offset, cluster_end; struct swap_info_struct *si; si = get_swap_device(entry); if (WARN_ON_ONCE(!si)) return; if (WARN_ON_ONCE(end_offset > si->max)) goto out; /* Put entries and reclaim cache in each cluster */ offset = start_offset; do { cluster_end = min(round_up(offset + 1, SWAPFILE_CLUSTER), end_offset); swap_put_entries_cluster(si, offset, cluster_end - offset, true); offset = cluster_end; } while (offset < end_offset); out: put_swap_device(si); } #ifdef CONFIG_HIBERNATION /* Allocate a slot for hibernation */ swp_entry_t swap_alloc_hibernation_slot(int type) { struct swap_info_struct *pcp_si, *si = swap_type_to_info(type); unsigned long pcp_offset, offset = SWAP_ENTRY_INVALID; struct swap_cluster_info *ci; swp_entry_t entry = {0}; if (!si) goto fail; /* This is called for allocating swap entry, not cache */ if (get_swap_device_info(si)) { if (si->flags & SWP_WRITEOK) { /* * Try the local cluster first if it matches the device. If * not, try grab a new cluster and override local cluster. */ local_lock(&percpu_swap_cluster.lock); pcp_si = this_cpu_read(percpu_swap_cluster.si[0]); pcp_offset = this_cpu_read(percpu_swap_cluster.offset[0]); if (pcp_si == si && pcp_offset) { ci = swap_cluster_lock(si, pcp_offset); if (cluster_is_usable(ci, 0)) offset = alloc_swap_scan_cluster(si, ci, NULL, pcp_offset); else swap_cluster_unlock(ci); } if (!offset) offset = cluster_alloc_swap_entry(si, NULL); local_unlock(&percpu_swap_cluster.lock); if (offset) entry = swp_entry(si->type, offset); } put_swap_device(si); } fail: return entry; } /* Free a slot allocated by swap_alloc_hibernation_slot */ void swap_free_hibernation_slot(swp_entry_t entry) { struct swap_info_struct *si; struct swap_cluster_info *ci; pgoff_t offset = swp_offset(entry); si = get_swap_device(entry); if (WARN_ON(!si)) return; ci = swap_cluster_lock(si, offset); __swap_cluster_put_entry(ci, offset % SWAPFILE_CLUSTER); __swap_cluster_free_entries(si, ci, offset % SWAPFILE_CLUSTER, 1); swap_cluster_unlock(ci); /* In theory readahead might add it to the swap cache by accident */ __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); put_swap_device(si); } /* * Find the swap type that corresponds to given device (if any). * * @offset - number of the PAGE_SIZE-sized block of the device, starting * from 0, in which the swap header is expected to be located. * * This is needed for the suspend to disk (aka swsusp). 
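* * Return: the swap type on success, or a negative value if no matching
* device is found.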
*/ int swap_type_of(dev_t device, sector_t offset) { int type; if (!device) return -1; spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { struct swap_info_struct *sis = swap_info[type]; if (!(sis->flags & SWP_WRITEOK)) continue; if (device == sis->bdev->bd_dev) { struct swap_extent *se = first_se(sis); if (se->start_block == offset) { spin_unlock(&swap_lock); return type; } } } spin_unlock(&swap_lock); return -ENODEV; } int find_first_swap(dev_t *device) { int type; spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { struct swap_info_struct *sis = swap_info[type]; if (!(sis->flags & SWP_WRITEOK)) continue; *device = sis->bdev->bd_dev; spin_unlock(&swap_lock); return type; } spin_unlock(&swap_lock); return -ENODEV; } /* * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev * corresponding to given index in swap_info (swap type). */ sector_t swapdev_block(int type, pgoff_t offset) { struct swap_info_struct *si = swap_type_to_info(type); struct swap_extent *se; if (!si || !(si->flags & SWP_WRITEOK)) return 0; se = offset_to_swap_extent(si, offset); return se->start_block + (offset - se->start_page); } /* * Return either the total number of swap pages of given type, or the number * of free pages of that type (depending on @free) * * This is needed for software suspend */ unsigned int count_swap_pages(int type, int free) { unsigned int n = 0; spin_lock(&swap_lock); if ((unsigned int)type < nr_swapfiles) { struct swap_info_struct *sis = swap_info[type]; spin_lock(&sis->lock); if (sis->flags & SWP_WRITEOK) { n = sis->pages; if (free) n -= swap_usage_in_pages(sis); } spin_unlock(&sis->lock); } spin_unlock(&swap_lock); return n; } #endif /* CONFIG_HIBERNATION */ static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) { return pte_same(pte_swp_clear_flags(pte), swp_pte); } /* * No need to decide whether this PTE shares the swap entry with others, * just let do_wp_page work it out if a write is requested later - to * force COW, vm_page_prot omits write permission from any private vma. */ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, swp_entry_t entry, struct folio *folio) { struct page *page; struct folio *swapcache; spinlock_t *ptl; pte_t *pte, new_pte, old_pte; bool hwpoisoned = false; int ret = 1; /* * If the folio is removed from swap cache by others, continue to * unuse other PTEs. try_to_unuse may try again if we missed this one. */ if (!folio_matches_swap_entry(folio, entry)) return 0; swapcache = folio; folio = ksm_might_need_to_copy(folio, vma, addr); if (unlikely(!folio)) return -ENOMEM; else if (unlikely(folio == ERR_PTR(-EHWPOISON))) { hwpoisoned = true; folio = swapcache; } page = folio_file_page(folio, swp_offset(entry)); if (PageHWPoison(page)) hwpoisoned = true; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte), swp_entry_to_pte(entry)))) { ret = 0; goto out; } old_pte = ptep_get(pte); if (unlikely(hwpoisoned || !folio_test_uptodate(folio))) { swp_entry_t swp_entry; dec_mm_counter(vma->vm_mm, MM_SWAPENTS); if (hwpoisoned) { swp_entry = make_hwpoison_entry(page); } else { swp_entry = make_poisoned_swp_entry(); } new_pte = swp_entry_to_pte(swp_entry); ret = 0; goto setpte; } /* * Some architectures may have to restore extra metadata to the page * when reading from swap. This metadata may be indexed by swap entry * so this must be called before folio_put_swap(). 
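* (arm64, for example, uses this to restore MTE tags saved at swapout time.)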
*/ arch_swap_restore(folio_swap(entry, folio), folio); dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); folio_get(folio); if (folio == swapcache) { rmap_t rmap_flags = RMAP_NONE; /* * See do_swap_page(): writeback would be problematic. * However, we do a folio_wait_writeback() just before this * call and have the folio locked. */ VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); if (pte_swp_exclusive(old_pte)) rmap_flags |= RMAP_EXCLUSIVE; /* * We currently only expect small !anon folios, which are either * fully exclusive or fully shared. If we ever get large folios * here, we have to be careful. */ if (!folio_test_anon(folio)) { VM_WARN_ON_ONCE(folio_test_large(folio)); VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); folio_add_new_anon_rmap(folio, vma, addr, rmap_flags); } else { folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags); } } else { /* ksm created a completely new copy */ folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); } new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot)); if (pte_swp_soft_dirty(old_pte)) new_pte = pte_mksoft_dirty(new_pte); if (pte_swp_uffd_wp(old_pte)) new_pte = pte_mkuffd_wp(new_pte); setpte: set_pte_at(vma->vm_mm, addr, pte, new_pte); folio_put_swap(swapcache, folio_file_page(swapcache, swp_offset(entry))); out: if (pte) pte_unmap_unlock(pte, ptl); if (folio != swapcache) { folio_unlock(folio); folio_put(folio); } return ret; } static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned int type) { pte_t *pte = NULL; do { struct folio *folio; unsigned long swp_tb; softleaf_t entry; int ret; pte_t ptent; if (!pte++) { pte = pte_offset_map(pmd, addr); if (!pte) break; } ptent = ptep_get_lockless(pte); entry = softleaf_from_pte(ptent); if (!softleaf_is_swap(entry)) continue; if (swp_type(entry) != type) continue; pte_unmap(pte); pte = NULL; folio = swap_cache_get_folio(entry); if (!folio) { struct vm_fault vmf = { .vma = vma, .address = addr, .real_address = addr, .pmd = pmd, }; folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf); } if (!folio) { swp_tb = swap_table_get(__swap_entry_to_cluster(entry), swp_cluster_offset(entry)); if (swp_tb_get_count(swp_tb) <= 0) continue; return -ENOMEM; } folio_lock(folio); folio_wait_writeback(folio); ret = unuse_pte(vma, pmd, addr, entry, folio); if (ret < 0) { folio_unlock(folio); folio_put(folio); return ret; } folio_free_swap(folio); folio_unlock(folio); folio_put(folio); } while (addr += PAGE_SIZE, addr != end); if (pte) pte_unmap(pte); return 0; } static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, unsigned int type) { pmd_t *pmd; unsigned long next; int ret; pmd = pmd_offset(pud, addr); do { cond_resched(); next = pmd_addr_end(addr, end); ret = unuse_pte_range(vma, pmd, addr, next, type); if (ret) return ret; } while (pmd++, addr = next, addr != end); return 0; } static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, unsigned int type) { pud_t *pud; unsigned long next; int ret; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; ret = unuse_pmd_range(vma, pud, addr, next, type); if (ret) return ret; } while (pud++, addr = next, addr != end); return 0; } static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, unsigned int type) { p4d_t 
*p4d; unsigned long next; int ret; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; ret = unuse_pud_range(vma, p4d, addr, next, type); if (ret) return ret; } while (p4d++, addr = next, addr != end); return 0; } static int unuse_vma(struct vm_area_struct *vma, unsigned int type) { pgd_t *pgd; unsigned long addr, end, next; int ret; addr = vma->vm_start; end = vma->vm_end; pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; ret = unuse_p4d_range(vma, pgd, addr, next, type); if (ret) return ret; } while (pgd++, addr = next, addr != end); return 0; } static int unuse_mm(struct mm_struct *mm, unsigned int type) { struct vm_area_struct *vma; int ret = 0; VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); if (check_stable_address_space(mm)) goto unlock; for_each_vma(vmi, vma) { if (vma->anon_vma && !is_vm_hugetlb_page(vma)) { ret = unuse_vma(vma, type); if (ret) break; } cond_resched(); } unlock: mmap_read_unlock(mm); return ret; } /* * Scan swap table from current position to next entry still in use. * Return 0 if there are no inuse entries after prev till end of * the map. */ static unsigned int find_next_to_unuse(struct swap_info_struct *si, unsigned int prev) { unsigned int i; unsigned long swp_tb; /* * No need for swap_lock here: we're just looking * for whether an entry is in use, not modifying it; false * hits are okay, and sys_swapoff() has already prevented new * allocations from this area (while holding swap_lock). */ for (i = prev + 1; i < si->max; i++) { swp_tb = swap_table_get(__swap_offset_to_cluster(si, i), i % SWAPFILE_CLUSTER); if (!swp_tb_is_null(swp_tb) && !swp_tb_is_bad(swp_tb)) break; if ((i % LATENCY_LIMIT) == 0) cond_resched(); } if (i == si->max) i = 0; return i; } static int try_to_unuse(unsigned int type) { struct mm_struct *prev_mm; struct mm_struct *mm; struct list_head *p; int retval = 0; struct swap_info_struct *si = swap_info[type]; struct folio *folio; swp_entry_t entry; unsigned int i; if (!swap_usage_in_pages(si)) goto success; retry: retval = shmem_unuse(type); if (retval) return retval; prev_mm = &init_mm; mmget(prev_mm); spin_lock(&mmlist_lock); p = &init_mm.mmlist; while (swap_usage_in_pages(si) && !signal_pending(current) && (p = p->next) != &init_mm.mmlist) { mm = list_entry(p, struct mm_struct, mmlist); if (!mmget_not_zero(mm)) continue; spin_unlock(&mmlist_lock); mmput(prev_mm); prev_mm = mm; retval = unuse_mm(mm, type); if (retval) { mmput(prev_mm); return retval; } /* * Make sure that we aren't completely killing * interactive performance. */ cond_resched(); spin_lock(&mmlist_lock); } spin_unlock(&mmlist_lock); mmput(prev_mm); i = 0; while (swap_usage_in_pages(si) && !signal_pending(current) && (i = find_next_to_unuse(si, i)) != 0) { entry = swp_entry(type, i); folio = swap_cache_get_folio(entry); if (!folio) continue; /* * It is conceivable that a racing task removed this folio from * swap cache just before we acquired the page lock. The folio * might even be back in swap cache on another swap area. But * that is okay, folio_free_swap() only removes stale folios. */ folio_lock(folio); folio_wait_writeback(folio); folio_free_swap(folio); folio_unlock(folio); folio_put(folio); } /* * Lets check again to see if there are still swap entries in the map. * If yes, we would need to do retry the unuse logic again. 
* Under global memory pressure, swap entries can be reinserted back * into process space after the mmlist loop above passes over them. * * Limit the number of retries? No: when mmget_not_zero() * above fails, that mm is likely to be freeing swap from * exit_mmap(), which proceeds at its own independent pace; * and even shmem_writeout() could have been preempted after * folio_alloc_swap(), temporarily hiding that swap. It's easy * and robust (though cpu-intensive) just to keep retrying. */ if (swap_usage_in_pages(si)) { if (!signal_pending(current)) goto retry; return -EINTR; } success: /* * Make sure that further cleanups after try_to_unuse() returns happen * after swap_range_free() reduces si->inuse_pages to 0. */ smp_mb(); return 0; } /* * After a successful try_to_unuse, if no swap is now in use, we know * we can empty the mmlist. swap_lock must be held on entry and exit. * Note that mmlist_lock nests inside swap_lock, and an mm must be * added to the mmlist just after page_duplicate - before would be racy. */ static void drain_mmlist(void) { struct list_head *p, *next; unsigned int type; for (type = 0; type < nr_swapfiles; type++) if (swap_usage_in_pages(swap_info[type])) return; spin_lock(&mmlist_lock); list_for_each_safe(p, next, &init_mm.mmlist) list_del_init(p); spin_unlock(&mmlist_lock); } /* * Free all of a swapdev's extent information */ static void destroy_swap_extents(struct swap_info_struct *sis, struct file *swap_file) { while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) { struct rb_node *rb = sis->swap_extent_root.rb_node; struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node); rb_erase(rb, &sis->swap_extent_root); kfree(se); } if (sis->flags & SWP_ACTIVATED) { struct address_space *mapping = swap_file->f_mapping; sis->flags &= ~SWP_ACTIVATED; if (mapping->a_ops->swap_deactivate) mapping->a_ops->swap_deactivate(swap_file); } } /* * Add a block range (and the corresponding page range) into this swapdev's * extent tree. * * This function rather assumes that it is called in ascending page order. */ int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block) { struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL; struct swap_extent *se; struct swap_extent *new_se; /* * place the new node at the right most since the * function is called in ascending page order. */ while (*link) { parent = *link; link = &parent->rb_right; } if (parent) { se = rb_entry(parent, struct swap_extent, rb_node); BUG_ON(se->start_page + se->nr_pages != start_page); if (se->start_block + se->nr_pages == start_block) { /* Merge it */ se->nr_pages += nr_pages; return 0; } } /* No merge, insert a new extent. */ new_se = kmalloc_obj(*se); if (new_se == NULL) return -ENOMEM; new_se->start_page = start_page; new_se->nr_pages = nr_pages; new_se->start_block = start_block; rb_link_node(&new_se->rb_node, parent, link); rb_insert_color(&new_se->rb_node, &sis->swap_extent_root); return 1; } EXPORT_SYMBOL_GPL(add_swap_extent); /* * A `swap extent' is a simple thing which maps a contiguous range of pages * onto a contiguous range of disk blocks. A rbtree of swap extents is * built at swapon time and is then used at swap_writepage/swap_read_folio * time for locating where on disk a page belongs. * * If the swapfile is an S_ISBLK block device, a single extent is installed. * This is done so that the main operating code can treat S_ISBLK and S_ISREG * swap files identically. 
* * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap * extent rbtree operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK * swapfiles are handled *identically* after swapon time. * * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks * and will parse them into a rbtree, in PAGE_SIZE chunks. If some stray * blocks are found which do not fall within the PAGE_SIZE alignment * requirements, they are simply tossed out - we will never use those blocks * for swapping. * * For all swap devices we set S_SWAPFILE across the life of the swapon. This * prevents users from writing to the swap device, which will corrupt memory. * * The amount of disk space which a single swap extent represents varies. * Typically it is in the 1-4 megabyte range. So we can have hundreds of * extents in the rbtree. - akpm. */ static int setup_swap_extents(struct swap_info_struct *sis, struct file *swap_file, sector_t *span) { struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; int ret; if (S_ISBLK(inode->i_mode)) { ret = add_swap_extent(sis, 0, sis->max, 0); *span = sis->pages; return ret; } if (mapping->a_ops->swap_activate) { ret = mapping->a_ops->swap_activate(sis, swap_file, span); if (ret < 0) return ret; sis->flags |= SWP_ACTIVATED; if ((sis->flags & SWP_FS_OPS) && sio_pool_init() != 0) { destroy_swap_extents(sis, swap_file); return -ENOMEM; } return ret; } return generic_swapfile_activate(sis, swap_file, span); } static void _enable_swap_info(struct swap_info_struct *si) { atomic_long_add(si->pages, &nr_swap_pages); total_swap_pages += si->pages; assert_spin_locked(&swap_lock); plist_add(&si->list, &swap_active_head); /* Add back to available list */ add_to_avail_list(si, true); } /* * Called after the swap device is ready, resurrect its percpu ref, it's now * safe to reference it. Add it to the list to expose it to the allocator. */ static void enable_swap_info(struct swap_info_struct *si) { percpu_ref_resurrect(&si->users); spin_lock(&swap_lock); spin_lock(&si->lock); _enable_swap_info(si); spin_unlock(&si->lock); spin_unlock(&swap_lock); } static void reinsert_swap_info(struct swap_info_struct *si) { spin_lock(&swap_lock); spin_lock(&si->lock); _enable_swap_info(si); spin_unlock(&si->lock); spin_unlock(&swap_lock); } /* * Called after clearing SWP_WRITEOK, ensures cluster_alloc_range * see the updated flags, so there will be no more allocations. */ static void wait_for_allocation(struct swap_info_struct *si) { unsigned long offset; unsigned long end = ALIGN(si->max, SWAPFILE_CLUSTER); struct swap_cluster_info *ci; BUG_ON(si->flags & SWP_WRITEOK); for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) { ci = swap_cluster_lock(si, offset); swap_cluster_unlock(ci); } } static void free_swap_cluster_info(struct swap_cluster_info *cluster_info, unsigned long maxpages) { struct swap_cluster_info *ci; int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); if (!cluster_info) return; for (i = 0; i < nr_clusters; i++) { ci = cluster_info + i; /* Cluster with bad marks count will have a remaining table */ spin_lock(&ci->lock); if (rcu_dereference_protected(ci->table, true)) { swap_cluster_assert_empty(ci, 0, SWAPFILE_CLUSTER, true); swap_cluster_free_table(ci); } spin_unlock(&ci->lock); } kvfree(cluster_info); } /* * Called after swap device's reference count is dead, so * neither scan nor allocation will use it. 
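* Only stale percpu cluster pointers may still reference the device; flush
* them so the allocation fast path stops picking this device.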
*/ static void flush_percpu_swap_cluster(struct swap_info_struct *si) { int cpu, i; struct swap_info_struct **pcp_si; for_each_possible_cpu(cpu) { pcp_si = per_cpu_ptr(percpu_swap_cluster.si, cpu); /* * Invalidate the percpu swap cluster cache, si->users * is dead, so no new user will point to it, just flush * any existing user. */ for (i = 0; i < SWAP_NR_ORDERS; i++) cmpxchg(&pcp_si[i], si, NULL); } } SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; unsigned long *zeromap; struct swap_cluster_info *cluster_info; struct file *swap_file, *victim; struct address_space *mapping; struct inode *inode; unsigned int maxpages; int err, found = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; BUG_ON(!current->mm); CLASS(filename, pathname)(specialfile); victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0); if (IS_ERR(victim)) return PTR_ERR(victim); mapping = victim->f_mapping; spin_lock(&swap_lock); plist_for_each_entry(p, &swap_active_head, list) { if (p->flags & SWP_WRITEOK) { if (p->swap_file->f_mapping == mapping) { found = 1; break; } } } if (!found) { err = -EINVAL; spin_unlock(&swap_lock); goto out_dput; } if (!security_vm_enough_memory_mm(current->mm, p->pages)) vm_unacct_memory(p->pages); else { err = -ENOMEM; spin_unlock(&swap_lock); goto out_dput; } spin_lock(&p->lock); del_from_avail_list(p, true); plist_del(&p->list, &swap_active_head); atomic_long_sub(p->pages, &nr_swap_pages); total_swap_pages -= p->pages; spin_unlock(&p->lock); spin_unlock(&swap_lock); wait_for_allocation(p); set_current_oom_origin(); err = try_to_unuse(p->type); clear_current_oom_origin(); if (err) { /* re-insert swap space back into swap_list */ reinsert_swap_info(p); goto out_dput; } /* * Wait for swap operations protected by get/put_swap_device() * to complete. Because of synchronize_rcu() here, all swap * operations protected by RCU reader side lock (including any * spinlock) will be waited too. This makes it easy to * prevent folio_test_swapcache() and the following swap cache * operations from racing with swapoff. */ percpu_ref_kill(&p->users); synchronize_rcu(); wait_for_completion(&p->comp); flush_work(&p->discard_work); flush_work(&p->reclaim_work); flush_percpu_swap_cluster(p); destroy_swap_extents(p, p->swap_file); if (!(p->flags & SWP_SOLIDSTATE)) atomic_dec(&nr_rotate_swap); mutex_lock(&swapon_mutex); spin_lock(&swap_lock); spin_lock(&p->lock); drain_mmlist(); swap_file = p->swap_file; p->swap_file = NULL; zeromap = p->zeromap; p->zeromap = NULL; maxpages = p->max; cluster_info = p->cluster_info; p->max = 0; p->cluster_info = NULL; spin_unlock(&p->lock); spin_unlock(&swap_lock); arch_swap_invalidate_area(p->type); zswap_swapoff(p->type); mutex_unlock(&swapon_mutex); kfree(p->global_cluster); p->global_cluster = NULL; kvfree(zeromap); free_swap_cluster_info(cluster_info, maxpages); /* Destroy swap account information */ swap_cgroup_swapoff(p->type); inode = mapping->host; inode_lock(inode); inode->i_flags &= ~S_SWAPFILE; inode_unlock(inode); filp_close(swap_file, NULL); /* * Clear the SWP_USED flag after all resources are freed so that swapon * can reuse this swap_info in alloc_swap_info() safely. It is ok to * not hold p->lock after we cleared its SWP_WRITEOK. 
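* (alloc_swap_info() only examines SWP_USED while holding swap_lock, which
* is taken below before clearing p->flags.)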
*/ spin_lock(&swap_lock); p->flags = 0; spin_unlock(&swap_lock); err = 0; atomic_inc(&proc_poll_event); wake_up_interruptible(&proc_poll_wait); out_dput: filp_close(victim, NULL); return err; } #ifdef CONFIG_PROC_FS static __poll_t swaps_poll(struct file *file, poll_table *wait) { struct seq_file *seq = file->private_data; poll_wait(file, &proc_poll_wait, wait); if (seq->poll_event != atomic_read(&proc_poll_event)) { seq->poll_event = atomic_read(&proc_poll_event); return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI; } return EPOLLIN | EPOLLRDNORM; } /* iterator */ static void *swap_start(struct seq_file *swap, loff_t *pos) { struct swap_info_struct *si; int type; loff_t l = *pos; mutex_lock(&swapon_mutex); if (!l) return SEQ_START_TOKEN; for (type = 0; (si = swap_type_to_info(type)); type++) { if (!(si->swap_file)) continue; if (!--l) return si; } return NULL; } static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) { struct swap_info_struct *si = v; int type; if (v == SEQ_START_TOKEN) type = 0; else type = si->type + 1; ++(*pos); for (; (si = swap_type_to_info(type)); type++) { if (!(si->swap_file)) continue; return si; } return NULL; } static void swap_stop(struct seq_file *swap, void *v) { mutex_unlock(&swapon_mutex); } static int swap_show(struct seq_file *swap, void *v) { struct swap_info_struct *si = v; struct file *file; int len; unsigned long bytes, inuse; if (si == SEQ_START_TOKEN) { seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); return 0; } bytes = K(si->pages); inuse = K(swap_usage_in_pages(si)); file = si->swap_file; len = seq_file_path(swap, file, " \t\n\\"); seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n", len < 40 ? 40 - len : 1, " ", S_ISBLK(file_inode(file)->i_mode) ? "partition" : "file\t", bytes, bytes < 10000000 ? "\t" : "", inuse, inuse < 10000000 ? "\t" : "", si->prio); return 0; } static const struct seq_operations swaps_op = { .start = swap_start, .next = swap_next, .stop = swap_stop, .show = swap_show }; static int swaps_open(struct inode *inode, struct file *file) { struct seq_file *seq; int ret; ret = seq_open(file, &swaps_op); if (ret) return ret; seq = file->private_data; seq->poll_event = atomic_read(&proc_poll_event); return 0; } static const struct proc_ops swaps_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = swaps_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = seq_release, .proc_poll = swaps_poll, }; static int __init procswaps_init(void) { proc_create("swaps", 0, NULL, &swaps_proc_ops); return 0; } __initcall(procswaps_init); #endif /* CONFIG_PROC_FS */ #ifdef MAX_SWAPFILES_CHECK static int __init max_swapfiles_check(void) { MAX_SWAPFILES_CHECK(); return 0; } late_initcall(max_swapfiles_check); #endif static struct swap_info_struct *alloc_swap_info(void) { struct swap_info_struct *p; struct swap_info_struct *defer = NULL; unsigned int type; p = kvzalloc_obj(struct swap_info_struct); if (!p) return ERR_PTR(-ENOMEM); if (percpu_ref_init(&p->users, swap_users_ref_free, PERCPU_REF_INIT_DEAD, GFP_KERNEL)) { kvfree(p); return ERR_PTR(-ENOMEM); } spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { if (!(swap_info[type]->flags & SWP_USED)) break; } if (type >= MAX_SWAPFILES) { spin_unlock(&swap_lock); percpu_ref_exit(&p->users); kvfree(p); return ERR_PTR(-EPERM); } if (type >= nr_swapfiles) { p->type = type; /* * Publish the swap_info_struct after initializing it. * Note that kvzalloc() above zeroes all its fields. 
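* The release pairing ensures lockless readers of swap_info[] never observe
* a partially initialized entry.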
*/ smp_store_release(&swap_info[type], p); /* rcu_assign_pointer() */ nr_swapfiles++; } else { defer = p; p = swap_info[type]; /* * Do not memset this entry: a racing procfs swap_next() * would be relying on p->type to remain valid. */ } p->swap_extent_root = RB_ROOT; plist_node_init(&p->list, 0); plist_node_init(&p->avail_list, 0); p->flags = SWP_USED; spin_unlock(&swap_lock); if (defer) { percpu_ref_exit(&defer->users); kvfree(defer); } spin_lock_init(&p->lock); atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT); init_completion(&p->comp); return p; } static int claim_swapfile(struct swap_info_struct *si, struct inode *inode) { if (S_ISBLK(inode->i_mode)) { si->bdev = I_BDEV(inode); /* * Zoned block devices contain zones that have a sequential * write only restriction. Hence zoned block devices are not * suitable for swapping. Disallow them here. */ if (bdev_is_zoned(si->bdev)) return -EINVAL; si->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { si->bdev = inode->i_sb->s_bdev; } return 0; } /* * Find out how many pages are allowed for a single swap device. There * are two limiting factors: * 1) the number of bits for the swap offset in the swp_entry_t type, and * 2) the number of bits in the swap pte, as defined by the different * architectures. * * In order to find the largest possible bit mask, a swap entry with * swap type 0 and swap offset ~0UL is created, encoded to a swap pte, * decoded to a swp_entry_t again, and finally the swap offset is * extracted. * * This will mask all the bits from the initial ~0UL mask that can't * be encoded in either the swp_entry_t or the architecture definition * of a swap pte. */ unsigned long generic_max_swapfile_size(void) { swp_entry_t entry = swp_entry(0, ~0UL); const pte_t pte = softleaf_to_pte(entry); /* * Since the PTE can be an invalid softleaf entry (e.g. the none PTE), * we need to do this manually. */ entry = __pte_to_swp_entry(pte); entry = swp_entry(__swp_type(entry), __swp_offset(entry)); return swp_offset(entry) + 1; } /* Can be overridden by an architecture for additional checks. */ __weak unsigned long arch_max_swapfile_size(void) { return generic_max_swapfile_size(); } static unsigned long read_swap_header(struct swap_info_struct *si, union swap_header *swap_header, struct inode *inode) { int i; unsigned long maxpages; unsigned long swapfilepages; unsigned long last_page; if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { pr_err("Unable to find swap-space signature\n"); return 0; } /* swap partition endianness hack... 
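* A header written with the opposite endianness is detected by its
* byte-swapped version field and converted in place below.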
*/ if (swab32(swap_header->info.version) == 1) { swab32s(&swap_header->info.version); swab32s(&swap_header->info.last_page); swab32s(&swap_header->info.nr_badpages); if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) return 0; for (i = 0; i < swap_header->info.nr_badpages; i++) swab32s(&swap_header->info.badpages[i]); } /* Check the swap header's sub-version */ if (swap_header->info.version != 1) { pr_warn("Unable to handle swap header version %d\n", swap_header->info.version); return 0; } maxpages = swapfile_maximum_size; last_page = swap_header->info.last_page; if (!last_page) { pr_warn("Empty swap-file\n"); return 0; } if (last_page > maxpages) { pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", K(maxpages), K(last_page)); } if (maxpages > last_page) { maxpages = last_page + 1; /* p->max is an unsigned int: don't overflow it */ if ((unsigned int)maxpages == 0) maxpages = UINT_MAX; } if (!maxpages) return 0; swapfilepages = i_size_read(inode) >> PAGE_SHIFT; if (swapfilepages && maxpages > swapfilepages) { pr_warn("Swap area shorter than signature indicates\n"); return 0; } if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) return 0; if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) return 0; return maxpages; } static int setup_swap_clusters_info(struct swap_info_struct *si, union swap_header *swap_header, unsigned long maxpages) { unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); struct swap_cluster_info *cluster_info; int err = -ENOMEM; unsigned long i; cluster_info = kvzalloc_objs(*cluster_info, nr_clusters); if (!cluster_info) goto err; for (i = 0; i < nr_clusters; i++) spin_lock_init(&cluster_info[i].lock); if (!(si->flags & SWP_SOLIDSTATE)) { si->global_cluster = kmalloc_obj(*si->global_cluster); if (!si->global_cluster) goto err; for (i = 0; i < SWAP_NR_ORDERS; i++) si->global_cluster->next[i] = SWAP_ENTRY_INVALID; spin_lock_init(&si->global_cluster_lock); } /* * Mark unusable pages (header page, bad pages, and the EOF part of * the last cluster) as unavailable. The clusters aren't marked free * yet, so no list operations are involved yet. 
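* Each cluster is then put on the proper list (free or nonfull) by the
* final loop below, based on its resulting count.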
*/ err = swap_cluster_setup_bad_slot(si, cluster_info, 0, false); if (err) goto err; for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; if (!page_nr || page_nr > swap_header->info.last_page) { pr_warn("Bad slot offset is out of border: %d (last_page: %d)\n", page_nr, swap_header->info.last_page); err = -EINVAL; goto err; } err = swap_cluster_setup_bad_slot(si, cluster_info, page_nr, false); if (err) goto err; } for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) { err = swap_cluster_setup_bad_slot(si, cluster_info, i, true); if (err) goto err; } INIT_LIST_HEAD(&si->free_clusters); INIT_LIST_HEAD(&si->full_clusters); INIT_LIST_HEAD(&si->discard_clusters); for (i = 0; i < SWAP_NR_ORDERS; i++) { INIT_LIST_HEAD(&si->nonfull_clusters[i]); INIT_LIST_HEAD(&si->frag_clusters[i]); } for (i = 0; i < nr_clusters; i++) { struct swap_cluster_info *ci = &cluster_info[i]; if (ci->count) { ci->flags = CLUSTER_FLAG_NONFULL; list_add_tail(&ci->list, &si->nonfull_clusters[0]); } else { ci->flags = CLUSTER_FLAG_FREE; list_add_tail(&ci->list, &si->free_clusters); } } si->cluster_info = cluster_info; return 0; err: free_swap_cluster_info(cluster_info, maxpages); return err; } SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct *si; struct file *swap_file = NULL; struct address_space *mapping; struct dentry *dentry; int prio; int error; union swap_header *swap_header; int nr_extents; sector_t span; unsigned long maxpages; struct folio *folio = NULL; struct inode *inode = NULL; bool inced_nr_rotate_swap = false; if (swap_flags & ~SWAP_FLAGS_VALID) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; /* * Allocate or reuse existing !SWP_USED swap_info. The returned * si will stay in a dying status, so nothing will access its content * until enable_swap_info resurrects its percpu ref and expose it. */ si = alloc_swap_info(); if (IS_ERR(si)) return PTR_ERR(si); INIT_WORK(&si->discard_work, swap_discard_work); INIT_WORK(&si->reclaim_work, swap_reclaim_work); CLASS(filename, name)(specialfile); swap_file = file_open_name(name, O_RDWR | O_LARGEFILE | O_EXCL, 0); if (IS_ERR(swap_file)) { error = PTR_ERR(swap_file); swap_file = NULL; goto bad_swap; } mapping = swap_file->f_mapping; dentry = swap_file->f_path.dentry; inode = mapping->host; error = claim_swapfile(si, inode); if (unlikely(error)) goto bad_swap; inode_lock(inode); if (d_unlinked(dentry) || cant_mount(dentry)) { error = -ENOENT; goto bad_swap_unlock_inode; } if (IS_SWAPFILE(inode)) { error = -EBUSY; goto bad_swap_unlock_inode; } /* * The swap subsystem needs a major overhaul to support this. * It doesn't work yet so just disable it for now. */ if (mapping_min_folio_order(mapping) > 0) { error = -EINVAL; goto bad_swap_unlock_inode; } /* * Read the swap header. 
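 * read_mapping_folio() pulls in the first folio of the swap file, and
 * kmap_local_folio() maps it so read_swap_header() can validate the
 * signature and size fields; because the header occupies slot 0, that slot
 * is never allocated and is later marked unusable in
 * setup_swap_clusters_info().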
*/ if (!mapping->a_ops->read_folio) { error = -EINVAL; goto bad_swap_unlock_inode; } folio = read_mapping_folio(mapping, 0, swap_file); if (IS_ERR(folio)) { error = PTR_ERR(folio); goto bad_swap_unlock_inode; } swap_header = kmap_local_folio(folio, 0); maxpages = read_swap_header(si, swap_header, inode); if (unlikely(!maxpages)) { error = -EINVAL; goto bad_swap_unlock_inode; } si->max = maxpages; si->pages = maxpages - 1; nr_extents = setup_swap_extents(si, swap_file, &span); if (nr_extents < 0) { error = nr_extents; goto bad_swap_unlock_inode; } if (si->pages != si->max - 1) { pr_err("swap:%u != (max:%u - 1)\n", si->pages, si->max); error = -EINVAL; goto bad_swap_unlock_inode; } maxpages = si->max; /* Set up the swap cluster info */ error = setup_swap_clusters_info(si, swap_header, maxpages); if (error) goto bad_swap_unlock_inode; error = swap_cgroup_swapon(si->type, maxpages); if (error) goto bad_swap_unlock_inode; /* * Use kvmalloc_array instead of bitmap_zalloc as the allocation order might * be above MAX_PAGE_ORDER incase of a large swap file. */ si->zeromap = kvmalloc_array(BITS_TO_LONGS(maxpages), sizeof(long), GFP_KERNEL | __GFP_ZERO); if (!si->zeromap) { error = -ENOMEM; goto bad_swap_unlock_inode; } if (si->bdev && bdev_stable_writes(si->bdev)) si->flags |= SWP_STABLE_WRITES; if (si->bdev && bdev_synchronous(si->bdev)) si->flags |= SWP_SYNCHRONOUS_IO; if (si->bdev && !bdev_rot(si->bdev)) { si->flags |= SWP_SOLIDSTATE; } else { atomic_inc(&nr_rotate_swap); inced_nr_rotate_swap = true; } if ((swap_flags & SWAP_FLAG_DISCARD) && si->bdev && bdev_max_discard_sectors(si->bdev)) { /* * When discard is enabled for swap with no particular * policy flagged, we set all swap discard flags here in * order to sustain backward compatibility with older * swapon(8) releases. */ si->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | SWP_PAGE_DISCARD); /* * By flagging sys_swapon, a sysadmin can tell us to * either do single-time area discards only, or to just * perform discards for released swap page-clusters. * Now it's time to adjust the p->flags accordingly. */ if (swap_flags & SWAP_FLAG_DISCARD_ONCE) si->flags &= ~SWP_PAGE_DISCARD; else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) si->flags &= ~SWP_AREA_DISCARD; /* issue a swapon-time discard if it's still required */ if (si->flags & SWP_AREA_DISCARD) { int err = discard_swap(si); if (unlikely(err)) pr_err("swapon: discard_swap(%p): %d\n", si, err); } } error = zswap_swapon(si->type, maxpages); if (error) goto bad_swap_unlock_inode; /* * Flush any pending IO and dirty mappings before we start using this * swap device. */ inode->i_flags |= S_SWAPFILE; error = inode_drain_writes(inode); if (error) { inode->i_flags &= ~S_SWAPFILE; goto free_swap_zswap; } mutex_lock(&swapon_mutex); prio = DEF_SWAP_PRIO; if (swap_flags & SWAP_FLAG_PREFER) prio = swap_flags & SWAP_FLAG_PRIO_MASK; /* * The plist prio is negated because plist ordering is * low-to-high, while swap ordering is high-to-low */ si->prio = prio; si->list.prio = -si->prio; si->avail_list.prio = -si->prio; si->swap_file = swap_file; /* Sets SWP_WRITEOK, resurrect the percpu ref, expose the swap device */ enable_swap_info(si); pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n", K(si->pages), name->name, si->prio, nr_extents, K((unsigned long long)span), (si->flags & SWP_SOLIDSTATE) ? "SS" : "", (si->flags & SWP_DISCARDABLE) ? "D" : "", (si->flags & SWP_AREA_DISCARD) ? "s" : "", (si->flags & SWP_PAGE_DISCARD) ? 
"c" : ""); mutex_unlock(&swapon_mutex); atomic_inc(&proc_poll_event); wake_up_interruptible(&proc_poll_wait); error = 0; goto out; free_swap_zswap: zswap_swapoff(si->type); bad_swap_unlock_inode: inode_unlock(inode); bad_swap: kfree(si->global_cluster); si->global_cluster = NULL; inode = NULL; destroy_swap_extents(si, swap_file); swap_cgroup_swapoff(si->type); free_swap_cluster_info(si->cluster_info, si->max); si->cluster_info = NULL; kvfree(si->zeromap); si->zeromap = NULL; /* * Clear the SWP_USED flag after all resources are freed so * alloc_swap_info can reuse this si safely. */ spin_lock(&swap_lock); si->flags = 0; spin_unlock(&swap_lock); if (inced_nr_rotate_swap) atomic_dec(&nr_rotate_swap); if (swap_file) filp_close(swap_file, NULL); out: if (!IS_ERR_OR_NULL(folio)) folio_release_kmap(folio, swap_header); if (inode) inode_unlock(inode); return error; } void si_swapinfo(struct sysinfo *val) { unsigned int type; unsigned long nr_to_be_unused = 0; spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { struct swap_info_struct *si = swap_info[type]; if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) nr_to_be_unused += swap_usage_in_pages(si); } val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; val->totalswap = total_swap_pages + nr_to_be_unused; spin_unlock(&swap_lock); } /* * swap_dup_entry_direct() - Increase reference count of a swap entry by one. * @entry: first swap entry from which we want to increase the refcount. * * Returns 0 for success, or -ENOMEM if the extend table is required * but could not be atomically allocated. Returns -EINVAL if the swap * entry is invalid, which might occur if a page table entry has got * corrupted. * * Context: Caller must ensure there is no race condition on the reference * owner. e.g., locking the PTL of a PTE containing the entry being increased. * Also the swap entry must have a count >= 1. Otherwise folio_dup_swap should * be used. */ int swap_dup_entry_direct(swp_entry_t entry) { struct swap_info_struct *si; si = swap_entry_to_info(entry); if (WARN_ON_ONCE(!si)) { pr_err("%s%08lx\n", Bad_file, entry.val); return -EINVAL; } /* * The caller must be increasing the swap count from a direct * reference of the swap slot (e.g. a swap entry in page table). * So the swap count must be >= 1. */ VM_WARN_ON_ONCE(!swap_entry_swapped(si, entry)); return swap_dup_entries_cluster(si, swp_offset(entry), 1); } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) static bool __has_usable_swap(void) { return !plist_head_empty(&swap_active_head); } void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { struct swap_info_struct *si; if (!(gfp & __GFP_IO)) return; if (!__has_usable_swap()) return; if (!blk_cgroup_congested()) return; /* * We've already scheduled a throttle, avoid taking the global swap * lock. */ if (current->throttle_disk) return; spin_lock(&swap_avail_lock); plist_for_each_entry(si, &swap_avail_head, avail_list) { if (si->bdev) { blkcg_schedule_throttle(si->bdev->bd_disk, true); break; } } spin_unlock(&swap_avail_lock); } #endif static int __init swapfile_init(void) { swapfile_maximum_size = arch_max_swapfile_size(); /* * Once a cluster is freed, it's swap table content is read * only, and all swap cache readers (swap_cache_*) verifies * the content before use. So it's safe to use RCU slab here. 
 */
	if (!SWP_TABLE_USE_PAGE)
		swap_table_cachep = kmem_cache_create("swap_table",
						      sizeof(struct swap_table),
						      0, SLAB_PANIC | SLAB_TYPESAFE_BY_RCU,
						      NULL);
#ifdef CONFIG_MIGRATION
	if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS))
		swap_migration_ad_supported = true;
#endif	/* CONFIG_MIGRATION */
	return 0;
}
subsys_initcall(swapfile_init);
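/*
 * For reference, the swapon(2) path above is normally driven from userspace
 * by swapon(8), roughly:
 *
 *	mkswap /dev/vdb1	# write the "SWAPSPACE2" signature page
 *	swapon -p 10 /dev/vdb1	# ~ swapon("/dev/vdb1", SWAP_FLAG_PREFER | 10)
 *
 * (device name purely illustrative); the requested priority travels in the
 * SWAP_FLAG_PRIO_MASK bits of swap_flags and is decoded just before
 * enable_swap_info() exposes the device.
 */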
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * fs/eventpoll.c (Efficient event retrieval implementation)
 * Copyright (C) 2001,...,2009 Davide Libenzi
 *
 * Davide Libenzi <davidel@xmailserver.org>
 */

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/signal.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/spinlock.h>
#include <linux/syscalls.h>
#include <linux/rbtree.h>
#include <linux/wait.h>
#include <linux/eventpoll.h>
#include <linux/mount.h>
#include <linux/bitops.h>
#include <linux/mutex.h>
#include <linux/anon_inodes.h>
#include <linux/device.h>
#include <linux/uaccess.h>
#include <asm/io.h>
#include <asm/mman.h>
#include <linux/atomic.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/compat.h>
#include <linux/rculist.h>
#include <linux/capability.h>
#include <net/busy_poll.h>

/*
 * LOCKING:
 * There are three levels of locking required by epoll:
 *
 * 1) epnested_mutex (mutex)
 * 2) ep->mtx (mutex)
 * 3) ep->lock (spinlock)
 *
 * The acquire order is the one listed above, from 1 to 3.
 * We need a spinlock (ep->lock) because we manipulate objects
 * from inside the poll callback, that might be triggered from
 * a wake_up() that in turn might be called from IRQ context.
 * So we can't sleep inside the poll callback and hence we need
 * a spinlock. During the event transfer loop (from kernel to
 * user space) we could end up sleeping due to a copy_to_user(), so
 * we need a lock that will allow us to sleep. This lock is a
 * mutex (ep->mtx). It is acquired during the event transfer loop,
 * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
 * The epnested_mutex is acquired when inserting an epoll fd onto another
 * epoll fd. We do this so that we walk the epoll tree and ensure that this
 * insertion does not create a cycle of epoll file descriptors, which
 * could lead to deadlock.
We need a global mutex to prevent two * simultaneous inserts (A into B and B into A) from racing and * constructing a cycle without either insert observing that it is * going to. * It is necessary to acquire multiple "ep->mtx"es at once in the * case when one epoll fd is added to another. In this case, we * always acquire the locks in the order of nesting (i.e. after * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired * before e2->mtx). Since we disallow cycles of epoll file * descriptors, this ensures that the mutexes are well-ordered. In * order to communicate this nesting to lockdep, when walking a tree * of epoll file descriptors, we use the current recursion depth as * the lockdep subkey. * It is possible to drop the "ep->mtx" and to use the global * mutex "epnested_mutex" (together with "ep->lock") to have it working, * but having "ep->mtx" will make the interface more scalable. * Events that require holding "epnested_mutex" are very rare, while for * normal operations the epoll private "ep->mtx" will guarantee * a better scalability. */ /* Epoll private bits inside the event mask */ #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE) #define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT) #define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \ EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE) /* Maximum number of nesting allowed inside epoll sets */ #define EP_MAX_NESTS 4 #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) #define EP_UNACTIVE_PTR ((void *) -1L) #define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry)) struct epoll_filefd { struct file *file; int fd; } __packed; /* Wait structure used by the poll hooks */ struct eppoll_entry { /* List header used to link this structure to the "struct epitem" */ struct eppoll_entry *next; /* The "base" pointer is set to the container "struct epitem" */ struct epitem *base; /* * Wait queue item that will be linked to the target file wait * queue head. */ wait_queue_entry_t wait; /* The wait queue head that linked the "wait" wait queue item */ wait_queue_head_t *whead; }; /* * Each file descriptor added to the eventpoll interface will * have an entry of this type linked to the "rbr" RB tree. * Avoid increasing the size of this struct, there can be many thousands * of these on a server and we do not want this to take another cache line. */ struct epitem { union { /* RB tree node links this structure to the eventpoll RB tree */ struct rb_node rbn; /* Used to free the struct epitem */ struct rcu_head rcu; }; /* List header used to link this structure to the eventpoll ready list */ struct list_head rdllink; /* * Works together "struct eventpoll"->ovflist in keeping the * single linked chain of items. 
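 * A value of EP_UNACTIVE_PTR means the item is not currently chained:
 * ep_poll_callback() only links an item here while ep_start_scan() has
 * temporarily redirected new arrivals away from rdllist, and ep_done_scan()
 * later puts the chained items back on rdllist and resets next.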
*/ struct epitem *next; /* The file descriptor information this item refers to */ struct epoll_filefd ffd; /* * Protected by file->f_lock, true for to-be-released epitem already * removed from the "struct file" items list; together with * eventpoll->refcount orchestrates "struct eventpoll" disposal */ bool dying; /* List containing poll wait queues */ struct eppoll_entry *pwqlist; /* The "container" of this item */ struct eventpoll *ep; /* List header used to link this item to the "struct file" items list */ struct hlist_node fllink; /* wakeup_source used when EPOLLWAKEUP is set */ struct wakeup_source __rcu *ws; /* The structure that describe the interested events and the source fd */ struct epoll_event event; }; /* * This structure is stored inside the "private_data" member of the file * structure and represents the main data structure for the eventpoll * interface. */ struct eventpoll { /* * This mutex is used to ensure that files are not removed * while epoll is using them. This is held during the event * collection loop, the file cleanup path, the epoll file exit * code and the ctl operations. */ struct mutex mtx; /* Wait queue used by sys_epoll_wait() */ wait_queue_head_t wq; /* Wait queue used by file->poll() */ wait_queue_head_t poll_wait; /* List of ready file descriptors */ struct list_head rdllist; /* Lock which protects rdllist and ovflist */ spinlock_t lock; /* RB tree root used to store monitored fd structs */ struct rb_root_cached rbr; /* * This is a single linked list that chains all the "struct epitem" that * happened while transferring ready events to userspace w/out * holding ->lock. */ struct epitem *ovflist; /* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */ struct wakeup_source *ws; /* The user that created the eventpoll descriptor */ struct user_struct *user; struct file *file; /* used to optimize loop detection check */ u64 gen; struct hlist_head refs; u8 loop_check_depth; /* * usage count, used together with epitem->dying to * orchestrate the disposal of this struct */ refcount_t refcount; /* used to defer freeing past ep_get_upwards_depth_proc() RCU walk */ struct rcu_head rcu; #ifdef CONFIG_NET_RX_BUSY_POLL /* used to track busy poll napi_id */ unsigned int napi_id; /* busy poll timeout */ u32 busy_poll_usecs; /* busy poll packet budget */ u16 busy_poll_budget; bool prefer_busy_poll; #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC /* tracks wakeup nests for lockdep validation */ u8 nests; #endif }; /* Wrapper struct used by poll queueing */ struct ep_pqueue { poll_table pt; struct epitem *epi; }; /* * Configuration options available inside /proc/sys/fs/epoll/ */ /* Maximum number of epoll watched descriptors, per user */ static long max_user_watches __read_mostly; /* Used for cycles detection */ static DEFINE_MUTEX(epnested_mutex); static u64 loop_check_gen = 0; /* Used to check for epoll file descriptor inclusion loops */ static struct eventpoll *inserting_into; /* Slab cache used to allocate "struct epitem" */ static struct kmem_cache *epi_cache __ro_after_init; /* Slab cache used to allocate "struct eppoll_entry" */ static struct kmem_cache *pwq_cache __ro_after_init; /* * List of files with newly added links, where we may need to limit the number * of emanating paths. Protected by the epnested_mutex. 
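 * The list is singly linked through epitems_head->next and terminated by
 * EP_UNACTIVE_PTR; list_file() pushes a file's epitems_head onto it during
 * a full-check EPOLL_CTL_ADD and reverse_path_check() walks it afterwards.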
*/ struct epitems_head { struct hlist_head epitems; struct epitems_head *next; }; static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR; static struct kmem_cache *ephead_cache __ro_after_init; static inline void free_ephead(struct epitems_head *head) { if (head) kmem_cache_free(ephead_cache, head); } static void list_file(struct file *file) { struct epitems_head *head; head = container_of(file->f_ep, struct epitems_head, epitems); if (!head->next) { head->next = tfile_check_list; tfile_check_list = head; } } static void unlist_file(struct epitems_head *head) { struct epitems_head *to_free = head; struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems)); if (p) { struct epitem *epi= container_of(p, struct epitem, fllink); spin_lock(&epi->ffd.file->f_lock); if (!hlist_empty(&head->epitems)) to_free = NULL; head->next = NULL; spin_unlock(&epi->ffd.file->f_lock); } free_ephead(to_free); } #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> static long long_zero; static long long_max = LONG_MAX; static const struct ctl_table epoll_table[] = { { .procname = "max_user_watches", .data = &max_user_watches, .maxlen = sizeof(max_user_watches), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra1 = &long_zero, .extra2 = &long_max, }, }; static void __init epoll_sysctls_init(void) { register_sysctl("fs/epoll", epoll_table); } #else #define epoll_sysctls_init() do { } while (0) #endif /* CONFIG_SYSCTL */ static const struct file_operations eventpoll_fops; static inline int is_file_epoll(struct file *f) { return f->f_op == &eventpoll_fops; } /* Setup the structure that is used as key for the RB tree */ static inline void ep_set_ffd(struct epoll_filefd *ffd, struct file *file, int fd) { ffd->file = file; ffd->fd = fd; } /* Compare RB tree keys */ static inline int ep_cmp_ffd(struct epoll_filefd *p1, struct epoll_filefd *p2) { return (p1->file > p2->file ? +1: (p1->file < p2->file ? -1 : p1->fd - p2->fd)); } /* Tells us if the item is currently linked */ static inline int ep_is_linked(struct epitem *epi) { return !list_empty(&epi->rdllink); } static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait); } /* Get the "struct epitem" from a wait queue pointer */ static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait)->base; } /** * ep_events_available - Checks if ready events might be available. * * @ep: Pointer to the eventpoll context. * * Return: a value different than %zero if ready events are available, * or %zero otherwise. */ static inline int ep_events_available(struct eventpoll *ep) { return !list_empty_careful(&ep->rdllist) || READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR; } #ifdef CONFIG_NET_RX_BUSY_POLL /** * busy_loop_ep_timeout - check if busy poll has timed out. The timeout value * from the epoll instance ep is preferred, but if it is not set fallback to * the system-wide global via busy_loop_timeout. * * @start_time: The start time used to compute the remaining time until timeout. * @ep: Pointer to the eventpoll context. * * Return: true if the timeout has expired, false otherwise. 
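 *
 * In other words, a non-zero per-instance busy_poll_usecs (settable through
 * the EPIOCSPARAMS ioctl below) takes precedence; only when it is zero does
 * the system-wide busy-poll timeout apply.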
*/ static bool busy_loop_ep_timeout(unsigned long start_time, struct eventpoll *ep) { unsigned long bp_usec = READ_ONCE(ep->busy_poll_usecs); if (bp_usec) { unsigned long end_time = start_time + bp_usec; unsigned long now = busy_loop_current_time(); return time_after(now, end_time); } else { return busy_loop_timeout(start_time); } } static bool ep_busy_loop_on(struct eventpoll *ep) { return !!READ_ONCE(ep->busy_poll_usecs) || READ_ONCE(ep->prefer_busy_poll) || net_busy_loop_on(); } static bool ep_busy_loop_end(void *p, unsigned long start_time) { struct eventpoll *ep = p; return ep_events_available(ep) || busy_loop_ep_timeout(start_time, ep); } /* * Busy poll if globally on and supporting sockets found && no events, * busy loop will return if need_resched or ep_events_available. * * we must do our busy polling with irqs enabled */ static bool ep_busy_loop(struct eventpoll *ep) { unsigned int napi_id = READ_ONCE(ep->napi_id); u16 budget = READ_ONCE(ep->busy_poll_budget); bool prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll); if (!budget) budget = BUSY_POLL_BUDGET; if (napi_id_valid(napi_id) && ep_busy_loop_on(ep)) { napi_busy_loop(napi_id, ep_busy_loop_end, ep, prefer_busy_poll, budget); if (ep_events_available(ep)) return true; /* * Busy poll timed out. Drop NAPI ID for now, we can add * it back in when we have moved a socket with a valid NAPI * ID onto the ready list. */ if (prefer_busy_poll) napi_resume_irqs(napi_id); ep->napi_id = 0; return false; } return false; } /* * Set epoll busy poll NAPI ID from sk. */ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) { struct eventpoll *ep = epi->ep; unsigned int napi_id; struct socket *sock; struct sock *sk; if (!ep_busy_loop_on(ep)) return; sock = sock_from_file(epi->ffd.file); if (!sock) return; sk = sock->sk; if (!sk) return; napi_id = READ_ONCE(sk->sk_napi_id); /* Non-NAPI IDs can be rejected * or * Nothing to do if we already have this ID */ if (!napi_id_valid(napi_id) || napi_id == ep->napi_id) return; /* record NAPI ID for use in next busy poll */ ep->napi_id = napi_id; } static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct eventpoll *ep = file->private_data; void __user *uarg = (void __user *)arg; struct epoll_params epoll_params; switch (cmd) { case EPIOCSPARAMS: if (copy_from_user(&epoll_params, uarg, sizeof(epoll_params))) return -EFAULT; /* pad byte must be zero */ if (epoll_params.__pad) return -EINVAL; if (epoll_params.busy_poll_usecs > S32_MAX) return -EINVAL; if (epoll_params.prefer_busy_poll > 1) return -EINVAL; if (epoll_params.busy_poll_budget > NAPI_POLL_WEIGHT && !capable(CAP_NET_ADMIN)) return -EPERM; WRITE_ONCE(ep->busy_poll_usecs, epoll_params.busy_poll_usecs); WRITE_ONCE(ep->busy_poll_budget, epoll_params.busy_poll_budget); WRITE_ONCE(ep->prefer_busy_poll, epoll_params.prefer_busy_poll); return 0; case EPIOCGPARAMS: memset(&epoll_params, 0, sizeof(epoll_params)); epoll_params.busy_poll_usecs = READ_ONCE(ep->busy_poll_usecs); epoll_params.busy_poll_budget = READ_ONCE(ep->busy_poll_budget); epoll_params.prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll); if (copy_to_user(uarg, &epoll_params, sizeof(epoll_params))) return -EFAULT; return 0; default: return -ENOIOCTLCMD; } } static void ep_suspend_napi_irqs(struct eventpoll *ep) { unsigned int napi_id = READ_ONCE(ep->napi_id); if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll)) napi_suspend_irqs(napi_id); } static void ep_resume_napi_irqs(struct eventpoll *ep) { unsigned int napi_id = 
READ_ONCE(ep->napi_id); if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll)) napi_resume_irqs(napi_id); } #else static inline bool ep_busy_loop(struct eventpoll *ep) { return false; } static inline void ep_set_busy_poll_napi_id(struct epitem *epi) { } static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return -EOPNOTSUPP; } static void ep_suspend_napi_irqs(struct eventpoll *ep) { } static void ep_resume_napi_irqs(struct eventpoll *ep) { } #endif /* CONFIG_NET_RX_BUSY_POLL */ /* * As described in commit 0ccf831cb lockdep: annotate epoll * the use of wait queues used by epoll is done in a very controlled * manner. Wake ups can nest inside each other, but are never done * with the same locking. For example: * * dfd = socket(...); * efd1 = epoll_create(); * efd2 = epoll_create(); * epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...); * epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...); * * When a packet arrives to the device underneath "dfd", the net code will * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a * callback wakeup entry on that queue, and the wake_up() performed by the * "dfd" net code will end up in ep_poll_callback(). At this point epoll * (efd1) notices that it may have some event ready, so it needs to wake up * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake() * that ends up in another wake_up(), after having checked about the * recursion constraints. That are, no more than EP_MAX_NESTS, to avoid * stack blasting. * * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle * this special case of epoll. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, unsigned pollflags) { struct eventpoll *ep_src; unsigned long flags; u8 nests = 0; /* * To set the subclass or nesting level for spin_lock_irqsave_nested() * it might be natural to create a per-cpu nest count. However, since * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can * schedule() in the -rt kernel, the per-cpu variable are no longer * protected. Thus, we are introducing a per eventpoll nest field. * If we are not being call from ep_poll_callback(), epi is NULL and * we are at the first level of nesting, 0. Otherwise, we are being * called from ep_poll_callback() and if a previous wakeup source is * not an epoll file itself, we are at depth 1 since the wakeup source * is depth 0. If the wakeup source is a previous epoll file in the * wakeup chain then we use its nests value and record ours as * nests + 1. The previous epoll file nests value is stable since its * already holding its own poll_wait.lock. */ if (epi) { if ((is_file_epoll(epi->ffd.file))) { ep_src = epi->ffd.file->private_data; nests = ep_src->nests; } else { nests = 1; } } spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests); ep->nests = nests + 1; wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags); ep->nests = 0; spin_unlock_irqrestore(&ep->poll_wait.lock, flags); } #else static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, __poll_t pollflags) { wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags); } #endif static void ep_remove_wait_queue(struct eppoll_entry *pwq) { wait_queue_head_t *whead; rcu_read_lock(); /* * If it is cleared by POLLFREE, it should be rcu-safe. * If we read NULL we need a barrier paired with * smp_store_release() in ep_poll_callback(), otherwise * we rely on whead->lock. 
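 * The store side is the POLLFREE branch at the end of ep_poll_callback(),
 * which publishes pwq->whead = NULL with smp_store_release() after taking
 * the wait entry off the queue; observing a non-NULL whead here therefore
 * means the wait queue head is still safe to lock and remove from.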
*/ whead = smp_load_acquire(&pwq->whead); if (whead) remove_wait_queue(whead, &pwq->wait); rcu_read_unlock(); } /* * This function unregisters poll callbacks from the associated file * descriptor. Must be called with "mtx" held. */ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) { struct eppoll_entry **p = &epi->pwqlist; struct eppoll_entry *pwq; while ((pwq = *p) != NULL) { *p = pwq->next; ep_remove_wait_queue(pwq); kmem_cache_free(pwq_cache, pwq); } } /* call only when ep->mtx is held */ static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi) { return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx)); } /* call only when ep->mtx is held */ static inline void ep_pm_stay_awake(struct epitem *epi) { struct wakeup_source *ws = ep_wakeup_source(epi); if (ws) __pm_stay_awake(ws); } static inline bool ep_has_wakeup_source(struct epitem *epi) { return rcu_access_pointer(epi->ws) ? true : false; } /* call when ep->mtx cannot be held (ep_poll_callback) */ static inline void ep_pm_stay_awake_rcu(struct epitem *epi) { struct wakeup_source *ws; rcu_read_lock(); ws = rcu_dereference(epi->ws); if (ws) __pm_stay_awake(ws); rcu_read_unlock(); } /* * ep->mutex needs to be held because we could be hit by * eventpoll_release_file() and epoll_ctl(). */ static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist) { /* * Steal the ready list, and re-init the original one to the * empty list. Also, set ep->ovflist to NULL so that events * happening while looping w/out locks, are not lost. We cannot * have the poll callback to queue directly on ep->rdllist, * because we want the "sproc" callback to be able to do it * in a lockless way. */ lockdep_assert_irqs_enabled(); spin_lock_irq(&ep->lock); list_splice_init(&ep->rdllist, txlist); WRITE_ONCE(ep->ovflist, NULL); spin_unlock_irq(&ep->lock); } static void ep_done_scan(struct eventpoll *ep, struct list_head *txlist) { struct epitem *epi, *nepi; spin_lock_irq(&ep->lock); /* * During the time we spent inside the "sproc" callback, some * other events might have been queued by the poll callback. * We re-insert them inside the main ready-list here. */ for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL; nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { /* * We need to check if the item is already in the list. * During the "sproc" callback execution time, items are * queued into ->ovflist but the "txlist" might already * contain them, and the list_splice() below takes care of them. */ if (!ep_is_linked(epi)) { /* * ->ovflist is LIFO, so we have to reverse it in order * to keep in FIFO. */ list_add(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); } } /* * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after * releasing the lock, events will be queued in the normal way inside * ep->rdllist. */ WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR); /* * Quickly re-inject items left on "txlist". 
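 * Whatever the caller left on txlist (for instance ready items beyond what
 * fit into the user's event buffer) is spliced back onto rdllist so it is
 * reported by a later scan rather than lost.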
*/ list_splice(txlist, &ep->rdllist); __pm_relax(ep->ws); if (!list_empty(&ep->rdllist)) { if (waitqueue_active(&ep->wq)) wake_up(&ep->wq); } spin_unlock_irq(&ep->lock); } static void ep_get(struct eventpoll *ep) { refcount_inc(&ep->refcount); } /* * Returns true if the event poll can be disposed */ static bool ep_refcount_dec_and_test(struct eventpoll *ep) { if (!refcount_dec_and_test(&ep->refcount)) return false; WARN_ON_ONCE(!RB_EMPTY_ROOT(&ep->rbr.rb_root)); return true; } static void ep_free(struct eventpoll *ep) { ep_resume_napi_irqs(ep); mutex_destroy(&ep->mtx); free_uid(ep->user); wakeup_source_unregister(ep->ws); /* ep_get_upwards_depth_proc() may still hold epi->ep under RCU */ kfree_rcu(ep, rcu); } /* * Removes a "struct epitem" from the eventpoll RB tree and deallocates * all the associated resources. Must be called with "mtx" held. * If the dying flag is set, do the removal only if force is true. * This prevents ep_clear_and_put() from dropping all the ep references * while running concurrently with eventpoll_release_file(). * Returns true if the eventpoll can be disposed. */ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) { struct file *file = epi->ffd.file; struct epitems_head *to_free; struct hlist_head *head; lockdep_assert_irqs_enabled(); /* * Removes poll wait queue hooks. */ ep_unregister_pollwait(ep, epi); /* Remove the current item from the list of epoll hooks */ spin_lock(&file->f_lock); if (epi->dying && !force) { spin_unlock(&file->f_lock); return false; } to_free = NULL; head = file->f_ep; if (head->first == &epi->fllink && !epi->fllink.next) { /* See eventpoll_release() for details. */ WRITE_ONCE(file->f_ep, NULL); if (!is_file_epoll(file)) { struct epitems_head *v; v = container_of(head, struct epitems_head, epitems); if (!smp_load_acquire(&v->next)) to_free = v; } } hlist_del_rcu(&epi->fllink); spin_unlock(&file->f_lock); free_ephead(to_free); rb_erase_cached(&epi->rbn, &ep->rbr); spin_lock_irq(&ep->lock); if (ep_is_linked(epi)) list_del_init(&epi->rdllink); spin_unlock_irq(&ep->lock); wakeup_source_unregister(ep_wakeup_source(epi)); /* * At this point it is safe to free the eventpoll item. Use the union * field epi->rcu, since we are trying to minimize the size of * 'struct epitem'. The 'rbn' field is no longer in use. Protected by * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make * use of the rbn field. */ kfree_rcu(epi, rcu); percpu_counter_dec(&ep->user->epoll_watches); return true; } /* * ep_remove variant for callers owing an additional reference to the ep */ static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi) { if (__ep_remove(ep, epi, false)) WARN_ON_ONCE(ep_refcount_dec_and_test(ep)); } static void ep_clear_and_put(struct eventpoll *ep) { struct rb_node *rbp, *next; struct epitem *epi; /* We need to release all tasks waiting for these file */ if (waitqueue_active(&ep->poll_wait)) ep_poll_safewake(ep, NULL, 0); mutex_lock(&ep->mtx); /* * Walks through the whole tree by unregistering poll callbacks. */ for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); ep_unregister_pollwait(ep, epi); cond_resched(); } /* * Walks through the whole tree and try to free each "struct epitem". * Note that ep_remove_safe() will not remove the epitem in case of a * racing eventpoll_release_file(); the latter will do the removal. * At this point we are sure no poll callbacks will be lingering around. 
* Since we still own a reference to the eventpoll struct, the loop can't * dispose it. */ for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) { next = rb_next(rbp); epi = rb_entry(rbp, struct epitem, rbn); ep_remove_safe(ep, epi); cond_resched(); } mutex_unlock(&ep->mtx); if (ep_refcount_dec_and_test(ep)) ep_free(ep); } static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { int ret; if (!is_file_epoll(file)) return -EINVAL; switch (cmd) { case EPIOCSPARAMS: case EPIOCGPARAMS: ret = ep_eventpoll_bp_ioctl(file, cmd, arg); break; default: ret = -EINVAL; break; } return ret; } static int ep_eventpoll_release(struct inode *inode, struct file *file) { struct eventpoll *ep = file->private_data; if (ep) ep_clear_and_put(ep); return 0; } static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth); static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth) { struct eventpoll *ep = file->private_data; LIST_HEAD(txlist); struct epitem *epi, *tmp; poll_table pt; __poll_t res = 0; init_poll_funcptr(&pt, NULL); /* Insert inside our poll wait queue */ poll_wait(file, &ep->poll_wait, wait); /* * Proceed to find out if wanted events are really available inside * the ready list. */ mutex_lock_nested(&ep->mtx, depth); ep_start_scan(ep, &txlist); list_for_each_entry_safe(epi, tmp, &txlist, rdllink) { if (ep_item_poll(epi, &pt, depth + 1)) { res = EPOLLIN | EPOLLRDNORM; break; } else { /* * Item has been dropped into the ready list by the poll * callback, but it's not actually ready, as far as * caller requested events goes. We can remove it here. */ __pm_relax(ep_wakeup_source(epi)); list_del_init(&epi->rdllink); } } ep_done_scan(ep, &txlist); mutex_unlock(&ep->mtx); return res; } /* * The ffd.file pointer may be in the process of being torn down due to * being closed, but we may not have finished eventpoll_release() yet. * * Normally, even with the atomic_long_inc_not_zero, the file may have * been free'd and then gotten re-allocated to something else (since * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU). * * But for epoll, users hold the ep->mtx mutex, and as such any file in * the process of being free'd will block in eventpoll_release_file() * and thus the underlying file allocation will not be free'd, and the * file re-use cannot happen. * * For the same reason we can avoid a rcu_read_lock() around the * operation - 'ffd.file' cannot go away even if the refcount has * reached zero (but we must still not call out to ->poll() functions * etc). */ static struct file *epi_fget(const struct epitem *epi) { struct file *file; file = epi->ffd.file; if (!file_ref_get(&file->f_ref)) file = NULL; return file; } /* * Differs from ep_eventpoll_poll() in that internal callers already have * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested() * is correctly annotated. */ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth) { struct file *file = epi_fget(epi); __poll_t res; /* * We could return EPOLLERR | EPOLLHUP or something, but let's * treat this more as "file doesn't exist, poll didn't happen". 
*/ if (!file) return 0; pt->_key = epi->event.events; if (!is_file_epoll(file)) res = vfs_poll(file, pt); else res = __ep_eventpoll_poll(file, pt, depth); fput(file); return res & epi->event.events; } static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait) { return __ep_eventpoll_poll(file, wait, 0); } #ifdef CONFIG_PROC_FS static void ep_show_fdinfo(struct seq_file *m, struct file *f) { struct eventpoll *ep = f->private_data; struct rb_node *rbp; mutex_lock(&ep->mtx); for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { struct epitem *epi = rb_entry(rbp, struct epitem, rbn); struct inode *inode = file_inode(epi->ffd.file); seq_printf(m, "tfd: %8d events: %8x data: %16llx " " pos:%lli ino:%llx sdev:%x\n", epi->ffd.fd, epi->event.events, (long long)epi->event.data, (long long)epi->ffd.file->f_pos, inode->i_ino, inode->i_sb->s_dev); if (seq_has_overflowed(m)) break; } mutex_unlock(&ep->mtx); } #endif /* File callbacks that implement the eventpoll file behaviour */ static const struct file_operations eventpoll_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = ep_show_fdinfo, #endif .release = ep_eventpoll_release, .poll = ep_eventpoll_poll, .llseek = noop_llseek, .unlocked_ioctl = ep_eventpoll_ioctl, .compat_ioctl = compat_ptr_ioctl, }; /* * This is called from eventpoll_release() to unlink files from the eventpoll * interface. We need to have this facility to cleanup correctly files that are * closed without being removed from the eventpoll interface. */ void eventpoll_release_file(struct file *file) { struct eventpoll *ep; struct epitem *epi; bool dispose; /* * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from * touching the epitems list before eventpoll_release_file() can access * the ep->mtx. */ again: spin_lock(&file->f_lock); if (file->f_ep && file->f_ep->first) { epi = hlist_entry(file->f_ep->first, struct epitem, fllink); epi->dying = true; spin_unlock(&file->f_lock); /* * ep access is safe as we still own a reference to the ep * struct */ ep = epi->ep; mutex_lock(&ep->mtx); dispose = __ep_remove(ep, epi, true); mutex_unlock(&ep->mtx); if (dispose && ep_refcount_dec_and_test(ep)) ep_free(ep); goto again; } spin_unlock(&file->f_lock); } static int ep_alloc(struct eventpoll **pep) { struct eventpoll *ep; ep = kzalloc_obj(*ep); if (unlikely(!ep)) return -ENOMEM; mutex_init(&ep->mtx); spin_lock_init(&ep->lock); init_waitqueue_head(&ep->wq); init_waitqueue_head(&ep->poll_wait); INIT_LIST_HEAD(&ep->rdllist); ep->rbr = RB_ROOT_CACHED; ep->ovflist = EP_UNACTIVE_PTR; ep->user = get_current_user(); refcount_set(&ep->refcount, 1); *pep = ep; return 0; } /* * Search the file inside the eventpoll tree. The RB tree operations * are protected by the "mtx" mutex, and ep_find() must be called with * "mtx" held. 
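 * The lookup key is the (struct file *, fd) pair compared by ep_cmp_ffd():
 * file pointers are ordered first and the fd breaks ties, so the same file
 * registered through two different descriptors is tracked by two distinct
 * epitems.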
*/ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) { int kcmp; struct rb_node *rbp; struct epitem *epi, *epir = NULL; struct epoll_filefd ffd; ep_set_ffd(&ffd, file, fd); for (rbp = ep->rbr.rb_root.rb_node; rbp; ) { epi = rb_entry(rbp, struct epitem, rbn); kcmp = ep_cmp_ffd(&ffd, &epi->ffd); if (kcmp > 0) rbp = rbp->rb_right; else if (kcmp < 0) rbp = rbp->rb_left; else { epir = epi; break; } } return epir; } #ifdef CONFIG_KCMP static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff) { struct rb_node *rbp; struct epitem *epi; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (epi->ffd.fd == tfd) { if (toff == 0) return epi; else toff--; } cond_resched(); } return NULL; } struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff) { struct file *file_raw; struct eventpoll *ep; struct epitem *epi; if (!is_file_epoll(file)) return ERR_PTR(-EINVAL); ep = file->private_data; mutex_lock(&ep->mtx); epi = ep_find_tfd(ep, tfd, toff); if (epi) file_raw = epi->ffd.file; else file_raw = ERR_PTR(-ENOENT); mutex_unlock(&ep->mtx); return file_raw; } #endif /* CONFIG_KCMP */ /* * This is the callback that is passed to the wait queue wakeup * mechanism. It is called by the stored file descriptors when they * have events to report. */ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int pwake = 0; struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; __poll_t pollflags = key_to_poll(key); unsigned long flags; int ewake = 0; spin_lock_irqsave(&ep->lock, flags); ep_set_busy_poll_napi_id(epi); /* * If the event mask does not contain any poll(2) event, we consider the * descriptor to be disabled. This condition is likely the effect of the * EPOLLONESHOT bit that disables the descriptor when an event is received, * until the next EPOLL_CTL_MOD will be issued. */ if (!(epi->event.events & ~EP_PRIVATE_BITS)) goto out_unlock; /* * Check the events coming with the callback. At this stage, not * every device reports the events in the "key" parameter of the * callback. We need to be able to handle both cases here, hence the * test for "key" != NULL before the event match test. */ if (pollflags && !(pollflags & epi->event.events)) goto out_unlock; /* * If we are transferring events to userspace, we can hold no locks * (because we're accessing user memory, and because of linux f_op->poll() * semantics). All the events that happen during that period of time are * chained in ep->ovflist and requeued later on. */ if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { if (epi->next == EP_UNACTIVE_PTR) { epi->next = READ_ONCE(ep->ovflist); WRITE_ONCE(ep->ovflist, epi); ep_pm_stay_awake_rcu(epi); } } else if (!ep_is_linked(epi)) { /* In the usual case, add event to ready list. */ list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake_rcu(epi); } /* * Wake up ( if active ) both the eventpoll wait list and the ->poll() * wait list. 
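 * For EPOLLEXCLUSIVE items the ewake value computed below becomes this
 * callback's return value: returning 0 tells the waker that this exclusive
 * waiter did not consume the event, so the wake-up continues to the next
 * entry on the wait queue instead of stopping here.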
*/ if (waitqueue_active(&ep->wq)) { if ((epi->event.events & EPOLLEXCLUSIVE) && !(pollflags & POLLFREE)) { switch (pollflags & EPOLLINOUT_BITS) { case EPOLLIN: if (epi->event.events & EPOLLIN) ewake = 1; break; case EPOLLOUT: if (epi->event.events & EPOLLOUT) ewake = 1; break; case 0: ewake = 1; break; } } if (sync) wake_up_sync(&ep->wq); else wake_up(&ep->wq); } if (waitqueue_active(&ep->poll_wait)) pwake++; out_unlock: spin_unlock_irqrestore(&ep->lock, flags); /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE); if (!(epi->event.events & EPOLLEXCLUSIVE)) ewake = 1; if (pollflags & POLLFREE) { /* * If we race with ep_remove_wait_queue() it can miss * ->whead = NULL and do another remove_wait_queue() after * us, so we can't use __remove_wait_queue(). */ list_del_init(&wait->entry); /* * ->whead != NULL protects us from the race with * ep_clear_and_put() or ep_remove(), ep_remove_wait_queue() * takes whead->lock held by the caller. Once we nullify it, * nothing protects ep/epi or even wait. */ smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL); } return ewake; } /* * This is the callback that is used to add our wait queue to the * target file wakeup lists. */ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, poll_table *pt) { struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt); struct epitem *epi = epq->epi; struct eppoll_entry *pwq; if (unlikely(!epi)) // an earlier allocation has failed return; pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL); if (unlikely(!pwq)) { epq->epi = NULL; return; } init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); pwq->whead = whead; pwq->base = epi; if (epi->event.events & EPOLLEXCLUSIVE) add_wait_queue_exclusive(whead, &pwq->wait); else add_wait_queue(whead, &pwq->wait); pwq->next = epi->pwqlist; epi->pwqlist = pwq; } static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) { int kcmp; struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL; struct epitem *epic; bool leftmost = true; while (*p) { parent = *p; epic = rb_entry(parent, struct epitem, rbn); kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd); if (kcmp > 0) { p = &parent->rb_right; leftmost = false; } else p = &parent->rb_left; } rb_link_node(&epi->rbn, parent, p); rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost); } #define PATH_ARR_SIZE 5 /* * These are the number paths of length 1 to 5, that we are allowing to emanate * from a single file of interest. For example, we allow 1000 paths of length * 1, to emanate from each file of interest. This essentially represents the * potential wakeup paths, which need to be limited in order to avoid massive * uncontrolled wakeup storms. The common use case should be a single ep which * is connected to n file sources. In this case each file source has 1 path * of length 1. Thus, the numbers below should be more than sufficient. These * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify * and delete can't add additional paths. Protected by the epnested_mutex. 
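 * path_count_inc(depth) enforces these limits: depth 0 is deliberately left
 * unlimited, deeper levels fail once path_limits[depth] is exceeded, and the
 * failure propagates out of reverse_path_check() so that ep_insert() rejects
 * the offending EPOLL_CTL_ADD with -EINVAL.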
*/ static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 }; static int path_count[PATH_ARR_SIZE]; static int path_count_inc(int nests) { /* Allow an arbitrary number of depth 1 paths */ if (nests == 0) return 0; if (++path_count[nests] > path_limits[nests]) return -1; return 0; } static void path_count_init(void) { int i; for (i = 0; i < PATH_ARR_SIZE; i++) path_count[i] = 0; } static int reverse_path_check_proc(struct hlist_head *refs, int depth) { int error = 0; struct epitem *epi; if (depth > EP_MAX_NESTS) /* too deep nesting */ return -1; /* CTL_DEL can remove links here, but that can't increase our count */ hlist_for_each_entry_rcu(epi, refs, fllink) { struct hlist_head *refs = &epi->ep->refs; if (hlist_empty(refs)) error = path_count_inc(depth); else error = reverse_path_check_proc(refs, depth + 1); if (error != 0) break; } return error; } /** * reverse_path_check - The tfile_check_list is list of epitem_head, which have * links that are proposed to be newly added. We need to * make sure that those added links don't add too many * paths such that we will spend all our time waking up * eventpoll objects. * * Return: %zero if the proposed links don't create too many paths, * %-1 otherwise. */ static int reverse_path_check(void) { struct epitems_head *p; for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) { int error; path_count_init(); rcu_read_lock(); error = reverse_path_check_proc(&p->epitems, 0); rcu_read_unlock(); if (error) return error; } return 0; } static int ep_create_wakeup_source(struct epitem *epi) { struct name_snapshot n; struct wakeup_source *ws; if (!epi->ep->ws) { epi->ep->ws = wakeup_source_register(NULL, "eventpoll"); if (!epi->ep->ws) return -ENOMEM; } take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry); ws = wakeup_source_register(NULL, n.name.name); release_dentry_name_snapshot(&n); if (!ws) return -ENOMEM; rcu_assign_pointer(epi->ws, ws); return 0; } /* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */ static noinline void ep_destroy_wakeup_source(struct epitem *epi) { struct wakeup_source *ws = ep_wakeup_source(epi); RCU_INIT_POINTER(epi->ws, NULL); /* * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is * used internally by wakeup_source_remove, too (called by * wakeup_source_unregister), so we cannot use call_rcu */ synchronize_rcu(); wakeup_source_unregister(ws); } static int attach_epitem(struct file *file, struct epitem *epi) { struct epitems_head *to_free = NULL; struct hlist_head *head = NULL; struct eventpoll *ep = NULL; if (is_file_epoll(file)) ep = file->private_data; if (ep) { head = &ep->refs; } else if (!READ_ONCE(file->f_ep)) { allocate: to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL); if (!to_free) return -ENOMEM; head = &to_free->epitems; } spin_lock(&file->f_lock); if (!file->f_ep) { if (unlikely(!head)) { spin_unlock(&file->f_lock); goto allocate; } /* See eventpoll_release() for details. */ WRITE_ONCE(file->f_ep, head); to_free = NULL; } hlist_add_head_rcu(&epi->fllink, file->f_ep); spin_unlock(&file->f_lock); free_ephead(to_free); return 0; } /* * Must be called with "mtx" held. 
*/ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, struct file *tfile, int fd, int full_check) { int error, pwake = 0; __poll_t revents; struct epitem *epi; struct ep_pqueue epq; struct eventpoll *tep = NULL; if (is_file_epoll(tfile)) tep = tfile->private_data; lockdep_assert_irqs_enabled(); if (unlikely(percpu_counter_compare(&ep->user->epoll_watches, max_user_watches) >= 0)) return -ENOSPC; percpu_counter_inc(&ep->user->epoll_watches); if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) { percpu_counter_dec(&ep->user->epoll_watches); return -ENOMEM; } /* Item initialization follow here ... */ INIT_LIST_HEAD(&epi->rdllink); epi->ep = ep; ep_set_ffd(&epi->ffd, tfile, fd); epi->event = *event; epi->next = EP_UNACTIVE_PTR; if (tep) mutex_lock_nested(&tep->mtx, 1); /* Add the current item to the list of active epoll hook for this file */ if (unlikely(attach_epitem(tfile, epi) < 0)) { if (tep) mutex_unlock(&tep->mtx); kmem_cache_free(epi_cache, epi); percpu_counter_dec(&ep->user->epoll_watches); return -ENOMEM; } if (full_check && !tep) list_file(tfile); /* * Add the current item to the RB tree. All RB tree operations are * protected by "mtx", and ep_insert() is called with "mtx" held. */ ep_rbtree_insert(ep, epi); if (tep) mutex_unlock(&tep->mtx); /* * ep_remove_safe() calls in the later error paths can't lead to * ep_free() as the ep file itself still holds an ep reference. */ ep_get(ep); /* now check if we've created too many backpaths */ if (unlikely(full_check && reverse_path_check())) { ep_remove_safe(ep, epi); return -EINVAL; } if (epi->event.events & EPOLLWAKEUP) { error = ep_create_wakeup_source(epi); if (error) { ep_remove_safe(ep, epi); return error; } } /* Initialize the poll table using the queue callback */ epq.epi = epi; init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); /* * Attach the item to the poll hooks and get current event bits. * We can safely use the file* here because its usage count has * been increased by the caller of this function. Note that after * this operation completes, the poll callback can start hitting * the new item. */ revents = ep_item_poll(epi, &epq.pt, 1); /* * We have to check if something went wrong during the poll wait queue * install process. Namely an allocation for a wait queue failed due * high memory pressure. */ if (unlikely(!epq.epi)) { ep_remove_safe(ep, epi); return -ENOMEM; } /* We have to drop the new item inside our item list to keep track of it */ spin_lock_irq(&ep->lock); /* record NAPI ID of new item if present */ ep_set_busy_poll_napi_id(epi); /* If the file is already "ready" we drop it inside the ready list */ if (revents && !ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) wake_up(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } spin_unlock_irq(&ep->lock); /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(ep, NULL, 0); return 0; } /* * Modify the interest event mask by dropping an event if the new mask * has a match in the current file status. Must be called with "mtx" held. */ static int ep_modify(struct eventpoll *ep, struct epitem *epi, const struct epoll_event *event) { int pwake = 0; poll_table pt; lockdep_assert_irqs_enabled(); init_poll_funcptr(&pt, NULL); /* * Set the new event interest mask before calling f_op->poll(); * otherwise we might miss an event that happens between the * f_op->poll() call and the new event set registering. 
*/ epi->event.events = event->events; /* need barrier below */ epi->event.data = event->data; /* protected by mtx */ if (epi->event.events & EPOLLWAKEUP) { if (!ep_has_wakeup_source(epi)) ep_create_wakeup_source(epi); } else if (ep_has_wakeup_source(epi)) { ep_destroy_wakeup_source(epi); } /* * The following barrier has two effects: * * 1) Flush epi changes above to other CPUs. This ensures * we do not miss events from ep_poll_callback if an * event occurs immediately after we call f_op->poll(). * We need this because we did not take ep->lock while * changing epi above (but ep_poll_callback does take * ep->lock). * * 2) We also need to ensure we do not miss _past_ events * when calling f_op->poll(). This barrier also * pairs with the barrier in wq_has_sleeper (see * comments for wq_has_sleeper). * * This barrier will now guarantee ep_poll_callback or f_op->poll * (or both) will notice the readiness of an item. */ smp_mb(); /* * Get current event bits. We can safely use the file* here because * its usage count has been increased by the caller of this function. * If the item is "hot" and it is not registered inside the ready * list, push it inside. */ if (ep_item_poll(epi, &pt, 1)) { spin_lock_irq(&ep->lock); if (!ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) wake_up(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } spin_unlock_irq(&ep->lock); } /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(ep, NULL, 0); return 0; } static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events, int maxevents) { struct epitem *epi, *tmp; LIST_HEAD(txlist); poll_table pt; int res = 0; /* * Always short-circuit for fatal signals to allow threads to make a * timely exit without the chance of finding more events available and * fetching repeatedly. */ if (fatal_signal_pending(current)) return -EINTR; init_poll_funcptr(&pt, NULL); mutex_lock(&ep->mtx); ep_start_scan(ep, &txlist); /* * We can loop without lock because we are passed a task private list. * Items cannot vanish during the loop we are holding ep->mtx. */ list_for_each_entry_safe(epi, tmp, &txlist, rdllink) { struct wakeup_source *ws; __poll_t revents; if (res >= maxevents) break; /* * Activate ep->ws before deactivating epi->ws to prevent * triggering auto-suspend here (in case we reactive epi->ws * below). * * This could be rearranged to delay the deactivation of epi->ws * instead, but then epi->ws would temporarily be out of sync * with ep_is_linked(). */ ws = ep_wakeup_source(epi); if (ws) { if (ws->active) __pm_stay_awake(ep->ws); __pm_relax(ws); } list_del_init(&epi->rdllink); /* * If the event mask intersect the caller-requested one, * deliver the event to userspace. Again, we are holding ep->mtx, * so no operations coming from userspace can change the item. */ revents = ep_item_poll(epi, &pt, 1); if (!revents) continue; events = epoll_put_uevent(revents, epi->event.data, events); if (!events) { list_add(&epi->rdllink, &txlist); ep_pm_stay_awake(epi); if (!res) res = -EFAULT; break; } res++; if (epi->event.events & EPOLLONESHOT) epi->event.events &= EP_PRIVATE_BITS; else if (!(epi->event.events & EPOLLET)) { /* * If this file has been added with Level * Trigger mode, we need to insert back inside * the ready list, so that the next call to * epoll_wait() will check again the events * availability. At this point, no one can insert * into ep->rdllist besides us. 
The epoll_ctl() * callers are locked out by * ep_send_events() holding "mtx" and the * poll callback will queue them in ep->ovflist. */ list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); } } ep_done_scan(ep, &txlist); mutex_unlock(&ep->mtx); return res; } static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms) { struct timespec64 now; if (ms < 0) return NULL; if (!ms) { to->tv_sec = 0; to->tv_nsec = 0; return to; } to->tv_sec = ms / MSEC_PER_SEC; to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC); ktime_get_ts64(&now); *to = timespec64_add_safe(now, *to); return to; } /* * autoremove_wake_function, but remove even on failure to wake up, because we * know that default_wake_function/ttwu will only fail if the thread is already * woken, and in that case the ep_poll loop will remove the entry anyways, not * try to reuse it. */ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode, int sync, void *key) { int ret = default_wake_function(wq_entry, mode, sync, key); /* * Pairs with list_empty_careful in ep_poll, and ensures future loop * iterations see the cause of this wakeup. */ list_del_init_careful(&wq_entry->entry); return ret; } static int ep_try_send_events(struct eventpoll *ep, struct epoll_event __user *events, int maxevents) { int res; /* * Try to transfer events to user space. In case we get 0 events and * there's still timeout left over, we go trying again in search of * more luck. */ res = ep_send_events(ep, events, maxevents); if (res > 0) ep_suspend_napi_irqs(ep); return res; } static int ep_schedule_timeout(ktime_t *to) { if (to) return ktime_after(*to, ktime_get()); else return 1; } /** * ep_poll - Retrieves ready events, and delivers them to the caller-supplied * event buffer. * * @ep: Pointer to the eventpoll context. * @events: Pointer to the userspace buffer where the ready events should be * stored. * @maxevents: Size (in terms of number of events) of the caller event buffer. * @timeout: Maximum timeout for the ready events fetch operation, in * timespec. If the timeout is zero, the function will not block, * while if the @timeout ptr is NULL, the function will block * until at least one event has been retrieved (or an error * occurred). * * Return: the number of ready events which have been fetched, or an * error code, in case of error. */ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int maxevents, struct timespec64 *timeout) { int res, eavail, timed_out = 0; u64 slack = 0; wait_queue_entry_t wait; ktime_t expires, *to = NULL; lockdep_assert_irqs_enabled(); if (timeout && (timeout->tv_sec | timeout->tv_nsec)) { slack = select_estimate_accuracy(timeout); to = &expires; *to = timespec64_to_ktime(*timeout); } else if (timeout) { /* * Avoid the unnecessary trip to the wait queue loop, if the * caller specified a non blocking operation. */ timed_out = 1; } /* * This call is racy: We may or may not see events that are being added * to the ready list under the lock (e.g., in IRQ callbacks). For cases * with a non-zero timeout, this thread will check the ready list under * lock and will add to the wait queue. For cases with a zero * timeout, the user by definition should not care and will have to * recheck again. 
*/ eavail = ep_events_available(ep); while (1) { if (eavail) { res = ep_try_send_events(ep, events, maxevents); if (res) return res; } if (timed_out) return 0; eavail = ep_busy_loop(ep); if (eavail) continue; if (signal_pending(current)) return -EINTR; /* * Internally init_wait() uses autoremove_wake_function(), * thus wait entry is removed from the wait queue on each * wakeup. Why it is important? In case of several waiters * each new wakeup will hit the next waiter, giving it the * chance to harvest new event. Otherwise wakeup can be * lost. This is also good performance-wise, because on * normal wakeup path no need to call __remove_wait_queue() * explicitly, thus ep->lock is not taken, which halts the * event delivery. * * In fact, we now use an even more aggressive function that * unconditionally removes, because we don't reuse the wait * entry between loop iterations. This lets us also avoid the * performance issue if a process is killed, causing all of its * threads to wake up without being removed normally. */ init_wait(&wait); wait.func = ep_autoremove_wake_function; spin_lock_irq(&ep->lock); /* * Barrierless variant, waitqueue_active() is called under * the same lock on wakeup ep_poll_callback() side, so it * is safe to avoid an explicit barrier. */ __set_current_state(TASK_INTERRUPTIBLE); /* * Do the final check under the lock. ep_start/done_scan() * plays with two lists (->rdllist and ->ovflist) and there * is always a race when both lists are empty for short * period of time although events are pending, so lock is * important. */ eavail = ep_events_available(ep); if (!eavail) __add_wait_queue_exclusive(&ep->wq, &wait); spin_unlock_irq(&ep->lock); if (!eavail) timed_out = !ep_schedule_timeout(to) || !schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); /* * We were woken up, thus go and try to harvest some events. * If timed out and still on the wait queue, recheck eavail * carefully under lock, below. */ eavail = 1; if (!list_empty_careful(&wait.entry)) { spin_lock_irq(&ep->lock); /* * If the thread timed out and is not on the wait queue, * it means that the thread was woken up after its * timeout expired before it could reacquire the lock. * Thus, when wait.entry is empty, it needs to harvest * events. */ if (timed_out) eavail = list_empty(&wait.entry); __remove_wait_queue(&ep->wq, &wait); spin_unlock_irq(&ep->lock); } } } /** * ep_loop_check_proc - verify that adding an epoll file @ep inside another * epoll file does not create closed loops, and * determine the depth of the subtree starting at @ep * * @ep: the &struct eventpoll to be currently checked. * @depth: Current depth of the path being checked. * * Return: depth of the subtree, or a value bigger than EP_MAX_NESTS if we found * a loop or went too deep. 
*/ static int ep_loop_check_proc(struct eventpoll *ep, int depth) { int result = 0; struct rb_node *rbp; struct epitem *epi; if (ep->gen == loop_check_gen) return ep->loop_check_depth; mutex_lock_nested(&ep->mtx, depth + 1); ep->gen = loop_check_gen; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { struct eventpoll *ep_tovisit; ep_tovisit = epi->ffd.file->private_data; if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS) result = EP_MAX_NESTS+1; else result = max(result, ep_loop_check_proc(ep_tovisit, depth + 1) + 1); if (result > EP_MAX_NESTS) break; } else { /* * If we've reached a file that is not associated with * an ep, then we need to check if the newly added * links are going to add too many wakeup paths. We do * this by adding it to the tfile_check_list, if it's * not already there, and calling reverse_path_check() * during ep_insert(). */ list_file(epi->ffd.file); } } ep->loop_check_depth = result; mutex_unlock(&ep->mtx); return result; } /* ep_get_upwards_depth_proc - determine depth of @ep when traversed upwards */ static int ep_get_upwards_depth_proc(struct eventpoll *ep, int depth) { int result = 0; struct epitem *epi; if (ep->gen == loop_check_gen) return ep->loop_check_depth; hlist_for_each_entry_rcu(epi, &ep->refs, fllink) result = max(result, ep_get_upwards_depth_proc(epi->ep, depth + 1) + 1); ep->gen = loop_check_gen; ep->loop_check_depth = result; return result; } /** * ep_loop_check - Performs a check to verify that adding an epoll file (@to) * into another epoll file (represented by @ep) does not create * closed loops or too deep chains. * * @ep: Pointer to the epoll we are inserting into. * @to: Pointer to the epoll to be inserted. * * Return: %zero if adding the epoll @to inside the epoll @from * does not violate the constraints, or %-1 otherwise. */ static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to) { int depth, upwards_depth; inserting_into = ep; /* * Check how deep down we can get from @to, and whether it is possible * to loop up to @ep. */ depth = ep_loop_check_proc(to, 0); if (depth > EP_MAX_NESTS) return -1; /* Check how far up we can go from @ep. */ rcu_read_lock(); upwards_depth = ep_get_upwards_depth_proc(ep, 0); rcu_read_unlock(); return (depth+1+upwards_depth > EP_MAX_NESTS) ? -1 : 0; } static void clear_tfile_check_list(void) { rcu_read_lock(); while (tfile_check_list != EP_UNACTIVE_PTR) { struct epitems_head *head = tfile_check_list; tfile_check_list = head->next; unlist_file(head); } rcu_read_unlock(); } /* * Open an eventpoll file descriptor. */ static int do_epoll_create(int flags) { int error; struct eventpoll *ep; /* Check the EPOLL_* constant for consistency. */ BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); if (flags & ~EPOLL_CLOEXEC) return -EINVAL; /* * Create the internal data structure ("struct eventpoll"). */ error = ep_alloc(&ep); if (error < 0) return error; /* * Creates all the items needed to setup an eventpoll file. That is, * a file structure and a free file descriptor. 
*/ FD_PREPARE(fdf, O_RDWR | (flags & O_CLOEXEC), anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, O_RDWR | (flags & O_CLOEXEC))); if (fdf.err) { ep_clear_and_put(ep); return fdf.err; } ep->file = fd_prepare_file(fdf); return fd_publish(fdf); } SYSCALL_DEFINE1(epoll_create1, int, flags) { return do_epoll_create(flags); } SYSCALL_DEFINE1(epoll_create, int, size) { if (size <= 0) return -EINVAL; return do_epoll_create(0); } #ifdef CONFIG_PM_SLEEP static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev) { if ((epev->events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND)) epev->events &= ~EPOLLWAKEUP; } #else static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev) { epev->events &= ~EPOLLWAKEUP; } #endif static inline int epoll_mutex_lock(struct mutex *mutex, int depth, bool nonblock) { if (!nonblock) { mutex_lock_nested(mutex, depth); return 0; } if (mutex_trylock(mutex)) return 0; return -EAGAIN; } int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, bool nonblock) { int error; int full_check = 0; struct eventpoll *ep; struct epitem *epi; struct eventpoll *tep = NULL; CLASS(fd, f)(epfd); if (fd_empty(f)) return -EBADF; /* Get the "struct file *" for the target file */ CLASS(fd, tf)(fd); if (fd_empty(tf)) return -EBADF; /* The target file descriptor must support poll */ if (!file_can_poll(fd_file(tf))) return -EPERM; /* Check if EPOLLWAKEUP is allowed */ if (ep_op_has_event(op)) ep_take_care_of_epollwakeup(epds); /* * We have to check that the file structure underneath the file descriptor * the user passed to us _is_ an eventpoll file. And also we do not permit * adding an epoll file descriptor inside itself. */ error = -EINVAL; if (fd_file(f) == fd_file(tf) || !is_file_epoll(fd_file(f))) goto error_tgt_fput; /* * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only, * so EPOLLEXCLUSIVE is not allowed for an EPOLL_CTL_MOD operation. * Also, we do not currently support nested exclusive wakeups. */ if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) { if (op == EPOLL_CTL_MOD) goto error_tgt_fput; if (op == EPOLL_CTL_ADD && (is_file_epoll(fd_file(tf)) || (epds->events & ~EPOLLEXCLUSIVE_OK_BITS))) goto error_tgt_fput; } /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ ep = fd_file(f)->private_data; /* * When we insert an epoll file descriptor inside another epoll file * descriptor, there is the chance of creating closed loops, which are * better handled here than in more critical paths. While we are * checking for loops we also determine the list of files reachable * and hang them on the tfile_check_list, so we can check that we * haven't created too many possible wakeup paths. * * We do not need to take the global 'epnested_mutex' on EPOLL_CTL_ADD when * the epoll file descriptor is attaching directly to a wakeup source, * unless the epoll file descriptor is nested. The purpose of taking the * 'epnested_mutex' on add is to prevent complex topologies such as loops and * deep wakeup paths from forming in parallel through multiple * EPOLL_CTL_ADD operations. 
*/ error = epoll_mutex_lock(&ep->mtx, 0, nonblock); if (error) goto error_tgt_fput; if (op == EPOLL_CTL_ADD) { if (READ_ONCE(fd_file(f)->f_ep) || ep->gen == loop_check_gen || is_file_epoll(fd_file(tf))) { mutex_unlock(&ep->mtx); error = epoll_mutex_lock(&epnested_mutex, 0, nonblock); if (error) goto error_tgt_fput; loop_check_gen++; full_check = 1; if (is_file_epoll(fd_file(tf))) { tep = fd_file(tf)->private_data; error = -ELOOP; if (ep_loop_check(ep, tep) != 0) goto error_tgt_fput; } error = epoll_mutex_lock(&ep->mtx, 0, nonblock); if (error) goto error_tgt_fput; } } /* * Try to lookup the file inside our RB tree. Since we grabbed "mtx" * above, we can be sure to be able to use the item looked up by * ep_find() till we release the mutex. */ epi = ep_find(ep, fd_file(tf), fd); error = -EINVAL; switch (op) { case EPOLL_CTL_ADD: if (!epi) { epds->events |= EPOLLERR | EPOLLHUP; error = ep_insert(ep, epds, fd_file(tf), fd, full_check); } else error = -EEXIST; break; case EPOLL_CTL_DEL: if (epi) { /* * The eventpoll itself is still alive: the refcount * can't go to zero here. */ ep_remove_safe(ep, epi); error = 0; } else { error = -ENOENT; } break; case EPOLL_CTL_MOD: if (epi) { if (!(epi->event.events & EPOLLEXCLUSIVE)) { epds->events |= EPOLLERR | EPOLLHUP; error = ep_modify(ep, epi, epds); } } else error = -ENOENT; break; } mutex_unlock(&ep->mtx); error_tgt_fput: if (full_check) { clear_tfile_check_list(); loop_check_gen++; mutex_unlock(&epnested_mutex); } return error; } /* * The following function implements the controller interface for * the eventpoll file that enables the insertion/removal/change of * file descriptors inside the interest set. */ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event __user *, event) { struct epoll_event epds; if (ep_op_has_event(op) && copy_from_user(&epds, event, sizeof(struct epoll_event))) return -EFAULT; return do_epoll_ctl(epfd, op, fd, &epds, false); } static int ep_check_params(struct file *file, struct epoll_event __user *evs, int maxevents) { /* The maximum number of event must be greater than zero */ if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) return -EINVAL; /* Verify that the area passed by the user is writeable */ if (!access_ok(evs, maxevents * sizeof(struct epoll_event))) return -EFAULT; /* * We have to check that the file structure underneath the fd * the user passed to us _is_ an eventpoll file. */ if (!is_file_epoll(file)) return -EINVAL; return 0; } int epoll_sendevents(struct file *file, struct epoll_event __user *events, int maxevents) { struct eventpoll *ep; int ret; ret = ep_check_params(file, events, maxevents); if (unlikely(ret)) return ret; ep = file->private_data; /* * Racy call, but that's ok - it should get retried based on * poll readiness anyway. */ if (ep_events_available(ep)) return ep_try_send_events(ep, events, maxevents); return 0; } /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_wait(2). */ static int do_epoll_wait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *to) { struct eventpoll *ep; int ret; /* Get the "struct file *" for the eventpoll file */ CLASS(fd, f)(epfd); if (fd_empty(f)) return -EBADF; ret = ep_check_params(fd_file(f), events, maxevents); if (unlikely(ret)) return ret; /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ ep = fd_file(f)->private_data; /* Time to fish for events ... 
*/ return ep_poll(ep, events, maxevents, to); } SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout) { struct timespec64 to; return do_epoll_wait(epfd, events, maxevents, ep_timeout_to_timespec(&to, timeout)); } /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_pwait(2). */ static int do_epoll_pwait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *to, const sigset_t __user *sigmask, size_t sigsetsize) { int error; /* * If the caller wants a certain signal mask to be set during the wait, * we apply it here. */ error = set_user_sigmask(sigmask, sigsetsize); if (error) return error; error = do_epoll_wait(epfd, events, maxevents, to); restore_saved_sigmask_unless(error == -EINTR); return error; } SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 to; return do_epoll_pwait(epfd, events, maxevents, ep_timeout_to_timespec(&to, timeout), sigmask, sigsetsize); } SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events, int, maxevents, const struct __kernel_timespec __user *, timeout, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 ts, *to = NULL; if (timeout) { if (get_timespec64(&ts, timeout)) return -EFAULT; to = &ts; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } return do_epoll_pwait(epfd, events, maxevents, to, sigmask, sigsetsize); } #ifdef CONFIG_COMPAT static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *timeout, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize) { long err; /* * If the caller wants a certain signal mask to be set during the wait, * we apply it here. */ err = set_compat_user_sigmask(sigmask, sigsetsize); if (err) return err; err = do_epoll_wait(epfd, events, maxevents, timeout); restore_saved_sigmask_unless(err == -EINTR); return err; } COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 to; return do_compat_epoll_pwait(epfd, events, maxevents, ep_timeout_to_timespec(&to, timeout), sigmask, sigsetsize); } COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events, int, maxevents, const struct __kernel_timespec __user *, timeout, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, *to = NULL; if (timeout) { if (get_timespec64(&ts, timeout)) return -EFAULT; to = &ts; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } return do_compat_epoll_pwait(epfd, events, maxevents, to, sigmask, sigsetsize); } #endif static int __init eventpoll_init(void) { struct sysinfo si; si_meminfo(&si); /* * Allows top 4% of lomem to be allocated for epoll watches (per user). 
*/ max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) / EP_ITEM_COST; BUG_ON(max_user_watches < 0); /* * We can have many thousands of epitems, so prevent this from * using an extra cache line on 64-bit (and smaller) CPUs */ BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128); /* Allocates slab cache used to allocate "struct epitem" items */ epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); /* Allocates slab cache used to allocate "struct eppoll_entry" */ pwq_cache = kmem_cache_create("eventpoll_pwq", sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); epoll_sysctls_init(); ephead_cache = kmem_cache_create("ep_head", sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); return 0; } fs_initcall(eventpoll_init);
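The syscalls defined above (epoll_create1/epoll_create, epoll_ctl, epoll_wait and the pwait variants) are the whole userspace surface of this file. As a quick orientation, here is a minimal userspace sketch, not part of the kernel sources, that drives exactly that path: do_epoll_create() via epoll_create1(), ep_insert() via EPOLL_CTL_ADD, and ep_poll()/ep_send_events() via epoll_wait(). The pipe is just an arbitrary pollable fd chosen for illustration.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/epoll.h>

int main(void)
{
	struct epoll_event ev, ready;
	int pfd[2], epfd, n;

	if (pipe(pfd) < 0)
		return 1;

	epfd = epoll_create1(EPOLL_CLOEXEC);	/* do_epoll_create() */
	if (epfd < 0)
		return 1;

	memset(&ev, 0, sizeof(ev));
	ev.events = EPOLLIN;
	ev.data.fd = pfd[0];
	/* EPOLL_CTL_ADD -> do_epoll_ctl() -> ep_insert() */
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, pfd[0], &ev) < 0)
		return 1;

	/* make pfd[0] readable so the wakeup path (ep_poll_callback) fires */
	if (write(pfd[1], "x", 1) != 1)
		return 1;

	/* epoll_wait() -> ep_poll() -> ep_send_events(); 1000 ms timeout */
	n = epoll_wait(epfd, &ready, 1, 1000);
	if (n == 1)
		printf("fd %d ready, events 0x%x\n", ready.data.fd, ready.events);

	close(epfd);
	close(pfd[0]);
	close(pfd[1]);
	return 0;
}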
// SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> #include <linux/filter.h> #include <linux/kmod.h> #include <linux/module.h> #include <linux/netfilter.h> #include <net/netfilter/nf_bpf_link.h> #include <uapi/linux/netfilter_ipv4.h> static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb, const struct nf_hook_state *s) { const struct bpf_prog *prog = bpf_prog; struct bpf_nf_ctx ctx = { .state = s, .skb = skb, }; return bpf_prog_run_pin_on_cpu(prog, &ctx); } struct bpf_nf_link { struct bpf_link link; struct nf_hook_ops hook_ops; netns_tracker ns_tracker; struct net *net; u32 dead; const struct nf_defrag_hook *defrag_hook; }; #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) static const struct nf_defrag_hook * get_proto_defrag_hook(struct bpf_nf_link *link, const struct nf_defrag_hook __rcu **ptr_global_hook, const char *mod) { const struct nf_defrag_hook *hook; int err; /* RCU protects us from races against module unloading */ rcu_read_lock(); hook = rcu_dereference(*ptr_global_hook); if (!hook) { rcu_read_unlock(); err = request_module("%s", mod); if (err) return ERR_PTR(err < 0 ? 
err : -EINVAL); rcu_read_lock(); hook = rcu_dereference(*ptr_global_hook); } if (hook && try_module_get(hook->owner)) { /* Once we have a refcnt on the module, we no longer need RCU */ hook = rcu_pointer_handoff(hook); } else { WARN_ONCE(!hook, "%s has bad registration", mod); hook = ERR_PTR(-ENOENT); } rcu_read_unlock(); if (!IS_ERR(hook)) { err = hook->enable(link->net); if (err) { module_put(hook->owner); hook = ERR_PTR(err); } } return hook; } #endif static int bpf_nf_enable_defrag(struct bpf_nf_link *link) { const struct nf_defrag_hook __maybe_unused *hook; switch (link->hook_ops.pf) { #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) case NFPROTO_IPV4: hook = get_proto_defrag_hook(link, &nf_defrag_v4_hook, "nf_defrag_ipv4"); if (IS_ERR(hook)) return PTR_ERR(hook); link->defrag_hook = hook; return 0; #endif #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) case NFPROTO_IPV6: hook = get_proto_defrag_hook(link, &nf_defrag_v6_hook, "nf_defrag_ipv6"); if (IS_ERR(hook)) return PTR_ERR(hook); link->defrag_hook = hook; return 0; #endif default: return -EAFNOSUPPORT; } } static void bpf_nf_disable_defrag(struct bpf_nf_link *link) { const struct nf_defrag_hook *hook = link->defrag_hook; if (!hook) return; hook->disable(link->net); module_put(hook->owner); } static void bpf_nf_link_release(struct bpf_link *link) { struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link); if (nf_link->dead) return; /* do not double release in case .detach was already called */ if (!cmpxchg(&nf_link->dead, 0, 1)) { nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops); bpf_nf_disable_defrag(nf_link); put_net_track(nf_link->net, &nf_link->ns_tracker); } } static void bpf_nf_link_dealloc(struct bpf_link *link) { struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link); kfree(nf_link); } static int bpf_nf_link_detach(struct bpf_link *link) { bpf_nf_link_release(link); return 0; } static void bpf_nf_link_show_info(const struct bpf_link *link, struct seq_file *seq) { struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link); seq_printf(seq, "pf:\t%u\thooknum:\t%u\tprio:\t%d\n", nf_link->hook_ops.pf, nf_link->hook_ops.hooknum, nf_link->hook_ops.priority); } static int bpf_nf_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link); const struct nf_defrag_hook *hook = nf_link->defrag_hook; info->netfilter.pf = nf_link->hook_ops.pf; info->netfilter.hooknum = nf_link->hook_ops.hooknum; info->netfilter.priority = nf_link->hook_ops.priority; info->netfilter.flags = hook ? 
BPF_F_NETFILTER_IP_DEFRAG : 0; return 0; } static int bpf_nf_link_update(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog) { return -EOPNOTSUPP; } static const struct bpf_link_ops bpf_nf_link_lops = { .release = bpf_nf_link_release, .dealloc_deferred = bpf_nf_link_dealloc, .detach = bpf_nf_link_detach, .show_fdinfo = bpf_nf_link_show_info, .fill_link_info = bpf_nf_link_fill_link_info, .update_prog = bpf_nf_link_update, }; static int bpf_nf_check_pf_and_hooks(const union bpf_attr *attr) { int prio; switch (attr->link_create.netfilter.pf) { case NFPROTO_IPV4: case NFPROTO_IPV6: if (attr->link_create.netfilter.hooknum >= NF_INET_NUMHOOKS) return -EPROTO; break; default: return -EAFNOSUPPORT; } if (attr->link_create.netfilter.flags & ~BPF_F_NETFILTER_IP_DEFRAG) return -EOPNOTSUPP; /* make sure conntrack confirm is always last */ prio = attr->link_create.netfilter.priority; if (prio == NF_IP_PRI_FIRST) return -ERANGE; /* sabotage_in and other warts */ else if (prio == NF_IP_PRI_LAST) return -ERANGE; /* e.g. conntrack confirm */ else if ((attr->link_create.netfilter.flags & BPF_F_NETFILTER_IP_DEFRAG) && prio <= NF_IP_PRI_CONNTRACK_DEFRAG) return -ERANGE; /* cannot use defrag if prog runs before nf_defrag */ return 0; } int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct net *net = current->nsproxy->net_ns; struct bpf_link_primer link_primer; struct bpf_nf_link *link; int err; if (attr->link_create.flags) return -EINVAL; err = bpf_nf_check_pf_and_hooks(attr); if (err) return err; link = kzalloc_obj(*link, GFP_USER); if (!link) return -ENOMEM; bpf_link_init(&link->link, BPF_LINK_TYPE_NETFILTER, &bpf_nf_link_lops, prog, attr->link_create.attach_type); link->hook_ops.hook = nf_hook_run_bpf; link->hook_ops.hook_ops_type = NF_HOOK_OP_BPF; link->hook_ops.priv = prog; link->hook_ops.pf = attr->link_create.netfilter.pf; link->hook_ops.priority = attr->link_create.netfilter.priority; link->hook_ops.hooknum = attr->link_create.netfilter.hooknum; link->net = net; link->dead = false; link->defrag_hook = NULL; err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); return err; } if (attr->link_create.netfilter.flags & BPF_F_NETFILTER_IP_DEFRAG) { err = bpf_nf_enable_defrag(link); if (err) { bpf_link_cleanup(&link_primer); return err; } } err = nf_register_net_hook(net, &link->hook_ops); if (err) { bpf_nf_disable_defrag(link); bpf_link_cleanup(&link_primer); return err; } get_net_track(net, &link->ns_tracker, GFP_KERNEL); return bpf_link_settle(&link_primer); } const struct bpf_prog_ops netfilter_prog_ops = { .test_run = bpf_prog_test_run_nf, }; static bool nf_ptr_to_btf_id(struct bpf_insn_access_aux *info, const char *name) { struct btf *btf; s32 type_id; btf = bpf_get_btf_vmlinux(); if (IS_ERR_OR_NULL(btf)) return false; type_id = btf_find_by_name_kind(btf, name, BTF_KIND_STRUCT); if (WARN_ON_ONCE(type_id < 0)) return false; info->btf = btf; info->btf_id = type_id; info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED; return true; } static bool nf_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct bpf_nf_ctx)) return false; if (off % size != 0) return false; if (type == BPF_WRITE) return false; switch (off) { case bpf_ctx_range(struct bpf_nf_ctx, skb): if (size != sizeof_field(struct bpf_nf_ctx, skb)) return false; return nf_ptr_to_btf_id(info, "sk_buff"); case bpf_ctx_range(struct bpf_nf_ctx, state): if (size != sizeof_field(struct 
bpf_nf_ctx, state)) return false; return nf_ptr_to_btf_id(info, "nf_hook_state"); default: return false; } return false; } static const struct bpf_func_proto * bpf_nf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { return bpf_base_func_proto(func_id, prog); } const struct bpf_verifier_ops netfilter_verifier_ops = { .is_valid_access = nf_is_valid_access, .get_func_proto = bpf_nf_func_proto, };
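For context, the link type implemented above is what a BPF_PROG_TYPE_NETFILTER program attaches through. Below is a minimal sketch of the BPF program side, not part of this file: it receives the struct bpf_nf_ctx that nf_is_valid_access() validates above and accepts every packet. The SEC("netfilter") section name and the vmlinux.h workflow are libbpf/BTF conventions assumed to be available; treat the build details as illustrative.

/* nf_accept.bpf.c - minimal BPF_PROG_TYPE_NETFILTER program (illustrative sketch) */
#include "vmlinux.h"		/* assumed to provide struct bpf_nf_ctx via kernel BTF */
#include <bpf/bpf_helpers.h>

#define NF_DROP   0
#define NF_ACCEPT 1

SEC("netfilter")
int nf_accept_all(struct bpf_nf_ctx *ctx)
{
	/* ctx->skb and ctx->state are readable as trusted BTF pointers,
	 * as enforced by nf_is_valid_access()/nf_ptr_to_btf_id() above. */
	if (!ctx->skb)
		return NF_DROP;
	return NF_ACCEPT;
}

char _license[] SEC("license") = "GPL";

Userspace would then attach it with bpf_link_create(prog_fd, 0, BPF_NETFILTER, &opts), filling the link_create.netfilter fields (pf, hooknum, priority, flags) that bpf_nf_link_attach() and bpf_nf_check_pf_and_hooks() validate above; recent libbpf exposes these through bpf_link_create_opts, though the exact field layout on the userspace side is an assumption to check against your libbpf version.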
// SPDX-License-Identifier: GPL-2.0-or-later /* * GRE over IPv6 protocol decoder. * * Authors: Dmitry Kozlov (xeb@mail.ru) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/in6.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/netfilter_ipv4.h> #include <linux/etherdevice.h> #include <linux/if_ether.h> #include <linux/hash.h> #include <linux/if_tunnel.h> #include <linux/ip6_tunnel.h> #include <net/sock.h> #include <net/ip.h> #include <net/ip_tunnels.h> #include <net/icmp.h> #include <net/protocol.h> #include <net/addrconf.h> #include <net/arp.h> #include <net/checksum.h> #include <net/dsfield.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/netdev_lock.h> #include <net/rtnetlink.h> #include <net/ipv6.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #include <net/ip6_tunnel.h> #include <net/gre.h> #include <net/erspan.h> #include <net/dst_metadata.h> static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); #define IP6_GRE_HASH_SIZE_SHIFT 5 #define IP6_GRE_HASH_SIZE (1 << IP6_GRE_HASH_SIZE_SHIFT) static unsigned int ip6gre_net_id __read_mostly; struct ip6gre_net { struct ip6_tnl __rcu *tunnels[4][IP6_GRE_HASH_SIZE]; struct ip6_tnl __rcu *collect_md_tun; struct ip6_tnl __rcu *collect_md_tun_erspan; struct net_device *fb_tunnel_dev; }; static struct rtnl_link_ops ip6gre_link_ops __read_mostly; static struct rtnl_link_ops ip6gre_tap_ops __read_mostly; static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly; static int ip6gre_tunnel_init(struct net_device *dev); static void ip6gre_tunnel_setup(struct net_device *dev); static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t); static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu); static void ip6erspan_tnl_link_config(struct ip6_tnl *t, int set_mtu); /* Tunnel hash table */ /* 4 hash tables: 3: (remote,local) 2: (remote,*) 1: (*,local) 0: (*,*) We require an exact key match, i.e. if a key is present in the packet it will match only a tunnel with the same key; if it is not present, it will match only a keyless tunnel. All keyless packets, if not matched against configured keyless tunnels, will match the fallback tunnel. 
*/ #define HASH_KEY(key) (((__force u32)key^((__force u32)key>>4))&(IP6_GRE_HASH_SIZE - 1)) static u32 HASH_ADDR(const struct in6_addr *addr) { u32 hash = ipv6_addr_hash(addr); return hash_32(hash, IP6_GRE_HASH_SIZE_SHIFT); } #define tunnels_r_l tunnels[3] #define tunnels_r tunnels[2] #define tunnels_l tunnels[1] #define tunnels_wc tunnels[0] static bool ip6gre_tunnel_match(struct ip6_tnl *t, int dev_type, int link, int *cand_score, struct ip6_tnl **ret) { int score = 0; if (t->dev->type != ARPHRD_IP6GRE && t->dev->type != dev_type) return false; if (t->parms.link != link) score |= 1; if (t->dev->type != dev_type) score |= 2; if (score == 0) { *ret = t; return true; } if (score < *cand_score) { *ret = t; *cand_score = score; } return false; } /* Given src, dst and key, find appropriate for input tunnel. */ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, const struct in6_addr *remote, const struct in6_addr *local, __be32 key, __be16 gre_proto) { struct net *net = dev_net(dev); int link = dev->ifindex; unsigned int h0 = HASH_ADDR(remote); unsigned int h1 = HASH_KEY(key); struct ip6_tnl *t, *cand = NULL; struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); int dev_type = (gre_proto == htons(ETH_P_TEB) || gre_proto == htons(ETH_P_ERSPAN) || gre_proto == htons(ETH_P_ERSPAN2)) ? ARPHRD_ETHER : ARPHRD_IP6GRE; struct net_device *ndev; int cand_score = 4; for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) { if (!ipv6_addr_equal(local, &t->parms.laddr) || !ipv6_addr_equal(remote, &t->parms.raddr) || key != t->parms.i_key || !(t->dev->flags & IFF_UP)) continue; if (ip6gre_tunnel_match(t, dev_type, link, &cand_score, &cand)) return cand; } for_each_ip_tunnel_rcu(t, ign->tunnels_r[h0 ^ h1]) { if (!ipv6_addr_equal(remote, &t->parms.raddr) || key != t->parms.i_key || !(t->dev->flags & IFF_UP)) continue; if (ip6gre_tunnel_match(t, dev_type, link, &cand_score, &cand)) return cand; } for_each_ip_tunnel_rcu(t, ign->tunnels_l[h1]) { if ((!ipv6_addr_equal(local, &t->parms.laddr) && (!ipv6_addr_equal(local, &t->parms.raddr) || !ipv6_addr_is_multicast(local))) || key != t->parms.i_key || !(t->dev->flags & IFF_UP)) continue; if (ip6gre_tunnel_match(t, dev_type, link, &cand_score, &cand)) return cand; } for_each_ip_tunnel_rcu(t, ign->tunnels_wc[h1]) { if (t->parms.i_key != key || !(t->dev->flags & IFF_UP)) continue; if (ip6gre_tunnel_match(t, dev_type, link, &cand_score, &cand)) return cand; } if (cand) return cand; if (gre_proto == htons(ETH_P_ERSPAN) || gre_proto == htons(ETH_P_ERSPAN2)) t = rcu_dereference(ign->collect_md_tun_erspan); else t = rcu_dereference(ign->collect_md_tun); if (t && t->dev->flags & IFF_UP) return t; ndev = READ_ONCE(ign->fb_tunnel_dev); if (ndev && ndev->flags & IFF_UP) return netdev_priv(ndev); return NULL; } static struct ip6_tnl __rcu **__ip6gre_bucket(struct ip6gre_net *ign, const struct __ip6_tnl_parm *p) { const struct in6_addr *remote = &p->raddr; const struct in6_addr *local = &p->laddr; unsigned int h = HASH_KEY(p->i_key); int prio = 0; if (!ipv6_addr_any(local)) prio |= 1; if (!ipv6_addr_any(remote) && !ipv6_addr_is_multicast(remote)) { prio |= 2; h ^= HASH_ADDR(remote); } return &ign->tunnels[prio][h]; } static void ip6gre_tunnel_link_md(struct ip6gre_net *ign, struct ip6_tnl *t) { if (t->parms.collect_md) rcu_assign_pointer(ign->collect_md_tun, t); } static void ip6erspan_tunnel_link_md(struct ip6gre_net *ign, struct ip6_tnl *t) { if (t->parms.collect_md) rcu_assign_pointer(ign->collect_md_tun_erspan, t); } static void ip6gre_tunnel_unlink_md(struct 
ip6gre_net *ign, struct ip6_tnl *t) { if (t->parms.collect_md) rcu_assign_pointer(ign->collect_md_tun, NULL); } static void ip6erspan_tunnel_unlink_md(struct ip6gre_net *ign, struct ip6_tnl *t) { if (t->parms.collect_md) rcu_assign_pointer(ign->collect_md_tun_erspan, NULL); } static inline struct ip6_tnl __rcu **ip6gre_bucket(struct ip6gre_net *ign, const struct ip6_tnl *t) { return __ip6gre_bucket(ign, &t->parms); } static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp = ip6gre_bucket(ign, t); rcu_assign_pointer(t->next, rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } static void ip6gre_tunnel_unlink(struct ip6gre_net *ign, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp; struct ip6_tnl *iter; for (tp = ip6gre_bucket(ign, t); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { if (t == iter) { rcu_assign_pointer(*tp, t->next); break; } } } static struct ip6_tnl *ip6gre_tunnel_find(struct net *net, const struct __ip6_tnl_parm *parms, int type) { const struct in6_addr *remote = &parms->raddr; const struct in6_addr *local = &parms->laddr; __be32 key = parms->i_key; int link = parms->link; struct ip6_tnl *t; struct ip6_tnl __rcu **tp; struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); for (tp = __ip6gre_bucket(ign, parms); (t = rtnl_dereference(*tp)) != NULL; tp = &t->next) if (ipv6_addr_equal(local, &t->parms.laddr) && ipv6_addr_equal(remote, &t->parms.raddr) && key == t->parms.i_key && link == t->parms.link && type == t->dev->type) break; return t; } static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, const struct __ip6_tnl_parm *parms, int create) { struct ip6_tnl *t, *nt; struct net_device *dev; char name[IFNAMSIZ]; struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); t = ip6gre_tunnel_find(net, parms, ARPHRD_IP6GRE); if (t && create) return NULL; if (t || !create) return t; if (parms->name[0]) { if (!dev_valid_name(parms->name)) return NULL; strscpy(name, parms->name); } else { strscpy(name, "ip6gre%d"); } dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, ip6gre_tunnel_setup); if (!dev) return NULL; dev_net_set(dev, net); nt = netdev_priv(dev); nt->parms = *parms; dev->rtnl_link_ops = &ip6gre_link_ops; nt->dev = dev; nt->net = dev_net(dev); if (register_netdevice(dev) < 0) goto failed_free; ip6gre_tnl_link_config(nt, 1); ip6gre_tunnel_link(ign, nt); return nt; failed_free: free_netdev(dev); return NULL; } static void ip6erspan_tunnel_uninit(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id); ip6erspan_tunnel_unlink_md(ign, t); ip6gre_tunnel_unlink(ign, t); dst_cache_reset(&t->dst_cache); netdev_put(dev, &t->dev_tracker); } static void ip6gre_tunnel_uninit(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id); ip6gre_tunnel_unlink_md(ign, t); ip6gre_tunnel_unlink(ign, t); if (ign->fb_tunnel_dev == dev) WRITE_ONCE(ign->fb_tunnel_dev, NULL); dst_cache_reset(&t->dst_cache); netdev_put(dev, &t->dev_tracker); } static int ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct net *net = dev_net(skb->dev); const struct ipv6hdr *ipv6h; struct tnl_ptk_info tpi; struct ip6_tnl *t; if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IPV6), offset) < 0) return -EINVAL; ipv6h = (const struct ipv6hdr *)skb->data; t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr, tpi.key, tpi.proto); if (!t) return -ENOENT; 
switch (type) { case ICMPV6_DEST_UNREACH: net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", t->parms.name); if (code != ICMPV6_PORT_UNREACH) break; return 0; case ICMPV6_TIME_EXCEED: if (code == ICMPV6_EXC_HOPLIMIT) { net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", t->parms.name); break; } return 0; case ICMPV6_PARAMPROB: { struct ipv6_tlv_tnl_enc_lim *tel; __u32 teli; teli = 0; if (code == ICMPV6_HDR_FIELD) teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data); if (teli && teli == be32_to_cpu(info) - 2) { tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; if (tel->encap_limit == 0) { net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", t->parms.name); } } else { net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", t->parms.name); } return 0; } case ICMPV6_PKT_TOOBIG: ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); return 0; case NDISC_REDIRECT: ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); return 0; } if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; t->err_time = jiffies; return 0; } static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) { const struct ipv6hdr *ipv6h; struct ip6_tnl *tunnel; ipv6h = ipv6_hdr(skb); tunnel = ip6gre_tunnel_lookup(skb->dev, &ipv6h->saddr, &ipv6h->daddr, tpi->key, tpi->proto); if (tunnel) { if (tunnel->parms.collect_md) { IP_TUNNEL_DECLARE_FLAGS(flags); struct metadata_dst *tun_dst; __be64 tun_id; ip_tunnel_flags_copy(flags, tpi->flags); tun_id = key32_to_tunnel_id(tpi->key); tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, 0); if (!tun_dst) return PACKET_REJECT; ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); } else { ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error); } return PACKET_RCVD; } return PACKET_REJECT; } static int ip6erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, int gre_hdr_len) { struct erspan_base_hdr *ershdr; const struct ipv6hdr *ipv6h; struct erspan_md2 *md2; struct ip6_tnl *tunnel; u8 ver; if (unlikely(!pskb_may_pull(skb, sizeof(*ershdr)))) return PACKET_REJECT; ipv6h = ipv6_hdr(skb); ershdr = (struct erspan_base_hdr *)skb->data; ver = ershdr->ver; tunnel = ip6gre_tunnel_lookup(skb->dev, &ipv6h->saddr, &ipv6h->daddr, tpi->key, tpi->proto); if (tunnel) { int len = erspan_hdr_len(ver); if (unlikely(!pskb_may_pull(skb, len))) return PACKET_REJECT; if (__iptunnel_pull_header(skb, len, htons(ETH_P_TEB), false, false) < 0) return PACKET_REJECT; if (tunnel->parms.collect_md) { struct erspan_metadata *pkt_md, *md; IP_TUNNEL_DECLARE_FLAGS(flags); struct metadata_dst *tun_dst; struct ip_tunnel_info *info; unsigned char *gh; __be64 tun_id; __set_bit(IP_TUNNEL_KEY_BIT, tpi->flags); ip_tunnel_flags_copy(flags, tpi->flags); tun_id = key32_to_tunnel_id(tpi->key); tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, sizeof(*md)); if (!tun_dst) return PACKET_REJECT; /* MUST set options_len before referencing options */ info = &tun_dst->u.tun_info; info->options_len = sizeof(*md); /* skb can be uncloned in __iptunnel_pull_header, so * old pkt_md is no longer valid and we need to reset * it */ gh = skb_network_header(skb) + skb_network_header_len(skb); pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len + sizeof(*ershdr)); md = ip_tunnel_info_opts(info); md->version = ver; md2 = &md->u.md2; memcpy(md2, pkt_md, ver == 1 ? 
ERSPAN_V1_MDSIZE : ERSPAN_V2_MDSIZE); __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags); ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); } else { ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error); } return PACKET_RCVD; } return PACKET_REJECT; } static int gre_rcv(struct sk_buff *skb) { struct tnl_ptk_info tpi; bool csum_err = false; int hdr_len; hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IPV6), 0); if (hdr_len < 0) goto drop; if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false)) goto drop; if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) || tpi.proto == htons(ETH_P_ERSPAN2))) { if (ip6erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) return 0; goto out; } if (ip6gre_rcv(skb, &tpi) == PACKET_RCVD) return 0; out: icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); drop: dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb(skb); return 0; } static int gre_handle_offloads(struct sk_buff *skb, bool csum) { return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } static void prepare_ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev, struct flowi6 *fl6, __u8 *dsfield, int *encap_limit) { const struct iphdr *iph = ip_hdr(skb); struct ip6_tnl *t = netdev_priv(dev); if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) *encap_limit = t->parms.encap_limit; memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6)); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) *dsfield = ipv4_get_dsfield(iph); else *dsfield = ip6_tclass(t->parms.flowinfo); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6->flowi6_mark = skb->mark; else fl6->flowi6_mark = t->parms.fwmark; fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); } static int prepare_ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev, struct flowi6 *fl6, __u8 *dsfield, int *encap_limit) { struct ipv6hdr *ipv6h; struct ip6_tnl *t = netdev_priv(dev); __u16 offset; offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */ ipv6h = ipv6_hdr(skb); if (offset > 0) { struct ipv6_tlv_tnl_enc_lim *tel; tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; if (tel->encap_limit == 0) { icmpv6_ndo_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD, offset + 2); return -1; } *encap_limit = tel->encap_limit - 1; } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) { *encap_limit = t->parms.encap_limit; } memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6)); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) *dsfield = ipv6_get_dsfield(ipv6h); else *dsfield = ip6_tclass(t->parms.flowinfo); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) fl6->flowlabel |= ip6_flowlabel(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6->flowi6_mark = skb->mark; else fl6->flowi6_mark = t->parms.fwmark; fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); return 0; } static int prepare_ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev, struct flowi6 *fl6, __u8 *dsfield, int *encap_limit) { struct ip6_tnl *t = netdev_priv(dev); if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) *encap_limit = t->parms.encap_limit; memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6)); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) *dsfield = 0; else *dsfield = ip6_tclass(t->parms.flowinfo); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6->flowi6_mark = skb->mark; else fl6->flowi6_mark = t->parms.fwmark; fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); return 0; } static struct ip_tunnel_info *skb_tunnel_info_txcheck(struct sk_buff *skb) { struct 
ip_tunnel_info *tun_info; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX))) return ERR_PTR(-EINVAL); return tun_info; } static netdev_tx_t __gre6_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct flowi6 *fl6, int encap_limit, __u32 *pmtu, __be16 proto) { struct ip6_tnl *tunnel = netdev_priv(dev); IP_TUNNEL_DECLARE_FLAGS(flags); __be16 protocol; if (dev->type == ARPHRD_ETHER) IPCB(skb)->flags = 0; if (dev->header_ops && dev->type == ARPHRD_IP6GRE) fl6->daddr = ((struct ipv6hdr *)skb->data)->daddr; else fl6->daddr = tunnel->parms.raddr; /* Push GRE header. */ protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto; if (tunnel->parms.collect_md) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; int tun_hlen; tun_info = skb_tunnel_info_txcheck(skb); if (IS_ERR(tun_info) || unlikely(ip_tunnel_info_af(tun_info) != AF_INET6)) return -EINVAL; key = &tun_info->key; memset(fl6, 0, sizeof(*fl6)); fl6->flowi6_proto = IPPROTO_GRE; fl6->daddr = key->u.ipv6.dst; fl6->flowlabel = key->label; fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); fl6->fl6_gre_key = tunnel_id_to_key32(key->tun_id); dsfield = key->tos; ip_tunnel_flags_zero(flags); __set_bit(IP_TUNNEL_CSUM_BIT, flags); __set_bit(IP_TUNNEL_KEY_BIT, flags); __set_bit(IP_TUNNEL_SEQ_BIT, flags); ip_tunnel_flags_and(flags, flags, key->tun_flags); tun_hlen = gre_calc_hlen(flags); if (skb_cow_head(skb, dev->needed_headroom ?: tun_hlen + tunnel->encap_hlen)) return -ENOMEM; gre_build_header(skb, tun_hlen, flags, protocol, tunnel_id_to_key32(tun_info->key.tun_id), test_bit(IP_TUNNEL_SEQ_BIT, flags) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); } else { if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen)) return -ENOMEM; ip_tunnel_flags_copy(flags, tunnel->parms.o_flags); gre_build_header(skb, tunnel->tun_hlen, flags, protocol, tunnel->parms.o_key, test_bit(IP_TUNNEL_SEQ_BIT, flags) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); } return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu, NEXTHDR_GRE); } static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int encap_limit = -1; struct flowi6 fl6; __u8 dsfield = 0; __u32 mtu; int err; memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); if (!t->parms.collect_md) prepare_ip6gre_xmit_ipv4(skb, dev, &fl6, &dsfield, &encap_limit); err = gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, t->parms.o_flags)); if (err) return -1; err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, skb->protocol); if (err != 0) { /* XXX: send ICMP error even if DF is not set. 
*/ if (err == -EMSGSIZE) icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); return -1; } return 0; } static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct ipv6hdr *ipv6h = ipv6_hdr(skb); int encap_limit = -1; struct flowi6 fl6; __u8 dsfield = 0; __u32 mtu; int err; if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr)) return -1; if (!t->parms.collect_md && prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit)) return -1; if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, t->parms.o_flags))) return -1; err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, skb->protocol); if (err != 0) { if (err == -EMSGSIZE) icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); return -1; } return 0; } static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int encap_limit = -1; struct flowi6 fl6; __u8 dsfield = 0; __u32 mtu; int err; if (!t->parms.collect_md && prepare_ip6gre_xmit_other(skb, dev, &fl6, &dsfield, &encap_limit)) return -1; err = gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, t->parms.o_flags)); if (err) return err; err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, skb->protocol); return err; } static netdev_tx_t ip6gre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); __be16 payload_protocol; int ret; if (!pskb_inet_may_pull(skb)) goto tx_err; if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr)) goto tx_err; payload_protocol = skb_protocol(skb, true); switch (payload_protocol) { case htons(ETH_P_IP): ret = ip6gre_xmit_ipv4(skb, dev); break; case htons(ETH_P_IPV6): ret = ip6gre_xmit_ipv6(skb, dev); break; default: ret = ip6gre_xmit_other(skb, dev); break; } if (ret < 0) goto tx_err; return NETDEV_TX_OK; tx_err: if (!t->parms.collect_md || !IS_ERR(skb_tunnel_info_txcheck(skb))) DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel_info *tun_info = NULL; struct ip6_tnl *t = netdev_priv(dev); struct dst_entry *dst = skb_dst(skb); IP_TUNNEL_DECLARE_FLAGS(flags) = { }; bool truncate = false; int encap_limit = -1; __u8 dsfield = false; struct flowi6 fl6; int err = -EINVAL; __be16 proto; __u32 mtu; int nhoff; if (!pskb_inet_may_pull(skb)) goto tx_err; if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr)) goto tx_err; if (gre_handle_offloads(skb, false)) goto tx_err; if (skb->len > dev->mtu + dev->hard_header_len) { if (pskb_trim(skb, dev->mtu + dev->hard_header_len)) goto tx_err; truncate = true; } nhoff = skb_network_offset(skb); if (skb->protocol == htons(ETH_P_IP) && (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff)) truncate = true; if (skb->protocol == htons(ETH_P_IPV6)) { int thoff; if (skb_transport_header_was_set(skb)) thoff = skb_transport_offset(skb); else thoff = nhoff + sizeof(struct ipv6hdr); if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff) truncate = true; } if (skb_cow_head(skb, dev->needed_headroom ?: t->hlen)) goto tx_err; __clear_bit(IP_TUNNEL_KEY_BIT, t->parms.o_flags); IPCB(skb)->flags = 0; /* For collect_md mode, derive fl6 from the tunnel key, * for native mode, call prepare_ip6gre_xmit_{ipv4,ipv6}. 
*/ if (t->parms.collect_md) { const struct ip_tunnel_key *key; struct erspan_metadata *md; __be32 tun_id; tun_info = skb_tunnel_info_txcheck(skb); if (IS_ERR(tun_info) || unlikely(ip_tunnel_info_af(tun_info) != AF_INET6)) goto tx_err; key = &tun_info->key; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_GRE; fl6.daddr = key->u.ipv6.dst; fl6.flowlabel = key->label; fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); fl6.fl6_gre_key = tunnel_id_to_key32(key->tun_id); dsfield = key->tos; if (!test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_info->key.tun_flags)) goto tx_err; if (tun_info->options_len < sizeof(*md)) goto tx_err; md = ip_tunnel_info_opts(tun_info); tun_id = tunnel_id_to_key32(key->tun_id); if (md->version == 1) { erspan_build_header(skb, ntohl(tun_id), ntohl(md->u.index), truncate, false); proto = htons(ETH_P_ERSPAN); } else if (md->version == 2) { erspan_build_header_v2(skb, ntohl(tun_id), md->u.md2.dir, get_hwid(&md->u.md2), truncate, false); proto = htons(ETH_P_ERSPAN2); } else { goto tx_err; } } else { switch (skb->protocol) { case htons(ETH_P_IP): memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); prepare_ip6gre_xmit_ipv4(skb, dev, &fl6, &dsfield, &encap_limit); break; case htons(ETH_P_IPV6): if (ipv6_addr_equal(&t->parms.raddr, &ipv6_hdr(skb)->saddr)) goto tx_err; if (prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit)) goto tx_err; break; default: memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); break; } if (t->parms.erspan_ver == 1) { erspan_build_header(skb, ntohl(t->parms.o_key), t->parms.index, truncate, false); proto = htons(ETH_P_ERSPAN); } else if (t->parms.erspan_ver == 2) { erspan_build_header_v2(skb, ntohl(t->parms.o_key), t->parms.dir, t->parms.hwid, truncate, false); proto = htons(ETH_P_ERSPAN2); } else { goto tx_err; } fl6.daddr = t->parms.raddr; } /* Push GRE header. */ __set_bit(IP_TUNNEL_SEQ_BIT, flags); gre_build_header(skb, 8, flags, proto, 0, htonl(atomic_fetch_inc(&t->o_seqno))); /* TooBig packet may have updated dst->dev's mtu */ if (!t->parms.collect_md && dst) { mtu = READ_ONCE(dst_dev(dst)->mtu); if (dst6_mtu(dst) > mtu) dst->ops->update_pmtu(dst, NULL, skb, mtu, false); } err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, NEXTHDR_GRE); if (err != 0) { /* XXX: send ICMP error even if DF is not set. 
*/ if (err == -EMSGSIZE) { if (skb->protocol == htons(ETH_P_IP)) icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); else icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); } goto tx_err; } return NETDEV_TX_OK; tx_err: if (!IS_ERR(tun_info)) DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } static void ip6gre_tnl_link_config_common(struct ip6_tnl *t) { struct net_device *dev = t->dev; struct __ip6_tnl_parm *p = &t->parms; struct flowi6 *fl6 = &t->fl.u.ip6; if (dev->type != ARPHRD_ETHER) { __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); } /* Set up flowi template */ fl6->saddr = p->laddr; fl6->daddr = p->raddr; fl6->flowi6_oif = p->link; fl6->flowlabel = 0; fl6->flowi6_proto = IPPROTO_GRE; fl6->fl6_gre_key = t->parms.o_key; if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS)) fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo; if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL)) fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo; p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV|IP6_TNL_F_CAP_PER_PACKET); p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr); if (p->flags&IP6_TNL_F_CAP_XMIT && p->flags&IP6_TNL_F_CAP_RCV && dev->type != ARPHRD_ETHER) dev->flags |= IFF_POINTOPOINT; else dev->flags &= ~IFF_POINTOPOINT; } static void ip6gre_tnl_link_config_route(struct ip6_tnl *t, int set_mtu, int t_hlen) { const struct __ip6_tnl_parm *p = &t->parms; struct net_device *dev = t->dev; if (p->flags & IP6_TNL_F_CAP_XMIT) { int strict = (ipv6_addr_type(&p->raddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)); struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, p->link, NULL, strict); if (!rt) return; if (rt->dst.dev) { unsigned short dst_len = rt->dst.dev->hard_header_len + t_hlen; if (t->dev->header_ops) dev->hard_header_len = dst_len; else dev->needed_headroom = dst_len; if (set_mtu) { int mtu = rt->dst.dev->mtu - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) mtu -= 8; if (dev->type == ARPHRD_ETHER) mtu -= ETH_HLEN; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; WRITE_ONCE(dev->mtu, mtu); } } ip6_rt_put(rt); } } static int ip6gre_calc_hlen(struct ip6_tnl *tunnel) { int t_hlen; tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; t_hlen = tunnel->hlen + sizeof(struct ipv6hdr); if (tunnel->dev->header_ops) tunnel->dev->hard_header_len = LL_MAX_HEADER + t_hlen; else tunnel->dev->needed_headroom = LL_MAX_HEADER + t_hlen; return t_hlen; } static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) { ip6gre_tnl_link_config_common(t); ip6gre_tnl_link_config_route(t, set_mtu, ip6gre_calc_hlen(t)); } static void ip6gre_tnl_copy_tnl_parm(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) { t->parms.laddr = p->laddr; t->parms.raddr = p->raddr; t->parms.flags = p->flags; t->parms.hop_limit = p->hop_limit; t->parms.encap_limit = p->encap_limit; t->parms.flowinfo = p->flowinfo; t->parms.link = p->link; t->parms.proto = p->proto; t->parms.i_key = p->i_key; t->parms.o_key = p->o_key; ip_tunnel_flags_copy(t->parms.i_flags, p->i_flags); ip_tunnel_flags_copy(t->parms.o_flags, p->o_flags); t->parms.fwmark = p->fwmark; t->parms.erspan_ver = p->erspan_ver; t->parms.index = p->index; t->parms.dir = p->dir; t->parms.hwid = p->hwid; dst_cache_reset(&t->dst_cache); } static int ip6gre_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p, int set_mtu) { ip6gre_tnl_copy_tnl_parm(t, p); ip6gre_tnl_link_config(t, 
set_mtu); return 0; } static void ip6gre_tnl_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm2 *u) { p->laddr = u->laddr; p->raddr = u->raddr; p->flags = u->flags; p->hop_limit = u->hop_limit; p->encap_limit = u->encap_limit; p->flowinfo = u->flowinfo; p->link = u->link; p->i_key = u->i_key; p->o_key = u->o_key; gre_flags_to_tnl_flags(p->i_flags, u->i_flags); gre_flags_to_tnl_flags(p->o_flags, u->o_flags); memcpy(p->name, u->name, sizeof(u->name)); } static void ip6gre_tnl_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p) { u->proto = IPPROTO_GRE; u->laddr = p->laddr; u->raddr = p->raddr; u->flags = p->flags; u->hop_limit = p->hop_limit; u->encap_limit = p->encap_limit; u->flowinfo = p->flowinfo; u->link = p->link; u->i_key = p->i_key; u->o_key = p->o_key; u->i_flags = gre_tnl_flags_to_gre_flags(p->i_flags); u->o_flags = gre_tnl_flags_to_gre_flags(p->o_flags); memcpy(u->name, p->name, sizeof(u->name)); } static int ip6gre_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { int err = 0; struct ip6_tnl_parm2 p; struct __ip6_tnl_parm p1; struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); memset(&p1, 0, sizeof(p1)); switch (cmd) { case SIOCGETTUNNEL: if (dev == ign->fb_tunnel_dev) { if (copy_from_user(&p, data, sizeof(p))) { err = -EFAULT; break; } ip6gre_tnl_parm_from_user(&p1, &p); t = ip6gre_tunnel_locate(net, &p1, 0); if (!t) t = netdev_priv(dev); } memset(&p, 0, sizeof(p)); ip6gre_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; break; case SIOCADDTUNNEL: case SIOCCHGTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto done; err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) goto done; err = -EINVAL; if ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)) goto done; if (!(p.i_flags&GRE_KEY)) p.i_key = 0; if (!(p.o_flags&GRE_KEY)) p.o_key = 0; ip6gre_tnl_parm_from_user(&p1, &p); t = ip6gre_tunnel_locate(net, &p1, cmd == SIOCADDTUNNEL); if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { if (t) { if (t->dev != dev) { err = -EEXIST; break; } } else { t = netdev_priv(dev); ip6gre_tunnel_unlink(ign, t); synchronize_net(); ip6gre_tnl_change(t, &p1, 1); ip6gre_tunnel_link(ign, t); netdev_state_change(dev); } } if (t) { err = 0; memset(&p, 0, sizeof(p)); ip6gre_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; } else err = (cmd == SIOCADDTUNNEL ? 
-ENOBUFS : -ENOENT); break; case SIOCDELTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto done; if (dev == ign->fb_tunnel_dev) { err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) goto done; err = -ENOENT; ip6gre_tnl_parm_from_user(&p1, &p); t = ip6gre_tunnel_locate(net, &p1, 0); if (!t) goto done; err = -EPERM; if (t == netdev_priv(ign->fb_tunnel_dev)) goto done; dev = t->dev; } unregister_netdevice(dev); err = 0; break; default: err = -EINVAL; } done: return err; } static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { struct ip6_tnl *t = netdev_priv(dev); struct ipv6hdr *ipv6h; int needed; __be16 *p; needed = t->hlen + sizeof(*ipv6h); if (skb_headroom(skb) < needed && pskb_expand_head(skb, HH_DATA_ALIGN(needed - skb_headroom(skb)), 0, GFP_ATOMIC)) return -needed; ipv6h = skb_push(skb, needed); ip6_flow_hdr(ipv6h, 0, ip6_make_flowlabel(dev_net(dev), skb, t->fl.u.ip6.flowlabel, true, &t->fl.u.ip6)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = NEXTHDR_GRE; ipv6h->saddr = t->parms.laddr; ipv6h->daddr = t->parms.raddr; p = (__be16 *)(ipv6h + 1); p[0] = ip_tunnel_flags_to_be16(t->parms.o_flags); p[1] = htons(type); /* * Set the source hardware address. */ if (saddr) memcpy(&ipv6h->saddr, saddr, sizeof(struct in6_addr)); if (daddr) memcpy(&ipv6h->daddr, daddr, sizeof(struct in6_addr)); if (!ipv6_addr_any(&ipv6h->daddr)) return t->hlen; return -t->hlen; } static const struct header_ops ip6gre_header_ops = { .create = ip6gre_header, }; static const struct net_device_ops ip6gre_netdev_ops = { .ndo_init = ip6gre_tunnel_init, .ndo_uninit = ip6gre_tunnel_uninit, .ndo_start_xmit = ip6gre_tunnel_xmit, .ndo_siocdevprivate = ip6gre_tunnel_siocdevprivate, .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_iflink = ip6_tnl_get_iflink, }; static void ip6gre_dev_free(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); gro_cells_destroy(&t->gro_cells); dst_cache_destroy(&t->dst_cache); } static void ip6gre_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ip6gre_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->type = ARPHRD_IP6GRE; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); netif_keep_dst(dev); /* This perm addr will be used as interface identifier by IPv6 */ dev->addr_assign_type = NET_ADDR_RANDOM; eth_random_addr(dev->perm_addr); } #define GRE6_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ NETIF_F_HW_CSUM) static void ip6gre_tnl_init_features(struct net_device *dev) { struct ip6_tnl *nt = netdev_priv(dev); dev->features |= GRE6_FEATURES; dev->hw_features |= GRE6_FEATURES; /* TCP offload with GRE SEQ is not supported, nor can we support 2 * levels of outer headers requiring an update. 
*/ if (test_bit(IP_TUNNEL_SEQ_BIT, nt->parms.o_flags)) return; if (test_bit(IP_TUNNEL_CSUM_BIT, nt->parms.o_flags) && nt->encap.type != TUNNEL_ENCAP_NONE) return; dev->features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_GSO_SOFTWARE; dev->lltx = true; } static int ip6gre_tunnel_init_common(struct net_device *dev) { struct ip6_tnl *tunnel; int ret; int t_hlen; tunnel = netdev_priv(dev); tunnel->dev = dev; strscpy(tunnel->parms.name, dev->name); ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (ret) return ret; ret = gro_cells_init(&tunnel->gro_cells, dev); if (ret) goto cleanup_dst_cache_init; t_hlen = ip6gre_calc_hlen(tunnel); dev->mtu = ETH_DATA_LEN - t_hlen; if (dev->type == ARPHRD_ETHER) dev->mtu -= ETH_HLEN; if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; if (tunnel->parms.collect_md) { netif_keep_dst(dev); } ip6gre_tnl_init_features(dev); netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL); netdev_lockdep_set_classes(dev); return 0; cleanup_dst_cache_init: dst_cache_destroy(&tunnel->dst_cache); return ret; } static int ip6gre_tunnel_init(struct net_device *dev) { struct ip6_tnl *tunnel; int ret; ret = ip6gre_tunnel_init_common(dev); if (ret) return ret; tunnel = netdev_priv(dev); if (tunnel->parms.collect_md) return 0; __dev_addr_set(dev, &tunnel->parms.laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &tunnel->parms.raddr, sizeof(struct in6_addr)); if (ipv6_addr_any(&tunnel->parms.raddr)) dev->header_ops = &ip6gre_header_ops; return 0; } static void ip6gre_fb_tunnel_init(struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); tunnel->dev = dev; tunnel->net = dev_net(dev); strscpy(tunnel->parms.name, dev->name); tunnel->hlen = sizeof(struct ipv6hdr) + 4; } static struct inet6_protocol ip6gre_protocol __read_mostly = { .handler = gre_rcv, .err_handler = ip6gre_err, .flags = INET6_PROTO_FINAL, }; static void __net_exit ip6gre_exit_rtnl_net(struct net *net, struct list_head *head) { struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); struct net_device *dev, *aux; int prio; for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &ip6gre_link_ops || dev->rtnl_link_ops == &ip6gre_tap_ops || dev->rtnl_link_ops == &ip6erspan_tap_ops) unregister_netdevice_queue(dev, head); for (prio = 0; prio < 4; prio++) { int h; for (h = 0; h < IP6_GRE_HASH_SIZE; h++) { struct ip6_tnl *t; t = rtnl_net_dereference(net, ign->tunnels[prio][h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, head); t = rtnl_net_dereference(net, t->next); } } } } static int __net_init ip6gre_init_net(struct net *net) { struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); struct net_device *ndev; int err; if (!net_has_fallback_tunnels(net)) return 0; ndev = alloc_netdev(sizeof(struct ip6_tnl), "ip6gre0", NET_NAME_UNKNOWN, ip6gre_tunnel_setup); if (!ndev) { err = -ENOMEM; goto err_alloc_dev; } ign->fb_tunnel_dev = ndev; dev_net_set(ign->fb_tunnel_dev, net); /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. 
*/ ign->fb_tunnel_dev->netns_immutable = true; ip6gre_fb_tunnel_init(ign->fb_tunnel_dev); ign->fb_tunnel_dev->rtnl_link_ops = &ip6gre_link_ops; err = register_netdev(ign->fb_tunnel_dev); if (err) goto err_reg_dev; rcu_assign_pointer(ign->tunnels_wc[0], netdev_priv(ign->fb_tunnel_dev)); return 0; err_reg_dev: free_netdev(ndev); err_alloc_dev: return err; } static struct pernet_operations ip6gre_net_ops = { .init = ip6gre_init_net, .exit_rtnl = ip6gre_exit_rtnl_net, .id = &ip6gre_net_id, .size = sizeof(struct ip6gre_net), }; static int ip6gre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { __be16 flags; if (!data) return 0; flags = 0; if (data[IFLA_GRE_IFLAGS]) flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); if (data[IFLA_GRE_OFLAGS]) flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); if (flags & (GRE_VERSION|GRE_ROUTING)) return -EINVAL; return 0; } static int ip6gre_tap_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct in6_addr daddr; if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) return -EADDRNOTAVAIL; } if (!data) goto out; if (data[IFLA_GRE_REMOTE]) { daddr = nla_get_in6_addr(data[IFLA_GRE_REMOTE]); if (ipv6_addr_any(&daddr)) return -EINVAL; } out: return ip6gre_tunnel_validate(tb, data, extack); } static int ip6erspan_tap_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { __be16 flags = 0; int ret, ver = 0; if (!data) return 0; ret = ip6gre_tap_validate(tb, data, extack); if (ret) return ret; /* ERSPAN should only have GRE sequence and key flag */ if (data[IFLA_GRE_OFLAGS]) flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); if (data[IFLA_GRE_IFLAGS]) flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); if (!data[IFLA_GRE_COLLECT_METADATA] && flags != (GRE_SEQ | GRE_KEY)) return -EINVAL; /* ERSPAN Session ID only has 10-bit. Since we reuse * 32-bit key field as ID, check it's range. 
*/ if (data[IFLA_GRE_IKEY] && (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK)) return -EINVAL; if (data[IFLA_GRE_OKEY] && (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK)) return -EINVAL; if (data[IFLA_GRE_ERSPAN_VER]) { ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); if (ver != 1 && ver != 2) return -EINVAL; } if (ver == 1) { if (data[IFLA_GRE_ERSPAN_INDEX]) { u32 index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); if (index & ~INDEX_MASK) return -EINVAL; } } else if (ver == 2) { if (data[IFLA_GRE_ERSPAN_DIR]) { u16 dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]); if (dir & ~(DIR_MASK >> DIR_OFFSET)) return -EINVAL; } if (data[IFLA_GRE_ERSPAN_HWID]) { u16 hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]); if (hwid & ~(HWID_MASK >> HWID_OFFSET)) return -EINVAL; } } return 0; } static void ip6erspan_set_version(struct nlattr *data[], struct __ip6_tnl_parm *parms) { if (!data) return; parms->erspan_ver = 1; if (data[IFLA_GRE_ERSPAN_VER]) parms->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); if (parms->erspan_ver == 1) { if (data[IFLA_GRE_ERSPAN_INDEX]) parms->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); } else if (parms->erspan_ver == 2) { if (data[IFLA_GRE_ERSPAN_DIR]) parms->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]); if (data[IFLA_GRE_ERSPAN_HWID]) parms->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]); } } static void ip6gre_netlink_parms(struct nlattr *data[], struct __ip6_tnl_parm *parms) { memset(parms, 0, sizeof(*parms)); if (!data) return; if (data[IFLA_GRE_LINK]) parms->link = nla_get_u32(data[IFLA_GRE_LINK]); if (data[IFLA_GRE_IFLAGS]) gre_flags_to_tnl_flags(parms->i_flags, nla_get_be16(data[IFLA_GRE_IFLAGS])); if (data[IFLA_GRE_OFLAGS]) gre_flags_to_tnl_flags(parms->o_flags, nla_get_be16(data[IFLA_GRE_OFLAGS])); if (data[IFLA_GRE_IKEY]) parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); if (data[IFLA_GRE_OKEY]) parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]); if (data[IFLA_GRE_LOCAL]) parms->laddr = nla_get_in6_addr(data[IFLA_GRE_LOCAL]); if (data[IFLA_GRE_REMOTE]) parms->raddr = nla_get_in6_addr(data[IFLA_GRE_REMOTE]); if (data[IFLA_GRE_TTL]) parms->hop_limit = nla_get_u8(data[IFLA_GRE_TTL]); if (data[IFLA_GRE_ENCAP_LIMIT]) parms->encap_limit = nla_get_u8(data[IFLA_GRE_ENCAP_LIMIT]); if (data[IFLA_GRE_FLOWINFO]) parms->flowinfo = nla_get_be32(data[IFLA_GRE_FLOWINFO]); if (data[IFLA_GRE_FLAGS]) parms->flags = nla_get_u32(data[IFLA_GRE_FLAGS]); if (data[IFLA_GRE_FWMARK]) parms->fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]); if (data[IFLA_GRE_COLLECT_METADATA]) parms->collect_md = true; } static int ip6gre_tap_init(struct net_device *dev) { int ret; ret = ip6gre_tunnel_init_common(dev); if (ret) return ret; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; return 0; } static const struct net_device_ops ip6gre_tap_netdev_ops = { .ndo_init = ip6gre_tap_init, .ndo_uninit = ip6gre_tunnel_uninit, .ndo_start_xmit = ip6gre_tunnel_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_iflink = ip6_tnl_get_iflink, }; static int ip6erspan_calc_hlen(struct ip6_tnl *tunnel) { int t_hlen; tunnel->tun_hlen = 8; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + erspan_hdr_len(tunnel->parms.erspan_ver); t_hlen = tunnel->hlen + sizeof(struct ipv6hdr); tunnel->dev->needed_headroom = LL_MAX_HEADER + t_hlen; return t_hlen; } static int ip6erspan_tap_init(struct net_device *dev) { struct ip6_tnl *tunnel; int t_hlen; int ret; tunnel = netdev_priv(dev); tunnel->dev = dev; strscpy(tunnel->parms.name, dev->name); ret = 
dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (ret) return ret; ret = gro_cells_init(&tunnel->gro_cells, dev); if (ret) goto cleanup_dst_cache_init; t_hlen = ip6erspan_calc_hlen(tunnel); dev->mtu = ETH_DATA_LEN - t_hlen; if (dev->type == ARPHRD_ETHER) dev->mtu -= ETH_HLEN; if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; ip6erspan_tnl_link_config(tunnel, 1); netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL); netdev_lockdep_set_classes(dev); return 0; cleanup_dst_cache_init: dst_cache_destroy(&tunnel->dst_cache); return ret; } static const struct net_device_ops ip6erspan_netdev_ops = { .ndo_init = ip6erspan_tap_init, .ndo_uninit = ip6erspan_tunnel_uninit, .ndo_start_xmit = ip6erspan_tunnel_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_iflink = ip6_tnl_get_iflink, }; static void ip6gre_tap_setup(struct net_device *dev) { ether_setup(dev); dev->max_mtu = 0; dev->netdev_ops = &ip6gre_tap_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netif_keep_dst(dev); } static bool ip6gre_netlink_encap_parms(struct nlattr *data[], struct ip_tunnel_encap *ipencap) { bool ret = false; memset(ipencap, 0, sizeof(*ipencap)); if (!data) return ret; if (data[IFLA_GRE_ENCAP_TYPE]) { ret = true; ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]); } if (data[IFLA_GRE_ENCAP_FLAGS]) { ret = true; ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]); } if (data[IFLA_GRE_ENCAP_SPORT]) { ret = true; ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]); } if (data[IFLA_GRE_ENCAP_DPORT]) { ret = true; ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]); } return ret; } static int ip6gre_newlink_common(struct net *link_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip6_tnl *nt; struct ip_tunnel_encap ipencap; int err; nt = netdev_priv(dev); if (ip6gre_netlink_encap_parms(data, &ipencap)) { int err = ip6_tnl_encap_setup(nt, &ipencap); if (err < 0) return err; } if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) eth_hw_addr_random(dev); nt->dev = dev; nt->net = link_net; err = register_netdevice(dev); if (err) goto out; if (tb[IFLA_MTU]) ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU])); out: return err; } static int ip6gre_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net *net = params->link_net ? 
: dev_net(dev); struct ip6_tnl *nt = netdev_priv(dev); struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct ip6gre_net *ign; int err; ip6gre_netlink_parms(data, &nt->parms); ign = net_generic(net, ip6gre_net_id); if (nt->parms.collect_md) { if (rtnl_dereference(ign->collect_md_tun)) return -EEXIST; } else { if (ip6gre_tunnel_find(net, &nt->parms, dev->type)) return -EEXIST; } err = ip6gre_newlink_common(net, dev, tb, data, extack); if (!err) { ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); ip6gre_tunnel_link_md(ign, nt); ip6gre_tunnel_link(net_generic(net, ip6gre_net_id), nt); } return err; } static struct ip6_tnl * ip6gre_changelink_common(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct __ip6_tnl_parm *p_p, struct netlink_ext_ack *extack) { struct ip6_tnl *t, *nt = netdev_priv(dev); struct net *net = nt->net; struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); struct ip_tunnel_encap ipencap; if (dev == ign->fb_tunnel_dev) return ERR_PTR(-EINVAL); if (ip6gre_netlink_encap_parms(data, &ipencap)) { int err = ip6_tnl_encap_setup(nt, &ipencap); if (err < 0) return ERR_PTR(err); } ip6gre_netlink_parms(data, p_p); t = ip6gre_tunnel_locate(net, p_p, 0); if (t) { if (t->dev != dev) return ERR_PTR(-EEXIST); } else { t = nt; } return t; } static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip6_tnl *t = netdev_priv(dev); struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id); struct __ip6_tnl_parm p; t = ip6gre_changelink_common(dev, tb, data, &p, extack); if (IS_ERR(t)) return PTR_ERR(t); ip6gre_tunnel_unlink_md(ign, t); ip6gre_tunnel_unlink(ign, t); ip6gre_tnl_change(t, &p, !tb[IFLA_MTU]); ip6gre_tunnel_link_md(ign, t); ip6gre_tunnel_link(ign, t); return 0; } static void ip6gre_dellink(struct net_device *dev, struct list_head *head) { struct net *net = dev_net(dev); struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); if (dev != ign->fb_tunnel_dev) unregister_netdevice_queue(dev, head); } static size_t ip6gre_get_size(const struct net_device *dev) { return /* IFLA_GRE_LINK */ nla_total_size(4) + /* IFLA_GRE_IFLAGS */ nla_total_size(2) + /* IFLA_GRE_OFLAGS */ nla_total_size(2) + /* IFLA_GRE_IKEY */ nla_total_size(4) + /* IFLA_GRE_OKEY */ nla_total_size(4) + /* IFLA_GRE_LOCAL */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_GRE_REMOTE */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_GRE_TTL */ nla_total_size(1) + /* IFLA_GRE_ENCAP_LIMIT */ nla_total_size(1) + /* IFLA_GRE_FLOWINFO */ nla_total_size(4) + /* IFLA_GRE_FLAGS */ nla_total_size(4) + /* IFLA_GRE_ENCAP_TYPE */ nla_total_size(2) + /* IFLA_GRE_ENCAP_FLAGS */ nla_total_size(2) + /* IFLA_GRE_ENCAP_SPORT */ nla_total_size(2) + /* IFLA_GRE_ENCAP_DPORT */ nla_total_size(2) + /* IFLA_GRE_COLLECT_METADATA */ nla_total_size(0) + /* IFLA_GRE_FWMARK */ nla_total_size(4) + /* IFLA_GRE_ERSPAN_INDEX */ nla_total_size(4) + 0; } static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct __ip6_tnl_parm *p = &t->parms; IP_TUNNEL_DECLARE_FLAGS(o_flags); ip_tunnel_flags_copy(o_flags, p->o_flags); if (p->erspan_ver == 1 || p->erspan_ver == 2) { if (!p->collect_md) __set_bit(IP_TUNNEL_KEY_BIT, o_flags); if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, p->erspan_ver)) goto nla_put_failure; if (p->erspan_ver == 1) { if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index)) goto nla_put_failure; } else { if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, p->dir)) goto 
nla_put_failure; if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, p->hwid)) goto nla_put_failure; } } if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || nla_put_be16(skb, IFLA_GRE_IFLAGS, gre_tnl_flags_to_gre_flags(p->i_flags)) || nla_put_be16(skb, IFLA_GRE_OFLAGS, gre_tnl_flags_to_gre_flags(o_flags)) || nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || nla_put_in6_addr(skb, IFLA_GRE_LOCAL, &p->laddr) || nla_put_in6_addr(skb, IFLA_GRE_REMOTE, &p->raddr) || nla_put_u8(skb, IFLA_GRE_TTL, p->hop_limit) || nla_put_u8(skb, IFLA_GRE_ENCAP_LIMIT, p->encap_limit) || nla_put_be32(skb, IFLA_GRE_FLOWINFO, p->flowinfo) || nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags) || nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, t->encap.type) || nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT, t->encap.sport) || nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT, t->encap.dport) || nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS, t->encap.flags)) goto nla_put_failure; if (p->collect_md) { if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA)) goto nla_put_failure; } return 0; nla_put_failure: return -EMSGSIZE; } static const struct nla_policy ip6gre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_LINK] = { .type = NLA_U32 }, [IFLA_GRE_IFLAGS] = { .type = NLA_U16 }, [IFLA_GRE_OFLAGS] = { .type = NLA_U16 }, [IFLA_GRE_IKEY] = { .type = NLA_U32 }, [IFLA_GRE_OKEY] = { .type = NLA_U32 }, [IFLA_GRE_LOCAL] = { .len = sizeof_field(struct ipv6hdr, saddr) }, [IFLA_GRE_REMOTE] = { .len = sizeof_field(struct ipv6hdr, daddr) }, [IFLA_GRE_TTL] = { .type = NLA_U8 }, [IFLA_GRE_ENCAP_LIMIT] = { .type = NLA_U8 }, [IFLA_GRE_FLOWINFO] = { .type = NLA_U32 }, [IFLA_GRE_FLAGS] = { .type = NLA_U32 }, [IFLA_GRE_ENCAP_TYPE] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_GRE_FWMARK] = { .type = NLA_U32 }, [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 }, [IFLA_GRE_ERSPAN_VER] = { .type = NLA_U8 }, [IFLA_GRE_ERSPAN_DIR] = { .type = NLA_U8 }, [IFLA_GRE_ERSPAN_HWID] = { .type = NLA_U16 }, }; static void ip6erspan_tap_setup(struct net_device *dev) { ether_setup(dev); dev->max_mtu = 0; dev->netdev_ops = &ip6erspan_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netif_keep_dst(dev); } static int ip6erspan_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net *net = params->link_net ? 
: dev_net(dev); struct ip6_tnl *nt = netdev_priv(dev); struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct ip6gre_net *ign; int err; ip6gre_netlink_parms(data, &nt->parms); ip6erspan_set_version(data, &nt->parms); ign = net_generic(net, ip6gre_net_id); if (nt->parms.collect_md) { if (rtnl_dereference(ign->collect_md_tun_erspan)) return -EEXIST; } else { if (ip6gre_tunnel_find(net, &nt->parms, dev->type)) return -EEXIST; } err = ip6gre_newlink_common(net, dev, tb, data, extack); if (!err) { ip6erspan_tnl_link_config(nt, !tb[IFLA_MTU]); ip6erspan_tunnel_link_md(ign, nt); ip6gre_tunnel_link(net_generic(net, ip6gre_net_id), nt); } return err; } static void ip6erspan_tnl_link_config(struct ip6_tnl *t, int set_mtu) { ip6gre_tnl_link_config_common(t); ip6gre_tnl_link_config_route(t, set_mtu, ip6erspan_calc_hlen(t)); } static int ip6erspan_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p, int set_mtu) { ip6gre_tnl_copy_tnl_parm(t, p); ip6erspan_tnl_link_config(t, set_mtu); return 0; } static int ip6erspan_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip6gre_net *ign = net_generic(dev_net(dev), ip6gre_net_id); struct __ip6_tnl_parm p; struct ip6_tnl *t; t = ip6gre_changelink_common(dev, tb, data, &p, extack); if (IS_ERR(t)) return PTR_ERR(t); ip6erspan_set_version(data, &p); ip6gre_tunnel_unlink_md(ign, t); ip6gre_tunnel_unlink(ign, t); ip6erspan_tnl_change(t, &p, !tb[IFLA_MTU]); ip6erspan_tunnel_link_md(ign, t); ip6gre_tunnel_link(ign, t); return 0; } static struct rtnl_link_ops ip6gre_link_ops __read_mostly = { .kind = "ip6gre", .maxtype = IFLA_GRE_MAX, .policy = ip6gre_policy, .priv_size = sizeof(struct ip6_tnl), .setup = ip6gre_tunnel_setup, .validate = ip6gre_tunnel_validate, .newlink = ip6gre_newlink, .changelink = ip6gre_changelink, .dellink = ip6gre_dellink, .get_size = ip6gre_get_size, .fill_info = ip6gre_fill_info, .get_link_net = ip6_tnl_get_link_net, }; static struct rtnl_link_ops ip6gre_tap_ops __read_mostly = { .kind = "ip6gretap", .maxtype = IFLA_GRE_MAX, .policy = ip6gre_policy, .priv_size = sizeof(struct ip6_tnl), .setup = ip6gre_tap_setup, .validate = ip6gre_tap_validate, .newlink = ip6gre_newlink, .changelink = ip6gre_changelink, .get_size = ip6gre_get_size, .fill_info = ip6gre_fill_info, .get_link_net = ip6_tnl_get_link_net, }; static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly = { .kind = "ip6erspan", .maxtype = IFLA_GRE_MAX, .policy = ip6gre_policy, .priv_size = sizeof(struct ip6_tnl), .setup = ip6erspan_tap_setup, .validate = ip6erspan_tap_validate, .newlink = ip6erspan_newlink, .changelink = ip6erspan_changelink, .get_size = ip6gre_get_size, .fill_info = ip6gre_fill_info, .get_link_net = ip6_tnl_get_link_net, }; /* * And now the modules code and kernel interface. 
*/ static int __init ip6gre_init(void) { int err; pr_info("GRE over IPv6 tunneling driver\n"); err = register_pernet_device(&ip6gre_net_ops); if (err < 0) return err; err = inet6_add_protocol(&ip6gre_protocol, IPPROTO_GRE); if (err < 0) { pr_info("%s: can't add protocol\n", __func__); goto add_proto_failed; } err = rtnl_link_register(&ip6gre_link_ops); if (err < 0) goto rtnl_link_failed; err = rtnl_link_register(&ip6gre_tap_ops); if (err < 0) goto tap_ops_failed; err = rtnl_link_register(&ip6erspan_tap_ops); if (err < 0) goto erspan_link_failed; out: return err; erspan_link_failed: rtnl_link_unregister(&ip6gre_tap_ops); tap_ops_failed: rtnl_link_unregister(&ip6gre_link_ops); rtnl_link_failed: inet6_del_protocol(&ip6gre_protocol, IPPROTO_GRE); add_proto_failed: unregister_pernet_device(&ip6gre_net_ops); goto out; } static void __exit ip6gre_fini(void) { rtnl_link_unregister(&ip6gre_tap_ops); rtnl_link_unregister(&ip6gre_link_ops); rtnl_link_unregister(&ip6erspan_tap_ops); inet6_del_protocol(&ip6gre_protocol, IPPROTO_GRE); unregister_pernet_device(&ip6gre_net_ops); } module_init(ip6gre_init); module_exit(ip6gre_fini); MODULE_LICENSE("GPL"); MODULE_AUTHOR("D. Kozlov <xeb@mail.ru>"); MODULE_DESCRIPTION("GRE over IPv6 tunneling device"); MODULE_ALIAS_RTNL_LINK("ip6gre"); MODULE_ALIAS_RTNL_LINK("ip6gretap"); MODULE_ALIAS_RTNL_LINK("ip6erspan"); MODULE_ALIAS_NETDEV("ip6gre0");
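The header sizing used throughout ip6_gre.c reduces to simple arithmetic: a GRE header is 4 bytes of flags/protocol plus 4 more bytes for each optional field (checksum, key, sequence number), and the initial device MTU is ETH_DATA_LEN minus that GRE header, the outer IPv6 header, and the optional destination-options header carrying the encapsulation limit. The following standalone userspace sketch is illustrative only, not part of the driver; the helper name and boolean parameters are stand-ins for gre_calc_hlen() operating on the IP_TUNNEL_CSUM_BIT/KEY_BIT/SEQ_BIT flags in parms.o_flags, and the constants mirror their assumed kernel values.

#include <stdbool.h>
#include <stdio.h>

#define GRE_BASE_HLEN	4	/* GRE flags + protocol type */
#define IPV6_HLEN	40	/* outer IPv6 header */
#define ETH_DATA_LEN	1500
#define ETH_HLEN	14
#define ENCAP_LIMIT_OPT	8	/* destination-options header with the tunnel encap limit */

/* Mirrors the flag-to-length rule of gre_calc_hlen(): each optional field costs 4 bytes. */
static int gre_hlen(bool csum, bool key, bool seq)
{
	return GRE_BASE_HLEN + 4 * csum + 4 * key + 4 * seq;
}

int main(void)
{
	/* Example: an ip6gre tunnel configured with only an output key. */
	int tun_hlen = gre_hlen(false, true, false);
	int t_hlen = tun_hlen + IPV6_HLEN;		/* + encap_hlen if FOU/GUE encapsulation is set up */
	int mtu = ETH_DATA_LEN - t_hlen - ENCAP_LIMIT_OPT; /* encap limit not ignored */
	int tap_mtu = mtu - ETH_HLEN;			/* an ip6gretap device also loses the inner Ethernet header */

	printf("GRE header %d, total overhead %d, MTU %d (gretap %d)\n",
	       tun_hlen, t_hlen, mtu, tap_mtu);		/* 8, 48, 1444, 1430 */
	return 0;
}

Compiled with any C compiler, the sketch prints an 8-byte GRE header and an initial MTU of 1444 for a key-only tunnel, matching the subtraction sequence performed in ip6gre_tunnel_init_common().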
// SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/realpath.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/magic.h> #include <linux/proc_fs.h> /** * tomoyo_encode2 - Encode binary string to ascii string. * * @str: String in binary format. * @str_len: Size of @str in byte. * * Returns pointer to @str in ascii format on success, NULL otherwise. * * This function uses kzalloc(), so caller must kfree() if this function * didn't return NULL. */ char *tomoyo_encode2(const char *str, int str_len) { int i; int len = 0; const char *p = str; char *cp; char *cp0; if (!p) return NULL; for (i = 0; i < str_len; i++) { const unsigned char c = p[i]; if (c == '\\') len += 2; else if (c > ' ' && c < 127) len++; else len += 4; } len++; /* Reserve space for appending "/". */ cp = kzalloc(len + 10, GFP_NOFS); if (!cp) return NULL; cp0 = cp; p = str; for (i = 0; i < str_len; i++) { const unsigned char c = p[i]; if (c == '\\') { *cp++ = '\\'; *cp++ = '\\'; } else if (c > ' ' && c < 127) { *cp++ = c; } else { *cp++ = '\\'; *cp++ = (c >> 6) + '0'; *cp++ = ((c >> 3) & 7) + '0'; *cp++ = (c & 7) + '0'; } } return cp0; } /** * tomoyo_encode - Encode binary string to ascii string. * * @str: String in binary format. * * Returns pointer to @str in ascii format on success, NULL otherwise. * * This function uses kzalloc(), so caller must kfree() if this function * didn't return NULL. */ char *tomoyo_encode(const char *str) { return str ? tomoyo_encode2(str, strlen(str)) : NULL; } /** * tomoyo_get_absolute_path - Get the path of a dentry but ignores chroot'ed root. * * @path: Pointer to "struct path". * @buffer: Pointer to buffer to return value in. * @buflen: Sizeof @buffer. * * Returns the buffer on success, an error code otherwise. * * If dentry is a directory, trailing '/' is appended. */ static char *tomoyo_get_absolute_path(const struct path *path, char * const buffer, const int buflen) { char *pos = ERR_PTR(-ENOMEM); if (buflen >= 256) { /* go to whatever namespace root we are under */ pos = d_absolute_path(path, buffer, buflen - 1); if (!IS_ERR(pos) && *pos == '/' && pos[1]) { struct inode *inode = d_backing_inode(path->dentry); if (inode && S_ISDIR(inode->i_mode)) { buffer[buflen - 2] = '/'; buffer[buflen - 1] = '\0'; } } } return pos; } /** * tomoyo_get_dentry_path - Get the path of a dentry.
* * @dentry: Pointer to "struct dentry". * @buffer: Pointer to buffer to return value in. * @buflen: Sizeof @buffer. * * Returns the buffer on success, an error code otherwise. * * If dentry is a directory, trailing '/' is appended. */ static char *tomoyo_get_dentry_path(struct dentry *dentry, char * const buffer, const int buflen) { char *pos = ERR_PTR(-ENOMEM); if (buflen >= 256) { pos = dentry_path_raw(dentry, buffer, buflen - 1); if (!IS_ERR(pos) && *pos == '/' && pos[1]) { struct inode *inode = d_backing_inode(dentry); if (inode && S_ISDIR(inode->i_mode)) { buffer[buflen - 2] = '/'; buffer[buflen - 1] = '\0'; } } } return pos; } /** * tomoyo_get_local_path - Get the path of a dentry. * * @dentry: Pointer to "struct dentry". * @buffer: Pointer to buffer to return value in. * @buflen: Sizeof @buffer. * * Returns the buffer on success, an error code otherwise. */ static char *tomoyo_get_local_path(struct dentry *dentry, char * const buffer, const int buflen) { struct super_block *sb = dentry->d_sb; char *pos = tomoyo_get_dentry_path(dentry, buffer, buflen); if (IS_ERR(pos)) return pos; /* Convert from $PID to self if $PID is current thread. */ if (sb->s_magic == PROC_SUPER_MAGIC && *pos == '/') { char *ep; const pid_t pid = (pid_t) simple_strtoul(pos + 1, &ep, 10); struct pid_namespace *proc_pidns = proc_pid_ns(sb); if (*ep == '/' && pid && pid == task_tgid_nr_ns(current, proc_pidns)) { pos = ep - 5; if (pos < buffer) goto out; memmove(pos, "/self", 5); } goto prepend_filesystem_name; } /* Use filesystem name for unnamed devices. */ if (!MAJOR(sb->s_dev)) goto prepend_filesystem_name; { struct inode *inode = d_backing_inode(sb->s_root); /* * Use filesystem name if filesystem does not support rename() * operation. */ if (!inode->i_op->rename) goto prepend_filesystem_name; } /* Prepend device name. */ { char name[64]; int name_len; const dev_t dev = sb->s_dev; name[sizeof(name) - 1] = '\0'; snprintf(name, sizeof(name) - 1, "dev(%u,%u):", MAJOR(dev), MINOR(dev)); name_len = strlen(name); pos -= name_len; if (pos < buffer) goto out; memmove(pos, name, name_len); return pos; } /* Prepend filesystem name. */ prepend_filesystem_name: { const char *name = sb->s_type->name; const int name_len = strlen(name); pos -= name_len + 1; if (pos < buffer) goto out; memmove(pos, name, name_len); pos[name_len] = ':'; } return pos; out: return ERR_PTR(-ENOMEM); } /** * tomoyo_realpath_from_path - Returns realpath(3) of the given pathname but ignores chroot'ed root. * * @path: Pointer to "struct path". * * Returns the realpath of the given @path on success, NULL otherwise. * * If dentry is a directory, trailing '/' is appended. * Characters out of 0x20 < c < 0x7F range are converted to * \ooo style octal string. * Character \ is converted to \\ string. * * These functions use kzalloc(), so the caller must call kfree() * if these functions didn't return NULL. */ char *tomoyo_realpath_from_path(const struct path *path) { char *buf = NULL; char *name = NULL; unsigned int buf_len = PAGE_SIZE / 2; struct dentry *dentry = path->dentry; struct super_block *sb = dentry->d_sb; while (1) { char *pos; struct inode *inode; buf_len <<= 1; kfree(buf); buf = kmalloc(buf_len, GFP_NOFS); if (!buf) break; /* To make sure that pos is '\0' terminated. */ buf[buf_len - 1] = '\0'; /* For "pipe:[\$]" and "socket:[\$]". 
*/ if (dentry->d_op && dentry->d_op->d_dname) { pos = dentry->d_op->d_dname(dentry, buf, buf_len - 1); goto encode; } inode = d_backing_inode(sb->s_root); /* * Get local name for filesystems without rename() operation */ if ((!inode->i_op->rename && !(sb->s_type->fs_flags & FS_REQUIRES_DEV))) pos = tomoyo_get_local_path(path->dentry, buf, buf_len - 1); /* Get absolute name for the rest. */ else { pos = tomoyo_get_absolute_path(path, buf, buf_len - 1); /* * Fall back to local name if absolute name is not * available. */ if (pos == ERR_PTR(-EINVAL)) pos = tomoyo_get_local_path(path->dentry, buf, buf_len - 1); } encode: if (IS_ERR(pos)) continue; name = tomoyo_encode(pos); break; } kfree(buf); if (!name) tomoyo_warn_oom(__func__); return name; } /** * tomoyo_realpath_nofollow - Get realpath of a pathname. * * @pathname: The pathname to solve. * * Returns the realpath of @pathname on success, NULL otherwise. */ char *tomoyo_realpath_nofollow(const char *pathname) { struct path path; if (pathname && kern_path(pathname, 0, &path) == 0) { char *buf = tomoyo_realpath_from_path(&path); path_put(&path); return buf; } return NULL; }
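tomoyo_encode2() above sizes its output in a first pass, reserving 2 bytes for a backslash, 1 byte for any other printable ASCII character, and 4 bytes for everything else, then fills the buffer in a second pass using three-digit octal escapes. The snippet below is a minimal userspace sketch of that escaping rule only; the function name is illustrative and malloc()/calloc() stand in for kzalloc(..., GFP_NOFS), without the extra slack bytes the kernel version reserves for an appended "/".

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Same rule as tomoyo_encode2(): '\' -> "\\", printable ASCII copied, rest -> "\ooo". */
static char *encode_like_tomoyo(const unsigned char *str, size_t str_len)
{
	size_t i, len = 0;
	char *out, *cp;

	for (i = 0; i < str_len; i++) {
		unsigned char c = str[i];

		if (c == '\\')
			len += 2;
		else if (c > ' ' && c < 127)
			len++;
		else
			len += 4;
	}
	cp = out = calloc(len + 1, 1);
	if (!out)
		return NULL;
	for (i = 0; i < str_len; i++) {
		unsigned char c = str[i];

		if (c == '\\') {
			*cp++ = '\\';
			*cp++ = '\\';
		} else if (c > ' ' && c < 127) {
			*cp++ = c;
		} else {
			*cp++ = '\\';
			*cp++ = '0' + (c >> 6);
			*cp++ = '0' + ((c >> 3) & 7);
			*cp++ = '0' + (c & 7);
		}
	}
	return out;
}

int main(void)
{
	const unsigned char name[] = "/tmp/a b\\c";
	char *enc = encode_like_tomoyo(name, strlen((const char *)name));

	if (!enc)
		return 1;
	printf("%s\n", enc);	/* prints "/tmp/a\040b\\c" */
	free(enc);
	return 0;
}

Running it on the pathname /tmp/a b\c yields /tmp/a\040b\\c, i.e. the space becomes the octal escape \040 and the backslash is doubled, which is the form TOMOYO records in its policy files.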
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the TCP protocol. * * Version: @(#)tcp.h 1.0.2 04/28/93 * * Author: Fred N.
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_TCP_H #define _LINUX_TCP_H #include <linux/skbuff.h> #include <linux/win_minmax.h> #include <net/sock.h> #include <net/inet_connection_sock.h> #include <net/inet_timewait_sock.h> #include <uapi/linux/tcp.h> static inline struct tcphdr *tcp_hdr(const struct sk_buff *skb) { return (struct tcphdr *)skb_transport_header(skb); } static inline unsigned int __tcp_hdrlen(const struct tcphdr *th) { return th->doff * 4; } static inline unsigned int tcp_hdrlen(const struct sk_buff *skb) { return __tcp_hdrlen(tcp_hdr(skb)); } static inline struct tcphdr *inner_tcp_hdr(const struct sk_buff *skb) { return (struct tcphdr *)skb_inner_transport_header(skb); } static inline unsigned int inner_tcp_hdrlen(const struct sk_buff *skb) { return inner_tcp_hdr(skb)->doff * 4; } /** * skb_tcp_all_headers - Returns size of all headers for a TCP packet * @skb: buffer * * Used in TX path, for a packet known to be a TCP one. * * if (skb_is_gso(skb)) { * int hlen = skb_tcp_all_headers(skb); * ... */ static inline int skb_tcp_all_headers(const struct sk_buff *skb) { return skb_transport_offset(skb) + tcp_hdrlen(skb); } /** * skb_inner_tcp_all_headers - Returns size of all headers for an encap TCP packet * @skb: buffer * * Used in TX path, for a packet known to be a TCP one. * * if (skb_is_gso(skb) && skb->encapsulation) { * int hlen = skb_inner_tcp_all_headers(skb); * ... */ static inline int skb_inner_tcp_all_headers(const struct sk_buff *skb) { return skb_inner_transport_offset(skb) + inner_tcp_hdrlen(skb); } static inline unsigned int tcp_optlen(const struct sk_buff *skb) { return (tcp_hdr(skb)->doff - 5) * 4; } /* TCP Fast Open */ #define TCP_FASTOPEN_COOKIE_MIN 4 /* Min Fast Open Cookie size in bytes */ #define TCP_FASTOPEN_COOKIE_MAX 16 /* Max Fast Open Cookie size in bytes */ #define TCP_FASTOPEN_COOKIE_SIZE 8 /* the size employed by this impl. */ /* TCP Fast Open Cookie as stored in memory */ struct tcp_fastopen_cookie { __le64 val[DIV_ROUND_UP(TCP_FASTOPEN_COOKIE_MAX, sizeof(u64))]; s8 len; bool exp; /* In RFC6994 experimental option format */ }; /* This defines a selective acknowledgement block. 
*/ struct tcp_sack_block_wire { __be32 start_seq; __be32 end_seq; }; struct tcp_sack_block { u32 start_seq; u32 end_seq; }; /*These are used to set the sack_ok field in struct tcp_options_received */ #define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */ #define TCP_DSACK_SEEN (1 << 2) /*1 = DSACK was received from peer*/ struct tcp_options_received { /* PAWS/RTTM data */ int ts_recent_stamp;/* Time we stored ts_recent (for aging) */ u32 ts_recent; /* Time stamp to echo next */ u32 rcv_tsval; /* Time stamp value */ u32 rcv_tsecr; /* Time stamp echo reply */ u16 saw_tstamp : 1, /* Saw TIMESTAMP on last packet */ tstamp_ok : 1, /* TIMESTAMP seen on SYN packet */ dsack : 1, /* D-SACK is scheduled */ wscale_ok : 1, /* Wscale seen on SYN packet */ sack_ok : 3, /* SACK seen on SYN packet */ smc_ok : 1, /* SMC seen on SYN packet */ snd_wscale : 4, /* Window scaling received from sender */ rcv_wscale : 4; /* Window scaling to send to receiver */ u8 accecn:6, /* AccECN index in header, 0=no options */ saw_unknown:1, /* Received unknown option */ unused:1; u8 num_sacks; /* Number of SACK blocks */ u16 user_mss; /* mss requested by user in ioctl */ u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ }; static inline void tcp_clear_options(struct tcp_options_received *rx_opt) { rx_opt->tstamp_ok = rx_opt->sack_ok = 0; rx_opt->wscale_ok = rx_opt->snd_wscale = 0; #if IS_ENABLED(CONFIG_SMC) rx_opt->smc_ok = 0; #endif } /* This is the max number of SACKS that we'll generate and process. It's safe * to increase this, although since: * size = TCPOLEN_SACK_BASE_ALIGNED (4) + n * TCPOLEN_SACK_PERBLOCK (8) * only four options will fit in a standard TCP header */ #define TCP_NUM_SACKS 4 struct tcp_request_sock_ops; struct tcp_request_sock { struct inet_request_sock req; const struct tcp_request_sock_ops *af_specific; u64 snt_synack; /* first SYNACK sent time */ bool tfo_listener; bool is_mptcp; bool req_usec_ts; #if IS_ENABLED(CONFIG_MPTCP) bool drop_req; #endif u32 txhash; u32 rcv_isn; u32 snt_isn; u32 ts_off; u32 snt_tsval_first; u32 snt_tsval_last; u32 last_oow_ack_time; /* last SYNACK */ u32 rcv_nxt; /* the ack # by SYNACK. For * FastOpen it's the seq# * after data-in-SYN. */ u8 syn_tos; bool accecn_ok; u8 syn_ect_snt: 2, syn_ect_rcv: 2, accecn_fail_mode:4; u8 saw_accecn_opt :2; #ifdef CONFIG_TCP_AO u8 ao_keyid; u8 ao_rcv_next; bool used_tcp_ao; #endif }; #define tcp_rsk(ptr) container_of_const(ptr, struct tcp_request_sock, req.req) static inline bool tcp_rsk_used_ao(const struct request_sock *req) { #ifndef CONFIG_TCP_AO return false; #else return tcp_rsk(req)->used_tcp_ao; #endif } #define TCP_RMEM_TO_WIN_SCALE 8 struct tcp_sock { /* Cacheline organization can be found documented in * Documentation/networking/net_cachelines/tcp_sock.rst. * Please update the document when adding new fields. */ /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; /* TX read-mostly hotpath cache lines */ __cacheline_group_begin(tcp_sock_read_tx); u32 max_window; /* Maximal window ever seen from peer */ u32 rcv_ssthresh; /* Current window clamp */ u32 reordering; /* Packet reordering metric. 
*/ u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */ u16 gso_segs; /* Max number of segs per GSO packet */ /* from STCP, retrans queue hinting */ struct sk_buff *retransmit_skb_hint; #if defined(CONFIG_TLS_DEVICE) void (*tcp_clean_acked)(struct sock *sk, u32 acked_seq); #endif __cacheline_group_end(tcp_sock_read_tx); /* TXRX read-mostly hotpath cache lines */ __cacheline_group_begin(tcp_sock_read_txrx); u32 tsoffset; /* timestamp offset */ u32 snd_wnd; /* The window we expect to receive */ u32 mss_cache; /* Cached effective mss, not including SACKS */ u32 snd_cwnd; /* Sending congestion window */ u32 prr_out; /* Total number of pkts sent during Recovery. */ u32 lost_out; /* Lost packets */ u32 sacked_out; /* SACK'd packets */ u16 tcp_header_len; /* Bytes of tcp header to send */ u8 scaling_ratio; /* see tcp_win_from_space() */ u8 repair : 1, tcp_usec_ts : 1, /* TSval values in usec */ is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */ recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */ __cacheline_group_end(tcp_sock_read_txrx); /* RX read-mostly hotpath cache lines */ __cacheline_group_begin(tcp_sock_read_rx); u32 copied_seq; /* Head of yet unread data */ u32 snd_wl1; /* Sequence for window update */ u32 tlp_high_seq; /* snd_nxt at the time of TLP */ u32 rttvar_us; /* smoothed mdev_max */ u32 retrans_out; /* Retransmitted packets out */ u16 advmss; /* Advertised MSS */ u16 urg_data; /* Saved octet of OOB data and control flags */ u32 lost; /* Total data packets lost incl. rexmits */ u32 snd_ssthresh; /* Slow start size threshold */ struct minmax rtt_min; /* OOO segments go in this rbtree. Socket lock must be held. */ struct rb_root out_of_order_queue; __cacheline_group_end(tcp_sock_read_rx); /* TX read-write hotpath cache lines */ __cacheline_group_begin(tcp_sock_write_tx) ____cacheline_aligned; u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut * The total number of segments sent. */ u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut * total number of data segments sent. */ u64 bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut * total number of data bytes sent. */ u32 snd_sml; /* Last byte of the most recently transmitted small packet */ u8 chrono_type; /* current chronograph type */ u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ u32 pushed_seq; /* Last pushed seq, required to talk to windows */ u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ u32 mdev_us; /* medium deviation */ u32 rtt_seq; /* sequence number to update rttvar */ u64 tcp_wstamp_ns; /* departure time for next sent data packet */ u64 accecn_opt_tstamp; /* Last AccECN option sent timestamp */ struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */ struct sk_buff *highest_sack; /* skb just after the highest * skb with SACKed bit set * (validity guaranteed only if * sacked_out > 0) */ u8 ecn_flags; /* ECN status bits. */ __cacheline_group_end(tcp_sock_write_tx); /* TXRX read-write hotpath cache lines */ __cacheline_group_begin(tcp_sock_write_txrx); /* * Header prediction flags * 0x5?10 << 16 + snd_wnd in net byte order */ u8 nonagle : 4,/* Disable Nagle algorithm? */ rate_app_limited:1; /* rate_{delivered,interval_us} limited? 
*/ u8 received_ce_pending:4, /* Not yet transmit cnt of received_ce */ accecn_opt_sent_w_dsack:1,/* Sent ACCECN opt in previous ACK w/ D-SACK */ unused2:3; u8 accecn_minlen:2,/* Minimum length of AccECN option sent */ est_ecnfield:2,/* ECN field for AccECN delivered estimates */ accecn_opt_demand:2,/* Demand AccECN option for n next ACKs */ prev_ecnfield:2; /* ECN bits from the previous segment */ __be32 pred_flags; u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ u32 rcv_nxt; /* What we want to receive next */ u32 snd_nxt; /* Next sequence we send */ u32 snd_una; /* First byte we want an ack for */ u32 window_clamp; /* Maximal window to advertise */ u32 srtt_us; /* smoothed round trip time << 3 in usecs */ u32 packets_out; /* Packets which are "in flight" */ u32 snd_up; /* Urgent pointer */ u32 delivered; /* Total data packets delivered incl. rexmits */ u32 delivered_ce; /* Like the above but only ECE marked packets */ u32 received_ce; /* Like the above but for rcvd CE marked pkts */ u32 received_ecn_bytes[3]; /* received byte counters for three ECN * types: INET_ECN_ECT_1, INET_ECN_ECT_0, * and INET_ECN_CE */ u32 app_limited; /* limited until "delivered" reaches this val */ u32 rcv_wnd; /* Current receiver window */ u32 rcv_mwnd_seq; /* Maximum window sequence number (RFC 7323, * section 2.4, receiver requirements) */ u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ /* * Options received (usually on last packet, some only on SYN packets). */ struct tcp_options_received rx_opt; __cacheline_group_end(tcp_sock_write_txrx); /* RX read-write hotpath cache lines */ __cacheline_group_begin(tcp_sock_write_rx) __aligned(8); u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived * sum(delta(rcv_nxt)), or how many bytes * were acked. */ u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn * total number of segments in. */ u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn * total number of data segments in. */ u32 rcv_wup; /* rcv_nxt on last window update sent */ u32 max_packets_out; /* max packets_out in last window */ u32 cwnd_usage_seq; /* right edge of cwnd usage tracking flight */ u32 rate_delivered; /* saved rate sample: packets delivered */ u32 rate_interval_us; /* saved rate sample: time elapsed */ u32 rcv_rtt_last_tsecr; u32 delivered_ecn_bytes[3]; u16 pkts_acked_ewma;/* Pkts acked EWMA for AccECN cep heuristic */ u64 first_tx_mstamp; /* start of window send phase */ u64 delivered_mstamp; /* time we reached "delivered" */ u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked * sum(delta(snd_una)), or how many bytes * were acked. */ struct { u32 rtt_us; u32 seq; u64 time; } rcv_rtt_est; /* Receiver queue space */ struct { int space; u32 seq; u64 time; } rcvq_space; __cacheline_group_end(tcp_sock_write_rx); /* End of Hot Path */ /* * RFC793 variables by their proper names. This means you can * read the code and the spec side by side (and laugh ...) * See RFC793 and RFC1122. The RFC writes these in capitals. 
*/ u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups * total number of DSACK blocks received */ u32 compressed_ack_rcv_nxt; struct list_head tsq_node; /* anchor in tsq_tasklet.head list */ /* Information of the most recently (s)acked skb */ struct tcp_rack { u64 mstamp; /* (Re)sent time of the skb */ u32 rtt_us; /* Associated RTT */ u32 end_seq; /* Ending TCP sequence of the skb */ u32 last_delivered; /* tp->delivered at last reo_wnd adj */ u8 reo_wnd_steps; /* Allowed reordering window */ #define TCP_RACK_RECOVERY_THRESH 16 u8 reo_wnd_persist:5, /* No. of recovery since last adj */ dsack_seen:1, /* Whether DSACK seen after last adj */ advanced:1; /* mstamp advanced since last lost marking */ } rack; u8 compressed_ack; u8 dup_ack_counter:2, tlp_retrans:1, /* TLP is a retransmission */ syn_ect_snt:2, /* AccECN ECT memory, only */ syn_ect_rcv:2; /* ... needed during 3WHS + first seqno */ u8 thin_lto : 1,/* Use linear timeouts for thin streams */ fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ fastopen_client_fail:2, /* reason why fastopen failed */ frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */ u8 repair_queue; u8 save_syn:2, /* Save headers of SYN packet */ syn_data:1, /* SYN includes data */ syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ syn_fastopen_ch:1, /* Active TFO re-enabling probe */ syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ syn_fastopen_child:1; /* created TFO passive child socket */ u8 keepalive_probes; /* num of allowed keep alive probes */ u8 accecn_fail_mode:4, /* AccECN failure handling */ saw_accecn_opt:2; /* An AccECN option was seen */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ /* RTT measurement */ u32 mdev_max_us; /* maximal mdev for the last rtt period */ u32 reord_seen; /* number of data packet reordering events */ /* * Slow start and congestion control (see also Nagle, and Karn & Partridge) */ u32 snd_cwnd_cnt; /* Linear increase counter */ u32 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */ u32 snd_cwnd_used; u32 snd_cwnd_stamp; u32 prior_cwnd; /* cwnd right before starting loss recovery */ u32 prr_delivered; /* Number of newly delivered packets to * receiver in Recovery. */ u32 last_oow_ack_time; /* timestamp of last out-of-window ACK */ struct hrtimer pacing_timer; struct hrtimer compressed_ack_timer; struct sk_buff *ooo_last_skb; /* cache rb_last(out_of_order_queue) */ /* SACKs data, these 2 need to be together (see tcp_options_write) */ struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/ struct tcp_sack_block recv_sack_cache[4]; u32 prior_ssthresh; /* ssthresh saved at recovery start */ u32 high_seq; /* snd_nxt at onset of congestion */ u32 retrans_stamp; /* Timestamp of the last retransmit, * also used in SYN-SENT to remember stamp of * the first SYN. */ u32 undo_marker; /* snd_una upon a new recovery episode. */ int undo_retrans; /* number of undoable retransmissions. */ u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG * while socket was owned by user. */ u64 bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans * Total data bytes retransmitted */ u32 total_retrans; /* Total retransmits for entire connection */ u32 rto_stamp; /* Start time (ms) of last CA_Loss recovery */ u16 total_rto; /* Total number of RTO timeouts, including * SYN/SYN-ACK and recurring timeouts. 
*/ u16 total_rto_recoveries; /* Total number of RTO recoveries, * including any unfinished recovery. */ u32 total_rto_time; /* ms spent in (completed) RTO recoveries. */ u32 urg_seq; /* Seq of received urgent pointer */ unsigned int keepalive_time; /* time before keep alive takes place */ unsigned int keepalive_intvl; /* time interval between keep alive probes */ int linger2; /* Sock_ops bpf program related variables */ #ifdef CONFIG_BPF u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs * values defined in uapi/linux/tcp.h */ u8 bpf_chg_cc_inprogress:1; /* In the middle of * bpf_setsockopt(TCP_CONGESTION), * it is to avoid the bpf_tcp_cc->init() * to recur itself by calling * bpf_setsockopt(TCP_CONGESTION, "itself"). */ #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG) #else #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0 #endif u16 timeout_rehash; /* Timeout-triggered rehash attempts */ u32 rcv_ooopack; /* Received out-of-order packets, for tcpinfo */ /* TCP-specific MTU probe information. */ struct { u32 probe_seq_start; u32 probe_seq_end; } mtu_probe; u32 plb_rehash; /* PLB-triggered rehash attempts */ #if IS_ENABLED(CONFIG_MPTCP) bool is_mptcp; #endif #if IS_ENABLED(CONFIG_SMC) bool syn_smc; /* SYN includes SMC */ bool (*smc_hs_congested)(const struct sock *sk); #endif #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) /* TCP AF-Specific parts; only used by TCP-AO/MD5 Signature support so far */ const struct tcp_sock_af_ops *af_specific; #ifdef CONFIG_TCP_MD5SIG /* TCP MD5 Signature Option information */ struct tcp_md5sig_info __rcu *md5sig_info; #endif #ifdef CONFIG_TCP_AO struct tcp_ao_info __rcu *ao_info; #endif #endif /* TCP fastopen related information */ struct tcp_fastopen_request *fastopen_req; /* fastopen_rsk points to request_sock that resulted in this big * socket. Used to retransmit SYNACKs etc. */ struct request_sock __rcu *fastopen_rsk; struct saved_syn *saved_syn; }; enum tsq_enum { TSQ_THROTTLED, TSQ_QUEUED, TCP_TSQ_DEFERRED, /* tcp_tasklet_func() found socket was owned */ TCP_WRITE_TIMER_DEFERRED, /* tcp_write_timer() found socket was owned */ TCP_DELACK_TIMER_DEFERRED, /* tcp_delack_timer() found socket was owned */ TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call * tcp_v{4|6}_mtu_reduced() */ TCP_ACK_DEFERRED, /* TX pure ack is deferred */ }; enum tsq_flags { TSQF_THROTTLED = BIT(TSQ_THROTTLED), TSQF_QUEUED = BIT(TSQ_QUEUED), TCPF_TSQ_DEFERRED = BIT(TCP_TSQ_DEFERRED), TCPF_WRITE_TIMER_DEFERRED = BIT(TCP_WRITE_TIMER_DEFERRED), TCPF_DELACK_TIMER_DEFERRED = BIT(TCP_DELACK_TIMER_DEFERRED), TCPF_MTU_REDUCED_DEFERRED = BIT(TCP_MTU_REDUCED_DEFERRED), TCPF_ACK_DEFERRED = BIT(TCP_ACK_DEFERRED), }; /* Flags of interest for tcp_release_cb() */ #define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \ TCPF_WRITE_TIMER_DEFERRED | \ TCPF_DELACK_TIMER_DEFERRED | \ TCPF_MTU_REDUCED_DEFERRED | \ TCPF_ACK_DEFERRED) #define tcp_sk(ptr) container_of_const(ptr, struct tcp_sock, inet_conn.icsk_inet.sk) /* Variant of tcp_sk() upgrading a const sock to a read/write tcp socket. * Used in context of (lockless) tcp listeners. 
*/ #define tcp_sk_rw(ptr) container_of(ptr, struct tcp_sock, inet_conn.icsk_inet.sk) struct tcp_timewait_sock { struct inet_timewait_sock tw_sk; #define tw_rcv_nxt tw_sk.__tw_common.skc_tw_rcv_nxt #define tw_snd_nxt tw_sk.__tw_common.skc_tw_snd_nxt u32 tw_rcv_wnd; u32 tw_ts_offset; u32 tw_ts_recent; /* The time we sent the last out-of-window ACK: */ u32 tw_last_oow_ack_time; int tw_ts_recent_stamp; u32 tw_tx_delay; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *tw_md5_key; #endif #ifdef CONFIG_TCP_AO struct tcp_ao_info __rcu *ao_info; #endif }; static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk) { return (struct tcp_timewait_sock *)sk; } static inline bool tcp_passive_fastopen(const struct sock *sk) { return sk->sk_state == TCP_SYN_RECV && rcu_access_pointer(tcp_sk(sk)->fastopen_rsk) != NULL; } static inline void fastopen_queue_tune(struct sock *sk, int backlog) { struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; int somaxconn = READ_ONCE(sock_net(sk)->core.sysctl_somaxconn); WRITE_ONCE(queue->fastopenq.max_qlen, min_t(unsigned int, backlog, somaxconn)); } static inline void tcp_move_syn(struct tcp_sock *tp, struct request_sock *req) { tp->saved_syn = req->saved_syn; req->saved_syn = NULL; } static inline void tcp_saved_syn_free(struct tcp_sock *tp) { kfree(tp->saved_syn); tp->saved_syn = NULL; } static inline u32 tcp_saved_syn_len(const struct saved_syn *saved_syn) { return saved_syn->mac_hdrlen + saved_syn->network_hdrlen + saved_syn->tcp_hdrlen; } struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, const struct sk_buff *orig_skb, const struct sk_buff *ack_skb); static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss) { /* We use READ_ONCE() here because socket might not be locked. * This happens for listeners. */ u16 user_mss = READ_ONCE(tp->rx_opt.user_mss); return (user_mss && user_mss < mss) ? user_mss : mss; } int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen); void __tcp_sock_set_cork(struct sock *sk, bool on); void tcp_sock_set_cork(struct sock *sk, bool on); int tcp_sock_set_keepcnt(struct sock *sk, int val); int tcp_sock_set_keepidle_locked(struct sock *sk, int val); int tcp_sock_set_keepidle(struct sock *sk, int val); int tcp_sock_set_keepintvl(struct sock *sk, int val); void __tcp_sock_set_nodelay(struct sock *sk, bool on); void tcp_sock_set_nodelay(struct sock *sk); void tcp_sock_set_quickack(struct sock *sk, int val); int tcp_sock_set_syncnt(struct sock *sk, int val); int tcp_sock_set_user_timeout(struct sock *sk, int val); int tcp_sock_set_maxseg(struct sock *sk, int val); static inline bool dst_tcp_usec_ts(const struct dst_entry *dst) { return dst_feature(dst, RTAX_FEATURE_TCP_USEC_TS); } #endif /* _LINUX_TCP_H */ |
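The kerneldoc for skb_tcp_all_headers() and skb_inner_tcp_all_headers() above sketches their intended TX-path use; the fragment below expands that snippet into a small self-contained helper. example_tx_tcp_headers() is an illustrative name, not a kernel function.

#include <linux/skbuff.h>
#include <linux/tcp.h>

/* Illustrative only: mirrors the usage shown in the kerneldoc above. */
static int example_tx_tcp_headers(const struct sk_buff *skb)
{
	int hlen = 0;

	if (skb_is_gso(skb)) {
		if (skb->encapsulation)
			hlen = skb_inner_tcp_all_headers(skb);
		else
			hlen = skb_tcp_all_headers(skb);
	}
	return hlen;	/* total header bytes in front of the TCP payload */
}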
912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 | // SPDX-License-Identifier: GPL-2.0-only /* net/core/xdp.c * * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. */ #include <linux/bpf.h> #include <linux/btf.h> #include <linux/btf_ids.h> #include <linux/filter.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/idr.h> #include <linux/rhashtable.h> #include <linux/bug.h> #include <net/page_pool/helpers.h> #include <net/hotdata.h> #include <net/netdev_lock.h> #include <net/xdp.h> #include <net/xdp_priv.h> /* struct xdp_mem_allocator */ #include <trace/events/xdp.h> #include <net/xdp_sock_drv.h> #define REG_STATE_NEW 0x0 #define REG_STATE_REGISTERED 0x1 #define REG_STATE_UNREGISTERED 0x2 #define REG_STATE_UNUSED 0x3 static DEFINE_IDA(mem_id_pool); static DEFINE_MUTEX(mem_id_lock); #define MEM_ID_MAX 0xFFFE #define MEM_ID_MIN 1 static int mem_id_next = MEM_ID_MIN; static bool mem_id_init; /* false */ static struct rhashtable *mem_id_ht; static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed) { const u32 *k = data; const u32 key = *k; BUILD_BUG_ON(sizeof_field(struct xdp_mem_allocator, mem.id) != sizeof(u32)); /* Use cyclic increasing ID as direct hash key */ return key; } static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct xdp_mem_allocator *xa = ptr; u32 mem_id = *(u32 *)arg->key; return xa->mem.id != mem_id; } static const struct rhashtable_params mem_id_rht_params = { .nelem_hint = 64, .head_offset = offsetof(struct xdp_mem_allocator, node), .key_offset = offsetof(struct xdp_mem_allocator, mem.id), .key_len = sizeof_field(struct xdp_mem_allocator, mem.id), .max_size = MEM_ID_MAX, .min_size = 8, .automatic_shrinking = true, .hashfn = xdp_mem_id_hashfn, .obj_cmpfn = xdp_mem_id_cmp, }; static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) { struct xdp_mem_allocator *xa; xa = container_of(rcu, struct xdp_mem_allocator, rcu); /* Allow this ID to be reused */ ida_free(&mem_id_pool, xa->mem.id); kfree(xa); } static void mem_xa_remove(struct xdp_mem_allocator *xa) { trace_mem_disconnect(xa); if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); } static void mem_allocator_disconnect(void *allocator) { struct xdp_mem_allocator *xa; struct rhashtable_iter iter; mutex_lock(&mem_id_lock); rhashtable_walk_enter(mem_id_ht, &iter); do { rhashtable_walk_start(&iter); while ((xa = rhashtable_walk_next(&iter)) && !IS_ERR(xa)) { if (xa->allocator == allocator) mem_xa_remove(xa); } rhashtable_walk_stop(&iter); } while (xa == ERR_PTR(-EAGAIN)); rhashtable_walk_exit(&iter); mutex_unlock(&mem_id_lock); } void xdp_unreg_mem_model(struct xdp_mem_info *mem) { struct xdp_mem_allocator *xa; int type = mem->type; int id = mem->id; /* Reset mem info to defaults */ mem->id = 0; mem->type = 0; if (id == 0) return; if (type == MEM_TYPE_PAGE_POOL) { xa = 
rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); page_pool_destroy(xa->page_pool); } } EXPORT_SYMBOL_GPL(xdp_unreg_mem_model); void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) { if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { WARN(1, "Missing register, driver bug"); return; } xdp_unreg_mem_model(&xdp_rxq->mem); } EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model); void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) { /* Simplify driver cleanup code paths, allow unreg "unused" */ if (xdp_rxq->reg_state == REG_STATE_UNUSED) return; xdp_rxq_info_unreg_mem_model(xdp_rxq); xdp_rxq->reg_state = REG_STATE_UNREGISTERED; xdp_rxq->dev = NULL; } EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg); static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq) { memset(xdp_rxq, 0, sizeof(*xdp_rxq)); } /* Returns 0 on success, negative on failure */ int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index, unsigned int napi_id, u32 frag_size) { if (!dev) { WARN(1, "Missing net_device from driver"); return -ENODEV; } if (xdp_rxq->reg_state == REG_STATE_UNUSED) { WARN(1, "Driver promised not to register this"); return -EINVAL; } if (xdp_rxq->reg_state == REG_STATE_REGISTERED) { WARN(1, "Missing unregister, handled but fix driver"); xdp_rxq_info_unreg(xdp_rxq); } /* State either UNREGISTERED or NEW */ xdp_rxq_info_init(xdp_rxq); xdp_rxq->dev = dev; xdp_rxq->queue_index = queue_index; xdp_rxq->frag_size = frag_size; xdp_rxq->reg_state = REG_STATE_REGISTERED; return 0; } EXPORT_SYMBOL_GPL(__xdp_rxq_info_reg); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq) { xdp_rxq->reg_state = REG_STATE_UNUSED; } EXPORT_SYMBOL_GPL(xdp_rxq_info_unused); bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq) { return (xdp_rxq->reg_state == REG_STATE_REGISTERED); } EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg); static int __mem_id_init_hash_table(void) { struct rhashtable *rht; int ret; if (unlikely(mem_id_init)) return 0; rht = kzalloc_obj(*rht); if (!rht) return -ENOMEM; ret = rhashtable_init(rht, &mem_id_rht_params); if (ret < 0) { kfree(rht); return ret; } mem_id_ht = rht; smp_mb(); /* mutex lock should provide enough pairing */ mem_id_init = true; return 0; } /* Allocate a cyclic ID that maps to allocator pointer. * See: https://www.kernel.org/doc/html/latest/core-api/idr.html * * Caller must lock mem_id_lock. 
*/ static int __mem_id_cyclic_get(gfp_t gfp) { int retries = 1; int id; again: id = ida_alloc_range(&mem_id_pool, mem_id_next, MEM_ID_MAX - 1, gfp); if (id < 0) { if (id == -ENOSPC) { /* Cyclic allocator, reset next id */ if (retries--) { mem_id_next = MEM_ID_MIN; goto again; } } return id; /* errno */ } mem_id_next = id + 1; return id; } static bool __is_supported_mem_type(enum xdp_mem_type type) { if (type == MEM_TYPE_PAGE_POOL) return is_page_pool_compiled_in(); if (type >= MEM_TYPE_MAX) return false; return true; } static struct xdp_mem_allocator *__xdp_reg_mem_model(struct xdp_mem_info *mem, enum xdp_mem_type type, void *allocator) { struct xdp_mem_allocator *xdp_alloc; gfp_t gfp = GFP_KERNEL; int id, errno, ret; void *ptr; if (!__is_supported_mem_type(type)) return ERR_PTR(-EOPNOTSUPP); mem->type = type; if (!allocator) { if (type == MEM_TYPE_PAGE_POOL) return ERR_PTR(-EINVAL); /* Setup time check page_pool req */ return NULL; } /* Delay init of rhashtable to save memory if feature isn't used */ if (!mem_id_init) { mutex_lock(&mem_id_lock); ret = __mem_id_init_hash_table(); mutex_unlock(&mem_id_lock); if (ret < 0) return ERR_PTR(ret); } xdp_alloc = kzalloc_obj(*xdp_alloc, gfp); if (!xdp_alloc) return ERR_PTR(-ENOMEM); mutex_lock(&mem_id_lock); id = __mem_id_cyclic_get(gfp); if (id < 0) { errno = id; goto err; } mem->id = id; xdp_alloc->mem = *mem; xdp_alloc->allocator = allocator; /* Insert allocator into ID lookup table */ ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node); if (IS_ERR(ptr)) { ida_free(&mem_id_pool, mem->id); mem->id = 0; errno = PTR_ERR(ptr); goto err; } if (type == MEM_TYPE_PAGE_POOL) page_pool_use_xdp_mem(allocator, mem_allocator_disconnect, mem); mutex_unlock(&mem_id_lock); return xdp_alloc; err: mutex_unlock(&mem_id_lock); kfree(xdp_alloc); return ERR_PTR(errno); } int xdp_reg_mem_model(struct xdp_mem_info *mem, enum xdp_mem_type type, void *allocator) { struct xdp_mem_allocator *xdp_alloc; xdp_alloc = __xdp_reg_mem_model(mem, type, allocator); if (IS_ERR(xdp_alloc)) return PTR_ERR(xdp_alloc); return 0; } EXPORT_SYMBOL_GPL(xdp_reg_mem_model); int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, enum xdp_mem_type type, void *allocator) { struct xdp_mem_allocator *xdp_alloc; if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { WARN(1, "Missing register, driver bug"); return -EFAULT; } xdp_alloc = __xdp_reg_mem_model(&xdp_rxq->mem, type, allocator); if (IS_ERR(xdp_alloc)) return PTR_ERR(xdp_alloc); if (type == MEM_TYPE_XSK_BUFF_POOL && allocator) xsk_pool_set_rxq_info(allocator, xdp_rxq); if (trace_mem_connect_enabled() && xdp_alloc) trace_mem_connect(xdp_alloc, xdp_rxq); return 0; } EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); /** * xdp_reg_page_pool - register &page_pool as a memory provider for XDP * @pool: &page_pool to register * * Can be used to register pools manually without connecting to any XDP RxQ * info, so that the XDP layer will be aware of them. Then, they can be * attached to an RxQ info manually via xdp_rxq_info_attach_page_pool(). * * Return: %0 on success, -errno on error. */ int xdp_reg_page_pool(struct page_pool *pool) { struct xdp_mem_info mem; return xdp_reg_mem_model(&mem, MEM_TYPE_PAGE_POOL, pool); } EXPORT_SYMBOL_GPL(xdp_reg_page_pool); /** * xdp_unreg_page_pool - unregister &page_pool from the memory providers list * @pool: &page_pool to unregister * * A shorthand for manual unregistering page pools. If the pool was previously * attached to an RxQ info, it must be detached first. 
*/ void xdp_unreg_page_pool(const struct page_pool *pool) { struct xdp_mem_info mem = { .type = MEM_TYPE_PAGE_POOL, .id = pool->xdp_mem_id, }; xdp_unreg_mem_model(&mem); } EXPORT_SYMBOL_GPL(xdp_unreg_page_pool); /** * xdp_rxq_info_attach_page_pool - attach registered pool to RxQ info * @xdp_rxq: XDP RxQ info to attach the pool to * @pool: pool to attach * * If the pool was registered manually, this function must be called instead * of xdp_rxq_info_reg_mem_model() to connect it to the RxQ info. */ void xdp_rxq_info_attach_page_pool(struct xdp_rxq_info *xdp_rxq, const struct page_pool *pool) { struct xdp_mem_info mem = { .type = MEM_TYPE_PAGE_POOL, .id = pool->xdp_mem_id, }; xdp_rxq_info_attach_mem_model(xdp_rxq, &mem); } EXPORT_SYMBOL_GPL(xdp_rxq_info_attach_page_pool); /* XDP RX runs under NAPI protection, and in different delivery error * scenarios (e.g. queue full), it is possible to return the xdp_frame * while still leveraging this protection. The @napi_direct boolean * is used for those calls sites. Thus, allowing for faster recycling * of xdp_frames/pages in those cases. */ void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type, bool napi_direct, struct xdp_buff *xdp) { switch (mem_type) { case MEM_TYPE_PAGE_POOL: netmem = netmem_compound_head(netmem); if (napi_direct && xdp_return_frame_no_direct()) napi_direct = false; /* No need to check netmem_is_pp() as mem->type knows this a * page_pool page */ page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, napi_direct); break; case MEM_TYPE_PAGE_SHARED: page_frag_free(__netmem_address(netmem)); break; case MEM_TYPE_PAGE_ORDER0: put_page(__netmem_to_page(netmem)); break; case MEM_TYPE_XSK_BUFF_POOL: /* NB! Only valid from an xdp_buff! */ xsk_buff_free(xdp); break; default: /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ WARN(1, "Incorrect XDP memory type (%d) usage", mem_type); break; } } void xdp_return_frame(struct xdp_frame *xdpf) { struct skb_shared_info *sinfo; if (likely(!xdp_frame_has_frags(xdpf))) goto out; sinfo = xdp_get_shared_info_from_frame(xdpf); for (u32 i = 0; i < sinfo->nr_frags; i++) __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type, false, NULL); out: __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, false, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { struct skb_shared_info *sinfo; if (likely(!xdp_frame_has_frags(xdpf))) goto out; sinfo = xdp_get_shared_info_from_frame(xdpf); for (u32 i = 0; i < sinfo->nr_frags; i++) __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type, true, NULL); out: __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, true, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); /* XDP bulk APIs introduce a defer/flush mechanism to return * pages belonging to the same xdp_mem_allocator object * (identified via the mem.id field) in bulk to optimize * I-cache and D-cache. * The bulk queue size is set to 16 to be aligned to how * XDP_REDIRECT bulking works. The bulk is flushed when * it is full or when mem.id changes. * xdp_frame_bulk is usually stored/allocated on the function * call-stack to avoid locking penalties. 
*/ /* Must be called with rcu_read_lock held */ void xdp_return_frame_bulk(struct xdp_frame *xdpf, struct xdp_frame_bulk *bq) { if (xdpf->mem_type != MEM_TYPE_PAGE_POOL) { xdp_return_frame(xdpf); return; } if (bq->count == XDP_BULK_QUEUE_SIZE) xdp_flush_frame_bulk(bq); if (unlikely(xdp_frame_has_frags(xdpf))) { struct skb_shared_info *sinfo; int i; sinfo = xdp_get_shared_info_from_frame(xdpf); for (i = 0; i < sinfo->nr_frags; i++) { skb_frag_t *frag = &sinfo->frags[i]; bq->q[bq->count++] = skb_frag_netmem(frag); if (bq->count == XDP_BULK_QUEUE_SIZE) xdp_flush_frame_bulk(bq); } } bq->q[bq->count++] = virt_to_netmem(xdpf->data); } EXPORT_SYMBOL_GPL(xdp_return_frame_bulk); /** * xdp_return_frag -- free one XDP frag or decrement its refcount * @netmem: network memory reference to release * @xdp: &xdp_buff to release the frag for */ void xdp_return_frag(netmem_ref netmem, const struct xdp_buff *xdp) { __xdp_return(netmem, xdp->rxq->mem.type, true, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frag); void xdp_return_buff(struct xdp_buff *xdp) { struct skb_shared_info *sinfo; if (likely(!xdp_buff_has_frags(xdp))) goto out; sinfo = xdp_get_shared_info_from_buff(xdp); for (u32 i = 0; i < sinfo->nr_frags; i++) __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdp->rxq->mem.type, true, xdp); out: __xdp_return(virt_to_netmem(xdp->data), xdp->rxq->mem.type, true, xdp); } EXPORT_SYMBOL_GPL(xdp_return_buff); void xdp_attachment_setup(struct xdp_attachment_info *info, struct netdev_bpf *bpf) { if (info->prog) bpf_prog_put(info->prog); info->prog = bpf->prog; info->flags = bpf->flags; } EXPORT_SYMBOL_GPL(xdp_attachment_setup); struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) { unsigned int metasize, totsize; void *addr, *data_to_copy; struct xdp_frame *xdpf; struct page *page; /* Clone into a MEM_TYPE_PAGE_ORDER0 xdp_frame. */ metasize = xdp_data_meta_unsupported(xdp) ? 0 : xdp->data - xdp->data_meta; totsize = xdp->data_end - xdp->data + metasize; if (sizeof(*xdpf) + totsize > PAGE_SIZE) return NULL; page = dev_alloc_page(); if (!page) return NULL; addr = page_to_virt(page); xdpf = addr; memset(xdpf, 0, sizeof(*xdpf)); addr += sizeof(*xdpf); data_to_copy = metasize ? xdp->data_meta : xdp->data; memcpy(addr, data_to_copy, totsize); xdpf->data = addr + metasize; xdpf->len = totsize - metasize; xdpf->headroom = 0; xdpf->metasize = metasize; xdpf->frame_sz = PAGE_SIZE; xdpf->mem_type = MEM_TYPE_PAGE_ORDER0; xsk_buff_free(xdp); return xdpf; } EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame); /* Used by XDP_WARN macro, to avoid inlining WARN() in fast-path */ void xdp_warn(const char *msg, const char *func, const int line) { WARN(1, "XDP_WARN: %s(line:%d): %s\n", func, line, msg); }; EXPORT_SYMBOL_GPL(xdp_warn); /** * xdp_build_skb_from_buff - create an skb from &xdp_buff * @xdp: &xdp_buff to convert to an skb * * Perform common operations to create a new skb to pass up the stack from * &xdp_buff: allocate an skb head from the NAPI percpu cache, initialize * skb data pointers and offsets, set the recycle bit if the buff is * PP-backed, Rx queue index, protocol and update frags info. * * Return: new &sk_buff on success, %NULL on error. 
*/ struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp) { const struct xdp_rxq_info *rxq = xdp->rxq; const struct skb_shared_info *sinfo; struct sk_buff *skb; u32 nr_frags = 0; int metalen; if (unlikely(xdp_buff_has_frags(xdp))) { sinfo = xdp_get_shared_info_from_buff(xdp); nr_frags = sinfo->nr_frags; } skb = napi_build_skb(xdp->data_hard_start, xdp->frame_sz); if (unlikely(!skb)) return NULL; skb_reserve(skb, xdp->data - xdp->data_hard_start); __skb_put(skb, xdp->data_end - xdp->data); metalen = xdp->data - xdp->data_meta; if (metalen > 0) skb_metadata_set(skb, metalen); if (rxq->mem.type == MEM_TYPE_PAGE_POOL) skb_mark_for_recycle(skb); skb_record_rx_queue(skb, rxq->queue_index); if (unlikely(nr_frags)) { u32 tsize; tsize = sinfo->xdp_frags_truesize ? : nr_frags * xdp->frame_sz; xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size, tsize, xdp_buff_get_skb_flags(xdp)); } skb->protocol = eth_type_trans(skb, rxq->dev); return skb; } EXPORT_SYMBOL_GPL(xdp_build_skb_from_buff); /** * xdp_copy_frags_from_zc - copy frags from XSk buff to skb * @skb: skb to copy frags to * @xdp: XSk &xdp_buff from which the frags will be copied * @pp: &page_pool backing page allocation, if available * * Copy all frags from XSk &xdp_buff to the skb to pass it up the stack. * Allocate a new buffer for each frag, copy it and attach to the skb. * * Return: true on success, false on netmem allocation fail. */ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb, const struct xdp_buff *xdp, struct page_pool *pp) { struct skb_shared_info *sinfo = skb_shinfo(skb); const struct skb_shared_info *xinfo; u32 nr_frags, tsize = 0; u32 flags = 0; xinfo = xdp_get_shared_info_from_buff(xdp); nr_frags = xinfo->nr_frags; for (u32 i = 0; i < nr_frags; i++) { const skb_frag_t *frag = &xinfo->frags[i]; u32 len = skb_frag_size(frag); u32 offset, truesize = len; struct page *page; page = page_pool_dev_alloc(pp, &offset, &truesize); if (unlikely(!page)) { sinfo->nr_frags = i; return false; } memcpy(page_address(page) + offset, skb_frag_address(frag), LARGEST_ALIGN(len)); __skb_fill_page_desc_noacc(sinfo, i, page, offset, len); tsize += truesize; if (page_is_pfmemalloc(page)) flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; } xdp_update_skb_frags_info(skb, nr_frags, xinfo->xdp_frags_size, tsize, flags); return true; } /** * xdp_build_skb_from_zc - create an skb from XSk &xdp_buff * @xdp: source XSk buff * * Similar to xdp_build_skb_from_buff(), but for XSk frames. Allocate an skb * head, new buffer for the head, copy the data and initialize the skb fields. * If there are frags, allocate new buffers for them and copy. * Buffers are allocated from the system percpu pools to try recycling them. * If new skb was built successfully, @xdp is returned to XSk pool's freelist. * On error, it remains untouched and the caller must take care of this. * * Return: new &sk_buff on success, %NULL on error. 
*/ struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp) { const struct xdp_rxq_info *rxq = xdp->rxq; u32 len = xdp->data_end - xdp->data_meta; u32 truesize = xdp->frame_sz; struct sk_buff *skb = NULL; struct page_pool *pp; int metalen; void *data; if (!IS_ENABLED(CONFIG_PAGE_POOL)) return NULL; local_lock_nested_bh(&system_page_pool.bh_lock); pp = this_cpu_read(system_page_pool.pool); data = page_pool_dev_alloc_va(pp, &truesize); if (unlikely(!data)) goto out; skb = napi_build_skb(data, truesize); if (unlikely(!skb)) { page_pool_free_va(pp, data, true); goto out; } skb_mark_for_recycle(skb); skb_reserve(skb, xdp->data_meta - xdp->data_hard_start); memcpy(__skb_put(skb, len), xdp->data_meta, LARGEST_ALIGN(len)); metalen = xdp->data - xdp->data_meta; if (metalen > 0) { skb_metadata_set(skb, metalen); __skb_pull(skb, metalen); } skb_record_rx_queue(skb, rxq->queue_index); if (unlikely(xdp_buff_has_frags(xdp)) && unlikely(!xdp_copy_frags_from_zc(skb, xdp, pp))) { napi_consume_skb(skb, true); skb = NULL; goto out; } xsk_buff_free(xdp); skb->protocol = eth_type_trans(skb, rxq->dev); out: local_unlock_nested_bh(&system_page_pool.bh_lock); return skb; } EXPORT_SYMBOL_GPL(xdp_build_skb_from_zc); struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *skb, struct net_device *dev) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_frame(xdpf); unsigned int headroom, frame_size; void *hard_start; u8 nr_frags; /* xdp frags frame */ if (unlikely(xdp_frame_has_frags(xdpf))) nr_frags = sinfo->nr_frags; /* Part of headroom was reserved to xdpf */ headroom = sizeof(*xdpf) + xdpf->headroom; /* Memory size backing xdp_frame data already have reserved * room for build_skb to place skb_shared_info in tailroom. */ frame_size = xdpf->frame_sz; hard_start = xdpf->data - headroom; skb = build_skb_around(skb, hard_start, frame_size); if (unlikely(!skb)) return NULL; skb_reserve(skb, headroom); __skb_put(skb, xdpf->len); if (xdpf->metasize) skb_metadata_set(skb, xdpf->metasize); if (unlikely(xdp_frame_has_frags(xdpf))) xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size, nr_frags * xdpf->frame_sz, xdp_frame_get_skb_flags(xdpf)); /* Essential SKB info: protocol and skb->dev */ skb->protocol = eth_type_trans(skb, dev); /* Optional SKB info, currently missing: * - HW checksum info (skb->ip_summed) * - HW RX hash (skb_set_hash) * - RX ring dev queue index (skb_record_rx_queue) */ if (xdpf->mem_type == MEM_TYPE_PAGE_POOL) skb_mark_for_recycle(skb); /* Allow SKB to reuse area used by xdp_frame */ xdp_scrub_frame(xdpf); return skb; } EXPORT_SYMBOL_GPL(__xdp_build_skb_from_frame); struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct net_device *dev) { struct sk_buff *skb; skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC); if (unlikely(!skb)) return NULL; memset(skb, 0, offsetof(struct sk_buff, tail)); return __xdp_build_skb_from_frame(xdpf, skb, dev); } EXPORT_SYMBOL_GPL(xdp_build_skb_from_frame); struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf) { unsigned int headroom, totalsize; struct xdp_frame *nxdpf; struct page *page; void *addr; headroom = xdpf->headroom + sizeof(*xdpf); totalsize = headroom + xdpf->len; if (unlikely(totalsize > PAGE_SIZE)) return NULL; page = dev_alloc_page(); if (!page) return NULL; addr = page_to_virt(page); memcpy(addr, xdpf, totalsize); nxdpf = addr; nxdpf->data = addr + headroom; nxdpf->frame_sz = PAGE_SIZE; nxdpf->mem_type = MEM_TYPE_PAGE_ORDER0; return nxdpf; } __bpf_kfunc_start_defs(); /** * 
bpf_xdp_metadata_rx_timestamp - Read XDP frame RX timestamp. * @ctx: XDP context pointer. * @timestamp: Return value pointer. * * Return: * * Returns 0 on success or ``-errno`` on error. * * ``-EOPNOTSUPP`` : means device driver does not implement kfunc * * ``-ENODATA`` : means no RX-timestamp available for this frame */ __bpf_kfunc int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) { return -EOPNOTSUPP; } /** * bpf_xdp_metadata_rx_hash - Read XDP frame RX hash. * @ctx: XDP context pointer. * @hash: Return value pointer. * @rss_type: Return value pointer for RSS type. * * The RSS hash type (@rss_type) specifies what portion of packet headers NIC * hardware used when calculating RSS hash value. The RSS type can be decoded * via &enum xdp_rss_hash_type either matching on individual L3/L4 bits * ``XDP_RSS_L*`` or by combined traditional *RSS Hashing Types* * ``XDP_RSS_TYPE_L*``. * * Return: * * Returns 0 on success or ``-errno`` on error. * * ``-EOPNOTSUPP`` : means device driver doesn't implement kfunc * * ``-ENODATA`` : means no RX-hash available for this frame */ __bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash, enum xdp_rss_hash_type *rss_type) { return -EOPNOTSUPP; } /** * bpf_xdp_metadata_rx_vlan_tag - Get XDP packet outermost VLAN tag * @ctx: XDP context pointer. * @vlan_proto: Destination pointer for VLAN Tag protocol identifier (TPID). * @vlan_tci: Destination pointer for VLAN TCI (VID + DEI + PCP) * * In case of success, ``vlan_proto`` contains *Tag protocol identifier (TPID)*, * usually ``ETH_P_8021Q`` or ``ETH_P_8021AD``, but some networks can use * custom TPIDs. ``vlan_proto`` is stored in **network byte order (BE)** * and should be used as follows: * ``if (vlan_proto == bpf_htons(ETH_P_8021Q)) do_something();`` * * ``vlan_tci`` contains the remaining 16 bits of a VLAN tag. * Driver is expected to provide those in **host byte order (usually LE)**, * so the bpf program should not perform byte conversion. * According to 802.1Q standard, *VLAN TCI (Tag control information)* * is a bit field that contains: * *VLAN identifier (VID)* that can be read with ``vlan_tci & 0xfff``, * *Drop eligible indicator (DEI)* - 1 bit, * *Priority code point (PCP)* - 3 bits. * For detailed meaning of DEI and PCP, please refer to other sources. * * Return: * * Returns 0 on success or ``-errno`` on error. 
* * ``-EOPNOTSUPP`` : device driver doesn't implement kfunc * * ``-ENODATA`` : VLAN tag was not stripped or is not available */ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx, __be16 *vlan_proto, u16 *vlan_tci) { return -EOPNOTSUPP; } __bpf_kfunc_end_defs(); BTF_KFUNCS_START(xdp_metadata_kfunc_ids) #define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name) XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC BTF_KFUNCS_END(xdp_metadata_kfunc_ids) static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = { .owner = THIS_MODULE, .set = &xdp_metadata_kfunc_ids, }; BTF_ID_LIST(xdp_metadata_kfunc_ids_unsorted) #define XDP_METADATA_KFUNC(name, _, str, __) BTF_ID(func, str) XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC u32 bpf_xdp_metadata_kfunc_id(int id) { /* xdp_metadata_kfunc_ids is sorted and can't be used */ return xdp_metadata_kfunc_ids_unsorted[id]; } bool bpf_dev_bound_kfunc_id(u32 btf_id) { return btf_id_set8_contains(&xdp_metadata_kfunc_ids, btf_id); } static int __init xdp_metadata_init(void) { return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &xdp_metadata_kfunc_set); } late_initcall(xdp_metadata_init); void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val) { val &= NETDEV_XDP_ACT_MASK; if (dev->xdp_features == val) return; netdev_assert_locked_or_invisible(dev); dev->xdp_features = val; if (dev->reg_state == NETREG_REGISTERED) call_netdevice_notifiers(NETDEV_XDP_FEAT_CHANGE, dev); } EXPORT_SYMBOL_GPL(xdp_set_features_flag_locked); void xdp_set_features_flag(struct net_device *dev, xdp_features_t val) { netdev_lock(dev); xdp_set_features_flag_locked(dev, val); netdev_unlock(dev); } EXPORT_SYMBOL_GPL(xdp_set_features_flag); void xdp_features_set_redirect_target_locked(struct net_device *dev, bool support_sg) { xdp_features_t val = (dev->xdp_features | NETDEV_XDP_ACT_NDO_XMIT); if (support_sg) val |= NETDEV_XDP_ACT_NDO_XMIT_SG; xdp_set_features_flag_locked(dev, val); } EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target_locked); void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) { netdev_lock(dev); xdp_features_set_redirect_target_locked(dev, support_sg); netdev_unlock(dev); } EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target); void xdp_features_clear_redirect_target_locked(struct net_device *dev) { xdp_features_t val = dev->xdp_features; val &= ~(NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_NDO_XMIT_SG); xdp_set_features_flag_locked(dev, val); } EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target_locked); void xdp_features_clear_redirect_target(struct net_device *dev) { netdev_lock(dev); xdp_features_clear_redirect_target_locked(dev); netdev_unlock(dev); } EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target); |
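A rough sketch of the driver-side registration sequence built from the helpers above (__xdp_rxq_info_reg(), xdp_rxq_info_reg_mem_model(), xdp_rxq_info_unreg()). The helper name, queue-index handling, and the zero napi_id/frag_size arguments are placeholders, not taken from any real driver.

#include <net/xdp.h>
#include <net/page_pool/helpers.h>

/* Hypothetical driver helper: register an RX queue and its page_pool. */
static int example_rxq_setup(struct net_device *dev, struct xdp_rxq_info *rxq,
			     struct page_pool *pool, u32 queue_index)
{
	int err;

	/* napi_id and frag_size are left at 0 purely for illustration. */
	err = __xdp_rxq_info_reg(rxq, dev, queue_index, 0, 0);
	if (err)
		return err;

	err = xdp_rxq_info_reg_mem_model(rxq, MEM_TYPE_PAGE_POOL, pool);
	if (err)
		xdp_rxq_info_unreg(rxq);

	return err;
}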
| 3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_LWTUNNEL_H #define __NET_LWTUNNEL_H 1 #include <linux/lwtunnel.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/types.h> #include <net/route.h> #define LWTUNNEL_HASH_BITS 7 #define LWTUNNEL_HASH_SIZE (1 << LWTUNNEL_HASH_BITS) /* lw tunnel state flags */ #define LWTUNNEL_STATE_OUTPUT_REDIRECT BIT(0) #define LWTUNNEL_STATE_INPUT_REDIRECT BIT(1) #define LWTUNNEL_STATE_XMIT_REDIRECT BIT(2) /* LWTUNNEL_XMIT_CONTINUE should be distinguishable from dst_output return * values (NET_XMIT_xxx and NETDEV_TX_xxx in linux/netdevice.h) for safety. */ enum { LWTUNNEL_XMIT_DONE, LWTUNNEL_XMIT_CONTINUE = 0x100, }; struct lwtunnel_state { __u16 type; __u16 flags; __u16 headroom; atomic_t refcnt; int (*orig_output)(struct net *net, struct sock *sk, struct sk_buff *skb); int (*orig_input)(struct sk_buff *); struct rcu_head rcu; __u8 data[]; }; struct lwtunnel_encap_ops { int (*build_state)(struct net *net, struct nlattr *encap, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack); void (*destroy_state)(struct lwtunnel_state *lws); int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb); int (*input)(struct sk_buff *skb); int (*fill_encap)(struct sk_buff *skb, struct lwtunnel_state *lwtstate); int (*get_encap_size)(struct lwtunnel_state *lwtstate); int (*cmp_encap)(struct lwtunnel_state *a, struct lwtunnel_state *b); int (*xmit)(struct sk_buff *skb); struct module *owner; }; #ifdef CONFIG_LWTUNNEL DECLARE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled); void lwtstate_free(struct lwtunnel_state *lws); static inline struct lwtunnel_state * lwtstate_get(struct lwtunnel_state *lws) { if (lws) atomic_inc(&lws->refcnt); return lws; } static inline void lwtstate_put(struct lwtunnel_state *lws) { if (!lws) return; if (atomic_dec_and_test(&lws->refcnt)) lwtstate_free(lws); } static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate) { if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_OUTPUT_REDIRECT)) return true; return false; } static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate) { if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_INPUT_REDIRECT)) return true; return false; } static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate) { if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_XMIT_REDIRECT)) return true; return false; } static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate, unsigned int mtu) { if ((lwtunnel_xmit_redirect(lwtstate) 
|| lwtunnel_output_redirect(lwtstate)) && lwtstate->headroom < mtu) return lwtstate->headroom; return 0; } int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, unsigned int num); int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, unsigned int num); int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack); int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len, struct netlink_ext_ack *extack); int lwtunnel_build_state(struct net *net, u16 encap_type, struct nlattr *encap, unsigned int family, const void *cfg, struct lwtunnel_state **lws, struct netlink_ext_ack *extack); int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate, int encap_attr, int encap_type_attr); int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate); struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len); int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb); int lwtunnel_input(struct sk_buff *skb); int lwtunnel_xmit(struct sk_buff *skb); int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress); static inline void lwtunnel_set_redirect(struct dst_entry *dst) { if (lwtunnel_output_redirect(dst->lwtstate)) { dst->lwtstate->orig_output = READ_ONCE(dst->output); WRITE_ONCE(dst->output, lwtunnel_output); } if (lwtunnel_input_redirect(dst->lwtstate)) { dst->lwtstate->orig_input = READ_ONCE(dst->input); WRITE_ONCE(dst->input, lwtunnel_input); } } #else static inline void lwtstate_free(struct lwtunnel_state *lws) { } static inline struct lwtunnel_state * lwtstate_get(struct lwtunnel_state *lws) { return lws; } static inline void lwtstate_put(struct lwtunnel_state *lws) { } static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate) { return false; } static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate) { return false; } static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate) { return false; } static inline void lwtunnel_set_redirect(struct dst_entry *dst) { } static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate, unsigned int mtu) { return 0; } static inline int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, unsigned int num) { return -EOPNOTSUPP; } static inline int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, unsigned int num) { return -EOPNOTSUPP; } static inline int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "CONFIG_LWTUNNEL is not enabled in this kernel"); return -EOPNOTSUPP; } static inline int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len, struct netlink_ext_ack *extack) { /* return 0 since we are not walking attr looking for * RTA_ENCAP_TYPE attribute on nexthops. 
*/ return 0; } static inline int lwtunnel_build_state(struct net *net, u16 encap_type, struct nlattr *encap, unsigned int family, const void *cfg, struct lwtunnel_state **lws, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate, int encap_attr, int encap_type_attr) { return 0; } static inline int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) { return 0; } static inline struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len) { return NULL; } static inline int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) { return 0; } static inline int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) { return -EOPNOTSUPP; } static inline int lwtunnel_input(struct sk_buff *skb) { return -EOPNOTSUPP; } static inline int lwtunnel_xmit(struct sk_buff *skb) { return -EOPNOTSUPP; } #endif /* CONFIG_LWTUNNEL */ #define MODULE_ALIAS_RTNL_LWT(encap_type) MODULE_ALIAS("rtnl-lwt-" __stringify(encap_type)) #endif /* __NET_LWTUNNEL_H */ |
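Taken together, the encap ops table, the *_REDIRECT state flags, and lwtunnel_set_redirect() are what let an encapsulation hook itself into dst_output(). The fragment below is a minimal, hypothetical sketch of that wiring, not part of the header above: the demo_* names are invented and LWTUNNEL_ENCAP_IP is used purely as a stand-in type number. build_state() allocates the per-route lwtunnel_state and sets LWTUNNEL_STATE_OUTPUT_REDIRECT so lwtunnel_set_redirect() swaps dst->output for lwtunnel_output(), which dispatches to the registered output hook; the hook finishes by handing the packet back through orig_output.

/*
 * Hypothetical encap sketch; demo_* names are illustrative only and
 * LWTUNNEL_ENCAP_IP is used only as a stand-in encap type number.
 */
#include <linux/module.h>
#include <net/dst.h>
#include <net/lwtunnel.h>

struct demo_encap {			/* private state kept in lws->data[] */
	u32 id;
};

/* Runs in place of the original dst->output once redirection is set up. */
static int demo_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct lwtunnel_state *lws = skb_dst(skb)->lwtstate;

	/* ... push the encapsulation header here ... */

	return lws->orig_output(net, sk, skb);	/* resume the original path */
}

static int demo_build_state(struct net *net, struct nlattr *encap,
			    unsigned int family, const void *cfg,
			    struct lwtunnel_state **ts,
			    struct netlink_ext_ack *extack)
{
	struct lwtunnel_state *lws;

	lws = lwtunnel_state_alloc(sizeof(struct demo_encap));
	if (!lws)
		return -ENOMEM;

	lws->type = LWTUNNEL_ENCAP_IP;		/* stand-in type number */
	lws->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
	lws->headroom = 20;			/* room reserved for the new header */

	*ts = lws;
	return 0;
}

static const struct lwtunnel_encap_ops demo_encap_ops = {
	.build_state	= demo_build_state,
	.output		= demo_output,
	.owner		= THIS_MODULE,
};

static int __init demo_encap_init(void)
{
	return lwtunnel_encap_add_ops(&demo_encap_ops, LWTUNNEL_ENCAP_IP);
}
module_init(demo_encap_init);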
// SPDX-License-Identifier: GPL-2.0-only /* File: fs/xattr.c Extended attribute handling. Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org> Copyright (C) 2001 SGI - Silicon Graphics, Inc <linux-xfs@oss.sgi.com> Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> */ #include <linux/fs.h> #include <linux/filelock.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/xattr.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/export.h> #include <linux/fsnotify.h> #include <linux/audit.h> #include <linux/vmalloc.h> #include <linux/posix_acl_xattr.h> #include <linux/rhashtable.h> #include <linux/uaccess.h> #include "internal.h" static const char * strcmp_prefix(const char *a, const char *a_prefix) { while (*a_prefix && *a == *a_prefix) { a++; a_prefix++; } return *a_prefix ? NULL : a; } /* * In order to implement different sets of xattr operations for each xattr * prefix, a filesystem should create a null-terminated array of struct * xattr_handler (one for each prefix) and hang a pointer to it off of the * s_xattr field of the superblock. */ #define for_each_xattr_handler(handlers, handler) \ if (handlers) \ for ((handler) = *(handlers)++; \ (handler) != NULL; \ (handler) = *(handlers)++) /* * Find the xattr_handler with the matching prefix. */ static const struct xattr_handler * xattr_resolve_name(struct inode *inode, const char **name) { const struct xattr_handler * const *handlers = inode->i_sb->s_xattr; const struct xattr_handler *handler; if (!(inode->i_opflags & IOP_XATTR)) { if (unlikely(is_bad_inode(inode))) return ERR_PTR(-EIO); return ERR_PTR(-EOPNOTSUPP); } for_each_xattr_handler(handlers, handler) { const char *n; n = strcmp_prefix(*name, xattr_prefix(handler)); if (n) { if (!handler->prefix ^ !*n) { if (*n) continue; return ERR_PTR(-EINVAL); } *name = n; return handler; } } return ERR_PTR(-EOPNOTSUPP); } /** * may_write_xattr - check whether inode allows writing xattr * @idmap: idmap of the mount the inode was found from * @inode: the inode on which to set an xattr * * Check whether the inode allows writing xattrs. Specifically, we can never * set or remove an extended attribute on a read-only filesystem or on an * immutable / append-only inode. * * We also need to ensure that the inode has a mapping in the mount to * not risk writing back invalid i_{g,u}id values. * * Return: On success zero is returned. On error a negative errno is returned. */ int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode) { if (IS_IMMUTABLE(inode)) return -EPERM; if (IS_APPEND(inode)) return -EPERM; if (HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; return 0; } static inline int xattr_permission_error(int mask) { if (mask & MAY_WRITE) return -EPERM; return -ENODATA; } /* * Check permissions for extended attribute access. This is a bit complicated * because different namespaces have very different rules.
*/ static int xattr_permission(struct mnt_idmap *idmap, struct inode *inode, const char *name, int mask) { if (mask & MAY_WRITE) { int ret; ret = may_write_xattr(idmap, inode); if (ret) return ret; } /* * No restriction for security.* and system.* from the VFS. Decision * on these is left to the underlying filesystem / security module. */ if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return 0; /* * The trusted.* namespace can only be accessed by privileged users. */ if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) { if (!capable(CAP_SYS_ADMIN)) return xattr_permission_error(mask); return 0; } /* * In the user.* namespace, only regular files and directories can have * extended attributes. For sticky directories, only the owner and * privileged users can write attributes. */ if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) { switch (inode->i_mode & S_IFMT) { case S_IFREG: break; case S_IFDIR: if (!(inode->i_mode & S_ISVTX)) break; if (!(mask & MAY_WRITE)) break; if (inode_owner_or_capable(idmap, inode)) break; return -EPERM; case S_IFSOCK: break; default: return xattr_permission_error(mask); } } return inode_permission(idmap, inode, mask); } /* * Look for any handler that deals with the specified namespace. */ int xattr_supports_user_prefix(struct inode *inode) { const struct xattr_handler * const *handlers = inode->i_sb->s_xattr; const struct xattr_handler *handler; if (!(inode->i_opflags & IOP_XATTR)) { if (unlikely(is_bad_inode(inode))) return -EIO; return -EOPNOTSUPP; } for_each_xattr_handler(handlers, handler) { if (!strncmp(xattr_prefix(handler), XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) return 0; } return -EOPNOTSUPP; } EXPORT_SYMBOL(xattr_supports_user_prefix); int __vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) { const struct xattr_handler *handler; if (is_posix_acl_xattr(name)) return -EOPNOTSUPP; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->set) return -EOPNOTSUPP; if (size == 0) value = ""; /* empty EA, do not remove */ return handler->set(handler, idmap, dentry, inode, name, value, size, flags); } EXPORT_SYMBOL(__vfs_setxattr); /** * __vfs_setxattr_noperm - perform setxattr operation without performing * permission checks. * * @idmap: idmap of the mount the inode was found from * @dentry: object to perform setxattr on * @name: xattr name to set * @value: value to set @name to * @size: size of @value * @flags: flags to pass into filesystem operations * * returns the result of the internal setxattr or setsecurity operations. * * This function requires the caller to lock the inode's i_rwsem before it * is executed. It also assumes that the caller will make the appropriate * permission checks. 
*/ int __vfs_setxattr_noperm(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; int error = -EAGAIN; int issec = !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); if (issec) inode->i_flags &= ~S_NOSEC; if (inode->i_opflags & IOP_XATTR) { error = __vfs_setxattr(idmap, dentry, inode, name, value, size, flags); if (!error) { fsnotify_xattr(dentry); security_inode_post_setxattr(dentry, name, value, size, flags); } } else { if (unlikely(is_bad_inode(inode))) return -EIO; } if (error == -EAGAIN) { error = -EOPNOTSUPP; if (issec) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; error = security_inode_setsecurity(inode, suffix, value, size, flags); if (!error) fsnotify_xattr(dentry); } } return error; } /** * __vfs_setxattr_locked - set an extended attribute while holding the inode * lock * * @idmap: idmap of the mount of the target inode * @dentry: object to perform setxattr on * @name: xattr name to set * @value: value to set @name to * @size: size of @value * @flags: flags to pass into filesystem operations * @delegated_inode: on return, will contain an inode pointer that * a delegation was broken on, NULL if none. */ int __vfs_setxattr_locked(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags, struct delegated_inode *delegated_inode) { struct inode *inode = dentry->d_inode; int error; error = xattr_permission(idmap, inode, name, MAY_WRITE); if (error) return error; error = security_inode_setxattr(idmap, dentry, name, value, size, flags); if (error) goto out; error = try_break_deleg(inode, delegated_inode); if (error) goto out; error = __vfs_setxattr_noperm(idmap, dentry, name, value, size, flags); out: return error; } EXPORT_SYMBOL_GPL(__vfs_setxattr_locked); int vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; struct delegated_inode delegated_inode = { }; const void *orig_value = value; int error; if (size && strcmp(name, XATTR_NAME_CAPS) == 0) { error = cap_convert_nscap(idmap, dentry, &value, size); if (error < 0) return error; size = error; } retry_deleg: inode_lock(inode); error = __vfs_setxattr_locked(idmap, dentry, name, value, size, flags, &delegated_inode); inode_unlock(inode); if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } if (value != orig_value) kfree(value); return error; } EXPORT_SYMBOL_GPL(vfs_setxattr); static ssize_t xattr_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void *value, size_t size) { void *buffer = NULL; ssize_t len; if (!value || !size) { len = security_inode_getsecurity(idmap, inode, name, &buffer, false); goto out_noalloc; } len = security_inode_getsecurity(idmap, inode, name, &buffer, true); if (len < 0) return len; if (size < len) { len = -ERANGE; goto out; } memcpy(value, buffer, len); out: kfree(buffer); out_noalloc: return len; } /* * vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr * * Allocate memory, if not already allocated, or re-allocate correct size, * before retrieving the extended attribute. The xattr value buffer should * always be freed by the caller, even on error. * * Returns the result of alloc, if failed, or the getxattr operation. 
*/ int vfs_getxattr_alloc(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, char **xattr_value, size_t xattr_size, gfp_t flags) { const struct xattr_handler *handler; struct inode *inode = dentry->d_inode; char *value = *xattr_value; int error; error = xattr_permission(idmap, inode, name, MAY_READ); if (error) return error; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->get) return -EOPNOTSUPP; error = handler->get(handler, dentry, inode, name, NULL, 0); if (error < 0) return error; if (!value || (error > xattr_size)) { value = krealloc(*xattr_value, error + 1, flags); if (!value) return -ENOMEM; memset(value, 0, error + 1); } error = handler->get(handler, dentry, inode, name, value, error); *xattr_value = value; return error; } ssize_t __vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) { const struct xattr_handler *handler; if (is_posix_acl_xattr(name)) return -EOPNOTSUPP; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->get) return -EOPNOTSUPP; return handler->get(handler, dentry, inode, name, value, size); } EXPORT_SYMBOL(__vfs_getxattr); ssize_t vfs_getxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, void *value, size_t size) { struct inode *inode = dentry->d_inode; int error; error = xattr_permission(idmap, inode, name, MAY_READ); if (error) return error; error = security_inode_getxattr(dentry, name); if (error) return error; if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; int ret = xattr_getsecurity(idmap, inode, suffix, value, size); /* * Only overwrite the return value if a security module * is actually active. */ if (ret == -EOPNOTSUPP) goto nolsm; return ret; } nolsm: return __vfs_getxattr(dentry, inode, name, value, size); } EXPORT_SYMBOL_GPL(vfs_getxattr); /** * vfs_listxattr - retrieve \0 separated list of xattr names * @dentry: the dentry from whose inode the xattr names are retrieved * @list: buffer to store xattr names into * @size: size of the buffer * * This function returns the names of all xattrs associated with the * inode of @dentry. * * Note, for legacy reasons the vfs_listxattr() function lists POSIX * ACLs as well. Since POSIX ACLs are decoupled from IOP_XATTR the * vfs_listxattr() function doesn't check for this flag since a * filesystem could implement POSIX ACLs without implementing any other * xattrs. * * However, since all codepaths that remove IOP_XATTR also assign of * inode operations that either don't implement or implement a stub * ->listxattr() operation. * * Return: On success, the size of the buffer that was used. On error a * negative error code. 
*/ ssize_t vfs_listxattr(struct dentry *dentry, char *list, size_t size) { struct inode *inode = d_inode(dentry); ssize_t error; error = security_inode_listxattr(dentry); if (error) return error; if (inode->i_op->listxattr) { error = inode->i_op->listxattr(dentry, list, size); } else { error = security_inode_listsecurity(inode, list, size); if (size && error > size) error = -ERANGE; } return error; } EXPORT_SYMBOL_GPL(vfs_listxattr); int __vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct inode *inode = d_inode(dentry); const struct xattr_handler *handler; if (is_posix_acl_xattr(name)) return -EOPNOTSUPP; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->set) return -EOPNOTSUPP; return handler->set(handler, idmap, dentry, inode, name, NULL, 0, XATTR_REPLACE); } EXPORT_SYMBOL(__vfs_removexattr); /** * __vfs_removexattr_locked - set an extended attribute while holding the inode * lock * * @idmap: idmap of the mount of the target inode * @dentry: object to perform setxattr on * @name: name of xattr to remove * @delegated_inode: on return, will contain an inode pointer that * a delegation was broken on, NULL if none. */ int __vfs_removexattr_locked(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct delegated_inode *delegated_inode) { struct inode *inode = dentry->d_inode; int error; error = xattr_permission(idmap, inode, name, MAY_WRITE); if (error) return error; error = security_inode_removexattr(idmap, dentry, name); if (error) goto out; error = try_break_deleg(inode, delegated_inode); if (error) goto out; error = __vfs_removexattr(idmap, dentry, name); if (error) return error; fsnotify_xattr(dentry); security_inode_post_removexattr(dentry, name); out: return error; } EXPORT_SYMBOL_GPL(__vfs_removexattr_locked); int vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct inode *inode = dentry->d_inode; struct delegated_inode delegated_inode = { }; int error; retry_deleg: inode_lock(inode); error = __vfs_removexattr_locked(idmap, dentry, name, &delegated_inode); inode_unlock(inode); if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } return error; } EXPORT_SYMBOL_GPL(vfs_removexattr); int import_xattr_name(struct xattr_name *kname, const char __user *name) { int error = strncpy_from_user(kname->name, name, sizeof(kname->name)); if (error == 0 || error == sizeof(kname->name)) return -ERANGE; if (error < 0) return error; return 0; } /* * Extended attribute SET operations */ int setxattr_copy(const char __user *name, struct kernel_xattr_ctx *ctx) { int error; if (ctx->flags & ~(XATTR_CREATE|XATTR_REPLACE)) return -EINVAL; error = import_xattr_name(ctx->kname, name); if (error) return error; if (ctx->size) { if (ctx->size > XATTR_SIZE_MAX) return -E2BIG; ctx->kvalue = vmemdup_user(ctx->cvalue, ctx->size); if (IS_ERR(ctx->kvalue)) { error = PTR_ERR(ctx->kvalue); ctx->kvalue = NULL; } } return error; } static int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct kernel_xattr_ctx *ctx) { if (is_posix_acl_xattr(ctx->kname->name)) return do_set_acl(idmap, dentry, ctx->kname->name, ctx->kvalue, ctx->size); return vfs_setxattr(idmap, dentry, ctx->kname->name, ctx->kvalue, ctx->size, ctx->flags); } int file_setxattr(struct file *f, struct kernel_xattr_ctx *ctx) { int error = mnt_want_write_file(f); if (!error) { audit_file(f); error = do_setxattr(file_mnt_idmap(f), 
f->f_path.dentry, ctx); mnt_drop_write_file(f); } return error; } int filename_setxattr(int dfd, struct filename *filename, unsigned int lookup_flags, struct kernel_xattr_ctx *ctx) { struct path path; int error; retry: error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) return error; error = mnt_want_write(path.mnt); if (!error) { error = do_setxattr(mnt_idmap(path.mnt), path.dentry, ctx); mnt_drop_write(path.mnt); } path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } static int path_setxattrat(int dfd, const char __user *pathname, unsigned int at_flags, const char __user *name, const void __user *value, size_t size, int flags) { struct xattr_name kname; struct kernel_xattr_ctx ctx = { .cvalue = value, .kvalue = NULL, .size = size, .kname = &kname, .flags = flags, }; unsigned int lookup_flags = 0; int error; if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; if (!(at_flags & AT_SYMLINK_NOFOLLOW)) lookup_flags = LOOKUP_FOLLOW; error = setxattr_copy(name, &ctx); if (error) return error; CLASS(filename_maybe_null, filename)(pathname, at_flags); if (!filename && dfd >= 0) { CLASS(fd, f)(dfd); if (fd_empty(f)) error = -EBADF; else error = file_setxattr(fd_file(f), &ctx); } else { error = filename_setxattr(dfd, filename, lookup_flags, &ctx); } kvfree(ctx.kvalue); return error; } SYSCALL_DEFINE6(setxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags, const char __user *, name, const struct xattr_args __user *, uargs, size_t, usize) { struct xattr_args args = {}; int error; BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0); BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST); if (unlikely(usize < XATTR_ARGS_SIZE_VER0)) return -EINVAL; if (usize > PAGE_SIZE) return -E2BIG; error = copy_struct_from_user(&args, sizeof(args), uargs, usize); if (error) return error; return path_setxattrat(dfd, pathname, at_flags, name, u64_to_user_ptr(args.value), args.size, args.flags); } SYSCALL_DEFINE5(setxattr, const char __user *, pathname, const char __user *, name, const void __user *, value, size_t, size, int, flags) { return path_setxattrat(AT_FDCWD, pathname, 0, name, value, size, flags); } SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname, const char __user *, name, const void __user *, value, size_t, size, int, flags) { return path_setxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name, value, size, flags); } SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, const void __user *,value, size_t, size, int, flags) { return path_setxattrat(fd, NULL, AT_EMPTY_PATH, name, value, size, flags); } /* * Extended attribute GET operations */ static ssize_t do_getxattr(struct mnt_idmap *idmap, struct dentry *d, struct kernel_xattr_ctx *ctx) { ssize_t error; char *kname = ctx->kname->name; void *kvalue = NULL; if (ctx->size) { if (ctx->size > XATTR_SIZE_MAX) ctx->size = XATTR_SIZE_MAX; kvalue = kvzalloc(ctx->size, GFP_KERNEL); if (!kvalue) return -ENOMEM; } if (is_posix_acl_xattr(kname)) error = do_get_acl(idmap, d, kname, kvalue, ctx->size); else error = vfs_getxattr(idmap, d, kname, kvalue, ctx->size); if (error > 0) { if (ctx->size && copy_to_user(ctx->value, kvalue, error)) error = -EFAULT; } else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) { /* The file system tried to returned a value bigger than XATTR_SIZE_MAX bytes. Not possible. 
*/ error = -E2BIG; } kvfree(kvalue); return error; } ssize_t file_getxattr(struct file *f, struct kernel_xattr_ctx *ctx) { audit_file(f); return do_getxattr(file_mnt_idmap(f), f->f_path.dentry, ctx); } ssize_t filename_getxattr(int dfd, struct filename *filename, unsigned int lookup_flags, struct kernel_xattr_ctx *ctx) { struct path path; ssize_t error; retry: error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) return error; error = do_getxattr(mnt_idmap(path.mnt), path.dentry, ctx); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } static ssize_t path_getxattrat(int dfd, const char __user *pathname, unsigned int at_flags, const char __user *name, void __user *value, size_t size) { struct xattr_name kname; struct kernel_xattr_ctx ctx = { .value = value, .size = size, .kname = &kname, .flags = 0, }; ssize_t error; if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; error = import_xattr_name(&kname, name); if (error) return error; CLASS(filename_maybe_null, filename)(pathname, at_flags); if (!filename && dfd >= 0) { CLASS(fd, f)(dfd); if (fd_empty(f)) return -EBADF; return file_getxattr(fd_file(f), &ctx); } else { int lookup_flags = 0; if (!(at_flags & AT_SYMLINK_NOFOLLOW)) lookup_flags = LOOKUP_FOLLOW; return filename_getxattr(dfd, filename, lookup_flags, &ctx); } } SYSCALL_DEFINE6(getxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags, const char __user *, name, struct xattr_args __user *, uargs, size_t, usize) { struct xattr_args args = {}; int error; BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0); BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST); if (unlikely(usize < XATTR_ARGS_SIZE_VER0)) return -EINVAL; if (usize > PAGE_SIZE) return -E2BIG; error = copy_struct_from_user(&args, sizeof(args), uargs, usize); if (error) return error; if (args.flags != 0) return -EINVAL; return path_getxattrat(dfd, pathname, at_flags, name, u64_to_user_ptr(args.value), args.size); } SYSCALL_DEFINE4(getxattr, const char __user *, pathname, const char __user *, name, void __user *, value, size_t, size) { return path_getxattrat(AT_FDCWD, pathname, 0, name, value, size); } SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname, const char __user *, name, void __user *, value, size_t, size) { return path_getxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name, value, size); } SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, void __user *, value, size_t, size) { return path_getxattrat(fd, NULL, AT_EMPTY_PATH, name, value, size); } /* * Extended attribute LIST operations */ static ssize_t listxattr(struct dentry *d, char __user *list, size_t size) { ssize_t error; char *klist = NULL; if (size) { if (size > XATTR_LIST_MAX) size = XATTR_LIST_MAX; klist = kvmalloc(size, GFP_KERNEL); if (!klist) return -ENOMEM; } error = vfs_listxattr(d, klist, size); if (error > 0) { if (size && copy_to_user(list, klist, error)) error = -EFAULT; } else if (error == -ERANGE && size >= XATTR_LIST_MAX) { /* The file system tried to returned a list bigger than XATTR_LIST_MAX bytes. Not possible. 
*/ error = -E2BIG; } kvfree(klist); return error; } static ssize_t file_listxattr(struct file *f, char __user *list, size_t size) { audit_file(f); return listxattr(f->f_path.dentry, list, size); } static ssize_t filename_listxattr(int dfd, struct filename *filename, unsigned int lookup_flags, char __user *list, size_t size) { struct path path; ssize_t error; retry: error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) return error; error = listxattr(path.dentry, list, size); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } static ssize_t path_listxattrat(int dfd, const char __user *pathname, unsigned int at_flags, char __user *list, size_t size) { int lookup_flags; if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; CLASS(filename_maybe_null, filename)(pathname, at_flags); if (!filename) { CLASS(fd, f)(dfd); if (fd_empty(f)) return -EBADF; return file_listxattr(fd_file(f), list, size); } lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; return filename_listxattr(dfd, filename, lookup_flags, list, size); } SYSCALL_DEFINE5(listxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags, char __user *, list, size_t, size) { return path_listxattrat(dfd, pathname, at_flags, list, size); } SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list, size_t, size) { return path_listxattrat(AT_FDCWD, pathname, 0, list, size); } SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list, size_t, size) { return path_listxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, list, size); } SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) { return path_listxattrat(fd, NULL, AT_EMPTY_PATH, list, size); } /* * Extended attribute REMOVE operations */ static long removexattr(struct mnt_idmap *idmap, struct dentry *d, const char *name) { if (is_posix_acl_xattr(name)) return vfs_remove_acl(idmap, d, name); return vfs_removexattr(idmap, d, name); } static int file_removexattr(struct file *f, struct xattr_name *kname) { int error = mnt_want_write_file(f); if (!error) { audit_file(f); error = removexattr(file_mnt_idmap(f), f->f_path.dentry, kname->name); mnt_drop_write_file(f); } return error; } static int filename_removexattr(int dfd, struct filename *filename, unsigned int lookup_flags, struct xattr_name *kname) { struct path path; int error; retry: error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) return error; error = mnt_want_write(path.mnt); if (!error) { error = removexattr(mnt_idmap(path.mnt), path.dentry, kname->name); mnt_drop_write(path.mnt); } path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } static int path_removexattrat(int dfd, const char __user *pathname, unsigned int at_flags, const char __user *name) { struct xattr_name kname; unsigned int lookup_flags; int error; if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; error = import_xattr_name(&kname, name); if (error) return error; CLASS(filename_maybe_null, filename)(pathname, at_flags); if (!filename) { CLASS(fd, f)(dfd); if (fd_empty(f)) return -EBADF; return file_removexattr(fd_file(f), &kname); } lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 
0 : LOOKUP_FOLLOW; return filename_removexattr(dfd, filename, lookup_flags, &kname); } SYSCALL_DEFINE4(removexattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags, const char __user *, name) { return path_removexattrat(dfd, pathname, at_flags, name); } SYSCALL_DEFINE2(removexattr, const char __user *, pathname, const char __user *, name) { return path_removexattrat(AT_FDCWD, pathname, 0, name); } SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname, const char __user *, name) { return path_removexattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name); } SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) { return path_removexattrat(fd, NULL, AT_EMPTY_PATH, name); } int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name) { size_t len; len = strlen(name) + 1; if (*buffer) { if (*remaining_size < len) return -ERANGE; memcpy(*buffer, name, len); *buffer += len; } *remaining_size -= len; return 0; } /** * generic_listxattr - run through a dentry's xattr list() operations * @dentry: dentry to list the xattrs * @buffer: result buffer * @buffer_size: size of @buffer * * Combine the results of the list() operation from every xattr_handler in the * xattr_handler stack. * * Note that this will not include the entries for POSIX ACLs. */ ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { const struct xattr_handler *handler, * const *handlers = dentry->d_sb->s_xattr; ssize_t remaining_size = buffer_size; for_each_xattr_handler(handlers, handler) { int err; if (!handler->name || (handler->list && !handler->list(dentry))) continue; err = xattr_list_one(&buffer, &remaining_size, handler->name); if (err) return err; } return buffer_size - remaining_size; } EXPORT_SYMBOL(generic_listxattr); /** * xattr_full_name - Compute full attribute name from suffix * * @handler: handler of the xattr_handler operation * @name: name passed to the xattr_handler operation * * The get and set xattr handler operations are called with the remainder of * the attribute name after skipping the handler's prefix: for example, "foo" * is passed to the get operation of a handler with prefix "user." to get * attribute "user.foo". The full name is still "there" in the name though. * * Note: the list xattr handler operation when called from the vfs is passed a * NULL name; some file systems use this operation internally, with varying * semantics. */ const char *xattr_full_name(const struct xattr_handler *handler, const char *name) { size_t prefix_len = strlen(xattr_prefix(handler)); return name - prefix_len; } EXPORT_SYMBOL(xattr_full_name); /** * simple_xattr_space - estimate the memory used by a simple xattr * @name: the full name of the xattr * @size: the size of its value * * This takes no account of how much larger the two slab objects actually are: * that would depend on the slab implementation, when what is required is a * deterministic number, which grows with name length and size and quantity. * * Return: The approximate number of bytes of memory used by such an xattr. */ size_t simple_xattr_space(const char *name, size_t size) { /* * Use "40" instead of sizeof(struct simple_xattr), to return the * same result on 32-bit and 64-bit, and even if simple_xattr grows. */ return 40 + size + strlen(name); } /** * simple_xattr_free - free an xattr object * @xattr: the xattr object * * Free the xattr object. Can handle @xattr being NULL. 
*/ void simple_xattr_free(struct simple_xattr *xattr) { if (xattr) kfree(xattr->name); kvfree(xattr); } static void simple_xattr_rcu_free(struct rcu_head *head) { struct simple_xattr *xattr = container_of(head, struct simple_xattr, rcu); simple_xattr_free(xattr); } /** * simple_xattr_free_rcu - free an xattr object with RCU delay * @xattr: the xattr object * * Free the xattr object after an RCU grace period. This must be used when * the xattr was removed from a data structure that concurrent RCU readers * may still be traversing. Can handle @xattr being NULL. */ void simple_xattr_free_rcu(struct simple_xattr *xattr) { if (xattr) call_rcu(&xattr->rcu, simple_xattr_rcu_free); } /** * simple_xattr_alloc - allocate new xattr object * @value: value of the xattr object * @size: size of @value * * Allocate a new xattr object and initialize respective members. The caller is * responsible for handling the name of the xattr. * * Return: New xattr object on success, NULL if @value is NULL, ERR_PTR on * failure. */ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) { struct simple_xattr *new_xattr; size_t len; if (!value) return NULL; /* wrap around? */ len = sizeof(*new_xattr) + size; if (len < sizeof(*new_xattr)) return ERR_PTR(-ENOMEM); new_xattr = kvmalloc(len, GFP_KERNEL_ACCOUNT); if (!new_xattr) return ERR_PTR(-ENOMEM); new_xattr->size = size; memcpy(new_xattr->value, value, size); return new_xattr; } static u32 simple_xattr_hashfn(const void *data, u32 len, u32 seed) { const char *name = data; return jhash(name, strlen(name), seed); } static u32 simple_xattr_obj_hashfn(const void *obj, u32 len, u32 seed) { const struct simple_xattr *xattr = obj; return jhash(xattr->name, strlen(xattr->name), seed); } static int simple_xattr_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) { const struct simple_xattr *xattr = obj; return strcmp(xattr->name, arg->key); } static const struct rhashtable_params simple_xattr_params = { .head_offset = offsetof(struct simple_xattr, hash_node), .hashfn = simple_xattr_hashfn, .obj_hashfn = simple_xattr_obj_hashfn, .obj_cmpfn = simple_xattr_obj_cmpfn, .automatic_shrinking = true, }; /** * simple_xattr_get - get an xattr object * @xattrs: the header of the xattr object * @name: the name of the xattr to retrieve * @buffer: the buffer to store the value into * @size: the size of @buffer * * Try to find and retrieve the xattr object associated with @name. * If @buffer is provided store the value of @xattr in @buffer * otherwise just return the length. The size of @buffer is limited * to XATTR_SIZE_MAX which currently is 65536. * * Return: On success the length of the xattr value is returned. On error a * negative error code is returned. */ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size) { struct simple_xattr *xattr; int ret = -ENODATA; guard(rcu)(); xattr = rhashtable_lookup(&xattrs->ht, name, simple_xattr_params); if (xattr) { ret = xattr->size; if (buffer) { if (size < xattr->size) ret = -ERANGE; else memcpy(buffer, xattr->value, xattr->size); } } return ret; } /** * simple_xattr_set - set an xattr object * @xattrs: the header of the xattr object * @name: the name of the xattr to retrieve * @value: the value to store along the xattr * @size: the size of @value * @flags: the flags determining how to set the xattr * * Set a new xattr object. * If @value is passed a new xattr object will be allocated. If XATTR_REPLACE * is specified in @flags a matching xattr object for @name must already exist. 
* If it does it will be replaced with the new xattr object. If it doesn't we * fail. If XATTR_CREATE is specified and a matching xattr does already exist * we fail. If it doesn't we create a new xattr. If @flags is zero we simply * insert the new xattr replacing any existing one. * * If @value is empty and a matching xattr object is found we delete it if * XATTR_REPLACE is specified in @flags or @flags is zero. * * If @value is empty and no matching xattr object for @name is found we do * nothing if XATTR_CREATE is specified in @flags or @flags is zero. For * XATTR_REPLACE we fail as mentioned above. * * Note: Callers must externally serialize writes. All current callers hold * the inode lock for write operations. The lookup->replace/remove sequence * is not atomic with respect to the rhashtable's per-bucket locking, but * is safe because writes are serialized by the caller. * * Return: On success, the removed or replaced xattr is returned, to be freed * by the caller; or NULL if none. On failure a negative error code is returned. */ struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags) { struct simple_xattr *old_xattr = NULL; int err; CLASS(simple_xattr, new_xattr)(value, size); if (IS_ERR(new_xattr)) return new_xattr; if (new_xattr) { new_xattr->name = kstrdup(name, GFP_KERNEL_ACCOUNT); if (!new_xattr->name) return ERR_PTR(-ENOMEM); } /* Lookup is safe without RCU here since writes are serialized. */ old_xattr = rhashtable_lookup_fast(&xattrs->ht, name, simple_xattr_params); if (old_xattr) { /* Fail if XATTR_CREATE is requested and the xattr exists. */ if (flags & XATTR_CREATE) return ERR_PTR(-EEXIST); if (new_xattr) { err = rhashtable_replace_fast(&xattrs->ht, &old_xattr->hash_node, &new_xattr->hash_node, simple_xattr_params); if (err) return ERR_PTR(err); } else { err = rhashtable_remove_fast(&xattrs->ht, &old_xattr->hash_node, simple_xattr_params); if (err) return ERR_PTR(err); } } else { /* Fail if XATTR_REPLACE is requested but no xattr is found. */ if (flags & XATTR_REPLACE) return ERR_PTR(-ENODATA); /* * If XATTR_CREATE or no flags are specified together with a * new value simply insert it. */ if (new_xattr) { err = rhashtable_insert_fast(&xattrs->ht, &new_xattr->hash_node, simple_xattr_params); if (err) return ERR_PTR(err); } /* * If XATTR_CREATE or no flags are specified and neither an * old or new xattr exist then we don't need to do anything. */ } retain_and_null_ptr(new_xattr); return old_xattr; } static inline void simple_xattr_limits_dec(struct simple_xattr_limits *limits, size_t size) { atomic_sub(size, &limits->xattr_size); atomic_dec(&limits->nr_xattrs); } static inline int simple_xattr_limits_inc(struct simple_xattr_limits *limits, size_t size) { if (atomic_inc_return(&limits->nr_xattrs) > SIMPLE_XATTR_MAX_NR) { atomic_dec(&limits->nr_xattrs); return -ENOSPC; } if (atomic_add_return(size, &limits->xattr_size) <= SIMPLE_XATTR_MAX_SIZE) return 0; simple_xattr_limits_dec(limits, size); return -ENOSPC; } /** * simple_xattr_set_limited - set an xattr with per-inode user.* limits * @xattrs: the header of the xattr object * @limits: per-inode limit counters for user.* xattrs * @name: the name of the xattr to set or remove * @value: the value to store (NULL to remove) * @size: the size of @value * @flags: XATTR_CREATE, XATTR_REPLACE, or 0 * * Like simple_xattr_set(), but enforces per-inode count and total value size * limits for user.* xattrs. 
Uses speculative pre-increment of the atomic * counters to avoid races without requiring external locks. * * Return: On success zero is returned. On failure a negative error code is * returned. */ int simple_xattr_set_limited(struct simple_xattrs *xattrs, struct simple_xattr_limits *limits, const char *name, const void *value, size_t size, int flags) { struct simple_xattr *old_xattr; int ret; if (value) { ret = simple_xattr_limits_inc(limits, size); if (ret) return ret; } old_xattr = simple_xattr_set(xattrs, name, value, size, flags); if (IS_ERR(old_xattr)) { if (value) simple_xattr_limits_dec(limits, size); return PTR_ERR(old_xattr); } if (old_xattr) { simple_xattr_limits_dec(limits, old_xattr->size); simple_xattr_free_rcu(old_xattr); } return 0; } static bool xattr_is_trusted(const char *name) { return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } static bool xattr_is_maclabel(const char *name) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; return !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && security_ismaclabel(suffix); } /** * simple_xattr_list - list all xattr objects * @inode: inode from which to get the xattrs * @xattrs: the header of the xattr object * @buffer: the buffer to store all xattrs into * @size: the size of @buffer * * List all xattrs associated with @inode. If @buffer is NULL we returned * the required size of the buffer. If @buffer is provided we store the * xattrs value into it provided it is big enough. * * Note, the number of xattr names that can be listed with listxattr(2) is * limited to XATTR_LIST_MAX aka 65536 bytes. If a larger buffer is passed * then vfs_listxattr() caps it to XATTR_LIST_MAX and if more xattr names * are found it will return -E2BIG. * * Return: On success the required size or the size of the copied xattrs is * returned. On error a negative error code is returned. */ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size) { bool trusted = ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); struct rhashtable_iter iter; struct simple_xattr *xattr; ssize_t remaining_size = size; int err = 0; err = posix_acl_listxattr(inode, &buffer, &remaining_size); if (err) return err; err = security_inode_listsecurity(inode, buffer, remaining_size); if (err < 0) return err; if (buffer) { if (remaining_size < err) return -ERANGE; buffer += err; } remaining_size -= err; err = 0; if (!xattrs) return size - remaining_size; rhashtable_walk_enter(&xattrs->ht, &iter); rhashtable_walk_start(&iter); while ((xattr = rhashtable_walk_next(&iter)) != NULL) { if (IS_ERR(xattr)) { if (PTR_ERR(xattr) == -EAGAIN) continue; err = PTR_ERR(xattr); break; } /* skip "trusted." attributes for unprivileged callers */ if (!trusted && xattr_is_trusted(xattr->name)) continue; /* skip MAC labels; these are provided by LSM above */ if (xattr_is_maclabel(xattr->name)) continue; err = xattr_list_one(&buffer, &remaining_size, xattr->name); if (err) break; } rhashtable_walk_stop(&iter); rhashtable_walk_exit(&iter); return err ? err : size - remaining_size; } /** * simple_xattr_add - add xattr objects * @xattrs: the header of the xattr object * @new_xattr: the xattr object to add * * Add an xattr object to @xattrs. This assumes no replacement or removal * of matching xattrs is wanted. Should only be called during inode * initialization when a few distinct initial xattrs are supposed to be set. * * Return: On success zero is returned. On failure a negative error code is * returned. 
*/ int simple_xattr_add(struct simple_xattrs *xattrs, struct simple_xattr *new_xattr) { return rhashtable_insert_fast(&xattrs->ht, &new_xattr->hash_node, simple_xattr_params); } /** * simple_xattrs_init - initialize new xattr header * @xattrs: header to initialize * * Initialize the rhashtable used to store xattr objects. * * Return: On success zero is returned. On failure a negative error code is * returned. */ int simple_xattrs_init(struct simple_xattrs *xattrs) { return rhashtable_init(&xattrs->ht, &simple_xattr_params); } /** * simple_xattrs_alloc - allocate and initialize a new xattr header * * Dynamically allocate a simple_xattrs header and initialize the * underlying rhashtable. This is intended for consumers that want * to lazily allocate xattr storage only when the first xattr is set, * avoiding the per-inode rhashtable overhead when no xattrs are used. * * Return: On success a new simple_xattrs is returned. On failure an * ERR_PTR is returned. */ struct simple_xattrs *simple_xattrs_alloc(void) { struct simple_xattrs *xattrs __free(kfree) = NULL; int ret; xattrs = kzalloc(sizeof(*xattrs), GFP_KERNEL); if (!xattrs) return ERR_PTR(-ENOMEM); ret = simple_xattrs_init(xattrs); if (ret) return ERR_PTR(ret); return no_free_ptr(xattrs); } /** * simple_xattrs_lazy_alloc - get or allocate xattrs for a set operation * @xattrsp: pointer to the xattrs pointer (may point to NULL) * @value: value being set (NULL means remove) * @flags: xattr set flags * * For lazily-allocated xattrs on the write path. If no xattrs exist yet * and this is a remove operation, returns the appropriate result without * allocating. Otherwise ensures xattrs is allocated and published with * store-release semantics. * * Return: On success a valid pointer to the xattrs is returned. On * failure or early-exit an ERR_PTR or NULL is returned. Callers should * check with IS_ERR_OR_NULL() and propagate with PTR_ERR() which * correctly returns 0 for the NULL no-op case. */ struct simple_xattrs *simple_xattrs_lazy_alloc(struct simple_xattrs **xattrsp, const void *value, int flags) { struct simple_xattrs *xattrs; xattrs = READ_ONCE(*xattrsp); if (xattrs) return xattrs; if (!value) return (flags & XATTR_REPLACE) ? ERR_PTR(-ENODATA) : NULL; xattrs = simple_xattrs_alloc(); if (!IS_ERR(xattrs)) smp_store_release(xattrsp, xattrs); return xattrs; } static void simple_xattr_ht_free(void *ptr, void *arg) { struct simple_xattr *xattr = ptr; size_t *freed_space = arg; if (freed_space) *freed_space += simple_xattr_space(xattr->name, xattr->size); simple_xattr_free(xattr); } /** * simple_xattrs_free - free xattrs * @xattrs: xattr header whose xattrs to destroy * @freed_space: approximate number of bytes of memory freed from @xattrs * * Destroy all xattrs in @xattr. When this is called no one can hold a * reference to any of the xattrs anymore. */ void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space) { might_sleep(); if (freed_space) *freed_space = 0; rhashtable_free_and_destroy(&xattrs->ht, simple_xattr_ht_free, freed_space); } |
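The functions above implement the kernel half of the xattr syscall paths; from userspace the same set/get/list semantics, including the user.* permission rules and the XATTR_CREATE/XATTR_REPLACE flags, are reachable through the wrappers in <sys/xattr.h>. A small illustrative program follows; the path and attribute name are examples only and the file is assumed to exist on an xattr-capable filesystem.

/* Illustrative userspace exercise of the set/get/list paths above. */
#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/tmp/xattr-demo";	/* example path, assumed to exist */
	char value[64], list[256];
	ssize_t len;

	/* user.* names are subject to the xattr_permission() rules above */
	if (setxattr(path, "user.comment", "hello", 5, XATTR_CREATE) != 0) {
		perror("setxattr");
		return 1;
	}

	len = getxattr(path, "user.comment", value, sizeof(value));
	if (len >= 0)
		printf("user.comment = %.*s\n", (int)len, value);

	/* listxattr() fills the buffer with NUL-separated attribute names */
	len = listxattr(path, list, sizeof(list));
	for (ssize_t off = 0; off < len; off += strlen(list + off) + 1)
		printf("name: %s\n", list + off);

	return 0;
}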
// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (c) 2001-2002 by David Brownell */ #ifndef __USB_CORE_HCD_H #define __USB_CORE_HCD_H #ifdef __KERNEL__ #include <linux/rwsem.h> #include <linux/interrupt.h> #include <linux/idr.h> #define MAX_TOPO_LEVEL 6 /* This file contains declarations of usbcore internals that are mostly * used or exposed by Host Controller Drivers.
*/ /* * USB Packet IDs (PIDs) */ #define USB_PID_EXT 0xf0 /* USB 2.0 LPM ECN */ #define USB_PID_OUT 0xe1 #define USB_PID_ACK 0xd2 #define USB_PID_DATA0 0xc3 #define USB_PID_PING 0xb4 /* USB 2.0 */ #define USB_PID_SOF 0xa5 #define USB_PID_NYET 0x96 /* USB 2.0 */ #define USB_PID_DATA2 0x87 /* USB 2.0 */ #define USB_PID_SPLIT 0x78 /* USB 2.0 */ #define USB_PID_IN 0x69 #define USB_PID_NAK 0x5a #define USB_PID_DATA1 0x4b #define USB_PID_PREAMBLE 0x3c /* Token mode */ #define USB_PID_ERR 0x3c /* USB 2.0: handshake mode */ #define USB_PID_SETUP 0x2d #define USB_PID_STALL 0x1e #define USB_PID_MDATA 0x0f /* USB 2.0 */ /*-------------------------------------------------------------------------*/ /* * USB Host Controller Driver (usb_hcd) framework * * Since "struct usb_bus" is so thin, you can't share much code in it. * This framework is a layer over that, and should be more shareable. */ /*-------------------------------------------------------------------------*/ struct giveback_urb_bh { bool running; bool high_prio; spinlock_t lock; struct list_head head; struct work_struct bh; struct usb_host_endpoint *completing_ep; }; enum usb_dev_authorize_policy { USB_DEVICE_AUTHORIZE_NONE = 0, USB_DEVICE_AUTHORIZE_ALL = 1, USB_DEVICE_AUTHORIZE_INTERNAL = 2, }; struct usb_hcd { /* * housekeeping */ struct usb_bus self; /* hcd is-a bus */ struct kref kref; /* reference counter */ const char *product_desc; /* product/vendor string */ int speed; /* Speed for this roothub. * May be different from * hcd->driver->flags & HCD_MASK */ char irq_descr[24]; /* driver + bus # */ struct timer_list rh_timer; /* drives root-hub polling */ struct urb *status_urb; /* the current status urb */ #ifdef CONFIG_PM struct work_struct wakeup_work; /* for remote wakeup */ #endif struct work_struct died_work; /* for when the device dies */ /* * hardware info/state */ const struct hc_driver *driver; /* hw-specific hooks */ /* * OTG and some Host controllers need software interaction with phys; * other external phys should be software-transparent */ struct usb_phy *usb_phy; struct usb_phy_roothub *phy_roothub; /* Flags that need to be manipulated atomically because they can * change while the host controller is running. Always use * set_bit() or clear_bit() to change their values. */ unsigned long flags; #define HCD_FLAG_HW_ACCESSIBLE 0 /* at full power */ #define HCD_FLAG_POLL_RH 2 /* poll for rh status? */ #define HCD_FLAG_POLL_PENDING 3 /* status has changed? */ #define HCD_FLAG_WAKEUP_PENDING 4 /* root hub is resuming? */ #define HCD_FLAG_RH_RUNNING 5 /* root hub is running? */ #define HCD_FLAG_DEAD 6 /* controller has died? */ #define HCD_FLAG_INTF_AUTHORIZED 7 /* authorize interfaces? */ #define HCD_FLAG_DEFER_RH_REGISTER 8 /* Defer roothub registration */ /* The flags can be tested using these macros; they are likely to * be slightly faster than test_bit(). 
*/ #define HCD_HW_ACCESSIBLE(hcd) ((hcd)->flags & (1U << HCD_FLAG_HW_ACCESSIBLE)) #define HCD_POLL_RH(hcd) ((hcd)->flags & (1U << HCD_FLAG_POLL_RH)) #define HCD_POLL_PENDING(hcd) ((hcd)->flags & (1U << HCD_FLAG_POLL_PENDING)) #define HCD_WAKEUP_PENDING(hcd) ((hcd)->flags & (1U << HCD_FLAG_WAKEUP_PENDING)) #define HCD_RH_RUNNING(hcd) ((hcd)->flags & (1U << HCD_FLAG_RH_RUNNING)) #define HCD_DEAD(hcd) ((hcd)->flags & (1U << HCD_FLAG_DEAD)) #define HCD_DEFER_RH_REGISTER(hcd) ((hcd)->flags & (1U << HCD_FLAG_DEFER_RH_REGISTER)) /* * Specifies if interfaces are authorized by default * or they require explicit user space authorization; this bit is * settable through /sys/class/usb_host/X/interface_authorized_default */ #define HCD_INTF_AUTHORIZED(hcd) \ ((hcd)->flags & (1U << HCD_FLAG_INTF_AUTHORIZED)) /* * Specifies if devices are authorized by default * or they require explicit user space authorization; this bit is * settable through /sys/class/usb_host/X/authorized_default */ enum usb_dev_authorize_policy dev_policy; /* Flags that get set only during HCD registration or removal. */ unsigned rh_registered:1;/* is root hub registered? */ unsigned rh_pollable:1; /* may we poll the root hub? */ unsigned msix_enabled:1; /* driver has MSI-X enabled? */ unsigned msi_enabled:1; /* driver has MSI enabled? */ /* * do not manage the PHY state in the HCD core, instead let the driver * handle this (for example if the PHY can only be turned on after a * specific event) */ unsigned skip_phy_initialization:1; /* The next flag is a stopgap, to be removed when all the HCDs * support the new root-hub polling mechanism. */ unsigned uses_new_polling:1; unsigned has_tt:1; /* Integrated TT in root hub */ unsigned amd_resume_bug:1; /* AMD remote wakeup quirk */ unsigned can_do_streams:1; /* HC supports streams */ unsigned tpl_support:1; /* OTG & EH TPL support */ unsigned cant_recv_wakeups:1; /* wakeup requests from downstream aren't received */ unsigned int irq; /* irq allocated */ void __iomem *regs; /* device memory/io */ resource_size_t rsrc_start; /* memory/io resource start */ resource_size_t rsrc_len; /* memory/io resource length */ unsigned power_budget; /* in mA, 0 = no limit */ struct giveback_urb_bh high_prio_bh; struct giveback_urb_bh low_prio_bh; /* bandwidth_mutex should be taken before adding or removing * any new bus bandwidth constraints: * 1. Before adding a configuration for a new device. * 2. Before removing the configuration to put the device into * the addressed state. * 3. Before selecting a different configuration. * 4. Before selecting an alternate interface setting. * * bandwidth_mutex should be dropped after a successful control message * to the device, or resetting the bandwidth after a failed attempt. 
*/ struct mutex *address0_mutex; struct mutex *bandwidth_mutex; struct usb_hcd *shared_hcd; struct usb_hcd *primary_hcd; #define HCD_BUFFER_POOLS 4 struct dma_pool *pool[HCD_BUFFER_POOLS]; int state; # define __ACTIVE 0x01 # define __SUSPEND 0x04 # define __TRANSIENT 0x80 # define HC_STATE_HALT 0 # define HC_STATE_RUNNING (__ACTIVE) # define HC_STATE_QUIESCING (__SUSPEND|__TRANSIENT|__ACTIVE) # define HC_STATE_RESUMING (__SUSPEND|__TRANSIENT) # define HC_STATE_SUSPENDED (__SUSPEND) #define HC_IS_RUNNING(state) ((state) & __ACTIVE) #define HC_IS_SUSPENDED(state) ((state) & __SUSPEND) /* memory pool for HCs having local memory, or %NULL */ struct gen_pool *localmem_pool; /* more shared queuing code would be good; it should support * smarter scheduling, handle transaction translators, etc; * input size of periodic table to an interrupt scheduler. * (ohci 32, uhci 1024, ehci 256/512/1024). */ /* The HC driver's private data is stored at the end of * this structure. */ unsigned long hcd_priv[] __attribute__ ((aligned(sizeof(s64)))); }; /* 2.4 does this a bit differently ... */ static inline struct usb_bus *hcd_to_bus(struct usb_hcd *hcd) { return &hcd->self; } static inline struct usb_hcd *bus_to_hcd(struct usb_bus *bus) { return container_of(bus, struct usb_hcd, self); } /*-------------------------------------------------------------------------*/ struct hc_driver { const char *description; /* "ehci-hcd" etc */ const char *product_desc; /* product/vendor string */ size_t hcd_priv_size; /* size of private data */ /* irq handler */ irqreturn_t (*irq) (struct usb_hcd *hcd); int flags; #define HCD_MEMORY 0x0001 /* HC regs use memory (else I/O) */ #define HCD_DMA 0x0002 /* HC uses DMA */ #define HCD_SHARED 0x0004 /* Two (or more) usb_hcds share HW */ #define HCD_USB11 0x0010 /* USB 1.1 */ #define HCD_USB2 0x0020 /* USB 2.0 */ #define HCD_USB3 0x0040 /* USB 3.0 */ #define HCD_USB31 0x0050 /* USB 3.1 */ #define HCD_USB32 0x0060 /* USB 3.2 */ #define HCD_MASK 0x0070 #define HCD_BH 0x0100 /* URB complete in BH context */ /* called to init HCD and root hub */ int (*reset) (struct usb_hcd *hcd); int (*start) (struct usb_hcd *hcd); /* NOTE: these suspend/resume calls relate to the HC as * a whole, not just the root hub; they're for PCI bus glue. */ /* called after suspending the hub, before entering D3 etc */ int (*pci_suspend)(struct usb_hcd *hcd, bool do_wakeup); /* called after entering D0 (etc), before resuming the hub */ int (*pci_resume)(struct usb_hcd *hcd, pm_message_t state); /* called just before hibernate final D3 state, allows host to poweroff parts */ int (*pci_poweroff_late)(struct usb_hcd *hcd, bool do_wakeup); /* cleanly make HCD stop writing memory and doing I/O */ void (*stop) (struct usb_hcd *hcd); /* shutdown HCD */ void (*shutdown) (struct usb_hcd *hcd); /* return current frame number */ int (*get_frame_number) (struct usb_hcd *hcd); /* manage i/o requests, device state */ int (*urb_enqueue)(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flags); int (*urb_dequeue)(struct usb_hcd *hcd, struct urb *urb, int status); /* * (optional) these hooks allow an HCD to override the default DMA * mapping and unmapping routines. In general, they shouldn't be * necessary unless the host controller has special DMA requirements, * such as alignment constraints. 
If these are not specified, the * general usb_hcd_(un)?map_urb_for_dma functions will be used instead * (and it may be a good idea to call these functions in your HCD * implementation) */ int (*map_urb_for_dma)(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flags); void (*unmap_urb_for_dma)(struct usb_hcd *hcd, struct urb *urb); /* hw synch, freeing endpoint resources that urb_dequeue can't */ void (*endpoint_disable)(struct usb_hcd *hcd, struct usb_host_endpoint *ep); /* (optional) reset any endpoint state such as sequence number and current window */ void (*endpoint_reset)(struct usb_hcd *hcd, struct usb_host_endpoint *ep); /* root hub support */ int (*hub_status_data) (struct usb_hcd *hcd, char *buf); int (*hub_control) (struct usb_hcd *hcd, u16 typeReq, u16 wValue, u16 wIndex, char *buf, u16 wLength); int (*bus_suspend)(struct usb_hcd *); int (*bus_resume)(struct usb_hcd *); int (*start_port_reset)(struct usb_hcd *, unsigned port_num); unsigned long (*get_resuming_ports)(struct usb_hcd *); /* force handover of high-speed port to full-speed companion */ void (*relinquish_port)(struct usb_hcd *, int); /* has a port been handed over to a companion? */ int (*port_handed_over)(struct usb_hcd *, int); /* CLEAR_TT_BUFFER completion callback */ void (*clear_tt_buffer_complete)(struct usb_hcd *, struct usb_host_endpoint *); /* xHCI specific functions */ /* Called by usb_alloc_dev to alloc HC device structures */ int (*alloc_dev)(struct usb_hcd *, struct usb_device *); /* Called by usb_disconnect to free HC device structures */ void (*free_dev)(struct usb_hcd *, struct usb_device *); /* Change a group of bulk endpoints to support multiple stream IDs */ int (*alloc_streams)(struct usb_hcd *hcd, struct usb_device *udev, struct usb_host_endpoint **eps, unsigned int num_eps, unsigned int num_streams, gfp_t mem_flags); /* Reverts a group of bulk endpoints back to not using stream IDs. * Can fail if we run out of memory. */ int (*free_streams)(struct usb_hcd *hcd, struct usb_device *udev, struct usb_host_endpoint **eps, unsigned int num_eps, gfp_t mem_flags); /* Bandwidth computation functions */ /* Note that add_endpoint() can only be called once per endpoint before * check_bandwidth() or reset_bandwidth() must be called. * drop_endpoint() can only be called once per endpoint also. * A call to xhci_drop_endpoint() followed by a call to * xhci_add_endpoint() will add the endpoint to the schedule with * possibly new parameters denoted by a different endpoint descriptor * in usb_host_endpoint. A call to xhci_add_endpoint() followed by a * call to xhci_drop_endpoint() is not allowed. */ /* Allocate endpoint resources and add them to a new schedule */ int (*add_endpoint)(struct usb_hcd *, struct usb_device *, struct usb_host_endpoint *); /* Drop an endpoint from a new schedule */ int (*drop_endpoint)(struct usb_hcd *, struct usb_device *, struct usb_host_endpoint *); /* Check that a new hardware configuration, set using * endpoint_enable and endpoint_disable, does not exceed bus * bandwidth. This must be called before any set configuration * or set interface requests are sent to the device. */ int (*check_bandwidth)(struct usb_hcd *, struct usb_device *); /* Reset the device schedule to the last known good schedule, * which was set from a previous successful call to * check_bandwidth(). This reverts any add_endpoint() and * drop_endpoint() calls since that last successful call. * Used for when a check_bandwidth() call fails due to resource * or bandwidth constraints. 
*/ void (*reset_bandwidth)(struct usb_hcd *, struct usb_device *); /* Set the hardware-chosen device address */ int (*address_device)(struct usb_hcd *, struct usb_device *udev, unsigned int timeout_ms); /* prepares the hardware to send commands to the device */ int (*enable_device)(struct usb_hcd *, struct usb_device *udev); /* Notifies the HCD after a hub descriptor is fetched. * Will block. */ int (*update_hub_device)(struct usb_hcd *, struct usb_device *hdev, struct usb_tt *tt, gfp_t mem_flags); int (*reset_device)(struct usb_hcd *, struct usb_device *); /* Notifies the HCD after a device is connected and its * address is set */ int (*update_device)(struct usb_hcd *, struct usb_device *); int (*set_usb2_hw_lpm)(struct usb_hcd *, struct usb_device *, int); /* USB 3.0 Link Power Management */ /* Returns the USB3 hub-encoded value for the U1/U2 timeout. */ int (*enable_usb3_lpm_timeout)(struct usb_hcd *, struct usb_device *, enum usb3_link_state state); /* The xHCI host controller can still fail the command to * disable the LPM timeouts, so this can return an error code. */ int (*disable_usb3_lpm_timeout)(struct usb_hcd *, struct usb_device *, enum usb3_link_state state); int (*find_raw_port_number)(struct usb_hcd *, int); /* Call for power on/off the port if necessary */ int (*port_power)(struct usb_hcd *hcd, int portnum, bool enable); /* Call for SINGLE_STEP_SET_FEATURE Test for USB2 EH certification */ #define EHSET_TEST_SINGLE_STEP_SET_FEATURE 0x06 int (*submit_single_step_set_feature)(struct usb_hcd *, struct urb *, int); }; static inline int hcd_giveback_urb_in_bh(struct usb_hcd *hcd) { return hcd->driver->flags & HCD_BH; } static inline bool hcd_periodic_completion_in_progress(struct usb_hcd *hcd, struct usb_host_endpoint *ep) { return hcd->high_prio_bh.completing_ep == ep; } static inline bool hcd_uses_dma(struct usb_hcd *hcd) { return IS_ENABLED(CONFIG_HAS_DMA) && (hcd->driver->flags & HCD_DMA); } extern int usb_hcd_link_urb_to_ep(struct usb_hcd *hcd, struct urb *urb); extern int usb_hcd_check_unlink_urb(struct usb_hcd *hcd, struct urb *urb, int status); extern void usb_hcd_unlink_urb_from_ep(struct usb_hcd *hcd, struct urb *urb); extern int usb_hcd_submit_urb(struct urb *urb, gfp_t mem_flags); extern int usb_hcd_unlink_urb(struct urb *urb, int status); extern void usb_hcd_giveback_urb(struct usb_hcd *hcd, struct urb *urb, int status); extern int usb_hcd_map_urb_for_dma(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flags); extern void usb_hcd_unmap_urb_setup_for_dma(struct usb_hcd *, struct urb *); extern void usb_hcd_unmap_urb_for_dma(struct usb_hcd *, struct urb *); extern void usb_hcd_flush_endpoint(struct usb_device *udev, struct usb_host_endpoint *ep); extern void usb_hcd_disable_endpoint(struct usb_device *udev, struct usb_host_endpoint *ep); extern void usb_hcd_reset_endpoint(struct usb_device *udev, struct usb_host_endpoint *ep); extern void usb_hcd_synchronize_unlinks(struct usb_device *udev); extern int usb_hcd_alloc_bandwidth(struct usb_device *udev, struct usb_host_config *new_config, struct usb_host_interface *old_alt, struct usb_host_interface *new_alt); extern int usb_hcd_get_frame_number(struct usb_device *udev); struct usb_hcd *__usb_create_hcd(const struct hc_driver *driver, struct device *sysdev, struct device *dev, const char *bus_name, struct usb_hcd *primary_hcd); extern struct usb_hcd *usb_create_hcd(const struct hc_driver *driver, struct device *dev, const char *bus_name); extern struct usb_hcd *usb_create_shared_hcd(const struct hc_driver *driver, struct 
device *dev, const char *bus_name, struct usb_hcd *shared_hcd); extern struct usb_hcd *usb_get_hcd(struct usb_hcd *hcd); extern void usb_put_hcd(struct usb_hcd *hcd); extern int usb_hcd_is_primary_hcd(struct usb_hcd *hcd); extern int usb_add_hcd(struct usb_hcd *hcd, unsigned int irqnum, unsigned long irqflags); extern void usb_remove_hcd(struct usb_hcd *hcd); extern int usb_hcd_find_raw_port_number(struct usb_hcd *hcd, int port1); int usb_hcd_setup_local_mem(struct usb_hcd *hcd, phys_addr_t phys_addr, dma_addr_t dma, size_t size); struct platform_device; extern void usb_hcd_platform_shutdown(struct platform_device *dev); #ifdef CONFIG_USB_HCD_TEST_MODE extern int ehset_single_step_set_feature(struct usb_hcd *hcd, int port); #else static inline int ehset_single_step_set_feature(struct usb_hcd *hcd, int port) { return 0; } #endif /* CONFIG_USB_HCD_TEST_MODE */ #ifdef CONFIG_USB_PCI struct pci_dev; struct pci_device_id; extern int usb_hcd_pci_probe(struct pci_dev *dev, const struct hc_driver *driver); extern void usb_hcd_pci_remove(struct pci_dev *dev); extern void usb_hcd_pci_shutdown(struct pci_dev *dev); #ifdef CONFIG_USB_PCI_AMD extern int usb_hcd_amd_remote_wakeup_quirk(struct pci_dev *dev); static inline bool usb_hcd_amd_resume_bug(struct pci_dev *dev, const struct hc_driver *driver) { if (!usb_hcd_amd_remote_wakeup_quirk(dev)) return false; if (driver->flags & (HCD_USB11 | HCD_USB3)) return true; return false; } #else /* CONFIG_USB_PCI_AMD */ static inline bool usb_hcd_amd_resume_bug(struct pci_dev *dev, const struct hc_driver *driver) { return false; } #endif extern const struct dev_pm_ops usb_hcd_pci_pm_ops; #endif /* CONFIG_USB_PCI */ /* pci-ish (pdev null is ok) buffer alloc/mapping support */ void usb_init_pool_max(void); int hcd_buffer_create(struct usb_hcd *hcd); void hcd_buffer_destroy(struct usb_hcd *hcd); void *hcd_buffer_alloc(struct usb_bus *bus, size_t size, gfp_t mem_flags, dma_addr_t *dma); void hcd_buffer_free(struct usb_bus *bus, size_t size, void *addr, dma_addr_t dma); void *hcd_buffer_alloc_pages(struct usb_hcd *hcd, size_t size, gfp_t mem_flags, dma_addr_t *dma); void hcd_buffer_free_pages(struct usb_hcd *hcd, size_t size, void *addr, dma_addr_t dma); /* generic bus glue, needed for host controllers that don't use PCI */ extern irqreturn_t usb_hcd_irq(int irq, void *__hcd); extern void usb_hc_died(struct usb_hcd *hcd); extern void usb_hcd_poll_rh_status(struct usb_hcd *hcd); extern void usb_wakeup_notification(struct usb_device *hdev, unsigned int portnum); extern void usb_hcd_start_port_resume(struct usb_bus *bus, int portnum); extern void usb_hcd_end_port_resume(struct usb_bus *bus, int portnum); /* The D0/D1 toggle bits ... 
USE WITH CAUTION (they're almost hcd-internal) */ #define usb_gettoggle(dev, ep, out) (((dev)->toggle[out] >> (ep)) & 1) #define usb_dotoggle(dev, ep, out) ((dev)->toggle[out] ^= (1 << (ep))) #define usb_settoggle(dev, ep, out, bit) \ ((dev)->toggle[out] = ((dev)->toggle[out] & ~(1 << (ep))) | \ ((bit) << (ep))) /* -------------------------------------------------------------------------- */ /* Enumeration is only for the hub driver, or HCD virtual root hubs */ extern struct usb_device *usb_alloc_dev(struct usb_device *parent, struct usb_bus *, unsigned port); extern int usb_new_device(struct usb_device *dev); extern void usb_disconnect(struct usb_device **); extern int usb_get_configuration(struct usb_device *dev); extern void usb_destroy_configuration(struct usb_device *dev); /*-------------------------------------------------------------------------*/ /* * HCD Root Hub support */ #include <linux/usb/ch11.h> /* * As of USB 2.0, full/low speed devices are segregated into trees. * One type grows from USB 1.1 host controllers (OHCI, UHCI etc). * The other type grows from high speed hubs when they connect to * full/low speed devices using "Transaction Translators" (TTs). * * TTs should only be known to the hub driver, and high speed bus * drivers (only EHCI for now). They affect periodic scheduling and * sometimes control/bulk error recovery. */ struct usb_device; struct usb_tt { struct usb_device *hub; /* upstream highspeed hub */ int multi; /* true means one TT per port */ unsigned think_time; /* think time in ns */ void *hcpriv; /* HCD private data */ /* for control/bulk error recovery (CLEAR_TT_BUFFER) */ spinlock_t lock; struct list_head clear_list; /* of usb_tt_clear */ struct work_struct clear_work; }; struct usb_tt_clear { struct list_head clear_list; unsigned tt; u16 devinfo; struct usb_hcd *hcd; struct usb_host_endpoint *ep; }; extern int usb_hub_clear_tt_buffer(struct urb *urb); extern void usb_ep0_reinit(struct usb_device *); /* (shifted) direction/type/recipient from the USB 2.0 spec, table 9.2 */ #define DeviceRequest \ ((USB_DIR_IN|USB_TYPE_STANDARD|USB_RECIP_DEVICE)<<8) #define DeviceOutRequest \ ((USB_DIR_OUT|USB_TYPE_STANDARD|USB_RECIP_DEVICE)<<8) #define InterfaceRequest \ ((USB_DIR_IN|USB_TYPE_STANDARD|USB_RECIP_INTERFACE)<<8) #define EndpointRequest \ ((USB_DIR_IN|USB_TYPE_STANDARD|USB_RECIP_ENDPOINT)<<8) #define EndpointOutRequest \ ((USB_DIR_OUT|USB_TYPE_STANDARD|USB_RECIP_ENDPOINT)<<8) /* class requests from the USB 2.0 hub spec, table 11-15 */ #define HUB_CLASS_REQ(dir, type, request) ((((dir) | (type)) << 8) | (request)) /* GetBusState and SetHubDescriptor are optional, omitted */ #define ClearHubFeature HUB_CLASS_REQ(USB_DIR_OUT, USB_RT_HUB, USB_REQ_CLEAR_FEATURE) #define ClearPortFeature HUB_CLASS_REQ(USB_DIR_OUT, USB_RT_PORT, USB_REQ_CLEAR_FEATURE) #define GetHubDescriptor HUB_CLASS_REQ(USB_DIR_IN, USB_RT_HUB, USB_REQ_GET_DESCRIPTOR) #define GetHubStatus HUB_CLASS_REQ(USB_DIR_IN, USB_RT_HUB, USB_REQ_GET_STATUS) #define GetPortStatus HUB_CLASS_REQ(USB_DIR_IN, USB_RT_PORT, USB_REQ_GET_STATUS) #define SetHubFeature HUB_CLASS_REQ(USB_DIR_OUT, USB_RT_HUB, USB_REQ_SET_FEATURE) #define SetPortFeature HUB_CLASS_REQ(USB_DIR_OUT, USB_RT_PORT, USB_REQ_SET_FEATURE) #define ClearTTBuffer HUB_CLASS_REQ(USB_DIR_OUT, USB_RT_PORT, HUB_CLEAR_TT_BUFFER) #define ResetTT HUB_CLASS_REQ(USB_DIR_OUT, USB_RT_PORT, HUB_RESET_TT) #define GetTTState HUB_CLASS_REQ(USB_DIR_IN, USB_RT_PORT, HUB_GET_TT_STATE) #define StopTT HUB_CLASS_REQ(USB_DIR_OUT, USB_RT_PORT, HUB_STOP_TT) 
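/*
 * Illustrative sketch, not part of this header: the HUB_CLASS_REQ() macros
 * above collapse bmRequestType and bRequest into the 16-bit typeReq value
 * that the hub driver passes to hc_driver::hub_control().  A root-hub
 * implementation usually just switches on those values; my_read_portsc()
 * and my_port_feature() are made-up helpers standing in for real register
 * accessors.
 */
static int my_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue,
			  u16 wIndex, char *buf, u16 wLength)
{
	switch (typeReq) {
	case GetHubStatus:		/* 0xa000: 4 bytes of hub status/change */
		memset(buf, 0, 4);
		return 0;
	case GetPortStatus:		/* 0xa300: wIndex is the 1-based port */
		put_unaligned_le32(my_read_portsc(hcd, wIndex), buf);
		return 0;
	case SetPortFeature:		/* 0x2303: wValue names the feature */
	case ClearPortFeature:		/* 0x2301 */
		return my_port_feature(hcd, typeReq, wValue, wIndex);
	default:
		return -EPIPE;		/* unsupported request: stall ep0 */
	}
}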
/*-------------------------------------------------------------------------*/ /* class requests from USB 3.1 hub spec, table 10-7 */ #define SetHubDepth HUB_CLASS_REQ(USB_DIR_OUT, USB_RT_HUB, HUB_SET_DEPTH) #define GetPortErrorCount HUB_CLASS_REQ(USB_DIR_IN, USB_RT_PORT, HUB_GET_PORT_ERR_COUNT) /* * Generic bandwidth allocation constants/support */ #define FRAME_TIME_USECS 1000L #define BitTime(bytecount) (7 * 8 * bytecount / 6) /* with integer truncation */ /* Trying not to use worst-case bit-stuffing * of (7/6 * 8 * bytecount) = 9.33 * bytecount */ /* bytecount = data payload byte count */ #define NS_TO_US(ns) DIV_ROUND_UP(ns, 1000L) /* convert nanoseconds to microseconds, rounding up */ /* * Full/low speed bandwidth allocation constants/support. */ #define BW_HOST_DELAY 1000L /* nanoseconds */ #define BW_HUB_LS_SETUP 333L /* nanoseconds */ /* 4 full-speed bit times (est.) */ #define FRAME_TIME_BITS 12000L /* frame = 1 millisecond */ #define FRAME_TIME_MAX_BITS_ALLOC (90L * FRAME_TIME_BITS / 100L) #define FRAME_TIME_MAX_USECS_ALLOC (90L * FRAME_TIME_USECS / 100L) /* * Ceiling [nano/micro]seconds (typical) for that many bytes at high speed * ISO is a bit less, no ACK ... from USB 2.0 spec, 5.11.3 (and needed * to preallocate bandwidth) */ #define USB2_HOST_DELAY 5 /* nsec, guess */ #define HS_NSECS(bytes) (((55 * 8 * 2083) \ + (2083UL * (3 + BitTime(bytes))))/1000 \ + USB2_HOST_DELAY) #define HS_NSECS_ISO(bytes) (((38 * 8 * 2083) \ + (2083UL * (3 + BitTime(bytes))))/1000 \ + USB2_HOST_DELAY) #define HS_USECS(bytes) NS_TO_US(HS_NSECS(bytes)) #define HS_USECS_ISO(bytes) NS_TO_US(HS_NSECS_ISO(bytes)) extern long usb_calc_bus_time(int speed, int is_input, int isoc, int bytecount); /*-------------------------------------------------------------------------*/ extern void usb_set_device_state(struct usb_device *udev, enum usb_device_state new_state); /*-------------------------------------------------------------------------*/ /* exported only within usbcore */ extern struct idr usb_bus_idr; extern struct mutex usb_bus_idr_lock; extern wait_queue_head_t usb_kill_urb_queue; #define usb_endpoint_out(ep_dir) (!((ep_dir) & USB_DIR_IN)) #ifdef CONFIG_PM extern unsigned usb_wakeup_enabled_descendants(struct usb_device *udev); extern void usb_root_hub_lost_power(struct usb_device *rhdev); extern int hcd_bus_suspend(struct usb_device *rhdev, pm_message_t msg); extern int hcd_bus_resume(struct usb_device *rhdev, pm_message_t msg); extern void usb_hcd_resume_root_hub(struct usb_hcd *hcd); #else static inline unsigned usb_wakeup_enabled_descendants(struct usb_device *udev) { return 0; } static inline void usb_hcd_resume_root_hub(struct usb_hcd *hcd) { return; } #endif /* CONFIG_PM */ /*-------------------------------------------------------------------------*/ #if defined(CONFIG_USB_MON) || defined(CONFIG_USB_MON_MODULE) struct usb_mon_operations { void (*urb_submit)(struct usb_bus *bus, struct urb *urb); void (*urb_submit_error)(struct usb_bus *bus, struct urb *urb, int err); void (*urb_complete)(struct usb_bus *bus, struct urb *urb, int status); /* void (*urb_unlink)(struct usb_bus *bus, struct urb *urb); */ }; extern const struct usb_mon_operations *mon_ops; static inline void usbmon_urb_submit(struct usb_bus *bus, struct urb *urb) { if (bus->monitored) (*mon_ops->urb_submit)(bus, urb); } static inline void usbmon_urb_submit_error(struct usb_bus *bus, struct urb *urb, int error) { if (bus->monitored) (*mon_ops->urb_submit_error)(bus, urb, error); } static inline void usbmon_urb_complete(struct usb_bus 
*bus, struct urb *urb, int status) { if (bus->monitored) (*mon_ops->urb_complete)(bus, urb, status); } int usb_mon_register(const struct usb_mon_operations *ops); void usb_mon_deregister(void); #else static inline void usbmon_urb_submit(struct usb_bus *bus, struct urb *urb) {} static inline void usbmon_urb_submit_error(struct usb_bus *bus, struct urb *urb, int error) {} static inline void usbmon_urb_complete(struct usb_bus *bus, struct urb *urb, int status) {} #endif /* CONFIG_USB_MON || CONFIG_USB_MON_MODULE */ /*-------------------------------------------------------------------------*/ /* random stuff */ /* This rwsem is for use only by the hub driver and ehci-hcd. * Nobody else should touch it. */ extern struct rw_semaphore ehci_cf_port_reset_rwsem; #endif /* __KERNEL__ */ #endif /* __USB_CORE_HCD_H */ |
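Taken together, the declarations above define the registration flow for a host controller driver. The following is a hedged sketch, not part of the header: a platform-bus HCD typically fills in a struct hc_driver with its hooks, creates the hcd with usb_create_hcd(), and registers it with usb_add_hcd(). Every foo_* identifier and struct foo_hcd_priv below is a hypothetical placeholder for driver-specific code.

static const struct hc_driver foo_hc_driver = {
	.description		= "foo-hcd",
	.product_desc		= "Foo Host Controller",
	.hcd_priv_size		= sizeof(struct foo_hcd_priv),	/* hypothetical private state */
	.flags			= HCD_MEMORY | HCD_DMA | HCD_USB2,
	.irq			= foo_irq,
	.reset			= foo_setup,		/* init the HC and its root hub */
	.start			= foo_run,
	.stop			= foo_stop,
	.shutdown		= foo_shutdown,
	.urb_enqueue		= foo_urb_enqueue,
	.urb_dequeue		= foo_urb_dequeue,
	.hub_status_data	= foo_hub_status_data,
	.hub_control		= foo_hub_control,
};

static int foo_probe(struct platform_device *pdev)
{
	struct usb_hcd *hcd;
	int irq, ret;

	irq = platform_get_irq(pdev, 0);
	if (irq < 0)
		return irq;

	hcd = usb_create_hcd(&foo_hc_driver, &pdev->dev, dev_name(&pdev->dev));
	if (!hcd)
		return -ENOMEM;

	/* A real driver maps its registers and fills hcd->regs,
	 * hcd->rsrc_start and hcd->rsrc_len before registering. */
	ret = usb_add_hcd(hcd, irq, IRQF_SHARED);
	if (ret)
		usb_put_hcd(hcd);
	return ret;
}

usb_add_hcd() allocates and registers the root hub and starts root-hub polling, so by the time a probe routine like this returns the bus is live; the usual removal path is usb_remove_hcd() followed by usb_put_hcd().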
| 28 | /* * Performance events: * * Copyright (C) 2008-2009, Linutronix GmbH, Thomas Gleixner <tglx@kernel.org> * Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra * * Data type definitions, declarations, prototypes.
* * Started by: Thomas Gleixner and Ingo Molnar * * For licencing details see kernel-base/COPYING */ #ifndef _LINUX_PERF_EVENT_H #define _LINUX_PERF_EVENT_H #include <uapi/linux/perf_event.h> #include <uapi/linux/bpf_perf_event.h> /* * Kernel-internal data types and definitions: */ #ifdef CONFIG_PERF_EVENTS # include <asm/perf_event.h> # include <asm/local64.h> #endif #ifdef CONFIG_HAVE_HW_BREAKPOINT # include <linux/rhashtable-types.h> # include <asm/hw_breakpoint.h> #endif #include <linux/list.h> #include <linux/mutex.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/spinlock.h> #include <linux/hrtimer.h> #include <linux/fs.h> #include <linux/pid_namespace.h> #include <linux/workqueue.h> #include <linux/ftrace.h> #include <linux/cpu.h> #include <linux/irq_work.h> #include <linux/static_key.h> #include <linux/jump_label_ratelimit.h> #include <linux/atomic.h> #include <linux/sysfs.h> #include <linux/perf_regs.h> #include <linux/cgroup.h> #include <linux/refcount.h> #include <linux/security.h> #include <linux/static_call.h> #include <linux/lockdep.h> #include <asm/local.h> struct perf_callchain_entry { u64 nr; u64 ip[]; /* /proc/sys/kernel/perf_event_max_stack */ }; struct perf_callchain_entry_ctx { struct perf_callchain_entry *entry; u32 max_stack; u32 nr; short contexts; bool contexts_maxed; }; typedef unsigned long (*perf_copy_f)(void *dst, const void *src, unsigned long off, unsigned long len); struct perf_raw_frag { union { struct perf_raw_frag *next; unsigned long pad; }; perf_copy_f copy; void *data; u32 size; } __packed; struct perf_raw_record { struct perf_raw_frag frag; u32 size; }; static __always_inline bool perf_raw_frag_last(const struct perf_raw_frag *frag) { return frag->pad < sizeof(u64); } /* * branch stack layout: * nr: number of taken branches stored in entries[] * hw_idx: The low level index of raw branch records * for the most recent branch. * -1ULL means invalid/unknown. * * Note that nr can vary from sample to sample * branches (to, from) are stored from most recent * to least recent, i.e., entries[0] contains the most * recent branch. * The entries[] is an abstraction of raw branch records, * which may not be stored in age order in HW, e.g. Intel LBR. * The hw_idx is to expose the low level index of raw * branch record for the most recent branch aka entries[0]. * The hw_idx index is between -1 (unknown) and max depth, * which can be retrieved in /sys/devices/cpu/caps/branches. * For the architectures whose raw branch records are * already stored in age order, the hw_idx should be 0. */ struct perf_branch_stack { u64 nr; u64 hw_idx; struct perf_branch_entry entries[]; }; struct task_struct; /* * extra PMU register associated with an event */ struct hw_perf_event_extra { u64 config; /* register value */ unsigned int reg; /* register address or index */ int alloc; /* extra register already allocated */ int idx; /* index in shared_regs->regs[] */ }; /** * hw_perf_event::flag values * * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific * usage. 
*/ #define PERF_EVENT_FLAG_ARCH 0x0fffffff #define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0); /** * struct hw_perf_event - performance event hardware details: */ struct hw_perf_event { #ifdef CONFIG_PERF_EVENTS union { struct { /* hardware */ u64 config; u64 config1; u64 last_tag; u64 dyn_constraint; unsigned long config_base; unsigned long event_base; int event_base_rdpmc; int idx; int last_cpu; int flags; struct hw_perf_event_extra extra_reg; struct hw_perf_event_extra branch_reg; }; struct { /* aux / Intel-PT */ u64 aux_config; /* * For AUX area events, aux_paused cannot be a state * flag because it can be updated asynchronously to * state. */ unsigned int aux_paused; }; struct { /* software */ struct hrtimer hrtimer; }; struct { /* tracepoint */ /* for tp_event->class */ struct list_head tp_list; }; struct { /* amd_power */ u64 pwr_acc; u64 ptsc; }; #ifdef CONFIG_HAVE_HW_BREAKPOINT struct { /* breakpoint */ /* * Crufty hack to avoid the chicken and egg * problem hw_breakpoint has with context * creation and event initalization. */ struct arch_hw_breakpoint info; struct rhlist_head bp_list; }; #endif struct { /* amd_iommu */ u8 iommu_bank; u8 iommu_cntr; u16 padding; u64 conf; u64 conf1; }; }; /* * If the event is a per task event, this will point to the task in * question. See the comment in perf_event_alloc(). */ struct task_struct *target; /* * PMU would store hardware filter configuration * here. */ void *addr_filters; /* Last sync'ed generation of filters */ unsigned long addr_filters_gen; /* * hw_perf_event::state flags; used to track the PERF_EF_* state. */ /* the counter is stopped */ #define PERF_HES_STOPPED 0x01 /* event->count up-to-date */ #define PERF_HES_UPTODATE 0x02 #define PERF_HES_ARCH 0x04 int state; /* * The last observed hardware counter value, updated with a * local64_cmpxchg() such that pmu::read() can be called nested. */ local64_t prev_count; /* * The period to start the next sample with. */ u64 sample_period; union { struct { /* Sampling */ /* * The period we started this sample with. */ u64 last_period; /* * However much is left of the current period; * note that this is a full 64bit value and * allows for generation of periods longer * than hardware might allow. */ local64_t period_left; }; struct { /* Topdown events counting for context switch */ u64 saved_metric; u64 saved_slots; }; }; /* * State for throttling the event, see __perf_event_overflow() and * perf_adjust_freq_unthr_context(). */ u64 interrupts_seq; u64 interrupts; /* * State for freq target events, see __perf_event_overflow() and * perf_adjust_freq_unthr_context(). 
*/ u64 freq_time_stamp; u64 freq_count_stamp; #endif /* CONFIG_PERF_EVENTS */ }; struct perf_event; struct perf_event_pmu_context; /* * Common implementation detail of pmu::{start,commit,cancel}_txn */ /* txn to add/schedule event on PMU */ #define PERF_PMU_TXN_ADD 0x1 /* txn to read event group from PMU */ #define PERF_PMU_TXN_READ 0x2 /** * pmu::capabilities flags */ #define PERF_PMU_CAP_NO_INTERRUPT 0x0001 #define PERF_PMU_CAP_NO_NMI 0x0002 #define PERF_PMU_CAP_AUX_NO_SG 0x0004 #define PERF_PMU_CAP_EXTENDED_REGS 0x0008 #define PERF_PMU_CAP_EXCLUSIVE 0x0010 #define PERF_PMU_CAP_ITRACE 0x0020 #define PERF_PMU_CAP_NO_EXCLUDE 0x0040 #define PERF_PMU_CAP_AUX_OUTPUT 0x0080 #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 #define PERF_PMU_CAP_AUX_PAUSE 0x0200 #define PERF_PMU_CAP_AUX_PREFER_LARGE 0x0400 #define PERF_PMU_CAP_MEDIATED_VPMU 0x0800 /** * pmu::scope */ enum perf_pmu_scope { PERF_PMU_SCOPE_NONE = 0, PERF_PMU_SCOPE_CORE, PERF_PMU_SCOPE_DIE, PERF_PMU_SCOPE_CLUSTER, PERF_PMU_SCOPE_PKG, PERF_PMU_SCOPE_SYS_WIDE, PERF_PMU_MAX_SCOPE, }; struct perf_output_handle; #define PMU_NULL_DEV ((void *)(~0UL)) /** * struct pmu - generic performance monitoring unit */ struct pmu { struct list_head entry; spinlock_t events_lock; struct list_head events; struct module *module; struct device *dev; struct device *parent; const struct attribute_group **attr_groups; const struct attribute_group **attr_update; const char *name; int type; /* * various common per-pmu feature flags */ int capabilities; /* * PMU scope */ unsigned int scope; struct perf_cpu_pmu_context * __percpu *cpu_pmu_context; atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ int task_ctx_nr; int hrtimer_interval_ms; /* number of address filters this PMU can do */ unsigned int nr_addr_filters; /* * Fully disable/enable this PMU, can be used to protect from the PMI * as well as for lazy/batch writing of the MSRs. */ void (*pmu_enable) (struct pmu *pmu); /* optional */ void (*pmu_disable) (struct pmu *pmu); /* optional */ /* * Try and initialize the event for this PMU. * * Returns: * -ENOENT -- @event is not for this PMU * * -ENODEV -- @event is for this PMU but PMU not present * -EBUSY -- @event is for this PMU but PMU temporarily unavailable * -EINVAL -- @event is for this PMU but @event is not valid * -EOPNOTSUPP -- @event is for this PMU, @event is valid, but not supported * -EACCES -- @event is for this PMU, @event is valid, but no privileges * * 0 -- @event is for this PMU and valid * * Other error return values are allowed. */ int (*event_init) (struct perf_event *event); /* * Notification that the event was mapped or unmapped. Called * in the context of the mapping task. */ void (*event_mapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ void (*event_unmapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ /* * Flags for ->add()/->del()/ ->start()/->stop(). There are * matching hw_perf_event::state flags. */ /* start the counter when adding */ #define PERF_EF_START 0x01 /* reload the counter when starting */ #define PERF_EF_RELOAD 0x02 /* update the counter when stopping */ #define PERF_EF_UPDATE 0x04 /* AUX area event, pause tracing */ #define PERF_EF_PAUSE 0x08 /* AUX area event, resume tracing */ #define PERF_EF_RESUME 0x10 /* * Adds/Removes a counter to/from the PMU, can be done inside a * transaction, see the ->*_txn() methods. * * The add/del callbacks will reserve all hardware resources required * to service the event, this includes any counter constraint * scheduling etc. 
* * Called with IRQs disabled and the PMU disabled on the CPU the event * is on. * * ->add() called without PERF_EF_START should result in the same state * as ->add() followed by ->stop(). * * ->del() must always PERF_EF_UPDATE stop an event. If it calls * ->stop() that must deal with already being stopped without * PERF_EF_UPDATE. */ int (*add) (struct perf_event *event, int flags); void (*del) (struct perf_event *event, int flags); /* * Starts/Stops a counter present on the PMU. * * The PMI handler should stop the counter when perf_event_overflow() * returns !0. ->start() will be used to continue. * * Also used to change the sample period. * * Called with IRQs disabled and the PMU disabled on the CPU the event * is on -- will be called from NMI context with the PMU generates * NMIs. * * ->stop() with PERF_EF_UPDATE will read the counter and update * period/count values like ->read() would. * * ->start() with PERF_EF_RELOAD will reprogram the counter * value, must be preceded by a ->stop() with PERF_EF_UPDATE. * * ->stop() with PERF_EF_PAUSE will stop as simply as possible. Will not * overlap another ->stop() with PERF_EF_PAUSE nor ->start() with * PERF_EF_RESUME. * * ->start() with PERF_EF_RESUME will start as simply as possible but * only if the counter is not otherwise stopped. Will not overlap * another ->start() with PERF_EF_RESUME nor ->stop() with * PERF_EF_PAUSE. * * Notably, PERF_EF_PAUSE/PERF_EF_RESUME *can* be concurrent with other * ->stop()/->start() invocations, just not itself. */ void (*start) (struct perf_event *event, int flags); void (*stop) (struct perf_event *event, int flags); /* * Updates the counter value of the event. * * For sampling capable PMUs this will also update the software period * hw_perf_event::period_left field. */ void (*read) (struct perf_event *event); /* * Group events scheduling is treated as a transaction, add * group events as a whole and perform one schedulability test. * If the test fails, roll back the whole group * * Start the transaction, after this ->add() doesn't need to * do schedulability tests. * * Optional. */ void (*start_txn) (struct pmu *pmu, unsigned int txn_flags); /* * If ->start_txn() disabled the ->add() schedulability test * then ->commit_txn() is required to perform one. On success * the transaction is closed. On error the transaction is kept * open until ->cancel_txn() is called. * * Optional. */ int (*commit_txn) (struct pmu *pmu); /* * Will cancel the transaction, assumes ->del() is called * for each successful ->add() during the transaction. * * Optional. */ void (*cancel_txn) (struct pmu *pmu); /* * Will return the value for perf_event_mmap_page::index for this event, * if no implementation is provided it will default to 0 (see * perf_event_idx_default). */ int (*event_idx) (struct perf_event *event); /*optional */ /* * context-switches callback */ void (*sched_task) (struct perf_event_pmu_context *pmu_ctx, struct task_struct *task, bool sched_in); /* * Kmem cache of PMU specific data */ struct kmem_cache *task_ctx_cache; /* * Set up pmu-private data structures for an AUX area */ void *(*setup_aux) (struct perf_event *event, void **pages, int nr_pages, bool overwrite); /* optional */ /* * Free pmu-private AUX data structures */ void (*free_aux) (void *aux); /* optional */ /* * Take a snapshot of the AUX buffer without touching the event * state, so that preempting ->start()/->stop() callbacks does * not interfere with their logic. Called in PMI context. * * Returns the size of AUX data copied to the output handle. 
* * Optional. */ long (*snapshot_aux) (struct perf_event *event, struct perf_output_handle *handle, unsigned long size); /* * Validate address range filters: make sure the HW supports the * requested configuration and number of filters; return 0 if the * supplied filters are valid, -errno otherwise. * * Runs in the context of the ioctl()ing process and is not serialized * with the rest of the PMU callbacks. */ int (*addr_filters_validate) (struct list_head *filters); /* optional */ /* * Synchronize address range filter configuration: * translate hw-agnostic filters into hardware configuration in * event::hw::addr_filters. * * Runs as a part of filter sync sequence that is done in ->start() * callback by calling perf_event_addr_filters_sync(). * * May (and should) traverse event::addr_filters::list, for which its * caller provides necessary serialization. */ void (*addr_filters_sync) (struct perf_event *event); /* optional */ /* * Check if event can be used for aux_output purposes for * events of this PMU. * * Runs from perf_event_open(). Should return 0 for "no match" * or non-zero for "match". */ int (*aux_output_match) (struct perf_event *event); /* optional */ /* * Skip programming this PMU on the given CPU. Typically needed for * big.LITTLE things. */ bool (*filter) (struct pmu *pmu, int cpu); /* optional */ /* * Check period value for PERF_EVENT_IOC_PERIOD ioctl. */ int (*check_period) (struct perf_event *event, u64 value); /* optional */ }; enum perf_addr_filter_action_t { PERF_ADDR_FILTER_ACTION_STOP = 0, PERF_ADDR_FILTER_ACTION_START, PERF_ADDR_FILTER_ACTION_FILTER, }; /** * struct perf_addr_filter - address range filter definition * @entry: event's filter list linkage * @path: object file's path for file-based filters * @offset: filter range offset * @size: filter range size (size==0 means single address trigger) * @action: filter/start/stop * * This is a hardware-agnostic filter configuration as specified by the user. */ struct perf_addr_filter { struct list_head entry; struct path path; unsigned long offset; unsigned long size; enum perf_addr_filter_action_t action; }; /** * struct perf_addr_filters_head - container for address range filters * @list: list of filters for this event * @lock: spinlock that serializes accesses to the @list and event's * (and its children's) filter generations. * @nr_file_filters: number of file-based filters * * A child event will use parent's @list (and therefore @lock), so they are * bundled together; see perf_event_addr_filters(). */ struct perf_addr_filters_head { struct list_head list; raw_spinlock_t lock; unsigned int nr_file_filters; }; struct perf_addr_filter_range { unsigned long start; unsigned long size; }; /* * The normal states are: * * ACTIVE --. * ^ | * | | * sched_{in,out}() | * | | * v | * ,---> INACTIVE --+ <-. * | | | * | {dis,en}able() * sched_in() | | * | OFF <--' --+ * | | * `---> ERROR ------' * * That is: * * sched_in: INACTIVE -> {ACTIVE,ERROR} * sched_out: ACTIVE -> INACTIVE * disable: {ACTIVE,INACTIVE} -> OFF * enable: {OFF,ERROR} -> INACTIVE * * Where {OFF,ERROR} are disabled states. * * Then we have the {EXIT,REVOKED,DEAD} states which are various shades of * defunct events: * * - EXIT means task that the even was assigned to died, but child events * still live, and further children can still be created. But the event * itself will never be active again. 
It can only transition to * {REVOKED,DEAD}; * * - REVOKED means the PMU the event was associated with is gone; all * functionality is stopped but the event is still alive. Can only * transition to DEAD; * * - DEAD event really is DYING tearing down state and freeing bits. * */ enum perf_event_state { PERF_EVENT_STATE_DEAD = -5, PERF_EVENT_STATE_REVOKED = -4, /* pmu gone, must not touch */ PERF_EVENT_STATE_EXIT = -3, /* task died, still inherit */ PERF_EVENT_STATE_ERROR = -2, /* scheduling error, can enable */ PERF_EVENT_STATE_OFF = -1, PERF_EVENT_STATE_INACTIVE = 0, PERF_EVENT_STATE_ACTIVE = 1, }; struct file; struct perf_sample_data; typedef void (*perf_overflow_handler_t)(struct perf_event *, struct perf_sample_data *, struct pt_regs *regs); /* * Event capabilities. For event_caps and groups caps. * * PERF_EV_CAP_SOFTWARE: Is a software event. * PERF_EV_CAP_READ_ACTIVE_PKG: A CPU event (or cgroup event) that can be read * from any CPU in the package where it is active. * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and * cannot be a group leader. If an event with this flag is detached from the * group it is scheduled out and moved into an unrecoverable ERROR state. * PERF_EV_CAP_READ_SCOPE: A CPU event that can be read from any CPU of the * PMU scope where it is active. */ #define PERF_EV_CAP_SOFTWARE BIT(0) #define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1) #define PERF_EV_CAP_SIBLING BIT(2) #define PERF_EV_CAP_READ_SCOPE BIT(3) #define SWEVENT_HLIST_BITS 8 #define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS) struct swevent_hlist { struct hlist_head heads[SWEVENT_HLIST_SIZE]; struct rcu_head rcu_head; }; #define PERF_ATTACH_CONTEXT 0x0001 #define PERF_ATTACH_GROUP 0x0002 #define PERF_ATTACH_TASK 0x0004 #define PERF_ATTACH_TASK_DATA 0x0008 #define PERF_ATTACH_GLOBAL_DATA 0x0010 #define PERF_ATTACH_SCHED_CB 0x0020 #define PERF_ATTACH_CHILD 0x0040 #define PERF_ATTACH_EXCLUSIVE 0x0080 #define PERF_ATTACH_CALLCHAIN 0x0100 #define PERF_ATTACH_ITRACE 0x0200 struct bpf_prog; struct perf_cgroup; struct perf_buffer; struct pmu_event_list { raw_spinlock_t lock; struct list_head list; }; /* * event->sibling_list is modified while holding both ctx->lock and ctx->mutex * as such iteration must hold either lock. However, since ctx->lock is an IRQ * safe lock, and is only held by the CPU doing the modification, having IRQs * disabled is sufficient since it will hold-off the IPIs. */ #ifdef CONFIG_PROVE_LOCKING # define lockdep_assert_event_ctx(event) \ WARN_ON_ONCE(__lockdep_enabled && \ (this_cpu_read(hardirqs_enabled) && \ lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD)) #else # define lockdep_assert_event_ctx(event) #endif #define for_each_sibling_event(sibling, event) \ lockdep_assert_event_ctx(event); \ if ((event)->group_leader == (event)) \ list_for_each_entry((sibling), &(event)->sibling_list, sibling_list) /** * struct perf_event - performance event kernel representation: */ struct perf_event { #ifdef CONFIG_PERF_EVENTS /* * entry onto perf_event_context::event_list; * modifications require ctx->lock * RCU safe iterations. */ struct list_head event_entry; /* * Locked for modification by both ctx->mutex and ctx->lock; holding * either suffices for read.
*/ struct list_head sibling_list; struct list_head active_list; /* * Node on the pinned or flexible tree located at the event context; */ struct rb_node group_node; u64 group_index; /* * We need storage to track the entries in perf_pmu_migrate_context; we * cannot use the event_entry because of RCU and we want to keep the * group intact which avoids us using the other two entries. */ struct list_head migrate_entry; struct hlist_node hlist_entry; struct list_head active_entry; int nr_siblings; /* Not serialized. Only written during event initialization. */ int event_caps; /* The cumulative AND of all event_caps for events in this group. */ int group_caps; unsigned int group_generation; struct perf_event *group_leader; /* * event->pmu will always point to pmu in which this event belongs. * Whereas event->pmu_ctx->pmu may point to other pmu when group of * different pmu events is created. */ struct pmu *pmu; void *pmu_private; enum perf_event_state state; unsigned int attach_state; local64_t count; atomic64_t child_count; /* * These are the total time in nanoseconds that the event * has been enabled (i.e. eligible to run, and the task has * been scheduled in, if this is a per-task event) * and running (scheduled onto the CPU), respectively. */ u64 total_time_enabled; u64 total_time_running; u64 tstamp; struct perf_event_attr attr; u16 header_size; u16 id_header_size; u16 read_size; struct hw_perf_event hw; struct perf_event_context *ctx; /* * event->pmu_ctx points to perf_event_pmu_context in which the event * is added. This pmu_ctx can be of other pmu for sw event when that * sw event is part of a group which also contains non-sw events. */ struct perf_event_pmu_context *pmu_ctx; atomic_long_t refcount; /* * These accumulate total time (in nanoseconds) that children * events have been enabled and running, respectively.
*/ atomic64_t child_total_time_enabled; atomic64_t child_total_time_running; /* * Protect attach/detach and child_list: */ struct mutex child_mutex; struct list_head child_list; struct perf_event *parent; int oncpu; int cpu; struct list_head owner_entry; struct task_struct *owner; /* mmap bits */ struct mutex mmap_mutex; refcount_t mmap_count; struct perf_buffer *rb; struct list_head rb_entry; unsigned long rcu_batches; int rcu_pending; /* poll related */ wait_queue_head_t waitq; struct fasync_struct *fasync; /* delayed work for NMIs and such */ unsigned int pending_wakeup; unsigned int pending_kill; unsigned int pending_disable; unsigned long pending_addr; /* SIGTRAP */ struct irq_work pending_irq; struct irq_work pending_disable_irq; struct callback_head pending_task; unsigned int pending_work; atomic_t event_limit; /* address range filters */ struct perf_addr_filters_head addr_filters; /* vma address array for file-based filters */ struct perf_addr_filter_range *addr_filter_ranges; unsigned long addr_filters_gen; /* for aux_output events */ struct perf_event *aux_event; void (*destroy)(struct perf_event *); struct rcu_head rcu_head; struct pid_namespace *ns; u64 id; atomic64_t lost_samples; u64 (*clock)(void); perf_overflow_handler_t overflow_handler; void *overflow_handler_context; struct bpf_prog *prog; u64 bpf_cookie; #ifdef CONFIG_EVENT_TRACING struct trace_event_call *tp_event; struct event_filter *filter; # ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops ftrace_ops; # endif #endif #ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; /* cgroup event is attached to */ #endif #ifdef CONFIG_SECURITY void *security; #endif struct list_head sb_list; struct list_head pmu_list; /* * Certain events get forwarded to another pmu internally by over- * writing kernel copy of event->attr.type without user being aware * of it. event->orig_type contains original 'type' requested by * user. */ u32 orig_type; #endif /* CONFIG_PERF_EVENTS */ }; /* * ,-----------------------[1:n]------------------------. * V V * perf_event_context <-[1:n]-> perf_event_pmu_context <-[1:n]- perf_event * | | * `--[n:1]-> pmu <-[1:n]--' * * * struct perf_event_pmu_context lifetime is refcount based and RCU freed * (similar to perf_event_context). Locking is as if it were a member of * perf_event_context; specifically: * * modification, both: ctx->mutex && ctx->lock * reading, either: ctx->mutex || ctx->lock * * There is one exception to this; namely put_pmu_ctx() isn't always called * with ctx->mutex held; this means that as long as we can guarantee the epc * has events the above rules hold. * * Specifically, sys_perf_event_open()'s group_leader case depends on * ctx->mutex pinning the configuration. Since we hold a reference on * group_leader (through the filedesc) it can't go away, therefore its * associated pmu_ctx must exist and cannot change due to ctx->mutex.
* * perf_event holds a refcount on perf_event_context * perf_event holds a refcount on perf_event_pmu_context */ struct perf_event_pmu_context { struct pmu *pmu; struct perf_event_context *ctx; struct list_head pmu_ctx_entry; struct list_head pinned_active; struct list_head flexible_active; /* Used to identify the per-cpu perf_event_pmu_context */ unsigned int embedded : 1; unsigned int nr_events; unsigned int nr_cgroups; unsigned int nr_freq; atomic_t refcount; /* event <-> epc */ struct rcu_head rcu_head; /* * Set when one or more (plausibly active) event can't be scheduled * due to pmu overcommit or pmu constraints, except tolerant to * events not necessary to be active due to scheduling constraints, * such as cgroups. */ int rotate_necessary; }; static inline bool perf_pmu_ctx_is_active(struct perf_event_pmu_context *epc) { return !list_empty(&epc->flexible_active) || !list_empty(&epc->pinned_active); } struct perf_event_groups { struct rb_root tree; u64 index; }; struct perf_time_ctx { u64 time; u64 stamp; u64 offset; }; /** * struct perf_event_context - event context structure * * Used as a container for task events and CPU events as well: */ struct perf_event_context { /* * Protect the states of the events in the list, * nr_active, and the list: */ raw_spinlock_t lock; /* * Protect the list of events. Locking either mutex or lock * is sufficient to ensure the list doesn't change; to change * the list you need to lock both the mutex and the spinlock. */ struct mutex mutex; struct list_head pmu_ctx_list; struct perf_event_groups pinned_groups; struct perf_event_groups flexible_groups; struct list_head event_list; int nr_events; int nr_user; int is_active; int nr_stat; int nr_freq; int rotate_disable; refcount_t refcount; /* event <-> ctx */ struct task_struct *task; /* * Context clock, runs when context enabled. */ struct perf_time_ctx time; /* * Context clock, runs when in the guest mode. */ struct perf_time_ctx timeguest; /* * These fields let us detect when two contexts have both * been cloned (inherited) from a common ancestor. */ struct perf_event_context *parent_ctx; u64 parent_gen; u64 generation; int pin_count; #ifdef CONFIG_CGROUP_PERF int nr_cgroups; /* cgroup evts */ #endif struct rcu_head rcu_head; /* * The count of events for which using the switch-out fast path * should be avoided. * * Sum (event->pending_work + events with * (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))) * * The SIGTRAP is targeted at ctx->task, as such it won't do changing * that until the signal is delivered. */ local_t nr_no_switch_fast; }; /** * struct perf_ctx_data - PMU specific data for a task * @rcu_head: To avoid the race on free PMU specific data * @refcount: To track users * @global: To track system-wide users * @ctx_cache: Kmem cache of PMU specific data * @data: PMU specific data * * Currently, the struct is only used in Intel LBR call stack mode to * save/restore the call stack of a task on context switches. * * The rcu_head is used to prevent the race on free the data. * The data only be allocated when Intel LBR call stack mode is enabled. * The data will be freed when the mode is disabled. * The content of the data will only be accessed in context switch, which * should be protected by rcu_read_lock(). * * Because of the alignment requirement of Intel Arch LBR, the Kmem cache * is used to allocate the PMU specific data. The ctx_cache is to track * the Kmem cache. * * Careful: Struct perf_ctx_data is added as a pointer in struct task_struct. 
* When system-wide Intel LBR call stack mode is enabled, a buffer with * constant size will be allocated for each task. * Also, system memory consumption can further grow when the size of * struct perf_ctx_data enlarges. */ struct perf_ctx_data { struct rcu_head rcu_head; refcount_t refcount; int global; struct kmem_cache *ctx_cache; void *data; }; struct perf_cpu_pmu_context { struct perf_event_pmu_context epc; struct perf_event_pmu_context *task_epc; struct list_head sched_cb_entry; int sched_cb_usage; int active_oncpu; int exclusive; int pmu_disable_count; raw_spinlock_t hrtimer_lock; struct hrtimer hrtimer; ktime_t hrtimer_interval; unsigned int hrtimer_active; }; /** * struct perf_event_cpu_context - per cpu event context structure */ struct perf_cpu_context { struct perf_event_context ctx; struct perf_event_context *task_ctx; int online; #ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; #endif /* * Per-CPU storage for iterators used in visit_groups_merge. The default * storage is of size 2 to hold the CPU and any CPU event iterators. */ int heap_size; struct perf_event **heap; struct perf_event *heap_default[2]; }; struct perf_output_handle { struct perf_event *event; struct perf_buffer *rb; unsigned long wakeup; unsigned long size; union { u64 flags; /* perf_output*() */ u64 aux_flags; /* perf_aux_output*() */ struct { u64 skip_read : 1; }; }; union { void *addr; unsigned long head; }; int page; }; struct bpf_perf_event_data_kern { bpf_user_pt_regs_t *regs; struct perf_sample_data *data; struct perf_event *event; }; #ifdef CONFIG_CGROUP_PERF /* * perf_cgroup_info keeps track of time_enabled for a cgroup. * This is a per-cpu dynamically allocated data structure. */ struct perf_cgroup_info { struct perf_time_ctx time; struct perf_time_ctx timeguest; int active; }; struct perf_cgroup { struct cgroup_subsys_state css; struct perf_cgroup_info __percpu *info; }; /* * Must ensure cgroup is pinned (css_get) before calling * this function. In other words, we cannot call this function * if there is no cgroup event for the current CPU context. */ static inline struct perf_cgroup * perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx) { return container_of(task_css_check(task, perf_event_cgrp_id, ctx ? 
lockdep_is_held(&ctx->lock) : true), struct perf_cgroup, css); } #endif /* CONFIG_CGROUP_PERF */ #ifdef CONFIG_PERF_EVENTS extern struct perf_event_context *perf_cpu_task_ctx(void); extern void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event); extern void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size); extern int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size); extern void *perf_get_aux(struct perf_output_handle *handle); extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags); extern void perf_event_itrace_started(struct perf_event *event); extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); extern int perf_pmu_unregister(struct pmu *pmu); extern void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task); extern void __perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next); extern int perf_event_init_task(struct task_struct *child, u64 clone_flags); extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); extern void perf_event_delayed_put(struct task_struct *task); extern struct file *perf_event_get(unsigned int fd); extern const struct perf_event *perf_get_event(struct file *file); extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event); extern void perf_event_print_debug(void); extern void perf_pmu_disable(struct pmu *pmu); extern void perf_pmu_enable(struct pmu *pmu); extern void perf_sched_cb_dec(struct pmu *pmu); extern void perf_sched_cb_inc(struct pmu *pmu); extern int perf_event_task_disable(void); extern int perf_event_task_enable(void); extern void perf_pmu_resched(struct pmu *pmu); extern int perf_event_refresh(struct perf_event *event, int refresh); extern void perf_event_update_userpage(struct perf_event *event); extern int perf_event_release_kernel(struct perf_event *event); extern struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct task_struct *task, perf_overflow_handler_t callback, void *context); extern void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu); extern int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs); static inline bool branch_sample_no_flags(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_FLAGS; } static inline bool branch_sample_no_cycles(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_CYCLES; } static inline bool branch_sample_type(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_TYPE_SAVE; } static inline bool branch_sample_hw_index(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; } static inline bool branch_sample_priv(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_PRIV_SAVE; } static inline bool branch_sample_counters(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS; } static inline bool branch_sample_call_stack(const struct perf_event *event) { return event->attr.branch_sample_type & 
PERF_SAMPLE_BRANCH_CALL_STACK; } struct perf_sample_data { /* * Fields set by perf_sample_data_init() unconditionally, * group so as to minimize the cachelines touched. */ u64 sample_flags; u64 period; u64 dyn_size; /* * Fields commonly set by __perf_event_header__init_id(), * group so as to minimize the cachelines touched. */ u64 type; struct { u32 pid; u32 tid; } tid_entry; u64 time; u64 id; struct { u32 cpu; u32 reserved; } cpu_entry; /* * The other fields, optionally {set,used} by * perf_{prepare,output}_sample(). */ u64 ip; struct perf_callchain_entry *callchain; struct perf_raw_record *raw; struct perf_branch_stack *br_stack; u64 *br_stack_cntr; union perf_sample_weight weight; union perf_mem_data_src data_src; u64 txn; struct perf_regs regs_user; struct perf_regs regs_intr; u64 stack_user_size; u64 stream_id; u64 cgroup; u64 addr; u64 phys_addr; u64 data_page_size; u64 code_page_size; u64 aux_size; } ____cacheline_aligned; /* default value for data source */ #define PERF_MEM_NA (PERF_MEM_S(OP, NA) |\ PERF_MEM_S(LVL, NA) |\ PERF_MEM_S(SNOOP, NA) |\ PERF_MEM_S(LOCK, NA) |\ PERF_MEM_S(TLB, NA) |\ PERF_MEM_S(LVLNUM, NA)) static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr, u64 period) { /* remaining struct members initialized in perf_prepare_sample() */ data->sample_flags = PERF_SAMPLE_PERIOD; data->period = period; data->dyn_size = 0; if (addr) { data->addr = addr; data->sample_flags |= PERF_SAMPLE_ADDR; } } static inline void perf_sample_save_callchain(struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs) { int size = 1; if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) return; if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) return; data->callchain = perf_callchain(event, regs); size += data->callchain->nr; data->dyn_size += size * sizeof(u64); data->sample_flags |= PERF_SAMPLE_CALLCHAIN; } static inline void perf_sample_save_raw_data(struct perf_sample_data *data, struct perf_event *event, struct perf_raw_record *raw) { struct perf_raw_frag *frag = &raw->frag; u32 sum = 0; int size; if (!(event->attr.sample_type & PERF_SAMPLE_RAW)) return; if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_RAW)) return; do { sum += frag->size; if (perf_raw_frag_last(frag)) break; frag = frag->next; } while (1); size = round_up(sum + sizeof(u32), sizeof(u64)); raw->size = size - sizeof(u32); frag->pad = raw->size - sum; data->raw = raw; data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_RAW; } static inline bool has_branch_stack(struct perf_event *event) { return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; } static inline void perf_sample_save_brstack(struct perf_sample_data *data, struct perf_event *event, struct perf_branch_stack *brs, u64 *brs_cntr) { int size = sizeof(u64); /* nr */ if (!has_branch_stack(event)) return; if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_BRANCH_STACK)) return; if (branch_sample_hw_index(event)) size += sizeof(u64); brs->nr = min_t(u16, event->attr.sample_max_stack, brs->nr); size += brs->nr * sizeof(struct perf_branch_entry); /* * The extension space for counters is appended after the * struct perf_branch_stack. It is used to store the occurrences * of events of each branch. 
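 *
 * Illustrative layout of the resulting PERF_SAMPLE_BRANCH_STACK record
 * (a sketch that simply mirrors the size computation above):
 *
 *	u64 nr;
 *	u64 hw_idx;                              <- only with PERF_SAMPLE_BRANCH_HW_INDEX
 *	struct perf_branch_entry entries[nr];
 *	u64 counters[nr];                        <- only when brs_cntr is supplied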
*/ if (brs_cntr) size += brs->nr * sizeof(u64); data->br_stack = brs; data->br_stack_cntr = brs_cntr; data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; } static inline u32 perf_sample_data_size(struct perf_sample_data *data, struct perf_event *event) { u32 size = sizeof(struct perf_event_header); size += event->header_size + event->id_header_size; size += data->dyn_size; return size; } /* * Clear all bitfields in the perf_branch_entry. * The to and from fields are not cleared because they are * systematically modified by caller. */ static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *br) { br->mispred = 0; br->predicted = 0; br->in_tx = 0; br->abort = 0; br->cycles = 0; br->type = 0; br->spec = PERF_BR_SPEC_NA; br->reserved = 0; } extern void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event); extern void perf_prepare_sample(struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs); extern void perf_prepare_header(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs); extern int perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); extern void perf_event_output_forward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); extern void perf_event_output_backward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); extern int perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); static inline bool is_default_overflow_handler(struct perf_event *event) { perf_overflow_handler_t overflow_handler = event->overflow_handler; if (likely(overflow_handler == perf_event_output_forward)) return true; if (unlikely(overflow_handler == perf_event_output_backward)) return true; return false; } extern void perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event); extern void perf_event__output_id_sample(struct perf_event *event, struct perf_output_handle *handle, struct perf_sample_data *sample); extern void perf_log_lost_samples(struct perf_event *event, u64 lost); static inline bool event_has_any_exclude_flag(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; return attr->exclude_idle || attr->exclude_user || attr->exclude_kernel || attr->exclude_hv || attr->exclude_guest || attr->exclude_host; } static inline bool is_sampling_event(struct perf_event *event) { return event->attr.sample_period != 0; } /* * Return 1 for a software event, 0 for a hardware event */ static inline int is_software_event(struct perf_event *event) { return event->event_caps & PERF_EV_CAP_SOFTWARE; } /* * Return 1 for event in sw context, 0 for event in hw context */ static inline int in_software_context(struct perf_event *event) { return event->pmu_ctx->pmu->task_ctx_nr == perf_sw_context; } static inline int is_exclusive_pmu(struct pmu *pmu) { return pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE; } extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64); extern void __perf_sw_event(u32, u64, struct pt_regs *, u64); #ifndef perf_arch_fetch_caller_regs static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { } #endif /* * When generating a perf sample in-line, instead of from an interrupt / * 
exception, we lack a pt_regs. This is typically used from software events * like: SW_CONTEXT_SWITCHES, SW_MIGRATIONS and the tie-in with tracepoints. * * We typically don't need a full set, but (for x86) do require: * - ip for PERF_SAMPLE_IP * - cs for user_mode() tests * - sp for PERF_SAMPLE_CALLCHAIN * - eflags for MISC bits and CALLCHAIN (see: perf_hw_regs()) * * NOTE: assumes @regs is otherwise already 0 filled; this is important for * things like PERF_SAMPLE_REGS_INTR. */ static inline void perf_fetch_caller_regs(struct pt_regs *regs) { perf_arch_fetch_caller_regs(regs, CALLER_ADDR0); } static __always_inline void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { if (static_key_false(&perf_swevent_enabled[event_id])) __perf_sw_event(event_id, nr, regs, addr); } DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]); /* * 'Special' version for the scheduler, it hard assumes no recursion, * which is guaranteed by us not actually scheduling inside other swevents * because those disable preemption. */ static __always_inline void __perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) { struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]); perf_fetch_caller_regs(regs); ___perf_sw_event(event_id, nr, regs, addr); } extern struct static_key_false perf_sched_events; static __always_inline bool __perf_sw_enabled(int swevt) { return static_key_false(&perf_swevent_enabled[swevt]); } static inline void perf_event_task_migrate(struct task_struct *task) { if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS)) task->sched_migrated = 1; } static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { if (static_branch_unlikely(&perf_sched_events)) __perf_event_task_sched_in(prev, task); if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS) && task->sched_migrated) { __perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); task->sched_migrated = 0; } } static inline void perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next) { if (__perf_sw_enabled(PERF_COUNT_SW_CONTEXT_SWITCHES)) __perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0); #ifdef CONFIG_CGROUP_PERF if (__perf_sw_enabled(PERF_COUNT_SW_CGROUP_SWITCHES) && perf_cgroup_from_task(prev, NULL) != perf_cgroup_from_task(next, NULL)) __perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0); #endif if (static_branch_unlikely(&perf_sched_events)) __perf_event_task_sched_out(prev, next); } extern void perf_event_mmap(struct vm_area_struct *vma); extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym); extern void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags); #define PERF_GUEST_ACTIVE 0x01 #define PERF_GUEST_USER 0x02 struct perf_guest_info_callbacks { unsigned int (*state)(void); unsigned long (*get_ip)(void); unsigned int (*handle_intel_pt_intr)(void); void (*handle_mediated_pmi)(void); }; #ifdef CONFIG_GUEST_PERF_EVENTS extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state); DECLARE_STATIC_CALL(__perf_guest_get_ip, *perf_guest_cbs->get_ip); DECLARE_STATIC_CALL(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); DECLARE_STATIC_CALL(__perf_guest_handle_mediated_pmi, *perf_guest_cbs->handle_mediated_pmi); static inline unsigned int perf_guest_state(void) { return static_call(__perf_guest_state)(); } static inline unsigned long perf_guest_get_ip(void) { return static_call(__perf_guest_get_ip)(); } static 
inline unsigned int perf_guest_handle_intel_pt_intr(void) { return static_call(__perf_guest_handle_intel_pt_intr)(); } static inline void perf_guest_handle_mediated_pmi(void) { static_call(__perf_guest_handle_mediated_pmi)(); } extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); #else /* !CONFIG_GUEST_PERF_EVENTS: */ static inline unsigned int perf_guest_state(void) { return 0; } static inline unsigned long perf_guest_get_ip(void) { return 0; } static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return 0; } #endif /* !CONFIG_GUEST_PERF_EVENTS */ extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk, bool exec); extern void perf_event_namespaces(struct task_struct *tsk); extern void perf_event_fork(struct task_struct *tsk); extern void perf_event_text_poke(const void *addr, const void *old_bytes, size_t old_len, const void *new_bytes, size_t new_len); /* Callchains */ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie); extern int get_callchain_buffers(int max_stack); extern void put_callchain_buffers(void); extern struct perf_callchain_entry *get_callchain_entry(int *rctx); extern void put_callchain_entry(int rctx); extern int sysctl_perf_event_max_stack; extern int sysctl_perf_event_max_contexts_per_stack; static inline int perf_callchain_store_context(struct perf_callchain_entry_ctx *ctx, u64 ip) { if (ctx->contexts < sysctl_perf_event_max_contexts_per_stack) { struct perf_callchain_entry *entry = ctx->entry; entry->ip[entry->nr++] = ip; ++ctx->contexts; return 0; } else { ctx->contexts_maxed = true; return -1; /* no more room, stop walking the stack */ } } static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip) { if (ctx->nr < ctx->max_stack && !ctx->contexts_maxed) { struct perf_callchain_entry *entry = ctx->entry; entry->ip[entry->nr++] = ip; ++ctx->nr; return 0; } else { return -1; /* no more room, stop walking the stack */ } } extern int sysctl_perf_event_paranoid; extern int sysctl_perf_event_sample_rate; extern void perf_sample_event_took(u64 sample_len_ns); /* Access to perf_event_open(2) syscall. */ #define PERF_SECURITY_OPEN 0 /* Finer grained perf_event_open(2) access control. 
*/ #define PERF_SECURITY_CPU 1 #define PERF_SECURITY_KERNEL 2 #define PERF_SECURITY_TRACEPOINT 3 static inline int perf_is_paranoid(void) { return sysctl_perf_event_paranoid > -1; } extern int perf_allow_kernel(void); static inline int perf_allow_cpu(void) { if (sysctl_perf_event_paranoid > 0 && !perfmon_capable()) return -EACCES; return security_perf_event_open(PERF_SECURITY_CPU); } static inline int perf_allow_tracepoint(void) { if (sysctl_perf_event_paranoid > -1 && !perfmon_capable()) return -EPERM; return security_perf_event_open(PERF_SECURITY_TRACEPOINT); } extern int perf_exclude_event(struct perf_event *event, struct pt_regs *regs); extern void perf_event_init(void); extern void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, struct task_struct *task); extern void perf_bp_event(struct perf_event *event, void *data); extern unsigned long perf_misc_flags(struct perf_event *event, struct pt_regs *regs); extern unsigned long perf_instruction_pointer(struct perf_event *event, struct pt_regs *regs); #ifndef perf_arch_misc_flags # define perf_arch_misc_flags(regs) \ (user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL) # define perf_arch_instruction_pointer(regs) instruction_pointer(regs) #endif #ifndef perf_arch_bpf_user_pt_regs # define perf_arch_bpf_user_pt_regs(regs) regs #endif #ifndef perf_arch_guest_misc_flags static inline unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs) { unsigned long guest_state = perf_guest_state(); if (!(guest_state & PERF_GUEST_ACTIVE)) return 0; if (guest_state & PERF_GUEST_USER) return PERF_RECORD_MISC_GUEST_USER; else return PERF_RECORD_MISC_GUEST_KERNEL; } # define perf_arch_guest_misc_flags(regs) perf_arch_guest_misc_flags(regs) #endif static inline bool needs_branch_stack(struct perf_event *event) { return event->attr.branch_sample_type != 0; } static inline bool has_aux(struct perf_event *event) { return event->pmu && event->pmu->setup_aux; } static inline bool has_aux_action(struct perf_event *event) { return event->attr.aux_sample_size || event->attr.aux_pause || event->attr.aux_resume; } static inline bool is_write_backward(struct perf_event *event) { return !!event->attr.write_backward; } static inline bool has_addr_filter(struct perf_event *event) { return event->pmu->nr_addr_filters; } /* * An inherited event uses parent's filters */ static inline struct perf_addr_filters_head * perf_event_addr_filters(struct perf_event *event) { struct perf_addr_filters_head *ifh = &event->addr_filters; if (event->parent) ifh = &event->parent->addr_filters; return ifh; } static inline struct fasync_struct **perf_event_fasync(struct perf_event *event) { /* Only the parent has fasync state */ if (event->parent) event = event->parent; return &event->fasync; } extern void perf_event_addr_filters_sync(struct perf_event *event); extern void perf_report_aux_output_id(struct perf_event *event, u64 hw_id); extern int perf_output_begin(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern int perf_output_begin_forward(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern int perf_output_begin_backward(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern void perf_output_end(struct perf_output_handle *handle); extern unsigned int perf_output_copy(struct perf_output_handle 
*handle, const void *buf, unsigned int len); extern unsigned int perf_output_skip(struct perf_output_handle *handle, unsigned int len); extern long perf_output_copy_aux(struct perf_output_handle *aux_handle, struct perf_output_handle *handle, unsigned long from, unsigned long to); extern int perf_swevent_get_recursion_context(void); extern void perf_swevent_put_recursion_context(int rctx); extern u64 perf_swevent_set_period(struct perf_event *event); extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); extern void perf_event_disable_local(struct perf_event *event); extern void perf_event_disable_inatomic(struct perf_event *event); extern void perf_event_task_tick(void); extern int perf_event_account_interrupt(struct perf_event *event); extern int perf_event_period(struct perf_event *event, u64 value); extern u64 perf_event_pause(struct perf_event *event, bool reset); #ifdef CONFIG_PERF_GUEST_MEDIATED_PMU int perf_create_mediated_pmu(void); void perf_release_mediated_pmu(void); void perf_load_guest_context(void); void perf_put_guest_context(void); #endif #else /* !CONFIG_PERF_EVENTS: */ static inline void * perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event) { return NULL; } static inline void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) { } static inline int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) { return -EINVAL; } static inline void * perf_get_aux(struct perf_output_handle *handle) { return NULL; } static inline void perf_event_task_migrate(struct task_struct *task) { } static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { } static inline void perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next) { } static inline int perf_event_init_task(struct task_struct *child, u64 clone_flags) { return 0; } static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } static inline void perf_event_delayed_put(struct task_struct *task) { } static inline struct file *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } static inline const struct perf_event *perf_get_event(struct file *file) { return ERR_PTR(-EINVAL); } static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { return ERR_PTR(-EINVAL); } static inline int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running) { return -EINVAL; } static inline void perf_event_print_debug(void) { } static inline int perf_event_task_disable(void) { return -EINVAL; } static inline int perf_event_task_enable(void) { return -EINVAL; } static inline int perf_event_refresh(struct perf_event *event, int refresh) { return -EINVAL; } static inline void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { } static inline void perf_bp_event(struct perf_event *event, void *data) { } static inline void perf_event_mmap(struct vm_area_struct *vma) { } typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data); static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym) { } static inline void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags) { } static inline void perf_event_exec(void) { } static inline void perf_event_comm(struct task_struct *tsk, bool exec) { } static inline void 
perf_event_namespaces(struct task_struct *tsk) { } static inline void perf_event_fork(struct task_struct *tsk) { } static inline void perf_event_text_poke(const void *addr, const void *old_bytes, size_t old_len, const void *new_bytes, size_t new_len) { } static inline void perf_event_init(void) { } static inline int perf_swevent_get_recursion_context(void) { return -1; } static inline void perf_swevent_put_recursion_context(int rctx) { } static inline u64 perf_swevent_set_period(struct perf_event *event) { return 0; } static inline void perf_event_enable(struct perf_event *event) { } static inline void perf_event_disable(struct perf_event *event) { } static inline int __perf_event_disable(void *info) { return -1; } static inline void perf_event_task_tick(void) { } static inline int perf_event_release_kernel(struct perf_event *event) { return 0; } static inline int perf_event_period(struct perf_event *event, u64 value) { return -EINVAL; } static inline u64 perf_event_pause(struct perf_event *event, bool reset) { return 0; } static inline int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { return 0; } #endif /* !CONFIG_PERF_EVENTS */ #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) extern void perf_restore_debug_store(void); #else static inline void perf_restore_debug_store(void) { } #endif #define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x)) struct perf_pmu_events_attr { struct device_attribute attr; u64 id; const char *event_str; }; struct perf_pmu_events_ht_attr { struct device_attribute attr; u64 id; const char *event_str_ht; const char *event_str_noht; }; struct perf_pmu_events_hybrid_attr { struct device_attribute attr; u64 id; const char *event_str; u64 pmu_type; }; struct perf_pmu_format_hybrid_attr { struct device_attribute attr; u64 pmu_type; }; ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, char *page); #define PMU_EVENT_ATTR(_name, _var, _id, _show) \ static struct perf_pmu_events_attr _var = { \ .attr = __ATTR(_name, 0444, _show, NULL), \ .id = _id, \ }; #define PMU_EVENT_ATTR_STRING(_name, _var, _str) \ static struct perf_pmu_events_attr _var = { \ .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \ .id = 0, \ .event_str = _str, \ }; #define PMU_EVENT_ATTR_ID(_name, _show, _id) \ (&((struct perf_pmu_events_attr[]) { \ { .attr = __ATTR(_name, 0444, _show, NULL), \ .id = _id, } \ })[0].attr.attr) #define PMU_FORMAT_ATTR_SHOW(_name, _format) \ static ssize_t \ _name##_show(struct device *dev, \ struct device_attribute *attr, \ char *page) \ { \ BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ return sprintf(page, _format "\n"); \ } \ #define PMU_FORMAT_ATTR(_name, _format) \ PMU_FORMAT_ATTR_SHOW(_name, _format) \ \ static struct device_attribute format_attr_##_name = __ATTR_RO(_name) /* Performance counter hotplug functions */ #ifdef CONFIG_PERF_EVENTS extern int perf_event_init_cpu(unsigned int cpu); extern int perf_event_exit_cpu(unsigned int cpu); #else # define perf_event_init_cpu NULL # define perf_event_exit_cpu NULL #endif extern void arch_perf_update_userpage(struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now); /* * Snapshot branch stack on software events. * * Branch stack can be very useful in understanding software events. For * example, when a long function, e.g. sys_perf_event_open, returns an * errno, it is not obvious why the function failed. Branch stack could * provide very helpful information in this type of scenarios. 
* * On a software event, it is necessary to stop the hardware branch recorder * fast. Otherwise, the hardware register/buffer will be flushed with * entries of the triggering event. Therefore, a static call is used to * stop the hardware recorder. */ /* * cnt is the number of entries allocated for @entries. * Return the number of entries copied into @entries. */ typedef int (perf_snapshot_branch_stack_t)(struct perf_branch_entry *entries, unsigned int cnt); DECLARE_STATIC_CALL(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t); #ifndef PERF_NEEDS_LOPWR_CB static inline void perf_lopwr_cb(bool mode) { } #endif #endif /* _LINUX_PERF_EVENT_H */
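/*
 * Illustrative usage of the in-kernel counter API declared above (a minimal
 * sketch, not part of the header): perf_event_create_kernel_counter(),
 * perf_event_read_value() and perf_event_release_kernel(). The function name
 * example_measure_cycles() and the choice of event are made up for this
 * example; error handling is reduced to the essentials.
 */
#include <linux/perf_event.h>
#include <linux/err.h>

static u64 example_measure_cycles(int cpu)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_HARDWARE,
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.size		= sizeof(attr),
		.disabled	= 1,		/* enabled explicitly below */
	};
	struct perf_event *event;
	u64 enabled, running, count;

	/* CPU-bound counter (task == NULL), no overflow handler. */
	event = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL);
	if (IS_ERR(event))
		return 0;

	perf_event_enable(event);
	/* ... run the code being measured ... */
	perf_event_disable(event);

	count = perf_event_read_value(event, &enabled, &running);
	perf_event_release_kernel(event);

	return count;
}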
// SPDX-License-Identifier: GPL-2.0-only /* * xsave/xrstor support. * * Author: Suresh Siddha <suresh.b.siddha@intel.com> */ #include <linux/bitops.h> #include <linux/compat.h> #include <linux/cpu.h> #include <linux/mman.h> #include <linux/kvm_types.h> #include <linux/nospec.h> #include <linux/pkeys.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/vmalloc.h> #include <linux/coredump.h> #include <linux/sort.h> #include <asm/fpu/api.h> #include <asm/fpu/regset.h> #include <asm/fpu/signal.h> #include <asm/fpu/xcr.h> #include <asm/cpuid/api.h> #include <asm/msr.h> #include <asm/tlbflush.h> #include <asm/prctl.h> #include <asm/elf.h> #include <uapi/asm/elf.h> #include "context.h" #include "internal.h" #include "legacy.h" #include "xstate.h" #define for_each_extended_xfeature(bit, mask) \ (bit) = FIRST_EXTENDED_XFEATURE; \ for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask)) /* * Although we spell it out in here, the Processor Trace * xfeature is completely unused. We use other mechanisms * to save/restore PT state in Linux.
*/ static const char *xfeature_names[] = { "x87 floating point registers", "SSE registers", "AVX registers", "MPX bounds registers", "MPX CSR", "AVX-512 opmask", "AVX-512 Hi256", "AVX-512 ZMM_Hi256", "Processor Trace (unused)", "Protection Keys User registers", "PASID state", "Control-flow User registers", "Control-flow Kernel registers (KVM only)", "unknown xstate feature", "unknown xstate feature", "unknown xstate feature", "unknown xstate feature", "AMX Tile config", "AMX Tile data", "APX registers", "unknown xstate feature", }; static unsigned short xsave_cpuid_features[] __initdata = { [XFEATURE_FP] = X86_FEATURE_FPU, [XFEATURE_SSE] = X86_FEATURE_XMM, [XFEATURE_YMM] = X86_FEATURE_AVX, [XFEATURE_BNDREGS] = X86_FEATURE_MPX, [XFEATURE_BNDCSR] = X86_FEATURE_MPX, [XFEATURE_OPMASK] = X86_FEATURE_AVX512F, [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F, [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F, [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT, [XFEATURE_PKRU] = X86_FEATURE_OSPKE, [XFEATURE_PASID] = X86_FEATURE_ENQCMD, [XFEATURE_CET_USER] = X86_FEATURE_SHSTK, [XFEATURE_CET_KERNEL] = X86_FEATURE_SHSTK, [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, [XFEATURE_APX] = X86_FEATURE_APX, }; static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init; /* * Ordering of xstate components in uncompacted format: The xfeature * number does not necessarily indicate its position in the XSAVE buffer. * This array defines the traversal order of xstate features. */ static unsigned int xfeature_uncompact_order[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static inline unsigned int next_xfeature_order(unsigned int i, u64 mask) { for (; xfeature_uncompact_order[i] != -1; i++) { if (mask & BIT_ULL(xfeature_uncompact_order[i])) break; } return i; } /* Iterate xstate features in uncompacted order: */ #define for_each_extended_xfeature_in_order(i, mask) \ for (i = 0; \ i = next_xfeature_order(i, mask), \ xfeature_uncompact_order[i] != -1; \ i++) #define XSTATE_FLAG_SUPERVISOR BIT(0) #define XSTATE_FLAG_ALIGNED64 BIT(1) /* * Return whether the system supports a given xfeature. * * Also return the name of the (most advanced) feature that the caller requested: */ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) { u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features; if (unlikely(feature_name)) { long xfeature_idx, max_idx; u64 xfeatures_print; /* * So we use FLS here to be able to print the most advanced * feature that was requested but is missing. 
So if a driver * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the * missing AVX feature - this is the most informative message * to users: */ if (xfeatures_missing) xfeatures_print = xfeatures_missing; else xfeatures_print = xfeatures_needed; xfeature_idx = fls64(xfeatures_print)-1; max_idx = ARRAY_SIZE(xfeature_names)-1; xfeature_idx = min(xfeature_idx, max_idx); *feature_name = xfeature_names[xfeature_idx]; } if (xfeatures_missing) return 0; return 1; } EXPORT_SYMBOL_GPL(cpu_has_xfeatures); static bool xfeature_is_aligned64(int xfeature_nr) { return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64; } static bool xfeature_is_supervisor(int xfeature_nr) { return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR; } static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature) { unsigned int offs, i; /* * Non-compacted format and legacy features use the cached fixed * offsets. */ if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) || xfeature <= XFEATURE_SSE) return xstate_offsets[xfeature]; /* * Compacted format offsets depend on the actual content of the * compacted xsave area which is determined by the xcomp_bv header * field. */ offs = FXSAVE_SIZE + XSAVE_HDR_SIZE; for_each_extended_xfeature(i, xcomp_bv) { if (xfeature_is_aligned64(i)) offs = ALIGN(offs, 64); if (i == xfeature) break; offs += xstate_sizes[i]; } return offs; } /* * Enable the extended processor state save/restore feature. * Called once per CPU onlining. */ void fpu__init_cpu_xstate(void) { if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features) return; cr4_set_bits(X86_CR4_OSXSAVE); /* * Must happen after CR4 setup and before xsetbv() to allow KVM * lazy passthrough. Write independent of the dynamic state static * key as that does not work on the boot CPU. This also ensures * that any stale state is wiped out from XFD. Reset the per CPU * xfd cache too. */ if (cpu_feature_enabled(X86_FEATURE_XFD)) xfd_set_state(init_fpstate.xfd); /* * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user * states can be set here. */ xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); /* * MSR_IA32_XSS sets supervisor states managed by XSAVES. */ if (boot_cpu_has(X86_FEATURE_XSAVES)) { wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | xfeatures_mask_independent()); } } static bool xfeature_enabled(enum xfeature xfeature) { return fpu_kernel_cfg.max_features & BIT_ULL(xfeature); } static int compare_xstate_offsets(const void *xfeature1, const void *xfeature2) { return xstate_offsets[*(unsigned int *)xfeature1] - xstate_offsets[*(unsigned int *)xfeature2]; } /* * Record the offsets and sizes of various xstates contained * in the XSAVE state memory layout. Also, create an ordered * list of xfeatures for handling out-of-order offsets. */ static void __init setup_xstate_cache(void) { u32 eax, ebx, ecx, edx, xfeature, i = 0; /* * The FP xstates and SSE xstates are legacy states. They are always * in the fixed offsets in the xsave area in either compacted form * or standard form. 
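 *
 * For example (standard FXSAVE layout, stated here only as an illustration
 * of the assignments right below): XFEATURE_FP occupies offset 0 up to
 * offsetof(struct fxregs_state, xmm_space), i.e. 160 bytes, and
 * XFEATURE_SSE covers the 256-byte xmm_space that follows, regardless of
 * what CPUID reports for the extended features.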
*/ xstate_offsets[XFEATURE_FP] = 0; xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state, xmm_space); xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP]; xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, xmm_space); for_each_extended_xfeature(xfeature, fpu_kernel_cfg.max_features) { cpuid_count(CPUID_LEAF_XSTATE, xfeature, &eax, &ebx, &ecx, &edx); xstate_sizes[xfeature] = eax; xstate_flags[xfeature] = ecx; /* * If an xfeature is supervisor state, the offset in EBX is * invalid, leave it to -1. */ if (xfeature_is_supervisor(xfeature)) continue; xstate_offsets[xfeature] = ebx; /* Populate the list of xfeatures before sorting */ xfeature_uncompact_order[i++] = xfeature; } /* * Sort xfeatures by their offsets to support out-of-order * offsets in the uncompacted format. */ sort(xfeature_uncompact_order, i, sizeof(unsigned int), compare_xstate_offsets, NULL); } /* * Print out all the supported xstate features: */ static void __init print_xstate_features(void) { int i; for (i = 0; i < XFEATURE_MAX; i++) { u64 mask = BIT_ULL(i); const char *name; if (cpu_has_xfeatures(mask, &name)) pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", mask, name); } } /* * This check is important because it is easy to get XSTATE_* * confused with XSTATE_BIT_*. */ #define CHECK_XFEATURE(nr) do { \ WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \ WARN_ON(nr >= XFEATURE_MAX); \ } while (0) /* * Print out xstate component offsets and sizes */ static void __init print_xstate_offset_size(void) { int i; for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", i, xfeature_get_offset(fpu_kernel_cfg.max_features, i), i, xstate_sizes[i]); } } /* * This function is called only during boot time when x86 caps are not set * up and alternative can not be used yet. */ static __init void os_xrstor_booting(struct xregs_state *xstate) { u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE; u32 lmask = mask; u32 hmask = mask >> 32; int err; if (cpu_feature_enabled(X86_FEATURE_XSAVES)) XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); else XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); /* * We should never fault when copying from a kernel buffer, and the FPU * state we set at boot time should be valid. */ WARN_ON_FPU(err); } /* * All supported features have either init state all zeros or are * handled in setup_init_fpu() individually. This is an explicit * feature list and does not use XFEATURE_MASK*SUPPORTED to catch * newly added supported features at build time and make people * actually look at the init state for the new feature. */ #define XFEATURES_INIT_FPSTATE_HANDLED \ (XFEATURE_MASK_FP | \ XFEATURE_MASK_SSE | \ XFEATURE_MASK_YMM | \ XFEATURE_MASK_OPMASK | \ XFEATURE_MASK_ZMM_Hi256 | \ XFEATURE_MASK_Hi16_ZMM | \ XFEATURE_MASK_PKRU | \ XFEATURE_MASK_BNDREGS | \ XFEATURE_MASK_BNDCSR | \ XFEATURE_MASK_PASID | \ XFEATURE_MASK_CET_USER | \ XFEATURE_MASK_CET_KERNEL | \ XFEATURE_MASK_XTILE | \ XFEATURE_MASK_APX) /* * setup the xstate image representing the init state */ static void __init setup_init_fpu_buf(void) { BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED) != XFEATURES_INIT_FPSTATE_HANDLED); if (!boot_cpu_has(X86_FEATURE_XSAVE)) return; print_xstate_features(); xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures); /* * Init all the features state with header.xfeatures being 0x0 */ os_xrstor_booting(&init_fpstate.regs.xsave); /* * All components are now in init state. 
Read the state back so * that init_fpstate contains all non-zero init state. This only * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because * those use the init optimization which skips writing data for * components in init state. * * XSAVE could be used, but that would require to reshuffle the * data when XSAVEC/S is available because XSAVEC/S uses xstate * compaction. But doing so is a pointless exercise because most * components have an all zeros init state except for the legacy * ones (FP and SSE). Those can be saved with FXSAVE into the * legacy area. Adding new features requires to ensure that init * state is all zeroes or if not to add the necessary handling * here. */ fxsave(&init_fpstate.regs.fxsave); } int xfeature_size(int xfeature_nr) { u32 eax, ebx, ecx, edx; CHECK_XFEATURE(xfeature_nr); cpuid_count(CPUID_LEAF_XSTATE, xfeature_nr, &eax, &ebx, &ecx, &edx); return eax; } /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ static int validate_user_xstate_header(const struct xstate_header *hdr, struct fpstate *fpstate) { /* No unknown or supervisor features may be set */ if (hdr->xfeatures & ~fpstate->user_xfeatures) return -EINVAL; /* Userspace must use the uncompacted format */ if (hdr->xcomp_bv) return -EINVAL; /* * If 'reserved' is shrunken to add a new field, make sure to validate * that new field here! */ BUILD_BUG_ON(sizeof(hdr->reserved) != 48); /* No reserved bits may be set */ if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved))) return -EINVAL; return 0; } static void __init __xstate_dump_leaves(void) { int i; u32 eax, ebx, ecx, edx; static int should_dump = 1; if (!should_dump) return; should_dump = 0; /* * Dump out a few leaves past the ones that we support * just in case there are some goodies up there */ for (i = 0; i < XFEATURE_MAX + 10; i++) { cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx); pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", CPUID_LEAF_XSTATE, i, eax, ebx, ecx, edx); } } #define XSTATE_WARN_ON(x, fmt, ...) do { \ if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) { \ __xstate_dump_leaves(); \ } \ } while (0) #define XCHECK_SZ(sz, nr, __struct) ({ \ if (WARN_ONCE(sz != sizeof(__struct), \ "[%s]: struct is %zu bytes, cpu state %d bytes\n", \ xfeature_names[nr], sizeof(__struct), sz)) { \ __xstate_dump_leaves(); \ } \ true; \ }) /** * check_xtile_data_against_struct - Check tile data state size. * * Calculate the state size by multiplying the single tile size which is * recorded in a C struct, and the number of tiles that the CPU informs. * Compare the provided size with the calculation. * * @size: The tile data state size * * Returns: 0 on success, -EINVAL on mismatch. */ static int __init check_xtile_data_against_struct(int size) { u32 max_palid, palid, state_size; u32 eax, ebx, ecx, edx; u16 max_tile; /* * Check the maximum palette id: * eax: the highest numbered palette subleaf. */ cpuid_count(CPUID_LEAF_TILE, 0, &max_palid, &ebx, &ecx, &edx); /* * Cross-check each tile size and find the maximum number of * supported tiles. 
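 *
 * Worked example (palette 1 as implemented by the first AMX CPUs, used
 * here only for illustration): 8 tiles of 1024 bytes each, so
 * state_size = 8 * sizeof(struct xtile_data) = 8192 bytes, which must
 * match the CPUID-enumerated size of XFEATURE_XTILE_DATA.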
*/ for (palid = 1, max_tile = 0; palid <= max_palid; palid++) { u16 tile_size, max; /* * Check the tile size info: * eax[31:16]: bytes per tile * ebx[31:16]: the max names (or max number of tiles) */ cpuid_count(CPUID_LEAF_TILE, palid, &eax, &ebx, &ecx, &edx); tile_size = eax >> 16; max = ebx >> 16; if (tile_size != sizeof(struct xtile_data)) { pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n", __stringify(XFEATURE_XTILE_DATA), sizeof(struct xtile_data), tile_size); __xstate_dump_leaves(); return -EINVAL; } if (max > max_tile) max_tile = max; } state_size = sizeof(struct xtile_data) * max_tile; if (size != state_size) { pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n", __stringify(XFEATURE_XTILE_DATA), state_size, size); __xstate_dump_leaves(); return -EINVAL; } return 0; } /* * We have a C struct for each 'xstate'. We need to ensure * that our software representation matches what the CPU * tells us about the state's size. */ static bool __init check_xstate_against_struct(int nr) { /* * Ask the CPU for the size of the state. */ int sz = xfeature_size(nr); /* * Match each CPU state with the corresponding software * structure. */ switch (nr) { case XFEATURE_YMM: return XCHECK_SZ(sz, nr, struct ymmh_struct); case XFEATURE_BNDREGS: return XCHECK_SZ(sz, nr, struct mpx_bndreg_state); case XFEATURE_BNDCSR: return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state); case XFEATURE_OPMASK: return XCHECK_SZ(sz, nr, struct avx_512_opmask_state); case XFEATURE_ZMM_Hi256: return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state); case XFEATURE_Hi16_ZMM: return XCHECK_SZ(sz, nr, struct avx_512_hi16_state); case XFEATURE_PKRU: return XCHECK_SZ(sz, nr, struct pkru_state); case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state); case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg); case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state); case XFEATURE_CET_KERNEL: return XCHECK_SZ(sz, nr, struct cet_supervisor_state); case XFEATURE_APX: return XCHECK_SZ(sz, nr, struct apx_state); case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true; default: XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr); return false; } return true; } static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) { unsigned int topmost = fls64(xfeatures) - 1; unsigned int offset, i; if (topmost <= XFEATURE_SSE) return sizeof(struct xregs_state); if (compacted) { offset = xfeature_get_offset(xfeatures, topmost); } else { /* Walk through the xfeature order to pick the last */ for_each_extended_xfeature_in_order(i, xfeatures) topmost = xfeature_uncompact_order[i]; offset = xstate_offsets[topmost]; } return offset + xstate_sizes[topmost]; } /* * This essentially double-checks what the cpu told us about * how large the XSAVE buffer needs to be. We are recalculating * it to be safe. * * Independent XSAVE features allocate their own buffers and are not * covered by these checks. Only the size of the buffer for task->fpu * is checked here. */ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) { bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES); unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; int i; for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { if (!check_xstate_against_struct(i)) return false; /* * Supervisor state components can be managed only by * XSAVES.
*/ if (!xsaves && xfeature_is_supervisor(i)) { XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i); return false; } } size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted); XSTATE_WARN_ON(size != kernel_size, "size %u != kernel_size %u\n", size, kernel_size); return size == kernel_size; } /* * Get total size of enabled xstates in XCR0 | IA32_XSS. * * Note the SDM's wording here. "sub-function 0" only enumerates * the size of the *user* states. If we use it to size a buffer * that we use 'XSAVES' on, we could potentially overflow the * buffer because 'XSAVES' saves system states too. * * This also takes compaction into account. So this works for * XSAVEC as well. */ static unsigned int __init get_compacted_size(void) { unsigned int eax, ebx, ecx, edx; /* * - CPUID function 0DH, sub-function 1: * EBX enumerates the size (in bytes) required by * the XSAVES instruction for an XSAVE area * containing all the state components * corresponding to bits currently set in * XCR0 | IA32_XSS. * * When XSAVES is not available but XSAVEC is (virt), then there * are no supervisor states, but XSAVEC still uses compacted * format. */ cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx); return ebx; } /* * Get the total size of the enabled xstates without the independent supervisor * features. */ static unsigned int __init get_xsave_compacted_size(void) { u64 mask = xfeatures_mask_independent(); unsigned int size; if (!mask) return get_compacted_size(); /* Disable independent features. */ wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor()); /* * Ask the hardware what size is required of the buffer. * This is the size required for the task->fpu buffer. */ size = get_compacted_size(); /* Re-enable independent features so XSAVES will work on them again. */ wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); return size; } static unsigned int __init get_xsave_size_user(void) { unsigned int eax, ebx, ecx, edx; /* * - CPUID function 0DH, sub-function 0: * EBX enumerates the size (in bytes) required by * the XSAVE instruction for an XSAVE area * containing all the *user* state components * corresponding to bits currently set in XCR0. */ cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx); return ebx; } static int __init init_xstate_size(void) { /* Recompute the context size for enabled features: */ unsigned int user_size, kernel_size, kernel_default_size; bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); /* Uncompacted user space size */ user_size = get_xsave_size_user(); /* * XSAVES kernel size includes supervisor states and uses compacted * format. XSAVEC uses compacted format, but does not save * supervisor states. * * XSAVE[OPT] do not support supervisor states so kernel and user * size is identical. */ if (compacted) kernel_size = get_xsave_compacted_size(); else kernel_size = user_size; kernel_default_size = xstate_calculate_size(fpu_kernel_cfg.default_features, compacted); if (!paranoid_xstate_size_valid(kernel_size)) return -EINVAL; fpu_kernel_cfg.max_size = kernel_size; fpu_user_cfg.max_size = user_size; fpu_kernel_cfg.default_size = kernel_default_size; fpu_user_cfg.default_size = xstate_calculate_size(fpu_user_cfg.default_features, false); guest_default_cfg.size = xstate_calculate_size(guest_default_cfg.features, compacted); return 0; } /* * We enabled the XSAVE hardware, but something went wrong and * we can not use it. Disable it. 
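 * (Reached via the out_disable path of fpu__init_system_xstate() below,
 *  e.g. when the paranoid size checks fail or init_fpstate turns out to
 *  be too small for the default feature set.)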
*/ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) { pr_info("x86/fpu: XSAVE disabled\n"); fpu_kernel_cfg.max_features = 0; cr4_clear_bits(X86_CR4_OSXSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVE); /* Restore the legacy size.*/ fpu_kernel_cfg.max_size = legacy_size; fpu_kernel_cfg.default_size = legacy_size; fpu_user_cfg.max_size = legacy_size; fpu_user_cfg.default_size = legacy_size; guest_default_cfg.size = legacy_size; /* * Prevent enabling the static branch which enables writes to the * XFD MSR. */ init_fpstate.xfd = 0; fpstate_reset(x86_task_fpu(current)); } static u64 __init host_default_mask(void) { /* * Exclude dynamic features (require userspace opt-in) and features * that are supported only for KVM guests. */ return ~((u64)XFEATURE_MASK_USER_DYNAMIC | XFEATURE_MASK_GUEST_SUPERVISOR); } static u64 __init guest_default_mask(void) { /* * Exclude dynamic features, which require userspace opt-in even * for KVM guests. */ return ~(u64)XFEATURE_MASK_USER_DYNAMIC; } /* * Enable and initialize the xsave feature. * Called once per system bootup. */ void __init fpu__init_system_xstate(unsigned int legacy_size) { unsigned int eax, ebx, ecx, edx; u64 xfeatures; int err; int i; if (!boot_cpu_has(X86_FEATURE_FPU)) { pr_info("x86/fpu: No FPU detected\n"); return; } if (!boot_cpu_has(X86_FEATURE_XSAVE)) { pr_info("x86/fpu: x87 FPU will use %s\n", boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE"); return; } /* * Find user xstates supported by the processor. */ cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx); fpu_kernel_cfg.max_features = eax + ((u64)edx << 32); /* * Find supervisor xstates supported by the processor. */ cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx); fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32); if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { /* * This indicates that something really unexpected happened * with the enumeration. Disable XSAVE and try to continue * booting without it. This is too early to BUG(). */ pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", fpu_kernel_cfg.max_features); goto out_disable; } if (fpu_kernel_cfg.max_features & XFEATURE_MASK_APX && fpu_kernel_cfg.max_features & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) { /* * This is a problematic CPU configuration where two * conflicting state components are both enumerated. */ pr_err("x86/fpu: Both APX/MPX present in the CPU's xstate features: 0x%llx.\n", fpu_kernel_cfg.max_features); goto out_disable; } fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features & XFEATURE_MASK_INDEPENDENT; /* * Clear XSAVE features that are disabled in the normal CPUID. */ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { unsigned short cid = xsave_cpuid_features[i]; /* Careful: X86_FEATURE_FPU is 0! */ if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid)) fpu_kernel_cfg.max_features &= ~BIT_ULL(i); } if (!cpu_feature_enabled(X86_FEATURE_XFD)) fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC; if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; else fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED; fpu_user_cfg.max_features = fpu_kernel_cfg.max_features; fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; /* * Now, given maximum feature set, determine default values by * applying default masks. 
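 *
 * For example, on an AMX-capable CPU XFEATURE_MASK_XTILE_DATA is a
 * dynamic user feature, so host_default_mask() removes it from the
 * default feature set; it only becomes part of a task's fpstate after an
 * explicit arch_prctl(ARCH_REQ_XCOMP_PERM) request and the subsequent
 * #NM-triggered reallocation (see __xfd_enable_feature() below).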
*/ fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features & host_default_mask(); fpu_user_cfg.default_features = fpu_user_cfg.max_features & host_default_mask(); guest_default_cfg.features = fpu_kernel_cfg.max_features & guest_default_mask(); /* Store it for paranoia check at the end */ xfeatures = fpu_kernel_cfg.max_features; /* * Initialize the default XFD state in initfp_state and enable the * dynamic sizing mechanism if dynamic states are available. The * static key cannot be enabled here because this runs before * jump_label_init(). This is delayed to an initcall. */ init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC; /* Set up compaction feature bit */ if (cpu_feature_enabled(X86_FEATURE_XSAVEC) || cpu_feature_enabled(X86_FEATURE_XSAVES)) setup_force_cpu_cap(X86_FEATURE_XCOMPACTED); /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); /* Cache size, offset and flags for initialization */ setup_xstate_cache(); err = init_xstate_size(); if (err) goto out_disable; /* * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: */ update_regset_xstate_info(fpu_user_cfg.max_size, fpu_user_cfg.max_features); /* * init_fpstate excludes dynamic states as they are large but init * state is zero. */ init_fpstate.size = fpu_kernel_cfg.default_size; init_fpstate.xfeatures = fpu_kernel_cfg.default_features; if (init_fpstate.size > sizeof(init_fpstate.regs)) { pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d)\n", sizeof(init_fpstate.regs), init_fpstate.size); goto out_disable; } setup_init_fpu_buf(); /* * Paranoia check whether something in the setup modified the * xfeatures mask. */ if (xfeatures != fpu_kernel_cfg.max_features) { pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init\n", xfeatures, fpu_kernel_cfg.max_features); goto out_disable; } /* * CPU capabilities initialization runs before FPU init. So * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely * functional, set the feature bit so depending code works. */ setup_force_cpu_cap(X86_FEATURE_OSXSAVE); print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", fpu_kernel_cfg.max_features, fpu_kernel_cfg.max_size, boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard"); return; out_disable: /* something went wrong, try to boot without any XSAVE support */ fpu__init_disable_system_xstate(legacy_size); } /* * Restore minimal FPU state after suspend: */ void fpu__resume_cpu(void) { /* * Restore XCR0 on xsave capable CPUs: */ if (cpu_feature_enabled(X86_FEATURE_XSAVE)) xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); /* * Restore IA32_XSS. The same CPUID bit enumerates support * of XSAVES and MSR_IA32_XSS. */ if (cpu_feature_enabled(X86_FEATURE_XSAVES)) { wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | xfeatures_mask_independent()); } if (fpu_state_size_dynamic()) wrmsrq(MSR_IA32_XFD, x86_task_fpu(current)->fpstate->xfd); } /* * Given an xstate feature nr, calculate where in the xsave * buffer the state is. Callers should ensure that the buffer * is valid. 
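 *
 * Minimal usage sketch (illustrative only; 'fpstate' stands for any valid
 * struct fpstate and the component must have been written into the buffer):
 *
 *	struct pkru_state *pk;
 *
 *	pk = get_xsave_addr(&fpstate->regs.xsave, XFEATURE_PKRU);
 *	if (pk)
 *		pr_info("pkru=%x\n", pk->pkru);
 *
 * A NULL return means the component is not available (feature disabled
 * or still in its init state); see get_xsave_addr() below.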
*/ static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) { u64 xcomp_bv = xsave->header.xcomp_bv; if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) return NULL; if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) { if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr)))) return NULL; } return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr); } /* * Given the xsave area and a state inside, this function returns the * address of the state. * * This is the API that is called to get xstate address in either * standard format or compacted format of xsave area. * * Note that if there is no data for the field in the xsave buffer * this will return NULL. * * Inputs: * xstate: the thread's storage area for all FPU data * xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP, * XFEATURE_SSE, etc...) * Output: * address of the state in the xsave area, or NULL if the * field is not present in the xsave buffer. */ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) { /* * Do we even *have* xsave state? */ if (!boot_cpu_has(X86_FEATURE_XSAVE)) return NULL; /* * We should not ever be requesting features that we * have not enabled. */ if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) return NULL; /* * This assumes the last 'xsave*' instruction to * have requested that 'xfeature_nr' be saved. * If it did not, we might be seeing and old value * of the field in the buffer. * * This can happen because the last 'xsave' did not * request that this feature be saved (unlikely) * or because the "init optimization" caused it * to not be saved. */ if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr))) return NULL; return __raw_xsave_addr(xsave, xfeature_nr); } EXPORT_SYMBOL_FOR_KVM(get_xsave_addr); /* * Given an xstate feature nr, calculate where in the xsave buffer the state is. * The xsave buffer should be in standard format, not compacted (e.g. user mode * signal frames). */ void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr) { if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) return NULL; return (void __user *)xsave + xstate_offsets[xfeature_nr]; } #ifdef CONFIG_ARCH_HAS_PKEYS /* * This will go out and modify PKRU register to set the access * rights for @pkey to @init_val. */ int arch_set_user_pkey_access(int pkey, unsigned long init_val) { u32 old_pkru, new_pkru_bits = 0; int pkey_shift; /* * This check implies XSAVE support. OSPKE only gets * set if we enable XSAVE and we enable PKU in XCR0. */ if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return -EINVAL; /* * This code should only be called with valid 'pkey' * values originating from in-kernel users. Complain * if a bad value is observed. */ if (WARN_ON_ONCE(pkey >= arch_max_pkey())) return -EINVAL; /* Set the bits we need in PKRU: */ if (init_val & PKEY_DISABLE_ACCESS) new_pkru_bits |= PKRU_AD_BIT; if (init_val & PKEY_DISABLE_WRITE) new_pkru_bits |= PKRU_WD_BIT; /* Shift the bits in to the correct place in PKRU for pkey: */ pkey_shift = pkey * PKRU_BITS_PER_PKEY; new_pkru_bits <<= pkey_shift; /* Get old PKRU and mask off any old bits in place: */ old_pkru = read_pkru(); old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); /* Write old part along with new part: */ write_pkru(old_pkru | new_pkru_bits); return 0; } #endif /* ! CONFIG_ARCH_HAS_PKEYS */ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, void *init_xstate, unsigned int size) { membuf_write(to, from_xstate ? 
xstate : init_xstate, size); } /** * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer * @to: membuf descriptor * @fpstate: The fpstate buffer from which to copy * @xfeatures: The mask of xfeatures to save (XSAVE mode only) * @pkru_val: The PKRU value to store in the PKRU component * @copy_mode: The requested copy mode * * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming * format, i.e. from the kernel internal hardware dependent storage format * to the requested @mode. UABI XSTATE is always uncompacted! * * It supports partial copy but @to.pos always starts from zero. */ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, u64 xfeatures, u32 pkru_val, enum xstate_copy_mode copy_mode) { const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); struct xregs_state *xinit = &init_fpstate.regs.xsave; struct xregs_state *xsave = &fpstate->regs.xsave; unsigned int zerofrom, i, xfeature; struct xstate_header header; u64 mask; memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; /* Mask out the feature bits depending on copy mode */ switch (copy_mode) { case XSTATE_COPY_FP: header.xfeatures &= XFEATURE_MASK_FP; break; case XSTATE_COPY_FX: header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE; break; case XSTATE_COPY_XSAVE: header.xfeatures &= fpstate->user_xfeatures & xfeatures; break; } /* Copy FP state up to MXCSR */ copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387, &xinit->i387, off_mxcsr); /* Copy MXCSR when SSE or YMM are set in the feature mask */ copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM), &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr, MXCSR_AND_FLAGS_SIZE); /* Copy the remaining FP state */ copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387.st_space, &xinit->i387.st_space, sizeof(xsave->i387.st_space)); /* Copy the SSE state - shared with YMM, but independently managed */ copy_feature(header.xfeatures & XFEATURE_MASK_SSE, &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space, sizeof(xsave->i387.xmm_space)); if (copy_mode != XSTATE_COPY_XSAVE) goto out; /* Zero the padding area */ membuf_zero(&to, sizeof(xsave->i387.padding)); /* Copy xsave->i387.sw_reserved */ membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved)); /* Copy the user space relevant state of @xsave->header */ membuf_write(&to, &header, sizeof(header)); zerofrom = offsetof(struct xregs_state, extended_state_area); /* * This 'mask' indicates which states to copy from fpstate. * Those extended states that are not present in fpstate are * either disabled or initialized: * * In non-compacted format, disabled features still occupy * state space but there is no state to copy from in the * compacted init_fpstate. The gap tracking will zero these * states. * * The extended features have an all zeroes init state. Thus, * remove them from 'mask' to zero those features in the user * buffer instead of retrieving them from init_fpstate. */ mask = header.xfeatures; for_each_extended_xfeature_in_order(i, mask) { xfeature = xfeature_uncompact_order[i]; /* * If there was a feature or alignment gap, zero the space * in the destination buffer. */ if (zerofrom < xstate_offsets[xfeature]) membuf_zero(&to, xstate_offsets[xfeature] - zerofrom); if (xfeature == XFEATURE_PKRU) { struct pkru_state pkru = {0}; /* * PKRU is not necessarily up to date in the * XSAVE buffer. Use the provided value. 
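 * (The kernel manages PKRU eagerly and keeps the authoritative value in
 *  thread_struct::pkru or the vCPU's pkru field, which is what callers
 *  pass in here; see copy_xstate_to_uabi_buf() below.)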
*/ pkru.pkru = pkru_val; membuf_write(&to, &pkru, sizeof(pkru)); } else { membuf_write(&to, __raw_xsave_addr(xsave, xfeature), xstate_sizes[xfeature]); } /* * Keep track of the last copied state in the non-compacted * target buffer for gap zeroing. */ zerofrom = xstate_offsets[xfeature] + xstate_sizes[xfeature]; } out: if (to.left) membuf_zero(&to, to.left); } /** * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer * @to: membuf descriptor * @tsk: The task from which to copy the saved xstate * @copy_mode: The requested copy mode * * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming * format, i.e. from the kernel internal hardware dependent storage format * to the requested @mode. UABI XSTATE is always uncompacted! * * It supports partial copy but @to.pos always starts from zero. */ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode copy_mode) { __copy_xstate_to_uabi_buf(to, x86_task_fpu(tsk)->fpstate, x86_task_fpu(tsk)->fpstate->user_xfeatures, tsk->thread.pkru, copy_mode); } static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, const void *kbuf, const void __user *ubuf) { if (kbuf) { memcpy(dst, kbuf + offset, size); } else { if (copy_from_user(dst, ubuf + offset, size)) return -EFAULT; } return 0; } /** * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate * @fpstate: The fpstate buffer to copy to * @kbuf: The UABI format buffer, if it comes from the kernel * @ubuf: The UABI format buffer, if it comes from userspace * @pkru: The location to write the PKRU value to * * Converts from the UABI format into the kernel internal hardware * dependent format. * * This function ultimately has three different callers with distinct PKRU * behavior. * 1. When called from sigreturn the PKRU register will be restored from * @fpstate via an XRSTOR. Correctly copying the UABI format buffer to * @fpstate is sufficient to cover this case, but the caller will also * pass a pointer to the thread_struct's pkru field in @pkru and updating * it is harmless. * 2. When called from ptrace the PKRU register will be restored from the * thread_struct's pkru field. A pointer to that is passed in @pkru. * The kernel will restore it manually, so the XRSTOR behavior that resets * the PKRU register to the hardware init value (0) if the corresponding * xfeatures bit is not set is emulated here. * 3. When called from KVM the PKRU register will be restored from the vcpu's * pkru field. A pointer to that is passed in @pkru. KVM hasn't used * XRSTOR and hasn't had the PKRU resetting behavior described above. To * preserve that KVM behavior, it passes NULL for @pkru if the xfeatures * bit is not set. */ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, const void __user *ubuf, u32 *pkru) { struct xregs_state *xsave = &fpstate->regs.xsave; unsigned int offset, size; struct xstate_header hdr; u64 mask; int i; offset = offsetof(struct xregs_state, header); if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf)) return -EFAULT; if (validate_user_xstate_header(&hdr, fpstate)) return -EINVAL; /* Validate MXCSR when any of the related features is in use */ mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM; if (hdr.xfeatures & mask) { u32 mxcsr[2]; offset = offsetof(struct fxregs_state, mxcsr); if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf)) return -EFAULT; /* Reserved bits in MXCSR must be zero. 
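 * (mxcsr_feature_mask is the capability mask reported by FXSAVE at boot;
 *  e.g. a sigframe that sets an undefined MXCSR bit such as bit 31 is
 *  rejected with -EINVAL here instead of letting a later XRSTOR raise #GP.)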
*/ if (mxcsr[0] & ~mxcsr_feature_mask) return -EINVAL; /* SSE and YMM require MXCSR even when FP is not in use. */ if (!(hdr.xfeatures & XFEATURE_MASK_FP)) { xsave->i387.mxcsr = mxcsr[0]; xsave->i387.mxcsr_mask = mxcsr[1]; } } for (i = 0; i < XFEATURE_MAX; i++) { mask = BIT_ULL(i); if (hdr.xfeatures & mask) { void *dst = __raw_xsave_addr(xsave, i); offset = xstate_offsets[i]; size = xstate_sizes[i]; if (copy_from_buffer(dst, offset, size, kbuf, ubuf)) return -EFAULT; } } if (hdr.xfeatures & XFEATURE_MASK_PKRU) { struct pkru_state *xpkru; xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU); *pkru = xpkru->pkru; } else { /* * KVM may pass NULL here to indicate that it does not need * PKRU updated. */ if (pkru) *pkru = 0; } /* * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': */ xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL; /* * Add back in the features that came in from userspace: */ xsave->header.xfeatures |= hdr.xfeatures; return 0; } /* * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] * format and copy to the target thread. Used by ptrace and KVM. */ int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru) { return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru); } /* * Convert from a sigreturn standard-format user-space buffer to kernel * XSAVE[S] format and copy to the target thread. This is called from the * sigreturn() and rt_sigreturn() system calls. */ int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf) { return copy_uabi_to_xstate(x86_task_fpu(tsk)->fpstate, NULL, ubuf, &tsk->thread.pkru); } static bool validate_independent_components(u64 mask) { u64 xchk; if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES))) return false; xchk = ~xfeatures_mask_independent(); if (WARN_ON_ONCE(!mask || mask & xchk)) return false; return true; } /** * xsaves - Save selected components to a kernel xstate buffer * @xstate: Pointer to the buffer * @mask: Feature mask to select the components to save * * The @xstate buffer must be 64 byte aligned and correctly initialized as * XSAVES does not write the full xstate header. Before first use the * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer * can #GP. * * The feature mask must be a subset of the independent features. */ void xsaves(struct xregs_state *xstate, u64 mask) { int err; if (!validate_independent_components(mask)) return; XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err); WARN_ON_ONCE(err); } /** * xrstors - Restore selected components from a kernel xstate buffer * @xstate: Pointer to the buffer * @mask: Feature mask to select the components to restore * * The @xstate buffer must be 64 byte aligned and correctly initialized * otherwise XRSTORS from that buffer can #GP. * * Proper usage is to restore the state which was saved with * xsaves() into @xstate. * * The feature mask must be a subset of the independent features. 
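 *
 * A typical user is the architectural LBR code, which saves XFEATURE_LBR
 * state with xsaves() when a perf context is scheduled out and restores
 * it here on the way back in. A sketch (not the exact call sites;
 * 'lbr_buf' stands for a 64-byte aligned, zero-initialized buffer):
 *
 *	xsaves(lbr_buf, XFEATURE_MASK_LBR);
 *	...
 *	xrstors(lbr_buf, XFEATURE_MASK_LBR);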
*/ void xrstors(struct xregs_state *xstate, u64 mask) { int err; if (!validate_independent_components(mask)) return; XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err); WARN_ON_ONCE(err); } #if IS_ENABLED(CONFIG_KVM) void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeature) { void *addr = get_xsave_addr(&fpstate->regs.xsave, xfeature); if (addr) memset(addr, 0, xstate_sizes[xfeature]); } EXPORT_SYMBOL_FOR_KVM(fpstate_clear_xstate_component); #endif #ifdef CONFIG_X86_64 #ifdef CONFIG_X86_DEBUG_FPU /* * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask * can safely operate on the @fpstate buffer. */ static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor) { u64 xfd = __this_cpu_read(xfd_state); if (fpstate->xfd == xfd) return true; /* * The XFD MSR does not match fpstate->xfd. That's invalid when * the passed in fpstate is current's fpstate. */ if (fpstate->xfd == x86_task_fpu(current)->fpstate->xfd) return false; /* * XRSTOR(S) from init_fpstate are always correct as it will just * bring all components into init state and not read from the * buffer. XSAVE(S) raises #PF after init. */ if (fpstate == &init_fpstate) return rstor; /* * XSAVE(S): clone(), fpu_swap_kvm_fpstate() * XRSTORS(S): fpu_swap_kvm_fpstate() */ /* * No XSAVE/XRSTOR instructions (except XSAVE itself) touch * the buffer area for XFD-disabled state components. */ mask &= ~xfd; /* * Remove features which are valid in fpstate. They * have space allocated in fpstate. */ mask &= ~fpstate->xfeatures; /* * Any remaining state components in 'mask' might be written * by XSAVE/XRSTOR. Fail validation it found. */ return !mask; } void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor)); } #endif /* CONFIG_X86_DEBUG_FPU */ static int __init xfd_update_static_branch(void) { /* * If init_fpstate.xfd has bits set then dynamic features are * available and the dynamic sizing must be enabled. */ if (init_fpstate.xfd) static_branch_enable(&__fpu_state_size_dynamic); return 0; } arch_initcall(xfd_update_static_branch) void fpstate_free(struct fpu *fpu) { if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate) vfree(fpu->fpstate); } /** * fpstate_realloc - Reallocate struct fpstate for the requested new features * * @xfeatures: A bitmap of xstate features which extend the enabled features * of that task * @ksize: The required size for the kernel buffer * @usize: The required size for user space buffers * @guest_fpu: Pointer to a guest FPU container. NULL for host allocations * * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer * terminates quickly, vfree()-induced IPIs may be a concern, but tasks * with large states are likely to live longer. * * Returns: 0 on success, -ENOMEM on allocation error. */ static int fpstate_realloc(u64 xfeatures, unsigned int ksize, unsigned int usize, struct fpu_guest *guest_fpu) { struct fpu *fpu = x86_task_fpu(current); struct fpstate *curfps, *newfps = NULL; unsigned int fpsize; bool in_use; fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64); newfps = vzalloc(fpsize); if (!newfps) return -ENOMEM; newfps->size = ksize; newfps->user_size = usize; newfps->is_valloc = true; /* * When a guest FPU is supplied, use @guest_fpu->fpstate * as reference independent whether it is in use or not. */ curfps = guest_fpu ? 
guest_fpu->fpstate : fpu->fpstate; /* Determine whether @curfps is the active fpstate */ in_use = fpu->fpstate == curfps; if (guest_fpu) { newfps->is_guest = true; newfps->is_confidential = curfps->is_confidential; newfps->in_use = curfps->in_use; guest_fpu->xfeatures |= xfeatures; guest_fpu->uabi_size = usize; } fpregs_lock(); /* * If @curfps is in use, ensure that the current state is in the * registers before swapping fpstate as that might invalidate it * due to layout changes. */ if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD)) fpregs_restore_userregs(); newfps->xfeatures = curfps->xfeatures | xfeatures; newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; newfps->xfd = curfps->xfd & ~xfeatures; /* Do the final updates within the locked region */ xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures); if (guest_fpu) { guest_fpu->fpstate = newfps; /* If curfps is active, update the FPU fpstate pointer */ if (in_use) fpu->fpstate = newfps; } else { fpu->fpstate = newfps; } if (in_use) xfd_update_state(fpu->fpstate); fpregs_unlock(); /* Only free valloc'ed state */ if (curfps && curfps->is_valloc) vfree(curfps); return 0; } static int validate_sigaltstack(unsigned int usize) { struct task_struct *thread, *leader = current->group_leader; unsigned long framesize = get_sigframe_size(); lockdep_assert_held(&current->sighand->siglock); /* get_sigframe_size() is based on fpu_user_cfg.max_size */ framesize -= fpu_user_cfg.max_size; framesize += usize; for_each_thread(leader, thread) { if (thread->sas_ss_size && thread->sas_ss_size < framesize) return -ENOSPC; } return 0; } static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) { /* * This deliberately does not exclude !XSAVES as we still might * decide to optionally context switch XCR0 or talk the silicon * vendors into extending XFD for the pre AMX states, especially * AVX512. */ bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); struct fpu *fpu = x86_task_fpu(current->group_leader); struct fpu_state_perm *perm; unsigned int ksize, usize; u64 mask; int ret = 0; /* Check whether fully enabled */ if ((permitted & requested) == requested) return 0; /* * Calculate the resulting kernel state size. Note, @permitted also * contains supervisor xfeatures even though supervisor states are always * permitted for kernel and guest FPUs, and never permitted for user * FPUs. */ mask = permitted | requested; ksize = xstate_calculate_size(mask, compacted); /* * Calculate the resulting user state size. Take care not to clobber * the supervisor xfeatures in the new mask! */ usize = xstate_calculate_size(mask & XFEATURE_MASK_USER_SUPPORTED, false); if (!guest) { ret = validate_sigaltstack(usize); if (ret) return ret; } perm = guest ? &fpu->guest_perm : &fpu->perm; /* Pairs with the READ_ONCE() in xstate_get_group_perm() */ WRITE_ONCE(perm->__state_perm, mask); /* Protected by sighand lock */ perm->__state_size = ksize; perm->__user_state_size = usize; return ret; } /* * Permissions array to map facilities with more than one component */ static const u64 xstate_prctl_req[XFEATURE_MAX] = { [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA, }; static int xstate_request_perm(unsigned long idx, bool guest) { u64 permitted, requested; int ret; if (idx >= XFEATURE_MAX) return -EINVAL; /* * Look up the facility mask which can require more than * one xstate component. 
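 * (For AMX the caller passes XFEATURE_XTILE_DATA (18) as @idx and the
 *  table resolves it to XFEATURE_MASK_XTILE_DATA; facilities without an
 *  entry in xstate_prctl_req[] are rejected with -EOPNOTSUPP.)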
*/ idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req)); requested = xstate_prctl_req[idx]; if (!requested) return -EOPNOTSUPP; if ((fpu_user_cfg.max_features & requested) != requested) return -EOPNOTSUPP; /* Lockless quick check */ permitted = xstate_get_group_perm(guest); if ((permitted & requested) == requested) return 0; /* Protect against concurrent modifications */ spin_lock_irq(&current->sighand->siglock); permitted = xstate_get_group_perm(guest); /* First vCPU allocation locks the permissions. */ if (guest && (permitted & FPU_GUEST_PERM_LOCKED)) ret = -EBUSY; else ret = __xstate_request_perm(permitted, requested, guest); spin_unlock_irq(&current->sighand->siglock); return ret; } int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu) { u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC; struct fpu_state_perm *perm; unsigned int ksize, usize; struct fpu *fpu; if (!xfd_event) { if (!guest_fpu) pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err); return 0; } /* Protect against concurrent modifications */ spin_lock_irq(&current->sighand->siglock); /* If not permitted let it die */ if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) { spin_unlock_irq(&current->sighand->siglock); return -EPERM; } fpu = x86_task_fpu(current->group_leader); perm = guest_fpu ? &fpu->guest_perm : &fpu->perm; ksize = perm->__state_size; usize = perm->__user_state_size; /* * The feature is permitted. State size is sufficient. Dropping * the lock is safe here even if more features are added from * another task; the retrieved buffer sizes are valid for the * currently requested feature(s). */ spin_unlock_irq(&current->sighand->siglock); /* * Try to allocate a new fpstate. If that fails there is no way * out. */ if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu)) return -EFAULT; return 0; } int xfd_enable_feature(u64 xfd_err) { return __xfd_enable_feature(xfd_err, NULL); } #else /* CONFIG_X86_64 */ static inline int xstate_request_perm(unsigned long idx, bool guest) { return -EPERM; } #endif /* !CONFIG_X86_64 */ u64 xstate_get_guest_group_perm(void) { return xstate_get_group_perm(true); } EXPORT_SYMBOL_FOR_KVM(xstate_get_guest_group_perm); /** * fpu_xstate_prctl - xstate permission operations * @option: A subfunction of arch_prctl() * @arg2: option argument * Return: 0 if successful; otherwise, an error code * * Option arguments: * * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info * ARCH_REQ_XCOMP_PERM: Facility number requested * * For facilities which require more than one XSTATE component, the request * must be the highest state component number related to that facility, * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18). */ long fpu_xstate_prctl(int option, unsigned long arg2) { u64 __user *uptr = (u64 __user *)arg2; u64 permitted, supported; unsigned long idx = arg2; bool guest = false; switch (option) { case ARCH_GET_XCOMP_SUPP: supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features; return put_user(supported, uptr); case ARCH_GET_XCOMP_PERM: /* * Lockless snapshot as it can also change right after * dropping the lock. 
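 * (Another thread doing ARCH_REQ_XCOMP_PERM can widen the group
 *  permission mask immediately after this read; the returned value is a
 *  point-in-time snapshot, which is all userspace can rely on anyway.)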
*/ permitted = xstate_get_host_group_perm(); permitted &= XFEATURE_MASK_USER_SUPPORTED; return put_user(permitted, uptr); case ARCH_GET_XCOMP_GUEST_PERM: permitted = xstate_get_guest_group_perm(); permitted &= XFEATURE_MASK_USER_SUPPORTED; return put_user(permitted, uptr); case ARCH_REQ_XCOMP_GUEST_PERM: guest = true; fallthrough; case ARCH_REQ_XCOMP_PERM: if (!IS_ENABLED(CONFIG_X86_64)) return -EOPNOTSUPP; return xstate_request_perm(idx, guest); default: return -EINVAL; } } #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * Report the amount of time elapsed in millisecond since last AVX512 * use in the task. Report -1 if no AVX-512 usage. */ static void avx512_status(struct seq_file *m, struct task_struct *task) { unsigned long timestamp; long delta = -1; /* AVX-512 usage is not tracked for kernel threads. Don't report anything. */ if (task->flags & (PF_KTHREAD | PF_USER_WORKER)) return; timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp); if (timestamp) { delta = (long)(jiffies - timestamp); /* * Cap to LONG_MAX if time difference > LONG_MAX */ if (delta < 0) delta = LONG_MAX; delta = jiffies_to_msecs(delta); } seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta); seq_putc(m, '\n'); } /* * Report architecture specific information */ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { /* * Report AVX512 state if the processor and build option supported. */ if (cpu_feature_enabled(X86_FEATURE_AVX512F)) avx512_status(m, task); return 0; } #endif /* CONFIG_PROC_PID_ARCH_STATUS */ #ifdef CONFIG_COREDUMP static const char owner_name[] = "LINUX"; /* * Dump type, size, offset and flag values for every xfeature that is present. */ static int dump_xsave_layout_desc(struct coredump_params *cprm) { int num_records = 0; int i; for_each_extended_xfeature(i, fpu_user_cfg.max_features) { struct x86_xfeat_component xc = { .type = i, .size = xstate_sizes[i], .offset = xstate_offsets[i], /* reserved for future use */ .flags = 0, }; if (!dump_emit(cprm, &xc, sizeof(xc))) return -1; num_records++; } return num_records; } static u32 get_xsave_desc_size(void) { u32 cnt = 0; u32 i; for_each_extended_xfeature(i, fpu_user_cfg.max_features) cnt++; return cnt * (sizeof(struct x86_xfeat_component)); } int elf_coredump_extra_notes_write(struct coredump_params *cprm) { int num_records = 0; struct elf_note en; if (!fpu_user_cfg.max_features) return 0; en.n_namesz = sizeof(owner_name); en.n_descsz = get_xsave_desc_size(); en.n_type = NT_X86_XSAVE_LAYOUT; if (!dump_emit(cprm, &en, sizeof(en))) return 1; if (!dump_emit(cprm, owner_name, en.n_namesz)) return 1; if (!dump_align(cprm, 4)) return 1; num_records = dump_xsave_layout_desc(cprm); if (num_records < 0) return 1; /* Total size should be equal to the number of records */ if ((sizeof(struct x86_xfeat_component) * num_records) != en.n_descsz) return 1; return 0; } int elf_coredump_extra_notes_size(void) { int size; if (!fpu_user_cfg.max_features) return 0; /* .note header */ size = sizeof(struct elf_note); /* Name plus alignment to 4 bytes */ size += roundup(sizeof(owner_name), 4); size += get_xsave_desc_size(); return size; } #endif /* CONFIG_COREDUMP */ |
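/*
 * End-to-end usage sketch (illustrative only, not part of this file):
 * before a task may use a dynamically-enabled feature such as AMX it must
 * request permission once per process, roughly:
 *
 *	#include <asm/prctl.h>
 *	#include <stdlib.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	// XFEATURE_XTILE_DATA == 18: the highest component of the AMX facility
 *	if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, 18) != 0)
 *		exit(1);	// AMX permission not granted or not supported
 *
 * The first tile instruction executed afterwards raises #NM because XFD is
 * still armed; the #NM handler calls xfd_enable_feature(), which allocates
 * a larger fpstate via fpstate_realloc() and disarms XFD for the permitted
 * features.
 */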
// SPDX-License-Identifier: GPL-2.0+ /* * the_nilfs shared structure. * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * * Written by Ryusuke Konishi. * */ #include <linux/buffer_head.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/log2.h> #include <linux/crc32.h> #include "nilfs.h" #include "segment.h" #include "alloc.h" #include "cpfile.h" #include "sufile.h" #include "dat.h" #include "segbuf.h" static int nilfs_valid_sb(struct nilfs_super_block *sbp); void nilfs_set_last_segment(struct the_nilfs *nilfs, sector_t start_blocknr, u64 seq, __u64 cno) { spin_lock(&nilfs->ns_last_segment_lock); nilfs->ns_last_pseg = start_blocknr; nilfs->ns_last_seq = seq; nilfs->ns_last_cno = cno; if (!nilfs_sb_dirty(nilfs)) { if (nilfs->ns_prev_seq == nilfs->ns_last_seq) goto stay_cursor; set_nilfs_sb_dirty(nilfs); } nilfs->ns_prev_seq = nilfs->ns_last_seq; stay_cursor: spin_unlock(&nilfs->ns_last_segment_lock); } /** * alloc_nilfs - allocate a nilfs object * @sb: super block instance * * Return: a pointer to the allocated nilfs object on success, or NULL on * failure. */ struct the_nilfs *alloc_nilfs(struct super_block *sb) { struct the_nilfs *nilfs; nilfs = kzalloc_obj(*nilfs); if (!nilfs) return NULL; nilfs->ns_sb = sb; nilfs->ns_bdev = sb->s_bdev; atomic_set(&nilfs->ns_ndirtyblks, 0); init_rwsem(&nilfs->ns_sem); mutex_init(&nilfs->ns_snapshot_mount_mutex); INIT_LIST_HEAD(&nilfs->ns_dirty_files); INIT_LIST_HEAD(&nilfs->ns_gc_inodes); spin_lock_init(&nilfs->ns_inode_lock); spin_lock_init(&nilfs->ns_last_segment_lock); nilfs->ns_cptree = RB_ROOT; spin_lock_init(&nilfs->ns_cptree_lock); init_rwsem(&nilfs->ns_segctor_sem); nilfs->ns_sb_update_freq = NILFS_SB_FREQ; return nilfs; } /** * destroy_nilfs - destroy nilfs object * @nilfs: nilfs object to be released */ void destroy_nilfs(struct the_nilfs *nilfs) { might_sleep(); if (nilfs_init(nilfs)) { brelse(nilfs->ns_sbh[0]); brelse(nilfs->ns_sbh[1]); } kfree(nilfs); } static int nilfs_load_super_root(struct the_nilfs *nilfs, struct super_block *sb, sector_t sr_block) { struct buffer_head *bh_sr; struct nilfs_super_root *raw_sr; struct nilfs_super_block **sbp = nilfs->ns_sbp; struct nilfs_inode *rawi; unsigned int dat_entry_size, segment_usage_size, checkpoint_size; unsigned int inode_size; int err; err = nilfs_read_super_root_block(nilfs, sr_block, &bh_sr, 1); if (unlikely(err)) return err; down_read(&nilfs->ns_sem); dat_entry_size = le16_to_cpu(sbp[0]->s_dat_entry_size); checkpoint_size = le16_to_cpu(sbp[0]->s_checkpoint_size); segment_usage_size = le16_to_cpu(sbp[0]->s_segment_usage_size); up_read(&nilfs->ns_sem); inode_size = nilfs->ns_inode_size; rawi = (void *)bh_sr->b_data + NILFS_SR_DAT_OFFSET(inode_size); err = nilfs_dat_read(sb, dat_entry_size, rawi, &nilfs->ns_dat); if (err) goto failed; rawi = (void *)bh_sr->b_data + NILFS_SR_CPFILE_OFFSET(inode_size); err = nilfs_cpfile_read(sb, checkpoint_size, rawi, &nilfs->ns_cpfile); if (err) goto failed_dat; rawi = (void *)bh_sr->b_data + NILFS_SR_SUFILE_OFFSET(inode_size); err = nilfs_sufile_read(sb, segment_usage_size, rawi, &nilfs->ns_sufile); if (err) goto failed_cpfile; raw_sr = (struct nilfs_super_root *)bh_sr->b_data; nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime); failed: brelse(bh_sr); return err; failed_cpfile: 
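	/*
	 * Error unwind: drop the metadata inodes in reverse order of
	 * creation, then reuse the common exit path above to release the
	 * super root buffer.
	 */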
iput(nilfs->ns_cpfile); failed_dat: iput(nilfs->ns_dat); goto failed; } static void nilfs_init_recovery_info(struct nilfs_recovery_info *ri) { memset(ri, 0, sizeof(*ri)); INIT_LIST_HEAD(&ri->ri_used_segments); } static void nilfs_clear_recovery_info(struct nilfs_recovery_info *ri) { nilfs_dispose_segment_list(&ri->ri_used_segments); } /** * nilfs_store_log_cursor - load log cursor from a super block * @nilfs: nilfs object * @sbp: buffer storing super block to be read * * nilfs_store_log_cursor() reads the last position of the log * containing a super root from a given super block, and initializes * relevant information on the nilfs object preparatory for log * scanning and recovery. * * Return: 0 on success, or %-EINVAL if current segment number is out * of range. */ static int nilfs_store_log_cursor(struct the_nilfs *nilfs, struct nilfs_super_block *sbp) { int ret = 0; nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg); nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno); nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq); nilfs->ns_prev_seq = nilfs->ns_last_seq; nilfs->ns_seg_seq = nilfs->ns_last_seq; nilfs->ns_segnum = nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg); nilfs->ns_cno = nilfs->ns_last_cno + 1; if (nilfs->ns_segnum >= nilfs->ns_nsegments) { nilfs_err(nilfs->ns_sb, "pointed segment number is out of range: segnum=%llu, nsegments=%lu", (unsigned long long)nilfs->ns_segnum, nilfs->ns_nsegments); ret = -EINVAL; } return ret; } /** * nilfs_get_blocksize - get block size from raw superblock data * @sb: super block instance * @sbp: superblock raw data buffer * @blocksize: place to store block size * * nilfs_get_blocksize() calculates the block size from the block size * exponent information written in @sbp and stores it in @blocksize, * or aborts with an error message if it's too large. * * Return: 0 on success, or %-EINVAL if the block size is too large. */ static int nilfs_get_blocksize(struct super_block *sb, struct nilfs_super_block *sbp, int *blocksize) { unsigned int shift_bits = le32_to_cpu(sbp->s_log_block_size); if (unlikely(shift_bits > ilog2(NILFS_MAX_BLOCK_SIZE) - BLOCK_SIZE_BITS)) { nilfs_err(sb, "too large filesystem blocksize: 2 ^ %u KiB", shift_bits); return -EINVAL; } *blocksize = BLOCK_SIZE << shift_bits; return 0; } /** * load_nilfs - load and recover the nilfs * @nilfs: the_nilfs structure to be released * @sb: super block instance used to recover past segment * * load_nilfs() searches and load the latest super root, * attaches the last segment, and does recovery if needed. * The caller must call this exclusively for simultaneous mounts. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - No valid segment found. * * %-EIO - I/O error. * * %-ENOMEM - Insufficient memory available. 
* * %-EROFS - Read only device or RO compat mode (if recovery is required) */ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) { struct nilfs_recovery_info ri; unsigned int s_flags = sb->s_flags; int really_read_only = bdev_read_only(nilfs->ns_bdev); int valid_fs = nilfs_valid_fs(nilfs); int err; if (!valid_fs) { nilfs_warn(sb, "mounting unchecked fs"); if (s_flags & SB_RDONLY) { nilfs_info(sb, "recovery required for readonly filesystem"); nilfs_info(sb, "write access will be enabled during recovery"); } } nilfs_init_recovery_info(&ri); err = nilfs_search_super_root(nilfs, &ri); if (unlikely(err)) { struct nilfs_super_block **sbp = nilfs->ns_sbp; int blocksize; if (err != -EINVAL) goto scan_error; if (!nilfs_valid_sb(sbp[1])) { nilfs_warn(sb, "unable to fall back to spare super block"); goto scan_error; } nilfs_info(sb, "trying rollback from an earlier position"); /* * restore super block with its spare and reconfigure * relevant states of the nilfs object. */ memcpy(sbp[0], sbp[1], nilfs->ns_sbsize); nilfs->ns_crc_seed = le32_to_cpu(sbp[0]->s_crc_seed); nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime); /* verify consistency between two super blocks */ err = nilfs_get_blocksize(sb, sbp[0], &blocksize); if (err) goto scan_error; if (blocksize != nilfs->ns_blocksize) { nilfs_warn(sb, "blocksize differs between two super blocks (%d != %d)", blocksize, nilfs->ns_blocksize); err = -EINVAL; goto scan_error; } err = nilfs_store_log_cursor(nilfs, sbp[0]); if (err) goto scan_error; /* drop clean flag to allow roll-forward and recovery */ nilfs->ns_mount_state &= ~NILFS_VALID_FS; valid_fs = 0; err = nilfs_search_super_root(nilfs, &ri); if (err) goto scan_error; } err = nilfs_load_super_root(nilfs, sb, ri.ri_super_root); if (unlikely(err)) { nilfs_err(sb, "error %d while loading super root", err); goto failed; } err = nilfs_sysfs_create_device_group(sb); if (unlikely(err)) goto sysfs_error; if (valid_fs) goto skip_recovery; if (s_flags & SB_RDONLY) { __u64 features; if (nilfs_test_opt(nilfs, NORECOVERY)) { nilfs_info(sb, "norecovery option specified, skipping roll-forward recovery"); goto skip_recovery; } features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) & ~NILFS_FEATURE_COMPAT_RO_SUPP; if (features) { nilfs_err(sb, "couldn't proceed with recovery because of unsupported optional features (%llx)", (unsigned long long)features); err = -EROFS; goto failed_unload; } if (really_read_only) { nilfs_err(sb, "write access unavailable, cannot proceed"); err = -EROFS; goto failed_unload; } sb->s_flags &= ~SB_RDONLY; } else if (nilfs_test_opt(nilfs, NORECOVERY)) { nilfs_err(sb, "recovery cancelled because norecovery option was specified for a read/write mount"); err = -EINVAL; goto failed_unload; } err = nilfs_salvage_orphan_logs(nilfs, sb, &ri); if (err) goto failed_unload; down_write(&nilfs->ns_sem); nilfs->ns_mount_state |= NILFS_VALID_FS; /* set "clean" flag */ err = nilfs_cleanup_super(sb); up_write(&nilfs->ns_sem); if (err) { nilfs_err(sb, "error %d updating super block. 
recovery unfinished.", err); goto failed_unload; } nilfs_info(sb, "recovery complete"); skip_recovery: nilfs_clear_recovery_info(&ri); sb->s_flags = s_flags; return 0; scan_error: nilfs_err(sb, "error %d while searching super root", err); goto failed; failed_unload: nilfs_sysfs_delete_device_group(nilfs); sysfs_error: iput(nilfs->ns_cpfile); iput(nilfs->ns_sufile); iput(nilfs->ns_dat); failed: nilfs_clear_recovery_info(&ri); sb->s_flags = s_flags; return err; } static unsigned long long nilfs_max_size(unsigned int blkbits) { unsigned int max_bits; unsigned long long res = MAX_LFS_FILESIZE; /* page cache limit */ max_bits = blkbits + NILFS_BMAP_KEY_BIT; /* bmap size limit */ if (max_bits < 64) res = min_t(unsigned long long, res, (1ULL << max_bits) - 1); return res; } /** * nilfs_nrsvsegs - calculate the number of reserved segments * @nilfs: nilfs object * @nsegs: total number of segments * * Return: Number of reserved segments. */ unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs) { return max_t(unsigned long, NILFS_MIN_NRSVSEGS, DIV_ROUND_UP(nsegs * nilfs->ns_r_segments_percentage, 100)); } /** * nilfs_max_segment_count - calculate the maximum number of segments * @nilfs: nilfs object * * Return: Maximum number of segments */ static u64 nilfs_max_segment_count(struct the_nilfs *nilfs) { u64 max_count = U64_MAX; max_count = div64_ul(max_count, nilfs->ns_blocks_per_segment); return min_t(u64, max_count, ULONG_MAX); } void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs) { nilfs->ns_nsegments = nsegs; nilfs->ns_nrsvsegs = nilfs_nrsvsegs(nilfs, nsegs); } static int nilfs_store_disk_layout(struct the_nilfs *nilfs, struct nilfs_super_block *sbp) { u64 nsegments, nblocks; if (le32_to_cpu(sbp->s_rev_level) < NILFS_MIN_SUPP_REV) { nilfs_err(nilfs->ns_sb, "unsupported revision (superblock rev.=%d.%d, current rev.=%d.%d). 
Please check the version of mkfs.nilfs(2).", le32_to_cpu(sbp->s_rev_level), le16_to_cpu(sbp->s_minor_rev_level), NILFS_CURRENT_REV, NILFS_MINOR_REV); return -EINVAL; } nilfs->ns_sbsize = le16_to_cpu(sbp->s_bytes); if (nilfs->ns_sbsize > BLOCK_SIZE) return -EINVAL; nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size); if (nilfs->ns_inode_size > nilfs->ns_blocksize) { nilfs_err(nilfs->ns_sb, "too large inode size: %d bytes", nilfs->ns_inode_size); return -EINVAL; } else if (nilfs->ns_inode_size < NILFS_MIN_INODE_SIZE) { nilfs_err(nilfs->ns_sb, "too small inode size: %d bytes", nilfs->ns_inode_size); return -EINVAL; } nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino); if (nilfs->ns_first_ino < NILFS_USER_INO) { nilfs_err(nilfs->ns_sb, "too small lower limit for non-reserved inode numbers: %u", nilfs->ns_first_ino); return -EINVAL; } nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) { nilfs_err(nilfs->ns_sb, "too short segment: %lu blocks", nilfs->ns_blocks_per_segment); return -EINVAL; } nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block); nilfs->ns_r_segments_percentage = le32_to_cpu(sbp->s_r_segments_percentage); if (nilfs->ns_r_segments_percentage < 1 || nilfs->ns_r_segments_percentage > 99) { nilfs_err(nilfs->ns_sb, "invalid reserved segments percentage: %lu", nilfs->ns_r_segments_percentage); return -EINVAL; } nsegments = le64_to_cpu(sbp->s_nsegments); if (nsegments > nilfs_max_segment_count(nilfs)) { nilfs_err(nilfs->ns_sb, "segment count %llu exceeds upper limit (%llu segments)", (unsigned long long)nsegments, (unsigned long long)nilfs_max_segment_count(nilfs)); return -EINVAL; } nblocks = sb_bdev_nr_blocks(nilfs->ns_sb); if (nblocks) { u64 min_block_count = nsegments * nilfs->ns_blocks_per_segment; /* * To avoid failing to mount early device images without a * second superblock, exclude that block count from the * "min_block_count" calculation. */ if (nblocks < min_block_count) { nilfs_err(nilfs->ns_sb, "total number of segment blocks %llu exceeds device size (%llu blocks)", (unsigned long long)min_block_count, (unsigned long long)nblocks); return -EINVAL; } } nilfs_set_nsegments(nilfs, nsegments); nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed); return 0; } static int nilfs_valid_sb(struct nilfs_super_block *sbp) { static unsigned char sum[4]; const int sumoff = offsetof(struct nilfs_super_block, s_sum); size_t bytes; u32 crc; if (!sbp || le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC) return 0; bytes = le16_to_cpu(sbp->s_bytes); if (bytes < sumoff + 4 || bytes > BLOCK_SIZE) return 0; crc = crc32_le(le32_to_cpu(sbp->s_crc_seed), (unsigned char *)sbp, sumoff); crc = crc32_le(crc, sum, 4); crc = crc32_le(crc, (unsigned char *)sbp + sumoff + 4, bytes - sumoff - 4); return crc == le32_to_cpu(sbp->s_sum); } /** * nilfs_sb2_bad_offset - check the location of the second superblock * @sbp: superblock raw data buffer * @offset: byte offset of second superblock calculated from device size * * nilfs_sb2_bad_offset() checks if the position on the second * superblock is valid or not based on the filesystem parameters * stored in @sbp. If @offset points to a location within the segment * area, or if the parameters themselves are not normal, it is * determined to be invalid. * * Return: true if invalid, false if valid. 
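 *
 * Worked example (illustrative numbers): with a 4 KiB block size
 * (shift_bits = 2), 8192 blocks per segment and 512 segments, the segment
 * area spans block indexes 0 .. 8192 * 512 - 1; an @offset whose block
 * index (offset >> 12) divided by 8192 is below 512 therefore lies inside
 * the segment area and is rejected as a bogus super block location.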
*/ static bool nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset) { unsigned int shift_bits = le32_to_cpu(sbp->s_log_block_size); u32 blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); u64 nsegments = le64_to_cpu(sbp->s_nsegments); u64 index; if (blocks_per_segment < NILFS_SEG_MIN_BLOCKS || shift_bits > ilog2(NILFS_MAX_BLOCK_SIZE) - BLOCK_SIZE_BITS) return true; index = offset >> (shift_bits + BLOCK_SIZE_BITS); do_div(index, blocks_per_segment); return index < nsegments; } static void nilfs_release_super_block(struct the_nilfs *nilfs) { int i; for (i = 0; i < 2; i++) { if (nilfs->ns_sbp[i]) { brelse(nilfs->ns_sbh[i]); nilfs->ns_sbh[i] = NULL; nilfs->ns_sbp[i] = NULL; } } } void nilfs_fall_back_super_block(struct the_nilfs *nilfs) { brelse(nilfs->ns_sbh[0]); nilfs->ns_sbh[0] = nilfs->ns_sbh[1]; nilfs->ns_sbp[0] = nilfs->ns_sbp[1]; nilfs->ns_sbh[1] = NULL; nilfs->ns_sbp[1] = NULL; } void nilfs_swap_super_block(struct the_nilfs *nilfs) { struct buffer_head *tsbh = nilfs->ns_sbh[0]; struct nilfs_super_block *tsbp = nilfs->ns_sbp[0]; nilfs->ns_sbh[0] = nilfs->ns_sbh[1]; nilfs->ns_sbp[0] = nilfs->ns_sbp[1]; nilfs->ns_sbh[1] = tsbh; nilfs->ns_sbp[1] = tsbp; } static int nilfs_load_super_block(struct the_nilfs *nilfs, struct super_block *sb, int blocksize, struct nilfs_super_block **sbpp) { struct nilfs_super_block **sbp = nilfs->ns_sbp; struct buffer_head **sbh = nilfs->ns_sbh; u64 sb2off, devsize = bdev_nr_bytes(nilfs->ns_bdev); int valid[2], swp = 0, older; if (devsize < NILFS_SEG_MIN_BLOCKS * NILFS_MIN_BLOCK_SIZE + 4096) { nilfs_err(sb, "device size too small"); return -EINVAL; } sb2off = NILFS_SB2_OFFSET_BYTES(devsize); sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize, &sbh[0]); sbp[1] = nilfs_read_super_block(sb, sb2off, blocksize, &sbh[1]); if (!sbp[0]) { if (!sbp[1]) { nilfs_err(sb, "unable to read superblock"); return -EIO; } nilfs_warn(sb, "unable to read primary superblock (blocksize = %d)", blocksize); } else if (!sbp[1]) { nilfs_warn(sb, "unable to read secondary superblock (blocksize = %d)", blocksize); } /* * Compare two super blocks and set 1 in swp if the secondary * super block is valid and newer. Otherwise, set 0 in swp. */ valid[0] = nilfs_valid_sb(sbp[0]); valid[1] = nilfs_valid_sb(sbp[1]); swp = valid[1] && (!valid[0] || le64_to_cpu(sbp[1]->s_last_cno) > le64_to_cpu(sbp[0]->s_last_cno)); if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) { brelse(sbh[1]); sbh[1] = NULL; sbp[1] = NULL; valid[1] = 0; swp = 0; } if (!valid[swp]) { nilfs_release_super_block(nilfs); nilfs_err(sb, "couldn't find nilfs on the device"); return -EINVAL; } if (!valid[!swp]) nilfs_warn(sb, "broken superblock, retrying with spare superblock (blocksize = %d)", blocksize); if (swp) nilfs_swap_super_block(nilfs); /* * Calculate the array index of the older superblock data. * If one has been dropped, set index 0 pointing to the remaining one, * otherwise set index 1 pointing to the old one (including if both * are the same). * * Divided case valid[0] valid[1] swp -> older * ------------------------------------------------------------- * Both SBs are invalid 0 0 N/A (Error) * SB1 is invalid 0 1 1 0 * SB2 is invalid 1 0 0 0 * SB2 is newer 1 1 1 0 * SB2 is older or the same 1 1 0 1 */ older = valid[1] ^ swp; nilfs->ns_sbwcount = 0; nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime); nilfs->ns_prot_seq = le64_to_cpu(sbp[older]->s_last_seq); *sbpp = sbp[0]; return 0; } /** * init_nilfs - initialize a NILFS instance. 
* @nilfs: the_nilfs structure * @sb: super block * * init_nilfs() performs common initialization per block device (e.g. * reading the super block, getting disk layout information, initializing * shared fields in the_nilfs). * * Return: 0 on success, or a negative error code on failure. */ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb) { struct nilfs_super_block *sbp; int blocksize; int err; blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE); if (!blocksize) { nilfs_err(sb, "unable to set blocksize"); err = -EINVAL; goto out; } err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp); if (err) goto out; err = nilfs_store_magic(sb, sbp); if (err) goto failed_sbh; err = nilfs_check_feature_compatibility(sb, sbp); if (err) goto failed_sbh; err = nilfs_get_blocksize(sb, sbp, &blocksize); if (err) goto failed_sbh; if (blocksize < NILFS_MIN_BLOCK_SIZE) { nilfs_err(sb, "couldn't mount because of unsupported filesystem blocksize %d", blocksize); err = -EINVAL; goto failed_sbh; } if (sb->s_blocksize != blocksize) { int hw_blocksize = bdev_logical_block_size(sb->s_bdev); if (blocksize < hw_blocksize) { nilfs_err(sb, "blocksize %d too small for device (sector-size = %d)", blocksize, hw_blocksize); err = -EINVAL; goto failed_sbh; } nilfs_release_super_block(nilfs); if (!sb_set_blocksize(sb, blocksize)) { nilfs_err(sb, "bad blocksize %d", blocksize); err = -EINVAL; goto out; } err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp); if (err) goto out; /* * Not to failed_sbh; sbh is released automatically * when reloading fails. */ } nilfs->ns_blocksize_bits = sb->s_blocksize_bits; nilfs->ns_blocksize = blocksize; err = nilfs_store_disk_layout(nilfs, sbp); if (err) goto failed_sbh; sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits); nilfs->ns_mount_state = le16_to_cpu(sbp->s_state); err = nilfs_store_log_cursor(nilfs, sbp); if (err) goto failed_sbh; set_nilfs_init(nilfs); err = 0; out: return err; failed_sbh: nilfs_release_super_block(nilfs); goto out; } int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, size_t nsegs) { sector_t seg_start, seg_end; sector_t start = 0, nblocks = 0; unsigned int sects_per_block; __u64 *sn; int ret = 0; sects_per_block = (1 << nilfs->ns_blocksize_bits) / bdev_logical_block_size(nilfs->ns_bdev); for (sn = segnump; sn < segnump + nsegs; sn++) { nilfs_get_segment_range(nilfs, *sn, &seg_start, &seg_end); if (!nblocks) { start = seg_start; nblocks = seg_end - seg_start + 1; } else if (start + nblocks == seg_start) { nblocks += seg_end - seg_start + 1; } else { ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, GFP_NOFS); if (ret < 0) return ret; nblocks = 0; } } if (nblocks) ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, GFP_NOFS); return ret; } int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) { unsigned long ncleansegs; ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment; return 0; } int nilfs_near_disk_full(struct the_nilfs *nilfs) { unsigned long ncleansegs, nincsegs; ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); nincsegs = atomic_read(&nilfs->ns_ndirtyblks) / nilfs->ns_blocks_per_segment + 1; return ncleansegs <= nilfs->ns_nrsvsegs + nincsegs; } struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno) { struct rb_node *n; struct nilfs_root *root; spin_lock(&nilfs->ns_cptree_lock); n = nilfs->ns_cptree.rb_node; while (n) { 
root = rb_entry(n, struct nilfs_root, rb_node); if (cno < root->cno) { n = n->rb_left; } else if (cno > root->cno) { n = n->rb_right; } else { refcount_inc(&root->count); spin_unlock(&nilfs->ns_cptree_lock); return root; } } spin_unlock(&nilfs->ns_cptree_lock); return NULL; } struct nilfs_root * nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) { struct rb_node **p, *parent; struct nilfs_root *root, *new; int err; root = nilfs_lookup_root(nilfs, cno); if (root) return root; new = kzalloc_obj(*root); if (!new) return NULL; spin_lock(&nilfs->ns_cptree_lock); p = &nilfs->ns_cptree.rb_node; parent = NULL; while (*p) { parent = *p; root = rb_entry(parent, struct nilfs_root, rb_node); if (cno < root->cno) { p = &(*p)->rb_left; } else if (cno > root->cno) { p = &(*p)->rb_right; } else { refcount_inc(&root->count); spin_unlock(&nilfs->ns_cptree_lock); kfree(new); return root; } } new->cno = cno; new->ifile = NULL; new->nilfs = nilfs; refcount_set(&new->count, 1); atomic64_set(&new->inodes_count, 0); atomic64_set(&new->blocks_count, 0); rb_link_node(&new->rb_node, parent, p); rb_insert_color(&new->rb_node, &nilfs->ns_cptree); spin_unlock(&nilfs->ns_cptree_lock); err = nilfs_sysfs_create_snapshot_group(new); if (err) { kfree(new); new = NULL; } return new; } void nilfs_put_root(struct nilfs_root *root) { struct the_nilfs *nilfs = root->nilfs; if (refcount_dec_and_lock(&root->count, &nilfs->ns_cptree_lock)) { rb_erase(&root->rb_node, &nilfs->ns_cptree); spin_unlock(&nilfs->ns_cptree_lock); nilfs_sysfs_delete_snapshot_group(root); iput(root->ifile); kfree(root); } }
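/*
 * Illustrative sketch only (not part of the kernel tree): how a
 * hypothetical caller might pin a checkpoint root with
 * nilfs_lookup_root() and release it again with nilfs_put_root().
 * The function name example_report_checkpoint() and its pr_info()
 * output are made up for illustration; the lookup/put pairing and the
 * atomic64 counters are taken from the code above.
 */
static int example_report_checkpoint(struct the_nilfs *nilfs, __u64 cno)
{
	struct nilfs_root *root;

	root = nilfs_lookup_root(nilfs, cno);	/* takes a reference on success */
	if (!root)
		return -ENOENT;			/* no root object for this checkpoint */

	pr_info("checkpoint %llu: %llu inodes, %llu blocks\n",
		(unsigned long long)cno,
		(unsigned long long)atomic64_read(&root->inodes_count),
		(unsigned long long)atomic64_read(&root->blocks_count));

	nilfs_put_root(root);			/* drops the reference; frees on last put */
	return 0;
}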
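/*
 * Standalone userspace illustration (not kernel code) of the "older"
 * superblock index selection in nilfs_load_super_block() above:
 * "older = valid[1] ^ swp" reproduces every row of the decision table
 * given in that function's comment.  Build with any C compiler and
 * run; all of the asserts hold.
 */
#include <assert.h>

int main(void)
{
	/* { valid[0], valid[1], swp, expected older index } */
	static const int cases[][4] = {
		{ 0, 1, 1, 0 },	/* SB1 invalid: the surviving copy ends up at index 0 */
		{ 1, 0, 0, 0 },	/* SB2 invalid: the surviving copy stays at index 0   */
		{ 1, 1, 1, 0 },	/* SB2 newer: after the swap the older SB is at 0     */
		{ 1, 1, 0, 1 },	/* SB2 older or the same: it stays at index 1         */
	};
	unsigned int i;

	for (i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
		assert((cases[i][1] ^ cases[i][2]) == cases[i][3]);
	return 0;
}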
// SPDX-License-Identifier: GPL-2.0-or-later /* * CIPSO - Commercial IP Security Option * * This is an implementation of the CIPSO
2.2 protocol as specified in * draft-ietf-cipso-ipsecurity-01.txt with additional tag types as found in * FIPS-188. While CIPSO never became a full IETF RFC standard many vendors * have chosen to adopt the protocol and over the years it has become a * de-facto standard for labeled networking. * * The CIPSO draft specification can be found in the kernel's Documentation * directory as well as the following URL: * https://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt * The FIPS-188 specification can be found at the following URL: * https://www.itl.nist.gov/fipspubs/fip188.htm * * Author: Paul Moore <paul.moore@hp.com> */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 */ #include <linux/init.h> #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/jhash.h> #include <linux/audit.h> #include <linux/slab.h> #include <net/ip.h> #include <net/icmp.h> #include <net/tcp.h> #include <net/netlabel.h> #include <net/cipso_ipv4.h> #include <linux/atomic.h> #include <linux/bug.h> #include <linux/unaligned.h> /* List of available DOI definitions */ /* XXX - This currently assumes a minimal number of different DOIs in use, * if in practice there are a lot of different DOIs this list should * probably be turned into a hash table or something similar so we * can do quick lookups. */ static DEFINE_SPINLOCK(cipso_v4_doi_list_lock); static LIST_HEAD(cipso_v4_doi_list); /* Label mapping cache */ int cipso_v4_cache_enabled = 1; int cipso_v4_cache_bucketsize = 10; #define CIPSO_V4_CACHE_BUCKETBITS 7 #define CIPSO_V4_CACHE_BUCKETS (1 << CIPSO_V4_CACHE_BUCKETBITS) #define CIPSO_V4_CACHE_REORDERLIMIT 10 struct cipso_v4_map_cache_bkt { spinlock_t lock; u32 size; struct list_head list; }; struct cipso_v4_map_cache_entry { u32 hash; unsigned char *key; size_t key_len; struct netlbl_lsm_cache *lsm_data; u32 activity; struct list_head list; }; static struct cipso_v4_map_cache_bkt *cipso_v4_cache; /* Restricted bitmap (tag #1) flags */ int cipso_v4_rbm_optfmt; int cipso_v4_rbm_strictvalid = 1; /* * Protocol Constants */ /* Maximum size of the CIPSO IP option, derived from the fact that the maximum * IPv4 header size is 60 bytes and the base IPv4 header is 20 bytes long. */ #define CIPSO_V4_OPT_LEN_MAX 40 /* Length of the base CIPSO option, this includes the option type (1 byte), the * option length (1 byte), and the DOI (4 bytes). */ #define CIPSO_V4_HDR_LEN 6 /* Base length of the restrictive category bitmap tag (tag #1). */ #define CIPSO_V4_TAG_RBM_BLEN 4 /* Base length of the enumerated category tag (tag #2). */ #define CIPSO_V4_TAG_ENUM_BLEN 4 /* Base length of the ranged categories bitmap tag (tag #5). */ #define CIPSO_V4_TAG_RNG_BLEN 4 /* The maximum number of category ranges permitted in the ranged category tag * (tag #5). You may note that the IETF draft states that the maximum number * of category ranges is 7, but if the low end of the last category range is * zero then it is possible to fit 8 category ranges because the zero should * be omitted. */ #define CIPSO_V4_TAG_RNG_CAT_MAX 8 /* Base length of the local tag (non-standard tag). 
* Tag definition (may change between kernel versions) * * 0 8 16 24 32 * +----------+----------+----------+----------+ * | 10000000 | 00000110 | 32-bit secid value | * +----------+----------+----------+----------+ * | in (host byte order)| * +----------+----------+ * */ #define CIPSO_V4_TAG_LOC_BLEN 6 /* * Helper Functions */ /** * cipso_v4_cache_entry_free - Frees a cache entry * @entry: the entry to free * * Description: * This function frees the memory associated with a cache entry including the * LSM cache data if there are no longer any users, i.e. reference count == 0. * */ static void cipso_v4_cache_entry_free(struct cipso_v4_map_cache_entry *entry) { if (entry->lsm_data) netlbl_secattr_cache_free(entry->lsm_data); kfree(entry->key); kfree(entry); } /** * cipso_v4_map_cache_hash - Hashing function for the CIPSO cache * @key: the hash key * @key_len: the length of the key in bytes * * Description: * The CIPSO tag hashing function. Returns a 32-bit hash value. * */ static u32 cipso_v4_map_cache_hash(const unsigned char *key, u32 key_len) { return jhash(key, key_len, 0); } /* * Label Mapping Cache Functions */ /** * cipso_v4_cache_init - Initialize the CIPSO cache * * Description: * Initializes the CIPSO label mapping cache, this function should be called * before any of the other functions defined in this file. Returns zero on * success, negative values on error. * */ static int __init cipso_v4_cache_init(void) { u32 iter; cipso_v4_cache = kzalloc_objs(struct cipso_v4_map_cache_bkt, CIPSO_V4_CACHE_BUCKETS); if (!cipso_v4_cache) return -ENOMEM; for (iter = 0; iter < CIPSO_V4_CACHE_BUCKETS; iter++) { spin_lock_init(&cipso_v4_cache[iter].lock); cipso_v4_cache[iter].size = 0; INIT_LIST_HEAD(&cipso_v4_cache[iter].list); } return 0; } /** * cipso_v4_cache_invalidate - Invalidates the current CIPSO cache * * Description: * Invalidates and frees any entries in the CIPSO cache. * */ void cipso_v4_cache_invalidate(void) { struct cipso_v4_map_cache_entry *entry, *tmp_entry; u32 iter; for (iter = 0; iter < CIPSO_V4_CACHE_BUCKETS; iter++) { spin_lock_bh(&cipso_v4_cache[iter].lock); list_for_each_entry_safe(entry, tmp_entry, &cipso_v4_cache[iter].list, list) { list_del(&entry->list); cipso_v4_cache_entry_free(entry); } cipso_v4_cache[iter].size = 0; spin_unlock_bh(&cipso_v4_cache[iter].lock); } } /** * cipso_v4_cache_check - Check the CIPSO cache for a label mapping * @key: the buffer to check * @key_len: buffer length in bytes * @secattr: the security attribute struct to use * * Description: * This function checks the cache to see if a label mapping already exists for * the given key. If there is a match then the cache is adjusted and the * @secattr struct is populated with the correct LSM security attributes. The * cache is adjusted in the following manner if the entry is not already the * first in the cache bucket: * * 1. The cache entry's activity counter is incremented * 2. The previous (higher ranking) entry's activity counter is decremented * 3. If the difference between the two activity counters is geater than * CIPSO_V4_CACHE_REORDERLIMIT the two entries are swapped * * Returns zero on success, -ENOENT for a cache miss, and other negative values * on error. 
* */ static int cipso_v4_cache_check(const unsigned char *key, u32 key_len, struct netlbl_lsm_secattr *secattr) { u32 bkt; struct cipso_v4_map_cache_entry *entry; struct cipso_v4_map_cache_entry *prev_entry = NULL; u32 hash; if (!READ_ONCE(cipso_v4_cache_enabled)) return -ENOENT; hash = cipso_v4_map_cache_hash(key, key_len); bkt = hash & (CIPSO_V4_CACHE_BUCKETS - 1); spin_lock_bh(&cipso_v4_cache[bkt].lock); list_for_each_entry(entry, &cipso_v4_cache[bkt].list, list) { if (entry->hash == hash && entry->key_len == key_len && memcmp(entry->key, key, key_len) == 0) { entry->activity += 1; refcount_inc(&entry->lsm_data->refcount); secattr->cache = entry->lsm_data; secattr->flags |= NETLBL_SECATTR_CACHE; secattr->type = NETLBL_NLTYPE_CIPSOV4; if (!prev_entry) { spin_unlock_bh(&cipso_v4_cache[bkt].lock); return 0; } if (prev_entry->activity > 0) prev_entry->activity -= 1; if (entry->activity > prev_entry->activity && entry->activity - prev_entry->activity > CIPSO_V4_CACHE_REORDERLIMIT) { __list_del(entry->list.prev, entry->list.next); __list_add(&entry->list, prev_entry->list.prev, &prev_entry->list); } spin_unlock_bh(&cipso_v4_cache[bkt].lock); return 0; } prev_entry = entry; } spin_unlock_bh(&cipso_v4_cache[bkt].lock); return -ENOENT; } /** * cipso_v4_cache_add - Add an entry to the CIPSO cache * @cipso_ptr: pointer to CIPSO IP option * @secattr: the packet's security attributes * * Description: * Add a new entry into the CIPSO label mapping cache. Add the new entry to * head of the cache bucket's list, if the cache bucket is out of room remove * the last entry in the list first. It is important to note that there is * currently no checking for duplicate keys. Returns zero on success, * negative values on failure. * */ int cipso_v4_cache_add(const unsigned char *cipso_ptr, const struct netlbl_lsm_secattr *secattr) { int bkt_size = READ_ONCE(cipso_v4_cache_bucketsize); int ret_val = -EPERM; u32 bkt; struct cipso_v4_map_cache_entry *entry = NULL; struct cipso_v4_map_cache_entry *old_entry = NULL; u32 cipso_ptr_len; if (!READ_ONCE(cipso_v4_cache_enabled) || bkt_size <= 0) return 0; cipso_ptr_len = cipso_ptr[1]; entry = kzalloc_obj(*entry, GFP_ATOMIC); if (!entry) return -ENOMEM; entry->key = kmemdup(cipso_ptr, cipso_ptr_len, GFP_ATOMIC); if (!entry->key) { ret_val = -ENOMEM; goto cache_add_failure; } entry->key_len = cipso_ptr_len; entry->hash = cipso_v4_map_cache_hash(cipso_ptr, cipso_ptr_len); refcount_inc(&secattr->cache->refcount); entry->lsm_data = secattr->cache; bkt = entry->hash & (CIPSO_V4_CACHE_BUCKETS - 1); spin_lock_bh(&cipso_v4_cache[bkt].lock); if (cipso_v4_cache[bkt].size < bkt_size) { list_add(&entry->list, &cipso_v4_cache[bkt].list); cipso_v4_cache[bkt].size += 1; } else { old_entry = list_entry(cipso_v4_cache[bkt].list.prev, struct cipso_v4_map_cache_entry, list); list_del(&old_entry->list); list_add(&entry->list, &cipso_v4_cache[bkt].list); cipso_v4_cache_entry_free(old_entry); } spin_unlock_bh(&cipso_v4_cache[bkt].lock); return 0; cache_add_failure: if (entry) cipso_v4_cache_entry_free(entry); return ret_val; } /* * DOI List Functions */ /** * cipso_v4_doi_search - Searches for a DOI definition * @doi: the DOI to search for * * Description: * Search the DOI definition list for a DOI definition with a DOI value that * matches @doi. The caller is responsible for calling rcu_read_[un]lock(). * Returns a pointer to the DOI definition on success and NULL on failure. 
*/ static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi) { struct cipso_v4_doi *iter; list_for_each_entry_rcu(iter, &cipso_v4_doi_list, list) if (iter->doi == doi && refcount_read(&iter->refcount)) return iter; return NULL; } /** * cipso_v4_doi_add - Add a new DOI to the CIPSO protocol engine * @doi_def: the DOI structure * @audit_info: NetLabel audit information * * Description: * The caller defines a new DOI for use by the CIPSO engine and calls this * function to add it to the list of acceptable domains. The caller must * ensure that the mapping table specified in @doi_def->map meets all of the * requirements of the mapping type (see cipso_ipv4.h for details). Returns * zero on success and non-zero on failure. * */ int cipso_v4_doi_add(struct cipso_v4_doi *doi_def, struct netlbl_audit *audit_info) { int ret_val = -EINVAL; u32 iter; u32 doi; u32 doi_type; struct audit_buffer *audit_buf; doi = doi_def->doi; doi_type = doi_def->type; if (doi_def->doi == CIPSO_V4_DOI_UNKNOWN) goto doi_add_return; for (iter = 0; iter < CIPSO_V4_TAG_MAXCNT; iter++) { switch (doi_def->tags[iter]) { case CIPSO_V4_TAG_RBITMAP: break; case CIPSO_V4_TAG_RANGE: case CIPSO_V4_TAG_ENUM: if (doi_def->type != CIPSO_V4_MAP_PASS) goto doi_add_return; break; case CIPSO_V4_TAG_LOCAL: if (doi_def->type != CIPSO_V4_MAP_LOCAL) goto doi_add_return; break; case CIPSO_V4_TAG_INVALID: if (iter == 0) goto doi_add_return; break; default: goto doi_add_return; } } refcount_set(&doi_def->refcount, 1); spin_lock(&cipso_v4_doi_list_lock); if (cipso_v4_doi_search(doi_def->doi)) { spin_unlock(&cipso_v4_doi_list_lock); ret_val = -EEXIST; goto doi_add_return; } list_add_tail_rcu(&doi_def->list, &cipso_v4_doi_list); spin_unlock(&cipso_v4_doi_list_lock); ret_val = 0; doi_add_return: audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_ADD, audit_info); if (audit_buf) { const char *type_str; switch (doi_type) { case CIPSO_V4_MAP_TRANS: type_str = "trans"; break; case CIPSO_V4_MAP_PASS: type_str = "pass"; break; case CIPSO_V4_MAP_LOCAL: type_str = "local"; break; default: type_str = "(unknown)"; } audit_log_format(audit_buf, " cipso_doi=%u cipso_type=%s res=%u", doi, type_str, ret_val == 0 ? 1 : 0); audit_log_end(audit_buf); } return ret_val; } /** * cipso_v4_doi_free - Frees a DOI definition * @doi_def: the DOI definition * * Description: * This function frees all of the memory associated with a DOI definition. * */ void cipso_v4_doi_free(struct cipso_v4_doi *doi_def) { if (!doi_def) return; switch (doi_def->type) { case CIPSO_V4_MAP_TRANS: kfree(doi_def->map.std->lvl.cipso); kfree(doi_def->map.std->lvl.local); kfree(doi_def->map.std->cat.cipso); kfree(doi_def->map.std->cat.local); kfree(doi_def->map.std); break; } kfree(doi_def); } /** * cipso_v4_doi_free_rcu - Frees a DOI definition via the RCU pointer * @entry: the entry's RCU field * * Description: * This function is designed to be used as a callback to the call_rcu() * function so that the memory allocated to the DOI definition can be released * safely. * */ static void cipso_v4_doi_free_rcu(struct rcu_head *entry) { struct cipso_v4_doi *doi_def; doi_def = container_of(entry, struct cipso_v4_doi, rcu); cipso_v4_doi_free(doi_def); } /** * cipso_v4_doi_remove - Remove an existing DOI from the CIPSO protocol engine * @doi: the DOI value * @audit_info: NetLabel audit information * * Description: * Removes a DOI definition from the CIPSO engine. The NetLabel routines will * be called to release their own LSM domain mappings as well as our own * domain list. 
Returns zero on success and negative values on failure. * */ int cipso_v4_doi_remove(u32 doi, struct netlbl_audit *audit_info) { int ret_val; struct cipso_v4_doi *doi_def; struct audit_buffer *audit_buf; spin_lock(&cipso_v4_doi_list_lock); doi_def = cipso_v4_doi_search(doi); if (!doi_def) { spin_unlock(&cipso_v4_doi_list_lock); ret_val = -ENOENT; goto doi_remove_return; } list_del_rcu(&doi_def->list); spin_unlock(&cipso_v4_doi_list_lock); cipso_v4_doi_putdef(doi_def); ret_val = 0; doi_remove_return: audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_DEL, audit_info); if (audit_buf) { audit_log_format(audit_buf, " cipso_doi=%u res=%u", doi, ret_val == 0 ? 1 : 0); audit_log_end(audit_buf); } return ret_val; } /** * cipso_v4_doi_getdef - Returns a reference to a valid DOI definition * @doi: the DOI value * * Description: * Searches for a valid DOI definition and if one is found it is returned to * the caller. Otherwise NULL is returned. The caller must ensure that * rcu_read_lock() is held while accessing the returned definition and the DOI * definition reference count is decremented when the caller is done. * */ struct cipso_v4_doi *cipso_v4_doi_getdef(u32 doi) { struct cipso_v4_doi *doi_def; rcu_read_lock(); doi_def = cipso_v4_doi_search(doi); if (!doi_def) goto doi_getdef_return; if (!refcount_inc_not_zero(&doi_def->refcount)) doi_def = NULL; doi_getdef_return: rcu_read_unlock(); return doi_def; } /** * cipso_v4_doi_putdef - Releases a reference for the given DOI definition * @doi_def: the DOI definition * * Description: * Releases a DOI definition reference obtained from cipso_v4_doi_getdef(). * */ void cipso_v4_doi_putdef(struct cipso_v4_doi *doi_def) { if (!doi_def) return; if (!refcount_dec_and_test(&doi_def->refcount)) return; cipso_v4_cache_invalidate(); call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu); } /** * cipso_v4_doi_walk - Iterate through the DOI definitions * @skip_cnt: skip past this number of DOI definitions, updated * @callback: callback for each DOI definition * @cb_arg: argument for the callback function * * Description: * Iterate over the DOI definition list, skipping the first @skip_cnt entries. * For each entry call @callback, if @callback returns a negative value stop * 'walking' through the list and return. Updates the value in @skip_cnt upon * return. Returns zero on success, negative values on failure. * */ int cipso_v4_doi_walk(u32 *skip_cnt, int (*callback) (struct cipso_v4_doi *doi_def, void *arg), void *cb_arg) { int ret_val = -ENOENT; u32 doi_cnt = 0; struct cipso_v4_doi *iter_doi; rcu_read_lock(); list_for_each_entry_rcu(iter_doi, &cipso_v4_doi_list, list) if (refcount_read(&iter_doi->refcount) > 0) { if (doi_cnt++ < *skip_cnt) continue; ret_val = callback(iter_doi, cb_arg); if (ret_val < 0) { doi_cnt--; goto doi_walk_return; } } doi_walk_return: rcu_read_unlock(); *skip_cnt = doi_cnt; return ret_val; } /* * Label Mapping Functions */ /** * cipso_v4_map_lvl_valid - Checks to see if the given level is understood * @doi_def: the DOI definition * @level: the level to check * * Description: * Checks the given level against the given DOI definition and returns a * negative value if the level does not have a valid mapping and a zero value * if the level is defined by the DOI. 
* */ static int cipso_v4_map_lvl_valid(const struct cipso_v4_doi *doi_def, u8 level) { switch (doi_def->type) { case CIPSO_V4_MAP_PASS: return 0; case CIPSO_V4_MAP_TRANS: if ((level < doi_def->map.std->lvl.cipso_size) && (doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL)) return 0; break; } return -EFAULT; } /** * cipso_v4_map_lvl_hton - Perform a level mapping from the host to the network * @doi_def: the DOI definition * @host_lvl: the host MLS level * @net_lvl: the network/CIPSO MLS level * * Description: * Perform a label mapping to translate a local MLS level to the correct * CIPSO level using the given DOI definition. Returns zero on success, * negative values otherwise. * */ static int cipso_v4_map_lvl_hton(const struct cipso_v4_doi *doi_def, u32 host_lvl, u32 *net_lvl) { switch (doi_def->type) { case CIPSO_V4_MAP_PASS: *net_lvl = host_lvl; return 0; case CIPSO_V4_MAP_TRANS: if (host_lvl < doi_def->map.std->lvl.local_size && doi_def->map.std->lvl.local[host_lvl] < CIPSO_V4_INV_LVL) { *net_lvl = doi_def->map.std->lvl.local[host_lvl]; return 0; } return -EPERM; } return -EINVAL; } /** * cipso_v4_map_lvl_ntoh - Perform a level mapping from the network to the host * @doi_def: the DOI definition * @net_lvl: the network/CIPSO MLS level * @host_lvl: the host MLS level * * Description: * Perform a label mapping to translate a CIPSO level to the correct local MLS * level using the given DOI definition. Returns zero on success, negative * values otherwise. * */ static int cipso_v4_map_lvl_ntoh(const struct cipso_v4_doi *doi_def, u32 net_lvl, u32 *host_lvl) { struct cipso_v4_std_map_tbl *map_tbl; switch (doi_def->type) { case CIPSO_V4_MAP_PASS: *host_lvl = net_lvl; return 0; case CIPSO_V4_MAP_TRANS: map_tbl = doi_def->map.std; if (net_lvl < map_tbl->lvl.cipso_size && map_tbl->lvl.cipso[net_lvl] < CIPSO_V4_INV_LVL) { *host_lvl = doi_def->map.std->lvl.cipso[net_lvl]; return 0; } return -EPERM; } return -EINVAL; } /** * cipso_v4_map_cat_rbm_valid - Checks to see if the category bitmap is valid * @doi_def: the DOI definition * @bitmap: category bitmap * @bitmap_len: bitmap length in bytes * * Description: * Checks the given category bitmap against the given DOI definition and * returns a negative value if any of the categories in the bitmap do not have * a valid mapping and a zero value if all of the categories are valid. * */ static int cipso_v4_map_cat_rbm_valid(const struct cipso_v4_doi *doi_def, const unsigned char *bitmap, u32 bitmap_len) { int cat = -1; u32 bitmap_len_bits = bitmap_len * 8; u32 cipso_cat_size; u32 *cipso_array; switch (doi_def->type) { case CIPSO_V4_MAP_PASS: return 0; case CIPSO_V4_MAP_TRANS: cipso_cat_size = doi_def->map.std->cat.cipso_size; cipso_array = doi_def->map.std->cat.cipso; for (;;) { cat = netlbl_bitmap_walk(bitmap, bitmap_len_bits, cat + 1, 1); if (cat < 0) break; if (cat >= cipso_cat_size || cipso_array[cat] >= CIPSO_V4_INV_CAT) return -EFAULT; } if (cat == -1) return 0; break; } return -EFAULT; } /** * cipso_v4_map_cat_rbm_hton - Perform a category mapping from host to network * @doi_def: the DOI definition * @secattr: the security attributes * @net_cat: the zero'd out category bitmap in network/CIPSO format * @net_cat_len: the length of the CIPSO bitmap in bytes * * Description: * Perform a label mapping to translate a local MLS category bitmap to the * correct CIPSO bitmap using the given DOI definition. Returns the minimum * size in bytes of the network bitmap on success, negative values otherwise. 
* */ static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *net_cat, u32 net_cat_len) { int host_spot = -1; u32 net_spot = CIPSO_V4_INV_CAT; u32 net_spot_max = 0; u32 net_clen_bits = net_cat_len * 8; u32 host_cat_size = 0; u32 *host_cat_array = NULL; if (doi_def->type == CIPSO_V4_MAP_TRANS) { host_cat_size = doi_def->map.std->cat.local_size; host_cat_array = doi_def->map.std->cat.local; } for (;;) { host_spot = netlbl_catmap_walk(secattr->attr.mls.cat, host_spot + 1); if (host_spot < 0) break; switch (doi_def->type) { case CIPSO_V4_MAP_PASS: net_spot = host_spot; break; case CIPSO_V4_MAP_TRANS: if (host_spot >= host_cat_size) return -EPERM; net_spot = host_cat_array[host_spot]; if (net_spot >= CIPSO_V4_INV_CAT) return -EPERM; break; } if (net_spot >= net_clen_bits) return -ENOSPC; netlbl_bitmap_setbit(net_cat, net_spot, 1); if (net_spot > net_spot_max) net_spot_max = net_spot; } if (++net_spot_max % 8) return net_spot_max / 8 + 1; return net_spot_max / 8; } /** * cipso_v4_map_cat_rbm_ntoh - Perform a category mapping from network to host * @doi_def: the DOI definition * @net_cat: the category bitmap in network/CIPSO format * @net_cat_len: the length of the CIPSO bitmap in bytes * @secattr: the security attributes * * Description: * Perform a label mapping to translate a CIPSO bitmap to the correct local * MLS category bitmap using the given DOI definition. Returns zero on * success, negative values on failure. * */ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def, const unsigned char *net_cat, u32 net_cat_len, struct netlbl_lsm_secattr *secattr) { int ret_val; int net_spot = -1; u32 host_spot = CIPSO_V4_INV_CAT; u32 net_clen_bits = net_cat_len * 8; u32 net_cat_size = 0; u32 *net_cat_array = NULL; if (doi_def->type == CIPSO_V4_MAP_TRANS) { net_cat_size = doi_def->map.std->cat.cipso_size; net_cat_array = doi_def->map.std->cat.cipso; } for (;;) { net_spot = netlbl_bitmap_walk(net_cat, net_clen_bits, net_spot + 1, 1); if (net_spot < 0) return 0; switch (doi_def->type) { case CIPSO_V4_MAP_PASS: host_spot = net_spot; break; case CIPSO_V4_MAP_TRANS: if (net_spot >= net_cat_size) return -EPERM; host_spot = net_cat_array[net_spot]; if (host_spot >= CIPSO_V4_INV_CAT) return -EPERM; break; } ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat, host_spot, GFP_ATOMIC); if (ret_val != 0) return ret_val; } return -EINVAL; } /** * cipso_v4_map_cat_enum_valid - Checks to see if the categories are valid * @doi_def: the DOI definition * @enumcat: category list * @enumcat_len: length of the category list in bytes * * Description: * Checks the given categories against the given DOI definition and returns a * negative value if any of the categories do not have a valid mapping and a * zero value if all of the categories are valid. 
* */ static int cipso_v4_map_cat_enum_valid(const struct cipso_v4_doi *doi_def, const unsigned char *enumcat, u32 enumcat_len) { u16 cat; int cat_prev = -1; u32 iter; if (doi_def->type != CIPSO_V4_MAP_PASS || enumcat_len & 0x01) return -EFAULT; for (iter = 0; iter < enumcat_len; iter += 2) { cat = get_unaligned_be16(&enumcat[iter]); if (cat <= cat_prev) return -EFAULT; cat_prev = cat; } return 0; } /** * cipso_v4_map_cat_enum_hton - Perform a category mapping from host to network * @doi_def: the DOI definition * @secattr: the security attributes * @net_cat: the zero'd out category list in network/CIPSO format * @net_cat_len: the length of the CIPSO category list in bytes * * Description: * Perform a label mapping to translate a local MLS category bitmap to the * correct CIPSO category list using the given DOI definition. Returns the * size in bytes of the network category bitmap on success, negative values * otherwise. * */ static int cipso_v4_map_cat_enum_hton(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *net_cat, u32 net_cat_len) { int cat = -1; u32 cat_iter = 0; for (;;) { cat = netlbl_catmap_walk(secattr->attr.mls.cat, cat + 1); if (cat < 0) break; if ((cat_iter + 2) > net_cat_len) return -ENOSPC; *((__be16 *)&net_cat[cat_iter]) = htons(cat); cat_iter += 2; } return cat_iter; } /** * cipso_v4_map_cat_enum_ntoh - Perform a category mapping from network to host * @doi_def: the DOI definition * @net_cat: the category list in network/CIPSO format * @net_cat_len: the length of the CIPSO bitmap in bytes * @secattr: the security attributes * * Description: * Perform a label mapping to translate a CIPSO category list to the correct * local MLS category bitmap using the given DOI definition. Returns zero on * success, negative values on failure. * */ static int cipso_v4_map_cat_enum_ntoh(const struct cipso_v4_doi *doi_def, const unsigned char *net_cat, u32 net_cat_len, struct netlbl_lsm_secattr *secattr) { int ret_val; u32 iter; for (iter = 0; iter < net_cat_len; iter += 2) { ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat, get_unaligned_be16(&net_cat[iter]), GFP_ATOMIC); if (ret_val != 0) return ret_val; } return 0; } /** * cipso_v4_map_cat_rng_valid - Checks to see if the categories are valid * @doi_def: the DOI definition * @rngcat: category list * @rngcat_len: length of the category list in bytes * * Description: * Checks the given categories against the given DOI definition and returns a * negative value if any of the categories do not have a valid mapping and a * zero value if all of the categories are valid. 
* */ static int cipso_v4_map_cat_rng_valid(const struct cipso_v4_doi *doi_def, const unsigned char *rngcat, u32 rngcat_len) { u16 cat_high; u16 cat_low; u32 cat_prev = CIPSO_V4_MAX_REM_CATS + 1; u32 iter; if (doi_def->type != CIPSO_V4_MAP_PASS || rngcat_len & 0x01) return -EFAULT; for (iter = 0; iter < rngcat_len; iter += 4) { cat_high = get_unaligned_be16(&rngcat[iter]); if ((iter + 4) <= rngcat_len) cat_low = get_unaligned_be16(&rngcat[iter + 2]); else cat_low = 0; if (cat_high > cat_prev) return -EFAULT; cat_prev = cat_low; } return 0; } /** * cipso_v4_map_cat_rng_hton - Perform a category mapping from host to network * @doi_def: the DOI definition * @secattr: the security attributes * @net_cat: the zero'd out category list in network/CIPSO format * @net_cat_len: the length of the CIPSO category list in bytes * * Description: * Perform a label mapping to translate a local MLS category bitmap to the * correct CIPSO category list using the given DOI definition. Returns the * size in bytes of the network category bitmap on success, negative values * otherwise. * */ static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *net_cat, u32 net_cat_len) { int iter = -1; u16 array[CIPSO_V4_TAG_RNG_CAT_MAX * 2]; u32 array_cnt = 0; u32 cat_size = 0; /* make sure we don't overflow the 'array[]' variable */ if (net_cat_len > (CIPSO_V4_OPT_LEN_MAX - CIPSO_V4_HDR_LEN - CIPSO_V4_TAG_RNG_BLEN)) return -ENOSPC; for (;;) { iter = netlbl_catmap_walk(secattr->attr.mls.cat, iter + 1); if (iter < 0) break; cat_size += (iter == 0 ? 0 : sizeof(u16)); if (cat_size > net_cat_len) return -ENOSPC; array[array_cnt++] = iter; iter = netlbl_catmap_walkrng(secattr->attr.mls.cat, iter); if (iter < 0) return -EFAULT; cat_size += sizeof(u16); if (cat_size > net_cat_len) return -ENOSPC; array[array_cnt++] = iter; } for (iter = 0; array_cnt > 0;) { *((__be16 *)&net_cat[iter]) = htons(array[--array_cnt]); iter += 2; array_cnt--; if (array[array_cnt] != 0) { *((__be16 *)&net_cat[iter]) = htons(array[array_cnt]); iter += 2; } } return cat_size; } /** * cipso_v4_map_cat_rng_ntoh - Perform a category mapping from network to host * @doi_def: the DOI definition * @net_cat: the category list in network/CIPSO format * @net_cat_len: the length of the CIPSO bitmap in bytes * @secattr: the security attributes * * Description: * Perform a label mapping to translate a CIPSO category list to the correct * local MLS category bitmap using the given DOI definition. Returns zero on * success, negative values on failure. * */ static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def, const unsigned char *net_cat, u32 net_cat_len, struct netlbl_lsm_secattr *secattr) { int ret_val; u32 net_iter; u16 cat_low; u16 cat_high; for (net_iter = 0; net_iter < net_cat_len; net_iter += 4) { cat_high = get_unaligned_be16(&net_cat[net_iter]); if ((net_iter + 4) <= net_cat_len) cat_low = get_unaligned_be16(&net_cat[net_iter + 2]); else cat_low = 0; ret_val = netlbl_catmap_setrng(&secattr->attr.mls.cat, cat_low, cat_high, GFP_ATOMIC); if (ret_val != 0) return ret_val; } return 0; } /* * Protocol Handling Functions */ /** * cipso_v4_gentag_hdr - Generate a CIPSO option header * @doi_def: the DOI definition * @len: the total tag length in bytes, not including this header * @buf: the CIPSO option buffer * * Description: * Write a CIPSO header into the beginning of @buffer. 
* */ static void cipso_v4_gentag_hdr(const struct cipso_v4_doi *doi_def, unsigned char *buf, u32 len) { buf[0] = IPOPT_CIPSO; buf[1] = CIPSO_V4_HDR_LEN + len; put_unaligned_be32(doi_def->doi, &buf[2]); } /** * cipso_v4_gentag_rbm - Generate a CIPSO restricted bitmap tag (type #1) * @doi_def: the DOI definition * @secattr: the security attributes * @buffer: the option buffer * @buffer_len: length of buffer in bytes * * Description: * Generate a CIPSO option using the restricted bitmap tag, tag type #1. The * actual buffer length may be larger than the indicated size due to * translation between host and network category bitmaps. Returns the size of * the tag on success, negative values on failure. * */ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *buffer, u32 buffer_len) { int ret_val; u32 tag_len; u32 level; if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0) return -EPERM; ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->attr.mls.lvl, &level); if (ret_val != 0) return ret_val; if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { ret_val = cipso_v4_map_cat_rbm_hton(doi_def, secattr, &buffer[4], buffer_len - 4); if (ret_val < 0) return ret_val; /* This will send packets using the "optimized" format when * possible as specified in section 3.4.2.6 of the * CIPSO draft. */ if (READ_ONCE(cipso_v4_rbm_optfmt) && ret_val > 0 && ret_val <= 10) tag_len = 14; else tag_len = 4 + ret_val; } else tag_len = 4; buffer[0] = CIPSO_V4_TAG_RBITMAP; buffer[1] = tag_len; buffer[3] = level; return tag_len; } /** * cipso_v4_parsetag_rbm - Parse a CIPSO restricted bitmap tag * @doi_def: the DOI definition * @tag: the CIPSO tag * @secattr: the security attributes * * Description: * Parse a CIPSO restricted bitmap tag (tag type #1) and return the security * attributes in @secattr. Return zero on success, negatives values on * failure. * */ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def, const unsigned char *tag, struct netlbl_lsm_secattr *secattr) { int ret_val; u8 tag_len = tag[1]; u32 level; ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); if (ret_val != 0) return ret_val; secattr->attr.mls.lvl = level; secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { ret_val = cipso_v4_map_cat_rbm_ntoh(doi_def, &tag[4], tag_len - 4, secattr); if (ret_val != 0) { netlbl_catmap_free(secattr->attr.mls.cat); return ret_val; } if (secattr->attr.mls.cat) secattr->flags |= NETLBL_SECATTR_MLS_CAT; } return 0; } /** * cipso_v4_gentag_enum - Generate a CIPSO enumerated tag (type #2) * @doi_def: the DOI definition * @secattr: the security attributes * @buffer: the option buffer * @buffer_len: length of buffer in bytes * * Description: * Generate a CIPSO option using the enumerated tag, tag type #2. Returns the * size of the tag on success, negative values on failure. 
* */ static int cipso_v4_gentag_enum(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *buffer, u32 buffer_len) { int ret_val; u32 tag_len; u32 level; if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL)) return -EPERM; ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->attr.mls.lvl, &level); if (ret_val != 0) return ret_val; if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { ret_val = cipso_v4_map_cat_enum_hton(doi_def, secattr, &buffer[4], buffer_len - 4); if (ret_val < 0) return ret_val; tag_len = 4 + ret_val; } else tag_len = 4; buffer[0] = CIPSO_V4_TAG_ENUM; buffer[1] = tag_len; buffer[3] = level; return tag_len; } /** * cipso_v4_parsetag_enum - Parse a CIPSO enumerated tag * @doi_def: the DOI definition * @tag: the CIPSO tag * @secattr: the security attributes * * Description: * Parse a CIPSO enumerated tag (tag type #2) and return the security * attributes in @secattr. Return zero on success, negatives values on * failure. * */ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def, const unsigned char *tag, struct netlbl_lsm_secattr *secattr) { int ret_val; u8 tag_len = tag[1]; u32 level; ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); if (ret_val != 0) return ret_val; secattr->attr.mls.lvl = level; secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { ret_val = cipso_v4_map_cat_enum_ntoh(doi_def, &tag[4], tag_len - 4, secattr); if (ret_val != 0) { netlbl_catmap_free(secattr->attr.mls.cat); return ret_val; } secattr->flags |= NETLBL_SECATTR_MLS_CAT; } return 0; } /** * cipso_v4_gentag_rng - Generate a CIPSO ranged tag (type #5) * @doi_def: the DOI definition * @secattr: the security attributes * @buffer: the option buffer * @buffer_len: length of buffer in bytes * * Description: * Generate a CIPSO option using the ranged tag, tag type #5. Returns the * size of the tag on success, negative values on failure. * */ static int cipso_v4_gentag_rng(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *buffer, u32 buffer_len) { int ret_val; u32 tag_len; u32 level; if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL)) return -EPERM; ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->attr.mls.lvl, &level); if (ret_val != 0) return ret_val; if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { ret_val = cipso_v4_map_cat_rng_hton(doi_def, secattr, &buffer[4], buffer_len - 4); if (ret_val < 0) return ret_val; tag_len = 4 + ret_val; } else tag_len = 4; buffer[0] = CIPSO_V4_TAG_RANGE; buffer[1] = tag_len; buffer[3] = level; return tag_len; } /** * cipso_v4_parsetag_rng - Parse a CIPSO ranged tag * @doi_def: the DOI definition * @tag: the CIPSO tag * @secattr: the security attributes * * Description: * Parse a CIPSO ranged tag (tag type #5) and return the security attributes * in @secattr. Return zero on success, negatives values on failure. 
* */ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def, const unsigned char *tag, struct netlbl_lsm_secattr *secattr) { int ret_val; u8 tag_len = tag[1]; u32 level; ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); if (ret_val != 0) return ret_val; secattr->attr.mls.lvl = level; secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { ret_val = cipso_v4_map_cat_rng_ntoh(doi_def, &tag[4], tag_len - 4, secattr); if (ret_val != 0) { netlbl_catmap_free(secattr->attr.mls.cat); return ret_val; } if (secattr->attr.mls.cat) secattr->flags |= NETLBL_SECATTR_MLS_CAT; } return 0; } /** * cipso_v4_gentag_loc - Generate a CIPSO local tag (non-standard) * @doi_def: the DOI definition * @secattr: the security attributes * @buffer: the option buffer * @buffer_len: length of buffer in bytes * * Description: * Generate a CIPSO option using the local tag. Returns the size of the tag * on success, negative values on failure. * */ static int cipso_v4_gentag_loc(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *buffer, u32 buffer_len) { if (!(secattr->flags & NETLBL_SECATTR_SECID)) return -EPERM; buffer[0] = CIPSO_V4_TAG_LOCAL; buffer[1] = CIPSO_V4_TAG_LOC_BLEN; *(u32 *)&buffer[2] = secattr->attr.secid; return CIPSO_V4_TAG_LOC_BLEN; } /** * cipso_v4_parsetag_loc - Parse a CIPSO local tag * @doi_def: the DOI definition * @tag: the CIPSO tag * @secattr: the security attributes * * Description: * Parse a CIPSO local tag and return the security attributes in @secattr. * Return zero on success, negatives values on failure. * */ static int cipso_v4_parsetag_loc(const struct cipso_v4_doi *doi_def, const unsigned char *tag, struct netlbl_lsm_secattr *secattr) { secattr->attr.secid = *(u32 *)&tag[2]; secattr->flags |= NETLBL_SECATTR_SECID; return 0; } /** * cipso_v4_optptr - Find the CIPSO option in the packet * @skb: the packet * * Description: * Parse the packet's IP header looking for a CIPSO option. Returns a pointer * to the start of the CIPSO option on success, NULL if one is not found. * */ unsigned char *cipso_v4_optptr(const struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); unsigned char *optptr = (unsigned char *)&(ip_hdr(skb)[1]); int optlen; int taglen; for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 1; ) { switch (optptr[0]) { case IPOPT_END: return NULL; case IPOPT_NOOP: taglen = 1; break; default: taglen = optptr[1]; } if (!taglen || taglen > optlen) return NULL; if (optptr[0] == IPOPT_CIPSO) return optptr; optlen -= taglen; optptr += taglen; } return NULL; } /** * cipso_v4_validate - Validate a CIPSO option * @skb: the packet * @option: the start of the option, on error it is set to point to the error * * Description: * This routine is called to validate a CIPSO option, it checks all of the * fields to ensure that they are at least valid, see the draft snippet below * for details. If the option is valid then a zero value is returned and * the value of @option is unchanged. If the option is invalid then a * non-zero value is returned and @option is adjusted to point to the * offending portion of the option. From the IETF draft ... * * "If any field within the CIPSO options, such as the DOI identifier, is not * recognized the IP datagram is discarded and an ICMP 'parameter problem' * (type 12) is generated and returned. The ICMP code field is set to 'bad * parameter' (code 0) and the pointer is set to the start of the CIPSO field * that is unrecognized." 
* */ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option) { unsigned char *opt = *option; unsigned char *tag; unsigned char opt_iter; unsigned char err_offset = 0; u8 opt_len; u8 tag_len; struct cipso_v4_doi *doi_def = NULL; u32 tag_iter; /* caller already checks for length values that are too large */ opt_len = opt[1]; if (opt_len < 8) { err_offset = 1; goto validate_return; } rcu_read_lock(); doi_def = cipso_v4_doi_search(get_unaligned_be32(&opt[2])); if (!doi_def) { err_offset = 2; goto validate_return_locked; } opt_iter = CIPSO_V4_HDR_LEN; tag = opt + opt_iter; while (opt_iter < opt_len) { for (tag_iter = 0; doi_def->tags[tag_iter] != tag[0];) if (doi_def->tags[tag_iter] == CIPSO_V4_TAG_INVALID || ++tag_iter == CIPSO_V4_TAG_MAXCNT) { err_offset = opt_iter; goto validate_return_locked; } if (opt_iter + 1 == opt_len) { err_offset = opt_iter; goto validate_return_locked; } tag_len = tag[1]; if (tag_len > (opt_len - opt_iter)) { err_offset = opt_iter + 1; goto validate_return_locked; } switch (tag[0]) { case CIPSO_V4_TAG_RBITMAP: if (tag_len < CIPSO_V4_TAG_RBM_BLEN) { err_offset = opt_iter + 1; goto validate_return_locked; } /* We are already going to do all the verification * necessary at the socket layer so from our point of * view it is safe to turn these checks off (and less * work), however, the CIPSO draft says we should do * all the CIPSO validations here but it doesn't * really specify _exactly_ what we need to validate * ... so, just make it a sysctl tunable. */ if (READ_ONCE(cipso_v4_rbm_strictvalid)) { if (cipso_v4_map_lvl_valid(doi_def, tag[3]) < 0) { err_offset = opt_iter + 3; goto validate_return_locked; } if (tag_len > CIPSO_V4_TAG_RBM_BLEN && cipso_v4_map_cat_rbm_valid(doi_def, &tag[4], tag_len - 4) < 0) { err_offset = opt_iter + 4; goto validate_return_locked; } } break; case CIPSO_V4_TAG_ENUM: if (tag_len < CIPSO_V4_TAG_ENUM_BLEN) { err_offset = opt_iter + 1; goto validate_return_locked; } if (cipso_v4_map_lvl_valid(doi_def, tag[3]) < 0) { err_offset = opt_iter + 3; goto validate_return_locked; } if (tag_len > CIPSO_V4_TAG_ENUM_BLEN && cipso_v4_map_cat_enum_valid(doi_def, &tag[4], tag_len - 4) < 0) { err_offset = opt_iter + 4; goto validate_return_locked; } break; case CIPSO_V4_TAG_RANGE: if (tag_len < CIPSO_V4_TAG_RNG_BLEN) { err_offset = opt_iter + 1; goto validate_return_locked; } if (cipso_v4_map_lvl_valid(doi_def, tag[3]) < 0) { err_offset = opt_iter + 3; goto validate_return_locked; } if (tag_len > CIPSO_V4_TAG_RNG_BLEN && cipso_v4_map_cat_rng_valid(doi_def, &tag[4], tag_len - 4) < 0) { err_offset = opt_iter + 4; goto validate_return_locked; } break; case CIPSO_V4_TAG_LOCAL: /* This is a non-standard tag that we only allow for * local connections, so if the incoming interface is * not the loopback device drop the packet. Further, * there is no legitimate reason for setting this from * userspace so reject it if skb is NULL. 
*/ if (!skb || !(skb->dev->flags & IFF_LOOPBACK)) { err_offset = opt_iter; goto validate_return_locked; } if (tag_len != CIPSO_V4_TAG_LOC_BLEN) { err_offset = opt_iter + 1; goto validate_return_locked; } break; default: err_offset = opt_iter; goto validate_return_locked; } tag += tag_len; opt_iter += tag_len; } validate_return_locked: rcu_read_unlock(); validate_return: *option = opt + err_offset; return err_offset; } /** * cipso_v4_error - Send the correct response for a bad packet * @skb: the packet * @error: the error code * @gateway: CIPSO gateway flag * * Description: * Based on the error code given in @error, send an ICMP error message back to * the originating host. From the IETF draft ... * * "If the contents of the CIPSO [option] are valid but the security label is * outside of the configured host or port label range, the datagram is * discarded and an ICMP 'destination unreachable' (type 3) is generated and * returned. The code field of the ICMP is set to 'communication with * destination network administratively prohibited' (code 9) or to * 'communication with destination host administratively prohibited' * (code 10). The value of the code is dependent on whether the originator * of the ICMP message is acting as a CIPSO host or a CIPSO gateway. The * recipient of the ICMP message MUST be able to handle either value. The * same procedure is performed if a CIPSO [option] can not be added to an * IP packet because it is too large to fit in the IP options area." * * "If the error is triggered by receipt of an ICMP message, the message is * discarded and no response is permitted (consistent with general ICMP * processing rules)." * */ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) { struct inet_skb_parm parm; int res; if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES) return; /* * We might be called above the IP layer, * so we can not use icmp_send and IPCB here. */ memset(&parm, 0, sizeof(parm)); parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); rcu_read_lock(); res = __ip_options_compile(dev_net(skb->dev), &parm.opt, skb, NULL); rcu_read_unlock(); if (res) return; if (gateway) __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, &parm); else __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, &parm); } /** * cipso_v4_genopt - Generate a CIPSO option * @buf: the option buffer * @buf_len: the size of opt_buf * @doi_def: the CIPSO DOI to use * @secattr: the security attributes * * Description: * Generate a CIPSO option using the DOI definition and security attributes * passed to the function. Returns the length of the option on success and * negative values on failure. * */ static int cipso_v4_genopt(unsigned char *buf, u32 buf_len, const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val; u32 iter; if (buf_len <= CIPSO_V4_HDR_LEN) return -ENOSPC; /* XXX - This code assumes only one tag per CIPSO option which isn't * really a good assumption to make but since we only support the MAC * tags right now it is a safe assumption. 
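 * The loop below walks doi_def->tags[] in order and emits the first tag
 * type for which the matching cipso_v4_gentag_*() helper can encode
 * @secattr, stopping at CIPSO_V4_TAG_INVALID or CIPSO_V4_TAG_MAXCNT.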
*/ iter = 0; do { memset(buf, 0, buf_len); switch (doi_def->tags[iter]) { case CIPSO_V4_TAG_RBITMAP: ret_val = cipso_v4_gentag_rbm(doi_def, secattr, &buf[CIPSO_V4_HDR_LEN], buf_len - CIPSO_V4_HDR_LEN); break; case CIPSO_V4_TAG_ENUM: ret_val = cipso_v4_gentag_enum(doi_def, secattr, &buf[CIPSO_V4_HDR_LEN], buf_len - CIPSO_V4_HDR_LEN); break; case CIPSO_V4_TAG_RANGE: ret_val = cipso_v4_gentag_rng(doi_def, secattr, &buf[CIPSO_V4_HDR_LEN], buf_len - CIPSO_V4_HDR_LEN); break; case CIPSO_V4_TAG_LOCAL: ret_val = cipso_v4_gentag_loc(doi_def, secattr, &buf[CIPSO_V4_HDR_LEN], buf_len - CIPSO_V4_HDR_LEN); break; default: return -EPERM; } iter++; } while (ret_val < 0 && iter < CIPSO_V4_TAG_MAXCNT && doi_def->tags[iter] != CIPSO_V4_TAG_INVALID); if (ret_val < 0) return ret_val; cipso_v4_gentag_hdr(doi_def, buf, ret_val); return CIPSO_V4_HDR_LEN + ret_val; } static int cipso_v4_get_actual_opt_len(const unsigned char *data, int len) { int iter = 0, optlen = 0; /* determining the new total option length is tricky because of * the padding necessary, the only thing i can think to do at * this point is walk the options one-by-one, skipping the * padding at the end to determine the actual option size and * from there we can determine the new total option length */ while (iter < len) { if (data[iter] == IPOPT_END) { break; } else if (data[iter] == IPOPT_NOP) { iter++; } else { iter += data[iter + 1]; optlen = iter; } } return optlen; } /** * cipso_v4_sock_setattr - Add a CIPSO option to a socket * @sk: the socket * @doi_def: the CIPSO DOI to use * @secattr: the specific security attributes of the socket * @sk_locked: true if caller holds the socket lock * * Description: * Set the CIPSO option on the given socket using the DOI definition and * security attributes passed to the function. This function requires * exclusive access to @sk, which means it either needs to be in the * process of being created or locked. Returns zero on success and negative * values on failure. * */ int cipso_v4_sock_setattr(struct sock *sk, const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, bool sk_locked) { int ret_val = -EPERM; unsigned char *buf = NULL; u32 buf_len; u32 opt_len; struct ip_options_rcu *old, *opt = NULL; struct inet_sock *sk_inet; struct inet_connection_sock *sk_conn; /* In the case of sock_create_lite(), the sock->sk field is not * defined yet but it is not a problem as the only users of these * "lite" PF_INET sockets are functions which do an accept() call * afterwards so we will label the socket as part of the accept(). */ if (!sk) return 0; /* We allocate the maximum CIPSO option size here so we are probably * being a little wasteful, but it makes our life _much_ easier later * on and after all we are only talking about 40 bytes. */ buf_len = CIPSO_V4_OPT_LEN_MAX; buf = kmalloc(buf_len, GFP_ATOMIC); if (!buf) { ret_val = -ENOMEM; goto socket_setattr_failure; } ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr); if (ret_val < 0) goto socket_setattr_failure; buf_len = ret_val; /* We can't use ip_options_get() directly because it makes a call to * ip_options_get_alloc() which allocates memory with GFP_KERNEL and * we won't always have CAP_NET_RAW even though we _always_ want to * set the IPOPT_CIPSO option. 
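 * Instead the option is copied into a freshly allocated ip_options_rcu
 * below, with the length rounded up to a 4-byte boundary as IP options
 * require (e.g. a 10 byte CIPSO option is stored in a 12 byte block,
 * the extra bytes left as zero padding).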
*/ opt_len = (buf_len + 3) & ~3; opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC); if (!opt) { ret_val = -ENOMEM; goto socket_setattr_failure; } memcpy(opt->opt.__data, buf, buf_len); opt->opt.optlen = opt_len; opt->opt.cipso = sizeof(struct iphdr); kfree(buf); buf = NULL; sk_inet = inet_sk(sk); old = rcu_dereference_protected(sk_inet->inet_opt, sk_locked); if (inet_test_bit(IS_ICSK, sk)) { sk_conn = inet_csk(sk); if (old) sk_conn->icsk_ext_hdr_len -= old->opt.optlen; sk_conn->icsk_ext_hdr_len += opt->opt.optlen; sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie); } rcu_assign_pointer(sk_inet->inet_opt, opt); if (old) kfree_rcu(old, rcu); return 0; socket_setattr_failure: kfree(buf); kfree(opt); return ret_val; } /** * cipso_v4_req_setattr - Add a CIPSO option to a connection request socket * @req: the connection request socket * @doi_def: the CIPSO DOI to use * @secattr: the specific security attributes of the socket * * Description: * Set the CIPSO option on the given socket using the DOI definition and * security attributes passed to the function. Returns zero on success and * negative values on failure. * */ int cipso_v4_req_setattr(struct request_sock *req, const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val = -EPERM; unsigned char *buf = NULL; u32 buf_len; u32 opt_len; struct ip_options_rcu *opt = NULL; struct inet_request_sock *req_inet; /* We allocate the maximum CIPSO option size here so we are probably * being a little wasteful, but it makes our life _much_ easier later * on and after all we are only talking about 40 bytes. */ buf_len = CIPSO_V4_OPT_LEN_MAX; buf = kmalloc(buf_len, GFP_ATOMIC); if (!buf) { ret_val = -ENOMEM; goto req_setattr_failure; } ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr); if (ret_val < 0) goto req_setattr_failure; buf_len = ret_val; /* We can't use ip_options_get() directly because it makes a call to * ip_options_get_alloc() which allocates memory with GFP_KERNEL and * we won't always have CAP_NET_RAW even though we _always_ want to * set the IPOPT_CIPSO option. */ opt_len = (buf_len + 3) & ~3; opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC); if (!opt) { ret_val = -ENOMEM; goto req_setattr_failure; } memcpy(opt->opt.__data, buf, buf_len); opt->opt.optlen = opt_len; opt->opt.cipso = sizeof(struct iphdr); kfree(buf); buf = NULL; req_inet = inet_rsk(req); opt = unrcu_pointer(xchg(&req_inet->ireq_opt, RCU_INITIALIZER(opt))); if (opt) kfree_rcu(opt, rcu); return 0; req_setattr_failure: kfree(buf); kfree(opt); return ret_val; } /** * cipso_v4_delopt - Delete the CIPSO option from a set of IP options * @opt_ptr: IP option pointer * * Description: * Deletes the CIPSO IP option from a set of IP options and makes the necessary * adjustments to the IP option structure. Returns zero on success, negative * values on failure. 
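 * (In practice the non-negative return value is the number of bytes
 * removed from the option block; callers such as cipso_v4_sock_delattr()
 * use it to shrink icsk_ext_hdr_len.)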
* */ static int cipso_v4_delopt(struct ip_options_rcu __rcu **opt_ptr) { struct ip_options_rcu *opt = rcu_dereference_protected(*opt_ptr, 1); int hdr_delta = 0; if (!opt || opt->opt.cipso == 0) return 0; if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) { u8 cipso_len; u8 cipso_off; unsigned char *cipso_ptr; int optlen_new; cipso_off = opt->opt.cipso - sizeof(struct iphdr); cipso_ptr = &opt->opt.__data[cipso_off]; cipso_len = cipso_ptr[1]; if (opt->opt.srr > opt->opt.cipso) opt->opt.srr -= cipso_len; if (opt->opt.rr > opt->opt.cipso) opt->opt.rr -= cipso_len; if (opt->opt.ts > opt->opt.cipso) opt->opt.ts -= cipso_len; if (opt->opt.router_alert > opt->opt.cipso) opt->opt.router_alert -= cipso_len; opt->opt.cipso = 0; memmove(cipso_ptr, cipso_ptr + cipso_len, opt->opt.optlen - cipso_off - cipso_len); optlen_new = cipso_v4_get_actual_opt_len(opt->opt.__data, opt->opt.optlen); hdr_delta = opt->opt.optlen; opt->opt.optlen = (optlen_new + 3) & ~3; hdr_delta -= opt->opt.optlen; } else { /* only the cipso option was present on the socket so we can * remove the entire option struct */ *opt_ptr = NULL; hdr_delta = opt->opt.optlen; kfree_rcu(opt, rcu); } return hdr_delta; } /** * cipso_v4_sock_delattr - Delete the CIPSO option from a socket * @sk: the socket * * Description: * Removes the CIPSO option from a socket, if present. * */ void cipso_v4_sock_delattr(struct sock *sk) { struct inet_sock *sk_inet; int hdr_delta; sk_inet = inet_sk(sk); hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt); if (inet_test_bit(IS_ICSK, sk) && hdr_delta > 0) { struct inet_connection_sock *sk_conn = inet_csk(sk); sk_conn->icsk_ext_hdr_len -= hdr_delta; sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie); } } /** * cipso_v4_req_delattr - Delete the CIPSO option from a request socket * @req: the request socket * * Description: * Removes the CIPSO option from a request socket, if present. * */ void cipso_v4_req_delattr(struct request_sock *req) { cipso_v4_delopt(&inet_rsk(req)->ireq_opt); } /** * cipso_v4_getattr - Helper function for the cipso_v4_*_getattr functions * @cipso: the CIPSO v4 option * @secattr: the security attributes * * Description: * Inspect @cipso and return the security attributes in @secattr. Returns zero * on success and negative values on failure. * */ int cipso_v4_getattr(const unsigned char *cipso, struct netlbl_lsm_secattr *secattr) { int ret_val = -ENOMSG; u32 doi; struct cipso_v4_doi *doi_def; if (cipso_v4_cache_check(cipso, cipso[1], secattr) == 0) return 0; doi = get_unaligned_be32(&cipso[2]); rcu_read_lock(); doi_def = cipso_v4_doi_search(doi); if (!doi_def) goto getattr_return; /* XXX - This code assumes only one tag per CIPSO option which isn't * really a good assumption to make but since we only support the MAC * tags right now it is a safe assumption. 
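 * The first tag starts right after the 6 byte CIPSO header (type, length
 * and 4 byte DOI), which is why cipso[6] below is treated as the type
 * octet of the only tag present.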
*/ switch (cipso[6]) { case CIPSO_V4_TAG_RBITMAP: ret_val = cipso_v4_parsetag_rbm(doi_def, &cipso[6], secattr); break; case CIPSO_V4_TAG_ENUM: ret_val = cipso_v4_parsetag_enum(doi_def, &cipso[6], secattr); break; case CIPSO_V4_TAG_RANGE: ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr); break; case CIPSO_V4_TAG_LOCAL: ret_val = cipso_v4_parsetag_loc(doi_def, &cipso[6], secattr); break; } if (ret_val == 0) secattr->type = NETLBL_NLTYPE_CIPSOV4; getattr_return: rcu_read_unlock(); return ret_val; } /** * cipso_v4_sock_getattr - Get the security attributes from a sock * @sk: the sock * @secattr: the security attributes * * Description: * Query @sk to see if there is a CIPSO option attached to the sock and if * there is return the CIPSO security attributes in @secattr. This function * requires that @sk be locked, or privately held, but it does not do any * locking itself. Returns zero on success and negative values on failure. * */ int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) { struct ip_options_rcu *opt; int res = -ENOMSG; rcu_read_lock(); opt = rcu_dereference(inet_sk(sk)->inet_opt); if (opt && opt->opt.cipso) res = cipso_v4_getattr(opt->opt.__data + opt->opt.cipso - sizeof(struct iphdr), secattr); rcu_read_unlock(); return res; } /** * cipso_v4_skbuff_setattr - Set the CIPSO option on a packet * @skb: the packet * @doi_def: the DOI structure * @secattr: the security attributes * * Description: * Set the CIPSO option on the given packet based on the security attributes. * Returns a pointer to the IP header on success and NULL on failure. * */ int cipso_v4_skbuff_setattr(struct sk_buff *skb, const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val; struct iphdr *iph; struct ip_options *opt = &IPCB(skb)->opt; unsigned char buf[CIPSO_V4_OPT_LEN_MAX]; u32 buf_len = CIPSO_V4_OPT_LEN_MAX; u32 opt_len; int len_delta; ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr); if (ret_val < 0) return ret_val; buf_len = ret_val; opt_len = (buf_len + 3) & ~3; /* we overwrite any existing options to ensure that we have enough * room for the CIPSO option, the reason is that we _need_ to guarantee * that the security label is applied to the packet - we do the same * thing when using the socket options and it hasn't caused a problem, * if we need to we can always revisit this choice later */ len_delta = opt_len - opt->optlen; /* if we don't ensure enough headroom we could panic on the skb_push() * call below so make sure we have enough, we are also "mangling" the * packet so we should probably do a copy-on-write call anyway */ ret_val = skb_cow(skb, skb_headroom(skb) + (len_delta > 0 ? 
len_delta : 0)); if (ret_val < 0) return ret_val; if (len_delta > 0) { /* we assume that the header + opt->optlen have already been * "pushed" in ip_options_build() or similar */ iph = ip_hdr(skb); skb_push(skb, len_delta); memmove((char *)iph - len_delta, iph, iph->ihl << 2); skb_reset_network_header(skb); iph = ip_hdr(skb); } else if (len_delta < 0) { iph = ip_hdr(skb); memset(iph + 1, IPOPT_NOP, opt->optlen); } else iph = ip_hdr(skb); if (opt->optlen > 0) memset(opt, 0, sizeof(*opt)); opt->optlen = opt_len; opt->cipso = sizeof(struct iphdr); opt->is_changed = 1; /* we have to do the following because we are being called from a * netfilter hook which means the packet already has had the header * fields populated and the checksum calculated - yes this means we * are doing more work than needed but we do it to keep the core * stack clean and tidy */ memcpy(iph + 1, buf, buf_len); if (opt_len > buf_len) memset((char *)(iph + 1) + buf_len, 0, opt_len - buf_len); if (len_delta != 0) { iph->ihl = 5 + (opt_len >> 2); iph_set_totlen(iph, skb->len); } ip_send_check(iph); return 0; } /** * cipso_v4_skbuff_delattr - Delete any CIPSO options from a packet * @skb: the packet * * Description: * Removes any and all CIPSO options from the given packet. Returns zero on * success, negative values on failure. * */ int cipso_v4_skbuff_delattr(struct sk_buff *skb) { int ret_val, cipso_len, hdr_len_actual, new_hdr_len_actual, new_hdr_len, hdr_len_delta; struct iphdr *iph; struct ip_options *opt = &IPCB(skb)->opt; unsigned char *cipso_ptr; if (opt->cipso == 0) return 0; /* since we are changing the packet we should make a copy */ ret_val = skb_cow(skb, skb_headroom(skb)); if (ret_val < 0) return ret_val; iph = ip_hdr(skb); cipso_ptr = (unsigned char *)iph + opt->cipso; cipso_len = cipso_ptr[1]; hdr_len_actual = sizeof(struct iphdr) + cipso_v4_get_actual_opt_len((unsigned char *)(iph + 1), opt->optlen); new_hdr_len_actual = hdr_len_actual - cipso_len; new_hdr_len = (new_hdr_len_actual + 3) & ~3; hdr_len_delta = (iph->ihl << 2) - new_hdr_len; /* 1. shift any options after CIPSO to the left */ memmove(cipso_ptr, cipso_ptr + cipso_len, new_hdr_len_actual - opt->cipso); /* 2. move the whole IP header to its new place */ memmove((unsigned char *)iph + hdr_len_delta, iph, new_hdr_len_actual); /* 3. adjust the skb layout */ skb_pull(skb, hdr_len_delta); skb_reset_network_header(skb); iph = ip_hdr(skb); /* 4. re-fill new padding with IPOPT_END (may now be longer) */ memset((unsigned char *)iph + new_hdr_len_actual, IPOPT_END, new_hdr_len - new_hdr_len_actual); opt->optlen -= hdr_len_delta; opt->cipso = 0; opt->is_changed = 1; if (hdr_len_delta != 0) { iph->ihl = new_hdr_len >> 2; iph_set_totlen(iph, skb->len); } ip_send_check(iph); return 0; } /* * Setup Functions */ /** * cipso_v4_init - Initialize the CIPSO module * * Description: * Initialize the CIPSO module and prepare it for use. Returns zero on success * and negative values on failure. * */ static int __init cipso_v4_init(void) { int ret_val; ret_val = cipso_v4_cache_init(); if (ret_val != 0) panic("Failed to initialize the CIPSO/IPv4 cache (%d)\n", ret_val); return 0; } subsys_initcall(cipso_v4_init); |
// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
*/ #include <linux/bpf_verifier.h> #include <linux/btf.h> #include <linux/hashtable.h> #include <linux/jhash.h> #include <linux/slab.h> #include <linux/sort.h> #define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args) struct per_frame_masks { spis_t may_read; /* stack slots that may be read by this instruction */ spis_t must_write; /* stack slots written by this instruction */ spis_t live_before; /* stack slots that may be read by this insn and its successors */ }; /* * A function instance keyed by (callsite, depth). * Encapsulates read and write marks for each instruction in the function. * Marks are tracked for each frame up to @depth. */ struct func_instance { struct hlist_node hl_node; u32 callsite; /* call insn that invoked this subprog (subprog_start for depth 0) */ u32 depth; /* call depth (0 = entry subprog) */ u32 subprog; /* subprog index */ u32 subprog_start; /* cached env->subprog_info[subprog].start */ u32 insn_cnt; /* cached number of insns in the function */ /* Per frame, per instruction masks, frames allocated lazily. */ struct per_frame_masks *frames[MAX_CALL_FRAMES]; bool must_write_initialized; }; struct live_stack_query { struct func_instance *instances[MAX_CALL_FRAMES]; /* valid in range [0..curframe] */ u32 callsites[MAX_CALL_FRAMES]; /* callsite[i] = insn calling frame i+1 */ u32 curframe; u32 insn_idx; }; struct bpf_liveness { DECLARE_HASHTABLE(func_instances, 8); /* maps (depth, callsite) to func_instance */ struct live_stack_query live_stack_query; /* cache to avoid repetitive ht lookups */ u32 subprog_calls; /* analyze_subprog() invocations */ }; /* * Hash/compare key for func_instance: (depth, callsite). * For depth == 0 (entry subprog), @callsite is the subprog start insn. * For depth > 0, @callsite is the call instruction index that invoked the subprog. */ static u32 instance_hash(u32 callsite, u32 depth) { u32 key[2] = { depth, callsite }; return jhash2(key, 2, 0); } static struct func_instance *find_instance(struct bpf_verifier_env *env, u32 callsite, u32 depth) { struct bpf_liveness *liveness = env->liveness; struct func_instance *f; u32 key = instance_hash(callsite, depth); hash_for_each_possible(liveness->func_instances, f, hl_node, key) if (f->depth == depth && f->callsite == callsite) return f; return NULL; } static struct func_instance *call_instance(struct bpf_verifier_env *env, struct func_instance *caller, u32 callsite, int subprog) { u32 depth = caller ? caller->depth + 1 : 0; u32 subprog_start = env->subprog_info[subprog].start; u32 lookup_key = depth > 0 ? callsite : subprog_start; struct func_instance *f; u32 hash; f = find_instance(env, lookup_key, depth); if (f) return f; f = kvzalloc(sizeof(*f), GFP_KERNEL_ACCOUNT); if (!f) return ERR_PTR(-ENOMEM); f->callsite = lookup_key; f->depth = depth; f->subprog = subprog; f->subprog_start = subprog_start; f->insn_cnt = (env->subprog_info + subprog + 1)->start - subprog_start; hash = instance_hash(lookup_key, depth); hash_add(env->liveness->func_instances, &f->hl_node, hash); return f; } static struct func_instance *lookup_instance(struct bpf_verifier_env *env, struct bpf_verifier_state *st, u32 frameno) { u32 callsite, subprog_start; struct func_instance *f; u32 key, depth; subprog_start = env->subprog_info[st->frame[frameno]->subprogno].start; callsite = frameno > 0 ? st->frame[frameno]->callsite : subprog_start; for (depth = frameno; ; depth--) { key = depth > 0 ? 
callsite : subprog_start; f = find_instance(env, key, depth); if (f || depth == 0) return f; } } int bpf_stack_liveness_init(struct bpf_verifier_env *env) { env->liveness = kvzalloc_obj(*env->liveness, GFP_KERNEL_ACCOUNT); if (!env->liveness) return -ENOMEM; hash_init(env->liveness->func_instances); return 0; } void bpf_stack_liveness_free(struct bpf_verifier_env *env) { struct func_instance *instance; struct hlist_node *tmp; int bkt, i; if (!env->liveness) return; hash_for_each_safe(env->liveness->func_instances, bkt, tmp, instance, hl_node) { for (i = 0; i <= instance->depth; i++) kvfree(instance->frames[i]); kvfree(instance); } kvfree(env->liveness); } /* * Convert absolute instruction index @insn_idx to an index relative * to start of the function corresponding to @instance. */ static int relative_idx(struct func_instance *instance, u32 insn_idx) { return insn_idx - instance->subprog_start; } static struct per_frame_masks *get_frame_masks(struct func_instance *instance, u32 frame, u32 insn_idx) { if (!instance->frames[frame]) return NULL; return &instance->frames[frame][relative_idx(instance, insn_idx)]; } static struct per_frame_masks *alloc_frame_masks(struct func_instance *instance, u32 frame, u32 insn_idx) { struct per_frame_masks *arr; if (!instance->frames[frame]) { arr = kvzalloc_objs(*arr, instance->insn_cnt, GFP_KERNEL_ACCOUNT); instance->frames[frame] = arr; if (!arr) return ERR_PTR(-ENOMEM); } return get_frame_masks(instance, frame, insn_idx); } /* Accumulate may_read masks for @frame at @insn_idx */ static int mark_stack_read(struct func_instance *instance, u32 frame, u32 insn_idx, spis_t mask) { struct per_frame_masks *masks; masks = alloc_frame_masks(instance, frame, insn_idx); if (IS_ERR(masks)) return PTR_ERR(masks); masks->may_read = spis_or(masks->may_read, mask); return 0; } static int mark_stack_write(struct func_instance *instance, u32 frame, u32 insn_idx, spis_t mask) { struct per_frame_masks *masks; masks = alloc_frame_masks(instance, frame, insn_idx); if (IS_ERR(masks)) return PTR_ERR(masks); masks->must_write = spis_or(masks->must_write, mask); return 0; } int bpf_jmp_offset(struct bpf_insn *insn) { u8 code = insn->code; if (code == (BPF_JMP32 | BPF_JA)) return insn->imm; return insn->off; } __diag_push(); __diag_ignore_all("-Woverride-init", "Allow field initialization overrides for opcode_info_tbl"); /* * Returns an array of instructions succ, with succ->items[0], ..., * succ->items[n-1] with successor instructions, where n=succ->cnt */ inline struct bpf_iarray * bpf_insn_successors(struct bpf_verifier_env *env, u32 idx) { static const struct opcode_info { bool can_jump; bool can_fallthrough; } opcode_info_tbl[256] = { [0 ... 255] = {.can_jump = false, .can_fallthrough = true}, #define _J(code, ...) 
\ [BPF_JMP | code] = __VA_ARGS__, \ [BPF_JMP32 | code] = __VA_ARGS__ _J(BPF_EXIT, {.can_jump = false, .can_fallthrough = false}), _J(BPF_JA, {.can_jump = true, .can_fallthrough = false}), _J(BPF_JEQ, {.can_jump = true, .can_fallthrough = true}), _J(BPF_JNE, {.can_jump = true, .can_fallthrough = true}), _J(BPF_JLT, {.can_jump = true, .can_fallthrough = true}), _J(BPF_JLE, {.can_jump = true, .can_fallthrough = true}), _J(BPF_JGT, {.can_jump = true, .can_fallthrough = true}), _J(BPF_JGE, {.can_jump = true, .can_fallthrough = true}), _J(BPF_JSGT, {.can_jump = true, .can_fallthrough = true}), _J(BPF_JSGE, {.can_jump = true, .can_fallthrough = true}), _J(BPF_JSLT, {.can_jump = true, .can_fallthrough = true}), _J(BPF_JSLE, {.can_jump = true, .can_fallthrough = true}), _J(BPF_JCOND, {.can_jump = true, .can_fallthrough = true}), _J(BPF_JSET, {.can_jump = true, .can_fallthrough = true}), #undef _J }; struct bpf_prog *prog = env->prog; struct bpf_insn *insn = &prog->insnsi[idx]; const struct opcode_info *opcode_info; struct bpf_iarray *succ, *jt; int insn_sz; jt = env->insn_aux_data[idx].jt; if (unlikely(jt)) return jt; /* pre-allocated array of size up to 2; reset cnt, as it may have been used already */ succ = env->succ; succ->cnt = 0; opcode_info = &opcode_info_tbl[BPF_CLASS(insn->code) | BPF_OP(insn->code)]; insn_sz = bpf_is_ldimm64(insn) ? 2 : 1; if (opcode_info->can_fallthrough) succ->items[succ->cnt++] = idx + insn_sz; if (opcode_info->can_jump) succ->items[succ->cnt++] = idx + bpf_jmp_offset(insn) + 1; return succ; } __diag_pop(); static inline bool update_insn(struct bpf_verifier_env *env, struct func_instance *instance, u32 frame, u32 insn_idx) { spis_t new_before, new_after; struct per_frame_masks *insn, *succ_insn; struct bpf_iarray *succ; u32 s; bool changed; succ = bpf_insn_successors(env, insn_idx); if (succ->cnt == 0) return false; changed = false; insn = get_frame_masks(instance, frame, insn_idx); new_before = SPIS_ZERO; new_after = SPIS_ZERO; for (s = 0; s < succ->cnt; ++s) { succ_insn = get_frame_masks(instance, frame, succ->items[s]); new_after = spis_or(new_after, succ_insn->live_before); } /* * New "live_before" is a union of all "live_before" of successors * minus slots written by instruction plus slots read by instruction. 
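 * E.g. with new_after = {fp-8, fp-16}, must_write = {fp-8} and
 * may_read = {fp-24} the new live_before is {fp-16, fp-24}, i.e.: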
* new_before = (new_after & ~insn->must_write) | insn->may_read */ new_before = spis_or(spis_and(new_after, spis_not(insn->must_write)), insn->may_read); changed |= !spis_equal(new_before, insn->live_before); insn->live_before = new_before; return changed; } /* Fixed-point computation of @live_before marks */ static void update_instance(struct bpf_verifier_env *env, struct func_instance *instance) { u32 i, frame, po_start, po_end; int *insn_postorder = env->cfg.insn_postorder; struct bpf_subprog_info *subprog; bool changed; instance->must_write_initialized = true; subprog = &env->subprog_info[instance->subprog]; po_start = subprog->postorder_start; po_end = (subprog + 1)->postorder_start; /* repeat until fixed point is reached */ do { changed = false; for (frame = 0; frame <= instance->depth; frame++) { if (!instance->frames[frame]) continue; for (i = po_start; i < po_end; i++) changed |= update_insn(env, instance, frame, insn_postorder[i]); } } while (changed); } static bool is_live_before(struct func_instance *instance, u32 insn_idx, u32 frameno, u32 half_spi) { struct per_frame_masks *masks; masks = get_frame_masks(instance, frameno, insn_idx); return masks && spis_test_bit(masks->live_before, half_spi); } int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { struct live_stack_query *q = &env->liveness->live_stack_query; struct func_instance *instance; u32 frame; memset(q, 0, sizeof(*q)); for (frame = 0; frame <= st->curframe; frame++) { instance = lookup_instance(env, st, frame); if (IS_ERR_OR_NULL(instance)) q->instances[frame] = NULL; else q->instances[frame] = instance; if (frame < st->curframe) q->callsites[frame] = st->frame[frame + 1]->callsite; } q->curframe = st->curframe; q->insn_idx = st->insn_idx; return 0; } bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 half_spi) { /* * Slot is alive if it is read before q->insn_idx in current func instance, * or if for some outer func instance: * - alive before callsite if callsite calls callback, otherwise * - alive after callsite */ struct live_stack_query *q = &env->liveness->live_stack_query; struct func_instance *instance, *curframe_instance; u32 i, callsite, rel; int cur_delta, delta; bool alive = false; curframe_instance = q->instances[q->curframe]; if (!curframe_instance) return true; cur_delta = (int)curframe_instance->depth - (int)q->curframe; rel = frameno + cur_delta; if (rel <= curframe_instance->depth) alive = is_live_before(curframe_instance, q->insn_idx, rel, half_spi); if (alive) return true; for (i = frameno; i < q->curframe; i++) { instance = q->instances[i]; if (!instance) return true; /* Map actual frameno to frame index within this instance */ delta = (int)instance->depth - (int)i; rel = frameno + delta; if (rel > instance->depth) return true; /* Get callsite from verifier state, not from instance callchain */ callsite = q->callsites[i]; alive = bpf_calls_callback(env, callsite) ? is_live_before(instance, callsite, rel, half_spi) : is_live_before(instance, callsite + 1, rel, half_spi); if (alive) return true; } return false; } static char *fmt_subprog(struct bpf_verifier_env *env, int subprog) { const char *name = env->subprog_info[subprog].name; snprintf(env->tmp_str_buf, sizeof(env->tmp_str_buf), "subprog#%d%s%s", subprog, name ? " " : "", name ? 
name : ""); return env->tmp_str_buf; } static char *fmt_instance(struct bpf_verifier_env *env, struct func_instance *instance) { snprintf(env->tmp_str_buf, sizeof(env->tmp_str_buf), "(d%d,cs%d)", instance->depth, instance->callsite); return env->tmp_str_buf; } static int spi_off(int spi) { return -(spi + 1) * BPF_REG_SIZE; } /* * When both halves of an 8-byte SPI are set, print as "-8","-16",... * When only one half is set, print as "-4h","-8h",... * Runs of 3+ consecutive fully-set SPIs are collapsed: "fp0-8..-24" */ static char *fmt_spis_mask(struct bpf_verifier_env *env, int frame, bool first, spis_t spis) { int buf_sz = sizeof(env->tmp_str_buf); char *buf = env->tmp_str_buf; int spi, n, run_start; buf[0] = '\0'; for (spi = 0; spi < STACK_SLOTS / 2 && buf_sz > 0; spi++) { bool lo = spis_test_bit(spis, spi * 2); bool hi = spis_test_bit(spis, spi * 2 + 1); const char *space = first ? "" : " "; if (!lo && !hi) continue; if (!lo || !hi) { /* half-spi */ n = scnprintf(buf, buf_sz, "%sfp%d%d%s", space, frame, spi_off(spi) + (lo ? STACK_SLOT_SZ : 0), "h"); } else if (spi + 2 < STACK_SLOTS / 2 && spis_test_bit(spis, spi * 2 + 2) && spis_test_bit(spis, spi * 2 + 3) && spis_test_bit(spis, spi * 2 + 4) && spis_test_bit(spis, spi * 2 + 5)) { /* 3+ consecutive full spis */ run_start = spi; while (spi + 1 < STACK_SLOTS / 2 && spis_test_bit(spis, (spi + 1) * 2) && spis_test_bit(spis, (spi + 1) * 2 + 1)) spi++; n = scnprintf(buf, buf_sz, "%sfp%d%d..%d", space, frame, spi_off(run_start), spi_off(spi)); } else { /* just a full spi */ n = scnprintf(buf, buf_sz, "%sfp%d%d", space, frame, spi_off(spi)); } first = false; buf += n; buf_sz -= n; } return env->tmp_str_buf; } static void print_instance(struct bpf_verifier_env *env, struct func_instance *instance) { int start = env->subprog_info[instance->subprog].start; struct bpf_insn *insns = env->prog->insnsi; struct per_frame_masks *masks; int len = instance->insn_cnt; int insn_idx, frame, i; bool has_use, has_def; u64 pos, insn_pos; if (!(env->log.level & BPF_LOG_LEVEL2)) return; verbose(env, "stack use/def %s ", fmt_subprog(env, instance->subprog)); verbose(env, "%s:\n", fmt_instance(env, instance)); for (i = 0; i < len; i++) { insn_idx = start + i; has_use = false; has_def = false; pos = env->log.end_pos; verbose(env, "%3d: ", insn_idx); bpf_verbose_insn(env, &insns[insn_idx]); bpf_vlog_reset(&env->log, env->log.end_pos - 1); /* remove \n */ insn_pos = env->log.end_pos; verbose(env, "%*c;", bpf_vlog_alignment(insn_pos - pos), ' '); pos = env->log.end_pos; verbose(env, " use: "); for (frame = instance->depth; frame >= 0; --frame) { masks = get_frame_masks(instance, frame, insn_idx); if (!masks || spis_is_zero(masks->may_read)) continue; verbose(env, "%s", fmt_spis_mask(env, frame, !has_use, masks->may_read)); has_use = true; } if (!has_use) bpf_vlog_reset(&env->log, pos); pos = env->log.end_pos; verbose(env, " def: "); for (frame = instance->depth; frame >= 0; --frame) { masks = get_frame_masks(instance, frame, insn_idx); if (!masks || spis_is_zero(masks->must_write)) continue; verbose(env, "%s", fmt_spis_mask(env, frame, !has_def, masks->must_write)); has_def = true; } if (!has_def) bpf_vlog_reset(&env->log, has_use ? 
pos : insn_pos); verbose(env, "\n"); if (bpf_is_ldimm64(&insns[insn_idx])) i++; } } static int cmp_instances(const void *pa, const void *pb) { struct func_instance *a = *(struct func_instance **)pa; struct func_instance *b = *(struct func_instance **)pb; int dcallsite = (int)a->callsite - b->callsite; int ddepth = (int)a->depth - b->depth; if (dcallsite) return dcallsite; if (ddepth) return ddepth; return 0; } /* print use/def slots for all instances ordered by callsite first, then by depth */ static int print_instances(struct bpf_verifier_env *env) { struct func_instance *instance, **sorted_instances; struct bpf_liveness *liveness = env->liveness; int i, bkt, cnt; cnt = 0; hash_for_each(liveness->func_instances, bkt, instance, hl_node) cnt++; sorted_instances = kvmalloc_objs(*sorted_instances, cnt, GFP_KERNEL_ACCOUNT); if (!sorted_instances) return -ENOMEM; cnt = 0; hash_for_each(liveness->func_instances, bkt, instance, hl_node) sorted_instances[cnt++] = instance; sort(sorted_instances, cnt, sizeof(*sorted_instances), cmp_instances, NULL); for (i = 0; i < cnt; i++) print_instance(env, sorted_instances[i]); kvfree(sorted_instances); return 0; } /* * Per-register tracking state for compute_subprog_args(). * Tracks which frame's FP a value is derived from * and the byte offset from that frame's FP. * * The .frame field forms a lattice with three levels of precision: * * precise {frame=N, off=V} -- known absolute frame index and byte offset * | * offset-imprecise {frame=N, cnt=0} * | -- known frame identity, unknown offset * fully-imprecise {frame=ARG_IMPRECISE, mask=bitmask} * -- unknown frame identity; .mask is a * bitmask of which frame indices might be * involved * * At CFG merge points, arg_track_join() moves down the lattice: * - same frame + same offset -> precise * - same frame + different offset -> offset-imprecise * - different frames -> fully-imprecise (bitmask OR) * * At memory access sites (LDX/STX/ST), offset-imprecise marks only * the known frame's access mask as SPIS_ALL, while fully-imprecise * iterates bits in the bitmask and routes each frame to its target. 
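 *
 * (More precisely, up to MAX_ARG_OFFSETS distinct offsets are kept per
 * frame; a merge only degrades to offset-imprecise once that limit is
 * exceeded, see arg_merge_offsets().)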
*/ #define MAX_ARG_OFFSETS 4 struct arg_track { union { s16 off[MAX_ARG_OFFSETS]; /* byte offsets; off_cnt says how many */ u16 mask; /* arg bitmask when arg == ARG_IMPRECISE */ }; s8 frame; /* absolute frame index, or enum arg_track_state */ s8 off_cnt; /* 0 = offset-imprecise, 1-4 = # of precise offsets */ }; enum arg_track_state { ARG_NONE = -1, /* not derived from any argument */ ARG_UNVISITED = -2, /* not yet reached by dataflow */ ARG_IMPRECISE = -3, /* lost identity; .mask is arg bitmask */ }; /* Track callee stack slots fp-8 through fp-512 (64 slots of 8 bytes each) */ #define MAX_ARG_SPILL_SLOTS 64 static bool arg_is_visited(const struct arg_track *at) { return at->frame != ARG_UNVISITED; } static bool arg_is_fp(const struct arg_track *at) { return at->frame >= 0 || at->frame == ARG_IMPRECISE; } static void verbose_arg_track(struct bpf_verifier_env *env, struct arg_track *at) { int i; switch (at->frame) { case ARG_NONE: verbose(env, "_"); break; case ARG_UNVISITED: verbose(env, "?"); break; case ARG_IMPRECISE: verbose(env, "IMP%x", at->mask); break; default: /* frame >= 0: absolute frame index */ if (at->off_cnt == 0) { verbose(env, "fp%d ?", at->frame); } else { for (i = 0; i < at->off_cnt; i++) { if (i) verbose(env, "|"); verbose(env, "fp%d%+d", at->frame, at->off[i]); } } break; } } static bool arg_track_eq(const struct arg_track *a, const struct arg_track *b) { int i; if (a->frame != b->frame) return false; if (a->frame == ARG_IMPRECISE) return a->mask == b->mask; if (a->frame < 0) return true; if (a->off_cnt != b->off_cnt) return false; for (i = 0; i < a->off_cnt; i++) if (a->off[i] != b->off[i]) return false; return true; } static struct arg_track arg_single(s8 arg, s16 off) { struct arg_track at = {}; at.frame = arg; at.off[0] = off; at.off_cnt = 1; return at; } /* * Merge two sorted offset arrays, deduplicate. * Returns off_cnt=0 if the result exceeds MAX_ARG_OFFSETS. * Both args must have the same frame and off_cnt > 0. */ static struct arg_track arg_merge_offsets(struct arg_track a, struct arg_track b) { struct arg_track result = { .frame = a.frame }; struct arg_track imp = { .frame = a.frame }; int i = 0, j = 0, k = 0; while (i < a.off_cnt && j < b.off_cnt) { s16 v; if (a.off[i] <= b.off[j]) { v = a.off[i++]; if (v == b.off[j]) j++; } else { v = b.off[j++]; } if (k > 0 && result.off[k - 1] == v) continue; if (k >= MAX_ARG_OFFSETS) return imp; result.off[k++] = v; } while (i < a.off_cnt) { if (k >= MAX_ARG_OFFSETS) return imp; result.off[k++] = a.off[i++]; } while (j < b.off_cnt) { if (k >= MAX_ARG_OFFSETS) return imp; result.off[k++] = b.off[j++]; } result.off_cnt = k; return result; } /* * Merge two arg_tracks into ARG_IMPRECISE, collecting the frame * bits from both operands. Precise frame indices (frame >= 0) * contribute a single bit; existing ARG_IMPRECISE values * contribute their full bitmask. 
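 * E.g. joining a precise frame 2 value with an imprecise value whose mask
 * is 0x1 yields an imprecise value with mask 0x5 (BIT(2) | 0x1).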
*/ static struct arg_track arg_join_imprecise(struct arg_track a, struct arg_track b) { u32 m = 0; if (a.frame >= 0) m |= BIT(a.frame); else if (a.frame == ARG_IMPRECISE) m |= a.mask; if (b.frame >= 0) m |= BIT(b.frame); else if (b.frame == ARG_IMPRECISE) m |= b.mask; return (struct arg_track){ .mask = m, .frame = ARG_IMPRECISE }; } /* Join two arg_track values at merge points */ static struct arg_track __arg_track_join(struct arg_track a, struct arg_track b) { if (!arg_is_visited(&b)) return a; if (!arg_is_visited(&a)) return b; if (a.frame == b.frame && a.frame >= 0) { /* Both offset-imprecise: stay imprecise */ if (a.off_cnt == 0 || b.off_cnt == 0) return (struct arg_track){ .frame = a.frame }; /* Merge offset sets; falls back to off_cnt=0 if >4 */ return arg_merge_offsets(a, b); } /* * args are different, but one of them is known * arg + none -> arg * none + arg -> arg * * none + none -> none */ if (a.frame == ARG_NONE && b.frame == ARG_NONE) return a; if (a.frame >= 0 && b.frame == ARG_NONE) { /* * When joining single fp-N add fake fp+0 to * keep stack_use and prevent stack_def */ if (a.off_cnt == 1) return arg_merge_offsets(a, arg_single(a.frame, 0)); return a; } if (b.frame >= 0 && a.frame == ARG_NONE) { if (b.off_cnt == 1) return arg_merge_offsets(b, arg_single(b.frame, 0)); return b; } return arg_join_imprecise(a, b); } static bool arg_track_join(struct bpf_verifier_env *env, int idx, int target, int r, struct arg_track *in, struct arg_track out) { struct arg_track old = *in; struct arg_track new_val = __arg_track_join(old, out); if (arg_track_eq(&new_val, &old)) return false; *in = new_val; if (!(env->log.level & BPF_LOG_LEVEL2) || !arg_is_visited(&old)) return true; verbose(env, "arg JOIN insn %d -> %d ", idx, target); if (r >= 0) verbose(env, "r%d: ", r); else verbose(env, "fp%+d: ", r * 8); verbose_arg_track(env, &old); verbose(env, " + "); verbose_arg_track(env, &out); verbose(env, " => "); verbose_arg_track(env, &new_val); verbose(env, "\n"); return true; } /* * Compute the result when an ALU op destroys offset precision. * If a single arg is identifiable, preserve it with OFF_IMPRECISE. * If two different args are involved or one is already ARG_IMPRECISE, * the result is fully ARG_IMPRECISE. */ static void arg_track_alu64(struct arg_track *dst, const struct arg_track *src) { WARN_ON_ONCE(!arg_is_visited(dst)); WARN_ON_ONCE(!arg_is_visited(src)); if (dst->frame >= 0 && (src->frame == ARG_NONE || src->frame == dst->frame)) { /* * rX += rY where rY is not arg derived * rX += rX */ dst->off_cnt = 0; return; } if (src->frame >= 0 && dst->frame == ARG_NONE) { /* * rX += rY where rX is not arg derived * rY identity leaks into rX */ dst->off_cnt = 0; dst->frame = src->frame; return; } if (dst->frame == ARG_NONE && src->frame == ARG_NONE) return; *dst = arg_join_imprecise(*dst, *src); } static bool arg_add(s16 off, s64 delta, s16 *out) { s16 d = delta; if (d != delta) return true; return check_add_overflow(off, d, out); } static void arg_padd(struct arg_track *at, s64 delta) { int i; if (at->off_cnt == 0) return; for (i = 0; i < at->off_cnt; i++) { s16 new_off; if (arg_add(at->off[i], delta, &new_off)) { at->off_cnt = 0; return; } at->off[i] = new_off; } } /* * Convert a byte offset from FP to a callee stack slot index. * Returns -1 if out of range or not 8-byte aligned. * Slot 0 = fp-8, slot 1 = fp-16, ..., slot 7 = fp-64, .... 
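 * E.g. fp_off_to_slot(-8) == 0, fp_off_to_slot(-16) == 1,
 * fp_off_to_slot(-12) == -1 (not 8-byte aligned) and
 * fp_off_to_slot(8) == -1 (not below FP).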
*/ static int fp_off_to_slot(s16 off) { if (off >= 0 || off < -(int)(MAX_ARG_SPILL_SLOTS * 8)) return -1; if (off % 8) return -1; return (-off) / 8 - 1; } static struct arg_track fill_from_stack(struct bpf_insn *insn, struct arg_track *at_out, int reg, struct arg_track *at_stack_out, int depth) { struct arg_track imp = { .mask = (1u << (depth + 1)) - 1, .frame = ARG_IMPRECISE }; struct arg_track result = { .frame = ARG_NONE }; int cnt, i; if (reg == BPF_REG_FP) { int slot = fp_off_to_slot(insn->off); return slot >= 0 ? at_stack_out[slot] : imp; } cnt = at_out[reg].off_cnt; if (cnt == 0) return imp; for (i = 0; i < cnt; i++) { s16 fp_off, slot; if (arg_add(at_out[reg].off[i], insn->off, &fp_off)) return imp; slot = fp_off_to_slot(fp_off); if (slot < 0) return imp; result = __arg_track_join(result, at_stack_out[slot]); } return result; } /* * Spill @val to all possible stack slots indicated by the FP offsets in @reg. * For an 8-byte store, single candidate slot gets @val. multi-slots are joined. * sub-8-byte store joins with ARG_NONE. * When exact offset is unknown conservatively add reg values to all slots in at_stack_out. */ static void spill_to_stack(struct bpf_insn *insn, struct arg_track *at_out, int reg, struct arg_track *at_stack_out, struct arg_track *val, u32 sz) { struct arg_track none = { .frame = ARG_NONE }; struct arg_track new_val = sz == 8 ? *val : none; int cnt, i; if (reg == BPF_REG_FP) { int slot = fp_off_to_slot(insn->off); if (slot >= 0) at_stack_out[slot] = new_val; return; } cnt = at_out[reg].off_cnt; if (cnt == 0) { for (int slot = 0; slot < MAX_ARG_SPILL_SLOTS; slot++) at_stack_out[slot] = __arg_track_join(at_stack_out[slot], new_val); return; } for (i = 0; i < cnt; i++) { s16 fp_off; int slot; if (arg_add(at_out[reg].off[i], insn->off, &fp_off)) continue; slot = fp_off_to_slot(fp_off); if (slot < 0) continue; if (cnt == 1) at_stack_out[slot] = new_val; else at_stack_out[slot] = __arg_track_join(at_stack_out[slot], new_val); } } /* * Clear all tracked callee stack slots overlapping the byte range * [off, off+sz-1] where off is a negative FP-relative offset. */ static void clear_overlapping_stack_slots(struct arg_track *at_stack, s16 off, u32 sz, int cnt) { struct arg_track none = { .frame = ARG_NONE }; if (cnt == 0) { for (int i = 0; i < MAX_ARG_SPILL_SLOTS; i++) at_stack[i] = __arg_track_join(at_stack[i], none); return; } for (int i = 0; i < MAX_ARG_SPILL_SLOTS; i++) { int slot_start = -((i + 1) * 8); int slot_end = slot_start + 8; if (slot_start < off + (int)sz && slot_end > off) { if (cnt == 1) at_stack[i] = none; else at_stack[i] = __arg_track_join(at_stack[i], none); } } } /* * Clear stack slots overlapping all possible FP offsets in @reg. 
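 * A store through a pointer with several candidate offsets cannot prove
 * which slot it clobbers, so each overlapping slot is joined with
 * ARG_NONE rather than overwritten (see clear_overlapping_stack_slots()).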
*/ static void clear_stack_for_all_offs(struct bpf_insn *insn, struct arg_track *at_out, int reg, struct arg_track *at_stack_out, u32 sz) { int cnt, i; if (reg == BPF_REG_FP) { clear_overlapping_stack_slots(at_stack_out, insn->off, sz, 1); return; } cnt = at_out[reg].off_cnt; if (cnt == 0) { clear_overlapping_stack_slots(at_stack_out, 0, sz, cnt); return; } for (i = 0; i < cnt; i++) { s16 fp_off; if (arg_add(at_out[reg].off[i], insn->off, &fp_off)) { clear_overlapping_stack_slots(at_stack_out, 0, sz, 0); break; } clear_overlapping_stack_slots(at_stack_out, fp_off, sz, cnt); } } static void arg_track_log(struct bpf_verifier_env *env, struct bpf_insn *insn, int idx, struct arg_track *at_in, struct arg_track *at_stack_in, struct arg_track *at_out, struct arg_track *at_stack_out) { bool printed = false; int i; if (!(env->log.level & BPF_LOG_LEVEL2)) return; for (i = 0; i < MAX_BPF_REG; i++) { if (arg_track_eq(&at_out[i], &at_in[i])) continue; if (!printed) { verbose(env, "%3d: ", idx); bpf_verbose_insn(env, insn); bpf_vlog_reset(&env->log, env->log.end_pos - 1); printed = true; } verbose(env, "\tr%d: ", i); verbose_arg_track(env, &at_in[i]); verbose(env, " -> "); verbose_arg_track(env, &at_out[i]); } for (i = 0; i < MAX_ARG_SPILL_SLOTS; i++) { if (arg_track_eq(&at_stack_out[i], &at_stack_in[i])) continue; if (!printed) { verbose(env, "%3d: ", idx); bpf_verbose_insn(env, insn); bpf_vlog_reset(&env->log, env->log.end_pos - 1); printed = true; } verbose(env, "\tfp%+d: ", -(i + 1) * 8); verbose_arg_track(env, &at_stack_in[i]); verbose(env, " -> "); verbose_arg_track(env, &at_stack_out[i]); } if (printed) verbose(env, "\n"); } static bool can_be_local_fp(int depth, int regno, struct arg_track *at) { return regno == BPF_REG_FP || at->frame == depth || (at->frame == ARG_IMPRECISE && (at->mask & BIT(depth))); } /* * Pure dataflow transfer function for arg_track state. * Updates at_out[] based on how the instruction modifies registers. * Tracks spill/fill, but not other memory accesses. */ static void arg_track_xfer(struct bpf_verifier_env *env, struct bpf_insn *insn, int insn_idx, struct arg_track *at_out, struct arg_track *at_stack_out, struct func_instance *instance, u32 *callsites) { int depth = instance->depth; u8 class = BPF_CLASS(insn->code); u8 code = BPF_OP(insn->code); struct arg_track *dst = &at_out[insn->dst_reg]; struct arg_track *src = &at_out[insn->src_reg]; struct arg_track none = { .frame = ARG_NONE }; int r; if (class == BPF_ALU64 && BPF_SRC(insn->code) == BPF_K) { if (code == BPF_MOV) { *dst = none; } else if (dst->frame >= 0) { if (code == BPF_ADD) arg_padd(dst, insn->imm); else if (code == BPF_SUB) arg_padd(dst, -(s64)insn->imm); else /* Any other 64-bit alu on the pointer makes it imprecise */ dst->off_cnt = 0; } /* else if dst->frame is imprecise it stays so */ } else if (class == BPF_ALU64 && BPF_SRC(insn->code) == BPF_X) { if (code == BPF_MOV) { if (insn->off == 0) { *dst = *src; } else { /* addr_space_cast destroys a pointer */ *dst = none; } } else { arg_track_alu64(dst, src); } } else if (class == BPF_ALU) { /* * 32-bit alu destroys the pointer. * If src was a pointer it cannot leak into dst */ *dst = none; } else if (class == BPF_JMP && code == BPF_CALL) { /* * at_stack_out[slot] is not cleared by the helper and subprog calls. * The fill_from_stack() may return the stale spill — which is an FP-derived arg_track * (the value that was originally spilled there). 
The loaded register then carries * a phantom FP-derived identity that doesn't correspond to what's actually in the slot. * This phantom FP pointer propagates forward, and wherever it's subsequently used * (as a helper argument, another store, etc.), it sets stack liveness bits. * Those bits correspond to stack accesses that don't actually happen. * So the effect is over-reporting stack liveness — marking slots as live that aren't * actually accessed. The verifier preserves more state than necessary across calls, * which is conservative. * * helpers can scratch stack slots, but they won't make a valid pointer out of it. * subprogs are allowed to write into parent slots, but they cannot write * _any_ FP-derived pointer into it (either their own or parent's FP). */ for (r = BPF_REG_0; r <= BPF_REG_5; r++) at_out[r] = none; } else if (class == BPF_LDX) { u32 sz = bpf_size_to_bytes(BPF_SIZE(insn->code)); bool src_is_local_fp = can_be_local_fp(depth, insn->src_reg, src); /* * Reload from callee stack: if src is current-frame FP-derived * and the load is an 8-byte BPF_MEM, try to restore the spill * identity. For imprecise sources fill_from_stack() returns * ARG_IMPRECISE (off_cnt == 0). */ if (src_is_local_fp && BPF_MODE(insn->code) == BPF_MEM && sz == 8) { *dst = fill_from_stack(insn, at_out, insn->src_reg, at_stack_out, depth); } else if (src->frame >= 0 && src->frame < depth && BPF_MODE(insn->code) == BPF_MEM && sz == 8) { struct arg_track *parent_stack = env->callsite_at_stack[callsites[src->frame]]; *dst = fill_from_stack(insn, at_out, insn->src_reg, parent_stack, src->frame); } else if (src->frame == ARG_IMPRECISE && !(src->mask & BIT(depth)) && src->mask && BPF_MODE(insn->code) == BPF_MEM && sz == 8) { /* * Imprecise src with only parent-frame bits: * conservative fallback. */ *dst = *src; } else { *dst = none; } } else if (class == BPF_LD && BPF_MODE(insn->code) == BPF_IMM) { *dst = none; } else if (class == BPF_STX) { u32 sz = bpf_size_to_bytes(BPF_SIZE(insn->code)); bool dst_is_local_fp; /* Track spills to current-frame FP-derived callee stack */ dst_is_local_fp = can_be_local_fp(depth, insn->dst_reg, dst); if (dst_is_local_fp && BPF_MODE(insn->code) == BPF_MEM) spill_to_stack(insn, at_out, insn->dst_reg, at_stack_out, src, sz); if (BPF_MODE(insn->code) == BPF_ATOMIC) { if (dst_is_local_fp && insn->imm != BPF_LOAD_ACQ) clear_stack_for_all_offs(insn, at_out, insn->dst_reg, at_stack_out, sz); if (insn->imm == BPF_CMPXCHG) at_out[BPF_REG_0] = none; else if (insn->imm == BPF_LOAD_ACQ) *dst = none; else if (insn->imm & BPF_FETCH) *src = none; } } else if (class == BPF_ST && BPF_MODE(insn->code) == BPF_MEM) { u32 sz = bpf_size_to_bytes(BPF_SIZE(insn->code)); bool dst_is_local_fp = can_be_local_fp(depth, insn->dst_reg, dst); /* BPF_ST to FP-derived dst: clear overlapping stack slots */ if (dst_is_local_fp) clear_stack_for_all_offs(insn, at_out, insn->dst_reg, at_stack_out, sz); } } /* * Record access_bytes from helper/kfunc or load/store insn. * access_bytes > 0: stack read * access_bytes < 0: stack write * access_bytes == S64_MIN: unknown — conservative, mark [0..slot] as read * access_bytes == 0: no access * */ static int record_stack_access_off(struct func_instance *instance, s64 fp_off, s64 access_bytes, u32 frame, u32 insn_idx) { s32 slot_hi, slot_lo; spis_t mask; if (fp_off >= 0) /* * out of bounds stack access doesn't contribute * into actual stack liveness. It will be rejected * by the main verifier pass later. 
*/ return 0; if (access_bytes == S64_MIN) { /* helper/kfunc read unknown amount of bytes from fp_off until fp+0 */ slot_hi = (-fp_off - 1) / STACK_SLOT_SZ; mask = SPIS_ZERO; spis_or_range(&mask, 0, slot_hi); return mark_stack_read(instance, frame, insn_idx, mask); } if (access_bytes > 0) { /* Mark any touched slot as use */ slot_hi = (-fp_off - 1) / STACK_SLOT_SZ; slot_lo = max_t(s32, (-fp_off - access_bytes) / STACK_SLOT_SZ, 0); mask = SPIS_ZERO; spis_or_range(&mask, slot_lo, slot_hi); return mark_stack_read(instance, frame, insn_idx, mask); } else if (access_bytes < 0) { /* Mark only fully covered slots as def */ access_bytes = -access_bytes; slot_hi = (-fp_off) / STACK_SLOT_SZ - 1; slot_lo = max_t(s32, (-fp_off - access_bytes + STACK_SLOT_SZ - 1) / STACK_SLOT_SZ, 0); if (slot_lo <= slot_hi) { mask = SPIS_ZERO; spis_or_range(&mask, slot_lo, slot_hi); return mark_stack_write(instance, frame, insn_idx, mask); } } return 0; } /* * 'arg' is FP-derived argument to helper/kfunc or load/store that * reads (positive) or writes (negative) 'access_bytes' into 'use' or 'def'. */ static int record_stack_access(struct func_instance *instance, const struct arg_track *arg, s64 access_bytes, u32 frame, u32 insn_idx) { int i, err; if (access_bytes == 0) return 0; if (arg->off_cnt == 0) { if (access_bytes > 0 || access_bytes == S64_MIN) return mark_stack_read(instance, frame, insn_idx, SPIS_ALL); return 0; } if (access_bytes != S64_MIN && access_bytes < 0 && arg->off_cnt != 1) /* multi-offset write cannot set stack_def */ return 0; for (i = 0; i < arg->off_cnt; i++) { err = record_stack_access_off(instance, arg->off[i], access_bytes, frame, insn_idx); if (err) return err; } return 0; } /* * When a pointer is ARG_IMPRECISE, conservatively mark every frame in * the bitmask as fully used. */ static int record_imprecise(struct func_instance *instance, u32 mask, u32 insn_idx) { int depth = instance->depth; int f, err; for (f = 0; mask; f++, mask >>= 1) { if (!(mask & 1)) continue; if (f <= depth) { err = mark_stack_read(instance, f, insn_idx, SPIS_ALL); if (err) return err; } } return 0; } /* Record load/store access for a given 'at' state of 'insn'. 
*/ static int record_load_store_access(struct bpf_verifier_env *env, struct func_instance *instance, struct arg_track *at, int insn_idx) { struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; int depth = instance->depth; s32 sz = bpf_size_to_bytes(BPF_SIZE(insn->code)); u8 class = BPF_CLASS(insn->code); struct arg_track resolved, *ptr; int oi; switch (class) { case BPF_LDX: ptr = &at[insn->src_reg]; break; case BPF_STX: if (BPF_MODE(insn->code) == BPF_ATOMIC) { if (insn->imm == BPF_STORE_REL) sz = -sz; if (insn->imm == BPF_LOAD_ACQ) ptr = &at[insn->src_reg]; else ptr = &at[insn->dst_reg]; } else { ptr = &at[insn->dst_reg]; sz = -sz; } break; case BPF_ST: ptr = &at[insn->dst_reg]; sz = -sz; break; default: return 0; } /* Resolve offsets: fold insn->off into arg_track */ if (ptr->off_cnt > 0) { resolved.off_cnt = ptr->off_cnt; resolved.frame = ptr->frame; for (oi = 0; oi < ptr->off_cnt; oi++) { if (arg_add(ptr->off[oi], insn->off, &resolved.off[oi])) { resolved.off_cnt = 0; break; } } ptr = &resolved; } if (ptr->frame >= 0 && ptr->frame <= depth) return record_stack_access(instance, ptr, sz, ptr->frame, insn_idx); if (ptr->frame == ARG_IMPRECISE) return record_imprecise(instance, ptr->mask, insn_idx); /* ARG_NONE: not derived from any frame pointer, skip */ return 0; } /* Record stack access for a given 'at' state of helper/kfunc 'insn' */ static int record_call_access(struct bpf_verifier_env *env, struct func_instance *instance, struct arg_track *at, int insn_idx) { struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; int depth = instance->depth; struct bpf_call_summary cs; int r, err = 0, num_params = 5; if (bpf_pseudo_call(insn)) return 0; if (bpf_get_call_summary(env, insn, &cs)) num_params = cs.num_params; for (r = BPF_REG_1; r < BPF_REG_1 + num_params; r++) { int frame = at[r].frame; s64 bytes; if (!arg_is_fp(&at[r])) continue; if (bpf_helper_call(insn)) { bytes = bpf_helper_stack_access_bytes(env, insn, r - 1, insn_idx); } else if (bpf_pseudo_kfunc_call(insn)) { bytes = bpf_kfunc_stack_access_bytes(env, insn, r - 1, insn_idx); } else { for (int f = 0; f <= depth; f++) { err = mark_stack_read(instance, f, insn_idx, SPIS_ALL); if (err) return err; } return 0; } if (bytes == 0) continue; if (frame >= 0 && frame <= depth) err = record_stack_access(instance, &at[r], bytes, frame, insn_idx); else if (frame == ARG_IMPRECISE) err = record_imprecise(instance, at[r].mask, insn_idx); if (err) return err; } return 0; } /* * For a calls_callback helper, find the callback subprog and determine * which caller register maps to which callback register for FP passthrough. 
*/ static int find_callback_subprog(struct bpf_verifier_env *env, struct bpf_insn *insn, int insn_idx, int *caller_reg, int *callee_reg) { struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; int cb_reg = -1; *caller_reg = -1; *callee_reg = -1; if (!bpf_helper_call(insn)) return -1; switch (insn->imm) { case BPF_FUNC_loop: /* bpf_loop(nr, cb, ctx, flags): cb=R2, R3->cb R2 */ cb_reg = BPF_REG_2; *caller_reg = BPF_REG_3; *callee_reg = BPF_REG_2; break; case BPF_FUNC_for_each_map_elem: /* for_each_map_elem(map, cb, ctx, flags): cb=R2, R3->cb R4 */ cb_reg = BPF_REG_2; *caller_reg = BPF_REG_3; *callee_reg = BPF_REG_4; break; case BPF_FUNC_find_vma: /* find_vma(task, addr, cb, ctx, flags): cb=R3, R4->cb R3 */ cb_reg = BPF_REG_3; *caller_reg = BPF_REG_4; *callee_reg = BPF_REG_3; break; case BPF_FUNC_user_ringbuf_drain: /* user_ringbuf_drain(map, cb, ctx, flags): cb=R2, R3->cb R2 */ cb_reg = BPF_REG_2; *caller_reg = BPF_REG_3; *callee_reg = BPF_REG_2; break; default: return -1; } if (!(aux->const_reg_subprog_mask & BIT(cb_reg))) return -2; return aux->const_reg_vals[cb_reg]; } /* Per-subprog intermediate state kept alive across analysis phases */ struct subprog_at_info { struct arg_track (*at_in)[MAX_BPF_REG]; int len; }; static void print_subprog_arg_access(struct bpf_verifier_env *env, int subprog, struct subprog_at_info *info, struct arg_track (*at_stack_in)[MAX_ARG_SPILL_SLOTS]) { struct bpf_insn *insns = env->prog->insnsi; int start = env->subprog_info[subprog].start; int len = info->len; int i, r; if (!(env->log.level & BPF_LOG_LEVEL2)) return; verbose(env, "%s:\n", fmt_subprog(env, subprog)); for (i = 0; i < len; i++) { int idx = start + i; bool has_extra = false; u8 cls = BPF_CLASS(insns[idx].code); bool is_ldx_stx_call = cls == BPF_LDX || cls == BPF_STX || insns[idx].code == (BPF_JMP | BPF_CALL); verbose(env, "%3d: ", idx); bpf_verbose_insn(env, &insns[idx]); /* Collect what needs printing */ if (is_ldx_stx_call && arg_is_visited(&info->at_in[i][0])) { for (r = 0; r < MAX_BPF_REG - 1; r++) if (arg_is_fp(&info->at_in[i][r])) has_extra = true; } if (is_ldx_stx_call) { for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++) if (arg_is_fp(&at_stack_in[i][r])) has_extra = true; } if (!has_extra) { if (bpf_is_ldimm64(&insns[idx])) i++; continue; } bpf_vlog_reset(&env->log, env->log.end_pos - 1); verbose(env, " //"); if (is_ldx_stx_call && info->at_in && arg_is_visited(&info->at_in[i][0])) { for (r = 0; r < MAX_BPF_REG - 1; r++) { if (!arg_is_fp(&info->at_in[i][r])) continue; verbose(env, " r%d=", r); verbose_arg_track(env, &info->at_in[i][r]); } } if (is_ldx_stx_call) { for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++) { if (!arg_is_fp(&at_stack_in[i][r])) continue; verbose(env, " fp%+d=", -(r + 1) * 8); verbose_arg_track(env, &at_stack_in[i][r]); } } verbose(env, "\n"); if (bpf_is_ldimm64(&insns[idx])) i++; } } /* * Compute arg tracking dataflow for a single subprog. * Runs forward fixed-point with arg_track_xfer(), then records * memory accesses in a single linear pass over converged state. * * @callee_entry: pre-populated entry state for R1-R5 * NULL for main (subprog 0). * @info: stores at_in, len for debug printing. 
*/ static int compute_subprog_args(struct bpf_verifier_env *env, struct subprog_at_info *info, struct arg_track *callee_entry, struct func_instance *instance, u32 *callsites) { int subprog = instance->subprog; struct bpf_insn *insns = env->prog->insnsi; int depth = instance->depth; int start = env->subprog_info[subprog].start; int po_start = env->subprog_info[subprog].postorder_start; int end = env->subprog_info[subprog + 1].start; int po_end = env->subprog_info[subprog + 1].postorder_start; int len = end - start; struct arg_track (*at_in)[MAX_BPF_REG] = NULL; struct arg_track at_out[MAX_BPF_REG]; struct arg_track (*at_stack_in)[MAX_ARG_SPILL_SLOTS] = NULL; struct arg_track *at_stack_out = NULL; struct arg_track unvisited = { .frame = ARG_UNVISITED }; struct arg_track none = { .frame = ARG_NONE }; bool changed; int i, p, r, err = -ENOMEM; at_in = kvmalloc_objs(*at_in, len, GFP_KERNEL_ACCOUNT); if (!at_in) goto err_free; at_stack_in = kvmalloc_objs(*at_stack_in, len, GFP_KERNEL_ACCOUNT); if (!at_stack_in) goto err_free; at_stack_out = kvmalloc_objs(*at_stack_out, MAX_ARG_SPILL_SLOTS, GFP_KERNEL_ACCOUNT); if (!at_stack_out) goto err_free; for (i = 0; i < len; i++) { for (r = 0; r < MAX_BPF_REG; r++) at_in[i][r] = unvisited; for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++) at_stack_in[i][r] = unvisited; } for (r = 0; r < MAX_BPF_REG; r++) at_in[0][r] = none; /* Entry: R10 is always precisely the current frame's FP */ at_in[0][BPF_REG_FP] = arg_single(depth, 0); /* R1-R5: from caller or ARG_NONE for main */ if (callee_entry) { for (r = BPF_REG_1; r <= BPF_REG_5; r++) at_in[0][r] = callee_entry[r]; } /* Entry: all stack slots are ARG_NONE */ for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++) at_stack_in[0][r] = none; if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "subprog#%d: analyzing (depth %d)...\n", subprog, depth); /* Forward fixed-point iteration in reverse post order */ redo: changed = false; for (p = po_end - 1; p >= po_start; p--) { int idx = env->cfg.insn_postorder[p]; int i = idx - start; struct bpf_insn *insn = &insns[idx]; struct bpf_iarray *succ; if (!arg_is_visited(&at_in[i][0]) && !arg_is_visited(&at_in[i][1])) continue; memcpy(at_out, at_in[i], sizeof(at_out)); memcpy(at_stack_out, at_stack_in[i], MAX_ARG_SPILL_SLOTS * sizeof(*at_stack_out)); arg_track_xfer(env, insn, idx, at_out, at_stack_out, instance, callsites); arg_track_log(env, insn, idx, at_in[i], at_stack_in[i], at_out, at_stack_out); /* Propagate to successors within this subprogram */ succ = bpf_insn_successors(env, idx); for (int s = 0; s < succ->cnt; s++) { int target = succ->items[s]; int ti; /* Filter: stay within the subprogram's range */ if (target < start || target >= end) continue; ti = target - start; for (r = 0; r < MAX_BPF_REG; r++) changed |= arg_track_join(env, idx, target, r, &at_in[ti][r], at_out[r]); for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++) changed |= arg_track_join(env, idx, target, -r - 1, &at_stack_in[ti][r], at_stack_out[r]); } } if (changed) goto redo; /* Record memory accesses using converged at_in (RPO skips dead code) */ for (p = po_end - 1; p >= po_start; p--) { int idx = env->cfg.insn_postorder[p]; int i = idx - start; struct bpf_insn *insn = &insns[idx]; err = record_load_store_access(env, instance, at_in[i], idx); if (err) goto err_free; if (insn->code == (BPF_JMP | BPF_CALL)) { err = record_call_access(env, instance, at_in[i], idx); if (err) goto err_free; } if (bpf_pseudo_call(insn) || bpf_calls_callback(env, idx)) { kvfree(env->callsite_at_stack[idx]); env->callsite_at_stack[idx] = 
kvmalloc_objs(*env->callsite_at_stack[idx], MAX_ARG_SPILL_SLOTS, GFP_KERNEL_ACCOUNT); if (!env->callsite_at_stack[idx]) { err = -ENOMEM; goto err_free; } memcpy(env->callsite_at_stack[idx], at_stack_in[i], sizeof(struct arg_track) * MAX_ARG_SPILL_SLOTS); } } info->at_in = at_in; at_in = NULL; info->len = len; print_subprog_arg_access(env, subprog, info, at_stack_in); err = 0; err_free: kvfree(at_stack_out); kvfree(at_stack_in); kvfree(at_in); return err; } /* Return true if any of R1-R5 is derived from a frame pointer. */ static bool has_fp_args(struct arg_track *args) { for (int r = BPF_REG_1; r <= BPF_REG_5; r++) if (args[r].frame != ARG_NONE) return true; return false; } /* * Merge a freshly analyzed instance into the original. * may_read: union (any pass might read the slot). * must_write: intersection (only slots written on ALL passes are guaranteed). * live_before is recomputed by a subsequent update_instance() on @dst. */ static void merge_instances(struct func_instance *dst, struct func_instance *src) { int f, i; for (f = 0; f <= dst->depth; f++) { if (!src->frames[f]) { /* This pass didn't touch frame f — must_write intersects with empty. */ if (dst->frames[f]) for (i = 0; i < dst->insn_cnt; i++) dst->frames[f][i].must_write = SPIS_ZERO; continue; } if (!dst->frames[f]) { /* Previous pass didn't touch frame f — take src, zero must_write. */ dst->frames[f] = src->frames[f]; src->frames[f] = NULL; for (i = 0; i < dst->insn_cnt; i++) dst->frames[f][i].must_write = SPIS_ZERO; continue; } for (i = 0; i < dst->insn_cnt; i++) { dst->frames[f][i].may_read = spis_or(dst->frames[f][i].may_read, src->frames[f][i].may_read); dst->frames[f][i].must_write = spis_and(dst->frames[f][i].must_write, src->frames[f][i].must_write); } } } static struct func_instance *fresh_instance(struct func_instance *src) { struct func_instance *f; f = kvzalloc_obj(*f, GFP_KERNEL_ACCOUNT); if (!f) return ERR_PTR(-ENOMEM); f->callsite = src->callsite; f->depth = src->depth; f->subprog = src->subprog; f->subprog_start = src->subprog_start; f->insn_cnt = src->insn_cnt; return f; } static void free_instance(struct func_instance *instance) { int i; for (i = 0; i <= instance->depth; i++) kvfree(instance->frames[i]); kvfree(instance); } /* * Recursively analyze a subprog with specific 'entry_args'. * Each callee is analyzed with the exact args from its call site. * * Args are recomputed for each call because the dataflow result at_in[] * depends on the entry args and frame depth. Consider: A->C->D and B->C->D * Callsites in A and B pass different args into C, so C is recomputed. * Then within C the same callsite passes different args into D. */ static int analyze_subprog(struct bpf_verifier_env *env, struct arg_track *entry_args, struct subprog_at_info *info, struct func_instance *instance, u32 *callsites) { int subprog = instance->subprog; int depth = instance->depth; struct bpf_insn *insns = env->prog->insnsi; int start = env->subprog_info[subprog].start; int po_start = env->subprog_info[subprog].postorder_start; int po_end = env->subprog_info[subprog + 1].postorder_start; struct func_instance *prev_instance = NULL; int j, err; if (++env->liveness->subprog_calls > 10000) { verbose(env, "liveness analysis exceeded complexity limit (%d calls)\n", env->liveness->subprog_calls); return -E2BIG; } if (need_resched()) cond_resched(); /* * When an instance is reused (must_write_initialized == true), * record into a fresh instance and merge afterward. 
This avoids * stale must_write marks for instructions not reached in this pass. */ if (instance->must_write_initialized) { struct func_instance *fresh = fresh_instance(instance); if (IS_ERR(fresh)) return PTR_ERR(fresh); prev_instance = instance; instance = fresh; } /* Free prior analysis if this subprog was already visited */ kvfree(info[subprog].at_in); info[subprog].at_in = NULL; err = compute_subprog_args(env, &info[subprog], entry_args, instance, callsites); if (err) goto out_free; /* For each reachable call site in the subprog, recurse into callees */ for (int p = po_start; p < po_end; p++) { int idx = env->cfg.insn_postorder[p]; struct arg_track callee_args[BPF_REG_5 + 1]; struct arg_track none = { .frame = ARG_NONE }; struct bpf_insn *insn = &insns[idx]; struct func_instance *callee_instance; int callee, target; int caller_reg, cb_callee_reg; j = idx - start; /* relative index within this subprog */ if (bpf_pseudo_call(insn)) { target = idx + insn->imm + 1; callee = bpf_find_subprog(env, target); if (callee < 0) continue; /* Build entry args: R1-R5 from at_in at call site */ for (int r = BPF_REG_1; r <= BPF_REG_5; r++) callee_args[r] = info[subprog].at_in[j][r]; } else if (bpf_calls_callback(env, idx)) { callee = find_callback_subprog(env, insn, idx, &caller_reg, &cb_callee_reg); if (callee == -2) { /* * same bpf_loop() calls two different callbacks and passes * stack pointer to them */ if (info[subprog].at_in[j][caller_reg].frame == ARG_NONE) continue; for (int f = 0; f <= depth; f++) { err = mark_stack_read(instance, f, idx, SPIS_ALL); if (err) goto out_free; } continue; } if (callee < 0) continue; for (int r = BPF_REG_1; r <= BPF_REG_5; r++) callee_args[r] = none; callee_args[cb_callee_reg] = info[subprog].at_in[j][caller_reg]; } else { continue; } if (!has_fp_args(callee_args)) continue; if (depth == MAX_CALL_FRAMES - 1) { err = -EINVAL; goto out_free; } callee_instance = call_instance(env, instance, idx, callee); if (IS_ERR(callee_instance)) { err = PTR_ERR(callee_instance); goto out_free; } callsites[depth] = idx; err = analyze_subprog(env, callee_args, info, callee_instance, callsites); if (err) goto out_free; /* Pull callee's entry liveness back to caller's callsite */ { u32 callee_start = callee_instance->subprog_start; struct per_frame_masks *entry; for (int f = 0; f < callee_instance->depth; f++) { entry = get_frame_masks(callee_instance, f, callee_start); if (!entry) continue; err = mark_stack_read(instance, f, idx, entry->live_before); if (err) goto out_free; } } } if (prev_instance) { merge_instances(prev_instance, instance); free_instance(instance); instance = prev_instance; } update_instance(env, instance); return 0; out_free: if (prev_instance) free_instance(instance); return err; } int bpf_compute_subprog_arg_access(struct bpf_verifier_env *env) { u32 callsites[MAX_CALL_FRAMES] = {}; int insn_cnt = env->prog->len; struct func_instance *instance; struct subprog_at_info *info; int k, err = 0; info = kvzalloc_objs(*info, env->subprog_cnt, GFP_KERNEL_ACCOUNT); if (!info) return -ENOMEM; env->callsite_at_stack = kvzalloc_objs(*env->callsite_at_stack, insn_cnt, GFP_KERNEL_ACCOUNT); if (!env->callsite_at_stack) { kvfree(info); return -ENOMEM; } instance = call_instance(env, NULL, 0, 0); if (IS_ERR(instance)) { err = PTR_ERR(instance); goto out; } err = analyze_subprog(env, NULL, info, instance, callsites); if (err) goto out; /* * Subprogs and callbacks that don't receive FP-derived arguments * cannot access ancestor stack frames, so they were skipped during * the 
recursive walk above. Async callbacks (timer, workqueue) are * also not reachable from the main program's call graph. Analyze * all unvisited subprogs as independent roots at depth 0. * * Use reverse topological order (callers before callees) so that * each subprog is analyzed before its callees, allowing the * recursive walk inside analyze_subprog() to naturally * reach nested callees that also lack FP-derived args. */ for (k = env->subprog_cnt - 1; k >= 0; k--) { int sub = env->subprog_topo_order[k]; if (info[sub].at_in && !bpf_subprog_is_global(env, sub)) continue; instance = call_instance(env, NULL, 0, sub); if (IS_ERR(instance)) { err = PTR_ERR(instance); goto out; } err = analyze_subprog(env, NULL, info, instance, callsites); if (err) goto out; } if (env->log.level & BPF_LOG_LEVEL2) err = print_instances(env); out: for (k = 0; k < insn_cnt; k++) kvfree(env->callsite_at_stack[k]); kvfree(env->callsite_at_stack); env->callsite_at_stack = NULL; for (k = 0; k < env->subprog_cnt; k++) kvfree(info[k].at_in); kvfree(info); return err; } /* Each field is a register bitmask */ struct insn_live_regs { u16 use; /* registers read by instruction */ u16 def; /* registers written by instruction */ u16 in; /* registers that may be alive before instruction */ u16 out; /* registers that may be alive after instruction */ }; /* Bitmask with 1s for all caller saved registers */ #define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1) /* Compute info->{use,def} fields for the instruction */ static void compute_insn_live_regs(struct bpf_verifier_env *env, struct bpf_insn *insn, struct insn_live_regs *info) { struct bpf_call_summary cs; u8 class = BPF_CLASS(insn->code); u8 code = BPF_OP(insn->code); u8 mode = BPF_MODE(insn->code); u16 src = BIT(insn->src_reg); u16 dst = BIT(insn->dst_reg); u16 r0 = BIT(0); u16 def = 0; u16 use = 0xffff; switch (class) { case BPF_LD: switch (mode) { case BPF_IMM: if (BPF_SIZE(insn->code) == BPF_DW) { def = dst; use = 0; } break; case BPF_LD | BPF_ABS: case BPF_LD | BPF_IND: /* stick with defaults */ break; } break; case BPF_LDX: switch (mode) { case BPF_MEM: case BPF_MEMSX: def = dst; use = src; break; } break; case BPF_ST: switch (mode) { case BPF_MEM: def = 0; use = dst; break; } break; case BPF_STX: switch (mode) { case BPF_MEM: def = 0; use = dst | src; break; case BPF_ATOMIC: switch (insn->imm) { case BPF_CMPXCHG: use = r0 | dst | src; def = r0; break; case BPF_LOAD_ACQ: def = dst; use = src; break; case BPF_STORE_REL: def = 0; use = dst | src; break; default: use = dst | src; if (insn->imm & BPF_FETCH) def = src; else def = 0; } break; } break; case BPF_ALU: case BPF_ALU64: switch (code) { case BPF_END: use = dst; def = dst; break; case BPF_MOV: def = dst; if (BPF_SRC(insn->code) == BPF_K) use = 0; else use = src; break; default: def = dst; if (BPF_SRC(insn->code) == BPF_K) use = dst; else use = dst | src; } break; case BPF_JMP: case BPF_JMP32: switch (code) { case BPF_JA: def = 0; if (BPF_SRC(insn->code) == BPF_X) use = dst; else use = 0; break; case BPF_JCOND: def = 0; use = 0; break; case BPF_EXIT: def = 0; use = r0; break; case BPF_CALL: def = ALL_CALLER_SAVED_REGS; use = def & ~BIT(BPF_REG_0); if (bpf_get_call_summary(env, insn, &cs)) use = GENMASK(cs.num_params, 1); break; default: def = 0; if (BPF_SRC(insn->code) == BPF_K) use = dst; else use = dst | src; } break; } info->def = def; info->use = use; } /* Compute may-live registers after each instruction in the program. 
* The register is live after the instruction I if it is read by some * instruction S following I during program execution and is not * overwritten between I and S. * * Store result in env->insn_aux_data[i].live_regs. */ int bpf_compute_live_registers(struct bpf_verifier_env *env) { struct bpf_insn_aux_data *insn_aux = env->insn_aux_data; struct bpf_insn *insns = env->prog->insnsi; struct insn_live_regs *state; int insn_cnt = env->prog->len; int err = 0, i, j; bool changed; /* Use the following algorithm: * - define the following: * - I.use : a set of all registers read by instruction I; * - I.def : a set of all registers written by instruction I; * - I.in : a set of all registers that may be alive before I execution; * - I.out : a set of all registers that may be alive after I execution; * - insn_successors(I): a set of instructions S that might immediately * follow I for some program execution; * - associate separate empty sets 'I.in' and 'I.out' with each instruction; * - visit each instruction in a postorder and update * state[i].in, state[i].out as follows: * * state[i].out = U [state[s].in for S in insn_successors(i)] * state[i].in = (state[i].out / state[i].def) U state[i].use * * (where U stands for set union, / stands for set difference) * - repeat the computation while {in,out} fields changes for * any instruction. */ state = kvzalloc_objs(*state, insn_cnt, GFP_KERNEL_ACCOUNT); if (!state) { err = -ENOMEM; goto out; } for (i = 0; i < insn_cnt; ++i) compute_insn_live_regs(env, &insns[i], &state[i]); /* Forward pass: resolve stack access through FP-derived pointers */ err = bpf_compute_subprog_arg_access(env); if (err) goto out; changed = true; while (changed) { changed = false; for (i = 0; i < env->cfg.cur_postorder; ++i) { int insn_idx = env->cfg.insn_postorder[i]; struct insn_live_regs *live = &state[insn_idx]; struct bpf_iarray *succ; u16 new_out = 0; u16 new_in = 0; succ = bpf_insn_successors(env, insn_idx); for (int s = 0; s < succ->cnt; ++s) new_out |= state[succ->items[s]].in; new_in = (new_out & ~live->def) | live->use; if (new_out != live->out || new_in != live->in) { live->in = new_in; live->out = new_out; changed = true; } } } for (i = 0; i < insn_cnt; ++i) insn_aux[i].live_regs_before = state[i].in; if (env->log.level & BPF_LOG_LEVEL2) { verbose(env, "Live regs before insn:\n"); for (i = 0; i < insn_cnt; ++i) { if (env->insn_aux_data[i].scc) verbose(env, "%3d ", env->insn_aux_data[i].scc); else verbose(env, " "); verbose(env, "%3d: ", i); for (j = BPF_REG_0; j < BPF_REG_10; ++j) if (insn_aux[i].live_regs_before & BIT(j)) verbose(env, "%d", j); else verbose(env, "."); verbose(env, " "); bpf_verbose_insn(env, &insns[i]); if (bpf_is_ldimm64(&insns[i])) i++; } } out: kvfree(state); return err; } |
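/*
 * Worked example of the equations above (illustrative only): for the
 * straight-line sequence
 *
 *   0: r1 = r6        use = {r6}, def = {r1}
 *   1: r0 = r1        use = {r1}, def = {r0}
 *   2: exit           use = {r0}, def = {}
 *
 * the fixed point is
 *
 *   in[2] = {r0}
 *   in[1] = (in[2] / {r0}) U {r1} = {r1}
 *   in[0] = (in[1] / {r1}) U {r6} = {r6}
 *
 * so live_regs_before is {r6} for insn 0, {r1} for insn 1 and {r0} for
 * insn 2.
 */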
| 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 | // SPDX-License-Identifier: GPL-2.0-only /* * klist.c - Routines for manipulating klists. * * Copyright (C) 2005 Patrick Mochel * * This klist interface provides a couple of structures that wrap around * struct list_head to provide explicit list "head" (struct klist) and list * "node" (struct klist_node) objects. For struct klist, a spinlock is * included that protects access to the actual list itself. struct * klist_node provides a pointer to the klist that owns it and a kref * reference count that indicates the number of current users of that node * in the list. * * The entire point is to provide an interface for iterating over a list * that is safe and allows for modification of the list during the * iteration (e.g. insertion and removal), including modification of the * current node on the list. * * It works using a 3rd object type - struct klist_iter - that is declared * and initialized before an iteration. klist_next() is used to acquire the * next element in the list. It returns NULL if there are no more items. * Internally, that routine takes the klist's lock, decrements the * reference count of the previous klist_node and increments the count of * the next klist_node. It then drops the lock and returns. * * There are primitives for adding and removing nodes to/from a klist. * When deleting, klist_del() will simply decrement the reference count. * Only when the count goes to 0 is the node removed from the list. * klist_remove() will try to delete the node from the list and block until * it is actually removed. This is useful for objects (like devices) that * have been removed from the system and must be freed (but must wait until * all accessors have finished). */ #include <linux/klist.h> #include <linux/export.h> #include <linux/sched.h> /* * Use the lowest bit of n_klist to mark deleted nodes and exclude * dead ones from iteration. 
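 *
 * This works because a struct klist is at least pointer-aligned, so bit 0
 * of a valid n_klist pointer is always clear.  For example (address made
 * up for illustration), knode_kill() turns n_klist == 0xffff888004ce1a00
 * into 0xffff888004ce1a01; knode_klist() masks the bit back off and
 * knode_dead() tests it.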
*/ #define KNODE_DEAD 1LU #define KNODE_KLIST_MASK ~KNODE_DEAD static struct klist *knode_klist(struct klist_node *knode) { return (struct klist *) ((unsigned long)knode->n_klist & KNODE_KLIST_MASK); } static bool knode_dead(struct klist_node *knode) { return (unsigned long)knode->n_klist & KNODE_DEAD; } static void knode_set_klist(struct klist_node *knode, struct klist *klist) { knode->n_klist = klist; /* no knode deserves to start its life dead */ WARN_ON(knode_dead(knode)); } static void knode_kill(struct klist_node *knode) { /* and no knode should die twice ever either, see we're very humane */ WARN_ON(knode_dead(knode)); *(unsigned long *)&knode->n_klist |= KNODE_DEAD; } /** * klist_init - Initialize a klist structure. * @k: The klist we're initializing. * @get: The get function for the embedding object (NULL if none) * @put: The put function for the embedding object (NULL if none) * * Initialises the klist structure. If the klist_node structures are * going to be embedded in refcounted objects (necessary for safe * deletion) then the get/put arguments are used to initialise * functions that take and release references on the embedding * objects. */ void klist_init(struct klist *k, void (*get)(struct klist_node *), void (*put)(struct klist_node *)) { INIT_LIST_HEAD(&k->k_list); spin_lock_init(&k->k_lock); k->get = get; k->put = put; } EXPORT_SYMBOL_GPL(klist_init); static void add_head(struct klist *k, struct klist_node *n) { spin_lock(&k->k_lock); list_add(&n->n_node, &k->k_list); spin_unlock(&k->k_lock); } static void add_tail(struct klist *k, struct klist_node *n) { spin_lock(&k->k_lock); list_add_tail(&n->n_node, &k->k_list); spin_unlock(&k->k_lock); } static void klist_node_init(struct klist *k, struct klist_node *n) { INIT_LIST_HEAD(&n->n_node); kref_init(&n->n_ref); knode_set_klist(n, k); if (k->get) k->get(n); } /** * klist_add_head - Initialize a klist_node and add it to front. * @n: node we're adding. * @k: klist it's going on. */ void klist_add_head(struct klist_node *n, struct klist *k) { klist_node_init(k, n); add_head(k, n); } EXPORT_SYMBOL_GPL(klist_add_head); /** * klist_add_tail - Initialize a klist_node and add it to back. * @n: node we're adding. * @k: klist it's going on. */ void klist_add_tail(struct klist_node *n, struct klist *k) { klist_node_init(k, n); add_tail(k, n); } EXPORT_SYMBOL_GPL(klist_add_tail); /** * klist_add_behind - Init a klist_node and add it after an existing node * @n: node we're adding. * @pos: node to put @n after */ void klist_add_behind(struct klist_node *n, struct klist_node *pos) { struct klist *k = knode_klist(pos); klist_node_init(k, n); spin_lock(&k->k_lock); list_add(&n->n_node, &pos->n_node); spin_unlock(&k->k_lock); } EXPORT_SYMBOL_GPL(klist_add_behind); /** * klist_add_before - Init a klist_node and add it before an existing node * @n: node we're adding. 
* @pos: node to put @n after */ void klist_add_before(struct klist_node *n, struct klist_node *pos) { struct klist *k = knode_klist(pos); klist_node_init(k, n); spin_lock(&k->k_lock); list_add_tail(&n->n_node, &pos->n_node); spin_unlock(&k->k_lock); } EXPORT_SYMBOL_GPL(klist_add_before); struct klist_waiter { struct list_head list; struct klist_node *node; struct task_struct *process; int woken; }; static DEFINE_SPINLOCK(klist_remove_lock); static LIST_HEAD(klist_remove_waiters); static void klist_release(struct kref *kref) { struct klist_waiter *waiter, *tmp; struct klist_node *n = container_of(kref, struct klist_node, n_ref); WARN_ON(!knode_dead(n)); list_del(&n->n_node); spin_lock(&klist_remove_lock); list_for_each_entry_safe(waiter, tmp, &klist_remove_waiters, list) { if (waiter->node != n) continue; list_del(&waiter->list); waiter->woken = 1; mb(); wake_up_process(waiter->process); } spin_unlock(&klist_remove_lock); knode_set_klist(n, NULL); } static int klist_dec_and_del(struct klist_node *n) { return kref_put(&n->n_ref, klist_release); } static void klist_put(struct klist_node *n, bool kill) { struct klist *k = knode_klist(n); void (*put)(struct klist_node *) = k->put; spin_lock(&k->k_lock); if (kill) knode_kill(n); if (!klist_dec_and_del(n)) put = NULL; spin_unlock(&k->k_lock); if (put) put(n); } /** * klist_del - Decrement the reference count of node and try to remove. * @n: node we're deleting. */ void klist_del(struct klist_node *n) { klist_put(n, true); } EXPORT_SYMBOL_GPL(klist_del); /** * klist_remove - Decrement the refcount of node and wait for it to go away. * @n: node we're removing. */ void klist_remove(struct klist_node *n) { struct klist_waiter waiter; waiter.node = n; waiter.process = current; waiter.woken = 0; spin_lock(&klist_remove_lock); list_add(&waiter.list, &klist_remove_waiters); spin_unlock(&klist_remove_lock); klist_del(n); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (waiter.woken) break; schedule(); } __set_current_state(TASK_RUNNING); } EXPORT_SYMBOL_GPL(klist_remove); /** * klist_node_attached - Say whether a node is bound to a list or not. * @n: Node that we're testing. */ int klist_node_attached(struct klist_node *n) { return (n->n_klist != NULL); } EXPORT_SYMBOL_GPL(klist_node_attached); /** * klist_iter_init_node - Initialize a klist_iter structure. * @k: klist we're iterating. * @i: klist_iter we're filling. * @n: node to start with. * * Similar to klist_iter_init(), but starts the action off with @n, * instead of with the list head. */ void klist_iter_init_node(struct klist *k, struct klist_iter *i, struct klist_node *n) { i->i_klist = k; i->i_cur = NULL; if (n && kref_get_unless_zero(&n->n_ref)) i->i_cur = n; } EXPORT_SYMBOL_GPL(klist_iter_init_node); /** * klist_iter_init - Iniitalize a klist_iter structure. * @k: klist we're iterating. * @i: klist_iter structure we're filling. * * Similar to klist_iter_init_node(), but start with the list head. */ void klist_iter_init(struct klist *k, struct klist_iter *i) { klist_iter_init_node(k, i, NULL); } EXPORT_SYMBOL_GPL(klist_iter_init); /** * klist_iter_exit - Finish a list iteration. * @i: Iterator structure. * * Must be called when done iterating over list, as it decrements the * refcount of the current node. Necessary in case iteration exited before * the end of the list was reached, and always good form. 
*/ void klist_iter_exit(struct klist_iter *i) { if (i->i_cur) { klist_put(i->i_cur, false); i->i_cur = NULL; } } EXPORT_SYMBOL_GPL(klist_iter_exit); static struct klist_node *to_klist_node(struct list_head *n) { return container_of(n, struct klist_node, n_node); } /** * klist_prev - Ante up prev node in list. * @i: Iterator structure. * * First grab list lock. Decrement the reference count of the previous * node, if there was one. Grab the prev node, increment its reference * count, drop the lock, and return that prev node. */ struct klist_node *klist_prev(struct klist_iter *i) { void (*put)(struct klist_node *) = i->i_klist->put; struct klist_node *last = i->i_cur; struct klist_node *prev; unsigned long flags; spin_lock_irqsave(&i->i_klist->k_lock, flags); if (last) { prev = to_klist_node(last->n_node.prev); if (!klist_dec_and_del(last)) put = NULL; } else prev = to_klist_node(i->i_klist->k_list.prev); i->i_cur = NULL; while (prev != to_klist_node(&i->i_klist->k_list)) { if (likely(!knode_dead(prev))) { kref_get(&prev->n_ref); i->i_cur = prev; break; } prev = to_klist_node(prev->n_node.prev); } spin_unlock_irqrestore(&i->i_klist->k_lock, flags); if (put && last) put(last); return i->i_cur; } EXPORT_SYMBOL_GPL(klist_prev); /** * klist_next - Ante up next node in list. * @i: Iterator structure. * * First grab list lock. Decrement the reference count of the previous * node, if there was one. Grab the next node, increment its reference * count, drop the lock, and return that next node. */ struct klist_node *klist_next(struct klist_iter *i) { void (*put)(struct klist_node *) = i->i_klist->put; struct klist_node *last = i->i_cur; struct klist_node *next; unsigned long flags; spin_lock_irqsave(&i->i_klist->k_lock, flags); if (last) { next = to_klist_node(last->n_node.next); if (!klist_dec_and_del(last)) put = NULL; } else next = to_klist_node(i->i_klist->k_list.next); i->i_cur = NULL; while (next != to_klist_node(&i->i_klist->k_list)) { if (likely(!knode_dead(next))) { kref_get(&next->n_ref); i->i_cur = next; break; } next = to_klist_node(next->n_node.next); } spin_unlock_irqrestore(&i->i_klist->k_lock, flags); if (put && last) put(last); return i->i_cur; } EXPORT_SYMBOL_GPL(klist_next); |
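/*
 * Minimal usage sketch (not part of this file; my_obj and walk_my_list are
 * made-up names): iterate a klist while the iterator pins the current node,
 * so a concurrent klist_del() of that node cannot release it under us.
 */
#include <linux/klist.h>
#include <linux/kernel.h>
#include <linux/printk.h>

struct my_obj {
	struct klist_node knode;
	int data;
};

static void walk_my_list(struct klist *list)
{
	struct klist_iter iter;
	struct klist_node *kn;

	klist_iter_init(list, &iter);
	while ((kn = klist_next(&iter))) {
		struct my_obj *obj = container_of(kn, struct my_obj, knode);

		pr_info("data=%d\n", obj->data);
	}
	/* Needed if the loop is exited early; harmless after a full pass. */
	klist_iter_exit(&iter);
}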
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __SHMEM_FS_H #define __SHMEM_FS_H #include <linux/file.h> #include <linux/swap.h> #include <linux/mempolicy.h> #include <linux/pagemap.h> #include <linux/percpu_counter.h> #include <linux/xattr.h> #include <linux/fs_parser.h> #include <linux/userfaultfd_k.h> #include <linux/bits.h> struct swap_iocb; /* inode in-kernel data */ #ifdef CONFIG_TMPFS_QUOTA #define SHMEM_MAXQUOTAS 2 #endif /* Suppress pre-accounting of the entire object size. */ #define SHMEM_F_NORESERVE BIT(0) /* Disallow swapping. */ #define SHMEM_F_LOCKED BIT(1) /* * Disallow growing, shrinking, or hole punching in the inode. Combined with * folio pinning, makes sure the inode's mapping stays fixed. * * In some ways similar to F_SEAL_GROW | F_SEAL_SHRINK, but can be removed and * isn't directly visible to userspace. */ #define SHMEM_F_MAPPING_FROZEN BIT(2) struct shmem_inode_info { spinlock_t lock; unsigned int seals; /* shmem seals */ unsigned long flags; unsigned long alloced; /* data pages alloced to file */ unsigned long swapped; /* subtotal assigned to swap */ union { struct offset_ctx dir_offsets; /* stable directory offsets */ struct { struct list_head shrinklist; /* shrinkable hpage inodes */ struct list_head swaplist; /* chain of maybes on swap */ }; }; struct timespec64 i_crtime; /* file creation time */ struct shared_policy policy; /* NUMA memory alloc policy */ struct simple_xattrs *xattrs; /* list of xattrs */ pgoff_t fallocend; /* highest fallocate endindex */ unsigned int fsflags; /* for FS_IOC_[SG]ETFLAGS */ atomic_t stop_eviction; /* hold when working on inode */ #ifdef CONFIG_TMPFS_QUOTA struct dquot __rcu *i_dquot[MAXQUOTAS]; #endif struct inode vfs_inode; }; #define SHMEM_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | FS_CASEFOLD_FL) #define SHMEM_FL_USER_MODIFIABLE \ (FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL | FS_CASEFOLD_FL) #define SHMEM_FL_INHERITED (FS_NODUMP_FL | FS_NOATIME_FL | FS_CASEFOLD_FL) struct shmem_quota_limits { qsize_t usrquota_bhardlimit; /* Default user quota block hard limit */ qsize_t usrquota_ihardlimit; /* Default user quota inode hard limit */ qsize_t grpquota_bhardlimit; /* Default group quota block hard limit */ qsize_t grpquota_ihardlimit; /* Default group quota inode hard limit */ }; struct shmem_sb_info { unsigned long max_blocks; /* How many blocks are allowed */ struct percpu_counter used_blocks; /* How many are allocated */ unsigned long max_inodes; /* How many inodes are allowed */ unsigned long free_ispace; /* How much ispace left for allocation */ raw_spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ umode_t mode; /* Mount mode for root directory */ unsigned char huge; /* 
Whether to try for hugepages */ kuid_t uid; /* Mount uid for root directory */ kgid_t gid; /* Mount gid for root directory */ bool full_inums; /* If i_ino should be uint or ino_t */ bool noswap; /* ignores VM reclaim / swap requests */ ino_t next_ino; /* The next per-sb inode number to use */ ino_t __percpu *ino_batch; /* The next per-cpu inode number to use */ struct mempolicy *mpol; /* default memory policy for mappings */ spinlock_t shrinklist_lock; /* Protects shrinklist */ struct list_head shrinklist; /* List of shinkable inodes */ unsigned long shrinklist_len; /* Length of shrinklist */ struct shmem_quota_limits qlimits; /* Default quota limits */ }; static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) { return container_of(inode, struct shmem_inode_info, vfs_inode); } /* * Functions in mm/shmem.c called directly from elsewhere: */ extern const struct fs_parameter_spec shmem_fs_parameters[]; extern void shmem_init(void); extern int shmem_init_fs_context(struct fs_context *fc); struct file *shmem_file_setup(const char *name, loff_t size, vma_flags_t flags); struct file *shmem_kernel_file_setup(const char *name, loff_t size, vma_flags_t vma_flags); extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, loff_t size, vma_flags_t flags); int shmem_zero_setup(struct vm_area_struct *vma); int shmem_zero_setup_desc(struct vm_area_desc *desc); extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts); #ifdef CONFIG_SHMEM bool shmem_mapping(const struct address_space *mapping); #else static inline bool shmem_mapping(const struct address_space *mapping) { return false; } #endif /* CONFIG_SHMEM */ void shmem_unlock_mapping(struct address_space *mapping); struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); int shmem_writeout(struct folio *folio, struct swap_iocb **plug, struct list_head *folio_list); void shmem_truncate_range(struct inode *inode, loff_t start, uoff_t end); int shmem_unuse(unsigned int type); #ifdef CONFIG_TRANSPARENT_HUGEPAGE unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, loff_t write_end, bool shmem_huge_force); bool shmem_hpage_pmd_enabled(void); #else static inline unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, loff_t write_end, bool shmem_huge_force) { return 0; } static inline bool shmem_hpage_pmd_enabled(void) { return false; } #endif #ifdef CONFIG_SHMEM extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern void shmem_uncharge(struct inode *inode, long pages); #else static inline unsigned long shmem_swap_usage(struct vm_area_struct *vma) { return 0; } static inline void shmem_uncharge(struct inode *inode, long pages) { } #endif extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); /* Flag allocation requirements to shmem_get_folio */ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_NOALLOC, /* similar, but fail on hole or use fallocated page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end, 
struct folio **foliop, enum sgp_type sgp); struct folio *shmem_read_folio_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp); static inline struct folio *shmem_read_folio(struct address_space *mapping, pgoff_t index) { return shmem_read_folio_gfp(mapping, index, mapping_gfp_mask(mapping)); } static inline struct page *shmem_read_mapping_page( struct address_space *mapping, pgoff_t index) { return shmem_read_mapping_page_gfp(mapping, index, mapping_gfp_mask(mapping)); } static inline bool shmem_file(struct file *file) { if (!IS_ENABLED(CONFIG_SHMEM)) return false; if (!file || !file->f_mapping) return false; return shmem_mapping(file->f_mapping); } /* Must be called with inode lock taken exclusive. */ static inline void shmem_freeze(struct inode *inode, bool freeze) { if (freeze) SHMEM_I(inode)->flags |= SHMEM_F_MAPPING_FROZEN; else SHMEM_I(inode)->flags &= ~SHMEM_F_MAPPING_FROZEN; } /* * If fallocate(FALLOC_FL_KEEP_SIZE) has been used, there may be pages * beyond i_size's notion of EOF, which fallocate has committed to reserving: * which split_huge_page() must therefore not delete. This use of a single * "fallocend" per inode errs on the side of not deleting a reservation when * in doubt: there are plenty of cases when it preserves unreserved pages. */ static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof) { return max(eof, SHMEM_I(inode)->fallocend); } extern bool shmem_charge(struct inode *inode, long pages); /* * Used space is stored as unsigned 64-bit value in bytes but * quota core supports only signed 64-bit values so use that * as a limit */ #define SHMEM_QUOTA_MAX_SPC_LIMIT 0x7fffffffffffffffLL /* 2^63-1 */ #define SHMEM_QUOTA_MAX_INO_LIMIT 0x7fffffffffffffffLL #ifdef CONFIG_TMPFS_QUOTA extern const struct dquot_operations shmem_quota_operations; extern struct quota_format_type shmem_quota_format; #endif /* CONFIG_TMPFS_QUOTA */ #endif |
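/*
 * Usage sketch (not part of this header; example_shmem_page is a made-up
 * helper): create an in-kernel tmpfs-backed file and fetch its first page
 * via the helpers declared above.  Error handling is abbreviated and the
 * caller is expected to eventually put_page() the result.
 */
#include <linux/shmem_fs.h>
#include <linux/err.h>
#include <linux/file.h>

static struct page *example_shmem_page(void)
{
	struct file *file;
	struct page *page;

	file = shmem_kernel_file_setup("example", PAGE_SIZE, 0);
	if (IS_ERR(file))
		return ERR_CAST(file);

	/* Looks up index 0, allocating a page for the hole if needed. */
	page = shmem_read_mapping_page(file->f_mapping, 0);
	fput(file);
	return page;
}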
| 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 | // SPDX-License-Identifier: GPL-2.0-only /* * drm_sysfs.c - Modifications to drm_sysfs_class.c to support * extra sysfs attribute from DRM. Normal drm_sysfs_class * does not allow adding attributes. * * Copyright (c) 2004 Jon Smirl <jonsmirl@gmail.com> * Copyright (c) 2003-2004 Greg Kroah-Hartman <greg@kroah.com> * Copyright (c) 2003-2004 IBM Corp. */ #include <linux/acpi.h> #include <linux/component.h> #include <linux/device.h> #include <linux/err.h> #include <linux/export.h> #include <linux/gfp.h> #include <linux/i2c.h> #include <linux/kdev_t.h> #include <linux/pci.h> #include <linux/property.h> #include <linux/slab.h> #include <drm/drm_accel.h> #include <drm/drm_connector.h> #include <drm/drm_device.h> #include <drm/drm_file.h> #include <drm/drm_modes.h> #include <drm/drm_print.h> #include <drm/drm_property.h> #include <drm/drm_sysfs.h> #include <asm/video.h> #include "drm_internal.h" #include "drm_crtc_internal.h" #define to_drm_minor(d) dev_get_drvdata(d) #define to_drm_connector(d) dev_get_drvdata(d) /** * DOC: overview * * DRM provides very little additional support to drivers for sysfs * interactions, beyond just all the standard stuff. 
Drivers who want to expose * additional sysfs properties and property groups can attach them at either * &drm_device.dev or &drm_connector.kdev. * * Registration is automatically handled when calling drm_dev_register(), or * drm_connector_register() in case of hot-plugged connectors. Unregistration is * also automatically handled by drm_dev_unregister() and * drm_connector_unregister(). */ static struct device_type drm_sysfs_device_minor = { .name = "drm_minor" }; static struct device_type drm_sysfs_device_connector = { .name = "drm_connector", }; struct class *drm_class; #ifdef CONFIG_ACPI static bool drm_connector_acpi_bus_match(struct device *dev) { return dev->type == &drm_sysfs_device_connector; } static struct acpi_device *drm_connector_acpi_find_companion(struct device *dev) { struct drm_connector *connector = to_drm_connector(dev); return to_acpi_device_node(connector->fwnode); } static struct acpi_bus_type drm_connector_acpi_bus = { .name = "drm_connector", .match = drm_connector_acpi_bus_match, .find_companion = drm_connector_acpi_find_companion, }; static void drm_sysfs_acpi_register(void) { register_acpi_bus_type(&drm_connector_acpi_bus); } static void drm_sysfs_acpi_unregister(void) { unregister_acpi_bus_type(&drm_connector_acpi_bus); } #else static void drm_sysfs_acpi_register(void) { } static void drm_sysfs_acpi_unregister(void) { } #endif static char *drm_devnode(const struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "dri/%s", dev_name(dev)); } static int typec_connector_bind(struct device *dev, struct device *typec_connector, void *data) { int ret; ret = sysfs_create_link(&dev->kobj, &typec_connector->kobj, "typec_connector"); if (ret) return ret; ret = sysfs_create_link(&typec_connector->kobj, &dev->kobj, "drm_connector"); if (ret) sysfs_remove_link(&dev->kobj, "typec_connector"); return ret; } static void typec_connector_unbind(struct device *dev, struct device *typec_connector, void *data) { sysfs_remove_link(&typec_connector->kobj, "drm_connector"); sysfs_remove_link(&dev->kobj, "typec_connector"); } static const struct component_ops typec_connector_ops = { .bind = typec_connector_bind, .unbind = typec_connector_unbind, }; static CLASS_ATTR_STRING(version, S_IRUGO, "drm 1.1.0 20060810"); /** * drm_sysfs_init - initialize sysfs helpers * * This is used to create the DRM class, which is the implicit parent of any * other top-level DRM sysfs objects. * * You must call drm_sysfs_destroy() to release the allocated resources. * * Return: 0 on success, negative error code on failure. */ int drm_sysfs_init(void) { int err; drm_class = class_create("drm"); if (IS_ERR(drm_class)) return PTR_ERR(drm_class); err = class_create_file(drm_class, &class_attr_version.attr); if (err) { class_destroy(drm_class); drm_class = NULL; return err; } drm_class->devnode = drm_devnode; drm_sysfs_acpi_register(); return 0; } /** * drm_sysfs_destroy - destroys DRM class * * Destroy the DRM device class. 
*/ void drm_sysfs_destroy(void) { if (IS_ERR_OR_NULL(drm_class)) return; drm_sysfs_acpi_unregister(); class_remove_file(drm_class, &class_attr_version.attr); class_destroy(drm_class); drm_class = NULL; } static void drm_sysfs_release(struct device *dev) { kfree(dev); } /* * Connector properties */ static ssize_t status_store(struct device *device, struct device_attribute *attr, const char *buf, size_t count) { struct drm_connector *connector = to_drm_connector(device); struct drm_device *dev = connector->dev; enum drm_connector_force old_force; int ret; ret = mutex_lock_interruptible(&dev->mode_config.mutex); if (ret) return ret; old_force = connector->force; if (sysfs_streq(buf, "detect")) connector->force = 0; else if (sysfs_streq(buf, "on")) connector->force = DRM_FORCE_ON; else if (sysfs_streq(buf, "on-digital")) connector->force = DRM_FORCE_ON_DIGITAL; else if (sysfs_streq(buf, "off")) connector->force = DRM_FORCE_OFF; else ret = -EINVAL; if (old_force != connector->force || !connector->force) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] force updated from %d to %d or reprobing\n", connector->base.id, connector->name, old_force, connector->force); connector->funcs->fill_modes(connector, dev->mode_config.max_width, dev->mode_config.max_height); } mutex_unlock(&dev->mode_config.mutex); return ret ? ret : count; } static ssize_t status_show(struct device *device, struct device_attribute *attr, char *buf) { struct drm_connector *connector = to_drm_connector(device); enum drm_connector_status status; status = READ_ONCE(connector->status); return sysfs_emit(buf, "%s\n", drm_get_connector_status_name(status)); } static ssize_t dpms_show(struct device *device, struct device_attribute *attr, char *buf) { struct drm_connector *connector = to_drm_connector(device); int dpms; dpms = READ_ONCE(connector->dpms); return sysfs_emit(buf, "%s\n", drm_get_dpms_name(dpms)); } static ssize_t enabled_show(struct device *device, struct device_attribute *attr, char *buf) { struct drm_connector *connector = to_drm_connector(device); bool enabled; enabled = READ_ONCE(connector->encoder); return sysfs_emit(buf, enabled ? 
"enabled\n" : "disabled\n"); } static ssize_t edid_show(struct file *filp, struct kobject *kobj, const struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct device *connector_dev = kobj_to_dev(kobj); struct drm_connector *connector = to_drm_connector(connector_dev); ssize_t ret; ret = drm_edid_connector_property_show(connector, buf, off, count); return ret; } static ssize_t modes_show(struct device *device, struct device_attribute *attr, char *buf) { struct drm_connector *connector = to_drm_connector(device); struct drm_display_mode *mode; int written = 0; mutex_lock(&connector->dev->mode_config.mutex); list_for_each_entry(mode, &connector->modes, head) { written += scnprintf(buf + written, PAGE_SIZE - written, "%s\n", mode->name); } mutex_unlock(&connector->dev->mode_config.mutex); return written; } static ssize_t connector_id_show(struct device *device, struct device_attribute *attr, char *buf) { struct drm_connector *connector = to_drm_connector(device); return sysfs_emit(buf, "%d\n", connector->base.id); } static DEVICE_ATTR_RW(status); static DEVICE_ATTR_RO(enabled); static DEVICE_ATTR_RO(dpms); static DEVICE_ATTR_RO(modes); static DEVICE_ATTR_RO(connector_id); static struct attribute *connector_dev_attrs[] = { &dev_attr_status.attr, &dev_attr_enabled.attr, &dev_attr_dpms.attr, &dev_attr_modes.attr, &dev_attr_connector_id.attr, NULL }; static const struct bin_attribute edid_attr = { .attr.name = "edid", .attr.mode = 0444, .size = 0, .read = edid_show, }; static const struct bin_attribute *const connector_bin_attrs[] = { &edid_attr, NULL }; static const struct attribute_group connector_dev_group = { .attrs = connector_dev_attrs, .bin_attrs = connector_bin_attrs, }; static const struct attribute_group *connector_dev_groups[] = { &connector_dev_group, NULL }; int drm_sysfs_connector_add(struct drm_connector *connector) { struct drm_device *dev = connector->dev; struct device *kdev; int r; if (connector->kdev) return 0; kdev = kzalloc_obj(*kdev); if (!kdev) return -ENOMEM; device_initialize(kdev); kdev->class = drm_class; kdev->type = &drm_sysfs_device_connector; kdev->parent = dev->primary->kdev; kdev->groups = connector_dev_groups; kdev->release = drm_sysfs_release; dev_set_drvdata(kdev, connector); r = dev_set_name(kdev, "card%d-%s", dev->primary->index, connector->name); if (r) goto err_free; drm_dbg_kms(dev, "[CONNECTOR:%d:%s] adding connector to sysfs\n", connector->base.id, connector->name); r = device_add(kdev); if (r) { drm_err(dev, "failed to register connector device: %d\n", r); goto err_free; } connector->kdev = kdev; if (dev_fwnode(kdev)) { r = component_add(kdev, &typec_connector_ops); if (r) drm_err(dev, "failed to add component to create link to typec connector\n"); } return 0; err_free: put_device(kdev); return r; } int drm_sysfs_connector_add_late(struct drm_connector *connector) { if (connector->ddc) return sysfs_create_link(&connector->kdev->kobj, &connector->ddc->dev.kobj, "ddc"); return 0; } void drm_sysfs_connector_remove_early(struct drm_connector *connector) { if (connector->ddc) sysfs_remove_link(&connector->kdev->kobj, "ddc"); } void drm_sysfs_connector_remove(struct drm_connector *connector) { if (!connector->kdev) return; if (dev_fwnode(connector->kdev)) component_del(connector->kdev, &typec_connector_ops); drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] removing connector from sysfs\n", connector->base.id, connector->name); device_unregister(connector->kdev); connector->kdev = NULL; } void drm_sysfs_lease_event(struct drm_device *dev) { 
char *event_string = "LEASE=1"; char *envp[] = { event_string, NULL }; drm_dbg_lease(dev, "generating lease event\n"); kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp); } /** * drm_sysfs_hotplug_event - generate a DRM uevent * @dev: DRM device * * Send a uevent for the DRM device specified by @dev. Currently we only * set HOTPLUG=1 in the uevent environment, but this could be expanded to * deal with other types of events. * * Any new uapi should be using the drm_sysfs_connector_status_event() * for uevents on connector status change. */ void drm_sysfs_hotplug_event(struct drm_device *dev) { char *event_string = "HOTPLUG=1"; char *envp[] = { event_string, NULL }; drm_dbg_kms(dev, "generating hotplug event\n"); kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp); } EXPORT_SYMBOL(drm_sysfs_hotplug_event); /** * drm_sysfs_connector_hotplug_event - generate a DRM uevent for any connector * change * @connector: connector which has changed * * Send a uevent for the DRM connector specified by @connector. This will send * a uevent with the properties HOTPLUG=1 and CONNECTOR. */ void drm_sysfs_connector_hotplug_event(struct drm_connector *connector) { struct drm_device *dev = connector->dev; char hotplug_str[] = "HOTPLUG=1", conn_id[21]; char *envp[] = { hotplug_str, conn_id, NULL }; snprintf(conn_id, sizeof(conn_id), "CONNECTOR=%u", connector->base.id); drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] generating connector hotplug event\n", connector->base.id, connector->name); kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp); } EXPORT_SYMBOL(drm_sysfs_connector_hotplug_event); /** * drm_sysfs_connector_property_event - generate a DRM uevent for connector * property change * @connector: connector on which property changed * @property: connector property which has changed. * * Send a uevent for the specified DRM connector and property. Currently we * set HOTPLUG=1 and connector id along with the attached property id * related to the change. 
*/ void drm_sysfs_connector_property_event(struct drm_connector *connector, struct drm_property *property) { struct drm_device *dev = connector->dev; char hotplug_str[] = "HOTPLUG=1", conn_id[21], prop_id[21]; char *envp[4] = { hotplug_str, conn_id, prop_id, NULL }; WARN_ON(!drm_mode_obj_find_prop_id(&connector->base, property->base.id)); snprintf(conn_id, ARRAY_SIZE(conn_id), "CONNECTOR=%u", connector->base.id); snprintf(prop_id, ARRAY_SIZE(prop_id), "PROPERTY=%u", property->base.id); drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] generating connector property event for [PROP:%d:%s]\n", connector->base.id, connector->name, property->base.id, property->name); kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp); } EXPORT_SYMBOL(drm_sysfs_connector_property_event); static ssize_t boot_display_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "1\n"); } static DEVICE_ATTR_RO(boot_display); static struct attribute *display_attrs[] = { &dev_attr_boot_display.attr, NULL }; static umode_t boot_display_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = kobj_to_dev(kobj)->parent; if (dev_is_pci(dev)) { struct pci_dev *pdev = to_pci_dev(dev); if (video_is_primary_device(&pdev->dev)) return a->mode; } return 0; } static const struct attribute_group display_attr_group = { .attrs = display_attrs, .is_visible = boot_display_visible, }; static const struct attribute_group *card_dev_groups[] = { &display_attr_group, NULL }; struct device *drm_sysfs_minor_alloc(struct drm_minor *minor) { const char *minor_str; struct device *kdev; int r; kdev = kzalloc_obj(*kdev); if (!kdev) return ERR_PTR(-ENOMEM); device_initialize(kdev); if (minor->type == DRM_MINOR_ACCEL) { minor_str = "accel%d"; accel_set_device_instance_params(kdev, minor->index); } else { if (minor->type == DRM_MINOR_RENDER) minor_str = "renderD%d"; else minor_str = "card%d"; kdev->devt = MKDEV(DRM_MAJOR, minor->index); kdev->class = drm_class; kdev->groups = card_dev_groups; kdev->type = &drm_sysfs_device_minor; } kdev->parent = minor->dev->dev; kdev->release = drm_sysfs_release; dev_set_drvdata(kdev, minor); r = dev_set_name(kdev, minor_str, minor->index); if (r < 0) goto err_free; return kdev; err_free: put_device(kdev); return ERR_PTR(r); } /** * drm_class_device_register - register new device with the DRM sysfs class * @dev: device to register * * Registers a new &struct device within the DRM sysfs class. Essentially only * used by ttm to have a place for its global settings. Drivers should never use * this. */ int drm_class_device_register(struct device *dev) { if (!drm_class || IS_ERR(drm_class)) return -ENOENT; dev->class = drm_class; return device_register(dev); } EXPORT_SYMBOL_GPL(drm_class_device_register); /** * drm_class_device_unregister - unregister device with the DRM sysfs class * @dev: device to unregister * * Unregisters a &struct device from the DRM sysfs class. Essentially only used * by ttm to have a place for its global settings. Drivers should never use * this. */ void drm_class_device_unregister(struct device *dev) { return device_unregister(dev); } EXPORT_SYMBOL_GPL(drm_class_device_unregister); |
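/*
 * Illustrative sketch (not part of drm_sysfs.c): the header comment above says
 * drivers may attach additional sysfs property groups to &drm_connector.kdev.
 * Everything named "example_*" and the "vendor_info" attribute are hypothetical;
 * the sketch only assumes helpers already used in this file (to_drm_connector(),
 * sysfs_emit()) and the generic sysfs group API.
 */
static ssize_t vendor_info_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct drm_connector *connector = to_drm_connector(dev);

	/* Report something connector-specific; here just the connector name. */
	return sysfs_emit(buf, "%s\n", connector->name);
}
static DEVICE_ATTR_RO(vendor_info);

static struct attribute *example_connector_attrs[] = {
	&dev_attr_vendor_info.attr,
	NULL
};

static const struct attribute_group example_connector_group = {
	.attrs = example_connector_attrs,
};

/* Called by the driver once the connector is registered and kdev exists. */
static int example_connector_add_group(struct drm_connector *connector)
{
	return sysfs_create_group(&connector->kdev->kobj,
				  &example_connector_group);
}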
| 6 6 20 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_DST_METADATA_H #define __NET_DST_METADATA_H 1 #include <linux/skbuff.h> #include <net/ip.h> #include <net/ip_tunnels.h> #include <net/macsec.h> #include <net/dst.h> enum metadata_type { METADATA_IP_TUNNEL, METADATA_HW_PORT_MUX, METADATA_MACSEC, METADATA_XFRM, }; struct hw_port_info { struct net_device *lower_dev; u32 port_id; }; struct macsec_info { sci_t sci; }; struct xfrm_md_info { u32 if_id; int link; struct dst_entry *dst_orig; }; struct metadata_dst { struct dst_entry dst; enum metadata_type type; union { struct ip_tunnel_info tun_info; struct hw_port_info port_info; struct macsec_info macsec_info; struct xfrm_md_info xfrm_info; } u; }; static inline struct metadata_dst *skb_metadata_dst(const struct sk_buff *skb) { struct metadata_dst *md_dst = (struct metadata_dst *) skb_dst(skb); if (md_dst && md_dst->dst.flags & DST_METADATA) return md_dst; return NULL; } static inline struct ip_tunnel_info * skb_tunnel_info(const struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); struct dst_entry *dst; if (md_dst && md_dst->type == METADATA_IP_TUNNEL) return &md_dst->u.tun_info; dst = skb_dst(skb); if (dst && dst->lwtstate && (dst->lwtstate->type == LWTUNNEL_ENCAP_IP || dst->lwtstate->type == LWTUNNEL_ENCAP_IP6)) return lwt_tun_info(dst->lwtstate); return NULL; } static inline struct xfrm_md_info *lwt_xfrm_info(struct lwtunnel_state *lwt) { return (struct xfrm_md_info *)lwt->data; } static inline struct xfrm_md_info *skb_xfrm_md_info(const struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); struct dst_entry *dst; if (md_dst && md_dst->type == METADATA_XFRM) return &md_dst->u.xfrm_info; dst = skb_dst(skb); if (dst && dst->lwtstate && dst->lwtstate->type == LWTUNNEL_ENCAP_XFRM) return lwt_xfrm_info(dst->lwtstate); return NULL; } static inline bool skb_valid_dst(const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); return dst && !(dst->flags & DST_METADATA); } static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a, const struct sk_buff *skb_b) { const struct metadata_dst *a, *b; if (!(skb_a->_skb_refdst | skb_b->_skb_refdst)) return 0; a = (const struct metadata_dst *) skb_dst(skb_a); b = (const struct metadata_dst *) skb_dst(skb_b); if (!a != !b || a->type != b->type) return 1; switch (a->type) { case METADATA_HW_PORT_MUX: return memcmp(&a->u.port_info, &b->u.port_info, sizeof(a->u.port_info)); case METADATA_IP_TUNNEL: return memcmp(&a->u.tun_info, &b->u.tun_info, 
sizeof(a->u.tun_info) + a->u.tun_info.options_len); case METADATA_MACSEC: return memcmp(&a->u.macsec_info, &b->u.macsec_info, sizeof(a->u.macsec_info)); case METADATA_XFRM: return memcmp(&a->u.xfrm_info, &b->u.xfrm_info, sizeof(a->u.xfrm_info)); default: return 1; } } void metadata_dst_free(struct metadata_dst *); struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, gfp_t flags); void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst); struct metadata_dst __percpu * metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags); static inline struct metadata_dst *tun_rx_dst(int md_size) { struct metadata_dst *tun_dst; tun_dst = metadata_dst_alloc(md_size, METADATA_IP_TUNNEL, GFP_ATOMIC); if (!tun_dst) return NULL; tun_dst->u.tun_info.options_len = 0; tun_dst->u.tun_info.mode = 0; return tun_dst; } static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); int md_size; struct metadata_dst *new_md; if (!md_dst || md_dst->type != METADATA_IP_TUNNEL) return ERR_PTR(-EINVAL); md_size = md_dst->u.tun_info.options_len; new_md = metadata_dst_alloc(md_size, METADATA_IP_TUNNEL, GFP_ATOMIC); if (!new_md) return ERR_PTR(-ENOMEM); memcpy(&new_md->u.tun_info, &md_dst->u.tun_info, sizeof(struct ip_tunnel_info) + md_size); #ifdef CONFIG_DST_CACHE /* Unclone the dst cache if there is one */ if (new_md->u.tun_info.dst_cache.cache) { int ret; ret = dst_cache_init(&new_md->u.tun_info.dst_cache, GFP_ATOMIC); if (ret) { metadata_dst_free(new_md); return ERR_PTR(ret); } } #endif skb_dst_drop(skb); skb_dst_set(skb, &new_md->dst); return new_md; } static inline struct ip_tunnel_info *skb_tunnel_info_unclone(struct sk_buff *skb) { struct metadata_dst *dst; dst = tun_dst_unclone(skb); if (IS_ERR(dst)) return NULL; return &dst->u.tun_info; } static inline struct metadata_dst *__ip_tun_set_dst(__be32 saddr, __be32 daddr, __u8 tos, __u8 ttl, __be16 tp_dst, const unsigned long *flags, __be64 tunnel_id, int md_size) { struct metadata_dst *tun_dst; tun_dst = tun_rx_dst(md_size); if (!tun_dst) return NULL; ip_tunnel_key_init(&tun_dst->u.tun_info.key, saddr, daddr, tos, ttl, 0, 0, tp_dst, tunnel_id, flags); return tun_dst; } static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, const unsigned long *flags, __be64 tunnel_id, int md_size) { const struct iphdr *iph = ip_hdr(skb); struct metadata_dst *tun_dst; tun_dst = __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl, 0, flags, tunnel_id, md_size); if (tun_dst && (iph->frag_off & htons(IP_DF))) __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, tun_dst->u.tun_info.key.tun_flags); return tun_dst; } static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *saddr, const struct in6_addr *daddr, __u8 tos, __u8 ttl, __be16 tp_dst, __be32 label, const unsigned long *flags, __be64 tunnel_id, int md_size) { struct metadata_dst *tun_dst; struct ip_tunnel_info *info; tun_dst = tun_rx_dst(md_size); if (!tun_dst) return NULL; info = &tun_dst->u.tun_info; info->mode = IP_TUNNEL_INFO_IPV6; ip_tunnel_flags_copy(info->key.tun_flags, flags); info->key.tun_id = tunnel_id; info->key.tp_src = 0; info->key.tp_dst = tp_dst; info->key.u.ipv6.src = *saddr; info->key.u.ipv6.dst = *daddr; info->key.tos = tos; info->key.ttl = ttl; info->key.label = label; return tun_dst; } static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb, const unsigned long *flags, __be64 tunnel_id, int md_size) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); return 
__ipv6_tun_set_dst(&ip6h->saddr, &ip6h->daddr, ipv6_get_dsfield(ip6h), ip6h->hop_limit, 0, ip6_flowlabel(ip6h), flags, tunnel_id, md_size); } #endif /* __NET_DST_METADATA_H */
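/*
 * Illustrative sketch (not part of this header): how a collect-metadata tunnel
 * receive path is expected to use ip_tun_rx_dst() declared above. The function
 * name example_tunnel_rx and the tun_id argument are hypothetical; the flag
 * bitmap helpers (IP_TUNNEL_DECLARE_FLAGS(), IP_TUNNEL_KEY_BIT, __set_bit())
 * are assumed to come from <net/ip_tunnels.h>, which this header includes.
 */
static inline int example_tunnel_rx(struct sk_buff *skb, __be64 tun_id)
{
	IP_TUNNEL_DECLARE_FLAGS(flags) = { };
	struct metadata_dst *tun_dst;

	__set_bit(IP_TUNNEL_KEY_BIT, flags);

	/* 0 bytes of tunnel options; allocation inside uses GFP_ATOMIC. */
	tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
	if (!tun_dst)
		return -ENOMEM;

	/* Attach the metadata dst so skb_tunnel_info() finds it later. */
	skb_dst_drop(skb);
	skb_dst_set(skb, &tun_dst->dst);

	return 0;
}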
// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ #include <linux/bpf_verifier.h> /* * Forward dataflow analysis to determine constant register values at every * instruction. Tracks 64-bit constant values in R0-R9 through the program, * using a fixed-point iteration in reverse postorder. Records which registers * hold known constants and their values in * env->insn_aux_data[].{const_reg_mask, const_reg_vals}. */ enum const_arg_state { CONST_ARG_UNVISITED, /* instruction not yet reached */ CONST_ARG_UNKNOWN, /* register value not a known constant */ CONST_ARG_CONST, /* register holds a known 64-bit constant */ CONST_ARG_MAP_PTR, /* register holds a map pointer, map_index is set */ CONST_ARG_MAP_VALUE, /* register points to map value data, val is offset */ CONST_ARG_SUBPROG, /* register holds a subprog pointer, val is subprog number */ }; struct const_arg_info { enum const_arg_state state; u32 map_index; u64 val; }; static bool ci_is_unvisited(const struct const_arg_info *ci) { return ci->state == CONST_ARG_UNVISITED; } static bool ci_is_unknown(const struct const_arg_info *ci) { return ci->state == CONST_ARG_UNKNOWN; } static bool ci_is_const(const struct const_arg_info *ci) { return ci->state == CONST_ARG_CONST; } static bool ci_is_map_value(const struct const_arg_info *ci) { return ci->state == CONST_ARG_MAP_VALUE; } /* Transfer function: compute output register state from instruction. 
*/ static void const_reg_xfer(struct bpf_verifier_env *env, struct const_arg_info *ci_out, struct bpf_insn *insn, struct bpf_insn *insns, int idx) { struct const_arg_info unknown = { .state = CONST_ARG_UNKNOWN, .val = 0 }; struct const_arg_info *dst = &ci_out[insn->dst_reg]; struct const_arg_info *src = &ci_out[insn->src_reg]; u8 class = BPF_CLASS(insn->code); u8 mode = BPF_MODE(insn->code); u8 opcode = BPF_OP(insn->code) | BPF_SRC(insn->code); int r; switch (class) { case BPF_ALU: case BPF_ALU64: switch (opcode) { case BPF_MOV | BPF_K: dst->state = CONST_ARG_CONST; dst->val = (s64)insn->imm; break; case BPF_MOV | BPF_X: *dst = *src; if (!insn->off) break; if (!ci_is_const(dst)) { *dst = unknown; break; } switch (insn->off) { case 8: dst->val = (s8)dst->val; break; case 16: dst->val = (s16)dst->val; break; case 32: dst->val = (s32)dst->val; break; default: *dst = unknown; break; } break; case BPF_ADD | BPF_K: if (!ci_is_const(dst) && !ci_is_map_value(dst)) { *dst = unknown; break; } dst->val += insn->imm; break; case BPF_SUB | BPF_K: if (!ci_is_const(dst) && !ci_is_map_value(dst)) { *dst = unknown; break; } dst->val -= insn->imm; break; case BPF_AND | BPF_K: if (!ci_is_const(dst)) { if (!insn->imm) { dst->state = CONST_ARG_CONST; dst->val = 0; } else { *dst = unknown; } break; } dst->val &= (s64)insn->imm; break; case BPF_AND | BPF_X: if (ci_is_const(dst) && dst->val == 0) break; /* 0 & x == 0 */ if (ci_is_const(src) && src->val == 0) { dst->state = CONST_ARG_CONST; dst->val = 0; break; } if (!ci_is_const(dst) || !ci_is_const(src)) { *dst = unknown; break; } dst->val &= src->val; break; default: *dst = unknown; break; } if (class == BPF_ALU) { if (ci_is_const(dst)) dst->val = (u32)dst->val; else if (!ci_is_unknown(dst)) *dst = unknown; } break; case BPF_LD: if (mode == BPF_ABS || mode == BPF_IND) goto process_call; if (mode != BPF_IMM || BPF_SIZE(insn->code) != BPF_DW) break; if (insn->src_reg == BPF_PSEUDO_FUNC) { int subprog = bpf_find_subprog(env, idx + insn->imm + 1); if (subprog >= 0) { dst->state = CONST_ARG_SUBPROG; dst->val = subprog; } else { *dst = unknown; } } else if (insn->src_reg == BPF_PSEUDO_MAP_VALUE || insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) { dst->state = CONST_ARG_MAP_VALUE; dst->map_index = env->insn_aux_data[idx].map_index; dst->val = env->insn_aux_data[idx].map_off; } else if (insn->src_reg == BPF_PSEUDO_MAP_FD || insn->src_reg == BPF_PSEUDO_MAP_IDX) { dst->state = CONST_ARG_MAP_PTR; dst->map_index = env->insn_aux_data[idx].map_index; } else if (insn->src_reg == 0) { dst->state = CONST_ARG_CONST; dst->val = (u64)(u32)insn->imm | ((u64)(u32)insns[idx + 1].imm << 32); } else { *dst = unknown; } break; case BPF_LDX: if (!ci_is_map_value(src)) { *dst = unknown; break; } struct bpf_map *map = env->used_maps[src->map_index]; int size = bpf_size_to_bytes(BPF_SIZE(insn->code)); bool is_ldsx = mode == BPF_MEMSX; int off = src->val + insn->off; u64 val = 0; if (!bpf_map_is_rdonly(map) || !map->ops->map_direct_value_addr || map->map_type == BPF_MAP_TYPE_INSN_ARRAY || off < 0 || off + size > map->value_size || bpf_map_direct_read(map, off, size, &val, is_ldsx)) { *dst = unknown; break; } dst->state = CONST_ARG_CONST; dst->val = val; break; case BPF_JMP: if (opcode != BPF_CALL) break; process_call: for (r = BPF_REG_0; r <= BPF_REG_5; r++) ci_out[r] = unknown; break; case BPF_STX: if (mode != BPF_ATOMIC) break; if (insn->imm == BPF_CMPXCHG) ci_out[BPF_REG_0] = unknown; else if (insn->imm == BPF_LOAD_ACQ) *dst = unknown; else if (insn->imm & BPF_FETCH) *src = unknown; break; } } 
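/*
 * Worked example (illustrative only): how const_reg_xfer() above evolves the
 * per-register state over a small hypothetical instruction sequence.
 *
 *   r6 = 10        ; BPF_ALU64 MOV|K    -> r6: CONST 10
 *   r6 += 5        ; BPF_ALU64 ADD|K    -> r6: CONST 15
 *   r1 = r6        ; BPF_ALU64 MOV|X    -> r1: CONST 15
 *   call foo       ; BPF_JMP   CALL     -> r0-r5: UNKNOWN, r6 still CONST 15
 *   w6 &= 0x7      ; BPF_ALU   AND|K    -> r6: CONST 7 (result truncated to u32)
 *   r6 <<= 1       ; opcode not modeled -> r6: UNKNOWN
 *
 * Only the opcodes handled explicitly by the transfer function stay constant;
 * anything else degrades the destination register to CONST_ARG_UNKNOWN.
 */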
/* Join function: merge output state into a successor's input state. */ static bool const_reg_join(struct const_arg_info *ci_target, struct const_arg_info *ci_out) { bool changed = false; int r; for (r = 0; r < MAX_BPF_REG; r++) { struct const_arg_info *old = &ci_target[r]; struct const_arg_info *new = &ci_out[r]; if (ci_is_unvisited(old) && !ci_is_unvisited(new)) { ci_target[r] = *new; changed = true; } else if (!ci_is_unknown(old) && !ci_is_unvisited(old) && (new->state != old->state || new->val != old->val || new->map_index != old->map_index)) { old->state = CONST_ARG_UNKNOWN; changed = true; } } return changed; } int bpf_compute_const_regs(struct bpf_verifier_env *env) { struct const_arg_info unknown = { .state = CONST_ARG_UNKNOWN, .val = 0 }; struct bpf_insn_aux_data *insn_aux = env->insn_aux_data; struct bpf_insn *insns = env->prog->insnsi; int insn_cnt = env->prog->len; struct const_arg_info (*ci_in)[MAX_BPF_REG]; struct const_arg_info ci_out[MAX_BPF_REG]; struct bpf_iarray *succ; bool changed; int i, r; /* kvzalloc zeroes memory, so all entries start as CONST_ARG_UNVISITED (0) */ ci_in = kvzalloc_objs(*ci_in, insn_cnt, GFP_KERNEL_ACCOUNT); if (!ci_in) return -ENOMEM; /* Subprogram entries (including main at subprog 0): all registers unknown */ for (i = 0; i < env->subprog_cnt; i++) { int start = env->subprog_info[i].start; for (r = 0; r < MAX_BPF_REG; r++) ci_in[start][r] = unknown; } redo: changed = false; for (i = env->cfg.cur_postorder - 1; i >= 0; i--) { int idx = env->cfg.insn_postorder[i]; struct bpf_insn *insn = &insns[idx]; struct const_arg_info *ci = ci_in[idx]; memcpy(ci_out, ci, sizeof(ci_out)); const_reg_xfer(env, ci_out, insn, insns, idx); succ = bpf_insn_successors(env, idx); for (int s = 0; s < succ->cnt; s++) changed |= const_reg_join(ci_in[succ->items[s]], ci_out); } if (changed) goto redo; /* Save computed constants into insn_aux[] if they fit into 32-bit */ for (i = 0; i < insn_cnt; i++) { u16 mask = 0, map_mask = 0, subprog_mask = 0; struct bpf_insn_aux_data *aux = &insn_aux[i]; struct const_arg_info *ci = ci_in[i]; for (r = BPF_REG_0; r < ARRAY_SIZE(aux->const_reg_vals); r++) { struct const_arg_info *c = &ci[r]; switch (c->state) { case CONST_ARG_CONST: { u64 val = c->val; if (val != (u32)val) break; mask |= BIT(r); aux->const_reg_vals[r] = val; break; } case CONST_ARG_MAP_PTR: map_mask |= BIT(r); aux->const_reg_vals[r] = c->map_index; break; case CONST_ARG_SUBPROG: subprog_mask |= BIT(r); aux->const_reg_vals[r] = c->val; break; default: break; } } aux->const_reg_mask = mask; aux->const_reg_map_mask = map_mask; aux->const_reg_subprog_mask = subprog_mask; } kvfree(ci_in); return 0; } static int eval_const_branch(u8 opcode, u64 dst_val, u64 src_val) { switch (BPF_OP(opcode)) { case BPF_JEQ: return dst_val == src_val; case BPF_JNE: return dst_val != src_val; case BPF_JGT: return dst_val > src_val; case BPF_JGE: return dst_val >= src_val; case BPF_JLT: return dst_val < src_val; case BPF_JLE: return dst_val <= src_val; case BPF_JSGT: return (s64)dst_val > (s64)src_val; case BPF_JSGE: return (s64)dst_val >= (s64)src_val; case BPF_JSLT: return (s64)dst_val < (s64)src_val; case BPF_JSLE: return (s64)dst_val <= (s64)src_val; case BPF_JSET: return (bool)(dst_val & src_val); default: return -1; } } /* * Rewrite conditional branches with constant outcomes into unconditional * jumps using register values resolved by bpf_compute_const_regs() pass. * This eliminates dead edges from the CFG so that compute_live_registers() * doesn't propagate liveness through dead code. 
*/ int bpf_prune_dead_branches(struct bpf_verifier_env *env) { struct bpf_insn_aux_data *insn_aux = env->insn_aux_data; struct bpf_insn *insns = env->prog->insnsi; int insn_cnt = env->prog->len; bool changed = false; int i; for (i = 0; i < insn_cnt; i++) { struct bpf_insn_aux_data *aux = &insn_aux[i]; struct bpf_insn *insn = &insns[i]; u8 class = BPF_CLASS(insn->code); u64 dst_val, src_val; int taken; if (!bpf_insn_is_cond_jump(insn->code)) continue; if (bpf_is_may_goto_insn(insn)) continue; if (!(aux->const_reg_mask & BIT(insn->dst_reg))) continue; dst_val = aux->const_reg_vals[insn->dst_reg]; if (BPF_SRC(insn->code) == BPF_K) { src_val = insn->imm; } else { if (!(aux->const_reg_mask & BIT(insn->src_reg))) continue; src_val = aux->const_reg_vals[insn->src_reg]; } if (class == BPF_JMP32) { /* * The (s32) cast maps the 32-bit range into two u64 sub-ranges: * [0x00000000, 0x7FFFFFFF] -> [0x0000000000000000, 0x000000007FFFFFFF] * [0x80000000, 0xFFFFFFFF] -> [0xFFFFFFFF80000000, 0xFFFFFFFFFFFFFFFF] * The ordering is preserved within each sub-range, and * the second sub-range is above the first as u64. */ dst_val = (s32)dst_val; src_val = (s32)src_val; } taken = eval_const_branch(insn->code, dst_val, src_val); if (taken < 0) { bpf_log(&env->log, "Unknown conditional jump %x\n", insn->code); return -EFAULT; } *insn = BPF_JMP_A(taken ? insn->off : 0); changed = true; } if (!changed) return 0; /* recompute postorder, since CFG has changed */ kvfree(env->cfg.insn_postorder); env->cfg.insn_postorder = NULL; return bpf_compute_postorder(env); } |
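/*
 * Illustrative example of the rewrite done by bpf_prune_dead_branches() on a
 * hypothetical program where r1 was proven to be the constant 0:
 *
 *   before                                   after
 *   if r1 != 0 goto +3   ; never taken   ->  goto +0   (plain fall-through)
 *   if r1 == 0 goto +2   ; always taken  ->  goto +2
 *
 * BPF_JMP_A(0) jumps to the next instruction, so the not-taken case simply
 * becomes an unconditional fall-through; in both cases one CFG edge is gone,
 * which is why the postorder has to be recomputed at the end.
 */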
// SPDX-License-Identifier: GPL-2.0 /* * USB device quirk handling logic and table * * Copyright (c) 2007 Oliver Neukum * Copyright (c) 2007 Greg Kroah-Hartman <gregkh@suse.de> */ #include <linux/moduleparam.h> #include <linux/usb.h> #include <linux/usb/quirks.h> #include <linux/usb/hcd.h> #include "usb.h" struct quirk_entry { u16 vid; u16 pid; u32 flags; }; static DEFINE_MUTEX(quirk_mutex); static struct quirk_entry 
*quirk_list; static unsigned int quirk_count; static char quirks_param[128]; static int quirks_param_set(const char *value, const struct kernel_param *kp) { char *val, *p, *field; u16 vid, pid; u32 flags; size_t i; int err; val = kstrdup(value, GFP_KERNEL); if (!val) return -ENOMEM; err = param_set_copystring(val, kp); if (err) { kfree(val); return err; } mutex_lock(&quirk_mutex); if (!*val) { quirk_count = 0; kfree(quirk_list); quirk_list = NULL; goto unlock; } for (quirk_count = 1, i = 0; val[i]; i++) if (val[i] == ',') quirk_count++; if (quirk_list) { kfree(quirk_list); quirk_list = NULL; } quirk_list = kzalloc_objs(struct quirk_entry, quirk_count); if (!quirk_list) { quirk_count = 0; mutex_unlock(&quirk_mutex); kfree(val); return -ENOMEM; } for (i = 0, p = val; p && *p;) { /* Each entry consists of VID:PID:flags */ field = strsep(&p, ":"); if (!field) break; if (kstrtou16(field, 16, &vid)) break; field = strsep(&p, ":"); if (!field) break; if (kstrtou16(field, 16, &pid)) break; field = strsep(&p, ","); if (!field || !*field) break; /* Collect the flags */ for (flags = 0; *field; field++) { switch (*field) { case 'a': flags |= USB_QUIRK_STRING_FETCH_255; break; case 'b': flags |= USB_QUIRK_RESET_RESUME; break; case 'c': flags |= USB_QUIRK_NO_SET_INTF; break; case 'd': flags |= USB_QUIRK_CONFIG_INTF_STRINGS; break; case 'e': flags |= USB_QUIRK_RESET; break; case 'f': flags |= USB_QUIRK_HONOR_BNUMINTERFACES; break; case 'g': flags |= USB_QUIRK_DELAY_INIT; break; case 'h': flags |= USB_QUIRK_LINEAR_UFRAME_INTR_BINTERVAL; break; case 'i': flags |= USB_QUIRK_DEVICE_QUALIFIER; break; case 'j': flags |= USB_QUIRK_IGNORE_REMOTE_WAKEUP; break; case 'k': flags |= USB_QUIRK_NO_LPM; break; case 'l': flags |= USB_QUIRK_LINEAR_FRAME_INTR_BINTERVAL; break; case 'm': flags |= USB_QUIRK_DISCONNECT_SUSPEND; break; case 'n': flags |= USB_QUIRK_DELAY_CTRL_MSG; break; case 'o': flags |= USB_QUIRK_HUB_SLOW_RESET; break; case 'p': flags |= USB_QUIRK_SHORT_SET_ADDRESS_REQ_TIMEOUT; break; case 'q': flags |= USB_QUIRK_FORCE_ONE_CONFIG; /* Ignore unrecognized flag characters */ } } quirk_list[i++] = (struct quirk_entry) { .vid = vid, .pid = pid, .flags = flags }; } if (i < quirk_count) quirk_count = i; unlock: mutex_unlock(&quirk_mutex); kfree(val); return 0; } static const struct kernel_param_ops quirks_param_ops = { .set = quirks_param_set, .get = param_get_string, }; static struct kparam_string quirks_param_string = { .maxlen = sizeof(quirks_param), .string = quirks_param, }; device_param_cb(quirks, &quirks_param_ops, &quirks_param_string, 0644); MODULE_PARM_DESC(quirks, "Add/modify USB quirks by specifying quirks=vendorID:productID:quirks"); /* Lists of quirky USB devices, split in device quirks and interface quirks. * Device quirks are applied at the very beginning of the enumeration process, * right after reading the device descriptor. They can thus only match on device * information. * * Interface quirks are applied after reading all the configuration descriptors. * They can match on both device and interface information. * * Note that the DELAY_INIT and HONOR_BNUMINTERFACES quirks do not make sense as * interface quirks, as they only influence the enumeration process which is run * before processing the interface quirks. 
* * Please keep the lists ordered by: * 1) Vendor ID * 2) Product ID * 3) Class ID */ static const struct usb_device_id usb_quirk_list[] = { /* CBM - Flash disk */ { USB_DEVICE(0x0204, 0x6025), .driver_info = USB_QUIRK_RESET_RESUME }, /* WORLDE Controller KS49 or Prodipe MIDI 49C USB controller */ { USB_DEVICE(0x0218, 0x0201), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* WORLDE easy key (easykey.25) MIDI controller */ { USB_DEVICE(0x0218, 0x0401), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* HP 5300/5370C scanner */ { USB_DEVICE(0x03f0, 0x0701), .driver_info = USB_QUIRK_STRING_FETCH_255 }, /* HP v222w 16GB Mini USB Drive */ { USB_DEVICE(0x03f0, 0x3f40), .driver_info = USB_QUIRK_DELAY_INIT }, /* Huawei 4G LTE module ME906S */ { USB_DEVICE(0x03f0, 0xa31d), .driver_info = USB_QUIRK_DISCONNECT_SUSPEND }, /* Creative SB Audigy 2 NX */ { USB_DEVICE(0x041e, 0x3020), .driver_info = USB_QUIRK_RESET_RESUME }, /* USB3503 */ { USB_DEVICE(0x0424, 0x3503), .driver_info = USB_QUIRK_RESET_RESUME }, /* Microsoft Wireless Laser Mouse 6000 Receiver */ { USB_DEVICE(0x045e, 0x00e1), .driver_info = USB_QUIRK_RESET_RESUME }, /* Microsoft LifeCam-VX700 v2.0 */ { USB_DEVICE(0x045e, 0x0770), .driver_info = USB_QUIRK_RESET_RESUME }, /* Microsoft Surface Dock Ethernet (RTL8153 GigE) */ { USB_DEVICE(0x045e, 0x07c6), .driver_info = USB_QUIRK_NO_LPM }, /* Cherry Stream G230 2.0 (G85-231) and 3.0 (G85-232) */ { USB_DEVICE(0x046a, 0x0023), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech HD Webcam C270 */ { USB_DEVICE(0x046d, 0x0825), .driver_info = USB_QUIRK_RESET_RESUME | USB_QUIRK_NO_LPM}, /* Logitech HD Pro Webcams C920, C920-C, C922, C925e and C930e */ { USB_DEVICE(0x046d, 0x082d), .driver_info = USB_QUIRK_DELAY_INIT }, { USB_DEVICE(0x046d, 0x0841), .driver_info = USB_QUIRK_DELAY_INIT }, { USB_DEVICE(0x046d, 0x0843), .driver_info = USB_QUIRK_DELAY_INIT }, { USB_DEVICE(0x046d, 0x085b), .driver_info = USB_QUIRK_DELAY_INIT }, { USB_DEVICE(0x046d, 0x085c), .driver_info = USB_QUIRK_DELAY_INIT }, /* Logitech ConferenceCam CC3000e */ { USB_DEVICE(0x046d, 0x0847), .driver_info = USB_QUIRK_DELAY_INIT }, { USB_DEVICE(0x046d, 0x0848), .driver_info = USB_QUIRK_DELAY_INIT }, /* Logitech PTZ Pro Camera */ { USB_DEVICE(0x046d, 0x0853), .driver_info = USB_QUIRK_DELAY_INIT }, /* Logitech Screen Share */ { USB_DEVICE(0x046d, 0x086c), .driver_info = USB_QUIRK_NO_LPM }, /* Logitech Quickcam Fusion */ { USB_DEVICE(0x046d, 0x08c1), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Quickcam Orbit MP */ { USB_DEVICE(0x046d, 0x08c2), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Quickcam Pro for Notebook */ { USB_DEVICE(0x046d, 0x08c3), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Quickcam Pro 5000 */ { USB_DEVICE(0x046d, 0x08c5), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Quickcam OEM Dell Notebook */ { USB_DEVICE(0x046d, 0x08c6), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Quickcam OEM Cisco VT Camera II */ { USB_DEVICE(0x046d, 0x08c7), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Harmony 700-series */ { USB_DEVICE(0x046d, 0xc122), .driver_info = USB_QUIRK_DELAY_INIT }, /* Philips PSC805 audio device */ { USB_DEVICE(0x0471, 0x0155), .driver_info = USB_QUIRK_RESET_RESUME }, /* Plantronic Audio 655 DSP */ { USB_DEVICE(0x047f, 0xc008), .driver_info = USB_QUIRK_RESET_RESUME }, /* Plantronic Audio 648 USB */ { USB_DEVICE(0x047f, 0xc013), .driver_info = USB_QUIRK_RESET_RESUME }, /* Artisman Watchdog Dongle */ { USB_DEVICE(0x04b4, 0x0526), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, 
/* Microchip Joss Optical infrared touchboard device */ { USB_DEVICE(0x04d8, 0x000c), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* CarrolTouch 4000U */ { USB_DEVICE(0x04e7, 0x0009), .driver_info = USB_QUIRK_RESET_RESUME }, /* CarrolTouch 4500U */ { USB_DEVICE(0x04e7, 0x0030), .driver_info = USB_QUIRK_RESET_RESUME }, /* Samsung Android phone modem - ID conflict with SPH-I500 */ { USB_DEVICE(0x04e8, 0x6601), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* Elan Touchscreen */ { USB_DEVICE(0x04f3, 0x0089), .driver_info = USB_QUIRK_DEVICE_QUALIFIER }, { USB_DEVICE(0x04f3, 0x009b), .driver_info = USB_QUIRK_DEVICE_QUALIFIER }, { USB_DEVICE(0x04f3, 0x010c), .driver_info = USB_QUIRK_DEVICE_QUALIFIER }, { USB_DEVICE(0x04f3, 0x0125), .driver_info = USB_QUIRK_DEVICE_QUALIFIER }, { USB_DEVICE(0x04f3, 0x016f), .driver_info = USB_QUIRK_DEVICE_QUALIFIER }, { USB_DEVICE(0x04f3, 0x0381), .driver_info = USB_QUIRK_NO_LPM }, { USB_DEVICE(0x04f3, 0x21b8), .driver_info = USB_QUIRK_DEVICE_QUALIFIER }, /* Roland SC-8820 */ { USB_DEVICE(0x0582, 0x0007), .driver_info = USB_QUIRK_RESET_RESUME }, /* Edirol SD-20 */ { USB_DEVICE(0x0582, 0x0027), .driver_info = USB_QUIRK_RESET_RESUME }, /* Alcor Micro Corp. Hub */ { USB_DEVICE(0x058f, 0x9254), .driver_info = USB_QUIRK_RESET_RESUME }, /* appletouch */ { USB_DEVICE(0x05ac, 0x021a), .driver_info = USB_QUIRK_RESET_RESUME }, /* Genesys Logic hub, internally used by KY-688 USB 3.1 Type-C Hub */ { USB_DEVICE(0x05e3, 0x0612), .driver_info = USB_QUIRK_NO_LPM }, /* ELSA MicroLink 56K */ { USB_DEVICE(0x05cc, 0x2267), .driver_info = USB_QUIRK_RESET_RESUME }, /* Genesys Logic hub, internally used by Moshi USB to Ethernet Adapter */ { USB_DEVICE(0x05e3, 0x0616), .driver_info = USB_QUIRK_NO_LPM }, /* Avision AV600U */ { USB_DEVICE(0x0638, 0x0a13), .driver_info = USB_QUIRK_STRING_FETCH_255 }, /* Prolific Single-LUN Mass Storage Card Reader */ { USB_DEVICE(0x067b, 0x2731), .driver_info = USB_QUIRK_DELAY_INIT | USB_QUIRK_NO_LPM }, /* Saitek Cyborg Gold Joystick */ { USB_DEVICE(0x06a3, 0x0006), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* Agfa SNAPSCAN 1212U */ { USB_DEVICE(0x06bd, 0x0001), .driver_info = USB_QUIRK_RESET_RESUME }, /* Guillemot Webcam Hercules Dualpix Exchange (2nd ID) */ { USB_DEVICE(0x06f8, 0x0804), .driver_info = USB_QUIRK_RESET_RESUME }, /* Guillemot Webcam Hercules Dualpix Exchange*/ { USB_DEVICE(0x06f8, 0x3005), .driver_info = USB_QUIRK_RESET_RESUME }, /* Guillemot Hercules DJ Console audio card (BZ 208357) */ { USB_DEVICE(0x06f8, 0xb000), .driver_info = USB_QUIRK_ENDPOINT_IGNORE }, /* Midiman M-Audio Keystation 88es */ { USB_DEVICE(0x0763, 0x0192), .driver_info = USB_QUIRK_RESET_RESUME }, /* SanDisk Ultra Fit and Ultra Flair */ { USB_DEVICE(0x0781, 0x5583), .driver_info = USB_QUIRK_NO_LPM }, { USB_DEVICE(0x0781, 0x5591), .driver_info = USB_QUIRK_NO_LPM }, /* SanDisk Corp. 
SanDisk 3.2Gen1 */ { USB_DEVICE(0x0781, 0x5596), .driver_info = USB_QUIRK_DELAY_INIT }, { USB_DEVICE(0x0781, 0x55a3), .driver_info = USB_QUIRK_DELAY_INIT }, /* SanDisk Extreme 55AE */ { USB_DEVICE(0x0781, 0x55ae), .driver_info = USB_QUIRK_NO_LPM }, /* Avermedia Live Gamer Ultra 2.1 (GC553G2) - BOS descriptor fetch hangs at SuperSpeed Plus */ { USB_DEVICE(0x07ca, 0x2553), .driver_info = USB_QUIRK_NO_BOS }, /* Realforce 87U Keyboard */ { USB_DEVICE(0x0853, 0x011b), .driver_info = USB_QUIRK_NO_LPM }, /* M-Systems Flash Disk Pioneers */ { USB_DEVICE(0x08ec, 0x1000), .driver_info = USB_QUIRK_RESET_RESUME }, /* Baum Vario Ultra */ { USB_DEVICE(0x0904, 0x6101), .driver_info = USB_QUIRK_LINEAR_FRAME_INTR_BINTERVAL }, { USB_DEVICE(0x0904, 0x6102), .driver_info = USB_QUIRK_LINEAR_FRAME_INTR_BINTERVAL }, { USB_DEVICE(0x0904, 0x6103), .driver_info = USB_QUIRK_LINEAR_FRAME_INTR_BINTERVAL }, /* Silicon Motion Flash Drive */ { USB_DEVICE(0x090c, 0x1000), .driver_info = USB_QUIRK_DELAY_INIT }, { USB_DEVICE(0x090c, 0x2000), .driver_info = USB_QUIRK_DELAY_INIT }, /* Sound Devices USBPre2 */ { USB_DEVICE(0x0926, 0x0202), .driver_info = USB_QUIRK_ENDPOINT_IGNORE }, /* Sound Devices MixPre-D */ { USB_DEVICE(0x0926, 0x0208), .driver_info = USB_QUIRK_ENDPOINT_IGNORE }, /* Keytouch QWERTY Panel keyboard */ { USB_DEVICE(0x0926, 0x3333), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* Kingston DataTraveler 3.0 */ { USB_DEVICE(0x0951, 0x1666), .driver_info = USB_QUIRK_NO_LPM }, /* TOSHIBA TransMemory-Mx */ { USB_DEVICE(0x0930, 0x1408), .driver_info = USB_QUIRK_NO_LPM }, /* NVIDIA Jetson devices in Force Recovery mode */ { USB_DEVICE(0x0955, 0x7018), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x0955, 0x7019), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x0955, 0x7418), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x0955, 0x7721), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x0955, 0x7c18), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x0955, 0x7e19), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x0955, 0x7f21), .driver_info = USB_QUIRK_RESET_RESUME }, /* X-Rite/Gretag-Macbeth Eye-One Pro display colorimeter */ { USB_DEVICE(0x0971, 0x2000), .driver_info = USB_QUIRK_NO_SET_INTF }, /* ELMO L-12F document camera */ { USB_DEVICE(0x09a1, 0x0028), .driver_info = USB_QUIRK_DELAY_CTRL_MSG }, /* Broadcom BCM92035DGROM BT dongle */ { USB_DEVICE(0x0a5c, 0x2021), .driver_info = USB_QUIRK_RESET_RESUME }, /* MAYA44USB sound device */ { USB_DEVICE(0x0a92, 0x0091), .driver_info = USB_QUIRK_RESET_RESUME }, /* ASUS Base Station(T100) */ { USB_DEVICE(0x0b05, 0x17e0), .driver_info = USB_QUIRK_IGNORE_REMOTE_WAKEUP }, /* ASUS TUF 4K PRO - BOS descriptor fetch hangs at SuperSpeed Plus */ { USB_DEVICE(0x0b05, 0x1ab9), .driver_info = USB_QUIRK_NO_BOS }, /* Realtek Semiconductor Corp. 
Mass Storage Device (Multicard Reader)*/ { USB_DEVICE(0x0bda, 0x0151), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* Realtek hub in Dell WD19 (Type-C) */ { USB_DEVICE(0x0bda, 0x0487), .driver_info = USB_QUIRK_NO_LPM }, /* Generic RTL8153 based ethernet adapters */ { USB_DEVICE(0x0bda, 0x8153), .driver_info = USB_QUIRK_NO_LPM }, /* SONiX USB DEVICE Touchpad */ { USB_DEVICE(0x0c45, 0x7056), .driver_info = USB_QUIRK_IGNORE_REMOTE_WAKEUP }, /* Elgato 4K X - BOS descriptor fetch hangs at SuperSpeed Plus */ { USB_DEVICE(0x0fd9, 0x009b), .driver_info = USB_QUIRK_NO_BOS }, /* Sony Xperia XZ1 Compact (lilac) smartphone in fastboot mode */ { USB_DEVICE(0x0fce, 0x0dde), .driver_info = USB_QUIRK_NO_LPM }, /* Action Semiconductor flash disk */ { USB_DEVICE(0x10d6, 0x2200), .driver_info = USB_QUIRK_STRING_FETCH_255 }, /* novation SoundControl XL */ { USB_DEVICE(0x1235, 0x0061), .driver_info = USB_QUIRK_RESET_RESUME }, /* Focusrite Scarlett Solo USB */ { USB_DEVICE(0x1235, 0x8211), .driver_info = USB_QUIRK_DISCONNECT_SUSPEND }, /* Huawei 4G LTE module */ { USB_DEVICE(0x12d1, 0x15bb), .driver_info = USB_QUIRK_DISCONNECT_SUSPEND }, { USB_DEVICE(0x12d1, 0x15c1), .driver_info = USB_QUIRK_DISCONNECT_SUSPEND }, { USB_DEVICE(0x12d1, 0x15c3), .driver_info = USB_QUIRK_DISCONNECT_SUSPEND }, /* SKYMEDI USB_DRIVE */ { USB_DEVICE(0x1516, 0x8628), .driver_info = USB_QUIRK_RESET_RESUME }, /* Razer - Razer Blade Keyboard */ { USB_DEVICE(0x1532, 0x0116), .driver_info = USB_QUIRK_LINEAR_UFRAME_INTR_BINTERVAL }, /* Razer - Razer Kiyo Pro Webcam */ { USB_DEVICE(0x1532, 0x0e05), .driver_info = USB_QUIRK_NO_LPM }, /* Lenovo ThinkPad OneLink+ Dock twin hub controllers (VIA Labs VL812) */ { USB_DEVICE(0x17ef, 0x1018), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x17ef, 0x1019), .driver_info = USB_QUIRK_RESET_RESUME }, /* Lenovo USB-C to Ethernet Adapter RTL8153-04 */ { USB_DEVICE(0x17ef, 0x720c), .driver_info = USB_QUIRK_NO_LPM }, /* Lenovo Powered USB-C Travel Hub (4X90S92381, RTL8153 GigE) */ { USB_DEVICE(0x17ef, 0x721e), .driver_info = USB_QUIRK_NO_LPM }, /* Lenovo ThinkCenter A630Z TI024Gen3 usb-audio */ { USB_DEVICE(0x17ef, 0xa012), .driver_info = USB_QUIRK_DISCONNECT_SUSPEND }, /* Lenovo ThinkPad USB-C Dock Gen2 Ethernet (RTL8153 GigE) */ { USB_DEVICE(0x17ef, 0xa387), .driver_info = USB_QUIRK_NO_LPM }, /* BUILDWIN Photo Frame */ { USB_DEVICE(0x1908, 0x1315), .driver_info = USB_QUIRK_HONOR_BNUMINTERFACES }, /* Protocol and OTG Electrical Test Device */ { USB_DEVICE(0x1a0a, 0x0200), .driver_info = USB_QUIRK_LINEAR_UFRAME_INTR_BINTERVAL }, /* Terminus Technology Inc. 
Hub */ { USB_DEVICE(0x1a40, 0x0101), .driver_info = USB_QUIRK_HUB_SLOW_RESET }, /* Corsair K70 RGB */ { USB_DEVICE(0x1b1c, 0x1b13), .driver_info = USB_QUIRK_DELAY_INIT | USB_QUIRK_DELAY_CTRL_MSG }, /* Corsair Strafe */ { USB_DEVICE(0x1b1c, 0x1b15), .driver_info = USB_QUIRK_DELAY_INIT | USB_QUIRK_DELAY_CTRL_MSG }, /* Corsair Strafe RGB */ { USB_DEVICE(0x1b1c, 0x1b20), .driver_info = USB_QUIRK_DELAY_INIT | USB_QUIRK_DELAY_CTRL_MSG }, /* Corsair K70 LUX RGB */ { USB_DEVICE(0x1b1c, 0x1b33), .driver_info = USB_QUIRK_DELAY_INIT }, /* Corsair K70 LUX */ { USB_DEVICE(0x1b1c, 0x1b36), .driver_info = USB_QUIRK_DELAY_INIT }, /* Corsair K70 RGB RAPDIFIRE */ { USB_DEVICE(0x1b1c, 0x1b38), .driver_info = USB_QUIRK_DELAY_INIT | USB_QUIRK_DELAY_CTRL_MSG }, /* START BP-850k Printer */ { USB_DEVICE(0x1bc3, 0x0003), .driver_info = USB_QUIRK_NO_SET_INTF }, /* MIDI keyboard WORLDE MINI */ { USB_DEVICE(0x1c75, 0x0204), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* Acer C120 LED Projector */ { USB_DEVICE(0x1de1, 0xc102), .driver_info = USB_QUIRK_NO_LPM }, /* Blackmagic Design Intensity Shuttle */ { USB_DEVICE(0x1edb, 0xbd3b), .driver_info = USB_QUIRK_NO_LPM }, /* Blackmagic Design UltraStudio SDI */ { USB_DEVICE(0x1edb, 0xbd4f), .driver_info = USB_QUIRK_NO_LPM }, /* Teclast disk */ { USB_DEVICE(0x1f75, 0x0917), .driver_info = USB_QUIRK_NO_LPM }, /* Hauppauge HVR-950q */ { USB_DEVICE(0x2040, 0x7200), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* VLI disk */ { USB_DEVICE(0x2109, 0x0711), .driver_info = USB_QUIRK_NO_LPM }, /* Raydium Touchscreen */ { USB_DEVICE(0x2386, 0x3114), .driver_info = USB_QUIRK_NO_LPM }, { USB_DEVICE(0x2386, 0x3119), .driver_info = USB_QUIRK_NO_LPM }, { USB_DEVICE(0x2386, 0x350e), .driver_info = USB_QUIRK_NO_LPM }, /* UGREEN 35871 - BOS descriptor fetch hangs at SuperSpeed Plus */ { USB_DEVICE(0x2b89, 0x5871), .driver_info = USB_QUIRK_NO_BOS }, /* APTIV AUTOMOTIVE HUB */ { USB_DEVICE(0x2c48, 0x0132), .driver_info = USB_QUIRK_SHORT_SET_ADDRESS_REQ_TIMEOUT }, /* DJI CineSSD */ { USB_DEVICE(0x2ca3, 0x0031), .driver_info = USB_QUIRK_NO_LPM }, /* Alcor Link AK9563 SC Reader used in 2022 Lenovo ThinkPads */ { USB_DEVICE(0x2ce3, 0x9563), .driver_info = USB_QUIRK_NO_LPM }, /* ezcap401 - BOS descriptor fetch hangs at SuperSpeed Plus */ { USB_DEVICE(0x32ed, 0x0401), .driver_info = USB_QUIRK_NO_BOS }, /* DELL USB GEN2 */ { USB_DEVICE(0x413c, 0xb062), .driver_info = USB_QUIRK_NO_LPM | USB_QUIRK_RESET_RESUME }, /* VCOM device */ { USB_DEVICE(0x4296, 0x7570), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* Noji-MCS SmartCard Reader */ { USB_DEVICE(0x5131, 0x2007), .driver_info = USB_QUIRK_FORCE_ONE_CONFIG }, /* INTEL VALUE SSD */ { USB_DEVICE(0x8086, 0xf1a5), .driver_info = USB_QUIRK_RESET_RESUME }, { } /* terminating entry must be last */ }; static const struct usb_device_id usb_interface_quirk_list[] = { /* Logitech UVC Cameras */ { USB_VENDOR_AND_INTERFACE_INFO(0x046d, USB_CLASS_VIDEO, 1, 0), .driver_info = USB_QUIRK_RESET_RESUME }, { } /* terminating entry must be last */ }; static const struct usb_device_id usb_amd_resume_quirk_list[] = { /* Lenovo Mouse with Pixart controller */ { USB_DEVICE(0x17ef, 0x602e), .driver_info = USB_QUIRK_RESET_RESUME }, /* Pixart Mouse */ { USB_DEVICE(0x093a, 0x2500), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x093a, 0x2510), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x093a, 0x2521), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x03f0, 0x2b4a), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Optical Mouse M90/M100 */ { 
USB_DEVICE(0x046d, 0xc05a), .driver_info = USB_QUIRK_RESET_RESUME }, { } /* terminating entry must be last */ }; /* * Entries for endpoints that should be ignored when parsing configuration * descriptors. * * Matched for devices with USB_QUIRK_ENDPOINT_IGNORE. */ static const struct usb_device_id usb_endpoint_ignore[] = { { USB_DEVICE_INTERFACE_NUMBER(0x06f8, 0xb000, 5), .driver_info = 0x01 }, { USB_DEVICE_INTERFACE_NUMBER(0x06f8, 0xb000, 5), .driver_info = 0x81 }, { USB_DEVICE_INTERFACE_NUMBER(0x0926, 0x0202, 1), .driver_info = 0x85 }, { USB_DEVICE_INTERFACE_NUMBER(0x0926, 0x0208, 1), .driver_info = 0x85 }, { } }; bool usb_endpoint_is_ignored(struct usb_device *udev, struct usb_host_interface *intf, struct usb_endpoint_descriptor *epd) { const struct usb_device_id *id; unsigned int address; for (id = usb_endpoint_ignore; id->match_flags; ++id) { if (!usb_match_device(udev, id)) continue; if (!usb_match_one_id_intf(udev, intf, id)) continue; address = id->driver_info; if (address == epd->bEndpointAddress) return true; } return false; } static bool usb_match_any_interface(struct usb_device *udev, const struct usb_device_id *id) { unsigned int i; for (i = 0; i < udev->descriptor.bNumConfigurations; ++i) { struct usb_host_config *cfg = &udev->config[i]; unsigned int j; for (j = 0; j < cfg->desc.bNumInterfaces; ++j) { struct usb_interface_cache *cache; struct usb_host_interface *intf; cache = cfg->intf_cache[j]; if (cache->num_altsetting == 0) continue; intf = &cache->altsetting[0]; if (usb_match_one_id_intf(udev, intf, id)) return true; } } return false; } static int usb_amd_resume_quirk(struct usb_device *udev) { struct usb_hcd *hcd; hcd = bus_to_hcd(udev->bus); /* The device should be attached directly to root hub */ if (udev->level == 1 && hcd->amd_resume_bug == 1) return 1; return 0; } static u32 usb_detect_static_quirks(struct usb_device *udev, const struct usb_device_id *id) { u32 quirks = 0; for (; id->match_flags; id++) { if (!usb_match_device(udev, id)) continue; if ((id->match_flags & USB_DEVICE_ID_MATCH_INT_INFO) && !usb_match_any_interface(udev, id)) continue; quirks |= (u32)(id->driver_info); } return quirks; } static u32 usb_detect_dynamic_quirks(struct usb_device *udev) { u16 vid = le16_to_cpu(udev->descriptor.idVendor); u16 pid = le16_to_cpu(udev->descriptor.idProduct); int i, flags = 0; mutex_lock(&quirk_mutex); for (i = 0; i < quirk_count; i++) { if (vid == quirk_list[i].vid && pid == quirk_list[i].pid) { flags = quirk_list[i].flags; break; } } mutex_unlock(&quirk_mutex); return flags; } /* * Detect any quirks the device has, and do any housekeeping for it if needed. */ void usb_detect_quirks(struct usb_device *udev) { udev->quirks = usb_detect_static_quirks(udev, usb_quirk_list); /* * Pixart-based mice would trigger remote wakeup issue on AMD * Yangtze chipset, so set them as RESET_RESUME flag. 
*/ if (usb_amd_resume_quirk(udev)) udev->quirks |= usb_detect_static_quirks(udev, usb_amd_resume_quirk_list); udev->quirks ^= usb_detect_dynamic_quirks(udev); if (udev->quirks) dev_dbg(&udev->dev, "USB quirks for this device: 0x%x\n", udev->quirks); #ifdef CONFIG_USB_DEFAULT_PERSIST if (!(udev->quirks & USB_QUIRK_RESET)) udev->persist_enabled = 1; #else /* Hubs are automatically enabled for USB-PERSIST */ if (udev->descriptor.bDeviceClass == USB_CLASS_HUB) udev->persist_enabled = 1; #endif /* CONFIG_USB_DEFAULT_PERSIST */ } void usb_detect_interface_quirks(struct usb_device *udev) { u32 quirks; quirks = usb_detect_static_quirks(udev, usb_interface_quirk_list); if (quirks == 0) return; dev_dbg(&udev->dev, "USB interface quirks for this device: %x\n", quirks); udev->quirks |= quirks; } void usb_release_quirk_list(void) { mutex_lock(&quirk_mutex); kfree(quirk_list); quirk_list = NULL; mutex_unlock(&quirk_mutex); } |
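/*
 * Usage example for the "quirks" parameter parsed by quirks_param_set() above
 * (the device IDs are purely illustrative):
 *
 *   usbcore.quirks=0781:5583:k,1b1c:1b20:gn
 *
 * adds USB_QUIRK_NO_LPM ('k') for device 0781:5583 and USB_QUIRK_DELAY_INIT
 * plus USB_QUIRK_DELAY_CTRL_MSG ('g', 'n') for device 1b1c:1b20. Entries are
 * comma separated, each one vendorID:productID:flag-letters as matched by the
 * switch statement in quirks_param_set(). The same string may be written to
 * /sys/module/usbcore/parameters/quirks at run time; dynamic quirks are XORed
 * into the static table by usb_detect_quirks() when a device is enumerated.
 */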
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PKRU_H #define _ASM_X86_PKRU_H #include <asm/cpufeature.h> #define PKRU_AD_BIT 0x1u #define PKRU_WD_BIT 0x2u #define PKRU_BITS_PER_PKEY 2 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS extern u32 init_pkru_value; #define pkru_get_init_value() READ_ONCE(init_pkru_value) #else #define init_pkru_value 0 #define pkru_get_init_value() 0 #endif static inline bool __pkru_allows_read(u32 pkru, u16 pkey) { int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY; return !(pkru & (PKRU_AD_BIT << pkru_pkey_bits)); } static inline bool __pkru_allows_write(u32 pkru, u16 pkey) { int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY; /* * Access-disable disables writes too so we need to check * both bits here. */ return !(pkru & ((PKRU_AD_BIT|PKRU_WD_BIT) << pkru_pkey_bits)); } static inline u32 read_pkru(void) { if (cpu_feature_enabled(X86_FEATURE_OSPKE)) return rdpkru(); return 0; } static inline void write_pkru(u32 pkru) { if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return; /* * WRPKRU is relatively expensive compared to RDPKRU. * Avoid WRPKRU when it would not change the value. */ if (pkru != rdpkru()) wrpkru(pkru); } static inline void pkru_write_default(void) { if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return; wrpkru(pkru_get_init_value()); } #endif
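/*
 * Worked example (illustrative): with PKRU_BITS_PER_PKEY == 2, pkey N owns
 * bits [2N+1:2N] of PKRU. For pkru == 0x8 only the write-disable bit of
 * pkey 1 is set, so:
 *
 *   __pkru_allows_read(0x8, 1)  -> true   (AD bit 0x4 is clear)
 *   __pkru_allows_write(0x8, 1) -> false  (WD bit 0x8 is set)
 *   __pkru_allows_write(0x8, 0) -> true   (pkey 0 bits 0x3 are clear)
 *
 * A hypothetical helper built only from the definitions above, producing a
 * PKRU value that denies access to every pkey except one:
 */
static inline u32 example_pkru_allow_only(u16 pkey)
{
	u32 pkru = ~0u;	/* AD + WD set for every protection key */

	pkru &= ~((PKRU_AD_BIT | PKRU_WD_BIT) << (pkey * PKRU_BITS_PER_PKEY));
	return pkru;
}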
| 20 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _X_TABLES_H #define _X_TABLES_H #include <linux/netdevice.h> #include <linux/static_key.h> #include <linux/netfilter.h> #include <uapi/linux/netfilter/x_tables.h> /* Test a struct->invflags and a boolean for inequality */ #define NF_INVF(ptr, flag, boolean) \ ((boolean) ^ !!((ptr)->invflags & (flag))) /** * struct xt_action_param - parameters for matches/targets * * @match: the match extension * @target: the target extension * @matchinfo: per-match data * @targetinfo: per-target data * @state: pointer to hook state this packet came from * @fragoff: packet is a fragment, this is the data offset * @thoff: position of transport header relative to skb->data * * Fields written to by extensions: * * @hotdrop: drop packet if we had inspection problems */ struct xt_action_param { union { const struct xt_match *match; const struct xt_target *target; }; union { const void *matchinfo, *targinfo; }; const struct nf_hook_state *state; unsigned int thoff; u16 fragoff; bool hotdrop; }; static inline struct net *xt_net(const struct xt_action_param *par) { return par->state->net; } static inline struct net_device *xt_in(const struct xt_action_param *par) { return par->state->in; } static inline struct net_device *xt_out(const struct xt_action_param *par) { return par->state->out; } static inline unsigned int xt_hooknum(const struct xt_action_param *par) { return par->state->hook; } static inline u_int8_t xt_family(const struct xt_action_param *par) { return par->state->pf; } /** * struct 
xt_mtchk_param - parameters for match extensions' * checkentry functions * * @net: network namespace through which the check was invoked * @table: table the rule is tried to be inserted into * @entryinfo: the family-specific rule data * (struct ipt_ip, ip6t_ip, arpt_arp or (note) ebt_entry) * @match: struct xt_match through which this function was invoked * @matchinfo: per-match data * @hook_mask: via which hooks the new rule is reachable * Other fields as above. */ struct xt_mtchk_param { struct net *net; const char *table; const void *entryinfo; const struct xt_match *match; void *matchinfo; unsigned int hook_mask; u_int8_t family; bool nft_compat; }; /** * struct xt_mdtor_param - match destructor parameters * Fields as above. */ struct xt_mtdtor_param { struct net *net; const struct xt_match *match; void *matchinfo; u_int8_t family; }; /** * struct xt_tgchk_param - parameters for target extensions' * checkentry functions * * @entryinfo: the family-specific rule data * (struct ipt_entry, ip6t_entry, arpt_entry, ebt_entry) * * Other fields see above. */ struct xt_tgchk_param { struct net *net; const char *table; const void *entryinfo; const struct xt_target *target; void *targinfo; unsigned int hook_mask; u_int8_t family; bool nft_compat; }; /* Target destructor parameters */ struct xt_tgdtor_param { struct net *net; const struct xt_target *target; void *targinfo; u_int8_t family; }; struct xt_match { struct list_head list; const char name[XT_EXTENSION_MAXNAMELEN]; u_int8_t revision; /* Return true or false: return FALSE and set *hotdrop = 1 to force immediate packet drop. */ /* Arguments changed since 2.6.9, as this must now handle non-linear skb, using skb_header_pointer and skb_ip_make_writable. */ bool (*match)(const struct sk_buff *skb, struct xt_action_param *); /* Called when user tries to insert an entry of this type. */ int (*checkentry)(const struct xt_mtchk_param *); /* Called when entry of this type deleted. */ void (*destroy)(const struct xt_mtdtor_param *); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT /* Called when userspace align differs from kernel space one */ void (*compat_from_user)(void *dst, const void *src); int (*compat_to_user)(void __user *dst, const void *src); #endif /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; const char *table; unsigned int matchsize; unsigned int usersize; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT unsigned int compatsize; #endif unsigned int hooks; unsigned short proto; unsigned short family; }; /* Registration hooks for targets. */ struct xt_target { struct list_head list; const char name[XT_EXTENSION_MAXNAMELEN]; u_int8_t revision; /* Returns verdict. Argument order changed since 2.6.9, as this must now handle non-linear skbs, using skb_copy_bits and skb_ip_make_writable. */ unsigned int (*target)(struct sk_buff *skb, const struct xt_action_param *); /* Called when user tries to insert an entry of this type: hook_mask is a bitmask of hooks from which it can be called. */ /* Should return 0 on success or an error code otherwise (-Exxxx). */ int (*checkentry)(const struct xt_tgchk_param *); /* Called when entry of this type deleted. 
*/ void (*destroy)(const struct xt_tgdtor_param *); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT /* Called when userspace align differs from kernel space one */ void (*compat_from_user)(void *dst, const void *src); int (*compat_to_user)(void __user *dst, const void *src); #endif /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; const char *table; unsigned int targetsize; unsigned int usersize; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT unsigned int compatsize; #endif unsigned int hooks; unsigned short proto; unsigned short family; }; /* Furniture shopping... */ struct xt_table { struct list_head list; /* What hooks you will enter on */ unsigned int valid_hooks; /* Man behind the curtain... */ struct xt_table_info *private; /* hook ops that register the table with the netfilter core */ struct nf_hook_ops *ops; /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; u_int8_t af; /* address/protocol family */ int priority; /* hook order */ /* A unique name... */ const char name[XT_TABLE_MAXNAMELEN]; }; #include <linux/netfilter_ipv4.h> /* The table itself */ struct xt_table_info { /* Size per table */ unsigned int size; /* Number of entries: FIXME. --RR */ unsigned int number; /* Initial number of entries. Needed for module usage count */ unsigned int initial_entries; /* Entry points and underflows */ unsigned int hook_entry[NF_INET_NUMHOOKS]; unsigned int underflow[NF_INET_NUMHOOKS]; /* * Number of user chains. Since tables cannot have loops, at most * @stacksize jumps (number of user chains) can possibly be made. */ unsigned int stacksize; void ***jumpstack; unsigned char entries[] __aligned(8); }; int xt_register_target(struct xt_target *target); void xt_unregister_target(struct xt_target *target); int xt_register_targets(struct xt_target *target, unsigned int n); void xt_unregister_targets(struct xt_target *target, unsigned int n); int xt_register_match(struct xt_match *target); void xt_unregister_match(struct xt_match *target); int xt_register_matches(struct xt_match *match, unsigned int n); void xt_unregister_matches(struct xt_match *match, unsigned int n); int xt_check_entry_offsets(const void *base, const char *elems, unsigned int target_offset, unsigned int next_offset); int xt_check_table_hooks(const struct xt_table_info *info, unsigned int valid_hooks); unsigned int *xt_alloc_entry_offsets(unsigned int size); bool xt_find_jump_offset(const unsigned int *offsets, unsigned int target, unsigned int size); int xt_check_proc_name(const char *name, unsigned int size); int xt_check_match(struct xt_mtchk_param *, unsigned int size, u16 proto, bool inv_proto); int xt_check_target(struct xt_tgchk_param *, unsigned int size, u16 proto, bool inv_proto); int xt_match_to_user(const struct xt_entry_match *m, struct xt_entry_match __user *u); int xt_target_to_user(const struct xt_entry_target *t, struct xt_entry_target __user *u); int xt_data_to_user(void __user *dst, const void *src, int usersize, int size, int aligned_size); void *xt_copy_counters(sockptr_t arg, unsigned int len, struct xt_counters_info *info); struct xt_counters *xt_counters_alloc(unsigned int counters); struct xt_table *xt_register_table(struct net *net, const struct xt_table *table, struct xt_table_info *bootstrap, struct xt_table_info *newinfo); void *xt_unregister_table(struct xt_table *table); struct xt_table_info *xt_replace_table(struct xt_table *table, unsigned int num_counters, struct xt_table_info *newinfo, int *error); struct xt_match *xt_find_match(u8 af, const 
char *name, u8 revision); struct xt_match *xt_request_find_match(u8 af, const char *name, u8 revision); struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision); int xt_find_revision(u8 af, const char *name, u8 revision, int target, int *err); struct xt_table *xt_find_table(struct net *net, u8 af, const char *name); struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name); struct xt_table *xt_request_find_table_lock(struct net *net, u_int8_t af, const char *name); void xt_table_unlock(struct xt_table *t); int xt_proto_init(struct net *net, u_int8_t af); void xt_proto_fini(struct net *net, u_int8_t af); struct xt_table_info *xt_alloc_table_info(unsigned int size); void xt_free_table_info(struct xt_table_info *info); /** * xt_recseq - recursive seqcount for netfilter use * * Packet processing changes the seqcount only if no recursion happened * get_counters() can use read_seqcount_begin()/read_seqcount_retry(), * because we use the normal seqcount convention : * Low order bit set to 1 if a writer is active. */ DECLARE_PER_CPU(seqcount_t, xt_recseq); /* xt_tee_enabled - true if x_tables needs to handle reentrancy * * Enabled if current ip(6)tables ruleset has at least one -j TEE rule. */ extern struct static_key xt_tee_enabled; /** * xt_write_recseq_begin - start of a write section * * Begin packet processing : all readers must wait the end * 1) Must be called with preemption disabled * 2) softirqs must be disabled too (or we should use this_cpu_add()) * Returns: * 1 if no recursion on this cpu * 0 if recursion detected */ static inline unsigned int xt_write_recseq_begin(void) { unsigned int addend; /* * Low order bit of sequence is set if we already * called xt_write_recseq_begin(). */ addend = (__this_cpu_read(xt_recseq.sequence) + 1) & 1; /* * This is kind of a write_seqcount_begin(), but addend is 0 or 1 * We dont check addend value to avoid a test and conditional jump, * since addend is most likely 1 */ __this_cpu_add(xt_recseq.sequence, addend); smp_mb(); return addend; } /** * xt_write_recseq_end - end of a write section * @addend: return value from previous xt_write_recseq_begin() * * End packet processing : all readers can proceed * 1) Must be called with preemption disabled * 2) softirqs must be disabled too (or we should use this_cpu_add()) */ static inline void xt_write_recseq_end(unsigned int addend) { /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */ smp_wmb(); __this_cpu_add(xt_recseq.sequence, addend); } /* * This helper is performance critical and must be inlined */ static inline unsigned long ifname_compare_aligned(const char *_a, const char *_b, const char *_mask) { const unsigned long *a = (const unsigned long *)_a; const unsigned long *b = (const unsigned long *)_b; const unsigned long *mask = (const unsigned long *)_mask; unsigned long ret; ret = (a[0] ^ b[0]) & mask[0]; if (IFNAMSIZ > sizeof(unsigned long)) ret |= (a[1] ^ b[1]) & mask[1]; if (IFNAMSIZ > 2 * sizeof(unsigned long)) ret |= (a[2] ^ b[2]) & mask[2]; if (IFNAMSIZ > 3 * sizeof(unsigned long)) ret |= (a[3] ^ b[3]) & mask[3]; BUILD_BUG_ON(IFNAMSIZ > 4 * sizeof(unsigned long)); return ret; } struct xt_percpu_counter_alloc_state { unsigned int off; const char __percpu *mem; }; bool xt_percpu_counter_alloc(struct xt_percpu_counter_alloc_state *state, struct xt_counters *counter); void xt_percpu_counter_free(struct xt_counters *cnt); static inline struct xt_counters * xt_get_this_cpu_counter(struct xt_counters *cnt) { if (nr_cpu_ids > 1) return 
this_cpu_ptr((void __percpu *) (unsigned long) cnt->pcnt); return cnt; } static inline struct xt_counters * xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu) { if (nr_cpu_ids > 1) return per_cpu_ptr((void __percpu *) (unsigned long) cnt->pcnt, cpu); return cnt; } struct nf_hook_ops *xt_hook_ops_alloc(const struct xt_table *, nf_hookfn *); int xt_register_template(const struct xt_table *t, int(*table_init)(struct net *net)); void xt_unregister_template(const struct xt_table *t); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT #include <net/compat.h> struct compat_xt_entry_match { union { struct { u_int16_t match_size; char name[XT_FUNCTION_MAXNAMELEN - 1]; u_int8_t revision; } user; struct { u_int16_t match_size; compat_uptr_t match; } kernel; u_int16_t match_size; } u; unsigned char data[]; }; struct compat_xt_entry_target { union { struct { u_int16_t target_size; char name[XT_FUNCTION_MAXNAMELEN - 1]; u_int8_t revision; } user; struct { u_int16_t target_size; compat_uptr_t target; } kernel; u_int16_t target_size; } u; unsigned char data[]; }; /* FIXME: this works only on 32 bit tasks * need to change whole approach in order to calculate align as function of * current task alignment */ struct compat_xt_counters { compat_u64 pcnt, bcnt; /* Packet and byte counters */ }; struct compat_xt_counters_info { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t num_counters; struct compat_xt_counters counters[]; }; struct _compat_xt_align { __u8 u8; __u16 u16; __u32 u32; compat_u64 u64; }; #define COMPAT_XT_ALIGN(s) __ALIGN_KERNEL((s), __alignof__(struct _compat_xt_align)) void xt_compat_lock(u_int8_t af); void xt_compat_unlock(u_int8_t af); int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta); void xt_compat_flush_offsets(u_int8_t af); int xt_compat_init_offsets(u8 af, unsigned int number); int xt_compat_calc_jump(u_int8_t af, unsigned int offset); int xt_compat_match_offset(const struct xt_match *match); void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, unsigned int *size); int xt_compat_match_to_user(const struct xt_entry_match *m, void __user **dstptr, unsigned int *size); int xt_compat_target_offset(const struct xt_target *target); void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr, unsigned int *size); int xt_compat_target_to_user(const struct xt_entry_target *t, void __user **dstptr, unsigned int *size); int xt_compat_check_entry_offsets(const void *base, const char *elems, unsigned int target_offset, unsigned int next_offset); #endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ #endif /* _X_TABLES_H */ |
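/*
 * Illustrative sketch, not part of <linux/netfilter/x_tables.h> above: the
 * shape of a minimal match extension built on the xt_match hooks declared
 * here. All "example_*" names are hypothetical; a real module would also
 * include <linux/module.h>, ship a matching iptables userspace library, and
 * call xt_register_match()/xt_unregister_match() from its init/exit paths.
 */
static bool example_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
	/* par->matchinfo points at this rule's private data (matchsize bytes). */
	return true;		/* match every packet */
}

static int example_mt_check(const struct xt_mtchk_param *par)
{
	/* Validate par->hook_mask/par->matchinfo here; returning 0 accepts the rule. */
	return 0;
}

static struct xt_match example_mt_reg = {
	.name		= "example",
	.revision	= 0,
	.family		= NFPROTO_IPV4,
	.match		= example_mt,
	.checkentry	= example_mt_check,
	.matchsize	= 0,
	.me		= THIS_MODULE,
};

/* Registration, typically from module_init(): xt_register_match(&example_mt_reg); */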
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the Forwarding Information Base.
* * Authors: A.N.Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #ifndef _NET_IP_FIB_H #define _NET_IP_FIB_H #include <net/flow.h> #include <linux/seq_file.h> #include <linux/rcupdate.h> #include <net/fib_notifier.h> #include <net/fib_rules.h> #include <net/inet_dscp.h> #include <net/inetpeer.h> #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/refcount.h> #include <linux/ip.h> #include <linux/in_route.h> struct fib_config { u8 fc_dst_len; dscp_t fc_dscp; u8 fc_protocol; u8 fc_scope; u8 fc_type; u8 fc_gw_family; /* 2 bytes unused */ u32 fc_table; __be32 fc_dst; union { __be32 fc_gw4; struct in6_addr fc_gw6; }; int fc_oif; u32 fc_flags; u32 fc_priority; __be32 fc_prefsrc; u32 fc_nh_id; struct nlattr *fc_mx; struct rtnexthop *fc_mp; int fc_mx_len; int fc_mp_len; u32 fc_flow; u32 fc_nlflags; struct nl_info fc_nlinfo; struct nlattr *fc_encap; u16 fc_encap_type; }; struct fib_info; struct rtable; struct fib_nh_exception { struct fib_nh_exception __rcu *fnhe_next; int fnhe_genid; __be32 fnhe_daddr; u32 fnhe_pmtu; bool fnhe_mtu_locked; __be32 fnhe_gw; unsigned long fnhe_expires; struct rtable __rcu *fnhe_rth_input; struct rtable __rcu *fnhe_rth_output; unsigned long fnhe_stamp; struct rcu_head rcu; }; struct fnhe_hash_bucket { struct fib_nh_exception __rcu *chain; }; #define FNHE_HASH_SHIFT 11 #define FNHE_HASH_SIZE (1 << FNHE_HASH_SHIFT) #define FNHE_RECLAIM_DEPTH 5 struct fib_nh_common { struct net_device *nhc_dev; netdevice_tracker nhc_dev_tracker; int nhc_oif; unsigned char nhc_scope; u8 nhc_family; u8 nhc_gw_family; unsigned char nhc_flags; struct lwtunnel_state *nhc_lwtstate; union { __be32 ipv4; struct in6_addr ipv6; } nhc_gw; int nhc_weight; atomic_t nhc_upper_bound; /* v4 specific, but allows fib6_nh with v4 routes */ struct rtable __rcu * __percpu *nhc_pcpu_rth_output; struct rtable __rcu *nhc_rth_input; struct fnhe_hash_bucket __rcu *nhc_exceptions; }; struct fib_nh { struct fib_nh_common nh_common; struct hlist_node nh_hash; struct fib_info *nh_parent; #ifdef CONFIG_IP_ROUTE_CLASSID __u32 nh_tclassid; #endif __be32 nh_saddr; int nh_saddr_genid; #define fib_nh_family nh_common.nhc_family #define fib_nh_dev nh_common.nhc_dev #define fib_nh_dev_tracker nh_common.nhc_dev_tracker #define fib_nh_oif nh_common.nhc_oif #define fib_nh_flags nh_common.nhc_flags #define fib_nh_lws nh_common.nhc_lwtstate #define fib_nh_scope nh_common.nhc_scope #define fib_nh_gw_family nh_common.nhc_gw_family #define fib_nh_gw4 nh_common.nhc_gw.ipv4 #define fib_nh_gw6 nh_common.nhc_gw.ipv6 #define fib_nh_weight nh_common.nhc_weight #define fib_nh_upper_bound nh_common.nhc_upper_bound }; /* * This structure contains data shared by many of routes. 
*/ struct nexthop; struct fib_info { struct hlist_node fib_hash; struct hlist_node fib_lhash; struct list_head nh_list; struct net *fib_net; refcount_t fib_treeref; refcount_t fib_clntref; unsigned int fib_flags; unsigned char fib_dead; unsigned char fib_protocol; unsigned char fib_scope; unsigned char fib_type; __be32 fib_prefsrc; u32 fib_tb_id; u32 fib_priority; struct dst_metrics *fib_metrics; #define fib_mtu fib_metrics->metrics[RTAX_MTU-1] #define fib_window fib_metrics->metrics[RTAX_WINDOW-1] #define fib_rtt fib_metrics->metrics[RTAX_RTT-1] #define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1] int fib_nhs; bool fib_nh_is_v6; bool nh_updated; bool pfsrc_removed; struct nexthop *nh; struct rcu_head rcu; struct fib_nh fib_nh[] __counted_by(fib_nhs); }; int __net_init fib4_semantics_init(struct net *net); void __net_exit fib4_semantics_exit(struct net *net); #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rule; #endif struct fib_table; struct fib_result { __be32 prefix; unsigned char prefixlen; unsigned char nh_sel; unsigned char type; unsigned char scope; u32 tclassid; dscp_t dscp; struct fib_nh_common *nhc; struct fib_info *fi; struct fib_table *table; struct hlist_head *fa_head; }; struct fib_result_nl { __be32 fl_addr; /* To be looked up*/ u32 fl_mark; unsigned char fl_tos; unsigned char fl_scope; unsigned char tb_id_in; unsigned char tb_id; /* Results */ unsigned char prefixlen; unsigned char nh_sel; unsigned char type; unsigned char scope; int err; }; #ifdef CONFIG_IP_MULTIPLE_TABLES #define FIB_TABLE_HASHSZ 256 #else #define FIB_TABLE_HASHSZ 2 #endif __be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc, unsigned char scope); __be32 fib_result_prefsrc(struct net *net, struct fib_result *res); #define FIB_RES_NHC(res) ((res).nhc) #define FIB_RES_DEV(res) (FIB_RES_NHC(res)->nhc_dev) #define FIB_RES_OIF(res) (FIB_RES_NHC(res)->nhc_oif) struct fib_rt_info { struct fib_info *fi; u32 tb_id; __be32 dst; int dst_len; dscp_t dscp; u8 type; u8 offload:1, trap:1, offload_failed:1, unused:5; }; struct fib_entry_notifier_info { struct fib_notifier_info info; /* must be first */ u32 dst; int dst_len; struct fib_info *fi; dscp_t dscp; u8 type; u32 tb_id; }; struct fib_nh_notifier_info { struct fib_notifier_info info; /* must be first */ struct fib_nh *fib_nh; }; int call_fib4_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info); int call_fib4_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info); int __net_init fib4_notifier_init(struct net *net); void __net_exit fib4_notifier_exit(struct net *net); void fib_info_notify_update(struct net *net, struct nl_info *info); int fib_notify(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); struct fib_table { struct hlist_node tb_hlist; u32 tb_id; int tb_num_default; struct rcu_head rcu; unsigned long *tb_data; unsigned long __data[]; }; struct fib_dump_filter { u32 table_id; /* filter_set is an optimization that an entry is set */ bool filter_set; bool dump_routes; bool dump_exceptions; bool rtnl_held; unsigned char protocol; unsigned char rt_type; unsigned int flags; struct net_device *dev; }; int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, struct fib_result *res, int fib_flags); int fib_table_insert(struct net *, struct fib_table *, struct fib_config *, struct netlink_ext_ack *extack); int fib_table_delete(struct net *, struct fib_table *, struct fib_config *, struct netlink_ext_ack *extack); int 
fib_table_dump(struct fib_table *table, struct sk_buff *skb, struct netlink_callback *cb, struct fib_dump_filter *filter); int fib_table_flush(struct net *net, struct fib_table *table, bool flush_all); struct fib_table *fib_trie_unmerge(struct fib_table *main_tb); void fib_table_flush_external(struct fib_table *table); void fib_free_table(struct fib_table *tb); #ifndef CONFIG_IP_MULTIPLE_TABLES #define TABLE_LOCAL_INDEX (RT_TABLE_LOCAL & (FIB_TABLE_HASHSZ - 1)) #define TABLE_MAIN_INDEX (RT_TABLE_MAIN & (FIB_TABLE_HASHSZ - 1)) static inline struct fib_table *fib_get_table(struct net *net, u32 id) { struct hlist_node *tb_hlist; struct hlist_head *ptr; ptr = id == RT_TABLE_LOCAL ? &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX] : &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]; tb_hlist = rcu_dereference_rtnl(hlist_first_rcu(ptr)); return hlist_entry(tb_hlist, struct fib_table, tb_hlist); } static inline struct fib_table *fib_new_table(struct net *net, u32 id) { return fib_get_table(net, id); } static inline int fib_lookup(struct net *net, const struct flowi4 *flp, struct fib_result *res, unsigned int flags) { struct fib_table *tb; int err = -ENETUNREACH; rcu_read_lock(); tb = fib_get_table(net, RT_TABLE_MAIN); if (tb) err = fib_table_lookup(tb, flp, res, flags | FIB_LOOKUP_NOREF); if (err == -EAGAIN) err = -ENETUNREACH; rcu_read_unlock(); return err; } static inline bool fib4_has_custom_rules(const struct net *net) { return false; } static inline bool fib4_rule_default(const struct fib_rule *rule) { return true; } static inline int fib4_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return 0; } static inline unsigned int fib4_rules_seq_read(const struct net *net) { return 0; } static inline bool fib4_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, struct flowi4 *fl4, struct flow_keys *flkeys) { return false; } #else /* CONFIG_IP_MULTIPLE_TABLES */ int __net_init fib4_rules_init(struct net *net); void __net_exit fib4_rules_exit(struct net *net); struct fib_table *fib_new_table(struct net *net, u32 id); struct fib_table *fib_get_table(struct net *net, u32 id); int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res, unsigned int flags); static inline int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res, unsigned int flags) { struct fib_table *tb; int err = -ENETUNREACH; flags |= FIB_LOOKUP_NOREF; if (net->ipv4.fib_has_custom_rules) return __fib_lookup(net, flp, res, flags); rcu_read_lock(); res->tclassid = 0; tb = rcu_dereference_rtnl(net->ipv4.fib_main); if (tb) err = fib_table_lookup(tb, flp, res, flags); if (!err) goto out; tb = rcu_dereference_rtnl(net->ipv4.fib_default); if (tb) err = fib_table_lookup(tb, flp, res, flags); out: if (err == -EAGAIN) err = -ENETUNREACH; rcu_read_unlock(); return err; } static inline bool fib4_has_custom_rules(const struct net *net) { return net->ipv4.fib_has_custom_rules; } bool fib4_rule_default(const struct fib_rule *rule); int fib4_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); unsigned int fib4_rules_seq_read(const struct net *net); static inline bool fib4_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, struct flowi4 *fl4, struct flow_keys *flkeys) { unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; if (!net->ipv4.fib_rules_require_fldissect) return false; memset(flkeys, 0, sizeof(*flkeys)); __skb_flow_dissect(net, skb, &flow_keys_dissector, flkeys, NULL, 0, 0, 0, flag); fl4->fl4_sport = flkeys->ports.src; 
fl4->fl4_dport = flkeys->ports.dst; fl4->flowi4_proto = flkeys->basic.ip_proto; return true; } #endif /* CONFIG_IP_MULTIPLE_TABLES */ static inline bool fib_dscp_masked_match(dscp_t dscp, const struct flowi4 *fl4) { return dscp == (fl4->flowi4_dscp & INET_DSCP_LEGACY_TOS_MASK); } /* Exported by fib_frontend.c */ extern const struct nla_policy rtm_ipv4_policy[]; void ip_fib_init(void); int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, struct netlink_ext_ack *extack); __be32 fib_compute_spec_dst(struct sk_buff *skb); bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev); int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, dscp_t dscp, int oif, struct net_device *dev, struct in_device *idev, u32 *itag); static inline enum skb_drop_reason fib_validate_source_reason(struct sk_buff *skb, __be32 src, __be32 dst, dscp_t dscp, int oif, struct net_device *dev, struct in_device *idev, u32 *itag) { int err = fib_validate_source(skb, src, dst, dscp, oif, dev, idev, itag); if (err < 0) return -err; return SKB_NOT_DROPPED_YET; } #ifdef CONFIG_IP_ROUTE_CLASSID static inline int fib_num_tclassid_users(struct net *net) { return atomic_read(&net->ipv4.fib_num_tclassid_users); } #else static inline int fib_num_tclassid_users(struct net *net) { return 0; } #endif int fib_unmerge(struct net *net); static inline bool nhc_l3mdev_matches_dev(const struct fib_nh_common *nhc, const struct net_device *dev) { if (nhc->nhc_dev == dev || l3mdev_master_ifindex_rcu(nhc->nhc_dev) == dev->ifindex) return true; return false; } /* Exported by fib_semantics.c */ int ip_fib_check_default(__be32 gw, struct net_device *dev); int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force); int fib_sync_down_addr(struct net_device *dev, __be32 local); int fib_sync_up(struct net_device *dev, unsigned char nh_flags); void fib_sync_mtu(struct net_device *dev, u32 orig_mtu); void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig); /* Fields used for sysctl_fib_multipath_hash_fields. * Common to IPv4 and IPv6. * * Add new fields at the end. This is user API. 
*/ #define FIB_MULTIPATH_HASH_FIELD_SRC_IP BIT(0) #define FIB_MULTIPATH_HASH_FIELD_DST_IP BIT(1) #define FIB_MULTIPATH_HASH_FIELD_IP_PROTO BIT(2) #define FIB_MULTIPATH_HASH_FIELD_FLOWLABEL BIT(3) #define FIB_MULTIPATH_HASH_FIELD_SRC_PORT BIT(4) #define FIB_MULTIPATH_HASH_FIELD_DST_PORT BIT(5) #define FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP BIT(6) #define FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP BIT(7) #define FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO BIT(8) #define FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL BIT(9) #define FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT BIT(10) #define FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT BIT(11) #define FIB_MULTIPATH_HASH_FIELD_OUTER_MASK \ (FIB_MULTIPATH_HASH_FIELD_SRC_IP | \ FIB_MULTIPATH_HASH_FIELD_DST_IP | \ FIB_MULTIPATH_HASH_FIELD_IP_PROTO | \ FIB_MULTIPATH_HASH_FIELD_FLOWLABEL | \ FIB_MULTIPATH_HASH_FIELD_SRC_PORT | \ FIB_MULTIPATH_HASH_FIELD_DST_PORT) #define FIB_MULTIPATH_HASH_FIELD_INNER_MASK \ (FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP | \ FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP | \ FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO | \ FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL | \ FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT | \ FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) #define FIB_MULTIPATH_HASH_FIELD_ALL_MASK \ (FIB_MULTIPATH_HASH_FIELD_OUTER_MASK | \ FIB_MULTIPATH_HASH_FIELD_INNER_MASK) #define FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK \ (FIB_MULTIPATH_HASH_FIELD_SRC_IP | \ FIB_MULTIPATH_HASH_FIELD_DST_IP | \ FIB_MULTIPATH_HASH_FIELD_IP_PROTO) #ifdef CONFIG_IP_ROUTE_MULTIPATH int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, const struct sk_buff *skb, struct flow_keys *flkeys); static void fib_multipath_hash_construct_key(siphash_key_t *key, u32 mp_seed) { u64 mp_seed_64 = mp_seed; key->key[0] = (mp_seed_64 << 32) | mp_seed_64; key->key[1] = key->key[0]; } static inline u32 fib_multipath_hash_from_keys(const struct net *net, struct flow_keys *keys) { siphash_aligned_key_t hash_key; u32 mp_seed; mp_seed = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed.mp_seed); fib_multipath_hash_construct_key(&hash_key, mp_seed); return flow_hash_from_keys_seed(keys, &hash_key); } #else static inline u32 fib_multipath_hash_from_keys(const struct net *net, struct flow_keys *keys) { return flow_hash_from_keys(keys); } #endif int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, struct netlink_ext_ack *extack); void fib_select_multipath(struct fib_result *res, int hash, const struct flowi4 *fl4); void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, const struct sk_buff *skb); int fib_nh_init(struct net *net, struct fib_nh *fib_nh, struct fib_config *cfg, int nh_weight, struct netlink_ext_ack *extack); void fib_nh_release(struct net *net, struct fib_nh *fib_nh); int fib_nh_common_init(struct net *net, struct fib_nh_common *nhc, struct nlattr *fc_encap, u16 fc_encap_type, void *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); void fib_nh_common_release(struct fib_nh_common *nhc); /* Exported by fib_trie.c */ void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri); void fib_trie_init(void); struct fib_table *fib_trie_table(u32 id, struct fib_table *alias); bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags, const struct flowi4 *flp); static inline void fib_combine_itag(u32 *itag, const struct fib_result *res) { #ifdef CONFIG_IP_ROUTE_CLASSID struct fib_nh_common *nhc = res->nhc; #ifdef CONFIG_IP_MULTIPLE_TABLES u32 rtag; #endif if (nhc->nhc_family == AF_INET) { struct fib_nh *nh; nh = 
container_of(nhc, struct fib_nh, nh_common); *itag = nh->nh_tclassid << 16; } else { *itag = 0; } #ifdef CONFIG_IP_MULTIPLE_TABLES rtag = res->tclassid; if (*itag == 0) *itag = (rtag<<16); *itag |= (rtag>>16); #endif #endif } void fib_flush(struct net *net); void free_fib_info(struct fib_info *fi); static inline void fib_info_hold(struct fib_info *fi) { refcount_inc(&fi->fib_clntref); } static inline void fib_info_put(struct fib_info *fi) { if (refcount_dec_and_test(&fi->fib_clntref)) free_fib_info(fi); } #ifdef CONFIG_PROC_FS int __net_init fib_proc_init(struct net *net); void __net_exit fib_proc_exit(struct net *net); #else static inline int fib_proc_init(struct net *net) { return 0; } static inline void fib_proc_exit(struct net *net) { } #endif u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr); int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct fib_dump_filter *filter, struct netlink_callback *cb); int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nh, u8 rt_family, unsigned char *flags, bool skip_oif); int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nh, int nh_weight, u8 rt_family, u32 nh_tclassid); #endif /* _NET_IP_FIB_H */
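/*
 * Illustrative sketch, not part of <net/ip_fib.h> above: a route lookup
 * through fib_lookup() as declared here. The function name is hypothetical.
 * fib_lookup() adds FIB_LOOKUP_NOREF internally, so the result takes no
 * references and must only be used inside an RCU read-side section.
 */
static inline int example_lookup_scope(struct net *net, __be32 daddr,
				       unsigned char *scope)
{
	struct fib_result res = {};
	struct flowi4 fl4 = {
		.daddr = daddr,
	};
	int err;

	rcu_read_lock();
	err = fib_lookup(net, &fl4, &res, 0);
	if (!err)
		*scope = res.scope;	/* valid only while under rcu_read_lock() */
	rcu_read_unlock();

	return err;	/* -ENETUNREACH when no table matched */
}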
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Device core Trace Support
 * Copyright (C) 2021, Intel Corporation
 *
 * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
 */

#undef TRACE_SYSTEM
#define TRACE_SYSTEM dev

#if !defined(__DEV_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
#define __DEV_TRACE_H

#include <linux/device.h>
#include <linux/tracepoint.h>
#include <linux/types.h>

DECLARE_EVENT_CLASS(devres,
	TP_PROTO(struct device *dev, const char *op, void *node, const char *name, size_t size),
	TP_ARGS(dev, op, node, name, size),
	TP_STRUCT__entry(
		__string(devname, dev_name(dev))
		__field(struct device *, dev)
		__field(const char *, op)
		__field(void *, node)
		__string(name, name)
		__field(size_t, size)
	),
	TP_fast_assign(
		__assign_str(devname);
		__entry->op = op;
		__entry->node = node;
		__assign_str(name);
		__entry->size = size;
	),
	TP_printk("%s %3s %p %s (%zu bytes)", __get_str(devname), __entry->op,
		  __entry->node, __get_str(name), __entry->size)
);

DEFINE_EVENT(devres, devres_log,
	TP_PROTO(struct device *dev, const char *op, void *node, const char *name, size_t size),
	TP_ARGS(dev, op, node, name, size)
);

#endif /* __DEV_TRACE_H */

/* this part has to be here */

#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE trace
#include <trace/define_trace.h>
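/*
 * Illustrative sketch, not part of the trace header above: DEFINE_EVENT()
 * generates a trace_devres_log() call site for the "devres" event class.
 * A (hypothetical) caller in the devres core would emit it roughly like
 * this when a resource node of @size bytes is added to @dev:
 */
static inline void example_trace_devres_add(struct device *dev, void *node,
					    size_t size)
{
	trace_devres_log(dev, "ADD", node, "example_resource", size);
}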
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * linux/drivers/char/serial_core.h * * Copyright (C) 2000 Deep Blue Solutions Ltd. */ #ifndef LINUX_SERIAL_CORE_H #define LINUX_SERIAL_CORE_H #include <linux/bitops.h> #include <linux/compiler.h> #include <linux/console.h> #include <linux/interrupt.h> #include <linux/lockdep.h> #include <linux/printk.h> #include <linux/spinlock.h> #include <linux/sched.h> #include <linux/tty.h> #include <linux/mutex.h> #include <linux/sysrq.h> #include <uapi/linux/serial_core.h> #ifdef CONFIG_SERIAL_CORE_CONSOLE #define uart_console(port) \ ((port)->cons && (port)->cons->index == (port)->line) #else #define uart_console(port) ({ (void)port; 0; }) #endif struct uart_port; struct serial_struct; struct serial_port_device; struct device; struct gpio_desc; /** * struct uart_ops -- interface between serial_core and the driver * * This structure describes all the operations that can be done on the * physical hardware. * * @tx_empty: ``unsigned int ()(struct uart_port *port)`` * * This function tests whether the transmitter fifo and shifter for the * @port is empty. If it is empty, this function should return * %TIOCSER_TEMT, otherwise return 0. If the port does not support this * operation, then it should return %TIOCSER_TEMT. * * Locking: none. * Interrupts: caller dependent. * This call must not sleep * * @set_mctrl: ``void ()(struct uart_port *port, unsigned int mctrl)`` * * This function sets the modem control lines for @port to the state * described by @mctrl. The relevant bits of @mctrl are: * * - %TIOCM_RTS RTS signal. * - %TIOCM_DTR DTR signal. * - %TIOCM_OUT1 OUT1 signal.
* - %TIOCM_OUT2 OUT2 signal. * - %TIOCM_LOOP Set the port into loopback mode. * * If the appropriate bit is set, the signal should be driven * active. If the bit is clear, the signal should be driven * inactive. * * Locking: @port->lock taken. * Interrupts: locally disabled. * This call must not sleep * * @get_mctrl: ``unsigned int ()(struct uart_port *port)`` * * Returns the current state of modem control inputs of @port. The state * of the outputs should not be returned, since the core keeps track of * their state. The state information should include: * * - %TIOCM_CAR state of DCD signal * - %TIOCM_CTS state of CTS signal * - %TIOCM_DSR state of DSR signal * - %TIOCM_RI state of RI signal * * The bit is set if the signal is currently driven active. If * the port does not support CTS, DCD or DSR, the driver should * indicate that the signal is permanently active. If RI is * not available, the signal should not be indicated as active. * * Locking: @port->lock taken. * Interrupts: locally disabled. * This call must not sleep * * @stop_tx: ``void ()(struct uart_port *port)`` * * Stop transmitting characters. This might be due to the CTS line * becoming inactive or the tty layer indicating we want to stop * transmission due to an %XOFF character. * * The driver should stop transmitting characters as soon as possible. * * Locking: @port->lock taken. * Interrupts: locally disabled. * This call must not sleep * * @start_tx: ``void ()(struct uart_port *port)`` * * Start transmitting characters. * * Locking: @port->lock taken. * Interrupts: locally disabled. * This call must not sleep * * @throttle: ``void ()(struct uart_port *port)`` * * Notify the serial driver that input buffers for the line discipline are * close to full, and it should somehow signal that no more characters * should be sent to the serial port. * This will be called only if hardware assisted flow control is enabled. * * Locking: serialized with @unthrottle() and termios modification by the * tty layer. * * @unthrottle: ``void ()(struct uart_port *port)`` * * Notify the serial driver that characters can now be sent to the serial * port without fear of overrunning the input buffers of the line * disciplines. * * This will be called only if hardware assisted flow control is enabled. * * Locking: serialized with @throttle() and termios modification by the * tty layer. * * @send_xchar: ``void ()(struct uart_port *port, char ch)`` * * Transmit a high priority character, even if the port is stopped. This * is used to implement XON/XOFF flow control and tcflow(). If the serial * driver does not implement this function, the tty core will append the * character to the circular buffer and then call start_tx() / stop_tx() * to flush the data out. * * Do not transmit if @ch == '\0' (%__DISABLED_CHAR). * * Locking: none. * Interrupts: caller dependent. * * @start_rx: ``void ()(struct uart_port *port)`` * * Start receiving characters. * * Locking: @port->lock taken. * Interrupts: locally disabled. * This call must not sleep * * @stop_rx: ``void ()(struct uart_port *port)`` * * Stop receiving characters; the @port is in the process of being closed. * * Locking: @port->lock taken. * Interrupts: locally disabled. * This call must not sleep * * @enable_ms: ``void ()(struct uart_port *port)`` * * Enable the modem status interrupts. * * This method may be called multiple times. Modem status interrupts * should be disabled when the @shutdown() method is called. * * Locking: @port->lock taken. * Interrupts: locally disabled. 
* This call must not sleep * * @break_ctl: ``void ()(struct uart_port *port, int ctl)`` * * Control the transmission of a break signal. If @ctl is nonzero, the * break signal should be transmitted. The signal should be terminated * when another call is made with a zero @ctl. * * Locking: caller holds tty_port->mutex * * @startup: ``int ()(struct uart_port *port)`` * * Grab any interrupt resources and initialise any low level driver state. * Enable the port for reception. It should not activate RTS nor DTR; * this will be done via a separate call to @set_mctrl(). * * This method will only be called when the port is initially opened. * * Locking: port_sem taken. * Interrupts: globally disabled. * * @shutdown: ``void ()(struct uart_port *port)`` * * Disable the @port, disable any break condition that may be in effect, * and free any interrupt resources. It should not disable RTS nor DTR; * this will have already been done via a separate call to @set_mctrl(). * * Drivers must not access @port->state once this call has completed. * * This method will only be called when there are no more users of this * @port. * * Locking: port_sem taken. * Interrupts: caller dependent. * * @flush_buffer: ``void ()(struct uart_port *port)`` * * Flush any write buffers, reset any DMA state and stop any ongoing DMA * transfers. * * This will be called whenever the @port->state->xmit circular buffer is * cleared. * * Locking: @port->lock taken. * Interrupts: locally disabled. * This call must not sleep * * @set_termios: ``void ()(struct uart_port *port, struct ktermios *new, * struct ktermios *old)`` * * Change the @port parameters, including word length, parity, stop bits. * Update @port->read_status_mask and @port->ignore_status_mask to * indicate the types of events we are interested in receiving. Relevant * ktermios::c_cflag bits are: * * - %CSIZE - word size * - %CSTOPB - 2 stop bits * - %PARENB - parity enable * - %PARODD - odd parity (when %PARENB is in force) * - %ADDRB - address bit (changed through uart_port::rs485_config()). * - %CREAD - enable reception of characters (if not set, still receive * characters from the port, but throw them away). * - %CRTSCTS - if set, enable CTS status change reporting. * - %CLOCAL - if not set, enable modem status change reporting. * * Relevant ktermios::c_iflag bits are: * * - %INPCK - enable frame and parity error events to be passed to the TTY * layer. * - %BRKINT / %PARMRK - both of these enable break events to be passed to * the TTY layer. * - %IGNPAR - ignore parity and framing errors. * - %IGNBRK - ignore break errors. If %IGNPAR is also set, ignore overrun * errors as well. * * The interaction of the ktermios::c_iflag bits is as follows (parity * error given as an example): * * ============ ======= ======= ========================================= * Parity error INPCK IGNPAR * ============ ======= ======= ========================================= * n/a 0 n/a character received, marked as %TTY_NORMAL * None 1 n/a character received, marked as %TTY_NORMAL * Yes 1 0 character received, marked as %TTY_PARITY * Yes 1 1 character discarded * ============ ======= ======= ========================================= * * Other flags may be used (eg, xon/xoff characters) if your hardware * supports hardware "soft" flow control. * * Locking: caller holds tty_port->mutex * Interrupts: caller dependent. * This call must not sleep * * @set_ldisc: ``void ()(struct uart_port *port, struct ktermios *termios)`` * * Notifier for discipline change. 
See * Documentation/driver-api/tty/tty_ldisc.rst. * * Locking: caller holds tty_port->mutex * * @pm: ``void ()(struct uart_port *port, unsigned int state, * unsigned int oldstate)`` * * Perform any power management related activities on the specified @port. * @state indicates the new state (defined by enum uart_pm_state), * @oldstate indicates the previous state. * * This function should not be used to grab any resources. * * This will be called when the @port is initially opened and finally * closed, except when the @port is also the system console. This will * occur even if %CONFIG_PM is not set. * * Locking: none. * Interrupts: caller dependent. * * @type: ``const char *()(struct uart_port *port)`` * * Return a pointer to a string constant describing the specified @port, * or return %NULL, in which case the string 'unknown' is substituted. * * Locking: none. * Interrupts: caller dependent. * * @release_port: ``void ()(struct uart_port *port)`` * * Release any memory and IO region resources currently in use by the * @port. * * Locking: none. * Interrupts: caller dependent. * * @request_port: ``int ()(struct uart_port *port)`` * * Request any memory and IO region resources required by the port. If any * fail, no resources should be registered when this function returns, and * it should return -%EBUSY on failure. * * Locking: none. * Interrupts: caller dependent. * * @config_port: ``void ()(struct uart_port *port, int type)`` * * Perform any autoconfiguration steps required for the @port. @type * contains a bit mask of the required configuration. %UART_CONFIG_TYPE * indicates that the port requires detection and identification. * @port->type should be set to the type found, or %PORT_UNKNOWN if no * port was detected. * * %UART_CONFIG_IRQ indicates autoconfiguration of the interrupt signal, * which should be probed using standard kernel autoprobing techniques. * This is not necessary on platforms where ports have interrupts * internally hard wired (eg, system on a chip implementations). * * Locking: none. * Interrupts: caller dependent. * * @verify_port: ``int ()(struct uart_port *port, * struct serial_struct *serinfo)`` * * Verify the new serial port information contained within @serinfo is * suitable for this port type. * * Locking: none. * Interrupts: caller dependent. * * @ioctl: ``int ()(struct uart_port *port, unsigned int cmd, * unsigned long arg)`` * * Perform any port specific IOCTLs. IOCTL commands must be defined using * the standard numbering system found in <asm/ioctl.h>. * * Locking: none. * Interrupts: caller dependent. * * @poll_init: ``int ()(struct uart_port *port)`` * * Called by kgdb to perform the minimal hardware initialization needed to * support @poll_put_char() and @poll_get_char(). Unlike @startup(), this * should not request interrupts. * * Locking: %tty_mutex and tty_port->mutex taken. * Interrupts: n/a. * * @poll_put_char: ``void ()(struct uart_port *port, unsigned char ch)`` * * Called by kgdb to write a single character @ch directly to the serial * @port. It can and should block until there is space in the TX FIFO. * * Locking: none. * Interrupts: caller dependent. * This call must not sleep * * @poll_get_char: ``int ()(struct uart_port *port)`` * * Called by kgdb to read a single character directly from the serial * port. If data is available, it should be returned; otherwise the * function should return %NO_POLL_CHAR immediately. * * Locking: none. * Interrupts: caller dependent. 
* This call must not sleep */ struct uart_ops { unsigned int (*tx_empty)(struct uart_port *); void (*set_mctrl)(struct uart_port *, unsigned int mctrl); unsigned int (*get_mctrl)(struct uart_port *); void (*stop_tx)(struct uart_port *); void (*start_tx)(struct uart_port *); void (*throttle)(struct uart_port *); void (*unthrottle)(struct uart_port *); void (*send_xchar)(struct uart_port *, char ch); void (*stop_rx)(struct uart_port *); void (*start_rx)(struct uart_port *); void (*enable_ms)(struct uart_port *); void (*break_ctl)(struct uart_port *, int ctl); int (*startup)(struct uart_port *); void (*shutdown)(struct uart_port *); void (*flush_buffer)(struct uart_port *); void (*set_termios)(struct uart_port *, struct ktermios *new, const struct ktermios *old); void (*set_ldisc)(struct uart_port *, struct ktermios *); void (*pm)(struct uart_port *, unsigned int state, unsigned int oldstate); const char *(*type)(struct uart_port *); void (*release_port)(struct uart_port *); int (*request_port)(struct uart_port *); void (*config_port)(struct uart_port *, int); int (*verify_port)(struct uart_port *, struct serial_struct *); int (*ioctl)(struct uart_port *, unsigned int, unsigned long); #ifdef CONFIG_CONSOLE_POLL int (*poll_init)(struct uart_port *); void (*poll_put_char)(struct uart_port *, unsigned char); int (*poll_get_char)(struct uart_port *); #endif }; #define NO_POLL_CHAR 0x00ff0000 #define UART_CONFIG_TYPE (1 << 0) #define UART_CONFIG_IRQ (1 << 1) struct uart_icount { __u32 cts; __u32 dsr; __u32 rng; __u32 dcd; __u32 rx; __u32 tx; __u32 frame; __u32 overrun; __u32 parity; __u32 brk; __u32 buf_overrun; }; typedef u64 __bitwise upf_t; typedef unsigned int __bitwise upstat_t; enum uart_iotype { UPIO_UNKNOWN = -1, UPIO_PORT = SERIAL_IO_PORT, /* 8b I/O port access */ UPIO_HUB6 = SERIAL_IO_HUB6, /* Hub6 ISA card */ UPIO_MEM = SERIAL_IO_MEM, /* driver-specific */ UPIO_MEM32 = SERIAL_IO_MEM32, /* 32b little endian */ UPIO_AU = SERIAL_IO_AU, /* Au1x00 and RT288x type IO */ UPIO_TSI = SERIAL_IO_TSI, /* Tsi108/109 type IO */ UPIO_MEM32BE = SERIAL_IO_MEM32BE, /* 32b big endian */ UPIO_MEM16 = SERIAL_IO_MEM16, /* 16b little endian */ }; struct uart_port { spinlock_t lock; /* port lock */ unsigned long iobase; /* in/out[bwl] */ unsigned char __iomem *membase; /* read/write[bwl] */ u32 (*serial_in)(struct uart_port *, unsigned int offset); void (*serial_out)(struct uart_port *, unsigned int offset, u32 val); void (*set_termios)(struct uart_port *, struct ktermios *new, const struct ktermios *old); void (*set_ldisc)(struct uart_port *, struct ktermios *); unsigned int (*get_mctrl)(struct uart_port *); void (*set_mctrl)(struct uart_port *, unsigned int); unsigned int (*get_divisor)(struct uart_port *, unsigned int baud, unsigned int *frac); void (*set_divisor)(struct uart_port *, unsigned int baud, unsigned int quot, unsigned int quot_frac); int (*startup)(struct uart_port *port); void (*shutdown)(struct uart_port *port); void (*throttle)(struct uart_port *port); void (*unthrottle)(struct uart_port *port); int (*handle_irq)(struct uart_port *); void (*pm)(struct uart_port *, unsigned int state, unsigned int old); void (*handle_break)(struct uart_port *); int (*rs485_config)(struct uart_port *, struct ktermios *termios, struct serial_rs485 *rs485); int (*iso7816_config)(struct uart_port *, struct serial_iso7816 *iso7816); unsigned int ctrl_id; /* optional serial core controller id */ unsigned int port_id; /* optional serial core port id */ unsigned int irq; /* irq number */ unsigned long irqflags; /* irq 
flags */ unsigned int uartclk; /* base uart clock */ unsigned int fifosize; /* tx fifo size */ unsigned char x_char; /* xon/xoff char */ unsigned char regshift; /* reg offset shift */ unsigned char quirks; /* internal quirks */ /* internal quirks must be updated while holding port mutex */ #define UPQ_NO_TXEN_TEST BIT(0) enum uart_iotype iotype; /* io access style */ unsigned int read_status_mask; /* driver specific */ unsigned int ignore_status_mask; /* driver specific */ struct uart_state *state; /* pointer to parent state */ struct uart_icount icount; /* statistics */ struct console *cons; /* struct console, if any */ /* flags must be updated while holding port mutex */ upf_t flags; /* * These flags must be equivalent to the flags defined in * include/uapi/linux/tty_flags.h which are the userspace definitions * assigned from the serial_struct flags in uart_set_info() * [for bit definitions in the UPF_CHANGE_MASK] * * Bits [0..ASYNCB_LAST_USER] are userspace defined/visible/changeable * The remaining bits are serial-core specific and not modifiable by * userspace. */ #ifdef CONFIG_HAS_IOPORT #define UPF_FOURPORT ((__force upf_t) ASYNC_FOURPORT /* 1 */ ) #else #define UPF_FOURPORT 0 #endif #define UPF_SAK ((__force upf_t) ASYNC_SAK /* 2 */ ) #define UPF_SPD_HI ((__force upf_t) ASYNC_SPD_HI /* 4 */ ) #define UPF_SPD_VHI ((__force upf_t) ASYNC_SPD_VHI /* 5 */ ) #define UPF_SPD_CUST ((__force upf_t) ASYNC_SPD_CUST /* 0x0030 */ ) #define UPF_SPD_WARP ((__force upf_t) ASYNC_SPD_WARP /* 0x1010 */ ) #define UPF_SPD_MASK ((__force upf_t) ASYNC_SPD_MASK /* 0x1030 */ ) #define UPF_SKIP_TEST ((__force upf_t) ASYNC_SKIP_TEST /* 6 */ ) #define UPF_AUTO_IRQ ((__force upf_t) ASYNC_AUTO_IRQ /* 7 */ ) #define UPF_HARDPPS_CD ((__force upf_t) ASYNC_HARDPPS_CD /* 11 */ ) #define UPF_SPD_SHI ((__force upf_t) ASYNC_SPD_SHI /* 12 */ ) #define UPF_LOW_LATENCY ((__force upf_t) ASYNC_LOW_LATENCY /* 13 */ ) #define UPF_BUGGY_UART ((__force upf_t) ASYNC_BUGGY_UART /* 14 */ ) #define UPF_MAGIC_MULTIPLIER ((__force upf_t) ASYNC_MAGIC_MULTIPLIER /* 16 */ ) #define UPF_NO_THRE_TEST ((__force upf_t) BIT_ULL(19)) /* Port has hardware-assisted h/w flow control */ #define UPF_AUTO_CTS ((__force upf_t) BIT_ULL(20)) #define UPF_AUTO_RTS ((__force upf_t) BIT_ULL(21)) #define UPF_HARD_FLOW ((__force upf_t) (UPF_AUTO_CTS | UPF_AUTO_RTS)) /* Port has hardware-assisted s/w flow control */ #define UPF_SOFT_FLOW ((__force upf_t) BIT_ULL(22)) #define UPF_CONS_FLOW ((__force upf_t) BIT_ULL(23)) #define UPF_SHARE_IRQ ((__force upf_t) BIT_ULL(24)) #define UPF_EXAR_EFR ((__force upf_t) BIT_ULL(25)) #define UPF_BUG_THRE ((__force upf_t) BIT_ULL(26)) /* The exact UART type is known and should not be probed. */ #define UPF_FIXED_TYPE ((__force upf_t) BIT_ULL(27)) #define UPF_BOOT_AUTOCONF ((__force upf_t) BIT_ULL(28)) #define UPF_FIXED_PORT ((__force upf_t) BIT_ULL(29)) #define UPF_DEAD ((__force upf_t) BIT_ULL(30)) #define UPF_IOREMAP ((__force upf_t) BIT_ULL(31)) #define UPF_FULL_PROBE ((__force upf_t) BIT_ULL(32)) #define __UPF_CHANGE_MASK 0x17fff #define UPF_CHANGE_MASK ((__force upf_t) __UPF_CHANGE_MASK) #define UPF_USR_MASK ((__force upf_t) (UPF_SPD_MASK|UPF_LOW_LATENCY)) #if __UPF_CHANGE_MASK > ASYNC_FLAGS #error Change mask not equivalent to userspace-visible bit defines #endif /* * Must hold termios_rwsem, port mutex and port lock to change; * can hold any one lock to read. 
*/ upstat_t status; #define UPSTAT_CTS_ENABLE ((__force upstat_t) (1 << 0)) #define UPSTAT_DCD_ENABLE ((__force upstat_t) (1 << 1)) #define UPSTAT_AUTORTS ((__force upstat_t) (1 << 2)) #define UPSTAT_AUTOCTS ((__force upstat_t) (1 << 3)) #define UPSTAT_AUTOXOFF ((__force upstat_t) (1 << 4)) #define UPSTAT_SYNC_FIFO ((__force upstat_t) (1 << 5)) bool hw_stopped; /* sw-assisted CTS flow state */ unsigned int mctrl; /* current modem ctrl settings */ unsigned int frame_time; /* frame timing in ns */ unsigned int type; /* port type */ const struct uart_ops *ops; unsigned int custom_divisor; unsigned int line; /* port index */ unsigned int minor; resource_size_t mapbase; /* for ioremap */ resource_size_t mapsize; struct device *dev; /* serial port physical parent device */ struct serial_port_device *port_dev; /* serial core port device */ unsigned long sysrq; /* sysrq timeout */ u8 sysrq_ch; /* char for sysrq */ unsigned char has_sysrq; unsigned char sysrq_seq; /* index in sysrq_toggle_seq */ unsigned char hub6; /* this should be in the 8250 driver */ unsigned char suspended; unsigned char console_reinit; const char *name; /* port name */ struct attribute_group *attr_group; /* port specific attributes */ const struct attribute_group **tty_groups; /* all attributes (serial core use only) */ struct serial_rs485 rs485; struct serial_rs485 rs485_supported; /* Supported mask for serial_rs485 */ struct gpio_desc *rs485_term_gpio; /* enable RS485 bus termination */ struct gpio_desc *rs485_rx_during_tx_gpio; /* Output GPIO that sets the state of RS485 RX during TX */ struct serial_iso7816 iso7816; void *private_data; /* generic platform data pointer */ }; /* * Only for console->device_lock()/_unlock() callbacks and internal * port lock wrapper synchronization. */ static inline void __uart_port_lock_irqsave(struct uart_port *up, unsigned long *flags) { spin_lock_irqsave(&up->lock, *flags); } /* * Only for console->device_lock()/_unlock() callbacks and internal * port lock wrapper synchronization. */ static inline void __uart_port_unlock_irqrestore(struct uart_port *up, unsigned long flags) { spin_unlock_irqrestore(&up->lock, flags); } /** * uart_port_set_cons - Safely set the @cons field for a uart * @up: The uart port to set * @con: The new console to set to * * This function must be used to set @up->cons. It uses the port lock to * synchronize with the port lock wrappers in order to ensure that the console * cannot change or disappear while another context is holding the port lock. */ static inline void uart_port_set_cons(struct uart_port *up, struct console *con) { unsigned long flags; __uart_port_lock_irqsave(up, &flags); up->cons = con; __uart_port_unlock_irqrestore(up, flags); } /* Only for internal port lock wrapper usage. */ static inline bool __uart_port_using_nbcon(struct uart_port *up) { lockdep_assert_held_once(&up->lock); if (likely(!uart_console(up))) return false; /* * @up->cons is only modified under the port lock. Therefore it is * certain that it cannot disappear here. * * @up->cons->node is added/removed from the console list under the * port lock. Therefore it is certain that the registration status * cannot change here, thus @up->cons->flags can be read directly. */ if (hlist_unhashed_lockless(&up->cons->node) || !(up->cons->flags & CON_NBCON) || !up->cons->write_atomic) { return false; } return true; } /* Only for internal port lock wrapper usage. 
*/ static inline bool __uart_port_nbcon_try_acquire(struct uart_port *up) { if (!__uart_port_using_nbcon(up)) return true; return nbcon_device_try_acquire(up->cons); } /* Only for internal port lock wrapper usage. */ static inline void __uart_port_nbcon_acquire(struct uart_port *up) { if (!__uart_port_using_nbcon(up)) return; while (!nbcon_device_try_acquire(up->cons)) cpu_relax(); } /* Only for internal port lock wrapper usage. */ static inline void __uart_port_nbcon_release(struct uart_port *up) { if (!__uart_port_using_nbcon(up)) return; nbcon_device_release(up->cons); } /** * uart_port_lock - Lock the UART port * @up: Pointer to UART port structure */ static inline void uart_port_lock(struct uart_port *up) { spin_lock(&up->lock); __uart_port_nbcon_acquire(up); } /** * uart_port_lock_irq - Lock the UART port and disable interrupts * @up: Pointer to UART port structure */ static inline void uart_port_lock_irq(struct uart_port *up) { spin_lock_irq(&up->lock); __uart_port_nbcon_acquire(up); } /** * uart_port_lock_irqsave - Lock the UART port, save and disable interrupts * @up: Pointer to UART port structure * @flags: Pointer to interrupt flags storage */ static inline void uart_port_lock_irqsave(struct uart_port *up, unsigned long *flags) { spin_lock_irqsave(&up->lock, *flags); __uart_port_nbcon_acquire(up); } /** * uart_port_trylock - Try to lock the UART port * @up: Pointer to UART port structure * * Returns: True if lock was acquired, false otherwise */ static inline bool uart_port_trylock(struct uart_port *up) { if (!spin_trylock(&up->lock)) return false; if (!__uart_port_nbcon_try_acquire(up)) { spin_unlock(&up->lock); return false; } return true; } /** * uart_port_trylock_irqsave - Try to lock the UART port, save and disable interrupts * @up: Pointer to UART port structure * @flags: Pointer to interrupt flags storage * * Returns: True if lock was acquired, false otherwise */ static inline bool uart_port_trylock_irqsave(struct uart_port *up, unsigned long *flags) { if (!spin_trylock_irqsave(&up->lock, *flags)) return false; if (!__uart_port_nbcon_try_acquire(up)) { spin_unlock_irqrestore(&up->lock, *flags); return false; } return true; } /** * uart_port_unlock - Unlock the UART port * @up: Pointer to UART port structure */ static inline void uart_port_unlock(struct uart_port *up) { __uart_port_nbcon_release(up); spin_unlock(&up->lock); } /** * uart_port_unlock_irq - Unlock the UART port and re-enable interrupts * @up: Pointer to UART port structure */ static inline void uart_port_unlock_irq(struct uart_port *up) { __uart_port_nbcon_release(up); spin_unlock_irq(&up->lock); } /** * uart_port_unlock_irqrestore - Unlock the UART port, restore interrupts * @up: Pointer to UART port structure * @flags: The saved interrupt flags for restore */ static inline void uart_port_unlock_irqrestore(struct uart_port *up, unsigned long flags) { __uart_port_nbcon_release(up); spin_unlock_irqrestore(&up->lock, flags); } DEFINE_GUARD(uart_port_lock, struct uart_port *, uart_port_lock(_T), uart_port_unlock(_T)); DEFINE_GUARD_COND(uart_port_lock, _try, uart_port_trylock(_T)); DEFINE_GUARD(uart_port_lock_irq, struct uart_port *, uart_port_lock_irq(_T), uart_port_unlock_irq(_T)); DEFINE_LOCK_GUARD_1(uart_port_lock_irqsave, struct uart_port, uart_port_lock_irqsave(_T->lock, &_T->flags), uart_port_unlock_irqrestore(_T->lock, _T->flags), unsigned long flags); DEFINE_LOCK_GUARD_1_COND(uart_port_lock_irqsave, _try, uart_port_trylock_irqsave(_T->lock, &_T->flags)); static inline int serial_port_in(struct uart_port 
*up, int offset) { return up->serial_in(up, offset); } static inline void serial_port_out(struct uart_port *up, int offset, int value) { up->serial_out(up, offset, value); } /** * enum uart_pm_state - power states for UARTs * @UART_PM_STATE_ON: UART is powered, up and operational * @UART_PM_STATE_OFF: UART is powered off * @UART_PM_STATE_UNDEFINED: sentinel */ enum uart_pm_state { UART_PM_STATE_ON = 0, UART_PM_STATE_OFF = 3, /* number taken from ACPI */ UART_PM_STATE_UNDEFINED, }; /* * This is the state information which is persistent across opens. */ struct uart_state { struct tty_port port; enum uart_pm_state pm_state; atomic_t refcount; wait_queue_head_t remove_wait; struct uart_port *uart_port; }; #define UART_XMIT_SIZE PAGE_SIZE /* number of characters left in xmit buffer before we ask for more */ #define WAKEUP_CHARS 256 /** * uart_xmit_advance - Advance xmit buffer and account Tx'ed chars * @up: uart_port structure describing the port * @chars: number of characters sent * * This function advances the tail of circular xmit buffer by the number of * @chars transmitted and handles accounting of transmitted bytes (into * @up's icount.tx). */ static inline void uart_xmit_advance(struct uart_port *up, unsigned int chars) { struct tty_port *tport = &up->state->port; kfifo_skip_count(&tport->xmit_fifo, chars); up->icount.tx += chars; } static inline unsigned int uart_fifo_out(struct uart_port *up, unsigned char *buf, unsigned int chars) { struct tty_port *tport = &up->state->port; chars = kfifo_out(&tport->xmit_fifo, buf, chars); up->icount.tx += chars; return chars; } static inline unsigned int uart_fifo_get(struct uart_port *up, unsigned char *ch) { struct tty_port *tport = &up->state->port; unsigned int chars; chars = kfifo_get(&tport->xmit_fifo, ch); up->icount.tx += chars; return chars; } struct module; struct tty_driver; struct uart_driver { struct module *owner; const char *driver_name; const char *dev_name; int major; int minor; int nr; struct console *cons; /* * these are private; the low level driver should not * touch these; they should be initialised to NULL */ struct uart_state *state; struct tty_driver *tty_driver; }; void uart_write_wakeup(struct uart_port *port); /** * enum UART_TX_FLAGS -- flags for uart_port_tx_flags() * * @UART_TX_NOSTOP: don't call port->ops->stop_tx() on empty buffer */ enum UART_TX_FLAGS { UART_TX_NOSTOP = BIT(0), }; #define __uart_port_tx(uport, ch, flags, tx_ready, put_char, tx_done, \ for_test, for_post) \ ({ \ struct uart_port *__port = (uport); \ struct tty_port *__tport = &__port->state->port; \ unsigned int pending; \ \ for (; (for_test) && (tx_ready); (for_post), __port->icount.tx++) { \ if (__port->x_char) { \ (ch) = __port->x_char; \ (put_char); \ __port->x_char = 0; \ continue; \ } \ \ if (uart_tx_stopped(__port)) \ break; \ \ if (!kfifo_get(&__tport->xmit_fifo, &(ch))) \ break; \ \ (put_char); \ } \ \ (tx_done); \ \ pending = kfifo_len(&__tport->xmit_fifo); \ if (pending < WAKEUP_CHARS) { \ uart_write_wakeup(__port); \ \ if (!((flags) & UART_TX_NOSTOP) && pending == 0) \ __port->ops->stop_tx(__port); \ } \ \ pending; \ }) /** * uart_port_tx_limited -- transmit helper for uart_port with count limiting * @port: uart port * @ch: variable to store a character to be written to the HW * @count: a limit of characters to send * @tx_ready: can HW accept more data function * @put_char: function to write a character * @tx_done: function to call after the loop is done * * This helper transmits characters from the xmit buffer to the hardware using * 
@put_char(). It does so until @count characters are sent and while @tx_ready * evaluates to true. * * Returns: the number of characters in the xmit buffer when done. * * The expression in macro parameters shall be designed as follows: * * **tx_ready:** should evaluate to true if the HW can accept more data to * be sent. This parameter can be %true, which means the HW is always ready. * * **put_char:** shall write @ch to the device of @port. * * **tx_done:** when the write loop is done, this can perform arbitrary * action before potential invocation of ops->stop_tx() happens. If the * driver does not need to do anything, use e.g. ({}). * * For all of them, @port->lock is held, interrupts are locally disabled and * the expressions must not sleep. */ #define uart_port_tx_limited(port, ch, count, tx_ready, put_char, tx_done) ({ \ unsigned int __count = (count); \ __uart_port_tx(port, ch, 0, tx_ready, put_char, tx_done, __count, \ __count--); \ }) /** * uart_port_tx_limited_flags -- transmit helper for uart_port with count limiting with flags * @port: uart port * @ch: variable to store a character to be written to the HW * @flags: %UART_TX_NOSTOP or similar * @count: a limit of characters to send * @tx_ready: can HW accept more data function * @put_char: function to write a character * @tx_done: function to call after the loop is done * * See uart_port_tx_limited() for more details. */ #define uart_port_tx_limited_flags(port, ch, flags, count, tx_ready, put_char, tx_done) ({ \ unsigned int __count = (count); \ __uart_port_tx(port, ch, flags, tx_ready, put_char, tx_done, __count, \ __count--); \ }) /** * uart_port_tx -- transmit helper for uart_port * @port: uart port * @ch: variable to store a character to be written to the HW * @tx_ready: can HW accept more data function * @put_char: function to write a character * * See uart_port_tx_limited() for more details. */ #define uart_port_tx(port, ch, tx_ready, put_char) \ __uart_port_tx(port, ch, 0, tx_ready, put_char, ({}), true, ({})) /** * uart_port_tx_flags -- transmit helper for uart_port with flags * @port: uart port * @ch: variable to store a character to be written to the HW * @flags: %UART_TX_NOSTOP or similar * @tx_ready: can HW accept more data function * @put_char: function to write a character * * See uart_port_tx_limited() for more details. */ #define uart_port_tx_flags(port, ch, flags, tx_ready, put_char) \ __uart_port_tx(port, ch, flags, tx_ready, put_char, ({}), true, ({})) /* * Baud rate helpers. */ void uart_update_timeout(struct uart_port *port, unsigned int cflag, unsigned int baud); unsigned int uart_get_baud_rate(struct uart_port *port, struct ktermios *termios, const struct ktermios *old, unsigned int min, unsigned int max); unsigned int uart_get_divisor(struct uart_port *port, unsigned int baud); /* * Calculates FIFO drain time. */ static inline unsigned long uart_fifo_timeout(struct uart_port *port) { u64 fifo_timeout = (u64)READ_ONCE(port->frame_time) * port->fifosize; /* Add .02 seconds of slop */ fifo_timeout += 20 * NSEC_PER_MSEC; return max(nsecs_to_jiffies(fifo_timeout), 1UL); } /* Base timer interval for polling */ static inline unsigned long uart_poll_timeout(struct uart_port *port) { unsigned long timeout = uart_fifo_timeout(port); return timeout > 6 ? (timeout / 2 - 2) : 1; } /* * Console helpers. 
*/ struct earlycon_device { struct console *con; struct uart_port port; char options[32]; /* e.g., 115200n8 */ unsigned int baud; }; struct earlycon_id { char name[15]; char name_term; /* In case compiler didn't '\0' term name */ char compatible[128]; int (*setup)(struct earlycon_device *, const char *options); }; extern const struct earlycon_id __earlycon_table[]; extern const struct earlycon_id __earlycon_table_end[]; #if defined(CONFIG_SERIAL_EARLYCON) && !defined(MODULE) #define EARLYCON_USED_OR_UNUSED __used #else #define EARLYCON_USED_OR_UNUSED __maybe_unused #endif #define OF_EARLYCON_DECLARE(_name, compat, fn) \ static const struct earlycon_id __UNIQUE_ID(__earlycon_##_name) \ EARLYCON_USED_OR_UNUSED __section("__earlycon_table") \ __aligned(__alignof__(struct earlycon_id)) \ = { .name = __stringify(_name), \ .compatible = compat, \ .setup = fn } #define EARLYCON_DECLARE(_name, fn) OF_EARLYCON_DECLARE(_name, "", fn) int of_setup_earlycon(const struct earlycon_id *match, unsigned long node, const char *options); #ifdef CONFIG_SERIAL_EARLYCON extern bool earlycon_acpi_spcr_enable __initdata; int setup_earlycon(char *buf); #else static const bool earlycon_acpi_spcr_enable EARLYCON_USED_OR_UNUSED; static inline int setup_earlycon(char *buf) { return 0; } #endif /* Variant of uart_console_registered() when the console_list_lock is held. */ static inline bool uart_console_registered_locked(struct uart_port *port) { return uart_console(port) && console_is_registered_locked(port->cons); } static inline bool uart_console_registered(struct uart_port *port) { return uart_console(port) && console_is_registered(port->cons); } int uart_parse_earlycon(char *p, enum uart_iotype *iotype, resource_size_t *addr, char **options); void uart_parse_options(const char *options, int *baud, int *parity, int *bits, int *flow); int uart_set_options(struct uart_port *port, struct console *co, int baud, int parity, int bits, int flow); struct tty_driver *uart_console_device(struct console *co, int *index); void uart_console_write(struct uart_port *port, const char *s, unsigned int count, void (*putchar)(struct uart_port *, unsigned char)); /* * Port/driver registration/removal */ int uart_register_driver(struct uart_driver *uart); void uart_unregister_driver(struct uart_driver *uart); int uart_add_one_port(struct uart_driver *reg, struct uart_port *port); void uart_remove_one_port(struct uart_driver *reg, struct uart_port *port); int uart_read_port_properties(struct uart_port *port); int uart_read_and_validate_port_properties(struct uart_port *port); bool uart_match_port(const struct uart_port *port1, const struct uart_port *port2); /* * Power Management */ int uart_suspend_port(struct uart_driver *reg, struct uart_port *port); int uart_resume_port(struct uart_driver *reg, struct uart_port *port); static inline int uart_tx_stopped(struct uart_port *port) { struct tty_struct *tty = port->state->port.tty; if ((tty && tty->flow.stopped) || port->hw_stopped) return 1; return 0; } static inline bool uart_cts_enabled(struct uart_port *uport) { return !!(uport->status & UPSTAT_CTS_ENABLE); } static inline bool uart_softcts_mode(struct uart_port *uport) { upstat_t mask = UPSTAT_CTS_ENABLE | UPSTAT_AUTOCTS; return ((uport->status & mask) == UPSTAT_CTS_ENABLE); } /* * The following are helper functions for the low level drivers. 
*/ void uart_handle_dcd_change(struct uart_port *uport, bool active); void uart_handle_cts_change(struct uart_port *uport, bool active); void uart_insert_char(struct uart_port *port, unsigned int status, unsigned int overrun, u8 ch, u8 flag); void uart_xchar_out(struct uart_port *uport, int offset); #ifdef CONFIG_MAGIC_SYSRQ_SERIAL #define SYSRQ_TIMEOUT (HZ * 5) bool uart_try_toggle_sysrq(struct uart_port *port, u8 ch); static inline int uart_handle_sysrq_char(struct uart_port *port, u8 ch) { if (!port->sysrq) return 0; if (ch && time_before(jiffies, port->sysrq)) { if (sysrq_mask()) { handle_sysrq(ch); port->sysrq = 0; return 1; } if (uart_try_toggle_sysrq(port, ch)) return 1; } port->sysrq = 0; return 0; } static inline int uart_prepare_sysrq_char(struct uart_port *port, u8 ch) { if (!port->sysrq) return 0; if (ch && time_before(jiffies, port->sysrq)) { if (sysrq_mask()) { port->sysrq_ch = ch; port->sysrq = 0; return 1; } if (uart_try_toggle_sysrq(port, ch)) return 1; } port->sysrq = 0; return 0; } static inline void uart_unlock_and_check_sysrq(struct uart_port *port) { u8 sysrq_ch; if (!port->has_sysrq) { uart_port_unlock(port); return; } sysrq_ch = port->sysrq_ch; port->sysrq_ch = 0; uart_port_unlock(port); if (sysrq_ch) handle_sysrq(sysrq_ch); } static inline void uart_unlock_and_check_sysrq_irqrestore(struct uart_port *port, unsigned long flags) { u8 sysrq_ch; if (!port->has_sysrq) { uart_port_unlock_irqrestore(port, flags); return; } sysrq_ch = port->sysrq_ch; port->sysrq_ch = 0; uart_port_unlock_irqrestore(port, flags); if (sysrq_ch) handle_sysrq(sysrq_ch); } #else /* CONFIG_MAGIC_SYSRQ_SERIAL */ static inline int uart_handle_sysrq_char(struct uart_port *port, u8 ch) { return 0; } static inline int uart_prepare_sysrq_char(struct uart_port *port, u8 ch) { return 0; } static inline void uart_unlock_and_check_sysrq(struct uart_port *port) { uart_port_unlock(port); } static inline void uart_unlock_and_check_sysrq_irqrestore(struct uart_port *port, unsigned long flags) { uart_port_unlock_irqrestore(port, flags); } #endif /* CONFIG_MAGIC_SYSRQ_SERIAL */ /* * We do the SysRQ and SAK checking like this... */ static inline int uart_handle_break(struct uart_port *port) { struct uart_state *state = port->state; if (port->handle_break) port->handle_break(port); #ifdef CONFIG_MAGIC_SYSRQ_SERIAL if (port->has_sysrq && uart_console(port)) { if (!port->sysrq) { port->sysrq = jiffies + SYSRQ_TIMEOUT; return 1; } port->sysrq = 0; } #endif if (port->flags & UPF_SAK) do_SAK(state->port.tty); return 0; } /* * UART_ENABLE_MS - determine if port should enable modem status irqs */ #define UART_ENABLE_MS(port,cflag) ((port)->flags & UPF_HARDPPS_CD || \ (cflag) & CRTSCTS || \ !((cflag) & CLOCAL)) int uart_get_rs485_mode(struct uart_port *port); #endif /* LINUX_SERIAL_CORE_H */ |
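/*
 * Illustrative sketch (not part of serial_core.h): how a driver's interrupt
 * path might combine the uart_port_lock_irqsave()/uart_port_unlock_irqrestore()
 * wrappers with the uart_port_tx() helper declared above. The foo_* names and
 * FOO_* register offsets are hypothetical placeholders, not a real device.
 */
#include <linux/bits.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/serial_core.h>

#define FOO_STAT		0x00	/* hypothetical status register */
#define FOO_STAT_TX_READY	BIT(0)	/* hypothetical "TX FIFO has room" bit */
#define FOO_TX			0x04	/* hypothetical TX data register */

static void foo_tx_chars(struct uart_port *port)
{
	u8 ch;

	/*
	 * uart_port_tx() pulls characters from the xmit kfifo while tx_ready
	 * evaluates true, calls ops->stop_tx() once the buffer empties, and
	 * wakes writers when fewer than WAKEUP_CHARS remain queued.
	 */
	uart_port_tx(port, ch,
		readl(port->membase + FOO_STAT) & FOO_STAT_TX_READY,
		writel(ch, port->membase + FOO_TX));
}

static irqreturn_t foo_uart_irq(int irq, void *dev_id)
{
	struct uart_port *port = dev_id;
	unsigned long flags;

	/* The port lock wrappers also manage nbcon console ownership. */
	uart_port_lock_irqsave(port, &flags);
	if (readl(port->membase + FOO_STAT) & FOO_STAT_TX_READY)
		foo_tx_chars(port);
	uart_port_unlock_irqrestore(port, flags);

	return IRQ_HANDLED;
}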
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FUTEX_H #define _FUTEX_H #include <linux/futex.h> #include <linux/rtmutex.h> #include <linux/sched/wake_q.h> #include <linux/compat.h> #include <linux/uaccess.h> #include <linux/cleanup.h> #ifdef CONFIG_PREEMPT_RT #include <linux/rcuwait.h> #endif #include <asm/futex.h> /* * Futex flags used to encode options to functions and preserve them across * restarts. */ #define FLAGS_SIZE_8 0x0000 #define FLAGS_SIZE_16 0x0001 #define FLAGS_SIZE_32 0x0002 #define FLAGS_SIZE_64 0x0003 #define FLAGS_SIZE_MASK 0x0003 #ifdef CONFIG_MMU # define FLAGS_SHARED 0x0010 #else /* * NOMMU does not have per process address space. Let the compiler optimize * code away. 
*/ # define FLAGS_SHARED 0x0000 #endif #define FLAGS_CLOCKRT 0x0020 #define FLAGS_HAS_TIMEOUT 0x0040 #define FLAGS_NUMA 0x0080 #define FLAGS_STRICT 0x0100 #define FLAGS_MPOL 0x0200 /* FUTEX_ to FLAGS_ */ static inline unsigned int futex_to_flags(unsigned int op) { unsigned int flags = FLAGS_SIZE_32; if (!(op & FUTEX_PRIVATE_FLAG)) flags |= FLAGS_SHARED; if (op & FUTEX_CLOCK_REALTIME) flags |= FLAGS_CLOCKRT; return flags; } #define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_NUMA | FUTEX2_MPOL | FUTEX2_PRIVATE) /* FUTEX2_ to FLAGS_ */ static inline unsigned int futex2_to_flags(unsigned int flags2) { unsigned int flags = flags2 & FUTEX2_SIZE_MASK; if (!(flags2 & FUTEX2_PRIVATE)) flags |= FLAGS_SHARED; if (flags2 & FUTEX2_NUMA) flags |= FLAGS_NUMA; if (flags2 & FUTEX2_MPOL) flags |= FLAGS_MPOL; return flags; } static inline unsigned int futex_size(unsigned int flags) { return 1 << (flags & FLAGS_SIZE_MASK); } static inline bool futex_flags_valid(unsigned int flags) { /* Only 64bit futexes for 64bit code */ if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) { if ((flags & FLAGS_SIZE_MASK) == FLAGS_SIZE_64) return false; } /* Only 32bit futexes are implemented -- for now */ if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32) return false; /* * Must be able to represent both FUTEX_NO_NODE and every valid nodeid * in a futex word. */ if (flags & FLAGS_NUMA) { int bits = 8 * futex_size(flags); u64 max = ~0ULL; max >>= 64 - bits; if (nr_node_ids >= max) return false; } return true; } static inline bool futex_validate_input(unsigned int flags, u64 val) { int bits = 8 * futex_size(flags); if (bits < 64 && (val >> bits)) return false; return true; } #ifdef CONFIG_FAIL_FUTEX extern bool should_fail_futex(bool fshared); #else static inline bool should_fail_futex(bool fshared) { return false; } #endif /* * Hash buckets are shared by all the futex_keys that hash to the same * location. Each key may have multiple futex_q structures, one for each task * waiting on a futex. */ struct futex_hash_bucket { atomic_t waiters; spinlock_t lock; struct plist_head chain; struct futex_private_hash *priv; } ____cacheline_aligned_in_smp; /* * Priority Inheritance state: */ struct futex_pi_state { /* * list of 'owned' pi_state instances - these have to be * cleaned up in do_exit() if the task exits prematurely: */ struct list_head list; /* * The PI object: */ struct rt_mutex_base pi_mutex; struct task_struct *owner; refcount_t refcount; union futex_key key; } __randomize_layout; struct futex_q; typedef void (futex_wake_fn)(struct wake_q_head *wake_q, struct futex_q *q); /** * struct futex_q - The hashed futex queue entry, one per waiting task * @list: priority-sorted list of tasks waiting on this futex * @task: the task waiting on the futex * @lock_ptr: the hash bucket lock * @wake: the wake handler for this queue * @wake_data: data associated with the wake handler * @key: the key the futex is hashed on * @pi_state: optional priority inheritance state * @rt_waiter: rt_waiter storage for use with requeue_pi * @requeue_pi_key: the requeue_pi target futex key * @bitset: bitset for the optional bitmasked wakeup * @requeue_state: State field for futex_requeue_pi() * @drop_hb_ref: Waiter should drop the extra hash bucket reference if true * @requeue_wait: RCU wait for futex_requeue_pi() (RT only) * * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so * we can wake only the relevant ones (hashed queues may be shared). * * A futex_q has a woken state, just like tasks have TASK_RUNNING. 
* It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. * The order of wakeup is always to make the first condition true, then * the second. * * PI futexes are typically woken before they are removed from the hash list via * the rt_mutex code. See futex_unqueue_pi(). */ struct futex_q { struct plist_node list; struct task_struct *task; spinlock_t *lock_ptr; futex_wake_fn *wake; void *wake_data; union futex_key key; struct futex_pi_state *pi_state; struct rt_mutex_waiter *rt_waiter; union futex_key *requeue_pi_key; u32 bitset; atomic_t requeue_state; bool drop_hb_ref; #ifdef CONFIG_PREEMPT_RT struct rcuwait requeue_wait; #endif } __randomize_layout; extern const struct futex_q futex_q_init; enum futex_access { FUTEX_READ, FUTEX_WRITE }; extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, enum futex_access rw); extern void futex_q_lockptr_lock(struct futex_q *q) __acquires(q->lock_ptr); extern struct hrtimer_sleeper * futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, int flags, u64 range_ns); extern struct futex_hash_bucket *futex_hash(union futex_key *key); #ifdef CONFIG_FUTEX_PRIVATE_HASH extern void futex_hash_get(struct futex_hash_bucket *hb); extern void futex_hash_put(struct futex_hash_bucket *hb); extern struct futex_private_hash *futex_private_hash(void); extern void futex_private_hash_put(struct futex_private_hash *fph); #else /* !CONFIG_FUTEX_PRIVATE_HASH */ static inline void futex_hash_get(struct futex_hash_bucket *hb) { } static inline void futex_hash_put(struct futex_hash_bucket *hb) { } static inline struct futex_private_hash *futex_private_hash(void) { return NULL; } static inline void futex_private_hash_put(struct futex_private_hash *fph) { } #endif DEFINE_CLASS(hb, struct futex_hash_bucket *, if (_T) futex_hash_put(_T), futex_hash(key), union futex_key *key); DEFINE_CLASS(private_hash, struct futex_private_hash *, if (_T) futex_private_hash_put(_T), futex_private_hash(), void); /** * futex_match - Check whether two futex keys are equal * @key1: Pointer to key1 * @key2: Pointer to key2 * * Return 1 if two futex_keys are equal, 0 otherwise. 
*/ static inline int futex_match(union futex_key *key1, union futex_key *key2) { return (key1 && key2 && key1->both.word == key2->both.word && key1->both.ptr == key2->both.ptr && key1->both.offset == key2->both.offset); } extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, struct futex_q *q, union futex_key *key2, struct task_struct *task); extern void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout); extern bool __futex_wake_mark(struct futex_q *q); extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q); extern int fault_in_user_writeable(u32 __user *uaddr); extern struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key); static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval) { int ret; pagefault_disable(); ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); pagefault_enable(); return ret; } /* Read from user memory with pagefaults disabled */ static inline int futex_get_value_locked(u32 *dest, u32 __user *from) { guard(pagefault)(); return get_user_inline(*dest, from); } extern void __futex_unqueue(struct futex_q *q); extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, struct task_struct *task); extern int futex_unqueue(struct futex_q *q); /** * futex_queue() - Enqueue the futex_q on the futex_hash_bucket * @q: The futex_q to enqueue * @hb: The destination hash bucket * @task: Task queueing this futex * * The hb->lock must be held by the caller, and is released here. A call to * futex_queue() is typically paired with exactly one call to futex_unqueue(). The * exceptions involve the PI related operations, which may use futex_unqueue_pi() * or nothing if the unqueue is done as part of the wake process and the unqueue * state is implicit in the state of woken task (see futex_wait_requeue_pi() for * an example). * * Note that @task may be NULL, for async usage of futexes. */ static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, struct task_struct *task) __releases(&hb->lock) __releases(q->lock_ptr) { __futex_queue(q, hb, task); spin_unlock(&hb->lock); __release(q->lock_ptr); } extern void futex_unqueue_pi(struct futex_q *q); extern void wait_for_owner_exiting(int ret, struct task_struct *exiting); /* * Reflects a new waiter being added to the waitqueue. */ static inline void futex_hb_waiters_inc(struct futex_hash_bucket *hb) { #ifdef CONFIG_SMP atomic_inc(&hb->waiters); /* * Full barrier (A), see the ordering comment above. */ smp_mb__after_atomic(); #endif } /* * Reflects a waiter being removed from the waitqueue by wakeup * paths. */ static inline void futex_hb_waiters_dec(struct futex_hash_bucket *hb) { #ifdef CONFIG_SMP atomic_dec(&hb->waiters); #endif } static inline int futex_hb_waiters_pending(struct futex_hash_bucket *hb) { #ifdef CONFIG_SMP /* * Full barrier (B), see the ordering comment above. 
*/ smp_mb(); return atomic_read(&hb->waiters); #else return 1; #endif } extern void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb) __acquires(&hb->lock) __acquires(q->lock_ptr); extern void futex_q_unlock(struct futex_hash_bucket *hb) __releases(&hb->lock); extern int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, union futex_key *key, struct futex_pi_state **ps, struct task_struct *task, struct task_struct **exiting, int set_waiters); extern int refill_pi_state_cache(void); extern void get_pi_state(struct futex_pi_state *pi_state); extern void put_pi_state(struct futex_pi_state *pi_state); extern int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked); /* * Express the locking dependencies for lockdep: */ static inline void double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) __acquires(&hb1->lock) __acquires(&hb2->lock) __no_context_analysis { if (hb1 > hb2) swap(hb1, hb2); spin_lock(&hb1->lock); if (hb1 != hb2) spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); } static inline void double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) __releases(&hb1->lock) __releases(&hb2->lock) __no_context_analysis { spin_unlock(&hb1->lock); if (hb1 != hb2) spin_unlock(&hb2->lock); } /* syscalls */ extern int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset, u32 __user *uaddr2); extern int futex_requeue(u32 __user *uaddr1, unsigned int flags1, u32 __user *uaddr2, unsigned int flags2, int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi); extern int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, struct hrtimer_sleeper *to, u32 bitset); extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset); /** * struct futex_vector - Auxiliary struct for futex_waitv() * @w: Userspace provided data * @q: Kernel side data * * Struct used to build an array with all data need for futex_waitv() */ struct futex_vector { struct futex_waitv w; struct futex_q q; }; extern int futex_parse_waitv(struct futex_vector *futexv, struct futex_waitv __user *uwaitv, unsigned int nr_futexes, futex_wake_fn *wake, void *wake_data); extern int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken); extern int futex_unqueue_multiple(struct futex_vector *v, int count); extern int futex_wait_multiple(struct futex_vector *vs, unsigned int count, struct hrtimer_sleeper *to); extern int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset); extern int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op); extern int futex_unlock_pi(u32 __user *uaddr, unsigned int flags); extern int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock); #endif /* _FUTEX_H */ |
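/*
 * Illustrative sketch (not part of futex.h): a minimal userspace view of the
 * wait/wake protocol that futex_wait()/futex_wake() implement above. The
 * waiter passes the value it expects the futex word to still hold; the kernel
 * re-reads the word under the hash-bucket lock (futex_wait_setup()) and only
 * sleeps if it still matches, which closes the lost-wakeup race.
 */
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <pthread.h>
#include <stdio.h>

static int futex_word;

static long futex(int *uaddr, int op, int val)
{
	/* futex(uaddr, op, val, timeout, uaddr2, val3) */
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void *waiter(void *arg)
{
	/* Sleep only while the word still holds the expected value 0. */
	while (__atomic_load_n(&futex_word, __ATOMIC_ACQUIRE) == 0)
		futex(&futex_word, FUTEX_WAIT_PRIVATE, 0);
	printf("waiter: woken\n");
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waiter, NULL);
	sleep(1);
	/* Publish the new value first, then wake one waiter. */
	__atomic_store_n(&futex_word, 1, __ATOMIC_RELEASE);
	futex(&futex_word, FUTEX_WAKE_PRIVATE, 1);
	pthread_join(t, NULL);
	return 0;
}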
// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ #include <linux/bpf.h> #include <linux/filter.h> #include <linux/bpf_mem_alloc.h> #include <linux/gfp.h> #include <linux/memory.h> #include <linux/mutex.h> static void bpf_stream_elem_init(struct bpf_stream_elem *elem, int len) { init_llist_node(&elem->node); elem->total_len = len; elem->consumed_len = 0; } static struct bpf_stream_elem *bpf_stream_elem_alloc(int len) { const int max_len = ARRAY_SIZE((struct bpf_bprintf_buffers){}.buf); struct bpf_stream_elem *elem; size_t alloc_size; /* * Length denotes the amount of data to be written as part of a stream element, * thus includes the '\0' byte. We're capped by how much bpf_bprintf_buffers can * accommodate, therefore deny allocations that won't fit into them. */ if (len < 0 || len > max_len) return NULL; alloc_size = offsetof(struct bpf_stream_elem, str[len]); elem = kmalloc_nolock(alloc_size, __GFP_ZERO, -1); if (!elem) return NULL; bpf_stream_elem_init(elem, len); return elem; } static int __bpf_stream_push_str(struct llist_head *log, const char *str, int len) { struct bpf_stream_elem *elem = NULL; /* * Allocate a bpf_prog_stream_elem and push it to the bpf_prog_stream * log, elements will be popped at once and reversed to print the log. 
*/ elem = bpf_stream_elem_alloc(len); if (!elem) return -ENOMEM; memcpy(elem->str, str, len); llist_add(&elem->node, log); return 0; } static int bpf_stream_consume_capacity(struct bpf_stream *stream, int len) { if (atomic_read(&stream->capacity) >= BPF_STREAM_MAX_CAPACITY) return -ENOSPC; if (atomic_add_return(len, &stream->capacity) >= BPF_STREAM_MAX_CAPACITY) { atomic_sub(len, &stream->capacity); return -ENOSPC; } return 0; } static void bpf_stream_release_capacity(struct bpf_stream *stream, struct bpf_stream_elem *elem) { int len = elem->total_len; atomic_sub(len, &stream->capacity); } static int bpf_stream_push_str(struct bpf_stream *stream, const char *str, int len) { int ret = bpf_stream_consume_capacity(stream, len); return ret ?: __bpf_stream_push_str(&stream->log, str, len); } static struct bpf_stream *bpf_stream_get(enum bpf_stream_id stream_id, struct bpf_prog_aux *aux) { if (stream_id != BPF_STDOUT && stream_id != BPF_STDERR) return NULL; return &aux->stream[stream_id - 1]; } static void bpf_stream_free_elem(struct bpf_stream_elem *elem) { kfree_nolock(elem); } static void bpf_stream_free_list(struct llist_node *list) { struct bpf_stream_elem *elem, *tmp; llist_for_each_entry_safe(elem, tmp, list, node) bpf_stream_free_elem(elem); } static struct llist_node *bpf_stream_backlog_peek(struct bpf_stream *stream) { return stream->backlog_head; } static struct llist_node *bpf_stream_backlog_pop(struct bpf_stream *stream) { struct llist_node *node; node = stream->backlog_head; if (stream->backlog_head == stream->backlog_tail) stream->backlog_head = stream->backlog_tail = NULL; else stream->backlog_head = node->next; return node; } static void bpf_stream_backlog_fill(struct bpf_stream *stream) { struct llist_node *head, *tail; if (llist_empty(&stream->log)) return; tail = llist_del_all(&stream->log); if (!tail) return; head = llist_reverse_order(tail); if (!stream->backlog_head) { stream->backlog_head = head; stream->backlog_tail = tail; } else { stream->backlog_tail->next = head; stream->backlog_tail = tail; } return; } static bool bpf_stream_consume_elem(struct bpf_stream_elem *elem, int *len) { int rem = elem->total_len - elem->consumed_len; int used = min(rem, *len); elem->consumed_len += used; *len -= used; return elem->consumed_len == elem->total_len; } static int bpf_stream_read(struct bpf_stream *stream, void __user *buf, int len) { int rem_len = len, cons_len, ret = 0; struct bpf_stream_elem *elem = NULL; struct llist_node *node; mutex_lock(&stream->lock); while (rem_len) { int pos = len - rem_len; bool cont; node = bpf_stream_backlog_peek(stream); if (!node) { bpf_stream_backlog_fill(stream); node = bpf_stream_backlog_peek(stream); } if (!node) break; elem = container_of(node, typeof(*elem), node); cons_len = elem->consumed_len; cont = bpf_stream_consume_elem(elem, &rem_len) == false; ret = copy_to_user(buf + pos, elem->str + cons_len, elem->consumed_len - cons_len); /* Restore in case of error. */ if (ret) { ret = -EFAULT; elem->consumed_len = cons_len; break; } if (cont) continue; bpf_stream_backlog_pop(stream); bpf_stream_release_capacity(stream, elem); bpf_stream_free_elem(elem); } mutex_unlock(&stream->lock); return ret ? 
ret : len - rem_len; } int bpf_prog_stream_read(struct bpf_prog *prog, enum bpf_stream_id stream_id, void __user *buf, int len) { struct bpf_stream *stream; stream = bpf_stream_get(stream_id, prog->aux); if (!stream) return -ENOENT; return bpf_stream_read(stream, buf, len); } __bpf_kfunc_start_defs(); /* * Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the * enum in headers. */ __bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, u32 len__sz, struct bpf_prog_aux *aux) { struct bpf_bprintf_data data = { .get_bin_args = true, .get_buf = true, }; u32 fmt_size = strlen(fmt__str) + 1; struct bpf_stream *stream; u32 data_len = len__sz; int ret, num_args; stream = bpf_stream_get(stream_id, aux); if (!stream) return -ENOENT; if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 || (data_len && !args)) return -EINVAL; num_args = data_len / 8; ret = bpf_bprintf_prepare(fmt__str, fmt_size, args, num_args, &data); if (ret < 0) return ret; ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt__str, data.bin_args); /* Exclude NULL byte during push. */ ret = bpf_stream_push_str(stream, data.buf, ret); bpf_bprintf_cleanup(&data); return ret; } /* Directly trigger a stack dump from the program. */ __bpf_kfunc int bpf_stream_print_stack(int stream_id, struct bpf_prog_aux *aux) { struct bpf_stream_stage ss; struct bpf_prog *prog; /* Make sure the stream ID is valid. */ if (!bpf_stream_get(stream_id, aux)) return -ENOENT; prog = aux->main_prog_aux->prog; bpf_stream_stage(ss, prog, stream_id, ({ bpf_stream_dump_stack(ss); })); return 0; } __bpf_kfunc_end_defs(); /* Added kfunc to common_btf_ids */ void bpf_prog_stream_init(struct bpf_prog *prog) { int i; for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) { atomic_set(&prog->aux->stream[i].capacity, 0); init_llist_head(&prog->aux->stream[i].log); mutex_init(&prog->aux->stream[i].lock); prog->aux->stream[i].backlog_head = NULL; prog->aux->stream[i].backlog_tail = NULL; } } void bpf_prog_stream_free(struct bpf_prog *prog) { struct llist_node *list; int i; for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) { list = llist_del_all(&prog->aux->stream[i].log); bpf_stream_free_list(list); bpf_stream_free_list(prog->aux->stream[i].backlog_head); } } void bpf_stream_stage_init(struct bpf_stream_stage *ss) { init_llist_head(&ss->log); ss->len = 0; } void bpf_stream_stage_free(struct bpf_stream_stage *ss) { struct llist_node *node; node = llist_del_all(&ss->log); bpf_stream_free_list(node); } int bpf_stream_stage_printk(struct bpf_stream_stage *ss, const char *fmt, ...) { struct bpf_bprintf_buffers *buf; va_list args; int ret; if (bpf_try_get_buffers(&buf)) return -EBUSY; va_start(args, fmt); ret = vsnprintf(buf->buf, ARRAY_SIZE(buf->buf), fmt, args); va_end(args); ss->len += ret; /* Exclude NULL byte during push. 
*/ ret = __bpf_stream_push_str(&ss->log, buf->buf, ret); bpf_put_buffers(); return ret; } int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog, enum bpf_stream_id stream_id) { struct llist_node *list, *head, *tail; struct bpf_stream *stream; int ret; stream = bpf_stream_get(stream_id, prog->aux); if (!stream) return -EINVAL; ret = bpf_stream_consume_capacity(stream, ss->len); if (ret) return ret; list = llist_del_all(&ss->log); head = tail = list; if (!list) return 0; while (llist_next(list)) { tail = llist_next(list); list = tail; } llist_add_batch(head, tail, &stream->log); return 0; } struct dump_stack_ctx { struct bpf_stream_stage *ss; int err; }; static bool dump_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp) { struct dump_stack_ctx *ctxp = cookie; const char *file = "", *line = ""; struct bpf_prog *prog; int num, ret; rcu_read_lock(); prog = bpf_prog_ksym_find(ip); rcu_read_unlock(); if (prog) { ret = bpf_prog_get_file_line(prog, ip, &file, &line, &num); if (ret < 0) goto end; ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n %s @ %s:%d\n", (void *)(long)ip, line, file, num); return !ctxp->err; } end: ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n", (void *)(long)ip); return !ctxp->err; } int bpf_stream_stage_dump_stack(struct bpf_stream_stage *ss) { struct dump_stack_ctx ctx = { .ss = ss }; int ret; ret = bpf_stream_stage_printk(ss, "CPU: %d UID: %d PID: %d Comm: %s\n", raw_smp_processor_id(), __kuid_val(current_real_cred()->euid), current->pid, current->comm); if (ret) return ret; ret = bpf_stream_stage_printk(ss, "Call trace:\n"); if (ret) return ret; arch_bpf_stack_walk(dump_stack_cb, &ctx); if (ctx.err) return ctx.err; return bpf_stream_stage_printk(ss, "\n"); } |
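/*
 * Illustrative sketch (not from the kernel sources above): a standalone C11
 * rendition of the producer/consumer shape bpf_stream uses. Writers push
 * elements onto a lock-free LIFO list (llist_add() in the kernel code); the
 * reader drains the whole list at once and reverses it into FIFO order
 * (llist_del_all() + llist_reverse_order()), as bpf_stream_backlog_fill()
 * does before data is copied out to the reader.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct elem {
	struct elem *next;
	char str[32];
};

static _Atomic(struct elem *) log_head;

/* Lock-free push: safe with multiple concurrent producers. */
static void push(const char *s)
{
	struct elem *e = calloc(1, sizeof(*e));

	if (!e)
		return;
	strncpy(e->str, s, sizeof(e->str) - 1);
	e->next = atomic_load(&log_head);
	while (!atomic_compare_exchange_weak(&log_head, &e->next, e))
		;	/* on failure e->next holds the current head; retry */
}

/* Single reader: grab everything, reverse to FIFO, then consume in order. */
static void drain(void)
{
	struct elem *list = atomic_exchange(&log_head, NULL);
	struct elem *fifo = NULL;

	while (list) {			/* reverse the LIFO list */
		struct elem *next = list->next;

		list->next = fifo;
		fifo = list;
		list = next;
	}
	while (fifo) {
		struct elem *next = fifo->next;

		printf("%s\n", fifo->str);
		free(fifo);
		fifo = next;
	}
}

int main(void)
{
	push("first");
	push("second");
	push("third");
	drain();		/* prints first, second, third */
	return 0;
}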
// SPDX-License-Identifier: GPL-2.0 /* * USB Raw Gadget driver. * See Documentation/usb/raw-gadget.rst for more details. * * Copyright (c) 2020 Google, Inc. * Author: Andrey Konovalov <andreyknvl@gmail.com> */ #include <linux/compiler.h> #include <linux/ctype.h> #include <linux/debugfs.h> #include <linux/delay.h> #include <linux/idr.h> #include <linux/kref.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/semaphore.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/wait.h> #include <linux/usb.h> #include <linux/usb/ch9.h> #include <linux/usb/ch11.h> #include <linux/usb/gadget.h> #include <linux/usb/composite.h> #include <uapi/linux/usb/raw_gadget.h> #define DRIVER_DESC "USB Raw Gadget" #define DRIVER_NAME "raw-gadget" MODULE_DESCRIPTION(DRIVER_DESC); MODULE_AUTHOR("Andrey Konovalov"); MODULE_LICENSE("GPL"); /*----------------------------------------------------------------------*/ static DEFINE_IDA(driver_id_numbers); #define DRIVER_DRIVER_NAME_LENGTH_MAX 32 #define USB_RAW_IO_LENGTH_MAX KMALLOC_MAX_SIZE #define RAW_EVENT_QUEUE_SIZE 16 struct raw_event_queue { /* See the comment in raw_event_queue_fetch() for locking details. 
*/ spinlock_t lock; struct semaphore sema; struct usb_raw_event *events[RAW_EVENT_QUEUE_SIZE]; int size; }; static void raw_event_queue_init(struct raw_event_queue *queue) { spin_lock_init(&queue->lock); sema_init(&queue->sema, 0); queue->size = 0; } static int raw_event_queue_add(struct raw_event_queue *queue, enum usb_raw_event_type type, size_t length, const void *data) { unsigned long flags; struct usb_raw_event *event; spin_lock_irqsave(&queue->lock, flags); if (queue->size >= RAW_EVENT_QUEUE_SIZE) { spin_unlock_irqrestore(&queue->lock, flags); return -ENOMEM; } event = kmalloc(sizeof(*event) + length, GFP_ATOMIC); if (!event) { spin_unlock_irqrestore(&queue->lock, flags); return -ENOMEM; } event->type = type; event->length = length; if (event->length) memcpy(&event->data[0], data, length); queue->events[queue->size] = event; queue->size++; up(&queue->sema); spin_unlock_irqrestore(&queue->lock, flags); return 0; } static struct usb_raw_event *raw_event_queue_fetch( struct raw_event_queue *queue) { int ret; unsigned long flags; struct usb_raw_event *event; /* * This function can be called concurrently. We first check that * there's at least one event queued by decrementing the semaphore, * and then take the lock to protect queue struct fields. */ ret = down_interruptible(&queue->sema); if (ret) return ERR_PTR(ret); spin_lock_irqsave(&queue->lock, flags); /* * queue->size must have the same value as queue->sema counter (before * the down_interruptible() call above), so this check is a fail-safe. */ if (WARN_ON(!queue->size)) { spin_unlock_irqrestore(&queue->lock, flags); return ERR_PTR(-ENODEV); } event = queue->events[0]; queue->size--; memmove(&queue->events[0], &queue->events[1], queue->size * sizeof(queue->events[0])); spin_unlock_irqrestore(&queue->lock, flags); return event; } static void raw_event_queue_destroy(struct raw_event_queue *queue) { int i; for (i = 0; i < queue->size; i++) kfree(queue->events[i]); queue->size = 0; } /*----------------------------------------------------------------------*/ struct raw_dev; enum ep_state { STATE_EP_DISABLED, STATE_EP_ENABLED, }; struct raw_ep { struct raw_dev *dev; enum ep_state state; struct usb_ep *ep; u8 addr; struct usb_request *req; bool urb_queued; bool disabling; ssize_t status; }; enum dev_state { STATE_DEV_INVALID = 0, STATE_DEV_OPENED, STATE_DEV_INITIALIZED, STATE_DEV_REGISTERING, STATE_DEV_RUNNING, STATE_DEV_CLOSED, STATE_DEV_FAILED }; struct raw_dev { struct kref count; spinlock_t lock; const char *udc_name; struct usb_gadget_driver driver; /* Reference to misc device: */ struct device *dev; /* Make driver names unique */ int driver_id_number; /* Protected by lock: */ enum dev_state state; bool gadget_registered; struct usb_gadget *gadget; struct usb_request *req; bool ep0_in_pending; bool ep0_out_pending; bool ep0_urb_queued; ssize_t ep0_status; struct raw_ep eps[USB_RAW_EPS_NUM_MAX]; int eps_num; struct completion ep0_done; struct raw_event_queue queue; }; static struct raw_dev *dev_new(void) { struct raw_dev *dev; dev = kzalloc_obj(*dev); if (!dev) return NULL; /* Matches kref_put() in raw_release(). 
*/ kref_init(&dev->count); spin_lock_init(&dev->lock); init_completion(&dev->ep0_done); raw_event_queue_init(&dev->queue); dev->driver_id_number = -1; return dev; } static void dev_free(struct kref *kref) { struct raw_dev *dev = container_of(kref, struct raw_dev, count); int i; kfree(dev->udc_name); kfree(dev->driver.udc_name); kfree(dev->driver.driver.name); if (dev->driver_id_number >= 0) ida_free(&driver_id_numbers, dev->driver_id_number); if (dev->req) { if (dev->ep0_urb_queued) usb_ep_dequeue(dev->gadget->ep0, dev->req); usb_ep_free_request(dev->gadget->ep0, dev->req); } raw_event_queue_destroy(&dev->queue); for (i = 0; i < dev->eps_num; i++) { if (dev->eps[i].state == STATE_EP_DISABLED) continue; usb_ep_disable(dev->eps[i].ep); usb_ep_free_request(dev->eps[i].ep, dev->eps[i].req); kfree(dev->eps[i].ep->desc); dev->eps[i].state = STATE_EP_DISABLED; } kfree(dev); } /*----------------------------------------------------------------------*/ static int raw_queue_event(struct raw_dev *dev, enum usb_raw_event_type type, size_t length, const void *data) { int ret = 0; unsigned long flags; ret = raw_event_queue_add(&dev->queue, type, length, data); if (ret < 0) { spin_lock_irqsave(&dev->lock, flags); dev->state = STATE_DEV_FAILED; spin_unlock_irqrestore(&dev->lock, flags); } return ret; } static void gadget_ep0_complete(struct usb_ep *ep, struct usb_request *req) { struct raw_dev *dev = req->context; unsigned long flags; spin_lock_irqsave(&dev->lock, flags); if (req->status) dev->ep0_status = req->status; else dev->ep0_status = req->actual; if (dev->ep0_in_pending) dev->ep0_in_pending = false; else dev->ep0_out_pending = false; spin_unlock_irqrestore(&dev->lock, flags); complete(&dev->ep0_done); } static u8 get_ep_addr(const char *name) { /* If the endpoint has fixed function (named as e.g. "ep12out-bulk"), * parse the endpoint address from its name. We deliberately use * deprecated simple_strtoul() function here, as the number isn't * followed by '\0' nor '\n'. */ if (isdigit(name[2])) return simple_strtoul(&name[2], NULL, 10); /* Otherwise the endpoint is configurable (named as e.g. "ep-a"). */ return USB_RAW_EP_ADDR_ANY; } static int gadget_bind(struct usb_gadget *gadget, struct usb_gadget_driver *driver) { int ret = 0, i = 0; struct raw_dev *dev = container_of(driver, struct raw_dev, driver); struct usb_request *req; struct usb_ep *ep; unsigned long flags; if (strcmp(gadget->name, dev->udc_name) != 0) return -ENODEV; set_gadget_data(gadget, dev); req = usb_ep_alloc_request(gadget->ep0, GFP_KERNEL); if (!req) { dev_err(&gadget->dev, "usb_ep_alloc_request failed\n"); set_gadget_data(gadget, NULL); return -ENOMEM; } spin_lock_irqsave(&dev->lock, flags); dev->req = req; dev->req->context = dev; dev->req->complete = gadget_ep0_complete; dev->gadget = gadget; gadget_for_each_ep(ep, dev->gadget) { dev->eps[i].ep = ep; dev->eps[i].addr = get_ep_addr(ep->name); dev->eps[i].state = STATE_EP_DISABLED; i++; } dev->eps_num = i; spin_unlock_irqrestore(&dev->lock, flags); dev_dbg(&gadget->dev, "gadget connected\n"); ret = raw_queue_event(dev, USB_RAW_EVENT_CONNECT, 0, NULL); if (ret < 0) { dev_err(&gadget->dev, "failed to queue connect event\n"); set_gadget_data(gadget, NULL); return ret; } /* Matches kref_put() in gadget_unbind(). */ kref_get(&dev->count); return ret; } static void gadget_unbind(struct usb_gadget *gadget) { struct raw_dev *dev = get_gadget_data(gadget); set_gadget_data(gadget, NULL); /* Matches kref_get() in gadget_bind(). 
*/ kref_put(&dev->count, dev_free); } static int gadget_setup(struct usb_gadget *gadget, const struct usb_ctrlrequest *ctrl) { int ret = 0; struct raw_dev *dev = get_gadget_data(gadget); unsigned long flags; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_err(&gadget->dev, "ignoring, device is not running\n"); ret = -ENODEV; goto out_unlock; } if (dev->ep0_in_pending || dev->ep0_out_pending) { dev_dbg(&gadget->dev, "stalling, request already pending\n"); ret = -EBUSY; goto out_unlock; } if ((ctrl->bRequestType & USB_DIR_IN) && ctrl->wLength) dev->ep0_in_pending = true; else dev->ep0_out_pending = true; spin_unlock_irqrestore(&dev->lock, flags); ret = raw_queue_event(dev, USB_RAW_EVENT_CONTROL, sizeof(*ctrl), ctrl); if (ret < 0) dev_err(&gadget->dev, "failed to queue control event\n"); goto out; out_unlock: spin_unlock_irqrestore(&dev->lock, flags); out: if (ret == 0 && ctrl->wLength == 0) { /* * Return USB_GADGET_DELAYED_STATUS as a workaround to stop * some UDC drivers (e.g. dwc3) from automatically proceeding * with the status stage for 0-length transfers. * Should be removed once all UDC drivers are fixed to always * delay the status stage until a response is queued to EP0. */ return USB_GADGET_DELAYED_STATUS; } return ret; } static void gadget_disconnect(struct usb_gadget *gadget) { struct raw_dev *dev = get_gadget_data(gadget); int ret; dev_dbg(&gadget->dev, "gadget disconnected\n"); ret = raw_queue_event(dev, USB_RAW_EVENT_DISCONNECT, 0, NULL); if (ret < 0) dev_err(&gadget->dev, "failed to queue disconnect event\n"); } static void gadget_suspend(struct usb_gadget *gadget) { struct raw_dev *dev = get_gadget_data(gadget); int ret; dev_dbg(&gadget->dev, "gadget suspended\n"); ret = raw_queue_event(dev, USB_RAW_EVENT_SUSPEND, 0, NULL); if (ret < 0) dev_err(&gadget->dev, "failed to queue suspend event\n"); } static void gadget_resume(struct usb_gadget *gadget) { struct raw_dev *dev = get_gadget_data(gadget); int ret; dev_dbg(&gadget->dev, "gadget resumed\n"); ret = raw_queue_event(dev, USB_RAW_EVENT_RESUME, 0, NULL); if (ret < 0) dev_err(&gadget->dev, "failed to queue resume event\n"); } static void gadget_reset(struct usb_gadget *gadget) { struct raw_dev *dev = get_gadget_data(gadget); int ret; dev_dbg(&gadget->dev, "gadget reset\n"); ret = raw_queue_event(dev, USB_RAW_EVENT_RESET, 0, NULL); if (ret < 0) dev_err(&gadget->dev, "failed to queue reset event\n"); } /*----------------------------------------------------------------------*/ static struct miscdevice raw_misc_device; static int raw_open(struct inode *inode, struct file *fd) { struct raw_dev *dev; /* Nonblocking I/O is not supported yet. */ if (fd->f_flags & O_NONBLOCK) return -EINVAL; dev = dev_new(); if (!dev) return -ENOMEM; fd->private_data = dev; dev->state = STATE_DEV_OPENED; dev->dev = raw_misc_device.this_device; return 0; } static int raw_release(struct inode *inode, struct file *fd) { int ret = 0; struct raw_dev *dev = fd->private_data; unsigned long flags; bool unregister = false; spin_lock_irqsave(&dev->lock, flags); dev->state = STATE_DEV_CLOSED; if (!dev->gadget) { spin_unlock_irqrestore(&dev->lock, flags); goto out_put; } if (dev->gadget_registered) unregister = true; dev->gadget_registered = false; spin_unlock_irqrestore(&dev->lock, flags); if (unregister) { ret = usb_gadget_unregister_driver(&dev->driver); if (ret != 0) dev_err(dev->dev, "usb_gadget_unregister_driver() failed with %d\n", ret); /* Matches kref_get() in raw_ioctl_run(). 
*/ kref_put(&dev->count, dev_free); } out_put: /* Matches dev_new() in raw_open(). */ kref_put(&dev->count, dev_free); return ret; } /*----------------------------------------------------------------------*/ static int raw_ioctl_init(struct raw_dev *dev, unsigned long value) { int ret = 0; int driver_id_number; struct usb_raw_init arg; char *udc_driver_name; char *udc_device_name; char *driver_driver_name; unsigned long flags; if (copy_from_user(&arg, (void __user *)value, sizeof(arg))) return -EFAULT; switch (arg.speed) { case USB_SPEED_UNKNOWN: arg.speed = USB_SPEED_HIGH; break; case USB_SPEED_LOW: case USB_SPEED_FULL: case USB_SPEED_HIGH: case USB_SPEED_SUPER: break; default: return -EINVAL; } driver_id_number = ida_alloc(&driver_id_numbers, GFP_KERNEL); if (driver_id_number < 0) return driver_id_number; driver_driver_name = kmalloc(DRIVER_DRIVER_NAME_LENGTH_MAX, GFP_KERNEL); if (!driver_driver_name) { ret = -ENOMEM; goto out_free_driver_id_number; } snprintf(driver_driver_name, DRIVER_DRIVER_NAME_LENGTH_MAX, DRIVER_NAME ".%d", driver_id_number); udc_driver_name = kmalloc(UDC_NAME_LENGTH_MAX, GFP_KERNEL); if (!udc_driver_name) { ret = -ENOMEM; goto out_free_driver_driver_name; } ret = strscpy(udc_driver_name, &arg.driver_name[0], UDC_NAME_LENGTH_MAX); if (ret < 0) goto out_free_udc_driver_name; ret = 0; udc_device_name = kmalloc(UDC_NAME_LENGTH_MAX, GFP_KERNEL); if (!udc_device_name) { ret = -ENOMEM; goto out_free_udc_driver_name; } ret = strscpy(udc_device_name, &arg.device_name[0], UDC_NAME_LENGTH_MAX); if (ret < 0) goto out_free_udc_device_name; ret = 0; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_OPENED) { dev_dbg(dev->dev, "fail, device is not opened\n"); ret = -EINVAL; goto out_unlock; } dev->udc_name = udc_driver_name; dev->driver.function = DRIVER_DESC; dev->driver.max_speed = arg.speed; dev->driver.setup = gadget_setup; dev->driver.disconnect = gadget_disconnect; dev->driver.bind = gadget_bind; dev->driver.unbind = gadget_unbind; dev->driver.suspend = gadget_suspend; dev->driver.resume = gadget_resume; dev->driver.reset = gadget_reset; dev->driver.driver.name = driver_driver_name; dev->driver.udc_name = udc_device_name; dev->driver.match_existing_only = 1; dev->driver_id_number = driver_id_number; dev->state = STATE_DEV_INITIALIZED; spin_unlock_irqrestore(&dev->lock, flags); return ret; out_unlock: spin_unlock_irqrestore(&dev->lock, flags); out_free_udc_device_name: kfree(udc_device_name); out_free_udc_driver_name: kfree(udc_driver_name); out_free_driver_driver_name: kfree(driver_driver_name); out_free_driver_id_number: ida_free(&driver_id_numbers, driver_id_number); return ret; } static int raw_ioctl_run(struct raw_dev *dev, unsigned long value) { int ret = 0; unsigned long flags; if (value) return -EINVAL; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_INITIALIZED) { dev_dbg(dev->dev, "fail, device is not initialized\n"); ret = -EINVAL; goto out_unlock; } dev->state = STATE_DEV_REGISTERING; spin_unlock_irqrestore(&dev->lock, flags); ret = usb_gadget_register_driver(&dev->driver); spin_lock_irqsave(&dev->lock, flags); if (ret) { dev_err(dev->dev, "fail, usb_gadget_register_driver returned %d\n", ret); dev->state = STATE_DEV_FAILED; goto out_unlock; } dev->gadget_registered = true; dev->state = STATE_DEV_RUNNING; /* Matches kref_put() in raw_release(). 
*/ kref_get(&dev->count); out_unlock: spin_unlock_irqrestore(&dev->lock, flags); return ret; } static int raw_ioctl_event_fetch(struct raw_dev *dev, unsigned long value) { struct usb_raw_event arg; unsigned long flags; struct usb_raw_event *event; uint32_t length; if (copy_from_user(&arg, (void __user *)value, sizeof(arg))) return -EFAULT; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); spin_unlock_irqrestore(&dev->lock, flags); return -EINVAL; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); spin_unlock_irqrestore(&dev->lock, flags); return -EBUSY; } spin_unlock_irqrestore(&dev->lock, flags); event = raw_event_queue_fetch(&dev->queue); if (PTR_ERR(event) == -EINTR) { dev_dbg(&dev->gadget->dev, "event fetching interrupted\n"); return -EINTR; } if (IS_ERR(event)) { dev_err(&dev->gadget->dev, "failed to fetch event\n"); spin_lock_irqsave(&dev->lock, flags); dev->state = STATE_DEV_FAILED; spin_unlock_irqrestore(&dev->lock, flags); return -ENODEV; } length = min(arg.length, event->length); if (copy_to_user((void __user *)value, event, sizeof(*event) + length)) { kfree(event); return -EFAULT; } kfree(event); return 0; } static void *raw_alloc_io_data(struct usb_raw_ep_io *io, void __user *ptr, bool get_from_user) { void *data; if (copy_from_user(io, ptr, sizeof(*io))) return ERR_PTR(-EFAULT); if (io->ep >= USB_RAW_EPS_NUM_MAX) return ERR_PTR(-EINVAL); if (!usb_raw_io_flags_valid(io->flags)) return ERR_PTR(-EINVAL); if (io->length > USB_RAW_IO_LENGTH_MAX) return ERR_PTR(-EINVAL); if (get_from_user) data = memdup_user(ptr + sizeof(*io), io->length); else { data = kmalloc(io->length, GFP_KERNEL); if (!data) data = ERR_PTR(-ENOMEM); } return data; } static int raw_process_ep0_io(struct raw_dev *dev, struct usb_raw_ep_io *io, void *data, bool in) { int ret = 0; unsigned long flags; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); ret = -EINVAL; goto out_unlock; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); ret = -EBUSY; goto out_unlock; } if (dev->ep0_urb_queued) { dev_dbg(&dev->gadget->dev, "fail, urb already queued\n"); ret = -EBUSY; goto out_unlock; } if ((in && !dev->ep0_in_pending) || (!in && !dev->ep0_out_pending)) { dev_dbg(&dev->gadget->dev, "fail, wrong direction\n"); ret = -EBUSY; goto out_unlock; } if (WARN_ON(in && dev->ep0_out_pending)) { ret = -ENODEV; dev->state = STATE_DEV_FAILED; goto out_unlock; } if (WARN_ON(!in && dev->ep0_in_pending)) { ret = -ENODEV; dev->state = STATE_DEV_FAILED; goto out_unlock; } dev->req->buf = data; dev->req->length = io->length; dev->req->zero = usb_raw_io_flags_zero(io->flags); dev->ep0_urb_queued = true; spin_unlock_irqrestore(&dev->lock, flags); ret = usb_ep_queue(dev->gadget->ep0, dev->req, GFP_KERNEL); if (ret) { dev_err(&dev->gadget->dev, "fail, usb_ep_queue returned %d\n", ret); spin_lock_irqsave(&dev->lock, flags); goto out_queue_failed; } ret = wait_for_completion_interruptible(&dev->ep0_done); if (ret) { dev_dbg(&dev->gadget->dev, "wait interrupted\n"); usb_ep_dequeue(dev->gadget->ep0, dev->req); wait_for_completion(&dev->ep0_done); spin_lock_irqsave(&dev->lock, flags); if (dev->ep0_status == -ECONNRESET) dev->ep0_status = -EINTR; goto out_interrupted; } spin_lock_irqsave(&dev->lock, flags); out_interrupted: ret = dev->ep0_status; out_queue_failed: dev->ep0_urb_queued = false; out_unlock: spin_unlock_irqrestore(&dev->lock, flags); return 
ret; } static int raw_ioctl_ep0_write(struct raw_dev *dev, unsigned long value) { int ret = 0; void *data; struct usb_raw_ep_io io; data = raw_alloc_io_data(&io, (void __user *)value, true); if (IS_ERR(data)) return PTR_ERR(data); ret = raw_process_ep0_io(dev, &io, data, true); kfree(data); return ret; } static int raw_ioctl_ep0_read(struct raw_dev *dev, unsigned long value) { int ret = 0; void *data; struct usb_raw_ep_io io; unsigned int length; data = raw_alloc_io_data(&io, (void __user *)value, false); if (IS_ERR(data)) return PTR_ERR(data); ret = raw_process_ep0_io(dev, &io, data, false); if (ret < 0) goto free; length = min_t(unsigned int, io.length, ret); if (copy_to_user((void __user *)(value + sizeof(io)), data, length)) ret = -EFAULT; else ret = length; free: kfree(data); return ret; } static int raw_ioctl_ep0_stall(struct raw_dev *dev, unsigned long value) { int ret = 0; unsigned long flags; if (value) return -EINVAL; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); ret = -EINVAL; goto out_unlock; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); ret = -EBUSY; goto out_unlock; } if (dev->ep0_urb_queued) { dev_dbg(&dev->gadget->dev, "fail, urb already queued\n"); ret = -EBUSY; goto out_unlock; } if (!dev->ep0_in_pending && !dev->ep0_out_pending) { dev_dbg(&dev->gadget->dev, "fail, no request pending\n"); ret = -EBUSY; goto out_unlock; } ret = usb_ep_set_halt(dev->gadget->ep0); if (ret < 0) dev_err(&dev->gadget->dev, "fail, usb_ep_set_halt returned %d\n", ret); if (dev->ep0_in_pending) dev->ep0_in_pending = false; else dev->ep0_out_pending = false; out_unlock: spin_unlock_irqrestore(&dev->lock, flags); return ret; } static int raw_ioctl_ep_enable(struct raw_dev *dev, unsigned long value) { int ret = 0, i; unsigned long flags; struct usb_endpoint_descriptor *desc; struct raw_ep *ep; bool ep_props_matched = false; desc = memdup_user((void __user *)value, sizeof(*desc)); if (IS_ERR(desc)) return PTR_ERR(desc); /* * Endpoints with a maxpacket length of 0 can cause crashes in UDC * drivers. 
*/ if (usb_endpoint_maxp(desc) == 0) { dev_dbg(dev->dev, "fail, bad endpoint maxpacket\n"); kfree(desc); return -EINVAL; } spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); ret = -EINVAL; goto out_free; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); ret = -EBUSY; goto out_free; } for (i = 0; i < dev->eps_num; i++) { ep = &dev->eps[i]; if (ep->addr != usb_endpoint_num(desc) && ep->addr != USB_RAW_EP_ADDR_ANY) continue; if (!usb_gadget_ep_match_desc(dev->gadget, ep->ep, desc, NULL)) continue; ep_props_matched = true; if (ep->state != STATE_EP_DISABLED) continue; ep->ep->desc = desc; ret = usb_ep_enable(ep->ep); if (ret < 0) { dev_err(&dev->gadget->dev, "fail, usb_ep_enable returned %d\n", ret); goto out_free; } ep->req = usb_ep_alloc_request(ep->ep, GFP_ATOMIC); if (!ep->req) { dev_err(&dev->gadget->dev, "fail, usb_ep_alloc_request failed\n"); usb_ep_disable(ep->ep); ret = -ENOMEM; goto out_free; } ep->state = STATE_EP_ENABLED; ep->ep->driver_data = ep; ret = i; goto out_unlock; } if (!ep_props_matched) { dev_dbg(&dev->gadget->dev, "fail, bad endpoint descriptor\n"); ret = -EINVAL; } else { dev_dbg(&dev->gadget->dev, "fail, no endpoints available\n"); ret = -EBUSY; } out_free: kfree(desc); out_unlock: spin_unlock_irqrestore(&dev->lock, flags); return ret; } static int raw_ioctl_ep_disable(struct raw_dev *dev, unsigned long value) { int ret = 0, i = value; unsigned long flags; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); ret = -EINVAL; goto out_unlock; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); ret = -EBUSY; goto out_unlock; } if (i < 0 || i >= dev->eps_num) { dev_dbg(dev->dev, "fail, invalid endpoint\n"); ret = -EBUSY; goto out_unlock; } if (dev->eps[i].state == STATE_EP_DISABLED) { dev_dbg(&dev->gadget->dev, "fail, endpoint is not enabled\n"); ret = -EINVAL; goto out_unlock; } if (dev->eps[i].disabling) { dev_dbg(&dev->gadget->dev, "fail, disable already in progress\n"); ret = -EINVAL; goto out_unlock; } if (dev->eps[i].urb_queued) { dev_dbg(&dev->gadget->dev, "fail, waiting for urb completion\n"); ret = -EINVAL; goto out_unlock; } dev->eps[i].disabling = true; spin_unlock_irqrestore(&dev->lock, flags); usb_ep_disable(dev->eps[i].ep); spin_lock_irqsave(&dev->lock, flags); usb_ep_free_request(dev->eps[i].ep, dev->eps[i].req); kfree(dev->eps[i].ep->desc); dev->eps[i].state = STATE_EP_DISABLED; dev->eps[i].disabling = false; out_unlock: spin_unlock_irqrestore(&dev->lock, flags); return ret; } static int raw_ioctl_ep_set_clear_halt_wedge(struct raw_dev *dev, unsigned long value, bool set, bool halt) { int ret = 0, i = value; unsigned long flags; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); ret = -EINVAL; goto out_unlock; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); ret = -EBUSY; goto out_unlock; } if (i < 0 || i >= dev->eps_num) { dev_dbg(dev->dev, "fail, invalid endpoint\n"); ret = -EBUSY; goto out_unlock; } if (dev->eps[i].state == STATE_EP_DISABLED) { dev_dbg(&dev->gadget->dev, "fail, endpoint is not enabled\n"); ret = -EINVAL; goto out_unlock; } if (dev->eps[i].disabling) { dev_dbg(&dev->gadget->dev, "fail, disable is in progress\n"); ret = -EINVAL; goto out_unlock; } if (dev->eps[i].urb_queued) { dev_dbg(&dev->gadget->dev, "fail, waiting for urb completion\n"); ret = 
-EINVAL; goto out_unlock; } if (usb_endpoint_xfer_isoc(dev->eps[i].ep->desc)) { dev_dbg(&dev->gadget->dev, "fail, can't halt/wedge ISO endpoint\n"); ret = -EINVAL; goto out_unlock; } if (set && halt) { ret = usb_ep_set_halt(dev->eps[i].ep); if (ret < 0) dev_err(&dev->gadget->dev, "fail, usb_ep_set_halt returned %d\n", ret); } else if (!set && halt) { ret = usb_ep_clear_halt(dev->eps[i].ep); if (ret < 0) dev_err(&dev->gadget->dev, "fail, usb_ep_clear_halt returned %d\n", ret); } else if (set && !halt) { ret = usb_ep_set_wedge(dev->eps[i].ep); if (ret < 0) dev_err(&dev->gadget->dev, "fail, usb_ep_set_wedge returned %d\n", ret); } out_unlock: spin_unlock_irqrestore(&dev->lock, flags); return ret; } static void gadget_ep_complete(struct usb_ep *ep, struct usb_request *req) { struct raw_ep *r_ep = (struct raw_ep *)ep->driver_data; struct raw_dev *dev = r_ep->dev; unsigned long flags; spin_lock_irqsave(&dev->lock, flags); if (req->status) r_ep->status = req->status; else r_ep->status = req->actual; spin_unlock_irqrestore(&dev->lock, flags); complete((struct completion *)req->context); } static int raw_process_ep_io(struct raw_dev *dev, struct usb_raw_ep_io *io, void *data, bool in) { int ret = 0; unsigned long flags; struct raw_ep *ep; DECLARE_COMPLETION_ONSTACK(done); spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); ret = -EINVAL; goto out_unlock; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); ret = -EBUSY; goto out_unlock; } if (io->ep >= dev->eps_num) { dev_dbg(&dev->gadget->dev, "fail, invalid endpoint\n"); ret = -EINVAL; goto out_unlock; } ep = &dev->eps[io->ep]; if (ep->state != STATE_EP_ENABLED) { dev_dbg(&dev->gadget->dev, "fail, endpoint is not enabled\n"); ret = -EBUSY; goto out_unlock; } if (ep->disabling) { dev_dbg(&dev->gadget->dev, "fail, endpoint is already being disabled\n"); ret = -EBUSY; goto out_unlock; } if (ep->urb_queued) { dev_dbg(&dev->gadget->dev, "fail, urb already queued\n"); ret = -EBUSY; goto out_unlock; } if (in != usb_endpoint_dir_in(ep->ep->desc)) { dev_dbg(&dev->gadget->dev, "fail, wrong direction\n"); ret = -EINVAL; goto out_unlock; } ep->dev = dev; ep->req->context = &done; ep->req->complete = gadget_ep_complete; ep->req->buf = data; ep->req->length = io->length; ep->req->zero = usb_raw_io_flags_zero(io->flags); ep->urb_queued = true; spin_unlock_irqrestore(&dev->lock, flags); ret = usb_ep_queue(ep->ep, ep->req, GFP_KERNEL); if (ret) { dev_err(&dev->gadget->dev, "fail, usb_ep_queue returned %d\n", ret); spin_lock_irqsave(&dev->lock, flags); goto out_queue_failed; } ret = wait_for_completion_interruptible(&done); if (ret) { dev_dbg(&dev->gadget->dev, "wait interrupted\n"); usb_ep_dequeue(ep->ep, ep->req); wait_for_completion(&done); spin_lock_irqsave(&dev->lock, flags); if (ep->status == -ECONNRESET) ep->status = -EINTR; goto out_interrupted; } spin_lock_irqsave(&dev->lock, flags); out_interrupted: ret = ep->status; out_queue_failed: ep->urb_queued = false; out_unlock: spin_unlock_irqrestore(&dev->lock, flags); return ret; } static int raw_ioctl_ep_write(struct raw_dev *dev, unsigned long value) { int ret = 0; char *data; struct usb_raw_ep_io io; data = raw_alloc_io_data(&io, (void __user *)value, true); if (IS_ERR(data)) return PTR_ERR(data); ret = raw_process_ep_io(dev, &io, data, true); kfree(data); return ret; } static int raw_ioctl_ep_read(struct raw_dev *dev, unsigned long value) { int ret = 0; char *data; struct usb_raw_ep_io io; unsigned int length; 
data = raw_alloc_io_data(&io, (void __user *)value, false); if (IS_ERR(data)) return PTR_ERR(data); ret = raw_process_ep_io(dev, &io, data, false); if (ret < 0) goto free; length = min_t(unsigned int, io.length, ret); if (copy_to_user((void __user *)(value + sizeof(io)), data, length)) ret = -EFAULT; else ret = length; free: kfree(data); return ret; } static int raw_ioctl_configure(struct raw_dev *dev, unsigned long value) { int ret = 0; unsigned long flags; if (value) return -EINVAL; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); ret = -EINVAL; goto out_unlock; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); ret = -EBUSY; goto out_unlock; } usb_gadget_set_state(dev->gadget, USB_STATE_CONFIGURED); out_unlock: spin_unlock_irqrestore(&dev->lock, flags); return ret; } static int raw_ioctl_vbus_draw(struct raw_dev *dev, unsigned long value) { int ret = 0; unsigned long flags; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); ret = -EINVAL; goto out_unlock; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); ret = -EBUSY; goto out_unlock; } usb_gadget_vbus_draw(dev->gadget, 2 * value); out_unlock: spin_unlock_irqrestore(&dev->lock, flags); return ret; } static void fill_ep_caps(struct usb_ep_caps *caps, struct usb_raw_ep_caps *raw_caps) { raw_caps->type_control = caps->type_control; raw_caps->type_iso = caps->type_iso; raw_caps->type_bulk = caps->type_bulk; raw_caps->type_int = caps->type_int; raw_caps->dir_in = caps->dir_in; raw_caps->dir_out = caps->dir_out; } static void fill_ep_limits(struct usb_ep *ep, struct usb_raw_ep_limits *limits) { limits->maxpacket_limit = ep->maxpacket_limit; limits->max_streams = ep->max_streams; } static int raw_ioctl_eps_info(struct raw_dev *dev, unsigned long value) { int ret = 0, i; unsigned long flags; struct usb_raw_eps_info *info; struct raw_ep *ep; info = kzalloc_obj(*info); if (!info) { ret = -ENOMEM; goto out; } spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); ret = -EINVAL; spin_unlock_irqrestore(&dev->lock, flags); goto out_free; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); ret = -EBUSY; spin_unlock_irqrestore(&dev->lock, flags); goto out_free; } for (i = 0; i < dev->eps_num; i++) { ep = &dev->eps[i]; strscpy(&info->eps[i].name[0], ep->ep->name, USB_RAW_EP_NAME_MAX); info->eps[i].addr = ep->addr; fill_ep_caps(&ep->ep->caps, &info->eps[i].caps); fill_ep_limits(ep->ep, &info->eps[i].limits); } ret = dev->eps_num; spin_unlock_irqrestore(&dev->lock, flags); if (copy_to_user((void __user *)value, info, sizeof(*info))) ret = -EFAULT; out_free: kfree(info); out: return ret; } static long raw_ioctl(struct file *fd, unsigned int cmd, unsigned long value) { struct raw_dev *dev = fd->private_data; int ret = 0; if (!dev) return -EBUSY; switch (cmd) { case USB_RAW_IOCTL_INIT: ret = raw_ioctl_init(dev, value); break; case USB_RAW_IOCTL_RUN: ret = raw_ioctl_run(dev, value); break; case USB_RAW_IOCTL_EVENT_FETCH: ret = raw_ioctl_event_fetch(dev, value); break; case USB_RAW_IOCTL_EP0_WRITE: ret = raw_ioctl_ep0_write(dev, value); break; case USB_RAW_IOCTL_EP0_READ: ret = raw_ioctl_ep0_read(dev, value); break; case USB_RAW_IOCTL_EP_ENABLE: ret = raw_ioctl_ep_enable(dev, value); break; case USB_RAW_IOCTL_EP_DISABLE: ret = raw_ioctl_ep_disable(dev, value); break; 
case USB_RAW_IOCTL_EP_WRITE: ret = raw_ioctl_ep_write(dev, value); break; case USB_RAW_IOCTL_EP_READ: ret = raw_ioctl_ep_read(dev, value); break; case USB_RAW_IOCTL_CONFIGURE: ret = raw_ioctl_configure(dev, value); break; case USB_RAW_IOCTL_VBUS_DRAW: ret = raw_ioctl_vbus_draw(dev, value); break; case USB_RAW_IOCTL_EPS_INFO: ret = raw_ioctl_eps_info(dev, value); break; case USB_RAW_IOCTL_EP0_STALL: ret = raw_ioctl_ep0_stall(dev, value); break; case USB_RAW_IOCTL_EP_SET_HALT: ret = raw_ioctl_ep_set_clear_halt_wedge( dev, value, true, true); break; case USB_RAW_IOCTL_EP_CLEAR_HALT: ret = raw_ioctl_ep_set_clear_halt_wedge( dev, value, false, true); break; case USB_RAW_IOCTL_EP_SET_WEDGE: ret = raw_ioctl_ep_set_clear_halt_wedge( dev, value, true, false); break; default: ret = -EINVAL; } return ret; } /*----------------------------------------------------------------------*/ static const struct file_operations raw_fops = { .open = raw_open, .unlocked_ioctl = raw_ioctl, .compat_ioctl = raw_ioctl, .release = raw_release, }; static struct miscdevice raw_misc_device = { .minor = MISC_DYNAMIC_MINOR, .name = DRIVER_NAME, .fops = &raw_fops, }; module_misc_device(raw_misc_device); |
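The driver above is driven entirely from userspace through ioctls on /dev/raw-gadget. What follows is a minimal usage sketch, not part of the driver: it assumes the UAPI header <linux/usb/raw_gadget.h>, and the "dummy_udc" driver/device names and buffer sizes are placeholders for whichever UDC is actually present on the system.

/*
 * Hedged userspace sketch: initialize raw gadget, start it, and loop on
 * events. A real gadget would answer control requests here through
 * USB_RAW_IOCTL_EP0_WRITE / USB_RAW_IOCTL_EP0_READ.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/usb/raw_gadget.h>

int main(void)
{
	struct usb_raw_init init;
	int fd = open("/dev/raw-gadget", O_RDWR);

	if (fd < 0)
		return 1;

	memset(&init, 0, sizeof(init));
	strcpy((char *)init.driver_name, "dummy_udc");	/* UDC driver name (illustrative) */
	strcpy((char *)init.device_name, "dummy_udc.0");	/* UDC device name (illustrative) */
	init.speed = USB_SPEED_HIGH;

	if (ioctl(fd, USB_RAW_IOCTL_INIT, &init) ||
	    ioctl(fd, USB_RAW_IOCTL_RUN, 0))
		return 1;

	for (;;) {
		struct {
			struct usb_raw_event event;
			char data[128];
		} ev;

		ev.event.type = 0;
		ev.event.length = sizeof(ev.data);
		if (ioctl(fd, USB_RAW_IOCTL_EVENT_FETCH, &ev.event))
			break;
		if (ev.event.type == USB_RAW_EVENT_CONTROL)
			printf("control request, %u bytes\n", ev.event.length);
	}

	close(fd);
	return 0;
}

Note that USB_RAW_IOCTL_EVENT_FETCH blocks until the driver queues an event (the semaphore taken in raw_event_queue_fetch() above), which is why the sketch can simply loop on it.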
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/uaccess.h>
#include <linux/kernel.h>

#include <asm/vsyscall.h>

#ifdef CONFIG_X86_64
bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
{
	unsigned long vaddr = (unsigned long)unsafe_src;

	/*
	 * Do not allow userspace addresses. This disallows
	 * normal userspace and the userspace guard page:
	 */
	if (vaddr < TASK_SIZE_MAX + PAGE_SIZE)
		return false;

	/*
	 * Reading from the vsyscall page may cause an unhandled fault in
	 * certain cases. Though it is at an address above TASK_SIZE_MAX, it is
	 * usually considered as a user space address.
	 */
	if (is_vsyscall_vaddr(vaddr))
		return false;

	/*
	 * Allow everything during early boot before 'x86_virt_bits'
	 * is initialized. Needed for instruction decoding in early
	 * exception handlers.
	 */
	if (!boot_cpu_data.x86_virt_bits)
		return true;

	return __is_canonical_address(vaddr, boot_cpu_data.x86_virt_bits);
}
#else
bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
{
	return (unsigned long)unsafe_src >= TASK_SIZE_MAX;
}
#endif
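copy_from_kernel_nofault_allowed() is the per-arch gate consulted by copy_from_kernel_nofault() before it attempts an access. A hedged sketch of a caller follows; probe_word() is a hypothetical helper for illustration, not an existing kernel function.

/*
 * Hedged illustration: probe an arbitrary kernel address without risking
 * an unhandled fault, as debuggers and tracing code do.
 */
#include <linux/uaccess.h>
#include <linux/kernel.h>

static long probe_word(const void *addr, unsigned long *out)
{
	unsigned long val;
	long ret;

	/*
	 * copy_from_kernel_nofault() returns -ERANGE when the allowed()
	 * hook above rejects the address (e.g. a userspace or vsyscall
	 * address on x86-64), and -EFAULT if the access itself faults.
	 */
	ret = copy_from_kernel_nofault(&val, addr, sizeof(val));
	if (ret)
		return ret;

	*out = val;
	return 0;
}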
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * AppArmor security module
 *
 * This file contains AppArmor network mediation definitions.
 *
 * Copyright (C) 1998-2008 Novell/SUSE
 * Copyright 2009-2017 Canonical Ltd.
 */

#ifndef __AA_NET_H
#define __AA_NET_H

#include <net/sock.h>
#include <linux/path.h>

#include "apparmorfs.h"
#include "label.h"
#include "perms.h"
#include "policy.h"

#define AA_MAY_SEND		AA_MAY_WRITE
#define AA_MAY_RECEIVE		AA_MAY_READ
#define AA_MAY_SHUTDOWN		AA_MAY_DELETE
#define AA_MAY_CONNECT		AA_MAY_OPEN
#define AA_MAY_ACCEPT		0x00100000
#define AA_MAY_BIND		0x00200000
#define AA_MAY_LISTEN		0x00400000
#define AA_MAY_SETOPT		0x01000000
#define AA_MAY_GETOPT		0x02000000

#define NET_PERMS_MASK (AA_MAY_SEND | AA_MAY_RECEIVE | AA_MAY_CREATE | \
			AA_MAY_SHUTDOWN | AA_MAY_BIND | AA_MAY_LISTEN | \
			AA_MAY_CONNECT | AA_MAY_ACCEPT | AA_MAY_SETATTR | \
			AA_MAY_GETATTR | AA_MAY_SETOPT | AA_MAY_GETOPT)

#define NET_FS_PERMS (AA_MAY_SEND | AA_MAY_RECEIVE | AA_MAY_CREATE | \
		      AA_MAY_SHUTDOWN | AA_MAY_CONNECT | AA_MAY_RENAME |\
		      AA_MAY_SETATTR | AA_MAY_GETATTR | AA_MAY_CHMOD | \
		      AA_MAY_CHOWN | AA_MAY_CHGRP | AA_MAY_LOCK | \
		      AA_MAY_MPROT)

#define NET_PEER_MASK (AA_MAY_SEND | AA_MAY_RECEIVE | AA_MAY_CONNECT | \
		       AA_MAY_ACCEPT)

struct aa_sk_ctx {
	struct aa_label __rcu *label;
	struct aa_label __rcu *peer;
	struct aa_label __rcu *peer_lastupdate; /* ptr cmp only, no deref */
};

static inline struct aa_sk_ctx *aa_sock(const struct sock *sk)
{
	return sk->sk_security + apparmor_blob_sizes.lbs_sock;
}

#define DEFINE_AUDIT_NET(NAME, OP, CRED, SK, F, T, P)			  \
	struct lsm_network_audit NAME ## _net = { .sk = (SK),		  \
						  .family = (F)};	  \
	DEFINE_AUDIT_DATA(NAME,						  \
			  ((SK) && (F) != AF_UNIX) ? LSM_AUDIT_DATA_NET : \
						     LSM_AUDIT_DATA_NONE, \
			  AA_CLASS_NET,					  \
			  OP);						  \
	NAME.common.u.net = &(NAME ## _net);				  \
	NAME.subj_cred = (CRED);					  \
	NAME.net.type = (T);						  \
	NAME.net.protocol = (P)

#define DEFINE_AUDIT_SK(NAME, OP, CRED, SK)				\
	DEFINE_AUDIT_NET(NAME, OP, CRED, SK, (SK)->sk_family,		\
			 (SK)->sk_type, (SK)->sk_protocol)

struct aa_secmark {
	u8 audit;
	u8 deny;
	u32 secid;
	char *label;
};

extern struct aa_sfs_entry aa_sfs_entry_network[];
extern struct aa_sfs_entry aa_sfs_entry_networkv9[];

int aa_do_perms(struct aa_profile *profile, struct aa_policydb *policy,
		aa_state_t state, u32 request, struct aa_perms *p,
		struct apparmor_audit_data *ad);
/* passing in state returned by XXX_mediates_AF() */
aa_state_t aa_match_to_prot(struct aa_policydb *policy, aa_state_t state,
			    u32 request, u16 af, int type, int protocol,
			    struct aa_perms **p, const char **info);
void audit_net_cb(struct audit_buffer *ab, void *va);
int aa_profile_af_perm(struct aa_profile *profile,
		       struct apparmor_audit_data *ad,
		       u32 request, u16 family, int type, int protocol);
int aa_af_perm(const struct cred *subj_cred, struct aa_label *label,
	       const char *op, u32 request, u16 family, int type,
	       int protocol);
static inline int aa_profile_af_sk_perm(struct aa_profile *profile,
					struct apparmor_audit_data *ad,
					u32 request,
					struct sock *sk)
{
	return aa_profile_af_perm(profile, ad, request, sk->sk_family,
				  sk->sk_type, sk->sk_protocol);
}
int aa_sk_perm(const char *op, u32 request, struct sock *sk);

int aa_sock_file_perm(const struct cred *subj_cred, struct aa_label *label,
		      const char *op, u32 request, struct file *file);

int apparmor_secmark_check(struct aa_label *label, char *op, u32 request,
			   u32 secid, const struct sock *sk);

#endif /* __AA_NET_H */
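The permission helpers declared above are normally driven from AppArmor's net.c, which iterates over every profile in the current label. Below is a hedged, single-profile sketch of how DEFINE_AUDIT_SK() and aa_profile_af_sk_perm() fit together; the wrapper function and its name are illustrative only, not AppArmor code.

/*
 * Hedged sketch: build audit data for a socket operation and check one
 * profile's network policy. The real callers loop over all confined
 * profiles in the label rather than taking a single profile argument.
 */
static int example_sock_perm(struct aa_profile *profile,
			     const struct cred *cred,
			     const char *op, u32 request, struct sock *sk)
{
	/* Declares the apparmor_audit_data plus its lsm_network_audit,
	 * filling in family/type/protocol from the socket. */
	DEFINE_AUDIT_SK(ad, op, cred, sk);

	/* Runs the profile's AF-indexed policy lookup and emits an audit
	 * record on denial. */
	return aa_profile_af_sk_perm(profile, &ad, request, sk);
}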
1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 | // SPDX-License-Identifier: GPL-2.0-or-later /* Keyring handling * * Copyright (C) 2004-2005, 2008, 2013 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/seq_file.h> #include <linux/err.h> #include <linux/user_namespace.h> #include <linux/nsproxy.h> #include <keys/keyring-type.h> #include <keys/user-type.h> #include <linux/assoc_array_priv.h> #include <linux/uaccess.h> #include <net/net_namespace.h> #include "internal.h" /* * When plumbing the depths of the key tree, this sets a hard limit * set on how deep we're willing to go. */ #define KEYRING_SEARCH_MAX_DEPTH 6 /* * We mark pointers we pass to the associative array with bit 1 set if * they're keyrings and clear otherwise. */ #define KEYRING_PTR_SUBTYPE 0x2UL static inline bool keyring_ptr_is_keyring(const struct assoc_array_ptr *x) { return (unsigned long)x & KEYRING_PTR_SUBTYPE; } static inline struct key *keyring_ptr_to_key(const struct assoc_array_ptr *x) { void *object = assoc_array_ptr_to_leaf(x); return (struct key *)((unsigned long)object & ~KEYRING_PTR_SUBTYPE); } static inline void *keyring_key_to_ptr(struct key *key) { if (key->type == &key_type_keyring) return (void *)((unsigned long)key | KEYRING_PTR_SUBTYPE); return key; } static DEFINE_RWLOCK(keyring_name_lock); /* * Clean up the bits of user_namespace that belong to us. */ void key_free_user_ns(struct user_namespace *ns) { write_lock(&keyring_name_lock); list_del_init(&ns->keyring_name_list); write_unlock(&keyring_name_lock); key_put(ns->user_keyring_register); #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif } /* * The keyring key type definition. Keyrings are simply keys of this type and * can be treated as ordinary keys in addition to having their own special * operations. 
*/ static int keyring_preparse(struct key_preparsed_payload *prep); static void keyring_free_preparse(struct key_preparsed_payload *prep); static int keyring_instantiate(struct key *keyring, struct key_preparsed_payload *prep); static void keyring_revoke(struct key *keyring); static void keyring_destroy(struct key *keyring); static void keyring_describe(const struct key *keyring, struct seq_file *m); static long keyring_read(const struct key *keyring, char *buffer, size_t buflen); struct key_type key_type_keyring = { .name = "keyring", .def_datalen = 0, .preparse = keyring_preparse, .free_preparse = keyring_free_preparse, .instantiate = keyring_instantiate, .revoke = keyring_revoke, .destroy = keyring_destroy, .describe = keyring_describe, .read = keyring_read, }; EXPORT_SYMBOL(key_type_keyring); /* * Semaphore to serialise link/link calls to prevent two link calls in parallel * introducing a cycle. */ static DEFINE_MUTEX(keyring_serialise_link_lock); /* * Publish the name of a keyring so that it can be found by name (if it has * one and it doesn't begin with a dot). */ static void keyring_publish_name(struct key *keyring) { struct user_namespace *ns = current_user_ns(); if (keyring->description && keyring->description[0] && keyring->description[0] != '.') { write_lock(&keyring_name_lock); list_add_tail(&keyring->name_link, &ns->keyring_name_list); write_unlock(&keyring_name_lock); } } /* * Preparse a keyring payload */ static int keyring_preparse(struct key_preparsed_payload *prep) { return prep->datalen != 0 ? -EINVAL : 0; } /* * Free a preparse of a user defined key payload */ static void keyring_free_preparse(struct key_preparsed_payload *prep) { } /* * Initialise a keyring. * * Returns 0 on success, -EINVAL if given any data. */ static int keyring_instantiate(struct key *keyring, struct key_preparsed_payload *prep) { assoc_array_init(&keyring->keys); /* make the keyring available by name if it has one */ keyring_publish_name(keyring); return 0; } /* * Multiply 64-bits by 32-bits to 96-bits and fold back to 64-bit. Ideally we'd * fold the carry back too, but that requires inline asm. */ static u64 mult_64x32_and_fold(u64 x, u32 y) { u64 hi = (u64)(u32)(x >> 32) * y; u64 lo = (u64)(u32)(x) * y; return lo + ((u64)(u32)hi << 32) + (u32)(hi >> 32); } /* * Hash a key type and description. */ static void hash_key_type_and_desc(struct keyring_index_key *index_key) { const unsigned level_shift = ASSOC_ARRAY_LEVEL_STEP; const unsigned long fan_mask = ASSOC_ARRAY_FAN_MASK; const char *description = index_key->description; unsigned long hash, type; u32 piece; u64 acc; int n, desc_len = index_key->desc_len; type = (unsigned long)index_key->type; acc = mult_64x32_and_fold(type, desc_len + 13); acc = mult_64x32_and_fold(acc, 9207); piece = (unsigned long)index_key->domain_tag; acc = mult_64x32_and_fold(acc, piece); acc = mult_64x32_and_fold(acc, 9207); for (;;) { n = desc_len; if (n <= 0) break; if (n > 4) n = 4; piece = 0; memcpy(&piece, description, n); description += n; desc_len -= n; acc = mult_64x32_and_fold(acc, piece); acc = mult_64x32_and_fold(acc, 9207); } /* Fold the hash down to 32 bits if need be. */ hash = acc; if (ASSOC_ARRAY_KEY_CHUNK_SIZE == 32) hash ^= acc >> 32; /* Squidge all the keyrings into a separate part of the tree to * ordinary keys by making sure the lowest level segment in the hash is * zero for keyrings and non-zero otherwise. 
*/ if (index_key->type != &key_type_keyring && (hash & fan_mask) == 0) hash |= (hash >> (ASSOC_ARRAY_KEY_CHUNK_SIZE - level_shift)) | 1; else if (index_key->type == &key_type_keyring && (hash & fan_mask) != 0) hash = (hash + (hash << level_shift)) & ~fan_mask; index_key->hash = hash; } /* * Finalise an index key to include a part of the description actually in the * index key, to set the domain tag and to calculate the hash. */ void key_set_index_key(struct keyring_index_key *index_key) { static struct key_tag default_domain_tag = { .usage = REFCOUNT_INIT(1), }; size_t n = min_t(size_t, index_key->desc_len, sizeof(index_key->desc)); memcpy(index_key->desc, index_key->description, n); if (!index_key->domain_tag) { if (index_key->type->flags & KEY_TYPE_NET_DOMAIN) index_key->domain_tag = current->nsproxy->net_ns->key_domain; else index_key->domain_tag = &default_domain_tag; } hash_key_type_and_desc(index_key); } /** * key_put_tag - Release a ref on a tag. * @tag: The tag to release. * * This releases a reference the given tag and returns true if that ref was the * last one. */ bool key_put_tag(struct key_tag *tag) { if (refcount_dec_and_test(&tag->usage)) { kfree_rcu(tag, rcu); return true; } return false; } /** * key_remove_domain - Kill off a key domain and gc its keys * @domain_tag: The domain tag to release. * * This marks a domain tag as being dead and releases a ref on it. If that * wasn't the last reference, the garbage collector is poked to try and delete * all keys that were in the domain. */ void key_remove_domain(struct key_tag *domain_tag) { domain_tag->removed = true; if (!key_put_tag(domain_tag)) key_schedule_gc_links(); } /* * Build the next index key chunk. * * We return it one word-sized chunk at a time. */ static unsigned long keyring_get_key_chunk(const void *data, int level) { const struct keyring_index_key *index_key = data; unsigned long chunk = 0; const u8 *d; int desc_len = index_key->desc_len, n = sizeof(chunk); level /= ASSOC_ARRAY_KEY_CHUNK_SIZE; switch (level) { case 0: return index_key->hash; case 1: return index_key->x; case 2: return (unsigned long)index_key->type; case 3: return (unsigned long)index_key->domain_tag; default: level -= 4; if (desc_len <= sizeof(index_key->desc)) return 0; d = index_key->description + sizeof(index_key->desc); d += level * sizeof(long); desc_len -= sizeof(index_key->desc); if (desc_len > n) desc_len = n; do { chunk <<= 8; chunk |= *d++; } while (--desc_len > 0); return chunk; } } static unsigned long keyring_get_object_key_chunk(const void *object, int level) { const struct key *key = keyring_ptr_to_key(object); return keyring_get_key_chunk(&key->index_key, level); } static bool keyring_compare_object(const void *object, const void *data) { const struct keyring_index_key *index_key = data; const struct key *key = keyring_ptr_to_key(object); return key->index_key.type == index_key->type && key->index_key.domain_tag == index_key->domain_tag && key->index_key.desc_len == index_key->desc_len && memcmp(key->index_key.description, index_key->description, index_key->desc_len) == 0; } /* * Compare the index keys of a pair of objects and determine the bit position * at which they differ - if they differ. 
*/ static int keyring_diff_objects(const void *object, const void *data) { const struct key *key_a = keyring_ptr_to_key(object); const struct keyring_index_key *a = &key_a->index_key; const struct keyring_index_key *b = data; unsigned long seg_a, seg_b; int level, i; level = 0; seg_a = a->hash; seg_b = b->hash; if ((seg_a ^ seg_b) != 0) goto differ; level += ASSOC_ARRAY_KEY_CHUNK_SIZE / 8; /* The number of bits contributed by the hash is controlled by a * constant in the assoc_array headers. Everything else thereafter we * can deal with as being machine word-size dependent. */ seg_a = a->x; seg_b = b->x; if ((seg_a ^ seg_b) != 0) goto differ; level += sizeof(unsigned long); /* The next bit may not work on big endian */ seg_a = (unsigned long)a->type; seg_b = (unsigned long)b->type; if ((seg_a ^ seg_b) != 0) goto differ; level += sizeof(unsigned long); seg_a = (unsigned long)a->domain_tag; seg_b = (unsigned long)b->domain_tag; if ((seg_a ^ seg_b) != 0) goto differ; level += sizeof(unsigned long); i = sizeof(a->desc); if (a->desc_len <= i) goto same; for (; i < a->desc_len; i++) { seg_a = *(unsigned char *)(a->description + i); seg_b = *(unsigned char *)(b->description + i); if ((seg_a ^ seg_b) != 0) goto differ_plus_i; } same: return -1; differ_plus_i: level += i; differ: i = level * 8 + __ffs(seg_a ^ seg_b); return i; } /* * Free an object after stripping the keyring flag off of the pointer. */ static void keyring_free_object(void *object) { key_put(keyring_ptr_to_key(object)); } /* * Operations for keyring management by the index-tree routines. */ static const struct assoc_array_ops keyring_assoc_array_ops = { .get_key_chunk = keyring_get_key_chunk, .get_object_key_chunk = keyring_get_object_key_chunk, .compare_object = keyring_compare_object, .diff_objects = keyring_diff_objects, .free_object = keyring_free_object, }; /* * Clean up a keyring when it is destroyed. Unpublish its name if it had one * and dispose of its data. * * The garbage collector detects the final key_put(), removes the keyring from * the serial number tree and then does RCU synchronisation before coming here, * so we shouldn't need to worry about code poking around here with the RCU * readlock held by this time. */ static void keyring_destroy(struct key *keyring) { if (keyring->description) { write_lock(&keyring_name_lock); if (keyring->name_link.next != NULL && !list_empty(&keyring->name_link)) list_del(&keyring->name_link); write_unlock(&keyring_name_lock); } if (keyring->restrict_link) { struct key_restriction *keyres = keyring->restrict_link; key_put(keyres->key); kfree(keyres); } assoc_array_destroy(&keyring->keys, &keyring_assoc_array_ops); } /* * Describe a keyring for /proc. 
*/ static void keyring_describe(const struct key *keyring, struct seq_file *m) { if (keyring->description) seq_puts(m, keyring->description); else seq_puts(m, "[anon]"); if (key_is_positive(keyring)) { if (keyring->keys.nr_leaves_on_tree != 0) seq_printf(m, ": %lu", keyring->keys.nr_leaves_on_tree); else seq_puts(m, ": empty"); } } struct keyring_read_iterator_context { size_t buflen; size_t count; key_serial_t *buffer; }; static int keyring_read_iterator(const void *object, void *data) { struct keyring_read_iterator_context *ctx = data; const struct key *key = keyring_ptr_to_key(object); kenter("{%s,%d},,{%zu/%zu}", key->type->name, key->serial, ctx->count, ctx->buflen); if (ctx->count >= ctx->buflen) return 1; *ctx->buffer++ = key->serial; ctx->count += sizeof(key->serial); return 0; } /* * Read a list of key IDs from the keyring's contents in binary form * * The keyring's semaphore is read-locked by the caller. This prevents someone * from modifying it under us - which could cause us to read key IDs multiple * times. */ static long keyring_read(const struct key *keyring, char *buffer, size_t buflen) { struct keyring_read_iterator_context ctx; long ret; kenter("{%d},,%zu", key_serial(keyring), buflen); if (buflen & (sizeof(key_serial_t) - 1)) return -EINVAL; /* Copy as many key IDs as fit into the buffer */ if (buffer && buflen) { ctx.buffer = (key_serial_t *)buffer; ctx.buflen = buflen; ctx.count = 0; ret = assoc_array_iterate(&keyring->keys, keyring_read_iterator, &ctx); if (ret < 0) { kleave(" = %ld [iterate]", ret); return ret; } } /* Return the size of the buffer needed */ ret = keyring->keys.nr_leaves_on_tree * sizeof(key_serial_t); if (ret <= buflen) kleave("= %ld [ok]", ret); else kleave("= %ld [buffer too small]", ret); return ret; } /* * Allocate a keyring and link into the destination keyring. */ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, const struct cred *cred, key_perm_t perm, unsigned long flags, struct key_restriction *restrict_link, struct key *dest) { struct key *keyring; int ret; keyring = key_alloc(&key_type_keyring, description, uid, gid, cred, perm, flags, restrict_link); if (!IS_ERR(keyring)) { ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL); if (ret < 0) { key_put(keyring); keyring = ERR_PTR(ret); } } return keyring; } EXPORT_SYMBOL(keyring_alloc); /** * restrict_link_reject - Give -EPERM to restrict link * @keyring: The keyring being added to. * @type: The type of key being added. * @payload: The payload of the key intended to be added. * @restriction_key: Keys providing additional data for evaluating restriction. * * Reject the addition of any links to a keyring. It can be overridden by * passing KEY_ALLOC_BYPASS_RESTRICTION to key_instantiate_and_link() when * adding a key to a keyring. * * This is meant to be stored in a key_restriction structure which is passed * in the restrict_link parameter to keyring_alloc(). */ int restrict_link_reject(struct key *keyring, const struct key_type *type, const union key_payload *payload, struct key *restriction_key) { return -EPERM; } /* * By default, we keys found by getting an exact match on their descriptions. */ bool key_default_cmp(const struct key *key, const struct key_match_data *match_data) { return strcmp(key->description, match_data->raw_data) == 0; } /* * Iteration function to consider each key found. 
*/ static int keyring_search_iterator(const void *object, void *iterator_data) { struct keyring_search_context *ctx = iterator_data; const struct key *key = keyring_ptr_to_key(object); unsigned long kflags = READ_ONCE(key->flags); short state = READ_ONCE(key->state); kenter("{%d}", key->serial); /* ignore keys not of this type */ if (key->type != ctx->index_key.type) { kleave(" = 0 [!type]"); return 0; } /* skip invalidated, revoked and expired keys */ if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) { time64_t expiry = READ_ONCE(key->expiry); if (kflags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) { ctx->result = ERR_PTR(-EKEYREVOKED); kleave(" = %d [invrev]", ctx->skipped_ret); goto skipped; } if (expiry && ctx->now >= expiry) { if (!(ctx->flags & KEYRING_SEARCH_SKIP_EXPIRED)) ctx->result = ERR_PTR(-EKEYEXPIRED); kleave(" = %d [expire]", ctx->skipped_ret); goto skipped; } } /* keys that don't match */ if (!ctx->match_data.cmp(key, &ctx->match_data)) { kleave(" = 0 [!match]"); return 0; } /* key must have search permissions */ if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM) && key_task_permission(make_key_ref(key, ctx->possessed), ctx->cred, KEY_NEED_SEARCH) < 0) { ctx->result = ERR_PTR(-EACCES); kleave(" = %d [!perm]", ctx->skipped_ret); goto skipped; } if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) { /* we set a different error code if we pass a negative key */ if (state < 0) { ctx->result = ERR_PTR(state); kleave(" = %d [neg]", ctx->skipped_ret); goto skipped; } } /* Found */ ctx->result = make_key_ref(key, ctx->possessed); kleave(" = 1 [found]"); return 1; skipped: return ctx->skipped_ret; } /* * Search inside a keyring for a key. We can search by walking to it * directly based on its index-key or we can iterate over the entire * tree looking for it, based on the match function. */ static int search_keyring(struct key *keyring, struct keyring_search_context *ctx) { if (ctx->match_data.lookup_type == KEYRING_SEARCH_LOOKUP_DIRECT) { const void *object; object = assoc_array_find(&keyring->keys, &keyring_assoc_array_ops, &ctx->index_key); return object ? ctx->iterator(object, ctx) : 0; } return assoc_array_iterate(&keyring->keys, ctx->iterator, ctx); } /* * Search a tree of keyrings that point to other keyrings up to the maximum * depth. */ static bool search_nested_keyrings(struct key *keyring, struct keyring_search_context *ctx) { struct { struct key *keyring; struct assoc_array_node *node; int slot; } stack[KEYRING_SEARCH_MAX_DEPTH]; struct assoc_array_shortcut *shortcut; struct assoc_array_node *node; struct assoc_array_ptr *ptr; struct key *key; int sp = 0, slot; kenter("{%d},{%s,%s}", keyring->serial, ctx->index_key.type->name, ctx->index_key.description); #define STATE_CHECKS (KEYRING_SEARCH_NO_STATE_CHECK | KEYRING_SEARCH_DO_STATE_CHECK) BUG_ON((ctx->flags & STATE_CHECKS) == 0 || (ctx->flags & STATE_CHECKS) == STATE_CHECKS); if (ctx->index_key.description) key_set_index_key(&ctx->index_key); /* Check to see if this top-level keyring is what we are looking for * and whether it is valid or not. 
*/ if (ctx->match_data.lookup_type == KEYRING_SEARCH_LOOKUP_ITERATE || keyring_compare_object(keyring, &ctx->index_key)) { ctx->skipped_ret = 2; switch (ctx->iterator(keyring_key_to_ptr(keyring), ctx)) { case 1: goto found; case 2: return false; default: break; } } ctx->skipped_ret = 0; /* Start processing a new keyring */ descend_to_keyring: kdebug("descend to %d", keyring->serial); if (keyring->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) goto not_this_keyring; /* Search through the keys in this keyring before its searching its * subtrees. */ if (search_keyring(keyring, ctx)) goto found; /* Then manually iterate through the keyrings nested in this one. * * Start from the root node of the index tree. Because of the way the * hash function has been set up, keyrings cluster on the leftmost * branch of the root node (root slot 0) or in the root node itself. * Non-keyrings avoid the leftmost branch of the root entirely (root * slots 1-15). */ if (!(ctx->flags & KEYRING_SEARCH_RECURSE)) goto not_this_keyring; ptr = READ_ONCE(keyring->keys.root); if (!ptr) goto not_this_keyring; if (assoc_array_ptr_is_shortcut(ptr)) { /* If the root is a shortcut, either the keyring only contains * keyring pointers (everything clusters behind root slot 0) or * doesn't contain any keyring pointers. */ shortcut = assoc_array_ptr_to_shortcut(ptr); if ((shortcut->index_key[0] & ASSOC_ARRAY_FAN_MASK) != 0) goto not_this_keyring; ptr = READ_ONCE(shortcut->next_node); node = assoc_array_ptr_to_node(ptr); goto begin_node; } node = assoc_array_ptr_to_node(ptr); ptr = node->slots[0]; if (!assoc_array_ptr_is_meta(ptr)) goto begin_node; descend_to_node: /* Descend to a more distal node in this keyring's content tree and go * through that. */ kdebug("descend"); if (assoc_array_ptr_is_shortcut(ptr)) { shortcut = assoc_array_ptr_to_shortcut(ptr); ptr = READ_ONCE(shortcut->next_node); BUG_ON(!assoc_array_ptr_is_node(ptr)); } node = assoc_array_ptr_to_node(ptr); begin_node: kdebug("begin_node"); slot = 0; ascend_to_node: /* Go through the slots in a node */ for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = READ_ONCE(node->slots[slot]); if (assoc_array_ptr_is_meta(ptr)) { if (node->back_pointer || assoc_array_ptr_is_shortcut(ptr)) goto descend_to_node; } if (!keyring_ptr_is_keyring(ptr)) continue; key = keyring_ptr_to_key(ptr); if (sp >= KEYRING_SEARCH_MAX_DEPTH) { if (ctx->flags & KEYRING_SEARCH_DETECT_TOO_DEEP) { ctx->result = ERR_PTR(-ELOOP); return false; } goto not_this_keyring; } /* Search a nested keyring */ if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM) && key_task_permission(make_key_ref(key, ctx->possessed), ctx->cred, KEY_NEED_SEARCH) < 0) continue; /* stack the current position */ stack[sp].keyring = keyring; stack[sp].node = node; stack[sp].slot = slot; sp++; /* begin again with the new keyring */ keyring = key; goto descend_to_keyring; } /* We've dealt with all the slots in the current node, so now we need * to ascend to the parent and continue processing there. */ ptr = READ_ONCE(node->back_pointer); slot = node->parent_slot; if (ptr && assoc_array_ptr_is_shortcut(ptr)) { shortcut = assoc_array_ptr_to_shortcut(ptr); ptr = READ_ONCE(shortcut->back_pointer); slot = shortcut->parent_slot; } if (!ptr) goto not_this_keyring; node = assoc_array_ptr_to_node(ptr); slot++; /* If we've ascended to the root (zero backpointer), we must have just * finished processing the leftmost branch rather than the root slots - * so there can't be any more keyrings for us to find. 
*/ if (node->back_pointer) { kdebug("ascend %d", slot); goto ascend_to_node; } /* The keyring we're looking at was disqualified or didn't contain a * matching key. */ not_this_keyring: kdebug("not_this_keyring %d", sp); if (sp <= 0) { kleave(" = false"); return false; } /* Resume the processing of a keyring higher up in the tree */ sp--; keyring = stack[sp].keyring; node = stack[sp].node; slot = stack[sp].slot + 1; kdebug("ascend to %d [%d]", keyring->serial, slot); goto ascend_to_node; /* We found a viable match */ found: key = key_ref_to_ptr(ctx->result); key_check(key); if (!(ctx->flags & KEYRING_SEARCH_NO_UPDATE_TIME)) { key->last_used_at = ctx->now; keyring->last_used_at = ctx->now; while (sp > 0) stack[--sp].keyring->last_used_at = ctx->now; } kleave(" = true"); return true; } /** * keyring_search_rcu - Search a keyring tree for a matching key under RCU * @keyring_ref: A pointer to the keyring with possession indicator. * @ctx: The keyring search context. * * Search the supplied keyring tree for a key that matches the criteria given. * The root keyring and any linked keyrings must grant Search permission to the * caller to be searchable and keys can only be found if they too grant Search * to the caller. The possession flag on the root keyring pointer controls use * of the possessor bits in permissions checking of the entire tree. In * addition, the LSM gets to forbid keyring searches and key matches. * * The search is performed as a breadth-then-depth search up to the prescribed * limit (KEYRING_SEARCH_MAX_DEPTH). The caller must hold the RCU read lock to * prevent keyrings from being destroyed or rearranged whilst they are being * searched. * * Keys are matched to the type provided and are then filtered by the match * function, which is given the description to use in any way it sees fit. The * match function may use any attributes of a key that it wishes to * determine the match. Normally the match function from the key type would be * used. * * RCU can be used to prevent the keyring key lists from disappearing without * the need to take lots of locks. * * Returns a pointer to the found key and increments the key usage count if * successful; -EAGAIN if no matching keys were found, or if expired or revoked * keys were found; -ENOKEY if only negative keys were found; -ENOTDIR if the * specified keyring wasn't a keyring. * * In the case of a successful return, the possession attribute from * @keyring_ref is propagated to the returned key reference. */ key_ref_t keyring_search_rcu(key_ref_t keyring_ref, struct keyring_search_context *ctx) { struct key *keyring; long err; ctx->iterator = keyring_search_iterator; ctx->possessed = is_key_possessed(keyring_ref); ctx->result = ERR_PTR(-EAGAIN); keyring = key_ref_to_ptr(keyring_ref); key_check(keyring); if (keyring->type != &key_type_keyring) return ERR_PTR(-ENOTDIR); if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM)) { err = key_task_permission(keyring_ref, ctx->cred, KEY_NEED_SEARCH); if (err < 0) return ERR_PTR(err); } ctx->now = ktime_get_real_seconds(); if (search_nested_keyrings(keyring, ctx)) __key_get(key_ref_to_ptr(ctx->result)); return ctx->result; } /** * keyring_search - Search the supplied keyring tree for a matching key * @keyring: The root of the keyring tree to be searched. * @type: The type of keyring we want to find. * @description: The name of the keyring we want to find. 
* @recurse: True to search the children of @keyring also * * As keyring_search_rcu() above, but using the current task's credentials and * type's default matching function and preferred search method. */ key_ref_t keyring_search(key_ref_t keyring, struct key_type *type, const char *description, bool recurse) { struct keyring_search_context ctx = { .index_key.type = type, .index_key.description = description, .index_key.desc_len = strlen(description), .cred = current_cred(), .match_data.cmp = key_default_cmp, .match_data.raw_data = description, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = KEYRING_SEARCH_DO_STATE_CHECK, }; key_ref_t key; int ret; if (recurse) ctx.flags |= KEYRING_SEARCH_RECURSE; if (type->match_preparse) { ret = type->match_preparse(&ctx.match_data); if (ret < 0) return ERR_PTR(ret); } rcu_read_lock(); key = keyring_search_rcu(keyring, &ctx); rcu_read_unlock(); if (type->match_free) type->match_free(&ctx.match_data); return key; } EXPORT_SYMBOL(keyring_search); static struct key_restriction *keyring_restriction_alloc( key_restrict_link_func_t check) { struct key_restriction *keyres = kzalloc_obj(struct key_restriction); if (!keyres) return ERR_PTR(-ENOMEM); keyres->check = check; return keyres; } /* * Semaphore to serialise restriction setup to prevent reference count * cycles through restriction key pointers. */ static DECLARE_RWSEM(keyring_serialise_restrict_sem); /* * Check for restriction cycles that would prevent keyring garbage collection. * keyring_serialise_restrict_sem must be held. */ static bool keyring_detect_restriction_cycle(const struct key *dest_keyring, struct key_restriction *keyres) { while (keyres && keyres->key && keyres->key->type == &key_type_keyring) { if (keyres->key == dest_keyring) return true; keyres = keyres->key->restrict_link; } return false; } /** * keyring_restrict - Look up and apply a restriction to a keyring * @keyring_ref: The keyring to be restricted * @type: The key type that will provide the restriction checker. * @restriction: The restriction options to apply to the keyring * * Look up a keyring and apply a restriction to it. The restriction is managed * by the specific key type, but can be configured by the options specified in * the restriction string. 
*/ int keyring_restrict(key_ref_t keyring_ref, const char *type, const char *restriction) { struct key *keyring; struct key_type *restrict_type = NULL; struct key_restriction *restrict_link; int ret = 0; keyring = key_ref_to_ptr(keyring_ref); key_check(keyring); if (keyring->type != &key_type_keyring) return -ENOTDIR; if (!type) { restrict_link = keyring_restriction_alloc(restrict_link_reject); } else { restrict_type = key_type_lookup(type); if (IS_ERR(restrict_type)) return PTR_ERR(restrict_type); if (!restrict_type->lookup_restriction) { ret = -ENOENT; goto error; } restrict_link = restrict_type->lookup_restriction(restriction); } if (IS_ERR(restrict_link)) { ret = PTR_ERR(restrict_link); goto error; } down_write(&keyring->sem); down_write(&keyring_serialise_restrict_sem); if (keyring->restrict_link) { ret = -EEXIST; } else if (keyring_detect_restriction_cycle(keyring, restrict_link)) { ret = -EDEADLK; } else { keyring->restrict_link = restrict_link; notify_key(keyring, NOTIFY_KEY_SETATTR, 0); } up_write(&keyring_serialise_restrict_sem); up_write(&keyring->sem); if (ret < 0) { key_put(restrict_link->key); kfree(restrict_link); } error: if (restrict_type) key_type_put(restrict_type); return ret; } EXPORT_SYMBOL(keyring_restrict); /* * Search the given keyring for a key that might be updated. * * The caller must guarantee that the keyring is a keyring and that the * permission is granted to modify the keyring as no check is made here. The * caller must also hold a lock on the keyring semaphore. * * Returns a pointer to the found key with usage count incremented if * successful and returns NULL if not found. Revoked and invalidated keys are * skipped over. * * If successful, the possession indicator is propagated from the keyring ref * to the returned key reference. */ key_ref_t find_key_to_update(key_ref_t keyring_ref, const struct keyring_index_key *index_key) { struct key *keyring, *key; const void *object; keyring = key_ref_to_ptr(keyring_ref); kenter("{%d},{%s,%s}", keyring->serial, index_key->type->name, index_key->description); object = assoc_array_find(&keyring->keys, &keyring_assoc_array_ops, index_key); if (object) goto found; kleave(" = NULL"); return NULL; found: key = keyring_ptr_to_key(object); if (key->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) { kleave(" = NULL [x]"); return NULL; } __key_get(key); kleave(" = {%d}", key->serial); return make_key_ref(key, is_key_possessed(keyring_ref)); } /* * Find a keyring with the specified name. * * Only keyrings that have nonzero refcount, are not revoked, and are owned by a * user in the current user namespace are considered. If @uid_keyring is %true, * the keyring additionally must have been allocated as a user or user session * keyring; otherwise, it must grant Search permission directly to the caller. * * Returns a pointer to the keyring with the keyring's refcount having being * incremented on success. -ENOKEY is returned if a key could not be found. 
*/ struct key *find_keyring_by_name(const char *name, bool uid_keyring) { struct user_namespace *ns = current_user_ns(); struct key *keyring; if (!name) return ERR_PTR(-EINVAL); read_lock(&keyring_name_lock); /* Search this hash bucket for a keyring with a matching name that * grants Search permission and that hasn't been revoked */ list_for_each_entry(keyring, &ns->keyring_name_list, name_link) { if (!kuid_has_mapping(ns, keyring->user->uid)) continue; if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) continue; if (strcmp(keyring->description, name) != 0) continue; if (uid_keyring) { if (!test_bit(KEY_FLAG_UID_KEYRING, &keyring->flags)) continue; } else { if (key_permission(make_key_ref(keyring, 0), KEY_NEED_SEARCH) < 0) continue; } /* we've got a match but we might end up racing with * key_cleanup() if the keyring is currently 'dead' * (ie. it has a zero usage count) */ if (!refcount_inc_not_zero(&keyring->usage)) continue; keyring->last_used_at = ktime_get_real_seconds(); goto out; } keyring = ERR_PTR(-ENOKEY); out: read_unlock(&keyring_name_lock); return keyring; } static int keyring_detect_cycle_iterator(const void *object, void *iterator_data) { struct keyring_search_context *ctx = iterator_data; const struct key *key = keyring_ptr_to_key(object); kenter("{%d}", key->serial); /* We might get a keyring with matching index-key that is nonetheless a * different keyring. */ if (key != ctx->match_data.raw_data) return 0; ctx->result = ERR_PTR(-EDEADLK); return 1; } /* * See if a cycle will be created by inserting acyclic tree B in acyclic * tree A at the topmost level (ie: as a direct child of A). * * Since we are adding B to A at the top level, checking for cycles should just * be a matter of seeing if node A is somewhere in tree B. */ static int keyring_detect_cycle(struct key *A, struct key *B) { struct keyring_search_context ctx = { .index_key = A->index_key, .match_data.raw_data = A, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .iterator = keyring_detect_cycle_iterator, .flags = (KEYRING_SEARCH_NO_STATE_CHECK | KEYRING_SEARCH_NO_UPDATE_TIME | KEYRING_SEARCH_NO_CHECK_PERM | KEYRING_SEARCH_DETECT_TOO_DEEP | KEYRING_SEARCH_RECURSE), }; rcu_read_lock(); search_nested_keyrings(B, &ctx); rcu_read_unlock(); return PTR_ERR(ctx.result) == -EAGAIN ? 0 : PTR_ERR(ctx.result); } /* * Lock keyring for link. */ int __key_link_lock(struct key *keyring, const struct keyring_index_key *index_key) __acquires(&keyring->sem) __acquires(&keyring_serialise_link_lock) { if (keyring->type != &key_type_keyring) return -ENOTDIR; down_write(&keyring->sem); /* Serialise link/link calls to prevent parallel calls causing a cycle * when linking two keyring in opposite orders. */ if (index_key->type == &key_type_keyring) mutex_lock(&keyring_serialise_link_lock); return 0; } /* * Lock keyrings for move (link/unlink combination). */ int __key_move_lock(struct key *l_keyring, struct key *u_keyring, const struct keyring_index_key *index_key) __acquires(&l_keyring->sem) __acquires(&u_keyring->sem) __acquires(&keyring_serialise_link_lock) { if (l_keyring->type != &key_type_keyring || u_keyring->type != &key_type_keyring) return -ENOTDIR; /* We have to be very careful here to take the keyring locks in the * right order, lest we open ourselves to deadlocking against another * move operation. 
*/ if (l_keyring < u_keyring) { down_write(&l_keyring->sem); down_write_nested(&u_keyring->sem, 1); } else { down_write(&u_keyring->sem); down_write_nested(&l_keyring->sem, 1); } /* Serialise link/link calls to prevent parallel calls causing a cycle * when linking two keyring in opposite orders. */ if (index_key->type == &key_type_keyring) mutex_lock(&keyring_serialise_link_lock); return 0; } /* * Preallocate memory so that a key can be linked into to a keyring. */ int __key_link_begin(struct key *keyring, const struct keyring_index_key *index_key, struct assoc_array_edit **_edit) { struct assoc_array_edit *edit; int ret; kenter("%d,%s,%s,", keyring->serial, index_key->type->name, index_key->description); BUG_ON(index_key->desc_len == 0); BUG_ON(*_edit != NULL); *_edit = NULL; ret = -EKEYREVOKED; if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) goto error; /* Create an edit script that will insert/replace the key in the * keyring tree. */ edit = assoc_array_insert(&keyring->keys, &keyring_assoc_array_ops, index_key, NULL); if (IS_ERR(edit)) { ret = PTR_ERR(edit); goto error; } /* If we're not replacing a link in-place then we're going to need some * extra quota. */ if (!edit->dead_leaf) { ret = key_payload_reserve(keyring, keyring->datalen + KEYQUOTA_LINK_BYTES); if (ret < 0) goto error_cancel; } *_edit = edit; kleave(" = 0"); return 0; error_cancel: assoc_array_cancel_edit(edit); error: kleave(" = %d", ret); return ret; } /* * Check already instantiated keys aren't going to be a problem. * * The caller must have called __key_link_begin(). Don't need to call this for * keys that were created since __key_link_begin() was called. */ int __key_link_check_live_key(struct key *keyring, struct key *key) { if (key->type == &key_type_keyring) /* check that we aren't going to create a cycle by linking one * keyring to another */ return keyring_detect_cycle(keyring, key); return 0; } /* * Link a key into to a keyring. * * Must be called with __key_link_begin() having being called. Discards any * already extant link to matching key if there is one, so that each keyring * holds at most one link to any given key of a particular type+description * combination. */ void __key_link(struct key *keyring, struct key *key, struct assoc_array_edit **_edit) { __key_get(key); assoc_array_insert_set_object(*_edit, keyring_key_to_ptr(key)); assoc_array_apply_edit(*_edit); *_edit = NULL; notify_key(keyring, NOTIFY_KEY_LINKED, key_serial(key)); } /* * Finish linking a key into to a keyring. * * Must be called with __key_link_begin() having being called. */ void __key_link_end(struct key *keyring, const struct keyring_index_key *index_key, struct assoc_array_edit *edit) __releases(&keyring->sem) __releases(&keyring_serialise_link_lock) { BUG_ON(index_key->type == NULL); kenter("%d,%s,", keyring->serial, index_key->type->name); if (edit) { if (!edit->dead_leaf) { key_payload_reserve(keyring, keyring->datalen - KEYQUOTA_LINK_BYTES); } assoc_array_cancel_edit(edit); } up_write(&keyring->sem); if (index_key->type == &key_type_keyring) mutex_unlock(&keyring_serialise_link_lock); } /* * Check addition of keys to restricted keyrings. */ static int __key_link_check_restriction(struct key *keyring, struct key *key) { if (!keyring->restrict_link || !keyring->restrict_link->check) return 0; return keyring->restrict_link->check(keyring, key->type, &key->payload, keyring->restrict_link->key); } /** * key_link - Link a key to a keyring * @keyring: The keyring to make the link in. * @key: The key to link to. 
* * Make a link in a keyring to a key, such that the keyring holds a reference * on that key and the key can potentially be found by searching that keyring. * * This function will write-lock the keyring's semaphore and will consume some * of the user's key data quota to hold the link. * * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring, * -EKEYREVOKED if the keyring has been revoked, -ENFILE if the keyring is * full, -EDQUOT if there is insufficient key data quota remaining to add * another link or -ENOMEM if there's insufficient memory. * * It is assumed that the caller has checked that it is permitted for a link to * be made (the keyring should have Write permission and the key Link * permission). */ int key_link(struct key *keyring, struct key *key) { struct assoc_array_edit *edit = NULL; int ret; kenter("{%d,%d}", keyring->serial, refcount_read(&keyring->usage)); key_check(keyring); key_check(key); ret = __key_link_lock(keyring, &key->index_key); if (ret < 0) goto error; ret = __key_link_begin(keyring, &key->index_key, &edit); if (ret < 0) goto error_end; kdebug("begun {%d,%d}", keyring->serial, refcount_read(&keyring->usage)); ret = __key_link_check_restriction(keyring, key); if (ret == 0) ret = __key_link_check_live_key(keyring, key); if (ret == 0) __key_link(keyring, key, &edit); error_end: __key_link_end(keyring, &key->index_key, edit); error: kleave(" = %d {%d,%d}", ret, keyring->serial, refcount_read(&keyring->usage)); return ret; } EXPORT_SYMBOL(key_link); /* * Lock a keyring for unlink. */ static int __key_unlink_lock(struct key *keyring) __acquires(&keyring->sem) { if (keyring->type != &key_type_keyring) return -ENOTDIR; down_write(&keyring->sem); return 0; } /* * Begin the process of unlinking a key from a keyring. */ static int __key_unlink_begin(struct key *keyring, struct key *key, struct assoc_array_edit **_edit) { struct assoc_array_edit *edit; BUG_ON(*_edit != NULL); edit = assoc_array_delete(&keyring->keys, &keyring_assoc_array_ops, &key->index_key); if (IS_ERR(edit)) return PTR_ERR(edit); if (!edit) return -ENOENT; *_edit = edit; return 0; } /* * Apply an unlink change. */ static void __key_unlink(struct key *keyring, struct key *key, struct assoc_array_edit **_edit) { assoc_array_apply_edit(*_edit); notify_key(keyring, NOTIFY_KEY_UNLINKED, key_serial(key)); *_edit = NULL; key_payload_reserve(keyring, keyring->datalen - KEYQUOTA_LINK_BYTES); } /* * Finish unlinking a key from to a keyring. */ static void __key_unlink_end(struct key *keyring, struct key *key, struct assoc_array_edit *edit) __releases(&keyring->sem) { if (edit) assoc_array_cancel_edit(edit); up_write(&keyring->sem); } /** * key_unlink - Unlink the first link to a key from a keyring. * @keyring: The keyring to remove the link from. * @key: The key the link is to. * * Remove a link from a keyring to a key. * * This function will write-lock the keyring's semaphore. * * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring, -ENOENT if * the key isn't linked to by the keyring or -ENOMEM if there's insufficient * memory. * * It is assumed that the caller has checked that it is permitted for a link to * be removed (the keyring should have Write permission; no permissions are * required on the key). 
*/ int key_unlink(struct key *keyring, struct key *key) { struct assoc_array_edit *edit = NULL; int ret; key_check(keyring); key_check(key); ret = __key_unlink_lock(keyring); if (ret < 0) return ret; ret = __key_unlink_begin(keyring, key, &edit); if (ret == 0) __key_unlink(keyring, key, &edit); __key_unlink_end(keyring, key, edit); return ret; } EXPORT_SYMBOL(key_unlink); /** * key_move - Move a key from one keyring to another * @key: The key to move * @from_keyring: The keyring to remove the link from. * @to_keyring: The keyring to make the link in. * @flags: Qualifying flags, such as KEYCTL_MOVE_EXCL. * * Make a link in @to_keyring to a key, such that the keyring holds a reference * on that key and the key can potentially be found by searching that keyring * whilst simultaneously removing a link to the key from @from_keyring. * * This function will write-lock both keyring's semaphores and will consume * some of the user's key data quota to hold the link on @to_keyring. * * Returns 0 if successful, -ENOTDIR if either keyring isn't a keyring, * -EKEYREVOKED if either keyring has been revoked, -ENFILE if the second * keyring is full, -EDQUOT if there is insufficient key data quota remaining * to add another link or -ENOMEM if there's insufficient memory. If * KEYCTL_MOVE_EXCL is set, then -EEXIST will be returned if there's already a * matching key in @to_keyring. * * It is assumed that the caller has checked that it is permitted for a link to * be made (the keyring should have Write permission and the key Link * permission). */ int key_move(struct key *key, struct key *from_keyring, struct key *to_keyring, unsigned int flags) { struct assoc_array_edit *from_edit = NULL, *to_edit = NULL; int ret; kenter("%d,%d,%d", key->serial, from_keyring->serial, to_keyring->serial); if (from_keyring == to_keyring) return 0; key_check(key); key_check(from_keyring); key_check(to_keyring); ret = __key_move_lock(from_keyring, to_keyring, &key->index_key); if (ret < 0) goto out; ret = __key_unlink_begin(from_keyring, key, &from_edit); if (ret < 0) goto error; ret = __key_link_begin(to_keyring, &key->index_key, &to_edit); if (ret < 0) goto error; ret = -EEXIST; if (to_edit->dead_leaf && (flags & KEYCTL_MOVE_EXCL)) goto error; ret = __key_link_check_restriction(to_keyring, key); if (ret < 0) goto error; ret = __key_link_check_live_key(to_keyring, key); if (ret < 0) goto error; __key_unlink(from_keyring, key, &from_edit); __key_link(to_keyring, key, &to_edit); error: __key_link_end(to_keyring, &key->index_key, to_edit); __key_unlink_end(from_keyring, key, from_edit); out: kleave(" = %d", ret); return ret; } EXPORT_SYMBOL(key_move); /** * keyring_clear - Clear a keyring * @keyring: The keyring to clear. * * Clear the contents of the specified keyring. * * Returns 0 if successful or -ENOTDIR if the keyring isn't a keyring. */ int keyring_clear(struct key *keyring) { struct assoc_array_edit *edit; int ret; if (keyring->type != &key_type_keyring) return -ENOTDIR; down_write(&keyring->sem); edit = assoc_array_clear(&keyring->keys, &keyring_assoc_array_ops); if (IS_ERR(edit)) { ret = PTR_ERR(edit); } else { if (edit) assoc_array_apply_edit(edit); notify_key(keyring, NOTIFY_KEY_CLEARED, 0); key_payload_reserve(keyring, 0); ret = 0; } up_write(&keyring->sem); return ret; } EXPORT_SYMBOL(keyring_clear); /* * Dispose of the links from a revoked keyring. * * This is called with the key sem write-locked. 
*/ static void keyring_revoke(struct key *keyring) { struct assoc_array_edit *edit; edit = assoc_array_clear(&keyring->keys, &keyring_assoc_array_ops); if (!IS_ERR(edit)) { if (edit) assoc_array_apply_edit(edit); key_payload_reserve(keyring, 0); } } static bool keyring_gc_select_iterator(void *object, void *iterator_data) { struct key *key = keyring_ptr_to_key(object); time64_t *limit = iterator_data; if (key_is_dead(key, *limit)) return false; key_get(key); return true; } static int keyring_gc_check_iterator(const void *object, void *iterator_data) { const struct key *key = keyring_ptr_to_key(object); time64_t *limit = iterator_data; key_check(key); return key_is_dead(key, *limit); } /* * Garbage collect pointers from a keyring. * * Not called with any locks held. The keyring's key struct will not be * deallocated under us as only our caller may deallocate it. */ void keyring_gc(struct key *keyring, time64_t limit) { int result; kenter("%x{%s}", keyring->serial, keyring->description ?: ""); if (keyring->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) goto dont_gc; /* scan the keyring looking for dead keys */ rcu_read_lock(); result = assoc_array_iterate(&keyring->keys, keyring_gc_check_iterator, &limit); rcu_read_unlock(); if (result == true) goto do_gc; dont_gc: kleave(" [no gc]"); return; do_gc: down_write(&keyring->sem); assoc_array_gc(&keyring->keys, &keyring_assoc_array_ops, keyring_gc_select_iterator, &limit); up_write(&keyring->sem); kleave(" [gc]"); } /* * Garbage collect restriction pointers from a keyring. * * Keyring restrictions are associated with a key type, and must be cleaned * up if the key type is unregistered. The restriction is altered to always * reject additional keys so a keyring cannot be opened up by unregistering * a key type. * * Not called with any keyring locks held. The keyring's key struct will not * be deallocated under us as only our caller may deallocate it. * * The caller is required to hold key_types_sem and dead_type->sem. This is * fulfilled by key_gc_keytype() holding the locks on behalf of * key_garbage_collector(), which it invokes on a workqueue. */ void keyring_restriction_gc(struct key *keyring, struct key_type *dead_type) { struct key_restriction *keyres; kenter("%x{%s}", keyring->serial, keyring->description ?: ""); /* * keyring->restrict_link is only assigned at key allocation time * or with the key type locked, so the only values that could be * concurrently assigned to keyring->restrict_link are for key * types other than dead_type. Given this, it's ok to check * the key type before acquiring keyring->sem. */ if (!dead_type || !keyring->restrict_link || keyring->restrict_link->keytype != dead_type) { kleave(" [no restriction gc]"); return; } /* Lock the keyring to ensure that a link is not in progress */ down_write(&keyring->sem); keyres = keyring->restrict_link; keyres->check = restrict_link_reject; key_put(keyres->key); keyres->key = NULL; keyres->keytype = NULL; up_write(&keyring->sem); kleave(" [restriction gc]"); } |
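The search and link helpers in keyring.c above are meant to be combined by in-kernel callers: build a key_ref_t with make_key_ref(), search with keyring_search(), then link the result elsewhere with key_link(). The snippet below is only an illustrative sketch of that flow, not part of keyring.c; the two keyring pointers, the "example" description and the use of key_type_user are assumptions.

/* Illustrative sketch only -- not part of keyring.c. Copies a link to a
 * user-type key called "example" (hypothetical) from one keyring to
 * another using the helpers defined above. */
#include <linux/err.h>
#include <linux/key.h>
#include <keys/user-type.h>

static int example_copy_key_link(struct key *src_keyring, struct key *dst_keyring)
{
	key_ref_t kref;
	int ret;

	/* keyring_search() takes the RCU read lock itself and returns the
	 * found key with its usage count incremented. */
	kref = keyring_search(make_key_ref(src_keyring, true),
			      &key_type_user, "example", true);
	if (IS_ERR(kref))
		return PTR_ERR(kref);	/* -EAGAIN, -ENOKEY, -ENOTDIR, ... */

	/* key_link() performs its own locking, quota and cycle checks. */
	ret = key_link(dst_keyring, key_ref_to_ptr(kref));

	key_ref_put(kref);		/* drop the search's reference */
	return ret;
}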
| 1 1 1 1 20 22 21 23 21 22 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Ethernet-type device handling. * * Version: @(#)eth.c 1.0.7 05/25/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * Florian La Roche, <rzsfl@rz.uni-sb.de> * Alan Cox, <gw4pts@gw4pts.ampr.org> * * Fixes: * Mr Linux : Arp problems * Alan Cox : Generic queue tidyup (very tiny here) * Alan Cox : eth_header ntohs should be htons * Alan Cox : eth_rebuild_header missing an htons and * minor other things. * Tegge : Arp bug fixes. * Florian : Removed many unnecessary functions, code cleanup * and changes for new arp and skbuff. * Alan Cox : Redid header building to reflect new format. * Alan Cox : ARP only when compiled with CONFIG_INET * Greg Page : 802.2 and SNAP stuff. * Alan Cox : MAC layer pointers/new format. 
* Paul Gortmaker : eth_copy_and_sum shouldn't csum padding. * Alan Cox : Protect against forwarding explosions with * older network drivers and IFF_ALLMULTI. * Christer Weinigel : Better rebuild header message. * Andrew Morton : 26Feb01: kill ether_setup() - use netdev_boot_setup(). */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/ip.h> #include <linux/netdevice.h> #include <linux/nvmem-consumer.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/if_ether.h> #include <linux/of_net.h> #include <linux/pci.h> #include <linux/property.h> #include <net/dst.h> #include <net/arp.h> #include <net/sock.h> #include <net/ipv6.h> #include <net/ip.h> #include <net/dsa.h> #include <net/flow_dissector.h> #include <net/gro.h> #include <linux/uaccess.h> #include <net/pkt_sched.h> /** * eth_header - create the Ethernet header * @skb: buffer to alter * @dev: source device * @type: Ethernet type field * @daddr: destination address (NULL leave destination address) * @saddr: source address (NULL use device source address) * @len: packet length (<= skb->len) * * * Set the protocol type. For a packet of type ETH_P_802_3/2 we put the length * in here instead. */ int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { struct ethhdr *eth = skb_push(skb, ETH_HLEN); if (type != ETH_P_802_3 && type != ETH_P_802_2) eth->h_proto = htons(type); else eth->h_proto = htons(len); /* * Set the source hardware address. */ if (!saddr) saddr = dev->dev_addr; memcpy(eth->h_source, saddr, ETH_ALEN); if (daddr) { memcpy(eth->h_dest, daddr, ETH_ALEN); return ETH_HLEN; } /* * Anyway, the loopback-device should never use this function... */ if (dev->flags & (IFF_LOOPBACK | IFF_NOARP)) { eth_zero_addr(eth->h_dest); return ETH_HLEN; } return -ETH_HLEN; } EXPORT_SYMBOL(eth_header); /** * eth_get_headlen - determine the length of header for an ethernet frame * @dev: pointer to network device * @data: pointer to start of frame * @len: total length of frame * * Make a best effort attempt to pull the length for all of the headers for * a given frame in a linear buffer. */ u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len) { const unsigned int flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG; const struct ethhdr *eth = (const struct ethhdr *)data; struct flow_keys_basic keys; /* this should never happen, but better safe than sorry */ if (unlikely(len < sizeof(*eth))) return len; /* parse any remaining L2/L3 headers, check for L4 */ if (!skb_flow_dissect_flow_keys_basic(dev_net(dev), NULL, &keys, data, eth->h_proto, sizeof(*eth), len, flags)) return max_t(u32, keys.control.thoff, sizeof(*eth)); /* parse for any L4 headers */ return min_t(u32, __skb_get_poff(NULL, data, &keys, len), len); } EXPORT_SYMBOL(eth_get_headlen); /** * eth_type_trans - determine the packet's protocol ID. * @skb: received socket data * @dev: receiving network device * * The rule here is that we * assume 802.3 if the type field is short enough to be a length. * This is normal practice and works for any 'now in use' protocol. 
*/ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) { const unsigned short *sap; const struct ethhdr *eth; __be16 res; skb->dev = dev; skb_reset_mac_header(skb); eth = eth_skb_pull_mac(skb); eth_skb_pkt_type(skb, dev); /* * Some variants of DSA tagging don't have an ethertype field * at all, so we check here whether one of those tagging * variants has been configured on the receiving interface, * and if so, set skb->protocol without looking at the packet. */ if (unlikely(netdev_uses_dsa(dev))) return htons(ETH_P_XDSA); if (likely(eth_proto_is_802_3(eth->h_proto))) return eth->h_proto; /* * This is a magic hack to spot IPX packets. Older Novell breaks * the protocol design and runs IPX over 802.3 without an 802.2 LLC * layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This * won't work for fault tolerant netware but does for the rest. * We use skb->dev as temporary storage to not hit * CONFIG_STACKPROTECTOR_STRONG=y costs on some platforms. */ sap = skb_header_pointer(skb, 0, sizeof(*sap), &skb->dev); res = (sap && *sap == 0xFFFF) ? htons(ETH_P_802_3) : htons(ETH_P_802_2); /* restore skb->dev in case it was mangled by skb_header_pointer(). */ skb->dev = dev; return res; } EXPORT_SYMBOL(eth_type_trans); int eth_header_parse(const struct sk_buff *skb, const struct net_device *dev, unsigned char *haddr) { const struct ethhdr *eth = eth_hdr(skb); memcpy(haddr, eth->h_source, ETH_ALEN); return ETH_ALEN; } EXPORT_SYMBOL(eth_header_parse); /** * eth_header_cache - fill cache entry from neighbour * @neigh: source neighbour * @hh: destination cache entry * @type: Ethernet type field * * Create an Ethernet header template from the neighbour. */ int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 type) { struct ethhdr *eth; const struct net_device *dev = neigh->dev; eth = (struct ethhdr *) (((u8 *) hh->hh_data) + (HH_DATA_OFF(sizeof(*eth)))); if (type == htons(ETH_P_802_3)) return -1; eth->h_proto = type; memcpy(eth->h_source, dev->dev_addr, ETH_ALEN); memcpy(eth->h_dest, neigh->ha, ETH_ALEN); /* Pairs with READ_ONCE() in neigh_resolve_output(), * neigh_hh_output() and neigh_update_hhs(). */ smp_store_release(&hh->hh_len, ETH_HLEN); return 0; } EXPORT_SYMBOL(eth_header_cache); /** * eth_header_cache_update - update cache entry * @hh: destination cache entry * @dev: network device * @haddr: new hardware address * * Called by Address Resolution module to notify changes in address. 
*/ void eth_header_cache_update(struct hh_cache *hh, const struct net_device *dev, const unsigned char *haddr) { memcpy(((u8 *) hh->hh_data) + HH_DATA_OFF(sizeof(struct ethhdr)), haddr, ETH_ALEN); } EXPORT_SYMBOL(eth_header_cache_update); /** * eth_header_parse_protocol - extract protocol from L2 header * @skb: packet to extract protocol from */ __be16 eth_header_parse_protocol(const struct sk_buff *skb) { const struct ethhdr *eth = eth_hdr(skb); return eth->h_proto; } EXPORT_SYMBOL(eth_header_parse_protocol); /** * eth_prepare_mac_addr_change - prepare for mac change * @dev: network device * @p: socket address */ int eth_prepare_mac_addr_change(struct net_device *dev, void *p) { struct sockaddr *addr = p; if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev)) return -EBUSY; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; return 0; } EXPORT_SYMBOL(eth_prepare_mac_addr_change); /** * eth_commit_mac_addr_change - commit mac change * @dev: network device * @p: socket address */ void eth_commit_mac_addr_change(struct net_device *dev, void *p) { struct sockaddr *addr = p; eth_hw_addr_set(dev, addr->sa_data); } EXPORT_SYMBOL(eth_commit_mac_addr_change); /** * eth_mac_addr - set new Ethernet hardware address * @dev: network device * @p: socket address * * Change hardware address of device. * * This doesn't change hardware matching, so needs to be overridden * for most real devices. */ int eth_mac_addr(struct net_device *dev, void *p) { int ret; ret = eth_prepare_mac_addr_change(dev, p); if (ret < 0) return ret; eth_commit_mac_addr_change(dev, p); return 0; } EXPORT_SYMBOL(eth_mac_addr); int eth_validate_addr(struct net_device *dev) { if (!is_valid_ether_addr(dev->dev_addr)) return -EADDRNOTAVAIL; return 0; } EXPORT_SYMBOL(eth_validate_addr); const struct header_ops eth_header_ops ____cacheline_aligned = { .create = eth_header, .parse = eth_header_parse, .cache = eth_header_cache, .cache_update = eth_header_cache_update, .parse_protocol = eth_header_parse_protocol, }; /** * ether_setup - setup Ethernet network device * @dev: network device * * Fill in the fields of the device structure with Ethernet-generic values. */ void ether_setup(struct net_device *dev) { dev->header_ops = &eth_header_ops; dev->type = ARPHRD_ETHER; dev->hard_header_len = ETH_HLEN; dev->min_header_len = ETH_HLEN; dev->mtu = ETH_DATA_LEN; dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = ETH_DATA_LEN; dev->addr_len = ETH_ALEN; dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; dev->flags = IFF_BROADCAST|IFF_MULTICAST; dev->priv_flags |= IFF_TX_SKB_SHARING; eth_broadcast_addr(dev->broadcast); } EXPORT_SYMBOL(ether_setup); /** * alloc_etherdev_mqs - Allocates and sets up an Ethernet device * @sizeof_priv: Size of additional driver-private structure to be allocated * for this Ethernet device * @txqs: The number of TX queues this device has. * @rxqs: The number of RX queues this device has. * * Fill in the fields of the device structure with Ethernet-generic * values. Basically does everything except registering the device. * * Constructs a new net device, complete with a private data area of * size (sizeof_priv). A 32-byte (not bit) alignment is enforced for * this private data area.
*/ struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, unsigned int rxqs) { return alloc_netdev_mqs(sizeof_priv, "eth%d", NET_NAME_ENUM, ether_setup, txqs, rxqs); } EXPORT_SYMBOL(alloc_etherdev_mqs); ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len) { return sysfs_emit(buf, "%*phC\n", len, addr); } EXPORT_SYMBOL(sysfs_format_mac); struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb) { const struct packet_offload *ptype; unsigned int hlen, off_eth; struct sk_buff *pp = NULL; struct ethhdr *eh, *eh2; struct sk_buff *p; __be16 type; int flush = 1; off_eth = skb_gro_offset(skb); hlen = off_eth + sizeof(*eh); eh = skb_gro_header(skb, hlen, off_eth); if (unlikely(!eh)) goto out; flush = 0; list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; eh2 = (struct ethhdr *)(p->data + off_eth); if (compare_ether_header(eh, eh2)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } } type = eh->h_proto; ptype = gro_find_receive_by_type(type); if (ptype == NULL) { flush = 1; goto out; } skb_gro_pull(skb, sizeof(*eh)); skb_gro_postpull_rcsum(skb, eh, sizeof(*eh)); pp = indirect_call_gro_receive_inet(ptype->callbacks.gro_receive, ipv6_gro_receive, inet_gro_receive, head, skb); out: skb_gro_flush_final(skb, pp, flush); return pp; } EXPORT_SYMBOL(eth_gro_receive); int eth_gro_complete(struct sk_buff *skb, int nhoff) { struct ethhdr *eh = (struct ethhdr *)(skb->data + nhoff); __be16 type = eh->h_proto; struct packet_offload *ptype; int err = -ENOSYS; if (skb->encapsulation) skb_set_inner_mac_header(skb, nhoff); ptype = gro_find_complete_by_type(type); if (ptype != NULL) err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete, ipv6_gro_complete, inet_gro_complete, skb, nhoff + sizeof(*eh)); return err; } EXPORT_SYMBOL(eth_gro_complete); static struct packet_offload eth_packet_offload __read_mostly = { .type = cpu_to_be16(ETH_P_TEB), .priority = 10, .callbacks = { .gro_receive = eth_gro_receive, .gro_complete = eth_gro_complete, }, }; static int __init eth_offload_init(void) { dev_add_offload(&eth_packet_offload); return 0; } fs_initcall(eth_offload_init); unsigned char * __weak arch_get_platform_mac_address(void) { return NULL; } int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr) { unsigned char *addr; int ret; ret = of_get_mac_address(dev->of_node, mac_addr); if (!ret) return 0; addr = arch_get_platform_mac_address(); if (!addr) return -ENODEV; ether_addr_copy(mac_addr, addr); return 0; } EXPORT_SYMBOL(eth_platform_get_mac_address); /** * platform_get_ethdev_address - Set netdev's MAC address from a given device * @dev: Pointer to the device * @netdev: Pointer to netdev to write the address to * * Wrapper around eth_platform_get_mac_address() which writes the address * directly to netdev->dev_addr. */ int platform_get_ethdev_address(struct device *dev, struct net_device *netdev) { u8 addr[ETH_ALEN] __aligned(2); int ret; ret = eth_platform_get_mac_address(dev, addr); if (!ret) eth_hw_addr_set(netdev, addr); return ret; } EXPORT_SYMBOL(platform_get_ethdev_address); /** * nvmem_get_mac_address - Obtain the MAC address from an nvmem cell named * 'mac-address' associated with given device. * * @dev: Device with which the mac-address cell is associated. * @addrbuf: Buffer to which the MAC address will be copied on success. * * Returns 0 on success or a negative error number on failure.
*/ int nvmem_get_mac_address(struct device *dev, void *addrbuf) { struct nvmem_cell *cell; const void *mac; size_t len; cell = nvmem_cell_get(dev, "mac-address"); if (IS_ERR(cell)) return PTR_ERR(cell); mac = nvmem_cell_read(cell, &len); nvmem_cell_put(cell); if (IS_ERR(mac)) return PTR_ERR(mac); if (len != ETH_ALEN || !is_valid_ether_addr(mac)) { kfree(mac); return -EINVAL; } ether_addr_copy(addrbuf, mac); kfree(mac); return 0; } static int fwnode_get_mac_addr(struct fwnode_handle *fwnode, const char *name, char *addr) { int ret; ret = fwnode_property_read_u8_array(fwnode, name, addr, ETH_ALEN); if (ret) return ret; if (!is_valid_ether_addr(addr)) return -EINVAL; return 0; } /** * fwnode_get_mac_address - Get the MAC from the firmware node * @fwnode: Pointer to the firmware node * @addr: Address of buffer to store the MAC in * * Search the firmware node for the best MAC address to use. 'mac-address' is * checked first, because that is supposed to contain to "most recent" MAC * address. If that isn't set, then 'local-mac-address' is checked next, * because that is the default address. If that isn't set, then the obsolete * 'address' is checked, just in case we're using an old device tree. * * Note that the 'address' property is supposed to contain a virtual address of * the register set, but some DTS files have redefined that property to be the * MAC address. * * All-zero MAC addresses are rejected, because those could be properties that * exist in the firmware tables, but were not updated by the firmware. For * example, the DTS could define 'mac-address' and 'local-mac-address', with * zero MAC addresses. Some older U-Boots only initialized 'local-mac-address'. * In this case, the real MAC is in 'local-mac-address', and 'mac-address' * exists but is all zeros. */ int fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr) { if (!fwnode_get_mac_addr(fwnode, "mac-address", addr) || !fwnode_get_mac_addr(fwnode, "local-mac-address", addr) || !fwnode_get_mac_addr(fwnode, "address", addr)) return 0; return -ENOENT; } EXPORT_SYMBOL(fwnode_get_mac_address); /** * device_get_mac_address - Get the MAC for a given device * @dev: Pointer to the device * @addr: Address of buffer to store the MAC in */ int device_get_mac_address(struct device *dev, char *addr) { if (!fwnode_get_mac_address(dev_fwnode(dev), addr)) return 0; return nvmem_get_mac_address(dev, addr); } EXPORT_SYMBOL(device_get_mac_address); /** * device_get_ethdev_address - Set netdev's MAC address from a given device * @dev: Pointer to the device * @netdev: Pointer to netdev to write the address to * * Wrapper around device_get_mac_address() which writes the address * directly to netdev->dev_addr. */ int device_get_ethdev_address(struct device *dev, struct net_device *netdev) { u8 addr[ETH_ALEN]; int ret; ret = device_get_mac_address(dev, addr); if (!ret) eth_hw_addr_set(netdev, addr); return ret; } EXPORT_SYMBOL(device_get_ethdev_address); |
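ether_setup() and the address helpers above are typically consumed from a driver's probe routine: alloc_etherdev() applies ether_setup() to the new net_device, and device_get_ethdev_address() tries the firmware node and nvmem before the driver falls back to a random MAC. The probe below is a hedged sketch of that pattern, not code from eth.c; example_priv, example_netdev_ops and the platform_device binding are assumptions.

/* Illustrative sketch only: a hypothetical platform driver probe built on
 * the helpers above. The private struct and (empty) netdev_ops are
 * placeholders for real driver code. */
#include <linux/etherdevice.h>
#include <linux/netdevice.h>
#include <linux/platform_device.h>

struct example_priv { int placeholder; };		/* hypothetical */
static const struct net_device_ops example_netdev_ops;	/* hypothetical, left empty */

static int example_eth_probe(struct platform_device *pdev)
{
	struct net_device *ndev;
	int ret;

	/* alloc_etherdev() runs ether_setup() on the new device. */
	ndev = alloc_etherdev(sizeof(struct example_priv));
	if (!ndev)
		return -ENOMEM;
	SET_NETDEV_DEV(ndev, &pdev->dev);

	/* Prefer a firmware/nvmem-provided address, else pick a random one. */
	ret = device_get_ethdev_address(&pdev->dev, ndev);
	if (ret)
		eth_hw_addr_random(ndev);

	ndev->netdev_ops = &example_netdev_ops;

	ret = register_netdev(ndev);
	if (ret)
		free_netdev(ndev);
	return ret;
}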
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2008 IBM Corporation * * Authors: * Mimi Zohar <zohar@us.ibm.com> * * File: ima_iint.c * - implements the IMA hook: ima_inode_free * - cache integrity information in the inode security blob */ #include <linux/slab.h> #include "ima.h" static struct kmem_cache *ima_iint_cache __ro_after_init; /** * ima_iint_find - Return the iint associated with an inode * @inode: Pointer to the inode * * Return the IMA integrity information (iint) associated with an inode, if the * inode was processed by IMA. * * Return: Found iint or NULL. */ struct ima_iint_cache *ima_iint_find(struct inode *inode) { if (!IS_IMA(inode)) return NULL; return ima_inode_get_iint(inode); } #define IMA_MAX_NESTING (FILESYSTEM_MAX_STACK_DEPTH + 1) /* * It is not clear that IMA should be nested at all, but as long as it measures * files both on overlayfs and on underlying fs, we need to annotate the iint * mutex to avoid lockdep false positives related to IMA + overlayfs. * See ovl_lockdep_annotate_inode_mutex_key() for more details. */ static inline void ima_iint_lockdep_annotate(struct ima_iint_cache *iint, struct inode *inode) { #ifdef CONFIG_LOCKDEP static struct lock_class_key ima_iint_mutex_key[IMA_MAX_NESTING]; int depth = inode->i_sb->s_stack_depth; if (WARN_ON_ONCE(depth < 0 || depth >= IMA_MAX_NESTING)) depth = 0; lockdep_set_class(&iint->mutex, &ima_iint_mutex_key[depth]); #endif } static void ima_iint_init_always(struct ima_iint_cache *iint, struct inode *inode) { iint->ima_hash = NULL; iint->real_inode.version = 0; iint->flags = 0UL; iint->atomic_flags = 0UL; iint->ima_file_status = INTEGRITY_UNKNOWN; iint->ima_mmap_status = INTEGRITY_UNKNOWN; iint->ima_bprm_status = INTEGRITY_UNKNOWN; iint->ima_read_status = INTEGRITY_UNKNOWN; iint->ima_creds_status = INTEGRITY_UNKNOWN; iint->measured_pcrs = 0; mutex_init(&iint->mutex); ima_iint_lockdep_annotate(iint, inode); } static void ima_iint_free(struct ima_iint_cache *iint) { kfree(iint->ima_hash); mutex_destroy(&iint->mutex); kmem_cache_free(ima_iint_cache, iint); } /** * ima_inode_get - Find or allocate an iint associated with an inode * @inode: Pointer to the inode * * Find an iint associated with an inode, and allocate a new one if not found. * Caller must lock i_mutex. * * Return: An iint on success, NULL on error. */ struct ima_iint_cache *ima_inode_get(struct inode *inode) { struct ima_iint_cache *iint; iint = ima_iint_find(inode); if (iint) return iint; iint = kmem_cache_alloc(ima_iint_cache, GFP_NOFS); if (!iint) return NULL; ima_iint_init_always(iint, inode); inode->i_flags |= S_IMA; ima_inode_set_iint(inode, iint); return iint; } /** * ima_inode_free_rcu - Called to free an inode via an RCU callback * @inode_security: The inode->i_security pointer * * Free the IMA data associated with an inode.
*/ void ima_inode_free_rcu(void *inode_security) { struct ima_iint_cache **iint_p = inode_security + ima_blob_sizes.lbs_inode; /* *iint_p should be NULL if !IS_IMA(inode) */ if (*iint_p) ima_iint_free(*iint_p); } static void ima_iint_init_once(void *foo) { struct ima_iint_cache *iint = (struct ima_iint_cache *)foo; memset(iint, 0, sizeof(*iint)); } void __init ima_iintcache_init(void) { ima_iint_cache = kmem_cache_create("ima_iint_cache", sizeof(struct ima_iint_cache), 0, SLAB_PANIC, ima_iint_init_once); } |
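The allocation and locking rules spelled out above (ima_inode_get() wants the inode lock held; each iint mutex gets a lockdep class keyed by the filesystem stacking depth) shape how the IMA hooks use this cache. The fragment below is only an illustrative sketch of that calling pattern under the same includes as ima_iint.c; the function name is hypothetical and it is not one of the real hooks.

/* Illustrative sketch only: how a measurement path might obtain the iint
 * and serialise on its mutex, following the comments above. */
static int example_measure(struct inode *inode)
{
	struct ima_iint_cache *iint;

	inode_lock(inode);		/* ima_inode_get() requires the inode lock */
	iint = ima_inode_get(inode);
	inode_unlock(inode);
	if (!iint)
		return -ENOMEM;

	mutex_lock(&iint->mutex);	/* lockdep class set per stacking depth */
	/* ... collect the measurement and update iint->flags here ... */
	mutex_unlock(&iint->mutex);
	return 0;
}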
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the IP protocol. * * Version: @(#)ip.h 1.0.2 04/28/93 * * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_IP_H #define _LINUX_IP_H #include <linux/skbuff.h> #include <uapi/linux/ip.h> static inline struct iphdr *ip_hdr(const struct sk_buff *skb) { return (struct iphdr *)skb_network_header(skb); } static inline struct iphdr *inner_ip_hdr(const struct sk_buff *skb) { return (struct iphdr *)skb_inner_network_header(skb); } static inline struct iphdr *ipip_hdr(const struct sk_buff *skb) { return (struct iphdr *)skb_transport_header(skb); } static inline unsigned int ip_transport_len(const struct sk_buff *skb) { return ntohs(ip_hdr(skb)->tot_len) - skb_network_header_len(skb); } static inline unsigned int iph_totlen(const struct sk_buff *skb, const struct iphdr *iph) { u32 len = ntohs(iph->tot_len); return (len || !skb_is_gso(skb) || !skb_is_gso_tcp(skb)) ? len : skb->len - skb_network_offset(skb); } static inline unsigned int skb_ip_totlen(const struct sk_buff *skb) { return iph_totlen(skb, ip_hdr(skb)); } /* IPv4 datagram length is stored into 16bit field (tot_len) */ #define IP_MAX_MTU 0xFFFFU static inline void iph_set_totlen(struct iphdr *iph, unsigned int len) { iph->tot_len = len <= IP_MAX_MTU ? htons(len) : 0; } #endif /* _LINUX_IP_H */
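The accessors above assume the skb's network header offset has already been set; skb_ip_totlen() in particular hides the tot_len == 0 convention used by large TCP GSO packets. What follows is a hedged sketch of a length sanity check built on these helpers; the helper name is an assumption, not part of ip.h.

/* Illustrative sketch only: validate an IPv4 header's advertised length
 * against the skb, using the accessors defined above. */
#include <linux/ip.h>
#include <linux/skbuff.h>

static inline bool example_ipv4_len_ok(const struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);
	unsigned int totlen = skb_ip_totlen(skb);	/* tot_len == 0 handled for GSO */

	return iph->ihl >= 5 &&
	       totlen >= iph->ihl * 4 &&
	       totlen <= skb->len - skb_network_offset(skb);
}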
2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 
3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 
4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 1993 Linus Torvalds
 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
 * Numa awareness, Christoph Lameter, SGI, June 2005
 * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
 */

#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/set_memory.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/rbtree.h>
#include <linux/xarray.h>
#include <linux/io.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/memcontrol.h>
#include <linux/llist.h>
#include <linux/uio.h>
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
#include <linux/overflow.h>
#include <linux/pgtable.h>
#include <linux/hugetlb.h>
#include <linux/sched/mm.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
#include <linux/page_owner.h>

#define CREATE_TRACE_POINTS
#include <trace/events/vmalloc.h>

#include "internal.h"
#include "pgalloc-track.h"

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
static unsigned int __ro_after_init
ioremap_max_page_shift = BITS_PER_LONG - 1; static int __init set_nohugeiomap(char *str) { ioremap_max_page_shift = PAGE_SHIFT; return 0; } early_param("nohugeiomap", set_nohugeiomap); #else /* CONFIG_HAVE_ARCH_HUGE_VMAP */ static const unsigned int ioremap_max_page_shift = PAGE_SHIFT; #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC static bool __ro_after_init vmap_allow_huge = true; static int __init set_nohugevmalloc(char *str) { vmap_allow_huge = false; return 0; } early_param("nohugevmalloc", set_nohugevmalloc); #else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ static const bool vmap_allow_huge = false; #endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ bool is_vmalloc_addr(const void *x) { unsigned long addr = (unsigned long)kasan_reset_tag(x); return addr >= VMALLOC_START && addr < VMALLOC_END; } EXPORT_SYMBOL(is_vmalloc_addr); struct vfree_deferred { struct llist_head list; struct work_struct wq; }; static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred); /*** Page table manipulation functions ***/ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift, pgtbl_mod_mask *mask) { pte_t *pte; u64 pfn; struct page *page; unsigned long size = PAGE_SIZE; if (WARN_ON_ONCE(!PAGE_ALIGNED(end - addr))) return -EINVAL; pfn = phys_addr >> PAGE_SHIFT; pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; lazy_mmu_mode_enable(); do { if (unlikely(!pte_none(ptep_get(pte)))) { if (pfn_valid(pfn)) { page = pfn_to_page(pfn); dump_page(page, "remapping already mapped page"); } BUG(); } #ifdef CONFIG_HUGETLB_PAGE size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift); if (size != PAGE_SIZE) { pte_t entry = pfn_pte(pfn, prot); entry = arch_make_huge_pte(entry, ilog2(size), 0); set_huge_pte_at(&init_mm, addr, pte, entry, size); pfn += PFN_DOWN(size); continue; } #endif set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); pfn++; } while (pte += PFN_DOWN(size), addr += size, addr != end); lazy_mmu_mode_disable(); *mask |= PGTBL_PTE_MODIFIED; return 0; } static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift) { if (max_page_shift < PMD_SHIFT) return 0; if (!arch_vmap_pmd_supported(prot)) return 0; if ((end - addr) != PMD_SIZE) return 0; if (!IS_ALIGNED(addr, PMD_SIZE)) return 0; if (!IS_ALIGNED(phys_addr, PMD_SIZE)) return 0; if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr)) return 0; return pmd_set_huge(pmd, phys_addr, prot); } static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift, pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; int err = 0; pmd = pmd_alloc_track(&init_mm, pud, addr, mask); if (!pmd) return -ENOMEM; do { next = pmd_addr_end(addr, end); if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot, max_page_shift)) { *mask |= PGTBL_PMD_MODIFIED; continue; } err = vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask); if (err) break; } while (pmd++, phys_addr += (next - addr), addr = next, addr != end); return err; } static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift) { if (max_page_shift < PUD_SHIFT) return 0; if (!arch_vmap_pud_supported(prot)) return 0; if ((end - addr) != PUD_SIZE) return 0; if (!IS_ALIGNED(addr, PUD_SIZE)) return 0; if (!IS_ALIGNED(phys_addr, 
PUD_SIZE)) return 0; if (pud_present(*pud) && !pud_free_pmd_page(pud, addr)) return 0; return pud_set_huge(pud, phys_addr, prot); } static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift, pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; int err = 0; pud = pud_alloc_track(&init_mm, p4d, addr, mask); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot, max_page_shift)) { *mask |= PGTBL_PUD_MODIFIED; continue; } err = vmap_pmd_range(pud, addr, next, phys_addr, prot, max_page_shift, mask); if (err) break; } while (pud++, phys_addr += (next - addr), addr = next, addr != end); return err; } static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift) { if (max_page_shift < P4D_SHIFT) return 0; if (!arch_vmap_p4d_supported(prot)) return 0; if ((end - addr) != P4D_SIZE) return 0; if (!IS_ALIGNED(addr, P4D_SIZE)) return 0; if (!IS_ALIGNED(phys_addr, P4D_SIZE)) return 0; if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr)) return 0; return p4d_set_huge(p4d, phys_addr, prot); } static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift, pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; int err = 0; p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); if (!p4d) return -ENOMEM; do { next = p4d_addr_end(addr, end); if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot, max_page_shift)) { *mask |= PGTBL_P4D_MODIFIED; continue; } err = vmap_pud_range(p4d, addr, next, phys_addr, prot, max_page_shift, mask); if (err) break; } while (p4d++, phys_addr += (next - addr), addr = next, addr != end); return err; } static int vmap_range_noflush(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift) { pgd_t *pgd; unsigned long start; unsigned long next; int err; pgtbl_mod_mask mask = 0; /* * Might allocate pagetables (for most archs a more precise annotation * would be might_alloc(GFP_PGTABLE_KERNEL)). Also might shootdown TLB * (requires IRQs enabled on x86). 
*/ might_sleep(); BUG_ON(addr >= end); start = addr; pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, max_page_shift, &mask); if (err) break; } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); if (mask & ARCH_PAGE_TABLE_SYNC_MASK) arch_sync_kernel_mappings(start, end); return err; } int vmap_page_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { int err; err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot), ioremap_max_page_shift); flush_cache_vmap(addr, end); if (!err) err = kmsan_ioremap_page_range(addr, end, phys_addr, prot, ioremap_max_page_shift); return err; } int ioremap_page_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { struct vm_struct *area; area = find_vm_area((void *)addr); if (!area || !(area->flags & VM_IOREMAP)) { WARN_ONCE(1, "vm_area at addr %lx is not marked as VM_IOREMAP\n", addr); return -EINVAL; } if (addr != (unsigned long)area->addr || (void *)end != area->addr + get_vm_area_size(area)) { WARN_ONCE(1, "ioremap request [%lx,%lx) doesn't match vm_area [%lx, %lx)\n", addr, end, (long)area->addr, (long)area->addr + get_vm_area_size(area)); return -ERANGE; } return vmap_page_range(addr, end, phys_addr, prot); } static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) { pte_t *pte; pte_t ptent; unsigned long size = PAGE_SIZE; pte = pte_offset_kernel(pmd, addr); lazy_mmu_mode_enable(); do { #ifdef CONFIG_HUGETLB_PAGE size = arch_vmap_pte_range_unmap_size(addr, pte); if (size != PAGE_SIZE) { if (WARN_ON(!IS_ALIGNED(addr, size))) { addr = ALIGN_DOWN(addr, size); pte = PTR_ALIGN_DOWN(pte, sizeof(*pte) * (size >> PAGE_SHIFT)); } ptent = huge_ptep_get_and_clear(&init_mm, addr, pte, size); if (WARN_ON(end - addr < size)) size = end - addr; } else #endif ptent = ptep_get_and_clear(&init_mm, addr, pte); WARN_ON(!pte_none(ptent) && !pte_present(ptent)); } while (pte += (size >> PAGE_SHIFT), addr += size, addr != end); lazy_mmu_mode_disable(); *mask |= PGTBL_PTE_MODIFIED; } static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; int cleared; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); cleared = pmd_clear_huge(pmd); if (cleared || pmd_bad(*pmd)) *mask |= PGTBL_PMD_MODIFIED; if (cleared) { WARN_ON(next - addr < PMD_SIZE); continue; } if (pmd_none_or_clear_bad(pmd)) continue; vunmap_pte_range(pmd, addr, next, mask); cond_resched(); } while (pmd++, addr = next, addr != end); } static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; int cleared; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); cleared = pud_clear_huge(pud); if (cleared || pud_bad(*pud)) *mask |= PGTBL_PUD_MODIFIED; if (cleared) { WARN_ON(next - addr < PUD_SIZE); continue; } if (pud_none_or_clear_bad(pud)) continue; vunmap_pmd_range(pud, addr, next, mask); } while (pud++, addr = next, addr != end); } static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); p4d_clear_huge(p4d); if (p4d_bad(*p4d)) *mask |= PGTBL_P4D_MODIFIED; if (p4d_none_or_clear_bad(p4d)) continue; vunmap_pud_range(p4d, addr, next, mask); } while (p4d++, addr = next, addr != end); } /* * 
vunmap_range_noflush is similar to vunmap_range, but does not * flush caches or TLBs. * * The caller is responsible for calling flush_cache_vmap() before calling * this function, and flush_tlb_kernel_range after it has returned * successfully (and before the addresses are expected to cause a page fault * or be re-mapped for something else, if TLB flushes are being delayed or * coalesced). * * This is an internal function only. Do not use outside mm/. */ void __vunmap_range_noflush(unsigned long start, unsigned long end) { unsigned long next; pgd_t *pgd; unsigned long addr = start; pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); if (pgd_bad(*pgd)) mask |= PGTBL_PGD_MODIFIED; if (pgd_none_or_clear_bad(pgd)) continue; vunmap_p4d_range(pgd, addr, next, &mask); } while (pgd++, addr = next, addr != end); if (mask & ARCH_PAGE_TABLE_SYNC_MASK) arch_sync_kernel_mappings(start, end); } void vunmap_range_noflush(unsigned long start, unsigned long end) { kmsan_vunmap_range_noflush(start, end); __vunmap_range_noflush(start, end); } /** * vunmap_range - unmap kernel virtual addresses * @addr: start of the VM area to unmap * @end: end of the VM area to unmap (non-inclusive) * * Clears any present PTEs in the virtual address range, flushes TLBs and * caches. Any subsequent access to the address before it has been re-mapped * is a kernel bug. */ void vunmap_range(unsigned long addr, unsigned long end) { flush_cache_vunmap(addr, end); vunmap_range_noflush(addr, end); flush_tlb_kernel_range(addr, end); } static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { int err = 0; pte_t *pte; /* * nr is a running index into the array which helps higher level * callers keep track of where we're up to. 
*/ pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; lazy_mmu_mode_enable(); do { struct page *page = pages[*nr]; if (WARN_ON(!pte_none(ptep_get(pte)))) { err = -EBUSY; break; } if (WARN_ON(!page)) { err = -ENOMEM; break; } if (WARN_ON(!pfn_valid(page_to_pfn(page)))) { err = -EINVAL; break; } set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); lazy_mmu_mode_disable(); *mask |= PGTBL_PTE_MODIFIED; return err; } static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; pmd = pmd_alloc_track(&init_mm, pud, addr, mask); if (!pmd) return -ENOMEM; do { next = pmd_addr_end(addr, end); if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pmd++, addr = next, addr != end); return 0; } static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; pud = pud_alloc_track(&init_mm, p4d, addr, mask); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pud++, addr = next, addr != end); return 0; } static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); if (!p4d) return -ENOMEM; do { next = p4d_addr_end(addr, end); if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (p4d++, addr = next, addr != end); return 0; } static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages) { unsigned long start = addr; pgd_t *pgd; unsigned long next; int err = 0; int nr = 0; pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); if (pgd_bad(*pgd)) mask |= PGTBL_PGD_MODIFIED; err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); if (err) break; } while (pgd++, addr = next, addr != end); if (mask & ARCH_PAGE_TABLE_SYNC_MASK) arch_sync_kernel_mappings(start, end); return err; } /* * vmap_pages_range_noflush is similar to vmap_pages_range, but does not * flush caches. * * The caller is responsible for calling flush_cache_vmap() after this * function returns successfully and before the addresses are accessed. * * This is an internal function only. Do not use outside mm/. 
*/ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { unsigned int i, nr = (end - addr) >> PAGE_SHIFT; WARN_ON(page_shift < PAGE_SHIFT); if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) || page_shift == PAGE_SHIFT) return vmap_small_pages_range_noflush(addr, end, prot, pages); for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) { int err; err = vmap_range_noflush(addr, addr + (1UL << page_shift), page_to_phys(pages[i]), prot, page_shift); if (err) return err; addr += 1UL << page_shift; } return 0; } int vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift, gfp_t gfp_mask) { int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages, page_shift, gfp_mask); if (ret) return ret; return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift); } static int __vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift, gfp_t gfp_mask) { int err; err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift, gfp_mask); flush_cache_vmap(addr, end); return err; } /** * vmap_pages_range - map pages to a kernel virtual address * @addr: start of the VM area to map * @end: end of the VM area to map (non-inclusive) * @prot: page protection flags to use * @pages: pages to map (always PAGE_SIZE pages) * @page_shift: maximum shift that the pages may be mapped with, @pages must * be aligned and contiguous up to at least this shift. * * RETURNS: * 0 on success, -errno on failure. */ int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { return __vmap_pages_range(addr, end, prot, pages, page_shift, GFP_KERNEL); } static int check_sparse_vm_area(struct vm_struct *area, unsigned long start, unsigned long end) { might_sleep(); if (WARN_ON_ONCE(area->flags & VM_FLUSH_RESET_PERMS)) return -EINVAL; if (WARN_ON_ONCE(area->flags & VM_NO_GUARD)) return -EINVAL; if (WARN_ON_ONCE(!(area->flags & VM_SPARSE))) return -EINVAL; if ((end - start) >> PAGE_SHIFT > totalram_pages()) return -E2BIG; if (start < (unsigned long)area->addr || (void *)end > area->addr + get_vm_area_size(area)) return -ERANGE; return 0; } /** * vm_area_map_pages - map pages inside given sparse vm_area * @area: vm_area * @start: start address inside vm_area * @end: end address inside vm_area * @pages: pages to map (always PAGE_SIZE pages) */ int vm_area_map_pages(struct vm_struct *area, unsigned long start, unsigned long end, struct page **pages) { int err; err = check_sparse_vm_area(area, start, end); if (err) return err; return vmap_pages_range(start, end, PAGE_KERNEL, pages, PAGE_SHIFT); } /** * vm_area_unmap_pages - unmap pages inside given sparse vm_area * @area: vm_area * @start: start address inside vm_area * @end: end address inside vm_area */ void vm_area_unmap_pages(struct vm_struct *area, unsigned long start, unsigned long end) { if (check_sparse_vm_area(area, start, end)) return; vunmap_range(start, end); } int is_vmalloc_or_module_addr(const void *x) { /* * ARM, x86-64 and sparc64 put modules in a special place, * and fall back on vmalloc() if that fails. Others * just put it in the vmalloc space. 
*/ #if defined(CONFIG_EXECMEM) && defined(MODULES_VADDR) unsigned long addr = (unsigned long)kasan_reset_tag(x); if (addr >= MODULES_VADDR && addr < MODULES_END) return 1; #endif return is_vmalloc_addr(x); } EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr); /* * Walk a vmap address to the struct page it maps. Huge vmap mappings will * return the tail page that corresponds to the base page address, which * matches small vmap mappings. */ struct page *vmalloc_to_page(const void *vmalloc_addr) { unsigned long addr = (unsigned long) vmalloc_addr; struct page *page = NULL; pgd_t *pgd = pgd_offset_k(addr); p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *ptep, pte; /* * XXX we might need to change this if we add VIRTUAL_BUG_ON for * architectures that do not vmalloc module space */ VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr)); if (pgd_none(*pgd)) return NULL; if (WARN_ON_ONCE(pgd_leaf(*pgd))) return NULL; /* XXX: no allowance for huge pgd */ if (WARN_ON_ONCE(pgd_bad(*pgd))) return NULL; p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) return NULL; if (p4d_leaf(*p4d)) return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT); if (WARN_ON_ONCE(p4d_bad(*p4d))) return NULL; pud = pud_offset(p4d, addr); if (pud_none(*pud)) return NULL; if (pud_leaf(*pud)) return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); if (WARN_ON_ONCE(pud_bad(*pud))) return NULL; pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) return NULL; if (pmd_leaf(*pmd)) return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); if (WARN_ON_ONCE(pmd_bad(*pmd))) return NULL; ptep = pte_offset_kernel(pmd, addr); pte = ptep_get(ptep); if (pte_present(pte)) page = pte_page(pte); return page; } EXPORT_SYMBOL(vmalloc_to_page); /* * Map a vmalloc()-space virtual address to the physical page frame number. */ unsigned long vmalloc_to_pfn(const void *vmalloc_addr) { return page_to_pfn(vmalloc_to_page(vmalloc_addr)); } EXPORT_SYMBOL(vmalloc_to_pfn); /*** Global kva allocator ***/ #define DEBUG_AUGMENT_PROPAGATE_CHECK 0 #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 static DEFINE_SPINLOCK(free_vmap_area_lock); static bool vmap_initialized __read_mostly; /* * This kmem_cache is used for vmap_area objects. Instead of * allocating from slab we reuse an object from this cache to * make things faster. Especially in "no edge" splitting of * free block. */ static struct kmem_cache *vmap_area_cachep; /* * This linked list is used in pair with free_vmap_area_root. * It gives O(1) access to prev/next to perform fast coalescing. */ static LIST_HEAD(free_vmap_area_list); /* * This augment red-black tree represents the free vmap space. * All vmap_area objects in this tree are sorted by va->va_start * address. It is used for allocation and merging when a vmap * object is released. * * Each vmap_area node contains a maximum available free block * of its sub-tree, right or left. Therefore it is possible to * find a lowest match of free area. */ static struct rb_root free_vmap_area_root = RB_ROOT; /* * Preload a CPU with one object for "no edge" split case. The * aim is to get rid of allocations from the atomic context, thus * to use more permissive allocation masks. */ static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node); /* * This structure defines a single, solid model where a list and * rb-tree are part of one entity protected by the lock. Nodes are * sorted in ascending order, thus for O(1) access to left/right * neighbors a list is used as well as for sequential traversal. 
*/ struct rb_list { struct rb_root root; struct list_head head; spinlock_t lock; }; /* * A fast size storage contains VAs up to 1M size. A pool consists * of linked between each other ready to go VAs of certain sizes. * An index in the pool-array corresponds to number of pages + 1. */ #define MAX_VA_SIZE_PAGES 256 struct vmap_pool { struct list_head head; unsigned long len; }; /* * An effective vmap-node logic. Users make use of nodes instead * of a global heap. It allows to balance an access and mitigate * contention. */ static struct vmap_node { /* Simple size segregated storage. */ struct vmap_pool pool[MAX_VA_SIZE_PAGES]; spinlock_t pool_lock; bool skip_populate; /* Bookkeeping data of this node. */ struct rb_list busy; struct rb_list lazy; /* * Ready-to-free areas. */ struct list_head purge_list; struct work_struct purge_work; unsigned long nr_purged; } single; /* * Initial setup consists of one single node, i.e. a balancing * is fully disabled. Later on, after vmap is initialized these * parameters are updated based on a system capacity. */ static struct vmap_node *vmap_nodes = &single; static __read_mostly unsigned int nr_vmap_nodes = 1; static __read_mostly unsigned int vmap_zone_size = 1; /* A simple iterator over all vmap-nodes. */ #define for_each_vmap_node(vn) \ for ((vn) = &vmap_nodes[0]; \ (vn) < &vmap_nodes[nr_vmap_nodes]; (vn)++) static inline unsigned int addr_to_node_id(unsigned long addr) { return (addr / vmap_zone_size) % nr_vmap_nodes; } static inline struct vmap_node * addr_to_node(unsigned long addr) { return &vmap_nodes[addr_to_node_id(addr)]; } static inline struct vmap_node * id_to_node(unsigned int id) { return &vmap_nodes[id % nr_vmap_nodes]; } static inline unsigned int node_to_id(struct vmap_node *node) { /* Pointer arithmetic. */ unsigned int id = node - vmap_nodes; if (likely(id < nr_vmap_nodes)) return id; WARN_ONCE(1, "An address 0x%p is out-of-bounds.\n", node); return 0; } /* * We use the value 0 to represent "no node", that is why * an encoded value will be the node-id incremented by 1. * It is always greater then 0. A valid node_id which can * be encoded is [0:nr_vmap_nodes - 1]. If a passed node_id * is not valid 0 is returned. */ static unsigned int encode_vn_id(unsigned int node_id) { /* Can store U8_MAX [0:254] nodes. */ if (node_id < nr_vmap_nodes) return (node_id + 1) << BITS_PER_BYTE; /* Warn and no node encoded. */ WARN_ONCE(1, "Encode wrong node id (%u)\n", node_id); return 0; } /* * Returns an encoded node-id, the valid range is within * [0:nr_vmap_nodes-1] values. Otherwise nr_vmap_nodes is * returned if extracted data is wrong. */ static unsigned int decode_vn_id(unsigned int val) { unsigned int node_id = (val >> BITS_PER_BYTE) - 1; /* Can store U8_MAX [0:254] nodes. */ if (node_id < nr_vmap_nodes) return node_id; /* If it was _not_ zero, warn. */ WARN_ONCE(node_id != UINT_MAX, "Decode wrong node id (%d)\n", node_id); return nr_vmap_nodes; } static bool is_vn_id_valid(unsigned int node_id) { if (node_id < nr_vmap_nodes) return true; return false; } static __always_inline unsigned long va_size(struct vmap_area *va) { return (va->va_end - va->va_start); } static __always_inline unsigned long get_subtree_max_size(struct rb_node *node) { struct vmap_area *va; va = rb_entry_safe(node, struct vmap_area, rb_node); return va ? 
va->subtree_max_size : 0; } RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb, struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size) static void reclaim_and_purge_vmap_areas(void); static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); static void drain_vmap_area_work(struct work_struct *work); static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work); static __cacheline_aligned_in_smp atomic_long_t vmap_lazy_nr; static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root) { struct rb_node *n = root->rb_node; addr = (unsigned long)kasan_reset_tag((void *)addr); while (n) { struct vmap_area *va; va = rb_entry(n, struct vmap_area, rb_node); if (addr < va->va_start) n = n->rb_left; else if (addr >= va->va_end) n = n->rb_right; else return va; } return NULL; } /* Look up the first VA which satisfies addr < va_end, NULL if none. */ static struct vmap_area * __find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root) { struct vmap_area *va = NULL; struct rb_node *n = root->rb_node; addr = (unsigned long)kasan_reset_tag((void *)addr); while (n) { struct vmap_area *tmp; tmp = rb_entry(n, struct vmap_area, rb_node); if (tmp->va_end > addr) { va = tmp; if (tmp->va_start <= addr) break; n = n->rb_left; } else n = n->rb_right; } return va; } /* * Returns a node where a first VA, that satisfies addr < va_end, resides. * If success, a node is locked. A user is responsible to unlock it when a * VA is no longer needed to be accessed. * * Returns NULL if nothing found. */ static struct vmap_node * find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va) { unsigned long va_start_lowest; struct vmap_node *vn; repeat: va_start_lowest = 0; for_each_vmap_node(vn) { spin_lock(&vn->busy.lock); *va = __find_vmap_area_exceed_addr(addr, &vn->busy.root); if (*va) if (!va_start_lowest || (*va)->va_start < va_start_lowest) va_start_lowest = (*va)->va_start; spin_unlock(&vn->busy.lock); } /* * Check if found VA exists, it might have gone away. In this case we * repeat the search because a VA has been removed concurrently and we * need to proceed to the next one, which is a rare case. */ if (va_start_lowest) { vn = addr_to_node(va_start_lowest); spin_lock(&vn->busy.lock); *va = __find_vmap_area(va_start_lowest, &vn->busy.root); if (*va) return vn; spin_unlock(&vn->busy.lock); goto repeat; } return NULL; } /* * This function returns back addresses of parent node * and its left or right link for further processing. * * Otherwise NULL is returned. In that case all further * steps regarding inserting of conflicting overlap range * have to be declined and actually considered as a bug. */ static __always_inline struct rb_node ** find_va_links(struct vmap_area *va, struct rb_root *root, struct rb_node *from, struct rb_node **parent) { struct vmap_area *tmp_va; struct rb_node **link; if (root) { link = &root->rb_node; if (unlikely(!*link)) { *parent = NULL; return link; } } else { link = &from; } /* * Go to the bottom of the tree. When we hit the last point * we end up with parent rb_node and correct direction, i name * it link, where the new va->rb_node will be attached to. */ do { tmp_va = rb_entry(*link, struct vmap_area, rb_node); /* * During the traversal we also do some sanity check. * Trigger the BUG() if there are sides(left/right) * or full overlaps. 
*/ if (va->va_end <= tmp_va->va_start) link = &(*link)->rb_left; else if (va->va_start >= tmp_va->va_end) link = &(*link)->rb_right; else { WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n", va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end); return NULL; } } while (*link); *parent = &tmp_va->rb_node; return link; } static __always_inline struct list_head * get_va_next_sibling(struct rb_node *parent, struct rb_node **link) { struct list_head *list; if (unlikely(!parent)) /* * The red-black tree where we try to find VA neighbors * before merging or inserting is empty, i.e. it means * there is no free vmap space. Normally it does not * happen but we handle this case anyway. */ return NULL; list = &rb_entry(parent, struct vmap_area, rb_node)->list; return (&parent->rb_right == link ? list->next : list); } static __always_inline void __link_va(struct vmap_area *va, struct rb_root *root, struct rb_node *parent, struct rb_node **link, struct list_head *head, bool augment) { /* * VA is still not in the list, but we can * identify its future previous list_head node. */ if (likely(parent)) { head = &rb_entry(parent, struct vmap_area, rb_node)->list; if (&parent->rb_right != link) head = head->prev; } /* Insert to the rb-tree */ rb_link_node(&va->rb_node, parent, link); if (augment) { /* * Some explanation here. Just perform simple insertion * to the tree. We do not set va->subtree_max_size to * its current size before calling rb_insert_augmented(). * It is because we populate the tree from the bottom * to parent levels when the node _is_ in the tree. * * Therefore we set subtree_max_size to zero after insertion, * to let __augment_tree_propagate_from() puts everything to * the correct order later on. */ rb_insert_augmented(&va->rb_node, root, &free_vmap_area_rb_augment_cb); va->subtree_max_size = 0; } else { rb_insert_color(&va->rb_node, root); } /* Address-sort this list */ list_add(&va->list, head); } static __always_inline void link_va(struct vmap_area *va, struct rb_root *root, struct rb_node *parent, struct rb_node **link, struct list_head *head) { __link_va(va, root, parent, link, head, false); } static __always_inline void link_va_augment(struct vmap_area *va, struct rb_root *root, struct rb_node *parent, struct rb_node **link, struct list_head *head) { __link_va(va, root, parent, link, head, true); } static __always_inline void __unlink_va(struct vmap_area *va, struct rb_root *root, bool augment) { if (WARN_ON(RB_EMPTY_NODE(&va->rb_node))) return; if (augment) rb_erase_augmented(&va->rb_node, root, &free_vmap_area_rb_augment_cb); else rb_erase(&va->rb_node, root); list_del_init(&va->list); RB_CLEAR_NODE(&va->rb_node); } static __always_inline void unlink_va(struct vmap_area *va, struct rb_root *root) { __unlink_va(va, root, false); } static __always_inline void unlink_va_augment(struct vmap_area *va, struct rb_root *root) { __unlink_va(va, root, true); } #if DEBUG_AUGMENT_PROPAGATE_CHECK /* * Gets called when remove the node and rotate. 
*/ static __always_inline unsigned long compute_subtree_max_size(struct vmap_area *va) { return max3(va_size(va), get_subtree_max_size(va->rb_node.rb_left), get_subtree_max_size(va->rb_node.rb_right)); } static void augment_tree_propagate_check(void) { struct vmap_area *va; unsigned long computed_size; list_for_each_entry(va, &free_vmap_area_list, list) { computed_size = compute_subtree_max_size(va); if (computed_size != va->subtree_max_size) pr_emerg("tree is corrupted: %lu, %lu\n", va_size(va), va->subtree_max_size); } } #endif /* * This function populates subtree_max_size from bottom to upper * levels starting from VA point. The propagation must be done * when VA size is modified by changing its va_start/va_end. Or * in case of newly inserting of VA to the tree. * * It means that __augment_tree_propagate_from() must be called: * - After VA has been inserted to the tree(free path); * - After VA has been shrunk(allocation path); * - After VA has been increased(merging path). * * Please note that, it does not mean that upper parent nodes * and their subtree_max_size are recalculated all the time up * to the root node. * * 4--8 * /\ * / \ * / \ * 2--2 8--8 * * For example if we modify the node 4, shrinking it to 2, then * no any modification is required. If we shrink the node 2 to 1 * its subtree_max_size is updated only, and set to 1. If we shrink * the node 8 to 6, then its subtree_max_size is set to 6 and parent * node becomes 4--6. */ static __always_inline void augment_tree_propagate_from(struct vmap_area *va) { /* * Populate the tree from bottom towards the root until * the calculated maximum available size of checked node * is equal to its current one. */ free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL); #if DEBUG_AUGMENT_PROPAGATE_CHECK augment_tree_propagate_check(); #endif } static void insert_vmap_area(struct vmap_area *va, struct rb_root *root, struct list_head *head) { struct rb_node **link; struct rb_node *parent; link = find_va_links(va, root, NULL, &parent); if (link) link_va(va, root, parent, link, head); } static void insert_vmap_area_augment(struct vmap_area *va, struct rb_node *from, struct rb_root *root, struct list_head *head) { struct rb_node **link; struct rb_node *parent; if (from) link = find_va_links(va, NULL, from, &parent); else link = find_va_links(va, root, NULL, &parent); if (link) { link_va_augment(va, root, parent, link, head); augment_tree_propagate_from(va); } } /* * Merge de-allocated chunk of VA memory with previous * and next free blocks. If coalesce is not done a new * free area is inserted. If VA has been merged, it is * freed. * * Please note, it can return NULL in case of overlap * ranges, followed by WARN() report. Despite it is a * buggy behaviour, a system can be alive and keep * ongoing. */ static __always_inline struct vmap_area * __merge_or_add_vmap_area(struct vmap_area *va, struct rb_root *root, struct list_head *head, bool augment) { struct vmap_area *sibling; struct list_head *next; struct rb_node **link; struct rb_node *parent; bool merged = false; /* * Find a place in the tree where VA potentially will be * inserted, unless it is merged with its sibling/siblings. */ link = find_va_links(va, root, NULL, &parent); if (!link) return NULL; /* * Get next node of VA to check if merging can be done. 
*/ next = get_va_next_sibling(parent, link); if (unlikely(next == NULL)) goto insert; /* * start end * | | * |<------VA------>|<-----Next----->| * | | * start end */ if (next != head) { sibling = list_entry(next, struct vmap_area, list); if (sibling->va_start == va->va_end) { sibling->va_start = va->va_start; /* Free vmap_area object. */ kmem_cache_free(vmap_area_cachep, va); /* Point to the new merged area. */ va = sibling; merged = true; } } /* * start end * | | * |<-----Prev----->|<------VA------>| * | | * start end */ if (next->prev != head) { sibling = list_entry(next->prev, struct vmap_area, list); if (sibling->va_end == va->va_start) { /* * If both neighbors are coalesced, it is important * to unlink the "next" node first, followed by merging * with "previous" one. Otherwise the tree might not be * fully populated if a sibling's augmented value is * "normalized" because of rotation operations. */ if (merged) __unlink_va(va, root, augment); sibling->va_end = va->va_end; /* Free vmap_area object. */ kmem_cache_free(vmap_area_cachep, va); /* Point to the new merged area. */ va = sibling; merged = true; } } insert: if (!merged) __link_va(va, root, parent, link, head, augment); return va; } static __always_inline struct vmap_area * merge_or_add_vmap_area(struct vmap_area *va, struct rb_root *root, struct list_head *head) { return __merge_or_add_vmap_area(va, root, head, false); } static __always_inline struct vmap_area * merge_or_add_vmap_area_augment(struct vmap_area *va, struct rb_root *root, struct list_head *head) { va = __merge_or_add_vmap_area(va, root, head, true); if (va) augment_tree_propagate_from(va); return va; } static __always_inline bool is_within_this_va(struct vmap_area *va, unsigned long size, unsigned long align, unsigned long vstart) { unsigned long nva_start_addr; if (va->va_start > vstart) nva_start_addr = ALIGN(va->va_start, align); else nva_start_addr = ALIGN(vstart, align); /* Can be overflowed due to big size or alignment. */ if (nva_start_addr + size < nva_start_addr || nva_start_addr < vstart) return false; return (nva_start_addr + size <= va->va_end); } /* * Find the first free block(lowest start address) in the tree, * that will accomplish the request corresponding to passing * parameters. Please note, with an alignment bigger than PAGE_SIZE, * a search length is adjusted to account for worst case alignment * overhead. */ static __always_inline struct vmap_area * find_vmap_lowest_match(struct rb_root *root, unsigned long size, unsigned long align, unsigned long vstart, bool adjust_search_size) { struct vmap_area *va; struct rb_node *node; unsigned long length; /* Start from the root. */ node = root->rb_node; /* Adjust the search size for alignment overhead. */ length = adjust_search_size ? size + align - 1 : size; while (node) { va = rb_entry(node, struct vmap_area, rb_node); if (get_subtree_max_size(node->rb_left) >= length && vstart < va->va_start) { node = node->rb_left; } else { if (is_within_this_va(va, size, align, vstart)) return va; /* * Does not make sense to go deeper towards the right * sub-tree if it does not have a free block that is * equal or bigger to the requested search length. */ if (get_subtree_max_size(node->rb_right) >= length) { node = node->rb_right; continue; } /* * OK. We roll back and find the first right sub-tree, * that will satisfy the search criteria. It can happen * due to "vstart" restriction or an alignment overhead * that is bigger then PAGE_SIZE. 
*/ while ((node = rb_parent(node))) { va = rb_entry(node, struct vmap_area, rb_node); if (is_within_this_va(va, size, align, vstart)) return va; if (get_subtree_max_size(node->rb_right) >= length && vstart <= va->va_start) { /* * Shift the vstart forward. Please note, we update it with * parent's start address adding "1" because we do not want * to enter same sub-tree after it has already been checked * and no suitable free block found there. */ vstart = va->va_start + 1; node = node->rb_right; break; } } } } return NULL; } #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK #include <linux/random.h> static struct vmap_area * find_vmap_lowest_linear_match(struct list_head *head, unsigned long size, unsigned long align, unsigned long vstart) { struct vmap_area *va; list_for_each_entry(va, head, list) { if (!is_within_this_va(va, size, align, vstart)) continue; return va; } return NULL; } static void find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head, unsigned long size, unsigned long align) { struct vmap_area *va_1, *va_2; unsigned long vstart; unsigned int rnd; get_random_bytes(&rnd, sizeof(rnd)); vstart = VMALLOC_START + rnd; va_1 = find_vmap_lowest_match(root, size, align, vstart, false); va_2 = find_vmap_lowest_linear_match(head, size, align, vstart); if (va_1 != va_2) pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n", va_1, va_2, vstart); } #endif enum fit_type { NOTHING_FIT = 0, FL_FIT_TYPE = 1, /* full fit */ LE_FIT_TYPE = 2, /* left edge fit */ RE_FIT_TYPE = 3, /* right edge fit */ NE_FIT_TYPE = 4 /* no edge fit */ }; static __always_inline enum fit_type classify_va_fit_type(struct vmap_area *va, unsigned long nva_start_addr, unsigned long size) { enum fit_type type; /* Check if it is within VA. */ if (nva_start_addr < va->va_start || nva_start_addr + size > va->va_end) return NOTHING_FIT; /* Now classify. */ if (va->va_start == nva_start_addr) { if (va->va_end == nva_start_addr + size) type = FL_FIT_TYPE; else type = LE_FIT_TYPE; } else if (va->va_end == nva_start_addr + size) { type = RE_FIT_TYPE; } else { type = NE_FIT_TYPE; } return type; } static __always_inline int va_clip(struct rb_root *root, struct list_head *head, struct vmap_area *va, unsigned long nva_start_addr, unsigned long size) { struct vmap_area *lva = NULL; enum fit_type type = classify_va_fit_type(va, nva_start_addr, size); if (type == FL_FIT_TYPE) { /* * No need to split VA, it fully fits. * * | | * V NVA V * |---------------| */ unlink_va_augment(va, root); kmem_cache_free(vmap_area_cachep, va); } else if (type == LE_FIT_TYPE) { /* * Split left edge of fit VA. * * | | * V NVA V R * |-------|-------| */ va->va_start += size; } else if (type == RE_FIT_TYPE) { /* * Split right edge of fit VA. * * | | * L V NVA V * |-------|-------| */ va->va_end = nva_start_addr; } else if (type == NE_FIT_TYPE) { /* * Split no edge of fit VA. * * | | * L V NVA V R * |---|-------|---| */ lva = __this_cpu_xchg(ne_fit_preload_node, NULL); if (unlikely(!lva)) { /* * For percpu allocator we do not do any pre-allocation * and leave it as it is. The reason is it most likely * never ends up with NE_FIT_TYPE splitting. In case of * percpu allocations offsets and sizes are aligned to * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE * are its main fitting cases. * * There are a few exceptions though, as an example it is * a first allocation (early boot up) when we have "one" * big free space that has to be split. * * Also we can hit this path in case of regular "vmap" * allocations, if "this" current CPU was not preloaded. 
* See the comment in alloc_vmap_area() why. If so, then * GFP_NOWAIT is used instead to get an extra object for * split purpose. That is rare and most time does not * occur. * * What happens if an allocation gets failed. Basically, * an "overflow" path is triggered to purge lazily freed * areas to free some memory, then, the "retry" path is * triggered to repeat one more time. See more details * in alloc_vmap_area() function. */ lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); if (!lva) return -ENOMEM; } /* * Build the remainder. */ lva->va_start = va->va_start; lva->va_end = nva_start_addr; /* * Shrink this VA to remaining size. */ va->va_start = nva_start_addr + size; } else { return -EINVAL; } if (type != FL_FIT_TYPE) { augment_tree_propagate_from(va); if (lva) /* type == NE_FIT_TYPE */ insert_vmap_area_augment(lva, &va->rb_node, root, head); } return 0; } static unsigned long va_alloc(struct vmap_area *va, struct rb_root *root, struct list_head *head, unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend) { unsigned long nva_start_addr; int ret; if (va->va_start > vstart) nva_start_addr = ALIGN(va->va_start, align); else nva_start_addr = ALIGN(vstart, align); /* Check the "vend" restriction. */ if (nva_start_addr + size > vend) return -ERANGE; /* Update the free vmap_area. */ ret = va_clip(root, head, va, nva_start_addr, size); if (WARN_ON_ONCE(ret)) return ret; return nva_start_addr; } /* * Returns a start address of the newly allocated area, if success. * Otherwise an error value is returned that indicates failure. */ static __always_inline unsigned long __alloc_vmap_area(struct rb_root *root, struct list_head *head, unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend) { bool adjust_search_size = true; unsigned long nva_start_addr; struct vmap_area *va; /* * Do not adjust when: * a) align <= PAGE_SIZE, because it does not make any sense. * All blocks(their start addresses) are at least PAGE_SIZE * aligned anyway; * b) a short range where a requested size corresponds to exactly * specified [vstart:vend] interval and an alignment > PAGE_SIZE. * With adjusted search length an allocation would not succeed. */ if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size)) adjust_search_size = false; va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size); if (unlikely(!va)) return -ENOENT; nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend); #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK if (!IS_ERR_VALUE(nva_start_addr)) find_vmap_lowest_match_check(root, head, size, align); #endif return nva_start_addr; } /* * Free a region of KVA allocated by alloc_vmap_area */ static void free_vmap_area(struct vmap_area *va) { struct vmap_node *vn = addr_to_node(va->va_start); /* * Remove from the busy tree/list. */ spin_lock(&vn->busy.lock); unlink_va(va, &vn->busy.root); spin_unlock(&vn->busy.lock); /* * Insert/Merge it back to the free tree/list. */ spin_lock(&free_vmap_area_lock); merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list); spin_unlock(&free_vmap_area_lock); } static inline void preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node) { struct vmap_area *va = NULL, *tmp; /* * Preload this CPU with one extra vmap_area object. It is used * when fit type of free area is NE_FIT_TYPE. It guarantees that * a CPU that does an allocation is preloaded. 
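 * If the per-cpu slot is already populated by the time the lock is
 * taken, the freshly allocated object is simply freed again, see the
 * __this_cpu_try_cmpxchg() below.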
* * We do it in non-atomic context, thus it allows us to use more * permissive allocation masks to be more stable under low memory * condition and high memory pressure. */ if (!this_cpu_read(ne_fit_preload_node)) va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); spin_lock(lock); tmp = NULL; if (va && !__this_cpu_try_cmpxchg(ne_fit_preload_node, &tmp, va)) kmem_cache_free(vmap_area_cachep, va); } static struct vmap_pool * size_to_va_pool(struct vmap_node *vn, unsigned long size) { unsigned int idx = (size - 1) / PAGE_SIZE; if (idx < MAX_VA_SIZE_PAGES) return &vn->pool[idx]; return NULL; } static bool node_pool_add_va(struct vmap_node *n, struct vmap_area *va) { struct vmap_pool *vp; vp = size_to_va_pool(n, va_size(va)); if (!vp) return false; spin_lock(&n->pool_lock); list_add(&va->list, &vp->head); WRITE_ONCE(vp->len, vp->len + 1); spin_unlock(&n->pool_lock); return true; } static struct vmap_area * node_pool_del_va(struct vmap_node *vn, unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend) { struct vmap_area *va = NULL; struct vmap_pool *vp; int err = 0; vp = size_to_va_pool(vn, size); if (!vp || list_empty(&vp->head)) return NULL; spin_lock(&vn->pool_lock); if (!list_empty(&vp->head)) { va = list_first_entry(&vp->head, struct vmap_area, list); if (IS_ALIGNED(va->va_start, align)) { /* * Do some sanity check and emit a warning * if one of below checks detects an error. */ err |= (va_size(va) != size); err |= (va->va_start < vstart); err |= (va->va_end > vend); if (!WARN_ON_ONCE(err)) { list_del_init(&va->list); WRITE_ONCE(vp->len, vp->len - 1); } else { va = NULL; } } else { list_move_tail(&va->list, &vp->head); va = NULL; } } spin_unlock(&vn->pool_lock); return va; } static struct vmap_area * node_alloc(unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend, unsigned long *addr, unsigned int *vn_id) { struct vmap_area *va; *vn_id = 0; *addr = -EINVAL; /* * Fallback to a global heap if not vmalloc or there * is only one node. */ if (vstart != VMALLOC_START || vend != VMALLOC_END || nr_vmap_nodes == 1) return NULL; *vn_id = raw_smp_processor_id() % nr_vmap_nodes; va = node_pool_del_va(id_to_node(*vn_id), size, align, vstart, vend); *vn_id = encode_vn_id(*vn_id); if (va) *addr = va->va_start; return va; } static inline void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { vm->flags = flags; vm->addr = (void *)va->va_start; vm->size = vm->requested_size = va_size(va); vm->caller = caller; va->vm = vm; } /* * Allocate a region of KVA of the specified size and alignment, within the * vstart and vend. If vm is passed in, the two will also be bound. */ static struct vmap_area *alloc_vmap_area(unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend, int node, gfp_t gfp_mask, unsigned long va_flags, struct vm_struct *vm) { struct vmap_node *vn; struct vmap_area *va; unsigned long freed; unsigned long addr; unsigned int vn_id; bool allow_block; int purged = 0; int ret; if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align))) return ERR_PTR(-EINVAL); if (unlikely(!vmap_initialized)) return ERR_PTR(-EBUSY); /* Only reclaim behaviour flags are relevant. */ gfp_mask = gfp_mask & GFP_RECLAIM_MASK; allow_block = gfpflags_allow_blocking(gfp_mask); might_sleep_if(allow_block); /* * If a VA is obtained from a global heap(if it fails here) * it is anyway marked with this "vn_id" so it is returned * to this pool's node later. 
Such way gives a possibility * to populate pools based on users demand. * * On success a ready to go VA is returned. */ va = node_alloc(size, align, vstart, vend, &addr, &vn_id); if (!va) { va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); if (unlikely(!va)) return ERR_PTR(-ENOMEM); /* * Only scan the relevant parts containing pointers to other objects * to avoid false negatives. */ kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask); } retry: if (IS_ERR_VALUE(addr)) { preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node); addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list, size, align, vstart, vend); spin_unlock(&free_vmap_area_lock); /* * This is not a fast path. Check if yielding is needed. This * is the only reschedule point in the vmalloc() path. */ if (allow_block) cond_resched(); } trace_alloc_vmap_area(addr, size, align, vstart, vend, IS_ERR_VALUE(addr)); /* * If an allocation fails, the error value is * returned. Therefore trigger the overflow path. */ if (IS_ERR_VALUE(addr)) { if (allow_block) goto overflow; /* * We can not trigger any reclaim logic because * sleeping is not allowed, thus fail an allocation. */ goto out_free_va; } va->va_start = addr; va->va_end = addr + size; va->vm = NULL; va->flags = (va_flags | vn_id); if (vm) { vm->addr = (void *)va->va_start; vm->size = va_size(va); va->vm = vm; } vn = addr_to_node(va->va_start); spin_lock(&vn->busy.lock); insert_vmap_area(va, &vn->busy.root, &vn->busy.head); spin_unlock(&vn->busy.lock); BUG_ON(!IS_ALIGNED(va->va_start, align)); BUG_ON(va->va_start < vstart); BUG_ON(va->va_end > vend); ret = kasan_populate_vmalloc(addr, size, gfp_mask); if (ret) { free_vmap_area(va); return ERR_PTR(ret); } return va; overflow: if (!purged) { reclaim_and_purge_vmap_areas(); purged = 1; goto retry; } freed = 0; blocking_notifier_call_chain(&vmap_notify_list, 0, &freed); if (freed > 0) { purged = 0; goto retry; } if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) pr_warn("vmalloc_node_range for size %lu failed: Address range restricted to %#lx - %#lx\n", size, vstart, vend); out_free_va: kmem_cache_free(vmap_area_cachep, va); return ERR_PTR(-EBUSY); } int register_vmap_purge_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&vmap_notify_list, nb); } EXPORT_SYMBOL_GPL(register_vmap_purge_notifier); int unregister_vmap_purge_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&vmap_notify_list, nb); } EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier); /* * lazy_max_pages is the maximum amount of virtual address space we gather up * before attempting to purge with a TLB flush. * * There is a tradeoff here: a larger number will cover more kernel page tables * and take slightly longer to purge, but it will linearly reduce the number of * global TLB flushes that must be performed. It would seem natural to scale * this number up linearly with the number of CPUs (because vmapping activity * could also scale linearly with the number of CPUs), however it is likely * that in practice, workloads might be constrained in other ways that mean * vmap activity will not scale linearly with CPUs. Also, I want to be * conservative and not introduce a big latency on huge systems, so go with * a less aggressive log scale. It will still be an improvement over the old * code, and it will be simple to change the scale factor if we find that it * becomes a problem on bigger systems. 
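 *
 * Worked example (assuming 4 KiB pages): with 4 online CPUs fls(4) = 3,
 * so the threshold is 3 * (32 MB / 4 KiB) = 24576 pages, i.e. 96 MB of
 * lazily freed address space before a purge is attempted.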
*/ static unsigned long lazy_max_pages(void) { unsigned int log; log = fls(num_online_cpus()); return log * (32UL * 1024 * 1024 / PAGE_SIZE); } /* * Serialize vmap purging. There is no actual critical section protected * by this lock, but we want to avoid concurrent calls for performance * reasons and to make the pcpu_get_vm_areas more deterministic. */ static DEFINE_MUTEX(vmap_purge_lock); /* for per-CPU blocks */ static void purge_fragmented_blocks_allcpus(void); static void reclaim_list_global(struct list_head *head) { struct vmap_area *va, *n; if (list_empty(head)) return; spin_lock(&free_vmap_area_lock); list_for_each_entry_safe(va, n, head, list) merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list); spin_unlock(&free_vmap_area_lock); } static void decay_va_pool_node(struct vmap_node *vn, bool full_decay) { LIST_HEAD(decay_list); struct rb_root decay_root = RB_ROOT; struct vmap_area *va, *nva; unsigned long n_decay, pool_len; int i; for (i = 0; i < MAX_VA_SIZE_PAGES; i++) { LIST_HEAD(tmp_list); if (list_empty(&vn->pool[i].head)) continue; /* Detach the pool, so no-one can access it. */ spin_lock(&vn->pool_lock); list_replace_init(&vn->pool[i].head, &tmp_list); spin_unlock(&vn->pool_lock); pool_len = n_decay = vn->pool[i].len; WRITE_ONCE(vn->pool[i].len, 0); /* Decay a pool by ~25% out of left objects. */ if (!full_decay) n_decay >>= 2; pool_len -= n_decay; list_for_each_entry_safe(va, nva, &tmp_list, list) { if (!n_decay--) break; list_del_init(&va->list); merge_or_add_vmap_area(va, &decay_root, &decay_list); } /* * Attach the pool back if it has been partly decayed. * Please note, it is supposed that nobody(other contexts) * can populate the pool therefore a simple list replace * operation takes place here. */ if (!list_empty(&tmp_list)) { spin_lock(&vn->pool_lock); list_replace_init(&tmp_list, &vn->pool[i].head); WRITE_ONCE(vn->pool[i].len, pool_len); spin_unlock(&vn->pool_lock); } } reclaim_list_global(&decay_list); } #define KASAN_RELEASE_BATCH_SIZE 32 static void kasan_release_vmalloc_node(struct vmap_node *vn) { struct vmap_area *va; unsigned long start, end; unsigned int batch_count = 0; start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start; end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end; list_for_each_entry(va, &vn->purge_list, list) { if (is_vmalloc_or_module_addr((void *) va->va_start)) kasan_release_vmalloc(va->va_start, va->va_end, va->va_start, va->va_end, KASAN_VMALLOC_PAGE_RANGE); if (need_resched() || (++batch_count >= KASAN_RELEASE_BATCH_SIZE)) { cond_resched(); batch_count = 0; } } kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH); } static void purge_vmap_node(struct work_struct *work) { struct vmap_node *vn = container_of(work, struct vmap_node, purge_work); unsigned long nr_purged_pages = 0; struct vmap_area *va, *n_va; LIST_HEAD(local_list); if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) kasan_release_vmalloc_node(vn); vn->nr_purged = 0; list_for_each_entry_safe(va, n_va, &vn->purge_list, list) { unsigned long nr = va_size(va) >> PAGE_SHIFT; unsigned int vn_id = decode_vn_id(va->flags); list_del_init(&va->list); nr_purged_pages += nr; vn->nr_purged++; if (is_vn_id_valid(vn_id) && !vn->skip_populate) if (node_pool_add_va(vn, va)) continue; /* Go back to global. */ list_add(&va->list, &local_list); } atomic_long_sub(nr_purged_pages, &vmap_lazy_nr); reclaim_list_global(&local_list); } /* * Purges all lazily-freed vmap areas. 
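 *
 * The passed [start:end] range is widened to cover every area detached
 * for purging, and a single TLB flush of that range is issued before
 * the per-node purge work (plus any helper workers) runs.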
*/ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end, bool full_pool_decay) { unsigned long nr_purged_areas = 0; unsigned int nr_purge_helpers; static cpumask_t purge_nodes; unsigned int nr_purge_nodes; struct vmap_node *vn; int i; lockdep_assert_held(&vmap_purge_lock); /* * Use cpumask to mark which node has to be processed. */ purge_nodes = CPU_MASK_NONE; for_each_vmap_node(vn) { INIT_LIST_HEAD(&vn->purge_list); vn->skip_populate = full_pool_decay; decay_va_pool_node(vn, full_pool_decay); if (RB_EMPTY_ROOT(&vn->lazy.root)) continue; spin_lock(&vn->lazy.lock); WRITE_ONCE(vn->lazy.root.rb_node, NULL); list_replace_init(&vn->lazy.head, &vn->purge_list); spin_unlock(&vn->lazy.lock); start = min(start, list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start); end = max(end, list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end); cpumask_set_cpu(node_to_id(vn), &purge_nodes); } nr_purge_nodes = cpumask_weight(&purge_nodes); if (nr_purge_nodes > 0) { flush_tlb_kernel_range(start, end); /* One extra worker is per a lazy_max_pages() full set minus one. */ nr_purge_helpers = atomic_long_read(&vmap_lazy_nr) / lazy_max_pages(); nr_purge_helpers = clamp(nr_purge_helpers, 1U, nr_purge_nodes) - 1; for_each_cpu(i, &purge_nodes) { vn = &vmap_nodes[i]; if (nr_purge_helpers > 0) { INIT_WORK(&vn->purge_work, purge_vmap_node); if (cpumask_test_cpu(i, cpu_online_mask)) schedule_work_on(i, &vn->purge_work); else schedule_work(&vn->purge_work); nr_purge_helpers--; } else { vn->purge_work.func = NULL; purge_vmap_node(&vn->purge_work); nr_purged_areas += vn->nr_purged; } } for_each_cpu(i, &purge_nodes) { vn = &vmap_nodes[i]; if (vn->purge_work.func) { flush_work(&vn->purge_work); nr_purged_areas += vn->nr_purged; } } } trace_purge_vmap_area_lazy(start, end, nr_purged_areas); return nr_purged_areas > 0; } /* * Reclaim vmap areas by purging fragmented blocks and purge_vmap_area_list. */ static void reclaim_and_purge_vmap_areas(void) { mutex_lock(&vmap_purge_lock); purge_fragmented_blocks_allcpus(); __purge_vmap_area_lazy(ULONG_MAX, 0, true); mutex_unlock(&vmap_purge_lock); } static void drain_vmap_area_work(struct work_struct *work) { mutex_lock(&vmap_purge_lock); __purge_vmap_area_lazy(ULONG_MAX, 0, false); mutex_unlock(&vmap_purge_lock); } /* * Free a vmap area, caller ensuring that the area has been unmapped, * unlinked and flush_cache_vunmap had been called for the correct * range previously. */ static void free_vmap_area_noflush(struct vmap_area *va) { unsigned long nr_lazy_max = lazy_max_pages(); unsigned long va_start = va->va_start; unsigned int vn_id = decode_vn_id(va->flags); struct vmap_node *vn; unsigned long nr_lazy; if (WARN_ON_ONCE(!list_empty(&va->list))) return; nr_lazy = atomic_long_add_return_relaxed(va_size(va) >> PAGE_SHIFT, &vmap_lazy_nr); /* * If it was request by a certain node we would like to * return it to that node, i.e. its pool for later reuse. */ vn = is_vn_id_valid(vn_id) ? 
id_to_node(vn_id):addr_to_node(va->va_start); spin_lock(&vn->lazy.lock); insert_vmap_area(va, &vn->lazy.root, &vn->lazy.head); spin_unlock(&vn->lazy.lock); trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max); /* After this point, we may free va at any time */ if (unlikely(nr_lazy > nr_lazy_max)) schedule_work(&drain_vmap_work); } /* * Free and unmap a vmap area */ static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); vunmap_range_noflush(va->va_start, va->va_end); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(va->va_start, va->va_end); free_vmap_area_noflush(va); } struct vmap_area *find_vmap_area(unsigned long addr) { struct vmap_node *vn; struct vmap_area *va; int i, j; if (unlikely(!vmap_initialized)) return NULL; /* * An addr_to_node_id(addr) converts an address to a node index * where a VA is located. If VA spans several zones and passed * addr is not the same as va->va_start, what is not common, we * may need to scan extra nodes. See an example: * * <----va----> * -|-----|-----|-----|-----|- * 1 2 0 1 * * VA resides in node 1 whereas it spans 1, 2 an 0. If passed * addr is within 2 or 0 nodes we should do extra work. */ i = j = addr_to_node_id(addr); do { vn = &vmap_nodes[i]; spin_lock(&vn->busy.lock); va = __find_vmap_area(addr, &vn->busy.root); spin_unlock(&vn->busy.lock); if (va) return va; } while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j); return NULL; } static struct vmap_area *find_unlink_vmap_area(unsigned long addr) { struct vmap_node *vn; struct vmap_area *va; int i, j; /* * Check the comment in the find_vmap_area() about the loop. */ i = j = addr_to_node_id(addr); do { vn = &vmap_nodes[i]; spin_lock(&vn->busy.lock); va = __find_vmap_area(addr, &vn->busy.root); if (va) unlink_va(va, &vn->busy.root); spin_unlock(&vn->busy.lock); if (va) return va; } while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j); return NULL; } /*** Per cpu kva allocator ***/ /* * vmap space is limited especially on 32 bit architectures. Ensure there is * room for at least 16 percpu vmap blocks per CPU. */ /* * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess * instead (we just need a rough idea) */ #if BITS_PER_LONG == 32 #define VMALLOC_SPACE (128UL*1024*1024) #else #define VMALLOC_SPACE (128UL*1024*1024*1024) #endif #define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE) #define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */ #define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */ #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ #define VMAP_BBMAP_BITS \ VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16)) #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) /* * Purge threshold to prevent overeager purging of fragmented blocks for * regular operations: Purge if vb->free is less than 1/4 of the capacity. */ #define VMAP_PURGE_THRESHOLD (VMAP_BBMAP_BITS / 4) #define VMAP_RAM 0x1 /* indicates vm_map_ram area*/ #define VMAP_BLOCK 0x2 /* mark out the vmap_block sub-type*/ #define VMAP_FLAGS_MASK 0x3 struct vmap_block_queue { spinlock_t lock; struct list_head free; /* * An xarray requires an extra memory dynamically to * be allocated. If it is an issue, we can use rb-tree * instead. 
*/ struct xarray vmap_blocks; }; struct vmap_block { spinlock_t lock; struct vmap_area *va; unsigned long free, dirty; DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS); unsigned long dirty_min, dirty_max; /*< dirty range */ struct list_head free_list; struct rcu_head rcu_head; struct list_head purge; unsigned int cpu; }; /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue); /* * In order to fast access to any "vmap_block" associated with a * specific address, we use a hash. * * A per-cpu vmap_block_queue is used in both ways, to serialize * an access to free block chains among CPUs(alloc path) and it * also acts as a vmap_block hash(alloc/free paths). It means we * overload it, since we already have the per-cpu array which is * used as a hash table. When used as a hash a 'cpu' passed to * per_cpu() is not actually a CPU but rather a hash index. * * A hash function is addr_to_vb_xa() which hashes any address * to a specific index(in a hash) it belongs to. This then uses a * per_cpu() macro to access an array with generated index. * * An example: * * CPU_1 CPU_2 CPU_0 * | | | * V V V * 0 10 20 30 40 50 60 * |------|------|------|------|------|------|...<vmap address space> * CPU0 CPU1 CPU2 CPU0 CPU1 CPU2 * * - CPU_1 invokes vm_unmap_ram(6), 6 belongs to CPU0 zone, thus * it access: CPU0/INDEX0 -> vmap_blocks -> xa_lock; * * - CPU_2 invokes vm_unmap_ram(11), 11 belongs to CPU1 zone, thus * it access: CPU1/INDEX1 -> vmap_blocks -> xa_lock; * * - CPU_0 invokes vm_unmap_ram(20), 20 belongs to CPU2 zone, thus * it access: CPU2/INDEX2 -> vmap_blocks -> xa_lock. * * This technique almost always avoids lock contention on insert/remove, * however xarray spinlocks protect against any contention that remains. */ static struct xarray * addr_to_vb_xa(unsigned long addr) { int index = (addr / VMAP_BLOCK_SIZE) % nr_cpu_ids; /* * Please note, nr_cpu_ids points on a highest set * possible bit, i.e. we never invoke cpumask_next() * if an index points on it which is nr_cpu_ids - 1. */ if (!cpu_possible(index)) index = cpumask_next(index, cpu_possible_mask); return &per_cpu(vmap_block_queue, index).vmap_blocks; } /* * We should probably have a fallback mechanism to allocate virtual memory * out of partially filled vmap blocks. However vmap block sizing should be * fairly reasonable according to the vmalloc size, so it shouldn't be a * big problem. */ static unsigned long addr_to_vb_idx(unsigned long addr) { addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1); addr /= VMAP_BLOCK_SIZE; return addr; } static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off) { unsigned long addr; addr = va_start + (pages_off << PAGE_SHIFT); BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start)); return (void *)addr; } /** * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this * block. 
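 *   (for example, an order-2 request reserves the first four bits of the
 *   new block's used_map, leaving VMAP_BBMAP_BITS - 4 pages free).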
Of course pages number can't exceed VMAP_BBMAP_BITS * @order: how many 2^order pages should be occupied in newly allocated block * @gfp_mask: flags for the page level allocator * * Return: virtual address in a newly allocated block or ERR_PTR(-errno) */ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) { struct vmap_block_queue *vbq; struct vmap_block *vb; struct vmap_area *va; struct xarray *xa; unsigned long vb_idx; int node, err; void *vaddr; node = numa_node_id(); vb = kmalloc_node(sizeof(struct vmap_block), gfp_mask, node); if (unlikely(!vb)) return ERR_PTR(-ENOMEM); va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, VMALLOC_START, VMALLOC_END, node, gfp_mask, VMAP_RAM|VMAP_BLOCK, NULL); if (IS_ERR(va)) { kfree(vb); return ERR_CAST(va); } vaddr = vmap_block_vaddr(va->va_start, 0); spin_lock_init(&vb->lock); vb->va = va; /* At least something should be left free */ BUG_ON(VMAP_BBMAP_BITS <= (1UL << order)); bitmap_zero(vb->used_map, VMAP_BBMAP_BITS); vb->free = VMAP_BBMAP_BITS - (1UL << order); vb->dirty = 0; vb->dirty_min = VMAP_BBMAP_BITS; vb->dirty_max = 0; bitmap_set(vb->used_map, 0, (1UL << order)); INIT_LIST_HEAD(&vb->free_list); vb->cpu = raw_smp_processor_id(); xa = addr_to_vb_xa(va->va_start); vb_idx = addr_to_vb_idx(va->va_start); err = xa_insert(xa, vb_idx, vb, gfp_mask); if (err) { kfree(vb); free_vmap_area(va); return ERR_PTR(err); } /* * list_add_tail_rcu could happened in another core * rather than vb->cpu due to task migration, which * is safe as list_add_tail_rcu will ensure the list's * integrity together with list_for_each_rcu from read * side. */ vbq = per_cpu_ptr(&vmap_block_queue, vb->cpu); spin_lock(&vbq->lock); list_add_tail_rcu(&vb->free_list, &vbq->free); spin_unlock(&vbq->lock); return vaddr; } static void free_vmap_block(struct vmap_block *vb) { struct vmap_node *vn; struct vmap_block *tmp; struct xarray *xa; xa = addr_to_vb_xa(vb->va->va_start); tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start)); BUG_ON(tmp != vb); vn = addr_to_node(vb->va->va_start); spin_lock(&vn->busy.lock); unlink_va(vb->va, &vn->busy.root); spin_unlock(&vn->busy.lock); free_vmap_area_noflush(vb->va); kfree_rcu(vb, rcu_head); } static bool purge_fragmented_block(struct vmap_block *vb, struct list_head *purge_list, bool force_purge) { struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, vb->cpu); if (vb->free + vb->dirty != VMAP_BBMAP_BITS || vb->dirty == VMAP_BBMAP_BITS) return false; /* Don't overeagerly purge usable blocks unless requested */ if (!(force_purge || vb->free < VMAP_PURGE_THRESHOLD)) return false; /* prevent further allocs after releasing lock */ WRITE_ONCE(vb->free, 0); /* prevent purging it again */ WRITE_ONCE(vb->dirty, VMAP_BBMAP_BITS); vb->dirty_min = 0; vb->dirty_max = VMAP_BBMAP_BITS; spin_lock(&vbq->lock); list_del_rcu(&vb->free_list); spin_unlock(&vbq->lock); list_add_tail(&vb->purge, purge_list); return true; } static void free_purged_blocks(struct list_head *purge_list) { struct vmap_block *vb, *n_vb; list_for_each_entry_safe(vb, n_vb, purge_list, purge) { list_del(&vb->purge); free_vmap_block(vb); } } static void purge_fragmented_blocks(int cpu) { LIST_HEAD(purge); struct vmap_block *vb; struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); rcu_read_lock(); list_for_each_entry_rcu(vb, &vbq->free, free_list) { unsigned long free = READ_ONCE(vb->free); unsigned long dirty = READ_ONCE(vb->dirty); if (free + dirty != VMAP_BBMAP_BITS || dirty == VMAP_BBMAP_BITS) continue; spin_lock(&vb->lock); purge_fragmented_block(vb, &purge, 
true); spin_unlock(&vb->lock); } rcu_read_unlock(); free_purged_blocks(&purge); } static void purge_fragmented_blocks_allcpus(void) { int cpu; for_each_possible_cpu(cpu) purge_fragmented_blocks(cpu); } static void *vb_alloc(unsigned long size, gfp_t gfp_mask) { struct vmap_block_queue *vbq; struct vmap_block *vb; void *vaddr = NULL; unsigned int order; BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); if (WARN_ON(size == 0)) { /* * Allocating 0 bytes isn't what caller wants since * get_order(0) returns funny result. Just warn and terminate * early. */ return ERR_PTR(-EINVAL); } order = get_order(size); rcu_read_lock(); vbq = raw_cpu_ptr(&vmap_block_queue); list_for_each_entry_rcu(vb, &vbq->free, free_list) { unsigned long pages_off; if (READ_ONCE(vb->free) < (1UL << order)) continue; spin_lock(&vb->lock); if (vb->free < (1UL << order)) { spin_unlock(&vb->lock); continue; } pages_off = VMAP_BBMAP_BITS - vb->free; vaddr = vmap_block_vaddr(vb->va->va_start, pages_off); WRITE_ONCE(vb->free, vb->free - (1UL << order)); bitmap_set(vb->used_map, pages_off, (1UL << order)); if (vb->free == 0) { spin_lock(&vbq->lock); list_del_rcu(&vb->free_list); spin_unlock(&vbq->lock); } spin_unlock(&vb->lock); break; } rcu_read_unlock(); /* Allocate new block if nothing was found */ if (!vaddr) vaddr = new_vmap_block(order, gfp_mask); return vaddr; } static void vb_free(unsigned long addr, unsigned long size) { unsigned long offset; unsigned int order; struct vmap_block *vb; struct xarray *xa; BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); flush_cache_vunmap(addr, addr + size); order = get_order(size); offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; xa = addr_to_vb_xa(addr); vb = xa_load(xa, addr_to_vb_idx(addr)); spin_lock(&vb->lock); bitmap_clear(vb->used_map, offset, (1UL << order)); spin_unlock(&vb->lock); vunmap_range_noflush(addr, addr + size); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(addr, addr + size); spin_lock(&vb->lock); /* Expand the not yet TLB flushed dirty range */ vb->dirty_min = min(vb->dirty_min, offset); vb->dirty_max = max(vb->dirty_max, offset + (1UL << order)); WRITE_ONCE(vb->dirty, vb->dirty + (1UL << order)); if (vb->dirty == VMAP_BBMAP_BITS) { BUG_ON(vb->free); spin_unlock(&vb->lock); free_vmap_block(vb); } else spin_unlock(&vb->lock); } static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) { LIST_HEAD(purge_list); int cpu; if (unlikely(!vmap_initialized)) return; mutex_lock(&vmap_purge_lock); for_each_possible_cpu(cpu) { struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); struct vmap_block *vb; unsigned long idx; rcu_read_lock(); xa_for_each(&vbq->vmap_blocks, idx, vb) { spin_lock(&vb->lock); /* * Try to purge a fragmented block first. If it's * not purgeable, check whether there is dirty * space to be flushed. 
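 *
 * The dirty range is tracked as page offsets within the block
 * (dirty_min/dirty_max), so it is converted back to addresses before
 * being folded into the global flush range.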
*/ if (!purge_fragmented_block(vb, &purge_list, false) && vb->dirty_max && vb->dirty != VMAP_BBMAP_BITS) { unsigned long va_start = vb->va->va_start; unsigned long s, e; s = va_start + (vb->dirty_min << PAGE_SHIFT); e = va_start + (vb->dirty_max << PAGE_SHIFT); start = min(s, start); end = max(e, end); /* Prevent that this is flushed again */ vb->dirty_min = VMAP_BBMAP_BITS; vb->dirty_max = 0; flush = 1; } spin_unlock(&vb->lock); } rcu_read_unlock(); } free_purged_blocks(&purge_list); if (!__purge_vmap_area_lazy(start, end, false) && flush) flush_tlb_kernel_range(start, end); mutex_unlock(&vmap_purge_lock); } /** * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer * * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily * to amortize TLB flushing overheads. What this means is that any page you * have now, may, in a former life, have been mapped into kernel virtual * address by the vmap layer and so there might be some CPUs with TLB entries * still referencing that page (additional to the regular 1:1 kernel mapping). * * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can * be sure that none of the pages we have control over will have any aliases * from the vmap layer. */ void vm_unmap_aliases(void) { _vm_unmap_aliases(ULONG_MAX, 0, 0); } EXPORT_SYMBOL_GPL(vm_unmap_aliases); /** * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram * @mem: the pointer returned by vm_map_ram * @count: the count passed to that vm_map_ram call (cannot unmap partial) */ void vm_unmap_ram(const void *mem, unsigned int count) { unsigned long size = (unsigned long)count << PAGE_SHIFT; unsigned long addr = (unsigned long)kasan_reset_tag(mem); struct vmap_area *va; might_sleep(); BUG_ON(!addr); BUG_ON(addr < VMALLOC_START); BUG_ON(addr > VMALLOC_END); BUG_ON(!PAGE_ALIGNED(addr)); kasan_poison_vmalloc(mem, size); if (likely(count <= VMAP_MAX_ALLOC)) { debug_check_no_locks_freed(mem, size); vb_free(addr, size); return; } va = find_unlink_vmap_area(addr); if (WARN_ON_ONCE(!va)) return; debug_check_no_locks_freed((void *)va->va_start, va_size(va)); free_unmap_vmap_area(va); } EXPORT_SYMBOL(vm_unmap_ram); /** * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space) * @pages: an array of pointers to the pages to be mapped * @count: number of pages * @node: prefer to allocate data structures on this node * * If you use this function for less than VMAP_MAX_ALLOC pages, it could be * faster than vmap so it's good. But if you mix long-life and short-life * objects with vm_map_ram(), it could consume lots of address space through * fragmentation (especially on a 32bit machine). You could see failures in * the end. Please use this function for short-lived objects. * * Returns: a pointer to the address that has been mapped, or %NULL on failure */ void *vm_map_ram(struct page **pages, unsigned int count, int node) { unsigned long size = (unsigned long)count << PAGE_SHIFT; unsigned long addr; void *mem; if (likely(count <= VMAP_MAX_ALLOC)) { mem = vb_alloc(size, GFP_KERNEL); if (IS_ERR(mem)) return NULL; addr = (unsigned long)mem; } else { struct vmap_area *va; va = alloc_vmap_area(size, PAGE_SIZE, VMALLOC_START, VMALLOC_END, node, GFP_KERNEL, VMAP_RAM, NULL); if (IS_ERR(va)) return NULL; addr = va->va_start; mem = (void *)addr; } if (vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, PAGE_SHIFT) < 0) { vm_unmap_ram(mem, count); return NULL; } /* * Mark the pages as accessible, now that they are mapped. 
* With hardware tag-based KASAN, marking is skipped for * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL); return mem; } EXPORT_SYMBOL(vm_map_ram); static struct vm_struct *vmlist __initdata; static inline unsigned int vm_area_page_order(struct vm_struct *vm) { #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC return vm->page_order; #else return 0; #endif } unsigned int get_vm_area_page_order(struct vm_struct *vm) { return vm_area_page_order(vm); } static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order) { #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC vm->page_order = order; #else BUG_ON(order != 0); #endif } /** * vm_area_add_early - add vmap area early during boot * @vm: vm_struct to add * * This function is used to add fixed kernel vm area to vmlist before * vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags * should contain proper values and the other fields should be zero. * * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. */ void __init vm_area_add_early(struct vm_struct *vm) { struct vm_struct *tmp, **p; BUG_ON(vmap_initialized); for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { if (tmp->addr >= vm->addr) { BUG_ON(tmp->addr < vm->addr + vm->size); break; } else BUG_ON(tmp->addr + tmp->size > vm->addr); } vm->next = *p; *p = vm; } /** * vm_area_register_early - register vmap area early during boot * @vm: vm_struct to register * @align: requested alignment * * This function is used to register kernel vm area before * vmalloc_init() is called. @vm->size and @vm->flags should contain * proper values on entry and other fields should be zero. On return, * vm->addr contains the allocated address. * * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. */ void __init vm_area_register_early(struct vm_struct *vm, size_t align) { unsigned long addr = ALIGN(VMALLOC_START, align); struct vm_struct *cur, **p; BUG_ON(vmap_initialized); for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) { if ((unsigned long)cur->addr - addr >= vm->size) break; addr = ALIGN((unsigned long)cur->addr + cur->size, align); } BUG_ON(addr > VMALLOC_END - vm->size); vm->addr = (void *)addr; vm->next = *p; *p = vm; kasan_populate_early_vm_area_shadow(vm->addr, vm->size); } void clear_vm_uninitialized_flag(struct vm_struct *vm) { /* * Before removing VM_UNINITIALIZED, * we should make sure that vm has proper values. * Pair with smp_rmb() in vread_iter() and vmalloc_info_show(). */ smp_wmb(); vm->flags &= ~VM_UNINITIALIZED; } struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long shift, unsigned long flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, const void *caller) { struct vmap_area *va; struct vm_struct *area; unsigned long requested_size = size; BUG_ON(in_interrupt()); size = ALIGN(size, 1ul << shift); if (unlikely(!size)) return NULL; if (flags & VM_IOREMAP) align = 1ul << clamp_t(int, get_count_order_long(size), PAGE_SHIFT, IOREMAP_MAX_ORDER); area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!area)) return NULL; if (!(flags & VM_NO_GUARD)) size += PAGE_SIZE; area->flags = flags; area->caller = caller; area->requested_size = requested_size; va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area); if (IS_ERR(va)) { kfree(area); return NULL; } /* * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a * best-effort approach, as they can be mapped outside of vmalloc code. 
* For VM_ALLOC mappings, the pages are marked as accessible after * getting mapped in __vmalloc_node_range(). * With hardware tag-based KASAN, marking is skipped for * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ if (!(flags & VM_ALLOC)) area->addr = kasan_unpoison_vmalloc(area->addr, requested_size, KASAN_VMALLOC_PROT_NORMAL); return area; } struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, const void *caller) { return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end, NUMA_NO_NODE, GFP_KERNEL, caller); } /** * get_vm_area - reserve a contiguous kernel virtual area * @size: size of the area * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC * * Search an area of @size in the kernel virtual mapping area, * and reserved it for out purposes. Returns the area descriptor * on success or %NULL on failure. * * Return: the area descriptor on success or %NULL on failure. */ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) { return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, VMALLOC_START, VMALLOC_END, NUMA_NO_NODE, GFP_KERNEL, __builtin_return_address(0)); } struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, const void *caller) { return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, VMALLOC_START, VMALLOC_END, NUMA_NO_NODE, GFP_KERNEL, caller); } /** * find_vm_area - find a continuous kernel virtual area * @addr: base address * * Search for the kernel VM area starting at @addr, and return it. * It is up to the caller to do all required locking to keep the returned * pointer valid. * * Return: the area descriptor on success or %NULL on failure. */ struct vm_struct *find_vm_area(const void *addr) { struct vmap_area *va; va = find_vmap_area((unsigned long)addr); if (!va) return NULL; return va->vm; } /** * remove_vm_area - find and remove a continuous kernel virtual area * @addr: base address * * Search for the kernel VM area starting at @addr, and remove it. * This function returns the found VM area, but using it is NOT safe * on SMP machines, except for its size or flags. * * Return: the area descriptor on success or %NULL on failure. */ struct vm_struct *remove_vm_area(const void *addr) { struct vmap_area *va; struct vm_struct *vm; might_sleep(); if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", addr)) return NULL; va = find_unlink_vmap_area((unsigned long)addr); if (!va || !va->vm) return NULL; vm = va->vm; debug_check_no_locks_freed(vm->addr, get_vm_area_size(vm)); debug_check_no_obj_freed(vm->addr, get_vm_area_size(vm)); kasan_free_module_shadow(vm); kasan_poison_vmalloc(vm->addr, get_vm_area_size(vm)); free_unmap_vmap_area(va); return vm; } static inline void set_area_direct_map(const struct vm_struct *area, int (*set_direct_map)(struct page *page)) { int i; /* HUGE_VMALLOC passes small pages to set_direct_map */ for (i = 0; i < area->nr_pages; i++) if (page_address(area->pages[i])) set_direct_map(area->pages[i]); } /* * Flush the vm mapping and reset the direct map. */ static void vm_reset_perms(struct vm_struct *area) { unsigned long start = ULONG_MAX, end = 0; unsigned int page_order = vm_area_page_order(area); int flush_dmap = 0; int i; /* * Find the start and end range of the direct mappings to make sure that * the vm_unmap_aliases() flush includes the direct map. 
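 * Pages without a direct mapping (page_address() returns NULL, e.g.
 * highmem) are skipped and do not extend the flush range.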
*/ for (i = 0; i < area->nr_pages; i += 1U << page_order) { unsigned long addr = (unsigned long)page_address(area->pages[i]); if (addr) { unsigned long page_size; page_size = PAGE_SIZE << page_order; start = min(addr, start); end = max(addr + page_size, end); flush_dmap = 1; } } /* * Set direct map to something invalid so that it won't be cached if * there are any accesses after the TLB flush, then flush the TLB and * reset the direct map permissions to the default. */ set_area_direct_map(area, set_direct_map_invalid_noflush); _vm_unmap_aliases(start, end, flush_dmap); set_area_direct_map(area, set_direct_map_default_noflush); } static void delayed_vfree_work(struct work_struct *w) { struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); struct llist_node *t, *llnode; llist_for_each_safe(llnode, t, llist_del_all(&p->list)) vfree(llnode); } /** * vfree_atomic - release memory allocated by vmalloc() * @addr: memory base address * * This one is just like vfree() but can be called in any atomic context * except NMIs. */ void vfree_atomic(const void *addr) { struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); BUG_ON(in_nmi()); kmemleak_free(addr); /* * Use raw_cpu_ptr() because this can be called from preemptible * context. Preemption is absolutely fine here, because the llist_add() * implementation is lockless, so it works even if we are adding to * another cpu's list. schedule_work() should be fine with this too. */ if (addr && llist_add((struct llist_node *)addr, &p->list)) schedule_work(&p->wq); } /** * vfree - Release memory allocated by vmalloc() * @addr: Memory base address * * Free the virtually continuous memory area starting at @addr, as obtained * from one of the vmalloc() family of APIs. This will usually also free the * physical memory underlying the virtual allocation, but that memory is * reference counted, so it will not be freed until the last user goes away. * * If @addr is NULL, no operation is performed. * * Context: * May sleep if called *not* from interrupt context. * Must not be called in NMI context (strictly speaking, it could be * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling * conventions for vfree() arch-dependent would be a really bad idea). */ void vfree(const void *addr) { struct vm_struct *vm; int i; if (unlikely(in_interrupt())) { vfree_atomic(addr); return; } BUG_ON(in_nmi()); kmemleak_free(addr); might_sleep(); if (!addr) return; vm = remove_vm_area(addr); if (unlikely(!vm)) { WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); return; } if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS)) vm_reset_perms(vm); for (i = 0; i < vm->nr_pages; i++) { struct page *page = vm->pages[i]; BUG_ON(!page); /* * High-order allocs for huge vmallocs are split, so * can be freed as an array of order-0 allocations */ if (!(vm->flags & VM_MAP_PUT_PAGES)) mod_lruvec_page_state(page, NR_VMALLOC, -1); __free_page(page); cond_resched(); } kvfree(vm->pages); kfree(vm); } EXPORT_SYMBOL(vfree); /** * vunmap - release virtual mapping obtained by vmap() * @addr: memory base address * * Free the virtually contiguous memory area starting at @addr, * which was created from the page array passed to vmap(). * * Must not be called in interrupt context. 
*/ void vunmap(const void *addr) { struct vm_struct *vm; BUG_ON(in_interrupt()); might_sleep(); if (!addr) return; vm = remove_vm_area(addr); if (unlikely(!vm)) { WARN(1, KERN_ERR "Trying to vunmap() nonexistent vm area (%p)\n", addr); return; } kfree(vm); } EXPORT_SYMBOL(vunmap); /** * vmap - map an array of pages into virtually contiguous space * @pages: array of page pointers * @count: number of pages to map * @flags: vm_area->flags * @prot: page protection for the mapping * * Maps @count pages from @pages into contiguous kernel virtual space. * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself * (which must be kmalloc or vmalloc memory) and one reference per pages in it * are transferred from the caller to vmap(), and will be freed / dropped when * vfree() is called on the return value. * * Return: the address of the area or %NULL on failure */ void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) { struct vm_struct *area; unsigned long addr; unsigned long size; /* In bytes */ might_sleep(); if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS)) return NULL; /* * Your top guard is someone else's bottom guard. Not having a top * guard compromises someone else's mappings too. */ if (WARN_ON_ONCE(flags & VM_NO_GUARD)) flags &= ~VM_NO_GUARD; if (count > totalram_pages()) return NULL; size = (unsigned long)count << PAGE_SHIFT; area = get_vm_area_caller(size, flags, __builtin_return_address(0)); if (!area) return NULL; addr = (unsigned long)area->addr; if (vmap_pages_range(addr, addr + size, pgprot_nx(prot), pages, PAGE_SHIFT) < 0) { vunmap(area->addr); return NULL; } if (flags & VM_MAP_PUT_PAGES) { area->pages = pages; area->nr_pages = count; } return area->addr; } EXPORT_SYMBOL(vmap); #ifdef CONFIG_VMAP_PFN struct vmap_pfn_data { unsigned long *pfns; pgprot_t prot; unsigned int idx; }; static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private) { struct vmap_pfn_data *data = private; unsigned long pfn = data->pfns[data->idx]; pte_t ptent; if (WARN_ON_ONCE(pfn_valid(pfn))) return -EINVAL; ptent = pte_mkspecial(pfn_pte(pfn, data->prot)); set_pte_at(&init_mm, addr, pte, ptent); data->idx++; return 0; } /** * vmap_pfn - map an array of PFNs into virtually contiguous space * @pfns: array of PFNs * @count: number of pages to map * @prot: page protection for the mapping * * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns * the start address of the mapping. */ void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot) { struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) }; struct vm_struct *area; area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP, __builtin_return_address(0)); if (!area) return NULL; if (apply_to_page_range(&init_mm, (unsigned long)area->addr, count * PAGE_SIZE, vmap_pfn_apply, &data)) { free_vm_area(area); return NULL; } flush_cache_vmap((unsigned long)area->addr, (unsigned long)area->addr + count * PAGE_SIZE); return area->addr; } EXPORT_SYMBOL_GPL(vmap_pfn); #endif /* CONFIG_VMAP_PFN */ /* * Helper for vmalloc to adjust the gfp flags for certain allocations. 
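 *
 * __GFP_NOWARN is always added because vmalloc reports failures itself;
 * for large (high-order) attempts __GFP_NOFAIL is dropped so a failed
 * high-order allocation can fall back to order-0 pages instead of
 * retrying forever.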
*/ static inline gfp_t vmalloc_gfp_adjust(gfp_t flags, const bool large) { flags |= __GFP_NOWARN; if (large) flags &= ~__GFP_NOFAIL; return flags; } static inline unsigned int vm_area_alloc_pages(gfp_t gfp, int nid, unsigned int order, unsigned int nr_pages, struct page **pages) { unsigned int nr_allocated = 0; unsigned int nr_remaining = nr_pages; unsigned int max_attempt_order = MAX_PAGE_ORDER; struct page *page; int i; unsigned int large_order = ilog2(nr_remaining); gfp_t large_gfp = vmalloc_gfp_adjust(gfp, large_order) & ~__GFP_DIRECT_RECLAIM; large_order = min(max_attempt_order, large_order); /* * Initially, attempt to have the page allocator give us large order * pages. Do not attempt allocating smaller than order chunks since * __vmap_pages_range() expects physically contigous pages of exactly * order long chunks. */ while (large_order > order && nr_remaining) { if (nid == NUMA_NO_NODE) page = alloc_pages_noprof(large_gfp, large_order); else page = alloc_pages_node_noprof(nid, large_gfp, large_order); if (unlikely(!page)) { max_attempt_order = --large_order; continue; } mod_lruvec_page_state(page, NR_VMALLOC, 1 << large_order); split_page(page, large_order); for (i = 0; i < (1U << large_order); i++) pages[nr_allocated + i] = page + i; nr_allocated += 1U << large_order; nr_remaining = nr_pages - nr_allocated; large_order = ilog2(nr_remaining); large_order = min(max_attempt_order, large_order); } /* * For order-0 pages we make use of bulk allocator, if * the page array is partly or not at all populated due * to fails, fallback to a single page allocator that is * more permissive. */ if (!order) { while (nr_allocated < nr_pages) { unsigned int nr, nr_pages_request; int i; /* * A maximum allowed request is hard-coded and is 100 * pages per call. That is done in order to prevent a * long preemption off scenario in the bulk-allocator * so the range is [1:100]. */ nr_pages_request = min(100U, nr_pages - nr_allocated); /* memory allocation should consider mempolicy, we can't * wrongly use nearest node when nid == NUMA_NO_NODE, * otherwise memory may be allocated in only one node, * but mempolicy wants to alloc memory by interleaving. */ if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) nr = alloc_pages_bulk_mempolicy_noprof(gfp, nr_pages_request, pages + nr_allocated); else nr = alloc_pages_bulk_node_noprof(gfp, nid, nr_pages_request, pages + nr_allocated); for (i = nr_allocated; i < nr_allocated + nr; i++) mod_lruvec_page_state(pages[i], NR_VMALLOC, 1); nr_allocated += nr; /* * If zero or pages were obtained partly, * fallback to a single page allocator. */ if (nr != nr_pages_request) break; } } /* High-order pages or fallback path if "bulk" fails. */ while (nr_allocated < nr_pages) { if (!(gfp & __GFP_NOFAIL) && fatal_signal_pending(current)) break; if (nid == NUMA_NO_NODE) page = alloc_pages_noprof(gfp, order); else page = alloc_pages_node_noprof(nid, gfp, order); if (unlikely(!page)) break; mod_lruvec_page_state(page, NR_VMALLOC, 1 << order); /* * High-order allocations must be able to be treated as * independent small pages by callers (as they can with * small-page vmallocs). Some drivers do their own refcounting * on vmalloc_to_page() pages, some use page->mapping, * page->lru, etc. */ if (order) split_page(page, order); /* * Careful, we allocate and map page-order pages, but * tracking is done per PAGE_SIZE page so as to keep the * vm_struct APIs independent of the physical/mapped size. 
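 * For example, one order-3 page contributes eight consecutive entries
 * to the pages[] array below.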
*/ for (i = 0; i < (1U << order); i++) pages[nr_allocated + i] = page + i; nr_allocated += 1U << order; } return nr_allocated; } static LLIST_HEAD(pending_vm_area_cleanup); static void cleanup_vm_area_work(struct work_struct *work) { struct vm_struct *area, *tmp; struct llist_node *head; head = llist_del_all(&pending_vm_area_cleanup); if (!head) return; llist_for_each_entry_safe(area, tmp, head, llnode) { if (!area->pages) free_vm_area(area); else vfree(area->addr); } } /* * Helper for __vmalloc_area_node() to defer cleanup * of partially initialized vm_struct in error paths. */ static DECLARE_WORK(cleanup_vm_area, cleanup_vm_area_work); static void defer_vm_area_cleanup(struct vm_struct *area) { if (llist_add(&area->llnode, &pending_vm_area_cleanup)) schedule_work(&cleanup_vm_area); } /* * Page tables allocations ignore external GFP. Enforces it by * the memalloc scope API. It is used by vmalloc internals and * KASAN shadow population only. * * GFP to scope mapping: * * non-blocking (no __GFP_DIRECT_RECLAIM) - memalloc_noreclaim_save() * GFP_NOFS - memalloc_nofs_save() * GFP_NOIO - memalloc_noio_save() * __GFP_RETRY_MAYFAIL, __GFP_NORETRY - memalloc_noreclaim_save() * to prevent OOMs * * Returns a flag cookie to pair with restore. */ unsigned int memalloc_apply_gfp_scope(gfp_t gfp_mask) { unsigned int flags = 0; if (!gfpflags_allow_blocking(gfp_mask) || (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_NORETRY))) flags = memalloc_noreclaim_save(); else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) flags = memalloc_nofs_save(); else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) flags = memalloc_noio_save(); /* 0 - no scope applied. */ return flags; } void memalloc_restore_scope(unsigned int flags) { if (flags) memalloc_flags_restore(flags); } static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, unsigned int page_shift, int node) { const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; bool nofail = gfp_mask & __GFP_NOFAIL; unsigned long addr = (unsigned long)area->addr; unsigned long size = get_vm_area_size(area); unsigned long array_size; unsigned int nr_small_pages = size >> PAGE_SHIFT; unsigned int page_order; unsigned int flags; int ret; array_size = (unsigned long)nr_small_pages * sizeof(struct page *); /* __GFP_NOFAIL and "noblock" flags are mutually exclusive. */ if (!gfpflags_allow_blocking(gfp_mask)) nofail = false; if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) gfp_mask |= __GFP_HIGHMEM; /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { area->pages = __vmalloc_node_noprof(array_size, 1, nested_gfp, node, area->caller); } else { area->pages = kmalloc_node_noprof(array_size, nested_gfp, node); } if (!area->pages) { warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to allocated page array size %lu", nr_small_pages * PAGE_SIZE, array_size); goto fail; } set_vm_area_page_order(area, page_shift - PAGE_SHIFT); page_order = vm_area_page_order(area); /* * High-order nofail allocations are really expensive and * potentially dangerous (pre-mature OOM, disruptive reclaim * and compaction etc. * * Please note, the __vmalloc_node_range_noprof() falls-back * to order-0 pages if high-order attempt is unsuccessful. */ area->nr_pages = vm_area_alloc_pages( vmalloc_gfp_adjust(gfp_mask, page_order), node, page_order, nr_small_pages, area->pages); /* * If not enough pages were obtained to accomplish an * allocation request, free them via vfree() if any. 
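 * (The actual freeing happens via defer_vm_area_cleanup() on the "fail"
 * path below, which hands the partially set up area to a workqueue.)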
*/ if (area->nr_pages != nr_small_pages) { /* * vm_area_alloc_pages() can fail due to insufficient memory but * also:- * * - a pending fatal signal * - insufficient huge page-order pages * * Since we always retry allocations at order-0 in the huge page * case a warning for either is spurious. */ if (!fatal_signal_pending(current) && page_order == 0) warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to allocate pages", nr_small_pages * PAGE_SIZE); goto fail; } /* * page tables allocations ignore external gfp mask, enforce it * by the scope API */ flags = memalloc_apply_gfp_scope(gfp_mask); do { ret = __vmap_pages_range(addr, addr + size, prot, area->pages, page_shift, nested_gfp); if (nofail && (ret < 0)) schedule_timeout_uninterruptible(1); } while (nofail && (ret < 0)); memalloc_restore_scope(flags); if (ret < 0) { warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to map pages", area->nr_pages * PAGE_SIZE); goto fail; } return area->addr; fail: defer_vm_area_cleanup(area); return NULL; } /* * See __vmalloc_node_range() for a clear list of supported vmalloc flags. * This gfp lists all flags currently passed through vmalloc. Currently, * __GFP_ZERO is used by BPF and __GFP_NORETRY is used by percpu. Both drm * and BPF also use GFP_USER. Additionally, various users pass * GFP_KERNEL_ACCOUNT. Xfs uses __GFP_NOLOCKDEP. */ #define GFP_VMALLOC_SUPPORTED (GFP_KERNEL | GFP_ATOMIC | GFP_NOWAIT |\ __GFP_NOFAIL | __GFP_ZERO |\ __GFP_NORETRY | __GFP_RETRY_MAYFAIL |\ GFP_NOFS | GFP_NOIO | GFP_KERNEL_ACCOUNT |\ GFP_USER | __GFP_NOLOCKDEP) static gfp_t vmalloc_fix_flags(gfp_t flags) { gfp_t invalid_mask = flags & ~GFP_VMALLOC_SUPPORTED; flags &= GFP_VMALLOC_SUPPORTED; WARN_ONCE(1, "Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n", invalid_mask, &invalid_mask, flags, &flags); return flags; } /** * __vmalloc_node_range - allocate virtually contiguous memory * @size: allocation size * @align: desired alignment * @start: vm area range start * @end: vm area range end * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) * @node: node to use for allocation or NUMA_NO_NODE * @caller: caller's return address * * Allocate enough pages to cover @size from the page level * allocator with @gfp_mask flags and map them into contiguous * virtual range with protection @prot. * * Supported GFP classes: %GFP_KERNEL, %GFP_ATOMIC, %GFP_NOWAIT, * %__GFP_RETRY_MAYFAIL, %__GFP_NORETRY, %GFP_NOFS and %GFP_NOIO. * Zone modifiers are not supported. * Please note %GFP_ATOMIC and %GFP_NOWAIT are supported only * by __vmalloc(). * * Retry modifiers: only %__GFP_NOFAIL is fully supported; * %__GFP_NORETRY and %__GFP_RETRY_MAYFAIL are supported with limitation, * i.e. page tables are allocated with NOWAIT semantic so they might fail * under moderate memory pressure. * * %__GFP_NOWARN can be used to suppress failure messages. * * Can not be called from interrupt nor NMI contexts. 
* Return: the address of the area or %NULL on failure */ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller) { struct vm_struct *area; void *ret; kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE; unsigned long original_align = align; unsigned int shift = PAGE_SHIFT; if (WARN_ON_ONCE(!size)) return NULL; if ((size >> PAGE_SHIFT) > totalram_pages()) { warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, exceeds total pages", size); return NULL; } if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) { /* * Try huge pages. Only try for PAGE_KERNEL allocations, * others like modules don't yet expect huge pages in * their allocations due to apply_to_page_range not * supporting them. */ if (arch_vmap_pmd_supported(prot) && size >= PMD_SIZE) shift = PMD_SHIFT; else shift = arch_vmap_pte_supported_shift(size); align = max(original_align, 1UL << shift); } again: area = __get_vm_area_node(size, align, shift, VM_ALLOC | VM_UNINITIALIZED | vm_flags, start, end, node, gfp_mask, caller); if (!area) { bool nofail = gfp_mask & __GFP_NOFAIL; warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, vm_struct allocation failed%s", size, (nofail) ? ". Retrying." : ""); if (nofail) { schedule_timeout_uninterruptible(1); goto again; } goto fail; } /* * Prepare arguments for __vmalloc_area_node() and * kasan_unpoison_vmalloc(). */ if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) { if (kasan_hw_tags_enabled()) { /* * Modify protection bits to allow tagging. * This must be done before mapping. */ prot = arch_vmap_pgprot_tagged(prot); /* * Skip page_alloc poisoning and zeroing for physical * pages backing VM_ALLOC mapping. Memory is instead * poisoned and zeroed by kasan_unpoison_vmalloc(). */ gfp_mask |= __GFP_SKIP_KASAN | __GFP_SKIP_ZERO; } /* Take note that the mapping is PAGE_KERNEL. */ kasan_flags |= KASAN_VMALLOC_PROT_NORMAL; } /* Allocate physical pages and map them into vmalloc space. */ ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node); if (!ret) goto fail; /* * Mark the pages as accessible, now that they are mapped. * The condition for setting KASAN_VMALLOC_INIT should complement the * one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check * to make sure that memory is initialized under the same conditions. * Tag-based KASAN modes only assign tags to normal non-executable * allocations, see __kasan_unpoison_vmalloc(). */ kasan_flags |= KASAN_VMALLOC_VM_ALLOC; if (!want_init_on_free() && want_init_on_alloc(gfp_mask) && (gfp_mask & __GFP_SKIP_ZERO)) kasan_flags |= KASAN_VMALLOC_INIT; /* KASAN_VMALLOC_PROT_NORMAL already set if required. */ area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags); /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED * flag. It means that vm_struct is not fully initialized. * Now, it is fully initialized, so remove this flag here. 
*/ clear_vm_uninitialized_flag(area); if (!(vm_flags & VM_DEFER_KMEMLEAK)) kmemleak_vmalloc(area, PAGE_ALIGN(size), gfp_mask); return area->addr; fail: if (shift > PAGE_SHIFT) { shift = PAGE_SHIFT; align = original_align; goto again; } return NULL; } /** * __vmalloc_node - allocate virtually contiguous memory * @size: allocation size * @align: desired alignment * @gfp_mask: flags for the page level allocator * @node: node to use for allocation or NUMA_NO_NODE * @caller: caller's return address * * Allocate enough pages to cover @size from the page level allocator with * @gfp_mask flags. Map them into contiguous kernel virtual space. * * Semantics of @gfp_mask (including reclaim/retry modifiers such as * __GFP_NOFAIL) are the same as in __vmalloc_node_range_noprof(). * * Return: pointer to the allocated memory or %NULL on error */ void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) { return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, 0, node, caller); } /* * This is only for performance analysis of vmalloc and stress purpose. * It is required by vmalloc test module, therefore do not use it other * than that. */ #ifdef CONFIG_TEST_VMALLOC_MODULE EXPORT_SYMBOL_GPL(__vmalloc_node_noprof); #endif void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) { if (unlikely(gfp_mask & ~GFP_VMALLOC_SUPPORTED)) gfp_mask = vmalloc_fix_flags(gfp_mask); return __vmalloc_node_noprof(size, 1, gfp_mask, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(__vmalloc_noprof); /** * vmalloc - allocate virtually contiguous memory * @size: allocation size * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * * For tight control over page level allocator and protection flags * use __vmalloc() instead. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_noprof(unsigned long size) { return __vmalloc_node_noprof(size, 1, GFP_KERNEL, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_noprof); /** * vmalloc_huge_node - allocate virtually contiguous memory, allow huge pages * @size: allocation size * @gfp_mask: flags for the page level allocator * @node: node to use for allocation or NUMA_NO_NODE * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * If @size is greater than or equal to PMD_SIZE, allow using * huge pages for the memory * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node) { if (unlikely(gfp_mask & ~GFP_VMALLOC_SUPPORTED)) gfp_mask = vmalloc_fix_flags(gfp_mask); return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, node, __builtin_return_address(0)); } EXPORT_SYMBOL_GPL(vmalloc_huge_node_noprof); /** * vzalloc - allocate virtually contiguous memory with zero fill * @size: allocation size * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * The memory allocated is set to zero. * * For tight control over page level allocator and protection flags * use __vmalloc() instead. 
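 *
 * Purely illustrative sketch, not taken from this file ("struct entry" is a
 * placeholder type): a typical zeroed table allocation and release:
 *
 *	struct entry *table;
 *
 *	table = vzalloc(array_size(nr, sizeof(*table)));
 *	if (!table)
 *		return -ENOMEM;
 *	...
 *	vfree(table);
 *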
* * Return: pointer to the allocated memory or %NULL on error */ void *vzalloc_noprof(unsigned long size) { return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vzalloc_noprof); /** * vmalloc_user - allocate zeroed virtually contiguous memory for userspace * @size: allocation size * * The resulting memory area is zeroed so it can be mapped to userspace * without leaking data. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_user_noprof(unsigned long size) { return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, VM_USERMAP, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_user_noprof); /** * vmalloc_node - allocate memory on a specific node * @size: allocation size * @node: numa node * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * * For tight control over page level allocator and protection flags * use __vmalloc() instead. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_node_noprof(unsigned long size, int node) { return __vmalloc_node_noprof(size, 1, GFP_KERNEL, node, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_node_noprof); /** * vzalloc_node - allocate memory on a specific node with zero fill * @size: allocation size * @node: numa node * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * The memory allocated is set to zero. * * Return: pointer to the allocated memory or %NULL on error */ void *vzalloc_node_noprof(unsigned long size, int node) { return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, node, __builtin_return_address(0)); } EXPORT_SYMBOL(vzalloc_node_noprof); /** * vrealloc_node_align - reallocate virtually contiguous memory; contents * remain unchanged * @p: object to reallocate memory for * @size: the size to reallocate * @align: requested alignment * @flags: the flags for the page level allocator * @nid: node number of the target node * * If @p is %NULL, vrealloc_XXX() behaves exactly like vmalloc_XXX(). If @size * is 0 and @p is not a %NULL pointer, the object pointed to is freed. * * If the caller wants the new memory to be on specific node *only*, * __GFP_THISNODE flag should be set, otherwise the function will try to avoid * reallocation and possibly disregard the specified @nid. * * If __GFP_ZERO logic is requested, callers must ensure that, starting with the * initial memory allocation, every subsequent call to this API for the same * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that * __GFP_ZERO is not fully honored by this API. * * Requesting an alignment that is bigger than the alignment of the existing * allocation will fail. * * In any case, the contents of the object pointed to are preserved up to the * lesser of the new and old sizes. * * This function must not be called concurrently with itself or vfree() for the * same memory allocation. 
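 *
 * Purely illustrative sketch, not taken from this file (it assumes the usual
 * non-_noprof wrapper name): growing a zeroed buffer while honouring the
 * __GFP_ZERO rule described above:
 *
 *	void *tmp;
 *
 *	tmp = vrealloc_node_align(buf, new_size, 1,
 *				  GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE);
 *	if (!tmp)
 *		return -ENOMEM;
 *	buf = tmp;
 *
 * On failure the old buffer remains valid and must still be freed by the
 * caller.
 *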
* * Return: pointer to the allocated memory; %NULL if @size is zero or in case of * failure */ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, gfp_t flags, int nid) { struct vm_struct *vm = NULL; size_t alloced_size = 0; size_t old_size = 0; void *n; if (!size) { vfree(p); return NULL; } if (p) { vm = find_vm_area(p); if (unlikely(!vm)) { WARN(1, "Trying to vrealloc() nonexistent vm area (%p)\n", p); return NULL; } alloced_size = get_vm_area_size(vm); old_size = vm->requested_size; if (WARN(alloced_size < old_size, "vrealloc() has mismatched area vs requested sizes (%p)\n", p)) return NULL; if (WARN(!IS_ALIGNED((unsigned long)p, align), "will not reallocate with a bigger alignment (0x%lx)\n", align)) return NULL; if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE && nid != page_to_nid(vmalloc_to_page(p))) goto need_realloc; } /* * TODO: Shrink the vm_area, i.e. unmap and free unused pages. What * would be a good heuristic for when to shrink the vm_area? */ if (size <= old_size) { /* Zero out "freed" memory, potentially for future realloc. */ if (want_init_on_free() || want_init_on_alloc(flags)) memset((void *)p + size, 0, old_size - size); vm->requested_size = size; kasan_vrealloc(p, old_size, size); return (void *)p; } /* * We already have the bytes available in the allocation; use them. */ if (size <= alloced_size) { /* * No need to zero memory here, as unused memory will have * already been zeroed at initial allocation time or during * realloc shrink time. */ vm->requested_size = size; kasan_vrealloc(p, old_size, size); return (void *)p; } need_realloc: /* TODO: Grow the vm_area, i.e. allocate and map additional pages. */ n = __vmalloc_node_noprof(size, align, flags, nid, __builtin_return_address(0)); if (!n) return NULL; if (p) { memcpy(n, p, old_size); vfree(p); } return n; } EXPORT_SYMBOL(vrealloc_node_align_noprof); #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) #define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL) #else /* * 64b systems should always have either DMA or DMA32 zones. For others * GFP_DMA32 should do the right thing and use the normal zone. */ #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #endif /** * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) * @size: allocation size * * Allocate enough 32bit PA addressable pages to cover @size from the * page level allocator and map them into contiguous kernel virtual space. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_32_noprof(unsigned long size) { return __vmalloc_node_noprof(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32_noprof); /** * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory * @size: allocation size * * The resulting memory area is 32bit addressable and zeroed so it can be * mapped to userspace without leaking data. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_32_user_noprof(unsigned long size) { return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, VM_USERMAP, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32_user_noprof); /* * Atomically zero bytes in the iterator. * * Returns the number of zeroed bytes. 
*/ static size_t zero_iter(struct iov_iter *iter, size_t count) { size_t remains = count; while (remains > 0) { size_t num, copied; num = min_t(size_t, remains, PAGE_SIZE); copied = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, num, iter); remains -= copied; if (copied < num) break; } return count - remains; } /* * small helper routine, copy contents to iter from addr. * If the page is not present, fill zero. * * Returns the number of copied bytes. */ static size_t aligned_vread_iter(struct iov_iter *iter, const char *addr, size_t count) { size_t remains = count; struct page *page; while (remains > 0) { unsigned long offset, length; size_t copied = 0; offset = offset_in_page(addr); length = PAGE_SIZE - offset; if (length > remains) length = remains; page = vmalloc_to_page(addr); /* * To do safe access to this _mapped_ area, we need lock. But * adding lock here means that we need to add overhead of * vmalloc()/vfree() calls for this _debug_ interface, rarely * used. Instead of that, we'll use an local mapping via * copy_page_to_iter_nofault() and accept a small overhead in * this access function. */ if (page) copied = copy_page_to_iter_nofault(page, offset, length, iter); else copied = zero_iter(iter, length); addr += copied; remains -= copied; if (copied != length) break; } return count - remains; } /* * Read from a vm_map_ram region of memory. * * Returns the number of copied bytes. */ static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr, size_t count, unsigned long flags) { char *start; struct vmap_block *vb; struct xarray *xa; unsigned long offset; unsigned int rs, re; size_t remains, n; /* * If it's area created by vm_map_ram() interface directly, but * not further subdividing and delegating management to vmap_block, * handle it here. */ if (!(flags & VMAP_BLOCK)) return aligned_vread_iter(iter, addr, count); remains = count; /* * Area is split into regions and tracked with vmap_block, read out * each region and zero fill the hole between regions. */ xa = addr_to_vb_xa((unsigned long) addr); vb = xa_load(xa, addr_to_vb_idx((unsigned long)addr)); if (!vb) goto finished_zero; spin_lock(&vb->lock); if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) { spin_unlock(&vb->lock); goto finished_zero; } for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) { size_t copied; if (remains == 0) goto finished; start = vmap_block_vaddr(vb->va->va_start, rs); if (addr < start) { size_t to_zero = min_t(size_t, start - addr, remains); size_t zeroed = zero_iter(iter, to_zero); addr += zeroed; remains -= zeroed; if (remains == 0 || zeroed != to_zero) goto finished; } /*it could start reading from the middle of used region*/ offset = offset_in_page(addr); n = ((re - rs + 1) << PAGE_SHIFT) - offset; if (n > remains) n = remains; copied = aligned_vread_iter(iter, start + offset, n); addr += copied; remains -= copied; if (copied != n) goto finished; } spin_unlock(&vb->lock); finished_zero: /* zero-fill the left dirty or free regions */ return count - remains + zero_iter(iter, remains); finished: /* We couldn't copy/zero everything */ spin_unlock(&vb->lock); return count - remains; } /** * vread_iter() - read vmalloc area in a safe way to an iterator. * @iter: the iterator to which data should be written. * @addr: vm address. * @count: number of bytes to be read. * * This function checks that addr is a valid vmalloc'ed area, and * copies data from that area to a given iterator. 
If the given memory range * of [addr...addr+count) includes some valid address, data is copied to * proper area of @iter. If there are memory holes, they'll be zero-filled. * IOREMAP area is treated as memory hole and no copy is done. * * If [addr...addr+count) doesn't includes any intersects with alive * vm_struct area, returns 0. * * Note: In usual ops, vread_iter() is never necessary because the caller * should know vmalloc() area is valid and can use memcpy(). * This is for routines which have to access vmalloc area without * any information, as /proc/kcore. * * Return: number of bytes for which addr and iter should be advanced * (same number as @count) or %0 if [addr...addr+count) doesn't * include any intersection with valid vmalloc area */ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) { struct vmap_node *vn; struct vmap_area *va; struct vm_struct *vm; char *vaddr; size_t n, size, flags, remains; unsigned long next; addr = kasan_reset_tag(addr); /* Don't allow overflow */ if ((unsigned long) addr + count < count) count = -(unsigned long) addr; remains = count; vn = find_vmap_area_exceed_addr_lock((unsigned long) addr, &va); if (!vn) goto finished_zero; /* no intersects with alive vmap_area */ if ((unsigned long)addr + remains <= va->va_start) goto finished_zero; do { size_t copied; if (remains == 0) goto finished; vm = va->vm; flags = va->flags & VMAP_FLAGS_MASK; /* * VMAP_BLOCK indicates a sub-type of vm_map_ram area, need * be set together with VMAP_RAM. */ WARN_ON(flags == VMAP_BLOCK); if (!vm && !flags) goto next_va; if (vm && (vm->flags & VM_UNINITIALIZED)) goto next_va; /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ smp_rmb(); vaddr = (char *) va->va_start; size = vm ? get_vm_area_size(vm) : va_size(va); if (addr >= vaddr + size) goto next_va; if (addr < vaddr) { size_t to_zero = min_t(size_t, vaddr - addr, remains); size_t zeroed = zero_iter(iter, to_zero); addr += zeroed; remains -= zeroed; if (remains == 0 || zeroed != to_zero) goto finished; } n = vaddr + size - addr; if (n > remains) n = remains; if (flags & VMAP_RAM) copied = vmap_ram_vread_iter(iter, addr, n, flags); else if (!(vm && (vm->flags & (VM_IOREMAP | VM_SPARSE)))) copied = aligned_vread_iter(iter, addr, n); else /* IOREMAP | SPARSE area is treated as memory hole */ copied = zero_iter(iter, n); addr += copied; remains -= copied; if (copied != n) goto finished; next_va: next = va->va_end; spin_unlock(&vn->busy.lock); } while ((vn = find_vmap_area_exceed_addr_lock(next, &va))); finished_zero: if (vn) spin_unlock(&vn->busy.lock); /* zero-fill memory holes */ return count - remains + zero_iter(iter, remains); finished: /* Nothing remains, or We couldn't copy/zero everything. */ if (vn) spin_unlock(&vn->busy.lock); return count - remains; } /** * remap_vmalloc_range_partial - map vmalloc pages to userspace * @vma: vma to cover * @uaddr: target user address to start at * @kaddr: virtual address of vmalloc kernel memory * @pgoff: offset from @kaddr to start at * @size: size of map area * * Returns: 0 for success, -Exxx on failure * * This function checks that @kaddr is a valid vmalloc'ed area, * and that it is big enough to cover the range starting at * @uaddr in @vma. Will return failure if that criteria isn't * met. 
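 *
 * Purely illustrative sketch, not taken from this file ("my_mmap" and "kbuf"
 * are placeholders; kbuf is assumed to come from vmalloc_user() so the area
 * carries VM_USERMAP): a driver mmap handler mapping the whole buffer:
 *
 *	static int my_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		return remap_vmalloc_range_partial(vma, vma->vm_start, kbuf,
 *						   vma->vm_pgoff,
 *						   vma->vm_end - vma->vm_start);
 *	}
 *
 * Most callers can simply use remap_vmalloc_range() below, which does the
 * same for the full VMA.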
* * Similar to remap_pfn_range() (see mm/memory.c) */ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, void *kaddr, unsigned long pgoff, unsigned long size) { struct vm_struct *area; unsigned long off; unsigned long end_index; if (check_shl_overflow(pgoff, PAGE_SHIFT, &off)) return -EINVAL; size = PAGE_ALIGN(size); if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr)) return -EINVAL; area = find_vm_area(kaddr); if (!area) return -EINVAL; if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT))) return -EINVAL; if (check_add_overflow(size, off, &end_index) || end_index > get_vm_area_size(area)) return -EINVAL; kaddr += off; do { struct page *page = vmalloc_to_page(kaddr); int ret; ret = vm_insert_page(vma, uaddr, page); if (ret) return ret; uaddr += PAGE_SIZE; kaddr += PAGE_SIZE; size -= PAGE_SIZE; } while (size > 0); vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); return 0; } /** * remap_vmalloc_range - map vmalloc pages to userspace * @vma: vma to cover (map full range of vma) * @addr: vmalloc memory * @pgoff: number of pages into addr before first page to map * * Returns: 0 for success, -Exxx on failure * * This function checks that addr is a valid vmalloc'ed area, and * that it is big enough to cover the vma. Will return failure if * that criteria isn't met. * * Similar to remap_pfn_range() (see mm/memory.c) */ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff) { return remap_vmalloc_range_partial(vma, vma->vm_start, addr, pgoff, vma->vm_end - vma->vm_start); } EXPORT_SYMBOL(remap_vmalloc_range); void free_vm_area(struct vm_struct *area) { struct vm_struct *ret; ret = remove_vm_area(area->addr); BUG_ON(ret != area); kfree(area); } EXPORT_SYMBOL_GPL(free_vm_area); #ifdef CONFIG_SMP static struct vmap_area *node_to_va(struct rb_node *n) { return rb_entry_safe(n, struct vmap_area, rb_node); } /** * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to * @addr: target address * * Returns: vmap_area if it is found. If there is no such area * the first highest(reverse order) vmap_area is returned * i.e. va->va_start < addr && va->va_end < addr or NULL * if there are no any areas before @addr. */ static struct vmap_area * pvm_find_va_enclose_addr(unsigned long addr) { struct vmap_area *va, *tmp; struct rb_node *n; n = free_vmap_area_root.rb_node; va = NULL; while (n) { tmp = rb_entry(n, struct vmap_area, rb_node); if (tmp->va_start <= addr) { va = tmp; if (tmp->va_end >= addr) break; n = n->rb_right; } else { n = n->rb_left; } } return va; } /** * pvm_determine_end_from_reverse - find the highest aligned address * of free block below VMALLOC_END * @va: * in - the VA we start the search(reverse order); * out - the VA with the highest aligned end address. 
* @align: alignment for required highest address * * Returns: determined end address within vmap_area */ static unsigned long pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align) { unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); unsigned long addr; if (likely(*va)) { list_for_each_entry_from_reverse((*va), &free_vmap_area_list, list) { addr = min((*va)->va_end & ~(align - 1), vmalloc_end); if ((*va)->va_start < addr) return addr; } } return 0; } /** * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator * @offsets: array containing offset of each area * @sizes: array containing size of each area * @nr_vms: the number of areas to allocate * @align: alignment, all entries in @offsets and @sizes must be aligned to this * * Returns: kmalloc'd vm_struct pointer array pointing to allocated * vm_structs on success, %NULL on failure * * Percpu allocator wants to use congruent vm areas so that it can * maintain the offsets among percpu areas. This function allocates * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to * be scattered pretty far, distance between two areas easily going up * to gigabytes. To avoid interacting with regular vmallocs, these * areas are allocated from top. * * Despite its complicated look, this allocator is rather simple. It * does everything top-down and scans free blocks from the end looking * for matching base. While scanning, if any of the areas do not fit the * base address is pulled down to fit the area. Scanning is repeated till * all the areas fit and then all necessary data structures are inserted * and the result is returned. */ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, size_t align) { const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); struct vmap_area **vas, *va; struct vm_struct **vms; int area, area2, last_area, term_area; unsigned long base, start, size, end, last_end, orig_start, orig_end; bool purged = false; /* verify parameters and allocate data structures */ BUG_ON(offset_in_page(align) || !is_power_of_2(align)); for (last_area = 0, area = 0; area < nr_vms; area++) { start = offsets[area]; end = start + sizes[area]; /* is everything aligned properly? */ BUG_ON(!IS_ALIGNED(offsets[area], align)); BUG_ON(!IS_ALIGNED(sizes[area], align)); /* detect the area with the highest address */ if (start > offsets[last_area]) last_area = area; for (area2 = area + 1; area2 < nr_vms; area2++) { unsigned long start2 = offsets[area2]; unsigned long end2 = start2 + sizes[area2]; BUG_ON(start2 < end && start < end2); } } last_end = offsets[last_area] + sizes[last_area]; if (vmalloc_end - vmalloc_start < last_end) { WARN_ON(true); return NULL; } vms = kzalloc_objs(vms[0], nr_vms); vas = kzalloc_objs(vas[0], nr_vms); if (!vas || !vms) goto err_free2; for (area = 0; area < nr_vms; area++) { vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL); vms[area] = kzalloc_obj(struct vm_struct); if (!vas[area] || !vms[area]) goto err_free; } retry: spin_lock(&free_vmap_area_lock); /* start scanning - we scan from the top, begin with the last area */ area = term_area = last_area; start = offsets[area]; end = start + sizes[area]; va = pvm_find_va_enclose_addr(vmalloc_end); base = pvm_determine_end_from_reverse(&va, align) - end; while (true) { /* * base might have underflowed, add last_end before * comparing. 
*/ if (base + last_end < vmalloc_start + last_end) goto overflow; /* * Fitting base has not been found. */ if (va == NULL) goto overflow; /* * If required width exceeds current VA block, move * base downwards and then recheck. */ if (base + end > va->va_end) { base = pvm_determine_end_from_reverse(&va, align) - end; term_area = area; continue; } /* * If this VA does not fit, move base downwards and recheck. */ if (base + start < va->va_start) { va = node_to_va(rb_prev(&va->rb_node)); base = pvm_determine_end_from_reverse(&va, align) - end; term_area = area; continue; } /* * This area fits, move on to the previous one. If * the previous one is the terminal one, we're done. */ area = (area + nr_vms - 1) % nr_vms; if (area == term_area) break; start = offsets[area]; end = start + sizes[area]; va = pvm_find_va_enclose_addr(base + end); } /* we've found a fitting base, insert all va's */ for (area = 0; area < nr_vms; area++) { int ret; start = base + offsets[area]; size = sizes[area]; va = pvm_find_va_enclose_addr(start); if (WARN_ON_ONCE(va == NULL)) /* It is a BUG(), but trigger recovery instead. */ goto recovery; ret = va_clip(&free_vmap_area_root, &free_vmap_area_list, va, start, size); if (WARN_ON_ONCE(unlikely(ret))) /* It is a BUG(), but trigger recovery instead. */ goto recovery; /* Allocated area. */ va = vas[area]; va->va_start = start; va->va_end = start + size; } spin_unlock(&free_vmap_area_lock); /* populate the kasan shadow space */ for (area = 0; area < nr_vms; area++) { if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area], GFP_KERNEL)) goto err_free_shadow; } /* insert all vm's */ for (area = 0; area < nr_vms; area++) { struct vmap_node *vn = addr_to_node(vas[area]->va_start); spin_lock(&vn->busy.lock); insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head); setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC, pcpu_get_vm_areas); spin_unlock(&vn->busy.lock); } /* * Mark allocated areas as accessible. Do it now as a best-effort * approach, as they can be mapped outside of vmalloc code. * With hardware tag-based KASAN, marking is skipped for * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ kasan_unpoison_vmap_areas(vms, nr_vms, KASAN_VMALLOC_PROT_NORMAL); kfree(vas); return vms; recovery: /* * Remove previously allocated areas. There is no * need in removing these areas from the busy tree, * because they are inserted only on the final step * and when pcpu_get_vm_areas() is success. */ while (area--) { orig_start = vas[area]->va_start; orig_end = vas[area]->va_end; va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, &free_vmap_area_list); if (va) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end, KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH); vas[area] = NULL; } overflow: spin_unlock(&free_vmap_area_lock); if (!purged) { reclaim_and_purge_vmap_areas(); purged = true; /* Before "retry", check if we recover. */ for (area = 0; area < nr_vms; area++) { if (vas[area]) continue; vas[area] = kmem_cache_zalloc( vmap_area_cachep, GFP_KERNEL); if (!vas[area]) goto err_free; } goto retry; } err_free: for (area = 0; area < nr_vms; area++) { if (vas[area]) kmem_cache_free(vmap_area_cachep, vas[area]); kfree(vms[area]); } err_free2: kfree(vas); kfree(vms); return NULL; err_free_shadow: spin_lock(&free_vmap_area_lock); /* * We release all the vmalloc shadows, even the ones for regions that * hadn't been successfully added. This relies on kasan_release_vmalloc * being able to tolerate this case. 
*/ for (area = 0; area < nr_vms; area++) { orig_start = vas[area]->va_start; orig_end = vas[area]->va_end; va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, &free_vmap_area_list); if (va) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end, KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH); vas[area] = NULL; kfree(vms[area]); } spin_unlock(&free_vmap_area_lock); kfree(vas); kfree(vms); return NULL; } /** * pcpu_free_vm_areas - free vmalloc areas for percpu allocator * @vms: vm_struct pointer array returned by pcpu_get_vm_areas() * @nr_vms: the number of allocated areas * * Free vm_structs and the array allocated by pcpu_get_vm_areas(). */ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) { int i; for (i = 0; i < nr_vms; i++) free_vm_area(vms[i]); kfree(vms); } #endif /* CONFIG_SMP */ #ifdef CONFIG_PRINTK bool vmalloc_dump_obj(void *object) { const void *caller; struct vm_struct *vm; struct vmap_area *va; struct vmap_node *vn; unsigned long addr; unsigned int nr_pages; addr = PAGE_ALIGN((unsigned long) object); vn = addr_to_node(addr); if (!spin_trylock(&vn->busy.lock)) return false; va = __find_vmap_area(addr, &vn->busy.root); if (!va || !va->vm) { spin_unlock(&vn->busy.lock); return false; } vm = va->vm; addr = (unsigned long) vm->addr; caller = vm->caller; nr_pages = vm->nr_pages; spin_unlock(&vn->busy.lock); pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n", nr_pages, addr, caller); return true; } #endif #ifdef CONFIG_PROC_FS /* * Print number of pages allocated on each memory node. * * This function can only be called if CONFIG_NUMA is enabled * and VM_UNINITIALIZED bit in v->flags is disabled. */ static void show_numa_info(struct seq_file *m, struct vm_struct *v, unsigned int *counters) { unsigned int nr; unsigned int step = 1U << vm_area_page_order(v); if (!counters) return; memset(counters, 0, nr_node_ids * sizeof(unsigned int)); for (nr = 0; nr < v->nr_pages; nr += step) counters[page_to_nid(v->pages[nr])] += step; for_each_node_state(nr, N_HIGH_MEMORY) if (counters[nr]) seq_printf(m, " N%u=%u", nr, counters[nr]); } static void show_purge_info(struct seq_file *m) { struct vmap_node *vn; struct vmap_area *va; for_each_vmap_node(vn) { spin_lock(&vn->lazy.lock); list_for_each_entry(va, &vn->lazy.head, list) { seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", (void *)va->va_start, (void *)va->va_end, va_size(va)); } spin_unlock(&vn->lazy.lock); } } static int vmalloc_info_show(struct seq_file *m, void *p) { struct vmap_node *vn; struct vmap_area *va; struct vm_struct *v; unsigned int *counters; if (IS_ENABLED(CONFIG_NUMA)) counters = kmalloc_array(nr_node_ids, sizeof(unsigned int), GFP_KERNEL); for_each_vmap_node(vn) { spin_lock(&vn->busy.lock); list_for_each_entry(va, &vn->busy.head, list) { if (!va->vm) { if (va->flags & VMAP_RAM) seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", (void *)va->va_start, (void *)va->va_end, va_size(va)); continue; } v = va->vm; if (v->flags & VM_UNINITIALIZED) continue; /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ smp_rmb(); seq_printf(m, "0x%pK-0x%pK %7ld", v->addr, v->addr + v->size, v->size); if (v->caller) seq_printf(m, " %pS", v->caller); if (v->nr_pages) seq_printf(m, " pages=%d", v->nr_pages); if (v->phys_addr) seq_printf(m, " phys=%pa", &v->phys_addr); if (v->flags & VM_IOREMAP) seq_puts(m, " ioremap"); if (v->flags & VM_SPARSE) seq_puts(m, " sparse"); if (v->flags & VM_ALLOC) seq_puts(m, " vmalloc"); if (v->flags & VM_MAP) seq_puts(m, " vmap"); if (v->flags & 
VM_USERMAP) seq_puts(m, " user"); if (v->flags & VM_DMA_COHERENT) seq_puts(m, " dma-coherent"); if (is_vmalloc_addr(v->pages)) seq_puts(m, " vpages"); if (IS_ENABLED(CONFIG_NUMA)) show_numa_info(m, v, counters); seq_putc(m, '\n'); } spin_unlock(&vn->busy.lock); } /* * As a final step, dump "unpurged" areas. */ show_purge_info(m); if (IS_ENABLED(CONFIG_NUMA)) kfree(counters); return 0; } static int __init proc_vmalloc_init(void) { proc_create_single("vmallocinfo", 0400, NULL, vmalloc_info_show); return 0; } module_init(proc_vmalloc_init); #endif static void __init vmap_init_free_space(void) { unsigned long vmap_start = 1; const unsigned long vmap_end = ULONG_MAX; struct vmap_area *free; struct vm_struct *busy; /* * B F B B B F * -|-----|.....|-----|-----|-----|.....|- * | The KVA space | * |<--------------------------------->| */ for (busy = vmlist; busy; busy = busy->next) { if ((unsigned long) busy->addr - vmap_start > 0) { free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); if (!WARN_ON_ONCE(!free)) { free->va_start = vmap_start; free->va_end = (unsigned long) busy->addr; insert_vmap_area_augment(free, NULL, &free_vmap_area_root, &free_vmap_area_list); } } vmap_start = (unsigned long) busy->addr + busy->size; } if (vmap_end - vmap_start > 0) { free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); if (!WARN_ON_ONCE(!free)) { free->va_start = vmap_start; free->va_end = vmap_end; insert_vmap_area_augment(free, NULL, &free_vmap_area_root, &free_vmap_area_list); } } } static void vmap_init_nodes(void) { struct vmap_node *vn; int i; #if BITS_PER_LONG == 64 /* * A high threshold of max nodes is fixed and bound to 128, * thus a scale factor is 1 for systems where number of cores * are less or equal to specified threshold. * * As for NUMA-aware notes. For bigger systems, for example * NUMA with multi-sockets, where we can end-up with thousands * of cores in total, a "sub-numa-clustering" should be added. * * In this case a NUMA domain is considered as a single entity * with dedicated sub-nodes in it which describe one group or * set of cores. Therefore a per-domain purging is supposed to * be added as well as a per-domain balancing. */ int n = clamp_t(unsigned int, num_possible_cpus(), 1, 128); if (n > 1) { vn = kmalloc_objs(*vn, n, GFP_NOWAIT); if (vn) { /* Node partition is 16 pages. */ vmap_zone_size = (1 << 4) * PAGE_SIZE; nr_vmap_nodes = n; vmap_nodes = vn; } else { pr_err("Failed to allocate an array. Disable a node layer\n"); } } #endif for_each_vmap_node(vn) { vn->busy.root = RB_ROOT; INIT_LIST_HEAD(&vn->busy.head); spin_lock_init(&vn->busy.lock); vn->lazy.root = RB_ROOT; INIT_LIST_HEAD(&vn->lazy.head); spin_lock_init(&vn->lazy.lock); for (i = 0; i < MAX_VA_SIZE_PAGES; i++) { INIT_LIST_HEAD(&vn->pool[i].head); WRITE_ONCE(vn->pool[i].len, 0); } spin_lock_init(&vn->pool_lock); } } static unsigned long vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { unsigned long count = 0; struct vmap_node *vn; int i; for_each_vmap_node(vn) { for (i = 0; i < MAX_VA_SIZE_PAGES; i++) count += READ_ONCE(vn->pool[i].len); } return count ? count : SHRINK_EMPTY; } static unsigned long vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { struct vmap_node *vn; guard(mutex)(&vmap_purge_lock); for_each_vmap_node(vn) decay_va_pool_node(vn, true); return SHRINK_STOP; } void __init vmalloc_init(void) { struct shrinker *vmap_node_shrinker; struct vmap_area *va; struct vmap_node *vn; struct vm_struct *tmp; int i; /* * Create the cache for vmap_area objects. 
*/ vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); for_each_possible_cpu(i) { struct vmap_block_queue *vbq; struct vfree_deferred *p; vbq = &per_cpu(vmap_block_queue, i); spin_lock_init(&vbq->lock); INIT_LIST_HEAD(&vbq->free); p = &per_cpu(vfree_deferred, i); init_llist_head(&p->list); INIT_WORK(&p->wq, delayed_vfree_work); xa_init(&vbq->vmap_blocks); } /* * Setup nodes before importing vmlist. */ vmap_init_nodes(); /* Import existing vmlist entries. */ for (tmp = vmlist; tmp; tmp = tmp->next) { va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); if (WARN_ON_ONCE(!va)) continue; va->va_start = (unsigned long)tmp->addr; va->va_end = va->va_start + tmp->size; va->vm = tmp; vn = addr_to_node(va->va_start); insert_vmap_area(va, &vn->busy.root, &vn->busy.head); } /* * Now we can initialize a free vmap space. */ vmap_init_free_space(); vmap_initialized = true; vmap_node_shrinker = shrinker_alloc(0, "vmap-node"); if (!vmap_node_shrinker) { pr_err("Failed to allocate vmap-node shrinker!\n"); return; } vmap_node_shrinker->count_objects = vmap_node_shrink_count; vmap_node_shrinker->scan_objects = vmap_node_shrink_scan; shrinker_register(vmap_node_shrinker); } |
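
/*
 * Purely illustrative sketch, not code from this file: the percpu allocator
 * is the intended caller of pcpu_get_vm_areas()/pcpu_free_vm_areas() above.
 * Assuming offsets, sizes, nr_vms and align have already been computed and
 * satisfy the alignment rules documented there, usage has this shape:
 *
 *	struct vm_struct **vms;
 *
 *	vms = pcpu_get_vm_areas(offsets, sizes, nr_vms, align);
 *	if (!vms)
 *		return -ENOMEM;
 *	...map each chunk's pages at vms[i]->addr...
 *	(and on chunk teardown or error:)
 *	pcpu_free_vm_areas(vms, nr_vms);
 */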
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * vma.h
 *
 * Core VMA manipulation API implemented in vma.c.
*/ #ifndef __MM_VMA_H #define __MM_VMA_H /* * VMA lock generalization */ struct vma_prepare { struct vm_area_struct *vma; struct vm_area_struct *adj_next; struct file *file; struct address_space *mapping; struct anon_vma *anon_vma; struct vm_area_struct *insert; struct vm_area_struct *remove; struct vm_area_struct *remove2; bool skip_vma_uprobe :1; }; struct unlink_vma_file_batch { int count; struct vm_area_struct *vmas[8]; }; /* * vma munmap operation */ struct vma_munmap_struct { struct vma_iterator *vmi; struct vm_area_struct *vma; /* The first vma to munmap */ struct vm_area_struct *prev; /* vma before the munmap area */ struct vm_area_struct *next; /* vma after the munmap area */ struct list_head *uf; /* Userfaultfd list_head */ unsigned long start; /* Aligned start addr (inclusive) */ unsigned long end; /* Aligned end addr (exclusive) */ unsigned long unmap_start; /* Unmap PTE start */ unsigned long unmap_end; /* Unmap PTE end */ int vma_count; /* Number of vmas that will be removed */ bool unlock; /* Unlock after the munmap */ bool clear_ptes; /* If there are outstanding PTE to be cleared */ /* 2 byte hole */ unsigned long nr_pages; /* Number of pages being removed */ unsigned long locked_vm; /* Number of locked pages */ unsigned long nr_accounted; /* Number of VM_ACCOUNT pages */ unsigned long exec_vm; unsigned long stack_vm; unsigned long data_vm; }; enum vma_merge_state { VMA_MERGE_START, VMA_MERGE_ERROR_NOMEM, VMA_MERGE_NOMERGE, VMA_MERGE_SUCCESS, }; /* * Describes a VMA merge operation and is threaded throughout it. * * Any of the fields may be mutated by the merge operation, so no guarantees are * made to the contents of this structure after a merge operation has completed. */ struct vma_merge_struct { struct mm_struct *mm; struct vma_iterator *vmi; /* * Adjacent VMAs, any of which may be NULL if not present: * * |------|--------|------| * | prev | middle | next | * |------|--------|------| * * middle may not yet exist in the case of a proposed new VMA being * merged, or it may be an existing VMA. * * next may be assigned by the caller. */ struct vm_area_struct *prev; struct vm_area_struct *middle; struct vm_area_struct *next; /* This is the VMA we ultimately target to become the merged VMA. */ struct vm_area_struct *target; /* * Initially, the start, end, pgoff fields are provided by the caller * and describe the proposed new VMA range, whether modifying an * existing VMA (which will be 'middle'), or adding a new one. * * During the merge process these fields are updated to describe the new * range _including those VMAs which will be merged_. */ unsigned long start; unsigned long end; pgoff_t pgoff; union { /* Temporary while VMA flags are being converted. */ vm_flags_t vm_flags; vma_flags_t vma_flags; }; struct file *file; struct anon_vma *anon_vma; struct mempolicy *policy; struct vm_userfaultfd_ctx uffd_ctx; struct anon_vma_name *anon_name; enum vma_merge_state state; /* If copied from (i.e. mremap()'d) the VMA from which we are copying. */ struct vm_area_struct *copied_from; /* Flags which callers can use to modify merge behaviour: */ /* * If we can expand, simply do so. We know there is nothing to merge to * the right. Does not reset state upon failure to merge. The VMA * iterator is assumed to be positioned at the previous VMA, rather than * at the gap. */ bool just_expand :1; /* * If a merge is possible, but an OOM error occurs, give up and don't * execute the merge, returning NULL. */ bool give_up_on_oom :1; /* * If set, skip uprobe_mmap upon merged vma. 
*/ bool skip_vma_uprobe :1; /* Internal flags set during merge process: */ /* * Internal flag indicating the merge increases vmg->middle->vm_start * (and thereby, vmg->prev->vm_end). */ bool __adjust_middle_start :1; /* * Internal flag indicating the merge decreases vmg->next->vm_start * (and thereby, vmg->middle->vm_end). */ bool __adjust_next_start :1; /* * Internal flag used during the merge operation to indicate we will * remove vmg->middle. */ bool __remove_middle :1; /* * Internal flag used during the merge operation to indicate we will * remove vmg->next. */ bool __remove_next :1; }; struct unmap_desc { struct ma_state *mas; /* the maple state point to the first vma */ struct vm_area_struct *first; /* The first vma */ unsigned long pg_start; /* The first pagetable address to free (floor) */ unsigned long pg_end; /* The last pagetable address to free (ceiling) */ unsigned long vma_start; /* The min vma address */ unsigned long vma_end; /* The max vma address */ unsigned long tree_end; /* Maximum for the vma tree search */ unsigned long tree_reset; /* Where to reset the vma tree walk */ bool mm_wr_locked; /* If the mmap write lock is held */ }; /* * unmap_all_init() - Initialize unmap_desc to remove all vmas, point the * pg_start and pg_end to a safe location. */ static inline void unmap_all_init(struct unmap_desc *unmap, struct vma_iterator *vmi, struct vm_area_struct *vma) { unmap->mas = &vmi->mas; unmap->first = vma; unmap->pg_start = FIRST_USER_ADDRESS; unmap->pg_end = USER_PGTABLES_CEILING; unmap->vma_start = 0; unmap->vma_end = ULONG_MAX; unmap->tree_end = ULONG_MAX; unmap->tree_reset = vma->vm_end; unmap->mm_wr_locked = false; } /* * unmap_pgtable_init() - Initialize unmap_desc to remove all page tables within * the user range. * * ARM can have mappings outside of vmas. * See: e2cdef8c847b4 ("[PATCH] freepgt: free_pgtables from FIRST_USER_ADDRESS") * * ARM LPAE uses page table mappings beyond the USER_PGTABLES_CEILING * See: CONFIG_ARM_LPAE in arch/arm/include/asm/pgtable.h */ static inline void unmap_pgtable_init(struct unmap_desc *unmap, struct vma_iterator *vmi) { vma_iter_set(vmi, unmap->tree_reset); unmap->vma_start = FIRST_USER_ADDRESS; unmap->vma_end = USER_PGTABLES_CEILING; unmap->tree_end = USER_PGTABLES_CEILING; } #define UNMAP_STATE(name, _vmi, _vma, _vma_start, _vma_end, _prev, _next) \ struct unmap_desc name = { \ .mas = &(_vmi)->mas, \ .first = _vma, \ .pg_start = _prev ? ((struct vm_area_struct *)_prev)->vm_end : \ FIRST_USER_ADDRESS, \ .pg_end = _next ? ((struct vm_area_struct *)_next)->vm_start : \ USER_PGTABLES_CEILING, \ .vma_start = _vma_start, \ .vma_end = _vma_end, \ .tree_end = _next ? \ ((struct vm_area_struct *)_next)->vm_start : \ USER_PGTABLES_CEILING, \ .tree_reset = _vma->vm_end, \ .mm_wr_locked = true, \ } static inline bool vmg_nomem(struct vma_merge_struct *vmg) { return vmg->state == VMA_MERGE_ERROR_NOMEM; } /* Assumes addr >= vma->vm_start. 
*/ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, unsigned long addr) { return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start); } #define VMG_STATE(name, mm_, vmi_, start_, end_, vma_flags_, pgoff_) \ struct vma_merge_struct name = { \ .mm = mm_, \ .vmi = vmi_, \ .start = start_, \ .end = end_, \ .vma_flags = vma_flags_, \ .pgoff = pgoff_, \ .state = VMA_MERGE_START, \ } #define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_) \ struct vma_merge_struct name = { \ .mm = vma_->vm_mm, \ .vmi = vmi_, \ .prev = prev_, \ .middle = vma_, \ .next = NULL, \ .start = start_, \ .end = end_, \ .vm_flags = vma_->vm_flags, \ .pgoff = vma_pgoff_offset(vma_, start_), \ .file = vma_->vm_file, \ .anon_vma = vma_->anon_vma, \ .policy = vma_policy(vma_), \ .uffd_ctx = vma_->vm_userfaultfd_ctx, \ .anon_name = anon_vma_name(vma_), \ .state = VMA_MERGE_START, \ } #ifdef CONFIG_DEBUG_VM_MAPLE_TREE void validate_mm(struct mm_struct *mm); #else #define validate_mm(mm) do { } while (0) #endif __must_check int vma_expand(struct vma_merge_struct *vmg); __must_check int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff); static inline int vma_iter_store_gfp(struct vma_iterator *vmi, struct vm_area_struct *vma, gfp_t gfp) { if (vmi->mas.status != ma_start && ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) vma_iter_invalidate(vmi); __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); mas_store_gfp(&vmi->mas, vma, gfp); if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; vma_mark_attached(vma); return 0; } /* * Temporary helper function for stacked mmap handlers which specify * f_op->mmap() but which might have an underlying file system which implements * f_op->mmap_prepare(). */ static inline void compat_set_vma_from_desc(struct vm_area_struct *vma, struct vm_area_desc *desc) { /* * Since we're invoking .mmap_prepare() despite having a partially * established VMA, we must take care to handle setting fields * correctly. */ /* Mutable fields. Populated with initial state. */ vma->vm_pgoff = desc->pgoff; if (desc->vm_file != vma->vm_file) vma_set_file(vma, desc->vm_file); vma->flags = desc->vma_flags; vma->vm_page_prot = desc->page_prot; /* User-defined fields. */ vma->vm_ops = desc->vm_ops; vma->vm_private_data = desc->private_data; } int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf, bool unlock); int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock); void remove_vma(struct vm_area_struct *vma); void unmap_region(struct unmap_desc *unmap); /** * vma_modify_flags() - Perform any necessary split/merge in preparation for * setting VMA flags to *@vm_flags in the range @start to @end contained within * @vma. * @vmi: Valid VMA iterator positioned at @vma. * @prev: The VMA immediately prior to @vma or NULL if @vma is the first. * @vma: The VMA containing the range @start to @end to be updated. * @start: The start of the range to update. May be offset within @vma. * @end: The exclusive end of the range to update, may be offset within @vma. * @vma_flags_ptr: A pointer to the VMA flags that the @start to @end range is * about to be set to. On merge, this will be updated to include sticky flags. 
* * IMPORTANT: The actual modification being requested here is NOT applied, * rather the VMA is perhaps split, perhaps merged to accommodate the change, * and the caller is expected to perform the actual modification. * * In order to account for sticky VMA flags, the @vma_flags_ptr parameter points * to the requested flags which are then updated so the caller, should they * overwrite any existing flags, correctly retains these. * * Returns: A VMA which contains the range @start to @end ready to have its * flags altered to *@vma_flags. */ __must_check struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, vma_flags_t *vma_flags_ptr); /** * vma_modify_name() - Perform any necessary split/merge in preparation for * setting anonymous VMA name to @new_name in the range @start to @end contained * within @vma. * @vmi: Valid VMA iterator positioned at @vma. * @prev: The VMA immediately prior to @vma or NULL if @vma is the first. * @vma: The VMA containing the range @start to @end to be updated. * @start: The start of the range to update. May be offset within @vma. * @end: The exclusive end of the range to update, may be offset within @vma. * @new_name: The anonymous VMA name that the @start to @end range is about to * be set to. * * IMPORTANT: The actual modification being requested here is NOT applied, * rather the VMA is perhaps split, perhaps merged to accommodate the change, * and the caller is expected to perform the actual modification. * * Returns: A VMA which contains the range @start to @end ready to have its * anonymous VMA name changed to @new_name. */ __must_check struct vm_area_struct *vma_modify_name(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct anon_vma_name *new_name); /** * vma_modify_policy() - Perform any necessary split/merge in preparation for * setting NUMA policy to @new_pol in the range @start to @end contained * within @vma. * @vmi: Valid VMA iterator positioned at @vma. * @prev: The VMA immediately prior to @vma or NULL if @vma is the first. * @vma: The VMA containing the range @start to @end to be updated. * @start: The start of the range to update. May be offset within @vma. * @end: The exclusive end of the range to update, may be offset within @vma. * @new_pol: The NUMA policy that the @start to @end range is about to be set * to. * * IMPORTANT: The actual modification being requested here is NOT applied, * rather the VMA is perhaps split, perhaps merged to accommodate the change, * and the caller is expected to perform the actual modification. * * Returns: A VMA which contains the range @start to @end ready to have its * NUMA policy changed to @new_pol. */ __must_check struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct mempolicy *new_pol); /** * vma_modify_flags_uffd() - Perform any necessary split/merge in preparation for * setting VMA flags to @vm_flags and UFFD context to @new_ctx in the range * @start to @end contained within @vma. * @vmi: Valid VMA iterator positioned at @vma. * @prev: The VMA immediately prior to @vma or NULL if @vma is the first. * @vma: The VMA containing the range @start to @end to be updated. * @start: The start of the range to update. May be offset within @vma. 
* @end: The exclusive end of the range to update, may be offset within @vma. * @vma_flags: The VMA flags that the @start to @end range is about to be set to. * @new_ctx: The userfaultfd context that the @start to @end range is about to * be set to. * @give_up_on_oom: If an out of memory condition occurs on merge, simply give * up on it and treat the merge as best-effort. * * IMPORTANT: The actual modification being requested here is NOT applied, * rather the VMA is perhaps split, perhaps merged to accommodate the change, * and the caller is expected to perform the actual modification. * * Returns: A VMA which contains the range @start to @end ready to have its VMA * flags changed to @vma_flags and its userfaultfd context changed to @new_ctx. */ __must_check struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, const vma_flags_t *vma_flags, struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom); __must_check struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg); __must_check struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long delta); void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb); void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb); void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, struct vm_area_struct *vma); struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, unsigned long addr, unsigned long len, pgoff_t pgoff, bool *need_rmap_locks); struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma); bool vma_needs_dirty_tracking(struct vm_area_struct *vma); bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); int mm_take_all_locks(struct mm_struct *mm); void mm_drop_all_locks(struct mm_struct *mm); unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf); int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, unsigned long addr, unsigned long request, vma_flags_t vma_flags); unsigned long unmapped_area(struct vm_unmapped_area_info *info); unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info); static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) { /* * We want to check manually if we can change individual PTEs writable * if we can't do that automatically for all PTEs in a mapping. For * private mappings, that's always the case when we have write * permissions as we properly have to handle COW. */ if (vma->vm_flags & VM_SHARED) return vma_wants_writenotify(vma, vma->vm_page_prot); return !!(vma->vm_flags & VM_WRITE); } #ifdef CONFIG_MMU static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, vm_flags_t vm_flags) { return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); } #endif static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi, unsigned long min) { return mas_prev(&vmi->mas, min); } /* * These three helpers classifies VMAs for virtual memory accounting. */ /* * Executable code area - executable, not writable, not stack */ static inline bool is_exec_mapping(vm_flags_t flags) { return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC; } /* * Stack area (including shadow stacks) * * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: * do_mmap() forbids all other combinations. 
*/ static inline bool is_stack_mapping(vm_flags_t flags) { return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK); } /* * Data area - private, writable, not stack */ static inline bool is_data_mapping(vm_flags_t flags) { return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; } static inline bool is_data_mapping_vma_flags(const vma_flags_t *vma_flags) { return vma_flags_test(vma_flags, VMA_WRITE_BIT) && !vma_flags_test_any(vma_flags, VMA_SHARED_BIT, VMA_STACK_BIT); } static inline void vma_iter_config(struct vma_iterator *vmi, unsigned long index, unsigned long last) { __mas_set_range(&vmi->mas, index, last - 1); } static inline void vma_iter_reset(struct vma_iterator *vmi) { mas_reset(&vmi->mas); } static inline struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min) { return mas_prev_range(&vmi->mas, min); } static inline struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max) { return mas_next_range(&vmi->mas, max); } static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min, unsigned long max, unsigned long size) { return mas_empty_area(&vmi->mas, min, max - 1, size); } static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min, unsigned long max, unsigned long size) { return mas_empty_area_rev(&vmi->mas, min, max - 1, size); } /* * VMA Iterator functions shared between nommu and mmap */ static inline int vma_iter_prealloc(struct vma_iterator *vmi, struct vm_area_struct *vma) { return mas_preallocate(&vmi->mas, vma, GFP_KERNEL); } static inline void vma_iter_clear(struct vma_iterator *vmi) { mas_store_prealloc(&vmi->mas, NULL); } static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) { return mas_walk(&vmi->mas); } /* Store a VMA with preallocated memory */ static inline void vma_iter_store_overwrite(struct vma_iterator *vmi, struct vm_area_struct *vma) { vma_assert_attached(vma); #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && vmi->mas.index > vma->vm_start)) { pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n", vmi->mas.index, vma->vm_start, vma->vm_start, vma->vm_end, vmi->mas.index, vmi->mas.last); } if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && vmi->mas.last < vma->vm_start)) { pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n", vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end, vmi->mas.index, vmi->mas.last); } #endif if (vmi->mas.status != ma_start && ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) vma_iter_invalidate(vmi); __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); mas_store_prealloc(&vmi->mas, vma); } static inline void vma_iter_store_new(struct vma_iterator *vmi, struct vm_area_struct *vma) { vma_mark_attached(vma); vma_iter_store_overwrite(vmi, vma); } static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) { return vmi->mas.index; } static inline unsigned long vma_iter_end(struct vma_iterator *vmi) { return vmi->mas.last + 1; } static inline struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi) { return mas_prev_range(&vmi->mas, 0); } /* * Retrieve the next VMA and rewind the iterator to end of the previous VMA, or * if no previous VMA, to index 0. 
*/ static inline struct vm_area_struct *vma_iter_next_rewind(struct vma_iterator *vmi, struct vm_area_struct **pprev) { struct vm_area_struct *next = vma_next(vmi); struct vm_area_struct *prev = vma_prev(vmi); /* * Consider the case where no previous VMA exists. We advance to the * next VMA, skipping any gap, then rewind to the start of the range. * * If we were to unconditionally advance to the next range we'd wind up * at the next VMA again, so we check to ensure there is a previous VMA * to skip over. */ if (prev) vma_iter_next_range(vmi); if (pprev) *pprev = prev; return next; } #ifdef CONFIG_64BIT static inline bool vma_is_sealed(struct vm_area_struct *vma) { return (vma->vm_flags & VM_SEALED); } #else static inline bool vma_is_sealed(struct vm_area_struct *vma) { return false; } #endif #if defined(CONFIG_STACK_GROWSUP) int expand_upwards(struct vm_area_struct *vma, unsigned long address); #endif int expand_downwards(struct vm_area_struct *vma, unsigned long address); int __vm_munmap(unsigned long start, size_t len, bool unlock); int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma); /* vma_init.h, shared between CONFIG_MMU and nommu. */ void __init vma_state_init(void); struct vm_area_struct *vm_area_alloc(struct mm_struct *mm); struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig); void vm_area_free(struct vm_area_struct *vma); /* vma_exec.c */ #ifdef CONFIG_MMU int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, unsigned long *top_mem_p); int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); #endif #ifdef CONFIG_MMU /* * Denies creating a writable executable mapping or gaining executable permissions. * * This denies the following: * * a) mmap(PROT_WRITE | PROT_EXEC) * * b) mmap(PROT_WRITE) * mprotect(PROT_EXEC) * * c) mmap(PROT_WRITE) * mprotect(PROT_READ) * mprotect(PROT_EXEC) * * But allows the following: * * d) mmap(PROT_READ | PROT_EXEC) * mmap(PROT_READ | PROT_EXEC | PROT_BTI) * * This is only applicable if the user has set the Memory-Deny-Write-Execute * (MDWE) protection mask for the current process. * * @old specifies the VMA flags the VMA originally possessed, and @new the ones * we propose to set. * * Return: false if proposed change is OK, true if not ok and should be denied. */ static inline bool map_deny_write_exec(const vma_flags_t *old, const vma_flags_t *new) { /* If MDWE is disabled, we have nothing to deny. */ if (!mm_flags_test(MMF_HAS_MDWE, current->mm)) return false; /* If the new VMA is not executable, we have nothing to deny. */ if (!vma_flags_test(new, VMA_EXEC_BIT)) return false; /* Under MDWE we do not accept newly writably executable VMAs... */ if (vma_flags_test(new, VMA_WRITE_BIT)) return true; /* ...nor previously non-executable VMAs becoming executable. */ if (!vma_flags_test(old, VMA_EXEC_BIT)) return true; return false; } #endif #endif /* __MM_VMA_H */ |
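/*
 * Illustrative sketch (not part of vma.h): a small userspace model of the
 * decision map_deny_write_exec() above makes, replaying the cases (a)-(d)
 * listed in its comment. It assumes MDWE is enabled for the process (the
 * MMF_HAS_MDWE check is omitted); F_WRITE/F_EXEC and the helper name are
 * stand-ins local to this example, not kernel symbols.
 */
#include <stdio.h>
#include <stdbool.h>

#define F_WRITE 0x1u
#define F_EXEC  0x2u

/* Returns true when the proposed flag change should be denied. */
static bool deny_write_exec(unsigned int old, unsigned int new)
{
	if (!(new & F_EXEC))
		return false;	/* new mapping not executable: nothing to deny */
	if (new & F_WRITE)
		return true;	/* case (a): writable and executable at once */
	if (!(old & F_EXEC))
		return true;	/* cases (b)/(c): gaining exec on a non-exec mapping */
	return false;		/* case (d): exec stays exec (e.g. adding PROT_BTI) */
}

int main(void)
{
	printf("mmap(W|X)                 -> deny=%d\n",
	       deny_write_exec(F_WRITE | F_EXEC, F_WRITE | F_EXEC));
	printf("mmap(W) then mprotect(X)  -> deny=%d\n",
	       deny_write_exec(F_WRITE, F_EXEC));
	printf("mmap(R|X) re-mapped as X  -> deny=%d\n",
	       deny_write_exec(F_EXEC, F_EXEC));
	return 0;
}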
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/proc/inode.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/cache.h> #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/kernel.h> #include <linux/pid_namespace.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/stat.h> #include <linux/completion.h> #include <linux/poll.h> #include <linux/printk.h> #include <linux/file.h> #include <linux/limits.h> #include <linux/init.h> #include <linux/module.h> #include <linux/sysctl.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/mount.h> #include <linux/bug.h> #include "internal.h" static void proc_evict_inode(struct inode *inode) { struct ctl_table_header *head; struct proc_inode *ei = PROC_I(inode); truncate_inode_pages_final(&inode->i_data); clear_inode(inode); /* Stop tracking associated processes */ if (ei->pid) proc_pid_evict_inode(ei); head
= ei->sysctl; if (head) { WRITE_ONCE(ei->sysctl, NULL); proc_sys_evict_inode(inode, head); } } static struct kmem_cache *proc_inode_cachep __ro_after_init; static struct kmem_cache *pde_opener_cache __ro_after_init; static struct inode *proc_alloc_inode(struct super_block *sb) { struct proc_inode *ei; ei = alloc_inode_sb(sb, proc_inode_cachep, GFP_KERNEL); if (!ei) return NULL; ei->pid = NULL; ei->fd = 0; ei->op.proc_get_link = NULL; ei->pde = NULL; ei->sysctl = NULL; ei->sysctl_entry = NULL; INIT_HLIST_NODE(&ei->sibling_inodes); ei->ns_ops = NULL; return &ei->vfs_inode; } static void proc_free_inode(struct inode *inode) { struct proc_inode *ei = PROC_I(inode); if (ei->pid) put_pid(ei->pid); /* Let go of any associated proc directory entry */ if (ei->pde) pde_put(ei->pde); kmem_cache_free(proc_inode_cachep, PROC_I(inode)); } static void init_once(void *foo) { struct proc_inode *ei = (struct proc_inode *) foo; inode_init_once(&ei->vfs_inode); } void __init proc_init_kmemcache(void) { proc_inode_cachep = kmem_cache_create("proc_inode_cache", sizeof(struct proc_inode), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_ACCOUNT| SLAB_PANIC), init_once); pde_opener_cache = kmem_cache_create("pde_opener", sizeof(struct pde_opener), 0, SLAB_ACCOUNT|SLAB_PANIC, NULL); proc_dir_entry_cache = kmem_cache_create_usercopy( "proc_dir_entry", SIZEOF_PDE, 0, SLAB_PANIC, offsetof(struct proc_dir_entry, inline_name), SIZEOF_PDE_INLINE_NAME, NULL); BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE); } void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock) { struct hlist_node *node; struct super_block *old_sb = NULL; rcu_read_lock(); while ((node = hlist_first_rcu(inodes))) { struct proc_inode *ei = hlist_entry(node, struct proc_inode, sibling_inodes); struct super_block *sb; struct inode *inode; spin_lock(lock); hlist_del_init_rcu(&ei->sibling_inodes); spin_unlock(lock); inode = &ei->vfs_inode; sb = inode->i_sb; if ((sb != old_sb) && !atomic_inc_not_zero(&sb->s_active)) continue; inode = igrab(inode); rcu_read_unlock(); if (sb != old_sb) { if (old_sb) deactivate_super(old_sb); old_sb = sb; } if (unlikely(!inode)) { rcu_read_lock(); continue; } if (S_ISDIR(inode->i_mode)) { struct dentry *dir = d_find_any_alias(inode); if (dir) { d_invalidate(dir); dput(dir); } } else { struct dentry *dentry; while ((dentry = d_find_alias(inode))) { d_invalidate(dentry); dput(dentry); } } iput(inode); rcu_read_lock(); } rcu_read_unlock(); if (old_sb) deactivate_super(old_sb); } static inline const char *hidepid2str(enum proc_hidepid v) { switch (v) { case HIDEPID_OFF: return "off"; case HIDEPID_NO_ACCESS: return "noaccess"; case HIDEPID_INVISIBLE: return "invisible"; case HIDEPID_NOT_PTRACEABLE: return "ptraceable"; } WARN_ONCE(1, "bad hide_pid value: %d\n", v); return "unknown"; } static int proc_show_options(struct seq_file *seq, struct dentry *root) { struct proc_fs_info *fs_info = proc_sb_info(root->d_sb); if (!gid_eq(fs_info->pid_gid, GLOBAL_ROOT_GID)) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, fs_info->pid_gid)); if (fs_info->hide_pid != HIDEPID_OFF) seq_printf(seq, ",hidepid=%s", hidepid2str(fs_info->hide_pid)); if (fs_info->pidonly != PROC_PIDONLY_OFF) seq_printf(seq, ",subset=pid"); return 0; } const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .free_inode = proc_free_inode, .drop_inode = inode_just_drop, .evict_inode = proc_evict_inode, .statfs = simple_statfs, .show_options = proc_show_options, }; enum {BIAS = -1U<<31}; static inline int use_pde(struct 
proc_dir_entry *pde) { return likely(atomic_inc_unless_negative(&pde->in_use)); } static void unuse_pde(struct proc_dir_entry *pde) { if (unlikely(atomic_dec_return(&pde->in_use) == BIAS)) complete(pde->pde_unload_completion); } /* * At most 2 contexts can enter this function: the one doing the last * close on the descriptor and whoever is deleting PDE itself. * * First to enter calls ->proc_release hook and signals its completion * to the second one which waits and then does nothing. * * PDE is locked on entry, unlocked on exit. */ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) __releases(&pde->pde_unload_lock) { /* * close() (proc_reg_release()) can't delete an entry and proceed: * ->release hook needs to be available at the right moment. * * rmmod (remove_proc_entry() et al) can't delete an entry and proceed: * "struct file" needs to be available at the right moment. */ if (pdeo->closing) { /* somebody else is doing that, just wait */ DECLARE_COMPLETION_ONSTACK(c); pdeo->c = &c; spin_unlock(&pde->pde_unload_lock); wait_for_completion(&c); } else { struct file *file; struct completion *c; pdeo->closing = true; spin_unlock(&pde->pde_unload_lock); file = pdeo->file; pde->proc_ops->proc_release(file_inode(file), file); spin_lock(&pde->pde_unload_lock); /* Strictly after ->proc_release, see above. */ list_del(&pdeo->lh); c = pdeo->c; spin_unlock(&pde->pde_unload_lock); if (unlikely(c)) complete(c); kmem_cache_free(pde_opener_cache, pdeo); } } void proc_entry_rundown(struct proc_dir_entry *de) { DECLARE_COMPLETION_ONSTACK(c); /* Wait until all existing callers into module are done. */ de->pde_unload_completion = &c; if (atomic_add_return(BIAS, &de->in_use) != BIAS) wait_for_completion(&c); /* ->pde_openers list can't grow from now on. 
*/ spin_lock(&de->pde_unload_lock); while (!list_empty(&de->pde_openers)) { struct pde_opener *pdeo; pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); close_pdeo(de, pdeo); spin_lock(&de->pde_unload_lock); } spin_unlock(&de->pde_unload_lock); } static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) { struct proc_dir_entry *pde = PDE(file_inode(file)); loff_t rv = -EINVAL; if (pde_is_permanent(pde)) { return pde->proc_ops->proc_lseek(file, offset, whence); } else if (use_pde(pde)) { rv = pde->proc_ops->proc_lseek(file, offset, whence); unuse_pde(pde); } return rv; } static ssize_t proc_reg_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct proc_dir_entry *pde = PDE(file_inode(iocb->ki_filp)); ssize_t ret; if (pde_is_permanent(pde)) return pde->proc_ops->proc_read_iter(iocb, iter); if (!use_pde(pde)) return -EIO; ret = pde->proc_ops->proc_read_iter(iocb, iter); unuse_pde(pde); return ret; } static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos) { const auto read = pde->proc_ops->proc_read; if (read) return read(file, buf, count, ppos); return -EIO; } static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; if (pde_is_permanent(pde)) { return pde_read(pde, file, buf, count, ppos); } else if (use_pde(pde)) { rv = pde_read(pde, file, buf, count, ppos); unuse_pde(pde); } return rv; } static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos) { const auto write = pde->proc_ops->proc_write; if (write) return write(file, buf, count, ppos); return -EIO; } static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; if (pde_is_permanent(pde)) { return pde_write(pde, file, buf, count, ppos); } else if (use_pde(pde)) { rv = pde_write(pde, file, buf, count, ppos); unuse_pde(pde); } return rv; } static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts) { const auto poll = pde->proc_ops->proc_poll; if (poll) return poll(file, pts); return DEFAULT_POLLMASK; } static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts) { struct proc_dir_entry *pde = PDE(file_inode(file)); __poll_t rv = DEFAULT_POLLMASK; if (pde_is_permanent(pde)) { return pde_poll(pde, file, pts); } else if (use_pde(pde)) { rv = pde_poll(pde, file, pts); unuse_pde(pde); } return rv; } static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) { const auto ioctl = pde->proc_ops->proc_ioctl; if (ioctl) return ioctl(file, cmd, arg); return -ENOTTY; } static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; if (pde_is_permanent(pde)) { return pde_ioctl(pde, file, cmd, arg); } else if (use_pde(pde)) { rv = pde_ioctl(pde, file, cmd, arg); unuse_pde(pde); } return rv; } #ifdef CONFIG_COMPAT static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) { const auto compat_ioctl = pde->proc_ops->proc_compat_ioctl; if (compat_ioctl) return compat_ioctl(file, cmd, arg); return -ENOTTY; } static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct 
proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; if (pde_is_permanent(pde)) { return pde_compat_ioctl(pde, file, cmd, arg); } else if (use_pde(pde)) { rv = pde_compat_ioctl(pde, file, cmd, arg); unuse_pde(pde); } return rv; } #endif static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma) { const auto mmap = pde->proc_ops->proc_mmap; if (mmap) return mmap(file, vma); return -EIO; } static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) { struct proc_dir_entry *pde = PDE(file_inode(file)); int rv = -EIO; if (pde_is_permanent(pde)) { return pde_mmap(pde, file, vma); } else if (use_pde(pde)) { rv = pde_mmap(pde, file, vma); unuse_pde(pde); } return rv; } static unsigned long pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) { if (pde->proc_ops->proc_get_unmapped_area) return pde->proc_ops->proc_get_unmapped_area(file, orig_addr, len, pgoff, flags); #ifdef CONFIG_MMU return mm_get_unmapped_area(file, orig_addr, len, pgoff, flags); #endif return orig_addr; } static unsigned long proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct proc_dir_entry *pde = PDE(file_inode(file)); unsigned long rv = -EIO; if (pde_is_permanent(pde)) { return pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags); } else if (use_pde(pde)) { rv = pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags); unuse_pde(pde); } return rv; } static int proc_reg_open(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); int rv = 0; typeof_member(struct proc_ops, proc_open) open; struct pde_opener *pdeo; if (!pde_has_proc_lseek(pde)) file->f_mode &= ~FMODE_LSEEK; if (pde_is_permanent(pde)) { open = pde->proc_ops->proc_open; if (open) rv = open(inode, file); return rv; } /* * Ensure that * 1) PDE's ->release hook will be called no matter what * either normally by close()/->release, or forcefully by * rmmod/remove_proc_entry. * * 2) rmmod isn't blocked by opening file in /proc and sitting on * the descriptor (including "rmmod foo </proc/foo" scenario). * * Save every "struct file" with custom ->release hook. */ if (!use_pde(pde)) return -ENOENT; const auto release = pde->proc_ops->proc_release; if (release) { pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL); if (!pdeo) { rv = -ENOMEM; goto out_unuse; } } open = pde->proc_ops->proc_open; if (open) rv = open(inode, file); if (release) { if (rv == 0) { /* To know what to release. 
*/ pdeo->file = file; pdeo->closing = false; pdeo->c = NULL; spin_lock(&pde->pde_unload_lock); list_add(&pdeo->lh, &pde->pde_openers); spin_unlock(&pde->pde_unload_lock); } else kmem_cache_free(pde_opener_cache, pdeo); } out_unuse: unuse_pde(pde); return rv; } static int proc_reg_release(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); struct pde_opener *pdeo; if (pde_is_permanent(pde)) { const auto release = pde->proc_ops->proc_release; if (release) return release(inode, file); return 0; } spin_lock(&pde->pde_unload_lock); list_for_each_entry(pdeo, &pde->pde_openers, lh) { if (pdeo->file == file) { close_pdeo(pde, pdeo); return 0; } } spin_unlock(&pde->pde_unload_lock); return 0; } static const struct file_operations proc_reg_file_ops = { .llseek = proc_reg_llseek, .read = proc_reg_read, .write = proc_reg_write, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .mmap = proc_reg_mmap, .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, .release = proc_reg_release, }; static const struct file_operations proc_iter_file_ops = { .llseek = proc_reg_llseek, .read_iter = proc_reg_read_iter, .write = proc_reg_write, .splice_read = copy_splice_read, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .mmap = proc_reg_mmap, .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, .release = proc_reg_release, }; #ifdef CONFIG_COMPAT static const struct file_operations proc_reg_file_ops_compat = { .llseek = proc_reg_llseek, .read = proc_reg_read, .write = proc_reg_write, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .compat_ioctl = proc_reg_compat_ioctl, .mmap = proc_reg_mmap, .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, .release = proc_reg_release, }; static const struct file_operations proc_iter_file_ops_compat = { .llseek = proc_reg_llseek, .read_iter = proc_reg_read_iter, .splice_read = copy_splice_read, .write = proc_reg_write, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .compat_ioctl = proc_reg_compat_ioctl, .mmap = proc_reg_mmap, .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, .release = proc_reg_release, }; #endif static void proc_put_link(void *p) { unuse_pde(p); } static const char *proc_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct proc_dir_entry *pde = PDE(inode); if (!use_pde(pde)) return ERR_PTR(-EINVAL); set_delayed_call(done, proc_put_link, pde); return pde->data; } const struct inode_operations proc_link_inode_operations = { .get_link = proc_get_link, }; struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) { struct inode *inode = new_inode(sb); if (!inode) { pde_put(de); return NULL; } inode->i_private = de->data; inode->i_ino = de->low_ino; simple_inode_init_ts(inode); PROC_I(inode)->pde = de; if (is_empty_pde(de)) { make_empty_dir_inode(inode); return inode; } if (de->mode) { inode->i_mode = de->mode; inode->i_uid = de->uid; inode->i_gid = de->gid; } if (de->size) inode->i_size = de->size; if (de->nlink) set_nlink(inode, de->nlink); if (S_ISREG(inode->i_mode)) { inode->i_op = de->proc_iops; if (pde_has_proc_read_iter(de)) inode->i_fop = &proc_iter_file_ops; else inode->i_fop = &proc_reg_file_ops; #ifdef CONFIG_COMPAT if (pde_has_proc_compat_ioctl(de)) { if (pde_has_proc_read_iter(de)) inode->i_fop = &proc_iter_file_ops_compat; else inode->i_fop = &proc_reg_file_ops_compat; } #endif } else if (S_ISDIR(inode->i_mode)) { 
inode->i_op = de->proc_iops; inode->i_fop = de->proc_dir_ops; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = de->proc_iops; inode->i_fop = NULL; } else { BUG(); } return inode; } |
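/*
 * Illustrative sketch (not part of inode.c): the proc_reg_* wrappers above
 * dispatch into a provider's struct proc_ops. A typical producer side looks
 * roughly like this module, which creates /proc/example_status backed by
 * seq_file. The "example" names are hypothetical and error handling is kept
 * minimal; it only shows which hooks the wrappers call through to.
 */
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int example_show(struct seq_file *m, void *v)
{
	seq_printf(m, "hello from /proc\n");
	return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
	return single_open(file, example_show, NULL);
}

/* proc_reg_open()/proc_reg_read()/proc_reg_release() end up calling these. */
static const struct proc_ops example_proc_ops = {
	.proc_open	= example_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= single_release,
};

static struct proc_dir_entry *example_pde;

static int __init example_init(void)
{
	example_pde = proc_create("example_status", 0444, NULL, &example_proc_ops);
	return example_pde ? 0 : -ENOMEM;
}

static void __exit example_exit(void)
{
	proc_remove(example_pde);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");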
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_COOKIE_H #define __LINUX_COOKIE_H #include <linux/atomic.h> #include <linux/percpu.h> #include <asm/local.h> struct pcpu_gen_cookie { local_t nesting; u64 last; } __aligned(16); struct gen_cookie { struct pcpu_gen_cookie __percpu *local; atomic64_t forward_last ____cacheline_aligned_in_smp; atomic64_t reverse_last; }; #define COOKIE_LOCAL_BATCH 4096 #define DEFINE_COOKIE(name) \ static DEFINE_PER_CPU(struct pcpu_gen_cookie, __##name); \ static struct gen_cookie name = { \ .local = &__##name, \ .forward_last = ATOMIC64_INIT(0), \ .reverse_last = ATOMIC64_INIT(0), \ } static __always_inline u64 gen_cookie_next(struct gen_cookie *gc) { struct pcpu_gen_cookie *local = this_cpu_ptr(gc->local); u64 val; if (likely(local_inc_return(&local->nesting) == 1)) { val = local->last; if (__is_defined(CONFIG_SMP) && unlikely((val & (COOKIE_LOCAL_BATCH - 1)) == 0)) { s64 next = atomic64_add_return(COOKIE_LOCAL_BATCH, &gc->forward_last); val = next - COOKIE_LOCAL_BATCH; } local->last = ++val; } else { val = atomic64_dec_return(&gc->reverse_last); } local_dec(&local->nesting); return val; } #endif /* __LINUX_COOKIE_H */
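/*
 * Illustrative sketch (not part of cookie.h): roughly how a subsystem hands
 * out unique 64-bit identifiers with the helpers above. DEFINE_COOKIE() sets
 * up the per-CPU batch state; gen_cookie_next() only touches the shared
 * forward counter once every COOKIE_LOCAL_BATCH allocations, and a caller
 * that re-enters on the same CPU (nesting > 1) falls back to the reverse
 * counter instead of the per-CPU batch. The "example_*" names below are
 * hypothetical.
 */
#include <linux/cookie.h>
#include <linux/types.h>

DEFINE_COOKIE(example_cookie);

struct example_object {
	u64 cookie;
};

/* Assign a system-wide unique identifier to a newly created object. */
static void example_object_init(struct example_object *obj)
{
	obj->cookie = gen_cookie_next(&example_cookie);
}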
| 29 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 | /* SPDX-License-Identifier: GPL-2.0 */ /* * This header provides generic wrappers for memory access instrumentation that * the compiler cannot emit for: KASAN, KCSAN, KMSAN. */ #ifndef _LINUX_INSTRUMENTED_H #define _LINUX_INSTRUMENTED_H #include <linux/bug.h> #include <linux/compiler.h> #include <linux/kasan-checks.h> #include <linux/kcsan-checks.h> #include <linux/kmsan-checks.h> #include <linux/types.h> /** * instrument_read - instrument regular read access * @v: address of access * @size: size of access * * Instrument a regular read access. The instrumentation should be inserted * before the actual read happens. */ static __always_inline void instrument_read(const volatile void *v, size_t size) { kasan_check_read(v, size); kcsan_check_read(v, size); } /** * instrument_write - instrument regular write access * @v: address of access * @size: size of access * * Instrument a regular write access. The instrumentation should be inserted * before the actual write happens. */ static __always_inline void instrument_write(const volatile void *v, size_t size) { kasan_check_write(v, size); kcsan_check_write(v, size); } /** * instrument_read_write - instrument regular read-write access * @v: address of access * @size: size of access * * Instrument a regular write access. The instrumentation should be inserted * before the actual write happens. */ static __always_inline void instrument_read_write(const volatile void *v, size_t size) { kasan_check_write(v, size); kcsan_check_read_write(v, size); } static __always_inline void instrument_atomic_check_alignment(const volatile void *v, size_t size) { #ifndef __DISABLE_EXPORTS if (IS_ENABLED(CONFIG_DEBUG_ATOMIC)) { unsigned int mask = size - 1; if (IS_ENABLED(CONFIG_DEBUG_ATOMIC_LARGEST_ALIGN)) mask &= sizeof(struct { long x; } __aligned_largest) - 1; WARN_ON_ONCE((unsigned long)v & mask); } #endif } /** * instrument_atomic_read - instrument atomic read access * @v: address of access * @size: size of access * * Instrument an atomic read access. The instrumentation should be inserted * before the actual read happens. */ static __always_inline void instrument_atomic_read(const volatile void *v, size_t size) { kasan_check_read(v, size); kcsan_check_atomic_read(v, size); instrument_atomic_check_alignment(v, size); } /** * instrument_atomic_write - instrument atomic write access * @v: address of access * @size: size of access * * Instrument an atomic write access. The instrumentation should be inserted * before the actual write happens. 
*/ static __always_inline void instrument_atomic_write(const volatile void *v, size_t size) { kasan_check_write(v, size); kcsan_check_atomic_write(v, size); instrument_atomic_check_alignment(v, size); } /** * instrument_atomic_read_write - instrument atomic read-write access * @v: address of access * @size: size of access * * Instrument an atomic read-write access. The instrumentation should be * inserted before the actual write happens. */ static __always_inline void instrument_atomic_read_write(const volatile void *v, size_t size) { kasan_check_write(v, size); kcsan_check_atomic_read_write(v, size); instrument_atomic_check_alignment(v, size); } /** * instrument_copy_to_user - instrument reads of copy_to_user * @to: destination address * @from: source address * @n: number of bytes to copy * * Instrument reads from kernel memory, that are due to copy_to_user (and * variants). The instrumentation must be inserted before the accesses. */ static __always_inline void instrument_copy_to_user(void __user *to, const void *from, unsigned long n) { kasan_check_read(from, n); kcsan_check_read(from, n); kmsan_copy_to_user(to, from, n, 0); } /** * instrument_copy_from_user_before - add instrumentation before copy_from_user * @to: destination address * @from: source address * @n: number of bytes to copy * * Instrument writes to kernel memory, that are due to copy_from_user (and * variants). The instrumentation should be inserted before the accesses. */ static __always_inline void instrument_copy_from_user_before(const void *to, const void __user *from, unsigned long n) { kasan_check_write(to, n); kcsan_check_write(to, n); } /** * instrument_copy_from_user_after - add instrumentation after copy_from_user * @to: destination address * @from: source address * @n: number of bytes to copy * @left: number of bytes not copied (as returned by copy_from_user) * * Instrument writes to kernel memory, that are due to copy_from_user (and * variants). The instrumentation should be inserted after the accesses. */ static __always_inline void instrument_copy_from_user_after(const void *to, const void __user *from, unsigned long n, unsigned long left) { kmsan_unpoison_memory(to, n - left); } /** * instrument_memcpy_before - add instrumentation before non-instrumented memcpy * @to: destination address * @from: source address * @n: number of bytes to copy * * Instrument memory accesses that happen in custom memcpy implementations. The * instrumentation should be inserted before the memcpy call. */ static __always_inline void instrument_memcpy_before(void *to, const void *from, unsigned long n) { kasan_check_write(to, n); kasan_check_read(from, n); kcsan_check_write(to, n); kcsan_check_read(from, n); } /** * instrument_memcpy_after - add instrumentation after non-instrumented memcpy * @to: destination address * @from: source address * @n: number of bytes to copy * @left: number of bytes not copied (if known) * * Instrument memory accesses that happen in custom memcpy implementations. The * instrumentation should be inserted after the memcpy call. */ static __always_inline void instrument_memcpy_after(void *to, const void *from, unsigned long n, unsigned long left) { kmsan_memmove(to, from, n - left); } /** * instrument_get_user() - add instrumentation to get_user()-like macros * @to: destination variable, may not be address-taken * * get_user() and friends are fragile, so it may depend on the implementation * whether the instrumentation happens before or after the data is copied from * the userspace. 
*/ #define instrument_get_user(to) \ ({ \ u64 __tmp = (u64)(to); \ kmsan_unpoison_memory(&__tmp, sizeof(__tmp)); \ to = __tmp; \ }) /** * instrument_put_user() - add instrumentation to put_user()-like macros * @from: source address * @ptr: userspace pointer to copy to * @size: number of bytes to copy * * put_user() and friends are fragile, so it may depend on the implementation * whether the instrumentation happens before or after the data is copied from * the userspace. */ #define instrument_put_user(from, ptr, size) \ ({ \ kmsan_copy_to_user(ptr, &from, sizeof(from), 0); \ }) #endif /* _LINUX_INSTRUMENTED_H */ |
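/*
 * Illustrative sketch (not part of instrumented.h): a simplified version of
 * how the uaccess layer brackets a user copy with the hooks above. The real
 * _copy_from_user() in include/linux/uaccess.h carries extra checks (fault
 * injection, arch details) that are omitted here; this only shows where the
 * "before"/"after" instrumentation calls sit relative to the raw access.
 */
#include <linux/instrumented.h>
#include <linux/uaccess.h>
#include <linux/string.h>

static inline unsigned long
example_copy_from_user(void *to, const void __user *from, unsigned long n)
{
	unsigned long res = n;

	if (access_ok(from, n)) {
		/* kernel memory at 'to' is about to be written: KASAN/KCSAN check first */
		instrument_copy_from_user_before(to, from, n);
		res = raw_copy_from_user(to, from, n);
		/* only the bytes actually copied become initialized for KMSAN */
		instrument_copy_from_user_after(to, from, n, res);
	}
	if (unlikely(res))
		memset(to + (n - res), 0, res);
	return res;
}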
| 3 15 3 14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 | // SPDX-License-Identifier: GPL-2.0-only /* * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x. * * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/module.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include <net/route.h> #include <linux/ip.h> #include <net/ip.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("iptables mangle table"); #define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \ (1 << NF_INET_LOCAL_IN) | \ (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT) | \ (1 << NF_INET_POST_ROUTING)) static const struct xt_table packet_mangler = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_MANGLE, }; static unsigned int ipt_mangle_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { unsigned int ret, verdict; const struct iphdr *iph; __be32 saddr, daddr; u32 mark; int err; u8 tos; /* Save things which could affect route */ mark = skb->mark; iph = ip_hdr(skb); saddr = iph->saddr; daddr = iph->daddr; tos = iph->tos; ret = ipt_do_table(priv, skb, state); verdict = ret & NF_VERDICT_MASK; /* Reroute for ANY change. */ if (verdict != NF_DROP && verdict != NF_STOLEN) { iph = ip_hdr(skb); if (iph->saddr != saddr || iph->daddr != daddr || skb->mark != mark || iph->tos != tos) { err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } } return ret; } /* The work comes in here from netfilter.c. 
*/ static unsigned int iptable_mangle_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { if (state->hook == NF_INET_LOCAL_OUT) return ipt_mangle_out(priv, skb, state); return ipt_do_table(priv, skb, state); } static struct nf_hook_ops *mangle_ops __read_mostly; static int iptable_mangle_table_init(struct net *net) { struct ipt_replace *repl; int ret; repl = ipt_alloc_initial_table(&packet_mangler); if (repl == NULL) return -ENOMEM; ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops); kfree(repl); return ret; } static void __net_exit iptable_mangle_net_pre_exit(struct net *net) { ipt_unregister_table_pre_exit(net, "mangle"); } static void __net_exit iptable_mangle_net_exit(struct net *net) { ipt_unregister_table_exit(net, "mangle"); } static struct pernet_operations iptable_mangle_net_ops = { .pre_exit = iptable_mangle_net_pre_exit, .exit = iptable_mangle_net_exit, }; static int __init iptable_mangle_init(void) { int ret = xt_register_template(&packet_mangler, iptable_mangle_table_init); if (ret < 0) return ret; mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook); if (IS_ERR(mangle_ops)) { xt_unregister_template(&packet_mangler); ret = PTR_ERR(mangle_ops); return ret; } ret = register_pernet_subsys(&iptable_mangle_net_ops); if (ret < 0) { xt_unregister_template(&packet_mangler); kfree(mangle_ops); return ret; } return ret; } static void __exit iptable_mangle_fini(void) { unregister_pernet_subsys(&iptable_mangle_net_ops); xt_unregister_template(&packet_mangler); kfree(mangle_ops); } module_init(iptable_mangle_init); module_exit(iptable_mangle_fini); |
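/*
 * Illustrative sketch (not part of iptable_mangle.c): a userspace model of
 * the decision ipt_mangle_out() makes after running the OUTPUT rules - if a
 * rule rewrote any field that routing depends on (source/destination
 * address, TOS, skb->mark), the packet has to be re-routed. The types and
 * addresses are stand-ins chosen for the example only.
 */
#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

struct pkt_route_keys {
	uint32_t saddr, daddr, mark;
	uint8_t tos;
};

static bool needs_reroute(const struct pkt_route_keys *before,
			  const struct pkt_route_keys *after)
{
	return before->saddr != after->saddr ||
	       before->daddr != after->daddr ||
	       before->mark  != after->mark  ||
	       before->tos   != after->tos;
}

int main(void)
{
	struct pkt_route_keys before = { 0x0a000001u, 0x0a000002u, 0, 0 };
	struct pkt_route_keys after  = before;

	after.mark = 1;	/* e.g. a MARK target rule fired on this packet */
	printf("reroute needed: %d\n", needs_reroute(&before, &after));
	return 0;
}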
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for inet_sock * * Authors: Many, reorganised here by * Arnaldo Carvalho de Melo <acme@mandriva.com> */ #ifndef _INET_SOCK_H #define _INET_SOCK_H #include <linux/bitops.h> #include <linux/string.h> #include <linux/types.h> #include <linux/jhash.h> #include <linux/netdevice.h> #include <net/flow.h> #include <net/inet_dscp.h> #include <net/sock.h> #include <net/request_sock.h> #include <net/netns/hash.h> #include <net/tcp_states.h> #include <net/l3mdev.h> #define IP_OPTIONS_DATA_FIXED_SIZE 40 /** struct ip_options - IP Options * * @faddr - Saved first hop address * @nexthop - Saved nexthop address in LSRR and SSRR * @is_strictroute - Strict source route * @srr_is_hit - Packet destination addr was ours * @is_changed - IP checksum is no longer valid * @rr_needaddr - Need to record addr of outgoing dev * @ts_needtime - Need to record timestamp * @ts_needaddr - Need to record addr of outgoing dev */ struct ip_options { __be32 faddr; __be32 nexthop; unsigned char optlen; unsigned char srr; unsigned char rr; unsigned char ts; unsigned char is_strictroute:1, srr_is_hit:1, is_changed:1, rr_needaddr:1, ts_needtime:1, ts_needaddr:1; unsigned char router_alert; unsigned char cipso; unsigned char __pad2; unsigned char __data[]; }; struct ip_options_rcu { struct rcu_head rcu; /* Must be last as it ends in a flexible-array member.
*/ struct ip_options opt; }; struct inet_request_sock { struct request_sock req; #define ir_loc_addr req.__req_common.skc_rcv_saddr #define ir_rmt_addr req.__req_common.skc_daddr #define ir_num req.__req_common.skc_num #define ir_rmt_port req.__req_common.skc_dport #define ir_v6_rmt_addr req.__req_common.skc_v6_daddr #define ir_v6_loc_addr req.__req_common.skc_v6_rcv_saddr #define ir_iif req.__req_common.skc_bound_dev_if #define ir_cookie req.__req_common.skc_cookie #define ireq_net req.__req_common.skc_net #define ireq_state req.__req_common.skc_state #define ireq_family req.__req_common.skc_family u16 snd_wscale : 4, rcv_wscale : 4, tstamp_ok : 1, sack_ok : 1, wscale_ok : 1, ecn_ok : 1, acked : 1, no_srccheck: 1, smc_ok : 1; u32 ir_mark; union { struct ip_options_rcu __rcu *ireq_opt; #if IS_ENABLED(CONFIG_IPV6) struct { struct ipv6_txoptions *ipv6_opt; struct sk_buff *pktopts; }; #endif }; }; #define inet_rsk(ptr) container_of_const(ptr, struct inet_request_sock, req) static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb) { u32 mark = READ_ONCE(sk->sk_mark); if (!mark && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept)) return skb->mark; return mark; } static inline int inet_request_bound_dev_if(const struct sock *sk, struct sk_buff *skb) { int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); #ifdef CONFIG_NET_L3_MASTER_DEV struct net *net = sock_net(sk); if (!bound_dev_if && READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept)) return l3mdev_master_ifindex_by_index(net, skb->skb_iif); #endif return bound_dev_if; } static inline int inet_sk_bound_l3mdev(const struct sock *sk) { #ifdef CONFIG_NET_L3_MASTER_DEV struct net *net = sock_net(sk); if (!READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept)) return l3mdev_master_ifindex_by_index(net, sk->sk_bound_dev_if); #endif return 0; } static inline bool inet_bound_dev_eq(bool l3mdev_accept, int bound_dev_if, int dif, int sdif) { if (!bound_dev_if) return !sdif || l3mdev_accept; return bound_dev_if == dif || bound_dev_if == sdif; } static inline bool inet_sk_bound_dev_eq(const struct net *net, int bound_dev_if, int dif, int sdif) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) return inet_bound_dev_eq(!!READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept), bound_dev_if, dif, sdif); #else return inet_bound_dev_eq(true, bound_dev_if, dif, sdif); #endif } struct inet6_cork { struct ipv6_txoptions *opt; u8 hop_limit; u8 tclass; u8 dontfrag:1; }; struct inet_cork { unsigned int flags; __be32 addr; struct ip_options *opt; unsigned int fragsize; int length; /* Total length of all frames */ struct dst_entry *dst; u8 tx_flags; __u8 ttl; __s16 tos; u32 priority; __u16 gso_size; u32 ts_opt_id; u64 transmit_time; u32 mark; }; struct inet_cork_full { struct inet_cork base; struct flowi fl; #if IS_ENABLED(CONFIG_IPV6) struct inet6_cork base6; #endif }; struct ip_mc_socklist; struct ipv6_pinfo; struct rtable; /** struct inet_sock - representation of INET sockets * * @sk - ancestor class * @pinet6 - pointer to IPv6 control block * @inet_daddr - Foreign IPv4 addr * @inet_rcv_saddr - Bound local IPv4 addr * @inet_dport - Destination port * @inet_num - Local port * @inet_flags - various atomic flags * @inet_saddr - Sending source * @uc_ttl - Unicast TTL * @inet_sport - Source port * @inet_id - ID counter for DF pkts * @tos - TOS * @mc_ttl - Multicasting TTL * @uc_index - Unicast outgoing device index * @mc_index - Multicast device index * @mc_list - Group array * @cork - info to build ip hdr on each ip frag while socket is corked */ struct inet_sock { /* sk 
and pinet6 has to be the first two members of inet_sock */ struct sock sk; #if IS_ENABLED(CONFIG_IPV6) struct ipv6_pinfo *pinet6; struct ipv6_fl_socklist __rcu *ipv6_fl_list; #endif /* Socket demultiplex comparisons on incoming packets. */ #define inet_daddr sk.__sk_common.skc_daddr #define inet_rcv_saddr sk.__sk_common.skc_rcv_saddr #define inet_dport sk.__sk_common.skc_dport #define inet_num sk.__sk_common.skc_num unsigned long inet_flags; __be32 inet_saddr; __s16 uc_ttl; __be16 inet_sport; struct ip_options_rcu __rcu *inet_opt; atomic_t inet_id; __u8 tos; __u8 min_ttl; __u8 mc_ttl; __u8 pmtudisc; __u8 rcv_tos; __u8 convert_csum; int uc_index; int mc_index; __be32 mc_addr; u32 local_port_range; /* high << 16 | low */ struct ip_mc_socklist __rcu *mc_list; struct inet_cork_full cork; }; #define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ #define IPCORK_TS_OPT_ID 2 /* ts_opt_id field is valid, overriding sk_tskey */ enum { INET_FLAGS_PKTINFO = 0, INET_FLAGS_TTL = 1, INET_FLAGS_TOS = 2, INET_FLAGS_RECVOPTS = 3, INET_FLAGS_RETOPTS = 4, INET_FLAGS_PASSSEC = 5, INET_FLAGS_ORIGDSTADDR = 6, INET_FLAGS_CHECKSUM = 7, INET_FLAGS_RECVFRAGSIZE = 8, INET_FLAGS_RECVERR = 9, INET_FLAGS_RECVERR_RFC4884 = 10, INET_FLAGS_FREEBIND = 11, INET_FLAGS_HDRINCL = 12, INET_FLAGS_MC_LOOP = 13, INET_FLAGS_MC_ALL = 14, INET_FLAGS_TRANSPARENT = 15, INET_FLAGS_IS_ICSK = 16, INET_FLAGS_NODEFRAG = 17, INET_FLAGS_BIND_ADDRESS_NO_PORT = 18, INET_FLAGS_DEFER_CONNECT = 19, INET_FLAGS_MC6_LOOP = 20, INET_FLAGS_RECVERR6_RFC4884 = 21, INET_FLAGS_MC6_ALL = 22, INET_FLAGS_AUTOFLOWLABEL_SET = 23, INET_FLAGS_AUTOFLOWLABEL = 24, INET_FLAGS_DONTFRAG = 25, INET_FLAGS_RECVERR6 = 26, INET_FLAGS_REPFLOW = 27, INET_FLAGS_RTALERT_ISOLATE = 28, INET_FLAGS_SNDFLOW = 29, INET_FLAGS_RTALERT = 30, }; /* cmsg flags for inet */ #define IP_CMSG_PKTINFO BIT(INET_FLAGS_PKTINFO) #define IP_CMSG_TTL BIT(INET_FLAGS_TTL) #define IP_CMSG_TOS BIT(INET_FLAGS_TOS) #define IP_CMSG_RECVOPTS BIT(INET_FLAGS_RECVOPTS) #define IP_CMSG_RETOPTS BIT(INET_FLAGS_RETOPTS) #define IP_CMSG_PASSSEC BIT(INET_FLAGS_PASSSEC) #define IP_CMSG_ORIGDSTADDR BIT(INET_FLAGS_ORIGDSTADDR) #define IP_CMSG_CHECKSUM BIT(INET_FLAGS_CHECKSUM) #define IP_CMSG_RECVFRAGSIZE BIT(INET_FLAGS_RECVFRAGSIZE) #define IP_CMSG_ALL (IP_CMSG_PKTINFO | IP_CMSG_TTL | \ IP_CMSG_TOS | IP_CMSG_RECVOPTS | \ IP_CMSG_RETOPTS | IP_CMSG_PASSSEC | \ IP_CMSG_ORIGDSTADDR | IP_CMSG_CHECKSUM | \ IP_CMSG_RECVFRAGSIZE) static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet) { return READ_ONCE(inet->inet_flags) & IP_CMSG_ALL; } static inline dscp_t inet_sk_dscp(const struct inet_sock *inet) { return inet_dsfield_to_dscp(READ_ONCE(inet->tos)); } #define inet_test_bit(nr, sk) \ test_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) #define inet_set_bit(nr, sk) \ set_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) #define inet_clear_bit(nr, sk) \ clear_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) #define inet_assign_bit(nr, sk, val) \ assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val) /** * sk_to_full_sk - Access to a full socket * @sk: pointer to a socket * * SYNACK messages might be attached to request sockets. * Some places want to reach the listener in this case. 
*/ static inline struct sock *sk_to_full_sk(struct sock *sk) { #ifdef CONFIG_INET if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV) sk = inet_reqsk(sk)->rsk_listener; if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT) sk = NULL; #endif return sk; } /* sk_to_full_sk() variant with a const argument */ static inline const struct sock *sk_const_to_full_sk(const struct sock *sk) { #ifdef CONFIG_INET if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV) sk = ((const struct request_sock *)sk)->rsk_listener; if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT) sk = NULL; #endif return sk; } static inline struct sock *skb_to_full_sk(const struct sk_buff *skb) { return sk_to_full_sk(skb->sk); } #define inet_sk(ptr) container_of_const(ptr, struct inet_sock, sk) int inet_sk_rebuild_header(struct sock *sk); /** * inet_sk_state_load - read sk->sk_state for lockless contexts * @sk: socket pointer * * Paired with inet_sk_state_store(). Used in places we don't hold socket lock: * tcp_diag_get_info(), tcp_get_info(), tcp_poll(), get_tcp4_sock() ... */ static inline int inet_sk_state_load(const struct sock *sk) { /* state change might impact lockless readers. */ return smp_load_acquire(&sk->sk_state); } /** * inet_sk_state_store - update sk->sk_state * @sk: socket pointer * @newstate: new state * * Paired with inet_sk_state_load(). Should be used in contexts where * state change might impact lockless readers. */ void inet_sk_state_store(struct sock *sk, int newstate); void inet_sk_set_state(struct sock *sk, int state); static inline unsigned int __inet_ehashfn(const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport, u32 initval) { return jhash_3words((__force __u32) laddr, (__force __u32) faddr, ((__u32) lport) << 16 | (__force __u32)fport, initval); } struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener); static inline __u8 inet_sk_flowi_flags(const struct sock *sk) { __u8 flags = 0; if (inet_test_bit(TRANSPARENT, sk) || inet_test_bit(HDRINCL, sk)) flags |= FLOWI_FLAG_ANYSRC; return flags; } static inline void inet_inc_convert_csum(struct sock *sk) { inet_sk(sk)->convert_csum++; } static inline void inet_dec_convert_csum(struct sock *sk) { if (inet_sk(sk)->convert_csum > 0) inet_sk(sk)->convert_csum--; } static inline bool inet_get_convert_csum(struct sock *sk) { return !!inet_sk(sk)->convert_csum; } static inline bool inet_can_nonlocal_bind(struct net *net, struct inet_sock *inet) { return READ_ONCE(net->ipv4.sysctl_ip_nonlocal_bind) || test_bit(INET_FLAGS_FREEBIND, &inet->inet_flags) || test_bit(INET_FLAGS_TRANSPARENT, &inet->inet_flags); } static inline bool inet_addr_valid_or_nonlocal(struct net *net, struct inet_sock *inet, __be32 addr, int addr_type) { return inet_can_nonlocal_bind(net, inet) || addr == htonl(INADDR_ANY) || addr_type == RTN_LOCAL || addr_type == RTN_MULTICAST || addr_type == RTN_BROADCAST; } #endif /* _INET_SOCK_H */ |
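/*
 * Illustrative sketch (not part of inet_sock.h): how the inet_*_bit()
 * accessors above are typically used when handling an IP-level socket
 * option. The helper names are hypothetical; the real option handling lives
 * in net/ipv4/ip_sockglue.c and performs more validation (and
 * inet_can_nonlocal_bind() additionally consults the
 * net->ipv4.sysctl_ip_nonlocal_bind sysctl).
 */
#include <net/inet_sock.h>

static void example_set_freebind(struct sock *sk, bool on)
{
	/* atomically flips INET_FLAGS_FREEBIND in inet_sk(sk)->inet_flags */
	inet_assign_bit(FREEBIND, sk, on);
}

static bool example_may_bind_nonlocal(const struct sock *sk)
{
	/* mirrors the per-socket part of the inet_can_nonlocal_bind() check */
	return inet_test_bit(FREEBIND, sk) || inet_test_bit(TRANSPARENT, sk);
}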
| 2 1 2 2 2 2 1 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 | // SPDX-License-Identifier: GPL-2.0 /* * INETPEER - A storage for permanent information about peers * * Authors: Andrey V. Savochkin <saw@msu.ru> */ #include <linux/cache.h> #include <linux/module.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/spinlock.h> #include <linux/random.h> #include <linux/timer.h> #include <linux/time.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/net.h> #include <linux/workqueue.h> #include <net/ip.h> #include <net/inetpeer.h> #include <net/secure_seq.h> /* * Theory of operations. * We keep one entry for each peer IP address. The nodes contains long-living * information about the peer which doesn't depend on routes. * * Nodes are removed only when reference counter goes to 0. * When it's happened the node may be removed when a sufficient amount of * time has been passed since its last use. The less-recently-used entry can * also be removed if the pool is overloaded i.e. if the total amount of * entries is greater-or-equal than the threshold. * * Node pool is organised as an RB tree. * Such an implementation has been chosen not just for fun. It's a way to * prevent easy and efficient DoS attacks by creating hash collisions. A huge * amount of long living nodes in a single hash slot would significantly delay * lookups performed with disabled BHs. * * Serialisation issues. * 1. Nodes may appear in the tree only with the pool lock held. * 2. Nodes may disappear from the tree only with the pool lock held * AND reference count being 0. * 3. Global variable peer_total is modified under the pool lock. * 4. struct inet_peer fields modification: * rb_node: pool lock * refcnt: atomically against modifications on other CPU; * usually under some other lock to prevent node disappearing * daddr: unchangeable */ static struct kmem_cache *peer_cachep __ro_after_init; void inet_peer_base_init(struct inet_peer_base *bp) { bp->rb_root = RB_ROOT; seqlock_init(&bp->lock); bp->total = 0; } #define PEER_MAX_GC 32 /* Exported for sysctl_net_ipv4. 
*/ int inet_peer_threshold __read_mostly; /* start to throw entries more * aggressively at this stage */ int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */ /* Called from ip_output.c:ip_init */ void __init inet_initpeers(void) { u64 nr_entries; /* 1% of physical memory */ nr_entries = div64_ul((u64)totalram_pages() << PAGE_SHIFT, 100 * L1_CACHE_ALIGN(sizeof(struct inet_peer))); inet_peer_threshold = clamp_val(nr_entries, 4096, 65536 + 128); peer_cachep = KMEM_CACHE(inet_peer, SLAB_HWCACHE_ALIGN | SLAB_PANIC); } /* Called with rcu_read_lock() or base->lock held */ static struct inet_peer *lookup(const struct inetpeer_addr *daddr, struct inet_peer_base *base, unsigned int seq, struct inet_peer *gc_stack[], unsigned int *gc_cnt, struct rb_node **parent_p, struct rb_node ***pp_p) { struct rb_node **pp, *parent, *next; struct inet_peer *p; u32 now; pp = &base->rb_root.rb_node; parent = NULL; while (1) { int cmp; next = rcu_dereference_raw(*pp); if (!next) break; parent = next; p = rb_entry(parent, struct inet_peer, rb_node); cmp = inetpeer_addr_cmp(daddr, &p->daddr); if (cmp == 0) { now = jiffies; if (READ_ONCE(p->dtime) != now) WRITE_ONCE(p->dtime, now); return p; } if (gc_stack) { if (*gc_cnt < PEER_MAX_GC) gc_stack[(*gc_cnt)++] = p; } else if (unlikely(read_seqretry(&base->lock, seq))) { break; } if (cmp == -1) pp = &next->rb_left; else pp = &next->rb_right; } *parent_p = parent; *pp_p = pp; return NULL; } /* perform garbage collect on all items stacked during a lookup */ static void inet_peer_gc(struct inet_peer_base *base, struct inet_peer *gc_stack[], unsigned int gc_cnt) { int peer_threshold, peer_maxttl, peer_minttl; struct inet_peer *p; __u32 delta, ttl; int i; peer_threshold = READ_ONCE(inet_peer_threshold); peer_maxttl = READ_ONCE(inet_peer_maxttl); peer_minttl = READ_ONCE(inet_peer_minttl); if (base->total >= peer_threshold) ttl = 0; /* be aggressive */ else ttl = peer_maxttl - (peer_maxttl - peer_minttl) / HZ * base->total / peer_threshold * HZ; for (i = 0; i < gc_cnt; i++) { p = gc_stack[i]; delta = (__u32)jiffies - READ_ONCE(p->dtime); if (delta < ttl || !refcount_dec_if_one(&p->refcnt)) gc_stack[i] = NULL; } for (i = 0; i < gc_cnt; i++) { p = gc_stack[i]; if (p) { rb_erase(&p->rb_node, &base->rb_root); base->total--; kfree_rcu(p, rcu); } } } /* Must be called under RCU : No refcount change is done here. */ struct inet_peer *inet_getpeer(struct inet_peer_base *base, const struct inetpeer_addr *daddr) { struct inet_peer *p, *gc_stack[PEER_MAX_GC]; struct rb_node **pp, *parent; unsigned int gc_cnt, seq; /* Attempt a lockless lookup first. * Because of a concurrent writer, we might not find an existing entry. */ seq = read_seqbegin(&base->lock); p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp); if (p) return p; /* retry an exact lookup, taking the lock before. * At least, nodes should be hot in our cache. */ parent = NULL; write_seqlock_bh(&base->lock); gc_cnt = 0; p = lookup(daddr, base, seq, gc_stack, &gc_cnt, &parent, &pp); if (!p) { p = kmem_cache_alloc(peer_cachep, GFP_ATOMIC); if (p) { p->daddr = *daddr; p->dtime = (__u32)jiffies; refcount_set(&p->refcnt, 1); atomic_set(&p->rid, 0); p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; p->rate_tokens = 0; p->n_redirects = 0; /* 60*HZ is arbitrary, but chosen enough high so that the first * calculation of tokens is at its maximum. 
*/ p->rate_last = jiffies - 60*HZ; rb_link_node(&p->rb_node, parent, pp); rb_insert_color(&p->rb_node, &base->rb_root); base->total++; } } if (gc_cnt) inet_peer_gc(base, gc_stack, gc_cnt); write_sequnlock_bh(&base->lock); return p; } void inet_putpeer(struct inet_peer *p) { if (refcount_dec_and_test(&p->refcnt)) kfree_rcu(p, rcu); } /* * Check transmit rate limitation for given message. * The rate information is held in the inet_peer entries now. * This function is generic and could be used for other purposes * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov. * * Note that the same inet_peer fields are modified by functions in * route.c too, but these work for packet destinations while xrlim_allow * works for icmp destinations. This means the rate limiting information * for one "ip object" is shared - and these ICMPs are twice limited: * by source and by destination. * * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate * SHOULD allow setting of rate limits * * Shared between ICMPv4 and ICMPv6. */ #define XRLIM_BURST_FACTOR 6 bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) { unsigned long now, token, otoken, delta; bool rc = false; if (!peer) return true; token = otoken = READ_ONCE(peer->rate_tokens); now = jiffies; delta = now - READ_ONCE(peer->rate_last); if (delta) { WRITE_ONCE(peer->rate_last, now); token += delta; if (token > XRLIM_BURST_FACTOR * timeout) token = XRLIM_BURST_FACTOR * timeout; } if (token >= timeout) { token -= timeout; rc = true; } if (token != otoken) WRITE_ONCE(peer->rate_tokens, token); return rc; } void inetpeer_invalidate_tree(struct inet_peer_base *base) { struct rb_node *p = rb_first(&base->rb_root); while (p) { struct inet_peer *peer = rb_entry(p, struct inet_peer, rb_node); p = rb_next(p); rb_erase(&peer->rb_node, &base->rb_root); inet_putpeer(peer); cond_resched(); } base->total = 0; } |
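/*
 * Illustrative sketch (not part of inetpeer.c): a userspace model of the
 * token bucket implemented by inet_peer_xrlim_allow() above. Tokens grow by
 * the time elapsed since the last attempt, are capped at
 * XRLIM_BURST_FACTOR * timeout, and each allowed message spends 'timeout'
 * tokens. Time is passed in explicitly here instead of reading jiffies.
 */
#include <stdio.h>
#include <stdbool.h>

#define XRLIM_BURST_FACTOR 6

struct peer_model {
	unsigned long rate_tokens;
	unsigned long rate_last;
};

static bool xrlim_allow_model(struct peer_model *p, unsigned long now,
			      unsigned long timeout)
{
	unsigned long token = p->rate_tokens;
	unsigned long delta = now - p->rate_last;
	bool ok = false;

	if (delta) {
		p->rate_last = now;
		token += delta;
		if (token > XRLIM_BURST_FACTOR * timeout)
			token = XRLIM_BURST_FACTOR * timeout;
	}
	if (token >= timeout) {
		token -= timeout;
		ok = true;
	}
	p->rate_tokens = token;
	return ok;
}

int main(void)
{
	struct peer_model p = { 0, 0 };
	unsigned long t;

	/* a burst of back-to-back errors: once the bucket drops below
	 * 'timeout' tokens, further messages are suppressed */
	for (t = 1000; t < 1010; t++)
		printf("t=%lu allowed=%d\n", t, xrlim_allow_model(&p, t, 5));
	return 0;
}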
// SPDX-License-Identifier: GPL-2.0+
/*
 * NILFS module and super block management.
 *
 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
 *
 * Written by Ryusuke Konishi.
 */
/*
 *  linux/fs/ext2/super.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S.
Miller (davem@caip.rutgers.edu), 1995 */ #include <linux/module.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/blkdev.h> #include <linux/crc32.h> #include <linux/vfs.h> #include <linux/writeback.h> #include <linux/seq_file.h> #include <linux/mount.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include "nilfs.h" #include "export.h" #include "mdt.h" #include "alloc.h" #include "btree.h" #include "btnode.h" #include "page.h" #include "cpfile.h" #include "sufile.h" /* nilfs_sufile_resize(), nilfs_sufile_set_alloc_range() */ #include "ifile.h" #include "dat.h" #include "segment.h" #include "segbuf.h" MODULE_AUTHOR("NTT Corp."); MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem " "(NILFS)"); MODULE_LICENSE("GPL"); static struct kmem_cache *nilfs_inode_cachep; struct kmem_cache *nilfs_transaction_cachep; struct kmem_cache *nilfs_segbuf_cachep; struct kmem_cache *nilfs_btree_path_cache; static int nilfs_setup_super(struct super_block *sb, int is_mount); void __nilfs_msg(struct super_block *sb, const char *fmt, ...) { struct va_format vaf; va_list args; int level; va_start(args, fmt); level = printk_get_level(fmt); vaf.fmt = printk_skip_level(fmt); vaf.va = &args; if (sb) printk("%c%cNILFS (%s): %pV\n", KERN_SOH_ASCII, level, sb->s_id, &vaf); else printk("%c%cNILFS: %pV\n", KERN_SOH_ASCII, level, &vaf); va_end(args); } static void nilfs_set_error(struct super_block *sb) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_super_block **sbp; down_write(&nilfs->ns_sem); if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) { nilfs->ns_mount_state |= NILFS_ERROR_FS; sbp = nilfs_prepare_super(sb, 0); if (likely(sbp)) { sbp[0]->s_state |= cpu_to_le16(NILFS_ERROR_FS); if (sbp[1]) sbp[1]->s_state |= cpu_to_le16(NILFS_ERROR_FS); nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL); } } up_write(&nilfs->ns_sem); } /** * __nilfs_error() - report failure condition on a filesystem * @sb: super block instance * @function: name of calling function * @fmt: format string for message to be output * @...: optional arguments to @fmt * * __nilfs_error() sets an ERROR_FS flag on the superblock as well as * reporting an error message. This function should be called when * NILFS detects incoherences or defects of meta data on disk. * * This implements the body of nilfs_error() macro. Normally, * nilfs_error() should be used. As for sustainable errors such as a * single-shot I/O error, nilfs_err() should be used instead. * * Callers should not add a trailing newline since this will do it. */ void __nilfs_error(struct super_block *sb, const char *function, const char *fmt, ...) 
{ struct the_nilfs *nilfs = sb->s_fs_info; struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_CRIT "NILFS error (device %s): %s: %pV\n", sb->s_id, function, &vaf); va_end(args); if (!sb_rdonly(sb)) { nilfs_set_error(sb); if (nilfs_test_opt(nilfs, ERRORS_RO)) { printk(KERN_CRIT "Remounting filesystem read-only\n"); sb->s_flags |= SB_RDONLY; } } if (nilfs_test_opt(nilfs, ERRORS_PANIC)) panic("NILFS (device %s): panic forced after error\n", sb->s_id); } struct inode *nilfs_alloc_inode(struct super_block *sb) { struct nilfs_inode_info *ii; ii = alloc_inode_sb(sb, nilfs_inode_cachep, GFP_NOFS); if (!ii) return NULL; ii->i_bh = NULL; ii->i_state = 0; ii->i_type = 0; ii->i_cno = 0; ii->i_assoc_inode = NULL; ii->i_bmap = &ii->i_bmap_data; return &ii->vfs_inode; } static void nilfs_free_inode(struct inode *inode) { if (nilfs_is_metadata_file_inode(inode)) nilfs_mdt_destroy(inode); kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); } static int nilfs_sync_super(struct super_block *sb, int flag) { struct the_nilfs *nilfs = sb->s_fs_info; int err; retry: set_buffer_dirty(nilfs->ns_sbh[0]); if (nilfs_test_opt(nilfs, BARRIER)) { err = __sync_dirty_buffer(nilfs->ns_sbh[0], REQ_SYNC | REQ_PREFLUSH | REQ_FUA); } else { err = sync_dirty_buffer(nilfs->ns_sbh[0]); } if (unlikely(err)) { nilfs_err(sb, "unable to write superblock: err=%d", err); if (err == -EIO && nilfs->ns_sbh[1]) { /* * sbp[0] points to newer log than sbp[1], * so copy sbp[0] to sbp[1] to take over sbp[0]. */ memcpy(nilfs->ns_sbp[1], nilfs->ns_sbp[0], nilfs->ns_sbsize); nilfs_fall_back_super_block(nilfs); goto retry; } } else { struct nilfs_super_block *sbp = nilfs->ns_sbp[0]; nilfs->ns_sbwcount++; /* * The latest segment becomes trailable from the position * written in superblock. */ clear_nilfs_discontinued(nilfs); /* update GC protection for recent segments */ if (nilfs->ns_sbh[1]) { if (flag == NILFS_SB_COMMIT_ALL) { set_buffer_dirty(nilfs->ns_sbh[1]); if (sync_dirty_buffer(nilfs->ns_sbh[1]) < 0) goto out; } if (le64_to_cpu(nilfs->ns_sbp[1]->s_last_cno) < le64_to_cpu(nilfs->ns_sbp[0]->s_last_cno)) sbp = nilfs->ns_sbp[1]; } spin_lock(&nilfs->ns_last_segment_lock); nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq); spin_unlock(&nilfs->ns_last_segment_lock); } out: return err; } void nilfs_set_log_cursor(struct nilfs_super_block *sbp, struct the_nilfs *nilfs) { sector_t nfreeblocks; /* nilfs->ns_sem must be locked by the caller. */ nilfs_count_free_blocks(nilfs, &nfreeblocks); sbp->s_free_blocks_count = cpu_to_le64(nfreeblocks); spin_lock(&nilfs->ns_last_segment_lock); sbp->s_last_seq = cpu_to_le64(nilfs->ns_last_seq); sbp->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg); sbp->s_last_cno = cpu_to_le64(nilfs->ns_last_cno); spin_unlock(&nilfs->ns_last_segment_lock); } struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb, int flip) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_super_block **sbp = nilfs->ns_sbp; /* nilfs->ns_sem must be locked by the caller. 
*/ if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) { if (sbp[1] && sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) { memcpy(sbp[0], sbp[1], nilfs->ns_sbsize); } else { nilfs_crit(sb, "superblock broke"); return NULL; } } else if (sbp[1] && sbp[1]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) { memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); } if (flip && sbp[1]) nilfs_swap_super_block(nilfs); return sbp; } int nilfs_commit_super(struct super_block *sb, int flag) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_super_block **sbp = nilfs->ns_sbp; time64_t t; /* nilfs->ns_sem must be locked by the caller. */ t = ktime_get_real_seconds(); nilfs->ns_sbwtime = t; sbp[0]->s_wtime = cpu_to_le64(t); sbp[0]->s_sum = 0; sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed, (unsigned char *)sbp[0], nilfs->ns_sbsize)); if (flag == NILFS_SB_COMMIT_ALL && sbp[1]) { sbp[1]->s_wtime = sbp[0]->s_wtime; sbp[1]->s_sum = 0; sbp[1]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed, (unsigned char *)sbp[1], nilfs->ns_sbsize)); } clear_nilfs_sb_dirty(nilfs); nilfs->ns_flushed_device = 1; /* make sure store to ns_flushed_device cannot be reordered */ smp_wmb(); return nilfs_sync_super(sb, flag); } /** * nilfs_cleanup_super() - write filesystem state for cleanup * @sb: super block instance to be unmounted or degraded to read-only * * This function restores state flags in the on-disk super block. * This will set "clean" flag (i.e. NILFS_VALID_FS) unless the * filesystem was not clean previously. * * Return: 0 on success, %-EIO if I/O error or superblock is corrupted. */ int nilfs_cleanup_super(struct super_block *sb) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_super_block **sbp; int flag = NILFS_SB_COMMIT; int ret = -EIO; sbp = nilfs_prepare_super(sb, 0); if (sbp) { sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state); nilfs_set_log_cursor(sbp[0], nilfs); if (sbp[1] && sbp[0]->s_last_cno == sbp[1]->s_last_cno) { /* * make the "clean" flag also to the opposite * super block if both super blocks point to * the same checkpoint. */ sbp[1]->s_state = sbp[0]->s_state; flag = NILFS_SB_COMMIT_ALL; } ret = nilfs_commit_super(sb, flag); } return ret; } /** * nilfs_move_2nd_super - relocate secondary super block * @sb: super block instance * @sb2off: new offset of the secondary super block (in bytes) * * Return: 0 on success, or a negative error code on failure. */ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off) { struct the_nilfs *nilfs = sb->s_fs_info; struct buffer_head *nsbh; struct nilfs_super_block *nsbp; sector_t blocknr, newblocknr; unsigned long offset; int sb2i; /* array index of the secondary superblock */ int ret = 0; /* nilfs->ns_sem must be locked by the caller. 
*/ if (nilfs->ns_sbh[1] && nilfs->ns_sbh[1]->b_blocknr > nilfs->ns_first_data_block) { sb2i = 1; blocknr = nilfs->ns_sbh[1]->b_blocknr; } else if (nilfs->ns_sbh[0]->b_blocknr > nilfs->ns_first_data_block) { sb2i = 0; blocknr = nilfs->ns_sbh[0]->b_blocknr; } else { sb2i = -1; blocknr = 0; } if (sb2i >= 0 && (u64)blocknr << nilfs->ns_blocksize_bits == sb2off) goto out; /* super block location is unchanged */ /* Get new super block buffer */ newblocknr = sb2off >> nilfs->ns_blocksize_bits; offset = sb2off & (nilfs->ns_blocksize - 1); nsbh = sb_getblk(sb, newblocknr); if (!nsbh) { nilfs_warn(sb, "unable to move secondary superblock to block %llu", (unsigned long long)newblocknr); ret = -EIO; goto out; } nsbp = (void *)nsbh->b_data + offset; lock_buffer(nsbh); if (sb2i >= 0) { /* * The position of the second superblock only changes by 4KiB, * which is larger than the maximum superblock data size * (= 1KiB), so there is no need to use memmove() to allow * overlap between source and destination. */ memcpy(nsbp, nilfs->ns_sbp[sb2i], nilfs->ns_sbsize); /* * Zero fill after copy to avoid overwriting in case of move * within the same block. */ memset(nsbh->b_data, 0, offset); memset((void *)nsbp + nilfs->ns_sbsize, 0, nsbh->b_size - offset - nilfs->ns_sbsize); } else { memset(nsbh->b_data, 0, nsbh->b_size); } set_buffer_uptodate(nsbh); unlock_buffer(nsbh); if (sb2i >= 0) { brelse(nilfs->ns_sbh[sb2i]); nilfs->ns_sbh[sb2i] = nsbh; nilfs->ns_sbp[sb2i] = nsbp; } else if (nilfs->ns_sbh[0]->b_blocknr < nilfs->ns_first_data_block) { /* secondary super block will be restored to index 1 */ nilfs->ns_sbh[1] = nsbh; nilfs->ns_sbp[1] = nsbp; } else { brelse(nsbh); } out: return ret; } /** * nilfs_resize_fs - resize the filesystem * @sb: super block instance * @newsize: new size of the filesystem (in bytes) * * Return: 0 on success, or a negative error code on failure. */ int nilfs_resize_fs(struct super_block *sb, __u64 newsize) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_super_block **sbp; __u64 devsize, newnsegs; loff_t sb2off; int ret; ret = -ERANGE; devsize = bdev_nr_bytes(sb->s_bdev); if (newsize > devsize) goto out; /* * Prevent underflow in second superblock position calculation. * The exact minimum size check is done in nilfs_sufile_resize(). */ if (newsize < 4096) { ret = -ENOSPC; goto out; } /* * Write lock is required to protect some functions depending * on the number of segments, the number of reserved segments, * and so forth. */ down_write(&nilfs->ns_segctor_sem); sb2off = NILFS_SB2_OFFSET_BYTES(newsize); newnsegs = sb2off >> nilfs->ns_blocksize_bits; newnsegs = div64_ul(newnsegs, nilfs->ns_blocks_per_segment); ret = nilfs_sufile_resize(nilfs->ns_sufile, newnsegs); up_write(&nilfs->ns_segctor_sem); if (ret < 0) goto out; ret = nilfs_construct_segment(sb); if (ret < 0) goto out; down_write(&nilfs->ns_sem); nilfs_move_2nd_super(sb, sb2off); ret = -EIO; sbp = nilfs_prepare_super(sb, 0); if (likely(sbp)) { nilfs_set_log_cursor(sbp[0], nilfs); /* * Drop NILFS_RESIZE_FS flag for compatibility with * mount-time resize which may be implemented in a * future release. */ sbp[0]->s_state = cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_RESIZE_FS); sbp[0]->s_dev_size = cpu_to_le64(newsize); sbp[0]->s_nsegments = cpu_to_le64(nilfs->ns_nsegments); if (sbp[1]) memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); ret = nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL); } up_write(&nilfs->ns_sem); /* * Reset the range of allocatable segments last. 
This order * is important in the case of expansion because the secondary * superblock must be protected from log write until migration * completes. */ if (!ret) nilfs_sufile_set_alloc_range(nilfs->ns_sufile, 0, newnsegs - 1); out: return ret; } static void nilfs_put_super(struct super_block *sb) { struct the_nilfs *nilfs = sb->s_fs_info; nilfs_detach_log_writer(sb); if (!sb_rdonly(sb)) { down_write(&nilfs->ns_sem); nilfs_cleanup_super(sb); up_write(&nilfs->ns_sem); } nilfs_sysfs_delete_device_group(nilfs); iput(nilfs->ns_sufile); iput(nilfs->ns_cpfile); iput(nilfs->ns_dat); destroy_nilfs(nilfs); sb->s_fs_info = NULL; } static int nilfs_sync_fs(struct super_block *sb, int wait) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_super_block **sbp; int err = 0; /* This function is called when super block should be written back */ if (wait) err = nilfs_construct_segment(sb); down_write(&nilfs->ns_sem); if (nilfs_sb_dirty(nilfs)) { sbp = nilfs_prepare_super(sb, nilfs_sb_will_flip(nilfs)); if (likely(sbp)) { nilfs_set_log_cursor(sbp[0], nilfs); nilfs_commit_super(sb, NILFS_SB_COMMIT); } } up_write(&nilfs->ns_sem); if (!err) err = nilfs_flush_device(nilfs); return err; } int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt, struct nilfs_root **rootp) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_root *root; int err = -ENOMEM; root = nilfs_find_or_create_root( nilfs, curr_mnt ? NILFS_CPTREE_CURRENT_CNO : cno); if (!root) return err; if (root->ifile) goto reuse; /* already attached checkpoint */ down_read(&nilfs->ns_segctor_sem); err = nilfs_ifile_read(sb, root, cno, nilfs->ns_inode_size); up_read(&nilfs->ns_segctor_sem); if (unlikely(err)) goto failed; reuse: *rootp = root; return 0; failed: if (err == -EINVAL) nilfs_err(sb, "Invalid checkpoint (checkpoint number=%llu)", (unsigned long long)cno); nilfs_put_root(root); return err; } static int nilfs_freeze(struct super_block *sb) { struct the_nilfs *nilfs = sb->s_fs_info; int err; if (sb_rdonly(sb)) return 0; /* Mark super block clean */ down_write(&nilfs->ns_sem); err = nilfs_cleanup_super(sb); up_write(&nilfs->ns_sem); return err; } static int nilfs_unfreeze(struct super_block *sb) { struct the_nilfs *nilfs = sb->s_fs_info; if (sb_rdonly(sb)) return 0; down_write(&nilfs->ns_sem); nilfs_setup_super(sb, false); up_write(&nilfs->ns_sem); return 0; } static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct nilfs_root *root = NILFS_I(d_inode(dentry))->i_root; struct the_nilfs *nilfs = root->nilfs; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); unsigned long long blocks; unsigned long overhead; unsigned long nrsvblocks; sector_t nfreeblocks; u64 nmaxinodes, nfreeinodes; int err; /* * Compute all of the segment blocks * * The blocks before first segment and after last segment * are excluded. */ blocks = nilfs->ns_blocks_per_segment * nilfs->ns_nsegments - nilfs->ns_first_data_block; nrsvblocks = nilfs->ns_nrsvsegs * nilfs->ns_blocks_per_segment; /* * Compute the overhead * * When distributing meta data blocks outside segment structure, * We must count them as the overhead. 
*/ overhead = 0; err = nilfs_count_free_blocks(nilfs, &nfreeblocks); if (unlikely(err)) return err; err = nilfs_ifile_count_free_inodes(root->ifile, &nmaxinodes, &nfreeinodes); if (unlikely(err)) { nilfs_warn(sb, "failed to count free inodes: err=%d", err); if (err == -ERANGE) { /* * If nilfs_palloc_count_max_entries() returns * -ERANGE error code then we simply treat * curent inodes count as maximum possible and * zero as free inodes value. */ nmaxinodes = atomic64_read(&root->inodes_count); nfreeinodes = 0; err = 0; } else return err; } buf->f_type = NILFS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = blocks - overhead; buf->f_bfree = nfreeblocks; buf->f_bavail = (buf->f_bfree >= nrsvblocks) ? (buf->f_bfree - nrsvblocks) : 0; buf->f_files = nmaxinodes; buf->f_ffree = nfreeinodes; buf->f_namelen = NILFS_NAME_LEN; buf->f_fsid = u64_to_fsid(id); return 0; } static int nilfs_show_options(struct seq_file *seq, struct dentry *dentry) { struct super_block *sb = dentry->d_sb; struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_root *root = NILFS_I(d_inode(dentry))->i_root; if (!nilfs_test_opt(nilfs, BARRIER)) seq_puts(seq, ",nobarrier"); if (root->cno != NILFS_CPTREE_CURRENT_CNO) seq_printf(seq, ",cp=%llu", (unsigned long long)root->cno); if (nilfs_test_opt(nilfs, ERRORS_PANIC)) seq_puts(seq, ",errors=panic"); if (nilfs_test_opt(nilfs, ERRORS_CONT)) seq_puts(seq, ",errors=continue"); if (nilfs_test_opt(nilfs, STRICT_ORDER)) seq_puts(seq, ",order=strict"); if (nilfs_test_opt(nilfs, NORECOVERY)) seq_puts(seq, ",norecovery"); if (nilfs_test_opt(nilfs, DISCARD)) seq_puts(seq, ",discard"); return 0; } static const struct super_operations nilfs_sops = { .alloc_inode = nilfs_alloc_inode, .free_inode = nilfs_free_inode, .dirty_inode = nilfs_dirty_inode, .evict_inode = nilfs_evict_inode, .put_super = nilfs_put_super, .sync_fs = nilfs_sync_fs, .freeze_fs = nilfs_freeze, .unfreeze_fs = nilfs_unfreeze, .statfs = nilfs_statfs, .show_options = nilfs_show_options }; enum { Opt_err, Opt_barrier, Opt_snapshot, Opt_order, Opt_norecovery, Opt_discard, }; static const struct constant_table nilfs_param_err[] = { {"continue", NILFS_MOUNT_ERRORS_CONT}, {"panic", NILFS_MOUNT_ERRORS_PANIC}, {"remount-ro", NILFS_MOUNT_ERRORS_RO}, {} }; static const struct fs_parameter_spec nilfs_param_spec[] = { fsparam_enum ("errors", Opt_err, nilfs_param_err), fsparam_flag_no ("barrier", Opt_barrier), fsparam_u64 ("cp", Opt_snapshot), fsparam_string ("order", Opt_order), fsparam_flag ("norecovery", Opt_norecovery), fsparam_flag_no ("discard", Opt_discard), {} }; struct nilfs_fs_context { unsigned long ns_mount_opt; __u64 cno; }; static int nilfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct nilfs_fs_context *nilfs = fc->fs_private; int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE; struct fs_parse_result result; int opt; opt = fs_parse(fc, nilfs_param_spec, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_barrier: if (result.negated) nilfs_clear_opt(nilfs, BARRIER); else nilfs_set_opt(nilfs, BARRIER); break; case Opt_order: if (strcmp(param->string, "relaxed") == 0) /* Ordered data semantics */ nilfs_clear_opt(nilfs, STRICT_ORDER); else if (strcmp(param->string, "strict") == 0) /* Strict in-order semantics */ nilfs_set_opt(nilfs, STRICT_ORDER); else return -EINVAL; break; case Opt_err: nilfs->ns_mount_opt &= ~NILFS_MOUNT_ERROR_MODE; nilfs->ns_mount_opt |= result.uint_32; break; case Opt_snapshot: if (is_remount) { struct super_block *sb = fc->root->d_sb; nilfs_err(sb, 
"\"%s\" option is invalid for remount", param->key); return -EINVAL; } if (result.uint_64 == 0) { nilfs_err(NULL, "invalid option \"cp=0\": invalid checkpoint number 0"); return -EINVAL; } nilfs->cno = result.uint_64; break; case Opt_norecovery: nilfs_set_opt(nilfs, NORECOVERY); break; case Opt_discard: if (result.negated) nilfs_clear_opt(nilfs, DISCARD); else nilfs_set_opt(nilfs, DISCARD); break; default: return -EINVAL; } return 0; } static int nilfs_setup_super(struct super_block *sb, int is_mount) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_super_block **sbp; int max_mnt_count; int mnt_count; /* nilfs->ns_sem must be locked by the caller. */ sbp = nilfs_prepare_super(sb, 0); if (!sbp) return -EIO; if (!is_mount) goto skip_mount_setup; max_mnt_count = le16_to_cpu(sbp[0]->s_max_mnt_count); mnt_count = le16_to_cpu(sbp[0]->s_mnt_count); if (nilfs->ns_mount_state & NILFS_ERROR_FS) { nilfs_warn(sb, "mounting fs with errors"); #if 0 } else if (max_mnt_count >= 0 && mnt_count >= max_mnt_count) { nilfs_warn(sb, "maximal mount count reached"); #endif } if (!max_mnt_count) sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT); sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1); sbp[0]->s_mtime = cpu_to_le64(ktime_get_real_seconds()); skip_mount_setup: sbp[0]->s_state = cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS); /* synchronize sbp[1] with sbp[0] */ if (sbp[1]) memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); return nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL); } struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb, u64 pos, int blocksize, struct buffer_head **pbh) { unsigned long long sb_index = pos; unsigned long offset; offset = do_div(sb_index, blocksize); *pbh = sb_bread(sb, sb_index); if (!*pbh) return NULL; return (struct nilfs_super_block *)((char *)(*pbh)->b_data + offset); } int nilfs_store_magic(struct super_block *sb, struct nilfs_super_block *sbp) { struct the_nilfs *nilfs = sb->s_fs_info; sb->s_magic = le16_to_cpu(sbp->s_magic); /* FS independent flags */ #ifdef NILFS_ATIME_DISABLE sb->s_flags |= SB_NOATIME; #endif nilfs->ns_resuid = le16_to_cpu(sbp->s_def_resuid); nilfs->ns_resgid = le16_to_cpu(sbp->s_def_resgid); nilfs->ns_interval = le32_to_cpu(sbp->s_c_interval); nilfs->ns_watermark = le32_to_cpu(sbp->s_c_block_max); return 0; } int nilfs_check_feature_compatibility(struct super_block *sb, struct nilfs_super_block *sbp) { __u64 features; features = le64_to_cpu(sbp->s_feature_incompat) & ~NILFS_FEATURE_INCOMPAT_SUPP; if (features) { nilfs_err(sb, "couldn't mount because of unsupported optional features (%llx)", (unsigned long long)features); return -EINVAL; } features = le64_to_cpu(sbp->s_feature_compat_ro) & ~NILFS_FEATURE_COMPAT_RO_SUPP; if (!sb_rdonly(sb) && features) { nilfs_err(sb, "couldn't mount RDWR because of unsupported optional features (%llx)", (unsigned long long)features); return -EINVAL; } return 0; } static int nilfs_get_root_dentry(struct super_block *sb, struct nilfs_root *root, struct dentry **root_dentry) { struct inode *inode; struct dentry *dentry; int ret = 0; inode = nilfs_iget(sb, root, NILFS_ROOT_INO); if (IS_ERR(inode)) { ret = PTR_ERR(inode); nilfs_err(sb, "error %d getting root inode", ret); goto out; } if (!S_ISDIR(inode->i_mode) || !inode->i_blocks || !inode->i_size) { iput(inode); nilfs_err(sb, "corrupt root inode"); ret = -EINVAL; goto out; } if (root->cno == NILFS_CPTREE_CURRENT_CNO) { dentry = d_find_alias(inode); if (!dentry) { dentry = d_make_root(inode); if (!dentry) { ret = -ENOMEM; goto 
failed_dentry; } } else { iput(inode); } } else { dentry = d_obtain_root(inode); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); goto failed_dentry; } } *root_dentry = dentry; out: return ret; failed_dentry: nilfs_err(sb, "error %d getting root dentry", ret); goto out; } static int nilfs_attach_snapshot(struct super_block *s, __u64 cno, struct dentry **root_dentry) { struct the_nilfs *nilfs = s->s_fs_info; struct nilfs_root *root; int ret; mutex_lock(&nilfs->ns_snapshot_mount_mutex); down_read(&nilfs->ns_segctor_sem); ret = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, cno); up_read(&nilfs->ns_segctor_sem); if (ret < 0) { ret = (ret == -ENOENT) ? -EINVAL : ret; goto out; } else if (!ret) { nilfs_err(s, "The specified checkpoint is not a snapshot (checkpoint number=%llu)", (unsigned long long)cno); ret = -EINVAL; goto out; } ret = nilfs_attach_checkpoint(s, cno, false, &root); if (ret) { nilfs_err(s, "error %d while loading snapshot (checkpoint number=%llu)", ret, (unsigned long long)cno); goto out; } ret = nilfs_get_root_dentry(s, root, root_dentry); nilfs_put_root(root); out: mutex_unlock(&nilfs->ns_snapshot_mount_mutex); return ret; } /** * nilfs_tree_is_busy() - try to shrink dentries of a checkpoint * @root_dentry: root dentry of the tree to be shrunk * * Return: true if the tree was in-use, false otherwise. */ static bool nilfs_tree_is_busy(struct dentry *root_dentry) { shrink_dcache_parent(root_dentry); return d_count(root_dentry) > 1; } int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_root *root; struct inode *inode; struct dentry *dentry; int ret; if (cno > nilfs->ns_cno) return false; if (cno >= nilfs_last_cno(nilfs)) return true; /* protect recent checkpoints */ ret = false; root = nilfs_lookup_root(nilfs, cno); if (root) { inode = nilfs_ilookup(sb, root, NILFS_ROOT_INO); if (inode) { dentry = d_find_alias(inode); if (dentry) { ret = nilfs_tree_is_busy(dentry); dput(dentry); } iput(inode); } nilfs_put_root(root); } return ret; } /** * nilfs_fill_super() - initialize a super block instance * @sb: super_block * @fc: filesystem context * * This function is called exclusively by nilfs->ns_mount_mutex. * So, the recovery process is protected from other simultaneous mounts. * * Return: 0 on success, or a negative error code on failure. 
*/ static int nilfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct the_nilfs *nilfs; struct nilfs_root *fsroot; struct nilfs_fs_context *ctx = fc->fs_private; __u64 cno; int err; nilfs = alloc_nilfs(sb); if (!nilfs) return -ENOMEM; sb->s_fs_info = nilfs; err = init_nilfs(nilfs, sb); if (err) goto failed_nilfs; /* Copy in parsed mount options */ nilfs->ns_mount_opt = ctx->ns_mount_opt; sb->s_op = &nilfs_sops; sb->s_export_op = &nilfs_export_ops; sb->s_root = NULL; sb->s_time_gran = 1; sb->s_max_links = NILFS_LINK_MAX; sb->s_bdi = bdi_get(sb->s_bdev->bd_disk->bdi); err = load_nilfs(nilfs, sb); if (err) goto failed_nilfs; super_set_uuid(sb, nilfs->ns_sbp[0]->s_uuid, sizeof(nilfs->ns_sbp[0]->s_uuid)); super_set_sysfs_name_bdev(sb); cno = nilfs_last_cno(nilfs); err = nilfs_attach_checkpoint(sb, cno, true, &fsroot); if (err) { nilfs_err(sb, "error %d while loading last checkpoint (checkpoint number=%llu)", err, (unsigned long long)cno); goto failed_unload; } if (!sb_rdonly(sb)) { err = nilfs_attach_log_writer(sb, fsroot); if (err) goto failed_checkpoint; } err = nilfs_get_root_dentry(sb, fsroot, &sb->s_root); if (err) goto failed_segctor; nilfs_put_root(fsroot); if (!sb_rdonly(sb)) { down_write(&nilfs->ns_sem); nilfs_setup_super(sb, true); up_write(&nilfs->ns_sem); } return 0; failed_segctor: nilfs_detach_log_writer(sb); failed_checkpoint: nilfs_put_root(fsroot); failed_unload: nilfs_sysfs_delete_device_group(nilfs); iput(nilfs->ns_sufile); iput(nilfs->ns_cpfile); iput(nilfs->ns_dat); failed_nilfs: destroy_nilfs(nilfs); return err; } static int nilfs_reconfigure(struct fs_context *fc) { struct nilfs_fs_context *ctx = fc->fs_private; struct super_block *sb = fc->root->d_sb; struct the_nilfs *nilfs = sb->s_fs_info; int err; sync_filesystem(sb); err = -EINVAL; if (!nilfs_valid_fs(nilfs)) { nilfs_warn(sb, "couldn't remount because the filesystem is in an incomplete recovery state"); goto ignore_opts; } if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb)) goto out; if (fc->sb_flags & SB_RDONLY) { sb->s_flags |= SB_RDONLY; /* * Remounting a valid RW partition RDONLY, so set * the RDONLY flag and then mark the partition as valid again. */ down_write(&nilfs->ns_sem); nilfs_cleanup_super(sb); up_write(&nilfs->ns_sem); } else { __u64 features; struct nilfs_root *root; /* * Mounting a RDONLY partition read-write, so reread and * store the current valid flag. (It may have been changed * by fsck since we originally mounted the partition.) 
*/ down_read(&nilfs->ns_sem); features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) & ~NILFS_FEATURE_COMPAT_RO_SUPP; up_read(&nilfs->ns_sem); if (features) { nilfs_warn(sb, "couldn't remount RDWR because of unsupported optional features (%llx)", (unsigned long long)features); err = -EROFS; goto ignore_opts; } sb->s_flags &= ~SB_RDONLY; root = NILFS_I(d_inode(sb->s_root))->i_root; err = nilfs_attach_log_writer(sb, root); if (err) { sb->s_flags |= SB_RDONLY; goto ignore_opts; } down_write(&nilfs->ns_sem); nilfs_setup_super(sb, true); up_write(&nilfs->ns_sem); } out: sb->s_flags = (sb->s_flags & ~SB_POSIXACL); /* Copy over parsed remount options */ nilfs->ns_mount_opt = ctx->ns_mount_opt; return 0; ignore_opts: return err; } static int nilfs_get_tree(struct fs_context *fc) { struct nilfs_fs_context *ctx = fc->fs_private; struct super_block *s; dev_t dev; int err; if (ctx->cno && !(fc->sb_flags & SB_RDONLY)) { nilfs_err(NULL, "invalid option \"cp=%llu\": read-only option is not specified", ctx->cno); return -EINVAL; } err = lookup_bdev(fc->source, &dev); if (err) return err; s = sget_dev(fc, dev); if (IS_ERR(s)) return PTR_ERR(s); if (!s->s_root) { err = setup_bdev_super(s, fc->sb_flags, fc); if (!err) err = nilfs_fill_super(s, fc); if (err) goto failed_super; s->s_flags |= SB_ACTIVE; } else if (!ctx->cno) { if (nilfs_tree_is_busy(s->s_root)) { if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) { nilfs_err(s, "the device already has a %s mount.", sb_rdonly(s) ? "read-only" : "read/write"); err = -EBUSY; goto failed_super; } } else { /* * Try reconfigure to setup mount states if the current * tree is not mounted and only snapshots use this sb. * * Since nilfs_reconfigure() requires fc->root to be * set, set it first and release it on failure. */ fc->root = dget(s->s_root); err = nilfs_reconfigure(fc); if (err) { dput(fc->root); fc->root = NULL; /* prevent double release */ goto failed_super; } return 0; } } if (ctx->cno) { struct dentry *root_dentry; err = nilfs_attach_snapshot(s, ctx->cno, &root_dentry); if (err) goto failed_super; fc->root = root_dentry; return 0; } fc->root = dget(s->s_root); return 0; failed_super: deactivate_locked_super(s); return err; } static void nilfs_free_fc(struct fs_context *fc) { kfree(fc->fs_private); } static const struct fs_context_operations nilfs_context_ops = { .parse_param = nilfs_parse_param, .get_tree = nilfs_get_tree, .reconfigure = nilfs_reconfigure, .free = nilfs_free_fc, }; static int nilfs_init_fs_context(struct fs_context *fc) { struct nilfs_fs_context *ctx; ctx = kzalloc_obj(*ctx); if (!ctx) return -ENOMEM; ctx->ns_mount_opt = NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER; fc->fs_private = ctx; fc->ops = &nilfs_context_ops; return 0; } struct file_system_type nilfs_fs_type = { .owner = THIS_MODULE, .name = "nilfs2", .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, .init_fs_context = nilfs_init_fs_context, .parameters = nilfs_param_spec, }; MODULE_ALIAS_FS("nilfs2"); static void nilfs_inode_init_once(void *obj) { struct nilfs_inode_info *ii = obj; INIT_LIST_HEAD(&ii->i_dirty); #ifdef CONFIG_NILFS_XATTR init_rwsem(&ii->xattr_sem); #endif inode_init_once(&ii->vfs_inode); } static void nilfs_segbuf_init_once(void *obj) { memset(obj, 0, sizeof(struct nilfs_segment_buffer)); } static void nilfs_destroy_cachep(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. 
 */
	rcu_barrier();
	kmem_cache_destroy(nilfs_inode_cachep);
	kmem_cache_destroy(nilfs_transaction_cachep);
	kmem_cache_destroy(nilfs_segbuf_cachep);
	kmem_cache_destroy(nilfs_btree_path_cache);
}

static int __init nilfs_init_cachep(void)
{
	nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
			sizeof(struct nilfs_inode_info), 0,
			SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, nilfs_inode_init_once);
	if (!nilfs_inode_cachep)
		goto fail;

	nilfs_transaction_cachep = kmem_cache_create("nilfs2_transaction_cache",
			sizeof(struct nilfs_transaction_info), 0,
			SLAB_RECLAIM_ACCOUNT, NULL);
	if (!nilfs_transaction_cachep)
		goto fail;

	nilfs_segbuf_cachep = kmem_cache_create("nilfs2_segbuf_cache",
			sizeof(struct nilfs_segment_buffer), 0,
			SLAB_RECLAIM_ACCOUNT, nilfs_segbuf_init_once);
	if (!nilfs_segbuf_cachep)
		goto fail;

	nilfs_btree_path_cache = kmem_cache_create("nilfs2_btree_path_cache",
			sizeof(struct nilfs_btree_path) * NILFS_BTREE_LEVEL_MAX,
			0, 0, NULL);
	if (!nilfs_btree_path_cache)
		goto fail;

	return 0;

fail:
	nilfs_destroy_cachep();
	return -ENOMEM;
}

static int __init init_nilfs_fs(void)
{
	int err;

	err = nilfs_init_cachep();
	if (err)
		goto fail;

	err = nilfs_sysfs_init();
	if (err)
		goto free_cachep;

	err = register_filesystem(&nilfs_fs_type);
	if (err)
		goto deinit_sysfs_entry;

	printk(KERN_INFO "NILFS version 2 loaded\n");
	return 0;

deinit_sysfs_entry:
	nilfs_sysfs_exit();
free_cachep:
	nilfs_destroy_cachep();
fail:
	return err;
}

static void __exit exit_nilfs_fs(void)
{
	nilfs_destroy_cachep();
	nilfs_sysfs_exit();
	unregister_filesystem(&nilfs_fs_type);
}

module_init(init_nilfs_fs)
module_exit(exit_nilfs_fs)
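/*
 * A minimal userspace sketch (not part of the original super.c): it shows how
 * the fs_context plumbing above is exercised by an ordinary mount(2) call.
 * The device path, mount point and checkpoint number are assumptions made for
 * illustration; checkpoint 17 must already have been turned into a snapshot
 * (e.g. with nilfs-utils' "chcp ss") before nilfs_attach_snapshot() will
 * accept it, and MS_RDONLY is mandatory because nilfs_get_tree() rejects
 * "cp=" without a read-only mount.
 */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Mount the current tree read/write. */
	if (mount("/dev/sdb1", "/mnt", "nilfs2", 0, NULL) != 0)
		perror("mount current tree");

	/* Mount checkpoint 17 as a read-only snapshot ("-o ro,cp=17"). */
	if (mount("/dev/sdb1", "/snap", "nilfs2", MS_RDONLY, "cp=17") != 0) {
		perror("mount snapshot");
		return 1;
	}
	return 0;
}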
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	TCP over IPv6
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on:
 *	linux/net/ipv4/tcp.c
 *	linux/net/ipv4/tcp_input.c
 *	linux/net/ipv4/tcp_output.c
 *
 *	Fixes:
 *	Hideaki YOSHIFUJI	:	sin6_scope_id support
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 *	YOSHIFUJI Hideaki @USAGI:	convert /proc/net/tcp6 to seq_file.
 */

#include <linux/bottom_half.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/jiffies.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
#include <linux/init.h>
#include <linux/jhash.h>
#include <linux/ipsec.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/ipv6.h>
#include <linux/icmpv6.h>
#include <linux/random.h>
#include <linux/indirect_call_wrapper.h>

#include <net/aligned_data.h>
#include <net/tcp.h>
#include <net/ndisc.h>
#include <net/inet6_hashtables.h>
#include <net/inet6_connection_sock.h>
#include <net/ipv6.h>
#include <net/transp_v6.h>
#include <net/addrconf.h>
#include <net/ip6_route.h>
#include <net/ip6_checksum.h>
#include <net/inet_ecn.h>
#include <net/protocol.h>
#include <net/xfrm.h>
#include <net/snmp.h>
#include <net/dsfield.h>
#include <net/timewait_sock.h>
#include <net/inet_common.h>
#include <net/secure_seq.h>
#include <net/hotdata.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>
#include <net/psp.h>

#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <crypto/md5.h>
#include <crypto/utils.h>

#include <trace/events/tcp.h>

static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb,
			      enum sk_rst_reason reason);
static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req);

INDIRECT_CALLABLE_SCOPE int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);

static const struct inet_connection_sock_af_ops ipv6_mapped;
const struct inet_connection_sock_af_ops ipv6_specific;
#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
static const struct tcp_sock_af_ops tcp_sock_ipv6_specific;
static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific;
#endif

/* Helper returning the inet6 address from a given tcp socket.
 * It can be used in TCP stack instead of inet6_sk(sk).
 * This avoids a dereference and allow compiler optimizations.
 * It is a specialized version of inet6_sk_generic().
*/ #define tcp_inet6_sk(sk) (&container_of_const(tcp_sk(sk), \ struct tcp6_sock, tcp)->inet6) static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); if (dst && dst_hold_safe(dst)) { rcu_assign_pointer(sk->sk_rx_dst, dst); sk->sk_rx_dst_ifindex = skb->skb_iif; sk->sk_rx_dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); } } INDIRECT_CALLABLE_SCOPE union tcp_seq_and_ts_off tcp_v6_init_seq_and_ts_off(const struct net *net, const struct sk_buff *skb) { return secure_tcpv6_seq_and_ts_off(net, ipv6_hdr(skb)->daddr.s6_addr32, ipv6_hdr(skb)->saddr.s6_addr32, tcp_hdr(skb)->dest, tcp_hdr(skb)->source); } static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { /* This check is replicated from tcp_v6_connect() and intended to * prevent BPF program called below from accessing bytes that are out * of the bound specified by user in addr_len. */ if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; sock_owned_by_me(sk); return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, &addr_len); } static int tcp_v6_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; struct inet_connection_sock *icsk = inet_csk(sk); struct inet_timewait_death_row *tcp_death_row; struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct in6_addr *saddr = NULL, *final_p; struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct ipv6_txoptions *opt; struct dst_entry *dst; struct flowi6 *fl6; int addr_type; int err; if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; if (usin->sin6_family != AF_INET6) return -EAFNOSUPPORT; fl6 = &inet_sk(sk)->cork.fl.u.ip6; memset(fl6, 0, sizeof(*fl6)); if (inet6_test_bit(SNDFLOW, sk)) { fl6->flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; IP6_ECN_flow_init(fl6->flowlabel); if (fl6->flowlabel & IPV6_FLOWLABEL_MASK) { struct ip6_flowlabel *flowlabel; flowlabel = fl6_sock_lookup(sk, fl6->flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; fl6_sock_release(flowlabel); } } /* * connect() to INADDR_ANY means loopback (BSD'ism). */ if (ipv6_addr_any(&usin->sin6_addr)) { if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK), &usin->sin6_addr); else usin->sin6_addr = in6addr_loopback; } addr_type = ipv6_addr_type(&usin->sin6_addr); if (addr_type & IPV6_ADDR_MULTICAST) return -ENETUNREACH; if (addr_type&IPV6_ADDR_LINKLOCAL) { if (addr_len >= sizeof(struct sockaddr_in6) && usin->sin6_scope_id) { /* If interface is set while binding, indices * must coincide. 
*/ if (!sk_dev_equal_l3scope(sk, usin->sin6_scope_id)) return -EINVAL; sk->sk_bound_dev_if = usin->sin6_scope_id; } /* Connect to link-local address requires an interface */ if (!sk->sk_bound_dev_if) return -EINVAL; } if (tp->rx_opt.ts_recent_stamp && !ipv6_addr_equal(&sk->sk_v6_daddr, &usin->sin6_addr)) { tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; WRITE_ONCE(tp->write_seq, 0); } sk->sk_v6_daddr = usin->sin6_addr; np->flow_label = fl6->flowlabel; /* * TCP over IPv4 */ if (addr_type & IPV6_ADDR_MAPPED) { u32 exthdrlen = icsk->icsk_ext_hdr_len; struct sockaddr_in sin; if (ipv6_only_sock(sk)) return -ENETUNREACH; sin.sin_family = AF_INET; sin.sin_port = usin->sin6_port; sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ WRITE_ONCE(icsk->icsk_af_ops, &ipv6_mapped); if (sk_is_mptcp(sk)) mptcpv6_handle_mapped(sk, true); sk->sk_backlog_rcv = tcp_v4_do_rcv; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tp->af_specific = &tcp_sock_ipv6_mapped_specific; #endif err = tcp_v4_connect(sk, (struct sockaddr_unsized *)&sin, sizeof(sin)); if (err) { icsk->icsk_ext_hdr_len = exthdrlen; /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ WRITE_ONCE(icsk->icsk_af_ops, &ipv6_specific); if (sk_is_mptcp(sk)) mptcpv6_handle_mapped(sk, false); sk->sk_backlog_rcv = tcp_v6_do_rcv; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tp->af_specific = &tcp_sock_ipv6_specific; #endif goto failure; } np->saddr = sk->sk_v6_rcv_saddr; return err; } if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) saddr = &sk->sk_v6_rcv_saddr; fl6->flowi6_proto = IPPROTO_TCP; fl6->daddr = sk->sk_v6_daddr; fl6->saddr = saddr ? *saddr : np->saddr; fl6->flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); fl6->flowi6_oif = sk->sk_bound_dev_if; fl6->flowi6_mark = sk->sk_mark; fl6->fl6_dport = usin->sin6_port; fl6->fl6_sport = inet->inet_sport; if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !fl6->fl6_sport) fl6->flowi6_flags = FLOWI_FLAG_ANY_SPORT; fl6->flowi6_uid = sk_uid(sk); opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); final_p = fl6_update_dst(fl6, opt, &np->final); security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); dst = ip6_dst_lookup_flow(net, sk, fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto failure; } tp->tcp_usec_ts = dst_tcp_usec_ts(dst); tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; if (!saddr) { saddr = &fl6->saddr; err = inet_bhash2_update_saddr(sk, saddr, AF_INET6); if (err) goto failure; } /* set the source address */ np->saddr = *saddr; inet->inet_rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; ip6_dst_store(sk, dst, false, false); icsk->icsk_ext_hdr_len = psp_sk_overhead(sk); if (opt) icsk->icsk_ext_hdr_len += opt->opt_flen + opt->opt_nflen; tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); inet->inet_dport = usin->sin6_port; tcp_set_state(sk, TCP_SYN_SENT); err = inet6_hash_connect(tcp_death_row, sk); if (err) goto late_failure; sk_set_txhash(sk); if (likely(!tp->repair)) { union tcp_seq_and_ts_off st; st = secure_tcpv6_seq_and_ts_off(net, np->saddr.s6_addr32, sk->sk_v6_daddr.s6_addr32, inet->inet_sport, inet->inet_dport); if (!tp->write_seq) WRITE_ONCE(tp->write_seq, st.seq); WRITE_ONCE(tp->tsoffset, st.ts_off); } if (tcp_fastopen_defer_connect(sk, &err)) return err; if (err) goto late_failure; err = tcp_connect(sk); if (err) goto late_failure; return 0; late_failure: tcp_set_state(sk, TCP_CLOSE); inet_bhash2_reset_saddr(sk); failure: 
inet->inet_dport = 0; sk->sk_route_caps = 0; return err; } static struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu) { struct flowi6 *fl6 = &inet_sk(sk)->cork.fl.u.ip6; struct dst_entry *dst; dst = inet6_csk_route_socket(sk, fl6); if (IS_ERR(dst)) return NULL; dst->ops->update_pmtu(dst, sk, NULL, mtu, true); dst = inet6_csk_route_socket(sk, fl6); return IS_ERR(dst) ? NULL : dst; } static void tcp_v6_mtu_reduced(struct sock *sk) { struct dst_entry *dst; u32 mtu, dmtu; if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) return; mtu = READ_ONCE(tcp_sk(sk)->mtu_info); /* Drop requests trying to increase our current mss. * Check done in __ip6_rt_update_pmtu() is too late. */ if (tcp_mtu_to_mss(sk, mtu) >= tcp_sk(sk)->mss_cache) return; dst = inet6_csk_update_pmtu(sk, mtu); if (!dst) return; dmtu = dst6_mtu(dst); if (inet_csk(sk)->icsk_pmtu_cookie > dmtu) { tcp_sync_mss(sk, dmtu); tcp_simple_retransmit(sk); } } static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); struct net *net = dev_net_rcu(skb->dev); struct request_sock *fastopen; struct ipv6_pinfo *np; struct tcp_sock *tp; __u32 seq, snd_una; struct sock *sk; bool fatal; int err; sk = __inet6_lookup_established(net, &hdr->daddr, th->dest, &hdr->saddr, ntohs(th->source), skb->dev->ifindex, inet6_sdif(skb)); if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); return -ENOENT; } if (sk->sk_state == TCP_TIME_WAIT) { /* To increase the counter of ignored icmps for TCP-AO */ tcp_ao_ignore_icmp(sk, AF_INET6, type, code); inet_twsk_put(inet_twsk(sk)); return 0; } seq = ntohl(th->seq); fatal = icmpv6_err_convert(type, code, &err); if (sk->sk_state == TCP_NEW_SYN_RECV) { tcp_req_err(sk, seq, fatal); return 0; } if (tcp_ao_ignore_icmp(sk, AF_INET6, type, code)) { sock_put(sk); return 0; } bh_lock_sock(sk); if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == TCP_CLOSE) goto out; if (static_branch_unlikely(&ip6_min_hopcount)) { /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */ if (ipv6_hdr(skb)->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) { __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); goto out; } } tp = tcp_sk(sk); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ fastopen = rcu_dereference(tp->fastopen_rsk); snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; if (sk->sk_state != TCP_LISTEN && !between(seq, snd_una, tp->snd_nxt)) { __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } np = tcp_inet6_sk(sk); if (type == NDISC_REDIRECT) { if (!sock_owned_by_user(sk)) { struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); if (dst) dst->ops->redirect(dst, sk, skb); } goto out; } if (type == ICMPV6_PKT_TOOBIG) { u32 mtu = ntohl(info); /* We are not interested in TCP_LISTEN and open_requests * (SYN-ACKs send out by Linux are always <576bytes so * they should go through unfragmented). 
*/ if (sk->sk_state == TCP_LISTEN) goto out; if (!ip6_sk_accept_pmtu(sk)) goto out; if (mtu < IPV6_MIN_MTU) goto out; WRITE_ONCE(tp->mtu_info, mtu); if (!sock_owned_by_user(sk)) tcp_v6_mtu_reduced(sk); else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); goto out; } /* Might be for an request_sock */ switch (sk->sk_state) { case TCP_SYN_SENT: case TCP_SYN_RECV: /* Only in fast or simultaneous open. If a fast open socket is * already accepted it is treated as a connected one below. */ if (fastopen && !fastopen->sk) break; ipv6_icmp_error(sk, skb, err, th->dest, ntohl(info), (u8 *)th); if (!sock_owned_by_user(sk)) tcp_done_with_error(sk, err); else WRITE_ONCE(sk->sk_err_soft, err); goto out; case TCP_LISTEN: break; default: /* check if this ICMP message allows revert of backoff. * (see RFC 6069) */ if (!fastopen && type == ICMPV6_DEST_UNREACH && code == ICMPV6_NOROUTE) tcp_ld_RTO_revert(sk, seq); } if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) { WRITE_ONCE(sk->sk_err, err); sk_error_report(sk); } else { WRITE_ONCE(sk->sk_err_soft, err); } out: bh_unlock_sock(sk); sock_put(sk); return 0; } static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); const struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct ipv6_txoptions *opt; struct flowi6 *fl6 = &fl->u.ip6; struct sk_buff *skb; int err = -ENOMEM; u8 tclass; /* First, grab a route. */ if (!dst && (dst = inet6_csk_route_req(sk, NULL, fl6, req, IPPROTO_TCP)) == NULL) goto done; skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); if (skb) { tcp_rsk(req)->syn_ect_snt = np->tclass & INET_ECN_MASK; __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr); fl6->daddr = ireq->ir_v6_rmt_addr; if (inet6_test_bit(REPFLOW, sk) && ireq->pktopts) fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); tclass = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ? (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | (np->tclass & INET_ECN_MASK) : np->tclass; if (!INET_ECN_is_capable(tclass) && tcp_bpf_ca_needs_ecn((struct sock *)req)) tclass |= INET_ECN_ECT_0; rcu_read_lock(); opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); err = ip6_xmit(sk, skb, fl6, skb->mark ? 
: READ_ONCE(sk->sk_mark), opt, tclass, READ_ONCE(sk->sk_priority)); rcu_read_unlock(); err = net_xmit_eval(err); } done: return err; } static void tcp_v6_reqsk_destructor(struct request_sock *req) { kfree(inet_rsk(req)->ipv6_opt); consume_skb(inet_rsk(req)->pktopts); } #ifdef CONFIG_TCP_MD5SIG static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk, const struct in6_addr *addr, int l3index) { return tcp_md5_do_lookup(sk, l3index, (union tcp_md5_addr *)addr, AF_INET6); } static struct tcp_md5sig_key *tcp_v6_md5_lookup(const struct sock *sk, const struct sock *addr_sk) { int l3index; l3index = l3mdev_master_ifindex_by_index(sock_net(sk), addr_sk->sk_bound_dev_if); return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr, l3index); } static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct tcp_md5sig cmd; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr; union tcp_ao_addr *addr; int l3index = 0; u8 prefixlen; bool l3flag; u8 flags; if (optlen < sizeof(cmd)) return -EINVAL; if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) return -EFAULT; if (sin6->sin6_family != AF_INET6) return -EINVAL; flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; if (optname == TCP_MD5SIG_EXT && cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { prefixlen = cmd.tcpm_prefixlen; if (prefixlen > 128 || (ipv6_addr_v4mapped(&sin6->sin6_addr) && prefixlen > 32)) return -EINVAL; } else { prefixlen = ipv6_addr_v4mapped(&sin6->sin6_addr) ? 32 : 128; } if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); if (dev && netif_is_l3_master(dev)) l3index = dev->ifindex; rcu_read_unlock(); /* ok to reference set/not set outside of rcu; * right now device MUST be an L3 master */ if (!dev || !l3index) return -EINVAL; } if (!cmd.tcpm_keylen) { if (ipv6_addr_v4mapped(&sin6->sin6_addr)) return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], AF_INET, prefixlen, l3index, flags); return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr, AF_INET6, prefixlen, l3index, flags); } if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) return -EINVAL; if (ipv6_addr_v4mapped(&sin6->sin6_addr)) { addr = (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3]; /* Don't allow keys for peers that have a matching TCP-AO key. * See the comment in tcp_ao_add_cmd() */ if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false)) return -EKEYREJECTED; return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, cmd.tcpm_key, cmd.tcpm_keylen); } addr = (union tcp_md5_addr *)&sin6->sin6_addr; /* Don't allow keys for peers that have a matching TCP-AO key. * See the comment in tcp_ao_add_cmd() */ if (tcp_ao_required(sk, addr, AF_INET6, l3flag ? 
l3index : -1, false)) return -EKEYREJECTED; return tcp_md5_do_add(sk, addr, AF_INET6, prefixlen, l3index, flags, cmd.tcpm_key, cmd.tcpm_keylen); } static void tcp_v6_md5_hash_headers(struct md5_ctx *ctx, const struct in6_addr *daddr, const struct in6_addr *saddr, const struct tcphdr *th, int nbytes) { struct { struct tcp6_pseudohdr ip; /* TCP pseudo-header (RFC2460) */ struct tcphdr tcp; } h; h.ip.saddr = *saddr; h.ip.daddr = *daddr; h.ip.protocol = cpu_to_be32(IPPROTO_TCP); h.ip.len = cpu_to_be32(nbytes); h.tcp = *th; h.tcp.check = 0; md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp)); } static noinline_for_stack void tcp_v6_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, const struct in6_addr *daddr, struct in6_addr *saddr, const struct tcphdr *th) { struct md5_ctx ctx; md5_init(&ctx); tcp_v6_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2); tcp_md5_hash_key(&ctx, key); md5_final(&ctx, md5_hash); } static noinline_for_stack void tcp_v6_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, const struct sock *sk, const struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); const struct in6_addr *saddr, *daddr; struct md5_ctx ctx; if (sk) { /* valid for establish/request sockets */ saddr = &sk->sk_v6_rcv_saddr; daddr = &sk->sk_v6_daddr; } else { const struct ipv6hdr *ip6h = ipv6_hdr(skb); saddr = &ip6h->saddr; daddr = &ip6h->daddr; } md5_init(&ctx); tcp_v6_md5_hash_headers(&ctx, daddr, saddr, th, skb->len); tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2); tcp_md5_hash_key(&ctx, key); md5_final(&ctx, md5_hash); } #endif static void tcp_v6_init_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb, u32 tw_isn) { bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags); struct inet_request_sock *ireq = inet_rsk(req); const struct ipv6_pinfo *np = tcp_inet6_sk(sk_listener); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; ireq->ir_rmt_addr = LOOPBACK4_IPV6; ireq->ir_loc_addr = LOOPBACK4_IPV6; /* So that link locals have meaning */ if ((!sk_listener->sk_bound_dev_if || l3_slave) && ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = tcp_v6_iif(skb); if (!tw_isn && (ipv6_opt_accepted(sk_listener, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim || inet6_test_bit(REPFLOW, sk_listener))) { refcount_inc(&skb->users); ireq->pktopts = skb; } } static struct dst_entry *tcp_v6_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, struct request_sock *req, u32 tw_isn) { tcp_v6_init_req(req, sk, skb, tw_isn); if (security_inet_conn_request(sk, skb, req)) return NULL; return inet6_csk_route_req(sk, NULL, &fl->u.ip6, req, IPPROTO_TCP); } struct request_sock_ops tcp6_request_sock_ops __read_mostly = { .family = AF_INET6, .obj_size = sizeof(struct tcp6_request_sock), .send_ack = tcp_v6_reqsk_send_ack, .destructor = tcp_v6_reqsk_destructor, .send_reset = tcp_v6_send_reset, }; const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr), #ifdef CONFIG_TCP_MD5SIG .req_md5_lookup = tcp_v6_md5_lookup, .calc_md5_hash = tcp_v6_md5_hash_skb, #endif #ifdef CONFIG_TCP_AO .ao_lookup = tcp_v6_ao_lookup_rsk, .ao_calc_key = tcp_v6_ao_calc_key_rsk, .ao_synack_hash = tcp_v6_ao_synack_hash, #endif #ifdef CONFIG_SYN_COOKIES .cookie_init_seq = cookie_v6_init_sequence, #endif .route_req = 
tcp_v6_route_req, .init_seq_and_ts_off = tcp_v6_init_seq_and_ts_off, .send_synack = tcp_v6_send_synack, }; static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, int rst, u8 tclass, __be32 label, u32 priority, u32 txhash, struct tcp_key *key) { struct net *net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb); unsigned int tot_len = sizeof(struct tcphdr); struct sock *ctl_sk = net->ipv6.tcp_sk; const struct tcphdr *th = tcp_hdr(skb); __be32 mrst = 0, *topt; struct dst_entry *dst; struct sk_buff *buff; struct tcphdr *t1; struct flowi6 fl6; u32 mark = 0; if (tsecr) tot_len += TCPOLEN_TSTAMP_ALIGNED; if (tcp_key_is_md5(key)) tot_len += TCPOLEN_MD5SIG_ALIGNED; if (tcp_key_is_ao(key)) tot_len += tcp_ao_len_aligned(key->ao_key); #ifdef CONFIG_MPTCP if (rst && !tcp_key_is_md5(key)) { mrst = mptcp_reset_option(skb); if (mrst) tot_len += sizeof(__be32); } #endif buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (!buff) return; skb_reserve(buff, MAX_TCP_HEADER); t1 = skb_push(buff, tot_len); skb_reset_transport_header(buff); /* Swap the send and the receive. */ memset(t1, 0, sizeof(*t1)); t1->dest = th->source; t1->source = th->dest; t1->doff = tot_len / 4; t1->seq = htonl(seq); t1->ack_seq = htonl(ack); t1->ack = !rst || !th->ack; t1->rst = rst; t1->window = htons(win); topt = (__be32 *)(t1 + 1); if (tsecr) { *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); *topt++ = htonl(tsval); *topt++ = htonl(tsecr); } if (mrst) *topt++ = mrst; #ifdef CONFIG_TCP_MD5SIG if (tcp_key_is_md5(key)) { *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); tcp_v6_md5_hash_hdr((__u8 *)topt, key->md5_key, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, t1); } #endif #ifdef CONFIG_TCP_AO if (tcp_key_is_ao(key)) { *topt++ = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key->ao_key) << 16) | (key->ao_key->sndid << 8) | (key->rcv_next)); tcp_ao_hash_hdr(AF_INET6, (char *)topt, key->ao_key, key->traffic_key, (union tcp_ao_addr *)&ipv6_hdr(skb)->saddr, (union tcp_ao_addr *)&ipv6_hdr(skb)->daddr, t1, key->sne); } #endif memset(&fl6, 0, sizeof(fl6)); fl6.daddr = ipv6_hdr(skb)->saddr; fl6.saddr = ipv6_hdr(skb)->daddr; fl6.flowlabel = label; buff->ip_summed = CHECKSUM_PARTIAL; __tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr); fl6.flowi6_proto = IPPROTO_TCP; if (rt6_need_strict(&fl6.daddr) && !oif) fl6.flowi6_oif = tcp_v6_iif(skb); else { if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) oif = skb->skb_iif; fl6.flowi6_oif = oif; } if (sk) { /* unconstify the socket only to attach it to buff with care. */ skb_set_owner_edemux(buff, (struct sock *)sk); psp_reply_set_decrypted(sk, buff); if (sk->sk_state == TCP_TIME_WAIT) mark = inet_twsk(sk)->tw_mark; else mark = READ_ONCE(sk->sk_mark); skb_set_delivery_time(buff, tcp_transmit_time(sk), SKB_CLOCK_MONOTONIC); } if (txhash) { /* autoflowlabel/skb_get_hash_flowi6 rely on buff->hash */ skb_set_hash(buff, txhash, PKT_HASH_TYPE_L4); } fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark; fl6.fl6_dport = t1->dest; fl6.fl6_sport = t1->source; fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? 
sk : NULL); security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6)); /* Pass a socket to ip6_dst_lookup either it is for RST * Underlying function will use this to retrieve the network * namespace */ if (sk && sk->sk_state != TCP_TIME_WAIT) dst = ip6_dst_lookup_flow(net, sk, &fl6, NULL); /*sk's xfrm_policy can be referred*/ else dst = ip6_dst_lookup_flow(net, ctl_sk, &fl6, NULL); if (!IS_ERR(dst)) { skb_dst_set(buff, dst); ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass, priority); TCP_INC_STATS(net, TCP_MIB_OUTSEGS); if (rst) TCP_INC_STATS(net, TCP_MIB_OUTRSTS); return; } kfree_skb(buff); } static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, enum sk_rst_reason reason) { const struct tcphdr *th = tcp_hdr(skb); struct ipv6hdr *ipv6h = ipv6_hdr(skb); const __u8 *md5_hash_location = NULL; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) bool allocated_traffic_key = false; #endif const struct tcp_ao_hdr *aoh; struct tcp_key key = {}; u32 seq = 0, ack_seq = 0; __be32 label = 0; u32 priority = 0; struct net *net; u32 txhash = 0; int oif = 0; #ifdef CONFIG_TCP_MD5SIG unsigned char newhash[16]; struct sock *sk1 = NULL; #endif if (th->rst) return; /* If sk not NULL, it means we did a successful lookup and incoming * route had to be correct. prequeue might have dropped our dst. */ if (!sk && !ipv6_unicast_destination(skb)) return; net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb); /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(th, &md5_hash_location, &aoh)) return; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) rcu_read_lock(); #endif #ifdef CONFIG_TCP_MD5SIG if (sk && sk_fullsock(sk)) { int l3index; /* sdif set, means packet ingressed via a device * in an L3 domain and inet_iif is set to it. */ l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; key.md5_key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr, l3index); if (key.md5_key) key.type = TCP_KEY_MD5; } else if (md5_hash_location) { int dif = tcp_v6_iif_l3_slave(skb); int sdif = tcp_v6_sdif(skb); int l3index; /* * active side is lost. Try to find listening socket through * source port, and then find md5 key through listening socket. * we are not loose security here: * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. */ sk1 = inet6_lookup_listener(net, NULL, 0, &ipv6h->saddr, th->source, &ipv6h->daddr, ntohs(th->source), dif, sdif); if (!sk1) goto out; /* sdif set, means packet ingressed via a device * in an L3 domain and dif is set to it. */ l3index = tcp_v6_sdif(skb) ? dif : 0; key.md5_key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr, l3index); if (!key.md5_key) goto out; key.type = TCP_KEY_MD5; tcp_v6_md5_hash_skb(newhash, key.md5_key, NULL, skb); if (crypto_memneq(md5_hash_location, newhash, 16)) goto out; } #endif if (th->ack) seq = ntohl(th->ack_seq); else ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len - (th->doff << 2); #ifdef CONFIG_TCP_AO if (aoh) { int l3index; l3index = tcp_v6_sdif(skb) ? 
tcp_v6_iif_l3_slave(skb) : 0; if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, seq, &key.ao_key, &key.traffic_key, &allocated_traffic_key, &key.rcv_next, &key.sne)) goto out; key.type = TCP_KEY_AO; } #endif if (sk) { oif = sk->sk_bound_dev_if; if (sk_fullsock(sk)) { if (inet6_test_bit(REPFLOW, sk)) label = ip6_flowlabel(ipv6h); priority = READ_ONCE(sk->sk_priority); txhash = sk->sk_txhash; } if (sk->sk_state == TCP_TIME_WAIT) { label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel); priority = inet_twsk(sk)->tw_priority; txhash = inet_twsk(sk)->tw_txhash; } } else { if (READ_ONCE(net->ipv6.sysctl.flowlabel_reflect) & FLOWLABEL_REFLECT_TCP_RESET) label = ip6_flowlabel(ipv6h); } trace_tcp_send_reset(sk, skb, reason); tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1, ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK, label, priority, txhash, &key); #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) out: if (allocated_traffic_key) kfree(key.traffic_key); rcu_read_unlock(); #endif } static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_key *key, u8 tclass, __be32 label, u32 priority, u32 txhash) { tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, 0, tclass, label, priority, txhash, key); } static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb, enum tcp_tw_status tw_status) { struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); u8 tclass = tw->tw_tclass; struct tcp_key key = {}; if (tw_status == TCP_TW_ACK_OOW) tclass &= ~INET_ECN_MASK; #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao_info; if (static_branch_unlikely(&tcp_ao_needed.key)) { /* FIXME: the segment to-be-acked is not verified yet */ ao_info = rcu_dereference(tcptw->ao_info); if (ao_info) { const struct tcp_ao_hdr *aoh; /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) goto out; if (aoh) key.ao_key = tcp_ao_established_key(sk, ao_info, aoh->rnext_keyid, -1); } } if (key.ao_key) { struct tcp_ao_key *rnext_key; key.traffic_key = snd_other_key(key.ao_key); /* rcv_next switches to our rcv_next */ rnext_key = READ_ONCE(ao_info->rnext_key); key.rcv_next = rnext_key->rcvid; key.sne = READ_ONCE(ao_info->snd_sne); key.type = TCP_KEY_AO; #else if (0) { #endif #ifdef CONFIG_TCP_MD5SIG } else if (static_branch_unlikely(&tcp_md5_needed.key)) { key.md5_key = tcp_twsk_md5_key(tcptw); if (key.md5_key) key.type = TCP_KEY_MD5; #endif } tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt), tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_tw_tsval(tcptw), READ_ONCE(tcptw->tw_ts_recent), tw->tw_bound_dev_if, &key, tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority, tw->tw_txhash); #ifdef CONFIG_TCP_AO out: #endif inet_twsk_put(tw); } static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { struct tcp_key key = {}; #ifdef CONFIG_TCP_AO if (static_branch_unlikely(&tcp_ao_needed.key) && tcp_rsk_used_ao(req)) { const struct in6_addr *addr = &ipv6_hdr(skb)->saddr; const struct tcp_ao_hdr *aoh; int l3index; l3index = tcp_v6_sdif(skb) ? 
tcp_v6_iif_l3_slave(skb) : 0; /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) return; if (!aoh) return; key.ao_key = tcp_ao_do_lookup(sk, l3index, (union tcp_ao_addr *)addr, AF_INET6, aoh->rnext_keyid, -1); if (unlikely(!key.ao_key)) { /* Send ACK with any matching MKT for the peer */ key.ao_key = tcp_ao_do_lookup(sk, l3index, (union tcp_ao_addr *)addr, AF_INET6, -1, -1); /* Matching key disappeared (user removed the key?) * let the handshake timeout. */ if (!key.ao_key) { net_info_ratelimited("TCP-AO key for (%pI6, %d)->(%pI6, %d) suddenly disappeared, won't ACK new connection\n", addr, ntohs(tcp_hdr(skb)->source), &ipv6_hdr(skb)->daddr, ntohs(tcp_hdr(skb)->dest)); return; } } key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC); if (!key.traffic_key) return; key.type = TCP_KEY_AO; key.rcv_next = aoh->keyid; tcp_v6_ao_calc_key_rsk(key.ao_key, key.traffic_key, req); #else if (0) { #endif #ifdef CONFIG_TCP_MD5SIG } else if (static_branch_unlikely(&tcp_md5_needed.key)) { int l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; key.md5_key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr, l3index); if (key.md5_key) key.type = TCP_KEY_MD5; #endif } /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, tcp_rsk(req)->rcv_nxt, tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, tcp_rsk_tsval(tcp_rsk(req)), req->ts_recent, sk->sk_bound_dev_if, &key, ipv6_get_dsfield(ipv6_hdr(skb)) & ~INET_ECN_MASK, 0, READ_ONCE(sk->sk_priority), READ_ONCE(tcp_rsk(req)->txhash)); if (tcp_key_is_ao(&key)) kfree(key.traffic_key); } static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb) { #ifdef CONFIG_SYN_COOKIES const struct tcphdr *th = tcp_hdr(skb); if (!th->syn) sk = cookie_v6_check(sk, skb); #endif return sk; } u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph, struct tcphdr *th, u32 *cookie) { u16 mss = 0; #ifdef CONFIG_SYN_COOKIES mss = tcp_get_syncookie_mss(&tcp6_request_sock_ops, &tcp_request_sock_ipv6_ops, sk, th); if (mss) { *cookie = __cookie_v6_init_sequence(iph, th, &mss); tcp_synq_overflow(sk); } #endif return mss; } static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_conn_request(sk, skb); if (!ipv6_unicast_destination(skb)) goto drop; if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) { __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS); return 0; } return tcp_conn_request(&tcp6_request_sock_ops, &tcp_request_sock_ipv6_ops, sk, skb); drop: tcp_listendrop(sk); return 0; /* don't send reset */ } static void tcp_v6_restore_cb(struct sk_buff *skb) { /* We need to move header back to the beginning if xfrm6_policy_check() * and tcp_v6_fill_cb() are going to be called again. * ip6_datagram_recv_specific_ctl() also expects IP6CB to be there. */ memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6, sizeof(struct inet6_skb_parm)); } /* Called from tcp_v4_syn_recv_sock() for v6_mapped children. 
*/ static void tcp_v6_mapped_child_init(struct sock *newsk, const struct sock *sk) { struct inet_sock *newinet = inet_sk(newsk); struct ipv6_pinfo *newnp; newinet->pinet6 = newnp = tcp_inet6_sk(newsk); newinet->ipv6_fl_list = NULL; memcpy(newnp, tcp_inet6_sk(sk), sizeof(struct ipv6_pinfo)); newnp->saddr = newsk->sk_v6_rcv_saddr; inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; if (sk_is_mptcp(newsk)) mptcpv6_handle_mapped(newsk, true); newsk->sk_backlog_rcv = tcp_v4_do_rcv; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tcp_sk(newsk)->af_specific = &tcp_sock_ipv6_mapped_specific; #endif newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; newnp->pktoptions = NULL; newnp->opt = NULL; /* tcp_v4_syn_recv_sock() has initialized newinet->mc_{index,ttl} */ newnp->mcast_oif = newinet->mc_index; newnp->mcast_hops = newinet->mc_ttl; newnp->rcv_flowinfo = 0; if (inet6_test_bit(REPFLOW, sk)) newnp->flow_label = 0; } static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req, void (*opt_child_init)(struct sock *newsk, const struct sock *sk)) { const struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct inet_request_sock *ireq; struct ipv6_txoptions *opt; struct inet_sock *newinet; bool found_dup_sk = false; struct ipv6_pinfo *newnp; struct tcp_sock *newtp; struct sock *newsk; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *key; int l3index; #endif struct flowi6 fl6; if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_syn_recv_sock(sk, skb, req, dst, req_unhash, own_req, tcp_v6_mapped_child_init); ireq = inet_rsk(req); if (sk_acceptq_is_full(sk)) goto exit_overflow; dst = inet6_csk_route_req(sk, dst, &fl6, req, IPPROTO_TCP); if (!dst) goto exit; newsk = tcp_create_openreq_child(sk, req, skb); if (!newsk) goto exit_nonewsk; /* * No need to charge this sock to the relevant IPv6 refcnt debug socks * count here, tcp_create_openreq_child now does this for us, see the * comment in that function for the gory details. -acme */ newsk->sk_gso_type = SKB_GSO_TCPV6; inet6_sk_rx_dst_set(newsk, skb); newinet = inet_sk(newsk); newinet->cork.fl.u.ip6 = fl6; newinet->pinet6 = tcp_inet6_sk(newsk); newinet->ipv6_fl_list = NULL; newinet->inet_opt = NULL; newtp = tcp_sk(newsk); newnp = tcp_inet6_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); ip6_dst_store(newsk, dst, false, false); newnp->saddr = ireq->ir_v6_loc_addr; /* Now IPv6 options... First: no IPv4 options. */ newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; /* Clone RX bits */ newnp->rxopt.all = np->rxopt.all; newnp->pktoptions = NULL; newnp->opt = NULL; newnp->mcast_oif = tcp_v6_iif(skb); newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb)); if (inet6_test_bit(REPFLOW, sk)) newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb)); /* Set ToS of the new socket based upon the value of incoming SYN. * ECT bits are set later in tcp_init_transfer(). */ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; /* Clone native IPv6 options from listening socket (if any) Yes, keeping reference count would be much more clever, but we make one more one thing there: reattach optmem to newsk. 
*/ opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); if (opt) { opt = ipv6_dup_options(newsk, opt); RCU_INIT_POINTER(newnp->opt, opt); } inet_csk(newsk)->icsk_ext_hdr_len = 0; if (opt) inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen + opt->opt_flen; tcp_ca_openreq_child(newsk, dst); tcp_sync_mss(newsk, dst6_mtu(dst)); newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); tcp_initialize_rcv_mss(newsk); #ifdef CONFIG_TCP_MD5SIG l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); if (!tcp_rsk_used_ao(req)) { /* Copy over the MD5 key from the original socket */ key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr, l3index); if (key) { const union tcp_md5_addr *addr; addr = (union tcp_md5_addr *)&newsk->sk_v6_daddr; if (tcp_md5_key_copy(newsk, addr, AF_INET6, 128, l3index, key)) goto put_and_exit; } } #endif #ifdef CONFIG_TCP_AO /* Copy over tcp_ao_info if any */ if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET6)) goto put_and_exit; /* OOM */ #endif if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), &found_dup_sk); if (*own_req) { tcp_move_syn(newtp, req); /* Clone pktoptions received with SYN, if we own the req */ if (ireq->pktopts) { newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk); consume_skb(ireq->pktopts); ireq->pktopts = NULL; if (newnp->pktoptions) tcp_v6_restore_cb(newnp->pktoptions); } } else { if (!req_unhash && found_dup_sk) { /* This code path should only be executed in the * syncookie case only */ bh_unlock_sock(newsk); sock_put(newsk); newsk = NULL; } } return newsk; exit_overflow: __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); exit_nonewsk: dst_release(dst); exit: tcp_listendrop(sk); return NULL; put_and_exit: inet_csk_prepare_forced_close(newsk); tcp_done(newsk); goto exit; } INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, u32)); /* The socket must have it's spinlock held when we get * here, unless it is a TCP_LISTEN socket. * * We have a potential double-lock case here, so even when * doing backlog processing we use the BH locking scheme. * This is because we cannot sleep with the original spinlock * held. */ INDIRECT_CALLABLE_SCOPE int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) { struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct sk_buff *opt_skb = NULL; enum skb_drop_reason reason; struct tcp_sock *tp; /* Imagine: socket is IPv6. IPv4 packet arrives, goes to IPv4 receive handler and backlogged. From backlog it always goes here. Kerboom... Fortunately, tcp_rcv_established and rcv_established handle them correctly, but it is not case with tcp_v6_hnd_req and tcp_v6_send_reset(). --ANK */ if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_do_rcv(sk, skb); reason = psp_sk_rx_policy_check(sk, skb); if (reason) goto err_discard; /* * socket locking is here for SMP purposes as backlog rcv * is currently called with bh processing disabled. */ /* Do Stevens' IPV6_PKTOPTIONS. Yes, guys, it is the only place in our code, where we may make it not affecting IPv4. The rest of code is protocol independent, and I do not like idea to uglify IPv4. Actually, all the idea behind IPV6_PKTOPTIONS looks not very well thought. For now we latch options, received in the last packet, enqueued by tcp. Feel free to propose better solution. 
--ANK (980728) */ if (np->rxopt.all && sk->sk_state != TCP_LISTEN) opt_skb = skb_clone_and_charge_r(skb, sk); if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ struct dst_entry *dst; dst = rcu_dereference_protected(sk->sk_rx_dst, lockdep_sock_is_held(sk)); sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); if (dst && unlikely(dst != skb_dst(skb))) { if (sk->sk_rx_dst_ifindex != skb->skb_iif || INDIRECT_CALL_1(dst->ops->check, ip6_dst_check, dst, sk->sk_rx_dst_cookie) == NULL) { RCU_INIT_POINTER(sk->sk_rx_dst, NULL); dst_release(dst); } } tcp_rcv_established(sk, skb); if (opt_skb) goto ipv6_pktoptions; return 0; } if (tcp_checksum_complete(skb)) goto csum_err; if (sk->sk_state == TCP_LISTEN) { struct sock *nsk = tcp_v6_cookie_check(sk, skb); if (nsk != sk) { if (nsk) { reason = tcp_child_process(sk, nsk, skb); if (reason) goto reset; } return 0; } } else sock_rps_save_rxhash(sk, skb); reason = tcp_rcv_state_process(sk, skb); if (reason) goto reset; if (opt_skb) goto ipv6_pktoptions; return 0; reset: tcp_v6_send_reset(sk, skb, sk_rst_convert_drop_reason(reason)); discard: if (opt_skb) __kfree_skb(opt_skb); sk_skb_reason_drop(sk, skb, reason); return 0; csum_err: reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); err_discard: TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; ipv6_pktoptions: /* Do you ask, what is it? 1. skb was enqueued by tcp. 2. skb is added to tail of read queue, rather than out of order. 3. socket is not in passive state. 4. Finally, it really contains options, which user wants to receive. */ tp = tcp_sk(sk); if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt && !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo) WRITE_ONCE(np->mcast_oif, tcp_v6_iif(opt_skb)); if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) WRITE_ONCE(np->mcast_hops, ipv6_hdr(opt_skb)->hop_limit); if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass) np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb)); if (inet6_test_bit(REPFLOW, sk)) np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) { tcp_v6_restore_cb(opt_skb); opt_skb = xchg(&np->pktoptions, opt_skb); } else { __kfree_skb(opt_skb); opt_skb = xchg(&np->pktoptions, NULL); } } consume_skb(opt_skb); return 0; } static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, const struct tcphdr *th) { /* This is tricky: we move IP6CB at its correct location into * TCP_SKB_CB(). It must be done after xfrm6_policy_check(), because * _decode_session6() uses IP6CB(). * barrier() makes sure compiler won't play aliasing games. 
*/ memmove(&TCP_SKB_CB(skb)->header.h6, IP6CB(skb), sizeof(struct inet6_skb_parm)); barrier(); TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th); TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->has_rxtstamp = skb->tstamp || skb_hwtstamps(skb)->hwtstamp; } INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) { struct net *net = dev_net_rcu(skb->dev); enum skb_drop_reason drop_reason; enum tcp_tw_status tw_status; int sdif = inet6_sdif(skb); int dif = inet6_iif(skb); const struct tcphdr *th; const struct ipv6hdr *hdr; struct sock *sk = NULL; bool refcounted; int ret; u32 isn; drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (skb->pkt_type != PACKET_HOST) goto discard_it; /* * Count it even if it's bad. */ __TCP_INC_STATS(net, TCP_MIB_INSEGS); if (!pskb_may_pull(skb, sizeof(struct tcphdr))) goto discard_it; th = (const struct tcphdr *)skb->data; if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; goto bad_packet; } if (!pskb_may_pull(skb, th->doff*4)) goto discard_it; if (skb_checksum_init(skb, IPPROTO_TCP, ip6_compute_pseudo)) goto csum_error; th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); lookup: sk = __inet6_lookup_skb(skb, __tcp_hdrlen(th), th->source, th->dest, inet6_iif(skb), sdif, &refcounted); if (!sk) goto no_tcp_socket; if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; if (sk->sk_state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); bool req_stolen = false; struct sock *nsk; sk = req->rsk_listener; if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) drop_reason = SKB_DROP_REASON_XFRM_POLICY; else drop_reason = tcp_inbound_hash(sk, req, skb, &hdr->saddr, &hdr->daddr, AF_INET6, dif, sdif); if (drop_reason) { sk_drops_skbadd(sk, skb); reqsk_put(req); goto discard_it; } if (tcp_checksum_complete(skb)) { reqsk_put(req); goto csum_error; } if (unlikely(sk->sk_state != TCP_LISTEN)) { nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); if (!nsk) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } sk = nsk; /* reuseport_migrate_sock() has already held one sk_refcnt * before returning. */ } else { sock_hold(sk); } refcounted = true; nsk = NULL; drop_reason = tcp_filter(sk, skb); if (!drop_reason) { th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); nsk = tcp_check_req(sk, skb, req, false, &req_stolen, &drop_reason); } if (!nsk) { reqsk_put(req); if (req_stolen) { /* Another cpu got exclusive access to req * and created a full blown socket. * Try to feed this packet to this socket * instead of discarding it. 
*/ tcp_v6_restore_cb(skb); sock_put(sk); goto lookup; } goto discard_and_relse; } nf_reset_ct(skb); if (nsk == sk) { reqsk_put(req); tcp_v6_restore_cb(skb); } else { drop_reason = tcp_child_process(sk, nsk, skb); if (drop_reason) { enum sk_rst_reason rst_reason; rst_reason = sk_rst_convert_drop_reason(drop_reason); tcp_v6_send_reset(nsk, skb, rst_reason); goto discard_and_relse; } sock_put(sk); return 0; } } process: if (static_branch_unlikely(&ip6_min_hopcount)) { /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */ if (unlikely(hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount))) { __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); drop_reason = SKB_DROP_REASON_TCP_MINTTL; goto discard_and_relse; } } if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { drop_reason = SKB_DROP_REASON_XFRM_POLICY; goto discard_and_relse; } drop_reason = tcp_inbound_hash(sk, NULL, skb, &hdr->saddr, &hdr->daddr, AF_INET6, dif, sdif); if (drop_reason) goto discard_and_relse; nf_reset_ct(skb); drop_reason = tcp_filter(sk, skb); if (drop_reason) goto discard_and_relse; th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); skb->dev = NULL; if (sk->sk_state == TCP_LISTEN) { ret = tcp_v6_do_rcv(sk, skb); goto put_and_return; } sk_incoming_cpu_update(sk); bh_lock_sock_nested(sk); tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { ret = tcp_v6_do_rcv(sk, skb); } else { drop_reason = tcp_add_backlog(sk, skb); if (drop_reason) goto discard_and_relse; } bh_unlock_sock(sk); put_and_return: if (refcounted) sock_put(sk); return ret ? -1 : 0; no_tcp_socket: drop_reason = SKB_DROP_REASON_NO_SOCKET; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard_it; tcp_v6_fill_cb(skb, hdr, th); if (tcp_checksum_complete(skb)) { csum_error: drop_reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); bad_packet: __TCP_INC_STATS(net, TCP_MIB_INERRS); } else { tcp_v6_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); } discard_it: SKB_DR_OR(drop_reason, NOT_SPECIFIED); sk_skb_reason_drop(sk, skb, drop_reason); return 0; discard_and_relse: sk_drops_skbadd(sk, skb); if (refcounted) sock_put(sk); goto discard_it; do_time_wait: if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { drop_reason = SKB_DROP_REASON_XFRM_POLICY; inet_twsk_put(inet_twsk(sk)); goto discard_it; } tcp_v6_fill_cb(skb, hdr, th); if (tcp_checksum_complete(skb)) { inet_twsk_put(inet_twsk(sk)); goto csum_error; } tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, &drop_reason); switch (tw_status) { case TCP_TW_SYN: { struct sock *sk2; sk2 = inet6_lookup_listener(net, skb, __tcp_hdrlen(th), &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, ntohs(th->dest), tcp_v6_iif_l3_slave(skb), sdif); if (sk2) { struct inet_timewait_sock *tw = inet_twsk(sk); inet_twsk_deschedule_put(tw); sk = sk2; tcp_v6_restore_cb(skb); refcounted = false; __this_cpu_write(tcp_tw_isn, isn); goto process; } drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb); if (drop_reason) break; } /* to ACK */ fallthrough; case TCP_TW_ACK: case TCP_TW_ACK_OOW: tcp_v6_timewait_ack(sk, skb, tw_status); break; case TCP_TW_RST: tcp_v6_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); inet_twsk_deschedule_put(inet_twsk(sk)); goto discard_it; case TCP_TW_SUCCESS: ; } goto discard_it; } static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp6_timewait_sock), }; const struct inet_connection_sock_af_ops 
ipv6_specific = { .queue_xmit = inet6_csk_xmit, .rebuild_header = inet6_sk_rebuild_header, .sk_rx_dst_set = inet6_sk_rx_dst_set, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .net_header_len = sizeof(struct ipv6hdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .mtu_reduced = tcp_v6_mtu_reduced, }; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = { #ifdef CONFIG_TCP_MD5SIG .md5_lookup = tcp_v6_md5_lookup, .calc_md5_hash = tcp_v6_md5_hash_skb, .md5_parse = tcp_v6_parse_md5_keys, #endif #ifdef CONFIG_TCP_AO .ao_lookup = tcp_v6_ao_lookup, .calc_ao_hash = tcp_v6_ao_hash_skb, .ao_parse = tcp_v6_parse_ao, .ao_calc_key_sk = tcp_v6_ao_calc_key_sk, #endif }; #endif /* * TCP over IPv4 via INET6 API */ static const struct inet_connection_sock_af_ops ipv6_mapped = { .queue_xmit = ip_queue_xmit, .rebuild_header = inet_sk_rebuild_header, .sk_rx_dst_set = inet_sk_rx_dst_set, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .net_header_len = sizeof(struct iphdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .mtu_reduced = tcp_v4_mtu_reduced, }; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = { #ifdef CONFIG_TCP_MD5SIG .md5_lookup = tcp_v4_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, .md5_parse = tcp_v6_parse_md5_keys, #endif #ifdef CONFIG_TCP_AO .ao_lookup = tcp_v6_ao_lookup, .calc_ao_hash = tcp_v4_ao_hash_skb, .ao_parse = tcp_v6_parse_ao, .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, #endif }; static void tcp6_destruct_sock(struct sock *sk) { tcp_md5_destruct_sock(sk); tcp_ao_destroy_sock(sk, false); inet6_sock_destruct(sk); } #endif /* NOTE: A lot of things set to zero explicitly by call to * sk_alloc() so need not be done here. */ static int tcp_v6_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_init_sock(sk); icsk->icsk_af_ops = &ipv6_specific; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific; sk->sk_destruct = tcp6_destruct_sock; #endif return 0; } #ifdef CONFIG_PROC_FS /* Proc filesystem TCPv6 sock list dumping. */ static void get_openreq6(struct seq_file *seq, const struct request_sock *req, int i) { long ttd = req->rsk_timer.expires - jiffies; const struct in6_addr *src = &inet_rsk(req)->ir_v6_loc_addr; const struct in6_addr *dest = &inet_rsk(req)->ir_v6_rmt_addr; if (ttd < 0) ttd = 0; seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5u %8d %d %d %pK\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], inet_rsk(req)->ir_num, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], ntohs(inet_rsk(req)->ir_rmt_port), TCP_SYN_RECV, 0, 0, /* could print option size, but that is af dependent. 
*/ 1, /* timers active (only the expire timer) */ jiffies_to_clock_t(ttd), req->num_timeout, from_kuid_munged(seq_user_ns(seq), sk_uid(req->rsk_listener)), 0, /* non standard timer */ 0, /* open_requests have no inode */ 0, req); } static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) { const struct in6_addr *dest, *src; __u16 destp, srcp; int timer_active; unsigned long timer_expires; const struct inet_sock *inet = inet_sk(sp); const struct tcp_sock *tp = tcp_sk(sp); const struct inet_connection_sock *icsk = inet_csk(sp); const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; u8 icsk_pending; int rx_queue; int state; dest = &sp->sk_v6_daddr; src = &sp->sk_v6_rcv_saddr; destp = ntohs(inet->inet_dport); srcp = ntohs(inet->inet_sport); icsk_pending = smp_load_acquire(&icsk->icsk_pending); if (icsk_pending == ICSK_TIME_RETRANS || icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; timer_expires = tcp_timeout_expires(sp); } else if (icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; timer_expires = tcp_timeout_expires(sp); } else if (timer_pending(&icsk->icsk_keepalive_timer)) { timer_active = 2; timer_expires = icsk->icsk_keepalive_timer.expires; } else { timer_active = 0; timer_expires = jiffies; } state = inet_sk_state_load(sp); if (state == TCP_LISTEN) rx_queue = READ_ONCE(sp->sk_ack_backlog); else /* Because we don't lock the socket, * we might find a transient negative value. */ rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq), 0); seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5u %8d %llu %d %pK %lu %lu %u %u %d\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, state, READ_ONCE(tp->write_seq) - tp->snd_una, rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), READ_ONCE(icsk->icsk_retransmits), from_kuid_munged(seq_user_ns(seq), sk_uid(sp)), READ_ONCE(icsk->icsk_probes_out), sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, jiffies_to_clock_t(icsk->icsk_rto), jiffies_to_clock_t(icsk->icsk_ack.ato), (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sp), tcp_snd_cwnd(tp), state == TCP_LISTEN ? fastopenq->max_qlen : (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh) ); } static void get_timewait6_sock(struct seq_file *seq, struct inet_timewait_sock *tw, int i) { long delta = tw->tw_timer.expires - jiffies; const struct in6_addr *dest, *src; __u16 destp, srcp; dest = &tw->tw_v6_daddr; src = &tw->tw_v6_rcv_saddr; destp = ntohs(tw->tw_dport); srcp = ntohs(tw->tw_sport); seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, READ_ONCE(tw->tw_substate), 0, 0, 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, refcount_read(&tw->tw_refcnt), tw); } static int tcp6_seq_show(struct seq_file *seq, void *v) { struct tcp_iter_state *st; struct sock *sk = v; if (v == SEQ_START_TOKEN) { seq_puts(seq, " sl " "local_address " "remote_address " "st tx_queue rx_queue tr tm->when retrnsmt" " uid timeout inode\n"); goto out; } st = seq->private; if (sk->sk_state == TCP_TIME_WAIT) get_timewait6_sock(seq, v, st->num); else if (sk->sk_state == TCP_NEW_SYN_RECV) get_openreq6(seq, v, st->num); else get_tcp6_sock(seq, v, st->num); out: return 0; } static const struct seq_operations tcp6_seq_ops = { .show = tcp6_seq_show, .start = tcp_seq_start, .next = tcp_seq_next, .stop = tcp_seq_stop, }; static struct tcp_seq_afinfo tcp6_seq_afinfo = { .family = AF_INET6, }; int __net_init tcp6_proc_init(struct net *net) { if (!proc_create_net_data("tcp6", 0444, net->proc_net, &tcp6_seq_ops, sizeof(struct tcp_iter_state), &tcp6_seq_afinfo)) return -ENOMEM; return 0; } void tcp6_proc_exit(struct net *net) { remove_proc_entry("tcp6", net->proc_net); } #endif struct proto tcpv6_prot = { .name = "TCPv6", .owner = THIS_MODULE, .close = tcp_close, .pre_connect = tcp_v6_pre_connect, .connect = tcp_v6_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v6_init_sock, .destroy = tcp_v4_destroy_sock, .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, .splice_eof = tcp_splice_eof, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .put_port = inet_put_port, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = tcp_bpf_update_proto, #endif .enter_memory_pressure = tcp_enter_memory_pressure, .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, .memory_allocated = &net_aligned_data.tcp_memory_allocated, .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, .memory_pressure = &tcp_memory_pressure, .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), .freeptr_offset = offsetof(struct tcp6_sock, tcp.inet_conn.icsk_inet.sk.sk_freeptr), .ipv6_pinfo_offset = offsetof(struct tcp6_sock, inet6), .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp6_timewait_sock_ops, .rsk_prot = &tcp6_request_sock_ops, .h.hashinfo = NULL, .no_autobind = true, .diag_destroy = tcp_abort, }; EXPORT_SYMBOL_GPL(tcpv6_prot); static struct inet_protosw tcpv6_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_TCP, .prot = &tcpv6_prot, 
.ops = &inet6_stream_ops, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, }; static int __net_init tcpv6_net_init(struct net *net) { int res; res = inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6, SOCK_RAW, IPPROTO_TCP, net); if (!res) net->ipv6.tcp_sk->sk_clockid = CLOCK_MONOTONIC; return res; } static void __net_exit tcpv6_net_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv6.tcp_sk); } static struct pernet_operations tcpv6_net_ops = { .init = tcpv6_net_init, .exit = tcpv6_net_exit, }; int __init tcpv6_init(void) { int ret; net_hotdata.tcpv6_protocol = (struct inet6_protocol) { .handler = tcp_v6_rcv, .err_handler = tcp_v6_err, .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, }; ret = inet6_add_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP); if (ret) goto out; /* register inet6 protocol */ ret = inet6_register_protosw(&tcpv6_protosw); if (ret) goto out_tcpv6_protocol; ret = register_pernet_subsys(&tcpv6_net_ops); if (ret) goto out_tcpv6_protosw; ret = mptcpv6_init(); if (ret) goto out_tcpv6_pernet_subsys; out: return ret; out_tcpv6_pernet_subsys: unregister_pernet_subsys(&tcpv6_net_ops); out_tcpv6_protosw: inet6_unregister_protosw(&tcpv6_protosw); out_tcpv6_protocol: inet6_del_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP); goto out; } void tcpv6_exit(void) { unregister_pernet_subsys(&tcpv6_net_ops); inet6_unregister_protosw(&tcpv6_protosw); inet6_del_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP); } |
// SPDX-License-Identifier: GPL-2.0+ /* * NILFS block mapping. * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * * Written by Koji Sato. */ #include <linux/fs.h> #include <linux/string.h> #include <linux/errno.h> #include "nilfs.h" #include "bmap.h" #include "btree.h" #include "direct.h" #include "btnode.h" #include "mdt.h" #include "dat.h" #include "alloc.h" struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap) { struct the_nilfs *nilfs = bmap->b_inode->i_sb->s_fs_info; return nilfs->ns_dat; } static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap, const char *fname, int err) { struct inode *inode = bmap->b_inode; if (err == -EINVAL) { __nilfs_error(inode->i_sb, fname, "broken bmap (inode number=%llu)", inode->i_ino); err = -EIO; } return err; } /** * nilfs_bmap_lookup_at_level - find a data block or node block * @bmap: bmap * @key: key * @level: level * @ptrp: place to store the value associated to @key * * Description: nilfs_bmap_lookup_at_level() finds a record whose key * matches @key in the block at @level of the bmap. The record associated * with @key is stored in the place pointed to by @ptrp. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption).
* * %-ENOENT - A record associated with @key does not exist. * * %-ENOMEM - Insufficient memory available. */ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, __u64 *ptrp) { sector_t blocknr; int ret; down_read(&bmap->b_sem); ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp); if (ret < 0) goto out; if (NILFS_BMAP_USE_VBN(bmap)) { ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp, &blocknr); if (!ret) *ptrp = blocknr; else if (ret == -ENOENT) { /* * If there was no valid entry in DAT for the block * address obtained by b_ops->bop_lookup, then pass * internal code -EINVAL to nilfs_bmap_convert_error * to treat it as metadata corruption. */ ret = -EINVAL; } } out: up_read(&bmap->b_sem); return nilfs_bmap_convert_error(bmap, __func__, ret); } int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp, unsigned int maxblocks) { int ret; down_read(&bmap->b_sem); ret = bmap->b_ops->bop_lookup_contig(bmap, key, ptrp, maxblocks); up_read(&bmap->b_sem); return nilfs_bmap_convert_error(bmap, __func__, ret); } static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) { __u64 keys[NILFS_BMAP_SMALL_HIGH + 1]; __u64 ptrs[NILFS_BMAP_SMALL_HIGH + 1]; int ret, n; if (bmap->b_ops->bop_check_insert != NULL) { ret = bmap->b_ops->bop_check_insert(bmap, key); if (ret > 0) { n = bmap->b_ops->bop_gather_data( bmap, keys, ptrs, NILFS_BMAP_SMALL_HIGH + 1); if (n < 0) return n; ret = nilfs_btree_convert_and_insert( bmap, key, ptr, keys, ptrs, n); if (ret == 0) bmap->b_u.u_flags |= NILFS_BMAP_LARGE; return ret; } else if (ret < 0) return ret; } return bmap->b_ops->bop_insert(bmap, key, ptr); } /** * nilfs_bmap_insert - insert a new key-record pair into a bmap * @bmap: bmap * @key: key * @rec: record * * Description: nilfs_bmap_insert() inserts the new key-record pair specified * by @key and @rec into @bmap. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EEXIST - A record associated with @key already exists. * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. */ int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec) { int ret; down_write(&bmap->b_sem); ret = nilfs_bmap_do_insert(bmap, key, rec); up_write(&bmap->b_sem); return nilfs_bmap_convert_error(bmap, __func__, ret); } static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key) { __u64 keys[NILFS_BMAP_LARGE_LOW + 1]; __u64 ptrs[NILFS_BMAP_LARGE_LOW + 1]; int ret, n; if (bmap->b_ops->bop_check_delete != NULL) { ret = bmap->b_ops->bop_check_delete(bmap, key); if (ret > 0) { n = bmap->b_ops->bop_gather_data( bmap, keys, ptrs, NILFS_BMAP_LARGE_LOW + 1); if (n < 0) return n; ret = nilfs_direct_delete_and_convert( bmap, key, keys, ptrs, n); if (ret == 0) bmap->b_u.u_flags &= ~NILFS_BMAP_LARGE; return ret; } else if (ret < 0) return ret; } return bmap->b_ops->bop_delete(bmap, key); } /** * nilfs_bmap_seek_key - seek a valid entry and return its key * @bmap: bmap struct * @start: start key number * @keyp: place to store valid key * * Description: nilfs_bmap_seek_key() seeks a valid key on @bmap * starting from @start, and stores it to @keyp if found. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOENT - No valid entry was found. * * %-ENOMEM - Insufficient memory available. 
*/ int nilfs_bmap_seek_key(struct nilfs_bmap *bmap, __u64 start, __u64 *keyp) { int ret; down_read(&bmap->b_sem); ret = bmap->b_ops->bop_seek_key(bmap, start, keyp); up_read(&bmap->b_sem); if (ret < 0) ret = nilfs_bmap_convert_error(bmap, __func__, ret); return ret; } int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp) { int ret; down_read(&bmap->b_sem); ret = bmap->b_ops->bop_last_key(bmap, keyp); up_read(&bmap->b_sem); if (ret < 0) ret = nilfs_bmap_convert_error(bmap, __func__, ret); return ret; } /** * nilfs_bmap_delete - delete a key-record pair from a bmap * @bmap: bmap * @key: key * * Description: nilfs_bmap_delete() deletes the key-record pair specified by * @key from @bmap. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOENT - A record associated with @key does not exist. * * %-ENOMEM - Insufficient memory available. */ int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key) { int ret; down_write(&bmap->b_sem); ret = nilfs_bmap_do_delete(bmap, key); up_write(&bmap->b_sem); return nilfs_bmap_convert_error(bmap, __func__, ret); } static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, __u64 key) { __u64 lastkey; int ret; ret = bmap->b_ops->bop_last_key(bmap, &lastkey); if (ret < 0) { if (ret == -ENOENT) ret = 0; return ret; } while (key <= lastkey) { ret = nilfs_bmap_do_delete(bmap, lastkey); if (ret < 0) return ret; ret = bmap->b_ops->bop_last_key(bmap, &lastkey); if (ret < 0) { if (ret == -ENOENT) ret = 0; return ret; } } return 0; } /** * nilfs_bmap_truncate - truncate a bmap to a specified key * @bmap: bmap * @key: key * * Description: nilfs_bmap_truncate() removes key-record pairs whose keys are * greater than or equal to @key from @bmap. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. */ int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key) { int ret; down_write(&bmap->b_sem); ret = nilfs_bmap_do_truncate(bmap, key); up_write(&bmap->b_sem); return nilfs_bmap_convert_error(bmap, __func__, ret); } /** * nilfs_bmap_clear - free resources a bmap holds * @bmap: bmap * * Description: nilfs_bmap_clear() frees resources associated with @bmap. */ void nilfs_bmap_clear(struct nilfs_bmap *bmap) { down_write(&bmap->b_sem); if (bmap->b_ops->bop_clear != NULL) bmap->b_ops->bop_clear(bmap); up_write(&bmap->b_sem); } /** * nilfs_bmap_propagate - propagate dirty state * @bmap: bmap * @bh: buffer head * * Description: nilfs_bmap_propagate() marks the buffers that directly or * indirectly refer to the block specified by @bh dirty. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. 
*/ int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh) { int ret; down_write(&bmap->b_sem); ret = bmap->b_ops->bop_propagate(bmap, bh); up_write(&bmap->b_sem); return nilfs_bmap_convert_error(bmap, __func__, ret); } /** * nilfs_bmap_lookup_dirty_buffers - collect dirty block buffers * @bmap: bmap * @listp: pointer to buffer head list */ void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *bmap, struct list_head *listp) { if (bmap->b_ops->bop_lookup_dirty_buffers != NULL) bmap->b_ops->bop_lookup_dirty_buffers(bmap, listp); } /** * nilfs_bmap_assign - assign a new block number to a block * @bmap: bmap * @bh: place to store a pointer to the buffer head to which a block * address is assigned (in/out) * @blocknr: block number * @binfo: block information * * Description: nilfs_bmap_assign() assigns the block number @blocknr to the * buffer specified by @bh. The block information is stored in the memory * pointed to by @binfo, and the buffer head may be replaced as a block * address is assigned, in which case a pointer to the new buffer head is * stored in the memory pointed to by @bh. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. */ int nilfs_bmap_assign(struct nilfs_bmap *bmap, struct buffer_head **bh, unsigned long blocknr, union nilfs_binfo *binfo) { int ret; down_write(&bmap->b_sem); ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo); up_write(&bmap->b_sem); return nilfs_bmap_convert_error(bmap, __func__, ret); } /** * nilfs_bmap_mark - mark block dirty * @bmap: bmap * @key: key * @level: level * * Description: nilfs_bmap_mark() marks the block specified by @key and @level * as dirty. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. */ int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level) { int ret; if (bmap->b_ops->bop_mark == NULL) return 0; down_write(&bmap->b_sem); ret = bmap->b_ops->bop_mark(bmap, key, level); up_write(&bmap->b_sem); return nilfs_bmap_convert_error(bmap, __func__, ret); } /** * nilfs_bmap_test_and_clear_dirty - test and clear a bmap dirty state * @bmap: bmap * * Description: nilfs_test_and_clear() is the atomic operation to test and * clear the dirty state of @bmap. * * Return: 1 if @bmap is dirty, or 0 if clear. 
*/ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap) { int ret; down_write(&bmap->b_sem); ret = nilfs_bmap_dirty(bmap); nilfs_bmap_clear_dirty(bmap); up_write(&bmap->b_sem); return ret; } /* * Internal use only */ __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap, const struct buffer_head *bh) { loff_t pos = folio_pos(bh->b_folio) + bh_offset(bh); return pos >> bmap->b_inode->i_blkbits; } __u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *bmap, __u64 key) { __s64 diff; diff = key - bmap->b_last_allocated_key; if ((nilfs_bmap_keydiff_abs(diff) < NILFS_INODE_BMAP_SIZE) && (bmap->b_last_allocated_ptr != NILFS_BMAP_INVALID_PTR) && (bmap->b_last_allocated_ptr + diff > 0)) return bmap->b_last_allocated_ptr + diff; else return NILFS_BMAP_INVALID_PTR; } #define NILFS_BMAP_GROUP_DIV 8 /* must be power of 2 */ __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap) { struct inode *dat = nilfs_bmap_get_dat(bmap); unsigned long entries_per_group = nilfs_palloc_entries_per_group(dat); unsigned long group; u32 index; BUILD_BUG_ON_NOT_POWER_OF_2(NILFS_BMAP_GROUP_DIV); group = div_u64(bmap->b_inode->i_ino, entries_per_group); index = bmap->b_inode->i_ino & (NILFS_BMAP_GROUP_DIV - 1); return group * entries_per_group + index * (entries_per_group / NILFS_BMAP_GROUP_DIV); } static struct lock_class_key nilfs_bmap_dat_lock_key; static struct lock_class_key nilfs_bmap_mdt_lock_key; /** * nilfs_bmap_read - read a bmap from an inode * @bmap: bmap * @raw_inode: on-disk inode * * Description: nilfs_bmap_read() initializes the bmap @bmap. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (corrupted bmap). * * %-ENOMEM - Insufficient memory available. */ int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) { if (raw_inode == NULL) memset(bmap->b_u.u_data, 0, NILFS_BMAP_SIZE); else memcpy(bmap->b_u.u_data, raw_inode->i_bmap, NILFS_BMAP_SIZE); init_rwsem(&bmap->b_sem); bmap->b_state = 0; bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; switch (bmap->b_inode->i_ino) { case NILFS_DAT_INO: bmap->b_ptr_type = NILFS_BMAP_PTR_P; bmap->b_last_allocated_key = 0; bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT; lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); break; case NILFS_CPFILE_INO: case NILFS_SUFILE_INO: bmap->b_ptr_type = NILFS_BMAP_PTR_VS; bmap->b_last_allocated_key = 0; bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; lockdep_set_class(&bmap->b_sem, &nilfs_bmap_mdt_lock_key); break; case NILFS_IFILE_INO: lockdep_set_class(&bmap->b_sem, &nilfs_bmap_mdt_lock_key); fallthrough; default: bmap->b_ptr_type = NILFS_BMAP_PTR_VM; bmap->b_last_allocated_key = 0; bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; break; } return (bmap->b_u.u_flags & NILFS_BMAP_LARGE) ? nilfs_btree_init(bmap) : nilfs_direct_init(bmap); } /** * nilfs_bmap_write - write back a bmap to an inode * @bmap: bmap * @raw_inode: on-disk inode * * Description: nilfs_bmap_write() stores @bmap in @raw_inode. 
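 *
 * This is the inverse of nilfs_bmap_read(): the in-memory mapping data held
 * in @bmap->b_u is copied back into the i_bmap area of the on-disk inode.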
*/ void nilfs_bmap_write(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) { memcpy(raw_inode->i_bmap, bmap->b_u.u_data, NILFS_INODE_BMAP_SIZE * sizeof(__le64)); if (bmap->b_inode->i_ino == NILFS_DAT_INO) bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT; } void nilfs_bmap_init_gc(struct nilfs_bmap *bmap) { memset(&bmap->b_u, 0, NILFS_BMAP_SIZE); init_rwsem(&bmap->b_sem); bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; bmap->b_ptr_type = NILFS_BMAP_PTR_U; bmap->b_last_allocated_key = 0; bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; bmap->b_state = 0; nilfs_btree_init_gc(bmap); } void nilfs_bmap_save(const struct nilfs_bmap *bmap, struct nilfs_bmap_store *store) { memcpy(store->data, bmap->b_u.u_data, sizeof(store->data)); store->last_allocated_key = bmap->b_last_allocated_key; store->last_allocated_ptr = bmap->b_last_allocated_ptr; store->state = bmap->b_state; } void nilfs_bmap_restore(struct nilfs_bmap *bmap, const struct nilfs_bmap_store *store) { memcpy(bmap->b_u.u_data, store->data, sizeof(store->data)); bmap->b_last_allocated_key = store->last_allocated_key; bmap->b_last_allocated_ptr = store->last_allocated_ptr; bmap->b_state = store->state; } |
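The exported entry points above share one pattern: take b_sem for read or write, dispatch through bmap->b_ops, and normalize implementation-specific error codes with nilfs_bmap_convert_error(). The following is a hedged illustration only (the helper names and the caller-chosen level/truncation policy are assumptions, not nilfs2 code); it shows how a caller might drive nilfs_bmap_lookup_at_level() and nilfs_bmap_truncate() using the signatures and error conventions defined above.

/* Illustrative sketch only -- not part of fs/nilfs2/bmap.c. */
static int example_resolve_block(struct nilfs_bmap *bmap, __u64 blkoff,
                                 int level, sector_t *blocknr)
{
        __u64 ptr;
        int ret;

        ret = nilfs_bmap_lookup_at_level(bmap, blkoff, level, &ptr);
        if (ret < 0)
                return ret;     /* -ENOENT (hole), -EIO or -ENOMEM */

        /*
         * For bmaps using virtual block numbers, the pointer has already
         * been translated to an on-disk block number via the DAT above.
         */
        *blocknr = (sector_t)ptr;
        return 0;
}

static int example_shrink_mapping(struct nilfs_bmap *bmap, __u64 first_unused)
{
        /* Removes every key-record pair whose key is >= first_unused. */
        return nilfs_bmap_truncate(bmap, first_unused);
}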
/*
 * linux/fs/nls/nls_base.c
 *
 * Native language support--charsets and unicode translations.
 * By Gordon Chaffee 1996, 1997
 *
 * Unicode based case conversion 1999 by Wolfram Pienkoss
 *
 */

#include <linux/module.h>
#include <linux/string.h>
#include <linux/nls.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/kmod.h>
#include <linux/spinlock.h>
#include <asm/byteorder.h>

static struct nls_table default_table;
static struct nls_table *tables = &default_table;
static DEFINE_SPINLOCK(nls_lock);

/*
 * Sample implementation from Unicode home page.
* http://www.stonehand.com/unicode/standard/fss-utf.html */ struct utf8_table { int cmask; int cval; int shift; long lmask; long lval; }; static const struct utf8_table utf8_table[] = { {0x80, 0x00, 0*6, 0x7F, 0, /* 1 byte sequence */}, {0xE0, 0xC0, 1*6, 0x7FF, 0x80, /* 2 byte sequence */}, {0xF0, 0xE0, 2*6, 0xFFFF, 0x800, /* 3 byte sequence */}, {0xF8, 0xF0, 3*6, 0x1FFFFF, 0x10000, /* 4 byte sequence */}, {0xFC, 0xF8, 4*6, 0x3FFFFFF, 0x200000, /* 5 byte sequence */}, {0xFE, 0xFC, 5*6, 0x7FFFFFFF, 0x4000000, /* 6 byte sequence */}, {0, /* end of table */} }; #define UNICODE_MAX 0x0010ffff #define PLANE_SIZE 0x00010000 #define SURROGATE_MASK 0xfffff800 #define SURROGATE_PAIR 0x0000d800 #define SURROGATE_LOW 0x00000400 #define SURROGATE_BITS 0x000003ff int utf8_to_utf32(const u8 *s, int inlen, unicode_t *pu) { unsigned long l; int c0, c, nc; const struct utf8_table *t; nc = 0; c0 = *s; l = c0; for (t = utf8_table; t->cmask; t++) { nc++; if ((c0 & t->cmask) == t->cval) { l &= t->lmask; if (l < t->lval || l > UNICODE_MAX || (l & SURROGATE_MASK) == SURROGATE_PAIR) return -EILSEQ; *pu = (unicode_t) l; return nc; } if (inlen <= nc) return -EOVERFLOW; s++; c = (*s ^ 0x80) & 0xFF; if (c & 0xC0) return -EILSEQ; l = (l << 6) | c; } return -EILSEQ; } EXPORT_SYMBOL(utf8_to_utf32); int utf32_to_utf8(unicode_t u, u8 *s, int maxout) { unsigned long l; int c, nc; const struct utf8_table *t; if (!s) return 0; l = u; if (l > UNICODE_MAX || (l & SURROGATE_MASK) == SURROGATE_PAIR) return -EILSEQ; nc = 0; for (t = utf8_table; t->cmask && maxout; t++, maxout--) { nc++; if (l <= t->lmask) { c = t->shift; *s = (u8) (t->cval | (l >> c)); while (c > 0) { c -= 6; s++; *s = (u8) (0x80 | ((l >> c) & 0x3F)); } return nc; } } return -EOVERFLOW; } EXPORT_SYMBOL(utf32_to_utf8); static inline void put_utf16(wchar_t *s, unsigned c, enum utf16_endian endian) { switch (endian) { default: *s = (wchar_t) c; break; case UTF16_LITTLE_ENDIAN: *s = __cpu_to_le16(c); break; case UTF16_BIG_ENDIAN: *s = __cpu_to_be16(c); break; } } int utf8s_to_utf16s(const u8 *s, int inlen, enum utf16_endian endian, wchar_t *pwcs, int maxout) { u16 *op; int size; unicode_t u; op = pwcs; while (inlen > 0 && maxout > 0 && *s) { if (*s & 0x80) { size = utf8_to_utf32(s, inlen, &u); if (size < 0) return -EINVAL; s += size; inlen -= size; if (u >= PLANE_SIZE) { if (maxout < 2) break; u -= PLANE_SIZE; put_utf16(op++, SURROGATE_PAIR | ((u >> 10) & SURROGATE_BITS), endian); put_utf16(op++, SURROGATE_PAIR | SURROGATE_LOW | (u & SURROGATE_BITS), endian); maxout -= 2; } else { put_utf16(op++, u, endian); maxout--; } } else { put_utf16(op++, *s++, endian); inlen--; maxout--; } } return op - pwcs; } EXPORT_SYMBOL(utf8s_to_utf16s); static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian) { switch (endian) { default: return c; case UTF16_LITTLE_ENDIAN: return __le16_to_cpu(c); case UTF16_BIG_ENDIAN: return __be16_to_cpu(c); } } int utf16s_to_utf8s(const wchar_t *pwcs, int inlen, enum utf16_endian endian, u8 *s, int maxout) { u8 *op; int size; unsigned long u, v; op = s; while (inlen > 0 && maxout > 0) { u = get_utf16(*pwcs, endian); if (!u) break; pwcs++; inlen--; if (u > 0x7f) { if ((u & SURROGATE_MASK) == SURROGATE_PAIR) { if (u & SURROGATE_LOW) { /* Ignore character and move on */ continue; } if (inlen <= 0) break; v = get_utf16(*pwcs, endian); if ((v & SURROGATE_MASK) != SURROGATE_PAIR || !(v & SURROGATE_LOW)) { /* Ignore character and move on */ continue; } u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10) + (v & SURROGATE_BITS); pwcs++; inlen--; 
} size = utf32_to_utf8(u, op, maxout); if (size < 0) { if (size == -EILSEQ) { /* Ignore character and move on */ continue; } /* * Stop filling the buffer with data once a character * does not fit anymore. */ break; } else { op += size; maxout -= size; } } else { *op++ = (u8) u; maxout--; } } return op - s; } EXPORT_SYMBOL(utf16s_to_utf8s); int __register_nls(struct nls_table *nls, struct module *owner) { struct nls_table ** tmp = &tables; if (nls->next) return -EBUSY; nls->owner = owner; spin_lock(&nls_lock); while (*tmp) { if (nls == *tmp) { spin_unlock(&nls_lock); return -EBUSY; } tmp = &(*tmp)->next; } nls->next = tables; tables = nls; spin_unlock(&nls_lock); return 0; } EXPORT_SYMBOL(__register_nls); int unregister_nls(struct nls_table * nls) { struct nls_table ** tmp = &tables; spin_lock(&nls_lock); while (*tmp) { if (nls == *tmp) { *tmp = nls->next; spin_unlock(&nls_lock); return 0; } tmp = &(*tmp)->next; } spin_unlock(&nls_lock); return -EINVAL; } static struct nls_table *find_nls(const char *charset) { struct nls_table *nls; spin_lock(&nls_lock); for (nls = tables; nls; nls = nls->next) { if (!strcmp(nls->charset, charset)) break; if (nls->alias && !strcmp(nls->alias, charset)) break; } if (nls && !try_module_get(nls->owner)) nls = NULL; spin_unlock(&nls_lock); return nls; } struct nls_table *load_nls(const char *charset) { return try_then_request_module(find_nls(charset), "nls_%s", charset); } void unload_nls(struct nls_table *nls) { if (nls) module_put(nls->owner); } static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* 0x90*/ 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* 0xa0*/ 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, /* 0xb0*/ 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, /* 0xc0*/ 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* 0xd0*/ 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, /* 0xe0*/ 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, 
0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* 0xf0*/ 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static const unsigned char *const page_uni2charset[256] = { page00 }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 
/* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table 
default_table = {
        .charset        = "default",
        .uni2char       = uni2char,
        .char2uni       = char2uni,
        .charset2lower  = charset2lower,
        .charset2upper  = charset2upper,
};

/* Returns a simple default translation table */
struct nls_table *load_nls_default(void)
{
        struct nls_table *default_nls;

        default_nls = load_nls(CONFIG_NLS_DEFAULT);
        if (default_nls != NULL)
                return default_nls;
        else
                return &default_table;
}

EXPORT_SYMBOL(unregister_nls);
EXPORT_SYMBOL(unload_nls);
EXPORT_SYMBOL(load_nls);
EXPORT_SYMBOL(load_nls_default);

MODULE_DESCRIPTION("Base file system native language support");
MODULE_LICENSE("Dual BSD/GPL");
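Taken together, the table registry and the conversion helpers give filesystems a small, uniform API. The sketch below is illustrative only -- the helper names and the mount/umount framing are assumptions, not code from any in-tree filesystem -- but the NLS calls use exactly the signatures defined above: load_nls() (which may request the nls_<charset> module), load_nls_default() as the CONFIG_NLS_DEFAULT fallback, utf8s_to_utf16s() for name conversion, and unload_nls() to drop the module reference.

/* Illustrative sketch only -- not part of fs/nls/nls_base.c. */
static struct nls_table *example_mount_charset(const char *iocharset)
{
        /* No iocharset= option given: fall back to CONFIG_NLS_DEFAULT. */
        if (!iocharset)
                return load_nls_default();

        /* May load the nls_<iocharset> module on demand; NULL on failure. */
        return load_nls(iocharset);
}

static int example_name_to_utf16le(const u8 *name, int len,
                                   wchar_t *buf, int buflen)
{
        /* Characters outside the BMP expand to a surrogate pair (2 slots). */
        return utf8s_to_utf16s(name, len, UTF16_LITTLE_ENDIAN, buf, buflen);
}

static void example_umount_charset(struct nls_table *nls)
{
        /* Safe on NULL; drops the reference taken at mount time. */
        unload_nls(nls);
}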
// SPDX-License-Identifier: GPL-2.0-or-later

/*
 * VMA-specific functions.
 */

#include "vma_internal.h"
#include "vma.h"

struct mmap_state {
        struct mm_struct *mm;
        struct vma_iterator *vmi;

        unsigned long addr;
        unsigned long end;
        pgoff_t pgoff;
        unsigned long pglen;
        union {
                vm_flags_t vm_flags;
                vma_flags_t vma_flags;
        };
        struct file *file;
        pgprot_t page_prot;

        /* User-defined fields, perhaps updated by .mmap_prepare(). */
        const struct vm_operations_struct *vm_ops;
        void *vm_private_data;

        unsigned long charged;

        struct vm_area_struct *prev;
        struct vm_area_struct *next;

        /* Unmapping state. */
        struct vma_munmap_struct vms;
        struct ma_state mas_detach;
        struct maple_tree mt_detach;

        /* Determine if we can check KSM flags early in mmap() logic. */
        bool check_ksm_early :1;
        /* If .mmap_prepare changed the file, we don't need to pin. */
        bool file_doesnt_need_get :1;
};

#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vma_flags_, file_) \
        struct mmap_state name = { \
                .mm = mm_, \
                .vmi = vmi_, \
                .addr = addr_, \
                .end = (addr_) + (len_), \
                .pgoff = pgoff_, \
                .pglen = PHYS_PFN(len_), \
                .vma_flags = vma_flags_, \
                .file = file_, \
                .page_prot = vma_get_page_prot(vma_flags_), \
        }

#define VMG_MMAP_STATE(name, map_, vma_) \
        struct vma_merge_struct name = { \
                .mm = (map_)->mm, \
                .vmi = (map_)->vmi, \
                .start = (map_)->addr, \
                .end = (map_)->end, \
                .vma_flags = (map_)->vma_flags, \
                .pgoff = (map_)->pgoff, \
                .file = (map_)->file, \
                .prev = (map_)->prev, \
                .middle = vma_, \
                .next = (vma_) ? NULL : (map_)->next, \
                .state = VMA_MERGE_START, \
        }

/* Was this VMA ever forked from a parent, i.e. maybe contains CoW mappings? */
static bool vma_is_fork_child(struct vm_area_struct *vma)
{
        /*
         * The list_is_singular() test avoids merging VMAs cloned from a
         * parent, which can improve scalability by reducing contention on
         * the anon_vma root lock.
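         *
         * For example, a VMA duplicated by fork() typically links both its
         * own anon_vma and its parent's on vma->anon_vma_chain, so the chain
         * has more than one entry and list_is_singular() returns false.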
*/ return vma && vma->anon_vma && !list_is_singular(&vma->anon_vma_chain); } static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next) { struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev; vma_flags_t diff; if (!mpol_equal(vmg->policy, vma_policy(vma))) return false; diff = vma_flags_diff_pair(&vma->flags, &vmg->vma_flags); vma_flags_clear_mask(&diff, VMA_IGNORE_MERGE_FLAGS); if (!vma_flags_empty(&diff)) return false; if (vma->vm_file != vmg->file) return false; if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx)) return false; if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name)) return false; return true; } static bool is_mergeable_anon_vma(struct vma_merge_struct *vmg, bool merge_next) { struct vm_area_struct *tgt = merge_next ? vmg->next : vmg->prev; struct vm_area_struct *src = vmg->middle; /* existing merge case. */ struct anon_vma *tgt_anon = tgt->anon_vma; struct anon_vma *src_anon = vmg->anon_vma; /* * We _can_ have !src, vmg->anon_vma via copy_vma(). In this instance we * will remove the existing VMA's anon_vma's so there's no scalability * concerns. */ VM_WARN_ON(src && src_anon != src->anon_vma); /* Case 1 - we will dup_anon_vma() from src into tgt. */ if (!tgt_anon && src_anon) { struct vm_area_struct *copied_from = vmg->copied_from; if (vma_is_fork_child(src)) return false; if (vma_is_fork_child(copied_from)) return false; return true; } /* Case 2 - we will simply use tgt's anon_vma. */ if (tgt_anon && !src_anon) return !vma_is_fork_child(tgt); /* Case 3 - the anon_vma's are already shared. */ return src_anon == tgt_anon; } /* * init_multi_vma_prep() - Initializer for struct vma_prepare * @vp: The vma_prepare struct * @vma: The vma that will be altered once locked * @vmg: The merge state that will be used to determine adjustment and VMA * removal. */ static void init_multi_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma, struct vma_merge_struct *vmg) { struct vm_area_struct *adjust; struct vm_area_struct **remove = &vp->remove; memset(vp, 0, sizeof(struct vma_prepare)); vp->vma = vma; vp->anon_vma = vma->anon_vma; if (vmg && vmg->__remove_middle) { *remove = vmg->middle; remove = &vp->remove2; } if (vmg && vmg->__remove_next) *remove = vmg->next; if (vmg && vmg->__adjust_middle_start) adjust = vmg->middle; else if (vmg && vmg->__adjust_next_start) adjust = vmg->next; else adjust = NULL; vp->adj_next = adjust; if (!vp->anon_vma && adjust) vp->anon_vma = adjust->anon_vma; VM_WARN_ON(vp->anon_vma && adjust && adjust->anon_vma && vp->anon_vma != adjust->anon_vma); vp->file = vma->vm_file; if (vp->file) vp->mapping = vma->vm_file->f_mapping; if (vmg && vmg->skip_vma_uprobe) vp->skip_vma_uprobe = true; } /* * Return true if we can merge this (vma_flags,anon_vma,file,vm_pgoff) * in front of (at a lower virtual address and file offset than) the vma. * * We cannot merge two vmas if they have differently assigned (non-NULL) * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. * * We don't check here for the merged mmap wrapping around the end of pagecache * indices (16TB on ia32) because do_mmap() does not permit mmap's which * wrap, nor mmaps which cover the final page at index -1UL. * * We assume the vma may be removed as part of the merge. 
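 *
 * For example, a proposed 16-page mapping at file page offset 8 can only
 * merge with the following VMA if that VMA's vm_pgoff is 24: besides being
 * virtually adjacent, the file offsets must continue seamlessly across the
 * join.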
*/ static bool can_vma_merge_before(struct vma_merge_struct *vmg) { pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); if (is_mergeable_vma(vmg, /* merge_next = */ true) && is_mergeable_anon_vma(vmg, /* merge_next = */ true)) { if (vmg->next->vm_pgoff == vmg->pgoff + pglen) return true; } return false; } /* * Return true if we can merge this (vma_flags,anon_vma,file,vm_pgoff) * beyond (at a higher virtual address and file offset than) the vma. * * We cannot merge two vmas if they have differently assigned (non-NULL) * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. * * We assume that vma is not removed as part of the merge. */ static bool can_vma_merge_after(struct vma_merge_struct *vmg) { if (is_mergeable_vma(vmg, /* merge_next = */ false) && is_mergeable_anon_vma(vmg, /* merge_next = */ false)) { if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff) return true; } return false; } static void __vma_link_file(struct vm_area_struct *vma, struct address_space *mapping) { if (vma_is_shared_maywrite(vma)) mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } /* * Requires inode->i_mapping->i_mmap_rwsem */ static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct address_space *mapping) { if (vma_is_shared_maywrite(vma)) mapping_unmap_writable(mapping); flush_dcache_mmap_lock(mapping); vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } /* * vma has some anon_vma assigned, and is already inserted on that * anon_vma's interval trees. * * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the * vma must be removed from the anon_vma's interval trees using * anon_vma_interval_tree_pre_update_vma(). * * After the update, the vma will be reinserted using * anon_vma_interval_tree_post_update_vma(). * * The entire update must be protected by exclusive mmap_lock and by * the root anon_vma's mutex. */ static void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) { struct anon_vma_chain *avc; list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); } static void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) { struct anon_vma_chain *avc; list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); } /* * vma_prepare() - Helper function for handling locking VMAs prior to altering * @vp: The initialized vma_prepare struct */ static void vma_prepare(struct vma_prepare *vp) { if (vp->file) { uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); if (vp->adj_next) uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, vp->adj_next->vm_end); i_mmap_lock_write(vp->mapping); if (vp->insert && vp->insert->vm_file) { /* * Put into interval tree now, so instantiated pages * are visible to arm/parisc __flush_dcache_page * throughout; but we cannot insert into address * space until vma start or end is updated. 
*/ __vma_link_file(vp->insert, vp->insert->vm_file->f_mapping); } } if (vp->anon_vma) { anon_vma_lock_write(vp->anon_vma); anon_vma_interval_tree_pre_update_vma(vp->vma); if (vp->adj_next) anon_vma_interval_tree_pre_update_vma(vp->adj_next); } if (vp->file) { flush_dcache_mmap_lock(vp->mapping); vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); if (vp->adj_next) vma_interval_tree_remove(vp->adj_next, &vp->mapping->i_mmap); } } /* * vma_complete- Helper function for handling the unlocking after altering VMAs, * or for inserting a VMA. * * @vp: The vma_prepare struct * @vmi: The vma iterator * @mm: The mm_struct */ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi, struct mm_struct *mm) { if (vp->file) { if (vp->adj_next) vma_interval_tree_insert(vp->adj_next, &vp->mapping->i_mmap); vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); flush_dcache_mmap_unlock(vp->mapping); } if (vp->remove && vp->file) { __remove_shared_vm_struct(vp->remove, vp->mapping); if (vp->remove2) __remove_shared_vm_struct(vp->remove2, vp->mapping); } else if (vp->insert) { /* * split_vma has split insert from vma, and needs * us to insert it before dropping the locks * (it may either follow vma or precede it). */ vma_iter_store_new(vmi, vp->insert); mm->map_count++; } if (vp->anon_vma) { anon_vma_interval_tree_post_update_vma(vp->vma); if (vp->adj_next) anon_vma_interval_tree_post_update_vma(vp->adj_next); anon_vma_unlock_write(vp->anon_vma); } if (vp->file) { i_mmap_unlock_write(vp->mapping); if (!vp->skip_vma_uprobe) { uprobe_mmap(vp->vma); if (vp->adj_next) uprobe_mmap(vp->adj_next); } } if (vp->remove) { again: vma_mark_detached(vp->remove); if (vp->file) { uprobe_munmap(vp->remove, vp->remove->vm_start, vp->remove->vm_end); fput(vp->file); } if (vp->remove->anon_vma) unlink_anon_vmas(vp->remove); mm->map_count--; mpol_put(vma_policy(vp->remove)); if (!vp->remove2) WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); vm_area_free(vp->remove); /* * In mprotect's case 6 (see comments on vma_merge), * we are removing both mid and next vmas */ if (vp->remove2) { vp->remove = vp->remove2; vp->remove2 = NULL; goto again; } } if (vp->insert && vp->file) uprobe_mmap(vp->insert); } /* * init_vma_prep() - Initializer wrapper for vma_prepare struct * @vp: The vma_prepare struct * @vma: The vma that will be altered once locked */ static void init_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma) { init_multi_vma_prep(vp, vma, NULL); } /* * Can the proposed VMA be merged with the left (previous) VMA taking into * account the start position of the proposed range. */ static bool can_vma_merge_left(struct vma_merge_struct *vmg) { return vmg->prev && vmg->prev->vm_end == vmg->start && can_vma_merge_after(vmg); } /* * Can the proposed VMA be merged with the right (next) VMA taking into * account the end position of the proposed range. * * In addition, if we can merge with the left VMA, ensure that left and right * anon_vma's are also compatible. */ static bool can_vma_merge_right(struct vma_merge_struct *vmg, bool can_merge_left) { struct vm_area_struct *next = vmg->next; struct vm_area_struct *prev; if (!next || vmg->end != next->vm_start || !can_vma_merge_before(vmg)) return false; if (!can_merge_left) return true; /* * If we can merge with prev (left) and next (right), indicating that * each VMA's anon_vma is compatible with the proposed anon_vma, this * does not mean prev and next are compatible with EACH OTHER. 
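 *
 * For example, an unfaulted proposed range has no anon_vma and is therefore
 * compatible with any neighbour, so prev and next could each pass their
 * individual checks while still carrying two different anon_vma objects; a
 * three-way merge must then be refused.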
* * We therefore check this in addition to mergeability to either side. */ prev = vmg->prev; return !prev->anon_vma || !next->anon_vma || prev->anon_vma == next->anon_vma; } /* * Close a vm structure and free it. */ void remove_vma(struct vm_area_struct *vma) { might_sleep(); vma_close(vma); if (vma->vm_file) fput(vma->vm_file); mpol_put(vma_policy(vma)); vm_area_free(vma); } /* * Get rid of page table information in the indicated region. * * Called with the mm semaphore held. */ void unmap_region(struct unmap_desc *unmap) { struct mm_struct *mm = unmap->first->vm_mm; struct mmu_gather tlb; tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); unmap_vmas(&tlb, unmap); mas_set(unmap->mas, unmap->tree_reset); free_pgtables(&tlb, unmap); tlb_finish_mmu(&tlb); } /* * __split_vma() bypasses sysctl_max_map_count checking. We use this where it * has already been checked or doesn't make sense to fail. * VMA Iterator will point to the original VMA. */ static __must_check int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { struct vma_prepare vp; struct vm_area_struct *new; int err; WARN_ON(vma->vm_start >= addr); WARN_ON(vma->vm_end <= addr); if (vma->vm_ops && vma->vm_ops->may_split) { err = vma->vm_ops->may_split(vma, addr); if (err) return err; } new = vm_area_dup(vma); if (!new) return -ENOMEM; if (new_below) { new->vm_end = addr; } else { new->vm_start = addr; new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); } err = -ENOMEM; vma_iter_config(vmi, new->vm_start, new->vm_end); if (vma_iter_prealloc(vmi, new)) goto out_free_vma; err = vma_dup_policy(vma, new); if (err) goto out_free_vmi; err = anon_vma_clone(new, vma, VMA_OP_SPLIT); if (err) goto out_free_mpol; if (new->vm_file) get_file(new->vm_file); if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); vma_start_write(vma); vma_start_write(new); init_vma_prep(&vp, vma); vp.insert = new; vma_prepare(&vp); /* * Get rid of huge pages and shared page tables straddling the split * boundary. */ vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL); if (is_vm_hugetlb_page(vma)) hugetlb_split(vma, addr); if (new_below) { vma->vm_start = addr; vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; } else { vma->vm_end = addr; } /* vma_complete stores the new vma */ vma_complete(&vp, vmi, vma->vm_mm); validate_mm(vma->vm_mm); /* Success. */ if (new_below) vma_next(vmi); else vma_prev(vmi); return 0; out_free_mpol: mpol_put(vma_policy(new)); out_free_vmi: vma_iter_free(vmi); out_free_vma: vm_area_free(new); return err; } /* * Split a vma into two pieces at address 'addr', a new vma is allocated * either for the first part or the tail. */ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { if (vma->vm_mm->map_count >= get_sysctl_max_map_count()) return -ENOMEM; return __split_vma(vmi, vma, addr, new_below); } /* * dup_anon_vma() - Helper function to duplicate anon_vma on VMA merge in the * instance that the destination VMA has no anon_vma but the source does. * * @dst: The destination VMA * @src: The source VMA * @dup: Pointer to the destination VMA when successful. * * Returns: 0 on success. */ static int dup_anon_vma(struct vm_area_struct *dst, struct vm_area_struct *src, struct vm_area_struct **dup) { /* * There are three cases to consider for correctly propagating * anon_vma's on merge. * * The first is trivial - neither VMA has anon_vma, we need not do * anything. 
* * The second where both have anon_vma is also a no-op, as they must * then be the same, so there is simply nothing to copy. * * Here we cover the third - if the destination VMA has no anon_vma, * that is it is unfaulted, we need to ensure that the newly merged * range is referenced by the anon_vma's of the source. */ if (src->anon_vma && !dst->anon_vma) { int ret; vma_assert_write_locked(dst); dst->anon_vma = src->anon_vma; ret = anon_vma_clone(dst, src, VMA_OP_MERGE_UNFAULTED); if (ret) return ret; *dup = dst; } return 0; } #ifdef CONFIG_DEBUG_VM_MAPLE_TREE void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); mt_validate(&mm->mm_mt); for_each_vma(vmi, vma) { #ifdef CONFIG_DEBUG_VM_RB struct anon_vma *anon_vma = vma->anon_vma; struct anon_vma_chain *avc; #endif unsigned long vmi_start, vmi_end; bool warn = 0; vmi_start = vma_iter_addr(&vmi); vmi_end = vma_iter_end(&vmi); if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm)) warn = 1; if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm)) warn = 1; if (warn) { pr_emerg("issue in %s\n", current->comm); dump_stack(); dump_vma(vma); pr_emerg("tree range: %px start %lx end %lx\n", vma, vmi_start, vmi_end - 1); vma_iter_dump_tree(&vmi); } #ifdef CONFIG_DEBUG_VM_RB if (anon_vma) { anon_vma_lock_read(anon_vma); list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) anon_vma_interval_tree_verify(avc); anon_vma_unlock_read(anon_vma); } #endif /* Check for a infinite loop */ if (++i > mm->map_count + 10) { i = -1; break; } } if (i != mm->map_count) { pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i); bug = 1; } VM_BUG_ON_MM(bug, mm); } #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ /* * Based on the vmg flag indicating whether we need to adjust the vm_start field * for the middle or next VMA, we calculate what the range of the newly adjusted * VMA ought to be, and set the VMA's range accordingly. */ static void vmg_adjust_set_range(struct vma_merge_struct *vmg) { struct vm_area_struct *adjust; pgoff_t pgoff; if (vmg->__adjust_middle_start) { adjust = vmg->middle; pgoff = adjust->vm_pgoff + PHYS_PFN(vmg->end - adjust->vm_start); } else if (vmg->__adjust_next_start) { adjust = vmg->next; pgoff = adjust->vm_pgoff - PHYS_PFN(adjust->vm_start - vmg->end); } else { return; } vma_set_range(adjust, vmg->end, adjust->vm_end, pgoff); } /* * Actually perform the VMA merge operation. * * IMPORTANT: We guarantee that, should vmg->give_up_on_oom is set, to not * modify any VMAs or cause inconsistent state should an OOM condition arise. * * Returns 0 on success, or an error value on failure. */ static int commit_merge(struct vma_merge_struct *vmg) { struct vm_area_struct *vma; struct vma_prepare vp; if (vmg->__adjust_next_start) { /* We manipulate middle and adjust next, which is the target. */ vma = vmg->middle; vma_iter_config(vmg->vmi, vmg->end, vmg->next->vm_end); } else { vma = vmg->target; /* Note: vma iterator must be pointing to 'start'. */ vma_iter_config(vmg->vmi, vmg->start, vmg->end); } init_multi_vma_prep(&vp, vma, vmg); /* * If vmg->give_up_on_oom is set, we're safe, because we don't actually * manipulate any VMAs until we succeed at preallocation. * * Past this point, we will not return an error. */ if (vma_iter_prealloc(vmg->vmi, vma)) return -ENOMEM; vma_prepare(&vp); /* * THP pages may need to do additional splits if we increase * middle->vm_start. */ vma_adjust_trans_huge(vma, vmg->start, vmg->end, vmg->__adjust_middle_start ? 
vmg->middle : NULL); vma_set_range(vma, vmg->start, vmg->end, vmg->pgoff); vmg_adjust_set_range(vmg); vma_iter_store_overwrite(vmg->vmi, vmg->target); vma_complete(&vp, vmg->vmi, vma->vm_mm); return 0; } /* We can only remove VMAs when merging if they do not have a close hook. */ static bool can_merge_remove_vma(struct vm_area_struct *vma) { return !vma->vm_ops || !vma->vm_ops->close; } /* * vma_merge_existing_range - Attempt to merge VMAs based on a VMA having its * attributes modified. * * @vmg: Describes the modifications being made to a VMA and associated * metadata. * * When the attributes of a range within a VMA change, then it might be possible * for immediately adjacent VMAs to be merged into that VMA due to having * identical properties. * * This function checks for the existence of any such mergeable VMAs and updates * the maple tree describing the @vmg->middle->vm_mm address space to account * for this, as well as any VMAs shrunk/expanded/deleted as a result of this * merge. * * As part of this operation, if a merge occurs, the @vmg object will have its * vma, start, end, and pgoff fields modified to execute the merge. Subsequent * calls to this function should reset these fields. * * Returns: The merged VMA if merge succeeds, or NULL otherwise. * * ASSUMPTIONS: * - The caller must assign the VMA to be modified to @vmg->middle. * - The caller must have set @vmg->prev to the previous VMA, if there is one. * - The caller must not set @vmg->next, as we determine this. * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. * - vmi must be positioned within [@vmg->middle->vm_start, @vmg->middle->vm_end). */ static __must_check struct vm_area_struct *vma_merge_existing_range( struct vma_merge_struct *vmg) { vma_flags_t sticky_flags = vma_flags_and_mask(&vmg->vma_flags, VMA_STICKY_FLAGS); struct vm_area_struct *middle = vmg->middle; struct vm_area_struct *prev = vmg->prev; struct vm_area_struct *next; struct vm_area_struct *anon_dup = NULL; unsigned long start = vmg->start; unsigned long end = vmg->end; bool left_side = middle && start == middle->vm_start; bool right_side = middle && end == middle->vm_end; int err = 0; bool merge_left, merge_right, merge_both; mmap_assert_write_locked(vmg->mm); VM_WARN_ON_VMG(!middle, vmg); /* We are modifying a VMA, so caller must specify. */ VM_WARN_ON_VMG(vmg->next, vmg); /* We set this. */ VM_WARN_ON_VMG(prev && start <= prev->vm_start, vmg); VM_WARN_ON_VMG(start >= end, vmg); /* * If middle == prev, then we are offset into a VMA. Otherwise, if we are * not, we must span a portion of the VMA. */ VM_WARN_ON_VMG(middle && ((middle != prev && vmg->start != middle->vm_start) || vmg->end > middle->vm_end), vmg); /* The vmi must be positioned within vmg->middle. */ VM_WARN_ON_VMG(middle && !(vma_iter_addr(vmg->vmi) >= middle->vm_start && vma_iter_addr(vmg->vmi) < middle->vm_end), vmg); /* An existing merge can never be used by the mremap() logic. */ VM_WARN_ON_VMG(vmg->copied_from, vmg); vmg->state = VMA_MERGE_NOMERGE; /* * If a special mapping or if the range being modified is neither at the * furthermost left or right side of the VMA, then we have no chance of * merging and should abort. 
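 *
 * To illustrate with arbitrary addresses: if middle spans [0x1000, 0x4000),
 * then modifying [0x1000, 0x2000) touches only the left edge (left_side),
 * modifying [0x3000, 0x4000) touches only the right edge (right_side),
 * modifying [0x2000, 0x3000) touches neither and cannot merge, and
 * modifying [0x1000, 0x4000) touches both edges, potentially allowing both
 * prev and next to participate in the merge.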
*/ if (vma_flags_test_any_mask(&vmg->vma_flags, VMA_SPECIAL_FLAGS) || (!left_side && !right_side)) return NULL; if (left_side) merge_left = can_vma_merge_left(vmg); else merge_left = false; if (right_side) { next = vmg->next = vma_iter_next_range(vmg->vmi); vma_iter_prev_range(vmg->vmi); merge_right = can_vma_merge_right(vmg, merge_left); } else { merge_right = false; next = NULL; } if (merge_left) /* If merging prev, position iterator there. */ vma_prev(vmg->vmi); else if (!merge_right) /* If we have nothing to merge, abort. */ return NULL; merge_both = merge_left && merge_right; /* If we span the entire VMA, a merge implies it will be deleted. */ vmg->__remove_middle = left_side && right_side; /* * If we need to remove middle in its entirety but are unable to do so, * we have no sensible recourse but to abort the merge. */ if (vmg->__remove_middle && !can_merge_remove_vma(middle)) return NULL; /* * If we merge both VMAs, then next is also deleted. This implies * merge_will_delete_vma also. */ vmg->__remove_next = merge_both; /* * If we cannot delete next, then we can reduce the operation to merging * prev and middle (thereby deleting middle). */ if (vmg->__remove_next && !can_merge_remove_vma(next)) { vmg->__remove_next = false; merge_right = false; merge_both = false; } /* No matter what happens, we will be adjusting middle. */ vma_start_write(middle); if (merge_right) { vma_flags_t next_sticky; vma_start_write(next); vmg->target = next; next_sticky = vma_flags_and_mask(&next->flags, VMA_STICKY_FLAGS); vma_flags_set_mask(&sticky_flags, next_sticky); } if (merge_left) { vma_flags_t prev_sticky; vma_start_write(prev); vmg->target = prev; prev_sticky = vma_flags_and_mask(&prev->flags, VMA_STICKY_FLAGS); vma_flags_set_mask(&sticky_flags, prev_sticky); } if (merge_both) { /* * |<-------------------->| * |-------********-------| * prev middle next * extend delete delete */ vmg->start = prev->vm_start; vmg->end = next->vm_end; vmg->pgoff = prev->vm_pgoff; /* * We already ensured anon_vma compatibility above, so now it's * simply a case of, if prev has no anon_vma object, which of * next or middle contains the anon_vma we must duplicate. */ err = dup_anon_vma(prev, next->anon_vma ? next : middle, &anon_dup); } else if (merge_left) { /* * |<------------>| OR * |<----------------->| * |-------************* * prev middle * extend shrink/delete */ vmg->start = prev->vm_start; vmg->pgoff = prev->vm_pgoff; if (!vmg->__remove_middle) vmg->__adjust_middle_start = true; err = dup_anon_vma(prev, middle, &anon_dup); } else { /* merge_right */ /* * |<------------->| OR * |<----------------->| * *************-------| * middle next * shrink/delete extend */ pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); VM_WARN_ON_VMG(!merge_right, vmg); /* If we are offset into a VMA, then prev must be middle. */ VM_WARN_ON_VMG(vmg->start > middle->vm_start && prev && middle != prev, vmg); if (vmg->__remove_middle) { vmg->end = next->vm_end; vmg->pgoff = next->vm_pgoff - pglen; } else { /* We shrink middle and expand next. 
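 *
 * For example (addresses are arbitrary): with middle [0x2000, 0x5000),
 * next [0x5000, 0x7000) and a right-edge modification of [0x4000, 0x5000)
 * that merges with next, middle shrinks to [0x2000, 0x4000) and next grows
 * downwards to [0x4000, 0x7000), its vm_pgoff reduced by the number of
 * pages it gained.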
*/ vmg->__adjust_next_start = true; vmg->start = middle->vm_start; vmg->end = start; vmg->pgoff = middle->vm_pgoff; } err = dup_anon_vma(next, middle, &anon_dup); } if (err || commit_merge(vmg)) goto abort; vma_set_flags_mask(vmg->target, sticky_flags); khugepaged_enter_vma(vmg->target, vmg->vm_flags); vmg->state = VMA_MERGE_SUCCESS; return vmg->target; abort: vma_iter_set(vmg->vmi, start); vma_iter_load(vmg->vmi); if (anon_dup) unlink_anon_vmas(anon_dup); /* * This means we have failed to clone anon_vma's correctly, but no * actual changes to VMAs have occurred, so no harm no foul - if the * user doesn't want this reported and instead just wants to give up on * the merge, allow it. */ if (!vmg->give_up_on_oom) vmg->state = VMA_MERGE_ERROR_NOMEM; return NULL; } /* * vma_merge_new_range - Attempt to merge a new VMA into address space * * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end * (exclusive), which we try to merge with any adjacent VMAs if possible. * * We are about to add a VMA to the address space starting at @vmg->start and * ending at @vmg->end. There are three different possible scenarios: * * 1. There is a VMA with identical properties immediately adjacent to the * proposed new VMA [@vmg->start, @vmg->end) either before or after it - * EXPAND that VMA: * * Proposed: |-----| or |-----| * Existing: |----| |----| * * 2. There are VMAs with identical properties immediately adjacent to the * proposed new VMA [@vmg->start, @vmg->end) both before AND after it - * EXPAND the former and REMOVE the latter: * * Proposed: |-----| * Existing: |----| |----| * * 3. There are no VMAs immediately adjacent to the proposed new VMA or those * VMAs do not have identical attributes - NO MERGE POSSIBLE. * * In instances where we can merge, this function returns the expanded VMA which * will have its range adjusted accordingly and the underlying maple tree also * adjusted. * * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer * to the VMA we expanded. * * This function adjusts @vmg to provide @vmg->next if not already specified, * and adjusts [@vmg->start, @vmg->end) to span the expanded range. * * ASSUMPTIONS: * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. * - The caller must have determined that [@vmg->start, @vmg->end) is empty, other than VMAs that will be unmapped should the operation succeed. * - The caller must have specified the previous vma in @vmg->prev. * - The caller must have specified the next vma in @vmg->next. * - The caller must have positioned the vmi at or before the gap. */ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) { struct vm_area_struct *prev = vmg->prev; struct vm_area_struct *next = vmg->next; unsigned long end = vmg->end; bool can_merge_left, can_merge_right; mmap_assert_write_locked(vmg->mm); VM_WARN_ON_VMG(vmg->middle, vmg); VM_WARN_ON_VMG(vmg->target, vmg); /* vmi must point at or before the gap. */ VM_WARN_ON_VMG(vma_iter_addr(vmg->vmi) > end, vmg); vmg->state = VMA_MERGE_NOMERGE; /* Special VMAs are unmergeable, also if no prev/next. */ if (vma_flags_test_any_mask(&vmg->vma_flags, VMA_SPECIAL_FLAGS) || (!prev && !next)) return NULL; can_merge_left = can_vma_merge_left(vmg); can_merge_right = !vmg->just_expand && can_vma_merge_right(vmg, can_merge_left); /* If we can merge with the next VMA, adjust vmg accordingly. */ if (can_merge_right) { vmg->end = next->vm_end; vmg->target = next; } /* If we can merge with the previous VMA, adjust vmg accordingly. 
*/ if (can_merge_left) { vmg->start = prev->vm_start; vmg->target = prev; vmg->pgoff = prev->vm_pgoff; /* * If this merge would result in removal of the next VMA but we * are not permitted to do so, reduce the operation to merging * prev and vma. */ if (can_merge_right && !can_merge_remove_vma(next)) vmg->end = end; /* In expand-only case we are already positioned at prev. */ if (!vmg->just_expand) { /* Equivalent to going to the previous range. */ vma_prev(vmg->vmi); } } /* * Now try to expand adjacent VMA(s). This takes care of removing the * following VMA if we have VMAs on both sides. */ if (vmg->target && !vma_expand(vmg)) { khugepaged_enter_vma(vmg->target, vmg->vm_flags); vmg->state = VMA_MERGE_SUCCESS; return vmg->target; } return NULL; } /* * vma_merge_copied_range - Attempt to merge a VMA that is being copied by * mremap() * * @vmg: Describes the VMA we are adding, in the copied-to range @vmg->start to * @vmg->end (exclusive), which we try to merge with any adjacent VMAs if * possible. * * vmg->prev, next, start, end, pgoff should all be relative to the COPIED TO * range, i.e. the target range for the VMA. * * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer * to the VMA we expanded. * * ASSUMPTIONS: Same as vma_merge_new_range(), except vmg->middle must contain * the copied-from VMA. */ static struct vm_area_struct *vma_merge_copied_range(struct vma_merge_struct *vmg) { /* We must have a copied-from VMA. */ VM_WARN_ON_VMG(!vmg->middle, vmg); vmg->copied_from = vmg->middle; vmg->middle = NULL; return vma_merge_new_range(vmg); } /* * vma_expand - Expand an existing VMA * * @vmg: Describes a VMA expansion operation. * * Expand @vma to vmg->start and vmg->end. Can expand off the start and end. * Will expand over vmg->next if it's different from vmg->target and vmg->end == * vmg->next->vm_end. Checking if the vmg->target can expand and merge with * vmg->next needs to be handled by the caller. * * Returns: 0 on success. * * ASSUMPTIONS: * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. * - The caller must have set @vmg->target and @vmg->next. */ int vma_expand(struct vma_merge_struct *vmg) { struct vm_area_struct *anon_dup = NULL; struct vm_area_struct *target = vmg->target; struct vm_area_struct *next = vmg->next; bool remove_next = false; vma_flags_t sticky_flags = vma_flags_and_mask(&vmg->vma_flags, VMA_STICKY_FLAGS); vma_flags_t target_sticky; int ret = 0; mmap_assert_write_locked(vmg->mm); vma_start_write(target); target_sticky = vma_flags_and_mask(&target->flags, VMA_STICKY_FLAGS); if (next && target != next && vmg->end == next->vm_end) remove_next = true; /* We must have a target. */ VM_WARN_ON_VMG(!target, vmg); /* This should have already been checked by this point. */ VM_WARN_ON_VMG(remove_next && !can_merge_remove_vma(next), vmg); /* Not merging but overwriting any part of next is not handled. */ VM_WARN_ON_VMG(next && !remove_next && next != target && vmg->end > next->vm_start, vmg); /* Only handles expanding. */ VM_WARN_ON_VMG(target->vm_start < vmg->start || target->vm_end > vmg->end, vmg); vma_flags_set_mask(&sticky_flags, target_sticky); /* * If we are removing the next VMA or copying from a VMA * (e.g. mremap()'ing), we must propagate anon_vma state. * * Note that, by convention, callers ignore OOM for this case, so * we don't need to account for vmg->give_up_on_mm here. 
*/ if (remove_next) ret = dup_anon_vma(target, next, &anon_dup); if (!ret && vmg->copied_from) ret = dup_anon_vma(target, vmg->copied_from, &anon_dup); if (ret) return ret; if (remove_next) { vma_flags_t next_sticky; vma_start_write(next); vmg->__remove_next = true; next_sticky = vma_flags_and_mask(&next->flags, VMA_STICKY_FLAGS); vma_flags_set_mask(&sticky_flags, next_sticky); } if (commit_merge(vmg)) goto nomem; vma_set_flags_mask(target, sticky_flags); return 0; nomem: if (anon_dup) unlink_anon_vmas(anon_dup); /* * If the user requests that we just give upon OOM, we are safe to do so * here, as commit merge provides this contract to us. Nothing has been * changed - no harm no foul, just don't report it. */ if (!vmg->give_up_on_oom) vmg->state = VMA_MERGE_ERROR_NOMEM; return -ENOMEM; } /* * vma_shrink() - Reduce an existing VMAs memory area * @vmi: The vma iterator * @vma: The VMA to modify * @start: The new start * @end: The new end * * Returns: 0 on success, -ENOMEM otherwise */ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff) { struct vma_prepare vp; WARN_ON((vma->vm_start != start) && (vma->vm_end != end)); if (vma->vm_start < start) vma_iter_config(vmi, vma->vm_start, start); else vma_iter_config(vmi, end, vma->vm_end); if (vma_iter_prealloc(vmi, NULL)) return -ENOMEM; vma_start_write(vma); init_vma_prep(&vp, vma); vma_prepare(&vp); vma_adjust_trans_huge(vma, start, end, NULL); vma_iter_clear(vmi); vma_set_range(vma, start, end, pgoff); vma_complete(&vp, vmi, vma->vm_mm); validate_mm(vma->vm_mm); return 0; } static inline void vms_clear_ptes(struct vma_munmap_struct *vms, struct ma_state *mas_detach, bool mm_wr_locked) { struct unmap_desc unmap = { .mas = mas_detach, .first = vms->vma, /* start and end may be different if there is no prev or next vma. */ .pg_start = vms->unmap_start, .pg_end = vms->unmap_end, .vma_start = vms->start, .vma_end = vms->end, /* * The tree limits and reset differ from the normal case since it's a * side-tree */ .tree_reset = 1, .tree_end = vms->vma_count, /* * We can free page tables without write-locking mmap_lock because VMAs * were isolated before we downgraded mmap_lock. */ .mm_wr_locked = mm_wr_locked, }; if (!vms->clear_ptes) /* Nothing to do */ return; mas_set(mas_detach, 1); unmap_region(&unmap); vms->clear_ptes = false; } static void vms_clean_up_area(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct vm_area_struct *vma; if (!vms->nr_pages) return; vms_clear_ptes(vms, mas_detach, true); mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) vma_close(vma); } /* * vms_complete_munmap_vmas() - Finish the munmap() operation * @vms: The vma munmap struct * @mas_detach: The maple state of the detached vmas * * This updates the mm_struct, unmaps the region, frees the resources * used for the munmap() and may downgrade the lock - if requested. Everything * needed to be done once the vma maple tree is updated. 
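 *
 * As a rough sketch, the expected calling sequence mirrors
 * do_vmi_align_munmap() below (error handling omitted):
 *
 *	init_vma_munmap(&vms, vmi, vma, start, end, uf, unlock);
 *	vms_gather_munmap_vmas(&vms, &mas_detach);
 *	vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL);
 *	vms_complete_munmap_vmas(&vms, &mas_detach);
 *
 * Once the range has been cleared in the maple tree, the detached VMAs are
 * only reachable through @mas_detach and this function performs the
 * remaining accounting, PTE teardown and freeing.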
*/ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct vm_area_struct *vma; struct mm_struct *mm; mm = current->mm; mm->map_count -= vms->vma_count; mm->locked_vm -= vms->locked_vm; if (vms->unlock) mmap_write_downgrade(mm); if (!vms->nr_pages) return; vms_clear_ptes(vms, mas_detach, !vms->unlock); /* Update high watermark before we lower total_vm */ update_hiwater_vm(mm); /* Stat accounting */ WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm) - vms->nr_pages); /* Paranoid bookkeeping */ VM_WARN_ON(vms->exec_vm > mm->exec_vm); VM_WARN_ON(vms->stack_vm > mm->stack_vm); VM_WARN_ON(vms->data_vm > mm->data_vm); mm->exec_vm -= vms->exec_vm; mm->stack_vm -= vms->stack_vm; mm->data_vm -= vms->data_vm; /* Remove and clean up vmas */ mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) remove_vma(vma); vm_unacct_memory(vms->nr_accounted); validate_mm(mm); if (vms->unlock) mmap_read_unlock(mm); __mt_destroy(mas_detach->tree); } /* * reattach_vmas() - Undo any munmap work and free resources * @mas_detach: The maple state with the detached maple tree * * Reattach any detached vmas and free up the maple tree used to track the vmas. */ static void reattach_vmas(struct ma_state *mas_detach) { struct vm_area_struct *vma; mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) vma_mark_attached(vma); __mt_destroy(mas_detach->tree); } /* * vms_gather_munmap_vmas() - Put all VMAs within a range into a maple tree * for removal at a later date. Handles splitting first and last if necessary * and marking the vmas as isolated. * * @vms: The vma munmap struct * @mas_detach: The maple state tracking the detached tree * * Return: 0 on success, error otherwise */ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct vm_area_struct *next = NULL; int error; /* * If we need to split any vma, do it now to save pain later. * Does it split the first one? */ if (vms->start > vms->vma->vm_start) { /* * Make sure that map_count on return from munmap() will * not exceed its limit; but let map_count go just above * its limit temporarily, to help free resources as expected. */ if (vms->end < vms->vma->vm_end && vms->vma->vm_mm->map_count >= get_sysctl_max_map_count()) { error = -ENOMEM; goto map_count_exceeded; } /* Don't bother splitting the VMA if we can't unmap it anyway */ if (vma_is_sealed(vms->vma)) { error = -EPERM; goto start_split_failed; } error = __split_vma(vms->vmi, vms->vma, vms->start, 1); if (error) goto start_split_failed; } vms->prev = vma_prev(vms->vmi); if (vms->prev) vms->unmap_start = vms->prev->vm_end; /* * Detach a range of VMAs from the mm. Using next as a temp variable as * it is always overwritten. */ for_each_vma_range(*(vms->vmi), next, vms->end) { long nrpages; if (vma_is_sealed(next)) { error = -EPERM; goto modify_vma_failed; } /* Does it split the end? 
*/ if (next->vm_end > vms->end) { error = __split_vma(vms->vmi, next, vms->end, 0); if (error) goto end_split_failed; } vma_start_write(next); mas_set(mas_detach, vms->vma_count++); error = mas_store_gfp(mas_detach, next, GFP_KERNEL); if (error) goto munmap_gather_failed; vma_mark_detached(next); nrpages = vma_pages(next); vms->nr_pages += nrpages; if (vma_test(next, VMA_LOCKED_BIT)) vms->locked_vm += nrpages; if (vma_test(next, VMA_ACCOUNT_BIT)) vms->nr_accounted += nrpages; if (is_exec_mapping(next->vm_flags)) vms->exec_vm += nrpages; else if (is_stack_mapping(next->vm_flags)) vms->stack_vm += nrpages; else if (is_data_mapping_vma_flags(&next->flags)) vms->data_vm += nrpages; if (vms->uf) { /* * If userfaultfd_unmap_prep returns an error the vmas * will remain split, but userland will get a * highly unexpected error anyway. This is no * different than the case where the first of the two * __split_vma fails, but we don't undo the first * split, despite we could. This is unlikely enough * failure that it's not worth optimizing it for. */ error = userfaultfd_unmap_prep(next, vms->start, vms->end, vms->uf); if (error) goto userfaultfd_error; } #ifdef CONFIG_DEBUG_VM_MAPLE_TREE BUG_ON(next->vm_start < vms->start); BUG_ON(next->vm_start > vms->end); #endif } vms->next = vma_next(vms->vmi); if (vms->next) vms->unmap_end = vms->next->vm_start; #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) /* Make sure no VMAs are about to be lost. */ { MA_STATE(test, mas_detach->tree, 0, 0); struct vm_area_struct *vma_mas, *vma_test; int test_count = 0; vma_iter_set(vms->vmi, vms->start); rcu_read_lock(); vma_test = mas_find(&test, vms->vma_count - 1); for_each_vma_range(*(vms->vmi), vma_mas, vms->end) { BUG_ON(vma_mas != vma_test); test_count++; vma_test = mas_next(&test, vms->vma_count - 1); } rcu_read_unlock(); BUG_ON(vms->vma_count != test_count); } #endif while (vma_iter_addr(vms->vmi) > vms->start) vma_iter_prev_range(vms->vmi); vms->clear_ptes = true; return 0; userfaultfd_error: munmap_gather_failed: end_split_failed: modify_vma_failed: reattach_vmas(mas_detach); start_split_failed: map_count_exceeded: return error; } /* * init_vma_munmap() - Initializer wrapper for vma_munmap_struct * @vms: The vma munmap struct * @vmi: The vma iterator * @vma: The first vm_area_struct to munmap * @start: The aligned start address to munmap * @end: The aligned end address to munmap * @uf: The userfaultfd list_head * @unlock: Unlock after the operation. Only unlocked on success */ static void init_vma_munmap(struct vma_munmap_struct *vms, struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *uf, bool unlock) { vms->vmi = vmi; vms->vma = vma; if (vma) { vms->start = start; vms->end = end; } else { vms->start = vms->end = 0; } vms->unlock = unlock; vms->uf = uf; vms->vma_count = 0; vms->nr_pages = vms->locked_vm = vms->nr_accounted = 0; vms->exec_vm = vms->stack_vm = vms->data_vm = 0; vms->unmap_start = FIRST_USER_ADDRESS; vms->unmap_end = USER_PGTABLES_CEILING; vms->clear_ptes = false; } /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. * @vmi: The vma iterator * @vma: The starting vm_area_struct * @mm: The mm_struct * @start: The aligned start address to munmap. * @end: The aligned end address to munmap. * @uf: The userfaultfd list_head * @unlock: Set to true to drop the mmap_lock. unlocking only happens on * success. * * Return: 0 on success and drops the lock if so directed, error and leaves the * lock held otherwise. 
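 *
 * For illustration only, a caller such as do_vmi_munmap() below drives this
 * roughly as follows (argument validation elided):
 *
 *	vma = vma_find(vmi, end);
 *	if (vma)
 *		error = do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
 *
 * i.e. the caller locates the first overlapping VMA and passes it in along
 * with the already page-aligned @start and @end.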
*/ int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf, bool unlock) { struct maple_tree mt_detach; MA_STATE(mas_detach, &mt_detach, 0, 0); mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); mt_on_stack(mt_detach); struct vma_munmap_struct vms; int error; init_vma_munmap(&vms, vmi, vma, start, end, uf, unlock); error = vms_gather_munmap_vmas(&vms, &mas_detach); if (error) goto gather_failed; error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL); if (error) goto clear_tree_failed; /* Point of no return */ vms_complete_munmap_vmas(&vms, &mas_detach); return 0; clear_tree_failed: reattach_vmas(&mas_detach); gather_failed: validate_mm(mm); return error; } /* * do_vmi_munmap() - munmap a given range. * @vmi: The vma iterator * @mm: The mm_struct * @start: The start address to munmap * @len: The length of the range to munmap * @uf: The userfaultfd list_head * @unlock: set to true if the user wants to drop the mmap_lock on success * * This function takes a @mas that is either pointing to the previous VMA or set * to MA_START and sets it up to remove the mapping(s). The @len will be * aligned. * * Return: 0 on success and drops the lock if so directed, error and leaves the * lock held otherwise. */ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock) { unsigned long end; struct vm_area_struct *vma; if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; end = start + PAGE_ALIGN(len); if (end == start) return -EINVAL; /* Find the first overlapping VMA */ vma = vma_find(vmi, end); if (!vma) { if (unlock) mmap_write_unlock(mm); return 0; } return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); } /* * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd * context and anonymous VMA name within the range [start, end). * * As a result, we might be able to merge the newly modified VMA range with an * adjacent VMA with identical properties. * * If no merge is possible and the range does not span the entirety of the VMA, * we then need to split the VMA to accommodate the change. * * The function returns either the merged VMA, the original VMA if a split was * required instead, or an error if the split failed. */ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) { struct vm_area_struct *vma = vmg->middle; unsigned long start = vmg->start; unsigned long end = vmg->end; struct vm_area_struct *merged; /* First, try to merge. */ merged = vma_merge_existing_range(vmg); if (merged) return merged; if (vmg_nomem(vmg)) return ERR_PTR(-ENOMEM); /* * Split can fail for reasons other than OOM, so if the user requests * this it's probably a mistake. */ VM_WARN_ON(vmg->give_up_on_oom && (vma->vm_start != start || vma->vm_end != end)); /* Split any preceding portion of the VMA. */ if (vma->vm_start < start) { int err = split_vma(vmg->vmi, vma, start, 1); if (err) return ERR_PTR(err); } /* Split any trailing portion of the VMA. 
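 *
 * To illustrate with arbitrary addresses: for a VMA spanning
 * [0x1000, 0x5000) and a modified range of [0x2000, 0x3000), the split
 * above leaves the VMA as [0x2000, 0x5000), and the split below trims it
 * to [0x2000, 0x3000), so the returned VMA exactly covers the range being
 * modified.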
*/ if (vma->vm_end > end) { int err = split_vma(vmg->vmi, vma, end, 0); if (err) return ERR_PTR(err); } return vma; } struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, vma_flags_t *vma_flags_ptr) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); const vma_flags_t vma_flags = *vma_flags_ptr; struct vm_area_struct *ret; vmg.vma_flags = vma_flags; ret = vma_modify(&vmg); if (IS_ERR(ret)) return ret; /* * For a merge to succeed, the flags must match those * requested. However, sticky flags may have been retained, so propagate * them to the caller. */ if (vmg.state == VMA_MERGE_SUCCESS) *vma_flags_ptr = ret->flags; return ret; } struct vm_area_struct *vma_modify_name(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct anon_vma_name *new_name) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); vmg.anon_name = new_name; return vma_modify(&vmg); } struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct mempolicy *new_pol) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); vmg.policy = new_pol; return vma_modify(&vmg); } struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, const vma_flags_t *vma_flags, struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); vmg.vma_flags = *vma_flags; vmg.uffd_ctx = new_ctx; if (give_up_on_oom) vmg.give_up_on_oom = true; return vma_modify(&vmg); } /* * Expand vma by delta bytes, potentially merging with an immediately adjacent * VMA with identical properties. */ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long delta) { VMG_VMA_STATE(vmg, vmi, vma, vma, vma->vm_end, vma->vm_end + delta); vmg.next = vma_iter_next_rewind(vmi, NULL); vmg.middle = NULL; /* We use the VMA to populate VMG fields only. 
*/ return vma_merge_new_range(&vmg); } void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) { vb->count = 0; } static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb) { struct address_space *mapping; int i; mapping = vb->vmas[0]->vm_file->f_mapping; i_mmap_lock_write(mapping); for (i = 0; i < vb->count; i++) { VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping); __remove_shared_vm_struct(vb->vmas[i], mapping); } i_mmap_unlock_write(mapping); unlink_file_vma_batch_init(vb); } void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, struct vm_area_struct *vma) { if (vma->vm_file == NULL) return; if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) || vb->count == ARRAY_SIZE(vb->vmas)) unlink_file_vma_batch_process(vb); vb->vmas[vb->count] = vma; vb->count++; } void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb) { if (vb->count > 0) unlink_file_vma_batch_process(vb); } static void vma_link_file(struct vm_area_struct *vma, bool hold_rmap_lock) { struct file *file = vma->vm_file; struct address_space *mapping; if (file) { mapping = file->f_mapping; i_mmap_lock_write(mapping); __vma_link_file(vma, mapping); if (!hold_rmap_lock) i_mmap_unlock_write(mapping); } } static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) { VMA_ITERATOR(vmi, mm, 0); vma_iter_config(&vmi, vma->vm_start, vma->vm_end); if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; vma_start_write(vma); vma_iter_store_new(&vmi, vma); vma_link_file(vma, /* hold_rmap_lock= */false); mm->map_count++; validate_mm(mm); return 0; } /* * Copy the vma structure to a new location in the same mm, * prior to moving page table entries, to effect an mremap move. */ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, unsigned long addr, unsigned long len, pgoff_t pgoff, bool *need_rmap_locks) { struct vm_area_struct *vma = *vmap; unsigned long vma_start = vma->vm_start; struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma; bool faulted_in_anon_vma = true; VMA_ITERATOR(vmi, mm, addr); VMG_VMA_STATE(vmg, &vmi, NULL, vma, addr, addr + len); /* * If anonymous vma has not yet been faulted, update new pgoff * to match new location, to increase its chance of merging. */ if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { pgoff = addr >> PAGE_SHIFT; faulted_in_anon_vma = false; } /* * If the VMA we are copying might contain a uprobe PTE, ensure * that we do not establish one upon merge. Otherwise, when mremap() * moves page tables, it will orphan the newly created PTE. */ if (vma->vm_file) vmg.skip_vma_uprobe = true; new_vma = find_vma_prev(mm, addr, &vmg.prev); if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ vmg.pgoff = pgoff; vmg.next = vma_iter_next_rewind(&vmi, NULL); new_vma = vma_merge_copied_range(&vmg); if (new_vma) { /* * Source vma may have been merged into new_vma */ if (unlikely(vma_start >= new_vma->vm_start && vma_start < new_vma->vm_end)) { /* * The only way we can get a vma_merge with * self during an mremap is if the vma hasn't * been faulted in yet and we were allowed to * reset the dst vma->vm_pgoff to the * destination address of the mremap to allow * the merge to happen. mremap must change the * vm_pgoff linearity between src and dst vmas * (in turn preventing a vma_merge) to be * safe. It is only safe to keep the vm_pgoff * linear if there are no pages mapped yet. 
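 *
 * A concrete (purely illustrative) example with 4K pages: an unfaulted
 * anonymous VMA at 0x10000 carries vm_pgoff == 0x10, and when copied to
 * addr == 0x30000 its pgoff is reset to 0x30 above (addr >> PAGE_SHIFT).
 * The copy therefore looks linear at the new address and may merge with a
 * neighbour there, which would not be safe had any pages already been
 * mapped under the old offsets.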
*/ VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma); *vmap = vma = new_vma; } *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); } else { new_vma = vm_area_dup(vma); if (!new_vma) goto out; vma_set_range(new_vma, addr, addr + len, pgoff); if (vma_dup_policy(vma, new_vma)) goto out_free_vma; if (anon_vma_clone(new_vma, vma, VMA_OP_REMAP)) goto out_free_mempol; if (new_vma->vm_file) get_file(new_vma->vm_file); if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); if (vma_link(mm, new_vma)) goto out_vma_link; *need_rmap_locks = false; } return new_vma; out_vma_link: fixup_hugetlb_reservations(new_vma); vma_close(new_vma); if (new_vma->vm_file) fput(new_vma->vm_file); unlink_anon_vmas(new_vma); out_free_mempol: mpol_put(vma_policy(new_vma)); out_free_vma: vm_area_free(new_vma); out: return NULL; } /* * Rough compatibility check to quickly see if it's even worth looking * at sharing an anon_vma. * * They need to have the same vm_file, and the flags can only differ * in things that mprotect may change. * * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that * we can merge the two vma's. For example, we refuse to merge a vma if * there is a vm_ops->close() function, because that indicates that the * driver is doing some kind of reference counting. But that doesn't * really matter for the anon_vma sharing case. */ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) { vma_flags_t diff = vma_flags_diff_pair(&a->flags, &b->flags); vma_flags_clear_mask(&diff, VMA_ACCESS_FLAGS); vma_flags_clear_mask(&diff, VMA_IGNORE_MERGE_FLAGS); return a->vm_end == b->vm_start && mpol_equal(vma_policy(a), vma_policy(b)) && a->vm_file == b->vm_file && vma_flags_empty(&diff) && b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); } /* * Do some basic sanity checking to see if we can re-use the anon_vma * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be * the same as 'old', the other will be the new one that is trying * to share the anon_vma. * * NOTE! This runs with mmap_lock held for reading, so it is possible that * the anon_vma of 'old' is concurrently in the process of being set up * by another page fault trying to merge _that_. But that's ok: if it * is being set up, that automatically means that it will be a singleton * acceptable for merging, so we can do all of this optimistically. But * we do that READ_ONCE() to make sure that we never re-load the pointer. * * IOW: that the "list_is_singular()" test on the anon_vma_chain only * matters for the 'stable anon_vma' case (ie the thing we want to avoid * is to return an anon_vma that is "complex" due to having gone through * a fork). * * We also make sure that the two vma's are compatible (adjacent, * and with the same memory policies). That's all stable, even with just * a read lock on the mmap_lock. */ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) { if (anon_vma_compatible(a, b)) { struct anon_vma *anon_vma = READ_ONCE(old->anon_vma); if (anon_vma && list_is_singular(&old->anon_vma_chain)) return anon_vma; } return NULL; } /* * find_mergeable_anon_vma is used by anon_vma_prepare, to check * neighbouring vmas for a suitable anon_vma, before it goes off * to allocate a new anon_vma. It checks because a repetitive * sequence of mprotects and faults may otherwise lead to distinct * anon_vmas being allocated, preventing vma merge in subsequent * mprotect. 
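 *
 * A loose sketch of the intended use on the anon_vma preparation path
 * (illustrative only; the fallback allocation shown is an assumption about
 * the caller, not something this file defines):
 *
 *	anon_vma = find_mergeable_anon_vma(vma);
 *	if (!anon_vma)
 *		anon_vma = anon_vma_alloc();
 *
 * i.e. prefer reusing a neighbour's anon_vma and only allocate a fresh one
 * when no compatible neighbour exists.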
*/ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) { struct anon_vma *anon_vma = NULL; struct vm_area_struct *prev, *next; VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end); /* Try next first. */ next = vma_iter_load(&vmi); if (next) { anon_vma = reusable_anon_vma(next, vma, next); if (anon_vma) return anon_vma; } prev = vma_prev(&vmi); VM_BUG_ON_VMA(prev != vma, vma); prev = vma_prev(&vmi); /* Try prev next. */ if (prev) anon_vma = reusable_anon_vma(prev, prev, vma); /* * We might reach here with anon_vma == NULL if we can't find * any reusable anon_vma. * There's no absolute need to look only at touching neighbours: * we could search further afield for "compatible" anon_vmas. * But it would probably just be a waste of time searching, * or lead to too many vmas hanging off the same anon_vma. * We're trying to allow mprotect remerging later on, * not trying to minimize memory used for anon_vmas. */ return anon_vma; } static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) { return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite); } static bool vma_is_shared_writable(struct vm_area_struct *vma) { return vma_test_all(vma, VMA_WRITE_BIT, VMA_SHARED_BIT); } static bool vma_fs_can_writeback(struct vm_area_struct *vma) { /* No managed pages to writeback. */ if (vma_test(vma, VMA_PFNMAP_BIT)) return false; return vma->vm_file && vma->vm_file->f_mapping && mapping_can_writeback(vma->vm_file->f_mapping); } /* * Does this VMA require the underlying folios to have their dirty state * tracked? */ bool vma_needs_dirty_tracking(struct vm_area_struct *vma) { /* Only shared, writable VMAs require dirty tracking. */ if (!vma_is_shared_writable(vma)) return false; /* Does the filesystem need to be notified? */ if (vm_ops_needs_writenotify(vma->vm_ops)) return true; /* * Even if the filesystem doesn't indicate a need for writenotify, if it * can writeback, dirty tracking is still required. */ return vma_fs_can_writeback(vma); } /* * Some shared mappings will want the pages marked read-only * to track write events. If so, we'll downgrade vm_page_prot * to the private version (using protection_map[] without the * VM_SHARED bit). */ bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) { /* If it was private or non-writable, the write bit is already clear */ if (!vma_is_shared_writable(vma)) return false; /* The backer wishes to know when pages are first written to? */ if (vm_ops_needs_writenotify(vma->vm_ops)) return true; /* The open routine did something to the protections that pgprot_modify * won't preserve? */ if (pgprot_val(vm_page_prot) != pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags))) return false; /* * Do we need to track softdirty? hugetlb does not support softdirty * tracking yet. */ if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) return true; /* Do we need write faults for uffd-wp tracking? */ if (userfaultfd_wp(vma)) return true; /* Can the mapping track the dirty pages? */ return vma_fs_can_writeback(vma); } static DEFINE_MUTEX(mm_all_locks_mutex); static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) { if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { /* * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. */ down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock); /* * We can safely modify head.next after taking the * anon_vma->root->rwsem. 
If some other vma in this mm shares * the same anon_vma, we won't take it again. * * No need for atomic instructions here, head.next * can't change from under us thanks to the * anon_vma->root->rwsem. */ if (__test_and_set_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) BUG(); } } static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) { if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { /* * AS_MM_ALL_LOCKS can't change from under us because * we hold the mm_all_locks_mutex. * * Operations on ->flags have to be atomic because * even if AS_MM_ALL_LOCKS is stable thanks to the * mm_all_locks_mutex, there may be other cpus * changing other bitflags in parallel to us. */ if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock); } } /* * This operation locks against the VM for all pte/vma/mm related * operations that could ever happen on a certain mm. This includes * vmtruncate, try_to_unmap, and all page faults. * * The caller must take the mmap_lock in write mode before calling * mm_take_all_locks(). The caller isn't allowed to release the * mmap_lock until mm_drop_all_locks() returns. * * mmap_lock in write mode is required in order to block all operations * that could modify pagetables and free pages without need of * altering the vma layout. It's also needed in write mode to prevent new * anon_vmas from being associated with existing vmas. * * A single task can't take more than one mm_take_all_locks() in a row * or it would deadlock. * * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in * mapping->flags avoid taking the same lock twice, if more than one * vma in this mm is backed by the same anon_vma or address_space. * * We take locks in the following order, according to the comment at the beginning * of mm/rmap.c: * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for * hugetlb mapping); * - all vmas marked locked * - all i_mmap_rwsem locks; * - all anon_vma->rwsem locks * * We can take all locks within these types randomly because the VM code * doesn't nest them and we are protected from parallel mm_take_all_locks() by * mm_all_locks_mutex. * * mm_take_all_locks() and mm_drop_all_locks() are expensive operations * that may have to take thousands of locks. * * mm_take_all_locks() can fail if it's interrupted by signals. */ int mm_take_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; struct anon_vma_chain *avc; VMA_ITERATOR(vmi, mm, 0); mmap_assert_write_locked(mm); mutex_lock(&mm_all_locks_mutex); /* * vma_start_write() does not have a complement in mm_drop_all_locks() * because vma_start_write() is always asymmetrical; it marks a VMA as * being written to until mmap_write_unlock() or mmap_write_downgrade() * is reached.
*/ for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; vma_start_write(vma); } vma_iter_init(&vmi, mm, 0); for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; if (vma->vm_file && vma->vm_file->f_mapping && is_vm_hugetlb_page(vma)) vm_lock_mapping(mm, vma->vm_file->f_mapping); } vma_iter_init(&vmi, mm, 0); for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; if (vma->vm_file && vma->vm_file->f_mapping && !is_vm_hugetlb_page(vma)) vm_lock_mapping(mm, vma->vm_file->f_mapping); } vma_iter_init(&vmi, mm, 0); for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; if (vma->anon_vma) list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) vm_lock_anon_vma(mm, avc->anon_vma); } return 0; out_unlock: mm_drop_all_locks(mm); return -EINTR; } static void vm_unlock_anon_vma(struct anon_vma *anon_vma) { if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { /* * The LSB of head.next can't change to 0 from under * us because we hold the mm_all_locks_mutex. * * We must however clear the bitflag before unlocking * the vma so the users using the anon_vma->rb_root will * never see our bitflag. * * No need of atomic instructions here, head.next * can't change from under us until we release the * anon_vma->root->rwsem. */ if (!__test_and_clear_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) BUG(); anon_vma_unlock_write(anon_vma); } } static void vm_unlock_mapping(struct address_space *mapping) { if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { /* * AS_MM_ALL_LOCKS can't change to 0 from under us * because we hold the mm_all_locks_mutex. */ i_mmap_unlock_write(mapping); if (!test_and_clear_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); } } /* * The mmap_lock cannot be released by the caller until * mm_drop_all_locks() returns. */ void mm_drop_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; struct anon_vma_chain *avc; VMA_ITERATOR(vmi, mm, 0); mmap_assert_write_locked(mm); BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); for_each_vma(vmi, vma) { if (vma->anon_vma) list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) vm_unlock_anon_vma(avc->anon_vma); if (vma->vm_file && vma->vm_file->f_mapping) vm_unlock_mapping(vma->vm_file->f_mapping); } mutex_unlock(&mm_all_locks_mutex); } /* * We account for memory if it's a private writeable mapping, * not hugepages and VM_NORESERVE wasn't set. */ static bool accountable_mapping(struct mmap_state *map) { const struct file *file = map->file; /* * hugetlb has its own accounting separate from the core VM * VM_HUGETLB may not be set yet so we cannot check for that flag. */ if (file && is_file_hugepages(file)) return false; return vma_flags_test(&map->vma_flags, VMA_WRITE_BIT) && !vma_flags_test_any(&map->vma_flags, VMA_NORESERVE_BIT, VMA_SHARED_BIT); } /* * vms_abort_munmap_vmas() - Undo as much as possible from an aborted munmap() * operation. * @vms: The vma unmap structure * @mas_detach: The maple state with the detached maple tree * * Reattach any detached vmas, free up the maple tree used to track the vmas. * If that's not possible because the ptes are cleared (and vm_ops->closed() may * have been called), then a NULL is written over the vmas and the vmas are * removed (munmap() completed). 
*/ static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct ma_state *mas = &vms->vmi->mas; if (!vms->nr_pages) return; if (vms->clear_ptes) return reattach_vmas(mas_detach); /* * Aborting cannot just call the vm_ops open() because they are often * not symmetrical and state data has been lost. Resort to the old * failure method of leaving a gap where the MAP_FIXED mapping failed. */ mas_set_range(mas, vms->start, vms->end - 1); mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL); /* Clean up the insertion of the unfortunate gap */ vms_complete_munmap_vmas(vms, mas_detach); } static void update_ksm_flags(struct mmap_state *map) { map->vma_flags = ksm_vma_flags(map->mm, map->file, map->vma_flags); } static void set_desc_from_map(struct vm_area_desc *desc, const struct mmap_state *map) { desc->start = map->addr; desc->end = map->end; desc->pgoff = map->pgoff; desc->vm_file = map->file; desc->vma_flags = map->vma_flags; desc->page_prot = map->page_prot; } /* * __mmap_setup() - Prepare to gather any overlapping VMAs that need to be * unmapped once the map operation is completed, check limits, account mapping * and clean up any pre-existing VMAs. * * As a result it sets up the @map and @desc objects. * * @map: Mapping state. * @desc: VMA descriptor * @uf: Userfaultfd context list. * * Returns: 0 on success, error code otherwise. */ static int __mmap_setup(struct mmap_state *map, struct vm_area_desc *desc, struct list_head *uf) { int error; struct vma_iterator *vmi = map->vmi; struct vma_munmap_struct *vms = &map->vms; /* Find the first overlapping VMA and initialise unmap state. */ vms->vma = vma_find(vmi, map->end); init_vma_munmap(vms, vmi, vms->vma, map->addr, map->end, uf, /* unlock = */ false); /* OK, we have overlapping VMAs - prepare to unmap them. */ if (vms->vma) { mt_init_flags(&map->mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); mt_on_stack(map->mt_detach); mas_init(&map->mas_detach, &map->mt_detach, /* addr = */ 0); /* Prepare to unmap any existing mapping in the area */ error = vms_gather_munmap_vmas(vms, &map->mas_detach); if (error) { /* On error VMAs will already have been reattached. */ vms->nr_pages = 0; return error; } map->next = vms->next; map->prev = vms->prev; } else { map->next = vma_iter_next_rewind(vmi, &map->prev); } /* Check against address space limit. */ if (!may_expand_vm(map->mm, &map->vma_flags, map->pglen - vms->nr_pages)) return -ENOMEM; /* Private writable mapping: check memory availability. */ if (accountable_mapping(map)) { map->charged = map->pglen; map->charged -= vms->nr_accounted; if (map->charged) { error = security_vm_enough_memory_mm(map->mm, map->charged); if (error) return error; } vms->nr_accounted = 0; vma_flags_set(&map->vma_flags, VMA_ACCOUNT_BIT); } /* * Clear PTEs while the vma is still in the tree so that rmap * cannot race with the freeing later in the truncate scenario. * This is also needed for mmap_file(), which is why vm_ops * close function is called. 
*/ vms_clean_up_area(vms, &map->mas_detach); set_desc_from_map(desc, map); return 0; } static int __mmap_new_file_vma(struct mmap_state *map, struct vm_area_struct *vma) { struct vma_iterator *vmi = map->vmi; int error; vma->vm_file = map->file; if (!map->file_doesnt_need_get) get_file(map->file); if (!map->file->f_op->mmap) return 0; error = mmap_file(vma->vm_file, vma); if (error) { UNMAP_STATE(unmap, vmi, vma, vma->vm_start, vma->vm_end, map->prev, map->next); fput(vma->vm_file); vma->vm_file = NULL; vma_iter_set(vmi, vma->vm_end); /* Undo any partial mapping done by a device driver. */ unmap_region(&unmap); return error; } /* Drivers cannot alter the address of the VMA. */ WARN_ON_ONCE(map->addr != vma->vm_start); /* * Drivers should not permit writability when previously it was * disallowed. */ VM_WARN_ON_ONCE(!vma_flags_same_pair(&map->vma_flags, &vma->flags) && !vma_flags_test(&map->vma_flags, VMA_MAYWRITE_BIT) && vma_test(vma, VMA_MAYWRITE_BIT)); map->file = vma->vm_file; map->vma_flags = vma->flags; return 0; } /* * __mmap_new_vma() - Allocate a new VMA for the region, as merging was not * possible. * * @map: Mapping state. * @vmap: Output pointer for the new VMA. * @action: Any mmap_prepare action that is still to complete. * * Returns: Zero on success, or an error. */ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap, struct mmap_action *action) { struct vma_iterator *vmi = map->vmi; int error = 0; struct vm_area_struct *vma; /* * Determine the object being mapped and call the appropriate * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. */ vma = vm_area_alloc(map->mm); if (!vma) return -ENOMEM; vma_iter_config(vmi, map->addr, map->end); vma_set_range(vma, map->addr, map->end, map->pgoff); vma->flags = map->vma_flags; vma->vm_page_prot = map->page_prot; if (vma_iter_prealloc(vmi, vma)) { error = -ENOMEM; goto free_vma; } if (map->file) error = __mmap_new_file_vma(map, vma); else if (vma_flags_test(&map->vma_flags, VMA_SHARED_BIT)) error = shmem_zero_setup(vma); else vma_set_anonymous(vma); if (error) goto free_iter_vma; if (!map->check_ksm_early) { update_ksm_flags(map); vma->flags = map->vma_flags; } #ifdef CONFIG_SPARC64 /* TODO: Fix SPARC ADI! */ WARN_ON_ONCE(!arch_validate_flags(map->vm_flags)); #endif /* Lock the VMA since it is modified after insertion into VMA tree */ vma_start_write(vma); vma_iter_store_new(vmi, vma); map->mm->map_count++; vma_link_file(vma, action->hide_from_rmap_until_complete); /* * vma_merge_new_range() calls khugepaged_enter_vma() too, the below * call covers the non-merge case. */ if (!vma_is_anonymous(vma)) khugepaged_enter_vma(vma, map->vm_flags); *vmap = vma; return 0; free_iter_vma: vma_iter_free(vmi); free_vma: vm_area_free(vma); return error; } /* * __mmap_complete() - Unmap any VMAs we overlap, account memory mapping * statistics, handle locking and finalise the VMA. * * @map: Mapping state. * @vma: Merged or newly allocated VMA for the mmap()'d region. */ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) { struct mm_struct *mm = map->mm; perf_event_mmap(vma); /* Unmap any existing mapping in the area. 
*/ vms_complete_munmap_vmas(&map->vms, &map->mas_detach); vm_stat_account(mm, vma->vm_flags, map->pglen); if (vma_test(vma, VMA_LOCKED_BIT)) { if (!vma_supports_mlock(vma)) vma_clear_flags_mask(vma, VMA_LOCKED_MASK); else mm->locked_vm += map->pglen; } if (vma->vm_file) uprobe_mmap(vma); /* * New (or expanded) vma always get soft dirty status. * Otherwise user-space soft-dirty page tracker won't * be able to distinguish situation when vma area unmapped, * then new mapped in-place (which must be aimed as * a completely new data area). */ if (pgtable_supports_soft_dirty()) vma_set_flags(vma, VMA_SOFTDIRTY_BIT); vma_set_page_prot(vma); } static int call_action_prepare(struct mmap_state *map, struct vm_area_desc *desc) { int err; err = mmap_action_prepare(desc); if (err) return err; return 0; } /* * Invoke the f_op->mmap_prepare() callback for a file-backed mapping that * specifies it. * * This is called prior to any merge attempt, and updates whitelisted fields * that are permitted to be updated by the caller. * * All but user-defined fields will be pre-populated with original values. * * Returns 0 on success, or an error code otherwise. */ static int call_mmap_prepare(struct mmap_state *map, struct vm_area_desc *desc) { int err; /* Invoke the hook. */ err = vfs_mmap_prepare(map->file, desc); if (err) return err; err = call_action_prepare(map, desc); if (err) return err; /* Update fields permitted to be changed. */ map->pgoff = desc->pgoff; if (desc->vm_file != map->file) { map->file_doesnt_need_get = true; map->file = desc->vm_file; } map->vma_flags = desc->vma_flags; map->page_prot = desc->page_prot; /* User-defined fields. */ map->vm_ops = desc->vm_ops; map->vm_private_data = desc->private_data; return 0; } static void set_vma_user_defined_fields(struct vm_area_struct *vma, struct mmap_state *map) { if (map->vm_ops) vma->vm_ops = map->vm_ops; vma->vm_private_data = map->vm_private_data; } /* * Are we guaranteed no driver can change state such as to preclude KSM merging? * If so, let's set the KSM mergeable flag early so we don't break VMA merging. */ static bool can_set_ksm_flags_early(struct mmap_state *map) { struct file *file = map->file; /* Anonymous mappings have no driver which can change them. */ if (!file) return true; /* * If .mmap_prepare() is specified, then the driver will have already * manipulated state prior to updating KSM flags. So no need to worry * about mmap callbacks modifying VMA flags after the KSM flag has been * updated here, which could otherwise affect KSM eligibility. */ if (file->f_op->mmap_prepare) return true; /* shmem is safe. */ if (shmem_file(file)) return true; /* Any other .mmap callback is not safe. */ return false; } static unsigned long __mmap_region(struct file *file, unsigned long addr, unsigned long len, vma_flags_t vma_flags, unsigned long pgoff, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; bool have_mmap_prepare = file && file->f_op->mmap_prepare; VMA_ITERATOR(vmi, mm, addr); MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vma_flags, file); struct vm_area_desc desc = { .mm = mm, .file = file, .action = { .type = MMAP_NOTHING, /* Default to no further action. */ }, }; bool allocated_new = false; int error; map.check_ksm_early = can_set_ksm_flags_early(&map); error = __mmap_setup(&map, &desc, uf); if (!error && have_mmap_prepare) error = call_mmap_prepare(&map, &desc); if (error) goto abort_munmap; if (map.check_ksm_early) update_ksm_flags(&map); /* Attempt to merge with adjacent VMAs... 
*/ if (map.prev || map.next) { VMG_MMAP_STATE(vmg, &map, /* vma = */ NULL); vma = vma_merge_new_range(&vmg); } /* ...but if we can't, allocate a new VMA. */ if (!vma) { error = __mmap_new_vma(&map, &vma, &desc.action); if (error) goto unacct_error; allocated_new = true; } if (have_mmap_prepare) set_vma_user_defined_fields(vma, &map); __mmap_complete(&map, vma); if (have_mmap_prepare && allocated_new) { error = mmap_action_complete(vma, &desc.action); if (error) return error; } return addr; /* Accounting was done by __mmap_setup(). */ unacct_error: if (map.charged) vm_unacct_memory(map.charged); abort_munmap: /* * This indicates that .mmap_prepare has set a new file, differing from * desc->vm_file. But since we're aborting the operation, only the * original file will be cleaned up. Ensure we clean up both. */ if (map.file_doesnt_need_get) fput(map.file); vms_abort_munmap_vmas(&map.vms, &map.mas_detach); return error; } /** * mmap_region() - Actually perform the userland mapping of a VMA into * current->mm with known, aligned and overflow-checked @addr and @len, and * correctly determined VMA flags @vm_flags and page offset @pgoff. * * This is an internal memory management function, and should not be used * directly. * * The caller must write-lock current->mm->mmap_lock. * * @file: If a file-backed mapping, a pointer to the struct file describing the * file to be mapped, otherwise NULL. * @addr: The page-aligned address at which to perform the mapping. * @len: The page-aligned, non-zero, length of the mapping. * @vm_flags: The VMA flags which should be applied to the mapping. * @pgoff: If @file is specified, the page offset into the file, if not then * the virtual page offset in memory of the anonymous mapping. * @uf: Optionally, a pointer to a list head used for tracking userfaultfd unmap * events. * * Returns: Either an error, or the address at which the requested mapping has * been performed. */ unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) { unsigned long ret; bool writable_file_mapping = false; const vma_flags_t vma_flags = legacy_to_vma_flags(vm_flags); mmap_assert_write_locked(current->mm); /* Check to see if MDWE is applicable. */ if (map_deny_write_exec(&vma_flags, &vma_flags)) return -EACCES; /* Allow architectures to sanity-check the vm_flags. */ if (!arch_validate_flags(vm_flags)) return -EINVAL; /* Map writable and ensure this isn't a sealed memfd. */ if (file && is_shared_maywrite(&vma_flags)) { int error = mapping_map_writable(file->f_mapping); if (error) return error; writable_file_mapping = true; } ret = __mmap_region(file, addr, len, vma_flags, pgoff, uf); /* Clear our write mapping regardless of error. */ if (writable_file_mapping) mapping_unmap_writable(file->f_mapping); validate_mm(current->mm); return ret; } /** * do_brk_flags() - Increase the brk vma if the flags match. * @vmi: The vma iterator * @addr: The start address * @len: The length of the increase * @vma: The vma, * @vma_flags: The VMA Flags * * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags * do not match then create a new anonymous VMA. Eventually we may be able to * do some brk-specific accounting here. * * Returns: %0 on success, or otherwise an error. 
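 *
 * A minimal, illustrative sketch of a caller (assuming the target range has
 * already been unmapped and the usual limit checks performed; locals here
 * are hypothetical):
 *
 *	VMA_ITERATOR(vmi, mm, addr);
 *	struct vm_area_struct *prev = vma_prev(&vmi);
 *
 *	error = do_brk_flags(&vmi, prev, addr, len, vma_flags);
 *
 * When @prev ends exactly at @addr and its flags are compatible, the
 * existing VMA is expanded in place; otherwise a new anonymous VMA is
 * created.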
*/ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, unsigned long len, vma_flags_t vma_flags) { struct mm_struct *mm = current->mm; /* * Check against address space limits by the changed size * Note: This happens *after* clearing old mappings in some code paths. */ vma_flags_set_mask(&vma_flags, VMA_DATA_DEFAULT_FLAGS); vma_flags_set(&vma_flags, VMA_ACCOUNT_BIT); vma_flags_set_mask(&vma_flags, mm->def_vma_flags); vma_flags = ksm_vma_flags(mm, NULL, vma_flags); if (!may_expand_vm(mm, &vma_flags, len >> PAGE_SHIFT)) return -ENOMEM; if (mm->map_count > get_sysctl_max_map_count()) return -ENOMEM; if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) return -ENOMEM; /* * Expand the existing vma if possible; Note that singular lists do not * occur after forking, so the expand will only happen on new VMAs. */ if (vma && vma->vm_end == addr) { VMG_STATE(vmg, mm, vmi, addr, addr + len, vma_flags, PHYS_PFN(addr)); vmg.prev = vma; /* vmi is positioned at prev, which this mode expects. */ vmg.just_expand = true; if (vma_merge_new_range(&vmg)) goto out; else if (vmg_nomem(&vmg)) goto unacct_fail; } if (vma) vma_iter_next_range(vmi); /* create a vma struct for an anonymous mapping */ vma = vm_area_alloc(mm); if (!vma) goto unacct_fail; vma_set_anonymous(vma); vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT); vma->flags = vma_flags; vma->vm_page_prot = vm_get_page_prot(vma_flags_to_legacy(vma_flags)); vma_start_write(vma); if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) goto mas_store_fail; mm->map_count++; validate_mm(mm); out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; mm->data_vm += len >> PAGE_SHIFT; if (vma_flags_test(&vma_flags, VMA_LOCKED_BIT)) mm->locked_vm += (len >> PAGE_SHIFT); if (pgtable_supports_soft_dirty()) vma_set_flags(vma, VMA_SOFTDIRTY_BIT); return 0; mas_store_fail: vm_area_free(vma); unacct_fail: vm_unacct_memory(len >> PAGE_SHIFT); return -ENOMEM; } /** * unmapped_area() - Find an area between the low_limit and the high_limit with * the correct alignment and offset, all from @info. Note: current->mm is used * for the search. * * @info: The unmapped area information including the range [low_limit - * high_limit), the alignment offset and mask. * * Return: A memory address or -ENOMEM. */ unsigned long unmapped_area(struct vm_unmapped_area_info *info) { unsigned long length, gap; unsigned long low_limit, high_limit; struct vm_area_struct *tmp; VMA_ITERATOR(vmi, current->mm, 0); /* Adjust search length to account for worst case alignment overhead */ length = info->length + info->align_mask + info->start_gap; if (length < info->length) return -ENOMEM; low_limit = info->low_limit; if (low_limit < mmap_min_addr) low_limit = mmap_min_addr; high_limit = info->high_limit; retry: if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length)) return -ENOMEM; /* * Adjust for the gap first so it doesn't interfere with the later * alignment. The first step is the minimum needed to fulfill the start * gap, the next step is the minimum to align that. It is the minimum * needed to fulfill both. 
*/ gap = vma_iter_addr(&vmi) + info->start_gap; gap += (info->align_offset - gap) & info->align_mask; tmp = vma_next(&vmi); /* Avoid prev check if possible */ if (tmp && vma_test_any_mask(tmp, VMA_STARTGAP_FLAGS)) { if (vm_start_gap(tmp) < gap + length - 1) { low_limit = tmp->vm_end; vma_iter_reset(&vmi); goto retry; } } else { tmp = vma_prev(&vmi); if (tmp && vm_end_gap(tmp) > gap) { low_limit = vm_end_gap(tmp); vma_iter_reset(&vmi); goto retry; } } return gap; } /** * unmapped_area_topdown() - Find an area between the low_limit and the * high_limit with the correct alignment and offset at the highest available * address, all from @info. Note: current->mm is used for the search. * * @info: The unmapped area information including the range [low_limit - * high_limit), the alignment offset and mask. * * Return: A memory address or -ENOMEM. */ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) { unsigned long length, gap, gap_end; unsigned long low_limit, high_limit; struct vm_area_struct *tmp; VMA_ITERATOR(vmi, current->mm, 0); /* Adjust search length to account for worst case alignment overhead */ length = info->length + info->align_mask + info->start_gap; if (length < info->length) return -ENOMEM; low_limit = info->low_limit; if (low_limit < mmap_min_addr) low_limit = mmap_min_addr; high_limit = info->high_limit; retry: if (vma_iter_area_highest(&vmi, low_limit, high_limit, length)) return -ENOMEM; gap = vma_iter_end(&vmi) - info->length; gap -= (gap - info->align_offset) & info->align_mask; gap_end = vma_iter_end(&vmi); tmp = vma_next(&vmi); /* Avoid prev check if possible */ if (tmp && vma_test_any_mask(tmp, VMA_STARTGAP_FLAGS)) { if (vm_start_gap(tmp) < gap_end) { high_limit = vm_start_gap(tmp); vma_iter_reset(&vmi); goto retry; } } else { tmp = vma_prev(&vmi); if (tmp && vm_end_gap(tmp) > gap) { high_limit = tmp->vm_start; vma_iter_reset(&vmi); goto retry; } } return gap; } /* * Verify that the stack growth is acceptable and * update accounting. This is shared with both the * grow-up and grow-down cases. */ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) { struct mm_struct *mm = vma->vm_mm; unsigned long new_start; /* address space limit tests */ if (!may_expand_vm(mm, &vma->flags, grow)) return -ENOMEM; /* Stack limit test */ if (size > rlimit(RLIMIT_STACK)) return -ENOMEM; /* mlock limit tests */ if (!mlock_future_ok(mm, vma_test(vma, VMA_LOCKED_BIT), grow << PAGE_SHIFT)) return -ENOMEM; /* Check to ensure the stack will not grow into a hugetlb-only region */ new_start = vma->vm_end - size; #ifdef CONFIG_STACK_GROWSUP if (vma_test(vma, VMA_GROWSUP_BIT)) new_start = vma->vm_start; #endif if (is_hugepage_only_range(vma->vm_mm, new_start, size)) return -EFAULT; /* * Overcommit.. This must be the final test, as it will * update security statistics. */ if (security_vm_enough_memory_mm(mm, grow)) return -ENOMEM; return 0; } #ifdef CONFIG_STACK_GROWSUP /* * PA-RISC uses this for its stack. * vma is the last one with address > vma->vm_end. Have to extend vma. */ int expand_upwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *next; unsigned long gap_addr; int error = 0; VMA_ITERATOR(vmi, mm, vma->vm_start); if (!vma_test(vma, VMA_GROWSUP_BIT)) return -EFAULT; mmap_assert_write_locked(mm); /* Guard against exceeding limits of the address space. 
*/ address &= PAGE_MASK; if (address >= (TASK_SIZE & PAGE_MASK)) return -ENOMEM; address += PAGE_SIZE; /* Enforce stack_guard_gap */ gap_addr = address + stack_guard_gap; /* Guard against overflow */ if (gap_addr < address || gap_addr > TASK_SIZE) gap_addr = TASK_SIZE; next = find_vma_intersection(mm, vma->vm_end, gap_addr); if (next && vma_is_accessible(next)) { if (!vma_test(next, VMA_GROWSUP_BIT)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ } if (next) vma_iter_prev_range_limit(&vmi, address); vma_iter_config(&vmi, vma->vm_start, address); if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) { vma_iter_free(&vmi); return -ENOMEM; } /* Lock the VMA before expanding to prevent concurrent page faults */ vma_start_write(vma); /* We update the anon VMA tree. */ anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ if (address > vma->vm_end) { unsigned long size, grow; size = address - vma->vm_start; grow = (address - vma->vm_end) >> PAGE_SHIFT; error = -ENOMEM; if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { if (vma_test(vma, VMA_LOCKED_BIT)) mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; /* Overwrite old entry in mtree. */ vma_iter_store_overwrite(&vmi, vma); anon_vma_interval_tree_post_update_vma(vma); perf_event_mmap(vma); } } } anon_vma_unlock_write(vma->anon_vma); vma_iter_free(&vmi); validate_mm(mm); return error; } #endif /* CONFIG_STACK_GROWSUP */ /* * vma is the first one with address < vma->vm_start. Have to extend vma. * mmap_lock held for writing. */ int expand_downwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *prev; int error = 0; VMA_ITERATOR(vmi, mm, vma->vm_start); if (!vma_test(vma, VMA_GROWSDOWN_BIT)) return -EFAULT; mmap_assert_write_locked(mm); address &= PAGE_MASK; if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) return -EPERM; /* Enforce stack_guard_gap */ prev = vma_prev(&vmi); /* Check that both stack segments have the same anon_vma? */ if (prev) { if (!vma_test(prev, VMA_GROWSDOWN_BIT) && vma_is_accessible(prev) && (address - prev->vm_end < stack_guard_gap)) return -ENOMEM; } if (prev) vma_iter_next_range_limit(&vmi, vma->vm_start); vma_iter_config(&vmi, address, vma->vm_end); if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) { vma_iter_free(&vmi); return -ENOMEM; } /* Lock the VMA before expanding to prevent concurrent page faults */ vma_start_write(vma); /* We update the anon VMA tree. */ anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ if (address < vma->vm_start) { unsigned long size, grow; size = vma->vm_end - address; grow = (vma->vm_start - address) >> PAGE_SHIFT; error = -ENOMEM; if (grow <= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { if (vma_test(vma, VMA_LOCKED_BIT)) mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; /* Overwrite old entry in mtree. 
*/ vma_iter_store_overwrite(&vmi, vma); anon_vma_interval_tree_post_update_vma(vma); perf_event_mmap(vma); } } } anon_vma_unlock_write(vma->anon_vma); vma_iter_free(&vmi); validate_mm(mm); return error; } int __vm_munmap(unsigned long start, size_t len, bool unlock) { int ret; struct mm_struct *mm = current->mm; LIST_HEAD(uf); VMA_ITERATOR(vmi, mm, start); if (mmap_write_lock_killable(mm)) return -EINTR; ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock); if (ret || !unlock) mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); return ret; } /* Insert vm structure into process list sorted by address * and into the inode's i_mmap tree. If vm_file is non-NULL * then i_mmap_rwsem is taken here. */ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { unsigned long charged = vma_pages(vma); if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) return -ENOMEM; if (vma_test(vma, VMA_ACCOUNT_BIT) && security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; /* * The vm_pgoff of a purely anonymous vma should be irrelevant * until its first write fault, when page's anon_vma and index * are set. But now set the vm_pgoff it will almost certainly * end up with (unless mremap moves it elsewhere before that * first wfault), so /proc/pid/maps tells a consistent story. * * By setting it to reflect the virtual start address of the * vma, merges and splits can happen in a seamless way, just * using the existing file pgoff checks and manipulations. * Similarly in do_mmap and in do_brk_flags. */ if (vma_is_anonymous(vma)) { BUG_ON(vma->anon_vma); vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; } if (vma_link(mm, vma)) { if (vma_test(vma, VMA_ACCOUNT_BIT)) vm_unacct_memory(charged); return -ENOMEM; } return 0; } /** * vma_mmu_pagesize - Default MMU page size granularity for this VMA. * @vma: The user mapping. * * In the common case, the default page size used by the MMU matches the * default page size used by the kernel (see vma_kernel_pagesize()). On * architectures where it differs, an architecture-specific 'strong' version * of this symbol is required. * * The default MMU page size is not affected by Transparent Huge Pages * being in effect, or any usage of larger MMU page sizes (either through * architectural huge-page mappings or other explicit/implicit coalescing of * virtual ranges performed by the MMU). * * Return: The default MMU page size granularity for this VMA. */ __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { return vma_kernel_pagesize(vma); } |
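The mapping code above is the kernel-internal half of the mmap(2)/mlock(2) story: __mmap_region() builds and accounts the VMA, and VMA_LOCKED pages feed mm->locked_vm. As a rough orientation aid, the following user-space sketch (not part of the file above; error handling deliberately minimal) drives that path with an anonymous private mapping followed by mlock():

/* Illustrative user-space sketch, not kernel code: exercises the mmap()
 * path documented above and the locked-page accounting via mlock(). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4 * 4096;

	/* Anonymous, private, readable+writable mapping. */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return EXIT_FAILURE;
	}

	/* Touch the pages, then pin them; a successful mlock() is what
	 * ultimately shows up in mm->locked_vm on the kernel side. */
	memset(p, 0xa5, len);
	if (mlock(p, len))
		perror("mlock");

	munlock(p, len);
	munmap(p, len);
	return 0;
}

mlock() may fail with EPERM or ENOMEM when RLIMIT_MEMLOCK is small, which mirrors the mlock_future_ok() limit check in the kernel code above.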
// SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/mount.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include <linux/slab.h> #include <uapi/linux/mount.h> #include "common.h" /* String table for special mount operations. */ static const char * const tomoyo_mounts[TOMOYO_MAX_SPECIAL_MOUNT] = { [TOMOYO_MOUNT_BIND] = "--bind", [TOMOYO_MOUNT_MOVE] = "--move", [TOMOYO_MOUNT_REMOUNT] = "--remount", [TOMOYO_MOUNT_MAKE_UNBINDABLE] = "--make-unbindable", [TOMOYO_MOUNT_MAKE_PRIVATE] = "--make-private", [TOMOYO_MOUNT_MAKE_SLAVE] = "--make-slave", [TOMOYO_MOUNT_MAKE_SHARED] = "--make-shared", }; /** * tomoyo_audit_mount_log - Audit mount log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_mount_log(struct tomoyo_request_info *r) __must_hold_shared(&tomoyo_ss) { return tomoyo_supervisor(r, "file mount %s %s %s 0x%lX\n", r->param.mount.dev->name, r->param.mount.dir->name, r->param.mount.type->name, r->param.mount.flags); } /** * tomoyo_check_mount_acl - Check permission for path path path number operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_mount_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_mount_acl *acl = container_of(ptr, typeof(*acl), head); return tomoyo_compare_number_union(r->param.mount.flags, &acl->flags) && tomoyo_compare_name_union(r->param.mount.type, &acl->fs_type) && tomoyo_compare_name_union(r->param.mount.dir, &acl->dir_name) && (!r->param.mount.need_dev || tomoyo_compare_name_union(r->param.mount.dev, &acl->dev_name)); } /** * tomoyo_mount_acl - Check permission for mount() operation. * * @r: Pointer to "struct tomoyo_request_info". * @dev_name: Name of device file. Maybe NULL. * @dir: Pointer to "struct path". * @type: Name of filesystem type. * @flags: Mount options. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_mount_acl(struct tomoyo_request_info *r, const char *dev_name, const struct path *dir, const char *type, unsigned long flags) __must_hold_shared(&tomoyo_ss) { struct tomoyo_obj_info obj = { }; struct path path; struct file_system_type *fstype = NULL; const char *requested_type = NULL; const char *requested_dir_name = NULL; const char *requested_dev_name = NULL; struct tomoyo_path_info rtype; struct tomoyo_path_info rdev; struct tomoyo_path_info rdir; int need_dev = 0; int error = -ENOMEM; r->obj = &obj; /* Get fstype.
*/ requested_type = tomoyo_encode(type); if (!requested_type) goto out; rtype.name = requested_type; tomoyo_fill_path_info(&rtype); /* Get mount point. */ obj.path2 = *dir; requested_dir_name = tomoyo_realpath_from_path(dir); if (!requested_dir_name) { error = -ENOMEM; goto out; } rdir.name = requested_dir_name; tomoyo_fill_path_info(&rdir); /* Compare fs name. */ if (type == tomoyo_mounts[TOMOYO_MOUNT_REMOUNT]) { /* dev_name is ignored. */ } else if (type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_UNBINDABLE] || type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_PRIVATE] || type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_SLAVE] || type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_SHARED]) { /* dev_name is ignored. */ } else if (type == tomoyo_mounts[TOMOYO_MOUNT_BIND] || type == tomoyo_mounts[TOMOYO_MOUNT_MOVE]) { need_dev = -1; /* dev_name is a directory */ } else { fstype = get_fs_type(type); if (!fstype) { error = -ENODEV; goto out; } if (fstype->fs_flags & FS_REQUIRES_DEV) /* dev_name is a block device file. */ need_dev = 1; } if (need_dev) { /* Get mount point or device file. */ if (!dev_name || kern_path(dev_name, LOOKUP_FOLLOW, &path)) { error = -ENOENT; goto out; } obj.path1 = path; requested_dev_name = tomoyo_realpath_from_path(&path); if (!requested_dev_name) { error = -ENOENT; goto out; } } else { /* Map dev_name to "<NULL>" if no dev_name given. */ if (!dev_name) dev_name = "<NULL>"; requested_dev_name = tomoyo_encode(dev_name); if (!requested_dev_name) { error = -ENOMEM; goto out; } } rdev.name = requested_dev_name; tomoyo_fill_path_info(&rdev); r->param_type = TOMOYO_TYPE_MOUNT_ACL; r->param.mount.need_dev = need_dev; r->param.mount.dev = &rdev; r->param.mount.dir = &rdir; r->param.mount.type = &rtype; r->param.mount.flags = flags; do { tomoyo_check_acl(r, tomoyo_check_mount_acl); error = tomoyo_audit_mount_log(r); } while (error == TOMOYO_RETRY_REQUEST); out: kfree(requested_dev_name); kfree(requested_dir_name); if (fstype) put_filesystem(fstype); kfree(requested_type); /* Drop refcount obtained by kern_path(). */ if (obj.path1.dentry) path_put(&obj.path1); return error; } /** * tomoyo_mount_permission - Check permission for mount() operation. * * @dev_name: Name of device file. Maybe NULL. * @path: Pointer to "struct path". * @type: Name of filesystem type. Maybe NULL. * @flags: Mount options. * @data_page: Optional data. Maybe NULL. * * Returns 0 on success, negative value otherwise. 
*/ int tomoyo_mount_permission(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data_page) { struct tomoyo_request_info r; int error; int idx; if (tomoyo_init_request_info(&r, NULL, TOMOYO_MAC_FILE_MOUNT) == TOMOYO_CONFIG_DISABLED) return 0; if ((flags & MS_MGC_MSK) == MS_MGC_VAL) flags &= ~MS_MGC_MSK; if (flags & MS_REMOUNT) { type = tomoyo_mounts[TOMOYO_MOUNT_REMOUNT]; flags &= ~MS_REMOUNT; } else if (flags & MS_BIND) { type = tomoyo_mounts[TOMOYO_MOUNT_BIND]; flags &= ~MS_BIND; } else if (flags & MS_SHARED) { if (flags & (MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) return -EINVAL; type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_SHARED]; flags &= ~MS_SHARED; } else if (flags & MS_PRIVATE) { if (flags & (MS_SHARED | MS_SLAVE | MS_UNBINDABLE)) return -EINVAL; type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_PRIVATE]; flags &= ~MS_PRIVATE; } else if (flags & MS_SLAVE) { if (flags & (MS_SHARED | MS_PRIVATE | MS_UNBINDABLE)) return -EINVAL; type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_SLAVE]; flags &= ~MS_SLAVE; } else if (flags & MS_UNBINDABLE) { if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE)) return -EINVAL; type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_UNBINDABLE]; flags &= ~MS_UNBINDABLE; } else if (flags & MS_MOVE) { type = tomoyo_mounts[TOMOYO_MOUNT_MOVE]; flags &= ~MS_MOVE; } if (!type) type = "<NULL>"; idx = tomoyo_read_lock(); error = tomoyo_mount_acl(&r, dev_name, path, type, flags); tomoyo_read_unlock(idx); return error; } |
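tomoyo_mount_permission() above rewrites raw mount(2) flags such as MS_BIND into the pseudo filesystem types listed in tomoyo_mounts[] before matching them against policy. A minimal user-space sketch of the kind of request that reaches it is shown below; the /srv paths are placeholders and CAP_SYS_ADMIN (typically root) is required:

/* Illustrative user-space sketch, not part of tomoyo/mount.c: a bind mount
 * issued with mount(2). On a TOMOYO-enabled kernel this request is checked
 * by tomoyo_mount_permission(), which maps MS_BIND to the "--bind" type. */
#include <stdio.h>
#include <stdlib.h>
#include <sys/mount.h>

int main(void)
{
	/* Equivalent to: mount --bind /srv/data /srv/export */
	if (mount("/srv/data", "/srv/export", NULL, MS_BIND, NULL)) {
		perror("mount(MS_BIND)");
		return EXIT_FAILURE;
	}

	/* Undo the bind mount again. */
	if (umount("/srv/export")) {
		perror("umount");
		return EXIT_FAILURE;
	}
	return 0;
}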
// SPDX-License-Identifier: GPL-2.0-only /* * Event char devices, giving access to raw input device events.
* * Copyright (c) 1999-2002 Vojtech Pavlik */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define EVDEV_MINOR_BASE 64 #define EVDEV_MINORS 32 #define EVDEV_MIN_BUFFER_SIZE 64U #define EVDEV_BUF_PACKETS 8 #include <linux/poll.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/init.h> #include <linux/input/mt.h> #include <linux/major.h> #include <linux/device.h> #include <linux/cdev.h> #include "input-compat.h" struct evdev { int open; struct input_handle handle; struct evdev_client __rcu *grab; struct list_head client_list; spinlock_t client_lock; /* protects client_list */ struct mutex mutex; struct device dev; struct cdev cdev; bool exist; }; struct evdev_client { unsigned int head; unsigned int tail; unsigned int packet_head; /* [future] position of the first element of next packet */ spinlock_t buffer_lock; /* protects access to buffer, head and tail */ wait_queue_head_t wait; struct fasync_struct *fasync; struct evdev *evdev; struct list_head node; enum input_clock_type clk_type; bool revoked; unsigned long *evmasks[EV_CNT]; unsigned int bufsize; struct input_event buffer[] __counted_by(bufsize); }; static size_t evdev_get_mask_cnt(unsigned int type) { static const size_t counts[EV_CNT] = { /* EV_SYN==0 is EV_CNT, _not_ SYN_CNT, see EVIOCGBIT */ [EV_SYN] = EV_CNT, [EV_KEY] = KEY_CNT, [EV_REL] = REL_CNT, [EV_ABS] = ABS_CNT, [EV_MSC] = MSC_CNT, [EV_SW] = SW_CNT, [EV_LED] = LED_CNT, [EV_SND] = SND_CNT, [EV_FF] = FF_CNT, }; return (type < EV_CNT) ? counts[type] : 0; } /* requires the buffer lock to be held */ static bool __evdev_is_filtered(struct evdev_client *client, unsigned int type, unsigned int code) { unsigned long *mask; size_t cnt; /* EV_SYN and unknown codes are never filtered */ if (type == EV_SYN || type >= EV_CNT) return false; /* first test whether the type is filtered */ mask = client->evmasks[0]; if (mask && !test_bit(type, mask)) return true; /* unknown values are never filtered */ cnt = evdev_get_mask_cnt(type); if (!cnt || code >= cnt) return false; mask = client->evmasks[type]; return mask && !test_bit(code, mask); } /* flush queued events of type @type, caller must hold client->buffer_lock */ static void __evdev_flush_queue(struct evdev_client *client, unsigned int type) { unsigned int i, head, num; unsigned int mask = client->bufsize - 1; bool is_report; struct input_event *ev; BUG_ON(type == EV_SYN); head = client->tail; client->packet_head = client->tail; /* init to 1 so a leading SYN_REPORT will not be dropped */ num = 1; for (i = client->tail; i != client->head; i = (i + 1) & mask) { ev = &client->buffer[i]; is_report = ev->type == EV_SYN && ev->code == SYN_REPORT; if (ev->type == type) { /* drop matched entry */ continue; } else if (is_report && !num) { /* drop empty SYN_REPORT groups */ continue; } else if (head != i) { /* move entry to fill the gap */ client->buffer[head] = *ev; } num++; head = (head + 1) & mask; if (is_report) { num = 0; client->packet_head = head; } } client->head = head; } static void __evdev_queue_syn_dropped(struct evdev_client *client) { ktime_t *ev_time = input_get_timestamp(client->evdev->handle.dev); struct timespec64 ts = ktime_to_timespec64(ev_time[client->clk_type]); struct input_event ev; ev.input_event_sec = ts.tv_sec; ev.input_event_usec = ts.tv_nsec / NSEC_PER_USEC; ev.type = EV_SYN; ev.code = SYN_DROPPED; ev.value = 0; client->buffer[client->head++] = ev; client->head &= client->bufsize - 1; if (unlikely(client->head == client->tail)) { 
/* drop queue but keep our SYN_DROPPED event */ client->tail = (client->head - 1) & (client->bufsize - 1); client->packet_head = client->tail; } } static void evdev_queue_syn_dropped(struct evdev_client *client) { unsigned long flags; spin_lock_irqsave(&client->buffer_lock, flags); __evdev_queue_syn_dropped(client); spin_unlock_irqrestore(&client->buffer_lock, flags); } static int evdev_set_clk_type(struct evdev_client *client, unsigned int clkid) { unsigned long flags; enum input_clock_type clk_type; switch (clkid) { case CLOCK_REALTIME: clk_type = INPUT_CLK_REAL; break; case CLOCK_MONOTONIC: clk_type = INPUT_CLK_MONO; break; case CLOCK_BOOTTIME: clk_type = INPUT_CLK_BOOT; break; default: return -EINVAL; } if (client->clk_type != clk_type) { client->clk_type = clk_type; /* * Flush pending events and queue SYN_DROPPED event, * but only if the queue is not empty. */ spin_lock_irqsave(&client->buffer_lock, flags); if (client->head != client->tail) { client->packet_head = client->head = client->tail; __evdev_queue_syn_dropped(client); } spin_unlock_irqrestore(&client->buffer_lock, flags); } return 0; } static void __pass_event(struct evdev_client *client, const struct input_event *event) { client->buffer[client->head++] = *event; client->head &= client->bufsize - 1; if (unlikely(client->head == client->tail)) { /* * This effectively "drops" all unconsumed events, leaving * EV_SYN/SYN_DROPPED plus the newest event in the queue. */ client->tail = (client->head - 2) & (client->bufsize - 1); client->buffer[client->tail] = (struct input_event) { .input_event_sec = event->input_event_sec, .input_event_usec = event->input_event_usec, .type = EV_SYN, .code = SYN_DROPPED, .value = 0, }; client->packet_head = client->tail; } if (event->type == EV_SYN && event->code == SYN_REPORT) { client->packet_head = client->head; kill_fasync(&client->fasync, SIGIO, POLL_IN); } } static void evdev_pass_values(struct evdev_client *client, const struct input_value *vals, unsigned int count, ktime_t *ev_time) { const struct input_value *v; struct input_event event; struct timespec64 ts; bool wakeup = false; if (client->revoked) return; ts = ktime_to_timespec64(ev_time[client->clk_type]); event.input_event_sec = ts.tv_sec; event.input_event_usec = ts.tv_nsec / NSEC_PER_USEC; /* Interrupts are disabled, just acquire the lock. */ spin_lock(&client->buffer_lock); for (v = vals; v != vals + count; v++) { if (__evdev_is_filtered(client, v->type, v->code)) continue; if (v->type == EV_SYN && v->code == SYN_REPORT) { /* drop empty SYN_REPORT */ if (client->packet_head == client->head) continue; wakeup = true; } event.type = v->type; event.code = v->code; event.value = v->value; __pass_event(client, &event); } spin_unlock(&client->buffer_lock); if (wakeup) wake_up_interruptible_poll(&client->wait, EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM); } /* * Pass incoming events to all connected clients. 
*/ static unsigned int evdev_events(struct input_handle *handle, struct input_value *vals, unsigned int count) { struct evdev *evdev = handle->private; struct evdev_client *client; ktime_t *ev_time = input_get_timestamp(handle->dev); rcu_read_lock(); client = rcu_dereference(evdev->grab); if (client) evdev_pass_values(client, vals, count, ev_time); else list_for_each_entry_rcu(client, &evdev->client_list, node) evdev_pass_values(client, vals, count, ev_time); rcu_read_unlock(); return count; } static int evdev_fasync(int fd, struct file *file, int on) { struct evdev_client *client = file->private_data; return fasync_helper(fd, file, on, &client->fasync); } static void evdev_free(struct device *dev) { struct evdev *evdev = container_of(dev, struct evdev, dev); input_put_device(evdev->handle.dev); kfree(evdev); } /* * Grabs an event device (along with underlying input device). * This function is called with evdev->mutex taken. */ static int evdev_grab(struct evdev *evdev, struct evdev_client *client) { int error; if (evdev->grab) return -EBUSY; error = input_grab_device(&evdev->handle); if (error) return error; rcu_assign_pointer(evdev->grab, client); return 0; } static int evdev_ungrab(struct evdev *evdev, struct evdev_client *client) { struct evdev_client *grab = rcu_dereference_protected(evdev->grab, lockdep_is_held(&evdev->mutex)); if (grab != client) return -EINVAL; rcu_assign_pointer(evdev->grab, NULL); synchronize_rcu(); input_release_device(&evdev->handle); return 0; } static void evdev_attach_client(struct evdev *evdev, struct evdev_client *client) { spin_lock(&evdev->client_lock); list_add_tail_rcu(&client->node, &evdev->client_list); spin_unlock(&evdev->client_lock); } static void evdev_detach_client(struct evdev *evdev, struct evdev_client *client) { spin_lock(&evdev->client_lock); list_del_rcu(&client->node); spin_unlock(&evdev->client_lock); synchronize_rcu(); } static int evdev_open_device(struct evdev *evdev) { int retval; retval = mutex_lock_interruptible(&evdev->mutex); if (retval) return retval; if (!evdev->exist) retval = -ENODEV; else if (!evdev->open++) { retval = input_open_device(&evdev->handle); if (retval) evdev->open--; } mutex_unlock(&evdev->mutex); return retval; } static void evdev_close_device(struct evdev *evdev) { mutex_lock(&evdev->mutex); if (evdev->exist && !--evdev->open) input_close_device(&evdev->handle); mutex_unlock(&evdev->mutex); } /* * Wake up users waiting for IO so they can disconnect from * dead device. 
*/ static void evdev_hangup(struct evdev *evdev) { struct evdev_client *client; spin_lock(&evdev->client_lock); list_for_each_entry(client, &evdev->client_list, node) { kill_fasync(&client->fasync, SIGIO, POLL_HUP); wake_up_interruptible_poll(&client->wait, EPOLLHUP | EPOLLERR); } spin_unlock(&evdev->client_lock); } static int evdev_release(struct inode *inode, struct file *file) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; unsigned int i; mutex_lock(&evdev->mutex); if (evdev->exist && !client->revoked) input_flush_device(&evdev->handle, file); evdev_ungrab(evdev, client); mutex_unlock(&evdev->mutex); evdev_detach_client(evdev, client); for (i = 0; i < EV_CNT; ++i) bitmap_free(client->evmasks[i]); kvfree(client); evdev_close_device(evdev); return 0; } static unsigned int evdev_compute_buffer_size(struct input_dev *dev) { unsigned int n_events = max(dev->hint_events_per_packet * EVDEV_BUF_PACKETS, EVDEV_MIN_BUFFER_SIZE); return roundup_pow_of_two(n_events); } static int evdev_open(struct inode *inode, struct file *file) { struct evdev *evdev = container_of(inode->i_cdev, struct evdev, cdev); unsigned int bufsize = evdev_compute_buffer_size(evdev->handle.dev); struct evdev_client *client; int error; client = kvzalloc_flex(*client, buffer, bufsize); if (!client) return -ENOMEM; init_waitqueue_head(&client->wait); client->bufsize = bufsize; spin_lock_init(&client->buffer_lock); client->evdev = evdev; evdev_attach_client(evdev, client); error = evdev_open_device(evdev); if (error) goto err_free_client; file->private_data = client; stream_open(inode, file); return 0; err_free_client: evdev_detach_client(evdev, client); kvfree(client); return error; } static ssize_t evdev_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; struct input_event event; int retval = 0; /* * Limit amount of data we inject into the input subsystem so that * we do not hold evdev->mutex for too long. 4096 bytes corresponds * to 170 input events. */ count = min(count, 4096); if (count != 0 && count < input_event_size()) return -EINVAL; retval = mutex_lock_interruptible(&evdev->mutex); if (retval) return retval; if (!evdev->exist || client->revoked) { retval = -ENODEV; goto out; } while (retval + input_event_size() <= count) { if (input_event_from_user(buffer + retval, &event)) { retval = -EFAULT; goto out; } retval += input_event_size(); input_inject_event(&evdev->handle, event.type, event.code, event.value); cond_resched(); } out: mutex_unlock(&evdev->mutex); return retval; } static int evdev_fetch_next_event(struct evdev_client *client, struct input_event *event) { int have_event; spin_lock_irq(&client->buffer_lock); have_event = client->packet_head != client->tail; if (have_event) { *event = client->buffer[client->tail++]; client->tail &= client->bufsize - 1; } spin_unlock_irq(&client->buffer_lock); return have_event; } static ssize_t evdev_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; struct input_event event; size_t read = 0; int error; if (count != 0 && count < input_event_size()) return -EINVAL; for (;;) { if (!evdev->exist || client->revoked) return -ENODEV; if (client->packet_head == client->tail && (file->f_flags & O_NONBLOCK)) return -EAGAIN; /* * count == 0 is special - no IO is done but we check * for error conditions (see above). 
*/ if (count == 0) break; while (read + input_event_size() <= count && evdev_fetch_next_event(client, &event)) { if (input_event_to_user(buffer + read, &event)) return -EFAULT; read += input_event_size(); } if (read) break; if (!(file->f_flags & O_NONBLOCK)) { error = wait_event_interruptible(client->wait, client->packet_head != client->tail || !evdev->exist || client->revoked); if (error) return error; } } return read; } /* No kernel lock - fine */ static __poll_t evdev_poll(struct file *file, poll_table *wait) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; __poll_t mask; poll_wait(file, &client->wait, wait); if (evdev->exist && !client->revoked) mask = EPOLLOUT | EPOLLWRNORM; else mask = EPOLLHUP | EPOLLERR; if (client->packet_head != client->tail) mask |= EPOLLIN | EPOLLRDNORM; return mask; } #ifdef CONFIG_COMPAT #define BITS_PER_LONG_COMPAT (sizeof(compat_long_t) * 8) #define BITS_TO_LONGS_COMPAT(x) ((((x) - 1) / BITS_PER_LONG_COMPAT) + 1) #ifdef __BIG_ENDIAN static int bits_to_user(unsigned long *bits, unsigned int maxbit, unsigned int maxlen, void __user *p, int compat) { int len, i; if (compat) { len = BITS_TO_LONGS_COMPAT(maxbit) * sizeof(compat_long_t); if (len > maxlen) len = maxlen; for (i = 0; i < len / sizeof(compat_long_t); i++) if (copy_to_user((compat_long_t __user *) p + i, (compat_long_t *) bits + i + 1 - ((i % 2) << 1), sizeof(compat_long_t))) return -EFAULT; } else { len = BITS_TO_LONGS(maxbit) * sizeof(long); if (len > maxlen) len = maxlen; if (copy_to_user(p, bits, len)) return -EFAULT; } return len; } static int bits_from_user(unsigned long *bits, unsigned int maxbit, unsigned int maxlen, const void __user *p, int compat) { int len, i; if (compat) { if (maxlen % sizeof(compat_long_t)) return -EINVAL; len = BITS_TO_LONGS_COMPAT(maxbit) * sizeof(compat_long_t); if (len > maxlen) len = maxlen; for (i = 0; i < len / sizeof(compat_long_t); i++) if (copy_from_user((compat_long_t *) bits + i + 1 - ((i % 2) << 1), (compat_long_t __user *) p + i, sizeof(compat_long_t))) return -EFAULT; if (i % 2) *((compat_long_t *) bits + i - 1) = 0; } else { if (maxlen % sizeof(long)) return -EINVAL; len = BITS_TO_LONGS(maxbit) * sizeof(long); if (len > maxlen) len = maxlen; if (copy_from_user(bits, p, len)) return -EFAULT; } return len; } #else static int bits_to_user(unsigned long *bits, unsigned int maxbit, unsigned int maxlen, void __user *p, int compat) { int len = compat ? BITS_TO_LONGS_COMPAT(maxbit) * sizeof(compat_long_t) : BITS_TO_LONGS(maxbit) * sizeof(long); if (len > maxlen) len = maxlen; return copy_to_user(p, bits, len) ? -EFAULT : len; } static int bits_from_user(unsigned long *bits, unsigned int maxbit, unsigned int maxlen, const void __user *p, int compat) { size_t chunk_size = compat ? sizeof(compat_long_t) : sizeof(long); int len; if (maxlen % chunk_size) return -EINVAL; len = compat ? BITS_TO_LONGS_COMPAT(maxbit) : BITS_TO_LONGS(maxbit); len *= chunk_size; if (len > maxlen) len = maxlen; return copy_from_user(bits, p, len) ? -EFAULT : len; } #endif /* __BIG_ENDIAN */ #else static int bits_to_user(unsigned long *bits, unsigned int maxbit, unsigned int maxlen, void __user *p, int compat) { int len = BITS_TO_LONGS(maxbit) * sizeof(long); if (len > maxlen) len = maxlen; return copy_to_user(p, bits, len) ? 
-EFAULT : len; } static int bits_from_user(unsigned long *bits, unsigned int maxbit, unsigned int maxlen, const void __user *p, int compat) { int len; if (maxlen % sizeof(long)) return -EINVAL; len = BITS_TO_LONGS(maxbit) * sizeof(long); if (len > maxlen) len = maxlen; return copy_from_user(bits, p, len) ? -EFAULT : len; } #endif /* CONFIG_COMPAT */ static int str_to_user(const char *str, unsigned int maxlen, void __user *p) { int len; if (!str) return -ENOENT; len = strlen(str) + 1; if (len > maxlen) len = maxlen; return copy_to_user(p, str, len) ? -EFAULT : len; } static int handle_eviocgbit(struct input_dev *dev, unsigned int type, unsigned int size, void __user *p, int compat_mode) { unsigned long *bits; int len; switch (type) { case 0: bits = dev->evbit; len = EV_MAX; break; case EV_KEY: bits = dev->keybit; len = KEY_MAX; break; case EV_REL: bits = dev->relbit; len = REL_MAX; break; case EV_ABS: bits = dev->absbit; len = ABS_MAX; break; case EV_MSC: bits = dev->mscbit; len = MSC_MAX; break; case EV_LED: bits = dev->ledbit; len = LED_MAX; break; case EV_SND: bits = dev->sndbit; len = SND_MAX; break; case EV_FF: bits = dev->ffbit; len = FF_MAX; break; case EV_SW: bits = dev->swbit; len = SW_MAX; break; default: return -EINVAL; } return bits_to_user(bits, len, size, p, compat_mode); } static int evdev_handle_get_keycode(struct input_dev *dev, void __user *p) { struct input_keymap_entry ke = { .len = sizeof(unsigned int), .flags = 0, }; int __user *ip = (int __user *)p; int error; /* legacy case */ if (copy_from_user(ke.scancode, p, sizeof(unsigned int))) return -EFAULT; error = input_get_keycode(dev, &ke); if (error) return error; if (put_user(ke.keycode, ip + 1)) return -EFAULT; return 0; } static int evdev_handle_get_keycode_v2(struct input_dev *dev, void __user *p) { struct input_keymap_entry ke; int error; if (copy_from_user(&ke, p, sizeof(ke))) return -EFAULT; error = input_get_keycode(dev, &ke); if (error) return error; if (copy_to_user(p, &ke, sizeof(ke))) return -EFAULT; return 0; } static int evdev_handle_set_keycode(struct input_dev *dev, void __user *p) { struct input_keymap_entry ke = { .len = sizeof(unsigned int), .flags = 0, }; int __user *ip = (int __user *)p; if (copy_from_user(ke.scancode, p, sizeof(unsigned int))) return -EFAULT; if (get_user(ke.keycode, ip + 1)) return -EFAULT; return input_set_keycode(dev, &ke); } static int evdev_handle_set_keycode_v2(struct input_dev *dev, void __user *p) { struct input_keymap_entry ke; if (copy_from_user(&ke, p, sizeof(ke))) return -EFAULT; if (ke.len > sizeof(ke.scancode)) return -EINVAL; return input_set_keycode(dev, &ke); } /* * If we transfer state to the user, we should flush all pending events * of the same type from the client's queue. Otherwise, they might end up * with duplicate events, which can screw up client's state tracking. * If bits_to_user fails after flushing the queue, we queue a SYN_DROPPED * event so user-space will notice missing events. * * LOCKING: * We need to take event_lock before buffer_lock to avoid dead-locks. But we * need the even_lock only to guarantee consistent state. We can safely release * it while flushing the queue. This allows input-core to handle filters while * we flush the queue. 
*/ static int evdev_handle_get_val(struct evdev_client *client, struct input_dev *dev, unsigned int type, unsigned long *bits, unsigned int maxbit, unsigned int maxlen, void __user *p, int compat) { int ret; unsigned long *mem; mem = bitmap_alloc(maxbit, GFP_KERNEL); if (!mem) return -ENOMEM; spin_lock_irq(&dev->event_lock); spin_lock(&client->buffer_lock); bitmap_copy(mem, bits, maxbit); spin_unlock(&dev->event_lock); __evdev_flush_queue(client, type); spin_unlock_irq(&client->buffer_lock); ret = bits_to_user(mem, maxbit, maxlen, p, compat); if (ret < 0) evdev_queue_syn_dropped(client); bitmap_free(mem); return ret; } static int evdev_handle_mt_request(struct input_dev *dev, unsigned int size, int __user *ip) { const struct input_mt *mt = dev->mt; unsigned int code; int max_slots; int i; if (get_user(code, &ip[0])) return -EFAULT; if (!mt || !input_is_mt_value(code)) return -EINVAL; max_slots = (size - sizeof(__u32)) / sizeof(__s32); for (i = 0; i < mt->num_slots && i < max_slots; i++) { int value = input_mt_get_value(&mt->slots[i], code); if (put_user(value, &ip[1 + i])) return -EFAULT; } return 0; } static int evdev_revoke(struct evdev *evdev, struct evdev_client *client, struct file *file) { client->revoked = true; evdev_ungrab(evdev, client); input_flush_device(&evdev->handle, file); wake_up_interruptible_poll(&client->wait, EPOLLHUP | EPOLLERR); return 0; } /* must be called with evdev-mutex held */ static int evdev_set_mask(struct evdev_client *client, unsigned int type, const void __user *codes, u32 codes_size, int compat) { unsigned long flags, *mask, *oldmask; size_t cnt; int error; /* we allow unknown types and 'codes_size > size' for forward-compat */ cnt = evdev_get_mask_cnt(type); if (!cnt) return 0; mask = bitmap_zalloc(cnt, GFP_KERNEL); if (!mask) return -ENOMEM; error = bits_from_user(mask, cnt - 1, codes_size, codes, compat); if (error < 0) { bitmap_free(mask); return error; } spin_lock_irqsave(&client->buffer_lock, flags); oldmask = client->evmasks[type]; client->evmasks[type] = mask; spin_unlock_irqrestore(&client->buffer_lock, flags); bitmap_free(oldmask); return 0; } /* must be called with evdev-mutex held */ static int evdev_get_mask(struct evdev_client *client, unsigned int type, void __user *codes, u32 codes_size, int compat) { unsigned long *mask; size_t cnt, size, xfer_size; int i; int error; /* we allow unknown types and 'codes_size > size' for forward-compat */ cnt = evdev_get_mask_cnt(type); size = sizeof(unsigned long) * BITS_TO_LONGS(cnt); xfer_size = min_t(size_t, codes_size, size); if (cnt > 0) { mask = client->evmasks[type]; if (mask) { error = bits_to_user(mask, cnt - 1, xfer_size, codes, compat); if (error < 0) return error; } else { /* fake mask with all bits set */ for (i = 0; i < xfer_size; i++) if (put_user(0xffU, (u8 __user *)codes + i)) return -EFAULT; } } if (xfer_size < codes_size) if (clear_user(codes + xfer_size, codes_size - xfer_size)) return -EFAULT; return 0; } static long evdev_do_ioctl(struct file *file, unsigned int cmd, void __user *p, int compat_mode) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; struct input_dev *dev = evdev->handle.dev; struct input_absinfo abs; struct input_mask mask; struct ff_effect effect; int __user *ip = (int __user *)p; unsigned int i, t, u, v; unsigned int size; int error; /* First we check for fixed-length commands */ switch (cmd) { case EVIOCGVERSION: return put_user(EV_VERSION, ip); case EVIOCGID: if (copy_to_user(p, &dev->id, sizeof(struct input_id))) return 
-EFAULT; return 0; case EVIOCGREP: if (!test_bit(EV_REP, dev->evbit)) return -ENOSYS; if (put_user(dev->rep[REP_DELAY], ip)) return -EFAULT; if (put_user(dev->rep[REP_PERIOD], ip + 1)) return -EFAULT; return 0; case EVIOCSREP: if (!test_bit(EV_REP, dev->evbit)) return -ENOSYS; if (get_user(u, ip)) return -EFAULT; if (get_user(v, ip + 1)) return -EFAULT; input_inject_event(&evdev->handle, EV_REP, REP_DELAY, u); input_inject_event(&evdev->handle, EV_REP, REP_PERIOD, v); return 0; case EVIOCRMFF: return input_ff_erase(dev, (int)(unsigned long) p, file); case EVIOCGEFFECTS: i = test_bit(EV_FF, dev->evbit) ? dev->ff->max_effects : 0; if (put_user(i, ip)) return -EFAULT; return 0; case EVIOCGRAB: if (p) return evdev_grab(evdev, client); else return evdev_ungrab(evdev, client); case EVIOCREVOKE: if (p) return -EINVAL; else return evdev_revoke(evdev, client, file); case EVIOCGMASK: { void __user *codes_ptr; if (copy_from_user(&mask, p, sizeof(mask))) return -EFAULT; codes_ptr = (void __user *)(unsigned long)mask.codes_ptr; return evdev_get_mask(client, mask.type, codes_ptr, mask.codes_size, compat_mode); } case EVIOCSMASK: { const void __user *codes_ptr; if (copy_from_user(&mask, p, sizeof(mask))) return -EFAULT; codes_ptr = (const void __user *)(unsigned long)mask.codes_ptr; return evdev_set_mask(client, mask.type, codes_ptr, mask.codes_size, compat_mode); } case EVIOCSCLOCKID: if (copy_from_user(&i, p, sizeof(unsigned int))) return -EFAULT; return evdev_set_clk_type(client, i); case EVIOCGKEYCODE: return evdev_handle_get_keycode(dev, p); case EVIOCSKEYCODE: return evdev_handle_set_keycode(dev, p); case EVIOCGKEYCODE_V2: return evdev_handle_get_keycode_v2(dev, p); case EVIOCSKEYCODE_V2: return evdev_handle_set_keycode_v2(dev, p); } size = _IOC_SIZE(cmd); /* Now check variable-length commands */ #define EVIOC_MASK_SIZE(nr) ((nr) & ~(_IOC_SIZEMASK << _IOC_SIZESHIFT)) switch (EVIOC_MASK_SIZE(cmd)) { case EVIOCGPROP(0): return bits_to_user(dev->propbit, INPUT_PROP_MAX, size, p, compat_mode); case EVIOCGMTSLOTS(0): return evdev_handle_mt_request(dev, size, ip); case EVIOCGKEY(0): return evdev_handle_get_val(client, dev, EV_KEY, dev->key, KEY_MAX, size, p, compat_mode); case EVIOCGLED(0): return evdev_handle_get_val(client, dev, EV_LED, dev->led, LED_MAX, size, p, compat_mode); case EVIOCGSND(0): return evdev_handle_get_val(client, dev, EV_SND, dev->snd, SND_MAX, size, p, compat_mode); case EVIOCGSW(0): return evdev_handle_get_val(client, dev, EV_SW, dev->sw, SW_MAX, size, p, compat_mode); case EVIOCGNAME(0): return str_to_user(dev->name, size, p); case EVIOCGPHYS(0): return str_to_user(dev->phys, size, p); case EVIOCGUNIQ(0): return str_to_user(dev->uniq, size, p); case EVIOC_MASK_SIZE(EVIOCSFF): if (input_ff_effect_from_user(p, size, &effect)) return -EFAULT; error = input_ff_upload(dev, &effect, file); if (error) return error; if (put_user(effect.id, &(((struct ff_effect __user *)p)->id))) return -EFAULT; return 0; } /* Multi-number variable-length handlers */ if (_IOC_TYPE(cmd) != 'E') return -EINVAL; if (_IOC_DIR(cmd) == _IOC_READ) { if ((_IOC_NR(cmd) & ~EV_MAX) == _IOC_NR(EVIOCGBIT(0, 0))) return handle_eviocgbit(dev, _IOC_NR(cmd) & EV_MAX, size, p, compat_mode); if ((_IOC_NR(cmd) & ~ABS_MAX) == _IOC_NR(EVIOCGABS(0))) { if (!dev->absinfo) return -EINVAL; t = _IOC_NR(cmd) & ABS_MAX; abs = dev->absinfo[t]; if (copy_to_user(p, &abs, min_t(size_t, size, sizeof(struct input_absinfo)))) return -EFAULT; return 0; } } if (_IOC_DIR(cmd) == _IOC_WRITE) { if ((_IOC_NR(cmd) & ~ABS_MAX) == 
_IOC_NR(EVIOCSABS(0))) { if (!dev->absinfo) return -EINVAL; t = _IOC_NR(cmd) & ABS_MAX; if (copy_from_user(&abs, p, min_t(size_t, size, sizeof(struct input_absinfo)))) return -EFAULT; if (size < sizeof(struct input_absinfo)) abs.resolution = 0; /* We can't change number of reserved MT slots */ if (t == ABS_MT_SLOT) return -EINVAL; /* * Take event lock to ensure that we are not * changing device parameters in the middle * of event. */ spin_lock_irq(&dev->event_lock); dev->absinfo[t] = abs; spin_unlock_irq(&dev->event_lock); return 0; } } return -EINVAL; } static long evdev_ioctl_handler(struct file *file, unsigned int cmd, void __user *p, int compat_mode) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; int retval; retval = mutex_lock_interruptible(&evdev->mutex); if (retval) return retval; if (!evdev->exist || client->revoked) { retval = -ENODEV; goto out; } retval = evdev_do_ioctl(file, cmd, p, compat_mode); out: mutex_unlock(&evdev->mutex); return retval; } static long evdev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return evdev_ioctl_handler(file, cmd, (void __user *)arg, 0); } #ifdef CONFIG_COMPAT static long evdev_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg) { return evdev_ioctl_handler(file, cmd, compat_ptr(arg), 1); } #endif static const struct file_operations evdev_fops = { .owner = THIS_MODULE, .read = evdev_read, .write = evdev_write, .poll = evdev_poll, .open = evdev_open, .release = evdev_release, .unlocked_ioctl = evdev_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = evdev_ioctl_compat, #endif .fasync = evdev_fasync, }; /* * Mark device non-existent. This disables writes, ioctls and * prevents new users from opening the device. Already posted * blocking reads will stay, however new ones will fail. */ static void evdev_mark_dead(struct evdev *evdev) { mutex_lock(&evdev->mutex); evdev->exist = false; mutex_unlock(&evdev->mutex); } static void evdev_cleanup(struct evdev *evdev) { struct input_handle *handle = &evdev->handle; evdev_mark_dead(evdev); evdev_hangup(evdev); /* evdev is marked dead so no one else accesses evdev->open */ if (evdev->open) { input_flush_device(handle, NULL); input_close_device(handle); } } /* * Create new evdev device. Note that input core serializes calls * to connect and disconnect. 
*/ static int evdev_connect(struct input_handler *handler, struct input_dev *dev, const struct input_device_id *id) { struct evdev *evdev; int minor; int dev_no; int error; minor = input_get_new_minor(EVDEV_MINOR_BASE, EVDEV_MINORS, true); if (minor < 0) { error = minor; pr_err("failed to reserve new minor: %d\n", error); return error; } evdev = kzalloc_obj(struct evdev); if (!evdev) { error = -ENOMEM; goto err_free_minor; } INIT_LIST_HEAD(&evdev->client_list); spin_lock_init(&evdev->client_lock); mutex_init(&evdev->mutex); evdev->exist = true; dev_no = minor; /* Normalize device number if it falls into legacy range */ if (dev_no < EVDEV_MINOR_BASE + EVDEV_MINORS) dev_no -= EVDEV_MINOR_BASE; dev_set_name(&evdev->dev, "event%d", dev_no); evdev->handle.dev = input_get_device(dev); evdev->handle.name = dev_name(&evdev->dev); evdev->handle.handler = handler; evdev->handle.private = evdev; evdev->dev.devt = MKDEV(INPUT_MAJOR, minor); evdev->dev.class = &input_class; evdev->dev.parent = &dev->dev; evdev->dev.release = evdev_free; device_initialize(&evdev->dev); error = input_register_handle(&evdev->handle); if (error) goto err_free_evdev; cdev_init(&evdev->cdev, &evdev_fops); error = cdev_device_add(&evdev->cdev, &evdev->dev); if (error) goto err_cleanup_evdev; return 0; err_cleanup_evdev: evdev_cleanup(evdev); input_unregister_handle(&evdev->handle); err_free_evdev: put_device(&evdev->dev); err_free_minor: input_free_minor(minor); return error; } static void evdev_disconnect(struct input_handle *handle) { struct evdev *evdev = handle->private; cdev_device_del(&evdev->cdev, &evdev->dev); evdev_cleanup(evdev); input_free_minor(MINOR(evdev->dev.devt)); input_unregister_handle(handle); put_device(&evdev->dev); } static const struct input_device_id evdev_ids[] = { { /* Matches all devices */ .flags = INPUT_DEVICE_ID_MATCH_EVBIT, .evbit = { BIT_MASK(EV_SYN) }, }, { } /* Terminating zero entry */ }; MODULE_DEVICE_TABLE(input, evdev_ids); static struct input_handler evdev_handler = { .events = evdev_events, .connect = evdev_connect, .disconnect = evdev_disconnect, .legacy_minors = true, .minor = EVDEV_MINOR_BASE, .name = "evdev", .id_table = evdev_ids, }; static int __init evdev_init(void) { return input_register_handler(&evdev_handler); } static void __exit evdev_exit(void) { input_unregister_handler(&evdev_handler); } module_init(evdev_init); module_exit(evdev_exit); MODULE_AUTHOR("Vojtech Pavlik <vojtech@ucw.cz>"); MODULE_DESCRIPTION("Input driver event char devices"); MODULE_LICENSE("GPL"); |
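The evdev driver above exposes each input device as a character device that streams struct input_event records and answers EVIOC* ioctls. The following user-space sketch (not part of evdev.c; /dev/input/event0 is a placeholder node and usually needs root or membership in the input group) queries the device name with EVIOCGNAME and reads a handful of events served by evdev_read():

/* Illustrative user-space sketch, not kernel code: a minimal evdev reader. */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/input.h>

int main(void)
{
	char name[256] = "unknown";
	struct input_event ev;
	int fd = open("/dev/input/event0", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/input/event0");
		return EXIT_FAILURE;
	}

	/* EVIOCGNAME is answered by str_to_user() in evdev_do_ioctl(). */
	if (ioctl(fd, EVIOCGNAME(sizeof(name)), name) >= 0)
		printf("reading from: %s\n", name);

	/* Each read() returns whole input_event records; a packet is
	 * terminated by an EV_SYN/SYN_REPORT event. */
	for (int i = 0; i < 32; i++) {
		if (read(fd, &ev, sizeof(ev)) != sizeof(ev))
			break;
		printf("type=%u code=%u value=%d\n",
		       (unsigned)ev.type, (unsigned)ev.code, ev.value);
	}

	close(fd);
	return 0;
}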
| 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 | // SPDX-License-Identifier: GPL-2.0+ /* * NILFS disk address translation. * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * * Written by Koji Sato. */ #include <linux/types.h> #include <linux/buffer_head.h> #include <linux/string.h> #include <linux/errno.h> #include "nilfs.h" #include "mdt.h" #include "alloc.h" #include "dat.h" #define NILFS_CNO_MIN ((__u64)1) #define NILFS_CNO_MAX (~(__u64)0) /** * struct nilfs_dat_info - on-memory private data of DAT file * @mi: on-memory private data of metadata file * @palloc_cache: persistent object allocator cache of DAT file * @shadow: shadow map of DAT file */ struct nilfs_dat_info { struct nilfs_mdt_info mi; struct nilfs_palloc_cache palloc_cache; struct nilfs_shadow_map shadow; }; static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat) { return (struct nilfs_dat_info *)NILFS_MDT(dat); } static int nilfs_dat_prepare_entry(struct inode *dat, struct nilfs_palloc_req *req, int create) { int ret; ret = nilfs_palloc_get_entry_block(dat, req->pr_entry_nr, create, &req->pr_entry_bh); if (unlikely(ret == -ENOENT)) { nilfs_err(dat->i_sb, "DAT doesn't have a block to manage vblocknr = %llu", (unsigned long long)req->pr_entry_nr); /* * Return internal code -EINVAL to notify bmap layer of * metadata corruption. 
*/ ret = -EINVAL; } return ret; } static void nilfs_dat_commit_entry(struct inode *dat, struct nilfs_palloc_req *req) { mark_buffer_dirty(req->pr_entry_bh); nilfs_mdt_mark_dirty(dat); brelse(req->pr_entry_bh); } static void nilfs_dat_abort_entry(struct inode *dat, struct nilfs_palloc_req *req) { brelse(req->pr_entry_bh); } int nilfs_dat_prepare_alloc(struct inode *dat, struct nilfs_palloc_req *req) { int ret; ret = nilfs_palloc_prepare_alloc_entry(dat, req, true); if (ret < 0) return ret; ret = nilfs_dat_prepare_entry(dat, req, 1); if (ret < 0) nilfs_palloc_abort_alloc_entry(dat, req); return ret; } void nilfs_dat_commit_alloc(struct inode *dat, struct nilfs_palloc_req *req) { struct nilfs_dat_entry *entry; size_t offset; offset = nilfs_palloc_entry_offset(dat, req->pr_entry_nr, req->pr_entry_bh); entry = kmap_local_folio(req->pr_entry_bh->b_folio, offset); entry->de_start = cpu_to_le64(NILFS_CNO_MIN); entry->de_end = cpu_to_le64(NILFS_CNO_MAX); entry->de_blocknr = cpu_to_le64(0); kunmap_local(entry); nilfs_palloc_commit_alloc_entry(dat, req); nilfs_dat_commit_entry(dat, req); } void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req) { nilfs_dat_abort_entry(dat, req); nilfs_palloc_abort_alloc_entry(dat, req); } static void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req) { struct nilfs_dat_entry *entry; size_t offset; offset = nilfs_palloc_entry_offset(dat, req->pr_entry_nr, req->pr_entry_bh); entry = kmap_local_folio(req->pr_entry_bh->b_folio, offset); entry->de_start = cpu_to_le64(NILFS_CNO_MIN); entry->de_end = cpu_to_le64(NILFS_CNO_MIN); entry->de_blocknr = cpu_to_le64(0); kunmap_local(entry); nilfs_dat_commit_entry(dat, req); if (unlikely(req->pr_desc_bh == NULL || req->pr_bitmap_bh == NULL)) { nilfs_error(dat->i_sb, "state inconsistency probably due to duplicate use of vblocknr = %llu", (unsigned long long)req->pr_entry_nr); return; } nilfs_palloc_commit_free_entry(dat, req); } int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req) { return nilfs_dat_prepare_entry(dat, req, 0); } void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req, sector_t blocknr) { struct nilfs_dat_entry *entry; size_t offset; offset = nilfs_palloc_entry_offset(dat, req->pr_entry_nr, req->pr_entry_bh); entry = kmap_local_folio(req->pr_entry_bh->b_folio, offset); entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat)); entry->de_blocknr = cpu_to_le64(blocknr); kunmap_local(entry); nilfs_dat_commit_entry(dat, req); } int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) { struct nilfs_dat_entry *entry; __u64 start; sector_t blocknr; size_t offset; int ret; ret = nilfs_dat_prepare_entry(dat, req, 0); if (ret < 0) return ret; offset = nilfs_palloc_entry_offset(dat, req->pr_entry_nr, req->pr_entry_bh); entry = kmap_local_folio(req->pr_entry_bh->b_folio, offset); start = le64_to_cpu(entry->de_start); blocknr = le64_to_cpu(entry->de_blocknr); kunmap_local(entry); if (blocknr == 0) { ret = nilfs_palloc_prepare_free_entry(dat, req); if (ret < 0) { nilfs_dat_abort_entry(dat, req); return ret; } } if (unlikely(start > nilfs_mdt_cno(dat))) { nilfs_err(dat->i_sb, "vblocknr = %llu has abnormal lifetime: start cno (= %llu) > current cno (= %llu)", (unsigned long long)req->pr_entry_nr, (unsigned long long)start, (unsigned long long)nilfs_mdt_cno(dat)); nilfs_dat_abort_entry(dat, req); return -EINVAL; } return 0; } void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req, int dead) { struct nilfs_dat_entry 
*entry; __u64 start, end; sector_t blocknr; size_t offset; offset = nilfs_palloc_entry_offset(dat, req->pr_entry_nr, req->pr_entry_bh); entry = kmap_local_folio(req->pr_entry_bh->b_folio, offset); end = start = le64_to_cpu(entry->de_start); if (!dead) { end = nilfs_mdt_cno(dat); WARN_ON(start > end); } entry->de_end = cpu_to_le64(end); blocknr = le64_to_cpu(entry->de_blocknr); kunmap_local(entry); if (blocknr == 0) nilfs_dat_commit_free(dat, req); else nilfs_dat_commit_entry(dat, req); } void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req) { struct nilfs_dat_entry *entry; __u64 start; sector_t blocknr; size_t offset; offset = nilfs_palloc_entry_offset(dat, req->pr_entry_nr, req->pr_entry_bh); entry = kmap_local_folio(req->pr_entry_bh->b_folio, offset); start = le64_to_cpu(entry->de_start); blocknr = le64_to_cpu(entry->de_blocknr); kunmap_local(entry); if (start == nilfs_mdt_cno(dat) && blocknr == 0) nilfs_palloc_abort_free_entry(dat, req); nilfs_dat_abort_entry(dat, req); } int nilfs_dat_prepare_update(struct inode *dat, struct nilfs_palloc_req *oldreq, struct nilfs_palloc_req *newreq) { int ret; ret = nilfs_dat_prepare_end(dat, oldreq); if (!ret) { ret = nilfs_dat_prepare_alloc(dat, newreq); if (ret < 0) nilfs_dat_abort_end(dat, oldreq); } return ret; } void nilfs_dat_commit_update(struct inode *dat, struct nilfs_palloc_req *oldreq, struct nilfs_palloc_req *newreq, int dead) { nilfs_dat_commit_end(dat, oldreq, dead); nilfs_dat_commit_alloc(dat, newreq); } void nilfs_dat_abort_update(struct inode *dat, struct nilfs_palloc_req *oldreq, struct nilfs_palloc_req *newreq) { nilfs_dat_abort_end(dat, oldreq); nilfs_dat_abort_alloc(dat, newreq); } /** * nilfs_dat_mark_dirty - mark the DAT block buffer containing the specified * virtual block address entry as dirty * @dat: DAT file inode * @vblocknr: virtual block number * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - Invalid DAT entry (internal code). * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. */ int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr) { struct nilfs_palloc_req req; int ret; req.pr_entry_nr = vblocknr; ret = nilfs_dat_prepare_entry(dat, &req, 0); if (ret == 0) nilfs_dat_commit_entry(dat, &req); return ret; } /** * nilfs_dat_freev - free virtual block numbers * @dat: DAT file inode * @vblocknrs: array of virtual block numbers * @nitems: number of virtual block numbers * * Description: nilfs_dat_freev() frees the virtual block numbers specified by * @vblocknrs and @nitems. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOENT - The virtual block number have not been allocated. * * %-ENOMEM - Insufficient memory available. */ int nilfs_dat_freev(struct inode *dat, __u64 *vblocknrs, size_t nitems) { return nilfs_palloc_freev(dat, vblocknrs, nitems); } /** * nilfs_dat_move - change a block number * @dat: DAT file inode * @vblocknr: virtual block number * @blocknr: block number * * Description: nilfs_dat_move() changes the block number associated with * @vblocknr to @blocknr. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. 
*/ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) { struct buffer_head *entry_bh; struct nilfs_dat_entry *entry; size_t offset; int ret; ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh); if (ret < 0) return ret; /* * The given disk block number (blocknr) is not yet written to * the device at this point. * * To prevent nilfs_dat_translate() from returning the * uncommitted block number, this makes a copy of the entry * buffer and redirects nilfs_dat_translate() to the copy. */ if (!buffer_nilfs_redirected(entry_bh)) { ret = nilfs_mdt_freeze_buffer(dat, entry_bh); if (ret) { brelse(entry_bh); return ret; } } offset = nilfs_palloc_entry_offset(dat, vblocknr, entry_bh); entry = kmap_local_folio(entry_bh->b_folio, offset); if (unlikely(entry->de_blocknr == cpu_to_le64(0))) { nilfs_crit(dat->i_sb, "%s: invalid vblocknr = %llu, [%llu, %llu)", __func__, (unsigned long long)vblocknr, (unsigned long long)le64_to_cpu(entry->de_start), (unsigned long long)le64_to_cpu(entry->de_end)); kunmap_local(entry); brelse(entry_bh); return -EINVAL; } WARN_ON(blocknr == 0); entry->de_blocknr = cpu_to_le64(blocknr); kunmap_local(entry); mark_buffer_dirty(entry_bh); nilfs_mdt_mark_dirty(dat); brelse(entry_bh); return 0; } /** * nilfs_dat_translate - translate a virtual block number to a block number * @dat: DAT file inode * @vblocknr: virtual block number * @blocknrp: pointer to a block number * * Description: nilfs_dat_translate() maps the virtual block number @vblocknr * to the corresponding block number. The block number associated with * @vblocknr is stored in the place pointed to by @blocknrp. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOENT - A block number associated with @vblocknr does not exist. * * %-ENOMEM - Insufficient memory available. 
*/ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) { struct buffer_head *entry_bh, *bh; struct nilfs_dat_entry *entry; sector_t blocknr; size_t offset; int ret; ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh); if (ret < 0) return ret; if (!nilfs_doing_gc() && buffer_nilfs_redirected(entry_bh)) { bh = nilfs_mdt_get_frozen_buffer(dat, entry_bh); if (bh) { WARN_ON(!buffer_uptodate(bh)); brelse(entry_bh); entry_bh = bh; } } offset = nilfs_palloc_entry_offset(dat, vblocknr, entry_bh); entry = kmap_local_folio(entry_bh->b_folio, offset); blocknr = le64_to_cpu(entry->de_blocknr); if (blocknr == 0) { ret = -ENOENT; goto out; } *blocknrp = blocknr; out: kunmap_local(entry); brelse(entry_bh); return ret; } ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz, size_t nvi) { struct buffer_head *entry_bh; struct nilfs_dat_entry *entry, *first_entry; struct nilfs_vinfo *vinfo = buf; __u64 first, last; size_t offset; unsigned long entries_per_block = NILFS_MDT(dat)->mi_entries_per_block; unsigned int entry_size = NILFS_MDT(dat)->mi_entry_size; int i, j, n, ret; for (i = 0; i < nvi; i += n) { ret = nilfs_palloc_get_entry_block(dat, vinfo->vi_vblocknr, 0, &entry_bh); if (ret < 0) return ret; first = vinfo->vi_vblocknr; first = div64_ul(first, entries_per_block); first *= entries_per_block; /* first virtual block number in this block */ last = first + entries_per_block - 1; /* last virtual block number in this block */ offset = nilfs_palloc_entry_offset(dat, first, entry_bh); first_entry = kmap_local_folio(entry_bh->b_folio, offset); for (j = i, n = 0; j < nvi && vinfo->vi_vblocknr >= first && vinfo->vi_vblocknr <= last; j++, n++, vinfo = (void *)vinfo + visz) { entry = (void *)first_entry + (vinfo->vi_vblocknr - first) * entry_size; vinfo->vi_start = le64_to_cpu(entry->de_start); vinfo->vi_end = le64_to_cpu(entry->de_end); vinfo->vi_blocknr = le64_to_cpu(entry->de_blocknr); } kunmap_local(first_entry); brelse(entry_bh); } return nvi; } /** * nilfs_dat_read - read or get dat inode * @sb: super block instance * @entry_size: size of a dat entry * @raw_inode: on-disk dat inode * @inodep: buffer to store the inode * * Return: 0 on success, or a negative error code on failure. */ int nilfs_dat_read(struct super_block *sb, size_t entry_size, struct nilfs_inode *raw_inode, struct inode **inodep) { static struct lock_class_key dat_lock_key; struct inode *dat; struct nilfs_dat_info *di; int err; if (entry_size > sb->s_blocksize) { nilfs_err(sb, "too large DAT entry size: %zu bytes", entry_size); return -EINVAL; } else if (entry_size < NILFS_MIN_DAT_ENTRY_SIZE) { nilfs_err(sb, "too small DAT entry size: %zu bytes", entry_size); return -EINVAL; } dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO); if (unlikely(!dat)) return -ENOMEM; if (!(inode_state_read_once(dat) & I_NEW)) goto out; err = nilfs_mdt_init(dat, NILFS_MDT_GFP, sizeof(*di)); if (err) goto failed; err = nilfs_palloc_init_blockgroup(dat, entry_size); if (err) goto failed; di = NILFS_DAT_I(dat); lockdep_set_class(&di->mi.mi_sem, &dat_lock_key); nilfs_palloc_setup_cache(dat, &di->palloc_cache); err = nilfs_mdt_setup_shadow_map(dat, &di->shadow); if (err) goto failed; err = nilfs_attach_btree_node_cache(dat); if (err) goto failed; err = nilfs_read_inode_common(dat, raw_inode); if (err) goto failed; unlock_new_inode(dat); out: *inodep = dat; return 0; failed: iget_failed(dat); return err; } |
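/*
 * Illustrative sketch (not from the NILFS sources): how a caller is expected
 * to drive the prepare/commit pattern above when binding a disk block number
 * to an existing virtual block number.  The wrapper name and its callers are
 * assumptions; on an intermediate failure the corresponding abort helper
 * would be used instead of the commit.
 */
static int example_dat_bind_block(struct inode *dat, __u64 vblocknr,
				  sector_t blocknr)
{
	struct nilfs_palloc_req req = { .pr_entry_nr = vblocknr };
	int ret;

	/* Loads the DAT entry block that holds @vblocknr. */
	ret = nilfs_dat_prepare_start(dat, &req);
	if (ret < 0)
		return ret;

	/*
	 * Records the current checkpoint number as the start of the entry's
	 * lifetime and stores the block address, then marks the DAT dirty.
	 */
	nilfs_dat_commit_start(dat, &req, blocknr);
	return 0;
}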
| 16 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 | // SPDX-License-Identifier: GPL-2.0-only /* * Landlock LSM - Ruleset management * * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2018-2020 ANSSI */ #include <linux/bits.h> #include <linux/bug.h> #include <linux/cleanup.h> #include <linux/compiler_types.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/lockdep.h> #include <linux/mutex.h> #include <linux/overflow.h> #include <linux/rbtree.h> #include <linux/refcount.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/workqueue.h> #include "access.h" #include "domain.h" #include "limits.h" #include "object.h" #include "ruleset.h" static struct landlock_ruleset *create_ruleset(const u32 num_layers) { struct landlock_ruleset 
*new_ruleset; new_ruleset = kzalloc_flex(*new_ruleset, access_masks, num_layers, GFP_KERNEL_ACCOUNT); if (!new_ruleset) return ERR_PTR(-ENOMEM); refcount_set(&new_ruleset->usage, 1); mutex_init(&new_ruleset->lock); new_ruleset->root_inode = RB_ROOT; #if IS_ENABLED(CONFIG_INET) new_ruleset->root_net_port = RB_ROOT; #endif /* IS_ENABLED(CONFIG_INET) */ new_ruleset->num_layers = num_layers; /* * hierarchy = NULL * num_rules = 0 * access_masks[] = 0 */ return new_ruleset; } struct landlock_ruleset * landlock_create_ruleset(const access_mask_t fs_access_mask, const access_mask_t net_access_mask, const access_mask_t scope_mask) { struct landlock_ruleset *new_ruleset; /* Informs about useless ruleset. */ if (!fs_access_mask && !net_access_mask && !scope_mask) return ERR_PTR(-ENOMSG); new_ruleset = create_ruleset(1); if (IS_ERR(new_ruleset)) return new_ruleset; if (fs_access_mask) landlock_add_fs_access_mask(new_ruleset, fs_access_mask, 0); if (net_access_mask) landlock_add_net_access_mask(new_ruleset, net_access_mask, 0); if (scope_mask) landlock_add_scope_mask(new_ruleset, scope_mask, 0); return new_ruleset; } static void build_check_rule(void) { const struct landlock_rule rule = { .num_layers = ~0, }; /* * Checks that .num_layers is large enough for at least * LANDLOCK_MAX_NUM_LAYERS layers. */ BUILD_BUG_ON(rule.num_layers < LANDLOCK_MAX_NUM_LAYERS); } static bool is_object_pointer(const enum landlock_key_type key_type) { switch (key_type) { case LANDLOCK_KEY_INODE: return true; #if IS_ENABLED(CONFIG_INET) case LANDLOCK_KEY_NET_PORT: return false; #endif /* IS_ENABLED(CONFIG_INET) */ default: WARN_ON_ONCE(1); return false; } } static struct landlock_rule * create_rule(const struct landlock_id id, const struct landlock_layer (*layers)[], const u32 num_layers, const struct landlock_layer *const new_layer) { struct landlock_rule *new_rule; u32 new_num_layers; build_check_rule(); if (new_layer) { /* Should already be checked by landlock_merge_ruleset(). */ if (WARN_ON_ONCE(num_layers >= LANDLOCK_MAX_NUM_LAYERS)) return ERR_PTR(-E2BIG); new_num_layers = num_layers + 1; } else { new_num_layers = num_layers; } new_rule = kzalloc_flex(*new_rule, layers, new_num_layers, GFP_KERNEL_ACCOUNT); if (!new_rule) return ERR_PTR(-ENOMEM); RB_CLEAR_NODE(&new_rule->node); if (is_object_pointer(id.type)) { /* This should have been caught by insert_rule(). */ WARN_ON_ONCE(!id.key.object); landlock_get_object(id.key.object); } new_rule->key = id.key; new_rule->num_layers = new_num_layers; /* Copies the original layer stack. */ memcpy(new_rule->layers, layers, flex_array_size(new_rule, layers, num_layers)); if (new_layer) /* Adds a copy of @new_layer on the layer stack. 
*/ new_rule->layers[new_rule->num_layers - 1] = *new_layer; return new_rule; } static struct rb_root *get_root(struct landlock_ruleset *const ruleset, const enum landlock_key_type key_type) { switch (key_type) { case LANDLOCK_KEY_INODE: return &ruleset->root_inode; #if IS_ENABLED(CONFIG_INET) case LANDLOCK_KEY_NET_PORT: return &ruleset->root_net_port; #endif /* IS_ENABLED(CONFIG_INET) */ default: WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); } } static void free_rule(struct landlock_rule *const rule, const enum landlock_key_type key_type) { might_sleep(); if (!rule) return; if (is_object_pointer(key_type)) landlock_put_object(rule->key.object); kfree(rule); } static void build_check_ruleset(void) { const struct landlock_ruleset ruleset = { .num_rules = ~0, .num_layers = ~0, }; BUILD_BUG_ON(ruleset.num_rules < LANDLOCK_MAX_NUM_RULES); BUILD_BUG_ON(ruleset.num_layers < LANDLOCK_MAX_NUM_LAYERS); } /** * insert_rule - Create and insert a rule in a ruleset * * @ruleset: The ruleset to be updated. * @id: The ID to build the new rule with. The underlying kernel object, if * any, must be held by the caller. * @layers: One or multiple layers to be copied into the new rule. * @num_layers: The number of @layers entries. * * When user space requests to add a new rule to a ruleset, @layers only * contains one entry and this entry is not assigned to any level. In this * case, the new rule will extend @ruleset, similarly to a boolean OR between * access rights. * * When merging a ruleset in a domain, or copying a domain, @layers will be * added to @ruleset as new constraints, similarly to a boolean AND between * access rights. * * Return: 0 on success, -errno on failure. */ static int insert_rule(struct landlock_ruleset *const ruleset, const struct landlock_id id, const struct landlock_layer (*layers)[], const size_t num_layers) { struct rb_node **walker_node; struct rb_node *parent_node = NULL; struct landlock_rule *new_rule; struct rb_root *root; might_sleep(); lockdep_assert_held(&ruleset->lock); if (WARN_ON_ONCE(!layers)) return -ENOENT; if (is_object_pointer(id.type) && WARN_ON_ONCE(!id.key.object)) return -ENOENT; root = get_root(ruleset, id.type); if (IS_ERR(root)) return PTR_ERR(root); walker_node = &root->rb_node; while (*walker_node) { struct landlock_rule *const this = rb_entry(*walker_node, struct landlock_rule, node); if (this->key.data != id.key.data) { parent_node = *walker_node; if (this->key.data < id.key.data) walker_node = &((*walker_node)->rb_right); else walker_node = &((*walker_node)->rb_left); continue; } /* Only a single-level layer should match an existing rule. */ if (WARN_ON_ONCE(num_layers != 1)) return -EINVAL; /* If there is a matching rule, updates it. */ if ((*layers)[0].level == 0) { /* * Extends access rights when the request comes from * landlock_add_rule(2), i.e. @ruleset is not a domain. */ if (WARN_ON_ONCE(this->num_layers != 1)) return -EINVAL; if (WARN_ON_ONCE(this->layers[0].level != 0)) return -EINVAL; this->layers[0].access |= (*layers)[0].access; return 0; } if (WARN_ON_ONCE(this->layers[0].level == 0)) return -EINVAL; /* * Intersects access rights when it is a merge between a * ruleset and a domain. */ new_rule = create_rule(id, &this->layers, this->num_layers, &(*layers)[0]); if (IS_ERR(new_rule)) return PTR_ERR(new_rule); rb_replace_node(&this->node, &new_rule->node, root); free_rule(this, id.type); return 0; } /* There is no match for @id. 
*/ build_check_ruleset(); if (ruleset->num_rules >= LANDLOCK_MAX_NUM_RULES) return -E2BIG; new_rule = create_rule(id, layers, num_layers, NULL); if (IS_ERR(new_rule)) return PTR_ERR(new_rule); rb_link_node(&new_rule->node, parent_node, walker_node); rb_insert_color(&new_rule->node, root); ruleset->num_rules++; return 0; } static void build_check_layer(void) { const struct landlock_layer layer = { .level = ~0, .access = ~0, }; /* * Checks that .level and .access are large enough to contain their expected * maximum values. */ BUILD_BUG_ON(layer.level < LANDLOCK_MAX_NUM_LAYERS); BUILD_BUG_ON(layer.access < LANDLOCK_MASK_ACCESS_FS); } /* @ruleset must be locked by the caller. */ int landlock_insert_rule(struct landlock_ruleset *const ruleset, const struct landlock_id id, const access_mask_t access) { struct landlock_layer layers[] = { { .access = access, /* When @level is zero, insert_rule() extends @ruleset. */ .level = 0, } }; build_check_layer(); return insert_rule(ruleset, id, &layers, ARRAY_SIZE(layers)); } static int merge_tree(struct landlock_ruleset *const dst, struct landlock_ruleset *const src, const enum landlock_key_type key_type) { struct landlock_rule *walker_rule, *next_rule; struct rb_root *src_root; int err = 0; might_sleep(); lockdep_assert_held(&dst->lock); lockdep_assert_held(&src->lock); src_root = get_root(src, key_type); if (IS_ERR(src_root)) return PTR_ERR(src_root); /* Merges the @src tree. */ rbtree_postorder_for_each_entry_safe(walker_rule, next_rule, src_root, node) { struct landlock_layer layers[] = { { .level = dst->num_layers, } }; const struct landlock_id id = { .key = walker_rule->key, .type = key_type, }; if (WARN_ON_ONCE(walker_rule->num_layers != 1)) return -EINVAL; if (WARN_ON_ONCE(walker_rule->layers[0].level != 0)) return -EINVAL; layers[0].access = walker_rule->layers[0].access; err = insert_rule(dst, id, &layers, ARRAY_SIZE(layers)); if (err) return err; } return err; } static int merge_ruleset(struct landlock_ruleset *const dst, struct landlock_ruleset *const src) { int err = 0; might_sleep(); /* Should already be checked by landlock_merge_ruleset() */ if (WARN_ON_ONCE(!src)) return 0; /* Only merge into a domain. */ if (WARN_ON_ONCE(!dst || !dst->hierarchy)) return -EINVAL; /* Locks @dst first because we are its only owner. */ mutex_lock(&dst->lock); mutex_lock_nested(&src->lock, SINGLE_DEPTH_NESTING); /* Stacks the new layer. */ if (WARN_ON_ONCE(src->num_layers != 1 || dst->num_layers < 1)) { err = -EINVAL; goto out_unlock; } dst->access_masks[dst->num_layers - 1] = landlock_upgrade_handled_access_masks(src->access_masks[0]); /* Merges the @src inode tree. */ err = merge_tree(dst, src, LANDLOCK_KEY_INODE); if (err) goto out_unlock; #if IS_ENABLED(CONFIG_INET) /* Merges the @src network port tree. */ err = merge_tree(dst, src, LANDLOCK_KEY_NET_PORT); if (err) goto out_unlock; #endif /* IS_ENABLED(CONFIG_INET) */ out_unlock: mutex_unlock(&src->lock); mutex_unlock(&dst->lock); return err; } static int inherit_tree(struct landlock_ruleset *const parent, struct landlock_ruleset *const child, const enum landlock_key_type key_type) { struct landlock_rule *walker_rule, *next_rule; struct rb_root *parent_root; int err = 0; might_sleep(); lockdep_assert_held(&parent->lock); lockdep_assert_held(&child->lock); parent_root = get_root(parent, key_type); if (IS_ERR(parent_root)) return PTR_ERR(parent_root); /* Copies the @parent inode or network tree. 
*/ rbtree_postorder_for_each_entry_safe(walker_rule, next_rule, parent_root, node) { const struct landlock_id id = { .key = walker_rule->key, .type = key_type, }; err = insert_rule(child, id, &walker_rule->layers, walker_rule->num_layers); if (err) return err; } return err; } static int inherit_ruleset(struct landlock_ruleset *const parent, struct landlock_ruleset *const child) { int err = 0; might_sleep(); if (!parent) return 0; /* Locks @child first because we are its only owner. */ mutex_lock(&child->lock); mutex_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING); /* Copies the @parent inode tree. */ err = inherit_tree(parent, child, LANDLOCK_KEY_INODE); if (err) goto out_unlock; #if IS_ENABLED(CONFIG_INET) /* Copies the @parent network port tree. */ err = inherit_tree(parent, child, LANDLOCK_KEY_NET_PORT); if (err) goto out_unlock; #endif /* IS_ENABLED(CONFIG_INET) */ if (WARN_ON_ONCE(child->num_layers <= parent->num_layers)) { err = -EINVAL; goto out_unlock; } /* Copies the parent layer stack and leaves a space for the new layer. */ memcpy(child->access_masks, parent->access_masks, flex_array_size(parent, access_masks, parent->num_layers)); if (WARN_ON_ONCE(!parent->hierarchy)) { err = -EINVAL; goto out_unlock; } landlock_get_hierarchy(parent->hierarchy); child->hierarchy->parent = parent->hierarchy; out_unlock: mutex_unlock(&parent->lock); mutex_unlock(&child->lock); return err; } static void free_ruleset(struct landlock_ruleset *const ruleset) { struct landlock_rule *freeme, *next; might_sleep(); rbtree_postorder_for_each_entry_safe(freeme, next, &ruleset->root_inode, node) free_rule(freeme, LANDLOCK_KEY_INODE); #if IS_ENABLED(CONFIG_INET) rbtree_postorder_for_each_entry_safe(freeme, next, &ruleset->root_net_port, node) free_rule(freeme, LANDLOCK_KEY_NET_PORT); #endif /* IS_ENABLED(CONFIG_INET) */ landlock_put_hierarchy(ruleset->hierarchy); kfree(ruleset); } void landlock_put_ruleset(struct landlock_ruleset *const ruleset) { might_sleep(); if (ruleset && refcount_dec_and_test(&ruleset->usage)) free_ruleset(ruleset); } static void free_ruleset_work(struct work_struct *const work) { struct landlock_ruleset *ruleset; ruleset = container_of(work, struct landlock_ruleset, work_free); free_ruleset(ruleset); } /* Only called by hook_cred_free(). */ void landlock_put_ruleset_deferred(struct landlock_ruleset *const ruleset) { if (ruleset && refcount_dec_and_test(&ruleset->usage)) { INIT_WORK(&ruleset->work_free, free_ruleset_work); schedule_work(&ruleset->work_free); } } /** * landlock_merge_ruleset - Merge a ruleset with a domain * * @parent: Parent domain. * @ruleset: New ruleset to be merged. * * The current task is requesting to be restricted. The subjective credentials * must not be in an overridden state. cf. landlock_init_hierarchy_log(). * * Return: A new domain merging @parent and @ruleset on success, or ERR_PTR() * on failure. If @parent is NULL, the new domain duplicates @ruleset. */ struct landlock_ruleset * landlock_merge_ruleset(struct landlock_ruleset *const parent, struct landlock_ruleset *const ruleset) { struct landlock_ruleset *new_dom __free(landlock_put_ruleset) = NULL; u32 num_layers; int err; might_sleep(); if (WARN_ON_ONCE(!ruleset || parent == ruleset)) return ERR_PTR(-EINVAL); if (parent) { if (parent->num_layers >= LANDLOCK_MAX_NUM_LAYERS) return ERR_PTR(-E2BIG); num_layers = parent->num_layers + 1; } else { num_layers = 1; } /* Creates a new domain... 
*/ new_dom = create_ruleset(num_layers); if (IS_ERR(new_dom)) return new_dom; new_dom->hierarchy = kzalloc_obj(*new_dom->hierarchy, GFP_KERNEL_ACCOUNT); if (!new_dom->hierarchy) return ERR_PTR(-ENOMEM); refcount_set(&new_dom->hierarchy->usage, 1); /* ...as a child of @parent... */ err = inherit_ruleset(parent, new_dom); if (err) return ERR_PTR(err); /* ...and including @ruleset. */ err = merge_ruleset(new_dom, ruleset); if (err) return ERR_PTR(err); err = landlock_init_hierarchy_log(new_dom->hierarchy); if (err) return ERR_PTR(err); return no_free_ptr(new_dom); } /* * The returned access has the same lifetime as @ruleset. */ const struct landlock_rule * landlock_find_rule(const struct landlock_ruleset *const ruleset, const struct landlock_id id) { const struct rb_root *root; const struct rb_node *node; root = get_root((struct landlock_ruleset *)ruleset, id.type); if (IS_ERR(root)) return NULL; node = root->rb_node; while (node) { struct landlock_rule *this = rb_entry(node, struct landlock_rule, node); if (this->key.data == id.key.data) return this; if (this->key.data < id.key.data) node = node->rb_right; else node = node->rb_left; } return NULL; } /** * landlock_unmask_layers - Remove the access rights in @masks * which are granted in @rule * * Updates the set of (per-layer) unfulfilled access rights @masks * so that all the access rights granted in @rule are removed from it * (because they are now fulfilled). * * @rule: A rule that grants a set of access rights for each layer * @masks: A matrix of unfulfilled access rights for each layer * * Return: True if the request is allowed (i.e. the access rights granted all * remaining unfulfilled access rights and masks has no leftover set bits). */ bool landlock_unmask_layers(const struct landlock_rule *const rule, struct layer_access_masks *masks) { if (!masks) return true; if (!rule) return false; /* * An access is granted if, for each policy layer, at least one rule * encountered on the pathwalk grants the requested access, * regardless of its position in the layer stack. We must then check * the remaining layers for each inode, from the first added layer to * the last one. When there is multiple requested accesses, for each * policy layer, the full set of requested accesses may not be granted * by only one rule, but by the union (binary OR) of multiple rules. * E.g. /a/b <execute> + /a <read> => /a/b <execute + read> */ for (size_t i = 0; i < rule->num_layers; i++) { const struct landlock_layer *const layer = &rule->layers[i]; /* Clear the bits where the layer in the rule grants access. */ masks->access[layer->level - 1] &= ~layer->access; } for (size_t i = 0; i < ARRAY_SIZE(masks->access); i++) { if (masks->access[i]) return false; } return true; } typedef access_mask_t get_access_mask_t(const struct landlock_ruleset *const ruleset, const u16 layer_level); /** * landlock_init_layer_masks - Initialize layer masks from an access request * * Populates @masks such that for each access right in @access_request, * the bits for all the layers are set where this access right is handled. * * @domain: The domain that defines the current restrictions. * @access_request: The requested access rights to check. * @masks: Layer access masks to populate. * @key_type: The key type to switch between access masks of different types. * * Return: An access mask where each access right bit is set which is handled * in any of the active layers in @domain. 
*/ access_mask_t landlock_init_layer_masks(const struct landlock_ruleset *const domain, const access_mask_t access_request, struct layer_access_masks *const masks, const enum landlock_key_type key_type) { access_mask_t handled_accesses = 0; get_access_mask_t *get_access_mask; switch (key_type) { case LANDLOCK_KEY_INODE: get_access_mask = landlock_get_fs_access_mask; break; #if IS_ENABLED(CONFIG_INET) case LANDLOCK_KEY_NET_PORT: get_access_mask = landlock_get_net_access_mask; break; #endif /* IS_ENABLED(CONFIG_INET) */ default: WARN_ON_ONCE(1); return 0; } /* An empty access request can happen because of O_WRONLY | O_RDWR. */ if (!access_request) return 0; for (size_t i = 0; i < domain->num_layers; i++) { const access_mask_t handled = get_access_mask(domain, i); masks->access[i] = access_request & handled; handled_accesses |= masks->access[i]; } for (size_t i = domain->num_layers; i < ARRAY_SIZE(masks->access); i++) masks->access[i] = 0; return handled_accesses; } |
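/*
 * Illustrative user-space sketch (not part of the LSM): one way the rulesets
 * built by landlock_create_ruleset()/insert_rule() above are driven through
 * the Landlock syscalls.  The "/usr" hierarchy and the chosen access set are
 * assumptions.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/landlock.h>

int main(void)
{
	struct landlock_ruleset_attr ruleset_attr = {
		/* Handled rights become one access_masks[] layer entry. */
		.handled_access_fs = LANDLOCK_ACCESS_FS_EXECUTE |
				     LANDLOCK_ACCESS_FS_READ_FILE |
				     LANDLOCK_ACCESS_FS_READ_DIR,
	};
	struct landlock_path_beneath_attr path_beneath = {
		.allowed_access = LANDLOCK_ACCESS_FS_EXECUTE |
				  LANDLOCK_ACCESS_FS_READ_FILE |
				  LANDLOCK_ACCESS_FS_READ_DIR,
	};
	int ruleset_fd;

	ruleset_fd = syscall(__NR_landlock_create_ruleset, &ruleset_attr,
			     sizeof(ruleset_attr), 0);
	if (ruleset_fd < 0)
		return 1;

	/* Each added rule becomes a level-0 rule via insert_rule(). */
	path_beneath.parent_fd = open("/usr", O_PATH | O_CLOEXEC);
	syscall(__NR_landlock_add_rule, ruleset_fd, LANDLOCK_RULE_PATH_BENEATH,
		&path_beneath, 0);
	close(path_beneath.parent_fd);

	/* Merging the ruleset into the task's domain stacks a new layer. */
	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	syscall(__NR_landlock_restrict_self, ruleset_fd, 0);
	close(ruleset_fd);
	return 0;
}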
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_KSM_H #define __LINUX_KSM_H /* * Memory merging support. * * This code enables dynamic sharing of identical pages found in different * memory areas, even if they are not shared by fork(). */ #include <linux/bitops.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/sched.h> #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, vm_flags_t *vm_flags); vma_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, vma_flags_t vma_flags); int ksm_enable_merge_any(struct mm_struct *mm); int ksm_disable_merge_any(struct mm_struct *mm); int ksm_disable(struct mm_struct *mm); int __ksm_enter(struct mm_struct *mm); void __ksm_exit(struct mm_struct *mm); /* * To identify zeropages that were mapped by KSM, we reuse the dirty bit * in the PTE. If the PTE is dirty, the zeropage was mapped by KSM when * deduplicating memory. */ #define is_ksm_zero_pte(pte) (is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte)) extern atomic_long_t ksm_zero_pages; static inline void ksm_map_zero_page(struct mm_struct *mm) { atomic_long_inc(&ksm_zero_pages); atomic_long_inc(&mm->ksm_zero_pages); } static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { if (is_ksm_zero_pte(pte)) { atomic_long_dec(&ksm_zero_pages); atomic_long_dec(&mm->ksm_zero_pages); } } static inline long mm_ksm_zero_pages(struct mm_struct *mm) { return atomic_long_read(&mm->ksm_zero_pages); } static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { /* Adding mm to ksm is best effort on fork. */ if (mm_flags_test(MMF_VM_MERGEABLE, oldmm)) { long nr_ksm_zero_pages = atomic_long_read(&mm->ksm_zero_pages); mm->ksm_merging_pages = 0; mm->ksm_rmap_items = 0; atomic_long_add(nr_ksm_zero_pages, &ksm_zero_pages); __ksm_enter(mm); } } static inline int ksm_execve(struct mm_struct *mm) { if (mm_flags_test(MMF_VM_MERGE_ANY, mm)) return __ksm_enter(mm); return 0; } static inline void ksm_exit(struct mm_struct *mm) { if (mm_flags_test(MMF_VM_MERGEABLE, mm)) __ksm_exit(mm); } /* * When do_swap_page() first faults in from swap what used to be a KSM page, * no problem, it will be assigned to this vma's anon_vma; but thereafter, * it might be faulted into a different anon_vma (or perhaps to a different * offset in the same anon_vma). do_swap_page() cannot do all the locking * needed to reconstitute a cross-anon_vma KSM page: for now it has to make * a copy, and leave remerging the pages to a later pass of ksmd. * * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE, * but what if the vma was unmerged while the page was swapped out? 
*/ struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr); void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc); void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early); long ksm_process_profit(struct mm_struct *); bool ksm_process_mergeable(struct mm_struct *mm); #else /* !CONFIG_KSM */ static inline vma_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, vma_flags_t vma_flags) { return vma_flags; } static inline int ksm_disable(struct mm_struct *mm) { return 0; } static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { } static inline int ksm_execve(struct mm_struct *mm) { return 0; } static inline void ksm_exit(struct mm_struct *mm) { } static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { } static inline void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early) { } #ifdef CONFIG_MMU static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, vm_flags_t *vm_flags) { return 0; } static inline struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { return folio; } static inline void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) { } static inline void folio_migrate_ksm(struct folio *newfolio, struct folio *old) { } #endif /* CONFIG_MMU */ #endif /* !CONFIG_KSM */ #endif /* __LINUX_KSM_H */ |
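/*
 * Illustrative user-space sketch (not part of this header): the two ways a
 * process opts into the KSM hooks declared above.  The mapping size is an
 * assumption, and the process-wide opt-in is guarded because older headers
 * may not define it.
 */
#include <string.h>
#include <sys/mman.h>
#include <sys/prctl.h>

int main(void)
{
	size_t len = 2 * 1024 * 1024;	/* hypothetical region size */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;
	memset(buf, 0x5a, len);	/* identical pages are merge candidates */

	/* Per-VMA opt-in: reaches ksm_madvise() and __ksm_enter(). */
	madvise(buf, len, MADV_MERGEABLE);

	/* Process-wide opt-in (MMF_VM_MERGE_ANY), where supported. */
#ifdef PR_SET_MEMORY_MERGE
	prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0);
#endif

	munmap(buf, len);
	return 0;
}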
905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Apple USB BCM5974 (Macbook Air and Penryn Macbook Pro) multitouch driver * * Copyright (C) 2008 Henrik Rydberg (rydberg@euromail.se) * Copyright (C) 2015 John Horan (knasher@gmail.com) * * The USB initialization and package decoding was made by * Scott Shawcroft as part of the touchd user-space driver project: * Copyright (C) 2008 Scott Shawcroft (scott.shawcroft@gmail.com) * * The BCM5974 driver is based on the appletouch driver: * Copyright (C) 2001-2004 Greg Kroah-Hartman (greg@kroah.com) * Copyright (C) 2005 Johannes Berg (johannes@sipsolutions.net) * Copyright (C) 2005 Stelian Pop (stelian@popies.net) * Copyright (C) 2005 Frank Arnold (frank@scirocco-5v-turbo.de) * Copyright (C) 2005 Peter Osterlund (petero2@telia.com) * Copyright (C) 2005 Michael Hanselmann (linux-kernel@hansmi.ch) * Copyright (C) 2006 Nicolas Boichat (nicolas@boichat.ch) */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/usb/input.h> #include <linux/hid.h> #include <linux/mutex.h> #include <linux/input/mt.h> #define USB_VENDOR_ID_APPLE 0x05ac /* MacbookAir, aka wellspring */ #define USB_DEVICE_ID_APPLE_WELLSPRING_ANSI 0x0223 #define USB_DEVICE_ID_APPLE_WELLSPRING_ISO 0x0224 #define USB_DEVICE_ID_APPLE_WELLSPRING_JIS 0x0225 /* MacbookProPenryn, aka wellspring2 */ #define USB_DEVICE_ID_APPLE_WELLSPRING2_ANSI 0x0230 #define USB_DEVICE_ID_APPLE_WELLSPRING2_ISO 0x0231 #define USB_DEVICE_ID_APPLE_WELLSPRING2_JIS 0x0232 /* Macbook5,1 (unibody), aka wellspring3 */ #define USB_DEVICE_ID_APPLE_WELLSPRING3_ANSI 0x0236 #define USB_DEVICE_ID_APPLE_WELLSPRING3_ISO 0x0237 #define USB_DEVICE_ID_APPLE_WELLSPRING3_JIS 0x0238 /* MacbookAir3,2 (unibody), aka wellspring5 */ #define USB_DEVICE_ID_APPLE_WELLSPRING4_ANSI 0x023f #define USB_DEVICE_ID_APPLE_WELLSPRING4_ISO 0x0240 #define USB_DEVICE_ID_APPLE_WELLSPRING4_JIS 0x0241 /* MacbookAir3,1 (unibody), aka wellspring4 */ #define USB_DEVICE_ID_APPLE_WELLSPRING4A_ANSI 0x0242 #define USB_DEVICE_ID_APPLE_WELLSPRING4A_ISO 0x0243 #define USB_DEVICE_ID_APPLE_WELLSPRING4A_JIS 0x0244 /* Macbook8 (unibody, March 2011) */ #define USB_DEVICE_ID_APPLE_WELLSPRING5_ANSI 0x0245 #define USB_DEVICE_ID_APPLE_WELLSPRING5_ISO 0x0246 #define USB_DEVICE_ID_APPLE_WELLSPRING5_JIS 0x0247 /* MacbookAir4,1 (unibody, July 2011) */ #define USB_DEVICE_ID_APPLE_WELLSPRING6A_ANSI 0x0249 #define USB_DEVICE_ID_APPLE_WELLSPRING6A_ISO 0x024a #define USB_DEVICE_ID_APPLE_WELLSPRING6A_JIS 0x024b /* MacbookAir4,2 (unibody, July 2011) */ #define USB_DEVICE_ID_APPLE_WELLSPRING6_ANSI 0x024c #define USB_DEVICE_ID_APPLE_WELLSPRING6_ISO 0x024d #define USB_DEVICE_ID_APPLE_WELLSPRING6_JIS 0x024e /* Macbook8,2 (unibody) */ #define USB_DEVICE_ID_APPLE_WELLSPRING5A_ANSI 0x0252 #define USB_DEVICE_ID_APPLE_WELLSPRING5A_ISO 0x0253 
#define USB_DEVICE_ID_APPLE_WELLSPRING5A_JIS 0x0254 /* MacbookPro10,1 (unibody, June 2012) */ #define USB_DEVICE_ID_APPLE_WELLSPRING7_ANSI 0x0262 #define USB_DEVICE_ID_APPLE_WELLSPRING7_ISO 0x0263 #define USB_DEVICE_ID_APPLE_WELLSPRING7_JIS 0x0264 /* MacbookPro10,2 (unibody, October 2012) */ #define USB_DEVICE_ID_APPLE_WELLSPRING7A_ANSI 0x0259 #define USB_DEVICE_ID_APPLE_WELLSPRING7A_ISO 0x025a #define USB_DEVICE_ID_APPLE_WELLSPRING7A_JIS 0x025b /* MacbookAir6,2 (unibody, June 2013) */ #define USB_DEVICE_ID_APPLE_WELLSPRING8_ANSI 0x0290 #define USB_DEVICE_ID_APPLE_WELLSPRING8_ISO 0x0291 #define USB_DEVICE_ID_APPLE_WELLSPRING8_JIS 0x0292 /* MacbookPro12,1 (2015) */ #define USB_DEVICE_ID_APPLE_WELLSPRING9_ANSI 0x0272 #define USB_DEVICE_ID_APPLE_WELLSPRING9_ISO 0x0273 #define USB_DEVICE_ID_APPLE_WELLSPRING9_JIS 0x0274 #define BCM5974_DEVICE(prod) { \ .match_flags = (USB_DEVICE_ID_MATCH_DEVICE | \ USB_DEVICE_ID_MATCH_INT_CLASS | \ USB_DEVICE_ID_MATCH_INT_PROTOCOL), \ .idVendor = USB_VENDOR_ID_APPLE, \ .idProduct = (prod), \ .bInterfaceClass = USB_INTERFACE_CLASS_HID, \ .bInterfaceProtocol = USB_INTERFACE_PROTOCOL_MOUSE \ } /* table of devices that work with this driver */ static const struct usb_device_id bcm5974_table[] = { /* MacbookAir1.1 */ BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING_ANSI), BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING_ISO), BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING_JIS), /* MacbookProPenryn */ BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING2_ANSI), BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING2_ISO), BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING2_JIS), /* Macbook5,1 */ BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING3_ANSI), BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING3_ISO), BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING3_JIS), /* MacbookAir3,2 */ BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING4_ANSI), BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING4_ISO), BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING4_JIS), /* MacbookAir3,1 */ BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING4A_ANSI), BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING4A_ISO), BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING4A_JIS), /* MacbookPro8 */ BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING5_ANSI), BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING5_ISO), BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING5_JIS), /* MacbookAir4,1 */ BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING6A_ANSI), BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING6A_ISO), BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING6A_JIS), /* MacbookAir4,2 */ BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING6_ANSI), BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING6_ISO), BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING6_JIS), /* MacbookPro8,2 */ BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING5A_ANSI), BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING5A_ISO), BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING5A_JIS), /* MacbookPro10,1 */ BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING7_ANSI), BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING7_ISO), BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING7_JIS), /* MacbookPro10,2 */ BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING7A_ANSI), BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING7A_ISO), BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING7A_JIS), /* MacbookAir6,2 */ BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING8_ANSI), BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING8_ISO), BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING8_JIS), /* MacbookPro12,1 */ BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING9_ANSI), BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING9_ISO), 
BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING9_JIS), /* Terminating entry */ {} }; MODULE_DEVICE_TABLE(usb, bcm5974_table); MODULE_AUTHOR("Henrik Rydberg"); MODULE_DESCRIPTION("Apple USB BCM5974 multitouch driver"); MODULE_LICENSE("GPL"); #define dprintk(level, format, a...)\ { if (debug >= level) printk(KERN_DEBUG format, ##a); } static int debug = 1; module_param(debug, int, 0644); MODULE_PARM_DESC(debug, "Activate debugging output"); /* button data structure */ struct bt_data { u8 unknown1; /* constant */ u8 button; /* left button */ u8 rel_x; /* relative x coordinate */ u8 rel_y; /* relative y coordinate */ }; /* trackpad header types */ enum tp_type { TYPE1, /* plain trackpad */ TYPE2, /* button integrated in trackpad */ TYPE3, /* additional header fields since June 2013 */ TYPE4 /* additional header field for pressure data */ }; /* trackpad finger data offsets, le16-aligned */ #define HEADER_TYPE1 (13 * sizeof(__le16)) #define HEADER_TYPE2 (15 * sizeof(__le16)) #define HEADER_TYPE3 (19 * sizeof(__le16)) #define HEADER_TYPE4 (23 * sizeof(__le16)) /* trackpad button data offsets */ #define BUTTON_TYPE1 0 #define BUTTON_TYPE2 15 #define BUTTON_TYPE3 23 #define BUTTON_TYPE4 31 /* list of device capability bits */ #define HAS_INTEGRATED_BUTTON 1 /* trackpad finger data block size */ #define FSIZE_TYPE1 (14 * sizeof(__le16)) #define FSIZE_TYPE2 (14 * sizeof(__le16)) #define FSIZE_TYPE3 (14 * sizeof(__le16)) #define FSIZE_TYPE4 (15 * sizeof(__le16)) /* offset from header to finger struct */ #define DELTA_TYPE1 (0 * sizeof(__le16)) #define DELTA_TYPE2 (0 * sizeof(__le16)) #define DELTA_TYPE3 (0 * sizeof(__le16)) #define DELTA_TYPE4 (1 * sizeof(__le16)) /* usb control message mode switch data */ #define USBMSG_TYPE1 8, 0x300, 0, 0, 0x1, 0x8 #define USBMSG_TYPE2 8, 0x300, 0, 0, 0x1, 0x8 #define USBMSG_TYPE3 8, 0x300, 0, 0, 0x1, 0x8 #define USBMSG_TYPE4 2, 0x302, 2, 1, 0x1, 0x0 /* Wellspring initialization constants */ #define BCM5974_WELLSPRING_MODE_READ_REQUEST_ID 1 #define BCM5974_WELLSPRING_MODE_WRITE_REQUEST_ID 9 /* trackpad finger structure, le16-aligned */ struct tp_finger { __le16 origin; /* zero when switching track finger */ __le16 abs_x; /* absolute x coodinate */ __le16 abs_y; /* absolute y coodinate */ __le16 rel_x; /* relative x coodinate */ __le16 rel_y; /* relative y coodinate */ __le16 tool_major; /* tool area, major axis */ __le16 tool_minor; /* tool area, minor axis */ __le16 orientation; /* 16384 when point, else 15 bit angle */ __le16 touch_major; /* touch area, major axis */ __le16 touch_minor; /* touch area, minor axis */ __le16 unused[2]; /* zeros */ __le16 pressure; /* pressure on forcetouch touchpad */ __le16 multi; /* one finger: varies, more fingers: constant */ } __attribute__((packed,aligned(2))); /* trackpad finger data size, empirically at least ten fingers */ #define MAX_FINGERS 16 #define MAX_FINGER_ORIENTATION 16384 /* device-specific parameters */ struct bcm5974_param { int snratio; /* signal-to-noise ratio */ int min; /* device minimum reading */ int max; /* device maximum reading */ }; /* device-specific configuration */ struct bcm5974_config { int ansi, iso, jis; /* the product id of this device */ int caps; /* device capability bitmask */ int bt_ep; /* the endpoint of the button interface */ int bt_datalen; /* data length of the button interface */ int tp_ep; /* the endpoint of the trackpad interface */ enum tp_type tp_type; /* type of trackpad interface */ int tp_header; /* bytes in header block */ int tp_datalen; /* data length of the trackpad interface 
*/ int tp_button; /* offset to button data */ int tp_fsize; /* bytes in single finger block */ int tp_delta; /* offset from header to finger struct */ int um_size; /* usb control message length */ int um_req_val; /* usb control message value */ int um_req_idx; /* usb control message index */ int um_switch_idx; /* usb control message mode switch index */ int um_switch_on; /* usb control message mode switch on */ int um_switch_off; /* usb control message mode switch off */ struct bcm5974_param p; /* finger pressure limits */ struct bcm5974_param w; /* finger width limits */ struct bcm5974_param x; /* horizontal limits */ struct bcm5974_param y; /* vertical limits */ struct bcm5974_param o; /* orientation limits */ }; /* logical device structure */ struct bcm5974 { char phys[64]; struct usb_device *udev; /* usb device */ struct usb_interface *intf; /* our interface */ struct input_dev *input; /* input dev */ struct bcm5974_config cfg; /* device configuration */ struct mutex pm_mutex; /* serialize access to open/suspend */ int opened; /* 1: opened, 0: closed */ struct urb *bt_urb; /* button usb request block */ struct bt_data *bt_data; /* button transferred data */ struct urb *tp_urb; /* trackpad usb request block */ u8 *tp_data; /* trackpad transferred data */ const struct tp_finger *index[MAX_FINGERS]; /* finger index data */ struct input_mt_pos pos[MAX_FINGERS]; /* position array */ int slots[MAX_FINGERS]; /* slot assignments */ struct work_struct mode_reset_work; unsigned long last_mode_reset; }; /* trackpad finger block data, le16-aligned */ static const struct tp_finger *get_tp_finger(const struct bcm5974 *dev, int i) { const struct bcm5974_config *c = &dev->cfg; u8 *f_base = dev->tp_data + c->tp_header + c->tp_delta; return (const struct tp_finger *)(f_base + i * c->tp_fsize); } #define DATAFORMAT(type) \ type, \ HEADER_##type, \ HEADER_##type + (MAX_FINGERS) * (FSIZE_##type), \ BUTTON_##type, \ FSIZE_##type, \ DELTA_##type, \ USBMSG_##type /* logical signal quality */ #define SN_PRESSURE 45 /* pressure signal-to-noise ratio */ #define SN_WIDTH 25 /* width signal-to-noise ratio */ #define SN_COORD 250 /* coordinate signal-to-noise ratio */ #define SN_ORIENT 10 /* orientation signal-to-noise ratio */ /* device constants */ static const struct bcm5974_config bcm5974_config_table[] = { { USB_DEVICE_ID_APPLE_WELLSPRING_ANSI, USB_DEVICE_ID_APPLE_WELLSPRING_ISO, USB_DEVICE_ID_APPLE_WELLSPRING_JIS, 0, 0x84, sizeof(struct bt_data), 0x81, DATAFORMAT(TYPE1), { SN_PRESSURE, 0, 256 }, { SN_WIDTH, 0, 2048 }, { SN_COORD, -4824, 5342 }, { SN_COORD, -172, 5820 }, { SN_ORIENT, -MAX_FINGER_ORIENTATION, MAX_FINGER_ORIENTATION } }, { USB_DEVICE_ID_APPLE_WELLSPRING2_ANSI, USB_DEVICE_ID_APPLE_WELLSPRING2_ISO, USB_DEVICE_ID_APPLE_WELLSPRING2_JIS, 0, 0x84, sizeof(struct bt_data), 0x81, DATAFORMAT(TYPE1), { SN_PRESSURE, 0, 256 }, { SN_WIDTH, 0, 2048 }, { SN_COORD, -4824, 4824 }, { SN_COORD, -172, 4290 }, { SN_ORIENT, -MAX_FINGER_ORIENTATION, MAX_FINGER_ORIENTATION } }, { USB_DEVICE_ID_APPLE_WELLSPRING3_ANSI, USB_DEVICE_ID_APPLE_WELLSPRING3_ISO, USB_DEVICE_ID_APPLE_WELLSPRING3_JIS, HAS_INTEGRATED_BUTTON, 0x84, sizeof(struct bt_data), 0x81, DATAFORMAT(TYPE2), { SN_PRESSURE, 0, 300 }, { SN_WIDTH, 0, 2048 }, { SN_COORD, -4460, 5166 }, { SN_COORD, -75, 6700 }, { SN_ORIENT, -MAX_FINGER_ORIENTATION, MAX_FINGER_ORIENTATION } }, { USB_DEVICE_ID_APPLE_WELLSPRING4_ANSI, USB_DEVICE_ID_APPLE_WELLSPRING4_ISO, USB_DEVICE_ID_APPLE_WELLSPRING4_JIS, HAS_INTEGRATED_BUTTON, 0x84, sizeof(struct bt_data), 0x81, DATAFORMAT(TYPE2), { 
SN_PRESSURE, 0, 300 }, { SN_WIDTH, 0, 2048 }, { SN_COORD, -4620, 5140 }, { SN_COORD, -150, 6600 }, { SN_ORIENT, -MAX_FINGER_ORIENTATION, MAX_FINGER_ORIENTATION } }, { USB_DEVICE_ID_APPLE_WELLSPRING4A_ANSI, USB_DEVICE_ID_APPLE_WELLSPRING4A_ISO, USB_DEVICE_ID_APPLE_WELLSPRING4A_JIS, HAS_INTEGRATED_BUTTON, 0x84, sizeof(struct bt_data), 0x81, DATAFORMAT(TYPE2), { SN_PRESSURE, 0, 300 }, { SN_WIDTH, 0, 2048 }, { SN_COORD, -4616, 5112 }, { SN_COORD, -142, 5234 }, { SN_ORIENT, -MAX_FINGER_ORIENTATION, MAX_FINGER_ORIENTATION } }, { USB_DEVICE_ID_APPLE_WELLSPRING5_ANSI, USB_DEVICE_ID_APPLE_WELLSPRING5_ISO, USB_DEVICE_ID_APPLE_WELLSPRING5_JIS, HAS_INTEGRATED_BUTTON, 0x84, sizeof(struct bt_data), 0x81, DATAFORMAT(TYPE2), { SN_PRESSURE, 0, 300 }, { SN_WIDTH, 0, 2048 }, { SN_COORD, -4415, 5050 }, { SN_COORD, -55, 6680 }, { SN_ORIENT, -MAX_FINGER_ORIENTATION, MAX_FINGER_ORIENTATION } }, { USB_DEVICE_ID_APPLE_WELLSPRING6_ANSI, USB_DEVICE_ID_APPLE_WELLSPRING6_ISO, USB_DEVICE_ID_APPLE_WELLSPRING6_JIS, HAS_INTEGRATED_BUTTON, 0x84, sizeof(struct bt_data), 0x81, DATAFORMAT(TYPE2), { SN_PRESSURE, 0, 300 }, { SN_WIDTH, 0, 2048 }, { SN_COORD, -4620, 5140 }, { SN_COORD, -150, 6600 }, { SN_ORIENT, -MAX_FINGER_ORIENTATION, MAX_FINGER_ORIENTATION } }, { USB_DEVICE_ID_APPLE_WELLSPRING5A_ANSI, USB_DEVICE_ID_APPLE_WELLSPRING5A_ISO, USB_DEVICE_ID_APPLE_WELLSPRING5A_JIS, HAS_INTEGRATED_BUTTON, 0x84, sizeof(struct bt_data), 0x81, DATAFORMAT(TYPE2), { SN_PRESSURE, 0, 300 }, { SN_WIDTH, 0, 2048 }, { SN_COORD, -4750, 5280 }, { SN_COORD, -150, 6730 }, { SN_ORIENT, -MAX_FINGER_ORIENTATION, MAX_FINGER_ORIENTATION } }, { USB_DEVICE_ID_APPLE_WELLSPRING6A_ANSI, USB_DEVICE_ID_APPLE_WELLSPRING6A_ISO, USB_DEVICE_ID_APPLE_WELLSPRING6A_JIS, HAS_INTEGRATED_BUTTON, 0x84, sizeof(struct bt_data), 0x81, DATAFORMAT(TYPE2), { SN_PRESSURE, 0, 300 }, { SN_WIDTH, 0, 2048 }, { SN_COORD, -4620, 5140 }, { SN_COORD, -150, 6600 }, { SN_ORIENT, -MAX_FINGER_ORIENTATION, MAX_FINGER_ORIENTATION } }, { USB_DEVICE_ID_APPLE_WELLSPRING7_ANSI, USB_DEVICE_ID_APPLE_WELLSPRING7_ISO, USB_DEVICE_ID_APPLE_WELLSPRING7_JIS, HAS_INTEGRATED_BUTTON, 0x84, sizeof(struct bt_data), 0x81, DATAFORMAT(TYPE2), { SN_PRESSURE, 0, 300 }, { SN_WIDTH, 0, 2048 }, { SN_COORD, -4750, 5280 }, { SN_COORD, -150, 6730 }, { SN_ORIENT, -MAX_FINGER_ORIENTATION, MAX_FINGER_ORIENTATION } }, { USB_DEVICE_ID_APPLE_WELLSPRING7A_ANSI, USB_DEVICE_ID_APPLE_WELLSPRING7A_ISO, USB_DEVICE_ID_APPLE_WELLSPRING7A_JIS, HAS_INTEGRATED_BUTTON, 0x84, sizeof(struct bt_data), 0x81, DATAFORMAT(TYPE2), { SN_PRESSURE, 0, 300 }, { SN_WIDTH, 0, 2048 }, { SN_COORD, -4750, 5280 }, { SN_COORD, -150, 6730 }, { SN_ORIENT, -MAX_FINGER_ORIENTATION, MAX_FINGER_ORIENTATION } }, { USB_DEVICE_ID_APPLE_WELLSPRING8_ANSI, USB_DEVICE_ID_APPLE_WELLSPRING8_ISO, USB_DEVICE_ID_APPLE_WELLSPRING8_JIS, HAS_INTEGRATED_BUTTON, 0, sizeof(struct bt_data), 0x83, DATAFORMAT(TYPE3), { SN_PRESSURE, 0, 300 }, { SN_WIDTH, 0, 2048 }, { SN_COORD, -4620, 5140 }, { SN_COORD, -150, 6600 }, { SN_ORIENT, -MAX_FINGER_ORIENTATION, MAX_FINGER_ORIENTATION } }, { USB_DEVICE_ID_APPLE_WELLSPRING9_ANSI, USB_DEVICE_ID_APPLE_WELLSPRING9_ISO, USB_DEVICE_ID_APPLE_WELLSPRING9_JIS, HAS_INTEGRATED_BUTTON, 0, sizeof(struct bt_data), 0x83, DATAFORMAT(TYPE4), { SN_PRESSURE, 0, 300 }, { SN_WIDTH, 0, 2048 }, { SN_COORD, -4828, 5345 }, { SN_COORD, -203, 6803 }, { SN_ORIENT, -MAX_FINGER_ORIENTATION, MAX_FINGER_ORIENTATION } }, {} }; /* return the device-specific configuration by device */ static const struct bcm5974_config *bcm5974_get_config(struct usb_device *udev) { u16 id = 
le16_to_cpu(udev->descriptor.idProduct); const struct bcm5974_config *cfg; for (cfg = bcm5974_config_table; cfg->ansi; ++cfg) if (cfg->ansi == id || cfg->iso == id || cfg->jis == id) return cfg; return bcm5974_config_table; } /* convert 16-bit little endian to signed integer */ static inline int raw2int(__le16 x) { return (signed short)le16_to_cpu(x); } static void set_abs(struct input_dev *input, unsigned int code, const struct bcm5974_param *p) { int fuzz = p->snratio ? (p->max - p->min) / p->snratio : 0; input_set_abs_params(input, code, p->min, p->max, fuzz, 0); } /* setup which logical events to report */ static void setup_events_to_report(struct input_dev *input_dev, const struct bcm5974_config *cfg) { __set_bit(EV_ABS, input_dev->evbit); /* for synaptics only */ input_set_abs_params(input_dev, ABS_PRESSURE, 0, 256, 5, 0); input_set_abs_params(input_dev, ABS_TOOL_WIDTH, 0, 16, 0, 0); /* finger touch area */ set_abs(input_dev, ABS_MT_TOUCH_MAJOR, &cfg->w); set_abs(input_dev, ABS_MT_TOUCH_MINOR, &cfg->w); /* finger approach area */ set_abs(input_dev, ABS_MT_WIDTH_MAJOR, &cfg->w); set_abs(input_dev, ABS_MT_WIDTH_MINOR, &cfg->w); /* finger orientation */ set_abs(input_dev, ABS_MT_ORIENTATION, &cfg->o); /* finger position */ set_abs(input_dev, ABS_MT_POSITION_X, &cfg->x); set_abs(input_dev, ABS_MT_POSITION_Y, &cfg->y); __set_bit(EV_KEY, input_dev->evbit); __set_bit(BTN_LEFT, input_dev->keybit); if (cfg->caps & HAS_INTEGRATED_BUTTON) __set_bit(INPUT_PROP_BUTTONPAD, input_dev->propbit); input_mt_init_slots(input_dev, MAX_FINGERS, INPUT_MT_POINTER | INPUT_MT_DROP_UNUSED | INPUT_MT_TRACK); } /* report button data as logical button state */ static int report_bt_state(struct bcm5974 *dev, int size) { if (size != sizeof(struct bt_data)) return -EIO; dprintk(7, "bcm5974: button data: %x %x %x %x\n", dev->bt_data->unknown1, dev->bt_data->button, dev->bt_data->rel_x, dev->bt_data->rel_y); input_report_key(dev->input, BTN_LEFT, dev->bt_data->button); input_sync(dev->input); return 0; } static void report_finger_data(struct input_dev *input, int slot, const struct input_mt_pos *pos, const struct tp_finger *f) { input_mt_slot(input, slot); input_mt_report_slot_state(input, MT_TOOL_FINGER, true); input_report_abs(input, ABS_MT_TOUCH_MAJOR, raw2int(f->touch_major) << 1); input_report_abs(input, ABS_MT_TOUCH_MINOR, raw2int(f->touch_minor) << 1); input_report_abs(input, ABS_MT_WIDTH_MAJOR, raw2int(f->tool_major) << 1); input_report_abs(input, ABS_MT_WIDTH_MINOR, raw2int(f->tool_minor) << 1); input_report_abs(input, ABS_MT_ORIENTATION, MAX_FINGER_ORIENTATION - raw2int(f->orientation)); input_report_abs(input, ABS_MT_POSITION_X, pos->x); input_report_abs(input, ABS_MT_POSITION_Y, pos->y); } static void report_synaptics_data(struct input_dev *input, const struct bcm5974_config *cfg, const struct tp_finger *f, int raw_n) { int abs_p = 0, abs_w = 0; if (raw_n) { int p = raw2int(f->touch_major); int w = raw2int(f->tool_major); if (p > 0 && raw2int(f->origin)) { abs_p = clamp_val(256 * p / cfg->p.max, 0, 255); abs_w = clamp_val(16 * w / cfg->w.max, 0, 15); } } input_report_abs(input, ABS_PRESSURE, abs_p); input_report_abs(input, ABS_TOOL_WIDTH, abs_w); } /* report trackpad data as logical trackpad state */ static int report_tp_state(struct bcm5974 *dev, int size) { const struct bcm5974_config *c = &dev->cfg; const struct tp_finger *f; struct input_dev *input = dev->input; int raw_n, i, n = 0; if (size < c->tp_header || (size - c->tp_header) % c->tp_fsize != 0) return -EIO; raw_n = (size - c->tp_header) / 
c->tp_fsize; for (i = 0; i < raw_n; i++) { f = get_tp_finger(dev, i); if (raw2int(f->touch_major) == 0) continue; dev->pos[n].x = raw2int(f->abs_x); dev->pos[n].y = c->y.min + c->y.max - raw2int(f->abs_y); dev->index[n++] = f; } input_mt_assign_slots(input, dev->slots, dev->pos, n, 0); for (i = 0; i < n; i++) report_finger_data(input, dev->slots[i], &dev->pos[i], dev->index[i]); input_mt_sync_frame(input); report_synaptics_data(input, c, get_tp_finger(dev, 0), raw_n); /* later types report button events via integrated button only */ if (c->caps & HAS_INTEGRATED_BUTTON) { int ibt = raw2int(dev->tp_data[c->tp_button]); input_report_key(input, BTN_LEFT, ibt); } input_sync(input); return 0; } static int bcm5974_wellspring_mode(struct bcm5974 *dev, bool on) { const struct bcm5974_config *c = &dev->cfg; int retval = 0, size; char *data; /* Type 3 does not require a mode switch */ if (c->tp_type == TYPE3) return 0; data = kmalloc(c->um_size, GFP_KERNEL); if (!data) { dev_err(&dev->intf->dev, "out of memory\n"); retval = -ENOMEM; goto out; } /* read configuration */ size = usb_control_msg(dev->udev, usb_rcvctrlpipe(dev->udev, 0), BCM5974_WELLSPRING_MODE_READ_REQUEST_ID, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE, c->um_req_val, c->um_req_idx, data, c->um_size, 5000); if (size != c->um_size) { dev_err(&dev->intf->dev, "could not read from device\n"); retval = -EIO; goto out; } /* apply the mode switch */ data[c->um_switch_idx] = on ? c->um_switch_on : c->um_switch_off; /* write configuration */ size = usb_control_msg(dev->udev, usb_sndctrlpipe(dev->udev, 0), BCM5974_WELLSPRING_MODE_WRITE_REQUEST_ID, USB_DIR_OUT | USB_TYPE_CLASS | USB_RECIP_INTERFACE, c->um_req_val, c->um_req_idx, data, c->um_size, 5000); if (size != c->um_size) { dev_err(&dev->intf->dev, "could not write to device\n"); retval = -EIO; goto out; } dprintk(2, "bcm5974: switched to %s mode.\n", on ? "wellspring" : "normal"); out: kfree(data); return retval; } /* * Mode switches sent before the control response are ignored. * Fixing this state requires switching to normal mode and waiting * about 1ms before switching back to wellspring mode. 
*/ static void bcm5974_mode_reset_work(struct work_struct *work) { struct bcm5974 *dev = container_of(work, struct bcm5974, mode_reset_work); int error; guard(mutex)(&dev->pm_mutex); dev->last_mode_reset = jiffies; error = bcm5974_wellspring_mode(dev, false); if (error) { dev_err(&dev->intf->dev, "reset to normal mode failed\n"); return; } fsleep(1000); error = bcm5974_wellspring_mode(dev, true); if (error) dev_err(&dev->intf->dev, "mode switch after reset failed\n"); } static void bcm5974_irq_button(struct urb *urb) { struct bcm5974 *dev = urb->context; struct usb_interface *intf = dev->intf; int error; switch (urb->status) { case 0: break; case -EOVERFLOW: case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: dev_dbg(&intf->dev, "button urb shutting down: %d\n", urb->status); return; default: dev_dbg(&intf->dev, "button urb status: %d\n", urb->status); goto exit; } if (report_bt_state(dev, dev->bt_urb->actual_length)) dprintk(1, "bcm5974: bad button package, length: %d\n", dev->bt_urb->actual_length); exit: error = usb_submit_urb(dev->bt_urb, GFP_ATOMIC); if (error) dev_err(&intf->dev, "button urb failed: %d\n", error); } static void bcm5974_irq_trackpad(struct urb *urb) { struct bcm5974 *dev = urb->context; struct usb_interface *intf = dev->intf; int error; switch (urb->status) { case 0: break; case -EOVERFLOW: case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: dev_dbg(&intf->dev, "trackpad urb shutting down: %d\n", urb->status); return; default: dev_dbg(&intf->dev, "trackpad urb status: %d\n", urb->status); goto exit; } /* control response ignored */ if (dev->tp_urb->actual_length == 2) goto exit; if (report_tp_state(dev, dev->tp_urb->actual_length)) { dprintk(1, "bcm5974: bad trackpad package, length: %d\n", dev->tp_urb->actual_length); /* * Receiving a HID packet means we aren't in wellspring mode. * If we haven't tried a reset in the last second, try now. */ if (dev->tp_urb->actual_length == 8 && time_after(jiffies, dev->last_mode_reset + msecs_to_jiffies(1000))) { schedule_work(&dev->mode_reset_work); } } exit: error = usb_submit_urb(dev->tp_urb, GFP_ATOMIC); if (error) dev_err(&intf->dev, "trackpad urb failed: %d\n", error); } /* * The Wellspring trackpad, like many recent Apple trackpads, share * the usb device with the keyboard. Since keyboards are usually * handled by the HID system, the device ends up being handled by two * modules. Setting up the device therefore becomes slightly * complicated. To enable multitouch features, a mode switch is * required, which is usually applied via the control interface of the * device. It can be argued where this switch should take place. In * some drivers, like appletouch, the switch is made during * probe. However, the hid module may also alter the state of the * device, resulting in trackpad malfunction under certain * circumstances. To get around this problem, there is at least one * example that utilizes the USB_QUIRK_RESET_RESUME quirk in order to * receive a reset_resume request rather than the normal resume. * Since the implementation of reset_resume is equal to mode switch * plus start_traffic, it seems easier to always do the switch when * starting traffic on the device. 
*/ static int bcm5974_start_traffic(struct bcm5974 *dev) { int error; error = bcm5974_wellspring_mode(dev, true); if (error) { dprintk(1, "bcm5974: mode switch failed\n"); goto err_out; } if (dev->bt_urb) { error = usb_submit_urb(dev->bt_urb, GFP_KERNEL); if (error) goto err_reset_mode; } error = usb_submit_urb(dev->tp_urb, GFP_KERNEL); if (error) goto err_kill_bt; return 0; err_kill_bt: usb_kill_urb(dev->bt_urb); err_reset_mode: bcm5974_wellspring_mode(dev, false); err_out: return error; } static void bcm5974_pause_traffic(struct bcm5974 *dev) { usb_kill_urb(dev->tp_urb); usb_kill_urb(dev->bt_urb); bcm5974_wellspring_mode(dev, false); } /* * The code below implements open/close and manual suspend/resume. * All functions may be called in random order. * * Opening a suspended device fails with EACCES - permission denied. * * Failing a resume leaves the device resumed but closed. */ static int bcm5974_open(struct input_dev *input) { struct bcm5974 *dev = input_get_drvdata(input); int error; error = usb_autopm_get_interface(dev->intf); if (error) return error; scoped_guard(mutex, &dev->pm_mutex) { error = bcm5974_start_traffic(dev); if (!error) dev->opened = 1; } if (error) usb_autopm_put_interface(dev->intf); return error; } static void bcm5974_close(struct input_dev *input) { struct bcm5974 *dev = input_get_drvdata(input); scoped_guard(mutex, &dev->pm_mutex) { bcm5974_pause_traffic(dev); dev->opened = 0; } usb_autopm_put_interface(dev->intf); } static int bcm5974_suspend(struct usb_interface *iface, pm_message_t message) { struct bcm5974 *dev = usb_get_intfdata(iface); guard(mutex)(&dev->pm_mutex); if (dev->opened) bcm5974_pause_traffic(dev); return 0; } static int bcm5974_resume(struct usb_interface *iface) { struct bcm5974 *dev = usb_get_intfdata(iface); guard(mutex)(&dev->pm_mutex); if (dev->opened) return bcm5974_start_traffic(dev); return 0; } static int bcm5974_probe(struct usb_interface *iface, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev(iface); const struct bcm5974_config *cfg; struct bcm5974 *dev; struct input_dev *input_dev; int error = -ENOMEM; /* find the product index */ cfg = bcm5974_get_config(udev); /* allocate memory for our device state and initialize it */ dev = kzalloc_obj(*dev); input_dev = input_allocate_device(); if (!dev || !input_dev) { dev_err(&iface->dev, "out of memory\n"); goto err_free_devs; } dev->udev = udev; dev->intf = iface; dev->input = input_dev; dev->cfg = *cfg; INIT_WORK(&dev->mode_reset_work, bcm5974_mode_reset_work); mutex_init(&dev->pm_mutex); /* setup urbs */ if (cfg->tp_type == TYPE1) { dev->bt_urb = usb_alloc_urb(0, GFP_KERNEL); if (!dev->bt_urb) goto err_free_devs; } dev->tp_urb = usb_alloc_urb(0, GFP_KERNEL); if (!dev->tp_urb) goto err_free_bt_urb; if (dev->bt_urb) { dev->bt_data = usb_alloc_coherent(dev->udev, dev->cfg.bt_datalen, GFP_KERNEL, &dev->bt_urb->transfer_dma); if (!dev->bt_data) goto err_free_urb; } dev->tp_data = usb_alloc_coherent(dev->udev, dev->cfg.tp_datalen, GFP_KERNEL, &dev->tp_urb->transfer_dma); if (!dev->tp_data) goto err_free_bt_buffer; if (dev->bt_urb) { usb_fill_int_urb(dev->bt_urb, udev, usb_rcvintpipe(udev, cfg->bt_ep), dev->bt_data, dev->cfg.bt_datalen, bcm5974_irq_button, dev, 1); dev->bt_urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; } usb_fill_int_urb(dev->tp_urb, udev, usb_rcvintpipe(udev, cfg->tp_ep), dev->tp_data, dev->cfg.tp_datalen, bcm5974_irq_trackpad, dev, 1); dev->tp_urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; /* create bcm5974 device */ usb_make_path(udev, 
dev->phys, sizeof(dev->phys)); strlcat(dev->phys, "/input0", sizeof(dev->phys)); input_dev->name = "bcm5974"; input_dev->phys = dev->phys; usb_to_input_id(dev->udev, &input_dev->id); /* report driver capabilities via the version field */ input_dev->id.version = cfg->caps; input_dev->dev.parent = &iface->dev; input_set_drvdata(input_dev, dev); input_dev->open = bcm5974_open; input_dev->close = bcm5974_close; setup_events_to_report(input_dev, cfg); error = input_register_device(dev->input); if (error) goto err_free_buffer; /* save our data pointer in this interface device */ usb_set_intfdata(iface, dev); return 0; err_free_buffer: usb_free_coherent(dev->udev, dev->cfg.tp_datalen, dev->tp_data, dev->tp_urb->transfer_dma); err_free_bt_buffer: if (dev->bt_urb) usb_free_coherent(dev->udev, dev->cfg.bt_datalen, dev->bt_data, dev->bt_urb->transfer_dma); err_free_urb: usb_free_urb(dev->tp_urb); err_free_bt_urb: usb_free_urb(dev->bt_urb); err_free_devs: usb_set_intfdata(iface, NULL); input_free_device(input_dev); kfree(dev); return error; } static void bcm5974_disconnect(struct usb_interface *iface) { struct bcm5974 *dev = usb_get_intfdata(iface); disable_work_sync(&dev->mode_reset_work); usb_set_intfdata(iface, NULL); input_unregister_device(dev->input); usb_free_coherent(dev->udev, dev->cfg.tp_datalen, dev->tp_data, dev->tp_urb->transfer_dma); if (dev->bt_urb) usb_free_coherent(dev->udev, dev->cfg.bt_datalen, dev->bt_data, dev->bt_urb->transfer_dma); usb_free_urb(dev->tp_urb); usb_free_urb(dev->bt_urb); kfree(dev); } static struct usb_driver bcm5974_driver = { .name = "bcm5974", .probe = bcm5974_probe, .disconnect = bcm5974_disconnect, .suspend = bcm5974_suspend, .resume = bcm5974_resume, .id_table = bcm5974_table, .supports_autosuspend = 1, }; module_usb_driver(bcm5974_driver); |
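/*
 * Editor's sketch (illustrative only, not part of the driver above): set_abs()
 * turns each bcm5974_param into input axis limits, deriving the fuzz value
 * from the configured signal-to-noise ratio as fuzz = (max - min) / snratio.
 * For the Wellspring 7 X axis { SN_COORD, -4750, 5280 } with SN_COORD = 250
 * that works out to (5280 - -4750) / 250 = 40.  The small userspace program
 * below just replays that arithmetic; example_fuzz() is a made-up name and
 * does not exist in the kernel.
 */
#include <stdio.h>

static int example_fuzz(int snratio, int min, int max)
{
	/* same formula as set_abs() in the driver */
	return snratio ? (max - min) / snratio : 0;
}

int main(void)
{
	printf("wellspring7 x fuzz = %d\n", example_fuzz(250, -4750, 5280)); /* 40 */
	printf("pressure fuzz      = %d\n", example_fuzz(45, 0, 300));       /* 6 */
	return 0;
}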
// SPDX-License-Identifier: GPL-2.0-or-later /* * Linux NET3: GRE over IP protocol decoder. * * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/if_arp.h> #include <linux/if_vlan.h> #include <linux/init.h> #include <linux/in6.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/netfilter_ipv4.h> #include <linux/etherdevice.h> #include <linux/if_ether.h> #include <net/flow.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/protocol.h> #include <net/ip_tunnels.h> #include <net/arp.h> #include <net/checksum.h> #include <net/dsfield.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/gre.h> #include <net/dst_metadata.h> #include <net/erspan.h> /* Problems & solutions -------------------- 1. The most important issue is detecting local dead loops. They would cause complete host lockup in transmit, which would be "resolved" by stack overflow or, if queueing is enabled, with infinite looping in net_bh. We cannot track such dead loops during route installation, it is infeasible task. The most general solutions would be to keep skb->encapsulation counter (sort of local ttl), and silently drop packet when it expires. It is a good solution, but it supposes maintaining new variable in ALL skb, even if no tunneling is used. Current solution: xmit_recursion breaks dead loops. This is a percpu counter, since when we enter the first ndo_xmit(), cpu migration is forbidden. We force an exit if this counter reaches RECURSION_LIMIT 2. Networking dead loops would not kill routers, but would really kill network. IP hop limit plays role of "t->recursion" in this case, if we copy it from packet being encapsulated to upper header. It is very good solution, but it introduces two problems: - Routing protocols, using packets with ttl=1 (OSPF, RIP2), do not work over tunnels. - traceroute does not work.
I planned to relay ICMP from tunnel, so that this problem would be solved and traceroute output would even more informative. This idea appeared to be wrong: only Linux complies to rfc1812 now (yes, guys, Linux is the only true router now :-)), all routers (at least, in neighbourhood of mine) return only 8 bytes of payload. It is the end. Hence, if we want that OSPF worked or traceroute said something reasonable, we should search for another solution. One of them is to parse packet trying to detect inner encapsulation made by our node. It is difficult or even impossible, especially, taking into account fragmentation. TO be short, ttl is not solution at all. Current solution: The solution was UNEXPECTEDLY SIMPLE. We force DF flag on tunnels with preconfigured hop limit, that is ALL. :-) Well, it does not remove the problem completely, but exponential growth of network traffic is changed to linear (branches, that exceed pmtu are pruned) and tunnel mtu rapidly degrades to value <68, where looping stops. Yes, it is not good if there exists a router in the loop, which does not force DF, even when encapsulating packets have DF set. But it is not our problem! Nobody could accuse us, we made all that we could make. Even if it is your gated who injected fatal route to network, even if it were you who configured fatal static route: you are innocent. :-) Alexey Kuznetsov. */ static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static struct rtnl_link_ops ipgre_link_ops __read_mostly; static const struct header_ops ipgre_header_ops; static int ipgre_tunnel_init(struct net_device *dev); static void erspan_build_header(struct sk_buff *skb, u32 id, u32 index, bool truncate, bool is_ipv4); static unsigned int ipgre_net_id __read_mostly; static unsigned int gre_tap_net_id __read_mostly; static unsigned int erspan_net_id __read_mostly; static int ipgre_err(struct sk_buff *skb, u32 info, const struct tnl_ptk_info *tpi) { /* All the routers (except for Linux) return only 8 bytes of packet payload. It means, that precise relaying of ICMP in the real Internet is absolutely infeasible. Moreover, Cisco "wise men" put GRE key to the third word in GRE header. It makes impossible maintaining even soft state for keyed GRE tunnels with enabled checksum. Tell them "thank you". Well, I wonder, rfc1812 was written by Cisco employee, what the hell these idiots break standards established by themselves??? */ struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn; const struct iphdr *iph; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; if (tpi->proto == htons(ETH_P_TEB)) itn = net_generic(net, gre_tap_net_id); else if (tpi->proto == htons(ETH_P_ERSPAN) || tpi->proto == htons(ETH_P_ERSPAN2)) itn = net_generic(net, erspan_net_id); else itn = net_generic(net, ipgre_net_id); iph = (const struct iphdr *)(icmp_hdr(skb) + 1); t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, iph->daddr, iph->saddr, tpi->key); if (!t) return -ENOENT; switch (type) { default: case ICMP_PARAMETERPROB: return 0; case ICMP_DEST_UNREACH: switch (code) { case ICMP_SR_FAILED: case ICMP_PORT_UNREACH: /* Impossible event. */ return 0; default: /* All others are translated to HOST_UNREACH. rfc2003 contains "deep thoughts" about NET_UNREACH, I believe they are just ether pollution. 
--ANK */ break; } break; case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) return 0; break; case ICMP_REDIRECT: break; } #if IS_ENABLED(CONFIG_IPV6) if (tpi->proto == htons(ETH_P_IPV6)) { unsigned int data_len = 0; if (type == ICMP_TIME_EXCEEDED) data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ if (!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, type, data_len)) return 0; } #endif if (t->parms.iph.daddr == 0 || ipv4_is_multicast(t->parms.iph.daddr)) return 0; if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) return 0; if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; t->err_time = jiffies; return 0; } static void gre_err(struct sk_buff *skb, u32 info) { /* All the routers (except for Linux) return only * 8 bytes of packet payload. It means, that precise relaying of * ICMP in the real Internet is absolutely infeasible. * * Moreover, Cisco "wise men" put GRE key to the third word * in GRE header. It makes impossible maintaining even soft * state for keyed * GRE tunnels with enabled checksum. Tell them "thank you". * * Well, I wonder, rfc1812 was written by Cisco employee, * what the hell these idiots break standards established * by themselves??? */ const struct iphdr *iph = (struct iphdr *)skb->data; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct tnl_ptk_info tpi; if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IP), iph->ihl * 4) < 0) return; if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, dev_net(skb->dev), info, skb->dev->ifindex, IPPROTO_GRE); return; } if (type == ICMP_REDIRECT) { ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, IPPROTO_GRE); return; } ipgre_err(skb, info, &tpi); } static bool is_erspan_type1(int gre_hdr_len) { /* Both ERSPAN type I (version 0) and type II (version 1) use * protocol 0x88BE, but the type I has only 4-byte GRE header, * while type II has 8-byte. 
*/ return gre_hdr_len == 4; } static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, int gre_hdr_len) { struct net *net = dev_net(skb->dev); struct metadata_dst *tun_dst = NULL; struct erspan_base_hdr *ershdr; IP_TUNNEL_DECLARE_FLAGS(flags); struct ip_tunnel_net *itn; struct ip_tunnel *tunnel; const struct iphdr *iph; struct erspan_md2 *md2; int ver; int len; ip_tunnel_flags_copy(flags, tpi->flags); itn = net_generic(net, erspan_net_id); iph = ip_hdr(skb); if (is_erspan_type1(gre_hdr_len)) { ver = 0; __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr, iph->daddr, 0); } else { if (unlikely(!pskb_may_pull(skb, gre_hdr_len + sizeof(*ershdr)))) return PACKET_REJECT; ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); ver = ershdr->ver; iph = ip_hdr(skb); __set_bit(IP_TUNNEL_KEY_BIT, flags); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr, iph->daddr, tpi->key); } if (tunnel) { if (is_erspan_type1(gre_hdr_len)) len = gre_hdr_len; else len = gre_hdr_len + erspan_hdr_len(ver); if (unlikely(!pskb_may_pull(skb, len))) return PACKET_REJECT; if (__iptunnel_pull_header(skb, len, htons(ETH_P_TEB), false, false) < 0) goto drop; if (tunnel->collect_md) { struct erspan_metadata *pkt_md, *md; struct ip_tunnel_info *info; unsigned char *gh; __be64 tun_id; __set_bit(IP_TUNNEL_KEY_BIT, tpi->flags); ip_tunnel_flags_copy(flags, tpi->flags); tun_id = key32_to_tunnel_id(tpi->key); tun_dst = ip_tun_rx_dst(skb, flags, tun_id, sizeof(*md)); if (!tun_dst) return PACKET_REJECT; /* MUST set options_len before referencing options */ info = &tun_dst->u.tun_info; info->options_len = sizeof(*md); /* skb can be uncloned in __iptunnel_pull_header, so * old pkt_md is no longer valid and we need to reset * it */ gh = skb_network_header(skb) + skb_network_header_len(skb); pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len + sizeof(*ershdr)); md = ip_tunnel_info_opts(&tun_dst->u.tun_info); md->version = ver; md2 = &md->u.md2; memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE : ERSPAN_V2_MDSIZE); __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags); } skb_reset_mac_header(skb); ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); return PACKET_RCVD; } return PACKET_REJECT; drop: kfree_skb(skb); return PACKET_RCVD; } static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct ip_tunnel_net *itn, int hdr_len, bool raw_proto) { struct metadata_dst *tun_dst = NULL; const struct iphdr *iph; struct ip_tunnel *tunnel; iph = ip_hdr(skb); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, iph->saddr, iph->daddr, tpi->key); if (tunnel) { const struct iphdr *tnl_params; if (__iptunnel_pull_header(skb, hdr_len, tpi->proto, raw_proto, false) < 0) goto drop; /* Special case for ipgre_header_parse(), which expects the * mac_header to point to the outer IP header. 
*/ if (tunnel->dev->header_ops == &ipgre_header_ops) skb_pop_mac_header(skb); else skb_reset_mac_header(skb); tnl_params = &tunnel->parms.iph; if (tunnel->collect_md || tnl_params->daddr == 0) { IP_TUNNEL_DECLARE_FLAGS(flags) = { }; __be64 tun_id; __set_bit(IP_TUNNEL_CSUM_BIT, flags); __set_bit(IP_TUNNEL_KEY_BIT, flags); ip_tunnel_flags_and(flags, tpi->flags, flags); tun_id = key32_to_tunnel_id(tpi->key); tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0); if (!tun_dst) return PACKET_REJECT; } ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); return PACKET_RCVD; } return PACKET_NEXT; drop: kfree_skb(skb); return PACKET_RCVD; } static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, int hdr_len) { struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn; int res; if (tpi->proto == htons(ETH_P_TEB)) itn = net_generic(net, gre_tap_net_id); else itn = net_generic(net, ipgre_net_id); res = __ipgre_rcv(skb, tpi, itn, hdr_len, false); if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) { /* ipgre tunnels in collect metadata mode should receive * also ETH_P_TEB traffic. */ itn = net_generic(net, ipgre_net_id); res = __ipgre_rcv(skb, tpi, itn, hdr_len, true); } return res; } static int gre_rcv(struct sk_buff *skb) { struct tnl_ptk_info tpi; bool csum_err = false; int hdr_len; #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(ip_hdr(skb)->daddr)) { /* Looped back packet, drop it! */ if (rt_is_output_route(skb_rtable(skb))) goto drop; } #endif hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0); if (hdr_len < 0) goto drop; if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) || tpi.proto == htons(ETH_P_ERSPAN2))) { if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) return 0; goto out; } if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) return 0; out: icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb(skb); return 0; } static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, __be16 proto) { struct ip_tunnel *tunnel = netdev_priv(dev); IP_TUNNEL_DECLARE_FLAGS(flags); ip_tunnel_flags_copy(flags, tunnel->parms.o_flags); /* Push GRE header. */ gre_build_header(skb, tunnel->tun_hlen, flags, proto, tunnel->parms.o_key, test_bit(IP_TUNNEL_SEQ_BIT, flags) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); } static int gre_handle_offloads(struct sk_buff *skb, bool csum) { return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, __be16 proto) { struct ip_tunnel *tunnel = netdev_priv(dev); IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; int tunnel_hlen; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || ip_tunnel_info_af(tun_info) != AF_INET)) goto err_free_skb; key = &tun_info->key; tunnel_hlen = gre_calc_hlen(key->tun_flags); if (skb_cow_head(skb, dev->needed_headroom)) goto err_free_skb; /* Push Tunnel header. */ if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags))) goto err_free_skb; __set_bit(IP_TUNNEL_CSUM_BIT, flags); __set_bit(IP_TUNNEL_KEY_BIT, flags); __set_bit(IP_TUNNEL_SEQ_BIT, flags); ip_tunnel_flags_and(flags, tun_info->key.tun_flags, flags); gre_build_header(skb, tunnel_hlen, flags, proto, tunnel_id_to_key32(tun_info->key.tun_id), test_bit(IP_TUNNEL_SEQ_BIT, flags) ? 
htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen); return; err_free_skb: kfree_skb(skb); DEV_STATS_INC(dev, tx_dropped); } static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; struct erspan_metadata *md; bool truncate = false; __be16 proto; int tunnel_hlen; int version; int nhoff; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || ip_tunnel_info_af(tun_info) != AF_INET)) goto err_free_skb; key = &tun_info->key; if (!test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_info->key.tun_flags)) goto err_free_skb; if (tun_info->options_len < sizeof(*md)) goto err_free_skb; md = ip_tunnel_info_opts(tun_info); /* ERSPAN has fixed 8 byte GRE header */ version = md->version; tunnel_hlen = 8 + erspan_hdr_len(version); if (skb_cow_head(skb, dev->needed_headroom)) goto err_free_skb; if (gre_handle_offloads(skb, false)) goto err_free_skb; if (skb->len > dev->mtu + dev->hard_header_len) { if (pskb_trim(skb, dev->mtu + dev->hard_header_len)) goto err_free_skb; truncate = true; } nhoff = skb_network_offset(skb); if (skb->protocol == htons(ETH_P_IP) && (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff)) truncate = true; if (skb->protocol == htons(ETH_P_IPV6)) { int thoff; if (skb_transport_header_was_set(skb)) thoff = skb_transport_offset(skb); else thoff = nhoff + sizeof(struct ipv6hdr); if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff) truncate = true; } if (version == 1) { erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)), ntohl(md->u.index), truncate, true); proto = htons(ETH_P_ERSPAN); } else if (version == 2) { erspan_build_header_v2(skb, ntohl(tunnel_id_to_key32(key->tun_id)), md->u.md2.dir, get_hwid(&md->u.md2), truncate, true); proto = htons(ETH_P_ERSPAN2); } else { goto err_free_skb; } __set_bit(IP_TUNNEL_SEQ_BIT, flags); gre_build_header(skb, 8, flags, proto, 0, htonl(atomic_fetch_inc(&tunnel->o_seqno))); ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen); return; err_free_skb: kfree_skb(skb); DEV_STATS_INC(dev, tx_dropped); } static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct ip_tunnel_info *info = skb_tunnel_info(skb); const struct ip_tunnel_key *key; struct rtable *rt; struct flowi4 fl4; if (ip_tunnel_info_af(info) != AF_INET) return -EINVAL; key = &info->key; ip_tunnel_init_flow(&fl4, IPPROTO_GRE, key->u.ipv4.dst, key->u.ipv4.src, tunnel_id_to_key32(key->tun_id), key->tos & ~INET_ECN_MASK, dev_net(dev), 0, skb->mark, skb_get_hash(skb), key->flow_flags); rt = ip_route_output_key(dev_net(dev), &fl4); if (IS_ERR(rt)) return PTR_ERR(rt); ip_rt_put(rt); info->key.u.ipv4.src = fl4.saddr; return 0; } static netdev_tx_t ipgre_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tnl_params; if (!pskb_inet_may_pull(skb)) goto free_skb; if (tunnel->collect_md) { gre_fb_xmit(skb, dev, skb->protocol); return NETDEV_TX_OK; } if (dev->header_ops) { int pull_len = tunnel->hlen + sizeof(struct iphdr); if (skb_cow_head(skb, 0)) goto free_skb; if (!pskb_may_pull(skb, pull_len)) goto free_skb; tnl_params = (const struct iphdr *)skb->data; /* ip_tunnel_xmit() needs skb->data pointing to gre header. 
*/ skb_pull(skb, pull_len); skb_reset_mac_header(skb); if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_start(skb) < skb->data) goto free_skb; } else { if (skb_cow_head(skb, dev->needed_headroom)) goto free_skb; tnl_params = &tunnel->parms.iph; } if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags))) goto free_skb; __gre_xmit(skb, dev, tnl_params, skb->protocol); return NETDEV_TX_OK; free_skb: kfree_skb(skb); DEV_STATS_INC(dev, tx_dropped); return NETDEV_TX_OK; } static netdev_tx_t erspan_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); bool truncate = false; __be16 proto; if (!pskb_inet_may_pull(skb)) goto free_skb; if (tunnel->collect_md) { erspan_fb_xmit(skb, dev); return NETDEV_TX_OK; } if (gre_handle_offloads(skb, false)) goto free_skb; if (skb_cow_head(skb, dev->needed_headroom)) goto free_skb; if (skb->len > dev->mtu + dev->hard_header_len) { if (pskb_trim(skb, dev->mtu + dev->hard_header_len)) goto free_skb; truncate = true; } /* Push ERSPAN header */ if (tunnel->erspan_ver == 0) { proto = htons(ETH_P_ERSPAN); __clear_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags); } else if (tunnel->erspan_ver == 1) { erspan_build_header(skb, ntohl(tunnel->parms.o_key), tunnel->index, truncate, true); proto = htons(ETH_P_ERSPAN); } else if (tunnel->erspan_ver == 2) { erspan_build_header_v2(skb, ntohl(tunnel->parms.o_key), tunnel->dir, tunnel->hwid, truncate, true); proto = htons(ETH_P_ERSPAN2); } else { goto free_skb; } __clear_bit(IP_TUNNEL_KEY_BIT, tunnel->parms.o_flags); __gre_xmit(skb, dev, &tunnel->parms.iph, proto); return NETDEV_TX_OK; free_skb: kfree_skb(skb); DEV_STATS_INC(dev, tx_dropped); return NETDEV_TX_OK; } static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); if (!pskb_inet_may_pull(skb)) goto free_skb; if (tunnel->collect_md) { gre_fb_xmit(skb, dev, htons(ETH_P_TEB)); return NETDEV_TX_OK; } if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags))) goto free_skb; if (skb_cow_head(skb, dev->needed_headroom)) goto free_skb; __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB)); return NETDEV_TX_OK; free_skb: kfree_skb(skb); DEV_STATS_INC(dev, tx_dropped); return NETDEV_TX_OK; } static void ipgre_link_update(struct net_device *dev, bool set_mtu) { struct ip_tunnel *tunnel = netdev_priv(dev); int len; len = tunnel->tun_hlen; tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); len = tunnel->tun_hlen - len; tunnel->hlen = tunnel->hlen + len; if (dev->header_ops) dev->hard_header_len += len; else dev->needed_headroom += len; if (set_mtu) WRITE_ONCE(dev->mtu, max_t(int, dev->mtu - len, 68)); if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags) || (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags) && tunnel->encap.type != TUNNEL_ENCAP_NONE)) { dev->features &= ~NETIF_F_GSO_SOFTWARE; dev->hw_features &= ~NETIF_F_GSO_SOFTWARE; } else { dev->features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_GSO_SOFTWARE; } } static int ipgre_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) { __be16 i_flags, o_flags; int err; if (!ip_tunnel_flags_is_be16_compat(p->i_flags) || !ip_tunnel_flags_is_be16_compat(p->o_flags)) return -EOVERFLOW; i_flags = ip_tunnel_flags_to_be16(p->i_flags); o_flags = ip_tunnel_flags_to_be16(p->o_flags); if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { if (p->iph.version != 4 || p->iph.protocol != IPPROTO_GRE || p->iph.ihl != 5 || (p->iph.frag_off & 
htons(~IP_DF)) || ((i_flags | o_flags) & (GRE_VERSION | GRE_ROUTING))) return -EINVAL; } gre_flags_to_tnl_flags(p->i_flags, i_flags); gre_flags_to_tnl_flags(p->o_flags, o_flags); err = ip_tunnel_ctl(dev, p, cmd); if (err) return err; if (cmd == SIOCCHGTUNNEL) { struct ip_tunnel *t = netdev_priv(dev); ip_tunnel_flags_copy(t->parms.i_flags, p->i_flags); ip_tunnel_flags_copy(t->parms.o_flags, p->o_flags); if (strcmp(dev->rtnl_link_ops->kind, "erspan")) ipgre_link_update(dev, true); } i_flags = gre_tnl_flags_to_gre_flags(p->i_flags); ip_tunnel_flags_from_be16(p->i_flags, i_flags); o_flags = gre_tnl_flags_to_gre_flags(p->o_flags); ip_tunnel_flags_from_be16(p->o_flags, o_flags); return 0; } /* Nice toy. Unfortunately, useless in real life :-) It allows to construct virtual multiprotocol broadcast "LAN" over the Internet, provided multicast routing is tuned. I have no idea was this bicycle invented before me, so that I had to set ARPHRD_IPGRE to a random value. I have an impression, that Cisco could make something similar, but this feature is apparently missing in IOS<=11.2(8). I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks with broadcast 224.66.66.66. If you have access to mbone, play with me :-) ping -t 255 224.66.66.66 If nobody answers, mbone does not work. ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255 ip addr add 10.66.66.<somewhat>/24 dev Universe ifconfig Universe up ifconfig Universe add fe80::<Your_real_addr>/10 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96 ftp 10.66.66.66 ... ftp fec0:6666:6666::193.233.7.65 ... */ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { struct ip_tunnel *t = netdev_priv(dev); struct gre_base_hdr *greh; struct iphdr *iph; int needed; needed = t->hlen + sizeof(*iph); if (skb_headroom(skb) < needed && pskb_expand_head(skb, HH_DATA_ALIGN(needed - skb_headroom(skb)), 0, GFP_ATOMIC)) return -needed; iph = skb_push(skb, needed); greh = (struct gre_base_hdr *)(iph+1); greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags); greh->protocol = htons(type); memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); /* Set the source hardware address. 
*/ if (saddr) memcpy(&iph->saddr, saddr, 4); if (daddr) memcpy(&iph->daddr, daddr, 4); if (iph->daddr) return t->hlen + sizeof(*iph); return -(t->hlen + sizeof(*iph)); } static int ipgre_header_parse(const struct sk_buff *skb, const struct net_device *dev, unsigned char *haddr) { const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb); memcpy(haddr, &iph->saddr, 4); return 4; } static const struct header_ops ipgre_header_ops = { .create = ipgre_header, .parse = ipgre_header_parse, }; #ifdef CONFIG_NET_IPGRE_BROADCAST static int ipgre_open(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); if (ipv4_is_multicast(t->parms.iph.daddr)) { struct flowi4 fl4 = { .flowi4_oif = t->parms.link, .flowi4_dscp = ip4h_dscp(&t->parms.iph), .flowi4_scope = RT_SCOPE_UNIVERSE, .flowi4_proto = IPPROTO_GRE, .saddr = t->parms.iph.saddr, .daddr = t->parms.iph.daddr, .fl4_gre_key = t->parms.o_key, }; struct rtable *rt; rt = ip_route_output_key(t->net, &fl4); if (IS_ERR(rt)) return -EADDRNOTAVAIL; dev = rt->dst.dev; ip_rt_put(rt); if (!__in_dev_get_rtnl(dev)) return -EADDRNOTAVAIL; t->mlink = dev->ifindex; ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr); } return 0; } static int ipgre_close(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) { struct in_device *in_dev; in_dev = inetdev_by_index(t->net, t->mlink); if (in_dev) ip_mc_dec_group(in_dev, t->parms.iph.daddr); } return 0; } #endif static const struct net_device_ops ipgre_netdev_ops = { .ndo_init = ipgre_tunnel_init, .ndo_uninit = ip_tunnel_uninit, #ifdef CONFIG_NET_IPGRE_BROADCAST .ndo_open = ipgre_open, .ndo_stop = ipgre_close, #endif .ndo_start_xmit = ipgre_xmit, .ndo_siocdevprivate = ip_tunnel_siocdevprivate, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipgre_tunnel_ctl, }; #define GRE_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ NETIF_F_HW_CSUM) static void ipgre_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipgre_netdev_ops; dev->type = ARPHRD_IPGRE; ip_tunnel_setup(dev, ipgre_net_id); } static void __gre_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel; tunnel = netdev_priv(dev); tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); tunnel->parms.iph.protocol = IPPROTO_GRE; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; dev->needed_headroom = tunnel->hlen + sizeof(tunnel->parms.iph); dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; /* TCP offload with GRE SEQ is not supported, nor can we support 2 * levels of outer headers requiring an update. 
*/ if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags)) return; if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags) && tunnel->encap.type != TUNNEL_ENCAP_NONE) return; dev->features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_GSO_SOFTWARE; dev->lltx = true; } static int ipgre_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; __gre_tunnel_init(dev); __dev_addr_set(dev, &iph->saddr, 4); memcpy(dev->broadcast, &iph->daddr, 4); dev->flags = IFF_NOARP; netif_keep_dst(dev); dev->addr_len = 4; if (iph->daddr && !tunnel->collect_md) { #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(iph->daddr)) { if (!iph->saddr) return -EINVAL; dev->flags = IFF_BROADCAST; dev->header_ops = &ipgre_header_ops; dev->hard_header_len = tunnel->hlen + sizeof(*iph); dev->needed_headroom = 0; } #endif } else if (!tunnel->collect_md) { dev->header_ops = &ipgre_header_ops; dev->hard_header_len = tunnel->hlen + sizeof(*iph); dev->needed_headroom = 0; } return ip_tunnel_init(dev); } static const struct gre_protocol ipgre_protocol = { .handler = gre_rcv, .err_handler = gre_err, }; static int __net_init ipgre_init_net(struct net *net) { return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL); } static void __net_exit ipgre_exit_rtnl(struct net *net, struct list_head *dev_to_kill) { ip_tunnel_delete_net(net, ipgre_net_id, &ipgre_link_ops, dev_to_kill); } static struct pernet_operations ipgre_net_ops = { .init = ipgre_init_net, .exit_rtnl = ipgre_exit_rtnl, .id = &ipgre_net_id, .size = sizeof(struct ip_tunnel_net), }; static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { __be16 flags; if (!data) return 0; flags = 0; if (data[IFLA_GRE_IFLAGS]) flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); if (data[IFLA_GRE_OFLAGS]) flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); if (flags & (GRE_VERSION|GRE_ROUTING)) return -EINVAL; if (data[IFLA_GRE_COLLECT_METADATA] && data[IFLA_GRE_ENCAP_TYPE] && nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE) return -EINVAL; return 0; } static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { __be32 daddr; if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) return -EADDRNOTAVAIL; } if (!data) goto out; if (data[IFLA_GRE_REMOTE]) { memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4); if (!daddr) return -EINVAL; } out: return ipgre_tunnel_validate(tb, data, extack); } static int erspan_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { __be16 flags = 0; int ret; if (!data) return 0; ret = ipgre_tap_validate(tb, data, extack); if (ret) return ret; if (data[IFLA_GRE_ERSPAN_VER] && nla_get_u8(data[IFLA_GRE_ERSPAN_VER]) == 0) return 0; /* ERSPAN type II/III should only have GRE sequence and key flag */ if (data[IFLA_GRE_OFLAGS]) flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); if (data[IFLA_GRE_IFLAGS]) flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); if (!data[IFLA_GRE_COLLECT_METADATA] && flags != (GRE_SEQ | GRE_KEY)) return -EINVAL; /* ERSPAN Session ID only has 10-bit. Since we reuse * 32-bit key field as ID, check it's range. 
*/ if (data[IFLA_GRE_IKEY] && (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK)) return -EINVAL; if (data[IFLA_GRE_OKEY] && (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK)) return -EINVAL; return 0; } static int ipgre_netlink_parms(struct net_device *dev, struct nlattr *data[], struct nlattr *tb[], struct ip_tunnel_parm_kern *parms, __u32 *fwmark) { struct ip_tunnel *t = netdev_priv(dev); memset(parms, 0, sizeof(*parms)); parms->iph.protocol = IPPROTO_GRE; if (!data) return 0; if (data[IFLA_GRE_LINK]) parms->link = nla_get_u32(data[IFLA_GRE_LINK]); if (data[IFLA_GRE_IFLAGS]) gre_flags_to_tnl_flags(parms->i_flags, nla_get_be16(data[IFLA_GRE_IFLAGS])); if (data[IFLA_GRE_OFLAGS]) gre_flags_to_tnl_flags(parms->o_flags, nla_get_be16(data[IFLA_GRE_OFLAGS])); if (data[IFLA_GRE_IKEY]) parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); if (data[IFLA_GRE_OKEY]) parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]); if (data[IFLA_GRE_LOCAL]) parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]); if (data[IFLA_GRE_REMOTE]) parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]); if (data[IFLA_GRE_TTL]) parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]); if (data[IFLA_GRE_TOS]) parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]); if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) { if (t->ignore_df) return -EINVAL; parms->iph.frag_off = htons(IP_DF); } if (data[IFLA_GRE_COLLECT_METADATA]) { t->collect_md = true; if (dev->type == ARPHRD_IPGRE) dev->type = ARPHRD_NONE; } if (data[IFLA_GRE_IGNORE_DF]) { if (nla_get_u8(data[IFLA_GRE_IGNORE_DF]) && (parms->iph.frag_off & htons(IP_DF))) return -EINVAL; t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]); } if (data[IFLA_GRE_FWMARK]) *fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]); return 0; } static int erspan_netlink_parms(struct net_device *dev, struct nlattr *data[], struct nlattr *tb[], struct ip_tunnel_parm_kern *parms, __u32 *fwmark) { struct ip_tunnel *t = netdev_priv(dev); int err; err = ipgre_netlink_parms(dev, data, tb, parms, fwmark); if (err) return err; if (!data) return 0; if (data[IFLA_GRE_ERSPAN_VER]) { t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); if (t->erspan_ver > 2) return -EINVAL; } if (t->erspan_ver == 1) { if (data[IFLA_GRE_ERSPAN_INDEX]) { t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); if (t->index & ~INDEX_MASK) return -EINVAL; } } else if (t->erspan_ver == 2) { if (data[IFLA_GRE_ERSPAN_DIR]) { t->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]); if (t->dir & ~(DIR_MASK >> DIR_OFFSET)) return -EINVAL; } if (data[IFLA_GRE_ERSPAN_HWID]) { t->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]); if (t->hwid & ~(HWID_MASK >> HWID_OFFSET)) return -EINVAL; } } return 0; } /* This function returns true when ENCAP attributes are present in the nl msg */ static bool ipgre_netlink_encap_parms(struct nlattr *data[], struct ip_tunnel_encap *ipencap) { bool ret = false; memset(ipencap, 0, sizeof(*ipencap)); if (!data) return ret; if (data[IFLA_GRE_ENCAP_TYPE]) { ret = true; ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]); } if (data[IFLA_GRE_ENCAP_FLAGS]) { ret = true; ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]); } if (data[IFLA_GRE_ENCAP_SPORT]) { ret = true; ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]); } if (data[IFLA_GRE_ENCAP_DPORT]) { ret = true; ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]); } return ret; } static int gre_tap_init(struct net_device *dev) { __gre_tunnel_init(dev); dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netif_keep_dst(dev); return ip_tunnel_init(dev); } 
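/*
 * Editor's sketch (illustrative only, not kernel code): gre_calc_hlen(), used
 * by __gre_tunnel_init() and gre_fb_xmit() above, sizes the GRE header from
 * the tunnel output flags: a 4-byte base header (RFC 2784) plus 4 bytes each
 * when the checksum, key or sequence-number field is present (RFC 2890).  The
 * helper and EX_* constants below are made up for the example; only the
 * additive rule is taken from the driver.
 */
#include <stdio.h>

#define EX_GRE_CSUM (1u << 0)
#define EX_GRE_KEY  (1u << 1)
#define EX_GRE_SEQ  (1u << 2)

static unsigned int example_gre_hlen(unsigned int flags)
{
	unsigned int hlen = 4;		/* base GRE header */

	if (flags & EX_GRE_CSUM)
		hlen += 4;		/* checksum + reserved */
	if (flags & EX_GRE_KEY)
		hlen += 4;		/* key */
	if (flags & EX_GRE_SEQ)
		hlen += 4;		/* sequence number */
	return hlen;
}

int main(void)
{
	printf("plain gre:    %u\n", example_gre_hlen(0));                 /* 4 */
	printf("keyed gre:    %u\n", example_gre_hlen(EX_GRE_KEY));        /* 8 */
	printf("erspan (seq): %u\n", example_gre_hlen(EX_GRE_SEQ));        /* 8, matches tun_hlen = 8 above */
	printf("csum+key+seq: %u\n",
	       example_gre_hlen(EX_GRE_CSUM | EX_GRE_KEY | EX_GRE_SEQ));   /* 16 */
	return 0;
}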
static const struct net_device_ops gre_tap_netdev_ops = { .ndo_init = gre_tap_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = gre_tap_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_fill_metadata_dst = gre_fill_metadata_dst, }; static int erspan_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); if (tunnel->erspan_ver == 0) tunnel->tun_hlen = 4; /* 4-byte GRE hdr. */ else tunnel->tun_hlen = 8; /* 8-byte GRE hdr. */ tunnel->parms.iph.protocol = IPPROTO_GRE; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + erspan_hdr_len(tunnel->erspan_ver); dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netif_keep_dst(dev); return ip_tunnel_init(dev); } static const struct net_device_ops erspan_netdev_ops = { .ndo_init = erspan_tunnel_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = erspan_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_fill_metadata_dst = gre_fill_metadata_dst, }; static void ipgre_tap_setup(struct net_device *dev) { ether_setup(dev); dev->max_mtu = 0; dev->netdev_ops = &gre_tap_netdev_ops; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; ip_tunnel_setup(dev, gre_tap_net_id); } static int ipgre_newlink_encap_setup(struct net_device *dev, struct nlattr *data[]) { struct ip_tunnel_encap ipencap; if (ipgre_netlink_encap_parms(data, &ipencap)) { struct ip_tunnel *t = netdev_priv(dev); int err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } return 0; } static int ipgre_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct ip_tunnel_parm_kern p; __u32 fwmark = 0; int err; err = ipgre_newlink_encap_setup(dev, data); if (err) return err; err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark); if (err < 0) return err; return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, &p, fwmark); } static int erspan_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct ip_tunnel_parm_kern p; __u32 fwmark = 0; int err; err = ipgre_newlink_encap_setup(dev, data); if (err) return err; err = erspan_netlink_parms(dev, data, tb, &p, &fwmark); if (err) return err; return ip_tunnel_newlink(params->link_net ? 
: dev_net(dev), dev, tb, &p, fwmark); } static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm_kern p; __u32 fwmark = t->fwmark; int err; err = ipgre_newlink_encap_setup(dev, data); if (err) return err; err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark); if (err < 0) return err; err = ip_tunnel_changelink(dev, tb, &p, fwmark); if (err < 0) return err; ip_tunnel_flags_copy(t->parms.i_flags, p.i_flags); ip_tunnel_flags_copy(t->parms.o_flags, p.o_flags); ipgre_link_update(dev, !tb[IFLA_MTU]); return 0; } static int erspan_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm_kern p; __u32 fwmark = t->fwmark; int err; err = ipgre_newlink_encap_setup(dev, data); if (err) return err; err = erspan_netlink_parms(dev, data, tb, &p, &fwmark); if (err < 0) return err; err = ip_tunnel_changelink(dev, tb, &p, fwmark); if (err < 0) return err; ip_tunnel_flags_copy(t->parms.i_flags, p.i_flags); ip_tunnel_flags_copy(t->parms.o_flags, p.o_flags); return 0; } static size_t ipgre_get_size(const struct net_device *dev) { return /* IFLA_GRE_LINK */ nla_total_size(4) + /* IFLA_GRE_IFLAGS */ nla_total_size(2) + /* IFLA_GRE_OFLAGS */ nla_total_size(2) + /* IFLA_GRE_IKEY */ nla_total_size(4) + /* IFLA_GRE_OKEY */ nla_total_size(4) + /* IFLA_GRE_LOCAL */ nla_total_size(4) + /* IFLA_GRE_REMOTE */ nla_total_size(4) + /* IFLA_GRE_TTL */ nla_total_size(1) + /* IFLA_GRE_TOS */ nla_total_size(1) + /* IFLA_GRE_PMTUDISC */ nla_total_size(1) + /* IFLA_GRE_ENCAP_TYPE */ nla_total_size(2) + /* IFLA_GRE_ENCAP_FLAGS */ nla_total_size(2) + /* IFLA_GRE_ENCAP_SPORT */ nla_total_size(2) + /* IFLA_GRE_ENCAP_DPORT */ nla_total_size(2) + /* IFLA_GRE_COLLECT_METADATA */ nla_total_size(0) + /* IFLA_GRE_IGNORE_DF */ nla_total_size(1) + /* IFLA_GRE_FWMARK */ nla_total_size(4) + /* IFLA_GRE_ERSPAN_INDEX */ nla_total_size(4) + /* IFLA_GRE_ERSPAN_VER */ nla_total_size(1) + /* IFLA_GRE_ERSPAN_DIR */ nla_total_size(1) + /* IFLA_GRE_ERSPAN_HWID */ nla_total_size(2) + 0; } static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm_kern *p = &t->parms; IP_TUNNEL_DECLARE_FLAGS(o_flags); ip_tunnel_flags_copy(o_flags, p->o_flags); if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || nla_put_be16(skb, IFLA_GRE_IFLAGS, gre_tnl_flags_to_gre_flags(p->i_flags)) || nla_put_be16(skb, IFLA_GRE_OFLAGS, gre_tnl_flags_to_gre_flags(o_flags)) || nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) || nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) || nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) || nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) || nla_put_u8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF))) || nla_put_u32(skb, IFLA_GRE_FWMARK, t->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, t->encap.type) || nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT, t->encap.sport) || nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT, t->encap.dport) || nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS, t->encap.flags)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df)) goto nla_put_failure; if (t->collect_md) { if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA)) goto nla_put_failure; } return 0; nla_put_failure: return 
-EMSGSIZE; } static int erspan_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); if (t->erspan_ver <= 2) { if (t->erspan_ver != 0 && !t->collect_md) __set_bit(IP_TUNNEL_KEY_BIT, t->parms.o_flags); if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver)) goto nla_put_failure; if (t->erspan_ver == 1) { if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index)) goto nla_put_failure; } else if (t->erspan_ver == 2) { if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid)) goto nla_put_failure; } } return ipgre_fill_info(skb, dev); nla_put_failure: return -EMSGSIZE; } static void erspan_setup(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); ether_setup(dev); dev->max_mtu = 0; dev->netdev_ops = &erspan_netdev_ops; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; ip_tunnel_setup(dev, erspan_net_id); t->erspan_ver = 1; } static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_LINK] = { .type = NLA_U32 }, [IFLA_GRE_IFLAGS] = { .type = NLA_U16 }, [IFLA_GRE_OFLAGS] = { .type = NLA_U16 }, [IFLA_GRE_IKEY] = { .type = NLA_U32 }, [IFLA_GRE_OKEY] = { .type = NLA_U32 }, [IFLA_GRE_LOCAL] = { .len = sizeof_field(struct iphdr, saddr) }, [IFLA_GRE_REMOTE] = { .len = sizeof_field(struct iphdr, daddr) }, [IFLA_GRE_TTL] = { .type = NLA_U8 }, [IFLA_GRE_TOS] = { .type = NLA_U8 }, [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 }, [IFLA_GRE_ENCAP_TYPE] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_GRE_IGNORE_DF] = { .type = NLA_U8 }, [IFLA_GRE_FWMARK] = { .type = NLA_U32 }, [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 }, [IFLA_GRE_ERSPAN_VER] = { .type = NLA_U8 }, [IFLA_GRE_ERSPAN_DIR] = { .type = NLA_U8 }, [IFLA_GRE_ERSPAN_HWID] = { .type = NLA_U16 }, }; static struct rtnl_link_ops ipgre_link_ops __read_mostly = { .kind = "gre", .maxtype = IFLA_GRE_MAX, .policy = ipgre_policy, .priv_size = sizeof(struct ip_tunnel), .setup = ipgre_tunnel_setup, .validate = ipgre_tunnel_validate, .newlink = ipgre_newlink, .changelink = ipgre_changelink, .dellink = ip_tunnel_dellink, .get_size = ipgre_get_size, .fill_info = ipgre_fill_info, .get_link_net = ip_tunnel_get_link_net, }; static struct rtnl_link_ops ipgre_tap_ops __read_mostly = { .kind = "gretap", .maxtype = IFLA_GRE_MAX, .policy = ipgre_policy, .priv_size = sizeof(struct ip_tunnel), .setup = ipgre_tap_setup, .validate = ipgre_tap_validate, .newlink = ipgre_newlink, .changelink = ipgre_changelink, .dellink = ip_tunnel_dellink, .get_size = ipgre_get_size, .fill_info = ipgre_fill_info, .get_link_net = ip_tunnel_get_link_net, }; static struct rtnl_link_ops erspan_link_ops __read_mostly = { .kind = "erspan", .maxtype = IFLA_GRE_MAX, .policy = ipgre_policy, .priv_size = sizeof(struct ip_tunnel), .setup = erspan_setup, .validate = erspan_validate, .newlink = erspan_newlink, .changelink = erspan_changelink, .dellink = ip_tunnel_dellink, .get_size = ipgre_get_size, .fill_info = erspan_fill_info, .get_link_net = ip_tunnel_get_link_net, }; struct net_device *gretap_fb_dev_create(struct net *net, const char *name, u8 name_assign_type) { struct rtnl_newlink_params params = { .src_net = net }; struct nlattr *tb[IFLA_MAX + 1]; struct net_device *dev; LIST_HEAD(list_kill); struct ip_tunnel *t; int err; memset(&tb, 0, sizeof(tb)); params.tb = tb; 
dev = rtnl_create_link(net, name, name_assign_type, &ipgre_tap_ops, tb, NULL); if (IS_ERR(dev)) return dev; /* Configure flow based GRE device. */ t = netdev_priv(dev); t->collect_md = true; err = ipgre_newlink(dev, &params, NULL); if (err < 0) { free_netdev(dev); return ERR_PTR(err); } /* openvswitch users expect packet sizes to be unrestricted, * so set the largest MTU we can. */ err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false); if (err) goto out; err = rtnl_configure_link(dev, NULL, 0, NULL); if (err < 0) goto out; return dev; out: ip_tunnel_dellink(dev, &list_kill); unregister_netdevice_many(&list_kill); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(gretap_fb_dev_create); static int __net_init ipgre_tap_init_net(struct net *net) { return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0"); } static void __net_exit ipgre_tap_exit_rtnl(struct net *net, struct list_head *dev_to_kill) { ip_tunnel_delete_net(net, gre_tap_net_id, &ipgre_tap_ops, dev_to_kill); } static struct pernet_operations ipgre_tap_net_ops = { .init = ipgre_tap_init_net, .exit_rtnl = ipgre_tap_exit_rtnl, .id = &gre_tap_net_id, .size = sizeof(struct ip_tunnel_net), }; static int __net_init erspan_init_net(struct net *net) { return ip_tunnel_init_net(net, erspan_net_id, &erspan_link_ops, "erspan0"); } static void __net_exit erspan_exit_rtnl(struct net *net, struct list_head *dev_to_kill) { ip_tunnel_delete_net(net, erspan_net_id, &erspan_link_ops, dev_to_kill); } static struct pernet_operations erspan_net_ops = { .init = erspan_init_net, .exit_rtnl = erspan_exit_rtnl, .id = &erspan_net_id, .size = sizeof(struct ip_tunnel_net), }; static int __init ipgre_init(void) { int err; pr_info("GRE over IPv4 tunneling driver\n"); err = register_pernet_device(&ipgre_net_ops); if (err < 0) return err; err = register_pernet_device(&ipgre_tap_net_ops); if (err < 0) goto pnet_tap_failed; err = register_pernet_device(&erspan_net_ops); if (err < 0) goto pnet_erspan_failed; err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); if (err < 0) { pr_info("%s: can't add protocol\n", __func__); goto add_proto_failed; } err = rtnl_link_register(&ipgre_link_ops); if (err < 0) goto rtnl_link_failed; err = rtnl_link_register(&ipgre_tap_ops); if (err < 0) goto tap_ops_failed; err = rtnl_link_register(&erspan_link_ops); if (err < 0) goto erspan_link_failed; return 0; erspan_link_failed: rtnl_link_unregister(&ipgre_tap_ops); tap_ops_failed: rtnl_link_unregister(&ipgre_link_ops); rtnl_link_failed: gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); add_proto_failed: unregister_pernet_device(&erspan_net_ops); pnet_erspan_failed: unregister_pernet_device(&ipgre_tap_net_ops); pnet_tap_failed: unregister_pernet_device(&ipgre_net_ops); return err; } static void __exit ipgre_fini(void) { rtnl_link_unregister(&ipgre_tap_ops); rtnl_link_unregister(&ipgre_link_ops); rtnl_link_unregister(&erspan_link_ops); gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); unregister_pernet_device(&ipgre_tap_net_ops); unregister_pernet_device(&ipgre_net_ops); unregister_pernet_device(&erspan_net_ops); } module_init(ipgre_init); module_exit(ipgre_fini); MODULE_DESCRIPTION("IPv4 GRE tunnels over IP library"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("gre"); MODULE_ALIAS_RTNL_LINK("gretap"); MODULE_ALIAS_RTNL_LINK("erspan"); MODULE_ALIAS_NETDEV("gre0"); MODULE_ALIAS_NETDEV("gretap0"); MODULE_ALIAS_NETDEV("erspan0");
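/*
 * Editor's illustration (not part of this driver): ipgre_init() above uses
 * the common register-or-unwind idiom -- each successfully registered
 * resource gets a matching label, so a later failure releases everything
 * already set up, in reverse order. A self-contained sketch of the same
 * pattern with hypothetical resources:
 */
#include <stdio.h>

static int register_a(void) { return 0; }
static int register_b(void) { return 0; }
static int register_c(void) { return -1; }	/* pretend the last step fails */
static void unregister_a(void) { puts("undo a"); }
static void unregister_b(void) { puts("undo b"); }

static int demo_init(void)
{
	int err;

	err = register_a();
	if (err < 0)
		return err;
	err = register_b();
	if (err < 0)
		goto a_failed;
	err = register_c();
	if (err < 0)
		goto b_failed;
	return 0;

b_failed:
	unregister_b();
a_failed:
	unregister_a();
	return err;
}

int main(void)
{
	/* prints "undo b", "undo a", then the error from the failed step */
	printf("demo_init() = %d\n", demo_init());
	return 0;
}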
// SPDX-License-Identifier: GPL-2.0 /* USB Driver for GSM modems Copyright (C) 2005 Matthias Urlichs <smurf@smurf.noris.de> Portions copied from the Keyspan driver by Hugh Blemings <hugh@blemings.org> History: see the git log. Work sponsored by: Sigos GmbH, Germany <info@sigos.de> This driver exists because the "normal" serial driver doesn't work too well with GSM modems. Issues: - data loss -- one single Receive URB is not nearly enough - nonstandard flow (Option devices) control - controlling the baud rate doesn't make sense This driver is named "option" because the most common device it's used for is a PC-Card (with an internal OHCI-USB interface, behind which the GSM interface sits), made by Option Inc. Some of the "one port" devices actually exhibit multiple USB instances on the USB bus. This is not a bug, these ports are used for different device features.
*/ #define DRIVER_AUTHOR "Matthias Urlichs <smurf@smurf.noris.de>" #define DRIVER_DESC "USB Driver for GSM modems" #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/errno.h> #include <linux/tty.h> #include <linux/tty_flip.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/bitops.h> #include <linux/usb.h> #include <linux/usb/serial.h> #include "usb-wwan.h" /* Function prototypes */ static int option_probe(struct usb_serial *serial, const struct usb_device_id *id); static int option_attach(struct usb_serial *serial); static void option_release(struct usb_serial *serial); static void option_instat_callback(struct urb *urb); /* Vendor and product IDs */ #define OPTION_VENDOR_ID 0x0AF0 #define OPTION_PRODUCT_COLT 0x5000 #define OPTION_PRODUCT_RICOLA 0x6000 #define OPTION_PRODUCT_RICOLA_LIGHT 0x6100 #define OPTION_PRODUCT_RICOLA_QUAD 0x6200 #define OPTION_PRODUCT_RICOLA_QUAD_LIGHT 0x6300 #define OPTION_PRODUCT_RICOLA_NDIS 0x6050 #define OPTION_PRODUCT_RICOLA_NDIS_LIGHT 0x6150 #define OPTION_PRODUCT_RICOLA_NDIS_QUAD 0x6250 #define OPTION_PRODUCT_RICOLA_NDIS_QUAD_LIGHT 0x6350 #define OPTION_PRODUCT_COBRA 0x6500 #define OPTION_PRODUCT_COBRA_BUS 0x6501 #define OPTION_PRODUCT_VIPER 0x6600 #define OPTION_PRODUCT_VIPER_BUS 0x6601 #define OPTION_PRODUCT_GT_MAX_READY 0x6701 #define OPTION_PRODUCT_FUJI_MODEM_LIGHT 0x6721 #define OPTION_PRODUCT_FUJI_MODEM_GT 0x6741 #define OPTION_PRODUCT_FUJI_MODEM_EX 0x6761 #define OPTION_PRODUCT_KOI_MODEM 0x6800 #define OPTION_PRODUCT_SCORPION_MODEM 0x6901 #define OPTION_PRODUCT_ETNA_MODEM 0x7001 #define OPTION_PRODUCT_ETNA_MODEM_LITE 0x7021 #define OPTION_PRODUCT_ETNA_MODEM_GT 0x7041 #define OPTION_PRODUCT_ETNA_MODEM_EX 0x7061 #define OPTION_PRODUCT_ETNA_KOI_MODEM 0x7100 #define OPTION_PRODUCT_GTM380_MODEM 0x7201 #define HUAWEI_VENDOR_ID 0x12D1 #define HUAWEI_PRODUCT_E173 0x140C #define HUAWEI_PRODUCT_E1750 0x1406 #define HUAWEI_PRODUCT_K4505 0x1464 #define HUAWEI_PRODUCT_K3765 0x1465 #define HUAWEI_PRODUCT_K4605 0x14C6 #define HUAWEI_PRODUCT_E173S6 0x1C07 #define QUANTA_VENDOR_ID 0x0408 #define QUANTA_PRODUCT_Q101 0xEA02 #define QUANTA_PRODUCT_Q111 0xEA03 #define QUANTA_PRODUCT_GLX 0xEA04 #define QUANTA_PRODUCT_GKE 0xEA05 #define QUANTA_PRODUCT_GLE 0xEA06 #define NOVATELWIRELESS_VENDOR_ID 0x1410 /* YISO PRODUCTS */ #define YISO_VENDOR_ID 0x0EAB #define YISO_PRODUCT_U893 0xC893 /* * NOVATEL WIRELESS PRODUCTS * * Note from Novatel Wireless: * If your Novatel modem does not work on linux, don't * change the option module, but check our website. 
If * that does not help, contact ddeschepper@nvtl.com */ /* MERLIN EVDO PRODUCTS */ #define NOVATELWIRELESS_PRODUCT_V640 0x1100 #define NOVATELWIRELESS_PRODUCT_V620 0x1110 #define NOVATELWIRELESS_PRODUCT_V740 0x1120 #define NOVATELWIRELESS_PRODUCT_V720 0x1130 /* MERLIN HSDPA/HSPA PRODUCTS */ #define NOVATELWIRELESS_PRODUCT_U730 0x1400 #define NOVATELWIRELESS_PRODUCT_U740 0x1410 #define NOVATELWIRELESS_PRODUCT_U870 0x1420 #define NOVATELWIRELESS_PRODUCT_XU870 0x1430 #define NOVATELWIRELESS_PRODUCT_X950D 0x1450 /* EXPEDITE PRODUCTS */ #define NOVATELWIRELESS_PRODUCT_EV620 0x2100 #define NOVATELWIRELESS_PRODUCT_ES720 0x2110 #define NOVATELWIRELESS_PRODUCT_E725 0x2120 #define NOVATELWIRELESS_PRODUCT_ES620 0x2130 #define NOVATELWIRELESS_PRODUCT_EU730 0x2400 #define NOVATELWIRELESS_PRODUCT_EU740 0x2410 #define NOVATELWIRELESS_PRODUCT_EU870D 0x2420 /* OVATION PRODUCTS */ #define NOVATELWIRELESS_PRODUCT_MC727 0x4100 #define NOVATELWIRELESS_PRODUCT_MC950D 0x4400 /* * Note from Novatel Wireless: * All PID in the 5xxx range are currently reserved for * auto-install CDROMs, and should not be added to this * module. * * #define NOVATELWIRELESS_PRODUCT_U727 0x5010 * #define NOVATELWIRELESS_PRODUCT_MC727_NEW 0x5100 */ #define NOVATELWIRELESS_PRODUCT_OVMC760 0x6002 #define NOVATELWIRELESS_PRODUCT_MC780 0x6010 #define NOVATELWIRELESS_PRODUCT_EVDO_FULLSPEED 0x6000 #define NOVATELWIRELESS_PRODUCT_EVDO_HIGHSPEED 0x6001 #define NOVATELWIRELESS_PRODUCT_HSPA_FULLSPEED 0x7000 #define NOVATELWIRELESS_PRODUCT_HSPA_HIGHSPEED 0x7001 #define NOVATELWIRELESS_PRODUCT_HSPA_HIGHSPEED3 0x7003 #define NOVATELWIRELESS_PRODUCT_HSPA_HIGHSPEED4 0x7004 #define NOVATELWIRELESS_PRODUCT_HSPA_HIGHSPEED5 0x7005 #define NOVATELWIRELESS_PRODUCT_HSPA_HIGHSPEED6 0x7006 #define NOVATELWIRELESS_PRODUCT_HSPA_HIGHSPEED7 0x7007 #define NOVATELWIRELESS_PRODUCT_MC996D 0x7030 #define NOVATELWIRELESS_PRODUCT_MF3470 0x7041 #define NOVATELWIRELESS_PRODUCT_MC547 0x7042 #define NOVATELWIRELESS_PRODUCT_EVDO_EMBEDDED_FULLSPEED 0x8000 #define NOVATELWIRELESS_PRODUCT_EVDO_EMBEDDED_HIGHSPEED 0x8001 #define NOVATELWIRELESS_PRODUCT_HSPA_EMBEDDED_FULLSPEED 0x9000 #define NOVATELWIRELESS_PRODUCT_HSPA_EMBEDDED_HIGHSPEED 0x9001 #define NOVATELWIRELESS_PRODUCT_E362 0x9010 #define NOVATELWIRELESS_PRODUCT_E371 0x9011 #define NOVATELWIRELESS_PRODUCT_U620L 0x9022 #define NOVATELWIRELESS_PRODUCT_G2 0xA010 #define NOVATELWIRELESS_PRODUCT_MC551 0xB001 #define UBLOX_VENDOR_ID 0x1546 /* AMOI PRODUCTS */ #define AMOI_VENDOR_ID 0x1614 #define AMOI_PRODUCT_H01 0x0800 #define AMOI_PRODUCT_H01A 0x7002 #define AMOI_PRODUCT_H02 0x0802 #define AMOI_PRODUCT_SKYPEPHONE_S2 0x0407 #define DELL_VENDOR_ID 0x413C /* Dell modems */ #define DELL_PRODUCT_5700_MINICARD 0x8114 #define DELL_PRODUCT_5500_MINICARD 0x8115 #define DELL_PRODUCT_5505_MINICARD 0x8116 #define DELL_PRODUCT_5700_EXPRESSCARD 0x8117 #define DELL_PRODUCT_5510_EXPRESSCARD 0x8118 #define DELL_PRODUCT_5700_MINICARD_SPRINT 0x8128 #define DELL_PRODUCT_5700_MINICARD_TELUS 0x8129 #define DELL_PRODUCT_5720_MINICARD_VZW 0x8133 #define DELL_PRODUCT_5720_MINICARD_SPRINT 0x8134 #define DELL_PRODUCT_5720_MINICARD_TELUS 0x8135 #define DELL_PRODUCT_5520_MINICARD_CINGULAR 0x8136 #define DELL_PRODUCT_5520_MINICARD_GENERIC_L 0x8137 #define DELL_PRODUCT_5520_MINICARD_GENERIC_I 0x8138 #define DELL_PRODUCT_5730_MINICARD_SPRINT 0x8180 #define DELL_PRODUCT_5730_MINICARD_TELUS 0x8181 #define DELL_PRODUCT_5730_MINICARD_VZW 0x8182 #define DELL_PRODUCT_5800_MINICARD_VZW 0x8195 /* Novatel E362 */ #define DELL_PRODUCT_5800_V2_MINICARD_VZW 0x8196 /* 
Novatel E362 */ #define DELL_PRODUCT_5804_MINICARD_ATT 0x819b /* Novatel E371 */ #define DELL_PRODUCT_5821E 0x81d7 #define DELL_PRODUCT_5821E_ESIM 0x81e0 #define DELL_PRODUCT_5829E_ESIM 0x81e4 #define DELL_PRODUCT_5829E 0x81e6 #define DELL_PRODUCT_FM101R_ESIM 0x8213 #define DELL_PRODUCT_FM101R 0x8215 #define KYOCERA_VENDOR_ID 0x0c88 #define KYOCERA_PRODUCT_KPC650 0x17da #define KYOCERA_PRODUCT_KPC680 0x180a #define ANYDATA_VENDOR_ID 0x16d5 #define ANYDATA_PRODUCT_ADU_620UW 0x6202 #define ANYDATA_PRODUCT_ADU_E100A 0x6501 #define ANYDATA_PRODUCT_ADU_500A 0x6502 #define AXESSTEL_VENDOR_ID 0x1726 #define AXESSTEL_PRODUCT_MV110H 0x1000 #define BANDRICH_VENDOR_ID 0x1A8D #define BANDRICH_PRODUCT_C100_1 0x1002 #define BANDRICH_PRODUCT_C100_2 0x1003 #define BANDRICH_PRODUCT_1004 0x1004 #define BANDRICH_PRODUCT_1005 0x1005 #define BANDRICH_PRODUCT_1006 0x1006 #define BANDRICH_PRODUCT_1007 0x1007 #define BANDRICH_PRODUCT_1008 0x1008 #define BANDRICH_PRODUCT_1009 0x1009 #define BANDRICH_PRODUCT_100A 0x100a #define BANDRICH_PRODUCT_100B 0x100b #define BANDRICH_PRODUCT_100C 0x100c #define BANDRICH_PRODUCT_100D 0x100d #define BANDRICH_PRODUCT_100E 0x100e #define BANDRICH_PRODUCT_100F 0x100f #define BANDRICH_PRODUCT_1010 0x1010 #define BANDRICH_PRODUCT_1011 0x1011 #define BANDRICH_PRODUCT_1012 0x1012 #define QUALCOMM_VENDOR_ID 0x05C6 /* These Quectel products use Qualcomm's vendor ID */ #define QUECTEL_PRODUCT_UC20 0x9003 #define QUECTEL_PRODUCT_UC15 0x9090 /* These u-blox products use Qualcomm's vendor ID */ #define UBLOX_PRODUCT_R410M 0x90b2 /* These Yuga products use Qualcomm's vendor ID */ #define YUGA_PRODUCT_CLM920_NC5 0x9625 #define QUECTEL_VENDOR_ID 0x2c7c /* These Quectel products use Quectel's vendor ID */ #define QUECTEL_PRODUCT_EC21 0x0121 #define QUECTEL_PRODUCT_RG650V 0x0122 #define QUECTEL_PRODUCT_EM061K_LTA 0x0123 #define QUECTEL_PRODUCT_EM061K_LMS 0x0124 #define QUECTEL_PRODUCT_EC25 0x0125 #define QUECTEL_PRODUCT_EM060K_128 0x0128 #define QUECTEL_PRODUCT_EM060K_129 0x0129 #define QUECTEL_PRODUCT_EM060K_12a 0x012a #define QUECTEL_PRODUCT_EM060K_12b 0x012b #define QUECTEL_PRODUCT_EM060K_12c 0x012c #define QUECTEL_PRODUCT_EG91 0x0191 #define QUECTEL_PRODUCT_EG95 0x0195 #define QUECTEL_PRODUCT_BG96 0x0296 #define QUECTEL_PRODUCT_EP06 0x0306 #define QUECTEL_PRODUCT_EM05G 0x030a #define QUECTEL_PRODUCT_EM060K 0x030b #define QUECTEL_PRODUCT_EM05G_CS 0x030c #define QUECTEL_PRODUCT_EM05GV2 0x030e #define QUECTEL_PRODUCT_EM05CN_SG 0x0310 #define QUECTEL_PRODUCT_EM05G_SG 0x0311 #define QUECTEL_PRODUCT_EM05CN 0x0312 #define QUECTEL_PRODUCT_EM05G_GR 0x0313 #define QUECTEL_PRODUCT_EM05G_RS 0x0314 #define QUECTEL_PRODUCT_RG255C 0x0316 #define QUECTEL_PRODUCT_EM12 0x0512 #define QUECTEL_PRODUCT_RM500Q 0x0800 #define QUECTEL_PRODUCT_RM520N 0x0801 #define QUECTEL_PRODUCT_EC200U 0x0901 #define QUECTEL_PRODUCT_EG912Y 0x6001 #define QUECTEL_PRODUCT_EC200S_CN 0x6002 #define QUECTEL_PRODUCT_EC200A 0x6005 #define QUECTEL_PRODUCT_EG916Q 0x6007 #define QUECTEL_PRODUCT_EM061K_LWW 0x6008 #define QUECTEL_PRODUCT_EM061K_LCN 0x6009 #define QUECTEL_PRODUCT_EC200T 0x6026 #define QUECTEL_PRODUCT_RM500K 0x7001 #define CMOTECH_VENDOR_ID 0x16d8 #define CMOTECH_PRODUCT_6001 0x6001 #define CMOTECH_PRODUCT_CMU_300 0x6002 #define CMOTECH_PRODUCT_6003 0x6003 #define CMOTECH_PRODUCT_6004 0x6004 #define CMOTECH_PRODUCT_6005 0x6005 #define CMOTECH_PRODUCT_CGU_628A 0x6006 #define CMOTECH_PRODUCT_CHE_628S 0x6007 #define CMOTECH_PRODUCT_CMU_301 0x6008 #define CMOTECH_PRODUCT_CHU_628 0x6280 #define CMOTECH_PRODUCT_CHU_628S 0x6281 
#define CMOTECH_PRODUCT_CDU_680 0x6803 #define CMOTECH_PRODUCT_CDU_685A 0x6804 #define CMOTECH_PRODUCT_CHU_720S 0x7001 #define CMOTECH_PRODUCT_7002 0x7002 #define CMOTECH_PRODUCT_CHU_629K 0x7003 #define CMOTECH_PRODUCT_7004 0x7004 #define CMOTECH_PRODUCT_7005 0x7005 #define CMOTECH_PRODUCT_CGU_629 0x7006 #define CMOTECH_PRODUCT_CHU_629S 0x700a #define CMOTECH_PRODUCT_CHU_720I 0x7211 #define CMOTECH_PRODUCT_7212 0x7212 #define CMOTECH_PRODUCT_7213 0x7213 #define CMOTECH_PRODUCT_7251 0x7251 #define CMOTECH_PRODUCT_7252 0x7252 #define CMOTECH_PRODUCT_7253 0x7253 #define TELIT_VENDOR_ID 0x1bc7 #define TELIT_PRODUCT_UC864E 0x1003 #define TELIT_PRODUCT_UC864G 0x1004 #define TELIT_PRODUCT_CC864_DUAL 0x1005 #define TELIT_PRODUCT_CC864_SINGLE 0x1006 #define TELIT_PRODUCT_DE910_DUAL 0x1010 #define TELIT_PRODUCT_UE910_V2 0x1012 #define TELIT_PRODUCT_LE922_USBCFG1 0x1040 #define TELIT_PRODUCT_LE922_USBCFG2 0x1041 #define TELIT_PRODUCT_LE922_USBCFG0 0x1042 #define TELIT_PRODUCT_LE922_USBCFG3 0x1043 #define TELIT_PRODUCT_LE922_USBCFG5 0x1045 #define TELIT_PRODUCT_ME910 0x1100 #define TELIT_PRODUCT_ME910_DUAL_MODEM 0x1101 #define TELIT_PRODUCT_LE920 0x1200 #define TELIT_PRODUCT_LE910 0x1201 #define TELIT_PRODUCT_LE910_USBCFG4 0x1206 #define TELIT_PRODUCT_LE920A4_1207 0x1207 #define TELIT_PRODUCT_LE920A4_1208 0x1208 #define TELIT_PRODUCT_LE920A4_1211 0x1211 #define TELIT_PRODUCT_LE920A4_1212 0x1212 #define TELIT_PRODUCT_LE920A4_1213 0x1213 #define TELIT_PRODUCT_LE920A4_1214 0x1214 /* ZTE PRODUCTS */ #define ZTE_VENDOR_ID 0x19d2 #define ZTE_PRODUCT_MF622 0x0001 #define ZTE_PRODUCT_MF628 0x0015 #define ZTE_PRODUCT_MF626 0x0031 #define ZTE_PRODUCT_ZM8620_X 0x0396 #define ZTE_PRODUCT_ME3620_MBIM 0x0426 #define ZTE_PRODUCT_ME3620_X 0x1432 #define ZTE_PRODUCT_ME3620_L 0x1433 #define ZTE_PRODUCT_AC2726 0xfff1 #define ZTE_PRODUCT_MG880 0xfffd #define ZTE_PRODUCT_CDMA_TECH 0xfffe #define ZTE_PRODUCT_AC8710T 0xffff #define ZTE_PRODUCT_MC2718 0xffe8 #define ZTE_PRODUCT_AD3812 0xffeb #define ZTE_PRODUCT_MC2716 0xffed #define BENQ_VENDOR_ID 0x04a5 #define BENQ_PRODUCT_H10 0x4068 #define DLINK_VENDOR_ID 0x1186 #define DLINK_PRODUCT_DWM_652 0x3e04 #define DLINK_PRODUCT_DWM_652_U5 0xce16 #define DLINK_PRODUCT_DWM_652_U5A 0xce1e #define QISDA_VENDOR_ID 0x1da5 #define QISDA_PRODUCT_H21_4512 0x4512 #define QISDA_PRODUCT_H21_4523 0x4523 #define QISDA_PRODUCT_H20_4515 0x4515 #define QISDA_PRODUCT_H20_4518 0x4518 #define QISDA_PRODUCT_H20_4519 0x4519 /* TLAYTECH PRODUCTS */ #define TLAYTECH_VENDOR_ID 0x20B9 #define TLAYTECH_PRODUCT_TEU800 0x1682 /* TOSHIBA PRODUCTS */ #define TOSHIBA_VENDOR_ID 0x0930 #define TOSHIBA_PRODUCT_HSDPA_MINICARD 0x1302 #define TOSHIBA_PRODUCT_G450 0x0d45 #define ALINK_VENDOR_ID 0x1e0e #define SIMCOM_PRODUCT_SIM7100E 0x9001 /* Yes, ALINK_VENDOR_ID */ #define ALINK_PRODUCT_PH300 0x9100 #define ALINK_PRODUCT_3GU 0x9200 /* ALCATEL PRODUCTS */ #define ALCATEL_VENDOR_ID 0x1bbb #define ALCATEL_PRODUCT_X060S_X200 0x0000 #define ALCATEL_PRODUCT_X220_X500D 0x0017 #define ALCATEL_PRODUCT_L100V 0x011e #define ALCATEL_PRODUCT_L800MA 0x0203 #define PIRELLI_VENDOR_ID 0x1266 #define PIRELLI_PRODUCT_C100_1 0x1002 #define PIRELLI_PRODUCT_C100_2 0x1003 #define PIRELLI_PRODUCT_1004 0x1004 #define PIRELLI_PRODUCT_1005 0x1005 #define PIRELLI_PRODUCT_1006 0x1006 #define PIRELLI_PRODUCT_1007 0x1007 #define PIRELLI_PRODUCT_1008 0x1008 #define PIRELLI_PRODUCT_1009 0x1009 #define PIRELLI_PRODUCT_100A 0x100a #define PIRELLI_PRODUCT_100B 0x100b #define PIRELLI_PRODUCT_100C 0x100c #define PIRELLI_PRODUCT_100D 0x100d #define 
PIRELLI_PRODUCT_100E 0x100e #define PIRELLI_PRODUCT_100F 0x100f #define PIRELLI_PRODUCT_1011 0x1011 #define PIRELLI_PRODUCT_1012 0x1012 /* Airplus products */ #define AIRPLUS_VENDOR_ID 0x1011 #define AIRPLUS_PRODUCT_MCD650 0x3198 /* Longcheer/Longsung vendor ID; makes whitelabel devices that * many other vendors like 4G Systems, Alcatel, ChinaBird, * Mobidata, etc sell under their own brand names. */ #define LONGCHEER_VENDOR_ID 0x1c9e /* 4G Systems products */ /* This one was sold as the VW and Skoda "Carstick LTE" */ #define FOUR_G_SYSTEMS_PRODUCT_CARSTICK_LTE 0x7605 /* This is the 4G XS Stick W14 a.k.a. Mobilcom Debitel Surf-Stick * * It seems to contain a Qualcomm QSC6240/6290 chipset */ #define FOUR_G_SYSTEMS_PRODUCT_W14 0x9603 #define FOUR_G_SYSTEMS_PRODUCT_W100 0x9b01 /* Fujisoft products */ #define FUJISOFT_PRODUCT_FS040U 0x9b02 /* iBall 3.5G connect wireless modem */ #define IBALL_3_5G_CONNECT 0x9605 /* Zoom */ #define ZOOM_PRODUCT_4597 0x9607 /* SpeedUp SU9800 usb 3g modem */ #define SPEEDUP_PRODUCT_SU9800 0x9800 /* Haier products */ #define HAIER_VENDOR_ID 0x201e #define HAIER_PRODUCT_CE81B 0x10f8 #define HAIER_PRODUCT_CE100 0x2009 /* Gemalto's Cinterion products (formerly Siemens) */ #define SIEMENS_VENDOR_ID 0x0681 #define CINTERION_VENDOR_ID 0x1e2d #define CINTERION_PRODUCT_HC25_MDMNET 0x0040 #define CINTERION_PRODUCT_HC25_MDM 0x0047 #define CINTERION_PRODUCT_HC28_MDMNET 0x004A /* same for HC28J */ #define CINTERION_PRODUCT_HC28_MDM 0x004C #define CINTERION_PRODUCT_EU3_E 0x0051 #define CINTERION_PRODUCT_EU3_P 0x0052 #define CINTERION_PRODUCT_PH8 0x0053 #define CINTERION_PRODUCT_AHXX 0x0055 #define CINTERION_PRODUCT_PLXX 0x0060 #define CINTERION_PRODUCT_EXS82 0x006c #define CINTERION_PRODUCT_PH8_2RMNET 0x0082 #define CINTERION_PRODUCT_PH8_AUDIO 0x0083 #define CINTERION_PRODUCT_AHXX_2RMNET 0x0084 #define CINTERION_PRODUCT_AHXX_AUDIO 0x0085 #define CINTERION_PRODUCT_CLS8 0x00b0 #define CINTERION_PRODUCT_MV31_MBIM 0x00b3 #define CINTERION_PRODUCT_MV31_RMNET 0x00b7 #define CINTERION_PRODUCT_MV31_2_MBIM 0x00b8 #define CINTERION_PRODUCT_MV31_2_RMNET 0x00b9 #define CINTERION_PRODUCT_MV32_WA 0x00f1 #define CINTERION_PRODUCT_MV32_WB 0x00f2 #define CINTERION_PRODUCT_MV32_WA_RMNET 0x00f3 #define CINTERION_PRODUCT_MV32_WB_RMNET 0x00f4 /* Olivetti products */ #define OLIVETTI_VENDOR_ID 0x0b3c #define OLIVETTI_PRODUCT_OLICARD100 0xc000 #define OLIVETTI_PRODUCT_OLICARD120 0xc001 #define OLIVETTI_PRODUCT_OLICARD140 0xc002 #define OLIVETTI_PRODUCT_OLICARD145 0xc003 #define OLIVETTI_PRODUCT_OLICARD155 0xc004 #define OLIVETTI_PRODUCT_OLICARD200 0xc005 #define OLIVETTI_PRODUCT_OLICARD160 0xc00a #define OLIVETTI_PRODUCT_OLICARD500 0xc00b /* Celot products */ #define CELOT_VENDOR_ID 0x211f #define CELOT_PRODUCT_CT680M 0x6801 /* Samsung products */ #define SAMSUNG_VENDOR_ID 0x04e8 #define SAMSUNG_PRODUCT_GT_B3730 0x6889 /* YUGA products www.yuga-info.com gavin.kx@qq.com */ #define YUGA_VENDOR_ID 0x257A #define YUGA_PRODUCT_CEM600 0x1601 #define YUGA_PRODUCT_CEM610 0x1602 #define YUGA_PRODUCT_CEM500 0x1603 #define YUGA_PRODUCT_CEM510 0x1604 #define YUGA_PRODUCT_CEM800 0x1605 #define YUGA_PRODUCT_CEM900 0x1606 #define YUGA_PRODUCT_CEU818 0x1607 #define YUGA_PRODUCT_CEU816 0x1608 #define YUGA_PRODUCT_CEU828 0x1609 #define YUGA_PRODUCT_CEU826 0x160A #define YUGA_PRODUCT_CEU518 0x160B #define YUGA_PRODUCT_CEU516 0x160C #define YUGA_PRODUCT_CEU528 0x160D #define YUGA_PRODUCT_CEU526 0x160F #define YUGA_PRODUCT_CEU881 0x161F #define YUGA_PRODUCT_CEU882 0x162F #define YUGA_PRODUCT_CWM600 0x2601 #define 
YUGA_PRODUCT_CWM610 0x2602 #define YUGA_PRODUCT_CWM500 0x2603 #define YUGA_PRODUCT_CWM510 0x2604 #define YUGA_PRODUCT_CWM800 0x2605 #define YUGA_PRODUCT_CWM900 0x2606 #define YUGA_PRODUCT_CWU718 0x2607 #define YUGA_PRODUCT_CWU716 0x2608 #define YUGA_PRODUCT_CWU728 0x2609 #define YUGA_PRODUCT_CWU726 0x260A #define YUGA_PRODUCT_CWU518 0x260B #define YUGA_PRODUCT_CWU516 0x260C #define YUGA_PRODUCT_CWU528 0x260D #define YUGA_PRODUCT_CWU581 0x260E #define YUGA_PRODUCT_CWU526 0x260F #define YUGA_PRODUCT_CWU582 0x261F #define YUGA_PRODUCT_CWU583 0x262F #define YUGA_PRODUCT_CLM600 0x3601 #define YUGA_PRODUCT_CLM610 0x3602 #define YUGA_PRODUCT_CLM500 0x3603 #define YUGA_PRODUCT_CLM510 0x3604 #define YUGA_PRODUCT_CLM800 0x3605 #define YUGA_PRODUCT_CLM900 0x3606 #define YUGA_PRODUCT_CLU718 0x3607 #define YUGA_PRODUCT_CLU716 0x3608 #define YUGA_PRODUCT_CLU728 0x3609 #define YUGA_PRODUCT_CLU726 0x360A #define YUGA_PRODUCT_CLU518 0x360B #define YUGA_PRODUCT_CLU516 0x360C #define YUGA_PRODUCT_CLU528 0x360D #define YUGA_PRODUCT_CLU526 0x360F /* Viettel products */ #define VIETTEL_VENDOR_ID 0x2262 #define VIETTEL_PRODUCT_VT1000 0x0002 /* ZD Incorporated */ #define ZD_VENDOR_ID 0x0685 #define ZD_PRODUCT_7000 0x7000 /* LG products */ #define LG_VENDOR_ID 0x1004 #define LG_PRODUCT_L02C 0x618f /* MediaTek products */ #define MEDIATEK_VENDOR_ID 0x0e8d #define MEDIATEK_PRODUCT_DC_1COM 0x00a0 #define MEDIATEK_PRODUCT_DC_4COM 0x00a5 #define MEDIATEK_PRODUCT_DC_4COM2 0x00a7 #define MEDIATEK_PRODUCT_DC_5COM 0x00a4 #define MEDIATEK_PRODUCT_7208_1COM 0x7101 #define MEDIATEK_PRODUCT_7208_2COM 0x7102 #define MEDIATEK_PRODUCT_7103_2COM 0x7103 #define MEDIATEK_PRODUCT_7106_2COM 0x7106 #define MEDIATEK_PRODUCT_FP_1COM 0x0003 #define MEDIATEK_PRODUCT_FP_2COM 0x0023 #define MEDIATEK_PRODUCT_FPDC_1COM 0x0043 #define MEDIATEK_PRODUCT_FPDC_2COM 0x0033 /* Cellient products */ #define CELLIENT_VENDOR_ID 0x2692 #define CELLIENT_PRODUCT_MEN200 0x9005 #define CELLIENT_PRODUCT_MPL200 0x9025 /* Hyundai Petatel Inc. 
products */ #define PETATEL_VENDOR_ID 0x1ff4 #define PETATEL_PRODUCT_NP10T_600A 0x600a #define PETATEL_PRODUCT_NP10T_600E 0x600e /* TP-LINK Incorporated products */ #define TPLINK_VENDOR_ID 0x2357 #define TPLINK_PRODUCT_LTE 0x000D #define TPLINK_PRODUCT_MA180 0x0201 /* Changhong products */ #define CHANGHONG_VENDOR_ID 0x2077 #define CHANGHONG_PRODUCT_CH690 0x7001 /* Inovia */ #define INOVIA_VENDOR_ID 0x20a6 #define INOVIA_SEW858 0x1105 /* VIA Telecom */ #define VIATELECOM_VENDOR_ID 0x15eb #define VIATELECOM_PRODUCT_CDS7 0x0001 /* WeTelecom products */ #define WETELECOM_VENDOR_ID 0x22de #define WETELECOM_PRODUCT_WMD200 0x6801 #define WETELECOM_PRODUCT_6802 0x6802 #define WETELECOM_PRODUCT_WMD300 0x6803 /* OPPO products */ #define OPPO_VENDOR_ID 0x22d9 #define OPPO_PRODUCT_R11 0x276c /* Sierra Wireless products */ #define SIERRA_VENDOR_ID 0x1199 #define SIERRA_PRODUCT_EM9191 0x90d3 #define SIERRA_PRODUCT_EM9291 0x90e3 /* UNISOC (Spreadtrum) products */ #define UNISOC_VENDOR_ID 0x1782 /* TOZED LT70-C based on UNISOC SL8563 uses UNISOC's vendor ID */ #define TOZED_PRODUCT_LT70C 0x4055 #define UNISOC_PRODUCT_UIS7720 0x4064 /* Luat Air72*U series based on UNISOC UIS8910 uses UNISOC's vendor ID */ #define LUAT_PRODUCT_AIR720U 0x4e00 /* Device flags */ /* Highest interface number which can be used with NCTRL() and RSVD() */ #define FLAG_IFNUM_MAX 7 /* Interface does not support modem-control requests */ #define NCTRL(ifnum) ((BIT(ifnum) & 0xff) << 8) /* Interface is reserved */ #define RSVD(ifnum) ((BIT(ifnum) & 0xff) << 0) /* Interface must have two endpoints */ #define NUMEP2 BIT(16) /* Device needs ZLP */ #define ZLP BIT(17) static const struct usb_device_id option_ids[] = { { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_COLT) }, { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_RICOLA) }, { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_RICOLA_LIGHT) }, { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_RICOLA_QUAD) }, { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_RICOLA_QUAD_LIGHT) }, { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_RICOLA_NDIS) }, { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_RICOLA_NDIS_LIGHT) }, { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_RICOLA_NDIS_QUAD) }, { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_RICOLA_NDIS_QUAD_LIGHT) }, { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_COBRA) }, { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_COBRA_BUS) }, { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_VIPER) }, { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_VIPER_BUS) }, { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_GT_MAX_READY) }, { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_FUJI_MODEM_LIGHT) }, { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_FUJI_MODEM_GT) }, { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_FUJI_MODEM_EX) }, { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_KOI_MODEM) }, { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_SCORPION_MODEM) }, { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_ETNA_MODEM) }, { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_ETNA_MODEM_LITE) }, { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_ETNA_MODEM_GT) }, { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_ETNA_MODEM_EX) }, { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_ETNA_KOI_MODEM) }, { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_GTM380_MODEM) }, { USB_DEVICE(QUANTA_VENDOR_ID, QUANTA_PRODUCT_Q101) }, { USB_DEVICE(QUANTA_VENDOR_ID, QUANTA_PRODUCT_Q111) }, { USB_DEVICE(QUANTA_VENDOR_ID, QUANTA_PRODUCT_GLX) }, { USB_DEVICE(QUANTA_VENDOR_ID, QUANTA_PRODUCT_GKE) }, { USB_DEVICE(QUANTA_VENDOR_ID, QUANTA_PRODUCT_GLE) }, { 
USB_DEVICE(QUANTA_VENDOR_ID, 0xea42), .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0x1c05, USB_CLASS_COMM, 0x02, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0x1c1f, USB_CLASS_COMM, 0x02, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0x1c23, USB_CLASS_COMM, 0x02, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E173, 0xff, 0xff, 0xff), .driver_info = RSVD(1) }, { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E173S6, 0xff, 0xff, 0xff), .driver_info = RSVD(1) }, { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1750, 0xff, 0xff, 0xff), .driver_info = RSVD(2) }, { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0x1441, USB_CLASS_COMM, 0x02, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0x1442, USB_CLASS_COMM, 0x02, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_K4505, 0xff, 0xff, 0xff), .driver_info = RSVD(1) | RSVD(2) }, { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_K3765, 0xff, 0xff, 0xff), .driver_info = RSVD(1) | RSVD(2) }, { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0x14ac, 0xff, 0xff, 0xff), /* Huawei E1820 */ .driver_info = RSVD(1) }, { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_K4605, 0xff, 0xff, 0xff), .driver_info = RSVD(1) | RSVD(2) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0xff, 0xff) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x01) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x02) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x03) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x04) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x05) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x06) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x0A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x0B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x0D) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x0E) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x0F) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x10) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x12) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x13) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x14) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x15) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x17) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x18) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x19) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x1A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x1B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x1C) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x31) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x32) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x33) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x34) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x35) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x36) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x3A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 
0x3B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x3D) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x3E) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x3F) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x48) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x49) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x4A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x4B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x4C) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x61) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x62) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x63) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x64) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x65) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x66) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x6A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x6B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x6D) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x6E) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x6F) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x72) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x73) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x74) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x75) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x78) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x79) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x7A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x7B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x7C) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x01) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x02) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x03) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x04) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x05) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x06) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x0A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x0B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x0D) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x0E) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x0F) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x10) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x12) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x13) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x14) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x15) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x17) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x18) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x19) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x1A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x1B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 
0x1C) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x31) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x32) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x33) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x34) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x35) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x36) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x3A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x3B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x3D) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x3E) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x3F) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x48) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x49) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x4A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x4B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x4C) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x61) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x62) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x63) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x64) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x65) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x66) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x6A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x6B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x6D) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x6E) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x6F) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x72) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x73) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x74) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x75) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x78) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x79) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x7A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x7B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x7C) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x01) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x02) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x03) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x04) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x05) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x06) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x0A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x0B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x0D) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x0E) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x0F) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x10) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x12) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 
0x13) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x14) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x15) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x17) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x18) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x19) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x1A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x1B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x1C) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x31) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x32) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x33) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x34) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x35) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x36) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x3A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x3B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x3D) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x3E) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x3F) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x48) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x49) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x4A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x4B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x4C) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x61) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x62) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x63) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x64) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x65) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x66) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x6A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x6B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x6D) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x6E) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x6F) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x72) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x73) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x74) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x75) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x78) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x79) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x7A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x7B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x03, 0x7C) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x01) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x02) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x03) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x04) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x05) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 
0x06) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x0A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x0B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x0D) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x0E) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x0F) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x10) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x12) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x13) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x14) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x15) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x17) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x18) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x19) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x1A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x1B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x1C) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x31) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x32) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x33) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x34) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x35) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x36) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x3A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x3B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x3D) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x3E) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x3F) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x48) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x49) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x4A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x4B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x4C) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x61) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x62) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x63) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x64) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x65) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x66) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x6A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x6B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x6D) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x6E) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x6F) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x72) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x73) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x74) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x75) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x78) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x79) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 
0x7A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x7B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x04, 0x7C) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x01) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x02) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x03) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x04) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x05) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x06) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x0A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x0B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x0D) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x0E) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x0F) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x10) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x12) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x13) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x14) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x15) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x17) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x18) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x19) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x1A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x1B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x1C) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x31) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x32) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x33) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x34) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x35) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x36) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x3A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x3B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x3D) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x3E) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x3F) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x48) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x49) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x4A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x4B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x4C) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x61) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x62) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x63) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x64) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x65) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x66) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x6A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x6B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x6D) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 
0x6E) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x6F) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x72) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x73) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x74) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x75) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x78) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x79) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x7A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x7B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x05, 0x7C) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x01) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x02) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x03) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x04) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x05) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x06) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x0A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x0B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x0D) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x0E) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x0F) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x10) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x12) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x13) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x14) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x15) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x17) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x18) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x19) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x1A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x1B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x1C) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x31) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x32) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x33) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x34) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x35) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x36) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x3A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x3B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x3D) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x3E) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x3F) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x48) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x49) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x4A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x4B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x4C) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x61) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 
0x62) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x63) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x64) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x65) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x66) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x6A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x6B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x6D) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x6E) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x6F) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x72) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x73) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x74) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x75) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x78) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x79) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x7A) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x7B) }, { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x06, 0x7C) }, /* Motorola devices */ { USB_DEVICE_AND_INTERFACE_INFO(0x22b8, 0x2a70, 0xff, 0xff, 0xff) }, /* mdm6600 */ { USB_DEVICE_AND_INTERFACE_INFO(0x22b8, 0x2e0a, 0xff, 0xff, 0xff) }, /* mdm9600 */ { USB_DEVICE_AND_INTERFACE_INFO(0x22b8, 0x4281, 0x0a, 0x00, 0xfc) }, /* mdm ram dl */ { USB_DEVICE_AND_INTERFACE_INFO(0x22b8, 0x900e, 0xff, 0xff, 0xff) }, /* mdm qc dl */ { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_V640) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_V620) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_V740) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_V720) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_U730) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_U740) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_U870) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_XU870) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_X950D) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_EV620) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_ES720) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_E725) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_ES620) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_EU730) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_EU740) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_EU870D) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_MC950D) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_MC727) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_OVMC760) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_MC780) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_EVDO_FULLSPEED) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_HSPA_FULLSPEED) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_EVDO_EMBEDDED_FULLSPEED) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_HSPA_EMBEDDED_FULLSPEED) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_EVDO_HIGHSPEED) 
}, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_HSPA_HIGHSPEED) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_HSPA_HIGHSPEED3) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_HSPA_HIGHSPEED4) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_HSPA_HIGHSPEED5) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_HSPA_HIGHSPEED6) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_HSPA_HIGHSPEED7) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_MC996D) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_MF3470) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_MC547) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_EVDO_EMBEDDED_HIGHSPEED) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_HSPA_EMBEDDED_HIGHSPEED) }, { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_G2) }, /* Novatel Ovation MC551 a.k.a. Verizon USB551L */ { USB_DEVICE_AND_INTERFACE_INFO(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_MC551, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_E362, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_E371, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_U620L, 0xff, 0x00, 0x00) }, { USB_DEVICE(AMOI_VENDOR_ID, AMOI_PRODUCT_H01) }, { USB_DEVICE(AMOI_VENDOR_ID, AMOI_PRODUCT_H01A) }, { USB_DEVICE(AMOI_VENDOR_ID, AMOI_PRODUCT_H02) }, { USB_DEVICE(AMOI_VENDOR_ID, AMOI_PRODUCT_SKYPEPHONE_S2) }, { USB_DEVICE(DELL_VENDOR_ID, DELL_PRODUCT_5700_MINICARD) }, /* Dell Wireless 5700 Mobile Broadband CDMA/EVDO Mini-Card == Novatel Expedite EV620 CDMA/EV-DO */ { USB_DEVICE(DELL_VENDOR_ID, DELL_PRODUCT_5500_MINICARD) }, /* Dell Wireless 5500 Mobile Broadband HSDPA Mini-Card == Novatel Expedite EU740 HSDPA/3G */ { USB_DEVICE(DELL_VENDOR_ID, DELL_PRODUCT_5505_MINICARD) }, /* Dell Wireless 5505 Mobile Broadband HSDPA Mini-Card == Novatel Expedite EU740 HSDPA/3G */ { USB_DEVICE(DELL_VENDOR_ID, DELL_PRODUCT_5700_EXPRESSCARD) }, /* Dell Wireless 5700 Mobile Broadband CDMA/EVDO ExpressCard == Novatel Merlin XV620 CDMA/EV-DO */ { USB_DEVICE(DELL_VENDOR_ID, DELL_PRODUCT_5510_EXPRESSCARD) }, /* Dell Wireless 5510 Mobile Broadband HSDPA ExpressCard == Novatel Merlin XU870 HSDPA/3G */ { USB_DEVICE(DELL_VENDOR_ID, DELL_PRODUCT_5700_MINICARD_SPRINT) }, /* Dell Wireless 5700 Mobile Broadband CDMA/EVDO Mini-Card == Novatel Expedite E720 CDMA/EV-DO */ { USB_DEVICE(DELL_VENDOR_ID, DELL_PRODUCT_5700_MINICARD_TELUS) }, /* Dell Wireless 5700 Mobile Broadband CDMA/EVDO Mini-Card == Novatel Expedite ET620 CDMA/EV-DO */ { USB_DEVICE(DELL_VENDOR_ID, DELL_PRODUCT_5720_MINICARD_VZW) }, /* Dell Wireless 5720 == Novatel EV620 CDMA/EV-DO */ { USB_DEVICE(DELL_VENDOR_ID, DELL_PRODUCT_5720_MINICARD_SPRINT) }, /* Dell Wireless 5720 == Novatel EV620 CDMA/EV-DO */ { USB_DEVICE(DELL_VENDOR_ID, DELL_PRODUCT_5720_MINICARD_TELUS) }, /* Dell Wireless 5720 == Novatel EV620 CDMA/EV-DO */ { USB_DEVICE(DELL_VENDOR_ID, DELL_PRODUCT_5520_MINICARD_CINGULAR) }, /* Dell Wireless HSDPA 5520 == Novatel Expedite EU860D */ { USB_DEVICE(DELL_VENDOR_ID, DELL_PRODUCT_5520_MINICARD_GENERIC_L) }, /* Dell Wireless HSDPA 5520 */ { USB_DEVICE(DELL_VENDOR_ID, DELL_PRODUCT_5520_MINICARD_GENERIC_I) }, /* Dell Wireless 5520 Voda I Mobile Broadband (3G HSDPA) Minicard */ { USB_DEVICE(DELL_VENDOR_ID, 
DELL_PRODUCT_5730_MINICARD_SPRINT) }, /* Dell Wireless 5730 Mobile Broadband EVDO/HSPA Mini-Card */ { USB_DEVICE(DELL_VENDOR_ID, DELL_PRODUCT_5730_MINICARD_TELUS) }, /* Dell Wireless 5730 Mobile Broadband EVDO/HSPA Mini-Card */ { USB_DEVICE(DELL_VENDOR_ID, DELL_PRODUCT_5730_MINICARD_VZW) }, /* Dell Wireless 5730 Mobile Broadband EVDO/HSPA Mini-Card */ { USB_DEVICE_AND_INTERFACE_INFO(DELL_VENDOR_ID, DELL_PRODUCT_5800_MINICARD_VZW, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(DELL_VENDOR_ID, DELL_PRODUCT_5800_V2_MINICARD_VZW, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(DELL_VENDOR_ID, DELL_PRODUCT_5804_MINICARD_ATT, 0xff, 0xff, 0xff) }, { USB_DEVICE(DELL_VENDOR_ID, DELL_PRODUCT_5821E), .driver_info = RSVD(0) | RSVD(1) | RSVD(6) }, { USB_DEVICE(DELL_VENDOR_ID, DELL_PRODUCT_5821E_ESIM), .driver_info = RSVD(0) | RSVD(1) | RSVD(6) }, { USB_DEVICE(DELL_VENDOR_ID, DELL_PRODUCT_5829E), .driver_info = RSVD(0) | RSVD(6) }, { USB_DEVICE(DELL_VENDOR_ID, DELL_PRODUCT_5829E_ESIM), .driver_info = RSVD(0) | RSVD(6) }, { USB_DEVICE_INTERFACE_CLASS(DELL_VENDOR_ID, DELL_PRODUCT_FM101R, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(DELL_VENDOR_ID, DELL_PRODUCT_FM101R_ESIM, 0xff) }, { USB_DEVICE(ANYDATA_VENDOR_ID, ANYDATA_PRODUCT_ADU_E100A) }, /* ADU-E100, ADU-310 */ { USB_DEVICE(ANYDATA_VENDOR_ID, ANYDATA_PRODUCT_ADU_500A) }, { USB_DEVICE(ANYDATA_VENDOR_ID, ANYDATA_PRODUCT_ADU_620UW) }, { USB_DEVICE(AXESSTEL_VENDOR_ID, AXESSTEL_PRODUCT_MV110H) }, { USB_DEVICE(YISO_VENDOR_ID, YISO_PRODUCT_U893) }, { USB_DEVICE_INTERFACE_CLASS(BANDRICH_VENDOR_ID, BANDRICH_PRODUCT_C100_1, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(BANDRICH_VENDOR_ID, BANDRICH_PRODUCT_C100_2, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(BANDRICH_VENDOR_ID, BANDRICH_PRODUCT_1004, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(BANDRICH_VENDOR_ID, BANDRICH_PRODUCT_1005, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(BANDRICH_VENDOR_ID, BANDRICH_PRODUCT_1006, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(BANDRICH_VENDOR_ID, BANDRICH_PRODUCT_1007, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(BANDRICH_VENDOR_ID, BANDRICH_PRODUCT_1008, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(BANDRICH_VENDOR_ID, BANDRICH_PRODUCT_1009, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(BANDRICH_VENDOR_ID, BANDRICH_PRODUCT_100A, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(BANDRICH_VENDOR_ID, BANDRICH_PRODUCT_100B, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(BANDRICH_VENDOR_ID, BANDRICH_PRODUCT_100C, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(BANDRICH_VENDOR_ID, BANDRICH_PRODUCT_100D, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(BANDRICH_VENDOR_ID, BANDRICH_PRODUCT_100E, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(BANDRICH_VENDOR_ID, BANDRICH_PRODUCT_100F, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(BANDRICH_VENDOR_ID, BANDRICH_PRODUCT_1010, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(BANDRICH_VENDOR_ID, BANDRICH_PRODUCT_1011, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(BANDRICH_VENDOR_ID, BANDRICH_PRODUCT_1012, 0xff) }, { USB_DEVICE(KYOCERA_VENDOR_ID, KYOCERA_PRODUCT_KPC650) }, { USB_DEVICE(KYOCERA_VENDOR_ID, KYOCERA_PRODUCT_KPC680) }, { USB_DEVICE(QUALCOMM_VENDOR_ID, 0x6000)}, /* ZTE AC8700 */ { USB_DEVICE_AND_INTERFACE_INFO(QUALCOMM_VENDOR_ID, 0x6001, 0xff, 0xff, 0xff), /* 4G LTE usb-modem U901 */ .driver_info = RSVD(3) }, { USB_DEVICE(QUALCOMM_VENDOR_ID, 0x6613)}, /* Onda H600/ZTE MF330 */ { USB_DEVICE(QUALCOMM_VENDOR_ID, 0x0023)}, /* ONYX 3G device */ { USB_DEVICE(QUALCOMM_VENDOR_ID, 0x9000), /* SIMCom SIM5218 */ .driver_info = NCTRL(0) | NCTRL(1) | NCTRL(2) | NCTRL(3) | RSVD(4) }, /* Quectel products using Qualcomm vendor ID */ { USB_DEVICE(QUALCOMM_VENDOR_ID, 
QUECTEL_PRODUCT_UC15)}, { USB_DEVICE(QUALCOMM_VENDOR_ID, QUECTEL_PRODUCT_UC20), .driver_info = RSVD(4) }, /* Yuga products use Qualcomm vendor ID */ { USB_DEVICE(QUALCOMM_VENDOR_ID, YUGA_PRODUCT_CLM920_NC5), .driver_info = RSVD(1) | RSVD(4) }, /* u-blox products using Qualcomm vendor ID */ { USB_DEVICE(QUALCOMM_VENDOR_ID, UBLOX_PRODUCT_R410M), .driver_info = RSVD(1) | RSVD(3) }, { USB_DEVICE(QUALCOMM_VENDOR_ID, 0x908b), /* u-blox LARA-R6 00B */ .driver_info = RSVD(4) }, { USB_DEVICE(QUALCOMM_VENDOR_ID, 0x90fa), .driver_info = RSVD(3) }, /* u-blox products */ { USB_DEVICE(UBLOX_VENDOR_ID, 0x1311) }, /* u-blox LARA-R6 01B */ { USB_DEVICE(UBLOX_VENDOR_ID, 0x1312), /* u-blox LARA-R6 01B (RMNET) */ .driver_info = RSVD(4) }, { USB_DEVICE_INTERFACE_CLASS(UBLOX_VENDOR_ID, 0x1313, 0xff) }, /* u-blox LARA-R6 01B (ECM) */ { USB_DEVICE(UBLOX_VENDOR_ID, 0x1341) }, /* u-blox LARA-L6 */ { USB_DEVICE(UBLOX_VENDOR_ID, 0x1342), /* u-blox LARA-L6 (RMNET) */ .driver_info = RSVD(4) }, { USB_DEVICE(UBLOX_VENDOR_ID, 0x1343), /* u-blox LARA-L6 (ECM) */ .driver_info = RSVD(4) }, /* Quectel products using Quectel vendor ID */ { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC21, 0xff, 0xff, 0xff), .driver_info = NUMEP2 }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC21, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC25, 0xff, 0xff, 0xff), .driver_info = NUMEP2 }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC25, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EG91, 0xff, 0xff, 0xff), .driver_info = NUMEP2 }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EG91, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EG95, 0xff, 0xff, 0xff), .driver_info = NUMEP2 }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EG95, 0xff, 0, 0) }, { USB_DEVICE_INTERFACE_CLASS(QUECTEL_VENDOR_ID, 0x0203, 0xff), /* BG95-M3 */ .driver_info = ZLP }, { USB_DEVICE(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_BG96), .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EP06, 0xff, 0xff, 0xff), .driver_info = RSVD(1) | RSVD(2) | RSVD(3) | RSVD(4) | NUMEP2 }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EP06, 0xff, 0, 0) }, { USB_DEVICE_INTERFACE_CLASS(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM05CN, 0xff), .driver_info = RSVD(6) | ZLP }, { USB_DEVICE_INTERFACE_CLASS(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM05CN_SG, 0xff), .driver_info = RSVD(6) | ZLP }, { USB_DEVICE_INTERFACE_CLASS(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM05G, 0xff), .driver_info = RSVD(6) | ZLP }, { USB_DEVICE_INTERFACE_CLASS(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM05GV2, 0xff), .driver_info = RSVD(4) | ZLP }, { USB_DEVICE_INTERFACE_CLASS(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM05G_CS, 0xff), .driver_info = RSVD(6) | ZLP }, { USB_DEVICE_INTERFACE_CLASS(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM05G_GR, 0xff), .driver_info = RSVD(6) | ZLP }, { USB_DEVICE_INTERFACE_CLASS(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM05G_RS, 0xff), .driver_info = RSVD(6) | ZLP }, { USB_DEVICE_INTERFACE_CLASS(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM05G_SG, 0xff), .driver_info = RSVD(6) | ZLP }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K, 0xff, 0xff, 0x40) }, { 
USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_128, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_128, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_128, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_129, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_129, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_129, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_12a, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_12a, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_12a, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_12b, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_12b, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_12b, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_12c, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_12c, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_12c, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LCN, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LCN, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LCN, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LMS, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LMS, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LMS, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LTA, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LTA, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LTA, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LWW, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LWW, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LWW, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM12, 0xff, 0xff, 0xff), .driver_info = RSVD(1) | RSVD(2) | RSVD(3) | RSVD(4) | NUMEP2 }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM12, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, 0x0620, 0xff, 0xff, 0x30) }, /* EM160R-GL */ { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, 0x0620, 0xff, 0, 0) }, { USB_DEVICE_INTERFACE_CLASS(QUECTEL_VENDOR_ID, 0x0700, 0xff), /* BG95 */ .driver_info = RSVD(3) | ZLP }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM500Q, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM500Q, 0xff, 0, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM500Q, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, 
QUECTEL_PRODUCT_RM500Q, 0xff, 0xff, 0x10), .driver_info = ZLP }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM520N, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM520N, 0xff, 0, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM520N, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, 0x0900, 0xff, 0, 0), /* RM500U-CN */ .driver_info = ZLP }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC200A, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC200U, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC200S_CN, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC200T, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EG912Y, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EG916Q, 0xff, 0x00, 0x00) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM500K, 0xff, 0x00, 0x00) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RG650V, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RG650V, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RG255C, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RG255C, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RG255C, 0xff, 0xff, 0x40) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_6001) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_CMU_300) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_6003), .driver_info = RSVD(0) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_6004) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_6005) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_CGU_628A) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_CHE_628S), .driver_info = RSVD(0) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_CMU_301), .driver_info = RSVD(0) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_CHU_628), .driver_info = RSVD(0) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_CHU_628S) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_CDU_680) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_CDU_685A) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_CHU_720S), .driver_info = RSVD(0) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_7002), .driver_info = RSVD(0) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_CHU_629K), .driver_info = RSVD(4) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_7004), .driver_info = RSVD(3) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_7005) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_CGU_629), .driver_info = RSVD(5) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_CHU_629S), .driver_info = RSVD(4) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_CHU_720I), .driver_info = RSVD(0) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_7212), .driver_info = RSVD(0) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_7213), .driver_info = RSVD(0) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_7251), .driver_info = RSVD(1) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_7252), .driver_info = RSVD(1) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_7253), .driver_info = RSVD(1) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_UC864E) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_UC864G) }, { 
USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_CC864_DUAL) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_CC864_SINGLE) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_DE910_DUAL) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_UE910_V2) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1031, 0xff), /* Telit LE910C1-EUX */ .driver_info = NCTRL(0) | RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1033, 0xff), /* Telit LE910C1-EUX (ECM) */ .driver_info = NCTRL(0) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1034, 0xff), /* Telit LE910C4-WWX (rmnet) */ .driver_info = RSVD(2) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1035, 0xff) }, /* Telit LE910C4-WWX (ECM) */ { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1036, 0xff) }, /* Telit LE910C4-WWX */ { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1037, 0xff), /* Telit LE910C4-WWX (rmnet) */ .driver_info = NCTRL(0) | NCTRL(1) | RSVD(4) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1038, 0xff), /* Telit LE910C4-WWX (rmnet) */ .driver_info = NCTRL(0) | RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x103b, 0xff), /* Telit LE910C4-WWX */ .driver_info = NCTRL(0) | NCTRL(1) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x103c, 0xff), /* Telit LE910C4-WWX */ .driver_info = NCTRL(0) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE922_USBCFG0), .driver_info = RSVD(0) | RSVD(1) | NCTRL(2) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE922_USBCFG1), .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE922_USBCFG2), .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE922_USBCFG3), .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) | RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, TELIT_PRODUCT_LE922_USBCFG5, 0xff), .driver_info = RSVD(0) | RSVD(1) | NCTRL(2) | RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1050, 0xff), /* Telit FN980 (rmnet) */ .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1051, 0xff), /* Telit FN980 (MBIM) */ .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1052, 0xff), /* Telit FN980 (RNDIS) */ .driver_info = NCTRL(2) | RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1053, 0xff), /* Telit FN980 (ECM) */ .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1054, 0xff), /* Telit FT980-KS */ .driver_info = NCTRL(2) | RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1055, 0xff), /* Telit FN980 (PCIe) */ .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1056, 0xff), /* Telit FD980 */ .driver_info = NCTRL(2) | RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1057, 0xff), /* Telit FN980 */ .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1058, 0xff), /* Telit FN980 (PCIe) */ .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1060, 0xff), /* Telit LN920 (rmnet) */ .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1061, 0xff), /* Telit LN920 (MBIM) */ .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1062, 0xff), /* Telit LN920 (RNDIS) */ .driver_info = NCTRL(2) | RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1063, 0xff), /* Telit LN920 (ECM) */ .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1070, 0xff), /* 
Telit FN990A (rmnet) */ .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1071, 0xff), /* Telit FN990A (MBIM) */ .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1072, 0xff), /* Telit FN990A (RNDIS) */ .driver_info = NCTRL(2) | RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1073, 0xff), /* Telit FN990A (ECM) */ .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1074, 0xff), /* Telit FN990A (MBIM) */ .driver_info = NCTRL(5) | RSVD(6) | RSVD(7) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1075, 0xff), /* Telit FN990A (PCIe) */ .driver_info = RSVD(0) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1077, 0xff), /* Telit FN990A (rmnet + audio) */ .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1078, 0xff), /* Telit FN990A (MBIM + audio) */ .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1079, 0xff), /* Telit FN990A (RNDIS + audio) */ .driver_info = NCTRL(2) | RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1080, 0xff), /* Telit FE990A (rmnet) */ .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1081, 0xff), /* Telit FE990A (MBIM) */ .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1082, 0xff), /* Telit FE990A (RNDIS) */ .driver_info = NCTRL(2) | RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1083, 0xff), /* Telit FE990A (ECM) */ .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10a0, 0xff), /* Telit FN20C04 (rmnet) */ .driver_info = RSVD(0) | NCTRL(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10a1, 0xff), /* Telit FN20C04 (RNDIS) */ .driver_info = NCTRL(4) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10a2, 0xff), /* Telit FN920C04 (MBIM) */ .driver_info = NCTRL(4) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10a3, 0xff), /* Telit FN920C04 (ECM) */ .driver_info = NCTRL(4) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10a4, 0xff), /* Telit FN20C04 (rmnet) */ .driver_info = RSVD(0) | NCTRL(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10a6, 0xff), /* Telit FN920C04 (RNDIS) */ .driver_info = NCTRL(4) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10a7, 0xff), /* Telit FN920C04 (MBIM) */ .driver_info = NCTRL(4) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10a8, 0xff), /* Telit FN920C04 (ECM) */ .driver_info = NCTRL(4) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10a9, 0xff), /* Telit FN20C04 (rmnet) */ .driver_info = RSVD(0) | NCTRL(2) | RSVD(3) | RSVD(4) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10aa, 0xff), /* Telit FN920C04 (MBIM) */ .driver_info = NCTRL(3) | RSVD(4) | RSVD(5) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10ab, 0xff), /* Telit FN920C04 (RNDIS) */ .driver_info = NCTRL(3) | RSVD(4) | RSVD(5) }, { USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x10b0, 0xff, 0xff, 0x30), /* Telit FE990B (rmnet) */ .driver_info = NCTRL(5) }, { USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x10b0, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x10b0, 0xff, 0xff, 0x60) }, { USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x10b1, 0xff, 0xff, 0x30), /* Telit FE990B (MBIM) */ .driver_info = NCTRL(6) }, { USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x10b1, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x10b1, 0xff, 0xff, 0x60) }, { 
USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x10b2, 0xff, 0xff, 0x30), /* Telit FE990B (RNDIS) */ .driver_info = NCTRL(6) }, { USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x10b2, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x10b2, 0xff, 0xff, 0x60) }, { USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x10b3, 0xff, 0xff, 0x30), /* Telit FE990B (ECM) */ .driver_info = NCTRL(6) }, { USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x10b3, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x10b3, 0xff, 0xff, 0x60) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10c0, 0xff), /* Telit FE910C04 (rmnet) */ .driver_info = RSVD(0) | NCTRL(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10c1, 0xff), /* Telit FE910C04 (RNDIS) */ .driver_info = NCTRL(4) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10c2, 0xff), /* Telit FE910C04 (MBIM) */ .driver_info = NCTRL(4) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10c3, 0xff), /* Telit FE910C04 (ECM) */ .driver_info = NCTRL(4) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10c4, 0xff), /* Telit FE910C04 (rmnet) */ .driver_info = RSVD(0) | NCTRL(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10c5, 0xff), /* Telit FE910C04 (RNDIS) */ .driver_info = NCTRL(4) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10c6, 0xff), /* Telit FE910C04 (MBIM) */ .driver_info = NCTRL(4) }, { USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x10c7, 0xff, 0xff, 0x30), /* Telit FE910C04 (ECM) */ .driver_info = NCTRL(4) }, { USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x10c7, 0xff, 0xff, 0x40) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10c8, 0xff), /* Telit FE910C04 (rmnet) */ .driver_info = RSVD(0) | NCTRL(2) | RSVD(3) | RSVD(4) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10c9, 0xff), /* Telit FE910C04 (MBIM) */ .driver_info = NCTRL(3) | RSVD(4) | RSVD(5) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10cb, 0xff), /* Telit FE910C04 (RNDIS) */ .driver_info = NCTRL(3) | RSVD(4) | RSVD(5) }, { USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x10d0, 0xff, 0xff, 0x30), /* Telit FN990B (rmnet) */ .driver_info = NCTRL(5) }, { USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x10d0, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x10d0, 0xff, 0xff, 0x60) }, { USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x10d1, 0xff, 0xff, 0x30), /* Telit FN990B (MBIM) */ .driver_info = NCTRL(6) }, { USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x10d1, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x10d1, 0xff, 0xff, 0x60) }, { USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x10d2, 0xff, 0xff, 0x30), /* Telit FN990B (RNDIS) */ .driver_info = NCTRL(6) }, { USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x10d2, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x10d2, 0xff, 0xff, 0x60) }, { USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x10d3, 0xff, 0xff, 0x30), /* Telit FN990B (ECM) */ .driver_info = NCTRL(6) }, { USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x10d3, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x10d3, 0xff, 0xff, 0x60) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910), .driver_info = NCTRL(0) | RSVD(1) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910_DUAL_MODEM), .driver_info = NCTRL(0) | RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1102, 0xff), /* Telit ME910 (ECM) */ .driver_info = NCTRL(0) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x110a, 0xff), /* Telit 
ME910G1 */ .driver_info = NCTRL(0) | RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x110b, 0xff), /* Telit ME910G1 (ECM) */ .driver_info = NCTRL(0) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910), .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1203, 0xff), /* Telit LE910Cx (RNDIS) */ .driver_info = NCTRL(2) | RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1204, 0xff), /* Telit LE910Cx (MBIM) */ .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910_USBCFG4), .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920), .driver_info = NCTRL(0) | RSVD(1) | RSVD(5) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920A4_1207) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920A4_1208), .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920A4_1211), .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920A4_1212), .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920A4_1213, 0xff) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920A4_1214), .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) | RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1230, 0xff), /* Telit LE910Cx (rmnet) */ .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1231, 0xff), /* Telit LE910Cx (RNDIS) */ .driver_info = NCTRL(2) | RSVD(3) }, { USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x1250, 0xff, 0x00, 0x00) }, /* Telit LE910Cx (rmnet) */ { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1252, 0xff) }, /* Telit LE910Cx (MBIM) */ { USB_DEVICE(TELIT_VENDOR_ID, 0x1260), .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, { USB_DEVICE(TELIT_VENDOR_ID, 0x1261), .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, { USB_DEVICE(TELIT_VENDOR_ID, 0x1900), /* Telit LN940 (QMI) */ .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1901, 0xff), /* Telit LN940 (MBIM) */ .driver_info = NCTRL(0) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x3000, 0xff), /* Telit FN912 */ .driver_info = RSVD(0) | NCTRL(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x3001, 0xff), /* Telit FN912 */ .driver_info = RSVD(0) | NCTRL(2) | RSVD(3) | RSVD(4) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x7010, 0xff), /* Telit LE910-S1 (RNDIS) */ .driver_info = NCTRL(2) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x7011, 0xff), /* Telit LE910-S1 (ECM) */ .driver_info = NCTRL(2) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x701a, 0xff), /* Telit LE910R1 (RNDIS) */ .driver_info = NCTRL(2) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x701b, 0xff), /* Telit LE910R1 (ECM) */ .driver_info = NCTRL(2) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x9000, 0xff), /* Telit generic core-dump device */ .driver_info = NCTRL(0) }, { USB_DEVICE(TELIT_VENDOR_ID, 0x9010), /* Telit SBL FN980 flashing device */ .driver_info = NCTRL(0) | ZLP }, { USB_DEVICE(TELIT_VENDOR_ID, 0x9200), /* Telit LE910S1 flashing device */ .driver_info = NCTRL(0) | ZLP }, { USB_DEVICE(TELIT_VENDOR_ID, 0x9201), /* Telit LE910R1 flashing device */ .driver_info = NCTRL(0) | ZLP }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, ZTE_PRODUCT_MF622, 0xff, 0xff, 0xff) }, /* ZTE WCDMA products */ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0002, 0xff, 0xff, 0xff), .driver_info = RSVD(1) }, { 
USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0003, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0004, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0005, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0006, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0008, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0009, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x000a, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x000b, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x000c, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x000d, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x000e, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x000f, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0010, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0011, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0012, 0xff, 0xff, 0xff), .driver_info = RSVD(1) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0013, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, ZTE_PRODUCT_MF628, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0016, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0017, 0xff, 0xff, 0xff), .driver_info = RSVD(3) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0018, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0019, 0xff, 0xff, 0xff), .driver_info = RSVD(3) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0020, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0021, 0xff, 0xff, 0xff), .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0022, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0023, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0024, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0025, 0xff, 0xff, 0xff), .driver_info = RSVD(1) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0028, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0029, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0030, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, ZTE_PRODUCT_MF626, 0xff, 0xff, 0xff), .driver_info = NCTRL(0) | NCTRL(1) | RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0032, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0033, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0034, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0037, 0xff, 0xff, 0xff), .driver_info = NCTRL(0) | NCTRL(1) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0038, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0039, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0040, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0042, 0xff, 0xff, 0xff), .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0043, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0044, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0048, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0049, 0xff, 0xff, 0xff), .driver_info = RSVD(5) }, { 
USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0050, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0051, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0052, 0xff, 0xff, 0xff), .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0054, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0055, 0xff, 0xff, 0xff), .driver_info = RSVD(1) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0056, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0057, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0058, 0xff, 0xff, 0xff), .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0061, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0062, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0063, 0xff, 0xff, 0xff), .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0064, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0065, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0066, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0067, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0069, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0076, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0077, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0078, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0079, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0082, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0083, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0086, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0087, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0088, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0089, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0090, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0091, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0092, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0093, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0094, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0095, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0096, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0097, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0104, 0xff, 0xff, 0xff), .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0105, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0106, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0108, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0113, 0xff, 0xff, 0xff), .driver_info = RSVD(5) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0117, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0118, 0xff, 0xff, 0xff), .driver_info = RSVD(5) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0121, 0xff, 0xff, 0xff), .driver_info = RSVD(5) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0122, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0123, 0xff, 0xff, 0xff), .driver_info = RSVD(4) 
}, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0124, 0xff, 0xff, 0xff), .driver_info = RSVD(5) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0125, 0xff, 0xff, 0xff), .driver_info = RSVD(6) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0126, 0xff, 0xff, 0xff), .driver_info = RSVD(5) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0128, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0135, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0136, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0137, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0139, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0142, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0143, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0144, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0145, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0148, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0151, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0153, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0155, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0156, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0157, 0xff, 0xff, 0xff), .driver_info = RSVD(5) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0158, 0xff, 0xff, 0xff), .driver_info = RSVD(3) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0159, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0161, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0162, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0164, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0165, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0167, 0xff, 0xff, 0xff), .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0189, 0xff, 0xff, 0xff), .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0191, 0xff, 0xff, 0xff), /* ZTE EuFi890 */ .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0196, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0197, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0199, 0xff, 0xff, 0xff), /* ZTE MF820S */ .driver_info = RSVD(1) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0200, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0201, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0254, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0257, 0xff, 0xff, 0xff), /* ZTE MF821 */ .driver_info = RSVD(3) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0265, 0xff, 0xff, 0xff), /* ONDA MT8205 */ .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0284, 0xff, 0xff, 0xff), /* ZTE MF880 */ .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0317, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0326, 0xff, 0xff, 0xff), .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0330, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0395, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0412, 0xff, 0xff, 0xff), /* Telewell TW-LTE 4G */ .driver_info = 
RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0414, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0417, 0xff, 0xff, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(ZTE_VENDOR_ID, 0x0601, 0xff) }, /* GosunCn ZTE WeLink ME3630 (RNDIS mode) */ { USB_DEVICE_INTERFACE_CLASS(ZTE_VENDOR_ID, 0x0602, 0xff) }, /* GosunCn ZTE WeLink ME3630 (MBIM mode) */ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1008, 0xff, 0xff, 0xff), .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1010, 0xff, 0xff, 0xff), .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1012, 0xff, 0xff, 0xff), .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1018, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1021, 0xff, 0xff, 0xff), .driver_info = RSVD(2) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1057, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1058, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1059, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1060, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1061, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1062, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1063, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1064, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1065, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1066, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1067, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1068, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1069, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1070, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1071, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1072, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1073, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1074, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1075, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1076, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1077, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1078, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1079, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1080, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1081, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1082, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1083, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1084, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1085, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1086, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1087, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1088, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1089, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1090, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1091, 0xff, 0xff, 0xff) }, { 
USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1092, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1093, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1094, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1095, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1096, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1097, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1098, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1099, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1100, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1101, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1102, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1103, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1104, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1105, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1106, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1107, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1108, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1109, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1110, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1111, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1112, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1113, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1114, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1115, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1116, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1117, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1118, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1119, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1120, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1121, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1122, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1123, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1124, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1125, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1126, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1127, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1128, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1129, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1130, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1131, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1132, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1133, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1134, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1135, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1136, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1137, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1138, 0xff, 
0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1139, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1140, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1141, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1142, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1143, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1144, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1145, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1146, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1147, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1148, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1149, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1150, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1151, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1152, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1153, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1154, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1155, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1156, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1157, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1158, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1159, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1160, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1161, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1162, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1163, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1164, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1165, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1166, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1167, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1168, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1169, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1170, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1244, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1245, 0xff, 0xff, 0xff), .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1246, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1247, 0xff, 0xff, 0xff), .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1248, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1249, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1250, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1251, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1252, 0xff, 0xff, 0xff), .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1253, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1254, 0xff, 0xff, 0xff), .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1255, 0xff, 0xff, 0xff), .driver_info = RSVD(3) | RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1256, 0xff, 
0xff, 0xff), .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1257, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1258, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1259, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1260, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1261, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1262, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1263, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1264, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1265, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1266, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1267, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1268, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1269, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1270, 0xff, 0xff, 0xff), .driver_info = RSVD(5) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1271, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1272, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1273, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1274, 0xff, 0xff, 0xff) }, { USB_DEVICE(ZTE_VENDOR_ID, 0x1275), /* ZTE P685M */ .driver_info = RSVD(3) | RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1276, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1277, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1278, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1279, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1280, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1281, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1282, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1283, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1284, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1285, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1286, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1287, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1288, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1289, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1290, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1291, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1292, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1293, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1294, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1295, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1296, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1297, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1298, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1299, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1300, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1301, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1302, 
0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1303, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1333, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1401, 0xff, 0xff, 0xff), .driver_info = RSVD(2) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1402, 0xff, 0xff, 0xff), .driver_info = RSVD(2) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1424, 0xff, 0xff, 0xff), .driver_info = RSVD(2) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1425, 0xff, 0xff, 0xff), .driver_info = RSVD(2) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1426, 0xff, 0xff, 0xff), /* ZTE MF91 */ .driver_info = RSVD(2) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1428, 0xff, 0xff, 0xff), /* Telewell TW-LTE 4G v2 */ .driver_info = RSVD(2) }, { USB_DEVICE_INTERFACE_CLASS(ZTE_VENDOR_ID, 0x1476, 0xff) }, /* GosunCn ZTE WeLink ME3630 (ECM/NCM mode) */ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1481, 0xff, 0x00, 0x00) }, /* ZTE MF871A */ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1485, 0xff, 0xff, 0xff), /* ZTE MF286D */ .driver_info = RSVD(5) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1533, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1534, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1535, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1545, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1546, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1547, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1565, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1566, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1567, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1589, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1590, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1591, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1592, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1594, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1596, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1598, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1600, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x2002, 0xff, 0xff, 0xff), .driver_info = NCTRL(0) | NCTRL(1) | NCTRL(2) | RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x2003, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0014, 0xff, 0xff, 0xff) }, /* ZTE CDMA products */ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0027, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0059, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0060, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0070, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0073, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0130, 0xff, 0xff, 0xff), .driver_info = RSVD(1) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0133, 0xff, 0xff, 0xff), .driver_info = RSVD(3) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0141, 0xff, 0xff, 0xff), .driver_info = RSVD(5) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0147, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0152, 0xff, 
0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0168, 0xff, 0xff, 0xff), .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0170, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0176, 0xff, 0xff, 0xff), .driver_info = RSVD(3) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0178, 0xff, 0xff, 0xff), .driver_info = RSVD(3) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff42, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff43, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff44, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff45, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff46, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff47, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff48, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff49, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff4a, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff4b, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff4c, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff4d, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff4e, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff4f, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff50, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff51, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff52, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff53, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff54, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff55, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff56, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff57, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff58, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff59, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff5a, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff5b, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff5c, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff5d, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff5e, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff5f, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff60, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff61, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff62, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff63, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff64, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff65, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff66, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff67, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff68, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff69, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff6a, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 
0xff6b, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff6c, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff6d, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff6e, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff6f, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff70, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff71, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff72, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff73, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff74, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff75, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff76, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff77, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff78, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff79, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff7a, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff7b, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff7c, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff7d, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff7e, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff7f, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff80, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff81, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff82, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff83, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff84, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff85, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff86, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff87, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff88, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff89, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff8a, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff8b, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff8c, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff8d, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff8e, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff8f, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff90, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff91, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff92, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff93, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff94, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff9f, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffa0, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffa1, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffa2, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffa3, 0xff, 0xff, 0xff) }, { 
USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffa4, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffa5, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffa6, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffa7, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffa8, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffa9, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffaa, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffab, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffac, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffae, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffaf, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffb0, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffb1, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffb2, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffb3, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffb4, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffb5, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffb6, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffb7, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffb8, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffb9, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffba, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffbb, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffbc, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffbd, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffbe, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffbf, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffc0, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffc1, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffc2, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffc3, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffc4, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffc5, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffc6, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffc7, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffc8, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffc9, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffca, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffcb, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffcc, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffcd, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffce, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffcf, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffd0, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffd1, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffd2, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffd3, 0xff, 
0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffd4, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffd5, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffe9, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffec, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xffee, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xfff6, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xfff7, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xfff8, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xfff9, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xfffb, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xfffc, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, ZTE_PRODUCT_MG880, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, ZTE_PRODUCT_CDMA_TECH, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, ZTE_PRODUCT_AC2726, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, ZTE_PRODUCT_AC8710T, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, ZTE_PRODUCT_MC2718, 0xff, 0xff, 0xff), .driver_info = NCTRL(1) | NCTRL(2) | NCTRL(3) | NCTRL(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, ZTE_PRODUCT_AD3812, 0xff, 0xff, 0xff), .driver_info = NCTRL(0) | NCTRL(1) | NCTRL(2) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, ZTE_PRODUCT_MC2716, 0xff, 0xff, 0xff), .driver_info = NCTRL(1) | NCTRL(2) | NCTRL(3) }, { USB_DEVICE(ZTE_VENDOR_ID, ZTE_PRODUCT_ME3620_L), .driver_info = RSVD(3) | RSVD(4) | RSVD(5) }, { USB_DEVICE(ZTE_VENDOR_ID, ZTE_PRODUCT_ME3620_MBIM), .driver_info = RSVD(2) | RSVD(3) | RSVD(4) }, { USB_DEVICE(ZTE_VENDOR_ID, ZTE_PRODUCT_ME3620_X), .driver_info = RSVD(3) | RSVD(4) | RSVD(5) }, { USB_DEVICE(ZTE_VENDOR_ID, ZTE_PRODUCT_ZM8620_X), .driver_info = RSVD(3) | RSVD(4) | RSVD(5) }, { USB_VENDOR_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff, 0x02, 0x01) }, { USB_VENDOR_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff, 0x02, 0x05) }, { USB_VENDOR_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff, 0x86, 0x10) }, { USB_DEVICE(BENQ_VENDOR_ID, BENQ_PRODUCT_H10) }, { USB_DEVICE(DLINK_VENDOR_ID, DLINK_PRODUCT_DWM_652) }, { USB_DEVICE(ALINK_VENDOR_ID, DLINK_PRODUCT_DWM_652_U5) }, /* Yes, ALINK_VENDOR_ID */ { USB_DEVICE(ALINK_VENDOR_ID, DLINK_PRODUCT_DWM_652_U5A) }, { USB_DEVICE(QISDA_VENDOR_ID, QISDA_PRODUCT_H21_4512) }, { USB_DEVICE(QISDA_VENDOR_ID, QISDA_PRODUCT_H21_4523) }, { USB_DEVICE(QISDA_VENDOR_ID, QISDA_PRODUCT_H20_4515) }, { USB_DEVICE(QISDA_VENDOR_ID, QISDA_PRODUCT_H20_4518) }, { USB_DEVICE(QISDA_VENDOR_ID, QISDA_PRODUCT_H20_4519) }, { USB_DEVICE(TOSHIBA_VENDOR_ID, TOSHIBA_PRODUCT_G450) }, { USB_DEVICE(TOSHIBA_VENDOR_ID, TOSHIBA_PRODUCT_HSDPA_MINICARD ) }, /* Toshiba 3G HSDPA == Novatel Expedite EU870D MiniCard */ { USB_DEVICE(ALINK_VENDOR_ID, 0x9000) }, { USB_DEVICE(ALINK_VENDOR_ID, ALINK_PRODUCT_PH300) }, { USB_DEVICE_AND_INTERFACE_INFO(ALINK_VENDOR_ID, ALINK_PRODUCT_3GU, 0xff, 0xff, 0xff) }, { USB_DEVICE(ALINK_VENDOR_ID, SIMCOM_PRODUCT_SIM7100E), .driver_info = RSVD(5) | RSVD(6) }, { USB_DEVICE_INTERFACE_CLASS(0x1e0e, 0x9003, 0xff) }, /* Simcom SIM7500/SIM7600 MBIM mode */ { USB_DEVICE_INTERFACE_CLASS(0x1e0e, 0x9011, 0xff), /* Simcom SIM7500/SIM7600 RNDIS mode */ .driver_info = RSVD(7) }, { USB_DEVICE(0x1e0e, 0x9071), /* Simcom SIM8230 RMNET mode */ .driver_info = RSVD(3) | RSVD(4) }, { 
USB_DEVICE_INTERFACE_CLASS(0x1e0e, 0x9078, 0xff), /* Simcom SIM8230 ECM mode */ .driver_info = RSVD(5) }, { USB_DEVICE_INTERFACE_CLASS(0x1e0e, 0x907b, 0xff), /* Simcom SIM8230 RNDIS mode */ .driver_info = RSVD(5) }, { USB_DEVICE_INTERFACE_CLASS(0x1e0e, 0x9205, 0xff) }, /* Simcom SIM7070/SIM7080/SIM7090 AT+ECM mode */ { USB_DEVICE_INTERFACE_CLASS(0x1e0e, 0x9206, 0xff) }, /* Simcom SIM7070/SIM7080/SIM7090 AT-only mode */ { USB_DEVICE(ALCATEL_VENDOR_ID, ALCATEL_PRODUCT_X060S_X200), .driver_info = NCTRL(0) | NCTRL(1) | RSVD(4) }, { USB_DEVICE(ALCATEL_VENDOR_ID, ALCATEL_PRODUCT_X220_X500D), .driver_info = RSVD(6) }, { USB_DEVICE(ALCATEL_VENDOR_ID, 0x0052), .driver_info = RSVD(6) }, { USB_DEVICE(ALCATEL_VENDOR_ID, 0x00b6), .driver_info = RSVD(3) }, { USB_DEVICE(ALCATEL_VENDOR_ID, 0x00b7), .driver_info = RSVD(5) }, { USB_DEVICE(ALCATEL_VENDOR_ID, ALCATEL_PRODUCT_L100V), .driver_info = RSVD(4) }, { USB_DEVICE(ALCATEL_VENDOR_ID, ALCATEL_PRODUCT_L800MA), .driver_info = RSVD(2) }, { USB_DEVICE(AIRPLUS_VENDOR_ID, AIRPLUS_PRODUCT_MCD650) }, { USB_DEVICE(TLAYTECH_VENDOR_ID, TLAYTECH_PRODUCT_TEU800) }, { USB_DEVICE(LONGCHEER_VENDOR_ID, FOUR_G_SYSTEMS_PRODUCT_CARSTICK_LTE), .driver_info = RSVD(0) }, { USB_DEVICE(LONGCHEER_VENDOR_ID, FOUR_G_SYSTEMS_PRODUCT_W14), .driver_info = NCTRL(0) | NCTRL(1) }, { USB_DEVICE(LONGCHEER_VENDOR_ID, FOUR_G_SYSTEMS_PRODUCT_W100), .driver_info = NCTRL(1) | NCTRL(2) | RSVD(3) }, {USB_DEVICE(LONGCHEER_VENDOR_ID, FUJISOFT_PRODUCT_FS040U), .driver_info = RSVD(3)}, { USB_DEVICE_INTERFACE_CLASS(LONGCHEER_VENDOR_ID, SPEEDUP_PRODUCT_SU9800, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(LONGCHEER_VENDOR_ID, 0x9801, 0xff), .driver_info = RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(LONGCHEER_VENDOR_ID, 0x9803, 0xff), .driver_info = RSVD(4) }, { USB_DEVICE(LONGCHEER_VENDOR_ID, 0x9b05), /* Longsung U8300 */ .driver_info = RSVD(4) | RSVD(5) }, { USB_DEVICE(LONGCHEER_VENDOR_ID, 0x9b3c), /* Longsung U9300 */ .driver_info = RSVD(0) | RSVD(4) }, { USB_DEVICE(LONGCHEER_VENDOR_ID, ZOOM_PRODUCT_4597) }, { USB_DEVICE(LONGCHEER_VENDOR_ID, IBALL_3_5G_CONNECT) }, { USB_DEVICE(HAIER_VENDOR_ID, HAIER_PRODUCT_CE100) }, { USB_DEVICE_AND_INTERFACE_INFO(HAIER_VENDOR_ID, HAIER_PRODUCT_CE81B, 0xff, 0xff, 0xff) }, /* Pirelli */ { USB_DEVICE_INTERFACE_CLASS(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_C100_1, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_C100_2, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_1004, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_1005, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_1006, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_1007, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_1008, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_1009, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_100A, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_100B, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_100C, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_100D, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_100E, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_100F, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_1011, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_1012, 0xff) }, /* Cinterion */ { USB_DEVICE(CINTERION_VENDOR_ID, 
CINTERION_PRODUCT_EU3_E) }, { USB_DEVICE(CINTERION_VENDOR_ID, CINTERION_PRODUCT_EU3_P) }, { USB_DEVICE(CINTERION_VENDOR_ID, CINTERION_PRODUCT_PH8), .driver_info = RSVD(4) }, { USB_DEVICE_INTERFACE_CLASS(CINTERION_VENDOR_ID, CINTERION_PRODUCT_AHXX, 0xff) }, { USB_DEVICE(CINTERION_VENDOR_ID, CINTERION_PRODUCT_PLXX), .driver_info = RSVD(4) }, { USB_DEVICE_INTERFACE_CLASS(CINTERION_VENDOR_ID, CINTERION_PRODUCT_PH8_2RMNET, 0xff), .driver_info = RSVD(4) | RSVD(5) }, { USB_DEVICE_INTERFACE_CLASS(CINTERION_VENDOR_ID, CINTERION_PRODUCT_PH8_AUDIO, 0xff), .driver_info = RSVD(4) }, { USB_DEVICE_INTERFACE_CLASS(CINTERION_VENDOR_ID, CINTERION_PRODUCT_AHXX_2RMNET, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(CINTERION_VENDOR_ID, CINTERION_PRODUCT_AHXX_AUDIO, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(CINTERION_VENDOR_ID, CINTERION_PRODUCT_CLS8, 0xff), .driver_info = RSVD(0) | RSVD(4) }, { USB_DEVICE_INTERFACE_CLASS(CINTERION_VENDOR_ID, CINTERION_PRODUCT_EXS82, 0xff) }, { USB_DEVICE(CINTERION_VENDOR_ID, CINTERION_PRODUCT_HC28_MDM) }, { USB_DEVICE(CINTERION_VENDOR_ID, CINTERION_PRODUCT_HC28_MDMNET) }, { USB_DEVICE(SIEMENS_VENDOR_ID, CINTERION_PRODUCT_HC25_MDM) }, { USB_DEVICE(SIEMENS_VENDOR_ID, CINTERION_PRODUCT_HC25_MDMNET) }, { USB_DEVICE(SIEMENS_VENDOR_ID, CINTERION_PRODUCT_HC28_MDM) }, /* HC28 enumerates with Siemens or Cinterion VID depending on FW revision */ { USB_DEVICE(SIEMENS_VENDOR_ID, CINTERION_PRODUCT_HC28_MDMNET) }, { USB_DEVICE_INTERFACE_CLASS(CINTERION_VENDOR_ID, CINTERION_PRODUCT_MV31_MBIM, 0xff), .driver_info = RSVD(3)}, { USB_DEVICE_INTERFACE_CLASS(CINTERION_VENDOR_ID, CINTERION_PRODUCT_MV31_RMNET, 0xff), .driver_info = RSVD(0)}, { USB_DEVICE_INTERFACE_CLASS(CINTERION_VENDOR_ID, CINTERION_PRODUCT_MV31_2_MBIM, 0xff), .driver_info = RSVD(3)}, { USB_DEVICE_INTERFACE_CLASS(CINTERION_VENDOR_ID, CINTERION_PRODUCT_MV31_2_RMNET, 0xff), .driver_info = RSVD(0)}, { USB_DEVICE_INTERFACE_CLASS(CINTERION_VENDOR_ID, CINTERION_PRODUCT_MV32_WA, 0xff), .driver_info = RSVD(3)}, { USB_DEVICE_INTERFACE_CLASS(CINTERION_VENDOR_ID, CINTERION_PRODUCT_MV32_WA_RMNET, 0xff), .driver_info = RSVD(0) }, { USB_DEVICE_INTERFACE_CLASS(CINTERION_VENDOR_ID, CINTERION_PRODUCT_MV32_WB, 0xff), .driver_info = RSVD(3)}, { USB_DEVICE_INTERFACE_CLASS(CINTERION_VENDOR_ID, CINTERION_PRODUCT_MV32_WB_RMNET, 0xff), .driver_info = RSVD(0) }, { USB_DEVICE(OLIVETTI_VENDOR_ID, OLIVETTI_PRODUCT_OLICARD100), .driver_info = RSVD(4) }, { USB_DEVICE(OLIVETTI_VENDOR_ID, OLIVETTI_PRODUCT_OLICARD120), .driver_info = RSVD(4) }, { USB_DEVICE(OLIVETTI_VENDOR_ID, OLIVETTI_PRODUCT_OLICARD140), .driver_info = RSVD(4) }, { USB_DEVICE(OLIVETTI_VENDOR_ID, OLIVETTI_PRODUCT_OLICARD145) }, { USB_DEVICE(OLIVETTI_VENDOR_ID, OLIVETTI_PRODUCT_OLICARD155), .driver_info = RSVD(6) }, { USB_DEVICE(OLIVETTI_VENDOR_ID, OLIVETTI_PRODUCT_OLICARD200), .driver_info = RSVD(6) }, { USB_DEVICE(OLIVETTI_VENDOR_ID, OLIVETTI_PRODUCT_OLICARD160), .driver_info = RSVD(6) }, { USB_DEVICE(OLIVETTI_VENDOR_ID, OLIVETTI_PRODUCT_OLICARD500), .driver_info = RSVD(4) }, { USB_DEVICE(CELOT_VENDOR_ID, CELOT_PRODUCT_CT680M) }, /* CT-650 CDMA 450 1xEVDO modem */ { USB_DEVICE_AND_INTERFACE_INFO(SAMSUNG_VENDOR_ID, SAMSUNG_PRODUCT_GT_B3730, USB_CLASS_CDC_DATA, 0x00, 0x00) }, /* Samsung GT-B3730 LTE USB modem.*/ { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CEM600) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CEM610) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CEM500) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CEM510) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CEM800) }, { USB_DEVICE(YUGA_VENDOR_ID, 
YUGA_PRODUCT_CEM900) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CEU818) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CEU816) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CEU828) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CEU826) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CEU518) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CEU516) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CEU528) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CEU526) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CWM600) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CWM610) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CWM500) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CWM510) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CWM800) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CWM900) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CWU718) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CWU716) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CWU728) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CWU726) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CWU518) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CWU516) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CWU528) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CWU526) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CLM600) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CLM610) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CLM500) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CLM510) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CLM800) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CLM900) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CLU718) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CLU716) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CLU728) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CLU726) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CLU518) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CLU516) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CLU528) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CLU526) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CEU881) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CEU882) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CWU581) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CWU582) }, { USB_DEVICE(YUGA_VENDOR_ID, YUGA_PRODUCT_CWU583) }, { USB_DEVICE_AND_INTERFACE_INFO(VIETTEL_VENDOR_ID, VIETTEL_PRODUCT_VT1000, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZD_VENDOR_ID, ZD_PRODUCT_7000, 0xff, 0xff, 0xff) }, { USB_DEVICE(LG_VENDOR_ID, LG_PRODUCT_L02C) }, /* docomo L-02C modem */ { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, 0x00a1, 0xff, 0x00, 0x00) }, { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, 0x00a1, 0xff, 0x02, 0x01) }, { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, 0x00a2, 0xff, 0x00, 0x00) }, { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, 0x00a2, 0xff, 0x02, 0x01) }, /* MediaTek MT6276M modem & app port */ { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_DC_1COM, 0x0a, 0x00, 0x00) }, { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_DC_5COM, 0xff, 0x02, 0x01) }, { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_DC_5COM, 0xff, 0x00, 0x00) }, { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_DC_4COM, 0xff, 0x02, 0x01) }, { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_DC_4COM, 0xff, 0x00, 0x00) }, { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_7208_1COM, 0x02, 0x00, 0x00) }, { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_7208_2COM, 0x02, 0x02, 0x01) }, { 
USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_FP_1COM, 0x0a, 0x00, 0x00) }, { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_FP_2COM, 0x0a, 0x00, 0x00) }, { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_FPDC_1COM, 0x0a, 0x00, 0x00) }, { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_FPDC_2COM, 0x0a, 0x00, 0x00) }, { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_7103_2COM, 0xff, 0x00, 0x00) }, { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_7106_2COM, 0x02, 0x02, 0x01) }, { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_DC_4COM2, 0xff, 0x02, 0x01) }, { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_DC_4COM2, 0xff, 0x00, 0x00) }, { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, 0x7126, 0xff, 0x00, 0x00), .driver_info = NCTRL(2) }, { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, 0x7127, 0xff, 0x00, 0x00), .driver_info = NCTRL(2) | NCTRL(3) | NCTRL(4) }, { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, 0x7129, 0xff, 0x00, 0x00), /* MediaTek T7XX */ .driver_info = NCTRL(2) | NCTRL(3) | NCTRL(4) }, { USB_DEVICE(CELLIENT_VENDOR_ID, CELLIENT_PRODUCT_MEN200) }, { USB_DEVICE(CELLIENT_VENDOR_ID, CELLIENT_PRODUCT_MPL200), .driver_info = RSVD(1) | RSVD(4) }, { USB_DEVICE(PETATEL_VENDOR_ID, PETATEL_PRODUCT_NP10T_600A) }, { USB_DEVICE(PETATEL_VENDOR_ID, PETATEL_PRODUCT_NP10T_600E) }, { USB_DEVICE_AND_INTERFACE_INFO(TPLINK_VENDOR_ID, TPLINK_PRODUCT_LTE, 0xff, 0x00, 0x00) }, /* TP-Link LTE Module */ { USB_DEVICE(TPLINK_VENDOR_ID, TPLINK_PRODUCT_MA180), .driver_info = RSVD(4) }, { USB_DEVICE(TPLINK_VENDOR_ID, 0x9000), /* TP-Link MA260 */ .driver_info = RSVD(4) }, { USB_DEVICE(CHANGHONG_VENDOR_ID, CHANGHONG_PRODUCT_CH690) }, { USB_DEVICE_INTERFACE_CLASS(0x2001, 0x7d01, 0xff) }, /* D-Link DWM-156 (variant) */ { USB_DEVICE_INTERFACE_CLASS(0x2001, 0x7d02, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(0x2001, 0x7d03, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(0x2001, 0x7d04, 0xff), /* D-Link DWM-158 */ .driver_info = RSVD(4) | RSVD(5) }, { USB_DEVICE_INTERFACE_CLASS(0x2001, 0x7d0e, 0xff) }, /* D-Link DWM-157 C1 */ { USB_DEVICE_INTERFACE_CLASS(0x2001, 0x7e19, 0xff), /* D-Link DWM-221 B1 */ .driver_info = RSVD(4) }, { USB_DEVICE_INTERFACE_CLASS(0x2001, 0x7e35, 0xff), /* D-Link DWM-222 */ .driver_info = RSVD(4) }, { USB_DEVICE_INTERFACE_CLASS(0x2001, 0x7e3d, 0xff), /* D-Link DWM-222 A2 */ .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(0x07d1, 0x3e01, 0xff, 0xff, 0xff) }, /* D-Link DWM-152/C1 */ { USB_DEVICE_AND_INTERFACE_INFO(0x07d1, 0x3e02, 0xff, 0xff, 0xff) }, /* D-Link DWM-156/C1 */ { USB_DEVICE_AND_INTERFACE_INFO(0x07d1, 0x7e11, 0xff, 0xff, 0xff) }, /* D-Link DWM-156/A3 */ { USB_DEVICE_INTERFACE_CLASS(0x1435, 0xd191, 0xff), /* Wistron Neweb D19Q1 */ .driver_info = RSVD(1) | RSVD(4) }, { USB_DEVICE_INTERFACE_CLASS(0x1690, 0x7588, 0xff), /* ASKEY WWHC050 */ .driver_info = RSVD(1) | RSVD(4) }, { USB_DEVICE_INTERFACE_CLASS(0x2020, 0x2031, 0xff), /* Olicard 600 */ .driver_info = RSVD(4) }, { USB_DEVICE_INTERFACE_CLASS(0x2020, 0x2033, 0xff), /* BroadMobi BM806U */ .driver_info = RSVD(4) }, { USB_DEVICE_INTERFACE_CLASS(0x2020, 0x2060, 0xff), /* BroadMobi BM818 */ .driver_info = RSVD(4) }, { USB_DEVICE_INTERFACE_CLASS(0x2020, 0x4000, 0xff) }, /* OLICARD300 - MT6225 */ { USB_DEVICE(INOVIA_VENDOR_ID, INOVIA_SEW858) }, { USB_DEVICE(VIATELECOM_VENDOR_ID, VIATELECOM_PRODUCT_CDS7) }, { USB_DEVICE_AND_INTERFACE_INFO(WETELECOM_VENDOR_ID, 
WETELECOM_PRODUCT_WMD200, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(WETELECOM_VENDOR_ID, WETELECOM_PRODUCT_6802, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(WETELECOM_VENDOR_ID, WETELECOM_PRODUCT_WMD300, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(0x03f0, 0x421d, 0xff, 0xff, 0xff) }, /* HP lt2523 (Novatel E371) */ { USB_DEVICE_AND_INTERFACE_INFO(0x03f0, 0xa31d, 0xff, 0x06, 0x10) }, /* HP lt4132 (Huawei ME906s-158) */ { USB_DEVICE_AND_INTERFACE_INFO(0x03f0, 0xa31d, 0xff, 0x06, 0x12) }, { USB_DEVICE_AND_INTERFACE_INFO(0x03f0, 0xa31d, 0xff, 0x06, 0x13) }, { USB_DEVICE_AND_INTERFACE_INFO(0x03f0, 0xa31d, 0xff, 0x06, 0x14) }, { USB_DEVICE_AND_INTERFACE_INFO(0x03f0, 0xa31d, 0xff, 0x06, 0x1b) }, { USB_DEVICE(0x0489, 0xe0b4), /* Foxconn T77W968 */ .driver_info = RSVD(0) | RSVD(1) | RSVD(6) }, { USB_DEVICE(0x0489, 0xe0b5), /* Foxconn T77W968 ESIM */ .driver_info = RSVD(0) | RSVD(1) | RSVD(6) }, { USB_DEVICE_INTERFACE_CLASS(0x0489, 0xe0da, 0xff), /* Foxconn T99W265 MBIM variant */ .driver_info = RSVD(3) | RSVD(5) }, { USB_DEVICE_INTERFACE_CLASS(0x0489, 0xe0db, 0xff), /* Foxconn T99W265 MBIM */ .driver_info = RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(0x0489, 0xe0ee, 0xff), /* Foxconn T99W368 MBIM */ .driver_info = RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(0x0489, 0xe0f0, 0xff), /* Foxconn T99W373 MBIM */ .driver_info = RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(0x0489, 0xe123, 0xff), /* Foxconn T99W760 MBIM */ .driver_info = RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(0x0489, 0xe145, 0xff), /* Foxconn T99W651 RNDIS */ .driver_info = RSVD(5) | RSVD(6) }, { USB_DEVICE_INTERFACE_CLASS(0x0489, 0xe15f, 0xff), /* Foxconn T99W709 */ .driver_info = RSVD(5) }, { USB_DEVICE_INTERFACE_CLASS(0x0489, 0xe167, 0xff), /* Foxconn T99W640 MBIM */ .driver_info = RSVD(3) }, { USB_DEVICE(0x1508, 0x1001), /* Fibocom NL668 (IOT version) */ .driver_info = RSVD(4) | RSVD(5) | RSVD(6) }, { USB_DEVICE(0x1782, 0x4d10) }, /* Fibocom L610 (AT mode) */ { USB_DEVICE_INTERFACE_CLASS(0x1782, 0x4d11, 0xff) }, /* Fibocom L610 (ECM/RNDIS mode) */ { USB_DEVICE_AND_INTERFACE_INFO(0x2cb7, 0x0001, 0xff, 0xff, 0xff) }, /* Fibocom L716-EU (ECM/RNDIS mode) */ { USB_DEVICE(0x2cb7, 0x0104), /* Fibocom NL678 series */ .driver_info = RSVD(4) | RSVD(5) }, { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0105, 0xff), /* Fibocom NL678 series */ .driver_info = RSVD(6) }, { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0106, 0xff) }, /* Fibocom MA510 (ECM mode w/ diag intf.) 
*/ { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x010a, 0xff) }, /* Fibocom MA510 (ECM mode) */ { USB_DEVICE_AND_INTERFACE_INFO(0x2cb7, 0x010b, 0xff, 0xff, 0x30) }, /* Fibocom FG150 Diag */ { USB_DEVICE_AND_INTERFACE_INFO(0x2cb7, 0x010b, 0xff, 0, 0) }, /* Fibocom FG150 AT */ { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0111, 0xff) }, /* Fibocom FM160 (MBIM mode) */ { USB_DEVICE_AND_INTERFACE_INFO(0x2cb7, 0x0112, 0xff, 0xff, 0x30) }, /* Fibocom FG132 Diag */ { USB_DEVICE_AND_INTERFACE_INFO(0x2cb7, 0x0112, 0xff, 0xff, 0x40) }, /* Fibocom FG132 AT */ { USB_DEVICE_AND_INTERFACE_INFO(0x2cb7, 0x0112, 0xff, 0, 0) }, /* Fibocom FG132 NMEA */ { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0115, 0xff), /* Fibocom FM135 (laptop MBIM) */ .driver_info = RSVD(5) }, { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x01a0, 0xff) }, /* Fibocom NL668-AM/NL652-EU (laptop MBIM) */ { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x01a2, 0xff) }, /* Fibocom FM101-GL (laptop MBIM) */ { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x01a3, 0xff) }, /* Fibocom FM101-GL (laptop MBIM) */ { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x01a4, 0xff), /* Fibocom FM101-GL (laptop MBIM) */ .driver_info = RSVD(4) }, { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0a04, 0xff) }, /* Fibocom FM650-CN (ECM mode) */ { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0a05, 0xff) }, /* Fibocom FM650-CN (NCM mode) */ { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0a06, 0xff) }, /* Fibocom FM650-CN (RNDIS mode) */ { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0a07, 0xff) }, /* Fibocom FM650-CN (MBIM mode) */ { USB_DEVICE_AND_INTERFACE_INFO(0x2dee, 0x4d41, 0xff, 0, 0) }, /* MeiG Smart SLM320 */ { USB_DEVICE_AND_INTERFACE_INFO(0x2dee, 0x4d57, 0xff, 0, 0) }, /* MeiG Smart SLM770A */ { USB_DEVICE_AND_INTERFACE_INFO(0x2dee, 0x4d22, 0xff, 0, 0) }, /* MeiG Smart SRM815 */ { USB_DEVICE_AND_INTERFACE_INFO(0x2dee, 0x4d22, 0xff, 0x10, 0x02) }, /* MeiG Smart SLM828 */ { USB_DEVICE_AND_INTERFACE_INFO(0x2dee, 0x4d22, 0xff, 0x10, 0x03) }, /* MeiG Smart SLM828 */ { USB_DEVICE_AND_INTERFACE_INFO(0x2dee, 0x4d22, 0xff, 0xff, 0x30) }, /* MeiG Smart SRM815 and SRM825L */ { USB_DEVICE_AND_INTERFACE_INFO(0x2dee, 0x4d22, 0xff, 0xff, 0x40) }, /* MeiG Smart SRM825L */ { USB_DEVICE_AND_INTERFACE_INFO(0x2dee, 0x4d22, 0xff, 0xff, 0x60) }, /* MeiG Smart SRM825L */ { USB_DEVICE_AND_INTERFACE_INFO(0x2dee, 0x4d38, 0xff, 0xff, 0x30) }, /* MeiG Smart SRM825WN (Diag) */ { USB_DEVICE_AND_INTERFACE_INFO(0x2dee, 0x4d38, 0xff, 0xff, 0x40) }, /* MeiG Smart SRM825WN (AT) */ { USB_DEVICE_AND_INTERFACE_INFO(0x2dee, 0x4d38, 0xff, 0xff, 0x60) }, /* MeiG Smart SRM825WN (NMEA) */ { USB_DEVICE_INTERFACE_CLASS(0x2df3, 0x9d03, 0xff) }, /* LongSung M5710 */ { USB_DEVICE_INTERFACE_CLASS(0x305a, 0x1404, 0xff) }, /* GosunCn GM500 RNDIS */ { USB_DEVICE_INTERFACE_CLASS(0x305a, 0x1405, 0xff) }, /* GosunCn GM500 MBIM */ { USB_DEVICE_INTERFACE_CLASS(0x305a, 0x1406, 0xff) }, /* GosunCn GM500 ECM/NCM */ { USB_DEVICE(0x33f8, 0x0104), /* Rolling RW101-GL (laptop RMNET) */ .driver_info = RSVD(4) | RSVD(5) }, { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x0115, 0xff), /* Rolling RW135-GL (laptop MBIM) */ .driver_info = RSVD(5) }, { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x01a2, 0xff) }, /* Rolling RW101-GL (laptop MBIM) */ { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x01a3, 0xff) }, /* Rolling RW101-GL (laptop MBIM) */ { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x01a4, 0xff), /* Rolling RW101-GL (laptop MBIM) */ .driver_info = RSVD(4) }, { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x01a8, 0xff), /* Rolling RW101R-GL (laptop MBIM) */ .driver_info = RSVD(4) }, { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x01a9, 0xff), /* Rolling 
RW101R-GL (laptop MBIM) */ .driver_info = RSVD(4) }, { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x0301, 0xff) }, /* Rolling RW101R-GL (laptop MBIM) */ { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x0302, 0xff) }, /* Rolling RW101R-GL (laptop MBIM) */ { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x0802, 0xff), /* Rolling RW350-GL (laptop MBIM) */ .driver_info = RSVD(5) }, { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x1003, 0xff) }, /* Rolling RW135R-GL (laptop MBIM) */ { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0100, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WWD for Global */ { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0100, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0100, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0101, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WRD for Global SKU */ { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0101, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0101, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0106, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WRD for China SKU */ { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0106, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0106, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0111, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WWD for SA */ { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0111, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0111, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0112, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WWD for EU */ { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0112, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0112, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0113, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WWD for NA */ { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0113, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0113, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0115, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WWD for China EDU */ { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0115, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0115, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0116, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WWD for Golbal EDU */ { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0116, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0116, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010a, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WRD for WWAN Ready */ { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010a, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010a, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010b, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WWD for WWAN Ready */ { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010b, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010b, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010c, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WRD for WWAN Ready */ { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010c, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010c, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010d, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WWD for WWAN Ready */ { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010d, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010d, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(OPPO_VENDOR_ID, OPPO_PRODUCT_R11, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, 
SIERRA_PRODUCT_EM9191, 0xff, 0xff, 0x30) },
	{ USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9191, 0xff, 0xff, 0x40) },
	{ USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9191, 0xff, 0, 0) },
	{ USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9291, 0xff, 0xff, 0x30) },
	{ USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9291, 0xff, 0xff, 0x40) },
	{ USB_DEVICE_AND_INTERFACE_INFO(UNISOC_VENDOR_ID, TOZED_PRODUCT_LT70C, 0xff, 0, 0) },
	{ USB_DEVICE_AND_INTERFACE_INFO(UNISOC_VENDOR_ID, UNISOC_PRODUCT_UIS7720, 0xff, 0, 0) },
	{ USB_DEVICE_AND_INTERFACE_INFO(UNISOC_VENDOR_ID, LUAT_PRODUCT_AIR720U, 0xff, 0, 0) },
	{ USB_DEVICE_INTERFACE_CLASS(0x1bbb, 0x0530, 0xff),	/* TCL IK512 MBIM */
	  .driver_info = NCTRL(1) },
	{ USB_DEVICE_INTERFACE_CLASS(0x1bbb, 0x0640, 0xff),	/* TCL IK512 ECM */
	  .driver_info = NCTRL(3) },
	{ USB_DEVICE_INTERFACE_CLASS(0x2949, 0x8700, 0xff) },	/* Neoway N723-EA */
	{ } /* Terminating entry */
};
MODULE_DEVICE_TABLE(usb, option_ids);

/* The card has three separate interfaces, which the serial driver
 * recognizes separately, thus num_port=1.
 */

static struct usb_serial_driver option_1port_device = {
	.driver = {
		.name = "option1",
	},
	.description       = "GSM modem (1-port)",
	.id_table          = option_ids,
	.num_ports         = 1,
	.probe             = option_probe,
	.open              = usb_wwan_open,
	.close             = usb_wwan_close,
	.dtr_rts           = usb_wwan_dtr_rts,
	.write             = usb_wwan_write,
	.write_room        = usb_wwan_write_room,
	.chars_in_buffer   = usb_wwan_chars_in_buffer,
	.tiocmget          = usb_wwan_tiocmget,
	.tiocmset          = usb_wwan_tiocmset,
	.attach            = option_attach,
	.release           = option_release,
	.port_probe        = usb_wwan_port_probe,
	.port_remove       = usb_wwan_port_remove,
	.read_int_callback = option_instat_callback,
#ifdef CONFIG_PM
	.suspend           = usb_wwan_suspend,
	.resume            = usb_wwan_resume,
#endif
};

static struct usb_serial_driver * const serial_drivers[] = {
	&option_1port_device, NULL
};

module_usb_serial_driver(serial_drivers, option_ids);

static bool iface_is_reserved(unsigned long device_flags, u8 ifnum)
{
	if (ifnum > FLAG_IFNUM_MAX)
		return false;

	return device_flags & RSVD(ifnum);
}

static int option_probe(struct usb_serial *serial,
			const struct usb_device_id *id)
{
	struct usb_interface_descriptor *iface_desc =
				&serial->interface->cur_altsetting->desc;
	unsigned long device_flags = id->driver_info;

	/* Never bind to the CD-Rom emulation interface */
	if (iface_desc->bInterfaceClass == USB_CLASS_MASS_STORAGE)
		return -ENODEV;

	/*
	 * Don't bind reserved interfaces (like network ones) which often have
	 * the same class/subclass/protocol as the serial interfaces.  Look at
	 * the Windows driver .INF files for reserved interface numbers.
	 */
	if (iface_is_reserved(device_flags, iface_desc->bInterfaceNumber))
		return -ENODEV;

	/*
	 * Allow matching on bNumEndpoints for devices whose interface numbers
	 * can change (e.g. Quectel EP06).
	 */
	if (device_flags & NUMEP2 && iface_desc->bNumEndpoints != 2)
		return -ENODEV;

	/* Store the device flags so we can use them during attach. */
	usb_set_serial_data(serial, (void *)device_flags);

	return 0;
}

static bool iface_no_modem_control(unsigned long device_flags, u8 ifnum)
{
	if (ifnum > FLAG_IFNUM_MAX)
		return false;

	return device_flags & NCTRL(ifnum);
}

static int option_attach(struct usb_serial *serial)
{
	struct usb_interface_descriptor *iface_desc;
	struct usb_wwan_intf_private *data;
	unsigned long device_flags;

	data = kzalloc(sizeof(struct usb_wwan_intf_private), GFP_KERNEL);
	if (!data)
		return -ENOMEM;

	/* Retrieve device flags stored at probe. */
*/ device_flags = (unsigned long)usb_get_serial_data(serial); iface_desc = &serial->interface->cur_altsetting->desc; if (!iface_no_modem_control(device_flags, iface_desc->bInterfaceNumber)) data->use_send_setup = 1; if (device_flags & ZLP) data->use_zlp = 1; spin_lock_init(&data->susp_lock); usb_set_serial_data(serial, data); return 0; } static void option_release(struct usb_serial *serial) { struct usb_wwan_intf_private *intfdata = usb_get_serial_data(serial); kfree(intfdata); } static void option_instat_callback(struct urb *urb) { int err; int status = urb->status; struct usb_serial_port *port = urb->context; struct device *dev = &port->dev; struct usb_wwan_port_private *portdata = usb_get_serial_port_data(port); dev_dbg(dev, "%s: urb %p port %p has data %p\n", __func__, urb, port, portdata); if (status == 0) { struct usb_ctrlrequest *req_pkt = urb->transfer_buffer; if (!req_pkt) { dev_dbg(dev, "%s: NULL req_pkt\n", __func__); return; } if ((req_pkt->bRequestType == 0xA1) && (req_pkt->bRequest == 0x20)) { int old_dcd_state; unsigned char signals = *((unsigned char *) urb->transfer_buffer + sizeof(struct usb_ctrlrequest)); dev_dbg(dev, "%s: signal x%x\n", __func__, signals); old_dcd_state = portdata->dcd_state; portdata->cts_state = 1; portdata->dcd_state = ((signals & 0x01) ? 1 : 0); portdata->dsr_state = ((signals & 0x02) ? 1 : 0); portdata->ri_state = ((signals & 0x08) ? 1 : 0); if (old_dcd_state && !portdata->dcd_state) tty_port_tty_hangup(&port->port, true); } else { dev_dbg(dev, "%s: type %x req %x\n", __func__, req_pkt->bRequestType, req_pkt->bRequest); } } else if (status == -ENOENT || status == -ESHUTDOWN) { dev_dbg(dev, "%s: urb stopped: %d\n", __func__, status); } else dev_dbg(dev, "%s: error %d\n", __func__, status); /* Resubmit urb so we continue receiving IRQ data */ if (status != -ESHUTDOWN && status != -ENOENT) { usb_mark_last_busy(port->serial->dev); err = usb_submit_urb(urb, GFP_ATOMIC); if (err) dev_dbg(dev, "%s: resubmit intr urb failed. (%d)\n", __func__, err); } } MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL v2"); |
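/*
 * Hedged illustration, not part of the original option_ids[] table: how a
 * hypothetical composite modem (placeholder VID 0x1234 / PID 0x5678) with a
 * network function on interface 4 and a port without modem-control support
 * on interface 0 could be described.  RSVD(4) makes option_probe() return
 * -ENODEV for that interface, and NCTRL(0) makes option_attach() leave
 * use_send_setup clear for interface 0.
 */
static const struct usb_device_id option_example_ids[] __maybe_unused = {
	{ USB_DEVICE_INTERFACE_CLASS(0x1234, 0x5678, 0xff),
	  .driver_info = RSVD(4) | NCTRL(0) },
	{ } /* Terminating entry */
};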
| 34 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 | /* SPDX-License-Identifier: GPL-2.0 */ /* * NOTE: * * This header has combined a lot of unrelated to each other stuff. * The process of splitting its content is in progress while keeping * backward compatibility. That's why it's highly recommended NOT to * include this header inside another header file, especially under * generic or architectural include/ directory. */ #ifndef _LINUX_KERNEL_H #define _LINUX_KERNEL_H #include <linux/stdarg.h> #include <linux/align.h> #include <linux/array_size.h> #include <linux/limits.h> #include <linux/linkage.h> #include <linux/stddef.h> #include <linux/types.h> #include <linux/compiler.h> #include <linux/container_of.h> #include <linux/bitops.h> #include <linux/kstrtox.h> #include <linux/log2.h> #include <linux/math.h> #include <linux/minmax.h> #include <linux/typecheck.h> #include <linux/panic.h> #include <linux/printk.h> #include <linux/build_bug.h> #include <linux/sprintf.h> #include <linux/static_call_types.h> #include <linux/trace_printk.h> #include <linux/util_macros.h> #include <linux/wordpart.h> #include <asm/byteorder.h> #include <uapi/linux/kernel.h> struct completion; struct user; #ifdef CONFIG_PREEMPT_VOLUNTARY_BUILD extern int __cond_resched(void); # define might_resched() __cond_resched() #elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) extern int __cond_resched(void); DECLARE_STATIC_CALL(might_resched, __cond_resched); static __always_inline void might_resched(void) { static_call_mod(might_resched)(); } #elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) extern int dynamic_might_resched(void); # define might_resched() dynamic_might_resched() #else # define might_resched() do { } while (0) #endif /* CONFIG_PREEMPT_* */ #ifdef CONFIG_DEBUG_ATOMIC_SLEEP extern void __might_resched(const char *file, int line, unsigned int offsets); extern void __might_sleep(const char *file, int line); extern void __cant_sleep(const char *file, int line, int preempt_offset); extern void __cant_migrate(const char *file, int line); /** * might_sleep - annotation for functions that can sleep * * this macro will print a stack trace if it is executed in an atomic * context (spinlock, irq-handler, ...). Additional sections where blocking is * not allowed can be annotated with non_block_start() and non_block_end() * pairs. * * This is a useful debugging help to be able to catch problems early and not * be bitten later when the calling function happens to sleep when it is not * supposed to. 
*/ # define might_sleep() \ do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) /** * cant_sleep - annotation for functions that cannot sleep * * this macro will print a stack trace if it is executed with preemption enabled */ # define cant_sleep() \ do { __cant_sleep(__FILE__, __LINE__, 0); } while (0) # define sched_annotate_sleep() (current->task_state_change = 0) /** * cant_migrate - annotation for functions that cannot migrate * * Will print a stack trace if executed in code which is migratable */ # define cant_migrate() \ do { \ if (IS_ENABLED(CONFIG_SMP)) \ __cant_migrate(__FILE__, __LINE__); \ } while (0) /** * non_block_start - annotate the start of section where sleeping is prohibited * * This is on behalf of the oom reaper, specifically when it is calling the mmu * notifiers. The problem is that if the notifier were to block on, for example, * mutex_lock() and if the process which holds that mutex were to perform a * sleeping memory allocation, the oom reaper is now blocked on completion of * that memory allocation. Other blocking calls like wait_event() pose similar * issues. */ # define non_block_start() (current->non_block_count++) /** * non_block_end - annotate the end of section where sleeping is prohibited * * Closes a section opened by non_block_start(). */ # define non_block_end() WARN_ON(current->non_block_count-- == 0) #else static inline void __might_resched(const char *file, int line, unsigned int offsets) { } static inline void __might_sleep(const char *file, int line) { } # define might_sleep() do { might_resched(); } while (0) # define cant_sleep() do { } while (0) # define cant_migrate() do { } while (0) # define sched_annotate_sleep() do { } while (0) # define non_block_start() do { } while (0) # define non_block_end() do { } while (0) #endif #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0) #if defined(CONFIG_MMU) && \ (defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)) #define might_fault() __might_fault(__FILE__, __LINE__) void __might_fault(const char *file, int line); #else static inline void might_fault(void) { } #endif void do_exit(long error_code) __noreturn; extern int core_kernel_text(unsigned long addr); extern int __kernel_text_address(unsigned long addr); extern int kernel_text_address(unsigned long addr); extern int func_ptr_is_kernel_text(void *ptr); extern void bust_spinlocks(int yes); extern int root_mountflags; extern bool early_boot_irqs_disabled; /** * enum system_states - Values used for system_state. * * @SYSTEM_BOOTING: %0, no init needed * @SYSTEM_SCHEDULING: system is ready for scheduling; OK to use RCU * @SYSTEM_FREEING_INITMEM: system is freeing all of initmem; almost running * @SYSTEM_RUNNING: system is up and running * @SYSTEM_HALT: system entered clean system halt state * @SYSTEM_POWER_OFF: system entered shutdown/clean power off state * @SYSTEM_RESTART: system entered emergency power off or normal restart * @SYSTEM_SUSPEND: system entered suspend or hibernate state * * Note: * Ordering of the states must not be changed * as code checks for <, <=, >, >= STATE. */ enum system_states { SYSTEM_BOOTING, SYSTEM_SCHEDULING, SYSTEM_FREEING_INITMEM, SYSTEM_RUNNING, SYSTEM_HALT, SYSTEM_POWER_OFF, SYSTEM_RESTART, SYSTEM_SUSPEND, }; extern enum system_states system_state; /* Rebuild everything on CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_DYNAMIC_FTRACE # define REBUILD_DUE_TO_DYNAMIC_FTRACE #endif #endif |
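/*
 * Hedged usage sketch, not part of this header: a hypothetical helper
 * annotated with the primitives declared above.  With
 * CONFIG_DEBUG_ATOMIC_SLEEP enabled, calling it with can_sleep == true
 * from atomic context (under a spinlock, in an irq handler, ...) prints
 * a stack trace; otherwise the annotations compile down to
 * might_resched() or to nothing, exactly as defined above.
 */
static inline void example_poll_or_sleep(bool can_sleep)
{
	might_sleep_if(can_sleep);	/* debug-only check, no side effects */
	if (can_sleep)
		might_resched();	/* voluntary preemption point */
}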
| 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 
913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 
1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 
2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 | // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 tunneling device * Linux INET6 implementation * * Authors: * Ville Nuorvala <vnuorval@tcs.hut.fi> * Yasuyuki Kozakai <kozakai@linux-ipv6.org> * * Based on: * linux/net/ipv6/sit.c and linux/net/ipv4/ipip.c * * RFC 2473 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/sockios.h> #include <linux/icmp.h> #include <linux/if.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/icmpv6.h> #include <linux/init.h> #include <linux/route.h> #include <linux/rtnetlink.h> #include <linux/netfilter_ipv6.h> #include <linux/slab.h> #include <linux/hash.h> #include <linux/etherdevice.h> #include <linux/uaccess.h> #include <linux/atomic.h> #include <net/icmp.h> #include <net/ip.h> #include <net/ip_tunnels.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/ip6_tunnel.h> #include <net/xfrm.h> #include <net/dsfield.h> #include <net/inet_ecn.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/netdev_lock.h> #include <net/dst_metadata.h> #include <net/inet_dscp.h> MODULE_AUTHOR("Ville Nuorvala"); MODULE_DESCRIPTION("IPv6 tunneling device"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("ip6tnl"); MODULE_ALIAS_NETDEV("ip6tnl0"); #define IP6_TUNNEL_HASH_SIZE_SHIFT 5 #define IP6_TUNNEL_HASH_SIZE (1 << IP6_TUNNEL_HASH_SIZE_SHIFT) static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2) { u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2); return hash_32(hash, IP6_TUNNEL_HASH_SIZE_SHIFT); } static int ip6_tnl_dev_init(struct net_device *dev); static void ip6_tnl_dev_setup(struct net_device *dev); static struct rtnl_link_ops ip6_link_ops __read_mostly; static unsigned int ip6_tnl_net_id __read_mostly; struct ip6_tnl_net { /* the IPv6 tunnel fallback device */ struct net_device *fb_tnl_dev; /* lists for storing tunnels in use */ struct ip6_tnl __rcu *tnls_r_l[IP6_TUNNEL_HASH_SIZE]; struct ip6_tnl __rcu *tnls_wc[1]; struct ip6_tnl __rcu **tnls[2]; struct ip6_tnl __rcu *collect_md_tun; }; static inline int ip6_tnl_mpls_supported(void) { return IS_ENABLED(CONFIG_MPLS); } /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses * @net: network namespace * @link: ifindex of underlying interface * @remote: the address of the tunnel exit-point * @local: the address of the tunnel entry-point * * Return: * tunnel matching given end-points if found, * else fallback tunnel if its device is up, * else %NULL **/ static struct ip6_tnl * ip6_tnl_lookup(struct net *net, int link, const struct in6_addr *remote, const struct in6_addr *local) { unsigned int hash = HASH(remote, local); struct ip6_tnl *t, *cand = NULL; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct in6_addr any; 
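	/*
	 * Look up in decreasing order of specificity: exact (remote, local)
	 * match first, then tunnels with a wildcard remote address, then
	 * tunnels with a wildcard local address.  A tunnel bound to the
	 * matching underlying link (t->parms.link) is returned immediately;
	 * otherwise the first address match is kept as a candidate.  If no
	 * candidate is found, fall back to the collect_md tunnel and finally
	 * to the fallback device, provided they are up.
	 */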
for_each_ip_tunnel_rcu(t, ip6n->tnls_r_l[hash]) { if (!ipv6_addr_equal(local, &t->parms.laddr) || !ipv6_addr_equal(remote, &t->parms.raddr) || !(t->dev->flags & IFF_UP)) continue; if (link == t->parms.link) return t; else cand = t; } memset(&any, 0, sizeof(any)); hash = HASH(&any, local); for_each_ip_tunnel_rcu(t, ip6n->tnls_r_l[hash]) { if (!ipv6_addr_equal(local, &t->parms.laddr) || !ipv6_addr_any(&t->parms.raddr) || !(t->dev->flags & IFF_UP)) continue; if (link == t->parms.link) return t; else if (!cand) cand = t; } hash = HASH(remote, &any); for_each_ip_tunnel_rcu(t, ip6n->tnls_r_l[hash]) { if (!ipv6_addr_equal(remote, &t->parms.raddr) || !ipv6_addr_any(&t->parms.laddr) || !(t->dev->flags & IFF_UP)) continue; if (link == t->parms.link) return t; else if (!cand) cand = t; } if (cand) return cand; t = rcu_dereference(ip6n->collect_md_tun); if (t && t->dev->flags & IFF_UP) return t; t = rcu_dereference(ip6n->tnls_wc[0]); if (t && (t->dev->flags & IFF_UP)) return t; return NULL; } /** * ip6_tnl_bucket - get head of list matching given tunnel parameters * @ip6n: the private data for ip6_vti in the netns * @p: parameters containing tunnel end-points * * Description: * ip6_tnl_bucket() returns the head of the list matching the * &struct in6_addr entries laddr and raddr in @p. * * Return: head of IPv6 tunnel list **/ static struct ip6_tnl __rcu ** ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct __ip6_tnl_parm *p) { const struct in6_addr *remote = &p->raddr; const struct in6_addr *local = &p->laddr; unsigned int h = 0; int prio = 0; if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) { prio = 1; h = HASH(remote, local); } return &ip6n->tnls[prio][h]; } /** * ip6_tnl_link - add tunnel to hash table * @ip6n: the private data for ip6_vti in the netns * @t: tunnel to be added **/ static void ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms); if (t->parms.collect_md) rcu_assign_pointer(ip6n->collect_md_tun, t); rcu_assign_pointer(t->next , rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } /** * ip6_tnl_unlink - remove tunnel from hash table * @ip6n: the private data for ip6_vti in the netns * @t: tunnel to be removed **/ static void ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp; struct ip6_tnl *iter; if (t->parms.collect_md) rcu_assign_pointer(ip6n->collect_md_tun, NULL); for (tp = ip6_tnl_bucket(ip6n, &t->parms); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { if (t == iter) { rcu_assign_pointer(*tp, t->next); break; } } } static void ip6_dev_free(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); gro_cells_destroy(&t->gro_cells); dst_cache_destroy(&t->dst_cache); } static int ip6_tnl_create2(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct ip6_tnl_net *ip6n = net_generic(t->net, ip6_tnl_net_id); int err; dev->rtnl_link_ops = &ip6_link_ops; err = register_netdevice(dev); if (err < 0) goto out; strcpy(t->parms.name, dev->name); ip6_tnl_link(ip6n, t); return 0; out: return err; } /** * ip6_tnl_create - create a new tunnel * @net: network namespace * @p: tunnel parameters * * Description: * Create tunnel matching given parameters. 
* * Return: * created tunnel or error pointer **/ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) { struct net_device *dev; struct ip6_tnl *t; char name[IFNAMSIZ]; int err = -E2BIG; if (p->name[0]) { if (!dev_valid_name(p->name)) goto failed; strscpy(name, p->name, IFNAMSIZ); } else { sprintf(name, "ip6tnl%%d"); } err = -ENOMEM; dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, ip6_tnl_dev_setup); if (!dev) goto failed; dev_net_set(dev, net); t = netdev_priv(dev); t->parms = *p; t->net = dev_net(dev); err = ip6_tnl_create2(dev); if (err < 0) goto failed_free; return t; failed_free: free_netdev(dev); failed: return ERR_PTR(err); } /** * ip6_tnl_locate - find or create tunnel matching given parameters * @net: network namespace * @p: tunnel parameters * @create: != 0 if allowed to create new tunnel if no match found * * Description: * ip6_tnl_locate() first tries to locate an existing tunnel * based on @parms. If this is unsuccessful, but @create is set a new * tunnel device is created and registered for use. * * Return: * matching tunnel or error pointer **/ static struct ip6_tnl *ip6_tnl_locate(struct net *net, struct __ip6_tnl_parm *p, int create) { const struct in6_addr *remote = &p->raddr; const struct in6_addr *local = &p->laddr; struct ip6_tnl __rcu **tp; struct ip6_tnl *t; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); for (tp = ip6_tnl_bucket(ip6n, p); (t = rtnl_dereference(*tp)) != NULL; tp = &t->next) { if (ipv6_addr_equal(local, &t->parms.laddr) && ipv6_addr_equal(remote, &t->parms.raddr) && p->link == t->parms.link) { if (create) return ERR_PTR(-EEXIST); return t; } } if (!create) return ERR_PTR(-ENODEV); return ip6_tnl_create(net, p); } /** * ip6_tnl_dev_uninit - tunnel device uninitializer * @dev: the device to be destroyed * * Description: * ip6_tnl_dev_uninit() removes tunnel from its list **/ static void ip6_tnl_dev_uninit(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); if (dev == ip6n->fb_tnl_dev) RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL); else ip6_tnl_unlink(ip6n, t); dst_cache_reset(&t->dst_cache); netdev_put(dev, &t->dev_tracker); } /** * ip6_tnl_parse_tlv_enc_lim - handle encapsulation limit option * @skb: received socket buffer * @raw: the ICMPv6 error message data * * Return: * 0 if none was found, * else index to encapsulation limit **/ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) { const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)raw; unsigned int nhoff = raw - skb->data; unsigned int off = nhoff + sizeof(*ipv6h); u8 nexthdr = ipv6h->nexthdr; while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) { struct ipv6_opt_hdr *hdr; u16 optlen; if (!pskb_may_pull(skb, off + sizeof(*hdr))) break; hdr = (struct ipv6_opt_hdr *)(skb->data + off); if (nexthdr == NEXTHDR_FRAGMENT) { optlen = 8; } else if (nexthdr == NEXTHDR_AUTH) { optlen = ipv6_authlen(hdr); } else { optlen = ipv6_optlen(hdr); } if (!pskb_may_pull(skb, off + optlen)) break; hdr = (struct ipv6_opt_hdr *)(skb->data + off); if (nexthdr == NEXTHDR_FRAGMENT) { struct frag_hdr *frag_hdr = (struct frag_hdr *)hdr; if (frag_hdr->frag_off) break; } if (nexthdr == NEXTHDR_DEST) { u16 i = 2; while (1) { struct ipv6_tlv_tnl_enc_lim *tel; /* No more room for encapsulation limit */ if (i + sizeof(*tel) > optlen) break; tel = (struct ipv6_tlv_tnl_enc_lim *)(skb->data + off + i); /* return index of option if found and valid */ if (tel->type == 
IPV6_TLV_TNL_ENCAP_LIMIT && tel->length == 1) return i + off - nhoff; /* else jump to next option */ if (tel->type) i += tel->length + 2; else i++; } } nexthdr = hdr->nexthdr; off += optlen; } return 0; } EXPORT_SYMBOL(ip6_tnl_parse_tlv_enc_lim); /* ip6_tnl_err() should handle errors in the tunnel according to the * specifications in RFC 2473. */ static int ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, u8 *type, u8 *code, int *msg, __u32 *info, int offset) { const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)skb->data; struct net *net = dev_net(skb->dev); u8 rel_type = ICMPV6_DEST_UNREACH; u8 rel_code = ICMPV6_ADDR_UNREACH; __u32 rel_info = 0; struct ip6_tnl *t; int err = -ENOENT; int rel_msg = 0; u8 tproto; __u16 len; /* If the packet doesn't contain the original IPv6 header we are in trouble since we might need the source address for further processing of the error. */ rcu_read_lock(); t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->daddr, &ipv6h->saddr); if (!t) goto out; tproto = READ_ONCE(t->parms.proto); if (tproto != ipproto && tproto != 0) goto out; err = 0; switch (*type) { case ICMPV6_DEST_UNREACH: net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", t->parms.name); rel_msg = 1; break; case ICMPV6_TIME_EXCEED: if ((*code) == ICMPV6_EXC_HOPLIMIT) { net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", t->parms.name); rel_msg = 1; } break; case ICMPV6_PARAMPROB: { struct ipv6_tlv_tnl_enc_lim *tel; __u32 teli; teli = 0; if ((*code) == ICMPV6_HDR_FIELD) teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data); if (teli && teli == *info - 2) { tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; if (tel->encap_limit == 0) { net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", t->parms.name); rel_msg = 1; } } else { net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", t->parms.name); } break; } case ICMPV6_PKT_TOOBIG: { __u32 mtu; ip6_update_pmtu(skb, net, htonl(*info), 0, 0, sock_net_uid(net, NULL)); mtu = *info - offset; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; len = sizeof(*ipv6h) + ntohs(ipv6h->payload_len); if (len > mtu) { rel_type = ICMPV6_PKT_TOOBIG; rel_code = 0; rel_info = mtu; rel_msg = 1; } break; } case NDISC_REDIRECT: ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); break; } *type = rel_type; *code = rel_code; *info = rel_info; *msg = rel_msg; out: rcu_read_unlock(); return err; } static int ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); const struct iphdr *eiph; struct sk_buff *skb2; int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; struct rtable *rt; struct flowi4 fl4; err = ip6_tnl_err(skb, IPPROTO_IPIP, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); if (err < 0) return err; if (rel_msg == 0) return 0; switch (rel_type) { case ICMPV6_DEST_UNREACH: if (rel_code != ICMPV6_ADDR_UNREACH) return 0; rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_HOST_UNREACH; break; case ICMPV6_PKT_TOOBIG: if (rel_code != 0) return 0; rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_FRAG_NEEDED; break; default: return 0; } if (!pskb_may_pull(skb, offset + sizeof(struct iphdr))) return 0; skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) return 0; /* Remove debris left by IPv6 stack. 
*/ memset(IPCB(skb2), 0, sizeof(*IPCB(skb2))); skb_dst_drop(skb2); skb_pull(skb2, offset); skb_reset_network_header(skb2); eiph = ip_hdr(skb2); if (eiph->version != 4 || eiph->ihl < 5) goto out; /* Try to guess incoming interface */ rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->saddr, 0, 0, 0, IPPROTO_IPIP, eiph->tos & INET_DSCP_MASK, 0); if (IS_ERR(rt)) goto out; skb2->dev = rt->dst.dev; ip_rt_put(rt); /* route "incoming" packet */ if (rt->rt_flags & RTCF_LOCAL) { rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->daddr, eiph->saddr, 0, 0, IPPROTO_IPIP, eiph->tos & INET_DSCP_MASK, 0); if (IS_ERR(rt) || rt->dst.dev->type != ARPHRD_TUNNEL6) { if (!IS_ERR(rt)) ip_rt_put(rt); goto out; } skb_dst_set(skb2, &rt->dst); } else { if (ip_route_input(skb2, eiph->daddr, eiph->saddr, ip4h_dscp(eiph), skb2->dev) || skb_dst_dev(skb2)->type != ARPHRD_TUNNEL6) goto out; } /* change mtu on this route */ if (rel_type == ICMP_DEST_UNREACH && rel_code == ICMP_FRAG_NEEDED) { if (rel_info > dst6_mtu(skb_dst(skb2))) goto out; skb_dst_update_pmtu_no_confirm(skb2, rel_info); } icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); out: kfree_skb(skb2); return 0; } static int ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; err = ip6_tnl_err(skb, IPPROTO_IPV6, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); if (err < 0) return err; if (rel_msg && pskb_may_pull(skb, offset + sizeof(struct ipv6hdr))) { struct rt6_info *rt; struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) return 0; skb_dst_drop(skb2); skb_pull(skb2, offset); skb_reset_network_header(skb2); /* Try to guess incoming interface */ rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, skb2, 0); if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; icmpv6_send(skb2, rel_type, rel_code, rel_info); ip6_rt_put(rt); kfree_skb(skb2); } return 0; } static int mplsip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; err = ip6_tnl_err(skb, IPPROTO_MPLS, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); return err; } static int ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { __u8 dsfield = ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK; if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, dsfield); return IP6_ECN_decapsulate(ipv6h, skb); } static int ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) ipv6_copy_dscp(ipv6_get_dsfield(ipv6h), ipv6_hdr(skb)); return IP6_ECN_decapsulate(ipv6h, skb); } static inline int mplsip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { /* ECN is not supported in AF_MPLS */ return 0; } __u32 ip6_tnl_get_cap(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ltype = ipv6_addr_type(laddr); int rtype = ipv6_addr_type(raddr); __u32 flags = 0; if (ltype == IPV6_ADDR_ANY || rtype == IPV6_ADDR_ANY) { flags = IP6_TNL_F_CAP_PER_PACKET; } else if (ltype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && rtype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && !((ltype|rtype) & IPV6_ADDR_LOOPBACK) 
&& (!((ltype|rtype) & IPV6_ADDR_LINKLOCAL) || p->link)) { if (ltype&IPV6_ADDR_UNICAST) flags |= IP6_TNL_F_CAP_XMIT; if (rtype&IPV6_ADDR_UNICAST) flags |= IP6_TNL_F_CAP_RCV; } return flags; } EXPORT_SYMBOL(ip6_tnl_get_cap); /* called with rcu_read_lock() */ int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ret = 0; struct net *net = t->net; if ((p->flags & IP6_TNL_F_CAP_RCV) || ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_RCV))) { struct net_device *ldev = NULL; if (p->link) ldev = dev_get_by_index_rcu(net, p->link); if ((ipv6_addr_is_multicast(laddr) || likely(ipv6_chk_addr_and_flags(net, laddr, ldev, false, 0, IFA_F_TENTATIVE))) && ((p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) || likely(!ipv6_chk_addr_and_flags(net, raddr, ldev, true, 0, IFA_F_TENTATIVE)))) ret = 1; } return ret; } EXPORT_SYMBOL_GPL(ip6_tnl_rcv_ctl); static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb), bool log_ecn_err) { const struct ipv6hdr *ipv6h; int nh, err; if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) != test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) { DEV_STATS_INC(tunnel->dev, rx_crc_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) { if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) || (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { DEV_STATS_INC(tunnel->dev, rx_fifo_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } tunnel->i_seqno = ntohl(tpi->seq) + 1; } skb->protocol = tpi->proto; /* Warning: All skb pointers will be invalidated! */ if (tunnel->dev->type == ARPHRD_ETHER) { if (!pskb_may_pull(skb, ETH_HLEN)) { DEV_STATS_INC(tunnel->dev, rx_length_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } skb->protocol = eth_type_trans(skb, tunnel->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } else { skb->dev = tunnel->dev; skb_reset_mac_header(skb); } /* Save offset of outer header relative to skb->head, * because we are going to reset the network header to the inner header * and might change skb->head. */ nh = skb_network_header(skb) - skb->head; skb_reset_network_header(skb); if (skb_vlan_inet_prepare(skb, true)) { DEV_STATS_INC(tunnel->dev, rx_length_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } /* Get the outer header. 
*/ ipv6h = (struct ipv6hdr *)(skb->head + nh); memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); __skb_tunnel_rx(skb, tunnel->dev, tunnel->net); err = dscp_ecn_decapsulate(tunnel, ipv6h, skb); if (unlikely(err)) { if (log_ecn_err) net_info_ratelimited("non-ECT from %pI6 with DS=%#x\n", &ipv6h->saddr, ipv6_get_dsfield(ipv6h)); if (err > 1) { DEV_STATS_INC(tunnel->dev, rx_frame_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } } dev_sw_netstats_rx_add(tunnel->dev, skb->len); skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); if (tun_dst) skb_dst_set(skb, (struct dst_entry *)tun_dst); gro_cells_receive(&tunnel->gro_cells, skb); return 0; drop: if (tun_dst) dst_release((struct dst_entry *)tun_dst); kfree_skb(skb); return 0; } int ip6_tnl_rcv(struct ip6_tnl *t, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_err) { int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb); dscp_ecn_decapsulate = ip6ip6_dscp_ecn_decapsulate; if (tpi->proto == htons(ETH_P_IP)) dscp_ecn_decapsulate = ip4ip6_dscp_ecn_decapsulate; return __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_err); } EXPORT_SYMBOL(ip6_tnl_rcv); static const struct tnl_ptk_info tpi_v6 = { /* no tunnel info required for ipxip6. */ .proto = htons(ETH_P_IPV6), }; static const struct tnl_ptk_info tpi_v4 = { /* no tunnel info required for ipxip6. */ .proto = htons(ETH_P_IP), }; static const struct tnl_ptk_info tpi_mpls = { /* no tunnel info required for mplsip6. */ .proto = htons(ETH_P_MPLS_UC), }; static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, const struct tnl_ptk_info *tpi, int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb)) { struct ip6_tnl *t; const struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct metadata_dst *tun_dst = NULL; int ret = -1; rcu_read_lock(); t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->saddr, &ipv6h->daddr); if (t) { u8 tproto = READ_ONCE(t->parms.proto); if (tproto != ipproto && tproto != 0) goto drop; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; ipv6h = ipv6_hdr(skb); if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) goto drop; if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; if (t->parms.collect_md) { IP_TUNNEL_DECLARE_FLAGS(flags) = { }; tun_dst = ipv6_tun_rx_dst(skb, flags, 0, 0); if (!tun_dst) goto drop; } ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_error); } rcu_read_unlock(); return ret; drop: rcu_read_unlock(); kfree_skb(skb); return 0; } static int ip4ip6_rcv(struct sk_buff *skb) { return ipxip6_rcv(skb, IPPROTO_IPIP, &tpi_v4, ip4ip6_dscp_ecn_decapsulate); } static int ip6ip6_rcv(struct sk_buff *skb) { return ipxip6_rcv(skb, IPPROTO_IPV6, &tpi_v6, ip6ip6_dscp_ecn_decapsulate); } static int mplsip6_rcv(struct sk_buff *skb) { return ipxip6_rcv(skb, IPPROTO_MPLS, &tpi_mpls, mplsip6_dscp_ecn_decapsulate); } struct ipv6_tel_txoption { struct ipv6_txoptions ops; __u8 dst_opt[8]; }; static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit) { memset(opt, 0, sizeof(struct ipv6_tel_txoption)); opt->dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT; opt->dst_opt[3] = 1; opt->dst_opt[4] = encap_limit; opt->dst_opt[5] = IPV6_TLV_PADN; opt->dst_opt[6] = 1; opt->ops.dst1opt = (struct ipv6_opt_hdr *) opt->dst_opt; opt->ops.opt_nflen = 8; } /** * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own * @t: the outgoing tunnel device * @hdr: 
IPv6 header from the incoming packet * * Description: * Avoid trivial tunneling loop by checking that tunnel exit-point * doesn't match source of incoming packet. * * Return: * 1 if conflict, * 0 else **/ static inline bool ip6_tnl_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr) { return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); } int ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ret = 0; struct net *net = t->net; if (t->parms.collect_md) return 1; if ((p->flags & IP6_TNL_F_CAP_XMIT) || ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_XMIT))) { struct net_device *ldev = NULL; rcu_read_lock(); if (p->link) ldev = dev_get_by_index_rcu(net, p->link); if (unlikely(!ipv6_chk_addr_and_flags(net, laddr, ldev, false, 0, IFA_F_TENTATIVE))) pr_warn_ratelimited("%s xmit: Local address not yet configured!\n", p->name); else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) && !ipv6_addr_is_multicast(raddr) && unlikely(ipv6_chk_addr_and_flags(net, raddr, ldev, true, 0, IFA_F_TENTATIVE))) pr_warn_ratelimited("%s xmit: Routing loop! Remote address found on this node!\n", p->name); else ret = 1; rcu_read_unlock(); } return ret; } EXPORT_SYMBOL_GPL(ip6_tnl_xmit_ctl); /** * ip6_tnl_xmit - encapsulate packet and send * @skb: the outgoing socket buffer * @dev: the outgoing tunnel device * @dsfield: dscp code for outer header * @fl6: flow of tunneled packet * @encap_limit: encapsulation limit * @pmtu: Path MTU is stored if packet is too big * @proto: next header value * * Description: * Build new header and do some sanity checks on the packet before sending * it. * * Return: * 0 on success * -1 fail * %-EMSGSIZE message too big. return mtu in this case. **/ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct flowi6 *fl6, int encap_limit, __u32 *pmtu, __u8 proto) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ipv6hdr *ipv6h; struct ipv6_tel_txoption opt; struct dst_entry *dst = NULL, *ndst = NULL; struct net_device *tdev; int mtu; unsigned int eth_hlen = t->dev->type == ARPHRD_ETHER ? 
ETH_HLEN : 0; unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen; unsigned int max_headroom = psh_hlen; __be16 payload_protocol; bool use_cache = false; u8 hop_limit; int err = -1; payload_protocol = skb_protocol(skb, true); if (t->parms.collect_md) { hop_limit = skb_tunnel_info(skb)->key.ttl; goto route_lookup; } else { hop_limit = t->parms.hop_limit; } /* NBMA tunnel */ if (ipv6_addr_any(&t->parms.raddr)) { if (payload_protocol == htons(ETH_P_IPV6)) { struct in6_addr *addr6; struct neighbour *neigh; int addr_type; if (!skb_dst(skb)) goto tx_err_link_failure; neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr); if (!neigh) goto tx_err_link_failure; addr6 = (struct in6_addr *)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if (addr_type == IPV6_ADDR_ANY) addr6 = &ipv6_hdr(skb)->daddr; memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); neigh_release(neigh); } else if (payload_protocol == htons(ETH_P_IP)) { const struct rtable *rt = skb_rtable(skb); if (!rt) goto tx_err_link_failure; if (rt->rt_gw_family == AF_INET6) memcpy(&fl6->daddr, &rt->rt_gw6, sizeof(fl6->daddr)); } } else if (t->parms.proto != 0 && !(t->parms.flags & (IP6_TNL_F_USE_ORIG_TCLASS | IP6_TNL_F_USE_ORIG_FWMARK))) { /* enable the cache only if neither the outer protocol nor the * routing decision depends on the current inner header value */ use_cache = true; } if (use_cache) dst = dst_cache_get(&t->dst_cache); if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr)) goto tx_err_link_failure; if (!dst) { route_lookup: /* add dsfield to flowlabel for route lookup */ fl6->flowlabel = ip6_make_flowinfo(dsfield, fl6->flowlabel); dst = ip6_route_output(net, NULL, fl6); if (dst->error) goto tx_err_link_failure; dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; goto tx_err_link_failure; } if (t->parms.collect_md && ipv6_addr_any(&fl6->saddr) && ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, &fl6->daddr, 0, &fl6->saddr)) goto tx_err_link_failure; ndst = dst; } tdev = dst_dev(dst); if (tdev == dev) { DEV_STATS_INC(dev, collisions); net_warn_ratelimited("%s: Local routing loop detected!\n", t->parms.name); goto tx_err_dst_release; } mtu = dst6_mtu(dst) - eth_hlen - psh_hlen - t->tun_hlen; if (encap_limit >= 0) { max_headroom += 8; mtu -= 8; } mtu = max(mtu, skb->protocol == htons(ETH_P_IPV6) ? IPV6_MIN_MTU : IPV4_MIN_MTU); skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; goto tx_err_dst_release; } if (t->err_count > 0) { if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO)) { t->err_count--; dst_link_failure(skb); } else { t->err_count = 0; } } skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); /* * Okay, now see if we can stuff it in the buffer as-is. 
*/ max_headroom += LL_RESERVED_SPACE(tdev); if (skb_headroom(skb) < max_headroom || skb_shared(skb) || (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { struct sk_buff *new_skb; new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) goto tx_err_dst_release; if (skb->sk) skb_set_owner_w(new_skb, skb->sk); consume_skb(skb); skb = new_skb; } if (t->parms.collect_md) { if (t->encap.type != TUNNEL_ENCAP_NONE) goto tx_err_dst_release; } else { if (use_cache && ndst) dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); } skb_dst_set(skb, dst); if (hop_limit == 0) { if (payload_protocol == htons(ETH_P_IP)) hop_limit = ip_hdr(skb)->ttl; else if (payload_protocol == htons(ETH_P_IPV6)) hop_limit = ipv6_hdr(skb)->hop_limit; else hop_limit = ip6_dst_hoplimit(dst); } /* Calculate max headroom for all the headers and adjust * needed_headroom if necessary. */ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr) + dst->header_len + t->hlen; ip_tunnel_adj_headroom(dev, max_headroom); err = ip6_tnl_encap(skb, t, &proto, fl6); if (err) return err; if (encap_limit >= 0) { init_tel_txopt(&opt, encap_limit); proto = ipv6_push_frag_opts(skb, &opt.ops, proto); } skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); ip6_flow_hdr(ipv6h, dsfield, ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6)); ipv6h->hop_limit = hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; ipv6h->daddr = fl6->daddr; ip6tunnel_xmit(NULL, skb, dev, 0); return 0; tx_err_link_failure: DEV_STATS_INC(dev, tx_carrier_errors); dst_link_failure(skb); tx_err_dst_release: dst_release(dst); return err; } EXPORT_SYMBOL(ip6_tnl_xmit); static inline int ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, u8 protocol) { struct ip6_tnl *t = netdev_priv(dev); struct ipv6hdr *ipv6h; const struct iphdr *iph; int encap_limit = -1; __u16 offset; struct flowi6 fl6; __u8 dsfield, orig_dsfield; __u32 mtu; u8 tproto; int err; tproto = READ_ONCE(t->parms.proto); if (tproto != protocol && tproto != 0) return -1; if (t->parms.collect_md) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || ip_tunnel_info_af(tun_info) != AF_INET6)) return -1; key = &tun_info->key; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = protocol; fl6.saddr = key->u.ipv6.src; fl6.daddr = key->u.ipv6.dst; fl6.flowlabel = key->label; dsfield = key->tos; switch (protocol) { case IPPROTO_IPIP: iph = ip_hdr(skb); orig_dsfield = ipv4_get_dsfield(iph); break; case IPPROTO_IPV6: ipv6h = ipv6_hdr(skb); orig_dsfield = ipv6_get_dsfield(ipv6h); break; default: orig_dsfield = dsfield; break; } } else { if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; if (protocol == IPPROTO_IPV6) { offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); /* ip6_tnl_parse_tlv_enc_lim() might have * reallocated skb->head */ if (offset > 0) { struct ipv6_tlv_tnl_enc_lim *tel; tel = (void *)&skb_network_header(skb)[offset]; if (tel->encap_limit == 0) { icmpv6_ndo_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD, offset + 2); return -1; } encap_limit = tel->encap_limit - 1; } } memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); fl6.flowi6_proto = protocol; if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; else fl6.flowi6_mark = t->parms.fwmark; switch (protocol) { case IPPROTO_IPIP: iph = ip_hdr(skb); orig_dsfield = ipv4_get_dsfield(iph); if (t->parms.flags & 
IP6_TNL_F_USE_ORIG_TCLASS) dsfield = orig_dsfield; else dsfield = ip6_tclass(t->parms.flowinfo); break; case IPPROTO_IPV6: ipv6h = ipv6_hdr(skb); orig_dsfield = ipv6_get_dsfield(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) dsfield = orig_dsfield; else dsfield = ip6_tclass(t->parms.flowinfo); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) fl6.flowlabel |= ip6_flowlabel(ipv6h); break; default: orig_dsfield = dsfield = ip6_tclass(t->parms.flowinfo); break; } } fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); dsfield = INET_ECN_encapsulate(dsfield, orig_dsfield); if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; skb_set_inner_ipproto(skb, protocol); err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, protocol); if (err != 0) { /* XXX: send ICMP error even if DF is not set. */ if (err == -EMSGSIZE) switch (protocol) { case IPPROTO_IPIP: icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); break; case IPPROTO_IPV6: icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); break; default: break; } return -1; } return 0; } static netdev_tx_t ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); u8 ipproto; int ret; if (!pskb_inet_may_pull(skb)) goto tx_err; switch (skb->protocol) { case htons(ETH_P_IP): ipproto = IPPROTO_IPIP; break; case htons(ETH_P_IPV6): if (ip6_tnl_addr_conflict(t, ipv6_hdr(skb))) goto tx_err; ipproto = IPPROTO_IPV6; break; case htons(ETH_P_MPLS_UC): ipproto = IPPROTO_MPLS; break; default: goto tx_err; } ret = ipxip6_tnl_xmit(skb, dev, ipproto); if (ret < 0) goto tx_err; return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } static void ip6_tnl_link_config(struct ip6_tnl *t) { struct net_device *dev = t->dev; struct net_device *tdev = NULL; struct __ip6_tnl_parm *p = &t->parms; struct flowi6 *fl6 = &t->fl.u.ip6; int t_hlen; int mtu; __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); /* Set up flowi template */ fl6->saddr = p->laddr; fl6->daddr = p->raddr; fl6->flowi6_oif = p->link; fl6->flowlabel = 0; if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS)) fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo; if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL)) fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo; p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV|IP6_TNL_F_CAP_PER_PACKET); p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr); if (p->flags&IP6_TNL_F_CAP_XMIT && p->flags&IP6_TNL_F_CAP_RCV) dev->flags |= IFF_POINTOPOINT; else dev->flags &= ~IFF_POINTOPOINT; t->tun_hlen = 0; t->hlen = t->encap_hlen + t->tun_hlen; t_hlen = t->hlen + sizeof(struct ipv6hdr); if (p->flags & IP6_TNL_F_CAP_XMIT) { int strict = (ipv6_addr_type(&p->raddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)); struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, p->link, NULL, strict); if (rt) { tdev = rt->dst.dev; ip6_rt_put(rt); } if (!tdev && p->link) tdev = __dev_get_by_index(t->net, p->link); if (tdev) { dev->needed_headroom = tdev->hard_header_len + tdev->needed_headroom + t_hlen; mtu = min_t(unsigned int, tdev->mtu, IP6_MAX_MTU); mtu = mtu - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) mtu -= 8; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; WRITE_ONCE(dev->mtu, mtu); } } } /** * ip6_tnl_change - update the tunnel parameters * @t: tunnel to be changed * @p: tunnel configuration parameters * * Description: * ip6_tnl_change() updates the tunnel parameters **/ 
static void ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) { t->parms.laddr = p->laddr; t->parms.raddr = p->raddr; t->parms.flags = p->flags; t->parms.hop_limit = p->hop_limit; t->parms.encap_limit = p->encap_limit; t->parms.flowinfo = p->flowinfo; t->parms.link = p->link; t->parms.proto = p->proto; t->parms.fwmark = p->fwmark; dst_cache_reset(&t->dst_cache); ip6_tnl_link_config(t); } static void ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) { struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); ip6_tnl_unlink(ip6n, t); synchronize_net(); ip6_tnl_change(t, p); ip6_tnl_link(ip6n, t); netdev_state_change(t->dev); } static int ip6_tnl0_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p, bool strict) { /* For the default ip6tnl0 device, allow changing only the protocol * (the IP6_TNL_F_CAP_PER_PACKET flag is set on ip6tnl0, and all other * parameters are 0). */ if (strict && (!ipv6_addr_any(&p->laddr) || !ipv6_addr_any(&p->raddr) || p->flags != t->parms.flags || p->hop_limit || p->encap_limit || p->flowinfo || p->link || p->fwmark || p->collect_md)) return -EINVAL; t->parms.proto = p->proto; netdev_state_change(t->dev); return 0; } static void ip6_tnl_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm *u) { p->laddr = u->laddr; p->raddr = u->raddr; p->flags = u->flags; p->hop_limit = u->hop_limit; p->encap_limit = u->encap_limit; p->flowinfo = u->flowinfo; p->link = u->link; p->proto = u->proto; memcpy(p->name, u->name, sizeof(u->name)); } static void ip6_tnl_parm_to_user(struct ip6_tnl_parm *u, const struct __ip6_tnl_parm *p) { u->laddr = p->laddr; u->raddr = p->raddr; u->flags = p->flags; u->hop_limit = p->hop_limit; u->encap_limit = p->encap_limit; u->flowinfo = p->flowinfo; u->link = p->link; u->proto = p->proto; memcpy(u->name, p->name, sizeof(u->name)); } /** * ip6_tnl_siocdevprivate - configure ipv6 tunnels from userspace * @dev: virtual device associated with tunnel * @ifr: unused * @data: parameters passed from userspace * @cmd: command to be performed * * Description: * ip6_tnl_ioctl() is used for managing IPv6 tunnels * from userspace. * * The possible commands are the following: * %SIOCGETTUNNEL: get tunnel parameters for device * %SIOCADDTUNNEL: add tunnel matching given tunnel parameters * %SIOCCHGTUNNEL: change tunnel parameters to those given * %SIOCDELTUNNEL: delete tunnel * * The fallback device "ip6tnl0", created during module * initialization, can be used for creating other tunnel devices. 
* * Return: * 0 on success, * %-EFAULT if unable to copy data to or from userspace, * %-EPERM if current process hasn't %CAP_NET_ADMIN set * %-EINVAL if passed tunnel parameters are invalid, * %-EEXIST if changing a tunnel's parameters would cause a conflict * %-ENODEV if attempting to change or delete a nonexisting device **/ static int ip6_tnl_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { int err = 0; struct ip6_tnl_parm p; struct __ip6_tnl_parm p1; struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); memset(&p1, 0, sizeof(p1)); switch (cmd) { case SIOCGETTUNNEL: if (dev == ip6n->fb_tnl_dev) { if (copy_from_user(&p, data, sizeof(p))) { err = -EFAULT; break; } ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); if (IS_ERR(t)) t = netdev_priv(dev); } else { memset(&p, 0, sizeof(p)); } ip6_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; break; case SIOCADDTUNNEL: case SIOCCHGTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) break; err = -EINVAL; if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP && p.proto != 0) break; ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, cmd == SIOCADDTUNNEL); if (cmd == SIOCCHGTUNNEL) { if (!IS_ERR(t)) { if (t->dev != dev) { err = -EEXIST; break; } } else t = netdev_priv(dev); if (dev == ip6n->fb_tnl_dev) ip6_tnl0_update(t, &p1, false); else ip6_tnl_update(t, &p1); } if (!IS_ERR(t)) { err = 0; ip6_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; } else { err = PTR_ERR(t); } break; case SIOCDELTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; if (dev == ip6n->fb_tnl_dev) { err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) break; err = -ENOENT; ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); if (IS_ERR(t)) break; err = -EPERM; if (t->dev == ip6n->fb_tnl_dev) break; dev = t->dev; } err = 0; unregister_netdevice(dev); break; default: err = -EINVAL; } return err; } /** * ip6_tnl_change_mtu - change mtu manually for tunnel device * @dev: virtual device associated with tunnel * @new_mtu: the new mtu * * Return: * 0 on success, * %-EINVAL if mtu too small **/ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) { struct ip6_tnl *tnl = netdev_priv(dev); int t_hlen; t_hlen = tnl->hlen + sizeof(struct ipv6hdr); if (tnl->parms.proto == IPPROTO_IPV6) { if (new_mtu < IPV6_MIN_MTU) return -EINVAL; } else { if (new_mtu < ETH_MIN_MTU) return -EINVAL; } if (tnl->parms.proto == IPPROTO_IPV6 || tnl->parms.proto == 0) { if (new_mtu > IP6_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } else { if (new_mtu > IP_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL(ip6_tnl_change_mtu); int ip6_tnl_get_iflink(const struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); return READ_ONCE(t->parms.link); } EXPORT_SYMBOL(ip6_tnl_get_iflink); int ip6_tnl_encap_add_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num) { if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; return !cmpxchg((const struct ip6_tnl_encap_ops **) &ip6tun_encaps[num], NULL, ops) ? 
0 : -1; } EXPORT_SYMBOL(ip6_tnl_encap_add_ops); int ip6_tnl_encap_del_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num) { int ret; if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; ret = (cmpxchg((const struct ip6_tnl_encap_ops **) &ip6tun_encaps[num], ops, NULL) == ops) ? 0 : -1; synchronize_net(); return ret; } EXPORT_SYMBOL(ip6_tnl_encap_del_ops); int ip6_tnl_encap_setup(struct ip6_tnl *t, struct ip_tunnel_encap *ipencap) { int hlen; memset(&t->encap, 0, sizeof(t->encap)); hlen = ip6_encap_hlen(ipencap); if (hlen < 0) return hlen; t->encap.type = ipencap->type; t->encap.sport = ipencap->sport; t->encap.dport = ipencap->dport; t->encap.flags = ipencap->flags; t->encap_hlen = hlen; t->hlen = t->encap_hlen + t->tun_hlen; return 0; } EXPORT_SYMBOL_GPL(ip6_tnl_encap_setup); static int ip6_tnl_fill_forward_path(struct net_device_path_ctx *ctx, struct net_device_path *path) { struct ip6_tnl *t = netdev_priv(ctx->dev); struct flowi6 fl6 = { .daddr = t->parms.raddr, }; struct dst_entry *dst; int err; dst = ip6_route_output(dev_net(ctx->dev), NULL, &fl6); if (!dst->error) { path->type = DEV_PATH_TUN; path->tun.src_v6 = t->parms.laddr; path->tun.dst_v6 = t->parms.raddr; path->tun.l3_proto = IPPROTO_IPV6; path->dev = ctx->dev; ctx->dev = dst->dev; } err = dst->error; dst_release(dst); return err; } static const struct net_device_ops ip6_tnl_netdev_ops = { .ndo_init = ip6_tnl_dev_init, .ndo_uninit = ip6_tnl_dev_uninit, .ndo_start_xmit = ip6_tnl_start_xmit, .ndo_siocdevprivate = ip6_tnl_siocdevprivate, .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, .ndo_fill_forward_path = ip6_tnl_fill_forward_path, }; #define IPXIPX_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) /** * ip6_tnl_dev_setup - setup virtual tunnel device * @dev: virtual device associated with tunnel * * Description: * Initialize function pointers and device parameters **/ static void ip6_tnl_dev_setup(struct net_device *dev) { dev->netdev_ops = &ip6_tnl_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->needs_free_netdev = true; dev->priv_destructor = ip6_dev_free; dev->type = ARPHRD_TUNNEL6; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); dev->lltx = true; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; netif_keep_dst(dev); dev->features |= IPXIPX_FEATURES; dev->hw_features |= IPXIPX_FEATURES; /* This perm addr will be used as interface identifier by IPv6 */ dev->addr_assign_type = NET_ADDR_RANDOM; eth_random_addr(dev->perm_addr); } /** * ip6_tnl_dev_init_gen - general initializer for all tunnel devices * @dev: virtual device associated with tunnel **/ static inline int ip6_tnl_dev_init_gen(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int ret; int t_hlen; t->dev = dev; ret = dst_cache_init(&t->dst_cache, GFP_KERNEL); if (ret) return ret; ret = gro_cells_init(&t->gro_cells, dev); if (ret) goto destroy_dst; t->tun_hlen = 0; t->hlen = t->encap_hlen + t->tun_hlen; t_hlen = t->hlen + sizeof(struct ipv6hdr); dev->type = ARPHRD_TUNNEL6; dev->mtu = ETH_DATA_LEN - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len - t_hlen; netdev_hold(dev, &t->dev_tracker, GFP_KERNEL); netdev_lockdep_set_classes(dev); return 0; destroy_dst: dst_cache_destroy(&t->dst_cache); return ret; } /** * ip6_tnl_dev_init - initializer for all non fallback tunnel devices * @dev: virtual device 
associated with tunnel **/ static int ip6_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int err = ip6_tnl_dev_init_gen(dev); if (err) return err; ip6_tnl_link_config(t); if (t->parms.collect_md) netif_keep_dst(dev); return 0; } /** * ip6_fb_tnl_dev_init - initializer for fallback tunnel device * @dev: fallback device * * Return: 0 **/ static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); t->net = net; t->parms.proto = IPPROTO_IPV6; rcu_assign_pointer(ip6n->tnls_wc[0], t); return 0; } static int ip6_tnl_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { u8 proto; if (!data || !data[IFLA_IPTUN_PROTO]) return 0; proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (proto != IPPROTO_IPV6 && proto != IPPROTO_IPIP && proto != 0) return -EINVAL; return 0; } static void ip6_tnl_netlink_parms(struct nlattr *data[], struct __ip6_tnl_parm *parms) { memset(parms, 0, sizeof(*parms)); if (!data) return; if (data[IFLA_IPTUN_LINK]) parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); if (data[IFLA_IPTUN_LOCAL]) parms->laddr = nla_get_in6_addr(data[IFLA_IPTUN_LOCAL]); if (data[IFLA_IPTUN_REMOTE]) parms->raddr = nla_get_in6_addr(data[IFLA_IPTUN_REMOTE]); if (data[IFLA_IPTUN_TTL]) parms->hop_limit = nla_get_u8(data[IFLA_IPTUN_TTL]); if (data[IFLA_IPTUN_ENCAP_LIMIT]) parms->encap_limit = nla_get_u8(data[IFLA_IPTUN_ENCAP_LIMIT]); if (data[IFLA_IPTUN_FLOWINFO]) parms->flowinfo = nla_get_be32(data[IFLA_IPTUN_FLOWINFO]); if (data[IFLA_IPTUN_FLAGS]) parms->flags = nla_get_u32(data[IFLA_IPTUN_FLAGS]); if (data[IFLA_IPTUN_PROTO]) parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (data[IFLA_IPTUN_COLLECT_METADATA]) parms->collect_md = true; if (data[IFLA_IPTUN_FWMARK]) parms->fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } static int ip6_tnl_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct ip_tunnel_encap ipencap; struct ip6_tnl_net *ip6n; struct ip6_tnl *nt, *t; struct net *net; int err; net = params->link_net ? : dev_net(dev); ip6n = net_generic(net, ip6_tnl_net_id); nt = netdev_priv(dev); nt->net = net; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { err = ip6_tnl_encap_setup(nt, &ipencap); if (err < 0) return err; } ip6_tnl_netlink_parms(data, &nt->parms); if (nt->parms.collect_md) { if (rtnl_dereference(ip6n->collect_md_tun)) return -EEXIST; } else { t = ip6_tnl_locate(net, &nt->parms, 0); if (!IS_ERR(t)) return -EEXIST; } err = ip6_tnl_create2(dev); if (!err && tb[IFLA_MTU]) ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU])); return err; } static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip6_tnl *t = netdev_priv(dev); struct __ip6_tnl_parm p; struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip_tunnel_encap ipencap; if (dev == ip6n->fb_tnl_dev) { if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { /* iproute2 always sets TUNNEL_ENCAP_FLAG_CSUM6, so * let's ignore this flag. 
*/ ipencap.flags &= ~TUNNEL_ENCAP_FLAG_CSUM6; if (memchr_inv(&ipencap, 0, sizeof(ipencap))) { NL_SET_ERR_MSG(extack, "Only protocol can be changed for fallback tunnel, not encap params"); return -EINVAL; } } ip6_tnl_netlink_parms(data, &p); if (ip6_tnl0_update(t, &p, true) < 0) { NL_SET_ERR_MSG(extack, "Only protocol can be changed for fallback tunnel"); return -EINVAL; } return 0; } if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { int err = ip6_tnl_encap_setup(t, &ipencap); if (err < 0) return err; } ip6_tnl_netlink_parms(data, &p); if (p.collect_md) return -EINVAL; t = ip6_tnl_locate(net, &p, 0); if (!IS_ERR(t)) { if (t->dev != dev) return -EEXIST; } else t = netdev_priv(dev); ip6_tnl_update(t, &p); return 0; } static void ip6_tnl_dellink(struct net_device *dev, struct list_head *head) { struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); if (dev != ip6n->fb_tnl_dev) unregister_netdevice_queue(dev, head); } static size_t ip6_tnl_get_size(const struct net_device *dev) { return /* IFLA_IPTUN_LINK */ nla_total_size(4) + /* IFLA_IPTUN_LOCAL */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_IPTUN_REMOTE */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_IPTUN_TTL */ nla_total_size(1) + /* IFLA_IPTUN_ENCAP_LIMIT */ nla_total_size(1) + /* IFLA_IPTUN_FLOWINFO */ nla_total_size(4) + /* IFLA_IPTUN_FLAGS */ nla_total_size(4) + /* IFLA_IPTUN_PROTO */ nla_total_size(1) + /* IFLA_IPTUN_ENCAP_TYPE */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_FLAGS */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_SPORT */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_DPORT */ nla_total_size(2) + /* IFLA_IPTUN_COLLECT_METADATA */ nla_total_size(0) + /* IFLA_IPTUN_FWMARK */ nla_total_size(4) + 0; } static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); struct __ip6_tnl_parm *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || nla_put_in6_addr(skb, IFLA_IPTUN_LOCAL, &parm->laddr) || nla_put_in6_addr(skb, IFLA_IPTUN_REMOTE, &parm->raddr) || nla_put_u8(skb, IFLA_IPTUN_TTL, parm->hop_limit) || nla_put_u8(skb, IFLA_IPTUN_ENCAP_LIMIT, parm->encap_limit) || nla_put_be32(skb, IFLA_IPTUN_FLOWINFO, parm->flowinfo) || nla_put_u32(skb, IFLA_IPTUN_FLAGS, parm->flags) || nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto) || nla_put_u32(skb, IFLA_IPTUN_FWMARK, parm->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags)) goto nla_put_failure; if (parm->collect_md) if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } struct net *ip6_tnl_get_link_net(const struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); return READ_ONCE(tunnel->net); } EXPORT_SYMBOL(ip6_tnl_get_link_net); static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_LINK] = { .type = NLA_U32 }, [IFLA_IPTUN_LOCAL] = { .len = sizeof(struct in6_addr) }, [IFLA_IPTUN_REMOTE] = { .len = sizeof(struct in6_addr) }, [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, [IFLA_IPTUN_ENCAP_LIMIT] = { .type = NLA_U8 }, [IFLA_IPTUN_FLOWINFO] = { .type = NLA_U32 }, [IFLA_IPTUN_FLAGS] = { .type = NLA_U32 }, [IFLA_IPTUN_PROTO] = { .type = NLA_U8 }, [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 
}, [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops ip6_link_ops __read_mostly = { .kind = "ip6tnl", .maxtype = IFLA_IPTUN_MAX, .policy = ip6_tnl_policy, .priv_size = sizeof(struct ip6_tnl), .setup = ip6_tnl_dev_setup, .validate = ip6_tnl_validate, .newlink = ip6_tnl_newlink, .changelink = ip6_tnl_changelink, .dellink = ip6_tnl_dellink, .get_size = ip6_tnl_get_size, .fill_info = ip6_tnl_fill_info, .get_link_net = ip6_tnl_get_link_net, }; static struct xfrm6_tunnel ip4ip6_handler __read_mostly = { .handler = ip4ip6_rcv, .err_handler = ip4ip6_err, .priority = 1, }; static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { .handler = ip6ip6_rcv, .err_handler = ip6ip6_err, .priority = 1, }; static struct xfrm6_tunnel mplsip6_handler __read_mostly = { .handler = mplsip6_rcv, .err_handler = mplsip6_err, .priority = 1, }; static void __net_exit ip6_tnl_exit_rtnl_net(struct net *net, struct list_head *list) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct net_device *dev, *aux; int h; struct ip6_tnl *t; for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &ip6_link_ops) unregister_netdevice_queue(dev, list); for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) { t = rtnl_net_dereference(net, ip6n->tnls_r_l[h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); t = rtnl_net_dereference(net, t->next); } } t = rtnl_net_dereference(net, ip6n->tnls_wc[0]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); t = rtnl_net_dereference(net, t->next); } } static int __net_init ip6_tnl_init_net(struct net *net) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip6_tnl *t = NULL; int err; ip6n->tnls[0] = ip6n->tnls_wc; ip6n->tnls[1] = ip6n->tnls_r_l; if (!net_has_fallback_tunnels(net)) return 0; err = -ENOMEM; ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0", NET_NAME_UNKNOWN, ip6_tnl_dev_setup); if (!ip6n->fb_tnl_dev) goto err_alloc_dev; dev_net_set(ip6n->fb_tnl_dev, net); ip6n->fb_tnl_dev->rtnl_link_ops = &ip6_link_ops; /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. 
*/ ip6n->fb_tnl_dev->netns_immutable = true; err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); if (err < 0) goto err_register; err = register_netdev(ip6n->fb_tnl_dev); if (err < 0) goto err_register; t = netdev_priv(ip6n->fb_tnl_dev); strcpy(t->parms.name, ip6n->fb_tnl_dev->name); return 0; err_register: free_netdev(ip6n->fb_tnl_dev); err_alloc_dev: return err; } static struct pernet_operations ip6_tnl_net_ops = { .init = ip6_tnl_init_net, .exit_rtnl = ip6_tnl_exit_rtnl_net, .id = &ip6_tnl_net_id, .size = sizeof(struct ip6_tnl_net), }; /** * ip6_tunnel_init - register protocol and reserve needed resources * * Return: 0 on success **/ static int __init ip6_tunnel_init(void) { int err; if (!ipv6_mod_enabled()) return -EOPNOTSUPP; err = register_pernet_device(&ip6_tnl_net_ops); if (err < 0) goto out_pernet; err = xfrm6_tunnel_register(&ip4ip6_handler, AF_INET); if (err < 0) { pr_err("%s: can't register ip4ip6\n", __func__); goto out_ip4ip6; } err = xfrm6_tunnel_register(&ip6ip6_handler, AF_INET6); if (err < 0) { pr_err("%s: can't register ip6ip6\n", __func__); goto out_ip6ip6; } if (ip6_tnl_mpls_supported()) { err = xfrm6_tunnel_register(&mplsip6_handler, AF_MPLS); if (err < 0) { pr_err("%s: can't register mplsip6\n", __func__); goto out_mplsip6; } } err = rtnl_link_register(&ip6_link_ops); if (err < 0) goto rtnl_link_failed; return 0; rtnl_link_failed: if (ip6_tnl_mpls_supported()) xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS); out_mplsip6: xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6); out_ip6ip6: xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET); out_ip4ip6: unregister_pernet_device(&ip6_tnl_net_ops); out_pernet: return err; } /** * ip6_tunnel_cleanup - free resources and unregister protocol **/ static void __exit ip6_tunnel_cleanup(void) { rtnl_link_unregister(&ip6_link_ops); if (xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET)) pr_info("%s: can't deregister ip4ip6\n", __func__); if (xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6)) pr_info("%s: can't deregister ip6ip6\n", __func__); if (ip6_tnl_mpls_supported() && xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS)) pr_info("%s: can't deregister mplsip6\n", __func__); unregister_pernet_device(&ip6_tnl_net_ops); } module_init(ip6_tunnel_init); module_exit(ip6_tunnel_cleanup); |
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
*/ #include "xfs_platform.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_bmap.h" #include "xfs_alloc.h" #include "xfs_fsops.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_dir2.h" #include "xfs_extfree_item.h" #include "xfs_mru_cache.h" #include "xfs_inode_item.h" #include "xfs_icache.h" #include "xfs_trace.h" #include "xfs_icreate_item.h" #include "xfs_filestream.h" #include "xfs_quota.h" #include "xfs_sysfs.h" #include "xfs_ondisk.h" #include "xfs_rmap_item.h" #include "xfs_refcount_item.h" #include "xfs_bmap_item.h" #include "xfs_reflink.h" #include "xfs_pwork.h" #include "xfs_ag.h" #include "xfs_defer.h" #include "xfs_attr_item.h" #include "xfs_xattr.h" #include "xfs_error.h" #include "xfs_errortag.h" #include "xfs_iunlink_item.h" #include "xfs_dahash_test.h" #include "xfs_rtbitmap.h" #include "xfs_exchmaps_item.h" #include "xfs_parent.h" #include "xfs_rtalloc.h" #include "xfs_zone_alloc.h" #include "xfs_healthmon.h" #include "scrub/stats.h" #include "scrub/rcbag_btree.h" #include <linux/magic.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/fserror.h> static const struct super_operations xfs_super_operations; static struct dentry *xfs_debugfs; /* top-level xfs debugfs dir */ static struct kset *xfs_kset; /* top-level xfs sysfs dir */ #ifdef DEBUG static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ #endif enum xfs_dax_mode { XFS_DAX_INODE = 0, XFS_DAX_ALWAYS = 1, XFS_DAX_NEVER = 2, }; /* Were quota mount options provided? Must use the upper 16 bits of qflags. */ #define XFS_QFLAGS_MNTOPTS (1U << 31) static void xfs_mount_set_dax_mode( struct xfs_mount *mp, enum xfs_dax_mode mode) { switch (mode) { case XFS_DAX_INODE: mp->m_features &= ~(XFS_FEAT_DAX_ALWAYS | XFS_FEAT_DAX_NEVER); break; case XFS_DAX_ALWAYS: mp->m_features |= XFS_FEAT_DAX_ALWAYS; mp->m_features &= ~XFS_FEAT_DAX_NEVER; break; case XFS_DAX_NEVER: mp->m_features |= XFS_FEAT_DAX_NEVER; mp->m_features &= ~XFS_FEAT_DAX_ALWAYS; break; } } static const struct constant_table dax_param_enums[] = { {"inode", XFS_DAX_INODE }, {"always", XFS_DAX_ALWAYS }, {"never", XFS_DAX_NEVER }, {} }; /* * Table driven mount option parser. */ enum { Op_deprecated, Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid, Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups, Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, Opt_largeio, Opt_nolargeio, Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota, Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones, Opt_lifetime, Opt_nolifetime, Opt_max_atomic_write, Opt_errortag, }; #define fsparam_dead(NAME) \ __fsparam(NULL, (NAME), Op_deprecated, fs_param_deprecated, NULL) static const struct fs_parameter_spec xfs_fs_parameters[] = { /* * These mount options were supposed to be deprecated in September 2025 * but the deprecation warning was buggy, so not all users were * notified. The deprecation is now obnoxiously loud and postponed to * September 2030. 
*/ fsparam_dead("attr2"), fsparam_dead("noattr2"), fsparam_dead("ikeep"), fsparam_dead("noikeep"), fsparam_u32("logbufs", Opt_logbufs), fsparam_string("logbsize", Opt_logbsize), fsparam_string("logdev", Opt_logdev), fsparam_string("rtdev", Opt_rtdev), fsparam_flag("wsync", Opt_wsync), fsparam_flag("noalign", Opt_noalign), fsparam_flag("swalloc", Opt_swalloc), fsparam_u32("sunit", Opt_sunit), fsparam_u32("swidth", Opt_swidth), fsparam_flag("nouuid", Opt_nouuid), fsparam_flag("grpid", Opt_grpid), fsparam_flag("nogrpid", Opt_nogrpid), fsparam_flag("bsdgroups", Opt_bsdgroups), fsparam_flag("sysvgroups", Opt_sysvgroups), fsparam_string("allocsize", Opt_allocsize), fsparam_flag("norecovery", Opt_norecovery), fsparam_flag("inode64", Opt_inode64), fsparam_flag("inode32", Opt_inode32), fsparam_flag("largeio", Opt_largeio), fsparam_flag("nolargeio", Opt_nolargeio), fsparam_flag("filestreams", Opt_filestreams), fsparam_flag("quota", Opt_quota), fsparam_flag("noquota", Opt_noquota), fsparam_flag("usrquota", Opt_usrquota), fsparam_flag("grpquota", Opt_grpquota), fsparam_flag("prjquota", Opt_prjquota), fsparam_flag("uquota", Opt_uquota), fsparam_flag("gquota", Opt_gquota), fsparam_flag("pquota", Opt_pquota), fsparam_flag("uqnoenforce", Opt_uqnoenforce), fsparam_flag("gqnoenforce", Opt_gqnoenforce), fsparam_flag("pqnoenforce", Opt_pqnoenforce), fsparam_flag("qnoenforce", Opt_qnoenforce), fsparam_flag("discard", Opt_discard), fsparam_flag("nodiscard", Opt_nodiscard), fsparam_flag("dax", Opt_dax), fsparam_enum("dax", Opt_dax_enum, dax_param_enums), fsparam_u32("max_open_zones", Opt_max_open_zones), fsparam_flag("lifetime", Opt_lifetime), fsparam_flag("nolifetime", Opt_nolifetime), fsparam_string("max_atomic_write", Opt_max_atomic_write), fsparam_string("errortag", Opt_errortag), {} }; struct proc_xfs_info { uint64_t flag; char *str; }; static int xfs_fs_show_options( struct seq_file *m, struct dentry *root) { static struct proc_xfs_info xfs_info_set[] = { /* the few simple ones we can get from the mount struct */ { XFS_FEAT_WSYNC, ",wsync" }, { XFS_FEAT_NOALIGN, ",noalign" }, { XFS_FEAT_SWALLOC, ",swalloc" }, { XFS_FEAT_NOUUID, ",nouuid" }, { XFS_FEAT_NORECOVERY, ",norecovery" }, { XFS_FEAT_FILESTREAMS, ",filestreams" }, { XFS_FEAT_GRPID, ",grpid" }, { XFS_FEAT_DISCARD, ",discard" }, { XFS_FEAT_LARGE_IOSIZE, ",largeio" }, { XFS_FEAT_DAX_ALWAYS, ",dax=always" }, { XFS_FEAT_DAX_NEVER, ",dax=never" }, { XFS_FEAT_NOLIFETIME, ",nolifetime" }, { 0, NULL } }; struct xfs_mount *mp = XFS_M(root->d_sb); struct proc_xfs_info *xfs_infop; for (xfs_infop = xfs_info_set; xfs_infop->flag; xfs_infop++) { if (mp->m_features & xfs_infop->flag) seq_puts(m, xfs_infop->str); } seq_printf(m, ",inode%d", xfs_has_small_inums(mp) ? 
32 : 64); if (xfs_has_allocsize(mp)) seq_printf(m, ",allocsize=%dk", (1 << mp->m_allocsize_log) >> 10); if (mp->m_logbufs > 0) seq_printf(m, ",logbufs=%d", mp->m_logbufs); if (mp->m_logbsize > 0) seq_printf(m, ",logbsize=%dk", mp->m_logbsize >> 10); if (mp->m_logname) seq_show_option(m, "logdev", mp->m_logname); if (mp->m_rtname) seq_show_option(m, "rtdev", mp->m_rtname); if (mp->m_dalign > 0) seq_printf(m, ",sunit=%d", (int)XFS_FSB_TO_BB(mp, mp->m_dalign)); if (mp->m_swidth > 0) seq_printf(m, ",swidth=%d", (int)XFS_FSB_TO_BB(mp, mp->m_swidth)); if (mp->m_qflags & XFS_UQUOTA_ENFD) seq_puts(m, ",usrquota"); else if (mp->m_qflags & XFS_UQUOTA_ACCT) seq_puts(m, ",uqnoenforce"); if (mp->m_qflags & XFS_PQUOTA_ENFD) seq_puts(m, ",prjquota"); else if (mp->m_qflags & XFS_PQUOTA_ACCT) seq_puts(m, ",pqnoenforce"); if (mp->m_qflags & XFS_GQUOTA_ENFD) seq_puts(m, ",grpquota"); else if (mp->m_qflags & XFS_GQUOTA_ACCT) seq_puts(m, ",gqnoenforce"); if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) seq_puts(m, ",noquota"); if (mp->m_max_open_zones) seq_printf(m, ",max_open_zones=%u", mp->m_max_open_zones); if (mp->m_awu_max_bytes) seq_printf(m, ",max_atomic_write=%lluk", mp->m_awu_max_bytes >> 10); return 0; } static bool xfs_set_inode_alloc_perag( struct xfs_perag *pag, xfs_ino_t ino, xfs_agnumber_t max_metadata) { if (!xfs_is_inode32(pag_mount(pag))) { set_bit(XFS_AGSTATE_ALLOWS_INODES, &pag->pag_opstate); clear_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate); return false; } if (ino > XFS_MAXINUMBER_32) { clear_bit(XFS_AGSTATE_ALLOWS_INODES, &pag->pag_opstate); clear_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate); return false; } set_bit(XFS_AGSTATE_ALLOWS_INODES, &pag->pag_opstate); if (pag_agno(pag) < max_metadata) set_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate); else clear_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate); return true; } /* * Set parameters for inode allocation heuristics, taking into account * filesystem size and inode32/inode64 mount options; i.e. specifically * whether or not XFS_FEAT_SMALL_INUMS is set. * * Inode allocation patterns are altered only if inode32 is requested * (XFS_FEAT_SMALL_INUMS), and the filesystem is sufficiently large. * If altered, XFS_OPSTATE_INODE32 is set as well. * * An agcount independent of that in the mount structure is provided * because in the growfs case, mp->m_sb.sb_agcount is not yet updated * to the potentially higher ag count. * * Returns the maximum AG index which may contain inodes. */ xfs_agnumber_t xfs_set_inode_alloc( struct xfs_mount *mp, xfs_agnumber_t agcount) { xfs_agnumber_t index; xfs_agnumber_t maxagi = 0; xfs_sb_t *sbp = &mp->m_sb; xfs_agnumber_t max_metadata; xfs_agino_t agino; xfs_ino_t ino; /* * Calculate how much should be reserved for inodes to meet * the max inode percentage. Used only for inode32. */ if (M_IGEO(mp)->maxicount) { uint64_t icount; icount = sbp->sb_dblocks * sbp->sb_imax_pct; do_div(icount, 100); icount += sbp->sb_agblocks - 1; do_div(icount, sbp->sb_agblocks); max_metadata = icount; } else { max_metadata = agcount; } /* Get the last possible inode in the filesystem */ agino = XFS_AGB_TO_AGINO(mp, sbp->sb_agblocks - 1); ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino); /* * If user asked for no more than 32-bit inodes, and the fs is * sufficiently large, set XFS_OPSTATE_INODE32 if we must alter * the allocator to accommodate the request. 
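 *
 * (Rough worked example, assuming 4k filesystem blocks and the default
 * 256- or 512-byte inode size: inode numbers encode the AG, the block
 * within the AG and the offset within the block, so "ino" only exceeds
 * XFS_MAXINUMBER_32 once the data device is roughly in the 1-2 TiB range.
 * Below that, inode32 changes nothing.)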
*/ if (xfs_has_small_inums(mp) && ino > XFS_MAXINUMBER_32) xfs_set_inode32(mp); else xfs_clear_inode32(mp); for (index = 0; index < agcount; index++) { struct xfs_perag *pag; ino = XFS_AGINO_TO_INO(mp, index, agino); pag = xfs_perag_get(mp, index); if (xfs_set_inode_alloc_perag(pag, ino, max_metadata)) maxagi++; xfs_perag_put(pag); } return xfs_is_inode32(mp) ? maxagi : agcount; } static int xfs_setup_dax_always( struct xfs_mount *mp) { if (!mp->m_ddev_targp->bt_daxdev && (!mp->m_rtdev_targp || !mp->m_rtdev_targp->bt_daxdev)) { xfs_alert(mp, "DAX unsupported by block device. Turning off DAX."); goto disable_dax; } if (mp->m_super->s_blocksize != PAGE_SIZE) { xfs_alert(mp, "DAX not supported for blocksize. Turning off DAX."); goto disable_dax; } if (xfs_has_reflink(mp) && bdev_is_partition(mp->m_ddev_targp->bt_bdev)) { xfs_alert(mp, "DAX and reflink cannot work with multi-partitions!"); return -EINVAL; } return 0; disable_dax: xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER); return 0; } STATIC int xfs_blkdev_get( xfs_mount_t *mp, const char *name, struct file **bdev_filep) { int error = 0; blk_mode_t mode; mode = sb_open_mode(mp->m_super->s_flags); *bdev_filep = bdev_file_open_by_path(name, mode, mp->m_super, &fs_holder_ops); if (IS_ERR(*bdev_filep)) { error = PTR_ERR(*bdev_filep); *bdev_filep = NULL; xfs_warn(mp, "Invalid device [%s], error=%d", name, error); } return error; } STATIC void xfs_shutdown_devices( struct xfs_mount *mp) { /* * Udev is triggered whenever anyone closes a block device or unmounts * a file systemm on a block device. * The default udev rules invoke blkid to read the fs super and create * symlinks to the bdev under /dev/disk. For this, it uses buffered * reads through the page cache. * * xfs_db also uses buffered reads to examine metadata. There is no * coordination between xfs_db and udev, which means that they can run * concurrently. Note there is no coordination between the kernel and * blkid either. * * On a system with 64k pages, the page cache can cache the superblock * and the root inode (and hence the root directory) with the same 64k * page. If udev spawns blkid after the mkfs and the system is busy * enough that it is still running when xfs_db starts up, they'll both * read from the same page in the pagecache. * * The unmount writes updated inode metadata to disk directly. The XFS * buffer cache does not use the bdev pagecache, so it needs to * invalidate that pagecache on unmount. If the above scenario occurs, * the pagecache no longer reflects what's on disk, xfs_db reads the * stale metadata, and fails to find /a. Most of the time this succeeds * because closing a bdev invalidates the page cache, but when processes * race, everyone loses. */ if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { blkdev_issue_flush(mp->m_logdev_targp->bt_bdev); invalidate_bdev(mp->m_logdev_targp->bt_bdev); } if (mp->m_rtdev_targp) { blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev); invalidate_bdev(mp->m_rtdev_targp->bt_bdev); } blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); invalidate_bdev(mp->m_ddev_targp->bt_bdev); } /* * The file system configurations are: * (1) device (partition) with data and internal log * (2) logical volume with data and log subvolumes. * (3) logical volume with data, log, and realtime subvolumes. * * We only have to handle opening the log and realtime volumes here if * they are present. The data subvolume has already been opened by * get_sb_bdev() and is stored in sb->s_bdev. 
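 *
 * For illustration only (device names are placeholders), case (3) above
 * corresponds to a mount such as:
 *
 *	mount -o logdev=/dev/sdb1,rtdev=/dev/sdc1 /dev/sda1 /mnt
 *
 * where the logdev= and rtdev= options supply the m_logname and m_rtname
 * used below.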
*/ STATIC int xfs_open_devices( struct xfs_mount *mp) { struct super_block *sb = mp->m_super; struct block_device *ddev = sb->s_bdev; struct file *logdev_file = NULL, *rtdev_file = NULL; int error; /* * Open real time and log devices - order is important. */ if (mp->m_logname) { error = xfs_blkdev_get(mp, mp->m_logname, &logdev_file); if (error) return error; } if (mp->m_rtname) { error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev_file); if (error) goto out_close_logdev; if (file_bdev(rtdev_file) == ddev || (logdev_file && file_bdev(rtdev_file) == file_bdev(logdev_file))) { xfs_warn(mp, "Cannot mount filesystem with identical rtdev and ddev/logdev."); error = -EINVAL; goto out_close_rtdev; } } /* * Setup xfs_mount buffer target pointers */ mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_file); if (IS_ERR(mp->m_ddev_targp)) { error = PTR_ERR(mp->m_ddev_targp); mp->m_ddev_targp = NULL; goto out_close_rtdev; } if (rtdev_file) { mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_file); if (IS_ERR(mp->m_rtdev_targp)) { error = PTR_ERR(mp->m_rtdev_targp); mp->m_rtdev_targp = NULL; goto out_free_ddev_targ; } } if (logdev_file && file_bdev(logdev_file) != ddev) { mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_file); if (IS_ERR(mp->m_logdev_targp)) { error = PTR_ERR(mp->m_logdev_targp); mp->m_logdev_targp = NULL; goto out_free_rtdev_targ; } } else { mp->m_logdev_targp = mp->m_ddev_targp; /* Handle won't be used, drop it */ if (logdev_file) bdev_fput(logdev_file); } return 0; out_free_rtdev_targ: if (mp->m_rtdev_targp) xfs_free_buftarg(mp->m_rtdev_targp); out_free_ddev_targ: xfs_free_buftarg(mp->m_ddev_targp); out_close_rtdev: if (rtdev_file) bdev_fput(rtdev_file); out_close_logdev: if (logdev_file) bdev_fput(logdev_file); return error; } /* * Setup xfs_mount buffer target pointers based on superblock */ STATIC int xfs_setup_devices( struct xfs_mount *mp) { int error; error = xfs_configure_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize, mp->m_sb.sb_dblocks); if (error) return error; if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { unsigned int log_sector_size = BBSIZE; if (xfs_has_sector(mp)) log_sector_size = mp->m_sb.sb_logsectsize; error = xfs_configure_buftarg(mp->m_logdev_targp, log_sector_size, mp->m_sb.sb_logblocks); if (error) return error; } if (mp->m_sb.sb_rtstart) { if (mp->m_rtdev_targp) { xfs_warn(mp, "can't use internal and external rtdev at the same time"); return -EINVAL; } mp->m_rtdev_targp = mp->m_ddev_targp; } else if (mp->m_rtname) { error = xfs_configure_buftarg(mp->m_rtdev_targp, mp->m_sb.sb_sectsize, mp->m_sb.sb_rblocks); if (error) return error; } return 0; } STATIC int xfs_init_mount_workqueues( struct xfs_mount *mp) { mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s", XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU), 1, mp->m_super->s_id); if (!mp->m_buf_workqueue) goto out; mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU), 0, mp->m_super->s_id); if (!mp->m_unwritten_workqueue) goto out_destroy_buf; mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU), 0, mp->m_super->s_id); if (!mp->m_reclaim_workqueue) goto out_destroy_unwritten; mp->m_blockgc_wq = alloc_workqueue("xfs-blockgc/%s", XFS_WQFLAGS(WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM), 0, mp->m_super->s_id); if (!mp->m_blockgc_wq) goto out_destroy_reclaim; mp->m_inodegc_wq = alloc_workqueue("xfs-inodegc/%s", XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU), 1, 
mp->m_super->s_id); if (!mp->m_inodegc_wq) goto out_destroy_blockgc; mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", XFS_WQFLAGS(WQ_FREEZABLE | WQ_PERCPU), 0, mp->m_super->s_id); if (!mp->m_sync_workqueue) goto out_destroy_inodegc; return 0; out_destroy_inodegc: destroy_workqueue(mp->m_inodegc_wq); out_destroy_blockgc: destroy_workqueue(mp->m_blockgc_wq); out_destroy_reclaim: destroy_workqueue(mp->m_reclaim_workqueue); out_destroy_unwritten: destroy_workqueue(mp->m_unwritten_workqueue); out_destroy_buf: destroy_workqueue(mp->m_buf_workqueue); out: return -ENOMEM; } STATIC void xfs_destroy_mount_workqueues( struct xfs_mount *mp) { destroy_workqueue(mp->m_sync_workqueue); destroy_workqueue(mp->m_blockgc_wq); destroy_workqueue(mp->m_inodegc_wq); destroy_workqueue(mp->m_reclaim_workqueue); destroy_workqueue(mp->m_unwritten_workqueue); destroy_workqueue(mp->m_buf_workqueue); } static void xfs_flush_inodes_worker( struct work_struct *work) { struct xfs_mount *mp = container_of(work, struct xfs_mount, m_flush_inodes_work); struct super_block *sb = mp->m_super; if (down_read_trylock(&sb->s_umount)) { sync_inodes_sb(sb); up_read(&sb->s_umount); } } /* * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting * for IO to complete so that we effectively throttle multiple callers to the * rate at which IO is completing. */ void xfs_flush_inodes( struct xfs_mount *mp) { /* * If flush_work() returns true then that means we waited for a flush * which was already in progress. Don't bother running another scan. */ if (flush_work(&mp->m_flush_inodes_work)) return; queue_work(mp->m_sync_workqueue, &mp->m_flush_inodes_work); flush_work(&mp->m_flush_inodes_work); } /* Catch misguided souls that try to use this interface on XFS */ STATIC struct inode * xfs_fs_alloc_inode( struct super_block *sb) { BUG(); return NULL; } /* * Now that the generic code is guaranteed not to be accessing * the linux inode, we can inactivate and reclaim the inode. */ STATIC void xfs_fs_destroy_inode( struct inode *inode) { struct xfs_inode *ip = XFS_I(inode); trace_xfs_destroy_inode(ip); ASSERT(!rwsem_is_locked(&inode->i_rwsem)); XFS_STATS_INC(ip->i_mount, xs_inode_destroy); XFS_STATS_INC(ip->i_mount, xs_inode_destroy2); xfs_inode_mark_reclaimable(ip); } /* * Slab object creation initialisation for the XFS inode. * This covers only the idempotent fields in the XFS inode; * all other fields need to be initialised on allocation * from the slab. This avoids the need to repeatedly initialise * fields in the xfs inode that left in the initialise state * when freeing the inode. */ STATIC void xfs_fs_inode_init_once( void *inode) { struct xfs_inode *ip = inode; memset(ip, 0, sizeof(struct xfs_inode)); /* vfs inode */ inode_init_once(VFS_I(ip)); /* xfs inode */ atomic_set(&ip->i_pincount, 0); spin_lock_init(&ip->i_flags_lock); init_rwsem(&ip->i_lock); } /* * We do an unlocked check for XFS_IDONTCACHE here because we are already * serialised against cache hits here via the inode->i_lock and igrab() in * xfs_iget_cache_hit(). Hence a lookup that might clear this flag will not be * racing with us, and it avoids needing to grab a spinlock here for every inode * we drop the final reference on. */ STATIC int xfs_fs_drop_inode( struct inode *inode) { struct xfs_inode *ip = XFS_I(inode); /* * If this unlinked inode is in the middle of recovery, don't * drop the inode just yet; log recovery will take care of * that. 
See the comment for this inode flag. */ if (ip->i_flags & XFS_IRECOVERY) { ASSERT(xlog_recovery_needed(ip->i_mount->m_log)); return 0; } return inode_generic_drop(inode); } STATIC void xfs_fs_evict_inode( struct inode *inode) { if (IS_DAX(inode)) dax_break_layout_final(inode); truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (IS_ENABLED(CONFIG_XFS_RT) && S_ISREG(inode->i_mode) && inode->i_private) { xfs_open_zone_put(inode->i_private); inode->i_private = NULL; } } static void xfs_mount_free( struct xfs_mount *mp) { if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) xfs_free_buftarg(mp->m_logdev_targp); if (mp->m_rtdev_targp && mp->m_rtdev_targp != mp->m_ddev_targp) xfs_free_buftarg(mp->m_rtdev_targp); if (mp->m_ddev_targp) xfs_free_buftarg(mp->m_ddev_targp); debugfs_remove(mp->m_debugfs); kfree(mp->m_rtname); kfree(mp->m_logname); #ifdef DEBUG kfree(mp->m_errortag); #endif kfree(mp); } STATIC int xfs_fs_sync_fs( struct super_block *sb, int wait) { struct xfs_mount *mp = XFS_M(sb); int error; trace_xfs_fs_sync_fs(mp, __return_address); /* * Doing anything during the async pass would be counterproductive. */ if (!wait) return 0; error = xfs_log_force(mp, XFS_LOG_SYNC); if (error) return error; /* * If we are called with page faults frozen out, it means we are about * to freeze the transaction subsystem. Take the opportunity to shut * down inodegc because once SB_FREEZE_FS is set it's too late to * prevent inactivation races with freeze. The fs doesn't get called * again by the freezing process until after SB_FREEZE_FS has been set, * so it's now or never. Same logic applies to speculative allocation * garbage collection. * * We don't care if this is a normal syncfs call that does this or * freeze that does this - we can run this multiple times without issue * and we won't race with a restart because a restart can only occur * when the state is either SB_FREEZE_FS or SB_FREEZE_COMPLETE. */ if (sb->s_writers.frozen == SB_FREEZE_PAGEFAULT) { xfs_inodegc_stop(mp); xfs_blockgc_stop(mp); xfs_zone_gc_stop(mp); } return 0; } static xfs_extlen_t xfs_internal_log_size( struct xfs_mount *mp) { if (!mp->m_sb.sb_logstart) return 0; return mp->m_sb.sb_logblocks; } static void xfs_statfs_data( struct xfs_mount *mp, struct kstatfs *st) { int64_t fdblocks = xfs_sum_freecounter(mp, XC_FREE_BLOCKS); /* make sure st->f_bfree does not underflow */ st->f_bfree = max(0LL, fdblocks - xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS)); /* * sb_dblocks can change during growfs, but nothing cares about reporting * the old or new value during growfs. */ st->f_blocks = mp->m_sb.sb_dblocks - xfs_internal_log_size(mp); } /* * When stat(v)fs is called on a file with the realtime bit set or a directory * with the rtinherit bit, report freespace information for the RT device * instead of the main data device. 
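 *
 * For example (illustrative only): statvfs() on a directory that carries
 * FS_XFLAG_RTINHERIT reports f_blocks/f_bfree for the realtime device,
 * while the same call on a plain data directory reports the data device.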
*/ static void xfs_statfs_rt( struct xfs_mount *mp, struct kstatfs *st) { st->f_bfree = xfs_rtbxlen_to_blen(mp, xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS)); st->f_blocks = mp->m_sb.sb_rblocks - xfs_rtbxlen_to_blen(mp, mp->m_free[XC_FREE_RTEXTENTS].res_total); } static void xfs_statfs_inodes( struct xfs_mount *mp, struct kstatfs *st) { uint64_t icount = percpu_counter_sum(&mp->m_icount); uint64_t ifree = percpu_counter_sum(&mp->m_ifree); uint64_t fakeinos = XFS_FSB_TO_INO(mp, st->f_bfree); st->f_files = min(icount + fakeinos, (uint64_t)XFS_MAXINUMBER); if (M_IGEO(mp)->maxicount) st->f_files = min_t(typeof(st->f_files), st->f_files, M_IGEO(mp)->maxicount); /* If sb_icount overshot maxicount, report actual allocation */ st->f_files = max_t(typeof(st->f_files), st->f_files, mp->m_sb.sb_icount); /* Make sure st->f_ffree does not underflow */ st->f_ffree = max_t(int64_t, 0, st->f_files - (icount - ifree)); } STATIC int xfs_fs_statfs( struct dentry *dentry, struct kstatfs *st) { struct xfs_mount *mp = XFS_M(dentry->d_sb); struct xfs_inode *ip = XFS_I(d_inode(dentry)); /* * Expedite background inodegc but don't wait. We do not want to block * here waiting hours for a billion extent file to be truncated. */ xfs_inodegc_push(mp); st->f_type = XFS_SUPER_MAGIC; st->f_namelen = MAXNAMELEN - 1; st->f_bsize = mp->m_sb.sb_blocksize; st->f_fsid = u64_to_fsid(huge_encode_dev(mp->m_ddev_targp->bt_dev)); xfs_statfs_data(mp, st); xfs_statfs_inodes(mp, st); if (XFS_IS_REALTIME_MOUNT(mp) && (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) xfs_statfs_rt(mp, st); if ((ip->i_diflags & XFS_DIFLAG_PROJINHERIT) && ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) == (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD)) xfs_qm_statvfs(ip, st); /* * XFS does not distinguish between blocks available to privileged and * unprivileged users. */ st->f_bavail = st->f_bfree; return 0; } STATIC void xfs_save_resvblks( struct xfs_mount *mp) { enum xfs_free_counter i; for (i = 0; i < XC_FREE_NR; i++) { mp->m_free[i].res_saved = mp->m_free[i].res_total; xfs_reserve_blocks(mp, i, 0); } } STATIC void xfs_restore_resvblks( struct xfs_mount *mp) { uint64_t resblks; enum xfs_free_counter i; for (i = 0; i < XC_FREE_NR; i++) { if (mp->m_free[i].res_saved) { resblks = mp->m_free[i].res_saved; mp->m_free[i].res_saved = 0; } else resblks = xfs_default_resblks(mp, i); xfs_reserve_blocks(mp, i, resblks); } } /* * Second stage of a freeze. The data is already frozen so we only * need to take care of the metadata. Once that's done sync the superblock * to the log to dirty it in case of a crash while frozen. This ensures that we * will recover the unlinked inode lists on the next mount. */ STATIC int xfs_fs_freeze( struct super_block *sb) { struct xfs_mount *mp = XFS_M(sb); unsigned int flags; int ret; /* * The filesystem is now frozen far enough that memory reclaim * cannot safely operate on the filesystem. Hence we need to * set a GFP_NOFS context here to avoid recursion deadlocks. */ flags = memalloc_nofs_save(); xfs_save_resvblks(mp); ret = xfs_log_quiesce(mp); memalloc_nofs_restore(flags); /* * For read-write filesystems, we need to restart the inodegc on error * because we stopped it at SB_FREEZE_PAGEFAULT level and a thaw is not * going to be run to restart it now. We are at SB_FREEZE_FS level * here, so we can restart safely without racing with a stop in * xfs_fs_sync_fs(). 
*/ if (ret && !xfs_is_readonly(mp)) { xfs_blockgc_start(mp); xfs_inodegc_start(mp); xfs_zone_gc_start(mp); } return ret; } STATIC int xfs_fs_unfreeze( struct super_block *sb) { struct xfs_mount *mp = XFS_M(sb); xfs_restore_resvblks(mp); xfs_log_work_queue(mp); /* * Don't reactivate the inodegc worker on a readonly filesystem because * inodes are sent directly to reclaim. Don't reactivate the blockgc * worker because there are no speculative preallocations on a readonly * filesystem. */ if (!xfs_is_readonly(mp)) { xfs_zone_gc_start(mp); xfs_blockgc_start(mp); xfs_inodegc_start(mp); } return 0; } /* * This function fills in xfs_mount_t fields based on mount args. * Note: the superblock _has_ now been read in. */ STATIC int xfs_finish_flags( struct xfs_mount *mp) { /* Fail a mount where the logbuf is smaller than the log stripe */ if (xfs_has_logv2(mp)) { if (mp->m_logbsize <= 0 && mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) { mp->m_logbsize = mp->m_sb.sb_logsunit; } else if (mp->m_logbsize > 0 && mp->m_logbsize < mp->m_sb.sb_logsunit) { xfs_warn(mp, "logbuf size must be greater than or equal to log stripe size"); return -EINVAL; } } else { /* Fail a mount if the logbuf is larger than 32K */ if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) { xfs_warn(mp, "logbuf size for version 1 logs must be 16K or 32K"); return -EINVAL; } } /* * prohibit r/w mounts of read-only filesystems */ if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !xfs_is_readonly(mp)) { xfs_warn(mp, "cannot mount a read-only filesystem as read-write"); return -EROFS; } if ((mp->m_qflags & XFS_GQUOTA_ACCT) && (mp->m_qflags & XFS_PQUOTA_ACCT) && !xfs_has_pquotino(mp)) { xfs_warn(mp, "Super block does not support project and group quota together"); return -EINVAL; } if (!xfs_has_zoned(mp)) { if (mp->m_max_open_zones) { xfs_warn(mp, "max_open_zones mount option only supported on zoned file systems."); return -EINVAL; } if (mp->m_features & XFS_FEAT_NOLIFETIME) { xfs_warn(mp, "nolifetime mount option only supported on zoned file systems."); return -EINVAL; } } return 0; } static int xfs_init_percpu_counters( struct xfs_mount *mp) { int error; int i; error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL); if (error) return -ENOMEM; error = percpu_counter_init(&mp->m_ifree, 0, GFP_KERNEL); if (error) goto free_icount; error = percpu_counter_init(&mp->m_delalloc_blks, 0, GFP_KERNEL); if (error) goto free_ifree; error = percpu_counter_init(&mp->m_delalloc_rtextents, 0, GFP_KERNEL); if (error) goto free_delalloc; for (i = 0; i < XC_FREE_NR; i++) { error = percpu_counter_init(&mp->m_free[i].count, 0, GFP_KERNEL); if (error) goto free_freecounters; } return 0; free_freecounters: while (--i >= 0) percpu_counter_destroy(&mp->m_free[i].count); percpu_counter_destroy(&mp->m_delalloc_rtextents); free_delalloc: percpu_counter_destroy(&mp->m_delalloc_blks); free_ifree: percpu_counter_destroy(&mp->m_ifree); free_icount: percpu_counter_destroy(&mp->m_icount); return -ENOMEM; } void xfs_reinit_percpu_counters( struct xfs_mount *mp) { percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount); percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree); xfs_set_freecounter(mp, XC_FREE_BLOCKS, mp->m_sb.sb_fdblocks); if (!xfs_has_zoned(mp)) xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_sb.sb_frextents); } static void xfs_destroy_percpu_counters( struct xfs_mount *mp) { enum xfs_free_counter i; for (i = 0; i < XC_FREE_NR; i++) percpu_counter_destroy(&mp->m_free[i].count); percpu_counter_destroy(&mp->m_icount); percpu_counter_destroy(&mp->m_ifree); 
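/* By the time we get here, all delayed allocation reservations should have been converted or released, so a nonzero delalloc counter indicates leaked reservations; that is only expected if the filesystem was shut down. */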
ASSERT(xfs_is_shutdown(mp) || percpu_counter_sum(&mp->m_delalloc_rtextents) == 0); percpu_counter_destroy(&mp->m_delalloc_rtextents); ASSERT(xfs_is_shutdown(mp) || percpu_counter_sum(&mp->m_delalloc_blks) == 0); percpu_counter_destroy(&mp->m_delalloc_blks); } static int xfs_inodegc_init_percpu( struct xfs_mount *mp) { struct xfs_inodegc *gc; int cpu; mp->m_inodegc = alloc_percpu(struct xfs_inodegc); if (!mp->m_inodegc) return -ENOMEM; for_each_possible_cpu(cpu) { gc = per_cpu_ptr(mp->m_inodegc, cpu); gc->cpu = cpu; gc->mp = mp; init_llist_head(&gc->list); gc->items = 0; gc->error = 0; INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker); } return 0; } static void xfs_inodegc_free_percpu( struct xfs_mount *mp) { if (!mp->m_inodegc) return; free_percpu(mp->m_inodegc); } static void xfs_fs_put_super( struct super_block *sb) { struct xfs_mount *mp = XFS_M(sb); xfs_notice(mp, "Unmounting Filesystem %pU", &mp->m_sb.sb_uuid); xfs_filestream_unmount(mp); xfs_unmountfs(mp); xfs_rtmount_freesb(mp); xfs_freesb(mp); xchk_mount_stats_free(mp); free_percpu(mp->m_stats.xs_stats); xfs_inodegc_free_percpu(mp); xfs_destroy_percpu_counters(mp); xfs_destroy_mount_workqueues(mp); xfs_shutdown_devices(mp); } static long xfs_fs_nr_cached_objects( struct super_block *sb, struct shrink_control *sc) { /* Paranoia: catch incorrect calls during mount setup or teardown */ if (WARN_ON_ONCE(!sb->s_fs_info)) return 0; return xfs_reclaim_inodes_count(XFS_M(sb)); } static long xfs_fs_free_cached_objects( struct super_block *sb, struct shrink_control *sc) { return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan); } static void xfs_fs_shutdown( struct super_block *sb) { xfs_force_shutdown(XFS_M(sb), SHUTDOWN_DEVICE_REMOVED); } static int xfs_fs_show_stats( struct seq_file *m, struct dentry *root) { struct xfs_mount *mp = XFS_M(root->d_sb); if (xfs_has_zoned(mp) && IS_ENABLED(CONFIG_XFS_RT)) xfs_zoned_show_stats(m, mp); return 0; } static void xfs_fs_report_error( const struct fserror_event *event) { /* healthmon already knows about non-inode and metadata errors */ if (event->inode && event->type != FSERR_METADATA) xfs_healthmon_report_file_ioerror(XFS_I(event->inode), event); } static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, .drop_inode = xfs_fs_drop_inode, .evict_inode = xfs_fs_evict_inode, .put_super = xfs_fs_put_super, .sync_fs = xfs_fs_sync_fs, .freeze_fs = xfs_fs_freeze, .unfreeze_fs = xfs_fs_unfreeze, .statfs = xfs_fs_statfs, .show_options = xfs_fs_show_options, .nr_cached_objects = xfs_fs_nr_cached_objects, .free_cached_objects = xfs_fs_free_cached_objects, .shutdown = xfs_fs_shutdown, .show_stats = xfs_fs_show_stats, .report_error = xfs_fs_report_error, }; static int suffix_kstrtoint( const char *s, unsigned int base, int *res) { int last, shift_left_factor = 0, _res; char *value; int ret = 0; value = kstrdup(s, GFP_KERNEL); if (!value) return -ENOMEM; last = strlen(value) - 1; if (value[last] == 'K' || value[last] == 'k') { shift_left_factor = 10; value[last] = '\0'; } if (value[last] == 'M' || value[last] == 'm') { shift_left_factor = 20; value[last] = '\0'; } if (value[last] == 'G' || value[last] == 'g') { shift_left_factor = 30; value[last] = '\0'; } if (kstrtoint(value, base, &_res)) ret = -EINVAL; kfree(value); *res = _res << shift_left_factor; return ret; } static int suffix_kstrtoull( const char *s, unsigned int base, unsigned long long *res) { int last, shift_left_factor = 0; unsigned long long _res; char *value; int ret = 0; 
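/* Same scheme as suffix_kstrtoint() above: duplicate the string, strip an optional K/M/G suffix, parse the remaining digits, then scale by the suffix; e.g. "64k" becomes 64 << 10 = 65536. */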
value = kstrdup(s, GFP_KERNEL); if (!value) return -ENOMEM; last = strlen(value) - 1; if (value[last] == 'K' || value[last] == 'k') { shift_left_factor = 10; value[last] = '\0'; } if (value[last] == 'M' || value[last] == 'm') { shift_left_factor = 20; value[last] = '\0'; } if (value[last] == 'G' || value[last] == 'g') { shift_left_factor = 30; value[last] = '\0'; } if (kstrtoull(value, base, &_res)) ret = -EINVAL; kfree(value); *res = _res << shift_left_factor; return ret; } static inline void xfs_fs_warn_deprecated( struct fs_context *fc, struct fs_parameter *param) { /* * Always warn about someone passing in a deprecated mount option. * Previously we wouldn't print the warning if we were reconfiguring * and current mount point already had the flag set, but that was not * the right thing to do. * * Many distributions mount the root filesystem with no options in the * initramfs and rely on mount -a to remount the root fs with the * options in fstab. However, the old behavior meant that there would * never be a warning about deprecated mount options for the root fs in * /etc/fstab. On a single-fs system, that means no warning at all. * * Compounding this problem are distribution scripts that copy * /proc/mounts to fstab, which means that we can't remove mount * options unless we're 100% sure they have only ever been advertised * in /proc/mounts in response to explicitly provided mount options. */ xfs_warn(fc->s_fs_info, "%s mount option is deprecated.", param->key); } /* * Set mount state from a mount option. * * NOTE: mp->m_super is NULL here! */ static int xfs_fs_parse_param( struct fs_context *fc, struct fs_parameter *param) { struct xfs_mount *parsing_mp = fc->s_fs_info; struct fs_parse_result result; int size = 0; int opt; BUILD_BUG_ON(XFS_QFLAGS_MNTOPTS & XFS_MOUNT_QUOTA_ALL); opt = fs_parse(fc, xfs_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Op_deprecated: xfs_fs_warn_deprecated(fc, param); return 0; case Opt_logbufs: parsing_mp->m_logbufs = result.uint_32; return 0; case Opt_logbsize: if (suffix_kstrtoint(param->string, 10, &parsing_mp->m_logbsize)) return -EINVAL; return 0; case Opt_logdev: kfree(parsing_mp->m_logname); parsing_mp->m_logname = kstrdup(param->string, GFP_KERNEL); if (!parsing_mp->m_logname) return -ENOMEM; return 0; case Opt_rtdev: kfree(parsing_mp->m_rtname); parsing_mp->m_rtname = kstrdup(param->string, GFP_KERNEL); if (!parsing_mp->m_rtname) return -ENOMEM; return 0; case Opt_allocsize: if (suffix_kstrtoint(param->string, 10, &size)) return -EINVAL; parsing_mp->m_allocsize_log = ffs(size) - 1; parsing_mp->m_features |= XFS_FEAT_ALLOCSIZE; return 0; case Opt_grpid: case Opt_bsdgroups: parsing_mp->m_features |= XFS_FEAT_GRPID; return 0; case Opt_nogrpid: case Opt_sysvgroups: parsing_mp->m_features &= ~XFS_FEAT_GRPID; return 0; case Opt_wsync: parsing_mp->m_features |= XFS_FEAT_WSYNC; return 0; case Opt_norecovery: parsing_mp->m_features |= XFS_FEAT_NORECOVERY; return 0; case Opt_noalign: parsing_mp->m_features |= XFS_FEAT_NOALIGN; return 0; case Opt_swalloc: parsing_mp->m_features |= XFS_FEAT_SWALLOC; return 0; case Opt_sunit: parsing_mp->m_dalign = result.uint_32; return 0; case Opt_swidth: parsing_mp->m_swidth = result.uint_32; return 0; case Opt_inode32: parsing_mp->m_features |= XFS_FEAT_SMALL_INUMS; return 0; case Opt_inode64: parsing_mp->m_features &= ~XFS_FEAT_SMALL_INUMS; return 0; case Opt_nouuid: parsing_mp->m_features |= XFS_FEAT_NOUUID; return 0; case Opt_largeio: parsing_mp->m_features |= XFS_FEAT_LARGE_IOSIZE; return 
0; case Opt_nolargeio: parsing_mp->m_features &= ~XFS_FEAT_LARGE_IOSIZE; return 0; case Opt_filestreams: parsing_mp->m_features |= XFS_FEAT_FILESTREAMS; return 0; case Opt_noquota: parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT; parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD; parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; return 0; case Opt_quota: case Opt_uquota: case Opt_usrquota: parsing_mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ENFD); parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; return 0; case Opt_qnoenforce: case Opt_uqnoenforce: parsing_mp->m_qflags |= XFS_UQUOTA_ACCT; parsing_mp->m_qflags &= ~XFS_UQUOTA_ENFD; parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; return 0; case Opt_pquota: case Opt_prjquota: parsing_mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ENFD); parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; return 0; case Opt_pqnoenforce: parsing_mp->m_qflags |= XFS_PQUOTA_ACCT; parsing_mp->m_qflags &= ~XFS_PQUOTA_ENFD; parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; return 0; case Opt_gquota: case Opt_grpquota: parsing_mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ENFD); parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; return 0; case Opt_gqnoenforce: parsing_mp->m_qflags |= XFS_GQUOTA_ACCT; parsing_mp->m_qflags &= ~XFS_GQUOTA_ENFD; parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; return 0; case Opt_discard: parsing_mp->m_features |= XFS_FEAT_DISCARD; return 0; case Opt_nodiscard: parsing_mp->m_features &= ~XFS_FEAT_DISCARD; return 0; #ifdef CONFIG_FS_DAX case Opt_dax: xfs_mount_set_dax_mode(parsing_mp, XFS_DAX_ALWAYS); return 0; case Opt_dax_enum: xfs_mount_set_dax_mode(parsing_mp, result.uint_32); return 0; #endif case Opt_max_open_zones: parsing_mp->m_max_open_zones = result.uint_32; return 0; case Opt_lifetime: parsing_mp->m_features &= ~XFS_FEAT_NOLIFETIME; return 0; case Opt_nolifetime: parsing_mp->m_features |= XFS_FEAT_NOLIFETIME; return 0; case Opt_max_atomic_write: if (suffix_kstrtoull(param->string, 10, &parsing_mp->m_awu_max_bytes)) { xfs_warn(parsing_mp, "max atomic write size must be positive integer"); return -EINVAL; } return 0; case Opt_errortag: return xfs_errortag_add_name(parsing_mp, param->string); default: xfs_warn(parsing_mp, "unknown mount option [%s].", param->key); return -EINVAL; } return 0; } static int xfs_fs_validate_params( struct xfs_mount *mp) { /* No recovery flag requires a read-only mount */ if (xfs_has_norecovery(mp) && !xfs_is_readonly(mp)) { xfs_warn(mp, "no-recovery mounts must be read-only."); return -EINVAL; } if (xfs_has_noalign(mp) && (mp->m_dalign || mp->m_swidth)) { xfs_warn(mp, "sunit and swidth options incompatible with the noalign option"); return -EINVAL; } if (!IS_ENABLED(CONFIG_XFS_QUOTA) && (mp->m_qflags & ~XFS_QFLAGS_MNTOPTS)) { xfs_warn(mp, "quota support not available in this kernel."); return -EINVAL; } if ((mp->m_dalign && !mp->m_swidth) || (!mp->m_dalign && mp->m_swidth)) { xfs_warn(mp, "sunit and swidth must be specified together"); return -EINVAL; } if (mp->m_dalign && (mp->m_swidth % mp->m_dalign != 0)) { xfs_warn(mp, "stripe width (%d) must be a multiple of the stripe unit (%d)", mp->m_swidth, mp->m_dalign); return -EINVAL; } if (mp->m_logbufs != -1 && mp->m_logbufs != 0 && (mp->m_logbufs < XLOG_MIN_ICLOGS || mp->m_logbufs > XLOG_MAX_ICLOGS)) { xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]", mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); return -EINVAL; } if (mp->m_logbsize != -1 && mp->m_logbsize != 0 && (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE || mp->m_logbsize > XLOG_MAX_RECORD_BSIZE || !is_power_of_2(mp->m_logbsize))) { xfs_warn(mp, 
"invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", mp->m_logbsize); return -EINVAL; } if (xfs_has_allocsize(mp) && (mp->m_allocsize_log > XFS_MAX_IO_LOG || mp->m_allocsize_log < XFS_MIN_IO_LOG)) { xfs_warn(mp, "invalid log iosize: %d [not %d-%d]", mp->m_allocsize_log, XFS_MIN_IO_LOG, XFS_MAX_IO_LOG); return -EINVAL; } return 0; } struct dentry * xfs_debugfs_mkdir( const char *name, struct dentry *parent) { struct dentry *child; /* Apparently we're expected to ignore error returns?? */ child = debugfs_create_dir(name, parent); if (IS_ERR(child)) return NULL; return child; } static int xfs_fs_fill_super( struct super_block *sb, struct fs_context *fc) { struct xfs_mount *mp = sb->s_fs_info; struct inode *root; int flags = 0, error; mp->m_super = sb; /* * Copy VFS mount flags from the context now that all parameter parsing * is guaranteed to have been completed by either the old mount API or * the newer fsopen/fsconfig API. */ if (fc->sb_flags & SB_RDONLY) xfs_set_readonly(mp); if (fc->sb_flags & SB_DIRSYNC) mp->m_features |= XFS_FEAT_DIRSYNC; if (fc->sb_flags & SB_SYNCHRONOUS) mp->m_features |= XFS_FEAT_WSYNC; error = xfs_fs_validate_params(mp); if (error) return error; if (!sb_min_blocksize(sb, BBSIZE)) { xfs_err(mp, "unable to set blocksize"); return -EINVAL; } sb->s_xattr = xfs_xattr_handlers; sb->s_export_op = &xfs_export_operations; #ifdef CONFIG_XFS_QUOTA sb->s_qcop = &xfs_quotactl_operations; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif sb->s_op = &xfs_super_operations; /* * Delay mount work if the debug hook is set. This is debug * instrumention to coordinate simulation of xfs mount failures with * VFS superblock operations */ if (xfs_globals.mount_delay) { xfs_notice(mp, "Delaying mount for %d seconds.", xfs_globals.mount_delay); msleep(xfs_globals.mount_delay * 1000); } if (fc->sb_flags & SB_SILENT) flags |= XFS_MFSI_QUIET; error = xfs_open_devices(mp); if (error) return error; if (xfs_debugfs) { mp->m_debugfs = xfs_debugfs_mkdir(mp->m_super->s_id, xfs_debugfs); } else { mp->m_debugfs = NULL; } error = xfs_init_mount_workqueues(mp); if (error) goto out_shutdown_devices; error = xfs_init_percpu_counters(mp); if (error) goto out_destroy_workqueues; error = xfs_inodegc_init_percpu(mp); if (error) goto out_destroy_counters; /* Allocate stats memory before we do operations that might use it */ mp->m_stats.xs_stats = alloc_percpu(struct xfsstats); if (!mp->m_stats.xs_stats) { error = -ENOMEM; goto out_destroy_inodegc; } error = xchk_mount_stats_alloc(mp); if (error) goto out_free_stats; error = xfs_readsb(mp, flags); if (error) goto out_free_scrub_stats; error = xfs_finish_flags(mp); if (error) goto out_free_sb; error = xfs_setup_devices(mp); if (error) goto out_free_sb; /* * V4 support is undergoing deprecation. * * Note: this has to use an open coded m_features check as xfs_has_crc * always returns false for !CONFIG_XFS_SUPPORT_V4. */ if (!(mp->m_features & XFS_FEAT_CRC)) { if (!IS_ENABLED(CONFIG_XFS_SUPPORT_V4)) { xfs_warn(mp, "Deprecated V4 format (crc=0) not supported by kernel."); error = -EINVAL; goto out_free_sb; } xfs_warn_once(mp, "Deprecated V4 format (crc=0) will not be supported after September 2030."); } /* ASCII case insensitivity is undergoing deprecation. 
*/ if (xfs_has_asciici(mp)) { #ifdef CONFIG_XFS_SUPPORT_ASCII_CI xfs_warn_once(mp, "Deprecated ASCII case-insensitivity feature (ascii-ci=1) will not be supported after September 2030."); #else xfs_warn(mp, "Deprecated ASCII case-insensitivity feature (ascii-ci=1) not supported by kernel."); error = -EINVAL; goto out_free_sb; #endif } /* * Filesystem claims it needs repair, so refuse the mount unless * norecovery is also specified, in which case the filesystem can * be mounted with no risk of further damage. */ if (xfs_has_needsrepair(mp) && !xfs_has_norecovery(mp)) { xfs_warn(mp, "Filesystem needs repair. Please run xfs_repair."); error = -EFSCORRUPTED; goto out_free_sb; } /* * Don't touch the filesystem if a user tool thinks it owns the primary * superblock. mkfs doesn't clear the flag from secondary supers, so * we don't check them at all. */ if (mp->m_sb.sb_inprogress) { xfs_warn(mp, "Offline file system operation in progress!"); error = -EFSCORRUPTED; goto out_free_sb; } if (mp->m_sb.sb_blocksize > PAGE_SIZE) { size_t max_folio_size = mapping_max_folio_size_supported(); if (!xfs_has_crc(mp)) { xfs_warn(mp, "V4 Filesystem with blocksize %d bytes. Only pagesize (%ld) or less is supported.", mp->m_sb.sb_blocksize, PAGE_SIZE); error = -ENOSYS; goto out_free_sb; } if (mp->m_sb.sb_blocksize > max_folio_size) { xfs_warn(mp, "block size (%u bytes) not supported; Only block size (%zu) or less is supported", mp->m_sb.sb_blocksize, max_folio_size); error = -ENOSYS; goto out_free_sb; } } /* Ensure this filesystem fits in the page cache limits */ if (xfs_sb_validate_fsb_count(&mp->m_sb, mp->m_sb.sb_dblocks) || xfs_sb_validate_fsb_count(&mp->m_sb, mp->m_sb.sb_rblocks)) { xfs_warn(mp, "file system too large to be mounted on this system."); error = -EFBIG; goto out_free_sb; } /* * XFS block mappings use 54 bits to store the logical block offset. * This should suffice to handle the maximum file size that the VFS * supports (currently 2^63 bytes on 64-bit and ULONG_MAX << PAGE_SHIFT * bytes on 32-bit), but as XFS and VFS have gotten the s_maxbytes * calculation wrong on 32-bit kernels in the past, we'll add a WARN_ON * to check this assertion. * * Avoid integer overflow by comparing the maximum bmbt offset to the * maximum pagecache offset in units of fs blocks. */ if (!xfs_verify_fileoff(mp, XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE))) { xfs_warn(mp, "MAX_LFS_FILESIZE block offset (%llu) exceeds extent map maximum (%llu)!", XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE), XFS_MAX_FILEOFF); error = -EINVAL; goto out_free_sb; } error = xfs_rtmount_readsb(mp); if (error) goto out_free_sb; error = xfs_filestream_mount(mp); if (error) goto out_free_rtsb; /* * we must configure the block size in the superblock before we run the * full mount process as the mount process can lookup and cache inodes. */ sb->s_magic = XFS_SUPER_MAGIC; sb->s_blocksize = mp->m_sb.sb_blocksize; sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_max_links = XFS_MAXLINK; sb->s_time_gran = 1; if (xfs_has_bigtime(mp)) { sb->s_time_min = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MIN); sb->s_time_max = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MAX); } else { sb->s_time_min = XFS_LEGACY_TIME_MIN; sb->s_time_max = XFS_LEGACY_TIME_MAX; } trace_xfs_inode_timestamp_range(mp, sb->s_time_min, sb->s_time_max); sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM; set_posix_acl_flag(sb); /* version 5 superblocks support inode version counters. 
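 * Setting SB_I_VERSION tells the VFS to maintain i_version, the change counter that NFSD and similar consumers use to detect modifications; only v5 (CRC-enabled) superblocks log that counter persistently, hence the feature check below.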
*/ if (xfs_has_crc(mp)) sb->s_flags |= SB_I_VERSION; if (xfs_has_dax_always(mp)) { error = xfs_setup_dax_always(mp); if (error) goto out_filestream_unmount; } if (xfs_has_discard(mp) && !bdev_max_discard_sectors(sb->s_bdev)) { xfs_warn(mp, "mounting with \"discard\" option, but the device does not support discard"); mp->m_features &= ~XFS_FEAT_DISCARD; } if (xfs_has_zoned(mp)) { if (!xfs_has_metadir(mp)) { xfs_alert(mp, "metadir feature required for zoned realtime devices."); error = -EINVAL; goto out_filestream_unmount; } xfs_warn_experimental(mp, XFS_EXPERIMENTAL_ZONED); } if (xfs_has_reflink(mp)) { if (xfs_has_realtime(mp) && !xfs_reflink_supports_rextsize(mp, mp->m_sb.sb_rextsize)) { xfs_alert(mp, "reflink not compatible with realtime extent size %u!", mp->m_sb.sb_rextsize); error = -EINVAL; goto out_filestream_unmount; } if (xfs_has_zoned(mp)) { xfs_alert(mp, "reflink not compatible with zoned RT device!"); error = -EINVAL; goto out_filestream_unmount; } if (xfs_globals.always_cow) { xfs_info(mp, "using DEBUG-only always_cow mode."); mp->m_always_cow = true; } } /* * If no quota mount options were provided, maybe we'll try to pick * up the quota accounting and enforcement flags from the ondisk sb. */ if (!(mp->m_qflags & XFS_QFLAGS_MNTOPTS)) xfs_set_resuming_quotaon(mp); mp->m_qflags &= ~XFS_QFLAGS_MNTOPTS; error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; root = igrab(VFS_I(mp->m_rootip)); if (!root) { error = -ENOENT; goto out_unmount; } sb->s_root = d_make_root(root); if (!sb->s_root) { error = -ENOMEM; goto out_unmount; } return 0; out_filestream_unmount: xfs_filestream_unmount(mp); out_free_rtsb: xfs_rtmount_freesb(mp); out_free_sb: xfs_freesb(mp); out_free_scrub_stats: xchk_mount_stats_free(mp); out_free_stats: free_percpu(mp->m_stats.xs_stats); out_destroy_inodegc: xfs_inodegc_free_percpu(mp); out_destroy_counters: xfs_destroy_percpu_counters(mp); out_destroy_workqueues: xfs_destroy_mount_workqueues(mp); out_shutdown_devices: xfs_shutdown_devices(mp); return error; out_unmount: xfs_filestream_unmount(mp); xfs_unmountfs(mp); goto out_free_rtsb; } static int xfs_fs_get_tree( struct fs_context *fc) { return get_tree_bdev(fc, xfs_fs_fill_super); } static int xfs_remount_rw( struct xfs_mount *mp) { struct xfs_sb *sbp = &mp->m_sb; int error; if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp && xfs_readonly_buftarg(mp->m_logdev_targp)) { xfs_warn(mp, "ro->rw transition prohibited by read-only logdev"); return -EACCES; } if (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp)) { xfs_warn(mp, "ro->rw transition prohibited by read-only rtdev"); return -EACCES; } if (xfs_has_norecovery(mp)) { xfs_warn(mp, "ro->rw transition prohibited on norecovery mount"); return -EINVAL; } if (xfs_sb_is_v5(sbp) && xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { xfs_warn(mp, "ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem", (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); return -EINVAL; } xfs_clear_readonly(mp); /* * If this is the first remount to writeable state we might have some * superblock changes to update. */ if (mp->m_update_sb) { error = xfs_sync_sb(mp, false); if (error) { xfs_warn(mp, "failed to write sb changes"); return error; } mp->m_update_sb = false; } /* * Fill out the reserve pool if it is empty. Use the stashed value if * it is non-zero, otherwise go with the default. 
*/ xfs_restore_resvblks(mp); xfs_log_work_queue(mp); xfs_blockgc_start(mp); /* Create the per-AG metadata reservation pool .*/ error = xfs_fs_reserve_ag_blocks(mp); if (error && error != -ENOSPC) return error; /* Re-enable the background inode inactivation worker. */ xfs_inodegc_start(mp); /* Restart zone reclaim */ xfs_zone_gc_start(mp); return 0; } static int xfs_remount_ro( struct xfs_mount *mp) { struct xfs_icwalk icw = { .icw_flags = XFS_ICWALK_FLAG_SYNC, }; int error; /* Flush all the dirty data to disk. */ error = sync_filesystem(mp->m_super); if (error) return error; /* * Cancel background eofb scanning so it cannot race with the final * log force+buftarg wait and deadlock the remount. */ xfs_blockgc_stop(mp); /* * Clear out all remaining COW staging extents and speculative post-EOF * preallocations so that we don't leave inodes requiring inactivation * cleanups during reclaim on a read-only mount. We must process every * cached inode, so this requires a synchronous cache scan. */ error = xfs_blockgc_free_space(mp, &icw); if (error) { xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return error; } /* * Stop the inodegc background worker. xfs_fs_reconfigure already * flushed all pending inodegc work when it sync'd the filesystem. * The VFS holds s_umount, so we know that inodes cannot enter * xfs_fs_destroy_inode during a remount operation. In readonly mode * we send inodes straight to reclaim, so no inodes will be queued. */ xfs_inodegc_stop(mp); /* Stop zone reclaim */ xfs_zone_gc_stop(mp); /* Free the per-AG metadata reservation pool. */ xfs_fs_unreserve_ag_blocks(mp); /* * Before we sync the metadata, we need to free up the reserve block * pool so that the used block count in the superblock on disk is * correct at the end of the remount. Stash the current* reserve pool * size so that if we get remounted rw, we can return it to the same * size. */ xfs_save_resvblks(mp); xfs_log_clean(mp); xfs_set_readonly(mp); return 0; } /* * Logically we would return an error here to prevent users from believing * they might have changed mount options using remount which can't be changed. * * But unfortunately mount(8) adds all options from mtab and fstab to the mount * arguments in some cases so we can't blindly reject options, but have to * check for each specified option if it actually differs from the currently * set option and only reject it if that's the case. * * Until that is implemented we return success for every remount request, and * silently ignore all options that we can't actually change. */ static int xfs_fs_reconfigure( struct fs_context *fc) { struct xfs_mount *mp = XFS_M(fc->root->d_sb); struct xfs_mount *new_mp = fc->s_fs_info; int flags = fc->sb_flags; int error; new_mp->m_qflags &= ~XFS_QFLAGS_MNTOPTS; /* version 5 superblocks always support version counters. 
*/ if (xfs_has_crc(mp)) fc->sb_flags |= SB_I_VERSION; error = xfs_fs_validate_params(new_mp); if (error) return error; xfs_errortag_copy(mp, new_mp); /* Validate new max_atomic_write option before making other changes */ if (mp->m_awu_max_bytes != new_mp->m_awu_max_bytes) { error = xfs_set_max_atomic_write_opt(mp, new_mp->m_awu_max_bytes); if (error) return error; } /* inode32 -> inode64 */ if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) { mp->m_features &= ~XFS_FEAT_SMALL_INUMS; mp->m_maxagi = xfs_set_inode_alloc(mp, mp->m_sb.sb_agcount); } /* inode64 -> inode32 */ if (!xfs_has_small_inums(mp) && xfs_has_small_inums(new_mp)) { mp->m_features |= XFS_FEAT_SMALL_INUMS; mp->m_maxagi = xfs_set_inode_alloc(mp, mp->m_sb.sb_agcount); } /* * Now that mp has been modified according to the remount options, we * do a final option validation with xfs_finish_flags() just like it is * done during mount. We cannot use xfs_finish_flags() on new_mp as it * contains only the user given options. */ error = xfs_finish_flags(mp); if (error) return error; /* ro -> rw */ if (xfs_is_readonly(mp) && !(flags & SB_RDONLY)) { error = xfs_remount_rw(mp); if (error) return error; } /* rw -> ro */ if (!xfs_is_readonly(mp) && (flags & SB_RDONLY)) { error = xfs_remount_ro(mp); if (error) return error; } return 0; } static void xfs_fs_free( struct fs_context *fc) { struct xfs_mount *mp = fc->s_fs_info; /* * mp is stored in the fs_context when it is initialized. * mp is transferred to the superblock on a successful mount, * but if an error occurs before the transfer we have to free * it here. */ if (mp) xfs_mount_free(mp); } static const struct fs_context_operations xfs_context_ops = { .parse_param = xfs_fs_parse_param, .get_tree = xfs_fs_get_tree, .reconfigure = xfs_fs_reconfigure, .free = xfs_fs_free, }; /* * WARNING: do not initialise any parameters in this function that depend on * mount option parsing having already been performed as this can be called from * fsopen() before any parameters have been set. */ static int xfs_init_fs_context( struct fs_context *fc) { struct xfs_mount *mp; int i; mp = kzalloc_obj(struct xfs_mount); if (!mp) return -ENOMEM; #ifdef DEBUG mp->m_errortag = kzalloc_objs(*mp->m_errortag, XFS_ERRTAG_MAX); if (!mp->m_errortag) { kfree(mp); return -ENOMEM; } #endif spin_lock_init(&mp->m_sb_lock); for (i = 0; i < XG_TYPE_MAX; i++) xa_init(&mp->m_groups[i].xa); mutex_init(&mp->m_growlock); mutex_init(&mp->m_metafile_resv_lock); INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); mp->m_kobj.kobject.kset = xfs_kset; /* * We don't create the finobt per-ag space reservation until after log * recovery, so we must set this to true so that an ifree transaction * started during log recovery will not depend on space reservations * for finobt expansion. */ mp->m_finobt_nores = true; /* * These can be overridden by the mount option parsing.
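 * A value of -1 means the option was not given on the command line; xfs_fs_validate_params() accepts -1 and the log code then picks its own defaults at mount time.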
*/ mp->m_logbufs = -1; mp->m_logbsize = -1; mp->m_allocsize_log = 16; /* 64k */ xfs_hooks_init(&mp->m_dir_update_hooks); fc->s_fs_info = mp; fc->ops = &xfs_context_ops; return 0; } static void xfs_kill_sb( struct super_block *sb) { kill_block_super(sb); xfs_mount_free(XFS_M(sb)); } static struct file_system_type xfs_fs_type = { .owner = THIS_MODULE, .name = "xfs", .init_fs_context = xfs_init_fs_context, .parameters = xfs_fs_parameters, .kill_sb = xfs_kill_sb, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME | FS_LBS, }; MODULE_ALIAS_FS("xfs"); STATIC int __init xfs_init_caches(void) { int error; xfs_buf_cache = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0, SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT, NULL); if (!xfs_buf_cache) goto out; xfs_log_ticket_cache = kmem_cache_create("xfs_log_ticket", sizeof(struct xlog_ticket), 0, 0, NULL); if (!xfs_log_ticket_cache) goto out_destroy_buf_cache; error = xfs_btree_init_cur_caches(); if (error) goto out_destroy_log_ticket_cache; error = rcbagbt_init_cur_cache(); if (error) goto out_destroy_btree_cur_cache; error = xfs_defer_init_item_caches(); if (error) goto out_destroy_rcbagbt_cur_cache; xfs_da_state_cache = kmem_cache_create("xfs_da_state", sizeof(struct xfs_da_state), 0, 0, NULL); if (!xfs_da_state_cache) goto out_destroy_defer_item_cache; xfs_ifork_cache = kmem_cache_create("xfs_ifork", sizeof(struct xfs_ifork), 0, 0, NULL); if (!xfs_ifork_cache) goto out_destroy_da_state_cache; xfs_trans_cache = kmem_cache_create("xfs_trans", sizeof(struct xfs_trans), 0, 0, NULL); if (!xfs_trans_cache) goto out_destroy_ifork_cache; /* * The size of the cache-allocated buf log item is the maximum * size possible under XFS. This wastes a little bit of memory, * but it is much faster. */ xfs_buf_item_cache = kmem_cache_create("xfs_buf_item", sizeof(struct xfs_buf_log_item), 0, 0, NULL); if (!xfs_buf_item_cache) goto out_destroy_trans_cache; xfs_efd_cache = kmem_cache_create("xfs_efd_item", xfs_efd_log_item_sizeof(XFS_EFD_MAX_FAST_EXTENTS), 0, 0, NULL); if (!xfs_efd_cache) goto out_destroy_buf_item_cache; xfs_efi_cache = kmem_cache_create("xfs_efi_item", xfs_efi_log_item_sizeof(XFS_EFI_MAX_FAST_EXTENTS), 0, 0, NULL); if (!xfs_efi_cache) goto out_destroy_efd_cache; xfs_inode_cache = kmem_cache_create("xfs_inode", sizeof(struct xfs_inode), 0, (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT), xfs_fs_inode_init_once); if (!xfs_inode_cache) goto out_destroy_efi_cache; xfs_ili_cache = kmem_cache_create("xfs_ili", sizeof(struct xfs_inode_log_item), 0, SLAB_RECLAIM_ACCOUNT, NULL); if (!xfs_ili_cache) goto out_destroy_inode_cache; xfs_icreate_cache = kmem_cache_create("xfs_icr", sizeof(struct xfs_icreate_item), 0, 0, NULL); if (!xfs_icreate_cache) goto out_destroy_ili_cache; xfs_rud_cache = kmem_cache_create("xfs_rud_item", sizeof(struct xfs_rud_log_item), 0, 0, NULL); if (!xfs_rud_cache) goto out_destroy_icreate_cache; xfs_rui_cache = kmem_cache_create("xfs_rui_item", xfs_rui_log_item_sizeof(XFS_RUI_MAX_FAST_EXTENTS), 0, 0, NULL); if (!xfs_rui_cache) goto out_destroy_rud_cache; xfs_cud_cache = kmem_cache_create("xfs_cud_item", sizeof(struct xfs_cud_log_item), 0, 0, NULL); if (!xfs_cud_cache) goto out_destroy_rui_cache; xfs_cui_cache = kmem_cache_create("xfs_cui_item", xfs_cui_log_item_sizeof(XFS_CUI_MAX_FAST_EXTENTS), 0, 0, NULL); if (!xfs_cui_cache) goto out_destroy_cud_cache; xfs_bud_cache = kmem_cache_create("xfs_bud_item", sizeof(struct xfs_bud_log_item), 0, 0, NULL); if (!xfs_bud_cache) goto out_destroy_cui_cache; xfs_bui_cache = 
kmem_cache_create("xfs_bui_item", xfs_bui_log_item_sizeof(XFS_BUI_MAX_FAST_EXTENTS), 0, 0, NULL); if (!xfs_bui_cache) goto out_destroy_bud_cache; xfs_attrd_cache = kmem_cache_create("xfs_attrd_item", sizeof(struct xfs_attrd_log_item), 0, 0, NULL); if (!xfs_attrd_cache) goto out_destroy_bui_cache; xfs_attri_cache = kmem_cache_create("xfs_attri_item", sizeof(struct xfs_attri_log_item), 0, 0, NULL); if (!xfs_attri_cache) goto out_destroy_attrd_cache; xfs_iunlink_cache = kmem_cache_create("xfs_iul_item", sizeof(struct xfs_iunlink_item), 0, 0, NULL); if (!xfs_iunlink_cache) goto out_destroy_attri_cache; xfs_xmd_cache = kmem_cache_create("xfs_xmd_item", sizeof(struct xfs_xmd_log_item), 0, 0, NULL); if (!xfs_xmd_cache) goto out_destroy_iul_cache; xfs_xmi_cache = kmem_cache_create("xfs_xmi_item", sizeof(struct xfs_xmi_log_item), 0, 0, NULL); if (!xfs_xmi_cache) goto out_destroy_xmd_cache; xfs_parent_args_cache = kmem_cache_create("xfs_parent_args", sizeof(struct xfs_parent_args), 0, 0, NULL); if (!xfs_parent_args_cache) goto out_destroy_xmi_cache; return 0; out_destroy_xmi_cache: kmem_cache_destroy(xfs_xmi_cache); out_destroy_xmd_cache: kmem_cache_destroy(xfs_xmd_cache); out_destroy_iul_cache: kmem_cache_destroy(xfs_iunlink_cache); out_destroy_attri_cache: kmem_cache_destroy(xfs_attri_cache); out_destroy_attrd_cache: kmem_cache_destroy(xfs_attrd_cache); out_destroy_bui_cache: kmem_cache_destroy(xfs_bui_cache); out_destroy_bud_cache: kmem_cache_destroy(xfs_bud_cache); out_destroy_cui_cache: kmem_cache_destroy(xfs_cui_cache); out_destroy_cud_cache: kmem_cache_destroy(xfs_cud_cache); out_destroy_rui_cache: kmem_cache_destroy(xfs_rui_cache); out_destroy_rud_cache: kmem_cache_destroy(xfs_rud_cache); out_destroy_icreate_cache: kmem_cache_destroy(xfs_icreate_cache); out_destroy_ili_cache: kmem_cache_destroy(xfs_ili_cache); out_destroy_inode_cache: kmem_cache_destroy(xfs_inode_cache); out_destroy_efi_cache: kmem_cache_destroy(xfs_efi_cache); out_destroy_efd_cache: kmem_cache_destroy(xfs_efd_cache); out_destroy_buf_item_cache: kmem_cache_destroy(xfs_buf_item_cache); out_destroy_trans_cache: kmem_cache_destroy(xfs_trans_cache); out_destroy_ifork_cache: kmem_cache_destroy(xfs_ifork_cache); out_destroy_da_state_cache: kmem_cache_destroy(xfs_da_state_cache); out_destroy_defer_item_cache: xfs_defer_destroy_item_caches(); out_destroy_rcbagbt_cur_cache: rcbagbt_destroy_cur_cache(); out_destroy_btree_cur_cache: xfs_btree_destroy_cur_caches(); out_destroy_log_ticket_cache: kmem_cache_destroy(xfs_log_ticket_cache); out_destroy_buf_cache: kmem_cache_destroy(xfs_buf_cache); out: return -ENOMEM; } STATIC void xfs_destroy_caches(void) { /* * Make sure all delayed rcu free are flushed before we * destroy caches. 
*/ rcu_barrier(); kmem_cache_destroy(xfs_parent_args_cache); kmem_cache_destroy(xfs_xmd_cache); kmem_cache_destroy(xfs_xmi_cache); kmem_cache_destroy(xfs_iunlink_cache); kmem_cache_destroy(xfs_attri_cache); kmem_cache_destroy(xfs_attrd_cache); kmem_cache_destroy(xfs_bui_cache); kmem_cache_destroy(xfs_bud_cache); kmem_cache_destroy(xfs_cui_cache); kmem_cache_destroy(xfs_cud_cache); kmem_cache_destroy(xfs_rui_cache); kmem_cache_destroy(xfs_rud_cache); kmem_cache_destroy(xfs_icreate_cache); kmem_cache_destroy(xfs_ili_cache); kmem_cache_destroy(xfs_inode_cache); kmem_cache_destroy(xfs_efi_cache); kmem_cache_destroy(xfs_efd_cache); kmem_cache_destroy(xfs_buf_item_cache); kmem_cache_destroy(xfs_trans_cache); kmem_cache_destroy(xfs_ifork_cache); kmem_cache_destroy(xfs_da_state_cache); xfs_defer_destroy_item_caches(); rcbagbt_destroy_cur_cache(); xfs_btree_destroy_cur_caches(); kmem_cache_destroy(xfs_log_ticket_cache); kmem_cache_destroy(xfs_buf_cache); } STATIC int __init xfs_init_workqueues(void) { /* * The allocation workqueue can be used in memory reclaim situations * (writepage path), and parallelism is only limited by the number of * AGs in all the filesystems mounted. Hence use the default large * max_active value for this workqueue. */ xfs_alloc_wq = alloc_workqueue("xfsalloc", XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU), 0); if (!xfs_alloc_wq) return -ENOMEM; xfs_discard_wq = alloc_workqueue("xfsdiscard", XFS_WQFLAGS(WQ_UNBOUND), 0); if (!xfs_discard_wq) goto out_free_alloc_wq; return 0; out_free_alloc_wq: destroy_workqueue(xfs_alloc_wq); return -ENOMEM; } STATIC void xfs_destroy_workqueues(void) { destroy_workqueue(xfs_discard_wq); destroy_workqueue(xfs_alloc_wq); } STATIC int __init init_xfs_fs(void) { int error; xfs_check_ondisk_structs(); error = xfs_dahash_test(); if (error) return error; printk(KERN_INFO XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n"); xfs_dir_startup(); error = xfs_init_caches(); if (error) goto out; error = xfs_init_workqueues(); if (error) goto out_destroy_caches; error = xfs_mru_cache_init(); if (error) goto out_destroy_wq; error = xfs_init_procfs(); if (error) goto out_mru_cache_uninit; error = xfs_sysctl_register(); if (error) goto out_cleanup_procfs; xfs_debugfs = xfs_debugfs_mkdir("xfs", NULL); xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj); if (!xfs_kset) { error = -ENOMEM; goto out_debugfs_unregister; } xfsstats.xs_kobj.kobject.kset = xfs_kset; xfsstats.xs_stats = alloc_percpu(struct xfsstats); if (!xfsstats.xs_stats) { error = -ENOMEM; goto out_kset_unregister; } error = xfs_sysfs_init(&xfsstats.xs_kobj, &xfs_stats_ktype, NULL, "stats"); if (error) goto out_free_stats; error = xchk_global_stats_setup(xfs_debugfs); if (error) goto out_remove_stats_kobj; #ifdef DEBUG xfs_dbg_kobj.kobject.kset = xfs_kset; error = xfs_sysfs_init(&xfs_dbg_kobj, &xfs_dbg_ktype, NULL, "debug"); if (error) goto out_remove_scrub_stats; #endif error = xfs_qm_init(); if (error) goto out_remove_dbg_kobj; error = register_filesystem(&xfs_fs_type); if (error) goto out_qm_exit; return 0; out_qm_exit: xfs_qm_exit(); out_remove_dbg_kobj: #ifdef DEBUG xfs_sysfs_del(&xfs_dbg_kobj); out_remove_scrub_stats: #endif xchk_global_stats_teardown(); out_remove_stats_kobj: xfs_sysfs_del(&xfsstats.xs_kobj); out_free_stats: free_percpu(xfsstats.xs_stats); out_kset_unregister: kset_unregister(xfs_kset); out_debugfs_unregister: debugfs_remove(xfs_debugfs); xfs_sysctl_unregister(); out_cleanup_procfs: xfs_cleanup_procfs(); out_mru_cache_uninit: xfs_mru_cache_uninit(); 
out_destroy_wq: xfs_destroy_workqueues(); out_destroy_caches: xfs_destroy_caches(); out: return error; } STATIC void __exit exit_xfs_fs(void) { xfs_qm_exit(); unregister_filesystem(&xfs_fs_type); #ifdef DEBUG xfs_sysfs_del(&xfs_dbg_kobj); #endif xchk_global_stats_teardown(); xfs_sysfs_del(&xfsstats.xs_kobj); free_percpu(xfsstats.xs_stats); kset_unregister(xfs_kset); debugfs_remove(xfs_debugfs); xfs_sysctl_unregister(); xfs_cleanup_procfs(); xfs_mru_cache_uninit(); xfs_destroy_workqueues(); xfs_destroy_caches(); xfs_uuid_table_free(); } module_init(init_xfs_fs); module_exit(exit_xfs_fs); MODULE_AUTHOR("Silicon Graphics, Inc."); MODULE_DESCRIPTION(XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled"); MODULE_LICENSE("GPL");
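As an aside, the size-suffix parsing and logbsize validation above reduce to a little arithmetic. The following standalone userspace sketch is illustrative only: it is not part of the kernel sources, the helper names are invented here, and the 16k-256k power-of-two rule is taken from the warning string in xfs_fs_validate_params() above.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Parse a size with an optional K/M/G suffix, e.g. "64k" -> 65536. */
static long long parse_size_suffix(const char *s)
{
	char *end;
	long long val = strtoll(s, &end, 10);

	switch (*end) {
	case 'k': case 'K': return val << 10;
	case 'm': case 'M': return val << 20;
	case 'g': case 'G': return val << 30;
	default:            return val;
	}
}

/* Mirror of the logbsize sanity check: a power of two in [16k, 256k]. */
static bool logbsize_valid(long long sz)
{
	return sz >= 16 * 1024 && sz <= 256 * 1024 && (sz & (sz - 1)) == 0;
}

int main(void)
{
	const char *opt = "64k";
	long long sz = parse_size_suffix(opt);

	printf("logbsize=%s -> %lld bytes, %s\n", opt, sz,
	       logbsize_valid(sz) ? "accepted" : "rejected");
	return 0;
}

Running this prints "logbsize=64k -> 65536 bytes, accepted", which is the value a real mount would end up storing in mp->m_logbsize.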
/* SPDX-License-Identifier: GPL-2.0 */ /* * fscrypt.h: declarations for per-file encryption * * Filesystems that implement per-file encryption must include this header * file. * * Copyright (C) 2015, Google, Inc. * * Written by Michael Halcrow, 2015. * Modified by Jaegeuk Kim, 2015. */ #ifndef _LINUX_FSCRYPT_H #define _LINUX_FSCRYPT_H #include <linux/fs.h> #include <linux/mm.h> #include <linux/slab.h> #include <uapi/linux/fscrypt.h> /* * The lengths of all file contents blocks must be divisible by this value. * This is needed to ensure that all contents encryption modes will work, as * some of the supported modes don't support arbitrarily byte-aligned messages. * * Since the needed alignment is 16 bytes, most filesystems will meet this * requirement naturally, as typical block sizes are powers of 2. However, if a * filesystem can generate arbitrarily byte-aligned block lengths (e.g., via * compression), then it will need to pad to this alignment before encryption. */ #define FSCRYPT_CONTENTS_ALIGNMENT 16 union fscrypt_policy; struct fscrypt_inode_info; struct fs_parameter; struct seq_file; struct fscrypt_str { unsigned char *name; u32 len; }; struct fscrypt_name { const struct qstr *usr_fname; struct fscrypt_str disk_name; u32 hash; u32 minor_hash; struct fscrypt_str crypto_buf; bool is_nokey_name; }; #define FSTR_INIT(n, l) { .name = n, .len = l } #define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) #define fname_name(p) ((p)->disk_name.name) #define fname_len(p) ((p)->disk_name.len) /* Maximum value for the third parameter of fscrypt_operations.set_context(). */ #define FSCRYPT_SET_CONTEXT_MAX_SIZE 40 #ifdef CONFIG_FS_ENCRYPTION /* Crypto operations for filesystems */ struct fscrypt_operations { /* * The offset of the pointer to struct fscrypt_inode_info in the * filesystem-specific part of the inode, relative to the beginning of * the common part of the inode (the 'struct inode'). */ ptrdiff_t inode_info_offs; /* * If set, then fs/crypto/ will allocate a global bounce page pool the * first time an encryption key is set up for a file. The bounce page * pool is required by the following functions: * * - fscrypt_encrypt_pagecache_blocks() * - fscrypt_zeroout_range() for files not using inline crypto * * If the filesystem doesn't use those, it doesn't need to set this. */ unsigned int needs_bounce_pages : 1; /* * If set, then fs/crypto/ will allow the use of encryption settings * that assume inode numbers fit in 32 bits (i.e.
* FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{32,64}), provided that the other * prerequisites for these settings are also met. This is only useful * if the filesystem wants to support inline encryption hardware that is * limited to 32-bit or 64-bit data unit numbers and where programming * keyslots is very slow. */ unsigned int has_32bit_inodes : 1; /* * If set, then fs/crypto/ will allow users to select a crypto data unit * size that is less than the filesystem block size. This is done via * the log2_data_unit_size field of the fscrypt policy. This flag is * not compatible with filesystems that encrypt variable-length blocks * (i.e. blocks that aren't all equal to filesystem's block size), for * example as a result of compression. It's also not compatible with * the fscrypt_encrypt_block_inplace() and * fscrypt_decrypt_block_inplace() functions. */ unsigned int supports_subblock_data_units : 1; /* * This field exists only for backwards compatibility reasons and should * only be set by the filesystems that are setting it already. It * contains the filesystem-specific key description prefix that is * accepted for "logon" keys for v1 fscrypt policies. This * functionality is deprecated in favor of the generic prefix * "fscrypt:", which itself is deprecated in favor of the filesystem * keyring ioctls such as FS_IOC_ADD_ENCRYPTION_KEY. Filesystems that * are newly adding fscrypt support should not set this field. */ const char *legacy_key_prefix; /* * Get the fscrypt context of the given inode. * * @inode: the inode whose context to get * @ctx: the buffer into which to get the context * @len: length of the @ctx buffer in bytes * * Return: On success, returns the length of the context in bytes; this * may be less than @len. On failure, returns -ENODATA if the * inode doesn't have a context, -ERANGE if the context is * longer than @len, or another -errno code. */ int (*get_context)(struct inode *inode, void *ctx, size_t len); /* * Set an fscrypt context on the given inode. * * @inode: the inode whose context to set. The inode won't already have * an fscrypt context. * @ctx: the context to set * @len: length of @ctx in bytes (at most FSCRYPT_SET_CONTEXT_MAX_SIZE) * @fs_data: If called from fscrypt_set_context(), this will be the * value the filesystem passed to fscrypt_set_context(). * Otherwise (i.e. when called from * FS_IOC_SET_ENCRYPTION_POLICY) this will be NULL. * * i_rwsem will be held for write. * * Return: 0 on success, -errno on failure. */ int (*set_context)(struct inode *inode, const void *ctx, size_t len, void *fs_data); /* * Get the dummy fscrypt policy in use on the filesystem (if any). * * Filesystems only need to implement this function if they support the * test_dummy_encryption mount option. * * Return: A pointer to the dummy fscrypt policy, if the filesystem is * mounted with test_dummy_encryption; otherwise NULL. */ const union fscrypt_policy *(*get_dummy_policy)(struct super_block *sb); /* * Check whether a directory is empty. i_rwsem will be held for write. */ bool (*empty_dir)(struct inode *inode); /* * Check whether the filesystem's inode numbers and UUID are stable, * meaning that they will never be changed even by offline operations * such as filesystem shrinking and therefore can be used in the * encryption without the possibility of files becoming unreadable. * * Filesystems only need to implement this function if they want to * support the FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{32,64} flags. 
These * flags are designed to work around the limitations of UFS and eMMC * inline crypto hardware, and they shouldn't be used in scenarios where * such hardware isn't being used. * * Leaving this NULL is equivalent to always returning false. */ bool (*has_stable_inodes)(struct super_block *sb); /* * Return an array of pointers to the block devices to which the * filesystem may write encrypted file contents, NULL if the filesystem * only has a single such block device, or an ERR_PTR() on error. * * On successful non-NULL return, *num_devs is set to the number of * devices in the returned array. The caller must free the returned * array using kfree(). * * If the filesystem can use multiple block devices (other than block * devices that aren't used for encrypted file contents, such as * external journal devices), and wants to support inline encryption, * then it must implement this function. Otherwise it's not needed. */ struct block_device **(*get_devices)(struct super_block *sb, unsigned int *num_devs); }; int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags); /* * Returns the address of the fscrypt info pointer within the * filesystem-specific part of the inode. (To save memory on filesystems that * don't support fscrypt, a field in 'struct inode' itself is no longer used.) */ static inline struct fscrypt_inode_info ** fscrypt_inode_info_addr(const struct inode *inode) { VFS_WARN_ON_ONCE(inode->i_sb->s_cop->inode_info_offs == 0); return (void *)inode + inode->i_sb->s_cop->inode_info_offs; } /* * Load the inode's fscrypt info pointer, using a raw dereference. Since this * uses a raw dereference with no memory barrier, it is appropriate to use only * when the caller knows the inode's key setup already happened, resulting in * non-NULL fscrypt info. E.g., the file contents en/decryption functions use * this, since fscrypt_file_open() set up the key. */ static inline struct fscrypt_inode_info * fscrypt_get_inode_info_raw(const struct inode *inode) { struct fscrypt_inode_info *ci = *fscrypt_inode_info_addr(inode); VFS_WARN_ON_ONCE(ci == NULL); return ci; } static inline struct fscrypt_inode_info * fscrypt_get_inode_info(const struct inode *inode) { /* * Pairs with the cmpxchg_release() in fscrypt_setup_encryption_info(). * I.e., another task may publish the fscrypt info concurrently, * executing a RELEASE barrier. Use smp_load_acquire() here to safely * ACQUIRE the memory the other task published. */ return smp_load_acquire(fscrypt_inode_info_addr(inode)); } /** * fscrypt_needs_contents_encryption() - check whether an inode needs * contents encryption * @inode: the inode to check * * Return: %true iff the inode is an encrypted regular file and the kernel was * built with fscrypt support. * * If you need to know whether the encrypt bit is set even when the kernel was * built without fscrypt support, you must use IS_ENCRYPTED() directly instead. */ static inline bool fscrypt_needs_contents_encryption(const struct inode *inode) { return IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode); } /* * When d_splice_alias() moves a directory's no-key alias to its * plaintext alias as a result of the encryption key being added, * DCACHE_NOKEY_NAME must be cleared and there might be an opportunity * to disable d_revalidate. Note that we don't have to support the * inverse operation because fscrypt doesn't allow no-key names to be * the source or target of a rename(). 
*/ static inline void fscrypt_handle_d_move(struct dentry *dentry) { /* * VFS calls fscrypt_handle_d_move even for non-fscrypt * filesystems. */ if (dentry->d_flags & DCACHE_NOKEY_NAME) { dentry->d_flags &= ~DCACHE_NOKEY_NAME; /* * Other filesystem features might be handling dentry * revalidation, in which case it cannot be disabled. */ if (dentry->d_op->d_revalidate == fscrypt_d_revalidate) dentry->d_flags &= ~DCACHE_OP_REVALIDATE; } } /** * fscrypt_is_nokey_name() - test whether a dentry is a no-key name * @dentry: the dentry to check * * This returns true if the dentry is a no-key dentry. A no-key dentry is a * dentry that was created in an encrypted directory that hasn't had its * encryption key added yet. Such dentries may be either positive or negative. * * When a filesystem is asked to create a new filename in an encrypted directory * and the new filename's dentry is a no-key dentry, it must fail the operation * with ENOKEY. This includes ->create(), ->mkdir(), ->mknod(), ->symlink(), * ->rename(), and ->link(). (However, ->rename() and ->link() are already * handled by fscrypt_prepare_rename() and fscrypt_prepare_link().) * * This is necessary because creating a filename requires the directory's * encryption key, but just checking for the key on the directory inode during * the final filesystem operation doesn't guarantee that the key was available * during the preceding dentry lookup. And the key must have already been * available during the dentry lookup in order for it to have been checked * whether the filename already exists in the directory and for the new file's * dentry not to be invalidated due to it incorrectly having the no-key flag. * * Return: %true if the dentry is a no-key name */ static inline bool fscrypt_is_nokey_name(const struct dentry *dentry) { return dentry->d_flags & DCACHE_NOKEY_NAME; } static inline void fscrypt_prepare_dentry(struct dentry *dentry, bool is_nokey_name) { /* * This code tries to only take ->d_lock when necessary to write * to ->d_flags. We shouldn't be peeking on d_flags for * DCACHE_OP_REVALIDATE unlocked, but in the unlikely case * there is a race, the worst it can happen is that we fail to * unset DCACHE_OP_REVALIDATE and pay the cost of an extra * d_revalidate. */ if (is_nokey_name) { spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_NOKEY_NAME; spin_unlock(&dentry->d_lock); } else if (dentry->d_flags & DCACHE_OP_REVALIDATE && dentry->d_op->d_revalidate == fscrypt_d_revalidate) { /* * Unencrypted dentries and encrypted dentries where the * key is available are always valid from fscrypt * perspective. Avoid the cost of calling * fscrypt_d_revalidate unnecessarily. 
*/ spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_OP_REVALIDATE; spin_unlock(&dentry->d_lock); } } /* crypto.c */ void fscrypt_enqueue_decrypt_work(struct work_struct *); struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs, gfp_t gfp_flags); int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num); int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs); int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num); static inline bool fscrypt_is_bounce_page(struct page *page) { return page->mapping == NULL; } static inline struct page *fscrypt_pagecache_page(struct page *bounce_page) { return (struct page *)page_private(bounce_page); } static inline bool fscrypt_is_bounce_folio(const struct folio *folio) { return folio->mapping == NULL; } static inline struct folio *fscrypt_pagecache_folio(const struct folio *bounce_folio) { return bounce_folio->private; } void fscrypt_free_bounce_page(struct page *bounce_page); /* policy.c */ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg); int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg); int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg); int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg); int fscrypt_has_permitted_context(struct inode *parent, struct inode *child); int fscrypt_context_for_new_inode(void *ctx, struct inode *inode); int fscrypt_set_context(struct inode *inode, void *fs_data); struct fscrypt_dummy_policy { const union fscrypt_policy *policy; }; int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param, struct fscrypt_dummy_policy *dummy_policy); bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, const struct fscrypt_dummy_policy *p2); void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, struct super_block *sb); static inline bool fscrypt_is_dummy_policy_set(const struct fscrypt_dummy_policy *dummy_policy) { return dummy_policy->policy != NULL; } static inline void fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) { kfree(dummy_policy->policy); dummy_policy->policy = NULL; } /* keyring.c */ void fscrypt_destroy_keyring(struct super_block *sb); int fscrypt_ioctl_add_key(struct file *filp, void __user *arg); int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg); int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *arg); int fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg); /* keysetup.c */ int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, bool *encrypt_ret); void fscrypt_put_encryption_info(struct inode *inode); void fscrypt_free_inode(struct inode *inode); int fscrypt_drop_inode(struct inode *inode); /* fname.c */ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, u8 *out, unsigned int olen); bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, u32 max_len, u32 *encrypted_len_ret); int fscrypt_setup_filename(struct inode *inode, const struct qstr *iname, int lookup, struct fscrypt_name *fname); static inline void fscrypt_free_filename(struct fscrypt_name *fname) { kfree(fname->crypto_buf.name); } int fscrypt_fname_alloc_buffer(u32 max_encrypted_len, struct fscrypt_str *crypto_str); void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str); int 
fscrypt_fname_disk_to_usr(const struct inode *inode, u32 hash, u32 minor_hash, const struct fscrypt_str *iname, struct fscrypt_str *oname); bool fscrypt_match_name(const struct fscrypt_name *fname, const u8 *de_name, u32 de_name_len); u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name); /* bio.c */ bool fscrypt_decrypt_bio(struct bio *bio); int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, sector_t sector, u64 len); /* hooks.c */ int fscrypt_file_open(struct inode *inode, struct file *filp); int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, struct dentry *dentry); int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname); int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry); int __fscrypt_prepare_readdir(struct inode *dir); int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr); int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags); int fscrypt_prepare_symlink(struct inode *dir, const char *target, unsigned int len, unsigned int max_len, struct fscrypt_str *disk_link); int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link); const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, unsigned int max_size, struct delayed_call *done); int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat); static inline void fscrypt_set_ops(struct super_block *sb, const struct fscrypt_operations *s_cop) { sb->s_cop = s_cop; } #else /* !CONFIG_FS_ENCRYPTION */ static inline struct fscrypt_inode_info * fscrypt_get_inode_info(const struct inode *inode) { return NULL; } static inline bool fscrypt_needs_contents_encryption(const struct inode *inode) { return false; } static inline void fscrypt_handle_d_move(struct dentry *dentry) { } static inline bool fscrypt_is_nokey_name(const struct dentry *dentry) { return false; } static inline void fscrypt_prepare_dentry(struct dentry *dentry, bool is_nokey_name) { } /* crypto.c */ static inline void fscrypt_enqueue_decrypt_work(struct work_struct *work) { } static inline struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs, gfp_t gfp_flags) { return ERR_PTR(-EOPNOTSUPP); } static inline int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num) { return -EOPNOTSUPP; } static inline int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs) { return -EOPNOTSUPP; } static inline int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num) { return -EOPNOTSUPP; } static inline bool fscrypt_is_bounce_page(struct page *page) { return false; } static inline struct page *fscrypt_pagecache_page(struct page *bounce_page) { WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); } static inline bool fscrypt_is_bounce_folio(const struct folio *folio) { return false; } static inline struct folio *fscrypt_pagecache_folio(const struct folio *bounce_folio) { WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); } static inline void fscrypt_free_bounce_page(struct page *bounce_page) { } /* policy.c */ static inline int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) { return 
-EOPNOTSUPP; } static inline int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { return 0; } static inline int fscrypt_set_context(struct inode *inode, void *fs_data) { return -EOPNOTSUPP; } struct fscrypt_dummy_policy { }; static inline int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param, struct fscrypt_dummy_policy *dummy_policy) { return -EINVAL; } static inline bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, const struct fscrypt_dummy_policy *p2) { return true; } static inline void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, struct super_block *sb) { } static inline bool fscrypt_is_dummy_policy_set(const struct fscrypt_dummy_policy *dummy_policy) { return false; } static inline void fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) { } /* keyring.c */ static inline void fscrypt_destroy_keyring(struct super_block *sb) { } static inline int fscrypt_ioctl_add_key(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } /* keysetup.c */ static inline int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, bool *encrypt_ret) { if (IS_ENCRYPTED(dir)) return -EOPNOTSUPP; return 0; } static inline void fscrypt_put_encryption_info(struct inode *inode) { return; } static inline void fscrypt_free_inode(struct inode *inode) { } static inline int fscrypt_drop_inode(struct inode *inode) { return 0; } /* fname.c */ static inline int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct fscrypt_name *fname) { if (IS_ENCRYPTED(dir)) return -EOPNOTSUPP; memset(fname, 0, sizeof(*fname)); fname->usr_fname = iname; fname->disk_name.name = (unsigned char *)iname->name; fname->disk_name.len = iname->len; return 0; } static inline void fscrypt_free_filename(struct fscrypt_name *fname) { return; } static inline int fscrypt_fname_alloc_buffer(u32 max_encrypted_len, struct fscrypt_str *crypto_str) { return -EOPNOTSUPP; } static inline void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) { return; } static inline int fscrypt_fname_disk_to_usr(const struct inode *inode, u32 hash, u32 minor_hash, const struct fscrypt_str *iname, struct fscrypt_str *oname) { return -EOPNOTSUPP; } static inline bool fscrypt_match_name(const struct fscrypt_name *fname, const u8 *de_name, u32 de_name_len) { /* Encryption support disabled; use standard comparison */ if (de_name_len != fname->disk_name.len) return false; return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len); } static inline u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name) { WARN_ON_ONCE(1); return 0; } static inline int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { return 1; } /* bio.c */ static inline bool fscrypt_decrypt_bio(struct bio *bio) { return true; } static 
inline int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, sector_t sector, u64 len) { return -EOPNOTSUPP; } /* hooks.c */ static inline int fscrypt_file_open(struct inode *inode, struct file *filp) { if (IS_ENCRYPTED(inode)) return -EOPNOTSUPP; return 0; } static inline int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, struct dentry *dentry) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname) { return -EOPNOTSUPP; } static inline int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_readdir(struct inode *dir) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr) { return -EOPNOTSUPP; } static inline int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags) { return 0; } static inline int fscrypt_prepare_symlink(struct inode *dir, const char *target, unsigned int len, unsigned int max_len, struct fscrypt_str *disk_link) { if (IS_ENCRYPTED(dir)) return -EOPNOTSUPP; disk_link->name = (unsigned char *)target; disk_link->len = len + 1; if (disk_link->len > max_len) return -ENAMETOOLONG; return 0; } static inline int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link) { return -EOPNOTSUPP; } static inline const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, unsigned int max_size, struct delayed_call *done) { return ERR_PTR(-EOPNOTSUPP); } static inline int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat) { return -EOPNOTSUPP; } static inline void fscrypt_set_ops(struct super_block *sb, const struct fscrypt_operations *s_cop) { } #endif /* !CONFIG_FS_ENCRYPTION */ /* inline_crypt.c */ #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode); void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, loff_t pos, gfp_t gfp_mask); bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, loff_t pos); bool fscrypt_dio_supported(struct inode *inode); u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks); #else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ static inline bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode) { return false; } static inline void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, loff_t pos, gfp_t gfp_mask) { } static inline bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, loff_t pos) { return true; } static inline bool fscrypt_dio_supported(struct inode *inode) { return !fscrypt_needs_contents_encryption(inode); } static inline u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks) { return nr_blocks; } #endif /* !CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ /** * fscrypt_inode_uses_inline_crypto() - test whether an inode uses inline * encryption * @inode: an inode. If encrypted, its key must be set up. * * Return: true if the inode requires file contents encryption and if the * encryption should be done in the block layer via blk-crypto rather * than in the filesystem layer. 
*/ static inline bool fscrypt_inode_uses_inline_crypto(const struct inode *inode) { return fscrypt_needs_contents_encryption(inode) && __fscrypt_inode_uses_inline_crypto(inode); } /** * fscrypt_inode_uses_fs_layer_crypto() - test whether an inode uses fs-layer * encryption * @inode: an inode. If encrypted, its key must be set up. * * Return: true if the inode requires file contents encryption and if the * encryption should be done in the filesystem layer rather than in the * block layer via blk-crypto. */ static inline bool fscrypt_inode_uses_fs_layer_crypto(const struct inode *inode) { return fscrypt_needs_contents_encryption(inode) && !__fscrypt_inode_uses_inline_crypto(inode); } /** * fscrypt_has_encryption_key() - check whether an inode has had its key set up * @inode: the inode to check * * Return: %true if the inode has had its encryption key set up, else %false. * * Usually this should be preceded by fscrypt_get_encryption_info() to try to * set up the key first. */ static inline bool fscrypt_has_encryption_key(const struct inode *inode) { return fscrypt_get_inode_info(inode) != NULL; } /** * fscrypt_prepare_link() - prepare to link an inode into a possibly-encrypted * directory * @old_dentry: an existing dentry for the inode being linked * @dir: the target directory * @dentry: negative dentry for the target filename * * A new link can only be added to an encrypted directory if the directory's * encryption key is available --- since otherwise we'd have no way to encrypt * the filename. * * We also verify that the link will not violate the constraint that all files * in an encrypted directory tree use the same encryption policy. * * Return: 0 on success, -ENOKEY if the directory's encryption key is missing, * -EXDEV if the link would result in an inconsistent encryption policy, or * another -errno code. */ static inline int fscrypt_prepare_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { if (IS_ENCRYPTED(dir)) return __fscrypt_prepare_link(d_inode(old_dentry), dir, dentry); return 0; } /** * fscrypt_prepare_rename() - prepare for a rename between possibly-encrypted * directories * @old_dir: source directory * @old_dentry: dentry for source file * @new_dir: target directory * @new_dentry: dentry for target location (may be negative unless exchanging) * @flags: rename flags (we care at least about %RENAME_EXCHANGE) * * Prepare for ->rename() where the source and/or target directories may be * encrypted. A new link can only be added to an encrypted directory if the * directory's encryption key is available --- since otherwise we'd have no way * to encrypt the filename. A rename to an existing name, on the other hand, * *is* cryptographically possible without the key. However, we take the more * conservative approach and just forbid all no-key renames. * * We also verify that the rename will not violate the constraint that all files * in an encrypted directory tree use the same encryption policy. * * Return: 0 on success, -ENOKEY if an encryption key is missing, -EXDEV if the * rename would cause inconsistent encryption policies, or another -errno code. 
*/ static inline int fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { if (IS_ENCRYPTED(old_dir) || IS_ENCRYPTED(new_dir)) return __fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry, flags); return 0; } /** * fscrypt_prepare_lookup() - prepare to lookup a name in a possibly-encrypted * directory * @dir: directory being searched * @dentry: filename being looked up * @fname: (output) the name to use to search the on-disk directory * * Prepare for ->lookup() in a directory which may be encrypted by determining * the name that will actually be used to search the directory on-disk. If the * directory's encryption policy is supported by this kernel and its encryption * key is available, then the lookup is assumed to be by plaintext name; * otherwise, it is assumed to be by no-key name. * * This will set DCACHE_NOKEY_NAME on the dentry if the lookup is by no-key * name. In this case the filesystem must assign the dentry a dentry_operations * which contains fscrypt_d_revalidate (or contains a d_revalidate method that * calls fscrypt_d_revalidate), so that the dentry will be invalidated if the * directory's encryption key is later added. * * Return: 0 on success; -ENOENT if the directory's key is unavailable but the * filename isn't a valid no-key name, so a negative dentry should be created; * or another -errno code. */ static inline int fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname) { if (IS_ENCRYPTED(dir)) return __fscrypt_prepare_lookup(dir, dentry, fname); memset(fname, 0, sizeof(*fname)); fname->usr_fname = &dentry->d_name; fname->disk_name.name = (unsigned char *)dentry->d_name.name; fname->disk_name.len = dentry->d_name.len; fscrypt_prepare_dentry(dentry, false); return 0; } /** * fscrypt_prepare_readdir() - prepare to read a possibly-encrypted directory * @dir: the directory inode * * If the directory is encrypted and it doesn't already have its encryption key * set up, try to set it up so that the filenames will be listed in plaintext * form rather than in no-key form. * * Return: 0 on success; -errno on error. Note that the encryption key being * unavailable is not considered an error. It is also not an error if * the encryption policy is unsupported by this kernel; that is treated * like the key being unavailable, so that files can still be deleted. */ static inline int fscrypt_prepare_readdir(struct inode *dir) { if (IS_ENCRYPTED(dir)) return __fscrypt_prepare_readdir(dir); return 0; } /** * fscrypt_prepare_setattr() - prepare to change a possibly-encrypted inode's * attributes * @dentry: dentry through which the inode is being changed * @attr: attributes to change * * Prepare for ->setattr() on a possibly-encrypted inode. On an encrypted file, * most attribute changes are allowed even without the encryption key. However, * without the encryption key we do have to forbid truncates. This is needed * because the size being truncated to may not be a multiple of the filesystem * block size, and in that case we'd have to decrypt the final block, zero the * portion past i_size, and re-encrypt it. (We *could* allow truncating to a * filesystem block boundary, but it's simpler to just forbid all truncates --- * and we already forbid all other contents modifications without the key.) * * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code * if a problem occurred while setting up the encryption key. 
*/ static inline int fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr) { if (IS_ENCRYPTED(d_inode(dentry))) return __fscrypt_prepare_setattr(dentry, attr); return 0; } /** * fscrypt_encrypt_symlink() - encrypt the symlink target if needed * @inode: symlink inode * @target: plaintext symlink target * @len: length of @target excluding null terminator * @disk_link: (in/out) the on-disk symlink target being prepared * * If the symlink target needs to be encrypted, then this function encrypts it * into @disk_link->name. fscrypt_prepare_symlink() must have been called * previously to compute @disk_link->len. If the filesystem did not allocate a * buffer for @disk_link->name after calling fscrypt_prepare_symlink(), then one * will be kmalloc()'ed and the filesystem will be responsible for freeing it. * * Return: 0 on success, -errno on failure */ static inline int fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link) { if (IS_ENCRYPTED(inode)) return __fscrypt_encrypt_symlink(inode, target, len, disk_link); return 0; } /* If *pagep is a bounce page, free it and set *pagep to the pagecache page */ static inline void fscrypt_finalize_bounce_page(struct page **pagep) { struct page *page = *pagep; if (fscrypt_is_bounce_page(page)) { *pagep = fscrypt_pagecache_page(page); fscrypt_free_bounce_page(page); } } #endif /* _LINUX_FSCRYPT_H */
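To make the hook pattern above concrete, here is a minimal, illustrative sketch (not taken from any in-tree filesystem) of how a filesystem might wire these helpers into its ->lookup() and buffered-write submission paths. The myfs_ prefix, myfs_find_entry(), and myfs_iget() are hypothetical placeholders for the filesystem's own directory search and inode-cache lookup; only the fscrypt_*() helpers declared above and generic VFS/bio calls are real APIs.

#include <linux/fs.h>
#include <linux/bio.h>
#include <linux/fscrypt.h>

/* ->lookup(): search by the plaintext or no-key name prepared by fscrypt. */
static struct dentry *myfs_lookup(struct inode *dir, struct dentry *dentry,
				  unsigned int flags)
{
	struct fscrypt_name fname;
	struct inode *inode = NULL;
	u64 ino;
	int err;

	/*
	 * Pick the on-disk search name and, if the directory's key is
	 * absent, mark the dentry DCACHE_NOKEY_NAME.
	 */
	err = fscrypt_prepare_lookup(dir, dentry, &fname);
	if (err == -ENOENT)
		goto splice;	/* not a valid no-key name: negative dentry */
	if (err)
		return ERR_PTR(err);

	err = myfs_find_entry(dir, &fname, &ino);	/* hypothetical */
	fscrypt_free_filename(&fname);
	if (!err)
		inode = myfs_iget(dir->i_sb, ino);	/* hypothetical */
	else if (err != -ENOENT)
		return ERR_PTR(err);
splice:
	return d_splice_alias(inode, dentry);
}

/*
 * Add one dirty folio to a freshly allocated write bio, picking between
 * fs-layer and inline (blk-crypto) contents encryption.
 */
static int myfs_add_folio_to_write_bio(struct bio *bio, struct folio *folio,
					struct inode *inode, loff_t pos)
{
	if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
		/* Encrypt into a bounce page and submit that instead. */
		struct page *bounce = fscrypt_encrypt_pagecache_blocks(folio,
					folio_size(folio), 0, GFP_NOFS);

		if (IS_ERR(bounce))
			return PTR_ERR(bounce);
		if (!bio_add_page(bio, bounce, folio_size(folio), 0)) {
			fscrypt_free_bounce_page(bounce);
			return -EIO;
		}
		return 0;
	}

	if (fscrypt_inode_uses_inline_crypto(inode)) {
		/* blk-crypto (possibly hardware) encrypts; just tag the bio. */
		fscrypt_set_bio_crypt_ctx(bio, inode, pos, GFP_NOFS);
	}

	/* Inline-crypto and unencrypted cases submit the pagecache folio. */
	if (!bio_add_folio(bio, folio, folio_size(folio), 0))
		return -EIO;
	return 0;
}

In a real filesystem the bio crypt context is set once when the bio is created and fscrypt_mergeable_bio() is consulted before adding further blocks; the sketch above only handles a single-folio bio.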
// SPDX-License-Identifier: GPL-2.0-only /* * Yama Linux Security Module * * Author: Kees Cook <keescook@chromium.org> * * Copyright (C) 2010 Canonical, Ltd. * Copyright (C) 2011 The Chromium OS Authors.
*/ #include <linux/lsm_hooks.h> #include <linux/sysctl.h> #include <linux/ptrace.h> #include <linux/prctl.h> #include <linux/ratelimit.h> #include <linux/workqueue.h> #include <linux/string_helpers.h> #include <linux/task_work.h> #include <linux/sched.h> #include <linux/spinlock.h> #include <uapi/linux/lsm.h> #define YAMA_SCOPE_DISABLED 0 #define YAMA_SCOPE_RELATIONAL 1 #define YAMA_SCOPE_CAPABILITY 2 #define YAMA_SCOPE_NO_ATTACH 3 static int ptrace_scope = YAMA_SCOPE_RELATIONAL; /* describe a ptrace relationship for potential exception */ struct ptrace_relation { struct task_struct *tracer; struct task_struct *tracee; bool invalid; struct list_head node; struct rcu_head rcu; }; static LIST_HEAD(ptracer_relations); static DEFINE_SPINLOCK(ptracer_relations_lock); static void yama_relation_cleanup(struct work_struct *work); static DECLARE_WORK(yama_relation_work, yama_relation_cleanup); struct access_report_info { struct callback_head work; const char *access; struct task_struct *target; struct task_struct *agent; }; static void __report_access(struct callback_head *work) { struct access_report_info *info = container_of(work, struct access_report_info, work); char *target_cmd, *agent_cmd; target_cmd = kstrdup_quotable_cmdline(info->target, GFP_KERNEL); agent_cmd = kstrdup_quotable_cmdline(info->agent, GFP_KERNEL); pr_notice_ratelimited( "ptrace %s of \"%s\"[%d] was attempted by \"%s\"[%d]\n", info->access, target_cmd, info->target->pid, agent_cmd, info->agent->pid); kfree(agent_cmd); kfree(target_cmd); put_task_struct(info->agent); put_task_struct(info->target); kfree(info); } /* defers execution because cmdline access can sleep */ static void report_access(const char *access, struct task_struct *target, struct task_struct *agent) { struct access_report_info *info; assert_spin_locked(&target->alloc_lock); /* for target->comm */ if (current->flags & PF_KTHREAD) { /* I don't think kthreads call task_work_run() before exiting. * Imagine angry ranting about procfs here. */ pr_notice_ratelimited( "ptrace %s of \"%s\"[%d] was attempted by \"%s\"[%d]\n", access, target->comm, target->pid, agent->comm, agent->pid); return; } info = kmalloc_obj(*info, GFP_ATOMIC); if (!info) return; init_task_work(&info->work, __report_access); get_task_struct(target); get_task_struct(agent); info->access = access; info->target = target; info->agent = agent; if (task_work_add(current, &info->work, TWA_RESUME) == 0) return; /* success */ WARN(1, "report_access called from exiting task"); put_task_struct(target); put_task_struct(agent); kfree(info); } /** * yama_relation_cleanup - remove invalid entries from the relation list * @work: unused * */ static void yama_relation_cleanup(struct work_struct *work) { struct ptrace_relation *relation; spin_lock(&ptracer_relations_lock); rcu_read_lock(); list_for_each_entry_rcu(relation, &ptracer_relations, node) { if (relation->invalid) { list_del_rcu(&relation->node); kfree_rcu(relation, rcu); } } rcu_read_unlock(); spin_unlock(&ptracer_relations_lock); } /** * yama_ptracer_add - add/replace an exception for this tracer/tracee pair * @tracer: the task_struct of the process doing the ptrace * @tracee: the task_struct of the process to be ptraced * * Each tracee can have, at most, one tracer registered. Each time this * is called, the prior registered tracer will be replaced for the tracee. * * Returns 0 if relationship was added, -ve on error. 
*/ static int yama_ptracer_add(struct task_struct *tracer, struct task_struct *tracee) { struct ptrace_relation *relation, *added; added = kmalloc_obj(*added, GFP_KERNEL); if (!added) return -ENOMEM; added->tracee = tracee; added->tracer = tracer; added->invalid = false; spin_lock(&ptracer_relations_lock); rcu_read_lock(); list_for_each_entry_rcu(relation, &ptracer_relations, node) { if (relation->invalid) continue; if (relation->tracee == tracee) { list_replace_rcu(&relation->node, &added->node); kfree_rcu(relation, rcu); goto out; } } list_add_rcu(&added->node, &ptracer_relations); out: rcu_read_unlock(); spin_unlock(&ptracer_relations_lock); return 0; } /** * yama_ptracer_del - remove exceptions related to the given tasks * @tracer: remove any relation where tracer task matches * @tracee: remove any relation where tracee task matches */ static void yama_ptracer_del(struct task_struct *tracer, struct task_struct *tracee) { struct ptrace_relation *relation; bool marked = false; rcu_read_lock(); list_for_each_entry_rcu(relation, &ptracer_relations, node) { if (relation->invalid) continue; if (relation->tracee == tracee || (tracer && relation->tracer == tracer)) { relation->invalid = true; marked = true; } } rcu_read_unlock(); if (marked) schedule_work(&yama_relation_work); } /** * yama_task_free - check for task_pid to remove from exception list * @task: task being removed */ static void yama_task_free(struct task_struct *task) { yama_ptracer_del(task, task); } /** * yama_task_prctl - check for Yama-specific prctl operations * @option: operation * @arg2: argument * @arg3: argument * @arg4: argument * @arg5: argument * * Return 0 on success, -ve on error. -ENOSYS is returned when Yama * does not handle the given option. */ static int yama_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { int rc = -ENOSYS; struct task_struct *myself; switch (option) { case PR_SET_PTRACER: /* Since a thread can call prctl(), find the group leader * before calling _add() or _del() on it, since we want * process-level granularity of control. The tracer group * leader checking is handled later when walking the ancestry * at the time of PTRACE_ATTACH check. */ myself = current->group_leader; if (arg2 == 0) { yama_ptracer_del(NULL, myself); rc = 0; } else if (arg2 == PR_SET_PTRACER_ANY || (int)arg2 == -1) { rc = yama_ptracer_add(NULL, myself); } else { struct task_struct *tracer; tracer = find_get_task_by_vpid(arg2); if (!tracer) { rc = -EINVAL; } else { rc = yama_ptracer_add(tracer, myself); put_task_struct(tracer); } } break; } return rc; } /** * task_is_descendant - walk up a process family tree looking for a match * @parent: the process to compare against while walking up from child * @child: the process to start from while looking upwards for parent * * Returns 1 if child is a descendant of parent, 0 if not.
*/ static int task_is_descendant(struct task_struct *parent, struct task_struct *child) { int rc = 0; struct task_struct *walker = child; if (!parent || !child) return 0; rcu_read_lock(); if (!thread_group_leader(parent)) parent = rcu_dereference(parent->group_leader); while (walker->pid > 0) { if (!thread_group_leader(walker)) walker = rcu_dereference(walker->group_leader); if (walker == parent) { rc = 1; break; } walker = rcu_dereference(walker->real_parent); } rcu_read_unlock(); return rc; } /** * ptracer_exception_found - tracer registered as exception for this tracee * @tracer: the task_struct of the process attempting ptrace * @tracee: the task_struct of the process to be ptraced * * Returns 1 if tracer has a ptracer exception ancestor for tracee. */ static int ptracer_exception_found(struct task_struct *tracer, struct task_struct *tracee) { int rc = 0; struct ptrace_relation *relation; struct task_struct *parent = NULL; bool found = false; rcu_read_lock(); /* * If there's already an active tracing relationship, then make an * exception for the sake of other accesses, like process_vm_rw(). */ parent = ptrace_parent(tracee); if (parent != NULL && same_thread_group(parent, tracer)) { rc = 1; goto unlock; } /* Look for a PR_SET_PTRACER relationship. */ if (!thread_group_leader(tracee)) tracee = rcu_dereference(tracee->group_leader); list_for_each_entry_rcu(relation, &ptracer_relations, node) { if (relation->invalid) continue; if (relation->tracee == tracee) { parent = relation->tracer; found = true; break; } } if (found && (parent == NULL || task_is_descendant(parent, tracer))) rc = 1; unlock: rcu_read_unlock(); return rc; } /** * yama_ptrace_access_check - validate PTRACE_ATTACH calls * @child: task that current task is attempting to ptrace * @mode: ptrace attach mode * * Returns 0 if following the ptrace is allowed, -ve on error. */ static int yama_ptrace_access_check(struct task_struct *child, unsigned int mode) { int rc = 0; /* require ptrace target be a child of ptracer on attach */ if (mode & PTRACE_MODE_ATTACH) { switch (ptrace_scope) { case YAMA_SCOPE_DISABLED: /* No additional restrictions. */ break; case YAMA_SCOPE_RELATIONAL: rcu_read_lock(); if (!pid_alive(child)) rc = -EPERM; if (!rc && !task_is_descendant(current, child) && !ptracer_exception_found(current, child) && !ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE)) rc = -EPERM; rcu_read_unlock(); break; case YAMA_SCOPE_CAPABILITY: rcu_read_lock(); if (!ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE)) rc = -EPERM; rcu_read_unlock(); break; case YAMA_SCOPE_NO_ATTACH: default: rc = -EPERM; break; } } if (rc && (mode & PTRACE_MODE_NOAUDIT) == 0) report_access("attach", child, current); return rc; } /** * yama_ptrace_traceme - validate PTRACE_TRACEME calls * @parent: task that will become the ptracer of the current task * * Returns 0 if following the ptrace is allowed, -ve on error. */ static int yama_ptrace_traceme(struct task_struct *parent) { int rc = 0; /* Only disallow PTRACE_TRACEME on more aggressive settings. 
*/ switch (ptrace_scope) { case YAMA_SCOPE_CAPABILITY: if (!has_ns_capability(parent, current_user_ns(), CAP_SYS_PTRACE)) rc = -EPERM; break; case YAMA_SCOPE_NO_ATTACH: rc = -EPERM; break; } if (rc) { task_lock(current); report_access("traceme", current, parent); task_unlock(current); } return rc; } static const struct lsm_id yama_lsmid = { .name = "yama", .id = LSM_ID_YAMA, }; static struct security_hook_list yama_hooks[] __ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, yama_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, yama_ptrace_traceme), LSM_HOOK_INIT(task_prctl, yama_task_prctl), LSM_HOOK_INIT(task_free, yama_task_free), }; #ifdef CONFIG_SYSCTL static int yama_dointvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table table_copy; if (write && !capable(CAP_SYS_PTRACE)) return -EPERM; /* Lock the max value if it ever gets set. */ table_copy = *table; if (*(int *)table_copy.data == *(int *)table_copy.extra2) table_copy.extra1 = table_copy.extra2; return proc_dointvec_minmax(&table_copy, write, buffer, lenp, ppos); } static int max_scope = YAMA_SCOPE_NO_ATTACH; static const struct ctl_table yama_sysctl_table[] = { { .procname = "ptrace_scope", .data = &ptrace_scope, .maxlen = sizeof(int), .mode = 0644, .proc_handler = yama_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &max_scope, }, }; static void __init yama_init_sysctl(void) { if (!register_sysctl("kernel/yama", yama_sysctl_table)) panic("Yama: sysctl registration failed.\n"); } #else static inline void yama_init_sysctl(void) { } #endif /* CONFIG_SYSCTL */ static int __init yama_init(void) { pr_info("Yama: becoming mindful.\n"); security_add_hooks(yama_hooks, ARRAY_SIZE(yama_hooks), &yama_lsmid); yama_init_sysctl(); return 0; } DEFINE_LSM(yama) = { .id = &yama_lsmid, .init = yama_init, };
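As a userspace usage illustration for the PR_SET_PTRACER handler above (not part of the kernel sources): under ptrace_scope = 1 (YAMA_SCOPE_RELATIONAL), a process may whitelist a single non-ancestor tracer, clear the exception with 0, or allow any tracer with PR_SET_PTRACER_ANY. The argv handling and the final pause() are purely illustrative.

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/prctl.h>

#ifndef PR_SET_PTRACER
#define PR_SET_PTRACER		0x59616d61	/* "Yama" */
#endif
#ifndef PR_SET_PTRACER_ANY
#define PR_SET_PTRACER_ANY	((unsigned long)-1)
#endif

int main(int argc, char *argv[])
{
	/* PID of the intended debugger/crash handler, e.g. from argv[1]. */
	unsigned long tracer = (argc > 1) ? strtoul(argv[1], NULL, 0) : 0;

	/*
	 * 0 clears any existing exception; PR_SET_PTRACER_ANY allows any
	 * process; otherwise only the registered tracer (or a descendant of
	 * it, per ptracer_exception_found() above) may PTRACE_ATTACH.
	 */
	if (prctl(PR_SET_PTRACER, tracer, 0, 0, 0) != 0) {
		perror("prctl(PR_SET_PTRACER)");
		return 1;
	}

	pause();	/* wait here while the whitelisted tracer attaches */
	return 0;
}

Note that raising ptrace_scope to 2 or 3 via /proc/sys/kernel/yama/ptrace_scope makes this exception moot: those scopes require CAP_SYS_PTRACE or forbid attach entirely, as implemented in yama_ptrace_access_check() above.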
// SPDX-License-Identifier: GPL-2.0
/*
 * Resizable virtual memory filesystem for Linux.
 *
 * Copyright (C) 2000 Linus Torvalds.
 *		 2000 Transmeta Corp.
 *		 2000-2001 Christoph Rohland
 *		 2000-2001 SAP AG
 *		 2002 Red Hat Inc.
 * Copyright (C) 2002-2011 Hugh Dickins.
 * Copyright (C) 2011 Google Inc.
 * Copyright (C) 2002-2005 VERITAS Software Corporation.
 * Copyright (C) 2004 Andi Kleen, SuSE Labs
 *
 * Extended attribute support for tmpfs:
 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *
 * tiny-shmem:
 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/vfs.h>
#include <linux/mount.h>
#include <linux/ramfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/fileattr.h>
#include <linux/filelock.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/sched/signal.h>
#include <linux/export.h>
#include <linux/shmem_fs.h>
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>
#include <linux/fs_parser.h>
#include <linux/swapfile.h>
#include <linux/iversion.h>
#include <linux/unicode.h>
#include "swap.h"

static struct vfsmount *shm_mnt __ro_after_init;

#ifdef CONFIG_SHMEM
/*
 * This virtual memory filesystem is heavily based on the ramfs. It
 * extends ramfs by the ability to use swap and honor resource limits
 * which makes it a completely usable filesystem.
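 *
 * A typical tmpfs instance (illustrative example; the mount point and
 * option values are not taken from this file) is created with:
 *
 *	mount -t tmpfs -o size=1G,huge=within_size tmpfs /mnt/tmp
 *
 * The size= and huge= options correspond to the block limits and huge
 * page policies implemented below.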
 */

#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/folio_batch.h>
#include <linux/percpu_counter.h>
#include <linux/falloc.h>
#include <linux/splice.h>
#include <linux/security.h>
#include <linux/leafops.h>
#include <linux/mempolicy.h>
#include <linux/namei.h>
#include <linux/ctype.h>
#include <linux/migrate.h>
#include <linux/highmem.h>
#include <linux/seq_file.h>
#include <linux/magic.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <uapi/linux/memfd.h>
#include <linux/rmap.h>
#include <linux/uuid.h>
#include <linux/quotaops.h>
#include <linux/rcupdate_wait.h>

#include <linux/uaccess.h>

#include "internal.h"

#define VM_ACCT(size)	(PAGE_ALIGN(size) >> PAGE_SHIFT)

/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

/* Pretend that one inode + its dentry occupy this much memory */
#define BOGO_INODE_SIZE 1024

/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128

/*
 * shmem_fallocate communicates with shmem_fault or shmem_writeout via
 * inode->i_private (with i_rwsem making sure that it has only one user at
 * a time): we would prefer not to enlarge the shmem inode just for that.
 */
struct shmem_falloc {
	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
	pgoff_t start;		/* start of range currently being fallocated */
	pgoff_t next;		/* the next page offset to be fallocated */
	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
	pgoff_t nr_unswapped;	/* how often writeout refused to swap out */
};

struct shmem_options {
	unsigned long long blocks;
	unsigned long long inodes;
	struct mempolicy *mpol;
	kuid_t uid;
	kgid_t gid;
	umode_t mode;
	bool full_inums;
	int huge;
	int seen;
	bool noswap;
	unsigned short quota_types;
	struct shmem_quota_limits qlimits;
#if IS_ENABLED(CONFIG_UNICODE)
	struct unicode_map *encoding;
	bool strict_encoding;
#endif
#define SHMEM_SEEN_BLOCKS 1
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
#define SHMEM_SEEN_INUMS 8
#define SHMEM_SEEN_QUOTA 16
};

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static unsigned long huge_shmem_orders_always __read_mostly;
static unsigned long huge_shmem_orders_madvise __read_mostly;
static unsigned long huge_shmem_orders_inherit __read_mostly;
static unsigned long huge_shmem_orders_within_size __read_mostly;
static bool shmem_orders_configured __initdata;
#endif

#ifdef CONFIG_TMPFS
static unsigned long shmem_default_max_blocks(void)
{
	return totalram_pages() / 2;
}

static unsigned long shmem_default_max_inodes(void)
{
	unsigned long nr_pages = totalram_pages();

	return min3(nr_pages - totalhigh_pages(), nr_pages / 2,
			ULONG_MAX / BOGO_INODE_SIZE);
}
#endif

static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
			struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
			struct vm_area_struct *vma, vm_fault_t *fault_type);

static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
	return sb->s_fs_info;
}

/*
 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 * for shared memory and for shared anonymous (/dev/zero) mappings
 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 * consistent with the pre-accounting of private mappings ...
 */
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
	return (flags & SHMEM_F_NORESERVE) ?
		0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
}

static inline void shmem_unacct_size(unsigned long flags, loff_t size)
{
	if (!(flags & SHMEM_F_NORESERVE))
		vm_unacct_memory(VM_ACCT(size));
}

static inline int shmem_reacct_size(unsigned long flags,
		loff_t oldsize, loff_t newsize)
{
	if (!(flags & SHMEM_F_NORESERVE)) {
		if (VM_ACCT(newsize) > VM_ACCT(oldsize))
			return security_vm_enough_memory_mm(current->mm,
					VM_ACCT(newsize) - VM_ACCT(oldsize));
		else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
			vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
	}
	return 0;
}

/*
 * ... whereas tmpfs objects are accounted incrementally as
 * pages are allocated, in order to allow large sparse files.
 * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM,
 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
 */
static inline int shmem_acct_blocks(unsigned long flags, long pages)
{
	if (!(flags & SHMEM_F_NORESERVE))
		return 0;

	return security_vm_enough_memory_mm(current->mm,
			pages * VM_ACCT(PAGE_SIZE));
}

static inline void shmem_unacct_blocks(unsigned long flags, long pages)
{
	if (flags & SHMEM_F_NORESERVE)
		vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
}

int shmem_inode_acct_blocks(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	int err = -ENOSPC;

	if (shmem_acct_blocks(info->flags, pages))
		return err;

	might_sleep();	/* when quotas */
	if (sbinfo->max_blocks) {
		if (!percpu_counter_limited_add(&sbinfo->used_blocks,
						sbinfo->max_blocks, pages))
			goto unacct;

		err = dquot_alloc_block_nodirty(inode, pages);
		if (err) {
			percpu_counter_sub(&sbinfo->used_blocks, pages);
			goto unacct;
		}
	} else {
		err = dquot_alloc_block_nodirty(inode, pages);
		if (err)
			goto unacct;
	}

	return 0;

unacct:
	shmem_unacct_blocks(info->flags, pages);
	return err;
}

static void shmem_inode_unacct_blocks(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);

	might_sleep();	/* when quotas */
	dquot_free_block_nodirty(inode, pages);

	if (sbinfo->max_blocks)
		percpu_counter_sub(&sbinfo->used_blocks, pages);
	shmem_unacct_blocks(info->flags, pages);
}

static const struct super_operations shmem_ops;
static const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
static const struct vm_operations_struct shmem_anon_vm_ops;
static struct file_system_type shmem_fs_type;

bool shmem_mapping(const struct address_space *mapping)
{
	return mapping->a_ops == &shmem_aops;
}
EXPORT_SYMBOL_GPL(shmem_mapping);

bool vma_is_anon_shmem(const struct vm_area_struct *vma)
{
	return vma->vm_ops == &shmem_anon_vm_ops;
}

bool vma_is_shmem(const struct vm_area_struct *vma)
{
	return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops;
}

static LIST_HEAD(shmem_swaplist);
static DEFINE_SPINLOCK(shmem_swaplist_lock);

#ifdef CONFIG_TMPFS_QUOTA

static int shmem_enable_quotas(struct super_block *sb,
			       unsigned short quota_types)
{
	int type, err = 0;

	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;
	for (type = 0; type < SHMEM_MAXQUOTAS; type++) {
		if (!(quota_types & (1 << type)))
			continue;
		err = dquot_load_quota_sb(sb, type, QFMT_SHMEM,
					  DQUOT_USAGE_ENABLED |
DQUOT_LIMITS_ENABLED); if (err) goto out_err; } return 0; out_err: pr_warn("tmpfs: failed to enable quota tracking (type=%d, err=%d)\n", type, err); for (type--; type >= 0; type--) dquot_quota_off(sb, type); return err; } static void shmem_disable_quotas(struct super_block *sb) { int type; for (type = 0; type < SHMEM_MAXQUOTAS; type++) dquot_quota_off(sb, type); } static struct dquot __rcu **shmem_get_dquots(struct inode *inode) { return SHMEM_I(inode)->i_dquot; } #endif /* CONFIG_TMPFS_QUOTA */ /* * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and * produces a novel ino for the newly allocated inode. * * It may also be called when making a hard link to permit the space needed by * each dentry. However, in that case, no new inode number is needed since that * internally draws from another pool of inode numbers (currently global * get_next_ino()). This case is indicated by passing NULL as inop. */ #define SHMEM_INO_BATCH 1024 static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); ino_t ino; if (!(sb->s_flags & SB_KERNMOUNT)) { raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->max_inodes) { if (sbinfo->free_ispace < BOGO_INODE_SIZE) { raw_spin_unlock(&sbinfo->stat_lock); return -ENOSPC; } sbinfo->free_ispace -= BOGO_INODE_SIZE; } if (inop) { ino = sbinfo->next_ino++; if (unlikely(is_zero_ino(ino))) ino = sbinfo->next_ino++; if (unlikely(!sbinfo->full_inums && ino > UINT_MAX)) { /* * Emulate get_next_ino uint wraparound for * compatibility */ if (IS_ENABLED(CONFIG_64BIT)) pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n", __func__, MINOR(sb->s_dev)); sbinfo->next_ino = 1; ino = sbinfo->next_ino++; } *inop = ino; } raw_spin_unlock(&sbinfo->stat_lock); } else if (inop) { /* * __shmem_file_setup, one of our callers, is lock-free: it * doesn't hold stat_lock in shmem_reserve_inode since * max_inodes is always 0, and is called from potentially * unknown contexts. As such, use a per-cpu batched allocator * which doesn't require the per-sb stat_lock unless we are at * the batch boundary. * * We don't need to worry about inode{32,64} since SB_KERNMOUNT * shmem mounts are not exposed to userspace, so we don't need * to worry about things like glibc compatibility. */ ino_t *next_ino; next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu()); ino = *next_ino; if (unlikely(ino % SHMEM_INO_BATCH == 0)) { raw_spin_lock(&sbinfo->stat_lock); ino = sbinfo->next_ino; sbinfo->next_ino += SHMEM_INO_BATCH; raw_spin_unlock(&sbinfo->stat_lock); if (unlikely(is_zero_ino(ino))) ino++; } *inop = ino; *next_ino = ++ino; put_cpu(); } return 0; } static void shmem_free_inode(struct super_block *sb, size_t freed_ispace) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); if (sbinfo->max_inodes) { raw_spin_lock(&sbinfo->stat_lock); sbinfo->free_ispace += BOGO_INODE_SIZE + freed_ispace; raw_spin_unlock(&sbinfo->stat_lock); } } /** * shmem_recalc_inode - recalculate the block usage of an inode * @inode: inode to recalc * @alloced: the change in number of pages allocated to inode * @swapped: the change in number of pages swapped from inode * * We have to calculate the free blocks since the mm can drop * undirtied hole pages behind our back. * * But normally info->alloced == inode->i_mapping->nrpages + info->swapped * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped) * * Return: true if swapped was incremented from 0, for shmem_writeout(). 
*/ bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped) { struct shmem_inode_info *info = SHMEM_I(inode); bool first_swapped = false; long freed; spin_lock(&info->lock); info->alloced += alloced; info->swapped += swapped; freed = info->alloced - info->swapped - READ_ONCE(inode->i_mapping->nrpages); /* * Special case: whereas normally shmem_recalc_inode() is called * after i_mapping->nrpages has already been adjusted (up or down), * shmem_writeout() has to raise swapped before nrpages is lowered - * to stop a racing shmem_recalc_inode() from thinking that a page has * been freed. Compensate here, to avoid the need for a followup call. */ if (swapped > 0) { if (info->swapped == swapped) first_swapped = true; freed += swapped; } if (freed > 0) info->alloced -= freed; spin_unlock(&info->lock); /* The quota case may block */ if (freed > 0) shmem_inode_unacct_blocks(inode, freed); return first_swapped; } bool shmem_charge(struct inode *inode, long pages) { struct address_space *mapping = inode->i_mapping; if (shmem_inode_acct_blocks(inode, pages)) return false; /* nrpages adjustment first, then shmem_recalc_inode() when balanced */ xa_lock_irq(&mapping->i_pages); mapping->nrpages += pages; xa_unlock_irq(&mapping->i_pages); shmem_recalc_inode(inode, pages, 0); return true; } void shmem_uncharge(struct inode *inode, long pages) { /* pages argument is currently unused: keep it to help debugging */ /* nrpages adjustment done by __filemap_remove_folio() or caller */ shmem_recalc_inode(inode, 0, 0); } /* * Replace item expected in xarray by a new item, while holding xa_lock. */ static int shmem_replace_entry(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { XA_STATE(xas, &mapping->i_pages, index); void *item; VM_BUG_ON(!expected); VM_BUG_ON(!replacement); item = xas_load(&xas); if (item != expected) return -ENOENT; xas_store(&xas, replacement); return 0; } /* * Sometimes, before we decide whether to proceed or to fail, we must check * that an entry was not already brought back or split by a racing thread. * * Checking folio is not enough: by the time a swapcache folio is locked, it * might be reused, and again be swapcache, using the same swap as before. * Returns the swap entry's order if it still presents, else returns -1. */ static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index, swp_entry_t swap) { XA_STATE(xas, &mapping->i_pages, index); int ret = -1; void *entry; rcu_read_lock(); do { entry = xas_load(&xas); if (entry == swp_to_radix_entry(swap)) ret = xas_get_order(&xas); } while (xas_retry(&xas, entry)); rcu_read_unlock(); return ret; } /* * Definitions for "huge tmpfs": tmpfs mounted with the huge= option * * SHMEM_HUGE_NEVER: * disables huge pages for the mount; * SHMEM_HUGE_ALWAYS: * enables huge pages for the mount; * SHMEM_HUGE_WITHIN_SIZE: * only allocate huge pages if the page will be fully within i_size, * also respect madvise() hints; * SHMEM_HUGE_ADVISE: * only allocate huge pages if requested with madvise(); */ #define SHMEM_HUGE_NEVER 0 #define SHMEM_HUGE_ALWAYS 1 #define SHMEM_HUGE_WITHIN_SIZE 2 #define SHMEM_HUGE_ADVISE 3 /* * Special values. 
* Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled: * * SHMEM_HUGE_DENY: * disables huge on shm_mnt and all mounts, for emergency use; * SHMEM_HUGE_FORCE: * enables huge on shm_mnt and all mounts, w/o needing option, for testing; * */ #define SHMEM_HUGE_DENY (-1) #define SHMEM_HUGE_FORCE (-2) #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* ifdef here to avoid bloating shmem.o when not necessary */ #if defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_NEVER) #define SHMEM_HUGE_DEFAULT SHMEM_HUGE_NEVER #elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ALWAYS) #define SHMEM_HUGE_DEFAULT SHMEM_HUGE_ALWAYS #elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_WITHIN_SIZE) #define SHMEM_HUGE_DEFAULT SHMEM_HUGE_WITHIN_SIZE #elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ADVISE) #define SHMEM_HUGE_DEFAULT SHMEM_HUGE_ADVISE #else #define SHMEM_HUGE_DEFAULT SHMEM_HUGE_NEVER #endif static int shmem_huge __read_mostly = SHMEM_HUGE_DEFAULT; #undef SHMEM_HUGE_DEFAULT #if defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_NEVER) #define TMPFS_HUGE_DEFAULT SHMEM_HUGE_NEVER #elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ALWAYS) #define TMPFS_HUGE_DEFAULT SHMEM_HUGE_ALWAYS #elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_WITHIN_SIZE) #define TMPFS_HUGE_DEFAULT SHMEM_HUGE_WITHIN_SIZE #elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ADVISE) #define TMPFS_HUGE_DEFAULT SHMEM_HUGE_ADVISE #else #define TMPFS_HUGE_DEFAULT SHMEM_HUGE_NEVER #endif static int tmpfs_huge __read_mostly = TMPFS_HUGE_DEFAULT; #undef TMPFS_HUGE_DEFAULT static unsigned int shmem_get_orders_within_size(struct inode *inode, unsigned long within_size_orders, pgoff_t index, loff_t write_end) { pgoff_t aligned_index; unsigned long order; loff_t i_size; order = highest_order(within_size_orders); while (within_size_orders) { aligned_index = round_up(index + 1, 1 << order); i_size = max(write_end, i_size_read(inode)); i_size = round_up(i_size, PAGE_SIZE); if (i_size >> PAGE_SHIFT >= aligned_index) return within_size_orders; order = next_order(&within_size_orders, order); } return 0; } static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, loff_t write_end, bool shmem_huge_force, struct vm_area_struct *vma, vm_flags_t vm_flags) { unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ? 0 : BIT(HPAGE_PMD_ORDER); unsigned long within_size_orders; if (!S_ISREG(inode->i_mode)) return 0; if (shmem_huge == SHMEM_HUGE_DENY) return 0; if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE) return maybe_pmd_order; /* * The huge order allocation for anon shmem is controlled through * the mTHP interface, so we still use PMD-sized huge order to * check whether global control is enabled. * * For tmpfs with 'huge=always' or 'huge=within_size' mount option, * we will always try PMD-sized order first. If that failed, it will * fall back to small large folios. 
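 *
 * Illustrative example of the 'within_size' rule implemented by
 * shmem_get_orders_within_size() above (assuming 4K base pages): for an
 * inode with i_size = 2M, index 0 rounds up to a 512-page (order-9)
 * extent that still lies within the 512-page i_size, so a PMD-sized
 * folio is allowed; with i_size = 1M the largest order that fits at
 * index 0 is order-8.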
*/ switch (SHMEM_SB(inode->i_sb)->huge) { case SHMEM_HUGE_ALWAYS: return THP_ORDERS_ALL_FILE_DEFAULT; case SHMEM_HUGE_WITHIN_SIZE: within_size_orders = shmem_get_orders_within_size(inode, THP_ORDERS_ALL_FILE_DEFAULT, index, write_end); if (within_size_orders > 0) return within_size_orders; fallthrough; case SHMEM_HUGE_ADVISE: if (vm_flags & VM_HUGEPAGE) return THP_ORDERS_ALL_FILE_DEFAULT; fallthrough; default: return 0; } } static int shmem_parse_huge(const char *str) { int huge; if (!str) return -EINVAL; if (!strcmp(str, "never")) huge = SHMEM_HUGE_NEVER; else if (!strcmp(str, "always")) huge = SHMEM_HUGE_ALWAYS; else if (!strcmp(str, "within_size")) huge = SHMEM_HUGE_WITHIN_SIZE; else if (!strcmp(str, "advise")) huge = SHMEM_HUGE_ADVISE; else if (!strcmp(str, "deny")) huge = SHMEM_HUGE_DENY; else if (!strcmp(str, "force")) huge = SHMEM_HUGE_FORCE; else return -EINVAL; if (!has_transparent_hugepage() && huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY) return -EINVAL; /* Do not override huge allocation policy with non-PMD sized mTHP */ if (huge == SHMEM_HUGE_FORCE && huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER)) return -EINVAL; return huge; } #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) static const char *shmem_format_huge(int huge) { switch (huge) { case SHMEM_HUGE_NEVER: return "never"; case SHMEM_HUGE_ALWAYS: return "always"; case SHMEM_HUGE_WITHIN_SIZE: return "within_size"; case SHMEM_HUGE_ADVISE: return "advise"; case SHMEM_HUGE_DENY: return "deny"; case SHMEM_HUGE_FORCE: return "force"; default: VM_BUG_ON(1); return "bad_val"; } } #endif static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_free) { LIST_HEAD(list), *pos, *next; struct inode *inode; struct shmem_inode_info *info; struct folio *folio; unsigned long batch = sc ? sc->nr_to_scan : 128; unsigned long split = 0, freed = 0; if (list_empty(&sbinfo->shrinklist)) return SHRINK_STOP; spin_lock(&sbinfo->shrinklist_lock); list_for_each_safe(pos, next, &sbinfo->shrinklist) { info = list_entry(pos, struct shmem_inode_info, shrinklist); /* pin the inode */ inode = igrab(&info->vfs_inode); /* inode is about to be evicted */ if (!inode) { list_del_init(&info->shrinklist); goto next; } list_move(&info->shrinklist, &list); next: sbinfo->shrinklist_len--; if (!--batch) break; } spin_unlock(&sbinfo->shrinklist_lock); list_for_each_safe(pos, next, &list) { pgoff_t next, end; loff_t i_size; int ret; info = list_entry(pos, struct shmem_inode_info, shrinklist); inode = &info->vfs_inode; if (nr_to_free && freed >= nr_to_free) goto move_back; i_size = i_size_read(inode); folio = filemap_get_entry(inode->i_mapping, i_size / PAGE_SIZE); if (!folio || xa_is_value(folio)) goto drop; /* No large folio at the end of the file: nothing to split */ if (!folio_test_large(folio)) { folio_put(folio); goto drop; } /* Check if there is anything to gain from splitting */ next = folio_next_index(folio); end = shmem_fallocend(inode, DIV_ROUND_UP(i_size, PAGE_SIZE)); if (end <= folio->index || end >= next) { folio_put(folio); goto drop; } /* * Move the inode on the list back to shrinklist if we failed * to lock the page at this time. * * Waiting for the lock may lead to deadlock in the * reclaim path. 
*/ if (!folio_trylock(folio)) { folio_put(folio); goto move_back; } ret = split_folio(folio); folio_unlock(folio); folio_put(folio); /* If split failed move the inode on the list back to shrinklist */ if (ret) goto move_back; freed += next - end; split++; drop: list_del_init(&info->shrinklist); goto put; move_back: /* * Make sure the inode is either on the global list or deleted * from any local list before iput() since it could be deleted * in another thread once we put the inode (then the local list * is corrupted). */ spin_lock(&sbinfo->shrinklist_lock); list_move(&info->shrinklist, &sbinfo->shrinklist); sbinfo->shrinklist_len++; spin_unlock(&sbinfo->shrinklist_lock); put: iput(inode); } return split; } static long shmem_unused_huge_scan(struct super_block *sb, struct shrink_control *sc) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); if (!READ_ONCE(sbinfo->shrinklist_len)) return SHRINK_STOP; return shmem_unused_huge_shrink(sbinfo, sc, 0); } static long shmem_unused_huge_count(struct super_block *sb, struct shrink_control *sc) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); return READ_ONCE(sbinfo->shrinklist_len); } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ #define shmem_huge SHMEM_HUGE_DENY static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_free) { return 0; } static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, loff_t write_end, bool shmem_huge_force, struct vm_area_struct *vma, vm_flags_t vm_flags) { return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static void shmem_update_stats(struct folio *folio, int nr_pages) { if (folio_test_pmd_mappable(folio)) lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages); lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages); } /* * Somewhat like filemap_add_folio, but error if expected item has gone. */ int shmem_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t index, void *expected, gfp_t gfp) { XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); unsigned long nr = folio_nr_pages(folio); swp_entry_t iter, swap; void *entry; VM_BUG_ON_FOLIO(index != round_down(index, nr), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); folio_ref_add(folio, nr); folio->mapping = mapping; folio->index = index; gfp &= GFP_RECLAIM_MASK; folio_throttle_swaprate(folio, gfp); swap = radix_to_swp_entry(expected); do { iter = swap; xas_lock_irq(&xas); xas_for_each_conflict(&xas, entry) { /* * The range must either be empty, or filled with * expected swap entries. Shmem swap entries are never * partially freed without split of both entry and * folio, so there shouldn't be any holes. */ if (!expected || entry != swp_to_radix_entry(iter)) { xas_set_err(&xas, -EEXIST); goto unlock; } iter.val += 1 << xas_get_order(&xas); } if (expected && iter.val - nr != swap.val) { xas_set_err(&xas, -EEXIST); goto unlock; } xas_store(&xas, folio); if (xas_error(&xas)) goto unlock; shmem_update_stats(folio, nr); mapping->nrpages += nr; unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); if (xas_error(&xas)) { folio->mapping = NULL; folio_ref_sub(folio, nr); return xas_error(&xas); } return 0; } /* * Somewhat like filemap_remove_folio, but substitutes swap for @folio. 
*/ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) { struct address_space *mapping = folio->mapping; long nr = folio_nr_pages(folio); int error; xa_lock_irq(&mapping->i_pages); error = shmem_replace_entry(mapping, folio->index, folio, radswap); folio->mapping = NULL; mapping->nrpages -= nr; shmem_update_stats(folio, -nr); xa_unlock_irq(&mapping->i_pages); folio_put_refs(folio, nr); BUG_ON(error); } /* * Remove swap entry from page cache, free the swap and its page cache. Returns * the number of pages being freed. 0 means entry not found in XArray (0 pages * being freed). */ static long shmem_free_swap(struct address_space *mapping, pgoff_t index, pgoff_t end, void *radswap) { XA_STATE(xas, &mapping->i_pages, index); unsigned int nr_pages = 0; pgoff_t base; void *entry; xas_lock_irq(&xas); entry = xas_load(&xas); if (entry == radswap) { nr_pages = 1 << xas_get_order(&xas); base = round_down(xas.xa_index, nr_pages); if (base < index || base + nr_pages - 1 > end) nr_pages = 0; else xas_store(&xas, NULL); } xas_unlock_irq(&xas); if (nr_pages) swap_put_entries_direct(radix_to_swp_entry(radswap), nr_pages); return nr_pages; } /* * Determine (in bytes) how many of the shmem object's pages mapped by the * given offsets are swapped out. * * This is safe to call without i_rwsem or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end) { XA_STATE(xas, &mapping->i_pages, start); struct folio *folio; unsigned long swapped = 0; unsigned long max = end - 1; rcu_read_lock(); xas_for_each(&xas, folio, max) { if (xas_retry(&xas, folio)) continue; if (xa_is_value(folio)) swapped += 1 << xas_get_order(&xas); if (xas.xa_index == max) break; if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); } } rcu_read_unlock(); return swapped << PAGE_SHIFT; } /* * Determine (in bytes) how many of the shmem object's pages mapped by the * given vma is swapped out. * * This is safe to call without i_rwsem or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ unsigned long shmem_swap_usage(struct vm_area_struct *vma) { struct inode *inode = file_inode(vma->vm_file); struct shmem_inode_info *info = SHMEM_I(inode); struct address_space *mapping = inode->i_mapping; unsigned long swapped; /* Be careful as we don't hold info->lock */ swapped = READ_ONCE(info->swapped); /* * The easier cases are when the shmem object has nothing in swap, or * the vma maps it whole. Then we can simply use the stats that we * already track. */ if (!swapped) return 0; if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size) return swapped << PAGE_SHIFT; /* Here comes the more involved part */ return shmem_partial_swap_usage(mapping, vma->vm_pgoff, vma->vm_pgoff + vma_pages(vma)); } /* * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists. */ void shmem_unlock_mapping(struct address_space *mapping) { struct folio_batch fbatch; pgoff_t index = 0; folio_batch_init(&fbatch); /* * Minor point, but we might as well stop if someone else SHM_LOCKs it. 
*/ while (!mapping_unevictable(mapping) && filemap_get_folios(mapping, &index, ~0UL, &fbatch)) { check_move_unevictable_folios(&fbatch); folio_batch_release(&fbatch); cond_resched(); } } static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) { struct folio *folio; /* * At first avoid shmem_get_folio(,,,SGP_READ): that fails * beyond i_size, and reports fallocated folios as holes. */ folio = filemap_get_entry(inode->i_mapping, index); if (!folio) return folio; if (!xa_is_value(folio)) { folio_lock(folio); if (folio->mapping == inode->i_mapping) return folio; /* The folio has been swapped out */ folio_unlock(folio); folio_put(folio); } /* * But read a folio back from swap if any of it is within i_size * (although in some cases this is just a waste of time). */ folio = NULL; shmem_get_folio(inode, index, 0, &folio, SGP_READ); return folio; } /* * Remove range of pages and swap entries from page cache, and free them. * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. */ static void shmem_undo_range(struct inode *inode, loff_t lstart, uoff_t lend, bool unfalloc) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; pgoff_t end = (lend + 1) >> PAGE_SHIFT; struct folio_batch fbatch; pgoff_t indices[FOLIO_BATCH_SIZE]; struct folio *folio; bool same_folio; long nr_swaps_freed = 0; pgoff_t index; int i; if (lend == -1) end = -1; /* unsigned, so actually very big */ if (info->fallocend > start && info->fallocend <= end && !unfalloc) info->fallocend = start; folio_batch_init(&fbatch); index = start; while (index < end && find_lock_entries(mapping, &index, end - 1, &fbatch, indices)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { folio = fbatch.folios[i]; if (xa_is_value(folio)) { if (unfalloc) continue; nr_swaps_freed += shmem_free_swap(mapping, indices[i], end - 1, folio); continue; } if (!unfalloc || !folio_test_uptodate(folio)) truncate_inode_folio(mapping, folio); folio_unlock(folio); } folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); cond_resched(); } /* * When undoing a failed fallocate, we want none of the partial folio * zeroing and splitting below, but shall want to truncate the whole * folio when !uptodate indicates that it was added by this fallocate, * even when [lstart, lend] covers only a part of the folio. 
*/ if (unfalloc) goto whole_folios; same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT); if (folio) { same_folio = lend < folio_next_pos(folio); folio_mark_dirty(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) { start = folio_next_index(folio); if (same_folio) end = folio->index; } folio_unlock(folio); folio_put(folio); folio = NULL; } if (!same_folio) folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT); if (folio) { folio_mark_dirty(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) end = folio->index; folio_unlock(folio); folio_put(folio); } whole_folios: index = start; while (index < end) { cond_resched(); if (!find_get_entries(mapping, &index, end - 1, &fbatch, indices)) { /* If all gone or hole-punch or unfalloc, we're done */ if (index == start || end != -1) break; /* But if truncating, restart to make sure all gone */ index = start; continue; } for (i = 0; i < folio_batch_count(&fbatch); i++) { folio = fbatch.folios[i]; if (xa_is_value(folio)) { int order; long swaps_freed; if (unfalloc) continue; swaps_freed = shmem_free_swap(mapping, indices[i], end - 1, folio); if (!swaps_freed) { pgoff_t base = indices[i]; order = shmem_confirm_swap(mapping, indices[i], radix_to_swp_entry(folio)); /* * If found a large swap entry cross the end or start * border, skip it as the truncate_inode_partial_folio * above should have at least zerod its content once. */ if (order > 0) { base = round_down(base, 1 << order); if (base < start || base + (1 << order) > end) continue; } /* Swap was replaced by page or extended, retry */ index = base; break; } nr_swaps_freed += swaps_freed; continue; } folio_lock(folio); if (!unfalloc || !folio_test_uptodate(folio)) { if (folio_mapping(folio) != mapping) { /* Page was replaced by swap: retry */ folio_unlock(folio); index = indices[i]; break; } VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); if (!folio_test_large(folio)) { truncate_inode_folio(mapping, folio); } else if (truncate_inode_partial_folio(folio, lstart, lend)) { /* * If we split a page, reset the loop so * that we pick up the new sub pages. * Otherwise the THP was entirely * dropped or the target range was * zeroed, so just continue the loop as * is. 
*/ if (!folio_test_large(folio)) { folio_unlock(folio); index = start; break; } } } folio_unlock(folio); } folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); } shmem_recalc_inode(inode, 0, -nr_swaps_freed); } void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend) { shmem_undo_range(inode, lstart, lend, false); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); inode_inc_iversion(inode); } EXPORT_SYMBOL_GPL(shmem_truncate_range); static int shmem_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = path->dentry->d_inode; struct shmem_inode_info *info = SHMEM_I(inode); if (info->alloced - info->swapped != inode->i_mapping->nrpages) shmem_recalc_inode(inode, 0, 0); if (info->fsflags & FS_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; if (info->fsflags & FS_IMMUTABLE_FL) stat->attributes |= STATX_ATTR_IMMUTABLE; if (info->fsflags & FS_NODUMP_FL) stat->attributes |= STATX_ATTR_NODUMP; stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); generic_fillattr(idmap, request_mask, inode, stat); if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0)) stat->blksize = HPAGE_PMD_SIZE; if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = info->i_crtime.tv_sec; stat->btime.tv_nsec = info->i_crtime.tv_nsec; } return 0; } static int shmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); int error; bool update_mtime = false; bool update_ctime = true; error = setattr_prepare(idmap, dentry, attr); if (error) return error; if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) { if ((inode->i_mode ^ attr->ia_mode) & 0111) { return -EPERM; } } if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; /* protected by i_rwsem */ if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || (newsize > oldsize && (info->seals & F_SEAL_GROW))) return -EPERM; if (newsize != oldsize) { if (info->flags & SHMEM_F_MAPPING_FROZEN) return -EPERM; error = shmem_reacct_size(SHMEM_I(inode)->flags, oldsize, newsize); if (error) return error; i_size_write(inode, newsize); update_mtime = true; } else { update_ctime = false; } if (newsize <= oldsize) { loff_t holebegin = round_up(newsize, PAGE_SIZE); if (oldsize > holebegin) unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); if (info->alloced) shmem_truncate_range(inode, newsize, (loff_t)-1); /* unmap again to remove racily COWed private pages */ if (oldsize > holebegin) unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); } } if (is_quota_modification(idmap, inode, attr)) { error = dquot_initialize(inode); if (error) return error; } /* Transfer quota accounting */ if (i_uid_needs_update(idmap, attr, inode) || i_gid_needs_update(idmap, attr, inode)) { error = dquot_transfer(idmap, inode, attr); if (error) return error; } setattr_copy(idmap, inode, attr); if (attr->ia_valid & ATTR_MODE) error = posix_acl_chmod(idmap, dentry, inode->i_mode); if (!error && update_ctime) { inode_set_ctime_current(inode); if (update_mtime) inode_set_mtime_to_ts(inode, inode_get_ctime(inode)); inode_inc_iversion(inode); } return error; } static void shmem_evict_inode(struct inode *inode) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 
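	/*
	 * Teardown order below: unaccount the pre-accounted size, truncate
	 * away all pages and swap entries, drop the inode from the huge
	 * page shrinklist, then wait for any concurrent shmem_unuse() scan
	 * to finish before unhooking the inode from the swaplist and
	 * freeing its xattrs and reserved inode space.
	 */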
size_t freed = 0; if (shmem_mapping(inode->i_mapping)) { shmem_unacct_size(info->flags, inode->i_size); inode->i_size = 0; mapping_set_exiting(inode->i_mapping); shmem_truncate_range(inode, 0, (loff_t)-1); if (!list_empty(&info->shrinklist)) { spin_lock(&sbinfo->shrinklist_lock); if (!list_empty(&info->shrinklist)) { list_del_init(&info->shrinklist); sbinfo->shrinklist_len--; } spin_unlock(&sbinfo->shrinklist_lock); } while (!list_empty(&info->swaplist)) { /* Wait while shmem_unuse() is scanning this inode... */ wait_var_event(&info->stop_eviction, !atomic_read(&info->stop_eviction)); spin_lock(&shmem_swaplist_lock); /* ...but beware of the race if we peeked too early */ if (!atomic_read(&info->stop_eviction)) list_del_init(&info->swaplist); spin_unlock(&shmem_swaplist_lock); } } if (info->xattrs) { simple_xattrs_free(info->xattrs, sbinfo->max_inodes ? &freed : NULL); kfree(info->xattrs); } shmem_free_inode(inode->i_sb, freed); WARN_ON(inode->i_blocks); clear_inode(inode); #ifdef CONFIG_TMPFS_QUOTA dquot_free_inode(inode); dquot_drop(inode); #endif } static unsigned int shmem_find_swap_entries(struct address_space *mapping, pgoff_t start, struct folio_batch *fbatch, pgoff_t *indices, unsigned int type) { XA_STATE(xas, &mapping->i_pages, start); struct folio *folio; swp_entry_t entry; rcu_read_lock(); xas_for_each(&xas, folio, ULONG_MAX) { if (xas_retry(&xas, folio)) continue; if (!xa_is_value(folio)) continue; entry = radix_to_swp_entry(folio); /* * swapin error entries can be found in the mapping. But they're * deliberately ignored here as we've done everything we can do. */ if (swp_type(entry) != type) continue; indices[folio_batch_count(fbatch)] = xas.xa_index; if (!folio_batch_add(fbatch, folio)) break; if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); } } rcu_read_unlock(); return folio_batch_count(fbatch); } /* * Move the swapped pages for an inode to page cache. Returns the count * of pages swapped in, or the error in case of failure. */ static int shmem_unuse_swap_entries(struct inode *inode, struct folio_batch *fbatch, pgoff_t *indices) { int i = 0; int ret = 0; int error = 0; struct address_space *mapping = inode->i_mapping; for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE, mapping_gfp_mask(mapping), NULL, NULL); if (error == 0) { folio_unlock(folio); folio_put(folio); ret++; } if (error == -ENOMEM) break; error = 0; } return error ? error : ret; } /* * If swap found in inode, free it and move page from swapcache to filecache. */ static int shmem_unuse_inode(struct inode *inode, unsigned int type) { struct address_space *mapping = inode->i_mapping; pgoff_t start = 0; struct folio_batch fbatch; pgoff_t indices[FOLIO_BATCH_SIZE]; int ret = 0; do { folio_batch_init(&fbatch); if (!shmem_find_swap_entries(mapping, start, &fbatch, indices, type)) { ret = 0; break; } ret = shmem_unuse_swap_entries(inode, &fbatch, indices); if (ret < 0) break; start = indices[folio_batch_count(&fbatch) - 1]; } while (true); return ret; } /* * Read all the shared memory data that resides in the swap * device 'type' back into memory, so the swap device can be * unused. 
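 *
 * (This is the shmem side of swapoff: try_to_unuse() in mm/swapfile.c
 * calls here so that every tmpfs page still resident on the device is
 * brought back to the page cache before the device goes away.)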
*/ int shmem_unuse(unsigned int type) { struct shmem_inode_info *info, *next; int error = 0; if (list_empty(&shmem_swaplist)) return 0; spin_lock(&shmem_swaplist_lock); start_over: list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { if (!info->swapped) { list_del_init(&info->swaplist); continue; } /* * Drop the swaplist mutex while searching the inode for swap; * but before doing so, make sure shmem_evict_inode() will not * remove placeholder inode from swaplist, nor let it be freed * (igrab() would protect from unlink, but not from unmount). */ atomic_inc(&info->stop_eviction); spin_unlock(&shmem_swaplist_lock); error = shmem_unuse_inode(&info->vfs_inode, type); cond_resched(); spin_lock(&shmem_swaplist_lock); if (atomic_dec_and_test(&info->stop_eviction)) wake_up_var(&info->stop_eviction); if (error) break; if (list_empty(&info->swaplist)) goto start_over; next = list_next_entry(info, swaplist); if (!info->swapped) list_del_init(&info->swaplist); } spin_unlock(&shmem_swaplist_lock); return error; } /** * shmem_writeout - Write the folio to swap * @folio: The folio to write * @plug: swap plug * @folio_list: list to put back folios on split * * Move the folio from the page cache to the swap cache. */ int shmem_writeout(struct folio *folio, struct swap_iocb **plug, struct list_head *folio_list) { struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); pgoff_t index; int nr_pages; bool split = false; if ((info->flags & SHMEM_F_LOCKED) || sbinfo->noswap) goto redirty; if (!total_swap_pages) goto redirty; /* * If CONFIG_THP_SWAP is not enabled, the large folio should be * split when swapping. * * And shrinkage of pages beyond i_size does not split swap, so * swapout of a large folio crossing i_size needs to split too * (unless fallocate has been used to preallocate beyond EOF). */ if (folio_test_large(folio)) { index = shmem_fallocend(inode, DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE)); if ((index > folio->index && index < folio_next_index(folio)) || !IS_ENABLED(CONFIG_THP_SWAP)) split = true; } if (split) { int order; try_split: order = folio_order(folio); /* Ensure the subpages are still dirty */ folio_test_set_dirty(folio); if (split_folio_to_list(folio, folio_list)) goto redirty; #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (order >= HPAGE_PMD_ORDER) { count_memcg_folio_events(folio, THP_SWPOUT_FALLBACK, 1); count_vm_event(THP_SWPOUT_FALLBACK); } #endif count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); folio_clear_dirty(folio); } index = folio->index; nr_pages = folio_nr_pages(folio); /* * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC * value into swapfile.c, the only way we can correctly account for a * fallocated folio arriving here is now to initialize it and write it. * * That's okay for a folio already fallocated earlier, but if we have * not yet completed the fallocation, then (a) we want to keep track * of this folio in case we have to undo it, and (b) it may not be a * good idea to continue anyway, once we're pushing into swap. So * reactivate the folio, and let shmem_fallocate() quit when too many. 
*/ if (!folio_test_uptodate(folio)) { if (inode->i_private) { struct shmem_falloc *shmem_falloc; spin_lock(&inode->i_lock); shmem_falloc = inode->i_private; if (shmem_falloc && !shmem_falloc->waitq && index >= shmem_falloc->start && index < shmem_falloc->next) shmem_falloc->nr_unswapped += nr_pages; else shmem_falloc = NULL; spin_unlock(&inode->i_lock); if (shmem_falloc) goto redirty; } folio_zero_range(folio, 0, folio_size(folio)); flush_dcache_folio(folio); folio_mark_uptodate(folio); } if (!folio_alloc_swap(folio)) { bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages); int error; /* * Add inode to shmem_unuse()'s list of swapped-out inodes, * if it's not already there. Do it now before the folio is * removed from page cache, when its pagelock no longer * protects the inode from eviction. And do it now, after * we've incremented swapped, because shmem_unuse() will * prune a !swapped inode from the swaplist. */ if (first_swapped) { spin_lock(&shmem_swaplist_lock); if (list_empty(&info->swaplist)) list_add(&info->swaplist, &shmem_swaplist); spin_unlock(&shmem_swaplist_lock); } folio_dup_swap(folio, NULL); shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap)); BUG_ON(folio_mapped(folio)); error = swap_writeout(folio, plug); if (error != AOP_WRITEPAGE_ACTIVATE) { /* folio has been unlocked */ return error; } /* * The intention here is to avoid holding on to the swap when * zswap was unable to compress and unable to writeback; but * it will be appropriate if other reactivate cases are added. */ error = shmem_add_to_page_cache(folio, mapping, index, swp_to_radix_entry(folio->swap), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); /* Swap entry might be erased by racing shmem_free_swap() */ if (!error) { shmem_recalc_inode(inode, 0, -nr_pages); folio_put_swap(folio, NULL); } /* * The swap_cache_del_folio() below could be left for * shrink_folio_list()'s folio_free_swap() to dispose of; * but I'm a little nervous about letting this folio out of * shmem_writeout() in a hybrid half-tmpfs-half-swap state * e.g. folio_mapping(folio) might give an unexpected answer. 
*/ swap_cache_del_folio(folio); goto redirty; } if (nr_pages > 1) goto try_split; redirty: folio_mark_dirty(folio); return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */ } EXPORT_SYMBOL_GPL(shmem_writeout); #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) { char buffer[64]; if (!mpol || mpol->mode == MPOL_DEFAULT) return; /* show nothing */ mpol_to_str(buffer, sizeof(buffer), mpol); seq_printf(seq, ",mpol=%s", buffer); } static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) { struct mempolicy *mpol = NULL; if (sbinfo->mpol) { raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ mpol = sbinfo->mpol; mpol_get(mpol); raw_spin_unlock(&sbinfo->stat_lock); } return mpol; } #else /* !CONFIG_NUMA || !CONFIG_TMPFS */ static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) { } static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) { return NULL; } #endif /* CONFIG_NUMA && CONFIG_TMPFS */ static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, pgoff_t index, unsigned int order, pgoff_t *ilx); static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct mempolicy *mpol; pgoff_t ilx; struct folio *folio; mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); folio = swap_cluster_readahead(swap, gfp, mpol, ilx); mpol_cond_put(mpol); return folio; } /* * Make sure huge_gfp is always more limited than limit_gfp. * Some of the flags set permissions, while others set limitations. */ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) { gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM; gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY; gfp_t zoneflags = limit_gfp & GFP_ZONEMASK; gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK); /* Allow allocations only from the originally specified zones. */ result |= zoneflags; /* * Minimize the result gfp by taking the union with the deny flags, * and the intersection of the allow flags. */ result |= (limit_gfp & denyflags); result |= (huge_gfp & limit_gfp) & allowflags; return result; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE bool shmem_hpage_pmd_enabled(void) { if (shmem_huge == SHMEM_HUGE_DENY) return false; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_always)) return true; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_madvise)) return true; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_within_size)) return true; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_inherit) && shmem_huge != SHMEM_HUGE_NEVER) return true; return false; } unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, loff_t write_end, bool shmem_huge_force) { unsigned long mask = READ_ONCE(huge_shmem_orders_always); unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); vm_flags_t vm_flags = vma ? vma->vm_flags : 0; unsigned int global_orders; if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags, shmem_huge_force))) return 0; global_orders = shmem_huge_global_enabled(inode, index, write_end, shmem_huge_force, vma, vm_flags); /* Tmpfs huge pages allocation */ if (!vma || !vma_is_anon_shmem(vma)) return global_orders; /* * Following the 'deny' semantics of the top level, force the huge * option off from all mounts. 
*/ if (shmem_huge == SHMEM_HUGE_DENY) return 0; /* * Only allow inherit orders if the top-level value is 'force', which * means non-PMD sized THP can not override 'huge' mount option now. */ if (shmem_huge == SHMEM_HUGE_FORCE) return READ_ONCE(huge_shmem_orders_inherit); /* Allow mTHP that will be fully within i_size. */ mask |= shmem_get_orders_within_size(inode, within_size_orders, index, 0); if (vm_flags & VM_HUGEPAGE) mask |= READ_ONCE(huge_shmem_orders_madvise); if (global_orders > 0) mask |= READ_ONCE(huge_shmem_orders_inherit); return THP_ORDERS_ALL_FILE_DEFAULT & mask; } static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, struct address_space *mapping, pgoff_t index, unsigned long orders) { struct vm_area_struct *vma = vmf ? vmf->vma : NULL; pgoff_t aligned_index; unsigned long pages; int order; if (vma) { orders = thp_vma_suitable_orders(vma, vmf->address, orders); if (!orders) return 0; } /* Find the highest order that can add into the page cache */ order = highest_order(orders); while (orders) { pages = 1UL << order; aligned_index = round_down(index, pages); /* * Check for conflict before waiting on a huge allocation. * Conflict might be that a huge page has just been allocated * and added to page cache by a racing thread, or that there * is already at least one small page in the huge extent. * Be careful to retry when appropriate, but not forever! * Elsewhere -EEXIST would be the right code, but not here. */ if (!xa_find(&mapping->i_pages, &aligned_index, aligned_index + pages - 1, XA_PRESENT)) break; order = next_order(&orders, order); } return orders; } #else static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, struct address_space *mapping, pgoff_t index, unsigned long orders) { return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static struct folio *shmem_alloc_folio(gfp_t gfp, int order, struct shmem_inode_info *info, pgoff_t index) { struct mempolicy *mpol; pgoff_t ilx; struct folio *folio; mpol = shmem_get_pgoff_policy(info, index, order, &ilx); folio = folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id()); mpol_cond_put(mpol); return folio; } static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, gfp_t gfp, struct inode *inode, pgoff_t index, struct mm_struct *fault_mm, unsigned long orders) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); unsigned long suitable_orders = 0; struct folio *folio = NULL; pgoff_t aligned_index; long pages; int error, order; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) orders = 0; if (orders > 0) { suitable_orders = shmem_suitable_orders(inode, vmf, mapping, index, orders); order = highest_order(suitable_orders); while (suitable_orders) { pages = 1UL << order; aligned_index = round_down(index, pages); folio = shmem_alloc_folio(gfp, order, info, aligned_index); if (folio) { index = aligned_index; goto allocated; } if (pages == HPAGE_PMD_NR) count_vm_event(THP_FILE_FALLBACK); count_mthp_stat(order, MTHP_STAT_SHMEM_FALLBACK); order = next_order(&suitable_orders, order); } } else { pages = 1; folio = shmem_alloc_folio(gfp, 0, info, index); } if (!folio) return ERR_PTR(-ENOMEM); allocated: __folio_set_locked(folio); __folio_set_swapbacked(folio); gfp &= GFP_RECLAIM_MASK; error = mem_cgroup_charge(folio, fault_mm, gfp); if (error) { if (xa_find(&mapping->i_pages, &index, index + pages - 1, XA_PRESENT)) { error = -EEXIST; } else if (pages > 1) { if (pages == HPAGE_PMD_NR) { count_vm_event(THP_FILE_FALLBACK); 
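				/* the folio itself was allocated; only the memcg charge failed */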
count_vm_event(THP_FILE_FALLBACK_CHARGE); } count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK); count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK_CHARGE); } goto unlock; } error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp); if (error) goto unlock; error = shmem_inode_acct_blocks(inode, pages); if (error) { struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); long freed; /* * Try to reclaim some space by splitting a few * large folios beyond i_size on the filesystem. */ shmem_unused_huge_shrink(sbinfo, NULL, pages); /* * And do a shmem_recalc_inode() to account for freed pages: * except our folio is there in cache, so not quite balanced. */ spin_lock(&info->lock); freed = pages + info->alloced - info->swapped - READ_ONCE(mapping->nrpages); if (freed > 0) info->alloced -= freed; spin_unlock(&info->lock); if (freed > 0) shmem_inode_unacct_blocks(inode, freed); error = shmem_inode_acct_blocks(inode, pages); if (error) { filemap_remove_folio(folio); goto unlock; } } shmem_recalc_inode(inode, pages, 0); folio_add_lru(folio); return folio; unlock: folio_unlock(folio); folio_put(folio); return ERR_PTR(error); } static struct folio *shmem_swap_alloc_folio(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, swp_entry_t entry, int order, gfp_t gfp) { struct shmem_inode_info *info = SHMEM_I(inode); struct folio *new, *swapcache; int nr_pages = 1 << order; gfp_t alloc_gfp = gfp; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (WARN_ON_ONCE(order)) return ERR_PTR(-EINVAL); } else if (order) { /* * If uffd is active for the vma, we need per-page fault * fidelity to maintain the uffd semantics, then fallback * to swapin order-0 folio, as well as for zswap case. * Any existing sub folio in the swap cache also blocks * mTHP swapin. */ if ((vma && unlikely(userfaultfd_armed(vma))) || !zswap_never_enabled() || non_swapcache_batch(entry, nr_pages) != nr_pages) goto fallback; alloc_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp); } retry: new = shmem_alloc_folio(alloc_gfp, order, info, index); if (!new) { new = ERR_PTR(-ENOMEM); goto fallback; } if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL, alloc_gfp, entry)) { folio_put(new); new = ERR_PTR(-ENOMEM); goto fallback; } swapcache = swapin_folio(entry, new); if (swapcache != new) { folio_put(new); if (!swapcache) { /* * The new folio is charged already, swapin can * only fail due to another raced swapin. */ new = ERR_PTR(-EEXIST); goto fallback; } } return swapcache; fallback: /* Order 0 swapin failed, nothing to fallback to, abort */ if (!order) return new; entry.val += index - round_down(index, nr_pages); alloc_gfp = gfp; nr_pages = 1; order = 0; goto retry; } /* * When a page is moved from swapcache to shmem filecache (either by the * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of * shmem_unuse_inode()), it may have been read in earlier from swap, in * ignorance of the mapping it belongs to. If that mapping has special * constraints (like the gma500 GEM driver, which requires RAM below 4GB), * we may need to copy to a suitable page before moving to filecache. * * In a future release, this may well be extended to respect cpuset and * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); * but for now it is a simple matter of zone. 
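 *
 * Illustrative example: if the mapping's gfp mask asks for __GFP_DMA32
 * memory but the folio read from swap sits in ZONE_NORMAL, then
 * folio_zonenum(folio) exceeds gfp_zone(gfp), shmem_should_replace_folio()
 * returns true, and the contents are copied into a folio allocated from a
 * suitable zone.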
 */
static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
{
	return folio_zonenum(folio) > gfp_zone(gfp);
}

static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
				struct shmem_inode_info *info, pgoff_t index,
				struct vm_area_struct *vma)
{
	struct swap_cluster_info *ci;
	struct folio *new, *old = *foliop;
	swp_entry_t entry = old->swap;
	int nr_pages = folio_nr_pages(old);
	int error = 0;

	/*
	 * We have arrived here because our zones are constrained, so don't
	 * limit chance of success by further cpuset and node constraints.
	 */
	gfp &= ~GFP_CONSTRAINT_MASK;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (nr_pages > 1) {
		gfp_t huge_gfp = vma_thp_gfp_mask(vma);

		gfp = limit_gfp_mask(huge_gfp, gfp);
	}
#endif

	new = shmem_alloc_folio(gfp, folio_order(old), info, index);
	if (!new)
		return -ENOMEM;

	folio_ref_add(new, nr_pages);
	folio_copy(new, old);
	flush_dcache_folio(new);

	__folio_set_locked(new);
	__folio_set_swapbacked(new);
	folio_mark_uptodate(new);
	new->swap = entry;
	folio_set_swapcache(new);

	ci = swap_cluster_get_and_lock_irq(old);
	__swap_cache_replace_folio(ci, old, new);
	mem_cgroup_replace_folio(old, new);
	shmem_update_stats(new, nr_pages);
	shmem_update_stats(old, -nr_pages);
	swap_cluster_unlock_irq(ci);
	folio_add_lru(new);
	*foliop = new;

	folio_clear_swapcache(old);
	old->private = NULL;

	folio_unlock(old);
	/*
	 * The old folio has been removed from the swap cache: drop its
	 * 'nr_pages' references, as well as the one temporary reference
	 * obtained from the swap cache.
	 */
	folio_put_refs(old, nr_pages + 1);
	return error;
}

static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
					 struct folio *folio, swp_entry_t swap)
{
	struct address_space *mapping = inode->i_mapping;
	swp_entry_t swapin_error;
	void *old;
	int nr_pages;

	swapin_error = make_poisoned_swp_entry();
	old = xa_cmpxchg_irq(&mapping->i_pages, index,
			     swp_to_radix_entry(swap),
			     swp_to_radix_entry(swapin_error), 0);
	if (old != swp_to_radix_entry(swap))
		return;

	nr_pages = folio_nr_pages(folio);
	folio_wait_writeback(folio);
	folio_put_swap(folio, NULL);
	swap_cache_del_folio(folio);
	/*
	 * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks
	 * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
	 * in shmem_evict_inode().
	 */
	shmem_recalc_inode(inode, -nr_pages, -nr_pages);
}

static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
				   swp_entry_t swap, gfp_t gfp)
{
	struct address_space *mapping = inode->i_mapping;
	XA_STATE_ORDER(xas, &mapping->i_pages, index, 0);
	int split_order = 0;
	int i;

	/* Convert user data gfp flags to xarray node gfp flags */
	gfp &= GFP_RECLAIM_MASK;

	for (;;) {
		void *old = NULL;
		int cur_order;
		pgoff_t swap_index;

		xas_lock_irq(&xas);

		old = xas_load(&xas);
		if (!xa_is_value(old) || swp_to_radix_entry(swap) != old) {
			xas_set_err(&xas, -EEXIST);
			goto unlock;
		}

		cur_order = xas_get_order(&xas);
		if (!cur_order)
			goto unlock;

		/* Try to split large swap entry in pagecache */
		swap_index = round_down(index, 1 << cur_order);
		split_order = xas_try_split_min_order(cur_order);

		while (cur_order > 0) {
			pgoff_t aligned_index = round_down(index, 1 << cur_order);
			pgoff_t swap_offset = aligned_index - swap_index;

			xas_set_order(&xas, index, split_order);
			xas_try_split(&xas, old, cur_order);
			if (xas_error(&xas))
				goto unlock;

			/*
			 * Re-set the swap entry after splitting, and the swap
			 * offset of the original large entry must be continuous.
			 */
			for (i = 0; i < 1 << cur_order; i += (1 << split_order)) {
				swp_entry_t tmp;

				tmp = swp_entry(swp_type(swap),
					       swp_offset(swap) + swap_offset + i);
				__xa_store(&mapping->i_pages, aligned_index + i,
					   swp_to_radix_entry(tmp), 0);
			}
			cur_order = split_order;
			split_order = xas_try_split_min_order(split_order);
		}

unlock:
		xas_unlock_irq(&xas);

		if (!xas_nomem(&xas, gfp))
			break;
	}

	if (xas_error(&xas))
		return xas_error(&xas);

	return 0;
}

/*
 * Swap in the folio pointed to by *foliop.
 * The caller has to make sure that *foliop contains a valid swapped folio.
 * Returns 0 and the folio in *foliop on success. On failure, returns the
 * error code and NULL in *foliop.
 */
static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
			      struct folio **foliop, enum sgp_type sgp,
			      gfp_t gfp, struct vm_area_struct *vma,
			      vm_fault_t *fault_type)
{
	struct address_space *mapping = inode->i_mapping;
	struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
	struct shmem_inode_info *info = SHMEM_I(inode);
	swp_entry_t swap;
	softleaf_t index_entry;
	struct swap_info_struct *si;
	struct folio *folio = NULL;
	int error, nr_pages, order;
	pgoff_t offset;

	VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
	index_entry = radix_to_swp_entry(*foliop);
	swap = index_entry;
	*foliop = NULL;

	if (softleaf_is_poison_marker(index_entry))
		return -EIO;

	si = get_swap_device(index_entry);
	order = shmem_confirm_swap(mapping, index, index_entry);
	if (unlikely(!si)) {
		if (order < 0)
			return -EEXIST;
		else
			return -EINVAL;
	}
	if (unlikely(order < 0)) {
		put_swap_device(si);
		return -EEXIST;
	}

	/* index may point to the middle of a large entry, get the sub entry */
	if (order) {
		offset = index - round_down(index, 1 << order);
		swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
	}

	/* Look it up and read it in.. */
	folio = swap_cache_get_folio(swap);
	if (!folio) {
		if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
			/* Direct swapin skipping swap cache & readahead */
			folio = shmem_swap_alloc_folio(inode, vma, index,
						       index_entry, order, gfp);
			if (IS_ERR(folio)) {
				error = PTR_ERR(folio);
				folio = NULL;
				goto failed;
			}
		} else {
			/* Cached swapin only supports order 0 folio */
			folio = shmem_swapin_cluster(swap, gfp, info, index);
			if (!folio) {
				error = -ENOMEM;
				goto failed;
			}
		}
		if (fault_type) {
			*fault_type |= VM_FAULT_MAJOR;
			count_vm_event(PGMAJFAULT);
			count_memcg_event_mm(fault_mm, PGMAJFAULT);
		}
	} else {
		swap_update_readahead(folio, NULL, 0);
	}

	if (order > folio_order(folio)) {
		/*
		 * Swapin may get smaller folios due to various reasons:
		 * it may fall back to order 0 due to memory pressure or a race,
		 * or swap readahead may swap in order 0 folios into the swap
		 * cache asynchronously, while the shmem mapping can still store
		 * large swap entries. In such cases, we should split the
		 * large swap entry to prevent possible data corruption.
		 */
		error = shmem_split_large_entry(inode, index, index_entry, gfp);
		if (error)
			goto failed_nolock;
	}

	/*
	 * If the folio is large, round down swap and index by folio size.
	 * No matter what race occurs, the swap layer ensures we either get
	 * a valid folio that has its swap entry aligned by size, or a
	 * temporarily invalid one which we'll abort very soon and retry.
	 *
	 * shmem_add_to_page_cache ensures the whole range contains expected
	 * entries and prevents any corruption, so any race split is fine
	 * too, it will succeed as long as the entries are still there.
	 */
	nr_pages = folio_nr_pages(folio);
	if (nr_pages > 1) {
		swap.val = round_down(swap.val, nr_pages);
		index = round_down(index, nr_pages);
	}

	/*
	 * We have to do this with the folio locked to prevent races.
* The shmem_confirm_swap below only checks if the first swap * entry matches the folio, that's enough to ensure the folio * is not used outside of shmem, as shmem swap entries * and swap cache folios are never partially freed. */ folio_lock(folio); if (!folio_matches_swap_entry(folio, swap) || shmem_confirm_swap(mapping, index, swap) < 0) { error = -EEXIST; goto unlock; } if (!folio_test_uptodate(folio)) { error = -EIO; goto failed; } folio_wait_writeback(folio); /* * Some architectures may have to restore extra metadata to the * folio after reading from swap. */ arch_swap_restore(folio_swap(swap, folio), folio); if (shmem_should_replace_folio(folio, gfp)) { error = shmem_replace_folio(&folio, gfp, info, index, vma); if (error) goto failed; } error = shmem_add_to_page_cache(folio, mapping, index, swp_to_radix_entry(swap), gfp); if (error) goto failed; shmem_recalc_inode(inode, 0, -nr_pages); if (sgp == SGP_WRITE) folio_mark_accessed(folio); folio_put_swap(folio, NULL); swap_cache_del_folio(folio); folio_mark_dirty(folio); put_swap_device(si); *foliop = folio; return 0; failed: if (shmem_confirm_swap(mapping, index, swap) < 0) error = -EEXIST; if (error == -EIO) shmem_set_folio_swapin_error(inode, index, folio, swap); unlock: if (folio) folio_unlock(folio); failed_nolock: if (folio) folio_put(folio); put_swap_device(si); return error; } /* * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate * * If we allocate a new one we do not mark it dirty. That's up to the * vm. If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache. * * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL. */ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, loff_t write_end, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type) { struct vm_area_struct *vma = vmf ? vmf->vma : NULL; struct mm_struct *fault_mm; struct folio *folio; int error; bool alloced; unsigned long orders = 0; if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping))) return -EINVAL; if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; repeat: if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) return -EINVAL; alloced = false; fault_mm = vma ? vma->vm_mm : NULL; folio = filemap_get_entry(inode->i_mapping, index); if (folio && vma && userfaultfd_minor(vma)) { if (!xa_is_value(folio)) folio_put(folio); *fault_type = handle_userfault(vmf, VM_UFFD_MINOR); return 0; } if (xa_is_value(folio)) { error = shmem_swapin_folio(inode, index, &folio, sgp, gfp, vma, fault_type); if (error == -EEXIST) goto repeat; *foliop = folio; return error; } if (folio) { folio_lock(folio); /* Has the folio been truncated or swapped out? */ if (unlikely(folio->mapping != inode->i_mapping)) { folio_unlock(folio); folio_put(folio); goto repeat; } if (sgp == SGP_WRITE) folio_mark_accessed(folio); if (folio_test_uptodate(folio)) goto out; /* fallocated folio */ if (sgp != SGP_READ) goto clear; folio_unlock(folio); folio_put(folio); } /* * SGP_READ: succeed on hole, with NULL folio, letting caller zero. * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail. */ *foliop = NULL; if (sgp == SGP_READ) return 0; if (sgp == SGP_NOALLOC) return -ENOENT; /* * Fast cache lookup and swap lookup did not find it: allocate. 
*/ if (vma && userfaultfd_missing(vma)) { *fault_type = handle_userfault(vmf, VM_UFFD_MISSING); return 0; } /* Find hugepage orders that are allowed for anonymous shmem and tmpfs. */ orders = shmem_allowable_huge_orders(inode, vma, index, write_end, false); if (orders > 0) { gfp_t huge_gfp; huge_gfp = vma_thp_gfp_mask(vma); huge_gfp = limit_gfp_mask(huge_gfp, gfp); folio = shmem_alloc_and_add_folio(vmf, huge_gfp, inode, index, fault_mm, orders); if (!IS_ERR(folio)) { if (folio_test_pmd_mappable(folio)) count_vm_event(THP_FILE_ALLOC); count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_ALLOC); goto alloced; } if (PTR_ERR(folio) == -EEXIST) goto repeat; } folio = shmem_alloc_and_add_folio(vmf, gfp, inode, index, fault_mm, 0); if (IS_ERR(folio)) { error = PTR_ERR(folio); if (error == -EEXIST) goto repeat; folio = NULL; goto unlock; } alloced: alloced = true; if (folio_test_large(folio) && DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < folio_next_index(folio)) { struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct shmem_inode_info *info = SHMEM_I(inode); /* * Part of the large folio is beyond i_size: subject * to shrink under memory pressure. */ spin_lock(&sbinfo->shrinklist_lock); /* * _careful to defend against unlocked access to * ->shrink_list in shmem_unused_huge_shrink() */ if (list_empty_careful(&info->shrinklist)) { list_add_tail(&info->shrinklist, &sbinfo->shrinklist); sbinfo->shrinklist_len++; } spin_unlock(&sbinfo->shrinklist_lock); } if (sgp == SGP_WRITE) folio_set_referenced(folio); /* * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio. */ if (sgp == SGP_FALLOC) sgp = SGP_WRITE; clear: /* * Let SGP_WRITE caller clear ends if write does not fill folio; * but SGP_FALLOC on a folio fallocated earlier must initialize * it now, lest undo on failure cancel our earlier guarantee. */ if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) { long i, n = folio_nr_pages(folio); for (i = 0; i < n; i++) clear_highpage(folio_page(folio, i)); flush_dcache_folio(folio); folio_mark_uptodate(folio); } /* Perhaps the file has been truncated since we checked */ if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; goto unlock; } out: *foliop = folio; return 0; /* * Error recovery. */ unlock: if (alloced) filemap_remove_folio(folio); shmem_recalc_inode(inode, 0, 0); if (folio) { folio_unlock(folio); folio_put(folio); } return error; } /** * shmem_get_folio - find, and lock a shmem folio. * @inode: inode to search * @index: the page index. * @write_end: end of a write, could extend inode size * @foliop: pointer to the folio if found * @sgp: SGP_* flags to control behavior * * Looks up the page cache entry at @inode & @index. If a folio is * present, it is returned locked with an increased refcount. * * If the caller modifies data in the folio, it must call folio_mark_dirty() * before unlocking the folio to ensure that the folio is not reclaimed. * There is no need to reserve space before calling folio_mark_dirty(). * * When no folio is found, the behavior depends on @sgp: * - for SGP_READ, *@foliop is %NULL and 0 is returned * - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned * - for all other flags a new folio is allocated, inserted into the * page cache and returned locked in @foliop. * * Context: May sleep. * Return: 0 if successful, else a negative error code. 
*/ int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end, struct folio **foliop, enum sgp_type sgp) { return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp, mapping_gfp_mask(inode->i_mapping), NULL, NULL); } EXPORT_SYMBOL_GPL(shmem_get_folio); /* * This is like autoremove_wake_function, but it removes the wait queue * entry unconditionally - even if something else had already woken the * target. */ static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); list_del_init(&wait->entry); return ret; } /* * Trinity finds that probing a hole which tmpfs is punching can * prevent the hole-punch from ever completing: which in turn * locks writers out with its hold on i_rwsem. So refrain from * faulting pages into the hole while it's being punched. Although * shmem_undo_range() does remove the additions, it may be unable to * keep up, as each new page needs its own unmap_mapping_range() call, * and the i_mmap tree grows ever slower to scan if new vmas are added. * * It does not matter if we sometimes reach this check just before the * hole-punch begins, so that one fault then races with the punch: * we just need to make racing faults a rare case. * * The implementation below would be much simpler if we just used a * standard mutex or completion: but we cannot take i_rwsem in fault, * and bloating every shmem inode for this unlikely case would be sad. */ static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode) { struct shmem_falloc *shmem_falloc; struct file *fpin = NULL; vm_fault_t ret = 0; spin_lock(&inode->i_lock); shmem_falloc = inode->i_private; if (shmem_falloc && shmem_falloc->waitq && vmf->pgoff >= shmem_falloc->start && vmf->pgoff < shmem_falloc->next) { wait_queue_head_t *shmem_falloc_waitq; DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); ret = VM_FAULT_NOPAGE; fpin = maybe_unlock_mmap_for_io(vmf, NULL); shmem_falloc_waitq = shmem_falloc->waitq; prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); schedule(); /* * shmem_falloc_waitq points into the shmem_fallocate() * stack of the hole-punching task: shmem_falloc_waitq * is usually invalid by the time we reach here, but * finish_wait() does not dereference it in that case; * though i_lock needed lest racing with wake_up_all(). */ spin_lock(&inode->i_lock); finish_wait(shmem_falloc_waitq, &shmem_fault_wait); } spin_unlock(&inode->i_lock); if (fpin) { fput(fpin); ret = VM_FAULT_RETRY; } return ret; } static vm_fault_t shmem_fault(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); struct folio *folio = NULL; vm_fault_t ret = 0; int err; /* * Trinity finds that probing a hole which tmpfs is punching can * prevent the hole-punch from ever completing: noted in i_private. 
*/ if (unlikely(inode->i_private)) { ret = shmem_falloc_wait(vmf, inode); if (ret) return ret; } WARN_ON_ONCE(vmf->page != NULL); err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE, gfp, vmf, &ret); if (err) return vmf_error(err); if (folio) { vmf->page = folio_file_page(folio, vmf->pgoff); ret |= VM_FAULT_LOCKED; } return ret; } unsigned long shmem_get_unmapped_area(struct file *file, unsigned long uaddr, unsigned long len, unsigned long pgoff, unsigned long flags) { unsigned long addr; unsigned long offset; unsigned long inflated_len; unsigned long inflated_addr; unsigned long inflated_offset; unsigned long hpage_size; if (len > TASK_SIZE) return -ENOMEM; addr = mm_get_unmapped_area(file, uaddr, len, pgoff, flags); if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return addr; if (IS_ERR_VALUE(addr)) return addr; if (addr & ~PAGE_MASK) return addr; if (addr > TASK_SIZE - len) return addr; if (shmem_huge == SHMEM_HUGE_DENY) return addr; if (flags & MAP_FIXED) return addr; /* * Our priority is to support MAP_SHARED mapped hugely; * and support MAP_PRIVATE mapped hugely too, until it is COWed. * But if caller specified an address hint and we allocated area there * successfully, respect that as before. */ if (uaddr == addr) return addr; hpage_size = HPAGE_PMD_SIZE; if (shmem_huge != SHMEM_HUGE_FORCE) { struct super_block *sb; unsigned long __maybe_unused hpage_orders; int order = 0; if (file) { VM_BUG_ON(file->f_op != &shmem_file_operations); sb = file_inode(file)->i_sb; } else { /* * Called directly from mm/mmap.c, or drivers/char/mem.c * for "/dev/zero", to create a shared anonymous object. */ if (IS_ERR(shm_mnt)) return addr; sb = shm_mnt->mnt_sb; /* * Find the highest mTHP order used for anonymous shmem to * provide a suitable alignment address. 
*/ #ifdef CONFIG_TRANSPARENT_HUGEPAGE hpage_orders = READ_ONCE(huge_shmem_orders_always); hpage_orders |= READ_ONCE(huge_shmem_orders_within_size); hpage_orders |= READ_ONCE(huge_shmem_orders_madvise); if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER) hpage_orders |= READ_ONCE(huge_shmem_orders_inherit); if (hpage_orders > 0) { order = highest_order(hpage_orders); hpage_size = PAGE_SIZE << order; } #endif } if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER && !order) return addr; } if (len < hpage_size) return addr; offset = (pgoff << PAGE_SHIFT) & (hpage_size - 1); if (offset && offset + len < 2 * hpage_size) return addr; if ((addr & (hpage_size - 1)) == offset) return addr; inflated_len = len + hpage_size - PAGE_SIZE; if (inflated_len > TASK_SIZE) return addr; if (inflated_len < len) return addr; inflated_addr = mm_get_unmapped_area(NULL, uaddr, inflated_len, 0, flags); if (IS_ERR_VALUE(inflated_addr)) return addr; if (inflated_addr & ~PAGE_MASK) return addr; inflated_offset = inflated_addr & (hpage_size - 1); inflated_addr += offset - inflated_offset; if (inflated_offset > offset) inflated_addr += hpage_size; if (inflated_addr > TASK_SIZE - len) return addr; return inflated_addr; } #ifdef CONFIG_NUMA static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) { struct inode *inode = file_inode(vma->vm_file); return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); } static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx) { struct inode *inode = file_inode(vma->vm_file); pgoff_t index; /* * Bias interleave by inode number to distribute better across nodes; * but this interface is independent of which page order is used, so * supplies only that bias, letting caller apply the offset (adjusted * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()). */ *ilx = inode->i_ino; index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); } static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, pgoff_t index, unsigned int order, pgoff_t *ilx) { struct mempolicy *mpol; /* Bias interleave by inode number to distribute better across nodes */ *ilx = info->vfs_inode.i_ino + (index >> order); mpol = mpol_shared_policy_lookup(&info->policy, index); return mpol ? mpol : get_task_policy(current); } #else static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, pgoff_t index, unsigned int order, pgoff_t *ilx) { *ilx = 0; return NULL; } #endif /* CONFIG_NUMA */ int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) { struct inode *inode = file_inode(file); struct shmem_inode_info *info = SHMEM_I(inode); int retval = -ENOMEM; /* * What serializes the accesses to info->flags? * ipc_lock_object() when called from shmctl_do_lock(), * no serialization needed when called from shm_destroy(). 
 */
	if (lock && !(info->flags & SHMEM_F_LOCKED)) {
		if (!user_shm_lock(inode->i_size, ucounts))
			goto out_nomem;
		info->flags |= SHMEM_F_LOCKED;
		mapping_set_unevictable(file->f_mapping);
	}
	if (!lock && (info->flags & SHMEM_F_LOCKED) && ucounts) {
		user_shm_unlock(inode->i_size, ucounts);
		info->flags &= ~SHMEM_F_LOCKED;
		mapping_clear_unevictable(file->f_mapping);
	}
	retval = 0;

out_nomem:
	return retval;
}

static int shmem_mmap_prepare(struct vm_area_desc *desc)
{
	struct file *file = desc->file;
	struct inode *inode = file_inode(file);

	file_accessed(file);
	/* This is anonymous shared memory if it is unlinked at the time of mmap */
	if (inode->i_nlink)
		desc->vm_ops = &shmem_vm_ops;
	else
		desc->vm_ops = &shmem_anon_vm_ops;
	return 0;
}

static int shmem_file_open(struct inode *inode, struct file *file)
{
	file->f_mode |= FMODE_CAN_ODIRECT;
	return generic_file_open(inode, file);
}

#ifdef CONFIG_TMPFS_XATTR
static int shmem_initxattrs(struct inode *, const struct xattr *, void *);

#if IS_ENABLED(CONFIG_UNICODE)
/*
 * shmem_inode_casefold_flags - Deal with casefold file attribute flag
 *
 * The casefold file attribute needs some special checks. It can only be added
 * to an empty dir, and can't be removed from a non-empty dir.
 */
static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags,
				      struct dentry *dentry, unsigned int *i_flags)
{
	unsigned int old = inode->i_flags;
	struct super_block *sb = inode->i_sb;

	if (fsflags & FS_CASEFOLD_FL) {
		if (!(old & S_CASEFOLD)) {
			if (!sb->s_encoding)
				return -EOPNOTSUPP;

			if (!S_ISDIR(inode->i_mode))
				return -ENOTDIR;

			if (dentry && !simple_empty(dentry))
				return -ENOTEMPTY;
		}

		*i_flags = *i_flags | S_CASEFOLD;
	} else if (old & S_CASEFOLD) {
		if (dentry && !simple_empty(dentry))
			return -ENOTEMPTY;
	}

	return 0;
}
#else
static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags,
				      struct dentry *dentry, unsigned int *i_flags)
{
	if (fsflags & FS_CASEFOLD_FL)
		return -EOPNOTSUPP;

	return 0;
}
#endif

/*
 * chattr's fsflags are unrelated to extended attributes,
 * but tmpfs has chosen to enable them under the same config option.
 */
static int shmem_set_inode_flags(struct inode *inode, unsigned int fsflags,
				 struct dentry *dentry)
{
	unsigned int i_flags = 0;
	int ret;

	ret = shmem_inode_casefold_flags(inode, fsflags, dentry, &i_flags);
	if (ret)
		return ret;

	if (fsflags & FS_NOATIME_FL)
		i_flags |= S_NOATIME;
	if (fsflags & FS_APPEND_FL)
		i_flags |= S_APPEND;
	if (fsflags & FS_IMMUTABLE_FL)
		i_flags |= S_IMMUTABLE;
	/*
	 * But FS_NODUMP_FL does not require any action in i_flags.
*/ inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE | S_CASEFOLD); return 0; } #else static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry) { } #define shmem_initxattrs NULL #endif static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode) { return &SHMEM_I(inode)->dir_offsets; } static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, vma_flags_t flags) { struct inode *inode; struct shmem_inode_info *info; struct shmem_sb_info *sbinfo = SHMEM_SB(sb); ino_t ino; int err; err = shmem_reserve_inode(sb, &ino); if (err) return ERR_PTR(err); inode = new_inode(sb); if (!inode) { shmem_free_inode(sb, 0); return ERR_PTR(-ENOSPC); } inode->i_ino = ino; inode_init_owner(idmap, inode, dir, mode); inode->i_blocks = 0; simple_inode_init_ts(inode); inode->i_generation = get_random_u32(); info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); spin_lock_init(&info->lock); atomic_set(&info->stop_eviction, 0); info->seals = F_SEAL_SEAL; info->flags = vma_flags_test(&flags, VMA_NORESERVE_BIT) ? SHMEM_F_NORESERVE : 0; info->i_crtime = inode_get_mtime(inode); info->fsflags = (dir == NULL) ? 0 : SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; if (info->fsflags) shmem_set_inode_flags(inode, info->fsflags, NULL); INIT_LIST_HEAD(&info->shrinklist); INIT_LIST_HEAD(&info->swaplist); cache_no_acl(inode); if (sbinfo->noswap) mapping_set_unevictable(inode->i_mapping); /* Don't consider 'deny' for emergencies and 'force' for testing */ if (sbinfo->huge) mapping_set_large_folios(inode->i_mapping); switch (mode & S_IFMT) { default: inode->i_op = &shmem_special_inode_operations; init_special_inode(inode, mode, dev); break; case S_IFREG: inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_inode_operations; inode->i_fop = &shmem_file_operations; mpol_shared_policy_init(&info->policy, shmem_get_sbmpol(sbinfo)); break; case S_IFDIR: inc_nlink(inode); /* Some things misbehave if size == 0 on a directory */ inode->i_size = 2 * BOGO_DIRENT_SIZE; inode->i_op = &shmem_dir_inode_operations; inode->i_fop = &simple_offset_dir_operations; simple_offset_init(shmem_get_offset_ctx(inode)); break; case S_IFLNK: /* * Must not load anything in the rbtree, * mpol_free_shared_policy will not be called. 
*/ mpol_shared_policy_init(&info->policy, NULL); break; } lockdep_annotate_inode_mutex_key(inode); return inode; } #ifdef CONFIG_TMPFS_QUOTA static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, vma_flags_t flags) { int err; struct inode *inode; inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags); if (IS_ERR(inode)) return inode; err = dquot_initialize(inode); if (err) goto errout; err = dquot_alloc_inode(inode); if (err) { dquot_drop(inode); goto errout; } return inode; errout: inode->i_flags |= S_NOQUOTA; iput(inode); return ERR_PTR(err); } #else static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, vma_flags_t flags) { return __shmem_get_inode(idmap, sb, dir, mode, dev, flags); } #endif /* CONFIG_TMPFS_QUOTA */ #ifdef CONFIG_USERFAULTFD static struct folio *shmem_mfill_folio_alloc(struct vm_area_struct *vma, unsigned long addr) { struct inode *inode = file_inode(vma->vm_file); struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t pgoff = linear_page_index(vma, addr); gfp_t gfp = mapping_gfp_mask(mapping); struct folio *folio; if (unlikely(pgoff >= DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) return NULL; folio = shmem_alloc_folio(gfp, 0, info, pgoff); if (!folio) return NULL; if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) { folio_put(folio); return NULL; } return folio; } static int shmem_mfill_filemap_add(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { struct inode *inode = file_inode(vma->vm_file); struct address_space *mapping = inode->i_mapping; pgoff_t pgoff = linear_page_index(vma, addr); gfp_t gfp = mapping_gfp_mask(mapping); int err; __folio_set_locked(folio); __folio_set_swapbacked(folio); err = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp); if (err) goto err_unlock; if (shmem_inode_acct_blocks(inode, 1)) { err = -ENOMEM; goto err_delete_from_cache; } folio_add_lru(folio); shmem_recalc_inode(inode, 1, 0); return 0; err_delete_from_cache: filemap_remove_folio(folio); err_unlock: folio_unlock(folio); return err; } static void shmem_mfill_filemap_remove(struct folio *folio, struct vm_area_struct *vma) { struct inode *inode = file_inode(vma->vm_file); filemap_remove_folio(folio); shmem_recalc_inode(inode, 0, 0); folio_unlock(folio); } static struct folio *shmem_get_folio_noalloc(struct inode *inode, pgoff_t pgoff) { struct folio *folio; int err; err = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC); if (err) return ERR_PTR(err); return folio; } static bool shmem_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags) { return true; } static const struct vm_uffd_ops shmem_uffd_ops = { .can_userfault = shmem_can_userfault, .get_folio_noalloc = shmem_get_folio_noalloc, .alloc_folio = shmem_mfill_folio_alloc, .filemap_add = shmem_mfill_filemap_add, .filemap_remove = shmem_mfill_filemap_remove, }; #endif /* CONFIG_USERFAULTFD */ #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; static int shmem_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t index = pos >> PAGE_SHIFT; struct folio *folio; int ret = 0; /* i_rwsem is held by caller */ if 
(unlikely(info->seals & (F_SEAL_GROW | F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) { if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) return -EPERM; if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) return -EPERM; } if (unlikely((info->flags & SHMEM_F_MAPPING_FROZEN) && pos + len > inode->i_size)) return -EPERM; ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE); if (ret) return ret; if (folio_contain_hwpoisoned_page(folio)) { folio_unlock(folio); folio_put(folio); return -EIO; } *foliop = folio; return 0; } static int shmem_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; if (pos + copied > inode->i_size) i_size_write(inode, pos + copied); if (!folio_test_uptodate(folio)) { if (copied < folio_size(folio)) { size_t from = offset_in_folio(folio, pos); folio_zero_segments(folio, 0, from, from + copied, folio_size(folio)); } folio_mark_uptodate(folio); } folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); return copied; } static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; pgoff_t index; unsigned long offset; int error = 0; ssize_t retval = 0; for (;;) { struct folio *folio = NULL; struct page *page = NULL; unsigned long nr, ret; loff_t end_offset, i_size = i_size_read(inode); bool fallback_page_copy = false; size_t fsize; if (unlikely(iocb->ki_pos >= i_size)) break; index = iocb->ki_pos >> PAGE_SHIFT; error = shmem_get_folio(inode, index, 0, &folio, SGP_READ); if (error) { if (error == -EINVAL) error = 0; break; } if (folio) { folio_unlock(folio); page = folio_file_page(folio, index); if (PageHWPoison(page)) { folio_put(folio); error = -EIO; break; } if (folio_test_large(folio) && folio_test_has_hwpoisoned(folio)) fallback_page_copy = true; } /* * We must evaluate after, since reads (unlike writes) * are called without i_rwsem protection against truncate */ i_size = i_size_read(inode); if (unlikely(iocb->ki_pos >= i_size)) { if (folio) folio_put(folio); break; } end_offset = min_t(loff_t, i_size, iocb->ki_pos + to->count); if (folio && likely(!fallback_page_copy)) fsize = folio_size(folio); else fsize = PAGE_SIZE; offset = iocb->ki_pos & (fsize - 1); nr = min_t(loff_t, end_offset - iocb->ki_pos, fsize - offset); if (folio) { /* * If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. */ if (mapping_writably_mapped(mapping)) { if (likely(!fallback_page_copy)) flush_dcache_folio(folio); else flush_dcache_page(page); } /* * Mark the folio accessed if we read the beginning. */ if (!offset) folio_mark_accessed(folio); /* * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... */ if (likely(!fallback_page_copy)) ret = copy_folio_to_iter(folio, offset, nr, to); else ret = copy_page_to_iter(page, offset, nr, to); folio_put(folio); } else if (user_backed_iter(to)) { /* * Copy to user tends to be so well optimized, but * clear_user() not so much, that it is noticeably * faster to copy the zero page instead of clearing. */ ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to); } else { /* * But submitting the same page twice in a row to * splice() - or others? - can result in confusion: * so don't attempt that optimization on pipes etc. 
*/ ret = iov_iter_zero(nr, to); } retval += ret; iocb->ki_pos += ret; if (!iov_iter_count(to)) break; if (ret < nr) { error = -EFAULT; break; } cond_resched(); } file_accessed(file); return retval ? retval : error; } static ssize_t shmem_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; ssize_t ret; inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret <= 0) goto unlock; ret = file_remove_privs(file); if (ret) goto unlock; ret = file_update_time(file); if (ret) goto unlock; ret = generic_perform_write(iocb, from); unlock: inode_unlock(inode); return ret; } static bool zero_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { return true; } static void zero_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { } static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { return false; } static const struct pipe_buf_operations zero_pipe_buf_ops = { .release = zero_pipe_buf_release, .try_steal = zero_pipe_buf_try_steal, .get = zero_pipe_buf_get, }; static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe, loff_t fpos, size_t size) { size_t offset = fpos & ~PAGE_MASK; size = min_t(size_t, size, PAGE_SIZE - offset); if (!pipe_is_full(pipe)) { struct pipe_buffer *buf = pipe_head_buf(pipe); *buf = (struct pipe_buffer) { .ops = &zero_pipe_buf_ops, .page = ZERO_PAGE(0), .offset = offset, .len = size, }; pipe->head++; } return size; } static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct inode *inode = file_inode(in); struct address_space *mapping = inode->i_mapping; struct folio *folio = NULL; size_t total_spliced = 0, used, npages, n, part; loff_t isize; int error = 0; /* Work out how much data we can actually add into the pipe */ used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); do { bool fallback_page_splice = false; struct page *page = NULL; pgoff_t index; size_t size; if (*ppos >= i_size_read(inode)) break; index = *ppos >> PAGE_SHIFT; error = shmem_get_folio(inode, index, 0, &folio, SGP_READ); if (error) { if (error == -EINVAL) error = 0; break; } if (folio) { folio_unlock(folio); page = folio_file_page(folio, index); if (PageHWPoison(page)) { error = -EIO; break; } if (folio_test_large(folio) && folio_test_has_hwpoisoned(folio)) fallback_page_splice = true; } /* * i_size must be checked after we know the pages are Uptodate. * * Checking i_size after the check allows us to calculate * the correct value for "nr", which means the zero-filled * part of the page is not copied back to userspace (unless * another truncate extends the file - this is desired though). */ isize = i_size_read(inode); if (unlikely(*ppos >= isize)) break; /* * Fallback to PAGE_SIZE splice if the large folio has hwpoisoned * pages. */ size = len; if (unlikely(fallback_page_splice)) { size_t offset = *ppos & ~PAGE_MASK; size = umin(size, PAGE_SIZE - offset); } part = min_t(loff_t, isize - *ppos, size); if (folio) { /* * If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. 
*/ if (mapping_writably_mapped(mapping)) { if (likely(!fallback_page_splice)) flush_dcache_folio(folio); else flush_dcache_page(page); } folio_mark_accessed(folio); /* * Ok, we have the page, and it's up-to-date, so we can * now splice it into the pipe. */ n = splice_folio_into_pipe(pipe, folio, *ppos, part); folio_put(folio); folio = NULL; } else { n = splice_zeropage_into_pipe(pipe, *ppos, part); } if (!n) break; len -= n; total_spliced += n; *ppos += n; in->f_ra.prev_pos = *ppos; if (pipe_is_full(pipe)) break; cond_resched(); } while (len); if (folio) folio_put(folio); file_accessed(in); return total_spliced ? total_spliced : error; } static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; if (whence != SEEK_DATA && whence != SEEK_HOLE) return generic_file_llseek_size(file, offset, whence, MAX_LFS_FILESIZE, i_size_read(inode)); if (offset < 0) return -ENXIO; inode_lock(inode); /* We're holding i_rwsem so we can access i_size directly */ offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence); if (offset >= 0) offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); inode_unlock(inode); return offset; } static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_falloc shmem_falloc; pgoff_t start, index, end, undo_fallocend; int error; if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; inode_lock(inode); if (info->flags & SHMEM_F_MAPPING_FROZEN) { error = -EPERM; goto out; } if (mode & FALLOC_FL_PUNCH_HOLE) { struct address_space *mapping = file->f_mapping; loff_t unmap_start = round_up(offset, PAGE_SIZE); loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); /* protected by i_rwsem */ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { error = -EPERM; goto out; } shmem_falloc.waitq = &shmem_falloc_waitq; shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT; shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; spin_lock(&inode->i_lock); inode->i_private = &shmem_falloc; spin_unlock(&inode->i_lock); if ((u64)unmap_end > (u64)unmap_start) unmap_mapping_range(mapping, unmap_start, 1 + unmap_end - unmap_start, 0); shmem_truncate_range(inode, offset, offset + len - 1); /* No need to unmap again: hole-punching leaves COWed pages */ spin_lock(&inode->i_lock); inode->i_private = NULL; wake_up_all(&shmem_falloc_waitq); WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head)); spin_unlock(&inode->i_lock); error = 0; goto out; } /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ error = inode_newsize_ok(inode, offset + len); if (error) goto out; if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { error = -EPERM; goto out; } start = offset >> PAGE_SHIFT; end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT; /* Try to avoid a swapstorm if len is impossible to satisfy */ if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { error = -ENOSPC; goto out; } shmem_falloc.waitq = NULL; shmem_falloc.start = start; shmem_falloc.next = start; shmem_falloc.nr_falloced = 0; shmem_falloc.nr_unswapped = 0; spin_lock(&inode->i_lock); inode->i_private = &shmem_falloc; spin_unlock(&inode->i_lock); /* * info->fallocend is only relevant when huge pages might be * involved: to prevent split_huge_page() freeing 
fallocated * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size. */ undo_fallocend = info->fallocend; if (info->fallocend < end) info->fallocend = end; for (index = start; index < end; ) { struct folio *folio; /* * Check for fatal signal so that we abort early in OOM * situations. We don't want to abort in case of non-fatal * signals as large fallocate can take noticeable time and * e.g. periodic timers may result in fallocate constantly * restarting. */ if (fatal_signal_pending(current)) error = -EINTR; else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) error = -ENOMEM; else error = shmem_get_folio(inode, index, offset + len, &folio, SGP_FALLOC); if (error) { info->fallocend = undo_fallocend; /* Remove the !uptodate folios we added */ if (index > start) { shmem_undo_range(inode, (loff_t)start << PAGE_SHIFT, ((loff_t)index << PAGE_SHIFT) - 1, true); } goto undone; } /* * Here is a more important optimization than it appears: * a second SGP_FALLOC on the same large folio will clear it, * making it uptodate and un-undoable if we fail later. */ index = folio_next_index(folio); /* Beware 32-bit wraparound */ if (!index) index--; /* * Inform shmem_writeout() how far we have reached. * No need for lock or barrier: we have the page lock. */ if (!folio_test_uptodate(folio)) shmem_falloc.nr_falloced += index - shmem_falloc.next; shmem_falloc.next = index; /* * If !uptodate, leave it that way so that freeable folios * can be recognized if we need to rollback on error later. * But mark it dirty so that memory pressure will swap rather * than free the folios we are allocating (and SGP_CACHE folios * might still be clean: we now need to mark those dirty too). */ folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); cond_resched(); } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) i_size_write(inode, offset + len); undone: spin_lock(&inode->i_lock); inode->i_private = NULL; spin_unlock(&inode->i_lock); out: if (!error) file_modified(file); inode_unlock(inode); return error; } static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) { struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); buf->f_type = TMPFS_MAGIC; buf->f_bsize = PAGE_SIZE; buf->f_namelen = NAME_MAX; if (sbinfo->max_blocks) { buf->f_blocks = sbinfo->max_blocks; buf->f_bavail = buf->f_bfree = sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks); } if (sbinfo->max_inodes) { buf->f_files = sbinfo->max_inodes; buf->f_ffree = sbinfo->free_ispace / BOGO_INODE_SIZE; } /* else leave those fields 0 like simple_statfs */ buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b); return 0; } /* * File creation. Allocate an inode, and we're done.. 
*/ static int shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; int error; if (!generic_ci_validate_strict_name(dir, &dentry->d_name)) return -EINVAL; inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(inode)) return PTR_ERR(inode); error = simple_acl_create(dir, inode); if (error) goto out_iput; error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); if (error && error != -EOPNOTSUPP) goto out_iput; error = simple_offset_add(shmem_get_offset_ctx(dir), dentry); if (error) goto out_iput; dir->i_size += BOGO_DIRENT_SIZE; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_inc_iversion(dir); d_make_persistent(dentry, inode); return error; out_iput: iput(inode); return error; } static int shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode; int error; inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(inode)) { error = PTR_ERR(inode); goto err_out; } error = security_inode_init_security(inode, dir, NULL, shmem_initxattrs, NULL); if (error && error != -EOPNOTSUPP) goto out_iput; error = simple_acl_create(dir, inode); if (error) goto out_iput; d_tmpfile(file, inode); err_out: return finish_open_simple(file, error); out_iput: iput(inode); return error; } static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int error; error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); if (error) return ERR_PTR(error); inc_nlink(dir); return NULL; } static int shmem_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0); } /* * Link a file.. */ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); int ret; /* * No ordinary (disk based) filesystem counts links as inodes; * but each new link needs a new dentry, pinning lowmem, and * tmpfs dentries cannot be pruned until they are unlinked. * But if an O_TMPFILE file is linked into the tmpfs, the * first link must skip that, to get the accounting right. 
*/ if (inode->i_nlink) { ret = shmem_reserve_inode(inode->i_sb, NULL); if (ret) return ret; } ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry); if (ret) { if (inode->i_nlink) shmem_free_inode(inode->i_sb, 0); return ret; } dir->i_size += BOGO_DIRENT_SIZE; inode_inc_iversion(dir); return simple_link(old_dentry, dir, dentry); } static int shmem_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) shmem_free_inode(inode->i_sb, 0); simple_offset_remove(shmem_get_offset_ctx(dir), dentry); dir->i_size -= BOGO_DIRENT_SIZE; inode_inc_iversion(dir); simple_unlink(dir, dentry); /* * For now, VFS can't deal with case-insensitive negative dentries, so * we invalidate them */ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) d_invalidate(dentry); return 0; } static int shmem_rmdir(struct inode *dir, struct dentry *dentry) { if (!simple_empty(dentry)) return -ENOTEMPTY; drop_nlink(d_inode(dentry)); drop_nlink(dir); return shmem_unlink(dir, dentry); } static int shmem_whiteout(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry) { struct dentry *whiteout; int error; whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name); if (!whiteout) return -ENOMEM; error = shmem_mknod(idmap, old_dir, whiteout, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); dput(whiteout); return error; } /* * The VFS layer already does all the dentry stuff for rename, * we just have to decrement the usage count for the target if * it exists so that the VFS layer correctly free's it when it * gets overwritten. */ static int shmem_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode *inode = d_inode(old_dentry); int they_are_dirs = S_ISDIR(inode->i_mode); bool had_offset = false; int error; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags & RENAME_EXCHANGE) return simple_offset_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); if (!simple_empty(new_dentry)) return -ENOTEMPTY; error = simple_offset_add(shmem_get_offset_ctx(new_dir), new_dentry); if (error == -EBUSY) had_offset = true; else if (unlikely(error)) return error; if (flags & RENAME_WHITEOUT) { error = shmem_whiteout(idmap, old_dir, old_dentry); if (error) { if (!had_offset) simple_offset_remove(shmem_get_offset_ctx(new_dir), new_dentry); return error; } } simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry); if (d_really_is_positive(new_dentry)) { (void) shmem_unlink(new_dir, new_dentry); if (they_are_dirs) { drop_nlink(d_inode(new_dentry)); drop_nlink(old_dir); } } else if (they_are_dirs) { drop_nlink(old_dir); inc_nlink(new_dir); } old_dir->i_size -= BOGO_DIRENT_SIZE; new_dir->i_size += BOGO_DIRENT_SIZE; simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); return 0; } static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { int error; int len; struct inode *inode; struct folio *folio; char *link; len = strlen(symname) + 1; if (len > PAGE_SIZE) return -ENAMETOOLONG; inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0, mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(inode)) return PTR_ERR(inode); error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); if (error && error != -EOPNOTSUPP) goto out_iput; error 
= simple_offset_add(shmem_get_offset_ctx(dir), dentry); if (error) goto out_iput; inode->i_size = len-1; if (len <= SHORT_SYMLINK_LEN) { link = kmemdup(symname, len, GFP_KERNEL); if (!link) { error = -ENOMEM; goto out_remove_offset; } inode->i_op = &shmem_short_symlink_operations; inode_set_cached_link(inode, link, len - 1); } else { inode_nohighmem(inode); inode->i_mapping->a_ops = &shmem_aops; error = shmem_get_folio(inode, 0, 0, &folio, SGP_WRITE); if (error) goto out_remove_offset; inode->i_op = &shmem_symlink_inode_operations; memcpy(folio_address(folio), symname, len); folio_mark_uptodate(folio); folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); } dir->i_size += BOGO_DIRENT_SIZE; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_inc_iversion(dir); d_make_persistent(dentry, inode); return 0; out_remove_offset: simple_offset_remove(shmem_get_offset_ctx(dir), dentry); out_iput: iput(inode); return error; } static void shmem_put_link(void *arg) { folio_mark_accessed(arg); folio_put(arg); } static const char *shmem_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct folio *folio = NULL; int error; if (!dentry) { folio = filemap_get_folio(inode->i_mapping, 0); if (IS_ERR(folio)) return ERR_PTR(-ECHILD); if (PageHWPoison(folio_page(folio, 0)) || !folio_test_uptodate(folio)) { folio_put(folio); return ERR_PTR(-ECHILD); } } else { error = shmem_get_folio(inode, 0, 0, &folio, SGP_READ); if (error) return ERR_PTR(error); if (!folio) return ERR_PTR(-ECHILD); if (PageHWPoison(folio_page(folio, 0))) { folio_unlock(folio); folio_put(folio); return ERR_PTR(-ECHILD); } folio_unlock(folio); } set_delayed_call(done, shmem_put_link, folio); return folio_address(folio); } #ifdef CONFIG_TMPFS_XATTR static int shmem_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE); return 0; } static int shmem_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); int ret, flags; if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE) return -EOPNOTSUPP; flags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | (fa->flags & SHMEM_FL_USER_MODIFIABLE); ret = shmem_set_inode_flags(inode, flags, dentry); if (ret) return ret; info->fsflags = flags; inode_set_ctime_current(inode); inode_inc_iversion(inode); return 0; } /* * Superblocks without xattr inode operations may get some security.* xattr * support from the LSM "for free". As soon as we have any other xattrs * like ACLs, we also need to implement the security.* handlers at * filesystem level, though. */ /* * Callback for security_inode_init_security() for acquiring xattrs. 
*/ static int shmem_initxattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_info) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); const struct xattr *xattr; size_t ispace = 0; size_t len; CLASS(simple_xattrs, xattrs)(); if (IS_ERR(xattrs)) return PTR_ERR(xattrs); if (sbinfo->max_inodes) { for (xattr = xattr_array; xattr->name != NULL; xattr++) { ispace += simple_xattr_space(xattr->name, xattr->value_len + XATTR_SECURITY_PREFIX_LEN); } if (ispace) { raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->free_ispace < ispace) ispace = 0; else sbinfo->free_ispace -= ispace; raw_spin_unlock(&sbinfo->stat_lock); if (!ispace) return -ENOSPC; } } for (xattr = xattr_array; xattr->name != NULL; xattr++) { CLASS(simple_xattr, new_xattr)(xattr->value, xattr->value_len); if (IS_ERR(new_xattr)) break; len = strlen(xattr->name) + 1; new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, GFP_KERNEL_ACCOUNT); if (!new_xattr->name) break; memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, xattr->name, len); if (simple_xattr_add(xattrs, new_xattr)) break; retain_and_null_ptr(new_xattr); } if (xattr->name != NULL) { if (ispace) { raw_spin_lock(&sbinfo->stat_lock); sbinfo->free_ispace += ispace; raw_spin_unlock(&sbinfo->stat_lock); } return -ENOMEM; } smp_store_release(&info->xattrs, no_free_ptr(xattrs)); return 0; } static int shmem_xattr_handler_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) { struct shmem_inode_info *info = SHMEM_I(inode); struct simple_xattrs *xattrs; xattrs = READ_ONCE(info->xattrs); if (!xattrs) return -ENODATA; name = xattr_full_name(handler, name); return simple_xattr_get(xattrs, name, buffer, size); } static int shmem_xattr_handler_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct simple_xattrs *xattrs; struct simple_xattr *old_xattr; size_t ispace = 0; name = xattr_full_name(handler, name); xattrs = simple_xattrs_lazy_alloc(&info->xattrs, value, flags); if (IS_ERR_OR_NULL(xattrs)) return PTR_ERR(xattrs); if (value && sbinfo->max_inodes) { ispace = simple_xattr_space(name, size); raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->free_ispace < ispace) ispace = 0; else sbinfo->free_ispace -= ispace; raw_spin_unlock(&sbinfo->stat_lock); if (!ispace) return -ENOSPC; } old_xattr = simple_xattr_set(xattrs, name, value, size, flags); if (!IS_ERR(old_xattr)) { ispace = 0; if (old_xattr && sbinfo->max_inodes) ispace = simple_xattr_space(old_xattr->name, old_xattr->size); simple_xattr_free_rcu(old_xattr); old_xattr = NULL; inode_set_ctime_current(inode); inode_inc_iversion(inode); } if (ispace) { raw_spin_lock(&sbinfo->stat_lock); sbinfo->free_ispace += ispace; raw_spin_unlock(&sbinfo->stat_lock); } return PTR_ERR(old_xattr); } static const struct xattr_handler shmem_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = shmem_xattr_handler_get, .set = shmem_xattr_handler_set, }; static const struct xattr_handler shmem_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = shmem_xattr_handler_get, .set = shmem_xattr_handler_set, }; static const struct xattr_handler shmem_user_xattr_handler = { .prefix = XATTR_USER_PREFIX, 
.get = shmem_xattr_handler_get, .set = shmem_xattr_handler_set, }; static const struct xattr_handler * const shmem_xattr_handlers[] = { &shmem_security_xattr_handler, &shmem_trusted_xattr_handler, &shmem_user_xattr_handler, NULL }; static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); return simple_xattr_list(d_inode(dentry), READ_ONCE(info->xattrs), buffer, size); } #endif /* CONFIG_TMPFS_XATTR */ static const struct inode_operations shmem_short_symlink_operations = { .getattr = shmem_getattr, .setattr = shmem_setattr, .get_link = simple_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, #endif }; static const struct inode_operations shmem_symlink_inode_operations = { .getattr = shmem_getattr, .setattr = shmem_setattr, .get_link = shmem_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, #endif }; static struct dentry *shmem_get_parent(struct dentry *child) { return ERR_PTR(-ESTALE); } static int shmem_match(struct inode *ino, void *vfh) { __u32 *fh = vfh; __u64 inum = fh[2]; inum = (inum << 32) | fh[1]; return ino->i_ino == inum && fh[0] == ino->i_generation; } /* Find any alias of inode, but prefer a hashed alias */ static struct dentry *shmem_find_alias(struct inode *inode) { struct dentry *alias = d_find_alias(inode); return alias ?: d_find_any_alias(inode); } static struct dentry *shmem_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { struct inode *inode; struct dentry *dentry = NULL; u64 inum; if (fh_len < 3) return NULL; inum = fid->raw[2]; inum = (inum << 32) | fid->raw[1]; inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), shmem_match, fid->raw); if (inode) { dentry = shmem_find_alias(inode); iput(inode); } return dentry; } static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len, struct inode *parent) { if (*len < 3) { *len = 3; return FILEID_INVALID; } if (inode_unhashed(inode)) { /* Unfortunately insert_inode_hash is not idempotent, * so as we hash inodes here rather than at creation * time, we need a lock to ensure we only try * to do it once */ static DEFINE_SPINLOCK(lock); spin_lock(&lock); if (inode_unhashed(inode)) __insert_inode_hash(inode, inode->i_ino + inode->i_generation); spin_unlock(&lock); } fh[0] = inode->i_generation; fh[1] = inode->i_ino; fh[2] = ((__u64)inode->i_ino) >> 32; *len = 3; return 1; } static const struct export_operations shmem_export_ops = { .get_parent = shmem_get_parent, .encode_fh = shmem_encode_fh, .fh_to_dentry = shmem_fh_to_dentry, }; enum shmem_param { Opt_gid, Opt_huge, Opt_mode, Opt_mpol, Opt_nr_blocks, Opt_nr_inodes, Opt_size, Opt_uid, Opt_inode32, Opt_inode64, Opt_noswap, Opt_quota, Opt_usrquota, Opt_grpquota, Opt_usrquota_block_hardlimit, Opt_usrquota_inode_hardlimit, Opt_grpquota_block_hardlimit, Opt_grpquota_inode_hardlimit, Opt_casefold_version, Opt_casefold, Opt_strict_encoding, }; static const struct constant_table shmem_param_enums_huge[] = { {"never", SHMEM_HUGE_NEVER }, {"always", SHMEM_HUGE_ALWAYS }, {"within_size", SHMEM_HUGE_WITHIN_SIZE }, {"advise", SHMEM_HUGE_ADVISE }, {} }; const struct fs_parameter_spec shmem_fs_parameters[] = { fsparam_gid ("gid", Opt_gid), fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge), fsparam_u32oct("mode", Opt_mode), fsparam_string("mpol", Opt_mpol), fsparam_string("nr_blocks", Opt_nr_blocks), fsparam_string("nr_inodes", Opt_nr_inodes), fsparam_string("size", Opt_size), fsparam_uid ("uid", Opt_uid), fsparam_flag ("inode32", 
Opt_inode32), fsparam_flag ("inode64", Opt_inode64), fsparam_flag ("noswap", Opt_noswap), #ifdef CONFIG_TMPFS_QUOTA fsparam_flag ("quota", Opt_quota), fsparam_flag ("usrquota", Opt_usrquota), fsparam_flag ("grpquota", Opt_grpquota), fsparam_string("usrquota_block_hardlimit", Opt_usrquota_block_hardlimit), fsparam_string("usrquota_inode_hardlimit", Opt_usrquota_inode_hardlimit), fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit), fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit), #endif fsparam_string("casefold", Opt_casefold_version), fsparam_flag ("casefold", Opt_casefold), fsparam_flag ("strict_encoding", Opt_strict_encoding), {} }; #if IS_ENABLED(CONFIG_UNICODE) static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param, bool latest_version) { struct shmem_options *ctx = fc->fs_private; int version = UTF8_LATEST; struct unicode_map *encoding; char *version_str = param->string + 5; if (!latest_version) { if (strncmp(param->string, "utf8-", 5)) return invalfc(fc, "Only UTF-8 encodings are supported " "in the format: utf8-<version number>"); version = utf8_parse_version(version_str); if (version < 0) return invalfc(fc, "Invalid UTF-8 version: %s", version_str); } encoding = utf8_load(version); if (IS_ERR(encoding)) { return invalfc(fc, "Failed loading UTF-8 version: utf8-%u.%u.%u\n", unicode_major(version), unicode_minor(version), unicode_rev(version)); } pr_info("tmpfs: Using encoding : utf8-%u.%u.%u\n", unicode_major(version), unicode_minor(version), unicode_rev(version)); ctx->encoding = encoding; return 0; } #else static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param, bool latest_version) { return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n"); } #endif static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) { struct shmem_options *ctx = fc->fs_private; struct fs_parse_result result; unsigned long long size; char *rest; int opt; kuid_t kuid; kgid_t kgid; opt = fs_parse(fc, shmem_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_size: size = memparse(param->string, &rest); if (*rest == '%') { size <<= PAGE_SHIFT; size *= totalram_pages(); do_div(size, 100); rest++; } if (*rest) goto bad_value; ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE); ctx->seen |= SHMEM_SEEN_BLOCKS; break; case Opt_nr_blocks: ctx->blocks = memparse(param->string, &rest); if (*rest || ctx->blocks > LONG_MAX) goto bad_value; ctx->seen |= SHMEM_SEEN_BLOCKS; break; case Opt_nr_inodes: ctx->inodes = memparse(param->string, &rest); if (*rest || ctx->inodes > ULONG_MAX / BOGO_INODE_SIZE) goto bad_value; ctx->seen |= SHMEM_SEEN_INODES; break; case Opt_mode: ctx->mode = result.uint_32 & 07777; break; case Opt_uid: kuid = result.uid; /* * The requested uid must be representable in the * filesystem's idmapping. */ if (!kuid_has_mapping(fc->user_ns, kuid)) goto bad_value; ctx->uid = kuid; break; case Opt_gid: kgid = result.gid; /* * The requested gid must be representable in the * filesystem's idmapping. 
*/ if (!kgid_has_mapping(fc->user_ns, kgid)) goto bad_value; ctx->gid = kgid; break; case Opt_huge: ctx->huge = result.uint_32; if (ctx->huge != SHMEM_HUGE_NEVER && !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && has_transparent_hugepage())) goto unsupported_parameter; ctx->seen |= SHMEM_SEEN_HUGE; break; case Opt_mpol: if (IS_ENABLED(CONFIG_NUMA)) { mpol_put(ctx->mpol); ctx->mpol = NULL; if (mpol_parse_str(param->string, &ctx->mpol)) goto bad_value; break; } goto unsupported_parameter; case Opt_inode32: ctx->full_inums = false; ctx->seen |= SHMEM_SEEN_INUMS; break; case Opt_inode64: if (sizeof(ino_t) < 8) { return invalfc(fc, "Cannot use inode64 with <64bit inums in kernel\n"); } ctx->full_inums = true; ctx->seen |= SHMEM_SEEN_INUMS; break; case Opt_noswap: if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) { return invalfc(fc, "Turning off swap in unprivileged tmpfs mounts unsupported"); } ctx->noswap = true; break; case Opt_quota: if (fc->user_ns != &init_user_ns) return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); ctx->seen |= SHMEM_SEEN_QUOTA; ctx->quota_types |= (QTYPE_MASK_USR | QTYPE_MASK_GRP); break; case Opt_usrquota: if (fc->user_ns != &init_user_ns) return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); ctx->seen |= SHMEM_SEEN_QUOTA; ctx->quota_types |= QTYPE_MASK_USR; break; case Opt_grpquota: if (fc->user_ns != &init_user_ns) return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); ctx->seen |= SHMEM_SEEN_QUOTA; ctx->quota_types |= QTYPE_MASK_GRP; break; case Opt_usrquota_block_hardlimit: size = memparse(param->string, &rest); if (*rest || !size) goto bad_value; if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) return invalfc(fc, "User quota block hardlimit too large."); ctx->qlimits.usrquota_bhardlimit = size; break; case Opt_grpquota_block_hardlimit: size = memparse(param->string, &rest); if (*rest || !size) goto bad_value; if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) return invalfc(fc, "Group quota block hardlimit too large."); ctx->qlimits.grpquota_bhardlimit = size; break; case Opt_usrquota_inode_hardlimit: size = memparse(param->string, &rest); if (*rest || !size) goto bad_value; if (size > SHMEM_QUOTA_MAX_INO_LIMIT) return invalfc(fc, "User quota inode hardlimit too large."); ctx->qlimits.usrquota_ihardlimit = size; break; case Opt_grpquota_inode_hardlimit: size = memparse(param->string, &rest); if (*rest || !size) goto bad_value; if (size > SHMEM_QUOTA_MAX_INO_LIMIT) return invalfc(fc, "Group quota inode hardlimit too large."); ctx->qlimits.grpquota_ihardlimit = size; break; case Opt_casefold_version: return shmem_parse_opt_casefold(fc, param, false); case Opt_casefold: return shmem_parse_opt_casefold(fc, param, true); case Opt_strict_encoding: #if IS_ENABLED(CONFIG_UNICODE) ctx->strict_encoding = true; break; #else return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n"); #endif } return 0; unsupported_parameter: return invalfc(fc, "Unsupported parameter '%s'", param->key); bad_value: return invalfc(fc, "Bad value for '%s'", param->key); } static char *shmem_next_opt(char **s) { char *sbegin = *s; char *p; if (sbegin == NULL) return NULL; /* * NUL-terminate this option: unfortunately, * mount options form a comma-separated list, * but mpol's nodelist may also contain commas. 
*/ for (;;) { p = strchr(*s, ','); if (p == NULL) break; *s = p + 1; if (!isdigit(*(p+1))) { *p = '\0'; return sbegin; } } *s = NULL; return sbegin; } static int shmem_parse_monolithic(struct fs_context *fc, void *data) { return vfs_parse_monolithic_sep(fc, data, shmem_next_opt); } /* * Reconfigure a shmem filesystem. */ static int shmem_reconfigure(struct fs_context *fc) { struct shmem_options *ctx = fc->fs_private; struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); unsigned long used_isp; struct mempolicy *mpol = NULL; const char *err; raw_spin_lock(&sbinfo->stat_lock); used_isp = sbinfo->max_inodes * BOGO_INODE_SIZE - sbinfo->free_ispace; if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { if (!sbinfo->max_blocks) { err = "Cannot retroactively limit size"; goto out; } if (percpu_counter_compare(&sbinfo->used_blocks, ctx->blocks) > 0) { err = "Too small a size for current use"; goto out; } } if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) { if (!sbinfo->max_inodes) { err = "Cannot retroactively limit inodes"; goto out; } if (ctx->inodes * BOGO_INODE_SIZE < used_isp) { err = "Too few inodes for current use"; goto out; } } if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums && sbinfo->next_ino > UINT_MAX) { err = "Current inum too high to switch to 32-bit inums"; goto out; } /* * "noswap" doesn't use fsparam_flag_no, i.e. there's no "swap" * counterpart for (re-)enabling swap. */ if (ctx->noswap && !sbinfo->noswap) { err = "Cannot disable swap on remount"; goto out; } if (ctx->seen & SHMEM_SEEN_QUOTA && !sb_any_quota_loaded(fc->root->d_sb)) { err = "Cannot enable quota on remount"; goto out; } #ifdef CONFIG_TMPFS_QUOTA #define CHANGED_LIMIT(name) \ (ctx->qlimits.name## hardlimit && \ (ctx->qlimits.name## hardlimit != sbinfo->qlimits.name## hardlimit)) if (CHANGED_LIMIT(usrquota_b) || CHANGED_LIMIT(usrquota_i) || CHANGED_LIMIT(grpquota_b) || CHANGED_LIMIT(grpquota_i)) { err = "Cannot change global quota limit on remount"; goto out; } #endif /* CONFIG_TMPFS_QUOTA */ if (ctx->seen & SHMEM_SEEN_HUGE) sbinfo->huge = ctx->huge; if (ctx->seen & SHMEM_SEEN_INUMS) sbinfo->full_inums = ctx->full_inums; if (ctx->seen & SHMEM_SEEN_BLOCKS) sbinfo->max_blocks = ctx->blocks; if (ctx->seen & SHMEM_SEEN_INODES) { sbinfo->max_inodes = ctx->inodes; sbinfo->free_ispace = ctx->inodes * BOGO_INODE_SIZE - used_isp; } /* * Preserve previous mempolicy unless mpol remount option was specified. 
*/ if (ctx->mpol) { mpol = sbinfo->mpol; sbinfo->mpol = ctx->mpol; /* transfers initial ref */ ctx->mpol = NULL; } if (ctx->noswap) sbinfo->noswap = true; raw_spin_unlock(&sbinfo->stat_lock); mpol_put(mpol); return 0; out: raw_spin_unlock(&sbinfo->stat_lock); return invalfc(fc, "%s", err); } static int shmem_show_options(struct seq_file *seq, struct dentry *root) { struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb); struct mempolicy *mpol; if (sbinfo->max_blocks != shmem_default_max_blocks()) seq_printf(seq, ",size=%luk", K(sbinfo->max_blocks)); if (sbinfo->max_inodes != shmem_default_max_inodes()) seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); if (sbinfo->mode != (0777 | S_ISVTX)) seq_printf(seq, ",mode=%03ho", sbinfo->mode); if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, sbinfo->uid)); if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, sbinfo->gid)); /* * Showing inode{64,32} might be useful even if it's the system default, * since then people don't have to resort to checking both here and * /proc/config.gz to confirm 64-bit inums were successfully applied * (which may not even exist if IKCONFIG_PROC isn't enabled). * * We hide it when inode64 isn't the default and we are using 32-bit * inodes, since that probably just means the feature isn't even under * consideration. * * As such: * * +-----------------+-----------------+ * | TMPFS_INODE64=y | TMPFS_INODE64=n | * +------------------+-----------------+-----------------+ * | full_inums=true | show | show | * | full_inums=false | show | hide | * +------------------+-----------------+-----------------+ * */ if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums) seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 
64 : 32)); #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */ if (sbinfo->huge) seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); #endif mpol = shmem_get_sbmpol(sbinfo); shmem_show_mpol(seq, mpol); mpol_put(mpol); if (sbinfo->noswap) seq_printf(seq, ",noswap"); #ifdef CONFIG_TMPFS_QUOTA if (sb_has_quota_active(root->d_sb, USRQUOTA)) seq_printf(seq, ",usrquota"); if (sb_has_quota_active(root->d_sb, GRPQUOTA)) seq_printf(seq, ",grpquota"); if (sbinfo->qlimits.usrquota_bhardlimit) seq_printf(seq, ",usrquota_block_hardlimit=%lld", sbinfo->qlimits.usrquota_bhardlimit); if (sbinfo->qlimits.grpquota_bhardlimit) seq_printf(seq, ",grpquota_block_hardlimit=%lld", sbinfo->qlimits.grpquota_bhardlimit); if (sbinfo->qlimits.usrquota_ihardlimit) seq_printf(seq, ",usrquota_inode_hardlimit=%lld", sbinfo->qlimits.usrquota_ihardlimit); if (sbinfo->qlimits.grpquota_ihardlimit) seq_printf(seq, ",grpquota_inode_hardlimit=%lld", sbinfo->qlimits.grpquota_ihardlimit); #endif return 0; } #endif /* CONFIG_TMPFS */ static void shmem_put_super(struct super_block *sb) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); #if IS_ENABLED(CONFIG_UNICODE) if (sb->s_encoding) utf8_unload(sb->s_encoding); #endif #ifdef CONFIG_TMPFS_QUOTA shmem_disable_quotas(sb); #endif free_percpu(sbinfo->ino_batch); percpu_counter_destroy(&sbinfo->used_blocks); mpol_put(sbinfo->mpol); kfree(sbinfo); sb->s_fs_info = NULL; } #if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_TMPFS) static const struct dentry_operations shmem_ci_dentry_ops = { .d_hash = generic_ci_d_hash, .d_compare = generic_ci_d_compare, }; #endif static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) { struct shmem_options *ctx = fc->fs_private; struct inode *inode; struct shmem_sb_info *sbinfo; int error = -ENOMEM; /* Round up to L1_CACHE_BYTES to resist false sharing */ sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), L1_CACHE_BYTES), GFP_KERNEL); if (!sbinfo) return error; sb->s_fs_info = sbinfo; #ifdef CONFIG_TMPFS /* * Per default we only allow half of the physical ram per * tmpfs instance, limiting inodes to one per page of lowmem; * but the internal instance is left unlimited. 
*/ if (!(sb->s_flags & SB_KERNMOUNT)) { if (!(ctx->seen & SHMEM_SEEN_BLOCKS)) ctx->blocks = shmem_default_max_blocks(); if (!(ctx->seen & SHMEM_SEEN_INODES)) ctx->inodes = shmem_default_max_inodes(); if (!(ctx->seen & SHMEM_SEEN_INUMS)) ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64); sbinfo->noswap = ctx->noswap; } else { sb->s_flags |= SB_NOUSER; } sb->s_export_op = &shmem_export_ops; sb->s_flags |= SB_NOSEC; #if IS_ENABLED(CONFIG_UNICODE) if (!ctx->encoding && ctx->strict_encoding) { pr_err("tmpfs: strict_encoding option without encoding is forbidden\n"); error = -EINVAL; goto failed; } if (ctx->encoding) { sb->s_encoding = ctx->encoding; set_default_d_op(sb, &shmem_ci_dentry_ops); if (ctx->strict_encoding) sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL; } #endif #else sb->s_flags |= SB_NOUSER; #endif /* CONFIG_TMPFS */ sb->s_d_flags |= DCACHE_DONTCACHE; sbinfo->max_blocks = ctx->blocks; sbinfo->max_inodes = ctx->inodes; sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE; if (sb->s_flags & SB_KERNMOUNT) { sbinfo->ino_batch = alloc_percpu(ino_t); if (!sbinfo->ino_batch) goto failed; } sbinfo->uid = ctx->uid; sbinfo->gid = ctx->gid; sbinfo->full_inums = ctx->full_inums; sbinfo->mode = ctx->mode; #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (ctx->seen & SHMEM_SEEN_HUGE) sbinfo->huge = ctx->huge; else sbinfo->huge = tmpfs_huge; #endif sbinfo->mpol = ctx->mpol; ctx->mpol = NULL; raw_spin_lock_init(&sbinfo->stat_lock); if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) goto failed; spin_lock_init(&sbinfo->shrinklist_lock); INIT_LIST_HEAD(&sbinfo->shrinklist); sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = TMPFS_MAGIC; sb->s_op = &shmem_ops; sb->s_time_gran = 1; #ifdef CONFIG_TMPFS_XATTR sb->s_xattr = shmem_xattr_handlers; #endif #ifdef CONFIG_TMPFS_POSIX_ACL sb->s_flags |= SB_POSIXACL; #endif uuid_t uuid; uuid_gen(&uuid); super_set_uuid(sb, uuid.b, sizeof(uuid)); #ifdef CONFIG_TMPFS_QUOTA if (ctx->seen & SHMEM_SEEN_QUOTA) { sb->dq_op = &shmem_quota_operations; sb->s_qcop = &dquot_quotactl_sysfile_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; /* Copy the default limits from ctx into sbinfo */ memcpy(&sbinfo->qlimits, &ctx->qlimits, sizeof(struct shmem_quota_limits)); if (shmem_enable_quotas(sb, ctx->quota_types)) goto failed; } #endif /* CONFIG_TMPFS_QUOTA */ inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0, mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(inode)) { error = PTR_ERR(inode); goto failed; } inode->i_uid = sbinfo->uid; inode->i_gid = sbinfo->gid; sb->s_root = d_make_root(inode); if (!sb->s_root) goto failed; return 0; failed: shmem_put_super(sb); return error; } static int shmem_get_tree(struct fs_context *fc) { return get_tree_nodev(fc, shmem_fill_super); } static void shmem_free_fc(struct fs_context *fc) { struct shmem_options *ctx = fc->fs_private; if (ctx) { mpol_put(ctx->mpol); kfree(ctx); } } static const struct fs_context_operations shmem_fs_context_ops = { .free = shmem_free_fc, .get_tree = shmem_get_tree, #ifdef CONFIG_TMPFS .parse_monolithic = shmem_parse_monolithic, .parse_param = shmem_parse_one, .reconfigure = shmem_reconfigure, #endif }; static struct kmem_cache *shmem_inode_cachep __ro_after_init; static struct inode *shmem_alloc_inode(struct super_block *sb) { struct shmem_inode_info *info; info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL); if (!info) return NULL; return &info->vfs_inode; } static void shmem_free_in_core_inode(struct inode 
*inode) { if (S_ISLNK(inode->i_mode)) kfree(inode->i_link); kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } static void shmem_destroy_inode(struct inode *inode) { if (S_ISREG(inode->i_mode)) mpol_free_shared_policy(&SHMEM_I(inode)->policy); if (S_ISDIR(inode->i_mode)) simple_offset_destroy(shmem_get_offset_ctx(inode)); } static void shmem_init_inode(void *foo) { struct shmem_inode_info *info = foo; inode_init_once(&info->vfs_inode); } static void __init shmem_init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); } static void __init shmem_destroy_inodecache(void) { kmem_cache_destroy(shmem_inode_cachep); } /* Keep the page in page cache instead of truncating it */ static int shmem_error_remove_folio(struct address_space *mapping, struct folio *folio) { return 0; } static const struct address_space_operations shmem_aops = { .dirty_folio = noop_dirty_folio, #ifdef CONFIG_TMPFS .write_begin = shmem_write_begin, .write_end = shmem_write_end, #endif #ifdef CONFIG_MIGRATION .migrate_folio = migrate_folio, #endif .error_remove_folio = shmem_error_remove_folio, }; static const struct file_operations shmem_file_operations = { .mmap_prepare = shmem_mmap_prepare, .open = shmem_file_open, .get_unmapped_area = shmem_get_unmapped_area, #ifdef CONFIG_TMPFS .llseek = shmem_file_llseek, .read_iter = shmem_file_read_iter, .write_iter = shmem_file_write_iter, .fsync = noop_fsync, .splice_read = shmem_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = shmem_fallocate, .setlease = generic_setlease, #endif }; static const struct inode_operations shmem_inode_operations = { .getattr = shmem_getattr, .setattr = shmem_setattr, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, .set_acl = simple_set_acl, .fileattr_get = shmem_fileattr_get, .fileattr_set = shmem_fileattr_set, #endif }; static const struct inode_operations shmem_dir_inode_operations = { #ifdef CONFIG_TMPFS .getattr = shmem_getattr, .create = shmem_create, .lookup = simple_lookup, .link = shmem_link, .unlink = shmem_unlink, .symlink = shmem_symlink, .mkdir = shmem_mkdir, .rmdir = shmem_rmdir, .mknod = shmem_mknod, .rename = shmem_rename2, .tmpfile = shmem_tmpfile, .get_offset_ctx = shmem_get_offset_ctx, #endif #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, .fileattr_get = shmem_fileattr_get, .fileattr_set = shmem_fileattr_set, #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, .set_acl = simple_set_acl, #endif }; static const struct inode_operations shmem_special_inode_operations = { .getattr = shmem_getattr, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, .set_acl = simple_set_acl, #endif }; static const struct super_operations shmem_ops = { .alloc_inode = shmem_alloc_inode, .free_inode = shmem_free_in_core_inode, .destroy_inode = shmem_destroy_inode, #ifdef CONFIG_TMPFS .statfs = shmem_statfs, .show_options = shmem_show_options, #endif #ifdef CONFIG_TMPFS_QUOTA .get_dquots = shmem_get_dquots, #endif .evict_inode = shmem_evict_inode, .drop_inode = inode_just_drop, .put_super = shmem_put_super, #ifdef CONFIG_TRANSPARENT_HUGEPAGE .nr_cached_objects = shmem_unused_huge_count, .free_cached_objects = shmem_unused_huge_scan, #endif }; static const struct vm_operations_struct shmem_vm_ops = { .fault = shmem_fault, .map_pages = filemap_map_pages, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, 
#endif #ifdef CONFIG_USERFAULTFD .uffd_ops = &shmem_uffd_ops, #endif }; static const struct vm_operations_struct shmem_anon_vm_ops = { .fault = shmem_fault, .map_pages = filemap_map_pages, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif #ifdef CONFIG_USERFAULTFD .uffd_ops = &shmem_uffd_ops, #endif }; int shmem_init_fs_context(struct fs_context *fc) { struct shmem_options *ctx; ctx = kzalloc_obj(struct shmem_options); if (!ctx) return -ENOMEM; ctx->mode = 0777 | S_ISVTX; ctx->uid = current_fsuid(); ctx->gid = current_fsgid(); #if IS_ENABLED(CONFIG_UNICODE) ctx->encoding = NULL; #endif fc->fs_private = ctx; fc->ops = &shmem_fs_context_ops; #ifdef CONFIG_TMPFS fc->sb_flags |= SB_I_VERSION; #endif return 0; } static struct file_system_type shmem_fs_type = { .owner = THIS_MODULE, .name = "tmpfs", .init_fs_context = shmem_init_fs_context, #ifdef CONFIG_TMPFS .parameters = shmem_fs_parameters, #endif .kill_sb = kill_anon_super, .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME, }; #if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS) #define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ { \ .attr = { .name = __stringify(_name), .mode = _mode }, \ .show = _show, \ .store = _store, \ } #define TMPFS_ATTR_W(_name, _store) \ static struct kobj_attribute tmpfs_attr_##_name = \ __INIT_KOBJ_ATTR(_name, 0200, NULL, _store) #define TMPFS_ATTR_RW(_name, _show, _store) \ static struct kobj_attribute tmpfs_attr_##_name = \ __INIT_KOBJ_ATTR(_name, 0644, _show, _store) #define TMPFS_ATTR_RO(_name, _show) \ static struct kobj_attribute tmpfs_attr_##_name = \ __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) #if IS_ENABLED(CONFIG_UNICODE) static ssize_t casefold_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { return sysfs_emit(buf, "supported\n"); } TMPFS_ATTR_RO(casefold, casefold_show); #endif static struct attribute *tmpfs_attributes[] = { #if IS_ENABLED(CONFIG_UNICODE) &tmpfs_attr_casefold.attr, #endif NULL }; static const struct attribute_group tmpfs_attribute_group = { .attrs = tmpfs_attributes, .name = "features" }; static struct kobject *tmpfs_kobj; static int __init tmpfs_sysfs_init(void) { int ret; tmpfs_kobj = kobject_create_and_add("tmpfs", fs_kobj); if (!tmpfs_kobj) return -ENOMEM; ret = sysfs_create_group(tmpfs_kobj, &tmpfs_attribute_group); if (ret) kobject_put(tmpfs_kobj); return ret; } #endif /* CONFIG_SYSFS && CONFIG_TMPFS */ void __init shmem_init(void) { int error; shmem_init_inodecache(); #ifdef CONFIG_TMPFS_QUOTA register_quota_format(&shmem_quota_format); #endif error = register_filesystem(&shmem_fs_type); if (error) { pr_err("Could not register tmpfs\n"); goto out2; } shm_mnt = kern_mount(&shmem_fs_type); if (IS_ERR(shm_mnt)) { error = PTR_ERR(shm_mnt); pr_err("Could not kern_mount tmpfs\n"); goto out1; } #if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS) error = tmpfs_sysfs_init(); if (error) { pr_err("Could not init tmpfs sysfs\n"); goto out1; } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; else shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */ /* * Default to setting PMD-sized THP to inherit the global setting and * disable all other multi-size THPs. 
*/ if (!shmem_orders_configured) huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER); #endif return; out1: unregister_filesystem(&shmem_fs_type); out2: #ifdef CONFIG_TMPFS_QUOTA unregister_quota_format(&shmem_quota_format); #endif shmem_destroy_inodecache(); shm_mnt = ERR_PTR(error); } #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS) static ssize_t shmem_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { static const int values[] = { SHMEM_HUGE_ALWAYS, SHMEM_HUGE_WITHIN_SIZE, SHMEM_HUGE_ADVISE, SHMEM_HUGE_NEVER, SHMEM_HUGE_DENY, SHMEM_HUGE_FORCE, }; int len = 0; int i; for (i = 0; i < ARRAY_SIZE(values); i++) { len += sysfs_emit_at(buf, len, shmem_huge == values[i] ? "%s[%s]" : "%s%s", i ? " " : "", shmem_format_huge(values[i])); } len += sysfs_emit_at(buf, len, "\n"); return len; } static ssize_t shmem_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { char tmp[16]; int huge, err; if (count + 1 > sizeof(tmp)) return -EINVAL; memcpy(tmp, buf, count); tmp[count] = '\0'; if (count && tmp[count - 1] == '\n') tmp[count - 1] = '\0'; huge = shmem_parse_huge(tmp); if (huge == -EINVAL) return huge; shmem_huge = huge; if (shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; err = start_stop_khugepaged(); return err ? err : count; } struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled); static DEFINE_SPINLOCK(huge_shmem_orders_lock); static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { int order = to_thpsize(kobj)->order; const char *output; if (test_bit(order, &huge_shmem_orders_always)) output = "[always] inherit within_size advise never"; else if (test_bit(order, &huge_shmem_orders_inherit)) output = "always [inherit] within_size advise never"; else if (test_bit(order, &huge_shmem_orders_within_size)) output = "always inherit [within_size] advise never"; else if (test_bit(order, &huge_shmem_orders_madvise)) output = "always inherit within_size [advise] never"; else output = "always inherit within_size advise [never]"; return sysfs_emit(buf, "%s\n", output); } static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int order = to_thpsize(kobj)->order; ssize_t ret = count; if (sysfs_streq(buf, "always")) { spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_inherit); clear_bit(order, &huge_shmem_orders_madvise); clear_bit(order, &huge_shmem_orders_within_size); set_bit(order, &huge_shmem_orders_always); spin_unlock(&huge_shmem_orders_lock); } else if (sysfs_streq(buf, "inherit")) { /* Do not override huge allocation policy with non-PMD sized mTHP */ if (shmem_huge == SHMEM_HUGE_FORCE && !is_pmd_order(order)) return -EINVAL; spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_always); clear_bit(order, &huge_shmem_orders_madvise); clear_bit(order, &huge_shmem_orders_within_size); set_bit(order, &huge_shmem_orders_inherit); spin_unlock(&huge_shmem_orders_lock); } else if (sysfs_streq(buf, "within_size")) { spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_always); clear_bit(order, &huge_shmem_orders_inherit); clear_bit(order, &huge_shmem_orders_madvise); set_bit(order, &huge_shmem_orders_within_size); spin_unlock(&huge_shmem_orders_lock); } else if (sysfs_streq(buf, "advise")) { spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_always); clear_bit(order, 
&huge_shmem_orders_inherit); clear_bit(order, &huge_shmem_orders_within_size); set_bit(order, &huge_shmem_orders_madvise); spin_unlock(&huge_shmem_orders_lock); } else if (sysfs_streq(buf, "never")) { spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_always); clear_bit(order, &huge_shmem_orders_inherit); clear_bit(order, &huge_shmem_orders_within_size); clear_bit(order, &huge_shmem_orders_madvise); spin_unlock(&huge_shmem_orders_lock); } else { ret = -EINVAL; } if (ret > 0) { int err = start_stop_khugepaged(); if (err) ret = err; } return ret; } struct kobj_attribute thpsize_shmem_enabled_attr = __ATTR(shmem_enabled, 0644, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store); #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */ #if defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __init setup_transparent_hugepage_shmem(char *str) { int huge; huge = shmem_parse_huge(str); if (huge == -EINVAL) { pr_warn("transparent_hugepage_shmem= cannot parse, ignored\n"); return huge; } shmem_huge = huge; return 1; } __setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem); static int __init setup_transparent_hugepage_tmpfs(char *str) { int huge; huge = shmem_parse_huge(str); if (huge < 0) { pr_warn("transparent_hugepage_tmpfs= cannot parse, ignored\n"); return huge; } tmpfs_huge = huge; return 1; } __setup("transparent_hugepage_tmpfs=", setup_transparent_hugepage_tmpfs); static char str_dup[PAGE_SIZE] __initdata; static int __init setup_thp_shmem(char *str) { char *token, *range, *policy, *subtoken; unsigned long always, inherit, madvise, within_size; char *start_size, *end_size; int start, end, nr; char *p; if (!str || strlen(str) + 1 > PAGE_SIZE) goto err; strscpy(str_dup, str); always = huge_shmem_orders_always; inherit = huge_shmem_orders_inherit; madvise = huge_shmem_orders_madvise; within_size = huge_shmem_orders_within_size; p = str_dup; while ((token = strsep(&p, ";")) != NULL) { range = strsep(&token, ":"); policy = token; if (!policy) goto err; while ((subtoken = strsep(&range, ",")) != NULL) { if (strchr(subtoken, '-')) { start_size = strsep(&subtoken, "-"); end_size = subtoken; start = get_order_from_str(start_size, THP_ORDERS_ALL_FILE_DEFAULT); end = get_order_from_str(end_size, THP_ORDERS_ALL_FILE_DEFAULT); } else { start_size = end_size = subtoken; start = end = get_order_from_str(subtoken, THP_ORDERS_ALL_FILE_DEFAULT); } if (start < 0) { pr_err("invalid size %s in thp_shmem boot parameter\n", start_size); goto err; } if (end < 0) { pr_err("invalid size %s in thp_shmem boot parameter\n", end_size); goto err; } if (start > end) goto err; nr = end - start + 1; if (!strcmp(policy, "always")) { bitmap_set(&always, start, nr); bitmap_clear(&inherit, start, nr); bitmap_clear(&madvise, start, nr); bitmap_clear(&within_size, start, nr); } else if (!strcmp(policy, "advise")) { bitmap_set(&madvise, start, nr); bitmap_clear(&inherit, start, nr); bitmap_clear(&always, start, nr); bitmap_clear(&within_size, start, nr); } else if (!strcmp(policy, "inherit")) { bitmap_set(&inherit, start, nr); bitmap_clear(&madvise, start, nr); bitmap_clear(&always, start, nr); bitmap_clear(&within_size, start, nr); } else if (!strcmp(policy, "within_size")) { bitmap_set(&within_size, start, nr); bitmap_clear(&inherit, start, nr); bitmap_clear(&madvise, start, nr); bitmap_clear(&always, start, nr); } else if (!strcmp(policy, "never")) { bitmap_clear(&inherit, start, nr); bitmap_clear(&madvise, start, nr); bitmap_clear(&always, start, nr); bitmap_clear(&within_size, start, nr); 
} else { pr_err("invalid policy %s in thp_shmem boot parameter\n", policy); goto err; } } } huge_shmem_orders_always = always; huge_shmem_orders_madvise = madvise; huge_shmem_orders_inherit = inherit; huge_shmem_orders_within_size = within_size; shmem_orders_configured = true; return 1; err: pr_warn("thp_shmem=%s: error parsing string, ignoring setting\n", str); return 0; } __setup("thp_shmem=", setup_thp_shmem); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #else /* !CONFIG_SHMEM */ /* * tiny-shmem: simple shmemfs and tmpfs using ramfs code * * This is intended for small systems where the benefits of the full * shmem code (swap-backed and resource-limited) are outweighed by * its complexity. On systems without swap this code should be * effectively equivalent, but much lighter weight. */ static struct file_system_type shmem_fs_type = { .name = "tmpfs", .init_fs_context = ramfs_init_fs_context, .parameters = ramfs_fs_parameters, .kill_sb = ramfs_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; void __init shmem_init(void) { BUG_ON(register_filesystem(&shmem_fs_type) != 0); shm_mnt = kern_mount(&shmem_fs_type); BUG_ON(IS_ERR(shm_mnt)); } int shmem_unuse(unsigned int type) { return 0; } int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) { return 0; } void shmem_unlock_mapping(struct address_space *mapping) { } #ifdef CONFIG_MMU unsigned long shmem_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { return mm_get_unmapped_area(file, addr, len, pgoff, flags); } #endif void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) { truncate_inode_pages_range(inode->i_mapping, lstart, lend); } EXPORT_SYMBOL_GPL(shmem_truncate_range); #define shmem_vm_ops generic_file_vm_ops #define shmem_anon_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations static inline int shmem_acct_size(unsigned long flags, loff_t size) { return 0; } static inline void shmem_unacct_size(unsigned long flags, loff_t size) { } static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, vma_flags_t flags) { struct inode *inode = ramfs_get_inode(sb, dir, mode, dev); return inode ? inode : ERR_PTR(-ENOSPC); } #endif /* CONFIG_SHMEM */ /* common code */ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size, vma_flags_t flags, unsigned int i_flags) { const unsigned long shmem_flags = vma_flags_test(&flags, VMA_NORESERVE_BIT) ? SHMEM_F_NORESERVE : 0; struct inode *inode; struct file *res; if (IS_ERR(mnt)) return ERR_CAST(mnt); if (size < 0 || size > MAX_LFS_FILESIZE) return ERR_PTR(-EINVAL); if (is_idmapped_mnt(mnt)) return ERR_PTR(-EINVAL); if (shmem_acct_size(shmem_flags, size)) return ERR_PTR(-ENOMEM); inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); if (IS_ERR(inode)) { shmem_unacct_size(shmem_flags, size); return ERR_CAST(inode); } inode->i_flags |= i_flags; inode->i_size = size; clear_nlink(inode); /* It is unlinked */ res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); if (!IS_ERR(res)) res = alloc_file_pseudo(inode, mnt, name, O_RDWR, &shmem_file_operations); if (IS_ERR(res)) iput(inode); return res; } /** * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be * kernel internal. There will be NO LSM permission checks against the * underlying inode. So users of this interface must do LSM checks at a * higher layer.
The users are the big_key and shm implementations. LSM * checks are provided at the key or shm level rather than the inode. * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VMA_NORESERVE_BIT suppresses pre-accounting of the entire object size */ struct file *shmem_kernel_file_setup(const char *name, loff_t size, vma_flags_t flags) { return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE); } EXPORT_SYMBOL_GPL(shmem_kernel_file_setup); /** * shmem_file_setup - get an unlinked file living in tmpfs * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VMA_NORESERVE_BIT suppresses pre-accounting of the entire object size */ struct file *shmem_file_setup(const char *name, loff_t size, vma_flags_t flags) { return __shmem_file_setup(shm_mnt, name, size, flags, 0); } EXPORT_SYMBOL_GPL(shmem_file_setup); /** * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs * @mnt: the tmpfs mount where the file will be created * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VMA_NORESERVE_BIT suppresses pre-accounting of the entire object size */ struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, loff_t size, vma_flags_t flags) { return __shmem_file_setup(mnt, name, size, flags, 0); } EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt); static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, vma_flags_t flags) { loff_t size = end - start; /* * Cloning a new file under mmap_lock leads to a lock ordering conflict * between XFS directory reading and selinux: since this file is only * accessible to the user through its mapping, use S_PRIVATE flag to * bypass file security, in the same way as shmem_kernel_file_setup(). */ return shmem_kernel_file_setup("dev/zero", size, flags); } /** * shmem_zero_setup - setup a shared anonymous mapping * @vma: the vma to be mmapped is prepared by do_mmap * Returns: 0 on success, or error */ int shmem_zero_setup(struct vm_area_struct *vma) { struct file *file = __shmem_zero_setup(vma->vm_start, vma->vm_end, vma->flags); if (IS_ERR(file)) return PTR_ERR(file); if (vma->vm_file) fput(vma->vm_file); vma->vm_file = file; vma->vm_ops = &shmem_anon_vm_ops; return 0; } /** * shmem_zero_setup_desc - same as shmem_zero_setup, but determined by VMA * descriptor for convenience. * @desc: Describes VMA * Returns: 0 on success, or error */ int shmem_zero_setup_desc(struct vm_area_desc *desc) { struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vma_flags); if (IS_ERR(file)) return PTR_ERR(file); desc->vm_file = file; desc->vm_ops = &shmem_anon_vm_ops; return 0; } /** * shmem_read_folio_gfp - read into page cache, using specified page allocation flags. * @mapping: the folio's address_space * @index: the folio index * @gfp: the page allocator flags to use if allocating * * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", * with any new page allocations done using the specified allocation flags. * But read_cache_page_gfp() uses the ->read_folio() method: which does not * suit tmpfs, since it may have pages in swapcache, and needs to find those * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. * * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily. 
*/ struct folio *shmem_read_folio_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { #ifdef CONFIG_SHMEM struct inode *inode = mapping->host; struct folio *folio; int error; error = shmem_get_folio_gfp(inode, index, i_size_read(inode), &folio, SGP_CACHE, gfp, NULL, NULL); if (error) return ERR_PTR(error); folio_unlock(folio); return folio; #else /* * The tiny !SHMEM case uses ramfs without swap */ return mapping_read_folio_gfp(mapping, index, gfp); #endif } EXPORT_SYMBOL_GPL(shmem_read_folio_gfp); struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp); struct page *page; if (IS_ERR(folio)) return &folio->page; page = folio_file_page(folio, index); if (PageHWPoison(page)) { folio_put(folio); return ERR_PTR(-EIO); } return page; } EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
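/*
 * Editor's illustrative sketch (not part of mm/shmem.c): a hypothetical
 * in-kernel caller of the interfaces defined above.  It assumes the
 * vma_flags_t-based signatures used in this file (shmem_file_setup(),
 * mk_vma_flags(), VMA_NORESERVE_BIT); shmem_usage_sketch() itself is made
 * up for illustration and error handling is kept minimal.
 */
#include <linux/err.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/shmem_fs.h>
#include <linux/sizes.h>

static int shmem_usage_sketch(void)
{
	struct file *file;
	struct page *page;

	/* Unlinked 1 MiB tmpfs file; "example" is the name /proc/<pid>/maps would show. */
	file = shmem_file_setup("example", SZ_1M,
				mk_vma_flags(VMA_NORESERVE_BIT));
	if (IS_ERR(file))
		return PTR_ERR(file);

	/*
	 * Allocate (or look up) page 0 of the file.  Unlike
	 * read_cache_page_gfp(), this also finds pages that tmpfs has moved
	 * to swapcache, as the comment above shmem_read_folio_gfp() explains.
	 */
	page = shmem_read_mapping_page_gfp(file->f_mapping, 0, GFP_KERNEL);
	if (IS_ERR(page)) {
		fput(file);
		return PTR_ERR(page);
	}

	/* ... use the page contents here ... */

	put_page(page);
	fput(file);
	return 0;
}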
/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM hugetlbfs #if !defined(_TRACE_HUGETLBFS_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_HUGETLBFS_H #include <linux/tracepoint.h> TRACE_EVENT(hugetlbfs_alloc_inode, TP_PROTO(struct inode *inode, struct inode *dir, int mode), TP_ARGS(inode, dir, mode), TP_STRUCT__entry( __field(u64, ino) __field(u64, dir) __field(dev_t, dev) __field(__u16, mode) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->dir = dir ? dir->i_ino : 0; __entry->mode = mode; ), TP_printk("dev %d,%d ino %llu dir %llu mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->dir, __entry->mode) ); DECLARE_EVENT_CLASS(hugetlbfs__inode, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field(u64, ino) __field(loff_t, size) __field(blkcnt_t, blocks) __field(dev_t, dev) __field(unsigned int, nlink) __field(unsigned int, seals) __field(__u16, mode) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->mode = inode->i_mode; __entry->size = inode->i_size; __entry->nlink = inode->i_nlink; __entry->seals = HUGETLBFS_I(inode)->seals; __entry->blocks = inode->i_blocks; ), TP_printk("dev %d,%d ino %llu mode 0%o size %lld nlink %u seals %u blocks %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->mode, __entry->size, __entry->nlink, __entry->seals, (unsigned long long)__entry->blocks) ); DEFINE_EVENT(hugetlbfs__inode, hugetlbfs_evict_inode, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(hugetlbfs__inode, hugetlbfs_free_inode, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); TRACE_EVENT(hugetlbfs_setattr, TP_PROTO(struct inode *inode, struct dentry *dentry, struct iattr *attr), TP_ARGS(inode, dentry, attr), TP_STRUCT__entry( __field(u64, ino) __field(loff_t, old_size) __field(loff_t, ia_size) __field(dev_t, dev) __field(unsigned int, d_len) __string(d_name, dentry->d_name.name) __field(unsigned int, ia_valid) __field(unsigned int, ia_mode) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->d_len = dentry->d_name.len; __assign_str(d_name); __entry->ia_valid = attr->ia_valid; __entry->ia_mode = attr->ia_mode; __entry->old_size = inode->i_size; __entry->ia_size = attr->ia_size; ), TP_printk("dev %d,%d ino %llu name %.*s valid %#x mode 0%o old_size %lld size %lld", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->d_len, __get_str(d_name), __entry->ia_valid, __entry->ia_mode, __entry->old_size, __entry->ia_size) ); TRACE_EVENT(hugetlbfs_fallocate, TP_PROTO(struct inode *inode, int mode, loff_t offset, loff_t len, int ret), TP_ARGS(inode, mode, offset, len, ret), TP_STRUCT__entry( __field(u64, ino) __field(loff_t, offset) __field(loff_t, len) __field(loff_t, size) __field(dev_t, dev) __field(int, mode) __field(int, ret) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->mode = mode; __entry->offset =
offset; __entry->len = len; __entry->size = inode->i_size; __entry->ret = ret; ), TP_printk("dev %d,%d ino %llu mode 0%o offset %lld len %lld size %lld ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->mode, (unsigned long long)__entry->offset, (unsigned long long)__entry->len, (unsigned long long)__entry->size, __entry->ret) ); #endif /* _TRACE_HUGETLBFS_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
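/*
 * Editor's illustrative sketch (not part of the header above): the
 * TRACE_EVENT()/DEFINE_EVENT() definitions expand into trace_hugetlbfs_*()
 * hook functions.  A caller such as fs/hugetlbfs/inode.c defines
 * CREATE_TRACE_POINTS once before including the header (assumed here to be
 * installed as <trace/events/hugetlbfs.h>, as TRACE_SYSTEM suggests) and
 * then invokes the generated hooks at the matching points.
 * hugetlbfs_trace_sketch() is hypothetical and only demonstrates the
 * calling convention generated from the TP_PROTO() declarations above.
 */
#include <linux/falloc.h>
#include <linux/fs.h>

#define CREATE_TRACE_POINTS
#include <trace/events/hugetlbfs.h>

static void hugetlbfs_trace_sketch(struct inode *inode, struct inode *dir,
				   struct dentry *dentry, struct iattr *attr)
{
	/* TP_PROTO(struct inode *inode, struct inode *dir, int mode) */
	trace_hugetlbfs_alloc_inode(inode, dir, inode->i_mode);

	/* Events derived from DECLARE_EVENT_CLASS(hugetlbfs__inode, ...) */
	trace_hugetlbfs_evict_inode(inode);
	trace_hugetlbfs_free_inode(inode);

	/* TP_PROTO(struct inode *inode, struct dentry *dentry, struct iattr *attr) */
	trace_hugetlbfs_setattr(inode, dentry, attr);

	/* TP_PROTO(struct inode *inode, int mode, loff_t offset, loff_t len, int ret) */
	trace_hugetlbfs_fallocate(inode,
				  FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				  0, PAGE_SIZE, 0);
}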
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 1994, Karl Keyte: Added support for disk statistics * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de> * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> * - July2000 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001 */ /* * This handles all read/write requests to block devices */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/blk-pm.h> #include <linux/blk-integrity.h> #include <linux/highmem.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/kernel_stat.h> #include <linux/string.h> #include <linux/init.h> #include <linux/completion.h> #include <linux/slab.h> #include <linux/swap.h> #include <linux/writeback.h> #include <linux/task_io_accounting_ops.h> #include <linux/fault-inject.h> #include <linux/list_sort.h> #include <linux/delay.h> #include <linux/ratelimit.h> #include <linux/pm_runtime.h> #include <linux/t10-pi.h> #include <linux/debugfs.h> #include <linux/bpf.h> #include <linux/part_stat.h> #include <linux/sched/sysctl.h> #include <linux/blk-crypto.h> #define CREATE_TRACE_POINTS #include <trace/events/block.h> #include "blk.h" #include "blk-mq-sched.h" #include "blk-pm.h" #include "blk-cgroup.h" #include "blk-throttle.h" #include "blk-ioprio.h" struct dentry *blk_debugfs_root; EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); EXPORT_TRACEPOINT_SYMBOL_GPL(block_split); EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert); static DEFINE_IDA(blk_queue_ida); /* * For queue allocation */ static struct kmem_cache *blk_requestq_cachep; /* * Controlling structure to kblockd */ static struct workqueue_struct *kblockd_workqueue; /** * blk_queue_flag_set - atomically set a queue flag * @flag: flag to be set * @q: request queue */ void blk_queue_flag_set(unsigned int flag, struct request_queue *q) { set_bit(flag, &q->queue_flags); } EXPORT_SYMBOL(blk_queue_flag_set); /** * blk_queue_flag_clear - atomically clear a queue flag * @flag: flag to be cleared * @q: request queue */ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q) { clear_bit(flag, &q->queue_flags); } EXPORT_SYMBOL(blk_queue_flag_clear); #define REQ_OP_NAME(name) [REQ_OP_##name] = #name static const char *const blk_op_name[] = { REQ_OP_NAME(READ), REQ_OP_NAME(WRITE), REQ_OP_NAME(FLUSH), REQ_OP_NAME(DISCARD), REQ_OP_NAME(SECURE_ERASE), REQ_OP_NAME(ZONE_RESET), REQ_OP_NAME(ZONE_RESET_ALL), REQ_OP_NAME(ZONE_OPEN), REQ_OP_NAME(ZONE_CLOSE), REQ_OP_NAME(ZONE_FINISH), REQ_OP_NAME(ZONE_APPEND), REQ_OP_NAME(WRITE_ZEROES), REQ_OP_NAME(DRV_IN), REQ_OP_NAME(DRV_OUT), }; #undef REQ_OP_NAME /** * blk_op_str - Return the string "name" for an operation REQ_OP_name. * @op: a request operation. * * Convert a request operation REQ_OP_name into the string "name". Useful for * debugging and tracing BIOs and requests. For an invalid request operation * code, the string "UNKNOWN" is returned. */ inline const char *blk_op_str(enum req_op op) { const char *op_str = "UNKNOWN"; if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op]) op_str = blk_op_name[op]; return op_str; } EXPORT_SYMBOL_GPL(blk_op_str); static const struct { int errno; const char *name; } blk_errors[] = { [BLK_STS_OK] = { 0, "" }, [BLK_STS_NOTSUPP] = { -EOPNOTSUPP, "operation not supported" }, [BLK_STS_TIMEOUT] = { -ETIMEDOUT, "timeout" }, [BLK_STS_NOSPC] = { -ENOSPC, "critical space allocation" }, [BLK_STS_TRANSPORT] = { -ENOLINK, "recoverable transport" }, [BLK_STS_TARGET] = { -EREMOTEIO, "critical target" }, [BLK_STS_RESV_CONFLICT] = { -EBADE, "reservation conflict" }, [BLK_STS_MEDIUM] = { -ENODATA, "critical medium" }, [BLK_STS_PROTECTION] = { -EILSEQ, "protection" }, [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" }, [BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" }, [BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" }, [BLK_STS_OFFLINE] = { -ENODEV, "device offline" }, /* device mapper special case, should not leak out: */ [BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" }, /* zone device specific errors */ [BLK_STS_ZONE_OPEN_RESOURCE] = { -ETOOMANYREFS, "open zones exceeded" }, [BLK_STS_ZONE_ACTIVE_RESOURCE] = { -EOVERFLOW, "active zones exceeded" }, /* Command duration limit device-side timeout */ [BLK_STS_DURATION_LIMIT] = { -ETIME, "duration limit exceeded" }, [BLK_STS_INVAL] = { -EINVAL, "invalid" }, /* everything else not covered above: */ [BLK_STS_IOERR] = { -EIO, "I/O" }, }; blk_status_t errno_to_blk_status(int errno) { int i; for (i = 0; i < ARRAY_SIZE(blk_errors); i++) { if (blk_errors[i].errno == errno) return (__force blk_status_t)i; } return BLK_STS_IOERR; } EXPORT_SYMBOL_GPL(errno_to_blk_status); int blk_status_to_errno(blk_status_t status) { int idx = (__force int)status; if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) return -EIO; return 
blk_errors[idx].errno; } EXPORT_SYMBOL_GPL(blk_status_to_errno); const char *blk_status_to_str(blk_status_t status) { int idx = (__force int)status; if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) return "<null>"; return blk_errors[idx].name; } EXPORT_SYMBOL_GPL(blk_status_to_str); /** * blk_sync_queue - cancel any pending callbacks on a queue * @q: the queue * * Description: * The block layer may perform asynchronous callback activity * on a queue, such as calling the unplug function after a timeout. * A block device may call blk_sync_queue to ensure that any * such activity is cancelled, thus allowing it to release resources * that the callbacks might use. The caller must already have made sure * that its ->submit_bio will not re-add plugging prior to calling * this function. * * This function does not cancel any asynchronous activity arising * out of elevator or throttling code. That would require elevator_exit() * and blkcg_exit_queue() to be called with queue lock initialized. * */ void blk_sync_queue(struct request_queue *q) { timer_delete_sync(&q->timeout); cancel_work_sync(&q->timeout_work); } EXPORT_SYMBOL(blk_sync_queue); /** * blk_set_pm_only - increment pm_only counter * @q: request queue pointer */ void blk_set_pm_only(struct request_queue *q) { atomic_inc(&q->pm_only); } EXPORT_SYMBOL_GPL(blk_set_pm_only); void blk_clear_pm_only(struct request_queue *q) { int pm_only; pm_only = atomic_dec_return(&q->pm_only); WARN_ON_ONCE(pm_only < 0); if (pm_only == 0) wake_up_all(&q->mq_freeze_wq); } EXPORT_SYMBOL_GPL(blk_clear_pm_only); static void blk_free_queue_rcu(struct rcu_head *rcu_head) { struct request_queue *q = container_of(rcu_head, struct request_queue, rcu_head); percpu_ref_exit(&q->q_usage_counter); kmem_cache_free(blk_requestq_cachep, q); } static void blk_free_queue(struct request_queue *q) { blk_free_queue_stats(q->stats); if (queue_is_mq(q)) blk_mq_release(q); ida_free(&blk_queue_ida, q->id); lockdep_unregister_key(&q->io_lock_cls_key); lockdep_unregister_key(&q->q_lock_cls_key); call_rcu(&q->rcu_head, blk_free_queue_rcu); } /** * blk_put_queue - decrement the request_queue refcount * @q: the request_queue structure to decrement the refcount for * * Decrements the refcount of the request_queue and free it when the refcount * reaches 0. */ void blk_put_queue(struct request_queue *q) { if (refcount_dec_and_test(&q->refs)) blk_free_queue(q); } EXPORT_SYMBOL(blk_put_queue); bool blk_queue_start_drain(struct request_queue *q) { /* * When queue DYING flag is set, we need to block new req * entering queue, so we call blk_freeze_queue_start() to * prevent I/O from crossing blk_queue_enter(). */ bool freeze = __blk_freeze_queue_start(q, current); if (queue_is_mq(q)) blk_mq_wake_waiters(q); /* Make blk_queue_enter() reexamine the DYING flag. */ wake_up_all(&q->mq_freeze_wq); return freeze; } /** * blk_queue_enter() - try to increase q->q_usage_counter * @q: request queue pointer * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PM */ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) { const bool pm = flags & BLK_MQ_REQ_PM; while (!blk_try_enter_queue(q, pm)) { if (flags & BLK_MQ_REQ_NOWAIT) return -EAGAIN; /* * read pair of barrier in blk_freeze_queue_start(), we need to * order reading __PERCPU_REF_DEAD flag of .q_usage_counter and * reading .mq_freeze_depth or queue dying flag, otherwise the * following wait may never return if the two reads are * reordered. 
*/ smp_rmb(); wait_event(q->mq_freeze_wq, (!q->mq_freeze_depth && blk_pm_resume_queue(pm, q)) || blk_queue_dying(q)); if (blk_queue_dying(q)) return -ENODEV; } rwsem_acquire_read(&q->q_lockdep_map, 0, 0, _RET_IP_); rwsem_release(&q->q_lockdep_map, _RET_IP_); return 0; } int __bio_queue_enter(struct request_queue *q, struct bio *bio) { while (!blk_try_enter_queue(q, false)) { struct gendisk *disk = bio->bi_bdev->bd_disk; if (bio->bi_opf & REQ_NOWAIT) { if (test_bit(GD_DEAD, &disk->state)) goto dead; bio_wouldblock_error(bio); return -EAGAIN; } /* * read pair of barrier in blk_freeze_queue_start(), we need to * order reading __PERCPU_REF_DEAD flag of .q_usage_counter and * reading .mq_freeze_depth or queue dying flag, otherwise the * following wait may never return if the two reads are * reordered. */ smp_rmb(); wait_event(q->mq_freeze_wq, (!q->mq_freeze_depth && blk_pm_resume_queue(false, q)) || test_bit(GD_DEAD, &disk->state)); if (test_bit(GD_DEAD, &disk->state)) goto dead; } rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_); rwsem_release(&q->io_lockdep_map, _RET_IP_); return 0; dead: bio_io_error(bio); return -ENODEV; } void blk_queue_exit(struct request_queue *q) { percpu_ref_put(&q->q_usage_counter); } static void blk_queue_usage_counter_release(struct percpu_ref *ref) { struct request_queue *q = container_of(ref, struct request_queue, q_usage_counter); wake_up_all(&q->mq_freeze_wq); } static void blk_rq_timed_out_timer(struct timer_list *t) { struct request_queue *q = timer_container_of(q, t, timeout); kblockd_schedule_work(&q->timeout_work); } static void blk_timeout_work(struct work_struct *work) { } struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) { struct request_queue *q; int error; q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO, node_id); if (!q) return ERR_PTR(-ENOMEM); q->last_merge = NULL; q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL); if (q->id < 0) { error = q->id; goto fail_q; } q->stats = blk_alloc_queue_stats(); if (!q->stats) { error = -ENOMEM; goto fail_id; } error = blk_set_default_limits(lim); if (error) goto fail_stats; q->limits = *lim; q->node = node_id; atomic_set(&q->nr_active_requests_shared_tags, 0); timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); INIT_WORK(&q->timeout_work, blk_timeout_work); INIT_LIST_HEAD(&q->icq_list); refcount_set(&q->refs, 1); mutex_init(&q->debugfs_mutex); mutex_init(&q->elevator_lock); mutex_init(&q->sysfs_lock); mutex_init(&q->limits_lock); mutex_init(&q->rq_qos_mutex); spin_lock_init(&q->queue_lock); init_waitqueue_head(&q->mq_freeze_wq); mutex_init(&q->mq_freeze_lock); blkg_init_queue(q); /* * Init percpu_ref in atomic mode so that it's faster to shutdown. * See blk_register_queue() for details. */ error = percpu_ref_init(&q->q_usage_counter, blk_queue_usage_counter_release, PERCPU_REF_INIT_ATOMIC, GFP_KERNEL); if (error) goto fail_stats; lockdep_register_key(&q->io_lock_cls_key); lockdep_register_key(&q->q_lock_cls_key); lockdep_init_map(&q->io_lockdep_map, "&q->q_usage_counter(io)", &q->io_lock_cls_key, 0); lockdep_init_map(&q->q_lockdep_map, "&q->q_usage_counter(queue)", &q->q_lock_cls_key, 0); /* Teach lockdep about lock ordering (reclaim WRT queue freeze lock). 
*/ fs_reclaim_acquire(GFP_KERNEL); rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_); rwsem_release(&q->io_lockdep_map, _RET_IP_); fs_reclaim_release(GFP_KERNEL); q->nr_requests = BLKDEV_DEFAULT_RQ; q->async_depth = BLKDEV_DEFAULT_RQ; return q; fail_stats: blk_free_queue_stats(q->stats); fail_id: ida_free(&blk_queue_ida, q->id); fail_q: kmem_cache_free(blk_requestq_cachep, q); return ERR_PTR(error); } /** * blk_get_queue - increment the request_queue refcount * @q: the request_queue structure to increment the refcount for * * Increment the refcount of the request_queue kobject. * * Context: Any context. */ bool blk_get_queue(struct request_queue *q) { if (unlikely(blk_queue_dying(q))) return false; refcount_inc(&q->refs); return true; } EXPORT_SYMBOL(blk_get_queue); #ifdef CONFIG_FAIL_MAKE_REQUEST static DECLARE_FAULT_ATTR(fail_make_request); static int __init setup_fail_make_request(char *str) { return setup_fault_attr(&fail_make_request, str); } __setup("fail_make_request=", setup_fail_make_request); bool should_fail_request(struct block_device *part, unsigned int bytes) { return bdev_test_flag(part, BD_MAKE_IT_FAIL) && should_fail(&fail_make_request, bytes); } static int __init fail_make_request_debugfs(void) { struct dentry *dir = fault_create_debugfs_attr("fail_make_request", NULL, &fail_make_request); return PTR_ERR_OR_ZERO(dir); } late_initcall(fail_make_request_debugfs); #endif /* CONFIG_FAIL_MAKE_REQUEST */ static inline void bio_check_ro(struct bio *bio) { if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) { if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) return; if (bdev_test_flag(bio->bi_bdev, BD_RO_WARNED)) return; bdev_set_flag(bio->bi_bdev, BD_RO_WARNED); /* * Use ioctl to set underlying disk of raid/dm to read-only * will trigger this. */ pr_warn("Trying to write to read-only block-device %pg\n", bio->bi_bdev); } } int should_fail_bio(struct bio *bio) { if (should_fail_request(bdev_whole(bio->bi_bdev), bio->bi_iter.bi_size)) return -EIO; return 0; } ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO); /* * Check whether this bio extends beyond the end of the device or partition. * This may well happen - the kernel calls bread() without checking the size of * the device, e.g., when mounting a file system. */ static inline int bio_check_eod(struct bio *bio) { sector_t maxsector = bdev_nr_sectors(bio->bi_bdev); unsigned int nr_sectors = bio_sectors(bio); if (nr_sectors && (nr_sectors > maxsector || bio->bi_iter.bi_sector > maxsector - nr_sectors)) { if (!maxsector) return -EIO; pr_info_ratelimited("%s: attempt to access beyond end of device\n" "%pg: rw=%d, sector=%llu, nr_sectors = %u limit=%llu\n", current->comm, bio->bi_bdev, bio->bi_opf, bio->bi_iter.bi_sector, nr_sectors, maxsector); return -EIO; } return 0; } /* * Remap block n of partition p to block n+start(p) of the disk. */ static int blk_partition_remap(struct bio *bio) { struct block_device *p = bio->bi_bdev; if (unlikely(should_fail_request(p, bio->bi_iter.bi_size))) return -EIO; if (bio_sectors(bio)) { bio->bi_iter.bi_sector += p->bd_start_sect; trace_block_bio_remap(bio, p->bd_dev, bio->bi_iter.bi_sector - p->bd_start_sect); } bio_set_flag(bio, BIO_REMAPPED); return 0; } /* * Check write append to a zoned block device. 
*/ static inline blk_status_t blk_check_zone_append(struct request_queue *q, struct bio *bio) { int nr_sectors = bio_sectors(bio); /* Only applicable to zoned block devices */ if (!bdev_is_zoned(bio->bi_bdev)) return BLK_STS_NOTSUPP; /* The bio sector must point to the start of a sequential zone */ if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector)) return BLK_STS_IOERR; /* * Not allowed to cross zone boundaries. Otherwise, the BIO will be * split and could result in non-contiguous sectors being written in * different zones. */ if (nr_sectors > q->limits.chunk_sectors) return BLK_STS_IOERR; /* Make sure the BIO is small enough and will not get split */ if (nr_sectors > q->limits.max_zone_append_sectors) return BLK_STS_IOERR; bio->bi_opf |= REQ_NOMERGE; return BLK_STS_OK; } static void __submit_bio(struct bio *bio) { /* If plug is not used, add new plug here to cache nsecs time. */ struct blk_plug plug; blk_start_plug(&plug); if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) { blk_mq_submit_bio(bio); } else if (likely(bio_queue_enter(bio) == 0)) { struct gendisk *disk = bio->bi_bdev->bd_disk; if ((bio->bi_opf & REQ_POLLED) && !(disk->queue->limits.features & BLK_FEAT_POLL)) { bio->bi_status = BLK_STS_NOTSUPP; bio_endio(bio); } else { disk->fops->submit_bio(bio); } blk_queue_exit(disk->queue); } blk_finish_plug(&plug); } /* * The loop in this function may be a bit non-obvious, and so deserves some * explanation: * * - Before entering the loop, bio->bi_next is NULL (as all callers ensure * that), so we have a list with a single bio. * - We pretend that we have just taken it off a longer list, so we assign * bio_list to a pointer to the bio_list_on_stack, thus initialising the * bio_list of new bios to be added. ->submit_bio() may indeed add some more * bios through a recursive call to submit_bio_noacct. If it did, we find a * non-NULL value in bio_list and re-enter the loop from the top. * - In this case we really did just take the bio off the top of the list (no * pretending) and so remove it from bio_list, and call into ->submit_bio() * again. * * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio. * bio_list_on_stack[1] contains bios that were submitted before the current * ->submit_bio(), but that haven't been processed yet. */ static void __submit_bio_noacct(struct bio *bio) { struct bio_list bio_list_on_stack[2]; BUG_ON(bio->bi_next); bio_list_init(&bio_list_on_stack[0]); current->bio_list = bio_list_on_stack; do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct bio_list lower, same; /* * Create a fresh bio_list for all subordinate requests. */ bio_list_on_stack[1] = bio_list_on_stack[0]; bio_list_init(&bio_list_on_stack[0]); __submit_bio(bio); /* * Sort new bios into those for a lower level and those for the * same level. */ bio_list_init(&lower); bio_list_init(&same); while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) if (q == bdev_get_queue(bio->bi_bdev)) bio_list_add(&same, bio); else bio_list_add(&lower, bio); /* * Now assemble so we handle the lowest level first. 
*/ bio_list_merge(&bio_list_on_stack[0], &lower); bio_list_merge(&bio_list_on_stack[0], &same); bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); } while ((bio = bio_list_pop(&bio_list_on_stack[0]))); current->bio_list = NULL; } static void __submit_bio_noacct_mq(struct bio *bio) { struct bio_list bio_list[2] = { }; current->bio_list = bio_list; do { __submit_bio(bio); } while ((bio = bio_list_pop(&bio_list[0]))); current->bio_list = NULL; } void submit_bio_noacct_nocheck(struct bio *bio, bool split) { blk_cgroup_bio_start(bio); if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { trace_block_bio_queue(bio); /* * Now that enqueuing has been traced, we need to trace * completion as well. */ bio_set_flag(bio, BIO_TRACE_COMPLETION); } /* * We only want one ->submit_bio to be active at a time, else stack * usage with stacked devices could be a problem. Use current->bio_list * to collect a list of requests submitted by a ->submit_bio method * while it is active, and then process them after it returned. */ if (current->bio_list) { if (split) bio_list_add_head(¤t->bio_list[0], bio); else bio_list_add(¤t->bio_list[0], bio); } else if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) { __submit_bio_noacct_mq(bio); } else { __submit_bio_noacct(bio); } } static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q, struct bio *bio) { if (bio->bi_iter.bi_size > queue_atomic_write_unit_max_bytes(q)) return BLK_STS_INVAL; if (bio->bi_iter.bi_size % queue_atomic_write_unit_min_bytes(q)) return BLK_STS_INVAL; return BLK_STS_OK; } /** * submit_bio_noacct - re-submit a bio to the block device layer for I/O * @bio: The bio describing the location in memory and on the device. * * This is a version of submit_bio() that shall only be used for I/O that is * resubmitted to lower level drivers by stacking block drivers. All file * systems and other upper level users of the block layer should use * submit_bio() instead. */ void submit_bio_noacct(struct bio *bio) { struct block_device *bdev = bio->bi_bdev; struct request_queue *q = bdev_get_queue(bdev); blk_status_t status = BLK_STS_IOERR; might_sleep(); /* * For a REQ_NOWAIT based request, return -EOPNOTSUPP * if queue does not support NOWAIT. */ if ((bio->bi_opf & REQ_NOWAIT) && !bdev_nowait(bdev)) goto not_supported; if (bio_has_crypt_ctx(bio)) { if (WARN_ON_ONCE(!bio_has_data(bio))) goto end_io; if (!blk_crypto_supported(bio)) goto not_supported; } if (should_fail_bio(bio)) goto end_io; bio_check_ro(bio); if (!bio_flagged(bio, BIO_REMAPPED)) { if (unlikely(bio_check_eod(bio))) goto end_io; if (bdev_is_partition(bdev) && unlikely(blk_partition_remap(bio))) goto end_io; } /* * Filter flush bio's early so that bio based drivers without flush * support don't have to worry about them. */ if (op_is_flush(bio->bi_opf)) { if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE && bio_op(bio) != REQ_OP_ZONE_APPEND)) goto end_io; if (!bdev_write_cache(bdev)) { bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); if (!bio_sectors(bio)) { status = BLK_STS_OK; goto end_io; } } } switch (bio_op(bio)) { case REQ_OP_READ: break; case REQ_OP_WRITE: if (bio->bi_opf & REQ_ATOMIC) { status = blk_validate_atomic_write_op_size(q, bio); if (status != BLK_STS_OK) goto end_io; } break; case REQ_OP_FLUSH: /* * REQ_OP_FLUSH can't be submitted through bios, it is only * synthetized in struct request by the flush state machine. 
*/ goto not_supported; case REQ_OP_DISCARD: if (!bdev_max_discard_sectors(bdev)) goto not_supported; break; case REQ_OP_SECURE_ERASE: if (!bdev_max_secure_erase_sectors(bdev)) goto not_supported; break; case REQ_OP_ZONE_APPEND: status = blk_check_zone_append(q, bio); if (status != BLK_STS_OK) goto end_io; break; case REQ_OP_WRITE_ZEROES: if (!q->limits.max_write_zeroes_sectors) goto not_supported; break; case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: case REQ_OP_ZONE_FINISH: case REQ_OP_ZONE_RESET_ALL: if (!bdev_is_zoned(bio->bi_bdev)) goto not_supported; break; case REQ_OP_DRV_IN: case REQ_OP_DRV_OUT: /* * Driver private operations are only used with passthrough * requests. */ fallthrough; default: goto not_supported; } if (blk_throtl_bio(bio)) return; submit_bio_noacct_nocheck(bio, false); return; not_supported: status = BLK_STS_NOTSUPP; end_io: bio->bi_status = status; bio_endio(bio); } EXPORT_SYMBOL(submit_bio_noacct); static void bio_set_ioprio(struct bio *bio) { /* Nobody set ioprio so far? Initialize it based on task's nice value */ if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE) bio->bi_ioprio = get_current_ioprio(); blkcg_set_ioprio(bio); } /** * submit_bio - submit a bio to the block device layer for I/O * @bio: The &struct bio which describes the I/O * * submit_bio() is used to submit I/O requests to block devices. It is passed a * fully set up &struct bio that describes the I/O that needs to be done. The * bio will be sent to the device described by the bi_bdev field. * * The success/failure status of the request, along with notification of * completion, is delivered asynchronously through the ->bi_end_io() callback * in @bio. The bio must NOT be touched by the caller until ->bi_end_io() has * been called. */ void submit_bio(struct bio *bio) { if (bio_op(bio) == REQ_OP_READ) { task_io_account_read(bio->bi_iter.bi_size); count_vm_events(PGPGIN, bio_sectors(bio)); } else if (bio_op(bio) == REQ_OP_WRITE) { count_vm_events(PGPGOUT, bio_sectors(bio)); } bio_set_ioprio(bio); submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio); /** * bio_poll - poll for BIO completions * @bio: bio to poll for * @iob: batches of IO * @flags: BLK_POLL_* flags that control the behavior * * Poll for completions on queue associated with the bio. Returns number of * completed entries found. * * Note: the caller must either be the context that submitted @bio, or * be in a RCU critical section to prevent freeing of @bio. */ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) { blk_qc_t cookie = READ_ONCE(bio->bi_cookie); struct block_device *bdev; struct request_queue *q; int ret = 0; bdev = READ_ONCE(bio->bi_bdev); if (!bdev) return 0; q = bdev_get_queue(bdev); if (cookie == BLK_QC_T_NONE) return 0; blk_flush_plug(current->plug, false); /* * We need to be able to enter a frozen queue, similar to how * timeouts also need to do that. If that is blocked, then we can * have pending IO when a queue freeze is started, and then the * wait for the freeze to finish will wait for polled requests to * timeout as the poller is preventer from entering the queue and * completing them. As long as we prevent new IO from being queued, * that should be all that matters. 
*/ if (!percpu_ref_tryget(&q->q_usage_counter)) return 0; if (queue_is_mq(q)) { ret = blk_mq_poll(q, cookie, iob, flags); } else { struct gendisk *disk = q->disk; if ((q->limits.features & BLK_FEAT_POLL) && disk && disk->fops->poll_bio) ret = disk->fops->poll_bio(bio, iob, flags); } blk_queue_exit(q); return ret; } EXPORT_SYMBOL_GPL(bio_poll); /* * Helper to implement file_operations.iopoll. Requires the bio to be stored * in iocb->private, and cleared before freeing the bio. */ int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob, unsigned int flags) { struct bio *bio; int ret = 0; /* * Note: the bio cache only uses SLAB_TYPESAFE_BY_RCU, so bio can * point to a freshly allocated bio at this point. If that happens * we have a few cases to consider: * * 1) the bio is being initialized and bi_bdev is NULL. We can just * simply nothing in this case * 2) the bio points to a not poll enabled device. bio_poll will catch * this and return 0 * 3) the bio points to a poll capable device, including but not * limited to the one that the original bio pointed to. In this * case we will call into the actual poll method and poll for I/O, * even if we don't need to, but it won't cause harm either. * * For cases 2) and 3) above the RCU grace period ensures that bi_bdev * is still allocated. Because partitions hold a reference to the whole * device bdev and thus disk, the disk is also still valid. Grabbing * a reference to the queue in bio_poll() ensures the hctxs and requests * are still valid as well. */ rcu_read_lock(); bio = READ_ONCE(kiocb->private); if (bio) ret = bio_poll(bio, iob, flags); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(iocb_bio_iopoll); void update_io_ticks(struct block_device *part, unsigned long now, bool end) { unsigned long stamp; again: stamp = READ_ONCE(part->bd_stamp); if (unlikely(time_after(now, stamp)) && likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) && (end || bdev_count_inflight(part))) __part_stat_add(part, io_ticks, now - stamp); if (bdev_is_partition(part)) { part = bdev_whole(part); goto again; } } unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op, unsigned long start_time) { part_stat_lock(); update_io_ticks(bdev, start_time, false); part_stat_local_inc(bdev, in_flight[op_is_write(op)]); part_stat_unlock(); return start_time; } EXPORT_SYMBOL(bdev_start_io_acct); /** * bio_start_io_acct - start I/O accounting for bio based drivers * @bio: bio to start account for * * Returns the start time that should be passed back to bio_end_io_acct(). 
*/ unsigned long bio_start_io_acct(struct bio *bio) { return bdev_start_io_acct(bio->bi_bdev, bio_op(bio), jiffies); } EXPORT_SYMBOL_GPL(bio_start_io_acct); void bdev_end_io_acct(struct block_device *bdev, enum req_op op, unsigned int sectors, unsigned long start_time) { const int sgrp = op_stat_group(op); unsigned long now = READ_ONCE(jiffies); unsigned long duration = now - start_time; part_stat_lock(); update_io_ticks(bdev, now, true); part_stat_inc(bdev, ios[sgrp]); part_stat_add(bdev, sectors[sgrp], sectors); part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration)); part_stat_local_dec(bdev, in_flight[op_is_write(op)]); part_stat_unlock(); } EXPORT_SYMBOL(bdev_end_io_acct); void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, struct block_device *orig_bdev) { bdev_end_io_acct(orig_bdev, bio_op(bio), bio_sectors(bio), start_time); } EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped); /** * blk_lld_busy - Check if underlying low-level drivers of a device are busy * @q : the queue of the device being checked * * Description: * Check if underlying low-level drivers of a device are busy. * If the drivers want to export their busy state, they must set own * exporting function using blk_queue_lld_busy() first. * * Basically, this function is used only by request stacking drivers * to stop dispatching requests to underlying devices when underlying * devices are busy. This behavior helps more I/O merging on the queue * of the request stacking driver and prevents I/O throughput regression * on burst I/O load. * * Return: * 0 - Not busy (The request stacking driver should dispatch request) * 1 - Busy (The request stacking driver should stop dispatching request) */ int blk_lld_busy(struct request_queue *q) { if (queue_is_mq(q) && q->mq_ops->busy) return q->mq_ops->busy(q); return 0; } EXPORT_SYMBOL_GPL(blk_lld_busy); int kblockd_schedule_work(struct work_struct *work) { return queue_work(kblockd_workqueue, work); } EXPORT_SYMBOL(kblockd_schedule_work); int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay); } EXPORT_SYMBOL(kblockd_mod_delayed_work_on); void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios) { struct task_struct *tsk = current; /* * If this is a nested plug, don't actually assign it. */ if (tsk->plug) return; plug->cur_ktime = 0; rq_list_init(&plug->mq_list); rq_list_init(&plug->cached_rqs); plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT); plug->rq_count = 0; plug->multiple_queues = false; plug->has_elevator = false; INIT_LIST_HEAD(&plug->cb_list); /* * Store ordering should not be needed here, since a potential * preempt will imply a full memory barrier */ tsk->plug = plug; } /** * blk_start_plug - initialize blk_plug and track it inside the task_struct * @plug: The &struct blk_plug that needs to be initialized * * Description: * blk_start_plug() indicates to the block layer an intent by the caller * to submit multiple I/O requests in a batch. The block layer may use * this hint to defer submitting I/Os from the caller until blk_finish_plug() * is called. However, the block layer may choose to submit requests * before a call to blk_finish_plug() if the number of queued I/Os * exceeds %BLK_MAX_REQUEST_COUNT, or if the size of the I/O is larger than * %BLK_PLUG_FLUSH_SIZE. The queued I/Os may also be submitted early if * the task schedules (see below). 
* * Tracking blk_plug inside the task_struct will help with auto-flushing the * pending I/O should the task end up blocking between blk_start_plug() and * blk_finish_plug(). This is important from a performance perspective, but * also ensures that we don't deadlock. For instance, if the task is blocking * for a memory allocation, memory reclaim could end up wanting to free a * page belonging to that request that is currently residing in our private * plug. By flushing the pending I/O when the process goes to sleep, we avoid * this kind of deadlock. */ void blk_start_plug(struct blk_plug *plug) { blk_start_plug_nr_ios(plug, 1); } EXPORT_SYMBOL(blk_start_plug); static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule) { LIST_HEAD(callbacks); while (!list_empty(&plug->cb_list)) { list_splice_init(&plug->cb_list, &callbacks); while (!list_empty(&callbacks)) { struct blk_plug_cb *cb = list_first_entry(&callbacks, struct blk_plug_cb, list); list_del(&cb->list); cb->callback(cb, from_schedule); } } } struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, int size) { struct blk_plug *plug = current->plug; struct blk_plug_cb *cb; if (!plug) return NULL; list_for_each_entry(cb, &plug->cb_list, list) if (cb->callback == unplug && cb->data == data) return cb; /* Not currently on the callback list */ BUG_ON(size < sizeof(*cb)); cb = kzalloc(size, GFP_ATOMIC); if (cb) { cb->data = data; cb->callback = unplug; list_add(&cb->list, &plug->cb_list); } return cb; } EXPORT_SYMBOL(blk_check_plugged); void __blk_flush_plug(struct blk_plug *plug, bool from_schedule) { if (!list_empty(&plug->cb_list)) flush_plug_callbacks(plug, from_schedule); blk_mq_flush_plug_list(plug, from_schedule); /* * Unconditionally flush out cached requests, even if the unplug * event came from schedule. Since we know hold references to the * queue for cached requests, we don't want a blocked task holding * up a queue freeze/quiesce event. */ if (unlikely(!rq_list_empty(&plug->cached_rqs))) blk_mq_free_plug_rqs(plug); plug->cur_ktime = 0; current->flags &= ~PF_BLOCK_TS; } /** * blk_finish_plug - mark the end of a batch of submitted I/O * @plug: The &struct blk_plug passed to blk_start_plug() * * Description: * Indicate that a batch of I/O submissions is complete. This function * must be paired with an initial call to blk_start_plug(). The intent * is to allow the block layer to optimize I/O submission. See the * documentation for blk_start_plug() for more information. */ void blk_finish_plug(struct blk_plug *plug) { if (plug == current->plug) { __blk_flush_plug(plug, false); current->plug = NULL; } } EXPORT_SYMBOL(blk_finish_plug); void blk_io_schedule(void) { /* Prevent hang_check timer from firing at us during very long I/O */ unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2; if (timeout) io_schedule_timeout(timeout); else io_schedule(); } EXPORT_SYMBOL_GPL(blk_io_schedule); int __init blk_dev_init(void) { BUILD_BUG_ON((__force u32)REQ_OP_LAST >= (1 << REQ_OP_BITS)); BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * sizeof_field(struct request, cmd_flags)); BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * sizeof_field(struct bio, bi_opf)); /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); if (!kblockd_workqueue) panic("Failed to create kblockd\n"); blk_requestq_cachep = KMEM_CACHE(request_queue, SLAB_PANIC); blk_debugfs_root = debugfs_create_dir("block", NULL); return 0; } |
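Illustrative sketch, not part of blk-core.c above: how a submitter can batch already-built bios under a plug so that blk_finish_plug() pushes them to the driver in one go; example_submit_batch is a hypothetical name and each bio is assumed to be fully initialized (bdev, sector, data, operation) by the caller.

/*
 * Hypothetical helper: submit @nr fully initialized bios as one plugged
 * batch. While the plug is active, submit_bio() may hold the resulting
 * requests on current->plug; blk_finish_plug() flushes the whole batch.
 */
static void example_submit_batch(struct bio **bios, unsigned int nr)
{
	struct blk_plug plug;
	unsigned int i;

	blk_start_plug(&plug);
	for (i = 0; i < nr; i++)
		submit_bio(bios[i]);
	blk_finish_plug(&plug);
}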
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FIB_LOOKUP_H
#define _FIB_LOOKUP_H

#include <linux/types.h>
#include <linux/list.h>
#include <net/inet_dscp.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>

struct fib_alias {
	struct hlist_node	fa_list;
	struct fib_info		*fa_info;
	dscp_t			fa_dscp;
	u8			fa_type;
	u8			fa_state;
	u8			fa_slen;
	u32			tb_id;
	s16			fa_default;
	u8			offload;
	u8			trap;
	u8			offload_failed;
	struct rcu_head		rcu;
};

#define FA_S_ACCESSED	0x01

/* Don't write on fa_state unless needed, to keep it shared on all cpus */
static inline void fib_alias_accessed(struct fib_alias *fa)
{
	u8 fa_state = READ_ONCE(fa->fa_state);

	if (!(fa_state & FA_S_ACCESSED))
		WRITE_ONCE(fa->fa_state, fa_state | FA_S_ACCESSED);
}

/* Exported by fib_semantics.c */
void fib_release_info(struct fib_info *);
struct fib_info *fib_create_info(struct fib_config *cfg,
				 struct netlink_ext_ack *extack);
int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi,
		 struct netlink_ext_ack *extack);
bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi);
int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
		  const struct fib_rt_info *fri, unsigned int flags);
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len,
	       u32 tb_id, const struct nl_info *info, unsigned int nlm_flags);
size_t fib_nlmsg_size(struct fib_info *fi);

static inline void fib_result_assign(struct fib_result *res,
				     struct fib_info *fi)
{
	/* we used to play games with refcounts, but we now use RCU */
	res->fi = fi;
	res->nhc = fib_info_nhc(fi, 0);
}

struct fib_prop {
	int	error;
	u8	scope;
};

extern const struct fib_prop fib_props[RTN_MAX + 1];

#endif /* _FIB_LOOKUP_H */
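Illustrative sketch, not part of the header above: how a lookup path might use the two inline helpers once a matching alias has been found; example_fill_result is a hypothetical name and the surrounding trie walk is omitted.

/*
 * Hypothetical helper: mark the alias as accessed (sets FA_S_ACCESSED at
 * most once) and point the result at its fib_info via fib_result_assign(),
 * relying on RCU rather than refcounting as the comment above notes.
 */
static void example_fill_result(struct fib_result *res, struct fib_alias *fa)
{
	fib_alias_accessed(fa);
	fib_result_assign(res, fa->fa_info);
}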
// SPDX-License-Identifier: GPL-2.0-only /* * Resizable, Scalable, Concurrent Hash Table * * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au> * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch> * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net> * * Code partially derived from nft_hash * Rewritten with rehash code from br_multicast plus single list * pointer as suggested by Josh Triplett */ #include <linux/atomic.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/log2.h> #include <linux/sched.h> #include <linux/rculist.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/jhash.h> #include <linux/random.h> #include <linux/rhashtable.h> #include <linux/err.h> #include <linux/export.h> #define HASH_DEFAULT_SIZE 64UL #define HASH_MIN_SIZE 4U union nested_table { union nested_table __rcu *table; struct rhash_lock_head __rcu *bucket; }; static u32 head_hashfn(struct rhashtable *ht, const struct bucket_table *tbl, const struct rhash_head *he) { return rht_head_hashfn(ht, tbl, he, ht->p); } #ifdef CONFIG_PROVE_LOCKING #define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT)) int lockdep_rht_mutex_is_held(struct rhashtable *ht) { return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1; } EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held); int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) { if (!debug_locks) return 1; if (unlikely(tbl->nest)) return 1; return bit_spin_is_locked(0, (unsigned long *)&tbl->buckets[hash]); } EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held); #else #define ASSERT_RHT_MUTEX(HT) #endif static inline union nested_table *nested_table_top( const struct bucket_table *tbl) { /* The top-level bucket entry does not need RCU protection * because it's set at the same time as tbl->nest.
*/ return (void *)rcu_dereference_protected(tbl->buckets[0], 1); } static void nested_table_free(union nested_table *ntbl, unsigned int size) { const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); const unsigned int len = 1 << shift; unsigned int i; ntbl = rcu_dereference_protected(ntbl->table, 1); if (!ntbl) return; if (size > len) { size >>= shift; for (i = 0; i < len; i++) nested_table_free(ntbl + i, size); } kfree(ntbl); } static void nested_bucket_table_free(const struct bucket_table *tbl) { unsigned int size = tbl->size >> tbl->nest; unsigned int len = 1 << tbl->nest; union nested_table *ntbl; unsigned int i; ntbl = nested_table_top(tbl); for (i = 0; i < len; i++) nested_table_free(ntbl + i, size); kfree(ntbl); } static void bucket_table_free(const struct bucket_table *tbl) { if (tbl->nest) nested_bucket_table_free(tbl); kvfree(tbl); } static void bucket_table_free_rcu(struct rcu_head *head) { bucket_table_free(container_of(head, struct bucket_table, rcu)); } static union nested_table *nested_table_alloc(struct rhashtable *ht, union nested_table __rcu **prev, bool leaf) { union nested_table *ntbl; int i; ntbl = rcu_dereference(*prev); if (ntbl) return ntbl; ntbl = alloc_hooks_tag(ht->alloc_tag, kmalloc_noprof(PAGE_SIZE, GFP_ATOMIC|__GFP_ZERO)); if (ntbl && leaf) { for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++) INIT_RHT_NULLS_HEAD(ntbl[i].bucket); } if (cmpxchg((union nested_table **)prev, NULL, ntbl) == NULL) return ntbl; /* Raced with another thread. */ kfree(ntbl); return rcu_dereference(*prev); } static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht, size_t nbuckets, gfp_t gfp) { const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); struct bucket_table *tbl; size_t size; if (nbuckets < (1 << (shift + 1))) return NULL; size = sizeof(*tbl) + sizeof(tbl->buckets[0]); tbl = alloc_hooks_tag(ht->alloc_tag, kmalloc_noprof(size, gfp|__GFP_ZERO)); if (!tbl) return NULL; if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets, false)) { kfree(tbl); return NULL; } tbl->nest = (ilog2(nbuckets) - 1) % shift + 1; return tbl; } static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, size_t nbuckets, gfp_t gfp) { struct bucket_table *tbl = NULL; size_t size; int i; static struct lock_class_key __key; tbl = alloc_hooks_tag(ht->alloc_tag, kvmalloc_node_align_noprof(struct_size(tbl, buckets, nbuckets), 1, gfp|__GFP_ZERO, NUMA_NO_NODE)); size = nbuckets; if (tbl == NULL && !gfpflags_allow_blocking(gfp)) { tbl = nested_bucket_table_alloc(ht, nbuckets, gfp); nbuckets = 0; } if (tbl == NULL) return NULL; lockdep_init_map(&tbl->dep_map, "rhashtable_bucket", &__key, 0); tbl->size = size; rcu_head_init(&tbl->rcu); INIT_LIST_HEAD(&tbl->walkers); tbl->hash_rnd = get_random_u32(); for (i = 0; i < nbuckets; i++) INIT_RHT_NULLS_HEAD(tbl->buckets[i]); return tbl; } static struct bucket_table *rhashtable_last_table(struct rhashtable *ht, struct bucket_table *tbl) { struct bucket_table *new_tbl; do { new_tbl = tbl; tbl = rht_dereference_rcu(tbl->future_tbl, ht); } while (tbl); return new_tbl; } static int rhashtable_rehash_one(struct rhashtable *ht, struct rhash_lock_head __rcu **bkt, unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl); int err = -EAGAIN; struct rhash_head *head, *next, *entry; struct rhash_head __rcu **pprev = NULL; unsigned int new_hash; unsigned long flags; if (new_tbl->nest) goto out; err = -ENOENT; rht_for_each_from(entry, 
rht_ptr(bkt, old_tbl, old_hash), old_tbl, old_hash) { err = 0; next = rht_dereference_bucket(entry->next, old_tbl, old_hash); if (rht_is_a_nulls(next)) break; pprev = &entry->next; } if (err) goto out; new_hash = head_hashfn(ht, new_tbl, entry); flags = rht_lock_nested(new_tbl, &new_tbl->buckets[new_hash], SINGLE_DEPTH_NESTING); head = rht_ptr(new_tbl->buckets + new_hash, new_tbl, new_hash); RCU_INIT_POINTER(entry->next, head); rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry, flags); if (pprev) rcu_assign_pointer(*pprev, next); else /* Need to preserved the bit lock. */ rht_assign_locked(bkt, next); out: return err; } static int rhashtable_rehash_chain(struct rhashtable *ht, unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); struct rhash_lock_head __rcu **bkt = rht_bucket_var(old_tbl, old_hash); unsigned long flags; int err; if (!bkt) return 0; flags = rht_lock(old_tbl, bkt); while (!(err = rhashtable_rehash_one(ht, bkt, old_hash))) ; if (err == -ENOENT) err = 0; rht_unlock(old_tbl, bkt, flags); return err; } static int rhashtable_rehash_attach(struct rhashtable *ht, struct bucket_table *old_tbl, struct bucket_table *new_tbl) { /* Make insertions go into the new, empty table right away. Deletions * and lookups will be attempted in both tables until we synchronize. * As cmpxchg() provides strong barriers, we do not need * rcu_assign_pointer(). */ if (cmpxchg((struct bucket_table **)&old_tbl->future_tbl, NULL, new_tbl) != NULL) return -EEXIST; return 0; } static int rhashtable_rehash_table(struct rhashtable *ht) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); struct bucket_table *new_tbl; struct rhashtable_walker *walker; unsigned int old_hash; int err; new_tbl = rht_dereference(old_tbl->future_tbl, ht); if (!new_tbl) return 0; for (old_hash = 0; old_hash < old_tbl->size; old_hash++) { err = rhashtable_rehash_chain(ht, old_hash); if (err) return err; cond_resched(); } /* Publish the new table pointer. */ rcu_assign_pointer(ht->tbl, new_tbl); spin_lock(&ht->lock); list_for_each_entry(walker, &old_tbl->walkers, list) walker->tbl = NULL; /* Wait for readers. All new readers will see the new * table, and thus no references to the old table will * remain. * We do this inside the locked region so that * rhashtable_walk_stop() can use rcu_head_after_call_rcu() * to check if it should not re-link the table. */ call_rcu(&old_tbl->rcu, bucket_table_free_rcu); spin_unlock(&ht->lock); return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0; } static int rhashtable_rehash_alloc(struct rhashtable *ht, struct bucket_table *old_tbl, unsigned int size) __must_hold(&ht->mutex) { struct bucket_table *new_tbl; int err; ASSERT_RHT_MUTEX(ht); new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); if (new_tbl == NULL) return -ENOMEM; err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); if (err) bucket_table_free(new_tbl); return err; } /** * rhashtable_shrink - Shrink hash table while allowing concurrent lookups * @ht: the hash table to shrink * * This function shrinks the hash table to fit, i.e., the smallest * size would not cause it to expand right away automatically. * * The caller must ensure that no concurrent resizing occurs by holding * ht->mutex. * * The caller must ensure that no concurrent table mutations take place. * It is however valid to have concurrent lookups if they are RCU protected. * * It is valid to have concurrent insertions and deletions protected by per * bucket locks or concurrent RCU protected lookups and traversals. 
*/ static int rhashtable_shrink(struct rhashtable *ht) __must_hold(&ht->mutex) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); unsigned int nelems = atomic_read(&ht->nelems); unsigned int size = 0; if (nelems) size = roundup_pow_of_two(nelems * 3 / 2); if (size < ht->p.min_size) size = ht->p.min_size; if (old_tbl->size <= size) return 0; if (rht_dereference(old_tbl->future_tbl, ht)) return -EEXIST; return rhashtable_rehash_alloc(ht, old_tbl, size); } static void rht_deferred_worker(struct work_struct *work) { struct rhashtable *ht; struct bucket_table *tbl; int err = 0; ht = container_of(work, struct rhashtable, run_work); mutex_lock(&ht->mutex); tbl = rht_dereference(ht->tbl, ht); tbl = rhashtable_last_table(ht, tbl); if (rht_grow_above_75(ht, tbl)) err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2); else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl)) err = rhashtable_shrink(ht); else if (tbl->nest) err = rhashtable_rehash_alloc(ht, tbl, tbl->size); if (!err || err == -EEXIST) { int nerr; nerr = rhashtable_rehash_table(ht); err = err ?: nerr; } mutex_unlock(&ht->mutex); if (err) schedule_work(&ht->run_work); } static int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl) { struct bucket_table *old_tbl; struct bucket_table *new_tbl; unsigned int size; int err; old_tbl = rht_dereference_rcu(ht->tbl, ht); size = tbl->size; err = -EBUSY; if (rht_grow_above_75(ht, tbl)) size *= 2; /* Do not schedule more than one rehash */ else if (old_tbl != tbl) goto fail; err = -ENOMEM; new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN); if (new_tbl == NULL) goto fail; err = rhashtable_rehash_attach(ht, tbl, new_tbl); if (err) { bucket_table_free(new_tbl); if (err == -EEXIST) err = 0; } else schedule_work(&ht->run_work); return err; fail: /* Do not fail the insert if someone else did a rehash. */ if (likely(rcu_access_pointer(tbl->future_tbl))) return 0; /* Schedule async rehash to retry allocation in process context. */ if (err == -ENOMEM) schedule_work(&ht->run_work); return err; } static void *rhashtable_lookup_one(struct rhashtable *ht, struct rhash_lock_head __rcu **bkt, struct bucket_table *tbl, unsigned int hash, const void *key, struct rhash_head *obj) { struct rhashtable_compare_arg arg = { .ht = ht, .key = key, }; struct rhash_head __rcu **pprev = NULL; struct rhash_head *head; int elasticity; elasticity = RHT_ELASTICITY; rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) { struct rhlist_head *list; struct rhlist_head *plist; elasticity--; if (!key || (ht->p.obj_cmpfn ? 
ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) : rhashtable_compare(&arg, rht_obj(ht, head)))) { pprev = &head->next; continue; } if (!ht->rhlist) return rht_obj(ht, head); list = container_of(obj, struct rhlist_head, rhead); plist = container_of(head, struct rhlist_head, rhead); RCU_INIT_POINTER(list->next, plist); head = rht_dereference_bucket(head->next, tbl, hash); RCU_INIT_POINTER(list->rhead.next, head); if (pprev) rcu_assign_pointer(*pprev, obj); else /* Need to preserve the bit lock */ rht_assign_locked(bkt, obj); return NULL; } if (elasticity <= 0) return ERR_PTR(-EAGAIN); return ERR_PTR(-ENOENT); } static struct bucket_table *rhashtable_insert_one( struct rhashtable *ht, struct rhash_lock_head __rcu **bkt, struct bucket_table *tbl, unsigned int hash, struct rhash_head *obj, void *data) { struct bucket_table *new_tbl; struct rhash_head *head; if (!IS_ERR_OR_NULL(data)) return ERR_PTR(-EEXIST); if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT) return ERR_CAST(data); new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (new_tbl) return new_tbl; if (PTR_ERR(data) != -ENOENT) return ERR_CAST(data); if (unlikely(rht_grow_above_max(ht, tbl))) return ERR_PTR(-E2BIG); if (unlikely(rht_grow_above_100(ht, tbl))) return ERR_PTR(-EAGAIN); head = rht_ptr(bkt, tbl, hash); RCU_INIT_POINTER(obj->next, head); if (ht->rhlist) { struct rhlist_head *list; list = container_of(obj, struct rhlist_head, rhead); RCU_INIT_POINTER(list->next, NULL); } /* bkt is always the head of the list, so it holds * the lock, which we need to preserve */ rht_assign_locked(bkt, obj); return NULL; } static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, struct rhash_head *obj) { struct bucket_table *new_tbl; struct bucket_table *tbl; struct rhash_lock_head __rcu **bkt; unsigned long flags; unsigned int hash; void *data; new_tbl = rcu_dereference(ht->tbl); do { tbl = new_tbl; hash = rht_head_hashfn(ht, tbl, obj, ht->p); if (rcu_access_pointer(tbl->future_tbl)) /* Failure is OK */ bkt = rht_bucket_var(tbl, hash); else bkt = rht_bucket_insert(ht, tbl, hash); if (bkt == NULL) { new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); data = ERR_PTR(-EAGAIN); } else { bool inserted; flags = rht_lock(tbl, bkt); data = rhashtable_lookup_one(ht, bkt, tbl, hash, key, obj); new_tbl = rhashtable_insert_one(ht, bkt, tbl, hash, obj, data); inserted = data && !new_tbl; if (inserted) atomic_inc(&ht->nelems); if (PTR_ERR(new_tbl) != -EEXIST) data = ERR_CAST(new_tbl); rht_unlock(tbl, bkt, flags); if (inserted && rht_grow_above_75(ht, tbl)) schedule_work(&ht->run_work); } } while (!IS_ERR_OR_NULL(new_tbl)); if (PTR_ERR(data) == -EAGAIN) data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?: -EAGAIN); return data; } void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, struct rhash_head *obj) { void *data; do { rcu_read_lock(); data = rhashtable_try_insert(ht, key, obj); rcu_read_unlock(); } while (PTR_ERR(data) == -EAGAIN); return data; } EXPORT_SYMBOL_GPL(rhashtable_insert_slow); /** * rhashtable_walk_enter - Initialise an iterator * @ht: Table to walk over * @iter: Hash table Iterator * * This function prepares a hash table walk. * * Note that if you restart a walk after rhashtable_walk_stop you * may see the same object twice. Also, you may miss objects if * there are removals in between rhashtable_walk_stop and the next * call to rhashtable_walk_start. * * For a completely stable walk you should construct your own data * structure outside the hash table. 
* * This function may be called from any process context, including * non-preemptible context, but cannot be called from softirq or * hardirq context. * * You must call rhashtable_walk_exit after this function returns. */ void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter) { iter->ht = ht; iter->p = NULL; iter->slot = 0; iter->skip = 0; iter->end_of_table = 0; spin_lock(&ht->lock); iter->walker.tbl = rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock)); list_add(&iter->walker.list, &iter->walker.tbl->walkers); spin_unlock(&ht->lock); } EXPORT_SYMBOL_GPL(rhashtable_walk_enter); /** * rhashtable_walk_exit - Free an iterator * @iter: Hash table Iterator * * This function frees resources allocated by rhashtable_walk_enter. */ void rhashtable_walk_exit(struct rhashtable_iter *iter) { spin_lock(&iter->ht->lock); if (iter->walker.tbl) list_del(&iter->walker.list); spin_unlock(&iter->ht->lock); } EXPORT_SYMBOL_GPL(rhashtable_walk_exit); /** * rhashtable_walk_start_check - Start a hash table walk * @iter: Hash table iterator * * Start a hash table walk at the current iterator position. Note that we take * the RCU lock in all cases including when we return an error. So you must * always call rhashtable_walk_stop to clean up. * * Returns zero if successful. * * Returns -EAGAIN if resize event occurred. Note that the iterator * will rewind back to the beginning and you may use it immediately * by calling rhashtable_walk_next. * * rhashtable_walk_start is defined as an inline variant that returns * void. This is preferred in cases where the caller would ignore * resize events and always continue. */ int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires_shared(RCU) { struct rhashtable *ht = iter->ht; bool rhlist = ht->rhlist; rcu_read_lock(); spin_lock(&ht->lock); if (iter->walker.tbl) list_del(&iter->walker.list); spin_unlock(&ht->lock); if (iter->end_of_table) return 0; if (!iter->walker.tbl) { iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht); iter->slot = 0; iter->skip = 0; return -EAGAIN; } if (iter->p && !rhlist) { /* * We need to validate that 'p' is still in the table, and * if so, update 'skip' */ struct rhash_head *p; int skip = 0; rht_for_each_rcu(p, iter->walker.tbl, iter->slot) { skip++; if (p == iter->p) { iter->skip = skip; goto found; } } iter->p = NULL; } else if (iter->p && rhlist) { /* Need to validate that 'list' is still in the table, and * if so, update 'skip' and 'p'. */ struct rhash_head *p; struct rhlist_head *list; int skip = 0; rht_for_each_rcu(p, iter->walker.tbl, iter->slot) { for (list = container_of(p, struct rhlist_head, rhead); list; list = rcu_dereference(list->next)) { skip++; if (list == iter->list) { iter->p = p; iter->skip = skip; goto found; } } } iter->p = NULL; } found: return 0; } EXPORT_SYMBOL_GPL(rhashtable_walk_start_check); /** * __rhashtable_walk_find_next - Find the next element in a table (or the first * one in case of a new walk). * * @iter: Hash table iterator * * Returns the found object or NULL when the end of the table is reached. * * Returns -EAGAIN if resize event occurred. 
*/ static void *__rhashtable_walk_find_next(struct rhashtable_iter *iter) { struct bucket_table *tbl = iter->walker.tbl; struct rhlist_head *list = iter->list; struct rhashtable *ht = iter->ht; struct rhash_head *p = iter->p; bool rhlist = ht->rhlist; if (!tbl) return NULL; for (; iter->slot < tbl->size; iter->slot++) { int skip = iter->skip; rht_for_each_rcu(p, tbl, iter->slot) { if (rhlist) { list = container_of(p, struct rhlist_head, rhead); do { if (!skip) goto next; skip--; list = rcu_dereference(list->next); } while (list); continue; } if (!skip) break; skip--; } next: if (!rht_is_a_nulls(p)) { iter->skip++; iter->p = p; iter->list = list; return rht_obj(ht, rhlist ? &list->rhead : p); } iter->skip = 0; } iter->p = NULL; /* Ensure we see any new tables. */ smp_rmb(); iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (iter->walker.tbl) { iter->slot = 0; iter->skip = 0; return ERR_PTR(-EAGAIN); } else { iter->end_of_table = true; } return NULL; } /** * rhashtable_walk_next - Return the next object and advance the iterator * @iter: Hash table iterator * * Note that you must call rhashtable_walk_stop when you are finished * with the walk. * * Returns the next object or NULL when the end of the table is reached. * * Returns -EAGAIN if resize event occurred. Note that the iterator * will rewind back to the beginning and you may continue to use it. */ void *rhashtable_walk_next(struct rhashtable_iter *iter) { struct rhlist_head *list = iter->list; struct rhashtable *ht = iter->ht; struct rhash_head *p = iter->p; bool rhlist = ht->rhlist; if (p) { if (!rhlist || !(list = rcu_dereference(list->next))) { p = rcu_dereference(p->next); list = container_of(p, struct rhlist_head, rhead); } if (!rht_is_a_nulls(p)) { iter->skip++; iter->p = p; iter->list = list; return rht_obj(ht, rhlist ? &list->rhead : p); } /* At the end of this slot, switch to next one and then find * next entry from that point. */ iter->skip = 0; iter->slot++; } return __rhashtable_walk_find_next(iter); } EXPORT_SYMBOL_GPL(rhashtable_walk_next); /** * rhashtable_walk_peek - Return the next object but don't advance the iterator * @iter: Hash table iterator * * Returns the next object or NULL when the end of the table is reached. * * Returns -EAGAIN if resize event occurred. Note that the iterator * will rewind back to the beginning and you may continue to use it. */ void *rhashtable_walk_peek(struct rhashtable_iter *iter) { struct rhlist_head *list = iter->list; struct rhashtable *ht = iter->ht; struct rhash_head *p = iter->p; if (p) return rht_obj(ht, ht->rhlist ? &list->rhead : p); /* No object found in current iter, find next one in the table. */ if (iter->skip) { /* A nonzero skip value points to the next entry in the table * beyond that last one that was found. Decrement skip so * we find the current value. __rhashtable_walk_find_next * will restore the original value of skip assuming that * the table hasn't changed. */ iter->skip--; } return __rhashtable_walk_find_next(iter); } EXPORT_SYMBOL_GPL(rhashtable_walk_peek); /** * rhashtable_walk_stop - Finish a hash table walk * @iter: Hash table iterator * * Finish a hash table walk. Does not reset the iterator to the start of the * hash table. */ void rhashtable_walk_stop(struct rhashtable_iter *iter) { struct rhashtable *ht; struct bucket_table *tbl = iter->walker.tbl; if (!tbl) goto out; ht = iter->ht; spin_lock(&ht->lock); if (rcu_head_after_call_rcu(&tbl->rcu, bucket_table_free_rcu)) /* This bucket table is being freed, don't re-link it. 
*/ iter->walker.tbl = NULL; else list_add(&iter->walker.list, &tbl->walkers); spin_unlock(&ht->lock); out: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rhashtable_walk_stop); static size_t rounded_hashtable_size(const struct rhashtable_params *params) { size_t retsize; if (params->nelem_hint) retsize = max(roundup_pow_of_two(params->nelem_hint * 4 / 3), (unsigned long)params->min_size); else retsize = max(HASH_DEFAULT_SIZE, (unsigned long)params->min_size); return retsize; } static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) { return jhash2(key, length, seed); } /** * rhashtable_init - initialize a new hash table * @ht: hash table to be initialized * @params: configuration parameters * * Initializes a new hash table based on the provided configuration * parameters. A table can be configured either with a variable or * fixed length key: * * Configuration Example 1: Fixed length keys * struct test_obj { * int key; * void * my_member; * struct rhash_head node; * }; * * struct rhashtable_params params = { * .head_offset = offsetof(struct test_obj, node), * .key_offset = offsetof(struct test_obj, key), * .key_len = sizeof(int), * .hashfn = jhash, * }; * * Configuration Example 2: Variable length keys * struct test_obj { * [...] * struct rhash_head node; * }; * * u32 my_hash_fn(const void *data, u32 len, u32 seed) * { * struct test_obj *obj = data; * * return [... hash ...]; * } * * struct rhashtable_params params = { * .head_offset = offsetof(struct test_obj, node), * .hashfn = jhash, * .obj_hashfn = my_hash_fn, * }; */ int rhashtable_init_noprof(struct rhashtable *ht, const struct rhashtable_params *params) { struct bucket_table *tbl; size_t size; if ((!params->key_len && !params->obj_hashfn) || (params->obj_hashfn && !params->obj_cmpfn)) return -EINVAL; memset(ht, 0, sizeof(*ht)); mutex_init(&ht->mutex); spin_lock_init(&ht->lock); memcpy(&ht->p, params, sizeof(*params)); alloc_tag_record(ht->alloc_tag); if (params->min_size) ht->p.min_size = roundup_pow_of_two(params->min_size); /* Cap total entries at 2^31 to avoid nelems overflow. */ ht->max_elems = 1u << 31; if (params->max_size) { ht->p.max_size = rounddown_pow_of_two(params->max_size); if (ht->p.max_size < ht->max_elems / 2) ht->max_elems = ht->p.max_size * 2; } ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE); size = rounded_hashtable_size(&ht->p); ht->key_len = ht->p.key_len; if (!params->hashfn) { ht->p.hashfn = jhash; if (!(ht->key_len & (sizeof(u32) - 1))) { ht->key_len /= sizeof(u32); ht->p.hashfn = rhashtable_jhash2; } } /* * This is api initialization and thus we need to guarantee the * initial rhashtable allocation. Upon failure, retry with the * smallest possible size with __GFP_NOFAIL semantics. */ tbl = bucket_table_alloc(ht, size, GFP_KERNEL); if (unlikely(tbl == NULL)) { size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE); tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL); } atomic_set(&ht->nelems, 0); RCU_INIT_POINTER(ht->tbl, tbl); INIT_WORK(&ht->run_work, rht_deferred_worker); return 0; } EXPORT_SYMBOL_GPL(rhashtable_init_noprof); /** * rhltable_init - initialize a new hash list table * @hlt: hash list table to be initialized * @params: configuration parameters * * Initializes a new hash list table. * * See documentation for rhashtable_init. 
*/ int rhltable_init_noprof(struct rhltable *hlt, const struct rhashtable_params *params) { int err; err = rhashtable_init_noprof(&hlt->ht, params); hlt->ht.rhlist = true; return err; } EXPORT_SYMBOL_GPL(rhltable_init_noprof); static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj, void (*free_fn)(void *ptr, void *arg), void *arg) { struct rhlist_head *list; if (!ht->rhlist) { free_fn(rht_obj(ht, obj), arg); return; } list = container_of(obj, struct rhlist_head, rhead); do { obj = &list->rhead; list = rht_dereference(list->next, ht); free_fn(rht_obj(ht, obj), arg); } while (list); } /** * rhashtable_free_and_destroy - free elements and destroy hash table * @ht: the hash table to destroy * @free_fn: callback to release resources of element * @arg: pointer passed to free_fn * * Stops an eventual async resize. If defined, invokes free_fn for each * element to releasal resources. Please note that RCU protected * readers may still be accessing the elements. Releasing of resources * must occur in a compatible manner. Then frees the bucket array. * * This function will eventually sleep to wait for an async resize * to complete. The caller is responsible that no further write operations * occurs in parallel. */ void rhashtable_free_and_destroy(struct rhashtable *ht, void (*free_fn)(void *ptr, void *arg), void *arg) { struct bucket_table *tbl, *next_tbl; unsigned int i; cancel_work_sync(&ht->run_work); mutex_lock(&ht->mutex); tbl = rht_dereference(ht->tbl, ht); restart: if (free_fn) { for (i = 0; i < tbl->size; i++) { struct rhash_head *pos, *next; cond_resched(); for (pos = rht_ptr_exclusive(rht_bucket(tbl, i)), next = !rht_is_a_nulls(pos) ? rht_dereference(pos->next, ht) : NULL; !rht_is_a_nulls(pos); pos = next, next = !rht_is_a_nulls(pos) ? 
rht_dereference(pos->next, ht) : NULL) rhashtable_free_one(ht, pos, free_fn, arg); } } next_tbl = rht_dereference(tbl->future_tbl, ht); bucket_table_free(tbl); if (next_tbl) { tbl = next_tbl; goto restart; } mutex_unlock(&ht->mutex); } EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy); void rhashtable_destroy(struct rhashtable *ht) { return rhashtable_free_and_destroy(ht, NULL, NULL); } EXPORT_SYMBOL_GPL(rhashtable_destroy); struct rhash_lock_head __rcu **__rht_bucket_nested( const struct bucket_table *tbl, unsigned int hash) { const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); unsigned int index = hash & ((1 << tbl->nest) - 1); unsigned int size = tbl->size >> tbl->nest; unsigned int subhash = hash; union nested_table *ntbl; ntbl = nested_table_top(tbl); ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash); subhash >>= tbl->nest; while (ntbl && size > (1 << shift)) { index = subhash & ((1 << shift) - 1); ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash); size >>= shift; subhash >>= shift; } if (!ntbl) return NULL; return &ntbl[subhash].bucket; } EXPORT_SYMBOL_GPL(__rht_bucket_nested); struct rhash_lock_head __rcu **rht_bucket_nested( const struct bucket_table *tbl, unsigned int hash) { static struct rhash_lock_head __rcu *rhnull; if (!rhnull) INIT_RHT_NULLS_HEAD(rhnull); return __rht_bucket_nested(tbl, hash) ?: &rhnull; } EXPORT_SYMBOL_GPL(rht_bucket_nested); struct rhash_lock_head __rcu **rht_bucket_nested_insert( struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) { const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); unsigned int index = hash & ((1 << tbl->nest) - 1); unsigned int size = tbl->size >> tbl->nest; union nested_table *ntbl; ntbl = nested_table_top(tbl); hash >>= tbl->nest; ntbl = nested_table_alloc(ht, &ntbl[index].table, size <= (1 << shift)); while (ntbl && size > (1 << shift)) { index = hash & ((1 << shift) - 1); size >>= shift; hash >>= shift; ntbl = nested_table_alloc(ht, &ntbl[index].table, size <= (1 << shift)); } if (!ntbl) return NULL; return &ntbl[hash].bucket; } EXPORT_SYMBOL_GPL(rht_bucket_nested_insert); |
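Illustrative sketch, not part of lib/rhashtable.c above: a minimal fixed-length-key setup and insert/lookup round trip mirroring "Configuration Example 1" from the rhashtable_init() kerneldoc; example_obj, example_params and example_roundtrip are hypothetical names, and the _fast() helpers come from <linux/rhashtable.h>.

/*
 * Hypothetical round trip: initialise a table keyed by a plain int,
 * insert one object, look it up again, then tear the table down.
 */
struct example_obj {
	int			key;
	struct rhash_head	node;
};

static const struct rhashtable_params example_params = {
	.head_offset		= offsetof(struct example_obj, node),
	.key_offset		= offsetof(struct example_obj, key),
	.key_len		= sizeof(int),
	.automatic_shrinking	= true,
};

static int example_roundtrip(struct rhashtable *ht, struct example_obj *obj)
{
	int err;

	err = rhashtable_init(ht, &example_params);
	if (err)
		return err;

	err = rhashtable_insert_fast(ht, &obj->node, example_params);
	if (!err && !rhashtable_lookup_fast(ht, &obj->key, example_params))
		err = -ENOENT;	/* not expected right after a successful insert */

	rhashtable_free_and_destroy(ht, NULL, NULL);
	return err;
}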
878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 | // SPDX-License-Identifier: GPL-2.0 /* * kobject.c - library routines for handling generic kernel objects * * Copyright (c) 2002-2003 Patrick Mochel <mochel@osdl.org> * Copyright (c) 2006-2007 Greg Kroah-Hartman <greg@kroah.com> * Copyright (c) 2006-2007 Novell Inc. * * Please see the file Documentation/core-api/kobject.rst for critical information * about using the kobject interface. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kobject.h> #include <linux/string.h> #include <linux/export.h> #include <linux/stat.h> #include <linux/slab.h> #include <linux/random.h> /** * kobject_namespace() - Return @kobj's namespace tag. * @kobj: kobject in question * * Returns namespace tag of @kobj if its parent has namespace ops enabled * and thus @kobj should have a namespace tag associated with it. Returns * %NULL otherwise. */ const struct ns_common *kobject_namespace(const struct kobject *kobj) { const struct kobj_ns_type_operations *ns_ops = kobj_ns_ops(kobj); if (!ns_ops || ns_ops->type == KOBJ_NS_TYPE_NONE) return NULL; return kobj->ktype->namespace(kobj); } /** * kobject_get_ownership() - Get sysfs ownership data for @kobj. * @kobj: kobject in question * @uid: kernel user ID for sysfs objects * @gid: kernel group ID for sysfs objects * * Returns initial uid/gid pair that should be used when creating sysfs * representation of given kobject. Normally used to adjust ownership of * objects in a container. */ void kobject_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid) { *uid = GLOBAL_ROOT_UID; *gid = GLOBAL_ROOT_GID; if (kobj->ktype->get_ownership) kobj->ktype->get_ownership(kobj, uid, gid); } static bool kobj_ns_type_is_valid(enum kobj_ns_type type) { if ((type <= KOBJ_NS_TYPE_NONE) || (type >= KOBJ_NS_TYPES)) return false; return true; } static int create_dir(struct kobject *kobj) { const struct kobj_type *ktype = get_ktype(kobj); const struct kobj_ns_type_operations *ops; int error; error = sysfs_create_dir_ns(kobj, kobject_namespace(kobj)); if (error) return error; if (ktype) { error = sysfs_create_groups(kobj, ktype->default_groups); if (error) { sysfs_remove_dir(kobj); return error; } } /* * @kobj->sd may be deleted by an ancestor going away. Hold an * extra reference so that it stays until @kobj is gone. */ sysfs_get(kobj->sd); /* * If @kobj has ns_ops, its children need to be filtered based on * their namespace tags. Enable namespace support on @kobj->sd. 
*/ ops = kobj_child_ns_ops(kobj); if (ops) { BUG_ON(!kobj_ns_type_is_valid(ops->type)); BUG_ON(!kobj_ns_type_registered(ops->type)); sysfs_enable_ns(kobj->sd); } return 0; } static int get_kobj_path_length(const struct kobject *kobj) { int length = 1; const struct kobject *parent = kobj; /* walk up the ancestors until we hit the one pointing to the * root. * Add 1 to strlen for leading '/' of each level. */ do { if (kobject_name(parent) == NULL) return 0; length += strlen(kobject_name(parent)) + 1; parent = parent->parent; } while (parent); return length; } static int fill_kobj_path(const struct kobject *kobj, char *path, int length) { const struct kobject *parent; --length; for (parent = kobj; parent; parent = parent->parent) { int cur = strlen(kobject_name(parent)); /* back up enough to print this name with '/' */ length -= cur; if (length <= 0) return -EINVAL; memcpy(path + length, kobject_name(parent), cur); *(path + --length) = '/'; } pr_debug("'%s' (%p): %s: path = '%s'\n", kobject_name(kobj), kobj, __func__, path); return 0; } /** * kobject_get_path() - Allocate memory and fill in the path for @kobj. * @kobj: kobject in question, with which to build the path * @gfp_mask: the allocation type used to allocate the path * * Return: The newly allocated memory, caller must free with kfree(). */ char *kobject_get_path(const struct kobject *kobj, gfp_t gfp_mask) { char *path; int len; retry: len = get_kobj_path_length(kobj); if (len == 0) return NULL; path = kzalloc(len, gfp_mask); if (!path) return NULL; if (fill_kobj_path(kobj, path, len)) { kfree(path); goto retry; } return path; } EXPORT_SYMBOL_GPL(kobject_get_path); /* add the kobject to its kset's list */ static void kobj_kset_join(struct kobject *kobj) { if (!kobj->kset) return; kset_get(kobj->kset); spin_lock(&kobj->kset->list_lock); list_add_tail(&kobj->entry, &kobj->kset->list); spin_unlock(&kobj->kset->list_lock); } /* remove the kobject from its kset's list */ static void kobj_kset_leave(struct kobject *kobj) { if (!kobj->kset) return; spin_lock(&kobj->kset->list_lock); list_del_init(&kobj->entry); spin_unlock(&kobj->kset->list_lock); kset_put(kobj->kset); } static void kobject_init_internal(struct kobject *kobj) { if (!kobj) return; kref_init(&kobj->kref); INIT_LIST_HEAD(&kobj->entry); kobj->state_in_sysfs = 0; kobj->state_add_uevent_sent = 0; kobj->state_remove_uevent_sent = 0; kobj->state_initialized = 1; } static int kobject_add_internal(struct kobject *kobj) { int error = 0; struct kobject *parent; if (!kobj) return -ENOENT; if (!kobj->name || !kobj->name[0]) { WARN(1, "kobject: (%p): attempted to be registered with empty name!\n", kobj); return -EINVAL; } parent = kobject_get(kobj->parent); /* join kset if set, use it as parent if we do not already have one */ if (kobj->kset) { if (!parent) parent = kobject_get(&kobj->kset->kobj); kobj_kset_join(kobj); kobj->parent = parent; } pr_debug("'%s' (%p): %s: parent: '%s', set: '%s'\n", kobject_name(kobj), kobj, __func__, parent ? kobject_name(parent) : "<NULL>", kobj->kset ? kobject_name(&kobj->kset->kobj) : "<NULL>"); error = create_dir(kobj); if (error) { kobj_kset_leave(kobj); kobject_put(parent); kobj->parent = NULL; /* be noisy on error issues */ if (error == -EEXIST) pr_err("%s failed for %s with -EEXIST, don't try to register things with the same name in the same directory.\n", __func__, kobject_name(kobj)); else pr_err("%s failed for %s (error: %d parent: %s)\n", __func__, kobject_name(kobj), error, parent ? 
kobject_name(parent) : "'none'"); } else kobj->state_in_sysfs = 1; return error; } /** * kobject_set_name_vargs() - Set the name of a kobject. * @kobj: struct kobject to set the name of * @fmt: format string used to build the name * @vargs: vargs to format the string. */ int kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list vargs) { const char *s; if (kobj->name && !fmt) return 0; s = kvasprintf_const(GFP_KERNEL, fmt, vargs); if (!s) return -ENOMEM; /* * ewww... some of these buggers have '/' in the name ... If * that's the case, we need to make sure we have an actual * allocated copy to modify, since kvasprintf_const may have * returned something from .rodata. */ if (strchr(s, '/')) { char *t; t = kstrdup(s, GFP_KERNEL); kfree_const(s); if (!t) return -ENOMEM; s = strreplace(t, '/', '!'); } kfree_const(kobj->name); kobj->name = s; return 0; } /** * kobject_set_name() - Set the name of a kobject. * @kobj: struct kobject to set the name of * @fmt: format string used to build the name * * This sets the name of the kobject. If you have already added the * kobject to the system, you must call kobject_rename() in order to * change the name of the kobject. */ int kobject_set_name(struct kobject *kobj, const char *fmt, ...) { va_list vargs; int retval; va_start(vargs, fmt); retval = kobject_set_name_vargs(kobj, fmt, vargs); va_end(vargs); return retval; } EXPORT_SYMBOL(kobject_set_name); /** * kobject_init() - Initialize a kobject structure. * @kobj: pointer to the kobject to initialize * @ktype: pointer to the ktype for this kobject. * * This function will properly initialize a kobject such that it can then * be passed to the kobject_add() call. * * After this function is called, the kobject MUST be cleaned up by a call * to kobject_put(), not by a call to kfree directly to ensure that all of * the memory is cleaned up properly. */ void kobject_init(struct kobject *kobj, const struct kobj_type *ktype) { char *err_str; if (!kobj) { err_str = "invalid kobject pointer!"; goto error; } if (!ktype) { err_str = "must have a ktype to be initialized properly!\n"; goto error; } if (kobj->state_initialized) { /* do not error out as sometimes we can recover */ pr_err("kobject (%p): tried to init an initialized object, something is seriously wrong.\n", kobj); dump_stack_lvl(KERN_ERR); } kobject_init_internal(kobj); kobj->ktype = ktype; return; error: pr_err("kobject (%p): %s\n", kobj, err_str); dump_stack_lvl(KERN_ERR); } EXPORT_SYMBOL(kobject_init); static __printf(3, 0) int kobject_add_varg(struct kobject *kobj, struct kobject *parent, const char *fmt, va_list vargs) { int retval; retval = kobject_set_name_vargs(kobj, fmt, vargs); if (retval) { pr_err("can not set name properly!\n"); return retval; } kobj->parent = parent; return kobject_add_internal(kobj); } /** * kobject_add() - The main kobject add function. * @kobj: the kobject to add * @parent: pointer to the parent of the kobject. * @fmt: format to name the kobject with. * * The kobject name is set and added to the kobject hierarchy in this * function. * * If @parent is set, then the parent of the @kobj will be set to it. * If @parent is NULL, then the parent of the @kobj will be set to the * kobject associated with the kset assigned to this kobject. If no kset * is assigned to the kobject, then the kobject will be located in the * root of the sysfs tree. 
* * Note, no "add" uevent will be created with this call, the caller should set * up all of the necessary sysfs files for the object and then call * kobject_uevent() with the UEVENT_ADD parameter to ensure that * userspace is properly notified of this kobject's creation. * * Return: If this function returns an error, kobject_put() must be * called to properly clean up the memory associated with the * object. Under no instance should the kobject that is passed * to this function be directly freed with a call to kfree(), * that can leak memory. * * If this function returns success, kobject_put() must also be called * in order to properly clean up the memory associated with the object. * * In short, once this function is called, kobject_put() MUST be called * when the use of the object is finished in order to properly free * everything. */ int kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...) { va_list args; int retval; if (!kobj) return -EINVAL; if (!kobj->state_initialized) { pr_err("kobject '%s' (%p): tried to add an uninitialized object, something is seriously wrong.\n", kobject_name(kobj), kobj); dump_stack_lvl(KERN_ERR); return -EINVAL; } va_start(args, fmt); retval = kobject_add_varg(kobj, parent, fmt, args); va_end(args); return retval; } EXPORT_SYMBOL(kobject_add); /** * kobject_init_and_add() - Initialize a kobject structure and add it to * the kobject hierarchy. * @kobj: pointer to the kobject to initialize * @ktype: pointer to the ktype for this kobject. * @parent: pointer to the parent of this kobject. * @fmt: the name of the kobject. * * This function combines the call to kobject_init() and kobject_add(). * * If this function returns an error, kobject_put() must be called to * properly clean up the memory associated with the object. This is the * same type of error handling after a call to kobject_add() and kobject * lifetime rules are the same here. */ int kobject_init_and_add(struct kobject *kobj, const struct kobj_type *ktype, struct kobject *parent, const char *fmt, ...) { va_list args; int retval; kobject_init(kobj, ktype); va_start(args, fmt); retval = kobject_add_varg(kobj, parent, fmt, args); va_end(args); return retval; } EXPORT_SYMBOL_GPL(kobject_init_and_add); /** * kobject_rename() - Change the name of an object. * @kobj: object in question. * @new_name: object's new name * * It is the responsibility of the caller to provide mutual * exclusion between two different calls of kobject_rename * on the same kobject and to ensure that new_name is valid and * won't conflict with other kobjects. */ int kobject_rename(struct kobject *kobj, const char *new_name) { int error = 0; const char *devpath = NULL; const char *dup_name = NULL, *name; char *devpath_string = NULL; char *envp[2]; kobj = kobject_get(kobj); if (!kobj) return -EINVAL; if (!kobj->parent) { kobject_put(kobj); return -EINVAL; } devpath = kobject_get_path(kobj, GFP_KERNEL); if (!devpath) { error = -ENOMEM; goto out; } devpath_string = kmalloc(strlen(devpath) + 15, GFP_KERNEL); if (!devpath_string) { error = -ENOMEM; goto out; } sprintf(devpath_string, "DEVPATH_OLD=%s", devpath); envp[0] = devpath_string; envp[1] = NULL; name = dup_name = kstrdup_const(new_name, GFP_KERNEL); if (!name) { error = -ENOMEM; goto out; } error = sysfs_rename_dir_ns(kobj, new_name, kobject_namespace(kobj)); if (error) goto out; /* Install the new kobject name */ dup_name = kobj->name; kobj->name = name; /* This function is mostly/only used for network interface. 
* Some hotplug package track interfaces by their name and * therefore want to know when the name is changed by the user. */ kobject_uevent_env(kobj, KOBJ_MOVE, envp); out: kfree_const(dup_name); kfree(devpath_string); kfree(devpath); kobject_put(kobj); return error; } EXPORT_SYMBOL_GPL(kobject_rename); /** * kobject_move() - Move object to another parent. * @kobj: object in question. * @new_parent: object's new parent (can be NULL) */ int kobject_move(struct kobject *kobj, struct kobject *new_parent) { int error; struct kobject *old_parent; const char *devpath = NULL; char *devpath_string = NULL; char *envp[2]; kobj = kobject_get(kobj); if (!kobj) return -EINVAL; new_parent = kobject_get(new_parent); if (!new_parent) { if (kobj->kset) new_parent = kobject_get(&kobj->kset->kobj); } /* old object path */ devpath = kobject_get_path(kobj, GFP_KERNEL); if (!devpath) { error = -ENOMEM; goto out; } devpath_string = kmalloc(strlen(devpath) + 15, GFP_KERNEL); if (!devpath_string) { error = -ENOMEM; goto out; } sprintf(devpath_string, "DEVPATH_OLD=%s", devpath); envp[0] = devpath_string; envp[1] = NULL; error = sysfs_move_dir_ns(kobj, new_parent, kobject_namespace(kobj)); if (error) goto out; old_parent = kobj->parent; kobj->parent = new_parent; new_parent = NULL; kobject_put(old_parent); kobject_uevent_env(kobj, KOBJ_MOVE, envp); out: kobject_put(new_parent); kobject_put(kobj); kfree(devpath_string); kfree(devpath); return error; } EXPORT_SYMBOL_GPL(kobject_move); static void __kobject_del(struct kobject *kobj) { struct kernfs_node *sd; const struct kobj_type *ktype; sd = kobj->sd; ktype = get_ktype(kobj); if (ktype) sysfs_remove_groups(kobj, ktype->default_groups); /* send "remove" if the caller did not do it but sent "add" */ if (kobj->state_add_uevent_sent && !kobj->state_remove_uevent_sent) { pr_debug("'%s' (%p): auto cleanup 'remove' event\n", kobject_name(kobj), kobj); kobject_uevent(kobj, KOBJ_REMOVE); } sysfs_remove_dir(kobj); sysfs_put(sd); kobj->state_in_sysfs = 0; kobj_kset_leave(kobj); kobj->parent = NULL; } /** * kobject_del() - Unlink kobject from hierarchy. * @kobj: object. * * This is the function that should be called to delete an object * successfully added via kobject_add(). */ void kobject_del(struct kobject *kobj) { struct kobject *parent; if (!kobj) return; parent = kobj->parent; __kobject_del(kobj); kobject_put(parent); } EXPORT_SYMBOL(kobject_del); /** * kobject_get() - Increment refcount for object. * @kobj: object. */ struct kobject *kobject_get(struct kobject *kobj) { if (kobj) { if (!kobj->state_initialized) WARN(1, KERN_WARNING "kobject: '%s' (%p): is not initialized, yet kobject_get() is being called.\n", kobject_name(kobj), kobj); kref_get(&kobj->kref); } return kobj; } EXPORT_SYMBOL(kobject_get); struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj) { if (!kobj) return NULL; if (!kref_get_unless_zero(&kobj->kref)) kobj = NULL; return kobj; } EXPORT_SYMBOL(kobject_get_unless_zero); /* * kobject_cleanup - free kobject resources. * @kobj: object to cleanup */ static void kobject_cleanup(struct kobject *kobj) { struct kobject *parent = kobj->parent; const struct kobj_type *t = get_ktype(kobj); const char *name = kobj->name; pr_debug("'%s' (%p): %s, parent %p\n", kobject_name(kobj), kobj, __func__, kobj->parent); if (t && !t->release) pr_debug("'%s' (%p): does not have a release() function, it is broken and must be fixed. 
See Documentation/core-api/kobject.rst.\n", kobject_name(kobj), kobj); /* remove from sysfs if the caller did not do it */ if (kobj->state_in_sysfs) { pr_debug("'%s' (%p): auto cleanup kobject_del\n", kobject_name(kobj), kobj); __kobject_del(kobj); } else { /* avoid dropping the parent reference unnecessarily */ parent = NULL; } if (t && t->release) { pr_debug("'%s' (%p): calling ktype release\n", kobject_name(kobj), kobj); t->release(kobj); } /* free name if we allocated it */ if (name) { pr_debug("'%s': free name\n", name); kfree_const(name); } kobject_put(parent); } #ifdef CONFIG_DEBUG_KOBJECT_RELEASE static void kobject_delayed_cleanup(struct work_struct *work) { kobject_cleanup(container_of(to_delayed_work(work), struct kobject, release)); } #endif static void kobject_release(struct kref *kref) { struct kobject *kobj = container_of(kref, struct kobject, kref); #ifdef CONFIG_DEBUG_KOBJECT_RELEASE unsigned long delay = HZ + HZ * get_random_u32_below(4); pr_info("'%s' (%p): %s, parent %p (delayed %ld)\n", kobject_name(kobj), kobj, __func__, kobj->parent, delay); INIT_DELAYED_WORK(&kobj->release, kobject_delayed_cleanup); schedule_delayed_work(&kobj->release, delay); #else kobject_cleanup(kobj); #endif } /** * kobject_put() - Decrement refcount for object. * @kobj: object. * * Decrement the refcount, and if 0, call kobject_cleanup(). */ void kobject_put(struct kobject *kobj) { if (kobj) { if (!kobj->state_initialized) WARN(1, KERN_WARNING "kobject: '%s' (%p): is not initialized, yet kobject_put() is being called.\n", kobject_name(kobj), kobj); kref_put(&kobj->kref, kobject_release); } } EXPORT_SYMBOL(kobject_put); static void dynamic_kobj_release(struct kobject *kobj) { pr_debug("(%p): %s\n", kobj, __func__); kfree(kobj); } static const struct kobj_type dynamic_kobj_ktype = { .release = dynamic_kobj_release, .sysfs_ops = &kobj_sysfs_ops, }; /** * kobject_create() - Create a struct kobject dynamically. * * This function creates a kobject structure dynamically and sets it up * to be a "dynamic" kobject with a default release function set up. * * If the kobject was not able to be created, NULL will be returned. * The kobject structure returned from here must be cleaned up with a * call to kobject_put() and not kfree(), as kobject_init() has * already been called on this structure. */ static struct kobject *kobject_create(void) { struct kobject *kobj; kobj = kzalloc_obj(*kobj); if (!kobj) return NULL; kobject_init(kobj, &dynamic_kobj_ktype); return kobj; } /** * kobject_create_and_add() - Create a struct kobject dynamically and * register it with sysfs. * @name: the name for the kobject * @parent: the parent kobject of this kobject, if any. * * This function creates a kobject structure dynamically and registers it * with sysfs. When you are finished with this structure, call * kobject_put() and the structure will be dynamically freed when * it is no longer being used. * * If the kobject was not able to be created, NULL will be returned. */ struct kobject *kobject_create_and_add(const char *name, struct kobject *parent) { struct kobject *kobj; int retval; kobj = kobject_create(); if (!kobj) return NULL; retval = kobject_add(kobj, parent, "%s", name); if (retval) { pr_warn("%s: kobject_add error: %d\n", __func__, retval); kobject_put(kobj); kobj = NULL; } return kobj; } EXPORT_SYMBOL_GPL(kobject_create_and_add); /** * kset_init() - Initialize a kset for use. 
* @k: kset */ void kset_init(struct kset *k) { kobject_init_internal(&k->kobj); INIT_LIST_HEAD(&k->list); spin_lock_init(&k->list_lock); } /* default kobject attribute operations */ static ssize_t kobj_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct kobj_attribute *kattr; ssize_t ret = -EIO; kattr = container_of(attr, struct kobj_attribute, attr); if (kattr->show) ret = kattr->show(kobj, kattr, buf); return ret; } static ssize_t kobj_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct kobj_attribute *kattr; ssize_t ret = -EIO; kattr = container_of(attr, struct kobj_attribute, attr); if (kattr->store) ret = kattr->store(kobj, kattr, buf, count); return ret; } const struct sysfs_ops kobj_sysfs_ops = { .show = kobj_attr_show, .store = kobj_attr_store, }; EXPORT_SYMBOL_GPL(kobj_sysfs_ops); /** * kset_register() - Initialize and add a kset. * @k: kset. * * NOTE: On error, the kset.kobj.name allocated by() kobj_set_name() * is freed, it can not be used any more. */ int kset_register(struct kset *k) { int err; if (!k) return -EINVAL; if (!k->kobj.ktype) { pr_err("must have a ktype to be initialized properly!\n"); return -EINVAL; } kset_init(k); err = kobject_add_internal(&k->kobj); if (err) { kfree_const(k->kobj.name); /* Set it to NULL to avoid accessing bad pointer in callers. */ k->kobj.name = NULL; return err; } kobject_uevent(&k->kobj, KOBJ_ADD); return 0; } EXPORT_SYMBOL(kset_register); /** * kset_unregister() - Remove a kset. * @k: kset. */ void kset_unregister(struct kset *k) { if (!k) return; kobject_del(&k->kobj); kobject_put(&k->kobj); } EXPORT_SYMBOL(kset_unregister); /** * kset_find_obj() - Search for object in kset. * @kset: kset we're looking in. * @name: object's name. * * Lock kset via @kset->subsys, and iterate over @kset->list, * looking for a matching kobject. If matching object is found * take a reference and return the object. */ struct kobject *kset_find_obj(struct kset *kset, const char *name) { struct kobject *k; struct kobject *ret = NULL; spin_lock(&kset->list_lock); list_for_each_entry(k, &kset->list, entry) { if (kobject_name(k) && !strcmp(kobject_name(k), name)) { ret = kobject_get_unless_zero(k); break; } } spin_unlock(&kset->list_lock); return ret; } EXPORT_SYMBOL_GPL(kset_find_obj); static void kset_release(struct kobject *kobj) { struct kset *kset = container_of(kobj, struct kset, kobj); pr_debug("'%s' (%p): %s\n", kobject_name(kobj), kobj, __func__); kfree(kset); } static void kset_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid) { if (kobj->parent) kobject_get_ownership(kobj->parent, uid, gid); } static const struct kobj_type kset_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = kset_release, .get_ownership = kset_get_ownership, }; /** * kset_create() - Create a struct kset dynamically. * * @name: the name for the kset * @uevent_ops: a struct kset_uevent_ops for the kset * @parent_kobj: the parent kobject of this kset, if any. * * This function creates a kset structure dynamically. This structure can * then be registered with the system and show up in sysfs with a call to * kset_register(). When you are finished with this structure, if * kset_register() has been called, call kset_unregister() and the * structure will be dynamically freed when it is no longer being used. * * If the kset was not able to be created, NULL will be returned. 
*/ static struct kset *kset_create(const char *name, const struct kset_uevent_ops *uevent_ops, struct kobject *parent_kobj) { struct kset *kset; int retval; kset = kzalloc_obj(*kset); if (!kset) return NULL; retval = kobject_set_name(&kset->kobj, "%s", name); if (retval) { kfree(kset); return NULL; } kset->uevent_ops = uevent_ops; kset->kobj.parent = parent_kobj; /* * The kobject of this kset will have a type of kset_ktype and belong to * no kset itself. That way we can properly free it when it is * finished being used. */ kset->kobj.ktype = &kset_ktype; kset->kobj.kset = NULL; return kset; } /** * kset_create_and_add() - Create a struct kset dynamically and add it to sysfs. * * @name: the name for the kset * @uevent_ops: a struct kset_uevent_ops for the kset * @parent_kobj: the parent kobject of this kset, if any. * * This function creates a kset structure dynamically and registers it * with sysfs. When you are finished with this structure, call * kset_unregister() and the structure will be dynamically freed when it * is no longer being used. * * If the kset was not able to be created, NULL will be returned. */ struct kset *kset_create_and_add(const char *name, const struct kset_uevent_ops *uevent_ops, struct kobject *parent_kobj) { struct kset *kset; int error; kset = kset_create(name, uevent_ops, parent_kobj); if (!kset) return NULL; error = kset_register(kset); if (error) { kfree(kset); return NULL; } return kset; } EXPORT_SYMBOL_GPL(kset_create_and_add); static DEFINE_SPINLOCK(kobj_ns_type_lock); static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES]; int kobj_ns_type_register(const struct kobj_ns_type_operations *ops) { enum kobj_ns_type type = ops->type; int error; spin_lock(&kobj_ns_type_lock); error = -EINVAL; if (!kobj_ns_type_is_valid(type)) goto out; error = -EBUSY; if (kobj_ns_ops_tbl[type]) goto out; error = 0; kobj_ns_ops_tbl[type] = ops; out: spin_unlock(&kobj_ns_type_lock); return error; } int kobj_ns_type_registered(enum kobj_ns_type type) { int registered = 0; spin_lock(&kobj_ns_type_lock); if (kobj_ns_type_is_valid(type)) registered = kobj_ns_ops_tbl[type] != NULL; spin_unlock(&kobj_ns_type_lock); return registered; } const struct kobj_ns_type_operations *kobj_child_ns_ops(const struct kobject *parent) { const struct kobj_ns_type_operations *ops = NULL; if (parent && parent->ktype && parent->ktype->child_ns_type) ops = parent->ktype->child_ns_type(parent); return ops; } const struct kobj_ns_type_operations *kobj_ns_ops(const struct kobject *kobj) { return kobj_child_ns_ops(kobj->parent); } bool kobj_ns_current_may_mount(enum kobj_ns_type type) { bool may_mount = true; spin_lock(&kobj_ns_type_lock); if (kobj_ns_type_is_valid(type) && kobj_ns_ops_tbl[type]) may_mount = kobj_ns_ops_tbl[type]->current_may_mount(); spin_unlock(&kobj_ns_type_lock); return may_mount; } struct ns_common *kobj_ns_grab_current(enum kobj_ns_type type) { struct ns_common *ns = NULL; spin_lock(&kobj_ns_type_lock); if (kobj_ns_type_is_valid(type) && kobj_ns_ops_tbl[type]) ns = kobj_ns_ops_tbl[type]->grab_current_ns(); spin_unlock(&kobj_ns_type_lock); return ns; } EXPORT_SYMBOL_GPL(kobj_ns_grab_current); void kobj_ns_drop(enum kobj_ns_type type, struct ns_common *ns) { spin_lock(&kobj_ns_type_lock); if (kobj_ns_type_is_valid(type) && kobj_ns_ops_tbl[type] && kobj_ns_ops_tbl[type]->drop_ns) kobj_ns_ops_tbl[type]->drop_ns(ns); spin_unlock(&kobj_ns_type_lock); } EXPORT_SYMBOL_GPL(kobj_ns_drop); |
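/*
 * Illustrative usage sketch (not part of kobject.c): a minimal client of the
 * API above that creates /sys/kernel/example/ with one read-only attribute.
 * "example_kobj", example_show() and the "example" names are hypothetical;
 * kernel_kobj, kobject_create_and_add(), sysfs_create_file(), sysfs_emit()
 * and kobject_put() are the existing interfaces being exercised.
 */
static struct kobject *example_kobj;

static ssize_t example_show(struct kobject *kobj, struct kobj_attribute *attr,
			    char *buf)
{
	return sysfs_emit(buf, "hello\n");
}

static struct kobj_attribute example_attribute = __ATTR_RO(example);

static int __init example_init(void)
{
	int error;

	/* creates the /sys/kernel/example/ directory */
	example_kobj = kobject_create_and_add("example", kernel_kobj);
	if (!example_kobj)
		return -ENOMEM;

	/* creates the /sys/kernel/example/example attribute file */
	error = sysfs_create_file(example_kobj, &example_attribute.attr);
	if (error)
		kobject_put(example_kobj);

	return error;
}

static void __exit example_exit(void)
{
	/* drops the last reference; dynamic_kobj_release() frees the kobject */
	kobject_put(example_kobj);
}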
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Force feedback support for Linux input subsystem * * Copyright (c) 2006 Anssi Hannula <anssi.hannula@gmail.com> * Copyright (c) 2006 Dmitry Torokhov <dtor@mail.ru> */ /* #define DEBUG */ #include <linux/export.h> #include <linux/input.h> #include <linux/limits.h> #include <linux/mutex.h> #include <linux/overflow.h> #include <linux/sched.h> #include <linux/slab.h> /* * Check that the effect_id is a valid effect and whether the user * is the owner */ static int check_effect_access(struct ff_device *ff, int effect_id, struct file *file) { if (effect_id < 0 || effect_id >= ff->max_effects || !ff->effect_owners[effect_id]) return -EINVAL; if (file && ff->effect_owners[effect_id] != file) return -EACCES; return 0; } /* * Checks whether 2 effects can be combined together */ static inline int check_effects_compatible(struct ff_effect *e1, struct ff_effect *e2) { return e1->type == e2->type && (e1->type != FF_PERIODIC || e1->u.periodic.waveform == e2->u.periodic.waveform); } /* * Convert an effect into compatible one */ static int compat_effect(struct ff_device *ff, struct ff_effect *effect) { int magnitude; switch (effect->type) { case FF_RUMBLE: if (!test_bit(FF_PERIODIC, ff->ffbit)) return -EINVAL; /* * calculate magnitude of sine wave as average of rumble's * 2/3 of strong magnitude and 1/3 of weak magnitude */ magnitude = effect->u.rumble.strong_magnitude / 3 + effect->u.rumble.weak_magnitude / 6; effect->type = FF_PERIODIC; effect->u.periodic.waveform = FF_SINE; effect->u.periodic.period = 50; effect->u.periodic.magnitude = magnitude; effect->u.periodic.offset = 0; effect->u.periodic.phase = 0; effect->u.periodic.envelope.attack_length = 0; effect->u.periodic.envelope.attack_level = 0; effect->u.periodic.envelope.fade_length = 0; effect->u.periodic.envelope.fade_level = 0; return 0; default: /* Let driver handle conversion */ return 0; } } /** * input_ff_upload() - upload effect into force-feedback device * @dev: input device * @effect: effect to be uploaded * @file: owner of the effect */ int input_ff_upload(struct input_dev *dev, struct ff_effect *effect, struct file *file) { struct ff_device *ff = dev->ff; struct ff_effect *old; int 
error; int id; if (!test_bit(EV_FF, dev->evbit)) return -ENOSYS; if (effect->type < FF_EFFECT_MIN || effect->type > FF_EFFECT_MAX || !test_bit(effect->type, dev->ffbit)) { dev_dbg(&dev->dev, "invalid or not supported effect type in upload\n"); return -EINVAL; } if (effect->type == FF_PERIODIC && (effect->u.periodic.waveform < FF_WAVEFORM_MIN || effect->u.periodic.waveform > FF_WAVEFORM_MAX || !test_bit(effect->u.periodic.waveform, dev->ffbit))) { dev_dbg(&dev->dev, "invalid or not supported wave form in upload\n"); return -EINVAL; } if (!test_bit(effect->type, ff->ffbit)) { error = compat_effect(ff, effect); if (error) return error; } guard(mutex)(&ff->mutex); if (effect->id == -1) { for (id = 0; id < ff->max_effects; id++) if (!ff->effect_owners[id]) break; if (id >= ff->max_effects) return -ENOSPC; effect->id = id; old = NULL; } else { id = effect->id; error = check_effect_access(ff, id, file); if (error) return error; old = &ff->effects[id]; if (!check_effects_compatible(effect, old)) return -EINVAL; } error = ff->upload(dev, effect, old); if (error) return error; scoped_guard(spinlock_irq, &dev->event_lock) { ff->effects[id] = *effect; ff->effect_owners[id] = file; } return 0; } EXPORT_SYMBOL_GPL(input_ff_upload); /* * Erases the effect if the requester is also the effect owner. The mutex * should already be locked before calling this function. */ static int erase_effect(struct input_dev *dev, int effect_id, struct file *file) { struct ff_device *ff = dev->ff; int error; error = check_effect_access(ff, effect_id, file); if (error) return error; scoped_guard(spinlock_irq, &dev->event_lock) { ff->playback(dev, effect_id, 0); ff->effect_owners[effect_id] = NULL; } if (ff->erase) { error = ff->erase(dev, effect_id); if (error) { scoped_guard(spinlock_irq, &dev->event_lock) ff->effect_owners[effect_id] = file; return error; } } return 0; } /** * input_ff_erase - erase a force-feedback effect from device * @dev: input device to erase effect from * @effect_id: id of the effect to be erased * @file: purported owner of the request * * This function erases a force-feedback effect from specified device. * The effect will only be erased if it was uploaded through the same * file handle that is requesting erase. */ int input_ff_erase(struct input_dev *dev, int effect_id, struct file *file) { struct ff_device *ff = dev->ff; if (!test_bit(EV_FF, dev->evbit)) return -ENOSYS; guard(mutex)(&ff->mutex); return erase_effect(dev, effect_id, file); } EXPORT_SYMBOL_GPL(input_ff_erase); /* * input_ff_flush - erase all effects owned by a file handle * @dev: input device to erase effect from * @file: purported owner of the effects * * This function erases all force-feedback effects associated with * the given owner from specified device. Note that @file may be %NULL, * in which case all effects will be erased. 
*/ int input_ff_flush(struct input_dev *dev, struct file *file) { struct ff_device *ff = dev->ff; int i; dev_dbg(&dev->dev, "flushing now\n"); guard(mutex)(&ff->mutex); for (i = 0; i < ff->max_effects; i++) erase_effect(dev, i, file); return 0; } EXPORT_SYMBOL_GPL(input_ff_flush); /** * input_ff_event() - generic handler for force-feedback events * @dev: input device to send the effect to * @type: event type (anything but EV_FF is ignored) * @code: event code * @value: event value */ int input_ff_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) { struct ff_device *ff = dev->ff; if (type != EV_FF) return 0; switch (code) { case FF_GAIN: if (!test_bit(FF_GAIN, dev->ffbit) || value > 0xffffU) break; ff->set_gain(dev, value); break; case FF_AUTOCENTER: if (!test_bit(FF_AUTOCENTER, dev->ffbit) || value > 0xffffU) break; ff->set_autocenter(dev, value); break; default: if (check_effect_access(ff, code, NULL) == 0) ff->playback(dev, code, value); break; } return 0; } EXPORT_SYMBOL_GPL(input_ff_event); /** * input_ff_create() - create force-feedback device * @dev: input device supporting force-feedback * @max_effects: maximum number of effects supported by the device * * This function allocates all necessary memory for a force feedback * portion of an input device and installs all default handlers. * @dev->ffbit should be already set up before calling this function. * Once ff device is created you need to setup its upload, erase, * playback and other handlers before registering input device */ int input_ff_create(struct input_dev *dev, unsigned int max_effects) { int i; if (!max_effects) { dev_err(&dev->dev, "cannot allocate device without any effects\n"); return -EINVAL; } if (max_effects > FF_MAX_EFFECTS) { dev_err(&dev->dev, "cannot allocate more than FF_MAX_EFFECTS effects\n"); return -EINVAL; } struct ff_device *ff __free(kfree) = kzalloc_flex(*ff, effect_owners, max_effects); if (!ff) return -ENOMEM; ff->effects = kzalloc_objs(*ff->effects, max_effects); if (!ff->effects) return -ENOMEM; ff->max_effects = max_effects; mutex_init(&ff->mutex); dev->flush = input_ff_flush; dev->event = input_ff_event; __set_bit(EV_FF, dev->evbit); /* Copy "true" bits into ff device bitmap */ for_each_set_bit(i, dev->ffbit, FF_CNT) __set_bit(i, ff->ffbit); /* we can emulate RUMBLE with periodic effects */ if (test_bit(FF_PERIODIC, ff->ffbit)) __set_bit(FF_RUMBLE, dev->ffbit); dev->ff = no_free_ptr(ff); return 0; } EXPORT_SYMBOL_GPL(input_ff_create); /** * input_ff_destroy() - frees force feedback portion of input device * @dev: input device supporting force feedback * * This function is only needed in error path as input core will * automatically free force feedback structures when device is * destroyed. */ void input_ff_destroy(struct input_dev *dev) { struct ff_device *ff = dev->ff; __clear_bit(EV_FF, dev->evbit); if (ff) { if (ff->destroy) ff->destroy(ff); kfree(ff->private); kfree(ff->effects); kfree(ff); dev->ff = NULL; } } EXPORT_SYMBOL_GPL(input_ff_destroy); |
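/*
 * Illustrative driver sketch (not part of ff-core.c): the shape of a driver
 * that uses input_ff_create() above. my_upload(), my_playback() and
 * my_ff_init() are hypothetical; the ff_device callback slots and
 * input_register_device() are the real interfaces a driver would fill in.
 */
static int my_upload(struct input_dev *dev, struct ff_effect *effect,
		     struct ff_effect *old)
{
	/* program the (possibly replaced) effect into the hardware */
	return 0;
}

static int my_playback(struct input_dev *dev, int effect_id, int value)
{
	/* start (value > 0) or stop (value == 0) the given effect */
	return 0;
}

static int my_ff_init(struct input_dev *dev)
{
	int error;

	/* declare which effect types the hardware supports */
	__set_bit(FF_RUMBLE, dev->ffbit);

	/* allocates dev->ff sized for up to four uploaded effects */
	error = input_ff_create(dev, 4);
	if (error)
		return error;

	dev->ff->upload = my_upload;
	dev->ff->playback = my_playback;

	return input_register_device(dev);
}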
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* Red Black Trees (C) 1999 Andrea Arcangeli <andrea@suse.de> linux/include/linux/rbtree.h To use rbtrees you'll have to implement your own insert and search cores. This avoids callbacks and the dramatic performance penalty they would incur. I know it's not the cleanest way, but in C (not in C++) this is how you get both performance and genericity... See Documentation/core-api/rbtree.rst for documentation and samples.
*/ #ifndef _LINUX_RBTREE_H #define _LINUX_RBTREE_H #include <linux/container_of.h> #include <linux/rbtree_types.h> #include <linux/stddef.h> #include <linux/rcupdate.h> #define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) #define rb_entry(ptr, type, member) container_of(ptr, type, member) #define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL) /* 'empty' nodes are nodes that are known not to be inserted in an rbtree */ #define RB_EMPTY_NODE(node) \ ((node)->__rb_parent_color == (unsigned long)(node)) #define RB_CLEAR_NODE(node) \ ((node)->__rb_parent_color = (unsigned long)(node)) #define RB_EMPTY_LINKED_NODE(lnode) RB_EMPTY_NODE(&(lnode)->node) #define RB_CLEAR_LINKED_NODE(lnode) ({ \ RB_CLEAR_NODE(&(lnode)->node); \ (lnode)->prev = (lnode)->next = NULL; \ }) extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); extern bool rb_erase_linked(struct rb_node_linked *, struct rb_root_linked *); /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); extern struct rb_node *rb_prev(const struct rb_node *); /* * This function returns the first node (in sort order) of the tree. */ static inline struct rb_node *rb_first(const struct rb_root *root) { struct rb_node *n; n = root->rb_node; if (!n) return NULL; while (n->rb_left) n = n->rb_left; return n; } /* * This function returns the last node (in sort order) of the tree. */ static inline struct rb_node *rb_last(const struct rb_root *root) { struct rb_node *n; n = root->rb_node; if (!n) return NULL; while (n->rb_right) n = n->rb_right; return n; } /* Postorder iteration - always visit the parent after its children */ extern struct rb_node *rb_first_postorder(const struct rb_root *); extern struct rb_node *rb_next_postorder(const struct rb_node *); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new, struct rb_root *root); static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; *rb_link = node; } static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; rcu_assign_pointer(*rb_link, node); } #define rb_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ ____ptr ? rb_entry(____ptr, type, member) : NULL; \ }) /** * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of * given type allowing the backing memory of @pos to be invalidated * * @pos: the 'type *' to use as a loop cursor. * @n: another 'type *' to use as temporary storage * @root: 'rb_root *' of the rbtree. * @field: the name of the rb_node field within 'type'. * * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as * list_for_each_entry_safe() and allows the iteration to continue independent * of changes to @pos by the body of the loop. * * Note, however, that it cannot handle other modifications that re-order the * rbtree it is iterating over. This includes calling rb_erase() on @pos, as * rb_erase() may rebalance the tree, causing us to miss some nodes. 
*/ #define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \ for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \ pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \ typeof(*pos), field); 1; }); \ pos = n) /* Same as rb_first(), but O(1) */ #define rb_first_cached(root) (root)->rb_leftmost static inline void rb_insert_color_cached(struct rb_node *node, struct rb_root_cached *root, bool leftmost) { if (leftmost) root->rb_leftmost = node; rb_insert_color(node, &root->rb_root); } static inline struct rb_node * rb_erase_cached(struct rb_node *node, struct rb_root_cached *root) { struct rb_node *leftmost = NULL; if (root->rb_leftmost == node) leftmost = root->rb_leftmost = rb_next(node); rb_erase(node, &root->rb_root); return leftmost; } static inline void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, struct rb_root_cached *root) { if (root->rb_leftmost == victim) root->rb_leftmost = new; rb_replace_node(victim, new, &root->rb_root); } /* * The below helper functions use 2 operators with 3 different * calling conventions. The operators are related like: * * comp(a->key,b) < 0 := less(a,b) * comp(a->key,b) > 0 := less(b,a) * comp(a->key,b) == 0 := !less(a,b) && !less(b,a) * * If these operators define a partial order on the elements we make no * guarantee on which of the elements matching the key is found. See * rb_find(). * * The reason for this is to allow the find() interface without requiring an * on-stack dummy object, which might not be feasible due to object size. */ /** * rb_add_cached() - insert @node into the leftmost cached tree @tree * @node: node to insert * @tree: leftmost cached tree to insert @node into * @less: operator defining the (partial) node order * * Returns @node when it is the new leftmost, or NULL. */ static __always_inline struct rb_node * rb_add_cached(struct rb_node *node, struct rb_root_cached *tree, bool (*less)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_root.rb_node; struct rb_node *parent = NULL; bool leftmost = true; while (*link) { parent = *link; if (less(node, parent)) { link = &parent->rb_left; } else { link = &parent->rb_right; leftmost = false; } } rb_link_node(node, parent, link); rb_insert_color_cached(node, tree, leftmost); return leftmost ? 
node : NULL; } static __always_inline void __rb_add(struct rb_node *node, struct rb_root *tree, bool (*less)(struct rb_node *, const struct rb_node *), void (*linkop)(struct rb_node *, struct rb_node *, struct rb_node **)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; while (*link) { parent = *link; if (less(node, parent)) link = &parent->rb_left; else link = &parent->rb_right; } linkop(node, parent, link); rb_link_node(node, parent, link); rb_insert_color(node, tree); } #define __node_2_linked_node(_n) \ rb_entry((_n), struct rb_node_linked, node) static inline void rb_link_linked_node(struct rb_node *node, struct rb_node *parent, struct rb_node **link) { if (!parent) return; struct rb_node_linked *nnew = __node_2_linked_node(node); struct rb_node_linked *npar = __node_2_linked_node(parent); if (link == &parent->rb_left) { nnew->prev = npar->prev; nnew->next = npar; npar->prev = nnew; if (nnew->prev) nnew->prev->next = nnew; } else { nnew->next = npar->next; nnew->prev = npar; npar->next = nnew; if (nnew->next) nnew->next->prev = nnew; } } /** * rb_add_linked() - insert @node into the leftmost linked tree @tree * @node: node to insert * @tree: linked tree to insert @node into * @less: operator defining the (partial) node order * * Returns @true when @node is the new leftmost, @false otherwise. */ static __always_inline bool rb_add_linked(struct rb_node_linked *node, struct rb_root_linked *tree, bool (*less)(struct rb_node *, const struct rb_node *)) { __rb_add(&node->node, &tree->rb_root, less, rb_link_linked_node); if (!node->prev) tree->rb_leftmost = node; return !node->prev; } /* Empty linkop function which is optimized away by the compiler */ static __always_inline void rb_link_noop(struct rb_node *n, struct rb_node *p, struct rb_node **l) { } /** * rb_add() - insert @node into @tree * @node: node to insert * @tree: tree to insert @node into * @less: operator defining the (partial) node order */ static __always_inline void rb_add(struct rb_node *node, struct rb_root *tree, bool (*less)(struct rb_node *, const struct rb_node *)) { __rb_add(node, tree, less, rb_link_noop); } /** * rb_find_add_cached() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert * @tree: tree to search / modify * @cmp: operator defining the node order * * Returns the rb_node matching @node, or NULL when no match is found and @node * is inserted. */ static __always_inline struct rb_node * rb_find_add_cached(struct rb_node *node, struct rb_root_cached *tree, int (*cmp)(const struct rb_node *new, const struct rb_node *exist)) { bool leftmost = true; struct rb_node **link = &tree->rb_root.rb_node; struct rb_node *parent = NULL; int c; while (*link) { parent = *link; c = cmp(node, parent); if (c < 0) { link = &parent->rb_left; } else if (c > 0) { link = &parent->rb_right; leftmost = false; } else { return parent; } } rb_link_node(node, parent, link); rb_insert_color_cached(node, tree, leftmost); return NULL; } /** * rb_find_add() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert * @tree: tree to search / modify * @cmp: operator defining the node order * * Returns the rb_node matching @node, or NULL when no match is found and @node * is inserted. 
*/ static __always_inline struct rb_node * rb_find_add(struct rb_node *node, struct rb_root *tree, int (*cmp)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; int c; while (*link) { parent = *link; c = cmp(node, parent); if (c < 0) link = &parent->rb_left; else if (c > 0) link = &parent->rb_right; else return parent; } rb_link_node(node, parent, link); rb_insert_color(node, tree); return NULL; } /** * rb_find_add_rcu() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert * @tree: tree to search / modify * @cmp: operator defining the node order * * Adds a Store-Release for link_node. * * Returns the rb_node matching @node, or NULL when no match is found and @node * is inserted. */ static __always_inline struct rb_node * rb_find_add_rcu(struct rb_node *node, struct rb_root *tree, int (*cmp)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; int c; while (*link) { parent = *link; c = cmp(node, parent); if (c < 0) link = &parent->rb_left; else if (c > 0) link = &parent->rb_right; else return parent; } rb_link_node_rcu(node, parent, link); rb_insert_color(node, tree); return NULL; } /** * rb_find() - find @key in tree @tree * @key: key to match * @tree: tree to search * @cmp: operator defining the node order * * Returns the rb_node matching @key or NULL. */ static __always_inline struct rb_node * rb_find(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; while (node) { int c = cmp(key, node); if (c < 0) node = node->rb_left; else if (c > 0) node = node->rb_right; else return node; } return NULL; } /** * rb_find_rcu() - find @key in tree @tree * @key: key to match * @tree: tree to search * @cmp: operator defining the node order * * Notably, tree descent vs concurrent tree rotations is unsound and can result * in false-negatives. * * Returns the rb_node matching @key or NULL. */ static __always_inline struct rb_node * rb_find_rcu(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; while (node) { int c = cmp(key, node); if (c < 0) node = rcu_dereference_raw(node->rb_left); else if (c > 0) node = rcu_dereference_raw(node->rb_right); else return node; } return NULL; } /** * rb_find_first() - find the first @key in @tree * @key: key to match * @tree: tree to search * @cmp: operator defining node order * * Returns the leftmost node matching @key, or NULL. */ static __always_inline struct rb_node * rb_find_first(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; struct rb_node *match = NULL; while (node) { int c = cmp(key, node); if (c <= 0) { if (!c) match = node; node = node->rb_left; } else if (c > 0) { node = node->rb_right; } } return match; } /** * rb_next_match() - find the next @key in @tree * @key: key to match * @tree: tree to search * @cmp: operator defining node order * * Returns the next node matching @key, or NULL. 
*/ static __always_inline struct rb_node * rb_next_match(const void *key, struct rb_node *node, int (*cmp)(const void *key, const struct rb_node *)) { node = rb_next(node); if (node && cmp(key, node)) node = NULL; return node; } /** * rb_for_each() - iterates a subtree matching @key * @node: iterator * @key: key to match * @tree: tree to search * @cmp: operator defining node order */ #define rb_for_each(node, key, tree, cmp) \ for ((node) = rb_find_first((key), (tree), (cmp)); \ (node); (node) = rb_next_match((key), (node), (cmp))) #endif /* _LINUX_RBTREE_H */
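/*
 * Illustrative usage sketch (not part of rbtree.h): rb_add()/rb_find() with a
 * containing structure keyed by an int. "struct thing", thing_less(),
 * thing_cmp_key() and thing_example() are hypothetical names; rb_add(),
 * rb_find(), rb_entry() and rb_erase() are the interfaces defined above.
 */
struct thing {
	int		key;
	struct rb_node	node;
};

/* 'less' operator used for insertion ordering */
static bool thing_less(struct rb_node *a, const struct rb_node *b)
{
	return rb_entry(a, struct thing, node)->key <
	       rb_entry(b, struct thing, node)->key;
}

/* 'cmp' operator used for key lookups */
static int thing_cmp_key(const void *key, const struct rb_node *n)
{
	int k = *(const int *)key;
	int nk = rb_entry(n, struct thing, node)->key;

	if (k < nk)
		return -1;
	if (k > nk)
		return 1;
	return 0;
}

static void thing_example(struct rb_root *root, struct thing *t)
{
	struct rb_node *n;

	/* descend using 'less', link the node, then rebalance */
	rb_add(&t->node, root, thing_less);

	/* find the node matching t->key and unlink it again */
	n = rb_find(&t->key, root, thing_cmp_key);
	if (n)
		rb_erase(n, root);
}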
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * internal.h - printk internal definitions */ #include <linux/console.h> #include <linux/types.h> #include <linux/sysctl.h>
#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) void __init printk_sysctl_init(void); int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); #else #define printk_sysctl_init() do { } while (0) #endif
#define con_printk(lvl, con, fmt, ...) \ printk(lvl pr_fmt("%s%sconsole [%s%d] " fmt), \ (con->flags & CON_NBCON) ? "" : "legacy ", \ (con->flags & CON_BOOT) ? "boot" : "", \ con->name, con->index, ##__VA_ARGS__)
/* * Identify if legacy printing is forced in a dedicated kthread. If * true, all printing via console lock occurs within a dedicated * legacy printer thread. The only exception is on panic, after the * nbcon consoles have had their chance to print the panic messages * first. */ #ifdef CONFIG_PREEMPT_RT # define force_legacy_kthread() (true) #else # define force_legacy_kthread() (false) #endif
#ifdef CONFIG_PRINTK #ifdef CONFIG_PRINTK_CALLER #define PRINTK_PREFIX_MAX 48 #else #define PRINTK_PREFIX_MAX 32 #endif /* * the maximum size of a formatted record (i.e. with prefix added * per line and dropped messages or in extended message format) */ #define PRINTK_MESSAGE_MAX 2048 /* the maximum size allowed to be reserved for a record */ #define PRINTKRB_RECORD_MAX 1024 /* Flags for a single printk record.
*/ enum printk_info_flags { /* always show on console, ignore console_loglevel */ LOG_FORCE_CON = 1, LOG_NEWLINE = 2, /* text ended with a newline */ LOG_CONT = 8, /* text is a fragment of a continuation line */ }; struct printk_ringbuffer; struct dev_printk_info; extern struct printk_ringbuffer *prb; extern bool printk_kthreads_running; extern bool printk_kthreads_ready; extern bool debug_non_panic_cpus; __printf(4, 0) int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args); __printf(1, 0) int vprintk_default(const char *fmt, va_list args); void __printk_safe_enter(void); void __printk_safe_exit(void); bool printk_percpu_data_ready(void); #define printk_safe_enter_irqsave(flags) \ do { \ local_irq_save(flags); \ __printk_safe_enter(); \ } while (0) #define printk_safe_exit_irqrestore(flags) \ do { \ __printk_safe_exit(); \ local_irq_restore(flags); \ } while (0) void defer_console_output(void); bool is_printk_legacy_deferred(void); bool is_printk_force_console(void); u16 printk_parse_prefix(const char *text, int *level, enum printk_info_flags *flags); void console_lock_spinning_enable(void); int console_lock_spinning_disable_and_check(int cookie); u64 nbcon_seq_read(struct console *con); void nbcon_seq_force(struct console *con, u64 seq); bool nbcon_alloc(struct console *con); void nbcon_free(struct console *con); enum nbcon_prio nbcon_get_default_prio(void); void nbcon_atomic_flush_pending(void); bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, int cookie, bool use_atomic); bool nbcon_kthread_create(struct console *con); void nbcon_kthread_stop(struct console *con); void nbcon_kthreads_wake(void); /** * nbcon_kthread_wake - Wake up a console printing thread * @con: Console to operate on */ static inline void nbcon_kthread_wake(struct console *con) { /* * Guarantee any new records can be seen by tasks preparing to wait * before this context checks if the rcuwait is empty. * * The full memory barrier in rcuwait_wake_up() pairs with the full * memory barrier within set_current_state() of * ___rcuwait_wait_event(), which is called after prepare_to_rcuwait() * adds the waiter but before it has checked the wait condition. * * This pairs with nbcon_kthread_func:A. */ rcuwait_wake_up(&con->rcuwait); /* LMM(nbcon_kthread_wake:A) */ } #else #define PRINTK_PREFIX_MAX 0 #define PRINTK_MESSAGE_MAX 0 #define PRINTKRB_RECORD_MAX 0 #define printk_kthreads_running (false) #define printk_kthreads_ready (false) /* * In !PRINTK builds we still export console_sem * semaphore and some of console functions (console_unlock()/etc.), so * printk-safe must preserve the existing local IRQ guarantees. 
*/ #define printk_safe_enter_irqsave(flags) local_irq_save(flags) #define printk_safe_exit_irqrestore(flags) local_irq_restore(flags) static inline bool printk_percpu_data_ready(void) { return false; } static inline void defer_console_output(void) { } static inline bool is_printk_legacy_deferred(void) { return false; } static inline u64 nbcon_seq_read(struct console *con) { return 0; } static inline void nbcon_seq_force(struct console *con, u64 seq) { } static inline bool nbcon_alloc(struct console *con) { return false; } static inline void nbcon_free(struct console *con) { } static inline enum nbcon_prio nbcon_get_default_prio(void) { return NBCON_PRIO_NONE; } static inline void nbcon_atomic_flush_pending(void) { } static inline bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, int cookie, bool use_atomic) { return false; } static inline void nbcon_kthread_wake(struct console *con) { } static inline void nbcon_kthreads_wake(void) { } #endif /* CONFIG_PRINTK */ extern bool have_boot_console; extern bool have_nbcon_console; extern bool have_legacy_console; extern bool legacy_allow_panic_sync; /** * struct console_flush_type - Define available console flush methods * @nbcon_atomic: Flush directly using nbcon_atomic() callback * @nbcon_offload: Offload flush to printer thread * @legacy_direct: Call the legacy loop in this context * @legacy_offload: Offload the legacy loop into IRQ or legacy thread * * Note that the legacy loop also flushes the nbcon consoles. */ struct console_flush_type { bool nbcon_atomic; bool nbcon_offload; bool legacy_direct; bool legacy_offload; }; extern bool console_irqwork_blocked; /* * Identify which console flushing methods should be used in the context of * the caller. */ static inline void printk_get_console_flush_type(struct console_flush_type *ft) { memset(ft, 0, sizeof(*ft)); switch (nbcon_get_default_prio()) { case NBCON_PRIO_NORMAL: if (have_nbcon_console && !have_boot_console) { if (printk_kthreads_running && !console_irqwork_blocked) ft->nbcon_offload = true; else ft->nbcon_atomic = true; } /* Legacy consoles are flushed directly when possible. */ if (have_legacy_console || have_boot_console) { if (!is_printk_legacy_deferred()) ft->legacy_direct = true; else if (!console_irqwork_blocked) ft->legacy_offload = true; } break; case NBCON_PRIO_EMERGENCY: if (have_nbcon_console && !have_boot_console) ft->nbcon_atomic = true; /* Legacy consoles are flushed directly when possible. */ if (have_legacy_console || have_boot_console) { if (!is_printk_legacy_deferred()) ft->legacy_direct = true; else if (!console_irqwork_blocked) ft->legacy_offload = true; } break; case NBCON_PRIO_PANIC: /* * In panic, the nbcon consoles will directly print. But * only allowed if there are no boot consoles. */ if (have_nbcon_console && !have_boot_console) ft->nbcon_atomic = true; if (have_legacy_console || have_boot_console) { /* * This is the same decision as NBCON_PRIO_NORMAL * except that offloading never occurs in panic. * * Note that console_flush_on_panic() will flush * legacy consoles anyway, even if unsafe. */ if (!is_printk_legacy_deferred()) ft->legacy_direct = true; /* * In panic, if nbcon atomic printing occurs, * the legacy consoles must remain silent until * explicitly allowed. */ if (ft->nbcon_atomic && !legacy_allow_panic_sync) ft->legacy_direct = false; } break; default: WARN_ON_ONCE(1); break; } } extern struct printk_buffers printk_shared_pbufs; /** * struct printk_buffers - Buffers to read/format/output printk messages. 
* @outbuf: After formatting, contains text to output. * @scratchbuf: Used as temporary ringbuffer reading and string-print space. */ struct printk_buffers { char outbuf[PRINTK_MESSAGE_MAX]; char scratchbuf[PRINTKRB_RECORD_MAX]; }; /** * struct printk_message - Container for a prepared printk message. * @pbufs: printk buffers used to prepare the message. * @outbuf_len: The length of prepared text in @pbufs->outbuf to output. This * does not count the terminator. A value of 0 means there is * nothing to output and this record should be skipped. * @seq: The sequence number of the record used for @pbufs->outbuf. * @dropped: The number of dropped records from reading @seq. * @cpu: CPU on which the message was generated. * @pid: PID of the task that generated the message. * @comm: Name of the task that generated the message. */ struct printk_message { struct printk_buffers *pbufs; unsigned int outbuf_len; u64 seq; unsigned long dropped; #ifdef CONFIG_PRINTK_EXECUTION_CTX int cpu; pid_t pid; char comm[TASK_COMM_LEN]; #endif }; bool printk_get_next_message(struct printk_message *pmsg, u64 seq, bool is_extended, bool may_suppress); #ifdef CONFIG_PRINTK void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped); void console_prepend_replay(struct printk_message *pmsg); #endif #ifdef CONFIG_SMP bool is_printk_cpu_sync_owner(void); #else static inline bool is_printk_cpu_sync_owner(void) { return false; } #endif
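/*
 * Example (not part of internal.h): an approximate, simplified sketch of how
 * a printk path consumes struct console_flush_type after calling
 * printk_get_console_flush_type() above. The real callers in printk.c add
 * handover and printk-context handling around this.
 */
static void example_flush_pending_records(void)
{
	struct console_flush_type ft;

	printk_get_console_flush_type(&ft);

	if (ft.nbcon_atomic)		/* print nbcon consoles from this context */
		nbcon_atomic_flush_pending();

	if (ft.nbcon_offload)		/* let the per-console kthreads print */
		nbcon_kthreads_wake();

	if (ft.legacy_direct) {		/* run the legacy console loop here */
		if (console_trylock())
			console_unlock();
	}

	if (ft.legacy_offload)		/* defer to irq_work / the legacy kthread */
		defer_console_output();
}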
/* SPDX-License-Identifier: GPL-2.0-only */ /* * Supervisor Mode Access Prevention support * * Copyright (C) 2012 Intel Corporation * Author: H. Peter Anvin <hpa@linux.intel.com> */ #ifndef _ASM_X86_SMAP_H #define _ASM_X86_SMAP_H #include <asm/nops.h> #include <asm/cpufeatures.h> #include <asm/alternative.h> #ifdef __ASSEMBLER__ #define ASM_CLAC \ ALTERNATIVE "", "clac", X86_FEATURE_SMAP #define ASM_STAC \ ALTERNATIVE "", "stac", X86_FEATURE_SMAP #else /* __ASSEMBLER__ */ /* * The CLAC/STAC instructions toggle the enforcement of * X86_FEATURE_SMAP along with X86_FEATURE_LASS. * * SMAP enforcement is based on the _PAGE_BIT_USER bit in the page * tables. The kernel is not allowed to touch pages with that bit set * unless the AC bit is set. * * Use stac()/clac() when accessing userspace (_PAGE_USER) mappings, * regardless of location. * * Note: a barrier is implicit in alternative(). */ static __always_inline void clac(void) { alternative("", "clac", X86_FEATURE_SMAP); } static __always_inline void stac(void) { alternative("", "stac", X86_FEATURE_SMAP); } /* * LASS enforcement is based on bit 63 of the virtual address. The * kernel is not allowed to touch memory in the lower half of the * virtual address space. * * Use lass_stac()/lass_clac() to toggle the AC bit for kernel data * accesses (!_PAGE_USER) that are blocked by LASS, but not by SMAP. * * Even with the AC bit set, LASS will continue to block instruction * fetches from the user half of the address space. To allow those, * clear CR4.LASS to disable the LASS mechanism entirely. * * Note: a barrier is implicit in alternative(). */ static __always_inline void lass_clac(void) { alternative("", "clac", X86_FEATURE_LASS); } static __always_inline void lass_stac(void) { alternative("", "stac", X86_FEATURE_LASS); } static __always_inline unsigned long smap_save(void) { unsigned long flags; asm volatile ("# smap_save\n\t" ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE "\n\t" "", "pushf; pop %0; clac", X86_FEATURE_SMAP) : "=rm" (flags) : : "memory", "cc"); return flags; } static __always_inline void smap_restore(unsigned long flags) { asm volatile ("# smap_restore\n\t" ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE "\n\t" "", "push %0; popf", X86_FEATURE_SMAP) : : "g" (flags) : "memory", "cc"); } /* These macros can be used in asm() statements */ #define ASM_CLAC \ ALTERNATIVE("", "clac", X86_FEATURE_SMAP) #define ASM_STAC \ ALTERNATIVE("", "stac", X86_FEATURE_SMAP) #define ASM_CLAC_UNSAFE \ ALTERNATIVE("", ANNOTATE_IGNORE_ALTERNATIVE "\n\t" "clac", X86_FEATURE_SMAP) #define ASM_STAC_UNSAFE \ ALTERNATIVE("", ANNOTATE_IGNORE_ALTERNATIVE "\n\t" "stac", X86_FEATURE_SMAP) #endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_SMAP_H */
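/*
 * Example usage (not part of smap.h): a minimal, hypothetical sketch of the
 * pairing rules for the helpers above. Real code normally reaches stac()/
 * clac() through the uaccess machinery (user_access_begin(), get_user(),
 * etc.) rather than calling them directly.
 */
static __always_inline void example_smap_usage(void)
{
	unsigned long flags;

	stac();		/* open the user-access window (set EFLAGS.AC) */
	/* ... touch user (_PAGE_USER) mappings via the uaccess helpers ... */
	clac();		/* close the window again as soon as possible */

	/*
	 * In a path that may run with AC in either state, preserve and
	 * restore whatever the caller had:
	 */
	flags = smap_save();	/* save EFLAGS, then clac */
	/* ... work that must not run with the user-access window open ... */
	smap_restore(flags);	/* put EFLAGS.AC back the way it was */
}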
// SPDX-License-Identifier: GPL-2.0-only #include "cgroup-internal.h" #include <linux/sched/cputime.h> #include <linux/bpf.h> #include <linux/btf.h> #include <linux/btf_ids.h> #include <trace/events/cgroup.h> static DEFINE_SPINLOCK(rstat_base_lock); static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list); static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu); /* * Determines whether a given css can participate in rstat. * css's that are cgroup::self use rstat for base stats.
* Other css's associated with a subsystem use rstat only when * they define the ss->css_rstat_flush callback. */ static inline bool css_uses_rstat(struct cgroup_subsys_state *css) { return css_is_self(css) || css->ss->css_rstat_flush != NULL; } static struct css_rstat_cpu *css_rstat_cpu( struct cgroup_subsys_state *css, int cpu) { return per_cpu_ptr(css->rstat_cpu, cpu); } static struct cgroup_rstat_base_cpu *cgroup_rstat_base_cpu( struct cgroup *cgrp, int cpu) { return per_cpu_ptr(cgrp->rstat_base_cpu, cpu); } static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss) { if (ss) return &ss->rstat_ss_lock; return &rstat_base_lock; } static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu) { if (ss) return per_cpu_ptr(ss->lhead, cpu); return per_cpu_ptr(&rstat_backlog_list, cpu); } /** * css_rstat_updated - keep track of updated rstat_cpu * @css: target cgroup subsystem state * @cpu: cpu on which rstat_cpu was updated * * Atomically inserts the css into the ss's llist for the given cpu. This is * reentrant-safe, i.e. safe against softirq, hardirq and nmi. The ss's llist * will be processed at flush time to create the update tree. * * NOTE: if the user needs the guarantee that the updater either adds itself to * the lockless list or the concurrent flusher flushes its updated stats, a * memory barrier is needed before the call to css_rstat_updated(), i.e. a * barrier after updating the per-cpu stats and before calling * css_rstat_updated(). */ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) { struct llist_head *lhead; struct css_rstat_cpu *rstatc; struct llist_node *self; /* * Since bpf programs can call this function, prevent access to * uninitialized rstat pointers. */ if (!css_uses_rstat(css)) return; lockdep_assert_preemption_disabled(); /* * For archs without NMI-safe cmpxchg or percpu ops support, ignore * requests from NMI context. */ if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) || !IS_ENABLED(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS)) && in_nmi()) return; rstatc = css_rstat_cpu(css, cpu); /* * If already on the list, return. This check is racy and smp_mb() is needed * to pair it with the smp_mb() in css_process_update_tree() if the * guarantee that the updated stats are visible to a concurrent flusher is * needed. */ if (llist_on_list(&rstatc->lnode)) return; /* * This function can be reentered by irqs and nmis for the same cgroup * and may try to insert the same per-cpu lnode into the llist. Note * that llist_add() does not protect against such scenarios. In addition, * this same per-cpu lnode can be modified through init_llist_node() * from css_rstat_flush() running on a different CPU. * * To protect against such stacked contexts of irqs/nmis, we use the * fact that lnode points to itself when not on a list and then use * try_cmpxchg() to atomically set it to NULL to select the winner * which will call llist_add(). The losers can assume the insertion is * successful and the winner will eventually add the per-cpu lnode to * the llist. * * Please note that we cannot use this_cpu_cmpxchg() here, as on some * archs it is not safe against modifications from multiple CPUs.
*/ self = &rstatc->lnode; if (!try_cmpxchg(&rstatc->lnode.next, &self, NULL)) return; lhead = ss_lhead_cpu(css->ss, cpu); llist_add(&rstatc->lnode, lhead); } static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu) { /* put @css and all ancestors on the corresponding updated lists */ while (true) { struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); struct cgroup_subsys_state *parent = css->parent; struct css_rstat_cpu *prstatc; /* * Both additions and removals are bottom-up. If a cgroup * is already in the tree, all ancestors are. */ if (rstatc->updated_next) break; /* Root has no parent to link it to, but mark it busy */ if (!parent) { rstatc->updated_next = css; break; } prstatc = css_rstat_cpu(parent, cpu); rstatc->updated_next = prstatc->updated_children; prstatc->updated_children = css; css = parent; } } static void css_process_update_tree(struct cgroup_subsys *ss, int cpu) { struct llist_head *lhead = ss_lhead_cpu(ss, cpu); struct llist_node *lnode; while ((lnode = llist_del_first_init(lhead))) { struct css_rstat_cpu *rstatc; /* * smp_mb() is needed here (more specifically in between * init_llist_node() and per-cpu stats flushing) if the * guarantee is required by a rstat user where either the * updater should add itself to the lockless list or the * flusher should flush the stats updated by an updater that has * observed that it is already on the list. The * corresponding barrier pair for this one should be before * css_rstat_updated() by the user. * * For now, there aren't any such users, so the barrier is not added * here, but if such a use case arises, please add * smp_mb() here. */ rstatc = container_of(lnode, struct css_rstat_cpu, lnode); __css_process_update_tree(rstatc->owner, cpu); } } /** * css_rstat_push_children - push children css's into the given list * @head: current head of the list (= subtree root) * @child: first child of the root * @cpu: target cpu * Return: A new singly linked list of css's to be flushed * * Iteratively traverse down the css_rstat_cpu updated tree level by * level and push all the parents first, before their next-level children, * into a singly linked list built via the rstat_flush_next pointer from the * tail backward, like "pushing" css's onto a stack. The root is pushed by * the caller. */ static struct cgroup_subsys_state *css_rstat_push_children( struct cgroup_subsys_state *head, struct cgroup_subsys_state *child, int cpu) { struct cgroup_subsys_state *cnext = child; /* Next head of child css level */ struct cgroup_subsys_state *ghead = NULL; /* Head of grandchild css level */ struct cgroup_subsys_state *parent, *grandchild; struct css_rstat_cpu *crstatc; child->rstat_flush_next = NULL; /* * The subsystem rstat lock must be held for the whole duration, from * here where the rstat_flush_next list is being constructed to when * it is consumed later in css_rstat_flush().
*/ lockdep_assert_held(ss_rstat_lock(head->ss)); /* * Notation: -> updated_next pointer * => rstat_flush_next pointer * * Assuming the following sample updated_children lists: * P: C1 -> C2 -> P * C1: G11 -> G12 -> C1 * C2: G21 -> G22 -> C2 * * After 1st iteration: * head => C2 => C1 => NULL * ghead => G21 => G11 => NULL * * After 2nd iteration: * head => G12 => G11 => G22 => G21 => C2 => C1 => NULL */ next_level: while (cnext) { child = cnext; cnext = child->rstat_flush_next; parent = child->parent; /* updated_next is parent cgroup terminated if !NULL */ while (child != parent) { child->rstat_flush_next = head; head = child; crstatc = css_rstat_cpu(child, cpu); grandchild = crstatc->updated_children; if (grandchild != child) { /* Push the grand child to the next level */ crstatc->updated_children = child; grandchild->rstat_flush_next = ghead; ghead = grandchild; } child = crstatc->updated_next; crstatc->updated_next = NULL; } } if (ghead) { cnext = ghead; ghead = NULL; goto next_level; } return head; } /** * css_rstat_updated_list - build a list of updated css's to be flushed * @root: root of the css subtree to traverse * @cpu: target cpu * Return: A singly linked list of css's to be flushed * * Walks the updated rstat_cpu tree on @cpu from @root. During traversal, * each returned css is unlinked from the updated tree. * * The only ordering guarantee is that, for a parent and a child pair * covered by a given traversal, the child is before its parent in * the list. * * Note that updated_children is self terminated and points to a list of * child css's if not empty. Whereas updated_next is like a sibling link * within the children list and terminated by the parent css. An exception * here is the css root whose updated_next can be self terminated. */ static struct cgroup_subsys_state *css_rstat_updated_list( struct cgroup_subsys_state *root, int cpu) { struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu); struct cgroup_subsys_state *head = NULL, *parent, *child; css_process_update_tree(root->ss, cpu); /* Return NULL if this subtree is not on-list */ if (!rstatc->updated_next) return NULL; /* * Unlink @root from its parent. As the updated_children list is * singly linked, we have to walk it to find the removal point. */ parent = root->parent; if (parent) { struct css_rstat_cpu *prstatc; struct cgroup_subsys_state **nextp; prstatc = css_rstat_cpu(parent, cpu); nextp = &prstatc->updated_children; while (*nextp != root) { struct css_rstat_cpu *nrstatc; nrstatc = css_rstat_cpu(*nextp, cpu); WARN_ON_ONCE(*nextp == parent); nextp = &nrstatc->updated_next; } *nextp = rstatc->updated_next; } rstatc->updated_next = NULL; /* Push @root to the list first before pushing the children */ head = root; root->rstat_flush_next = NULL; child = rstatc->updated_children; rstatc->updated_children = root; if (child != root) head = css_rstat_push_children(head, child, cpu); return head; } /* * A hook for bpf stat collectors to attach to and flush their stats. * Together with providing bpf kfuncs for css_rstat_updated() and * css_rstat_flush(), this enables a complete workflow where bpf progs that * collect cgroup stats can integrate with rstat for efficient flushing. * * A static noinline declaration here could cause the compiler to optimize away * the function. A global noinline declaration will keep the definition, but may * optimize away the callsite. Therefore, __weak is needed to ensure that the * call is still emitted, by telling the compiler that we don't know what the * function might eventually be. 
*/ __bpf_hook_start(); __weak noinline void bpf_rstat_flush(struct cgroup *cgrp, struct cgroup *parent, int cpu) { } __bpf_hook_end(); /* * Helper functions for locking. * * This makes it easier to diagnose locking issues and contention in * production environments. The parameter @cpu_in_loop indicates that the lock * was released and re-taken when collecting data from the CPUs. The * value -1 is used when obtaining the main lock; otherwise this is the CPU * number processed last. */ static inline void __css_rstat_lock(struct cgroup_subsys_state *css, int cpu_in_loop) __acquires(ss_rstat_lock(css->ss)) { struct cgroup *cgrp = css->cgroup; spinlock_t *lock; bool contended; lock = ss_rstat_lock(css->ss); contended = !spin_trylock_irq(lock); if (contended) { trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended); spin_lock_irq(lock); } trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended); } static inline void __css_rstat_unlock(struct cgroup_subsys_state *css, int cpu_in_loop) __releases(ss_rstat_lock(css->ss)) { struct cgroup *cgrp = css->cgroup; spinlock_t *lock; lock = ss_rstat_lock(css->ss); trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false); spin_unlock_irq(lock); } /** * css_rstat_flush - flush stats in @css's rstat subtree * @css: target cgroup subsystem state * * Collect all per-cpu stats in @css's subtree into the global counters * and propagate them upwards. After this function returns, all rstat * nodes in the subtree have up-to-date ->stat. * * This also gets all rstat nodes in the subtree including @css off the * ->updated_children lists. * * This function may block. */ __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css) { int cpu; bool is_self = css_is_self(css); /* * Since bpf programs can call this function, prevent access to * uninitialized rstat pointers.
*/ if (!css_uses_rstat(css)) return; might_sleep(); for_each_possible_cpu(cpu) { struct cgroup_subsys_state *pos; /* Reacquire for each CPU to avoid disabling IRQs too long */ __css_rstat_lock(css, cpu); pos = css_rstat_updated_list(css, cpu); for (; pos; pos = pos->rstat_flush_next) { if (is_self) { cgroup_base_stat_flush(pos->cgroup, cpu); bpf_rstat_flush(pos->cgroup, cgroup_parent(pos->cgroup), cpu); } else pos->ss->css_rstat_flush(pos, cpu); } __css_rstat_unlock(css, cpu); if (!cond_resched()) cpu_relax(); } } int css_rstat_init(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; int cpu; bool is_self = css_is_self(css); if (is_self) { /* the root cgrp has rstat_base_cpu preallocated */ if (!cgrp->rstat_base_cpu) { cgrp->rstat_base_cpu = alloc_percpu(struct cgroup_rstat_base_cpu); if (!cgrp->rstat_base_cpu) return -ENOMEM; } } else if (css->ss->css_rstat_flush == NULL) return 0; /* the root cgrp's self css has rstat_cpu preallocated */ if (!css->rstat_cpu) { css->rstat_cpu = alloc_percpu(struct css_rstat_cpu); if (!css->rstat_cpu) { if (is_self) free_percpu(cgrp->rstat_base_cpu); return -ENOMEM; } } /* ->updated_children list is self terminated */ for_each_possible_cpu(cpu) { struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); rstatc->owner = rstatc->updated_children = css; init_llist_node(&rstatc->lnode); if (is_self) { struct cgroup_rstat_base_cpu *rstatbc; rstatbc = cgroup_rstat_base_cpu(cgrp, cpu); u64_stats_init(&rstatbc->bsync); } } return 0; } void css_rstat_exit(struct cgroup_subsys_state *css) { int cpu; if (!css_uses_rstat(css)) return; if (!css->rstat_cpu) return; css_rstat_flush(css); /* sanity check */ for_each_possible_cpu(cpu) { struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); if (WARN_ON_ONCE(rstatc->updated_children != css) || WARN_ON_ONCE(rstatc->updated_next)) return; } if (css_is_self(css)) { struct cgroup *cgrp = css->cgroup; free_percpu(cgrp->rstat_base_cpu); cgrp->rstat_base_cpu = NULL; } free_percpu(css->rstat_cpu); css->rstat_cpu = NULL; } /** * ss_rstat_init - subsystem-specific rstat initialization * @ss: target subsystem * * If @ss is NULL, the static locks associated with the base stats * are initialized. If @ss is non-NULL, the subsystem-specific locks * are initialized. */ int __init ss_rstat_init(struct cgroup_subsys *ss) { int cpu; if (ss) { ss->lhead = alloc_percpu(struct llist_head); if (!ss->lhead) return -ENOMEM; } spin_lock_init(ss_rstat_lock(ss)); for_each_possible_cpu(cpu) init_llist_head(ss_lhead_cpu(ss, cpu)); return 0; } /* * Functions for cgroup basic resource statistics implemented on top of * rstat. 
*/ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat, struct cgroup_base_stat *src_bstat) { dst_bstat->cputime.utime += src_bstat->cputime.utime; dst_bstat->cputime.stime += src_bstat->cputime.stime; dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime; #ifdef CONFIG_SCHED_CORE dst_bstat->forceidle_sum += src_bstat->forceidle_sum; #endif dst_bstat->ntime += src_bstat->ntime; } static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, struct cgroup_base_stat *src_bstat) { dst_bstat->cputime.utime -= src_bstat->cputime.utime; dst_bstat->cputime.stime -= src_bstat->cputime.stime; dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime; #ifdef CONFIG_SCHED_CORE dst_bstat->forceidle_sum -= src_bstat->forceidle_sum; #endif dst_bstat->ntime -= src_bstat->ntime; } static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { struct cgroup_rstat_base_cpu *rstatbc = cgroup_rstat_base_cpu(cgrp, cpu); struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_rstat_base_cpu *prstatbc; struct cgroup_base_stat delta; unsigned seq; /* Root-level stats are sourced from system-wide CPU stats */ if (!parent) return; /* fetch the current per-cpu values */ do { seq = __u64_stats_fetch_begin(&rstatbc->bsync); delta = rstatbc->bstat; } while (__u64_stats_fetch_retry(&rstatbc->bsync, seq)); /* propagate per-cpu delta to cgroup and per-cpu global statistics */ cgroup_base_stat_sub(&delta, &rstatbc->last_bstat); cgroup_base_stat_add(&cgrp->bstat, &delta); cgroup_base_stat_add(&rstatbc->last_bstat, &delta); cgroup_base_stat_add(&rstatbc->subtree_bstat, &delta); /* propagate cgroup and per-cpu global delta to parent (unless that's root) */ if (cgroup_parent(parent)) { delta = cgrp->bstat; cgroup_base_stat_sub(&delta, &cgrp->last_bstat); cgroup_base_stat_add(&parent->bstat, &delta); cgroup_base_stat_add(&cgrp->last_bstat, &delta); delta = rstatbc->subtree_bstat; prstatbc = cgroup_rstat_base_cpu(parent, cpu); cgroup_base_stat_sub(&delta, &rstatbc->last_subtree_bstat); cgroup_base_stat_add(&prstatbc->subtree_bstat, &delta); cgroup_base_stat_add(&rstatbc->last_subtree_bstat, &delta); } } static struct cgroup_rstat_base_cpu * cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags) { struct cgroup_rstat_base_cpu *rstatbc; rstatbc = get_cpu_ptr(cgrp->rstat_base_cpu); *flags = u64_stats_update_begin_irqsave(&rstatbc->bsync); return rstatbc; } static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, struct cgroup_rstat_base_cpu *rstatbc, unsigned long flags) { u64_stats_update_end_irqrestore(&rstatbc->bsync, flags); css_rstat_updated(&cgrp->self, smp_processor_id()); put_cpu_ptr(rstatbc); } void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) { struct cgroup_rstat_base_cpu *rstatbc; unsigned long flags; rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); rstatbc->bstat.cputime.sum_exec_runtime += delta_exec; cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags); } void __cgroup_account_cputime_field(struct cgroup *cgrp, enum cpu_usage_stat index, u64 delta_exec) { struct cgroup_rstat_base_cpu *rstatbc; unsigned long flags; rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); switch (index) { case CPUTIME_NICE: rstatbc->bstat.ntime += delta_exec; fallthrough; case CPUTIME_USER: rstatbc->bstat.cputime.utime += delta_exec; break; case CPUTIME_SYSTEM: case CPUTIME_IRQ: case CPUTIME_SOFTIRQ: rstatbc->bstat.cputime.stime += delta_exec; break; #ifdef CONFIG_SCHED_CORE case 
CPUTIME_FORCEIDLE: rstatbc->bstat.forceidle_sum += delta_exec; break; #endif default: break; } cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags); } /* * compute the cputime for the root cgroup by getting the per cpu data * at a global level, then categorizing the fields in a manner consistent * with how it is done by __cgroup_account_cputime_field for each bit of * cpu time attributed to a cgroup. */ static void root_cgroup_cputime(struct cgroup_base_stat *bstat) { struct task_cputime *cputime = &bstat->cputime; int i; memset(bstat, 0, sizeof(*bstat)); for_each_possible_cpu(i) { struct kernel_cpustat kcpustat; u64 *cpustat = kcpustat.cpustat; u64 user = 0; u64 sys = 0; kcpustat_cpu_fetch(&kcpustat, i); user += cpustat[CPUTIME_USER]; user += cpustat[CPUTIME_NICE]; cputime->utime += user; sys += cpustat[CPUTIME_SYSTEM]; sys += cpustat[CPUTIME_IRQ]; sys += cpustat[CPUTIME_SOFTIRQ]; cputime->stime += sys; cputime->sum_exec_runtime += user; cputime->sum_exec_runtime += sys; #ifdef CONFIG_SCHED_CORE bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE]; #endif bstat->ntime += cpustat[CPUTIME_NICE]; } } static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat *bstat) { #ifdef CONFIG_SCHED_CORE u64 forceidle_time = bstat->forceidle_sum; do_div(forceidle_time, NSEC_PER_USEC); seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time); #endif } void cgroup_base_stat_cputime_show(struct seq_file *seq) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct cgroup_base_stat bstat; if (cgroup_parent(cgrp)) { css_rstat_flush(&cgrp->self); __css_rstat_lock(&cgrp->self, -1); bstat = cgrp->bstat; cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &bstat.cputime.utime, &bstat.cputime.stime); __css_rstat_unlock(&cgrp->self, -1); } else { root_cgroup_cputime(&bstat); } do_div(bstat.cputime.sum_exec_runtime, NSEC_PER_USEC); do_div(bstat.cputime.utime, NSEC_PER_USEC); do_div(bstat.cputime.stime, NSEC_PER_USEC); do_div(bstat.ntime, NSEC_PER_USEC); seq_printf(seq, "usage_usec %llu\n" "user_usec %llu\n" "system_usec %llu\n" "nice_usec %llu\n", bstat.cputime.sum_exec_runtime, bstat.cputime.utime, bstat.cputime.stime, bstat.ntime); cgroup_force_idle_show(seq, &bstat); } /* Add bpf kfuncs for css_rstat_updated() and css_rstat_flush() */ BTF_KFUNCS_START(bpf_rstat_kfunc_ids) BTF_ID_FLAGS(func, css_rstat_updated) BTF_ID_FLAGS(func, css_rstat_flush, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_rstat_kfunc_ids) static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = { .owner = THIS_MODULE, .set = &bpf_rstat_kfunc_ids, }; static int __init bpf_rstat_kfunc_init(void) { return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_rstat_kfunc_set); } late_initcall(bpf_rstat_kfunc_init); |
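/*
 * Example (not part of rstat.c): a hypothetical controller built on the rstat
 * API above. The hot path updates a per-cpu counter and calls
 * css_rstat_updated(); the ->css_rstat_flush() callback later folds the
 * per-cpu deltas into a global total. A reader would call
 * css_rstat_flush(&ecss->css) before reporting ->total. All names and fields
 * here are illustrative.
 */
struct example_css {
	struct cgroup_subsys_state css;
	u64 total;			/* aggregated at flush time */
	u64 __percpu *pcpu_count;	/* bumped on the hot path */
	u64 __percpu *pcpu_last;	/* value already folded into ->total */
};

static void example_charge(struct example_css *ecss, u64 amount)
{
	int cpu = get_cpu();	/* css_rstat_updated() needs preemption off */

	this_cpu_add(*ecss->pcpu_count, amount);
	css_rstat_updated(&ecss->css, cpu);	/* mark this css/cpu as updated */
	put_cpu();
}

/* Would be wired up as ss->css_rstat_flush in the cgroup_subsys definition. */
static void example_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
	struct example_css *ecss = container_of(css, struct example_css, css);
	u64 cur = *per_cpu_ptr(ecss->pcpu_count, cpu);
	u64 *last = per_cpu_ptr(ecss->pcpu_last, cpu);

	ecss->total += cur - *last;	/* fold in only the new delta */
	*last = cur;
}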
// SPDX-License-Identifier: GPL-2.0-or-later /* * SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions * * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2014 Red Hat Inc.
* Copyright 2025 Google LLC */ #include <crypto/hmac.h> #include <crypto/sha2.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/string.h> #include <linux/unaligned.h> #include <linux/wordpart.h> #include "fips.h" static const struct sha256_block_state sha224_iv = { .h = { SHA224_H0, SHA224_H1, SHA224_H2, SHA224_H3, SHA224_H4, SHA224_H5, SHA224_H6, SHA224_H7, }, }; static const struct sha256_ctx initial_sha256_ctx = { .ctx = { .state = { .h = { SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7, }, }, .bytecount = 0, }, }; #define sha256_iv (initial_sha256_ctx.ctx.state) static const u32 sha256_K[64] = { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, }; #define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) #define Maj(x, y, z) (((x) & (y)) | ((z) & ((x) | (y)))) #define e0(x) (ror32((x), 2) ^ ror32((x), 13) ^ ror32((x), 22)) #define e1(x) (ror32((x), 6) ^ ror32((x), 11) ^ ror32((x), 25)) #define s0(x) (ror32((x), 7) ^ ror32((x), 18) ^ ((x) >> 3)) #define s1(x) (ror32((x), 17) ^ ror32((x), 19) ^ ((x) >> 10)) static inline void LOAD_OP(int I, u32 *W, const u8 *input) { W[I] = get_unaligned_be32((__u32 *)input + I); } static inline void BLEND_OP(int I, u32 *W) { W[I] = s1(W[I - 2]) + W[I - 7] + s0(W[I - 15]) + W[I - 16]; } #define SHA256_ROUND(i, a, b, c, d, e, f, g, h) \ do { \ u32 t1, t2; \ t1 = h + e1(e) + Ch(e, f, g) + sha256_K[i] + W[i]; \ t2 = e0(a) + Maj(a, b, c); \ d += t1; \ h = t1 + t2; \ } while (0) static void sha256_block_generic(struct sha256_block_state *state, const u8 *input, u32 W[64]) { u32 a, b, c, d, e, f, g, h; int i; /* load the input */ for (i = 0; i < 16; i += 8) { LOAD_OP(i + 0, W, input); LOAD_OP(i + 1, W, input); LOAD_OP(i + 2, W, input); LOAD_OP(i + 3, W, input); LOAD_OP(i + 4, W, input); LOAD_OP(i + 5, W, input); LOAD_OP(i + 6, W, input); LOAD_OP(i + 7, W, input); } /* now blend */ for (i = 16; i < 64; i += 8) { BLEND_OP(i + 0, W); BLEND_OP(i + 1, W); BLEND_OP(i + 2, W); BLEND_OP(i + 3, W); BLEND_OP(i + 4, W); BLEND_OP(i + 5, W); BLEND_OP(i + 6, W); BLEND_OP(i + 7, W); } /* load the state into our registers */ a = state->h[0]; b = state->h[1]; c = state->h[2]; d = state->h[3]; e = state->h[4]; f = state->h[5]; g = state->h[6]; h = state->h[7]; /* now iterate */ for (i = 0; i < 64; i += 8) { SHA256_ROUND(i + 0, a, b, c, d, e, f, g, h); SHA256_ROUND(i + 1, h, a, b, c, d, e, f, g); SHA256_ROUND(i + 2, g, h, a, b, c, d, e, f); SHA256_ROUND(i + 3, f, g, h, a, b, c, d, e); SHA256_ROUND(i + 4, e, f, g, h, a, b, c, d); SHA256_ROUND(i + 5, d, e, f, g, h, a, b, c); SHA256_ROUND(i + 6, c, d, e, f, g, h, a, b); SHA256_ROUND(i + 7, b, c, d, e, f, g, h, a); } state->h[0] += a; state->h[1] += b; state->h[2] += c; state->h[3] += d; state->h[4] += e; state->h[5] += f; state->h[6] += g; state->h[7] 
+= h; } static void __maybe_unused sha256_blocks_generic(struct sha256_block_state *state, const u8 *data, size_t nblocks) { u32 W[64]; do { sha256_block_generic(state, data, W); data += SHA256_BLOCK_SIZE; } while (--nblocks); memzero_explicit(W, sizeof(W)); } #if defined(CONFIG_CRYPTO_LIB_SHA256_ARCH) && !defined(__DISABLE_EXPORTS) #include "sha256.h" /* $(SRCARCH)/sha256.h */ #else #define sha256_blocks sha256_blocks_generic #endif static void __sha256_init(struct __sha256_ctx *ctx, const struct sha256_block_state *iv, u64 initial_bytecount) { ctx->state = *iv; ctx->bytecount = initial_bytecount; } void sha224_init(struct sha224_ctx *ctx) { __sha256_init(&ctx->ctx, &sha224_iv, 0); } EXPORT_SYMBOL_GPL(sha224_init); void sha256_init(struct sha256_ctx *ctx) { __sha256_init(&ctx->ctx, &sha256_iv, 0); } EXPORT_SYMBOL_GPL(sha256_init); void __sha256_update(struct __sha256_ctx *ctx, const u8 *data, size_t len) { size_t partial = ctx->bytecount % SHA256_BLOCK_SIZE; ctx->bytecount += len; if (partial + len >= SHA256_BLOCK_SIZE) { size_t nblocks; if (partial) { size_t l = SHA256_BLOCK_SIZE - partial; memcpy(&ctx->buf[partial], data, l); data += l; len -= l; sha256_blocks(&ctx->state, ctx->buf, 1); } nblocks = len / SHA256_BLOCK_SIZE; len %= SHA256_BLOCK_SIZE; if (nblocks) { sha256_blocks(&ctx->state, data, nblocks); data += nblocks * SHA256_BLOCK_SIZE; } partial = 0; } if (len) memcpy(&ctx->buf[partial], data, len); } EXPORT_SYMBOL(__sha256_update); static void __sha256_final(struct __sha256_ctx *ctx, u8 *out, size_t digest_size) { u64 bitcount = ctx->bytecount << 3; size_t partial = ctx->bytecount % SHA256_BLOCK_SIZE; ctx->buf[partial++] = 0x80; if (partial > SHA256_BLOCK_SIZE - 8) { memset(&ctx->buf[partial], 0, SHA256_BLOCK_SIZE - partial); sha256_blocks(&ctx->state, ctx->buf, 1); partial = 0; } memset(&ctx->buf[partial], 0, SHA256_BLOCK_SIZE - 8 - partial); *(__be64 *)&ctx->buf[SHA256_BLOCK_SIZE - 8] = cpu_to_be64(bitcount); sha256_blocks(&ctx->state, ctx->buf, 1); for (size_t i = 0; i < digest_size; i += 4) put_unaligned_be32(ctx->state.h[i / 4], out + i); } void sha224_final(struct sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]) { __sha256_final(&ctx->ctx, out, SHA224_DIGEST_SIZE); memzero_explicit(ctx, sizeof(*ctx)); } EXPORT_SYMBOL(sha224_final); void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]) { __sha256_final(&ctx->ctx, out, SHA256_DIGEST_SIZE); memzero_explicit(ctx, sizeof(*ctx)); } EXPORT_SYMBOL(sha256_final); void sha224(const u8 *data, size_t len, u8 out[SHA224_DIGEST_SIZE]) { struct sha224_ctx ctx; sha224_init(&ctx); sha224_update(&ctx, data, len); sha224_final(&ctx, out); } EXPORT_SYMBOL(sha224); void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]) { struct sha256_ctx ctx; sha256_init(&ctx); sha256_update(&ctx, data, len); sha256_final(&ctx, out); } EXPORT_SYMBOL(sha256); /* * Pre-boot environments (as indicated by __DISABLE_EXPORTS being defined) just * need the generic SHA-256 code. Omit all other features from them. 
*/ #ifndef __DISABLE_EXPORTS #ifndef sha256_finup_2x_arch static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx, const u8 *data1, const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE]) { return false; } static bool sha256_finup_2x_is_optimized_arch(void) { return false; } #endif /* Sequential fallback implementation of sha256_finup_2x() */ static noinline_for_stack void sha256_finup_2x_sequential( const struct __sha256_ctx *ctx, const u8 *data1, const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE]) { struct __sha256_ctx mut_ctx; mut_ctx = *ctx; __sha256_update(&mut_ctx, data1, len); __sha256_final(&mut_ctx, out1, SHA256_DIGEST_SIZE); mut_ctx = *ctx; __sha256_update(&mut_ctx, data2, len); __sha256_final(&mut_ctx, out2, SHA256_DIGEST_SIZE); } void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1, const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE]) { if (ctx == NULL) ctx = &initial_sha256_ctx; if (likely(sha256_finup_2x_arch(&ctx->ctx, data1, data2, len, out1, out2))) return; sha256_finup_2x_sequential(&ctx->ctx, data1, data2, len, out1, out2); } EXPORT_SYMBOL_GPL(sha256_finup_2x); bool sha256_finup_2x_is_optimized(void) { return sha256_finup_2x_is_optimized_arch(); } EXPORT_SYMBOL_GPL(sha256_finup_2x_is_optimized); static void __hmac_sha256_preparekey(struct sha256_block_state *istate, struct sha256_block_state *ostate, const u8 *raw_key, size_t raw_key_len, const struct sha256_block_state *iv) { union { u8 b[SHA256_BLOCK_SIZE]; unsigned long w[SHA256_BLOCK_SIZE / sizeof(unsigned long)]; } derived_key = { 0 }; if (unlikely(raw_key_len > SHA256_BLOCK_SIZE)) { if (iv == &sha224_iv) sha224(raw_key, raw_key_len, derived_key.b); else sha256(raw_key, raw_key_len, derived_key.b); } else { memcpy(derived_key.b, raw_key, raw_key_len); } for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE); *istate = *iv; sha256_blocks(istate, derived_key.b, 1); for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^ HMAC_IPAD_VALUE); *ostate = *iv; sha256_blocks(ostate, derived_key.b, 1); memzero_explicit(&derived_key, sizeof(derived_key)); } void hmac_sha224_preparekey(struct hmac_sha224_key *key, const u8 *raw_key, size_t raw_key_len) { __hmac_sha256_preparekey(&key->key.istate, &key->key.ostate, raw_key, raw_key_len, &sha224_iv); } EXPORT_SYMBOL_GPL(hmac_sha224_preparekey); void hmac_sha256_preparekey(struct hmac_sha256_key *key, const u8 *raw_key, size_t raw_key_len) { __hmac_sha256_preparekey(&key->key.istate, &key->key.ostate, raw_key, raw_key_len, &sha256_iv); } EXPORT_SYMBOL_GPL(hmac_sha256_preparekey); void __hmac_sha256_init(struct __hmac_sha256_ctx *ctx, const struct __hmac_sha256_key *key) { __sha256_init(&ctx->sha_ctx, &key->istate, SHA256_BLOCK_SIZE); ctx->ostate = key->ostate; } EXPORT_SYMBOL_GPL(__hmac_sha256_init); void hmac_sha224_init_usingrawkey(struct hmac_sha224_ctx *ctx, const u8 *raw_key, size_t raw_key_len) { __hmac_sha256_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate, raw_key, raw_key_len, &sha224_iv); ctx->ctx.sha_ctx.bytecount = SHA256_BLOCK_SIZE; } EXPORT_SYMBOL_GPL(hmac_sha224_init_usingrawkey); void hmac_sha256_init_usingrawkey(struct hmac_sha256_ctx *ctx, const u8 *raw_key, size_t raw_key_len) { __hmac_sha256_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate, raw_key, raw_key_len, &sha256_iv); ctx->ctx.sha_ctx.bytecount = SHA256_BLOCK_SIZE; } 
EXPORT_SYMBOL_GPL(hmac_sha256_init_usingrawkey); static void __hmac_sha256_final(struct __hmac_sha256_ctx *ctx, u8 *out, size_t digest_size) { /* Generate the padded input for the outer hash in ctx->sha_ctx.buf. */ __sha256_final(&ctx->sha_ctx, ctx->sha_ctx.buf, digest_size); memset(&ctx->sha_ctx.buf[digest_size], 0, SHA256_BLOCK_SIZE - digest_size); ctx->sha_ctx.buf[digest_size] = 0x80; *(__be32 *)&ctx->sha_ctx.buf[SHA256_BLOCK_SIZE - 4] = cpu_to_be32(8 * (SHA256_BLOCK_SIZE + digest_size)); /* Compute the outer hash, which gives the HMAC value. */ sha256_blocks(&ctx->ostate, ctx->sha_ctx.buf, 1); for (size_t i = 0; i < digest_size; i += 4) put_unaligned_be32(ctx->ostate.h[i / 4], out + i); memzero_explicit(ctx, sizeof(*ctx)); } void hmac_sha224_final(struct hmac_sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]) { __hmac_sha256_final(&ctx->ctx, out, SHA224_DIGEST_SIZE); } EXPORT_SYMBOL_GPL(hmac_sha224_final); void hmac_sha256_final(struct hmac_sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]) { __hmac_sha256_final(&ctx->ctx, out, SHA256_DIGEST_SIZE); } EXPORT_SYMBOL_GPL(hmac_sha256_final); void hmac_sha224(const struct hmac_sha224_key *key, const u8 *data, size_t data_len, u8 out[SHA224_DIGEST_SIZE]) { struct hmac_sha224_ctx ctx; hmac_sha224_init(&ctx, key); hmac_sha224_update(&ctx, data, data_len); hmac_sha224_final(&ctx, out); } EXPORT_SYMBOL_GPL(hmac_sha224); void hmac_sha256(const struct hmac_sha256_key *key, const u8 *data, size_t data_len, u8 out[SHA256_DIGEST_SIZE]) { struct hmac_sha256_ctx ctx; hmac_sha256_init(&ctx, key); hmac_sha256_update(&ctx, data, data_len); hmac_sha256_final(&ctx, out); } EXPORT_SYMBOL_GPL(hmac_sha256); void hmac_sha224_usingrawkey(const u8 *raw_key, size_t raw_key_len, const u8 *data, size_t data_len, u8 out[SHA224_DIGEST_SIZE]) { struct hmac_sha224_ctx ctx; hmac_sha224_init_usingrawkey(&ctx, raw_key, raw_key_len); hmac_sha224_update(&ctx, data, data_len); hmac_sha224_final(&ctx, out); } EXPORT_SYMBOL_GPL(hmac_sha224_usingrawkey); void hmac_sha256_usingrawkey(const u8 *raw_key, size_t raw_key_len, const u8 *data, size_t data_len, u8 out[SHA256_DIGEST_SIZE]) { struct hmac_sha256_ctx ctx; hmac_sha256_init_usingrawkey(&ctx, raw_key, raw_key_len); hmac_sha256_update(&ctx, data, data_len); hmac_sha256_final(&ctx, out); } EXPORT_SYMBOL_GPL(hmac_sha256_usingrawkey); #if defined(sha256_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS) static int __init sha256_mod_init(void) { #ifdef sha256_mod_init_arch sha256_mod_init_arch(); #endif if (fips_enabled) { /* * FIPS cryptographic algorithm self-test. As per the FIPS * Implementation Guidance, testing HMAC-SHA256 satisfies the * test requirement for SHA-224, SHA-256, and HMAC-SHA224 too. */ u8 mac[SHA256_DIGEST_SIZE]; hmac_sha256_usingrawkey(fips_test_key, sizeof(fips_test_key), fips_test_data, sizeof(fips_test_data), mac); if (memcmp(fips_test_hmac_sha256_value, mac, sizeof(mac)) != 0) panic("sha256: FIPS self-test failed\n"); } return 0; } subsys_initcall(sha256_mod_init); static void __exit sha256_mod_exit(void) { } module_exit(sha256_mod_exit); #endif #endif /* !__DISABLE_EXPORTS */ MODULE_DESCRIPTION("SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions"); MODULE_LICENSE("GPL"); |
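/*
 * Example usage (not part of this file): a short sketch of the library
 * interfaces implemented above, assuming the declarations in <crypto/sha2.h>.
 * Buffer names are illustrative.
 */
static void example_sha256_usage(const u8 *msg, size_t len,
				 const u8 *key, size_t key_len)
{
	struct sha256_ctx ctx;
	u8 digest[SHA256_DIGEST_SIZE];
	u8 mac[SHA256_DIGEST_SIZE];

	/* One-shot hash. */
	sha256(msg, len, digest);

	/* Incremental hash of the same data, fed in two chunks. */
	sha256_init(&ctx);
	sha256_update(&ctx, msg, len / 2);
	sha256_update(&ctx, msg + len / 2, len - len / 2);
	sha256_final(&ctx, digest);	/* also wipes the context */

	/* One-shot HMAC-SHA256 keyed with a raw key. */
	hmac_sha256_usingrawkey(key, key_len, msg, len, mac);
}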
1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 | // SPDX-License-Identifier: GPL-2.0-or-later /* Userspace key control operations * * Copyright (C) 2004-5 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/init.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/slab.h> #include <linux/syscalls.h> #include <linux/key.h> #include <linux/keyctl.h> #include <linux/fs.h> #include <linux/capability.h> #include <linux/cred.h> #include <linux/string.h> #include <linux/err.h> #include <linux/vmalloc.h> #include <linux/security.h> #include <linux/uio.h> #include <linux/uaccess.h> #include <keys/request_key_auth-type.h> #include "internal.h" #define KEY_MAX_DESC_SIZE 4096 static const unsigned char keyrings_capabilities[2] = { [0] = (KEYCTL_CAPS0_CAPABILITIES | (IS_ENABLED(CONFIG_PERSISTENT_KEYRINGS) ? KEYCTL_CAPS0_PERSISTENT_KEYRINGS : 0) | (IS_ENABLED(CONFIG_KEY_DH_OPERATIONS) ? KEYCTL_CAPS0_DIFFIE_HELLMAN : 0) | (IS_ENABLED(CONFIG_ASYMMETRIC_KEY_TYPE) ? KEYCTL_CAPS0_PUBLIC_KEY : 0) | (IS_ENABLED(CONFIG_BIG_KEYS) ? KEYCTL_CAPS0_BIG_KEY : 0) | KEYCTL_CAPS0_INVALIDATE | KEYCTL_CAPS0_RESTRICT_KEYRING | KEYCTL_CAPS0_MOVE ), [1] = (KEYCTL_CAPS1_NS_KEYRING_NAME | KEYCTL_CAPS1_NS_KEY_TAG | (IS_ENABLED(CONFIG_KEY_NOTIFICATIONS) ? 
KEYCTL_CAPS1_NOTIFICATIONS : 0) ), }; static int key_get_type_from_user(char *type, const char __user *_type, unsigned len) { int ret; ret = strncpy_from_user(type, _type, len); if (ret < 0) return ret; if (ret == 0 || ret >= len) return -EINVAL; if (type[0] == '.') return -EPERM; type[len - 1] = '\0'; return 0; } /* * Extract the description of a new key from userspace and either add it as a * new key to the specified keyring or update a matching key in that keyring. * * If the description is NULL or an empty string, the key type is asked to * generate one from the payload. * * The keyring must be writable so that we can attach the key to it. * * If successful, the new key's serial number is returned, otherwise an error * code is returned. */ SYSCALL_DEFINE5(add_key, const char __user *, _type, const char __user *, _description, const void __user *, _payload, size_t, plen, key_serial_t, ringid) { key_ref_t keyring_ref, key_ref; char type[32], *description; void *payload; long ret; ret = -EINVAL; if (plen > 1024 * 1024 - 1) goto error; /* draw all the data into kernel space */ ret = key_get_type_from_user(type, _type, sizeof(type)); if (ret < 0) goto error; description = NULL; if (_description) { description = strndup_user(_description, KEY_MAX_DESC_SIZE); if (IS_ERR(description)) { ret = PTR_ERR(description); goto error; } if (!*description) { kfree(description); description = NULL; } else if ((description[0] == '.') && (strncmp(type, "keyring", 7) == 0)) { ret = -EPERM; goto error2; } } /* pull the payload in if one was supplied */ payload = NULL; if (plen) { ret = -ENOMEM; payload = kvmalloc(plen, GFP_KERNEL); if (!payload) goto error2; ret = -EFAULT; if (copy_from_user(payload, _payload, plen) != 0) goto error3; } /* find the target keyring (which must be writable) */ keyring_ref = lookup_user_key(ringid, KEY_LOOKUP_CREATE, KEY_NEED_WRITE); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); goto error3; } /* create or update the requested key and add it to the target * keyring */ key_ref = key_create_or_update(keyring_ref, type, description, payload, plen, KEY_PERM_UNDEF, KEY_ALLOC_IN_QUOTA); if (!IS_ERR(key_ref)) { ret = key_ref_to_ptr(key_ref)->serial; key_ref_put(key_ref); } else { ret = PTR_ERR(key_ref); } key_ref_put(keyring_ref); error3: kvfree_sensitive(payload, plen); error2: kfree(description); error: return ret; } /* * Search the process keyrings and keyring trees linked from those for a * matching key. Keyrings must have appropriate Search permission to be * searched. * * If a key is found, it will be attached to the destination keyring if there's * one specified and the serial number of the key will be returned. * * If no key is found, /sbin/request-key will be invoked if _callout_info is * non-NULL in an attempt to create a key. The _callout_info string will be * passed to /sbin/request-key to aid with completing the request. If the * _callout_info string is "" then it will be changed to "-". 
*/ SYSCALL_DEFINE4(request_key, const char __user *, _type, const char __user *, _description, const char __user *, _callout_info, key_serial_t, destringid) { struct key_type *ktype; struct key *key; key_ref_t dest_ref; size_t callout_len; char type[32], *description, *callout_info; long ret; /* pull the type into kernel space */ ret = key_get_type_from_user(type, _type, sizeof(type)); if (ret < 0) goto error; /* pull the description into kernel space */ description = strndup_user(_description, KEY_MAX_DESC_SIZE); if (IS_ERR(description)) { ret = PTR_ERR(description); goto error; } /* pull the callout info into kernel space */ callout_info = NULL; callout_len = 0; if (_callout_info) { callout_info = strndup_user(_callout_info, PAGE_SIZE); if (IS_ERR(callout_info)) { ret = PTR_ERR(callout_info); goto error2; } callout_len = strlen(callout_info); } /* get the destination keyring if specified */ dest_ref = NULL; if (destringid) { dest_ref = lookup_user_key(destringid, KEY_LOOKUP_CREATE, KEY_NEED_WRITE); if (IS_ERR(dest_ref)) { ret = PTR_ERR(dest_ref); goto error3; } } /* find the key type */ ktype = key_type_lookup(type); if (IS_ERR(ktype)) { ret = PTR_ERR(ktype); goto error4; } /* do the search */ key = request_key_and_link(ktype, description, NULL, callout_info, callout_len, NULL, key_ref_to_ptr(dest_ref), KEY_ALLOC_IN_QUOTA); if (IS_ERR(key)) { ret = PTR_ERR(key); goto error5; } /* wait for the key to finish being constructed */ ret = wait_for_key_construction(key, 1); if (ret < 0) goto error6; ret = key->serial; error6: key_put(key); error5: key_type_put(ktype); error4: key_ref_put(dest_ref); error3: kfree(callout_info); error2: kfree(description); error: return ret; } /* * Get the ID of the specified process keyring. * * The requested keyring must have search permission to be found. * * If successful, the ID of the requested keyring will be returned. */ long keyctl_get_keyring_ID(key_serial_t id, int create) { key_ref_t key_ref; unsigned long lflags; long ret; lflags = create ? KEY_LOOKUP_CREATE : 0; key_ref = lookup_user_key(id, lflags, KEY_NEED_SEARCH); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error; } ret = key_ref_to_ptr(key_ref)->serial; key_ref_put(key_ref); error: return ret; } /* * Join a (named) session keyring. * * Create and join an anonymous session keyring or join a named session * keyring, creating it if necessary. A named session keyring must have Search * permission for it to be joined. Session keyrings without this permit will * be skipped over. It is not permitted for userspace to create or join * keyrings whose name begin with a dot. * * If successful, the ID of the joined session keyring will be returned. */ long keyctl_join_session_keyring(const char __user *_name) { char *name; long ret; /* fetch the name from userspace */ name = NULL; if (_name) { name = strndup_user(_name, KEY_MAX_DESC_SIZE); if (IS_ERR(name)) { ret = PTR_ERR(name); goto error; } ret = -EPERM; if (name[0] == '.') goto error_name; } /* join the session */ ret = join_session_keyring(name); error_name: kfree(name); error: return ret; } /* * Update a key's data payload from the given data. * * The key must grant the caller Write permission and the key type must support * updating for this to work. A negative key can be positively instantiated * with this call. * * If successful, 0 will be returned. If the key type does not support * updating, then -EOPNOTSUPP will be returned. 
*/ long keyctl_update_key(key_serial_t id, const void __user *_payload, size_t plen) { key_ref_t key_ref; void *payload; long ret; ret = -EINVAL; if (plen > PAGE_SIZE) goto error; /* pull the payload in if one was supplied */ payload = NULL; if (plen) { ret = -ENOMEM; payload = kvmalloc(plen, GFP_KERNEL); if (!payload) goto error; ret = -EFAULT; if (copy_from_user(payload, _payload, plen) != 0) goto error2; } /* find the target key (which must be writable) */ key_ref = lookup_user_key(id, 0, KEY_NEED_WRITE); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error2; } /* update the key */ ret = key_update(key_ref, payload, plen); key_ref_put(key_ref); error2: kvfree_sensitive(payload, plen); error: return ret; } /* * Revoke a key. * * The key must be grant the caller Write or Setattr permission for this to * work. The key type should give up its quota claim when revoked. The key * and any links to the key will be automatically garbage collected after a * certain amount of time (/proc/sys/kernel/keys/gc_delay). * * Keys with KEY_FLAG_KEEP set should not be revoked. * * If successful, 0 is returned. */ long keyctl_revoke_key(key_serial_t id) { key_ref_t key_ref; struct key *key; long ret; key_ref = lookup_user_key(id, 0, KEY_NEED_WRITE); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); if (ret != -EACCES) goto error; key_ref = lookup_user_key(id, 0, KEY_NEED_SETATTR); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error; } } key = key_ref_to_ptr(key_ref); ret = 0; if (test_bit(KEY_FLAG_KEEP, &key->flags)) ret = -EPERM; else key_revoke(key); key_ref_put(key_ref); error: return ret; } /* * Invalidate a key. * * The key must be grant the caller Invalidate permission for this to work. * The key and any links to the key will be automatically garbage collected * immediately. * * Keys with KEY_FLAG_KEEP set should not be invalidated. * * If successful, 0 is returned. */ long keyctl_invalidate_key(key_serial_t id) { key_ref_t key_ref; struct key *key; long ret; kenter("%d", id); key_ref = lookup_user_key(id, 0, KEY_NEED_SEARCH); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); /* Root is permitted to invalidate certain special keys */ if (capable(CAP_SYS_ADMIN)) { key_ref = lookup_user_key(id, 0, KEY_SYSADMIN_OVERRIDE); if (IS_ERR(key_ref)) goto error; if (test_bit(KEY_FLAG_ROOT_CAN_INVAL, &key_ref_to_ptr(key_ref)->flags)) goto invalidate; goto error_put; } goto error; } invalidate: key = key_ref_to_ptr(key_ref); ret = 0; if (test_bit(KEY_FLAG_KEEP, &key->flags)) ret = -EPERM; else key_invalidate(key); error_put: key_ref_put(key_ref); error: kleave(" = %ld", ret); return ret; } /* * Clear the specified keyring, creating an empty process keyring if one of the * special keyring IDs is used. * * The keyring must grant the caller Write permission and not have * KEY_FLAG_KEEP set for this to work. If successful, 0 will be returned. 
*/ long keyctl_keyring_clear(key_serial_t ringid) { key_ref_t keyring_ref; struct key *keyring; long ret; keyring_ref = lookup_user_key(ringid, KEY_LOOKUP_CREATE, KEY_NEED_WRITE); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); /* Root is permitted to invalidate certain special keyrings */ if (capable(CAP_SYS_ADMIN)) { keyring_ref = lookup_user_key(ringid, 0, KEY_SYSADMIN_OVERRIDE); if (IS_ERR(keyring_ref)) goto error; if (test_bit(KEY_FLAG_ROOT_CAN_CLEAR, &key_ref_to_ptr(keyring_ref)->flags)) goto clear; goto error_put; } goto error; } clear: keyring = key_ref_to_ptr(keyring_ref); if (test_bit(KEY_FLAG_KEEP, &keyring->flags)) ret = -EPERM; else ret = keyring_clear(keyring); error_put: key_ref_put(keyring_ref); error: return ret; } /* * Create a link from a keyring to a key if there's no matching key in the * keyring, otherwise replace the link to the matching key with a link to the * new key. * * The key must grant the caller Link permission and the keyring must grant * the caller Write permission. Furthermore, if an additional link is created, * the keyring's quota will be extended. * * If successful, 0 will be returned. */ long keyctl_keyring_link(key_serial_t id, key_serial_t ringid) { key_ref_t keyring_ref, key_ref; long ret; keyring_ref = lookup_user_key(ringid, KEY_LOOKUP_CREATE, KEY_NEED_WRITE); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); goto error; } key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE, KEY_NEED_LINK); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error2; } ret = key_link(key_ref_to_ptr(keyring_ref), key_ref_to_ptr(key_ref)); key_ref_put(key_ref); error2: key_ref_put(keyring_ref); error: return ret; } /* * Unlink a key from a keyring. * * The keyring must grant the caller Write permission for this to work; the key * itself need not grant the caller anything. If the last link to a key is * removed then that key will be scheduled for destruction. * * Keys or keyrings with KEY_FLAG_KEEP set should not be unlinked. * * If successful, 0 will be returned. */ long keyctl_keyring_unlink(key_serial_t id, key_serial_t ringid) { key_ref_t keyring_ref, key_ref; struct key *keyring, *key; long ret; keyring_ref = lookup_user_key(ringid, 0, KEY_NEED_WRITE); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); goto error; } key_ref = lookup_user_key(id, KEY_LOOKUP_PARTIAL, KEY_NEED_UNLINK); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error2; } keyring = key_ref_to_ptr(keyring_ref); key = key_ref_to_ptr(key_ref); if (test_bit(KEY_FLAG_KEEP, &keyring->flags) && test_bit(KEY_FLAG_KEEP, &key->flags)) ret = -EPERM; else ret = key_unlink(keyring, key); key_ref_put(key_ref); error2: key_ref_put(keyring_ref); error: return ret; } /* * Move a link to a key from one keyring to another, displacing any matching * key from the destination keyring. * * The key must grant the caller Link permission and both keyrings must grant * the caller Write permission. There must also be a link in the from keyring * to the key. If both keyrings are the same, nothing is done. * * If successful, 0 will be returned. 
*/ long keyctl_keyring_move(key_serial_t id, key_serial_t from_ringid, key_serial_t to_ringid, unsigned int flags) { key_ref_t key_ref, from_ref, to_ref; long ret; if (flags & ~KEYCTL_MOVE_EXCL) return -EINVAL; key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE, KEY_NEED_LINK); if (IS_ERR(key_ref)) return PTR_ERR(key_ref); from_ref = lookup_user_key(from_ringid, 0, KEY_NEED_WRITE); if (IS_ERR(from_ref)) { ret = PTR_ERR(from_ref); goto error2; } to_ref = lookup_user_key(to_ringid, KEY_LOOKUP_CREATE, KEY_NEED_WRITE); if (IS_ERR(to_ref)) { ret = PTR_ERR(to_ref); goto error3; } ret = key_move(key_ref_to_ptr(key_ref), key_ref_to_ptr(from_ref), key_ref_to_ptr(to_ref), flags); key_ref_put(to_ref); error3: key_ref_put(from_ref); error2: key_ref_put(key_ref); return ret; } /* * Return a description of a key to userspace. * * The key must grant the caller View permission for this to work. * * If there's a buffer, we place up to buflen bytes of data into it formatted * in the following way: * * type;uid;gid;perm;description<NUL> * * If successful, we return the amount of description available, irrespective * of how much we may have copied into the buffer. */ long keyctl_describe_key(key_serial_t keyid, char __user *buffer, size_t buflen) { struct key *key, *instkey; key_ref_t key_ref; char *infobuf; long ret; int desclen, infolen; key_ref = lookup_user_key(keyid, KEY_LOOKUP_PARTIAL, KEY_NEED_VIEW); if (IS_ERR(key_ref)) { /* viewing a key under construction is permitted if we have the * authorisation token handy */ if (PTR_ERR(key_ref) == -EACCES) { instkey = key_get_instantiation_authkey(keyid); if (!IS_ERR(instkey)) { key_put(instkey); key_ref = lookup_user_key(keyid, KEY_LOOKUP_PARTIAL, KEY_AUTHTOKEN_OVERRIDE); if (!IS_ERR(key_ref)) goto okay; } } ret = PTR_ERR(key_ref); goto error; } okay: key = key_ref_to_ptr(key_ref); desclen = strlen(key->description); /* calculate how much information we're going to return */ ret = -ENOMEM; infobuf = kasprintf(GFP_KERNEL, "%s;%d;%d;%08x;", key->type->name, from_kuid_munged(current_user_ns(), key->uid), from_kgid_munged(current_user_ns(), key->gid), key->perm); if (!infobuf) goto error2; infolen = strlen(infobuf); ret = infolen + desclen + 1; /* consider returning the data */ if (buffer && buflen >= ret) { if (copy_to_user(buffer, infobuf, infolen) != 0 || copy_to_user(buffer + infolen, key->description, desclen + 1) != 0) ret = -EFAULT; } kfree(infobuf); error2: key_ref_put(key_ref); error: return ret; } /* * Search the specified keyring and any keyrings it links to for a matching * key. Only keyrings that grant the caller Search permission will be searched * (this includes the starting keyring). Only keys with Search permission can * be found. * * If successful, the found key will be linked to the destination keyring if * supplied and the key has Link permission, and the found key ID will be * returned. 
*/ long keyctl_keyring_search(key_serial_t ringid, const char __user *_type, const char __user *_description, key_serial_t destringid) { struct key_type *ktype; key_ref_t keyring_ref, key_ref, dest_ref; char type[32], *description; long ret; /* pull the type and description into kernel space */ ret = key_get_type_from_user(type, _type, sizeof(type)); if (ret < 0) goto error; description = strndup_user(_description, KEY_MAX_DESC_SIZE); if (IS_ERR(description)) { ret = PTR_ERR(description); goto error; } /* get the keyring at which to begin the search */ keyring_ref = lookup_user_key(ringid, 0, KEY_NEED_SEARCH); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); goto error2; } /* get the destination keyring if specified */ dest_ref = NULL; if (destringid) { dest_ref = lookup_user_key(destringid, KEY_LOOKUP_CREATE, KEY_NEED_WRITE); if (IS_ERR(dest_ref)) { ret = PTR_ERR(dest_ref); goto error3; } } /* find the key type */ ktype = key_type_lookup(type); if (IS_ERR(ktype)) { ret = PTR_ERR(ktype); goto error4; } /* do the search */ key_ref = keyring_search(keyring_ref, ktype, description, true); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); /* treat lack or presence of a negative key the same */ if (ret == -EAGAIN) ret = -ENOKEY; goto error5; } /* link the resulting key to the destination keyring if we can */ if (dest_ref) { ret = key_permission(key_ref, KEY_NEED_LINK); if (ret < 0) goto error6; ret = key_link(key_ref_to_ptr(dest_ref), key_ref_to_ptr(key_ref)); if (ret < 0) goto error6; } ret = key_ref_to_ptr(key_ref)->serial; error6: key_ref_put(key_ref); error5: key_type_put(ktype); error4: key_ref_put(dest_ref); error3: key_ref_put(keyring_ref); error2: kfree(description); error: return ret; } /* * Call the read method */ static long __keyctl_read_key(struct key *key, char *buffer, size_t buflen) { long ret; down_read(&key->sem); ret = key_validate(key); if (ret == 0) ret = key->type->read(key, buffer, buflen); up_read(&key->sem); return ret; } /* * Read a key's payload. * * The key must either grant the caller Read permission, or it must grant the * caller Search permission when searched for from the process keyrings. * * If successful, we place up to buflen bytes of data into the buffer, if one * is provided, and return the amount of data that is available in the key, * irrespective of how much we copied into the buffer. 
*/ long keyctl_read_key(key_serial_t keyid, char __user *buffer, size_t buflen) { struct key *key; key_ref_t key_ref; long ret; char *key_data = NULL; size_t key_data_len; /* find the key first */ key_ref = lookup_user_key(keyid, 0, KEY_DEFER_PERM_CHECK); if (IS_ERR(key_ref)) { ret = -ENOKEY; goto out; } key = key_ref_to_ptr(key_ref); ret = key_read_state(key); if (ret < 0) goto key_put_out; /* Negatively instantiated */ /* see if we can read it directly */ ret = key_permission(key_ref, KEY_NEED_READ); if (ret == 0) goto can_read_key; if (ret != -EACCES) goto key_put_out; /* we can't; see if it's searchable from this process's keyrings * - we automatically take account of the fact that it may be * dangling off an instantiation key */ if (!is_key_possessed(key_ref)) { ret = -EACCES; goto key_put_out; } /* the key is probably readable - now try to read it */ can_read_key: if (!key->type->read) { ret = -EOPNOTSUPP; goto key_put_out; } if (!buffer || !buflen) { /* Get the key length from the read method */ ret = __keyctl_read_key(key, NULL, 0); goto key_put_out; } /* * Read the data with the semaphore held (since we might sleep) * to protect against the key being updated or revoked. * * Allocating a temporary buffer to hold the keys before * transferring them to user buffer to avoid potential * deadlock involving page fault and mmap_lock. * * key_data_len = (buflen <= PAGE_SIZE) * ? buflen : actual length of key data * * This prevents allocating arbitrary large buffer which can * be much larger than the actual key length. In the latter case, * at least 2 passes of this loop is required. */ key_data_len = (buflen <= PAGE_SIZE) ? buflen : 0; for (;;) { if (key_data_len) { key_data = kvmalloc(key_data_len, GFP_KERNEL); if (!key_data) { ret = -ENOMEM; goto key_put_out; } } ret = __keyctl_read_key(key, key_data, key_data_len); /* * Read methods will just return the required length without * any copying if the provided length isn't large enough. */ if (ret <= 0 || ret > buflen) break; /* * The key may change (unlikely) in between 2 consecutive * __keyctl_read_key() calls. In this case, we reallocate * a larger buffer and redo the key read when * key_data_len < ret <= buflen. */ if (ret > key_data_len) { if (unlikely(key_data)) kvfree_sensitive(key_data, key_data_len); key_data_len = ret; continue; /* Allocate buffer */ } if (copy_to_user(buffer, key_data, ret)) ret = -EFAULT; break; } kvfree_sensitive(key_data, key_data_len); key_put_out: key_put(key); out: return ret; } /* * Change the ownership of a key * * The key must grant the caller Setattr permission for this to work, though * the key need not be fully instantiated yet. For the UID to be changed, or * for the GID to be changed to a group the caller is not a member of, the * caller must have sysadmin capability. If either uid or gid is -1 then that * attribute is not changed. * * If the UID is to be changed, the new user must have sufficient quota to * accept the key. The quota deduction will be removed from the old user to * the new user should the attribute be changed. * * If successful, 0 will be returned. 
*/ long keyctl_chown_key(key_serial_t id, uid_t user, gid_t group) { struct key_user *newowner, *zapowner = NULL; struct key *key; key_ref_t key_ref; long ret; kuid_t uid; kgid_t gid; unsigned long flags; uid = make_kuid(current_user_ns(), user); gid = make_kgid(current_user_ns(), group); ret = -EINVAL; if ((user != (uid_t) -1) && !uid_valid(uid)) goto error; if ((group != (gid_t) -1) && !gid_valid(gid)) goto error; ret = 0; if (user == (uid_t) -1 && group == (gid_t) -1) goto error; key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL, KEY_NEED_SETATTR); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error; } key = key_ref_to_ptr(key_ref); /* make the changes with the locks held to prevent chown/chown races */ ret = -EACCES; down_write(&key->sem); { bool is_privileged_op = false; /* only the sysadmin can chown a key to some other UID */ if (user != (uid_t) -1 && !uid_eq(key->uid, uid)) is_privileged_op = true; /* only the sysadmin can set the key's GID to a group other * than one of those that the current process subscribes to */ if (group != (gid_t) -1 && !gid_eq(gid, key->gid) && !in_group_p(gid)) is_privileged_op = true; if (is_privileged_op && !capable(CAP_SYS_ADMIN)) goto error_put; } /* change the UID */ if (user != (uid_t) -1 && !uid_eq(uid, key->uid)) { ret = -ENOMEM; newowner = key_user_lookup(uid); if (!newowner) goto error_put; /* transfer the quota burden to the new user */ if (test_bit(KEY_FLAG_IN_QUOTA, &key->flags)) { unsigned maxkeys = uid_eq(uid, GLOBAL_ROOT_UID) ? key_quota_root_maxkeys : key_quota_maxkeys; unsigned maxbytes = uid_eq(uid, GLOBAL_ROOT_UID) ? key_quota_root_maxbytes : key_quota_maxbytes; spin_lock_irqsave(&newowner->lock, flags); if (newowner->qnkeys + 1 > maxkeys || newowner->qnbytes + key->quotalen > maxbytes || newowner->qnbytes + key->quotalen < newowner->qnbytes) goto quota_overrun; newowner->qnkeys++; newowner->qnbytes += key->quotalen; spin_unlock_irqrestore(&newowner->lock, flags); spin_lock_irqsave(&key->user->lock, flags); key->user->qnkeys--; key->user->qnbytes -= key->quotalen; spin_unlock_irqrestore(&key->user->lock, flags); } atomic_dec(&key->user->nkeys); atomic_inc(&newowner->nkeys); if (key->state != KEY_IS_UNINSTANTIATED) { atomic_dec(&key->user->nikeys); atomic_inc(&newowner->nikeys); } zapowner = key->user; key->user = newowner; key->uid = uid; } /* change the GID */ if (group != (gid_t) -1) key->gid = gid; notify_key(key, NOTIFY_KEY_SETATTR, 0); ret = 0; error_put: up_write(&key->sem); key_put(key); if (zapowner) key_user_put(zapowner); error: return ret; quota_overrun: spin_unlock_irqrestore(&newowner->lock, flags); zapowner = newowner; ret = -EDQUOT; goto error_put; } /* * Change the permission mask on a key. * * The key must grant the caller Setattr permission for this to work, though * the key need not be fully instantiated yet. If the caller does not have * sysadmin capability, it may only change the permission on keys that it owns. 
*/ long keyctl_setperm_key(key_serial_t id, key_perm_t perm) { struct key *key; key_ref_t key_ref; long ret; ret = -EINVAL; if (perm & ~(KEY_POS_ALL | KEY_USR_ALL | KEY_GRP_ALL | KEY_OTH_ALL)) goto error; key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL, KEY_NEED_SETATTR); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error; } key = key_ref_to_ptr(key_ref); /* make the changes with the locks held to prevent chown/chmod races */ ret = -EACCES; down_write(&key->sem); /* if we're not the sysadmin, we can only change a key that we own */ if (uid_eq(key->uid, current_fsuid()) || capable(CAP_SYS_ADMIN)) { key->perm = perm; notify_key(key, NOTIFY_KEY_SETATTR, 0); ret = 0; } up_write(&key->sem); key_put(key); error: return ret; } /* * Get the destination keyring for instantiation and check that the caller has * Write permission on it. */ static long get_instantiation_keyring(key_serial_t ringid, struct request_key_auth *rka, struct key **_dest_keyring) { key_ref_t dkref; *_dest_keyring = NULL; /* just return a NULL pointer if we weren't asked to make a link */ if (ringid == 0) return 0; /* if a specific keyring is nominated by ID, then use that */ if (ringid > 0) { dkref = lookup_user_key(ringid, KEY_LOOKUP_CREATE, KEY_NEED_WRITE); if (IS_ERR(dkref)) return PTR_ERR(dkref); *_dest_keyring = key_ref_to_ptr(dkref); return 0; } if (ringid == KEY_SPEC_REQKEY_AUTH_KEY) return -EINVAL; /* otherwise specify the destination keyring recorded in the * authorisation key (any KEY_SPEC_*_KEYRING) */ if (ringid >= KEY_SPEC_REQUESTOR_KEYRING) { *_dest_keyring = key_get(rka->dest_keyring); return 0; } return -ENOKEY; } /* * Change the request_key authorisation key on the current process. */ static int keyctl_change_reqkey_auth(struct key *key) { struct cred *new; new = prepare_creds(); if (!new) return -ENOMEM; key_put(new->request_key_auth); new->request_key_auth = key_get(key); return commit_creds(new); } /* * Instantiate a key with the specified payload and link the key into the * destination keyring if one is given. * * The caller must have the appropriate instantiation permit set for this to * work (see keyctl_assume_authority). No other permissions are required. * * If successful, 0 will be returned. */ static long keyctl_instantiate_key_common(key_serial_t id, struct iov_iter *from, key_serial_t ringid) { const struct cred *cred = current_cred(); struct request_key_auth *rka; struct key *instkey, *dest_keyring; size_t plen = from ? 
iov_iter_count(from) : 0; void *payload; long ret; kenter("%d,,%zu,%d", id, plen, ringid); if (!plen) from = NULL; ret = -EINVAL; if (plen > 1024 * 1024 - 1) goto error; /* the appropriate instantiation authorisation key must have been * assumed before calling this */ ret = -EPERM; instkey = cred->request_key_auth; if (!instkey) goto error; rka = instkey->payload.data[0]; if (rka->target_key->serial != id) goto error; /* pull the payload in if one was supplied */ payload = NULL; if (from) { ret = -ENOMEM; payload = kvmalloc(plen, GFP_KERNEL); if (!payload) goto error; ret = -EFAULT; if (!copy_from_iter_full(payload, plen, from)) goto error2; } /* find the destination keyring amongst those belonging to the * requesting task */ ret = get_instantiation_keyring(ringid, rka, &dest_keyring); if (ret < 0) goto error2; /* instantiate the key and link it into a keyring */ ret = key_instantiate_and_link(rka->target_key, payload, plen, dest_keyring, instkey); key_put(dest_keyring); /* discard the assumed authority if it's just been disabled by * instantiation of the key */ if (ret == 0) keyctl_change_reqkey_auth(NULL); error2: kvfree_sensitive(payload, plen); error: return ret; } /* * Instantiate a key with the specified payload and link the key into the * destination keyring if one is given. * * The caller must have the appropriate instantiation permit set for this to * work (see keyctl_assume_authority). No other permissions are required. * * If successful, 0 will be returned. */ long keyctl_instantiate_key(key_serial_t id, const void __user *_payload, size_t plen, key_serial_t ringid) { if (_payload && plen) { struct iov_iter from; int ret; ret = import_ubuf(ITER_SOURCE, (void __user *)_payload, plen, &from); if (unlikely(ret)) return ret; return keyctl_instantiate_key_common(id, &from, ringid); } return keyctl_instantiate_key_common(id, NULL, ringid); } /* * Instantiate a key with the specified multipart payload and link the key into * the destination keyring if one is given. * * The caller must have the appropriate instantiation permit set for this to * work (see keyctl_assume_authority). No other permissions are required. * * If successful, 0 will be returned. */ long keyctl_instantiate_key_iov(key_serial_t id, const struct iovec __user *_payload_iov, unsigned ioc, key_serial_t ringid) { struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; struct iov_iter from; long ret; if (!_payload_iov) ioc = 0; ret = import_iovec(ITER_SOURCE, _payload_iov, ioc, ARRAY_SIZE(iovstack), &iov, &from); if (ret < 0) return ret; ret = keyctl_instantiate_key_common(id, &from, ringid); kfree(iov); return ret; } /* * Negatively instantiate the key with the given timeout (in seconds) and link * the key into the destination keyring if one is given. * * The caller must have the appropriate instantiation permit set for this to * work (see keyctl_assume_authority). No other permissions are required. * * The key and any links to the key will be automatically garbage collected * after the timeout expires. * * Negative keys are used to rate limit repeated request_key() calls by causing * them to return -ENOKEY until the negative key expires. * * If successful, 0 will be returned. */ long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid) { return keyctl_reject_key(id, timeout, ENOKEY, ringid); } /* * Negatively instantiate the key with the given timeout (in seconds) and error * code and link the key into the destination keyring if one is given. 
* * The caller must have the appropriate instantiation permit set for this to * work (see keyctl_assume_authority). No other permissions are required. * * The key and any links to the key will be automatically garbage collected * after the timeout expires. * * Negative keys are used to rate limit repeated request_key() calls by causing * them to return the specified error code until the negative key expires. * * If successful, 0 will be returned. */ long keyctl_reject_key(key_serial_t id, unsigned timeout, unsigned error, key_serial_t ringid) { const struct cred *cred = current_cred(); struct request_key_auth *rka; struct key *instkey, *dest_keyring; long ret; kenter("%d,%u,%u,%d", id, timeout, error, ringid); /* must be a valid error code and mustn't be a kernel special */ if (error <= 0 || error >= MAX_ERRNO || error == ERESTARTSYS || error == ERESTARTNOINTR || error == ERESTARTNOHAND || error == ERESTART_RESTARTBLOCK) return -EINVAL; /* the appropriate instantiation authorisation key must have been * assumed before calling this */ ret = -EPERM; instkey = cred->request_key_auth; if (!instkey) goto error; rka = instkey->payload.data[0]; if (rka->target_key->serial != id) goto error; /* find the destination keyring if present (which must also be * writable) */ ret = get_instantiation_keyring(ringid, rka, &dest_keyring); if (ret < 0) goto error; /* instantiate the key and link it into a keyring */ ret = key_reject_and_link(rka->target_key, timeout, error, dest_keyring, instkey); key_put(dest_keyring); /* discard the assumed authority if it's just been disabled by * instantiation of the key */ if (ret == 0) keyctl_change_reqkey_auth(NULL); error: return ret; } /* * Read or set the default keyring in which request_key() will cache keys and * return the old setting. * * If a thread or process keyring is specified then it will be created if it * doesn't yet exist. The old setting will be returned if successful. */ long keyctl_set_reqkey_keyring(int reqkey_defl) { struct cred *new; int ret, old_setting; old_setting = current_cred_xxx(jit_keyring); if (reqkey_defl == KEY_REQKEY_DEFL_NO_CHANGE) return old_setting; new = prepare_creds(); if (!new) return -ENOMEM; switch (reqkey_defl) { case KEY_REQKEY_DEFL_THREAD_KEYRING: ret = install_thread_keyring_to_cred(new); if (ret < 0) goto error; goto set; case KEY_REQKEY_DEFL_PROCESS_KEYRING: ret = install_process_keyring_to_cred(new); if (ret < 0) goto error; goto set; case KEY_REQKEY_DEFL_DEFAULT: case KEY_REQKEY_DEFL_SESSION_KEYRING: case KEY_REQKEY_DEFL_USER_KEYRING: case KEY_REQKEY_DEFL_USER_SESSION_KEYRING: case KEY_REQKEY_DEFL_REQUESTOR_KEYRING: goto set; case KEY_REQKEY_DEFL_NO_CHANGE: case KEY_REQKEY_DEFL_GROUP_KEYRING: default: ret = -EINVAL; goto error; } set: new->jit_keyring = reqkey_defl; commit_creds(new); return old_setting; error: abort_creds(new); return ret; } /* * Set or clear the timeout on a key. * * Either the key must grant the caller Setattr permission or else the caller * must hold an instantiation authorisation token for the key. * * The timeout is either 0 to clear the timeout, or a number of seconds from * the current time. The key and any links to the key will be automatically * garbage collected after the timeout expires. * * Keys with KEY_FLAG_KEEP set should not be timed out. * * If successful, 0 is returned. 
*/ long keyctl_set_timeout(key_serial_t id, unsigned timeout) { struct key *key, *instkey; key_ref_t key_ref; long ret; key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL, KEY_NEED_SETATTR); if (IS_ERR(key_ref)) { /* setting the timeout on a key under construction is permitted * if we have the authorisation token handy */ if (PTR_ERR(key_ref) == -EACCES) { instkey = key_get_instantiation_authkey(id); if (!IS_ERR(instkey)) { key_put(instkey); key_ref = lookup_user_key(id, KEY_LOOKUP_PARTIAL, KEY_AUTHTOKEN_OVERRIDE); if (!IS_ERR(key_ref)) goto okay; } } ret = PTR_ERR(key_ref); goto error; } okay: key = key_ref_to_ptr(key_ref); ret = 0; if (test_bit(KEY_FLAG_KEEP, &key->flags)) { ret = -EPERM; } else { key_set_timeout(key, timeout); notify_key(key, NOTIFY_KEY_SETATTR, 0); } key_put(key); error: return ret; } /* * Assume (or clear) the authority to instantiate the specified key. * * This sets the authoritative token currently in force for key instantiation. * This must be done for a key to be instantiated. It has the effect of making * available all the keys from the caller of the request_key() that created a * key to request_key() calls made by the caller of this function. * * The caller must have the instantiation key in their process keyrings with a * Search permission grant available to the caller. * * If the ID given is 0, then the setting will be cleared and 0 returned. * * If the ID given has a matching an authorisation key, then that key will be * set and its ID will be returned. The authorisation key can be read to get * the callout information passed to request_key(). */ long keyctl_assume_authority(key_serial_t id) { struct key *authkey; long ret; /* special key IDs aren't permitted */ ret = -EINVAL; if (id < 0) goto error; /* we divest ourselves of authority if given an ID of 0 */ if (id == 0) { ret = keyctl_change_reqkey_auth(NULL); goto error; } /* attempt to assume the authority temporarily granted to us whilst we * instantiate the specified key * - the authorisation key must be in the current task's keyrings * somewhere */ authkey = key_get_instantiation_authkey(id); if (IS_ERR(authkey)) { ret = PTR_ERR(authkey); goto error; } ret = keyctl_change_reqkey_auth(authkey); if (ret == 0) ret = authkey->serial; key_put(authkey); error: return ret; } /* * Get a key's the LSM security label. * * The key must grant the caller View permission for this to work. * * If there's a buffer, then up to buflen bytes of data will be placed into it. * * If successful, the amount of information available will be returned, * irrespective of how much was copied (including the terminal NUL). 
*/ long keyctl_get_security(key_serial_t keyid, char __user *buffer, size_t buflen) { struct key *key, *instkey; key_ref_t key_ref; char *context; long ret; key_ref = lookup_user_key(keyid, KEY_LOOKUP_PARTIAL, KEY_NEED_VIEW); if (IS_ERR(key_ref)) { if (PTR_ERR(key_ref) != -EACCES) return PTR_ERR(key_ref); /* viewing a key under construction is also permitted if we * have the authorisation token handy */ instkey = key_get_instantiation_authkey(keyid); if (IS_ERR(instkey)) return PTR_ERR(instkey); key_put(instkey); key_ref = lookup_user_key(keyid, KEY_LOOKUP_PARTIAL, KEY_AUTHTOKEN_OVERRIDE); if (IS_ERR(key_ref)) return PTR_ERR(key_ref); } key = key_ref_to_ptr(key_ref); ret = security_key_getsecurity(key, &context); if (ret == 0) { /* if no information was returned, give userspace an empty * string */ ret = 1; if (buffer && buflen > 0 && copy_to_user(buffer, "", 1) != 0) ret = -EFAULT; } else if (ret > 0) { /* return as much data as there's room for */ if (buffer && buflen > 0) { if (buflen > ret) buflen = ret; if (copy_to_user(buffer, context, buflen) != 0) ret = -EFAULT; } kfree(context); } key_ref_put(key_ref); return ret; } /* * Attempt to install the calling process's session keyring on the process's * parent process. * * The keyring must exist and must grant the caller LINK permission, and the * parent process must be single-threaded and must have the same effective * ownership as this process and mustn't be SUID/SGID. * * The keyring will be emplaced on the parent when it next resumes userspace. * * If successful, 0 will be returned. */ long keyctl_session_to_parent(void) { struct task_struct *me, *parent; const struct cred *mycred, *pcred; struct callback_head *newwork, *oldwork; key_ref_t keyring_r; struct cred *cred; int ret; keyring_r = lookup_user_key(KEY_SPEC_SESSION_KEYRING, 0, KEY_NEED_LINK); if (IS_ERR(keyring_r)) return PTR_ERR(keyring_r); ret = -ENOMEM; /* our parent is going to need a new cred struct, a new tgcred struct * and new security data, so we allocate them here to prevent ENOMEM in * our parent */ cred = cred_alloc_blank(); if (!cred) goto error_keyring; newwork = &cred->rcu; cred->session_keyring = key_ref_to_ptr(keyring_r); keyring_r = NULL; init_task_work(newwork, key_change_session_keyring); me = current; rcu_read_lock(); write_lock_irq(&tasklist_lock); ret = -EPERM; oldwork = NULL; parent = rcu_dereference_protected(me->real_parent, lockdep_is_held(&tasklist_lock)); /* the parent mustn't be init and mustn't be a kernel thread */ if (parent->pid <= 1 || !parent->mm) goto unlock; /* the parent must be single threaded */ if (!thread_group_empty(parent)) goto unlock; /* the parent and the child must have different session keyrings or * there's no point */ mycred = current_cred(); pcred = __task_cred(parent); if (mycred == pcred || mycred->session_keyring == pcred->session_keyring) { ret = 0; goto unlock; } /* the parent must have the same effective ownership and mustn't be * SUID/SGID */ if (!uid_eq(pcred->uid, mycred->euid) || !uid_eq(pcred->euid, mycred->euid) || !uid_eq(pcred->suid, mycred->euid) || !gid_eq(pcred->gid, mycred->egid) || !gid_eq(pcred->egid, mycred->egid) || !gid_eq(pcred->sgid, mycred->egid)) goto unlock; /* the keyrings must have the same UID */ if ((pcred->session_keyring && !uid_eq(pcred->session_keyring->uid, mycred->euid)) || !uid_eq(mycred->session_keyring->uid, mycred->euid)) goto unlock; /* cancel an already pending keyring replacement */ oldwork = task_work_cancel_func(parent, key_change_session_keyring); /* the replacement session 
keyring is applied just prior to userspace * restarting */ ret = task_work_add(parent, newwork, TWA_RESUME); if (!ret) newwork = NULL; unlock: write_unlock_irq(&tasklist_lock); rcu_read_unlock(); if (oldwork) put_cred(container_of(oldwork, struct cred, rcu)); if (newwork) put_cred(cred); return ret; error_keyring: key_ref_put(keyring_r); return ret; } /* * Apply a restriction to a given keyring. * * The caller must have Setattr permission to change keyring restrictions. * * The requested type name may be a NULL pointer to reject all attempts * to link to the keyring. In this case, _restriction must also be NULL. * Otherwise, both _type and _restriction must be non-NULL. * * Returns 0 if successful. */ long keyctl_restrict_keyring(key_serial_t id, const char __user *_type, const char __user *_restriction) { key_ref_t key_ref; char type[32]; char *restriction = NULL; long ret; key_ref = lookup_user_key(id, 0, KEY_NEED_SETATTR); if (IS_ERR(key_ref)) return PTR_ERR(key_ref); ret = -EINVAL; if (_type) { if (!_restriction) goto error; ret = key_get_type_from_user(type, _type, sizeof(type)); if (ret < 0) goto error; restriction = strndup_user(_restriction, PAGE_SIZE); if (IS_ERR(restriction)) { ret = PTR_ERR(restriction); goto error; } } else { if (_restriction) goto error; } ret = keyring_restrict(key_ref, _type ? type : NULL, restriction); kfree(restriction); error: key_ref_put(key_ref); return ret; } #ifdef CONFIG_KEY_NOTIFICATIONS /* * Watch for changes to a key. * * The caller must have View permission to watch a key or keyring. */ long keyctl_watch_key(key_serial_t id, int watch_queue_fd, int watch_id) { struct watch_queue *wqueue; struct watch_list *wlist = NULL; struct watch *watch = NULL; struct key *key; key_ref_t key_ref; long ret; if (watch_id < -1 || watch_id > 0xff) return -EINVAL; key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE, KEY_NEED_VIEW); if (IS_ERR(key_ref)) return PTR_ERR(key_ref); key = key_ref_to_ptr(key_ref); wqueue = get_watch_queue(watch_queue_fd); if (IS_ERR(wqueue)) { ret = PTR_ERR(wqueue); goto err_key; } if (watch_id >= 0) { ret = -ENOMEM; if (!key->watchers) { wlist = kzalloc_obj(*wlist); if (!wlist) goto err_wqueue; init_watch_list(wlist, NULL); } watch = kzalloc_obj(*watch); if (!watch) goto err_wlist; init_watch(watch, wqueue); watch->id = key->serial; watch->info_id = (u32)watch_id << WATCH_INFO_ID__SHIFT; ret = security_watch_key(key); if (ret < 0) goto err_watch; down_write(&key->sem); if (!key->watchers) { key->watchers = wlist; wlist = NULL; } ret = add_watch_to_object(watch, key->watchers); up_write(&key->sem); if (ret == 0) watch = NULL; } else { ret = -EBADSLT; if (key->watchers) { down_write(&key->sem); ret = remove_watch_from_object(key->watchers, wqueue, key_serial(key), false); up_write(&key->sem); } } err_watch: kfree(watch); err_wlist: kfree(wlist); err_wqueue: put_watch_queue(wqueue); err_key: key_put(key); return ret; } #endif /* CONFIG_KEY_NOTIFICATIONS */ /* * Get keyrings subsystem capabilities. 
*/ long keyctl_capabilities(unsigned char __user *_buffer, size_t buflen) { size_t size = buflen; if (size > 0) { if (size > sizeof(keyrings_capabilities)) size = sizeof(keyrings_capabilities); if (copy_to_user(_buffer, keyrings_capabilities, size) != 0) return -EFAULT; if (size < buflen && clear_user(_buffer + size, buflen - size) != 0) return -EFAULT; } return sizeof(keyrings_capabilities); } /* * The key control system call */ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { switch (option) { case KEYCTL_GET_KEYRING_ID: return keyctl_get_keyring_ID((key_serial_t) arg2, (int) arg3); case KEYCTL_JOIN_SESSION_KEYRING: return keyctl_join_session_keyring((const char __user *) arg2); case KEYCTL_UPDATE: return keyctl_update_key((key_serial_t) arg2, (const void __user *) arg3, (size_t) arg4); case KEYCTL_REVOKE: return keyctl_revoke_key((key_serial_t) arg2); case KEYCTL_DESCRIBE: return keyctl_describe_key((key_serial_t) arg2, (char __user *) arg3, (unsigned) arg4); case KEYCTL_CLEAR: return keyctl_keyring_clear((key_serial_t) arg2); case KEYCTL_LINK: return keyctl_keyring_link((key_serial_t) arg2, (key_serial_t) arg3); case KEYCTL_UNLINK: return keyctl_keyring_unlink((key_serial_t) arg2, (key_serial_t) arg3); case KEYCTL_SEARCH: return keyctl_keyring_search((key_serial_t) arg2, (const char __user *) arg3, (const char __user *) arg4, (key_serial_t) arg5); case KEYCTL_READ: return keyctl_read_key((key_serial_t) arg2, (char __user *) arg3, (size_t) arg4); case KEYCTL_CHOWN: return keyctl_chown_key((key_serial_t) arg2, (uid_t) arg3, (gid_t) arg4); case KEYCTL_SETPERM: return keyctl_setperm_key((key_serial_t) arg2, (key_perm_t) arg3); case KEYCTL_INSTANTIATE: return keyctl_instantiate_key((key_serial_t) arg2, (const void __user *) arg3, (size_t) arg4, (key_serial_t) arg5); case KEYCTL_NEGATE: return keyctl_negate_key((key_serial_t) arg2, (unsigned) arg3, (key_serial_t) arg4); case KEYCTL_SET_REQKEY_KEYRING: return keyctl_set_reqkey_keyring(arg2); case KEYCTL_SET_TIMEOUT: return keyctl_set_timeout((key_serial_t) arg2, (unsigned) arg3); case KEYCTL_ASSUME_AUTHORITY: return keyctl_assume_authority((key_serial_t) arg2); case KEYCTL_GET_SECURITY: return keyctl_get_security((key_serial_t) arg2, (char __user *) arg3, (size_t) arg4); case KEYCTL_SESSION_TO_PARENT: return keyctl_session_to_parent(); case KEYCTL_REJECT: return keyctl_reject_key((key_serial_t) arg2, (unsigned) arg3, (unsigned) arg4, (key_serial_t) arg5); case KEYCTL_INSTANTIATE_IOV: return keyctl_instantiate_key_iov( (key_serial_t) arg2, (const struct iovec __user *) arg3, (unsigned) arg4, (key_serial_t) arg5); case KEYCTL_INVALIDATE: return keyctl_invalidate_key((key_serial_t) arg2); case KEYCTL_GET_PERSISTENT: return keyctl_get_persistent((uid_t)arg2, (key_serial_t)arg3); case KEYCTL_DH_COMPUTE: return keyctl_dh_compute((struct keyctl_dh_params __user *) arg2, (char __user *) arg3, (size_t) arg4, (struct keyctl_kdf_params __user *) arg5); case KEYCTL_RESTRICT_KEYRING: return keyctl_restrict_keyring((key_serial_t) arg2, (const char __user *) arg3, (const char __user *) arg4); case KEYCTL_PKEY_QUERY: if (arg3 != 0) return -EINVAL; return keyctl_pkey_query((key_serial_t)arg2, (const char __user *)arg4, (struct keyctl_pkey_query __user *)arg5); case KEYCTL_PKEY_ENCRYPT: case KEYCTL_PKEY_DECRYPT: case KEYCTL_PKEY_SIGN: return keyctl_pkey_e_d_s( option, (const struct keyctl_pkey_params __user *)arg2, (const char __user *)arg3, (const void __user *)arg4, (void __user 
*)arg5); case KEYCTL_PKEY_VERIFY: return keyctl_pkey_verify( (const struct keyctl_pkey_params __user *)arg2, (const char __user *)arg3, (const void __user *)arg4, (const void __user *)arg5); case KEYCTL_MOVE: return keyctl_keyring_move((key_serial_t)arg2, (key_serial_t)arg3, (key_serial_t)arg4, (unsigned int)arg5); case KEYCTL_CAPABILITIES: return keyctl_capabilities((unsigned char __user *)arg2, (size_t)arg3); case KEYCTL_WATCH_KEY: return keyctl_watch_key((key_serial_t)arg2, (int)arg3, (int)arg4); default: return -EOPNOTSUPP; } } |
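The file above implements the add_key(2), request_key(2) and keyctl(2) system calls. The following minimal userspace sketch is not part of the kernel sources; it assumes the generic "user" key type is available and uses raw syscall(2) rather than the libkeyutils wrappers. It exercises the add_key, KEYCTL_READ and KEYCTL_REVOKE paths shown above.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/keyctl.h>

int main(void)
{
        const char payload[] = "secret-value";
        long id, len;
        char buf[64];

        /* add_key(2): create or update a key in the session keyring. */
        id = syscall(__NR_add_key, "user", "example:token",
                     payload, strlen(payload), KEY_SPEC_SESSION_KEYRING);
        if (id < 0) {
                perror("add_key");
                return 1;
        }

        /* KEYCTL_READ: copies up to sizeof(buf) bytes and returns the total
         * amount of payload available, matching keyctl_read_key() above. */
        len = syscall(__NR_keyctl, KEYCTL_READ, id, buf, sizeof(buf), 0);
        if (len < 0) {
                perror("keyctl read");
                return 1;
        }
        printf("key %ld holds %ld bytes of payload\n", id, len);

        /* KEYCTL_REVOKE: the key becomes unusable and is garbage collected
         * after gc_delay, as described in keyctl_revoke_key() above. */
        if (syscall(__NR_keyctl, KEYCTL_REVOKE, id) < 0)
                perror("keyctl revoke");

        return 0;
}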
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_ERR_H
#define _LINUX_ERR_H

#include <linux/compiler.h>
#include <linux/types.h>

#include <asm/errno.h>

/*
 * Kernel pointers have redundant information, so we can use a
 * scheme where we can return either an error code or a normal
 * pointer with the same return value.
 *
 * This should be a per-architecture thing, to allow different
 * error and pointer decisions.
 */
#define MAX_ERRNO	4095

#ifndef __ASSEMBLY__

/**
 * IS_ERR_VALUE - Detect an error pointer.
 * @x: The pointer to check.
 *
 * Like IS_ERR(), but does not generate a compiler warning if result is unused.
 */
#define IS_ERR_VALUE(x) unlikely((unsigned long)(void *)(x) >= (unsigned long)-MAX_ERRNO)

/**
 * ERR_PTR - Create an error pointer.
 * @error: A negative error code.
 *
 * Encodes @error into a pointer value. Users should consider the result
 * opaque and not assume anything about how the error is encoded.
 *
 * Return: A pointer with @error encoded within its value.
 */
static inline void * __must_check ERR_PTR(long error)
{
        return (void *) error;
}

/**
 * INIT_ERR_PTR - Init a const error pointer.
 * @error: A negative error code.
 *
 * Like ERR_PTR(), but usable to initialize static variables.
 */
#define INIT_ERR_PTR(error) ((void *)(error))

/* Return the pointer in the percpu address space. */
#define ERR_PTR_PCPU(error) ((void __percpu *)(unsigned long)ERR_PTR(error))

/* Cast an error pointer to __iomem. */
#define IOMEM_ERR_PTR(error) (__force void __iomem *)ERR_PTR(error)

/**
 * PTR_ERR - Extract the error code from an error pointer.
 * @ptr: An error pointer.
 * Return: The error code within @ptr.
 */
static inline long __must_check PTR_ERR(__force const void *ptr)
{
        return (long) ptr;
}

/* Read an error pointer from the percpu address space. */
#define PTR_ERR_PCPU(ptr) (PTR_ERR((const void *)(__force const unsigned long)(ptr)))

/**
 * IS_ERR - Detect an error pointer.
 * @ptr: The pointer to check.
 * Return: true if @ptr is an error pointer, false otherwise.
 */
static inline bool __must_check IS_ERR(__force const void *ptr)
{
        return IS_ERR_VALUE((unsigned long)ptr);
}

/* Read an error pointer from the percpu address space. */
#define IS_ERR_PCPU(ptr) (IS_ERR((const void *)(__force const unsigned long)(ptr)))

/**
 * IS_ERR_OR_NULL - Detect an error pointer or a null pointer.
 * @ptr: The pointer to check.
 *
 * Like IS_ERR(), but also returns true for a null pointer.
 */
static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr)
{
        return unlikely(!ptr) || IS_ERR_VALUE((unsigned long)ptr);
}

/**
 * ERR_CAST - Explicitly cast an error-valued pointer to another pointer type
 * @ptr: The pointer to cast.
 *
 * Explicitly cast an error-valued pointer to another pointer type in such a
 * way as to make it clear that's what's going on.
 */
static inline void * __must_check ERR_CAST(__force const void *ptr)
{
        /* cast away the const */
        return (void *) ptr;
}

/**
 * PTR_ERR_OR_ZERO - Extract the error code from a pointer if it has one.
 * @ptr: A potential error pointer.
 *
 * Convenience function that can be used inside a function that returns
 * an error code to propagate errors received as error pointers.
 * For example, ``return PTR_ERR_OR_ZERO(ptr);`` replaces:
 *
 * .. code-block:: c
 *
 *	if (IS_ERR(ptr))
 *		return PTR_ERR(ptr);
 *	else
 *		return 0;
 *
 * Return: The error code within @ptr if it is an error pointer; 0 otherwise.
 */
static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr)
{
        if (IS_ERR(ptr))
                return PTR_ERR(ptr);
        else
                return 0;
}

#endif

#endif /* _LINUX_ERR_H */
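keyctl.c above leans on these helpers for almost every lookup_user_key() and key_create_or_update() result. The kernel-style fragment below is illustrative only (the widget names are hypothetical, not taken from the sources above) and shows the usual ERR_PTR()/IS_ERR()/PTR_ERR() round trip:

#include <linux/err.h>
#include <linux/slab.h>

struct widget { int id; };

/* Hypothetical producer: encode an errno in the returned pointer on failure. */
static struct widget *widget_create(int id)
{
        struct widget *w;

        if (id < 0)
                return ERR_PTR(-EINVAL);

        w = kzalloc(sizeof(*w), GFP_KERNEL);
        if (!w)
                return ERR_PTR(-ENOMEM);

        w->id = id;
        return w;                       /* a real pointer, IS_ERR() is false */
}

/* Hypothetical consumer: decode the errno again before using the pointer. */
static int widget_use(int id)
{
        struct widget *w = widget_create(id);

        if (IS_ERR(w))
                return PTR_ERR(w);

        /* ... use w ... */
        kfree(w);
        return 0;
}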
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM vmalloc

#if !defined(_TRACE_VMALLOC_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_VMALLOC_H

#include <linux/tracepoint.h>

/**
 * alloc_vmap_area - called when a new vmap allocation occurs
 * @addr: an allocated address
 * @size: a requested size
 * @align: a requested alignment
 * @vstart: a requested start range
 * @vend: a requested end range
 * @failed: whether the allocation failed
 *
 * This event is used for debug purposes. It can give extra information
 * to a developer about how often it occurs and which parameters are
 * passed for further validation.
 */
TRACE_EVENT(alloc_vmap_area,

	TP_PROTO(unsigned long addr, unsigned long size, unsigned long align,
		unsigned long vstart, unsigned long vend, int failed),

	TP_ARGS(addr, size, align, vstart, vend, failed),

	TP_STRUCT__entry(
		__field(unsigned long, addr)
		__field(unsigned long, size)
		__field(unsigned long, align)
		__field(unsigned long, vstart)
		__field(unsigned long, vend)
		__field(int, failed)
	),

	TP_fast_assign(
		__entry->addr = addr;
		__entry->size = size;
		__entry->align = align;
		__entry->vstart = vstart;
		__entry->vend = vend;
		__entry->failed = failed;
	),

	TP_printk("va_start: %lu size=%lu align=%lu vstart=0x%lx vend=0x%lx failed=%d",
		__entry->addr, __entry->size, __entry->align,
		__entry->vstart, __entry->vend, __entry->failed)
);

/**
 * purge_vmap_area_lazy - called when vmap areas were lazily freed
 * @start: purging start address
 * @end: purging end address
 * @npurged: number of purged vmap areas
 *
 * This event is used for debug purposes. It gives some indication
 * about the start:end range and how many objects are released.
 */
TRACE_EVENT(purge_vmap_area_lazy,

	TP_PROTO(unsigned long start, unsigned long end,
		unsigned int npurged),

	TP_ARGS(start, end, npurged),

	TP_STRUCT__entry(
		__field(unsigned long, start)
		__field(unsigned long, end)
		__field(unsigned int, npurged)
	),

	TP_fast_assign(
		__entry->start = start;
		__entry->end = end;
		__entry->npurged = npurged;
	),

	TP_printk("start=0x%lx end=0x%lx num_purged=%u",
		__entry->start, __entry->end, __entry->npurged)
);

/**
 * free_vmap_area_noflush - called when a vmap area is freed
 * @va_start: a start address of VA
 * @nr_lazy: number of current lazy pages
 * @nr_lazy_max: number of maximum lazy pages
 *
 * This event is used for debug purposes. It gives some indication about
 * a VA that is released, the number of currently outstanding areas, and
 * the maximum allowed threshold before dropping all of them.
 */
TRACE_EVENT(free_vmap_area_noflush,

	TP_PROTO(unsigned long va_start, unsigned long nr_lazy,
		unsigned long nr_lazy_max),

	TP_ARGS(va_start, nr_lazy, nr_lazy_max),

	TP_STRUCT__entry(
		__field(unsigned long, va_start)
		__field(unsigned long, nr_lazy)
		__field(unsigned long, nr_lazy_max)
	),

	TP_fast_assign(
		__entry->va_start = va_start;
		__entry->nr_lazy = nr_lazy;
		__entry->nr_lazy_max = nr_lazy_max;
	),

	TP_printk("va_start=0x%lx nr_lazy=%lu nr_lazy_max=%lu",
		__entry->va_start, __entry->nr_lazy, __entry->nr_lazy_max)
);
#endif /* _TRACE_VMALLOC_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
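Each TRACE_EVENT() above generates a trace_<event>() call that the matching code path in mm/vmalloc.c invokes, plus the tracefs plumbing that lets the event be enabled at runtime under events/vmalloc. The sketch below is illustrative only; the wrapper function and its argument names are assumptions, not code from vmalloc.c:

#include <linux/types.h>
#include <trace/events/vmalloc.h>

/* Hypothetical call site: emit the alloc_vmap_area event defined above.
 * The call compiles down to a no-op unless the tracepoint is enabled. */
static void report_vmap_alloc(unsigned long addr, unsigned long size,
                              unsigned long align, unsigned long vstart,
                              unsigned long vend, int failed)
{
        trace_alloc_vmap_area(addr, size, align, vstart, vend, failed);
}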
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* request_key authorisation token key type
 *
 * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _KEYS_REQUEST_KEY_AUTH_TYPE_H
#define _KEYS_REQUEST_KEY_AUTH_TYPE_H

#include <linux/key.h>

/*
 * Authorisation record for request_key().
 */
struct request_key_auth {
	struct rcu_head		rcu;
	struct key		*target_key;
	struct key		*dest_keyring;
	const struct cred	*cred;
	void			*callout_info;
	size_t			callout_len;
	pid_t			pid;
	char			op[8];
} __randomize_layout;

static inline struct request_key_auth *get_request_key_auth(const struct key *key)
{
	return key->payload.data[0];
}

#endif /* _KEYS_REQUEST_KEY_AUTH_TYPE_H */
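A brief, hypothetical consumer of this header (names are illustrative; only get_request_key_auth() and the structure layout come from the file above): code holding an authorisation key can recover the request_key_auth record to inspect the target key and the callout data.

#include <linux/key.h>
#include <linux/printk.h>
#include <keys/request_key_auth-type.h>

static void example_describe_auth(const struct key *authkey)
{
	const struct request_key_auth *rka = get_request_key_auth(authkey);

	/* The payload carries the original target key and the callout blob. */
	pr_info("auth %d: op=%s target=%d callout_len=%zu\n",
		authkey->serial, rka->op, rka->target_key->serial,
		rka->callout_len);
}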
913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 | // SPDX-License-Identifier: GPL-2.0 /* * Software nodes for the firmware node framework. * * Copyright (C) 2018, Intel Corporation * Author: Heikki Krogerus <heikki.krogerus@linux.intel.com> */ #include <linux/container_of.h> #include <linux/device.h> #include <linux/err.h> #include <linux/export.h> #include <linux/idr.h> #include <linux/init.h> #include <linux/kobject.h> #include <linux/kstrtox.h> #include <linux/list.h> #include <linux/property.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/sysfs.h> #include <linux/types.h> #include "base.h" struct swnode { struct kobject kobj; struct fwnode_handle fwnode; const struct software_node *node; int id; /* hierarchy */ struct ida child_ids; struct list_head entry; struct list_head children; struct swnode *parent; unsigned int allocated:1; unsigned int managed:1; }; static DEFINE_IDA(swnode_root_ids); static struct kset *swnode_kset; #define kobj_to_swnode(_kobj_) container_of(_kobj_, struct swnode, kobj) static const struct fwnode_operations software_node_ops; bool is_software_node(const struct fwnode_handle *fwnode) { return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &software_node_ops; } EXPORT_SYMBOL_GPL(is_software_node); #define to_swnode(__fwnode) \ ({ \ typeof(__fwnode) __to_swnode_fwnode = __fwnode; \ \ is_software_node(__to_swnode_fwnode) ? \ container_of(__to_swnode_fwnode, \ struct swnode, fwnode) : NULL; \ }) static inline struct swnode *dev_to_swnode(struct device *dev) { struct fwnode_handle *fwnode = dev_fwnode(dev); if (!fwnode) return NULL; if (!is_software_node(fwnode)) fwnode = fwnode->secondary; return to_swnode(fwnode); } static struct swnode * software_node_to_swnode(const struct software_node *node) { struct swnode *swnode = NULL; struct kobject *k; if (!node) return NULL; spin_lock(&swnode_kset->list_lock); list_for_each_entry(k, &swnode_kset->list, entry) { swnode = kobj_to_swnode(k); if (swnode->node == node) break; swnode = NULL; } spin_unlock(&swnode_kset->list_lock); return swnode; } const struct software_node *to_software_node(const struct fwnode_handle *fwnode) { const struct swnode *swnode = to_swnode(fwnode); return swnode ? swnode->node : NULL; } EXPORT_SYMBOL_GPL(to_software_node); struct fwnode_handle *software_node_fwnode(const struct software_node *node) { struct swnode *swnode = software_node_to_swnode(node); return swnode ? 
&swnode->fwnode : NULL; } EXPORT_SYMBOL_GPL(software_node_fwnode); /* -------------------------------------------------------------------------- */ /* property_entry processing */ static const struct property_entry * property_entry_get(const struct property_entry *prop, const char *name) { if (!prop) return NULL; for (; prop->name; prop++) if (!strcmp(name, prop->name)) return prop; return NULL; } static const void *property_get_pointer(const struct property_entry *prop) { if (!prop->length) return NULL; return prop->is_inline ? &prop->value : prop->pointer; } static const void *property_entry_find(const struct property_entry *props, const char *propname, size_t length) { const struct property_entry *prop; const void *pointer; prop = property_entry_get(props, propname); if (!prop) return ERR_PTR(-EINVAL); pointer = property_get_pointer(prop); if (!pointer) return ERR_PTR(-ENODATA); if (length > prop->length) return ERR_PTR(-EOVERFLOW); return pointer; } static int property_entry_count_elems_of_size(const struct property_entry *props, const char *propname, size_t length) { const struct property_entry *prop; prop = property_entry_get(props, propname); if (!prop) return -EINVAL; return prop->length / length; } static int property_entry_read_int_array(const struct property_entry *props, const char *name, unsigned int elem_size, void *val, size_t nval) { const void *pointer; size_t length; if (!val) return property_entry_count_elems_of_size(props, name, elem_size); if (!is_power_of_2(elem_size) || elem_size > sizeof(u64)) return -ENXIO; length = nval * elem_size; pointer = property_entry_find(props, name, length); if (IS_ERR(pointer)) return PTR_ERR(pointer); memcpy(val, pointer, length); return 0; } static int property_entry_read_string_array(const struct property_entry *props, const char *propname, const char **strings, size_t nval) { const void *pointer; size_t length; int array_len; /* Find out the array length. */ array_len = property_entry_count_elems_of_size(props, propname, sizeof(const char *)); if (array_len < 0) return array_len; /* Return how many there are if strings is NULL. */ if (!strings) return array_len; array_len = min_t(size_t, nval, array_len); length = array_len * sizeof(*strings); pointer = property_entry_find(props, propname, length); if (IS_ERR(pointer)) return PTR_ERR(pointer); memcpy(strings, pointer, length); return array_len; } static void property_entry_free_data(const struct property_entry *p) { const char * const *src_str; size_t i, nval; if (p->type == DEV_PROP_STRING) { src_str = property_get_pointer(p); nval = p->length / sizeof(*src_str); for (i = 0; i < nval; i++) kfree(src_str[i]); } if (!p->is_inline) kfree(p->pointer); kfree(p->name); } static bool property_copy_string_array(const char **dst_ptr, const char * const *src_ptr, size_t nval) { int i; for (i = 0; i < nval; i++) { dst_ptr[i] = kstrdup(src_ptr[i], GFP_KERNEL); if (!dst_ptr[i] && src_ptr[i]) { while (--i >= 0) kfree(dst_ptr[i]); return false; } } return true; } static int property_entry_copy_data(struct property_entry *dst, const struct property_entry *src) { const void *pointer = property_get_pointer(src); void *dst_ptr; size_t nval; /* * Properties with no data should not be marked as stored * out of line. */ if (!src->is_inline && !src->length) return -ENODATA; /* * Reference properties are never stored inline as * they are too big. 
*/ if (src->type == DEV_PROP_REF && src->is_inline) return -EINVAL; if (src->length <= sizeof(dst->value)) { dst_ptr = &dst->value; dst->is_inline = true; } else { dst_ptr = kmalloc(src->length, GFP_KERNEL); if (!dst_ptr) return -ENOMEM; dst->pointer = dst_ptr; } if (src->type == DEV_PROP_STRING) { nval = src->length / sizeof(const char *); if (!property_copy_string_array(dst_ptr, pointer, nval)) { if (!dst->is_inline) kfree(dst->pointer); return -ENOMEM; } } else { memcpy(dst_ptr, pointer, src->length); } dst->length = src->length; dst->type = src->type; dst->name = kstrdup(src->name, GFP_KERNEL); if (!dst->name) { property_entry_free_data(dst); return -ENOMEM; } return 0; } /** * property_entries_dup - duplicate array of properties * @properties: array of properties to copy * * This function creates a deep copy of the given NULL-terminated array * of property entries. */ struct property_entry * property_entries_dup(const struct property_entry *properties) { struct property_entry *p; int i, n = 0; int ret; if (!properties) return NULL; while (properties[n].name) n++; p = kzalloc_objs(*p, n + 1); if (!p) return ERR_PTR(-ENOMEM); for (i = 0; i < n; i++) { ret = property_entry_copy_data(&p[i], &properties[i]); if (ret) { while (--i >= 0) property_entry_free_data(&p[i]); kfree(p); return ERR_PTR(ret); } } return p; } EXPORT_SYMBOL_GPL(property_entries_dup); /** * property_entries_free - free previously allocated array of properties * @properties: array of properties to destroy * * This function frees given NULL-terminated array of property entries, * along with their data. */ void property_entries_free(const struct property_entry *properties) { const struct property_entry *p; if (!properties) return; for (p = properties; p->name; p++) property_entry_free_data(p); kfree(properties); } EXPORT_SYMBOL_GPL(property_entries_free); /* -------------------------------------------------------------------------- */ /* fwnode operations */ static struct fwnode_handle *software_node_get(struct fwnode_handle *fwnode) { struct swnode *swnode = to_swnode(fwnode); kobject_get(&swnode->kobj); return &swnode->fwnode; } static void software_node_put(struct fwnode_handle *fwnode) { struct swnode *swnode = to_swnode(fwnode); kobject_put(&swnode->kobj); } static bool software_node_property_present(const struct fwnode_handle *fwnode, const char *propname) { struct swnode *swnode = to_swnode(fwnode); return !!property_entry_get(swnode->node->properties, propname); } static int software_node_read_int_array(const struct fwnode_handle *fwnode, const char *propname, unsigned int elem_size, void *val, size_t nval) { struct swnode *swnode = to_swnode(fwnode); return property_entry_read_int_array(swnode->node->properties, propname, elem_size, val, nval); } static int software_node_read_string_array(const struct fwnode_handle *fwnode, const char *propname, const char **val, size_t nval) { struct swnode *swnode = to_swnode(fwnode); return property_entry_read_string_array(swnode->node->properties, propname, val, nval); } static const char * software_node_get_name(const struct fwnode_handle *fwnode) { const struct swnode *swnode = to_swnode(fwnode); return kobject_name(&swnode->kobj); } static const char * software_node_get_name_prefix(const struct fwnode_handle *fwnode) { struct fwnode_handle *parent; const char *prefix; parent = fwnode_get_parent(fwnode); if (!parent) return ""; /* Figure out the prefix from the parents. 
*/ while (is_software_node(parent)) parent = fwnode_get_next_parent(parent); prefix = fwnode_get_name_prefix(parent); fwnode_handle_put(parent); /* Guess something if prefix was NULL. */ return prefix ?: "/"; } static struct fwnode_handle * software_node_get_parent(const struct fwnode_handle *fwnode) { struct swnode *swnode = to_swnode(fwnode); if (!swnode || !swnode->parent) return NULL; return fwnode_handle_get(&swnode->parent->fwnode); } static struct fwnode_handle * software_node_get_next_child(const struct fwnode_handle *fwnode, struct fwnode_handle *child) { struct swnode *p = to_swnode(fwnode); struct swnode *c = to_swnode(child); if (!p || list_empty(&p->children) || (c && list_is_last(&c->entry, &p->children))) { fwnode_handle_put(child); return NULL; } if (c) c = list_next_entry(c, entry); else c = list_first_entry(&p->children, struct swnode, entry); fwnode_handle_put(child); return fwnode_handle_get(&c->fwnode); } static struct fwnode_handle * software_node_get_named_child_node(const struct fwnode_handle *fwnode, const char *childname) { struct swnode *swnode = to_swnode(fwnode); struct swnode *child; if (!swnode || list_empty(&swnode->children)) return NULL; list_for_each_entry(child, &swnode->children, entry) { if (!strcmp(childname, kobject_name(&child->kobj))) { kobject_get(&child->kobj); return &child->fwnode; } } return NULL; } static int software_node_get_reference_args(const struct fwnode_handle *fwnode, const char *propname, const char *nargs_prop, unsigned int nargs, unsigned int index, struct fwnode_reference_args *args) { struct swnode *swnode = to_swnode(fwnode); const struct software_node_ref_args *ref_array; const struct software_node_ref_args *ref; const struct property_entry *prop; struct fwnode_handle *refnode; u32 nargs_prop_val; int error; int i; prop = property_entry_get(swnode->node->properties, propname); if (!prop) return -ENOENT; if (prop->type != DEV_PROP_REF) return -EINVAL; /* * We expect that references are never stored inline, even * single ones, as they are too big. */ if (prop->is_inline) return -EINVAL; if ((index + 1) * sizeof(*ref) > prop->length) return -ENOENT; ref_array = prop->pointer; ref = &ref_array[index]; /* * A software node can reference other software nodes or firmware * nodes (which are the abstraction layer sitting on top of them). * This is done to ensure we can create references to static software * nodes before they're registered with the firmware node framework. * At the time the reference is being resolved, we expect the swnodes * in question to already have been registered and to be backed by * a firmware node. This is why we use the fwnode API below to read the * relevant properties and bump the reference count. 
*/ if (ref->swnode) refnode = software_node_fwnode(ref->swnode); else if (ref->fwnode) refnode = ref->fwnode; else return -EINVAL; if (!refnode) return -ENOTCONN; if (nargs_prop) { error = fwnode_property_read_u32(refnode, nargs_prop, &nargs_prop_val); if (error) return error; nargs = nargs_prop_val; } if (nargs > NR_FWNODE_REFERENCE_ARGS) return -EINVAL; if (!args) return 0; args->fwnode = fwnode_handle_get(refnode); args->nargs = nargs; for (i = 0; i < nargs; i++) args->args[i] = ref->args[i]; return 0; } static struct fwnode_handle * swnode_graph_find_next_port(const struct fwnode_handle *parent, struct fwnode_handle *port) { struct fwnode_handle *old = port; while ((port = software_node_get_next_child(parent, old))) { /* * fwnode ports have naming style "port@", so we search for any * children that follow that convention. */ if (!strncmp(to_swnode(port)->node->name, "port@", strlen("port@"))) return port; old = port; } return NULL; } static struct fwnode_handle * software_node_graph_get_next_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle *endpoint) { struct swnode *swnode = to_swnode(fwnode); struct fwnode_handle *parent; struct fwnode_handle *port; if (!swnode) return NULL; if (endpoint) { port = software_node_get_parent(endpoint); parent = software_node_get_parent(port); } else { parent = software_node_get_named_child_node(fwnode, "ports"); if (!parent) parent = software_node_get(&swnode->fwnode); port = swnode_graph_find_next_port(parent, NULL); } for (; port; port = swnode_graph_find_next_port(parent, port)) { endpoint = software_node_get_next_child(port, endpoint); if (endpoint) { fwnode_handle_put(port); break; } } fwnode_handle_put(parent); return endpoint; } static struct fwnode_handle * software_node_graph_get_remote_endpoint(const struct fwnode_handle *fwnode) { struct swnode *swnode = to_swnode(fwnode); const struct software_node_ref_args *ref; const struct property_entry *prop; if (!swnode) return NULL; prop = property_entry_get(swnode->node->properties, "remote-endpoint"); if (!prop || prop->type != DEV_PROP_REF || prop->is_inline) return NULL; ref = prop->pointer; if (!ref->swnode) return NULL; return software_node_get(software_node_fwnode(ref->swnode)); } static struct fwnode_handle * software_node_graph_get_port_parent(struct fwnode_handle *fwnode) { struct swnode *swnode = to_swnode(fwnode); swnode = swnode->parent; if (swnode && !strcmp(swnode->node->name, "ports")) swnode = swnode->parent; return swnode ? 
software_node_get(&swnode->fwnode) : NULL; } static int software_node_graph_parse_endpoint(const struct fwnode_handle *fwnode, struct fwnode_endpoint *endpoint) { struct swnode *swnode = to_swnode(fwnode); const char *parent_name = swnode->parent->node->name; int ret; if (strlen("port@") >= strlen(parent_name) || strncmp(parent_name, "port@", strlen("port@"))) return -EINVAL; /* Ports have naming style "port@n", we need to select the n */ ret = kstrtou32(parent_name + strlen("port@"), 10, &endpoint->port); if (ret) return ret; endpoint->id = swnode->id; endpoint->local_fwnode = fwnode; return 0; } static const struct fwnode_operations software_node_ops = { .get = software_node_get, .put = software_node_put, .property_present = software_node_property_present, .property_read_bool = software_node_property_present, .property_read_int_array = software_node_read_int_array, .property_read_string_array = software_node_read_string_array, .get_name = software_node_get_name, .get_name_prefix = software_node_get_name_prefix, .get_parent = software_node_get_parent, .get_next_child_node = software_node_get_next_child, .get_named_child_node = software_node_get_named_child_node, .get_reference_args = software_node_get_reference_args, .graph_get_next_endpoint = software_node_graph_get_next_endpoint, .graph_get_remote_endpoint = software_node_graph_get_remote_endpoint, .graph_get_port_parent = software_node_graph_get_port_parent, .graph_parse_endpoint = software_node_graph_parse_endpoint, }; /* -------------------------------------------------------------------------- */ /** * software_node_find_by_name - Find software node by name * @parent: Parent of the software node * @name: Name of the software node * * The function will find a node that is child of @parent and that is named * @name. If no node is found, the function returns NULL. * * NOTE: you will need to drop the reference with fwnode_handle_put() after use. */ const struct software_node * software_node_find_by_name(const struct software_node *parent, const char *name) { struct swnode *swnode = NULL; struct kobject *k; if (!name) return NULL; spin_lock(&swnode_kset->list_lock); list_for_each_entry(k, &swnode_kset->list, entry) { swnode = kobj_to_swnode(k); if (parent == swnode->node->parent && swnode->node->name && !strcmp(name, swnode->node->name)) { kobject_get(&swnode->kobj); break; } swnode = NULL; } spin_unlock(&swnode_kset->list_lock); return swnode ? 
swnode->node : NULL; } EXPORT_SYMBOL_GPL(software_node_find_by_name); static struct software_node *software_node_alloc(const struct property_entry *properties) { struct property_entry *props; struct software_node *node; props = property_entries_dup(properties); if (IS_ERR(props)) return ERR_CAST(props); node = kzalloc_obj(*node); if (!node) { property_entries_free(props); return ERR_PTR(-ENOMEM); } node->properties = props; return node; } static void software_node_free(const struct software_node *node) { property_entries_free(node->properties); kfree(node); } static void software_node_release(struct kobject *kobj) { struct swnode *swnode = kobj_to_swnode(kobj); if (swnode->parent) { ida_free(&swnode->parent->child_ids, swnode->id); list_del(&swnode->entry); } else { ida_free(&swnode_root_ids, swnode->id); } if (swnode->allocated) software_node_free(swnode->node); ida_destroy(&swnode->child_ids); kfree(swnode); } static const struct kobj_type software_node_type = { .release = software_node_release, .sysfs_ops = &kobj_sysfs_ops, }; static struct fwnode_handle * swnode_register(const struct software_node *node, struct swnode *parent, unsigned int allocated) { struct swnode *swnode; int ret; swnode = kzalloc_obj(*swnode); if (!swnode) return ERR_PTR(-ENOMEM); ret = ida_alloc(parent ? &parent->child_ids : &swnode_root_ids, GFP_KERNEL); if (ret < 0) { kfree(swnode); return ERR_PTR(ret); } swnode->id = ret; swnode->node = node; swnode->parent = parent; swnode->kobj.kset = swnode_kset; fwnode_init(&swnode->fwnode, &software_node_ops); ida_init(&swnode->child_ids); INIT_LIST_HEAD(&swnode->entry); INIT_LIST_HEAD(&swnode->children); if (node->name) ret = kobject_init_and_add(&swnode->kobj, &software_node_type, parent ? &parent->kobj : NULL, "%s", node->name); else ret = kobject_init_and_add(&swnode->kobj, &software_node_type, parent ? &parent->kobj : NULL, "node%d", swnode->id); if (ret) { kobject_put(&swnode->kobj); return ERR_PTR(ret); } /* * Assign the flag only in the successful case, so * the above kobject_put() won't mess up with properties. */ swnode->allocated = allocated; if (parent) list_add_tail(&swnode->entry, &parent->children); kobject_uevent(&swnode->kobj, KOBJ_ADD); return &swnode->fwnode; } /** * software_node_register_node_group - Register a group of software nodes * @node_group: NULL terminated array of software node pointers to be registered * * Register multiple software nodes at once. If any node in the array * has its .parent pointer set (which can only be to another software_node), * then its parent **must** have been registered before it is; either outside * of this function or by ordering the array such that parent comes before * child. */ int software_node_register_node_group(const struct software_node * const *node_group) { unsigned int i; int ret; if (!node_group) return 0; for (i = 0; node_group[i]; i++) { ret = software_node_register(node_group[i]); if (ret) { software_node_unregister_node_group(node_group); return ret; } } return 0; } EXPORT_SYMBOL_GPL(software_node_register_node_group); /** * software_node_unregister_node_group - Unregister a group of software nodes * @node_group: NULL terminated array of software node pointers to be unregistered * * Unregister multiple software nodes at once. If parent pointers are set up * in any of the software nodes then the array **must** be ordered such that * parents come before their children. 
* * NOTE: If you are uncertain whether the array is ordered such that * parents will be unregistered before their children, it is wiser to * remove the nodes individually, in the correct order (child before * parent). */ void software_node_unregister_node_group(const struct software_node * const *node_group) { unsigned int i = 0; if (!node_group) return; while (node_group[i]) i++; while (i--) software_node_unregister(node_group[i]); } EXPORT_SYMBOL_GPL(software_node_unregister_node_group); /** * software_node_register - Register static software node * @node: The software node to be registered */ int software_node_register(const struct software_node *node) { struct swnode *parent = software_node_to_swnode(node->parent); if (software_node_to_swnode(node)) return -EEXIST; if (node->parent && !parent) return -EINVAL; return PTR_ERR_OR_ZERO(swnode_register(node, parent, 0)); } EXPORT_SYMBOL_GPL(software_node_register); /** * software_node_unregister - Unregister static software node * @node: The software node to be unregistered */ void software_node_unregister(const struct software_node *node) { struct swnode *swnode; swnode = software_node_to_swnode(node); if (swnode) fwnode_remove_software_node(&swnode->fwnode); } EXPORT_SYMBOL_GPL(software_node_unregister); struct fwnode_handle * fwnode_create_software_node(const struct property_entry *properties, const struct fwnode_handle *parent) { struct fwnode_handle *fwnode; struct software_node *node; struct swnode *p; if (IS_ERR(parent)) return ERR_CAST(parent); p = to_swnode(parent); if (parent && !p) return ERR_PTR(-EINVAL); node = software_node_alloc(properties); if (IS_ERR(node)) return ERR_CAST(node); node->parent = p ? p->node : NULL; fwnode = swnode_register(node, p, 1); if (IS_ERR(fwnode)) software_node_free(node); return fwnode; } EXPORT_SYMBOL_GPL(fwnode_create_software_node); void fwnode_remove_software_node(struct fwnode_handle *fwnode) { struct swnode *swnode = to_swnode(fwnode); if (!swnode) return; kobject_put(&swnode->kobj); } EXPORT_SYMBOL_GPL(fwnode_remove_software_node); /** * device_add_software_node - Assign software node to a device * @dev: The device the software node is meant for. * @node: The software node. * * This function will make @node the secondary firmware node pointer of @dev. If * @dev has no primary node, then @node will become the primary node. The * function will register @node automatically if it wasn't already registered. */ int device_add_software_node(struct device *dev, const struct software_node *node) { struct swnode *swnode; int ret; /* Only one software node per device. */ if (dev_to_swnode(dev)) return -EBUSY; swnode = software_node_to_swnode(node); if (swnode) { kobject_get(&swnode->kobj); } else { ret = software_node_register(node); if (ret) return ret; swnode = software_node_to_swnode(node); } set_secondary_fwnode(dev, &swnode->fwnode); /* * If the device has been fully registered by the time this function is * called, software_node_notify() must be called separately so that the * symlinks get created and the reference count of the node is kept in * balance. */ if (device_is_registered(dev)) software_node_notify(dev); return 0; } EXPORT_SYMBOL_GPL(device_add_software_node); /** * device_remove_software_node - Remove device's software node * @dev: The device with the software node. * * This function will unregister the software node of @dev. 
*/ void device_remove_software_node(struct device *dev) { struct swnode *swnode; swnode = dev_to_swnode(dev); if (!swnode) return; if (device_is_registered(dev)) software_node_notify_remove(dev); set_secondary_fwnode(dev, NULL); kobject_put(&swnode->kobj); } EXPORT_SYMBOL_GPL(device_remove_software_node); /** * device_create_managed_software_node - Create a software node for a device * @dev: The device the software node is assigned to. * @properties: Device properties for the software node. * @parent: Parent of the software node. * * Creates a software node as a managed resource for @dev, which means the * lifetime of the newly created software node is tied to the lifetime of @dev. * Software nodes created with this function should not be reused or shared * because of that. The function takes a deep copy of @properties for the * software node. * * Since the new software node is assigned directly to @dev, and since it should * not be shared, it is not returned to the caller. The function returns 0 on * success, and errno in case of an error. */ int device_create_managed_software_node(struct device *dev, const struct property_entry *properties, const struct software_node *parent) { struct fwnode_handle *p = software_node_fwnode(parent); struct fwnode_handle *fwnode; if (parent && !p) return -EINVAL; fwnode = fwnode_create_software_node(properties, p); if (IS_ERR(fwnode)) return PTR_ERR(fwnode); to_swnode(fwnode)->managed = true; set_secondary_fwnode(dev, fwnode); if (device_is_registered(dev)) software_node_notify(dev); return 0; } EXPORT_SYMBOL_GPL(device_create_managed_software_node); void software_node_notify(struct device *dev) { struct swnode *swnode; int ret; swnode = dev_to_swnode(dev); if (!swnode) return; kobject_get(&swnode->kobj); ret = sysfs_create_link(&dev->kobj, &swnode->kobj, "software_node"); if (ret) return; ret = sysfs_create_link(&swnode->kobj, &dev->kobj, dev_name(dev)); if (ret) { sysfs_remove_link(&dev->kobj, "software_node"); return; } } void software_node_notify_remove(struct device *dev) { struct swnode *swnode; swnode = dev_to_swnode(dev); if (!swnode) return; sysfs_remove_link(&swnode->kobj, dev_name(dev)); sysfs_remove_link(&dev->kobj, "software_node"); kobject_put(&swnode->kobj); if (swnode->managed) { set_secondary_fwnode(dev, NULL); kobject_put(&swnode->kobj); } } void __init software_node_init(void) { swnode_kset = kset_create_and_add("software_nodes", NULL, kernel_kobj); if (!swnode_kset) pr_err("failed to register software nodes\n"); } |
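A hedged usage sketch for the API implemented above (driver-side; example_attach_props() and the property names are hypothetical): build a NULL-terminated property_entry array and hand it to device_create_managed_software_node(), which deep-copies the properties and ties the node's lifetime to the device.

#include <linux/device.h>
#include <linux/property.h>

static int example_attach_props(struct device *dev)
{
	const struct property_entry props[] = {
		PROPERTY_ENTRY_U32("example,clock-frequency", 400000),
		PROPERTY_ENTRY_STRING("example,label", "demo"),
		{ }	/* the properties array must be NULL-terminated */
	};

	/*
	 * The properties are deep-copied, so the on-stack array above need
	 * not outlive this call; the node's lifetime follows @dev.
	 */
	return device_create_managed_software_node(dev, props, NULL);
}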
| 7 7 8 1 8 14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HUGE_MM_H #define _LINUX_HUGE_MM_H #include <linux/mm_types.h> #include <linux/fs.h> /* only for vma_is_dax() */ #include <linux/kobject.h> vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct 
vm_area_struct *dst_vma, struct vm_area_struct *src_vma); bool huge_pmd_set_accessed(struct vm_fault *vmf); int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, unsigned long addr, struct vm_area_struct *vma); #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud); #else static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) { } #endif vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf); bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long next); bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr); int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr); bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd); int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, unsigned long cp_flags); vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn, bool write); vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn, bool write); vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio, bool write); vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, bool write); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_UNSUPPORTED, TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, }; struct kobject; struct kobj_attribute; ssize_t single_hugepage_flag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count, enum transparent_hugepage_flag flag); ssize_t single_hugepage_flag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf, enum transparent_hugepage_flag flag); extern struct kobj_attribute shmem_enabled_attr; extern struct kobj_attribute thpsize_shmem_enabled_attr; /* * Mask of all large folio orders supported for anonymous THP; all orders up to * and including PMD_ORDER, except order-0 (which is not "huge") and order-1 * (which is a limitation of the THP implementation). */ #define THP_ORDERS_ALL_ANON ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1))) /* * Mask of all large folio orders supported for file THP. Folios in a DAX * file is never split and the MAX_PAGECACHE_ORDER limit does not apply to * it. Same to PFNMAPs where there's neither page* nor pagecache. */ #define THP_ORDERS_ALL_SPECIAL_DAX \ (BIT(PMD_ORDER) | BIT(PUD_ORDER)) #define THP_ORDERS_ALL_FILE_DEFAULT \ ((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0)) /* * Mask of all large folio orders supported for THP. */ #define THP_ORDERS_ALL \ (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_SPECIAL_DAX | THP_ORDERS_ALL_FILE_DEFAULT) enum tva_type { TVA_SMAPS, /* Exposing "THPeligible:" in smaps. */ TVA_PAGEFAULT, /* Serving a page fault. */ TVA_KHUGEPAGED, /* Khugepaged collapse. */ TVA_FORCED_COLLAPSE, /* Forced collapse (e.g. MADV_COLLAPSE). 
*/ }; #define thp_vma_allowable_order(vma, vm_flags, type, order) \ (!!thp_vma_allowable_orders(vma, vm_flags, type, BIT(order))) #define split_folio(f) split_folio_to_list(f, NULL) #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES #define HPAGE_PMD_SHIFT PMD_SHIFT #define HPAGE_PUD_SHIFT PUD_SHIFT #else #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PUD_SHIFT ({ BUILD_BUG(); 0; }) #endif #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) #define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1)) #define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT) #define HPAGE_PUD_ORDER (HPAGE_PUD_SHIFT-PAGE_SHIFT) #define HPAGE_PUD_NR (1<<HPAGE_PUD_ORDER) #define HPAGE_PUD_MASK (~(HPAGE_PUD_SIZE - 1)) #define HPAGE_PUD_SIZE ((1UL) << HPAGE_PUD_SHIFT) enum mthp_stat_item { MTHP_STAT_ANON_FAULT_ALLOC, MTHP_STAT_ANON_FAULT_FALLBACK, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, MTHP_STAT_ZSWPOUT, MTHP_STAT_SWPIN, MTHP_STAT_SWPIN_FALLBACK, MTHP_STAT_SWPIN_FALLBACK_CHARGE, MTHP_STAT_SWPOUT, MTHP_STAT_SWPOUT_FALLBACK, MTHP_STAT_SHMEM_ALLOC, MTHP_STAT_SHMEM_FALLBACK, MTHP_STAT_SHMEM_FALLBACK_CHARGE, MTHP_STAT_SPLIT, MTHP_STAT_SPLIT_FAILED, MTHP_STAT_SPLIT_DEFERRED, MTHP_STAT_NR_ANON, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, __MTHP_STAT_COUNT }; #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS) struct mthp_stat { unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT]; }; DECLARE_PER_CPU(struct mthp_stat, mthp_stats); static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta) { if (order <= 0 || order > PMD_ORDER) return; this_cpu_add(mthp_stats.stats[order][item], delta); } static inline void count_mthp_stat(int order, enum mthp_stat_item item) { mod_mthp_stat(order, item, 1); } #else static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta) { } static inline void count_mthp_stat(int order, enum mthp_stat_item item) { } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern unsigned long transparent_hugepage_flags; extern unsigned long huge_anon_orders_always; extern unsigned long huge_anon_orders_madvise; extern unsigned long huge_anon_orders_inherit; static inline bool hugepage_global_enabled(void) { return transparent_hugepage_flags & ((1<<TRANSPARENT_HUGEPAGE_FLAG) | (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)); } static inline bool hugepage_global_always(void) { return transparent_hugepage_flags & (1<<TRANSPARENT_HUGEPAGE_FLAG); } static inline int highest_order(unsigned long orders) { return fls_long(orders) - 1; } static inline int next_order(unsigned long *orders, int prev) { *orders &= ~BIT(prev); return highest_order(*orders); } /* * Do the below checks: * - For file vma, check if the linear page offset of vma is * order-aligned within the file. The hugepage is * guaranteed to be order-aligned within the file, but we must * check that the order-aligned addresses in the VMA map to * order-aligned offsets within the file, else the hugepage will * not be mappable. * - For all vmas, check if the haddr is in an aligned hugepage * area. 
*/ static inline bool thp_vma_suitable_order(struct vm_area_struct *vma, unsigned long addr, int order) { unsigned long hpage_size = PAGE_SIZE << order; unsigned long haddr; /* Don't have to check pgoff for anonymous vma */ if (!vma_is_anonymous(vma)) { if (!IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, hpage_size >> PAGE_SHIFT)) return false; } haddr = ALIGN_DOWN(addr, hpage_size); if (haddr < vma->vm_start || haddr + hpage_size > vma->vm_end) return false; return true; } /* * Filter the bitfield of input orders to the ones suitable for use in the vma. * See thp_vma_suitable_order(). * All orders that pass the checks are returned as a bitfield. */ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, unsigned long addr, unsigned long orders) { int order; /* * Iterate over orders, highest to lowest, removing orders that don't * meet alignment requirements from the set. Exit loop at first order * that meets requirements, since all lower orders must also meet * requirements. */ order = highest_order(orders); while (orders) { if (thp_vma_suitable_order(vma, addr, order)) break; order = next_order(&orders, order); } return orders; } unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, vm_flags_t vm_flags, enum tva_type type, unsigned long orders); /** * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma * @vma: the vm area to check * @vm_flags: use these vm_flags instead of vma->vm_flags * @type: TVA type * @orders: bitfield of all orders to consider * * Calculates the intersection of the requested hugepage orders and the allowed * hugepage orders for the provided vma. Permitted orders are encoded as a set * bit at the corresponding bit position (bit-2 corresponds to order-2, bit-3 * corresponds to order-3, etc). Order-0 is never considered a hugepage order. * * Return: bitfield of orders allowed for hugepage in the vma. 0 if no hugepage * orders are allowed. */ static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, vm_flags_t vm_flags, enum tva_type type, unsigned long orders) { /* * Optimization to check if required orders are enabled early. Only * forced collapse ignores sysfs configs. */ if (type != TVA_FORCED_COLLAPSE && vma_is_anonymous(vma)) { unsigned long mask = READ_ONCE(huge_anon_orders_always); if (vm_flags & VM_HUGEPAGE) mask |= READ_ONCE(huge_anon_orders_madvise); if (hugepage_global_always() || ((vm_flags & VM_HUGEPAGE) && hugepage_global_enabled())) mask |= READ_ONCE(huge_anon_orders_inherit); orders &= mask; if (!orders) return 0; } return __thp_vma_allowable_orders(vma, vm_flags, type, orders); } struct thpsize { struct kobject kobj; struct list_head node; int order; }; #define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj) #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG)) /* * Check whether THPs are explicitly disabled for this VMA, for example, * through madvise or prctl. */ static inline bool vma_thp_disabled(struct vm_area_struct *vma, vm_flags_t vm_flags, bool forced_collapse) { /* Are THPs disabled for this VMA? */ if (vm_flags & VM_NOHUGEPAGE) return true; /* Are THPs disabled for all VMAs in the whole process? */ if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, vma->vm_mm)) return true; /* * Are THPs disabled only for VMAs where we didn't get an explicit * advise to use them? 
*/ if (vm_flags & VM_HUGEPAGE) return false; /* * Forcing a collapse (e.g., madv_collapse), is a clear advice to * use THPs. */ if (forced_collapse) return false; return mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, vma->vm_mm); } static inline bool thp_disabled_by_hw(void) { /* If the hardware/firmware marked hugepage support disabled. */ return transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED); } unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); enum split_type { SPLIT_TYPE_UNIFORM, SPLIT_TYPE_NON_UNIFORM, }; int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order); int folio_split_unmapped(struct folio *folio, unsigned int new_order); unsigned int min_order_for_split(struct folio *folio); int split_folio_to_list(struct folio *folio, struct list_head *list); int folio_check_splittable(struct folio *folio, unsigned int new_order, enum split_type split_type); int folio_split(struct folio *folio, unsigned int new_order, struct page *page, struct list_head *list); static inline int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order) { return __split_huge_page_to_list_to_order(page, list, new_order); } static inline int split_huge_page_to_order(struct page *page, unsigned int new_order) { return split_huge_page_to_list_to_order(page, NULL, new_order); } /** * try_folio_split_to_order() - try to split a @folio at @page to @new_order * using non uniform split. * @folio: folio to be split * @page: split to @new_order at the given page * @new_order: the target split order * * Try to split a @folio at @page using non uniform split to @new_order, if * non uniform split is not supported, fall back to uniform split. After-split * folios are put back to LRU list. Use min_order_for_split() to get the lower * bound of @new_order. * * Return: 0 - split is successful, otherwise split failed. */ static inline int try_folio_split_to_order(struct folio *folio, struct page *page, unsigned int new_order) { if (folio_check_splittable(folio, new_order, SPLIT_TYPE_NON_UNIFORM)) return split_huge_page_to_order(&folio->page, new_order); return folio_split(folio, new_order, page, NULL); } static inline int split_huge_page(struct page *page) { return split_huge_page_to_list_to_order(page, NULL, 0); } void deferred_split_folio(struct folio *folio, bool partially_mapped); #ifdef CONFIG_MEMCG void reparent_deferred_split_queue(struct mem_cgroup *memcg); #endif void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze); /** * pmd_is_huge() - Is this PMD either a huge PMD entry or a software leaf entry? * @pmd: The PMD to check. * * A huge PMD entry is a non-empty entry which is present and marked huge or a * software leaf entry. This check be performed without the appropriate locks * held, in which case the condition should be rechecked after they are * acquired. * * Returns: true if this PMD is huge, false otherwise. */ static inline bool pmd_is_huge(pmd_t pmd) { if (pmd_present(pmd)) { return pmd_trans_huge(pmd); } else if (!pmd_none(pmd)) { /* * Non-present PMDs must be valid huge non-present entries. We * cannot assert that here due to header dependency issues. 
*/ return true; } return false; } #define split_huge_pmd(__vma, __pmd, __address) \ do { \ pmd_t *____pmd = (__pmd); \ if (pmd_is_huge(*____pmd)) \ __split_huge_pmd(__vma, __pmd, __address, \ false); \ } while (0) void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze); void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, unsigned long address); #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD int change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pudp, unsigned long addr, pgprot_t newprot, unsigned long cp_flags); #else static inline int change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pudp, unsigned long addr, pgprot_t newprot, unsigned long cp_flags) { return 0; } #endif #define split_huge_pud(__vma, __pud, __address) \ do { \ pud_t *____pud = (__pud); \ if (pud_trans_huge(*____pud)) \ __split_huge_pud(__vma, __pud, __address); \ } while (0) int hugepage_madvise(struct vm_area_struct *vma, vm_flags_t *vm_flags, int advice); int madvise_collapse(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool *lock_dropped); void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct vm_area_struct *next); spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma); /* mmap_lock must be held on entry */ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { if (pmd_is_huge(*pmd)) return __pmd_trans_huge_lock(pmd, vma); return NULL; } static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) { if (pud_trans_huge(*pud)) return __pud_trans_huge_lock(pud, vma); else return NULL; } /** * folio_test_pmd_mappable - Can we map this folio with a PMD? * @folio: The folio to test * * Return: true - @folio can be mapped, false - @folio cannot be mapped. 
*/ static inline bool folio_test_pmd_mappable(struct folio *folio) { return folio_order(folio) >= HPAGE_PMD_ORDER; } vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf); vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf); extern struct folio *huge_zero_folio; extern unsigned long huge_zero_pfn; static inline bool is_huge_zero_folio(const struct folio *folio) { VM_WARN_ON_ONCE(!folio); return READ_ONCE(huge_zero_folio) == folio; } static inline bool is_huge_zero_pfn(unsigned long pfn) { return READ_ONCE(huge_zero_pfn) == (pfn & ~(HPAGE_PMD_NR - 1)); } static inline bool is_huge_zero_pmd(pmd_t pmd) { return pmd_present(pmd) && is_huge_zero_pfn(pmd_pfn(pmd)); } struct folio *mm_get_huge_zero_folio(struct mm_struct *mm); void mm_put_huge_zero_folio(struct mm_struct *mm); static inline struct folio *get_persistent_huge_zero_folio(void) { if (!IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) return NULL; if (unlikely(!huge_zero_folio)) return NULL; return huge_zero_folio; } static inline bool thp_migration_supported(void) { return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); } void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, bool freeze); bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, struct folio *folio); void map_anon_folio_pmd_nopf(struct folio *folio, pmd_t *pmd, struct vm_area_struct *vma, unsigned long haddr); #else /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline bool folio_test_pmd_mappable(struct folio *folio) { return false; } static inline bool thp_vma_suitable_order(struct vm_area_struct *vma, unsigned long addr, int order) { return false; } static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, unsigned long addr, unsigned long orders) { return 0; } static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, vm_flags_t vm_flags, enum tva_type type, unsigned long orders) { return 0; } #define transparent_hugepage_flags 0UL #define thp_get_unmapped_area NULL static inline unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { return 0; } static inline bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins) { return false; } static inline int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order) { VM_WARN_ON_ONCE_PAGE(1, page); return -EINVAL; } static inline int split_huge_page_to_order(struct page *page, unsigned int new_order) { VM_WARN_ON_ONCE_PAGE(1, page); return -EINVAL; } static inline int split_huge_page(struct page *page) { VM_WARN_ON_ONCE_PAGE(1, page); return -EINVAL; } static inline unsigned int min_order_for_split(struct folio *folio) { VM_WARN_ON_ONCE_FOLIO(1, folio); return 0; } static inline int split_folio_to_list(struct folio *folio, struct list_head *list) { VM_WARN_ON_ONCE_FOLIO(1, folio); return -EINVAL; } static inline int try_folio_split_to_order(struct folio *folio, struct page *page, unsigned int new_order) { VM_WARN_ON_ONCE_FOLIO(1, folio); return -EINVAL; } static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {} static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze) {} static inline void split_huge_pmd_address(struct 
vm_area_struct *vma, unsigned long address, bool freeze) {} static inline void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, bool freeze) {} static inline bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, struct folio *folio) { return false; } #define split_huge_pud(__vma, __pmd, __address) \ do { } while (0) static inline int hugepage_madvise(struct vm_area_struct *vma, vm_flags_t *vm_flags, int advice) { return -EINVAL; } static inline int madvise_collapse(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool *lock_dropped) { return -EINVAL; } static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct vm_area_struct *next) { } static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { return NULL; } static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) { return NULL; } static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) { return 0; } static inline vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf) { return 0; } static inline bool is_huge_zero_folio(const struct folio *folio) { return false; } static inline bool is_huge_zero_pfn(unsigned long pfn) { return false; } static inline bool is_huge_zero_pmd(pmd_t pmd) { return false; } static inline void mm_put_huge_zero_folio(struct mm_struct *mm) { return; } static inline bool thp_migration_supported(void) { return false; } static inline int highest_order(unsigned long orders) { return 0; } static inline int next_order(unsigned long *orders, int prev) { return 0; } static inline void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, unsigned long address) { } static inline int change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pudp, unsigned long addr, pgprot_t newprot, unsigned long cp_flags) { return 0; } static inline struct folio *get_persistent_huge_zero_folio(void) { return NULL; } static inline bool pmd_is_huge(pmd_t pmd) { return false; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline bool is_pmd_order(unsigned int order) { return order == HPAGE_PMD_ORDER; } static inline int split_folio_to_list_to_order(struct folio *folio, struct list_head *list, int new_order) { return split_huge_page_to_list_to_order(&folio->page, list, new_order); } static inline int split_folio_to_order(struct folio *folio, int new_order) { return split_folio_to_list_to_order(folio, NULL, new_order); } /** * largest_zero_folio - Get the largest zero size folio available * * This function shall be used when mm_get_huge_zero_folio() cannot be * used as there is no appropriate mm lifetime to tie the huge zero folio * from the caller. * * Deduce the size of the folio with folio_size instead of assuming the * folio size. * * Return: pointer to PMD sized zero folio if CONFIG_PERSISTENT_HUGE_ZERO_FOLIO * is enabled or a single page sized zero folio */ static inline struct folio *largest_zero_folio(void) { struct folio *folio = get_persistent_huge_zero_folio(); if (folio) return folio; return page_folio(ZERO_PAGE(0)); } #endif /* _LINUX_HUGE_MM_H */ |
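A small sketch (illustrative only; example_pick_order() is hypothetical) of how the helpers declared above combine: policy filtering via thp_vma_allowable_orders(), then the per-address fit check in thp_vma_suitable_orders(), taking the highest order that survives both.

#include <linux/huge_mm.h>
#include <linux/mm.h>

static int example_pick_order(struct vm_area_struct *vma, unsigned long addr)
{
	unsigned long orders;

	/* Orders enabled for this VMA by global/per-size policy... */
	orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT,
					  THP_ORDERS_ALL_ANON);
	/* ...restricted to orders that fit and are aligned at @addr. */
	orders = thp_vma_suitable_orders(vma, addr, orders);

	/* Prefer the largest remaining order; order 0 means no THP. */
	return orders ? highest_order(orders) : 0;
}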
// SPDX-License-Identifier: GPL-2.0+ /* * ext4_jbd2.h * * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 * * Copyright 1998--1999 Red Hat corp --- All Rights Reserved * * Ext4-specific journaling extensions. */ #ifndef _EXT4_JBD2_H #define _EXT4_JBD2_H #include <linux/fs.h> #include <linux/jbd2.h> #include "ext4.h" #define EXT4_JOURNAL(inode) (EXT4_SB((inode)->i_sb)->s_journal) /* Define the number of blocks we need to account to a transaction to * modify one block of data. * * We may have to touch one inode, one bitmap buffer, up to three * indirection blocks, the group and superblock summaries, and the data * block to complete the transaction. * * For extents-enabled fs we may have to allocate and modify up to * 5 levels of tree, data block (for each of these we need bitmap + group * summaries), root which is stored in the inode, sb */ #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ (ext4_has_feature_extents(sb) ? 20U : 8U) /* Extended attribute operations touch at most two data buffers, * two bitmap buffers, and two group summaries, in addition to the inode * and the superblock, which are already accounted for. */ #define EXT4_XATTR_TRANS_BLOCKS 6U /* Define the minimum size for a transaction which modifies data. This * needs to take into account the fact that we may end up modifying two * quota files too (one for the group, one for the user quota). The * superblock only gets updated once, of course, so don't bother * counting that again for the quota updates. */ #define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ EXT4_XATTR_TRANS_BLOCKS - 2 + \ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) /* * Define the number of metadata blocks we need to account to modify data. 
* * This include super block, inode block, quota blocks and xattr blocks */ #define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) /* Define an arbitrary limit for the amount of data we will anticipate * writing to any given transaction. For unbounded transactions such as * write(2) and truncate(2) we can write more than this, but we always * start off at the maximum transaction size and grow the transaction * optimistically as we go. */ #define EXT4_MAX_TRANS_DATA 64U /* We break up a large truncate or write transaction once the handle's * buffer credits gets this low, we need either to extend the * transaction or to start a new one. Reserve enough space here for * inode, bitmap, superblock, group and indirection updates for at least * one block, plus two quota updates. Quota allocations are not * needed. */ #define EXT4_RESERVE_TRANS_BLOCKS 12U /* * Number of credits needed if we need to insert an entry into a * directory. For each new index block, we need 4 blocks (old index * block, new index block, bitmap block, bg summary). For normal * htree directories there are 2 levels; if the largedir feature * enabled it's 3 levels. */ #define EXT4_INDEX_EXTRA_TRANS_BLOCKS 12U #ifdef CONFIG_QUOTA /* Amount of blocks needed for quota update - we know that the structure was * allocated so we need to update only data block */ #define EXT4_QUOTA_TRANS_BLOCKS(sb) ((ext4_quota_capable(sb)) ? 1 : 0) /* Amount of blocks needed for quota insert/delete - we do some block writes * but inode, sb and group updates are done only once */ #define EXT4_QUOTA_INIT_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\ (DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ +3+DQUOT_INIT_REWRITE) : 0) #define EXT4_QUOTA_DEL_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\ (DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ +3+DQUOT_DEL_REWRITE) : 0) #else #define EXT4_QUOTA_TRANS_BLOCKS(sb) 0 #define EXT4_QUOTA_INIT_BLOCKS(sb) 0 #define EXT4_QUOTA_DEL_BLOCKS(sb) 0 #endif #define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) /* * Ext4 handle operation types -- for logging purposes */ #define EXT4_HT_MISC 0 #define EXT4_HT_INODE 1 #define EXT4_HT_WRITE_PAGE 2 #define EXT4_HT_MAP_BLOCKS 3 #define EXT4_HT_DIR 4 #define EXT4_HT_TRUNCATE 5 #define EXT4_HT_QUOTA 6 #define EXT4_HT_RESIZE 7 #define EXT4_HT_MIGRATE 8 #define EXT4_HT_MOVE_EXTENTS 9 #define EXT4_HT_XATTR 10 #define EXT4_HT_EXT_CONVERT 11 #define EXT4_HT_MAX 12 int ext4_mark_iloc_dirty(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc); /* * On success, We end up with an outstanding reference count against * iloc->bh. This _must_ be cleaned up later. */ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc); #define ext4_mark_inode_dirty(__h, __i) \ __ext4_mark_inode_dirty((__h), (__i), __func__, __LINE__) int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode, const char *func, unsigned int line); int ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, struct ext4_iloc *iloc); /* * Wrapper functions with which ext4 calls into JBD. 
*/ int __ext4_journal_get_write_access(const char *where, unsigned int line, handle_t *handle, struct super_block *sb, struct buffer_head *bh, enum ext4_journal_trigger_type trigger_type); int __ext4_forget(const char *where, unsigned int line, handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr); int __ext4_journal_get_create_access(const char *where, unsigned int line, handle_t *handle, struct super_block *sb, struct buffer_head *bh, enum ext4_journal_trigger_type trigger_type); int __ext4_handle_dirty_metadata(const char *where, unsigned int line, handle_t *handle, struct inode *inode, struct buffer_head *bh); #define ext4_journal_get_write_access(handle, sb, bh, trigger_type) \ __ext4_journal_get_write_access(__func__, __LINE__, (handle), (sb), \ (bh), (trigger_type)) #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \ (bh), (block_nr)) #define ext4_journal_get_create_access(handle, sb, bh, trigger_type) \ __ext4_journal_get_create_access(__func__, __LINE__, (handle), (sb), \ (bh), (trigger_type)) #define ext4_handle_dirty_metadata(handle, inode, bh) \ __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \ (bh)) handle_t *__ext4_journal_start_sb(struct inode *inode, struct super_block *sb, unsigned int line, int type, int blocks, int rsv_blocks, int revoke_creds); int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) /* Note: Do not use this for NULL handles. This is only to determine if * a properly allocated handle is using a journal or not. */ static inline int ext4_handle_valid(handle_t *handle) { if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT) return 0; return 1; } static inline void ext4_handle_sync(handle_t *handle) { if (ext4_handle_valid(handle)) handle->h_sync = 1; } static inline int ext4_handle_is_aborted(handle_t *handle) { if (ext4_handle_valid(handle)) return is_handle_aborted(handle); return 0; } static inline int ext4_free_metadata_revoke_credits(struct super_block *sb, int blocks) { /* Freeing each metadata block can result in freeing one cluster */ return blocks * EXT4_SB(sb)->s_cluster_ratio; } static inline int ext4_trans_default_revoke_credits(struct super_block *sb) { return ext4_free_metadata_revoke_credits(sb, 8); } #define ext4_journal_start_sb(sb, type, nblocks) \ __ext4_journal_start_sb(NULL, (sb), __LINE__, (type), (nblocks), 0,\ ext4_trans_default_revoke_credits(sb)) #define ext4_journal_start(inode, type, nblocks) \ __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0, \ ext4_trans_default_revoke_credits((inode)->i_sb)) #define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks)\ __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks),\ ext4_trans_default_revoke_credits((inode)->i_sb)) #define ext4_journal_start_with_revoke(inode, type, blocks, revoke_creds) \ __ext4_journal_start((inode), __LINE__, (type), (blocks), 0, \ (revoke_creds)) static inline handle_t *__ext4_journal_start(struct inode *inode, unsigned int line, int type, int blocks, int rsv_blocks, int revoke_creds) { return __ext4_journal_start_sb(inode, inode->i_sb, line, type, blocks, rsv_blocks, revoke_creds); } #define ext4_journal_stop(handle) \ __ext4_journal_stop(__func__, __LINE__, (handle)) #define ext4_journal_start_reserved(handle, type) \ __ext4_journal_start_reserved((handle), __LINE__, (type)) handle_t 
*__ext4_journal_start_reserved(handle_t *handle, unsigned int line, int type); static inline handle_t *ext4_journal_current_handle(void) { return journal_current_handle(); } static inline int ext4_journal_extend(handle_t *handle, int nblocks, int revoke) { if (ext4_handle_valid(handle)) return jbd2_journal_extend(handle, nblocks, revoke); return 0; } static inline int ext4_journal_restart(handle_t *handle, int nblocks, int revoke) { if (ext4_handle_valid(handle)) return jbd2__journal_restart(handle, nblocks, revoke, GFP_NOFS); return 0; } int __ext4_journal_ensure_credits(handle_t *handle, int check_cred, int extend_cred, int revoke_cred); /* * Ensure @handle has at least @check_creds credits available. If not, * transaction will be extended or restarted to contain at least @extend_cred * credits. Before restarting transaction @fn is executed to allow for cleanup * before the transaction is restarted. * * The return value is < 0 in case of error, 0 in case the handle has enough * credits or transaction extension succeeded, 1 in case transaction had to be * restarted. */ #define ext4_journal_ensure_credits_fn(handle, check_cred, extend_cred, \ revoke_cred, fn) \ ({ \ __label__ __ensure_end; \ int err = __ext4_journal_ensure_credits((handle), (check_cred), \ (extend_cred), (revoke_cred)); \ \ if (err <= 0) \ goto __ensure_end; \ err = (fn); \ if (err < 0) \ goto __ensure_end; \ err = ext4_journal_restart((handle), (extend_cred), (revoke_cred)); \ if (err == 0) \ err = 1; \ __ensure_end: \ err; \ }) /* * Ensure given handle has at least requested amount of credits available, * possibly restarting transaction if needed. We also make sure the transaction * has space for at least ext4_trans_default_revoke_credits(sb) revoke records * as freeing one or two blocks is very common pattern and requesting this is * very cheap. 
*/ static inline int ext4_journal_ensure_credits(handle_t *handle, int credits, int revoke_creds) { return ext4_journal_ensure_credits_fn(handle, credits, credits, revoke_creds, 0); } static inline int ext4_journal_blocks_per_folio(struct inode *inode) { if (EXT4_JOURNAL(inode) != NULL) return jbd2_journal_blocks_per_folio(inode); return 0; } static inline int ext4_journal_force_commit(journal_t *journal) { if (journal) return jbd2_journal_force_commit(journal); return 0; } static inline int ext4_jbd2_inode_add_write(handle_t *handle, struct inode *inode, loff_t start_byte, loff_t length) { if (ext4_handle_valid(handle)) return jbd2_journal_inode_ranged_write(handle, EXT4_I(inode)->jinode, start_byte, length); return 0; } static inline int ext4_jbd2_inode_add_wait(handle_t *handle, struct inode *inode, loff_t start_byte, loff_t length) { if (ext4_handle_valid(handle)) return jbd2_journal_inode_ranged_wait(handle, EXT4_I(inode)->jinode, start_byte, length); return 0; } static inline void ext4_update_inode_fsync_trans(handle_t *handle, struct inode *inode, int datasync) { struct ext4_inode_info *ei = EXT4_I(inode); if (ext4_handle_valid(handle) && !is_handle_aborted(handle)) { ei->i_sync_tid = handle->h_transaction->t_tid; if (datasync) ei->i_datasync_tid = handle->h_transaction->t_tid; } } /* super.c */ int ext4_force_commit(struct super_block *sb); /* * Ext4 inode journal modes */ #define EXT4_INODE_JOURNAL_DATA_MODE 0x01 /* journal data mode */ #define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */ #define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */ int ext4_inode_journal_mode(struct inode *inode); static inline int ext4_should_journal_data(struct inode *inode) { return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE; } static inline int ext4_should_order_data(struct inode *inode) { return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE; } static inline int ext4_should_writeback_data(struct inode *inode) { return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE; } static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks) { if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) return 0; if (!ext4_should_journal_data(inode)) return 0; /* * Data blocks in one extent are contiguous, just account for partial * clusters at extent boundaries */ return blocks + 2*(EXT4_SB(inode->i_sb)->s_cluster_ratio - 1); } /* * This function controls whether or not we should try to go down the * dioread_nolock code paths, which makes it safe to avoid taking * i_rwsem for direct I/O reads. This only works for extent-based * files, and it doesn't work if data journaling is enabled, since the * dioread_nolock code uses b_private to pass information back to the * I/O completion handler, and this conflicts with the jbd's use of * b_private. */ static inline int ext4_should_dioread_nolock(struct inode *inode) { if (!test_opt(inode->i_sb, DIOREAD_NOLOCK)) return 0; if (!S_ISREG(inode->i_mode)) return 0; if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return 0; if (ext4_should_journal_data(inode)) return 0; /* temporary fix to prevent generic/422 test failures */ if (!test_opt(inode->i_sb, DELALLOC)) return 0; return 1; } /* * Pass journal explicitly as it may not be cached in the sbi->s_journal in some * cases */ static inline int ext4_journal_destroy(struct ext4_sb_info *sbi, journal_t *journal) { int err = 0; /* * At this point only two things can be operating on the journal. 
* JBD2 thread performing transaction commit and s_sb_upd_work * issuing sb update through the journal. Once we set * EXT4_JOURNAL_DESTROY, new ext4_handle_error() calls will not * queue s_sb_upd_work and ext4_force_commit() makes sure any * ext4_handle_error() calls from the running transaction commit are * finished. Hence no new s_sb_upd_work can be queued after we * flush it here. */ ext4_set_mount_flag(sbi->s_sb, EXT4_MF_JOURNAL_DESTROY); ext4_force_commit(sbi->s_sb); flush_work(&sbi->s_sb_upd_work); err = jbd2_journal_destroy(journal); sbi->s_journal = NULL; return err; } #endif /* _EXT4_JBD2_H */
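/*
 * Illustrative sketch only, not part of ext4_jbd2.h above: the usual calling
 * pattern for the handle wrappers declared in that header.
 * ext4_example_dirty_bh() is a hypothetical function and error handling is
 * reduced to the essentials.
 */
static int ext4_example_dirty_bh(struct inode *inode, struct buffer_head *bh)
{
        handle_t *handle;
        int err;

        /* One credit: we only intend to dirty a single metadata block. */
        handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
                                            EXT4_JTR_NONE);
        if (!err)
                err = ext4_handle_dirty_metadata(handle, inode, bh);

        ext4_journal_stop(handle);
        return err;
}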
// SPDX-License-Identifier: GPL-2.0 /* * kernel userspace event delivery * * Copyright (C) 2004 Red Hat, Inc. All rights reserved. * Copyright (C) 2004 Novell, Inc. 
All rights reserved. * Copyright (C) 2004 IBM, Inc. All rights reserved. * * Authors: * Robert Love <rml@novell.com> * Kay Sievers <kay.sievers@vrfy.org> * Arjan van de Ven <arjanv@redhat.com> * Greg Kroah-Hartman <greg@kroah.com> */ #include <linux/spinlock.h> #include <linux/string.h> #include <linux/kobject.h> #include <linux/export.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/uidgid.h> #include <linux/uuid.h> #include <linux/ctype.h> #include <net/sock.h> #include <net/netlink.h> #include <net/net_namespace.h> atomic64_t uevent_seqnum; #ifdef CONFIG_UEVENT_HELPER char uevent_helper[UEVENT_HELPER_PATH_LEN] = CONFIG_UEVENT_HELPER_PATH; #endif struct uevent_sock { struct list_head list; struct sock *sk; }; #ifdef CONFIG_NET static LIST_HEAD(uevent_sock_list); /* This lock protects uevent_sock_list */ static DEFINE_MUTEX(uevent_sock_mutex); #endif /* the strings here must match the enum in include/linux/kobject.h */ static const char *kobject_actions[] = { [KOBJ_ADD] = "add", [KOBJ_REMOVE] = "remove", [KOBJ_CHANGE] = "change", [KOBJ_MOVE] = "move", [KOBJ_ONLINE] = "online", [KOBJ_OFFLINE] = "offline", [KOBJ_BIND] = "bind", [KOBJ_UNBIND] = "unbind", }; static int kobject_action_type(const char *buf, size_t count, enum kobject_action *type, const char **args) { enum kobject_action action; size_t count_first; const char *args_start; int ret = -EINVAL; if (count && (buf[count-1] == '\n' || buf[count-1] == '\0')) count--; if (!count) goto out; args_start = strnchr(buf, count, ' '); if (args_start) { count_first = args_start - buf; args_start = args_start + 1; } else count_first = count; for (action = 0; action < ARRAY_SIZE(kobject_actions); action++) { if (strncmp(kobject_actions[action], buf, count_first) != 0) continue; if (kobject_actions[action][count_first] != '\0') continue; if (args) *args = args_start; *type = action; ret = 0; break; } out: return ret; } static const char *action_arg_word_end(const char *buf, const char *buf_end, char delim) { const char *next = buf; while (next <= buf_end && *next != delim) if (!isalnum(*next++)) return NULL; if (next == buf) return NULL; return next; } static int kobject_action_args(const char *buf, size_t count, struct kobj_uevent_env **ret_env) { struct kobj_uevent_env *env = NULL; const char *next, *buf_end, *key; int key_len; int r = -EINVAL; if (count && (buf[count - 1] == '\n' || buf[count - 1] == '\0')) count--; if (!count) return -EINVAL; env = kzalloc_obj(*env); if (!env) return -ENOMEM; /* first arg is UUID */ if (count < UUID_STRING_LEN || !uuid_is_valid(buf) || add_uevent_var(env, "SYNTH_UUID=%.*s", UUID_STRING_LEN, buf)) goto out; /* * the rest are custom environment variables in KEY=VALUE * format with ' ' delimiter between each KEY=VALUE pair */ next = buf + UUID_STRING_LEN; buf_end = buf + count - 1; while (next <= buf_end) { if (*next != ' ') goto out; /* skip the ' ', key must follow */ key = ++next; if (key > buf_end) goto out; buf = next; next = action_arg_word_end(buf, buf_end, '='); if (!next || next > buf_end || *next != '=') goto out; key_len = next - buf; /* skip the '=', value must follow */ if (++next > buf_end) goto out; buf = next; next = action_arg_word_end(buf, buf_end, ' '); if (!next) goto out; if (add_uevent_var(env, "SYNTH_ARG_%.*s=%.*s", key_len, key, (int) (next - buf), buf)) goto out; } r = 0; out: if (r) kfree(env); else *ret_env = env; return r; } /** * kobject_synth_uevent - send synthetic uevent with arguments 
* * @kobj: struct kobject for which synthetic uevent is to be generated * @buf: buffer containing action type and action args, newline is ignored * @count: length of buffer * * Returns 0 if kobject_synthetic_uevent() is completed with success or the * corresponding error when it fails. */ int kobject_synth_uevent(struct kobject *kobj, const char *buf, size_t count) { char *no_uuid_envp[] = { "SYNTH_UUID=0", NULL }; enum kobject_action action; const char *action_args; struct kobj_uevent_env *env; const char *msg = NULL, *devpath; int r; r = kobject_action_type(buf, count, &action, &action_args); if (r) { msg = "unknown uevent action string"; goto out; } if (!action_args) { r = kobject_uevent_env(kobj, action, no_uuid_envp); goto out; } r = kobject_action_args(action_args, count - (action_args - buf), &env); if (r == -EINVAL) { msg = "incorrect uevent action arguments"; goto out; } if (r) goto out; r = kobject_uevent_env(kobj, action, env->envp); kfree(env); out: if (r) { devpath = kobject_get_path(kobj, GFP_KERNEL); pr_warn("synth uevent: %s: %s\n", devpath ?: "unknown device", msg ?: "failed to send uevent"); kfree(devpath); } return r; } #ifdef CONFIG_UEVENT_HELPER static int kobj_usermode_filter(struct kobject *kobj) { const struct kobj_ns_type_operations *ops; ops = kobj_ns_ops(kobj); if (ops) { const struct ns_common *init_ns, *ns; ns = kobj->ktype->namespace(kobj); init_ns = ops->initial_ns(); return ns != init_ns; } return 0; } static int init_uevent_argv(struct kobj_uevent_env *env, const char *subsystem) { int buffer_size = sizeof(env->buf) - env->buflen; int len; len = strscpy(&env->buf[env->buflen], subsystem, buffer_size); if (len < 0) { pr_warn("%s: insufficient buffer space (%u left) for %s\n", __func__, buffer_size, subsystem); return -ENOMEM; } env->argv[0] = uevent_helper; env->argv[1] = &env->buf[env->buflen]; env->argv[2] = NULL; env->buflen += len + 1; return 0; } static void cleanup_uevent_env(struct subprocess_info *info) { kfree(info->data); } #endif #ifdef CONFIG_NET static struct sk_buff *alloc_uevent_skb(struct kobj_uevent_env *env, const char *action_string, const char *devpath) { struct netlink_skb_parms *parms; struct sk_buff *skb = NULL; char *scratch; size_t len; /* allocate message with maximum possible size */ len = strlen(action_string) + strlen(devpath) + 2; skb = alloc_skb(len + env->buflen, GFP_KERNEL); if (!skb) return NULL; /* add header */ scratch = skb_put(skb, len); sprintf(scratch, "%s@%s", action_string, devpath); skb_put_data(skb, env->buf, env->buflen); parms = &NETLINK_CB(skb); parms->creds.uid = GLOBAL_ROOT_UID; parms->creds.gid = GLOBAL_ROOT_GID; parms->dst_group = 1; parms->portid = 0; return skb; } static int uevent_net_broadcast_untagged(struct kobj_uevent_env *env, const char *action_string, const char *devpath) { struct sk_buff *skb = NULL; struct uevent_sock *ue_sk; int retval = 0; /* send netlink message */ mutex_lock(&uevent_sock_mutex); list_for_each_entry(ue_sk, &uevent_sock_list, list) { struct sock *uevent_sock = ue_sk->sk; if (!netlink_has_listeners(uevent_sock, 1)) continue; if (!skb) { retval = -ENOMEM; skb = alloc_uevent_skb(env, action_string, devpath); if (!skb) continue; } retval = netlink_broadcast(uevent_sock, skb_get(skb), 0, 1, GFP_KERNEL); /* ENOBUFS should be handled in userspace */ if (retval == -ENOBUFS || retval == -ESRCH) retval = 0; } mutex_unlock(&uevent_sock_mutex); consume_skb(skb); return retval; } static int uevent_net_broadcast_tagged(struct sock *usk, struct kobj_uevent_env *env, const char *action_string, 
const char *devpath) { struct user_namespace *owning_user_ns = sock_net(usk)->user_ns; struct sk_buff *skb = NULL; int ret = 0; skb = alloc_uevent_skb(env, action_string, devpath); if (!skb) return -ENOMEM; /* fix credentials */ if (owning_user_ns != &init_user_ns) { struct netlink_skb_parms *parms = &NETLINK_CB(skb); kuid_t root_uid; kgid_t root_gid; /* fix uid */ root_uid = make_kuid(owning_user_ns, 0); if (uid_valid(root_uid)) parms->creds.uid = root_uid; /* fix gid */ root_gid = make_kgid(owning_user_ns, 0); if (gid_valid(root_gid)) parms->creds.gid = root_gid; } ret = netlink_broadcast(usk, skb, 0, 1, GFP_KERNEL); /* ENOBUFS should be handled in userspace */ if (ret == -ENOBUFS || ret == -ESRCH) ret = 0; return ret; } #endif static int kobject_uevent_net_broadcast(struct kobject *kobj, struct kobj_uevent_env *env, const char *action_string, const char *devpath) { int ret = 0; #ifdef CONFIG_NET const struct kobj_ns_type_operations *ops; const struct ns_common *ns = NULL; ops = kobj_ns_ops(kobj); if (!ops && kobj->kset) { struct kobject *ksobj = &kobj->kset->kobj; if (ksobj->parent != NULL) ops = kobj_ns_ops(ksobj->parent); } /* kobjects currently only carry network namespace tags and they * are the only tag relevant here since we want to decide which * network namespaces to broadcast the uevent into. */ if (ops && ops->netlink_ns && kobj->ktype->namespace) if (ops->type == KOBJ_NS_TYPE_NET) ns = kobj->ktype->namespace(kobj); if (!ns) ret = uevent_net_broadcast_untagged(env, action_string, devpath); else { const struct net *net = container_of(ns, struct net, ns); ret = uevent_net_broadcast_tagged(net->uevent_sock->sk, env, action_string, devpath); } #endif return ret; } static void zap_modalias_env(struct kobj_uevent_env *env) { static const char modalias_prefix[] = "MODALIAS="; size_t len; int i, j; for (i = 0; i < env->envp_idx;) { if (strncmp(env->envp[i], modalias_prefix, sizeof(modalias_prefix) - 1)) { i++; continue; } len = strlen(env->envp[i]) + 1; if (i != env->envp_idx - 1) { /* @env->envp[] contains pointers to @env->buf[] * with @env->buflen chars, and we are removing * variable MODALIAS here pointed by @env->envp[i] * with length @len as shown below: * * 0 @env->buf[] @env->buflen * --------------------------------------------- * ^ ^ ^ ^ * | |-> @len <-| target block | * @env->envp[0] @env->envp[i] @env->envp[i + 1] * * so the "target block" indicated above is moved * backward by @len, and its right size is * @env->buflen - (@env->envp[i + 1] - @env->envp[0]). */ memmove(env->envp[i], env->envp[i + 1], env->buflen - (env->envp[i + 1] - env->envp[0])); for (j = i; j < env->envp_idx - 1; j++) env->envp[j] = env->envp[j + 1] - len; } env->envp_idx--; env->buflen -= len; } } /** * kobject_uevent_env - send an uevent with environmental data * * @kobj: struct kobject that the action is happening to * @action: action that is happening * @envp_ext: pointer to environmental data * * Returns 0 if kobject_uevent_env() is completed with success or the * corresponding error when it fails. */ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, char *envp_ext[]) { struct kobj_uevent_env *env; const char *action_string = kobject_actions[action]; const char *devpath = NULL; const char *subsystem; struct kobject *top_kobj; struct kset *kset; const struct kset_uevent_ops *uevent_ops; int i = 0; int retval = 0; /* * Mark "remove" event done regardless of result, for some subsystems * do not want to re-trigger "remove" event via automatic cleanup. 
*/ if (action == KOBJ_REMOVE) kobj->state_remove_uevent_sent = 1; pr_debug("kobject: '%s' (%p): %s\n", kobject_name(kobj), kobj, __func__); /* search the kset we belong to */ top_kobj = kobj; while (!top_kobj->kset && top_kobj->parent) top_kobj = top_kobj->parent; if (!top_kobj->kset) { pr_debug("kobject: '%s' (%p): %s: attempted to send uevent " "without kset!\n", kobject_name(kobj), kobj, __func__); return -EINVAL; } kset = top_kobj->kset; uevent_ops = kset->uevent_ops; /* skip the event, if uevent_suppress is set*/ if (kobj->uevent_suppress) { pr_debug("kobject: '%s' (%p): %s: uevent_suppress " "caused the event to drop!\n", kobject_name(kobj), kobj, __func__); return 0; } /* skip the event, if the filter returns zero. */ if (uevent_ops && uevent_ops->filter) if (!uevent_ops->filter(kobj)) { pr_debug("kobject: '%s' (%p): %s: filter function " "caused the event to drop!\n", kobject_name(kobj), kobj, __func__); return 0; } /* originating subsystem */ if (uevent_ops && uevent_ops->name) subsystem = uevent_ops->name(kobj); else subsystem = kobject_name(&kset->kobj); if (!subsystem) { pr_debug("kobject: '%s' (%p): %s: unset subsystem caused the " "event to drop!\n", kobject_name(kobj), kobj, __func__); return 0; } /* environment buffer */ env = kzalloc_obj(struct kobj_uevent_env); if (!env) return -ENOMEM; /* complete object path */ devpath = kobject_get_path(kobj, GFP_KERNEL); if (!devpath) { retval = -ENOENT; goto exit; } /* default keys */ retval = add_uevent_var(env, "ACTION=%s", action_string); if (retval) goto exit; retval = add_uevent_var(env, "DEVPATH=%s", devpath); if (retval) goto exit; retval = add_uevent_var(env, "SUBSYSTEM=%s", subsystem); if (retval) goto exit; /* keys passed in from the caller */ if (envp_ext) { for (i = 0; envp_ext[i]; i++) { retval = add_uevent_var(env, "%s", envp_ext[i]); if (retval) goto exit; } } /* let the kset specific function add its stuff */ if (uevent_ops && uevent_ops->uevent) { retval = uevent_ops->uevent(kobj, env); if (retval) { pr_debug("kobject: '%s' (%p): %s: uevent() returned " "%d\n", kobject_name(kobj), kobj, __func__, retval); goto exit; } } switch (action) { case KOBJ_ADD: /* * Mark "add" event so we can make sure we deliver "remove" * event to userspace during automatic cleanup. If * the object did send an "add" event, "remove" will * automatically generated by the core, if not already done * by the caller. 
*/ kobj->state_add_uevent_sent = 1; break; case KOBJ_UNBIND: zap_modalias_env(env); break; default: break; } /* we will send an event, so request a new sequence number */ retval = add_uevent_var(env, "SEQNUM=%llu", atomic64_inc_return(&uevent_seqnum)); if (retval) goto exit; retval = kobject_uevent_net_broadcast(kobj, env, action_string, devpath); #ifdef CONFIG_UEVENT_HELPER /* call uevent_helper, usually only enabled during early boot */ if (uevent_helper[0] && !kobj_usermode_filter(kobj)) { struct subprocess_info *info; retval = add_uevent_var(env, "HOME=/"); if (retval) goto exit; retval = add_uevent_var(env, "PATH=/sbin:/bin:/usr/sbin:/usr/bin"); if (retval) goto exit; retval = init_uevent_argv(env, subsystem); if (retval) goto exit; retval = -ENOMEM; info = call_usermodehelper_setup(env->argv[0], env->argv, env->envp, GFP_KERNEL, NULL, cleanup_uevent_env, env); if (info) { retval = call_usermodehelper_exec(info, UMH_NO_WAIT); env = NULL; /* freed by cleanup_uevent_env */ } } #endif exit: kfree(devpath); kfree(env); return retval; } EXPORT_SYMBOL_GPL(kobject_uevent_env); /** * kobject_uevent - notify userspace by sending an uevent * * @kobj: struct kobject that the action is happening to * @action: action that is happening * * Returns 0 if kobject_uevent() is completed with success or the * corresponding error when it fails. */ int kobject_uevent(struct kobject *kobj, enum kobject_action action) { return kobject_uevent_env(kobj, action, NULL); } EXPORT_SYMBOL_GPL(kobject_uevent); /** * add_uevent_var - add key value string to the environment buffer * @env: environment buffer structure * @format: printf format for the key=value pair * * Returns 0 if environment variable was added successfully or -ENOMEM * if no space was available. */ int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...) 
{ va_list args; int len; if (env->envp_idx >= ARRAY_SIZE(env->envp)) { WARN(1, KERN_ERR "add_uevent_var: too many keys\n"); return -ENOMEM; } va_start(args, format); len = vsnprintf(&env->buf[env->buflen], sizeof(env->buf) - env->buflen, format, args); va_end(args); if (len >= (sizeof(env->buf) - env->buflen)) { WARN(1, KERN_ERR "add_uevent_var: buffer size too small\n"); return -ENOMEM; } env->envp[env->envp_idx++] = &env->buf[env->buflen]; env->buflen += len + 1; return 0; } EXPORT_SYMBOL_GPL(add_uevent_var); #if defined(CONFIG_NET) static int uevent_net_broadcast(struct sock *usk, struct sk_buff *skb, struct netlink_ext_ack *extack) { /* u64 to chars: 2^64 - 1 = 21 chars */ char buf[sizeof("SEQNUM=") + 21]; struct sk_buff *skbc; int ret; /* bump and prepare sequence number */ ret = snprintf(buf, sizeof(buf), "SEQNUM=%llu", atomic64_inc_return(&uevent_seqnum)); if (ret < 0 || (size_t)ret >= sizeof(buf)) return -ENOMEM; ret++; /* verify message does not overflow */ if ((skb->len + ret) > UEVENT_BUFFER_SIZE) { NL_SET_ERR_MSG(extack, "uevent message too big"); return -EINVAL; } /* copy skb and extend to accommodate sequence number */ skbc = skb_copy_expand(skb, 0, ret, GFP_KERNEL); if (!skbc) return -ENOMEM; /* append sequence number */ skb_put_data(skbc, buf, ret); /* remove msg header */ skb_pull(skbc, NLMSG_HDRLEN); /* set portid 0 to inform userspace message comes from kernel */ NETLINK_CB(skbc).portid = 0; NETLINK_CB(skbc).dst_group = 1; ret = netlink_broadcast(usk, skbc, 0, 1, GFP_KERNEL); /* ENOBUFS should be handled in userspace */ if (ret == -ENOBUFS || ret == -ESRCH) ret = 0; return ret; } static int uevent_net_rcv_skb(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net; int ret; if (!nlmsg_data(nlh)) return -EINVAL; /* * Verify that we are allowed to send messages to the target * network namespace. The caller must have CAP_SYS_ADMIN in the * owning user namespace of the target network namespace. */ net = sock_net(NETLINK_CB(skb).sk); if (!netlink_ns_capable(skb, net->user_ns, CAP_SYS_ADMIN)) { NL_SET_ERR_MSG(extack, "missing CAP_SYS_ADMIN capability"); return -EPERM; } ret = uevent_net_broadcast(net->uevent_sock->sk, skb, extack); return ret; } static void uevent_net_rcv(struct sk_buff *skb) { netlink_rcv_skb(skb, &uevent_net_rcv_skb); } static int uevent_net_init(struct net *net) { struct uevent_sock *ue_sk; struct netlink_kernel_cfg cfg = { .groups = 1, .input = uevent_net_rcv, .flags = NL_CFG_F_NONROOT_RECV }; ue_sk = kzalloc_obj(*ue_sk); if (!ue_sk) return -ENOMEM; ue_sk->sk = netlink_kernel_create(net, NETLINK_KOBJECT_UEVENT, &cfg); if (!ue_sk->sk) { pr_err("kobject_uevent: unable to create netlink socket!\n"); kfree(ue_sk); return -ENODEV; } net->uevent_sock = ue_sk; /* Restrict uevents to initial user namespace. 
*/ if (sock_net(ue_sk->sk)->user_ns == &init_user_ns) { mutex_lock(&uevent_sock_mutex); list_add_tail(&ue_sk->list, &uevent_sock_list); mutex_unlock(&uevent_sock_mutex); } return 0; } static void uevent_net_exit(struct net *net) { struct uevent_sock *ue_sk = net->uevent_sock; if (sock_net(ue_sk->sk)->user_ns == &init_user_ns) { mutex_lock(&uevent_sock_mutex); list_del(&ue_sk->list); mutex_unlock(&uevent_sock_mutex); } netlink_kernel_release(ue_sk->sk); kfree(ue_sk); } static struct pernet_operations uevent_net_ops = { .init = uevent_net_init, .exit = uevent_net_exit, }; static int __init kobject_uevent_init(void) { return register_pernet_subsys(&uevent_net_ops); } postcore_initcall(kobject_uevent_init); #endif #ifdef CONFIG_UEVENT_HELPER static const struct ctl_table uevent_helper_sysctl_table[] = { { .procname = "hotplug", .data = &uevent_helper, .maxlen = UEVENT_HELPER_PATH_LEN, .mode = 0644, .proc_handler = proc_dostring, }, }; static int __init init_uevent_helper_sysctl(void) { register_sysctl_init("kernel", uevent_helper_sysctl_table); return 0; } postcore_initcall(init_uevent_helper_sysctl); #endif |
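/*
 * Illustrative sketch only, not part of kobject_uevent.c above: how a
 * subsystem typically reports a state change with extra environment
 * variables through kobject_uevent_env(). example_notify_error() and the
 * EXAMPLE_* variable names are made up for this example.
 */
static void example_notify_error(struct kobject *kobj, int err)
{
        char errbuf[32];
        char *envp[] = { "EXAMPLE_EVENT=error", errbuf, NULL };

        snprintf(errbuf, sizeof(errbuf), "EXAMPLE_ERRNO=%d", err);
        kobject_uevent_env(kobj, KOBJ_CHANGE, envp);
}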
// SPDX-License-Identifier: GPL-2.0-only /* * kexec.c - kexec_load system call * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/capability.h> #include <linux/mm.h> #include <linux/file.h> #include <linux/security.h> #include <linux/kexec.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/syscalls.h> #include <linux/vmalloc.h> #include <linux/slab.h> #include "kexec_internal.h" static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, unsigned long nr_segments, struct kexec_segment *segments, unsigned long flags) { int ret; struct kimage *image; bool kexec_on_panic = flags & KEXEC_ON_CRASH; #ifdef CONFIG_CRASH_DUMP if (kexec_on_panic) { /* Verify we have a valid entry point */ if ((entry < phys_to_boot_phys(crashk_res.start)) || (entry > phys_to_boot_phys(crashk_res.end))) return -EADDRNOTAVAIL; } #endif /* Allocate and initialize a controlling structure */ image = do_kimage_alloc_init(); if (!image) return -ENOMEM; image->start = entry; image->nr_segments = nr_segments; memcpy(image->segment, segments, nr_segments * sizeof(*segments)); #ifdef CONFIG_CRASH_DUMP if (kexec_on_panic) { /* Enable special crash kernel control page alloc policy. */ image->control_page = crashk_res.start; image->type = KEXEC_TYPE_CRASH; } #endif ret = sanity_check_segment_list(image); if (ret) goto out_free_image; /* * Find a location for the control code buffer, and add it to * the vector of segments so that its pages will also be * counted as destination pages. 
*/ ret = -ENOMEM; image->control_code_page = kimage_alloc_control_pages(image, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { pr_err("Could not allocate control_code_buffer\n"); goto out_free_image; } if (!kexec_on_panic) { image->swap_page = kimage_alloc_control_pages(image, 0); if (!image->swap_page) { pr_err("Could not allocate swap buffer\n"); goto out_free_control_pages; } } *rimage = image; return 0; out_free_control_pages: kimage_free_page_list(&image->control_pages); out_free_image: kfree(image); return ret; } static int do_kexec_load(unsigned long entry, unsigned long nr_segments, struct kexec_segment *segments, unsigned long flags) { struct kimage **dest_image, *image; unsigned long i; int ret; /* * Because we write directly to the reserved memory region when loading * crash kernels we need a serialization here to prevent multiple crash * kernels from attempting to load simultaneously. */ if (!kexec_trylock()) return -EBUSY; #ifdef CONFIG_CRASH_DUMP if (flags & KEXEC_ON_CRASH) { dest_image = &kexec_crash_image; if (kexec_crash_image) arch_kexec_unprotect_crashkres(); } else #endif dest_image = &kexec_image; if (nr_segments == 0) { /* Uninstall image */ kimage_free(xchg(dest_image, NULL)); ret = 0; goto out_unlock; } if (flags & KEXEC_ON_CRASH) { /* * Loading another kernel to switch to if this one * crashes. Free any current crash dump kernel before * we corrupt it. */ kimage_free(xchg(&kexec_crash_image, NULL)); } ret = kimage_alloc_init(&image, entry, nr_segments, segments, flags); if (ret) goto out_unlock; if (flags & KEXEC_PRESERVE_CONTEXT) image->preserve_context = 1; #ifdef CONFIG_CRASH_HOTPLUG if ((flags & KEXEC_ON_CRASH) && arch_crash_hotplug_support(image, flags)) image->hotplug_support = 1; #endif ret = machine_kexec_prepare(image); if (ret) goto out; /* * Some architecture(like S390) may touch the crash memory before * machine_kexec_prepare(), we must copy vmcoreinfo data after it. */ ret = kimage_crash_copy_vmcoreinfo(image); if (ret) goto out; for (i = 0; i < nr_segments; i++) { ret = kimage_load_segment(image, i); if (ret) goto out; } kimage_terminate(image); ret = machine_kexec_post_load(image); if (ret) goto out; /* Install the new kernel and uninstall the old */ image = xchg(dest_image, image); out: #ifdef CONFIG_CRASH_DUMP if ((flags & KEXEC_ON_CRASH) && kexec_crash_image) arch_kexec_protect_crashkres(); #endif kimage_free(image); out_unlock: kexec_unlock(); return ret; } /* * Exec Kernel system call: for obvious reasons only root may call it. * * This call breaks up into three pieces. * - A generic part which loads the new kernel from the current * address space, and very carefully places the data in the * allocated pages. * * - A generic part that interacts with the kernel and tells all of * the devices to shut down. Preventing on-going dmas, and placing * the devices in a consistent state so a later kernel can * reinitialize them. * * - A machine specific part that includes the syscall number * and then copies the image to it's final destination. And * jumps into the image at entry. * * kexec does not sync, or unmount filesystems so if you need * that to happen you need to do that yourself. */ static inline int kexec_load_check(unsigned long nr_segments, unsigned long flags) { int image_type = (flags & KEXEC_ON_CRASH) ? KEXEC_TYPE_CRASH : KEXEC_TYPE_DEFAULT; int result; /* We only trust the superuser with rebooting the system. 
*/ if (!kexec_load_permitted(image_type)) return -EPERM; /* Permit LSMs and IMA to fail the kexec */ result = security_kernel_load_data(LOADING_KEXEC_IMAGE, false); if (result < 0) return result; /* * kexec can be used to circumvent module loading restrictions, so * prevent loading in that case */ result = security_locked_down(LOCKDOWN_KEXEC); if (result) return result; /* * Verify we have a legal set of flags * This leaves us room for future extensions. */ if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK)) return -EINVAL; /* Put an artificial cap on the number * of segments passed to kexec_load. */ if (nr_segments > KEXEC_SEGMENT_MAX) return -EINVAL; return 0; } SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, struct kexec_segment __user *, segments, unsigned long, flags) { struct kexec_segment *ksegments; unsigned long result; result = kexec_load_check(nr_segments, flags); if (result) return result; /* Verify we are on the appropriate architecture */ if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) && ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) return -EINVAL; ksegments = memdup_array_user(segments, nr_segments, sizeof(ksegments[0])); if (IS_ERR(ksegments)) return PTR_ERR(ksegments); result = do_kexec_load(entry, nr_segments, ksegments, flags); kfree(ksegments); return result; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, compat_ulong_t, nr_segments, struct compat_kexec_segment __user *, segments, compat_ulong_t, flags) { struct compat_kexec_segment in; struct kexec_segment *ksegments; unsigned long i, result; result = kexec_load_check(nr_segments, flags); if (result) return result; /* Don't allow clients that don't understand the native * architecture to do anything. */ if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) return -EINVAL; ksegments = kmalloc_objs(ksegments[0], nr_segments); if (!ksegments) return -ENOMEM; for (i = 0; i < nr_segments; i++) { result = copy_from_user(&in, &segments[i], sizeof(in)); if (result) goto fail; ksegments[i].buf = compat_ptr(in.buf); ksegments[i].bufsz = in.bufsz; ksegments[i].mem = in.mem; ksegments[i].memsz = in.memsz; } result = do_kexec_load(entry, nr_segments, ksegments, flags); fail: kfree(ksegments); return result; } #endif |
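/*
 * Illustrative userspace sketch only, not part of kexec.c above: a minimal
 * kexec_load(2) invocation with a single segment, matching the syscall
 * defined in that file. The image buffer, load address and entry point are
 * placeholders; real loaders such as kexec-tools derive them from the kernel
 * image and the architecture's boot protocol.
 */
#include <linux/kexec.h>
#include <sys/syscall.h>
#include <unistd.h>

static long example_kexec_load_one(void *image, unsigned long len,
                                   unsigned long load_addr, unsigned long entry)
{
        struct kexec_segment seg = {
                .buf   = image,
                .bufsz = len,
                .mem   = (void *)load_addr,
                .memsz = len,
        };

        /* nr_segments = 1, default architecture, no KEXEC_ON_CRASH. */
        return syscall(SYS_kexec_load, entry, 1UL, &seg, KEXEC_ARCH_DEFAULT);
}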
// SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/types.h> #include <linux/export.h> #include <linux/init.h> #include <linux/udp.h> #include <linux/tcp.h> #include <linux/icmp.h> #include <linux/icmpv6.h> #include <linux/dccp.h> #include <linux/sctp.h> #include <net/sctp/checksum.h> #include <linux/netfilter.h> #include <net/netfilter/nf_nat.h> #include <linux/ipv6.h> #include <linux/netfilter_ipv6.h> #include <net/checksum.h> #include <net/ip6_checksum.h> #include <net/ip6_route.h> #include <net/xfrm.h> #include <net/ipv6.h> #include <net/pptp.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack.h> #include <linux/netfilter/nfnetlink_conntrack.h> static void nf_csum_update(struct sk_buff *skb, unsigned int iphdroff, __sum16 *check, const struct nf_conntrack_tuple *t, enum nf_nat_manip_type maniptype); static void __udp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, struct udphdr *hdr, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype, bool do_csum) { __be16 *portptr, newport; if (maniptype == NF_NAT_MANIP_SRC) { /* Get rid of src port */ newport = tuple->src.u.udp.port; portptr = &hdr->source; } else { /* Get rid of dst port */ newport = tuple->dst.u.udp.port; portptr = &hdr->dest; } if (do_csum) { nf_csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, false); if (!hdr->check) hdr->check = CSUM_MANGLED_0; } *portptr = newport; } static bool udp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { struct udphdr *hdr; if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct udphdr *)(skb->data + hdroff); __udp_manip_pkt(skb, iphdroff, hdr, tuple, maniptype, !!hdr->check); return true; } static bool sctp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { #ifdef CONFIG_NF_CT_PROTO_SCTP struct sctphdr *hdr; int hdrsize = 8; /* This could be an inner header returned in icmp packet; in such * cases we cannot update the checksum field since it is outside * of the 8 bytes of transport layer headers we are guaranteed. 
*/ if (skb->len >= hdroff + sizeof(*hdr)) hdrsize = sizeof(*hdr); if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct sctphdr *)(skb->data + hdroff); if (maniptype == NF_NAT_MANIP_SRC) { /* Get rid of src port */ hdr->source = tuple->src.u.sctp.port; } else { /* Get rid of dst port */ hdr->dest = tuple->dst.u.sctp.port; } if (hdrsize < sizeof(*hdr)) return true; if (skb->ip_summed != CHECKSUM_PARTIAL) { hdr->checksum = sctp_compute_cksum(skb, hdroff); skb->ip_summed = CHECKSUM_NONE; } #endif return true; } static bool tcp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { struct tcphdr *hdr; __be16 *portptr, newport, oldport; int hdrsize = 8; /* TCP connection tracking guarantees this much */ /* this could be a inner header returned in icmp packet; in such cases we cannot update the checksum field since it is outside of the 8 bytes of transport layer headers we are guaranteed */ if (skb->len >= hdroff + sizeof(struct tcphdr)) hdrsize = sizeof(struct tcphdr); if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct tcphdr *)(skb->data + hdroff); if (maniptype == NF_NAT_MANIP_SRC) { /* Get rid of src port */ newport = tuple->src.u.tcp.port; portptr = &hdr->source; } else { /* Get rid of dst port */ newport = tuple->dst.u.tcp.port; portptr = &hdr->dest; } oldport = *portptr; *portptr = newport; if (hdrsize < sizeof(*hdr)) return true; nf_csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, false); return true; } static bool icmp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { struct icmphdr *hdr; if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct icmphdr *)(skb->data + hdroff); switch (hdr->type) { case ICMP_ECHO: case ICMP_ECHOREPLY: case ICMP_TIMESTAMP: case ICMP_TIMESTAMPREPLY: case ICMP_INFO_REQUEST: case ICMP_INFO_REPLY: case ICMP_ADDRESS: case ICMP_ADDRESSREPLY: break; default: return true; } inet_proto_csum_replace2(&hdr->checksum, skb, hdr->un.echo.id, tuple->src.u.icmp.id, false); hdr->un.echo.id = tuple->src.u.icmp.id; return true; } static bool icmpv6_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { struct icmp6hdr *hdr; if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct icmp6hdr *)(skb->data + hdroff); nf_csum_update(skb, iphdroff, &hdr->icmp6_cksum, tuple, maniptype); if (hdr->icmp6_type == ICMPV6_ECHO_REQUEST || hdr->icmp6_type == ICMPV6_ECHO_REPLY) { inet_proto_csum_replace2(&hdr->icmp6_cksum, skb, hdr->icmp6_identifier, tuple->src.u.icmp.id, false); hdr->icmp6_identifier = tuple->src.u.icmp.id; } return true; } /* manipulate a GRE packet according to maniptype */ static bool gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { #if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE) const struct gre_base_hdr *greh; struct pptp_gre_header *pgreh; /* pgreh includes two optional 32bit fields which are not required * to be there. 
That's where the magic '8' comes from */ if (skb_ensure_writable(skb, hdroff + sizeof(*pgreh) - 8)) return false; greh = (void *)skb->data + hdroff; pgreh = (struct pptp_gre_header *)greh; /* we only have destination manip of a packet, since 'source key' * is not present in the packet itself */ if (maniptype != NF_NAT_MANIP_DST) return true; switch (greh->flags & GRE_VERSION) { case GRE_VERSION_0: /* We do not currently NAT any GREv0 packets. * Try to behave like "nf_nat_proto_unknown" */ break; case GRE_VERSION_1: pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key)); pgreh->call_id = tuple->dst.u.gre.key; break; default: pr_debug("can't nat unknown GRE version\n"); return false; } #endif return true; } static bool l4proto_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { switch (tuple->dst.protonum) { case IPPROTO_TCP: return tcp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_UDP: return udp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_SCTP: return sctp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_ICMP: return icmp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_ICMPV6: return icmpv6_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_GRE: return gre_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); } /* If we don't know protocol -- no error, pass it unmodified. */ return true; } static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, const struct nf_conntrack_tuple *target, enum nf_nat_manip_type maniptype) { struct iphdr *iph; unsigned int hdroff; if (skb_ensure_writable(skb, iphdroff + sizeof(*iph))) return false; iph = (void *)skb->data + iphdroff; hdroff = iphdroff + iph->ihl * 4; if (!l4proto_manip_pkt(skb, iphdroff, hdroff, target, maniptype)) return false; iph = (void *)skb->data + iphdroff; if (maniptype == NF_NAT_MANIP_SRC) { csum_replace4(&iph->check, iph->saddr, target->src.u3.ip); iph->saddr = target->src.u3.ip; } else { csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip); iph->daddr = target->dst.u3.ip; } return true; } static bool nf_nat_ipv6_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, const struct nf_conntrack_tuple *target, enum nf_nat_manip_type maniptype) { #if IS_ENABLED(CONFIG_IPV6) struct ipv6hdr *ipv6h; __be16 frag_off; int hdroff; u8 nexthdr; if (skb_ensure_writable(skb, iphdroff + sizeof(*ipv6h))) return false; ipv6h = (void *)skb->data + iphdroff; nexthdr = ipv6h->nexthdr; hdroff = ipv6_skip_exthdr(skb, iphdroff + sizeof(*ipv6h), &nexthdr, &frag_off); if (hdroff < 0) goto manip_addr; if ((frag_off & htons(~0x7)) == 0 && !l4proto_manip_pkt(skb, iphdroff, hdroff, target, maniptype)) return false; /* must reload, offset might have changed */ ipv6h = (void *)skb->data + iphdroff; manip_addr: if (maniptype == NF_NAT_MANIP_SRC) ipv6h->saddr = target->src.u3.in6; else ipv6h->daddr = target->dst.u3.in6; #endif return true; } unsigned int nf_nat_manip_pkt(struct sk_buff *skb, struct nf_conn *ct, enum nf_nat_manip_type mtype, enum ip_conntrack_dir dir) { struct nf_conntrack_tuple target; /* We are aiming to look like inverse of other direction. 
*/ nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple); switch (target.src.l3num) { case NFPROTO_IPV6: if (nf_nat_ipv6_manip_pkt(skb, 0, &target, mtype)) return NF_ACCEPT; break; case NFPROTO_IPV4: if (nf_nat_ipv4_manip_pkt(skb, 0, &target, mtype)) return NF_ACCEPT; break; default: WARN_ON_ONCE(1); break; } return NF_DROP; } static void nf_nat_ipv4_csum_update(struct sk_buff *skb, unsigned int iphdroff, __sum16 *check, const struct nf_conntrack_tuple *t, enum nf_nat_manip_type maniptype) { struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); __be32 oldip, newip; if (maniptype == NF_NAT_MANIP_SRC) { oldip = iph->saddr; newip = t->src.u3.ip; } else { oldip = iph->daddr; newip = t->dst.u3.ip; } inet_proto_csum_replace4(check, skb, oldip, newip, true); } static void nf_nat_ipv6_csum_update(struct sk_buff *skb, unsigned int iphdroff, __sum16 *check, const struct nf_conntrack_tuple *t, enum nf_nat_manip_type maniptype) { #if IS_ENABLED(CONFIG_IPV6) const struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + iphdroff); const struct in6_addr *oldip, *newip; if (maniptype == NF_NAT_MANIP_SRC) { oldip = &ipv6h->saddr; newip = &t->src.u3.in6; } else { oldip = &ipv6h->daddr; newip = &t->dst.u3.in6; } inet_proto_csum_replace16(check, skb, oldip->s6_addr32, newip->s6_addr32, true); #endif } static void nf_csum_update(struct sk_buff *skb, unsigned int iphdroff, __sum16 *check, const struct nf_conntrack_tuple *t, enum nf_nat_manip_type maniptype) { switch (t->src.l3num) { case NFPROTO_IPV4: nf_nat_ipv4_csum_update(skb, iphdroff, check, t, maniptype); return; case NFPROTO_IPV6: nf_nat_ipv6_csum_update(skb, iphdroff, check, t, maniptype); return; } } static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, u8 proto, void *data, __sum16 *check, int datalen, int oldlen) { if (skb->ip_summed != CHECKSUM_PARTIAL) { const struct iphdr *iph = ip_hdr(skb); skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) + ip_hdrlen(skb); skb->csum_offset = (void *)check - data; *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen, proto, 0); } else { inet_proto_csum_replace2(check, skb, htons(oldlen), htons(datalen), true); } } #if IS_ENABLED(CONFIG_IPV6) static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb, u8 proto, void *data, __sum16 *check, int datalen, int oldlen) { if (skb->ip_summed != CHECKSUM_PARTIAL) { const struct ipv6hdr *ipv6h = ipv6_hdr(skb); skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) + (data - (void *)skb->data); skb->csum_offset = (void *)check - data; *check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, datalen, proto, 0); } else { inet_proto_csum_replace2(check, skb, htons(oldlen), htons(datalen), true); } } #endif void nf_nat_csum_recalc(struct sk_buff *skb, u8 nfproto, u8 proto, void *data, __sum16 *check, int datalen, int oldlen) { switch (nfproto) { case NFPROTO_IPV4: nf_nat_ipv4_csum_recalc(skb, proto, data, check, datalen, oldlen); return; #if IS_ENABLED(CONFIG_IPV6) case NFPROTO_IPV6: nf_nat_ipv6_csum_recalc(skb, proto, data, check, datalen, oldlen); return; #endif } WARN_ON_ONCE(1); } int nf_nat_icmp_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum) { struct { struct icmphdr icmp; struct iphdr ip; } *inside; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); unsigned int hdrlen = ip_hdrlen(skb); struct nf_conntrack_tuple target; unsigned long statusbit; WARN_ON(ctinfo 
!= IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY); if (skb_ensure_writable(skb, hdrlen + sizeof(*inside))) return 0; if (nf_ip_checksum(skb, hooknum, hdrlen, IPPROTO_ICMP)) return 0; inside = (void *)skb->data + hdrlen; if (inside->icmp.type == ICMP_REDIRECT) { if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK) return 0; if (ct->status & IPS_NAT_MASK) return 0; } if (manip == NF_NAT_MANIP_SRC) statusbit = IPS_SRC_NAT; else statusbit = IPS_DST_NAT; /* Invert if this is reply direction */ if (dir == IP_CT_DIR_REPLY) statusbit ^= IPS_NAT_MASK; if (!(ct->status & statusbit)) return 1; if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp), &ct->tuplehash[!dir].tuple, !manip)) return 0; if (skb->ip_summed != CHECKSUM_PARTIAL) { /* Reloading "inside" here since manip_pkt may reallocate */ inside = (void *)skb->data + hdrlen; inside->icmp.checksum = 0; inside->icmp.checksum = csum_fold(skb_checksum(skb, hdrlen, skb->len - hdrlen, 0)); } /* Change outer to look like the reply to an incoming packet */ nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple); target.dst.protonum = IPPROTO_ICMP; if (!nf_nat_ipv4_manip_pkt(skb, 0, &target, manip)) return 0; return 1; } EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); static unsigned int nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; ct = nf_ct_get(skb, &ctinfo); if (!ct) return NF_ACCEPT; if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) { if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, state->hook)) return NF_DROP; else return NF_ACCEPT; } } return nf_nat_inet_fn(priv, skb, state); } static unsigned int nf_nat_ipv4_pre_routing(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { unsigned int ret; __be32 daddr = ip_hdr(skb)->daddr; ret = nf_nat_ipv4_fn(priv, skb, state); if (ret == NF_ACCEPT && daddr != ip_hdr(skb)->daddr) skb_dst_drop(skb); return ret; } #ifdef CONFIG_XFRM static int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) { struct sock *sk = skb->sk; struct dst_entry *dst; unsigned int hh_len; struct flowi fl; int err; err = xfrm_decode_session(net, skb, &fl, family); if (err < 0) return err; dst = skb_dst(skb); if (dst->xfrm) dst = ((struct xfrm_dst *)dst)->route; if (!dst_hold_safe(dst)) return -EHOSTUNREACH; if (sk && !net_eq(net, sock_net(sk))) sk = NULL; dst = xfrm_lookup(net, dst, &fl, sk, 0); if (IS_ERR(dst)) return PTR_ERR(dst); skb_dst_drop(skb); skb_dst_set(skb, dst); /* Change in oif may mean change in hh_len. 
*/ hh_len = skb_dst(skb)->dev->hard_header_len; if (skb_headroom(skb) < hh_len && pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) return -ENOMEM; return 0; } #endif static bool nf_nat_inet_port_was_mangled(const struct sk_buff *skb, __be16 sport) { enum ip_conntrack_info ctinfo; enum ip_conntrack_dir dir; const struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); if (!ct) return false; switch (nf_ct_protonum(ct)) { case IPPROTO_TCP: case IPPROTO_UDP: break; default: return false; } dir = CTINFO2DIR(ctinfo); if (dir != IP_CT_DIR_ORIGINAL) return false; return ct->tuplehash[!dir].tuple.dst.u.all != sport; } static unsigned int nf_nat_ipv4_local_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { __be32 saddr = ip_hdr(skb)->saddr; struct sock *sk = skb->sk; unsigned int ret; ret = nf_nat_ipv4_fn(priv, skb, state); if (ret != NF_ACCEPT || !sk || inet_sk_transparent(sk)) return ret; /* skb has a socket assigned via tcp edemux. We need to check * if nf_nat_ipv4_fn() has mangled the packet in a way that * edemux would not have found this socket. * * This includes both changes to the source address and changes * to the source port, which are both handled by the * nf_nat_ipv4_fn() call above -- long after tcp/udp early demux * might have found a socket for the old (pre-snat) address. */ if (saddr != ip_hdr(skb)->saddr || nf_nat_inet_port_was_mangled(skb, sk->sk_dport)) skb_orphan(skb); /* TCP edemux obtained wrong socket */ return ret; } static unsigned int nf_nat_ipv4_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { #ifdef CONFIG_XFRM const struct nf_conn *ct; enum ip_conntrack_info ctinfo; int err; #endif unsigned int ret; ret = nf_nat_ipv4_fn(priv, skb, state); #ifdef CONFIG_XFRM if (ret != NF_ACCEPT) return ret; if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) return ret; ct = nf_ct_get(skb, &ctinfo); if (ct) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (ct->tuplehash[dir].tuple.src.u3.ip != ct->tuplehash[!dir].tuple.dst.u3.ip || (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && ct->tuplehash[dir].tuple.src.u.all != ct->tuplehash[!dir].tuple.dst.u.all)) { err = nf_xfrm_me_harder(state->net, skb, AF_INET); if (err < 0) ret = NF_DROP_ERR(err); } } #endif return ret; } static unsigned int nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { const struct nf_conn *ct; enum ip_conntrack_info ctinfo; unsigned int ret; int err; ret = nf_nat_ipv4_fn(priv, skb, state); if (ret != NF_ACCEPT) return ret; ct = nf_ct_get(skb, &ctinfo); if (ct) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (ct->tuplehash[dir].tuple.dst.u3.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } #ifdef CONFIG_XFRM else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && ct->tuplehash[dir].tuple.dst.u.all != ct->tuplehash[!dir].tuple.src.u.all) { err = nf_xfrm_me_harder(state->net, skb, AF_INET); if (err < 0) ret = NF_DROP_ERR(err); } #endif } return ret; } static const struct nf_hook_ops nf_nat_ipv4_ops[] = { /* Before packet filtering, change destination */ { .hook = nf_nat_ipv4_pre_routing, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = nf_nat_ipv4_out, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_NAT_SRC, }, /* Before packet filtering, 
change destination */ { .hook = nf_nat_ipv4_local_fn, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = nf_nat_ipv4_local_in, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC, }, }; int nf_nat_ipv4_register_fn(struct net *net, const struct nf_hook_ops *ops) { return nf_nat_register_fn(net, ops->pf, ops, nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv4_register_fn); void nf_nat_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops) { nf_nat_unregister_fn(net, ops->pf, ops, ARRAY_SIZE(nf_nat_ipv4_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv4_unregister_fn); #if IS_ENABLED(CONFIG_IPV6) int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, unsigned int hdrlen) { struct { struct icmp6hdr icmp6; struct ipv6hdr ip6; } *inside; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); struct nf_conntrack_tuple target; unsigned long statusbit; WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY); if (skb_ensure_writable(skb, hdrlen + sizeof(*inside))) return 0; if (nf_ip6_checksum(skb, hooknum, hdrlen, IPPROTO_ICMPV6)) return 0; inside = (void *)skb->data + hdrlen; if (inside->icmp6.icmp6_type == NDISC_REDIRECT) { if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK) return 0; if (ct->status & IPS_NAT_MASK) return 0; } if (manip == NF_NAT_MANIP_SRC) statusbit = IPS_SRC_NAT; else statusbit = IPS_DST_NAT; /* Invert if this is reply direction */ if (dir == IP_CT_DIR_REPLY) statusbit ^= IPS_NAT_MASK; if (!(ct->status & statusbit)) return 1; if (!nf_nat_ipv6_manip_pkt(skb, hdrlen + sizeof(inside->icmp6), &ct->tuplehash[!dir].tuple, !manip)) return 0; if (skb->ip_summed != CHECKSUM_PARTIAL) { struct ipv6hdr *ipv6h = ipv6_hdr(skb); inside = (void *)skb->data + hdrlen; inside->icmp6.icmp6_cksum = 0; inside->icmp6.icmp6_cksum = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len - hdrlen, IPPROTO_ICMPV6, skb_checksum(skb, hdrlen, skb->len - hdrlen, 0)); } nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple); target.dst.protonum = IPPROTO_ICMPV6; if (!nf_nat_ipv6_manip_pkt(skb, 0, &target, manip)) return 0; return 1; } EXPORT_SYMBOL_GPL(nf_nat_icmpv6_reply_translation); static unsigned int nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; __be16 frag_off; int hdrlen; u8 nexthdr; ct = nf_ct_get(skb, &ctinfo); /* Can't track? It's not due to stress, or conntrack would * have dropped it. Hence it's the user's responsibilty to * packet filter it out, or implement conntrack/NAT for that * protocol. 
8) --RR */ if (!ct) return NF_ACCEPT; if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) { nexthdr = ipv6_hdr(skb)->nexthdr; hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo, state->hook, hdrlen)) return NF_DROP; else return NF_ACCEPT; } } return nf_nat_inet_fn(priv, skb, state); } static unsigned int nf_nat_ipv6_local_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct in6_addr saddr = ipv6_hdr(skb)->saddr; struct sock *sk = skb->sk; unsigned int ret; ret = nf_nat_ipv6_fn(priv, skb, state); if (ret != NF_ACCEPT || !sk || inet_sk_transparent(sk)) return ret; /* see nf_nat_ipv4_local_in */ if (ipv6_addr_cmp(&saddr, &ipv6_hdr(skb)->saddr) || nf_nat_inet_port_was_mangled(skb, sk->sk_dport)) skb_orphan(skb); return ret; } static unsigned int nf_nat_ipv6_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { unsigned int ret, verdict; struct in6_addr daddr = ipv6_hdr(skb)->daddr; ret = nf_nat_ipv6_fn(priv, skb, state); verdict = ret & NF_VERDICT_MASK; if (verdict != NF_DROP && verdict != NF_STOLEN && ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) skb_dst_drop(skb); return ret; } static unsigned int nf_nat_ipv6_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { #ifdef CONFIG_XFRM const struct nf_conn *ct; enum ip_conntrack_info ctinfo; int err; #endif unsigned int ret; ret = nf_nat_ipv6_fn(priv, skb, state); #ifdef CONFIG_XFRM if (ret != NF_ACCEPT) return ret; if (IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) return ret; ct = nf_ct_get(skb, &ctinfo); if (ct) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, &ct->tuplehash[!dir].tuple.dst.u3) || (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && ct->tuplehash[dir].tuple.src.u.all != ct->tuplehash[!dir].tuple.dst.u.all)) { err = nf_xfrm_me_harder(state->net, skb, AF_INET6); if (err < 0) ret = NF_DROP_ERR(err); } } #endif return ret; } static unsigned int nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { const struct nf_conn *ct; enum ip_conntrack_info ctinfo; unsigned int ret; int err; ret = nf_nat_ipv6_fn(priv, skb, state); if (ret != NF_ACCEPT) return ret; ct = nf_ct_get(skb, &ctinfo); if (ct) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, &ct->tuplehash[!dir].tuple.src.u3)) { err = nf_ip6_route_me_harder(state->net, state->sk, skb); if (err < 0) ret = NF_DROP_ERR(err); } #ifdef CONFIG_XFRM else if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && ct->tuplehash[dir].tuple.dst.u.all != ct->tuplehash[!dir].tuple.src.u.all) { err = nf_xfrm_me_harder(state->net, skb, AF_INET6); if (err < 0) ret = NF_DROP_ERR(err); } #endif } return ret; } static const struct nf_hook_ops nf_nat_ipv6_ops[] = { /* Before packet filtering, change destination */ { .hook = nf_nat_ipv6_in, .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = nf_nat_ipv6_out, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_NAT_SRC, }, /* Before packet filtering, change destination */ { .hook = nf_nat_ipv6_local_fn, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = 
nf_nat_ipv6_local_in,
		.pf		= NFPROTO_IPV6,
		.hooknum	= NF_INET_LOCAL_IN,
		.priority	= NF_IP6_PRI_NAT_SRC,
	},
};

int nf_nat_ipv6_register_fn(struct net *net, const struct nf_hook_ops *ops)
{
	return nf_nat_register_fn(net, ops->pf, ops, nf_nat_ipv6_ops,
				  ARRAY_SIZE(nf_nat_ipv6_ops));
}
EXPORT_SYMBOL_GPL(nf_nat_ipv6_register_fn);

void nf_nat_ipv6_unregister_fn(struct net *net, const struct nf_hook_ops *ops)
{
	nf_nat_unregister_fn(net, ops->pf, ops, ARRAY_SIZE(nf_nat_ipv6_ops));
}
EXPORT_SYMBOL_GPL(nf_nat_ipv6_unregister_fn);
#endif /* CONFIG_IPV6 */

#if defined(CONFIG_NF_TABLES_INET) && IS_ENABLED(CONFIG_NFT_NAT)
int nf_nat_inet_register_fn(struct net *net, const struct nf_hook_ops *ops)
{
	int ret;

	if (WARN_ON_ONCE(ops->pf != NFPROTO_INET))
		return -EINVAL;

	ret = nf_nat_register_fn(net, NFPROTO_IPV6, ops, nf_nat_ipv6_ops,
				 ARRAY_SIZE(nf_nat_ipv6_ops));
	if (ret)
		return ret;

	ret = nf_nat_register_fn(net, NFPROTO_IPV4, ops, nf_nat_ipv4_ops,
				 ARRAY_SIZE(nf_nat_ipv4_ops));
	if (ret)
		nf_nat_unregister_fn(net, NFPROTO_IPV6, ops,
				     ARRAY_SIZE(nf_nat_ipv6_ops));
	return ret;
}
EXPORT_SYMBOL_GPL(nf_nat_inet_register_fn);

void nf_nat_inet_unregister_fn(struct net *net, const struct nf_hook_ops *ops)
{
	nf_nat_unregister_fn(net, NFPROTO_IPV4, ops, ARRAY_SIZE(nf_nat_ipv4_ops));
	nf_nat_unregister_fn(net, NFPROTO_IPV6, ops, ARRAY_SIZE(nf_nat_ipv6_ops));
}
EXPORT_SYMBOL_GPL(nf_nat_inet_unregister_fn);
#endif /* NFT INET NAT */
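/*
 * Editor's illustrative sketch (not part of the kernel sources above):
 * the NAT mangling helpers lean on incremental Internet-checksum updates
 * (csum_replace4(), inet_proto_csum_replace2()) instead of recomputing
 * the checksum over the whole packet when an address or port is rewritten.
 * Below is a minimal standalone demonstration of that RFC 1624 technique;
 * the helper names csum_fold32() and csum_update16() are hypothetical and
 * chosen only for this example.
 */
#include <stdint.h>
#include <stdio.h>

/* Fold a 32-bit running sum back into 16 bits, re-adding the carries. */
static uint16_t csum_fold32(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

/*
 * RFC 1624, eqn. 3: HC' = ~(~HC + ~m + m').
 * Update a stored header checksum when one 16-bit field changes from
 * @old_val to @new_val, without walking the rest of the packet.
 */
static uint16_t csum_update16(uint16_t check, uint16_t old_val, uint16_t new_val)
{
	uint32_t sum = (uint16_t)~check;

	sum += (uint16_t)~old_val;
	sum += new_val;
	return (uint16_t)~csum_fold32(sum);
}

int main(void)
{
	/* e.g. a 16-bit port field being rewritten by source NAT */
	uint16_t check = 0x1c46;	/* arbitrary example header checksum */
	uint16_t updated = csum_update16(check, 1024, 40000);

	printf("checksum 0x%04x -> 0x%04x\n", check, updated);
	return 0;
}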
1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/read_write.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/slab.h> #include <linux/stat.h> #include <linux/sched/xacct.h> #include <linux/fcntl.h> #include <linux/file.h> #include <linux/uio.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/export.h> #include <linux/syscalls.h> #include <linux/pagemap.h> #include <linux/splice.h> #include <linux/compat.h> #include <linux/mount.h> #include <linux/fs.h> #include <linux/filelock.h> #include "internal.h" #include <linux/uaccess.h> #include <asm/unistd.h> const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .mmap_prepare = generic_file_readonly_mmap_prepare, .splice_read = filemap_splice_read, .setlease = generic_setlease, }; EXPORT_SYMBOL(generic_ro_fops); static inline bool unsigned_offsets(struct file *file) { return file->f_op->fop_flags & FOP_UNSIGNED_OFFSET; } /** * vfs_setpos_cookie - update the file offset for lseek and reset cookie * @file: file structure in question * @offset: file offset to seek to * @maxsize: maximum file size * @cookie: cookie to reset * * Update the file offset to the value specified by @offset if the given * offset is valid and it is not equal to the current file offset and * reset the specified cookie to indicate that a seek happened. * * Return the specified offset on success and -EINVAL on invalid offset. */ static loff_t vfs_setpos_cookie(struct file *file, loff_t offset, loff_t maxsize, u64 *cookie) { if (offset < 0 && !unsigned_offsets(file)) return -EINVAL; if (offset > maxsize) return -EINVAL; if (offset != file->f_pos) { file->f_pos = offset; if (cookie) *cookie = 0; } return offset; } /** * vfs_setpos - update the file offset for lseek * @file: file structure in question * @offset: file offset to seek to * @maxsize: maximum file size * * This is a low-level filesystem helper for updating the file offset to * the value specified by @offset if the given offset is valid and it is * not equal to the current file offset. * * Return the specified offset on success and -EINVAL on invalid offset. */ loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize) { return vfs_setpos_cookie(file, offset, maxsize, NULL); } EXPORT_SYMBOL(vfs_setpos); /** * must_set_pos - check whether f_pos has to be updated * @file: file to seek on * @offset: offset to use * @whence: type of seek operation * @eof: end of file * * Check whether f_pos needs to be updated and update @offset according * to @whence. 
* * Return: 0 if f_pos doesn't need to be updated, 1 if f_pos has to be * updated, and negative error code on failure. */ static int must_set_pos(struct file *file, loff_t *offset, int whence, loff_t eof) { switch (whence) { case SEEK_END: *offset += eof; break; case SEEK_CUR: /* * Here we special-case the lseek(fd, 0, SEEK_CUR) * position-querying operation. Avoid rewriting the "same" * f_pos value back to the file because a concurrent read(), * write() or lseek() might have altered it */ if (*offset == 0) { *offset = file->f_pos; return 0; } break; case SEEK_DATA: /* * In the generic case the entire file is data, so as long as * offset isn't at the end of the file then the offset is data. */ if ((unsigned long long)*offset >= eof) return -ENXIO; break; case SEEK_HOLE: /* * There is a virtual hole at the end of the file, so as long as * offset isn't i_size or larger, return i_size. */ if ((unsigned long long)*offset >= eof) return -ENXIO; *offset = eof; break; } return 1; } /** * generic_file_llseek_size - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @maxsize: max size of this file in file system * @eof: offset used for SEEK_END position * * This is a variant of generic_file_llseek that allows passing in a custom * maximum file size and a custom EOF position, for e.g. hashed directories * * Synchronization: * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms) * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. * read/writes behave like SEEK_SET against seeks. */ loff_t generic_file_llseek_size(struct file *file, loff_t offset, int whence, loff_t maxsize, loff_t eof) { int ret; ret = must_set_pos(file, &offset, whence, eof); if (ret < 0) return ret; if (ret == 0) return offset; if (whence == SEEK_CUR) { /* * If the file requires locking via f_pos_lock we know * that mutual exclusion for SEEK_CUR on the same file * is guaranteed. If the file isn't locked, we take * f_lock to protect against f_pos races with other * SEEK_CURs. */ if (file_seek_cur_needs_f_lock(file)) { guard(spinlock)(&file->f_lock); return vfs_setpos(file, file->f_pos + offset, maxsize); } return vfs_setpos(file, file->f_pos + offset, maxsize); } return vfs_setpos(file, offset, maxsize); } EXPORT_SYMBOL(generic_file_llseek_size); /** * generic_llseek_cookie - versioned llseek implementation * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @cookie: cookie to update * * See generic_file_llseek for a general description and locking assumptions. * * In contrast to generic_file_llseek, this function also resets a * specified cookie to indicate a seek took place. */ loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence, u64 *cookie) { struct inode *inode = file->f_mapping->host; loff_t maxsize = inode->i_sb->s_maxbytes; loff_t eof = i_size_read(inode); int ret; if (WARN_ON_ONCE(!cookie)) return -EINVAL; /* * Require that this is only used for directories that guarantee * synchronization between readdir and seek so that an update to * @cookie is correctly synchronized with concurrent readdir. */ if (WARN_ON_ONCE(!(file->f_mode & FMODE_ATOMIC_POS))) return -EINVAL; ret = must_set_pos(file, &offset, whence, eof); if (ret < 0) return ret; if (ret == 0) return offset; /* No need to hold f_lock because we know that f_pos_lock is held. 
*/ if (whence == SEEK_CUR) return vfs_setpos_cookie(file, file->f_pos + offset, maxsize, cookie); return vfs_setpos_cookie(file, offset, maxsize, cookie); } EXPORT_SYMBOL(generic_llseek_cookie); /** * generic_file_llseek - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * * This is a generic implementation of ->llseek useable for all normal local * filesystems. It just updates the file offset to the value specified by * @offset and @whence. */ loff_t generic_file_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; return generic_file_llseek_size(file, offset, whence, inode->i_sb->s_maxbytes, i_size_read(inode)); } EXPORT_SYMBOL(generic_file_llseek); /** * fixed_size_llseek - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @size: size of the file * */ loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size) { switch (whence) { case SEEK_SET: case SEEK_CUR: case SEEK_END: return generic_file_llseek_size(file, offset, whence, size, size); default: return -EINVAL; } } EXPORT_SYMBOL(fixed_size_llseek); /** * no_seek_end_llseek - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * */ loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence) { switch (whence) { case SEEK_SET: case SEEK_CUR: return generic_file_llseek_size(file, offset, whence, OFFSET_MAX, 0); default: return -EINVAL; } } EXPORT_SYMBOL(no_seek_end_llseek); /** * no_seek_end_llseek_size - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @size: maximal offset allowed * */ loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size) { switch (whence) { case SEEK_SET: case SEEK_CUR: return generic_file_llseek_size(file, offset, whence, size, 0); default: return -EINVAL; } } EXPORT_SYMBOL(no_seek_end_llseek_size); /** * noop_llseek - No Operation Performed llseek implementation * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * * This is an implementation of ->llseek useable for the rare special case when * userspace expects the seek to succeed but the (device) file is actually not * able to perform the seek. In this case you use noop_llseek() instead of * falling back to the default implementation of ->llseek. */ loff_t noop_llseek(struct file *file, loff_t offset, int whence) { return file->f_pos; } EXPORT_SYMBOL(noop_llseek); loff_t default_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file_inode(file); loff_t retval; retval = inode_lock_killable(inode); if (retval) return retval; switch (whence) { case SEEK_END: offset += i_size_read(inode); break; case SEEK_CUR: if (offset == 0) { retval = file->f_pos; goto out; } offset += file->f_pos; break; case SEEK_DATA: /* * In the generic case the entire file is data, so as * long as offset isn't at the end of the file then the * offset is data. */ if (offset >= inode->i_size) { retval = -ENXIO; goto out; } break; case SEEK_HOLE: /* * There is a virtual hole at the end of the file, so * as long as offset isn't i_size or larger, return * i_size. 
*/ if (offset >= inode->i_size) { retval = -ENXIO; goto out; } offset = inode->i_size; break; } retval = -EINVAL; if (offset >= 0 || unsigned_offsets(file)) { if (offset != file->f_pos) file->f_pos = offset; retval = offset; } out: inode_unlock(inode); return retval; } EXPORT_SYMBOL(default_llseek); loff_t vfs_llseek(struct file *file, loff_t offset, int whence) { if (!(file->f_mode & FMODE_LSEEK)) return -ESPIPE; return file->f_op->llseek(file, offset, whence); } EXPORT_SYMBOL(vfs_llseek); static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence) { off_t retval; CLASS(fd_pos, f)(fd); if (fd_empty(f)) return -EBADF; retval = -EINVAL; if (whence <= SEEK_MAX) { loff_t res = vfs_llseek(fd_file(f), offset, whence); retval = res; if (res != (loff_t)retval) retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ } return retval; } SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) { return ksys_lseek(fd, offset, whence); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence) { return ksys_lseek(fd, offset, whence); } #endif #if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \ defined(__ARCH_WANT_SYS_LLSEEK) SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, unsigned long, offset_low, loff_t __user *, result, unsigned int, whence) { int retval; CLASS(fd_pos, f)(fd); loff_t offset; if (fd_empty(f)) return -EBADF; if (whence > SEEK_MAX) return -EINVAL; offset = vfs_llseek(fd_file(f), ((loff_t) offset_high << 32) | offset_low, whence); retval = (int)offset; if (offset >= 0) { retval = -EFAULT; if (!copy_to_user(result, &offset, sizeof(offset))) retval = 0; } return retval; } #endif int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) { int mask = read_write == READ ? MAY_READ : MAY_WRITE; int ret; if (unlikely((ssize_t) count < 0)) return -EINVAL; if (ppos) { loff_t pos = *ppos; if (unlikely(pos < 0)) { if (!unsigned_offsets(file)) return -EINVAL; if (count >= -pos) /* both values are in 0..LLONG_MAX */ return -EOVERFLOW; } else if (unlikely((loff_t) (pos + count) < 0)) { if (!unsigned_offsets(file)) return -EINVAL; } } ret = security_file_permission(file, mask); if (ret) return ret; return fsnotify_file_area_perm(file, mask, ppos, count); } EXPORT_SYMBOL(rw_verify_area); static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_ubuf(&iter, ITER_DEST, buf, len); ret = filp->f_op->read_iter(&kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; return ret; } static int warn_unsupported(struct file *file, const char *op) { pr_warn_ratelimited( "kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n", op, file, current->pid, current->comm); return -EINVAL; } ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { struct kvec iov = { .iov_base = buf, .iov_len = min_t(size_t, count, MAX_RW_COUNT), }; struct kiocb kiocb; struct iov_iter iter; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ))) return -EINVAL; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; /* * Also fail if ->read_iter and ->read are both wired up as that * implies very convoluted semantics. 
*/ if (unlikely(!file->f_op->read_iter || file->f_op->read)) return warn_unsupported(file, "read"); init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; iov_iter_kvec(&iter, ITER_DEST, &iov, 1, iov.iov_len); ret = file->f_op->read_iter(&kiocb, &iter); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; fsnotify_access(file); add_rchar(current, ret); } inc_syscr(current); return ret; } ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { ssize_t ret; ret = rw_verify_area(READ, file, pos, count); if (ret) return ret; return __kernel_read(file, buf, count, pos); } EXPORT_SYMBOL(kernel_read); ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; if (unlikely(!access_ok(buf, count))) return -EFAULT; ret = rw_verify_area(READ, file, pos, count); if (ret) return ret; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; if (file->f_op->read) ret = file->f_op->read(file, buf, count, pos); else if (file->f_op->read_iter) ret = new_sync_read(file, buf, count, pos); else ret = -EINVAL; if (ret > 0) { fsnotify_access(file); add_rchar(current, ret); } inc_syscr(current); return ret; } static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) { struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len); ret = filp->f_op->write_iter(&kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ret > 0 && ppos) *ppos = kiocb.ki_pos; return ret; } /* caller is responsible for file_start_write/file_end_write */ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos) { struct kiocb kiocb; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE))) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; /* * Also fail if ->write_iter and ->write are both wired up as that * implies very convoluted semantics. */ if (unlikely(!file->f_op->write_iter || file->f_op->write)) return warn_unsupported(file, "write"); init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; ret = file->f_op->write_iter(&kiocb, from); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; fsnotify_modify(file); add_wchar(current, ret); } inc_syscw(current); return ret; } /* caller is responsible for file_start_write/file_end_write */ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { struct kvec iov = { .iov_base = (void *)buf, .iov_len = min_t(size_t, count, MAX_RW_COUNT), }; struct iov_iter iter; iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, iov.iov_len); return __kernel_write_iter(file, &iter, pos); } /* * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()", * but autofs is one of the few internal kernel users that actually * wants this _and_ can be built as a module. So we need to export * this symbol for autofs, even though it really isn't appropriate * for any other kernel modules. 
*/ EXPORT_SYMBOL_GPL(__kernel_write); ssize_t kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { ssize_t ret; ret = rw_verify_area(WRITE, file, pos, count); if (ret) return ret; file_start_write(file); ret = __kernel_write(file, buf, count, pos); file_end_write(file); return ret; } EXPORT_SYMBOL(kernel_write); ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (unlikely(!access_ok(buf, count))) return -EFAULT; ret = rw_verify_area(WRITE, file, pos, count); if (ret) return ret; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; file_start_write(file); if (file->f_op->write) ret = file->f_op->write(file, buf, count, pos); else if (file->f_op->write_iter) ret = new_sync_write(file, buf, count, pos); else ret = -EINVAL; if (ret > 0) { fsnotify_modify(file); add_wchar(current, ret); } inc_syscw(current); file_end_write(file); return ret; } /* file_ppos returns &file->f_pos or NULL if file is stream */ static inline loff_t *file_ppos(struct file *file) { return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos; } ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) { CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_read(fd_file(f), buf, count, ppos); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; } return ret; } SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) { return ksys_read(fd, buf, count); } ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count) { CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_write(fd_file(f), buf, count, ppos); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; } return ret; } SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count) { return ksys_write(fd, buf, count); } ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count, loff_t pos) { if (pos < 0) return -EINVAL; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; if (fd_file(f)->f_mode & FMODE_PREAD) return vfs_read(fd_file(f), buf, count, &pos); return -ESPIPE; } SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, size_t, count, loff_t, pos) { return ksys_pread64(fd, buf, count, pos); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64) COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf, size_t, count, compat_arg_u64_dual(pos)) { return ksys_pread64(fd, buf, count, compat_arg_u64_glue(pos)); } #endif ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos) { if (pos < 0) return -EINVAL; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; if (fd_file(f)->f_mode & FMODE_PWRITE) return vfs_write(fd_file(f), buf, count, &pos); return -ESPIPE; } SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, size_t, count, loff_t, pos) { return ksys_pwrite64(fd, buf, count, pos); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64) COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf, size_t, count, compat_arg_u64_dual(pos)) { return ksys_pwrite64(fd, buf, count, compat_arg_u64_glue(pos)); } #endif static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { 
struct kiocb kiocb; ssize_t ret; init_sync_kiocb(&kiocb, filp); ret = kiocb_set_rw_flags(&kiocb, flags, type); if (ret) return ret; kiocb.ki_pos = (ppos ? *ppos : 0); if (type == READ) ret = filp->f_op->read_iter(&kiocb, iter); else ret = filp->f_op->write_iter(&kiocb, iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; return ret; } /* Do it by hand, with file-ops */ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { ssize_t ret = 0; if (flags & ~RWF_HIPRI) return -EOPNOTSUPP; while (iov_iter_count(iter)) { ssize_t nr; if (type == READ) { nr = filp->f_op->read(filp, iter_iov_addr(iter), iter_iov_len(iter), ppos); } else { nr = filp->f_op->write(filp, iter_iov_addr(iter), iter_iov_len(iter), ppos); } if (nr < 0) { if (!ret) ret = nr; break; } ret += nr; if (nr != iter_iov_len(iter)) break; iov_iter_advance(iter, nr); } return ret; } ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->read_iter) return -EINVAL; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; ret = file->f_op->read_iter(iocb, iter); out: if (ret >= 0) fsnotify_access(file); return ret; } EXPORT_SYMBOL(vfs_iocb_iter_read); ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->read_iter) return -EINVAL; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, ppos, tot_len); if (ret < 0) return ret; ret = do_iter_readv_writev(file, iter, ppos, READ, flags); out: if (ret >= 0) fsnotify_access(file); return ret; } EXPORT_SYMBOL(vfs_iter_read); /* * Caller is responsible for calling kiocb_end_write() on completion * if async iocb was queued. 
*/ ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->write_iter) return -EINVAL; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) return 0; ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; kiocb_start_write(iocb); ret = file->f_op->write_iter(iocb, iter); if (ret != -EIOCBQUEUED) kiocb_end_write(iocb); if (ret > 0) fsnotify_modify(file); return ret; } EXPORT_SYMBOL(vfs_iocb_iter_write); ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { size_t tot_len; ssize_t ret; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (!file->f_op->write_iter) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) return 0; ret = rw_verify_area(WRITE, file, ppos, tot_len); if (ret < 0) return ret; file_start_write(file); ret = do_iter_readv_writev(file, iter, ppos, WRITE, flags); if (ret > 0) fsnotify_modify(file); file_end_write(file); return ret; } EXPORT_SYMBOL(vfs_iter_write); static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; size_t tot_len; ssize_t ret = 0; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) return ret; tot_len = iov_iter_count(&iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, pos, tot_len); if (ret < 0) goto out; if (file->f_op->read_iter) ret = do_iter_readv_writev(file, &iter, pos, READ, flags); else ret = do_loop_readv_writev(file, &iter, pos, READ, flags); out: if (ret >= 0) fsnotify_access(file); kfree(iov); return ret; } static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; size_t tot_len; ssize_t ret = 0; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) return ret; tot_len = iov_iter_count(&iter); if (!tot_len) goto out; ret = rw_verify_area(WRITE, file, pos, tot_len); if (ret < 0) goto out; file_start_write(file); if (file->f_op->write_iter) ret = do_iter_readv_writev(file, &iter, pos, WRITE, flags); else ret = do_loop_readv_writev(file, &iter, pos, WRITE, flags); if (ret > 0) fsnotify_modify(file); file_end_write(file); out: kfree(iov); return ret; } static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_readv(fd_file(f), vec, vlen, ppos, flags); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; } if (ret > 0) add_rchar(current, ret); inc_syscr(current); return ret; } static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = 
vfs_writev(fd_file(f), vec, vlen, ppos, flags); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; } if (ret > 0) add_wchar(current, ret); inc_syscw(current); return ret; } static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) { #define HALF_LONG_BITS (BITS_PER_LONG / 2) return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; } static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; CLASS(fd, f)(fd); if (!fd_empty(f)) { ret = -ESPIPE; if (fd_file(f)->f_mode & FMODE_PREAD) ret = vfs_readv(fd_file(f), vec, vlen, &pos, flags); } if (ret > 0) add_rchar(current, ret); inc_syscr(current); return ret; } static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; CLASS(fd, f)(fd); if (!fd_empty(f)) { ret = -ESPIPE; if (fd_file(f)->f_mode & FMODE_PWRITE) ret = vfs_writev(fd_file(f), vec, vlen, &pos, flags); } if (ret > 0) add_wchar(current, ret); inc_syscw(current); return ret; } SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { return do_readv(fd, vec, vlen, 0); } SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { return do_writev(fd, vec, vlen, 0); } SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { loff_t pos = pos_from_hilo(pos_h, pos_l); return do_preadv(fd, vec, vlen, pos, 0); } SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { loff_t pos = pos_from_hilo(pos_h, pos_l); return do_pwritev(fd, vec, vlen, pos, 0); } SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } /* * Various compat syscalls. Note that they all pretend to take a native * iovec - import_iovec will properly treat those as compat_iovecs based on * in_compat_syscall(). 
*/ #ifdef CONFIG_COMPAT #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { return do_preadv(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return do_preadv(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2 COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64 COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { return do_pwritev(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return do_pwritev(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } #endif /* CONFIG_COMPAT */ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, loff_t max) { struct inode *in_inode, *out_inode; struct pipe_inode_info *opipe; loff_t pos; loff_t out_pos; ssize_t retval; int fl; /* * Get input file, and verify that it is ok.. */ CLASS(fd, in)(in_fd); if (fd_empty(in)) return -EBADF; if (!(fd_file(in)->f_mode & FMODE_READ)) return -EBADF; if (!ppos) { pos = fd_file(in)->f_pos; } else { pos = *ppos; if (!(fd_file(in)->f_mode & FMODE_PREAD)) return -ESPIPE; } retval = rw_verify_area(READ, fd_file(in), &pos, count); if (retval < 0) return retval; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; /* * Get output file, and verify that it is ok.. */ CLASS(fd, out)(out_fd); if (fd_empty(out)) return -EBADF; if (!(fd_file(out)->f_mode & FMODE_WRITE)) return -EBADF; in_inode = file_inode(fd_file(in)); out_inode = file_inode(fd_file(out)); out_pos = fd_file(out)->f_pos; if (!max) max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); if (unlikely(pos + count > max)) { if (pos >= max) return -EOVERFLOW; count = max - pos; } fl = 0; #if 0 /* * We need to debate whether we can enable this or not. The * man page documents EAGAIN return for the output at least, * and the application is arguably buggy if it doesn't expect * EAGAIN on a non-blocking file descriptor. 
*/ if (fd_file(in)->f_flags & O_NONBLOCK) fl = SPLICE_F_NONBLOCK; #endif opipe = get_pipe_info(fd_file(out), true); if (!opipe) { retval = rw_verify_area(WRITE, fd_file(out), &out_pos, count); if (retval < 0) return retval; retval = do_splice_direct(fd_file(in), &pos, fd_file(out), &out_pos, count, fl); } else { if (fd_file(out)->f_flags & O_NONBLOCK) fl |= SPLICE_F_NONBLOCK; retval = splice_file_to_pipe(fd_file(in), opipe, &pos, count, fl); } if (retval > 0) { add_rchar(current, retval); add_wchar(current, retval); fsnotify_access(fd_file(in)); fsnotify_modify(fd_file(out)); fd_file(out)->f_pos = out_pos; if (ppos) *ppos = pos; else fd_file(in)->f_pos = pos; } inc_syscr(current); inc_syscw(current); if (pos > max) retval = -EOVERFLOW; return retval; } SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count) { loff_t pos; off_t off; ssize_t ret; if (offset) { if (unlikely(get_user(off, offset))) return -EFAULT; pos = off; ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count) { loff_t pos; ssize_t ret; if (offset) { if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) return -EFAULT; ret = do_sendfile(out_fd, in_fd, &pos, count, 0); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, compat_off_t __user *, offset, compat_size_t, count) { loff_t pos; off_t off; ssize_t ret; if (offset) { if (unlikely(get_user(off, offset))) return -EFAULT; pos = off; ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, compat_loff_t __user *, offset, compat_size_t, count) { loff_t pos; ssize_t ret; if (offset) { if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) return -EFAULT; ret = do_sendfile(out_fd, in_fd, &pos, count, 0); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } #endif /* * Performs necessary checks before doing a file copy * * Can adjust amount of bytes to copy via @req_count argument. * Returns appropriate error code that caller should return or * zero in case the copy should be allowed. */ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t *req_count, unsigned int flags) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); uint64_t count = *req_count; loff_t size_in; int ret; ret = generic_file_rw_checks(file_in, file_out); if (ret) return ret; /* * We allow some filesystems to handle cross sb copy, but passing * a file of the wrong filesystem type to filesystem driver can result * in an attempt to dereference the wrong type of ->private_data, so * avoid doing that until we really have a good reason. * * nfs and cifs define several different file_system_type structures * and several different sets of file_operations, but they all end up * using the same ->copy_file_range() function pointer. 
*/ if (flags & COPY_FILE_SPLICE) { /* cross sb splice is allowed */ } else if (file_out->f_op->copy_file_range) { if (file_in->f_op->copy_file_range != file_out->f_op->copy_file_range) return -EXDEV; } else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) { return -EXDEV; } /* Don't touch certain kinds of inodes */ if (IS_IMMUTABLE(inode_out)) return -EPERM; if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) return -ETXTBSY; /* Ensure offsets don't wrap. */ if (pos_in + count < pos_in || pos_out + count < pos_out) return -EOVERFLOW; /* Shorten the copy to EOF */ size_in = i_size_read(inode_in); if (pos_in >= size_in) count = 0; else count = min(count, size_in - (uint64_t)pos_in); ret = generic_write_check_limits(file_out, pos_out, &count); if (ret) return ret; /* Don't allow overlapped copying within the same file. */ if (inode_in == inode_out && pos_out + count > pos_in && pos_out < pos_in + count) return -EINVAL; *req_count = count; return 0; } /* * copy_file_range() differs from regular file read and write in that it * specifically allows return partial success. When it does so is up to * the copy_file_range method. */ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags) { ssize_t ret; bool splice = flags & COPY_FILE_SPLICE; bool samesb = file_inode(file_in)->i_sb == file_inode(file_out)->i_sb; if (flags & ~COPY_FILE_SPLICE) return -EINVAL; ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len, flags); if (unlikely(ret)) return ret; ret = rw_verify_area(READ, file_in, &pos_in, len); if (unlikely(ret)) return ret; ret = rw_verify_area(WRITE, file_out, &pos_out, len); if (unlikely(ret)) return ret; if (len == 0) return 0; /* * Make sure return value doesn't overflow in 32bit compat mode. Also * limit the size for all cases except when calling ->copy_file_range(). */ if (splice || !file_out->f_op->copy_file_range || in_compat_syscall()) len = min_t(size_t, MAX_RW_COUNT, len); file_start_write(file_out); /* * Cloning is supported by more file systems, so we implement copy on * same sb using clone, but for filesystems where both clone and copy * are supported (e.g. nfs,cifs), we only call the copy method. */ if (!splice && file_out->f_op->copy_file_range) { ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out, len, flags); } else if (!splice && file_in->f_op->remap_file_range && samesb) { ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, len, REMAP_FILE_CAN_SHORTEN); /* fallback to splice */ if (ret <= 0) splice = true; } else if (samesb) { /* Fallback to splice for same sb copy for backward compat */ splice = true; } file_end_write(file_out); if (!splice) goto done; /* * We can get here for same sb copy of filesystems that do not implement * ->copy_file_range() in case filesystem does not support clone or in * case filesystem supports clone but rejected the clone request (e.g. * because it was not block aligned). * * In both cases, fall back to kernel copy so we are able to maintain a * consistent story about which filesystems support copy_file_range() * and which filesystems do not, that will allow userspace tools to * make consistent desicions w.r.t using copy_file_range(). * * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE * for server-side-copy between any two sb. 
* * In any case, we call do_splice_direct() and not splice_file_range(), * without file_start_write() held, to avoid possible deadlocks related * to splicing from input file, while file_start_write() is held on * the output file on a different sb. */ ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, len, 0); done: if (ret > 0) { fsnotify_access(file_in); add_rchar(current, ret); fsnotify_modify(file_out); add_wchar(current, ret); } inc_syscr(current); inc_syscw(current); return ret; } EXPORT_SYMBOL(vfs_copy_file_range); SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags) { loff_t pos_in; loff_t pos_out; ssize_t ret = -EBADF; CLASS(fd, f_in)(fd_in); if (fd_empty(f_in)) return -EBADF; CLASS(fd, f_out)(fd_out); if (fd_empty(f_out)) return -EBADF; if (off_in) { if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) return -EFAULT; } else { pos_in = fd_file(f_in)->f_pos; } if (off_out) { if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) return -EFAULT; } else { pos_out = fd_file(f_out)->f_pos; } if (flags != 0) return -EINVAL; ret = vfs_copy_file_range(fd_file(f_in), pos_in, fd_file(f_out), pos_out, len, flags); if (ret > 0) { pos_in += ret; pos_out += ret; if (off_in) { if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) ret = -EFAULT; } else { fd_file(f_in)->f_pos = pos_in; } if (off_out) { if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) ret = -EFAULT; } else { fd_file(f_out)->f_pos = pos_out; } } return ret; } /* * Don't operate on ranges the page cache doesn't support, and don't exceed the * LFS limits. If pos is under the limit it becomes a short access. If it * exceeds the limit we return -EFBIG. */ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count) { struct inode *inode = file->f_mapping->host; loff_t max_size = inode->i_sb->s_maxbytes; loff_t limit = rlimit(RLIMIT_FSIZE); if (limit != RLIM_INFINITY) { if (pos >= limit) { send_sig(SIGXFSZ, current, 0); return -EFBIG; } *count = min(*count, limit - pos); } if (!(file->f_flags & O_LARGEFILE)) max_size = MAX_NON_LFS; if (unlikely(pos >= max_size)) return -EFBIG; *count = min(*count, max_size - pos); return 0; } EXPORT_SYMBOL_GPL(generic_write_check_limits); /* Like generic_write_checks(), but takes size of write instead of iter. */ int generic_write_checks_count(struct kiocb *iocb, loff_t *count) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; if (IS_SWAPFILE(inode)) return -ETXTBSY; if (!*count) return 0; if (iocb->ki_flags & IOCB_APPEND) iocb->ki_pos = i_size_read(inode); if ((iocb->ki_flags & IOCB_NOWAIT) && !((iocb->ki_flags & IOCB_DIRECT) || (file->f_op->fop_flags & FOP_BUFFER_WASYNC))) return -EINVAL; return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count); } EXPORT_SYMBOL(generic_write_checks_count); /* * Performs necessary checks before doing a write * * Can adjust writing position or amount of bytes to write. * Returns appropriate error code that caller should return or * zero in case that write should be allowed. */ ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) { loff_t count = iov_iter_count(from); int ret; ret = generic_write_checks_count(iocb, &count); if (ret) return ret; iov_iter_truncate(from, count); return iov_iter_count(from); } EXPORT_SYMBOL(generic_write_checks); /* * Performs common checks before doing a file copy/clone * from @file_in to @file_out. 
*/ int generic_file_rw_checks(struct file *file_in, struct file *file_out) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); /* Don't copy dirs, pipes, sockets... */ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; if (!(file_in->f_mode & FMODE_READ) || !(file_out->f_mode & FMODE_WRITE) || (file_out->f_flags & O_APPEND)) return -EBADF; return 0; } int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter) { size_t len = iov_iter_count(iter); if (!iter_is_ubuf(iter)) return -EINVAL; if (!is_power_of_2(len)) return -EINVAL; if (!IS_ALIGNED(iocb->ki_pos, len)) return -EINVAL; if (!(iocb->ki_flags & IOCB_DIRECT)) return -EOPNOTSUPP; return 0; } EXPORT_SYMBOL_GPL(generic_atomic_write_valid); |
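/*
 * Illustrative user-space sketch (not part of the kernel sources above):
 * drives copy_file_range(2) in a loop, since vfs_copy_file_range() is
 * explicitly allowed to return partial success.  Assumes the glibc
 * wrapper (glibc >= 2.27); note that the syscall rejects any nonzero
 * flags from user space with -EINVAL, as seen in the definition above.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        struct stat st;
        int fd_in, fd_out;
        off_t remaining;

        if (argc != 3) {
                fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
                return 1;
        }
        fd_in = open(argv[1], O_RDONLY);
        if (fd_in < 0 || fstat(fd_in, &st) < 0) {
                perror("open/fstat src");
                return 1;
        }
        fd_out = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
        if (fd_out < 0) {
                perror("open dst");
                return 1;
        }
        for (remaining = st.st_size; remaining > 0; ) {
                /* NULL offsets: the kernel advances both f_pos values. */
                ssize_t n = copy_file_range(fd_in, NULL, fd_out, NULL,
                                            remaining, 0);
                if (n < 0) {
                        perror("copy_file_range");
                        return 1;
                }
                if (n == 0)             /* source shrank unexpectedly */
                        break;
                remaining -= n;
        }
        return 0;
}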
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_ATOMIC_H
#define _ASM_X86_ATOMIC_H

#include <linux/compiler.h>
#include <linux/types.h>
#include <asm/alternative.h>
#include <asm/cmpxchg.h>
#include <asm/rmwcc.h>
#include <asm/barrier.h>

/*
 * Atomic operations that C can't guarantee us.  Useful for
 * resource counting etc..
 */

static __always_inline int arch_atomic_read(const atomic_t *v)
{
        /*
         * Note for KASAN: we deliberately don't use READ_ONCE_NOCHECK() here,
         * it's non-inlined function that increases binary size and stack usage.
         */
        return __READ_ONCE((v)->counter);
}

static __always_inline void arch_atomic_set(atomic_t *v, int i)
{
        __WRITE_ONCE(v->counter, i);
}

static __always_inline void arch_atomic_add(int i, atomic_t *v)
{
        asm_inline volatile(LOCK_PREFIX "addl %1, %0"
                            : "+m" (v->counter)
                            : "ir" (i) : "memory");
}

static __always_inline void arch_atomic_sub(int i, atomic_t *v)
{
        asm_inline volatile(LOCK_PREFIX "subl %1, %0"
                            : "+m" (v->counter)
                            : "ir" (i) : "memory");
}

static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v)
{
        return GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, e, "er", i);
}
#define arch_atomic_sub_and_test arch_atomic_sub_and_test

static __always_inline void arch_atomic_inc(atomic_t *v)
{
        asm_inline volatile(LOCK_PREFIX "incl %0"
                            : "+m" (v->counter) :: "memory");
}
#define arch_atomic_inc arch_atomic_inc

static __always_inline void arch_atomic_dec(atomic_t *v)
{
        asm_inline volatile(LOCK_PREFIX "decl %0"
                            : "+m" (v->counter) :: "memory");
}
#define arch_atomic_dec arch_atomic_dec

static __always_inline bool arch_atomic_dec_and_test(atomic_t *v)
{
        return GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, e);
}
#define arch_atomic_dec_and_test arch_atomic_dec_and_test

static __always_inline bool arch_atomic_inc_and_test(atomic_t *v)
{
        return GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, e);
}
#define arch_atomic_inc_and_test arch_atomic_inc_and_test

static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v)
{
        return GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, s, "er", i);
}
#define arch_atomic_add_negative arch_atomic_add_negative

static __always_inline int arch_atomic_add_return(int i, atomic_t *v)
{
        return i + xadd(&v->counter, i);
}
#define arch_atomic_add_return arch_atomic_add_return

#define arch_atomic_sub_return(i, v)    arch_atomic_add_return(-(i), v)

static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v)
{
        return xadd(&v->counter, i);
}
#define arch_atomic_fetch_add arch_atomic_fetch_add

#define arch_atomic_fetch_sub(i, v)     arch_atomic_fetch_add(-(i), v)

static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new)
{
        return arch_cmpxchg(&v->counter, old, new);
}
#define arch_atomic_cmpxchg arch_atomic_cmpxchg

static __always_inline bool arch_atomic_try_cmpxchg(atomic_t *v, int *old, int new)
{
        return arch_try_cmpxchg(&v->counter, old, new);
}
#define arch_atomic_try_cmpxchg
arch_atomic_try_cmpxchg static __always_inline int arch_atomic_xchg(atomic_t *v, int new) { return arch_xchg(&v->counter, new); } #define arch_atomic_xchg arch_atomic_xchg static __always_inline void arch_atomic_and(int i, atomic_t *v) { asm_inline volatile(LOCK_PREFIX "andl %1, %0" : "+m" (v->counter) : "ir" (i) : "memory"); } static __always_inline int arch_atomic_fetch_and(int i, atomic_t *v) { int val = arch_atomic_read(v); do { } while (!arch_atomic_try_cmpxchg(v, &val, val & i)); return val; } #define arch_atomic_fetch_and arch_atomic_fetch_and static __always_inline void arch_atomic_or(int i, atomic_t *v) { asm_inline volatile(LOCK_PREFIX "orl %1, %0" : "+m" (v->counter) : "ir" (i) : "memory"); } static __always_inline int arch_atomic_fetch_or(int i, atomic_t *v) { int val = arch_atomic_read(v); do { } while (!arch_atomic_try_cmpxchg(v, &val, val | i)); return val; } #define arch_atomic_fetch_or arch_atomic_fetch_or static __always_inline void arch_atomic_xor(int i, atomic_t *v) { asm_inline volatile(LOCK_PREFIX "xorl %1, %0" : "+m" (v->counter) : "ir" (i) : "memory"); } static __always_inline int arch_atomic_fetch_xor(int i, atomic_t *v) { int val = arch_atomic_read(v); do { } while (!arch_atomic_try_cmpxchg(v, &val, val ^ i)); return val; } #define arch_atomic_fetch_xor arch_atomic_fetch_xor #ifdef CONFIG_X86_32 # include <asm/atomic64_32.h> #else # include <asm/atomic64_64.h> #endif #endif /* _ASM_X86_ATOMIC_H */ |
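/*
 * Stand-alone illustration (not kernel code) of the same "fetch-op via
 * compare-and-swap loop" pattern used by arch_atomic_fetch_and/or/xor
 * above, expressed with C11 <stdatomic.h> instead of
 * arch_atomic_try_cmpxchg(): on failure the compare-exchange reloads
 * the observed value, so the loop body stays empty.
 */
#include <stdatomic.h>
#include <stdio.h>

static int fetch_and_c11(atomic_int *v, int i)
{
        int val = atomic_load(v);

        while (!atomic_compare_exchange_weak(v, &val, val & i))
                ;               /* 'val' now holds the fresh value; retry */
        return val;             /* old value, like arch_atomic_fetch_and() */
}

int main(void)
{
        atomic_int v = 0xff;
        int old = fetch_and_c11(&v, 0x0f);

        printf("old=%#x new=%#x\n", old, atomic_load(&v));
        return 0;
}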
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * kernfs.h - pseudo filesystem decoupled from vfs locking
 */

#ifndef __LINUX_KERNFS_H
#define __LINUX_KERNFS_H

#include <linux/err.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/idr.h>
#include <linux/lockdep.h>
#include <linux/rbtree.h>
#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/types.h>
#include <linux/uidgid.h>
#include <linux/wait.h>
#include <linux/rwsem.h>
#include <linux/cache.h>

struct file;
struct dentry;
struct iattr;
struct ns_common;
struct seq_file;
struct vm_area_struct;
struct vm_operations_struct;
struct super_block;
struct file_system_type;
struct poll_table_struct;
struct fs_context;

struct kernfs_fs_context;
struct kernfs_open_node;
struct kernfs_iattrs;

/*
 * NR_KERNFS_LOCK_BITS determines size (NR_KERNFS_LOCKS) of hash
 * table of locks.
 * Having a small hash table would impact scalability, since
 * more and more kernfs_node objects will end up using same lock
 * and having a very large hash table would waste memory.
* * At the moment size of hash table of locks is being set based on * the number of CPUs as follows: * * NR_CPU NR_KERNFS_LOCK_BITS NR_KERNFS_LOCKS * 1 1 2 * 2-3 2 4 * 4-7 4 16 * 8-15 6 64 * 16-31 8 256 * 32 and more 10 1024 * * The above relation between NR_CPU and number of locks is based * on some internal experimentation which involved booting qemu * with different values of smp, performing some sysfs operations * on all CPUs and observing how increase in number of locks impacts * completion time of these sysfs operations on each CPU. */ #ifdef CONFIG_SMP #define NR_KERNFS_LOCK_BITS (2 * (ilog2(NR_CPUS < 32 ? NR_CPUS : 32))) #else #define NR_KERNFS_LOCK_BITS 1 #endif #define NR_KERNFS_LOCKS (1 << NR_KERNFS_LOCK_BITS) /* * There's one kernfs_open_file for each open file and one kernfs_open_node * for each kernfs_node with one or more open files. * * filp->private_data points to seq_file whose ->private points to * kernfs_open_file. * * kernfs_open_files are chained at kernfs_open_node->files, which is * protected by kernfs_global_locks.open_file_mutex[i]. * * To reduce possible contention in sysfs access, arising due to single * locks, use an array of locks (e.g. open_file_mutex) and use kernfs_node * object address as hash keys to get the index of these locks. * * Hashed mutexes are safe to use here because operations using these don't * rely on global exclusion. * * In future we intend to replace other global locks with hashed ones as well. * kernfs_global_locks acts as a holder for all such hash tables. */ struct kernfs_global_locks { struct mutex open_file_mutex[NR_KERNFS_LOCKS]; }; enum kernfs_node_type { KERNFS_DIR = 0x0001, KERNFS_FILE = 0x0002, KERNFS_LINK = 0x0004, }; #define KERNFS_TYPE_MASK 0x000f #define KERNFS_FLAG_MASK ~KERNFS_TYPE_MASK enum kernfs_node_flag { KERNFS_ACTIVATED = 0x0010, KERNFS_NS = 0x0020, KERNFS_HAS_SEQ_SHOW = 0x0040, KERNFS_HAS_MMAP = 0x0080, KERNFS_LOCKDEP = 0x0100, KERNFS_HIDDEN = 0x0200, KERNFS_SUICIDAL = 0x0400, KERNFS_SUICIDED = 0x0800, KERNFS_EMPTY_DIR = 0x1000, KERNFS_HAS_RELEASE = 0x2000, KERNFS_REMOVING = 0x4000, }; /* @flags for kernfs_create_root() */ enum kernfs_root_flag { /* * kernfs_nodes are created in the deactivated state and invisible. * They require explicit kernfs_activate() to become visible. This * can be used to make related nodes become visible atomically * after all nodes are created successfully. */ KERNFS_ROOT_CREATE_DEACTIVATED = 0x0001, /* * For regular files, if the opener has CAP_DAC_OVERRIDE, open(2) * succeeds regardless of the RW permissions. sysfs had an extra * layer of enforcement where open(2) fails with -EACCES regardless * of CAP_DAC_OVERRIDE if the permission doesn't have the * respective read or write access at all (none of S_IRUGO or * S_IWUGO) or the respective operation isn't implemented. The * following flag enables that behavior. */ KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK = 0x0002, /* * The filesystem supports exportfs operation, so userspace can use * fhandle to access nodes of the fs. */ KERNFS_ROOT_SUPPORT_EXPORTOP = 0x0004, /* * Support user xattrs to be written to nodes rooted at this root. */ KERNFS_ROOT_SUPPORT_USER_XATTR = 0x0008, /* * Renames must not change the parent node. */ KERNFS_ROOT_INVARIANT_PARENT = 0x0010, }; /* type-specific structures for kernfs_node union members */ struct kernfs_elem_dir { unsigned long subdirs; /* children rbtree starts here and goes through kn->rb */ struct rb_root children; /* * The kernfs hierarchy this directory belongs to. 
This fits * better directly in kernfs_node but is here to save space. */ struct kernfs_root *root; /* * Monotonic revision counter, used to identify if a directory * node has changed during negative dentry revalidation. */ unsigned long rev; }; struct kernfs_elem_symlink { struct kernfs_node *target_kn; }; struct kernfs_elem_attr { const struct kernfs_ops *ops; struct kernfs_open_node __rcu *open; loff_t size; struct kernfs_node *notify_next; /* for kernfs_notify() */ }; /* * kernfs_node - the building block of kernfs hierarchy. Each and every * kernfs node is represented by single kernfs_node. Most fields are * private to kernfs and shouldn't be accessed directly by kernfs users. * * As long as count reference is held, the kernfs_node itself is * accessible. Dereferencing elem or any other outer entity requires * active reference. */ struct kernfs_node { atomic_t count; atomic_t active; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif /* * Use kernfs_get_parent() and kernfs_name/path() instead of * accessing the following two fields directly. If the node is * never moved to a different parent, it is safe to access the * parent directly. */ struct kernfs_node __rcu *__parent; const char __rcu *name; struct rb_node rb; const struct ns_common *ns; /* namespace tag */ unsigned int hash; /* ns + name hash */ unsigned short flags; umode_t mode; union { struct kernfs_elem_dir dir; struct kernfs_elem_symlink symlink; struct kernfs_elem_attr attr; }; /* * 64bit unique ID. On 64bit ino setups, id is the ino. On 32bit, * the low 32bits are ino and upper generation. */ u64 id; void *priv; struct kernfs_iattrs *iattr; struct rcu_head rcu; }; /* * kernfs_syscall_ops may be specified on kernfs_create_root() to support * syscalls. These optional callbacks are invoked on the matching syscalls * and can perform any kernfs operations which don't necessarily have to be * the exact operation requested. An active reference is held for each * kernfs_node parameter. */ struct kernfs_syscall_ops { int (*show_options)(struct seq_file *sf, struct kernfs_root *root); int (*mkdir)(struct kernfs_node *parent, const char *name, umode_t mode); int (*rmdir)(struct kernfs_node *kn); int (*rename)(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name); int (*show_path)(struct seq_file *sf, struct kernfs_node *kn, struct kernfs_root *root); }; struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root); struct kernfs_open_file { /* published fields */ struct kernfs_node *kn; struct file *file; struct seq_file *seq_file; void *priv; /* private fields, do not use outside kernfs proper */ struct mutex mutex; struct mutex prealloc_mutex; int event; struct list_head list; char *prealloc_buf; size_t atomic_write_len; bool mmapped:1; bool released:1; const struct vm_operations_struct *vm_ops; }; struct kernfs_ops { /* * Optional open/release methods. Both are called with * @of->seq_file populated. */ int (*open)(struct kernfs_open_file *of); void (*release)(struct kernfs_open_file *of); /* * Read is handled by either seq_file or raw_read(). * * If seq_show() is present, seq_file path is active. Other seq * operations are optional and if not implemented, the behavior is * equivalent to single_open(). @sf->private points to the * associated kernfs_open_file. * * read() is bounced through kernel buffer and a read larger than * PAGE_SIZE results in partial operation of PAGE_SIZE. 
*/ int (*seq_show)(struct seq_file *sf, void *v); void *(*seq_start)(struct seq_file *sf, loff_t *ppos); void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); void (*seq_stop)(struct seq_file *sf, void *v); ssize_t (*read)(struct kernfs_open_file *of, char *buf, size_t bytes, loff_t off); /* * write() is bounced through kernel buffer. If atomic_write_len * is not set, a write larger than PAGE_SIZE results in partial * operations of PAGE_SIZE chunks. If atomic_write_len is set, * writes upto the specified size are executed atomically but * larger ones are rejected with -E2BIG. */ size_t atomic_write_len; /* * "prealloc" causes a buffer to be allocated at open for * all read/write requests. As ->seq_show uses seq_read() * which does its own allocation, it is incompatible with * ->prealloc. Provide ->read and ->write with ->prealloc. */ bool prealloc; ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes, loff_t off); __poll_t (*poll)(struct kernfs_open_file *of, struct poll_table_struct *pt); int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma); loff_t (*llseek)(struct kernfs_open_file *of, loff_t offset, int whence); }; /* * The kernfs superblock creation/mount parameter context. */ struct kernfs_fs_context { struct kernfs_root *root; /* Root of the hierarchy being mounted */ struct ns_common *ns_tag; /* Namespace tag of the mount (or NULL) */ unsigned long magic; /* File system specific magic number */ /* The following are set/used by kernfs_mount() */ bool new_sb_created; /* Set to T if we allocated a new sb */ }; #ifdef CONFIG_KERNFS static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) { return kn->flags & KERNFS_TYPE_MASK; } static inline ino_t kernfs_id_ino(u64 id) { /* id is ino if ino_t is 64bit; otherwise, low 32bits */ if (sizeof(ino_t) >= sizeof(u64)) return id; else return (u32)id; } static inline u32 kernfs_id_gen(u64 id) { /* gen is fixed at 1 if ino_t is 64bit; otherwise, high 32bits */ if (sizeof(ino_t) >= sizeof(u64)) return 1; else return id >> 32; } static inline ino_t kernfs_ino(struct kernfs_node *kn) { return kernfs_id_ino(kn->id); } static inline ino_t kernfs_gen(struct kernfs_node *kn) { return kernfs_id_gen(kn->id); } /** * kernfs_enable_ns - enable namespace under a directory * @kn: directory of interest, should be empty * * This is to be called right after @kn is created to enable namespace * under it. All children of @kn must have non-NULL namespace tags and * only the ones which match the super_block's tag will be visible. */ static inline void kernfs_enable_ns(struct kernfs_node *kn) { WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR); WARN_ON_ONCE(!RB_EMPTY_ROOT(&kn->dir.children)); kn->flags |= KERNFS_NS; } /** * kernfs_ns_enabled - test whether namespace is enabled * @kn: the node to test * * Test whether namespace filtering is enabled for the children of @ns. 
*/ static inline bool kernfs_ns_enabled(struct kernfs_node *kn) { return kn->flags & KERNFS_NS; } int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen); int kernfs_path_from_node(struct kernfs_node *kn_to, struct kernfs_node *kn_from, char *buf, size_t buflen); void pr_cont_kernfs_name(struct kernfs_node *kn); void pr_cont_kernfs_path(struct kernfs_node *kn); struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn); struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const struct ns_common *ns); struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, const struct ns_common *ns); void kernfs_get(struct kernfs_node *kn); void kernfs_put(struct kernfs_node *kn); struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry); struct kernfs_root *kernfs_root_from_sb(struct super_block *sb); struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn); struct dentry *kernfs_node_dentry(struct kernfs_node *kn, struct super_block *sb); struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv); void kernfs_destroy_root(struct kernfs_root *root); unsigned int kernfs_root_flags(struct kernfs_node *kn); struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, void *priv, const struct ns_common *ns); struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, const char *name); struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, loff_t size, const struct kernfs_ops *ops, void *priv, const struct ns_common *ns, struct lock_class_key *key); struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, const char *name, struct kernfs_node *target); void kernfs_activate(struct kernfs_node *kn); void kernfs_show(struct kernfs_node *kn, bool show); void kernfs_remove(struct kernfs_node *kn); void kernfs_break_active_protection(struct kernfs_node *kn); void kernfs_unbreak_active_protection(struct kernfs_node *kn); bool kernfs_remove_self(struct kernfs_node *kn); int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const struct ns_common *ns); int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const struct ns_common *new_ns); int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); __poll_t kernfs_generic_poll(struct kernfs_open_file *of, struct poll_table_struct *pt); void kernfs_notify(struct kernfs_node *kn); int kernfs_xattr_get(struct kernfs_node *kn, const char *name, void *value, size_t size); int kernfs_xattr_set(struct kernfs_node *kn, const char *name, const void *value, size_t size, int flags); const struct ns_common *kernfs_super_ns(struct super_block *sb); int kernfs_get_tree(struct fs_context *fc); void kernfs_free_fs_context(struct fs_context *fc); void kernfs_kill_sb(struct super_block *sb); void kernfs_init(void); struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, u64 id); #else /* CONFIG_KERNFS */ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) { return 0; } /* whatever */ static inline void kernfs_enable_ns(struct kernfs_node *kn) { } static inline bool kernfs_ns_enabled(struct kernfs_node *kn) { return false; } static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) { return -ENOSYS; } static inline int 
kernfs_path_from_node(struct kernfs_node *root_kn, struct kernfs_node *kn, char *buf, size_t buflen) { return -ENOSYS; } static inline void pr_cont_kernfs_name(struct kernfs_node *kn) { } static inline void pr_cont_kernfs_path(struct kernfs_node *kn) { } static inline struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) { return NULL; } static inline struct kernfs_node * kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const struct ns_common *ns) { return NULL; } static inline struct kernfs_node * kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, const struct ns_common *ns) { return NULL; } static inline void kernfs_get(struct kernfs_node *kn) { } static inline void kernfs_put(struct kernfs_node *kn) { } static inline struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) { return NULL; } static inline struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) { return NULL; } static inline struct inode * kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) { return NULL; } static inline struct kernfs_root * kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv) { return ERR_PTR(-ENOSYS); } static inline void kernfs_destroy_root(struct kernfs_root *root) { } static inline unsigned int kernfs_root_flags(struct kernfs_node *kn) { return 0; } static inline struct kernfs_node * kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, void *priv, const struct ns_common *ns) { return ERR_PTR(-ENOSYS); } static inline struct kernfs_node * __kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, loff_t size, const struct kernfs_ops *ops, void *priv, const struct ns_common *ns, struct lock_class_key *key) { return ERR_PTR(-ENOSYS); } static inline struct kernfs_node * kernfs_create_link(struct kernfs_node *parent, const char *name, struct kernfs_node *target) { return ERR_PTR(-ENOSYS); } static inline void kernfs_activate(struct kernfs_node *kn) { } static inline void kernfs_remove(struct kernfs_node *kn) { } static inline bool kernfs_remove_self(struct kernfs_node *kn) { return false; } static inline int kernfs_remove_by_name_ns(struct kernfs_node *kn, const char *name, const struct ns_common *ns) { return -ENOSYS; } static inline int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const struct ns_common *new_ns) { return -ENOSYS; } static inline int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { return -ENOSYS; } static inline __poll_t kernfs_generic_poll(struct kernfs_open_file *of, struct poll_table_struct *pt) { return -ENOSYS; } static inline void kernfs_notify(struct kernfs_node *kn) { } static inline int kernfs_xattr_get(struct kernfs_node *kn, const char *name, void *value, size_t size) { return -ENOSYS; } static inline int kernfs_xattr_set(struct kernfs_node *kn, const char *name, const void *value, size_t size, int flags) { return -ENOSYS; } static inline const struct ns_common *kernfs_super_ns(struct super_block *sb) { return NULL; } static inline int kernfs_get_tree(struct fs_context *fc) { return -ENOSYS; } static inline void kernfs_free_fs_context(struct fs_context *fc) { } static inline void kernfs_kill_sb(struct super_block *sb) { } static inline void kernfs_init(void) { } #endif /* CONFIG_KERNFS */ /** * kernfs_path - build full path of a given node * @kn: kernfs_node of interest * @buf: buffer to copy @kn's name 
into * @buflen: size of @buf * * If @kn is NULL result will be "(null)". * * Returns the length of the full path. If the full length is equal to or * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. */ static inline int kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen) { return kernfs_path_from_node(kn, NULL, buf, buflen); } static inline struct kernfs_node * kernfs_find_and_get(struct kernfs_node *kn, const char *name) { return kernfs_find_and_get_ns(kn, name, NULL); } static inline struct kernfs_node * kernfs_walk_and_get(struct kernfs_node *kn, const char *path) { return kernfs_walk_and_get_ns(kn, path, NULL); } static inline struct kernfs_node * kernfs_create_dir(struct kernfs_node *parent, const char *name, umode_t mode, void *priv) { return kernfs_create_dir_ns(parent, name, mode, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, priv, NULL); } static inline int kernfs_remove_by_name(struct kernfs_node *parent, const char *name) { return kernfs_remove_by_name_ns(parent, name, NULL); } static inline int kernfs_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name) { return kernfs_rename_ns(kn, new_parent, new_name, NULL); } #endif /* __LINUX_KERNFS_H */ |
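/*
 * Stand-alone illustration (not kernfs code) of the "hashed lock array"
 * idea described in the open_file_mutex comment above: pick one of N
 * mutexes by hashing the object's address so that unrelated objects
 * rarely contend on the same lock.  NR_LOCKS, the hash constant and
 * struct node are assumptions for the example; kernfs itself sizes the
 * table from NR_CPUS and uses the kernel's pointer-hash helpers.
 */
#include <pthread.h>
#include <stdint.h>

#define NR_LOCKS        16      /* power of two, like NR_KERNFS_LOCKS */

static pthread_mutex_t lock_table[NR_LOCKS] = {
        [0 ... NR_LOCKS - 1] = PTHREAD_MUTEX_INITIALIZER, /* GCC/Clang range init */
};

static pthread_mutex_t *lock_for(const void *obj)
{
        uintptr_t h = (uintptr_t)obj;

        h = (h >> 4) * 0x9E3779B97F4A7C15ull;   /* drop alignment bits, mix */
        return &lock_table[h % NR_LOCKS];
}

struct node { int refcnt; };

static void node_get(struct node *n)
{
        pthread_mutex_t *lock = lock_for(n);    /* same node => same mutex */

        pthread_mutex_lock(lock);
        n->refcnt++;
        pthread_mutex_unlock(lock);
}

int main(void)
{
        struct node a = { 0 }, b = { 0 };

        node_get(&a);   /* a and b will usually hash to different mutexes */
        node_get(&b);
        return 0;
}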
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * Read-Copy Update mechanism for mutual exclusion, adapted for tracing.
 *
 * Copyright (C) 2020 Paul E. McKenney.
 */

#ifndef __LINUX_RCUPDATE_TRACE_H
#define __LINUX_RCUPDATE_TRACE_H

#include <linux/sched.h>
#include <linux/rcupdate.h>
#include <linux/cleanup.h>

#ifdef CONFIG_TASKS_TRACE_RCU
extern struct srcu_struct rcu_tasks_trace_srcu_struct;
#endif // #ifdef CONFIG_TASKS_TRACE_RCU

#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_TASKS_TRACE_RCU)

static inline int rcu_read_lock_trace_held(void)
{
        return srcu_read_lock_held(&rcu_tasks_trace_srcu_struct);
}

#else // #if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_TASKS_TRACE_RCU)

static inline int rcu_read_lock_trace_held(void)
{
        return 1;
}

#endif // #else // #if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_TASKS_TRACE_RCU)

#ifdef CONFIG_TASKS_TRACE_RCU

/**
 * rcu_read_lock_tasks_trace - mark beginning of RCU-trace read-side critical section
 *
 * When synchronize_rcu_tasks_trace() is invoked by one task, then that
 * task is guaranteed to block until all other tasks exit their read-side
 * critical sections.  Similarly, if call_rcu_trace() is invoked on one
 * task while other tasks are within RCU read-side critical sections,
 * invocation of the corresponding RCU callback is deferred until after
 * the all the other tasks exit their critical sections.
 *
 * For more details, please see the documentation for
 * srcu_read_lock_fast().  For a description of how implicit RCU
 * readers provide the needed ordering for architectures defining the
 * ARCH_WANTS_NO_INSTR Kconfig option (and thus promising never to trace
 * code where RCU is not watching), please see the __srcu_read_lock_fast()
 * (non-kerneldoc) header comment.  Otherwise, the smp_mb() below provided
 * the needed ordering.
 */
static inline struct srcu_ctr __percpu *rcu_read_lock_tasks_trace(void)
{
        struct srcu_ctr __percpu *ret = __srcu_read_lock_fast(&rcu_tasks_trace_srcu_struct);

        rcu_try_lock_acquire(&rcu_tasks_trace_srcu_struct.dep_map);
        if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU_NO_MB))
                smp_mb(); // Provide ordering on noinstr-incomplete architectures.
        return ret;
}

/**
 * rcu_read_unlock_tasks_trace - mark end of RCU-trace read-side critical section
 * @scp: return value from corresponding rcu_read_lock_tasks_trace().
 *
 * Pairs with the preceding call to rcu_read_lock_tasks_trace() that
 * returned the value passed in via scp.
 *
 * For more details, please see the documentation for rcu_read_unlock().
 * For memory-ordering information, please see the header comment for the
 * rcu_read_lock_tasks_trace() function.
*/ static inline void rcu_read_unlock_tasks_trace(struct srcu_ctr __percpu *scp) { if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU_NO_MB)) smp_mb(); // Provide ordering on noinstr-incomplete architectures. __srcu_read_unlock_fast(&rcu_tasks_trace_srcu_struct, scp); srcu_lock_release(&rcu_tasks_trace_srcu_struct.dep_map); } /** * rcu_read_lock_trace - mark beginning of RCU-trace read-side critical section * * When synchronize_rcu_tasks_trace() is invoked by one task, then that * task is guaranteed to block until all other tasks exit their read-side * critical sections. Similarly, if call_rcu_trace() is invoked on one * task while other tasks are within RCU read-side critical sections, * invocation of the corresponding RCU callback is deferred until after * the all the other tasks exit their critical sections. * * For more details, please see the documentation for rcu_read_lock(). */ static inline void rcu_read_lock_trace(void) { struct task_struct *t = current; rcu_try_lock_acquire(&rcu_tasks_trace_srcu_struct.dep_map); if (t->trc_reader_nesting++) { // In case we interrupted a Tasks Trace RCU reader. return; } barrier(); // nesting before scp to protect against interrupt handler. t->trc_reader_scp = __srcu_read_lock_fast(&rcu_tasks_trace_srcu_struct); if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU_NO_MB)) smp_mb(); // Placeholder for more selective ordering } /** * rcu_read_unlock_trace - mark end of RCU-trace read-side critical section * * Pairs with a preceding call to rcu_read_lock_trace(), and nesting is * allowed. Invoking a rcu_read_unlock_trace() when there is no matching * rcu_read_lock_trace() is verboten, and will result in lockdep complaints. * * For more details, please see the documentation for rcu_read_unlock(). */ static inline void rcu_read_unlock_trace(void) { struct srcu_ctr __percpu *scp; struct task_struct *t = current; scp = t->trc_reader_scp; barrier(); // scp before nesting to protect against interrupt handler. if (!--t->trc_reader_nesting) { if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU_NO_MB)) smp_mb(); // Placeholder for more selective ordering __srcu_read_unlock_fast(&rcu_tasks_trace_srcu_struct, scp); } srcu_lock_release(&rcu_tasks_trace_srcu_struct.dep_map); } /** * call_rcu_tasks_trace() - Queue a callback trace task-based grace period * @rhp: structure to be used for queueing the RCU updates. * @func: actual callback function to be invoked after the grace period * * The callback function will be invoked some time after a trace rcu-tasks * grace period elapses, in other words after all currently executing * trace rcu-tasks read-side critical sections have completed. These * read-side critical sections are delimited by calls to rcu_read_lock_trace() * and rcu_read_unlock_trace(). * * See the description of call_rcu() for more detailed information on * memory ordering guarantees. */ static inline void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func) { call_srcu(&rcu_tasks_trace_srcu_struct, rhp, func); } /** * synchronize_rcu_tasks_trace - wait for a trace rcu-tasks grace period * * Control will return to the caller some time after a trace rcu-tasks * grace period has elapsed, in other words after all currently executing * trace rcu-tasks read-side critical sections have elapsed. These read-side * critical sections are delimited by calls to rcu_read_lock_trace() * and rcu_read_unlock_trace(). * * This is a very specialized primitive, intended only for a few uses in * tracing and other situations requiring manipulation of function preambles * and profiling hooks. 
The synchronize_rcu_tasks_trace() function is not * (yet) intended for heavy use from multiple CPUs. * * See the description of synchronize_rcu() for more detailed information * on memory ordering guarantees. */ static inline void synchronize_rcu_tasks_trace(void) { synchronize_srcu(&rcu_tasks_trace_srcu_struct); } /** * rcu_barrier_tasks_trace - Wait for in-flight call_rcu_tasks_trace() callbacks. * * Note that rcu_barrier_tasks_trace() is not obligated to actually wait, * for example, if there are no pending callbacks. */ static inline void rcu_barrier_tasks_trace(void) { srcu_barrier(&rcu_tasks_trace_srcu_struct); } /** * rcu_tasks_trace_expedite_current - Expedite the current Tasks Trace RCU grace period * * Cause the current Tasks Trace RCU grace period to become expedited. * The grace period following the current one might also be expedited. * If there is no current grace period, one might be created. If the * current grace period is currently sleeping, that sleep will complete * before expediting will take effect. */ static inline void rcu_tasks_trace_expedite_current(void) { srcu_expedite_current(&rcu_tasks_trace_srcu_struct); } // Placeholders to enable stepwise transition. void __init rcu_tasks_trace_suppress_unused(void); #else /* * The BPF JIT forms these addresses even when it doesn't call these * functions, so provide definitions that result in runtime errors. */ static inline void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func) { BUG(); } static inline void rcu_read_lock_trace(void) { BUG(); } static inline void rcu_read_unlock_trace(void) { BUG(); } #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ DEFINE_LOCK_GUARD_0(rcu_tasks_trace, rcu_read_lock_trace(), rcu_read_unlock_trace()) #endif /* __LINUX_RCUPDATE_TRACE_H */ |
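/*
 * Kernel-side usage sketch for the API documented above (hypothetical
 * 'cfg' structure and function names; a real translation unit would
 * also need <linux/slab.h> and <linux/rcupdate.h>): a Tasks Trace RCU
 * reader paired with an updater that waits for a grace period before
 * freeing.  The scoped form 'guard(rcu_tasks_trace)();' is also
 * available via the DEFINE_LOCK_GUARD_0() above.
 */
struct cfg {
        int threshold;
};

static struct cfg __rcu *active_cfg;

static int read_threshold(void)
{
        struct cfg *c;
        int ret = -1;

        rcu_read_lock_trace();
        c = rcu_dereference_check(active_cfg, rcu_read_lock_trace_held());
        if (c)
                ret = c->threshold;
        rcu_read_unlock_trace();
        return ret;
}

static void replace_cfg(struct cfg *new)
{
        struct cfg *old;

        old = rcu_replace_pointer(active_cfg, new, true);
        synchronize_rcu_tasks_trace();  /* wait for all trace readers */
        kfree(old);
}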
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM csd

#if !defined(_TRACE_CSD_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_CSD_H

#include <linux/tracepoint.h>

TRACE_EVENT(csd_queue_cpu,

        TP_PROTO(const unsigned int cpu,
                 unsigned long callsite,
                 smp_call_func_t func,
                 call_single_data_t *csd),

        TP_ARGS(cpu, callsite, func, csd),

        TP_STRUCT__entry(
                __field(unsigned int, cpu)
                __field(void *, callsite)
                __field(void *, func)
                __field(void *, csd)
        ),

        TP_fast_assign(
                __entry->cpu = cpu;
                __entry->callsite = (void *)callsite;
                __entry->func = func;
                __entry->csd = csd;
        ),

        TP_printk("cpu=%u callsite=%pS func=%ps csd=%p",
                  __entry->cpu, __entry->callsite, __entry->func, __entry->csd)
);

/*
 * Tracepoints for a function which is called as an effect of smp_call_function.*
 */
DECLARE_EVENT_CLASS(csd_function,

        TP_PROTO(smp_call_func_t func, call_single_data_t *csd),

        TP_ARGS(func, csd),

        TP_STRUCT__entry(
                __field(void *, func)
                __field(void *, csd)
        ),

        TP_fast_assign(
                __entry->func = func;
                __entry->csd = csd;
        ),

        TP_printk("func=%ps, csd=%p", __entry->func, __entry->csd)
);

DEFINE_EVENT(csd_function, csd_function_entry,
        TP_PROTO(smp_call_func_t func, call_single_data_t *csd),
        TP_ARGS(func, csd)
);

DEFINE_EVENT(csd_function, csd_function_exit,
        TP_PROTO(smp_call_func_t func, call_single_data_t *csd),
        TP_ARGS(func, csd)
);

#endif /* _TRACE_CSD_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
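/*
 * Caller-side sketch (simplified; not the actual kernel/smp.c code):
 * TRACE_EVENT()/DEFINE_EVENT() above generate trace_csd_queue_cpu(),
 * trace_csd_function_entry() and trace_csd_function_exit(), which the
 * smp_call_function machinery emits when queueing a csd and around
 * running its callback.  The events appear under
 * /sys/kernel/tracing/events/csd/ when tracing is enabled.
 */
static void run_csd(call_single_data_t *csd)
{
        smp_call_func_t func = csd->func;
        void *info = csd->info;

        trace_csd_function_entry(func, csd);
        func(info);
        trace_csd_function_exit(func, csd);
}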
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/export.h>
#include <linux/bvec.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/uio.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include <linux/scatterlist.h>
#include <linux/instrumented.h>
#include <linux/iov_iter.h>

static __always_inline
size_t copy_to_user_iter(void __user *iter_to, size_t progress,
                         size_t len, void *from, void *priv2)
{
        if (should_fail_usercopy())
                return len;
        if (access_ok(iter_to, len)) {
                from += progress;
                instrument_copy_to_user(iter_to, from, len);
                len = raw_copy_to_user(iter_to, from, len);
        }
        return len;
}

static __always_inline
size_t copy_to_user_iter_nofault(void __user *iter_to, size_t progress,
                                 size_t len, void *from, void *priv2)
{
        ssize_t res;

        if (should_fail_usercopy())
                return len;

        from += progress;
        res = copy_to_user_nofault(iter_to, from, len);
        return res < 0 ?
len : res; } static __always_inline size_t copy_from_user_iter(void __user *iter_from, size_t progress, size_t len, void *to, void *priv2) { size_t res = len; if (should_fail_usercopy()) return len; if (can_do_masked_user_access()) { iter_from = mask_user_address(iter_from); } else { if (!access_ok(iter_from, len)) return res; /* * Ensure that bad access_ok() speculation will not * lead to nasty side effects *after* the copy is * finished: */ barrier_nospec(); } to += progress; instrument_copy_from_user_before(to, iter_from, len); res = raw_copy_from_user(to, iter_from, len); instrument_copy_from_user_after(to, iter_from, len, res); return res; } static __always_inline size_t memcpy_to_iter(void *iter_to, size_t progress, size_t len, void *from, void *priv2) { memcpy(iter_to, from + progress, len); return 0; } static __always_inline size_t memcpy_from_iter(void *iter_from, size_t progress, size_t len, void *to, void *priv2) { memcpy(to + progress, iter_from, len); return 0; } /* * fault_in_iov_iter_readable - fault in iov iterator for reading * @i: iterator * @size: maximum length * * Fault in one or more iovecs of the given iov_iter, to a maximum length of * @size. For each iovec, fault in each page that constitutes the iovec. * * Returns the number of bytes not faulted in (like copy_to_user() and * copy_from_user()). * * Always returns 0 for non-userspace iterators. */ size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size) { if (iter_is_ubuf(i)) { size_t n = min(size, iov_iter_count(i)); n -= fault_in_readable(i->ubuf + i->iov_offset, n); return size - n; } else if (iter_is_iovec(i)) { size_t count = min(size, iov_iter_count(i)); const struct iovec *p; size_t skip; size -= count; for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) { size_t len = min(count, p->iov_len - skip); size_t ret; if (unlikely(!len)) continue; ret = fault_in_readable(p->iov_base + skip, len); count -= len - ret; if (ret) break; } return count + size; } return 0; } EXPORT_SYMBOL(fault_in_iov_iter_readable); /* * fault_in_iov_iter_writeable - fault in iov iterator for writing * @i: iterator * @size: maximum length * * Faults in the iterator using get_user_pages(), i.e., without triggering * hardware page faults. This is primarily useful when we already know that * some or all of the pages in @i aren't in memory. * * Returns the number of bytes not faulted in, like copy_to_user() and * copy_from_user(). * * Always returns 0 for non-user-space iterators. 
*/ size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size) { if (iter_is_ubuf(i)) { size_t n = min(size, iov_iter_count(i)); n -= fault_in_safe_writeable(i->ubuf + i->iov_offset, n); return size - n; } else if (iter_is_iovec(i)) { size_t count = min(size, iov_iter_count(i)); const struct iovec *p; size_t skip; size -= count; for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) { size_t len = min(count, p->iov_len - skip); size_t ret; if (unlikely(!len)) continue; ret = fault_in_safe_writeable(p->iov_base + skip, len); count -= len - ret; if (ret) break; } return count + size; } return 0; } EXPORT_SYMBOL(fault_in_iov_iter_writeable); void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter) { .iter_type = ITER_IOVEC, .nofault = false, .data_source = direction, .__iov = iov, .nr_segs = nr_segs, .iov_offset = 0, .count = count }; } EXPORT_SYMBOL(iov_iter_init); size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(i->data_source)) return 0; if (user_backed_iter(i)) might_fault(); return iterate_and_advance(i, bytes, (void *)addr, copy_to_user_iter, memcpy_to_iter); } EXPORT_SYMBOL(_copy_to_iter); #ifdef CONFIG_ARCH_HAS_COPY_MC static __always_inline size_t copy_to_user_iter_mc(void __user *iter_to, size_t progress, size_t len, void *from, void *priv2) { if (access_ok(iter_to, len)) { from += progress; instrument_copy_to_user(iter_to, from, len); len = copy_mc_to_user(iter_to, from, len); } return len; } static __always_inline size_t memcpy_to_iter_mc(void *iter_to, size_t progress, size_t len, void *from, void *priv2) { return copy_mc_to_kernel(iter_to, from + progress, len); } /** * _copy_mc_to_iter - copy to iter with source memory error exception handling * @addr: source kernel address * @bytes: total transfer length * @i: destination iterator * * The pmem driver deploys this for the dax operation * (dax_copy_to_iter()) for dax reads (bypass page-cache and the * block-layer). Upon #MC read(2) aborts and returns EIO or the bytes * successfully copied. * * The main differences between this and typical _copy_to_iter(). * * * Typical tail/residue handling after a fault retries the copy * byte-by-byte until the fault happens again. Re-triggering machine * checks is potentially fatal so the implementation uses source * alignment and poison alignment assumptions to avoid re-triggering * hardware exceptions. * * * ITER_KVEC and ITER_BVEC can return short copies. Compare to * copy_to_iter() where only ITER_IOVEC attempts might return a short copy. 
* * Return: number of bytes copied (may be %0) */ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(i->data_source)) return 0; if (user_backed_iter(i)) might_fault(); return iterate_and_advance(i, bytes, (void *)addr, copy_to_user_iter_mc, memcpy_to_iter_mc); } EXPORT_SYMBOL_GPL(_copy_mc_to_iter); #endif /* CONFIG_ARCH_HAS_COPY_MC */ static __always_inline size_t __copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { return iterate_and_advance(i, bytes, addr, copy_from_user_iter, memcpy_from_iter); } size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(!i->data_source)) return 0; if (user_backed_iter(i)) might_fault(); return __copy_from_iter(addr, bytes, i); } EXPORT_SYMBOL(_copy_from_iter); static __always_inline size_t copy_from_user_iter_nocache(void __user *iter_from, size_t progress, size_t len, void *to, void *priv2) { return copy_from_user_inatomic_nontemporal(to + progress, iter_from, len); } size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(!i->data_source)) return 0; return iterate_and_advance(i, bytes, addr, copy_from_user_iter_nocache, memcpy_from_iter); } EXPORT_SYMBOL(_copy_from_iter_nocache); #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE static __always_inline size_t copy_from_user_iter_flushcache(void __user *iter_from, size_t progress, size_t len, void *to, void *priv2) { return copy_from_user_flushcache(to + progress, iter_from, len); } static __always_inline size_t memcpy_from_iter_flushcache(void *iter_from, size_t progress, size_t len, void *to, void *priv2) { memcpy_flushcache(to + progress, iter_from, len); return 0; } /** * _copy_from_iter_flushcache - write destination through cpu cache * @addr: destination kernel address * @bytes: total transfer length * @i: source iterator * * The pmem driver arranges for filesystem-dax to use this facility via * dax_copy_from_iter() for ensuring that writes to persistent memory * are flushed through the CPU cache. It is differentiated from * _copy_from_iter_nocache() in that guarantees all data is flushed for * all iterator types. The _copy_from_iter_nocache() only attempts to * bypass the cache for the ITER_IOVEC case, and on some archs may use * instructions that strand dirty-data in the cache. * * Return: number of bytes copied (may be %0) */ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(!i->data_source)) return 0; return iterate_and_advance(i, bytes, addr, copy_from_user_iter_flushcache, memcpy_from_iter_flushcache); } EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache); #endif static inline bool page_copy_sane(struct page *page, size_t offset, size_t n) { struct page *head; size_t v = n + offset; /* * The general case needs to access the page order in order * to compute the page size. * However, we mostly deal with order-0 pages and thus can * avoid a possible cache line miss for requests that fit all * page orders. 
*/ if (n <= v && v <= PAGE_SIZE) return true; head = compound_head(page); v += (page - head) << PAGE_SHIFT; if (WARN_ON(n > v || v > page_size(head))) return false; return true; } size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { size_t res = 0; if (!page_copy_sane(page, offset, bytes)) return 0; if (WARN_ON_ONCE(i->data_source)) return 0; page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { void *kaddr = kmap_local_page(page); size_t n = min(bytes, (size_t)PAGE_SIZE - offset); n = _copy_to_iter(kaddr + offset, n, i); kunmap_local(kaddr); res += n; bytes -= n; if (!bytes || !n) break; offset += n; if (offset == PAGE_SIZE) { page++; offset = 0; } } return res; } EXPORT_SYMBOL(copy_page_to_iter); size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes, struct iov_iter *i) { size_t res = 0; if (!page_copy_sane(page, offset, bytes)) return 0; if (WARN_ON_ONCE(i->data_source)) return 0; page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { void *kaddr = kmap_local_page(page); size_t n = min(bytes, (size_t)PAGE_SIZE - offset); n = iterate_and_advance(i, n, kaddr + offset, copy_to_user_iter_nofault, memcpy_to_iter); kunmap_local(kaddr); res += n; bytes -= n; if (!bytes || !n) break; offset += n; if (offset == PAGE_SIZE) { page++; offset = 0; } } return res; } EXPORT_SYMBOL(copy_page_to_iter_nofault); size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { size_t res = 0; if (!page_copy_sane(page, offset, bytes)) return 0; page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { void *kaddr = kmap_local_page(page); size_t n = min(bytes, (size_t)PAGE_SIZE - offset); n = _copy_from_iter(kaddr + offset, n, i); kunmap_local(kaddr); res += n; bytes -= n; if (!bytes || !n) break; offset += n; if (offset == PAGE_SIZE) { page++; offset = 0; } } return res; } EXPORT_SYMBOL(copy_page_from_iter); static __always_inline size_t zero_to_user_iter(void __user *iter_to, size_t progress, size_t len, void *priv, void *priv2) { return clear_user(iter_to, len); } static __always_inline size_t zero_to_iter(void *iter_to, size_t progress, size_t len, void *priv, void *priv2) { memset(iter_to, 0, len); return 0; } size_t iov_iter_zero(size_t bytes, struct iov_iter *i) { return iterate_and_advance(i, bytes, NULL, zero_to_user_iter, zero_to_iter); } EXPORT_SYMBOL(iov_iter_zero); size_t copy_folio_from_iter_atomic(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i) { size_t n, copied = 0; if (!page_copy_sane(&folio->page, offset, bytes)) return 0; if (WARN_ON_ONCE(!i->data_source)) return 0; do { char *to = kmap_local_folio(folio, offset); n = bytes - copied; if (folio_test_partial_kmap(folio) && n > PAGE_SIZE - offset_in_page(offset)) n = PAGE_SIZE - offset_in_page(offset); pagefault_disable(); n = __copy_from_iter(to, n, i); pagefault_enable(); kunmap_local(to); copied += n; offset += n; } while (copied != bytes && n > 0); return copied; } EXPORT_SYMBOL(copy_folio_from_iter_atomic); static void iov_iter_bvec_advance(struct iov_iter *i, size_t size) { const struct bio_vec *bvec, *end; if (!i->count) return; i->count -= size; size += i->iov_offset; for (bvec = i->bvec, end = bvec + i->nr_segs; bvec < end; bvec++) { if (likely(size < bvec->bv_len)) break; size -= bvec->bv_len; } i->iov_offset = size; i->nr_segs -= bvec - i->bvec; i->bvec = bvec; } static void iov_iter_iovec_advance(struct iov_iter *i, size_t size) { const 
struct iovec *iov, *end; if (!i->count) return; i->count -= size; size += i->iov_offset; // from beginning of current segment for (iov = iter_iov(i), end = iov + i->nr_segs; iov < end; iov++) { if (likely(size < iov->iov_len)) break; size -= iov->iov_len; } i->iov_offset = size; i->nr_segs -= iov - iter_iov(i); i->__iov = iov; } static void iov_iter_folioq_advance(struct iov_iter *i, size_t size) { const struct folio_queue *folioq = i->folioq; unsigned int slot = i->folioq_slot; if (!i->count) return; i->count -= size; if (slot >= folioq_nr_slots(folioq)) { folioq = folioq->next; slot = 0; } size += i->iov_offset; /* From beginning of current segment. */ do { size_t fsize = folioq_folio_size(folioq, slot); if (likely(size < fsize)) break; size -= fsize; slot++; if (slot >= folioq_nr_slots(folioq) && folioq->next) { folioq = folioq->next; slot = 0; } } while (size); i->iov_offset = size; i->folioq_slot = slot; i->folioq = folioq; } void iov_iter_advance(struct iov_iter *i, size_t size) { if (unlikely(i->count < size)) size = i->count; if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) { i->iov_offset += size; i->count -= size; } else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) { /* iovec and kvec have identical layouts */ iov_iter_iovec_advance(i, size); } else if (iov_iter_is_bvec(i)) { iov_iter_bvec_advance(i, size); } else if (iov_iter_is_folioq(i)) { iov_iter_folioq_advance(i, size); } else if (iov_iter_is_discard(i)) { i->count -= size; } } EXPORT_SYMBOL(iov_iter_advance); static void iov_iter_folioq_revert(struct iov_iter *i, size_t unroll) { const struct folio_queue *folioq = i->folioq; unsigned int slot = i->folioq_slot; for (;;) { size_t fsize; if (slot == 0) { folioq = folioq->prev; slot = folioq_nr_slots(folioq); } slot--; fsize = folioq_folio_size(folioq, slot); if (unroll <= fsize) { i->iov_offset = fsize - unroll; break; } unroll -= fsize; } i->folioq_slot = slot; i->folioq = folioq; } void iov_iter_revert(struct iov_iter *i, size_t unroll) { if (!unroll) return; if (WARN_ON(unroll > MAX_RW_COUNT)) return; i->count += unroll; if (unlikely(iov_iter_is_discard(i))) return; if (unroll <= i->iov_offset) { i->iov_offset -= unroll; return; } unroll -= i->iov_offset; if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) { BUG(); /* We should never go beyond the start of the specified * range since we might then be straying into pages that * aren't pinned. */ } else if (iov_iter_is_bvec(i)) { const struct bio_vec *bvec = i->bvec; while (1) { size_t n = (--bvec)->bv_len; i->nr_segs++; if (unroll <= n) { i->bvec = bvec; i->iov_offset = n - unroll; return; } unroll -= n; } } else if (iov_iter_is_folioq(i)) { i->iov_offset = 0; iov_iter_folioq_revert(i, unroll); } else { /* same logics for iovec and kvec */ const struct iovec *iov = iter_iov(i); while (1) { size_t n = (--iov)->iov_len; i->nr_segs++; if (unroll <= n) { i->__iov = iov; i->iov_offset = n - unroll; return; } unroll -= n; } } } EXPORT_SYMBOL(iov_iter_revert); /* * Return the count of just the current iov_iter segment. */ size_t iov_iter_single_seg_count(const struct iov_iter *i) { if (i->nr_segs > 1) { if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return min(i->count, iter_iov(i)->iov_len - i->iov_offset); if (iov_iter_is_bvec(i)) return min(i->count, i->bvec->bv_len - i->iov_offset); } if (unlikely(iov_iter_is_folioq(i))) return !i->count ? 
0 : umin(folioq_folio_size(i->folioq, i->folioq_slot), i->count); return i->count; } EXPORT_SYMBOL(iov_iter_single_seg_count); void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec, unsigned long nr_segs, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter){ .iter_type = ITER_KVEC, .data_source = direction, .kvec = kvec, .nr_segs = nr_segs, .iov_offset = 0, .count = count }; } EXPORT_SYMBOL(iov_iter_kvec); void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter){ .iter_type = ITER_BVEC, .data_source = direction, .bvec = bvec, .nr_segs = nr_segs, .iov_offset = 0, .count = count }; } EXPORT_SYMBOL(iov_iter_bvec); /** * iov_iter_folio_queue - Initialise an I/O iterator to use the folios in a folio queue * @i: The iterator to initialise. * @direction: The direction of the transfer. * @folioq: The starting point in the folio queue. * @first_slot: The first slot in the folio queue to use * @offset: The offset into the folio in the first slot to start at * @count: The size of the I/O buffer in bytes. * * Set up an I/O iterator to either draw data out of the pages attached to an * inode or to inject data into those pages. The pages *must* be prevented * from evaporation, either by taking a ref on them or locking them by the * caller. */ void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction, const struct folio_queue *folioq, unsigned int first_slot, unsigned int offset, size_t count) { BUG_ON(direction & ~1); *i = (struct iov_iter) { .iter_type = ITER_FOLIOQ, .data_source = direction, .folioq = folioq, .folioq_slot = first_slot, .count = count, .iov_offset = offset, }; } EXPORT_SYMBOL(iov_iter_folio_queue); /** * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray * @i: The iterator to initialise. * @direction: The direction of the transfer. * @xarray: The xarray to access. * @start: The start file position. * @count: The size of the I/O buffer in bytes. * * Set up an I/O iterator to either draw data out of the pages attached to an * inode or to inject data into those pages. The pages *must* be prevented * from evaporation, either by taking a ref on them or locking them by the * caller. */ void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray, loff_t start, size_t count) { BUG_ON(direction & ~1); *i = (struct iov_iter) { .iter_type = ITER_XARRAY, .data_source = direction, .xarray = xarray, .xarray_start = start, .count = count, .iov_offset = 0 }; } EXPORT_SYMBOL(iov_iter_xarray); /** * iov_iter_discard - Initialise an I/O iterator that discards data * @i: The iterator to initialise. * @direction: The direction of the transfer. * @count: The size of the I/O buffer in bytes. * * Set up an I/O iterator that just discards everything that's written to it. * It's only available as a READ iterator. 
*/ void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count) { BUG_ON(direction != READ); *i = (struct iov_iter){ .iter_type = ITER_DISCARD, .data_source = false, .count = count, .iov_offset = 0 }; } EXPORT_SYMBOL(iov_iter_discard); static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) { const struct iovec *iov = iter_iov(i); unsigned long res = 0; size_t size = i->count; size_t skip = i->iov_offset; do { size_t len = iov->iov_len - skip; if (len) { res |= (unsigned long)iov->iov_base + skip; if (len > size) len = size; res |= len; size -= len; } iov++; skip = 0; } while (size); return res; } static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i) { const struct bio_vec *bvec = i->bvec; unsigned res = 0; size_t size = i->count; unsigned skip = i->iov_offset; do { size_t len = bvec->bv_len - skip; res |= (unsigned long)bvec->bv_offset + skip; if (len > size) len = size; res |= len; bvec++; size -= len; skip = 0; } while (size); return res; } unsigned long iov_iter_alignment(const struct iov_iter *i) { if (likely(iter_is_ubuf(i))) { size_t size = i->count; if (size) return ((unsigned long)i->ubuf + i->iov_offset) | size; return 0; } /* iovec and kvec have identical layouts */ if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return iov_iter_alignment_iovec(i); if (iov_iter_is_bvec(i)) return iov_iter_alignment_bvec(i); /* With both xarray and folioq types, we're dealing with whole folios. */ if (iov_iter_is_folioq(i)) return i->iov_offset | i->count; if (iov_iter_is_xarray(i)) return (i->xarray_start + i->iov_offset) | i->count; return 0; } EXPORT_SYMBOL(iov_iter_alignment); unsigned long iov_iter_gap_alignment(const struct iov_iter *i) { unsigned long res = 0; unsigned long v = 0; size_t size = i->count; unsigned k; if (iter_is_ubuf(i)) return 0; if (WARN_ON(!iter_is_iovec(i))) return ~0U; for (k = 0; k < i->nr_segs; k++) { const struct iovec *iov = iter_iov(i) + k; if (iov->iov_len) { unsigned long base = (unsigned long)iov->iov_base; if (v) // if not the first one res |= base | v; // this start | previous end v = base + iov->iov_len; if (size <= iov->iov_len) break; size -= iov->iov_len; } } return res; } EXPORT_SYMBOL(iov_iter_gap_alignment); static int want_pages_array(struct page ***res, size_t size, size_t start, unsigned int maxpages) { unsigned int count = DIV_ROUND_UP(size + start, PAGE_SIZE); if (count > maxpages) count = maxpages; WARN_ON(!count); // caller should've prevented that if (!*res) { *res = kvmalloc_objs(struct page *, count); if (!*res) return 0; } return count; } static ssize_t iter_folioq_get_pages(struct iov_iter *iter, struct page ***ppages, size_t maxsize, unsigned maxpages, size_t *_start_offset) { const struct folio_queue *folioq = iter->folioq; struct page **pages; unsigned int slot = iter->folioq_slot; size_t extracted = 0, count = iter->count, iov_offset = iter->iov_offset; if (slot >= folioq_nr_slots(folioq)) { folioq = folioq->next; slot = 0; if (WARN_ON(iov_offset != 0)) return -EIO; } maxpages = want_pages_array(ppages, maxsize, iov_offset & ~PAGE_MASK, maxpages); if (!maxpages) return -ENOMEM; *_start_offset = iov_offset & ~PAGE_MASK; pages = *ppages; for (;;) { struct folio *folio = folioq_folio(folioq, slot); size_t offset = iov_offset, fsize = folioq_folio_size(folioq, slot); size_t part = PAGE_SIZE - offset % PAGE_SIZE; if (offset < fsize) { part = umin(part, umin(maxsize - extracted, fsize - offset)); count -= part; iov_offset += part; extracted += part; *pages = folio_page(folio, offset / 
PAGE_SIZE); get_page(*pages); pages++; maxpages--; } if (maxpages == 0 || extracted >= maxsize) break; if (iov_offset >= fsize) { iov_offset = 0; slot++; if (slot == folioq_nr_slots(folioq) && folioq->next) { folioq = folioq->next; slot = 0; } } } iter->count = count; iter->iov_offset = iov_offset; iter->folioq = folioq; iter->folioq_slot = slot; return extracted; } static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa, pgoff_t index, unsigned int nr_pages) { XA_STATE(xas, xa, index); struct folio *folio; unsigned int ret = 0; rcu_read_lock(); for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { if (xas_retry(&xas, folio)) continue; /* Has the folio moved or been split? */ if (unlikely(folio != xas_reload(&xas))) { xas_reset(&xas); continue; } pages[ret] = folio_file_page(folio, xas.xa_index); folio_get(folio); if (++ret == nr_pages) break; } rcu_read_unlock(); return ret; } static ssize_t iter_xarray_get_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned maxpages, size_t *_start_offset) { unsigned nr, offset, count; pgoff_t index; loff_t pos; pos = i->xarray_start + i->iov_offset; index = pos >> PAGE_SHIFT; offset = pos & ~PAGE_MASK; *_start_offset = offset; count = want_pages_array(pages, maxsize, offset, maxpages); if (!count) return -ENOMEM; nr = iter_xarray_populate_pages(*pages, i->xarray, index, count); if (nr == 0) return 0; maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize); i->iov_offset += maxsize; i->count -= maxsize; return maxsize; } /* must be done on non-empty ITER_UBUF or ITER_IOVEC one */ static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size) { size_t skip; long k; if (iter_is_ubuf(i)) return (unsigned long)i->ubuf + i->iov_offset; for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) { const struct iovec *iov = iter_iov(i) + k; size_t len = iov->iov_len - skip; if (unlikely(!len)) continue; if (*size > len) *size = len; return (unsigned long)iov->iov_base + skip; } BUG(); // if it had been empty, we wouldn't get called } /* must be done on non-empty ITER_BVEC one */ static struct page *first_bvec_segment(const struct iov_iter *i, size_t *size, size_t *start) { struct page *page; size_t skip = i->iov_offset, len; len = i->bvec->bv_len - skip; if (*size > len) *size = len; skip += i->bvec->bv_offset; page = i->bvec->bv_page + skip / PAGE_SIZE; *start = skip % PAGE_SIZE; return page; } static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, size_t *start) { unsigned int n, gup_flags = 0; if (maxsize > i->count) maxsize = i->count; if (!maxsize) return 0; if (maxsize > MAX_RW_COUNT) maxsize = MAX_RW_COUNT; if (likely(user_backed_iter(i))) { unsigned long addr; int res; if (iov_iter_rw(i) != WRITE) gup_flags |= FOLL_WRITE; if (i->nofault) gup_flags |= FOLL_NOFAULT; addr = first_iovec_segment(i, &maxsize); *start = addr % PAGE_SIZE; addr &= PAGE_MASK; n = want_pages_array(pages, maxsize, *start, maxpages); if (!n) return -ENOMEM; res = get_user_pages_fast(addr, n, gup_flags, *pages); if (unlikely(res <= 0)) return res; maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - *start); iov_iter_advance(i, maxsize); return maxsize; } if (iov_iter_is_bvec(i)) { struct page **p; struct page *page; page = first_bvec_segment(i, &maxsize, start); n = want_pages_array(pages, maxsize, *start, maxpages); if (!n) return -ENOMEM; p = *pages; for (int k = 0; k < n; k++) { struct folio *folio = page_folio(page + k); p[k] = page + 
k; if (!folio_test_slab(folio)) folio_get(folio); } maxsize = min_t(size_t, maxsize, n * PAGE_SIZE - *start); i->count -= maxsize; i->iov_offset += maxsize; if (i->iov_offset == i->bvec->bv_len) { i->iov_offset = 0; i->bvec++; i->nr_segs--; } return maxsize; } if (iov_iter_is_folioq(i)) return iter_folioq_get_pages(i, pages, maxsize, maxpages, start); if (iov_iter_is_xarray(i)) return iter_xarray_get_pages(i, pages, maxsize, maxpages, start); return -EFAULT; } ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start) { if (!maxpages) return 0; BUG_ON(!pages); return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, start); } EXPORT_SYMBOL(iov_iter_get_pages2); ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start) { ssize_t len; *pages = NULL; len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start); if (len <= 0) { kvfree(*pages); *pages = NULL; } return len; } EXPORT_SYMBOL(iov_iter_get_pages_alloc2); static int iov_npages(const struct iov_iter *i, int maxpages) { size_t skip = i->iov_offset, size = i->count; const struct iovec *p; int npages = 0; for (p = iter_iov(i); size; skip = 0, p++) { unsigned offs = offset_in_page(p->iov_base + skip); size_t len = min(p->iov_len - skip, size); if (len) { size -= len; npages += DIV_ROUND_UP(offs + len, PAGE_SIZE); if (unlikely(npages > maxpages)) return maxpages; } } return npages; } static int bvec_npages(const struct iov_iter *i, int maxpages) { size_t skip = i->iov_offset, size = i->count; const struct bio_vec *p; int npages = 0; for (p = i->bvec; size; skip = 0, p++) { unsigned offs = (p->bv_offset + skip) % PAGE_SIZE; size_t len = min(p->bv_len - skip, size); size -= len; npages += DIV_ROUND_UP(offs + len, PAGE_SIZE); if (unlikely(npages > maxpages)) return maxpages; } return npages; } int iov_iter_npages(const struct iov_iter *i, int maxpages) { if (unlikely(!i->count)) return 0; if (likely(iter_is_ubuf(i))) { unsigned offs = offset_in_page(i->ubuf + i->iov_offset); int npages = DIV_ROUND_UP(offs + i->count, PAGE_SIZE); return min(npages, maxpages); } /* iovec and kvec have identical layouts */ if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return iov_npages(i, maxpages); if (iov_iter_is_bvec(i)) return bvec_npages(i, maxpages); if (iov_iter_is_folioq(i)) { unsigned offset = i->iov_offset % PAGE_SIZE; int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE); return min(npages, maxpages); } if (iov_iter_is_xarray(i)) { unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE; int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE); return min(npages, maxpages); } return 0; } EXPORT_SYMBOL(iov_iter_npages); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) { *new = *old; if (iov_iter_is_bvec(new)) return new->bvec = kmemdup(new->bvec, new->nr_segs * sizeof(struct bio_vec), flags); else if (iov_iter_is_kvec(new) || iter_is_iovec(new)) /* iovec and kvec have identical layout */ return new->__iov = kmemdup(new->__iov, new->nr_segs * sizeof(struct iovec), flags); return NULL; } EXPORT_SYMBOL(dup_iter); static __noclone int copy_compat_iovec_from_user(struct iovec *iov, const struct iovec __user *uvec, u32 nr_segs) { const struct compat_iovec __user *uiov = (const struct compat_iovec __user *)uvec; int ret = -EFAULT; u32 i; if (!user_access_begin(uiov, nr_segs * sizeof(*uiov))) return -EFAULT; for (i = 0; i < nr_segs; i++) { compat_uptr_t buf; compat_ssize_t len; unsafe_get_user(len, 
&uiov[i].iov_len, uaccess_end); unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end); /* check for compat_size_t not fitting in compat_ssize_t .. */ if (len < 0) { ret = -EINVAL; goto uaccess_end; } iov[i].iov_base = compat_ptr(buf); iov[i].iov_len = len; } ret = 0; uaccess_end: user_access_end(); return ret; } static __noclone int copy_iovec_from_user(struct iovec *iov, const struct iovec __user *uiov, unsigned long nr_segs) { int ret = -EFAULT; if (!user_access_begin(uiov, nr_segs * sizeof(*uiov))) return -EFAULT; do { void __user *buf; ssize_t len; unsafe_get_user(len, &uiov->iov_len, uaccess_end); unsafe_get_user(buf, &uiov->iov_base, uaccess_end); /* check for size_t not fitting in ssize_t .. */ if (unlikely(len < 0)) { ret = -EINVAL; goto uaccess_end; } iov->iov_base = buf; iov->iov_len = len; uiov++; iov++; } while (--nr_segs); ret = 0; uaccess_end: user_access_end(); return ret; } struct iovec *iovec_from_user(const struct iovec __user *uvec, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_iov, bool compat) { struct iovec *iov = fast_iov; int ret; /* * SuS says "The readv() function *may* fail if the iovcnt argument was * less than or equal to 0, or greater than {IOV_MAX}. Linux has * traditionally returned zero for zero segments, so... */ if (nr_segs == 0) return iov; if (nr_segs > UIO_MAXIOV) return ERR_PTR(-EINVAL); if (nr_segs > fast_segs) { iov = kmalloc_objs(struct iovec, nr_segs); if (!iov) return ERR_PTR(-ENOMEM); } if (unlikely(compat)) ret = copy_compat_iovec_from_user(iov, uvec, nr_segs); else ret = copy_iovec_from_user(iov, uvec, nr_segs); if (ret) { if (iov != fast_iov) kfree(iov); return ERR_PTR(ret); } return iov; } /* * Single segment iovec supplied by the user, import it as ITER_UBUF. */ static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec, struct iovec **iovp, struct iov_iter *i, bool compat) { struct iovec *iov = *iovp; ssize_t ret; *iovp = NULL; if (compat) ret = copy_compat_iovec_from_user(iov, uvec, 1); else ret = copy_iovec_from_user(iov, uvec, 1); if (unlikely(ret)) return ret; ret = import_ubuf(type, iov->iov_base, iov->iov_len, i); if (unlikely(ret)) return ret; return i->count; } ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i, bool compat) { ssize_t total_len = 0; unsigned long seg; struct iovec *iov; if (nr_segs == 1) return __import_iovec_ubuf(type, uvec, iovp, i, compat); iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat); if (IS_ERR(iov)) { *iovp = NULL; return PTR_ERR(iov); } /* * According to the Single Unix Specification we should return EINVAL if * an element length is < 0 when cast to ssize_t or if the total length * would overflow the ssize_t return value of the system call. * * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the * overflow case. */ for (seg = 0; seg < nr_segs; seg++) { ssize_t len = (ssize_t)iov[seg].iov_len; if (!access_ok(iov[seg].iov_base, len)) { if (iov != *iovp) kfree(iov); *iovp = NULL; return -EFAULT; } if (len > MAX_RW_COUNT - total_len) { len = MAX_RW_COUNT - total_len; iov[seg].iov_len = len; } total_len += len; } iov_iter_init(i, type, iov, nr_segs, total_len); if (iov == *iovp) *iovp = NULL; else *iovp = iov; return total_len; } /** * import_iovec() - Copy an array of &struct iovec from userspace * into the kernel, check that it is valid, and initialize a new * &struct iov_iter iterator to access it. * * @type: One of %READ or %WRITE. 
* @uvec: Pointer to the userspace array. * @nr_segs: Number of elements in userspace array. * @fast_segs: Number of elements in @iov. * @iovp: (input and output parameter) Pointer to pointer to (usually small * on-stack) kernel array. * @i: Pointer to iterator that will be initialized on success. * * If the array pointed to by *@iov is large enough to hold all @nr_segs, * then this function places %NULL in *@iov on return. Otherwise, a new * array will be allocated and the result placed in *@iov. This means that * the caller may call kfree() on *@iov regardless of whether the small * on-stack array was used or not (and regardless of whether this function * returns an error or not). * * Return: Negative error code on error, bytes imported on success */ ssize_t import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i) { return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i, in_compat_syscall()); } EXPORT_SYMBOL(import_iovec); int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i) { if (len > MAX_RW_COUNT) len = MAX_RW_COUNT; if (unlikely(!access_ok(buf, len))) return -EFAULT; iov_iter_ubuf(i, rw, buf, len); return 0; } EXPORT_SYMBOL_GPL(import_ubuf); /** * iov_iter_restore() - Restore a &struct iov_iter to the same state as when * iov_iter_save_state() was called. * * @i: &struct iov_iter to restore * @state: state to restore from * * Used after iov_iter_save_state() to bring restore @i, if operations may * have advanced it. * * Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC */ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state) { if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) && !iter_is_ubuf(i)) && !iov_iter_is_kvec(i)) return; i->iov_offset = state->iov_offset; i->count = state->count; if (iter_is_ubuf(i)) return; /* * For the *vec iters, nr_segs + iov is constant - if we increment * the vec, then we also decrement the nr_segs count. Hence we don't * need to track both of these, just one is enough and we can deduct * the other from that. ITER_KVEC and ITER_IOVEC are the same struct * size, so we can just increment the iov pointer as they are unionzed. * ITER_BVEC _may_ be the same size on some archs, but on others it is * not. Be safe and handle it separately. */ BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec)); if (iov_iter_is_bvec(i)) i->bvec -= state->nr_segs - i->nr_segs; else i->__iov -= state->nr_segs - i->nr_segs; i->nr_segs = state->nr_segs; } /* * Extract a list of contiguous pages from an ITER_FOLIOQ iterator. This does * not get references on the pages, nor does it get a pin on them. 
*/ static ssize_t iov_iter_extract_folioq_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { const struct folio_queue *folioq = i->folioq; struct page **p; unsigned int nr = 0; size_t extracted = 0, offset, slot = i->folioq_slot; if (slot >= folioq_nr_slots(folioq)) { folioq = folioq->next; slot = 0; if (WARN_ON(i->iov_offset != 0)) return -EIO; } offset = i->iov_offset & ~PAGE_MASK; *offset0 = offset; maxpages = want_pages_array(pages, maxsize, offset, maxpages); if (!maxpages) return -ENOMEM; p = *pages; for (;;) { struct folio *folio = folioq_folio(folioq, slot); size_t offset = i->iov_offset, fsize = folioq_folio_size(folioq, slot); size_t part = PAGE_SIZE - offset % PAGE_SIZE; if (offset < fsize) { part = umin(part, umin(maxsize - extracted, fsize - offset)); i->count -= part; i->iov_offset += part; extracted += part; p[nr++] = folio_page(folio, offset / PAGE_SIZE); } if (nr >= maxpages || extracted >= maxsize) break; if (i->iov_offset >= fsize) { i->iov_offset = 0; slot++; if (slot == folioq_nr_slots(folioq) && folioq->next) { folioq = folioq->next; slot = 0; } } } i->folioq = folioq; i->folioq_slot = slot; return extracted; } /* * Extract a list of contiguous pages from an ITER_XARRAY iterator. This does not * get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { struct page **p; struct folio *folio; unsigned int nr = 0, offset; loff_t pos = i->xarray_start + i->iov_offset; XA_STATE(xas, i->xarray, pos >> PAGE_SHIFT); offset = pos & ~PAGE_MASK; *offset0 = offset; maxpages = want_pages_array(pages, maxsize, offset, maxpages); if (!maxpages) return -ENOMEM; p = *pages; rcu_read_lock(); for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { if (xas_retry(&xas, folio)) continue; /* Has the folio moved or been split? */ if (unlikely(folio != xas_reload(&xas))) { xas_reset(&xas); continue; } p[nr++] = folio_file_page(folio, xas.xa_index); if (nr == maxpages) break; } rcu_read_unlock(); maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize); iov_iter_advance(i, maxsize); return maxsize; } /* * Extract a list of virtually contiguous pages from an ITER_BVEC iterator. * This does not get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { size_t skip = i->iov_offset, size = 0; struct bvec_iter bi; int k = 0; if (i->nr_segs == 0) return 0; if (i->iov_offset == i->bvec->bv_len) { i->iov_offset = 0; i->nr_segs--; i->bvec++; skip = 0; } bi.bi_idx = 0; bi.bi_size = maxsize; bi.bi_bvec_done = skip; maxpages = want_pages_array(pages, maxsize, skip, maxpages); while (bi.bi_size && bi.bi_idx < i->nr_segs) { struct bio_vec bv = bvec_iter_bvec(i->bvec, bi); /* * The iov_iter_extract_pages interface only allows an offset * into the first page. Break out of the loop if we see an * offset into subsequent pages, the caller will have to call * iov_iter_extract_pages again for the reminder. 
*/ if (k) { if (bv.bv_offset) break; } else { *offset0 = bv.bv_offset; } (*pages)[k++] = bv.bv_page; size += bv.bv_len; if (k >= maxpages) break; /* * We are done when the end of the bvec doesn't align to a page * boundary as that would create a hole in the returned space. * The caller will handle this with another call to * iov_iter_extract_pages. */ if (bv.bv_offset + bv.bv_len != PAGE_SIZE) break; bvec_iter_advance_single(i->bvec, &bi, bv.bv_len); } iov_iter_advance(i, size); return size; } /* * Extract a list of virtually contiguous pages from an ITER_KVEC iterator. * This does not get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_kvec_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { struct page **p, *page; const void *kaddr; size_t skip = i->iov_offset, offset, len, size; int k; for (;;) { if (i->nr_segs == 0) return 0; size = min(maxsize, i->kvec->iov_len - skip); if (size) break; i->iov_offset = 0; i->nr_segs--; i->kvec++; skip = 0; } kaddr = i->kvec->iov_base + skip; offset = (unsigned long)kaddr & ~PAGE_MASK; *offset0 = offset; maxpages = want_pages_array(pages, size, offset, maxpages); if (!maxpages) return -ENOMEM; p = *pages; kaddr -= offset; len = offset + size; for (k = 0; k < maxpages; k++) { size_t seg = min_t(size_t, len, PAGE_SIZE); if (is_vmalloc_or_module_addr(kaddr)) page = vmalloc_to_page(kaddr); else page = virt_to_page(kaddr); p[k] = page; len -= seg; kaddr += PAGE_SIZE; } size = min_t(size_t, size, maxpages * PAGE_SIZE - offset); iov_iter_advance(i, size); return size; } /* * Extract a list of contiguous pages from a user iterator and get a pin on * each of them. This should only be used if the iterator is user-backed * (IOBUF/UBUF). * * It does not get refs on the pages, but the pages must be unpinned by the * caller once the transfer is complete. * * This is safe to be used where background IO/DMA *is* going to be modifying * the buffer; using a pin rather than a ref makes forces fork() to give the * child a copy of the page. */ static ssize_t iov_iter_extract_user_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { unsigned long addr; unsigned int gup_flags = 0; size_t offset; int res; if (i->data_source == ITER_DEST) gup_flags |= FOLL_WRITE; if (extraction_flags & ITER_ALLOW_P2PDMA) gup_flags |= FOLL_PCI_P2PDMA; if (i->nofault) gup_flags |= FOLL_NOFAULT; addr = first_iovec_segment(i, &maxsize); *offset0 = offset = addr % PAGE_SIZE; addr &= PAGE_MASK; maxpages = want_pages_array(pages, maxsize, offset, maxpages); if (!maxpages) return -ENOMEM; res = pin_user_pages_fast(addr, maxpages, gup_flags, *pages); if (unlikely(res <= 0)) return res; maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - offset); iov_iter_advance(i, maxsize); return maxsize; } /** * iov_iter_extract_pages - Extract a list of contiguous pages from an iterator * @i: The iterator to extract from * @pages: Where to return the list of pages * @maxsize: The maximum amount of iterator to extract * @maxpages: The maximum size of the list of pages * @extraction_flags: Flags to qualify request * @offset0: Where to return the starting offset into (*@pages)[0] * * Extract a list of contiguous pages from the current point of the iterator, * advancing the iterator. The maximum number of pages and the maximum amount * of page contents can be set. 
* * If *@pages is NULL, a page list will be allocated to the required size and * *@pages will be set to its base. If *@pages is not NULL, it will be assumed * that the caller allocated a page list at least @maxpages in size and this * will be filled in. * * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA * be allowed on the pages extracted. * * The iov_iter_extract_will_pin() function can be used to query how cleanup * should be performed. * * Extra refs or pins on the pages may be obtained as follows: * * (*) If the iterator is user-backed (ITER_IOVEC/ITER_UBUF), pins will be * added to the pages, but refs will not be taken. * iov_iter_extract_will_pin() will return true. * * (*) If the iterator is ITER_KVEC, ITER_BVEC, ITER_FOLIOQ or ITER_XARRAY, the * pages are merely listed; no extra refs or pins are obtained. * iov_iter_extract_will_pin() will return 0. * * Note also: * * (*) Use with ITER_DISCARD is not supported as that has no content. * * On success, the function sets *@pages to the new pagelist, if allocated, and * sets *offset0 to the offset into the first page. * * It may also return -ENOMEM and -EFAULT. */ ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { maxsize = min_t(size_t, min_t(size_t, maxsize, i->count), MAX_RW_COUNT); if (!maxsize) return 0; if (likely(user_backed_iter(i))) return iov_iter_extract_user_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_kvec(i)) return iov_iter_extract_kvec_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_bvec(i)) return iov_iter_extract_bvec_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_folioq(i)) return iov_iter_extract_folioq_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_xarray(i)) return iov_iter_extract_xarray_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); return -EFAULT; } EXPORT_SYMBOL_GPL(iov_iter_extract_pages); static unsigned int get_contig_folio_len(struct page **pages, unsigned int *num_pages, size_t left, size_t offset) { struct folio *folio = page_folio(pages[0]); size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, left); unsigned int max_pages, i; size_t folio_offset, len; folio_offset = PAGE_SIZE * folio_page_idx(folio, pages[0]) + offset; len = min(folio_size(folio) - folio_offset, left); /* * We might COW a single page in the middle of a large folio, so we have * to check that all pages belong to the same folio. */ left -= contig_sz; max_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); for (i = 1; i < max_pages; i++) { size_t next = min_t(size_t, PAGE_SIZE, left); if (page_folio(pages[i]) != folio || pages[i] != pages[i - 1] + 1) break; contig_sz += next; left -= next; } *num_pages = i; return contig_sz; } #define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) /** * iov_iter_extract_bvecs - Extract bvecs from an iterator * @iter: the iterator to extract from * @bv: bvec return array * @max_size: maximum size to extract from @iter * @nr_vecs: number of vectors in @bv (on in and output) * @max_vecs: maximum vectors in @bv, including those filled before calling * @extraction_flags: flags to qualify request * * Like iov_iter_extract_pages(), but returns physically contiguous ranges * contained in a single folio as a single bvec instead of multiple entries. 
* * Returns the number of bytes extracted when successful, or a negative errno. * If @nr_vecs was non-zero on entry, the number of successfully extracted bytes * can be 0. */ ssize_t iov_iter_extract_bvecs(struct iov_iter *iter, struct bio_vec *bv, size_t max_size, unsigned short *nr_vecs, unsigned short max_vecs, iov_iter_extraction_t extraction_flags) { unsigned short entries_left = max_vecs - *nr_vecs; unsigned short nr_pages, i = 0; size_t left, offset, len; struct page **pages; ssize_t size; /* * Move page array up in the allocated memory for the bio vecs as far as * possible so that we can start filling biovecs from the beginning * without overwriting the temporary page array. */ BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); pages = (struct page **)(bv + *nr_vecs) + entries_left * (PAGE_PTRS_PER_BVEC - 1); size = iov_iter_extract_pages(iter, &pages, max_size, entries_left, extraction_flags, &offset); if (unlikely(size <= 0)) return size ? size : -EFAULT; nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); for (left = size; left > 0; left -= len) { unsigned int nr_to_add; if (*nr_vecs > 0 && !zone_device_pages_have_same_pgmap(bv[*nr_vecs - 1].bv_page, pages[i])) break; len = get_contig_folio_len(&pages[i], &nr_to_add, left, offset); bvec_set_page(&bv[*nr_vecs], pages[i], len, offset); i += nr_to_add; (*nr_vecs)++; offset = 0; } iov_iter_revert(iter, left); if (iov_iter_extract_will_pin(iter)) { while (i < nr_pages) unpin_user_page(pages[i++]); } return size - left; } EXPORT_SYMBOL_GPL(iov_iter_extract_bvecs); |
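For reference, a minimal usage sketch of the iterator API defined above, assuming an ordinary kernel context: the helper name and buffers are hypothetical, while iov_iter_kvec(), copy_to_iter() and iov_iter_count() are the interfaces exported by this file and declared in include/linux/uio.h.

#include <linux/uio.h>
#include <linux/errno.h>

/*
 * Hypothetical helper (illustration only): copy @len bytes from the kernel
 * buffer @src into @dst by driving a single-segment, kvec-backed iterator.
 */
static ssize_t example_fill_kvec(void *dst, const void *src, size_t len)
{
	struct kvec vec = { .iov_base = dst, .iov_len = len };
	struct iov_iter iter;
	size_t copied;

	/* ITER_DEST: the iterator is the destination of the copy. */
	iov_iter_kvec(&iter, ITER_DEST, &vec, 1, len);

	copied = copy_to_iter(src, len, &iter);
	if (copied != len)		/* kvec copies should not fall short */
		return -EFAULT;

	/* copy_to_iter() has advanced the iterator past the copied bytes. */
	return copied;
}

Because the segments here are kernel virtual addresses, iterate_and_advance() dispatches to the memcpy_to_iter() step rather than copy_to_user_iter(), so the copy cannot fault.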
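A second sketch, under the same assumptions (the surrounding function is hypothetical), shows the usual calling convention for import_iovec() as described in its kernel-doc above, in particular the rule that *iovp may be passed to kfree() regardless of whether the on-stack array was used.

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uio.h>

static ssize_t example_import_and_use(const struct iovec __user *uvec,
				      unsigned int nr_segs)
{
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	ssize_t ret;

	/* ITER_DEST: data will later be copied *into* the user buffers. */
	ret = import_iovec(ITER_DEST, uvec, nr_segs, ARRAY_SIZE(iovstack),
			   &iov, &iter);
	if (ret < 0)
		return ret;

	/* ... hand &iter to copy_to_iter() or the actual I/O path here ... */

	/* *iovp is NULL when the on-stack array was used, so this is safe. */
	kfree(iov);
	return ret;
}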
// SPDX-License-Identifier: GPL-2.0+
/*
 * XArray implementation
 * Copyright (c) 2017-2018 Microsoft Corporation
 * Copyright (c) 2018-2020 Oracle
 * Author: Matthew Wilcox <willy@infradead.org>
 */

#include <linux/bitmap.h>
#include <linux/export.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/xarray.h>

#include "radix-tree.h"

/*
 * Coding conventions in this file:
 *
 * @xa is used to refer to the entire xarray.
 * @xas is the 'xarray operation state'.  It may be either a pointer to
 * an xa_state, or an xa_state stored on the stack.  This is an unfortunate
 * ambiguity.
 * @index is the index of the entry being operated on
 * @mark is an xa_mark_t; a small number indicating one of the mark bits.
 * @node refers to an xa_node; usually the primary one being operated on by
 * this function.
 * @offset is the index into the slots array inside an xa_node.
 * @parent refers to the @xa_node closer to the head than @node.
* @entry refers to something stored in a slot in the xarray */ static inline unsigned int xa_lock_type(const struct xarray *xa) { return (__force unsigned int)xa->xa_flags & 3; } static inline void xas_lock_type(struct xa_state *xas, unsigned int lock_type) { if (lock_type == XA_LOCK_IRQ) xas_lock_irq(xas); else if (lock_type == XA_LOCK_BH) xas_lock_bh(xas); else xas_lock(xas); } static inline void xas_unlock_type(struct xa_state *xas, unsigned int lock_type) { if (lock_type == XA_LOCK_IRQ) xas_unlock_irq(xas); else if (lock_type == XA_LOCK_BH) xas_unlock_bh(xas); else xas_unlock(xas); } static inline bool xa_track_free(const struct xarray *xa) { return xa->xa_flags & XA_FLAGS_TRACK_FREE; } static inline bool xa_zero_busy(const struct xarray *xa) { return xa->xa_flags & XA_FLAGS_ZERO_BUSY; } static inline void xa_mark_set(struct xarray *xa, xa_mark_t mark) { if (!(xa->xa_flags & XA_FLAGS_MARK(mark))) xa->xa_flags |= XA_FLAGS_MARK(mark); } static inline void xa_mark_clear(struct xarray *xa, xa_mark_t mark) { if (xa->xa_flags & XA_FLAGS_MARK(mark)) xa->xa_flags &= ~(XA_FLAGS_MARK(mark)); } static inline unsigned long *node_marks(struct xa_node *node, xa_mark_t mark) { return node->marks[(__force unsigned)mark]; } static inline bool node_get_mark(struct xa_node *node, unsigned int offset, xa_mark_t mark) { return test_bit(offset, node_marks(node, mark)); } /* returns true if the bit was set */ static inline bool node_set_mark(struct xa_node *node, unsigned int offset, xa_mark_t mark) { return __test_and_set_bit(offset, node_marks(node, mark)); } /* returns true if the bit was set */ static inline bool node_clear_mark(struct xa_node *node, unsigned int offset, xa_mark_t mark) { return __test_and_clear_bit(offset, node_marks(node, mark)); } static inline bool node_any_mark(struct xa_node *node, xa_mark_t mark) { return !bitmap_empty(node_marks(node, mark), XA_CHUNK_SIZE); } static inline void node_mark_all(struct xa_node *node, xa_mark_t mark) { bitmap_fill(node_marks(node, mark), XA_CHUNK_SIZE); } #define mark_inc(mark) do { \ mark = (__force xa_mark_t)((__force unsigned)(mark) + 1); \ } while (0) /* * xas_squash_marks() - Merge all marks to the first entry * @xas: Array operation state. * * Set a mark on the first entry if any entry has it set. Clear marks on * all sibling entries. */ static void xas_squash_marks(const struct xa_state *xas) { xa_mark_t mark = 0; unsigned int limit = xas->xa_offset + xas->xa_sibs + 1; for (;;) { unsigned long *marks = node_marks(xas->xa_node, mark); if (find_next_bit(marks, limit, xas->xa_offset + 1) != limit) { __set_bit(xas->xa_offset, marks); bitmap_clear(marks, xas->xa_offset + 1, xas->xa_sibs); } if (mark == XA_MARK_MAX) break; mark_inc(mark); } } /* extracts the offset within this node from the index */ static unsigned int get_offset(unsigned long index, struct xa_node *node) { return (index >> node->shift) & XA_CHUNK_MASK; } static void xas_set_offset(struct xa_state *xas) { xas->xa_offset = get_offset(xas->xa_index, xas->xa_node); } /* move the index either forwards (find) or backwards (sibling slot) */ static void xas_move_index(struct xa_state *xas, unsigned long offset) { unsigned int shift = xas->xa_node->shift; xas->xa_index &= ~XA_CHUNK_MASK << shift; xas->xa_index += offset << shift; } static void xas_next_offset(struct xa_state *xas) { xas->xa_offset++; xas_move_index(xas, xas->xa_offset); } static void *set_bounds(struct xa_state *xas) { xas->xa_node = XAS_BOUNDS; return NULL; } /* * Starts a walk. 
If the @xas is already valid, we assume that it's on * the right path and just return where we've got to. If we're in an * error state, return NULL. If the index is outside the current scope * of the xarray, return NULL without changing @xas->xa_node. Otherwise * set @xas->xa_node to NULL and return the current head of the array. */ static void *xas_start(struct xa_state *xas) { void *entry; if (xas_valid(xas)) return xas_reload(xas); if (xas_error(xas)) return NULL; entry = xa_head(xas->xa); if (!xa_is_node(entry)) { if (xas->xa_index) return set_bounds(xas); } else { if ((xas->xa_index >> xa_to_node(entry)->shift) > XA_CHUNK_MASK) return set_bounds(xas); } xas->xa_node = NULL; return entry; } static __always_inline void *xas_descend(struct xa_state *xas, struct xa_node *node) { unsigned int offset = get_offset(xas->xa_index, node); void *entry = xa_entry(xas->xa, node, offset); xas->xa_node = node; while (xa_is_sibling(entry)) { offset = xa_to_sibling(entry); entry = xa_entry(xas->xa, node, offset); if (node->shift && xa_is_node(entry)) entry = XA_RETRY_ENTRY; } xas->xa_offset = offset; return entry; } /** * xas_load() - Load an entry from the XArray (advanced). * @xas: XArray operation state. * * Usually walks the @xas to the appropriate state to load the entry * stored at xa_index. However, it will do nothing and return %NULL if * @xas is in an error state. xas_load() will never expand the tree. * * If the xa_state is set up to operate on a multi-index entry, xas_load() * may return %NULL or an internal entry, even if there are entries * present within the range specified by @xas. * * Context: Any context. The caller should hold the xa_lock or the RCU lock. * Return: Usually an entry in the XArray, but see description for exceptions. */ void *xas_load(struct xa_state *xas) { void *entry = xas_start(xas); while (xa_is_node(entry)) { struct xa_node *node = xa_to_node(entry); if (xas->xa_shift > node->shift) break; entry = xas_descend(xas, node); if (node->shift == 0) break; } return entry; } EXPORT_SYMBOL_GPL(xas_load); #define XA_RCU_FREE ((struct xarray *)1) static void xa_node_free(struct xa_node *node) { XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); node->array = XA_RCU_FREE; call_rcu(&node->rcu_head, radix_tree_node_rcu_free); } /* * xas_destroy() - Free any resources allocated during the XArray operation. * @xas: XArray operation state. * * Most users will not need to call this function; it is called for you * by xas_nomem(). */ void xas_destroy(struct xa_state *xas) { struct xa_node *next, *node = xas->xa_alloc; while (node) { XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); next = rcu_dereference_raw(node->parent); radix_tree_node_rcu_free(&node->rcu_head); xas->xa_alloc = node = next; } } EXPORT_SYMBOL_GPL(xas_destroy); /** * xas_nomem() - Allocate memory if needed. * @xas: XArray operation state. * @gfp: Memory allocation flags. * * If we need to add new nodes to the XArray, we try to allocate memory * with GFP_NOWAIT while holding the lock, which will usually succeed. * If it fails, @xas is flagged as needing memory to continue. The caller * should drop the lock and call xas_nomem(). If xas_nomem() succeeds, * the caller should retry the operation. * * Forward progress is guaranteed as one node is allocated here and * stored in the xa_state where it will be found by xas_alloc(). More * nodes will likely be found in the slab allocator, but we do not tie * them up here. * * Return: true if memory was needed, and was successfully allocated. 
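 *
 * An illustrative sketch of the retry pattern this enables; "my_xa",
 * "index" and "item" are placeholder names, not part of this file:
 *
 *	XA_STATE(xas, &my_xa, index);
 *
 *	do {
 *		xas_lock(&xas);
 *		xas_store(&xas, item);
 *		xas_unlock(&xas);
 *	} while (xas_nomem(&xas, GFP_KERNEL));
 *	if (xas_error(&xas))
 *		return xas_error(&xas);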
*/ bool xas_nomem(struct xa_state *xas, gfp_t gfp) { if (xas->xa_node != XA_ERROR(-ENOMEM)) { xas_destroy(xas); return false; } if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT) gfp |= __GFP_ACCOUNT; xas->xa_alloc = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); if (!xas->xa_alloc) return false; xas->xa_alloc->parent = NULL; XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list)); xas->xa_node = XAS_RESTART; return true; } EXPORT_SYMBOL_GPL(xas_nomem); /* * __xas_nomem() - Drop locks and allocate memory if needed. * @xas: XArray operation state. * @gfp: Memory allocation flags. * * Internal variant of xas_nomem(). * * Return: true if memory was needed, and was successfully allocated. */ static bool __xas_nomem(struct xa_state *xas, gfp_t gfp) __must_hold(xas->xa->xa_lock) { unsigned int lock_type = xa_lock_type(xas->xa); if (xas->xa_node != XA_ERROR(-ENOMEM)) { xas_destroy(xas); return false; } if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT) gfp |= __GFP_ACCOUNT; if (gfpflags_allow_blocking(gfp)) { xas_unlock_type(xas, lock_type); xas->xa_alloc = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); xas_lock_type(xas, lock_type); } else { xas->xa_alloc = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); } if (!xas->xa_alloc) return false; xas->xa_alloc->parent = NULL; XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list)); xas->xa_node = XAS_RESTART; return true; } static void xas_update(struct xa_state *xas, struct xa_node *node) { if (xas->xa_update) xas->xa_update(node); else XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); } static void *xas_alloc(struct xa_state *xas, unsigned int shift) { struct xa_node *parent = xas->xa_node; struct xa_node *node = xas->xa_alloc; if (xas_invalid(xas)) return NULL; if (node) { xas->xa_alloc = NULL; } else { gfp_t gfp = GFP_NOWAIT; if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT) gfp |= __GFP_ACCOUNT; node = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); if (!node) { xas_set_err(xas, -ENOMEM); return NULL; } } if (parent) { node->offset = xas->xa_offset; parent->count++; XA_NODE_BUG_ON(node, parent->count > XA_CHUNK_SIZE); xas_update(xas, parent); } XA_NODE_BUG_ON(node, shift > BITS_PER_LONG); XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); node->shift = shift; node->count = 0; node->nr_values = 0; RCU_INIT_POINTER(node->parent, xas->xa_node); node->array = xas->xa; return node; } #ifdef CONFIG_XARRAY_MULTI /* Returns the number of indices covered by a given xa_state */ static unsigned long xas_size(const struct xa_state *xas) { return (xas->xa_sibs + 1UL) << xas->xa_shift; } #endif /* * Use this to calculate the maximum index that will need to be created * in order to add the entry described by @xas. Because we cannot store a * multi-index entry at index 0, the calculation is a little more complex * than you might expect. */ static unsigned long xas_max(struct xa_state *xas) { unsigned long max = xas->xa_index; #ifdef CONFIG_XARRAY_MULTI if (xas->xa_shift || xas->xa_sibs) { unsigned long mask = xas_size(xas) - 1; max |= mask; if (mask == max) max++; } #endif return max; } /* The maximum index that can be contained in the array without expanding it */ static unsigned long max_index(void *entry) { if (!xa_is_node(entry)) return 0; return (XA_CHUNK_SIZE << xa_to_node(entry)->shift) - 1; } static inline void *xa_zero_to_null(void *entry) { return xa_is_zero(entry) ? 
NULL : entry; } static void xas_shrink(struct xa_state *xas) { struct xarray *xa = xas->xa; struct xa_node *node = xas->xa_node; for (;;) { void *entry; XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); if (node->count != 1) break; entry = xa_entry_locked(xa, node, 0); if (!entry) break; if (!xa_is_node(entry) && node->shift) break; if (xa_zero_busy(xa)) entry = xa_zero_to_null(entry); xas->xa_node = XAS_BOUNDS; RCU_INIT_POINTER(xa->xa_head, entry); if (xa_track_free(xa) && !node_get_mark(node, 0, XA_FREE_MARK)) xa_mark_clear(xa, XA_FREE_MARK); node->count = 0; node->nr_values = 0; if (!xa_is_node(entry)) RCU_INIT_POINTER(node->slots[0], XA_RETRY_ENTRY); xas_update(xas, node); xa_node_free(node); if (!xa_is_node(entry)) break; node = xa_to_node(entry); node->parent = NULL; } } /* * xas_delete_node() - Attempt to delete an xa_node * @xas: Array operation state. * * Attempts to delete the @xas->xa_node. This will fail if xa->node has * a non-zero reference count. */ static void xas_delete_node(struct xa_state *xas) { struct xa_node *node = xas->xa_node; for (;;) { struct xa_node *parent; XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); if (node->count) break; parent = xa_parent_locked(xas->xa, node); xas->xa_node = parent; xas->xa_offset = node->offset; xa_node_free(node); if (!parent) { xas->xa->xa_head = NULL; xas->xa_node = XAS_BOUNDS; return; } parent->slots[xas->xa_offset] = NULL; parent->count--; XA_NODE_BUG_ON(parent, parent->count > XA_CHUNK_SIZE); node = parent; xas_update(xas, node); } if (!node->parent) xas_shrink(xas); } /** * xas_free_nodes() - Free this node and all nodes that it references * @xas: Array operation state. * @top: Node to free * * This node has been removed from the tree. We must now free it and all * of its subnodes. There may be RCU walkers with references into the tree, * so we must replace all entries with retry markers. 
*/ static void xas_free_nodes(struct xa_state *xas, struct xa_node *top) { unsigned int offset = 0; struct xa_node *node = top; for (;;) { void *entry = xa_entry_locked(xas->xa, node, offset); if (node->shift && xa_is_node(entry)) { node = xa_to_node(entry); offset = 0; continue; } if (entry) RCU_INIT_POINTER(node->slots[offset], XA_RETRY_ENTRY); offset++; while (offset == XA_CHUNK_SIZE) { struct xa_node *parent; parent = xa_parent_locked(xas->xa, node); offset = node->offset + 1; node->count = 0; node->nr_values = 0; xas_update(xas, node); xa_node_free(node); if (node == top) return; node = parent; } } } /* * xas_expand adds nodes to the head of the tree until it has reached * sufficient height to be able to contain @xas->xa_index */ static int xas_expand(struct xa_state *xas, void *head) { struct xarray *xa = xas->xa; struct xa_node *node = NULL; unsigned int shift = 0; unsigned long max = xas_max(xas); if (!head) { if (max == 0) return 0; while ((max >> shift) >= XA_CHUNK_SIZE) shift += XA_CHUNK_SHIFT; return shift + XA_CHUNK_SHIFT; } else if (xa_is_node(head)) { node = xa_to_node(head); shift = node->shift + XA_CHUNK_SHIFT; } xas->xa_node = NULL; while (max > max_index(head)) { xa_mark_t mark = 0; XA_NODE_BUG_ON(node, shift > BITS_PER_LONG); node = xas_alloc(xas, shift); if (!node) return -ENOMEM; node->count = 1; if (xa_is_value(head)) node->nr_values = 1; RCU_INIT_POINTER(node->slots[0], head); /* Propagate the aggregated mark info to the new child */ for (;;) { if (xa_track_free(xa) && mark == XA_FREE_MARK) { node_mark_all(node, XA_FREE_MARK); if (!xa_marked(xa, XA_FREE_MARK)) { node_clear_mark(node, 0, XA_FREE_MARK); xa_mark_set(xa, XA_FREE_MARK); } } else if (xa_marked(xa, mark)) { node_set_mark(node, 0, mark); } if (mark == XA_MARK_MAX) break; mark_inc(mark); } /* * Now that the new node is fully initialised, we can add * it to the tree */ if (xa_is_node(head)) { xa_to_node(head)->offset = 0; rcu_assign_pointer(xa_to_node(head)->parent, node); } head = xa_mk_node(node); rcu_assign_pointer(xa->xa_head, head); xas_update(xas, node); shift += XA_CHUNK_SHIFT; } xas->xa_node = node; return shift; } /* * xas_create() - Create a slot to store an entry in. * @xas: XArray operation state. * @allow_root: %true if we can store the entry in the root directly * * Most users will not need to call this function directly, as it is called * by xas_store(). It is useful for doing conditional store operations * (see the xa_cmpxchg() implementation for an example). * * Return: If the slot already existed, returns the contents of this slot. * If the slot was newly created, returns %NULL. If it failed to create the * slot, returns %NULL and indicates the error in @xas. 
*/ static void *xas_create(struct xa_state *xas, bool allow_root) { struct xarray *xa = xas->xa; void *entry; void __rcu **slot; struct xa_node *node = xas->xa_node; int shift; unsigned int order = xas->xa_shift; if (xas_top(node)) { entry = xa_head_locked(xa); xas->xa_node = NULL; if (!entry && xa_zero_busy(xa)) entry = XA_ZERO_ENTRY; shift = xas_expand(xas, entry); if (shift < 0) return NULL; if (!shift && !allow_root) shift = XA_CHUNK_SHIFT; entry = xa_head_locked(xa); slot = &xa->xa_head; } else if (xas_error(xas)) { return NULL; } else if (node) { unsigned int offset = xas->xa_offset; shift = node->shift; entry = xa_entry_locked(xa, node, offset); slot = &node->slots[offset]; } else { shift = 0; entry = xa_head_locked(xa); slot = &xa->xa_head; } while (shift > order) { shift -= XA_CHUNK_SHIFT; if (!entry) { node = xas_alloc(xas, shift); if (!node) break; if (xa_track_free(xa)) node_mark_all(node, XA_FREE_MARK); rcu_assign_pointer(*slot, xa_mk_node(node)); } else if (xa_is_node(entry)) { node = xa_to_node(entry); } else { break; } entry = xas_descend(xas, node); slot = &node->slots[xas->xa_offset]; } return entry; } /** * xas_create_range() - Ensure that stores to this range will succeed * @xas: XArray operation state. * * Creates all of the slots in the range covered by @xas. Sets @xas to * create single-index entries and positions it at the beginning of the * range. This is for the benefit of users which have not yet been * converted to use multi-index entries. */ void xas_create_range(struct xa_state *xas) { unsigned long index = xas->xa_index; unsigned char shift = xas->xa_shift; unsigned char sibs = xas->xa_sibs; xas->xa_index |= ((sibs + 1UL) << shift) - 1; if (xas_is_node(xas) && xas->xa_node->shift == xas->xa_shift) xas->xa_offset |= sibs; xas->xa_shift = 0; xas->xa_sibs = 0; for (;;) { xas_create(xas, true); if (xas_error(xas)) goto restore; if (xas->xa_index <= (index | XA_CHUNK_MASK)) goto success; xas->xa_index -= XA_CHUNK_SIZE; for (;;) { struct xa_node *node = xas->xa_node; if (node->shift >= shift) break; xas->xa_node = xa_parent_locked(xas->xa, node); xas->xa_offset = node->offset - 1; if (node->offset != 0) break; } } restore: xas->xa_shift = shift; xas->xa_sibs = sibs; xas->xa_index = index; return; success: xas->xa_index = index; if (xas->xa_node) xas_set_offset(xas); } EXPORT_SYMBOL_GPL(xas_create_range); static void update_node(struct xa_state *xas, struct xa_node *node, int count, int values) { if (!node || (!count && !values)) return; node->count += count; node->nr_values += values; XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); XA_NODE_BUG_ON(node, node->nr_values > XA_CHUNK_SIZE); xas_update(xas, node); if (count < 0) xas_delete_node(xas); } /** * xas_store() - Store this entry in the XArray. * @xas: XArray operation state. * @entry: New entry. * * If @xas is operating on a multi-index entry, the entry returned by this * function is essentially meaningless (it may be an internal entry or it * may be %NULL, even if there are non-NULL entries at some of the indices * covered by the range). This is not a problem for any current users, * and can be changed if needed. * * Return: The old entry at this index. 
*/ void *xas_store(struct xa_state *xas, void *entry) { struct xa_node *node; void __rcu **slot = &xas->xa->xa_head; unsigned int offset, max; int count = 0; int values = 0; void *first, *next; bool value = xa_is_value(entry); if (entry) { bool allow_root = !xa_is_node(entry) && !xa_is_zero(entry); first = xas_create(xas, allow_root); } else { first = xas_load(xas); } if (xas_invalid(xas)) return first; node = xas->xa_node; if (node && (xas->xa_shift < node->shift)) xas->xa_sibs = 0; if ((first == entry) && !xas->xa_sibs) return first; next = first; offset = xas->xa_offset; max = xas->xa_offset + xas->xa_sibs; if (node) { slot = &node->slots[offset]; if (xas->xa_sibs) xas_squash_marks(xas); } if (!entry) xas_init_marks(xas); for (;;) { /* * Must clear the marks before setting the entry to NULL, * otherwise xas_for_each_marked may find a NULL entry and * stop early. rcu_assign_pointer contains a release barrier * so the mark clearing will appear to happen before the * entry is set to NULL. */ rcu_assign_pointer(*slot, entry); if (xa_is_node(next) && (!node || node->shift)) xas_free_nodes(xas, xa_to_node(next)); if (!node) break; count += !next - !entry; values += !xa_is_value(first) - !value; if (entry) { if (offset == max) break; if (!xa_is_sibling(entry)) entry = xa_mk_sibling(xas->xa_offset); } else { if (offset == XA_CHUNK_MASK) break; } next = xa_entry_locked(xas->xa, node, ++offset); if (!xa_is_sibling(next)) { if (!entry && (offset > max)) break; first = next; } slot++; } update_node(xas, node, count, values); return first; } EXPORT_SYMBOL_GPL(xas_store); /** * xas_get_mark() - Returns the state of this mark. * @xas: XArray operation state. * @mark: Mark number. * * Return: true if the mark is set, false if the mark is clear or @xas * is in an error state. */ bool xas_get_mark(const struct xa_state *xas, xa_mark_t mark) { if (xas_invalid(xas)) return false; if (!xas->xa_node) return xa_marked(xas->xa, mark); return node_get_mark(xas->xa_node, xas->xa_offset, mark); } EXPORT_SYMBOL_GPL(xas_get_mark); /** * xas_set_mark() - Sets the mark on this entry and its parents. * @xas: XArray operation state. * @mark: Mark number. * * Sets the specified mark on this entry, and walks up the tree setting it * on all the ancestor entries. Does nothing if @xas has not been walked to * an entry, or is in an error state. */ void xas_set_mark(const struct xa_state *xas, xa_mark_t mark) { struct xa_node *node = xas->xa_node; unsigned int offset = xas->xa_offset; if (xas_invalid(xas)) return; while (node) { if (node_set_mark(node, offset, mark)) return; offset = node->offset; node = xa_parent_locked(xas->xa, node); } if (!xa_marked(xas->xa, mark)) xa_mark_set(xas->xa, mark); } EXPORT_SYMBOL_GPL(xas_set_mark); /** * xas_clear_mark() - Clears the mark on this entry and its parents. * @xas: XArray operation state. * @mark: Mark number. * * Clears the specified mark on this entry, and walks back to the head * attempting to clear it on all the ancestor entries. Does nothing if * @xas has not been walked to an entry, or is in an error state. 
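 *
 * A minimal sketch of how a caller might use this with the advanced API;
 * "my_xa" and "index" are placeholder names, not part of this file:
 *
 *	XA_STATE(xas, &my_xa, index);
 *
 *	xas_lock(&xas);
 *	if (xas_load(&xas))
 *		xas_clear_mark(&xas, XA_MARK_0);
 *	xas_unlock(&xas);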
*/ void xas_clear_mark(const struct xa_state *xas, xa_mark_t mark) { struct xa_node *node = xas->xa_node; unsigned int offset = xas->xa_offset; if (xas_invalid(xas)) return; while (node) { if (!node_clear_mark(node, offset, mark)) return; if (node_any_mark(node, mark)) return; offset = node->offset; node = xa_parent_locked(xas->xa, node); } if (xa_marked(xas->xa, mark)) xa_mark_clear(xas->xa, mark); } EXPORT_SYMBOL_GPL(xas_clear_mark); /** * xas_init_marks() - Initialise all marks for the entry * @xas: Array operations state. * * Initialise all marks for the entry specified by @xas. If we're tracking * free entries with a mark, we need to set it on all entries. All other * marks are cleared. * * This implementation is not as efficient as it could be; we may walk * up the tree multiple times. */ void xas_init_marks(const struct xa_state *xas) { xa_mark_t mark = 0; for (;;) { if (xa_track_free(xas->xa) && mark == XA_FREE_MARK) xas_set_mark(xas, mark); else xas_clear_mark(xas, mark); if (mark == XA_MARK_MAX) break; mark_inc(mark); } } EXPORT_SYMBOL_GPL(xas_init_marks); #ifdef CONFIG_XARRAY_MULTI static unsigned int node_get_marks(struct xa_node *node, unsigned int offset) { unsigned int marks = 0; xa_mark_t mark = XA_MARK_0; for (;;) { if (node_get_mark(node, offset, mark)) marks |= 1 << (__force unsigned int)mark; if (mark == XA_MARK_MAX) break; mark_inc(mark); } return marks; } static inline void node_mark_slots(struct xa_node *node, unsigned int sibs, xa_mark_t mark) { int i; if (sibs == 0) node_mark_all(node, mark); else { for (i = 0; i < XA_CHUNK_SIZE; i += sibs + 1) node_set_mark(node, i, mark); } } static void node_set_marks(struct xa_node *node, unsigned int offset, struct xa_node *child, unsigned int sibs, unsigned int marks) { xa_mark_t mark = XA_MARK_0; for (;;) { if (marks & (1 << (__force unsigned int)mark)) { node_set_mark(node, offset, mark); if (child) node_mark_slots(child, sibs, mark); } if (mark == XA_MARK_MAX) break; mark_inc(mark); } } static void __xas_init_node_for_split(struct xa_state *xas, struct xa_node *node, void *entry) { unsigned int i; void *sibling = NULL; unsigned int mask = xas->xa_sibs; if (!node) return; node->array = xas->xa; for (i = 0; i < XA_CHUNK_SIZE; i++) { if ((i & mask) == 0) { RCU_INIT_POINTER(node->slots[i], entry); sibling = xa_mk_sibling(i); } else { RCU_INIT_POINTER(node->slots[i], sibling); } } } /** * xas_split_alloc() - Allocate memory for splitting an entry. * @xas: XArray operation state. * @entry: New entry which will be stored in the array. * @order: Current entry order. * @gfp: Memory allocation flags. * * This function should be called before calling xas_split(). * If necessary, it will allocate new nodes (and fill them with @entry) * to prepare for the upcoming split of an entry of @order size into * entries of the order stored in the @xas. * * Context: May sleep if @gfp flags permit. 
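 *
 * An illustrative sketch of the expected calling sequence; "my_xa",
 * "index" and "entry" are placeholder names, and splitting an order-9
 * entry into order-0 entries is just an example:
 *
 *	XA_STATE_ORDER(xas, &my_xa, index, 0);
 *
 *	xas_split_alloc(&xas, entry, 9, GFP_KERNEL);
 *	if (xas_error(&xas))
 *		return xas_error(&xas);
 *	xas_lock(&xas);
 *	xas_split(&xas, entry, 9);
 *	xas_unlock(&xas);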
*/ void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order, gfp_t gfp) { unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; /* XXX: no support for splitting really large entries yet */ if (WARN_ON(xas->xa_shift + 2 * XA_CHUNK_SHIFT <= order)) goto nomem; if (xas->xa_shift + XA_CHUNK_SHIFT > order) return; do { struct xa_node *node; node = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); if (!node) goto nomem; __xas_init_node_for_split(xas, node, entry); RCU_INIT_POINTER(node->parent, xas->xa_alloc); xas->xa_alloc = node; } while (sibs-- > 0); return; nomem: xas_destroy(xas); xas_set_err(xas, -ENOMEM); } EXPORT_SYMBOL_GPL(xas_split_alloc); /** * xas_split() - Split a multi-index entry into smaller entries. * @xas: XArray operation state. * @entry: New entry to store in the array. * @order: Current entry order. * * The size of the new entries is set in @xas. The value in @entry is * copied to all the replacement entries. * * Context: Any context. The caller should hold the xa_lock. */ void xas_split(struct xa_state *xas, void *entry, unsigned int order) { unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; unsigned int offset, marks; struct xa_node *node; void *curr = xas_load(xas); int values = 0; node = xas->xa_node; if (xas_top(node)) return; marks = node_get_marks(node, xas->xa_offset); offset = xas->xa_offset + sibs; do { if (xas->xa_shift < node->shift) { struct xa_node *child = xas->xa_alloc; xas->xa_alloc = rcu_dereference_raw(child->parent); child->shift = node->shift - XA_CHUNK_SHIFT; child->offset = offset; child->count = XA_CHUNK_SIZE; child->nr_values = xa_is_value(entry) ? XA_CHUNK_SIZE : 0; RCU_INIT_POINTER(child->parent, node); node_set_marks(node, offset, child, xas->xa_sibs, marks); rcu_assign_pointer(node->slots[offset], xa_mk_node(child)); if (xa_is_value(curr)) values--; xas_update(xas, child); } else { unsigned int canon = offset - xas->xa_sibs; node_set_marks(node, canon, NULL, 0, marks); rcu_assign_pointer(node->slots[canon], entry); while (offset > canon) rcu_assign_pointer(node->slots[offset--], xa_mk_sibling(canon)); values += (xa_is_value(entry) - xa_is_value(curr)) * (xas->xa_sibs + 1); } } while (offset-- > xas->xa_offset); node->nr_values += values; xas_update(xas, node); } EXPORT_SYMBOL_GPL(xas_split); /** * xas_try_split_min_order() - Minimal split order xas_try_split() can accept * @order: Current entry order. * * xas_try_split() can split a multi-index entry to smaller than @order - 1 if * no new xa_node is needed. This function provides the minimal order * xas_try_split() supports. * * Return: the minimal order xas_try_split() supports * * Context: Any context. * */ unsigned int xas_try_split_min_order(unsigned int order) { if (order % XA_CHUNK_SHIFT == 0) return order == 0 ? 0 : order - 1; return order - (order % XA_CHUNK_SHIFT); } EXPORT_SYMBOL_GPL(xas_try_split_min_order); /** * xas_try_split() - Try to split a multi-index entry. * @xas: XArray operation state. * @entry: New entry to store in the array. * @order: Current entry order. * * The size of the new entries is set in @xas. The value in @entry is * copied to all the replacement entries. If and only if one new xa_node is * needed, the function will use GFP_NOWAIT to get one if xas->xa_alloc is * NULL. If more new xa_node are needed, the function gives EINVAL error. * * NOTE: use xas_try_split_min_order() to get next split order instead of * @order - 1 if you want to minmize xas_try_split() calls. * * Context: Any context. 
The caller should hold the xa_lock. */ void xas_try_split(struct xa_state *xas, void *entry, unsigned int order) { unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; unsigned int offset, marks; struct xa_node *node; void *curr = xas_load(xas); int values = 0; gfp_t gfp = GFP_NOWAIT; node = xas->xa_node; if (xas_top(node)) return; if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT) gfp |= __GFP_ACCOUNT; marks = node_get_marks(node, xas->xa_offset); offset = xas->xa_offset + sibs; if (xas->xa_shift < node->shift) { struct xa_node *child = xas->xa_alloc; unsigned int expected_sibs = (1 << ((order - 1) % XA_CHUNK_SHIFT)) - 1; /* * No support for splitting sibling entries * (horizontally) or cascade split (vertically), which * requires two or more new xa_nodes. * Since if one xa_node allocation fails, * it is hard to free the prior allocations. */ if (sibs || xas->xa_sibs != expected_sibs) { xas_destroy(xas); xas_set_err(xas, -EINVAL); return; } if (!child) { child = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); if (!child) { xas_destroy(xas); xas_set_err(xas, -ENOMEM); return; } RCU_INIT_POINTER(child->parent, xas->xa_alloc); } __xas_init_node_for_split(xas, child, entry); xas->xa_alloc = rcu_dereference_raw(child->parent); child->shift = node->shift - XA_CHUNK_SHIFT; child->offset = offset; child->count = XA_CHUNK_SIZE; child->nr_values = xa_is_value(entry) ? XA_CHUNK_SIZE : 0; RCU_INIT_POINTER(child->parent, node); node_set_marks(node, offset, child, xas->xa_sibs, marks); rcu_assign_pointer(node->slots[offset], xa_mk_node(child)); if (xa_is_value(curr)) values--; xas_update(xas, child); } else { do { unsigned int canon = offset - xas->xa_sibs; node_set_marks(node, canon, NULL, 0, marks); rcu_assign_pointer(node->slots[canon], entry); while (offset > canon) rcu_assign_pointer(node->slots[offset--], xa_mk_sibling(canon)); values += (xa_is_value(entry) - xa_is_value(curr)) * (xas->xa_sibs + 1); } while (offset-- > xas->xa_offset); } node->nr_values += values; xas_update(xas, node); } EXPORT_SYMBOL_GPL(xas_try_split); #endif /** * xas_pause() - Pause a walk to drop a lock. * @xas: XArray operation state. * * Some users need to pause a walk and drop the lock they're holding in * order to yield to a higher priority thread or carry out an operation * on an entry. Those users should call this function before they drop * the lock. It resets the @xas to be suitable for the next iteration * of the loop after the user has reacquired the lock. If most entries * found during a walk require you to call xas_pause(), the xa_for_each() * iterator may be more appropriate. * * Note that xas_pause() only works for forward iteration. If a user needs * to pause a reverse iteration, we will need a xas_pause_rev(). */ void xas_pause(struct xa_state *xas) { struct xa_node *node = xas->xa_node; if (xas_invalid(xas)) return; xas->xa_node = XAS_RESTART; if (node) { unsigned long offset = xas->xa_offset; while (++offset < XA_CHUNK_SIZE) { if (!xa_is_sibling(xa_entry(xas->xa, node, offset))) break; } xas->xa_index &= ~0UL << node->shift; xas->xa_index += (offset - xas->xa_offset) << node->shift; if (xas->xa_index == 0) xas->xa_node = XAS_BOUNDS; } else { xas->xa_index++; } } EXPORT_SYMBOL_GPL(xas_pause); /* * __xas_prev() - Find the previous entry in the XArray. * @xas: XArray operation state. * * Helper function for xas_prev() which handles all the complex cases * out of line. 
*/ void *__xas_prev(struct xa_state *xas) { void *entry; if (!xas_frozen(xas->xa_node)) xas->xa_index--; if (!xas->xa_node) return set_bounds(xas); if (xas_not_node(xas->xa_node)) return xas_load(xas); if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node)) xas->xa_offset--; while (xas->xa_offset == 255) { xas->xa_offset = xas->xa_node->offset - 1; xas->xa_node = xa_parent(xas->xa, xas->xa_node); if (!xas->xa_node) return set_bounds(xas); } for (;;) { entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); if (!xa_is_node(entry)) return entry; xas->xa_node = xa_to_node(entry); xas_set_offset(xas); } } EXPORT_SYMBOL_GPL(__xas_prev); /* * __xas_next() - Find the next entry in the XArray. * @xas: XArray operation state. * * Helper function for xas_next() which handles all the complex cases * out of line. */ void *__xas_next(struct xa_state *xas) { void *entry; if (!xas_frozen(xas->xa_node)) xas->xa_index++; if (!xas->xa_node) return set_bounds(xas); if (xas_not_node(xas->xa_node)) return xas_load(xas); if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node)) xas->xa_offset++; while (xas->xa_offset == XA_CHUNK_SIZE) { xas->xa_offset = xas->xa_node->offset + 1; xas->xa_node = xa_parent(xas->xa, xas->xa_node); if (!xas->xa_node) return set_bounds(xas); } for (;;) { entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); if (!xa_is_node(entry)) return entry; xas->xa_node = xa_to_node(entry); xas_set_offset(xas); } } EXPORT_SYMBOL_GPL(__xas_next); /** * xas_find() - Find the next present entry in the XArray. * @xas: XArray operation state. * @max: Highest index to return. * * If the @xas has not yet been walked to an entry, return the entry * which has an index >= xas.xa_index. If it has been walked, the entry * currently being pointed at has been processed, and so we move to the * next entry. * * If no entry is found and the array is smaller than @max, the iterator * is set to the smallest index not yet in the array. This allows @xas * to be immediately passed to xas_store(). * * Return: The entry, if found, otherwise %NULL. */ void *xas_find(struct xa_state *xas, unsigned long max) { void *entry; if (xas_error(xas) || xas->xa_node == XAS_BOUNDS) return NULL; if (xas->xa_index > max) return set_bounds(xas); if (!xas->xa_node) { xas->xa_index = 1; return set_bounds(xas); } else if (xas->xa_node == XAS_RESTART) { entry = xas_load(xas); if (entry || xas_not_node(xas->xa_node)) return entry; } else if (!xas->xa_node->shift && xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)) { xas->xa_offset = ((xas->xa_index - 1) & XA_CHUNK_MASK) + 1; } xas_next_offset(xas); while (xas->xa_node && (xas->xa_index <= max)) { if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) { xas->xa_offset = xas->xa_node->offset + 1; xas->xa_node = xa_parent(xas->xa, xas->xa_node); continue; } entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); if (xa_is_node(entry)) { xas->xa_node = xa_to_node(entry); xas->xa_offset = 0; continue; } if (entry && !xa_is_sibling(entry)) return entry; xas_next_offset(xas); } if (!xas->xa_node) xas->xa_node = XAS_BOUNDS; return NULL; } EXPORT_SYMBOL_GPL(xas_find); /** * xas_find_marked() - Find the next marked entry in the XArray. * @xas: XArray operation state. * @max: Highest index to return. * @mark: Mark number to search for. * * If the @xas has not yet been walked to an entry, return the marked entry * which has an index >= xas.xa_index. 
If it has been walked, the entry * currently being pointed at has been processed, and so we return the * first marked entry with an index > xas.xa_index. * * If no marked entry is found and the array is smaller than @max, @xas is * set to the bounds state and xas->xa_index is set to the smallest index * not yet in the array. This allows @xas to be immediately passed to * xas_store(). * * If no entry is found before @max is reached, @xas is set to the restart * state. * * Return: The entry, if found, otherwise %NULL. */ void *xas_find_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark) { bool advance = true; unsigned int offset; void *entry; if (xas_error(xas)) return NULL; if (xas->xa_index > max) goto max; if (!xas->xa_node) { xas->xa_index = 1; goto out; } else if (xas_top(xas->xa_node)) { advance = false; entry = xa_head(xas->xa); xas->xa_node = NULL; if (xas->xa_index > max_index(entry)) goto out; if (!xa_is_node(entry)) { if (xa_marked(xas->xa, mark)) return entry; xas->xa_index = 1; goto out; } xas->xa_node = xa_to_node(entry); xas->xa_offset = xas->xa_index >> xas->xa_node->shift; } while (xas->xa_index <= max) { if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) { xas->xa_offset = xas->xa_node->offset + 1; xas->xa_node = xa_parent(xas->xa, xas->xa_node); if (!xas->xa_node) break; advance = false; continue; } if (!advance) { entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); if (xa_is_sibling(entry)) { xas->xa_offset = xa_to_sibling(entry); xas_move_index(xas, xas->xa_offset); } } offset = xas_find_chunk(xas, advance, mark); if (offset > xas->xa_offset) { advance = false; xas_move_index(xas, offset); /* Mind the wrap */ if ((xas->xa_index - 1) >= max) goto max; xas->xa_offset = offset; if (offset == XA_CHUNK_SIZE) continue; } entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); if (!entry && !(xa_track_free(xas->xa) && mark == XA_FREE_MARK)) continue; if (xa_is_sibling(entry)) continue; if (!xa_is_node(entry)) return entry; xas->xa_node = xa_to_node(entry); xas_set_offset(xas); } out: if (xas->xa_index > max) goto max; return set_bounds(xas); max: xas->xa_node = XAS_RESTART; return NULL; } EXPORT_SYMBOL_GPL(xas_find_marked); /** * xas_find_conflict() - Find the next present entry in a range. * @xas: XArray operation state. * * The @xas describes both a range and a position within that range. * * Context: Any context. Expects xa_lock to be held. * Return: The next entry in the range covered by @xas or %NULL. */ void *xas_find_conflict(struct xa_state *xas) { void *curr; if (xas_error(xas)) return NULL; if (!xas->xa_node) return NULL; if (xas_top(xas->xa_node)) { curr = xas_start(xas); if (!curr) return NULL; while (xa_is_node(curr)) { struct xa_node *node = xa_to_node(curr); curr = xas_descend(xas, node); } if (curr) return curr; } if (xas->xa_node->shift > xas->xa_shift) return NULL; for (;;) { if (xas->xa_node->shift == xas->xa_shift) { if ((xas->xa_offset & xas->xa_sibs) == xas->xa_sibs) break; } else if (xas->xa_offset == XA_CHUNK_MASK) { xas->xa_offset = xas->xa_node->offset; xas->xa_node = xa_parent_locked(xas->xa, xas->xa_node); if (!xas->xa_node) break; continue; } curr = xa_entry_locked(xas->xa, xas->xa_node, ++xas->xa_offset); if (xa_is_sibling(curr)) continue; while (xa_is_node(curr)) { xas->xa_node = xa_to_node(curr); xas->xa_offset = 0; curr = xa_entry_locked(xas->xa, xas->xa_node, 0); } if (curr) return curr; } xas->xa_offset -= xas->xa_sibs; return NULL; } EXPORT_SYMBOL_GPL(xas_find_conflict); /** * xa_load() - Load an entry from an XArray. 
* @xa: XArray. * @index: index into array. * * Context: Any context. Takes and releases the RCU lock. * Return: The entry at @index in @xa. */ void *xa_load(struct xarray *xa, unsigned long index) { XA_STATE(xas, xa, index); void *entry; rcu_read_lock(); do { entry = xa_zero_to_null(xas_load(&xas)); } while (xas_retry(&xas, entry)); rcu_read_unlock(); return entry; } EXPORT_SYMBOL(xa_load); static void *xas_result(struct xa_state *xas, void *curr) { if (xas_error(xas)) curr = xas->xa_node; return curr; } /** * __xa_erase() - Erase this entry from the XArray while locked. * @xa: XArray. * @index: Index into array. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Any context. Expects xa_lock to be held on entry. * Return: The entry which used to be at this index. */ void *__xa_erase(struct xarray *xa, unsigned long index) { XA_STATE(xas, xa, index); return xas_result(&xas, xa_zero_to_null(xas_store(&xas, NULL))); } EXPORT_SYMBOL(__xa_erase); /** * xa_erase() - Erase this entry from the XArray. * @xa: XArray. * @index: Index of entry. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Any context. Takes and releases the xa_lock. * Return: The entry which used to be at this index. */ void *xa_erase(struct xarray *xa, unsigned long index) { void *entry; xa_lock(xa); entry = __xa_erase(xa, index); xa_unlock(xa); return entry; } EXPORT_SYMBOL(xa_erase); /** * __xa_store() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * You must already be holding the xa_lock when calling this function. * It will drop the lock if needed to allocate memory, and then reacquire * it afterwards. * * Context: Any context. Expects xa_lock to be held on entry. May * release and reacquire xa_lock if @gfp flags permit. * Return: The old entry at this index or xa_err() if an error happened. */ void *__xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { XA_STATE(xas, xa, index); void *curr; if (WARN_ON_ONCE(xa_is_advanced(entry))) return XA_ERROR(-EINVAL); if (xa_track_free(xa) && !entry) entry = XA_ZERO_ENTRY; do { curr = xas_store(&xas, entry); if (xa_track_free(xa)) xas_clear_mark(&xas, XA_FREE_MARK); } while (__xas_nomem(&xas, gfp)); return xas_result(&xas, xa_zero_to_null(curr)); } EXPORT_SYMBOL(__xa_store); /** * xa_store() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * After this function returns, loads from this index will return @entry. * Storing into an existing multi-index entry updates the entry of every index. * The marks associated with @index are unaffected unless @entry is %NULL. * * Context: Any context. Takes and releases the xa_lock. * May sleep if the @gfp flags permit. * Return: The old entry at this index on success, xa_err(-EINVAL) if @entry * cannot be stored in an XArray, or xa_err(-ENOMEM) if memory allocation * failed. 
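 *
 * Minimal usage sketch; "my_xa" and "item" are placeholder names, not
 * part of this file:
 *
 *	void *old = xa_store(&my_xa, 7, item, GFP_KERNEL);
 *
 *	if (xa_is_err(old))
 *		return xa_err(old);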
*/ void *xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; xa_lock(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock(xa); return curr; } EXPORT_SYMBOL(xa_store); static inline void *__xa_cmpxchg_raw(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp); /** * __xa_cmpxchg() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * You must already be holding the xa_lock when calling this function. * It will drop the lock if needed to allocate memory, and then reacquire * it afterwards. * * If the entry at @index is the same as @old, replace it with @entry. * If the return value is equal to @old, then the exchange was successful. * * Context: Any context. Expects xa_lock to be held on entry. May * release and reacquire xa_lock if @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ void *__xa_cmpxchg(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { return xa_zero_to_null(__xa_cmpxchg_raw(xa, index, old, entry, gfp)); } EXPORT_SYMBOL(__xa_cmpxchg); static inline void *__xa_cmpxchg_raw(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { XA_STATE(xas, xa, index); void *curr; if (WARN_ON_ONCE(xa_is_advanced(entry))) return XA_ERROR(-EINVAL); do { curr = xas_load(&xas); if (curr == old) { xas_store(&xas, entry); if (xa_track_free(xa) && entry && !curr) xas_clear_mark(&xas, XA_FREE_MARK); } } while (__xas_nomem(&xas, gfp)); return xas_result(&xas, curr); } /** * __xa_insert() - Store this entry in the XArray if no entry is present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Any context. Expects xa_lock to be held on entry. May * release and reacquire xa_lock if @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ int __xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; int errno; if (!entry) entry = XA_ZERO_ENTRY; curr = __xa_cmpxchg_raw(xa, index, NULL, entry, gfp); errno = xa_err(curr); if (errno) return errno; return (curr != NULL) ? -EBUSY : 0; } EXPORT_SYMBOL(__xa_insert); #ifdef CONFIG_XARRAY_MULTI static void xas_set_range(struct xa_state *xas, unsigned long first, unsigned long last) { unsigned int shift = 0; unsigned long sibs = last - first; unsigned int offset = XA_CHUNK_MASK; xas_set(xas, first); while ((first & XA_CHUNK_MASK) == 0) { if (sibs < XA_CHUNK_MASK) break; if ((sibs == XA_CHUNK_MASK) && (offset < XA_CHUNK_MASK)) break; shift += XA_CHUNK_SHIFT; if (offset == XA_CHUNK_MASK) offset = sibs & XA_CHUNK_MASK; sibs >>= XA_CHUNK_SHIFT; first >>= XA_CHUNK_SHIFT; } offset = first & XA_CHUNK_MASK; if (offset + sibs > XA_CHUNK_MASK) sibs = XA_CHUNK_MASK - offset; if ((((first + sibs + 1) << shift) - 1) > last) sibs -= 1; xas->xa_shift = shift; xas->xa_sibs = sibs; } /** * xa_store_range() - Store this entry at a range of indices in the XArray. * @xa: XArray. * @first: First index to affect. * @last: Last index to affect. * @entry: New entry. * @gfp: Memory allocation flags. 
* * After this function returns, loads from any index between @first and @last, * inclusive will return @entry. * Storing into an existing multi-index entry updates the entry of every index. * The marks associated with @index are unaffected unless @entry is %NULL. * * Context: Process context. Takes and releases the xa_lock. May sleep * if the @gfp flags permit. * Return: %NULL on success, xa_err(-EINVAL) if @entry cannot be stored in * an XArray, or xa_err(-ENOMEM) if memory allocation failed. */ void *xa_store_range(struct xarray *xa, unsigned long first, unsigned long last, void *entry, gfp_t gfp) { XA_STATE(xas, xa, 0); if (WARN_ON_ONCE(xa_is_internal(entry))) return XA_ERROR(-EINVAL); if (last < first) return XA_ERROR(-EINVAL); do { xas_lock(&xas); if (entry) { unsigned int order = BITS_PER_LONG; if (last + 1) order = __ffs(last + 1); xas_set_order(&xas, last, order); xas_create(&xas, true); if (xas_error(&xas)) goto unlock; } do { xas_set_range(&xas, first, last); xas_store(&xas, entry); if (xas_error(&xas)) goto unlock; first += xas_size(&xas); } while (first <= last); unlock: xas_unlock(&xas); } while (xas_nomem(&xas, gfp)); return xas_result(&xas, NULL); } EXPORT_SYMBOL(xa_store_range); /** * xas_get_order() - Get the order of an entry. * @xas: XArray operation state. * * Called after xas_load, the xas should not be in an error state. * The xas should not be pointing to a sibling entry. * * Return: A number between 0 and 63 indicating the order of the entry. */ int xas_get_order(struct xa_state *xas) { int order = 0; if (!xas->xa_node) return 0; XA_NODE_BUG_ON(xas->xa_node, xa_is_sibling(xa_entry(xas->xa, xas->xa_node, xas->xa_offset))); for (;;) { unsigned int slot = xas->xa_offset + (1 << order); if (slot >= XA_CHUNK_SIZE) break; if (!xa_is_sibling(xa_entry(xas->xa, xas->xa_node, slot))) break; order++; } order += xas->xa_node->shift; return order; } EXPORT_SYMBOL_GPL(xas_get_order); /** * xa_get_order() - Get the order of an entry. * @xa: XArray. * @index: Index of the entry. * * Return: A number between 0 and 63 indicating the order of the entry. */ int xa_get_order(struct xarray *xa, unsigned long index) { XA_STATE(xas, xa, index); int order = 0; void *entry; rcu_read_lock(); entry = xas_load(&xas); if (entry) order = xas_get_order(&xas); rcu_read_unlock(); return order; } EXPORT_SYMBOL(xa_get_order); #endif /* CONFIG_XARRAY_MULTI */ /** * __xa_alloc() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @limit: Range for allocated ID. * @entry: New entry. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Expects xa_lock to be held on entry. May * release and reacquire xa_lock if @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. 
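 *
 * Most callers use the xa_alloc() wrapper, which takes the lock itself;
 * an illustrative sketch ("my_xa" and "item" are placeholder names, and
 * my_xa would need to be defined with DEFINE_XARRAY_ALLOC()):
 *
 *	u32 id;
 *	int err = xa_alloc(&my_xa, &id, item, xa_limit_32b, GFP_KERNEL);
 *
 *	if (!err)
 *		pr_debug("stored item at index %u\n", id);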
*/ int __xa_alloc(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { XA_STATE(xas, xa, 0); if (WARN_ON_ONCE(xa_is_advanced(entry))) return -EINVAL; if (WARN_ON_ONCE(!xa_track_free(xa))) return -EINVAL; if (!entry) entry = XA_ZERO_ENTRY; do { xas.xa_index = limit.min; xas_find_marked(&xas, limit.max, XA_FREE_MARK); if (xas.xa_node == XAS_RESTART) xas_set_err(&xas, -EBUSY); else *id = xas.xa_index; xas_store(&xas, entry); xas_clear_mark(&xas, XA_FREE_MARK); } while (__xas_nomem(&xas, gfp)); return xas_error(&xas); } EXPORT_SYMBOL(__xa_alloc); /** * __xa_alloc_cyclic() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Expects xa_lock to be held on entry. May * release and reacquire xa_lock if @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ int __xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { u32 min = limit.min; int ret; limit.min = max(min, *next); ret = __xa_alloc(xa, id, entry, limit, gfp); if ((xa->xa_flags & XA_FLAGS_ALLOC_WRAPPED) && ret == 0) { xa->xa_flags &= ~XA_FLAGS_ALLOC_WRAPPED; ret = 1; } if (ret < 0 && limit.min > min) { limit.min = min; ret = __xa_alloc(xa, id, entry, limit, gfp); if (ret == 0) ret = 1; } if (ret >= 0) { *next = *id + 1; if (*next == 0) xa->xa_flags |= XA_FLAGS_ALLOC_WRAPPED; } return ret; } EXPORT_SYMBOL(__xa_alloc_cyclic); /** * __xa_set_mark() - Set this mark on this entry while locked. * @xa: XArray. * @index: Index of entry. * @mark: Mark number. * * Attempting to set a mark on a %NULL entry does not succeed. * * Context: Any context. Expects xa_lock to be held on entry. */ void __xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) { XA_STATE(xas, xa, index); void *entry = xas_load(&xas); if (entry) xas_set_mark(&xas, mark); } EXPORT_SYMBOL(__xa_set_mark); /** * __xa_clear_mark() - Clear this mark on this entry while locked. * @xa: XArray. * @index: Index of entry. * @mark: Mark number. * * Context: Any context. Expects xa_lock to be held on entry. */ void __xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) { XA_STATE(xas, xa, index); void *entry = xas_load(&xas); if (entry) xas_clear_mark(&xas, mark); } EXPORT_SYMBOL(__xa_clear_mark); /** * xa_get_mark() - Inquire whether this mark is set on this entry. * @xa: XArray. * @index: Index of entry. * @mark: Mark number. * * This function uses the RCU read lock, so the result may be out of date * by the time it returns. If you need the result to be stable, use a lock. * * Context: Any context. Takes and releases the RCU lock. * Return: True if the entry at @index has this mark set, false if it doesn't. 
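 *
 * Illustrative sketch pairing this with xa_set_mark(); "my_xa" is a
 * placeholder name:
 *
 *	xa_set_mark(&my_xa, 7, XA_MARK_0);
 *	if (xa_get_mark(&my_xa, 7, XA_MARK_0))
 *		pr_debug("index 7 is marked\n");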
*/ bool xa_get_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) { XA_STATE(xas, xa, index); void *entry; rcu_read_lock(); entry = xas_start(&xas); while (xas_get_mark(&xas, mark)) { if (!xa_is_node(entry)) goto found; entry = xas_descend(&xas, xa_to_node(entry)); } rcu_read_unlock(); return false; found: rcu_read_unlock(); return true; } EXPORT_SYMBOL(xa_get_mark); /** * xa_set_mark() - Set this mark on this entry. * @xa: XArray. * @index: Index of entry. * @mark: Mark number. * * Attempting to set a mark on a %NULL entry does not succeed. * * Context: Process context. Takes and releases the xa_lock. */ void xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) { xa_lock(xa); __xa_set_mark(xa, index, mark); xa_unlock(xa); } EXPORT_SYMBOL(xa_set_mark); /** * xa_clear_mark() - Clear this mark on this entry. * @xa: XArray. * @index: Index of entry. * @mark: Mark number. * * Clearing a mark always succeeds. * * Context: Process context. Takes and releases the xa_lock. */ void xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) { xa_lock(xa); __xa_clear_mark(xa, index, mark); xa_unlock(xa); } EXPORT_SYMBOL(xa_clear_mark); /** * xa_find() - Search the XArray for an entry. * @xa: XArray. * @indexp: Pointer to an index. * @max: Maximum index to search to. * @filter: Selection criterion. * * Finds the entry in @xa which matches the @filter, and has the lowest * index that is at least @indexp and no more than @max. * If an entry is found, @indexp is updated to be the index of the entry. * This function is protected by the RCU read lock, so it may not find * entries which are being simultaneously added. It will not return an * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find(). * * Context: Any context. Takes and releases the RCU lock. * Return: The entry, if found, otherwise %NULL. */ void *xa_find(struct xarray *xa, unsigned long *indexp, unsigned long max, xa_mark_t filter) { XA_STATE(xas, xa, *indexp); void *entry; rcu_read_lock(); do { if ((__force unsigned int)filter < XA_MAX_MARKS) entry = xas_find_marked(&xas, max, filter); else entry = xas_find(&xas, max); } while (xas_retry(&xas, entry)); rcu_read_unlock(); if (entry) *indexp = xas.xa_index; return entry; } EXPORT_SYMBOL(xa_find); static bool xas_sibling(struct xa_state *xas) { struct xa_node *node = xas->xa_node; unsigned long mask; if (!IS_ENABLED(CONFIG_XARRAY_MULTI) || !node) return false; mask = (XA_CHUNK_SIZE << node->shift) - 1; return (xas->xa_index & mask) > ((unsigned long)xas->xa_offset << node->shift); } /** * xa_find_after() - Search the XArray for a present entry. * @xa: XArray. * @indexp: Pointer to an index. * @max: Maximum index to search to. * @filter: Selection criterion. * * Finds the entry in @xa which matches the @filter and has the lowest * index that is above @indexp and no more than @max. * If an entry is found, @indexp is updated to be the index of the entry. * This function is protected by the RCU read lock, so it may miss entries * which are being simultaneously added. It will not return an * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find(). * * Context: Any context. Takes and releases the RCU lock. * Return: The pointer, if found, otherwise %NULL. 
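 *
 * Callers who simply want every present entry usually iterate with the
 * xa_for_each() helper rather than calling xa_find()/xa_find_after()
 * directly; an illustrative sketch ("my_xa" is a placeholder name):
 *
 *	unsigned long index;
 *	void *entry;
 *
 *	xa_for_each(&my_xa, index, entry)
 *		pr_debug("index %lu -> %p\n", index, entry);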
*/ void *xa_find_after(struct xarray *xa, unsigned long *indexp, unsigned long max, xa_mark_t filter) { XA_STATE(xas, xa, *indexp + 1); void *entry; if (xas.xa_index == 0) return NULL; rcu_read_lock(); for (;;) { if ((__force unsigned int)filter < XA_MAX_MARKS) entry = xas_find_marked(&xas, max, filter); else entry = xas_find(&xas, max); if (xas_invalid(&xas)) break; if (xas_sibling(&xas)) continue; if (!xas_retry(&xas, entry)) break; } rcu_read_unlock(); if (entry) *indexp = xas.xa_index; return entry; } EXPORT_SYMBOL(xa_find_after); static unsigned int xas_extract_present(struct xa_state *xas, void **dst, unsigned long max, unsigned int n) { void *entry; unsigned int i = 0; rcu_read_lock(); xas_for_each(xas, entry, max) { if (xas_retry(xas, entry)) continue; dst[i++] = entry; if (i == n) break; } rcu_read_unlock(); return i; } static unsigned int xas_extract_marked(struct xa_state *xas, void **dst, unsigned long max, unsigned int n, xa_mark_t mark) { void *entry; unsigned int i = 0; rcu_read_lock(); xas_for_each_marked(xas, entry, max, mark) { if (xas_retry(xas, entry)) continue; dst[i++] = entry; if (i == n) break; } rcu_read_unlock(); return i; } /** * xa_extract() - Copy selected entries from the XArray into a normal array. * @xa: The source XArray to copy from. * @dst: The buffer to copy entries into. * @start: The first index in the XArray eligible to be selected. * @max: The last index in the XArray eligible to be selected. * @n: The maximum number of entries to copy. * @filter: Selection criterion. * * Copies up to @n entries that match @filter from the XArray. The * copied entries will have indices between @start and @max, inclusive. * * The @filter may be an XArray mark value, in which case entries which are * marked with that mark will be copied. It may also be %XA_PRESENT, in * which case all entries which are not %NULL will be copied. * * The entries returned may not represent a snapshot of the XArray at a * moment in time. For example, if another thread stores to index 5, then * index 10, calling xa_extract() may return the old contents of index 5 * and the new contents of index 10. Indices not modified while this * function is running will not be skipped. * * If you need stronger guarantees, holding the xa_lock across calls to this * function will prevent concurrent modification. * * Context: Any context. Takes and releases the RCU lock. * Return: The number of entries copied. */ unsigned int xa_extract(struct xarray *xa, void **dst, unsigned long start, unsigned long max, unsigned int n, xa_mark_t filter) { XA_STATE(xas, xa, start); if (!n) return 0; if ((__force unsigned int)filter < XA_MAX_MARKS) return xas_extract_marked(&xas, dst, max, n, filter); return xas_extract_present(&xas, dst, max, n); } EXPORT_SYMBOL(xa_extract); /** * xa_delete_node() - Private interface for workingset code. * @node: Node to be removed from the tree. * @update: Function to call to update ancestor nodes. * * Context: xa_lock must be held on entry and will not be released. */ void xa_delete_node(struct xa_node *node, xa_update_node_t update) { struct xa_state xas = { .xa = node->array, .xa_index = (unsigned long)node->offset << (node->shift + XA_CHUNK_SHIFT), .xa_shift = node->shift + XA_CHUNK_SHIFT, .xa_offset = node->offset, .xa_node = xa_parent_locked(node->array, node), .xa_update = update, }; xas_store(&xas, NULL); } EXPORT_SYMBOL_GPL(xa_delete_node); /* For the benefit of the test suite */ /** * xa_destroy() - Free all internal data structures. * @xa: XArray. 
* * After calling this function, the XArray is empty and has freed all memory * allocated for its internal data structures. You are responsible for * freeing the objects referenced by the XArray. * * Context: Any context. Takes and releases the xa_lock, interrupt-safe. */ void xa_destroy(struct xarray *xa) { XA_STATE(xas, xa, 0); unsigned long flags; void *entry; xas.xa_node = NULL; xas_lock_irqsave(&xas, flags); entry = xa_head_locked(xa); RCU_INIT_POINTER(xa->xa_head, NULL); xas_init_marks(&xas); if (xa_zero_busy(xa)) xa_mark_clear(xa, XA_FREE_MARK); /* lockdep checks we're still holding the lock in xas_free_nodes() */ if (xa_is_node(entry)) xas_free_nodes(&xas, xa_to_node(entry)); xas_unlock_irqrestore(&xas, flags); } EXPORT_SYMBOL(xa_destroy); #ifdef XA_DEBUG void xa_dump_node(const struct xa_node *node) { unsigned i, j; if (!node) return; if ((unsigned long)node & 3) { pr_cont("node %px\n", node); return; } pr_cont("node %px %s %d parent %px shift %d count %d values %d " "array %px list %px %px marks", node, node->parent ? "offset" : "max", node->offset, node->parent, node->shift, node->count, node->nr_values, node->array, node->private_list.prev, node->private_list.next); for (i = 0; i < XA_MAX_MARKS; i++) for (j = 0; j < XA_MARK_LONGS; j++) pr_cont(" %lx", node->marks[i][j]); pr_cont("\n"); } void xa_dump_index(unsigned long index, unsigned int shift) { if (!shift) pr_info("%lu: ", index); else if (shift >= BITS_PER_LONG) pr_info("0-%lu: ", ~0UL); else pr_info("%lu-%lu: ", index, index | ((1UL << shift) - 1)); } void xa_dump_entry(const void *entry, unsigned long index, unsigned long shift) { if (!entry) return; xa_dump_index(index, shift); if (xa_is_node(entry)) { if (shift == 0) { pr_cont("%px\n", entry); } else { unsigned long i; struct xa_node *node = xa_to_node(entry); xa_dump_node(node); for (i = 0; i < XA_CHUNK_SIZE; i++) xa_dump_entry(node->slots[i], index + (i << node->shift), node->shift); } } else if (xa_is_value(entry)) pr_cont("value %ld (0x%lx) [%px]\n", xa_to_value(entry), xa_to_value(entry), entry); else if (!xa_is_internal(entry)) pr_cont("%px\n", entry); else if (xa_is_retry(entry)) pr_cont("retry (%ld)\n", xa_to_internal(entry)); else if (xa_is_sibling(entry)) pr_cont("sibling (slot %ld)\n", xa_to_sibling(entry)); else if (xa_is_zero(entry)) pr_cont("zero (%ld)\n", xa_to_internal(entry)); else pr_cont("UNKNOWN ENTRY (%px)\n", entry); } void xa_dump(const struct xarray *xa) { void *entry = xa->xa_head; unsigned int shift = 0; pr_info("xarray: %px head %px flags %x marks %d %d %d\n", xa, entry, xa->xa_flags, xa_marked(xa, XA_MARK_0), xa_marked(xa, XA_MARK_1), xa_marked(xa, XA_MARK_2)); if (xa_is_node(entry)) shift = xa_to_node(entry)->shift + XA_CHUNK_SHIFT; xa_dump_entry(entry, 0, shift); } #endif |
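/*
 * Illustrative sketch, not part of lib/xarray.c: how a caller might use
 * xa_find()/xa_find_after() and xa_extract() as documented above.  The
 * function name "my_tags_demo" and the "my_tags" array are hypothetical;
 * only the XArray API calls themselves come from the documentation in this
 * file.  The block is wrapped in #if 0 so it is never compiled.
 */
#if 0	/* example only */
#include <linux/xarray.h>

static DEFINE_XARRAY(my_tags);

static void my_tags_demo(void)
{
	void *batch[16];
	unsigned long index = 0;
	unsigned int copied;
	void *entry;

	/* Store a few entries and mark one of them. */
	xa_store(&my_tags, 1, xa_mk_value(101), GFP_KERNEL);
	xa_store(&my_tags, 5, xa_mk_value(105), GFP_KERNEL);
	xa_set_mark(&my_tags, 5, XA_MARK_0);

	/* Walk all present entries: xa_find() once, then xa_find_after(). */
	entry = xa_find(&my_tags, &index, ULONG_MAX, XA_PRESENT);
	while (entry) {
		pr_info("index %lu is present\n", index);
		entry = xa_find_after(&my_tags, &index, ULONG_MAX, XA_PRESENT);
	}

	/* Copy up to 16 marked entries into a plain array. */
	copied = xa_extract(&my_tags, batch, 0, ULONG_MAX, 16, XA_MARK_0);
	pr_info("%u marked entries copied\n", copied);

	xa_destroy(&my_tags);
}
#endif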
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PERCPU_COUNTER_H
#define _LINUX_PERCPU_COUNTER_H
/*
 * A simple "approximate counter" for use in ext2 and ext3 superblocks.
 *
 * WARNING: these things are HUGE. 4 kbytes per counter on 32-way P4.
 */

#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/list.h>
#include <linux/threads.h>
#include <linux/percpu.h>
#include <linux/types.h>

/* percpu_counter batch for local add or sub */
#define PERCPU_COUNTER_LOCAL_BATCH	INT_MAX

#ifdef CONFIG_SMP

struct percpu_counter {
	raw_spinlock_t lock;
	s64 count;
#ifdef CONFIG_HOTPLUG_CPU
	struct list_head list;	/* All percpu_counters are on a list */
#endif
	s32 __percpu *counters;
};

extern int percpu_counter_batch;

int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
			       gfp_t gfp, u32 nr_counters,
			       struct lock_class_key *key);

#define percpu_counter_init_many(fbc, value, gfp, nr_counters)		\
	({								\
		static struct lock_class_key __key;			\
									\
		__percpu_counter_init_many(fbc, value, gfp, nr_counters,\
					   &__key);			\
	})

#define percpu_counter_init(fbc, value, gfp)				\
	percpu_counter_init_many(fbc, value, gfp, 1)

void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters);
static inline void percpu_counter_destroy(struct percpu_counter *fbc)
{
	percpu_counter_destroy_many(fbc, 1);
}

void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
			      s32 batch);
s64 __percpu_counter_sum(struct percpu_counter *fbc);
int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch);
bool __percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit,
				  s64 amount, s32 batch);
void percpu_counter_sync(struct percpu_counter *fbc);

static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
{
	return __percpu_counter_compare(fbc, rhs, percpu_counter_batch);
}

static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
{
	percpu_counter_add_batch(fbc, amount, percpu_counter_batch);
}

static inline bool
percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount)
{
	return __percpu_counter_limited_add(fbc, limit, amount,
					    percpu_counter_batch);
}

/*
 * With percpu_counter_add_local() and percpu_counter_sub_local(), counts
 * are accumulated in local per cpu counter and not in fbc->count until
 * local count overflows PERCPU_COUNTER_LOCAL_BATCH. This makes counter
 * write efficient.
 * But percpu_counter_sum(), instead of percpu_counter_read(), needs to be
 * used to add up the counts from each CPU to account for all the local
 * counts. So percpu_counter_add_local() and percpu_counter_sub_local()
 * should be used when a counter is updated frequently and read rarely.
 */
static inline void
percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
{
	percpu_counter_add_batch(fbc, amount, PERCPU_COUNTER_LOCAL_BATCH);
}

static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
{
	s64 ret = __percpu_counter_sum(fbc);
	return ret < 0 ? 0 : ret;
}

static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
{
	return __percpu_counter_sum(fbc);
}

static inline s64 percpu_counter_read(struct percpu_counter *fbc)
{
	return fbc->count;
}

/*
 * It is possible for the percpu_counter_read() to return a small negative
 * number for some counter which should never be negative.
 *
 */
static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
{
	/* Prevent reloads of fbc->count */
	s64 ret = READ_ONCE(fbc->count);

	if (ret >= 0)
		return ret;
	return 0;
}

static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
{
	return (fbc->counters != NULL);
}

#else /* !CONFIG_SMP */

struct percpu_counter {
	s64 count;
};

static inline int percpu_counter_init_many(struct percpu_counter *fbc,
					   s64 amount, gfp_t gfp,
					   u32 nr_counters)
{
	u32 i;

	for (i = 0; i < nr_counters; i++)
		fbc[i].count = amount;

	return 0;
}

static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount,
				      gfp_t gfp)
{
	return percpu_counter_init_many(fbc, amount, gfp, 1);
}

static inline void percpu_counter_destroy_many(struct percpu_counter *fbc,
					       u32 nr_counters)
{
}

static inline void percpu_counter_destroy(struct percpu_counter *fbc)
{
}

static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
{
	fbc->count = amount;
}

static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
{
	if (fbc->count > rhs)
		return 1;
	else if (fbc->count < rhs)
		return -1;
	else
		return 0;
}

static inline int
__percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
{
	return percpu_counter_compare(fbc, rhs);
}

static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
{
	unsigned long flags;

	local_irq_save(flags);
	fbc->count += amount;
	local_irq_restore(flags);
}

static inline bool
percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount)
{
	unsigned long flags;
	bool good = false;
	s64 count;

	if (amount == 0)
		return true;

	local_irq_save(flags);
	count = fbc->count + amount;
	if ((amount > 0 && count <= limit) ||
	    (amount < 0 && count >= limit)) {
		fbc->count = count;
		good = true;
	}
	local_irq_restore(flags);
	return good;
}

/* non-SMP percpu_counter_add_local is the same with percpu_counter_add */
static inline void
percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
{
	percpu_counter_add(fbc, amount);
}

static inline void
percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
{
	percpu_counter_add(fbc, amount);
}

static inline s64 percpu_counter_read(struct percpu_counter *fbc)
{
	return fbc->count;
}

/*
 * percpu_counter is intended to track positive numbers. In the UP case the
 * number should never be negative.
 */
static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
{
	return fbc->count;
}

static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
{
	return percpu_counter_read_positive(fbc);
}

static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
{
	return percpu_counter_read(fbc);
}

static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
{
	return true;
}

static inline void percpu_counter_sync(struct percpu_counter *fbc)
{
}

#endif	/* CONFIG_SMP */

static inline void percpu_counter_inc(struct percpu_counter *fbc)
{
	percpu_counter_add(fbc, 1);
}

static inline void percpu_counter_dec(struct percpu_counter *fbc)
{
	percpu_counter_add(fbc, -1);
}

static inline void percpu_counter_sub(struct percpu_counter *fbc, s64 amount)
{
	percpu_counter_add(fbc, -amount);
}

static inline void
percpu_counter_sub_local(struct percpu_counter *fbc, s64 amount)
{
	percpu_counter_add_local(fbc, -amount);
}

#endif /* _LINUX_PERCPU_COUNTER_H */
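/*
 * Illustrative sketch, not part of this header: a typical caller pattern
 * under the semantics documented above -- cheap batched updates with
 * percpu_counter_add(), a locally-batched update with
 * percpu_counter_add_local(), an approximate read with
 * percpu_counter_read(), and an exact but more expensive read with
 * percpu_counter_sum().  The name "demo_counter" and the function below
 * are hypothetical; the block is wrapped in #if 0 so it is never compiled.
 */
#if 0	/* example only */
#include <linux/percpu_counter.h>

static struct percpu_counter demo_counter;

static int demo_counter_usage(void)
{
	int err = percpu_counter_init(&demo_counter, 0, GFP_KERNEL);

	if (err)
		return err;

	/* Fast path: batched through the local percpu counter. */
	percpu_counter_add(&demo_counter, 1);

	/*
	 * Locally-batched update: folded into fbc->count only once the
	 * local delta reaches PERCPU_COUNTER_LOCAL_BATCH, so a later
	 * percpu_counter_read() may not see it yet.
	 */
	percpu_counter_add_local(&demo_counter, 1);

	pr_info("approx=%lld exact=%lld\n",
		percpu_counter_read(&demo_counter),
		percpu_counter_sum(&demo_counter));

	percpu_counter_destroy(&demo_counter);
	return 0;
}
#endif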
11270 11271 11272 11273 11274 11275 11276 11277 11278 11279 11280 11281 11282 11283 11284 11285 11286 11287 11288 11289 11290 11291 11292 11293 11294 11295 11296 11297 11298 11299 11300 11301 11302 11303 11304 11305 11306 11307 11308 11309 11310 11311 11312 11313 11314 11315 11316 11317 11318 11319 11320 11321 11322 11323 11324 11325 11326 11327 11328 11329 11330 11331 11332 11333 11334 11335 11336 11337 11338 11339 11340 11341 11342 11343 11344 11345 11346 11347 11348 11349 11350 11351 11352 11353 11354 11355 11356 11357 11358 11359 11360 11361 11362 11363 11364 11365 11366 11367 11368 11369 11370 11371 11372 11373 11374 11375 11376 11377 11378 11379 11380 11381 11382 11383 11384 11385 11386 11387 11388 11389 11390 11391 11392 11393 11394 11395 11396 11397 11398 11399 11400 11401 11402 11403 11404 11405 11406 11407 11408 11409 11410 11411 11412 11413 11414 11415 11416 11417 11418 11419 11420 11421 11422 11423 11424 11425 11426 11427 11428 11429 11430 11431 11432 11433 11434 11435 11436 11437 11438 11439 11440 11441 11442 11443 11444 11445 11446 11447 11448 11449 11450 11451 11452 11453 11454 11455 11456 11457 11458 11459 11460 11461 11462 11463 11464 11465 11466 11467 11468 11469 11470 11471 11472 11473 11474 11475 11476 11477 11478 11479 11480 11481 11482 11483 11484 11485 11486 11487 11488 11489 11490 11491 11492 11493 11494 11495 11496 11497 11498 11499 11500 11501 11502 11503 11504 11505 11506 11507 11508 11509 11510 11511 11512 11513 11514 11515 11516 11517 11518 11519 11520 11521 11522 11523 11524 11525 11526 11527 11528 11529 11530 11531 11532 11533 11534 11535 11536 11537 11538 11539 11540 11541 11542 11543 11544 11545 11546 11547 11548 11549 11550 11551 11552 11553 11554 11555 11556 11557 11558 11559 11560 11561 11562 11563 11564 11565 11566 11567 11568 11569 11570 11571 11572 11573 11574 11575 11576 11577 11578 11579 11580 11581 11582 11583 11584 11585 11586 11587 11588 11589 11590 11591 11592 11593 11594 11595 11596 11597 11598 11599 11600 11601 11602 11603 11604 11605 11606 11607 11608 11609 11610 11611 11612 11613 11614 11615 11616 11617 11618 11619 11620 11621 11622 11623 11624 11625 11626 11627 11628 11629 11630 11631 11632 11633 11634 11635 11636 11637 11638 11639 11640 11641 11642 11643 11644 11645 11646 11647 11648 11649 11650 11651 11652 11653 11654 11655 11656 11657 11658 11659 11660 11661 11662 11663 11664 11665 11666 11667 11668 11669 11670 11671 11672 11673 11674 11675 11676 11677 11678 11679 11680 11681 11682 11683 11684 11685 11686 11687 11688 11689 11690 11691 11692 11693 11694 11695 11696 11697 11698 11699 11700 11701 11702 11703 11704 11705 11706 11707 11708 11709 11710 11711 11712 11713 11714 11715 11716 11717 11718 11719 11720 11721 11722 11723 11724 11725 11726 11727 11728 11729 11730 11731 11732 11733 11734 11735 11736 11737 11738 11739 11740 11741 11742 11743 11744 11745 11746 11747 11748 11749 11750 11751 11752 11753 11754 11755 11756 11757 11758 11759 11760 11761 11762 11763 11764 11765 11766 11767 11768 11769 11770 11771 11772 11773 11774 11775 11776 11777 11778 11779 11780 11781 11782 11783 11784 11785 11786 11787 11788 11789 11790 11791 11792 11793 11794 11795 11796 11797 11798 11799 11800 11801 11802 11803 11804 11805 11806 11807 11808 11809 11810 11811 11812 11813 11814 11815 11816 11817 11818 11819 11820 11821 11822 11823 11824 11825 11826 11827 11828 11829 11830 11831 11832 11833 11834 11835 11836 11837 11838 11839 11840 11841 11842 11843 11844 11845 11846 11847 11848 11849 11850 11851 11852 11853 11854 11855 11856 11857 11858 11859 11860 11861 
11862 11863 11864 11865 11866 11867 11868 11869 11870 11871 11872 11873 11874 11875 11876 11877 11878 11879 11880 11881 11882 11883 11884 11885 11886 11887 11888 11889 11890 11891 11892 11893 11894 11895 11896 11897 11898 11899 11900 11901 11902 11903 11904 11905 11906 11907 11908 11909 11910 11911 11912 11913 11914 11915 11916 11917 11918 11919 11920 11921 11922 11923 11924 11925 11926 11927 11928 11929 11930 11931 11932 11933 11934 11935 11936 11937 11938 11939 11940 11941 11942 11943 11944 11945 11946 11947 11948 11949 11950 11951 11952 11953 11954 11955 11956 11957 11958 11959 11960 11961 11962 11963 11964 11965 11966 11967 11968 11969 11970 11971 11972 11973 11974 11975 11976 11977 11978 11979 11980 11981 11982 11983 11984 11985 11986 11987 11988 11989 11990 11991 11992 11993 11994 11995 11996 11997 11998 11999 12000 12001 12002 12003 12004 12005 12006 12007 12008 12009 12010 12011 12012 12013 12014 12015 12016 12017 12018 12019 12020 12021 12022 12023 12024 12025 12026 12027 12028 12029 12030 12031 12032 12033 12034 12035 12036 12037 12038 12039 12040 12041 12042 12043 12044 12045 12046 12047 12048 12049 12050 12051 12052 12053 12054 12055 12056 12057 12058 12059 12060 12061 12062 12063 12064 12065 12066 12067 12068 12069 12070 12071 12072 12073 12074 12075 12076 12077 12078 12079 12080 12081 12082 12083 12084 12085 12086 12087 12088 12089 12090 12091 12092 12093 12094 12095 12096 12097 12098 12099 12100 12101 12102 12103 12104 12105 12106 12107 12108 12109 12110 12111 12112 12113 12114 12115 12116 12117 12118 12119 12120 12121 12122 12123 12124 12125 12126 12127 12128 12129 12130 12131 12132 12133 12134 12135 12136 12137 12138 12139 12140 12141 12142 12143 12144 12145 12146 12147 12148 12149 12150 12151 12152 12153 12154 12155 12156 12157 12158 12159 12160 12161 12162 12163 12164 12165 12166 12167 12168 12169 12170 12171 12172 12173 12174 12175 12176 12177 12178 12179 12180 12181 12182 12183 12184 12185 12186 12187 12188 12189 12190 12191 12192 12193 12194 12195 12196 12197 12198 12199 12200 12201 12202 12203 12204 12205 12206 12207 12208 12209 12210 12211 12212 12213 12214 12215 12216 12217 12218 12219 12220 12221 12222 12223 12224 12225 12226 12227 12228 12229 12230 12231 12232 12233 12234 12235 12236 12237 12238 12239 12240 12241 12242 12243 12244 12245 12246 12247 12248 12249 12250 12251 12252 12253 12254 12255 12256 12257 12258 12259 12260 12261 12262 12263 12264 12265 12266 12267 12268 12269 12270 12271 12272 12273 12274 12275 12276 12277 12278 12279 12280 12281 12282 12283 12284 12285 12286 12287 12288 12289 12290 12291 12292 12293 12294 12295 12296 12297 12298 12299 12300 12301 12302 12303 12304 12305 12306 12307 12308 12309 12310 12311 12312 12313 12314 12315 12316 12317 12318 12319 12320 12321 12322 12323 12324 12325 12326 12327 12328 12329 12330 12331 12332 12333 12334 12335 12336 12337 12338 12339 12340 12341 12342 12343 12344 12345 12346 12347 12348 12349 12350 12351 12352 12353 12354 12355 12356 12357 12358 12359 12360 12361 12362 12363 12364 12365 12366 12367 12368 12369 12370 12371 12372 12373 12374 12375 12376 12377 12378 12379 12380 12381 12382 12383 12384 12385 12386 12387 12388 12389 12390 12391 12392 12393 12394 12395 12396 12397 12398 12399 12400 12401 12402 12403 12404 12405 12406 12407 12408 12409 12410 12411 12412 12413 12414 12415 12416 12417 12418 12419 12420 12421 12422 12423 12424 12425 12426 12427 12428 12429 12430 12431 12432 12433 12434 12435 12436 12437 12438 12439 12440 12441 12442 12443 12444 12445 12446 12447 12448 12449 12450 12451 12452 12453 
12454 12455 12456 12457 12458 12459 12460 12461 12462 12463 12464 12465 12466 12467 12468 12469 12470 12471 12472 12473 12474 12475 12476 12477 12478 12479 12480 12481 12482 12483 12484 12485 12486 12487 12488 12489 12490 12491 12492 12493 12494 12495 12496 12497 12498 12499 12500 12501 12502 12503 12504 12505 12506 12507 12508 12509 12510 12511 12512 12513 12514 12515 12516 12517 12518 12519 12520 12521 12522 12523 12524 12525 12526 12527 12528 12529 12530 12531 12532 12533 12534 12535 12536 12537 12538 12539 12540 12541 12542 12543 12544 12545 12546 12547 12548 12549 12550 12551 12552 12553 12554 12555 12556 12557 12558 12559 12560 12561 12562 12563 12564 12565 12566 12567 12568 12569 12570 12571 12572 12573 12574 12575 12576 12577 12578 12579 12580 12581 12582 12583 12584 12585 12586 12587 12588 12589 12590 12591 12592 12593 12594 12595 12596 12597 12598 12599 12600 12601 12602 12603 12604 12605 12606 12607 12608 12609 12610 12611 12612 12613 12614 12615 12616 12617 12618 12619 12620 12621 12622 12623 12624 12625 12626 12627 12628 12629 12630 12631 12632 12633 12634 12635 12636 12637 12638 12639 12640 12641 12642 12643 12644 12645 12646 12647 12648 12649 12650 12651 12652 12653 12654 12655 12656 12657 12658 12659 12660 12661 12662 12663 12664 12665 12666 12667 12668 12669 12670 12671 12672 12673 12674 12675 12676 12677 12678 12679 12680 12681 12682 12683 12684 12685 12686 12687 12688 12689 12690 12691 12692 12693 12694 12695 12696 12697 12698 12699 12700 12701 12702 12703 12704 12705 12706 12707 12708 12709 12710 12711 12712 12713 12714 12715 12716 12717 12718 12719 12720 12721 12722 12723 12724 12725 12726 12727 12728 12729 12730 12731 12732 12733 12734 12735 12736 12737 12738 12739 12740 12741 12742 12743 12744 12745 12746 12747 12748 12749 12750 12751 12752 12753 12754 12755 12756 12757 12758 12759 12760 12761 12762 12763 12764 12765 12766 12767 12768 12769 12770 12771 12772 12773 12774 12775 12776 12777 12778 12779 12780 12781 12782 12783 12784 12785 12786 12787 12788 12789 12790 12791 12792 12793 12794 12795 12796 12797 12798 12799 12800 12801 12802 12803 12804 12805 12806 12807 12808 12809 12810 12811 12812 12813 12814 12815 12816 12817 12818 12819 12820 12821 12822 12823 12824 12825 12826 12827 12828 12829 12830 12831 12832 12833 12834 12835 12836 12837 12838 12839 12840 12841 12842 12843 12844 12845 12846 12847 12848 12849 12850 12851 12852 12853 12854 12855 12856 12857 12858 12859 12860 12861 12862 12863 12864 12865 12866 12867 12868 12869 12870 12871 12872 12873 12874 12875 12876 12877 12878 12879 12880 12881 12882 12883 12884 12885 12886 12887 12888 12889 12890 12891 12892 12893 12894 12895 12896 12897 12898 12899 12900 12901 12902 12903 12904 12905 12906 12907 12908 12909 12910 12911 12912 12913 12914 12915 12916 12917 12918 12919 12920 12921 12922 12923 12924 12925 12926 12927 12928 12929 12930 12931 12932 12933 12934 12935 12936 12937 12938 12939 12940 12941 12942 12943 12944 12945 12946 12947 12948 12949 12950 12951 12952 12953 12954 12955 12956 12957 12958 12959 12960 12961 12962 12963 12964 12965 12966 12967 12968 12969 12970 12971 12972 12973 12974 12975 12976 12977 12978 12979 12980 12981 12982 12983 12984 12985 12986 12987 12988 12989 12990 12991 12992 12993 12994 12995 12996 12997 12998 12999 13000 13001 13002 13003 13004 13005 13006 13007 13008 13009 13010 13011 13012 13013 13014 13015 13016 13017 13018 13019 13020 13021 13022 13023 13024 13025 13026 13027 13028 13029 13030 13031 13032 13033 13034 13035 13036 13037 13038 13039 13040 13041 13042 13043 13044 13045 
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	NET3	Protocol independent device support routines.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *		Authors:	Ross Biro
 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
* Rudi Cilibrasi : Pass the right thing to * set_mac_address() * Dave Miller : 32bit quantity for the device lock to * make it work out on a Sparc. * Bjorn Ekwall : Added KERNELD hack. * Alan Cox : Cleaned up the backlog initialise. * Craig Metz : SIOCGIFCONF fix if space for under * 1 device. * Thomas Bogendoerfer : Return ENODEV for dev_open, if there * is no device open function. * Andi Kleen : Fix error reporting for SIOCGIFCONF * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF * Cyrus Durgin : Cleaned for KMOD * Adam Sulmicki : Bug Fix : Network Device Unload * A network device unload needs to purge * the backlog queue. * Paul Rusty Russell : SIOCSIFNAME * Pekka Riikonen : Netdev boot-time settings code * Andrew Morton : Make unregister_netdevice wait * indefinitely on dev->refcnt * J Hadi Salim : - Backlog queue sampling * - netif_rx() feedback */ #include <linux/uaccess.h> #include <linux/bitmap.h> #include <linux/capability.h> #include <linux/cpu.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/hash.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/isolation.h> #include <linux/sched/mm.h> #include <linux/smpboot.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/interrupt.h> #include <linux/if_ether.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/ethtool_netlink.h> #include <linux/skbuff.h> #include <linux/kthread.h> #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/busy_poll.h> #include <linux/rtnetlink.h> #include <linux/stat.h> #include <net/dsa.h> #include <net/dst.h> #include <net/dst_metadata.h> #include <net/gro.h> #include <net/netdev_queues.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/checksum.h> #include <net/xfrm.h> #include <net/tcx.h> #include <linux/highmem.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netpoll.h> #include <linux/rcupdate.h> #include <linux/delay.h> #include <net/iw_handler.h> #include <asm/current.h> #include <linux/audit.h> #include <linux/dmaengine.h> #include <linux/err.h> #include <linux/ctype.h> #include <linux/if_arp.h> #include <linux/if_vlan.h> #include <linux/ip.h> #include <net/ip.h> #include <net/mpls.h> #include <linux/ipv6.h> #include <linux/in.h> #include <linux/jhash.h> #include <linux/random.h> #include <trace/events/napi.h> #include <trace/events/net.h> #include <trace/events/skb.h> #include <trace/events/qdisc.h> #include <trace/events/xdp.h> #include <linux/inetdevice.h> #include <linux/cpu_rmap.h> #include <linux/static_key.h> #include <linux/hashtable.h> #include <linux/vmalloc.h> #include <linux/if_macvlan.h> #include <linux/errqueue.h> #include <linux/hrtimer.h> #include <linux/netfilter_netdev.h> #include <linux/crash_dump.h> #include <linux/sctp.h> #include <net/udp_tunnel.h> #include <linux/net_namespace.h> #include <linux/indirect_call_wrapper.h> #include <net/devlink.h> #include <linux/pm_runtime.h> #include <linux/prandom.h> #include <linux/once_lite.h> #include <net/netdev_lock.h> #include <net/netdev_rx_queue.h> #include <net/page_pool/types.h> #include <net/page_pool/helpers.h> #include <net/page_pool/memory_provider.h> #include <net/rps.h> #include <linux/phy_link_topology.h> #include "dev.h" #include "devmem.h" #include "net-sysfs.h" static 
DEFINE_SPINLOCK(ptype_lock); struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_extack(unsigned long val, struct net_device *dev, struct netlink_ext_ack *extack); static DEFINE_MUTEX(ifalias_mutex); /* protects napi_hash addition/deletion and napi_gen_id */ static DEFINE_SPINLOCK(napi_hash_lock); static unsigned int napi_gen_id = NR_CPUS; static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); static inline void dev_base_seq_inc(struct net *net) { unsigned int val = net->dev_base_seq + 1; WRITE_ONCE(net->dev_base_seq, val ?: 1); } static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) { unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ)); return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)]; } static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) { return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; } #ifndef CONFIG_PREEMPT_RT static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key); static int __init setup_backlog_napi_threads(char *arg) { static_branch_enable(&use_backlog_threads_key); return 0; } early_param("thread_backlog_napi", setup_backlog_napi_threads); static bool use_backlog_threads(void) { return static_branch_unlikely(&use_backlog_threads_key); } #else static bool use_backlog_threads(void) { return true; } #endif static inline void backlog_lock_irq_save(struct softnet_data *sd, unsigned long *flags) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) { spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); } else { local_irq_save(*flags); if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_lock(&sd->input_pkt_queue.lock); } } static inline void backlog_lock_irq_disable(struct softnet_data *sd) { if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_lock_irq(&sd->input_pkt_queue.lock); else local_irq_disable(); } static inline void backlog_unlock_irq_restore(struct softnet_data *sd, unsigned long flags) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) { spin_unlock_irqrestore(&sd->input_pkt_queue.lock, flags); } else { if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_unlock(&sd->input_pkt_queue.lock); local_irq_restore(flags); } } static inline void backlog_unlock_irq_enable(struct softnet_data *sd) { if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_unlock_irq(&sd->input_pkt_queue.lock); else local_irq_enable(); } static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev, const char *name) { struct netdev_name_node *name_node; name_node = kmalloc_obj(*name_node); if (!name_node) return NULL; INIT_HLIST_NODE(&name_node->hlist); name_node->dev = dev; name_node->name = name; return name_node; } static struct netdev_name_node * netdev_name_node_head_alloc(struct net_device *dev) { struct netdev_name_node *name_node; name_node = netdev_name_node_alloc(dev, dev->name); if (!name_node) return NULL; INIT_LIST_HEAD(&name_node->list); return name_node; } static void netdev_name_node_free(struct netdev_name_node *name_node) { kfree(name_node); } static void netdev_name_node_add(struct net *net, struct netdev_name_node *name_node) { hlist_add_head_rcu(&name_node->hlist, dev_name_hash(net, name_node->name)); } static void netdev_name_node_del(struct netdev_name_node *name_node) { hlist_del_rcu(&name_node->hlist); } static struct netdev_name_node *netdev_name_node_lookup(struct net *net, const char *name) { struct hlist_head *head = dev_name_hash(net, name); struct netdev_name_node *name_node; 
hlist_for_each_entry(name_node, head, hlist) if (!strcmp(name_node->name, name)) return name_node; return NULL; } static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net, const char *name) { struct hlist_head *head = dev_name_hash(net, name); struct netdev_name_node *name_node; hlist_for_each_entry_rcu(name_node, head, hlist) if (!strcmp(name_node->name, name)) return name_node; return NULL; } bool netdev_name_in_use(struct net *net, const char *name) { return netdev_name_node_lookup(net, name); } EXPORT_SYMBOL(netdev_name_in_use); int netdev_name_node_alt_create(struct net_device *dev, const char *name) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); name_node = netdev_name_node_lookup(net, name); if (name_node) return -EEXIST; name_node = netdev_name_node_alloc(dev, name); if (!name_node) return -ENOMEM; netdev_name_node_add(net, name_node); /* The node that holds dev->name acts as a head of per-device list. */ list_add_tail_rcu(&name_node->list, &dev->name_node->list); return 0; } static void netdev_name_node_alt_free(struct rcu_head *head) { struct netdev_name_node *name_node = container_of(head, struct netdev_name_node, rcu); kfree(name_node->name); netdev_name_node_free(name_node); } static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) { netdev_name_node_del(name_node); list_del(&name_node->list); call_rcu(&name_node->rcu, netdev_name_node_alt_free); } int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); name_node = netdev_name_node_lookup(net, name); if (!name_node) return -ENOENT; /* lookup might have found our primary name or a name belonging * to another device. */ if (name_node == dev->name_node || name_node->dev != dev) return -EINVAL; __netdev_name_node_alt_destroy(name_node); return 0; } static void netdev_name_node_alt_flush(struct net_device *dev) { struct netdev_name_node *name_node, *tmp; list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) { list_del(&name_node->list); netdev_name_node_alt_free(&name_node->rcu); } } /* Device list insertion */ static void list_netdevice(struct net_device *dev) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); ASSERT_RTNL(); list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); netdev_name_node_add(net, dev->name_node); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); netdev_for_each_altname(dev, name_node) netdev_name_node_add(net, name_node); /* We reserved the ifindex, this can't fail */ WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL)); dev_base_seq_inc(net); } /* Device list removal * caller must respect a RCU grace period before freeing/reusing dev */ static void unlist_netdevice(struct net_device *dev) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); ASSERT_RTNL(); xa_erase(&net->dev_by_index, dev->ifindex); netdev_for_each_altname(dev, name_node) netdev_name_node_del(name_node); /* Unlink dev from the device chain */ list_del_rcu(&dev->dev_list); netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); dev_base_seq_inc(dev_net(dev)); } /* * Our notifier list */ static RAW_NOTIFIER_HEAD(netdev_chain); /* * Device drivers call our routines to queue packets here. We empty the * queue in the local softnet handler. 
*/ DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = { .process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock), }; EXPORT_PER_CPU_SYMBOL(softnet_data); /* Page_pool has a lockless array/stack to alloc/recycle pages. * PP consumers must pay attention to run APIs in the appropriate context * (e.g. NAPI context). */ DEFINE_PER_CPU(struct page_pool_bh, system_page_pool) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; #ifdef CONFIG_LOCKDEP /* * register_netdevice() inits txq->_xmit_lock and sets lockdep class * according to dev->type */ static const unsigned short netdev_lock_type[] = { ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, ARPHRD_CAN, ARPHRD_MCTP, ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, ARPHRD_RAWHDLC, ARPHRD_RAWIP, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_IEEE802154, ARPHRD_IEEE802154_MONITOR, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, ARPHRD_CAIF, ARPHRD_IP6GRE, ARPHRD_NETLINK, ARPHRD_6LOWPAN, ARPHRD_VSOCKMON, ARPHRD_VOID, ARPHRD_NONE}; static const char *const netdev_lock_name[] = { "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", "_xmit_CAN", "_xmit_MCTP", "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", "_xmit_RAWHDLC", "_xmit_RAWIP", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_IEEE802154", "_xmit_IEEE802154_MONITOR", "_xmit_PHONET", "_xmit_PHONET_PIPE", "_xmit_CAIF", "_xmit_IP6GRE", "_xmit_NETLINK", "_xmit_6LOWPAN", "_xmit_VSOCKMON", "_xmit_VOID", "_xmit_NONE"}; static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; static inline unsigned short netdev_lock_pos(unsigned short dev_type) { int i; for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) if (netdev_lock_type[i] == dev_type) return i; /* the last key is used by default */ WARN_ONCE(1, "netdev_lock_pos() could not find dev_type=%u\n", dev_type); return ARRAY_SIZE(netdev_lock_type) - 1; } static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, unsigned short dev_type) { int i; i = netdev_lock_pos(dev_type); lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], netdev_lock_name[i]); } static inline void netdev_set_addr_lockdep_class(struct net_device *dev) { int i; i = netdev_lock_pos(dev->type); 
lockdep_set_class_and_name(&dev->addr_list_lock, &netdev_addr_lock_key[i], netdev_lock_name[i]); } #else static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, unsigned short dev_type) { } static inline void netdev_set_addr_lockdep_class(struct net_device *dev) { } #endif /******************************************************************************* * * Protocol management and registration routines * *******************************************************************************/ /* * Add a protocol ID to the list. Now that the input handler is * smarter we can dispense with all the messy stuff that used to be * here. * * BEWARE!!! Protocol handlers, mangling input packets, * MUST BE last in hash buckets and checking protocol handlers * MUST start from promiscuous ptype_all chain in net_bh. * It is true now, do not change it. * Explanation follows: if protocol handler, mangling packet, will * be the first on list, it is not able to sense, that packet * is cloned and should be copied-on-write, so that it will * change it and subsequent readers will get broken packet. * --ANK (980803) */ static inline struct list_head *ptype_head(const struct packet_type *pt) { if (pt->type == htons(ETH_P_ALL)) { if (!pt->af_packet_net && !pt->dev) return NULL; return pt->dev ? &pt->dev->ptype_all : &pt->af_packet_net->ptype_all; } if (pt->dev) return &pt->dev->ptype_specific; return pt->af_packet_net ? &pt->af_packet_net->ptype_specific : &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; } /** * dev_add_pack - add packet handler * @pt: packet type declaration * * Add a protocol handler to the networking stack. The passed &packet_type * is linked into kernel lists and may not be freed until it has been * removed from the kernel lists. * * This call does not sleep therefore it can not * guarantee all CPU's that are in middle of receiving packets * will see the new packet type (until the next received packet). */ void dev_add_pack(struct packet_type *pt) { struct list_head *head = ptype_head(pt); if (WARN_ON_ONCE(!head)) return; spin_lock(&ptype_lock); list_add_rcu(&pt->list, head); spin_unlock(&ptype_lock); } EXPORT_SYMBOL(dev_add_pack); /** * __dev_remove_pack - remove packet handler * @pt: packet type declaration * * Remove a protocol handler that was previously added to the kernel * protocol handlers by dev_add_pack(). The passed &packet_type is removed * from the kernel lists and can be freed or reused once this function * returns. * * The packet type might still be in use by receivers * and must not be freed until after all the CPU's have gone * through a quiescent state. */ void __dev_remove_pack(struct packet_type *pt) { struct list_head *head = ptype_head(pt); struct packet_type *pt1; if (!head) return; spin_lock(&ptype_lock); list_for_each_entry(pt1, head, list) { if (pt == pt1) { list_del_rcu(&pt->list); goto out; } } pr_warn("dev_remove_pack: %p not found\n", pt); out: spin_unlock(&ptype_lock); } EXPORT_SYMBOL(__dev_remove_pack); /** * dev_remove_pack - remove packet handler * @pt: packet type declaration * * Remove a protocol handler that was previously added to the kernel * protocol handlers by dev_add_pack(). The passed &packet_type is removed * from the kernel lists and can be freed or reused once this function * returns. * * This call sleeps to guarantee that no CPU is looking at the packet * type after return. 
*/ void dev_remove_pack(struct packet_type *pt) { __dev_remove_pack(pt); synchronize_net(); } EXPORT_SYMBOL(dev_remove_pack); /******************************************************************************* * * Device Interface Subroutines * *******************************************************************************/ /** * dev_get_iflink - get 'iflink' value of a interface * @dev: targeted interface * * Indicates the ifindex the interface is linked to. * Physical interfaces have the same 'ifindex' and 'iflink' values. */ int dev_get_iflink(const struct net_device *dev) { if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink) return dev->netdev_ops->ndo_get_iflink(dev); return READ_ONCE(dev->ifindex); } EXPORT_SYMBOL(dev_get_iflink); /** * dev_fill_metadata_dst - Retrieve tunnel egress information. * @dev: targeted interface * @skb: The packet. * * For better visibility of tunnel traffic OVS needs to retrieve * egress tunnel information for a packet. Following API allows * user to get this info. */ int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct ip_tunnel_info *info; if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst) return -EINVAL; info = skb_tunnel_info_unclone(skb); if (!info) return -ENOMEM; if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX))) return -EINVAL; return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb); } EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack) { int k = stack->num_paths++; if (k >= NET_DEVICE_PATH_STACK_MAX) return NULL; return &stack->path[k]; } int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, struct net_device_path_stack *stack) { const struct net_device *last_dev; struct net_device_path_ctx ctx = { .dev = dev, }; struct net_device_path *path; int ret = 0; memcpy(ctx.daddr, daddr, sizeof(ctx.daddr)); stack->num_paths = 0; while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) { last_dev = ctx.dev; path = dev_fwd_path(stack); if (!path) return -1; memset(path, 0, sizeof(struct net_device_path)); ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path); if (ret < 0) return -1; if (WARN_ON_ONCE(last_dev == ctx.dev)) return -1; } if (!ctx.dev) return ret; path = dev_fwd_path(stack); if (!path) return -1; path->type = DEV_PATH_ETHERNET; path->dev = ctx.dev; return ret; } EXPORT_SYMBOL_GPL(dev_fill_forward_path); /* must be called under rcu_read_lock(), as we dont take a reference */ static struct napi_struct *napi_by_id(unsigned int napi_id) { unsigned int hash = napi_id % HASH_SIZE(napi_hash); struct napi_struct *napi; hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) if (napi->napi_id == napi_id) return napi; return NULL; } /* must be called under rcu_read_lock(), as we dont take a reference */ static struct napi_struct * netdev_napi_by_id(struct net *net, unsigned int napi_id) { struct napi_struct *napi; napi = napi_by_id(napi_id); if (!napi) return NULL; if (WARN_ON_ONCE(!napi->dev)) return NULL; if (!net_eq(net, dev_net(napi->dev))) return NULL; return napi; } /** * netdev_napi_by_id_lock() - find a device by NAPI ID and lock it * @net: the applicable net namespace * @napi_id: ID of a NAPI of a target device * * Find a NAPI instance with @napi_id. Lock its device. * The device must be in %NETREG_REGISTERED state for lookup to succeed. * netdev_unlock() must be called to release it. * * Return: pointer to NAPI, its device with lock held, NULL if not found. 
*/ struct napi_struct * netdev_napi_by_id_lock(struct net *net, unsigned int napi_id) { struct napi_struct *napi; struct net_device *dev; rcu_read_lock(); napi = netdev_napi_by_id(net, napi_id); if (!napi || READ_ONCE(napi->dev->reg_state) != NETREG_REGISTERED) { rcu_read_unlock(); return NULL; } dev = napi->dev; dev_hold(dev); rcu_read_unlock(); dev = __netdev_put_lock(dev, net); if (!dev) return NULL; rcu_read_lock(); napi = netdev_napi_by_id(net, napi_id); if (napi && napi->dev != dev) napi = NULL; rcu_read_unlock(); if (!napi) netdev_unlock(dev); return napi; } /** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace * @name: name to find * * Find an interface by name. Must be called under RTNL semaphore. * If the name is found a pointer to the device is returned. * If the name is not found then %NULL is returned. The * reference counters are not incremented so the caller must be * careful with locks. */ struct net_device *__dev_get_by_name(struct net *net, const char *name) { struct netdev_name_node *node_name; node_name = netdev_name_node_lookup(net, name); return node_name ? node_name->dev : NULL; } EXPORT_SYMBOL(__dev_get_by_name); /** * dev_get_by_name_rcu - find a device by its name * @net: the applicable net namespace * @name: name to find * * Find an interface by name. * If the name is found a pointer to the device is returned. * If the name is not found then %NULL is returned. * The reference counters are not incremented so the caller must be * careful with locks. The caller must hold RCU lock. */ struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) { struct netdev_name_node *node_name; node_name = netdev_name_node_lookup_rcu(net, name); return node_name ? node_name->dev : NULL; } EXPORT_SYMBOL(dev_get_by_name_rcu); /* Deprecated for new users, call netdev_get_by_name() instead */ struct net_device *dev_get_by_name(struct net *net, const char *name) { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_name_rcu(net, name); dev_hold(dev); rcu_read_unlock(); return dev; } EXPORT_SYMBOL(dev_get_by_name); /** * netdev_get_by_name() - find a device by its name * @net: the applicable net namespace * @name: name to find * @tracker: tracking object for the acquired reference * @gfp: allocation flags for the tracker * * Find an interface by name. This can be called from any * context and does its own locking. The returned handle has * the usage count incremented and the caller must use netdev_put() to * release it when it is no longer needed. %NULL is returned if no * matching device is found. */ struct net_device *netdev_get_by_name(struct net *net, const char *name, netdevice_tracker *tracker, gfp_t gfp) { struct net_device *dev; dev = dev_get_by_name(net, name); if (dev) netdev_tracker_alloc(dev, tracker, gfp); return dev; } EXPORT_SYMBOL(netdev_get_by_name); /** * __dev_get_by_index - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * * Search for an interface by index. Returns %NULL if the device * is not found or a pointer to the device. The device has not * had its reference counter increased so the caller must be careful * about locking. The caller must hold the RTNL semaphore. 
*/ struct net_device *__dev_get_by_index(struct net *net, int ifindex) { struct net_device *dev; struct hlist_head *head = dev_index_hash(net, ifindex); hlist_for_each_entry(dev, head, index_hlist) if (dev->ifindex == ifindex) return dev; return NULL; } EXPORT_SYMBOL(__dev_get_by_index); /** * dev_get_by_index_rcu - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * * Search for an interface by index. Returns %NULL if the device * is not found or a pointer to the device. The device has not * had its reference counter increased so the caller must be careful * about locking. The caller must hold RCU lock. */ struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) { struct net_device *dev; struct hlist_head *head = dev_index_hash(net, ifindex); hlist_for_each_entry_rcu(dev, head, index_hlist) if (dev->ifindex == ifindex) return dev; return NULL; } EXPORT_SYMBOL(dev_get_by_index_rcu); /* Deprecated for new users, call netdev_get_by_index() instead */ struct net_device *dev_get_by_index(struct net *net, int ifindex) { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); dev_hold(dev); rcu_read_unlock(); return dev; } EXPORT_SYMBOL(dev_get_by_index); /** * netdev_get_by_index() - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * @tracker: tracking object for the acquired reference * @gfp: allocation flags for the tracker * * Search for an interface by index. Returns NULL if the device * is not found or a pointer to the device. The device returned has * had a reference added and the pointer is safe until the user calls * netdev_put() to indicate they have finished with it. */ struct net_device *netdev_get_by_index(struct net *net, int ifindex, netdevice_tracker *tracker, gfp_t gfp) { struct net_device *dev; dev = dev_get_by_index(net, ifindex); if (dev) netdev_tracker_alloc(dev, tracker, gfp); return dev; } EXPORT_SYMBOL(netdev_get_by_index); /** * dev_get_by_napi_id - find a device by napi_id * @napi_id: ID of the NAPI struct * * Search for an interface by NAPI ID. Returns %NULL if the device * is not found or a pointer to the device. The device has not had * its reference counter increased so the caller must be careful * about locking. The caller must hold RCU lock. */ struct net_device *dev_get_by_napi_id(unsigned int napi_id) { struct napi_struct *napi; WARN_ON_ONCE(!rcu_read_lock_held()); if (!napi_id_valid(napi_id)) return NULL; napi = napi_by_id(napi_id); return napi ? napi->dev : NULL; } /* Release the held reference on the net_device, and if the net_device * is still registered try to lock the instance lock. If device is being * unregistered NULL will be returned (but the reference has been released, * either way!) * * This helper is intended for locking net_device after it has been looked up * using a lockless lookup helper. Lock prevents the instance from going away. 
*/ struct net_device * netdev_put_lock(struct net_device *dev, struct net *net, netdevice_tracker *tracker) { netdev_lock(dev); if (dev->reg_state > NETREG_REGISTERED || dev->moving_ns || !net_eq(dev_net(dev), net)) { netdev_unlock(dev); netdev_put(dev, tracker); return NULL; } netdev_put(dev, tracker); return dev; } static struct net_device * __netdev_put_lock_ops_compat(struct net_device *dev, struct net *net) { netdev_lock_ops_compat(dev); if (dev->reg_state > NETREG_REGISTERED || dev->moving_ns || !net_eq(dev_net(dev), net)) { netdev_unlock_ops_compat(dev); dev_put(dev); return NULL; } dev_put(dev); return dev; } /** * netdev_get_by_index_lock() - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * * Search for an interface by index. If a valid device * with @ifindex is found it will be returned with netdev->lock held. * netdev_unlock() must be called to release it. * * Return: pointer to a device with lock held, NULL if not found. */ struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex) { struct net_device *dev; dev = dev_get_by_index(net, ifindex); if (!dev) return NULL; return __netdev_put_lock(dev, net); } struct net_device * netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex) { struct net_device *dev; dev = dev_get_by_index(net, ifindex); if (!dev) return NULL; return __netdev_put_lock_ops_compat(dev, net); } struct net_device * netdev_xa_find_lock(struct net *net, struct net_device *dev, unsigned long *index) { if (dev) netdev_unlock(dev); do { rcu_read_lock(); dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT); if (!dev) { rcu_read_unlock(); return NULL; } dev_hold(dev); rcu_read_unlock(); dev = __netdev_put_lock(dev, net); if (dev) return dev; (*index)++; } while (true); } struct net_device * netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev, unsigned long *index) { if (dev) netdev_unlock_ops_compat(dev); do { rcu_read_lock(); dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT); if (!dev) { rcu_read_unlock(); return NULL; } dev_hold(dev); rcu_read_unlock(); dev = __netdev_put_lock_ops_compat(dev, net); if (dev) return dev; (*index)++; } while (true); } static DEFINE_SEQLOCK(netdev_rename_lock); void netdev_copy_name(struct net_device *dev, char *name) { unsigned int seq; do { seq = read_seqbegin(&netdev_rename_lock); strscpy(name, dev->name, IFNAMSIZ); } while (read_seqretry(&netdev_rename_lock, seq)); } /** * netdev_get_name - get a netdevice name, knowing its ifindex. * @net: network namespace * @name: a pointer to the buffer where the name will be stored. * @ifindex: the ifindex of the interface to get the name from. */ int netdev_get_name(struct net *net, char *name, int ifindex) { struct net_device *dev; int ret; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (!dev) { ret = -ENODEV; goto out; } netdev_copy_name(dev, name); ret = 0; out: rcu_read_unlock(); return ret; } static bool dev_addr_cmp(struct net_device *dev, unsigned short type, const char *ha) { return dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len); } /** * dev_getbyhwaddr_rcu - find a device by its hardware address * @net: the applicable net namespace * @type: media type of device * @ha: hardware address * * Search for an interface by MAC address. Returns NULL if the device * is not found or a pointer to the device. * The caller must hold RCU. 
* The returned device has not had its ref count increased * and the caller must therefore be careful about locking * */ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, const char *ha) { struct net_device *dev; for_each_netdev_rcu(net, dev) if (dev_addr_cmp(dev, type, ha)) return dev; return NULL; } EXPORT_SYMBOL(dev_getbyhwaddr_rcu); /** * dev_getbyhwaddr() - find a device by its hardware address * @net: the applicable net namespace * @type: media type of device * @ha: hardware address * * Similar to dev_getbyhwaddr_rcu(), but the owner needs to hold * rtnl_lock. * * Context: rtnl_lock() must be held. * Return: pointer to the net_device, or NULL if not found */ struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, const char *ha) { struct net_device *dev; ASSERT_RTNL(); for_each_netdev(net, dev) if (dev_addr_cmp(dev, type, ha)) return dev; return NULL; } EXPORT_SYMBOL(dev_getbyhwaddr); struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { struct net_device *dev, *ret = NULL; rcu_read_lock(); for_each_netdev_rcu(net, dev) if (dev->type == type) { dev_hold(dev); ret = dev; break; } rcu_read_unlock(); return ret; } EXPORT_SYMBOL(dev_getfirstbyhwtype); /** * netdev_get_by_flags_rcu - find any device with given flags * @net: the applicable net namespace * @tracker: tracking object for the acquired reference * @if_flags: IFF_* values * @mask: bitmask of bits in if_flags to check * * Search for any interface with the given flags. * * Context: rcu_read_lock() must be held. * Returns: NULL if a device is not found or a pointer to the device. */ struct net_device *netdev_get_by_flags_rcu(struct net *net, netdevice_tracker *tracker, unsigned short if_flags, unsigned short mask) { struct net_device *dev; for_each_netdev_rcu(net, dev) { if (((READ_ONCE(dev->flags) ^ if_flags) & mask) == 0) { netdev_hold(dev, tracker, GFP_ATOMIC); return dev; } } return NULL; } /** * dev_valid_name - check if name is okay for network device * @name: name string * * Network device names need to be valid file names to * allow sysfs to work. We also disallow any kind of * whitespace. */ bool dev_valid_name(const char *name) { if (*name == '\0') return false; if (strnlen(name, IFNAMSIZ) == IFNAMSIZ) return false; if (!strcmp(name, ".") || !strcmp(name, "..")) return false; while (*name) { if (*name == '/' || *name == ':' || isspace(*name)) return false; name++; } return true; } EXPORT_SYMBOL(dev_valid_name); /** * __dev_alloc_name - allocate a name for a device * @net: network namespace to allocate the device name in * @name: name format string * @res: result name string * * Passed a format string - eg "lt%d" it will try and find a suitable * id. It scans list of devices to build up a free map, then chooses * the first empty slot. The caller must hold the dev_base or rtnl lock * while allocating the name and adding the device in order to avoid * duplicates. * Limited to bits_per_byte * page size devices (ie 32K on most platforms). * Returns the number of the unit assigned or a negative errno code. */ static int __dev_alloc_name(struct net *net, const char *name, char *res) { int i = 0; const char *p; const int max_netdevices = 8*PAGE_SIZE; unsigned long *inuse; struct net_device *d; char buf[IFNAMSIZ]; /* Verify the string as this thing may have come from the user. * There must be one "%d" and no other "%" characters. 
*/ p = strchr(name, '%'); if (!p || p[1] != 'd' || strchr(p + 2, '%')) return -EINVAL; /* Use one page as a bit array of possible slots */ inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC); if (!inuse) return -ENOMEM; for_each_netdev(net, d) { struct netdev_name_node *name_node; netdev_for_each_altname(d, name_node) { if (!sscanf(name_node->name, name, &i)) continue; if (i < 0 || i >= max_netdevices) continue; /* avoid cases where sscanf is not exact inverse of printf */ snprintf(buf, IFNAMSIZ, name, i); if (!strncmp(buf, name_node->name, IFNAMSIZ)) __set_bit(i, inuse); } if (!sscanf(d->name, name, &i)) continue; if (i < 0 || i >= max_netdevices) continue; /* avoid cases where sscanf is not exact inverse of printf */ snprintf(buf, IFNAMSIZ, name, i); if (!strncmp(buf, d->name, IFNAMSIZ)) __set_bit(i, inuse); } i = find_first_zero_bit(inuse, max_netdevices); bitmap_free(inuse); if (i == max_netdevices) return -ENFILE; /* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */ strscpy(buf, name, IFNAMSIZ); snprintf(res, IFNAMSIZ, buf, i); return i; } /* Returns negative errno or allocated unit id (see __dev_alloc_name()) */ static int dev_prep_valid_name(struct net *net, struct net_device *dev, const char *want_name, char *out_name, int dup_errno) { if (!dev_valid_name(want_name)) return -EINVAL; if (strchr(want_name, '%')) return __dev_alloc_name(net, want_name, out_name); if (netdev_name_in_use(net, want_name)) return -dup_errno; if (out_name != want_name) strscpy(out_name, want_name, IFNAMSIZ); return 0; } /** * dev_alloc_name - allocate a name for a device * @dev: device * @name: name format string * * Passed a format string - eg "lt%d" it will try and find a suitable * id. It scans list of devices to build up a free map, then chooses * the first empty slot. The caller must hold the dev_base or rtnl lock * while allocating the name and adding the device in order to avoid * duplicates. * Limited to bits_per_byte * page size devices (ie 32K on most platforms). * Returns the number of the unit assigned or a negative errno code. */ int dev_alloc_name(struct net_device *dev, const char *name) { return dev_prep_valid_name(dev_net(dev), dev, name, dev->name, ENFILE); } EXPORT_SYMBOL(dev_alloc_name); static int dev_get_valid_name(struct net *net, struct net_device *dev, const char *name) { int ret; ret = dev_prep_valid_name(net, dev, name, dev->name, EEXIST); return ret < 0 ? ret : 0; } int netif_change_name(struct net_device *dev, const char *newname) { struct net *net = dev_net(dev); unsigned char old_assign_type; char oldname[IFNAMSIZ]; int err = 0; int ret; ASSERT_RTNL_NET(net); if (!strncmp(newname, dev->name, IFNAMSIZ)) return 0; memcpy(oldname, dev->name, IFNAMSIZ); write_seqlock_bh(&netdev_rename_lock); err = dev_get_valid_name(net, dev, newname); write_sequnlock_bh(&netdev_rename_lock); if (err < 0) return err; if (oldname[0] && !strchr(oldname, '%')) netdev_info(dev, "renamed from %s%s\n", oldname, dev->flags & IFF_UP ? 
" (while UP)" : ""); old_assign_type = dev->name_assign_type; WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED); rollback: ret = device_rename(&dev->dev, dev->name); if (ret) { write_seqlock_bh(&netdev_rename_lock); memcpy(dev->name, oldname, IFNAMSIZ); write_sequnlock_bh(&netdev_rename_lock); WRITE_ONCE(dev->name_assign_type, old_assign_type); return ret; } netdev_adjacent_rename_links(dev, oldname); netdev_name_node_del(dev->name_node); synchronize_net(); netdev_name_node_add(net, dev->name_node); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); ret = notifier_to_errno(ret); if (ret) { /* err >= 0 after dev_alloc_name() or stores the first errno */ if (err >= 0) { err = ret; write_seqlock_bh(&netdev_rename_lock); memcpy(dev->name, oldname, IFNAMSIZ); write_sequnlock_bh(&netdev_rename_lock); memcpy(oldname, newname, IFNAMSIZ); WRITE_ONCE(dev->name_assign_type, old_assign_type); old_assign_type = NET_NAME_RENAMED; goto rollback; } else { netdev_err(dev, "name change rollback failed: %d\n", ret); } } return err; } int netif_set_alias(struct net_device *dev, const char *alias, size_t len) { struct dev_ifalias *new_alias = NULL; if (len >= IFALIASZ) return -EINVAL; if (len) { new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL); if (!new_alias) return -ENOMEM; memcpy(new_alias->ifalias, alias, len); new_alias->ifalias[len] = 0; } mutex_lock(&ifalias_mutex); new_alias = rcu_replace_pointer(dev->ifalias, new_alias, mutex_is_locked(&ifalias_mutex)); mutex_unlock(&ifalias_mutex); if (new_alias) kfree_rcu(new_alias, rcuhead); return len; } /** * dev_get_alias - get ifalias of a device * @dev: device * @name: buffer to store name of ifalias * @len: size of buffer * * get ifalias for a device. Caller must make sure dev cannot go * away, e.g. rcu read lock or own a reference count to device. */ int dev_get_alias(const struct net_device *dev, char *name, size_t len) { const struct dev_ifalias *alias; int ret = 0; rcu_read_lock(); alias = rcu_dereference(dev->ifalias); if (alias) ret = snprintf(name, len, "%s", alias->ifalias); rcu_read_unlock(); return ret; } /** * netdev_features_change - device changes features * @dev: device to cause notification * * Called to indicate a device has changed features. */ void netdev_features_change(struct net_device *dev) { call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev); } EXPORT_SYMBOL(netdev_features_change); void netif_state_change(struct net_device *dev) { netdev_ops_assert_locked_or_invisible(dev); if (dev->flags & IFF_UP) { struct netdev_notifier_change_info change_info = { .info.dev = dev, }; call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL); } } /** * __netdev_notify_peers - notify network peers about existence of @dev, * to be called when rtnl lock is already held. * @dev: network device * * Generate traffic such that interested network peers are aware of * @dev, such as by generating a gratuitous ARP. This may be used when * a device wants to inform the rest of the network about some sort of * reconfiguration such as a failover event or virtual machine * migration. 
*/ void __netdev_notify_peers(struct net_device *dev) { ASSERT_RTNL(); call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); } EXPORT_SYMBOL(__netdev_notify_peers); /** * netdev_notify_peers - notify network peers about existence of @dev * @dev: network device * * Generate traffic such that interested network peers are aware of * @dev, such as by generating a gratuitous ARP. This may be used when * a device wants to inform the rest of the network about some sort of * reconfiguration such as a failover event or virtual machine * migration. */ void netdev_notify_peers(struct net_device *dev) { rtnl_lock(); __netdev_notify_peers(dev); rtnl_unlock(); } EXPORT_SYMBOL(netdev_notify_peers); static int napi_threaded_poll(void *data); static int napi_kthread_create(struct napi_struct *n) { int err = 0; /* Create and wake up the kthread once to put it in * TASK_INTERRUPTIBLE mode to avoid the blocked task * warning and work with loadavg. */ n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d", n->dev->name, n->napi_id); if (IS_ERR(n->thread)) { err = PTR_ERR(n->thread); pr_err("kthread_run failed with err %d\n", err); n->thread = NULL; } return err; } static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int ret; ASSERT_RTNL(); dev_addr_check(dev); if (!netif_device_present(dev)) { /* may be detached because parent is runtime-suspended */ if (dev->dev.parent) pm_runtime_resume(dev->dev.parent); if (!netif_device_present(dev)) return -ENODEV; } /* Block netpoll from trying to do any rx path servicing. * If we don't do this there is a chance ndo_poll_controller * or ndo_poll may be running while we open the device */ netpoll_poll_disable(dev); ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack); ret = notifier_to_errno(ret); if (ret) return ret; set_bit(__LINK_STATE_START, &dev->state); netdev_ops_assert_locked(dev); if (ops->ndo_validate_addr) ret = ops->ndo_validate_addr(dev); if (!ret && ops->ndo_open) ret = ops->ndo_open(dev); netpoll_poll_enable(dev); if (ret) clear_bit(__LINK_STATE_START, &dev->state); else { netif_set_up(dev, true); dev_set_rx_mode(dev); dev_activate(dev); add_device_randomness(dev->dev_addr, dev->addr_len); } return ret; } int netif_open(struct net_device *dev, struct netlink_ext_ack *extack) { int ret; if (dev->flags & IFF_UP) return 0; ret = __dev_open(dev, extack); if (ret < 0) return ret; rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL); call_netdevice_notifiers(NETDEV_UP, dev); return ret; } EXPORT_SYMBOL(netif_open); static void __dev_close_many(struct list_head *head) { struct net_device *dev; ASSERT_RTNL(); might_sleep(); list_for_each_entry(dev, head, close_list) { /* Temporarily disable netpoll until the interface is down */ netpoll_poll_disable(dev); call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); clear_bit(__LINK_STATE_START, &dev->state); /* Synchronize to scheduled poll. We cannot touch poll list, it * can be even on different cpu. So just clear netif_running(). * * dev->stop() will invoke napi_disable() on all of it's * napi_struct instances on this device. */ smp_mb__after_atomic(); /* Commit netif_running(). */ } dev_deactivate_many(head, true); list_for_each_entry(dev, head, close_list) { const struct net_device_ops *ops = dev->netdev_ops; /* * Call the device specific close. This cannot fail. * Only if device is UP * * We allow it to be called even after a DETACH hot-plug * event. 
*/ netdev_ops_assert_locked(dev); if (ops->ndo_stop) ops->ndo_stop(dev); netif_set_up(dev, false); netpoll_poll_enable(dev); } } static void __dev_close(struct net_device *dev) { LIST_HEAD(single); list_add(&dev->close_list, &single); __dev_close_many(&single); list_del(&single); } void netif_close_many(struct list_head *head, bool unlink) { struct net_device *dev, *tmp; /* Remove the devices that don't need to be closed */ list_for_each_entry_safe(dev, tmp, head, close_list) if (!(dev->flags & IFF_UP)) list_del_init(&dev->close_list); __dev_close_many(head); list_for_each_entry_safe(dev, tmp, head, close_list) { rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL); call_netdevice_notifiers(NETDEV_DOWN, dev); if (unlink) list_del_init(&dev->close_list); } } EXPORT_SYMBOL_NS_GPL(netif_close_many, "NETDEV_INTERNAL"); void netif_close(struct net_device *dev) { if (dev->flags & IFF_UP) { LIST_HEAD(single); list_add(&dev->close_list, &single); netif_close_many(&single, true); list_del(&single); } } EXPORT_SYMBOL(netif_close); void netif_disable_lro(struct net_device *dev) { struct net_device *lower_dev; struct list_head *iter; dev->wanted_features &= ~NETIF_F_LRO; netdev_update_features(dev); if (unlikely(dev->features & NETIF_F_LRO)) netdev_WARN(dev, "failed to disable LRO!\n"); netdev_for_each_lower_dev(dev, lower_dev, iter) { netdev_lock_ops(lower_dev); netif_disable_lro(lower_dev); netdev_unlock_ops(lower_dev); } } /** * dev_disable_gro_hw - disable HW Generic Receive Offload on a device * @dev: device * * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be * called under RTNL. This is needed if Generic XDP is installed on * the device. */ static void dev_disable_gro_hw(struct net_device *dev) { dev->wanted_features &= ~NETIF_F_GRO_HW; netdev_update_features(dev); if (unlikely(dev->features & NETIF_F_GRO_HW)) netdev_WARN(dev, "failed to disable GRO_HW!\n"); } const char *netdev_cmd_to_name(enum netdev_cmd cmd) { #define N(val) \ case NETDEV_##val: \ return "NETDEV_" __stringify(val); switch (cmd) { N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER) N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE) N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE) N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE) N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA) N(XDP_FEAT_CHANGE) } #undef N return "UNKNOWN_NETDEV_EVENT"; } EXPORT_SYMBOL_GPL(netdev_cmd_to_name); static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, struct net_device *dev) { struct netdev_notifier_info info = { .dev = dev, }; return nb->notifier_call(nb, val, &info); } static int call_netdevice_register_notifiers(struct notifier_block *nb, struct net_device *dev) { int err; err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); err = notifier_to_errno(err); if (err) return err; if (!(dev->flags & IFF_UP)) return 0; call_netdevice_notifier(nb, NETDEV_UP, dev); return 0; } static void call_netdevice_unregister_notifiers(struct notifier_block *nb, struct net_device *dev) { if (dev->flags & IFF_UP) { call_netdevice_notifier(nb, NETDEV_GOING_DOWN, dev); 
call_netdevice_notifier(nb, NETDEV_DOWN, dev); } call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); } static int call_netdevice_register_net_notifiers(struct notifier_block *nb, struct net *net) { struct net_device *dev; int err; for_each_netdev(net, dev) { netdev_lock_ops(dev); err = call_netdevice_register_notifiers(nb, dev); netdev_unlock_ops(dev); if (err) goto rollback; } return 0; rollback: for_each_netdev_continue_reverse(net, dev) call_netdevice_unregister_notifiers(nb, dev); return err; } static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb, struct net *net) { struct net_device *dev; for_each_netdev(net, dev) call_netdevice_unregister_notifiers(nb, dev); } static int dev_boot_phase = 1; /** * register_netdevice_notifier - register a network notifier block * @nb: notifier * * Register a notifier to be called when network device events occur. * The notifier passed is linked into the kernel structures and must * not be reused until it has been unregistered. A negative errno code * is returned on a failure. * * When registered all registration and up events are replayed * to the new notifier to allow device to have a race free * view of the network device list. */ int register_netdevice_notifier(struct notifier_block *nb) { struct net *net; int err; /* Close race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); /* When RTNL is removed, we need protection for netdev_chain. */ rtnl_lock(); err = raw_notifier_chain_register(&netdev_chain, nb); if (err) goto unlock; if (dev_boot_phase) goto unlock; for_each_net(net) { __rtnl_net_lock(net); err = call_netdevice_register_net_notifiers(nb, net); __rtnl_net_unlock(net); if (err) goto rollback; } unlock: rtnl_unlock(); up_write(&pernet_ops_rwsem); return err; rollback: for_each_net_continue_reverse(net) { __rtnl_net_lock(net); call_netdevice_unregister_net_notifiers(nb, net); __rtnl_net_unlock(net); } raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; } EXPORT_SYMBOL(register_netdevice_notifier); /** * unregister_netdevice_notifier - unregister a network notifier block * @nb: notifier * * Unregister a notifier previously registered by * register_netdevice_notifier(). The notifier is unlinked into the * kernel structures and may then be reused. A negative errno code * is returned on a failure. * * After unregistering unregister and down device events are synthesized * for all devices on the device list to the removed notifier to remove * the need for special case cleanup code. 
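 *
 * A minimal sketch of the register/unregister pairing (illustrative only;
 * example_netdev_event and example_nb are hypothetical names):
 *
 *	static int example_netdev_event(struct notifier_block *nb,
 *					unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP)
 *			netdev_info(dev, "interface is up\n");
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block example_nb = {
 *		.notifier_call = example_netdev_event,
 *	};
 *
 *	err = register_netdevice_notifier(&example_nb);
 *	...
 *	unregister_netdevice_notifier(&example_nb);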
*/ int unregister_netdevice_notifier(struct notifier_block *nb) { struct net *net; int err; /* Close race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); rtnl_lock(); err = raw_notifier_chain_unregister(&netdev_chain, nb); if (err) goto unlock; for_each_net(net) { __rtnl_net_lock(net); call_netdevice_unregister_net_notifiers(nb, net); __rtnl_net_unlock(net); } unlock: rtnl_unlock(); up_write(&pernet_ops_rwsem); return err; } EXPORT_SYMBOL(unregister_netdevice_notifier); static int __register_netdevice_notifier_net(struct net *net, struct notifier_block *nb, bool ignore_call_fail) { int err; err = raw_notifier_chain_register(&net->netdev_chain, nb); if (err) return err; if (dev_boot_phase) return 0; err = call_netdevice_register_net_notifiers(nb, net); if (err && !ignore_call_fail) goto chain_unregister; return 0; chain_unregister: raw_notifier_chain_unregister(&net->netdev_chain, nb); return err; } static int __unregister_netdevice_notifier_net(struct net *net, struct notifier_block *nb) { int err; err = raw_notifier_chain_unregister(&net->netdev_chain, nb); if (err) return err; call_netdevice_unregister_net_notifiers(nb, net); return 0; } /** * register_netdevice_notifier_net - register a per-netns network notifier block * @net: network namespace * @nb: notifier * * Register a notifier to be called when network device events occur. * The notifier passed is linked into the kernel structures and must * not be reused until it has been unregistered. A negative errno code * is returned on a failure. * * When registered all registration and up events are replayed * to the new notifier to allow device to have a race free * view of the network device list. */ int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb) { int err; rtnl_net_lock(net); err = __register_netdevice_notifier_net(net, nb, false); rtnl_net_unlock(net); return err; } EXPORT_SYMBOL(register_netdevice_notifier_net); /** * unregister_netdevice_notifier_net - unregister a per-netns * network notifier block * @net: network namespace * @nb: notifier * * Unregister a notifier previously registered by * register_netdevice_notifier_net(). The notifier is unlinked from the * kernel structures and may then be reused. A negative errno code * is returned on a failure. * * After unregistering unregister and down device events are synthesized * for all devices on the device list to the removed notifier to remove * the need for special case cleanup code. */ int unregister_netdevice_notifier_net(struct net *net, struct notifier_block *nb) { int err; rtnl_net_lock(net); err = __unregister_netdevice_notifier_net(net, nb); rtnl_net_unlock(net); return err; } EXPORT_SYMBOL(unregister_netdevice_notifier_net); static void __move_netdevice_notifier_net(struct net *src_net, struct net *dst_net, struct notifier_block *nb) { __unregister_netdevice_notifier_net(src_net, nb); __register_netdevice_notifier_net(dst_net, nb, true); } static void rtnl_net_dev_lock(struct net_device *dev) { bool again; do { struct net *net; again = false; /* netns might be being dismantled. */ rcu_read_lock(); net = dev_net_rcu(dev); net_passive_inc(net); rcu_read_unlock(); rtnl_net_lock(net); #ifdef CONFIG_NET_NS /* dev might have been moved to another netns. 
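		 * If it has been moved, drop the rtnl_net lock and the
		 * passive reference taken above, then retry against the
		 * device's current netns.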
*/ if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) { rtnl_net_unlock(net); net_passive_dec(net); again = true; } #endif } while (again); } static void rtnl_net_dev_unlock(struct net_device *dev) { struct net *net = dev_net(dev); rtnl_net_unlock(net); net_passive_dec(net); } int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) { int err; rtnl_net_dev_lock(dev); err = __register_netdevice_notifier_net(dev_net(dev), nb, false); if (!err) { nn->nb = nb; list_add(&nn->list, &dev->net_notifier_list); } rtnl_net_dev_unlock(dev); return err; } EXPORT_SYMBOL(register_netdevice_notifier_dev_net); int unregister_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) { int err; rtnl_net_dev_lock(dev); list_del(&nn->list); err = __unregister_netdevice_notifier_net(dev_net(dev), nb); rtnl_net_dev_unlock(dev); return err; } EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net); static void move_netdevice_notifiers_dev_net(struct net_device *dev, struct net *net) { struct netdev_net_notifier *nn; list_for_each_entry(nn, &dev->net_notifier_list, list) __move_netdevice_notifier_net(dev_net(dev), net, nn->nb); } /** * call_netdevice_notifiers_info - call all network notifier blocks * @val: value passed unmodified to notifier function * @info: notifier information data * * Call all network notifier blocks. Parameters and return value * are as for raw_notifier_call_chain(). */ int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info) { struct net *net = dev_net(info->dev); int ret; ASSERT_RTNL(); /* Run per-netns notifier block chain first, then run the global one. * Hopefully, one day, the global one is going to be removed after * all notifier block registrators get converted to be per-netns. */ ret = raw_notifier_call_chain(&net->netdev_chain, val, info); if (ret & NOTIFY_STOP_MASK) return ret; return raw_notifier_call_chain(&netdev_chain, val, info); } /** * call_netdevice_notifiers_info_robust - call per-netns notifier blocks * for and rollback on error * @val_up: value passed unmodified to notifier function * @val_down: value passed unmodified to the notifier function when * recovering from an error on @val_up * @info: notifier information data * * Call all per-netns network notifier blocks, but not notifier blocks on * the global notifier chain. Parameters and return value are as for * raw_notifier_call_chain_robust(). */ static int call_netdevice_notifiers_info_robust(unsigned long val_up, unsigned long val_down, struct netdev_notifier_info *info) { struct net *net = dev_net(info->dev); ASSERT_RTNL(); return raw_notifier_call_chain_robust(&net->netdev_chain, val_up, val_down, info); } static int call_netdevice_notifiers_extack(unsigned long val, struct net_device *dev, struct netlink_ext_ack *extack) { struct netdev_notifier_info info = { .dev = dev, .extack = extack, }; return call_netdevice_notifiers_info(val, &info); } /** * call_netdevice_notifiers - call all network notifier blocks * @val: value passed unmodified to notifier function * @dev: net_device pointer passed unmodified to notifier function * * Call all network notifier blocks. Parameters and return value * are as for raw_notifier_call_chain(). 
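 *
 * Callers that allow notifiers to veto an event convert the chain's return
 * value with notifier_to_errno(), as __dev_open() does for NETDEV_PRE_UP;
 * a notifier can reject such an event by returning
 * notifier_from_errno(-EBUSY), for example.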
*/ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { return call_netdevice_notifiers_extack(val, dev, NULL); } EXPORT_SYMBOL(call_netdevice_notifiers); /** * call_netdevice_notifiers_mtu - call all network notifier blocks * @val: value passed unmodified to notifier function * @dev: net_device pointer passed unmodified to notifier function * @arg: additional u32 argument passed to the notifier function * * Call all network notifier blocks. Parameters and return value * are as for raw_notifier_call_chain(). */ static int call_netdevice_notifiers_mtu(unsigned long val, struct net_device *dev, u32 arg) { struct netdev_notifier_info_ext info = { .info.dev = dev, .ext.mtu = arg, }; BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0); return call_netdevice_notifiers_info(val, &info.info); } #ifdef CONFIG_NET_INGRESS static DEFINE_STATIC_KEY_FALSE(ingress_needed_key); void net_inc_ingress_queue(void) { static_branch_inc(&ingress_needed_key); } EXPORT_SYMBOL_GPL(net_inc_ingress_queue); void net_dec_ingress_queue(void) { static_branch_dec(&ingress_needed_key); } EXPORT_SYMBOL_GPL(net_dec_ingress_queue); #endif #ifdef CONFIG_NET_EGRESS static DEFINE_STATIC_KEY_FALSE(egress_needed_key); void net_inc_egress_queue(void) { static_branch_inc(&egress_needed_key); } EXPORT_SYMBOL_GPL(net_inc_egress_queue); void net_dec_egress_queue(void) { static_branch_dec(&egress_needed_key); } EXPORT_SYMBOL_GPL(net_dec_egress_queue); #endif #ifdef CONFIG_NET_CLS_ACT DEFINE_STATIC_KEY_FALSE(tcf_sw_enabled_key); EXPORT_SYMBOL(tcf_sw_enabled_key); #endif DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); EXPORT_SYMBOL(netstamp_needed_key); #ifdef CONFIG_JUMP_LABEL static atomic_t netstamp_needed_deferred; static atomic_t netstamp_wanted; static void netstamp_clear(struct work_struct *work) { int deferred = atomic_xchg(&netstamp_needed_deferred, 0); int wanted; wanted = atomic_add_return(deferred, &netstamp_wanted); if (wanted > 0) static_branch_enable(&netstamp_needed_key); else static_branch_disable(&netstamp_needed_key); } static DECLARE_WORK(netstamp_work, netstamp_clear); #endif void net_enable_timestamp(void) { #ifdef CONFIG_JUMP_LABEL int wanted = atomic_read(&netstamp_wanted); while (wanted > 0) { if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1)) return; } atomic_inc(&netstamp_needed_deferred); schedule_work(&netstamp_work); #else static_branch_inc(&netstamp_needed_key); #endif } EXPORT_SYMBOL(net_enable_timestamp); void net_disable_timestamp(void) { #ifdef CONFIG_JUMP_LABEL int wanted = atomic_read(&netstamp_wanted); while (wanted > 1) { if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1)) return; } atomic_dec(&netstamp_needed_deferred); schedule_work(&netstamp_work); #else static_branch_dec(&netstamp_needed_key); #endif } EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { skb->tstamp = 0; skb->tstamp_type = SKB_CLOCK_REALTIME; if (static_branch_unlikely(&netstamp_needed_key)) skb->tstamp = ktime_get_real(); } #define net_timestamp_check(COND, SKB) \ if (static_branch_unlikely(&netstamp_needed_key)) { \ if ((COND) && !(SKB)->tstamp) \ (SKB)->tstamp = ktime_get_real(); \ } \ bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) { return __is_skb_forwardable(dev, skb, true); } EXPORT_SYMBOL_GPL(is_skb_forwardable); static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb, bool check_mtu) { int ret = ____dev_forward_skb(dev, skb, check_mtu); if (likely(!ret)) { 
skb->protocol = eth_type_trans(skb, dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } return ret; } int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { return __dev_forward_skb2(dev, skb, true); } EXPORT_SYMBOL_GPL(__dev_forward_skb); /** * dev_forward_skb - loopback an skb to another netif * * @dev: destination network device * @skb: buffer to forward * * return values: * NET_RX_SUCCESS (no congestion) * NET_RX_DROP (packet was dropped, but freed) * * dev_forward_skb can be used for injecting an skb from the * start_xmit function of one device into the receive queue * of another device. * * The receiving device may be in another namespace, so * we have to clear all information in the skb that could * impact namespace isolation. */ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb); } EXPORT_SYMBOL_GPL(dev_forward_skb); int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb) { return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb); } static int deliver_skb(struct sk_buff *skb, struct packet_type *pt_prev, struct net_device *orig_dev) { if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) return -ENOMEM; refcount_inc(&skb->users); return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } static inline void deliver_ptype_list_skb(struct sk_buff *skb, struct packet_type **pt, struct net_device *orig_dev, __be16 type, struct list_head *ptype_list) { struct packet_type *ptype, *pt_prev = *pt; list_for_each_entry_rcu(ptype, ptype_list, list) { if (ptype->type != type) continue; if (unlikely(pt_prev)) deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } *pt = pt_prev; } static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) { if (!ptype->af_packet_priv || !skb->sk) return false; if (ptype->id_match) return ptype->id_match(ptype, skb->sk); else if ((struct sock *)ptype->af_packet_priv == skb->sk) return true; return false; } /** * dev_nit_active_rcu - return true if any network interface taps are in use * * The caller must hold the RCU lock * * @dev: network device to check for the presence of taps */ bool dev_nit_active_rcu(const struct net_device *dev) { /* Callers may hold either RCU or RCU BH lock */ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); return !list_empty(&dev_net(dev)->ptype_all) || !list_empty(&dev->ptype_all); } EXPORT_SYMBOL_GPL(dev_nit_active_rcu); /* * Support routine. Sends outgoing frames to any network * taps currently in use. */ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { struct packet_type *ptype, *pt_prev = NULL; struct list_head *ptype_list; struct sk_buff *skb2 = NULL; rcu_read_lock(); ptype_list = &dev_net_rcu(dev)->ptype_all; again: list_for_each_entry_rcu(ptype, ptype_list, list) { if (READ_ONCE(ptype->ignore_outgoing)) continue; /* Never send packets back to the socket * they originated from - MvS (miquels@drinkel.ow.org) */ if (skb_loop_sk(ptype, skb)) continue; if (unlikely(pt_prev)) { deliver_skb(skb2, pt_prev, skb->dev); pt_prev = ptype; continue; } /* need to clone skb, done only once */ skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) goto out_unlock; net_timestamp_set(skb2); /* skb->nh should be correctly * set by sender, so that the second statement is * just protection against buggy protocols. 
*/ skb_reset_mac_header(skb2); if (skb_network_header(skb2) < skb2->data || skb_network_header(skb2) > skb_tail_pointer(skb2)) { net_crit_ratelimited("protocol %04x is buggy, dev %s\n", ntohs(skb2->protocol), dev->name); skb_reset_network_header(skb2); } skb2->transport_header = skb2->network_header; skb2->pkt_type = PACKET_OUTGOING; pt_prev = ptype; } if (ptype_list != &dev->ptype_all) { ptype_list = &dev->ptype_all; goto again; } out_unlock: if (pt_prev) { if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC)) pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); else kfree_skb(skb2); } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(dev_queue_xmit_nit); /** * netif_setup_tc - Handle tc mappings on real_num_tx_queues change * @dev: Network device * @txq: number of queues available * * If real_num_tx_queues is changed the tc mappings may no longer be * valid. To resolve this verify the tc mapping remains valid and if * not NULL the mapping. With no priorities mapping to this * offset/count pair it will no longer be used. In the worst case TC0 * is invalid nothing can be done so disable priority mappings. If is * expected that drivers will fix this mapping if they can before * calling netif_set_real_num_tx_queues. */ static void netif_setup_tc(struct net_device *dev, unsigned int txq) { int i; struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; /* If TC0 is invalidated disable TC mapping */ if (tc->offset + tc->count > txq) { netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); dev->num_tc = 0; return; } /* Invalidated prio to tc mappings set to TC0 */ for (i = 1; i < TC_BITMASK + 1; i++) { int q = netdev_get_prio_tc_map(dev, i); tc = &dev->tc_to_txq[q]; if (tc->offset + tc->count > txq) { netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. 
Setting map to 0\n", i, q); netdev_set_prio_tc_map(dev, i, 0); } } } int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) { if (dev->num_tc) { struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; int i; /* walk through the TCs and see if it falls into any of them */ for (i = 0; i < TC_MAX_QUEUE; i++, tc++) { if ((txq - tc->offset) < tc->count) return i; } /* didn't find it, just return -1 to indicate no match */ return -1; } return 0; } EXPORT_SYMBOL(netdev_txq_to_tc); #ifdef CONFIG_XPS static struct static_key xps_needed __read_mostly; static struct static_key xps_rxqs_needed __read_mostly; static DEFINE_MUTEX(xps_map_mutex); #define xmap_dereference(P) \ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) static bool remove_xps_queue(struct xps_dev_maps *dev_maps, struct xps_dev_maps *old_maps, int tci, u16 index) { struct xps_map *map = NULL; int pos; map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) return false; for (pos = map->len; pos--;) { if (map->queues[pos] != index) continue; if (map->len > 1) { map->queues[pos] = map->queues[--map->len]; break; } if (old_maps) RCU_INIT_POINTER(old_maps->attr_map[tci], NULL); RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); kfree_rcu(map, rcu); return false; } return true; } static bool remove_xps_queue_cpu(struct net_device *dev, struct xps_dev_maps *dev_maps, int cpu, u16 offset, u16 count) { int num_tc = dev_maps->num_tc; bool active = false; int tci; for (tci = cpu * num_tc; num_tc--; tci++) { int i, j; for (i = count, j = offset; i--; j++) { if (!remove_xps_queue(dev_maps, NULL, tci, j)) break; } active |= i < 0; } return active; } static void reset_xps_maps(struct net_device *dev, struct xps_dev_maps *dev_maps, enum xps_map_type type) { static_key_slow_dec_cpuslocked(&xps_needed); if (type == XPS_RXQS) static_key_slow_dec_cpuslocked(&xps_rxqs_needed); RCU_INIT_POINTER(dev->xps_maps[type], NULL); kfree_rcu(dev_maps, rcu); } static void clean_xps_maps(struct net_device *dev, enum xps_map_type type, u16 offset, u16 count) { struct xps_dev_maps *dev_maps; bool active = false; int i, j; dev_maps = xmap_dereference(dev->xps_maps[type]); if (!dev_maps) return; for (j = 0; j < dev_maps->nr_ids; j++) active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); if (!active) reset_xps_maps(dev, dev_maps, type); if (type == XPS_CPUS) { for (i = offset + (count - 1); count--; i--) netdev_queue_numa_node_write( netdev_get_tx_queue(dev, i), NUMA_NO_NODE); } } static void netif_reset_xps_queues(struct net_device *dev, u16 offset, u16 count) { if (!static_key_false(&xps_needed)) return; cpus_read_lock(); mutex_lock(&xps_map_mutex); if (static_key_false(&xps_rxqs_needed)) clean_xps_maps(dev, XPS_RXQS, offset, count); clean_xps_maps(dev, XPS_CPUS, offset, count); mutex_unlock(&xps_map_mutex); cpus_read_unlock(); } static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) { netif_reset_xps_queues(dev, index, dev->num_tx_queues - index); } static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, u16 index, bool is_rxqs_map) { struct xps_map *new_map; int alloc_len = XPS_MIN_MAP_ALLOC; int i, pos; for (pos = 0; map && pos < map->len; pos++) { if (map->queues[pos] != index) continue; return map; } /* Need to add tx-queue to this CPU's/rx-queue's existing map */ if (map) { if (pos < map->alloc_len) return map; alloc_len = map->alloc_len * 2; } /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's * map */ if (is_rxqs_map) new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL); else 
new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, cpu_to_node(attr_index)); if (!new_map) return NULL; for (i = 0; i < pos; i++) new_map->queues[i] = map->queues[i]; new_map->alloc_len = alloc_len; new_map->len = pos; return new_map; } /* Copy xps maps at a given index */ static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps, struct xps_dev_maps *new_dev_maps, int index, int tc, bool skip_tc) { int i, tci = index * dev_maps->num_tc; struct xps_map *map; /* copy maps belonging to foreign traffic classes */ for (i = 0; i < dev_maps->num_tc; i++, tci++) { if (i == tc && skip_tc) continue; /* fill in the new device map from the old device map */ map = xmap_dereference(dev_maps->attr_map[tci]); RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } } /* Must be called under cpus_read_lock */ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, u16 index, enum xps_map_type type) { struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL; const unsigned long *online_mask = NULL; bool active = false, copy = false; int i, j, tci, numa_node_id = -2; int maps_sz, num_tc = 1, tc = 0; struct xps_map *map, *new_map; unsigned int nr_ids; WARN_ON_ONCE(index >= dev->num_tx_queues); if (dev->num_tc) { /* Do not allow XPS on subordinate device directly */ num_tc = dev->num_tc; if (num_tc < 0) return -EINVAL; /* If queue belongs to subordinate dev use its map */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; tc = netdev_txq_to_tc(dev, index); if (tc < 0) return -EINVAL; } mutex_lock(&xps_map_mutex); dev_maps = xmap_dereference(dev->xps_maps[type]); if (type == XPS_RXQS) { maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues); nr_ids = dev->num_rx_queues; } else { maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc); if (num_possible_cpus() > 1) online_mask = cpumask_bits(cpu_online_mask); nr_ids = nr_cpu_ids; } if (maps_sz < L1_CACHE_BYTES) maps_sz = L1_CACHE_BYTES; /* The old dev_maps could be larger or smaller than the one we're * setting up now, as dev->num_tc or nr_ids could have been updated in * between. We could try to be smart, but let's be safe instead and only * copy foreign traffic classes if the two map sizes match. */ if (dev_maps && dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids) copy = true; /* allocate memory for queue storage */ for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids), j < nr_ids;) { if (!new_dev_maps) { new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); if (!new_dev_maps) { mutex_unlock(&xps_map_mutex); return -ENOMEM; } new_dev_maps->nr_ids = nr_ids; new_dev_maps->num_tc = num_tc; } tci = j * num_tc + tc; map = copy ? 
xmap_dereference(dev_maps->attr_map[tci]) : NULL; map = expand_xps_map(map, j, index, type == XPS_RXQS); if (!map) goto error; RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } if (!new_dev_maps) goto out_no_new_maps; if (!dev_maps) { /* Increment static keys at most once per type */ static_key_slow_inc_cpuslocked(&xps_needed); if (type == XPS_RXQS) static_key_slow_inc_cpuslocked(&xps_rxqs_needed); } for (j = 0; j < nr_ids; j++) { bool skip_tc = false; tci = j * num_tc + tc; if (netif_attr_test_mask(j, mask, nr_ids) && netif_attr_test_online(j, online_mask, nr_ids)) { /* add tx-queue to CPU/rx-queue maps */ int pos = 0; skip_tc = true; map = xmap_dereference(new_dev_maps->attr_map[tci]); while ((pos < map->len) && (map->queues[pos] != index)) pos++; if (pos == map->len) map->queues[map->len++] = index; #ifdef CONFIG_NUMA if (type == XPS_CPUS) { if (numa_node_id == -2) numa_node_id = cpu_to_node(j); else if (numa_node_id != cpu_to_node(j)) numa_node_id = -1; } #endif } if (copy) xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc, skip_tc); } rcu_assign_pointer(dev->xps_maps[type], new_dev_maps); /* Cleanup old maps */ if (!dev_maps) goto out_no_old_maps; for (j = 0; j < dev_maps->nr_ids; j++) { for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) { map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) continue; if (copy) { new_map = xmap_dereference(new_dev_maps->attr_map[tci]); if (map == new_map) continue; } RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); kfree_rcu(map, rcu); } } old_dev_maps = dev_maps; out_no_old_maps: dev_maps = new_dev_maps; active = true; out_no_new_maps: if (type == XPS_CPUS) /* update Tx queue numa node */ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), (numa_node_id >= 0) ? numa_node_id : NUMA_NO_NODE); if (!dev_maps) goto out_no_maps; /* removes tx-queue from unused CPUs/rx-queues */ for (j = 0; j < dev_maps->nr_ids; j++) { tci = j * dev_maps->num_tc; for (i = 0; i < dev_maps->num_tc; i++, tci++) { if (i == tc && netif_attr_test_mask(j, mask, dev_maps->nr_ids) && netif_attr_test_online(j, online_mask, dev_maps->nr_ids)) continue; active |= remove_xps_queue(dev_maps, copy ? old_dev_maps : NULL, tci, index); } } if (old_dev_maps) kfree_rcu(old_dev_maps, rcu); /* free map if not active */ if (!active) reset_xps_maps(dev, dev_maps, type); out_no_maps: mutex_unlock(&xps_map_mutex); return 0; error: /* remove any maps that we added */ for (j = 0; j < nr_ids; j++) { for (i = num_tc, tci = j * num_tc; i--; tci++) { new_map = xmap_dereference(new_dev_maps->attr_map[tci]); map = copy ? 
xmap_dereference(dev_maps->attr_map[tci]) : NULL; if (new_map && new_map != map) kfree(new_map); } } mutex_unlock(&xps_map_mutex); kfree(new_dev_maps); return -ENOMEM; } EXPORT_SYMBOL_GPL(__netif_set_xps_queue); int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, u16 index) { int ret; cpus_read_lock(); ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS); cpus_read_unlock(); return ret; } EXPORT_SYMBOL(netif_set_xps_queue); #endif static void netdev_unbind_all_sb_channels(struct net_device *dev) { struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; /* Unbind any subordinate channels */ while (txq-- != &dev->_tx[0]) { if (txq->sb_dev) netdev_unbind_sb_channel(dev, txq->sb_dev); } } void netdev_reset_tc(struct net_device *dev) { #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, 0); #endif netdev_unbind_all_sb_channels(dev); /* Reset TC configuration of device */ dev->num_tc = 0; memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); } EXPORT_SYMBOL(netdev_reset_tc); int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset) { if (tc >= dev->num_tc) return -EINVAL; #ifdef CONFIG_XPS netif_reset_xps_queues(dev, offset, count); #endif dev->tc_to_txq[tc].count = count; dev->tc_to_txq[tc].offset = offset; return 0; } EXPORT_SYMBOL(netdev_set_tc_queue); int netdev_set_num_tc(struct net_device *dev, u8 num_tc) { if (num_tc > TC_MAX_QUEUE) return -EINVAL; #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, 0); #endif netdev_unbind_all_sb_channels(dev); dev->num_tc = num_tc; return 0; } EXPORT_SYMBOL(netdev_set_num_tc); void netdev_unbind_sb_channel(struct net_device *dev, struct net_device *sb_dev) { struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; #ifdef CONFIG_XPS netif_reset_xps_queues_gt(sb_dev, 0); #endif memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq)); memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map)); while (txq-- != &dev->_tx[0]) { if (txq->sb_dev == sb_dev) txq->sb_dev = NULL; } } EXPORT_SYMBOL(netdev_unbind_sb_channel); int netdev_bind_sb_channel_queue(struct net_device *dev, struct net_device *sb_dev, u8 tc, u16 count, u16 offset) { /* Make certain the sb_dev and dev are already configured */ if (sb_dev->num_tc >= 0 || tc >= dev->num_tc) return -EINVAL; /* We cannot hand out queues we don't have */ if ((offset + count) > dev->real_num_tx_queues) return -EINVAL; /* Record the mapping */ sb_dev->tc_to_txq[tc].count = count; sb_dev->tc_to_txq[tc].offset = offset; /* Provide a way for Tx queue to find the tc_to_txq map or * XPS map for itself. */ while (count--) netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev; return 0; } EXPORT_SYMBOL(netdev_bind_sb_channel_queue); int netdev_set_sb_channel(struct net_device *dev, u16 channel) { /* Do not use a multiqueue device to represent a subordinate channel */ if (netif_is_multiqueue(dev)) return -ENODEV; /* We allow channels 1 - 32767 to be used for subordinate channels. * Channel 0 is meant to be "native" mode and used only to represent * the main root device. We allow writing 0 to reset the device back * to normal mode after being used as a subordinate channel. */ if (channel > S16_MAX) return -EINVAL; dev->num_tc = -channel; return 0; } EXPORT_SYMBOL(netdev_set_sb_channel); /* * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues * greater than real_num_tx_queues stale skbs on the qdisc must be flushed. 
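 *
 * A minimal caller sketch (illustrative only; "priv" and new_txq are
 * hypothetical names): a driver resizing its rings under RTNL, e.g. from an
 * ethtool ->set_channels() handler, might do something like
 *
 *	err = netif_set_real_num_tx_queues(priv->netdev, new_txq);
 *	if (err)
 *		return err;
 *
 * once the new rings have been allocated.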
*/ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) { bool disabling; int rc; disabling = txq < dev->real_num_tx_queues; if (txq < 1 || txq > dev->num_tx_queues) return -EINVAL; if (dev->reg_state == NETREG_REGISTERED || dev->reg_state == NETREG_UNREGISTERING) { netdev_ops_assert_locked(dev); rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, txq); if (rc) return rc; if (dev->num_tc) netif_setup_tc(dev, txq); net_shaper_set_real_num_tx_queues(dev, txq); dev_qdisc_change_real_num_tx(dev, txq); dev->real_num_tx_queues = txq; if (disabling) { synchronize_net(); qdisc_reset_all_tx_gt(dev, txq); #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, txq); #endif } } else { dev->real_num_tx_queues = txq; } return 0; } EXPORT_SYMBOL(netif_set_real_num_tx_queues); /** * netif_set_real_num_rx_queues - set actual number of RX queues used * @dev: Network device * @rxq: Actual number of RX queues * * This must be called either with the rtnl_lock held or before * registration of the net device. Returns 0 on success, or a * negative error code. If called before registration, it always * succeeds. */ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) { int rc; if (rxq < 1 || rxq > dev->num_rx_queues) return -EINVAL; if (dev->reg_state == NETREG_REGISTERED) { netdev_ops_assert_locked(dev); rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, rxq); if (rc) return rc; } dev->real_num_rx_queues = rxq; return 0; } EXPORT_SYMBOL(netif_set_real_num_rx_queues); /** * netif_set_real_num_queues - set actual number of RX and TX queues used * @dev: Network device * @txq: Actual number of TX queues * @rxq: Actual number of RX queues * * Set the real number of both TX and RX queues. * Does nothing if the number of queues is already correct. */ int netif_set_real_num_queues(struct net_device *dev, unsigned int txq, unsigned int rxq) { unsigned int old_rxq = dev->real_num_rx_queues; int err; if (txq < 1 || txq > dev->num_tx_queues || rxq < 1 || rxq > dev->num_rx_queues) return -EINVAL; /* Start from increases, so the error path only does decreases - * decreases can't fail. */ if (rxq > dev->real_num_rx_queues) { err = netif_set_real_num_rx_queues(dev, rxq); if (err) return err; } if (txq > dev->real_num_tx_queues) { err = netif_set_real_num_tx_queues(dev, txq); if (err) goto undo_rx; } if (rxq < dev->real_num_rx_queues) WARN_ON(netif_set_real_num_rx_queues(dev, rxq)); if (txq < dev->real_num_tx_queues) WARN_ON(netif_set_real_num_tx_queues(dev, txq)); return 0; undo_rx: WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq)); return err; } EXPORT_SYMBOL(netif_set_real_num_queues); /** * netif_set_tso_max_size() - set the max size of TSO frames supported * @dev: netdev to update * @size: max skb->len of a TSO frame * * Set the limit on the size of TSO super-frames the device can handle. * Unless explicitly set the stack will assume the value of * %GSO_LEGACY_MAX_SIZE. */ void netif_set_tso_max_size(struct net_device *dev, unsigned int size) { dev->tso_max_size = min(GSO_MAX_SIZE, size); if (size < READ_ONCE(dev->gso_max_size)) netif_set_gso_max_size(dev, size); if (size < READ_ONCE(dev->gso_ipv4_max_size)) netif_set_gso_ipv4_max_size(dev, size); } EXPORT_SYMBOL(netif_set_tso_max_size); /** * netif_set_tso_max_segs() - set the max number of segs supported for TSO * @dev: netdev to update * @segs: max number of TCP segments * * Set the limit on the number of TCP segments the device can generate from * a single TSO super-frame. 
* Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS. */ void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs) { dev->tso_max_segs = segs; if (segs < READ_ONCE(dev->gso_max_segs)) netif_set_gso_max_segs(dev, segs); } EXPORT_SYMBOL(netif_set_tso_max_segs); /** * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper * @to: netdev to update * @from: netdev from which to copy the limits */ void netif_inherit_tso_max(struct net_device *to, const struct net_device *from) { netif_set_tso_max_size(to, from->tso_max_size); netif_set_tso_max_segs(to, from->tso_max_segs); } EXPORT_SYMBOL(netif_inherit_tso_max); /** * netif_get_num_default_rss_queues - default number of RSS queues * * Default value is the number of physical cores if there are only 1 or 2, or * divided by 2 if there are more. */ int netif_get_num_default_rss_queues(void) { cpumask_var_t cpus; int cpu, count = 0; if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL))) return 1; cpumask_copy(cpus, cpu_online_mask); for_each_cpu(cpu, cpus) { ++count; cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu)); } free_cpumask_var(cpus); return count > 2 ? DIV_ROUND_UP(count, 2) : count; } EXPORT_SYMBOL(netif_get_num_default_rss_queues); static void __netif_reschedule(struct Qdisc *q) { struct softnet_data *sd; unsigned long flags; local_irq_save(flags); sd = this_cpu_ptr(&softnet_data); q->next_sched = NULL; *sd->output_queue_tailp = q; sd->output_queue_tailp = &q->next_sched; raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } void __netif_schedule(struct Qdisc *q) { /* If q->defer_list is not empty, at least one thread is * in __dev_xmit_skb() before llist_del_all(&q->defer_list). * This thread will attempt to run the queue. 
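	 * In that case there is no need to set __QDISC_STATE_SCHED or raise
	 * NET_TX_SOFTIRQ here; simply let that thread run the queue.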
*/ if (!llist_empty(&q->defer_list)) return; if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) __netif_reschedule(q); } EXPORT_SYMBOL(__netif_schedule); struct dev_kfree_skb_cb { enum skb_drop_reason reason; }; static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) { return (struct dev_kfree_skb_cb *)skb->cb; } void netif_schedule_queue(struct netdev_queue *txq) { rcu_read_lock(); if (!netif_xmit_stopped(txq)) { struct Qdisc *q = rcu_dereference(txq->qdisc); __netif_schedule(q); } rcu_read_unlock(); } EXPORT_SYMBOL(netif_schedule_queue); void netif_tx_wake_queue(struct netdev_queue *dev_queue) { if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { struct Qdisc *q; rcu_read_lock(); q = rcu_dereference(dev_queue->qdisc); __netif_schedule(q); rcu_read_unlock(); } } EXPORT_SYMBOL(netif_tx_wake_queue); void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason) { unsigned long flags; if (unlikely(!skb)) return; if (likely(refcount_read(&skb->users) == 1)) { smp_rmb(); refcount_set(&skb->users, 0); } else if (likely(!refcount_dec_and_test(&skb->users))) { return; } get_kfree_skb_cb(skb)->reason = reason; local_irq_save(flags); skb->next = __this_cpu_read(softnet_data.completion_queue); __this_cpu_write(softnet_data.completion_queue, skb); raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } EXPORT_SYMBOL(dev_kfree_skb_irq_reason); void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason) { if (in_hardirq() || irqs_disabled()) dev_kfree_skb_irq_reason(skb, reason); else kfree_skb_reason(skb, reason); } EXPORT_SYMBOL(dev_kfree_skb_any_reason); /** * netif_device_detach - mark device as removed * @dev: network device * * Mark device as removed from system and therefore no longer available. */ void netif_device_detach(struct net_device *dev) { if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && netif_running(dev)) { netif_tx_stop_all_queues(dev); } } EXPORT_SYMBOL(netif_device_detach); /** * netif_device_attach - mark device as attached * @dev: network device * * Mark device as attached from system and restart if needed. */ void netif_device_attach(struct net_device *dev) { if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && netif_running(dev)) { netif_tx_wake_all_queues(dev); netdev_watchdog_up(dev); } } EXPORT_SYMBOL(netif_device_attach); /* * Returns a Tx hash based on the given packet descriptor a Tx queues' number * to be used as a distribution range. 
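 *
 * For example (illustrative numbers only): with a traffic class whose
 * tc_to_txq entry is { .offset = 4, .count = 4 }, a recorded rx queue of 6
 * maps to tx queue 4 + ((6 - 4) % 4) = 6, while a hashed skb is placed on
 * queue 4 + reciprocal_scale(skb_get_hash(skb), 4).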
*/ static u16 skb_tx_hash(const struct net_device *dev, const struct net_device *sb_dev, struct sk_buff *skb) { u32 hash; u16 qoffset = 0; u16 qcount = dev->real_num_tx_queues; if (dev->num_tc) { u8 tc = netdev_get_prio_tc_map(dev, skb->priority); qoffset = sb_dev->tc_to_txq[tc].offset; qcount = sb_dev->tc_to_txq[tc].count; if (unlikely(!qcount)) { net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n", sb_dev->name, qoffset, tc); qoffset = 0; qcount = dev->real_num_tx_queues; } } if (skb_rx_queue_recorded(skb)) { DEBUG_NET_WARN_ON_ONCE(qcount == 0); hash = skb_get_rx_queue(skb); if (hash >= qoffset) hash -= qoffset; while (unlikely(hash >= qcount)) hash -= qcount; return hash + qoffset; } return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; } void skb_warn_bad_offload(const struct sk_buff *skb) { static const netdev_features_t null_features; struct net_device *dev = skb->dev; const char *name = ""; if (!net_ratelimit()) return; if (dev) { if (dev->dev.parent) name = dev_driver_string(dev->dev.parent); else name = netdev_name(dev); } skb_dump(KERN_WARNING, skb, false); WARN(1, "%s: caps=(%pNF, %pNF)\n", name, dev ? &dev->features : &null_features, skb->sk ? &skb->sk->sk_route_caps : &null_features); } /* * Invalidate hardware checksum when packet is to be mangled, and * complete checksum manually on outgoing path. */ int skb_checksum_help(struct sk_buff *skb) { __wsum csum; int ret = 0, offset; if (skb->ip_summed == CHECKSUM_COMPLETE) goto out_set_summed; if (unlikely(skb_is_gso(skb))) { skb_warn_bad_offload(skb); return -EINVAL; } if (!skb_frags_readable(skb)) { return -EFAULT; } /* Before computing a checksum, we should make sure no frag could * be modified by an external entity : checksum could be wrong. */ if (skb_has_shared_frag(skb)) { ret = __skb_linearize(skb); if (ret) goto out; } offset = skb_checksum_start_offset(skb); ret = -EINVAL; if (unlikely(offset >= skb_headlen(skb))) { DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n", offset, skb_headlen(skb)); goto out; } csum = skb_checksum(skb, offset, skb->len - offset, 0); offset += skb->csum_offset; if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) { DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n", offset + sizeof(__sum16), skb_headlen(skb)); goto out; } ret = skb_ensure_writable(skb, offset + sizeof(__sum16)); if (ret) goto out; *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0; out_set_summed: skb->ip_summed = CHECKSUM_NONE; out: return ret; } EXPORT_SYMBOL(skb_checksum_help); #ifdef CONFIG_NET_CRC32C int skb_crc32c_csum_help(struct sk_buff *skb) { u32 crc; int ret = 0, offset, start; if (skb->ip_summed != CHECKSUM_PARTIAL) goto out; if (unlikely(skb_is_gso(skb))) goto out; /* Before computing a checksum, we should make sure no frag could * be modified by an external entity : checksum could be wrong. 
*/ if (unlikely(skb_has_shared_frag(skb))) { ret = __skb_linearize(skb); if (ret) goto out; } start = skb_checksum_start_offset(skb); offset = start + offsetof(struct sctphdr, checksum); if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { ret = -EINVAL; goto out; } ret = skb_ensure_writable(skb, offset + sizeof(__le32)); if (ret) goto out; crc = ~skb_crc32c(skb, start, skb->len - start, ~0); *(__le32 *)(skb->data + offset) = cpu_to_le32(crc); skb_reset_csum_not_inet(skb); out: return ret; } EXPORT_SYMBOL(skb_crc32c_csum_help); #endif /* CONFIG_NET_CRC32C */ __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { __be16 type = skb->protocol; /* Tunnel gso handlers can set protocol to ethernet. */ if (type == htons(ETH_P_TEB)) { struct ethhdr *eth; if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) return 0; eth = (struct ethhdr *)skb->data; type = eth->h_proto; } return vlan_get_protocol_and_depth(skb, type, depth); } /* Take action when hardware reception checksum errors are detected. */ #ifdef CONFIG_BUG static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { netdev_err(dev, "hw csum failure\n"); skb_dump(KERN_ERR, skb, true); dump_stack(); } void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb); } EXPORT_SYMBOL(netdev_rx_csum_fault); #endif /* XXX: check that highmem exists at all on the given machine. */ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_HIGHMEM int i; if (!(dev->features & NETIF_F_HIGHDMA)) { for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; struct page *page = skb_frag_page(frag); if (page && PageHighMem(page)) return 1; } } #endif return 0; } /* If MPLS offload request, verify we are testing hardware MPLS features * instead of standard features for the netdev. 
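 * (eth_p_mpls(), used below, matches both the ETH_P_MPLS_UC and
 * ETH_P_MPLS_MC ethertypes.)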
*/ #if IS_ENABLED(CONFIG_NET_MPLS_GSO) static netdev_features_t net_mpls_features(struct sk_buff *skb, netdev_features_t features, __be16 type) { if (eth_p_mpls(type)) features &= skb->dev->mpls_features; return features; } #else static netdev_features_t net_mpls_features(struct sk_buff *skb, netdev_features_t features, __be16 type) { return features; } #endif static netdev_features_t harmonize_features(struct sk_buff *skb, netdev_features_t features) { __be16 type; type = skb_network_protocol(skb, NULL); features = net_mpls_features(skb, features, type); if (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, type)) { features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); } if (illegal_highdma(skb->dev, skb)) features &= ~NETIF_F_SG; return features; } netdev_features_t passthru_features_check(struct sk_buff *skb, struct net_device *dev, netdev_features_t features) { return features; } EXPORT_SYMBOL(passthru_features_check); static netdev_features_t dflt_features_check(struct sk_buff *skb, struct net_device *dev, netdev_features_t features) { return vlan_features_check(skb, features); } static bool skb_gso_has_extension_hdr(const struct sk_buff *skb) { if (!skb->encapsulation) return ((skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6 || (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && vlan_get_protocol(skb) == htons(ETH_P_IPV6))) && skb_transport_header_was_set(skb) && skb_network_header_len(skb) != sizeof(struct ipv6hdr)); else return (!skb_inner_network_header_was_set(skb) || ((skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6 || (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && inner_ip_hdr(skb)->version == 6)) && skb_inner_network_header_len(skb) != sizeof(struct ipv6hdr))); } static netdev_features_t gso_features_check(const struct sk_buff *skb, struct net_device *dev, netdev_features_t features) { u16 gso_segs = skb_shinfo(skb)->gso_segs; if (gso_segs > READ_ONCE(dev->gso_max_segs)) return features & ~NETIF_F_GSO_MASK; if (unlikely(skb->len >= netif_get_gso_max_size(dev, skb))) return features & ~NETIF_F_GSO_MASK; if (!skb_shinfo(skb)->gso_type) { skb_warn_bad_offload(skb); return features & ~NETIF_F_GSO_MASK; } /* Support for GSO partial features requires software * intervention before we can actually process the packets * so we need to strip support for any partial features now * and we can pull them back in after we have partially * segmented the frame. */ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) features &= ~dev->gso_partial_features; /* Make sure to clear the IPv4 ID mangling feature if the IPv4 header * has the potential to be fragmented so that TSO does not generate * segments with the same ID. For encapsulated packets, the ID mangling * feature is guaranteed not to use the same ID for the outer IPv4 * headers of the generated segments if the headers have the potential * to be fragmented, so there is no need to clear the IPv4 ID mangling * feature (see the section about NETIF_F_TSO_MANGLEID in * segmentation-offloads.rst). */ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { const struct iphdr *iph; struct iphdr _iph; int nhoff = skb->encapsulation ? skb_inner_network_offset(skb) : skb_network_offset(skb); iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); if (!iph || !(iph->frag_off & htons(IP_DF))) features &= ~dev->mangleid_features; } /* NETIF_F_IPV6_CSUM does not support IPv6 extension headers, * so neither does TSO that depends on it. 
*/ if (features & NETIF_F_IPV6_CSUM && skb_gso_has_extension_hdr(skb)) features &= ~(NETIF_F_IPV6_CSUM | NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4); return features; } netdev_features_t netif_skb_features(struct sk_buff *skb) { struct net_device *dev = skb->dev; netdev_features_t features = dev->features; if (skb_is_gso(skb)) features = gso_features_check(skb, dev, features); /* If encapsulation offload request, verify we are testing * hardware encapsulation features instead of standard * features for the netdev */ if (skb->encapsulation) features &= dev->hw_enc_features; if (skb_vlan_tagged(skb)) features = netdev_intersect_features(features, dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); if (dev->netdev_ops->ndo_features_check) features &= dev->netdev_ops->ndo_features_check(skb, dev, features); else features &= dflt_features_check(skb, dev, features); return harmonize_features(skb, features); } EXPORT_SYMBOL(netif_skb_features); static int xmit_one(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, bool more) { unsigned int len; int rc; if (dev_nit_active_rcu(dev)) dev_queue_xmit_nit(skb, dev); len = skb->len; trace_net_dev_start_xmit(skb, dev); rc = netdev_start_xmit(skb, dev, txq, more); trace_net_dev_xmit(skb, rc, dev, len); return rc; } struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, struct netdev_queue *txq, int *ret) { struct sk_buff *skb = first; int rc = NETDEV_TX_OK; while (skb) { struct sk_buff *next = skb->next; skb_mark_not_on_list(skb); rc = xmit_one(skb, dev, txq, next != NULL); if (unlikely(!dev_xmit_complete(rc))) { skb->next = next; goto out; } skb = next; if (netif_tx_queue_stopped(txq) && skb) { rc = NETDEV_TX_BUSY; break; } } out: *ret = rc; return skb; } static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t features) { if (skb_vlan_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) skb = __vlan_hwaccel_push_inside(skb); return skb; } int skb_csum_hwoffload_help(struct sk_buff *skb, const netdev_features_t features) { if (unlikely(skb_csum_is_sctp(skb))) return !!(features & NETIF_F_SCTP_CRC) ? 0 : skb_crc32c_csum_help(skb); if (features & NETIF_F_HW_CSUM) return 0; if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) && skb_network_header_len(skb) != sizeof(struct ipv6hdr)) goto sw_checksum; switch (skb->csum_offset) { case offsetof(struct tcphdr, check): case offsetof(struct udphdr, check): return 0; } } sw_checksum: return skb_checksum_help(skb); } EXPORT_SYMBOL(skb_csum_hwoffload_help); /* Checks if this SKB belongs to an HW offloaded socket * and whether any SW fallbacks are required based on dev. * Check decrypted mark in case skb_orphan() cleared socket. 
*/ static struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) { #ifdef CONFIG_SOCK_VALIDATE_XMIT struct sk_buff *(*sk_validate)(struct sock *sk, struct net_device *dev, struct sk_buff *skb); struct sock *sk = skb->sk; sk_validate = NULL; if (sk) { if (sk_fullsock(sk)) sk_validate = sk->sk_validate_xmit_skb; else if (sk_is_inet(sk) && sk->sk_state == TCP_TIME_WAIT) sk_validate = inet_twsk(sk)->tw_validate_xmit_skb; } if (sk_validate) { skb = sk_validate(sk, dev, skb); } else if (unlikely(skb_is_decrypted(skb))) { pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); kfree_skb(skb); skb = NULL; } #endif return skb; } static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb, struct net_device *dev) { struct skb_shared_info *shinfo; struct net_iov *niov; if (likely(skb_frags_readable(skb))) goto out; if (!dev->netmem_tx) goto out_free; shinfo = skb_shinfo(skb); if (shinfo->nr_frags > 0) { niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0])); if (net_is_devmem_iov(niov) && READ_ONCE(net_devmem_iov_binding(niov)->dev) != dev) goto out_free; } out: return skb; out_free: kfree_skb(skb); return NULL; } static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again) { netdev_features_t features; skb = validate_xmit_unreadable_skb(skb, dev); if (unlikely(!skb)) goto out_null; features = netif_skb_features(skb); skb = validate_xmit_vlan(skb, features); if (unlikely(!skb)) goto out_null; skb = sk_validate_xmit_skb(skb, dev); if (unlikely(!skb)) goto out_null; if (netif_needs_gso(skb, features)) { struct sk_buff *segs; segs = skb_gso_segment(skb, features); if (IS_ERR(segs)) { goto out_kfree_skb; } else if (segs) { consume_skb(skb); skb = segs; } } else { if (skb_needs_linearize(skb, features) && __skb_linearize(skb)) goto out_kfree_skb; /* If packet is not checksummed and device does not * support checksumming for this protocol, complete * checksumming here. */ if (skb->ip_summed == CHECKSUM_PARTIAL) { if (skb->encapsulation) skb_set_inner_transport_header(skb, skb_checksum_start_offset(skb)); else skb_set_transport_header(skb, skb_checksum_start_offset(skb)); if (skb_csum_hwoffload_help(skb, features)) goto out_kfree_skb; } } skb = validate_xmit_xfrm(skb, features, again); return skb; out_kfree_skb: kfree_skb(skb); out_null: dev_core_stats_tx_dropped_inc(dev); return NULL; } struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again) { struct sk_buff *next, *head = NULL, *tail; for (; skb != NULL; skb = next) { next = skb->next; skb_mark_not_on_list(skb); /* in case skb won't be segmented, point to itself */ skb->prev = skb; skb = validate_xmit_skb(skb, dev, again); if (!skb) continue; if (!head) head = skb; else tail->next = skb; /* If skb was segmented, skb->prev points to * the last segment. If not, it still contains skb. 
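 * (For example, appending a GSO skb that was split into three segments
 * leaves tail pointing at the third segment, so the next iteration links
 * onto the end of the output list in O(1) and ordering is preserved.)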
*/ tail = skb->prev; } return head; } EXPORT_SYMBOL_GPL(validate_xmit_skb_list); static enum skb_drop_reason qdisc_pkt_len_segs_init(struct sk_buff *skb) { struct skb_shared_info *shinfo = skb_shinfo(skb); unsigned int hdr_len, tlen; u16 gso_segs; qdisc_skb_cb(skb)->pkt_len = skb->len; if (!shinfo->gso_size) { qdisc_skb_cb(skb)->pkt_segs = 1; return SKB_NOT_DROPPED_YET; } qdisc_skb_cb(skb)->pkt_segs = gso_segs = shinfo->gso_segs; /* To get more precise estimation of bytes sent on wire, * we add to pkt_len the headers size of all segments */ /* mac layer + network layer */ if (!skb->encapsulation) { if (unlikely(!skb_transport_header_was_set(skb))) return SKB_NOT_DROPPED_YET; hdr_len = skb_transport_offset(skb); } else { hdr_len = skb_inner_transport_offset(skb); } /* + transport layer */ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { const struct tcphdr *th; if (!pskb_may_pull(skb, hdr_len + sizeof(struct tcphdr))) return SKB_DROP_REASON_SKB_BAD_GSO; th = (const struct tcphdr *)(skb->data + hdr_len); tlen = __tcp_hdrlen(th); if (tlen < sizeof(*th)) return SKB_DROP_REASON_SKB_BAD_GSO; hdr_len += tlen; if (!pskb_may_pull(skb, hdr_len)) return SKB_DROP_REASON_SKB_BAD_GSO; } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { if (!pskb_may_pull(skb, hdr_len + sizeof(struct udphdr))) return SKB_DROP_REASON_SKB_BAD_GSO; hdr_len += sizeof(struct udphdr); } /* prior pskb_may_pull() might have changed skb->head. */ shinfo = skb_shinfo(skb); if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) { int payload = skb->len - hdr_len; /* Malicious packet. */ if (payload <= 0) return SKB_DROP_REASON_SKB_BAD_GSO; gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size); shinfo->gso_segs = gso_segs; qdisc_skb_cb(skb)->pkt_segs = gso_segs; } qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; return SKB_NOT_DROPPED_YET; } static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q, struct sk_buff **to_free, struct netdev_queue *txq) { int rc; rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK; if (rc == NET_XMIT_SUCCESS) trace_qdisc_enqueue(q, txq, skb); return rc; } static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq) { struct sk_buff *next, *to_free = NULL, *to_free2 = NULL; spinlock_t *root_lock = qdisc_lock(q); struct llist_node *ll_list, *first_n; unsigned long defer_count = 0; int rc; qdisc_calculate_pkt_len(skb, q); tcf_set_qdisc_drop_reason(skb, QDISC_DROP_GENERIC); if (q->flags & TCQ_F_NOLOCK) { if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) && qdisc_run_begin(q)) { /* Retest nolock_qdisc_is_empty() within the protection * of q->seqlock to protect from racing with requeuing. */ if (unlikely(!nolock_qdisc_is_empty(q))) { rc = dev_qdisc_enqueue(skb, q, &to_free, txq); __qdisc_run(q); to_free2 = qdisc_run_end(q); goto free_skbs; } qdisc_bstats_cpu_update(q, skb); if (sch_direct_xmit(skb, q, dev, txq, NULL, true) && !nolock_qdisc_is_empty(q)) __qdisc_run(q); to_free2 = qdisc_run_end(q); rc = NET_XMIT_SUCCESS; goto free_skbs; } rc = dev_qdisc_enqueue(skb, q, &to_free, txq); to_free2 = qdisc_run(q); goto free_skbs; } /* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit. * In the try_cmpxchg() loop, we want to increment q->defer_count * at most once to limit the number of skbs in defer_list. * We perform the defer_count increment only if the list is not empty, * because some arches have slow atomic_long_inc_return(). 
*/ first_n = READ_ONCE(q->defer_list.first); do { if (first_n && !defer_count) { defer_count = atomic_long_inc_return(&q->defer_count); if (unlikely(defer_count > READ_ONCE(net_hotdata.qdisc_max_burst))) { kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_BURST_DROP); return NET_XMIT_DROP; } } skb->ll_node.next = first_n; } while (!try_cmpxchg(&q->defer_list.first, &first_n, &skb->ll_node)); /* If defer_list was not empty, we know the cpu which queued * the first skb will process the whole list for us. */ if (first_n) return NET_XMIT_SUCCESS; spin_lock(root_lock); ll_list = llist_del_all(&q->defer_list); /* There is a small race because we clear defer_count not atomically * with the prior llist_del_all(). This means defer_list could grow * over qdisc_max_burst. */ atomic_long_set(&q->defer_count, 0); ll_list = llist_reverse_order(ll_list); if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { llist_for_each_entry_safe(skb, next, ll_list, ll_node) __qdisc_drop(skb, &to_free); rc = NET_XMIT_DROP; goto unlock; } if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && !llist_next(ll_list) && qdisc_run_begin(q)) { /* * This is a work-conserving queue; there are no old skbs * waiting to be sent out; and the qdisc is not running - * xmit the skb directly. */ DEBUG_NET_WARN_ON_ONCE(skb != llist_entry(ll_list, struct sk_buff, ll_node)); qdisc_bstats_update(q, skb); if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) __qdisc_run(q); to_free2 = qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { int count = 0; llist_for_each_entry_safe(skb, next, ll_list, ll_node) { if (next) { prefetch(next); prefetch(&next->priority); skb_mark_not_on_list(skb); } rc = dev_qdisc_enqueue(skb, q, &to_free, txq); count++; } to_free2 = qdisc_run(q); if (count != 1) rc = NET_XMIT_SUCCESS; } unlock: spin_unlock(root_lock); free_skbs: tcf_kfree_skb_list(to_free, q, txq, dev); tcf_kfree_skb_list(to_free2, q, txq, dev); return rc; } #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) static void skb_update_prio(struct sk_buff *skb) { const struct netprio_map *map; const struct sock *sk; unsigned int prioidx; if (skb->priority) return; map = rcu_dereference_bh(skb->dev->priomap); if (!map) return; sk = skb_to_full_sk(skb); if (!sk) return; prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data); if (prioidx < map->priomap_len) skb->priority = map->priomap[prioidx]; } #else #define skb_update_prio(skb) #endif /** * dev_loopback_xmit - loop back @skb * @net: network namespace this loopback is happening in * @sk: sk needed to be a netfilter okfn * @skb: buffer to transmit */ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_reset_mac_header(skb); __skb_pull(skb, skb_network_offset(skb)); skb->pkt_type = PACKET_LOOPBACK; if (skb->ip_summed == CHECKSUM_NONE) skb->ip_summed = CHECKSUM_UNNECESSARY; DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb)); skb_dst_force(skb); netif_rx(skb); return 0; } EXPORT_SYMBOL(dev_loopback_xmit); #ifdef CONFIG_NET_EGRESS static struct netdev_queue * netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) { int qm = skb_get_queue_mapping(skb); return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); } #ifndef CONFIG_PREEMPT_RT static bool netdev_xmit_txqueue_skipped(void) { return __this_cpu_read(softnet_data.xmit.skip_txqueue); } void netdev_xmit_skip_txqueue(bool skip) { __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); } EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); #else static bool netdev_xmit_txqueue_skipped(void) { return current->net_xmit.skip_txqueue; } void 
netdev_xmit_skip_txqueue(bool skip) { current->net_xmit.skip_txqueue = skip; } EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); #endif #endif /* CONFIG_NET_EGRESS */ #ifdef CONFIG_NET_XGRESS static int tc_run(struct tcx_entry *entry, struct sk_buff *skb, enum skb_drop_reason *drop_reason) { int ret = TC_ACT_UNSPEC; #ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq); struct tcf_result res; if (!miniq) return ret; /* Global bypass */ if (!static_branch_likely(&tcf_sw_enabled_key)) return ret; /* Block-wise bypass */ if (tcf_block_bypass_sw(miniq->block)) return ret; tc_skb_cb(skb)->mru = 0; qdisc_skb_cb(skb)->post_ct = false; tcf_set_drop_reason(skb, *drop_reason); mini_qdisc_bstats_cpu_update(miniq, skb); ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false); /* Only tcf related quirks below. */ switch (ret) { case TC_ACT_SHOT: *drop_reason = tcf_get_drop_reason(skb); mini_qdisc_qstats_cpu_drop(miniq); break; case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(res.classid); break; } #endif /* CONFIG_NET_CLS_ACT */ return ret; } static DEFINE_STATIC_KEY_FALSE(tcx_needed_key); void tcx_inc(void) { static_branch_inc(&tcx_needed_key); } void tcx_dec(void) { static_branch_dec(&tcx_needed_key); } static __always_inline enum tcx_action_base tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb, const bool needs_mac) { const struct bpf_mprog_fp *fp; const struct bpf_prog *prog; int ret = TCX_NEXT; if (needs_mac) __skb_push(skb, skb->mac_len); bpf_mprog_foreach_prog(entry, fp, prog) { bpf_compute_data_pointers(skb); ret = bpf_prog_run(prog, skb); if (ret != TCX_NEXT) break; } if (needs_mac) __skb_pull(skb, skb->mac_len); return tcx_action_code(skb, ret); } static __always_inline struct sk_buff * sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev, bool *another) { struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress); enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS; struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int sch_ret; if (!entry) return skb; bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (unlikely(*pt_prev)) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } qdisc_pkt_len_segs_init(skb); tcx_set_ingress(skb, true); if (static_branch_unlikely(&tcx_needed_key)) { sch_ret = tcx_run(entry, skb, true); if (sch_ret != TC_ACT_UNSPEC) goto ingress_verdict; } sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason); ingress_verdict: switch (sch_ret) { case TC_ACT_REDIRECT: /* skb_mac_header check was done by BPF, so we can safely * push the L2 header back before redirecting to another * netdev. 
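 * (A return of -EAGAIN from skb_do_redirect() below means the program
 * retargeted the skb at another device's ingress path, e.g. via
 * bpf_redirect_peer(), so the L2 header is pulled again and *another makes
 * the caller restart receive processing on the new skb->dev.)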
*/ __skb_push(skb, skb->mac_len); if (skb_do_redirect(skb) == -EAGAIN) { __skb_pull(skb, skb->mac_len); *another = true; break; } *ret = NET_RX_SUCCESS; bpf_net_ctx_clear(bpf_net_ctx); return NULL; case TC_ACT_SHOT: kfree_skb_reason(skb, drop_reason); *ret = NET_RX_DROP; bpf_net_ctx_clear(bpf_net_ctx); return NULL; /* used by tc_run */ case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: consume_skb(skb); fallthrough; case TC_ACT_CONSUMED: *ret = NET_RX_SUCCESS; bpf_net_ctx_clear(bpf_net_ctx); return NULL; } bpf_net_ctx_clear(bpf_net_ctx); return skb; } static __always_inline struct sk_buff * sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress); enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS; struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int sch_ret; if (!entry) return skb; bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); /* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was * already set by the caller. */ if (static_branch_unlikely(&tcx_needed_key)) { sch_ret = tcx_run(entry, skb, false); if (sch_ret != TC_ACT_UNSPEC) goto egress_verdict; } sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason); egress_verdict: switch (sch_ret) { case TC_ACT_REDIRECT: /* No need to push/pop skb's mac_header here on egress! */ skb_do_redirect(skb); *ret = NET_XMIT_SUCCESS; bpf_net_ctx_clear(bpf_net_ctx); return NULL; case TC_ACT_SHOT: kfree_skb_reason(skb, drop_reason); *ret = NET_XMIT_DROP; bpf_net_ctx_clear(bpf_net_ctx); return NULL; /* used by tc_run */ case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: consume_skb(skb); fallthrough; case TC_ACT_CONSUMED: *ret = NET_XMIT_SUCCESS; bpf_net_ctx_clear(bpf_net_ctx); return NULL; } bpf_net_ctx_clear(bpf_net_ctx); return skb; } #else static __always_inline struct sk_buff * sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev, bool *another) { return skb; } static __always_inline struct sk_buff * sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { return skb; } #endif /* CONFIG_NET_XGRESS */ #ifdef CONFIG_XPS static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, struct xps_dev_maps *dev_maps, unsigned int tci) { int tc = netdev_get_prio_tc_map(dev, skb->priority); struct xps_map *map; int queue_index = -1; if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids) return queue_index; tci *= dev_maps->num_tc; tci += tc; map = rcu_dereference(dev_maps->attr_map[tci]); if (map) { if (map->len == 1) queue_index = map->queues[0]; else queue_index = map->queues[reciprocal_scale( skb_get_hash(skb), map->len)]; if (unlikely(queue_index >= dev->real_num_tx_queues)) queue_index = -1; } return queue_index; } #endif static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, struct sk_buff *skb) { #ifdef CONFIG_XPS struct xps_dev_maps *dev_maps; struct sock *sk = skb->sk; int queue_index = -1; if (!static_key_false(&xps_needed)) return -1; rcu_read_lock(); if (!static_key_false(&xps_rxqs_needed)) goto get_cpus_map; dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]); if (dev_maps) { int tci = sk_rx_queue_get(sk); if (tci >= 0) queue_index = __get_xps_queue_idx(dev, skb, dev_maps, tci); } get_cpus_map: if (queue_index < 0) { dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]); if (dev_maps) { unsigned int tci = skb->sender_cpu - 1; queue_index = __get_xps_queue_idx(dev, skb, dev_maps, tci); } } rcu_read_unlock(); return queue_index; #else 
return -1; #endif } u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { return 0; } EXPORT_SYMBOL(dev_pick_tx_zero); int sk_tx_queue_get(const struct sock *sk) { int resel, val; if (!sk) return -1; /* Paired with WRITE_ONCE() in sk_tx_queue_clear() * and sk_tx_queue_set(). */ val = READ_ONCE(sk->sk_tx_queue_mapping); if (val == NO_QUEUE_MAPPING) return -1; if (!sk_fullsock(sk)) return val; resel = READ_ONCE(sock_net(sk)->core.sysctl_txq_reselection); if (resel && time_is_before_jiffies( READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + resel)) return -1; return val; } EXPORT_SYMBOL(sk_tx_queue_get); u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { struct sock *sk = skb->sk; int queue_index = sk_tx_queue_get(sk); sb_dev = sb_dev ? : dev; if (queue_index < 0 || skb->ooo_okay || queue_index >= dev->real_num_tx_queues) { int new_index = get_xps_queue(dev, sb_dev, skb); if (new_index < 0) new_index = skb_tx_hash(dev, sb_dev, skb); if (sk && sk_fullsock(sk) && rcu_access_pointer(sk->sk_dst_cache)) sk_tx_queue_set(sk, new_index); queue_index = new_index; } return queue_index; } EXPORT_SYMBOL(netdev_pick_tx); struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { int queue_index = 0; #ifdef CONFIG_XPS u32 sender_cpu = skb->sender_cpu - 1; if (sender_cpu >= (u32)NR_CPUS) skb->sender_cpu = raw_smp_processor_id() + 1; #endif if (dev->real_num_tx_queues != 1) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_select_queue) queue_index = ops->ndo_select_queue(dev, skb, sb_dev); else queue_index = netdev_pick_tx(dev, skb, sb_dev); queue_index = netdev_cap_txqueue(dev, queue_index); } skb_set_queue_mapping(skb, queue_index); return netdev_get_tx_queue(dev, queue_index); } /** * __dev_queue_xmit() - transmit a buffer * @skb: buffer to transmit * @sb_dev: suboordinate device used for L2 forwarding offload * * Queue a buffer for transmission to a network device. The caller must * have set the device and priority and built the buffer before calling * this function. The function can be called from an interrupt. * * When calling this method, interrupts MUST be enabled. This is because * the BH enable code must have IRQs enabled so that it will not deadlock. * * Regardless of the return value, the skb is consumed, so it is currently * difficult to retry a send to this method. (You can bump the ref count * before sending to hold a reference for retry if you are careful.) * * Return: * * 0 - buffer successfully transmitted * * positive qdisc return code - NET_XMIT_DROP etc. * * negative errno - other errors */ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) { struct net_device *dev = skb->dev; struct netdev_queue *txq = NULL; enum skb_drop_reason reason; int cpu, rc = -ENOMEM; bool again = false; struct Qdisc *q; skb_reset_mac_header(skb); skb_assert_len(skb); if (unlikely(skb_shinfo(skb)->tx_flags & (SKBTX_SCHED_TSTAMP | SKBTX_BPF))) __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED); reason = qdisc_pkt_len_segs_init(skb); if (unlikely(reason)) { dev_core_stats_tx_dropped_inc(dev); kfree_skb_reason(skb, reason); return -EINVAL; } /* Disable soft irqs for various locks below. Also * stops preemption for RCU. 
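 * (This is also what makes the later smp_processor_id() safe and keeps the
 * dev_xmit_recursion() accounting from racing with a softirq transmitting
 * on the same CPU.)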
*/ rcu_read_lock_bh(); skb_update_prio(skb); tcx_set_ingress(skb, false); #ifdef CONFIG_NET_EGRESS if (static_branch_unlikely(&egress_needed_key)) { if (nf_hook_egress_active()) { skb = nf_hook_egress(skb, &rc, dev); if (!skb) goto out; } netdev_xmit_skip_txqueue(false); nf_skip_egress(skb, true); skb = sch_handle_egress(skb, &rc, dev); if (!skb) goto out; nf_skip_egress(skb, false); if (netdev_xmit_txqueue_skipped()) txq = netdev_tx_queue_mapping(dev, skb); } #endif /* If device/qdisc don't need skb->dst, release it right now while * its hot in this cpu cache. */ if (dev->priv_flags & IFF_XMIT_DST_RELEASE) skb_dst_drop(skb); else skb_dst_force(skb); if (!txq) txq = netdev_core_pick_tx(dev, skb, sb_dev); q = rcu_dereference_bh(txq->qdisc); trace_net_dev_queue(skb); if (q->enqueue) { rc = __dev_xmit_skb(skb, q, dev, txq); goto out; } /* The device has no queue. Common case for software devices: * loopback, all the sorts of tunnels... * Really, it is unlikely that netif_tx_lock protection is necessary * here. (f.e. loopback and IP tunnels are clean ignoring statistics * counters.) * However, it is possible, that they rely on protection * made by us here. * Check this and shot the lock. It is not prone from deadlocks. *Either shot noqueue qdisc, it is even simpler 8) */ if (unlikely(!(dev->flags & IFF_UP))) { reason = SKB_DROP_REASON_DEV_READY; goto drop; } cpu = smp_processor_id(); /* ok because BHs are off */ if (likely(!netif_tx_owned(txq, cpu))) { bool is_list = false; if (dev_xmit_recursion()) goto recursion_alert; skb = validate_xmit_skb(skb, dev, &again); if (!skb) goto out; HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_stopped(txq)) { is_list = !!skb->next; dev_xmit_recursion_inc(); skb = dev_hard_start_xmit(skb, dev, txq, &rc); dev_xmit_recursion_dec(); /* GSO segments a single SKB into a list of frames. * TCP expects error to mean none of the data was sent. */ if (is_list) rc = NETDEV_TX_OK; } HARD_TX_UNLOCK(dev, txq); if (!skb) /* xmit completed */ goto out; net_crit_ratelimited("Virtual device %s asks to queue packet!\n", dev->name); /* NETDEV_TX_BUSY or queue was stopped */ if (!is_list) rc = -ENETDOWN; } else { /* Recursion is detected! It is possible unfortunately. 
*/ recursion_alert: net_crit_ratelimited("Dead loop on virtual device %s (net %llu), fix it urgently!\n", dev->name, dev_net(dev)->net_cookie); rc = -ENETDOWN; } reason = SKB_DROP_REASON_RECURSION_LIMIT; drop: rcu_read_unlock_bh(); dev_core_stats_tx_dropped_inc(dev); kfree_skb_list_reason(skb, reason); return rc; out: rcu_read_unlock_bh(); return rc; } EXPORT_SYMBOL(__dev_queue_xmit); int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) { struct net_device *dev = skb->dev; struct sk_buff *orig_skb = skb; struct netdev_queue *txq; int ret = NETDEV_TX_BUSY; bool again = false; if (unlikely(!netif_running(dev) || !netif_carrier_ok(dev))) goto drop; skb = validate_xmit_skb_list(skb, dev, &again); if (skb != orig_skb) goto drop; skb_set_queue_mapping(skb, queue_id); txq = skb_get_tx_queue(dev, skb); local_bh_disable(); dev_xmit_recursion_inc(); HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_drv_stopped(txq)) ret = netdev_start_xmit(skb, dev, txq, false); HARD_TX_UNLOCK(dev, txq); dev_xmit_recursion_dec(); local_bh_enable(); return ret; drop: dev_core_stats_tx_dropped_inc(dev); kfree_skb_list(skb); return NET_XMIT_DROP; } EXPORT_SYMBOL(__dev_direct_xmit); /************************************************************************* * Receiver routines *************************************************************************/ static DEFINE_PER_CPU(struct task_struct *, backlog_napi); int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ /* Called with irq disabled */ static inline void ____napi_schedule(struct softnet_data *sd, struct napi_struct *napi) { struct task_struct *thread; lockdep_assert_irqs_disabled(); if (test_bit(NAPI_STATE_THREADED, &napi->state)) { /* Paired with smp_mb__before_atomic() in * napi_enable()/netif_set_threaded(). * Use READ_ONCE() to guarantee a complete * read on napi->thread. Only call * wake_up_process() when it's not NULL. */ thread = READ_ONCE(napi->thread); if (thread) { if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi)) goto use_local_napi; set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); wake_up_process(thread); return; } } use_local_napi: DEBUG_NET_WARN_ON_ONCE(!list_empty(&napi->poll_list)); list_add_tail(&napi->poll_list, &sd->poll_list); WRITE_ONCE(napi->list_owner, smp_processor_id()); /* If not called from net_rx_action() * we have to raise NET_RX_SOFTIRQ. */ if (!sd->in_net_rx_action) raise_softirq_irqoff(NET_RX_SOFTIRQ); } #ifdef CONFIG_RPS struct static_key_false rps_needed __read_mostly; EXPORT_SYMBOL(rps_needed); struct static_key_false rfs_needed __read_mostly; EXPORT_SYMBOL(rfs_needed); static u32 rfs_slot(u32 hash, rps_tag_ptr tag_ptr) { return hash_32(hash, rps_tag_to_log(tag_ptr)); } #ifdef CONFIG_RFS_ACCEL /** * rps_flow_is_active - check whether the flow is recently active. * @rflow: Specific flow to check activity. * @log: ilog2(hashsize). * @cpu: CPU saved in @rflow. * * If the CPU has processed many packets since the flow's last activity * (beyond 10 times the table size), the flow is considered stale. * * Return: true if flow was recently active. 
*/ static bool rps_flow_is_active(struct rps_dev_flow *rflow, u8 log, unsigned int cpu) { unsigned int flow_last_active; unsigned int sd_input_head; if (cpu >= nr_cpu_ids) return false; sd_input_head = READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head); flow_last_active = READ_ONCE(rflow->last_qtail); return (int)(sd_input_head - flow_last_active) < (int)(10 << log); } #endif static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *rflow, u16 next_cpu, u32 hash) { if (next_cpu < nr_cpu_ids) { u32 head; #ifdef CONFIG_RFS_ACCEL struct netdev_rx_queue *rxqueue; struct rps_dev_flow *flow_table; struct rps_dev_flow *old_rflow; struct rps_dev_flow *tmp_rflow; rps_tag_ptr q_tag_ptr; unsigned int tmp_cpu; u16 rxq_index; u32 flow_id; int rc; /* Should we steer this flow to a different hardware queue? */ if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || !(dev->features & NETIF_F_NTUPLE)) goto out; rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); if (rxq_index == skb_get_rx_queue(skb)) goto out; rxqueue = dev->_rx + rxq_index; q_tag_ptr = READ_ONCE(rxqueue->rps_flow_table); if (!q_tag_ptr) goto out; flow_id = rfs_slot(hash, q_tag_ptr); flow_table = rps_tag_to_table(q_tag_ptr); tmp_rflow = flow_table + flow_id; tmp_cpu = READ_ONCE(tmp_rflow->cpu); if (READ_ONCE(tmp_rflow->filter) != RPS_NO_FILTER) { if (rps_flow_is_active(tmp_rflow, rps_tag_to_log(q_tag_ptr), tmp_cpu)) { if (hash != READ_ONCE(tmp_rflow->hash) || next_cpu == tmp_cpu) goto out; } } rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); if (rc < 0) goto out; old_rflow = rflow; rflow = tmp_rflow; WRITE_ONCE(rflow->filter, rc); WRITE_ONCE(rflow->hash, hash); if (old_rflow->filter == rc) WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER); out: #endif head = READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head); rps_input_queue_tail_save(&rflow->last_qtail, head); } WRITE_ONCE(rflow->cpu, next_cpu); return rflow; } /* * get_rps_cpu is called from netif_receive_skb and returns the target * CPU from the RPS map of the receiving queue for a given skb. * rcu_read_lock must be held on entry. */ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow **rflowp) { struct netdev_rx_queue *rxqueue = dev->_rx; rps_tag_ptr global_tag_ptr, q_tag_ptr; struct rps_map *map; int cpu = -1; u32 tcpu; u32 hash; if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); if (unlikely(index >= dev->real_num_rx_queues)) { WARN_ONCE(dev->real_num_rx_queues > 1, "%s received packet on queue %u, but number " "of RX queues is %u\n", dev->name, index, dev->real_num_rx_queues); goto done; } rxqueue += index; } /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ q_tag_ptr = READ_ONCE(rxqueue->rps_flow_table); map = rcu_dereference(rxqueue->rps_map); if (!q_tag_ptr && !map) goto done; skb_reset_network_header(skb); hash = skb_get_hash(skb); if (!hash) goto done; global_tag_ptr = READ_ONCE(net_hotdata.rps_sock_flow_table); if (q_tag_ptr && global_tag_ptr) { struct rps_sock_flow_table *sock_flow_table; struct rps_dev_flow *flow_table; struct rps_dev_flow *rflow; u32 next_cpu; u32 flow_id; u32 ident; /* First check into global flow table if there is a match. * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow(). 
*/ flow_id = hash & rps_tag_to_mask(global_tag_ptr); sock_flow_table = rps_tag_to_table(global_tag_ptr); ident = READ_ONCE(sock_flow_table[flow_id].ent); if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask) goto try_rps; next_cpu = ident & net_hotdata.rps_cpu_mask; /* OK, now we know there is a match, * we can look at the local (per receive queue) flow table */ flow_id = rfs_slot(hash, q_tag_ptr); flow_table = rps_tag_to_table(q_tag_ptr); rflow = flow_table + flow_id; tcpu = rflow->cpu; /* * If the desired CPU (where last recvmsg was done) is * different from current CPU (one in the rx-queue flow * table entry), switch if one of the following holds: * - Current CPU is unset (>= nr_cpu_ids). * - Current CPU is offline. * - The current CPU's queue tail has advanced beyond the * last packet that was enqueued using this table entry. * This guarantees that all previous packets for the flow * have been dequeued, thus preserving in order delivery. */ if (unlikely(tcpu != next_cpu) && (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) - rflow->last_qtail)) >= 0)) { tcpu = next_cpu; rflow = set_rps_cpu(dev, skb, rflow, next_cpu, hash); } if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { *rflowp = rflow; cpu = tcpu; goto done; } } try_rps: if (map) { tcpu = map->cpus[reciprocal_scale(hash, map->len)]; if (cpu_online(tcpu)) { cpu = tcpu; goto done; } } done: return cpu; } #ifdef CONFIG_RFS_ACCEL /** * rps_may_expire_flow - check whether an RFS hardware filter may be removed * @dev: Device on which the filter was set * @rxq_index: RX queue index * @flow_id: Flow ID passed to ndo_rx_flow_steer() * @filter_id: Filter ID returned by ndo_rx_flow_steer() * * Drivers that implement ndo_rx_flow_steer() should periodically call * this function for each installed filter and remove the filters for * which it returns %true. */ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id) { struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; struct rps_dev_flow *flow_table; struct rps_dev_flow *rflow; rps_tag_ptr q_tag_ptr; bool expire = true; u8 log; rcu_read_lock(); q_tag_ptr = READ_ONCE(rxqueue->rps_flow_table); log = rps_tag_to_log(q_tag_ptr); if (q_tag_ptr && flow_id < (1UL << log)) { unsigned int cpu; flow_table = rps_tag_to_table(q_tag_ptr); rflow = flow_table + flow_id; cpu = READ_ONCE(rflow->cpu); if (READ_ONCE(rflow->filter) == filter_id && rps_flow_is_active(rflow, log, cpu)) expire = false; } rcu_read_unlock(); return expire; } EXPORT_SYMBOL(rps_may_expire_flow); #endif /* CONFIG_RFS_ACCEL */ /* Called from hardirq (IPI) context */ static void rps_trigger_softirq(void *data) { struct softnet_data *sd = data; ____napi_schedule(sd, &sd->backlog); /* Pairs with READ_ONCE() in softnet_seq_show() */ WRITE_ONCE(sd->received_rps, sd->received_rps + 1); } #endif /* CONFIG_RPS */ /* Called from hardirq (IPI) context */ static void trigger_rx_softirq(void *data) { struct softnet_data *sd = data; __raise_softirq_irqoff(NET_RX_SOFTIRQ); smp_store_release(&sd->defer_ipi_scheduled, 0); } /* * After we queued a packet into sd->input_pkt_queue, * we need to make sure this queue is serviced soon. * * - If this is another cpu queue, link it to our rps_ipi_list, * and make sure we will process rps_ipi_list from net_rx_action(). * * - If this is our own queue, NAPI schedule our backlog. * Note that this also raises NET_RX_SOFTIRQ. 
*/ static void napi_schedule_rps(struct softnet_data *sd) { struct softnet_data *mysd = this_cpu_ptr(&softnet_data); #ifdef CONFIG_RPS if (sd != mysd) { if (use_backlog_threads()) { __napi_schedule_irqoff(&sd->backlog); return; } sd->rps_ipi_next = mysd->rps_ipi_list; mysd->rps_ipi_list = sd; /* If not called from net_rx_action() or napi_threaded_poll() * we have to raise NET_RX_SOFTIRQ. */ if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll) __raise_softirq_irqoff(NET_RX_SOFTIRQ); return; } #endif /* CONFIG_RPS */ __napi_schedule_irqoff(&mysd->backlog); } void kick_defer_list_purge(unsigned int cpu) { struct softnet_data *sd = &per_cpu(softnet_data, cpu); unsigned long flags; if (use_backlog_threads()) { backlog_lock_irq_save(sd, &flags); if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) __napi_schedule_irqoff(&sd->backlog); backlog_unlock_irq_restore(sd, flags); } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) { smp_call_function_single_async(cpu, &sd->defer_csd); } } #ifdef CONFIG_NET_FLOW_LIMIT int netdev_flow_limit_table_len __read_mostly = (1 << 12); #endif static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen, int max_backlog) { #ifdef CONFIG_NET_FLOW_LIMIT unsigned int old_flow, new_flow; const struct softnet_data *sd; struct sd_flow_limit *fl; if (likely(qlen < (max_backlog >> 1))) return false; sd = this_cpu_ptr(&softnet_data); rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); if (fl) { new_flow = hash_32(skb_get_hash(skb), fl->log_buckets); old_flow = fl->history[fl->history_head]; fl->history[fl->history_head] = new_flow; fl->history_head++; fl->history_head &= FLOW_LIMIT_HISTORY - 1; if (likely(fl->buckets[old_flow])) fl->buckets[old_flow]--; if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { /* Pairs with READ_ONCE() in softnet_seq_show() */ WRITE_ONCE(fl->count, fl->count + 1); rcu_read_unlock(); return true; } } rcu_read_unlock(); #endif return false; } /* * enqueue_to_backlog is called to queue an skb to a per CPU backlog * queue (may be a remote CPU queue). */ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, unsigned int *qtail) { enum skb_drop_reason reason; struct softnet_data *sd; unsigned long flags; unsigned int qlen; int max_backlog; u32 tail; reason = SKB_DROP_REASON_DEV_READY; if (unlikely(!netif_running(skb->dev))) goto bad_dev; sd = &per_cpu(softnet_data, cpu); qlen = skb_queue_len_lockless(&sd->input_pkt_queue); max_backlog = READ_ONCE(net_hotdata.max_backlog); if (unlikely(qlen > max_backlog) || skb_flow_limit(skb, qlen, max_backlog)) goto cpu_backlog_drop; backlog_lock_irq_save(sd, &flags); qlen = skb_queue_len(&sd->input_pkt_queue); if (likely(qlen <= max_backlog)) { if (!qlen) { /* Schedule NAPI for backlog device. We can use * non atomic operation as we own the queue lock. 
*/ if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) napi_schedule_rps(sd); } __skb_queue_tail(&sd->input_pkt_queue, skb); tail = rps_input_queue_tail_incr(sd); backlog_unlock_irq_restore(sd, flags); /* save the tail outside of the critical section */ rps_input_queue_tail_save(qtail, tail); return NET_RX_SUCCESS; } backlog_unlock_irq_restore(sd, flags); cpu_backlog_drop: reason = SKB_DROP_REASON_CPU_BACKLOG; numa_drop_add(&sd->drop_counters, 1); bad_dev: dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb_reason(skb, reason); return NET_RX_DROP; } static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) { struct net_device *dev = skb->dev; struct netdev_rx_queue *rxqueue; rxqueue = dev->_rx; if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); if (unlikely(index >= dev->real_num_rx_queues)) { WARN_ONCE(dev->real_num_rx_queues > 1, "%s received packet on queue %u, but number " "of RX queues is %u\n", dev->name, index, dev->real_num_rx_queues); return rxqueue; /* Return first rxqueue */ } rxqueue += index; } return rxqueue; } u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog) { void *orig_data, *orig_data_end, *hard_start; struct netdev_rx_queue *rxqueue; bool orig_bcast, orig_host; u32 mac_len, frame_sz; __be16 orig_eth_type; struct ethhdr *eth; u32 metalen, act; int off; /* The XDP program wants to see the packet starting at the MAC * header. */ mac_len = skb->data - skb_mac_header(skb); hard_start = skb->data - skb_headroom(skb); /* SKB "head" area always have tailroom for skb_shared_info */ frame_sz = (void *)skb_end_pointer(skb) - hard_start; frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); rxqueue = netif_get_rxqueue(skb); xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, skb_headlen(skb) + mac_len, true); if (skb_is_nonlinear(skb)) { skb_shinfo(skb)->xdp_frags_size = skb->data_len; xdp_buff_set_frags_flag(xdp); } else { xdp_buff_clear_frags_flag(xdp); } orig_data_end = xdp->data_end; orig_data = xdp->data; eth = (struct ethhdr *)xdp->data; orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr); orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); orig_eth_type = eth->h_proto; act = bpf_prog_run_xdp(xdp_prog, xdp); /* check if bpf_xdp_adjust_head was used */ off = xdp->data - orig_data; if (off) { if (off > 0) __skb_pull(skb, off); else if (off < 0) __skb_push(skb, -off); skb->mac_header += off; skb_reset_network_header(skb); } /* check if bpf_xdp_adjust_tail was used */ off = xdp->data_end - orig_data_end; if (off != 0) { skb_set_tail_pointer(skb, xdp->data_end - xdp->data); skb->len += off; /* positive on grow, negative on shrink */ } /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. */ if (xdp_buff_has_frags(xdp)) skb->data_len = skb_shinfo(skb)->xdp_frags_size; else skb->data_len = 0; /* check if XDP changed eth hdr such SKB needs update */ eth = (struct ethhdr *)xdp->data; if ((orig_eth_type != eth->h_proto) || (orig_host != ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr)) || (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { __skb_push(skb, ETH_HLEN); skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); } /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull * before calling us again on redirect path. 
We do not call do_redirect * as we leave that up to the caller. * * Caller is responsible for managing lifetime of skb (i.e. calling * kfree_skb in response to actions it cannot handle/XDP_DROP). */ switch (act) { case XDP_REDIRECT: case XDP_TX: __skb_push(skb, mac_len); break; case XDP_PASS: metalen = xdp->data - xdp->data_meta; if (metalen) skb_metadata_set(skb, metalen); break; } return act; } static int netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog) { struct sk_buff *skb = *pskb; int err, hroom, troom; local_lock_nested_bh(&system_page_pool.bh_lock); err = skb_cow_data_for_xdp(this_cpu_read(system_page_pool.pool), pskb, prog); local_unlock_nested_bh(&system_page_pool.bh_lock); if (!err) return 0; /* In case we have to go down the path and also linearize, * then lets do the pskb_expand_head() work just once here. */ hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); troom = skb->tail + skb->data_len - skb->end; err = pskb_expand_head(skb, hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, troom > 0 ? troom + 128 : 0, GFP_ATOMIC); if (err) return err; return skb_linearize(skb); } static u32 netif_receive_generic_xdp(struct sk_buff **pskb, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog) { struct sk_buff *skb = *pskb; u32 mac_len, act = XDP_DROP; /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. */ if (skb_is_redirected(skb)) return XDP_PASS; /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM * bytes. This is the guarantee that also native XDP provides, * thus we need to do it here as well. */ mac_len = skb->data - skb_mac_header(skb); __skb_push(skb, mac_len); if (skb_cloned(skb) || skb_is_nonlinear(skb) || skb_headroom(skb) < XDP_PACKET_HEADROOM) { if (netif_skb_check_for_xdp(pskb, xdp_prog)) goto do_drop; } __skb_pull(*pskb, mac_len); act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog); switch (act) { case XDP_REDIRECT: case XDP_TX: case XDP_PASS: break; default: bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception((*pskb)->dev, xdp_prog, act); fallthrough; case XDP_DROP: do_drop: kfree_skb(*pskb); break; } return act; } /* When doing generic XDP we have to bypass the qdisc layer and the * network taps in order to match in-driver-XDP behavior. This also means * that XDP packets are able to starve other packets going through a qdisc, * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX * queues, so they do not have this starvation issue. 
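 * (generic_xdp_tx() below therefore takes the selected txq lock and calls
 * netdev_start_xmit() directly; if the queue is stopped or the driver
 * rejects the frame, the skb is dropped and counted as a tx drop rather
 * than requeued.)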
*/ void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog) { struct net_device *dev = skb->dev; struct netdev_queue *txq; bool free_skb = true; int cpu, rc; txq = netdev_core_pick_tx(dev, skb, NULL); cpu = smp_processor_id(); HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_frozen_or_drv_stopped(txq)) { rc = netdev_start_xmit(skb, dev, txq, 0); if (dev_xmit_complete(rc)) free_skb = false; } HARD_TX_UNLOCK(dev, txq); if (free_skb) { trace_xdp_exception(dev, xdp_prog, XDP_TX); dev_core_stats_tx_dropped_inc(dev); kfree_skb(skb); } } static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; if (xdp_prog) { struct xdp_buff xdp; u32 act; int err; bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog); if (act != XDP_PASS) { switch (act) { case XDP_REDIRECT: err = xdp_do_generic_redirect((*pskb)->dev, *pskb, &xdp, xdp_prog); if (err) goto out_redir; break; case XDP_TX: generic_xdp_tx(*pskb, xdp_prog); break; } bpf_net_ctx_clear(bpf_net_ctx); return XDP_DROP; } bpf_net_ctx_clear(bpf_net_ctx); } return XDP_PASS; out_redir: bpf_net_ctx_clear(bpf_net_ctx); kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP); return XDP_DROP; } EXPORT_SYMBOL_GPL(do_xdp_generic); static int netif_rx_internal(struct sk_buff *skb) { int ret; net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); trace_netif_rx(skb); #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; rcu_read_lock(); cpu = get_rps_cpu(skb->dev, skb, &rflow); if (cpu < 0) cpu = smp_processor_id(); ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); } else #endif { unsigned int qtail; ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail); } return ret; } /** * __netif_rx - Slightly optimized version of netif_rx * @skb: buffer to post * * This behaves as netif_rx except that it does not disable bottom halves. * As a result this function may only be invoked from the interrupt context * (either hard or soft interrupt). */ int __netif_rx(struct sk_buff *skb) { int ret; lockdep_assert_once(hardirq_count() | softirq_count()); trace_netif_rx_entry(skb); ret = netif_rx_internal(skb); trace_netif_rx_exit(ret); return ret; } EXPORT_SYMBOL(__netif_rx); /** * netif_rx - post buffer to the network code * @skb: buffer to post * * This function receives a packet from a device driver and queues it for * the upper (protocol) levels to process via the backlog NAPI device. It * always succeeds. The buffer may be dropped during processing for * congestion control or by the protocol layers. * The network buffer is passed via the backlog NAPI device. Modern NIC * driver should use NAPI and GRO. * This function can used from interrupt and from process context. The * caller from process context must not disable interrupts before invoking * this function. 
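 * Callers that already run in hard or soft interrupt context can use
 * __netif_rx() instead and skip the local_bh_disable()/enable() pair that
 * netif_rx() only needs for process context.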
* * return values: * NET_RX_SUCCESS (no congestion) * NET_RX_DROP (packet was dropped) * */ int netif_rx(struct sk_buff *skb) { bool need_bh_off = !(hardirq_count() | softirq_count()); int ret; if (need_bh_off) local_bh_disable(); trace_netif_rx_entry(skb); ret = netif_rx_internal(skb); trace_netif_rx_exit(ret); if (need_bh_off) local_bh_enable(); return ret; } EXPORT_SYMBOL(netif_rx); static __latent_entropy void net_tx_action(void) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); if (sd->completion_queue) { struct sk_buff *clist; local_irq_disable(); clist = sd->completion_queue; sd->completion_queue = NULL; local_irq_enable(); while (clist) { struct sk_buff *skb = clist; clist = clist->next; WARN_ON(refcount_read(&skb->users)); if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED)) trace_consume_skb(skb, net_tx_action); else trace_kfree_skb(skb, net_tx_action, get_kfree_skb_cb(skb)->reason, NULL); if (skb->fclone != SKB_FCLONE_UNAVAILABLE) __kfree_skb(skb); else __napi_kfree_skb(skb, get_kfree_skb_cb(skb)->reason); } } if (sd->output_queue) { struct Qdisc *head; local_irq_disable(); head = sd->output_queue; sd->output_queue = NULL; sd->output_queue_tailp = &sd->output_queue; local_irq_enable(); rcu_read_lock(); while (head) { spinlock_t *root_lock = NULL; struct sk_buff *to_free; struct Qdisc *q = head; head = head->next_sched; /* We need to make sure head->next_sched is read * before clearing __QDISC_STATE_SCHED */ smp_mb__before_atomic(); if (!(q->flags & TCQ_F_NOLOCK)) { root_lock = qdisc_lock(q); spin_lock(root_lock); } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { /* There is a synchronize_net() between * STATE_DEACTIVATED flag being set and * qdisc_reset()/some_qdisc_is_busy() in * dev_deactivate(), so we can safely bail out * early here to avoid data race between * qdisc_deactivate() and some_qdisc_is_busy() * for lockless qdisc. */ clear_bit(__QDISC_STATE_SCHED, &q->state); continue; } clear_bit(__QDISC_STATE_SCHED, &q->state); to_free = qdisc_run(q); if (root_lock) spin_unlock(root_lock); tcf_kfree_skb_list(to_free, q, NULL, qdisc_dev(q)); } rcu_read_unlock(); } xfrm_dev_backlog(sd); } #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE) /* This hook is defined here for ATM LANE */ int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr) __read_mostly; EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif /** * netdev_is_rx_handler_busy - check if receive handler is registered * @dev: device to check * * Check if a receive handler is already registered for a given device. * Return true if there one. * * The caller must hold the rtnl_mutex. */ bool netdev_is_rx_handler_busy(struct net_device *dev) { ASSERT_RTNL(); return dev && rtnl_dereference(dev->rx_handler); } EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy); /** * netdev_rx_handler_register - register receive handler * @dev: device to register a handler for * @rx_handler: receive handler to register * @rx_handler_data: data pointer that is used by rx handler * * Register a receive handler for a device. This handler will then be * called from __netif_receive_skb. A negative errno code is returned * on a failure. * * The caller must hold the rtnl_mutex. * * For a general description of rx_handler, see enum rx_handler_result. 
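 * Typical users are bridge ports, bonding and team slaves, macvlan and
 * Open vSwitch; a device has a single rx_handler slot, which is why
 * netdev_is_rx_handler_busy() is checked first. Minimal usage sketch
 * (handler and private-data names are illustrative only):
 *
 *	ASSERT_RTNL();
 *	err = netdev_rx_handler_register(port_dev, my_port_rx_handler,
 *					 my_port_priv);
 *	if (err)
 *		return err;
 *	...
 *	netdev_rx_handler_unregister(port_dev);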
*/ int netdev_rx_handler_register(struct net_device *dev, rx_handler_func_t *rx_handler, void *rx_handler_data) { if (netdev_is_rx_handler_busy(dev)) return -EBUSY; if (dev->priv_flags & IFF_NO_RX_HANDLER) return -EINVAL; /* Note: rx_handler_data must be set before rx_handler */ rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); rcu_assign_pointer(dev->rx_handler, rx_handler); return 0; } EXPORT_SYMBOL_GPL(netdev_rx_handler_register); /** * netdev_rx_handler_unregister - unregister receive handler * @dev: device to unregister a handler from * * Unregister a receive handler from a device. * * The caller must hold the rtnl_mutex. */ void netdev_rx_handler_unregister(struct net_device *dev) { ASSERT_RTNL(); RCU_INIT_POINTER(dev->rx_handler, NULL); /* a reader seeing a non NULL rx_handler in a rcu_read_lock() * section has a guarantee to see a non NULL rx_handler_data * as well. */ synchronize_net(); RCU_INIT_POINTER(dev->rx_handler_data, NULL); } EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); /* * Limit the use of PFMEMALLOC reserves to those protocols that implement * the special handling of PFMEMALLOC skbs. */ static bool skb_pfmemalloc_protocol(struct sk_buff *skb) { switch (skb->protocol) { case htons(ETH_P_ARP): case htons(ETH_P_IP): case htons(ETH_P_IPV6): case htons(ETH_P_8021Q): case htons(ETH_P_8021AD): return true; default: return false; } } static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { if (nf_hook_ingress_active(skb)) { int ingress_retval; if (unlikely(*pt_prev)) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } rcu_read_lock(); ingress_retval = nf_hook_ingress(skb); rcu_read_unlock(); return ingress_retval; } return 0; } static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, struct packet_type **ppt_prev) { enum skb_drop_reason drop_reason = SKB_DROP_REASON_UNHANDLED_PROTO; struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; struct sk_buff *skb = *pskb; struct net_device *orig_dev; bool deliver_exact = false; int ret = NET_RX_DROP; __be16 type; net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb); trace_netif_receive_skb(skb); orig_dev = skb->dev; skb_reset_network_header(skb); #if !defined(CONFIG_DEBUG_NET) /* We plan to no longer reset the transport header here. * Give some time to fuzzers and dev build to catch bugs * in network stacks. 
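 * (With CONFIG_DEBUG_NET the fallback below is compiled out, so a path
 * that forgot to set the transport header shows up as a bug instead of
 * being silently papered over.)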
*/ if (!skb_transport_header_was_set(skb)) skb_reset_transport_header(skb); #endif skb_reset_mac_len(skb); pt_prev = NULL; another_round: skb->skb_iif = skb->dev->ifindex; __this_cpu_inc(softnet_data.processed); if (static_branch_unlikely(&generic_xdp_needed_key)) { int ret2; migrate_disable(); ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), &skb); migrate_enable(); if (ret2 != XDP_PASS) { ret = NET_RX_DROP; goto out; } } if (eth_type_vlan(skb->protocol)) { skb = skb_vlan_untag(skb); if (unlikely(!skb)) goto out; } if (skb_skip_tc_classify(skb)) goto skip_classify; if (pfmemalloc) goto skip_taps; list_for_each_entry_rcu(ptype, &dev_net_rcu(skb->dev)->ptype_all, list) { if (unlikely(pt_prev)) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { if (unlikely(pt_prev)) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } skip_taps: #ifdef CONFIG_NET_INGRESS if (static_branch_unlikely(&ingress_needed_key)) { bool another = false; nf_skip_egress(skb, true); skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev, &another); if (another) goto another_round; if (!skb) goto out; nf_skip_egress(skb, false); if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) goto out; } #endif skb_reset_redirect(skb); skip_classify: if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) { drop_reason = SKB_DROP_REASON_PFMEMALLOC; goto drop; } if (skb_vlan_tag_present(skb)) { if (unlikely(pt_prev)) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } if (vlan_do_receive(&skb)) goto another_round; else if (unlikely(!skb)) goto out; } rx_handler = rcu_dereference(skb->dev->rx_handler); if (rx_handler) { if (unlikely(pt_prev)) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } switch (rx_handler(&skb)) { case RX_HANDLER_CONSUMED: ret = NET_RX_SUCCESS; goto out; case RX_HANDLER_ANOTHER: goto another_round; case RX_HANDLER_EXACT: deliver_exact = true; break; case RX_HANDLER_PASS: break; default: BUG(); } } if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) { check_vlan_id: if (skb_vlan_tag_get_id(skb)) { /* Vlan id is non 0 and vlan_do_receive() above couldn't * find vlan device. */ skb->pkt_type = PACKET_OTHERHOST; } else if (eth_type_vlan(skb->protocol)) { /* Outer header is 802.1P with vlan 0, inner header is * 802.1Q or 802.1AD and vlan_do_receive() above could * not find vlan dev for vlan id 0. */ __vlan_hwaccel_clear_tag(skb); skb = skb_vlan_untag(skb); if (unlikely(!skb)) goto out; if (vlan_do_receive(&skb)) /* After stripping off 802.1P header with vlan 0 * vlan dev is found for inner header. */ goto another_round; else if (unlikely(!skb)) goto out; else /* We have stripped outer 802.1P vlan 0 header. * But could not find vlan dev. * check again for vlan id to set OTHERHOST. */ goto check_vlan_id; } /* Note: we might in the future use prio bits * and set skb->priority like in vlan_do_receive() * For the time being, just ignore Priority Code Point */ __vlan_hwaccel_clear_tag(skb); } type = skb->protocol; /* deliver only exact match when indicated */ if (likely(!deliver_exact)) { deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &ptype_base[ntohs(type) & PTYPE_HASH_MASK]); /* orig_dev and skb->dev could belong to different netns; * Even in such case we need to traverse only the list * coming from skb->dev, as the ptype owner (packet socket) * will use dev_net(skb->dev) to do namespace filtering. 
*/ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &dev_net_rcu(skb->dev)->ptype_specific); } deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &orig_dev->ptype_specific); if (unlikely(skb->dev != orig_dev)) { deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &skb->dev->ptype_specific); } if (pt_prev) { *ppt_prev = pt_prev; } else { drop: if (!deliver_exact) dev_core_stats_rx_dropped_inc(skb->dev); else dev_core_stats_rx_nohandler_inc(skb->dev); kfree_skb_reason(skb, drop_reason); /* Jamal, now you will not able to escape explaining * me how you were going to use this. :-) */ ret = NET_RX_DROP; } out: /* The invariant here is that if *ppt_prev is not NULL * then skb should also be non-NULL. * * Apparently *ppt_prev assignment above holds this invariant due to * skb dereferencing near it. */ *pskb = skb; return ret; } static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc) { struct net_device *orig_dev = skb->dev; struct packet_type *pt_prev = NULL; int ret; ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); if (pt_prev) ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb, skb->dev, pt_prev, orig_dev); return ret; } /** * netif_receive_skb_core - special purpose version of netif_receive_skb * @skb: buffer to process * * More direct receive version of netif_receive_skb(). It should * only be used by callers that have a need to skip RPS and Generic XDP. * Caller must also take care of handling if ``(page_is_)pfmemalloc``. * * This function may only be called from softirq context and interrupts * should be enabled. * * Return values (usually ignored): * NET_RX_SUCCESS: no congestion * NET_RX_DROP: packet was dropped */ int netif_receive_skb_core(struct sk_buff *skb) { int ret; rcu_read_lock(); ret = __netif_receive_skb_one_core(skb, false); rcu_read_unlock(); return ret; } EXPORT_SYMBOL(netif_receive_skb_core); static inline void __netif_receive_skb_list_ptype(struct list_head *head, struct packet_type *pt_prev, struct net_device *orig_dev) { struct sk_buff *skb, *next; if (!pt_prev) return; if (list_empty(head)) return; if (pt_prev->list_func != NULL) INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv, ip_list_rcv, head, pt_prev, orig_dev); else list_for_each_entry_safe(skb, next, head, list) { skb_list_del_init(skb); pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } } static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc) { /* Fast-path assumptions: * - There is no RX handler. * - Only one packet_type matches. * If either of these fails, we will end up doing some per-packet * processing in-line, then handling the 'last ptype' for the whole * sublist. This can't cause out-of-order delivery to any single ptype, * because the 'last ptype' must be constant across the sublist, and all * other ptypes are handled per-packet. 
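 * Example (illustrative): in a burst of IPv4 packets with a single ARP in
 * the middle, the leading IPv4 run is flushed to ip_list_rcv() when the
 * ARP changes pt_curr, the ARP goes out as a one-element sublist, and the
 * trailing IPv4 run is dispatched by the final flush after the loop.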
*/ /* Current (common) ptype of sublist */ struct packet_type *pt_curr = NULL; /* Current (common) orig_dev of sublist */ struct net_device *od_curr = NULL; struct sk_buff *skb, *next; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *orig_dev = skb->dev; struct packet_type *pt_prev = NULL; skb_list_del_init(skb); __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); if (!pt_prev) continue; if (pt_curr != pt_prev || od_curr != orig_dev) { /* dispatch old sublist */ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); /* start new sublist */ INIT_LIST_HEAD(&sublist); pt_curr = pt_prev; od_curr = orig_dev; } list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); } static int __netif_receive_skb(struct sk_buff *skb) { int ret; if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { unsigned int noreclaim_flag; /* * PFMEMALLOC skbs are special, they should * - be delivered to SOCK_MEMALLOC sockets only * - stay away from userspace * - have bounded memory usage * * Use PF_MEMALLOC as this saves us from propagating the allocation * context down to all allocation sites. */ noreclaim_flag = memalloc_noreclaim_save(); ret = __netif_receive_skb_one_core(skb, true); memalloc_noreclaim_restore(noreclaim_flag); } else ret = __netif_receive_skb_one_core(skb, false); return ret; } static void __netif_receive_skb_list(struct list_head *head) { unsigned long noreclaim_flag = 0; struct sk_buff *skb, *next; bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */ list_for_each_entry_safe(skb, next, head, list) { if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) { struct list_head sublist; /* Handle the previous sublist */ list_cut_before(&sublist, head, &skb->list); if (!list_empty(&sublist)) __netif_receive_skb_list_core(&sublist, pfmemalloc); pfmemalloc = !pfmemalloc; /* See comments in __netif_receive_skb */ if (pfmemalloc) noreclaim_flag = memalloc_noreclaim_save(); else memalloc_noreclaim_restore(noreclaim_flag); } } /* Handle the remaining sublist */ if (!list_empty(head)) __netif_receive_skb_list_core(head, pfmemalloc); /* Restore pflags */ if (pfmemalloc) memalloc_noreclaim_restore(noreclaim_flag); } static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) { struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); struct bpf_prog *new = xdp->prog; int ret = 0; switch (xdp->command) { case XDP_SETUP_PROG: rcu_assign_pointer(dev->xdp_prog, new); if (old) bpf_prog_put(old); if (old && !new) { static_branch_dec(&generic_xdp_needed_key); } else if (new && !old) { static_branch_inc(&generic_xdp_needed_key); netif_disable_lro(dev); dev_disable_gro_hw(dev); } break; default: ret = -EINVAL; break; } return ret; } static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; rcu_read_lock(); #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu = get_rps_cpu(skb->dev, skb, &rflow); if (cpu >= 0) { ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); return ret; } } #endif ret = __netif_receive_skb(skb); rcu_read_unlock(); return ret; } void netif_receive_skb_list_internal(struct list_head *head) { struct sk_buff *skb, *next; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { 
net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); skb_list_del_init(skb); if (!skb_defer_rx_timestamp(skb)) list_add_tail(&skb->list, &sublist); } list_splice_init(&sublist, head); rcu_read_lock(); #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { list_for_each_entry_safe(skb, next, head, list) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu = get_rps_cpu(skb->dev, skb, &rflow); if (cpu >= 0) { /* Will be handled, remove from list */ skb_list_del_init(skb); enqueue_to_backlog(skb, cpu, &rflow->last_qtail); } } } #endif __netif_receive_skb_list(head); rcu_read_unlock(); } /** * netif_receive_skb - process receive buffer from network * @skb: buffer to process * * netif_receive_skb() is the main receive data processing function. * It always succeeds. The buffer may be dropped during processing * for congestion control or by the protocol layers. * * This function may only be called from softirq context and interrupts * should be enabled. * * Return values (usually ignored): * NET_RX_SUCCESS: no congestion * NET_RX_DROP: packet was dropped */ int netif_receive_skb(struct sk_buff *skb) { int ret; trace_netif_receive_skb_entry(skb); ret = netif_receive_skb_internal(skb); trace_netif_receive_skb_exit(ret); return ret; } EXPORT_SYMBOL(netif_receive_skb); /** * netif_receive_skb_list - process many receive buffers from network * @head: list of skbs to process. * * Since return value of netif_receive_skb() is normally ignored, and * wouldn't be meaningful for a list, this function returns void. * * This function may only be called from softirq context and interrupts * should be enabled. */ void netif_receive_skb_list(struct list_head *head) { struct sk_buff *skb; if (list_empty(head)) return; if (trace_netif_receive_skb_list_entry_enabled()) { list_for_each_entry(skb, head, list) trace_netif_receive_skb_list_entry(skb); } netif_receive_skb_list_internal(head); trace_netif_receive_skb_list_exit(0); } EXPORT_SYMBOL(netif_receive_skb_list); /* Network device is going away, flush any packets still pending */ static void flush_backlog(struct work_struct *work) { struct sk_buff *skb, *tmp; struct sk_buff_head list; struct softnet_data *sd; __skb_queue_head_init(&list); local_bh_disable(); sd = this_cpu_ptr(&softnet_data); backlog_lock_irq_disable(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); __skb_queue_tail(&list, skb); rps_input_queue_head_incr(sd); } } backlog_unlock_irq_enable(sd); local_lock_nested_bh(&softnet_data.process_queue_bh_lock); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->process_queue); __skb_queue_tail(&list, skb); rps_input_queue_head_incr(sd); } } local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); local_bh_enable(); __skb_queue_purge_reason(&list, SKB_DROP_REASON_DEV_READY); } static bool flush_required(int cpu) { #if IS_ENABLED(CONFIG_RPS) struct softnet_data *sd = &per_cpu(softnet_data, cpu); bool do_flush; backlog_lock_irq_disable(sd); /* as insertion into process_queue happens with the rps lock held, * process_queue access may race only with dequeue */ do_flush = !skb_queue_empty(&sd->input_pkt_queue) || !skb_queue_empty_lockless(&sd->process_queue); backlog_unlock_irq_enable(sd); return do_flush; #endif /* without RPS we can't safely check input_pkt_queue: during a * concurrent remote skb_queue_splice() we can detect as empty 
both * input_pkt_queue and process_queue even if the latter could end-up * containing a lot of packets. */ return true; } struct flush_backlogs { cpumask_t flush_cpus; struct work_struct w[]; }; static struct flush_backlogs *flush_backlogs_alloc(void) { return kmalloc_flex(struct flush_backlogs, w, nr_cpu_ids); } static struct flush_backlogs *flush_backlogs_fallback; static DEFINE_MUTEX(flush_backlogs_mutex); static void flush_all_backlogs(void) { struct flush_backlogs *ptr = flush_backlogs_alloc(); unsigned int cpu; if (!ptr) { mutex_lock(&flush_backlogs_mutex); ptr = flush_backlogs_fallback; } cpumask_clear(&ptr->flush_cpus); cpus_read_lock(); for_each_online_cpu(cpu) { if (flush_required(cpu)) { INIT_WORK(&ptr->w[cpu], flush_backlog); queue_work_on(cpu, system_highpri_wq, &ptr->w[cpu]); __cpumask_set_cpu(cpu, &ptr->flush_cpus); } } /* we can have in flight packet[s] on the cpus we are not flushing, * synchronize_net() in unregister_netdevice_many() will take care of * them. */ for_each_cpu(cpu, &ptr->flush_cpus) flush_work(&ptr->w[cpu]); cpus_read_unlock(); if (ptr != flush_backlogs_fallback) kfree(ptr); else mutex_unlock(&flush_backlogs_mutex); } static void net_rps_send_ipi(struct softnet_data *remsd) { #ifdef CONFIG_RPS while (remsd) { struct softnet_data *next = remsd->rps_ipi_next; if (cpu_online(remsd->cpu)) smp_call_function_single_async(remsd->cpu, &remsd->csd); remsd = next; } #endif } /* * net_rps_action_and_irq_enable sends any pending IPI's for rps. * Note: called with local irq disabled, but exits with local irq enabled. */ static void net_rps_action_and_irq_enable(struct softnet_data *sd) { #ifdef CONFIG_RPS struct softnet_data *remsd = sd->rps_ipi_list; if (!use_backlog_threads() && remsd) { sd->rps_ipi_list = NULL; local_irq_enable(); /* Send pending IPI's to kick RPS processing on remote cpus. */ net_rps_send_ipi(remsd); } else #endif local_irq_enable(); } static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) { #ifdef CONFIG_RPS return !use_backlog_threads() && sd->rps_ipi_list; #else return false; #endif } static int process_backlog(struct napi_struct *napi, int quota) { struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); bool again = true; int work = 0; /* Check if we have pending ipi, its better to send them now, * not waiting net_rx_action() end. */ if (sd_has_rps_ipi_waiting(sd)) { local_irq_disable(); net_rps_action_and_irq_enable(sd); } napi->weight = READ_ONCE(net_hotdata.dev_rx_weight); while (again) { struct sk_buff *skb; local_lock_nested_bh(&softnet_data.process_queue_bh_lock); while ((skb = __skb_dequeue(&sd->process_queue))) { local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); rcu_read_lock(); __netif_receive_skb(skb); rcu_read_unlock(); if (++work >= quota) { rps_input_queue_head_add(sd, work); return work; } local_lock_nested_bh(&softnet_data.process_queue_bh_lock); } local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); backlog_lock_irq_disable(sd); if (skb_queue_empty(&sd->input_pkt_queue)) { /* * Inline a custom version of __napi_complete(). * only current cpu owns and manipulates this napi, * and NAPI_STATE_SCHED is the only possible flag set * on backlog. * We can use a plain write instead of clear_bit(), * and we dont need an smp_mb() memory barrier. 
*/ napi->state &= NAPIF_STATE_THREADED; again = false; } else { local_lock_nested_bh(&softnet_data.process_queue_bh_lock); skb_queue_splice_tail_init(&sd->input_pkt_queue, &sd->process_queue); local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); } backlog_unlock_irq_enable(sd); } if (work) rps_input_queue_head_add(sd, work); return work; } /** * __napi_schedule - schedule for receive * @n: entry to schedule * * The entry's receive function will be scheduled to run. * Consider using __napi_schedule_irqoff() if hard irqs are masked. */ void __napi_schedule(struct napi_struct *n) { unsigned long flags; local_irq_save(flags); ____napi_schedule(this_cpu_ptr(&softnet_data), n); local_irq_restore(flags); } EXPORT_SYMBOL(__napi_schedule); /** * napi_schedule_prep - check if napi can be scheduled * @n: napi context * * Test if NAPI routine is already running, and if not mark * it as running. This is used as a condition variable to * insure only one NAPI poll instance runs. We also make * sure there is no pending NAPI disable. */ bool napi_schedule_prep(struct napi_struct *n) { unsigned long new, val = READ_ONCE(n->state); do { if (unlikely(val & NAPIF_STATE_DISABLE)) return false; new = val | NAPIF_STATE_SCHED; /* Sets STATE_MISSED bit if STATE_SCHED was already set * This was suggested by Alexander Duyck, as compiler * emits better code than : * if (val & NAPIF_STATE_SCHED) * new |= NAPIF_STATE_MISSED; */ new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * NAPIF_STATE_MISSED; } while (!try_cmpxchg(&n->state, &val, new)); return !(val & NAPIF_STATE_SCHED); } EXPORT_SYMBOL(napi_schedule_prep); /** * __napi_schedule_irqoff - schedule for receive * @n: entry to schedule * * Variant of __napi_schedule() assuming hard irqs are masked. * * On PREEMPT_RT enabled kernels this maps to __napi_schedule() * because the interrupt disabled assumption might not be true * due to force-threaded interrupts and spinlock substitution. */ void __napi_schedule_irqoff(struct napi_struct *n) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ____napi_schedule(this_cpu_ptr(&softnet_data), n); else __napi_schedule(n); } EXPORT_SYMBOL(__napi_schedule_irqoff); bool napi_complete_done(struct napi_struct *n, int work_done) { unsigned long flags, val, new, timeout = 0; bool ret = true; /* * 1) Don't let napi dequeue from the cpu poll list * just in case its running on a different cpu. * 2) If we are busy polling, do nothing here, we have * the guarantee we will be called later. */ if (unlikely(n->state & (NAPIF_STATE_NPSVC | NAPIF_STATE_IN_BUSY_POLL))) return false; if (work_done) { if (n->gro.bitmask) timeout = napi_get_gro_flush_timeout(n); n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n); } if (n->defer_hard_irqs_count > 0) { n->defer_hard_irqs_count--; timeout = napi_get_gro_flush_timeout(n); if (timeout) ret = false; } /* * When the NAPI instance uses a timeout and keeps postponing * it, we need to bound somehow the time packets are kept in * the GRO layer. */ gro_flush_normal(&n->gro, !!timeout); if (unlikely(!list_empty(&n->poll_list))) { /* If n->poll_list is not empty, we need to mask irqs */ local_irq_save(flags); list_del_init(&n->poll_list); local_irq_restore(flags); } WRITE_ONCE(n->list_owner, -1); val = READ_ONCE(n->state); do { WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | NAPIF_STATE_SCHED_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); /* If STATE_MISSED was set, leave STATE_SCHED set, * because we will call napi->poll() one more time. 
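	 * ((val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED is 1 when MISSED is
	 * set and 0 otherwise, so the expression below evaluates to either
	 * NAPIF_STATE_SCHED or 0 without a conditional branch.)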
* This C code was suggested by Alexander Duyck to help gcc. */ new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED * NAPIF_STATE_SCHED; } while (!try_cmpxchg(&n->state, &val, new)); if (unlikely(val & NAPIF_STATE_MISSED)) { __napi_schedule(n); return false; } if (timeout) hrtimer_start(&n->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); return ret; } EXPORT_SYMBOL(napi_complete_done); static void skb_defer_free_flush(void) { struct llist_node *free_list; struct sk_buff *skb, *next; struct skb_defer_node *sdn; int node; for_each_node(node) { sdn = this_cpu_ptr(net_hotdata.skb_defer_nodes) + node; if (llist_empty(&sdn->defer_list)) continue; atomic_long_set(&sdn->defer_count, 0); free_list = llist_del_all(&sdn->defer_list); llist_for_each_entry_safe(skb, next, free_list, ll_node) { prefetch(next); napi_consume_skb(skb, 1); } } } #if defined(CONFIG_NET_RX_BUSY_POLL) static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) { if (!skip_schedule) { gro_normal_list(&napi->gro); __napi_schedule(napi); return; } /* Flush too old packets. If HZ < 1000, flush all packets */ gro_flush_normal(&napi->gro, HZ >= 1000); clear_bit(NAPI_STATE_SCHED, &napi->state); } enum { NAPI_F_PREFER_BUSY_POLL = 1, NAPI_F_END_ON_RESCHED = 2, }; static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, unsigned flags, u16 budget) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; bool skip_schedule = false; unsigned long timeout; int rc; /* Busy polling means there is a high chance device driver hard irq * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was * set in napi_schedule_prep(). * Since we are about to call napi->poll() once more, we can safely * clear NAPI_STATE_MISSED. * * Note: x86 could use a single "lock and ..." instruction * to perform these two clear_bit() */ clear_bit(NAPI_STATE_MISSED, &napi->state); clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); local_bh_disable(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (flags & NAPI_F_PREFER_BUSY_POLL) { napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi); timeout = napi_get_gro_flush_timeout(napi); if (napi->defer_hard_irqs_count && timeout) { hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); skip_schedule = true; } } /* All we really want here is to re-enable device interrupts. * Ideally, a new ndo_busy_poll_stop() could avoid another round. */ rc = napi->poll(napi, budget); /* We can't gro_normal_list() here, because napi->poll() might have * rearmed the napi (napi_complete_done()) in which case it could * already be running on another CPU. */ trace_napi_poll(napi, rc, budget); netpoll_poll_unlock(have_poll_lock); if (rc == budget) __busy_poll_stop(napi, skip_schedule); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); } static void __napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, unsigned flags, u16 budget) { unsigned long start_time = loop_end ? 
busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; void *have_poll_lock = NULL; struct napi_struct *napi; WARN_ON_ONCE(!rcu_read_lock_held()); restart: napi_poll = NULL; napi = napi_by_id(napi_id); if (!napi) return; if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_disable(); for (;;) { int work = 0; local_bh_disable(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (!napi_poll) { unsigned long val = READ_ONCE(napi->state); /* If multiple threads are competing for this napi, * we avoid dirtying napi->state as much as we can. */ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | NAPIF_STATE_IN_BUSY_POLL)) { if (flags & NAPI_F_PREFER_BUSY_POLL) set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; } if (cmpxchg(&napi->state, val, val | NAPIF_STATE_IN_BUSY_POLL | NAPIF_STATE_SCHED) != val) { if (flags & NAPI_F_PREFER_BUSY_POLL) set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; } have_poll_lock = netpoll_poll_lock(napi); napi_poll = napi->poll; } work = napi_poll(napi, budget); trace_napi_poll(napi, work, budget); gro_normal_list(&napi->gro); count: if (work > 0) __NET_ADD_STATS(dev_net(napi->dev), LINUX_MIB_BUSYPOLLRXPACKETS, work); skb_defer_free_flush(); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); if (!loop_end || loop_end(loop_end_arg, start_time)) break; if (unlikely(need_resched())) { if (flags & NAPI_F_END_ON_RESCHED) break; if (napi_poll) busy_poll_stop(napi, have_poll_lock, flags, budget); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable(); rcu_read_unlock(); cond_resched(); rcu_read_lock(); if (loop_end(loop_end_arg, start_time)) return; goto restart; } cpu_relax(); } if (napi_poll) busy_poll_stop(napi, have_poll_lock, flags, budget); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable(); } void napi_busy_loop_rcu(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget) { unsigned flags = NAPI_F_END_ON_RESCHED; if (prefer_busy_poll) flags |= NAPI_F_PREFER_BUSY_POLL; __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); } void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget) { unsigned flags = prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0; rcu_read_lock(); __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); rcu_read_unlock(); } EXPORT_SYMBOL(napi_busy_loop); void napi_suspend_irqs(unsigned int napi_id) { struct napi_struct *napi; rcu_read_lock(); napi = napi_by_id(napi_id); if (napi) { unsigned long timeout = napi_get_irq_suspend_timeout(napi); if (timeout) hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); } rcu_read_unlock(); } void napi_resume_irqs(unsigned int napi_id) { struct napi_struct *napi; rcu_read_lock(); napi = napi_by_id(napi_id); if (napi) { /* If irq_suspend_timeout is set to 0 between the call to * napi_suspend_irqs and now, the original value still * determines the safety timeout as intended and napi_watchdog * will resume irq processing. 
*/ if (napi_get_irq_suspend_timeout(napi)) { local_bh_disable(); napi_schedule(napi); local_bh_enable(); } } rcu_read_unlock(); } #endif /* CONFIG_NET_RX_BUSY_POLL */ static void __napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id) { napi->gro.cached_napi_id = napi_id; WRITE_ONCE(napi->napi_id, napi_id); hlist_add_head_rcu(&napi->napi_hash_node, &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); } static void napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id) { unsigned long flags; spin_lock_irqsave(&napi_hash_lock, flags); WARN_ON_ONCE(napi_by_id(napi_id)); __napi_hash_add_with_id(napi, napi_id); spin_unlock_irqrestore(&napi_hash_lock, flags); } static void napi_hash_add(struct napi_struct *napi) { unsigned long flags; if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state)) return; spin_lock_irqsave(&napi_hash_lock, flags); /* 0..NR_CPUS range is reserved for sender_cpu use */ do { if (unlikely(!napi_id_valid(++napi_gen_id))) napi_gen_id = MIN_NAPI_ID; } while (napi_by_id(napi_gen_id)); __napi_hash_add_with_id(napi, napi_gen_id); spin_unlock_irqrestore(&napi_hash_lock, flags); } /* Warning : caller is responsible to make sure rcu grace period * is respected before freeing memory containing @napi */ static void napi_hash_del(struct napi_struct *napi) { unsigned long flags; spin_lock_irqsave(&napi_hash_lock, flags); hlist_del_init_rcu(&napi->napi_hash_node); spin_unlock_irqrestore(&napi_hash_lock, flags); } static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) { struct napi_struct *napi; napi = container_of(timer, struct napi_struct, timer); /* Note : we use a relaxed variant of napi_schedule_prep() not setting * NAPI_STATE_MISSED, since we do not react to a device IRQ. */ if (!napi_disable_pending(napi) && !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) { clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); __napi_schedule_irqoff(napi); } return HRTIMER_NORESTART; } static void napi_stop_kthread(struct napi_struct *napi) { unsigned long val, new; /* Wait until the napi STATE_THREADED is unset. */ while (true) { val = READ_ONCE(napi->state); /* If napi kthread own this napi or the napi is idle, * STATE_THREADED can be unset here. */ if ((val & NAPIF_STATE_SCHED_THREADED) || !(val & NAPIF_STATE_SCHED)) { new = val & (~(NAPIF_STATE_THREADED | NAPIF_STATE_THREADED_BUSY_POLL)); } else { msleep(20); continue; } if (try_cmpxchg(&napi->state, &val, new)) break; } /* Once STATE_THREADED is unset, wait for SCHED_THREADED to be unset by * the kthread. */ while (true) { if (!test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) break; msleep(20); } kthread_stop(napi->thread); napi->thread = NULL; } static void napi_set_threaded_state(struct napi_struct *napi, enum netdev_napi_threaded threaded_mode) { bool threaded = threaded_mode != NETDEV_NAPI_THREADED_DISABLED; bool busy_poll = threaded_mode == NETDEV_NAPI_THREADED_BUSY_POLL; assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); assign_bit(NAPI_STATE_THREADED_BUSY_POLL, &napi->state, busy_poll); } int napi_set_threaded(struct napi_struct *napi, enum netdev_napi_threaded threaded) { if (threaded) { if (!napi->thread) { int err = napi_kthread_create(napi); if (err) return err; } } if (napi->config) napi->config->threaded = threaded; /* Setting/unsetting threaded mode on a napi might not immediately * take effect, if the current napi instance is actively being * polled. In this case, the switch between threaded mode and * softirq mode will happen in the next round of napi_schedule(). 
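	 * (For example, flipping a device's "threaded" sysfs attribute while
	 * one of its NAPIs is in the middle of a poll only updates the state
	 * bits; the hand-over between kthread and softirq polling happens once
	 * that NAPI completes and is scheduled again.)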
* This should not cause hiccups/stalls to the live traffic. */ if (!threaded && napi->thread) { napi_stop_kthread(napi); } else { /* Make sure kthread is created before THREADED bit is set. */ smp_mb__before_atomic(); napi_set_threaded_state(napi, threaded); } return 0; } int netif_set_threaded(struct net_device *dev, enum netdev_napi_threaded threaded) { struct napi_struct *napi; int i, err = 0; netdev_assert_locked_or_invisible(dev); if (threaded) { list_for_each_entry(napi, &dev->napi_list, dev_list) { if (!napi->thread) { err = napi_kthread_create(napi); if (err) { threaded = NETDEV_NAPI_THREADED_DISABLED; break; } } } } WRITE_ONCE(dev->threaded, threaded); /* The error should not occur as the kthreads are already created. */ list_for_each_entry(napi, &dev->napi_list, dev_list) WARN_ON_ONCE(napi_set_threaded(napi, threaded)); /* Override the config for all NAPIs even if currently not listed */ for (i = 0; i < dev->num_napi_configs; i++) dev->napi_config[i].threaded = threaded; return err; } /** * netif_threaded_enable() - enable threaded NAPIs * @dev: net_device instance * * Enable threaded mode for the NAPI instances of the device. This may be useful * for devices where multiple NAPI instances get scheduled by a single * interrupt. Threaded NAPI allows moving the NAPI processing to cores other * than the core where IRQ is mapped. * * This function should be called before @dev is registered. */ void netif_threaded_enable(struct net_device *dev) { WARN_ON_ONCE(netif_set_threaded(dev, NETDEV_NAPI_THREADED_ENABLED)); } EXPORT_SYMBOL(netif_threaded_enable); /** * netif_queue_set_napi - Associate queue with the napi * @dev: device to which NAPI and queue belong * @queue_index: Index of queue * @type: queue type as RX or TX * @napi: NAPI context, pass NULL to clear previously set NAPI * * Set queue with its corresponding napi context. This should be done after * registering the NAPI handler for the queue-vector and the queues have been * mapped to the corresponding interrupt vector. 
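 *
 * A minimal sketch of a driver mapping one NAPI per vector to the RX and TX
 * queue of the same index (the foo_* names and layout are illustrative
 * assumptions, not a real driver):
 *
 *	for (i = 0; i < priv->num_vectors; i++) {
 *		netif_queue_set_napi(priv->netdev, i, NETDEV_QUEUE_TYPE_RX,
 *				     &priv->vec[i].napi);
 *		netif_queue_set_napi(priv->netdev, i, NETDEV_QUEUE_TYPE_TX,
 *				     &priv->vec[i].napi);
 *	}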
*/ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, enum netdev_queue_type type, struct napi_struct *napi) { struct netdev_rx_queue *rxq; struct netdev_queue *txq; if (WARN_ON_ONCE(napi && !napi->dev)) return; netdev_ops_assert_locked_or_invisible(dev); switch (type) { case NETDEV_QUEUE_TYPE_RX: rxq = __netif_get_rx_queue(dev, queue_index); rxq->napi = napi; return; case NETDEV_QUEUE_TYPE_TX: txq = netdev_get_tx_queue(dev, queue_index); txq->napi = napi; return; default: return; } } EXPORT_SYMBOL(netif_queue_set_napi); static void netif_napi_irq_notify(struct irq_affinity_notify *notify, const cpumask_t *mask) { struct napi_struct *napi = container_of(notify, struct napi_struct, notify); #ifdef CONFIG_RFS_ACCEL struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; int err; #endif if (napi->config && napi->dev->irq_affinity_auto) cpumask_copy(&napi->config->affinity_mask, mask); #ifdef CONFIG_RFS_ACCEL if (napi->dev->rx_cpu_rmap_auto) { err = cpu_rmap_update(rmap, napi->napi_rmap_idx, mask); if (err) netdev_warn(napi->dev, "RMAP update failed (%d)\n", err); } #endif } #ifdef CONFIG_RFS_ACCEL static void netif_napi_affinity_release(struct kref *ref) { struct napi_struct *napi = container_of(ref, struct napi_struct, notify.kref); struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; netdev_assert_locked(napi->dev); WARN_ON(test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state)); if (!napi->dev->rx_cpu_rmap_auto) return; rmap->obj[napi->napi_rmap_idx] = NULL; napi->napi_rmap_idx = -1; cpu_rmap_put(rmap); } int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs) { if (dev->rx_cpu_rmap_auto) return 0; dev->rx_cpu_rmap = alloc_irq_cpu_rmap(num_irqs); if (!dev->rx_cpu_rmap) return -ENOMEM; dev->rx_cpu_rmap_auto = true; return 0; } EXPORT_SYMBOL(netif_enable_cpu_rmap); static void netif_del_cpu_rmap(struct net_device *dev) { struct cpu_rmap *rmap = dev->rx_cpu_rmap; if (!dev->rx_cpu_rmap_auto) return; /* Free the rmap */ cpu_rmap_put(rmap); dev->rx_cpu_rmap = NULL; dev->rx_cpu_rmap_auto = false; } #else static void netif_napi_affinity_release(struct kref *ref) { } int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs) { return 0; } EXPORT_SYMBOL(netif_enable_cpu_rmap); static void netif_del_cpu_rmap(struct net_device *dev) { } #endif void netif_set_affinity_auto(struct net_device *dev) { unsigned int i, maxqs, numa; maxqs = max(dev->num_tx_queues, dev->num_rx_queues); numa = dev_to_node(&dev->dev); for (i = 0; i < maxqs; i++) cpumask_set_cpu(cpumask_local_spread(i, numa), &dev->napi_config[i].affinity_mask); dev->irq_affinity_auto = true; } EXPORT_SYMBOL(netif_set_affinity_auto); void netif_napi_set_irq_locked(struct napi_struct *napi, int irq) { int rc; netdev_assert_locked_or_invisible(napi->dev); if (napi->irq == irq) return; /* Remove existing resources */ if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state)) irq_set_affinity_notifier(napi->irq, NULL); napi->irq = irq; if (irq < 0 || (!napi->dev->rx_cpu_rmap_auto && !napi->dev->irq_affinity_auto)) return; /* Abort for buggy drivers */ if (napi->dev->irq_affinity_auto && WARN_ON_ONCE(!napi->config)) return; #ifdef CONFIG_RFS_ACCEL if (napi->dev->rx_cpu_rmap_auto) { rc = cpu_rmap_add(napi->dev->rx_cpu_rmap, napi); if (rc < 0) return; cpu_rmap_get(napi->dev->rx_cpu_rmap); napi->napi_rmap_idx = rc; } #endif /* Use core IRQ notifier */ napi->notify.notify = netif_napi_irq_notify; napi->notify.release = netif_napi_affinity_release; rc = irq_set_affinity_notifier(irq, &napi->notify); if 
(rc) { netdev_warn(napi->dev, "Unable to set IRQ notifier (%d)\n", rc); goto put_rmap; } set_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state); return; put_rmap: #ifdef CONFIG_RFS_ACCEL if (napi->dev->rx_cpu_rmap_auto) { napi->dev->rx_cpu_rmap->obj[napi->napi_rmap_idx] = NULL; cpu_rmap_put(napi->dev->rx_cpu_rmap); napi->napi_rmap_idx = -1; } #endif napi->notify.notify = NULL; napi->notify.release = NULL; } EXPORT_SYMBOL(netif_napi_set_irq_locked); static void napi_restore_config(struct napi_struct *n) { n->defer_hard_irqs = n->config->defer_hard_irqs; n->gro_flush_timeout = n->config->gro_flush_timeout; n->irq_suspend_timeout = n->config->irq_suspend_timeout; if (n->dev->irq_affinity_auto && test_bit(NAPI_STATE_HAS_NOTIFIER, &n->state)) irq_set_affinity(n->irq, &n->config->affinity_mask); /* a NAPI ID might be stored in the config, if so use it. if not, use * napi_hash_add to generate one for us. */ if (n->config->napi_id) { napi_hash_add_with_id(n, n->config->napi_id); } else { napi_hash_add(n); n->config->napi_id = n->napi_id; } WARN_ON_ONCE(napi_set_threaded(n, n->config->threaded)); } static void napi_save_config(struct napi_struct *n) { n->config->defer_hard_irqs = n->defer_hard_irqs; n->config->gro_flush_timeout = n->gro_flush_timeout; n->config->irq_suspend_timeout = n->irq_suspend_timeout; napi_hash_del(n); } /* Netlink wants the NAPI list to be sorted by ID, if adding a NAPI which will * inherit an existing ID try to insert it at the right position. */ static void netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi) { unsigned int new_id, pos_id; struct list_head *higher; struct napi_struct *pos; new_id = UINT_MAX; if (napi->config && napi->config->napi_id) new_id = napi->config->napi_id; higher = &dev->napi_list; list_for_each_entry(pos, &dev->napi_list, dev_list) { if (napi_id_valid(pos->napi_id)) pos_id = pos->napi_id; else if (pos->config) pos_id = pos->config->napi_id; else pos_id = UINT_MAX; if (pos_id <= new_id) break; higher = &pos->dev_list; } list_add_rcu(&napi->dev_list, higher); /* adds after higher */ } /* Double check that napi_get_frags() allocates skbs with * skb->head being backed by slab, not a page fragment. * This is to make sure bug fixed in 3226b158e67c * ("net: avoid 32 x truesize under-estimation for tiny skbs") * does not accidentally come back. */ static void napi_get_frags_check(struct napi_struct *napi) { struct sk_buff *skb; local_bh_disable(); skb = napi_get_frags(napi); WARN_ON_ONCE(skb && skb->head_frag); napi_free_frags(napi); local_bh_enable(); } void netif_napi_add_weight_locked(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { netdev_assert_locked(dev); if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state))) return; INIT_LIST_HEAD(&napi->poll_list); INIT_HLIST_NODE(&napi->napi_hash_node); hrtimer_setup(&napi->timer, napi_watchdog, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); gro_init(&napi->gro); napi->skb = NULL; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) netdev_err_once(dev, "%s() called with weight %d\n", __func__, weight); napi->weight = weight; napi->dev = dev; #ifdef CONFIG_NETPOLL napi->poll_owner = -1; #endif napi->list_owner = -1; set_bit(NAPI_STATE_SCHED, &napi->state); set_bit(NAPI_STATE_NPSVC, &napi->state); netif_napi_dev_list_add(dev, napi); /* default settings from sysfs are applied to all NAPIs. 
any per-NAPI * configuration will be loaded in napi_enable */ napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs)); napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout)); napi_get_frags_check(napi); /* Create kthread for this napi if dev->threaded is set. * Clear dev->threaded if kthread creation failed so that * threaded mode will not be enabled in napi_enable(). */ if (napi_get_threaded_config(dev, napi)) if (napi_kthread_create(napi)) dev->threaded = NETDEV_NAPI_THREADED_DISABLED; netif_napi_set_irq_locked(napi, -1); } EXPORT_SYMBOL(netif_napi_add_weight_locked); void napi_disable_locked(struct napi_struct *n) { unsigned long val, new; might_sleep(); netdev_assert_locked(n->dev); set_bit(NAPI_STATE_DISABLE, &n->state); val = READ_ONCE(n->state); do { while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) { usleep_range(20, 200); val = READ_ONCE(n->state); } new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_THREADED_BUSY_POLL | NAPIF_STATE_PREFER_BUSY_POLL); } while (!try_cmpxchg(&n->state, &val, new)); hrtimer_cancel(&n->timer); if (n->config) napi_save_config(n); else napi_hash_del(n); clear_bit(NAPI_STATE_DISABLE, &n->state); } EXPORT_SYMBOL(napi_disable_locked); /** * napi_disable() - prevent NAPI from scheduling * @n: NAPI context * * Stop NAPI from being scheduled on this context. * Waits till any outstanding processing completes. * Takes netdev_lock() for associated net_device. */ void napi_disable(struct napi_struct *n) { netdev_lock(n->dev); napi_disable_locked(n); netdev_unlock(n->dev); } EXPORT_SYMBOL(napi_disable); void napi_enable_locked(struct napi_struct *n) { unsigned long new, val = READ_ONCE(n->state); if (n->config) napi_restore_config(n); else napi_hash_add(n); do { BUG_ON(!test_bit(NAPI_STATE_SCHED, &val)); new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC); if (n->dev->threaded && n->thread) new |= NAPIF_STATE_THREADED; } while (!try_cmpxchg(&n->state, &val, new)); } EXPORT_SYMBOL(napi_enable_locked); /** * napi_enable() - enable NAPI scheduling * @n: NAPI context * * Enable scheduling of a NAPI instance. * Must be paired with napi_disable(). * Takes netdev_lock() for associated net_device. */ void napi_enable(struct napi_struct *n) { netdev_lock(n->dev); napi_enable_locked(n); netdev_unlock(n->dev); } EXPORT_SYMBOL(napi_enable); /* Must be called in process context */ void __netif_napi_del_locked(struct napi_struct *napi) { netdev_assert_locked(napi->dev); if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state)) return; /* Make sure NAPI is disabled (or was never enabled). */ WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state)); if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state)) irq_set_affinity_notifier(napi->irq, NULL); if (napi->config) { napi->index = -1; napi->config = NULL; } list_del_rcu(&napi->dev_list); napi_free_frags(napi); gro_cleanup(&napi->gro); if (napi->thread) { kthread_stop(napi->thread); napi->thread = NULL; } } EXPORT_SYMBOL(__netif_napi_del_locked); static int __napi_poll(struct napi_struct *n, bool *repoll) { int work, weight; weight = n->weight; /* This NAPI_STATE_SCHED test is for avoiding a race * with netpoll's poll_napi(). Only the entity which * obtains the lock and sees NAPI_STATE_SCHED set will * actually make the ->poll() call. Therefore we avoid * accidentally calling ->poll() when NAPI is not scheduled. 
*/ work = 0; if (napi_is_scheduled(n)) { work = n->poll(n, weight); trace_napi_poll(n, work, weight); xdp_do_check_flushed(n); } if (unlikely(work > weight)) netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n", n->poll, work, weight); if (likely(work < weight)) return work; /* Drivers must not modify the NAPI state if they * consume the entire weight. In such cases this code * still "owns" the NAPI instance and therefore can * move the instance around on the list at-will. */ if (unlikely(napi_disable_pending(n))) { napi_complete(n); return work; } /* The NAPI context has more processing work, but busy-polling * is preferred. Exit early. */ if (napi_prefer_busy_poll(n)) { if (napi_complete_done(n, work)) { /* If timeout is not set, we need to make sure * that the NAPI is re-scheduled. */ napi_schedule(n); } return work; } /* Flush too old packets. If HZ < 1000, flush all packets */ gro_flush_normal(&n->gro, HZ >= 1000); /* Some drivers may have called napi_schedule * prior to exhausting their budget. */ if (unlikely(!list_empty(&n->poll_list))) { pr_warn_once("%s: Budget exhausted after napi rescheduled\n", n->dev ? n->dev->name : "backlog"); return work; } *repoll = true; return work; } static int napi_poll(struct napi_struct *n, struct list_head *repoll) { bool do_repoll = false; void *have; int work; list_del_init(&n->poll_list); have = netpoll_poll_lock(n); work = __napi_poll(n, &do_repoll); if (do_repoll) { #if defined(CONFIG_DEBUG_NET) if (unlikely(!napi_is_scheduled(n))) pr_crit("repoll requested for device %s %ps but napi is not scheduled.\n", n->dev->name, n->poll); #endif list_add_tail(&n->poll_list, repoll); } netpoll_poll_unlock(have); return work; } static int napi_thread_wait(struct napi_struct *napi) { set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { /* Testing SCHED_THREADED bit here to make sure the current * kthread owns this napi and could poll on this napi. * Testing SCHED bit is not enough because SCHED bit might be * set by some other busy poll thread or by napi_disable(). */ if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) { WARN_ON(!list_empty(&napi->poll_list)); __set_current_state(TASK_RUNNING); return 0; } schedule(); set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); return -1; } static void napi_threaded_poll_loop(struct napi_struct *napi, unsigned long *busy_poll_last_qs) { unsigned long last_qs = busy_poll_last_qs ? *busy_poll_last_qs : jiffies; struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct softnet_data *sd; for (;;) { bool repoll = false; void *have; local_bh_disable(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); sd = this_cpu_ptr(&softnet_data); sd->in_napi_threaded_poll = true; have = netpoll_poll_lock(napi); __napi_poll(napi, &repoll); netpoll_poll_unlock(have); sd->in_napi_threaded_poll = false; barrier(); if (sd_has_rps_ipi_waiting(sd)) { local_irq_disable(); net_rps_action_and_irq_enable(sd); } skb_defer_free_flush(); bpf_net_ctx_clear(bpf_net_ctx); /* When busy poll is enabled, the old packets are not flushed in * napi_complete_done. So flush them here. */ if (busy_poll_last_qs) gro_flush_normal(&napi->gro, HZ >= 1000); local_bh_enable(); /* Call cond_resched here to avoid watchdog warnings. 
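	 * (rcu_softirq_qs_periodic(last_qs) below also reports an RCU
	 * quiescent state from time to time, so a long-running threaded or
	 * busy-polling loop does not hold off grace periods either.)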
*/ if (repoll || busy_poll_last_qs) { rcu_softirq_qs_periodic(last_qs); cond_resched(); } if (!repoll) break; } if (busy_poll_last_qs) *busy_poll_last_qs = last_qs; } static int napi_threaded_poll(void *data) { struct napi_struct *napi = data; unsigned long last_qs = jiffies; bool want_busy_poll; bool in_busy_poll; unsigned long val; while (!napi_thread_wait(napi)) { val = READ_ONCE(napi->state); want_busy_poll = val & NAPIF_STATE_THREADED_BUSY_POLL; in_busy_poll = val & NAPIF_STATE_IN_BUSY_POLL; if (unlikely(val & NAPIF_STATE_DISABLE)) want_busy_poll = false; if (want_busy_poll != in_busy_poll) assign_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state, want_busy_poll); napi_threaded_poll_loop(napi, want_busy_poll ? &last_qs : NULL); } return 0; } static __latent_entropy void net_rx_action(void) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs)); struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int budget = READ_ONCE(net_hotdata.netdev_budget); LIST_HEAD(list); LIST_HEAD(repoll); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); start: sd->in_net_rx_action = true; local_irq_disable(); list_splice_init(&sd->poll_list, &list); local_irq_enable(); for (;;) { struct napi_struct *n; skb_defer_free_flush(); if (list_empty(&list)) { if (list_empty(&repoll)) { sd->in_net_rx_action = false; barrier(); /* We need to check if ____napi_schedule() * had refilled poll_list while * sd->in_net_rx_action was true. */ if (!list_empty(&sd->poll_list)) goto start; if (!sd_has_rps_ipi_waiting(sd)) goto end; } break; } n = list_first_entry(&list, struct napi_struct, poll_list); budget -= napi_poll(n, &repoll); /* If softirq window is exhausted then punt. * Allow this to run for 2 jiffies since which will allow * an average latency of 1.5/HZ. */ if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) { /* Pairs with READ_ONCE() in softnet_seq_show() */ WRITE_ONCE(sd->time_squeeze, sd->time_squeeze + 1); break; } } local_irq_disable(); list_splice_tail_init(&sd->poll_list, &list); list_splice_tail(&repoll, &list); list_splice(&list, &sd->poll_list); if (!list_empty(&sd->poll_list)) __raise_softirq_irqoff(NET_RX_SOFTIRQ); else sd->in_net_rx_action = false; net_rps_action_and_irq_enable(sd); end: bpf_net_ctx_clear(bpf_net_ctx); } struct netdev_adjacent { struct net_device *dev; netdevice_tracker dev_tracker; /* upper master flag, there can only be one master device per list */ bool master; /* lookup ignore flag */ bool ignore; /* counter for the number of times this device was added to us */ u16 ref_nr; /* private field for the users */ void *private; struct list_head list; struct rcu_head rcu; }; static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, struct list_head *adj_list) { struct netdev_adjacent *adj; list_for_each_entry(adj, adj_list, list) { if (adj->dev == adj_dev) return adj; } return NULL; } static int ____netdev_has_upper_dev(struct net_device *upper_dev, struct netdev_nested_priv *priv) { struct net_device *dev = (struct net_device *)priv->data; return upper_dev == dev; } /** * netdev_has_upper_dev - Check if device is linked to an upper device * @dev: device * @upper_dev: upper device to check * * Find out if a device is linked to specified upper device and return true * in case it is. Note that this checks only immediate upper device, * not through a complete stack of devices. The caller must hold the RTNL lock. 
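 *
 * Illustrative check, run under rtnl_lock() (the bond/slave names are
 * assumptions for the sketch only):
 *
 *	if (netdev_has_upper_dev(slave_dev, bond_dev))
 *		netdev_info(slave_dev, "already linked to %s\n",
 *			    bond_dev->name);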
*/ bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev) { struct netdev_nested_priv priv = { .data = (void *)upper_dev, }; ASSERT_RTNL(); return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, &priv); } EXPORT_SYMBOL(netdev_has_upper_dev); /** * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device * @dev: device * @upper_dev: upper device to check * * Find out if a device is linked to specified upper device and return true * in case it is. Note that this checks the entire upper device chain. * The caller must hold rcu lock. */ bool netdev_has_upper_dev_all_rcu(struct net_device *dev, struct net_device *upper_dev) { struct netdev_nested_priv priv = { .data = (void *)upper_dev, }; return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, &priv); } EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); /** * netdev_has_any_upper_dev - Check if device is linked to some device * @dev: device * * Find out if a device is linked to an upper device and return true in case * it is. The caller must hold the RTNL lock. */ bool netdev_has_any_upper_dev(struct net_device *dev) { ASSERT_RTNL(); return !list_empty(&dev->adj_list.upper); } EXPORT_SYMBOL(netdev_has_any_upper_dev); /** * netdev_master_upper_dev_get - Get master upper device * @dev: device * * Find a master upper device and return pointer to it or NULL in case * it's not there. The caller must hold the RTNL lock. */ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) { struct netdev_adjacent *upper; ASSERT_RTNL(); if (list_empty(&dev->adj_list.upper)) return NULL; upper = list_first_entry(&dev->adj_list.upper, struct netdev_adjacent, list); if (likely(upper->master)) return upper->dev; return NULL; } EXPORT_SYMBOL(netdev_master_upper_dev_get); static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev) { struct netdev_adjacent *upper; ASSERT_RTNL(); if (list_empty(&dev->adj_list.upper)) return NULL; upper = list_first_entry(&dev->adj_list.upper, struct netdev_adjacent, list); if (likely(upper->master) && !upper->ignore) return upper->dev; return NULL; } /** * netdev_has_any_lower_dev - Check if device is linked to some device * @dev: device * * Find out if a device is linked to a lower device and return true in case * it is. The caller must hold the RTNL lock. */ static bool netdev_has_any_lower_dev(struct net_device *dev) { ASSERT_RTNL(); return !list_empty(&dev->adj_list.lower); } void *netdev_adjacent_get_private(struct list_head *adj_list) { struct netdev_adjacent *adj; adj = list_entry(adj_list, struct netdev_adjacent, list); return adj->private; } EXPORT_SYMBOL(netdev_adjacent_get_private); /** * netdev_upper_get_next_dev_rcu - Get the next dev from upper list * @dev: device * @iter: list_head ** of the current position * * Gets the next device from the dev's upper list, starting from iter * position. The caller must hold RCU read lock. 
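 *
 * Open-coded walk over all immediate uppers, as an illustrative sketch
 * (callers typically use the netdev_for_each_upper_dev_rcu() helper instead):
 *
 *	struct list_head *iter = &dev->adj_list.upper;
 *	struct net_device *upper;
 *
 *	rcu_read_lock();
 *	while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)))
 *		pr_debug("upper: %s\n", upper->name);
 *	rcu_read_unlock();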
*/ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *upper; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held() && !lockdep_rtnl_is_held()); upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); if (&upper->list == &dev->adj_list.upper) return NULL; *iter = &upper->list; return upper->dev; } EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); static struct net_device *__netdev_next_upper_dev(struct net_device *dev, struct list_head **iter, bool *ignore) { struct netdev_adjacent *upper; upper = list_entry((*iter)->next, struct netdev_adjacent, list); if (&upper->list == &dev->adj_list.upper) return NULL; *iter = &upper->list; *ignore = upper->ignore; return upper->dev; } static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *upper; WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); if (&upper->list == &dev->adj_list.upper) return NULL; *iter = &upper->list; return upper->dev; } static int __netdev_walk_all_upper_dev(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; bool ignore; now = dev; iter = &dev->adj_list.upper; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { udev = __netdev_next_upper_dev(now, &iter, &ignore); if (!udev) break; if (ignore) continue; next = udev; niter = &udev->adj_list.upper; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } int netdev_walk_all_upper_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; now = dev; iter = &dev->adj_list.upper; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { udev = netdev_next_upper_dev_rcu(now, &iter); if (!udev) break; next = udev; niter = &udev->adj_list.upper; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu); static bool __netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev) { struct netdev_nested_priv priv = { .flags = 0, .data = (void *)upper_dev, }; ASSERT_RTNL(); return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev, &priv); } /** * netdev_lower_get_next_private - Get the next ->private from the * lower neighbour list * @dev: device * @iter: list_head ** of the current position * * Gets the next netdev_adjacent->private from the dev's lower neighbour * list, starting from iter position. The caller must hold either hold the * RTNL lock or its own locking that guarantees that the neighbour lower * list will remain unchanged. 
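 *
 * Illustrative walk under RTNL (struct foo_port stands in for whatever
 * ->private pointer the caller stored when linking the devices):
 *
 *	struct list_head *iter = dev->adj_list.lower.next;
 *	struct foo_port *port;
 *
 *	ASSERT_RTNL();
 *	while ((port = netdev_lower_get_next_private(dev, &iter)))
 *		foo_port_refresh(port);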
*/ void *netdev_lower_get_next_private(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry(*iter, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = lower->list.next; return lower->private; } EXPORT_SYMBOL(netdev_lower_get_next_private); /** * netdev_lower_get_next_private_rcu - Get the next ->private from the * lower neighbour list, RCU * variant * @dev: device * @iter: list_head ** of the current position * * Gets the next netdev_adjacent->private from the dev's lower neighbour * list, starting from iter position. The caller must hold RCU read lock. */ void *netdev_lower_get_next_private_rcu(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = &lower->list; return lower->private; } EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); /** * netdev_lower_get_next - Get the next device from the lower neighbour * list * @dev: device * @iter: list_head ** of the current position * * Gets the next netdev_adjacent from the dev's lower neighbour * list, starting from iter position. The caller must hold RTNL lock or * its own locking that guarantees that the neighbour lower * list will remain unchanged. */ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry(*iter, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = lower->list.next; return lower->dev; } EXPORT_SYMBOL(netdev_lower_get_next); static struct net_device *netdev_next_lower_dev(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry((*iter)->next, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = &lower->list; return lower->dev; } static struct net_device *__netdev_next_lower_dev(struct net_device *dev, struct list_head **iter, bool *ignore) { struct netdev_adjacent *lower; lower = list_entry((*iter)->next, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = &lower->list; *ignore = lower->ignore; return lower->dev; } int netdev_walk_all_lower_dev(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; now = dev; iter = &dev->adj_list.lower; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { ldev = netdev_next_lower_dev(now, &iter); if (!ldev) break; next = ldev; niter = &ldev->adj_list.lower; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev); static int __netdev_walk_all_lower_dev(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; bool ignore; now = dev; iter = &dev->adj_list.lower; while (1) { if (now != dev) { ret = fn(now, 
priv); if (ret) return ret; } next = NULL; while (1) { ldev = __netdev_next_lower_dev(now, &iter, &ignore); if (!ldev) break; if (ignore) continue; next = ldev; niter = &ldev->adj_list.lower; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = &lower->list; return lower->dev; } EXPORT_SYMBOL(netdev_next_lower_dev_rcu); static u8 __netdev_upper_depth(struct net_device *dev) { struct net_device *udev; struct list_head *iter; u8 max_depth = 0; bool ignore; for (iter = &dev->adj_list.upper, udev = __netdev_next_upper_dev(dev, &iter, &ignore); udev; udev = __netdev_next_upper_dev(dev, &iter, &ignore)) { if (ignore) continue; if (max_depth < udev->upper_level) max_depth = udev->upper_level; } return max_depth; } static u8 __netdev_lower_depth(struct net_device *dev) { struct net_device *ldev; struct list_head *iter; u8 max_depth = 0; bool ignore; for (iter = &dev->adj_list.lower, ldev = __netdev_next_lower_dev(dev, &iter, &ignore); ldev; ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) { if (ignore) continue; if (max_depth < ldev->lower_level) max_depth = ldev->lower_level; } return max_depth; } static int __netdev_update_upper_level(struct net_device *dev, struct netdev_nested_priv *__unused) { dev->upper_level = __netdev_upper_depth(dev) + 1; return 0; } #ifdef CONFIG_LOCKDEP static LIST_HEAD(net_unlink_list); static void net_unlink_todo(struct net_device *dev) { if (list_empty(&dev->unlink_list)) list_add_tail(&dev->unlink_list, &net_unlink_list); } #endif static int __netdev_update_lower_level(struct net_device *dev, struct netdev_nested_priv *priv) { dev->lower_level = __netdev_lower_depth(dev) + 1; #ifdef CONFIG_LOCKDEP if (!priv) return 0; if (priv->flags & NESTED_SYNC_IMM) dev->nested_level = dev->lower_level - 1; if (priv->flags & NESTED_SYNC_TODO) net_unlink_todo(dev); #endif return 0; } int netdev_walk_all_lower_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; now = dev; iter = &dev->adj_list.lower; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { ldev = netdev_next_lower_dev_rcu(now, &iter); if (!ldev) break; next = ldev; niter = &ldev->adj_list.lower; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu); /** * netdev_lower_get_first_private_rcu - Get the first ->private from the * lower neighbour list, RCU * variant * @dev: device * * Gets the first netdev_adjacent->private from the dev's lower neighbour * list. The caller must hold RCU read lock. 
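 *
 * Illustrative use (foo_port is an assumed ->private type and
 * foo_port_update() a hypothetical helper):
 *
 *	struct foo_port *port;
 *
 *	rcu_read_lock();
 *	port = netdev_lower_get_first_private_rcu(dev);
 *	if (port)
 *		foo_port_update(port);
 *	rcu_read_unlock();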
*/ void *netdev_lower_get_first_private_rcu(struct net_device *dev) { struct netdev_adjacent *lower; lower = list_first_or_null_rcu(&dev->adj_list.lower, struct netdev_adjacent, list); if (lower) return lower->private; return NULL; } EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); /** * netdev_master_upper_dev_get_rcu - Get master upper device * @dev: device * * Find a master upper device and return pointer to it or NULL in case * it's not there. The caller must hold the RCU read lock. */ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) { struct netdev_adjacent *upper; upper = list_first_or_null_rcu(&dev->adj_list.upper, struct netdev_adjacent, list); if (upper && likely(upper->master)) return upper->dev; return NULL; } EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); static int netdev_adjacent_sysfs_add(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list) { char linkname[IFNAMSIZ+7]; sprintf(linkname, dev_list == &dev->adj_list.upper ? "upper_%s" : "lower_%s", adj_dev->name); return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), linkname); } static void netdev_adjacent_sysfs_del(struct net_device *dev, char *name, struct list_head *dev_list) { char linkname[IFNAMSIZ+7]; sprintf(linkname, dev_list == &dev->adj_list.upper ? "upper_%s" : "lower_%s", name); sysfs_remove_link(&(dev->dev.kobj), linkname); } static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list) { return (dev_list == &dev->adj_list.upper || dev_list == &dev->adj_list.lower) && net_eq(dev_net(dev), dev_net(adj_dev)); } static int __netdev_adjacent_dev_insert(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list, void *private, bool master) { struct netdev_adjacent *adj; int ret; adj = __netdev_find_adj(adj_dev, dev_list); if (adj) { adj->ref_nr += 1; pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n", dev->name, adj_dev->name, adj->ref_nr); return 0; } adj = kmalloc_obj(*adj); if (!adj) return -ENOMEM; adj->dev = adj_dev; adj->master = master; adj->ref_nr = 1; adj->private = private; adj->ignore = false; netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL); pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n", dev->name, adj_dev->name, adj->ref_nr, adj_dev->name); if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); if (ret) goto free_adj; } /* Ensure that master link is always the first item in list. 
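	 * (netdev_master_upper_dev_get() and its _rcu variant only look at the
	 * first entry of adj_list.upper and test ->master, so the master
	 * adjacency must be inserted with list_add_rcu() at the head while
	 * ordinary uppers are appended with list_add_tail_rcu().)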
*/ if (master) { ret = sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), "master"); if (ret) goto remove_symlinks; list_add_rcu(&adj->list, dev_list); } else { list_add_tail_rcu(&adj->list, dev_list); } return 0; remove_symlinks: if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); free_adj: netdev_put(adj_dev, &adj->dev_tracker); kfree(adj); return ret; } static void __netdev_adjacent_dev_remove(struct net_device *dev, struct net_device *adj_dev, u16 ref_nr, struct list_head *dev_list) { struct netdev_adjacent *adj; pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n", dev->name, adj_dev->name, ref_nr); adj = __netdev_find_adj(adj_dev, dev_list); if (!adj) { pr_err("Adjacency does not exist for device %s from %s\n", dev->name, adj_dev->name); WARN_ON(1); return; } if (adj->ref_nr > ref_nr) { pr_debug("adjacency: %s to %s ref_nr - %d = %d\n", dev->name, adj_dev->name, ref_nr, adj->ref_nr - ref_nr); adj->ref_nr -= ref_nr; return; } if (adj->master) sysfs_remove_link(&(dev->dev.kobj), "master"); if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); list_del_rcu(&adj->list); pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n", adj_dev->name, dev->name, adj_dev->name); netdev_put(adj_dev, &adj->dev_tracker); kfree_rcu(adj, rcu); } static int __netdev_adjacent_dev_link_lists(struct net_device *dev, struct net_device *upper_dev, struct list_head *up_list, struct list_head *down_list, void *private, bool master) { int ret; ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private, master); if (ret) return ret; ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private, false); if (ret) { __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list); return ret; } return 0; } static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, struct net_device *upper_dev, u16 ref_nr, struct list_head *up_list, struct list_head *down_list) { __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list); } static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, struct net_device *upper_dev, void *private, bool master) { return __netdev_adjacent_dev_link_lists(dev, upper_dev, &dev->adj_list.upper, &upper_dev->adj_list.lower, private, master); } static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, struct net_device *upper_dev) { __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, &dev->adj_list.upper, &upper_dev->adj_list.lower); } static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, void *upper_priv, void *upper_info, struct netdev_nested_priv *priv, struct netlink_ext_ack *extack) { struct netdev_notifier_changeupper_info changeupper_info = { .info = { .dev = dev, .extack = extack, }, .upper_dev = upper_dev, .master = master, .linking = true, .upper_info = upper_info, }; struct net_device *master_dev; int ret = 0; ASSERT_RTNL(); if (dev == upper_dev) return -EBUSY; /* To prevent loops, check if dev is not upper device to upper_dev. */ if (__netdev_has_upper_dev(upper_dev, dev)) return -EBUSY; if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV) return -EMLINK; if (!master) { if (__netdev_has_upper_dev(dev, upper_dev)) return -EEXIST; } else { master_dev = __netdev_master_upper_dev_get(dev); if (master_dev) return master_dev == upper_dev ? 
-EEXIST : -EBUSY; } ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) return ret; ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv, master); if (ret) return ret; ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) goto rollback; __netdev_update_upper_level(dev, NULL); __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); __netdev_update_lower_level(upper_dev, priv); __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, priv); return 0; rollback: __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); return ret; } /** * netdev_upper_dev_link - Add a link to the upper device * @dev: device * @upper_dev: new upper device * @extack: netlink extended ack * * Adds a link to device which is upper to this one. The caller must hold * the RTNL lock. On a failure a negative errno code is returned. * On success the reference counts are adjusted and the function * returns zero. */ int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, struct netlink_ext_ack *extack) { struct netdev_nested_priv priv = { .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, .data = NULL, }; return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL, &priv, extack); } EXPORT_SYMBOL(netdev_upper_dev_link); /** * netdev_master_upper_dev_link - Add a master link to the upper device * @dev: device * @upper_dev: new upper device * @upper_priv: upper device private * @upper_info: upper info to be passed down via notifier * @extack: netlink extended ack * * Adds a link to device which is upper to this one. In this case, only * one master upper device can be linked, although other non-master devices * might be linked as well. The caller must hold the RTNL lock. * On a failure a negative errno code is returned. On success the reference * counts are adjusted and the function returns zero. */ int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, void *upper_priv, void *upper_info, struct netlink_ext_ack *extack) { struct netdev_nested_priv priv = { .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, .data = NULL, }; return __netdev_upper_dev_link(dev, upper_dev, true, upper_priv, upper_info, &priv, extack); } EXPORT_SYMBOL(netdev_master_upper_dev_link); static void __netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev, struct netdev_nested_priv *priv) { struct netdev_notifier_changeupper_info changeupper_info = { .info = { .dev = dev, }, .upper_dev = upper_dev, .linking = false, }; ASSERT_RTNL(); changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, &changeupper_info.info); __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); __netdev_update_upper_level(dev, NULL); __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); __netdev_update_lower_level(upper_dev, priv); __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, priv); } /** * netdev_upper_dev_unlink - Removes a link to upper device * @dev: device * @upper_dev: new upper device * * Removes a link to device which is upper to this one. The caller must hold * the RTNL lock. 
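 *
 * A minimal usage sketch (hypothetical caller; "port_dev" and "upper" are
 * illustrative names). The unlink must use the same dev/upper_dev pair
 * that was passed to netdev_upper_dev_link(), with RTNL held:
 *
 *	ASSERT_RTNL();
 *	err = netdev_upper_dev_link(port_dev, upper, extack);
 *	if (err)
 *		return err;
 *	...
 *	netdev_upper_dev_unlink(port_dev, upper);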
*/ void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { struct netdev_nested_priv priv = { .flags = NESTED_SYNC_TODO, .data = NULL, }; __netdev_upper_dev_unlink(dev, upper_dev, &priv); } EXPORT_SYMBOL(netdev_upper_dev_unlink); static void __netdev_adjacent_dev_set(struct net_device *upper_dev, struct net_device *lower_dev, bool val) { struct netdev_adjacent *adj; adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower); if (adj) adj->ignore = val; adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper); if (adj) adj->ignore = val; } static void netdev_adjacent_dev_disable(struct net_device *upper_dev, struct net_device *lower_dev) { __netdev_adjacent_dev_set(upper_dev, lower_dev, true); } static void netdev_adjacent_dev_enable(struct net_device *upper_dev, struct net_device *lower_dev) { __netdev_adjacent_dev_set(upper_dev, lower_dev, false); } int netdev_adjacent_change_prepare(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev, struct netlink_ext_ack *extack) { struct netdev_nested_priv priv = { .flags = 0, .data = NULL, }; int err; if (!new_dev) return 0; if (old_dev && new_dev != old_dev) netdev_adjacent_dev_disable(dev, old_dev); err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv, extack); if (err) { if (old_dev && new_dev != old_dev) netdev_adjacent_dev_enable(dev, old_dev); return err; } return 0; } EXPORT_SYMBOL(netdev_adjacent_change_prepare); void netdev_adjacent_change_commit(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev) { struct netdev_nested_priv priv = { .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, .data = NULL, }; if (!new_dev || !old_dev) return; if (new_dev == old_dev) return; netdev_adjacent_dev_enable(dev, old_dev); __netdev_upper_dev_unlink(old_dev, dev, &priv); } EXPORT_SYMBOL(netdev_adjacent_change_commit); void netdev_adjacent_change_abort(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev) { struct netdev_nested_priv priv = { .flags = 0, .data = NULL, }; if (!new_dev) return; if (old_dev && new_dev != old_dev) netdev_adjacent_dev_enable(dev, old_dev); __netdev_upper_dev_unlink(new_dev, dev, &priv); } EXPORT_SYMBOL(netdev_adjacent_change_abort); /** * netdev_bonding_info_change - Dispatch event about slave change * @dev: device * @bonding_info: info to dispatch * * Send NETDEV_BONDING_INFO to netdev notifiers with info. * The caller must hold the RTNL lock. 
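 *
 * A minimal sketch of a caller (hypothetical bonding-style driver; how the
 * structure is filled depends entirely on the driver's own state and is
 * elided here):
 *
 *	struct netdev_bonding_info binfo;
 *
 *	memset(&binfo, 0, sizeof(binfo));
 *	... fill binfo from the aggregate/slave state ...
 *	netdev_bonding_info_change(slave_dev, &binfo);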
*/ void netdev_bonding_info_change(struct net_device *dev, struct netdev_bonding_info *bonding_info) { struct netdev_notifier_bonding_info info = { .info.dev = dev, }; memcpy(&info.bonding_info, bonding_info, sizeof(struct netdev_bonding_info)); call_netdevice_notifiers_info(NETDEV_BONDING_INFO, &info.info); } EXPORT_SYMBOL(netdev_bonding_info_change); static int netdev_offload_xstats_enable_l3(struct net_device *dev, struct netlink_ext_ack *extack) { struct netdev_notifier_offload_xstats_info info = { .info.dev = dev, .info.extack = extack, .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, }; int err; int rc; dev->offload_xstats_l3 = kzalloc_obj(*dev->offload_xstats_l3); if (!dev->offload_xstats_l3) return -ENOMEM; rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE, NETDEV_OFFLOAD_XSTATS_DISABLE, &info.info); err = notifier_to_errno(rc); if (err) goto free_stats; return 0; free_stats: kfree(dev->offload_xstats_l3); dev->offload_xstats_l3 = NULL; return err; } int netdev_offload_xstats_enable(struct net_device *dev, enum netdev_offload_xstats_type type, struct netlink_ext_ack *extack) { ASSERT_RTNL(); if (netdev_offload_xstats_enabled(dev, type)) return -EALREADY; switch (type) { case NETDEV_OFFLOAD_XSTATS_TYPE_L3: return netdev_offload_xstats_enable_l3(dev, extack); } WARN_ON(1); return -EINVAL; } EXPORT_SYMBOL(netdev_offload_xstats_enable); static void netdev_offload_xstats_disable_l3(struct net_device *dev) { struct netdev_notifier_offload_xstats_info info = { .info.dev = dev, .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, }; call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE, &info.info); kfree(dev->offload_xstats_l3); dev->offload_xstats_l3 = NULL; } int netdev_offload_xstats_disable(struct net_device *dev, enum netdev_offload_xstats_type type) { ASSERT_RTNL(); if (!netdev_offload_xstats_enabled(dev, type)) return -EALREADY; switch (type) { case NETDEV_OFFLOAD_XSTATS_TYPE_L3: netdev_offload_xstats_disable_l3(dev); return 0; } WARN_ON(1); return -EINVAL; } EXPORT_SYMBOL(netdev_offload_xstats_disable); static void netdev_offload_xstats_disable_all(struct net_device *dev) { netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3); } static struct rtnl_hw_stats64 * netdev_offload_xstats_get_ptr(const struct net_device *dev, enum netdev_offload_xstats_type type) { switch (type) { case NETDEV_OFFLOAD_XSTATS_TYPE_L3: return dev->offload_xstats_l3; } WARN_ON(1); return NULL; } bool netdev_offload_xstats_enabled(const struct net_device *dev, enum netdev_offload_xstats_type type) { ASSERT_RTNL(); return netdev_offload_xstats_get_ptr(dev, type); } EXPORT_SYMBOL(netdev_offload_xstats_enabled); struct netdev_notifier_offload_xstats_ru { bool used; }; struct netdev_notifier_offload_xstats_rd { struct rtnl_hw_stats64 stats; bool used; }; static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest, const struct rtnl_hw_stats64 *src) { dest->rx_packets += src->rx_packets; dest->tx_packets += src->tx_packets; dest->rx_bytes += src->rx_bytes; dest->tx_bytes += src->tx_bytes; dest->rx_errors += src->rx_errors; dest->tx_errors += src->tx_errors; dest->rx_dropped += src->rx_dropped; dest->tx_dropped += src->tx_dropped; dest->multicast += src->multicast; } static int netdev_offload_xstats_get_used(struct net_device *dev, enum netdev_offload_xstats_type type, bool *p_used, struct netlink_ext_ack *extack) { struct netdev_notifier_offload_xstats_ru report_used = {}; struct netdev_notifier_offload_xstats_info info = { .info.dev = dev, .info.extack = extack, .type = type, .report_used = 
&report_used, }; int rc; WARN_ON(!netdev_offload_xstats_enabled(dev, type)); rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED, &info.info); *p_used = report_used.used; return notifier_to_errno(rc); } static int netdev_offload_xstats_get_stats(struct net_device *dev, enum netdev_offload_xstats_type type, struct rtnl_hw_stats64 *p_stats, bool *p_used, struct netlink_ext_ack *extack) { struct netdev_notifier_offload_xstats_rd report_delta = {}; struct netdev_notifier_offload_xstats_info info = { .info.dev = dev, .info.extack = extack, .type = type, .report_delta = &report_delta, }; struct rtnl_hw_stats64 *stats; int rc; stats = netdev_offload_xstats_get_ptr(dev, type); if (WARN_ON(!stats)) return -EINVAL; rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA, &info.info); /* Cache whatever we got, even if there was an error, otherwise the * successful stats retrievals would get lost. */ netdev_hw_stats64_add(stats, &report_delta.stats); if (p_stats) *p_stats = *stats; *p_used = report_delta.used; return notifier_to_errno(rc); } int netdev_offload_xstats_get(struct net_device *dev, enum netdev_offload_xstats_type type, struct rtnl_hw_stats64 *p_stats, bool *p_used, struct netlink_ext_ack *extack) { ASSERT_RTNL(); if (p_stats) return netdev_offload_xstats_get_stats(dev, type, p_stats, p_used, extack); else return netdev_offload_xstats_get_used(dev, type, p_used, extack); } EXPORT_SYMBOL(netdev_offload_xstats_get); void netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta, const struct rtnl_hw_stats64 *stats) { report_delta->used = true; netdev_hw_stats64_add(&report_delta->stats, stats); } EXPORT_SYMBOL(netdev_offload_xstats_report_delta); void netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used) { report_used->used = true; } EXPORT_SYMBOL(netdev_offload_xstats_report_used); void netdev_offload_xstats_push_delta(struct net_device *dev, enum netdev_offload_xstats_type type, const struct rtnl_hw_stats64 *p_stats) { struct rtnl_hw_stats64 *stats; ASSERT_RTNL(); stats = netdev_offload_xstats_get_ptr(dev, type); if (WARN_ON(!stats)) return; netdev_hw_stats64_add(stats, p_stats); } EXPORT_SYMBOL(netdev_offload_xstats_push_delta); /** * netdev_get_xmit_slave - Get the xmit slave of master device * @dev: device * @skb: The packet * @all_slaves: assume all the slaves are active * * The reference counters are not incremented so the caller must be * careful with locks. The caller must hold RCU lock. * %NULL is returned if no slave is found. */ struct net_device *netdev_get_xmit_slave(struct net_device *dev, struct sk_buff *skb, bool all_slaves) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_get_xmit_slave) return NULL; return ops->ndo_get_xmit_slave(dev, skb, all_slaves); } EXPORT_SYMBOL(netdev_get_xmit_slave); static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev, struct sock *sk) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_sk_get_lower_dev) return NULL; return ops->ndo_sk_get_lower_dev(dev, sk); } /** * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket * @dev: device * @sk: the socket * * %NULL is returned if no lower device is found. 
*/ struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev, struct sock *sk) { struct net_device *lower; lower = netdev_sk_get_lower_dev(dev, sk); while (lower) { dev = lower; lower = netdev_sk_get_lower_dev(dev, sk); } return dev; } EXPORT_SYMBOL(netdev_sk_get_lowest_dev); static void netdev_adjacent_add_links(struct net_device *dev) { struct netdev_adjacent *iter; struct net *net = dev_net(dev); list_for_each_entry(iter, &dev->adj_list.upper, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.lower); netdev_adjacent_sysfs_add(dev, iter->dev, &dev->adj_list.upper); } list_for_each_entry(iter, &dev->adj_list.lower, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.upper); netdev_adjacent_sysfs_add(dev, iter->dev, &dev->adj_list.lower); } } static void netdev_adjacent_del_links(struct net_device *dev) { struct netdev_adjacent *iter; struct net *net = dev_net(dev); list_for_each_entry(iter, &dev->adj_list.upper, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, dev->name, &iter->dev->adj_list.lower); netdev_adjacent_sysfs_del(dev, iter->dev->name, &dev->adj_list.upper); } list_for_each_entry(iter, &dev->adj_list.lower, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, dev->name, &iter->dev->adj_list.upper); netdev_adjacent_sysfs_del(dev, iter->dev->name, &dev->adj_list.lower); } } void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) { struct netdev_adjacent *iter; struct net *net = dev_net(dev); list_for_each_entry(iter, &dev->adj_list.upper, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, oldname, &iter->dev->adj_list.lower); netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.lower); } list_for_each_entry(iter, &dev->adj_list.lower, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, oldname, &iter->dev->adj_list.upper); netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.upper); } } void *netdev_lower_dev_get_private(struct net_device *dev, struct net_device *lower_dev) { struct netdev_adjacent *lower; if (!lower_dev) return NULL; lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower); if (!lower) return NULL; return lower->private; } EXPORT_SYMBOL(netdev_lower_dev_get_private); /** * netdev_lower_state_changed - Dispatch event about lower device state change * @lower_dev: device * @lower_state_info: state to dispatch * * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info. * The caller must hold the RTNL lock. 
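 *
 * A minimal sketch of a caller (hypothetical LAG-style driver; the info
 * structure is whatever the upper driver expects, the LAG variant from
 * netdevice.h is assumed here):
 *
 *	struct netdev_lag_lower_state_info info = {
 *		.link_up = 1,
 *		.tx_enabled = 1,
 *	};
 *
 *	netdev_lower_state_changed(slave_dev, &info);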
*/ void netdev_lower_state_changed(struct net_device *lower_dev, void *lower_state_info) { struct netdev_notifier_changelowerstate_info changelowerstate_info = { .info.dev = lower_dev, }; ASSERT_RTNL(); changelowerstate_info.lower_state_info = lower_state_info; call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, &changelowerstate_info.info); } EXPORT_SYMBOL(netdev_lower_state_changed); static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_change_rx_flags) ops->ndo_change_rx_flags(dev, flags); } static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags; unsigned int promiscuity, flags; kuid_t uid; kgid_t gid; ASSERT_RTNL(); promiscuity = dev->promiscuity + inc; if (promiscuity == 0) { /* * Avoid overflow. * If inc causes overflow, untouch promisc and return error. */ if (unlikely(inc > 0)) { netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n"); return -EOVERFLOW; } flags = old_flags & ~IFF_PROMISC; } else { flags = old_flags | IFF_PROMISC; } WRITE_ONCE(dev->promiscuity, promiscuity); if (flags != old_flags) { WRITE_ONCE(dev->flags, flags); netdev_info(dev, "%s promiscuous mode\n", dev->flags & IFF_PROMISC ? "entered" : "left"); if (audit_enabled) { current_uid_gid(&uid, &gid); audit_log(audit_context(), GFP_ATOMIC, AUDIT_ANOM_PROMISCUOUS, "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", dev->name, (dev->flags & IFF_PROMISC), (old_flags & IFF_PROMISC), from_kuid(&init_user_ns, audit_get_loginuid(current)), from_kuid(&init_user_ns, uid), from_kgid(&init_user_ns, gid), audit_get_sessionid(current)); } dev_change_rx_flags(dev, IFF_PROMISC); } if (notify) { /* The ops lock is only required to ensure consistent locking * for `NETDEV_CHANGE` notifiers. This function is sometimes * called without the lock, even for devices that are ops * locked, such as in `dev_uc_sync_multiple` when using * bonding or teaming. */ netdev_ops_assert_locked(dev); __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL); } return 0; } int netif_set_promiscuity(struct net_device *dev, int inc) { unsigned int old_flags = dev->flags; int err; err = __dev_set_promiscuity(dev, inc, true); if (err < 0) return err; if (dev->flags != old_flags) dev_set_rx_mode(dev); return err; } int netif_set_allmulti(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags, old_gflags = dev->gflags; unsigned int allmulti, flags; ASSERT_RTNL(); allmulti = dev->allmulti + inc; if (allmulti == 0) { /* * Avoid overflow. * If inc causes overflow, untouch allmulti and return error. */ if (unlikely(inc > 0)) { netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n"); return -EOVERFLOW; } flags = old_flags & ~IFF_ALLMULTI; } else { flags = old_flags | IFF_ALLMULTI; } WRITE_ONCE(dev->allmulti, allmulti); if (flags != old_flags) { WRITE_ONCE(dev->flags, flags); netdev_info(dev, "%s allmulticast mode\n", dev->flags & IFF_ALLMULTI ? "entered" : "left"); dev_change_rx_flags(dev, IFF_ALLMULTI); dev_set_rx_mode(dev); if (notify) __dev_notify_flags(dev, old_flags, dev->gflags ^ old_gflags, 0, NULL); } return 0; } /* * Upload unicast and multicast address lists to device and * configure RX filtering. When the device doesn't support unicast * filtering it is put in promiscuous mode while unicast addresses * are present. 
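 *
 * A rough sketch of the driver side that consumes these lists, i.e. what a
 * typical ndo_set_rx_mode() implementation looks like (hypothetical driver;
 * the foo_* helpers are invented for illustration):
 *
 *	static void foo_set_rx_mode(struct net_device *dev)
 *	{
 *		struct netdev_hw_addr *ha;
 *
 *		foo_clear_hw_filters(dev);
 *		netdev_for_each_uc_addr(ha, dev)
 *			foo_add_hw_filter(dev, ha->addr);
 *		netdev_for_each_mc_addr(ha, dev)
 *			foo_add_hw_filter(dev, ha->addr);
 *		if (dev->flags & IFF_PROMISC)
 *			foo_set_hw_promisc(dev);
 *	}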
*/ void __dev_set_rx_mode(struct net_device *dev) { const struct net_device_ops *ops = dev->netdev_ops; /* dev_open will call this function so the list will stay sane. */ if (!(dev->flags&IFF_UP)) return; if (!netif_device_present(dev)) return; if (!(dev->priv_flags & IFF_UNICAST_FLT)) { /* Unicast addresses changes may only happen under the rtnl, * therefore calling __dev_set_promiscuity here is safe. */ if (!netdev_uc_empty(dev) && !dev->uc_promisc) { __dev_set_promiscuity(dev, 1, false); dev->uc_promisc = true; } else if (netdev_uc_empty(dev) && dev->uc_promisc) { __dev_set_promiscuity(dev, -1, false); dev->uc_promisc = false; } } if (ops->ndo_set_rx_mode) ops->ndo_set_rx_mode(dev); } void dev_set_rx_mode(struct net_device *dev) { netif_addr_lock_bh(dev); __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); } /** * netif_get_flags() - get flags reported to userspace * @dev: device * * Get the combination of flag bits exported through APIs to userspace. */ unsigned int netif_get_flags(const struct net_device *dev) { unsigned int flags; flags = (READ_ONCE(dev->flags) & ~(IFF_PROMISC | IFF_ALLMULTI | IFF_RUNNING | IFF_LOWER_UP | IFF_DORMANT)) | (READ_ONCE(dev->gflags) & (IFF_PROMISC | IFF_ALLMULTI)); if (netif_running(dev)) { if (netif_oper_up(dev)) flags |= IFF_RUNNING; if (netif_carrier_ok(dev)) flags |= IFF_LOWER_UP; if (netif_dormant(dev)) flags |= IFF_DORMANT; } return flags; } EXPORT_SYMBOL(netif_get_flags); int __dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack) { unsigned int old_flags = dev->flags; int ret; ASSERT_RTNL(); /* * Set the flags on our device. */ dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | IFF_AUTOMEDIA)) | (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | IFF_ALLMULTI)); /* * Load in the correct multicast list now the flags have changed. */ if ((old_flags ^ flags) & IFF_MULTICAST) dev_change_rx_flags(dev, IFF_MULTICAST); dev_set_rx_mode(dev); /* * Have we downed the interface. We handle IFF_UP ourselves * according to user attempts to set it, rather than blindly * setting it. */ ret = 0; if ((old_flags ^ flags) & IFF_UP) { if (old_flags & IFF_UP) __dev_close(dev); else ret = __dev_open(dev, extack); } if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; old_flags = dev->flags; dev->gflags ^= IFF_PROMISC; if (__dev_set_promiscuity(dev, inc, false) >= 0) if (dev->flags != old_flags) dev_set_rx_mode(dev); } /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI * is important. Some (broken) drivers set IFF_PROMISC, when * IFF_ALLMULTI is requested not asking us and not reporting. */ if ((flags ^ dev->gflags) & IFF_ALLMULTI) { int inc = (flags & IFF_ALLMULTI) ? 
1 : -1; dev->gflags ^= IFF_ALLMULTI; netif_set_allmulti(dev, inc, false); } return ret; } void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, unsigned int gchanges, u32 portid, const struct nlmsghdr *nlh) { unsigned int changes = dev->flags ^ old_flags; if (gchanges) rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh); if (changes & IFF_UP) { if (dev->flags & IFF_UP) call_netdevice_notifiers(NETDEV_UP, dev); else call_netdevice_notifiers(NETDEV_DOWN, dev); } if (dev->flags & IFF_UP && (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { struct netdev_notifier_change_info change_info = { .info = { .dev = dev, }, .flags_changed = changes, }; call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); } } int netif_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack) { int ret; unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; ret = __dev_change_flags(dev, flags, extack); if (ret < 0) return ret; changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); __dev_notify_flags(dev, old_flags, changes, 0, NULL); return ret; } int __netif_set_mtu(struct net_device *dev, int new_mtu) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_change_mtu) return ops->ndo_change_mtu(dev, new_mtu); /* Pairs with all the lockless reads of dev->mtu in the stack */ WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL_NS_GPL(__netif_set_mtu, "NETDEV_INTERNAL"); int dev_validate_mtu(struct net_device *dev, int new_mtu, struct netlink_ext_ack *extack) { /* MTU must be positive, and in range */ if (new_mtu < 0 || new_mtu < dev->min_mtu) { NL_SET_ERR_MSG(extack, "mtu less than device minimum"); return -EINVAL; } if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) { NL_SET_ERR_MSG(extack, "mtu greater than device maximum"); return -EINVAL; } return 0; } /** * netif_set_mtu_ext() - Change maximum transfer unit * @dev: device * @new_mtu: new transfer unit * @extack: netlink extended ack * * Change the maximum transfer size of the network device. * * Return: 0 on success, -errno on failure. */ int netif_set_mtu_ext(struct net_device *dev, int new_mtu, struct netlink_ext_ack *extack) { int err, orig_mtu; netdev_ops_assert_locked(dev); if (new_mtu == dev->mtu) return 0; err = dev_validate_mtu(dev, new_mtu, extack); if (err) return err; if (!netif_device_present(dev)) return -ENODEV; err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); err = notifier_to_errno(err); if (err) return err; orig_mtu = dev->mtu; err = __netif_set_mtu(dev, new_mtu); if (!err) { err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, orig_mtu); err = notifier_to_errno(err); if (err) { /* setting mtu back and notifying everyone again, * so that they have a chance to revert changes. 
*/ __netif_set_mtu(dev, orig_mtu); call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, new_mtu); } } return err; } int netif_set_mtu(struct net_device *dev, int new_mtu) { struct netlink_ext_ack extack; int err; memset(&extack, 0, sizeof(extack)); err = netif_set_mtu_ext(dev, new_mtu, &extack); if (err && extack._msg) net_err_ratelimited("%s: %s\n", dev->name, extack._msg); return err; } EXPORT_SYMBOL(netif_set_mtu); int netif_change_tx_queue_len(struct net_device *dev, unsigned long new_len) { unsigned int orig_len = dev->tx_queue_len; int res; if (new_len != (unsigned int)new_len) return -ERANGE; if (new_len != orig_len) { WRITE_ONCE(dev->tx_queue_len, new_len); res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); res = notifier_to_errno(res); if (res) goto err_rollback; res = dev_qdisc_change_tx_queue_len(dev); if (res) goto err_rollback; } return 0; err_rollback: netdev_err(dev, "refused to change device tx_queue_len\n"); WRITE_ONCE(dev->tx_queue_len, orig_len); return res; } void netif_set_group(struct net_device *dev, int new_group) { dev->group = new_group; } /** * netif_pre_changeaddr_notify() - Call NETDEV_PRE_CHANGEADDR. * @dev: device * @addr: new address * @extack: netlink extended ack * * Return: 0 on success, -errno on failure. */ int netif_pre_changeaddr_notify(struct net_device *dev, const char *addr, struct netlink_ext_ack *extack) { struct netdev_notifier_pre_changeaddr_info info = { .info.dev = dev, .info.extack = extack, .dev_addr = addr, }; int rc; rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info); return notifier_to_errno(rc); } EXPORT_SYMBOL_NS_GPL(netif_pre_changeaddr_notify, "NETDEV_INTERNAL"); int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int err; if (!ops->ndo_set_mac_address) return -EOPNOTSUPP; if (ss->ss_family != dev->type) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; err = netif_pre_changeaddr_notify(dev, ss->__data, extack); if (err) return err; if (memcmp(dev->dev_addr, ss->__data, dev->addr_len)) { err = ops->ndo_set_mac_address(dev, ss); if (err) return err; } dev->addr_assign_type = NET_ADDR_SET; call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); add_device_randomness(dev->dev_addr, dev->addr_len); return 0; } DECLARE_RWSEM(dev_addr_sem); /* "sa" is a true struct sockaddr with limited "sa_data" member. 
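 * Its sa_data array is only sizeof(sa->sa_data) bytes long (14 for the
 * legacy struct sockaddr), which is why the copy below is clamped with
 * min_t() against dev->addr_len.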
*/ int netif_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) { size_t size = sizeof(sa->sa_data); struct net_device *dev; int ret = 0; down_read(&dev_addr_sem); rcu_read_lock(); dev = dev_get_by_name_rcu(net, dev_name); if (!dev) { ret = -ENODEV; goto unlock; } if (!dev->addr_len) memset(sa->sa_data, 0, size); else memcpy(sa->sa_data, dev->dev_addr, min_t(size_t, size, dev->addr_len)); sa->sa_family = dev->type; unlock: rcu_read_unlock(); up_read(&dev_addr_sem); return ret; } EXPORT_SYMBOL_NS_GPL(netif_get_mac_address, "NETDEV_INTERNAL"); int netif_change_carrier(struct net_device *dev, bool new_carrier) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_change_carrier) return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; return ops->ndo_change_carrier(dev, new_carrier); } /** * dev_get_phys_port_id - Get device physical port ID * @dev: device * @ppid: port ID * * Get device physical port ID */ int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_item_id *ppid) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_get_phys_port_id) return -EOPNOTSUPP; return ops->ndo_get_phys_port_id(dev, ppid); } /** * dev_get_phys_port_name - Get device physical port name * @dev: device * @name: port name * @len: limit of bytes to copy to name * * Get device physical port name */ int dev_get_phys_port_name(struct net_device *dev, char *name, size_t len) { const struct net_device_ops *ops = dev->netdev_ops; int err; if (ops->ndo_get_phys_port_name) { err = ops->ndo_get_phys_port_name(dev, name, len); if (err != -EOPNOTSUPP) return err; } return devlink_compat_phys_port_name_get(dev, name, len); } /** * netif_get_port_parent_id() - Get the device's port parent identifier * @dev: network device * @ppid: pointer to a storage for the port's parent identifier * @recurse: allow/disallow recursion to lower devices * * Get the devices's port parent identifier. * * Return: 0 on success, -errno on failure. 
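 *
 * A rough sketch of the driver-side callback this helper relies on
 * (hypothetical switch driver; the id source is illustrative):
 *
 *	static int foo_get_port_parent_id(struct net_device *dev,
 *					  struct netdev_phys_item_id *ppid)
 *	{
 *		struct foo_port *port = netdev_priv(dev);
 *
 *		ppid->id_len = sizeof(port->switch_id);
 *		memcpy(ppid->id, &port->switch_id, ppid->id_len);
 *		return 0;
 *	}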
*/ int netif_get_port_parent_id(struct net_device *dev, struct netdev_phys_item_id *ppid, bool recurse) { const struct net_device_ops *ops = dev->netdev_ops; struct netdev_phys_item_id first = { }; struct net_device *lower_dev; struct list_head *iter; int err; if (ops->ndo_get_port_parent_id) { err = ops->ndo_get_port_parent_id(dev, ppid); if (err != -EOPNOTSUPP) return err; } err = devlink_compat_switch_id_get(dev, ppid); if (!recurse || err != -EOPNOTSUPP) return err; netdev_for_each_lower_dev(dev, lower_dev, iter) { err = netif_get_port_parent_id(lower_dev, ppid, true); if (err) break; if (!first.id_len) first = *ppid; else if (memcmp(&first, ppid, sizeof(*ppid))) return -EOPNOTSUPP; } return err; } EXPORT_SYMBOL(netif_get_port_parent_id); /** * netdev_port_same_parent_id - Indicate if two network devices have * the same port parent identifier * @a: first network device * @b: second network device */ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) { struct netdev_phys_item_id a_id = { }; struct netdev_phys_item_id b_id = { }; if (netif_get_port_parent_id(a, &a_id, true) || netif_get_port_parent_id(b, &b_id, true)) return false; return netdev_phys_item_id_same(&a_id, &b_id); } EXPORT_SYMBOL(netdev_port_same_parent_id); int netif_change_proto_down(struct net_device *dev, bool proto_down) { if (!dev->change_proto_down) return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; if (proto_down) netif_carrier_off(dev); else netif_carrier_on(dev); WRITE_ONCE(dev->proto_down, proto_down); return 0; } /** * netdev_change_proto_down_reason_locked - proto down reason * * @dev: device * @mask: proto down mask * @value: proto down value */ void netdev_change_proto_down_reason_locked(struct net_device *dev, unsigned long mask, u32 value) { u32 proto_down_reason; int b; if (!mask) { proto_down_reason = value; } else { proto_down_reason = dev->proto_down_reason; for_each_set_bit(b, &mask, 32) { if (value & (1 << b)) proto_down_reason |= BIT(b); else proto_down_reason &= ~BIT(b); } } WRITE_ONCE(dev->proto_down_reason, proto_down_reason); } struct bpf_xdp_link { struct bpf_link link; struct net_device *dev; /* protected by rtnl_lock, no refcnt held */ int flags; }; static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags) { if (flags & XDP_FLAGS_HW_MODE) return XDP_MODE_HW; if (flags & XDP_FLAGS_DRV_MODE) return XDP_MODE_DRV; if (flags & XDP_FLAGS_SKB_MODE) return XDP_MODE_SKB; return dev->netdev_ops->ndo_bpf ? 
XDP_MODE_DRV : XDP_MODE_SKB; } static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) { switch (mode) { case XDP_MODE_SKB: return generic_xdp_install; case XDP_MODE_DRV: case XDP_MODE_HW: return dev->netdev_ops->ndo_bpf; default: return NULL; } } static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev, enum bpf_xdp_mode mode) { return dev->xdp_state[mode].link; } static struct bpf_prog *dev_xdp_prog(struct net_device *dev, enum bpf_xdp_mode mode) { struct bpf_xdp_link *link = dev_xdp_link(dev, mode); if (link) return link->link.prog; return dev->xdp_state[mode].prog; } u8 dev_xdp_prog_count(struct net_device *dev) { u8 count = 0; int i; for (i = 0; i < __MAX_XDP_MODE; i++) if (dev->xdp_state[i].prog || dev->xdp_state[i].link) count++; return count; } EXPORT_SYMBOL_GPL(dev_xdp_prog_count); u8 dev_xdp_sb_prog_count(struct net_device *dev) { u8 count = 0; int i; for (i = 0; i < __MAX_XDP_MODE; i++) if (dev->xdp_state[i].prog && !dev->xdp_state[i].prog->aux->xdp_has_frags) count++; return count; } int netif_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) { if (!dev->netdev_ops->ndo_bpf) return -EOPNOTSUPP; if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && bpf->command == XDP_SETUP_PROG && bpf->prog && !bpf->prog->aux->xdp_has_frags) { NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using tcp-data-split"); return -EBUSY; } if (dev_get_min_mp_channel_count(dev)) { NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using memory provider"); return -EBUSY; } return dev->netdev_ops->ndo_bpf(dev, bpf); } EXPORT_SYMBOL_GPL(netif_xdp_propagate); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) { struct bpf_prog *prog = dev_xdp_prog(dev, mode); return prog ? prog->aux->id : 0; } static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode, struct bpf_xdp_link *link) { dev->xdp_state[mode].link = link; dev->xdp_state[mode].prog = NULL; } static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode, struct bpf_prog *prog) { dev->xdp_state[mode].link = NULL; dev->xdp_state[mode].prog = prog; } static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, bpf_op_t bpf_op, struct netlink_ext_ack *extack, u32 flags, struct bpf_prog *prog) { struct netdev_bpf xdp; int err; netdev_ops_assert_locked(dev); if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && prog && !prog->aux->xdp_has_frags) { NL_SET_ERR_MSG(extack, "unable to install XDP to device using tcp-data-split"); return -EBUSY; } if (dev_get_min_mp_channel_count(dev)) { NL_SET_ERR_MSG(extack, "unable to install XDP to device using memory provider"); return -EBUSY; } memset(&xdp, 0, sizeof(xdp)); xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG; xdp.extack = extack; xdp.flags = flags; xdp.prog = prog; /* Drivers assume refcnt is already incremented (i.e, prog pointer is * "moved" into driver), so they don't increment it on their own, but * they do decrement refcnt when program is detached or replaced. * Given net_device also owns link/prog, we need to bump refcnt here * to prevent drivers from underflowing it. 
*/ if (prog) bpf_prog_inc(prog); err = bpf_op(dev, &xdp); if (err) { if (prog) bpf_prog_put(prog); return err; } if (mode != XDP_MODE_HW) bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog); return 0; } static void dev_xdp_uninstall(struct net_device *dev) { struct bpf_xdp_link *link; struct bpf_prog *prog; enum bpf_xdp_mode mode; bpf_op_t bpf_op; ASSERT_RTNL(); for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) { prog = dev_xdp_prog(dev, mode); if (!prog) continue; bpf_op = dev_xdp_bpf_op(dev, mode); if (!bpf_op) continue; WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); /* auto-detach link from net device */ link = dev_xdp_link(dev, mode); if (link) link->dev = NULL; else bpf_prog_put(prog); dev_xdp_set_link(dev, mode, NULL); } } static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack, struct bpf_xdp_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog, u32 flags) { unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES); struct bpf_prog *cur_prog; struct net_device *upper; struct list_head *iter; enum bpf_xdp_mode mode; bpf_op_t bpf_op; int err; ASSERT_RTNL(); /* either link or prog attachment, never both */ if (link && (new_prog || old_prog)) return -EINVAL; /* link supports only XDP mode flags */ if (link && (flags & ~XDP_FLAGS_MODES)) { NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment"); return -EINVAL; } /* just one XDP mode bit should be set, zero defaults to drv/skb mode */ if (num_modes > 1) { NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set"); return -EINVAL; } /* avoid ambiguity if offload + drv/skb mode progs are both loaded */ if (!num_modes && dev_xdp_prog_count(dev) > 1) { NL_SET_ERR_MSG(extack, "More than one program loaded, unset mode is ambiguous"); return -EINVAL; } /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */ if (old_prog && !(flags & XDP_FLAGS_REPLACE)) { NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified"); return -EINVAL; } mode = dev_xdp_mode(dev, flags); /* can't replace attached link */ if (dev_xdp_link(dev, mode)) { NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link"); return -EBUSY; } /* don't allow if an upper device already has a program */ netdev_for_each_upper_dev_rcu(dev, upper, iter) { if (dev_xdp_prog_count(upper) > 0) { NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program"); return -EEXIST; } } cur_prog = dev_xdp_prog(dev, mode); /* can't replace attached prog with link */ if (link && cur_prog) { NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link"); return -EBUSY; } if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) { NL_SET_ERR_MSG(extack, "Active program does not match expected"); return -EEXIST; } /* put effective new program into new_prog */ if (link) new_prog = link->link.prog; if (new_prog) { bool offload = mode == XDP_MODE_HW; enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB ? 
XDP_MODE_DRV : XDP_MODE_SKB; if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) { NL_SET_ERR_MSG(extack, "XDP program already attached"); return -EBUSY; } if (!offload && dev_xdp_prog(dev, other_mode)) { NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time"); return -EEXIST; } if (!offload && bpf_prog_is_offloaded(new_prog->aux)) { NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported"); return -EINVAL; } if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) { NL_SET_ERR_MSG(extack, "Program bound to different device"); return -EINVAL; } if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) { NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode"); return -EINVAL; } if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) { NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); return -EINVAL; } if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) { NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device"); return -EINVAL; } } /* don't call drivers if the effective program didn't change */ if (new_prog != cur_prog) { bpf_op = dev_xdp_bpf_op(dev, mode); if (!bpf_op) { NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode"); return -EOPNOTSUPP; } err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog); if (err) return err; } if (link) dev_xdp_set_link(dev, mode, link); else dev_xdp_set_prog(dev, mode, new_prog); if (cur_prog) bpf_prog_put(cur_prog); return 0; } static int dev_xdp_attach_link(struct net_device *dev, struct netlink_ext_ack *extack, struct bpf_xdp_link *link) { return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags); } static int dev_xdp_detach_link(struct net_device *dev, struct netlink_ext_ack *extack, struct bpf_xdp_link *link) { enum bpf_xdp_mode mode; bpf_op_t bpf_op; ASSERT_RTNL(); mode = dev_xdp_mode(dev, link->flags); if (dev_xdp_link(dev, mode) != link) return -EINVAL; bpf_op = dev_xdp_bpf_op(dev, mode); WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); dev_xdp_set_link(dev, mode, NULL); return 0; } static void bpf_xdp_link_release(struct bpf_link *link) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); rtnl_lock(); /* if racing with net_device's tear down, xdp_link->dev might be * already NULL, in which case link was already auto-detached */ if (xdp_link->dev) { netdev_lock_ops(xdp_link->dev); WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); netdev_unlock_ops(xdp_link->dev); xdp_link->dev = NULL; } rtnl_unlock(); } static int bpf_xdp_link_detach(struct bpf_link *link) { bpf_xdp_link_release(link); return 0; } static void bpf_xdp_link_dealloc(struct bpf_link *link) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); kfree(xdp_link); } static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); u32 ifindex = 0; rtnl_lock(); if (xdp_link->dev) ifindex = xdp_link->dev->ifindex; rtnl_unlock(); seq_printf(seq, "ifindex:\t%u\n", ifindex); } static int bpf_xdp_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); u32 ifindex = 0; rtnl_lock(); if (xdp_link->dev) ifindex = xdp_link->dev->ifindex; rtnl_unlock(); info->xdp.ifindex = ifindex; return 0; } static int 
bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); enum bpf_xdp_mode mode; bpf_op_t bpf_op; int err = 0; rtnl_lock(); /* link might have been auto-released already, so fail */ if (!xdp_link->dev) { err = -ENOLINK; goto out_unlock; } if (old_prog && link->prog != old_prog) { err = -EPERM; goto out_unlock; } old_prog = link->prog; if (old_prog->type != new_prog->type || old_prog->expected_attach_type != new_prog->expected_attach_type) { err = -EINVAL; goto out_unlock; } if (old_prog == new_prog) { /* no-op, don't disturb drivers */ bpf_prog_put(new_prog); goto out_unlock; } netdev_lock_ops(xdp_link->dev); mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags); bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode); err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL, xdp_link->flags, new_prog); netdev_unlock_ops(xdp_link->dev); if (err) goto out_unlock; old_prog = xchg(&link->prog, new_prog); bpf_prog_put(old_prog); out_unlock: rtnl_unlock(); return err; } static const struct bpf_link_ops bpf_xdp_link_lops = { .release = bpf_xdp_link_release, .dealloc = bpf_xdp_link_dealloc, .detach = bpf_xdp_link_detach, .show_fdinfo = bpf_xdp_link_show_fdinfo, .fill_link_info = bpf_xdp_link_fill_link_info, .update_prog = bpf_xdp_link_update, }; int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct net *net = current->nsproxy->net_ns; struct bpf_link_primer link_primer; struct netlink_ext_ack extack = {}; struct bpf_xdp_link *link; struct net_device *dev; int err, fd; rtnl_lock(); dev = dev_get_by_index(net, attr->link_create.target_ifindex); if (!dev) { rtnl_unlock(); return -EINVAL; } link = kzalloc_obj(*link, GFP_USER); if (!link) { err = -ENOMEM; goto unlock; } bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog, attr->link_create.attach_type); link->dev = dev; link->flags = attr->link_create.flags; err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); goto unlock; } netdev_lock_ops(dev); err = dev_xdp_attach_link(dev, &extack, link); netdev_unlock_ops(dev); rtnl_unlock(); if (err) { link->dev = NULL; bpf_link_cleanup(&link_primer); trace_bpf_xdp_link_attach_failed(extack._msg); goto out_put_dev; } fd = bpf_link_settle(&link_primer); /* link itself doesn't hold dev's refcnt to not complicate shutdown */ dev_put(dev); return fd; unlock: rtnl_unlock(); out_put_dev: dev_put(dev); return err; } /** * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device * @extack: netlink extended ack * @fd: new program fd or negative value to clear * @expected_fd: old program fd that userspace expects to replace or clear * @flags: xdp-related flags * * Set or clear a bpf program for a device */ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, int expected_fd, u32 flags) { enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags); struct bpf_prog *new_prog = NULL, *old_prog = NULL; int err; ASSERT_RTNL(); if (fd >= 0) { new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, mode != XDP_MODE_SKB); if (IS_ERR(new_prog)) return PTR_ERR(new_prog); } if (expected_fd >= 0) { old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP, mode != XDP_MODE_SKB); if (IS_ERR(old_prog)) { err = PTR_ERR(old_prog); old_prog = NULL; goto err_out; } } err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags); err_out: if (err && new_prog) bpf_prog_put(new_prog); if (old_prog) 
bpf_prog_put(old_prog); return err; } u32 dev_get_min_mp_channel_count(const struct net_device *dev) { int i; netdev_ops_assert_locked(dev); for (i = dev->real_num_rx_queues - 1; i >= 0; i--) if (dev->_rx[i].mp_params.mp_priv) /* The channel count is the idx plus 1. */ return i + 1; return 0; } /** * dev_index_reserve() - allocate an ifindex in a namespace * @net: the applicable net namespace * @ifindex: requested ifindex, pass %0 to get one allocated * * Allocate a ifindex for a new device. Caller must either use the ifindex * to store the device (via list_netdevice()) or call dev_index_release() * to give the index up. * * Return: a suitable unique value for a new device interface number or -errno. */ static int dev_index_reserve(struct net *net, u32 ifindex) { int err; if (ifindex > INT_MAX) { DEBUG_NET_WARN_ON_ONCE(1); return -EINVAL; } if (!ifindex) err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL, xa_limit_31b, &net->ifindex, GFP_KERNEL); else err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL); if (err < 0) return err; return ifindex; } static void dev_index_release(struct net *net, int ifindex) { /* Expect only unused indexes, unlist_netdevice() removes the used */ WARN_ON(xa_erase(&net->dev_by_index, ifindex)); } static bool from_cleanup_net(void) { #ifdef CONFIG_NET_NS return current == READ_ONCE(cleanup_net_task); #else return false; #endif } /* Delayed registration/unregisteration */ LIST_HEAD(net_todo_list); DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); atomic_t dev_unreg_count = ATOMIC_INIT(0); static void net_set_todo(struct net_device *dev) { list_add_tail(&dev->todo_list, &net_todo_list); } static netdev_features_t netdev_sync_upper_features(struct net_device *lower, struct net_device *upper, netdev_features_t features) { netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; netdev_features_t feature; int feature_bit; for_each_netdev_feature(upper_disables, feature_bit) { feature = __NETIF_F_BIT(feature_bit); if (!(upper->wanted_features & feature) && (features & feature)) { netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n", &feature, upper->name); features &= ~feature; } } return features; } static void netdev_sync_lower_features(struct net_device *upper, struct net_device *lower, netdev_features_t features) { netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; netdev_features_t feature; int feature_bit; for_each_netdev_feature(upper_disables, feature_bit) { feature = __NETIF_F_BIT(feature_bit); if (!(features & feature) && (lower->features & feature)) { netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", &feature, lower->name); netdev_lock_ops(lower); lower->wanted_features &= ~feature; __netdev_update_features(lower); if (unlikely(lower->features & feature)) netdev_WARN(upper, "failed to disable %pNF on %s!\n", &feature, lower->name); else netdev_features_change(lower); netdev_unlock_ops(lower); } } } static bool netdev_has_ip_or_hw_csum(netdev_features_t features) { netdev_features_t ip_csum_mask = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; bool ip_csum = (features & ip_csum_mask) == ip_csum_mask; bool hw_csum = features & NETIF_F_HW_CSUM; return ip_csum || hw_csum; } static netdev_features_t netdev_fix_features(struct net_device *dev, netdev_features_t features) { /* Fix illegal checksum combinations */ if ((features & NETIF_F_HW_CSUM) && (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { netdev_warn(dev, "mixed HW and IP checksum settings.\n"); features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); } /* TSO 
requires that SG is present as well. */ if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); features &= ~NETIF_F_ALL_TSO; } if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) && !(features & NETIF_F_IP_CSUM)) { netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n"); features &= ~NETIF_F_TSO; features &= ~NETIF_F_TSO_ECN; } if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) && !(features & NETIF_F_IPV6_CSUM)) { netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n"); features &= ~NETIF_F_TSO6; } /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */ if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO)) features &= ~NETIF_F_TSO_MANGLEID; /* TSO ECN requires that TSO is present as well. */ if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) features &= ~NETIF_F_TSO_ECN; /* Software GSO depends on SG. */ if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); features &= ~NETIF_F_GSO; } /* GSO partial features require GSO partial be set */ if ((features & dev->gso_partial_features) && !(features & NETIF_F_GSO_PARTIAL)) { netdev_dbg(dev, "Dropping partially supported GSO features since no GSO partial.\n"); features &= ~dev->gso_partial_features; } if (!(features & NETIF_F_RXCSUM)) { /* NETIF_F_GRO_HW implies doing RXCSUM since every packet * successfully merged by hardware must also have the * checksum verified by hardware. If the user does not * want to enable RXCSUM, logically, we should disable GRO_HW. */ if (features & NETIF_F_GRO_HW) { netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n"); features &= ~NETIF_F_GRO_HW; } } /* LRO/HW-GRO features cannot be combined with RX-FCS */ if (features & NETIF_F_RXFCS) { if (features & NETIF_F_LRO) { netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n"); features &= ~NETIF_F_LRO; } if (features & NETIF_F_GRO_HW) { netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n"); features &= ~NETIF_F_GRO_HW; } } if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) { netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n"); features &= ~NETIF_F_LRO; } if ((features & NETIF_F_HW_TLS_TX) && !netdev_has_ip_or_hw_csum(features)) { netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); features &= ~NETIF_F_HW_TLS_TX; } if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) { netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n"); features &= ~NETIF_F_HW_TLS_RX; } if ((features & NETIF_F_GSO_UDP_L4) && !netdev_has_ip_or_hw_csum(features)) { netdev_dbg(dev, "Dropping USO feature since no CSUM feature.\n"); features &= ~NETIF_F_GSO_UDP_L4; } return features; } int __netdev_update_features(struct net_device *dev) { struct net_device *upper, *lower; netdev_features_t features; struct list_head *iter; int err = -1; ASSERT_RTNL(); netdev_ops_assert_locked(dev); features = netdev_get_wanted_features(dev); if (dev->netdev_ops->ndo_fix_features) features = dev->netdev_ops->ndo_fix_features(dev, features); /* driver might be less strict about feature dependencies */ features = netdev_fix_features(dev, features); /* some features can't be enabled if they're off on an upper device */ netdev_for_each_upper_dev_rcu(dev, upper, iter) features = netdev_sync_upper_features(dev, upper, features); if (dev->features == features) goto sync_lower; 
netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", &dev->features, &features); if (dev->netdev_ops->ndo_set_features) err = dev->netdev_ops->ndo_set_features(dev, features); else err = 0; if (unlikely(err < 0)) { netdev_err(dev, "set_features() failed (%d); wanted %pNF, left %pNF\n", err, &features, &dev->features); /* return non-0 since some features might have changed and * it's better to fire a spurious notification than miss it */ return -1; } sync_lower: /* some features must be disabled on lower devices when disabled * on an upper device (think: bonding master or bridge) */ netdev_for_each_lower_dev(dev, lower, iter) netdev_sync_lower_features(dev, lower, features); if (!err) { netdev_features_t diff = features ^ dev->features; if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) { /* udp_tunnel_{get,drop}_rx_info both need * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the * device, or they won't do anything. * Thus we need to update dev->features * *before* calling udp_tunnel_get_rx_info, * but *after* calling udp_tunnel_drop_rx_info. */ udp_tunnel_nic_lock(dev); if (features & NETIF_F_RX_UDP_TUNNEL_PORT) { dev->features = features; udp_tunnel_get_rx_info(dev); } else { udp_tunnel_drop_rx_info(dev); } udp_tunnel_nic_unlock(dev); } if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) { if (features & NETIF_F_HW_VLAN_CTAG_FILTER) { dev->features = features; err |= vlan_get_rx_ctag_filter_info(dev); } else { vlan_drop_rx_ctag_filter_info(dev); } } if (diff & NETIF_F_HW_VLAN_STAG_FILTER) { if (features & NETIF_F_HW_VLAN_STAG_FILTER) { dev->features = features; err |= vlan_get_rx_stag_filter_info(dev); } else { vlan_drop_rx_stag_filter_info(dev); } } dev->features = features; } return err < 0 ? 0 : 1; } /** * netdev_update_features - recalculate device features * @dev: the device to check * * Recalculate dev->features set and send notifications if it * has changed. Should be called after driver or hardware dependent * conditions might have changed that influence the features. */ void netdev_update_features(struct net_device *dev) { if (__netdev_update_features(dev)) netdev_features_change(dev); } EXPORT_SYMBOL(netdev_update_features); /** * netdev_change_features - recalculate device features * @dev: the device to check * * Recalculate dev->features set and send notifications even * if they have not changed. Should be called instead of * netdev_update_features() if also dev->vlan_features might * have changed to allow the changes to be propagated to stacked * VLAN devices. */ void netdev_change_features(struct net_device *dev) { __netdev_update_features(dev); netdev_features_change(dev); } EXPORT_SYMBOL(netdev_change_features); /** * netif_stacked_transfer_operstate - transfer operstate * @rootdev: the root or lower level device to transfer state from * @dev: the device to transfer operstate to * * Transfer operational state from root to device. This is normally * called when a stacking relationship exists between the root * device and the device(a leaf device). 
*/ void netif_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev) { if (rootdev->operstate == IF_OPER_DORMANT) netif_dormant_on(dev); else netif_dormant_off(dev); if (rootdev->operstate == IF_OPER_TESTING) netif_testing_on(dev); else netif_testing_off(dev); if (netif_carrier_ok(rootdev)) netif_carrier_on(dev); else netif_carrier_off(dev); } EXPORT_SYMBOL(netif_stacked_transfer_operstate); static int netif_alloc_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; struct netdev_rx_queue *rx; size_t sz = count * sizeof(*rx); int err = 0; BUG_ON(count < 1); rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!rx) return -ENOMEM; dev->_rx = rx; for (i = 0; i < count; i++) { rx[i].dev = dev; /* XDP RX-queue setup */ err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0); if (err < 0) goto err_rxq_info; } return 0; err_rxq_info: /* Rollback successful reg's and free other resources */ while (i--) xdp_rxq_info_unreg(&rx[i].xdp_rxq); kvfree(dev->_rx); dev->_rx = NULL; return err; } static void netif_free_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */ if (!dev->_rx) return; for (i = 0; i < count; i++) xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq); kvfree(dev->_rx); } static void netdev_init_one_queue(struct net_device *dev, struct netdev_queue *queue, void *_unused) { /* Initialize queue lock */ spin_lock_init(&queue->_xmit_lock); netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); queue->xmit_lock_owner = -1; netdev_queue_numa_node_write(queue, NUMA_NO_NODE); queue->dev = dev; #ifdef CONFIG_BQL dql_init(&queue->dql, HZ); #endif } static void netif_free_tx_queues(struct net_device *dev) { kvfree(dev->_tx); } static int netif_alloc_netdev_queues(struct net_device *dev) { unsigned int count = dev->num_tx_queues; struct netdev_queue *tx; size_t sz = count * sizeof(*tx); if (count < 1 || count > 0xffff) return -EINVAL; tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!tx) return -ENOMEM; dev->_tx = tx; netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); spin_lock_init(&dev->tx_global_lock); return 0; } void netif_tx_stop_all_queues(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); netif_tx_stop_queue(txq); } } EXPORT_SYMBOL(netif_tx_stop_all_queues); static int netdev_do_alloc_pcpu_stats(struct net_device *dev) { void __percpu *v; /* Drivers implementing ndo_get_peer_dev must support tstat * accounting, so that skb_do_redirect() can bump the dev's * RX stats upon network namespace switch. */ if (dev->netdev_ops->ndo_get_peer_dev && dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS) return -EOPNOTSUPP; switch (dev->pcpu_stat_type) { case NETDEV_PCPU_STAT_NONE: return 0; case NETDEV_PCPU_STAT_LSTATS: v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); break; case NETDEV_PCPU_STAT_TSTATS: v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); break; case NETDEV_PCPU_STAT_DSTATS: v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats); break; default: return -EINVAL; } return v ? 
0 : -ENOMEM; } static void netdev_do_free_pcpu_stats(struct net_device *dev) { switch (dev->pcpu_stat_type) { case NETDEV_PCPU_STAT_NONE: return; case NETDEV_PCPU_STAT_LSTATS: free_percpu(dev->lstats); break; case NETDEV_PCPU_STAT_TSTATS: free_percpu(dev->tstats); break; case NETDEV_PCPU_STAT_DSTATS: free_percpu(dev->dstats); break; } } static void netdev_free_phy_link_topology(struct net_device *dev) { struct phy_link_topology *topo = dev->link_topo; if (IS_ENABLED(CONFIG_PHYLIB) && topo) { xa_destroy(&topo->phys); kfree(topo); dev->link_topo = NULL; } } /** * register_netdevice() - register a network device * @dev: device to register * * Take a prepared network device structure and make it externally accessible. * A %NETDEV_REGISTER message is sent to the netdev notifier chain. * Callers must hold the rtnl lock - you may want register_netdev() * instead of this. */ int register_netdevice(struct net_device *dev) { int ret; struct net *net = dev_net(dev); BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE < NETDEV_FEATURE_COUNT); BUG_ON(dev_boot_phase); ASSERT_RTNL(); might_sleep(); /* When net_device's are persistent, this will be fatal. */ BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); BUG_ON(!net); ret = ethtool_check_ops(dev->ethtool_ops); if (ret) return ret; /* rss ctx ID 0 is reserved for the default context, start from 1 */ xa_init_flags(&dev->ethtool->rss_ctx, XA_FLAGS_ALLOC1); mutex_init(&dev->ethtool->rss_lock); spin_lock_init(&dev->addr_list_lock); netdev_set_addr_lockdep_class(dev); ret = dev_get_valid_name(net, dev, dev->name); if (ret < 0) goto out; ret = -ENOMEM; dev->name_node = netdev_name_node_head_alloc(dev); if (!dev->name_node) goto out; /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); if (ret) { if (ret > 0) ret = -EIO; goto err_free_name; } } if (((dev->hw_features | dev->features) & NETIF_F_HW_VLAN_CTAG_FILTER) && (!dev->netdev_ops->ndo_vlan_rx_add_vid || !dev->netdev_ops->ndo_vlan_rx_kill_vid)) { netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n"); ret = -EINVAL; goto err_uninit; } ret = netdev_do_alloc_pcpu_stats(dev); if (ret) goto err_uninit; ret = dev_index_reserve(net, dev->ifindex); if (ret < 0) goto err_free_pcpu; dev->ifindex = ret; /* Transfer changeable features to wanted_features and enable * software offloads (GSO and GRO). */ dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF); dev->features |= NETIF_F_SOFT_FEATURES; if (dev->udp_tunnel_nic_info) { dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT; dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT; } dev->wanted_features = dev->features & dev->hw_features; if (!(dev->flags & IFF_LOOPBACK)) dev->hw_features |= NETIF_F_NOCACHE_COPY; /* If IPv4 TCP segmentation offload is supported we should also * allow the device to enable segmenting the frame with the option * of ignoring a static IP ID value. This doesn't enable the * feature itself but allows the user to enable it later. */ if (dev->hw_features & NETIF_F_TSO) dev->hw_features |= NETIF_F_TSO_MANGLEID; if (dev->vlan_features & NETIF_F_TSO) dev->vlan_features |= NETIF_F_TSO_MANGLEID; if (dev->mpls_features & NETIF_F_TSO) dev->mpls_features |= NETIF_F_TSO_MANGLEID; if (dev->hw_enc_features & NETIF_F_TSO) dev->hw_enc_features |= NETIF_F_TSO_MANGLEID; /* TSO_MANGLEID belongs in mangleid_features by definition */ dev->mangleid_features |= NETIF_F_TSO_MANGLEID; /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. 
*/ dev->vlan_features |= NETIF_F_HIGHDMA; /* Make NETIF_F_SG inheritable to tunnel devices. */ dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL; /* Make NETIF_F_SG inheritable to MPLS. */ dev->mpls_features |= NETIF_F_SG; ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); if (ret) goto err_ifindex_release; ret = netdev_register_kobject(dev); netdev_lock(dev); WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED); netdev_unlock(dev); if (ret) goto err_uninit_notify; netdev_lock_ops(dev); __netdev_update_features(dev); netdev_unlock_ops(dev); /* * Default initial state at registry is that the * device is present. */ set_bit(__LINK_STATE_PRESENT, &dev->state); linkwatch_init_dev(dev); dev_init_scheduler(dev); netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL); list_netdevice(dev); add_device_randomness(dev->dev_addr, dev->addr_len); /* If the device has permanent device address, driver should * set dev_addr and also addr_assign_type should be set to * NET_ADDR_PERM (default value). */ if (dev->addr_assign_type == NET_ADDR_PERM) memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); /* Notify protocols, that a new device appeared. */ netdev_lock_ops(dev); ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); netdev_unlock_ops(dev); ret = notifier_to_errno(ret); if (ret) { /* Expect explicit free_netdev() on failure */ dev->needs_free_netdev = false; unregister_netdevice_queue(dev, NULL); goto out; } /* * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. */ if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing)) rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); out: return ret; err_uninit_notify: call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); err_ifindex_release: dev_index_release(net, dev->ifindex); err_free_pcpu: netdev_do_free_pcpu_stats(dev); err_uninit: if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); if (dev->priv_destructor) dev->priv_destructor(dev); err_free_name: netdev_name_node_free(dev->name_node); goto out; } EXPORT_SYMBOL(register_netdevice); /* Initialize the core of a dummy net device. * The setup steps dummy netdevs need which normal netdevs get by going * through register_netdevice(). */ static void init_dummy_netdev(struct net_device *dev) { /* make sure we BUG if trying to hit standard * register/unregister code path */ dev->reg_state = NETREG_DUMMY; /* a dummy interface is started by default */ set_bit(__LINK_STATE_PRESENT, &dev->state); set_bit(__LINK_STATE_START, &dev->state); /* Note : We dont allocate pcpu_refcnt for dummy devices, * because users of this 'device' dont need to change * its refcount. */ } /** * register_netdev - register a network device * @dev: device to register * * Take a completed network device structure and add it to the kernel * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier * chain. 0 is returned on success. A negative errno code is returned * on a failure to set up the device, or if the name is a duplicate. * * This is a wrapper around register_netdevice that takes the rtnl semaphore * and expands the device name if you passed a format string to * alloc_netdev. 
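 *
 * Illustrative sketch (hypothetical driver probe; the "foo" names are made
 * up):
 *
 *	dev = alloc_etherdev(sizeof(struct foo_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	dev->netdev_ops = &foo_netdev_ops;
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}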
*/ int register_netdev(struct net_device *dev) { struct net *net = dev_net(dev); int err; if (rtnl_net_lock_killable(net)) return -EINTR; err = register_netdevice(dev); rtnl_net_unlock(net); return err; } EXPORT_SYMBOL(register_netdev); int netdev_refcnt_read(const struct net_device *dev) { #ifdef CONFIG_PCPU_DEV_REFCNT int i, refcnt = 0; for_each_possible_cpu(i) refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); return refcnt; #else return refcount_read(&dev->dev_refcnt); #endif } EXPORT_SYMBOL(netdev_refcnt_read); int netdev_unregister_timeout_secs __read_mostly = 10; #define WAIT_REFS_MIN_MSECS 1 #define WAIT_REFS_MAX_MSECS 250 /** * netdev_wait_allrefs_any - wait until all references are gone. * @list: list of net_devices to wait on * * This is called when unregistering network devices. * * Any protocol or device that holds a reference should register * for netdevice notification, and cleanup and put back the * reference if they receive an UNREGISTER event. * We can get stuck here if buggy protocols don't correctly * call dev_put. */ static struct net_device *netdev_wait_allrefs_any(struct list_head *list) { unsigned long rebroadcast_time, warning_time; struct net_device *dev; int wait = 0; rebroadcast_time = warning_time = jiffies; list_for_each_entry(dev, list, todo_list) if (netdev_refcnt_read(dev) == 1) return dev; while (true) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { rtnl_lock(); /* Rebroadcast unregister notification */ list_for_each_entry(dev, list, todo_list) call_netdevice_notifiers(NETDEV_UNREGISTER, dev); __rtnl_unlock(); rcu_barrier(); rtnl_lock(); list_for_each_entry(dev, list, todo_list) if (test_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { /* We must not have linkwatch events * pending on unregister. If this * happens, we simply run the queue * unscheduled, resulting in a noop * for this device. */ linkwatch_run_queue(); break; } __rtnl_unlock(); rebroadcast_time = jiffies; } rcu_barrier(); if (!wait) { wait = WAIT_REFS_MIN_MSECS; } else { msleep(wait); wait = min(wait << 1, WAIT_REFS_MAX_MSECS); } list_for_each_entry(dev, list, todo_list) if (netdev_refcnt_read(dev) == 1) return dev; if (time_after(jiffies, warning_time + READ_ONCE(netdev_unregister_timeout_secs) * HZ)) { list_for_each_entry(dev, list, todo_list) { pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", dev->name, netdev_refcnt_read(dev)); ref_tracker_dir_print(&dev->refcnt_tracker, 10); } warning_time = jiffies; } } } /* The sequence is: * * rtnl_lock(); * ... * register_netdevice(x1); * register_netdevice(x2); * ... * unregister_netdevice(y1); * unregister_netdevice(y2); * ... * rtnl_unlock(); * free_netdev(y1); * free_netdev(y2); * * We are invoked by rtnl_unlock(). * This allows us to deal with problems: * 1) We can delete sysfs objects which invoke hotplug * without deadlocking with linkwatch via keventd. * 2) Since we run with the RTNL semaphore not held, we can sleep * safely in order to wait for the netdev refcnt to drop to zero. * * We must not return until all unregister events added during * the interval the lock was held have been completed. 
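 *
 * For example, the matching driver-side teardown (illustrative sketch, not
 * part of this file) follows the same sequence:
 *
 *	rtnl_lock();
 *	unregister_netdevice(dev);	(queues dev on the todo list)
 *	rtnl_unlock();			(invokes netdev_run_todo(), waits for refs)
 *	free_netdev(dev);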
*/ void netdev_run_todo(void) { struct net_device *dev, *tmp; struct list_head list; int cnt; #ifdef CONFIG_LOCKDEP struct list_head unlink_list; list_replace_init(&net_unlink_list, &unlink_list); while (!list_empty(&unlink_list)) { dev = list_first_entry(&unlink_list, struct net_device, unlink_list); list_del_init(&dev->unlink_list); dev->nested_level = dev->lower_level - 1; } #endif /* Snapshot list, allow later requests */ list_replace_init(&net_todo_list, &list); __rtnl_unlock(); /* Wait for rcu callbacks to finish before next phase */ if (!list_empty(&list)) rcu_barrier(); list_for_each_entry_safe(dev, tmp, &list, todo_list) { if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { netdev_WARN(dev, "run_todo but not unregistering\n"); list_del(&dev->todo_list); continue; } netdev_lock(dev); WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED); netdev_unlock(dev); linkwatch_sync_dev(dev); } cnt = 0; while (!list_empty(&list)) { dev = netdev_wait_allrefs_any(&list); list_del(&dev->todo_list); /* paranoia */ BUG_ON(netdev_refcnt_read(dev) != 1); BUG_ON(!list_empty(&dev->ptype_all)); BUG_ON(!list_empty(&dev->ptype_specific)); WARN_ON(rcu_access_pointer(dev->ip_ptr)); WARN_ON(rcu_access_pointer(dev->ip6_ptr)); netdev_do_free_pcpu_stats(dev); if (dev->priv_destructor) dev->priv_destructor(dev); if (dev->needs_free_netdev) free_netdev(dev); cnt++; /* Free network device */ kobject_put(&dev->dev.kobj); } if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count)) wake_up(&netdev_unregistering_wq); } /* Collate per-cpu network dstats statistics * * Read per-cpu network statistics from dev->dstats and populate the related * fields in @s. */ static void dev_fetch_dstats(struct rtnl_link_stats64 *s, const struct pcpu_dstats __percpu *dstats) { int cpu; for_each_possible_cpu(cpu) { u64 rx_packets, rx_bytes, rx_drops; u64 tx_packets, tx_bytes, tx_drops; const struct pcpu_dstats *stats; unsigned int start; stats = per_cpu_ptr(dstats, cpu); do { start = u64_stats_fetch_begin(&stats->syncp); rx_packets = u64_stats_read(&stats->rx_packets); rx_bytes = u64_stats_read(&stats->rx_bytes); rx_drops = u64_stats_read(&stats->rx_drops); tx_packets = u64_stats_read(&stats->tx_packets); tx_bytes = u64_stats_read(&stats->tx_bytes); tx_drops = u64_stats_read(&stats->tx_drops); } while (u64_stats_fetch_retry(&stats->syncp, start)); s->rx_packets += rx_packets; s->rx_bytes += rx_bytes; s->rx_dropped += rx_drops; s->tx_packets += tx_packets; s->tx_bytes += tx_bytes; s->tx_dropped += tx_drops; } } /* ndo_get_stats64 implementation for dtstats-based accounting. * * Populate @s from dev->stats and dev->dstats. This is used internally by the * core for NETDEV_PCPU_STAT_DSTAT-type stats collection. */ static void dev_get_dstats64(const struct net_device *dev, struct rtnl_link_stats64 *s) { netdev_stats_to_stats64(s, &dev->stats); dev_fetch_dstats(s, dev->dstats); } /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has * all the same fields in the same order as net_device_stats, with only * the type differing, but rtnl_link_stats64 may have additional fields * at the end for newer counters. 
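 *
 * Illustrative sketch (hypothetical driver, not part of this file): a driver
 * that only maintains the legacy dev->stats counters can expose 64-bit stats
 * with a trivial ndo_get_stats64, equivalent to the dev_get_stats() fallback:
 *
 *	static void foo_get_stats64(struct net_device *dev,
 *				    struct rtnl_link_stats64 *s)
 *	{
 *		netdev_stats_to_stats64(s, &dev->stats);
 *	}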
*/ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, const struct net_device_stats *netdev_stats) { size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t); const atomic_long_t *src = (atomic_long_t *)netdev_stats; u64 *dst = (u64 *)stats64; BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); for (i = 0; i < n; i++) dst[i] = (unsigned long)atomic_long_read(&src[i]); /* zero out counters that only exist in rtnl_link_stats64 */ memset((char *)stats64 + n * sizeof(u64), 0, sizeof(*stats64) - n * sizeof(u64)); } EXPORT_SYMBOL(netdev_stats_to_stats64); static __cold struct net_device_core_stats __percpu *netdev_core_stats_alloc( struct net_device *dev) { struct net_device_core_stats __percpu *p; p = alloc_percpu_gfp(struct net_device_core_stats, GFP_ATOMIC | __GFP_NOWARN); if (p && cmpxchg(&dev->core_stats, NULL, p)) free_percpu(p); /* This READ_ONCE() pairs with the cmpxchg() above */ return READ_ONCE(dev->core_stats); } noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset) { /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats); unsigned long __percpu *field; if (unlikely(!p)) { p = netdev_core_stats_alloc(dev); if (!p) return; } field = (unsigned long __percpu *)((void __percpu *)p + offset); this_cpu_inc(*field); } EXPORT_SYMBOL_GPL(netdev_core_stats_inc); /** * dev_get_stats - get network device statistics * @dev: device to get statistics from * @storage: place to store stats * * Get network statistics from device. Return @storage. * The device driver may provide its own method by setting * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; * otherwise the internal statistics structure is used. */ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, struct rtnl_link_stats64 *storage) { const struct net_device_ops *ops = dev->netdev_ops; const struct net_device_core_stats __percpu *p; /* * IPv{4,6} and udp tunnels share common stat helpers and use * different stat type (NETDEV_PCPU_STAT_TSTATS vs * NETDEV_PCPU_STAT_DSTATS). Ensure the accounting is consistent. 
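 *
 * Illustrative note (sketch only): which fallback below is used depends on
 * the stat type a driver selected at setup time, e.g. a hypothetical tunnel
 * driver opting into core-managed per-CPU counters with
 *
 *	dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
 *
 * and not implementing ndo_get_stats64 at all.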
*/ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_bytes) != offsetof(struct pcpu_dstats, rx_bytes)); BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_packets) != offsetof(struct pcpu_dstats, rx_packets)); BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_bytes) != offsetof(struct pcpu_dstats, tx_bytes)); BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_packets) != offsetof(struct pcpu_dstats, tx_packets)); if (ops->ndo_get_stats64) { memset(storage, 0, sizeof(*storage)); ops->ndo_get_stats64(dev, storage); } else if (ops->ndo_get_stats) { netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) { dev_get_tstats64(dev, storage); } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS) { dev_get_dstats64(dev, storage); } else { netdev_stats_to_stats64(storage, &dev->stats); } /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ p = READ_ONCE(dev->core_stats); if (p) { const struct net_device_core_stats *core_stats; int i; for_each_possible_cpu(i) { core_stats = per_cpu_ptr(p, i); storage->rx_dropped += READ_ONCE(core_stats->rx_dropped); storage->tx_dropped += READ_ONCE(core_stats->tx_dropped); storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler); storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped); } } return storage; } EXPORT_SYMBOL(dev_get_stats); /** * dev_fetch_sw_netstats - get per-cpu network device statistics * @s: place to store stats * @netstats: per-cpu network stats to read from * * Read per-cpu network statistics and populate the related fields in @s. */ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, const struct pcpu_sw_netstats __percpu *netstats) { int cpu; for_each_possible_cpu(cpu) { u64 rx_packets, rx_bytes, tx_packets, tx_bytes; const struct pcpu_sw_netstats *stats; unsigned int start; stats = per_cpu_ptr(netstats, cpu); do { start = u64_stats_fetch_begin(&stats->syncp); rx_packets = u64_stats_read(&stats->rx_packets); rx_bytes = u64_stats_read(&stats->rx_bytes); tx_packets = u64_stats_read(&stats->tx_packets); tx_bytes = u64_stats_read(&stats->tx_bytes); } while (u64_stats_fetch_retry(&stats->syncp, start)); s->rx_packets += rx_packets; s->rx_bytes += rx_bytes; s->tx_packets += tx_packets; s->tx_bytes += tx_bytes; } } EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats); /** * dev_get_tstats64 - ndo_get_stats64 implementation * @dev: device to get statistics from * @s: place to store stats * * Populate @s from dev->stats and dev->tstats. Can be used as * ndo_get_stats64() callback. 
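 *
 * For instance (illustrative sketch; "foo" is hypothetical), a driver using
 * NETDEV_PCPU_STAT_TSTATS accounting can simply plug this in:
 *
 *	static const struct net_device_ops foo_netdev_ops = {
 *		.ndo_get_stats64	= dev_get_tstats64,
 *	};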
*/ void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s) { netdev_stats_to_stats64(s, &dev->stats); dev_fetch_sw_netstats(s, dev->tstats); } EXPORT_SYMBOL_GPL(dev_get_tstats64); struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) { struct netdev_queue *queue = dev_ingress_queue(dev); #ifdef CONFIG_NET_CLS_ACT if (queue) return queue; queue = kzalloc_obj(*queue); if (!queue) return NULL; netdev_init_one_queue(dev, queue, NULL); RCU_INIT_POINTER(queue->qdisc, &noop_qdisc); RCU_INIT_POINTER(queue->qdisc_sleeping, &noop_qdisc); rcu_assign_pointer(dev->ingress_queue, queue); #endif return queue; } static const struct ethtool_ops default_ethtool_ops; void netdev_set_default_ethtool_ops(struct net_device *dev, const struct ethtool_ops *ops) { if (dev->ethtool_ops == &default_ethtool_ops) dev->ethtool_ops = ops; } EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); /** * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default * @dev: netdev to enable the IRQ coalescing on * * Sets a conservative default for SW IRQ coalescing. Users can use * sysfs attributes to override the default values. */ void netdev_sw_irq_coalesce_default_on(struct net_device *dev) { WARN_ON(dev->reg_state == NETREG_REGISTERED); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { netdev_set_gro_flush_timeout(dev, 20000); netdev_set_defer_hard_irqs(dev, 1); } } EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on); /** * alloc_netdev_mqs - allocate network device * @sizeof_priv: size of private data to allocate space for * @name: device name format string * @name_assign_type: origin of device name * @setup: callback to initialize device * @txqs: the number of TX subqueues to allocate * @rxqs: the number of RX subqueues to allocate * * Allocates a struct net_device with private data area for driver use * and performs basic initialization. Also allocates subqueue structs * for each queue on the device. 
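 *
 * Illustrative call (sketch only; the "foo" names are hypothetical):
 *
 *	dev = alloc_netdev_mqs(sizeof(struct foo_priv), "foo%d", NET_NAME_ENUM,
 *			       ether_setup, 8, 8);
 *	if (!dev)
 *		return -ENOMEM;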
*/ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *), unsigned int txqs, unsigned int rxqs) { struct net_device *dev; size_t napi_config_sz; unsigned int maxqs; BUG_ON(strlen(name) >= sizeof(dev->name)); if (txqs < 1) { pr_err("alloc_netdev: Unable to allocate device with zero queues\n"); return NULL; } if (rxqs < 1) { pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); return NULL; } maxqs = max(txqs, rxqs); dev = kvzalloc_flex(*dev, priv, sizeof_priv, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!dev) return NULL; dev->priv_len = sizeof_priv; ref_tracker_dir_init(&dev->refcnt_tracker, 128, "netdev"); #ifdef CONFIG_PCPU_DEV_REFCNT dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) goto free_dev; __dev_hold(dev); #else refcount_set(&dev->dev_refcnt, 1); #endif if (dev_addr_init(dev)) goto free_pcpu; dev_mc_init(dev); dev_uc_init(dev); dev_net_set(dev, &init_net); dev->gso_max_size = GSO_LEGACY_MAX_SIZE; dev->xdp_zc_max_segs = 1; dev->gso_max_segs = GSO_MAX_SEGS; dev->gro_max_size = GRO_LEGACY_MAX_SIZE; dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE; dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE; dev->tso_max_size = TSO_LEGACY_MAX_SIZE; dev->tso_max_segs = TSO_MAX_SEGS; dev->upper_level = 1; dev->lower_level = 1; #ifdef CONFIG_LOCKDEP dev->nested_level = 0; INIT_LIST_HEAD(&dev->unlink_list); #endif INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); INIT_LIST_HEAD(&dev->close_list); INIT_LIST_HEAD(&dev->link_watch_list); INIT_LIST_HEAD(&dev->adj_list.upper); INIT_LIST_HEAD(&dev->adj_list.lower); INIT_LIST_HEAD(&dev->ptype_all); INIT_LIST_HEAD(&dev->ptype_specific); INIT_LIST_HEAD(&dev->net_notifier_list); #ifdef CONFIG_NET_SCHED hash_init(dev->qdisc_hash); #endif mutex_init(&dev->lock); dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); if (!dev->tx_queue_len) { dev->priv_flags |= IFF_NO_QUEUE; dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; } dev->num_tx_queues = txqs; dev->real_num_tx_queues = txqs; if (netif_alloc_netdev_queues(dev)) goto free_all; dev->num_rx_queues = rxqs; dev->real_num_rx_queues = rxqs; if (netif_alloc_rx_queues(dev)) goto free_all; dev->ethtool = kzalloc_obj(*dev->ethtool, GFP_KERNEL_ACCOUNT); if (!dev->ethtool) goto free_all; dev->cfg = kzalloc_obj(*dev->cfg, GFP_KERNEL_ACCOUNT); if (!dev->cfg) goto free_all; dev->cfg_pending = dev->cfg; dev->num_napi_configs = maxqs; napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config)); dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT); if (!dev->napi_config) goto free_all; strscpy(dev->name, name); dev->name_assign_type = name_assign_type; dev->group = INIT_NETDEV_GROUP; if (!dev->ethtool_ops) dev->ethtool_ops = &default_ethtool_ops; nf_hook_netdev_init(dev); return dev; free_all: free_netdev(dev); return NULL; free_pcpu: #ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); free_dev: #endif kvfree(dev); return NULL; } EXPORT_SYMBOL(alloc_netdev_mqs); static void netdev_napi_exit(struct net_device *dev) { if (!list_empty(&dev->napi_list)) { struct napi_struct *p, *n; netdev_lock(dev); list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) __netif_napi_del_locked(p); netdev_unlock(dev); synchronize_net(); } kvfree(dev->napi_config); } /** * free_netdev - free network device * @dev: device * * This function does the last stage of destroying an allocated device * interface. The reference to the device object is released. 
If this * is the last reference then it will be freed.Must be called in process * context. */ void free_netdev(struct net_device *dev) { might_sleep(); /* When called immediately after register_netdevice() failed the unwind * handling may still be dismantling the device. Handle that case by * deferring the free. */ if (dev->reg_state == NETREG_UNREGISTERING) { ASSERT_RTNL(); dev->needs_free_netdev = true; return; } WARN_ON(dev->cfg != dev->cfg_pending); kfree(dev->cfg); kfree(dev->ethtool); netif_free_tx_queues(dev); netif_free_rx_queues(dev); kfree(rcu_dereference_protected(dev->ingress_queue, 1)); /* Flush device addresses */ dev_addr_flush(dev); netdev_napi_exit(dev); netif_del_cpu_rmap(dev); ref_tracker_dir_exit(&dev->refcnt_tracker); #ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; #endif free_percpu(dev->core_stats); dev->core_stats = NULL; free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; netdev_free_phy_link_topology(dev); mutex_destroy(&dev->lock); /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED || dev->reg_state == NETREG_DUMMY) { kvfree(dev); return; } BUG_ON(dev->reg_state != NETREG_UNREGISTERED); WRITE_ONCE(dev->reg_state, NETREG_RELEASED); /* will free via device release */ put_device(&dev->dev); } EXPORT_SYMBOL(free_netdev); /** * alloc_netdev_dummy - Allocate and initialize a dummy net device. * @sizeof_priv: size of private data to allocate space for * * Return: the allocated net_device on success, NULL otherwise */ struct net_device *alloc_netdev_dummy(int sizeof_priv) { return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN, init_dummy_netdev); } EXPORT_SYMBOL_GPL(alloc_netdev_dummy); /** * synchronize_net - Synchronize with packet receive processing * * Wait for packets currently being received to be done. * Does not block later packets from starting. */ void synchronize_net(void) { might_sleep(); if (from_cleanup_net() || rtnl_is_locked()) synchronize_rcu_expedited(); else synchronize_rcu(); } EXPORT_SYMBOL(synchronize_net); static void netdev_rss_contexts_free(struct net_device *dev) { struct ethtool_rxfh_context *ctx; unsigned long context; mutex_lock(&dev->ethtool->rss_lock); xa_for_each(&dev->ethtool->rss_ctx, context, ctx) { xa_erase(&dev->ethtool->rss_ctx, context); dev->ethtool_ops->remove_rxfh_context(dev, ctx, context, NULL); kfree(ctx); } xa_destroy(&dev->ethtool->rss_ctx); mutex_unlock(&dev->ethtool->rss_lock); } /** * unregister_netdevice_queue - remove device from the kernel * @dev: device * @head: list * * This function shuts down a device interface and removes it * from the kernel tables. * If head not NULL, device is queued to be unregistered later. * * Callers must hold the rtnl semaphore. You may want * unregister_netdev() instead of this. */ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) { ASSERT_RTNL(); if (head) { list_move_tail(&dev->unreg_list, head); } else { LIST_HEAD(single); list_add(&dev->unreg_list, &single); unregister_netdevice_many(&single); } } EXPORT_SYMBOL(unregister_netdevice_queue); static void dev_memory_provider_uninstall(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->real_num_rx_queues; i++) { struct netdev_rx_queue *rxq = &dev->_rx[i]; __netif_mp_uninstall_rxq(rxq, &rxq->mp_params); } } /* devices must be UP and netdev_lock()'d */ static void netif_close_many_and_unlock(struct list_head *close_head) { struct net_device *dev, *tmp; netif_close_many(close_head, false); /* ... 
now unlock them */ list_for_each_entry_safe(dev, tmp, close_head, close_list) { netdev_unlock(dev); list_del_init(&dev->close_list); } } static void netif_close_many_and_unlock_cond(struct list_head *close_head) { #ifdef CONFIG_LOCKDEP /* We can only track up to MAX_LOCK_DEPTH locks per task. * * Reserve half the available slots for additional locks possibly * taken by notifiers and (soft)irqs. */ unsigned int limit = MAX_LOCK_DEPTH / 2; if (lockdep_depth(current) > limit) netif_close_many_and_unlock(close_head); #endif } bool unregister_netdevice_queued(const struct net_device *dev) { ASSERT_RTNL(); return !list_empty(&dev->unreg_list); } void unregister_netdevice_many_notify(struct list_head *head, u32 portid, const struct nlmsghdr *nlh) { struct net_device *dev, *tmp; LIST_HEAD(close_head); int cnt = 0; BUG_ON(dev_boot_phase); ASSERT_RTNL(); if (list_empty(head)) return; list_for_each_entry_safe(dev, tmp, head, unreg_list) { /* Some devices call without registering * for initialization unwind. Remove those * devices and proceed with the remaining. */ if (dev->reg_state == NETREG_UNINITIALIZED) { pr_debug("unregister_netdevice: device %s/%p never was registered\n", dev->name, dev); WARN_ON(1); list_del(&dev->unreg_list); continue; } dev->dismantle = true; BUG_ON(dev->reg_state != NETREG_REGISTERED); } /* If device is running, close it first. Start with ops locked... */ list_for_each_entry(dev, head, unreg_list) { if (!(dev->flags & IFF_UP)) continue; if (netdev_need_ops_lock(dev)) { list_add_tail(&dev->close_list, &close_head); netdev_lock(dev); } netif_close_many_and_unlock_cond(&close_head); } netif_close_many_and_unlock(&close_head); /* ... now go over the rest. */ list_for_each_entry(dev, head, unreg_list) { if (!netdev_need_ops_lock(dev)) list_add_tail(&dev->close_list, &close_head); } netif_close_many(&close_head, true); list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. */ unlist_netdevice(dev); netdev_lock(dev); WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING); netdev_unlock(dev); } flush_all_backlogs(); synchronize_net(); list_for_each_entry(dev, head, unreg_list) { struct sk_buff *skb = NULL; /* Shutdown queueing discipline. */ netdev_lock_ops(dev); dev_shutdown(dev); dev_tcx_uninstall(dev); dev_xdp_uninstall(dev); dev_memory_provider_uninstall(dev); netdev_unlock_ops(dev); bpf_dev_bound_netdev_unregister(dev); netdev_offload_xstats_disable_all(dev); /* Notify protocols, that we are about to destroy * this device. They should clean all the things. */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing)) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, GFP_KERNEL, NULL, 0, portid, nlh); /* * Flush the unicast and multicast chains */ dev_uc_flush(dev); dev_mc_flush(dev); netdev_name_node_alt_flush(dev); netdev_name_node_free(dev->name_node); netdev_rss_contexts_free(dev); call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); mutex_destroy(&dev->ethtool->rss_lock); net_shaper_flush_netdev(dev); if (skb) rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh); /* Notifier chain MUST detach us all upper devices. 
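 *
 * Illustrative sketch (hypothetical master driver, not part of this file):
 * an upper device's notifier usually reacts to NETDEV_UNREGISTER of one of
 * its lower devices by releasing it, which removes the adjacency:
 *
 *	case NETDEV_UNREGISTER:
 *		foo_release_lower(upper_dev, lower_dev);
 *		break;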
*/ WARN_ON(netdev_has_any_upper_dev(dev)); WARN_ON(netdev_has_any_lower_dev(dev)); /* Remove entries from kobject tree */ netdev_unregister_kobject(dev); #ifdef CONFIG_XPS /* Remove XPS queueing entries */ netif_reset_xps_queues_gt(dev, 0); #endif } synchronize_net(); list_for_each_entry(dev, head, unreg_list) { netdev_put(dev, &dev->dev_registered_tracker); net_set_todo(dev); cnt++; } atomic_add(cnt, &dev_unreg_count); list_del(head); } /** * unregister_netdevice_many - unregister many devices * @head: list of devices * * Note: As most callers use a stack allocated list_head, * we force a list_del() to make sure stack won't be corrupted later. */ void unregister_netdevice_many(struct list_head *head) { unregister_netdevice_many_notify(head, 0, NULL); } EXPORT_SYMBOL(unregister_netdevice_many); /** * unregister_netdev - remove device from the kernel * @dev: device * * This function shuts down a device interface and removes it * from the kernel tables. * * This is just a wrapper for unregister_netdevice that takes * the rtnl semaphore. In general you want to use this and not * unregister_netdevice. */ void unregister_netdev(struct net_device *dev) { rtnl_net_dev_lock(dev); unregister_netdevice(dev); rtnl_net_dev_unlock(dev); } EXPORT_SYMBOL(unregister_netdev); int __dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat, int new_ifindex, struct netlink_ext_ack *extack) { struct netdev_name_node *name_node; struct net *net_old = dev_net(dev); char new_name[IFNAMSIZ] = {}; int err, new_nsid; ASSERT_RTNL(); /* Don't allow namespace local devices to be moved. */ err = -EINVAL; if (dev->netns_immutable) { NL_SET_ERR_MSG(extack, "The interface netns is immutable"); goto out; } /* Ensure the device has been registered */ if (dev->reg_state != NETREG_REGISTERED) { NL_SET_ERR_MSG(extack, "The interface isn't registered"); goto out; } /* Get out if there is nothing todo */ err = 0; if (net_eq(net_old, net)) goto out; /* Pick the destination device name, and ensure * we can use it in the destination network namespace. */ err = -EEXIST; if (netdev_name_in_use(net, dev->name)) { /* We get here if we can't use the current device name */ if (!pat) { NL_SET_ERR_MSG(extack, "An interface with the same name exists in the target netns"); goto out; } err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST); if (err < 0) { NL_SET_ERR_MSG_FMT(extack, "Unable to use '%s' for the new interface name in the target netns", pat); goto out; } } /* Check that none of the altnames conflicts. */ err = -EEXIST; netdev_for_each_altname(dev, name_node) { if (netdev_name_in_use(net, name_node->name)) { NL_SET_ERR_MSG_FMT(extack, "An interface with the altname %s exists in the target netns", name_node->name); goto out; } } /* Check that new_ifindex isn't used yet. */ if (new_ifindex) { err = dev_index_reserve(net, new_ifindex); if (err < 0) { NL_SET_ERR_MSG_FMT(extack, "The ifindex %d is not available in the target netns", new_ifindex); goto out; } } else { /* If there is an ifindex conflict assign a new one */ err = dev_index_reserve(net, dev->ifindex); if (err == -EBUSY) err = dev_index_reserve(net, 0); if (err < 0) { NL_SET_ERR_MSG(extack, "Unable to allocate a new ifindex in the target netns"); goto out; } new_ifindex = err; } /* * And now a mini version of register_netdevice unregister_netdevice. */ netdev_lock_ops(dev); /* If device is running close it first. 
*/ netif_close(dev); /* And unlink it from device chain */ unlist_netdevice(dev); if (!netdev_need_ops_lock(dev)) netdev_lock(dev); dev->moving_ns = true; netdev_unlock(dev); synchronize_net(); /* Shutdown queueing discipline. */ netdev_lock_ops(dev); dev_shutdown(dev); netdev_unlock_ops(dev); /* Notify protocols, that we are about to destroy * this device. They should clean all the things. * * Note that dev->reg_state stays at NETREG_REGISTERED. * This is wanted because this way 8021q and macvlan know * the device is just moving and can keep their slaves up. */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL); rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, new_ifindex); /* * Flush the unicast and multicast chains */ dev_uc_flush(dev); dev_mc_flush(dev); /* Send a netdev-removed uevent to the old namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); netdev_adjacent_del_links(dev); /* Move per-net netdevice notifiers that are following the netdevice */ move_netdevice_notifiers_dev_net(dev, net); /* Actually switch the network namespace */ netdev_lock(dev); dev_net_set(dev, net); netdev_unlock(dev); dev->ifindex = new_ifindex; if (new_name[0]) { /* Rename the netdev to prepared name */ write_seqlock_bh(&netdev_rename_lock); strscpy(dev->name, new_name, IFNAMSIZ); write_sequnlock_bh(&netdev_rename_lock); } /* Fixup kobjects */ dev_set_uevent_suppress(&dev->dev, 1); err = device_rename(&dev->dev, dev->name); dev_set_uevent_suppress(&dev->dev, 0); WARN_ON(err); /* Send a netdev-add uevent to the new namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_ADD); netdev_adjacent_add_links(dev); /* Adapt owner in case owning user namespace of target network * namespace is different from the original one. */ err = netdev_change_owner(dev, net_old, net); WARN_ON(err); netdev_lock(dev); dev->moving_ns = false; if (!netdev_need_ops_lock(dev)) netdev_unlock(dev); /* Add the device back in the hashes */ list_netdevice(dev); /* Notify protocols, that a new device appeared. */ call_netdevice_notifiers(NETDEV_REGISTER, dev); netdev_unlock_ops(dev); /* * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. */ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); synchronize_net(); err = 0; out: return err; } static int dev_cpu_dead(unsigned int oldcpu) { struct sk_buff **list_skb; struct sk_buff *skb; unsigned int cpu; struct softnet_data *sd, *oldsd, *remsd = NULL; local_irq_disable(); cpu = smp_processor_id(); sd = &per_cpu(softnet_data, cpu); oldsd = &per_cpu(softnet_data, oldcpu); /* Find end of our completion_queue. */ list_skb = &sd->completion_queue; while (*list_skb) list_skb = &(*list_skb)->next; /* Append completion queue from offline CPU. */ *list_skb = oldsd->completion_queue; oldsd->completion_queue = NULL; /* Append output queue from offline CPU. */ if (oldsd->output_queue) { *sd->output_queue_tailp = oldsd->output_queue; sd->output_queue_tailp = oldsd->output_queue_tailp; oldsd->output_queue = NULL; oldsd->output_queue_tailp = &oldsd->output_queue; } /* Append NAPI poll list from offline CPU, with one exception : * process_backlog() must be called by cpu owning percpu backlog. * We properly handle process_queue & input_pkt_queue later. 
*/ while (!list_empty(&oldsd->poll_list)) { struct napi_struct *napi = list_first_entry(&oldsd->poll_list, struct napi_struct, poll_list); list_del_init(&napi->poll_list); if (napi->poll == process_backlog) napi->state &= NAPIF_STATE_THREADED; else ____napi_schedule(sd, napi); } raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_enable(); if (!use_backlog_threads()) { #ifdef CONFIG_RPS remsd = oldsd->rps_ipi_list; oldsd->rps_ipi_list = NULL; #endif /* send out pending IPI's on offline CPU */ net_rps_send_ipi(remsd); } /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) { netif_rx(skb); rps_input_queue_head_incr(oldsd); } while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { netif_rx(skb); rps_input_queue_head_incr(oldsd); } return 0; } /** * netdev_increment_features - increment feature set by one * @all: current feature set * @one: new feature set * @mask: mask feature set * * Computes a new feature set after adding a device with feature set * @one to the master device with current feature set @all. Will not * enable anything that is off in @mask. Returns the new feature set. */ netdev_features_t netdev_increment_features(netdev_features_t all, netdev_features_t one, netdev_features_t mask) { if (mask & NETIF_F_HW_CSUM) mask |= NETIF_F_CSUM_MASK; mask |= NETIF_F_VLAN_CHALLENGED; all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask; all &= one | ~NETIF_F_ALL_FOR_ALL; /* If one device supports hw checksumming, set for all. */ if (all & NETIF_F_HW_CSUM) all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM); return all; } EXPORT_SYMBOL(netdev_increment_features); /** * netdev_compute_master_upper_features - compute feature from lowers * @dev: the upper device * @update_header: whether to update upper device's header_len/headroom/tailroom * * Recompute the upper device's feature based on all lower devices. 
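 *
 * Illustrative use (sketch only): a bonding/team-like master would typically
 * refresh its advertised features after linking or unlinking a lower device:
 *
 *	netdev_compute_master_upper_features(master_dev, true);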
*/ void netdev_compute_master_upper_features(struct net_device *dev, bool update_header) { unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; netdev_features_t gso_partial_features = MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES; netdev_features_t xfrm_features = MASTER_UPPER_DEV_XFRM_FEATURES; netdev_features_t mpls_features = MASTER_UPPER_DEV_MPLS_FEATURES; netdev_features_t vlan_features = MASTER_UPPER_DEV_VLAN_FEATURES; netdev_features_t enc_features = MASTER_UPPER_DEV_ENC_FEATURES; unsigned short max_header_len = ETH_HLEN; unsigned int tso_max_size = TSO_MAX_SIZE; unsigned short max_headroom = 0; unsigned short max_tailroom = 0; u16 tso_max_segs = TSO_MAX_SEGS; struct net_device *lower_dev; struct list_head *iter; mpls_features = netdev_base_features(mpls_features); vlan_features = netdev_base_features(vlan_features); enc_features = netdev_base_features(enc_features); netdev_for_each_lower_dev(dev, lower_dev, iter) { gso_partial_features = netdev_increment_features(gso_partial_features, lower_dev->gso_partial_features, MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES); vlan_features = netdev_increment_features(vlan_features, lower_dev->vlan_features, MASTER_UPPER_DEV_VLAN_FEATURES); enc_features = netdev_increment_features(enc_features, lower_dev->hw_enc_features, MASTER_UPPER_DEV_ENC_FEATURES); if (IS_ENABLED(CONFIG_XFRM_OFFLOAD)) xfrm_features = netdev_increment_features(xfrm_features, lower_dev->hw_enc_features, MASTER_UPPER_DEV_XFRM_FEATURES); mpls_features = netdev_increment_features(mpls_features, lower_dev->mpls_features, MASTER_UPPER_DEV_MPLS_FEATURES); dst_release_flag &= lower_dev->priv_flags; if (update_header) { max_header_len = max(max_header_len, lower_dev->hard_header_len); max_headroom = max(max_headroom, lower_dev->needed_headroom); max_tailroom = max(max_tailroom, lower_dev->needed_tailroom); } tso_max_size = min(tso_max_size, lower_dev->tso_max_size); tso_max_segs = min(tso_max_segs, lower_dev->tso_max_segs); } dev->gso_partial_features = gso_partial_features; dev->vlan_features = vlan_features; dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; if (IS_ENABLED(CONFIG_XFRM_OFFLOAD)) dev->hw_enc_features |= xfrm_features; dev->mpls_features = mpls_features; dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; if ((dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) && dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM)) dev->priv_flags |= IFF_XMIT_DST_RELEASE; if (update_header) { dev->hard_header_len = max_header_len; dev->needed_headroom = max_headroom; dev->needed_tailroom = max_tailroom; } netif_set_tso_max_segs(dev, tso_max_segs); netif_set_tso_max_size(dev, tso_max_size); netdev_change_features(dev); } EXPORT_SYMBOL(netdev_compute_master_upper_features); static struct hlist_head * __net_init netdev_create_hash(void) { int i; struct hlist_head *hash; hash = kmalloc_objs(*hash, NETDEV_HASHENTRIES); if (hash != NULL) for (i = 0; i < NETDEV_HASHENTRIES; i++) INIT_HLIST_HEAD(&hash[i]); return hash; } /* Initialize per network namespace state */ static int __net_init netdev_init(struct net *net) { BUILD_BUG_ON(GRO_HASH_BUCKETS > BITS_PER_BYTE * sizeof_field(struct gro_node, bitmask)); INIT_LIST_HEAD(&net->dev_base_head); net->dev_name_head = netdev_create_hash(); if (net->dev_name_head == NULL) goto err_name; net->dev_index_head = netdev_create_hash(); if (net->dev_index_head == NULL) goto err_idx; xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1); RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain); 
return 0; err_idx: kfree(net->dev_name_head); err_name: return -ENOMEM; } /** * netdev_drivername - network driver for the device * @dev: network device * * Determine network driver for device. */ const char *netdev_drivername(const struct net_device *dev) { const struct device_driver *driver; const struct device *parent; const char *empty = ""; parent = dev->dev.parent; if (!parent) return empty; driver = parent->driver; if (driver && driver->name) return driver->name; return empty; } static void __netdev_printk(const char *level, const struct net_device *dev, struct va_format *vaf) { if (dev && dev->dev.parent) { dev_printk_emit(level[1] - '0', dev->dev.parent, "%s %s %s%s: %pV", dev_driver_string(dev->dev.parent), dev_name(dev->dev.parent), netdev_name(dev), netdev_reg_state(dev), vaf); } else if (dev) { printk("%s%s%s: %pV", level, netdev_name(dev), netdev_reg_state(dev), vaf); } else { printk("%s(NULL net_device): %pV", level, vaf); } } void netdev_printk(const char *level, const struct net_device *dev, const char *format, ...) { struct va_format vaf; va_list args; va_start(args, format); vaf.fmt = format; vaf.va = &args; __netdev_printk(level, dev, &vaf); va_end(args); } EXPORT_SYMBOL(netdev_printk); #define define_netdev_printk_level(func, level) \ void func(const struct net_device *dev, const char *fmt, ...) \ { \ struct va_format vaf; \ va_list args; \ \ va_start(args, fmt); \ \ vaf.fmt = fmt; \ vaf.va = &args; \ \ __netdev_printk(level, dev, &vaf); \ \ va_end(args); \ } \ EXPORT_SYMBOL(func); define_netdev_printk_level(netdev_emerg, KERN_EMERG); define_netdev_printk_level(netdev_alert, KERN_ALERT); define_netdev_printk_level(netdev_crit, KERN_CRIT); define_netdev_printk_level(netdev_err, KERN_ERR); define_netdev_printk_level(netdev_warn, KERN_WARNING); define_netdev_printk_level(netdev_notice, KERN_NOTICE); define_netdev_printk_level(netdev_info, KERN_INFO); static void __net_exit netdev_exit(struct net *net) { kfree(net->dev_name_head); kfree(net->dev_index_head); xa_destroy(&net->dev_by_index); if (net != &init_net) WARN_ON_ONCE(!list_empty(&net->dev_base_head)); } static struct pernet_operations __net_initdata netdev_net_ops = { .init = netdev_init, .exit = netdev_exit, }; static void __net_exit default_device_exit_net(struct net *net) { struct netdev_name_node *name_node, *tmp; struct net_device *dev, *aux; /* * Push all migratable network devices back to the * initial network namespace */ ASSERT_RTNL(); for_each_netdev_safe(net, dev, aux) { int err; char fb_name[IFNAMSIZ]; /* Ignore unmoveable devices (i.e. loopback) */ if (dev->netns_immutable) continue; /* Leave virtual devices for the generic cleanup */ if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund) continue; /* Push remaining network devices to init_net */ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); if (netdev_name_in_use(&init_net, fb_name)) snprintf(fb_name, IFNAMSIZ, "dev%%d"); netdev_for_each_altname_safe(dev, name_node, tmp) if (netdev_name_in_use(&init_net, name_node->name)) __netdev_name_node_alt_destroy(name_node); err = dev_change_net_namespace(dev, &init_net, fb_name); if (err) { pr_emerg("%s: failed to move %s to init_net: %d\n", __func__, dev->name, err); BUG(); } } } static void __net_exit default_device_exit_batch(struct list_head *net_list) { /* At exit all network devices most be removed from a network * namespace. Do this in the reverse order of registration. * Do this across as many network namespaces as possible to * improve batching efficiency. 
*/ struct net_device *dev; struct net *net; LIST_HEAD(dev_kill_list); rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { default_device_exit_net(net); cond_resched(); } list_for_each_entry(net, net_list, exit_list) { for_each_netdev_reverse(net, dev) { if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) dev->rtnl_link_ops->dellink(dev, &dev_kill_list); else unregister_netdevice_queue(dev, &dev_kill_list); } } unregister_netdevice_many(&dev_kill_list); rtnl_unlock(); } static struct pernet_operations __net_initdata default_device_ops = { .exit_batch = default_device_exit_batch, }; static void __init net_dev_struct_check(void) { /* TX read-mostly hotpath */ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags_fast); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq); #ifdef CONFIG_XPS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps); #endif #ifdef CONFIG_NETFILTER_EGRESS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress); #endif #ifdef CONFIG_NET_XGRESS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress); #endif CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160); /* TXRX read-mostly hotpath */ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, state); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr); CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46); /* RX read-mostly hotpath */ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net); 
#ifdef CONFIG_NETPOLL CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo); #endif #ifdef CONFIG_NET_XGRESS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress); #endif CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 92); } /* * Initialize the DEV module. At boot time this walks the device list and * unhooks any devices that fail to initialise (normally hardware not * present) and leaves us with a valid list of present and active devices. * */ /* We allocate 256 pages for each CPU if PAGE_SHIFT is 12 */ #define SYSTEM_PERCPU_PAGE_POOL_SIZE ((1 << 20) / PAGE_SIZE) static int net_page_pool_create(int cpuid) { #if IS_ENABLED(CONFIG_PAGE_POOL) struct page_pool_params page_pool_params = { .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE, .flags = PP_FLAG_SYSTEM_POOL, .nid = cpu_to_mem(cpuid), }; struct page_pool *pp_ptr; int err; pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid); if (IS_ERR(pp_ptr)) return -ENOMEM; err = xdp_reg_page_pool(pp_ptr); if (err) { page_pool_destroy(pp_ptr); return err; } per_cpu(system_page_pool.pool, cpuid) = pp_ptr; #endif return 0; } static int backlog_napi_should_run(unsigned int cpu) { struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); struct napi_struct *napi = &sd->backlog; return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state); } static void run_backlog_napi(unsigned int cpu) { struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); napi_threaded_poll_loop(&sd->backlog, NULL); } static void backlog_napi_setup(unsigned int cpu) { struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); struct napi_struct *napi = &sd->backlog; napi->thread = this_cpu_read(backlog_napi); set_bit(NAPI_STATE_THREADED, &napi->state); } static struct smp_hotplug_thread backlog_threads = { .store = &backlog_napi, .thread_should_run = backlog_napi_should_run, .thread_fn = run_backlog_napi, .thread_comm = "backlog_napi/%u", .setup = backlog_napi_setup, }; /* * This is called single threaded during boot, so no need * to take the rtnl semaphore. */ static int __init net_dev_init(void) { int i, rc = -ENOMEM; BUG_ON(!dev_boot_phase); net_dev_struct_check(); if (dev_proc_init()) goto out; if (netdev_kobject_init()) goto out; for (i = 0; i < PTYPE_HASH_SIZE; i++) INIT_LIST_HEAD(&ptype_base[i]); if (register_pernet_subsys(&netdev_net_ops)) goto out; /* * Initialise the packet receive queues. */ flush_backlogs_fallback = flush_backlogs_alloc(); if (!flush_backlogs_fallback) goto out; for_each_possible_cpu(i) { struct softnet_data *sd = &per_cpu(softnet_data, i); skb_queue_head_init(&sd->input_pkt_queue); skb_queue_head_init(&sd->process_queue); #ifdef CONFIG_XFRM_OFFLOAD skb_queue_head_init(&sd->xfrm_backlog); #endif INIT_LIST_HEAD(&sd->poll_list); sd->output_queue_tailp = &sd->output_queue; #ifdef CONFIG_RPS INIT_CSD(&sd->csd, rps_trigger_softirq, sd); sd->cpu = i; #endif INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); gro_init(&sd->backlog.gro); sd->backlog.poll = process_backlog; sd->backlog.weight = weight_p; INIT_LIST_HEAD(&sd->backlog.poll_list); if (net_page_pool_create(i)) goto out; } net_hotdata.skb_defer_nodes = __alloc_percpu(sizeof(struct skb_defer_node) * nr_node_ids, __alignof__(struct skb_defer_node)); if (!net_hotdata.skb_defer_nodes) goto out; if (use_backlog_threads()) smpboot_register_percpu_thread(&backlog_threads); dev_boot_phase = 0; /* The loopback device is special if any other network devices * is present in a network namespace the loopback device must * be present. 
Since we now dynamically allocate and free the * loopback device ensure this invariant is maintained by * keeping the loopback device as the first device on the * list of network devices. Ensuring the loopback devices * is the first device that appears and the last network device * that disappears. */ if (register_pernet_device(&loopback_net_ops)) goto out; if (register_pernet_device(&default_device_ops)) goto out; open_softirq(NET_TX_SOFTIRQ, net_tx_action); open_softirq(NET_RX_SOFTIRQ, net_rx_action); rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead", NULL, dev_cpu_dead); WARN_ON(rc < 0); rc = 0; /* avoid static key IPIs to isolated CPUs */ if (housekeeping_enabled(HK_TYPE_MISC)) net_enable_timestamp(); out: if (rc < 0) { for_each_possible_cpu(i) { struct page_pool *pp_ptr; pp_ptr = per_cpu(system_page_pool.pool, i); if (!pp_ptr) continue; xdp_unreg_page_pool(pp_ptr); page_pool_destroy(pp_ptr); per_cpu(system_page_pool.pool, i) = NULL; } } return rc; } subsys_initcall(net_dev_init);
/* SPDX-License-Identifier: GPL-2.0-only */ /* * property.h - Unified device property interface. * * Copyright (C) 2014, Intel Corporation * Authors: Rafael J.
Wysocki <rafael.j.wysocki@intel.com> * Mika Westerberg <mika.westerberg@linux.intel.com> */ #ifndef _LINUX_PROPERTY_H_ #define _LINUX_PROPERTY_H_ #include <linux/args.h> #include <linux/array_size.h> #include <linux/bits.h> #include <linux/cleanup.h> #include <linux/fwnode.h> #include <linux/stddef.h> #include <linux/types.h> #include <linux/util_macros.h> struct device; enum dev_prop_type { DEV_PROP_U8, DEV_PROP_U16, DEV_PROP_U32, DEV_PROP_U64, DEV_PROP_STRING, DEV_PROP_REF, }; const struct fwnode_handle *__dev_fwnode_const(const struct device *dev); struct fwnode_handle *__dev_fwnode(struct device *dev); #define dev_fwnode(dev) \ _Generic((dev), \ const struct device *: __dev_fwnode_const, \ struct device *: __dev_fwnode)(dev) bool device_property_present(const struct device *dev, const char *propname); bool device_property_read_bool(const struct device *dev, const char *propname); int device_property_read_u8_array(const struct device *dev, const char *propname, u8 *val, size_t nval); int device_property_read_u16_array(const struct device *dev, const char *propname, u16 *val, size_t nval); int device_property_read_u32_array(const struct device *dev, const char *propname, u32 *val, size_t nval); int device_property_read_u64_array(const struct device *dev, const char *propname, u64 *val, size_t nval); int device_property_read_string_array(const struct device *dev, const char *propname, const char **val, size_t nval); int device_property_read_string(const struct device *dev, const char *propname, const char **val); int device_property_match_string(const struct device *dev, const char *propname, const char *string); bool fwnode_property_present(const struct fwnode_handle *fwnode, const char *propname); bool fwnode_property_read_bool(const struct fwnode_handle *fwnode, const char *propname); int fwnode_property_read_u8_array(const struct fwnode_handle *fwnode, const char *propname, u8 *val, size_t nval); int fwnode_property_read_u16_array(const struct fwnode_handle *fwnode, const char *propname, u16 *val, size_t nval); int fwnode_property_read_u32_array(const struct fwnode_handle *fwnode, const char *propname, u32 *val, size_t nval); int fwnode_property_read_u64_array(const struct fwnode_handle *fwnode, const char *propname, u64 *val, size_t nval); int fwnode_property_read_string_array(const struct fwnode_handle *fwnode, const char *propname, const char **val, size_t nval); int fwnode_property_read_string(const struct fwnode_handle *fwnode, const char *propname, const char **val); int fwnode_property_match_string(const struct fwnode_handle *fwnode, const char *propname, const char *string); bool fwnode_device_is_available(const struct fwnode_handle *fwnode); static inline bool fwnode_device_is_big_endian(const struct fwnode_handle *fwnode) { if (fwnode_property_present(fwnode, "big-endian")) return true; if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) && fwnode_property_present(fwnode, "native-endian")) return true; return false; } static inline bool fwnode_device_is_compatible(const struct fwnode_handle *fwnode, const char *compat) { return fwnode_property_match_string(fwnode, "compatible", compat) >= 0; } /** * device_is_big_endian - check if a device has BE registers * @dev: Pointer to the struct device * * Returns: true if the device has a "big-endian" property, or if the kernel * was compiled for BE *and* the device has a "native-endian" property. * Returns false otherwise. * * Callers would nominally use ioread32be/iowrite32be if * device_is_big_endian() == true, or readl/writel otherwise. 
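 *
 * Illustrative sketch (hypothetical MMIO accessor, not part of this file):
 *
 *	static u32 foo_read_reg(struct device *dev, void __iomem *base,
 *				unsigned int off)
 *	{
 *		if (device_is_big_endian(dev))
 *			return ioread32be(base + off);
 *		return readl(base + off);
 *	}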
*/ static inline bool device_is_big_endian(const struct device *dev) { return fwnode_device_is_big_endian(dev_fwnode(dev)); } /** * device_is_compatible - match 'compatible' property of the device with a given string * @dev: Pointer to the struct device * @compat: The string to match 'compatible' property with * * Returns: true if matches, otherwise false. */ static inline bool device_is_compatible(const struct device *dev, const char *compat) { return fwnode_device_is_compatible(dev_fwnode(dev), compat); } int fwnode_property_match_property_string(const struct fwnode_handle *fwnode, const char *propname, const char * const *array, size_t n); static inline int device_property_match_property_string(const struct device *dev, const char *propname, const char * const *array, size_t n) { return fwnode_property_match_property_string(dev_fwnode(dev), propname, array, n); } int fwnode_property_get_reference_args(const struct fwnode_handle *fwnode, const char *prop, const char *nargs_prop, unsigned int nargs, unsigned int index, struct fwnode_reference_args *args); struct fwnode_handle *fwnode_find_reference(const struct fwnode_handle *fwnode, const char *name, unsigned int index); const char *fwnode_get_name(const struct fwnode_handle *fwnode); const char *fwnode_get_name_prefix(const struct fwnode_handle *fwnode); bool fwnode_name_eq(const struct fwnode_handle *fwnode, const char *name); struct fwnode_handle *fwnode_get_parent(const struct fwnode_handle *fwnode); struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode); #define fwnode_for_each_parent_node(fwnode, parent) \ for (parent = fwnode_get_parent(fwnode); parent; \ parent = fwnode_get_next_parent(parent)) unsigned int fwnode_count_parents(const struct fwnode_handle *fwn); struct fwnode_handle *fwnode_get_nth_parent(struct fwnode_handle *fwn, unsigned int depth); struct fwnode_handle *fwnode_get_next_child_node( const struct fwnode_handle *fwnode, struct fwnode_handle *child); struct fwnode_handle *fwnode_get_next_available_child_node( const struct fwnode_handle *fwnode, struct fwnode_handle *child); #define fwnode_for_each_child_node(fwnode, child) \ for (child = fwnode_get_next_child_node(fwnode, NULL); child; \ child = fwnode_get_next_child_node(fwnode, child)) #define fwnode_for_each_named_child_node(fwnode, child, name) \ fwnode_for_each_child_node(fwnode, child) \ for_each_if(fwnode_name_eq(child, name)) #define fwnode_for_each_available_child_node(fwnode, child) \ for (child = fwnode_get_next_available_child_node(fwnode, NULL); child;\ child = fwnode_get_next_available_child_node(fwnode, child)) #define fwnode_for_each_child_node_scoped(fwnode, child) \ for (struct fwnode_handle *child __free(fwnode_handle) = \ fwnode_get_next_child_node(fwnode, NULL); \ child; child = fwnode_get_next_child_node(fwnode, child)) #define fwnode_for_each_available_child_node_scoped(fwnode, child) \ for (struct fwnode_handle *child __free(fwnode_handle) = \ fwnode_get_next_available_child_node(fwnode, NULL); \ child; child = fwnode_get_next_available_child_node(fwnode, child)) struct fwnode_handle *device_get_next_child_node(const struct device *dev, struct fwnode_handle *child); #define device_for_each_child_node(dev, child) \ for (child = device_get_next_child_node(dev, NULL); child; \ child = device_get_next_child_node(dev, child)) #define device_for_each_named_child_node(dev, child, name) \ device_for_each_child_node(dev, child) \ for_each_if(fwnode_name_eq(child, name)) #define device_for_each_child_node_scoped(dev, child) \ for 
(struct fwnode_handle *child __free(fwnode_handle) = \ device_get_next_child_node(dev, NULL); \ child; child = device_get_next_child_node(dev, child)) #define device_for_each_named_child_node_scoped(dev, child, name) \ device_for_each_child_node_scoped(dev, child) \ for_each_if(fwnode_name_eq(child, name)) struct fwnode_handle *fwnode_get_named_child_node(const struct fwnode_handle *fwnode, const char *childname); struct fwnode_handle *device_get_named_child_node(const struct device *dev, const char *childname); struct fwnode_handle *fwnode_handle_get(struct fwnode_handle *fwnode); /** * fwnode_handle_put - Drop reference to a device node * @fwnode: Pointer to the device node to drop the reference to. * * This has to be used when terminating device_for_each_child_node() iteration * with break or return to prevent stale device node references from being left * behind. */ static inline void fwnode_handle_put(struct fwnode_handle *fwnode) { fwnode_call_void_op(fwnode, put); } DEFINE_FREE(fwnode_handle, struct fwnode_handle *, fwnode_handle_put(_T)) int fwnode_irq_get(const struct fwnode_handle *fwnode, unsigned int index); int fwnode_irq_get_byname(const struct fwnode_handle *fwnode, const char *name); unsigned int fwnode_get_child_node_count(const struct fwnode_handle *fwnode); static inline unsigned int device_get_child_node_count(const struct device *dev) { return fwnode_get_child_node_count(dev_fwnode(dev)); } unsigned int fwnode_get_named_child_node_count(const struct fwnode_handle *fwnode, const char *name); static inline unsigned int device_get_named_child_node_count(const struct device *dev, const char *name) { return fwnode_get_named_child_node_count(dev_fwnode(dev), name); } static inline int device_property_read_u8(const struct device *dev, const char *propname, u8 *val) { return device_property_read_u8_array(dev, propname, val, 1); } static inline int device_property_read_u16(const struct device *dev, const char *propname, u16 *val) { return device_property_read_u16_array(dev, propname, val, 1); } static inline int device_property_read_u32(const struct device *dev, const char *propname, u32 *val) { return device_property_read_u32_array(dev, propname, val, 1); } static inline int device_property_read_u64(const struct device *dev, const char *propname, u64 *val) { return device_property_read_u64_array(dev, propname, val, 1); } static inline int device_property_count_u8(const struct device *dev, const char *propname) { return device_property_read_u8_array(dev, propname, NULL, 0); } static inline int device_property_count_u16(const struct device *dev, const char *propname) { return device_property_read_u16_array(dev, propname, NULL, 0); } static inline int device_property_count_u32(const struct device *dev, const char *propname) { return device_property_read_u32_array(dev, propname, NULL, 0); } static inline int device_property_count_u64(const struct device *dev, const char *propname) { return device_property_read_u64_array(dev, propname, NULL, 0); } static inline int device_property_string_array_count(const struct device *dev, const char *propname) { return device_property_read_string_array(dev, propname, NULL, 0); } static inline int fwnode_property_read_u8(const struct fwnode_handle *fwnode, const char *propname, u8 *val) { return fwnode_property_read_u8_array(fwnode, propname, val, 1); } static inline int fwnode_property_read_u16(const struct fwnode_handle *fwnode, const char *propname, u16 *val) { return fwnode_property_read_u16_array(fwnode, propname, val, 1); } static inline 
int fwnode_property_read_u32(const struct fwnode_handle *fwnode, const char *propname, u32 *val) { return fwnode_property_read_u32_array(fwnode, propname, val, 1); } static inline int fwnode_property_read_u64(const struct fwnode_handle *fwnode, const char *propname, u64 *val) { return fwnode_property_read_u64_array(fwnode, propname, val, 1); } static inline int fwnode_property_count_u8(const struct fwnode_handle *fwnode, const char *propname) { return fwnode_property_read_u8_array(fwnode, propname, NULL, 0); } static inline int fwnode_property_count_u16(const struct fwnode_handle *fwnode, const char *propname) { return fwnode_property_read_u16_array(fwnode, propname, NULL, 0); } static inline int fwnode_property_count_u32(const struct fwnode_handle *fwnode, const char *propname) { return fwnode_property_read_u32_array(fwnode, propname, NULL, 0); } static inline int fwnode_property_count_u64(const struct fwnode_handle *fwnode, const char *propname) { return fwnode_property_read_u64_array(fwnode, propname, NULL, 0); } static inline int fwnode_property_string_array_count(const struct fwnode_handle *fwnode, const char *propname) { return fwnode_property_read_string_array(fwnode, propname, NULL, 0); } struct software_node; /** * struct software_node_ref_args - Reference property with additional arguments * @swnode: Reference to a software node * @fwnode: Alternative reference to a firmware node handle * @nargs: Number of elements in @args array * @args: Integer arguments */ struct software_node_ref_args { const struct software_node *swnode; struct fwnode_handle *fwnode; unsigned int nargs; u64 args[NR_FWNODE_REFERENCE_ARGS]; }; #define SOFTWARE_NODE_REFERENCE(_ref_, ...) \ (const struct software_node_ref_args) { \ .swnode = _Generic(_ref_, \ const struct software_node *: _ref_, \ struct software_node *: _ref_, \ default: NULL), \ .fwnode = _Generic(_ref_, \ struct fwnode_handle *: _ref_, \ default: NULL), \ .nargs = COUNT_ARGS(__VA_ARGS__), \ .args = { __VA_ARGS__ }, \ } /** * struct property_entry - "Built-in" device property representation. * @name: Name of the property. * @length: Length of data making up the value. * @is_inline: True when the property value is stored inline. * @type: Type of the data in unions. * @pointer: Pointer to the property when it is not stored inline. * @value: Value of the property when it is stored inline. */ struct property_entry { const char *name; size_t length; bool is_inline; enum dev_prop_type type; union { const void *pointer; union { u8 u8_data[sizeof(u64) / sizeof(u8)]; u16 u16_data[sizeof(u64) / sizeof(u16)]; u32 u32_data[sizeof(u64) / sizeof(u32)]; u64 u64_data[sizeof(u64) / sizeof(u64)]; const char *str[sizeof(u64) / sizeof(char *)]; } value; }; }; /* * Note: the below initializers for the anonymous union are carefully * crafted to avoid gcc-4.4.4's problems with initialization of anon unions * and structs. 
*/ #define __PROPERTY_ENTRY_ARRAY_LEN(_name_, _elem_, _Type_, _val_, _len_) \ (struct property_entry) { \ .name = _name_, \ .length = (_len_) * sizeof_field(struct property_entry, value._elem_[0]), \ .type = DEV_PROP_##_Type_, \ { .pointer = _val_ }, \ } #define PROPERTY_ENTRY_U8_ARRAY_LEN(_name_, _val_, _len_) \ __PROPERTY_ENTRY_ARRAY_LEN(_name_, u8_data, U8, _val_, _len_) #define PROPERTY_ENTRY_U16_ARRAY_LEN(_name_, _val_, _len_) \ __PROPERTY_ENTRY_ARRAY_LEN(_name_, u16_data, U16, _val_, _len_) #define PROPERTY_ENTRY_U32_ARRAY_LEN(_name_, _val_, _len_) \ __PROPERTY_ENTRY_ARRAY_LEN(_name_, u32_data, U32, _val_, _len_) #define PROPERTY_ENTRY_U64_ARRAY_LEN(_name_, _val_, _len_) \ __PROPERTY_ENTRY_ARRAY_LEN(_name_, u64_data, U64, _val_, _len_) #define PROPERTY_ENTRY_STRING_ARRAY_LEN(_name_, _val_, _len_) \ __PROPERTY_ENTRY_ARRAY_LEN(_name_, str, STRING, _val_, _len_) #define PROPERTY_ENTRY_REF_ARRAY_LEN(_name_, _val_, _len_) \ (struct property_entry) { \ .name = _name_, \ .length = (_len_) * sizeof(struct software_node_ref_args), \ .type = DEV_PROP_REF, \ { .pointer = _val_ }, \ } #define PROPERTY_ENTRY_U8_ARRAY(_name_, _val_) \ PROPERTY_ENTRY_U8_ARRAY_LEN(_name_, _val_, ARRAY_SIZE(_val_)) #define PROPERTY_ENTRY_U16_ARRAY(_name_, _val_) \ PROPERTY_ENTRY_U16_ARRAY_LEN(_name_, _val_, ARRAY_SIZE(_val_)) #define PROPERTY_ENTRY_U32_ARRAY(_name_, _val_) \ PROPERTY_ENTRY_U32_ARRAY_LEN(_name_, _val_, ARRAY_SIZE(_val_)) #define PROPERTY_ENTRY_U64_ARRAY(_name_, _val_) \ PROPERTY_ENTRY_U64_ARRAY_LEN(_name_, _val_, ARRAY_SIZE(_val_)) #define PROPERTY_ENTRY_STRING_ARRAY(_name_, _val_) \ PROPERTY_ENTRY_STRING_ARRAY_LEN(_name_, _val_, ARRAY_SIZE(_val_)) #define PROPERTY_ENTRY_REF_ARRAY(_name_, _val_) \ PROPERTY_ENTRY_REF_ARRAY_LEN(_name_, _val_, ARRAY_SIZE(_val_)) #define __PROPERTY_ENTRY_ELEMENT(_name_, _elem_, _Type_, _val_) \ (struct property_entry) { \ .name = _name_, \ .length = sizeof_field(struct property_entry, value._elem_[0]), \ .is_inline = true, \ .type = DEV_PROP_##_Type_, \ { .value = { ._elem_[0] = _val_ } }, \ } #define PROPERTY_ENTRY_U8(_name_, _val_) \ __PROPERTY_ENTRY_ELEMENT(_name_, u8_data, U8, _val_) #define PROPERTY_ENTRY_U16(_name_, _val_) \ __PROPERTY_ENTRY_ELEMENT(_name_, u16_data, U16, _val_) #define PROPERTY_ENTRY_U32(_name_, _val_) \ __PROPERTY_ENTRY_ELEMENT(_name_, u32_data, U32, _val_) #define PROPERTY_ENTRY_U64(_name_, _val_) \ __PROPERTY_ENTRY_ELEMENT(_name_, u64_data, U64, _val_) #define PROPERTY_ENTRY_STRING(_name_, _val_) \ __PROPERTY_ENTRY_ELEMENT(_name_, str, STRING, _val_) #define PROPERTY_ENTRY_REF(_name_, _ref_, ...) 
\ (struct property_entry) { \ .name = _name_, \ .length = sizeof(struct software_node_ref_args), \ .type = DEV_PROP_REF, \ { .pointer = &SOFTWARE_NODE_REFERENCE(_ref_, ##__VA_ARGS__), }, \ } #define PROPERTY_ENTRY_BOOL(_name_) \ (struct property_entry) { \ .name = _name_, \ .is_inline = true, \ } struct property_entry * property_entries_dup(const struct property_entry *properties); void property_entries_free(const struct property_entry *properties); bool device_dma_supported(const struct device *dev); enum dev_dma_attr device_get_dma_attr(const struct device *dev); const void *device_get_match_data(const struct device *dev); int device_get_phy_mode(struct device *dev); int fwnode_get_phy_mode(const struct fwnode_handle *fwnode); void __iomem *fwnode_iomap(struct fwnode_handle *fwnode, int index); struct fwnode_handle *fwnode_graph_get_next_endpoint( const struct fwnode_handle *fwnode, struct fwnode_handle *prev); struct fwnode_handle * fwnode_graph_get_port_parent(const struct fwnode_handle *fwnode); struct fwnode_handle *fwnode_graph_get_remote_port_parent( const struct fwnode_handle *fwnode); struct fwnode_handle *fwnode_graph_get_remote_port( const struct fwnode_handle *fwnode); struct fwnode_handle *fwnode_graph_get_remote_endpoint( const struct fwnode_handle *fwnode); static inline bool fwnode_graph_is_endpoint(const struct fwnode_handle *fwnode) { return fwnode_property_present(fwnode, "remote-endpoint"); } /* * Fwnode lookup flags * * @FWNODE_GRAPH_ENDPOINT_NEXT: In the case of no exact match, look for the * closest endpoint ID greater than the specified * one. * @FWNODE_GRAPH_DEVICE_DISABLED: That the device to which the remote * endpoint of the given endpoint belongs to, * may be disabled, or that the endpoint is not * connected. */ #define FWNODE_GRAPH_ENDPOINT_NEXT BIT(0) #define FWNODE_GRAPH_DEVICE_DISABLED BIT(1) struct fwnode_handle * fwnode_graph_get_endpoint_by_id(const struct fwnode_handle *fwnode, u32 port, u32 endpoint, unsigned long flags); unsigned int fwnode_graph_get_endpoint_count(const struct fwnode_handle *fwnode, unsigned long flags); #define fwnode_graph_for_each_endpoint(fwnode, child) \ for (child = fwnode_graph_get_next_endpoint(fwnode, NULL); child; \ child = fwnode_graph_get_next_endpoint(fwnode, child)) int fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode, struct fwnode_endpoint *endpoint); typedef void *(*devcon_match_fn_t)(const struct fwnode_handle *fwnode, const char *id, void *data); void *fwnode_connection_find_match(const struct fwnode_handle *fwnode, const char *con_id, void *data, devcon_match_fn_t match); static inline void *device_connection_find_match(const struct device *dev, const char *con_id, void *data, devcon_match_fn_t match) { return fwnode_connection_find_match(dev_fwnode(dev), con_id, data, match); } int fwnode_connection_find_matches(const struct fwnode_handle *fwnode, const char *con_id, void *data, devcon_match_fn_t match, void **matches, unsigned int matches_len); /* -------------------------------------------------------------------------- */ /* Software fwnode support - when HW description is incomplete or missing */ /** * struct software_node - Software node description * @name: Name of the software node * @parent: Parent of the software node * @properties: Array of device properties */ struct software_node { const char *name; const struct software_node *parent; const struct property_entry *properties; }; #define SOFTWARE_NODE(_name_, _properties_, _parent_) \ (struct software_node) { \ .name = _name_, \ 
.properties = _properties_, \ .parent = _parent_, \ } bool is_software_node(const struct fwnode_handle *fwnode); const struct software_node * to_software_node(const struct fwnode_handle *fwnode); struct fwnode_handle *software_node_fwnode(const struct software_node *node); const struct software_node * software_node_find_by_name(const struct software_node *parent, const char *name); int software_node_register_node_group(const struct software_node * const *node_group); void software_node_unregister_node_group(const struct software_node * const *node_group); int software_node_register(const struct software_node *node); void software_node_unregister(const struct software_node *node); struct fwnode_handle * fwnode_create_software_node(const struct property_entry *properties, const struct fwnode_handle *parent); void fwnode_remove_software_node(struct fwnode_handle *fwnode); int device_add_software_node(struct device *dev, const struct software_node *node); void device_remove_software_node(struct device *dev); int device_create_managed_software_node(struct device *dev, const struct property_entry *properties, const struct software_node *parent); #endif /* _LINUX_PROPERTY_H_ */
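The header above only declares the interface; the sketch below shows how it is typically consumed and supplied. It is illustrative only: the probe functions and the property strings ("clock-frequency", "label", "status-gpios") are assumptions made for the example, not anything defined in property.h.

/*
 * Illustrative consumer: read a couple of properties and walk the
 * child firmware nodes.  Names not declared in property.h are made up.
 */
#include <linux/device.h>
#include <linux/property.h>

static int example_probe(struct device *dev)
{
	const char *label;
	u32 freq;
	int ret;

	/* Single-value helpers wrap the _array variants with nval == 1. */
	ret = device_property_read_u32(dev, "clock-frequency", &freq);
	if (ret)
		return ret;

	if (device_property_read_string(dev, "label", &label))
		label = "unnamed";

	dev_info(dev, "%s: clock at %u Hz\n", label, freq);

	/* The scoped iterator drops each child reference automatically. */
	device_for_each_child_node_scoped(dev, child) {
		if (fwnode_property_present(child, "status-gpios"))
			dev_info(dev, "child %s has a status GPIO\n",
				 fwnode_get_name(child));
	}

	return 0;
}

/* Illustrative supplier: attach built-in properties as a software node. */
static const struct property_entry example_props[] = {
	PROPERTY_ENTRY_U32("clock-frequency", 100000),
	PROPERTY_ENTRY_STRING("label", "demo"),
	{ }
};

static int example_attach_props(struct device *dev)
{
	/* The driver core releases the managed node with the device. */
	return device_create_managed_software_node(dev, example_props, NULL);
}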
// SPDX-License-Identifier: GPL-2.0-or-later /* * RAW sockets for IPv6 * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Adapted from linux/net/ipv4/raw.c * * Fixes: * Hideaki YOSHIFUJI : sin6_scope_id support * YOSHIFUJI,H.@USAGI : raw checksum (RFC2292(bis) compliance) * Kazunori MIYAZAWA @USAGI: change process style to use ip6_append_data */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/slab.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/icmpv6.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <linux/skbuff.h> #include <linux/compat.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/protocol.h> #include <net/ip6_route.h> #include <net/ip6_checksum.h> #include <net/addrconf.h> #include <net/transp_v6.h> #include <net/udp.h> #include <net/inet_common.h> #include <net/tcp_states.h> #if IS_ENABLED(CONFIG_IPV6_MIP6) #include <net/mip6.h> #endif #include <linux/mroute6.h> #include <net/raw.h> #include <net/rawv6.h> #include <net/xfrm.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/export.h> #define ICMPV6_HDRLEN 4 /* ICMPv6 header, RFC 4443 Section 2.1 */ struct raw_hashinfo raw_v6_hashinfo; EXPORT_SYMBOL_GPL(raw_v6_hashinfo); bool raw_v6_match(struct net *net, const struct sock *sk, unsigned short num, const struct
in6_addr *loc_addr, const struct in6_addr *rmt_addr, int dif, int sdif) { if (inet_sk(sk)->inet_num != num || !net_eq(sock_net(sk), net) || (!ipv6_addr_any(&sk->sk_v6_daddr) && !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) || !raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return false; if (ipv6_addr_any(&sk->sk_v6_rcv_saddr) || ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr) || (ipv6_addr_is_multicast(loc_addr) && inet6_mc_check(sk, loc_addr, rmt_addr))) return true; return false; } EXPORT_SYMBOL_GPL(raw_v6_match); /* * 0 - deliver * 1 - block */ static int icmpv6_filter(const struct sock *sk, struct sk_buff *skb) { const struct icmp6hdr *hdr; const __u32 *data; unsigned int type; /* We require only the four bytes of the ICMPv6 header, not any * additional bytes of message body in "struct icmp6hdr". */ if (!pskb_may_pull(skb, ICMPV6_HDRLEN)) return 1; hdr = (struct icmp6hdr *)skb->data; type = hdr->icmp6_type; data = &raw6_sk(sk)->filter.data[0]; return (data[type >> 5] & (1U << (type & 31))) != 0; } #if IS_ENABLED(CONFIG_IPV6_MIP6) typedef int mh_filter_t(struct sock *sock, struct sk_buff *skb); static mh_filter_t __rcu *mh_filter __read_mostly; int rawv6_mh_filter_register(mh_filter_t filter) { rcu_assign_pointer(mh_filter, filter); return 0; } EXPORT_SYMBOL(rawv6_mh_filter_register); int rawv6_mh_filter_unregister(mh_filter_t filter) { RCU_INIT_POINTER(mh_filter, NULL); synchronize_rcu(); return 0; } EXPORT_SYMBOL(rawv6_mh_filter_unregister); #endif /* * demultiplex raw sockets. * (should consider queueing the skb in the sock receive_queue * without calling rawv6.c) * * Caller owns SKB so we must make clones. */ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) { struct net *net = dev_net(skb->dev); const struct ipv6hdr *ip6h; struct hlist_head *hlist; bool delivered = false; struct sock *sk; __u8 hash; ip6h = ipv6_hdr(skb); hash = raw_hashfunc(net, nexthdr); hlist = &raw_v6_hashinfo.ht[hash]; rcu_read_lock(); sk_for_each_rcu(sk, hlist) { int filtered; if (!raw_v6_match(net, sk, nexthdr, &ip6h->daddr, &ip6h->saddr, inet6_iif(skb), inet6_sdif(skb))) continue; if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { sk_drops_inc(sk); continue; } delivered = true; switch (nexthdr) { case IPPROTO_ICMPV6: filtered = icmpv6_filter(sk, skb); ip6h = ipv6_hdr(skb); break; #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPPROTO_MH: { /* XXX: To validate MH only once for each packet, * this is placed here. It should be after checking * xfrm policy, however it doesn't. The checking xfrm * policy is placed in rawv6_rcv() because it is * required for each socket. */ mh_filter_t *filter; filter = rcu_dereference(mh_filter); filtered = filter ? (*filter)(sk, skb) : 0; break; } #endif default: filtered = 0; break; } if (filtered < 0) break; if (filtered == 0) { struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); /* Not releasing hash table! */ if (clone) rawv6_rcv(sk, clone); } } rcu_read_unlock(); return delivered; } bool raw6_local_deliver(struct sk_buff *skb, int nexthdr) { return ipv6_raw_deliver(skb, nexthdr); } /* This cleans up af_inet6 a bit. 
-DaveM */ static int rawv6_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr; __be32 v4addr = 0; int addr_type; int err; if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; if (addr->sin6_family != AF_INET6) return -EINVAL; addr_type = ipv6_addr_type(&addr->sin6_addr); /* Raw sockets are IPv6 only */ if (addr_type == IPV6_ADDR_MAPPED) return -EADDRNOTAVAIL; lock_sock(sk); err = -EINVAL; if (sk->sk_state != TCP_CLOSE) goto out; rcu_read_lock(); /* Check if the address belongs to the host. */ if (addr_type != IPV6_ADDR_ANY) { struct net_device *dev = NULL; if (__ipv6_addr_needs_scope_id(addr_type)) { if (addr_len >= sizeof(struct sockaddr_in6) && addr->sin6_scope_id) { /* Override any existing binding, if another * one is supplied by user. */ sk->sk_bound_dev_if = addr->sin6_scope_id; } /* Binding to link-local address requires an interface */ if (!sk->sk_bound_dev_if) goto out_unlock; } if (sk->sk_bound_dev_if) { err = -ENODEV; dev = dev_get_by_index_rcu(sock_net(sk), sk->sk_bound_dev_if); if (!dev) goto out_unlock; } /* ipv4 addr of the socket is invalid. Only the * unspecified and mapped address have a v4 equivalent. */ v4addr = LOOPBACK4_IPV6; if (!(addr_type & IPV6_ADDR_MULTICAST) && !ipv6_can_nonlocal_bind(sock_net(sk), inet)) { err = -EADDRNOTAVAIL; if (!ipv6_chk_addr(sock_net(sk), &addr->sin6_addr, dev, 0)) { goto out_unlock; } } } inet->inet_rcv_saddr = inet->inet_saddr = v4addr; sk->sk_v6_rcv_saddr = addr->sin6_addr; if (!(addr_type & IPV6_ADDR_MULTICAST)) np->saddr = addr->sin6_addr; err = 0; out_unlock: rcu_read_unlock(); out: release_sock(sk); return err; } static void rawv6_err(struct sock *sk, struct sk_buff *skb, u8 type, u8 code, int offset, __be32 info) { bool recverr = inet6_test_bit(RECVERR6, sk); struct ipv6_pinfo *np = inet6_sk(sk); int err; int harderr; /* Report error on raw socket, if: 1. User requested recverr. 2. Socket is connected (otherwise the error indication is useless without recverr and error is hard. 
*/ if (!recverr && sk->sk_state != TCP_ESTABLISHED) return; harderr = icmpv6_err_convert(type, code, &err); if (type == ICMPV6_PKT_TOOBIG) { ip6_sk_update_pmtu(skb, sk, info); harderr = (READ_ONCE(np->pmtudisc) == IPV6_PMTUDISC_DO); } if (type == NDISC_REDIRECT) { ip6_sk_redirect(skb, sk); return; } if (recverr) { u8 *payload = skb->data; if (!inet_test_bit(HDRINCL, sk)) payload += offset; ipv6_icmp_error(sk, skb, err, 0, ntohl(info), payload); } if (recverr || harderr) { sk->sk_err = err; sk_error_report(sk); } } void raw6_icmp_error(struct sk_buff *skb, int nexthdr, u8 type, u8 code, int inner_offset, __be32 info) { struct net *net = dev_net(skb->dev); struct hlist_head *hlist; struct sock *sk; int hash; hash = raw_hashfunc(net, nexthdr); hlist = &raw_v6_hashinfo.ht[hash]; rcu_read_lock(); sk_for_each_rcu(sk, hlist) { /* Note: ipv6_hdr(skb) != skb->data */ const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data; if (!raw_v6_match(net, sk, nexthdr, &ip6h->saddr, &ip6h->daddr, inet6_iif(skb), inet6_iif(skb))) continue; rawv6_err(sk, skb, type, code, inner_offset, info); } rcu_read_unlock(); } static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason reason; if ((raw6_sk(sk)->checksum || rcu_access_pointer(sk->sk_filter)) && skb_checksum_complete(skb)) { sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } /* Charge it to the socket. */ skb_dst_drop(skb); reason = sock_queue_rcv_skb_reason(sk, skb); if (reason) { sk_skb_reason_drop(sk, skb, reason); return NET_RX_DROP; } return 0; } /* * This is next to useless... * if we demultiplex in network layer we don't need the extra call * just to queue the skb... * maybe we could have the network decide upon a hint if it * should call raw_rcv for demultiplexing */ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) { struct inet_sock *inet = inet_sk(sk); struct raw6_sock *rp = raw6_sk(sk); if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } nf_reset_ct(skb); if (!rp->checksum) skb->ip_summed = CHECKSUM_UNNECESSARY; if (skb->ip_summed == CHECKSUM_COMPLETE) { skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); if (!csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len, inet->inet_num, skb->csum)) skb->ip_summed = CHECKSUM_UNNECESSARY; } if (!skb_csum_unnecessary(skb)) skb->csum = ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len, inet->inet_num, 0)); if (inet_test_bit(HDRINCL, sk)) { if (skb_checksum_complete(skb)) { sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } } rawv6_rcv_skb(sk, skb); return 0; } /* * This should be easy, if there is something there * we return it, otherwise we block. 
*/ static int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { struct ipv6_pinfo *np = inet6_sk(sk); DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); struct sk_buff *skb; size_t copied; int err; if (flags & MSG_OOB) return -EOPNOTSUPP; if (flags & MSG_ERRQUEUE) return ipv6_recv_error(sk, msg, len); if (np->rxopt.bits.rxpmtu && READ_ONCE(np->rxpmtu)) return ipv6_recv_rxpmtu(sk, msg, len); skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; copied = skb->len; if (copied > len) { copied = len; msg->msg_flags |= MSG_TRUNC; } if (skb_csum_unnecessary(skb)) { err = skb_copy_datagram_msg(skb, 0, msg, copied); } else if (msg->msg_flags&MSG_TRUNC) { if (__skb_checksum_complete(skb)) goto csum_copy_err; err = skb_copy_datagram_msg(skb, 0, msg, copied); } else { err = skb_copy_and_csum_datagram_msg(skb, 0, msg); if (err == -EINVAL) goto csum_copy_err; } if (err) goto out_free; /* Copy the address. */ if (sin6) { sin6->sin6_family = AF_INET6; sin6->sin6_port = 0; sin6->sin6_addr = ipv6_hdr(skb)->saddr; sin6->sin6_flowinfo = 0; sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, inet6_iif(skb)); msg->msg_namelen = sizeof(*sin6); } sock_recv_cmsgs(msg, sk, skb); if (np->rxopt.all) ip6_datagram_recv_ctl(sk, msg, skb); err = copied; if (flags & MSG_TRUNC) err = skb->len; out_free: skb_free_datagram(sk, skb); out: return err; csum_copy_err: skb_kill_datagram(sk, skb, flags); /* Error for blocking case is chosen to masquerade as some normal condition. */ err = (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH; goto out; } static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, struct raw6_sock *rp) { struct ipv6_txoptions *opt; struct sk_buff *skb; int err = 0; int offset; int len; int total_len; __wsum tmp_csum; __sum16 csum; if (!rp->checksum) goto send; skb = skb_peek(&sk->sk_write_queue); if (!skb) goto out; offset = rp->offset; total_len = inet_sk(sk)->cork.base.length; opt = inet_sk(sk)->cork.base6.opt; total_len -= opt ? opt->opt_flen : 0; if (offset >= total_len - 1) { err = -EINVAL; ip6_flush_pending_frames(sk); goto out; } /* should be check HW csum miyazawa */ if (skb_queue_len(&sk->sk_write_queue) == 1) { /* * Only one fragment on the socket. 
*/ tmp_csum = skb->csum; } else { struct sk_buff *csum_skb = NULL; tmp_csum = 0; skb_queue_walk(&sk->sk_write_queue, skb) { tmp_csum = csum_add(tmp_csum, skb->csum); if (csum_skb) continue; len = skb->len - skb_transport_offset(skb); if (offset >= len) { offset -= len; continue; } csum_skb = skb; } skb = csum_skb; } offset += skb_transport_offset(skb); err = skb_copy_bits(skb, offset, &csum, 2); if (err < 0) { ip6_flush_pending_frames(sk); goto out; } /* in case cksum was not initialized */ if (unlikely(csum)) tmp_csum = csum_sub(tmp_csum, csum_unfold(csum)); csum = csum_ipv6_magic(&fl6->saddr, &fl6->daddr, total_len, fl6->flowi6_proto, tmp_csum); if (csum == 0 && fl6->flowi6_proto == IPPROTO_UDP) csum = CSUM_MANGLED_0; BUG_ON(skb_store_bits(skb, offset, &csum, 2)); send: err = ip6_push_pending_frames(sk); out: return err; } static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, struct flowi6 *fl6, struct dst_entry **dstp, unsigned int flags, const struct sockcm_cookie *sockc) { struct net *net = sock_net(sk); struct ipv6hdr *iph; struct sk_buff *skb; int err; struct rt6_info *rt = dst_rt6_info(*dstp); int hlen = LL_RESERVED_SPACE(rt->dst.dev); int tlen = rt->dst.dev->needed_tailroom; if (length > rt->dst.dev->mtu) { ipv6_local_error(sk, EMSGSIZE, fl6, rt->dst.dev->mtu); return -EMSGSIZE; } if (length < sizeof(struct ipv6hdr)) return -EINVAL; if (flags&MSG_PROBE) goto out; skb = sock_alloc_send_skb(sk, length + hlen + tlen + 15, flags & MSG_DONTWAIT, &err); if (!skb) goto error; skb_reserve(skb, hlen); skb->protocol = htons(ETH_P_IPV6); skb->priority = sockc->priority; skb->mark = sockc->mark; skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid); skb_put(skb, length); skb_reset_network_header(skb); iph = ipv6_hdr(skb); skb->ip_summed = CHECKSUM_NONE; skb_setup_tx_timestamp(skb, sockc); if (flags & MSG_CONFIRM) skb_set_dst_pending_confirm(skb, 1); skb->transport_header = skb->network_header; err = memcpy_from_msg(iph, msg, length); if (err) { err = -EFAULT; kfree_skb(skb); goto error; } skb_dst_set(skb, &rt->dst); *dstp = NULL; /* if egress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip6_out(sk, skb); if (unlikely(!skb)) return 0; /* Acquire rcu_read_lock() in case we need to use rt->rt6i_idev * in the error path. Since skb has been freed, the dst could * have been queued for deletion. 
*/ rcu_read_lock(); IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, rt->dst.dev, dst_output); if (err > 0) err = net_xmit_errno(err); if (err) { IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); rcu_read_unlock(); goto error_check; } rcu_read_unlock(); out: return 0; error: IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); error_check: if (err == -ENOBUFS && !inet6_test_bit(RECVERR6, sk)) err = 0; return err; } struct raw6_frag_vec { struct msghdr *msg; int hlen; char c[4]; }; static int rawv6_probe_proto_opt(struct raw6_frag_vec *rfv, struct flowi6 *fl6) { int err = 0; switch (fl6->flowi6_proto) { case IPPROTO_ICMPV6: rfv->hlen = 2; err = memcpy_from_msg(rfv->c, rfv->msg, rfv->hlen); if (!err) { fl6->fl6_icmp_type = rfv->c[0]; fl6->fl6_icmp_code = rfv->c[1]; } break; case IPPROTO_MH: rfv->hlen = 4; err = memcpy_from_msg(rfv->c, rfv->msg, rfv->hlen); if (!err) fl6->fl6_mh_type = rfv->c[2]; } return err; } static int raw6_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { struct raw6_frag_vec *rfv = from; if (offset < rfv->hlen) { int copy = min(rfv->hlen - offset, len); if (skb->ip_summed == CHECKSUM_PARTIAL) memcpy(to, rfv->c + offset, copy); else skb->csum = csum_block_add( skb->csum, csum_partial_copy_nocheck(rfv->c + offset, to, copy), odd); odd = 0; offset += copy; to += copy; len -= copy; if (!len) return 0; } offset -= rfv->hlen; return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb); } static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct ipv6_txoptions *opt_to_free = NULL; struct ipv6_txoptions opt_space; DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); struct in6_addr *daddr, *final_p, final; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct raw6_sock *rp = raw6_sk(sk); struct ipv6_txoptions *opt = NULL; struct ip6_flowlabel *flowlabel = NULL; struct dst_entry *dst = NULL; struct raw6_frag_vec rfv; struct flowi6 fl6; struct ipcm6_cookie ipc6; int addr_len = msg->msg_namelen; int hdrincl; u16 proto; int err; /* Rough check on arithmetic overflow, better check is made in ip6_append_data(). */ if (len > INT_MAX) return -EMSGSIZE; /* Mirror BSD error message compatibility */ if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; hdrincl = inet_test_bit(HDRINCL, sk); ipcm6_init_sk(&ipc6, sk); /* * Get and verify the address. */ memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_mark = ipc6.sockc.mark; fl6.flowi6_uid = sk_uid(sk); if (sin6) { if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; if (sin6->sin6_family && sin6->sin6_family != AF_INET6) return -EAFNOSUPPORT; /* port is the proto value [0..255] carried in nexthdr */ proto = ntohs(sin6->sin6_port); if (!proto) proto = inet->inet_num; else if (proto != inet->inet_num && inet->inet_num != IPPROTO_RAW) return -EINVAL; if (proto > 255) return -EINVAL; daddr = &sin6->sin6_addr; if (inet6_test_bit(SNDFLOW, sk)) { fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; } } /* * Otherwise it will be difficult to maintain * sk->sk_dst_cache. 
*/ if (sk->sk_state == TCP_ESTABLISHED && ipv6_addr_equal(daddr, &sk->sk_v6_daddr)) daddr = &sk->sk_v6_daddr; if (addr_len >= sizeof(struct sockaddr_in6) && sin6->sin6_scope_id && __ipv6_addr_needs_scope_id(__ipv6_addr_type(daddr))) fl6.flowi6_oif = sin6->sin6_scope_id; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; proto = inet->inet_num; daddr = &sk->sk_v6_daddr; fl6.flowlabel = np->flow_label; } if (fl6.flowi6_oif == 0) fl6.flowi6_oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(struct ipv6_txoptions); ipc6.opt = opt; err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6); if (err < 0) { fl6_sock_release(flowlabel); return err; } if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; } if (!(opt->opt_nflen|opt->opt_flen)) opt = NULL; } if (!opt) { opt = txopt_get(np); opt_to_free = opt; } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); fl6.flowi6_proto = proto; fl6.flowi6_mark = ipc6.sockc.mark; if (!hdrincl) { rfv.msg = msg; rfv.hlen = 0; err = rawv6_probe_proto_opt(&rfv, &fl6); if (err) goto out; } if (!ipv6_addr_any(daddr)) fl6.daddr = *daddr; else fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */ if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr)) fl6.saddr = np->saddr; final_p = fl6_update_dst(&fl6, opt, &final); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) fl6.flowi6_oif = READ_ONCE(np->mcast_oif); else if (!fl6.flowi6_oif) fl6.flowi6_oif = READ_ONCE(np->ucast_oif); security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); if (hdrincl) fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH; fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto out; } if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; back_from_confirm: if (hdrincl) err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst, msg->msg_flags, &ipc6.sockc); else { ipc6.opt = opt; lock_sock(sk); err = ip6_append_data(sk, raw6_getfrag, &rfv, len, 0, &ipc6, &fl6, dst_rt6_info(dst), msg->msg_flags); if (err) ip6_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) err = rawv6_push_pending_frames(sk, &fl6, rp); release_sock(sk); } done: dst_release(dst); out: fl6_sock_release(flowlabel); txopt_put(opt_to_free); return err < 0 ? 
err : len; do_confirm: if (msg->msg_flags & MSG_PROBE) dst_confirm_neigh(dst, &fl6.daddr); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; goto done; } static int rawv6_seticmpfilter(struct sock *sk, int optname, sockptr_t optval, int optlen) { switch (optname) { case ICMPV6_FILTER: if (optlen > sizeof(struct icmp6_filter)) optlen = sizeof(struct icmp6_filter); if (copy_from_sockptr(&raw6_sk(sk)->filter, optval, optlen)) return -EFAULT; return 0; default: return -ENOPROTOOPT; } return 0; } static int rawv6_geticmpfilter(struct sock *sk, int optname, char __user *optval, int __user *optlen) { int len; switch (optname) { case ICMPV6_FILTER: if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; if (len > sizeof(struct icmp6_filter)) len = sizeof(struct icmp6_filter); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &raw6_sk(sk)->filter, len)) return -EFAULT; return 0; default: return -ENOPROTOOPT; } return 0; } static int do_rawv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct raw6_sock *rp = raw6_sk(sk); int val; if (optlen < sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; switch (optname) { case IPV6_HDRINCL: if (sk->sk_type != SOCK_RAW) return -EINVAL; inet_assign_bit(HDRINCL, sk, val); return 0; case IPV6_CHECKSUM: if (inet_sk(sk)->inet_num == IPPROTO_ICMPV6 && level == IPPROTO_IPV6) { /* * RFC3542 tells that IPV6_CHECKSUM socket * option in the IPPROTO_IPV6 level is not * allowed on ICMPv6 sockets. * If you want to set it, use IPPROTO_RAW * level IPV6_CHECKSUM socket option * (Linux extension). */ return -EINVAL; } /* You may get strange result with a positive odd offset; RFC2292bis agrees with me. */ if (val > 0 && (val&1)) return -EINVAL; if (val < 0) { rp->checksum = 0; } else { rp->checksum = 1; rp->offset = val; } return 0; default: return -ENOPROTOOPT; } } static int rawv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { switch (level) { case SOL_RAW: break; case SOL_ICMPV6: if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; return rawv6_seticmpfilter(sk, optname, optval, optlen); case SOL_IPV6: if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) break; fallthrough; default: return ipv6_setsockopt(sk, level, optname, optval, optlen); } return do_rawv6_setsockopt(sk, level, optname, optval, optlen); } static int do_rawv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct raw6_sock *rp = raw6_sk(sk); int val, len; if (get_user(len, optlen)) return -EFAULT; switch (optname) { case IPV6_HDRINCL: val = inet_test_bit(HDRINCL, sk); break; case IPV6_CHECKSUM: /* * We allow getsockopt() for IPPROTO_IPV6-level * IPV6_CHECKSUM socket option on ICMPv6 sockets * since RFC3542 is silent about it. 
*/ if (rp->checksum == 0) val = -1; else val = rp->offset; break; default: return -ENOPROTOOPT; } len = min_t(unsigned int, sizeof(int), len); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static int rawv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { switch (level) { case SOL_RAW: break; case SOL_ICMPV6: if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; return rawv6_geticmpfilter(sk, optname, optval, optlen); case SOL_IPV6: if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) break; fallthrough; default: return ipv6_getsockopt(sk, level, optname, optval, optlen); } return do_rawv6_getsockopt(sk, level, optname, optval, optlen); } static int rawv6_ioctl(struct sock *sk, int cmd, int *karg) { switch (cmd) { case SIOCOUTQ: { *karg = sk_wmem_alloc_get(sk); return 0; } case SIOCINQ: { struct sk_buff *skb; spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) *karg = skb->len; else *karg = 0; spin_unlock_bh(&sk->sk_receive_queue.lock); return 0; } default: #ifdef CONFIG_IPV6_MROUTE return ip6mr_ioctl(sk, cmd, karg); #else return -ENOIOCTLCMD; #endif } } #ifdef CONFIG_COMPAT static int compat_rawv6_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg) { switch (cmd) { case SIOCOUTQ: case SIOCINQ: return -ENOIOCTLCMD; default: #ifdef CONFIG_IPV6_MROUTE return ip6mr_compat_ioctl(sk, cmd, compat_ptr(arg)); #else return -ENOIOCTLCMD; #endif } } #endif static void rawv6_close(struct sock *sk, long timeout) { if (inet_sk(sk)->inet_num == IPPROTO_RAW) ip6_ra_control(sk, -1); ip6mr_sk_done(sk); sk_common_release(sk); } static void raw6_destroy(struct sock *sk) { lock_sock(sk); ip6_flush_pending_frames(sk); release_sock(sk); } static int rawv6_init_sk(struct sock *sk) { struct raw6_sock *rp = raw6_sk(sk); sk->sk_drop_counters = &rp->drop_counters; switch (inet_sk(sk)->inet_num) { case IPPROTO_ICMPV6: rp->checksum = 1; rp->offset = 2; break; case IPPROTO_MH: rp->checksum = 1; rp->offset = 4; break; default: break; } return 0; } struct proto rawv6_prot = { .name = "RAWv6", .owner = THIS_MODULE, .close = rawv6_close, .destroy = raw6_destroy, .connect = ip6_datagram_connect_v6_only, .disconnect = __udp_disconnect, .ioctl = rawv6_ioctl, .init = rawv6_init_sk, .setsockopt = rawv6_setsockopt, .getsockopt = rawv6_getsockopt, .sendmsg = rawv6_sendmsg, .recvmsg = rawv6_recvmsg, .bind = rawv6_bind, .backlog_rcv = rawv6_rcv_skb, .hash = raw_hash_sk, .unhash = raw_unhash_sk, .obj_size = sizeof(struct raw6_sock), .ipv6_pinfo_offset = offsetof(struct raw6_sock, inet6), .useroffset = offsetof(struct raw6_sock, filter), .usersize = sizeof_field(struct raw6_sock, filter), .h.raw_hash = &raw_v6_hashinfo, #ifdef CONFIG_COMPAT .compat_ioctl = compat_rawv6_ioctl, #endif .diag_destroy = raw_abort, }; #ifdef CONFIG_PROC_FS static int raw6_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, IPV6_SEQ_DGRAM_HEADER); } else { struct sock *sp = v; __u16 srcp = inet_sk(sp)->inet_num; ip6_dgram_sock_seq_show(seq, v, srcp, 0, raw_seq_private(seq)->bucket); } return 0; } static const struct seq_operations raw6_seq_ops = { .start = raw_seq_start, .next = raw_seq_next, .stop = raw_seq_stop, .show = raw6_seq_show, }; static int __net_init raw6_init_net(struct net *net) { if (!proc_create_net_data("raw6", 0444, net->proc_net, &raw6_seq_ops, sizeof(struct raw_iter_state), &raw_v6_hashinfo)) return -ENOMEM; return 0; } static void __net_exit 
raw6_exit_net(struct net *net) { remove_proc_entry("raw6", net->proc_net); } static struct pernet_operations raw6_net_ops = { .init = raw6_init_net, .exit = raw6_exit_net, }; int __init raw6_proc_init(void) { return register_pernet_subsys(&raw6_net_ops); } void raw6_proc_exit(void) { unregister_pernet_subsys(&raw6_net_ops); } #endif /* CONFIG_PROC_FS */ /* Same as inet6_dgram_ops, sans udp_poll. */ const struct proto_ops inet6_sockraw_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, .bind = inet6_bind, .connect = inet_dgram_connect, /* ok */ .socketpair = sock_no_socketpair, /* a do nothing */ .accept = sock_no_accept, /* a do nothing */ .getname = inet6_getname, .poll = datagram_poll, /* ok */ .ioctl = inet6_ioctl, /* must change */ .gettstamp = sock_gettstamp, .listen = sock_no_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ .setsockopt = sock_common_setsockopt, /* ok */ .getsockopt = sock_common_getsockopt, /* ok */ .sendmsg = inet_sendmsg, /* ok */ .recvmsg = sock_common_recvmsg, /* ok */ .mmap = sock_no_mmap, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif }; static struct inet_protosw rawv6_protosw = { .type = SOCK_RAW, .protocol = IPPROTO_IP, /* wild card */ .prot = &rawv6_prot, .ops = &inet6_sockraw_ops, .flags = INET_PROTOSW_REUSE, }; int __init rawv6_init(void) { return inet6_register_protosw(&rawv6_protosw); } void rawv6_exit(void) { inet6_unregister_protosw(&rawv6_protosw); }
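The kernel file above implements the protocol side; from userspace the same paths are reached through an AF_INET6/SOCK_RAW socket. Below is a minimal sketch, assuming a standard glibc environment and CAP_NET_RAW; the filter choice and the checksum offset are just examples, not part of the kernel source.

/*
 * Userspace sketch (not part of the kernel file above).  Creating the
 * socket ends up in rawv6_init_sk(); the two setsockopt() calls are
 * served by rawv6_seticmpfilter() and do_rawv6_setsockopt().
 */
#include <netinet/in.h>
#include <netinet/icmp6.h>
#include <sys/socket.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct icmp6_filter filter;
	int offset = 2;	/* ICMPv6 checksum field offset */
	int fd;

	fd = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* Deliver only echo replies; everything else is dropped by icmpv6_filter(). */
	ICMP6_FILTER_SETBLOCKALL(&filter);
	ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &filter);
	setsockopt(fd, IPPROTO_ICMPV6, ICMP6_FILTER, &filter, sizeof(filter));

	/*
	 * Kernel checksumming at the given offset.  For ICMPv6 sockets this
	 * is already the default set by rawv6_init_sk(); shown only to
	 * illustrate the SOL_RAW-level IPV6_CHECKSUM option handled above.
	 */
	setsockopt(fd, SOL_RAW, IPV6_CHECKSUM, &offset, sizeof(offset));

	close(fd);
	return 0;
}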
/* * fs/nfs/idmap.c * * UID and GID to name mapping for clients. * * Copyright (c) 2002 The Regents of the University of Michigan. * All rights reserved. * * Marius Aamodt Eriksen <marius@umich.edu> * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <linux/types.h> #include <linux/parser.h> #include <linux/fs.h> #include <net/net_namespace.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/nfs_fs.h> #include <linux/nfs_fs_sb.h> #include <linux/key.h> #include <linux/keyctl.h> #include <linux/key-type.h> #include <keys/user-type.h> #include <keys/request_key_auth-type.h> #include <linux/module.h> #include <linux/user_namespace.h> #include "internal.h" #include "netns.h" #include "nfs4idmap.h" #include "nfs4trace.h" #define NFS_UINT_MAXLEN 11 static const struct cred *id_resolver_cache; static struct key_type key_type_id_resolver_legacy; struct idmap_legacy_upcalldata { struct rpc_pipe_msg pipe_msg; struct idmap_msg idmap_msg; struct key *authkey; struct idmap *idmap; }; struct idmap { struct rpc_pipe_dir_object idmap_pdo; struct rpc_pipe *idmap_pipe; struct idmap_legacy_upcalldata *idmap_upcall_data; struct mutex idmap_mutex; struct user_namespace *user_ns; }; static struct user_namespace *idmap_userns(const struct idmap *idmap) { if (idmap && idmap->user_ns) return idmap->user_ns; return &init_user_ns; } /** * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields * @fattr: fully initialised struct nfs_fattr * @owner_name: owner name string cache * @group_name: group name string cache */ void nfs_fattr_init_names(struct nfs_fattr *fattr, struct nfs4_string *owner_name, struct nfs4_string *group_name) { fattr->owner_name = owner_name; fattr->group_name = group_name; } static void nfs_fattr_free_owner_name(struct nfs_fattr *fattr) { fattr->valid &= ~NFS_ATTR_FATTR_OWNER_NAME; kfree(fattr->owner_name->data); } static void nfs_fattr_free_group_name(struct nfs_fattr *fattr) { fattr->valid &= ~NFS_ATTR_FATTR_GROUP_NAME; kfree(fattr->group_name->data); } static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr *fattr) { struct nfs4_string *owner = fattr->owner_name; kuid_t uid; if (!(fattr->valid & NFS_ATTR_FATTR_OWNER_NAME)) return false; if (nfs_map_name_to_uid(server, owner->data, owner->len, &uid) == 0) { fattr->uid = uid; fattr->valid |= NFS_ATTR_FATTR_OWNER; } return true; } static bool nfs_fattr_map_group_name(struct nfs_server *server, struct nfs_fattr *fattr) { struct nfs4_string *group = fattr->group_name; kgid_t gid; if 
(!(fattr->valid & NFS_ATTR_FATTR_GROUP_NAME)) return false; if (nfs_map_group_to_gid(server, group->data, group->len, &gid) == 0) { fattr->gid = gid; fattr->valid |= NFS_ATTR_FATTR_GROUP; } return true; } /** * nfs_fattr_free_names - free up the NFSv4 owner and group strings * @fattr: a fully initialised nfs_fattr structure */ void nfs_fattr_free_names(struct nfs_fattr *fattr) { if (fattr->valid & NFS_ATTR_FATTR_OWNER_NAME) nfs_fattr_free_owner_name(fattr); if (fattr->valid & NFS_ATTR_FATTR_GROUP_NAME) nfs_fattr_free_group_name(fattr); } /** * nfs_fattr_map_and_free_names - map owner/group strings into uid/gid and free * @server: pointer to the filesystem nfs_server structure * @fattr: a fully initialised nfs_fattr structure * * This helper maps the cached NFSv4 owner/group strings in fattr into * their numeric uid/gid equivalents, and then frees the cached strings. */ void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *fattr) { if (nfs_fattr_map_owner_name(server, fattr)) nfs_fattr_free_owner_name(fattr); if (nfs_fattr_map_group_name(server, fattr)) nfs_fattr_free_group_name(fattr); } int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res) { unsigned long val; char buf[16]; if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf)) return 0; memcpy(buf, name, namelen); buf[namelen] = '\0'; if (kstrtoul(buf, 0, &val) != 0) return 0; *res = val; return 1; } EXPORT_SYMBOL_GPL(nfs_map_string_to_numeric); static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen) { return snprintf(buf, buflen, "%u", id); } static struct key_type key_type_id_resolver = { .name = "id_resolver", .preparse = user_preparse, .free_preparse = user_free_preparse, .instantiate = generic_key_instantiate, .revoke = user_revoke, .destroy = user_destroy, .describe = user_describe, .read = user_read, }; int nfs_idmap_init(void) { struct cred *cred; struct key *keyring; int ret = 0; printk(KERN_NOTICE "NFS: Registering the %s key type\n", key_type_id_resolver.name); cred = prepare_kernel_cred(&init_task); if (!cred) return -ENOMEM; keyring = keyring_alloc(".id_resolver", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; } ret = register_key_type(&key_type_id_resolver); if (ret < 0) goto failed_put_key; ret = register_key_type(&key_type_id_resolver_legacy); if (ret < 0) goto failed_reg_legacy; set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags); cred->thread_keyring = keyring; cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; id_resolver_cache = cred; return 0; failed_reg_legacy: unregister_key_type(&key_type_id_resolver); failed_put_key: key_put(keyring); failed_put_cred: put_cred(cred); return ret; } void nfs_idmap_quit(void) { key_revoke(id_resolver_cache->thread_keyring); unregister_key_type(&key_type_id_resolver); unregister_key_type(&key_type_id_resolver_legacy); put_cred(id_resolver_cache); } /* * Assemble the description to pass to request_key() * This function will allocate a new string and update dest to point * at it. The caller is responsible for freeing dest. * * On error 0 is returned. Otherwise, the length of dest is returned. 
*/ static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen, const char *type, size_t typelen, char **desc) { char *cp; size_t desclen = typelen + namelen + 2; *desc = kmalloc(desclen, GFP_KERNEL); if (!*desc) return -ENOMEM; cp = *desc; memcpy(cp, type, typelen); cp += typelen; *cp++ = ':'; memcpy(cp, name, namelen); cp += namelen; *cp = '\0'; return desclen; } static struct key *nfs_idmap_request_key(const char *name, size_t namelen, const char *type, struct idmap *idmap) { char *desc; struct key *rkey = ERR_PTR(-EAGAIN); ssize_t ret; ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc); if (ret < 0) return ERR_PTR(ret); if (!idmap->user_ns || idmap->user_ns == &init_user_ns) rkey = request_key(&key_type_id_resolver, desc, ""); if (IS_ERR(rkey)) { mutex_lock(&idmap->idmap_mutex); rkey = request_key_with_auxdata(&key_type_id_resolver_legacy, desc, NULL, "", 0, idmap); mutex_unlock(&idmap->idmap_mutex); } if (!IS_ERR(rkey)) set_bit(KEY_FLAG_ROOT_CAN_INVAL, &rkey->flags); kfree(desc); return rkey; } static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, const char *type, void *data, size_t data_size, struct idmap *idmap) { struct key *rkey; const struct user_key_payload *payload; ssize_t ret; scoped_with_creds(id_resolver_cache) rkey = nfs_idmap_request_key(name, namelen, type, idmap); if (IS_ERR(rkey)) { ret = PTR_ERR(rkey); goto out; } rcu_read_lock(); rkey->perm |= KEY_USR_VIEW; ret = key_validate(rkey); if (ret < 0) goto out_up; payload = user_key_payload_rcu(rkey); if (IS_ERR_OR_NULL(payload)) { ret = PTR_ERR(payload); goto out_up; } ret = payload->datalen; if (ret > 0 && ret <= data_size) memcpy(data, payload->data, ret); else ret = -EINVAL; out_up: rcu_read_unlock(); key_put(rkey); out: return ret; } /* ID -> Name */ static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, size_t buflen, struct idmap *idmap) { char id_str[NFS_UINT_MAXLEN]; int id_len; ssize_t ret; id_len = nfs_map_numeric_to_string(id, id_str, sizeof(id_str)); ret = nfs_idmap_get_key(id_str, id_len, type, buf, buflen, idmap); if (ret < 0) return -EINVAL; return ret; } /* Name -> ID */ static int nfs_idmap_lookup_id(const char *name, size_t namelen, const char *type, __u32 *id, struct idmap *idmap) { char id_str[NFS_UINT_MAXLEN]; long id_long; ssize_t data_size; int ret = 0; data_size = nfs_idmap_get_key(name, namelen, type, id_str, NFS_UINT_MAXLEN, idmap); if (data_size <= 0) { ret = -EINVAL; } else { ret = kstrtol(id_str, 10, &id_long); if (!ret) *id = (__u32)id_long; } return ret; } /* idmap classic begins here */ enum { Opt_find_uid, Opt_find_gid, Opt_find_user, Opt_find_group, Opt_find_err }; static const match_table_t nfs_idmap_tokens = { { Opt_find_uid, "uid:%s" }, { Opt_find_gid, "gid:%s" }, { Opt_find_user, "user:%s" }, { Opt_find_group, "group:%s" }, { Opt_find_err, NULL } }; static int nfs_idmap_legacy_upcall(struct key *, void *); static ssize_t idmap_pipe_downcall(struct file *, const char __user *, size_t); static void idmap_release_pipe(struct inode *); static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); static const struct rpc_pipe_ops idmap_upcall_ops = { .upcall = rpc_pipe_generic_upcall, .downcall = idmap_pipe_downcall, .release_pipe = idmap_release_pipe, .destroy_msg = idmap_pipe_destroy_msg, }; static struct key_type key_type_id_resolver_legacy = { .name = "id_legacy", .preparse = user_preparse, .free_preparse = user_free_preparse, .instantiate = generic_key_instantiate, .revoke = user_revoke, .destroy = user_destroy, .describe = 
user_describe, .read = user_read, .request_key = nfs_idmap_legacy_upcall, }; static void nfs_idmap_pipe_destroy(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct idmap *idmap = pdo->pdo_data; rpc_unlink(idmap->idmap_pipe); } static int nfs_idmap_pipe_create(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct idmap *idmap = pdo->pdo_data; return rpc_mkpipe_dentry(dir, "idmap", idmap, idmap->idmap_pipe); } static const struct rpc_pipe_dir_object_ops nfs_idmap_pipe_dir_object_ops = { .create = nfs_idmap_pipe_create, .destroy = nfs_idmap_pipe_destroy, }; int nfs_idmap_new(struct nfs_client *clp) { struct idmap *idmap; struct rpc_pipe *pipe; int error; idmap = kzalloc_obj(*idmap); if (idmap == NULL) return -ENOMEM; mutex_init(&idmap->idmap_mutex); idmap->user_ns = get_user_ns(clp->cl_rpcclient->cl_cred->user_ns); rpc_init_pipe_dir_object(&idmap->idmap_pdo, &nfs_idmap_pipe_dir_object_ops, idmap); pipe = rpc_mkpipe_data(&idmap_upcall_ops, 0); if (IS_ERR(pipe)) { error = PTR_ERR(pipe); goto err; } idmap->idmap_pipe = pipe; error = rpc_add_pipe_dir_object(clp->cl_net, &clp->cl_rpcclient->cl_pipedir_objects, &idmap->idmap_pdo); if (error) goto err_destroy_pipe; clp->cl_idmap = idmap; return 0; err_destroy_pipe: rpc_destroy_pipe_data(idmap->idmap_pipe); err: put_user_ns(idmap->user_ns); kfree(idmap); return error; } void nfs_idmap_delete(struct nfs_client *clp) { struct idmap *idmap = clp->cl_idmap; if (!idmap) return; clp->cl_idmap = NULL; rpc_remove_pipe_dir_object(clp->cl_net, &clp->cl_rpcclient->cl_pipedir_objects, &idmap->idmap_pdo); rpc_destroy_pipe_data(idmap->idmap_pipe); put_user_ns(idmap->user_ns); kfree(idmap); } static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap, struct idmap_msg *im, struct rpc_pipe_msg *msg) { substring_t substr; int token, ret; im->im_type = IDMAP_TYPE_GROUP; token = match_token(desc, nfs_idmap_tokens, &substr); switch (token) { case Opt_find_uid: im->im_type = IDMAP_TYPE_USER; fallthrough; case Opt_find_gid: im->im_conv = IDMAP_CONV_NAMETOID; ret = match_strlcpy(im->im_name, &substr, IDMAP_NAMESZ); break; case Opt_find_user: im->im_type = IDMAP_TYPE_USER; fallthrough; case Opt_find_group: im->im_conv = IDMAP_CONV_IDTONAME; ret = match_int(&substr, &im->im_id); if (ret) goto out; break; default: ret = -EINVAL; goto out; } msg->data = im; msg->len = sizeof(struct idmap_msg); out: return ret; } static bool nfs_idmap_prepare_pipe_upcall(struct idmap *idmap, struct idmap_legacy_upcalldata *data) { if (idmap->idmap_upcall_data != NULL) { WARN_ON_ONCE(1); return false; } idmap->idmap_upcall_data = data; return true; } static void nfs_idmap_complete_pipe_upcall(struct idmap_legacy_upcalldata *data, int ret) { complete_request_key(data->authkey, ret); key_put(data->authkey); kfree(data); } static void nfs_idmap_abort_pipe_upcall(struct idmap *idmap, struct idmap_legacy_upcalldata *data, int ret) { if (cmpxchg(&idmap->idmap_upcall_data, data, NULL) == data) nfs_idmap_complete_pipe_upcall(data, ret); } static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux) { struct idmap_legacy_upcalldata *data; struct request_key_auth *rka = get_request_key_auth(authkey); struct rpc_pipe_msg *msg; struct idmap_msg *im; struct idmap *idmap = aux; struct key *key = rka->target_key; int ret = -ENOKEY; if (!aux) goto out1; /* msg and im are freed in idmap_pipe_destroy_msg */ ret = -ENOMEM; data = kzalloc_obj(*data); if (!data) goto out1; msg = &data->pipe_msg; im = &data->idmap_msg; data->idmap = idmap; data->authkey = key_get(authkey); ret = 
nfs_idmap_prepare_message(key->description, idmap, im, msg); if (ret < 0) goto out2; ret = -EAGAIN; if (!nfs_idmap_prepare_pipe_upcall(idmap, data)) goto out2; ret = rpc_queue_upcall(idmap->idmap_pipe, msg); if (ret < 0) nfs_idmap_abort_pipe_upcall(idmap, data, ret); return ret; out2: kfree(data); out1: complete_request_key(authkey, ret); return ret; } static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data, size_t datalen) { return key_instantiate_and_link(key, data, datalen, id_resolver_cache->thread_keyring, authkey); } static int nfs_idmap_read_and_verify_message(struct idmap_msg *im, struct idmap_msg *upcall, struct key *key, struct key *authkey) { char id_str[NFS_UINT_MAXLEN]; size_t len; int ret = -ENOKEY; /* ret = -ENOKEY */ if (upcall->im_type != im->im_type || upcall->im_conv != im->im_conv) goto out; switch (im->im_conv) { case IDMAP_CONV_NAMETOID: if (strcmp(upcall->im_name, im->im_name) != 0) break; /* Note: here we store the NUL terminator too */ len = 1 + nfs_map_numeric_to_string(im->im_id, id_str, sizeof(id_str)); ret = nfs_idmap_instantiate(key, authkey, id_str, len); break; case IDMAP_CONV_IDTONAME: if (upcall->im_id != im->im_id) break; len = strlen(im->im_name); ret = nfs_idmap_instantiate(key, authkey, im->im_name, len); break; default: ret = -EINVAL; } out: return ret; } static ssize_t idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { struct request_key_auth *rka; struct rpc_inode *rpci = RPC_I(file_inode(filp)); struct idmap *idmap = (struct idmap *)rpci->private; struct idmap_legacy_upcalldata *data; struct key *authkey; struct idmap_msg im; size_t namelen_in; int ret = -ENOKEY; /* If instantiation is successful, anyone waiting for key construction * will have been woken up and someone else may now have used * idmap_key_cons - so after this point we may no longer touch it. 
*/ data = xchg(&idmap->idmap_upcall_data, NULL); if (data == NULL) goto out_noupcall; authkey = data->authkey; rka = get_request_key_auth(authkey); if (mlen != sizeof(im)) { ret = -ENOSPC; goto out; } if (copy_from_user(&im, src, mlen) != 0) { ret = -EFAULT; goto out; } if (!(im.im_status & IDMAP_STATUS_SUCCESS)) { ret = -ENOKEY; goto out; } namelen_in = strnlen(im.im_name, IDMAP_NAMESZ); if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) { ret = -EINVAL; goto out; } ret = nfs_idmap_read_and_verify_message(&im, &data->idmap_msg, rka->target_key, authkey); if (ret >= 0) { key_set_timeout(rka->target_key, nfs_idmap_cache_timeout); ret = mlen; } out: nfs_idmap_complete_pipe_upcall(data, ret); out_noupcall: return ret; } static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg) { struct idmap_legacy_upcalldata *data = container_of(msg, struct idmap_legacy_upcalldata, pipe_msg); struct idmap *idmap = data->idmap; if (msg->errno) nfs_idmap_abort_pipe_upcall(idmap, data, msg->errno); } static void idmap_release_pipe(struct inode *inode) { struct rpc_inode *rpci = RPC_I(inode); struct idmap *idmap = (struct idmap *)rpci->private; struct idmap_legacy_upcalldata *data; data = xchg(&idmap->idmap_upcall_data, NULL); if (data) nfs_idmap_complete_pipe_upcall(data, -EPIPE); } int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, kuid_t *uid) { struct idmap *idmap = server->nfs_client->cl_idmap; __u32 id = -1; int ret = 0; if (!nfs_map_string_to_numeric(name, namelen, &id)) ret = nfs_idmap_lookup_id(name, namelen, "uid", &id, idmap); if (ret == 0) { *uid = make_kuid(idmap_userns(idmap), id); if (!uid_valid(*uid)) ret = -ERANGE; } trace_nfs4_map_name_to_uid(name, namelen, id, ret); return ret; } int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, kgid_t *gid) { struct idmap *idmap = server->nfs_client->cl_idmap; __u32 id = -1; int ret = 0; if (!nfs_map_string_to_numeric(name, namelen, &id)) ret = nfs_idmap_lookup_id(name, namelen, "gid", &id, idmap); if (ret == 0) { *gid = make_kgid(idmap_userns(idmap), id); if (!gid_valid(*gid)) ret = -ERANGE; } trace_nfs4_map_group_to_gid(name, namelen, id, ret); return ret; } int nfs_map_uid_to_name(const struct nfs_server *server, kuid_t uid, char *buf, size_t buflen) { struct idmap *idmap = server->nfs_client->cl_idmap; int ret = -EINVAL; __u32 id; id = from_kuid_munged(idmap_userns(idmap), uid); if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) ret = nfs_idmap_lookup_name(id, "user", buf, buflen, idmap); if (ret < 0) ret = nfs_map_numeric_to_string(id, buf, buflen); trace_nfs4_map_uid_to_name(buf, ret, id, ret); return ret; } int nfs_map_gid_to_group(const struct nfs_server *server, kgid_t gid, char *buf, size_t buflen) { struct idmap *idmap = server->nfs_client->cl_idmap; int ret = -EINVAL; __u32 id; id = from_kgid_munged(idmap_userns(idmap), gid); if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) ret = nfs_idmap_lookup_name(id, "group", buf, buflen, idmap); if (ret < 0) ret = nfs_map_numeric_to_string(id, buf, buflen); trace_nfs4_map_gid_to_group(buf, ret, id, ret); return ret; } |
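/*
 * Illustrative sketch, not part of fs/nfs/idmap.c: the numeric fast path
 * used by nfs_map_name_to_uid()/nfs_map_group_to_gid() above. A purely
 * numeric owner string such as "1000" is converted locally by
 * nfs_map_string_to_numeric(); a string containing '@' (or one that does
 * not parse as a number) returns 0 and is resolved through the
 * id_resolver keyring upcall instead. The function name and pr_info()
 * output below are hypothetical.
 */
static void example_owner_fast_path(void)
{
	__u32 id;

	if (nfs_map_string_to_numeric("1000", 4, &id))
		pr_info("mapped locally to id %u\n", id);	/* id == 1000 */
	else
		pr_info("needs an upcall to the idmapper\n");
}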
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM skb

#if !defined(_TRACE_SKB_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SKB_H

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/tracepoint.h>

#undef FN
#define FN(reason) TRACE_DEFINE_ENUM(SKB_DROP_REASON_##reason);
DEFINE_DROP_REASON(FN, FN)

#undef FN
#undef FNe
#define FN(reason) { SKB_DROP_REASON_##reason, #reason },
#define FNe(reason) { SKB_DROP_REASON_##reason, #reason }

/*
 * Tracepoint for freeing an sk_buff:
 */
TRACE_EVENT(kfree_skb,

	TP_PROTO(struct sk_buff *skb, void *location,
		 enum skb_drop_reason reason, struct sock *rx_sk),

	TP_ARGS(skb, location, reason, rx_sk),

	TP_STRUCT__entry(
		__field(void *,			skbaddr)
		__field(void *,			location)
		__field(void *,			rx_sk)
		__field(unsigned short,		protocol)
		__field(enum skb_drop_reason,	reason)
	),

	TP_fast_assign(
		__entry->skbaddr = skb;
		__entry->location = location;
		__entry->rx_sk = rx_sk;
		__entry->protocol = ntohs(skb->protocol);
		__entry->reason = reason;
	),

	TP_printk("skbaddr=%p rx_sk=%p protocol=%u location=%pS reason: %s",
		  __entry->skbaddr, __entry->rx_sk, __entry->protocol,
		  __entry->location,
		  __print_symbolic(__entry->reason, DEFINE_DROP_REASON(FN, FNe)))
);

#undef FN
#undef FNe

TRACE_EVENT(consume_skb,

	TP_PROTO(struct sk_buff *skb, void *location),

	TP_ARGS(skb, location),

	TP_STRUCT__entry(
		__field(	void *,	skbaddr)
		__field(	void *,	location)
	),

	TP_fast_assign(
		__entry->skbaddr = skb;
		__entry->location = location;
	),

	TP_printk("skbaddr=%p location=%pS", __entry->skbaddr, __entry->location)
);

TRACE_EVENT(skb_copy_datagram_iovec,

	TP_PROTO(const struct sk_buff *skb, int len),

	TP_ARGS(skb, len),

	TP_STRUCT__entry(
		__field(	const void *,	skbaddr	)
		__field(	int,		len	)
	),

	TP_fast_assign(
		__entry->skbaddr = skb;
		__entry->len = len;
	),

	TP_printk("skbaddr=%p len=%d", __entry->skbaddr, __entry->len)
);

#endif /* _TRACE_SKB_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
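/*
 * Illustrative sketch, not part of this header: TRACE_EVENT(kfree_skb, ...)
 * above generates a trace_kfree_skb() call with the TP_PROTO() signature.
 * A drop site could fire it roughly as below; the function name and the
 * NULL receiving socket are hypothetical, and in practice
 * kfree_skb_reason() emits this tracepoint itself before freeing the skb.
 */
static inline void example_trace_drop(struct sk_buff *skb,
				      enum skb_drop_reason reason)
{
	trace_kfree_skb(skb, __builtin_return_address(0), reason, NULL);
}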
| 63 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_X86_XSAVE_H #define __ASM_X86_XSAVE_H #include <linux/uaccess.h> #include <linux/types.h> #include <asm/processor.h> #include <asm/fpu/api.h> #include <asm/user.h> /* Bit 63 of XCR0 is reserved for future expansion */ #define XFEATURE_MASK_EXTEND (~(XFEATURE_MASK_FPSSE | (1ULL << 63))) #define FXSAVE_SIZE 512 #define XSAVE_HDR_SIZE 64 #define XSAVE_HDR_OFFSET FXSAVE_SIZE #define XSAVE_YMM_SIZE 256 #define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) #define XSAVE_ALIGNMENT 64 /* All currently supported user features */ #define XFEATURE_MASK_USER_SUPPORTED (XFEATURE_MASK_FP | \ XFEATURE_MASK_SSE | \ XFEATURE_MASK_YMM | \ XFEATURE_MASK_OPMASK | \ XFEATURE_MASK_ZMM_Hi256 | \ XFEATURE_MASK_Hi16_ZMM | \ XFEATURE_MASK_PKRU | \ XFEATURE_MASK_BNDREGS | \ XFEATURE_MASK_BNDCSR | \ XFEATURE_MASK_XTILE | \ XFEATURE_MASK_APX) /* * Features which are restored when returning to user space. * PKRU is not restored on return to user space because PKRU * is switched eagerly in switch_to() and flush_thread() */ #define XFEATURE_MASK_USER_RESTORE \ (XFEATURE_MASK_USER_SUPPORTED & ~XFEATURE_MASK_PKRU) /* Features which are dynamically enabled for a process on request */ #define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA /* Supervisor features which are enabled only in guest FPUs */ #define XFEATURE_MASK_GUEST_SUPERVISOR XFEATURE_MASK_CET_KERNEL /* All currently supported supervisor features */ #define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID | \ XFEATURE_MASK_CET_USER | \ XFEATURE_MASK_GUEST_SUPERVISOR) /* * A supervisor state component may not always contain valuable information, * and its size may be huge. Saving/restoring such supervisor state components * at each context switch can cause high CPU and space overhead, which should * be avoided. Such supervisor state components should only be saved/restored * on demand. The on-demand supervisor features are set in this mask. * * Unlike the existing supported supervisor features, an independent supervisor * feature does not allocate a buffer in task->fpu, and the corresponding * supervisor state component cannot be saved/restored at each context switch. * * To support an independent supervisor feature, a developer should follow the * dos and don'ts as below: * - Do dynamically allocate a buffer for the supervisor state component. * - Do manually invoke the XSAVES/XRSTORS instruction to save/restore the * state component to/from the buffer. * - Don't set the bit corresponding to the independent supervisor feature in * IA32_XSS at run time, since it has been set at boot time. */ #define XFEATURE_MASK_INDEPENDENT (XFEATURE_MASK_LBR) /* * Unsupported supervisor features. When a supervisor feature in this mask is * supported in the future, move it to the supported supervisor feature mask. */ #define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT) /* All supervisor states including supported and unsupported states. 
*/ #define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \ XFEATURE_MASK_INDEPENDENT | \ XFEATURE_MASK_SUPERVISOR_UNSUPPORTED) /* * The feature mask required to restore FPU state: * - All user states which are not eagerly switched in switch_to()/exec() * - The suporvisor states */ #define XFEATURE_MASK_FPSTATE (XFEATURE_MASK_USER_RESTORE | \ XFEATURE_MASK_SUPERVISOR_SUPPORTED) /* * Features in this mask have space allocated in the signal frame, but may not * have that space initialized when the feature is in its init state. */ #define XFEATURE_MASK_SIGFRAME_INITOPT (XFEATURE_MASK_XTILE | \ XFEATURE_MASK_USER_DYNAMIC) extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask); int xfeature_size(int xfeature_nr); void xsaves(struct xregs_state *xsave, u64 mask); void xrstors(struct xregs_state *xsave, u64 mask); int xfd_enable_feature(u64 xfd_err); #ifdef CONFIG_X86_64 DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); #endif #ifdef CONFIG_X86_64 DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); static __always_inline __pure bool fpu_state_size_dynamic(void) { return static_branch_unlikely(&__fpu_state_size_dynamic); } #else static __always_inline __pure bool fpu_state_size_dynamic(void) { return false; } #endif #endif |
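/*
 * Illustrative sketch, not part of this header: the "independent supervisor
 * feature" rules documented above, applied to XFEATURE_MASK_LBR using the
 * xsaves()/xrstors() helpers declared here. The caller is assumed to have
 * allocated a suitably sized, 64-byte aligned buffer itself; the function
 * names and the buffer handling are hypothetical and heavily simplified.
 */
static void example_lbr_save(struct xregs_state *lbr_buf)
{
	/* save only the LBR component into the separately allocated buffer */
	xsaves(lbr_buf, XFEATURE_MASK_LBR);
}

static void example_lbr_restore(struct xregs_state *lbr_buf)
{
	/* restore the LBR component from that buffer */
	xrstors(lbr_buf, XFEATURE_MASK_LBR);
}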
// SPDX-License-Identifier: GPL-2.0 /* * devtmpfs - kernel-maintained tmpfs-based /dev * * Copyright (C) 2009, Kay Sievers <kay.sievers@vrfy.org> * * During bootup, before any driver core device is registered, * devtmpfs, a tmpfs-based filesystem is created. Every driver-core * device which requests a device node, will add a node in this * filesystem. * By default, all devices are named after the name of the device, * owned by root and have a default mode of 0600. Subsystems can * overwrite the default setting if needed.
*/ #define pr_fmt(fmt) "devtmpfs: " fmt #include <linux/kernel.h> #include <linux/syscalls.h> #include <linux/mount.h> #include <linux/device.h> #include <linux/blkdev.h> #include <linux/namei.h> #include <linux/fs.h> #include <linux/shmem_fs.h> #include <linux/ramfs.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/kthread.h> #include <linux/init_syscalls.h> #include <uapi/linux/mount.h> #include "base.h" #ifdef CONFIG_DEVTMPFS_SAFE #define DEVTMPFS_MFLAGS (MS_SILENT | MS_NOEXEC | MS_NOSUID) #else #define DEVTMPFS_MFLAGS (MS_SILENT) #endif static struct task_struct *thread; static int __initdata mount_dev = IS_ENABLED(CONFIG_DEVTMPFS_MOUNT); static DEFINE_SPINLOCK(req_lock); static struct req { struct req *next; struct completion done; int err; const char *name; umode_t mode; /* 0 => delete */ kuid_t uid; kgid_t gid; struct device *dev; } *requests; static int __init mount_param(char *str) { return kstrtoint(str, 0, &mount_dev) == 0; } __setup("devtmpfs.mount=", mount_param); static struct vfsmount *mnt; static struct file_system_type internal_fs_type = { .name = "devtmpfs", #ifdef CONFIG_TMPFS .init_fs_context = shmem_init_fs_context, #else .init_fs_context = ramfs_init_fs_context, #endif .kill_sb = kill_anon_super, }; /* Simply take a ref on the existing mount */ static int devtmpfs_get_tree(struct fs_context *fc) { struct super_block *sb = mnt->mnt_sb; atomic_inc(&sb->s_active); down_write(&sb->s_umount); fc->root = dget(sb->s_root); return 0; } /* Ops are filled in during init depending on underlying shmem or ramfs type */ static struct fs_context_operations devtmpfs_context_ops = {}; /* Call the underlying initialization and set to our ops */ static int devtmpfs_init_fs_context(struct fs_context *fc) { int ret; #ifdef CONFIG_TMPFS ret = shmem_init_fs_context(fc); #else ret = ramfs_init_fs_context(fc); #endif if (ret < 0) return ret; fc->ops = &devtmpfs_context_ops; return 0; } static struct file_system_type dev_fs_type = { .name = "devtmpfs", .init_fs_context = devtmpfs_init_fs_context, }; static int devtmpfs_submit_req(struct req *req, const char *tmp) { init_completion(&req->done); spin_lock(&req_lock); req->next = requests; requests = req; spin_unlock(&req_lock); wake_up_process(thread); wait_for_completion(&req->done); kfree(tmp); return req->err; } int devtmpfs_create_node(struct device *dev) { const char *tmp = NULL; struct req req; if (!thread) return 0; req.mode = 0; req.uid = GLOBAL_ROOT_UID; req.gid = GLOBAL_ROOT_GID; req.name = device_get_devnode(dev, &req.mode, &req.uid, &req.gid, &tmp); if (!req.name) return -ENOMEM; if (req.mode == 0) req.mode = 0600; if (is_blockdev(dev)) req.mode |= S_IFBLK; else req.mode |= S_IFCHR; req.dev = dev; return devtmpfs_submit_req(&req, tmp); } int devtmpfs_delete_node(struct device *dev) { const char *tmp = NULL; struct req req; if (!thread) return 0; req.name = device_get_devnode(dev, NULL, NULL, NULL, &tmp); if (!req.name) return -ENOMEM; req.mode = 0; req.dev = dev; return devtmpfs_submit_req(&req, tmp); } static int dev_mkdir(const char *name, umode_t mode) { struct dentry *dentry; struct path path; dentry = start_creating_path(AT_FDCWD, name, &path, LOOKUP_DIRECTORY); if (IS_ERR(dentry)) return PTR_ERR(dentry); dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode, NULL); if (!IS_ERR(dentry)) /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; end_creating_path(&path, dentry); return PTR_ERR_OR_ZERO(dentry); } static int create_path(const char *nodepath) { char *path; char *s; int 
err = 0; /* parent directories do not exist, create them */ path = kstrdup(nodepath, GFP_KERNEL); if (!path) return -ENOMEM; s = path; for (;;) { s = strchr(s, '/'); if (!s) break; s[0] = '\0'; err = dev_mkdir(path, 0755); if (err && err != -EEXIST) break; s[0] = '/'; s++; } kfree(path); return err; } static int handle_create(const char *nodename, umode_t mode, kuid_t uid, kgid_t gid, struct device *dev) { struct dentry *dentry; struct path path; int err; dentry = start_creating_path(AT_FDCWD, nodename, &path, 0); if (dentry == ERR_PTR(-ENOENT)) { create_path(nodename); dentry = start_creating_path(AT_FDCWD, nodename, &path, 0); } if (IS_ERR(dentry)) return PTR_ERR(dentry); err = vfs_mknod(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode, dev->devt, NULL); if (!err) { struct iattr newattrs; newattrs.ia_mode = mode; newattrs.ia_uid = uid; newattrs.ia_gid = gid; newattrs.ia_valid = ATTR_MODE|ATTR_UID|ATTR_GID; inode_lock(d_inode(dentry)); notify_change(&nop_mnt_idmap, dentry, &newattrs, NULL); inode_unlock(d_inode(dentry)); /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; } end_creating_path(&path, dentry); return err; } static int dev_rmdir(const char *name) { struct path parent; struct dentry *dentry; int err; dentry = start_removing_path(name, &parent); if (IS_ERR(dentry)) return PTR_ERR(dentry); if (d_inode(dentry)->i_private == &thread) err = vfs_rmdir(&nop_mnt_idmap, d_inode(parent.dentry), dentry, NULL); else err = -EPERM; end_removing_path(&parent, dentry); return err; } static int delete_path(const char *nodepath) { char *path; int err = 0; path = kstrdup(nodepath, GFP_KERNEL); if (!path) return -ENOMEM; for (;;) { char *base; base = strrchr(path, '/'); if (!base) break; base[0] = '\0'; err = dev_rmdir(path); if (err) break; } kfree(path); return err; } static int dev_mynode(struct device *dev, struct inode *inode) { /* did we create it */ if (inode->i_private != &thread) return 0; /* does the dev_t match */ if (is_blockdev(dev)) { if (!S_ISBLK(inode->i_mode)) return 0; } else { if (!S_ISCHR(inode->i_mode)) return 0; } if (inode->i_rdev != dev->devt) return 0; /* ours */ return 1; } static int handle_remove(const char *nodename, struct device *dev) { struct path parent; struct dentry *dentry; struct inode *inode; int deleted = 0; int err = 0; dentry = start_removing_path(nodename, &parent); if (IS_ERR(dentry)) return PTR_ERR(dentry); inode = d_inode(dentry); if (dev_mynode(dev, inode)) { struct iattr newattrs; /* * before unlinking this node, reset permissions * of possible references like hardlinks */ newattrs.ia_uid = GLOBAL_ROOT_UID; newattrs.ia_gid = GLOBAL_ROOT_GID; newattrs.ia_mode = inode->i_mode & ~0777; newattrs.ia_valid = ATTR_UID|ATTR_GID|ATTR_MODE; inode_lock(d_inode(dentry)); notify_change(&nop_mnt_idmap, dentry, &newattrs, NULL); inode_unlock(d_inode(dentry)); err = vfs_unlink(&nop_mnt_idmap, d_inode(parent.dentry), dentry, NULL); if (!err || err == -ENOENT) deleted = 1; } end_removing_path(&parent, dentry); if (deleted && strchr(nodename, '/')) delete_path(nodename); return err; } /* * If configured, or requested by the commandline, devtmpfs will be * auto-mounted after the kernel mounted the root filesystem. 
*/ int __init devtmpfs_mount(void) { int err; if (!mount_dev) return 0; if (!thread) return 0; err = init_mount("devtmpfs", "dev", "devtmpfs", DEVTMPFS_MFLAGS, NULL); if (err) pr_info("error mounting %d\n", err); else pr_info("mounted\n"); return err; } static __initdata DECLARE_COMPLETION(setup_done); static int handle(const char *name, umode_t mode, kuid_t uid, kgid_t gid, struct device *dev) { if (mode) return handle_create(name, mode, uid, gid, dev); else return handle_remove(name, dev); } static void __noreturn devtmpfs_work_loop(void) { while (1) { spin_lock(&req_lock); while (requests) { struct req *req = requests; requests = NULL; spin_unlock(&req_lock); while (req) { struct req *next = req->next; req->err = handle(req->name, req->mode, req->uid, req->gid, req->dev); complete(&req->done); req = next; } spin_lock(&req_lock); } __set_current_state(TASK_INTERRUPTIBLE); spin_unlock(&req_lock); schedule(); } } static noinline int __init devtmpfs_setup(void *p) { int err; err = ksys_unshare(CLONE_NEWNS); if (err) goto out; err = init_mount("devtmpfs", "/", "devtmpfs", DEVTMPFS_MFLAGS, NULL); if (err) goto out; init_chdir("/.."); /* will traverse into overmounted root */ init_chroot("."); out: *(int *)p = err; return err; } /* * The __ref is because devtmpfs_setup needs to be __init for the routines it * calls. That call is done while devtmpfs_init, which is marked __init, * synchronously waits for it to complete. */ static int __ref devtmpfsd(void *p) { int err = devtmpfs_setup(p); complete(&setup_done); if (err) return err; devtmpfs_work_loop(); return 0; } /* * Get the underlying (shmem/ramfs) context ops to build ours */ static int devtmpfs_configure_context(void) { struct fs_context *fc; fc = fs_context_for_reconfigure(mnt->mnt_root, mnt->mnt_sb->s_flags, MS_RMT_MASK); if (IS_ERR(fc)) return PTR_ERR(fc); /* Set up devtmpfs_context_ops based on underlying type */ devtmpfs_context_ops.free = fc->ops->free; devtmpfs_context_ops.dup = fc->ops->dup; devtmpfs_context_ops.parse_param = fc->ops->parse_param; devtmpfs_context_ops.parse_monolithic = fc->ops->parse_monolithic; devtmpfs_context_ops.get_tree = &devtmpfs_get_tree; devtmpfs_context_ops.reconfigure = fc->ops->reconfigure; put_fs_context(fc); return 0; } /* * Create devtmpfs instance, driver-core devices will add their device * nodes here. */ int __init devtmpfs_init(void) { char opts[] = "mode=0755"; int err; mnt = vfs_kern_mount(&internal_fs_type, 0, "devtmpfs", opts); if (IS_ERR(mnt)) { pr_err("unable to create devtmpfs %ld\n", PTR_ERR(mnt)); return PTR_ERR(mnt); } err = devtmpfs_configure_context(); if (err) { pr_err("unable to configure devtmpfs type %d\n", err); return err; } err = register_filesystem(&dev_fs_type); if (err) { pr_err("unable to register devtmpfs type %d\n", err); return err; } thread = kthread_run(devtmpfsd, &err, "kdevtmpfs"); if (!IS_ERR(thread)) { wait_for_completion(&setup_done); } else { err = PTR_ERR(thread); thread = NULL; } if (err) { pr_err("unable to create devtmpfs %d\n", err); unregister_filesystem(&dev_fs_type); thread = NULL; return err; } pr_info("initialized\n"); return 0; } |
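/*
 * Illustrative sketch, not part of devtmpfs.c: the file comment above notes
 * that nodes default to root:root mode 0600 and that subsystems can override
 * this. One common way is a devnode() callback on the class, which
 * device_get_devnode() consults when devtmpfs_create_node() builds the
 * request. The class and callback names below are hypothetical, and the
 * exact callback signature varies slightly between kernel versions.
 */
static char *example_devnode(const struct device *dev, umode_t *mode)
{
	if (mode)
		*mode = 0666;	/* world read/write instead of the 0600 default */
	return NULL;		/* keep the default node name */
}

static const struct class example_class = {
	.name		= "example",
	.devnode	= example_devnode,
};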
| 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 
913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 | /* SPDX-License-Identifier: GPL-2.0+ */ #ifndef _LINUX_MAPLE_TREE_H #define _LINUX_MAPLE_TREE_H /* * Maple Tree - An RCU-safe adaptive tree for storing ranges * Copyright (c) 2018-2022 Oracle * Authors: Liam R. Howlett <Liam.Howlett@Oracle.com> * Matthew Wilcox <willy@infradead.org> */ #include <linux/kernel.h> #include <linux/rcupdate.h> #include <linux/spinlock.h> /* #define CONFIG_MAPLE_RCU_DISABLED */ /* * Allocated nodes are mutable until they have been inserted into the tree, * at which time they cannot change their type until they have been removed * from the tree and an RCU grace period has passed. * * Removed nodes have their ->parent set to point to themselves. RCU readers * check ->parent before relying on the value that they loaded from the * slots array. This lets us reuse the slots array for the RCU head. * * Nodes in the tree point to their parent unless bit 0 is set. */ #if defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) /* 64bit sizes */ #define MAPLE_NODE_SLOTS 31 /* 256 bytes including ->parent */ #define MAPLE_RANGE64_SLOTS 16 /* 256 bytes */ #define MAPLE_ARANGE64_SLOTS 10 /* 240 bytes */ #define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 1) #else /* 32bit sizes */ #define MAPLE_NODE_SLOTS 63 /* 256 bytes including ->parent */ #define MAPLE_RANGE64_SLOTS 32 /* 256 bytes */ #define MAPLE_ARANGE64_SLOTS 21 /* 240 bytes */ #define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 2) #endif /* defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) */ #define MAPLE_NODE_MASK 255UL /* * The node->parent of the root node has bit 0 set and the rest of the pointer * is a pointer to the tree itself. No more bits are available in this pointer * (on m68k, the data structure may only be 2-byte aligned). * * Internal non-root nodes can only have maple_range_* nodes as parents. The * parent pointer is 256B aligned like all other tree nodes. When storing a 32 * or 64 bit values, the offset can fit into 4 bits. The 16 bit values need an * extra bit to store the offset. This extra bit comes from a reuse of the last * bit in the node type. This is possible by using bit 1 to indicate if bit 2 * is part of the type or the slot. * * Once the type is decided, the decision of an allocation range type or a * range type is done by examining the immutable tree flag for the * MT_FLAGS_ALLOC_RANGE flag. * * Node types: * 0b??1 = Root * 0b?00 = 16 bit nodes * 0b010 = 32 bit nodes * 0b110 = 64 bit nodes * * Slot size and location in the parent pointer: * type : slot location * 0b??1 : Root * 0b?00 : 16 bit values, type in 0-1, slot in 2-6 * 0b010 : 32 bit values, type in 0-2, slot in 3-6 * 0b110 : 64 bit values, type in 0-2, slot in 3-6 */ /* * This metadata is used to optimize the gap updating code and in reverse * searching for gaps or any other code that needs to find the end of the data. */ struct maple_metadata { unsigned char end; /* end of data */ unsigned char gap; /* offset of largest gap */ }; /* * Leaf nodes do not store pointers to nodes, they store user data. Users may * store almost any bit pattern. As noted above, the optimisation of storing an * entry at 0 in the root pointer cannot be done for data which have the bottom * two bits set to '10'. We also reserve values with the bottom two bits set to * '10' which are below 4096 (ie 2, 6, 10 .. 4094) for internal use. 
Some APIs * return errnos as a negative errno shifted right by two bits and the bottom * two bits set to '10', and while choosing to store these values in the array * is not an error, it may lead to confusion if you're testing for an error with * mas_is_err(). * * Non-leaf nodes store the type of the node pointed to (enum maple_type in bits * 3-6), bit 2 is reserved. That leaves bits 0-1 unused for now. * * In regular B-Tree terms, pivots are called keys. The term pivot is used to * indicate that the tree is specifying ranges, Pivots may appear in the * subtree with an entry attached to the value whereas keys are unique to a * specific position of a B-tree. Pivot values are inclusive of the slot with * the same index. */ struct maple_range_64 { struct maple_pnode *parent; unsigned long pivot[MAPLE_RANGE64_SLOTS - 1]; union { void __rcu *slot[MAPLE_RANGE64_SLOTS]; struct { void __rcu *pad[MAPLE_RANGE64_SLOTS - 1]; struct maple_metadata meta; }; }; }; /* * At tree creation time, the user can specify that they're willing to trade off * storing fewer entries in a tree in return for storing more information in * each node. * * The maple tree supports recording the largest range of NULL entries available * in this node, also called gaps. This optimises the tree for allocating a * range. */ struct maple_arange_64 { struct maple_pnode *parent; unsigned long pivot[MAPLE_ARANGE64_SLOTS - 1]; void __rcu *slot[MAPLE_ARANGE64_SLOTS]; unsigned long gap[MAPLE_ARANGE64_SLOTS]; struct maple_metadata meta; }; struct maple_topiary { struct maple_pnode *parent; struct maple_enode *next; /* Overlaps the pivot */ }; enum maple_type { maple_dense, maple_leaf_64, maple_range_64, maple_arange_64, maple_copy, }; enum store_type { wr_invalid, wr_new_root, wr_store_root, wr_exact_fit, wr_spanning_store, wr_split_store, wr_rebalance, wr_append, wr_node_store, wr_slot_store, }; struct maple_copy { /* * min, max, and pivots are values * start, end, split are indexes into arrays * data is a size */ struct { struct maple_node *node; unsigned long max; enum maple_type mt; } dst[3]; struct { struct maple_node *node; unsigned long max; unsigned char start; unsigned char end; enum maple_type mt; } src[4]; /* Simulated node */ void __rcu *slot[3]; unsigned long gap[3]; unsigned long min; union { unsigned long pivot[3]; struct { void *_pad[2]; unsigned long max; }; }; unsigned char end; /*Avoid passing these around */ unsigned char s_count; unsigned char d_count; unsigned char split; unsigned char data; unsigned char height; }; /** * DOC: Maple tree flags * * * MT_FLAGS_ALLOC_RANGE - Track gaps in this tree * * MT_FLAGS_USE_RCU - Operate in RCU mode * * MT_FLAGS_HEIGHT_OFFSET - The position of the tree height in the flags * * MT_FLAGS_HEIGHT_MASK - The mask for the maple tree height value * * MT_FLAGS_LOCK_MASK - How the mt_lock is used * * MT_FLAGS_LOCK_IRQ - Acquired irq-safe * * MT_FLAGS_LOCK_BH - Acquired bh-safe * * MT_FLAGS_LOCK_EXTERN - mt_lock is not used * * MAPLE_HEIGHT_MAX The largest height that can be stored */ #define MT_FLAGS_ALLOC_RANGE 0x01 #define MT_FLAGS_USE_RCU 0x02 #define MT_FLAGS_HEIGHT_OFFSET 0x02 #define MT_FLAGS_HEIGHT_MASK 0x7C #define MT_FLAGS_LOCK_MASK 0x300 #define MT_FLAGS_LOCK_IRQ 0x100 #define MT_FLAGS_LOCK_BH 0x200 #define MT_FLAGS_LOCK_EXTERN 0x300 #define MT_FLAGS_ALLOC_WRAPPED 0x0800 #define MAPLE_HEIGHT_MAX 31 #define MAPLE_NODE_TYPE_MASK 0x0F #define MAPLE_NODE_TYPE_SHIFT 0x03 #define MAPLE_RESERVED_RANGE 4096 #ifdef CONFIG_LOCKDEP #define mt_lock_is_held(mt) \ (!(mt)->ma_external_lock || 
lock_is_held((mt)->ma_external_lock)) #define mt_write_lock_is_held(mt) \ (!(mt)->ma_external_lock || \ lock_is_held_type((mt)->ma_external_lock, 0)) #define mt_set_external_lock(mt, lock) \ (mt)->ma_external_lock = &(lock)->dep_map #define mt_on_stack(mt) (mt).ma_external_lock = NULL #else #define mt_lock_is_held(mt) 1 #define mt_write_lock_is_held(mt) 1 #define mt_set_external_lock(mt, lock) do { } while (0) #define mt_on_stack(mt) do { } while (0) #endif /* * If the tree contains a single entry at index 0, it is usually stored in * tree->ma_root. To optimise for the page cache, an entry which ends in '00', * '01' or '11' is stored in the root, but an entry which ends in '10' will be * stored in a node. Bits 3-6 are used to store enum maple_type. * * The flags are used both to store some immutable information about this tree * (set at tree creation time) and dynamic information set under the spinlock. * * Another use of flags are to indicate global states of the tree. This is the * case with the MT_FLAGS_USE_RCU flag, which indicates the tree is currently in * RCU mode. This mode was added to allow the tree to reuse nodes instead of * re-allocating and RCU freeing nodes when there is a single user. */ struct maple_tree { union { spinlock_t ma_lock; #ifdef CONFIG_LOCKDEP struct lockdep_map *ma_external_lock; #endif }; unsigned int ma_flags; void __rcu *ma_root; }; /** * MTREE_INIT() - Initialize a maple tree * @name: The maple tree name * @__flags: The maple tree flags * */ #define MTREE_INIT(name, __flags) { \ .ma_lock = __SPIN_LOCK_UNLOCKED((name).ma_lock), \ .ma_flags = __flags, \ .ma_root = NULL, \ } /** * MTREE_INIT_EXT() - Initialize a maple tree with an external lock. * @name: The tree name * @__flags: The maple tree flags * @__lock: The external lock */ #ifdef CONFIG_LOCKDEP #define MTREE_INIT_EXT(name, __flags, __lock) { \ .ma_external_lock = &(__lock).dep_map, \ .ma_flags = (__flags), \ .ma_root = NULL, \ } #else #define MTREE_INIT_EXT(name, __flags, __lock) MTREE_INIT(name, __flags) #endif #define DEFINE_MTREE(name) \ struct maple_tree name = MTREE_INIT(name, 0) #define mtree_lock(mt) spin_lock((&(mt)->ma_lock)) #define mtree_lock_nested(mas, subclass) \ spin_lock_nested((&(mt)->ma_lock), subclass) #define mtree_unlock(mt) spin_unlock((&(mt)->ma_lock)) /* * The Maple Tree squeezes various bits in at various points which aren't * necessarily obvious. Usually, this is done by observing that pointers are * N-byte aligned and thus the bottom log_2(N) bits are available for use. We * don't use the high bits of pointers to store additional information because * we don't know what bits are unused on any given architecture. * * Nodes are 256 bytes in size and are also aligned to 256 bytes, giving us 8 * low bits for our own purposes. Nodes are currently of 4 types: * 1. Single pointer (Range is 0-0) * 2. Non-leaf Allocation Range nodes * 3. Non-leaf Range nodes * 4. Leaf Range nodes All nodes consist of a number of node slots, * pivots, and a parent pointer. */ struct maple_node { union { struct { struct maple_pnode *parent; void __rcu *slot[MAPLE_NODE_SLOTS]; }; struct { void *pad; struct rcu_head rcu; struct maple_enode *piv_parent; unsigned char parent_slot; enum maple_type type; unsigned char slot_len; unsigned int ma_flags; }; struct maple_range_64 mr64; struct maple_arange_64 ma64; struct maple_copy cp; }; }; /* * More complicated stores can cause two nodes to become one or three and * potentially alter the height of the tree. 
Either half of the tree may need * to be rebalanced against the other. The ma_topiary struct is used to track * which nodes have been 'cut' from the tree so that the change can be done * safely at a later date. This is done to support RCU. */ struct ma_topiary { struct maple_enode *head; struct maple_enode *tail; struct maple_tree *mtree; }; void *mtree_load(struct maple_tree *mt, unsigned long index); int mtree_insert(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp); int mtree_insert_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp); int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp); int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp); int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp); int mtree_store_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp); int mtree_store(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp); void *mtree_erase(struct maple_tree *mt, unsigned long index); int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); void mtree_destroy(struct maple_tree *mt); void __mt_destroy(struct maple_tree *mt); /** * mtree_empty() - Determine if a tree has any present entries. * @mt: Maple Tree. * * Context: Any context. * Return: %true if the tree contains only NULL pointers. */ static inline bool mtree_empty(const struct maple_tree *mt) { return mt->ma_root == NULL; } /* Advanced API */ /* * Maple State Status * ma_active means the maple state is pointing to a node and offset and can * continue operating on the tree. * ma_start means we have not searched the tree. * ma_root means we have searched the tree and the entry we found lives in * the root of the tree (ie it has index 0, length 1 and is the only entry in * the tree). * ma_none means we have searched the tree and there is no node in the * tree for this entry. For example, we searched for index 1 in an empty * tree. Or we have a tree which points to a full leaf node and we * searched for an entry which is larger than can be contained in that * leaf node. * ma_pause means the data within the maple state may be stale, restart the * operation * ma_overflow means the search has reached the upper limit of the search * ma_underflow means the search has reached the lower limit of the search * ma_error means there was an error, check the node for the error number. */ enum maple_status { ma_active, ma_start, ma_root, ma_none, ma_pause, ma_overflow, ma_underflow, ma_error, }; /* * The maple state is defined in the struct ma_state and is used to keep track * of information during operations, and even between operations when using the * advanced API. * * If state->node has bit 0 set then it references a tree location which is not * a node (eg the root). If bit 1 is set, the rest of the bits are a negative * errno. Bit 2 (the 'unallocated slots' bit) is clear. Bits 3-6 indicate the * node type. * * state->alloc either has a request number of nodes or an allocated node. If * stat->alloc has a requested number of nodes, the first bit will be set (0x1) * and the remaining bits are the value. 
If state->alloc is a node, then the * node will be of type maple_alloc. maple_alloc has MAPLE_NODE_SLOTS - 1 for * storing more allocated nodes, a total number of nodes allocated, and the * node_count in this node. node_count is the number of allocated nodes in this * node. The scaling beyond MAPLE_NODE_SLOTS - 1 is handled by storing further * nodes into state->alloc->slot[0]'s node. Nodes are taken from state->alloc * by removing a node from the state->alloc node until state->alloc->node_count * is 1, when state->alloc is returned and the state->alloc->slot[0] is promoted * to state->alloc. Nodes are pushed onto state->alloc by putting the current * state->alloc into the pushed node's slot[0]. * * The state also contains the implied min/max of the state->node, the depth of * this search, and the offset. The implied min/max are either from the parent * node or are 0-oo for the root node. The depth is incremented or decremented * every time a node is walked down or up. The offset is the slot/pivot of * interest in the node - either for reading or writing. * * When returning a value the maple state index and last respectively contain * the start and end of the range for the entry. Ranges are inclusive in the * Maple Tree. * * The status of the state is used to determine how the next action should treat * the state. For instance, if the status is ma_start then the next action * should start at the root of the tree and walk down. If the status is * ma_pause then the node may be stale data and should be discarded. If the * status is ma_overflow, then the last action hit the upper limit. * */ struct ma_state { struct maple_tree *tree; /* The tree we're operating in */ unsigned long index; /* The index we're operating on - range start */ unsigned long last; /* The last index we're operating on - range end */ struct maple_enode *node; /* The node containing this entry */ unsigned long min; /* The minimum index of this node - implied pivot min */ unsigned long max; /* The maximum index of this node - implied pivot max */ struct slab_sheaf *sheaf; /* Allocated nodes for this operation */ struct maple_node *alloc; /* A single allocated node for fast path writes */ unsigned long node_request; /* The number of nodes to allocate for this operation */ enum maple_status status; /* The status of the state (active, start, none, etc) */ unsigned char depth; /* depth of tree descent during write */ unsigned char offset; unsigned char mas_flags; unsigned char end; /* The end of the node */ enum store_type store_type; /* The type of store needed for this operation */ }; struct ma_wr_state { struct ma_state *mas; struct maple_node *node; /* Decoded mas->node */ unsigned long r_min; /* range min */ unsigned long r_max; /* range max */ enum maple_type type; /* mas->node type */ unsigned char offset_end; /* The offset where the write ends */ unsigned long *pivots; /* mas->node->pivots pointer */ unsigned long end_piv; /* The pivot at the offset end */ void __rcu **slots; /* mas->node->slots pointer */ void *entry; /* The entry to write */ void *content; /* The existing entry that is being overwritten */ unsigned char vacant_height; /* Height of lowest node with free space */ unsigned char sufficient_height;/* Height of lowest node with min sufficiency + 1 nodes */ }; #define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock)) #define mas_lock_nested(mas, subclass) \ spin_lock_nested(&((mas)->tree->ma_lock), subclass) #define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock)) /* * Special values for ma_state.node. 
* MA_ERROR represents an errno. After dropping the lock and attempting * to resolve the error, the walk would have to be restarted from the * top of the tree as the tree may have been modified. */ #define MA_ERROR(err) \ ((struct maple_enode *)(((unsigned long)err << 2) | 2UL)) /* * When changing MA_STATE, remember to also change rust/kernel/maple_tree.rs */ #define MA_STATE(name, mt, first, end) \ struct ma_state name = { \ .tree = mt, \ .index = first, \ .last = end, \ .node = NULL, \ .status = ma_start, \ .min = 0, \ .max = ULONG_MAX, \ .sheaf = NULL, \ .alloc = NULL, \ .node_request = 0, \ .mas_flags = 0, \ .store_type = wr_invalid, \ } #define MA_WR_STATE(name, ma_state, wr_entry) \ struct ma_wr_state name = { \ .mas = ma_state, \ .content = NULL, \ .entry = wr_entry, \ .vacant_height = 0, \ .sufficient_height = 0 \ } #define MA_TOPIARY(name, tree) \ struct ma_topiary name = { \ .head = NULL, \ .tail = NULL, \ .mtree = tree, \ } void *mas_walk(struct ma_state *mas); void *mas_store(struct ma_state *mas, void *entry); void *mas_erase(struct ma_state *mas); int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp); void mas_store_prealloc(struct ma_state *mas, void *entry); void *mas_find(struct ma_state *mas, unsigned long max); void *mas_find_range(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); void *mas_find_range_rev(struct ma_state *mas, unsigned long max); int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp); bool mas_nomem(struct ma_state *mas, gfp_t gfp); void mas_pause(struct ma_state *mas); void maple_tree_init(void); void mas_destroy(struct ma_state *mas); void *mas_prev(struct ma_state *mas, unsigned long min); void *mas_prev_range(struct ma_state *mas, unsigned long max); void *mas_next(struct ma_state *mas, unsigned long max); void *mas_next_range(struct ma_state *mas, unsigned long max); int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size); /* * This finds an empty area from the highest address to the lowest. * AKA "Topdown" version, */ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size); static inline void mas_init(struct ma_state *mas, struct maple_tree *tree, unsigned long addr) { memset(mas, 0, sizeof(struct ma_state)); mas->tree = tree; mas->index = mas->last = addr; mas->max = ULONG_MAX; mas->status = ma_start; mas->node = NULL; } static inline bool mas_is_active(struct ma_state *mas) { return mas->status == ma_active; } static inline bool mas_is_err(struct ma_state *mas) { return mas->status == ma_error; } /** * mas_reset() - Reset a Maple Tree operation state. * @mas: Maple Tree operation state. * * Resets the error or walk state of the @mas so future walks of the * array will start from the root. Use this if you have dropped the * lock and want to reuse the ma_state. * * Context: Any context. */ static __always_inline void mas_reset(struct ma_state *mas) { mas->status = ma_start; mas->node = NULL; } /** * mas_for_each() - Iterate over a range of the maple tree. * @__mas: Maple Tree operation state (maple_state) * @__entry: Entry retrieved from the tree * @__max: maximum index to retrieve from the tree * * When returned, mas->index and mas->last will hold the entire range for the * entry. * * Note: may return the zero entry. 
*/ #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) /** * mas_for_each_rev() - Iterate over a range of the maple tree in reverse order. * @__mas: Maple Tree operation state (maple_state) * @__entry: Entry retrieved from the tree * @__min: minimum index to retrieve from the tree * * When returned, mas->index and mas->last will hold the entire range for the * entry. * * Note: may return the zero entry. */ #define mas_for_each_rev(__mas, __entry, __min) \ while (((__entry) = mas_find_rev((__mas), (__min))) != NULL) #ifdef CONFIG_DEBUG_MAPLE_TREE enum mt_dump_format { mt_dump_dec, mt_dump_hex, }; extern atomic_t maple_tree_tests_run; extern atomic_t maple_tree_tests_passed; void mt_dump(const struct maple_tree *mt, enum mt_dump_format format); void mas_dump(const struct ma_state *mas); void mas_wr_dump(const struct ma_wr_state *wr_mas); void mt_validate(struct maple_tree *mt); void mt_cache_shrink(void); #define MT_BUG_ON(__tree, __x) do { \ atomic_inc(&maple_tree_tests_run); \ if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mt_dump(__tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) #define MAS_BUG_ON(__mas, __x) do { \ atomic_inc(&maple_tree_tests_run); \ if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_dump(__mas); \ mt_dump((__mas)->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) #define MAS_WR_BUG_ON(__wrmas, __x) do { \ atomic_inc(&maple_tree_tests_run); \ if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_wr_dump(__wrmas); \ mas_dump((__wrmas)->mas); \ mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) #define MT_WARN_ON(__tree, __x) ({ \ int ret = !!(__x); \ atomic_inc(&maple_tree_tests_run); \ if (ret) { \ pr_info("WARN at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mt_dump(__tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ unlikely(ret); \ }) #define MAS_WARN_ON(__mas, __x) ({ \ int ret = !!(__x); \ atomic_inc(&maple_tree_tests_run); \ if (ret) { \ pr_info("WARN at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_dump(__mas); \ mt_dump((__mas)->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ unlikely(ret); \ }) #define MAS_WR_WARN_ON(__wrmas, __x) ({ \ int ret = !!(__x); \ atomic_inc(&maple_tree_tests_run); \ if (ret) { \ pr_info("WARN at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_wr_dump(__wrmas); \ mas_dump((__wrmas)->mas); \ mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ unlikely(ret); \ }) #else #define MT_BUG_ON(__tree, __x) BUG_ON(__x) 
#define MAS_BUG_ON(__mas, __x) BUG_ON(__x) #define MAS_WR_BUG_ON(__mas, __x) BUG_ON(__x) #define MT_WARN_ON(__tree, __x) WARN_ON(__x) #define MAS_WARN_ON(__mas, __x) WARN_ON(__x) #define MAS_WR_WARN_ON(__mas, __x) WARN_ON(__x) #endif /* CONFIG_DEBUG_MAPLE_TREE */ /** * __mas_set_range() - Set up Maple Tree operation state to a sub-range of the * current location. * @mas: Maple Tree operation state. * @start: New start of range in the Maple Tree. * @last: New end of range in the Maple Tree. * * set the internal maple state values to a sub-range. * Please use mas_set_range() if you do not know where you are in the tree. */ static inline void __mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { /* Ensure the range starts within the current slot */ MAS_WARN_ON(mas, mas_is_active(mas) && (mas->index > start || mas->last < start)); mas->index = start; mas->last = last; } /** * mas_set_range() - Set up Maple Tree operation state for a different index. * @mas: Maple Tree operation state. * @start: New start of range in the Maple Tree. * @last: New end of range in the Maple Tree. * * Move the operation state to refer to a different range. This will * have the effect of starting a walk from the top; see mas_next() * to move to an adjacent index. */ static inline void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { mas_reset(mas); __mas_set_range(mas, start, last); } /** * mas_set() - Set up Maple Tree operation state for a different index. * @mas: Maple Tree operation state. * @index: New index into the Maple Tree. * * Move the operation state to refer to a different index. This will * have the effect of starting a walk from the top; see mas_next() * to move to an adjacent index. */ static inline void mas_set(struct ma_state *mas, unsigned long index) { mas_set_range(mas, index, index); } static inline bool mt_external_lock(const struct maple_tree *mt) { return (mt->ma_flags & MT_FLAGS_LOCK_MASK) == MT_FLAGS_LOCK_EXTERN; } /** * mt_init_flags() - Initialise an empty maple tree with flags. * @mt: Maple Tree * @flags: maple tree flags. * * If you need to initialise a Maple Tree with special flags (eg, an * allocation tree), use this function. * * Context: Any context. */ static inline void mt_init_flags(struct maple_tree *mt, unsigned int flags) { mt->ma_flags = flags; if (!mt_external_lock(mt)) spin_lock_init(&mt->ma_lock); rcu_assign_pointer(mt->ma_root, NULL); } /** * mt_init() - Initialise an empty maple tree. * @mt: Maple Tree * * An empty Maple Tree. * * Context: Any context. */ static inline void mt_init(struct maple_tree *mt) { mt_init_flags(mt, 0); } static inline bool mt_in_rcu(struct maple_tree *mt) { #ifdef CONFIG_MAPLE_RCU_DISABLED return false; #endif return mt->ma_flags & MT_FLAGS_USE_RCU; } /** * mt_clear_in_rcu() - Switch the tree to non-RCU mode. * @mt: The Maple Tree */ static inline void mt_clear_in_rcu(struct maple_tree *mt) { if (!mt_in_rcu(mt)) return; if (mt_external_lock(mt)) { WARN_ON(!mt_lock_is_held(mt)); mt->ma_flags &= ~MT_FLAGS_USE_RCU; } else { mtree_lock(mt); mt->ma_flags &= ~MT_FLAGS_USE_RCU; mtree_unlock(mt); } } /** * mt_set_in_rcu() - Switch the tree to RCU safe mode. 
* @mt: The Maple Tree */ static inline void mt_set_in_rcu(struct maple_tree *mt) { if (mt_in_rcu(mt)) return; if (mt_external_lock(mt)) { WARN_ON(!mt_lock_is_held(mt)); mt->ma_flags |= MT_FLAGS_USE_RCU; } else { mtree_lock(mt); mt->ma_flags |= MT_FLAGS_USE_RCU; mtree_unlock(mt); } } static inline unsigned int mt_height(const struct maple_tree *mt) { return (mt->ma_flags & MT_FLAGS_HEIGHT_MASK) >> MT_FLAGS_HEIGHT_OFFSET; } void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max); void *mt_find_after(struct maple_tree *mt, unsigned long *index, unsigned long max); void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min); void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max); /** * mt_for_each - Iterate over each entry starting at index until max. * @__tree: The Maple Tree * @__entry: The current entry * @__index: The index to start the search from. Subsequently used as iterator. * @__max: The maximum limit for @index * * This iterator skips all entries which resolve to a NULL pointer, * e.g. entries that have been reserved with XA_ZERO_ENTRY. */ #define mt_for_each(__tree, __entry, __index, __max) \ for (__entry = mt_find(__tree, &(__index), __max); \ __entry; __entry = mt_find_after(__tree, &(__index), __max)) #endif /*_LINUX_MAPLE_TREE_H */
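The ma_state documentation and declarations above describe the advanced API; the short sketch below is not part of the original header, but illustrates the calling convention they imply. The function name, the payload variable and the chosen range are invented for the example; it assumes the usual kernel environment and relies on mas_store_gfp()/mas_nomem() being allowed to drop the internal lock if an allocation must sleep.

static int maple_state_example(void)
{
	struct maple_tree tree;
	MA_STATE(mas, &tree, 10, 20);	/* operate on the inclusive range [10, 20] */
	static int payload;		/* hypothetical entry to store */
	void *entry;
	int ret;

	mt_init(&tree);

	/* Store one entry spanning [10, 20] under the tree's internal spinlock. */
	mas_lock(&mas);
	ret = mas_store_gfp(&mas, &payload, GFP_KERNEL);
	mas_unlock(&mas);
	if (ret)
		return ret;

	/* Iterate every entry up to index 100; mas_find() skips empty ranges. */
	mas_set(&mas, 0);
	mas_lock(&mas);
	mas_for_each(&mas, entry, 100) {
		/* mas.index .. mas.last holds the range occupied by @entry */
	}
	mas_unlock(&mas);

	return 0;
}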
/* SPDX-License-Identifier: GPL-2.0-only */ /* * Fast and scalable bitmaps. * * Copyright (C) 2016 Facebook * Copyright (C) 2013-2014 Jens Axboe */ #ifndef __LINUX_SCALE_BITMAP_H #define __LINUX_SCALE_BITMAP_H #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/cache.h> #include <linux/list.h> #include <linux/log2.h> #include <linux/minmax.h> #include <linux/percpu.h> #include <linux/slab.h> #include <linux/smp.h> #include <linux/types.h> #include <linux/wait.h> struct seq_file; /** * struct sbitmap_word - Word in a &struct sbitmap. */ struct sbitmap_word { /** * @word: word holding free bits */ unsigned long word; /** * @cleared: word holding cleared bits */ unsigned long cleared ____cacheline_aligned_in_smp; /** * @swap_lock: serializes simultaneous updates of ->word and ->cleared */ raw_spinlock_t swap_lock; } ____cacheline_aligned_in_smp; /** * struct sbitmap - Scalable bitmap. * * A &struct sbitmap is spread over multiple cachelines to avoid ping-pong. This * trades off higher memory usage for better scalability. */ struct sbitmap { /** * @depth: Number of bits used in the whole bitmap.
*/ unsigned int depth; /** * @shift: log2(number of bits used per word) */ unsigned int shift; /** * @map_nr: Number of words (cachelines) being used for the bitmap. */ unsigned int map_nr; /** * @round_robin: Allocate bits in strict round-robin order. */ bool round_robin; /** * @map: Allocated bitmap. */ struct sbitmap_word *map; /** * @alloc_hint: Cache of last successfully allocated or freed bit. * * This is per-cpu, which allows multiple users to stick to different * cachelines until the map is exhausted. */ unsigned int __percpu *alloc_hint; }; #define SBQ_WAIT_QUEUES 8 #define SBQ_WAKE_BATCH 8 /** * struct sbq_wait_state - Wait queue in a &struct sbitmap_queue. */ struct sbq_wait_state { /** * @wait: Wait queue. */ wait_queue_head_t wait; } ____cacheline_aligned_in_smp; /** * struct sbitmap_queue - Scalable bitmap with the added ability to wait on free * bits. * * A &struct sbitmap_queue uses multiple wait queues and rolling wakeups to * avoid contention on the wait queue spinlock. This ensures that we don't hit a * scalability wall when we run out of free bits and have to start putting tasks * to sleep. */ struct sbitmap_queue { /** * @sb: Scalable bitmap. */ struct sbitmap sb; /** * @wake_batch: Number of bits which must be freed before we wake up any * waiters. */ unsigned int wake_batch; /** * @wake_index: Next wait queue in @ws to wake up. */ atomic_t wake_index; /** * @ws: Wait queues. */ struct sbq_wait_state *ws; /** * @ws_active: count of currently active ws waitqueues */ atomic_t ws_active; /** * @min_shallow_depth: The minimum shallow depth which may be passed to * sbitmap_queue_get_shallow() */ unsigned int min_shallow_depth; /** * @completion_cnt: Number of bits cleared passed to the * wakeup function. */ atomic_t completion_cnt; /** * @wakeup_cnt: Number of thread wake ups issued. */ atomic_t wakeup_cnt; }; /** * sbitmap_init_node() - Initialize a &struct sbitmap on a specific memory node. * @sb: Bitmap to initialize. * @depth: Number of bits to allocate. * @shift: Use 2^@shift bits per word in the bitmap; if a negative number if * given, a good default is chosen. * @flags: Allocation flags. * @node: Memory node to allocate on. * @round_robin: If true, be stricter about allocation order; always allocate * starting from the last allocated bit. This is less efficient * than the default behavior (false). * @alloc_hint: If true, apply percpu hint for where to start searching for * a free bit. * * Return: Zero on success or negative errno on failure. */ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, gfp_t flags, int node, bool round_robin, bool alloc_hint); /* sbitmap internal helper */ static inline unsigned int __map_depth(const struct sbitmap *sb, int index) { if (index == sb->map_nr - 1) return sb->depth - (index << sb->shift); return 1U << sb->shift; } /** * sbitmap_free() - Free memory used by a &struct sbitmap. * @sb: Bitmap to free. */ static inline void sbitmap_free(struct sbitmap *sb) { free_percpu(sb->alloc_hint); kvfree(sb->map); sb->map = NULL; } /** * sbitmap_resize() - Resize a &struct sbitmap. * @sb: Bitmap to resize. * @depth: New number of bits to resize to. * * Doesn't reallocate anything. It's up to the caller to ensure that the new * depth doesn't exceed the depth that the sb was initialized with. */ void sbitmap_resize(struct sbitmap *sb, unsigned int depth); /** * sbitmap_get() - Try to allocate a free bit from a &struct sbitmap. * @sb: Bitmap to allocate from. 
* * This operation provides acquire barrier semantics if it succeeds. * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ int sbitmap_get(struct sbitmap *sb); /** * sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap. * @sb: Bitmap to check. * * Return: true if any bit in the bitmap is set, false otherwise. */ bool sbitmap_any_bit_set(const struct sbitmap *sb); #define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift) #define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U)) typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *); /** * __sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. * @start: Where to start the iteration. * @sb: Bitmap to iterate over. * @fn: Callback. Should return true to continue or false to break early. * @data: Pointer to pass to callback. * * This is inline even though it's non-trivial so that the function calls to the * callback will hopefully get optimized away. */ static inline void __sbitmap_for_each_set(struct sbitmap *sb, unsigned int start, sb_for_each_fn fn, void *data) { unsigned int index; unsigned int nr; unsigned int scanned = 0; if (start >= sb->depth) start = 0; index = SB_NR_TO_INDEX(sb, start); nr = SB_NR_TO_BIT(sb, start); while (scanned < sb->depth) { unsigned long word; unsigned int depth = min_t(unsigned int, __map_depth(sb, index) - nr, sb->depth - scanned); scanned += depth; word = sb->map[index].word & ~sb->map[index].cleared; if (!word) goto next; /* * On the first iteration of the outer loop, we need to add the * bit offset back to the size of the word for find_next_bit(). * On all other iterations, nr is zero, so this is a noop. */ depth += nr; while (1) { nr = find_next_bit(&word, depth, nr); if (nr >= depth) break; if (!fn(sb, (index << sb->shift) + nr, data)) return; nr++; } next: nr = 0; if (++index >= sb->map_nr) index = 0; } } /** * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. * @sb: Bitmap to iterate over. * @fn: Callback. Should return true to continue or false to break early. * @data: Pointer to pass to callback. */ static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn, void *data) { __sbitmap_for_each_set(sb, 0, fn, data); } static inline unsigned long *__sbitmap_word(struct sbitmap *sb, unsigned int bitnr) { return &sb->map[SB_NR_TO_INDEX(sb, bitnr)].word; } /* Helpers equivalent to the operations in asm/bitops.h and linux/bitmap.h */ static inline void sbitmap_set_bit(struct sbitmap *sb, unsigned int bitnr) { set_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); } static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr) { clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); } /* * This one is special, since it doesn't actually clear the bit, rather it * sets the corresponding bit in the ->cleared mask instead. Paired with * the caller doing sbitmap_deferred_clear() if a given index is full, which * will clear the previously freed entries in the corresponding ->word. */ static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr) { unsigned long *addr = &sb->map[SB_NR_TO_INDEX(sb, bitnr)].cleared; set_bit(SB_NR_TO_BIT(sb, bitnr), addr); } /* * Pair of sbitmap_get, and this one applies both cleared bit and * allocation hint. 
*/ static inline void sbitmap_put(struct sbitmap *sb, unsigned int bitnr) { sbitmap_deferred_clear_bit(sb, bitnr); if (likely(sb->alloc_hint && !sb->round_robin && bitnr < sb->depth)) *raw_cpu_ptr(sb->alloc_hint) = bitnr; } static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr) { return test_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); } static inline int sbitmap_calculate_shift(unsigned int depth) { int shift = ilog2(BITS_PER_LONG); /* * If the bitmap is small, shrink the number of bits per word so * we spread over a few cachelines, at least. If less than 4 * bits, just forget about it, it's not going to work optimally * anyway. */ if (depth >= 4) { while ((4U << shift) > depth) shift--; } return shift; } /** * sbitmap_show() - Dump &struct sbitmap information to a &struct seq_file. * @sb: Bitmap to show. * @m: struct seq_file to write to. * * This is intended for debugging. The format may change at any time. */ void sbitmap_show(struct sbitmap *sb, struct seq_file *m); /** * sbitmap_weight() - Return how many set and not cleared bits in a &struct * sbitmap. * @sb: Bitmap to check. * * Return: How many set and not cleared bits set */ unsigned int sbitmap_weight(const struct sbitmap *sb); /** * sbitmap_bitmap_show() - Write a hex dump of a &struct sbitmap to a &struct * seq_file. * @sb: Bitmap to show. * @m: struct seq_file to write to. * * This is intended for debugging. The output isn't guaranteed to be internally * consistent. */ void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m); /** * sbitmap_queue_init_node() - Initialize a &struct sbitmap_queue on a specific * memory node. * @sbq: Bitmap queue to initialize. * @depth: See sbitmap_init_node(). * @shift: See sbitmap_init_node(). * @round_robin: See sbitmap_get(). * @flags: Allocation flags. * @node: Memory node to allocate on. * * Return: Zero on success or negative errno on failure. */ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, int shift, bool round_robin, gfp_t flags, int node); /** * sbitmap_queue_free() - Free memory used by a &struct sbitmap_queue. * * @sbq: Bitmap queue to free. */ static inline void sbitmap_queue_free(struct sbitmap_queue *sbq) { kfree(sbq->ws); sbitmap_free(&sbq->sb); } /** * sbitmap_queue_recalculate_wake_batch() - Recalculate wake batch * @sbq: Bitmap queue to recalculate wake batch. * @users: Number of shares. * * Like sbitmap_queue_update_wake_batch(), this will calculate wake batch * by depth. This interface is for HCTX shared tags or queue shared tags. */ void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq, unsigned int users); /** * sbitmap_queue_resize() - Resize a &struct sbitmap_queue. * @sbq: Bitmap queue to resize. * @depth: New number of bits to resize to. * * Like sbitmap_resize(), this doesn't reallocate anything. It has to do * some extra work on the &struct sbitmap_queue, so it's not safe to just * resize the underlying &struct sbitmap. */ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth); /** * __sbitmap_queue_get() - Try to allocate a free bit from a &struct * sbitmap_queue with preemption already disabled. * @sbq: Bitmap queue to allocate from. * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ int __sbitmap_queue_get(struct sbitmap_queue *sbq); /** * __sbitmap_queue_get_batch() - Try to allocate a batch of free bits * @sbq: Bitmap queue to allocate from. 
* @nr_tags: number of tags requested * @offset: offset to add to returned bits * * Return: Mask of allocated tags, 0 if none are found. Each tag allocated is * a bit in the mask returned, and the caller must add @offset to the value to * get the absolute tag value. */ unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags, unsigned int *offset); /** * sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct * sbitmap_queue, limiting the depth used from each word, with preemption * already disabled. * @sbq: Bitmap queue to allocate from. * @shallow_depth: The maximum number of bits to allocate from the queue. * See sbitmap_get_shallow(). * * If you call this, make sure to call sbitmap_queue_min_shallow_depth() after * initializing @sbq. * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, unsigned int shallow_depth); /** * sbitmap_queue_get() - Try to allocate a free bit from a &struct * sbitmap_queue. * @sbq: Bitmap queue to allocate from. * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to * sbitmap_queue_clear()). * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ static inline int sbitmap_queue_get(struct sbitmap_queue *sbq, unsigned int *cpu) { int nr; *cpu = get_cpu(); nr = __sbitmap_queue_get(sbq); put_cpu(); return nr; } /** * sbitmap_queue_min_shallow_depth() - Inform a &struct sbitmap_queue of the * minimum shallow depth that will be used. * @sbq: Bitmap queue in question. * @min_shallow_depth: The minimum shallow depth that will be passed to * sbitmap_queue_get_shallow() or __sbitmap_queue_get_shallow(). * * sbitmap_queue_clear() batches wakeups as an optimization. The batch size * depends on the depth of the bitmap. Since the shallow allocation functions * effectively operate with a different depth, the shallow depth must be taken * into account when calculating the batch size. This function must be called * with the minimum shallow depth that will be used. Failure to do so can result * in missed wakeups. */ void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq, unsigned int min_shallow_depth); /** * sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a * &struct sbitmap_queue. * @sbq: Bitmap to free from. * @nr: Bit number to free. * @cpu: CPU the bit was allocated on. */ void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, unsigned int cpu); /** * sbitmap_queue_clear_batch() - Free a batch of allocated bits * &struct sbitmap_queue. * @sbq: Bitmap to free from. * @offset: offset for each tag in array * @tags: array of tags * @nr_tags: number of tags in array */ void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset, int *tags, int nr_tags); static inline int sbq_index_inc(int index) { return (index + 1) & (SBQ_WAIT_QUEUES - 1); } static inline void sbq_index_atomic_inc(atomic_t *index) { int old = atomic_read(index); int new = sbq_index_inc(old); atomic_cmpxchg(index, old, new); } /** * sbq_wait_ptr() - Get the next wait queue to use for a &struct * sbitmap_queue. * @sbq: Bitmap queue to wait on. * @wait_index: A counter per "user" of @sbq. 
* * Return: Next wait queue to be used */ static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq, atomic_t *wait_index) { struct sbq_wait_state *ws; ws = &sbq->ws[atomic_read(wait_index)]; sbq_index_atomic_inc(wait_index); return ws; } /** * sbitmap_queue_wake_all() - Wake up everything waiting on a &struct * sbitmap_queue. * @sbq: Bitmap queue to wake up. */ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq); /** * sbitmap_queue_wake_up() - Wake up some of the waiters in one waitqueue * on a &struct sbitmap_queue. * @sbq: Bitmap queue to wake up. * @nr: Number of bits cleared. */ void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr); /** * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct * seq_file. * @sbq: Bitmap queue to show. * @m: struct seq_file to write to. * * This is intended for debugging. The format may change at any time. */ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m); struct sbq_wait { struct sbitmap_queue *sbq; /* if set, sbq_wait is accounted */ struct wait_queue_entry wait; }; #define DEFINE_SBQ_WAIT(name) \ struct sbq_wait name = { \ .sbq = NULL, \ .wait = { \ .private = current, \ .func = autoremove_wake_function, \ .entry = LIST_HEAD_INIT((name).wait.entry), \ } \ } /* * Wrapper around prepare_to_wait_exclusive(), which maintains some extra * internal state. */ void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait, int state); /* * Must be paired with sbitmap_prepare_to_wait(). */ void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait); /* * Wrapper around add_wait_queue(), which maintains some extra internal state. */ void sbitmap_add_wait_queue(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait); /* * Must be paired with sbitmap_add_wait_queue(). */ void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait); #endif /* __LINUX_SCALE_BITMAP_H */
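As a companion to the declarations above, here is a minimal sketch, not from the original header, of the basic allocate/free cycle for both the plain sbitmap and the sbitmap_queue variant. The function name and the depth of 128 bits are invented for the example; the calls use only the initializers, getters and clear helpers declared in this file.

static int sbitmap_example(void)
{
	struct sbitmap sb;
	struct sbitmap_queue sbq;
	unsigned int cpu;
	int nr, ret;

	/* 128 bits, default per-word shift (-1), no round-robin, per-cpu hint. */
	ret = sbitmap_init_node(&sb, 128, -1, GFP_KERNEL, NUMA_NO_NODE,
				false, true);
	if (ret)
		return ret;

	nr = sbitmap_get(&sb);			/* -1 if no free bit is available */
	if (nr >= 0)
		sbitmap_put(&sb, nr);		/* deferred clear + alloc hint update */
	sbitmap_free(&sb);

	/* The queue variant adds batched wakeups for users that must wait. */
	ret = sbitmap_queue_init_node(&sbq, 128, -1, false, GFP_KERNEL,
				      NUMA_NO_NODE);
	if (ret)
		return ret;

	nr = sbitmap_queue_get(&sbq, &cpu);
	if (nr >= 0)
		sbitmap_queue_clear(&sbq, nr, cpu);	/* free the bit, maybe wake waiters */
	sbitmap_queue_free(&sbq);

	return 0;
}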
// SPDX-License-Identifier: GPL-2.0-or-later /* * NETLINK Kernel-user communication protocol. * * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * Patrick McHardy <kaber@trash.net> * * Tue Jun 26 14:36:48 MEST 2001 Herbert "herp" Rosmanith * added netlink_proto_exit * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C.
de Melo <acme@conectiva.com.br> * use nlk_sk, as sk->protinfo is on a diet 8) * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org> * - inc module use count of module that owns * the kernel socket in case userspace opens * socket of same protocol * - remove all module support, since netlink is * mandatory if CONFIG_NET=y these days */ #include <linux/module.h> #include <linux/bpf.h> #include <linux/capability.h> #include <linux/kernel.h> #include <linux/filter.h> #include <linux/init.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/stat.h> #include <linux/socket.h> #include <linux/un.h> #include <linux/fcntl.h> #include <linux/termios.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/notifier.h> #include <linux/security.h> #include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/random.h> #include <linux/bitops.h> #include <linux/mm.h> #include <linux/types.h> #include <linux/audit.h> #include <linux/mutex.h> #include <linux/vmalloc.h> #include <linux/if_arp.h> #include <linux/rhashtable.h> #include <asm/cacheflush.h> #include <linux/hash.h> #include <linux/net_namespace.h> #include <linux/nospec.h> #include <linux/btf_ids.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/sock.h> #include <net/scm.h> #include <net/netlink.h> #define CREATE_TRACE_POINTS #include <trace/events/netlink.h> #include "af_netlink.h" #include "genetlink.h" struct listeners { struct rcu_head rcu; unsigned long masks[]; }; /* state bits */ #define NETLINK_S_CONGESTED 0x0 static inline int netlink_is_kernel(struct sock *sk) { return nlk_test_bit(KERNEL_SOCKET, sk); } struct netlink_table *nl_table __read_mostly; EXPORT_SYMBOL_GPL(nl_table); static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); static struct lock_class_key nlk_cb_mutex_keys[MAX_LINKS]; static const char *const nlk_cb_mutex_key_strings[MAX_LINKS + 1] = { "nlk_cb_mutex-ROUTE", "nlk_cb_mutex-1", "nlk_cb_mutex-USERSOCK", "nlk_cb_mutex-FIREWALL", "nlk_cb_mutex-SOCK_DIAG", "nlk_cb_mutex-NFLOG", "nlk_cb_mutex-XFRM", "nlk_cb_mutex-SELINUX", "nlk_cb_mutex-ISCSI", "nlk_cb_mutex-AUDIT", "nlk_cb_mutex-FIB_LOOKUP", "nlk_cb_mutex-CONNECTOR", "nlk_cb_mutex-NETFILTER", "nlk_cb_mutex-IP6_FW", "nlk_cb_mutex-DNRTMSG", "nlk_cb_mutex-KOBJECT_UEVENT", "nlk_cb_mutex-GENERIC", "nlk_cb_mutex-17", "nlk_cb_mutex-SCSITRANSPORT", "nlk_cb_mutex-ECRYPTFS", "nlk_cb_mutex-RDMA", "nlk_cb_mutex-CRYPTO", "nlk_cb_mutex-SMC", "nlk_cb_mutex-23", "nlk_cb_mutex-24", "nlk_cb_mutex-25", "nlk_cb_mutex-26", "nlk_cb_mutex-27", "nlk_cb_mutex-28", "nlk_cb_mutex-29", "nlk_cb_mutex-30", "nlk_cb_mutex-31", "nlk_cb_mutex-MAX_LINKS" }; static int netlink_dump(struct sock *sk, bool lock_taken); /* nl_table locking explained: * Lookup and traversal are protected with an RCU read-side lock. Insertion * and removal are protected with per bucket lock while using RCU list * modification primitives and may run in parallel to RCU protected lookups. * Destruction of the Netlink socket may only occur *after* nl_table_lock has * been acquired * either during or after the socket has been removed from * the list and after an RCU grace period. 
*/ DEFINE_RWLOCK(nl_table_lock); EXPORT_SYMBOL_GPL(nl_table_lock); static atomic_t nl_table_users = ATOMIC_INIT(0); #define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock)); static BLOCKING_NOTIFIER_HEAD(netlink_chain); static const struct rhashtable_params netlink_rhashtable_params; void do_trace_netlink_extack(const char *msg) { trace_netlink_extack(msg); } EXPORT_SYMBOL(do_trace_netlink_extack); static inline u32 netlink_group_mask(u32 group) { if (group > 32) return 0; return group ? 1 << (group - 1) : 0; } static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb, gfp_t gfp_mask) { unsigned int len = skb->len; struct sk_buff *new; new = alloc_skb(len, gfp_mask); if (new == NULL) return NULL; NETLINK_CB(new).portid = NETLINK_CB(skb).portid; NETLINK_CB(new).dst_group = NETLINK_CB(skb).dst_group; NETLINK_CB(new).creds = NETLINK_CB(skb).creds; skb_put_data(new, skb->data, len); return new; } static unsigned int netlink_tap_net_id; struct netlink_tap_net { struct list_head netlink_tap_all; struct mutex netlink_tap_lock; }; int netlink_add_tap(struct netlink_tap *nt) { struct net *net = dev_net(nt->dev); struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); if (unlikely(nt->dev->type != ARPHRD_NETLINK)) return -EINVAL; mutex_lock(&nn->netlink_tap_lock); list_add_rcu(&nt->list, &nn->netlink_tap_all); mutex_unlock(&nn->netlink_tap_lock); __module_get(nt->module); return 0; } EXPORT_SYMBOL_GPL(netlink_add_tap); static int __netlink_remove_tap(struct netlink_tap *nt) { struct net *net = dev_net(nt->dev); struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); bool found = false; struct netlink_tap *tmp; mutex_lock(&nn->netlink_tap_lock); list_for_each_entry(tmp, &nn->netlink_tap_all, list) { if (nt == tmp) { list_del_rcu(&nt->list); found = true; goto out; } } pr_warn("__netlink_remove_tap: %p not found\n", nt); out: mutex_unlock(&nn->netlink_tap_lock); if (found) module_put(nt->module); return found ? 0 : -ENODEV; } int netlink_remove_tap(struct netlink_tap *nt) { int ret; ret = __netlink_remove_tap(nt); synchronize_net(); return ret; } EXPORT_SYMBOL_GPL(netlink_remove_tap); static __net_init int netlink_tap_init_net(struct net *net) { struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); INIT_LIST_HEAD(&nn->netlink_tap_all); mutex_init(&nn->netlink_tap_lock); return 0; } static struct pernet_operations netlink_tap_net_ops = { .init = netlink_tap_init_net, .id = &netlink_tap_net_id, .size = sizeof(struct netlink_tap_net), }; static bool netlink_filter_tap(const struct sk_buff *skb) { struct sock *sk = skb->sk; /* We take the more conservative approach and * whitelist socket protocols that may pass. */ switch (sk->sk_protocol) { case NETLINK_ROUTE: case NETLINK_USERSOCK: case NETLINK_SOCK_DIAG: case NETLINK_NFLOG: case NETLINK_XFRM: case NETLINK_FIB_LOOKUP: case NETLINK_NETFILTER: case NETLINK_GENERIC: return true; } return false; } static int __netlink_deliver_tap_skb(struct sk_buff *skb, struct net_device *dev) { struct sk_buff *nskb; struct sock *sk = skb->sk; int ret = -ENOMEM; if (!net_eq(dev_net(dev), sock_net(sk))) return 0; dev_hold(dev); if (is_vmalloc_addr(skb->head)) nskb = netlink_to_full_skb(skb, GFP_ATOMIC); else nskb = skb_clone(skb, GFP_ATOMIC); if (nskb) { nskb->dev = dev; nskb->protocol = htons((u16) sk->sk_protocol); nskb->pkt_type = netlink_is_kernel(sk) ? 
PACKET_KERNEL : PACKET_USER; skb_reset_network_header(nskb); ret = dev_queue_xmit(nskb); if (unlikely(ret > 0)) ret = net_xmit_errno(ret); } dev_put(dev); return ret; } static void __netlink_deliver_tap(struct sk_buff *skb, struct netlink_tap_net *nn) { int ret; struct netlink_tap *tmp; if (!netlink_filter_tap(skb)) return; list_for_each_entry_rcu(tmp, &nn->netlink_tap_all, list) { ret = __netlink_deliver_tap_skb(skb, tmp->dev); if (unlikely(ret)) break; } } static void netlink_deliver_tap(struct net *net, struct sk_buff *skb) { struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); rcu_read_lock(); if (unlikely(!list_empty(&nn->netlink_tap_all))) __netlink_deliver_tap(skb, nn); rcu_read_unlock(); } static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src, struct sk_buff *skb) { if (!(netlink_is_kernel(dst) && netlink_is_kernel(src))) netlink_deliver_tap(sock_net(dst), skb); } static void netlink_overrun(struct sock *sk) { if (!nlk_test_bit(RECV_NO_ENOBUFS, sk)) { if (!test_and_set_bit(NETLINK_S_CONGESTED, &nlk_sk(sk)->state)) { WRITE_ONCE(sk->sk_err, ENOBUFS); sk_error_report(sk); } } sk_drops_inc(sk); } static void netlink_rcv_wake(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); if (skb_queue_empty_lockless(&sk->sk_receive_queue)) clear_bit(NETLINK_S_CONGESTED, &nlk->state); if (!test_bit(NETLINK_S_CONGESTED, &nlk->state)) wake_up_interruptible(&nlk->wait); } static void netlink_skb_destructor(struct sk_buff *skb) { if (is_vmalloc_addr(skb->head)) { if (!skb->cloned || !atomic_dec_return(&(skb_shinfo(skb)->dataref))) vfree_atomic(skb->head); skb->head = NULL; } if (skb->sk != NULL) sock_rfree(skb); } static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) { WARN_ON(skb->sk != NULL); skb->sk = sk; skb->destructor = netlink_skb_destructor; sk_mem_charge(sk, skb->truesize); } static void netlink_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_receive_queue); if (!sock_flag(sk, SOCK_DEAD)) { printk(KERN_ERR "Freeing alive netlink socket %p\n", sk); return; } WARN_ON(atomic_read(&sk->sk_rmem_alloc)); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); WARN_ON(nlk_sk(sk)->groups); } /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on * SMP. Look, when several writers sleep and reader wakes them up, all but one * immediately hit write lock and grab all the cpus. Exclusive sleep solves * this, _but_ remember, it adds useless work on UP machines. 
*/ void netlink_table_grab(void) __acquires(nl_table_lock) { might_sleep(); write_lock_irq(&nl_table_lock); if (atomic_read(&nl_table_users)) { DECLARE_WAITQUEUE(wait, current); add_wait_queue_exclusive(&nl_table_wait, &wait); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (atomic_read(&nl_table_users) == 0) break; write_unlock_irq(&nl_table_lock); schedule(); write_lock_irq(&nl_table_lock); } __set_current_state(TASK_RUNNING); remove_wait_queue(&nl_table_wait, &wait); } } void netlink_table_ungrab(void) __releases(nl_table_lock) { write_unlock_irq(&nl_table_lock); wake_up(&nl_table_wait); } static inline void netlink_lock_table(void) { unsigned long flags; /* read_lock() synchronizes us to netlink_table_grab */ read_lock_irqsave(&nl_table_lock, flags); atomic_inc(&nl_table_users); read_unlock_irqrestore(&nl_table_lock, flags); } static inline void netlink_unlock_table(void) { if (atomic_dec_and_test(&nl_table_users)) wake_up(&nl_table_wait); } struct netlink_compare_arg { possible_net_t pnet; u32 portid; }; /* Doing sizeof directly may yield 4 extra bytes on 64-bit. */ #define netlink_compare_arg_len \ (offsetof(struct netlink_compare_arg, portid) + sizeof(u32)) static inline int netlink_compare(struct rhashtable_compare_arg *arg, const void *ptr) { const struct netlink_compare_arg *x = arg->key; const struct netlink_sock *nlk = ptr; return nlk->portid != x->portid || !net_eq(sock_net(&nlk->sk), read_pnet(&x->pnet)); } static void netlink_compare_arg_init(struct netlink_compare_arg *arg, struct net *net, u32 portid) { memset(arg, 0, sizeof(*arg)); write_pnet(&arg->pnet, net); arg->portid = portid; } static struct sock *__netlink_lookup(struct netlink_table *table, u32 portid, struct net *net) { struct netlink_compare_arg arg; netlink_compare_arg_init(&arg, net, portid); return rhashtable_lookup_fast(&table->hash, &arg, netlink_rhashtable_params); } static int __netlink_insert(struct netlink_table *table, struct sock *sk) { struct netlink_compare_arg arg; netlink_compare_arg_init(&arg, sock_net(sk), nlk_sk(sk)->portid); return rhashtable_lookup_insert_key(&table->hash, &arg, &nlk_sk(sk)->node, netlink_rhashtable_params); } static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid) { struct netlink_table *table = &nl_table[protocol]; struct sock *sk; rcu_read_lock(); sk = __netlink_lookup(table, portid, net); if (sk) sock_hold(sk); rcu_read_unlock(); return sk; } static const struct proto_ops netlink_ops; static void netlink_update_listeners(struct sock *sk) { struct netlink_table *tbl = &nl_table[sk->sk_protocol]; unsigned long mask; unsigned int i; struct listeners *listeners; listeners = nl_deref_protected(tbl->listeners); if (!listeners) return; for (i = 0; i < NLGRPLONGS(tbl->groups); i++) { mask = 0; sk_for_each_bound(sk, &tbl->mc_list) { if (i < NLGRPLONGS(nlk_sk(sk)->ngroups)) mask |= nlk_sk(sk)->groups[i]; } listeners->masks[i] = mask; } /* this function is only called with the netlink table "grabbed", which * makes sure updates are visible before bind or setsockopt return. */ } static int netlink_insert(struct sock *sk, u32 portid) { struct netlink_table *table = &nl_table[sk->sk_protocol]; int err; lock_sock(sk); err = nlk_sk(sk)->portid == portid ? 0 : -EBUSY; if (nlk_sk(sk)->bound) goto err; /* portid can be read locklessly from netlink_getname(). */ WRITE_ONCE(nlk_sk(sk)->portid, portid); sock_hold(sk); err = __netlink_insert(table, sk); if (err) { /* In case the hashtable backend returns with -EBUSY * from here, it must not escape to the caller. 
*/ if (unlikely(err == -EBUSY)) err = -EOVERFLOW; if (err == -EEXIST) err = -EADDRINUSE; sock_put(sk); goto err; } /* We need to ensure that the socket is hashed and visible. */ smp_wmb(); /* Paired with lockless reads from netlink_bind(), * netlink_connect() and netlink_sendmsg(). */ WRITE_ONCE(nlk_sk(sk)->bound, portid); err: release_sock(sk); return err; } static void netlink_remove(struct sock *sk) { struct netlink_table *table; table = &nl_table[sk->sk_protocol]; if (!rhashtable_remove_fast(&table->hash, &nlk_sk(sk)->node, netlink_rhashtable_params)) __sock_put(sk); netlink_table_grab(); if (nlk_sk(sk)->subscriptions) { __sk_del_bind_node(sk); netlink_update_listeners(sk); } if (sk->sk_protocol == NETLINK_GENERIC) atomic_inc(&genl_sk_destructing_cnt); netlink_table_ungrab(); } static struct proto netlink_proto = { .name = "NETLINK", .owner = THIS_MODULE, .obj_size = sizeof(struct netlink_sock), }; static int __netlink_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct netlink_sock *nlk; sock->ops = &netlink_ops; sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); nlk = nlk_sk(sk); mutex_init(&nlk->nl_cb_mutex); lockdep_set_class_and_name(&nlk->nl_cb_mutex, nlk_cb_mutex_keys + protocol, nlk_cb_mutex_key_strings[protocol]); init_waitqueue_head(&nlk->wait); sk->sk_destruct = netlink_sock_destruct; sk->sk_protocol = protocol; return 0; } static int netlink_create(struct net *net, struct socket *sock, int protocol, int kern) { struct module *module = NULL; struct netlink_sock *nlk; int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); void (*release)(struct sock *sock, unsigned long *groups); int err = 0; sock->state = SS_UNCONNECTED; if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) return -ESOCKTNOSUPPORT; if (protocol < 0 || protocol >= MAX_LINKS) return -EPROTONOSUPPORT; protocol = array_index_nospec(protocol, MAX_LINKS); netlink_lock_table(); #ifdef CONFIG_MODULES if (!nl_table[protocol].registered) { netlink_unlock_table(); request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol); netlink_lock_table(); } #endif if (nl_table[protocol].registered && try_module_get(nl_table[protocol].module)) module = nl_table[protocol].module; else err = -EPROTONOSUPPORT; bind = nl_table[protocol].bind; unbind = nl_table[protocol].unbind; release = nl_table[protocol].release; netlink_unlock_table(); if (err < 0) goto out; err = __netlink_create(net, sock, protocol, kern); if (err < 0) goto out_module; sock_prot_inuse_add(net, &netlink_proto, 1); nlk = nlk_sk(sock->sk); nlk->module = module; nlk->netlink_bind = bind; nlk->netlink_unbind = unbind; nlk->netlink_release = release; out: return err; out_module: module_put(module); goto out; } static void deferred_put_nlk_sk(struct rcu_head *head) { struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu); struct sock *sk = &nlk->sk; kfree(nlk->groups); nlk->groups = NULL; if (!refcount_dec_and_test(&sk->sk_refcnt)) return; sk_free(sk); } static int netlink_release(struct socket *sock) { struct sock *sk = sock->sk; struct netlink_sock *nlk; if (!sk) return 0; netlink_remove(sk); sock_orphan(sk); nlk = nlk_sk(sk); /* * OK. Socket is unlinked, any packets that arrive now * will be purged. 
*/ if (nlk->netlink_release) nlk->netlink_release(sk, nlk->groups); /* must not acquire netlink_table_lock in any way again before unbind * and notifying genetlink is done as otherwise it might deadlock */ if (nlk->netlink_unbind) { int i; for (i = 0; i < nlk->ngroups; i++) if (test_bit(i, nlk->groups)) nlk->netlink_unbind(sock_net(sk), i + 1); } if (sk->sk_protocol == NETLINK_GENERIC && atomic_dec_return(&genl_sk_destructing_cnt) == 0) wake_up(&genl_sk_destructing_waitq); sock->sk = NULL; wake_up_interruptible_all(&nlk->wait); skb_queue_purge(&sk->sk_write_queue); if (nlk->portid && nlk->bound) { struct netlink_notify n = { .net = sock_net(sk), .protocol = sk->sk_protocol, .portid = nlk->portid, }; blocking_notifier_call_chain(&netlink_chain, NETLINK_URELEASE, &n); } /* Terminate any outstanding dump */ if (nlk->cb_running) { if (nlk->cb.done) nlk->cb.done(&nlk->cb); module_put(nlk->cb.module); kfree_skb(nlk->cb.skb); WRITE_ONCE(nlk->cb_running, false); } module_put(nlk->module); if (netlink_is_kernel(sk)) { netlink_table_grab(); BUG_ON(nl_table[sk->sk_protocol].registered == 0); if (--nl_table[sk->sk_protocol].registered == 0) { struct listeners *old; old = nl_deref_protected(nl_table[sk->sk_protocol].listeners); RCU_INIT_POINTER(nl_table[sk->sk_protocol].listeners, NULL); kfree_rcu(old, rcu); nl_table[sk->sk_protocol].module = NULL; nl_table[sk->sk_protocol].bind = NULL; nl_table[sk->sk_protocol].unbind = NULL; nl_table[sk->sk_protocol].flags = 0; nl_table[sk->sk_protocol].registered = 0; } netlink_table_ungrab(); } sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1); call_rcu(&nlk->rcu, deferred_put_nlk_sk); return 0; } static int netlink_autobind(struct socket *sock) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct netlink_table *table = &nl_table[sk->sk_protocol]; s32 portid = task_tgid_vnr(current); int err; s32 rover = -4096; bool ok; retry: cond_resched(); rcu_read_lock(); ok = !__netlink_lookup(table, portid, net); rcu_read_unlock(); if (!ok) { /* Bind collision, search negative portid values. */ if (rover == -4096) /* rover will be in range [S32_MIN, -4097] */ rover = S32_MIN + get_random_u32_below(-4096 - S32_MIN); else if (rover >= -4096) rover = -4097; portid = rover--; goto retry; } err = netlink_insert(sk, portid); if (err == -EADDRINUSE) goto retry; /* If 2 threads race to autobind, that is fine. */ if (err == -EBUSY) err = 0; return err; } /** * __netlink_ns_capable - General netlink message capability test * @nsp: NETLINK_CB of the socket buffer holding a netlink command from userspace. * @user_ns: The user namespace of the capability to use * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has the capability @cap in the user namespace @user_ns. 
*/ bool __netlink_ns_capable(const struct netlink_skb_parms *nsp, struct user_namespace *user_ns, int cap) { return ((nsp->flags & NETLINK_SKB_DST) || file_ns_capable(nsp->sk->sk_socket->file, user_ns, cap)) && ns_capable(user_ns, cap); } EXPORT_SYMBOL(__netlink_ns_capable); /** * netlink_ns_capable - General netlink message capability test * @skb: socket buffer holding a netlink command from userspace * @user_ns: The user namespace of the capability to use * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has the capability @cap in the user namespace @user_ns. */ bool netlink_ns_capable(const struct sk_buff *skb, struct user_namespace *user_ns, int cap) { return __netlink_ns_capable(&NETLINK_CB(skb), user_ns, cap); } EXPORT_SYMBOL(netlink_ns_capable); /** * netlink_capable - Netlink global message capability test * @skb: socket buffer holding a netlink command from userspace * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has the capability @cap in all user namespaces. */ bool netlink_capable(const struct sk_buff *skb, int cap) { return netlink_ns_capable(skb, &init_user_ns, cap); } EXPORT_SYMBOL(netlink_capable); /** * netlink_net_capable - Netlink network namespace message capability test * @skb: socket buffer holding a netlink command from userspace * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has the capability @cap over the network namespace of * the socket we received the message from. 
*/ bool netlink_net_capable(const struct sk_buff *skb, int cap) { return netlink_ns_capable(skb, sock_net(skb->sk)->user_ns, cap); } EXPORT_SYMBOL(netlink_net_capable); static inline int netlink_allowed(const struct socket *sock, unsigned int flag) { return (nl_table[sock->sk->sk_protocol].flags & flag) || ns_capable(sock_net(sock->sk)->user_ns, CAP_NET_ADMIN); } static void netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions) { struct netlink_sock *nlk = nlk_sk(sk); if (nlk->subscriptions && !subscriptions) __sk_del_bind_node(sk); else if (!nlk->subscriptions && subscriptions) sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list); nlk->subscriptions = subscriptions; } static int netlink_realloc_groups(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); unsigned int groups; unsigned long *new_groups; int err = 0; netlink_table_grab(); groups = nl_table[sk->sk_protocol].groups; if (!nl_table[sk->sk_protocol].registered) { err = -ENOENT; goto out_unlock; } if (nlk->ngroups >= groups) goto out_unlock; new_groups = krealloc(nlk->groups, NLGRPSZ(groups), GFP_ATOMIC); if (new_groups == NULL) { err = -ENOMEM; goto out_unlock; } memset((char *)new_groups + NLGRPSZ(nlk->ngroups), 0, NLGRPSZ(groups) - NLGRPSZ(nlk->ngroups)); nlk->groups = new_groups; nlk->ngroups = groups; out_unlock: netlink_table_ungrab(); return err; } static void netlink_undo_bind(int group, long unsigned int groups, struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); int undo; if (!nlk->netlink_unbind) return; for (undo = 0; undo < group; undo++) if (test_bit(undo, &groups)) nlk->netlink_unbind(sock_net(sk), undo + 1); } static int netlink_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; int err = 0; unsigned long groups; bool bound; if (addr_len < sizeof(struct sockaddr_nl)) return -EINVAL; if (nladdr->nl_family != AF_NETLINK) return -EINVAL; groups = nladdr->nl_groups; /* Only superuser is allowed to listen multicasts */ if (groups) { if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); if (err) return err; } if (nlk->ngroups < BITS_PER_LONG) groups &= (1UL << nlk->ngroups) - 1; /* Paired with WRITE_ONCE() in netlink_insert() */ bound = READ_ONCE(nlk->bound); if (bound) { /* Ensure nlk->portid is up-to-date. */ smp_rmb(); if (nladdr->nl_pid != nlk->portid) return -EINVAL; } if (nlk->netlink_bind && groups) { int group; /* nl_groups is a u32, so cap the maximum groups we can bind */ for (group = 0; group < BITS_PER_TYPE(u32); group++) { if (!test_bit(group, &groups)) continue; err = nlk->netlink_bind(net, group + 1); if (!err) continue; netlink_undo_bind(group, groups, sk); return err; } } /* No need for barriers here as we return to user-space without * using any of the bound attributes. */ netlink_lock_table(); if (!bound) { err = nladdr->nl_pid ? 
netlink_insert(sk, nladdr->nl_pid) : netlink_autobind(sock); if (err) { netlink_undo_bind(BITS_PER_TYPE(u32), groups, sk); goto unlock; } } if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0])) goto unlock; netlink_unlock_table(); netlink_table_grab(); netlink_update_subscriptions(sk, nlk->subscriptions + hweight32(groups) - hweight32(nlk->groups[0])); nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | groups; netlink_update_listeners(sk); netlink_table_ungrab(); return 0; unlock: netlink_unlock_table(); return err; } static int netlink_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags) { int err = 0; struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; if (alen < sizeof(addr->sa_family)) return -EINVAL; if (addr->sa_family == AF_UNSPEC) { /* paired with READ_ONCE() in netlink_getsockbyportid() */ WRITE_ONCE(sk->sk_state, NETLINK_UNCONNECTED); /* dst_portid and dst_group can be read locklessly */ WRITE_ONCE(nlk->dst_portid, 0); WRITE_ONCE(nlk->dst_group, 0); return 0; } if (addr->sa_family != AF_NETLINK) return -EINVAL; if (alen < sizeof(struct sockaddr_nl)) return -EINVAL; if ((nladdr->nl_groups || nladdr->nl_pid) && !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) return -EPERM; /* No need for barriers here as we return to user-space without * using any of the bound attributes. * Paired with WRITE_ONCE() in netlink_insert(). */ if (!READ_ONCE(nlk->bound)) err = netlink_autobind(sock); if (err == 0) { /* paired with READ_ONCE() in netlink_getsockbyportid() */ WRITE_ONCE(sk->sk_state, NETLINK_CONNECTED); /* dst_portid and dst_group can be read locklessly */ WRITE_ONCE(nlk->dst_portid, nladdr->nl_pid); WRITE_ONCE(nlk->dst_group, ffs(nladdr->nl_groups)); } return err; } static int netlink_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); DECLARE_SOCKADDR(struct sockaddr_nl *, nladdr, addr); nladdr->nl_family = AF_NETLINK; nladdr->nl_pad = 0; if (peer) { /* Paired with WRITE_ONCE() in netlink_connect() */ nladdr->nl_pid = READ_ONCE(nlk->dst_portid); nladdr->nl_groups = netlink_group_mask(READ_ONCE(nlk->dst_group)); } else { /* Paired with WRITE_ONCE() in netlink_insert() */ nladdr->nl_pid = READ_ONCE(nlk->portid); netlink_lock_table(); nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0; netlink_unlock_table(); } return sizeof(*nladdr); } static int netlink_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { /* try to hand this ioctl down to the NIC drivers. 
*/ return -ENOIOCTLCMD; } static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid) { struct sock *sock; struct netlink_sock *nlk; sock = netlink_lookup(sock_net(ssk), ssk->sk_protocol, portid); if (!sock) return ERR_PTR(-ECONNREFUSED); /* Don't bother queuing skb if kernel socket has no input function */ nlk = nlk_sk(sock); /* dst_portid and sk_state can be changed in netlink_connect() */ if (READ_ONCE(sock->sk_state) == NETLINK_CONNECTED && READ_ONCE(nlk->dst_portid) != nlk_sk(ssk)->portid) { sock_put(sock); return ERR_PTR(-ECONNREFUSED); } return sock; } struct sock *netlink_getsockbyfd(int fd) { CLASS(fd, f)(fd); struct inode *inode; struct sock *sock; if (fd_empty(f)) return ERR_PTR(-EBADF); inode = file_inode(fd_file(f)); if (!S_ISSOCK(inode->i_mode)) return ERR_PTR(-ENOTSOCK); sock = SOCKET_I(inode)->sk; if (sock->sk_family != AF_NETLINK) return ERR_PTR(-EINVAL); sock_hold(sock); return sock; } struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast) { size_t head_size = SKB_HEAD_ALIGN(size); struct sk_buff *skb; void *data; if (head_size <= PAGE_SIZE || broadcast) return alloc_skb(size, GFP_KERNEL); data = kvmalloc(head_size, GFP_KERNEL); if (!data) return NULL; skb = __build_skb(data, head_size); if (!skb) kvfree(data); else if (is_vmalloc_addr(data)) skb->destructor = netlink_skb_destructor; return skb; } /* * Attach a skb to a netlink socket. * The caller must hold a reference to the destination socket. On error, the * reference is dropped. The skb is not sent to the destination, just all * error checks are performed and memory in the queue is reserved. * Return values: * < 0: error. skb freed, reference to sock dropped. * 0: continue * 1: repeat lookup - reference dropped while waiting for socket memory. 
*/ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, long *timeo, struct sock *ssk) { DECLARE_WAITQUEUE(wait, current); struct netlink_sock *nlk; unsigned int rmem; nlk = nlk_sk(sk); rmem = atomic_add_return(skb->truesize, &sk->sk_rmem_alloc); if ((rmem == skb->truesize || rmem <= READ_ONCE(sk->sk_rcvbuf)) && !test_bit(NETLINK_S_CONGESTED, &nlk->state)) { netlink_skb_set_owner_r(skb, sk); return 0; } atomic_sub(skb->truesize, &sk->sk_rmem_alloc); if (!*timeo) { if (!ssk || netlink_is_kernel(ssk)) netlink_overrun(sk); sock_put(sk); kfree_skb(skb); return -EAGAIN; } __set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&nlk->wait, &wait); rmem = atomic_read(&sk->sk_rmem_alloc); if (((rmem && rmem + skb->truesize > READ_ONCE(sk->sk_rcvbuf)) || test_bit(NETLINK_S_CONGESTED, &nlk->state)) && !sock_flag(sk, SOCK_DEAD)) *timeo = schedule_timeout(*timeo); __set_current_state(TASK_RUNNING); remove_wait_queue(&nlk->wait, &wait); sock_put(sk); if (signal_pending(current)) { kfree_skb(skb); return sock_intr_errno(*timeo); } return 1; } static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb) { int len = skb->len; netlink_deliver_tap(sock_net(sk), skb); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); return len; } int netlink_sendskb(struct sock *sk, struct sk_buff *skb) { int len = __netlink_sendskb(sk, skb); sock_put(sk); return len; } void netlink_detachskb(struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); sock_put(sk); } static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) { int delta; skb_assert_len(skb); WARN_ON(skb->sk != NULL); delta = skb->end - skb->tail; if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize) return skb; if (skb_shared(skb)) { struct sk_buff *nskb = skb_clone(skb, allocation); if (!nskb) return skb; consume_skb(skb); skb = nskb; } pskb_expand_head(skb, 0, -delta, (allocation & ~__GFP_DIRECT_RECLAIM) | __GFP_NOWARN | __GFP_NORETRY); return skb; } static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb, struct sock *ssk) { int ret; struct netlink_sock *nlk = nlk_sk(sk); ret = -ECONNREFUSED; if (nlk->netlink_rcv != NULL) { ret = skb->len; atomic_add(skb->truesize, &sk->sk_rmem_alloc); netlink_skb_set_owner_r(skb, sk); NETLINK_CB(skb).sk = ssk; netlink_deliver_tap_kernel(sk, ssk, skb); nlk->netlink_rcv(skb); consume_skb(skb); } else { kfree_skb(skb); } sock_put(sk); return ret; } int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 portid, int nonblock) { struct sock *sk; int err; long timeo; skb = netlink_trim(skb, gfp_any()); timeo = sock_sndtimeo(ssk, nonblock); retry: sk = netlink_getsockbyportid(ssk, portid); if (IS_ERR(sk)) { kfree_skb(skb); return PTR_ERR(sk); } if (netlink_is_kernel(sk)) return netlink_unicast_kernel(sk, skb, ssk); if (sk_filter(sk, skb)) { err = skb->len; kfree_skb(skb); sock_put(sk); return err; } err = netlink_attachskb(sk, skb, &timeo, ssk); if (err == 1) goto retry; if (err) return err; return netlink_sendskb(sk, skb); } EXPORT_SYMBOL(netlink_unicast); int netlink_has_listeners(struct sock *sk, unsigned int group) { int res = 0; struct listeners *listeners; BUG_ON(!netlink_is_kernel(sk)); rcu_read_lock(); listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners); if (listeners && group - 1 < nl_table[sk->sk_protocol].groups) res = test_bit(group - 1, listeners->masks); rcu_read_unlock(); return res; } EXPORT_SYMBOL_GPL(netlink_has_listeners); bool netlink_strict_get_check(struct sk_buff *skb) { return nlk_test_bit(STRICT_CHK, 
NETLINK_CB(skb).sk); } EXPORT_SYMBOL_GPL(netlink_strict_get_check); static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb) { struct netlink_sock *nlk = nlk_sk(sk); unsigned int rmem, rcvbuf; rmem = atomic_add_return(skb->truesize, &sk->sk_rmem_alloc); rcvbuf = READ_ONCE(sk->sk_rcvbuf); if ((rmem == skb->truesize || rmem <= rcvbuf) && !test_bit(NETLINK_S_CONGESTED, &nlk->state)) { netlink_skb_set_owner_r(skb, sk); __netlink_sendskb(sk, skb); return rmem > (rcvbuf >> 1); } atomic_sub(skb->truesize, &sk->sk_rmem_alloc); return -1; } struct netlink_broadcast_data { struct sock *exclude_sk; struct net *net; u32 portid; u32 group; int failure; int delivery_failure; int congested; int delivered; gfp_t allocation; struct sk_buff *skb, *skb2; int (*tx_filter)(struct sock *dsk, struct sk_buff *skb, void *data); void *tx_data; }; static void do_one_broadcast(struct sock *sk, struct netlink_broadcast_data *p) { struct netlink_sock *nlk = nlk_sk(sk); int val; if (p->exclude_sk == sk) return; if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups || !test_bit(p->group - 1, nlk->groups)) return; if (!net_eq(sock_net(sk), p->net)) { if (!nlk_test_bit(LISTEN_ALL_NSID, sk)) return; if (!peernet_has_id(sock_net(sk), p->net)) return; if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns, CAP_NET_BROADCAST)) return; } if (p->failure) { netlink_overrun(sk); return; } sock_hold(sk); if (p->skb2 == NULL) { if (skb_shared(p->skb)) { p->skb2 = skb_clone(p->skb, p->allocation); } else { p->skb2 = skb_get(p->skb); /* * skb ownership may have been set when * delivered to a previous socket. */ skb_orphan(p->skb2); } } if (p->skb2 == NULL) { netlink_overrun(sk); /* Clone failed. Notify ALL listeners. */ p->failure = 1; if (nlk_test_bit(BROADCAST_SEND_ERROR, sk)) p->delivery_failure = 1; goto out; } if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { kfree_skb(p->skb2); p->skb2 = NULL; goto out; } if (sk_filter(sk, p->skb2)) { kfree_skb(p->skb2); p->skb2 = NULL; goto out; } NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net); if (NETLINK_CB(p->skb2).nsid != NETNSA_NSID_NOT_ASSIGNED) NETLINK_CB(p->skb2).nsid_is_set = true; val = netlink_broadcast_deliver(sk, p->skb2); if (val < 0) { netlink_overrun(sk); if (nlk_test_bit(BROADCAST_SEND_ERROR, sk)) p->delivery_failure = 1; } else { p->congested |= val; p->delivered = 1; p->skb2 = NULL; } out: sock_put(sk); } int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid, u32 group, gfp_t allocation, netlink_filter_fn filter, void *filter_data) { struct net *net = sock_net(ssk); struct netlink_broadcast_data info; struct sock *sk; skb = netlink_trim(skb, allocation); info.exclude_sk = ssk; info.net = net; info.portid = portid; info.group = group; info.failure = 0; info.delivery_failure = 0; info.congested = 0; info.delivered = 0; info.allocation = allocation; info.skb = skb; info.skb2 = NULL; info.tx_filter = filter; info.tx_data = filter_data; /* While we sleep in clone, do not allow to change socket list */ netlink_lock_table(); sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list) do_one_broadcast(sk, &info); consume_skb(skb); netlink_unlock_table(); if (info.delivery_failure) { kfree_skb(info.skb2); return -ENOBUFS; } consume_skb(info.skb2); if (info.delivered) { if (info.congested && gfpflags_allow_blocking(allocation)) yield(); return 0; } return -ESRCH; } EXPORT_SYMBOL(netlink_broadcast_filtered); int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, u32 group, gfp_t allocation) { 
return netlink_broadcast_filtered(ssk, skb, portid, group, allocation, NULL, NULL); } EXPORT_SYMBOL(netlink_broadcast); struct netlink_set_err_data { struct sock *exclude_sk; u32 portid; u32 group; int code; }; static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p) { struct netlink_sock *nlk = nlk_sk(sk); int ret = 0; if (sk == p->exclude_sk) goto out; if (!net_eq(sock_net(sk), sock_net(p->exclude_sk))) goto out; if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups || !test_bit(p->group - 1, nlk->groups)) goto out; if (p->code == ENOBUFS && nlk_test_bit(RECV_NO_ENOBUFS, sk)) { ret = 1; goto out; } WRITE_ONCE(sk->sk_err, p->code); sk_error_report(sk); out: return ret; } /** * netlink_set_err - report error to broadcast listeners * @ssk: the kernel netlink socket, as returned by netlink_kernel_create() * @portid: the PORTID of a process that we want to skip (if any) * @group: the broadcast group that will notice the error * @code: error code, must be negative (as usual in kernelspace) * * This function returns the number of broadcast listeners that have set the * NETLINK_NO_ENOBUFS socket option. */ int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code) { struct netlink_set_err_data info; unsigned long flags; struct sock *sk; int ret = 0; info.exclude_sk = ssk; info.portid = portid; info.group = group; /* sk->sk_err wants a positive error value */ info.code = -code; read_lock_irqsave(&nl_table_lock, flags); sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list) ret += do_one_set_err(sk, &info); read_unlock_irqrestore(&nl_table_lock, flags); return ret; } EXPORT_SYMBOL(netlink_set_err); /* must be called with netlink table grabbed */ static void netlink_update_socket_mc(struct netlink_sock *nlk, unsigned int group, int is_new) { int old, new = !!is_new, subscriptions; old = test_bit(group - 1, nlk->groups); subscriptions = nlk->subscriptions - old + new; __assign_bit(group - 1, nlk->groups, new); netlink_update_subscriptions(&nlk->sk, subscriptions); netlink_update_listeners(&nlk->sk); } static int netlink_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); unsigned int val = 0; int nr = -1; if (level != SOL_NETLINK) return -ENOPROTOOPT; if (optlen >= sizeof(int) && copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; switch (optname) { case NETLINK_PKTINFO: nr = NETLINK_F_RECV_PKTINFO; break; case NETLINK_ADD_MEMBERSHIP: case NETLINK_DROP_MEMBERSHIP: { int err; if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); if (err) return err; if (!val || val - 1 >= nlk->ngroups) return -EINVAL; if (optname == NETLINK_ADD_MEMBERSHIP && nlk->netlink_bind) { err = nlk->netlink_bind(sock_net(sk), val); if (err) return err; } netlink_table_grab(); netlink_update_socket_mc(nlk, val, optname == NETLINK_ADD_MEMBERSHIP); netlink_table_ungrab(); if (optname == NETLINK_DROP_MEMBERSHIP && nlk->netlink_unbind) nlk->netlink_unbind(sock_net(sk), val); break; } case NETLINK_BROADCAST_ERROR: nr = NETLINK_F_BROADCAST_SEND_ERROR; break; case NETLINK_NO_ENOBUFS: assign_bit(NETLINK_F_RECV_NO_ENOBUFS, &nlk->flags, val); if (val) { clear_bit(NETLINK_S_CONGESTED, &nlk->state); wake_up_interruptible(&nlk->wait); } break; case NETLINK_LISTEN_ALL_NSID: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST)) return -EPERM; nr = NETLINK_F_LISTEN_ALL_NSID; break; case NETLINK_CAP_ACK: nr = NETLINK_F_CAP_ACK; break; 
case NETLINK_EXT_ACK: nr = NETLINK_F_EXT_ACK; break; case NETLINK_GET_STRICT_CHK: nr = NETLINK_F_STRICT_CHK; break; default: return -ENOPROTOOPT; } if (nr >= 0) assign_bit(nr, &nlk->flags, val); return 0; } static int netlink_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); unsigned int flag; int len, val; if (level != SOL_NETLINK) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case NETLINK_PKTINFO: flag = NETLINK_F_RECV_PKTINFO; break; case NETLINK_BROADCAST_ERROR: flag = NETLINK_F_BROADCAST_SEND_ERROR; break; case NETLINK_NO_ENOBUFS: flag = NETLINK_F_RECV_NO_ENOBUFS; break; case NETLINK_LIST_MEMBERSHIPS: { int pos, idx, shift, err = 0; netlink_lock_table(); for (pos = 0; pos * 8 < nlk->ngroups; pos += sizeof(u32)) { if (len - pos < sizeof(u32)) break; idx = pos / sizeof(unsigned long); shift = (pos % sizeof(unsigned long)) * 8; if (put_user((u32)(nlk->groups[idx] >> shift), (u32 __user *)(optval + pos))) { err = -EFAULT; break; } } if (put_user(ALIGN(BITS_TO_BYTES(nlk->ngroups), sizeof(u32)), optlen)) err = -EFAULT; netlink_unlock_table(); return err; } case NETLINK_LISTEN_ALL_NSID: flag = NETLINK_F_LISTEN_ALL_NSID; break; case NETLINK_CAP_ACK: flag = NETLINK_F_CAP_ACK; break; case NETLINK_EXT_ACK: flag = NETLINK_F_EXT_ACK; break; case NETLINK_GET_STRICT_CHK: flag = NETLINK_F_STRICT_CHK; break; default: return -ENOPROTOOPT; } if (len < sizeof(int)) return -EINVAL; len = sizeof(int); val = test_bit(flag, &nlk->flags); if (put_user(len, optlen) || copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) { struct nl_pktinfo info; info.group = NETLINK_CB(skb).dst_group; put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info); } static void netlink_cmsg_listen_all_nsid(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { if (!NETLINK_CB(skb).nsid_is_set) return; put_cmsg(msg, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID, sizeof(int), &NETLINK_CB(skb).nsid); } static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name); u32 dst_portid; u32 dst_group; struct sk_buff *skb; int err; struct scm_cookie scm; u32 netlink_skb_flags = 0; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; if (len == 0) { pr_warn_once("Zero length message leads to an empty skb\n"); return -ENODATA; } err = scm_send(sock, msg, &scm, true); if (err < 0) return err; if (msg->msg_namelen) { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_nl)) goto out; if (addr->nl_family != AF_NETLINK) goto out; dst_portid = addr->nl_pid; dst_group = ffs(addr->nl_groups); err = -EPERM; if ((dst_group || dst_portid) && !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) goto out; netlink_skb_flags |= NETLINK_SKB_DST; } else { /* Paired with WRITE_ONCE() in netlink_connect() */ dst_portid = READ_ONCE(nlk->dst_portid); dst_group = READ_ONCE(nlk->dst_group); } /* Paired with WRITE_ONCE() in netlink_insert() */ if (!READ_ONCE(nlk->bound)) { err = netlink_autobind(sock); if (err) goto out; } else { /* Ensure nlk is hashed and visible. 
*/ smp_rmb(); } err = -EMSGSIZE; if (len > sk->sk_sndbuf - 32) goto out; err = -ENOBUFS; skb = netlink_alloc_large_skb(len, dst_group); if (skb == NULL) goto out; NETLINK_CB(skb).portid = nlk->portid; NETLINK_CB(skb).dst_group = dst_group; NETLINK_CB(skb).creds = scm.creds; NETLINK_CB(skb).flags = netlink_skb_flags; err = -EFAULT; if (memcpy_from_msg(skb_put(skb, len), msg, len)) { kfree_skb(skb); goto out; } err = security_netlink_send(sk, skb); if (err) { kfree_skb(skb); goto out; } if (dst_group) { refcount_inc(&skb->users); netlink_broadcast(sk, skb, dst_portid, dst_group, GFP_KERNEL); } err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags & MSG_DONTWAIT); out: scm_destroy(&scm); return err; } static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct scm_cookie scm; struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); size_t copied, max_recvmsg_len; struct sk_buff *skb, *data_skb; int err, ret; if (flags & MSG_OOB) return -EOPNOTSUPP; copied = 0; skb = skb_recv_datagram(sk, flags, &err); if (skb == NULL) goto out; data_skb = skb; #ifdef CONFIG_COMPAT_NETLINK_MESSAGES if (unlikely(skb_shinfo(skb)->frag_list)) { /* * If this skb has a frag_list, then here that means that we * will have to use the frag_list skb's data for compat tasks * and the regular skb's data for normal (non-compat) tasks. * * If we need to send the compat skb, assign it to the * 'data_skb' variable so that it will be used below for data * copying. We keep 'skb' for everything else, including * freeing both later. */ if (flags & MSG_CMSG_COMPAT) data_skb = skb_shinfo(skb)->frag_list; } #endif /* Record the max length of recvmsg() calls for future allocations */ max_recvmsg_len = max(READ_ONCE(nlk->max_recvmsg_len), len); max_recvmsg_len = min_t(size_t, max_recvmsg_len, SKB_WITH_OVERHEAD(32768)); WRITE_ONCE(nlk->max_recvmsg_len, max_recvmsg_len); copied = data_skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(data_skb, 0, msg, copied); if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name); addr->nl_family = AF_NETLINK; addr->nl_pad = 0; addr->nl_pid = NETLINK_CB(skb).portid; addr->nl_groups = netlink_group_mask(NETLINK_CB(skb).dst_group); msg->msg_namelen = sizeof(*addr); } if (nlk_test_bit(RECV_PKTINFO, sk)) netlink_cmsg_recv_pktinfo(msg, skb); if (nlk_test_bit(LISTEN_ALL_NSID, sk)) netlink_cmsg_listen_all_nsid(sk, msg, skb); memset(&scm, 0, sizeof(scm)); scm.creds = *NETLINK_CREDS(skb); if (flags & MSG_TRUNC) copied = data_skb->len; skb_free_datagram(sk, skb); if (READ_ONCE(nlk->cb_running) && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) { ret = netlink_dump(sk, false); if (ret) { WRITE_ONCE(sk->sk_err, -ret); sk_error_report(sk); } } scm_recv(sock, msg, &scm, flags); out: netlink_rcv_wake(sk); return err ? : copied; } static void netlink_data_ready(struct sock *sk) { BUG(); } /* * We export these functions to other modules. They provide a * complete set of kernel non-blocking support for message * queueing. 
*/ struct sock * __netlink_kernel_create(struct net *net, int unit, struct module *module, struct netlink_kernel_cfg *cfg) { struct socket *sock; struct sock *sk; struct netlink_sock *nlk; struct listeners *listeners = NULL; unsigned int groups; BUG_ON(!nl_table); if (unit < 0 || unit >= MAX_LINKS) return NULL; if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) return NULL; if (__netlink_create(net, sock, unit, 1) < 0) goto out_sock_release_nosk; sk = sock->sk; if (!cfg || cfg->groups < 32) groups = 32; else groups = cfg->groups; listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL); if (!listeners) goto out_sock_release; sk->sk_data_ready = netlink_data_ready; if (cfg && cfg->input) nlk_sk(sk)->netlink_rcv = cfg->input; if (netlink_insert(sk, 0)) goto out_sock_release; nlk = nlk_sk(sk); set_bit(NETLINK_F_KERNEL_SOCKET, &nlk->flags); netlink_table_grab(); if (!nl_table[unit].registered) { nl_table[unit].groups = groups; rcu_assign_pointer(nl_table[unit].listeners, listeners); nl_table[unit].module = module; if (cfg) { nl_table[unit].bind = cfg->bind; nl_table[unit].unbind = cfg->unbind; nl_table[unit].release = cfg->release; nl_table[unit].flags = cfg->flags; } nl_table[unit].registered = 1; } else { kfree(listeners); nl_table[unit].registered++; } netlink_table_ungrab(); return sk; out_sock_release: kfree(listeners); netlink_kernel_release(sk); return NULL; out_sock_release_nosk: sock_release(sock); return NULL; } EXPORT_SYMBOL(__netlink_kernel_create); void netlink_kernel_release(struct sock *sk) { if (sk == NULL || sk->sk_socket == NULL) return; sock_release(sk->sk_socket); } EXPORT_SYMBOL(netlink_kernel_release); int __netlink_change_ngroups(struct sock *sk, unsigned int groups) { struct listeners *new, *old; struct netlink_table *tbl = &nl_table[sk->sk_protocol]; if (groups < 32) groups = 32; if (NLGRPSZ(tbl->groups) < NLGRPSZ(groups)) { new = kzalloc(sizeof(*new) + NLGRPSZ(groups), GFP_ATOMIC); if (!new) return -ENOMEM; old = nl_deref_protected(tbl->listeners); memcpy(new->masks, old->masks, NLGRPSZ(tbl->groups)); rcu_assign_pointer(tbl->listeners, new); kfree_rcu(old, rcu); } tbl->groups = groups; return 0; } /** * netlink_change_ngroups - change number of multicast groups * * This changes the number of multicast groups that are available * on a certain netlink family. Note that it is not possible to * change the number of groups to below 32. Also note that it does * not implicitly clear listeners from groups that are removed when * the number of groups is reduced. * * @sk: The kernel netlink socket, as returned by netlink_kernel_create(). * @groups: The new number of groups. 
*/ int netlink_change_ngroups(struct sock *sk, unsigned int groups) { int err; netlink_table_grab(); err = __netlink_change_ngroups(sk, groups); netlink_table_ungrab(); return err; } void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group) { struct sock *sk; struct netlink_table *tbl = &nl_table[ksk->sk_protocol]; struct hlist_node *tmp; sk_for_each_bound_safe(sk, tmp, &tbl->mc_list) netlink_update_socket_mc(nlk_sk(sk), group, 0); } struct nlmsghdr * __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags) { struct nlmsghdr *nlh; int size = nlmsg_msg_size(len); nlh = skb_put(skb, NLMSG_ALIGN(size)); nlh->nlmsg_type = type; nlh->nlmsg_len = size; nlh->nlmsg_flags = flags; nlh->nlmsg_pid = portid; nlh->nlmsg_seq = seq; if (!__builtin_constant_p(size) || NLMSG_ALIGN(size) - size != 0) memset(nlmsg_data(nlh) + len, 0, NLMSG_ALIGN(size) - size); return nlh; } EXPORT_SYMBOL(__nlmsg_put); static size_t netlink_ack_tlv_len(struct netlink_sock *nlk, int err, const struct netlink_ext_ack *extack) { size_t tlvlen; if (!extack || !test_bit(NETLINK_F_EXT_ACK, &nlk->flags)) return 0; tlvlen = 0; if (extack->_msg) tlvlen += nla_total_size(strlen(extack->_msg) + 1); if (extack->cookie_len) tlvlen += nla_total_size(extack->cookie_len); /* Following attributes are only reported as error (not warning) */ if (!err) return tlvlen; if (extack->bad_attr) tlvlen += nla_total_size(sizeof(u32)); if (extack->policy) tlvlen += netlink_policy_dump_attr_size_estimate(extack->policy); if (extack->miss_type) tlvlen += nla_total_size(sizeof(u32)); if (extack->miss_nest) tlvlen += nla_total_size(sizeof(u32)); return tlvlen; } static bool nlmsg_check_in_payload(const struct nlmsghdr *nlh, const void *addr) { return !WARN_ON(addr < nlmsg_data(nlh) || addr - (const void *) nlh >= nlh->nlmsg_len); } static void netlink_ack_tlv_fill(struct sk_buff *skb, const struct nlmsghdr *nlh, int err, const struct netlink_ext_ack *extack) { if (extack->_msg) WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg)); if (extack->cookie_len) WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE, extack->cookie_len, extack->cookie)); if (!err) return; if (extack->bad_attr && nlmsg_check_in_payload(nlh, extack->bad_attr)) WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS, (u8 *)extack->bad_attr - (const u8 *)nlh)); if (extack->policy) netlink_policy_dump_write_attr(skb, extack->policy, NLMSGERR_ATTR_POLICY); if (extack->miss_type) WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_TYPE, extack->miss_type)); if (extack->miss_nest && nlmsg_check_in_payload(nlh, extack->miss_nest)) WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_NEST, (u8 *)extack->miss_nest - (const u8 *)nlh)); } /* * It looks a bit ugly. * It would be better to create kernel thread. 
*/ static int netlink_dump_done(struct netlink_sock *nlk, struct sk_buff *skb, struct netlink_callback *cb, struct netlink_ext_ack *extack) { struct nlmsghdr *nlh; size_t extack_len; nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(nlk->dump_done_errno), NLM_F_MULTI | cb->answer_flags); if (WARN_ON(!nlh)) return -ENOBUFS; nl_dump_check_consistent(cb, nlh); memcpy(nlmsg_data(nlh), &nlk->dump_done_errno, sizeof(nlk->dump_done_errno)); extack_len = netlink_ack_tlv_len(nlk, nlk->dump_done_errno, extack); if (extack_len) { nlh->nlmsg_flags |= NLM_F_ACK_TLVS; if (skb_tailroom(skb) >= extack_len) { netlink_ack_tlv_fill(skb, cb->nlh, nlk->dump_done_errno, extack); nlmsg_end(skb, nlh); } } return 0; } static int netlink_dump(struct sock *sk, bool lock_taken) { struct netlink_sock *nlk = nlk_sk(sk); struct netlink_ext_ack extack = {}; struct netlink_callback *cb; struct sk_buff *skb = NULL; unsigned int rmem, rcvbuf; size_t max_recvmsg_len; struct module *module; int err = -ENOBUFS; int alloc_min_size; int alloc_size; if (!lock_taken) mutex_lock(&nlk->nl_cb_mutex); if (!nlk->cb_running) { err = -EINVAL; goto errout_skb; } /* NLMSG_GOODSIZE is small to avoid high order allocations being * required, but it makes sense to _attempt_ a 32KiB allocation * to reduce number of system calls on dump operations, if user * ever provided a big enough buffer. */ cb = &nlk->cb; alloc_min_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE); max_recvmsg_len = READ_ONCE(nlk->max_recvmsg_len); if (alloc_min_size < max_recvmsg_len) { alloc_size = max_recvmsg_len; skb = alloc_skb(alloc_size, (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) | __GFP_NOWARN | __GFP_NORETRY); } if (!skb) { alloc_size = alloc_min_size; skb = alloc_skb(alloc_size, GFP_KERNEL); } if (!skb) goto errout_skb; rcvbuf = READ_ONCE(sk->sk_rcvbuf); rmem = atomic_add_return(skb->truesize, &sk->sk_rmem_alloc); if (rmem != skb->truesize && rmem >= rcvbuf) { atomic_sub(skb->truesize, &sk->sk_rmem_alloc); goto errout_skb; } /* Trim skb to allocated size. User is expected to provide buffer as * large as max(min_dump_alloc, 32KiB (max_recvmsg_len capped at * netlink_recvmsg())). dump will pack as many smaller messages as * could fit within the allocated skb. skb is typically allocated * with larger space than required (could be as much as near 2x the * requested size with align to next power of 2 approach). Allowing * dump to use the excess space makes it difficult for a user to have a * reasonable static buffer based on the expected largest dump of a * single netdev. The outcome is MSG_TRUNC error. */ skb_reserve(skb, skb_tailroom(skb) - alloc_size); /* Make sure malicious BPF programs can not read unitialized memory * from skb->head -> skb->data */ skb_reset_network_header(skb); skb_reset_mac_header(skb); netlink_skb_set_owner_r(skb, sk); if (nlk->dump_done_errno > 0) { cb->extack = &extack; nlk->dump_done_errno = cb->dump(skb, cb); /* EMSGSIZE plus something already in the skb means * that there's more to dump but current skb has filled up. * If the callback really wants to return EMSGSIZE to user space * it needs to do so again, on the next cb->dump() call, * without putting data in the skb. 
*/ if (nlk->dump_done_errno == -EMSGSIZE && skb->len) nlk->dump_done_errno = skb->len; cb->extack = NULL; } if (nlk->dump_done_errno > 0 || skb_tailroom(skb) < nlmsg_total_size(sizeof(nlk->dump_done_errno))) { mutex_unlock(&nlk->nl_cb_mutex); if (sk_filter(sk, skb)) kfree_skb(skb); else __netlink_sendskb(sk, skb); return 0; } if (netlink_dump_done(nlk, skb, cb, &extack)) goto errout_skb; #ifdef CONFIG_COMPAT_NETLINK_MESSAGES /* frag_list skb's data is used for compat tasks * and the regular skb's data for normal (non-compat) tasks. * See netlink_recvmsg(). */ if (unlikely(skb_shinfo(skb)->frag_list)) { if (netlink_dump_done(nlk, skb_shinfo(skb)->frag_list, cb, &extack)) goto errout_skb; } #endif if (sk_filter(sk, skb)) kfree_skb(skb); else __netlink_sendskb(sk, skb); if (cb->done) cb->done(cb); WRITE_ONCE(nlk->cb_running, false); module = cb->module; skb = cb->skb; mutex_unlock(&nlk->nl_cb_mutex); module_put(module); consume_skb(skb); return 0; errout_skb: mutex_unlock(&nlk->nl_cb_mutex); kfree_skb(skb); return err; } int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control) { struct netlink_callback *cb; struct netlink_sock *nlk; struct sock *sk; int ret; refcount_inc(&skb->users); sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid); if (sk == NULL) { ret = -ECONNREFUSED; goto error_free; } nlk = nlk_sk(sk); mutex_lock(&nlk->nl_cb_mutex); /* A dump is in progress... */ if (nlk->cb_running) { ret = -EBUSY; goto error_unlock; } /* add reference of module which cb->dump belongs to */ if (!try_module_get(control->module)) { ret = -EPROTONOSUPPORT; goto error_unlock; } cb = &nlk->cb; memset(cb, 0, sizeof(*cb)); cb->dump = control->dump; cb->done = control->done; cb->nlh = nlh; cb->data = control->data; cb->module = control->module; cb->min_dump_alloc = control->min_dump_alloc; cb->flags = control->flags; cb->skb = skb; cb->strict_check = nlk_test_bit(STRICT_CHK, NETLINK_CB(skb).sk); if (control->start) { cb->extack = control->extack; ret = control->start(cb); cb->extack = NULL; if (ret) goto error_put; } WRITE_ONCE(nlk->cb_running, true); nlk->dump_done_errno = INT_MAX; ret = netlink_dump(sk, true); sock_put(sk); if (ret) return ret; /* We successfully started a dump, by returning -EINTR we * signal not to send ACK even if it was requested. */ return -EINTR; error_put: module_put(control->module); error_unlock: sock_put(sk); mutex_unlock(&nlk->nl_cb_mutex); error_free: kfree_skb(skb); return ret; } EXPORT_SYMBOL(__netlink_dump_start); void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, const struct netlink_ext_ack *extack) { struct sk_buff *skb; struct nlmsghdr *rep; struct nlmsgerr *errmsg; size_t payload = sizeof(*errmsg); struct netlink_sock *nlk = nlk_sk(NETLINK_CB(in_skb).sk); unsigned int flags = 0; size_t tlvlen; /* Error messages get the original request appended, unless the user * requests to cap the error message, and get extra error data if * requested. 
*/ if (err && !test_bit(NETLINK_F_CAP_ACK, &nlk->flags)) payload += nlmsg_len(nlh); else flags |= NLM_F_CAPPED; tlvlen = netlink_ack_tlv_len(nlk, err, extack); if (tlvlen) flags |= NLM_F_ACK_TLVS; skb = nlmsg_new(payload + tlvlen, GFP_KERNEL); if (!skb) goto err_skb; rep = nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, NLMSG_ERROR, sizeof(*errmsg), flags); if (!rep) goto err_bad_put; errmsg = nlmsg_data(rep); errmsg->error = err; errmsg->msg = *nlh; if (!(flags & NLM_F_CAPPED)) { if (!nlmsg_append(skb, nlmsg_len(nlh))) goto err_bad_put; memcpy(nlmsg_data(&errmsg->msg), nlmsg_data(nlh), nlmsg_len(nlh)); } if (tlvlen) netlink_ack_tlv_fill(skb, nlh, err, extack); nlmsg_end(skb, rep); nlmsg_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid); return; err_bad_put: nlmsg_free(skb); err_skb: WRITE_ONCE(NETLINK_CB(in_skb).sk->sk_err, ENOBUFS); sk_error_report(NETLINK_CB(in_skb).sk); } EXPORT_SYMBOL(netlink_ack); int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, struct nlmsghdr *, struct netlink_ext_ack *)) { struct netlink_ext_ack extack; struct nlmsghdr *nlh; int err; while (skb->len >= nlmsg_total_size(0)) { int msglen; memset(&extack, 0, sizeof(extack)); nlh = nlmsg_hdr(skb); err = 0; if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) return 0; /* Only requests are handled by the kernel */ if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) goto ack; /* Skip control messages */ if (nlh->nlmsg_type < NLMSG_MIN_TYPE) goto ack; err = cb(skb, nlh, &extack); if (err == -EINTR) goto skip; ack: if (nlh->nlmsg_flags & NLM_F_ACK || err) netlink_ack(skb, nlh, err, &extack); skip: msglen = NLMSG_ALIGN(nlh->nlmsg_len); if (msglen > skb->len) msglen = skb->len; skb_pull(skb, msglen); } return 0; } EXPORT_SYMBOL(netlink_rcv_skb); /** * nlmsg_notify - send a notification netlink message * @sk: netlink socket to use * @skb: notification message * @portid: destination netlink portid for reports or 0 * @group: destination multicast group or 0 * @report: 1 to report back, 0 to disable * @flags: allocation flags */ int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid, unsigned int group, int report, gfp_t flags) { int err = 0; if (group) { int exclude_portid = 0; if (report) { refcount_inc(&skb->users); exclude_portid = portid; } /* errors reported via destination sk->sk_err, but propagate * delivery errors if NETLINK_BROADCAST_ERROR flag is set */ err = nlmsg_multicast(sk, skb, exclude_portid, group, flags); if (err == -ESRCH) err = 0; } if (report) { int err2; err2 = nlmsg_unicast(sk, skb, portid); if (!err) err = err2; } return err; } EXPORT_SYMBOL(nlmsg_notify); #ifdef CONFIG_PROC_FS struct nl_seq_iter { struct seq_net_private p; struct rhashtable_iter hti; int link; }; static void netlink_walk_start(struct nl_seq_iter *iter) { rhashtable_walk_enter(&nl_table[iter->link].hash, &iter->hti); rhashtable_walk_start(&iter->hti); } static void netlink_walk_stop(struct nl_seq_iter *iter) { rhashtable_walk_stop(&iter->hti); rhashtable_walk_exit(&iter->hti); } static void *__netlink_seq_next(struct seq_file *seq) { struct nl_seq_iter *iter = seq->private; struct netlink_sock *nlk; do { for (;;) { nlk = rhashtable_walk_next(&iter->hti); if (IS_ERR(nlk)) { if (PTR_ERR(nlk) == -EAGAIN) continue; return nlk; } if (nlk) break; netlink_walk_stop(iter); if (++iter->link >= MAX_LINKS) return NULL; netlink_walk_start(iter); } } while (sock_net(&nlk->sk) != seq_file_net(seq)); return nlk; } static void *netlink_seq_start(struct seq_file *seq, loff_t *posp) __acquires(RCU) { struct 
nl_seq_iter *iter = seq->private; void *obj = SEQ_START_TOKEN; loff_t pos; iter->link = 0; netlink_walk_start(iter); for (pos = *posp; pos && obj && !IS_ERR(obj); pos--) obj = __netlink_seq_next(seq); return obj; } static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; return __netlink_seq_next(seq); } static void netlink_native_seq_stop(struct seq_file *seq, void *v) { struct nl_seq_iter *iter = seq->private; if (iter->link >= MAX_LINKS) return; netlink_walk_stop(iter); } static int netlink_native_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, "sk Eth Pid Groups " "Rmem Wmem Dump Locks Drops Inode\n"); } else { struct sock *s = v; struct netlink_sock *nlk = nlk_sk(s); seq_printf(seq, "%pK %-3d %-10u %08x %-8d %-8d %-5d %-8d %-8u %-8llu\n", s, s->sk_protocol, nlk->portid, nlk->groups ? (u32)nlk->groups[0] : 0, sk_rmem_alloc_get(s), sk_wmem_alloc_get(s), READ_ONCE(nlk->cb_running), refcount_read(&s->sk_refcnt), sk_drops_read(s), sock_i_ino(s) ); } return 0; } #ifdef CONFIG_BPF_SYSCALL struct bpf_iter__netlink { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct netlink_sock *, sk); }; DEFINE_BPF_ITER_FUNC(netlink, struct bpf_iter_meta *meta, struct netlink_sock *sk) static int netlink_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, void *v) { struct bpf_iter__netlink ctx; meta->seq_num--; /* skip SEQ_START_TOKEN */ ctx.meta = meta; ctx.sk = nlk_sk((struct sock *)v); return bpf_iter_run_prog(prog, &ctx); } static int netlink_seq_show(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; meta.seq = seq; prog = bpf_iter_get_info(&meta, false); if (!prog) return netlink_native_seq_show(seq, v); if (v != SEQ_START_TOKEN) return netlink_prog_seq_show(prog, &meta, v); return 0; } static void netlink_seq_stop(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)netlink_prog_seq_show(prog, &meta, v); } netlink_native_seq_stop(seq, v); } #else static int netlink_seq_show(struct seq_file *seq, void *v) { return netlink_native_seq_show(seq, v); } static void netlink_seq_stop(struct seq_file *seq, void *v) { netlink_native_seq_stop(seq, v); } #endif static const struct seq_operations netlink_seq_ops = { .start = netlink_seq_start, .next = netlink_seq_next, .stop = netlink_seq_stop, .show = netlink_seq_show, }; #endif int netlink_register_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&netlink_chain, nb); } EXPORT_SYMBOL(netlink_register_notifier); int netlink_unregister_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&netlink_chain, nb); } EXPORT_SYMBOL(netlink_unregister_notifier); static const struct proto_ops netlink_ops = { .family = PF_NETLINK, .owner = THIS_MODULE, .release = netlink_release, .bind = netlink_bind, .connect = netlink_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = netlink_getname, .poll = datagram_poll, .ioctl = netlink_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = netlink_setsockopt, .getsockopt = netlink_getsockopt, .sendmsg = netlink_sendmsg, .recvmsg = netlink_recvmsg, .mmap = sock_no_mmap, }; static const struct net_proto_family netlink_family_ops = { .family = PF_NETLINK, .create = netlink_create, .owner = THIS_MODULE, /* for consistency 8) */ }; static int __net_init netlink_net_init(struct net *net) { #ifdef 
CONFIG_PROC_FS if (!proc_create_net("netlink", 0, net->proc_net, &netlink_seq_ops, sizeof(struct nl_seq_iter))) return -ENOMEM; #endif return 0; } static void __net_exit netlink_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS remove_proc_entry("netlink", net->proc_net); #endif } static void __init netlink_add_usersock_entry(void) { struct listeners *listeners; int groups = 32; listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL); if (!listeners) panic("netlink_add_usersock_entry: Cannot allocate listeners\n"); netlink_table_grab(); nl_table[NETLINK_USERSOCK].groups = groups; rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners); nl_table[NETLINK_USERSOCK].module = THIS_MODULE; nl_table[NETLINK_USERSOCK].registered = 1; nl_table[NETLINK_USERSOCK].flags = NL_CFG_F_NONROOT_SEND; netlink_table_ungrab(); } static struct pernet_operations __net_initdata netlink_net_ops = { .init = netlink_net_init, .exit = netlink_net_exit, }; static inline u32 netlink_hash(const void *data, u32 len, u32 seed) { const struct netlink_sock *nlk = data; struct netlink_compare_arg arg; netlink_compare_arg_init(&arg, sock_net(&nlk->sk), nlk->portid); return jhash2((u32 *)&arg, netlink_compare_arg_len / sizeof(u32), seed); } static const struct rhashtable_params netlink_rhashtable_params = { .head_offset = offsetof(struct netlink_sock, node), .key_len = netlink_compare_arg_len, .obj_hashfn = netlink_hash, .obj_cmpfn = netlink_compare, .automatic_shrinking = true, }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) BTF_ID_LIST_SINGLE(btf_netlink_sock_id, struct, netlink_sock) static const struct bpf_iter_seq_info netlink_seq_info = { .seq_ops = &netlink_seq_ops, .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct nl_seq_iter), }; static struct bpf_iter_reg netlink_reg_info = { .target = "netlink", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__netlink, sk), PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &netlink_seq_info, }; static int __init bpf_iter_register(void) { netlink_reg_info.ctx_arg_info[0].btf_id = *btf_netlink_sock_id; return bpf_iter_reg_target(&netlink_reg_info); } #endif static int __init netlink_proto_init(void) { int i; int err = proto_register(&netlink_proto, 0); if (err != 0) goto out; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) err = bpf_iter_register(); if (err) goto out; #endif BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof_field(struct sk_buff, cb)); nl_table = kzalloc_objs(*nl_table, MAX_LINKS); if (!nl_table) goto panic; for (i = 0; i < MAX_LINKS; i++) { if (rhashtable_init(&nl_table[i].hash, &netlink_rhashtable_params) < 0) goto panic; } netlink_add_usersock_entry(); sock_register(&netlink_family_ops); register_pernet_subsys(&netlink_net_ops); register_pernet_subsys(&netlink_tap_net_ops); /* The netlink device handler may be needed early. */ rtnetlink_init(); out: return err; panic: panic("netlink_init: Cannot allocate nl_table\n"); } core_initcall(netlink_proto_init); |
// SPDX-License-Identifier: GPL-2.0-only /* * IPv6 library code, needed by static components when full IPv6 support is * not configured or static. */ #include <linux/export.h> #include <net/ipv6.h> /* * find out if nexthdr is a well-known extension header or a protocol */ bool ipv6_ext_hdr(u8 nexthdr) { /* * find out if nexthdr is an extension header or a protocol */ return (nexthdr == NEXTHDR_HOP) || (nexthdr == NEXTHDR_ROUTING) || (nexthdr == NEXTHDR_FRAGMENT) || (nexthdr == NEXTHDR_AUTH) || (nexthdr == NEXTHDR_NONE) || (nexthdr == NEXTHDR_DEST); } EXPORT_SYMBOL(ipv6_ext_hdr); /* * Skip any extension headers. This is used by the ICMP module. * * Note that strictly speaking this conflicts with RFC 2460 4.0: * ...The contents and semantics of each extension header determine whether * or not to proceed to the next header. Therefore, extension headers must * be processed strictly in the order they appear in the packet; a * receiver must not, for example, scan through a packet looking for a * particular kind of extension header and process that header prior to * processing all preceding ones. * * We do exactly this. This is a protocol bug. We can't decide after * seeing an unknown discard-with-error flavour TLV option if it's an * ICMP error message or not (errors should never be sent in reply to * ICMP error messages). * * But I see no other way to do this. This might need to be reexamined * when Linux implements ESP (and maybe AUTH) headers. * --AK * * This function parses (probably truncated) exthdr set "hdr". * "nexthdrp" initially points to some place, * where type of the first header can be found. * * It skips all well-known exthdrs, and returns pointer to the start * of unparsable area i.e. the first header with unknown type. * If it is not NULL *nexthdr is updated by type/protocol of this header. * * NOTES: - if packet terminated with NEXTHDR_NONE it returns NULL. * - it may return pointer pointing beyond end of packet, * if the last recognized header is truncated in the middle. * - if packet is truncated, so that all parsed headers are skipped, * it returns NULL. * - First fragment header is skipped, not-first ones * are considered as unparsable. * - Reports the offset field of the final fragment header so it is * possible to tell whether this is a first fragment, later fragment, * or not fragmented. * - ESP is unparsable for now and considered like * normal payload protocol. * - Note also special handling of AUTH header. Thanks to IPsec wizards. 
* * --ANK (980726) */ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, __be16 *frag_offp) { u8 nexthdr = *nexthdrp; *frag_offp = 0; while (ipv6_ext_hdr(nexthdr)) { struct ipv6_opt_hdr _hdr, *hp; int hdrlen; if (nexthdr == NEXTHDR_NONE) return -1; hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); if (!hp) return -1; if (nexthdr == NEXTHDR_FRAGMENT) { __be16 _frag_off, *fp; fp = skb_header_pointer(skb, start+offsetof(struct frag_hdr, frag_off), sizeof(_frag_off), &_frag_off); if (!fp) return -1; *frag_offp = *fp; if (ntohs(*frag_offp) & ~0x7) break; hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) hdrlen = ipv6_authlen(hp); else hdrlen = ipv6_optlen(hp); nexthdr = hp->nexthdr; start += hdrlen; } *nexthdrp = nexthdr; return start; } EXPORT_SYMBOL(ipv6_skip_exthdr); int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type) { const unsigned char *nh = skb_network_header(skb); int packet_len = skb_tail_pointer(skb) - skb_network_header(skb); struct ipv6_opt_hdr *hdr; int len; if (offset + 2 > packet_len) goto bad; hdr = (struct ipv6_opt_hdr *)(nh + offset); len = ((hdr->hdrlen + 1) << 3); if (offset + len > packet_len) goto bad; offset += 2; len -= 2; while (len > 0) { int opttype = nh[offset]; int optlen; if (opttype == type) return offset; switch (opttype) { case IPV6_TLV_PAD1: optlen = 1; break; default: if (len < 2) goto bad; optlen = nh[offset + 1] + 2; if (optlen > len) goto bad; break; } offset += optlen; len -= optlen; } /* not_found */ bad: return -1; } EXPORT_SYMBOL_GPL(ipv6_find_tlv); /* * find the offset to specified header or the protocol number of last header * if target < 0. "last header" is transport protocol header, ESP, or * "No next header". * * Note that *offset is used as input/output parameter, and if it is not zero, * then it must be a valid offset to an inner IPv6 header. This can be used * to explore inner IPv6 header, eg. ICMPv6 error messages. * * If target header is found, its offset is set in *offset and return protocol * number. Otherwise, return -1. * * If the first fragment doesn't contain the final protocol header or * NEXTHDR_NONE it is considered invalid. * * Note that non-1st fragment is special case that "the protocol number * of last header" is "next header" field in Fragment header. In this case, * *offset is meaningless and fragment offset is stored in *fragoff if fragoff * isn't NULL. * * if flags is not NULL and it's a fragment, then the frag flag * IP6_FH_F_FRAG will be set. If it's an AH header, the * IP6_FH_F_AUTH flag is set and target < 0, then this function will * stop at the AH header. If IP6_FH_F_SKIP_RH flag was passed, then this * function will skip all those routing headers, where segements_left was 0. 
*/ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, int target, unsigned short *fragoff, int *flags) { unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); u8 nexthdr = ipv6_hdr(skb)->nexthdr; bool found; if (fragoff) *fragoff = 0; if (*offset) { struct ipv6hdr _ip6, *ip6; ip6 = skb_header_pointer(skb, *offset, sizeof(_ip6), &_ip6); if (!ip6 || (ip6->version != 6)) return -EBADMSG; start = *offset + sizeof(struct ipv6hdr); nexthdr = ip6->nexthdr; } do { struct ipv6_opt_hdr _hdr, *hp; unsigned int hdrlen; found = (nexthdr == target); if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) { if (target < 0 || found) break; return -ENOENT; } hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); if (!hp) return -EBADMSG; if (nexthdr == NEXTHDR_ROUTING) { struct ipv6_rt_hdr _rh, *rh; rh = skb_header_pointer(skb, start, sizeof(_rh), &_rh); if (!rh) return -EBADMSG; if (flags && (*flags & IP6_FH_F_SKIP_RH) && rh->segments_left == 0) found = false; } if (nexthdr == NEXTHDR_FRAGMENT) { unsigned short _frag_off; __be16 *fp; if (flags) /* Indicate that this is a fragment */ *flags |= IP6_FH_F_FRAG; fp = skb_header_pointer(skb, start+offsetof(struct frag_hdr, frag_off), sizeof(_frag_off), &_frag_off); if (!fp) return -EBADMSG; _frag_off = ntohs(*fp) & ~0x7; if (_frag_off) { if (target < 0 && ((!ipv6_ext_hdr(hp->nexthdr)) || hp->nexthdr == NEXTHDR_NONE)) { if (fragoff) *fragoff = _frag_off; return hp->nexthdr; } if (!found) return -ENOENT; if (fragoff) *fragoff = _frag_off; break; } hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) { if (flags && (*flags & IP6_FH_F_AUTH) && (target < 0)) break; hdrlen = ipv6_authlen(hp); } else hdrlen = ipv6_optlen(hp); if (!found) { nexthdr = hp->nexthdr; start += hdrlen; } } while (!found); *offset = start; return nexthdr; } EXPORT_SYMBOL(ipv6_find_hdr); |
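/*
 * Illustrative usage sketch (not part of the file above): this is roughly how
 * a caller such as an ICMPv6 or netfilter helper walks past the extension
 * headers to reach the transport protocol. The function name
 * "example_get_transport" is hypothetical.
 */
static int example_get_transport(const struct sk_buff *skb, u8 *proto)
{
	u8 nexthdr = ipv6_hdr(skb)->nexthdr;
	__be16 frag_off;
	int thoff;

	/* Start right after the fixed 40-byte IPv6 header */
	thoff = ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(struct ipv6hdr),
				 &nexthdr, &frag_off);
	if (thoff < 0)
		return -EINVAL;		/* truncated chain or NEXTHDR_NONE */

	/* Non-first fragments carry no transport header worth parsing */
	if (ntohs(frag_off) & ~0x7)
		return -EINVAL;

	*proto = nexthdr;		/* e.g. IPPROTO_TCP, IPPROTO_UDP, ... */
	return thoff;			/* offset of the transport header */
}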
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FS_SUPER_H #define _LINUX_FS_SUPER_H #include <linux/fs/super_types.h> #include <linux/unicode.h> /* * These are internal functions, please use sb_start_{write,pagefault,intwrite} * instead. */ static inline void __sb_end_write(struct super_block *sb, int level) { percpu_up_read(sb->s_writers.rw_sem + level - 1); } static inline void __sb_start_write(struct super_block *sb, int level) { percpu_down_read_freezable(sb->s_writers.rw_sem + level - 1, true); } static inline bool __sb_start_write_trylock(struct super_block *sb, int level) { return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1); } #define __sb_writers_acquired(sb, lev) \ percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev) - 1], 1, _THIS_IP_) #define __sb_writers_release(sb, lev) \ percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev) - 1], _THIS_IP_) /** * __sb_write_started - check if sb freeze level is held * @sb: the super we write to * @level: the freeze level * * * > 0 - sb freeze level is held * * 0 - sb freeze level is not held * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN */ static inline int __sb_write_started(const struct super_block *sb, int level) { return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1); } /** * sb_write_started - check if SB_FREEZE_WRITE is held * @sb: the super we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. */ static inline bool sb_write_started(const struct super_block *sb) { return __sb_write_started(sb, SB_FREEZE_WRITE); } /** * sb_write_not_started - check if SB_FREEZE_WRITE is not held * @sb: the super we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. */ static inline bool sb_write_not_started(const struct super_block *sb) { return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0; } /** * sb_end_write - drop write access to a superblock * @sb: the super we wrote to * * Decrement number of writers to the filesystem. Wake up possible waiters * wanting to freeze the filesystem. */ static inline void sb_end_write(struct super_block *sb) { __sb_end_write(sb, SB_FREEZE_WRITE); } /** * sb_end_pagefault - drop write access to a superblock from a page fault * @sb: the super we wrote to * * Decrement number of processes handling write page fault to the filesystem. * Wake up possible waiters wanting to freeze the filesystem. */ static inline void sb_end_pagefault(struct super_block *sb) { __sb_end_write(sb, SB_FREEZE_PAGEFAULT); } /** * sb_end_intwrite - drop write access to a superblock for internal fs purposes * @sb: the super we wrote to * * Decrement fs-internal number of writers to the filesystem.
Wake up possible * waiters wanting to freeze the filesystem. */ static inline void sb_end_intwrite(struct super_block *sb) { __sb_end_write(sb, SB_FREEZE_FS); } /** * sb_start_write - get write access to a superblock * @sb: the super we write to * * When a process wants to write data or metadata to a file system (i.e. dirty * a page or an inode), it should embed the operation in a sb_start_write() - * sb_end_write() pair to get exclusion against file system freezing. This * function increments number of writers preventing freezing. If the file * system is already frozen, the function waits until the file system is * thawed. * * Since freeze protection behaves as a lock, users have to preserve * ordering of freeze protection and other filesystem locks. Generally, * freeze protection should be the outermost lock. In particular, we have: * * sb_start_write * -> i_rwsem (write path, truncate, directory ops, ...) * -> s_umount (freeze_super, thaw_super) */ static inline void sb_start_write(struct super_block *sb) { __sb_start_write(sb, SB_FREEZE_WRITE); } DEFINE_GUARD(super_write, struct super_block *, sb_start_write(_T), sb_end_write(_T)) static inline bool sb_start_write_trylock(struct super_block *sb) { return __sb_start_write_trylock(sb, SB_FREEZE_WRITE); } /** * sb_start_pagefault - get write access to a superblock from a page fault * @sb: the super we write to * * When a process starts handling write page fault, it should embed the * operation into sb_start_pagefault() - sb_end_pagefault() pair to get * exclusion against file system freezing. This is needed since the page fault * is going to dirty a page. This function increments number of running page * faults preventing freezing. If the file system is already frozen, the * function waits until the file system is thawed. * * Since page fault freeze protection behaves as a lock, users have to preserve * ordering of freeze protection and other filesystem locks. It is advised to * put sb_start_pagefault() close to mmap_lock in lock ordering. Page fault * handling code implies lock dependency: * * mmap_lock * -> sb_start_pagefault */ static inline void sb_start_pagefault(struct super_block *sb) { __sb_start_write(sb, SB_FREEZE_PAGEFAULT); } /** * sb_start_intwrite - get write access to a superblock for internal fs purposes * @sb: the super we write to * * This is the third level of protection against filesystem freezing. It is * free for use by a filesystem. The only requirement is that it must rank * below sb_start_pagefault. * * For example filesystem can call sb_start_intwrite() when starting a * transaction which somewhat eases handling of freezing for internal sources * of filesystem changes (internal fs threads, discarding preallocation on file * close, etc.). 
*/ static inline void sb_start_intwrite(struct super_block *sb) { __sb_start_write(sb, SB_FREEZE_FS); } static inline bool sb_start_intwrite_trylock(struct super_block *sb) { return __sb_start_write_trylock(sb, SB_FREEZE_FS); } static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags & SB_RDONLY; } static inline bool sb_is_blkdev_sb(struct super_block *sb) { return IS_ENABLED(CONFIG_BLOCK) && sb == blockdev_superblock; } #if IS_ENABLED(CONFIG_UNICODE) static inline struct unicode_map *sb_encoding(const struct super_block *sb) { return sb->s_encoding; } /* Compare if two super blocks have the same encoding and flags */ static inline bool sb_same_encoding(const struct super_block *sb1, const struct super_block *sb2) { if (sb1->s_encoding == sb2->s_encoding) return true; return (sb1->s_encoding && sb2->s_encoding && (sb1->s_encoding->version == sb2->s_encoding->version) && (sb1->s_encoding_flags == sb2->s_encoding_flags)); } #else static inline struct unicode_map *sb_encoding(const struct super_block *sb) { return NULL; } static inline bool sb_same_encoding(const struct super_block *sb1, const struct super_block *sb2) { return true; } #endif static inline bool sb_has_encoding(const struct super_block *sb) { return !!sb_encoding(sb); } int sb_set_blocksize(struct super_block *sb, int size); int __must_check sb_min_blocksize(struct super_block *sb, int size); int freeze_super(struct super_block *super, enum freeze_holder who, const void *freeze_owner); int thaw_super(struct super_block *super, enum freeze_holder who, const void *freeze_owner); #endif /* _LINUX_FS_SUPER_H */ |
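/*
 * Illustrative usage sketch (not part of the header above): a filesystem write
 * path brackets the operation with freeze protection so it cannot race with
 * freeze_super(). "example_dirty_data" is a hypothetical function; with the
 * super_write guard defined above, guard(super_write)(sb) can replace the
 * explicit pair.
 */
static void example_dirty_data(struct super_block *sb)
{
	sb_start_write(sb);	/* waits here if the fs is currently frozen */
	/* ... dirty pages or inodes under SB_FREEZE_WRITE protection ... */
	sb_end_write(sb);	/* lets a pending freezer make progress */
}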
// SPDX-License-Identifier: GPL-2.0 /* * This file contains functions which manage clock event devices.
* * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner <tglx@kernel.org> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ #include <linux/clockchips.h> #include <linux/hrtimer.h> #include <linux/init.h> #include <linux/module.h> #include <linux/smp.h> #include <linux/device.h> #include "tick-internal.h" /* The registered clock event devices */ static LIST_HEAD(clockevent_devices); static LIST_HEAD(clockevents_released); /* Protection for the above */ static DEFINE_RAW_SPINLOCK(clockevents_lock); /* Protection for unbind operations */ static DEFINE_MUTEX(clockevents_mutex); struct ce_unbind { struct clock_event_device *ce; int res; }; static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, bool ismax) { u64 clc = (u64) latch << evt->shift; u64 rnd; if (WARN_ON(!evt->mult)) evt->mult = 1; rnd = (u64) evt->mult - 1; /* * Upper bound sanity check. If the backwards conversion is * not equal latch, we know that the above shift overflowed. */ if ((clc >> evt->shift) != (u64)latch) clc = ~0ULL; /* * Scaled math oddities: * * For mult <= (1 << shift) we can safely add mult - 1 to * prevent integer rounding loss. So the backwards conversion * from nsec to device ticks will be correct. * * For mult > (1 << shift), i.e. device frequency is > 1GHz we * need to be careful. Adding mult - 1 will result in a value * which when converted back to device ticks can be larger * than latch by up to (mult - 1) >> shift. For the min_delta * calculation we still want to apply this in order to stay * above the minimum device ticks limit. For the upper limit * we would end up with a latch value larger than the upper * limit of the device, so we omit the add to stay below the * device upper boundary. * * Also omit the add if it would overflow the u64 boundary. */ if ((~0ULL - clc > rnd) && (!ismax || evt->mult <= (1ULL << evt->shift))) clc += rnd; do_div(clc, evt->mult); /* Deltas less than 1usec are pointless noise */ return clc > 1000 ? clc : 1000; } /** * clockevent_delta2ns - Convert a latch value (device ticks) to nanoseconds * @latch: value to convert * @evt: pointer to clock event device descriptor * * Math helper, returns latch value converted to nanoseconds (bound checked) */ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) { return cev_delta2ns(latch, evt, false); } EXPORT_SYMBOL_GPL(clockevent_delta2ns); static int __clockevents_switch_state(struct clock_event_device *dev, enum clock_event_state state) { if (dev->features & CLOCK_EVT_FEAT_DUMMY) return 0; /* Transition with new state-specific callbacks */ switch (state) { case CLOCK_EVT_STATE_DETACHED: /* The clockevent device is getting replaced. Shut it down. 
*/ case CLOCK_EVT_STATE_SHUTDOWN: if (dev->set_state_shutdown) return dev->set_state_shutdown(dev); return 0; case CLOCK_EVT_STATE_PERIODIC: /* Core internal bug */ if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC)) return -ENOSYS; if (dev->set_state_periodic) return dev->set_state_periodic(dev); return 0; case CLOCK_EVT_STATE_ONESHOT: /* Core internal bug */ if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) return -ENOSYS; if (dev->set_state_oneshot) return dev->set_state_oneshot(dev); return 0; case CLOCK_EVT_STATE_ONESHOT_STOPPED: /* Core internal bug */ if (WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n", clockevent_get_state(dev))) return -EINVAL; if (dev->set_state_oneshot_stopped) return dev->set_state_oneshot_stopped(dev); else return -ENOSYS; default: return -ENOSYS; } } /** * clockevents_switch_state - set the operating state of a clock event device * @dev: device to modify * @state: new state * * Must be called with interrupts disabled ! */ void clockevents_switch_state(struct clock_event_device *dev, enum clock_event_state state) { if (clockevent_get_state(dev) != state) { if (__clockevents_switch_state(dev, state)) return; clockevent_set_state(dev, state); /* * A nsec2cyc multiplicator of 0 is invalid and we'd crash * on it, so fix it up and emit a warning: */ if (clockevent_state_oneshot(dev)) { if (WARN_ON(!dev->mult)) dev->mult = 1; } } } /** * clockevents_shutdown - shutdown the device and clear next_event * @dev: device to shutdown */ void clockevents_shutdown(struct clock_event_device *dev) { clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); dev->next_event = KTIME_MAX; dev->next_event_forced = 0; } /** * clockevents_tick_resume - Resume the tick device before using it again * @dev: device to resume */ int clockevents_tick_resume(struct clock_event_device *dev) { int ret = 0; if (dev->tick_resume) ret = dev->tick_resume(dev); return ret; } #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST /* Limit min_delta to a jiffy */ #define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) /** * clockevents_increase_min_delta - raise minimum delta of a clock event device * @dev: device to increase the minimum delta * * Returns 0 on success, -ETIME when the minimum delta reached the limit. */ static int clockevents_increase_min_delta(struct clock_event_device *dev) { /* Nothing to do if we already reached the limit */ if (dev->min_delta_ns >= MIN_DELTA_LIMIT) { printk_deferred(KERN_WARNING "CE: Reprogramming failure. Giving up\n"); dev->next_event = KTIME_MAX; return -ETIME; } if (dev->min_delta_ns < 5000) dev->min_delta_ns = 5000; else dev->min_delta_ns += dev->min_delta_ns >> 1; if (dev->min_delta_ns > MIN_DELTA_LIMIT) dev->min_delta_ns = MIN_DELTA_LIMIT; printk_deferred(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", dev->name ? dev->name : "?", (unsigned long long) dev->min_delta_ns); return 0; } /** * clockevents_program_min_delta - Set clock event device to the minimum delay. * @dev: device to program * * Returns 0 on success, -ETIME when the retry loop failed. */ static int clockevents_program_min_delta(struct clock_event_device *dev) { unsigned long long clc; int64_t delta; int i; for (i = 0;;) { delta = dev->min_delta_ns; dev->next_event = ktime_add_ns(ktime_get(), delta); if (clockevent_state_shutdown(dev)) return 0; dev->retries++; clc = ((unsigned long long) delta * dev->mult) >> dev->shift; if (dev->set_next_event((unsigned long) clc, dev) == 0) return 0; if (++i > 2) { /* * We tried 3 times to program the device with the * given min_delta_ns. 
Try to increase the minimum * delta, if that fails as well get out of here. */ if (clockevents_increase_min_delta(dev)) return -ETIME; i = 0; } } } #else /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ /** * clockevents_program_min_delta - Set clock event device to the minimum delay. * @dev: device to program * * Returns 0 on success, -ETIME when the retry loop failed. */ static int clockevents_program_min_delta(struct clock_event_device *dev) { unsigned long long clc; int64_t delta = 0; int i; for (i = 0; i < 10; i++) { delta += dev->min_delta_ns; dev->next_event = ktime_add_ns(ktime_get(), delta); if (clockevent_state_shutdown(dev)) return 0; dev->retries++; clc = ((unsigned long long) delta * dev->mult) >> dev->shift; if (dev->set_next_event((unsigned long) clc, dev) == 0) return 0; } return -ETIME; } #endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ #ifdef CONFIG_GENERIC_CLOCKEVENTS_COUPLED #ifdef CONFIG_GENERIC_CLOCKEVENTS_COUPLED_INLINE #include <asm/clock_inlined.h> #else static __always_inline void arch_inlined_clockevent_set_next_coupled(u64 cycles, struct clock_event_device *dev) { } #endif static inline bool clockevent_set_next_coupled(struct clock_event_device *dev, ktime_t expires) { u64 cycles; if (unlikely(!(dev->features & CLOCK_EVT_FEAT_CLOCKSOURCE_COUPLED))) return false; if (unlikely(!ktime_expiry_to_cycles(dev->cs_id, expires, &cycles))) return false; if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED_INLINE)) arch_inlined_clockevent_set_next_coupled(cycles, dev); else dev->set_next_coupled(cycles, dev); return true; } #else static inline bool clockevent_set_next_coupled(struct clock_event_device *dev, ktime_t expires) { return false; } #endif /** * clockevents_program_event - Reprogram the clock event device. * @dev: device to program * @expires: absolute expiry time (monotonic clock) * @force: program minimum delay if expires cannot be set * * Returns 0 on success, -ETIME when the event is in the past. */ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, bool force) { int64_t delta; u64 cycles; if (WARN_ON_ONCE(expires < 0)) return -ETIME; dev->next_event = expires; if (clockevent_state_shutdown(dev)) return 0; /* We must be in ONESHOT state here */ WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n", clockevent_get_state(dev)); /* ktime_t based reprogramming for the broadcast hrtimer device */ if (unlikely(dev->features & CLOCK_EVT_FEAT_HRTIMER)) return dev->set_next_ktime(expires, dev); if (likely(clockevent_set_next_coupled(dev, expires))) return 0; delta = ktime_to_ns(ktime_sub(expires, ktime_get())); /* Required for tick_periodic() during early boot */ if (delta <= 0 && !force) return -ETIME; if (delta > (int64_t)dev->min_delta_ns) { delta = min(delta, (int64_t) dev->max_delta_ns); cycles = ((u64)delta * dev->mult) >> dev->shift; if (!dev->set_next_event((unsigned long) cycles, dev)) return 0; } if (dev->next_event_forced) return 0; if (dev->set_next_event(dev->min_delta_ticks, dev)) { if (!force || clockevents_program_min_delta(dev)) return -ETIME; } dev->next_event_forced = 1; return 0; } /* * Called after a clockevent has been added which might * have replaced a current regular or broadcast device. A * released normal device might be a suitable replacement * for the current broadcast device. Similarly a released * broadcast device might be a suitable replacement for a * normal device.
*/ static void clockevents_notify_released(void) { struct clock_event_device *dev; /* * Keep iterating as long as tick_check_new_device() * replaces a device. */ while (!list_empty(&clockevents_released)) { dev = list_entry(clockevents_released.next, struct clock_event_device, list); list_move(&dev->list, &clockevent_devices); tick_check_new_device(dev); } } /* * Try to install a replacement clock event device */ static int clockevents_replace(struct clock_event_device *ced) { struct clock_event_device *dev, *newdev = NULL; list_for_each_entry(dev, &clockevent_devices, list) { if (dev == ced || !clockevent_state_detached(dev)) continue; if (!tick_check_replacement(newdev, dev)) continue; if (!try_module_get(dev->owner)) continue; if (newdev) module_put(newdev->owner); newdev = dev; } if (newdev) { tick_install_replacement(newdev); list_del_init(&ced->list); } return newdev ? 0 : -EBUSY; } /* * Called with clockevents_mutex and clockevents_lock held */ static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) { /* Fast track. Device is unused */ if (clockevent_state_detached(ced)) { list_del_init(&ced->list); return 0; } return ced == per_cpu(tick_cpu_device, cpu).evtdev ? -EAGAIN : -EBUSY; } /* * SMP function call to unbind a device */ static void __clockevents_unbind(void *arg) { struct ce_unbind *cu = arg; int res; raw_spin_lock(&clockevents_lock); res = __clockevents_try_unbind(cu->ce, smp_processor_id()); if (res == -EAGAIN) res = clockevents_replace(cu->ce); cu->res = res; raw_spin_unlock(&clockevents_lock); } /* * Issues smp function call to unbind a per cpu device. Called with * clockevents_mutex held. */ static int clockevents_unbind(struct clock_event_device *ced, int cpu) { struct ce_unbind cu = { .ce = ced, .res = -ENODEV }; smp_call_function_single(cpu, __clockevents_unbind, &cu, 1); return cu.res; } /* * Unbind a clockevents device. */ int clockevents_unbind_device(struct clock_event_device *ced, int cpu) { int ret; mutex_lock(&clockevents_mutex); ret = clockevents_unbind(ced, cpu); mutex_unlock(&clockevents_mutex); return ret; } EXPORT_SYMBOL_GPL(clockevents_unbind_device); /** * clockevents_register_device - register a clock event device * @dev: device to register */ void clockevents_register_device(struct clock_event_device *dev) { unsigned long flags; /* Initialize state to DETACHED */ clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); if (!dev->cpumask) { WARN_ON(num_possible_cpus() > 1); dev->cpumask = cpumask_of(smp_processor_id()); } if (dev->cpumask == cpu_all_mask) { WARN(1, "%s cpumask == cpu_all_mask, using cpu_possible_mask instead\n", dev->name); dev->cpumask = cpu_possible_mask; } raw_spin_lock_irqsave(&clockevents_lock, flags); list_add(&dev->list, &clockevent_devices); tick_check_new_device(dev); clockevents_notify_released(); raw_spin_unlock_irqrestore(&clockevents_lock, flags); } EXPORT_SYMBOL_GPL(clockevents_register_device); static void clockevents_config(struct clock_event_device *dev, u32 freq) { u64 sec; if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) return; /* * Calculate the maximum number of seconds we can sleep. Limit * to 10 minutes for hardware which can program more than * 32bit ticks so we still get reasonable conversion values. 
*/ sec = dev->max_delta_ticks; do_div(sec, freq); if (!sec) sec = 1; else if (sec > 600 && dev->max_delta_ticks > UINT_MAX) sec = 600; clockevents_calc_mult_shift(dev, freq, sec); dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false); dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true); } /** * clockevents_config_and_register - Configure and register a clock event device * @dev: device to register * @freq: The clock frequency * @min_delta: The minimum clock ticks to program in oneshot mode * @max_delta: The maximum clock ticks to program in oneshot mode * * min/max_delta can be 0 for devices which do not support oneshot mode. */ void clockevents_config_and_register(struct clock_event_device *dev, u32 freq, unsigned long min_delta, unsigned long max_delta) { dev->min_delta_ticks = min_delta; dev->max_delta_ticks = max_delta; clockevents_config(dev, freq); clockevents_register_device(dev); } EXPORT_SYMBOL_GPL(clockevents_config_and_register); int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) { clockevents_config(dev, freq); if (clockevent_state_oneshot(dev)) return clockevents_program_event(dev, dev->next_event, false); if (clockevent_state_periodic(dev)) return __clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); return 0; } /** * clockevents_update_freq - Update frequency and reprogram a clock event device. * @dev: device to modify * @freq: new device frequency * * Reconfigure and reprogram a clock event device in oneshot * mode. Must be called on the cpu for which the device delivers per * cpu timer events. If called for the broadcast device the core takes * care of serialization. * * Returns 0 on success, -ETIME when the event is in the past. */ int clockevents_update_freq(struct clock_event_device *dev, u32 freq) { unsigned long flags; int ret; local_irq_save(flags); ret = tick_broadcast_update_freq(dev, freq); if (ret == -ENODEV) ret = __clockevents_update_freq(dev, freq); local_irq_restore(flags); return ret; } /* * Noop handler when we shut down an event device */ void clockevents_handle_noop(struct clock_event_device *dev) { } /** * clockevents_exchange_device - release and request clock devices * @old: device to release (can be NULL) * @new: device to request (can be NULL) * * Called from various tick functions with clockevents_lock held and * interrupts disabled. */ void clockevents_exchange_device(struct clock_event_device *old, struct clock_event_device *new) { /* * Caller releases a clock event device. We queue it into the * released list and do a notify add later. */ if (old) { module_put(old->owner); clockevents_switch_state(old, CLOCK_EVT_STATE_DETACHED); list_move(&old->list, &clockevents_released); } if (new) { BUG_ON(!clockevent_state_detached(new)); clockevents_shutdown(new); } } /** * clockevents_suspend - suspend clock devices */ void clockevents_suspend(void) { struct clock_event_device *dev; list_for_each_entry_reverse(dev, &clockevent_devices, list) if (dev->suspend && !clockevent_state_detached(dev)) dev->suspend(dev); } /** * clockevents_resume - resume clock devices */ void clockevents_resume(void) { struct clock_event_device *dev; list_for_each_entry(dev, &clockevent_devices, list) if (dev->resume && !clockevent_state_detached(dev)) dev->resume(dev); } #ifdef CONFIG_HOTPLUG_CPU /** * tick_offline_cpu - Shutdown all clock events related * to this CPU and take it out of the * broadcast mechanism. * @cpu: The outgoing CPU * * Called by the dying CPU during teardown. 
*/ void tick_offline_cpu(unsigned int cpu) { struct clock_event_device *dev, *tmp; raw_spin_lock(&clockevents_lock); tick_broadcast_offline(cpu); tick_shutdown(); /* * Unregister the clock event devices which were * released above. */ list_for_each_entry_safe(dev, tmp, &clockevents_released, list) list_del(&dev->list); /* * Now check whether the CPU has left unused per cpu devices */ list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { if (cpumask_test_cpu(cpu, dev->cpumask) && cpumask_weight(dev->cpumask) == 1 && !tick_is_broadcast_device(dev)) { BUG_ON(!clockevent_state_detached(dev)); list_del(&dev->list); } } raw_spin_unlock(&clockevents_lock); } #endif #ifdef CONFIG_SYSFS static const struct bus_type clockevents_subsys = { .name = "clockevents", .dev_name = "clockevent", }; static DEFINE_PER_CPU(struct device, tick_percpu_dev); static struct tick_device *tick_get_tick_dev(struct device *dev); static ssize_t current_device_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tick_device *td; ssize_t count = 0; raw_spin_lock_irq(&clockevents_lock); td = tick_get_tick_dev(dev); if (td && td->evtdev) count = sysfs_emit(buf, "%s\n", td->evtdev->name); raw_spin_unlock_irq(&clockevents_lock); return count; } static DEVICE_ATTR_RO(current_device); /* We don't support the abomination of removable broadcast devices */ static ssize_t unbind_device_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { char name[CS_NAME_LEN]; ssize_t ret = sysfs_get_uname(buf, name, count); struct clock_event_device *ce = NULL, *iter; if (ret < 0) return ret; ret = -ENODEV; mutex_lock(&clockevents_mutex); raw_spin_lock_irq(&clockevents_lock); list_for_each_entry(iter, &clockevent_devices, list) { if (!strcmp(iter->name, name)) { ret = __clockevents_try_unbind(iter, dev->id); ce = iter; break; } } raw_spin_unlock_irq(&clockevents_lock); /* * We hold clockevents_mutex, so ce can't go away */ if (ret == -EAGAIN) ret = clockevents_unbind(ce, dev->id); mutex_unlock(&clockevents_mutex); return ret ? ret : count; } static DEVICE_ATTR_WO(unbind_device); #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST static struct device tick_bc_dev = { .init_name = "broadcast", .id = 0, .bus = &clockevents_subsys, }; static struct tick_device *tick_get_tick_dev(struct device *dev) { return dev == &tick_bc_dev ? tick_get_broadcast_device() : &per_cpu(tick_cpu_device, dev->id); } static __init int tick_broadcast_init_sysfs(void) { int err = device_register(&tick_bc_dev); if (!err) err = device_create_file(&tick_bc_dev, &dev_attr_current_device); return err; } #else static struct tick_device *tick_get_tick_dev(struct device *dev) { return &per_cpu(tick_cpu_device, dev->id); } static inline int tick_broadcast_init_sysfs(void) { return 0; } #endif static int __init tick_init_sysfs(void) { int cpu; for_each_possible_cpu(cpu) { struct device *dev = &per_cpu(tick_percpu_dev, cpu); int err; dev->id = cpu; dev->bus = &clockevents_subsys; err = device_register(dev); if (!err) err = device_create_file(dev, &dev_attr_current_device); if (!err) err = device_create_file(dev, &dev_attr_unbind_device); if (err) return err; } return tick_broadcast_init_sysfs(); } static int __init clockevents_init_sysfs(void) { int err = subsys_system_register(&clockevents_subsys, NULL); if (!err) err = tick_init_sysfs(); return err; } device_initcall(clockevents_init_sysfs); #endif /* SYSFS */ |
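/*
 * Illustrative usage sketch (not part of the file above): the usual way a
 * per-CPU timer driver hands its device to this core. All "example_" names
 * and the 1 MHz frequency are hypothetical.
 */
static int example_set_next_event(unsigned long ticks,
				  struct clock_event_device *ce)
{
	/* program the hardware comparator 'ticks' device ticks from now */
	return 0;
}

static struct clock_event_device example_ce = {
	.name		= "example-timer",
	.features	= CLOCK_EVT_FEAT_ONESHOT,
	.rating		= 300,
	.set_next_event	= example_set_next_event,
};

static void example_timer_setup(void)
{
	example_ce.cpumask = cpumask_of(smp_processor_id());
	/*
	 * 1 MHz clock, at least 2 ticks per event, 32-bit comparator:
	 * clockevents_config() derives mult/shift and min/max_delta_ns,
	 * then the device is registered and picked up by the tick core.
	 */
	clockevents_config_and_register(&example_ce, 1000000, 2, 0xffffffff);
}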
// SPDX-License-Identifier: GPL-2.0 /* * DMA memory management for framework level HCD code (hc_driver) * * This implementation plugs in through generic "usb_bus" level methods, * and should work with all USB controllers, regardless of bus type. * * Released under the GPLv2 only. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/device.h> #include <linux/mm.h> #include <linux/io.h> #include <linux/dma-mapping.h> #include <linux/dmapool.h> #include <linux/genalloc.h> #include <linux/usb.h> #include <linux/usb/hcd.h> /* * DMA-Coherent Buffers */ /* FIXME tune these based on pool statistics ... */ static size_t pool_max[HCD_BUFFER_POOLS] = { 32, 128, 512, 2048, }; void __init usb_init_pool_max(void) { /* * The pool_max values must never be smaller than * ARCH_DMA_MINALIGN. */ if (ARCH_DMA_MINALIGN <= 32) ; /* Original value is okay */ else if (ARCH_DMA_MINALIGN <= 64) pool_max[0] = 64; else if (ARCH_DMA_MINALIGN <= 128) pool_max[0] = 0; /* Don't use this pool */ else BUILD_BUG(); /* We don't allow this */ } /* SETUP primitives */ /** * hcd_buffer_create - initialize buffer pools * @hcd: the bus whose buffer pools are to be initialized * * Context: task context, might sleep * * Call this as part of initializing a host controller that uses the dma * memory allocators. It initializes some pools of dma-coherent memory that * will be shared by all drivers using that controller. * * Call hcd_buffer_destroy() to clean up after using those pools. * * Return: 0 if successful. A negative errno value otherwise. */ int hcd_buffer_create(struct usb_hcd *hcd) { char name[16]; int i, size; if (hcd->localmem_pool || !hcd_uses_dma(hcd)) return 0; for (i = 0; i < HCD_BUFFER_POOLS; i++) { size = pool_max[i]; if (!size) continue; snprintf(name, sizeof(name), "buffer-%d", size); hcd->pool[i] = dma_pool_create(name, hcd->self.sysdev, size, size, 0); if (!hcd->pool[i]) { hcd_buffer_destroy(hcd); return -ENOMEM; } } return 0; } /** * hcd_buffer_destroy - deallocate buffer pools * @hcd: the bus whose buffer pools are to be destroyed * * Context: task context, might sleep * * This frees the buffer pools created by hcd_buffer_create(). */ void hcd_buffer_destroy(struct usb_hcd *hcd) { int i; if (!IS_ENABLED(CONFIG_HAS_DMA)) return; for (i = 0; i < HCD_BUFFER_POOLS; i++) { dma_pool_destroy(hcd->pool[i]); hcd->pool[i] = NULL; } } /* sometimes alloc/free could use kmalloc with GFP_DMA, for * better sharing and to leverage mm/slab.c intelligence.
*/ void *hcd_buffer_alloc( struct usb_bus *bus, size_t size, gfp_t mem_flags, dma_addr_t *dma ) { struct usb_hcd *hcd = bus_to_hcd(bus); int i; if (size == 0) return NULL; if (hcd->localmem_pool) return gen_pool_dma_alloc(hcd->localmem_pool, size, dma); /* some USB hosts just use PIO */ if (!hcd_uses_dma(hcd)) { *dma = ~(dma_addr_t) 0; return kmalloc(size, mem_flags); } for (i = 0; i < HCD_BUFFER_POOLS; i++) { if (size <= pool_max[i]) return dma_pool_alloc(hcd->pool[i], mem_flags, dma); } return dma_alloc_coherent(hcd->self.sysdev, size, dma, mem_flags); } void hcd_buffer_free( struct usb_bus *bus, size_t size, void *addr, dma_addr_t dma ) { struct usb_hcd *hcd = bus_to_hcd(bus); int i; if (!addr) return; if (hcd->localmem_pool) { gen_pool_free(hcd->localmem_pool, (unsigned long)addr, size); return; } if (!hcd_uses_dma(hcd)) { kfree(addr); return; } for (i = 0; i < HCD_BUFFER_POOLS; i++) { if (size <= pool_max[i]) { dma_pool_free(hcd->pool[i], addr, dma); return; } } dma_free_coherent(hcd->self.sysdev, size, addr, dma); } void *hcd_buffer_alloc_pages(struct usb_hcd *hcd, size_t size, gfp_t mem_flags, dma_addr_t *dma) { if (size == 0) return NULL; if (hcd->localmem_pool) return gen_pool_dma_alloc_align(hcd->localmem_pool, size, dma, PAGE_SIZE); /* some USB hosts just use PIO */ if (!hcd_uses_dma(hcd)) { *dma = DMA_MAPPING_ERROR; return (void *)__get_free_pages(mem_flags, get_order(size)); } return dma_alloc_coherent(hcd->self.sysdev, size, dma, mem_flags); } void hcd_buffer_free_pages(struct usb_hcd *hcd, size_t size, void *addr, dma_addr_t dma) { if (!addr) return; if (hcd->localmem_pool) { gen_pool_free(hcd->localmem_pool, (unsigned long)addr, size); return; } if (!hcd_uses_dma(hcd)) { free_pages((unsigned long)addr, get_order(size)); return; } dma_free_coherent(hcd->self.sysdev, size, addr, dma); } |
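/*
 * Illustrative usage sketch (not part of the file above): how HCD-level code
 * obtains and releases a small DMA-consistent transfer buffer through the
 * pools created by hcd_buffer_create(). "example_small_xfer" is hypothetical.
 */
static int example_small_xfer(struct usb_device *udev)
{
	dma_addr_t dma;
	void *buf;

	/* 64 bytes is served from the 128-byte pool (kmalloc on PIO-only hosts) */
	buf = hcd_buffer_alloc(udev->bus, 64, GFP_KERNEL, &dma);
	if (!buf)
		return -ENOMEM;

	/* ... fill buf and submit it, using 'dma' as the transfer_dma ... */

	hcd_buffer_free(udev->bus, 64, buf, dma);
	return 0;
}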
// SPDX-License-Identifier: GPL-2.0 /* * linux/mm/mlock.c * * (C) Copyright 1995 Linus Torvalds * (C) Copyright 2002 Christoph Hellwig */ #include <linux/capability.h> #include <linux/mman.h> #include <linux/mm.h> #include <linux/sched/user.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/pagemap.h>
#include <linux/folio_batch.h> #include <linux/pagewalk.h> #include <linux/mempolicy.h> #include <linux/syscalls.h> #include <linux/sched.h> #include <linux/export.h> #include <linux/rmap.h> #include <linux/mmzone.h> #include <linux/hugetlb.h> #include <linux/memcontrol.h> #include <linux/mm_inline.h> #include <linux/secretmem.h> #include "internal.h" struct mlock_fbatch { local_lock_t lock; struct folio_batch fbatch; }; static DEFINE_PER_CPU(struct mlock_fbatch, mlock_fbatch) = { .lock = INIT_LOCAL_LOCK(lock), }; bool can_do_mlock(void) { if (rlimit(RLIMIT_MEMLOCK) != 0) return true; if (capable(CAP_IPC_LOCK)) return true; return false; } EXPORT_SYMBOL(can_do_mlock); /* * Mlocked folios are marked with the PG_mlocked flag for efficient testing * in vmscan and, possibly, the fault path; and to support semi-accurate * statistics. * * An mlocked folio [folio_test_mlocked(folio)] is unevictable. As such, it * will be ostensibly placed on the LRU "unevictable" list (actually no such * list exists), rather than the [in]active lists. PG_unevictable is set to * indicate the unevictable state. */ static struct lruvec *__mlock_folio(struct folio *folio, struct lruvec *lruvec) { /* There is nothing more we can do while it's off LRU */ if (!folio_test_clear_lru(folio)) return lruvec; lruvec = folio_lruvec_relock_irq(folio, lruvec); if (unlikely(folio_evictable(folio))) { /* * This is a little surprising, but quite possible: PG_mlocked * must have got cleared already by another CPU. Could this * folio be unevictable? I'm not sure, but move it now if so. */ if (folio_test_unevictable(folio)) { lruvec_del_folio(lruvec, folio); folio_clear_unevictable(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(UNEVICTABLE_PGRESCUED, folio_nr_pages(folio)); } goto out; } if (folio_test_unevictable(folio)) { if (folio_test_mlocked(folio)) folio->mlock_count++; goto out; } lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_set_unevictable(folio); folio->mlock_count = !!folio_test_mlocked(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio)); out: folio_set_lru(folio); return lruvec; } static struct lruvec *__mlock_new_folio(struct folio *folio, struct lruvec *lruvec) { VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); lruvec = folio_lruvec_relock_irq(folio, lruvec); /* As above, this is a little surprising, but possible */ if (unlikely(folio_evictable(folio))) goto out; folio_set_unevictable(folio); folio->mlock_count = !!folio_test_mlocked(folio); __count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio)); out: lruvec_add_folio(lruvec, folio); folio_set_lru(folio); return lruvec; } static struct lruvec *__munlock_folio(struct folio *folio, struct lruvec *lruvec) { int nr_pages = folio_nr_pages(folio); bool isolated = false; if (!folio_test_clear_lru(folio)) goto munlock; isolated = true; lruvec = folio_lruvec_relock_irq(folio, lruvec); if (folio_test_unevictable(folio)) { /* Then mlock_count is maintained, but might undercount */ if (folio->mlock_count) folio->mlock_count--; if (folio->mlock_count) goto out; } /* else assume that was the last mlock: reclaim will fix it if not */ munlock: if (folio_test_clear_mlocked(folio)) { __zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages); if (isolated || !folio_test_unevictable(folio)) __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); else __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); } /* folio_evictable() has to be checked *after* clearing Mlocked */ if (isolated && 
folio_test_unevictable(folio) && folio_evictable(folio)) { lruvec_del_folio(lruvec, folio); folio_clear_unevictable(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } out: if (isolated) folio_set_lru(folio); return lruvec; } /* * Flags held in the low bits of a struct folio pointer on the mlock_fbatch. */ #define LRU_FOLIO 0x1 #define NEW_FOLIO 0x2 static inline struct folio *mlock_lru(struct folio *folio) { return (struct folio *)((unsigned long)folio + LRU_FOLIO); } static inline struct folio *mlock_new(struct folio *folio) { return (struct folio *)((unsigned long)folio + NEW_FOLIO); } /* * mlock_folio_batch() is derived from folio_batch_move_lru(): perhaps that can * make use of such folio pointer flags in future, but for now just keep it for * mlock. We could use three separate folio batches instead, but one feels * better (munlocking a full folio batch does not need to drain mlocking folio * batches first). */ static void mlock_folio_batch(struct folio_batch *fbatch) { struct lruvec *lruvec = NULL; unsigned long mlock; struct folio *folio; int i; for (i = 0; i < folio_batch_count(fbatch); i++) { folio = fbatch->folios[i]; mlock = (unsigned long)folio & (LRU_FOLIO | NEW_FOLIO); folio = (struct folio *)((unsigned long)folio - mlock); fbatch->folios[i] = folio; if (mlock & LRU_FOLIO) lruvec = __mlock_folio(folio, lruvec); else if (mlock & NEW_FOLIO) lruvec = __mlock_new_folio(folio, lruvec); else lruvec = __munlock_folio(folio, lruvec); } if (lruvec) lruvec_unlock_irq(lruvec); folios_put(fbatch); } void mlock_drain_local(void) { struct folio_batch *fbatch; local_lock(&mlock_fbatch.lock); fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); if (folio_batch_count(fbatch)) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } void mlock_drain_remote(int cpu) { struct folio_batch *fbatch; WARN_ON_ONCE(cpu_online(cpu)); fbatch = &per_cpu(mlock_fbatch.fbatch, cpu); if (folio_batch_count(fbatch)) mlock_folio_batch(fbatch); } bool need_mlock_drain(int cpu) { return folio_batch_count(&per_cpu(mlock_fbatch.fbatch, cpu)); } /** * mlock_folio - mlock a folio already on (or temporarily off) LRU * @folio: folio to be mlocked. */ void mlock_folio(struct folio *folio) { struct folio_batch *fbatch; local_lock(&mlock_fbatch.lock); fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); if (!folio_test_set_mlocked(folio)) { int nr_pages = folio_nr_pages(folio); zone_stat_mod_folio(folio, NR_MLOCK, nr_pages); __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); } folio_get(folio); if (!folio_batch_add(fbatch, mlock_lru(folio)) || !folio_may_be_lru_cached(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } /** * mlock_new_folio - mlock a newly allocated folio not yet on LRU * @folio: folio to be mlocked, either normal or a THP head. */ void mlock_new_folio(struct folio *folio) { struct folio_batch *fbatch; int nr_pages = folio_nr_pages(folio); local_lock(&mlock_fbatch.lock); fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); folio_set_mlocked(folio); zone_stat_mod_folio(folio, NR_MLOCK, nr_pages); __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); folio_get(folio); if (!folio_batch_add(fbatch, mlock_new(folio)) || !folio_may_be_lru_cached(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } /** * munlock_folio - munlock a folio * @folio: folio to be munlocked, either normal or a THP head. 
*/ void munlock_folio(struct folio *folio) { struct folio_batch *fbatch; local_lock(&mlock_fbatch.lock); fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); /* * folio_test_clear_mlocked(folio) must be left to __munlock_folio(), * which will check whether the folio is multiply mlocked. */ folio_get(folio); if (!folio_batch_add(fbatch, folio) || !folio_may_be_lru_cached(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } static inline unsigned int folio_mlock_step(struct folio *folio, pte_t *pte, unsigned long addr, unsigned long end) { unsigned int count = (end - addr) >> PAGE_SHIFT; pte_t ptent = ptep_get(pte); if (!folio_test_large(folio)) return 1; return folio_pte_batch(folio, pte, ptent, count); } static inline bool allow_mlock_munlock(struct folio *folio, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned int step) { /* * For unlock, allow munlock large folio which is partially * mapped to VMA. As it's possible that large folio is * mlocked and VMA is split later. * * During memory pressure, such kind of large folio can * be split. And the pages are not in VM_LOCKed VMA * can be reclaimed. */ if (!(vma->vm_flags & VM_LOCKED)) return true; /* folio_within_range() cannot take KSM, but any small folio is OK */ if (!folio_test_large(folio)) return true; /* folio not in range [start, end), skip mlock */ if (!folio_within_range(folio, vma, start, end)) return false; /* folio is not fully mapped, skip mlock */ if (step != folio_nr_pages(folio)) return false; return true; } static int mlock_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *start_pte, *pte; pte_t ptent; struct folio *folio; unsigned int step = 1; unsigned long start = addr; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { if (!pmd_present(*pmd)) goto out; if (is_huge_zero_pmd(*pmd)) goto out; folio = pmd_folio(*pmd); if (folio_is_zone_device(folio)) goto out; if (vma->vm_flags & VM_LOCKED) mlock_folio(folio); else munlock_folio(folio); goto out; } start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!start_pte) { walk->action = ACTION_AGAIN; return 0; } for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) { ptent = ptep_get(pte); if (!pte_present(ptent)) continue; folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; step = folio_mlock_step(folio, pte, addr, end); if (!allow_mlock_munlock(folio, vma, start, end, step)) goto next_entry; if (vma->vm_flags & VM_LOCKED) mlock_folio(folio); else munlock_folio(folio); next_entry: pte += step - 1; addr += (step - 1) << PAGE_SHIFT; } pte_unmap(start_pte); out: spin_unlock(ptl); cond_resched(); return 0; } /* * mlock_vma_pages_range() - mlock any pages already in the range, * or munlock all pages in the range. * @vma - vma containing range to be mlock()ed or munlock()ed * @start - start address in @vma of the range * @end - end of range in @vma * @new_vma_flags - the new set of flags for @vma. * * Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED; * called for munlock() and munlockall(), to clear VM_LOCKED from @vma. 
*/ static void mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, vma_flags_t *new_vma_flags) { static const struct mm_walk_ops mlock_walk_ops = { .pmd_entry = mlock_pte_range, .walk_lock = PGWALK_WRLOCK_VERIFY, }; /* * There is a slight chance that concurrent page migration, * or page reclaim finding a page of this now-VM_LOCKED vma, * will call mlock_vma_folio() and raise page's mlock_count: * double counting, leaving the page unevictable indefinitely. * Communicate this danger to mlock_vma_folio() with VM_IO, * which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas. * mmap_lock is held in write mode here, so this weird * combination should not be visible to other mmap_lock users; * but WRITE_ONCE so rmap walkers must see VM_IO if VM_LOCKED. */ if (vma_flags_test(new_vma_flags, VMA_LOCKED_BIT)) vma_flags_set(new_vma_flags, VMA_IO_BIT); vma_start_write(vma); vma_flags_reset_once(vma, new_vma_flags); lru_add_drain(); walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL); lru_add_drain(); if (vma_flags_test(new_vma_flags, VMA_IO_BIT)) { vma_flags_clear(new_vma_flags, VMA_IO_BIT); vma_flags_reset_once(vma, new_vma_flags); } } /* * mlock_fixup - handle mlock[all]/munlock[all] requests. * * Filters out "special" vmas -- VM_LOCKED never gets set for these, and * munlock is a no-op. However, for some special vmas, we go ahead and * populate the ptes. * * For vmas that pass the filters, merge/split as appropriate. */ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, vm_flags_t newflags) { vma_flags_t new_vma_flags = legacy_to_vma_flags(newflags); const vma_flags_t old_vma_flags = vma->flags; struct mm_struct *mm = vma->vm_mm; int nr_pages; int ret = 0; if (vma_flags_same_pair(&old_vma_flags, &new_vma_flags) || vma_is_secretmem(vma) || !vma_supports_mlock(vma)) { /* * Don't set VM_LOCKED or VM_LOCKONFAULT and don't count. * For secretmem, don't allow the memory to be unlocked. */ goto out; } vma = vma_modify_flags(vmi, *prev, vma, start, end, &new_vma_flags); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; } /* * Keep track of amount of locked VM. */ nr_pages = (end - start) >> PAGE_SHIFT; if (!vma_flags_test(&new_vma_flags, VMA_LOCKED_BIT)) nr_pages = -nr_pages; else if (vma_flags_test(&old_vma_flags, VMA_LOCKED_BIT)) nr_pages = 0; mm->locked_vm += nr_pages; /* * vm_flags is protected by the mmap_lock held in write mode. * It's okay if try_to_unmap_one unmaps a page just after we * set VM_LOCKED, populate_vma_page_range will bring it back. 
*/ if (vma_flags_test(&new_vma_flags, VMA_LOCKED_BIT) && vma_flags_test(&old_vma_flags, VMA_LOCKED_BIT)) { /* No work to do, and mlocking twice would be wrong */ vma_start_write(vma); vma->flags = new_vma_flags; } else { mlock_vma_pages_range(vma, start, end, &new_vma_flags); } out: *prev = vma; return ret; } static int apply_vma_lock_flags(unsigned long start, size_t len, vm_flags_t flags) { unsigned long nstart, end, tmp; struct vm_area_struct *vma, *prev; VMA_ITERATOR(vmi, current->mm, start); VM_BUG_ON(offset_in_page(start)); VM_BUG_ON(len != PAGE_ALIGN(len)); end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; vma = vma_iter_load(&vmi); if (!vma) return -ENOMEM; prev = vma_prev(&vmi); if (start > vma->vm_start) prev = vma; nstart = start; tmp = vma->vm_start; for_each_vma_range(vmi, vma, end) { int error; vm_flags_t newflags; if (vma->vm_start != tmp) return -ENOMEM; newflags = vma->vm_flags & ~VM_LOCKED_MASK; newflags |= flags; /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ tmp = vma->vm_end; if (tmp > end) tmp = end; error = mlock_fixup(&vmi, vma, &prev, nstart, tmp, newflags); if (error) return error; tmp = vma_iter_end(&vmi); nstart = tmp; } if (tmp < end) return -ENOMEM; return 0; } /* * Go through vma areas and sum size of mlocked * vma pages, as return value. * Note deferred memory locking case(mlock2(,,MLOCK_ONFAULT) * is also counted. * Return value: previously mlocked page counts */ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm, unsigned long start, size_t len) { struct vm_area_struct *vma; unsigned long count = 0; unsigned long end; VMA_ITERATOR(vmi, mm, start); /* Don't overflow past ULONG_MAX */ if (unlikely(ULONG_MAX - len < start)) end = ULONG_MAX; else end = start + len; for_each_vma_range(vmi, vma, end) { if (vma->vm_flags & VM_LOCKED) { if (start > vma->vm_start) count -= (start - vma->vm_start); if (end < vma->vm_end) { count += end - vma->vm_start; break; } count += vma->vm_end - vma->vm_start; } } return count >> PAGE_SHIFT; } /* * convert get_user_pages() return value to posix mlock() error */ static int __mlock_posix_error_return(long retval) { if (retval == -EFAULT) retval = -ENOMEM; else if (retval == -ENOMEM) retval = -EAGAIN; return retval; } static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags) { unsigned long locked; unsigned long lock_limit; int error = -ENOMEM; start = untagged_addr(start); if (!can_do_mlock()) return -EPERM; len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; locked = len >> PAGE_SHIFT; if (mmap_write_lock_killable(current->mm)) return -EINTR; locked += current->mm->locked_vm; if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) { /* * It is possible that the regions requested intersect with * previously mlocked areas, that part area in "mm->locked_vm" * should not be counted to new mlock increment count. So check * and adjust locked count if necessary. 
*/ locked -= count_mm_mlocked_page_nr(current->mm, start, len); } /* check against resource limits */ if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) error = apply_vma_lock_flags(start, len, flags); mmap_write_unlock(current->mm); if (error) return error; error = __mm_populate(start, len, 0); if (error) return __mlock_posix_error_return(error); return 0; } SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) { return do_mlock(start, len, VM_LOCKED); } SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags) { vm_flags_t vm_flags = VM_LOCKED; if (flags & ~MLOCK_ONFAULT) return -EINVAL; if (flags & MLOCK_ONFAULT) vm_flags |= VM_LOCKONFAULT; return do_mlock(start, len, vm_flags); } SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; start = untagged_addr(start); len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; if (mmap_write_lock_killable(current->mm)) return -EINTR; ret = apply_vma_lock_flags(start, len, 0); mmap_write_unlock(current->mm); return ret; } /* * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall) * and translate into the appropriate modifications to mm->def_flags and/or the * flags for all current VMAs. * * There are a couple of subtleties with this. If mlockall() is called multiple * times with different flags, the values do not necessarily stack. If mlockall * is called once including the MCL_FUTURE flag and then a second time without * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags. */ static int apply_mlockall_flags(int flags) { VMA_ITERATOR(vmi, current->mm, 0); struct vm_area_struct *vma, *prev = NULL; vm_flags_t to_add = 0; current->mm->def_flags &= ~VM_LOCKED_MASK; if (flags & MCL_FUTURE) { current->mm->def_flags |= VM_LOCKED; if (flags & MCL_ONFAULT) current->mm->def_flags |= VM_LOCKONFAULT; if (!(flags & MCL_CURRENT)) goto out; } if (flags & MCL_CURRENT) { to_add |= VM_LOCKED; if (flags & MCL_ONFAULT) to_add |= VM_LOCKONFAULT; } for_each_vma(vmi, vma) { int error; vm_flags_t newflags; newflags = vma->vm_flags & ~VM_LOCKED_MASK; newflags |= to_add; error = mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end, newflags); /* Ignore errors, but prev needs fixing up. */ if (error) prev = vma; cond_resched(); } out: return 0; } SYSCALL_DEFINE1(mlockall, int, flags) { unsigned long lock_limit; int ret; if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) || flags == MCL_ONFAULT) return -EINVAL; if (!can_do_mlock()) return -EPERM; lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; if (mmap_write_lock_killable(current->mm)) return -EINTR; ret = -ENOMEM; if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || capable(CAP_IPC_LOCK)) ret = apply_mlockall_flags(flags); mmap_write_unlock(current->mm); if (!ret && (flags & MCL_CURRENT)) mm_populate(0, TASK_SIZE); return ret; } SYSCALL_DEFINE0(munlockall) { int ret; if (mmap_write_lock_killable(current->mm)) return -EINTR; ret = apply_mlockall_flags(0); mmap_write_unlock(current->mm); return ret; } /* * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB * shm segments) get accounted against the user_struct instead. 
*/ static DEFINE_SPINLOCK(shmlock_user_lock); int user_shm_lock(size_t size, struct ucounts *ucounts) { unsigned long lock_limit, locked; long memlock; int allowed = 0; locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; lock_limit = rlimit(RLIMIT_MEMLOCK); if (lock_limit != RLIM_INFINITY) lock_limit >>= PAGE_SHIFT; spin_lock(&shmlock_user_lock); memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); if ((memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) { dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); goto out; } if (!get_ucounts(ucounts)) { dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); allowed = 0; goto out; } allowed = 1; out: spin_unlock(&shmlock_user_lock); return allowed; } void user_shm_unlock(size_t size, struct ucounts *ucounts) { spin_lock(&shmlock_user_lock); dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT); spin_unlock(&shmlock_user_lock); put_ucounts(ucounts); } |
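The mlock2() entry point above differs from plain mlock() only in accepting MLOCK_ONFAULT, which sets VM_LOCKONFAULT so pages are locked lazily as they are faulted in rather than being populated up front. Below is a minimal userspace sketch of that behaviour; it assumes glibc 2.27+ (which exposes mlock2() and MLOCK_ONFAULT from <sys/mman.h> under _GNU_SOURCE), and the buffer size and error handling are illustrative only.

/*
 * Illustrative userspace sketch, not part of the kernel sources above.
 * Locks an anonymous mapping with MLOCK_ONFAULT so pages are pinned
 * lazily on first touch, then releases the lock with munlock().
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 4 * sysconf(_SC_PAGESIZE);
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* VM_LOCKONFAULT: pages become mlocked as they are faulted in. */
	if (mlock2(buf, len, MLOCK_ONFAULT)) {
		perror("mlock2");	/* e.g. RLIMIT_MEMLOCK too small */
		return 1;
	}

	memset(buf, 0, len);		/* fault the pages in; they are now locked */

	if (munlock(buf, len))
		perror("munlock");

	munmap(buf, len);
	return 0;
}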
// SPDX-License-Identifier: GPL-2.0
#include <linux/capability.h>
#include <linux/compat.h>
#include <linux/blkdev.h>
#include <linux/export.h>
#include <linux/gfp.h>
#include <linux/blkpg.h>
#include <linux/hdreg.h>
#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/blktrace_api.h>
#include <linux/pr.h>
#include <linux/uaccess.h>
#include <linux/pagemap.h>
#include <linux/io_uring/cmd.h>
#include <linux/blk-integrity.h>
#include <uapi/linux/blkdev.h>
#include "blk.h"
#include "blk-crypto-internal.h"

static int blkpg_do_ioctl(struct block_device *bdev,
			  struct blkpg_partition __user *upart, int op)
{
	struct gendisk *disk = bdev->bd_disk;
	struct blkpg_partition p;
	sector_t start, length, capacity, end;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	if (copy_from_user(&p, upart, sizeof(struct blkpg_partition)))
		return -EFAULT;
	if (bdev_is_partition(bdev))
		return -EINVAL;

	if (p.pno <= 0)
		return -EINVAL;

	if (op == BLKPG_DEL_PARTITION)
		return bdev_del_partition(disk, p.pno);

	if (p.start < 0 || p.length <= 0 || LLONG_MAX - p.length < p.start)
		return -EINVAL;
	/* Check that the partition is aligned to the block size */
	if (!IS_ALIGNED(p.start | p.length, bdev_logical_block_size(bdev)))
		return -EINVAL;

	start = p.start >> SECTOR_SHIFT;
	length = p.length >> SECTOR_SHIFT;
	capacity = get_capacity(disk);

	if (check_add_overflow(start, length, &end))
		return -EINVAL;

	if (start >= capacity || end > capacity)
		return -EINVAL;

	switch (op) {
	case BLKPG_ADD_PARTITION:
		return bdev_add_partition(disk, p.pno, start, length);
	case BLKPG_RESIZE_PARTITION:
		return bdev_resize_partition(disk, p.pno, start, length);
	default:
		return -EINVAL;
	}
}

static int blkpg_ioctl(struct block_device *bdev,
		       struct blkpg_ioctl_arg __user *arg)
{
	struct blkpg_partition __user *udata;
	int op;

	if (get_user(op, &arg->op) || get_user(udata, &arg->data))
		return -EFAULT;

	return blkpg_do_ioctl(bdev, udata, op);
}

#ifdef CONFIG_COMPAT
struct compat_blkpg_ioctl_arg {
	compat_int_t op;
	compat_int_t flags;
	compat_int_t datalen;
	compat_caddr_t data;
};

static int compat_blkpg_ioctl(struct block_device *bdev,
			      struct compat_blkpg_ioctl_arg __user *arg)
{
	compat_caddr_t udata;
	int op;

	if (get_user(op, &arg->op) || get_user(udata, &arg->data))
		return -EFAULT;

	return blkpg_do_ioctl(bdev, compat_ptr(udata), op);
}
#endif

/*
 * Check that [start, start + len) is a valid range from the block device's
 * perspective, including verifying that it can be correctly translated into
 * logical block addresses.
*/ static int blk_validate_byte_range(struct block_device *bdev, uint64_t start, uint64_t len) { unsigned int bs_mask = bdev_logical_block_size(bdev) - 1; uint64_t end; if ((start | len) & bs_mask) return -EINVAL; if (!len) return -EINVAL; if (check_add_overflow(start, len, &end) || end > bdev_nr_bytes(bdev)) return -EINVAL; return 0; } static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, unsigned long arg) { uint64_t range[2], start, len; struct bio *prev = NULL, *bio; sector_t sector, nr_sects; struct blk_plug plug; int err; if (copy_from_user(range, (void __user *)arg, sizeof(range))) return -EFAULT; start = range[0]; len = range[1]; if (!bdev_max_discard_sectors(bdev)) return -EOPNOTSUPP; if (!(mode & BLK_OPEN_WRITE)) return -EBADF; if (bdev_read_only(bdev)) return -EPERM; err = blk_validate_byte_range(bdev, start, len); if (err) return err; inode_lock(bdev->bd_mapping->host); filemap_invalidate_lock(bdev->bd_mapping); err = truncate_bdev_range(bdev, mode, start, start + len - 1); if (err) goto fail; sector = start >> SECTOR_SHIFT; nr_sects = len >> SECTOR_SHIFT; blk_start_plug(&plug); while (!fatal_signal_pending(current)) { bio = blk_alloc_discard_bio(bdev, §or, &nr_sects, GFP_KERNEL); if (!bio) break; prev = bio_chain_and_submit(prev, bio); } if (prev) { err = bio_submit_or_kill(prev, BLKDEV_ZERO_KILLABLE); if (err == -EOPNOTSUPP) err = 0; bio_put(prev); } blk_finish_plug(&plug); fail: filemap_invalidate_unlock(bdev->bd_mapping); inode_unlock(bdev->bd_mapping->host); return err; } static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode, void __user *argp) { uint64_t start, len, end; uint64_t range[2]; int err; if (!(mode & BLK_OPEN_WRITE)) return -EBADF; if (!bdev_max_secure_erase_sectors(bdev)) return -EOPNOTSUPP; if (copy_from_user(range, argp, sizeof(range))) return -EFAULT; start = range[0]; len = range[1]; if ((start & 511) || (len & 511)) return -EINVAL; if (check_add_overflow(start, len, &end) || end > bdev_nr_bytes(bdev)) return -EINVAL; inode_lock(bdev->bd_mapping->host); filemap_invalidate_lock(bdev->bd_mapping); err = truncate_bdev_range(bdev, mode, start, end - 1); if (!err) err = blkdev_issue_secure_erase(bdev, start >> 9, len >> 9, GFP_KERNEL); filemap_invalidate_unlock(bdev->bd_mapping); inode_unlock(bdev->bd_mapping->host); return err; } static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode, unsigned long arg) { uint64_t range[2]; uint64_t start, end, len; int err; if (!(mode & BLK_OPEN_WRITE)) return -EBADF; if (copy_from_user(range, (void __user *)arg, sizeof(range))) return -EFAULT; start = range[0]; len = range[1]; end = start + len - 1; if (start & 511) return -EINVAL; if (len & 511) return -EINVAL; if (end >= (uint64_t)bdev_nr_bytes(bdev)) return -EINVAL; if (end < start) return -EINVAL; /* Invalidate the page cache, including dirty pages */ inode_lock(bdev->bd_mapping->host); filemap_invalidate_lock(bdev->bd_mapping); err = truncate_bdev_range(bdev, mode, start, end); if (err) goto fail; err = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL, BLKDEV_ZERO_NOUNMAP | BLKDEV_ZERO_KILLABLE); fail: filemap_invalidate_unlock(bdev->bd_mapping); inode_unlock(bdev->bd_mapping->host); return err; } static int put_ushort(unsigned short __user *argp, unsigned short val) { return put_user(val, argp); } static int put_int(int __user *argp, int val) { return put_user(val, argp); } static int put_uint(unsigned int __user *argp, unsigned int val) { return put_user(val, argp); } static int put_long(long __user 
*argp, long val) { return put_user(val, argp); } static int put_ulong(unsigned long __user *argp, unsigned long val) { return put_user(val, argp); } static int put_u64(u64 __user *argp, u64 val) { return put_user(val, argp); } #ifdef CONFIG_COMPAT static int compat_put_long(compat_long_t __user *argp, long val) { return put_user(val, argp); } static int compat_put_ulong(compat_ulong_t __user *argp, compat_ulong_t val) { return put_user(val, argp); } #endif #ifdef CONFIG_COMPAT /* * This is the equivalent of compat_ptr_ioctl(), to be used by block * drivers that implement only commands that are completely compatible * between 32-bit and 64-bit user space */ int blkdev_compat_ptr_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned cmd, unsigned long arg) { struct gendisk *disk = bdev->bd_disk; if (disk->fops->ioctl) return disk->fops->ioctl(bdev, mode, cmd, (unsigned long)compat_ptr(arg)); return -ENOIOCTLCMD; } EXPORT_SYMBOL(blkdev_compat_ptr_ioctl); #endif enum pr_direction { PR_IN, /* read from device */ PR_OUT, /* write to device */ }; static bool blkdev_pr_allowed(struct block_device *bdev, blk_mode_t mode, enum pr_direction dir) { /* no sense to make reservations for partitions */ if (bdev_is_partition(bdev)) return false; if (capable(CAP_SYS_ADMIN)) return true; /* * Only allow unprivileged reservation _out_ commands if the file * descriptor is open for writing. Allow reservation _in_ commands if * the file descriptor is open for reading since they do not modify the * device. */ if (dir == PR_IN) return mode & BLK_OPEN_READ; else return mode & BLK_OPEN_WRITE; } static int blkdev_pr_register(struct block_device *bdev, blk_mode_t mode, struct pr_registration __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_registration reg; if (!blkdev_pr_allowed(bdev, mode, PR_OUT)) return -EPERM; if (!ops || !ops->pr_register) return -EOPNOTSUPP; if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; if (reg.flags & ~PR_FL_IGNORE_KEY) return -EOPNOTSUPP; return ops->pr_register(bdev, reg.old_key, reg.new_key, reg.flags); } static int blkdev_pr_reserve(struct block_device *bdev, blk_mode_t mode, struct pr_reservation __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_reservation rsv; if (!blkdev_pr_allowed(bdev, mode, PR_OUT)) return -EPERM; if (!ops || !ops->pr_reserve) return -EOPNOTSUPP; if (copy_from_user(&rsv, arg, sizeof(rsv))) return -EFAULT; if (rsv.flags & ~PR_FL_IGNORE_KEY) return -EOPNOTSUPP; return ops->pr_reserve(bdev, rsv.key, rsv.type, rsv.flags); } static int blkdev_pr_release(struct block_device *bdev, blk_mode_t mode, struct pr_reservation __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_reservation rsv; if (!blkdev_pr_allowed(bdev, mode, PR_OUT)) return -EPERM; if (!ops || !ops->pr_release) return -EOPNOTSUPP; if (copy_from_user(&rsv, arg, sizeof(rsv))) return -EFAULT; if (rsv.flags) return -EOPNOTSUPP; return ops->pr_release(bdev, rsv.key, rsv.type); } static int blkdev_pr_preempt(struct block_device *bdev, blk_mode_t mode, struct pr_preempt __user *arg, bool abort) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_preempt p; if (!blkdev_pr_allowed(bdev, mode, PR_OUT)) return -EPERM; if (!ops || !ops->pr_preempt) return -EOPNOTSUPP; if (copy_from_user(&p, arg, sizeof(p))) return -EFAULT; if (p.flags) return -EOPNOTSUPP; return ops->pr_preempt(bdev, p.old_key, p.new_key, p.type, abort); } static int blkdev_pr_clear(struct block_device *bdev, blk_mode_t mode, struct 
pr_clear __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_clear c; if (!blkdev_pr_allowed(bdev, mode, PR_OUT)) return -EPERM; if (!ops || !ops->pr_clear) return -EOPNOTSUPP; if (copy_from_user(&c, arg, sizeof(c))) return -EFAULT; if (c.flags) return -EOPNOTSUPP; return ops->pr_clear(bdev, c.key); } static int blkdev_pr_read_keys(struct block_device *bdev, blk_mode_t mode, struct pr_read_keys __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_keys *keys_info; struct pr_read_keys read_keys; u64 __user *keys_ptr; size_t keys_info_len; size_t keys_copy_len; int ret; if (!blkdev_pr_allowed(bdev, mode, PR_IN)) return -EPERM; if (!ops || !ops->pr_read_keys) return -EOPNOTSUPP; if (copy_from_user(&read_keys, arg, sizeof(read_keys))) return -EFAULT; if (read_keys.num_keys > PR_KEYS_MAX) return -EINVAL; keys_info_len = struct_size(keys_info, keys, read_keys.num_keys); keys_info = kvzalloc(keys_info_len, GFP_KERNEL); if (!keys_info) return -ENOMEM; keys_info->num_keys = read_keys.num_keys; ret = ops->pr_read_keys(bdev, keys_info); if (ret) goto out; /* Copy out individual keys */ keys_ptr = u64_to_user_ptr(read_keys.keys_ptr); keys_copy_len = min(read_keys.num_keys, keys_info->num_keys) * sizeof(keys_info->keys[0]); if (copy_to_user(keys_ptr, keys_info->keys, keys_copy_len)) { ret = -EFAULT; goto out; } /* Copy out the arg struct */ read_keys.generation = keys_info->generation; read_keys.num_keys = keys_info->num_keys; if (copy_to_user(arg, &read_keys, sizeof(read_keys))) ret = -EFAULT; out: kvfree(keys_info); return ret; } static int blkdev_pr_read_reservation(struct block_device *bdev, blk_mode_t mode, struct pr_read_reservation __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_held_reservation rsv = {}; struct pr_read_reservation out = {}; int ret; if (!blkdev_pr_allowed(bdev, mode, PR_IN)) return -EPERM; if (!ops || !ops->pr_read_reservation) return -EOPNOTSUPP; ret = ops->pr_read_reservation(bdev, &rsv); if (ret) return ret; out.key = rsv.key; out.generation = rsv.generation; out.type = rsv.type; if (copy_to_user(arg, &out, sizeof(out))) return -EFAULT; return 0; } static int blkdev_flushbuf(struct block_device *bdev, unsigned cmd, unsigned long arg) { if (!capable(CAP_SYS_ADMIN)) return -EACCES; mutex_lock(&bdev->bd_holder_lock); if (bdev->bd_holder_ops && bdev->bd_holder_ops->sync) bdev->bd_holder_ops->sync(bdev); else { mutex_unlock(&bdev->bd_holder_lock); sync_blockdev(bdev); } invalidate_bdev(bdev); return 0; } static int blkdev_roset(struct block_device *bdev, unsigned cmd, unsigned long arg) { int ret, n; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (get_user(n, (int __user *)arg)) return -EFAULT; if (bdev->bd_disk->fops->set_read_only) { ret = bdev->bd_disk->fops->set_read_only(bdev, n); if (ret) return ret; } if (n) bdev_set_flag(bdev, BD_READ_ONLY); else bdev_clear_flag(bdev, BD_READ_ONLY); return 0; } static int blkdev_getgeo(struct block_device *bdev, struct hd_geometry __user *argp) { struct gendisk *disk = bdev->bd_disk; struct hd_geometry geo; int ret; if (!argp) return -EINVAL; if (!disk->fops->getgeo) return -ENOTTY; /* * We need to set the startsect first, the driver may * want to override it. 
*/ memset(&geo, 0, sizeof(geo)); geo.start = get_start_sect(bdev); ret = disk->fops->getgeo(disk, &geo); if (ret) return ret; if (copy_to_user(argp, &geo, sizeof(geo))) return -EFAULT; return 0; } #ifdef CONFIG_COMPAT struct compat_hd_geometry { unsigned char heads; unsigned char sectors; unsigned short cylinders; u32 start; }; static int compat_hdio_getgeo(struct block_device *bdev, struct compat_hd_geometry __user *ugeo) { struct gendisk *disk = bdev->bd_disk; struct hd_geometry geo; int ret; if (!ugeo) return -EINVAL; if (!disk->fops->getgeo) return -ENOTTY; memset(&geo, 0, sizeof(geo)); /* * We need to set the startsect first, the driver may * want to override it. */ geo.start = get_start_sect(bdev); ret = disk->fops->getgeo(disk, &geo); if (ret) return ret; ret = copy_to_user(ugeo, &geo, 4); ret |= put_user(geo.start, &ugeo->start); if (ret) ret = -EFAULT; return ret; } #endif /* set the logical block size */ static int blkdev_bszset(struct file *file, blk_mode_t mode, int __user *argp) { // this one might be file_inode(file)->i_rdev - a rare valid // use of file_inode() for those. dev_t dev = I_BDEV(file->f_mapping->host)->bd_dev; struct file *excl_file; int ret, n; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (!argp) return -EINVAL; if (get_user(n, argp)) return -EFAULT; if (mode & BLK_OPEN_EXCL) return set_blocksize(file, n); excl_file = bdev_file_open_by_dev(dev, mode, &dev, NULL); if (IS_ERR(excl_file)) return -EBUSY; ret = set_blocksize(excl_file, n); fput(excl_file); return ret; } /* * Common commands that are handled the same way on native and compat * user space. Note the separate arg/argp parameters that are needed * to deal with the compat_ptr() conversion. */ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg, void __user *argp) { unsigned int max_sectors; switch (cmd) { case BLKFLSBUF: return blkdev_flushbuf(bdev, cmd, arg); case BLKROSET: return blkdev_roset(bdev, cmd, arg); case BLKDISCARD: return blk_ioctl_discard(bdev, mode, arg); case BLKSECDISCARD: return blk_ioctl_secure_erase(bdev, mode, argp); case BLKZEROOUT: return blk_ioctl_zeroout(bdev, mode, arg); case BLKGETDISKSEQ: return put_u64(argp, bdev->bd_disk->diskseq); case BLKREPORTZONE: case BLKREPORTZONEV2: return blkdev_report_zones_ioctl(bdev, cmd, arg); case BLKRESETZONE: case BLKOPENZONE: case BLKCLOSEZONE: case BLKFINISHZONE: return blkdev_zone_mgmt_ioctl(bdev, mode, cmd, arg); case BLKGETZONESZ: return put_uint(argp, bdev_zone_sectors(bdev)); case BLKGETNRZONES: return put_uint(argp, bdev_nr_zones(bdev)); case BLKROGET: return put_int(argp, bdev_read_only(bdev) != 0); case BLKSSZGET: /* get block device logical block size */ return put_int(argp, bdev_logical_block_size(bdev)); case BLKPBSZGET: /* get block device physical block size */ return put_uint(argp, bdev_physical_block_size(bdev)); case BLKIOMIN: return put_uint(argp, bdev_io_min(bdev)); case BLKIOOPT: return put_uint(argp, bdev_io_opt(bdev)); case BLKALIGNOFF: return put_int(argp, bdev_alignment_offset(bdev)); case BLKDISCARDZEROES: return put_uint(argp, 0); case BLKSECTGET: max_sectors = min_t(unsigned int, USHRT_MAX, queue_max_sectors(bdev_get_queue(bdev))); return put_ushort(argp, max_sectors); case BLKROTATIONAL: return put_ushort(argp, bdev_rot(bdev)); case BLKRASET: case BLKFRASET: if(!capable(CAP_SYS_ADMIN)) return -EACCES; bdev->bd_disk->bdi->ra_pages = (arg * 512) / PAGE_SIZE; return 0; case BLKRRPART: if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (bdev_is_partition(bdev)) 
return -EINVAL; return disk_scan_partitions(bdev->bd_disk, mode | BLK_OPEN_STRICT_SCAN); case BLKTRACESTART: case BLKTRACESTOP: case BLKTRACETEARDOWN: return blk_trace_ioctl(bdev, cmd, argp); case BLKCRYPTOIMPORTKEY: case BLKCRYPTOGENERATEKEY: case BLKCRYPTOPREPAREKEY: return blk_crypto_ioctl(bdev, cmd, argp); case IOC_PR_REGISTER: return blkdev_pr_register(bdev, mode, argp); case IOC_PR_RESERVE: return blkdev_pr_reserve(bdev, mode, argp); case IOC_PR_RELEASE: return blkdev_pr_release(bdev, mode, argp); case IOC_PR_PREEMPT: return blkdev_pr_preempt(bdev, mode, argp, false); case IOC_PR_PREEMPT_ABORT: return blkdev_pr_preempt(bdev, mode, argp, true); case IOC_PR_CLEAR: return blkdev_pr_clear(bdev, mode, argp); case IOC_PR_READ_KEYS: return blkdev_pr_read_keys(bdev, mode, argp); case IOC_PR_READ_RESERVATION: return blkdev_pr_read_reservation(bdev, mode, argp); default: return blk_get_meta_cap(bdev, cmd, argp); } } /* * Always keep this in sync with compat_blkdev_ioctl() * to handle all incompatible commands in both functions. * * New commands must be compatible and go into blkdev_common_ioctl */ long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) { struct block_device *bdev = I_BDEV(file->f_mapping->host); void __user *argp = (void __user *)arg; blk_mode_t mode = file_to_blk_mode(file); int ret; switch (cmd) { /* These need separate implementations for the data structure */ case HDIO_GETGEO: return blkdev_getgeo(bdev, argp); case BLKPG: return blkpg_ioctl(bdev, argp); /* Compat mode returns 32-bit data instead of 'long' */ case BLKRAGET: case BLKFRAGET: if (!argp) return -EINVAL; return put_long(argp, (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: if (bdev_nr_sectors(bdev) > ~0UL) return -EFBIG; return put_ulong(argp, bdev_nr_sectors(bdev)); /* The data is compatible, but the command number is different */ case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */ return put_int(argp, block_size(bdev)); case BLKBSZSET: return blkdev_bszset(file, mode, argp); case BLKGETSIZE64: return put_u64(argp, bdev_nr_bytes(bdev)); /* Incompatible alignment on i386 */ case BLKTRACESETUP: case BLKTRACESETUP2: return blk_trace_ioctl(bdev, cmd, argp); default: break; } ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp); if (ret != -ENOIOCTLCMD) return ret; if (!bdev->bd_disk->fops->ioctl) return -ENOTTY; return bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); } #ifdef CONFIG_COMPAT #define BLKBSZGET_32 _IOR(0x12, 112, int) #define BLKBSZSET_32 _IOW(0x12, 113, int) #define BLKGETSIZE64_32 _IOR(0x12, 114, int) /* Most of the generic ioctls are handled in the normal fallback path. This assumes the blkdev's low level compat_ioctl always returns ENOIOCTLCMD for unknown ioctls. 
*/ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) { int ret; void __user *argp = compat_ptr(arg); struct block_device *bdev = I_BDEV(file->f_mapping->host); struct gendisk *disk = bdev->bd_disk; blk_mode_t mode = file_to_blk_mode(file); switch (cmd) { /* These need separate implementations for the data structure */ case HDIO_GETGEO: return compat_hdio_getgeo(bdev, argp); case BLKPG: return compat_blkpg_ioctl(bdev, argp); /* Compat mode returns 32-bit data instead of 'long' */ case BLKRAGET: case BLKFRAGET: if (!argp) return -EINVAL; return compat_put_long(argp, (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: if (bdev_nr_sectors(bdev) > ~(compat_ulong_t)0) return -EFBIG; return compat_put_ulong(argp, bdev_nr_sectors(bdev)); /* The data is compatible, but the command number is different */ case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */ return put_int(argp, bdev_logical_block_size(bdev)); case BLKBSZSET_32: return blkdev_bszset(file, mode, argp); case BLKGETSIZE64_32: return put_u64(argp, bdev_nr_bytes(bdev)); /* Incompatible alignment on i386 */ case BLKTRACESETUP32: return blk_trace_ioctl(bdev, cmd, argp); default: break; } ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp); if (ret == -ENOIOCTLCMD && disk->fops->compat_ioctl) ret = disk->fops->compat_ioctl(bdev, mode, cmd, arg); return ret; } #endif struct blk_iou_cmd { int res; bool nowait; }; static void blk_cmd_complete(struct io_tw_req tw_req, io_tw_token_t tw) { struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd); if (bic->res == -EAGAIN && bic->nowait) io_uring_cmd_issue_blocking(cmd); else io_uring_cmd_done(cmd, bic->res, IO_URING_CMD_TASK_WORK_ISSUE_FLAGS); } static void bio_cmd_bio_end_io(struct bio *bio) { struct io_uring_cmd *cmd = bio->bi_private; struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd); if (unlikely(bio->bi_status) && !bic->res) bic->res = blk_status_to_errno(bio->bi_status); io_uring_cmd_do_in_task_lazy(cmd, blk_cmd_complete); bio_put(bio); } static int blkdev_cmd_discard(struct io_uring_cmd *cmd, struct block_device *bdev, uint64_t start, uint64_t len, bool nowait) { struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd); gfp_t gfp = nowait ? GFP_NOWAIT : GFP_KERNEL; sector_t sector = start >> SECTOR_SHIFT; sector_t nr_sects = len >> SECTOR_SHIFT; struct bio *prev = NULL, *bio; int err; if (!bdev_max_discard_sectors(bdev)) return -EOPNOTSUPP; if (!(file_to_blk_mode(cmd->file) & BLK_OPEN_WRITE)) return -EBADF; if (bdev_read_only(bdev)) return -EPERM; err = blk_validate_byte_range(bdev, start, len); if (err) return err; err = filemap_invalidate_pages(bdev->bd_mapping, start, start + len - 1, nowait); if (err) return err; while (true) { bio = blk_alloc_discard_bio(bdev, §or, &nr_sects, gfp); if (!bio) break; if (nowait) { /* * Don't allow multi-bio non-blocking submissions as * subsequent bios may fail but we won't get a direct * indication of that. Normally, the caller should * retry from a blocking context. 
*/ if (unlikely(nr_sects)) { bio_put(bio); return -EAGAIN; } bio->bi_opf |= REQ_NOWAIT; } prev = bio_chain_and_submit(prev, bio); } if (unlikely(!prev)) return -EAGAIN; if (unlikely(nr_sects)) bic->res = -EAGAIN; prev->bi_private = cmd; prev->bi_end_io = bio_cmd_bio_end_io; submit_bio(prev); return -EIOCBQUEUED; } int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct block_device *bdev = I_BDEV(cmd->file->f_mapping->host); struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd); const struct io_uring_sqe *sqe = cmd->sqe; u32 cmd_op = cmd->cmd_op; uint64_t start, len; if (unlikely(sqe->ioprio || sqe->__pad1 || sqe->len || sqe->rw_flags || sqe->file_index)) return -EINVAL; bic->res = 0; bic->nowait = issue_flags & IO_URING_F_NONBLOCK; start = READ_ONCE(sqe->addr); len = READ_ONCE(sqe->addr3); switch (cmd_op) { case BLOCK_URING_CMD_DISCARD: return blkdev_cmd_discard(cmd, bdev, start, len, bic->nowait); } return -EINVAL; } |
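blkdev_common_ioctl() and blkdev_ioctl() above answer simple queries such as BLKSSZGET (logical block size, returned through put_int()) and BLKGETSIZE64 (device size in bytes, returned through put_u64()) without calling into the driver. A minimal userspace sketch of issuing those two ioctls follows; the device path comes from argv[1] and the error handling is illustrative only.

/*
 * Illustrative userspace sketch, not part of the kernel sources above.
 * Queries the logical block size (BLKSSZGET) and total size in bytes
 * (BLKGETSIZE64) of a block device, e.g. /dev/sda.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* BLKSSZGET, BLKGETSIZE64 */
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd, lbs = 0;
	uint64_t bytes = 0;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <block-device>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	if (ioctl(fd, BLKSSZGET, &lbs) || ioctl(fd, BLKGETSIZE64, &bytes)) {
		perror("ioctl");
		close(fd);
		return 1;
	}

	printf("logical block size: %d\n", lbs);
	printf("capacity: %llu bytes\n", (unsigned long long)bytes);
	close(fd);
	return 0;
}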
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * printk_safe.c - Safe printk for printk-deadlock-prone contexts
 */

#include <linux/preempt.h>
#include <linux/kdb.h>
#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/printk.h>
#include <linux/kprobes.h>

#include "internal.h"

/* Context where printk messages are never suppressed */
static atomic_t force_con;

void printk_force_console_enter(void)
{
	atomic_inc(&force_con);
}

void printk_force_console_exit(void)
{
	atomic_dec(&force_con);
}

bool is_printk_force_console(void)
{
	return atomic_read(&force_con);
}

static DEFINE_PER_CPU(int, printk_context);

/* Can be preempted by NMI. */
void __printk_safe_enter(void)
{
	this_cpu_inc(printk_context);
}

/* Can be preempted by NMI. */
void __printk_safe_exit(void)
{
	this_cpu_dec(printk_context);
}

void __printk_deferred_enter(void)
{
	cant_migrate();
	__printk_safe_enter();
}

void __printk_deferred_exit(void)
{
	cant_migrate();
	__printk_safe_exit();
}

bool is_printk_legacy_deferred(void)
{
	/*
	 * The per-CPU variable @printk_context can be read safely in any
	 * context. CPU migration is always disabled when set.
	 *
	 * A context holding the printk_cpu_sync must not spin waiting for
	 * another CPU. For legacy printing, it could be the console_lock
	 * or the port lock.
	 */
	return (force_legacy_kthread() ||
		this_cpu_read(printk_context) ||
		in_nmi() ||
		is_printk_cpu_sync_owner());
}

asmlinkage int vprintk(const char *fmt, va_list args)
{
#ifdef CONFIG_KGDB_KDB
	/* Allow to pass printk() to kdb but avoid a recursion. */
	if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0))
		return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
#endif
	return vprintk_default(fmt, args);
}
EXPORT_SYMBOL(vprintk);
// SPDX-License-Identifier: GPL-2.0-only
/*
 * The "user cache".
 *
 * (C) Copyright 1991-2000 Linus Torvalds
 *
 * We have a per-user structure to keep track of how many
 * processes, files etc the user has claimed, in order to be
 * able to have per-user limits for system resources.
 */
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/key.h>
#include <linux/sched/user.h>
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/user_namespace.h>
#include <linux/binfmts.h>
#include <linux/proc_ns.h>

#if IS_ENABLED(CONFIG_BINFMT_MISC)
struct binfmt_misc init_binfmt_misc = {
	.entries = LIST_HEAD_INIT(init_binfmt_misc.entries),
	.enabled = true,
	.entries_lock = __RW_LOCK_UNLOCKED(init_binfmt_misc.entries_lock),
};
EXPORT_SYMBOL_GPL(init_binfmt_misc);
#endif

/*
 * userns count is 1 for root user, 1 for init_uts_ns,
 * and 1 for... ?
 */
struct user_namespace init_user_ns = {
	.ns = NS_COMMON_INIT(init_user_ns),
	.uid_map = {
		{
			.extent[0] = {
				.first = 0,
				.lower_first = 0,
				.count = 4294967295U,
			},
			.nr_extents = 1,
		},
	},
	.gid_map = {
		{
			.extent[0] = {
				.first = 0,
				.lower_first = 0,
				.count = 4294967295U,
			},
			.nr_extents = 1,
		},
	},
	.projid_map = {
		{
			.extent[0] = {
				.first = 0,
				.lower_first = 0,
				.count = 4294967295U,
			},
			.nr_extents = 1,
		},
	},
	.owner = GLOBAL_ROOT_UID,
	.group = GLOBAL_ROOT_GID,
	.flags = USERNS_INIT_FLAGS,
#ifdef CONFIG_KEYS
	.keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list),
	.keyring_sem = __RWSEM_INITIALIZER(init_user_ns.keyring_sem),
#endif
#if IS_ENABLED(CONFIG_BINFMT_MISC)
	.binfmt_misc = &init_binfmt_misc,
#endif
};
EXPORT_SYMBOL_GPL(init_user_ns);

/*
 * UID task count cache, to get fast user lookup in "alloc_uid"
 * when changing user ID's (ie setuid() and friends).
 */

#define UIDHASH_BITS	(IS_ENABLED(CONFIG_BASE_SMALL) ? 3 : 7)
#define UIDHASH_SZ	(1 << UIDHASH_BITS)
#define UIDHASH_MASK	(UIDHASH_SZ - 1)
#define __uidhashfn(uid)	(((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
#define uidhashentry(uid)	(uidhash_table + __uidhashfn((__kuid_val(uid))))

static struct kmem_cache *uid_cachep;
static struct hlist_head uidhash_table[UIDHASH_SZ];

/*
 * The uidhash_lock is mostly taken from process context, but it is
 * occasionally also taken from softirq/tasklet context, when
 * task-structs get RCU-freed. Hence all locking must be softirq-safe.
* But free_uid() is also called with local interrupts disabled, and running * local_bh_enable() with local interrupts disabled is an error - we'll run * softirq callbacks, and they can unconditionally enable interrupts, and * the caller of free_uid() didn't expect that.. */ static DEFINE_SPINLOCK(uidhash_lock); /* root_user.__count is 1, for init task cred */ struct user_struct root_user = { .__count = REFCOUNT_INIT(1), .uid = GLOBAL_ROOT_UID, .ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0), }; /* * These routines must be called with the uidhash spinlock held! */ static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) { hlist_add_head(&up->uidhash_node, hashent); } static void uid_hash_remove(struct user_struct *up) { hlist_del_init(&up->uidhash_node); } static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) { struct user_struct *user; hlist_for_each_entry(user, hashent, uidhash_node) { if (uid_eq(user->uid, uid)) { refcount_inc(&user->__count); return user; } } return NULL; } static int user_epoll_alloc(struct user_struct *up) { #ifdef CONFIG_EPOLL return percpu_counter_init(&up->epoll_watches, 0, GFP_KERNEL); #else return 0; #endif } static void user_epoll_free(struct user_struct *up) { #ifdef CONFIG_EPOLL percpu_counter_destroy(&up->epoll_watches); #endif } /* IRQs are disabled and uidhash_lock is held upon function entry. * IRQ state (as stored in flags) is restored and uidhash_lock released * upon function exit. */ static void free_user(struct user_struct *up, unsigned long flags) __releases(&uidhash_lock) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); user_epoll_free(up); kmem_cache_free(uid_cachep, up); } /* * Locate the user_struct for the passed UID. If found, take a ref on it. The * caller must undo that ref with free_uid(). * * If the user_struct could not be found, return NULL. */ struct user_struct *find_user(kuid_t uid) { struct user_struct *ret; unsigned long flags; spin_lock_irqsave(&uidhash_lock, flags); ret = uid_hash_find(uid, uidhashentry(uid)); spin_unlock_irqrestore(&uidhash_lock, flags); return ret; } void free_uid(struct user_struct *up) { unsigned long flags; if (!up) return; if (refcount_dec_and_lock_irqsave(&up->__count, &uidhash_lock, &flags)) free_user(up, flags); } EXPORT_SYMBOL_GPL(free_uid); struct user_struct *alloc_uid(kuid_t uid) { struct hlist_head *hashent = uidhashentry(uid); struct user_struct *up, *new; spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); spin_unlock_irq(&uidhash_lock); if (!up) { new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL); if (!new) return NULL; new->uid = uid; refcount_set(&new->__count, 1); if (user_epoll_alloc(new)) { kmem_cache_free(uid_cachep, new); return NULL; } ratelimit_state_init(&new->ratelimit, HZ, 100); ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE); /* * Before adding this, check whether we raced * on adding the same user already.. 
*/ spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { user_epoll_free(new); kmem_cache_free(uid_cachep, new); } else { uid_hash_insert(new, hashent); up = new; } spin_unlock_irq(&uidhash_lock); } return up; } static int __init uid_cache_init(void) { int n; uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); for(n = 0; n < UIDHASH_SZ; ++n) INIT_HLIST_HEAD(uidhash_table + n); if (user_epoll_alloc(&root_user)) panic("root_user epoll percpu counter alloc failed"); /* Insert the root user immediately (init already runs as root) */ spin_lock_irq(&uidhash_lock); uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID)); spin_unlock_irq(&uidhash_lock); return 0; } subsys_initcall(uid_cache_init); |
// SPDX-License-Identifier: GPL-2.0-or-later

#define pr_fmt(fmt) "ref_tracker: " fmt

#include <linux/export.h>
#include <linux/list_sort.h>
#include <linux/ref_tracker.h>
#include <linux/slab.h>
#include <linux/stacktrace.h>
#include <linux/stackdepot.h>
#include <linux/seq_file.h>

#define REF_TRACKER_STACK_ENTRIES 16
#define STACK_BUF_SIZE 1024

struct ref_tracker {
	struct list_head	head;	/* anchor into dir->list or dir->quarantine */
	bool			dead;
	depot_stack_handle_t	alloc_stack_handle;
	depot_stack_handle_t	free_stack_handle;
};

struct ref_tracker_dir_stats {
	int total;
	int count;
	struct {
		depot_stack_handle_t stack_handle;
		unsigned int count;
	} stacks[];
};

#ifdef CONFIG_DEBUG_FS
#include <linux/xarray.h>

/*
 * ref_tracker_dir_init() is usually called in allocation-safe contexts, but
 * the same is not true of ref_tracker_dir_exit() which can be called from
 * anywhere an object is freed. Removing debugfs dentries is a blocking
 * operation, so we defer that work to the debugfs_reap_worker.
 *
 * Each dentry is tracked in the appropriate xarray. When
 * ref_tracker_dir_exit() is called, its entries in the xarrays are marked and
 * the workqueue job is scheduled. The worker then runs and deletes any marked
 * dentries asynchronously.
*/ static struct xarray debugfs_dentries; static struct xarray debugfs_symlinks; static struct work_struct debugfs_reap_worker; #define REF_TRACKER_DIR_DEAD XA_MARK_0 static inline void ref_tracker_debugfs_mark(struct ref_tracker_dir *dir) { unsigned long flags; xa_lock_irqsave(&debugfs_dentries, flags); __xa_set_mark(&debugfs_dentries, (unsigned long)dir, REF_TRACKER_DIR_DEAD); xa_unlock_irqrestore(&debugfs_dentries, flags); xa_lock_irqsave(&debugfs_symlinks, flags); __xa_set_mark(&debugfs_symlinks, (unsigned long)dir, REF_TRACKER_DIR_DEAD); xa_unlock_irqrestore(&debugfs_symlinks, flags); schedule_work(&debugfs_reap_worker); } #else static inline void ref_tracker_debugfs_mark(struct ref_tracker_dir *dir) { } #endif static struct ref_tracker_dir_stats * ref_tracker_get_stats(struct ref_tracker_dir *dir, unsigned int limit) { struct ref_tracker_dir_stats *stats; struct ref_tracker *tracker; stats = kmalloc_flex(*stats, stacks, limit, GFP_NOWAIT); if (!stats) return ERR_PTR(-ENOMEM); stats->total = 0; stats->count = 0; list_for_each_entry(tracker, &dir->list, head) { depot_stack_handle_t stack = tracker->alloc_stack_handle; int i; ++stats->total; for (i = 0; i < stats->count; ++i) if (stats->stacks[i].stack_handle == stack) break; if (i >= limit) continue; if (i >= stats->count) { stats->stacks[i].stack_handle = stack; stats->stacks[i].count = 0; ++stats->count; } ++stats->stacks[i].count; } return stats; } struct ostream { void __ostream_printf (*func)(struct ostream *stream, char *fmt, ...); char *prefix; char *buf; struct seq_file *seq; int size, used; }; static void __ostream_printf pr_ostream_log(struct ostream *stream, char *fmt, ...) { va_list args; va_start(args, fmt); vprintk(fmt, args); va_end(args); } static void __ostream_printf pr_ostream_buf(struct ostream *stream, char *fmt, ...) { int ret, len = stream->size - stream->used; va_list args; va_start(args, fmt); ret = vsnprintf(stream->buf + stream->used, len, fmt, args); va_end(args); if (ret > 0) stream->used += min(ret, len); } #define pr_ostream(stream, fmt, args...) 
\ ({ \ struct ostream *_s = (stream); \ \ _s->func(_s, fmt, ##args); \ }) static void __ref_tracker_dir_pr_ostream(struct ref_tracker_dir *dir, unsigned int display_limit, struct ostream *s) { struct ref_tracker_dir_stats *stats; unsigned int i = 0, skipped; depot_stack_handle_t stack; char *sbuf; lockdep_assert_held(&dir->lock); if (list_empty(&dir->list)) return; stats = ref_tracker_get_stats(dir, display_limit); if (IS_ERR(stats)) { pr_ostream(s, "%s%s@%p: couldn't get stats, error %pe\n", s->prefix, dir->class, dir, stats); return; } sbuf = kmalloc(STACK_BUF_SIZE, GFP_NOWAIT); for (i = 0, skipped = stats->total; i < stats->count; ++i) { stack = stats->stacks[i].stack_handle; if (sbuf && !stack_depot_snprint(stack, sbuf, STACK_BUF_SIZE, 4)) sbuf[0] = 0; pr_ostream(s, "%s%s@%p has %d/%d users at\n%s\n", s->prefix, dir->class, dir, stats->stacks[i].count, stats->total, sbuf); skipped -= stats->stacks[i].count; } if (skipped) pr_ostream(s, "%s%s@%p skipped reports about %d/%d users.\n", s->prefix, dir->class, dir, skipped, stats->total); kfree(sbuf); kfree(stats); } void ref_tracker_dir_print_locked(struct ref_tracker_dir *dir, unsigned int display_limit) { struct ostream os = { .func = pr_ostream_log, .prefix = "ref_tracker: " }; __ref_tracker_dir_pr_ostream(dir, display_limit, &os); } EXPORT_SYMBOL(ref_tracker_dir_print_locked); void ref_tracker_dir_print(struct ref_tracker_dir *dir, unsigned int display_limit) { unsigned long flags; spin_lock_irqsave(&dir->lock, flags); ref_tracker_dir_print_locked(dir, display_limit); spin_unlock_irqrestore(&dir->lock, flags); } EXPORT_SYMBOL(ref_tracker_dir_print); int ref_tracker_dir_snprint(struct ref_tracker_dir *dir, char *buf, size_t size) { struct ostream os = { .func = pr_ostream_buf, .prefix = "ref_tracker: ", .buf = buf, .size = size }; unsigned long flags; spin_lock_irqsave(&dir->lock, flags); __ref_tracker_dir_pr_ostream(dir, 16, &os); spin_unlock_irqrestore(&dir->lock, flags); return os.used; } EXPORT_SYMBOL(ref_tracker_dir_snprint); void ref_tracker_dir_exit(struct ref_tracker_dir *dir) { struct ref_tracker *tracker, *n; unsigned long flags; bool leak = false; dir->dead = true; /* * The xarray entries must be marked before the dir->lock is taken to * protect simultaneous debugfs readers. 
*/ ref_tracker_debugfs_mark(dir); spin_lock_irqsave(&dir->lock, flags); list_for_each_entry_safe(tracker, n, &dir->quarantine, head) { list_del(&tracker->head); kfree(tracker); dir->quarantine_avail++; } if (!list_empty(&dir->list)) { ref_tracker_dir_print_locked(dir, 16); leak = true; list_for_each_entry_safe(tracker, n, &dir->list, head) { list_del(&tracker->head); kfree(tracker); } } spin_unlock_irqrestore(&dir->lock, flags); WARN_ON_ONCE(leak); WARN_ON_ONCE(refcount_read(&dir->untracked) != 1); WARN_ON_ONCE(refcount_read(&dir->no_tracker) != 1); } EXPORT_SYMBOL(ref_tracker_dir_exit); int ref_tracker_alloc(struct ref_tracker_dir *dir, struct ref_tracker **trackerp, gfp_t gfp) { unsigned long entries[REF_TRACKER_STACK_ENTRIES]; struct ref_tracker *tracker; unsigned int nr_entries; gfp_t gfp_mask = gfp | __GFP_NOWARN; unsigned long flags; WARN_ON_ONCE(dir->dead); if (!trackerp) { refcount_inc(&dir->no_tracker); return 0; } if (gfp & __GFP_DIRECT_RECLAIM) gfp_mask |= __GFP_NOFAIL; *trackerp = tracker = kzalloc_obj(*tracker, gfp_mask); if (unlikely(!tracker)) { pr_err_once("memory allocation failure, unreliable refcount tracker.\n"); refcount_inc(&dir->untracked); return -ENOMEM; } nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1); tracker->alloc_stack_handle = stack_depot_save(entries, nr_entries, gfp); spin_lock_irqsave(&dir->lock, flags); list_add(&tracker->head, &dir->list); spin_unlock_irqrestore(&dir->lock, flags); return 0; } EXPORT_SYMBOL_GPL(ref_tracker_alloc); int ref_tracker_free(struct ref_tracker_dir *dir, struct ref_tracker **trackerp) { unsigned long entries[REF_TRACKER_STACK_ENTRIES]; depot_stack_handle_t stack_handle; struct ref_tracker *tracker; unsigned int nr_entries; unsigned long flags; WARN_ON_ONCE(dir->dead); if (!trackerp) { refcount_dec(&dir->no_tracker); return 0; } tracker = *trackerp; if (!tracker) { refcount_dec(&dir->untracked); return -EEXIST; } nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1); stack_handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT); spin_lock_irqsave(&dir->lock, flags); if (tracker->dead) { pr_err("reference already released.\n"); if (tracker->alloc_stack_handle) { pr_err("allocated in:\n"); stack_depot_print(tracker->alloc_stack_handle); } if (tracker->free_stack_handle) { pr_err("freed in:\n"); stack_depot_print(tracker->free_stack_handle); } spin_unlock_irqrestore(&dir->lock, flags); WARN_ON_ONCE(1); return -EINVAL; } tracker->dead = true; tracker->free_stack_handle = stack_handle; list_move_tail(&tracker->head, &dir->quarantine); if (!dir->quarantine_avail) { tracker = list_first_entry(&dir->quarantine, struct ref_tracker, head); list_del(&tracker->head); } else { dir->quarantine_avail--; tracker = NULL; } spin_unlock_irqrestore(&dir->lock, flags); kfree(tracker); return 0; } EXPORT_SYMBOL_GPL(ref_tracker_free); #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> static struct dentry *ref_tracker_debug_dir = (struct dentry *)-ENOENT; static void __ostream_printf pr_ostream_seq(struct ostream *stream, char *fmt, ...) 
{ va_list args; va_start(args, fmt); seq_vprintf(stream->seq, fmt, args); va_end(args); } static int ref_tracker_dir_seq_print(struct ref_tracker_dir *dir, struct seq_file *seq) { struct ostream os = { .func = pr_ostream_seq, .prefix = "", .seq = seq }; __ref_tracker_dir_pr_ostream(dir, 16, &os); return os.used; } static int ref_tracker_debugfs_show(struct seq_file *f, void *v) { struct ref_tracker_dir *dir = f->private; unsigned long index = (unsigned long)dir; unsigned long flags; int ret; /* * "dir" may not exist at this point if ref_tracker_dir_exit() has * already been called. Take care not to dereference it until its * legitimacy is established. * * The xa_lock is necessary to ensure that "dir" doesn't disappear * before its lock can be taken. If it's in the hash and not marked * dead, then it's safe to take dir->lock which prevents * ref_tracker_dir_exit() from completing. Once the dir->lock is * acquired, the xa_lock can be released. All of this must be IRQ-safe. */ xa_lock_irqsave(&debugfs_dentries, flags); if (!xa_load(&debugfs_dentries, index) || xa_get_mark(&debugfs_dentries, index, REF_TRACKER_DIR_DEAD)) { xa_unlock_irqrestore(&debugfs_dentries, flags); return -ENODATA; } spin_lock(&dir->lock); xa_unlock(&debugfs_dentries); ret = ref_tracker_dir_seq_print(dir, f); spin_unlock_irqrestore(&dir->lock, flags); return ret; } static int ref_tracker_debugfs_open(struct inode *inode, struct file *filp) { struct ref_tracker_dir *dir = inode->i_private; return single_open(filp, ref_tracker_debugfs_show, dir); } static const struct file_operations ref_tracker_debugfs_fops = { .owner = THIS_MODULE, .open = ref_tracker_debugfs_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; /** * ref_tracker_dir_debugfs - create debugfs file for ref_tracker_dir * @dir: ref_tracker_dir to be associated with debugfs file * * In most cases, a debugfs file will be created automatically for every * ref_tracker_dir. If the object was created before debugfs is brought up * then that may fail. In those cases, it is safe to call this at a later * time to create the file. */ void ref_tracker_dir_debugfs(struct ref_tracker_dir *dir) { char name[NAME_MAX + 1]; struct dentry *dentry; int ret; /* No-op if already created */ dentry = xa_load(&debugfs_dentries, (unsigned long)dir); if (dentry && !xa_is_err(dentry)) return; ret = snprintf(name, sizeof(name), "%s@%p", dir->class, dir); name[sizeof(name) - 1] = '\0'; if (ret < sizeof(name)) { dentry = debugfs_create_file(name, S_IFREG | 0400, ref_tracker_debug_dir, dir, &ref_tracker_debugfs_fops); if (!IS_ERR(dentry)) { void *old; old = xa_store_irq(&debugfs_dentries, (unsigned long)dir, dentry, GFP_KERNEL); if (xa_is_err(old)) debugfs_remove(dentry); else WARN_ON_ONCE(old); } } } EXPORT_SYMBOL(ref_tracker_dir_debugfs); void __ostream_printf ref_tracker_dir_symlink(struct ref_tracker_dir *dir, const char *fmt, ...) 
{ char name[NAME_MAX + 1]; struct dentry *symlink, *dentry; va_list args; int ret; symlink = xa_load(&debugfs_symlinks, (unsigned long)dir); dentry = xa_load(&debugfs_dentries, (unsigned long)dir); /* Already created?*/ if (symlink && !xa_is_err(symlink)) return; if (!dentry || xa_is_err(dentry)) return; va_start(args, fmt); ret = vsnprintf(name, sizeof(name), fmt, args); va_end(args); name[sizeof(name) - 1] = '\0'; if (ret < sizeof(name)) { symlink = debugfs_create_symlink(name, ref_tracker_debug_dir, dentry->d_name.name); if (!IS_ERR(symlink)) { void *old; old = xa_store_irq(&debugfs_symlinks, (unsigned long)dir, symlink, GFP_KERNEL); if (xa_is_err(old)) debugfs_remove(symlink); else WARN_ON_ONCE(old); } } } EXPORT_SYMBOL(ref_tracker_dir_symlink); static void debugfs_reap_work(struct work_struct *work) { struct dentry *dentry; unsigned long index; bool reaped; do { reaped = false; xa_for_each_marked(&debugfs_symlinks, index, dentry, REF_TRACKER_DIR_DEAD) { xa_erase_irq(&debugfs_symlinks, index); debugfs_remove(dentry); reaped = true; } xa_for_each_marked(&debugfs_dentries, index, dentry, REF_TRACKER_DIR_DEAD) { xa_erase_irq(&debugfs_dentries, index); debugfs_remove(dentry); reaped = true; } } while (reaped); } static int __init ref_tracker_debugfs_postcore_init(void) { INIT_WORK(&debugfs_reap_worker, debugfs_reap_work); xa_init_flags(&debugfs_dentries, XA_FLAGS_LOCK_IRQ); xa_init_flags(&debugfs_symlinks, XA_FLAGS_LOCK_IRQ); return 0; } postcore_initcall(ref_tracker_debugfs_postcore_init); static int __init ref_tracker_debugfs_late_init(void) { ref_tracker_debug_dir = debugfs_create_dir("ref_tracker", NULL); return 0; } late_initcall(ref_tracker_debugfs_late_init); #endif /* CONFIG_DEBUG_FS */ |
| 10 17 3 4 30 19 71 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM timer #if !defined(_TRACE_TIMER_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_TIMER_H #include <linux/tracepoint.h> #include <linux/hrtimer.h> #include <linux/timer.h> DECLARE_EVENT_CLASS(timer_class, TP_PROTO(struct timer_list *timer), TP_ARGS(timer), TP_STRUCT__entry( __field( void *, timer ) ), TP_fast_assign( __entry->timer = timer; ), TP_printk("timer=%p", __entry->timer) ); /** * timer_init - called when the timer is initialized * @timer: pointer to struct timer_list */ DEFINE_EVENT(timer_class, timer_init, TP_PROTO(struct timer_list *timer), TP_ARGS(timer) ); #define decode_timer_flags(flags) \ __print_flags(flags, "|", \ { TIMER_MIGRATING, "M" }, \ { TIMER_DEFERRABLE, "D" }, \ { TIMER_PINNED, "P" }, \ { TIMER_IRQSAFE, "I" }) /** * timer_start - called when the timer is started * @timer: pointer to struct timer_list * @bucket_expiry: the bucket expiry time */ TRACE_EVENT(timer_start, TP_PROTO(struct timer_list *timer, unsigned long bucket_expiry), TP_ARGS(timer, bucket_expiry), TP_STRUCT__entry( __field( void *, timer ) __field( void *, function ) __field( unsigned long, expires ) __field( unsigned long, bucket_expiry ) __field( unsigned long, now ) __field( unsigned int, flags ) ), TP_fast_assign( __entry->timer = timer; __entry->function = timer->function; __entry->expires = timer->expires; __entry->bucket_expiry = bucket_expiry; __entry->now = jiffies; __entry->flags = timer->flags; ), TP_printk("timer=%p function=%ps expires=%lu [timeout=%ld] bucket_expiry=%lu cpu=%u idx=%u flags=%s", __entry->timer, __entry->function, __entry->expires, (long)__entry->expires - __entry->now, __entry->bucket_expiry, 
__entry->flags & TIMER_CPUMASK, __entry->flags >> TIMER_ARRAYSHIFT, decode_timer_flags(__entry->flags & TIMER_TRACE_FLAGMASK)) ); /** * timer_expire_entry - called immediately before the timer callback * @timer: pointer to struct timer_list * @baseclk: value of timer_base::clk when timer expires * * Allows to determine the timer latency. */ TRACE_EVENT(timer_expire_entry, TP_PROTO(struct timer_list *timer, unsigned long baseclk), TP_ARGS(timer, baseclk), TP_STRUCT__entry( __field( void *, timer ) __field( unsigned long, now ) __field( void *, function) __field( unsigned long, baseclk ) ), TP_fast_assign( __entry->timer = timer; __entry->now = jiffies; __entry->function = timer->function; __entry->baseclk = baseclk; ), TP_printk("timer=%p function=%ps now=%lu baseclk=%lu", __entry->timer, __entry->function, __entry->now, __entry->baseclk) ); /** * timer_expire_exit - called immediately after the timer callback returns * @timer: pointer to struct timer_list * * When used in combination with the timer_expire_entry tracepoint we can * determine the runtime of the timer callback function. * * NOTE: Do NOT dereference timer in TP_fast_assign. The pointer might * be invalid. We solely track the pointer. */ DEFINE_EVENT(timer_class, timer_expire_exit, TP_PROTO(struct timer_list *timer), TP_ARGS(timer) ); /** * timer_cancel - called when the timer is canceled * @timer: pointer to struct timer_list */ DEFINE_EVENT(timer_class, timer_cancel, TP_PROTO(struct timer_list *timer), TP_ARGS(timer) ); TRACE_EVENT(timer_base_idle, TP_PROTO(bool is_idle, unsigned int cpu), TP_ARGS(is_idle, cpu), TP_STRUCT__entry( __field( bool, is_idle ) __field( unsigned int, cpu ) ), TP_fast_assign( __entry->is_idle = is_idle; __entry->cpu = cpu; ), TP_printk("is_idle=%d cpu=%d", __entry->is_idle, __entry->cpu) ); #define decode_clockid(type) \ __print_symbolic(type, \ { CLOCK_REALTIME, "CLOCK_REALTIME" }, \ { CLOCK_MONOTONIC, "CLOCK_MONOTONIC" }, \ { CLOCK_BOOTTIME, "CLOCK_BOOTTIME" }, \ { CLOCK_TAI, "CLOCK_TAI" }) #define decode_hrtimer_mode(mode) \ __print_symbolic(mode, \ { HRTIMER_MODE_ABS, "ABS" }, \ { HRTIMER_MODE_REL, "REL" }, \ { HRTIMER_MODE_ABS_PINNED, "ABS|PINNED" }, \ { HRTIMER_MODE_REL_PINNED, "REL|PINNED" }, \ { HRTIMER_MODE_ABS_SOFT, "ABS|SOFT" }, \ { HRTIMER_MODE_REL_SOFT, "REL|SOFT" }, \ { HRTIMER_MODE_ABS_PINNED_SOFT, "ABS|PINNED|SOFT" }, \ { HRTIMER_MODE_REL_PINNED_SOFT, "REL|PINNED|SOFT" }, \ { HRTIMER_MODE_ABS_HARD, "ABS|HARD" }, \ { HRTIMER_MODE_REL_HARD, "REL|HARD" }, \ { HRTIMER_MODE_ABS_PINNED_HARD, "ABS|PINNED|HARD" }, \ { HRTIMER_MODE_REL_PINNED_HARD, "REL|PINNED|HARD" }) /** * hrtimer_setup - called when the hrtimer is initialized * @hrtimer: pointer to struct hrtimer * @clockid: the hrtimers clock * @mode: the hrtimers mode */ TRACE_EVENT(hrtimer_setup, TP_PROTO(struct hrtimer *hrtimer, clockid_t clockid, enum hrtimer_mode mode), TP_ARGS(hrtimer, clockid, mode), TP_STRUCT__entry( __field( void *, hrtimer ) __field( clockid_t, clockid ) __field( enum hrtimer_mode, mode ) ), TP_fast_assign( __entry->hrtimer = hrtimer; __entry->clockid = clockid; __entry->mode = mode; ), TP_printk("hrtimer=%p clockid=%s mode=%s", __entry->hrtimer, decode_clockid(__entry->clockid), decode_hrtimer_mode(__entry->mode)) ); /** * hrtimer_start - called when the hrtimer is started * @hrtimer: pointer to struct hrtimer * @mode: the hrtimers mode * @was_armed: Was armed when hrtimer_start*() was invoked */ TRACE_EVENT(hrtimer_start, TP_PROTO(struct hrtimer *hrtimer, enum hrtimer_mode mode, bool was_armed), 
TP_ARGS(hrtimer, mode, was_armed), TP_STRUCT__entry( __field( void *, hrtimer ) __field( void *, function ) __field( s64, expires ) __field( s64, softexpires ) __field( enum hrtimer_mode, mode ) __field( bool, was_armed ) ), TP_fast_assign( __entry->hrtimer = hrtimer; __entry->function = ACCESS_PRIVATE(hrtimer, function); __entry->expires = hrtimer_get_expires(hrtimer); __entry->softexpires = hrtimer_get_softexpires(hrtimer); __entry->mode = mode; __entry->was_armed = was_armed; ), TP_printk("hrtimer=%p function=%ps expires=%llu softexpires=%llu " "mode=%s was_armed=%d", __entry->hrtimer, __entry->function, (unsigned long long) __entry->expires, (unsigned long long) __entry->softexpires, decode_hrtimer_mode(__entry->mode), __entry->was_armed) ); /** * hrtimer_expire_entry - called immediately before the hrtimer callback * @hrtimer: pointer to struct hrtimer * @now: variable which contains current time of the timers base. * * Allows to determine the timer latency. */ TRACE_EVENT(hrtimer_expire_entry, TP_PROTO(struct hrtimer *hrtimer, ktime_t now), TP_ARGS(hrtimer, now), TP_STRUCT__entry( __field( void *, hrtimer ) __field( s64, now ) __field( void *, function) ), TP_fast_assign( __entry->hrtimer = hrtimer; __entry->now = now; __entry->function = ACCESS_PRIVATE(hrtimer, function); ), TP_printk("hrtimer=%p function=%ps now=%llu", __entry->hrtimer, __entry->function, (unsigned long long) __entry->now) ); DECLARE_EVENT_CLASS(hrtimer_class, TP_PROTO(struct hrtimer *hrtimer), TP_ARGS(hrtimer), TP_STRUCT__entry( __field( void *, hrtimer ) ), TP_fast_assign( __entry->hrtimer = hrtimer; ), TP_printk("hrtimer=%p", __entry->hrtimer) ); /** * hrtimer_expire_exit - called immediately after the hrtimer callback returns * @hrtimer: pointer to struct hrtimer * * When used in combination with the hrtimer_expire_entry tracepoint we can * determine the runtime of the callback function. 
*/ DEFINE_EVENT(hrtimer_class, hrtimer_expire_exit, TP_PROTO(struct hrtimer *hrtimer), TP_ARGS(hrtimer) ); /** * hrtimer_cancel - called when the hrtimer is canceled * @hrtimer: pointer to struct hrtimer */ DEFINE_EVENT(hrtimer_class, hrtimer_cancel, TP_PROTO(struct hrtimer *hrtimer), TP_ARGS(hrtimer) ); /** * hrtimer_rearm - Invoked when the clockevent device is rearmed * @next_event: The next expiry time (CLOCK_MONOTONIC) */ TRACE_EVENT(hrtimer_rearm, TP_PROTO(ktime_t next_event, bool deferred), TP_ARGS(next_event, deferred), TP_STRUCT__entry( __field( s64, next_event ) __field( bool, deferred ) ), TP_fast_assign( __entry->next_event = next_event; __entry->deferred = deferred; ), TP_printk("next_event=%llu deferred=%d", (unsigned long long) __entry->next_event, __entry->deferred) ); /** * itimer_state - called when itimer is started or canceled * @which: name of the interval timer * @value: the itimers value, itimer is canceled if value->it_value is * zero, otherwise it is started * @expires: the itimers expiry time */ TRACE_EVENT(itimer_state, TP_PROTO(int which, const struct itimerspec64 *const value, unsigned long long expires), TP_ARGS(which, value, expires), TP_STRUCT__entry( __field( int, which ) __field( unsigned long long, expires ) __field( long, value_sec ) __field( long, value_nsec ) __field( long, interval_sec ) __field( long, interval_nsec ) ), TP_fast_assign( __entry->which = which; __entry->expires = expires; __entry->value_sec = value->it_value.tv_sec; __entry->value_nsec = value->it_value.tv_nsec; __entry->interval_sec = value->it_interval.tv_sec; __entry->interval_nsec = value->it_interval.tv_nsec; ), TP_printk("which=%d expires=%llu it_value=%ld.%06ld it_interval=%ld.%06ld", __entry->which, __entry->expires, __entry->value_sec, __entry->value_nsec / NSEC_PER_USEC, __entry->interval_sec, __entry->interval_nsec / NSEC_PER_USEC) ); /** * itimer_expire - called when itimer expires * @which: type of the interval timer * @pid: pid of the process which owns the timer * @now: current time, used to calculate the latency of itimer */ TRACE_EVENT(itimer_expire, TP_PROTO(int which, struct pid *pid, unsigned long long now), TP_ARGS(which, pid, now), TP_STRUCT__entry( __field( int , which ) __field( pid_t, pid ) __field( unsigned long long, now ) ), TP_fast_assign( __entry->which = which; __entry->now = now; __entry->pid = pid_nr(pid); ), TP_printk("which=%d pid=%d now=%llu", __entry->which, (int) __entry->pid, __entry->now) ); #ifdef CONFIG_NO_HZ_COMMON #define TICK_DEP_NAMES \ tick_dep_mask_name(NONE) \ tick_dep_name(POSIX_TIMER) \ tick_dep_name(PERF_EVENTS) \ tick_dep_name(SCHED) \ tick_dep_name(CLOCK_UNSTABLE) \ tick_dep_name(RCU) \ tick_dep_name_end(RCU_EXP) #undef tick_dep_name #undef tick_dep_mask_name #undef tick_dep_name_end /* The MASK will convert to their bits and they need to be processed too */ #define tick_dep_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \ TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); #define tick_dep_name_end(sdep) TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \ TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); /* NONE only has a mask defined for it */ #define tick_dep_mask_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); TICK_DEP_NAMES #undef tick_dep_name #undef tick_dep_mask_name #undef tick_dep_name_end #define tick_dep_name(sdep) { TICK_DEP_MASK_##sdep, #sdep }, #define tick_dep_mask_name(sdep) { TICK_DEP_MASK_##sdep, #sdep }, #define tick_dep_name_end(sdep) { TICK_DEP_MASK_##sdep, #sdep } #define show_tick_dep_name(val) \ __print_symbolic(val, 
TICK_DEP_NAMES) TRACE_EVENT(tick_stop, TP_PROTO(int success, int dependency), TP_ARGS(success, dependency), TP_STRUCT__entry( __field( int , success ) __field( int , dependency ) ), TP_fast_assign( __entry->success = success; __entry->dependency = dependency; ), TP_printk("success=%d dependency=%s", __entry->success, \ show_tick_dep_name(__entry->dependency)) ); #endif #endif /* _TRACE_TIMER_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
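As a pointer for readers, a hedged sketch of how these definitions are consumed: each TRACE_EVENT()/DEFINE_EVENT() above expands, through <linux/tracepoint.h>, into a trace_<event>() inline that the instrumented code calls, and the timer core creates the event instances once by defining CREATE_TRACE_POINTS before including this header. The caller below is a simplified stand-in for that code, not a copy of it.

#define CREATE_TRACE_POINTS
#include <trace/events/timer.h>

static void example_report_base_idle(unsigned int cpu, bool is_idle)
{
	/* Emits "is_idle=%d cpu=%d" whenever the timer:timer_base_idle event is enabled. */
	trace_timer_base_idle(is_idle, cpu);
}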
| 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the ICMP protocol. * * Version: @(#)icmp.h 1.0.3 04/28/93 * * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_ICMP_H #define _LINUX_ICMP_H #include <linux/skbuff.h> #include <uapi/linux/icmp.h> #include <uapi/linux/errqueue.h> static inline struct icmphdr *icmp_hdr(const struct sk_buff *skb) { return (struct icmphdr *)skb_transport_header(skb); } static inline bool icmp_is_err(int type) { switch (type) { case ICMP_DEST_UNREACH: case ICMP_SOURCE_QUENCH: case ICMP_REDIRECT: case ICMP_TIME_EXCEEDED: case ICMP_PARAMETERPROB: return true; } return false; } void ip_icmp_error_rfc4884(const struct sk_buff *skb, struct sock_ee_data_rfc4884 *out, int thlen, int off); /* RFC 4884 */ #define ICMP_EXT_ORIG_DGRAM_MIN_LEN 128 #define ICMP_EXT_VERSION_2 2 /* ICMP Extension Object Classes */ #define ICMP_EXT_OBJ_CLASS_IIO 2 /* RFC 5837 */ /* Interface Information Object - RFC 5837 */ enum { ICMP_EXT_CTYPE_IIO_ROLE_IIF, }; #define ICMP_EXT_CTYPE_IIO_ROLE(ROLE) ((ROLE) << 6) #define ICMP_EXT_CTYPE_IIO_MTU BIT(0) #define ICMP_EXT_CTYPE_IIO_NAME BIT(1) #define ICMP_EXT_CTYPE_IIO_IPADDR BIT(2) #define ICMP_EXT_CTYPE_IIO_IFINDEX BIT(3) struct icmp_ext_iio_name_subobj { u8 len; char name[IFNAMSIZ]; }; enum { /* RFC 5837 - Incoming IP Interface Role */ ICMP_ERR_EXT_IIO_IIF, /* Add new constants above. Used by "icmp_errors_extension_mask" * sysctl. */ ICMP_ERR_EXT_COUNT, }; #endif /* _LINUX_ICMP_H */ |
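A short, hedged sketch of the two inline helpers defined above: icmp_hdr() assumes the skb's transport header already points at the ICMP header, as it does in the ICMP receive path, and icmp_is_err() classifies the message type. The function name is illustrative.

#include <linux/icmp.h>
#include <linux/skbuff.h>

static bool example_icmp_is_error_packet(const struct sk_buff *skb)
{
	const struct icmphdr *icmph = icmp_hdr(skb);

	/* True for DEST_UNREACH, SOURCE_QUENCH, REDIRECT, TIME_EXCEEDED, PARAMETERPROB. */
	return icmp_is_err(icmph->type);
}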
1643 1644 1645 1646 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * acpi.h - ACPI Interface * * Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> */ #ifndef _LINUX_ACPI_H #define _LINUX_ACPI_H #include <linux/cleanup.h> #include <linux/errno.h> #include <linux/ioport.h> /* for struct resource */ #include <linux/resource_ext.h> #include <linux/device.h> #include <linux/mod_devicetable.h> #include <linux/property.h> #include <linux/uuid.h> #include <linux/node.h> struct irq_domain; struct irq_domain_ops; #ifndef _LINUX #define _LINUX #endif #include <acpi/acpi.h> #include <acpi/acpi_numa.h> #ifdef CONFIG_ACPI #include <linux/list.h> #include <linux/dynamic_debug.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/fw_table.h> #include <acpi/acpi_bus.h> #include <acpi/acpi_drivers.h> #include <acpi/acpi_io.h> #include <asm/acpi.h> #ifdef CONFIG_ACPI_TABLE_LIB #define EXPORT_SYMBOL_ACPI_LIB(x) EXPORT_SYMBOL_NS_GPL(x, "ACPI") #define __init_or_acpilib #define __initdata_or_acpilib #else #define EXPORT_SYMBOL_ACPI_LIB(x) #define __init_or_acpilib __init #define __initdata_or_acpilib __initdata #endif static inline acpi_handle acpi_device_handle(struct acpi_device *adev) { return adev ? adev->handle : NULL; } #define ACPI_COMPANION(dev) to_acpi_device_node((dev)->fwnode) #define ACPI_COMPANION_SET(dev, adev) set_primary_fwnode(dev, (adev) ? \ acpi_fwnode_handle(adev) : NULL) #define ACPI_HANDLE(dev) acpi_device_handle(ACPI_COMPANION(dev)) #define ACPI_HANDLE_FWNODE(fwnode) \ acpi_device_handle(to_acpi_device_node(fwnode)) static inline struct fwnode_handle *acpi_alloc_fwnode_static(void) { struct fwnode_handle *fwnode; fwnode = kzalloc_obj(struct fwnode_handle); if (!fwnode) return NULL; fwnode_init(fwnode, &acpi_static_fwnode_ops); return fwnode; } static inline void acpi_free_fwnode_static(struct fwnode_handle *fwnode) { if (WARN_ON(!is_acpi_static_node(fwnode))) return; kfree(fwnode); } static inline bool has_acpi_companion(struct device *dev) { return is_acpi_device_node(dev->fwnode); } static inline void acpi_preset_companion(struct device *dev, struct acpi_device *parent, u64 addr) { ACPI_COMPANION_SET(dev, acpi_find_child_device(parent, addr, false)); } static inline const char *acpi_dev_name(struct acpi_device *adev) { return dev_name(&adev->dev); } struct device *acpi_get_first_physical_node(struct acpi_device *adev); enum acpi_irq_model_id { ACPI_IRQ_MODEL_PIC = 0, ACPI_IRQ_MODEL_IOAPIC, ACPI_IRQ_MODEL_IOSAPIC, ACPI_IRQ_MODEL_PLATFORM, ACPI_IRQ_MODEL_GIC, ACPI_IRQ_MODEL_GIC_V5, ACPI_IRQ_MODEL_LPIC, ACPI_IRQ_MODEL_RINTC, ACPI_IRQ_MODEL_COUNT }; extern enum acpi_irq_model_id acpi_irq_model; enum acpi_interrupt_id { ACPI_INTERRUPT_PMI = 1, ACPI_INTERRUPT_INIT, ACPI_INTERRUPT_CPEI, ACPI_INTERRUPT_COUNT }; #define ACPI_SPACE_MEM 0 enum acpi_address_range_id { ACPI_ADDRESS_RANGE_MEMORY = 1, ACPI_ADDRESS_RANGE_RESERVED = 2, ACPI_ADDRESS_RANGE_ACPI = 3, ACPI_ADDRESS_RANGE_NVS = 4, ACPI_ADDRESS_RANGE_COUNT }; /* Table Handlers */ typedef int (*acpi_tbl_table_handler)(struct acpi_table_header *table); /* Debugger support */ struct acpi_debugger_ops { int (*create_thread)(acpi_osd_exec_callback function, void *context); ssize_t (*write_log)(const char *msg); ssize_t (*read_cmd)(char *buffer, size_t length); int (*wait_command_ready)(bool single_step, char *buffer, size_t length); int (*notify_command_complete)(void); }; struct acpi_debugger { const struct acpi_debugger_ops *ops; struct module *owner; struct mutex lock; }; #ifdef CONFIG_ACPI_DEBUGGER int 
__init acpi_debugger_init(void); int acpi_register_debugger(struct module *owner, const struct acpi_debugger_ops *ops); void acpi_unregister_debugger(const struct acpi_debugger_ops *ops); int acpi_debugger_create_thread(acpi_osd_exec_callback function, void *context); ssize_t acpi_debugger_write_log(const char *msg); ssize_t acpi_debugger_read_cmd(char *buffer, size_t buffer_length); int acpi_debugger_wait_command_ready(void); int acpi_debugger_notify_command_complete(void); #else static inline int acpi_debugger_init(void) { return -ENODEV; } static inline int acpi_register_debugger(struct module *owner, const struct acpi_debugger_ops *ops) { return -ENODEV; } static inline void acpi_unregister_debugger(const struct acpi_debugger_ops *ops) { } static inline int acpi_debugger_create_thread(acpi_osd_exec_callback function, void *context) { return -ENODEV; } static inline int acpi_debugger_write_log(const char *msg) { return -ENODEV; } static inline int acpi_debugger_read_cmd(char *buffer, u32 buffer_length) { return -ENODEV; } static inline int acpi_debugger_wait_command_ready(void) { return -ENODEV; } static inline int acpi_debugger_notify_command_complete(void) { return -ENODEV; } #endif #define BAD_MADT_ENTRY(entry, end) ( \ (!entry) || (unsigned long)entry + sizeof(*entry) > end || \ ((struct acpi_subtable_header *)entry)->length < sizeof(*entry)) void __iomem *__acpi_map_table(unsigned long phys, unsigned long size); void __acpi_unmap_table(void __iomem *map, unsigned long size); int early_acpi_boot_init(void); int acpi_boot_init (void); void acpi_boot_table_prepare (void); void acpi_boot_table_init (void); int acpi_mps_check (void); int acpi_numa_init (void); int acpi_locate_initial_tables (void); void acpi_reserve_initial_tables (void); void acpi_table_init_complete (void); int acpi_table_init (void); static inline struct acpi_table_header *acpi_get_table_pointer(char *signature, u32 instance) { struct acpi_table_header *table; int status = acpi_get_table(signature, instance, &table); if (ACPI_FAILURE(status)) return ERR_PTR(-ENOENT); return table; } DEFINE_FREE(acpi_put_table, struct acpi_table_header *, if (!IS_ERR_OR_NULL(_T)) acpi_put_table(_T)) int acpi_table_parse(char *id, acpi_tbl_table_handler handler); int __init_or_acpilib acpi_table_parse_entries(char *id, unsigned long table_size, int entry_id, acpi_tbl_entry_handler handler, unsigned int max_entries); int __init_or_acpilib acpi_table_parse_entries_array(char *id, unsigned long table_size, struct acpi_subtable_proc *proc, int proc_num, unsigned int max_entries); int acpi_table_parse_madt(enum acpi_madt_type id, acpi_tbl_entry_handler handler, unsigned int max_entries); int __init_or_acpilib acpi_table_parse_cedt(enum acpi_cedt_type id, acpi_tbl_entry_handler_arg handler_arg, void *arg); int acpi_parse_mcfg (struct acpi_table_header *header); void acpi_table_print_madt_entry (struct acpi_subtable_header *madt); #if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH) void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa); #else static inline void acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) { } #endif void acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa); #if defined(CONFIG_ARM64) || defined(CONFIG_LOONGARCH) void acpi_arch_dma_setup(struct device *dev); #else static inline void acpi_arch_dma_setup(struct device *dev) { } #endif #ifdef CONFIG_ARM64 void acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa); #else static inline void 
acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) { } #endif #ifdef CONFIG_RISCV void acpi_numa_rintc_affinity_init(struct acpi_srat_rintc_affinity *pa); #else static inline void acpi_numa_rintc_affinity_init(struct acpi_srat_rintc_affinity *pa) { } #endif #ifndef PHYS_CPUID_INVALID typedef u32 phys_cpuid_t; #define PHYS_CPUID_INVALID (phys_cpuid_t)(-1) #endif static inline bool invalid_logical_cpuid(u32 cpuid) { return (int)cpuid < 0; } static inline bool invalid_phys_cpuid(phys_cpuid_t phys_id) { return phys_id == PHYS_CPUID_INVALID; } int __init acpi_get_madt_revision(void); /* Validate the processor object's proc_id */ bool acpi_duplicate_processor_id(int proc_id); /* Processor _CTS control */ struct acpi_processor_power; #ifdef CONFIG_ACPI_PROCESSOR_CSTATE bool acpi_processor_claim_cst_control(void); int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, struct acpi_processor_power *info); #else static inline bool acpi_processor_claim_cst_control(void) { return false; } static inline int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, struct acpi_processor_power *info) { return -ENODEV; } #endif #ifdef CONFIG_ACPI_HOTPLUG_CPU /* Arch dependent functions for cpu hotplug support */ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id, int *pcpu); int acpi_unmap_cpu(int cpu); #endif /* CONFIG_ACPI_HOTPLUG_CPU */ acpi_handle acpi_get_processor_handle(int cpu); /** * acpi_get_cpu_uid() - Get ACPI Processor UID of from MADT table * @cpu: Logical CPU number (0-based) * @uid: Pointer to store ACPI Processor UID * * Return: 0 on success (ACPI Processor ID stored in *uid); * -EINVAL if CPU number is invalid or out of range; * -ENODEV if ACPI Processor UID for the CPU is not found. */ int acpi_get_cpu_uid(unsigned int cpu, u32 *uid); #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC int acpi_get_ioapic_id(acpi_handle handle, u32 gsi_base, u64 *phys_addr); #endif int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base); int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base); int acpi_ioapic_registered(acpi_handle handle, u32 gsi_base); void acpi_irq_stats_init(void); extern u32 acpi_irq_handled; extern u32 acpi_irq_not_handled; extern unsigned int acpi_sci_irq; extern bool acpi_no_s5; #define INVALID_ACPI_IRQ ((unsigned)-1) static inline bool acpi_sci_irq_valid(void) { return acpi_sci_irq != INVALID_ACPI_IRQ; } extern int sbf_port; int acpi_register_gsi (struct device *dev, u32 gsi, int triggering, int polarity); int acpi_gsi_to_irq (u32 gsi, unsigned int *irq); int acpi_isa_irq_to_gsi (unsigned isa_irq, u32 *gsi); typedef struct fwnode_handle *(*acpi_gsi_domain_disp_fn)(u32); void acpi_set_irq_model(enum acpi_irq_model_id model, acpi_gsi_domain_disp_fn fn); acpi_gsi_domain_disp_fn acpi_get_gsi_dispatcher(void); void acpi_set_gsi_to_irq_fallback(u32 (*)(u32)); struct irq_domain *acpi_irq_create_hierarchy(unsigned int flags, unsigned int size, struct fwnode_handle *fwnode, const struct irq_domain_ops *ops, void *host_data); #ifdef CONFIG_X86_IO_APIC extern int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity); #else static inline int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity) { return -1; } #endif /* * This function undoes the effect of one call to acpi_register_gsi(). * If this matches the last registration, any IRQ resources for gsi * are freed. 
*/ void acpi_unregister_gsi (u32 gsi); struct pci_dev; struct acpi_prt_entry *acpi_pci_irq_lookup(struct pci_dev *dev, int pin); int acpi_pci_irq_enable (struct pci_dev *dev); void acpi_penalize_isa_irq(int irq, int active); bool acpi_isa_irq_available(int irq); #ifdef CONFIG_PCI void acpi_penalize_sci_irq(int irq, int trigger, int polarity); #else static inline void acpi_penalize_sci_irq(int irq, int trigger, int polarity) { } #endif void acpi_pci_irq_disable (struct pci_dev *dev); extern int ec_read(u8 addr, u8 *val); extern int ec_write(u8 addr, u8 val); extern int ec_transaction(u8 command, const u8 *wdata, unsigned wdata_len, u8 *rdata, unsigned rdata_len); extern acpi_handle ec_get_handle(void); extern bool acpi_is_pnp_device(struct acpi_device *); #if defined(CONFIG_ACPI_WMI) || defined(CONFIG_ACPI_WMI_MODULE) typedef void (*wmi_notify_handler) (union acpi_object *data, void *context); int wmi_instance_count(const char *guid); extern acpi_status wmi_evaluate_method(const char *guid, u8 instance, u32 method_id, const struct acpi_buffer *in, struct acpi_buffer *out); extern acpi_status wmi_query_block(const char *guid, u8 instance, struct acpi_buffer *out); extern acpi_status wmi_set_block(const char *guid, u8 instance, const struct acpi_buffer *in); extern acpi_status wmi_install_notify_handler(const char *guid, wmi_notify_handler handler, void *data); extern acpi_status wmi_remove_notify_handler(const char *guid); extern bool wmi_has_guid(const char *guid); extern char *wmi_get_acpi_device_uid(const char *guid); #endif /* CONFIG_ACPI_WMI */ #define ACPI_VIDEO_OUTPUT_SWITCHING 0x0001 #define ACPI_VIDEO_DEVICE_POSTING 0x0002 #define ACPI_VIDEO_ROM_AVAILABLE 0x0004 #define ACPI_VIDEO_BACKLIGHT 0x0008 #define ACPI_VIDEO_BACKLIGHT_FORCE_VENDOR 0x0010 #define ACPI_VIDEO_BACKLIGHT_FORCE_VIDEO 0x0020 #define ACPI_VIDEO_OUTPUT_SWITCHING_FORCE_VENDOR 0x0040 #define ACPI_VIDEO_OUTPUT_SWITCHING_FORCE_VIDEO 0x0080 #define ACPI_VIDEO_BACKLIGHT_DMI_VENDOR 0x0100 #define ACPI_VIDEO_BACKLIGHT_DMI_VIDEO 0x0200 #define ACPI_VIDEO_OUTPUT_SWITCHING_DMI_VENDOR 0x0400 #define ACPI_VIDEO_OUTPUT_SWITCHING_DMI_VIDEO 0x0800 extern char acpi_video_backlight_string[]; extern long acpi_is_video_device(acpi_handle handle); extern void acpi_osi_setup(char *str); extern bool acpi_osi_is_win8(void); #ifdef CONFIG_ACPI_THERMAL_LIB int thermal_acpi_active_trip_temp(struct acpi_device *adev, int id, int *ret_temp); int thermal_acpi_passive_trip_temp(struct acpi_device *adev, int *ret_temp); int thermal_acpi_hot_trip_temp(struct acpi_device *adev, int *ret_temp); int thermal_acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp); #endif #ifdef CONFIG_ACPI_HMAT int acpi_get_genport_coordinates(u32 uid, struct access_coordinate *coord); #else static inline int acpi_get_genport_coordinates(u32 uid, struct access_coordinate *coord) { return -EOPNOTSUPP; } #endif #ifdef CONFIG_ACPI_NUMA int acpi_map_pxm_to_node(int pxm); int acpi_get_node(acpi_handle handle); /** * pxm_to_online_node - Map proximity ID to online node * @pxm: ACPI proximity ID * * This is similar to pxm_to_node(), but always returns an online * node. When the mapped node from a given proximity ID is offline, it * looks up the node distance table and returns the nearest online node. * * ACPI device drivers, which are called after the NUMA initialization has * completed in the kernel, can call this interface to obtain their device * NUMA topology from ACPI tables. Such drivers do not have to deal with * offline nodes. 
A node may be offline when SRAT memory entry does not exist, * or NUMA is disabled, ex. "numa=off" on x86. */ static inline int pxm_to_online_node(int pxm) { int node = pxm_to_node(pxm); return numa_map_to_online_node(node); } #else static inline int pxm_to_online_node(int pxm) { return 0; } static inline int acpi_map_pxm_to_node(int pxm) { return 0; } static inline int acpi_get_node(acpi_handle handle) { return 0; } #endif extern int pnpacpi_disabled; #define PXM_INVAL (-1) bool acpi_dev_resource_memory(struct acpi_resource *ares, struct resource *res); bool acpi_dev_resource_io(struct acpi_resource *ares, struct resource *res); bool acpi_dev_resource_address_space(struct acpi_resource *ares, struct resource_win *win); bool acpi_dev_resource_ext_address_space(struct acpi_resource *ares, struct resource_win *win); unsigned long acpi_dev_irq_flags(u8 triggering, u8 polarity, u8 shareable, u8 wake_capable); unsigned int acpi_dev_get_irq_type(int triggering, int polarity); bool acpi_dev_resource_interrupt(struct acpi_resource *ares, int index, struct resource *res); void acpi_dev_free_resource_list(struct list_head *list); int acpi_dev_get_resources(struct acpi_device *adev, struct list_head *list, int (*preproc)(struct acpi_resource *, void *), void *preproc_data); int acpi_dev_get_dma_resources(struct acpi_device *adev, struct list_head *list); int acpi_dev_get_memory_resources(struct acpi_device *adev, struct list_head *list); int acpi_dev_filter_resource_type(struct acpi_resource *ares, unsigned long types); static inline int acpi_dev_filter_resource_type_cb(struct acpi_resource *ares, void *arg) { return acpi_dev_filter_resource_type(ares, (unsigned long)arg); } struct acpi_device *acpi_resource_consumer(struct resource *res); int acpi_check_resource_conflict(const struct resource *res); int acpi_check_region(resource_size_t start, resource_size_t n, const char *name); int acpi_resources_are_enforced(void); #ifdef CONFIG_HIBERNATION extern int acpi_check_s4_hw_signature; #endif #ifdef CONFIG_PM_SLEEP void __init acpi_old_suspend_ordering(void); void __init acpi_nvs_nosave(void); void __init acpi_nvs_nosave_s3(void); void __init acpi_sleep_no_blacklist(void); #endif /* CONFIG_PM_SLEEP */ int acpi_register_wakeup_handler( int wake_irq, bool (*wakeup)(void *context), void *context); void acpi_unregister_wakeup_handler( bool (*wakeup)(void *context), void *context); struct acpi_osc_context { char *uuid_str; /* UUID string */ int rev; struct acpi_buffer cap; /* list of DWORD capabilities */ struct acpi_buffer ret; /* free by caller if success */ }; acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context); /* Number of _OSC capability DWORDS depends on bridge type */ #define OSC_PCI_CAPABILITY_DWORDS 3 #define OSC_CXL_CAPABILITY_DWORDS 5 /* Indexes into _OSC Capabilities Buffer (DWORDs 2 to 5 are device-specific) */ #define OSC_QUERY_DWORD 0 /* DWORD 1 */ #define OSC_SUPPORT_DWORD 1 /* DWORD 2 */ #define OSC_CONTROL_DWORD 2 /* DWORD 3 */ #define OSC_EXT_SUPPORT_DWORD 3 /* DWORD 4 */ #define OSC_EXT_CONTROL_DWORD 4 /* DWORD 5 */ /* _OSC Capabilities DWORD 1: Query/Control and Error Returns (generic) */ #define OSC_QUERY_ENABLE 0x00000001 /* input */ #define OSC_REQUEST_ERROR 0x00000002 /* return */ #define OSC_INVALID_UUID_ERROR 0x00000004 /* return */ #define OSC_INVALID_REVISION_ERROR 0x00000008 /* return */ #define OSC_CAPABILITIES_MASK_ERROR 0x00000010 /* return */ /* Platform-Wide Capabilities _OSC: Capabilities DWORD 2: Support Field */ #define OSC_SB_PAD_SUPPORT 
0x00000001 #define OSC_SB_PPC_OST_SUPPORT 0x00000002 #define OSC_SB_PR3_SUPPORT 0x00000004 #define OSC_SB_HOTPLUG_OST_SUPPORT 0x00000008 #define OSC_SB_APEI_SUPPORT 0x00000010 #define OSC_SB_CPC_SUPPORT 0x00000020 #define OSC_SB_CPCV2_SUPPORT 0x00000040 #define OSC_SB_PCLPI_SUPPORT 0x00000080 #define OSC_SB_OSLPI_SUPPORT 0x00000100 #define OSC_SB_FAST_THERMAL_SAMPLING_SUPPORT 0x00000200 #define OSC_SB_OVER_16_PSTATES_SUPPORT 0x00000400 #define OSC_SB_GED_SUPPORT 0x00000800 #define OSC_SB_CPC_DIVERSE_HIGH_SUPPORT 0x00001000 #define OSC_SB_IRQ_RESOURCE_SOURCE_SUPPORT 0x00002000 #define OSC_SB_CPC_FLEXIBLE_ADR_SPACE 0x00004000 #define OSC_SB_GENERIC_INITIATOR_SUPPORT 0x00020000 #define OSC_SB_NATIVE_USB4_SUPPORT 0x00040000 #define OSC_SB_BATTERY_CHARGE_LIMITING_SUPPORT 0x00080000 #define OSC_SB_PRM_SUPPORT 0x00200000 #define OSC_SB_FFH_OPR_SUPPORT 0x00400000 extern bool osc_sb_apei_support_acked; extern bool osc_pc_lpi_support_confirmed; extern bool osc_sb_native_usb4_support_confirmed; extern bool osc_sb_cppc2_support_acked; extern bool osc_cpc_flexible_adr_space_confirmed; /* USB4 Capabilities */ #define OSC_USB_USB3_TUNNELING 0x00000001 #define OSC_USB_DP_TUNNELING 0x00000002 #define OSC_USB_PCIE_TUNNELING 0x00000004 #define OSC_USB_XDOMAIN 0x00000008 extern u32 osc_sb_native_usb4_control; /* PCI Host Bridge _OSC: Capabilities DWORD 2: Support Field */ #define OSC_PCI_EXT_CONFIG_SUPPORT 0x00000001 #define OSC_PCI_ASPM_SUPPORT 0x00000002 #define OSC_PCI_CLOCK_PM_SUPPORT 0x00000004 #define OSC_PCI_SEGMENT_GROUPS_SUPPORT 0x00000008 #define OSC_PCI_MSI_SUPPORT 0x00000010 #define OSC_PCI_EDR_SUPPORT 0x00000080 #define OSC_PCI_HPX_TYPE_3_SUPPORT 0x00000100 /* PCI Host Bridge _OSC: Capabilities DWORD 3: Control Field */ #define OSC_PCI_EXPRESS_NATIVE_HP_CONTROL 0x00000001 #define OSC_PCI_SHPC_NATIVE_HP_CONTROL 0x00000002 #define OSC_PCI_EXPRESS_PME_CONTROL 0x00000004 #define OSC_PCI_EXPRESS_AER_CONTROL 0x00000008 #define OSC_PCI_EXPRESS_CAPABILITY_CONTROL 0x00000010 #define OSC_PCI_EXPRESS_LTR_CONTROL 0x00000020 #define OSC_PCI_EXPRESS_DPC_CONTROL 0x00000080 /* CXL _OSC: Capabilities DWORD 4: Support Field */ #define OSC_CXL_1_1_PORT_REG_ACCESS_SUPPORT 0x00000001 #define OSC_CXL_2_0_PORT_DEV_REG_ACCESS_SUPPORT 0x00000002 #define OSC_CXL_PROTOCOL_ERR_REPORTING_SUPPORT 0x00000004 #define OSC_CXL_NATIVE_HP_SUPPORT 0x00000008 /* CXL _OSC: Capabilities DWORD 5: Control Field */ #define OSC_CXL_ERROR_REPORTING_CONTROL 0x00000001 static inline u32 acpi_osc_ctx_get_pci_control(struct acpi_osc_context *context) { u32 *ret = context->ret.pointer; return ret[OSC_CONTROL_DWORD]; } static inline u32 acpi_osc_ctx_get_cxl_control(struct acpi_osc_context *context) { u32 *ret = context->ret.pointer; return ret[OSC_EXT_CONTROL_DWORD]; } #define ACPI_GSB_ACCESS_ATTRIB_QUICK 0x00000002 #define ACPI_GSB_ACCESS_ATTRIB_SEND_RCV 0x00000004 #define ACPI_GSB_ACCESS_ATTRIB_BYTE 0x00000006 #define ACPI_GSB_ACCESS_ATTRIB_WORD 0x00000008 #define ACPI_GSB_ACCESS_ATTRIB_BLOCK 0x0000000A #define ACPI_GSB_ACCESS_ATTRIB_MULTIBYTE 0x0000000B #define ACPI_GSB_ACCESS_ATTRIB_WORD_CALL 0x0000000C #define ACPI_GSB_ACCESS_ATTRIB_BLOCK_CALL 0x0000000D #define ACPI_GSB_ACCESS_ATTRIB_RAW_BYTES 0x0000000E #define ACPI_GSB_ACCESS_ATTRIB_RAW_PROCESS 0x0000000F /* Enable _OST when all relevant hotplug operations are enabled */ #if defined(CONFIG_ACPI_HOTPLUG_CPU) && \ defined(CONFIG_ACPI_HOTPLUG_MEMORY) && \ defined(CONFIG_ACPI_CONTAINER) #define ACPI_HOTPLUG_OST #endif /* _OST Source Event Code (OSPM Action) */ #define 
ACPI_OST_EC_OSPM_SHUTDOWN 0x100 #define ACPI_OST_EC_OSPM_EJECT 0x103 #define ACPI_OST_EC_OSPM_INSERTION 0x200 /* _OST General Processing Status Code */ #define ACPI_OST_SC_SUCCESS 0x0 #define ACPI_OST_SC_NON_SPECIFIC_FAILURE 0x1 #define ACPI_OST_SC_UNRECOGNIZED_NOTIFY 0x2 /* _OST OS Shutdown Processing (0x100) Status Code */ #define ACPI_OST_SC_OS_SHUTDOWN_DENIED 0x80 #define ACPI_OST_SC_OS_SHUTDOWN_IN_PROGRESS 0x81 #define ACPI_OST_SC_OS_SHUTDOWN_COMPLETED 0x82 #define ACPI_OST_SC_OS_SHUTDOWN_NOT_SUPPORTED 0x83 /* _OST Ejection Request (0x3, 0x103) Status Code */ #define ACPI_OST_SC_EJECT_NOT_SUPPORTED 0x80 #define ACPI_OST_SC_DEVICE_IN_USE 0x81 #define ACPI_OST_SC_DEVICE_BUSY 0x82 #define ACPI_OST_SC_EJECT_DEPENDENCY_BUSY 0x83 #define ACPI_OST_SC_EJECT_IN_PROGRESS 0x84 /* _OST Insertion Request (0x200) Status Code */ #define ACPI_OST_SC_INSERT_IN_PROGRESS 0x80 #define ACPI_OST_SC_DRIVER_LOAD_FAILURE 0x81 #define ACPI_OST_SC_INSERT_NOT_SUPPORTED 0x82 enum acpi_predicate { all_versions, less_than_or_equal, equal, greater_than_or_equal, }; /* Table must be terminted by a NULL entry */ struct acpi_platform_list { char oem_id[ACPI_OEM_ID_SIZE+1]; char oem_table_id[ACPI_OEM_TABLE_ID_SIZE+1]; u32 oem_revision; char *table; enum acpi_predicate pred; char *reason; u32 data; }; int acpi_match_platform_list(const struct acpi_platform_list *plat); extern void acpi_early_init(void); extern void acpi_subsystem_init(void); extern int acpi_nvs_register(__u64 start, __u64 size); extern int acpi_nvs_for_each_region(int (*func)(__u64, __u64, void *), void *data); const struct acpi_device_id *acpi_match_acpi_device(const struct acpi_device_id *ids, const struct acpi_device *adev); const struct acpi_device_id *acpi_match_device(const struct acpi_device_id *ids, const struct device *dev); const void *acpi_device_get_match_data(const struct device *dev); extern bool acpi_driver_match_device(struct device *dev, const struct device_driver *drv); int acpi_device_uevent_modalias(const struct device *, struct kobj_uevent_env *); int acpi_device_modalias(struct device *, char *, int); struct platform_device *acpi_create_platform_device(struct acpi_device *, const struct property_entry *); #define ACPI_PTR(_ptr) (_ptr) static inline void acpi_device_set_enumerated(struct acpi_device *adev) { adev->flags.visited = true; } static inline void acpi_device_clear_enumerated(struct acpi_device *adev) { adev->flags.visited = false; } enum acpi_reconfig_event { ACPI_RECONFIG_DEVICE_ADD = 0, ACPI_RECONFIG_DEVICE_REMOVE, }; int acpi_reconfig_notifier_register(struct notifier_block *nb); int acpi_reconfig_notifier_unregister(struct notifier_block *nb); #ifdef CONFIG_ACPI_GTDT int acpi_gtdt_init(struct acpi_table_header *table, int *platform_timer_count); int acpi_gtdt_map_ppi(int type); bool acpi_gtdt_c3stop(int type); #endif #ifndef ACPI_HAVE_ARCH_SET_ROOT_POINTER static __always_inline void acpi_arch_set_root_pointer(u64 addr) { } #endif #ifndef ACPI_HAVE_ARCH_GET_ROOT_POINTER static __always_inline u64 acpi_arch_get_root_pointer(void) { return 0; } #endif int acpi_get_local_u64_address(acpi_handle handle, u64 *addr); int acpi_get_local_address(acpi_handle handle, u32 *addr); const char *acpi_get_subsystem_id(acpi_handle handle); #ifdef CONFIG_ACPI_MRRM int acpi_mrrm_max_mem_region(void); #endif #define ACPI_CMOS_RTC_IDS \ { "PNP0B00", }, \ { "PNP0B01", }, \ { "PNP0B02", }, \ { "", } extern bool cmos_rtc_platform_device_present; #else /* !CONFIG_ACPI */ #define acpi_disabled 1 #define ACPI_COMPANION(dev) (NULL) #define 
ACPI_COMPANION_SET(dev, adev) do { } while (0) #define ACPI_HANDLE(dev) (NULL) #define ACPI_HANDLE_FWNODE(fwnode) (NULL) /* Get rid of the -Wunused-variable for adev */ #define acpi_dev_uid_match(adev, uid2) (adev && false) #define acpi_dev_hid_uid_match(adev, hid2, uid2) (adev && false) struct fwnode_handle; static inline bool acpi_dev_found(const char *hid) { return false; } static inline bool acpi_dev_present(const char *hid, const char *uid, s64 hrv) { return false; } struct acpi_device; static inline int acpi_dev_uid_to_integer(struct acpi_device *adev, u64 *integer) { return -ENODEV; } static inline struct acpi_device * acpi_dev_get_first_match_dev(const char *hid, const char *uid, s64 hrv) { return NULL; } static inline bool acpi_reduced_hardware(void) { return false; } static inline void acpi_dev_put(struct acpi_device *adev) {} static inline bool is_acpi_node(const struct fwnode_handle *fwnode) { return false; } static inline bool is_acpi_device_node(const struct fwnode_handle *fwnode) { return false; } static inline struct acpi_device *to_acpi_device_node(const struct fwnode_handle *fwnode) { return NULL; } static inline bool is_acpi_data_node(const struct fwnode_handle *fwnode) { return false; } static inline struct acpi_data_node *to_acpi_data_node(const struct fwnode_handle *fwnode) { return NULL; } static inline bool acpi_data_node_match(const struct fwnode_handle *fwnode, const char *name) { return false; } static inline struct fwnode_handle *acpi_fwnode_handle(struct acpi_device *adev) { return NULL; } static inline acpi_handle acpi_device_handle(struct acpi_device *adev) { return NULL; } static inline bool has_acpi_companion(struct device *dev) { return false; } static inline void acpi_preset_companion(struct device *dev, struct acpi_device *parent, u64 addr) { } static inline const char *acpi_dev_name(struct acpi_device *adev) { return NULL; } static inline struct device *acpi_get_first_physical_node(struct acpi_device *adev) { return NULL; } static inline void acpi_early_init(void) { } static inline void acpi_subsystem_init(void) { } static inline int early_acpi_boot_init(void) { return 0; } static inline int acpi_boot_init(void) { return 0; } static inline void acpi_boot_table_prepare(void) { } static inline void acpi_boot_table_init(void) { } static inline int acpi_mps_check(void) { return 0; } static inline int acpi_check_resource_conflict(struct resource *res) { return 0; } static inline int acpi_check_region(resource_size_t start, resource_size_t n, const char *name) { return 0; } struct acpi_table_header; static inline int acpi_table_parse(char *id, int (*handler)(struct acpi_table_header *)) { return -ENODEV; } static inline int acpi_nvs_register(__u64 start, __u64 size) { return 0; } static inline int acpi_nvs_for_each_region(int (*func)(__u64, __u64, void *), void *data) { return 0; } struct acpi_device_id; static inline const struct acpi_device_id *acpi_match_acpi_device( const struct acpi_device_id *ids, const struct acpi_device *adev) { return NULL; } static inline const struct acpi_device_id *acpi_match_device( const struct acpi_device_id *ids, const struct device *dev) { return NULL; } static inline const void *acpi_device_get_match_data(const struct device *dev) { return NULL; } static inline bool acpi_driver_match_device(struct device *dev, const struct device_driver *drv) { return false; } static inline bool acpi_check_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 funcs) { return false; } static inline union acpi_object 
*acpi_evaluate_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 func, union acpi_object *argv4) { return NULL; } static inline union acpi_object *acpi_evaluate_dsm_typed(acpi_handle handle, const guid_t *guid, u64 rev, u64 func, union acpi_object *argv4, acpi_object_type type) { return NULL; } static inline int acpi_device_uevent_modalias(const struct device *dev, struct kobj_uevent_env *env) { return -ENODEV; } static inline int acpi_device_modalias(struct device *dev, char *buf, int size) { return -ENODEV; } static inline struct platform_device * acpi_create_platform_device(struct acpi_device *adev, const struct property_entry *properties) { return NULL; } static inline bool acpi_dma_supported(const struct acpi_device *adev) { return false; } static inline enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev) { return DEV_DMA_NOT_SUPPORTED; } static inline int acpi_dma_get_range(struct device *dev, const struct bus_dma_region **map) { return -ENODEV; } static inline int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr) { return 0; } static inline int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr, const u32 *input_id) { return 0; } #define ACPI_PTR(_ptr) (NULL) static inline void acpi_device_set_enumerated(struct acpi_device *adev) { } static inline void acpi_device_clear_enumerated(struct acpi_device *adev) { } static inline int acpi_reconfig_notifier_register(struct notifier_block *nb) { return -EINVAL; } static inline int acpi_reconfig_notifier_unregister(struct notifier_block *nb) { return -EINVAL; } static inline struct acpi_device *acpi_resource_consumer(struct resource *res) { return NULL; } static inline int acpi_get_local_address(acpi_handle handle, u32 *addr) { return -ENODEV; } static inline const char *acpi_get_subsystem_id(acpi_handle handle) { return ERR_PTR(-ENODEV); } static inline int acpi_register_wakeup_handler(int wake_irq, bool (*wakeup)(void *context), void *context) { return -ENXIO; } static inline void acpi_unregister_wakeup_handler( bool (*wakeup)(void *context), void *context) { } struct acpi_osc_context; static inline u32 acpi_osc_ctx_get_pci_control(struct acpi_osc_context *context) { return 0; } static inline u32 acpi_osc_ctx_get_cxl_control(struct acpi_osc_context *context) { return 0; } static inline bool acpi_sleep_state_supported(u8 sleep_state) { return false; } static inline acpi_handle acpi_get_processor_handle(int cpu) { return NULL; } static inline int acpi_mrrm_max_mem_region(void) { return 1; } #define cmos_rtc_platform_device_present false #endif /* !CONFIG_ACPI */ #ifdef CONFIG_ACPI_HMAT int hmat_get_extended_linear_cache_size(struct resource *backing_res, int nid, resource_size_t *size); #else static inline int hmat_get_extended_linear_cache_size(struct resource *backing_res, int nid, resource_size_t *size) { return -EOPNOTSUPP; } #endif extern void arch_post_acpi_subsys_init(void); #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC int acpi_ioapic_add(acpi_handle root); #else static inline int acpi_ioapic_add(acpi_handle root) { return 0; } #endif #ifdef CONFIG_ACPI void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state, u32 pm1a_ctrl, u32 pm1b_ctrl)); acpi_status acpi_os_prepare_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control); void acpi_os_set_prepare_extended_sleep(int (*func)(u8 sleep_state, u32 val_a, u32 val_b)); acpi_status acpi_os_prepare_extended_sleep(u8 sleep_state, u32 val_a, u32 val_b); struct acpi_s2idle_dev_ops { struct list_head list_node; void (*prepare)(void); void (*check)(void); void 
(*restore)(void); }; #if defined(CONFIG_SUSPEND) && defined(CONFIG_X86) int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg); void acpi_unregister_lps0_dev(struct acpi_s2idle_dev_ops *arg); #else /* CONFIG_SUSPEND && CONFIG_X86 */ static inline int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg) { return -ENODEV; } static inline void acpi_unregister_lps0_dev(struct acpi_s2idle_dev_ops *arg) { } #endif /* CONFIG_SUSPEND && CONFIG_X86 */ void arch_reserve_mem_area(acpi_physical_address addr, size_t size); #else #define acpi_os_set_prepare_sleep(func, pm1a_ctrl, pm1b_ctrl) do { } while (0) #endif #if defined(CONFIG_ACPI) && defined(CONFIG_PM) int acpi_dev_suspend(struct device *dev, bool wakeup); int acpi_dev_resume(struct device *dev); int acpi_subsys_runtime_suspend(struct device *dev); int acpi_subsys_runtime_resume(struct device *dev); int acpi_dev_pm_attach(struct device *dev, bool power_on); bool acpi_storage_d3(struct device *dev); bool acpi_dev_state_d0(struct device *dev); #else static inline int acpi_subsys_runtime_suspend(struct device *dev) { return 0; } static inline int acpi_subsys_runtime_resume(struct device *dev) { return 0; } static inline int acpi_dev_pm_attach(struct device *dev, bool power_on) { return 0; } static inline bool acpi_storage_d3(struct device *dev) { return false; } static inline bool acpi_dev_state_d0(struct device *dev) { return true; } #endif #if defined(CONFIG_ACPI) && defined(CONFIG_PM_SLEEP) int acpi_subsys_prepare(struct device *dev); void acpi_subsys_complete(struct device *dev); int acpi_subsys_suspend_late(struct device *dev); int acpi_subsys_suspend_noirq(struct device *dev); int acpi_subsys_suspend(struct device *dev); int acpi_subsys_freeze(struct device *dev); int acpi_subsys_poweroff(struct device *dev); int acpi_subsys_restore_early(struct device *dev); #else static inline int acpi_subsys_prepare(struct device *dev) { return 0; } static inline void acpi_subsys_complete(struct device *dev) {} static inline int acpi_subsys_suspend_late(struct device *dev) { return 0; } static inline int acpi_subsys_suspend_noirq(struct device *dev) { return 0; } static inline int acpi_subsys_suspend(struct device *dev) { return 0; } static inline int acpi_subsys_freeze(struct device *dev) { return 0; } static inline int acpi_subsys_poweroff(struct device *dev) { return 0; } static inline int acpi_subsys_restore_early(struct device *dev) { return 0; } #endif #if defined(CONFIG_ACPI_EC) && defined(CONFIG_PM_SLEEP) void acpi_ec_mark_gpe_for_wake(void); void acpi_ec_set_gpe_wake_mask(u8 action); #else static inline void acpi_ec_mark_gpe_for_wake(void) {} static inline void acpi_ec_set_gpe_wake_mask(u8 action) {} #endif #ifdef CONFIG_ACPI char *acpi_handle_path(acpi_handle handle); __printf(3, 4) void acpi_handle_printk(const char *level, acpi_handle handle, const char *fmt, ...); void acpi_evaluation_failure_warn(acpi_handle handle, const char *name, acpi_status status); #else /* !CONFIG_ACPI */ static inline __printf(3, 4) void acpi_handle_printk(const char *level, void *handle, const char *fmt, ...) 
{} static inline void acpi_evaluation_failure_warn(acpi_handle handle, const char *name, acpi_status status) {} #endif /* !CONFIG_ACPI */ #if defined(CONFIG_ACPI) && defined(CONFIG_DYNAMIC_DEBUG) __printf(3, 4) void __acpi_handle_debug(struct _ddebug *descriptor, acpi_handle handle, const char *fmt, ...); #endif /* * acpi_handle_<level>: Print message with ACPI prefix and object path * * These interfaces acquire the global namespace mutex to obtain an object * path. In interrupt context, it shows the object path as <n/a>. */ #define acpi_handle_emerg(handle, fmt, ...) \ acpi_handle_printk(KERN_EMERG, handle, fmt, ##__VA_ARGS__) #define acpi_handle_alert(handle, fmt, ...) \ acpi_handle_printk(KERN_ALERT, handle, fmt, ##__VA_ARGS__) #define acpi_handle_crit(handle, fmt, ...) \ acpi_handle_printk(KERN_CRIT, handle, fmt, ##__VA_ARGS__) #define acpi_handle_err(handle, fmt, ...) \ acpi_handle_printk(KERN_ERR, handle, fmt, ##__VA_ARGS__) #define acpi_handle_warn(handle, fmt, ...) \ acpi_handle_printk(KERN_WARNING, handle, fmt, ##__VA_ARGS__) #define acpi_handle_notice(handle, fmt, ...) \ acpi_handle_printk(KERN_NOTICE, handle, fmt, ##__VA_ARGS__) #define acpi_handle_info(handle, fmt, ...) \ acpi_handle_printk(KERN_INFO, handle, fmt, ##__VA_ARGS__) #if defined(DEBUG) #define acpi_handle_debug(handle, fmt, ...) \ acpi_handle_printk(KERN_DEBUG, handle, fmt, ##__VA_ARGS__) #else #if defined(CONFIG_DYNAMIC_DEBUG) #define acpi_handle_debug(handle, fmt, ...) \ _dynamic_func_call(fmt, __acpi_handle_debug, \ handle, pr_fmt(fmt), ##__VA_ARGS__) #else #define acpi_handle_debug(handle, fmt, ...) \ ({ \ if (0) \ acpi_handle_printk(KERN_DEBUG, handle, fmt, ##__VA_ARGS__); \ 0; \ }) #endif #endif #if defined(CONFIG_ACPI) && defined(CONFIG_GPIOLIB) bool acpi_gpio_get_irq_resource(struct acpi_resource *ares, struct acpi_resource_gpio **agpio); bool acpi_gpio_get_io_resource(struct acpi_resource *ares, struct acpi_resource_gpio **agpio); int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *con_id, int index, bool *wake_capable); #else static inline bool acpi_gpio_get_irq_resource(struct acpi_resource *ares, struct acpi_resource_gpio **agpio) { return false; } static inline bool acpi_gpio_get_io_resource(struct acpi_resource *ares, struct acpi_resource_gpio **agpio) { return false; } static inline int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *con_id, int index, bool *wake_capable) { return -ENXIO; } #endif static inline int acpi_dev_gpio_irq_wake_get(struct acpi_device *adev, int index, bool *wake_capable) { return acpi_dev_gpio_irq_wake_get_by(adev, NULL, index, wake_capable); } static inline int acpi_dev_gpio_irq_get_by(struct acpi_device *adev, const char *con_id, int index) { return acpi_dev_gpio_irq_wake_get_by(adev, con_id, index, NULL); } static inline int acpi_dev_gpio_irq_get(struct acpi_device *adev, int index) { return acpi_dev_gpio_irq_wake_get_by(adev, NULL, index, NULL); } /* Device properties */ #ifdef CONFIG_ACPI int acpi_dev_get_property(const struct acpi_device *adev, const char *name, acpi_object_type type, const union acpi_object **obj); int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode, const char *name, size_t index, size_t num_args, struct fwnode_reference_args *args); static inline int acpi_node_get_property_reference( const struct fwnode_handle *fwnode, const char *name, size_t index, struct fwnode_reference_args *args) { return __acpi_node_get_property_reference(fwnode, name, index, NR_FWNODE_REFERENCE_ARGS, args); } 
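A hedged sketch of the property-reference wrapper defined just above: it resolves a reference-type _DSD property into a struct fwnode_reference_args. The "companion-dev" property name and the consumer function are illustrative; acpi_fwnode_handle() is the companion-to-fwnode accessor already used by ACPI_COMPANION_SET() earlier in this header.

static int example_get_companion_ref(struct acpi_device *adev)
{
	struct fwnode_reference_args args;
	int ret;

	ret = acpi_node_get_property_reference(acpi_fwnode_handle(adev),
					       "companion-dev", 0, &args);
	if (ret)
		return ret;

	/* args.fwnode now refers to the referenced firmware node. */
	fwnode_handle_put(args.fwnode);
	return 0;
}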
static inline bool acpi_dev_has_props(const struct acpi_device *adev) { return !list_empty(&adev->data.properties); } struct acpi_device_properties * acpi_data_add_props(struct acpi_device_data *data, const guid_t *guid, union acpi_object *properties); int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname, void **valptr); struct acpi_probe_entry; typedef bool (*acpi_probe_entry_validate_subtbl)(struct acpi_subtable_header *, struct acpi_probe_entry *); #define ACPI_TABLE_ID_LEN 5 /** * struct acpi_probe_entry - boot-time probing entry * @id: ACPI table name * @type: Optional subtable type to match * (if @id contains subtables) * @subtable_valid: Optional callback to check the validity of * the subtable * @probe_table: Callback to the driver being probed when table * match is successful * @probe_subtbl: Callback to the driver being probed when table and * subtable match (and optional callback is successful) * @driver_data: Sideband data provided back to the driver */ struct acpi_probe_entry { __u8 id[ACPI_TABLE_ID_LEN]; __u8 type; acpi_probe_entry_validate_subtbl subtable_valid; union { acpi_tbl_table_handler probe_table; acpi_tbl_entry_handler probe_subtbl; }; kernel_ulong_t driver_data; }; void arch_sort_irqchip_probe(struct acpi_probe_entry *ap_head, int nr); #define ACPI_DECLARE_PROBE_ENTRY(table, name, table_id, subtable, \ valid, data, fn) \ static const struct acpi_probe_entry __acpi_probe_##name \ __used __section("__" #table "_acpi_probe_table") = { \ .id = table_id, \ .type = subtable, \ .subtable_valid = valid, \ .probe_table = fn, \ .driver_data = data, \ } #define ACPI_DECLARE_SUBTABLE_PROBE_ENTRY(table, name, table_id, \ subtable, valid, data, fn) \ static const struct acpi_probe_entry __acpi_probe_##name \ __used __section("__" #table "_acpi_probe_table") = { \ .id = table_id, \ .type = subtable, \ .subtable_valid = valid, \ .probe_subtbl = fn, \ .driver_data = data, \ } #define ACPI_PROBE_TABLE(name) __##name##_acpi_probe_table #define ACPI_PROBE_TABLE_END(name) __##name##_acpi_probe_table_end int __acpi_probe_device_table(struct acpi_probe_entry *start, int nr); #define acpi_probe_device_table(t) \ ({ \ extern struct acpi_probe_entry ACPI_PROBE_TABLE(t), \ ACPI_PROBE_TABLE_END(t); \ __acpi_probe_device_table(&ACPI_PROBE_TABLE(t), \ (&ACPI_PROBE_TABLE_END(t) - \ &ACPI_PROBE_TABLE(t))); \ }) #else static inline int acpi_dev_get_property(struct acpi_device *adev, const char *name, acpi_object_type type, const union acpi_object **obj) { return -ENXIO; } static inline int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode, const char *name, size_t index, size_t num_args, struct fwnode_reference_args *args) { return -ENXIO; } static inline int acpi_node_get_property_reference(const struct fwnode_handle *fwnode, const char *name, size_t index, struct fwnode_reference_args *args) { return -ENXIO; } static inline int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname, void **valptr) { return -ENXIO; } static inline struct fwnode_handle * acpi_graph_get_next_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle *prev) { return ERR_PTR(-ENXIO); } static inline int acpi_graph_get_remote_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle **remote, struct fwnode_handle **port, struct fwnode_handle **endpoint) { return -ENXIO; } #define ACPI_DECLARE_PROBE_ENTRY(table, name, table_id, subtable, valid, data, fn) \ static const void * __acpi_table_##name[] \ __attribute__((unused)) \ = { (void *) 
table_id, \ (void *) subtable, \ (void *) valid, \ (void *) fn, \ (void *) data } #define acpi_probe_device_table(t) ({ int __r = 0; __r;}) #endif #ifdef CONFIG_ACPI_TABLE_UPGRADE void acpi_table_upgrade(void); #else static inline void acpi_table_upgrade(void) { } #endif #if defined(CONFIG_ACPI) && defined(CONFIG_ACPI_WATCHDOG) extern bool acpi_has_watchdog(void); #else static inline bool acpi_has_watchdog(void) { return false; } #endif #ifdef CONFIG_ACPI_SPCR_TABLE extern bool qdf2400_e44_present; int acpi_parse_spcr(bool enable_earlycon, bool enable_console); #else static inline int acpi_parse_spcr(bool enable_earlycon, bool enable_console) { return -ENODEV; } #endif #if IS_ENABLED(CONFIG_ACPI_GENERIC_GSI) int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res); const struct cpumask *acpi_irq_get_affinity(acpi_handle handle, unsigned int index); #else static inline int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res) { return -EINVAL; } static inline const struct cpumask *acpi_irq_get_affinity(acpi_handle handle, unsigned int index) { return NULL; } #endif #ifdef CONFIG_ACPI_LPIT int lpit_read_residency_count_address(u64 *address); #else static inline int lpit_read_residency_count_address(u64 *address) { return -EINVAL; } #endif #ifdef CONFIG_ACPI_PROCESSOR_IDLE #ifndef arch_get_idle_state_flags static inline unsigned int arch_get_idle_state_flags(u32 arch_flags) { return 0; } #endif #endif /* CONFIG_ACPI_PROCESSOR_IDLE */ #ifdef CONFIG_ACPI_PPTT int acpi_pptt_cpu_is_thread(unsigned int cpu); int find_acpi_cpu_topology(unsigned int cpu, int level); int find_acpi_cpu_topology_cluster(unsigned int cpu); int find_acpi_cpu_topology_package(unsigned int cpu); int find_acpi_cpu_topology_hetero_id(unsigned int cpu); void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus); int find_acpi_cache_level_from_id(u32 cache_id); int acpi_pptt_get_cpumask_from_cache_id(u32 cache_id, cpumask_t *cpus); #else static inline int acpi_pptt_cpu_is_thread(unsigned int cpu) { return -EINVAL; } static inline int find_acpi_cpu_topology(unsigned int cpu, int level) { return -EINVAL; } static inline int find_acpi_cpu_topology_cluster(unsigned int cpu) { return -EINVAL; } static inline int find_acpi_cpu_topology_package(unsigned int cpu) { return -EINVAL; } static inline int find_acpi_cpu_topology_hetero_id(unsigned int cpu) { return -EINVAL; } static inline void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus) { } static inline int find_acpi_cache_level_from_id(u32 cache_id) { return -ENOENT; } static inline int acpi_pptt_get_cpumask_from_cache_id(u32 cache_id, cpumask_t *cpus) { return -ENOENT; } #endif void acpi_arch_init(void); #ifdef CONFIG_ACPI_PCC void acpi_init_pcc(void); #else static inline void acpi_init_pcc(void) { } #endif #ifdef CONFIG_ACPI_FFH void acpi_init_ffh(void); extern int acpi_ffh_address_space_arch_setup(void *handler_ctxt, void **region_ctxt); extern int acpi_ffh_address_space_arch_handler(acpi_integer *value, void *region_context); #else static inline void acpi_init_ffh(void) { } #endif #ifdef CONFIG_ACPI extern void acpi_device_notify(struct device *dev); extern void acpi_device_notify_remove(struct device *dev); #else static inline void acpi_device_notify(struct device *dev) { } static inline void acpi_device_notify_remove(struct device *dev) { } #endif static inline void acpi_use_parent_companion(struct device *dev) { ACPI_COMPANION_SET(dev, ACPI_COMPANION(dev->parent)); } #ifdef CONFIG_ACPI_NUMA bool 
acpi_node_backed_by_real_pxm(int nid); #else static inline bool acpi_node_backed_by_real_pxm(int nid) { return false; } #endif #endif /*_LINUX_ACPI_H*/
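/*
 * A minimal usage sketch of the boot-time probe machinery declared above,
 * assuming a hypothetical "timer" probe table consumed by arch setup code.
 * The callback, function names and the choice of ACPI_SIG_GTDT are
 * illustrative assumptions, not taken from an existing driver.
 */
static int __init example_gtdt_probe(struct acpi_table_header *table)
{
	/* parse the table and set up the device here */
	return 0;
}

/*
 * Emits a struct acpi_probe_entry into the "__timer_acpi_probe_table"
 * linker section, keyed on the table signature.
 */
ACPI_DECLARE_PROBE_ENTRY(timer, example_timer, ACPI_SIG_GTDT, 0, NULL, 0,
			 example_gtdt_probe);

/*
 * Architecture setup code then walks every entry collected in that section
 * and invokes the probe callback of each entry whose table is present:
 */
static void __init example_time_init(void)
{
	int probed = acpi_probe_device_table(timer);

	if (!probed)
		pr_info("no matching ACPI timer table found\n");
}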
// SPDX-License-Identifier:
GPL-2.0-only /* * lib/bitmap.c * Helper functions for bitmap.h. */ #include <linux/bitmap.h> #include <linux/bitops.h> #include <linux/ctype.h> #include <linux/device.h> #include <linux/export.h> #include <linux/slab.h> /** * DOC: bitmap introduction * * bitmaps provide an array of bits, implemented using an * array of unsigned longs. The number of valid bits in a * given bitmap does _not_ need to be an exact multiple of * BITS_PER_LONG. * * The possible unused bits in the last, partially used word * of a bitmap are 'don't care'. The implementation makes * no particular effort to keep them zero. It ensures that * their value will not affect the results of any operation. * The bitmap operations that return Boolean (bitmap_empty, * for example) or scalar (bitmap_weight, for example) results * carefully filter out these unused bits from impacting their * results. * * The byte ordering of bitmaps is more natural on little * endian architectures. See the big-endian headers * include/asm-ppc64/bitops.h and include/asm-s390/bitops.h * for the best explanations of this ordering. */ bool __bitmap_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] != bitmap2[k]) return false; if (bits % BITS_PER_LONG) if ((bitmap1[k] ^ bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits)) return false; return true; } EXPORT_SYMBOL(__bitmap_equal); bool __bitmap_or_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, const unsigned long *bitmap3, unsigned int bits) { unsigned int k, lim = bits / BITS_PER_LONG; unsigned long tmp; for (k = 0; k < lim; ++k) { if ((bitmap1[k] | bitmap2[k]) != bitmap3[k]) return false; } if (!(bits % BITS_PER_LONG)) return true; tmp = (bitmap1[k] | bitmap2[k]) ^ bitmap3[k]; return (tmp & BITMAP_LAST_WORD_MASK(bits)) == 0; } EXPORT_SYMBOL(__bitmap_or_equal); void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int bits) { unsigned int k, lim = BITS_TO_LONGS(bits); for (k = 0; k < lim; ++k) dst[k] = ~src[k]; } EXPORT_SYMBOL(__bitmap_complement); /** * __bitmap_shift_right - logical right shift of the bits in a bitmap * @dst : destination bitmap * @src : source bitmap * @shift : shift by this many bits * @nbits : bitmap size, in bits * * Shifting right (dividing) means moving bits in the MS -> LS bit * direction. Zeros are fed into the vacated MS positions and the * LS bits shifted off the bottom are lost. */ void __bitmap_shift_right(unsigned long *dst, const unsigned long *src, unsigned shift, unsigned nbits) { unsigned k, lim = BITS_TO_LONGS(nbits); unsigned off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG; unsigned long mask = BITMAP_LAST_WORD_MASK(nbits); for (k = 0; off + k < lim; ++k) { unsigned long upper, lower; /* * If shift is not word aligned, take lower rem bits of * word above and make them the top rem bits of result. 
*/ if (!rem || off + k + 1 >= lim) upper = 0; else { upper = src[off + k + 1]; if (off + k + 1 == lim - 1) upper &= mask; upper <<= (BITS_PER_LONG - rem); } lower = src[off + k]; if (off + k == lim - 1) lower &= mask; lower >>= rem; dst[k] = lower | upper; } if (off) memset(&dst[lim - off], 0, off*sizeof(unsigned long)); } EXPORT_SYMBOL(__bitmap_shift_right); /** * __bitmap_shift_left - logical left shift of the bits in a bitmap * @dst : destination bitmap * @src : source bitmap * @shift : shift by this many bits * @nbits : bitmap size, in bits * * Shifting left (multiplying) means moving bits in the LS -> MS * direction. Zeros are fed into the vacated LS bit positions * and those MS bits shifted off the top are lost. */ void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits) { int k; unsigned int lim = BITS_TO_LONGS(nbits); unsigned int off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG; for (k = lim - off - 1; k >= 0; --k) { unsigned long upper, lower; /* * If shift is not word aligned, take upper rem bits of * word below and make them the bottom rem bits of result. */ if (rem && k > 0) lower = src[k - 1] >> (BITS_PER_LONG - rem); else lower = 0; upper = src[k] << rem; dst[k + off] = lower | upper; } if (off) memset(dst, 0, off*sizeof(unsigned long)); } EXPORT_SYMBOL(__bitmap_shift_left); /** * bitmap_cut() - remove bit region from bitmap and right shift remaining bits * @dst: destination bitmap, might overlap with src * @src: source bitmap * @first: start bit of region to be removed * @cut: number of bits to remove * @nbits: bitmap size, in bits * * Set the n-th bit of @dst iff the n-th bit of @src is set and * n is less than @first, or the m-th bit of @src is set for any * m such that @first <= n < nbits, and m = n + @cut. * * In pictures, example for a big-endian 32-bit architecture: * * The @src bitmap is:: * * 31 63 * | | * 10000000 11000001 11110010 00010101 10000000 11000001 01110010 00010101 * | | | | * 16 14 0 32 * * if @cut is 3, and @first is 14, bits 14-16 in @src are cut and @dst is:: * * 31 63 * | | * 10110000 00011000 00110010 00010101 00010000 00011000 00101110 01000010 * | | | * 14 (bit 17 0 32 * from @src) * * Note that @dst and @src might overlap partially or entirely. * * This is implemented in the obvious way, with a shift and carry * step for each moved bit. Optimisation is left as an exercise * for the compiler. 
*/ void bitmap_cut(unsigned long *dst, const unsigned long *src, unsigned int first, unsigned int cut, unsigned int nbits) { unsigned int len = BITS_TO_LONGS(nbits); unsigned long keep = 0, carry; int i; if (first % BITS_PER_LONG) { keep = src[first / BITS_PER_LONG] & (~0UL >> (BITS_PER_LONG - first % BITS_PER_LONG)); } memmove(dst, src, len * sizeof(*dst)); while (cut--) { for (i = first / BITS_PER_LONG; i < len; i++) { if (i < len - 1) carry = dst[i + 1] & 1UL; else carry = 0; dst[i] = (dst[i] >> 1) | (carry << (BITS_PER_LONG - 1)); } } dst[first / BITS_PER_LONG] &= ~0UL << (first % BITS_PER_LONG); dst[first / BITS_PER_LONG] |= keep; } EXPORT_SYMBOL(bitmap_cut); bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k; unsigned int lim = bits/BITS_PER_LONG; unsigned long result = 0; for (k = 0; k < lim; k++) result |= (dst[k] = bitmap1[k] & bitmap2[k]); if (bits % BITS_PER_LONG) result |= (dst[k] = bitmap1[k] & bitmap2[k] & BITMAP_LAST_WORD_MASK(bits)); return result != 0; } EXPORT_SYMBOL(__bitmap_and); void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k; unsigned int nr = BITS_TO_LONGS(bits); for (k = 0; k < nr; k++) dst[k] = bitmap1[k] | bitmap2[k]; } EXPORT_SYMBOL(__bitmap_or); void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k; unsigned int nr = BITS_TO_LONGS(bits); for (k = 0; k < nr; k++) dst[k] = bitmap1[k] ^ bitmap2[k]; } EXPORT_SYMBOL(__bitmap_xor); bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k; unsigned int lim = bits/BITS_PER_LONG; unsigned long result = 0; for (k = 0; k < lim; k++) result |= (dst[k] = bitmap1[k] & ~bitmap2[k]); if (bits % BITS_PER_LONG) result |= (dst[k] = bitmap1[k] & ~bitmap2[k] & BITMAP_LAST_WORD_MASK(bits)); return result != 0; } EXPORT_SYMBOL(__bitmap_andnot); void __bitmap_replace(unsigned long *dst, const unsigned long *old, const unsigned long *new, const unsigned long *mask, unsigned int nbits) { unsigned int k; unsigned int nr = BITS_TO_LONGS(nbits); for (k = 0; k < nr; k++) dst[k] = (old[k] & ~mask[k]) | (new[k] & mask[k]); } EXPORT_SYMBOL(__bitmap_replace); bool __bitmap_intersects(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] & bitmap2[k]) return true; if (bits % BITS_PER_LONG) if ((bitmap1[k] & bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits)) return true; return false; } EXPORT_SYMBOL(__bitmap_intersects); bool __bitmap_subset(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] & ~bitmap2[k]) return false; if (bits % BITS_PER_LONG) if ((bitmap1[k] & ~bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits)) return false; return true; } EXPORT_SYMBOL(__bitmap_subset); #define BITMAP_WEIGHT(FETCH, bits) \ ({ \ unsigned int __bits = (bits), idx, w = 0; \ \ for (idx = 0; idx < __bits / BITS_PER_LONG; idx++) \ w += hweight_long(FETCH); \ \ if (__bits % BITS_PER_LONG) \ w += hweight_long((FETCH) & BITMAP_LAST_WORD_MASK(__bits)); \ \ w; \ }) unsigned int __bitmap_weight(const unsigned long *bitmap, unsigned int bits) { return BITMAP_WEIGHT(bitmap[idx], bits); } EXPORT_SYMBOL(__bitmap_weight); unsigned int 
__bitmap_weight_and(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { return BITMAP_WEIGHT(bitmap1[idx] & bitmap2[idx], bits); } EXPORT_SYMBOL(__bitmap_weight_and); unsigned int __bitmap_weight_andnot(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { return BITMAP_WEIGHT(bitmap1[idx] & ~bitmap2[idx], bits); } EXPORT_SYMBOL(__bitmap_weight_andnot); unsigned int __bitmap_weighted_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { return BITMAP_WEIGHT(({dst[idx] = bitmap1[idx] | bitmap2[idx]; dst[idx]; }), bits); } EXPORT_SYMBOL(__bitmap_weighted_or); unsigned int __bitmap_weighted_xor(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { return BITMAP_WEIGHT(({dst[idx] = bitmap1[idx] ^ bitmap2[idx]; dst[idx]; }), bits); } EXPORT_SYMBOL(__bitmap_weighted_xor); void __bitmap_set(unsigned long *map, unsigned int start, int len) { unsigned long *p = map + BIT_WORD(start); const unsigned int size = start + len; int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start); while (len - bits_to_set >= 0) { *p |= mask_to_set; len -= bits_to_set; bits_to_set = BITS_PER_LONG; mask_to_set = ~0UL; p++; } if (len) { mask_to_set &= BITMAP_LAST_WORD_MASK(size); *p |= mask_to_set; } } EXPORT_SYMBOL(__bitmap_set); void __bitmap_clear(unsigned long *map, unsigned int start, int len) { unsigned long *p = map + BIT_WORD(start); const unsigned int size = start + len; int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); while (len - bits_to_clear >= 0) { *p &= ~mask_to_clear; len -= bits_to_clear; bits_to_clear = BITS_PER_LONG; mask_to_clear = ~0UL; p++; } if (len) { mask_to_clear &= BITMAP_LAST_WORD_MASK(size); *p &= ~mask_to_clear; } } EXPORT_SYMBOL(__bitmap_clear); /** * bitmap_find_next_zero_area_off - find a contiguous aligned zero area * @map: The address to base the search on * @size: The bitmap size in bits * @start: The bitnumber to start searching at * @nr: The number of zeroed bits we're looking for * @align_mask: Alignment mask for zero area * @align_offset: Alignment offset for zero area. * * The @align_mask should be one less than a power of 2; the effect is that * the bit offset of all zero areas this function finds plus @align_offset * is multiple of that power of 2. */ unsigned long bitmap_find_next_zero_area_off(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, unsigned long align_mask, unsigned long align_offset) { unsigned long index, end, i; again: index = find_next_zero_bit(map, size, start); /* Align allocation */ index = __ALIGN_MASK(index + align_offset, align_mask) - align_offset; end = index + nr; if (end > size) return end; i = find_next_bit(map, end, index); if (i < end) { start = i + 1; goto again; } return index; } EXPORT_SYMBOL(bitmap_find_next_zero_area_off); /** * bitmap_pos_to_ord - find ordinal of set bit at given position in bitmap * @buf: pointer to a bitmap * @pos: a bit position in @buf (0 <= @pos < @nbits) * @nbits: number of valid bit positions in @buf * * Map the bit at position @pos in @buf (of length @nbits) to the * ordinal of which set bit it is. If it is not set or if @pos * is not a valid bit position, map to -1. 
* * If for example, just bits 4 through 7 are set in @buf, then @pos * values 4 through 7 will get mapped to 0 through 3, respectively, * and other @pos values will get mapped to -1. When @pos value 7 * gets mapped to (returns) @ord value 3 in this example, that means * that bit 7 is the 3rd (starting with 0th) set bit in @buf. * * The bit positions 0 through @bits are valid positions in @buf. */ static int bitmap_pos_to_ord(const unsigned long *buf, unsigned int pos, unsigned int nbits) { if (pos >= nbits || !test_bit(pos, buf)) return -1; return bitmap_weight(buf, pos); } /** * bitmap_remap - Apply map defined by a pair of bitmaps to another bitmap * @dst: remapped result * @src: subset to be remapped * @old: defines domain of map * @new: defines range of map * @nbits: number of bits in each of these bitmaps * * Let @old and @new define a mapping of bit positions, such that * whatever position is held by the n-th set bit in @old is mapped * to the n-th set bit in @new. In the more general case, allowing * for the possibility that the weight 'w' of @new is less than the * weight of @old, map the position of the n-th set bit in @old to * the position of the m-th set bit in @new, where m == n % w. * * If either of the @old and @new bitmaps are empty, or if @src and * @dst point to the same location, then this routine copies @src * to @dst. * * The positions of unset bits in @old are mapped to themselves * (the identity map). * * Apply the above specified mapping to @src, placing the result in * @dst, clearing any bits previously set in @dst. * * For example, lets say that @old has bits 4 through 7 set, and * @new has bits 12 through 15 set. This defines the mapping of bit * position 4 to 12, 5 to 13, 6 to 14 and 7 to 15, and of all other * bit positions unchanged. So if say @src comes into this routine * with bits 1, 5 and 7 set, then @dst should leave with bits 1, * 13 and 15 set. */ void bitmap_remap(unsigned long *dst, const unsigned long *src, const unsigned long *old, const unsigned long *new, unsigned int nbits) { unsigned int oldbit, w; if (dst == src) /* following doesn't handle inplace remaps */ return; bitmap_zero(dst, nbits); w = bitmap_weight(new, nbits); for_each_set_bit(oldbit, src, nbits) { int n = bitmap_pos_to_ord(old, oldbit, nbits); if (n < 0 || w == 0) set_bit(oldbit, dst); /* identity map */ else set_bit(find_nth_bit(new, nbits, n % w), dst); } } EXPORT_SYMBOL(bitmap_remap); /** * bitmap_bitremap - Apply map defined by a pair of bitmaps to a single bit * @oldbit: bit position to be mapped * @old: defines domain of map * @new: defines range of map * @bits: number of bits in each of these bitmaps * * Let @old and @new define a mapping of bit positions, such that * whatever position is held by the n-th set bit in @old is mapped * to the n-th set bit in @new. In the more general case, allowing * for the possibility that the weight 'w' of @new is less than the * weight of @old, map the position of the n-th set bit in @old to * the position of the m-th set bit in @new, where m == n % w. * * The positions of unset bits in @old are mapped to themselves * (the identity map). * * Apply the above specified mapping to bit position @oldbit, returning * the new bit position. * * For example, lets say that @old has bits 4 through 7 set, and * @new has bits 12 through 15 set. This defines the mapping of bit * position 4 to 12, 5 to 13, 6 to 14 and 7 to 15, and of all other * bit positions unchanged. So if say @oldbit is 5, then this routine * returns 13. 
*/ int bitmap_bitremap(int oldbit, const unsigned long *old, const unsigned long *new, int bits) { int w = bitmap_weight(new, bits); int n = bitmap_pos_to_ord(old, oldbit, bits); if (n < 0 || w == 0) return oldbit; else return find_nth_bit(new, bits, n % w); } EXPORT_SYMBOL(bitmap_bitremap); #ifdef CONFIG_NUMA /** * bitmap_onto - translate one bitmap relative to another * @dst: resulting translated bitmap * @orig: original untranslated bitmap * @relmap: bitmap relative to which translated * @bits: number of bits in each of these bitmaps * * Set the n-th bit of @dst iff there exists some m such that the * n-th bit of @relmap is set, the m-th bit of @orig is set, and * the n-th bit of @relmap is also the m-th _set_ bit of @relmap. * (If you understood the previous sentence the first time you * read it, you're overqualified for your current job.) * * In other words, @orig is mapped onto (surjectively) @dst, * using the map { <n, m> | the n-th bit of @relmap is the * m-th set bit of @relmap }. * * Any set bits in @orig above bit number W, where W is the * weight of (number of set bits in) @relmap are mapped nowhere. * In particular, if for all bits m set in @orig, m >= W, then * @dst will end up empty. In situations where the possibility * of such an empty result is not desired, one way to avoid it is * to use the bitmap_fold() operator, below, to first fold the * @orig bitmap over itself so that all its set bits x are in the * range 0 <= x < W. The bitmap_fold() operator does this by * setting the bit (m % W) in @dst, for each bit (m) set in @orig. * * Example [1] for bitmap_onto(): * Let's say @relmap has bits 30-39 set, and @orig has bits * 1, 3, 5, 7, 9 and 11 set. Then on return from this routine, * @dst will have bits 31, 33, 35, 37 and 39 set. * * When bit 0 is set in @orig, it means turn on the bit in * @dst corresponding to whatever is the first bit (if any) * that is turned on in @relmap. Since bit 0 was off in the * above example, we leave off that bit (bit 30) in @dst. * * When bit 1 is set in @orig (as in the above example), it * means turn on the bit in @dst corresponding to whatever * is the second bit that is turned on in @relmap. The second * bit in @relmap that was turned on in the above example was * bit 31, so we turned on bit 31 in @dst. * * Similarly, we turned on bits 33, 35, 37 and 39 in @dst, * because they were the 4th, 6th, 8th and 10th set bits * set in @relmap, and the 4th, 6th, 8th and 10th bits of * @orig (i.e. bits 3, 5, 7 and 9) were also set. * * When bit 11 is set in @orig, it means turn on the bit in * @dst corresponding to whatever is the twelfth bit that is * turned on in @relmap. In the above example, there were * only ten bits turned on in @relmap (30..39), so the fact that bit * 11 was set in @orig had no effect on @dst. * * Example [2] for bitmap_fold() + bitmap_onto(): * Let's say @relmap has these ten bits set:: * * 40 41 42 43 45 48 53 61 74 95 * * (for the curious, that's 40 plus the first ten terms of the * Fibonacci sequence.) * * Further let's say we use the following code, invoking * bitmap_fold() then bitmap_onto(), as suggested above to * avoid the possibility of an empty @dst result:: * * unsigned long *tmp; // a temporary bitmap's bits * * bitmap_fold(tmp, orig, bitmap_weight(relmap, bits), bits); * bitmap_onto(dst, tmp, relmap, bits); * * Then this table shows what various values of @dst would be, for * various @orig's. I list the zero-based positions of each set bit.
* The tmp column shows the intermediate result, as computed by * using bitmap_fold() to fold the @orig bitmap modulo ten * (the weight of @relmap): * * =============== ============== ================= * @orig tmp @dst * 0 0 40 * 1 1 41 * 9 9 95 * 10 0 40 [#f1]_ * 1 3 5 7 1 3 5 7 41 43 48 61 * 0 1 2 3 4 0 1 2 3 4 40 41 42 43 45 * 0 9 18 27 0 9 8 7 40 61 74 95 * 0 10 20 30 0 40 * 0 11 22 33 0 1 2 3 40 41 42 43 * 0 12 24 36 0 2 4 6 40 42 45 53 * 78 102 211 1 2 8 41 42 74 [#f1]_ * =============== ============== ================= * * .. [#f1] * * For these marked lines, if we hadn't first done bitmap_fold() * into tmp, then the @dst result would have been empty. * * If either of @orig or @relmap is empty (no set bits), then @dst * will be returned empty. * * If (as explained above) the only set bits in @orig are in positions * m where m >= W, (where W is the weight of @relmap) then @dst will * once again be returned empty. * * All bits in @dst not set by the above rule are cleared. */ void bitmap_onto(unsigned long *dst, const unsigned long *orig, const unsigned long *relmap, unsigned int bits) { unsigned int n, m; /* same meaning as in above comment */ if (dst == orig) /* following doesn't handle inplace mappings */ return; bitmap_zero(dst, bits); /* * The following code is a more efficient, but less * obvious, equivalent to the loop: * for (m = 0; m < bitmap_weight(relmap, bits); m++) { * n = find_nth_bit(orig, bits, m); * if (test_bit(m, orig)) * set_bit(n, dst); * } */ m = 0; for_each_set_bit(n, relmap, bits) { /* m == bitmap_pos_to_ord(relmap, n, bits) */ if (test_bit(m, orig)) set_bit(n, dst); m++; } } /** * bitmap_fold - fold larger bitmap into smaller, modulo specified size * @dst: resulting smaller bitmap * @orig: original larger bitmap * @sz: specified size * @nbits: number of bits in each of these bitmaps * * For each bit oldbit in @orig, set bit oldbit mod @sz in @dst. * Clear all other bits in @dst. See further the comment and * Example [2] for bitmap_onto() for why and how to use this. 
*/ void bitmap_fold(unsigned long *dst, const unsigned long *orig, unsigned int sz, unsigned int nbits) { unsigned int oldbit; if (dst == orig) /* following doesn't handle inplace mappings */ return; bitmap_zero(dst, nbits); for_each_set_bit(oldbit, orig, nbits) set_bit(oldbit % sz, dst); } #endif /* CONFIG_NUMA */ unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags) { return kmalloc_array(BITS_TO_LONGS(nbits), sizeof(unsigned long), flags); } EXPORT_SYMBOL(bitmap_alloc); unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags) { return bitmap_alloc(nbits, flags | __GFP_ZERO); } EXPORT_SYMBOL(bitmap_zalloc); unsigned long *bitmap_alloc_node(unsigned int nbits, gfp_t flags, int node) { return kmalloc_array_node(BITS_TO_LONGS(nbits), sizeof(unsigned long), flags, node); } EXPORT_SYMBOL(bitmap_alloc_node); unsigned long *bitmap_zalloc_node(unsigned int nbits, gfp_t flags, int node) { return bitmap_alloc_node(nbits, flags | __GFP_ZERO, node); } EXPORT_SYMBOL(bitmap_zalloc_node); void bitmap_free(const unsigned long *bitmap) { kfree(bitmap); } EXPORT_SYMBOL(bitmap_free); static void devm_bitmap_free(void *data) { unsigned long *bitmap = data; bitmap_free(bitmap); } unsigned long *devm_bitmap_alloc(struct device *dev, unsigned int nbits, gfp_t flags) { unsigned long *bitmap; int ret; bitmap = bitmap_alloc(nbits, flags); if (!bitmap) return NULL; ret = devm_add_action_or_reset(dev, devm_bitmap_free, bitmap); if (ret) return NULL; return bitmap; } EXPORT_SYMBOL_GPL(devm_bitmap_alloc); unsigned long *devm_bitmap_zalloc(struct device *dev, unsigned int nbits, gfp_t flags) { return devm_bitmap_alloc(dev, nbits, flags | __GFP_ZERO); } EXPORT_SYMBOL_GPL(devm_bitmap_zalloc); #if BITS_PER_LONG == 64 /** * bitmap_from_arr32 - copy the contents of u32 array of bits to bitmap * @bitmap: array of unsigned longs, the destination bitmap * @buf: array of u32 (in host byte order), the source bitmap * @nbits: number of bits in @bitmap */ void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, unsigned int nbits) { unsigned int i, halfwords; halfwords = DIV_ROUND_UP(nbits, 32); for (i = 0; i < halfwords; i++) { bitmap[i/2] = (unsigned long) buf[i]; if (++i < halfwords) bitmap[i/2] |= ((unsigned long) buf[i]) << 32; } /* Clear tail bits in last word beyond nbits. */ if (nbits % BITS_PER_LONG) bitmap[(halfwords - 1) / 2] &= BITMAP_LAST_WORD_MASK(nbits); } EXPORT_SYMBOL(bitmap_from_arr32); /** * bitmap_to_arr32 - copy the contents of bitmap to a u32 array of bits * @buf: array of u32 (in host byte order), the dest bitmap * @bitmap: array of unsigned longs, the source bitmap * @nbits: number of bits in @bitmap */ void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap, unsigned int nbits) { unsigned int i, halfwords; halfwords = DIV_ROUND_UP(nbits, 32); for (i = 0; i < halfwords; i++) { buf[i] = (u32) (bitmap[i/2] & UINT_MAX); if (++i < halfwords) buf[i] = (u32) (bitmap[i/2] >> 32); } /* Clear tail bits in last element of array beyond nbits. 
*/ if (nbits % BITS_PER_LONG) buf[halfwords - 1] &= (u32) (UINT_MAX >> ((-nbits) & 31)); } EXPORT_SYMBOL(bitmap_to_arr32); #endif #if BITS_PER_LONG == 32 /** * bitmap_from_arr64 - copy the contents of u64 array of bits to bitmap * @bitmap: array of unsigned longs, the destination bitmap * @buf: array of u64 (in host byte order), the source bitmap * @nbits: number of bits in @bitmap */ void bitmap_from_arr64(unsigned long *bitmap, const u64 *buf, unsigned int nbits) { int n; for (n = nbits; n > 0; n -= 64) { u64 val = *buf++; *bitmap++ = val; if (n > 32) *bitmap++ = val >> 32; } /* * Clear tail bits in the last word beyond nbits. * * Negative index is OK because here we point to the word next * to the last word of the bitmap, except for nbits == 0, which * is tested implicitly. */ if (nbits % BITS_PER_LONG) bitmap[-1] &= BITMAP_LAST_WORD_MASK(nbits); } EXPORT_SYMBOL(bitmap_from_arr64); /** * bitmap_to_arr64 - copy the contents of bitmap to a u64 array of bits * @buf: array of u64 (in host byte order), the dest bitmap * @bitmap: array of unsigned longs, the source bitmap * @nbits: number of bits in @bitmap */ void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits) { const unsigned long *end = bitmap + BITS_TO_LONGS(nbits); while (bitmap < end) { *buf = *bitmap++; if (bitmap < end) *buf |= (u64)(*bitmap++) << 32; buf++; } /* Clear tail bits in the last element of array beyond nbits. */ if (nbits % 64) buf[-1] &= GENMASK_ULL((nbits - 1) % 64, 0); } EXPORT_SYMBOL(bitmap_to_arr64); #endif
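/*
 * Standalone userspace sketch (not kernel code): it reproduces, on a single
 * 64-bit word, the remap rule documented above for bitmap_remap() and
 * bitmap_bitremap() - the n-th set bit of @old maps to the (n % w)-th set
 * bit of @new, everything else maps to itself. All helper names below are
 * local to this example and only illustrate the documented semantics.
 */
#include <stdio.h>
#include <stdint.h>

/* ordinal of the set bit at position pos, or -1 if that bit is clear */
static int pos_to_ord(uint64_t map, unsigned int pos)
{
	if (!(map & (1ULL << pos)))
		return -1;
	return __builtin_popcountll(map & ((1ULL << pos) - 1));
}

/* position of the n-th (0-based) set bit, or -1 if there is none */
static int find_nth(uint64_t map, int n)
{
	for (int pos = 0; pos < 64; pos++)
		if ((map & (1ULL << pos)) && n-- == 0)
			return pos;
	return -1;
}

static int bitremap(int oldbit, uint64_t old, uint64_t new)
{
	int w = __builtin_popcountll(new);
	int n = pos_to_ord(old, oldbit);

	if (n < 0 || w == 0)
		return oldbit;	/* identity map */
	return find_nth(new, n % w);
}

int main(void)
{
	uint64_t old = 0xfULL << 4;	/* bits 4..7 set */
	uint64_t new = 0xfULL << 12;	/* bits 12..15 set */

	/* Matches the kernel-doc example: 5 -> 13, 7 -> 15, 1 -> 1 (identity). */
	printf("5 -> %d\n", bitremap(5, old, new));
	printf("7 -> %d\n", bitremap(7, old, new));
	printf("1 -> %d\n", bitremap(1, old, new));
	return 0;
}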
// SPDX-License-Identifier: GPL-2.0 /* * Workingset detection * * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner */ #include <linux/memcontrol.h> #include <linux/mm_inline.h> #include <linux/writeback.h> #include <linux/shmem_fs.h> #include <linux/pagemap.h>
#include <linux/atomic.h> #include <linux/module.h> #include <linux/swap.h> #include <linux/dax.h> #include <linux/fs.h> #include <linux/mm.h> #include "swap_table.h" #include "internal.h" /* * Double CLOCK lists * * Per node, two clock lists are maintained for file pages: the * inactive and the active list. Freshly faulted pages start out at * the head of the inactive list and page reclaim scans pages from the * tail. Pages that are accessed multiple times on the inactive list * are promoted to the active list, to protect them from reclaim, * whereas active pages are demoted to the inactive list when the * active list grows too big. * * fault ------------------------+ * | * +--------------+ | +-------------+ * reclaim <- | inactive | <-+-- demotion | active | <--+ * +--------------+ +-------------+ | * | | * +-------------- promotion ------------------+ * * * Access frequency and refault distance * * A workload is thrashing when its pages are frequently used but they * are evicted from the inactive list every time before another access * would have promoted them to the active list. * * In cases where the average access distance between thrashing pages * is bigger than the size of memory there is nothing that can be * done - the thrashing set could never fit into memory under any * circumstance. * * However, the average access distance could be bigger than the * inactive list, yet smaller than the size of memory. In this case, * the set could fit into memory if it weren't for the currently * active pages - which may be used more, hopefully less frequently: * * +-memory available to cache-+ * | | * +-inactive------+-active----+ * a b | c d e f g h i | J K L M N | * +---------------+-----------+ * * It is prohibitively expensive to accurately track access frequency * of pages. But a reasonable approximation can be made to measure * thrashing on the inactive list, after which refaulting pages can be * activated optimistically to compete with the existing active pages. * * Approximating inactive page access frequency - Observations: * * 1. When a page is accessed for the first time, it is added to the * head of the inactive list, slides every existing inactive page * towards the tail by one slot, and pushes the current tail page * out of memory. * * 2. When a page is accessed for the second time, it is promoted to * the active list, shrinking the inactive list by one slot. This * also slides all inactive pages that were faulted into the cache * more recently than the activated page towards the tail of the * inactive list. * * Thus: * * 1. The sum of evictions and activations between any two points in * time indicate the minimum number of inactive pages accessed in * between. * * 2. Moving one inactive page N page slots towards the tail of the * list requires at least N inactive page accesses. * * Combining these: * * 1. When a page is finally evicted from memory, the number of * inactive pages accessed while the page was in cache is at least * the number of page slots on the inactive list. * * 2. In addition, measuring the sum of evictions and activations (E) * at the time of a page's eviction, and comparing it to another * reading (R) at the time the page faults back into memory tells * the minimum number of accesses while the page was not cached. * This is called the refault distance. 
* * Because the first access of the page was the fault and the second * access the refault, we combine the in-cache distance with the * out-of-cache distance to get the complete minimum access distance * of this page: * * NR_inactive + (R - E) * * And knowing the minimum access distance of a page, we can easily * tell if the page would be able to stay in cache assuming all page * slots in the cache were available: * * NR_inactive + (R - E) <= NR_inactive + NR_active * * If we have swap we should consider about NR_inactive_anon and * NR_active_anon, so for page cache and anonymous respectively: * * NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file * + NR_inactive_anon + NR_active_anon * * NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon * + NR_inactive_file + NR_active_file * * Which can be further simplified to: * * (R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon * * (R - E) <= NR_active_anon + NR_inactive_file + NR_active_file * * Put into words, the refault distance (out-of-cache) can be seen as * a deficit in inactive list space (in-cache). If the inactive list * had (R - E) more page slots, the page would not have been evicted * in between accesses, but activated instead. And on a full system, * the only thing eating into inactive list space is active pages. * * * Refaulting inactive pages * * All that is known about the active list is that the pages have been * accessed more than once in the past. This means that at any given * time there is actually a good chance that pages on the active list * are no longer in active use. * * So when a refault distance of (R - E) is observed and there are at * least (R - E) pages in the userspace workingset, the refaulting page * is activated optimistically in the hope that (R - E) pages are actually * used less frequently than the refaulting page - or even not used at * all anymore. * * That means if inactive cache is refaulting with a suitable refault * distance, we assume the cache workingset is transitioning and put * pressure on the current workingset. * * If this is wrong and demotion kicks in, the pages which are truly * used more frequently will be reactivated while the less frequently * used once will be evicted from memory. * * But if this is right, the stale pages will be pushed out of memory * and the used pages get to stay in cache. * * Refaulting active pages * * If on the other hand the refaulting pages have recently been * deactivated, it means that the active list is no longer protecting * actively used cache from reclaim. The cache is NOT transitioning to * a different workingset; the existing workingset is thrashing in the * space allocated to the page cache. * * * Implementation * * For each node's LRU lists, a counter for inactive evictions and * activations is maintained (node->nonresident_age). * * On eviction, a snapshot of this counter (along with some bits to * identify the node) is stored in the now empty page cache * slot of the evicted page. This is called a shadow entry. * * On cache misses for which there are shadow entries, an eligible * refault distance will immediately activate the refaulting page. 
*/ #define WORKINGSET_SHIFT 1 #define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \ WORKINGSET_SHIFT + NODES_SHIFT + \ MEM_CGROUP_ID_SHIFT) #define EVICTION_SHIFT_ANON (EVICTION_SHIFT + SWAP_COUNT_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) #define EVICTION_MASK_ANON (~0UL >> EVICTION_SHIFT_ANON) /* * Eviction timestamps need to be able to cover the full range of * actionable refaults. However, bits are tight in the xarray * entry, and after storing the identifier for the lruvec there might * not be enough left to represent every single actionable refault. In * that case, we have to sacrifice granularity for distance, and group * evictions into coarser buckets by shaving off lower timestamp bits. */ static unsigned int bucket_order[ANON_AND_FILE] __read_mostly; static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, bool workingset, bool file) { eviction &= file ? EVICTION_MASK : EVICTION_MASK_ANON; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; eviction = (eviction << WORKINGSET_SHIFT) | workingset; return xa_mk_value(eviction); } static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, unsigned long *evictionp, bool *workingsetp) { unsigned long entry = xa_to_value(shadow); int memcgid, nid; bool workingset; workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1); entry >>= WORKINGSET_SHIFT; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); entry >>= MEM_CGROUP_ID_SHIFT; *memcgidp = memcgid; *pgdat = NODE_DATA(nid); *evictionp = entry; *workingsetp = workingset; } #ifdef CONFIG_LRU_GEN static void *lru_gen_eviction(struct folio *folio) { int hist; unsigned long token; unsigned long min_seq; struct lruvec *lruvec; struct lru_gen_folio *lrugen; int type = folio_is_file_lru(folio); int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); bool workingset = folio_test_workingset(folio); int tier = lru_tier_from_refs(refs, workingset); struct mem_cgroup *memcg; struct pglist_data *pgdat = folio_pgdat(folio); unsigned short memcg_id; BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - max(EVICTION_SHIFT, EVICTION_SHIFT_ANON)); rcu_read_lock(); memcg = folio_memcg(folio); lruvec = mem_cgroup_lruvec(memcg, pgdat); lrugen = &lruvec->lrugen; min_seq = READ_ONCE(lrugen->min_seq[type]); token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0); hist = lru_hist_from_seq(min_seq); atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); memcg_id = mem_cgroup_private_id(memcg); rcu_read_unlock(); return pack_shadow(memcg_id, pgdat, token, workingset, type); } /* * Tests if the shadow entry is for a folio that was recently evicted. * Fills in @lruvec, @token, @workingset with the values unpacked from shadow. */ static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, unsigned long *token, bool *workingset, bool file) { int memcg_id; unsigned long max_seq; struct mem_cgroup *memcg; struct pglist_data *pgdat; unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset); memcg = mem_cgroup_from_private_id(memcg_id); *lruvec = mem_cgroup_lruvec(memcg, pgdat); max_seq = READ_ONCE((*lruvec)->lrugen.max_seq); max_seq &= (file ? 
EVICTION_MASK : EVICTION_MASK_ANON) >> LRU_REFS_WIDTH; return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS; } static void lru_gen_refault(struct folio *folio, void *shadow) { bool recent; int hist, tier, refs; bool workingset; unsigned long token; struct lruvec *lruvec; struct lru_gen_folio *lrugen; int type = folio_is_file_lru(folio); int delta = folio_nr_pages(folio); rcu_read_lock(); recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset, type); if (lruvec != folio_lruvec(folio)) goto unlock; mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta); if (!recent) goto unlock; lrugen = &lruvec->lrugen; hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type])); refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1; tier = lru_tier_from_refs(refs, workingset); atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); /* see folio_add_lru() where folio_set_active() will be called */ if (lru_gen_in_fault()) mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); if (workingset) { folio_set_workingset(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); } else set_mask_bits(&folio->flags.f, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF); unlock: rcu_read_unlock(); } #else /* !CONFIG_LRU_GEN */ static void *lru_gen_eviction(struct folio *folio) { return NULL; } static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, unsigned long *token, bool *workingset, bool file) { return false; } static void lru_gen_refault(struct folio *folio, void *shadow) { } #endif /* CONFIG_LRU_GEN */ /** * workingset_age_nonresident - age non-resident entries as LRU ages * @lruvec: the lruvec that was aged * @nr_pages: the number of pages to count * * As in-memory pages are aged, non-resident pages need to be aged as * well, in order for the refault distances later on to be comparable * to the in-memory dimensions. This function allows reclaim and LRU * operations to drive the non-resident aging along in parallel. */ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages) { /* * Reclaiming a cgroup means reclaiming all its children in a * round-robin fashion. That means that each cgroup has an LRU * order that is composed of the LRU orders of its child * cgroups; and every page has an LRU position not just in the * cgroup that owns it, but in all of that group's ancestors. * * So when the physical inactive list of a leaf cgroup ages, * the virtual inactive lists of all its parents, including * the root cgroup's, age as well. */ do { atomic_long_add(nr_pages, &lruvec->nonresident_age); } while ((lruvec = parent_lruvec(lruvec))); } /** * workingset_eviction - note the eviction of a folio from memory * @target_memcg: the cgroup that is causing the reclaim * @folio: the folio being evicted * * Return: a shadow entry to be stored in @folio->mapping->i_pages in place * of the evicted @folio so that a later refault can be detected. 
*/ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) { struct pglist_data *pgdat = folio_pgdat(folio); int file = folio_is_file_lru(folio); unsigned long eviction; struct lruvec *lruvec; int memcgid; /* Folio is fully exclusive and pins folio's memory cgroup pointer */ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (lru_gen_enabled()) return lru_gen_eviction(folio); lruvec = mem_cgroup_lruvec(target_memcg, pgdat); /* XXX: target_memcg can be NULL, go through lruvec */ memcgid = mem_cgroup_private_id(lruvec_memcg(lruvec)); eviction = atomic_long_read(&lruvec->nonresident_age); eviction >>= bucket_order[file]; workingset_age_nonresident(lruvec, folio_nr_pages(folio)); return pack_shadow(memcgid, pgdat, eviction, folio_test_workingset(folio), file); } /** * workingset_test_recent - tests if the shadow entry is for a folio that was * recently evicted. Also fills in @workingset with the value unpacked from * shadow. * @shadow: the shadow entry to be tested. * @file: whether the corresponding folio is from the file lru. * @workingset: where the workingset value unpacked from shadow should * be stored. * @flush: whether to flush cgroup rstat. * * Return: true if the shadow is for a recently evicted folio; false otherwise. */ bool workingset_test_recent(void *shadow, bool file, bool *workingset, bool flush) { struct mem_cgroup *eviction_memcg; struct lruvec *eviction_lruvec; unsigned long refault_distance; unsigned long workingset_size; unsigned long refault; int memcgid; struct pglist_data *pgdat; unsigned long eviction; if (lru_gen_enabled()) { bool recent; rcu_read_lock(); recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction, workingset, file); rcu_read_unlock(); return recent; } rcu_read_lock(); unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset); eviction <<= bucket_order[file]; /* * Look up the memcg associated with the stored ID. It might * have been deleted since the folio's eviction. * * Note that in rare events the ID could have been recycled * for a new cgroup that refaults a shared folio. This is * impossible to tell from the available data. However, this * should be a rare and limited disturbance, and activations * are always speculative anyway. Ultimately, it's the aging * algorithm's job to shake out the minimum access frequency * for the active cache. * * XXX: On !CONFIG_MEMCG, this will always return NULL; it * would be better if the root_mem_cgroup existed in all * configurations instead. */ eviction_memcg = mem_cgroup_from_private_id(memcgid); if (!mem_cgroup_tryget(eviction_memcg)) eviction_memcg = NULL; rcu_read_unlock(); if (!mem_cgroup_disabled() && !eviction_memcg) return false; /* * Flush stats (and potentially sleep) outside the RCU read section. * * Note that workingset_test_recent() itself might be called in RCU read * section (for e.g, in cachestat) - these callers need to skip flushing * stats (via the flush argument). * * XXX: With per-memcg flushing and thresholding, is ratelimiting * still needed here? */ if (flush) mem_cgroup_flush_stats_ratelimited(eviction_memcg); eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); refault = atomic_long_read(&eviction_lruvec->nonresident_age); /* * Calculate the refault distance * * The unsigned subtraction here gives an accurate distance * across nonresident_age overflows in most cases. 
There is a * special case: usually, shadow entries have a short lifetime * and are either refaulted or reclaimed along with the inode * before they get too old. But it is not impossible for the * nonresident_age to lap a shadow entry in the field, which * can then result in a false small refault distance, leading * to a false activation should this old entry actually * refault again. However, earlier kernels used to deactivate * unconditionally with *every* reclaim invocation for the * longest time, so the occasional inappropriate activation * leading to pressure on the active list is not a problem. */ refault_distance = ((refault - eviction) & (file ? EVICTION_MASK : EVICTION_MASK_ANON)); /* * Compare the distance to the existing workingset size. We * don't activate pages that couldn't stay resident even if * all the memory was available to the workingset. Whether * workingset competition needs to consider anon or not depends * on having free swap space. */ workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE); if (!file) { workingset_size += lruvec_page_state(eviction_lruvec, NR_INACTIVE_FILE); } if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) { workingset_size += lruvec_page_state(eviction_lruvec, NR_ACTIVE_ANON); if (file) { workingset_size += lruvec_page_state(eviction_lruvec, NR_INACTIVE_ANON); } } mem_cgroup_put(eviction_memcg); return refault_distance <= workingset_size; } /** * workingset_refault - Evaluate the refault of a previously evicted folio. * @folio: The freshly allocated replacement folio. * @shadow: Shadow entry of the evicted folio. * * Calculates and evaluates the refault distance of the previously * evicted folio in the context of the node and the memcg whose memory * pressure caused the eviction. */ void workingset_refault(struct folio *folio, void *shadow) { bool file = folio_is_file_lru(folio); struct mem_cgroup *memcg; struct lruvec *lruvec; bool workingset; long nr; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (lru_gen_enabled()) { lru_gen_refault(folio, shadow); return; } /* * The activation decision for this folio is made at the level * where the eviction occurred, as that is where the LRU order * during folio reclaim is being determined. * * However, the cgroup that will own the folio is the one that * is actually experiencing the refault event. Make sure the folio is * locked to guarantee folio_memcg() stability throughout. */ nr = folio_nr_pages(folio); memcg = get_mem_cgroup_from_folio(folio); lruvec = mem_cgroup_lruvec(memcg, folio_pgdat(folio)); mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); if (!workingset_test_recent(shadow, file, &workingset, true)) goto out; folio_set_active(folio); workingset_age_nonresident(lruvec, nr); mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr); /* Folio was active prior to eviction */ if (workingset) { folio_set_workingset(folio); /* * XXX: Move to folio_add_lru() when it supports new vs * putback */ lru_note_cost_refault(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr); } out: mem_cgroup_put(memcg); } /** * workingset_activation - note a page activation * @folio: Folio that is being activated. */ void workingset_activation(struct folio *folio) { /* * Filter non-memcg pages here, e.g. unmap can call * mark_page_accessed() on VDSO pages. 
*/ if (mem_cgroup_disabled() || folio_memcg_charged(folio)) { rcu_read_lock(); workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio)); rcu_read_unlock(); } } /* * Shadow entries reflect the share of the working set that does not * fit into memory, so their number depends on the access pattern of * the workload. In most cases, they will refault or get reclaimed * along with the inode, but a (malicious) workload that streams * through files with a total size several times that of available * memory, while preventing the inodes from being reclaimed, can * create excessive amounts of shadow nodes. To keep a lid on this, * track shadow nodes and reclaim them when they grow way past the * point where they would still be useful. */ struct list_lru shadow_nodes; void workingset_update_node(struct xa_node *node) { struct page *page = virt_to_page(node); /* * Track non-empty nodes that contain only shadow entries; * unlink those that contain pages or are being freed. * * Avoid acquiring the list_lru lock when the nodes are * already where they should be. The list_empty() test is safe * as node->private_list is protected by the i_pages lock. */ lockdep_assert_held(&node->array->xa_lock); if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { list_lru_add_obj(&shadow_nodes, &node->private_list); __inc_node_page_state(page, WORKINGSET_NODES); } } else { if (!list_empty(&node->private_list)) { list_lru_del_obj(&shadow_nodes, &node->private_list); __dec_node_page_state(page, WORKINGSET_NODES); } } } static unsigned long count_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) { unsigned long max_nodes; unsigned long nodes; unsigned long pages; nodes = list_lru_shrink_count(&shadow_nodes, sc); if (!nodes) return SHRINK_EMPTY; /* * Approximate a reasonable limit for the nodes * containing shadow entries. We don't need to keep more * shadow entries than possible pages on the active list, * since refault distances bigger than that are dismissed. * * The size of the active list converges toward 100% of * overall page cache as memory grows, with only a tiny * inactive list. Assume the total cache size for that. * * Nodes might be sparsely populated, with only one shadow * entry in the extreme case. Obviously, we cannot keep one * node for every eligible shadow entry, so compromise on a * worst-case density of 1/8th. Below that, not all eligible * refaults can be detected anymore. 
* * On 64-bit with 7 xa_nodes per page and 64 slots * each, this will reclaim shadow entries when they consume * ~1.8% of available memory: * * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE */ #ifdef CONFIG_MEMCG if (sc->memcg) { struct lruvec *lruvec; int i; mem_cgroup_flush_stats_ratelimited(sc->memcg); lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) pages += lruvec_lru_size(lruvec, i, MAX_NR_ZONES - 1); pages += lruvec_page_state_local( lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT; pages += lruvec_page_state_local( lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT; } else #endif pages = node_present_pages(sc->nid); max_nodes = pages >> (XA_CHUNK_SHIFT - 3); if (nodes <= max_nodes) return 0; return nodes - max_nodes; } static enum lru_status shadow_lru_isolate(struct list_head *item, struct list_lru_one *lru, void *arg) __must_hold(lru->lock) { struct xa_node *node = container_of(item, struct xa_node, private_list); struct address_space *mapping; int ret; /* * Page cache insertions and deletions synchronously maintain * the shadow node LRU under the i_pages lock and the * &lru->lock. Because the page cache tree is emptied before * the inode can be destroyed, holding the &lru->lock pins any * address_space that has nodes on the LRU. * * We can then safely transition to the i_pages lock to * pin only the address_space of the particular node we want * to reclaim, take the node off-LRU, and drop the &lru->lock. */ mapping = container_of(node->array, struct address_space, i_pages); /* Coming from the list, invert the lock order */ if (!xa_trylock(&mapping->i_pages)) { spin_unlock_irq(&lru->lock); ret = LRU_RETRY; goto out; } /* For page cache we need to hold i_lock */ if (mapping->host != NULL) { if (!spin_trylock(&mapping->host->i_lock)) { xa_unlock(&mapping->i_pages); spin_unlock_irq(&lru->lock); ret = LRU_RETRY; goto out; } } list_lru_isolate(lru, item); __dec_node_page_state(virt_to_page(node), WORKINGSET_NODES); spin_unlock(&lru->lock); /* * The nodes should only contain one or more shadow entries, * no pages, so we expect to be able to remove them all and * delete and free the empty node afterwards. */ if (WARN_ON_ONCE(!node->nr_values)) goto out_invalid; if (WARN_ON_ONCE(node->count != node->nr_values)) goto out_invalid; xa_delete_node(node, workingset_update_node); mod_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM, 1); out_invalid: xa_unlock_irq(&mapping->i_pages); if (mapping->host != NULL) { if (mapping_shrinkable(mapping)) inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); } ret = LRU_REMOVED_RETRY; out: cond_resched(); return ret; } static unsigned long scan_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) { /* list_lru lock nests inside the IRQ-safe i_pages lock */ return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate, NULL); } /* * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe * i_pages lock. */ static struct lock_class_key shadow_nodes_key; static int __init workingset_init(void) { unsigned int timestamp_bits, timestamp_bits_anon; struct shrinker *workingset_shadow_shrinker; unsigned int max_order; int ret = -ENOMEM; BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT); /* * Calculate the eviction bucket size to cover the longest * actionable refault distance, which is currently half of * memory (totalram_pages/2). 
However, memory hotplug may add * some more pages at runtime, so keep working with up to * double the initial memory by using totalram_pages as-is. */ timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; timestamp_bits_anon = BITS_PER_LONG - EVICTION_SHIFT_ANON; max_order = fls_long(totalram_pages() - 1); if (max_order > (BITS_PER_LONG - EVICTION_SHIFT)) bucket_order[WORKINGSET_FILE] = max_order - timestamp_bits; if (max_order > timestamp_bits_anon) bucket_order[WORKINGSET_ANON] = max_order - timestamp_bits_anon; pr_info("workingset: timestamp_bits=%d (anon: %d) max_order=%d bucket_order=%u (anon: %d)\n", timestamp_bits, timestamp_bits_anon, max_order, bucket_order[WORKINGSET_FILE], bucket_order[WORKINGSET_ANON]); workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-shadow"); if (!workingset_shadow_shrinker) goto err; ret = list_lru_init_memcg_key(&shadow_nodes, workingset_shadow_shrinker, &shadow_nodes_key); if (ret) goto err_list_lru; workingset_shadow_shrinker->count_objects = count_shadow_nodes; workingset_shadow_shrinker->scan_objects = scan_shadow_nodes; /* ->count reports only fully expendable nodes */ workingset_shadow_shrinker->seeks = 0; shrinker_register(workingset_shadow_shrinker); return 0; err_list_lru: shrinker_free(workingset_shadow_shrinker); err: return ret; } module_init(workingset_init);
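The arithmetic in count_shadow_nodes() above is easier to check with concrete numbers. The following is a minimal userspace sketch, not kernel code: XA_CHUNK_SHIFT = 6 (64 slots per node), 4 KiB pages, and the "roughly 7 xa_nodes per page" figure are assumptions taken from the comment in that function (PAGE_SZ and NODES_PER_PAGE are illustrative names, not kernel symbols). It reproduces the pages >> (XA_CHUNK_SHIFT - 3) cap and shows that, at the assumed worst-case 1/8 node density, the retained shadow nodes stay near 1.8% of the memory they cover.

#include <stdio.h>

#define XA_CHUNK_SHIFT	6		/* 64 slots per xa_node, per the comment above */
#define PAGE_SZ		4096UL		/* assumed 4 KiB pages */
#define NODES_PER_PAGE	7		/* approximate xa_node packing from the comment */

int main(void)
{
	unsigned long pages = 4UL << 20;	/* e.g. 16 GiB worth of 4 KiB pages */
	unsigned long max_nodes = pages >> (XA_CHUNK_SHIFT - 3);
	double node_mem = (double)max_nodes * PAGE_SZ / NODES_PER_PAGE;

	printf("pages=%lu max_nodes=%lu\n", pages, max_nodes);
	printf("worst-case shadow-node footprint: %.1f%% of covered memory\n",
	       100.0 * node_mem / ((double)pages * PAGE_SZ));
	return 0;
}

For 16 GiB this prints max_nodes=524288 and a footprint of about 1.8%, matching the figure quoted in the shrinker comment; nodes beyond that cap are what count_shadow_nodes() reports as reclaimable.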
// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level.
* * Generic INET transport hashtables * * Authors: Lotsa people, from code originally in tcp */ #include <linux/module.h> #include <linux/random.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/wait.h> #include <linux/vmalloc.h> #include <linux/memblock.h> #include <linux/gcd.h> #include <net/addrconf.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/inet6_hashtables.h> #endif #include <net/hotdata.h> #include <net/ip.h> #include <net/rps.h> #include <net/secure_seq.h> #include <net/sock_reuseport.h> #include <net/tcp.h> static void inet_init_ehash_secret(void) { net_get_random_sleepable_once(&inet_ehash_secret, sizeof(inet_ehash_secret)); } u32 inet_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport) { return lport + __inet_ehashfn(laddr, 0, faddr, fport, inet_ehash_secret + net_hash_mix(net)); } EXPORT_SYMBOL_GPL(inet_ehashfn); /* This function handles inet_sock, but also timewait and request sockets * for IPv4/IPv6. */ static u32 sk_ehashfn(const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6 && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) return inet6_ehashfn(sock_net(sk), &sk->sk_v6_rcv_saddr, sk->sk_num, &sk->sk_v6_daddr, sk->sk_dport); #endif return inet_ehashfn(sock_net(sk), sk->sk_rcv_saddr, sk->sk_num, sk->sk_daddr, sk->sk_dport); } static bool sk_is_connect_bind(const struct sock *sk) { if (sk->sk_state == TCP_TIME_WAIT) return inet_twsk(sk)->tw_connect_bind; else return sk->sk_userlocks & SOCK_CONNECT_BIND; } /* * Allocate and initialize a new local port bind bucket. * The bindhash mutex for snum's hash chain must be held here. */ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, const unsigned short snum, int l3mdev) { struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); if (tb) { write_pnet(&tb->ib_net, net); tb->l3mdev = l3mdev; tb->port = snum; tb->fastreuse = 0; tb->fastreuseport = 0; INIT_HLIST_HEAD(&tb->bhash2); hlist_add_head_rcu(&tb->node, &head->chain); } return tb; } /* * Caller must hold hashbucket lock for this tb with local BH disabled */ void inet_bind_bucket_destroy(struct inet_bind_bucket *tb) { const struct inet_bind2_bucket *tb2; if (hlist_empty(&tb->bhash2)) { hlist_del_rcu(&tb->node); kfree_rcu(tb, rcu); return; } if (tb->fastreuse == -1 && tb->fastreuseport == -1) return; hlist_for_each_entry(tb2, &tb->bhash2, bhash_node) { if (tb2->fastreuse != -1 || tb2->fastreuseport != -1) return; } tb->fastreuse = -1; tb->fastreuseport = -1; } bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, unsigned short port, int l3mdev) { return net_eq(ib_net(tb), net) && tb->port == port && tb->l3mdev == l3mdev; } static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2, struct net *net, struct inet_bind_hashbucket *head, struct inet_bind_bucket *tb, const struct sock *sk) { write_pnet(&tb2->ib_net, net); tb2->l3mdev = tb->l3mdev; tb2->port = tb->port; #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(USHRT_MAX < (IPV6_ADDR_ANY | IPV6_ADDR_MAPPED)); if (sk->sk_family == AF_INET6) { tb2->addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); tb2->v6_rcv_saddr = sk->sk_v6_rcv_saddr; } else { tb2->addr_type = IPV6_ADDR_MAPPED; ipv6_addr_set_v4mapped(sk->sk_rcv_saddr, &tb2->v6_rcv_saddr); } #else tb2->rcv_saddr = sk->sk_rcv_saddr; #endif tb2->fastreuse = 0; tb2->fastreuseport = 0; 
INIT_HLIST_HEAD(&tb2->owners); hlist_add_head(&tb2->node, &head->chain); hlist_add_head(&tb2->bhash_node, &tb->bhash2); } struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, struct inet_bind_bucket *tb, const struct sock *sk) { struct inet_bind2_bucket *tb2 = kmem_cache_alloc(cachep, GFP_ATOMIC); if (tb2) inet_bind2_bucket_init(tb2, net, head, tb, sk); return tb2; } /* Caller must hold hashbucket lock for this tb with local BH disabled */ void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) { const struct sock *sk; if (hlist_empty(&tb->owners)) { __hlist_del(&tb->node); __hlist_del(&tb->bhash_node); kmem_cache_free(cachep, tb); return; } if (tb->fastreuse == -1 && tb->fastreuseport == -1) return; sk_for_each_bound(sk, &tb->owners) { if (!sk_is_connect_bind(sk)) return; } tb->fastreuse = -1; tb->fastreuseport = -1; } static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) return ipv6_addr_equal(&tb2->v6_rcv_saddr, &sk->sk_v6_rcv_saddr); if (tb2->addr_type != IPV6_ADDR_MAPPED) return false; #endif return tb2->rcv_saddr == sk->sk_rcv_saddr; } void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, struct inet_bind2_bucket *tb2, unsigned short port) { WRITE_ONCE(inet_sk(sk)->inet_num, port); inet_csk(sk)->icsk_bind_hash = tb; inet_csk(sk)->icsk_bind2_hash = tb2; sk_add_bind_node(sk, &tb2->owners); } /* * Get rid of any references to a local port held by the given sock. */ static void __inet_put_port(struct sock *sk) { struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_bind_hashbucket *head, *head2; struct net *net = sock_net(sk); struct inet_bind_bucket *tb; int bhash; bhash = inet_bhashfn(net, inet_sk(sk)->inet_num, hashinfo->bhash_size); head = &hashinfo->bhash[bhash]; head2 = inet_bhashfn_portaddr(hashinfo, sk, net, inet_sk(sk)->inet_num); spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; inet_csk(sk)->icsk_bind_hash = NULL; WRITE_ONCE(inet_sk(sk)->inet_num, 0); sk->sk_userlocks &= ~SOCK_CONNECT_BIND; spin_lock(&head2->lock); if (inet_csk(sk)->icsk_bind2_hash) { struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash; __sk_del_bind_node(sk); inet_csk(sk)->icsk_bind2_hash = NULL; inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); } spin_unlock(&head2->lock); inet_bind_bucket_destroy(tb); spin_unlock(&head->lock); } void inet_put_port(struct sock *sk) { local_bh_disable(); __inet_put_port(sk); local_bh_enable(); } EXPORT_SYMBOL(inet_put_port); int __inet_inherit_port(const struct sock *sk, struct sock *child) { struct inet_hashinfo *table = tcp_get_hashinfo(sk); unsigned short port = inet_sk(child)->inet_num; struct inet_bind_hashbucket *head, *head2; bool created_inet_bind_bucket = false; struct net *net = sock_net(sk); bool update_fastreuse = false; struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; int bhash, l3mdev; bhash = inet_bhashfn(net, port, table->bhash_size); head = &table->bhash[bhash]; head2 = inet_bhashfn_portaddr(table, child, net, port); spin_lock(&head->lock); spin_lock(&head2->lock); tb = inet_csk(sk)->icsk_bind_hash; tb2 = inet_csk(sk)->icsk_bind2_hash; if (unlikely(!tb || !tb2)) { spin_unlock(&head2->lock); spin_unlock(&head->lock); return -ENOENT; } if (tb->port != port) { l3mdev = inet_sk_bound_l3mdev(sk); /* NOTE: using tproxy and redirecting skbs to a proxy * on a different listener port 
breaks the assumption * that the listener socket's icsk_bind_hash is the same * as that of the child socket. We have to look up or * create a new bind bucket for the child here. */ inet_bind_bucket_for_each(tb, &head->chain) { if (inet_bind_bucket_match(tb, net, port, l3mdev)) break; } if (!tb) { tb = inet_bind_bucket_create(table->bind_bucket_cachep, net, head, port, l3mdev); if (!tb) { spin_unlock(&head2->lock); spin_unlock(&head->lock); return -ENOMEM; } created_inet_bind_bucket = true; } update_fastreuse = true; goto bhash2_find; } else if (!inet_bind2_bucket_addr_match(tb2, child)) { l3mdev = inet_sk_bound_l3mdev(sk); bhash2_find: tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child); if (!tb2) { tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep, net, head2, tb, child); if (!tb2) goto error; } } if (update_fastreuse) inet_csk_update_fastreuse(child, tb, tb2); inet_bind_hash(child, tb, tb2, port); spin_unlock(&head2->lock); spin_unlock(&head->lock); return 0; error: if (created_inet_bind_bucket) inet_bind_bucket_destroy(tb); spin_unlock(&head2->lock); spin_unlock(&head->lock); return -ENOMEM; } EXPORT_SYMBOL_GPL(__inet_inherit_port); static struct inet_listen_hashbucket * inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk) { u32 hash; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) hash = ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, inet_sk(sk)->inet_num); else #endif hash = ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, inet_sk(sk)->inet_num); return inet_lhash2_bucket(h, hash); } static inline int compute_score(struct sock *sk, const struct net *net, const unsigned short hnum, const __be32 daddr, const int dif, const int sdif) { int score = -1; if (net_eq(sock_net(sk), net) && READ_ONCE(sk->sk_num) == hnum && !ipv6_only_sock(sk)) { if (sk->sk_rcv_saddr != daddr) return -1; if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return -1; score = sk->sk_bound_dev_if ? 2 : 1; if (sk->sk_family == PF_INET) score++; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; } return score; } /** * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary. * @net: network namespace. * @sk: AF_INET socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP. * @skb: context for a potential SK_REUSEPORT program. * @doff: header offset. * @saddr: source address. * @sport: source port. * @daddr: destination address. * @hnum: destination port in host byte order. * @ehashfn: hash function used to generate the fallback hash. * * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to * the selected sock or an error. */ struct sock *inet_lookup_reuseport(const struct net *net, struct sock *sk, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, unsigned short hnum, inet_ehashfn_t *ehashfn) { struct sock *reuse_sk = NULL; u32 phash; if (sk->sk_reuseport) { phash = INDIRECT_CALL_2(ehashfn, udp_ehashfn, inet_ehashfn, net, daddr, hnum, saddr, sport); reuse_sk = reuseport_select_sock(sk, phash, skb, doff); } return reuse_sk; } EXPORT_SYMBOL_GPL(inet_lookup_reuseport); /* * Here are some nice properties to exploit here. The BSD API * does not allow a listening sock to specify the remote port nor the * remote address for the connection. So always assume those are both * wildcarded during the search since they can never be otherwise. 
*/ /* called with rcu_read_lock() : No refcount taken on the socket */ static struct sock *inet_lhash2_lookup(const struct net *net, struct inet_listen_hashbucket *ilb2, struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif) { struct sock *sk, *result = NULL; struct hlist_nulls_node *node; int score, hiscore = 0; sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { result = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, inet_ehashfn); if (result) return result; result = sk; hiscore = score; } } return result; } struct sock *inet_lookup_run_sk_lookup(const struct net *net, int protocol, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, u16 hnum, const int dif, inet_ehashfn_t *ehashfn) { struct sock *sk, *reuse_sk; bool no_reuseport; no_reuseport = bpf_sk_lookup_run_v4(net, protocol, saddr, sport, daddr, hnum, dif, &sk); if (no_reuseport || IS_ERR_OR_NULL(sk)) return sk; reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, ehashfn); if (reuse_sk) sk = reuse_sk; return sk; } struct sock *__inet_lookup_listener(const struct net *net, struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif) { struct inet_listen_hashbucket *ilb2; struct inet_hashinfo *hashinfo; struct sock *result = NULL; unsigned int hash2; /* Lookup redirect from BPF */ if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff, saddr, sport, daddr, hnum, dif, inet_ehashfn); if (result) goto done; } hashinfo = net->ipv4.tcp_death_row.hashinfo; hash2 = ipv4_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); result = inet_lhash2_lookup(net, ilb2, skb, doff, saddr, sport, daddr, hnum, dif, sdif); if (result) goto done; /* Lookup lhash2 with INADDR_ANY */ hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); result = inet_lhash2_lookup(net, ilb2, skb, doff, saddr, sport, htonl(INADDR_ANY), hnum, dif, sdif); done: if (IS_ERR(result)) return NULL; return result; } EXPORT_SYMBOL_GPL(__inet_lookup_listener); /* All sockets share common refcount, but have different destructors */ void sock_gen_put(struct sock *sk) { if (!refcount_dec_and_test(&sk->sk_refcnt)) return; if (sk->sk_state == TCP_TIME_WAIT) inet_twsk_free(inet_twsk(sk)); else if (sk->sk_state == TCP_NEW_SYN_RECV) reqsk_free(inet_reqsk(sk)); else sk_free(sk); } EXPORT_SYMBOL_GPL(sock_gen_put); void sock_edemux(struct sk_buff *skb) { sock_gen_put(skb->sk); } EXPORT_SYMBOL(sock_edemux); struct sock *__inet_lookup_established(const struct net *net, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 hnum, const int dif, const int sdif) { const __portpair ports = INET_COMBINED_PORTS(sport, hnum); INET_ADDR_COOKIE(acookie, saddr, daddr); const struct hlist_nulls_node *node; struct inet_ehash_bucket *head; struct inet_hashinfo *hashinfo; unsigned int hash, slot; struct sock *sk; hashinfo = net->ipv4.tcp_death_row.hashinfo; hash = inet_ehashfn(net, daddr, hnum, saddr, sport); slot = hash & hashinfo->ehash_mask; head = &hashinfo->ehash[slot]; begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) continue; if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) { if 
(unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) goto out; if (unlikely(!inet_match(net, sk, acookie, ports, dif, sdif))) { sock_gen_put(sk); goto begin; } goto found; } } /* * if the nulls value we got at the end of this lookup is * not the expected one, we must restart lookup. * We probably met an item that was moved to another chain. */ if (get_nulls_value(node) != slot) goto begin; out: sk = NULL; found: return sk; } EXPORT_SYMBOL_GPL(__inet_lookup_established); /* called with local bh disabled */ static int __inet_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, __u16 lport, struct inet_timewait_sock **twp, bool rcu_lookup, u32 hash) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); __be32 daddr = inet->inet_rcv_saddr; __be32 saddr = inet->inet_daddr; int dif = sk->sk_bound_dev_if; struct net *net = sock_net(sk); int sdif = l3mdev_master_ifindex_by_index(net, dif); INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); struct inet_timewait_sock *tw = NULL; const struct hlist_nulls_node *node; struct sock *sk2; spinlock_t *lock; if (rcu_lookup) { sk_nulls_for_each(sk2, node, &head->chain) { if (sk2->sk_hash != hash || !inet_match(net, sk2, acookie, ports, dif, sdif)) continue; if (sk2->sk_state == TCP_TIME_WAIT) break; return -EADDRNOTAVAIL; } return 0; } lock = inet_ehash_lockp(hinfo, hash); spin_lock(lock); sk_nulls_for_each(sk2, node, &head->chain) { if (sk2->sk_hash != hash) continue; if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); if (tcp_twsk_unique(sk, sk2, twp)) break; } goto not_unique; } } /* Must record num and sport now. Otherwise we will see * in hash table socket with a funny identity. */ inet->inet_num = lport; inet->inet_sport = htons(lport); sk->sk_hash = hash; WARN_ON(!sk_unhashed(sk)); __sk_nulls_add_node_rcu(sk, &head->chain); if (tw) { sk_nulls_del_node_init_rcu((struct sock *)tw); __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); } spin_unlock(lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); if (twp) { *twp = tw; } else if (tw) { /* Silly. Should hash-dance instead... */ inet_twsk_deschedule_put(tw); } return 0; not_unique: spin_unlock(lock); return -EADDRNOTAVAIL; } static u64 inet_sk_port_offset(const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr, inet->inet_daddr, inet->inet_dport); } /* Searches for an exsiting socket in the ehash bucket list. * Returns true if found, false otherwise. 
*/ static bool inet_ehash_lookup_by_sk(struct sock *sk, struct hlist_nulls_head *list) { const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num); const int sdif = sk->sk_bound_dev_if; const int dif = sk->sk_bound_dev_if; const struct hlist_nulls_node *node; struct net *net = sock_net(sk); struct sock *esk; INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr); sk_nulls_for_each_rcu(esk, node, list) { if (esk->sk_hash != sk->sk_hash) continue; if (sk->sk_family == AF_INET) { if (unlikely(inet_match(net, esk, acookie, ports, dif, sdif))) { return true; } } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { if (unlikely(inet6_match(net, esk, &sk->sk_v6_daddr, &sk->sk_v6_rcv_saddr, ports, dif, sdif))) { return true; } } #endif } return false; } /* Insert a socket into ehash, and eventually remove another one * (The another one can be a SYN_RECV or TIMEWAIT) * If an existing socket already exists, socket sk is not inserted, * and sets found_dup_sk parameter to true. */ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) { struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_ehash_bucket *head; struct hlist_nulls_head *list; spinlock_t *lock; bool ret = true; WARN_ON_ONCE(!sk_unhashed(sk)); sk->sk_hash = sk_ehashfn(sk); head = inet_ehash_bucket(hashinfo, sk->sk_hash); list = &head->chain; lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock(lock); if (osk) { WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); ret = sk_nulls_replace_node_init_rcu(osk, sk); goto unlock; } if (found_dup_sk) { *found_dup_sk = inet_ehash_lookup_by_sk(sk, list); if (*found_dup_sk) ret = false; } if (ret) __sk_nulls_add_node_rcu(sk, list); unlock: spin_unlock(lock); return ret; } bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) { bool ok = inet_ehash_insert(sk, osk, found_dup_sk); if (ok) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } else { tcp_orphan_count_inc(); inet_sk_set_state(sk, TCP_CLOSE); sock_set_flag(sk, SOCK_DEAD); inet_csk_destroy_sock(sk); } return ok; } static int inet_reuseport_add_sock(struct sock *sk, struct inet_listen_hashbucket *ilb) { struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; const struct hlist_nulls_node *node; kuid_t uid = sk_uid(sk); struct sock *sk2; sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) { if (sk2 != sk && sk2->sk_family == sk->sk_family && ipv6_only_sock(sk2) == ipv6_only_sock(sk) && sk2->sk_bound_dev_if == sk->sk_bound_dev_if && inet_csk(sk2)->icsk_bind_hash == tb && sk2->sk_reuseport && uid_eq(uid, sk_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) return reuseport_add_sock(sk, sk2, inet_rcv_saddr_any(sk)); } return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } int inet_hash(struct sock *sk) { struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_listen_hashbucket *ilb2; int err = 0; if (sk->sk_state == TCP_CLOSE) return 0; if (sk->sk_state != TCP_LISTEN) { local_bh_disable(); inet_ehash_nolisten(sk, NULL, NULL); local_bh_enable(); return 0; } #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) inet6_init_ehash_secret(); #endif inet_init_ehash_secret(); WARN_ON(!sk_unhashed(sk)); ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); spin_lock(&ilb2->lock); if (sk->sk_reuseport) { err = inet_reuseport_add_sock(sk, ilb2); if (err) goto unlock; } sock_set_flag(sk, SOCK_RCU_FREE); if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && sk->sk_family == AF_INET6) __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head); else __sk_nulls_add_node_rcu(sk, 
&ilb2->nulls_head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); unlock: spin_unlock(&ilb2->lock); return err; } void inet_unhash(struct sock *sk) { struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); if (sk_unhashed(sk)) return; sock_rps_delete_flow(sk); if (sk->sk_state == TCP_LISTEN) { struct inet_listen_hashbucket *ilb2; ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); /* Don't disable bottom halves while acquiring the lock to * avoid circular locking dependency on PREEMPT_RT. */ spin_lock(&ilb2->lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_stop_listen_sock(sk); __sk_nulls_del_node_init_rcu(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock(&ilb2->lock); } else { spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock_bh(lock); __sk_nulls_del_node_init_rcu(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock_bh(lock); } } static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk) { if (!net_eq(ib2_net(tb), net) || tb->port != port || tb->l3mdev != l3mdev) return false; return inet_bind2_bucket_addr_match(tb, sk); } bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk) { if (!net_eq(ib2_net(tb), net) || tb->port != port || tb->l3mdev != l3mdev) return false; #if IS_ENABLED(CONFIG_IPV6) if (tb->addr_type == IPV6_ADDR_ANY) return true; if (tb->addr_type != IPV6_ADDR_MAPPED) return false; if (sk->sk_family == AF_INET6 && !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) return false; #endif return tb->rcv_saddr == 0; } /* The socket's bhash2 hashbucket spinlock must be held when this is called */ struct inet_bind2_bucket * inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk) { struct inet_bind2_bucket *bhash2 = NULL; inet_bind_bucket_for_each(bhash2, &head->chain) if (inet_bind2_bucket_match(bhash2, net, port, l3mdev, sk)) break; return bhash2; } struct inet_bind_hashbucket * inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port) { struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); u32 hash; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) hash = ipv6_portaddr_hash(net, &in6addr_any, port); else #endif hash = ipv4_portaddr_hash(net, 0, port); return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; } static void inet_update_saddr(struct sock *sk, void *saddr, int family) { if (family == AF_INET) { inet_sk(sk)->inet_saddr = *(__be32 *)saddr; sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr); } #if IS_ENABLED(CONFIG_IPV6) else { sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr; } #endif } static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset) { struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2, *new_tb2; int l3mdev = inet_sk_bound_l3mdev(sk); int port = inet_sk(sk)->inet_num; struct net *net = sock_net(sk); int bhash; if (!inet_csk(sk)->icsk_bind2_hash) { /* Not bind()ed before. */ if (reset) inet_reset_saddr(sk); else inet_update_saddr(sk, saddr, family); return 0; } /* Allocate a bind2 bucket ahead of time to avoid permanently putting * the bhash2 table in an inconsistent state if a new tb2 bucket * allocation fails. 
*/ new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC); if (!new_tb2) { if (reset) { /* The (INADDR_ANY, port) bucket might have already * been freed, then we cannot fixup icsk_bind2_hash, * so we give up and unlink sk from bhash/bhash2 not * to leave inconsistency in bhash2. */ inet_put_port(sk); inet_reset_saddr(sk); } return -ENOMEM; } bhash = inet_bhashfn(net, port, hinfo->bhash_size); head = &hinfo->bhash[bhash]; head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); /* If we change saddr locklessly, another thread * iterating over bhash might see corrupted address. */ spin_lock_bh(&head->lock); spin_lock(&head2->lock); __sk_del_bind_node(sk); inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash); spin_unlock(&head2->lock); if (reset) inet_reset_saddr(sk); else inet_update_saddr(sk, saddr, family); head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); spin_lock(&head2->lock); tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); if (!tb2) { tb2 = new_tb2; inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk); if (sk_is_connect_bind(sk)) { tb2->fastreuse = -1; tb2->fastreuseport = -1; } } inet_csk(sk)->icsk_bind2_hash = tb2; sk_add_bind_node(sk, &tb2->owners); spin_unlock(&head2->lock); spin_unlock_bh(&head->lock); if (tb2 != new_tb2) kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2); return 0; } int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) { return __inet_bhash2_update_saddr(sk, saddr, family, false); } void inet_bhash2_reset_saddr(struct sock *sk) { if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) __inet_bhash2_update_saddr(sk, NULL, 0, true); } /* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm * Note that we use 32bit integers (vs RFC 'short integers') * because 2^16 is not a multiple of num_ephemeral and this * property might be used by clever attacker. * * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though * attacks were since demonstrated, thus we use 65536 by default instead * to really give more isolation and privacy, at the expense of 256kB * of kernel memory. */ #define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER) static u32 *table_perturb; int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u64 port_offset, u32 hash_port0, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, struct inet_timewait_sock **, bool rcu_lookup, u32 hash)) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_bind_hashbucket *head, *head2; struct inet_timewait_sock *tw = NULL; int port = inet_sk(sk)->inet_num; struct net *net = sock_net(sk); struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; int step, scan_step, l3mdev; u32 index, max_rand_step; bool tb_created = false; u32 remaining, offset; int ret, i, low, high; bool local_ports; if (port) { local_bh_disable(); ret = check_established(death_row, sk, port, NULL, false, hash_port0 + port); local_bh_enable(); return ret; } l3mdev = inet_sk_bound_l3mdev(sk); local_ports = inet_sk_get_local_port_range(sk, &low, &high); step = local_ports ? 
1 : 2; scan_step = step; max_rand_step = READ_ONCE(net->ipv4.sysctl_ip_local_port_step_width); high++; /* [32768, 60999] -> [32768, 61000[ */ remaining = high - low; if (!local_ports && remaining > 1) remaining &= ~1U; get_random_sleepable_once(table_perturb, INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb)); index = port_offset & (INET_TABLE_PERTURB_SIZE - 1); offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32); offset %= remaining; /* In first pass we try ports of @low parity. * inet_csk_get_port() does the opposite choice. */ if (!local_ports) offset &= ~1U; if (max_rand_step && remaining > 1) { u32 range = remaining / step; u32 upper_bound; upper_bound = min(range, max_rand_step); scan_step = get_random_u32_inclusive(1, upper_bound); while (gcd(scan_step, range) != 1) { scan_step++; /* if both scan_step and range are even gcd won't be 1 */ if (!(scan_step & 1) && !(range & 1)) scan_step++; if (unlikely(scan_step > upper_bound)) { scan_step = 1; break; } } scan_step *= step; } other_parity_scan: port = low + offset; for (i = 0; i < remaining; i += step, port += scan_step) { if (unlikely(port >= high)) port -= remaining; if (inet_is_local_reserved_port(net, port)) continue; head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; rcu_read_lock(); hlist_for_each_entry_rcu(tb, &head->chain, node) { if (!inet_bind_bucket_match(tb, net, port, l3mdev)) continue; if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) { rcu_read_unlock(); goto next_port; } if (!check_established(death_row, sk, port, &tw, true, hash_port0 + port)) break; rcu_read_unlock(); goto next_port; } rcu_read_unlock(); spin_lock_bh(&head->lock); /* Does not bother with rcv_saddr checks, because * the established check is already unique enough. */ inet_bind_bucket_for_each(tb, &head->chain) { if (inet_bind_bucket_match(tb, net, port, l3mdev)) { if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) goto next_port_unlock; WARN_ON(hlist_empty(&tb->bhash2)); if (!check_established(death_row, sk, port, &tw, false, hash_port0 + port)) goto ok; goto next_port_unlock; } } tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net, head, port, l3mdev); if (!tb) { spin_unlock_bh(&head->lock); return -ENOMEM; } tb_created = true; tb->fastreuse = -1; tb->fastreuseport = -1; goto ok; next_port_unlock: spin_unlock_bh(&head->lock); next_port: cond_resched(); } if (!local_ports) { offset++; if ((offset & 1) && remaining > 1) goto other_parity_scan; } return -EADDRNOTAVAIL; ok: /* Find the corresponding tb2 bucket since we need to * add the socket to the bhash2 table as well */ head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); spin_lock(&head2->lock); tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); if (!tb2) { tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net, head2, tb, sk); if (!tb2) goto error; tb2->fastreuse = -1; tb2->fastreuseport = -1; } /* Here we want to add a little bit of randomness to the next source * port that will be chosen. We use a max() with a random here so that * on low contention the randomness is maximal and on high contention * it may be inexistent. 
*/ i = max_t(int, i, get_random_u32_below(8) * step); WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step); /* Head lock still held and bh's disabled */ inet_bind_hash(sk, tb, tb2, port); sk->sk_userlocks |= SOCK_CONNECT_BIND; if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); inet_ehash_nolisten(sk, (struct sock *)tw, NULL); } if (tw) inet_twsk_bind_unhash(tw, hinfo); spin_unlock(&head2->lock); spin_unlock(&head->lock); if (tw) inet_twsk_deschedule_put(tw); local_bh_enable(); return 0; error: if (sk_hashed(sk)) { spinlock_t *lock = inet_ehash_lockp(hinfo, sk->sk_hash); sock_prot_inuse_add(net, sk->sk_prot, -1); spin_lock(lock); __sk_nulls_del_node_init_rcu(sk); spin_unlock(lock); sk->sk_hash = 0; inet_sk(sk)->inet_sport = 0; WRITE_ONCE(inet_sk(sk)->inet_num, 0); if (tw) inet_twsk_bind_unhash(tw, hinfo); } spin_unlock(&head2->lock); if (tb_created) inet_bind_bucket_destroy(tb); spin_unlock(&head->lock); if (tw) inet_twsk_deschedule_put(tw); local_bh_enable(); return -ENOMEM; } /* * Bind a port for a connect operation and hash it. */ int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); const struct net *net = sock_net(sk); u64 port_offset = 0; u32 hash_port0; if (!inet_sk(sk)->inet_num) port_offset = inet_sk_port_offset(sk); inet_init_ehash_secret(); hash_port0 = inet_ehashfn(net, inet->inet_rcv_saddr, 0, inet->inet_daddr, inet->inet_dport); return __inet_hash_connect(death_row, sk, port_offset, hash_port0, __inet_check_established); } void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, unsigned long numentries, int scale, unsigned long low_limit, unsigned long high_limit) { unsigned int i; h->lhash2 = alloc_large_system_hash(name, sizeof(*h->lhash2), numentries, scale, 0, NULL, &h->lhash2_mask, low_limit, high_limit); for (i = 0; i <= h->lhash2_mask; i++) { spin_lock_init(&h->lhash2[i].lock); INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head, i + LISTENING_NULLS_BASE); } /* this one is used for source ports of outgoing connections */ table_perturb = alloc_large_system_hash("Table-perturb", sizeof(*table_perturb), INET_TABLE_PERTURB_SIZE, 0, 0, NULL, NULL, INET_TABLE_PERTURB_SIZE, INET_TABLE_PERTURB_SIZE); } int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) { unsigned int locksz = sizeof(spinlock_t); unsigned int i, nblocks = 1; spinlock_t *ptr = NULL; if (locksz == 0) goto set_mask; /* Allocate 2 cache lines or at least one spinlock per cpu. */ nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U) * num_possible_cpus(); /* At least one page per NUMA node. */ nblocks = max(nblocks, num_online_nodes() * PAGE_SIZE / locksz); nblocks = roundup_pow_of_two(nblocks); /* No more locks than number of hash buckets. */ nblocks = min(nblocks, hashinfo->ehash_mask + 1); if (num_online_nodes() > 1) { /* Use vmalloc() to allow NUMA policy to spread pages * on all available nodes if desired. 
*/ ptr = vmalloc_array(nblocks, locksz); } if (!ptr) { ptr = kvmalloc_array(nblocks, locksz, GFP_KERNEL); if (!ptr) return -ENOMEM; } for (i = 0; i < nblocks; i++) spin_lock_init(&ptr[i]); hashinfo->ehash_locks = ptr; set_mask: hashinfo->ehash_locks_mask = nblocks - 1; return 0; } struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, unsigned int ehash_entries) { struct inet_hashinfo *new_hashinfo; int i; new_hashinfo = kmemdup(hashinfo, sizeof(*hashinfo), GFP_KERNEL); if (!new_hashinfo) goto err; new_hashinfo->ehash = vmalloc_huge(ehash_entries * sizeof(struct inet_ehash_bucket), GFP_KERNEL_ACCOUNT); if (!new_hashinfo->ehash) goto free_hashinfo; new_hashinfo->ehash_mask = ehash_entries - 1; if (inet_ehash_locks_alloc(new_hashinfo)) goto free_ehash; for (i = 0; i < ehash_entries; i++) INIT_HLIST_NULLS_HEAD(&new_hashinfo->ehash[i].chain, i); new_hashinfo->pernet = true; return new_hashinfo; free_ehash: vfree(new_hashinfo->ehash); free_hashinfo: kfree(new_hashinfo); err: return NULL; } void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo) { if (!hashinfo->pernet) return; inet_ehash_locks_free(hashinfo); vfree(hashinfo->ehash); kfree(hashinfo); }
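Before the filesystem library code that follows, here is a small standalone illustration of the scan pattern __inet_hash_connect() uses for RFC 6056-style source-port selection. It is a hedged userspace sketch, not kernel code: the 16-port range, the starting offset (which in the kernel comes from table_perturb[] indexed by port_offset) and the stride are toy values, and the even/odd parity split shared with inet_csk_get_port() is ignored. The one property it preserves is the important one: the stride is coprime with the range size, so a single pass of remaining iterations visits every candidate port exactly once.

#include <stdio.h>

int main(void)
{
	unsigned int low = 32768, high = 32768 + 16;	/* toy ephemeral range */
	unsigned int remaining = high - low;
	unsigned int offset = 5;	/* stands in for table_perturb[index] + (port_offset >> 32) */
	unsigned int step = 3;		/* chosen so that gcd(step, remaining) == 1 */
	unsigned int port = low + (offset % remaining);
	unsigned int i;

	for (i = 0; i < remaining; i++, port += step) {
		if (port >= high)	/* wrap around, as the kernel loop does */
			port -= remaining;
		printf("try port %u\n", port);	/* each of the 16 ports appears once */
	}
	return 0;
}

Keeping the stride coprime with the range is what lets the real loop give up after remaining attempts and return -EADDRNOTAVAIL, knowing that every port in the range has been tried.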
// SPDX-License-Identifier: GPL-2.0-only /* * fs/libfs.c * Library for filesystems writers.
*/ #include <linux/blkdev.h> #include <linux/export.h> #include <linux/filelock.h> #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/mount.h> #include <linux/vfs.h> #include <linux/quotaops.h> #include <linux/mutex.h> #include <linux/namei.h> #include <linux/exportfs.h> #include <linux/iversion.h> #include <linux/writeback.h> #include <linux/fs_context.h> #include <linux/pseudo_fs.h> #include <linux/fsnotify.h> #include <linux/unicode.h> #include <linux/fscrypt.h> #include <linux/pidfs.h> #include <linux/uaccess.h> #include "internal.h" int simple_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9); return 0; } EXPORT_SYMBOL(simple_getattr); int simple_statfs(struct dentry *dentry, struct kstatfs *buf) { u64 id = huge_encode_dev(dentry->d_sb->s_dev); buf->f_fsid = u64_to_fsid(id); buf->f_type = dentry->d_sb->s_magic; buf->f_bsize = PAGE_SIZE; buf->f_namelen = NAME_MAX; return 0; } EXPORT_SYMBOL(simple_statfs); /* * Retaining negative dentries for an in-memory filesystem just wastes * memory and lookup time: arrange for them to be deleted immediately. */ int always_delete_dentry(const struct dentry *dentry) { return 1; } EXPORT_SYMBOL(always_delete_dentry); /* * Lookup the data. This is trivial - if the dentry didn't already * exist, we know it is negative. Set d_op to delete negative dentries. */ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); if (!dentry->d_op && !(dentry->d_flags & DCACHE_DONTCACHE)) { spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_DONTCACHE; spin_unlock(&dentry->d_lock); } if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) return NULL; d_add(dentry, NULL); return NULL; } EXPORT_SYMBOL(simple_lookup); int dcache_dir_open(struct inode *inode, struct file *file) { file->private_data = d_alloc_cursor(file->f_path.dentry); return file->private_data ? 0 : -ENOMEM; } EXPORT_SYMBOL(dcache_dir_open); int dcache_dir_close(struct inode *inode, struct file *file) { dput(file->private_data); return 0; } EXPORT_SYMBOL(dcache_dir_close); /* parent is locked at least shared */ /* * Returns an element of siblings' list. * We are looking for <count>th positive after <p>; if * found, dentry is grabbed and returned to caller. * If no such element exists, NULL is returned. 
*/ static struct dentry *scan_positives(struct dentry *cursor, struct hlist_node **p, loff_t count, struct dentry *last) { struct dentry *dentry = cursor->d_parent, *found = NULL; spin_lock(&dentry->d_lock); while (*p) { struct dentry *d = hlist_entry(*p, struct dentry, d_sib); p = &d->d_sib.next; // we must at least skip cursors, to avoid livelocks if (d->d_flags & DCACHE_DENTRY_CURSOR) continue; if (simple_positive(d) && !--count) { spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(d)) found = dget_dlock(d); spin_unlock(&d->d_lock); if (likely(found)) break; count = 1; } if (need_resched()) { if (!hlist_unhashed(&cursor->d_sib)) __hlist_del(&cursor->d_sib); hlist_add_behind(&cursor->d_sib, &d->d_sib); p = &cursor->d_sib.next; spin_unlock(&dentry->d_lock); cond_resched(); spin_lock(&dentry->d_lock); } } spin_unlock(&dentry->d_lock); dput(last); return found; } loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry *dentry = file->f_path.dentry; switch (whence) { case 1: offset += file->f_pos; fallthrough; case 0: if (offset >= 0) break; fallthrough; default: return -EINVAL; } if (offset != file->f_pos) { struct dentry *cursor = file->private_data; struct dentry *to = NULL; inode_lock_shared(dentry->d_inode); if (offset > 2) to = scan_positives(cursor, &dentry->d_children.first, offset - 2, NULL); spin_lock(&dentry->d_lock); hlist_del_init(&cursor->d_sib); if (to) hlist_add_behind(&cursor->d_sib, &to->d_sib); spin_unlock(&dentry->d_lock); dput(to); file->f_pos = offset; inode_unlock_shared(dentry->d_inode); } return offset; } EXPORT_SYMBOL(dcache_dir_lseek); /* * Directory is locked and all positive dentries in it are safe, since * for ramfs-type trees they can't go away without unlink() or rmdir(), * both impossible due to the lock on directory. 
*/ int dcache_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; struct dentry *cursor = file->private_data; struct dentry *next = NULL; struct hlist_node **p; if (!dir_emit_dots(file, ctx)) return 0; if (ctx->pos == 2) p = &dentry->d_children.first; else p = &cursor->d_sib.next; while ((next = scan_positives(cursor, p, 1, next)) != NULL) { if (!dir_emit(ctx, next->d_name.name, next->d_name.len, d_inode(next)->i_ino, fs_umode_to_dtype(d_inode(next)->i_mode))) break; ctx->pos++; p = &next->d_sib.next; } spin_lock(&dentry->d_lock); hlist_del_init(&cursor->d_sib); if (next) hlist_add_before(&cursor->d_sib, &next->d_sib); spin_unlock(&dentry->d_lock); dput(next); return 0; } EXPORT_SYMBOL(dcache_readdir); ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos) { return -EISDIR; } EXPORT_SYMBOL(generic_read_dir); const struct file_operations simple_dir_operations = { .open = dcache_dir_open, .release = dcache_dir_close, .llseek = dcache_dir_lseek, .read = generic_read_dir, .iterate_shared = dcache_readdir, .fsync = noop_fsync, }; EXPORT_SYMBOL(simple_dir_operations); const struct inode_operations simple_dir_inode_operations = { .lookup = simple_lookup, }; EXPORT_SYMBOL(simple_dir_inode_operations); /* simple_offset_add() never assigns these to a dentry */ enum { DIR_OFFSET_FIRST = 2, /* Find first real entry */ DIR_OFFSET_EOD = S32_MAX, }; /* simple_offset_add() allocation range */ enum { DIR_OFFSET_MIN = DIR_OFFSET_FIRST + 1, DIR_OFFSET_MAX = DIR_OFFSET_EOD - 1, }; static void offset_set(struct dentry *dentry, long offset) { dentry->d_fsdata = (void *)offset; } static long dentry2offset(struct dentry *dentry) { return (long)dentry->d_fsdata; } static struct lock_class_key simple_offset_lock_class; /** * simple_offset_init - initialize an offset_ctx * @octx: directory offset map to be initialized * */ void simple_offset_init(struct offset_ctx *octx) { mt_init_flags(&octx->mt, MT_FLAGS_ALLOC_RANGE); lockdep_set_class(&octx->mt.ma_lock, &simple_offset_lock_class); octx->next_offset = DIR_OFFSET_MIN; } /** * simple_offset_add - Add an entry to a directory's offset map * @octx: directory offset ctx to be updated * @dentry: new dentry being added * * Returns zero on success. @octx and the dentry's offset are updated. * Otherwise, a negative errno value is returned. */ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) { unsigned long offset; int ret; if (dentry2offset(dentry) != 0) return -EBUSY; ret = mtree_alloc_cyclic(&octx->mt, &offset, dentry, DIR_OFFSET_MIN, DIR_OFFSET_MAX, &octx->next_offset, GFP_KERNEL); if (unlikely(ret < 0)) return ret == -EBUSY ? 
-ENOSPC : ret; offset_set(dentry, offset); return 0; } static int simple_offset_replace(struct offset_ctx *octx, struct dentry *dentry, long offset) { int ret; ret = mtree_store(&octx->mt, offset, dentry, GFP_KERNEL); if (ret) return ret; offset_set(dentry, offset); return 0; } /** * simple_offset_remove - Remove an entry to a directory's offset map * @octx: directory offset ctx to be updated * @dentry: dentry being removed * */ void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry) { long offset; offset = dentry2offset(dentry); if (offset == 0) return; mtree_erase(&octx->mt, offset); offset_set(dentry, 0); } /** * simple_offset_rename - handle directory offsets for rename * @old_dir: parent directory of source entry * @old_dentry: dentry of source entry * @new_dir: parent_directory of destination entry * @new_dentry: dentry of destination * * Caller provides appropriate serialization. * * User space expects the directory offset value of the replaced * (new) directory entry to be unchanged after a rename. * * Caller must have grabbed a slot for new_dentry in the maple_tree * associated with new_dir, even if dentry is negative. */ void simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir); struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir); long new_offset = dentry2offset(new_dentry); if (WARN_ON(!new_offset)) return; simple_offset_remove(old_ctx, old_dentry); offset_set(new_dentry, 0); WARN_ON(simple_offset_replace(new_ctx, old_dentry, new_offset)); } /** * simple_offset_rename_exchange - exchange rename with directory offsets * @old_dir: parent of dentry being moved * @old_dentry: dentry being moved * @new_dir: destination parent * @new_dentry: destination dentry * * This API preserves the directory offset values. Caller provides * appropriate serialization. * * Returns zero on success. Otherwise a negative errno is returned and the * rename is rolled back. */ int simple_offset_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir); struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir); long old_index = dentry2offset(old_dentry); long new_index = dentry2offset(new_dentry); int ret; if (WARN_ON(!old_index || !new_index)) return -EINVAL; ret = mtree_store(&new_ctx->mt, new_index, old_dentry, GFP_KERNEL); if (WARN_ON(ret)) return ret; ret = mtree_store(&old_ctx->mt, old_index, new_dentry, GFP_KERNEL); if (WARN_ON(ret)) { mtree_store(&new_ctx->mt, new_index, new_dentry, GFP_KERNEL); return ret; } offset_set(old_dentry, new_index); offset_set(new_dentry, old_index); simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); return 0; } /** * simple_offset_destroy - Release offset map * @octx: directory offset ctx that is about to be destroyed * * During fs teardown (eg. umount), a directory's offset map might still * contain entries. xa_destroy() cleans out anything that remains. */ void simple_offset_destroy(struct offset_ctx *octx) { mtree_destroy(&octx->mt); } /** * offset_dir_llseek - Advance the read position of a directory descriptor * @file: an open directory whose position is to be updated * @offset: a byte offset * @whence: enumerator describing the starting position for this update * * SEEK_END, SEEK_DATA, and SEEK_HOLE are not supported for directories. 
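/*
 * [Editor's sketch, not part of libfs.c] The usual shape of the stable-offset
 * directory hooks (in the style of tmpfs): a per-directory struct offset_ctx
 * is exposed through ->get_offset_ctx(), entries are added and removed as
 * links come and go, and the directory's f_op is simple_offset_dir_operations.
 * Everything named examplefs_* is hypothetical, including the choice of
 * keeping the context behind ->i_private.
 */
struct examplefs_dir_info {
	struct offset_ctx	offsets;
};

static struct offset_ctx *examplefs_get_offset_ctx(struct inode *dir)
{
	struct examplefs_dir_info *info = dir->i_private;

	return &info->offsets;	/* wired up via inode_operations->get_offset_ctx */
}

static void examplefs_dir_init(struct inode *dir, struct examplefs_dir_info *info)
{
	simple_offset_init(&info->offsets);
	dir->i_private = info;
	dir->i_fop = &simple_offset_dir_operations;
}

static int examplefs_add_link(struct inode *dir, struct dentry *dentry)
{
	int err = simple_offset_add(examplefs_get_offset_ctx(dir), dentry);

	if (err)
		return err;
	/* ... instantiate the child inode, update directory times, etc. ... */
	return 0;
}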
* * Returns the updated read position if successful; otherwise a * negative errno is returned and the read position remains unchanged. */ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) { switch (whence) { case SEEK_CUR: offset += file->f_pos; fallthrough; case SEEK_SET: if (offset >= 0) break; fallthrough; default: return -EINVAL; } return vfs_setpos(file, offset, LONG_MAX); } static struct dentry *find_positive_dentry(struct dentry *parent, struct dentry *dentry, bool next) { struct dentry *found = NULL; spin_lock(&parent->d_lock); if (next) dentry = d_next_sibling(dentry); else if (!dentry) dentry = d_first_child(parent); hlist_for_each_entry_from(dentry, d_sib) { if (!simple_positive(dentry)) continue; spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(dentry)) found = dget_dlock(dentry); spin_unlock(&dentry->d_lock); if (likely(found)) break; } spin_unlock(&parent->d_lock); return found; } static noinline_for_stack struct dentry * offset_dir_lookup(struct dentry *parent, loff_t offset) { struct inode *inode = d_inode(parent); struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode); struct dentry *child, *found = NULL; MA_STATE(mas, &octx->mt, offset, offset); if (offset == DIR_OFFSET_FIRST) found = find_positive_dentry(parent, NULL, false); else { rcu_read_lock(); child = mas_find_rev(&mas, DIR_OFFSET_MIN); found = find_positive_dentry(parent, child, false); rcu_read_unlock(); } return found; } static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) { struct inode *inode = d_inode(dentry); return dir_emit(ctx, dentry->d_name.name, dentry->d_name.len, inode->i_ino, fs_umode_to_dtype(inode->i_mode)); } static void offset_iterate_dir(struct file *file, struct dir_context *ctx) { struct dentry *dir = file->f_path.dentry; struct dentry *dentry; dentry = offset_dir_lookup(dir, ctx->pos); if (!dentry) goto out_eod; while (true) { struct dentry *next; ctx->pos = dentry2offset(dentry); if (!offset_dir_emit(ctx, dentry)) break; next = find_positive_dentry(dir, dentry, true); dput(dentry); if (!next) goto out_eod; dentry = next; } dput(dentry); return; out_eod: ctx->pos = DIR_OFFSET_EOD; } /** * offset_readdir - Emit entries starting at offset @ctx->pos * @file: an open directory to iterate over * @ctx: directory iteration context * * Caller must hold @file's i_rwsem to prevent insertion or removal of * entries during this call. * * On entry, @ctx->pos contains an offset that represents the first entry * to be read from the directory. * * The operation continues until there are no more entries to read, or * until the ctx->actor indicates there is no more space in the caller's * output buffer. * * On return, @ctx->pos contains an offset that will read the next entry * in this directory when offset_readdir() is called again with @ctx. * Caller places this value in the d_off field of the last entry in the * user's buffer. 
* * Return values: * %0 - Complete */ static int offset_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dir = file->f_path.dentry; lockdep_assert_held(&d_inode(dir)->i_rwsem); if (!dir_emit_dots(file, ctx)) return 0; if (ctx->pos != DIR_OFFSET_EOD) offset_iterate_dir(file, ctx); return 0; } const struct file_operations simple_offset_dir_operations = { .llseek = offset_dir_llseek, .iterate_shared = offset_readdir, .read = generic_read_dir, .fsync = noop_fsync, .setlease = generic_setlease, }; struct dentry *find_next_child(struct dentry *parent, struct dentry *prev) { struct dentry *child = NULL, *d; spin_lock(&parent->d_lock); d = prev ? d_next_sibling(prev) : d_first_child(parent); hlist_for_each_entry_from(d, d_sib) { if (simple_positive(d)) { spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(d)) child = dget_dlock(d); spin_unlock(&d->d_lock); if (likely(child)) break; } } spin_unlock(&parent->d_lock); dput(prev); return child; } EXPORT_SYMBOL(find_next_child); static void __simple_recursive_removal(struct dentry *dentry, void (*callback)(struct dentry *), bool locked) { struct dentry *this = dget(dentry); while (true) { struct dentry *victim = NULL, *child; struct inode *inode = this->d_inode; inode_lock_nested(inode, I_MUTEX_CHILD); if (d_is_dir(this)) inode->i_flags |= S_DEAD; while ((child = find_next_child(this, victim)) == NULL) { // kill and ascend // update metadata while it's still locked inode_set_ctime_current(inode); clear_nlink(inode); inode_unlock(inode); victim = this; this = this->d_parent; inode = this->d_inode; if (!locked || victim != dentry) inode_lock_nested(inode, I_MUTEX_CHILD); if (simple_positive(victim)) { d_invalidate(victim); // avoid lost mounts if (callback) callback(victim); fsnotify_delete(inode, d_inode(victim), victim); d_make_discardable(victim); } if (victim == dentry) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (d_is_dir(dentry)) drop_nlink(inode); if (!locked) inode_unlock(inode); dput(dentry); return; } } inode_unlock(inode); this = child; } } void simple_recursive_removal(struct dentry *dentry, void (*callback)(struct dentry *)) { return __simple_recursive_removal(dentry, callback, false); } EXPORT_SYMBOL(simple_recursive_removal); void simple_remove_by_name(struct dentry *parent, const char *name, void (*callback)(struct dentry *)) { struct dentry *dentry; dentry = lookup_noperm_positive_unlocked(&QSTR(name), parent); if (!IS_ERR(dentry)) { simple_recursive_removal(dentry, callback); dput(dentry); // paired with lookup_noperm_positive_unlocked() } } EXPORT_SYMBOL(simple_remove_by_name); /* caller holds parent directory with I_MUTEX_PARENT */ void locked_recursive_removal(struct dentry *dentry, void (*callback)(struct dentry *)) { return __simple_recursive_removal(dentry, callback, true); } EXPORT_SYMBOL(locked_recursive_removal); static const struct super_operations simple_super_operations = { .statfs = simple_statfs, }; static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc) { struct pseudo_fs_context *ctx = fc->fs_private; struct inode *root; s->s_maxbytes = MAX_LFS_FILESIZE; s->s_blocksize = PAGE_SIZE; s->s_blocksize_bits = PAGE_SHIFT; s->s_magic = ctx->magic; s->s_op = ctx->ops ?: &simple_super_operations; s->s_export_op = ctx->eops; s->s_xattr = ctx->xattr; s->s_time_gran = 1; s->s_d_flags |= ctx->s_d_flags; root = new_inode(s); if (!root) return -ENOMEM; /* * since this is the first inode, make it number 1. 
New inodes created * after this must take care not to collide with it (by passing * max_reserved of 1 to iunique). */ root->i_ino = 1; root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; simple_inode_init_ts(root); s->s_root = d_make_root(root); if (!s->s_root) return -ENOMEM; set_default_d_op(s, ctx->dops); return 0; } static int pseudo_fs_get_tree(struct fs_context *fc) { return get_tree_nodev(fc, pseudo_fs_fill_super); } static void pseudo_fs_free(struct fs_context *fc) { kfree(fc->fs_private); } static const struct fs_context_operations pseudo_fs_context_ops = { .free = pseudo_fs_free, .get_tree = pseudo_fs_get_tree, }; /* * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that * will never be mountable) */ struct pseudo_fs_context *init_pseudo(struct fs_context *fc, unsigned long magic) { struct pseudo_fs_context *ctx; ctx = kzalloc_obj(struct pseudo_fs_context); if (likely(ctx)) { ctx->magic = magic; fc->fs_private = ctx; fc->ops = &pseudo_fs_context_ops; fc->sb_flags |= SB_NOUSER; fc->global = true; } return ctx; } EXPORT_SYMBOL(init_pseudo); int simple_open(struct inode *inode, struct file *file) { if (inode->i_private) file->private_data = inode->i_private; return 0; } EXPORT_SYMBOL(simple_open); int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); inc_nlink(inode); ihold(inode); d_make_persistent(dentry, inode); return 0; } EXPORT_SYMBOL(simple_link); int simple_empty(struct dentry *dentry) { struct dentry *child; int ret = 0; spin_lock(&dentry->d_lock); hlist_for_each_entry(child, &dentry->d_children, d_sib) { spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(child)) { spin_unlock(&child->d_lock); goto out; } spin_unlock(&child->d_lock); } ret = 1; out: spin_unlock(&dentry->d_lock); return ret; } EXPORT_SYMBOL(simple_empty); void __simple_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); drop_nlink(inode); } EXPORT_SYMBOL(__simple_unlink); void __simple_rmdir(struct inode *dir, struct dentry *dentry) { drop_nlink(d_inode(dentry)); __simple_unlink(dir, dentry); drop_nlink(dir); } EXPORT_SYMBOL(__simple_rmdir); int simple_unlink(struct inode *dir, struct dentry *dentry) { __simple_unlink(dir, dentry); d_make_discardable(dentry); return 0; } EXPORT_SYMBOL(simple_unlink); int simple_rmdir(struct inode *dir, struct dentry *dentry) { if (!simple_empty(dentry)) return -ENOTEMPTY; __simple_rmdir(dir, dentry); d_make_discardable(dentry); return 0; } EXPORT_SYMBOL(simple_rmdir); /** * simple_rename_timestamp - update the various inode timestamps for rename * @old_dir: old parent directory * @old_dentry: dentry that is being renamed * @new_dir: new parent directory * @new_dentry: target for rename * * POSIX mandates that the old and new parent directories have their ctime and * mtime updated, and that inodes of @old_dentry and @new_dentry (if any), have * their ctime updated. 
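/*
 * [Editor's sketch, not part of libfs.c] A typical init_pseudo() caller, in
 * the style of sockfs/pipefs. EXAMPLEFS_MAGIC and the examplefs_* names are
 * hypothetical; real users pick a magic value from <uapi/linux/magic.h>.
 */
#define EXAMPLEFS_MAGIC		0x45584d50	/* hypothetical */

static int examplefs_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, EXAMPLEFS_MAGIC);

	if (!ctx)
		return -ENOMEM;
	/* optionally override ctx->ops, ctx->dops, ctx->eops before returning */
	return 0;
}

static struct file_system_type examplefs_fs_type = {
	.name			= "examplefs",
	.init_fs_context	= examplefs_init_fs_context,
	.kill_sb		= kill_anon_super,
};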
*/ void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct inode *newino = d_inode(new_dentry); inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir)); if (new_dir != old_dir) inode_set_mtime_to_ts(new_dir, inode_set_ctime_current(new_dir)); inode_set_ctime_current(d_inode(old_dentry)); if (newino) inode_set_ctime_current(newino); } EXPORT_SYMBOL_GPL(simple_rename_timestamp); int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { bool old_is_dir = d_is_dir(old_dentry); bool new_is_dir = d_is_dir(new_dentry); if (old_dir != new_dir && old_is_dir != new_is_dir) { if (old_is_dir) { drop_nlink(old_dir); inc_nlink(new_dir); } else { drop_nlink(new_dir); inc_nlink(old_dir); } } simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); return 0; } EXPORT_SYMBOL_GPL(simple_rename_exchange); int simple_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { int they_are_dirs = d_is_dir(old_dentry); if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) return -EINVAL; if (flags & RENAME_EXCHANGE) return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); if (!simple_empty(new_dentry)) return -ENOTEMPTY; if (d_really_is_positive(new_dentry)) { simple_unlink(new_dir, new_dentry); if (they_are_dirs) { drop_nlink(d_inode(new_dentry)); drop_nlink(old_dir); } } else if (they_are_dirs) { drop_nlink(old_dir); inc_nlink(new_dir); } simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); return 0; } EXPORT_SYMBOL(simple_rename); /** * simple_setattr - setattr for simple filesystem * @idmap: idmap of the target mount * @dentry: dentry * @iattr: iattr structure * * Returns 0 on success, -error on failure. * * simple_setattr is a simple ->setattr implementation without a proper * implementation of size changes. * * It can either be used for in-memory filesystems or special files * on simple regular filesystems. Anything that needs to change on-disk * or wire state on size changes needs its own setattr method. 
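/*
 * [Editor's sketch, not part of libfs.c] A ramfs-like directory can point
 * most of its inode_operations straight at the simple_* helpers above; only
 * the operations that allocate new inodes need filesystem-specific code.
 * examplefs_* is a hypothetical name.
 */
static const struct inode_operations examplefs_dir_inode_operations = {
	.lookup		= simple_lookup,
	.link		= simple_link,
	.unlink		= simple_unlink,
	.rmdir		= simple_rmdir,
	.rename		= simple_rename,
	/* .create, .mkdir, .symlink, .mknod: filesystem-specific */
};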
*/ int simple_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); int error; error = setattr_prepare(idmap, dentry, iattr); if (error) return error; if (iattr->ia_valid & ATTR_SIZE) truncate_setsize(inode, iattr->ia_size); setattr_copy(idmap, inode, iattr); mark_inode_dirty(inode); return 0; } EXPORT_SYMBOL(simple_setattr); static int simple_read_folio(struct file *file, struct folio *folio) { folio_zero_range(folio, 0, folio_size(folio)); flush_dcache_folio(folio); folio_mark_uptodate(folio); folio_unlock(folio); return 0; } int simple_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { struct folio *folio; folio = __filemap_get_folio(mapping, pos / PAGE_SIZE, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); *foliop = folio; if (!folio_test_uptodate(folio) && (len != folio_size(folio))) { size_t from = offset_in_folio(folio, pos); folio_zero_segments(folio, 0, from, from + len, folio_size(folio)); } return 0; } EXPORT_SYMBOL(simple_write_begin); /** * simple_write_end - .write_end helper for non-block-device FSes * @iocb: kernel I/O control block * @mapping: " * @pos: " * @len: " * @copied: " * @folio: " * @fsdata: " * * simple_write_end does the minimum needed for updating a folio after * writing is done. It has the same API signature as the .write_end of * address_space_operations vector. So it can just be set onto .write_end for * FSes that don't need any other processing. i_rwsem is assumed to be held * exclusively. * Block based filesystems should use generic_write_end(). * NOTE: Even though i_size might get updated by this function, mark_inode_dirty * is not called, so a filesystem that actually does store data in .write_inode * should extend on what's done here with a call to mark_inode_dirty() in the * case that i_size has changed. * * Use *ONLY* with simple_read_folio() */ static int simple_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct inode *inode = folio->mapping->host; loff_t last_pos = pos + copied; /* zero the stale part of the folio if we did a short copy */ if (!folio_test_uptodate(folio)) { if (copied < len) { size_t from = offset_in_folio(folio, pos); folio_zero_range(folio, from + copied, len - copied); } folio_mark_uptodate(folio); } /* * No need to use i_size_read() here, the i_size * cannot change under us because we hold the i_rwsem. */ if (last_pos > inode->i_size) i_size_write(inode, last_pos); folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); return copied; } /* * Provides ramfs-style behavior: data in the pagecache, but no writeback. */ const struct address_space_operations ram_aops = { .read_folio = simple_read_folio, .write_begin = simple_write_begin, .write_end = simple_write_end, .dirty_folio = noop_dirty_folio, }; EXPORT_SYMBOL(ram_aops); /* * the inodes created here are not hashed. If you use iunique to generate * unique inode values later for this filesystem, then you must take care * to pass it an appropriate max_reserved value to avoid collisions. 
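/*
 * [Editor's sketch, not part of libfs.c] Regular-file wiring for a
 * ramfs-style filesystem: data lives only in the page cache via ram_aops and
 * the generic iter paths, and nothing is ever written back. The examplefs_*
 * names are hypothetical; marking the mapping unevictable mirrors ramfs.
 */
static const struct file_operations examplefs_file_operations = {
	.read_iter	= generic_file_read_iter,
	.write_iter	= generic_file_write_iter,
	.mmap		= generic_file_mmap,
	.splice_read	= filemap_splice_read,
	.llseek		= generic_file_llseek,
	.fsync		= noop_fsync,
};

static void examplefs_init_file_inode(struct inode *inode)
{
	inode->i_mapping->a_ops = &ram_aops;
	mapping_set_unevictable(inode->i_mapping);
	inode->i_fop = &examplefs_file_operations;
}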
*/ int simple_fill_super(struct super_block *s, unsigned long magic, const struct tree_descr *files) { struct inode *inode; struct dentry *dentry; int i; s->s_blocksize = PAGE_SIZE; s->s_blocksize_bits = PAGE_SHIFT; s->s_magic = magic; s->s_op = &simple_super_operations; s->s_time_gran = 1; inode = new_inode(s); if (!inode) return -ENOMEM; /* * because the root inode is 1, the files array must not contain an * entry at index 1 */ inode->i_ino = 1; inode->i_mode = S_IFDIR | 0755; simple_inode_init_ts(inode); inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; set_nlink(inode, 2); s->s_root = d_make_root(inode); if (!s->s_root) return -ENOMEM; for (i = 0; !files->name || files->name[0]; i++, files++) { if (!files->name) continue; /* warn if it tries to conflict with the root inode */ if (unlikely(i == 1)) printk(KERN_WARNING "%s: %s passed in a files array" "with an index of 1!\n", __func__, s->s_type->name); dentry = d_alloc_name(s->s_root, files->name); if (!dentry) return -ENOMEM; inode = new_inode(s); if (!inode) { dput(dentry); return -ENOMEM; } inode->i_mode = S_IFREG | files->mode; simple_inode_init_ts(inode); inode->i_fop = files->ops; inode->i_ino = i; d_make_persistent(dentry, inode); dput(dentry); } return 0; } EXPORT_SYMBOL(simple_fill_super); static DEFINE_SPINLOCK(pin_fs_lock); int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count) { struct vfsmount *mnt = NULL; spin_lock(&pin_fs_lock); if (unlikely(!*mount)) { spin_unlock(&pin_fs_lock); mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL); if (IS_ERR(mnt)) return PTR_ERR(mnt); spin_lock(&pin_fs_lock); if (!*mount) *mount = mnt; } mntget(*mount); ++*count; spin_unlock(&pin_fs_lock); mntput(mnt); return 0; } EXPORT_SYMBOL(simple_pin_fs); void simple_release_fs(struct vfsmount **mount, int *count) { struct vfsmount *mnt; spin_lock(&pin_fs_lock); mnt = *mount; if (!--*count) *mount = NULL; spin_unlock(&pin_fs_lock); mntput(mnt); } EXPORT_SYMBOL(simple_release_fs); /** * simple_read_from_buffer - copy data from the buffer to user space * @to: the user space buffer to read to * @count: the maximum number of bytes to read * @ppos: the current position in the buffer * @from: the buffer to read from * @available: the size of the buffer * * The simple_read_from_buffer() function reads up to @count bytes from the * buffer @from at offset @ppos into the user space address starting at @to. * * On success, the number of bytes read is returned and the offset @ppos is * advanced by this number, or negative value is returned on error. **/ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, const void *from, size_t available) { loff_t pos = *ppos; size_t ret; if (pos < 0) return -EINVAL; if (pos >= available || !count) return 0; if (count > available - pos) count = available - pos; ret = copy_to_user(to, from + pos, count); if (ret == count) return -EFAULT; count -= ret; *ppos = pos + count; return count; } EXPORT_SYMBOL(simple_read_from_buffer); /** * simple_write_to_buffer - copy data from user space to the buffer * @to: the buffer to write to * @available: the size of the buffer * @ppos: the current position in the buffer * @from: the user space buffer to read from * @count: the maximum number of bytes to read * * The simple_write_to_buffer() function reads up to @count bytes from the user * space address starting at @from into the buffer @to at offset @ppos. 
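/*
 * [Editor's sketch, not part of libfs.c] The classic read() handler built on
 * simple_read_from_buffer(); the message contents and the examplefs_* name
 * are hypothetical.
 */
static ssize_t examplefs_status_read(struct file *file, char __user *buf,
				     size_t count, loff_t *ppos)
{
	char msg[32];
	int len = scnprintf(msg, sizeof(msg), "status: %d\n", 1);

	return simple_read_from_buffer(buf, count, ppos, msg, len);
}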
* * On success, the number of bytes written is returned and the offset @ppos is * advanced by this number, or negative value is returned on error. **/ ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count) { loff_t pos = *ppos; size_t res; if (pos < 0) return -EINVAL; if (pos >= available || !count) return 0; if (count > available - pos) count = available - pos; res = copy_from_user(to + pos, from, count); if (res == count) return -EFAULT; count -= res; *ppos = pos + count; return count; } EXPORT_SYMBOL(simple_write_to_buffer); /** * memory_read_from_buffer - copy data from the buffer * @to: the kernel space buffer to read to * @count: the maximum number of bytes to read * @ppos: the current position in the buffer * @from: the buffer to read from * @available: the size of the buffer * * The memory_read_from_buffer() function reads up to @count bytes from the * buffer @from at offset @ppos into the kernel space address starting at @to. * * On success, the number of bytes read is returned and the offset @ppos is * advanced by this number, or negative value is returned on error. **/ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, const void *from, size_t available) { loff_t pos = *ppos; if (pos < 0) return -EINVAL; if (pos >= available) return 0; if (count > available - pos) count = available - pos; memcpy(to, from + pos, count); *ppos = pos + count; return count; } EXPORT_SYMBOL(memory_read_from_buffer); /* * Transaction based IO. * The file expects a single write which triggers the transaction, and then * possibly a read which collects the result - which is stored in a * file-local buffer. */ void simple_transaction_set(struct file *file, size_t n) { struct simple_transaction_argresp *ar = file->private_data; BUG_ON(n > SIMPLE_TRANSACTION_LIMIT); /* * The barrier ensures that ar->size will really remain zero until * ar->data is ready for reading. 
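/*
 * [Editor's sketch, not part of libfs.c] The matching write() side using
 * simple_write_to_buffer(); the fixed 64-byte command buffer and the
 * examplefs_* name are hypothetical, and parsing of the command is left out.
 */
static ssize_t examplefs_cmd_write(struct file *file, const char __user *buf,
				   size_t count, loff_t *ppos)
{
	char cmd[64] = {};
	ssize_t ret;

	ret = simple_write_to_buffer(cmd, sizeof(cmd) - 1, ppos, buf, count);
	if (ret < 0)
		return ret;
	/* act on the NUL-terminated command in cmd[] here */
	return ret;
}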
*/ smp_mb(); ar->size = n; } EXPORT_SYMBOL(simple_transaction_set); char *simple_transaction_get(struct file *file, const char __user *buf, size_t size) { struct simple_transaction_argresp *ar; static DEFINE_SPINLOCK(simple_transaction_lock); if (size > SIMPLE_TRANSACTION_LIMIT - 1) return ERR_PTR(-EFBIG); ar = (struct simple_transaction_argresp *)get_zeroed_page(GFP_KERNEL); if (!ar) return ERR_PTR(-ENOMEM); spin_lock(&simple_transaction_lock); /* only one write allowed per open */ if (file->private_data) { spin_unlock(&simple_transaction_lock); free_page((unsigned long)ar); return ERR_PTR(-EBUSY); } file->private_data = ar; spin_unlock(&simple_transaction_lock); if (copy_from_user(ar->data, buf, size)) return ERR_PTR(-EFAULT); return ar->data; } EXPORT_SYMBOL(simple_transaction_get); ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) { struct simple_transaction_argresp *ar = file->private_data; if (!ar) return 0; return simple_read_from_buffer(buf, size, pos, ar->data, ar->size); } EXPORT_SYMBOL(simple_transaction_read); int simple_transaction_release(struct inode *inode, struct file *file) { free_page((unsigned long)file->private_data); return 0; } EXPORT_SYMBOL(simple_transaction_release); /* Simple attribute files */ struct simple_attr { int (*get)(void *, u64 *); int (*set)(void *, u64); char get_buf[24]; /* enough to store a u64 and "\n\0" */ char set_buf[24]; void *data; const char *fmt; /* format for read operation */ struct mutex mutex; /* protects access to these buffers */ }; /* simple_attr_open is called by an actual attribute open file operation * to set the attribute specific access operations. */ int simple_attr_open(struct inode *inode, struct file *file, int (*get)(void *, u64 *), int (*set)(void *, u64), const char *fmt) { struct simple_attr *attr; attr = kzalloc_obj(*attr); if (!attr) return -ENOMEM; attr->get = get; attr->set = set; attr->data = inode->i_private; attr->fmt = fmt; mutex_init(&attr->mutex); file->private_data = attr; return nonseekable_open(inode, file); } EXPORT_SYMBOL_GPL(simple_attr_open); int simple_attr_release(struct inode *inode, struct file *file) { kfree(file->private_data); return 0; } EXPORT_SYMBOL_GPL(simple_attr_release); /* GPL-only? This? Really? 
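/*
 * [Editor's sketch, not part of libfs.c] The transaction-file pattern as used
 * by nfsctl-style files: the single write stores the request, the handler
 * overwrites the buffer with the reply, and a later read() returns it via
 * simple_transaction_read(); ->release is simple_transaction_release().
 * examplefs_handle_request() is hypothetical.
 */
static ssize_t examplefs_handle_request(char *data, size_t size)
{
	/* pretend we processed the request and produced a short reply */
	return scnprintf(data, SIMPLE_TRANSACTION_LIMIT, "ok\n");
}

static ssize_t examplefs_ctl_write(struct file *file, const char __user *buf,
				   size_t size, loff_t *pos)
{
	char *data = simple_transaction_get(file, buf, size);
	ssize_t rv;

	if (IS_ERR(data))
		return PTR_ERR(data);
	rv = examplefs_handle_request(data, size);
	if (rv < 0)
		return rv;
	simple_transaction_set(file, rv);
	return size;
}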
*/ /* read from the buffer that is filled with the get function */ ssize_t simple_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { struct simple_attr *attr; size_t size; ssize_t ret; attr = file->private_data; if (!attr->get) return -EACCES; ret = mutex_lock_interruptible(&attr->mutex); if (ret) return ret; if (*ppos && attr->get_buf[0]) { /* continued read */ size = strlen(attr->get_buf); } else { /* first read */ u64 val; ret = attr->get(attr->data, &val); if (ret) goto out; size = scnprintf(attr->get_buf, sizeof(attr->get_buf), attr->fmt, (unsigned long long)val); } ret = simple_read_from_buffer(buf, len, ppos, attr->get_buf, size); out: mutex_unlock(&attr->mutex); return ret; } EXPORT_SYMBOL_GPL(simple_attr_read); /* interpret the buffer as a number to call the set function with */ static ssize_t simple_attr_write_xsigned(struct file *file, const char __user *buf, size_t len, loff_t *ppos, bool is_signed) { struct simple_attr *attr; unsigned long long val; size_t size; ssize_t ret; attr = file->private_data; if (!attr->set) return -EACCES; ret = mutex_lock_interruptible(&attr->mutex); if (ret) return ret; ret = -EFAULT; size = min(sizeof(attr->set_buf) - 1, len); if (copy_from_user(attr->set_buf, buf, size)) goto out; attr->set_buf[size] = '\0'; if (is_signed) ret = kstrtoll(attr->set_buf, 0, &val); else ret = kstrtoull(attr->set_buf, 0, &val); if (ret) goto out; ret = attr->set(attr->data, val); if (ret == 0) ret = len; /* on success, claim we got the whole input */ out: mutex_unlock(&attr->mutex); return ret; } ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { return simple_attr_write_xsigned(file, buf, len, ppos, false); } EXPORT_SYMBOL_GPL(simple_attr_write); ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { return simple_attr_write_xsigned(file, buf, len, ppos, true); } EXPORT_SYMBOL_GPL(simple_attr_write_signed); /** * generic_encode_ino32_fh - generic export_operations->encode_fh function * @inode: the object to encode * @fh: where to store the file handle fragment * @max_len: maximum length to store there (in 4 byte units) * @parent: parent directory inode, if wanted * * This generic encode_fh function assumes that the 32 inode number * is suitable for locating an inode, and that the generation number * can be used to check that it is still valid. It places them in the * filehandle fragment where export_decode_fh expects to find them. 
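/*
 * [Editor's sketch, not part of libfs.c] Most users reach simple_attr_open()
 * through the DEFINE_SIMPLE_ATTRIBUTE() macro from <linux/fs.h>, which
 * generates the file_operations around a get/set pair. example_val and the
 * generated example_val_fops are hypothetical names.
 */
static u64 example_val;

static int example_val_get(void *data, u64 *val)
{
	*val = example_val;
	return 0;
}

static int example_val_set(void *data, u64 val)
{
	example_val = val;
	return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(example_val_fops, example_val_get, example_val_set,
			"%llu\n");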
*/ int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len, struct inode *parent) { struct fid *fid = (void *)fh; int len = *max_len; int type = FILEID_INO32_GEN; if (parent && (len < 4)) { *max_len = 4; return FILEID_INVALID; } else if (len < 2) { *max_len = 2; return FILEID_INVALID; } len = 2; fid->i32.ino = inode->i_ino; fid->i32.gen = inode->i_generation; if (parent) { fid->i32.parent_ino = parent->i_ino; fid->i32.parent_gen = parent->i_generation; len = 4; type = FILEID_INO32_GEN_PARENT; } *max_len = len; return type; } EXPORT_SYMBOL_GPL(generic_encode_ino32_fh); /** * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation * @sb: filesystem to do the file handle conversion on * @fid: file handle to convert * @fh_len: length of the file handle in bytes * @fh_type: type of file handle * @get_inode: filesystem callback to retrieve inode * * This function decodes @fid as long as it has one of the well-known * Linux filehandle types and calls @get_inode on it to retrieve the * inode for the object specified in the file handle. */ struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type, struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)) { struct inode *inode = NULL; if (fh_len < 2) return NULL; switch (fh_type) { case FILEID_INO32_GEN: case FILEID_INO32_GEN_PARENT: inode = get_inode(sb, fid->i32.ino, fid->i32.gen); break; } return d_obtain_alias(inode); } EXPORT_SYMBOL_GPL(generic_fh_to_dentry); /** * generic_fh_to_parent - generic helper for the fh_to_parent export operation * @sb: filesystem to do the file handle conversion on * @fid: file handle to convert * @fh_len: length of the file handle in bytes * @fh_type: type of file handle * @get_inode: filesystem callback to retrieve inode * * This function decodes @fid as long as it has one of the well-known * Linux filehandle types and calls @get_inode on it to retrieve the * inode for the _parent_ object specified in the file handle if it * is specified in the file handle, or NULL otherwise. */ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type, struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)) { struct inode *inode = NULL; if (fh_len <= 2) return NULL; switch (fh_type) { case FILEID_INO32_GEN_PARENT: inode = get_inode(sb, fid->i32.parent_ino, (fh_len > 3 ? fid->i32.parent_gen : 0)); break; } return d_obtain_alias(inode); } EXPORT_SYMBOL_GPL(generic_fh_to_parent); /** * simple_fsync_noflush - generic fsync implementation for simple filesystems * * @file: file to synchronize * @start: start offset in bytes * @end: end offset in bytes (inclusive) * @datasync: only synchronize essential metadata if true * * This function is an fsync handler for simple filesystems. It writes out * dirty data, inode (if dirty), but does not issue a cache flush. 
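/*
 * [Editor's sketch, not part of libfs.c] Export (NFS file handle) wiring for
 * a filesystem whose inodes can be found by (inode number, generation). The
 * examplefs_* names are hypothetical and examplefs_nfs_get_inode() is only a
 * placeholder; a real one would look the inode up and verify i_generation.
 */
static struct inode *examplefs_nfs_get_inode(struct super_block *sb,
					     u64 ino, u32 gen)
{
	return NULL;	/* placeholder: iget plus generation check goes here */
}

static struct dentry *examplefs_fh_to_dentry(struct super_block *sb,
					     struct fid *fid,
					     int fh_len, int fh_type)
{
	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
				    examplefs_nfs_get_inode);
}

static const struct export_operations examplefs_export_ops = {
	.encode_fh	= generic_encode_ino32_fh,
	.fh_to_dentry	= examplefs_fh_to_dentry,
};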
*/ int simple_fsync_noflush(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; int err; int ret = 0; err = file_write_and_wait_range(file, start, end); if (err) return err; if (!(inode_state_read_once(inode) & I_DIRTY_ALL)) goto out; if (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC)) goto out; ret = sync_inode_metadata(inode, 1); out: /* check and advance again to catch errors after syncing out buffers */ err = file_check_and_advance_wb_err(file); if (ret == 0) ret = err; return ret; } EXPORT_SYMBOL(simple_fsync_noflush); /** * simple_fsync - fsync implementation for simple filesystems with flush * @file: file to synchronize * @start: start offset in bytes * @end: end offset in bytes (inclusive) * @datasync: only synchronize essential metadata if true * * This function is an fsync handler for simple filesystems. It writes out * dirty data, inode (if dirty), and issues a cache flush. */ int simple_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; int err; err = simple_fsync_noflush(file, start, end, datasync); if (err) return err; return blkdev_issue_flush(inode->i_sb->s_bdev); } EXPORT_SYMBOL(simple_fsync); /** * generic_check_addressable - Check addressability of file system * @blocksize_bits: log of file system block size * @num_blocks: number of blocks in file system * * Determine whether a file system with @num_blocks blocks (and a * block size of 2**@blocksize_bits) is addressable by the sector_t * and page cache of the system. Return 0 if so and -EFBIG otherwise. */ int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks) { u64 last_fs_block = num_blocks - 1; u64 last_fs_page, max_bytes; if (check_shl_overflow(num_blocks, blocksize_bits, &max_bytes)) return -EFBIG; last_fs_page = (max_bytes >> PAGE_SHIFT) - 1; if (unlikely(num_blocks == 0)) return 0; if (blocksize_bits < 9) return -EINVAL; if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) || (last_fs_page > (pgoff_t)(~0ULL))) { return -EFBIG; } return 0; } EXPORT_SYMBOL(generic_check_addressable); /* * No-op implementation of ->fsync for in-memory filesystems. */ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) { return 0; } EXPORT_SYMBOL(noop_fsync); ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { /* * iomap based filesystems support direct I/O without need for * this callback. However, it still needs to be set in * inode->a_ops so that open/fcntl know that direct I/O is * generally supported. */ return -EINVAL; } EXPORT_SYMBOL_GPL(noop_direct_IO); /* Because kfree isn't assignment-compatible with void(void*) ;-/ */ void kfree_link(void *p) { kfree(p); } EXPORT_SYMBOL(kfree_link); struct inode *alloc_anon_inode(struct super_block *s) { static const struct address_space_operations anon_aops = { .dirty_folio = noop_dirty_folio, }; struct inode *inode = new_inode_pseudo(s); if (!inode) return ERR_PTR(-ENOMEM); inode->i_ino = get_next_ino(); inode->i_mapping->a_ops = &anon_aops; /* * Mark the inode dirty from the very beginning, * that way it will never be moved to the dirty * list because mark_inode_dirty() will think * that it already _is_ on the dirty list. */ inode_state_assign_raw(inode, I_DIRTY); /* * Historically anonymous inodes don't have a type at all and * userspace has come to rely on this. 
*/ inode->i_mode = S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_flags |= S_PRIVATE | S_ANON_INODE; simple_inode_init_ts(inode); return inode; } EXPORT_SYMBOL(alloc_anon_inode); /** * simple_get_link - generic helper to get the target of "fast" symlinks * @dentry: not used here * @inode: the symlink inode * @done: not used here * * Generic helper for filesystems to use for symlink inodes where a pointer to * the symlink target is stored in ->i_link. NOTE: this isn't normally called, * since as an optimization the path lookup code uses any non-NULL ->i_link * directly, without calling ->get_link(). But ->get_link() still must be set, * to mark the inode_operations as being for a symlink. * * Return: the symlink target */ const char *simple_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { return inode->i_link; } EXPORT_SYMBOL(simple_get_link); const struct inode_operations simple_symlink_inode_operations = { .get_link = simple_get_link, }; EXPORT_SYMBOL(simple_symlink_inode_operations); /* * Operations for a permanently empty directory. */ static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return ERR_PTR(-ENOENT); } static int empty_dir_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { return -EPERM; } static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t size) { return -EOPNOTSUPP; } static const struct inode_operations empty_dir_inode_operations = { .lookup = empty_dir_lookup, .setattr = empty_dir_setattr, .listxattr = empty_dir_listxattr, }; static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence) { /* An empty directory has two entries . and .. at offsets 0 and 1 */ return generic_file_llseek_size(file, offset, whence, 2, 2); } static int empty_dir_readdir(struct file *file, struct dir_context *ctx) { dir_emit_dots(file, ctx); return 0; } static const struct file_operations empty_dir_operations = { .llseek = empty_dir_llseek, .read = generic_read_dir, .iterate_shared = empty_dir_readdir, .fsync = noop_fsync, }; void make_empty_dir_inode(struct inode *inode) { set_nlink(inode, 2); inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; inode->i_rdev = 0; inode->i_size = 0; inode->i_blkbits = PAGE_SHIFT; inode->i_blocks = 0; inode->i_op = &empty_dir_inode_operations; inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &empty_dir_operations; } bool is_empty_dir_inode(struct inode *inode) { return (inode->i_fop == &empty_dir_operations) && (inode->i_op == &empty_dir_inode_operations); } #if IS_ENABLED(CONFIG_UNICODE) /** * generic_ci_d_compare - generic d_compare implementation for casefolding filesystems * @dentry: dentry whose name we are checking against * @len: len of name of dentry * @str: str pointer to name of dentry * @name: Name to compare against * * Return: 0 if names match, 1 if mismatch, or -ERRNO */ int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { const struct dentry *parent; const struct inode *dir; union shortname_store strbuf; struct qstr qstr; /* * Attempt a case-sensitive match first. It is cheaper and * should cover most lookups, including all the sane * applications that expect a case-sensitive filesystem. * * This comparison is safe under RCU because the caller * guarantees the consistency between str and len. See * __d_lookup_rcu_op_compare() for details. 
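/*
 * [Editor's sketch, not part of libfs.c] Creating a "fast" symlink: the
 * target string is stashed in ->i_link so path walking never needs to call
 * ->get_link(), and the inode uses simple_symlink_inode_operations.
 * examplefs_init_symlink_inode() is a hypothetical helper.
 */
static int examplefs_init_symlink_inode(struct inode *inode, const char *target)
{
	inode->i_link = kstrdup(target, GFP_KERNEL);
	if (!inode->i_link)
		return -ENOMEM;
	inode->i_op = &simple_symlink_inode_operations;
	inode->i_size = strlen(target);
	return 0;
}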
*/ if (len == name->len && !memcmp(str, name->name, len)) return 0; parent = READ_ONCE(dentry->d_parent); dir = READ_ONCE(parent->d_inode); if (!dir || !IS_CASEFOLDED(dir)) return 1; qstr.len = len; qstr.name = str; /* * If the dentry name is stored in-line, then it may be concurrently * modified by a rename. If this happens, the VFS will eventually retry * the lookup, so it doesn't matter what ->d_compare() returns. * However, it's unsafe to call utf8_strncasecmp() with an unstable * string. Therefore, we have to copy the name into a temporary buffer. * As above, len is guaranteed to match str, so the shortname case * is exactly when str points to ->d_shortname. */ if (qstr.name == dentry->d_shortname.string) { strbuf = dentry->d_shortname; // NUL is guaranteed to be in there qstr.name = strbuf.string; /* prevent compiler from optimizing out the temporary buffer */ barrier(); } return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr); } EXPORT_SYMBOL(generic_ci_d_compare); /** * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems * @dentry: dentry of the parent directory * @str: qstr of name whose hash we should fill in * * Return: 0 if hash was successful or unchanged, and -EINVAL on error */ int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) { const struct inode *dir = READ_ONCE(dentry->d_inode); struct super_block *sb = dentry->d_sb; const struct unicode_map *um = sb->s_encoding; int ret; if (!dir || !IS_CASEFOLDED(dir)) return 0; ret = utf8_casefold_hash(um, dentry, str); if (ret < 0 && sb_has_strict_encoding(sb)) return -EINVAL; return 0; } EXPORT_SYMBOL(generic_ci_d_hash); static const struct dentry_operations generic_ci_dentry_ops = { .d_hash = generic_ci_d_hash, .d_compare = generic_ci_d_compare, #ifdef CONFIG_FS_ENCRYPTION .d_revalidate = fscrypt_d_revalidate, #endif }; /** * generic_ci_match() - Match a name (case-insensitively) with a dirent. * This is a filesystem helper for comparison with directory entries. * generic_ci_d_compare should be used in VFS' ->d_compare instead. * * @parent: Inode of the parent of the dirent under comparison * @name: name under lookup. * @folded_name: Optional pre-folded name under lookup * @de_name: Dirent name. * @de_name_len: dirent name length. * * Test whether a case-insensitive directory entry matches the filename * being searched. If @folded_name is provided, it is used instead of * recalculating the casefold of @name. * * Return: > 0 if the directory entry matches, 0 if it doesn't match, or * < 0 on error. */ int generic_ci_match(const struct inode *parent, const struct qstr *name, const struct qstr *folded_name, const u8 *de_name, u32 de_name_len) { const struct super_block *sb = parent->i_sb; const struct unicode_map *um = sb->s_encoding; struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len); struct qstr dirent = QSTR_INIT(de_name, de_name_len); int res = 0; if (IS_ENCRYPTED(parent)) { const struct fscrypt_str encrypted_name = FSTR_INIT((u8 *) de_name, de_name_len); if (WARN_ON_ONCE(!fscrypt_has_encryption_key(parent))) return -EINVAL; decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL); if (!decrypted_name.name) return -ENOMEM; res = fscrypt_fname_disk_to_usr(parent, 0, 0, &encrypted_name, &decrypted_name); if (res < 0) { kfree(decrypted_name.name); return res; } dirent.name = decrypted_name.name; dirent.len = decrypted_name.len; } /* * Attempt a case-sensitive match first. 
It is cheaper and * should cover most lookups, including all the sane * applications that expect a case-sensitive filesystem. */ if (dirent.len == name->len && !memcmp(name->name, dirent.name, dirent.len)) goto out; if (folded_name->name) res = utf8_strncasecmp_folded(um, folded_name, &dirent); else res = utf8_strncasecmp(um, name, &dirent); out: kfree(decrypted_name.name); if (res < 0 && sb_has_strict_encoding(sb)) { pr_err_ratelimited("Directory contains filename that is invalid UTF-8"); return 0; } return !res; } EXPORT_SYMBOL(generic_ci_match); #endif #ifdef CONFIG_FS_ENCRYPTION static const struct dentry_operations generic_encrypted_dentry_ops = { .d_revalidate = fscrypt_d_revalidate, }; #endif /** * generic_set_sb_d_ops - helper for choosing the set of * filesystem-wide dentry operations for the enabled features * @sb: superblock to be configured * * Filesystems supporting casefolding and/or fscrypt can call this * helper at mount-time to configure default dentry_operations to the * best set of dentry operations required for the enabled features. * The helper must be called after these have been configured, but * before the root dentry is created. */ void generic_set_sb_d_ops(struct super_block *sb) { #if IS_ENABLED(CONFIG_UNICODE) if (sb->s_encoding) { set_default_d_op(sb, &generic_ci_dentry_ops); return; } #endif #ifdef CONFIG_FS_ENCRYPTION if (sb->s_cop) { set_default_d_op(sb, &generic_encrypted_dentry_ops); return; } #endif } EXPORT_SYMBOL(generic_set_sb_d_ops); /** * inode_maybe_inc_iversion - increments i_version * @inode: inode with the i_version that should be updated * @force: increment the counter even if it's not necessary? * * Every time the inode is modified, the i_version field must be seen to have * changed by any observer. * * If "force" is set or the QUERIED flag is set, then ensure that we increment * the value, and clear the queried flag. * * In the common case where neither is set, then we can return "false" without * updating i_version. * * If this function returns false, and no other metadata has changed, then we * can avoid logging the metadata. */ bool inode_maybe_inc_iversion(struct inode *inode, bool force) { u64 cur, new; /* * The i_version field is not strictly ordered with any other inode * information, but the legacy inode_inc_iversion code used a spinlock * to serialize increments. * * We add a full memory barrier to ensure that any de facto ordering * with other state is preserved (either implicitly coming from cmpxchg * or explicitly from smp_mb if we don't know upfront if we will execute * the former). * * These barriers pair with inode_query_iversion(). */ cur = inode_peek_iversion_raw(inode); if (!force && !(cur & I_VERSION_QUERIED)) { smp_mb(); cur = inode_peek_iversion_raw(inode); } do { /* If flag is clear then we needn't do anything */ if (!force && !(cur & I_VERSION_QUERIED)) return false; /* Since lowest bit is flag, add 2 to avoid it */ new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); return true; } EXPORT_SYMBOL(inode_maybe_inc_iversion); /** * inode_query_iversion - read i_version for later use * @inode: inode from which i_version should be read * * Read the inode i_version counter. This should be used by callers that wish * to store the returned i_version for later comparison. This will guarantee * that a later query of the i_version will result in a different value if * anything has changed. 
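/*
 * [Editor's sketch, not part of libfs.c] The usage pattern the
 * inode_maybe_inc_iversion() comment describes: when file data changes but no
 * other metadata does, only dirty the inode (and pay the logging cost) if
 * some observer actually queried i_version. examplefs_* is hypothetical.
 */
static void examplefs_data_changed(struct inode *inode)
{
	if (inode_maybe_inc_iversion(inode, false))
		mark_inode_dirty_sync(inode);
}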
* * In this implementation, we fetch the current value, set the QUERIED flag and * then try to swap it into place with a cmpxchg, if it wasn't already set. If * that fails, we try again with the newly fetched value from the cmpxchg. */ u64 inode_query_iversion(struct inode *inode) { u64 cur, new; bool fenced = false; /* * Memory barriers (implicit in cmpxchg, explicit in smp_mb) pair with * inode_maybe_inc_iversion(), see that routine for more details. */ cur = inode_peek_iversion_raw(inode); do { /* If flag is already set, then no need to swap */ if (cur & I_VERSION_QUERIED) { if (!fenced) smp_mb(); break; } fenced = true; new = cur | I_VERSION_QUERIED; } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); return cur >> I_VERSION_QUERIED_SHIFT; } EXPORT_SYMBOL(inode_query_iversion); ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter, ssize_t direct_written, ssize_t buffered_written) { struct address_space *mapping = iocb->ki_filp->f_mapping; loff_t pos = iocb->ki_pos - buffered_written; loff_t end = iocb->ki_pos - 1; int err; /* * If the buffered write fallback returned an error, we want to return * the number of bytes which were written by direct I/O, or the error * code if that was zero. * * Note that this differs from normal direct-io semantics, which will * return -EFOO even if some bytes were written. */ if (unlikely(buffered_written < 0)) { if (direct_written) return direct_written; return buffered_written; } /* * We need to ensure that the page cache pages are written to disk and * invalidated to preserve the expected O_DIRECT semantics. */ err = filemap_write_and_wait_range(mapping, pos, end); if (err < 0) { /* * We don't know how much we wrote, so just return the number of * bytes which were direct-written */ iocb->ki_pos -= buffered_written; if (direct_written) return direct_written; return err; } invalidate_mapping_pages(mapping, pos >> PAGE_SHIFT, end >> PAGE_SHIFT); return direct_written + buffered_written; } EXPORT_SYMBOL_GPL(direct_write_fallback); /** * simple_inode_init_ts - initialize the timestamps for a new inode * @inode: inode to be initialized * * When a new inode is created, most filesystems set the timestamps to the * current time. Add a helper to do this. */ struct timespec64 simple_inode_init_ts(struct inode *inode) { struct timespec64 ts = inode_set_ctime_current(inode); inode_set_atime_to_ts(inode, ts); inode_set_mtime_to_ts(inode, ts); return ts; } EXPORT_SYMBOL(simple_inode_init_ts); struct dentry *stashed_dentry_get(struct dentry **stashed) { struct dentry *dentry; guard(rcu)(); dentry = rcu_dereference(*stashed); if (!dentry) return NULL; if (IS_ERR(dentry)) return dentry; if (!lockref_get_not_dead(&dentry->d_lockref)) return NULL; return dentry; } static struct dentry *prepare_anon_dentry(struct dentry **stashed, struct super_block *sb, void *data) { struct dentry *dentry; struct inode *inode; const struct stashed_operations *sops = sb->s_fs_info; int ret; inode = new_inode_pseudo(sb); if (!inode) { sops->put_data(data); return ERR_PTR(-ENOMEM); } inode->i_flags |= S_IMMUTABLE; inode->i_mode = S_IFREG; simple_inode_init_ts(inode); ret = sops->init_inode(inode, data); if (ret < 0) { iput(inode); return ERR_PTR(ret); } /* Notice when this is changed. */ WARN_ON_ONCE(!S_ISREG(inode->i_mode)); dentry = d_alloc_anon(sb); if (!dentry) { iput(inode); return ERR_PTR(-ENOMEM); } /* Store address of location where dentry's supposed to be stashed. 
*/ dentry->d_fsdata = stashed; /* @data is now owned by the fs */ d_instantiate(dentry, inode); return dentry; } struct dentry *stash_dentry(struct dentry **stashed, struct dentry *dentry) { guard(rcu)(); for (;;) { struct dentry *old; /* Assume any old dentry was cleared out. */ old = cmpxchg(stashed, NULL, dentry); if (likely(!old)) return dentry; /* Check if somebody else installed a reusable dentry. */ if (lockref_get_not_dead(&old->d_lockref)) return old; /* There's an old dead dentry there, try to take it over. */ if (likely(try_cmpxchg(stashed, &old, dentry))) return dentry; } } /** * path_from_stashed - create path from stashed or new dentry * @stashed: where to retrieve or stash dentry * @mnt: mnt of the filesystems to use * @data: data to store in inode->i_private * @path: path to create * * The function tries to retrieve a stashed dentry from @stashed. If the dentry * is still valid then it will be reused. If the dentry isn't able the function * will allocate a new dentry and inode. It will then check again whether it * can reuse an existing dentry in case one has been added in the meantime or * update @stashed with the newly added dentry. * * Special-purpose helper for nsfs and pidfs. * * Return: On success zero and on failure a negative error is returned. */ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, struct path *path) { struct dentry *dentry, *res; const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info; /* See if dentry can be reused. */ res = stashed_dentry_get(stashed); if (IS_ERR(res)) return PTR_ERR(res); if (res) { sops->put_data(data); goto make_path; } /* Allocate a new dentry. */ dentry = prepare_anon_dentry(stashed, mnt->mnt_sb, data); if (IS_ERR(dentry)) return PTR_ERR(dentry); /* Added a new dentry. @data is now owned by the filesystem. */ if (sops->stash_dentry) res = sops->stash_dentry(stashed, dentry); else res = stash_dentry(stashed, dentry); if (IS_ERR(res)) { dput(dentry); return PTR_ERR(res); } if (res != dentry) dput(dentry); make_path: path->dentry = res; path->mnt = mntget(mnt); VFS_WARN_ON_ONCE(path->dentry->d_fsdata != stashed); VFS_WARN_ON_ONCE(d_inode(path->dentry)->i_private != data); return 0; } void stashed_dentry_prune(struct dentry *dentry) { struct dentry **stashed = dentry->d_fsdata; struct inode *inode = d_inode(dentry); if (WARN_ON_ONCE(!stashed)) return; if (!inode) return; /* * Only replace our own @dentry as someone else might've * already cleared out @dentry and stashed their own * dentry in there. */ cmpxchg(stashed, dentry, NULL); } /** * simple_start_creating - prepare to create a given name * @parent: directory in which to prepare to create the name * @name: the name to be created * * Required lock is taken and a lookup in performed prior to creating an * object in a directory. No permission checking is performed. * * Returns: a negative dentry on which vfs_create() or similar may * be attempted, or an error. */ struct dentry *simple_start_creating(struct dentry *parent, const char *name) { struct qstr qname = QSTR(name); int err; err = lookup_noperm_common(&qname, parent); if (err) return ERR_PTR(err); return start_dirop(parent, &qname, LOOKUP_CREATE | LOOKUP_EXCL); } EXPORT_SYMBOL(simple_start_creating); /* parent must have been held exclusive since simple_start_creating() */ void simple_done_creating(struct dentry *child) { end_creating(child); } EXPORT_SYMBOL(simple_done_creating); |
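/*
 * [Editor's sketch, not part of libfs.c] The callbacks a pidfs/nsfs-style
 * user of path_from_stashed() supplies via sb->s_fs_info. The examplefs_*
 * names are hypothetical; note that ->init_inode() is expected to take
 * ownership of @data and store it in ->i_private, matching the checks above.
 */
static int examplefs_init_inode(struct inode *inode, void *data)
{
	inode->i_private = data;
	inode->i_ino = get_next_ino();
	return 0;
}

static void examplefs_put_data(void *data)
{
	kfree(data);	/* assumes @data was kmalloc'ed by the caller */
}

static const struct stashed_operations examplefs_stashed_ops = {
	.init_inode	= examplefs_init_inode,
	.put_data	= examplefs_put_data,
};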
// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/fork.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* * 'fork.c' contains the help-routines for the 'fork' system call * (see also entry.S and others). * Fork is rather simple, once you get the hang of it, but the memory * management can be a bitch.
See 'mm/memory.c': 'copy_page_range()' */ #include <linux/anon_inodes.h> #include <linux/slab.h> #include <linux/sched/autogroup.h> #include <linux/sched/mm.h> #include <linux/sched/user.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/stat.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> #include <linux/sched/ext.h> #include <linux/seq_file.h> #include <linux/rtmutex.h> #include <linux/init.h> #include <linux/unistd.h> #include <linux/module.h> #include <linux/vmalloc.h> #include <linux/completion.h> #include <linux/personality.h> #include <linux/mempolicy.h> #include <linux/sem.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/iocontext.h> #include <linux/key.h> #include <linux/kmsan.h> #include <linux/binfmts.h> #include <linux/mman.h> #include <linux/mmu_notifier.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/memblock.h> #include <linux/nsproxy.h> #include <linux/ns/ns_common_types.h> #include <linux/capability.h> #include <linux/cpu.h> #include <linux/cgroup.h> #include <linux/security.h> #include <linux/hugetlb.h> #include <linux/seccomp.h> #include <linux/swap.h> #include <linux/syscalls.h> #include <linux/syscall_user_dispatch.h> #include <linux/jiffies.h> #include <linux/futex.h> #include <linux/compat.h> #include <linux/kthread.h> #include <linux/task_io_accounting_ops.h> #include <linux/rcupdate.h> #include <linux/ptrace.h> #include <linux/mount.h> #include <linux/audit.h> #include <linux/memcontrol.h> #include <linux/ftrace.h> #include <linux/proc_fs.h> #include <linux/profile.h> #include <linux/rmap.h> #include <linux/ksm.h> #include <linux/acct.h> #include <linux/userfaultfd_k.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/freezer.h> #include <linux/delayacct.h> #include <linux/taskstats_kern.h> #include <linux/tty.h> #include <linux/fs_struct.h> #include <linux/magic.h> #include <linux/perf_event.h> #include <linux/posix-timers.h> #include <linux/user-return-notifier.h> #include <linux/oom.h> #include <linux/khugepaged.h> #include <linux/signalfd.h> #include <linux/uprobes.h> #include <linux/aio.h> #include <linux/compiler.h> #include <linux/sysctl.h> #include <linux/kcov.h> #include <linux/livepatch.h> #include <linux/thread_info.h> #include <linux/kstack_erase.h> #include <linux/kasan.h> #include <linux/randomize_kstack.h> #include <linux/scs.h> #include <linux/io_uring.h> #include <linux/io_uring_types.h> #include <linux/bpf.h> #include <linux/stackprotector.h> #include <linux/user_events.h> #include <linux/iommu.h> #include <linux/rseq.h> #include <uapi/linux/pidfd.h> #include <linux/pidfs.h> #include <linux/tick.h> #include <linux/unwind_deferred.h> #include <linux/pgalloc.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> /* For dup_mmap(). */ #include "../mm/internal.h" #include <trace/events/sched.h> #define CREATE_TRACE_POINTS #include <trace/events/task.h> #include <kunit/visibility.h> /* * Minimum number of threads to boot the kernel */ #define MIN_THREADS 20 /* * Maximum number of threads */ #define MAX_THREADS FUTEX_TID_MASK /* * Protected counters by write_lock_irq(&tasklist_lock) */ unsigned long total_forks; /* Handle normal Linux uptimes. */ int nr_threads; /* The idle threads do not count.. 
*/ static int max_threads; /* tunable limit on nr_threads */ #define NAMED_ARRAY_INDEX(x) [x] = __stringify(x) static const char * const resident_page_types[] = { NAMED_ARRAY_INDEX(MM_FILEPAGES), NAMED_ARRAY_INDEX(MM_ANONPAGES), NAMED_ARRAY_INDEX(MM_SWAPENTS), NAMED_ARRAY_INDEX(MM_SHMEMPAGES), }; DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ #ifdef CONFIG_PROVE_RCU int lockdep_tasklist_lock_is_held(void) { return lockdep_is_held(&tasklist_lock); } EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held); #endif /* #ifdef CONFIG_PROVE_RCU */ int nr_processes(void) { int cpu; int total = 0; for_each_possible_cpu(cpu) total += per_cpu(process_counts, cpu); return total; } void __weak arch_release_task_struct(struct task_struct *tsk) { } static struct kmem_cache *task_struct_cachep; static inline struct task_struct *alloc_task_struct_node(int node) { return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); } static inline void free_task_struct(struct task_struct *tsk) { kmem_cache_free(task_struct_cachep, tsk); } #ifdef CONFIG_VMAP_STACK /* * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB * flush. Try to minimize the number of calls by caching stacks. */ #define NR_CACHED_STACKS 2 static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); /* * Allocated stacks are cached and later reused by new threads, so memcg * accounting is performed by the code assigning/releasing stacks to tasks. * We need a zeroed memory without __GFP_ACCOUNT. */ #define GFP_VMAP_STACK (GFP_KERNEL | __GFP_ZERO) struct vm_stack { struct rcu_head rcu; struct vm_struct *stack_vm_area; }; static struct vm_struct *alloc_thread_stack_node_from_cache(struct task_struct *tsk, int node) { struct vm_struct *vm_area; unsigned int i; /* * If the node has memory, we are guaranteed the stacks are backed by local pages. * Otherwise the pages are arbitrary. * * Note that depending on cpuset it is possible we will get migrated to a different * node immediately after allocating here, so this does *not* guarantee locality for * arbitrary callers. */ scoped_guard(preempt) { if (node != NUMA_NO_NODE && numa_node_id() != node) return NULL; for (i = 0; i < NR_CACHED_STACKS; i++) { vm_area = this_cpu_xchg(cached_stacks[i], NULL); if (vm_area) return vm_area; } } return NULL; } static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area) { unsigned int i; int nid; /* * Don't cache stacks if any of the pages don't match the local domain, unless * there is no local memory to begin with. * * Note that lack of local memory does not automatically mean it makes no difference * performance-wise which other domain backs the stack. In this case we are merely * trying to avoid constantly going to vmalloc. 
*/ scoped_guard(preempt) { nid = numa_node_id(); if (node_state(nid, N_MEMORY)) { for (i = 0; i < vm_area->nr_pages; i++) { struct page *page = vm_area->pages[i]; if (page_to_nid(page) != nid) return false; } } for (i = 0; i < NR_CACHED_STACKS; i++) { struct vm_struct *tmp = NULL; if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area)) return true; } } return false; } static void thread_stack_free_rcu(struct rcu_head *rh) { struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu); struct vm_struct *vm_area = vm_stack->stack_vm_area; if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area)) return; vfree(vm_area->addr); } static void thread_stack_delayed_free(struct task_struct *tsk) { struct vm_stack *vm_stack = tsk->stack; vm_stack->stack_vm_area = tsk->stack_vm_area; call_rcu(&vm_stack->rcu, thread_stack_free_rcu); } static int free_vm_stack_cache(unsigned int cpu) { struct vm_struct **cached_vm_stack_areas = per_cpu_ptr(cached_stacks, cpu); int i; for (i = 0; i < NR_CACHED_STACKS; i++) { struct vm_struct *vm_area = cached_vm_stack_areas[i]; if (!vm_area) continue; vfree(vm_area->addr); cached_vm_stack_areas[i] = NULL; } return 0; } static int memcg_charge_kernel_stack(struct vm_struct *vm_area) { int i; int ret; int nr_charged = 0; BUG_ON(vm_area->nr_pages != THREAD_SIZE / PAGE_SIZE); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { ret = memcg_kmem_charge_page(vm_area->pages[i], GFP_KERNEL, 0); if (ret) goto err; nr_charged++; } return 0; err: for (i = 0; i < nr_charged; i++) memcg_kmem_uncharge_page(vm_area->pages[i], 0); return ret; } static int alloc_thread_stack_node(struct task_struct *tsk, int node) { struct vm_struct *vm_area; void *stack; vm_area = alloc_thread_stack_node_from_cache(tsk, node); if (vm_area) { if (memcg_charge_kernel_stack(vm_area)) { vfree(vm_area->addr); return -ENOMEM; } /* Reset stack metadata. */ kasan_unpoison_range(vm_area->addr, THREAD_SIZE); stack = kasan_reset_tag(vm_area->addr); /* Clear stale pointers from reused stack. */ clear_pages(vm_area->addr, vm_area->nr_pages); tsk->stack_vm_area = vm_area; tsk->stack = stack; return 0; } stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, GFP_VMAP_STACK, node, __builtin_return_address(0)); if (!stack) return -ENOMEM; vm_area = find_vm_area(stack); if (memcg_charge_kernel_stack(vm_area)) { vfree(stack); return -ENOMEM; } /* * We can't call find_vm_area() in interrupt context, and * free_thread_stack() can be called in interrupt context, * so cache the vm_struct. */ tsk->stack_vm_area = vm_area; stack = kasan_reset_tag(stack); tsk->stack = stack; return 0; } static void free_thread_stack(struct task_struct *tsk) { if (!try_release_thread_stack_to_cache(tsk->stack_vm_area)) thread_stack_delayed_free(tsk); tsk->stack = NULL; tsk->stack_vm_area = NULL; } #else /* !CONFIG_VMAP_STACK */ /* * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a * kmemcache based allocator. 
*/ #if THREAD_SIZE >= PAGE_SIZE static void thread_stack_free_rcu(struct rcu_head *rh) { __free_pages(virt_to_page(rh), THREAD_SIZE_ORDER); } static void thread_stack_delayed_free(struct task_struct *tsk) { struct rcu_head *rh = tsk->stack; call_rcu(rh, thread_stack_free_rcu); } static int alloc_thread_stack_node(struct task_struct *tsk, int node) { struct page *page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); if (likely(page)) { tsk->stack = kasan_reset_tag(page_address(page)); return 0; } return -ENOMEM; } static void free_thread_stack(struct task_struct *tsk) { thread_stack_delayed_free(tsk); tsk->stack = NULL; } #else /* !(THREAD_SIZE >= PAGE_SIZE) */ static struct kmem_cache *thread_stack_cache; static void thread_stack_free_rcu(struct rcu_head *rh) { kmem_cache_free(thread_stack_cache, rh); } static void thread_stack_delayed_free(struct task_struct *tsk) { struct rcu_head *rh = tsk->stack; call_rcu(rh, thread_stack_free_rcu); } static int alloc_thread_stack_node(struct task_struct *tsk, int node) { unsigned long *stack; stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); stack = kasan_reset_tag(stack); tsk->stack = stack; return stack ? 0 : -ENOMEM; } static void free_thread_stack(struct task_struct *tsk) { thread_stack_delayed_free(tsk); tsk->stack = NULL; } void thread_stack_cache_init(void) { thread_stack_cache = kmem_cache_create_usercopy("thread_stack", THREAD_SIZE, THREAD_SIZE, 0, 0, THREAD_SIZE, NULL); BUG_ON(thread_stack_cache == NULL); } #endif /* THREAD_SIZE >= PAGE_SIZE */ #endif /* CONFIG_VMAP_STACK */ /* SLAB cache for signal_struct structures (tsk->signal) */ static struct kmem_cache *signal_cachep; /* SLAB cache for sighand_struct structures (tsk->sighand) */ struct kmem_cache *sighand_cachep; /* SLAB cache for files_struct structures (tsk->files) */ struct kmem_cache *files_cachep; /* SLAB cache for fs_struct structures (tsk->fs) */ struct kmem_cache *fs_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; static void account_kernel_stack(struct task_struct *tsk, int account) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { struct vm_struct *vm_area = task_stack_vm_area(tsk); int i; for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) mod_lruvec_page_state(vm_area->pages[i], NR_KERNEL_STACK_KB, account * (PAGE_SIZE / 1024)); } else { void *stack = task_stack_page(tsk); /* All stack pages are in the same node. */ mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB, account * (THREAD_SIZE / 1024)); } } void exit_task_stack_account(struct task_struct *tsk) { account_kernel_stack(tsk, -1); if (IS_ENABLED(CONFIG_VMAP_STACK)) { struct vm_struct *vm_area; int i; vm_area = task_stack_vm_area(tsk); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) memcg_kmem_uncharge_page(vm_area->pages[i], 0); } } static void release_task_stack(struct task_struct *tsk) { if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD)) return; /* Better to leak the stack than to free prematurely */ free_thread_stack(tsk); } #ifdef CONFIG_THREAD_INFO_IN_TASK void put_task_stack(struct task_struct *tsk) { if (refcount_dec_and_test(&tsk->stack_refcount)) release_task_stack(tsk); } #endif void free_task(struct task_struct *tsk) { #ifdef CONFIG_SECCOMP WARN_ON_ONCE(tsk->seccomp.filter); #endif release_user_cpus_ptr(tsk); scs_release(tsk); #ifndef CONFIG_THREAD_INFO_IN_TASK /* * The task is finally done with both the stack and thread_info, * so free both. 
*/ release_task_stack(tsk); #else /* * If the task had a separate stack allocation, it should be gone * by now. */ WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0); #endif rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); arch_release_task_struct(tsk); if (tsk->flags & PF_KTHREAD) free_kthread_struct(tsk); bpf_task_storage_free(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) { struct file *exe_file; exe_file = get_mm_exe_file(oldmm); RCU_INIT_POINTER(mm->exe_file, exe_file); /* * We depend on the oldmm having properly denied write access to the * exe_file already. */ if (exe_file && exe_file_deny_write_access(exe_file)) pr_warn_once("exe_file_deny_write_access() failed in %s\n", __func__); } #ifdef CONFIG_MMU static inline int mm_alloc_pgd(struct mm_struct *mm) { mm->pgd = pgd_alloc(mm); if (unlikely(!mm->pgd)) return -ENOMEM; return 0; } static inline void mm_free_pgd(struct mm_struct *mm) { pgd_free(mm, mm->pgd); } #else #define mm_alloc_pgd(mm) (0) #define mm_free_pgd(mm) #endif /* CONFIG_MMU */ #ifdef CONFIG_MM_ID static DEFINE_IDA(mm_ida); static inline int mm_alloc_id(struct mm_struct *mm) { int ret; ret = ida_alloc_range(&mm_ida, MM_ID_MIN, MM_ID_MAX, GFP_KERNEL); if (ret < 0) return ret; mm->mm_id = ret; return 0; } static inline void mm_free_id(struct mm_struct *mm) { const mm_id_t id = mm->mm_id; mm->mm_id = MM_ID_DUMMY; if (id == MM_ID_DUMMY) return; if (WARN_ON_ONCE(id < MM_ID_MIN || id > MM_ID_MAX)) return; ida_free(&mm_ida, id); } #else /* !CONFIG_MM_ID */ static inline int mm_alloc_id(struct mm_struct *mm) { return 0; } static inline void mm_free_id(struct mm_struct *mm) {} #endif /* CONFIG_MM_ID */ static void check_mm(struct mm_struct *mm) { int i; BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS, "Please make sure 'struct resident_page_types[]' is updated as well"); for (i = 0; i < NR_MM_COUNTERS; i++) { long x = percpu_counter_sum(&mm->rss_stat[i]); if (unlikely(x)) { pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld Comm:%s Pid:%d\n", mm, resident_page_types[i], x, current->comm, task_pid_nr(current)); } } if (mm_pgtables_bytes(mm)) pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", mm_pgtables_bytes(mm)); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) VM_BUG_ON_MM(mm->pmd_huge_pte, mm); #endif } #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) static void do_check_lazy_tlb(void *arg) { struct mm_struct *mm = arg; WARN_ON_ONCE(current->active_mm == mm); } static void do_shoot_lazy_tlb(void *arg) { struct mm_struct *mm = arg; if (current->active_mm == mm) { WARN_ON_ONCE(current->mm); current->active_mm = &init_mm; switch_mm(mm, &init_mm, current); } } static void cleanup_lazy_tlbs(struct mm_struct *mm) { if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) { /* * In this case, lazy tlb mms are refounted and would not reach * __mmdrop until all CPUs have switched away and mmdrop()ed. */ return; } /* * Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it * requires lazy mm users to switch to another mm when the refcount * drops to zero, before the mm is freed. This requires IPIs here to * switch kernel threads to init_mm. 
* * archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm * switch with the final userspace teardown TLB flush which leaves the * mm lazy on this CPU but no others, reducing the need for additional * IPIs here. There are cases where a final IPI is still required here, * such as the final mmdrop being performed on a different CPU than the * one exiting, or kernel threads using the mm when userspace exits. * * IPI overheads have not found to be expensive, but they could be * reduced in a number of possible ways, for example (roughly * increasing order of complexity): * - The last lazy reference created by exit_mm() could instead switch * to init_mm, however it's probable this will run on the same CPU * immediately afterwards, so this may not reduce IPIs much. * - A batch of mms requiring IPIs could be gathered and freed at once. * - CPUs store active_mm where it can be remotely checked without a * lock, to filter out false-positives in the cpumask. * - After mm_users or mm_count reaches zero, switching away from the * mm could clear mm_cpumask to reduce some IPIs, perhaps together * with some batching or delaying of the final IPIs. * - A delayed freeing and RCU-like quiescing sequence based on mm * switching to avoid IPIs completely. */ on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1); if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES)) on_each_cpu(do_check_lazy_tlb, (void *)mm, 1); } /* * Called when the last reference to the mm * is dropped: either by a lazy thread or by * mmput. Free the page directory and the mm. */ void __mmdrop(struct mm_struct *mm) { BUG_ON(mm == &init_mm); WARN_ON_ONCE(mm == current->mm); /* Ensure no CPUs are using this as their lazy tlb mm */ cleanup_lazy_tlbs(mm); WARN_ON_ONCE(mm == current->active_mm); mm_free_pgd(mm); mm_free_id(mm); destroy_context(mm); mmu_notifier_subscriptions_destroy(mm); check_mm(mm); put_user_ns(mm->user_ns); mm_pasid_drop(mm); mm_destroy_cid(mm); percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); static void mmdrop_async_fn(struct work_struct *work) { struct mm_struct *mm; mm = container_of(work, struct mm_struct, async_put_work); __mmdrop(mm); } static void mmdrop_async(struct mm_struct *mm) { if (unlikely(atomic_dec_and_test(&mm->mm_count))) { INIT_WORK(&mm->async_put_work, mmdrop_async_fn); schedule_work(&mm->async_put_work); } } static inline void free_signal_struct(struct signal_struct *sig) { taskstats_tgid_free(sig); sched_autogroup_exit(sig); /* * __mmdrop is not safe to call from softirq context on x86 due to * pgd_dtor so postpone it to the async context */ if (sig->oom_mm) mmdrop_async(sig->oom_mm); kmem_cache_free(signal_cachep, sig); } static inline void put_signal_struct(struct signal_struct *sig) { if (refcount_dec_and_test(&sig->sigcnt)) free_signal_struct(sig); } void __put_task_struct(struct task_struct *tsk) { WARN_ON(!tsk->exit_state); WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); unwind_task_free(tsk); io_uring_free(tsk); cgroup_task_free(tsk); task_numa_free(tsk, true); security_task_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); sched_core_free(tsk); free_task(tsk); } EXPORT_SYMBOL_GPL(__put_task_struct); void __put_task_struct_rcu_cb(struct rcu_head *rhp) { struct task_struct *task = container_of(rhp, struct task_struct, rcu); __put_task_struct(task); } EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb); void __init __weak arch_task_cache_init(void) { } /* * set_max_threads */ static void 
__init set_max_threads(unsigned int max_threads_suggested) { u64 threads; unsigned long nr_pages = memblock_estimated_nr_free_pages(); /* * The number of threads shall be limited such that the thread * structures may only consume a small part of the available memory. */ if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64) threads = MAX_THREADS; else threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE, (u64) THREAD_SIZE * 8UL); if (threads > max_threads_suggested) threads = max_threads_suggested; max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS); } #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT /* Initialized by the architecture: */ int arch_task_struct_size __read_mostly; #endif static void __init task_struct_whitelist(unsigned long *offset, unsigned long *size) { /* Fetch thread_struct whitelist for the architecture. */ arch_thread_struct_whitelist(offset, size); /* * Handle zero-sized whitelist or empty thread_struct, otherwise * adjust offset to position of thread_struct in task_struct. */ if (unlikely(*size == 0)) *offset = 0; else *offset += offsetof(struct task_struct, thread); } void __init fork_init(void) { int i; #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN 0 #endif int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN); unsigned long useroffset, usersize; /* create a slab on which task_structs can be allocated */ task_struct_whitelist(&useroffset, &usersize); task_struct_cachep = kmem_cache_create_usercopy("task_struct", arch_task_struct_size, align, SLAB_PANIC|SLAB_ACCOUNT, useroffset, usersize, NULL); /* do the arch specific task caches init */ arch_task_cache_init(); set_max_threads(MAX_THREADS); init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; for (i = 0; i < UCOUNT_COUNTS; i++) init_user_ns.ucount_max[i] = max_threads/2; set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, RLIM_INFINITY); set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, RLIM_INFINITY); set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY); set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, RLIM_INFINITY); #ifdef CONFIG_VMAP_STACK cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", NULL, free_vm_stack_cache); #endif scs_init(); lockdep_init_task(&init_task); uprobes_init(); } int __weak arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { *dst = *src; return 0; } void set_task_stack_end_magic(struct task_struct *tsk) { unsigned long *stackend; stackend = end_of_stack(tsk); *stackend = STACK_END_MAGIC; /* for overflow detection */ } static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; int err; if (node == NUMA_NO_NODE) node = tsk_fork_get_node(orig); tsk = alloc_task_struct_node(node); if (!tsk) return NULL; err = arch_dup_task_struct(tsk, orig); if (err) goto free_tsk; err = alloc_thread_stack_node(tsk, node); if (err) goto free_tsk; #ifdef CONFIG_THREAD_INFO_IN_TASK refcount_set(&tsk->stack_refcount, 1); #endif account_kernel_stack(tsk, 1); err = scs_prepare(tsk, node); if (err) goto free_stack; #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under * the sighand lock in case orig has changed between now and * then. Until then, filter must be NULL to avoid messing up * the usage counts on the error path calling free_task. 
*/ tsk->seccomp.filter = NULL; #endif setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); set_task_stack_end_magic(tsk); clear_syscall_work_syscall_user_dispatch(tsk); #ifdef CONFIG_STACKPROTECTOR tsk->stack_canary = get_random_canary(); #endif if (orig->cpus_ptr == &orig->cpus_mask) tsk->cpus_ptr = &tsk->cpus_mask; dup_user_cpus_ptr(tsk, orig, node); /* * One for the user space visible state that goes away when reaped. * One for the scheduler. */ refcount_set(&tsk->rcu_users, 2); /* One for the rcu users */ refcount_set(&tsk->usage, 1); #ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; #endif tsk->splice_pipe = NULL; tsk->task_frag.page = NULL; tsk->wake_q.next = NULL; tsk->worker_private = NULL; kcov_task_init(tsk); kmsan_task_create(tsk); kmap_local_fork(tsk); #ifdef CONFIG_FAULT_INJECTION tsk->fail_nth = 0; #endif #ifdef CONFIG_BLK_CGROUP tsk->throttle_disk = NULL; tsk->use_memdelay = 0; #endif #ifdef CONFIG_ARCH_HAS_CPU_PASID tsk->pasid_activated = 0; #endif #ifdef CONFIG_MEMCG tsk->active_memcg = NULL; #endif #ifdef CONFIG_X86_BUS_LOCK_DETECT tsk->reported_split_lock = 0; #endif #ifdef CONFIG_SCHED_MM_CID tsk->mm_cid.cid = MM_CID_UNSET; tsk->mm_cid.active = 0; INIT_HLIST_NODE(&tsk->mm_cid.node); #endif return tsk; free_stack: exit_task_stack_account(tsk); free_thread_stack(tsk); free_tsk: free_task_struct(tsk); return NULL; } __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); static unsigned long coredump_filter = MMF_DUMP_FILTER_DEFAULT; static int __init coredump_filter_setup(char *s) { if (kstrtoul(s, 0, &coredump_filter)) return 0; coredump_filter <<= MMF_DUMP_FILTER_SHIFT; coredump_filter &= MMF_DUMP_FILTER_MASK; return 1; } __setup("coredump_filter=", coredump_filter_setup); #include <linux/init_task.h> static void mm_init_aio(struct mm_struct *mm) { #ifdef CONFIG_AIO spin_lock_init(&mm->ioctx_lock); mm->ioctx_table = NULL; #endif } static __always_inline void mm_clear_owner(struct mm_struct *mm, struct task_struct *p) { #ifdef CONFIG_MEMCG if (mm->owner == p) WRITE_ONCE(mm->owner, NULL); #endif } static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) { #ifdef CONFIG_MEMCG mm->owner = p; #endif } static void mm_init_uprobes_state(struct mm_struct *mm) { #ifdef CONFIG_UPROBES mm->uprobes_state.xol_area = NULL; arch_uprobe_init_state(mm); #endif } static void mmap_init_lock(struct mm_struct *mm) { init_rwsem(&mm->mmap_lock); mm_lock_seqcount_init(mm); #ifdef CONFIG_PER_VMA_LOCK rcuwait_init(&mm->vma_writer_wait); #endif } static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); seqcount_init(&mm->write_protect_seq); mmap_init_lock(mm); INIT_LIST_HEAD(&mm->mmlist); mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; atomic64_set(&mm->pinned_vm, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); spin_lock_init(&mm->arg_lock); mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); mm_pasid_init(mm); RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_subscriptions_init(mm); init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) mm->pmd_huge_pte = NULL; #endif mm_init_uprobes_state(mm); hugetlb_count_init(mm); mm_flags_clear_all(mm); if (current->mm) { unsigned long flags = __mm_flags_get_word(current->mm); 
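/*
 * Seed the child's mm flag word from the parent via mmf_init_legacy_flags()
 * rather than copying it wholesale, and inherit only the VM_INIT_DEF_MASK
 * bits of def_flags; a caller without a userspace mm (e.g. a kernel thread)
 * falls back to the boot-time coredump_filter below.
 */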
__mm_flags_overwrite_word(mm, mmf_init_legacy_flags(flags)); mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK; } else { __mm_flags_overwrite_word(mm, coredump_filter); mm->def_flags = 0; } if (futex_mm_init(mm)) goto fail_mm_init; if (mm_alloc_pgd(mm)) goto fail_nopgd; if (mm_alloc_id(mm)) goto fail_noid; if (init_new_context(p, mm)) goto fail_nocontext; if (mm_alloc_cid(mm, p)) goto fail_cid; if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, NR_MM_COUNTERS)) goto fail_pcpu; mm->user_ns = get_user_ns(user_ns); lru_gen_init_mm(mm); return mm; fail_pcpu: mm_destroy_cid(mm); fail_cid: destroy_context(mm); fail_nocontext: mm_free_id(mm); fail_noid: mm_free_pgd(mm); fail_nopgd: futex_hash_free(mm); fail_mm_init: free_mm(mm); return NULL; } /* * Allocate and initialize an mm_struct. */ struct mm_struct *mm_alloc(void) { struct mm_struct *mm; mm = allocate_mm(); if (!mm) return NULL; memset(mm, 0, sizeof(*mm)); return mm_init(mm, current, current_user_ns()); } EXPORT_SYMBOL_IF_KUNIT(mm_alloc); static inline void __mmput(struct mm_struct *mm) { VM_BUG_ON(atomic_read(&mm->mm_users)); uprobe_clear_state(mm); exit_aio(mm); ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); mm_put_huge_zero_folio(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); list_del(&mm->mmlist); spin_unlock(&mmlist_lock); } if (mm->binfmt) module_put(mm->binfmt->module); lru_gen_del_mm(mm); futex_hash_free(mm); mmdrop(mm); } /* * Decrement the use count and release all resources for an mm. */ void mmput(struct mm_struct *mm) { might_sleep(); if (atomic_dec_and_test(&mm->mm_users)) __mmput(mm); } EXPORT_SYMBOL_GPL(mmput); #if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH) static void mmput_async_fn(struct work_struct *work) { struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); __mmput(mm); } void mmput_async(struct mm_struct *mm) { if (atomic_dec_and_test(&mm->mm_users)) { INIT_WORK(&mm->async_put_work, mmput_async_fn); schedule_work(&mm->async_put_work); } } EXPORT_SYMBOL_GPL(mmput_async); #endif /** * set_mm_exe_file - change a reference to the mm's executable file * @mm: The mm to change. * @new_exe_file: The new file to use. * * This changes mm's executable file (shown as symlink /proc/[pid]/exe). * * Main users are mmput() and sys_execve(). Callers prevent concurrent * invocations: in mmput() nobody alive left, in execve it happens before * the new mm is made visible to anyone. * * Can only fail if new_exe_file != NULL. */ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { struct file *old_exe_file; /* * It is safe to dereference the exe_file without RCU as * this function is only called if nobody else can access * this mm -- see comment above for justification. */ old_exe_file = rcu_dereference_raw(mm->exe_file); if (new_exe_file) { /* * We expect the caller (i.e., sys_execve) to already denied * write access, so this is unlikely to fail. */ if (unlikely(exe_file_deny_write_access(new_exe_file))) return -EACCES; get_file(new_exe_file); } rcu_assign_pointer(mm->exe_file, new_exe_file); if (old_exe_file) { exe_file_allow_write_access(old_exe_file); fput(old_exe_file); } return 0; } /** * replace_mm_exe_file - replace a reference to the mm's executable file * @mm: The mm to change. * @new_exe_file: The new file to use. * * This changes mm's executable file (shown as symlink /proc/[pid]/exe). * * Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE). 
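 *
 * Return: %0 on success, %-EBUSY if the old exe file is still mapped,
 * %-EACCES if write access to @new_exe_file could not be denied.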
*/ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { struct vm_area_struct *vma; struct file *old_exe_file; int ret = 0; /* Forbid mm->exe_file change if old file still mapped. */ old_exe_file = get_mm_exe_file(mm); if (old_exe_file) { VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); for_each_vma(vmi, vma) { if (!vma->vm_file) continue; if (path_equal(&vma->vm_file->f_path, &old_exe_file->f_path)) { ret = -EBUSY; break; } } mmap_read_unlock(mm); fput(old_exe_file); if (ret) return ret; } ret = exe_file_deny_write_access(new_exe_file); if (ret) return -EACCES; get_file(new_exe_file); /* set the new file */ mmap_write_lock(mm); old_exe_file = rcu_dereference_raw(mm->exe_file); rcu_assign_pointer(mm->exe_file, new_exe_file); mmap_write_unlock(mm); if (old_exe_file) { exe_file_allow_write_access(old_exe_file); fput(old_exe_file); } return 0; } /** * get_mm_exe_file - acquire a reference to the mm's executable file * @mm: The mm of interest. * * Returns %NULL if mm has no associated executable file. * User must release file via fput(). */ struct file *get_mm_exe_file(struct mm_struct *mm) { struct file *exe_file; rcu_read_lock(); exe_file = get_file_rcu(&mm->exe_file); rcu_read_unlock(); return exe_file; } /** * get_task_exe_file - acquire a reference to the task's executable file * @task: The task. * * Returns %NULL if task's mm (if any) has no associated executable file or * this is a kernel thread with borrowed mm (see the comment above get_task_mm). * User must release file via fput(). */ struct file *get_task_exe_file(struct task_struct *task) { struct file *exe_file = NULL; struct mm_struct *mm; if (task->flags & PF_KTHREAD) return NULL; task_lock(task); mm = task->mm; if (mm) exe_file = get_mm_exe_file(mm); task_unlock(task); return exe_file; } /** * get_task_mm - acquire a reference to the task's mm * @task: The task. * * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning * this kernel workthread has transiently adopted a user mm with kthread_use_mm, * to do its AIO) is not set and if so returns a reference to it, after * bumping up the use count. User must release the mm via mmput() * after use. Typically used by /proc and ptrace. 
*/ struct mm_struct *get_task_mm(struct task_struct *task) { struct mm_struct *mm; if (task->flags & PF_KTHREAD) return NULL; task_lock(task); mm = task->mm; if (mm) mmget(mm); task_unlock(task); return mm; } EXPORT_SYMBOL_GPL(get_task_mm); static bool may_access_mm(struct mm_struct *mm, struct task_struct *task, unsigned int mode) { if (mm == current->mm) return true; if (ptrace_may_access(task, mode)) return true; if ((mode & PTRACE_MODE_READ) && perfmon_capable()) return true; return false; } struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) { struct mm_struct *mm; int err; err = down_read_killable(&task->signal->exec_update_lock); if (err) return ERR_PTR(err); mm = get_task_mm(task); if (!mm) { mm = ERR_PTR(-ESRCH); } else if (!may_access_mm(mm, task, mode)) { mmput(mm); mm = ERR_PTR(-EACCES); } up_read(&task->signal->exec_update_lock); return mm; } static void complete_vfork_done(struct task_struct *tsk) { struct completion *vfork; task_lock(tsk); vfork = tsk->vfork_done; if (likely(vfork)) { tsk->vfork_done = NULL; complete(vfork); } task_unlock(tsk); } static int wait_for_vfork_done(struct task_struct *child, struct completion *vfork) { unsigned int state = TASK_KILLABLE|TASK_FREEZABLE; int killed; cgroup_enter_frozen(); killed = wait_for_completion_state(vfork, state); cgroup_leave_frozen(false); if (killed) { task_lock(child); child->vfork_done = NULL; task_unlock(child); } put_task_struct(child); return killed; } /* Please note the differences between mmput and mm_release. * mmput is called whenever we stop holding onto a mm_struct, * error success whatever. * * mm_release is called after a mm_struct has been removed * from the current process. * * This difference is important for error handling, when we * only half set up a mm_struct for a new process and need to restore * the old one. Because we mmput the new mm_struct before * restoring the old one. . . * Eric Biederman 10 January 1998 */ static void mm_release(struct task_struct *tsk, struct mm_struct *mm) { uprobe_free_utask(tsk); /* Get rid of any cached register state */ deactivate_mm(tsk, mm); /* * Signal userspace if we're not exiting with a core dump * because we want to leave the value intact for debugging * purposes. */ if (tsk->clear_child_tid) { if (atomic_read(&mm->mm_users) > 1) { /* * We don't check the error code - if userspace has * not set up a proper pointer then tough luck. */ put_user(0, tsk->clear_child_tid); do_futex(tsk->clear_child_tid, FUTEX_WAKE, 1, NULL, NULL, 0, 0); } tsk->clear_child_tid = NULL; } /* * All done, finally we can wake up parent and return this mm to him. * Also kthread_stop() uses this completion for synchronization. */ if (tsk->vfork_done) complete_vfork_done(tsk); } void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm) { futex_exit_release(tsk); mm_release(tsk, mm); } void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm) { futex_exec_release(tsk); mm_release(tsk, mm); } /** * dup_mm() - duplicates an existing mm structure * @tsk: the task_struct with which the new mm will be associated. * @oldmm: the mm to duplicate. * * Allocates a new mm structure and duplicates the provided @oldmm structure * content into it. * * Return: the duplicated mm or NULL on failure. 
*/ static struct mm_struct *dup_mm(struct task_struct *tsk, struct mm_struct *oldmm) { struct mm_struct *mm; int err; mm = allocate_mm(); if (!mm) goto fail_nomem; memcpy(mm, oldmm, sizeof(*mm)); if (!mm_init(mm, tsk, mm->user_ns)) goto fail_nomem; uprobe_start_dup_mmap(); err = dup_mmap(mm, oldmm); if (err) goto free_pt; uprobe_end_dup_mmap(); mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; if (mm->binfmt && !try_module_get(mm->binfmt->module)) goto free_pt; return mm; free_pt: /* don't put binfmt in mmput, we haven't got module yet */ mm->binfmt = NULL; mm_init_owner(mm, NULL); mmput(mm); if (err) uprobe_end_dup_mmap(); fail_nomem: return NULL; } static int copy_mm(u64 clone_flags, struct task_struct *tsk) { struct mm_struct *mm, *oldmm; tsk->min_flt = tsk->maj_flt = 0; tsk->nvcsw = tsk->nivcsw = 0; #ifdef CONFIG_DETECT_HUNG_TASK tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw; tsk->last_switch_time = 0; #endif tsk->mm = NULL; tsk->active_mm = NULL; /* * Are we cloning a kernel thread? * * We need to steal an active VM for that.. */ oldmm = current->mm; if (!oldmm) return 0; if (clone_flags & CLONE_VM) { mmget(oldmm); mm = oldmm; } else { mm = dup_mm(tsk, current->mm); if (!mm) return -ENOMEM; } tsk->mm = mm; tsk->active_mm = mm; return 0; } static int copy_fs(u64 clone_flags, struct task_struct *tsk) { struct fs_struct *fs = current->fs; if (clone_flags & CLONE_FS) { /* tsk->fs is already what we want */ read_seqlock_excl(&fs->seq); /* "users" and "in_exec" locked for check_unsafe_exec() */ if (fs->in_exec) { read_sequnlock_excl(&fs->seq); return -EAGAIN; } fs->users++; read_sequnlock_excl(&fs->seq); return 0; } tsk->fs = copy_fs_struct(fs); if (!tsk->fs) return -ENOMEM; return 0; } static int copy_files(u64 clone_flags, struct task_struct *tsk, int no_files) { struct files_struct *oldf, *newf; /* * A background process may not have any files ... */ oldf = current->files; if (!oldf) return 0; if (no_files) { tsk->files = NULL; return 0; } if (clone_flags & CLONE_FILES) { atomic_inc(&oldf->count); return 0; } newf = dup_fd(oldf, NULL); if (IS_ERR(newf)) return PTR_ERR(newf); tsk->files = newf; return 0; } static int copy_sighand(u64 clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; if (clone_flags & CLONE_SIGHAND) { refcount_inc(&current->sighand->count); return 0; } sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); RCU_INIT_POINTER(tsk->sighand, sig); if (!sig) return -ENOMEM; refcount_set(&sig->count, 1); spin_lock_irq(&current->sighand->siglock); memcpy(sig->action, current->sighand->action, sizeof(sig->action)); spin_unlock_irq(&current->sighand->siglock); /* Reset all signal handlers not set to SIG_IGN to SIG_DFL. */ if (clone_flags & CLONE_CLEAR_SIGHAND) flush_signal_handlers(tsk, 0); return 0; } void __cleanup_sighand(struct sighand_struct *sighand) { if (refcount_dec_and_test(&sighand->count)) { signalfd_cleanup(sighand); /* * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it * without an RCU grace period, see __lock_task_sighand(). */ kmem_cache_free(sighand_cachep, sighand); } } /* * Initialize POSIX timer handling for a thread group.
*/ static void posix_cpu_timers_init_group(struct signal_struct *sig) { struct posix_cputimers *pct = &sig->posix_cputimers; unsigned long cpu_limit; cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); posix_cputimers_group_init(pct, cpu_limit); } static int copy_signal(u64 clone_flags, struct task_struct *tsk) { struct signal_struct *sig; if (clone_flags & CLONE_THREAD) return 0; sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL); tsk->signal = sig; if (!sig) return -ENOMEM; sig->nr_threads = 1; sig->quick_threads = 1; atomic_set(&sig->live, 1); refcount_set(&sig->sigcnt, 1); /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */ sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head); init_waitqueue_head(&sig->wait_chldexit); sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_HLIST_HEAD(&sig->multiprocess); seqlock_init(&sig->stats_lock); prev_cputime_init(&sig->prev_cputime); #ifdef CONFIG_POSIX_TIMERS INIT_HLIST_HEAD(&sig->posix_timers); INIT_HLIST_HEAD(&sig->ignored_posix_timers); hrtimer_setup(&sig->real_timer, it_real_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL); #endif task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); task_unlock(current->group_leader); posix_cpu_timers_init_group(sig); tty_audit_fork(sig); sched_autogroup_fork(sig); #ifdef CONFIG_CGROUPS init_rwsem(&sig->cgroup_threadgroup_rwsem); #endif sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; mutex_init(&sig->cred_guard_mutex); init_rwsem(&sig->exec_update_lock); return 0; } static void copy_seccomp(struct task_struct *p) { #ifdef CONFIG_SECCOMP /* * Must be called with sighand->lock held, which is common to * all threads in the group. Holding cred_guard_mutex is not * needed because this new task is not yet running and cannot * be racing exec. */ assert_spin_locked(&current->sighand->siglock); /* Ref-count the new filter user, and assign it. */ get_seccomp_filter(current); p->seccomp = current->seccomp; /* * Explicitly enable no_new_privs here in case it got set * between the task_struct being duplicated and holding the * sighand lock. The seccomp state and nnp must be in sync. */ if (task_no_new_privs(current)) task_set_no_new_privs(p); /* * If the parent gained a seccomp mode after copying thread * flags but before we held the sighand lock, we have * to manually enable the seccomp thread flag here.
*/ if (p->seccomp.mode != SECCOMP_MODE_DISABLED) set_task_syscall_work(p, SECCOMP); #endif } SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) { current->clear_child_tid = tidptr; return task_pid_vnr(current); } static void rt_mutex_init_task(struct task_struct *p) { raw_spin_lock_init(&p->pi_lock); #ifdef CONFIG_RT_MUTEXES p->pi_waiters = RB_ROOT_CACHED; p->pi_top_task = NULL; p->pi_blocked_on = NULL; #endif } static inline void init_task_pid_links(struct task_struct *task) { enum pid_type type; for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) INIT_HLIST_NODE(&task->pid_links[type]); } static inline void init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) { if (type == PIDTYPE_PID) task->thread_pid = pid; else task->signal->pids[type] = pid; } static inline void rcu_copy_process(struct task_struct *p) { #ifdef CONFIG_PREEMPT_RCU p->rcu_read_lock_nesting = 0; p->rcu_read_unlock_special.s = 0; p->rcu_blocked_node = NULL; INIT_LIST_HEAD(&p->rcu_node_entry); #endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TASKS_RCU p->rcu_tasks_holdout = false; INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); p->rcu_tasks_idle_cpu = -1; INIT_LIST_HEAD(&p->rcu_tasks_exit_list); #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU p->trc_reader_nesting = 0; #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } /** * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd * @pid: the struct pid for which to create a pidfd * @flags: flags of the new @pidfd * @ret_file: return the new pidfs file * * Allocate a new file that stashes @pid and reserve a new pidfd number in the * caller's file descriptor table. The pidfd is reserved but not installed yet. * * The helper verifies that @pid is still in use, without PIDFD_THREAD the * task identified by @pid must be a thread-group leader. * * If this function returns successfully the caller is responsible to either * call fd_install() passing the returned pidfd and pidfd file as arguments in * order to install the pidfd into its file descriptor table or they must use * put_unused_fd() and fput() on the returned pidfd and pidfd file * respectively. * * This function is useful when a pidfd must already be reserved but there * might still be points of failure afterwards and the caller wants to ensure * that no pidfd is leaked into its file descriptor table. * * Return: On success, a reserved pidfd is returned from the function and a new * pidfd file is returned in the last argument to the function. On * error, a negative error code is returned from the function and the * last argument remains unchanged. */ int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file) { struct file *pidfs_file; /* * PIDFD_STALE is only allowed to be passed if the caller knows * that @pid is already registered in pidfs and thus * PIDFD_INFO_EXIT information is guaranteed to be available. */ if (!(flags & PIDFD_STALE)) { /* * While holding the pidfd waitqueue lock removing the * task linkage for the thread-group leader pid * (PIDTYPE_TGID) isn't possible. Thus, if there's still * task linkage for PIDTYPE_PID not having thread-group * leader linkage for the pid means it wasn't a * thread-group leader in the first place. */ guard(spinlock_irq)(&pid->wait_pidfd.lock); /* Task has already been reaped. */ if (!pid_has_task(pid, PIDTYPE_PID)) return -ESRCH; /* * If this struct pid isn't used as a thread-group * leader but the caller requested to create a * thread-group leader pidfd then report ENOENT. 
*/ if (!(flags & PIDFD_THREAD) && !pid_has_task(pid, PIDTYPE_TGID)) return -ENOENT; } CLASS(get_unused_fd, pidfd)(O_CLOEXEC); if (pidfd < 0) return pidfd; pidfs_file = pidfs_alloc_file(pid, flags | O_RDWR); if (IS_ERR(pidfs_file)) return PTR_ERR(pidfs_file); *ret_file = pidfs_file; return take_fd(pidfd); } static void __delayed_free_task(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); free_task(tsk); } static __always_inline void delayed_free_task(struct task_struct *tsk) { if (IS_ENABLED(CONFIG_MEMCG)) call_rcu(&tsk->rcu, __delayed_free_task); else free_task(tsk); } static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) { /* Skip if kernel thread */ if (!tsk->mm) return; /* Skip if spawning a thread or using vfork */ if ((clone_flags & (CLONE_VM | CLONE_THREAD | CLONE_VFORK)) != CLONE_VM) return; /* We need to synchronize with __set_oom_adj */ mutex_lock(&oom_adj_mutex); mm_flags_set(MMF_MULTIPROCESS, tsk->mm); /* Update the values in case they were changed after copy_signal */ tsk->signal->oom_score_adj = current->signal->oom_score_adj; tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min; mutex_unlock(&oom_adj_mutex); } #ifdef CONFIG_RV static void rv_task_fork(struct task_struct *p) { memset(&p->rv, 0, sizeof(p->rv)); } #else #define rv_task_fork(p) do {} while (0) #endif static bool need_futex_hash_allocate_default(u64 clone_flags) { if ((clone_flags & (CLONE_THREAD | CLONE_VM)) != (CLONE_THREAD | CLONE_VM)) return false; return true; } /* * This creates a new process as a copy of the old one, * but does not actually start it yet. * * It copies the registers, and all the appropriate * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller. */ __latent_entropy struct task_struct *copy_process( struct pid *pid, int trace, int node, struct kernel_clone_args *args) { int pidfd = -1, retval; struct task_struct *p; struct multiprocess_signals delayed; struct file *pidfile = NULL; const u64 clone_flags = args->flags; struct nsproxy *nsp = current->nsproxy; /* * Don't allow sharing the root directory with processes in a different * namespace */ if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) return ERR_PTR(-EINVAL); /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. */ if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND)) return ERR_PTR(-EINVAL); /* * Shared signal handlers imply shared VM. By way of the above, * thread groups also imply shared VM. Blocking this case allows * for various simplifications in other code. */ if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) return ERR_PTR(-EINVAL); /* * Siblings of global init remain as zombies on exit since they are * not reaped by their parent (swapper). To solve this and to avoid * multi-rooted process trees, prevent global and container-inits * from creating siblings. */ if ((clone_flags & CLONE_PARENT) && current->signal->flags & SIGNAL_UNKILLABLE) return ERR_PTR(-EINVAL); /* * If the new process will be in a different pid or user namespace * do not allow it to share a thread group with the forking task. 
*/ if (clone_flags & CLONE_THREAD) { if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || (task_active_pid_ns(current) != nsp->pid_ns_for_children)) return ERR_PTR(-EINVAL); } if (clone_flags & CLONE_PIDFD) { /* * - CLONE_DETACHED is blocked so that we can potentially * reuse it later for CLONE_PIDFD. */ if (clone_flags & CLONE_DETACHED) return ERR_PTR(-EINVAL); } if (clone_flags & CLONE_AUTOREAP) { if (clone_flags & CLONE_THREAD) return ERR_PTR(-EINVAL); if (clone_flags & CLONE_PARENT) return ERR_PTR(-EINVAL); if (args->exit_signal) return ERR_PTR(-EINVAL); } if ((clone_flags & CLONE_PARENT) && current->signal->autoreap) return ERR_PTR(-EINVAL); if (clone_flags & CLONE_NNP) { if (clone_flags & CLONE_THREAD) return ERR_PTR(-EINVAL); } if (clone_flags & CLONE_PIDFD_AUTOKILL) { if (!(clone_flags & CLONE_PIDFD)) return ERR_PTR(-EINVAL); if (!(clone_flags & CLONE_AUTOREAP)) return ERR_PTR(-EINVAL); if (clone_flags & CLONE_THREAD) return ERR_PTR(-EINVAL); /* * Without CLONE_NNP the child could escalate privileges * after being spawned, so require CAP_SYS_ADMIN. * With CLONE_NNP the child can't gain new privileges, * so allow unprivileged usage. */ if (!(clone_flags & CLONE_NNP) && !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); } /* * Force any signals received before this point to be delivered * before the fork happens. Collect up signals sent to multiple * processes that happen during the fork and delay them so that * they appear to happen after the fork. */ sigemptyset(&delayed.signal); INIT_HLIST_NODE(&delayed.node); spin_lock_irq(¤t->sighand->siglock); if (!(clone_flags & CLONE_THREAD)) hlist_add_head(&delayed.node, ¤t->signal->multiprocess); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); retval = -ERESTARTNOINTR; if (task_sigpending(current)) goto fork_out; retval = -ENOMEM; p = dup_task_struct(current, node); if (!p) goto fork_out; p->flags &= ~PF_KTHREAD; if (args->kthread) p->flags |= PF_KTHREAD; if (args->user_worker) { /* * Mark us a user worker, and block any signal that isn't * fatal or STOP */ p->flags |= PF_USER_WORKER; siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP)); } if (args->io_thread) p->flags |= PF_IO_WORKER; if (args->name) strscpy_pad(p->comm, args->name, sizeof(p->comm)); p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL; /* * TID is cleared in mm_release() when the task exits */ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL; ftrace_graph_init_task(p); rt_mutex_init_task(p); raw_spin_lock_init(&p->blocked_lock); lockdep_assert_irqs_enabled(); #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif retval = copy_creds(p, clone_flags); if (retval < 0) goto bad_fork_free; retval = -EAGAIN; if (is_rlimit_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { if (p->real_cred->user != INIT_USER && !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) goto bad_fork_cleanup_count; } current->flags &= ~PF_NPROC_EXCEEDED; /* * If multiple threads are within copy_process(), then this check * triggers too late. This doesn't hurt, the check is only there * to stop root fork bombs. 
*/ retval = -EAGAIN; if (data_race(nr_threads >= max_threads)) goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE | PF_NO_SETAFFINITY); p->flags |= PF_FORKNOEXEC; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); rcu_copy_process(p); p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); init_sigpending(&p->pending); p->utime = p->stime = p->gtime = 0; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME p->utimescaled = p->stimescaled = 0; #endif prev_cputime_init(&p->prev_cputime); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN seqcount_init(&p->vtime.seqcount); p->vtime.starttime = 0; p->vtime.state = VTIME_INACTIVE; #endif #ifdef CONFIG_IO_URING p->io_uring = NULL; retval = io_uring_fork(p); if (unlikely(retval)) goto bad_fork_cleanup_delayacct; retval = -EAGAIN; #endif p->default_timer_slack_ns = current->timer_slack_ns; #ifdef CONFIG_PSI p->psi_flags = 0; #endif task_io_accounting_init(&p->ioac); acct_clear_integrals(p); posix_cputimers_init(&p->posix_cputimers); tick_dep_init_task(p); p->io_context = NULL; audit_set_context(p, NULL); cgroup_fork(p); if (args->kthread) { if (!set_kthread_struct(p)) goto bad_fork_cleanup_delayacct; } #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; goto bad_fork_cleanup_delayacct; } #endif #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock); #endif #ifdef CONFIG_TRACE_IRQFLAGS memset(&p->irqtrace, 0, sizeof(p->irqtrace)); p->irqtrace.hardirq_disable_ip = _THIS_IP_; p->irqtrace.softirq_enable_ip = _THIS_IP_; p->softirqs_enabled = 1; p->softirq_context = 0; #endif p->pagefault_disabled = 0; lockdep_init_task(p); p->blocked_on = NULL; /* not blocked yet */ #ifdef CONFIG_BCACHE p->sequential_io = 0; p->sequential_io_avg = 0; #endif #ifdef CONFIG_BPF_SYSCALL RCU_INIT_POINTER(p->bpf_storage, NULL); p->bpf_ctx = NULL; #endif unwind_task_init(p); /* Perform scheduler related setup. Assign this task to a CPU. 
*/ retval = sched_fork(clone_flags, p); if (retval) goto bad_fork_cleanup_policy; retval = perf_event_init_task(p, clone_flags); if (retval) goto bad_fork_sched_cancel_fork; retval = audit_alloc(p); if (retval) goto bad_fork_cleanup_perf; /* copy all the process information */ shm_init_task(p); retval = security_task_alloc(p, clone_flags); if (retval) goto bad_fork_cleanup_audit; retval = copy_semundo(clone_flags, p); if (retval) goto bad_fork_cleanup_security; retval = copy_files(clone_flags, p, args->no_files); if (retval) goto bad_fork_cleanup_semundo; retval = copy_fs(clone_flags, p); if (retval) goto bad_fork_cleanup_files; retval = copy_sighand(clone_flags, p); if (retval) goto bad_fork_cleanup_fs; retval = copy_signal(clone_flags, p); if (retval) goto bad_fork_cleanup_sighand; retval = copy_mm(clone_flags, p); if (retval) goto bad_fork_cleanup_signal; retval = copy_namespaces(clone_flags, p); if (retval) goto bad_fork_cleanup_mm; retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; retval = copy_thread(p, args); if (retval) goto bad_fork_cleanup_io; stackleak_task_init(p); if (pid != &init_struct_pid) { pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid, args->set_tid_size); if (IS_ERR(pid)) { retval = PTR_ERR(pid); goto bad_fork_cleanup_thread; } } /* * This has to happen after we've potentially unshared the file * descriptor table (so that the pidfd doesn't leak into the child * if the fd table isn't shared). */ if (clone_flags & CLONE_PIDFD) { unsigned flags = PIDFD_STALE; if (clone_flags & CLONE_THREAD) flags |= PIDFD_THREAD; if (clone_flags & CLONE_PIDFD_AUTOKILL) flags |= PIDFD_AUTOKILL; /* * Note that no task has been attached to @pid yet indicate * that via CLONE_PIDFD. */ retval = pidfd_prepare(pid, flags, &pidfile); if (retval < 0) goto bad_fork_free_pid; pidfd = retval; retval = put_user(pidfd, args->pidfd); if (retval) goto bad_fork_put_pidfd; } #ifdef CONFIG_BLOCK p->plug = NULL; #endif futex_init_task(p); /* * sigaltstack should be cleared when sharing the same VM */ if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) sas_ss_reset(p); /* * Syscall tracing and stepping should be turned off in the * child regardless of CLONE_PTRACE. */ user_disable_single_step(p); clear_task_syscall_work(p, SYSCALL_TRACE); #if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU) clear_task_syscall_work(p, SYSCALL_EMU); #endif clear_tsk_latency_tracing(p); /* ok, now we should be set up.. */ p->pid = pid_nr(pid); if (clone_flags & CLONE_THREAD) { p->group_leader = current->group_leader; p->tgid = current->tgid; } else { p->group_leader = p; p->tgid = p->pid; } p->nr_dirtied = 0; p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); p->dirty_paused_when = 0; p->pdeath_signal = 0; p->task_works = NULL; clear_posix_cputimers_work(p); #ifdef CONFIG_KRETPROBES p->kretprobe_instances.first = NULL; #endif #ifdef CONFIG_RETHOOK p->rethooks.first = NULL; #endif /* * Ensure that the cgroup subsystem policies allow the new process to be * forked. It should be noted that the new process's css_set can be changed * between here and cgroup_post_fork() if an organisation operation is in * progress. */ retval = cgroup_can_fork(p, args); if (retval) goto bad_fork_put_pidfd; /* * Now that the cgroups are pinned, re-clone the parent cgroup and put * the new task on the correct runqueue. All this *before* the task * becomes visible. 
* * This isn't part of ->can_fork() because while the re-cloning is * cgroup specific, it unconditionally needs to place the task on a * runqueue. */ retval = sched_cgroup_fork(p, args); if (retval) goto bad_fork_cancel_cgroup; /* * Allocate a default futex hash for the user process once the first * thread spawns. */ if (need_futex_hash_allocate_default(clone_flags)) { retval = futex_hash_allocate_default(); if (retval) goto bad_fork_cancel_cgroup; /* * If we fail beyond this point we don't free the allocated * futex hash map. We assume that another thread will be created * and makes use of it. The hash map will be freed once the main * thread terminates. */ } /* * From this point on we must avoid any synchronous user-space * communication until we take the tasklist-lock. In particular, we do * not want user-space to be able to predict the process start-time by * stalling fork(2) after we recorded the start_time but before it is * visible to the system. */ p->start_time = ktime_get_ns(); p->start_boottime = ktime_get_boottime_ns(); /* * Make it visible to the rest of the system, but dont wake it up yet. * Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; p->parent_exec_id = current->parent_exec_id; if (clone_flags & CLONE_THREAD) p->exit_signal = -1; else p->exit_signal = current->group_leader->exit_signal; } else { p->real_parent = current; p->parent_exec_id = current->self_exec_id; p->exit_signal = args->exit_signal; } klp_copy_process(p); sched_core_fork(p); spin_lock(¤t->sighand->siglock); rv_task_fork(p); rseq_fork(p, clone_flags); /* * If zap_pid_ns_processes() was called after alloc_pid(), the new * child missed SIGKILL. If current is not in the same namespace, * we can't rely on fatal_signal_pending() below. */ if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { retval = -ENOMEM; goto bad_fork_core_free; } /* Let kill terminate clone/fork in the middle */ if (fatal_signal_pending(current)) { retval = -EINTR; goto bad_fork_core_free; } /* No more failure paths after this point. */ /* * Copy seccomp details explicitly here, in case they were changed * before holding sighand lock. */ copy_seccomp(p); if (clone_flags & CLONE_NNP) task_set_no_new_privs(p); init_task_pid_links(p); if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); init_task_pid(p, PIDTYPE_PID, pid); if (thread_group_leader(p)) { init_task_pid(p, PIDTYPE_TGID, pid); init_task_pid(p, PIDTYPE_PGID, task_pgrp(current)); init_task_pid(p, PIDTYPE_SID, task_session(current)); if (is_child_reaper(pid)) { struct pid_namespace *ns = ns_of_pid(pid); ASSERT_EXCLUSIVE_WRITER(ns->child_reaper); WRITE_ONCE(ns->child_reaper, p); p->signal->flags |= SIGNAL_UNKILLABLE; } p->signal->shared_pending.signal = delayed.signal; p->signal->tty = tty_kref_get(current->signal->tty); /* * Inherit has_child_subreaper flag under the same * tasklist_lock with adding child to the process tree * for propagate_has_child_subreaper optimization. 
*/ p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper || p->real_parent->signal->is_child_subreaper; if (clone_flags & CLONE_AUTOREAP) p->signal->autoreap = 1; list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); attach_pid(p, PIDTYPE_TGID); attach_pid(p, PIDTYPE_PGID); attach_pid(p, PIDTYPE_SID); __this_cpu_inc(process_counts); } else { current->signal->nr_threads++; current->signal->quick_threads++; atomic_inc(¤t->signal->live); refcount_inc(¤t->signal->sigcnt); task_join_group_stop(p); list_add_tail_rcu(&p->thread_node, &p->signal->thread_head); } attach_pid(p, PIDTYPE_PID); nr_threads++; } total_forks++; hlist_del_init(&delayed.node); spin_unlock(¤t->sighand->siglock); syscall_tracepoint_update(p); write_unlock_irq(&tasklist_lock); if (pidfile) fd_install(pidfd, pidfile); proc_fork_connector(p); /* * sched_ext needs @p to be associated with its cgroup in its post_fork * hook. cgroup_post_fork() should come before sched_post_fork(). */ cgroup_post_fork(p, args); sched_post_fork(p); perf_event_fork(p); trace_task_newtask(p, clone_flags); uprobe_copy_process(p, clone_flags); user_events_fork(p, clone_flags); copy_oom_score_adj(clone_flags, p); return p; bad_fork_core_free: sched_core_free(p); spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); bad_fork_cancel_cgroup: cgroup_cancel_fork(p, args); bad_fork_put_pidfd: if (clone_flags & CLONE_PIDFD) { fput(pidfile); put_unused_fd(pidfd); } bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_thread: exit_thread(p); bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: exit_nsproxy_namespaces(p); bad_fork_cleanup_mm: if (p->mm) { mm_clear_owner(p->mm, p); mmput(p->mm); } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); bad_fork_cleanup_sighand: __cleanup_sighand(p->sighand); bad_fork_cleanup_fs: exit_fs(p); /* blocking */ bad_fork_cleanup_files: exit_files(p); /* blocking */ bad_fork_cleanup_semundo: exit_sem(p); bad_fork_cleanup_security: security_task_free(p); bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_perf: perf_event_free_task(p); bad_fork_sched_cancel_fork: sched_cancel_fork(p); bad_fork_cleanup_policy: lockdep_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); #endif bad_fork_cleanup_delayacct: io_uring_free(p); delayacct_tsk_free(p); bad_fork_cleanup_count: dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); exit_cred_namespaces(p); exit_creds(p); bad_fork_free: WRITE_ONCE(p->__state, TASK_DEAD); exit_task_stack_account(p); put_task_stack(p); delayed_free_task(p); fork_out: spin_lock_irq(¤t->sighand->siglock); hlist_del_init(&delayed.node); spin_unlock_irq(¤t->sighand->siglock); return ERR_PTR(retval); } static inline void init_idle_pids(struct task_struct *idle) { enum pid_type type; for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) { INIT_HLIST_NODE(&idle->pid_links[type]); /* not really needed */ init_task_pid(idle, type, &init_struct_pid); } } static int idle_dummy(void *dummy) { /* This function is never called */ return 0; } struct task_struct * __init fork_idle(int cpu) { struct task_struct *task; struct kernel_clone_args args = { .flags = CLONE_VM, .fn = &idle_dummy, .fn_arg = NULL, .kthread = 1, .idle = 1, }; task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args); if (!IS_ERR(task)) { init_idle_pids(task); init_idle(task, cpu); } return task; } /* * This is like kernel_clone(), but shaved down 
and tailored to just * creating io_uring workers. It returns a created task, or an error pointer. * The returned task is inactive, and the caller must fire it up through * wake_up_new_task(p). All signals are blocked in the created task. */ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) { unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| CLONE_IO|CLONE_VM|CLONE_UNTRACED; struct kernel_clone_args args = { .flags = flags, .fn = fn, .fn_arg = arg, .io_thread = 1, .user_worker = 1, }; return copy_process(NULL, 0, node, &args); } /* * Ok, this is the main fork-routine. * * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. * * args->exit_signal is expected to be checked for sanity by the caller. */ pid_t kernel_clone(struct kernel_clone_args *args) { u64 clone_flags = args->flags; struct completion vfork; struct pid *pid; struct task_struct *p; int trace = 0; pid_t nr; /* * Creating an empty mount namespace implies creating a new mount * namespace. Set this before copy_process() so that the * CLONE_NEWNS|CLONE_FS mutual exclusion check works correctly. */ if (clone_flags & CLONE_EMPTY_MNTNS) { clone_flags |= CLONE_NEWNS; args->flags = clone_flags; } /* * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate * field in struct clone_args and it still doesn't make sense to have * them both point at the same memory location. Performing this check * here has the advantage that we don't need to have a separate helper * to check for legacy clone(). */ if ((clone_flags & CLONE_PIDFD) && (clone_flags & CLONE_PARENT_SETTID) && (args->pidfd == args->parent_tid)) return -EINVAL; /* * Determine whether and which event to report to ptracer. When * called from kernel_thread or CLONE_UNTRACED is explicitly * requested, no event is reported; otherwise, report if the event * for the type of forking is enabled. */ if (!(clone_flags & CLONE_UNTRACED)) { if (clone_flags & CLONE_VFORK) trace = PTRACE_EVENT_VFORK; else if (args->exit_signal != SIGCHLD) trace = PTRACE_EVENT_CLONE; else trace = PTRACE_EVENT_FORK; if (likely(!ptrace_event_enabled(current, trace))) trace = 0; } p = copy_process(NULL, trace, NUMA_NO_NODE, args); add_latent_entropy(); if (IS_ERR(p)) return PTR_ERR(p); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. */ trace_sched_process_fork(current, p); pid = get_task_pid(p, PIDTYPE_PID); nr = pid_vnr(pid); if (clone_flags & CLONE_PARENT_SETTID) put_user(nr, args->parent_tid); if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; init_completion(&vfork); get_task_struct(p); } if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) { /* lock the task to synchronize with memcg migration */ task_lock(p); lru_gen_add_mm(p->mm); task_unlock(p); } wake_up_new_task(p); /* forking complete and child started to run, tell ptracer */ if (unlikely(trace)) ptrace_event_pid(trace, pid); if (clone_flags & CLONE_VFORK) { if (!wait_for_vfork_done(p, &vfork)) ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid); } put_pid(pid); return nr; } /* * Create a kernel thread. 
*/ pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, unsigned long flags) { struct kernel_clone_args args = { .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), .exit_signal = (flags & CSIGNAL), .fn = fn, .fn_arg = arg, .name = name, .kthread = 1, }; return kernel_clone(&args); } /* * Create a user mode thread. */ pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags) { struct kernel_clone_args args = { .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), .exit_signal = (flags & CSIGNAL), .fn = fn, .fn_arg = arg, }; return kernel_clone(&args); } #ifdef __ARCH_WANT_SYS_FORK SYSCALL_DEFINE0(fork) { #ifdef CONFIG_MMU struct kernel_clone_args args = { .exit_signal = SIGCHLD, }; return kernel_clone(&args); #else /* can not support in nommu mode */ return -EINVAL; #endif } #endif #ifdef __ARCH_WANT_SYS_VFORK SYSCALL_DEFINE0(vfork) { struct kernel_clone_args args = { .flags = CLONE_VFORK | CLONE_VM, .exit_signal = SIGCHLD, }; return kernel_clone(&args); } #endif #ifdef __ARCH_WANT_SYS_CLONE #ifdef CONFIG_CLONE_BACKWARDS SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int __user *, parent_tidptr, unsigned long, tls, int __user *, child_tidptr) #elif defined(CONFIG_CLONE_BACKWARDS2) SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags, int __user *, parent_tidptr, int __user *, child_tidptr, unsigned long, tls) #elif defined(CONFIG_CLONE_BACKWARDS3) SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp, int, stack_size, int __user *, parent_tidptr, int __user *, child_tidptr, unsigned long, tls) #else SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int __user *, parent_tidptr, int __user *, child_tidptr, unsigned long, tls) #endif { struct kernel_clone_args args = { .flags = (lower_32_bits(clone_flags) & ~CSIGNAL), .pidfd = parent_tidptr, .child_tid = child_tidptr, .parent_tid = parent_tidptr, .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL), .stack = newsp, .tls = tls, }; return kernel_clone(&args); } #endif static noinline int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, size_t usize) { int err; struct clone_args args; pid_t *kset_tid = kargs->set_tid; BUILD_BUG_ON(offsetofend(struct clone_args, tls) != CLONE_ARGS_SIZE_VER0); BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) != CLONE_ARGS_SIZE_VER1); BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) != CLONE_ARGS_SIZE_VER2); BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2); if (unlikely(usize > PAGE_SIZE)) return -E2BIG; if (unlikely(usize < CLONE_ARGS_SIZE_VER0)) return -EINVAL; err = copy_struct_from_user(&args, sizeof(args), uargs, usize); if (err) return err; if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL)) return -EINVAL; if (unlikely(!args.set_tid && args.set_tid_size > 0)) return -EINVAL; if (unlikely(args.set_tid && args.set_tid_size == 0)) return -EINVAL; /* * Verify that higher 32bits of exit_signal are unset and that * it is a valid signal */ if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) || !valid_signal(args.exit_signal))) return -EINVAL; if ((args.flags & CLONE_INTO_CGROUP) && (args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2)) return -EINVAL; *kargs = (struct kernel_clone_args){ .flags = args.flags, .pidfd = u64_to_user_ptr(args.pidfd), .child_tid = u64_to_user_ptr(args.child_tid), .parent_tid = u64_to_user_ptr(args.parent_tid), .exit_signal = args.exit_signal, .stack = args.stack, .stack_size = 
args.stack_size, .tls = args.tls, .set_tid_size = args.set_tid_size, .cgroup = args.cgroup, }; if (args.set_tid && copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid), (kargs->set_tid_size * sizeof(pid_t)))) return -EFAULT; kargs->set_tid = kset_tid; return 0; } /** * clone3_stack_valid - check and prepare stack * @kargs: kernel clone args * * Verify that the stack arguments userspace gave us are sane. * In addition, set the stack direction for userspace since it's easy for us to * determine. */ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs) { if (kargs->stack == 0) { if (kargs->stack_size > 0) return false; } else { if (kargs->stack_size == 0) return false; if (!access_ok((void __user *)kargs->stack, kargs->stack_size)) return false; #if !defined(CONFIG_STACK_GROWSUP) kargs->stack += kargs->stack_size; #endif } return true; } static bool clone3_args_valid(struct kernel_clone_args *kargs) { /* Verify that no unknown flags are passed along. */ if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP | CLONE_AUTOREAP | CLONE_NNP | CLONE_PIDFD_AUTOKILL | CLONE_EMPTY_MNTNS)) return false; /* * - make the CLONE_DETACHED bit reusable for clone3 * - make the CSIGNAL bits reusable for clone3 */ if (kargs->flags & (CLONE_DETACHED | (CSIGNAL & (~CLONE_NEWTIME)))) return false; if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) == (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) return false; if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) && kargs->exit_signal) return false; if (!clone3_stack_valid(kargs)) return false; return true; } /** * sys_clone3 - create a new process with specific properties * @uargs: argument structure * @size: size of @uargs * * clone3() is the extensible successor to clone()/clone2(). * It takes a struct as argument that is versioned by its size. * * Return: On success, a positive PID for the child process. * On error, a negative errno number. */ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) { int err; struct kernel_clone_args kargs; pid_t set_tid[MAX_PID_NS_LEVEL]; #ifdef __ARCH_BROKEN_SYS_CLONE3 #warning clone3() entry point is missing, please fix return -ENOSYS; #endif kargs.set_tid = set_tid; err = copy_clone_args_from_user(&kargs, uargs, size); if (err) return err; if (!clone3_args_valid(&kargs)) return -EINVAL; return kernel_clone(&kargs); } void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data) { struct task_struct *leader, *parent, *child; int res; read_lock(&tasklist_lock); leader = top = top->group_leader; down: for_each_thread(leader, parent) { list_for_each_entry(child, &parent->children, sibling) { res = visitor(child, data); if (res) { if (res < 0) goto out; leader = child; goto down; } up: ; } } if (leader != top) { child = leader; parent = child->real_parent; leader = parent->group_leader; goto up; } out: read_unlock(&tasklist_lock); } #ifndef ARCH_MIN_MMSTRUCT_ALIGN #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif static void sighand_ctor(void *data) { struct sighand_struct *sighand = data; spin_lock_init(&sighand->siglock); init_waitqueue_head(&sighand->signalfd_wqh); } void __init mm_cache_init(void) { unsigned int mm_size; /* * The mm_cpumask is located at the end of mm_struct, and is * dynamically sized based on the maximum CPU number this system * can have, taking hotplug into account (nr_cpu_ids). 
*/ mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size(); mm_cachep = kmem_cache_create_usercopy("mm_struct", mm_size, ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, offsetof(struct mm_struct, saved_auxv), sizeof_field(struct mm_struct, saved_auxv), NULL); } void __init proc_caches_init(void) { sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); mmap_init(); nsproxy_cache_init(); } /* * Check constraints on flags passed to the unshare system call. */ static int check_unshare_flags(unsigned long unshare_flags) { if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| CLONE_NS_ALL | UNSHARE_EMPTY_MNTNS)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing * to unshare. Note that unsharing the address space or the * signal handlers also need to unshare the signal queues (aka * CLONE_THREAD). */ if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) { if (!thread_group_empty(current)) return -EINVAL; } if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) { if (refcount_read(¤t->sighand->count) > 1) return -EINVAL; } if (unshare_flags & CLONE_VM) { if (!current_is_single_threaded()) return -EINVAL; } return 0; } /* * Unshare the filesystem structure if it is being shared */ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) { struct fs_struct *fs = current->fs; if (!(unshare_flags & CLONE_FS) || !fs) return 0; /* don't need lock here; in the worst case we'll do useless copy */ if (!(unshare_flags & CLONE_NEWNS) && fs->users == 1) return 0; *new_fsp = copy_fs_struct(fs); if (!*new_fsp) return -ENOMEM; return 0; } /* * Unshare file descriptor table if it is being shared */ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) { struct files_struct *fd = current->files; if ((unshare_flags & CLONE_FILES) && (fd && atomic_read(&fd->count) > 1)) { fd = dup_fd(fd, NULL); if (IS_ERR(fd)) return PTR_ERR(fd); *new_fdp = fd; } return 0; } /* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. copy_* * functions used by kernel_clone() cannot be used here directly * because they modify an inactive task_struct that is being * constructed. Here we are modifying the current, active, * task_struct. */ int ksys_unshare(unsigned long unshare_flags) { struct fs_struct *fs, *new_fs = NULL; struct files_struct *new_fd = NULL; struct cred *new_cred = NULL; struct nsproxy *new_nsproxy = NULL; int do_sysvsem = 0; int err; /* * If unsharing a user namespace must also unshare the thread group * and unshare the filesystem root and working directories. */ if (unshare_flags & CLONE_NEWUSER) unshare_flags |= CLONE_THREAD | CLONE_FS; /* * If unsharing vm, must also unshare signal handlers. */ if (unshare_flags & CLONE_VM) unshare_flags |= CLONE_SIGHAND; /* * If unsharing a signal handlers, must also unshare the signal queues. 
*/ if (unshare_flags & CLONE_SIGHAND) unshare_flags |= CLONE_THREAD; /* * If unsharing namespace, must also unshare filesystem information. */ if (unshare_flags & UNSHARE_EMPTY_MNTNS) unshare_flags |= CLONE_NEWNS; if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; err = check_unshare_flags(unshare_flags); if (err) goto bad_unshare_out; /* * CLONE_NEWIPC must also detach from the undolist: after switching * to a new ipc namespace, the semaphore arrays from the old * namespace are unreachable. */ if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) do_sysvsem = 1; err = unshare_fs(unshare_flags, &new_fs); if (err) goto bad_unshare_out; err = unshare_fd(unshare_flags, &new_fd); if (err) goto bad_unshare_cleanup_fs; err = unshare_userns(unshare_flags, &new_cred); if (err) goto bad_unshare_cleanup_fd; err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_cred, new_fs); if (err) goto bad_unshare_cleanup_cred; if (new_cred) { err = set_cred_ucounts(new_cred); if (err) goto bad_unshare_cleanup_nsproxy; } if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { if (do_sysvsem) { /* * CLONE_SYSVSEM is equivalent to sys_exit(). */ exit_sem(current); } if (unshare_flags & CLONE_NEWIPC) { /* Orphan segments in old ns (see sem above). */ exit_shm(current); shm_init_task(current); } if (new_nsproxy) { switch_task_namespaces(current, new_nsproxy); new_nsproxy = NULL; } task_lock(current); if (new_fs) { fs = current->fs; read_seqlock_excl(&fs->seq); current->fs = new_fs; if (--fs->users) new_fs = NULL; else new_fs = fs; read_sequnlock_excl(&fs->seq); } if (new_fd) swap(current->files, new_fd); task_unlock(current); if (new_cred) { /* Install the new user namespace */ commit_creds(new_cred); new_cred = NULL; } } perf_event_namespaces(current); bad_unshare_cleanup_nsproxy: if (new_nsproxy) put_nsproxy(new_nsproxy); bad_unshare_cleanup_cred: if (new_cred) put_cred(new_cred); bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); bad_unshare_cleanup_fs: if (new_fs) free_fs_struct(new_fs); bad_unshare_out: return err; } SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) { return ksys_unshare(unshare_flags); } /* * Helper to unshare the files of the current task. * We don't want to expose copy_files internals to * the exec layer of the kernel. */ int unshare_files(void) { struct task_struct *task = current; struct files_struct *old, *copy = NULL; int error; error = unshare_fd(CLONE_FILES, ©); if (error || !copy) return error; old = task->files; task_lock(task); task->files = copy; task_unlock(task); put_files_struct(old); return 0; } static int sysctl_max_threads(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int ret; int threads = max_threads; int min = 1; int max = MAX_THREADS; t = *table; t.data = &threads; t.extra1 = &min; t.extra2 = &max; ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); if (ret || !write) return ret; max_threads = threads; return 0; } static const struct ctl_table fork_sysctl_table[] = { { .procname = "threads-max", .data = NULL, .maxlen = sizeof(int), .mode = 0644, .proc_handler = sysctl_max_threads, }, }; static int __init init_fork_sysctl(void) { register_sysctl_init("kernel", fork_sysctl_table); return 0; } subsys_initcall(init_fork_sysctl); |
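/*
 * Illustrative userspace sketch (not taken from the kernel tree) of calling
 * the clone3() entry point defined above through syscall(2). It assumes
 * Linux 5.3+ UAPI headers that provide struct clone_args and __NR_clone3;
 * only exit_signal is set, which makes the call behave like a plain fork().
 */
#define _GNU_SOURCE
#include <linux/sched.h>	/* struct clone_args */
#include <sys/syscall.h>	/* __NR_clone3 */
#include <sys/wait.h>
#include <signal.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

static pid_t clone3_fork(void)
{
	struct clone_args args;

	memset(&args, 0, sizeof(args));	/* unused/extension fields must be zero */
	args.exit_signal = SIGCHLD;	/* reapable with waitpid(), like fork() */

	return syscall(__NR_clone3, &args, sizeof(args));
}

int main(void)
{
	pid_t pid = clone3_fork();

	if (pid < 0) {
		perror("clone3");
		return 1;
	}
	if (pid == 0) {
		printf("child: pid=%d\n", getpid());
		_exit(0);
	}
	waitpid(pid, NULL, 0);
	return 0;
}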
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 | /* SPDX-License-Identifier: GPL-2.0 */ /* * USB PHY defines * * These APIs may be used between USB controllers. USB device drivers * (for either host or peripheral roles) don't use these calls; they * continue to use just usb_device and usb_gadget. */ #ifndef __LINUX_USB_PHY_H #define __LINUX_USB_PHY_H #include <linux/extcon.h> #include <linux/notifier.h> #include <linux/usb.h> #include <uapi/linux/usb/charger.h> enum usb_phy_interface { USBPHY_INTERFACE_MODE_UNKNOWN, USBPHY_INTERFACE_MODE_UTMI, USBPHY_INTERFACE_MODE_UTMIW, USBPHY_INTERFACE_MODE_ULPI, USBPHY_INTERFACE_MODE_SERIAL, USBPHY_INTERFACE_MODE_HSIC, }; enum usb_phy_events { USB_EVENT_NONE, /* no events or cable disconnected */ USB_EVENT_VBUS, /* vbus valid event */ USB_EVENT_ID, /* id was grounded */ USB_EVENT_CHARGER, /* usb dedicated charger */ USB_EVENT_ENUMERATED, /* gadget driver enumerated */ }; /* associate a type with PHY */ enum usb_phy_type { USB_PHY_TYPE_UNDEFINED, USB_PHY_TYPE_USB2, USB_PHY_TYPE_USB3, }; /* OTG defines lots of enumeration states before device reset */ enum usb_otg_state { OTG_STATE_UNDEFINED = 0, /* single-role peripheral, and dual-role default-b */ OTG_STATE_B_IDLE, OTG_STATE_B_SRP_INIT, OTG_STATE_B_PERIPHERAL, /* extra dual-role default-b states */ OTG_STATE_B_WAIT_ACON, OTG_STATE_B_HOST, /* dual-role default-a */ OTG_STATE_A_IDLE, OTG_STATE_A_WAIT_VRISE, OTG_STATE_A_WAIT_BCON, OTG_STATE_A_HOST, OTG_STATE_A_SUSPEND, OTG_STATE_A_PERIPHERAL, OTG_STATE_A_WAIT_VFALL, OTG_STATE_A_VBUS_ERR, }; struct usb_phy; struct usb_otg; /* for phys connected thru an ULPI interface, the user must * provide access ops */ struct usb_phy_io_ops { int (*read)(struct usb_phy *x, u32 reg); int (*write)(struct usb_phy *x, u32 val, u32 reg); }; struct usb_charger_current { unsigned int sdp_min; unsigned int sdp_max; unsigned int dcp_min; unsigned int dcp_max; unsigned int cdp_min; unsigned int cdp_max; unsigned int aca_min; unsigned int aca_max; }; struct usb_phy { struct device *dev; const char *label; unsigned int flags; enum usb_phy_type type; enum usb_phy_events last_event; struct usb_otg *otg; struct device *io_dev; struct usb_phy_io_ops *io_ops; void __iomem *io_priv; /* to support extcon device */ struct extcon_dev *edev; 
struct extcon_dev *id_edev; struct notifier_block vbus_nb; struct notifier_block id_nb; struct notifier_block type_nb; /* Support USB charger */ enum usb_charger_type chg_type; enum usb_charger_state chg_state; struct usb_charger_current chg_cur; struct work_struct chg_work; /* for notification of usb_phy_events */ struct atomic_notifier_head notifier; /* to pass extra port status to the root hub */ u16 port_status; u16 port_change; /* to support controllers that have multiple phys */ struct list_head head; /* initialize/shutdown the phy */ int (*init)(struct usb_phy *x); void (*shutdown)(struct usb_phy *x); /* enable/disable VBUS */ int (*set_vbus)(struct usb_phy *x, int on); /* effective for B devices, ignored for A-peripheral */ int (*set_power)(struct usb_phy *x, unsigned mA); /* Set phy into suspend mode */ int (*set_suspend)(struct usb_phy *x, int suspend); /* * Set wakeup enable for PHY, in that case, the PHY can be * woken up from suspend status due to external events, * like vbus change, dp/dm change and id. */ int (*set_wakeup)(struct usb_phy *x, bool enabled); /* notify phy connect status change */ int (*notify_connect)(struct usb_phy *x, enum usb_device_speed speed); int (*notify_disconnect)(struct usb_phy *x, enum usb_device_speed speed); /* * Charger detection method can be implemented if you need to * manually detect the charger type. */ enum usb_charger_type (*charger_detect)(struct usb_phy *x); }; /* for board-specific init logic */ extern int usb_add_phy(struct usb_phy *, enum usb_phy_type type); extern int usb_add_phy_dev(struct usb_phy *); extern void usb_remove_phy(struct usb_phy *); /* helpers for direct access thru low-level io interface */ static inline int usb_phy_io_read(struct usb_phy *x, u32 reg) { if (x && x->io_ops && x->io_ops->read) return x->io_ops->read(x, reg); return -EINVAL; } static inline int usb_phy_io_write(struct usb_phy *x, u32 val, u32 reg) { if (x && x->io_ops && x->io_ops->write) return x->io_ops->write(x, val, reg); return -EINVAL; } static inline int usb_phy_init(struct usb_phy *x) { if (x && x->init) return x->init(x); return 0; } static inline void usb_phy_shutdown(struct usb_phy *x) { if (x && x->shutdown) x->shutdown(x); } static inline int usb_phy_vbus_on(struct usb_phy *x) { if (!x || !x->set_vbus) return 0; return x->set_vbus(x, true); } static inline int usb_phy_vbus_off(struct usb_phy *x) { if (!x || !x->set_vbus) return 0; return x->set_vbus(x, false); } /* for usb host and peripheral controller drivers */ #if IS_ENABLED(CONFIG_USB_PHY) extern struct usb_phy *usb_get_phy(enum usb_phy_type type); extern struct usb_phy *devm_usb_get_phy(struct device *dev, enum usb_phy_type type); extern struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev, const char *phandle, u8 index); extern struct usb_phy *devm_usb_get_phy_by_node(struct device *dev, struct device_node *node, struct notifier_block *nb); extern void usb_put_phy(struct usb_phy *); extern void usb_phy_set_event(struct usb_phy *x, unsigned long event); extern void usb_phy_set_charger_current(struct usb_phy *usb_phy, unsigned int mA); extern void usb_phy_get_charger_current(struct usb_phy *usb_phy, unsigned int *min, unsigned int *max); extern void usb_phy_set_charger_state(struct usb_phy *usb_phy, enum usb_charger_state state); #else static inline struct usb_phy *usb_get_phy(enum usb_phy_type type) { return ERR_PTR(-ENXIO); } static inline struct usb_phy *devm_usb_get_phy(struct device *dev, enum usb_phy_type type) { return ERR_PTR(-ENXIO); } static inline struct usb_phy 
*devm_usb_get_phy_by_phandle(struct device *dev, const char *phandle, u8 index) { return ERR_PTR(-ENXIO); } static inline struct usb_phy *devm_usb_get_phy_by_node(struct device *dev, struct device_node *node, struct notifier_block *nb) { return ERR_PTR(-ENXIO); } static inline void usb_put_phy(struct usb_phy *x) { } static inline void usb_phy_set_event(struct usb_phy *x, unsigned long event) { } static inline void usb_phy_set_charger_current(struct usb_phy *usb_phy, unsigned int mA) { } static inline void usb_phy_get_charger_current(struct usb_phy *usb_phy, unsigned int *min, unsigned int *max) { } static inline void usb_phy_set_charger_state(struct usb_phy *usb_phy, enum usb_charger_state state) { } #endif static inline int usb_phy_set_power(struct usb_phy *x, unsigned mA) { if (!x) return 0; usb_phy_set_charger_current(x, mA); if (x->set_power) return x->set_power(x, mA); return 0; } /* Context: can sleep */ static inline int usb_phy_set_suspend(struct usb_phy *x, int suspend) { if (x && x->set_suspend != NULL) return x->set_suspend(x, suspend); else return 0; } static inline int usb_phy_set_wakeup(struct usb_phy *x, bool enabled) { if (x && x->set_wakeup) return x->set_wakeup(x, enabled); else return 0; } static inline int usb_phy_notify_connect(struct usb_phy *x, enum usb_device_speed speed) { if (x && x->notify_connect) return x->notify_connect(x, speed); else return 0; } static inline int usb_phy_notify_disconnect(struct usb_phy *x, enum usb_device_speed speed) { if (x && x->notify_disconnect) return x->notify_disconnect(x, speed); else return 0; } /* notifiers */ static inline int usb_register_notifier(struct usb_phy *x, struct notifier_block *nb) { return atomic_notifier_chain_register(&x->notifier, nb); } static inline void usb_unregister_notifier(struct usb_phy *x, struct notifier_block *nb) { atomic_notifier_chain_unregister(&x->notifier, nb); } static inline const char *usb_phy_type_string(enum usb_phy_type type) { switch (type) { case USB_PHY_TYPE_USB2: return "USB2 PHY"; case USB_PHY_TYPE_USB3: return "USB3 PHY"; default: return "UNKNOWN PHY TYPE"; } } #endif /* __LINUX_USB_PHY_H */ |
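/*
 * Illustrative sketch of a controller driver consuming the API above: grab a
 * USB2 PHY with devm_usb_get_phy(), initialise it, and listen for VBUS/ID
 * events through the notifier helpers. foo_udc, foo_vbus_notify and
 * foo_udc_setup are hypothetical names, not an existing driver.
 */
#include <linux/device.h>
#include <linux/err.h>
#include <linux/notifier.h>
#include <linux/usb/phy.h>

struct foo_udc {
	struct usb_phy *phy;
	struct notifier_block nb;
};

static int foo_vbus_notify(struct notifier_block *nb, unsigned long event,
			   void *data)
{
	/* e.g. start the gadget on USB_EVENT_VBUS, stop it on USB_EVENT_NONE */
	return NOTIFY_OK;
}

static int foo_udc_setup(struct device *dev, struct foo_udc *udc)
{
	int ret;

	udc->phy = devm_usb_get_phy(dev, USB_PHY_TYPE_USB2);
	if (IS_ERR(udc->phy))
		return PTR_ERR(udc->phy);	/* -ENXIO when CONFIG_USB_PHY=n */

	ret = usb_phy_init(udc->phy);
	if (ret)
		return ret;

	udc->nb.notifier_call = foo_vbus_notify;
	return usb_register_notifier(udc->phy, &udc->nb);
}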
/* SPDX-License-Identifier: GPL-2.0 */
/* Perform sanity checking for object sizes for uaccess.h and uio.h. */
#ifndef __LINUX_UCOPYSIZE_H__
#define __LINUX_UCOPYSIZE_H__

#include <linux/bug.h>

#ifdef CONFIG_HARDENED_USERCOPY
#include <linux/jump_label.h>
extern void __check_object_size(const void *ptr, unsigned long n,
				bool to_user);

DECLARE_STATIC_KEY_MAYBE(CONFIG_HARDENED_USERCOPY_DEFAULT_ON,
			  validate_usercopy_range);

static __always_inline void check_object_size(const void *ptr, unsigned long n,
					      bool to_user)
{
	if (!__builtin_constant_p(n) &&
	    static_branch_maybe(CONFIG_HARDENED_USERCOPY_DEFAULT_ON,
				&validate_usercopy_range)) {
		__check_object_size(ptr, n, to_user);
	}
}
#else
static inline void check_object_size(const void *ptr, unsigned long n,
				     bool to_user)
{ }
#endif /* CONFIG_HARDENED_USERCOPY */

extern void __compiletime_error("copy source size is too small")
__bad_copy_from(void);
extern void __compiletime_error("copy destination size is too small")
__bad_copy_to(void);

void __copy_overflow(int size, unsigned long count);

static inline void copy_overflow(int size, unsigned long count)
{
	if (IS_ENABLED(CONFIG_BUG))
		__copy_overflow(size, count);
}

static __always_inline __must_check bool
check_copy_size(const void *addr, size_t bytes, bool is_source)
{
	int sz = __builtin_object_size(addr, 0);

	if (unlikely(sz >= 0 && sz < bytes)) {
		if (!__builtin_constant_p(bytes))
			copy_overflow(sz, bytes);
		else if (is_source)
			__bad_copy_from();
		else
			__bad_copy_to();
		return false;
	}
	if (WARN_ON_ONCE(bytes > INT_MAX))
		return false;
	check_object_size(addr, bytes, is_source);
	return true;
}

#endif /* __LINUX_UCOPYSIZE_H__ */
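/*
 * Sketch of the calling pattern check_copy_size() is written for: the generic
 * copy_{to,from}_user() wrappers validate the kernel-side buffer before
 * handing off to the arch copy routine. Simplified and hypothetical
 * (example_copy_to_user); the real wrappers live in include/linux/uaccess.h.
 */
static __always_inline unsigned long
example_copy_to_user(void __user *to, const void *from, unsigned long n)
{
	if (!check_copy_size(from, n, true))	/* kernel buffer is the source */
		return n;			/* nothing copied */
	return raw_copy_to_user(to, from, n);	/* arch-provided primitive */
}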
| 7 5 1 3 5 6 6 7 1 5 2 3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 | // SPDX-License-Identifier: GPL-2.0 /* * Convert integer string representation to an integer. * If an integer doesn't fit into specified type, -E is returned. * * Integer starts with optional sign. * kstrtou*() functions do not accept sign "-". * * Radix 0 means autodetection: leading "0x" implies radix 16, * leading "0" implies radix 8, otherwise radix is 10. * Autodetection hints work after optional sign, but not before. * * If -E is returned, result is not touched. */ #include <linux/ctype.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/kstrtox.h> #include <linux/math64.h> #include <linux/types.h> #include <linux/uaccess.h> #include "kstrtox.h" noinline const char *_parse_integer_fixup_radix(const char *s, unsigned int *base) { if (*base == 0) { if (s[0] == '0') { if (_tolower(s[1]) == 'x' && isxdigit(s[2])) *base = 16; else *base = 8; } else *base = 10; } if (*base == 16 && s[0] == '0' && _tolower(s[1]) == 'x') s += 2; return s; } /* * Convert non-negative integer string representation in explicitly given radix * to an integer. A maximum of max_chars characters will be converted. * * Return number of characters consumed maybe or-ed with overflow bit. * If overflow occurs, result integer (incorrect) is still returned. * * Don't you dare use this function. 
*/ noinline unsigned int _parse_integer_limit(const char *s, unsigned int base, unsigned long long *p, size_t max_chars) { unsigned long long res; unsigned int rv; res = 0; rv = 0; while (max_chars--) { unsigned int c = *s; unsigned int lc = _tolower(c); unsigned int val; if ('0' <= c && c <= '9') val = c - '0'; else if ('a' <= lc && lc <= 'f') val = lc - 'a' + 10; else break; if (val >= base) break; /* * Check for overflow only if we are within range of * it in the max base we support (16) */ if (unlikely(res & (~0ull << 60))) { if (res > div_u64(ULLONG_MAX - val, base)) rv |= KSTRTOX_OVERFLOW; } res = res * base + val; rv++; s++; } *p = res; return rv; } noinline unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long *p) { return _parse_integer_limit(s, base, p, INT_MAX); } static int _kstrtoull(const char *s, unsigned int base, unsigned long long *res) { unsigned long long _res; unsigned int rv; s = _parse_integer_fixup_radix(s, &base); rv = _parse_integer(s, base, &_res); if (rv & KSTRTOX_OVERFLOW) return -ERANGE; if (rv == 0) return -EINVAL; s += rv; if (*s == '\n') s++; if (*s) return -EINVAL; *res = _res; return 0; } /** * kstrtoull - convert a string to an unsigned long long * @s: The start of the string. The string must be null-terminated, and may also * include a single newline before its terminating null. The first character * may also be a plus sign, but not a minus sign. * @base: The number base to use. The maximum supported base is 16. If base is * given as 0, then the base of the string is automatically detected with the * conventional semantics - If it begins with 0x the number will be parsed as a * hexadecimal (case insensitive), if it otherwise begins with 0, it will be * parsed as an octal number. Otherwise it will be parsed as a decimal. * @res: Where to write the result of the conversion on success. * * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. * Preferred over simple_strtoull(). Return code must be checked. */ noinline int kstrtoull(const char *s, unsigned int base, unsigned long long *res) { if (s[0] == '+') s++; return _kstrtoull(s, base, res); } EXPORT_SYMBOL(kstrtoull); /** * kstrtoll - convert a string to a long long * @s: The start of the string. The string must be null-terminated, and may also * include a single newline before its terminating null. The first character * may also be a plus sign or a minus sign. * @base: The number base to use. The maximum supported base is 16. If base is * given as 0, then the base of the string is automatically detected with the * conventional semantics - If it begins with 0x the number will be parsed as a * hexadecimal (case insensitive), if it otherwise begins with 0, it will be * parsed as an octal number. Otherwise it will be parsed as a decimal. * @res: Where to write the result of the conversion on success. * * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. * Preferred over simple_strtoll(). Return code must be checked. */ noinline int kstrtoll(const char *s, unsigned int base, long long *res) { unsigned long long tmp; int rv; if (s[0] == '-') { rv = _kstrtoull(s + 1, base, &tmp); if (rv < 0) return rv; if ((long long)-tmp > 0) return -ERANGE; *res = -tmp; } else { rv = kstrtoull(s, base, &tmp); if (rv < 0) return rv; if ((long long)tmp < 0) return -ERANGE; *res = tmp; } return 0; } EXPORT_SYMBOL(kstrtoll); /* Internal, do not use. 
*/ int _kstrtoul(const char *s, unsigned int base, unsigned long *res) { unsigned long long tmp; int rv; rv = kstrtoull(s, base, &tmp); if (rv < 0) return rv; if (tmp != (unsigned long)tmp) return -ERANGE; *res = tmp; return 0; } EXPORT_SYMBOL(_kstrtoul); /* Internal, do not use. */ int _kstrtol(const char *s, unsigned int base, long *res) { long long tmp; int rv; rv = kstrtoll(s, base, &tmp); if (rv < 0) return rv; if (tmp != (long)tmp) return -ERANGE; *res = tmp; return 0; } EXPORT_SYMBOL(_kstrtol); /** * kstrtouint - convert a string to an unsigned int * @s: The start of the string. The string must be null-terminated, and may also * include a single newline before its terminating null. The first character * may also be a plus sign, but not a minus sign. * @base: The number base to use. The maximum supported base is 16. If base is * given as 0, then the base of the string is automatically detected with the * conventional semantics - If it begins with 0x the number will be parsed as a * hexadecimal (case insensitive), if it otherwise begins with 0, it will be * parsed as an octal number. Otherwise it will be parsed as a decimal. * @res: Where to write the result of the conversion on success. * * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. * Preferred over simple_strtoul(). Return code must be checked. */ noinline int kstrtouint(const char *s, unsigned int base, unsigned int *res) { unsigned long long tmp; int rv; rv = kstrtoull(s, base, &tmp); if (rv < 0) return rv; if (tmp != (unsigned int)tmp) return -ERANGE; *res = tmp; return 0; } EXPORT_SYMBOL(kstrtouint); /** * kstrtoint - convert a string to an int * @s: The start of the string. The string must be null-terminated, and may also * include a single newline before its terminating null. The first character * may also be a plus sign or a minus sign. * @base: The number base to use. The maximum supported base is 16. If base is * given as 0, then the base of the string is automatically detected with the * conventional semantics - If it begins with 0x the number will be parsed as a * hexadecimal (case insensitive), if it otherwise begins with 0, it will be * parsed as an octal number. Otherwise it will be parsed as a decimal. * @res: Where to write the result of the conversion on success. * * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. * Preferred over simple_strtol(). Return code must be checked. 
*/ noinline int kstrtoint(const char *s, unsigned int base, int *res) { long long tmp; int rv; rv = kstrtoll(s, base, &tmp); if (rv < 0) return rv; if (tmp != (int)tmp) return -ERANGE; *res = tmp; return 0; } EXPORT_SYMBOL(kstrtoint); noinline int kstrtou16(const char *s, unsigned int base, u16 *res) { unsigned long long tmp; int rv; rv = kstrtoull(s, base, &tmp); if (rv < 0) return rv; if (tmp != (u16)tmp) return -ERANGE; *res = tmp; return 0; } EXPORT_SYMBOL(kstrtou16); noinline int kstrtos16(const char *s, unsigned int base, s16 *res) { long long tmp; int rv; rv = kstrtoll(s, base, &tmp); if (rv < 0) return rv; if (tmp != (s16)tmp) return -ERANGE; *res = tmp; return 0; } EXPORT_SYMBOL(kstrtos16); noinline int kstrtou8(const char *s, unsigned int base, u8 *res) { unsigned long long tmp; int rv; rv = kstrtoull(s, base, &tmp); if (rv < 0) return rv; if (tmp != (u8)tmp) return -ERANGE; *res = tmp; return 0; } EXPORT_SYMBOL(kstrtou8); noinline int kstrtos8(const char *s, unsigned int base, s8 *res) { long long tmp; int rv; rv = kstrtoll(s, base, &tmp); if (rv < 0) return rv; if (tmp != (s8)tmp) return -ERANGE; *res = tmp; return 0; } EXPORT_SYMBOL(kstrtos8); /** * kstrtobool - convert common user inputs into boolean values * @s: input string * @res: result * * This routine returns 0 iff the first character is one of 'EeYyTt1DdNnFf0', * or [oO][NnFf] for "on" and "off". Otherwise it will return -EINVAL. Value * pointed to by res is updated upon finding a match. */ noinline int kstrtobool(const char *s, bool *res) { if (!s) return -EINVAL; switch (s[0]) { case 'e': case 'E': case 'y': case 'Y': case 't': case 'T': case '1': *res = true; return 0; case 'd': case 'D': case 'n': case 'N': case 'f': case 'F': case '0': *res = false; return 0; case 'o': case 'O': switch (s[1]) { case 'n': case 'N': *res = true; return 0; case 'f': case 'F': *res = false; return 0; default: break; } break; default: break; } return -EINVAL; } EXPORT_SYMBOL(kstrtobool); /* * Since "base" would be a nonsense argument, this open-codes the * _from_user helper instead of using the helper macro below. */ int kstrtobool_from_user(const char __user *s, size_t count, bool *res) { /* Longest string needed to differentiate, newline, terminator */ char buf[4]; count = min(count, sizeof(buf) - 1); if (copy_from_user(buf, s, count)) return -EFAULT; buf[count] = '\0'; return kstrtobool(buf, res); } EXPORT_SYMBOL(kstrtobool_from_user); #define kstrto_from_user(f, g, type) \ int f(const char __user *s, size_t count, unsigned int base, type *res) \ { \ /* sign, base 2 representation, newline, terminator */ \ char buf[1 + sizeof(type) * 8 + 1 + 1]; \ \ count = min(count, sizeof(buf) - 1); \ if (copy_from_user(buf, s, count)) \ return -EFAULT; \ buf[count] = '\0'; \ return g(buf, base, res); \ } \ EXPORT_SYMBOL(f) kstrto_from_user(kstrtoull_from_user, kstrtoull, unsigned long long); kstrto_from_user(kstrtoll_from_user, kstrtoll, long long); kstrto_from_user(kstrtoul_from_user, kstrtoul, unsigned long); kstrto_from_user(kstrtol_from_user, kstrtol, long); kstrto_from_user(kstrtouint_from_user, kstrtouint, unsigned int); kstrto_from_user(kstrtoint_from_user, kstrtoint, int); kstrto_from_user(kstrtou16_from_user, kstrtou16, u16); kstrto_from_user(kstrtos16_from_user, kstrtos16, s16); kstrto_from_user(kstrtou8_from_user, kstrtou8, u8); kstrto_from_user(kstrtos8_from_user, kstrtos8, s8); |
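/*
 * Illustrative sketch, not part of the file above: the usual consumer of
 * these helpers is a sysfs or module-parameter store handler. The
 * threshold_store() handler and example_threshold variable are hypothetical;
 * only kstrtouint() and its 0 / -EINVAL / -ERANGE convention come from the
 * code above.
 */
#include <linux/device.h>
#include <linux/kstrtox.h>

static unsigned int example_threshold;	/* hypothetical backing variable */

static ssize_t threshold_store(struct device *dev, struct device_attribute *attr,
			       const char *buf, size_t count)
{
	unsigned int val;
	int ret;

	/* base 0 auto-detects "16", "0x10" and "020"; a trailing newline is accepted */
	ret = kstrtouint(buf, 0, &val);
	if (ret)
		return ret;	/* -EINVAL on junk input, -ERANGE on overflow */

	example_threshold = val;
	return count;
}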
914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 | /* * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/module.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/gfp.h> #include <linux/in.h> #include <linux/ipv6.h> #include <linux/poll.h> #include <net/sock.h> #include "rds.h" /* this is just used for stats gathering :/ */ static DEFINE_SPINLOCK(rds_sock_lock); static unsigned long rds_sock_count; static LIST_HEAD(rds_sock_list); DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq); /* * This is called as the final descriptor referencing this socket is closed. * We have to unbind the socket so that another socket can be bound to the * address it was using. * * We have to be careful about racing with the incoming path. sock_orphan() * sets SOCK_DEAD and we use that as an indicator to the rx path that new * messages shouldn't be queued. */ static int rds_release(struct socket *sock) { struct sock *sk = sock->sk; struct rds_sock *rs; if (!sk) goto out; rs = rds_sk_to_rs(sk); sock_orphan(sk); /* Note - rds_clear_recv_queue grabs rs_recv_lock, so * that ensures the recv path has completed messing * with the socket. */ rds_clear_recv_queue(rs); rds_cong_remove_socket(rs); rds_remove_bound(rs); rds_send_drop_to(rs, NULL); rds_rdma_drop_keys(rs); rds_notify_queue_get(rs, NULL); rds_notify_msg_zcopy_purge(&rs->rs_zcookie_queue); spin_lock_bh(&rds_sock_lock); list_del_init(&rs->rs_item); rds_sock_count--; spin_unlock_bh(&rds_sock_lock); rds_trans_put(rs->rs_transport); sock->sk = NULL; sock_put(sk); out: return 0; } /* * Careful not to race with rds_release -> sock_orphan which clears sk_sleep. * _bh() isn't OK here, we're called from interrupt handlers. It's probably OK * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but * this seems more conservative. * NB - normally, one would use sk_callback_lock for this, but we can * get here from interrupts, whereas the network code grabs sk_callback_lock * with _lock_bh only - so relying on sk_callback_lock introduces livelocks. 
*/ void rds_wake_sk_sleep(struct rds_sock *rs) { unsigned long flags; read_lock_irqsave(&rs->rs_recv_lock, flags); __rds_wake_sk_sleep(rds_rs_to_sk(rs)); read_unlock_irqrestore(&rs->rs_recv_lock, flags); } static int rds_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct rds_sock *rs = rds_sk_to_rs(sock->sk); struct sockaddr_in6 *sin6; struct sockaddr_in *sin; int uaddr_len; /* racey, don't care */ if (peer) { if (ipv6_addr_any(&rs->rs_conn_addr)) return -ENOTCONN; if (ipv6_addr_v4mapped(&rs->rs_conn_addr)) { sin = (struct sockaddr_in *)uaddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); sin->sin_family = AF_INET; sin->sin_port = rs->rs_conn_port; sin->sin_addr.s_addr = rs->rs_conn_addr_v4; uaddr_len = sizeof(*sin); } else { sin6 = (struct sockaddr_in6 *)uaddr; sin6->sin6_family = AF_INET6; sin6->sin6_port = rs->rs_conn_port; sin6->sin6_addr = rs->rs_conn_addr; sin6->sin6_flowinfo = 0; /* scope_id is the same as in the bound address. */ sin6->sin6_scope_id = rs->rs_bound_scope_id; uaddr_len = sizeof(*sin6); } } else { /* If socket is not yet bound and the socket is connected, * set the return address family to be the same as the * connected address, but with 0 address value. If it is not * connected, set the family to be AF_UNSPEC (value 0) and * the address size to be that of an IPv4 address. */ if (ipv6_addr_any(&rs->rs_bound_addr)) { if (ipv6_addr_any(&rs->rs_conn_addr)) { sin = (struct sockaddr_in *)uaddr; memset(sin, 0, sizeof(*sin)); sin->sin_family = AF_UNSPEC; return sizeof(*sin); } #if IS_ENABLED(CONFIG_IPV6) if (!(ipv6_addr_type(&rs->rs_conn_addr) & IPV6_ADDR_MAPPED)) { sin6 = (struct sockaddr_in6 *)uaddr; memset(sin6, 0, sizeof(*sin6)); sin6->sin6_family = AF_INET6; return sizeof(*sin6); } #endif sin = (struct sockaddr_in *)uaddr; memset(sin, 0, sizeof(*sin)); sin->sin_family = AF_INET; return sizeof(*sin); } if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) { sin = (struct sockaddr_in *)uaddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); sin->sin_family = AF_INET; sin->sin_port = rs->rs_bound_port; sin->sin_addr.s_addr = rs->rs_bound_addr_v4; uaddr_len = sizeof(*sin); } else { sin6 = (struct sockaddr_in6 *)uaddr; sin6->sin6_family = AF_INET6; sin6->sin6_port = rs->rs_bound_port; sin6->sin6_addr = rs->rs_bound_addr; sin6->sin6_flowinfo = 0; sin6->sin6_scope_id = rs->rs_bound_scope_id; uaddr_len = sizeof(*sin6); } } return uaddr_len; } /* * RDS' poll is without a doubt the least intuitive part of the interface, * as EPOLLIN and EPOLLOUT do not behave entirely as you would expect from * a network protocol. * * EPOLLIN is asserted if * - there is data on the receive queue. * - to signal that a previously congested destination may have become * uncongested * - A notification has been queued to the socket (this can be a congestion * update, or a RDMA completion, or a MSG_ZEROCOPY completion). * * EPOLLOUT is asserted if there is room on the send queue. This does not mean * however, that the next sendmsg() call will succeed. If the application tries * to send to a congested destination, the system call may still fail (and * return ENOBUFS). 
*/ static __poll_t rds_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct rds_sock *rs = rds_sk_to_rs(sk); __poll_t mask = 0; unsigned long flags; poll_wait(file, sk_sleep(sk), wait); if (rs->rs_seen_congestion) poll_wait(file, &rds_poll_waitq, wait); read_lock_irqsave(&rs->rs_recv_lock, flags); if (!rs->rs_cong_monitor) { /* When a congestion map was updated, we signal EPOLLIN for * "historical" reasons. Applications can also poll for * WRBAND instead. */ if (rds_cong_updated_since(&rs->rs_cong_track)) mask |= (EPOLLIN | EPOLLRDNORM | EPOLLWRBAND); } else { spin_lock(&rs->rs_lock); if (rs->rs_cong_notify) mask |= (EPOLLIN | EPOLLRDNORM); spin_unlock(&rs->rs_lock); } if (!list_empty(&rs->rs_recv_queue) || !list_empty(&rs->rs_notify_queue) || !list_empty(&rs->rs_zcookie_queue.zcookie_head)) mask |= (EPOLLIN | EPOLLRDNORM); if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) mask |= (EPOLLOUT | EPOLLWRNORM); if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) mask |= EPOLLERR; read_unlock_irqrestore(&rs->rs_recv_lock, flags); /* clear state any time we wake a seen-congested socket */ if (mask) rs->rs_seen_congestion = 0; return mask; } static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct rds_sock *rs = rds_sk_to_rs(sock->sk); rds_tos_t utos, tos = 0; switch (cmd) { case SIOCRDSSETTOS: if (get_user(utos, (rds_tos_t __user *)arg)) return -EFAULT; if (rs->rs_transport && rs->rs_transport->get_tos_map) tos = rs->rs_transport->get_tos_map(utos); else return -ENOIOCTLCMD; spin_lock_bh(&rds_sock_lock); if (rs->rs_tos || rs->rs_conn) { spin_unlock_bh(&rds_sock_lock); return -EINVAL; } rs->rs_tos = tos; spin_unlock_bh(&rds_sock_lock); break; case SIOCRDSGETTOS: spin_lock_bh(&rds_sock_lock); tos = rs->rs_tos; spin_unlock_bh(&rds_sock_lock); if (put_user(tos, (rds_tos_t __user *)arg)) return -EFAULT; break; default: return -ENOIOCTLCMD; } return 0; } static int rds_cancel_sent_to(struct rds_sock *rs, sockptr_t optval, int len) { struct sockaddr_in6 sin6; struct sockaddr_in sin; int ret = 0; /* racing with another thread binding seems ok here */ if (ipv6_addr_any(&rs->rs_bound_addr)) { ret = -ENOTCONN; /* XXX not a great errno */ goto out; } if (len < sizeof(struct sockaddr_in)) { ret = -EINVAL; goto out; } else if (len < sizeof(struct sockaddr_in6)) { /* Assume IPv4 */ if (copy_from_sockptr(&sin, optval, sizeof(struct sockaddr_in))) { ret = -EFAULT; goto out; } ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr); sin6.sin6_port = sin.sin_port; } else { if (copy_from_sockptr(&sin6, optval, sizeof(struct sockaddr_in6))) { ret = -EFAULT; goto out; } } rds_send_drop_to(rs, &sin6); out: return ret; } static int rds_set_bool_option(unsigned char *optvar, sockptr_t optval, int optlen) { int value; if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&value, optval, sizeof(int))) return -EFAULT; *optvar = !!value; return 0; } static int rds_cong_monitor(struct rds_sock *rs, sockptr_t optval, int optlen) { int ret; ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen); if (ret == 0) { if (rs->rs_cong_monitor) { rds_cong_add_socket(rs); } else { rds_cong_remove_socket(rs); rs->rs_cong_mask = 0; rs->rs_cong_notify = 0; } } return ret; } static int rds_set_transport(struct net *net, struct rds_sock *rs, sockptr_t optval, int optlen) { int t_type; if (rs->rs_transport) return -EOPNOTSUPP; /* previously attached to transport */ if (optlen != sizeof(int)) return -EINVAL; if (copy_from_sockptr(&t_type, optval, 
sizeof(t_type))) return -EFAULT; if (t_type < 0 || t_type >= RDS_TRANS_COUNT) return -EINVAL; /* RDS/IB is restricted to the initial network namespace */ if (t_type != RDS_TRANS_TCP && !net_eq(net, &init_net)) return -EPROTOTYPE; rs->rs_transport = rds_trans_get(t_type); return rs->rs_transport ? 0 : -ENOPROTOOPT; } static int rds_enable_recvtstamp(struct sock *sk, sockptr_t optval, int optlen, int optname) { int val, valbool; if (optlen != sizeof(int)) return -EFAULT; if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; valbool = val ? 1 : 0; if (optname == SO_TIMESTAMP_NEW) sock_set_flag(sk, SOCK_TSTAMP_NEW); if (valbool) sock_set_flag(sk, SOCK_RCVTSTAMP); else sock_reset_flag(sk, SOCK_RCVTSTAMP); return 0; } static int rds_recv_track_latency(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_rx_trace_so trace; int i; if (optlen != sizeof(struct rds_rx_trace_so)) return -EFAULT; if (copy_from_sockptr(&trace, optval, sizeof(trace))) return -EFAULT; if (trace.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX) return -EFAULT; rs->rs_rx_traces = trace.rx_traces; for (i = 0; i < rs->rs_rx_traces; i++) { if (trace.rx_trace_pos[i] >= RDS_MSG_RX_DGRAM_TRACE_MAX) { rs->rs_rx_traces = 0; return -EFAULT; } rs->rs_rx_trace[i] = trace.rx_trace_pos[i]; } return 0; } static int rds_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct rds_sock *rs = rds_sk_to_rs(sock->sk); struct net *net = sock_net(sock->sk); int ret; if (level != SOL_RDS) { ret = -ENOPROTOOPT; goto out; } switch (optname) { case RDS_CANCEL_SENT_TO: ret = rds_cancel_sent_to(rs, optval, optlen); break; case RDS_GET_MR: ret = rds_get_mr(rs, optval, optlen); break; case RDS_GET_MR_FOR_DEST: ret = rds_get_mr_for_dest(rs, optval, optlen); break; case RDS_FREE_MR: ret = rds_free_mr(rs, optval, optlen); break; case RDS_RECVERR: ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen); break; case RDS_CONG_MONITOR: ret = rds_cong_monitor(rs, optval, optlen); break; case SO_RDS_TRANSPORT: lock_sock(sock->sk); ret = rds_set_transport(net, rs, optval, optlen); release_sock(sock->sk); break; case SO_TIMESTAMP_OLD: case SO_TIMESTAMP_NEW: lock_sock(sock->sk); ret = rds_enable_recvtstamp(sock->sk, optval, optlen, optname); release_sock(sock->sk); break; case SO_RDS_MSG_RXPATH_LATENCY: ret = rds_recv_track_latency(rs, optval, optlen); break; default: ret = -ENOPROTOOPT; } out: return ret; } static int rds_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct rds_sock *rs = rds_sk_to_rs(sock->sk); int ret = -ENOPROTOOPT, len; int trans; if (level != SOL_RDS) goto out; if (get_user(len, optlen)) { ret = -EFAULT; goto out; } switch (optname) { case RDS_INFO_FIRST ... RDS_INFO_LAST: ret = rds_info_getsockopt(sock, optname, optval, optlen); break; case RDS_RECVERR: if (len < sizeof(int)) ret = -EINVAL; else if (put_user(rs->rs_recverr, (int __user *) optval) || put_user(sizeof(int), optlen)) ret = -EFAULT; else ret = 0; break; case SO_RDS_TRANSPORT: if (len < sizeof(int)) { ret = -EINVAL; break; } trans = (rs->rs_transport ? 
rs->rs_transport->t_type : RDS_TRANS_NONE); /* unbound */ if (put_user(trans, (int __user *)optval) || put_user(sizeof(int), optlen)) ret = -EFAULT; else ret = 0; break; default: break; } out: return ret; } static int rds_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; struct sockaddr_in *sin; struct rds_sock *rs = rds_sk_to_rs(sk); int ret = 0; if (addr_len < offsetofend(struct sockaddr, sa_family)) return -EINVAL; lock_sock(sk); switch (uaddr->sa_family) { case AF_INET: sin = (struct sockaddr_in *)uaddr; if (addr_len < sizeof(struct sockaddr_in)) { ret = -EINVAL; break; } if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { ret = -EDESTADDRREQ; break; } if (ipv4_is_multicast(sin->sin_addr.s_addr) || sin->sin_addr.s_addr == htonl(INADDR_BROADCAST)) { ret = -EINVAL; break; } ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &rs->rs_conn_addr); rs->rs_conn_port = sin->sin_port; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { struct sockaddr_in6 *sin6; int addr_type; sin6 = (struct sockaddr_in6 *)uaddr; if (addr_len < sizeof(struct sockaddr_in6)) { ret = -EINVAL; break; } addr_type = ipv6_addr_type(&sin6->sin6_addr); if (!(addr_type & IPV6_ADDR_UNICAST)) { __be32 addr4; if (!(addr_type & IPV6_ADDR_MAPPED)) { ret = -EPROTOTYPE; break; } /* It is a mapped address. Need to do some sanity * checks. */ addr4 = sin6->sin6_addr.s6_addr32[3]; if (addr4 == htonl(INADDR_ANY) || addr4 == htonl(INADDR_BROADCAST) || ipv4_is_multicast(addr4)) { ret = -EPROTOTYPE; break; } } if (addr_type & IPV6_ADDR_LINKLOCAL) { /* If socket is already bound to a link local address, * the peer address must be on the same link. */ if (sin6->sin6_scope_id == 0 || (!ipv6_addr_any(&rs->rs_bound_addr) && rs->rs_bound_scope_id && sin6->sin6_scope_id != rs->rs_bound_scope_id)) { ret = -EINVAL; break; } /* Remember the connected address scope ID. It will * be checked against the binding local address when * the socket is bound. 
*/ rs->rs_bound_scope_id = sin6->sin6_scope_id; } rs->rs_conn_addr = sin6->sin6_addr; rs->rs_conn_port = sin6->sin6_port; break; } #endif default: ret = -EAFNOSUPPORT; break; } release_sock(sk); return ret; } static struct proto rds_proto = { .name = "RDS", .owner = THIS_MODULE, .obj_size = sizeof(struct rds_sock), }; static const struct proto_ops rds_proto_ops = { .family = AF_RDS, .owner = THIS_MODULE, .release = rds_release, .bind = rds_bind, .connect = rds_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = rds_getname, .poll = rds_poll, .ioctl = rds_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = rds_setsockopt, .getsockopt = rds_getsockopt, .sendmsg = rds_sendmsg, .recvmsg = rds_recvmsg, .mmap = sock_no_mmap, }; static void rds_sock_destruct(struct sock *sk) { struct rds_sock *rs = rds_sk_to_rs(sk); WARN_ON((&rs->rs_item != rs->rs_item.next || &rs->rs_item != rs->rs_item.prev)); } static int __rds_create(struct socket *sock, struct sock *sk, int protocol) { struct rds_sock *rs; sock_init_data(sock, sk); sock->ops = &rds_proto_ops; sk->sk_protocol = protocol; sk->sk_destruct = rds_sock_destruct; rs = rds_sk_to_rs(sk); spin_lock_init(&rs->rs_lock); rwlock_init(&rs->rs_recv_lock); INIT_LIST_HEAD(&rs->rs_send_queue); INIT_LIST_HEAD(&rs->rs_recv_queue); INIT_LIST_HEAD(&rs->rs_notify_queue); INIT_LIST_HEAD(&rs->rs_cong_list); rds_message_zcopy_queue_init(&rs->rs_zcookie_queue); spin_lock_init(&rs->rs_rdma_lock); rs->rs_rdma_keys = RB_ROOT; rs->rs_rx_traces = 0; rs->rs_tos = 0; rs->rs_conn = NULL; spin_lock_bh(&rds_sock_lock); list_add_tail(&rs->rs_item, &rds_sock_list); rds_sock_count++; spin_unlock_bh(&rds_sock_lock); return 0; } static int rds_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; if (sock->type != SOCK_SEQPACKET || protocol) return -ESOCKTNOSUPPORT; sk = sk_alloc(net, AF_RDS, GFP_KERNEL, &rds_proto, kern); if (!sk) return -ENOMEM; return __rds_create(sock, sk, protocol); } void rds_sock_addref(struct rds_sock *rs) { sock_hold(rds_rs_to_sk(rs)); } void rds_sock_put(struct rds_sock *rs) { sock_put(rds_rs_to_sk(rs)); } static const struct net_proto_family rds_family_ops = { .family = AF_RDS, .create = rds_create, .owner = THIS_MODULE, }; static void rds_sock_inc_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { struct rds_sock *rs; struct rds_incoming *inc; unsigned int total = 0; len /= sizeof(struct rds_info_message); spin_lock_bh(&rds_sock_lock); list_for_each_entry(rs, &rds_sock_list, rs_item) { /* This option only supports IPv4 sockets. */ if (!ipv6_addr_v4mapped(&rs->rs_bound_addr)) continue; read_lock(&rs->rs_recv_lock); /* XXX too lazy to maintain counts.. 
*/ list_for_each_entry(inc, &rs->rs_recv_queue, i_item) { total++; if (total <= len) rds_inc_info_copy(inc, iter, inc->i_saddr.s6_addr32[3], rs->rs_bound_addr_v4, 1); } read_unlock(&rs->rs_recv_lock); } spin_unlock_bh(&rds_sock_lock); lens->nr = total; lens->each = sizeof(struct rds_info_message); } #if IS_ENABLED(CONFIG_IPV6) static void rds6_sock_inc_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { struct rds_incoming *inc; unsigned int total = 0; struct rds_sock *rs; len /= sizeof(struct rds6_info_message); spin_lock_bh(&rds_sock_lock); list_for_each_entry(rs, &rds_sock_list, rs_item) { read_lock(&rs->rs_recv_lock); list_for_each_entry(inc, &rs->rs_recv_queue, i_item) { total++; if (total <= len) rds6_inc_info_copy(inc, iter, &inc->i_saddr, &rs->rs_bound_addr, 1); } read_unlock(&rs->rs_recv_lock); } spin_unlock_bh(&rds_sock_lock); lens->nr = total; lens->each = sizeof(struct rds6_info_message); } #endif static void rds_sock_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { struct rds_info_socket sinfo; unsigned int cnt = 0; struct rds_sock *rs; len /= sizeof(struct rds_info_socket); spin_lock_bh(&rds_sock_lock); if (len < rds_sock_count) { cnt = rds_sock_count; goto out; } list_for_each_entry(rs, &rds_sock_list, rs_item) { /* This option only supports IPv4 sockets. */ if (!ipv6_addr_v4mapped(&rs->rs_bound_addr)) continue; sinfo.sndbuf = rds_sk_sndbuf(rs); sinfo.rcvbuf = rds_sk_rcvbuf(rs); sinfo.bound_addr = rs->rs_bound_addr_v4; sinfo.connected_addr = rs->rs_conn_addr_v4; sinfo.bound_port = rs->rs_bound_port; sinfo.connected_port = rs->rs_conn_port; sinfo.inum = sock_i_ino(rds_rs_to_sk(rs)); rds_info_copy(iter, &sinfo, sizeof(sinfo)); cnt++; } out: lens->nr = cnt; lens->each = sizeof(struct rds_info_socket); spin_unlock_bh(&rds_sock_lock); } #if IS_ENABLED(CONFIG_IPV6) static void rds6_sock_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { struct rds6_info_socket sinfo6; struct rds_sock *rs; len /= sizeof(struct rds6_info_socket); spin_lock_bh(&rds_sock_lock); if (len < rds_sock_count) goto out; list_for_each_entry(rs, &rds_sock_list, rs_item) { sinfo6.sndbuf = rds_sk_sndbuf(rs); sinfo6.rcvbuf = rds_sk_rcvbuf(rs); sinfo6.bound_addr = rs->rs_bound_addr; sinfo6.connected_addr = rs->rs_conn_addr; sinfo6.bound_port = rs->rs_bound_port; sinfo6.connected_port = rs->rs_conn_port; sinfo6.inum = sock_i_ino(rds_rs_to_sk(rs)); rds_info_copy(iter, &sinfo6, sizeof(sinfo6)); } out: lens->nr = rds_sock_count; lens->each = sizeof(struct rds6_info_socket); spin_unlock_bh(&rds_sock_lock); } #endif static void rds_exit(void) { sock_unregister(rds_family_ops.family); proto_unregister(&rds_proto); rds_conn_exit(); rds_cong_exit(); rds_sysctl_exit(); rds_threads_exit(); rds_stats_exit(); rds_page_exit(); rds_bind_lock_destroy(); rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info); rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); #if IS_ENABLED(CONFIG_IPV6) rds_info_deregister_func(RDS6_INFO_SOCKETS, rds6_sock_info); rds_info_deregister_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info); #endif } module_exit(rds_exit); u32 rds_gen_num; static int __init rds_init(void) { int ret; net_get_random_once(&rds_gen_num, sizeof(rds_gen_num)); ret = rds_bind_lock_init(); if (ret) goto out; ret = rds_conn_init(); if (ret) goto out_bind; ret = rds_threads_init(); if (ret) goto out_conn; ret = rds_sysctl_init(); if (ret) 
goto out_threads; ret = rds_stats_init(); if (ret) goto out_sysctl; ret = proto_register(&rds_proto, 1); if (ret) goto out_stats; ret = sock_register(&rds_family_ops); if (ret) goto out_proto; rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info); rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); #if IS_ENABLED(CONFIG_IPV6) rds_info_register_func(RDS6_INFO_SOCKETS, rds6_sock_info); rds_info_register_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info); #endif goto out; out_proto: proto_unregister(&rds_proto); out_stats: rds_stats_exit(); out_sysctl: rds_sysctl_exit(); out_threads: rds_threads_exit(); out_conn: rds_conn_exit(); rds_cong_exit(); rds_page_exit(); out_bind: rds_bind_lock_destroy(); out: return ret; } module_init(rds_init); #define DRV_VERSION "4.0" #define DRV_RELDATE "Feb 12, 2009" MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>"); MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets" " v" DRV_VERSION " (" DRV_RELDATE ")"); MODULE_VERSION(DRV_VERSION); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS_NETPROTO(PF_RDS); |
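/*
 * Illustrative userspace sketch, not part of the file above: RDS sockets are
 * SOCK_SEQPACKET only (rds_create() rejects anything else), socket options
 * live at SOL_RDS, and SO_RDS_TRANSPORT has to be issued before the socket is
 * bound, since rds_set_transport() fails once a transport is attached.
 * Constants are assumed to come from <linux/rds.h>; error handling and the
 * port number are placeholders.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/rds.h>

static int rds_example_socket(void)
{
	int transport = RDS_TRANS_TCP;
	struct sockaddr_in laddr = {
		.sin_family      = AF_INET,
		.sin_port        = htons(18634),	/* arbitrary example port */
		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
	};
	int fd;

	fd = socket(PF_RDS, SOCK_SEQPACKET, 0);
	setsockopt(fd, SOL_RDS, SO_RDS_TRANSPORT, &transport, sizeof(transport));
	bind(fd, (struct sockaddr *)&laddr, sizeof(laddr));
	return fd;
}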
| 36 38 20 24 14 24 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Wrapper functions for accessing the file_struct fd array. */ #ifndef __LINUX_FILE_H #define __LINUX_FILE_H #include <linux/compiler.h> #include <linux/types.h> #include <linux/posix_types.h> #include <linux/errno.h> #include <linux/cleanup.h> #include <linux/err.h> struct file; extern void fput(struct file *); struct file_operations; struct task_struct; struct vfsmount; struct dentry; struct inode; struct path; extern struct file *alloc_file_pseudo(struct inode *, struct vfsmount *, const char *, int flags, const struct file_operations *); extern struct file *alloc_file_pseudo_noaccount(struct inode *, struct vfsmount *, const char *, int flags, const struct file_operations *); extern struct file *alloc_file_clone(struct file *, int flags, const struct file_operations *); /* either a reference to struct file + flags * (cloned vs. borrowed, pos locked), with * flags stored in lower bits of value, * or empty (represented by 0). 
*/ struct fd { unsigned long word; }; #define FDPUT_FPUT 1 #define FDPUT_POS_UNLOCK 2 #define fd_file(f) ((struct file *)((f).word & ~(FDPUT_FPUT|FDPUT_POS_UNLOCK))) static inline bool fd_empty(struct fd f) { return unlikely(!f.word); } #define EMPTY_FD (struct fd){0} static inline struct fd BORROWED_FD(struct file *f) { return (struct fd){(unsigned long)f}; } static inline struct fd CLONED_FD(struct file *f) { return (struct fd){(unsigned long)f | FDPUT_FPUT}; } static inline void fdput(struct fd fd) { if (unlikely(fd.word & FDPUT_FPUT)) fput(fd_file(fd)); } extern struct file *fget(unsigned int fd); extern struct file *fget_raw(unsigned int fd); extern struct file *fget_task(struct task_struct *task, unsigned int fd); extern struct file *fget_task_next(struct task_struct *task, unsigned int *fd); extern void __f_unlock_pos(struct file *); struct fd fdget(unsigned int fd); struct fd fdget_raw(unsigned int fd); struct fd fdget_pos(unsigned int fd); static inline void fdput_pos(struct fd f) { if (f.word & FDPUT_POS_UNLOCK) __f_unlock_pos(fd_file(f)); fdput(f); } DEFINE_CLASS(fd, struct fd, fdput(_T), fdget(fd), int fd) DEFINE_CLASS(fd_raw, struct fd, fdput(_T), fdget_raw(fd), int fd) DEFINE_CLASS(fd_pos, struct fd, fdput_pos(_T), fdget_pos(fd), int fd) extern int f_dupfd(unsigned int from, struct file *file, unsigned flags); extern int replace_fd(unsigned fd, struct file *file, unsigned flags); extern void set_close_on_exec(unsigned int fd, int flag); extern bool get_close_on_exec(unsigned int fd); extern int __get_unused_fd_flags(unsigned flags, unsigned long nofile); extern int get_unused_fd_flags(unsigned flags); extern void put_unused_fd(unsigned int fd); DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T), get_unused_fd_flags(flags), unsigned flags) DEFINE_FREE(fput, struct file *, if (!IS_ERR_OR_NULL(_T)) fput(_T)) /* * take_fd() will take care to set @fd to -EBADF ensuring that * CLASS(get_unused_fd) won't call put_unused_fd(). This makes it * easier to rely on CLASS(get_unused_fd): * * struct file *f; * * CLASS(get_unused_fd, fd)(O_CLOEXEC); * if (fd < 0) * return fd; * * f = dentry_open(&path, O_RDONLY, current_cred()); * if (IS_ERR(f)) * return PTR_ERR(f); * * fd_install(fd, f); * return take_fd(fd); */ #define take_fd(fd) __get_and_null(fd, -EBADF) extern void fd_install(unsigned int fd, struct file *file); int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags); int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags); extern void flush_delayed_fput(void); extern void __fput_sync(struct file *); extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max; /* * fd_prepare: Combined fd + file allocation cleanup class. * @err: Error code to indicate if allocation succeeded. * @__fd: Allocated fd (may not be accessed directly) * @__file: Allocated struct file pointer (may not be accessed directly) * * Allocates an fd and a file together. On error paths, automatically cleans * up whichever resource was successfully allocated. Allows flexible file * allocation with different functions per usage. * * Do not use directly. */ struct fd_prepare { s32 err; s32 __fd; /* do not access directly */ struct file *__file; /* do not access directly */ }; /* Typedef for fd_prepare cleanup guards. */ typedef struct fd_prepare class_fd_prepare_t; /* * Accessors for fd_prepare class members. * _Generic() is used for zero-cost type safety. 
*/ #define fd_prepare_fd(_fdf) \ (_Generic((_fdf), struct fd_prepare: (_fdf).__fd)) #define fd_prepare_file(_fdf) \ (_Generic((_fdf), struct fd_prepare: (_fdf).__file)) /* Do not use directly. */ static inline void class_fd_prepare_destructor(const struct fd_prepare *fdf) { if (unlikely(fdf->__fd >= 0)) put_unused_fd(fdf->__fd); if (unlikely(!IS_ERR_OR_NULL(fdf->__file))) fput(fdf->__file); } /* Do not use directly. */ static inline int class_fd_prepare_lock_err(const struct fd_prepare *fdf) { if (unlikely(fdf->err)) return fdf->err; if (unlikely(fdf->__fd < 0)) return fdf->__fd; if (unlikely(IS_ERR(fdf->__file))) return PTR_ERR(fdf->__file); if (unlikely(!fdf->__file)) return -ENOMEM; return 0; } /* * __FD_PREPARE_INIT - Helper to initialize fd_prepare class. * @_fd_flags: flags for get_unused_fd_flags() * @_file_owned: expression that returns struct file * * * Returns a struct fd_prepare with fd, file, and err set. * If fd allocation fails, fd will be negative and err will be set. If * fd succeeds but file_init_expr fails, file will be ERR_PTR and err * will be set. The err field is the single source of truth for error * checking. */ #define __FD_PREPARE_INIT(_fd_flags, _file_owned) \ ({ \ struct fd_prepare fdf = { \ .__fd = get_unused_fd_flags((_fd_flags)), \ }; \ if (likely(fdf.__fd >= 0)) \ fdf.__file = (_file_owned); \ fdf.err = ACQUIRE_ERR(fd_prepare, &fdf); \ fdf; \ }) /* * FD_PREPARE - Macro to declare and initialize an fd_prepare variable. * * Declares and initializes an fd_prepare variable with automatic * cleanup. No separate scope required - cleanup happens when variable * goes out of scope. * * @_fdf: name of struct fd_prepare variable to define * @_fd_flags: flags for get_unused_fd_flags() * @_file_owned: struct file to take ownership of (can be expression) */ #define FD_PREPARE(_fdf, _fd_flags, _file_owned) \ CLASS_INIT(fd_prepare, _fdf, __FD_PREPARE_INIT(_fd_flags, _file_owned)) /* * fd_publish - Publish prepared fd and file to the fd table. * @_fdf: struct fd_prepare variable */ #define fd_publish(_fdf) \ ({ \ struct fd_prepare *fdp = &(_fdf); \ VFS_WARN_ON_ONCE(fdp->err); \ VFS_WARN_ON_ONCE(fdp->__fd < 0); \ VFS_WARN_ON_ONCE(IS_ERR_OR_NULL(fdp->__file)); \ fd_install(fdp->__fd, fdp->__file); \ retain_and_null_ptr(fdp->__file); \ take_fd(fdp->__fd); \ }) /* Do not use directly. */ #define __FD_ADD(_fdf, _fd_flags, _file_owned) \ ({ \ FD_PREPARE(_fdf, _fd_flags, _file_owned); \ s32 ret = _fdf.err; \ if (likely(!ret)) \ ret = fd_publish(_fdf); \ ret; \ }) /* * FD_ADD - Allocate and install an fd and file in one step. * @_fd_flags: flags for get_unused_fd_flags() * @_file_owned: struct file to take ownership of * * Returns the allocated fd number, or negative error code on failure. */ #define FD_ADD(_fd_flags, _file_owned) \ __FD_ADD(__UNIQUE_ID(fd_prepare), _fd_flags, _file_owned) #endif /* __LINUX_FILE_H */ |
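/*
 * Illustrative sketch, not part of the header above: FD_ADD() pairs fd and
 * file allocation so that error unwinding is automatic. anon_inode_getfile()
 * is just one way to obtain a struct file; example_fops and
 * example_create_fd() are hypothetical placeholders.
 */
#include <linux/anon_inodes.h>
#include <linux/fcntl.h>
#include <linux/file.h>

static const struct file_operations example_fops;	/* would be filled in */

static long example_create_fd(void)
{
	/* On success returns the installed fd; on failure, a negative errno. */
	return FD_ADD(O_CLOEXEC, anon_inode_getfile("[example]", &example_fops,
						     NULL, O_RDWR));
}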
| 19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * SHA-256 optimized for x86_64 * * Copyright 2025 Google LLC */ #include <asm/fpu/api.h> #include <linux/static_call.h> static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha_ni); DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic); #define DEFINE_X86_SHA256_FN(c_fn, asm_fn) \ asmlinkage void asm_fn(struct sha256_block_state *state, \ const u8 *data, size_t nblocks); \ static void c_fn(struct sha256_block_state *state, const u8 *data, \ size_t nblocks) \ { \ if (likely(irq_fpu_usable())) { \ kernel_fpu_begin(); \ asm_fn(state, data, nblocks); \ kernel_fpu_end(); \ } else { \ sha256_blocks_generic(state, data, nblocks); \ } \ } DEFINE_X86_SHA256_FN(sha256_blocks_ssse3, sha256_transform_ssse3); DEFINE_X86_SHA256_FN(sha256_blocks_avx, sha256_transform_avx); DEFINE_X86_SHA256_FN(sha256_blocks_avx2, sha256_transform_rorx); DEFINE_X86_SHA256_FN(sha256_blocks_ni, sha256_ni_transform); #define PHE_ALIGNMENT 16 static void sha256_blocks_phe(struct sha256_block_state *state, const u8 *data, size_t nblocks) { /* * On Zhaoxin processors, XSHA256 requires the %rdi register * in 64-bit mode (or %edi in 32-bit mode) to point to * a 32-byte, 16-byte-aligned buffer. */ u8 buf[32 + PHE_ALIGNMENT - 1]; u8 *dst = PTR_ALIGN(&buf[0], PHE_ALIGNMENT); size_t padding = -1; memcpy(dst, state, SHA256_DIGEST_SIZE); asm volatile(".byte 0xf3,0x0f,0xa6,0xd0" /* REP XSHA256 */ : "+a"(padding), "+c"(nblocks), "+S"(data) : "D"(dst) : "memory"); memcpy(state, dst, SHA256_DIGEST_SIZE); } static void sha256_blocks(struct sha256_block_state *state, const u8 *data, size_t nblocks) { static_call(sha256_blocks_x86)(state, data, nblocks); } static_assert(offsetof(struct __sha256_ctx, state) == 0); static_assert(offsetof(struct __sha256_ctx, bytecount) == 32); static_assert(offsetof(struct __sha256_ctx, buf) == 40); asmlinkage void sha256_ni_finup2x(const struct __sha256_ctx *ctx, const u8 *data1, const u8 *data2, int len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE]); #define sha256_finup_2x_arch sha256_finup_2x_arch static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx, const u8 *data1, const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE]) { /* * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX. * Further limit len to 65536 to avoid spending too long with preemption * disabled. (Of course, in practice len is nearly always 4096 anyway.) 
*/ if (static_branch_likely(&have_sha_ni) && len >= SHA256_BLOCK_SIZE && len <= 65536 && likely(irq_fpu_usable())) { kernel_fpu_begin(); sha256_ni_finup2x(ctx, data1, data2, len, out1, out2); kernel_fpu_end(); kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE); kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE); return true; } return false; } static bool sha256_finup_2x_is_optimized_arch(void) { return static_key_enabled(&have_sha_ni); } #define sha256_mod_init_arch sha256_mod_init_arch static void sha256_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_SHA_NI)) { static_call_update(sha256_blocks_x86, sha256_blocks_ni); static_branch_enable(&have_sha_ni); } else if (IS_ENABLED(CONFIG_CPU_SUP_ZHAOXIN) && boot_cpu_has(X86_FEATURE_PHE_EN) && boot_cpu_data.x86 >= 0x07) { static_call_update(sha256_blocks_x86, sha256_blocks_phe); } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) && boot_cpu_has(X86_FEATURE_AVX)) { if (boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI2)) static_call_update(sha256_blocks_x86, sha256_blocks_avx2); else static_call_update(sha256_blocks_x86, sha256_blocks_avx); } else if (boot_cpu_has(X86_FEATURE_SSSE3)) { static_call_update(sha256_blocks_x86, sha256_blocks_ssse3); } } |
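/*
 * Illustrative sketch, not part of the file above: callers never invoke the
 * SSSE3/AVX/SHA-NI block functions directly. They use the library interface
 * assumed to be in <crypto/sha2.h>, and sha256_blocks() above is reached from
 * it through the static call that sha256_mod_init_arch() points at the best
 * implementation for the CPU. example_digest() and its arguments are
 * hypothetical.
 */
#include <crypto/sha2.h>

static void example_digest(const void *buf, size_t len)
{
	u8 digest[SHA256_DIGEST_SIZE];

	sha256(buf, len, digest);	/* one-shot helper from lib/crypto */

	/* ... use digest ... */
}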
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/fault-inject.h>
#include <linux/fault-inject-usercopy.h>

static struct {
	struct fault_attr attr;
} fail_usercopy = {
	.attr = FAULT_ATTR_INITIALIZER,
};

static int __init setup_fail_usercopy(char *str)
{
	return setup_fault_attr(&fail_usercopy.attr, str);
}
__setup("fail_usercopy=", setup_fail_usercopy);

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

static int __init fail_usercopy_debugfs(void)
{
	struct dentry *dir;

	dir = fault_create_debugfs_attr("fail_usercopy", NULL,
					&fail_usercopy.attr);

	return PTR_ERR_OR_ZERO(dir);
}

late_initcall(fail_usercopy_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */

bool should_fail_usercopy(void)
{
	return should_fail(&fail_usercopy.attr, 1);
}
EXPORT_SYMBOL_GPL(should_fail_usercopy);
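/*
 * Illustrative sketch, not part of the file above: the real hook sits in the
 * copy_{to,from}_user() paths, which bail out early when
 * should_fail_usercopy() reports an injected failure. This hypothetical
 * wrapper only demonstrates the calling convention (a non-zero return means
 * "n bytes were not copied").
 */
#include <linux/fault-inject-usercopy.h>
#include <linux/uaccess.h>

static unsigned long example_copy_from_user(void *to, const void __user *from,
					    unsigned long n)
{
	if (should_fail_usercopy())
		return n;	/* simulate a faulting user access */

	return copy_from_user(to, from, n);
}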
912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/blkdev.h> #include <linux/wait.h> #include <linux/rbtree.h> #include <linux/kthread.h> #include <linux/backing-dev.h> #include <linux/blk-cgroup.h> #include <linux/freezer.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/writeback.h> #include <linux/device.h> #include <trace/events/writeback.h> #include "internal.h" struct backing_dev_info noop_backing_dev_info; EXPORT_SYMBOL_GPL(noop_backing_dev_info); static const char *bdi_unknown_name = "(unknown)"; /* * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU * reader side locking. 
*/ DEFINE_SPINLOCK(bdi_lock); static u64 bdi_id_cursor; static struct rb_root bdi_tree = RB_ROOT; LIST_HEAD(bdi_list); /* bdi_wq serves all asynchronous writeback tasks */ struct workqueue_struct *bdi_wq; #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> #include <linux/seq_file.h> struct wb_stats { unsigned long nr_dirty; unsigned long nr_io; unsigned long nr_more_io; unsigned long nr_dirty_time; unsigned long nr_writeback; unsigned long nr_reclaimable; unsigned long nr_dirtied; unsigned long nr_written; unsigned long dirty_thresh; unsigned long wb_thresh; }; static struct dentry *bdi_debug_root; static void bdi_debug_init(void) { bdi_debug_root = debugfs_create_dir("bdi", NULL); } static void collect_wb_stats(struct wb_stats *stats, struct bdi_writeback *wb) { struct inode *inode; spin_lock(&wb->list_lock); list_for_each_entry(inode, &wb->b_dirty, i_io_list) stats->nr_dirty++; list_for_each_entry(inode, &wb->b_io, i_io_list) stats->nr_io++; list_for_each_entry(inode, &wb->b_more_io, i_io_list) stats->nr_more_io++; list_for_each_entry(inode, &wb->b_dirty_time, i_io_list) if (inode_state_read_once(inode) & I_DIRTY_TIME) stats->nr_dirty_time++; spin_unlock(&wb->list_lock); stats->nr_writeback += wb_stat(wb, WB_WRITEBACK); stats->nr_reclaimable += wb_stat(wb, WB_RECLAIMABLE); stats->nr_dirtied += wb_stat(wb, WB_DIRTIED); stats->nr_written += wb_stat(wb, WB_WRITTEN); stats->wb_thresh += wb_calc_thresh(wb, stats->dirty_thresh); } #ifdef CONFIG_CGROUP_WRITEBACK static void bdi_collect_stats(struct backing_dev_info *bdi, struct wb_stats *stats) { struct bdi_writeback *wb; rcu_read_lock(); list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) { if (!wb_tryget(wb)) continue; collect_wb_stats(stats, wb); wb_put(wb); } rcu_read_unlock(); } #else static void bdi_collect_stats(struct backing_dev_info *bdi, struct wb_stats *stats) { collect_wb_stats(stats, &bdi->wb); } #endif static int bdi_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; unsigned long background_thresh; unsigned long dirty_thresh; struct wb_stats stats; unsigned long tot_bw; global_dirty_limits(&background_thresh, &dirty_thresh); memset(&stats, 0, sizeof(stats)); stats.dirty_thresh = dirty_thresh; bdi_collect_stats(bdi, &stats); tot_bw = atomic_long_read(&bdi->tot_write_bandwidth); seq_printf(m, "BdiWriteback: %10lu kB\n" "BdiReclaimable: %10lu kB\n" "BdiDirtyThresh: %10lu kB\n" "DirtyThresh: %10lu kB\n" "BackgroundThresh: %10lu kB\n" "BdiDirtied: %10lu kB\n" "BdiWritten: %10lu kB\n" "BdiWriteBandwidth: %10lu kBps\n" "b_dirty: %10lu\n" "b_io: %10lu\n" "b_more_io: %10lu\n" "b_dirty_time: %10lu\n" "bdi_list: %10u\n" "state: %10lx\n", K(stats.nr_writeback), K(stats.nr_reclaimable), K(stats.wb_thresh), K(dirty_thresh), K(background_thresh), K(stats.nr_dirtied), K(stats.nr_written), K(tot_bw), stats.nr_dirty, stats.nr_io, stats.nr_more_io, stats.nr_dirty_time, !list_empty(&bdi->bdi_list), bdi->wb.state); return 0; } DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats); static void wb_stats_show(struct seq_file *m, struct bdi_writeback *wb, struct wb_stats *stats) { seq_printf(m, "WbCgIno: %10lu\n" "WbWriteback: %10lu kB\n" "WbReclaimable: %10lu kB\n" "WbDirtyThresh: %10lu kB\n" "WbDirtied: %10lu kB\n" "WbWritten: %10lu kB\n" "WbWriteBandwidth: %10lu kBps\n" "b_dirty: %10lu\n" "b_io: %10lu\n" "b_more_io: %10lu\n" "b_dirty_time: %10lu\n" "state: %10lx\n\n", #ifdef CONFIG_CGROUP_WRITEBACK cgroup_ino(wb->memcg_css->cgroup), #else 1ul, #endif K(stats->nr_writeback), K(stats->nr_reclaimable), K(stats->wb_thresh), 
K(stats->nr_dirtied), K(stats->nr_written), K(wb->avg_write_bandwidth), stats->nr_dirty, stats->nr_io, stats->nr_more_io, stats->nr_dirty_time, wb->state); } static int cgwb_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; unsigned long background_thresh; unsigned long dirty_thresh; struct bdi_writeback *wb; global_dirty_limits(&background_thresh, &dirty_thresh); rcu_read_lock(); list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) { struct wb_stats stats = { .dirty_thresh = dirty_thresh }; if (!wb_tryget(wb)) continue; collect_wb_stats(&stats, wb); /* * Calculate thresh of wb in writeback cgroup which is min of * thresh in global domain and thresh in cgroup domain. Drop * rcu lock because cgwb_calc_thresh may sleep in * cgroup_rstat_flush. We can do so here because we have a ref. */ if (mem_cgroup_wb_domain(wb)) { rcu_read_unlock(); stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb)); rcu_read_lock(); } wb_stats_show(m, wb, &stats); wb_put(wb); } rcu_read_unlock(); return 0; } DEFINE_SHOW_ATTRIBUTE(cgwb_debug_stats); static void bdi_debug_register(struct backing_dev_info *bdi, const char *name) { bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root); debugfs_create_file("stats", 0444, bdi->debug_dir, bdi, &bdi_debug_stats_fops); debugfs_create_file("wb_stats", 0444, bdi->debug_dir, bdi, &cgwb_debug_stats_fops); } static void bdi_debug_unregister(struct backing_dev_info *bdi) { debugfs_remove_recursive(bdi->debug_dir); } #else /* CONFIG_DEBUG_FS */ static inline void bdi_debug_init(void) { } static inline void bdi_debug_register(struct backing_dev_info *bdi, const char *name) { } static inline void bdi_debug_unregister(struct backing_dev_info *bdi) { } #endif /* CONFIG_DEBUG_FS */ static ssize_t read_ahead_kb_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned long read_ahead_kb; ssize_t ret; ret = kstrtoul(buf, 10, &read_ahead_kb); if (ret < 0) return ret; bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10); return count; } #define BDI_SHOW(name, expr) \ static ssize_t name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct backing_dev_info *bdi = dev_get_drvdata(dev); \ \ return sysfs_emit(buf, "%lld\n", (long long)expr); \ } \ static DEVICE_ATTR_RW(name); BDI_SHOW(read_ahead_kb, K(bdi->ra_pages)) static ssize_t min_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_min_ratio(bdi, ratio); if (!ret) ret = count; return ret; } BDI_SHOW(min_ratio, bdi->min_ratio / BDI_RATIO_SCALE) static ssize_t min_ratio_fine_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_min_ratio_no_scale(bdi, ratio); if (!ret) ret = count; return ret; } BDI_SHOW(min_ratio_fine, bdi->min_ratio) static ssize_t max_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_max_ratio(bdi, ratio); if (!ret) ret = count; return ret; } 
BDI_SHOW(max_ratio, bdi->max_ratio / BDI_RATIO_SCALE) static ssize_t max_ratio_fine_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_max_ratio_no_scale(bdi, ratio); if (!ret) ret = count; return ret; } BDI_SHOW(max_ratio_fine, bdi->max_ratio) static ssize_t min_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) { struct backing_dev_info *bdi = dev_get_drvdata(dev); return sysfs_emit(buf, "%llu\n", bdi_get_min_bytes(bdi)); } static ssize_t min_bytes_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); u64 bytes; ssize_t ret; ret = kstrtoull(buf, 10, &bytes); if (ret < 0) return ret; ret = bdi_set_min_bytes(bdi, bytes); if (!ret) ret = count; return ret; } static DEVICE_ATTR_RW(min_bytes); static ssize_t max_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) { struct backing_dev_info *bdi = dev_get_drvdata(dev); return sysfs_emit(buf, "%llu\n", bdi_get_max_bytes(bdi)); } static ssize_t max_bytes_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); u64 bytes; ssize_t ret; ret = kstrtoull(buf, 10, &bytes); if (ret < 0) return ret; ret = bdi_set_max_bytes(bdi, bytes); if (!ret) ret = count; return ret; } static DEVICE_ATTR_RW(max_bytes); static ssize_t stable_pages_required_show(struct device *dev, struct device_attribute *attr, char *buf) { dev_warn_once(dev, "the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n"); return sysfs_emit(buf, "%d\n", 0); } static DEVICE_ATTR_RO(stable_pages_required); static ssize_t strict_limit_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int strict_limit; ssize_t ret; ret = kstrtouint(buf, 10, &strict_limit); if (ret < 0) return ret; ret = bdi_set_strict_limit(bdi, strict_limit); if (!ret) ret = count; return ret; } static ssize_t strict_limit_show(struct device *dev, struct device_attribute *attr, char *buf) { struct backing_dev_info *bdi = dev_get_drvdata(dev); return sysfs_emit(buf, "%d\n", !!(bdi->capabilities & BDI_CAP_STRICTLIMIT)); } static DEVICE_ATTR_RW(strict_limit); static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, &dev_attr_min_ratio_fine.attr, &dev_attr_max_ratio.attr, &dev_attr_max_ratio_fine.attr, &dev_attr_min_bytes.attr, &dev_attr_max_bytes.attr, &dev_attr_stable_pages_required.attr, &dev_attr_strict_limit.attr, NULL, }; ATTRIBUTE_GROUPS(bdi_dev); static const struct class bdi_class = { .name = "bdi", .dev_groups = bdi_dev_groups, }; static __init int bdi_class_init(void) { int ret; ret = class_register(&bdi_class); if (ret) return ret; bdi_debug_init(); return 0; } postcore_initcall(bdi_class_init); static int __init default_bdi_init(void) { bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_SYSFS, 0); if (!bdi_wq) return -ENOMEM; return 0; } subsys_initcall(default_bdi_init); static void wb_update_bandwidth_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(to_delayed_work(work), struct bdi_writeback, bw_dwork); wb_update_bandwidth(wb); } /* * Initial write bandwidth: 100 MB/s */ 
#define INIT_BW MB_TO_PAGES(100) static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, gfp_t gfp) { int err; memset(wb, 0, sizeof(*wb)); wb->bdi = bdi; wb->last_old_flush = jiffies; INIT_LIST_HEAD(&wb->b_dirty); INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); INIT_LIST_HEAD(&wb->b_dirty_time); spin_lock_init(&wb->list_lock); atomic_set(&wb->writeback_inodes, 0); wb->bw_time_stamp = jiffies; wb->balanced_dirty_ratelimit = INIT_BW; wb->dirty_ratelimit = INIT_BW; wb->write_bandwidth = INIT_BW; wb->avg_write_bandwidth = INIT_BW; spin_lock_init(&wb->work_lock); INIT_LIST_HEAD(&wb->work_list); INIT_DELAYED_WORK(&wb->dwork, wb_workfn); INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn); err = fprop_local_init_percpu(&wb->completions, gfp); if (err) return err; err = percpu_counter_init_many(wb->stat, 0, gfp, NR_WB_STAT_ITEMS); if (err) fprop_local_destroy_percpu(&wb->completions); return err; } static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb); /* * Remove bdi from the global list and shutdown any threads we have running */ static void wb_shutdown(struct bdi_writeback *wb) { /* Make sure nobody queues further work */ spin_lock_irq(&wb->work_lock); if (!test_and_clear_bit(WB_registered, &wb->state)) { spin_unlock_irq(&wb->work_lock); return; } spin_unlock_irq(&wb->work_lock); cgwb_remove_from_bdi_list(wb); /* * Drain work list and shutdown the delayed_work. !WB_registered * tells wb_workfn() that @wb is dying and its work_list needs to * be drained no matter what. */ mod_delayed_work(bdi_wq, &wb->dwork, 0); flush_delayed_work(&wb->dwork); WARN_ON(!list_empty(&wb->work_list)); flush_delayed_work(&wb->bw_dwork); } static void wb_exit(struct bdi_writeback *wb) { WARN_ON(delayed_work_pending(&wb->dwork)); percpu_counter_destroy_many(wb->stat, NR_WB_STAT_ITEMS); fprop_local_destroy_percpu(&wb->completions); } #ifdef CONFIG_CGROUP_WRITEBACK #include <linux/memcontrol.h> /* * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and * memcg->cgwb_list. bdi->cgwb_tree is also RCU protected. 
*/ static DEFINE_SPINLOCK(cgwb_lock); static struct workqueue_struct *cgwb_release_wq; static LIST_HEAD(offline_cgwbs); static void cleanup_offline_cgwbs_workfn(struct work_struct *work); static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn); static void cgwb_free_rcu(struct rcu_head *rcu_head) { struct bdi_writeback *wb = container_of(rcu_head, struct bdi_writeback, rcu); percpu_ref_exit(&wb->refcnt); kfree(wb); } static void cgwb_release_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(work, struct bdi_writeback, release_work); struct backing_dev_info *bdi = wb->bdi; mutex_lock(&wb->bdi->cgwb_release_mutex); wb_shutdown(wb); css_put(wb->memcg_css); /* triggers blkg destruction if no online users left */ blkcg_unpin_online(wb->blkcg_css); css_put(wb->blkcg_css); mutex_unlock(&wb->bdi->cgwb_release_mutex); fprop_local_destroy_percpu(&wb->memcg_completions); spin_lock_irq(&cgwb_lock); list_del(&wb->offline_node); spin_unlock_irq(&cgwb_lock); wb_exit(wb); bdi_put(bdi); WARN_ON_ONCE(!list_empty(&wb->b_attached)); WARN_ON_ONCE(work_pending(&wb->switch_work)); call_rcu(&wb->rcu, cgwb_free_rcu); } static void cgwb_release(struct percpu_ref *refcnt) { struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback, refcnt); queue_work(cgwb_release_wq, &wb->release_work); } static void cgwb_kill(struct bdi_writeback *wb) { lockdep_assert_held(&cgwb_lock); WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id)); list_del(&wb->memcg_node); list_del(&wb->blkcg_node); list_add(&wb->offline_node, &offline_cgwbs); percpu_ref_kill(&wb->refcnt); } static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) { spin_lock_irq(&cgwb_lock); list_del_rcu(&wb->bdi_node); spin_unlock_irq(&cgwb_lock); } static int cgwb_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp) { struct mem_cgroup *memcg; struct cgroup_subsys_state *blkcg_css; struct list_head *memcg_cgwb_list, *blkcg_cgwb_list; struct bdi_writeback *wb; unsigned long flags; int ret = 0; memcg = mem_cgroup_from_css(memcg_css); blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); memcg_cgwb_list = &memcg->cgwb_list; blkcg_cgwb_list = blkcg_get_cgwb_list(blkcg_css); /* look up again under lock and discard on blkcg mismatch */ spin_lock_irqsave(&cgwb_lock, flags); wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); if (wb && wb->blkcg_css != blkcg_css) { cgwb_kill(wb); wb = NULL; } spin_unlock_irqrestore(&cgwb_lock, flags); if (wb) goto out_put; /* need to create a new one */ wb = kmalloc_obj(*wb, gfp); if (!wb) { ret = -ENOMEM; goto out_put; } ret = wb_init(wb, bdi, gfp); if (ret) goto err_free; ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp); if (ret) goto err_wb_exit; ret = fprop_local_init_percpu(&wb->memcg_completions, gfp); if (ret) goto err_ref_exit; wb->memcg_css = memcg_css; wb->blkcg_css = blkcg_css; INIT_LIST_HEAD(&wb->b_attached); INIT_WORK(&wb->switch_work, inode_switch_wbs_work_fn); init_llist_head(&wb->switch_wbs_ctxs); INIT_WORK(&wb->release_work, cgwb_release_workfn); set_bit(WB_registered, &wb->state); bdi_get(bdi); /* * The root wb determines the registered state of the whole bdi and * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate * whether they're still online. Don't link @wb if any is dead. * See wb_memcg_offline() and wb_blkcg_offline(). 
*/ ret = -ENODEV; spin_lock_irqsave(&cgwb_lock, flags); if (test_bit(WB_registered, &bdi->wb.state) && blkcg_cgwb_list->next && memcg_cgwb_list->next) { /* we might have raced another instance of this function */ ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb); if (!ret) { list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); list_add(&wb->memcg_node, memcg_cgwb_list); list_add(&wb->blkcg_node, blkcg_cgwb_list); blkcg_pin_online(blkcg_css); css_get(memcg_css); css_get(blkcg_css); } } spin_unlock_irqrestore(&cgwb_lock, flags); if (ret) { if (ret == -EEXIST) ret = 0; goto err_fprop_exit; } goto out_put; err_fprop_exit: bdi_put(bdi); fprop_local_destroy_percpu(&wb->memcg_completions); err_ref_exit: percpu_ref_exit(&wb->refcnt); err_wb_exit: wb_exit(wb); err_free: kfree(wb); out_put: css_put(blkcg_css); return ret; } /** * wb_get_lookup - get wb for a given memcg * @bdi: target bdi * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) * * Try to get the wb for @memcg_css on @bdi. The returned wb has its * refcount incremented. * * This function uses css_get() on @memcg_css and thus expects its refcnt * to be positive on invocation. IOW, rcu_read_lock() protection on * @memcg_css isn't enough. try_get it before calling this function. * * A wb is keyed by its associated memcg. As blkcg implicitly enables * memcg on the default hierarchy, memcg association is guaranteed to be * more specific (equal or descendant to the associated blkcg) and thus can * identify both the memcg and blkcg associations. * * Because the blkcg associated with a memcg may change as blkcg is enabled * and disabled closer to root in the hierarchy, each wb keeps track of * both the memcg and blkcg associated with it and verifies the blkcg on * each lookup. On mismatch, the existing wb is discarded and a new one is * created. */ struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css) { struct bdi_writeback *wb; if (!memcg_css->parent) return &bdi->wb; rcu_read_lock(); wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); if (wb) { struct cgroup_subsys_state *blkcg_css; /* see whether the blkcg association has changed */ blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb))) wb = NULL; css_put(blkcg_css); } rcu_read_unlock(); return wb; } /** * wb_get_create - get wb for a given memcg, create if necessary * @bdi: target bdi * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) * @gfp: allocation mask to use * * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to * create one. See wb_get_lookup() for more details. 
*/ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp) { struct bdi_writeback *wb; might_alloc(gfp); do { wb = wb_get_lookup(bdi, memcg_css); } while (!wb && !cgwb_create(bdi, memcg_css, gfp)); return wb; } static int cgwb_bdi_init(struct backing_dev_info *bdi) { int ret; INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); mutex_init(&bdi->cgwb_release_mutex); init_rwsem(&bdi->wb_switch_rwsem); ret = wb_init(&bdi->wb, bdi, GFP_KERNEL); if (!ret) { bdi->wb.memcg_css = &root_mem_cgroup->css; bdi->wb.blkcg_css = blkcg_root_css; INIT_WORK(&bdi->wb.switch_work, inode_switch_wbs_work_fn); init_llist_head(&bdi->wb.switch_wbs_ctxs); } return ret; } static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { struct radix_tree_iter iter; void **slot; struct bdi_writeback *wb; WARN_ON(test_bit(WB_registered, &bdi->wb.state)); spin_lock_irq(&cgwb_lock); radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) cgwb_kill(*slot); spin_unlock_irq(&cgwb_lock); mutex_lock(&bdi->cgwb_release_mutex); spin_lock_irq(&cgwb_lock); while (!list_empty(&bdi->wb_list)) { wb = list_first_entry(&bdi->wb_list, struct bdi_writeback, bdi_node); spin_unlock_irq(&cgwb_lock); wb_shutdown(wb); spin_lock_irq(&cgwb_lock); } spin_unlock_irq(&cgwb_lock); mutex_unlock(&bdi->cgwb_release_mutex); } /* * cleanup_offline_cgwbs_workfn - try to release dying cgwbs * * Try to release dying cgwbs by switching attached inodes to the nearest * living ancestor's writeback. Processed wbs are placed at the end * of the list to guarantee the forward progress. */ static void cleanup_offline_cgwbs_workfn(struct work_struct *work) { struct bdi_writeback *wb; LIST_HEAD(processed); spin_lock_irq(&cgwb_lock); while (!list_empty(&offline_cgwbs)) { wb = list_first_entry(&offline_cgwbs, struct bdi_writeback, offline_node); list_move(&wb->offline_node, &processed); /* * If wb is dirty, cleaning up the writeback by switching * attached inodes will result in an effective removal of any * bandwidth restrictions, which isn't the goal. Instead, * it can be postponed until the next time, when all io * will be likely completed. If in the meantime some inodes * will get re-dirtied, they should be eventually switched to * a new cgwb. */ if (wb_has_dirty_io(wb)) continue; if (!wb_tryget(wb)) continue; spin_unlock_irq(&cgwb_lock); while (cleanup_offline_cgwb(wb)) cond_resched(); spin_lock_irq(&cgwb_lock); wb_put(wb); } if (!list_empty(&processed)) list_splice_tail(&processed, &offline_cgwbs); spin_unlock_irq(&cgwb_lock); } /** * wb_memcg_offline - kill all wb's associated with a memcg being offlined * @memcg: memcg being offlined * * Also prevents creation of any new wb's associated with @memcg. */ void wb_memcg_offline(struct mem_cgroup *memcg) { struct list_head *memcg_cgwb_list = &memcg->cgwb_list; struct bdi_writeback *wb, *next; spin_lock_irq(&cgwb_lock); list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node) cgwb_kill(wb); memcg_cgwb_list->next = NULL; /* prevent new wb's */ spin_unlock_irq(&cgwb_lock); queue_work(system_dfl_wq, &cleanup_offline_cgwbs_work); } /** * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined * @css: blkcg being offlined * * Also prevents creation of any new wb's associated with @blkcg. 
*/ void wb_blkcg_offline(struct cgroup_subsys_state *css) { struct bdi_writeback *wb, *next; struct list_head *list = blkcg_get_cgwb_list(css); spin_lock_irq(&cgwb_lock); list_for_each_entry_safe(wb, next, list, blkcg_node) cgwb_kill(wb); list->next = NULL; /* prevent new wb's */ spin_unlock_irq(&cgwb_lock); } static void cgwb_bdi_register(struct backing_dev_info *bdi) { spin_lock_irq(&cgwb_lock); list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); spin_unlock_irq(&cgwb_lock); } static int __init cgwb_init(void) { /* * There can be many concurrent release work items overwhelming * system_percpu_wq. Put them in a separate wq and limit concurrency. * There's no point in executing many of these in parallel. */ cgwb_release_wq = alloc_workqueue("cgwb_release", WQ_PERCPU, 1); if (!cgwb_release_wq) return -ENOMEM; return 0; } subsys_initcall(cgwb_init); #else /* CONFIG_CGROUP_WRITEBACK */ static int cgwb_bdi_init(struct backing_dev_info *bdi) { return wb_init(&bdi->wb, bdi, GFP_KERNEL); } static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { } static void cgwb_bdi_register(struct backing_dev_info *bdi) { list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); } static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) { list_del_rcu(&wb->bdi_node); } #endif /* CONFIG_CGROUP_WRITEBACK */ int bdi_init(struct backing_dev_info *bdi) { bdi->dev = NULL; kref_init(&bdi->refcnt); bdi->min_ratio = 0; bdi->max_ratio = 100 * BDI_RATIO_SCALE; bdi->max_prop_frac = FPROP_FRAC_BASE; INIT_LIST_HEAD(&bdi->bdi_list); INIT_LIST_HEAD(&bdi->wb_list); init_waitqueue_head(&bdi->wb_waitq); bdi->last_bdp_sleep = jiffies; return cgwb_bdi_init(bdi); } struct backing_dev_info *bdi_alloc(int node_id) { struct backing_dev_info *bdi; bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id); if (!bdi) return NULL; if (bdi_init(bdi)) { kfree(bdi); return NULL; } bdi->capabilities = BDI_CAP_WRITEBACK; bdi->ra_pages = VM_READAHEAD_PAGES; bdi->io_pages = VM_READAHEAD_PAGES; return bdi; } EXPORT_SYMBOL(bdi_alloc); static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp) { struct rb_node **p = &bdi_tree.rb_node; struct rb_node *parent = NULL; struct backing_dev_info *bdi; lockdep_assert_held(&bdi_lock); while (*p) { parent = *p; bdi = rb_entry(parent, struct backing_dev_info, rb_node); if (bdi->id > id) p = &(*p)->rb_left; else if (bdi->id < id) p = &(*p)->rb_right; else break; } if (parentp) *parentp = parent; return p; } /** * bdi_get_by_id - lookup and get bdi from its id * @id: bdi id to lookup * * Find bdi matching @id and get it. Returns NULL if the matching bdi * doesn't exist or is already unregistered. 
*/ struct backing_dev_info *bdi_get_by_id(u64 id) { struct backing_dev_info *bdi = NULL; struct rb_node **p; spin_lock_bh(&bdi_lock); p = bdi_lookup_rb_node(id, NULL); if (*p) { bdi = rb_entry(*p, struct backing_dev_info, rb_node); bdi_get(bdi); } spin_unlock_bh(&bdi_lock); return bdi; } int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) { struct device *dev; struct rb_node *parent, **p; if (bdi->dev) /* The driver needs to use separate queues per device */ return 0; vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args); dev = device_create(&bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name); if (IS_ERR(dev)) return PTR_ERR(dev); cgwb_bdi_register(bdi); bdi->dev = dev; bdi_debug_register(bdi, dev_name(dev)); set_bit(WB_registered, &bdi->wb.state); spin_lock_bh(&bdi_lock); bdi->id = ++bdi_id_cursor; p = bdi_lookup_rb_node(bdi->id, &parent); rb_link_node(&bdi->rb_node, parent, p); rb_insert_color(&bdi->rb_node, &bdi_tree); list_add_tail_rcu(&bdi->bdi_list, &bdi_list); spin_unlock_bh(&bdi_lock); trace_writeback_bdi_register(bdi); return 0; } int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...) { va_list args; int ret; va_start(args, fmt); ret = bdi_register_va(bdi, fmt, args); va_end(args); return ret; } EXPORT_SYMBOL(bdi_register); void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner) { WARN_ON_ONCE(bdi->owner); bdi->owner = owner; get_device(owner); } /* * Remove bdi from bdi_list, and ensure that it is no longer visible */ static void bdi_remove_from_list(struct backing_dev_info *bdi) { spin_lock_bh(&bdi_lock); rb_erase(&bdi->rb_node, &bdi_tree); list_del_rcu(&bdi->bdi_list); spin_unlock_bh(&bdi_lock); synchronize_rcu_expedited(); } void bdi_unregister(struct backing_dev_info *bdi) { /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); wb_shutdown(&bdi->wb); cgwb_bdi_unregister(bdi); /* * If this BDI's min ratio has been set, use bdi_set_min_ratio() to * update the global bdi_min_ratio. */ if (bdi->min_ratio) bdi_set_min_ratio(bdi, 0); if (bdi->dev) { bdi_debug_unregister(bdi); device_unregister(bdi->dev); bdi->dev = NULL; } if (bdi->owner) { put_device(bdi->owner); bdi->owner = NULL; } } EXPORT_SYMBOL(bdi_unregister); static void release_bdi(struct kref *ref) { struct backing_dev_info *bdi = container_of(ref, struct backing_dev_info, refcnt); WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state)); WARN_ON_ONCE(bdi->dev); wb_exit(&bdi->wb); kfree(bdi); } void bdi_put(struct backing_dev_info *bdi) { kref_put(&bdi->refcnt, release_bdi); } EXPORT_SYMBOL(bdi_put); struct backing_dev_info *inode_to_bdi(struct inode *inode) { struct super_block *sb; if (!inode) return &noop_backing_dev_info; sb = inode->i_sb; #ifdef CONFIG_BLOCK if (sb_is_blkdev_sb(sb)) return I_BDEV(inode)->bd_disk->bdi; #endif return sb->s_bdi; } EXPORT_SYMBOL(inode_to_bdi); const char *bdi_dev_name(struct backing_dev_info *bdi) { if (!bdi || !bdi->dev) return bdi_unknown_name; return bdi->dev_name; } EXPORT_SYMBOL_GPL(bdi_dev_name); |
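For context, here is a minimal userspace sketch (not part of the kernel sources above) that exercises the per-BDI sysfs attributes implemented in this file. The bdi name "8:0" and the 1 GiB limit are illustrative assumptions, and the write requires appropriate privileges; the program simply round-trips the max_bytes attribute, whose store handler parses a decimal byte count with kstrtoull() and may reject out-of-range values.

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *path = "/sys/class/bdi/8:0/max_bytes";	/* hypothetical device */
	unsigned long long max_bytes;
	FILE *f;

	/* Read the current ceiling reported by max_bytes_show(). */
	f = fopen(path, "r");
	if (!f) {
		perror("open for read");
		return EXIT_FAILURE;
	}
	if (fscanf(f, "%llu", &max_bytes) == 1)
		printf("current max_bytes: %llu\n", max_bytes);
	fclose(f);

	/* Ask for a 1 GiB ceiling; max_bytes_store() may refuse it. */
	f = fopen(path, "w");
	if (!f) {
		perror("open for write");
		return EXIT_FAILURE;
	}
	fprintf(f, "%llu\n", 1ULL << 30);
	fclose(f);
	return EXIT_SUCCESS;
}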
/* SPDX-License-Identifier: GPL-2.0 * * Network memory * * Author: Mina Almasry <almasrymina@google.com> */ #ifndef _NET_NETMEM_H #define _NET_NETMEM_H #include <linux/dma-mapping.h> #include <linux/mm.h> #include <net/net_debug.h> /* These fields in struct page are used by the page_pool and net stack: * * struct { * unsigned long pp_magic; * struct page_pool *pp; * unsigned long _pp_mapping_pad; * unsigned long dma_addr; * atomic_long_t pp_ref_count; * }; * * We mirror the page_pool fields here so the page_pool can access these * fields without worrying whether the underlying fields belong to a * page or netmem_desc. * * CAUTION: Do not update the fields in netmem_desc without also * updating the anonymous aliasing union in struct net_iov. */ struct netmem_desc { unsigned long _flags; unsigned long pp_magic; struct page_pool *pp; unsigned long _pp_mapping_pad; unsigned long dma_addr; atomic_long_t pp_ref_count; }; #define NETMEM_DESC_ASSERT_OFFSET(pg, desc) \ static_assert(offsetof(struct page, pg) == \ offsetof(struct netmem_desc, desc)) NETMEM_DESC_ASSERT_OFFSET(flags, _flags); NETMEM_DESC_ASSERT_OFFSET(pp_magic, pp_magic); NETMEM_DESC_ASSERT_OFFSET(pp, pp); NETMEM_DESC_ASSERT_OFFSET(_pp_mapping_pad, _pp_mapping_pad); NETMEM_DESC_ASSERT_OFFSET(dma_addr, dma_addr); NETMEM_DESC_ASSERT_OFFSET(pp_ref_count, pp_ref_count); #undef NETMEM_DESC_ASSERT_OFFSET /* * Since struct netmem_desc uses the space in struct page, the size * should be checked, until struct netmem_desc has its own instance from * slab, to avoid conflicting with other members within struct page. */ static_assert(sizeof(struct netmem_desc) <= offsetof(struct page, _refcount)); /* net_iov */ DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers); /* We overload the LSB of the struct page pointer to indicate whether it's * a page or net_iov.
*/ #define NET_IOV 0x01UL enum net_iov_type { NET_IOV_DMABUF, NET_IOV_IOURING, }; /* A memory descriptor representing abstract networking I/O vectors, * generally for non-pages memory that doesn't have its corresponding * struct page and needs to be explicitly allocated through slab. * * net_iovs are allocated and used by networking code, and the size of * the chunk is PAGE_SIZE. * * This memory can be any form of non-struct paged memory. Examples * include imported dmabuf memory and imported io_uring memory. See * net_iov_type for all the supported types. * * @pp_magic: pp field, similar to the one in struct page/struct * netmem_desc. * @pp: the pp this net_iov belongs to, if any. * @dma_addr: the dma addrs of the net_iov. Needed for the network * card to send/receive this net_iov. * @pp_ref_count: the pp ref count of this net_iov, exactly the same * usage as struct page/struct netmem_desc. * @owner: the net_iov_area this net_iov belongs to, if any. * @type: the type of the memory. Different types of net_iovs are * supported. */ struct net_iov { struct netmem_desc desc; unsigned int page_type; enum net_iov_type type; struct net_iov_area *owner; }; /* Make sure 'the offset of page_type in struct page == the offset of * type in struct net_iov'. */ #define NET_IOV_ASSERT_OFFSET(pg, iov) \ static_assert(offsetof(struct page, pg) == \ offsetof(struct net_iov, iov)) NET_IOV_ASSERT_OFFSET(page_type, page_type); #undef NET_IOV_ASSERT_OFFSET struct net_iov_area { /* Array of net_iovs for this area. */ struct net_iov *niovs; size_t num_niovs; /* Offset into the dma-buf where this chunk starts. */ unsigned long base_virtual; }; static inline struct net_iov_area *net_iov_owner(const struct net_iov *niov) { return niov->owner; } static inline unsigned int net_iov_idx(const struct net_iov *niov) { return niov - net_iov_owner(niov)->niovs; } /* netmem */ /** * typedef netmem_ref - a nonexistent type marking a reference to generic * network memory. * * A netmem_ref can be a struct page* or a struct net_iov* underneath. * * Use the supplied helpers to obtain the underlying memory pointer and fields. */ typedef unsigned long __bitwise netmem_ref; static inline bool netmem_is_net_iov(const netmem_ref netmem) { return (__force unsigned long)netmem & NET_IOV; } /** * __netmem_to_page - unsafely get pointer to the &page backing @netmem * @netmem: netmem reference to convert * * Unsafe version of netmem_to_page(). When @netmem is always page-backed, * e.g. when it's a header buffer, performs faster and generates smaller * object code (no check for the LSB, no WARN). When @netmem points to IOV, * provokes undefined behaviour. * * Return: pointer to the &page (garbage if @netmem is not page-backed). 
*/ static inline struct page *__netmem_to_page(netmem_ref netmem) { return (__force struct page *)netmem; } static inline struct page *netmem_to_page(netmem_ref netmem) { if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) return NULL; return __netmem_to_page(netmem); } static inline struct net_iov *netmem_to_net_iov(netmem_ref netmem) { if (netmem_is_net_iov(netmem)) return (struct net_iov *)((__force unsigned long)netmem & ~NET_IOV); DEBUG_NET_WARN_ON_ONCE(true); return NULL; } static inline netmem_ref net_iov_to_netmem(struct net_iov *niov) { return (__force netmem_ref)((unsigned long)niov | NET_IOV); } #define page_to_netmem(p) (_Generic((p), \ const struct page * : (__force const netmem_ref)(p), \ struct page * : (__force netmem_ref)(p))) /** * virt_to_netmem - convert virtual memory pointer to a netmem reference * @data: host memory pointer to convert * * Return: netmem reference to the &page backing this virtual address. */ static inline netmem_ref virt_to_netmem(const void *data) { return page_to_netmem(virt_to_page(data)); } static inline int netmem_ref_count(netmem_ref netmem) { /* The non-pp refcount of net_iov is always 1. On net_iov, we only * support pp refcounting which uses the pp_ref_count field. */ if (netmem_is_net_iov(netmem)) return 1; return page_ref_count(netmem_to_page(netmem)); } static inline unsigned long netmem_pfn_trace(netmem_ref netmem) { if (netmem_is_net_iov(netmem)) return 0; return page_to_pfn(netmem_to_page(netmem)); } /* XXX: How to extract netmem_desc from page must be changed, once * netmem_desc no longer overlays on page and will be allocated through * slab. */ #define __pp_page_to_nmdesc(p) (_Generic((p), \ const struct page * : (const struct netmem_desc *)(p), \ struct page * : (struct netmem_desc *)(p))) /* CAUTION: Check if the page is a pp page before calling this helper or * know it's a pp page. */ #define pp_page_to_nmdesc(p) \ ({ \ DEBUG_NET_WARN_ON_ONCE(!PageNetpp(p)); \ __pp_page_to_nmdesc(p); \ }) /** * __netmem_to_nmdesc - unsafely get pointer to the &netmem_desc backing * @netmem * @netmem: netmem reference to convert * * Unsafe version that can be used only when @netmem is always backed by * system memory, performs faster and generates smaller object code (no * check for the LSB, no WARN). When @netmem points to IOV, provokes * undefined behaviour. * * Return: pointer to the &netmem_desc (garbage if @netmem is not backed * by system memory). */ static inline struct netmem_desc *__netmem_to_nmdesc(netmem_ref netmem) { return (__force struct netmem_desc *)netmem; } /* netmem_to_nmdesc - convert netmem_ref to struct netmem_desc * for * access to common fields. * @netmem: netmem reference to get netmem_desc. * * All the sub types of netmem_ref (netmem_desc, net_iov) have the same * pp, pp_magic, dma_addr, and pp_ref_count fields via netmem_desc. * * Return: the pointer to struct netmem_desc * regardless of its * underlying type. */ static inline struct netmem_desc *netmem_to_nmdesc(netmem_ref netmem) { void *p = (void *)((__force unsigned long)netmem & ~NET_IOV); if (netmem_is_net_iov(netmem)) return &((struct net_iov *)p)->desc; return __pp_page_to_nmdesc((struct page *)p); } /** * __netmem_get_pp - unsafely get pointer to the &page_pool backing @netmem * @netmem: netmem reference to get the pointer from * * Unsafe version of netmem_get_pp(). When @netmem is always page-backed, * e.g. when it's a header buffer, performs faster and generates smaller * object code (avoids clearing the LSB). 
When @netmem points to IOV, * provokes invalid memory access. * * Return: pointer to the &page_pool (garbage if @netmem is not page-backed). */ static inline struct page_pool *__netmem_get_pp(netmem_ref netmem) { return __netmem_to_nmdesc(netmem)->pp; } static inline struct page_pool *netmem_get_pp(netmem_ref netmem) { return netmem_to_nmdesc(netmem)->pp; } static inline atomic_long_t *netmem_get_pp_ref_count_ref(netmem_ref netmem) { return &netmem_to_nmdesc(netmem)->pp_ref_count; } static inline bool netmem_is_pref_nid(netmem_ref netmem, int pref_nid) { /* NUMA node preference only makes sense if we're allocating * system memory. Memory providers (which give us net_iovs) * choose for us. */ if (netmem_is_net_iov(netmem)) return true; return page_to_nid(netmem_to_page(netmem)) == pref_nid; } static inline netmem_ref netmem_compound_head(netmem_ref netmem) { /* niov are never compounded */ if (netmem_is_net_iov(netmem)) return netmem; return page_to_netmem(compound_head(netmem_to_page(netmem))); } /** * __netmem_address - unsafely get pointer to the memory backing @netmem * @netmem: netmem reference to get the pointer for * * Unsafe version of netmem_address(). When @netmem is always page-backed, * e.g. when it's a header buffer, performs faster and generates smaller * object code (no check for the LSB). When @netmem points to IOV, provokes * undefined behaviour. * * Return: pointer to the memory (garbage if @netmem is not page-backed). */ static inline void *__netmem_address(netmem_ref netmem) { return page_address(__netmem_to_page(netmem)); } static inline void *netmem_address(netmem_ref netmem) { if (netmem_is_net_iov(netmem)) return NULL; return __netmem_address(netmem); } /** * netmem_is_pfmemalloc - check if @netmem was allocated under memory pressure * @netmem: netmem reference to check * * Return: true if @netmem is page-backed and the page was allocated under * memory pressure, false otherwise. */ static inline bool netmem_is_pfmemalloc(netmem_ref netmem) { if (netmem_is_net_iov(netmem)) return false; return page_is_pfmemalloc(netmem_to_page(netmem)); } static inline unsigned long netmem_get_dma_addr(netmem_ref netmem) { return netmem_to_nmdesc(netmem)->dma_addr; } #if defined(CONFIG_NET_DEVMEM) static inline bool net_is_devmem_iov(const struct net_iov *niov) { return niov->type == NET_IOV_DMABUF; } #else static inline bool net_is_devmem_iov(const struct net_iov *niov) { return false; } #endif void __get_netmem(netmem_ref netmem); void __put_netmem(netmem_ref netmem); static __always_inline void get_netmem(netmem_ref netmem) { if (netmem_is_net_iov(netmem)) __get_netmem(netmem); else get_page(netmem_to_page(netmem)); } static __always_inline void put_netmem(netmem_ref netmem) { if (netmem_is_net_iov(netmem)) __put_netmem(netmem); else put_page(netmem_to_page(netmem)); } #define netmem_dma_unmap_addr_set(NETMEM, PTR, ADDR_NAME, VAL) \ do { \ if (!netmem_is_net_iov(NETMEM)) \ dma_unmap_addr_set(PTR, ADDR_NAME, VAL); \ else \ dma_unmap_addr_set(PTR, ADDR_NAME, 0); \ } while (0) static inline void netmem_dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { if (!addr) return; dma_unmap_page_attrs(dev, addr, size, dir, attrs); } #endif /* _NET_NETMEM_H */ |
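A hedged usage illustration (not part of the header above): a hypothetical driver-side helper showing the intended pattern for these accessors. net_iov-backed netmem has no kernel virtual address, so netmem_address() returns NULL and the payload must not be touched through the CPU.

#include <linux/errno.h>
#include <linux/string.h>
#include <net/netmem.h>

/* Hypothetical helper: copy @len bytes from the start of @netmem into @dst. */
static int example_copy_from_netmem(netmem_ref netmem, void *dst, size_t len)
{
	void *va = netmem_address(netmem);	/* NULL for net_iov-backed memory */

	if (!va)
		return -EFAULT;	/* e.g. dmabuf/device memory, not CPU-readable */

	memcpy(dst, va, len);
	return 0;
}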
// SPDX-License-Identifier: GPL-2.0+ /* * NILFS dat/inode allocator * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * * Originally written by Koji Sato. * Two allocators were unified by Ryusuke Konishi and Amagai Yoshiji. */ #include <linux/types.h> #include <linux/buffer_head.h> #include <linux/fs.h> #include <linux/bitops.h> #include <linux/slab.h> #include "mdt.h" #include "alloc.h" /** * nilfs_palloc_groups_per_desc_block - get the number of groups that a group * descriptor block can maintain * @inode: inode of metadata file using this allocator * * Return: Number of groups that a group descriptor block can maintain. */ static inline unsigned long nilfs_palloc_groups_per_desc_block(const struct inode *inode) { return i_blocksize(inode) / sizeof(struct nilfs_palloc_group_desc); } /** * nilfs_palloc_groups_count - get maximum number of groups * @inode: inode of metadata file using this allocator * * Return: Maximum number of groups. */ static inline unsigned long nilfs_palloc_groups_count(const struct inode *inode) { return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */)); } /** * nilfs_palloc_init_blockgroup - initialize private variables for allocator * @inode: inode of metadata file using this allocator * @entry_size: size of the persistent object * * Return: 0 on success, or a negative error code on failure. */ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned int entry_size) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); mi->mi_bgl = kmalloc_obj(*mi->mi_bgl, GFP_NOFS); if (!mi->mi_bgl) return -ENOMEM; bgl_lock_init(mi->mi_bgl); nilfs_mdt_set_entry_size(inode, entry_size, 0); mi->mi_blocks_per_group = DIV_ROUND_UP(nilfs_palloc_entries_per_group(inode), mi->mi_entries_per_block) + 1; /* * Number of blocks in a group including entry blocks * and a bitmap block */ mi->mi_blocks_per_desc_block = nilfs_palloc_groups_per_desc_block(inode) * mi->mi_blocks_per_group + 1; /* * Number of blocks per descriptor including the * descriptor block */ return 0; } /** * nilfs_palloc_group - get group number and offset from an entry number * @inode: inode of metadata file using this allocator * @nr: serial number of the entry (e.g. inode number) * @offset: pointer to store offset number in the group * * Return: Number of the group that contains the entry with the index * specified by @nr. */ static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr, unsigned long *offset) { __u64 group = nr; *offset = do_div(group, nilfs_palloc_entries_per_group(inode)); return group; } /** * nilfs_palloc_desc_blkoff - get block offset of a group descriptor block * @inode: inode of metadata file using this allocator * @group: group number * * Return: Index number in the metadata file of the descriptor block of * the group specified by @group.
*/ static unsigned long nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group) { unsigned long desc_block = group / nilfs_palloc_groups_per_desc_block(inode); return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block; } /** * nilfs_palloc_bitmap_blkoff - get block offset of a bitmap block * @inode: inode of metadata file using this allocator * @group: group number * * nilfs_palloc_bitmap_blkoff() returns block offset of the bitmap * block used to allocate/deallocate entries in the specified group. * * Return: Index number in the metadata file of the bitmap block of * the group specified by @group. */ static unsigned long nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group) { unsigned long desc_offset = group % nilfs_palloc_groups_per_desc_block(inode); return nilfs_palloc_desc_blkoff(inode, group) + 1 + desc_offset * NILFS_MDT(inode)->mi_blocks_per_group; } /** * nilfs_palloc_group_desc_nfrees - get the number of free entries in a group * @desc: pointer to descriptor structure for the group * @lock: spin lock protecting @desc * * Return: Number of free entries written in the group descriptor @desc. */ static unsigned long nilfs_palloc_group_desc_nfrees(const struct nilfs_palloc_group_desc *desc, spinlock_t *lock) { unsigned long nfree; spin_lock(lock); nfree = le32_to_cpu(desc->pg_nfrees); spin_unlock(lock); return nfree; } /** * nilfs_palloc_group_desc_add_entries - adjust count of free entries * @desc: pointer to descriptor structure for the group * @lock: spin lock protecting @desc * @n: delta to be added * * Return: Number of free entries after adjusting the group descriptor * @desc. */ static u32 nilfs_palloc_group_desc_add_entries(struct nilfs_palloc_group_desc *desc, spinlock_t *lock, u32 n) { u32 nfree; spin_lock(lock); le32_add_cpu(&desc->pg_nfrees, n); nfree = le32_to_cpu(desc->pg_nfrees); spin_unlock(lock); return nfree; } /** * nilfs_palloc_entry_blkoff - get block offset of an entry block * @inode: inode of metadata file using this allocator * @nr: serial number of the entry (e.g. inode number) * * Return: Index number in the metadata file of the block containing * the entry specified by @nr. */ static unsigned long nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr) { unsigned long group, group_offset; group = nilfs_palloc_group(inode, nr, &group_offset); return nilfs_palloc_bitmap_blkoff(inode, group) + 1 + group_offset / NILFS_MDT(inode)->mi_entries_per_block; } /** * nilfs_palloc_desc_block_init - initialize buffer of a group descriptor block * @inode: inode of metadata file * @bh: buffer head of the buffer to be initialized * @from: kernel address mapped for a chunk of the block * * This function does not yet support the case where block size > PAGE_SIZE. 
*/ static void nilfs_palloc_desc_block_init(struct inode *inode, struct buffer_head *bh, void *from) { struct nilfs_palloc_group_desc *desc = from; unsigned long n = nilfs_palloc_groups_per_desc_block(inode); __le32 nfrees; nfrees = cpu_to_le32(nilfs_palloc_entries_per_group(inode)); while (n-- > 0) { desc->pg_nfrees = nfrees; desc++; } } static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff, int create, void (*init_block)(struct inode *, struct buffer_head *, void *), struct buffer_head **bhp, struct nilfs_bh_assoc *prev, spinlock_t *lock) { int ret; spin_lock(lock); if (prev->bh && blkoff == prev->blkoff && likely(buffer_uptodate(prev->bh))) { get_bh(prev->bh); *bhp = prev->bh; spin_unlock(lock); return 0; } spin_unlock(lock); ret = nilfs_mdt_get_block(inode, blkoff, create, init_block, bhp); if (!ret) { spin_lock(lock); /* * The following code must be safe for change of the * cache contents during the get block call. */ brelse(prev->bh); get_bh(*bhp); prev->bh = *bhp; prev->blkoff = blkoff; spin_unlock(lock); } return ret; } /** * nilfs_palloc_delete_block - delete a block on the persistent allocator file * @inode: inode of metadata file using this allocator * @blkoff: block offset * @prev: nilfs_bh_assoc struct of the last used buffer * @lock: spin lock protecting @prev * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOENT - Non-existent block. * * %-ENOMEM - Insufficient memory available. */ static int nilfs_palloc_delete_block(struct inode *inode, unsigned long blkoff, struct nilfs_bh_assoc *prev, spinlock_t *lock) { spin_lock(lock); if (prev->bh && blkoff == prev->blkoff) { brelse(prev->bh); prev->bh = NULL; } spin_unlock(lock); return nilfs_mdt_delete_block(inode, blkoff); } /** * nilfs_palloc_get_desc_block - get buffer head of a group descriptor block * @inode: inode of metadata file using this allocator * @group: group number * @create: create flag * @bhp: pointer to store the resultant buffer head * * Return: 0 on success, or a negative error code on failure. */ static int nilfs_palloc_get_desc_block(struct inode *inode, unsigned long group, int create, struct buffer_head **bhp) { struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; return nilfs_palloc_get_block(inode, nilfs_palloc_desc_blkoff(inode, group), create, nilfs_palloc_desc_block_init, bhp, &cache->prev_desc, &cache->lock); } /** * nilfs_palloc_get_bitmap_block - get buffer head of a bitmap block * @inode: inode of metadata file using this allocator * @group: group number * @create: create flag * @bhp: pointer to store the resultant buffer head * * Return: 0 on success, or a negative error code on failure. */ static int nilfs_palloc_get_bitmap_block(struct inode *inode, unsigned long group, int create, struct buffer_head **bhp) { struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; return nilfs_palloc_get_block(inode, nilfs_palloc_bitmap_blkoff(inode, group), create, NULL, bhp, &cache->prev_bitmap, &cache->lock); } /** * nilfs_palloc_delete_bitmap_block - delete a bitmap block * @inode: inode of metadata file using this allocator * @group: group number * * Return: 0 on success, or a negative error code on failure. 
*/ static int nilfs_palloc_delete_bitmap_block(struct inode *inode, unsigned long group) { struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; return nilfs_palloc_delete_block(inode, nilfs_palloc_bitmap_blkoff(inode, group), &cache->prev_bitmap, &cache->lock); } /** * nilfs_palloc_get_entry_block - get buffer head of an entry block * @inode: inode of metadata file using this allocator * @nr: serial number of the entry (e.g. inode number) * @create: create flag * @bhp: pointer to store the resultant buffer head * * Return: 0 on success, or a negative error code on failure. */ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, int create, struct buffer_head **bhp) { struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; return nilfs_palloc_get_block(inode, nilfs_palloc_entry_blkoff(inode, nr), create, NULL, bhp, &cache->prev_entry, &cache->lock); } /** * nilfs_palloc_delete_entry_block - delete an entry block * @inode: inode of metadata file using this allocator * @nr: serial number of the entry * * Return: 0 on success, or a negative error code on failure. */ static int nilfs_palloc_delete_entry_block(struct inode *inode, __u64 nr) { struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; return nilfs_palloc_delete_block(inode, nilfs_palloc_entry_blkoff(inode, nr), &cache->prev_entry, &cache->lock); } /** * nilfs_palloc_group_desc_offset - calculate the byte offset of a group * descriptor in the folio containing it * @inode: inode of metadata file using this allocator * @group: group number * @bh: buffer head of the group descriptor block * * Return: Byte offset in the folio of the group descriptor for @group. */ static size_t nilfs_palloc_group_desc_offset(const struct inode *inode, unsigned long group, const struct buffer_head *bh) { return offset_in_folio(bh->b_folio, bh->b_data) + sizeof(struct nilfs_palloc_group_desc) * (group % nilfs_palloc_groups_per_desc_block(inode)); } /** * nilfs_palloc_bitmap_offset - calculate the byte offset of a bitmap block * in the folio containing it * @bh: buffer head of the bitmap block * * Return: Byte offset in the folio of the bitmap block for @bh. */ static size_t nilfs_palloc_bitmap_offset(const struct buffer_head *bh) { return offset_in_folio(bh->b_folio, bh->b_data); } /** * nilfs_palloc_entry_offset - calculate the byte offset of an entry in the * folio containing it * @inode: inode of metadata file using this allocator * @nr: serial number of the entry (e.g. inode number) * @bh: buffer head of the entry block * * Return: Byte offset in the folio of the entry @nr. */ size_t nilfs_palloc_entry_offset(const struct inode *inode, __u64 nr, const struct buffer_head *bh) { unsigned long entry_index_in_group, entry_index_in_block; nilfs_palloc_group(inode, nr, &entry_index_in_group); entry_index_in_block = entry_index_in_group % NILFS_MDT(inode)->mi_entries_per_block; return offset_in_folio(bh->b_folio, bh->b_data) + entry_index_in_block * NILFS_MDT(inode)->mi_entry_size; } /** * nilfs_palloc_find_available_slot - find available slot in a group * @bitmap: bitmap of the group * @target: offset number of an entry in the group (start point) * @bsize: size in bits * @lock: spin lock protecting @bitmap * @wrap: whether to wrap around * * Return: Offset number within the group of the found free entry, or * %-ENOSPC if not found. 
*/ static int nilfs_palloc_find_available_slot(unsigned char *bitmap, unsigned long target, unsigned int bsize, spinlock_t *lock, bool wrap) { int pos, end = bsize; if (likely(target < bsize)) { pos = target; do { pos = nilfs_find_next_zero_bit(bitmap, end, pos); if (pos >= end) break; if (!nilfs_set_bit_atomic(lock, pos, bitmap)) return pos; } while (++pos < end); end = target; } if (!wrap) return -ENOSPC; /* wrap around */ for (pos = 0; pos < end; pos++) { pos = nilfs_find_next_zero_bit(bitmap, end, pos); if (pos >= end) break; if (!nilfs_set_bit_atomic(lock, pos, bitmap)) return pos; } return -ENOSPC; } /** * nilfs_palloc_rest_groups_in_desc_block - get the remaining number of groups * in a group descriptor block * @inode: inode of metadata file using this allocator * @curr: current group number * @max: maximum number of groups * * Return: Number of remaining descriptors (= groups) managed by the descriptor * block. */ static unsigned long nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, unsigned long curr, unsigned long max) { return min_t(unsigned long, nilfs_palloc_groups_per_desc_block(inode) - curr % nilfs_palloc_groups_per_desc_block(inode), max - curr + 1); } /** * nilfs_palloc_count_desc_blocks - count descriptor blocks number * @inode: inode of metadata file using this allocator * @desc_blocks: descriptor blocks number [out] * * Return: 0 on success, or a negative error code on failure. */ static int nilfs_palloc_count_desc_blocks(struct inode *inode, unsigned long *desc_blocks) { __u64 blknum; int ret; ret = nilfs_bmap_last_key(NILFS_I(inode)->i_bmap, &blknum); if (likely(!ret)) *desc_blocks = DIV_ROUND_UP( (unsigned long)blknum, NILFS_MDT(inode)->mi_blocks_per_desc_block); return ret; } /** * nilfs_palloc_mdt_file_can_grow - check potential opportunity for * MDT file growing * @inode: inode of metadata file using this allocator * @desc_blocks: known current descriptor blocks count * * Return: true if a group can be added in the metadata file, false if not. */ static inline bool nilfs_palloc_mdt_file_can_grow(struct inode *inode, unsigned long desc_blocks) { return (nilfs_palloc_groups_per_desc_block(inode) * desc_blocks) < nilfs_palloc_groups_count(inode); } /** * nilfs_palloc_count_max_entries - count max number of entries that can be * described by descriptor blocks count * @inode: inode of metadata file using this allocator * @nused: current number of used entries * @nmaxp: max number of entries [out] * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. * * %-ERANGE - Number of entries in use is out of range. 
*/ int nilfs_palloc_count_max_entries(struct inode *inode, u64 nused, u64 *nmaxp) { unsigned long desc_blocks = 0; u64 entries_per_desc_block, nmax; int err; err = nilfs_palloc_count_desc_blocks(inode, &desc_blocks); if (unlikely(err)) return err; entries_per_desc_block = (u64)nilfs_palloc_entries_per_group(inode) * nilfs_palloc_groups_per_desc_block(inode); nmax = entries_per_desc_block * desc_blocks; if (nused == nmax && nilfs_palloc_mdt_file_can_grow(inode, desc_blocks)) nmax += entries_per_desc_block; if (nused > nmax) return -ERANGE; *nmaxp = nmax; return 0; } /** * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object * @inode: inode of metadata file using this allocator * @req: nilfs_palloc_req structure exchanged for the allocation * @wrap: whether to wrap around * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. * * %-ENOSPC - Entries exhausted (No entries available for allocation). * * %-EROFS - Read only filesystem */ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, struct nilfs_palloc_req *req, bool wrap) { struct buffer_head *desc_bh, *bitmap_bh; struct nilfs_palloc_group_desc *desc; unsigned char *bitmap; size_t doff, boff; unsigned long group, maxgroup, ngroups; unsigned long group_offset, maxgroup_offset; unsigned long n, entries_per_group; unsigned long i, j; spinlock_t *lock; int pos, ret; ngroups = nilfs_palloc_groups_count(inode); maxgroup = ngroups - 1; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); entries_per_group = nilfs_palloc_entries_per_group(inode); for (i = 0; i < ngroups; i += n) { if (group >= ngroups && wrap) { /* wrap around */ group = 0; maxgroup = nilfs_palloc_group(inode, req->pr_entry_nr, &maxgroup_offset) - 1; } ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh); if (ret < 0) return ret; doff = nilfs_palloc_group_desc_offset(inode, group, desc_bh); desc = kmap_local_folio(desc_bh->b_folio, doff); n = nilfs_palloc_rest_groups_in_desc_block(inode, group, maxgroup); for (j = 0; j < n; j++, group++, group_offset = 0) { lock = nilfs_mdt_bgl_lock(inode, group); if (nilfs_palloc_group_desc_nfrees(&desc[j], lock) == 0) continue; kunmap_local(desc); ret = nilfs_palloc_get_bitmap_block(inode, group, 1, &bitmap_bh); if (unlikely(ret < 0)) { brelse(desc_bh); return ret; } /* * Re-kmap the folio containing the first (and * subsequent) group descriptors. */ desc = kmap_local_folio(desc_bh->b_folio, doff); boff = nilfs_palloc_bitmap_offset(bitmap_bh); bitmap = kmap_local_folio(bitmap_bh->b_folio, boff); pos = nilfs_palloc_find_available_slot( bitmap, group_offset, entries_per_group, lock, wrap); /* * Since the search for a free slot in the second and * subsequent bitmap blocks always starts from the * beginning, the wrap flag only has an effect on the * first search. 
*/ kunmap_local(bitmap); if (pos >= 0) goto found; brelse(bitmap_bh); } kunmap_local(desc); brelse(desc_bh); } /* no entries left */ return -ENOSPC; found: /* found a free entry */ nilfs_palloc_group_desc_add_entries(&desc[j], lock, -1); req->pr_entry_nr = entries_per_group * group + pos; kunmap_local(desc); req->pr_desc_bh = desc_bh; req->pr_bitmap_bh = bitmap_bh; return 0; } /** * nilfs_palloc_commit_alloc_entry - finish allocation of a persistent object * @inode: inode of metadata file using this allocator * @req: nilfs_palloc_req structure exchanged for the allocation */ void nilfs_palloc_commit_alloc_entry(struct inode *inode, struct nilfs_palloc_req *req) { mark_buffer_dirty(req->pr_bitmap_bh); mark_buffer_dirty(req->pr_desc_bh); nilfs_mdt_mark_dirty(inode); brelse(req->pr_bitmap_bh); brelse(req->pr_desc_bh); } /** * nilfs_palloc_commit_free_entry - finish deallocating a persistent object * @inode: inode of metadata file using this allocator * @req: nilfs_palloc_req structure exchanged for the removal */ void nilfs_palloc_commit_free_entry(struct inode *inode, struct nilfs_palloc_req *req) { unsigned long group, group_offset; size_t doff, boff; struct nilfs_palloc_group_desc *desc; unsigned char *bitmap; spinlock_t *lock; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); doff = nilfs_palloc_group_desc_offset(inode, group, req->pr_desc_bh); desc = kmap_local_folio(req->pr_desc_bh->b_folio, doff); boff = nilfs_palloc_bitmap_offset(req->pr_bitmap_bh); bitmap = kmap_local_folio(req->pr_bitmap_bh->b_folio, boff); lock = nilfs_mdt_bgl_lock(inode, group); if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) nilfs_warn(inode->i_sb, "%s (ino=%llu): entry number %llu already freed", __func__, inode->i_ino, (unsigned long long)req->pr_entry_nr); else nilfs_palloc_group_desc_add_entries(desc, lock, 1); kunmap_local(bitmap); kunmap_local(desc); mark_buffer_dirty(req->pr_desc_bh); mark_buffer_dirty(req->pr_bitmap_bh); nilfs_mdt_mark_dirty(inode); brelse(req->pr_bitmap_bh); brelse(req->pr_desc_bh); } /** * nilfs_palloc_abort_alloc_entry - cancel allocation of a persistent object * @inode: inode of metadata file using this allocator * @req: nilfs_palloc_req structure exchanged for the allocation */ void nilfs_palloc_abort_alloc_entry(struct inode *inode, struct nilfs_palloc_req *req) { struct nilfs_palloc_group_desc *desc; size_t doff, boff; unsigned char *bitmap; unsigned long group, group_offset; spinlock_t *lock; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); doff = nilfs_palloc_group_desc_offset(inode, group, req->pr_desc_bh); desc = kmap_local_folio(req->pr_desc_bh->b_folio, doff); boff = nilfs_palloc_bitmap_offset(req->pr_bitmap_bh); bitmap = kmap_local_folio(req->pr_bitmap_bh->b_folio, boff); lock = nilfs_mdt_bgl_lock(inode, group); if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) nilfs_warn(inode->i_sb, "%s (ino=%llu): entry number %llu already freed", __func__, inode->i_ino, (unsigned long long)req->pr_entry_nr); else nilfs_palloc_group_desc_add_entries(desc, lock, 1); kunmap_local(bitmap); kunmap_local(desc); brelse(req->pr_bitmap_bh); brelse(req->pr_desc_bh); req->pr_entry_nr = 0; req->pr_bitmap_bh = NULL; req->pr_desc_bh = NULL; } /** * nilfs_palloc_prepare_free_entry - prepare to deallocate a persistent object * @inode: inode of metadata file using this allocator * @req: nilfs_palloc_req structure exchanged for the removal * * Return: 0 on success, or a negative error code on failure. 
*/ int nilfs_palloc_prepare_free_entry(struct inode *inode, struct nilfs_palloc_req *req) { struct buffer_head *desc_bh, *bitmap_bh; unsigned long group, group_offset; int ret; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh); if (ret < 0) return ret; ret = nilfs_palloc_get_bitmap_block(inode, group, 1, &bitmap_bh); if (ret < 0) { brelse(desc_bh); return ret; } req->pr_desc_bh = desc_bh; req->pr_bitmap_bh = bitmap_bh; return 0; } /** * nilfs_palloc_abort_free_entry - cancel deallocating a persistent object * @inode: inode of metadata file using this allocator * @req: nilfs_palloc_req structure exchanged for the removal */ void nilfs_palloc_abort_free_entry(struct inode *inode, struct nilfs_palloc_req *req) { brelse(req->pr_bitmap_bh); brelse(req->pr_desc_bh); req->pr_entry_nr = 0; req->pr_bitmap_bh = NULL; req->pr_desc_bh = NULL; } /** * nilfs_palloc_freev - deallocate a set of persistent objects * @inode: inode of metadata file using this allocator * @entry_nrs: array of entry numbers to be deallocated * @nitems: number of entries stored in @entry_nrs * * Return: 0 on success, or a negative error code on failure. */ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) { struct buffer_head *desc_bh, *bitmap_bh; struct nilfs_palloc_group_desc *desc; unsigned char *bitmap; size_t doff, boff; unsigned long group, group_offset; __u64 group_min_nr, last_nrs[8]; const unsigned long epg = nilfs_palloc_entries_per_group(inode); const unsigned int epb = NILFS_MDT(inode)->mi_entries_per_block; unsigned int entry_start, end, pos; spinlock_t *lock; int i, j, k, ret; u32 nfree; for (i = 0; i < nitems; i = j) { int change_group = false; int nempties = 0, n = 0; group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset); ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh); if (ret < 0) return ret; ret = nilfs_palloc_get_bitmap_block(inode, group, 0, &bitmap_bh); if (ret < 0) { brelse(desc_bh); return ret; } /* Get the first entry number of the group */ group_min_nr = (__u64)group * epg; boff = nilfs_palloc_bitmap_offset(bitmap_bh); bitmap = kmap_local_folio(bitmap_bh->b_folio, boff); lock = nilfs_mdt_bgl_lock(inode, group); j = i; entry_start = rounddown(group_offset, epb); do { if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) { nilfs_warn(inode->i_sb, "%s (ino=%llu): entry number %llu already freed", __func__, inode->i_ino, (unsigned long long)entry_nrs[j]); } else { n++; } j++; if (j >= nitems || entry_nrs[j] < group_min_nr || entry_nrs[j] >= group_min_nr + epg) { change_group = true; } else { group_offset = entry_nrs[j] - group_min_nr; if (group_offset >= entry_start && group_offset < entry_start + epb) { /* This entry is in the same block */ continue; } } /* Test if the entry block is empty or not */ end = entry_start + epb; pos = nilfs_find_next_bit(bitmap, end, entry_start); if (pos >= end) { last_nrs[nempties++] = entry_nrs[j - 1]; if (nempties >= ARRAY_SIZE(last_nrs)) break; } if (change_group) break; /* Go on to the next entry block */ entry_start = rounddown(group_offset, epb); } while (true); kunmap_local(bitmap); mark_buffer_dirty(bitmap_bh); brelse(bitmap_bh); for (k = 0; k < nempties; k++) { ret = nilfs_palloc_delete_entry_block(inode, last_nrs[k]); if (ret && ret != -ENOENT) nilfs_warn(inode->i_sb, "error %d deleting block that object (entry=%llu, ino=%llu) belongs to", ret, (unsigned long long)last_nrs[k], inode->i_ino); } doff = 
nilfs_palloc_group_desc_offset(inode, group, desc_bh); desc = kmap_local_folio(desc_bh->b_folio, doff); nfree = nilfs_palloc_group_desc_add_entries(desc, lock, n); kunmap_local(desc); mark_buffer_dirty(desc_bh); nilfs_mdt_mark_dirty(inode); brelse(desc_bh); if (nfree == nilfs_palloc_entries_per_group(inode)) { ret = nilfs_palloc_delete_bitmap_block(inode, group); if (ret && ret != -ENOENT) nilfs_warn(inode->i_sb, "error %d deleting bitmap block of group=%lu, ino=%llu", ret, group, inode->i_ino); } } return 0; } void nilfs_palloc_setup_cache(struct inode *inode, struct nilfs_palloc_cache *cache) { NILFS_MDT(inode)->mi_palloc_cache = cache; spin_lock_init(&cache->lock); } void nilfs_palloc_clear_cache(struct inode *inode) { struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; spin_lock(&cache->lock); brelse(cache->prev_desc.bh); brelse(cache->prev_bitmap.bh); brelse(cache->prev_entry.bh); cache->prev_desc.bh = NULL; cache->prev_bitmap.bh = NULL; cache->prev_entry.bh = NULL; spin_unlock(&cache->lock); } void nilfs_palloc_destroy_cache(struct inode *inode) { nilfs_palloc_clear_cache(inode); NILFS_MDT(inode)->mi_palloc_cache = NULL; } |
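The request-based API above is driven in two phases by nilfs2's metadata files: a prepare step pins the group's descriptor and bitmap buffers, and the caller then either commits or aborts. The sketch below is a minimal, hypothetical illustration of that lifecycle for freeing one entry; the wrapper function, its name, and the cancellation condition are assumptions and do not appear in the allocator code above.

/*
 * Hypothetical caller of the two-phase free protocol above.  Only the
 * nilfs_palloc_*_free_entry() calls and the pr_entry_nr field come from the
 * allocator; the wrapper and the "cancel" condition are illustrative.
 */
static int example_palloc_free_one(struct inode *inode, __u64 entry_nr,
                                   bool cancel)
{
        struct nilfs_palloc_req req = { .pr_entry_nr = entry_nr };
        int ret;

        /* Phase 1: look up and pin the descriptor and bitmap blocks. */
        ret = nilfs_palloc_prepare_free_entry(inode, &req);
        if (ret < 0)
                return ret;

        if (cancel) {
                /* Drop the pinned buffers without touching the bitmap. */
                nilfs_palloc_abort_free_entry(inode, &req);
                return 0;
        }

        /* Phase 2: clear the bitmap bit and mark the buffers dirty. */
        nilfs_palloc_commit_free_entry(inode, &req);
        return 0;
}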
// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include <linux/rculist.h> #include <linux/list.h> #include
<linux/hash.h> #include <linux/types.h> #include <linux/spinlock.h> #include <linux/bpf.h> #include <linux/btf_ids.h> #include <linux/bpf_local_storage.h> #include <net/sock.h> #include <uapi/linux/sock_diag.h> #include <uapi/linux/btf.h> #include <linux/rcupdate.h> #include <linux/rcupdate_trace.h> #include <linux/rcupdate_wait.h> #define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE) static struct bpf_local_storage_map_bucket * select_bucket(struct bpf_local_storage_map *smap, struct bpf_local_storage *local_storage) { return &smap->buckets[hash_ptr(local_storage, smap->bucket_log)]; } static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size) { struct bpf_map *map = &smap->map; if (!map->ops->map_local_storage_charge) return 0; return map->ops->map_local_storage_charge(smap, owner, size); } static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner, u32 size) { struct bpf_map *map = &smap->map; if (map->ops->map_local_storage_uncharge) map->ops->map_local_storage_uncharge(smap, owner, size); } static struct bpf_local_storage __rcu ** owner_storage(struct bpf_local_storage_map *smap, void *owner) { struct bpf_map *map = &smap->map; return map->ops->map_owner_storage_ptr(owner); } static bool selem_linked_to_storage_lockless(const struct bpf_local_storage_elem *selem) { return !hlist_unhashed_lockless(&selem->snode); } static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem) { return !hlist_unhashed(&selem->snode); } static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) { return !hlist_unhashed(&selem->map_node); } struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, bool swap_uptrs) { struct bpf_local_storage_elem *selem; if (mem_charge(smap, owner, smap->elem_size)) return NULL; selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size, __GFP_ZERO, NUMA_NO_NODE); if (selem) { RCU_INIT_POINTER(SDATA(selem)->smap, smap); atomic_set(&selem->state, 0); if (value) { /* No need to call check_and_init_map_value as memory is zero init */ copy_map_value(&smap->map, SDATA(selem)->data, value); if (swap_uptrs) bpf_obj_swap_uptrs(smap->map.record, SDATA(selem)->data, value); } return selem; } mem_uncharge(smap, owner, smap->elem_size); return NULL; } static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) { struct bpf_local_storage *local_storage; /* * RCU Tasks Trace grace period implies RCU grace period, do * kfree() directly. */ local_storage = container_of(rcu, struct bpf_local_storage, rcu); kfree(local_storage); } static void bpf_local_storage_free(struct bpf_local_storage *local_storage, bool reuse_now) { if (!local_storage) return; if (reuse_now) { kfree_rcu(local_storage, rcu); return; } call_rcu_tasks_trace(&local_storage->rcu, bpf_local_storage_free_trace_rcu); } static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) { struct bpf_local_storage_elem *selem; struct bpf_local_storage_map *smap; selem = container_of(rcu, struct bpf_local_storage_elem, rcu); /* The bpf_local_storage_map_free will wait for rcu_barrier */ smap = rcu_dereference_check(SDATA(selem)->smap, 1); if (smap) bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); /* * RCU Tasks Trace grace period implies RCU grace period, do * kfree() directly. 
*/ kfree(selem); } void bpf_selem_free(struct bpf_local_storage_elem *selem, bool reuse_now) { struct bpf_local_storage_map *smap; smap = rcu_dereference_check(SDATA(selem)->smap, 1); if (reuse_now) { if (smap) bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); kfree_rcu(selem, rcu); return; } call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu); } static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) { struct bpf_local_storage_elem *selem; struct hlist_node *n; /* The "_safe" iteration is needed. * The loop is not removing the selem from the list * but bpf_selem_free will use the selem->rcu_head * which is union-ized with the selem->free_node. */ hlist_for_each_entry_safe(selem, n, list, free_node) bpf_selem_free(selem, reuse_now); } static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem *selem, struct bpf_local_storage_map *smap, struct bpf_local_storage *local_storage, bool free_local_storage, bool pin_owner) { void *owner = local_storage->owner; u32 uncharge = smap->elem_size; if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) == SDATA(selem)) RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); if (pin_owner && !refcount_inc_not_zero(&local_storage->owner_refcnt)) return; uncharge += free_local_storage ? sizeof(*local_storage) : 0; mem_uncharge(smap, local_storage->owner, uncharge); local_storage->mem_charge -= uncharge; if (free_local_storage) { local_storage->owner = NULL; /* After this RCU_INIT, owner may be freed and cannot be used */ RCU_INIT_POINTER(*owner_storage(smap, owner), NULL); } if (pin_owner) refcount_dec(&local_storage->owner_refcnt); } /* local_storage->lock must be held and selem->local_storage == local_storage. * The caller must ensure selem->smap is still valid to be * dereferenced for its smap->elem_size and smap->cache_idx. 
*/ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem, struct hlist_head *free_selem_list) { struct bpf_local_storage_map *smap; bool free_local_storage; smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); free_local_storage = hlist_is_singular_node(&selem->snode, &local_storage->list); bpf_selem_unlink_storage_nolock_misc(selem, smap, local_storage, free_local_storage, false); hlist_del_init_rcu(&selem->snode); hlist_add_head(&selem->free_node, free_selem_list); return free_local_storage; } void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map *smap; smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); local_storage->mem_charge += smap->elem_size; RCU_INIT_POINTER(selem->local_storage, local_storage); hlist_add_head_rcu(&selem->snode, &local_storage->list); } static int bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) { struct bpf_local_storage *local_storage; struct bpf_local_storage_map *smap; struct bpf_local_storage_map_bucket *b; unsigned long flags; int err; local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); b = select_bucket(smap, local_storage); err = raw_res_spin_lock_irqsave(&b->lock, flags); if (err) return err; hlist_del_init_rcu(&selem->map_node); raw_res_spin_unlock_irqrestore(&b->lock, flags); return 0; } static void bpf_selem_unlink_map_nolock(struct bpf_local_storage_elem *selem) { hlist_del_init_rcu(&selem->map_node); } int bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map_bucket *b; unsigned long flags; int err; b = select_bucket(smap, local_storage); err = raw_res_spin_lock_irqsave(&b->lock, flags); if (err) return err; hlist_add_head_rcu(&selem->map_node, &b->list); raw_res_spin_unlock_irqrestore(&b->lock, flags); return 0; } static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b, struct bpf_local_storage_elem *selem) { hlist_add_head_rcu(&selem->map_node, &b->list); } /* * Unlink an selem from map and local storage with lock held. * This is the common path used by local storages to delete an selem. */ int bpf_selem_unlink(struct bpf_local_storage_elem *selem) { struct bpf_local_storage *local_storage; bool free_local_storage = false; HLIST_HEAD(selem_free_list); unsigned long flags; int err; if (in_nmi()) return -EOPNOTSUPP; if (unlikely(!selem_linked_to_storage_lockless(selem))) /* selem has already been unlinked from sk */ return 0; local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); if (err) return err; if (likely(selem_linked_to_storage(selem))) { /* Always unlink from map before unlinking from local_storage * because selem will be freed after successfully unlinked from * the local_storage. 
*/ err = bpf_selem_unlink_map(selem); if (err) goto out; free_local_storage = bpf_selem_unlink_storage_nolock( local_storage, selem, &selem_free_list); } out: raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_selem_free_list(&selem_free_list, false); if (free_local_storage) bpf_local_storage_free(local_storage, false); return err; } /* * Unlink an selem from map and local storage with lockless fallback if callers * are racing or rqspinlock returns error. It should only be called by * bpf_local_storage_destroy() or bpf_local_storage_map_free(). */ static void bpf_selem_unlink_nofail(struct bpf_local_storage_elem *selem, struct bpf_local_storage_map_bucket *b) { bool in_map_free = !!b, free_storage = false; struct bpf_local_storage *local_storage; struct bpf_local_storage_map *smap; unsigned long flags; int err, unlink = 0; local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); if (smap) { b = b ? : select_bucket(smap, local_storage); err = raw_res_spin_lock_irqsave(&b->lock, flags); if (!err) { /* * Call bpf_obj_free_fields() under b->lock to make sure it is done * exactly once for an selem. Safe to free special fields immediately * as no BPF program should be referencing the selem. */ if (likely(selem_linked_to_map(selem))) { hlist_del_init_rcu(&selem->map_node); bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); unlink++; } raw_res_spin_unlock_irqrestore(&b->lock, flags); } /* * Highly unlikely scenario: resource leak * * When map_free(selem1), destroy(selem1) and destroy(selem2) are racing * and both selem belong to the same bucket, if destroy(selem2) acquired * b->lock and block for too long, neither map_free(selem1) and * destroy(selem1) will be able to free the special field associated * with selem1 as raw_res_spin_lock_irqsave() returns -ETIMEDOUT. */ WARN_ON_ONCE(err && in_map_free); if (!err || in_map_free) RCU_INIT_POINTER(SDATA(selem)->smap, NULL); } if (local_storage) { err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); if (!err) { if (likely(selem_linked_to_storage(selem))) { free_storage = hlist_is_singular_node(&selem->snode, &local_storage->list); /* * Okay to skip clearing owner_storage and storage->owner in * destroy() since the owner is going away. No user or bpf * programs should be able to reference it. */ if (smap && in_map_free) bpf_selem_unlink_storage_nolock_misc( selem, smap, local_storage, free_storage, true); hlist_del_init_rcu(&selem->snode); unlink++; } raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); } /* * Highly unlikely scenario: memory leak * * When destroy() fails to acqurire local_storage->lock and initializes * selem->local_storage to NULL before any racing map_free() sees the same * selem, no one will free the local storage. */ WARN_ON_ONCE(err && !in_map_free); if (!err || !in_map_free) RCU_INIT_POINTER(selem->local_storage, NULL); } if (unlink != 2) atomic_or(in_map_free ? SELEM_MAP_UNLINKED : SELEM_STORAGE_UNLINKED, &selem->state); /* * Normally, an selem can be unlinked under local_storage->lock and b->lock, and * then freed after an RCU grace period. However, if destroy() and map_free() are * racing or rqspinlock returns errors in unlikely situations (unlink != 2), free * the selem only after both map_free() and destroy() see the selem. 
*/ if (unlink == 2 || atomic_cmpxchg(&selem->state, SELEM_UNLINKED, SELEM_TOFREE) == SELEM_UNLINKED) bpf_selem_free(selem, true); if (free_storage) bpf_local_storage_free(local_storage, true); } void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem) { unsigned long flags; int err; /* spinlock is needed to avoid racing with the * parallel delete. Otherwise, publishing an already * deleted sdata to the cache will become a use-after-free * problem in the next bpf_local_storage_lookup(). */ err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); if (err) return; if (selem_linked_to_storage(selem)) rcu_assign_pointer(local_storage->cache[smap->cache_idx], SDATA(selem)); raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); } static int check_flags(const struct bpf_local_storage_data *old_sdata, u64 map_flags) { if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) /* elem already exists */ return -EEXIST; if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) /* elem doesn't exist, cannot update it */ return -ENOENT; return 0; } int bpf_local_storage_alloc(void *owner, struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *first_selem) { struct bpf_local_storage *prev_storage, *storage; struct bpf_local_storage **owner_storage_ptr; struct bpf_local_storage_map_bucket *b; unsigned long flags; int err; err = mem_charge(smap, owner, sizeof(*storage)); if (err) return err; storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage), __GFP_ZERO, NUMA_NO_NODE); if (!storage) { err = -ENOMEM; goto uncharge; } INIT_HLIST_HEAD(&storage->list); raw_res_spin_lock_init(&storage->lock); storage->owner = owner; storage->mem_charge = sizeof(*storage); refcount_set(&storage->owner_refcnt, 1); bpf_selem_link_storage_nolock(storage, first_selem); b = select_bucket(smap, storage); err = raw_res_spin_lock_irqsave(&b->lock, flags); if (err) goto uncharge; bpf_selem_link_map_nolock(b, first_selem); owner_storage_ptr = (struct bpf_local_storage **)owner_storage(smap, owner); /* Publish storage to the owner. * Instead of using any lock of the kernel object (i.e. owner), * cmpxchg will work with any kernel object regardless what * the running context is, bh, irq...etc. * * From now on, the owner->storage pointer (e.g. sk->sk_bpf_storage) * is protected by the storage->lock. Hence, when freeing * the owner->storage, the storage->lock must be held before * setting owner->storage ptr to NULL. */ prev_storage = cmpxchg(owner_storage_ptr, NULL, storage); if (unlikely(prev_storage)) { bpf_selem_unlink_map_nolock(first_selem); raw_res_spin_unlock_irqrestore(&b->lock, flags); err = -EAGAIN; goto uncharge; } raw_res_spin_unlock_irqrestore(&b->lock, flags); return 0; uncharge: bpf_local_storage_free(storage, true); mem_uncharge(smap, owner, sizeof(*storage)); return err; } /* sk cannot be going away because it is linking new elem * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0). * Otherwise, it will become a leak (and other memory issues * during map destruction). 
*/ struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, void *value, u64 map_flags, bool swap_uptrs) { struct bpf_local_storage_data *old_sdata = NULL; struct bpf_local_storage_elem *alloc_selem, *selem = NULL; struct bpf_local_storage *local_storage; struct bpf_local_storage_map_bucket *b; HLIST_HEAD(old_selem_free_list); unsigned long flags, b_flags; int err; /* BPF_EXIST and BPF_NOEXIST cannot be both set */ if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) || /* BPF_F_LOCK can only be used in a value with spin_lock */ unlikely((map_flags & BPF_F_LOCK) && !btf_record_has_field(smap->map.record, BPF_SPIN_LOCK))) return ERR_PTR(-EINVAL); local_storage = rcu_dereference_check(*owner_storage(smap, owner), bpf_rcu_lock_held()); if (!local_storage || hlist_empty(&local_storage->list)) { /* Very first elem for the owner */ err = check_flags(NULL, map_flags); if (err) return ERR_PTR(err); selem = bpf_selem_alloc(smap, owner, value, swap_uptrs); if (!selem) return ERR_PTR(-ENOMEM); err = bpf_local_storage_alloc(owner, smap, selem); if (err) { bpf_selem_free(selem, true); mem_uncharge(smap, owner, smap->elem_size); return ERR_PTR(err); } return SDATA(selem); } if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) { /* Hoping to find an old_sdata to do inline update * such that it can avoid taking the local_storage->lock * and changing the lists. */ old_sdata = bpf_local_storage_lookup(local_storage, smap, false); err = check_flags(old_sdata, map_flags); if (err) return ERR_PTR(err); if (old_sdata && selem_linked_to_storage_lockless(SELEM(old_sdata))) { copy_map_value_locked(&smap->map, old_sdata->data, value, false); return old_sdata; } } /* A lookup has just been done before and concluded a new selem is * needed. The chance of an unnecessary alloc is unlikely. */ alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs); if (!alloc_selem) return ERR_PTR(-ENOMEM); err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); if (err) goto free_selem; /* Recheck local_storage->list under local_storage->lock */ if (unlikely(hlist_empty(&local_storage->list))) { /* A parallel del is happening and local_storage is going * away. It has just been checked before, so very * unlikely. Return instead of retry to keep things * simple. */ err = -EAGAIN; goto unlock; } old_sdata = bpf_local_storage_lookup(local_storage, smap, false); err = check_flags(old_sdata, map_flags); if (err) goto unlock; if (old_sdata && (map_flags & BPF_F_LOCK)) { copy_map_value_locked(&smap->map, old_sdata->data, value, false); selem = SELEM(old_sdata); goto unlock; } b = select_bucket(smap, local_storage); err = raw_res_spin_lock_irqsave(&b->lock, b_flags); if (err) goto unlock; alloc_selem = NULL; /* First, link the new selem to the map */ bpf_selem_link_map_nolock(b, selem); /* Second, link (and publish) the new selem to local_storage */ bpf_selem_link_storage_nolock(local_storage, selem); /* Third, remove old selem, SELEM(old_sdata) */ if (old_sdata) { bpf_selem_unlink_map_nolock(SELEM(old_sdata)); bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), &old_selem_free_list); } raw_res_spin_unlock_irqrestore(&b->lock, b_flags); unlock: raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); free_selem: bpf_selem_free_list(&old_selem_free_list, false); if (alloc_selem) { mem_uncharge(smap, owner, smap->elem_size); bpf_selem_free(alloc_selem, true); } return err ? 
ERR_PTR(err) : SDATA(selem); } static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) { u64 min_usage = U64_MAX; u16 i, res = 0; spin_lock(&cache->idx_lock); for (i = 0; i < BPF_LOCAL_STORAGE_CACHE_SIZE; i++) { if (cache->idx_usage_counts[i] < min_usage) { min_usage = cache->idx_usage_counts[i]; res = i; /* Found a free cache_idx */ if (!min_usage) break; } } cache->idx_usage_counts[res]++; spin_unlock(&cache->idx_lock); return res; } static void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, u16 idx) { spin_lock(&cache->idx_lock); cache->idx_usage_counts[idx]--; spin_unlock(&cache->idx_lock); } int bpf_local_storage_map_alloc_check(union bpf_attr *attr) { if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK || !(attr->map_flags & BPF_F_NO_PREALLOC) || attr->max_entries || attr->key_size != sizeof(int) || !attr->value_size || /* Enforce BTF for userspace sk dumping */ !attr->btf_key_type_id || !attr->btf_value_type_id) return -EINVAL; if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE) return -E2BIG; return 0; } int bpf_local_storage_map_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { if (!btf_type_is_i32(key_type)) return -EINVAL; return 0; } /* * Destroy local storage when the owner is going away. Caller must uncharge memory * if memory charging is used. */ u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage) { struct bpf_local_storage_elem *selem; /* Neither the bpf_prog nor the bpf_map's syscall * could be modifying the local_storage->list now. * Thus, no elem can be added to or deleted from the * local_storage->list by the bpf_prog or by the bpf_map's syscall. * * It is racing with bpf_local_storage_map_free() alone * when unlinking elem from the local_storage->list and * the map's bucket->list. */ hlist_for_each_entry_rcu(selem, &local_storage->list, snode) bpf_selem_unlink_nofail(selem, NULL); if (!refcount_dec_and_test(&local_storage->owner_refcnt)) { while (refcount_read(&local_storage->owner_refcnt)) cpu_relax(); /* * Paired with refcount_dec() in bpf_selem_unlink_nofail() * to make sure destroy() sees the correct local_storage->mem_charge. */ smp_mb(); } return local_storage->mem_charge; } u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) { struct bpf_local_storage_map *smap = (struct bpf_local_storage_map *)map; u64 usage = sizeof(*smap); /* The dynamically callocated selems are not counted currently. 
*/ usage += sizeof(*smap->buckets) * (1ULL << smap->bucket_log); return usage; } struct bpf_map * bpf_local_storage_map_alloc(union bpf_attr *attr, struct bpf_local_storage_cache *cache) { struct bpf_local_storage_map *smap; unsigned int i; u32 nbuckets; int err; smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE); if (!smap) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&smap->map, attr); nbuckets = roundup_pow_of_two(num_possible_cpus()); /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ nbuckets = max_t(u32, 2, nbuckets); smap->bucket_log = ilog2(nbuckets); smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets, sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN); if (!smap->buckets) { err = -ENOMEM; goto free_smap; } for (i = 0; i < nbuckets; i++) { INIT_HLIST_HEAD(&smap->buckets[i].list); raw_res_spin_lock_init(&smap->buckets[i].lock); } smap->elem_size = offsetof(struct bpf_local_storage_elem, sdata.data[attr->value_size]); smap->cache_idx = bpf_local_storage_cache_idx_get(cache); return &smap->map; free_smap: kvfree(smap->buckets); bpf_map_area_free(smap); return ERR_PTR(err); } void bpf_local_storage_map_free(struct bpf_map *map, struct bpf_local_storage_cache *cache) { struct bpf_local_storage_map_bucket *b; struct bpf_local_storage_elem *selem; struct bpf_local_storage_map *smap; unsigned int i; smap = (struct bpf_local_storage_map *)map; bpf_local_storage_cache_idx_free(cache, smap->cache_idx); /* Note that this map might be concurrently cloned from * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone * RCU read section to finish before proceeding. New RCU * read sections should be prevented via bpf_map_inc_not_zero. */ synchronize_rcu(); /* bpf prog and the userspace can no longer access this map * now. No new selem (of this map) can be added * to the owner->storage or to the map bucket's list. * * The elem of this map can be cleaned up here * or when the storage is freed e.g. * by bpf_sk_storage_free() during __sk_destruct(). */ for (i = 0; i < (1U << smap->bucket_log); i++) { b = &smap->buckets[i]; rcu_read_lock(); /* No one is adding to b->list now */ restart: hlist_for_each_entry_rcu(selem, &b->list, map_node) { bpf_selem_unlink_nofail(selem, b); if (need_resched()) { cond_resched_rcu(); goto restart; } } rcu_read_unlock(); } /* While freeing the storage we may still need to access the map. * * e.g. when bpf_sk_storage_free() has unlinked selem from the map * which then made the above while((selem = ...)) loop * exit immediately. * * However, while freeing the storage one still needs to access the * smap->elem_size to do the uncharging in * bpf_selem_unlink_storage_nolock(). * * Hence, wait another rcu grace period for the storage to be freed. */ synchronize_rcu(); /* smap remains in use regardless of kmalloc_nolock, so wait unconditionally. */ rcu_barrier_tasks_trace(); rcu_barrier(); kvfree(smap->buckets); bpf_map_area_free(smap); } |
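The code above is the generic back end shared by the BPF local-storage map flavours (socket, task, inode and cgroup storage). As a rough, hedged illustration of how a flavour-specific update path hands work to it, the sketch below wraps bpf_local_storage_update(); the wrapper name, the owner argument and the surrounding RCU/locking context are assumptions, and real callers additionally resolve and reference the owner object first.

/*
 * Illustrative sketch only: a flavour-style update helper built on
 * bpf_local_storage_update().  Assumes the caller already holds
 * rcu_read_lock() and has validated @owner, as the real flavours do.
 */
static long example_local_storage_update(struct bpf_map *map, void *owner,
                                         void *value, u64 map_flags)
{
        struct bpf_local_storage_data *sdata;

        /*
         * BPF_NOEXIST fails if an element already exists, BPF_EXIST fails if
         * none does, and BPF_F_LOCK requests an in-place locked value copy
         * (see check_flags() above).
         */
        sdata = bpf_local_storage_update(owner,
                                         (struct bpf_local_storage_map *)map,
                                         value, map_flags, false);
        return PTR_ERR_OR_ZERO(sdata);
}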
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Definitions of the Internet Protocol.
 *
 * Version:     @(#)in.h        1.0.1   04/21/93
 *
 * Authors:     Original taken from the GNU Project <netinet/in.h> file.
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 */
#ifndef _LINUX_IN_H
#define _LINUX_IN_H

#include <linux/errno.h>
#include <uapi/linux/in.h>

static inline int proto_ports_offset(int proto)
{
        switch (proto) {
        case IPPROTO_TCP:
        case IPPROTO_UDP:
        case IPPROTO_DCCP:
        case IPPROTO_ESP:       /* SPI */
        case IPPROTO_SCTP:
        case IPPROTO_UDPLITE:
                return 0;
        case IPPROTO_AH:        /* SPI */
                return 4;
        default:
                return -EINVAL;
        }
}

static inline bool ipv4_is_loopback(__be32 addr)
{
        return (addr & htonl(0xff000000)) == htonl(0x7f000000);
}

static inline bool ipv4_is_multicast(__be32 addr)
{
        return (addr & htonl(0xf0000000)) == htonl(0xe0000000);
}

static inline bool ipv4_is_local_multicast(__be32 addr)
{
        return (addr & htonl(0xffffff00)) == htonl(0xe0000000);
}

static inline bool ipv4_is_lbcast(__be32 addr)
{
        /* limited broadcast */
        return addr == htonl(INADDR_BROADCAST);
}

static inline bool ipv4_is_all_snoopers(__be32 addr)
{
        return addr == htonl(INADDR_ALLSNOOPERS_GROUP);
}

static inline bool ipv4_is_zeronet(__be32 addr)
{
        return (addr == 0);
}

/* Special-Use IPv4 Addresses (RFC3330) */

static inline bool ipv4_is_private_10(__be32 addr)
{
        return (addr & htonl(0xff000000)) == htonl(0x0a000000);
}

static inline bool ipv4_is_private_172(__be32 addr)
{
        return (addr & htonl(0xfff00000)) == htonl(0xac100000);
}

static inline bool ipv4_is_private_192(__be32 addr)
{
        return (addr & htonl(0xffff0000)) == htonl(0xc0a80000);
}

static inline bool ipv4_is_linklocal_169(__be32 addr)
{
        return (addr & htonl(0xffff0000)) == htonl(0xa9fe0000);
}

static inline bool ipv4_is_anycast_6to4(__be32 addr)
{
        return (addr & htonl(0xffffff00)) == htonl(0xc0586300);
}

static inline bool ipv4_is_test_192(__be32 addr)
{
        return (addr & htonl(0xffffff00)) == htonl(0xc0000200);
}

static inline bool ipv4_is_test_198(__be32 addr)
{
        return (addr & htonl(0xfffe0000)) == htonl(0xc6120000);
}
#endif  /* _LINUX_IN_H */
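The header above provides small predicates over network-byte-order (__be32) addresses. The sketch below shows one hedged way a caller might combine them to screen out special-use addresses; the function name and the exact notion of "global unicast" are assumptions, while the helpers themselves are taken verbatim from the header.

/*
 * Illustrative only: screening special-use IPv4 addresses with the helpers
 * above.  The policy expressed here is an example, not kernel code.
 */
static bool example_addr_is_global_unicast(__be32 addr)
{
        /* Loopback, the all-zeroes address, multicast and limited broadcast
         * never name an ordinary unicast destination.
         */
        if (ipv4_is_loopback(addr) || ipv4_is_zeronet(addr) ||
            ipv4_is_multicast(addr) || ipv4_is_lbcast(addr))
                return false;

        /* RFC 1918 private ranges and 169.254.0.0/16 link-local addresses
         * are usable locally but not globally routable.
         */
        if (ipv4_is_private_10(addr) || ipv4_is_private_172(addr) ||
            ipv4_is_private_192(addr) || ipv4_is_linklocal_169(addr))
                return false;

        return true;
}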
// SPDX-License-Identifier: GPL-2.0 /* * drivers/usb/core/driver.c - most of the driver model stuff for usb * * (C) Copyright 2005 Greg Kroah-Hartman <gregkh@suse.de> * * based on drivers/usb/usb.c which had the following copyrights: * (C) Copyright Linus Torvalds 1999 * (C) Copyright Johannes Erdfelt 1999-2001 * (C) Copyright Andreas Gal 1999 * (C) Copyright Gregory P. Smith 1999 * (C) Copyright Deti Fliegl 1999 (new USB architecture) * (C) Copyright Randy Dunlap 2000 * (C) Copyright David Brownell 2000-2004 * (C) Copyright Yggdrasil Computing, Inc. 2000 * (usb_device_id matching changes by Adam J. Richter) * (C) Copyright Greg Kroah-Hartman 2002-2003 * * Released under the GPLv2 only. * * NOTE! This is not actually a driver at all, rather this is * just a collection of helper routines that implement the * matching, probing, releasing, suspending and resuming for * real drivers.
* */ #include <linux/device.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/usb.h> #include <linux/usb/quirks.h> #include <linux/usb/hcd.h> #include "usb.h" /* * Adds a new dynamic USBdevice ID to this driver, * and cause the driver to probe for all devices again. */ ssize_t usb_store_new_id(struct usb_dynids *dynids, const struct usb_device_id *id_table, struct device_driver *driver, const char *buf, size_t count) { struct usb_dynid *dynid; u32 idVendor = 0; u32 idProduct = 0; unsigned int bInterfaceClass = 0; u32 refVendor, refProduct; int fields = 0; int retval = 0; fields = sscanf(buf, "%x %x %x %x %x", &idVendor, &idProduct, &bInterfaceClass, &refVendor, &refProduct); if (fields < 2) return -EINVAL; dynid = kzalloc_obj(*dynid); if (!dynid) return -ENOMEM; INIT_LIST_HEAD(&dynid->node); dynid->id.idVendor = idVendor; dynid->id.idProduct = idProduct; dynid->id.match_flags = USB_DEVICE_ID_MATCH_DEVICE; if (fields > 2 && bInterfaceClass) { if (bInterfaceClass > 255) { retval = -EINVAL; goto fail; } dynid->id.bInterfaceClass = (u8)bInterfaceClass; dynid->id.match_flags |= USB_DEVICE_ID_MATCH_INT_CLASS; } if (fields > 4) { const struct usb_device_id *id = id_table; if (!id) { retval = -ENODEV; goto fail; } for (; id->match_flags; id++) if (id->idVendor == refVendor && id->idProduct == refProduct) break; if (id->match_flags) { dynid->id.driver_info = id->driver_info; } else { retval = -ENODEV; goto fail; } } mutex_lock(&usb_dynids_lock); list_add_tail(&dynid->node, &dynids->list); mutex_unlock(&usb_dynids_lock); retval = driver_attach(driver); if (retval) return retval; return count; fail: kfree(dynid); return retval; } EXPORT_SYMBOL_GPL(usb_store_new_id); ssize_t usb_show_dynids(struct usb_dynids *dynids, char *buf) { struct usb_dynid *dynid; size_t count = 0; guard(mutex)(&usb_dynids_lock); list_for_each_entry(dynid, &dynids->list, node) if (dynid->id.bInterfaceClass != 0) count += sysfs_emit_at(buf, count, "%04x %04x %02x\n", dynid->id.idVendor, dynid->id.idProduct, dynid->id.bInterfaceClass); else count += sysfs_emit_at(buf, count, "%04x %04x\n", dynid->id.idVendor, dynid->id.idProduct); return count; } EXPORT_SYMBOL_GPL(usb_show_dynids); static ssize_t new_id_show(struct device_driver *driver, char *buf) { struct usb_driver *usb_drv = to_usb_driver(driver); return usb_show_dynids(&usb_drv->dynids, buf); } static ssize_t new_id_store(struct device_driver *driver, const char *buf, size_t count) { struct usb_driver *usb_drv = to_usb_driver(driver); return usb_store_new_id(&usb_drv->dynids, usb_drv->id_table, driver, buf, count); } static DRIVER_ATTR_RW(new_id); /* * Remove a USB device ID from this driver */ static ssize_t remove_id_store(struct device_driver *driver, const char *buf, size_t count) { struct usb_dynid *dynid, *n; struct usb_driver *usb_driver = to_usb_driver(driver); u32 idVendor; u32 idProduct; int fields; fields = sscanf(buf, "%x %x", &idVendor, &idProduct); if (fields < 2) return -EINVAL; guard(mutex)(&usb_dynids_lock); list_for_each_entry_safe(dynid, n, &usb_driver->dynids.list, node) { struct usb_device_id *id = &dynid->id; if ((id->idVendor == idVendor) && (id->idProduct == idProduct)) { list_del(&dynid->node); kfree(dynid); break; } } return count; } static ssize_t remove_id_show(struct device_driver *driver, char *buf) { return new_id_show(driver, buf); } static DRIVER_ATTR_RW(remove_id); static int usb_create_newid_files(struct usb_driver *usb_drv) { int error = 0; if (usb_drv->no_dynamic_id) goto exit; if (usb_drv->probe != NULL) { error = 
driver_create_file(&usb_drv->driver, &driver_attr_new_id); if (error == 0) { error = driver_create_file(&usb_drv->driver, &driver_attr_remove_id); if (error) driver_remove_file(&usb_drv->driver, &driver_attr_new_id); } } exit: return error; } static void usb_remove_newid_files(struct usb_driver *usb_drv) { if (usb_drv->no_dynamic_id) return; if (usb_drv->probe != NULL) { driver_remove_file(&usb_drv->driver, &driver_attr_remove_id); driver_remove_file(&usb_drv->driver, &driver_attr_new_id); } } static void usb_free_dynids(struct usb_driver *usb_drv) { struct usb_dynid *dynid, *n; guard(mutex)(&usb_dynids_lock); list_for_each_entry_safe(dynid, n, &usb_drv->dynids.list, node) { list_del(&dynid->node); kfree(dynid); } } static const struct usb_device_id *usb_match_dynamic_id(struct usb_interface *intf, const struct usb_driver *drv) { struct usb_dynid *dynid; guard(mutex)(&usb_dynids_lock); list_for_each_entry(dynid, &drv->dynids.list, node) { if (usb_match_one_id(intf, &dynid->id)) { return &dynid->id; } } return NULL; } /* called from driver core with dev locked */ static int usb_probe_device(struct device *dev) { struct usb_device_driver *udriver = to_usb_device_driver(dev->driver); struct usb_device *udev = to_usb_device(dev); int error = 0; dev_dbg(dev, "%s\n", __func__); /* TODO: Add real matching code */ /* The device should always appear to be in use * unless the driver supports autosuspend. */ if (!udriver->supports_autosuspend) error = usb_autoresume_device(udev); if (error) return error; if (udriver->generic_subclass) error = usb_generic_driver_probe(udev); if (error) return error; /* Probe the USB device with the driver in hand, but only * defer to a generic driver in case the current USB * device driver has an id_table or a match function; i.e., * when the device driver was explicitly matched against * a device. * * If the device driver does not have either of these, * then we assume that it can bind to any device and is * not truly a more specialized/non-generic driver, so a * return value of -ENODEV should not force the device * to be handled by the generic USB driver, as there * can still be another, more specialized, device driver. * * This accommodates the usbip driver. * * TODO: What if, in the future, there are multiple * specialized USB device drivers for a particular device? * In such cases, there is a need to try all matching * specialised device drivers prior to setting the * use_generic_driver bit. 
*/ if (udriver->probe) error = udriver->probe(udev); else if (!udriver->generic_subclass) error = -EINVAL; if (error == -ENODEV && udriver != &usb_generic_driver && (udriver->id_table || udriver->match)) { udev->use_generic_driver = 1; return -EPROBE_DEFER; } return error; } /* called from driver core with dev locked */ static int usb_unbind_device(struct device *dev) { struct usb_device *udev = to_usb_device(dev); struct usb_device_driver *udriver = to_usb_device_driver(dev->driver); if (udriver->disconnect) udriver->disconnect(udev); if (udriver->generic_subclass) usb_generic_driver_disconnect(udev); if (!udriver->supports_autosuspend) usb_autosuspend_device(udev); return 0; } /* called from driver core with dev locked */ static int usb_probe_interface(struct device *dev) { struct usb_driver *driver = to_usb_driver(dev->driver); struct usb_interface *intf = to_usb_interface(dev); struct usb_device *udev = interface_to_usbdev(intf); const struct usb_device_id *id; int error = -ENODEV; int lpm_disable_error = -ENODEV; dev_dbg(dev, "%s\n", __func__); intf->needs_binding = 0; if (usb_device_is_owned(udev)) return error; if (udev->authorized == 0) { dev_info(&intf->dev, "Device is not authorized for usage\n"); return error; } else if (intf->authorized == 0) { dev_info(&intf->dev, "Interface %d is not authorized for usage\n", intf->altsetting->desc.bInterfaceNumber); return error; } id = usb_match_dynamic_id(intf, driver); if (!id) id = usb_match_id(intf, driver->id_table); if (!id) return error; dev_dbg(dev, "%s - got id\n", __func__); error = usb_autoresume_device(udev); if (error) return error; intf->condition = USB_INTERFACE_BINDING; /* Probed interfaces are initially active. They are * runtime-PM-enabled only if the driver has autosuspend support. * They are sensitive to their children's power states. */ pm_runtime_set_active(dev); pm_suspend_ignore_children(dev, false); if (driver->supports_autosuspend) pm_runtime_enable(dev); /* If the new driver doesn't allow hub-initiated LPM, and we can't * disable hub-initiated LPM, then fail the probe. * * Otherwise, leaving LPM enabled should be harmless, because the * endpoint intervals should remain the same, and the U1/U2 timeouts * should remain the same. * * If we need to install alt setting 0 before probe, or another alt * setting during probe, that should also be fine. usb_set_interface() * will attempt to disable LPM, and fail if it can't disable it. */ if (driver->disable_hub_initiated_lpm) { lpm_disable_error = usb_unlocked_disable_lpm(udev); if (lpm_disable_error) { dev_err(&intf->dev, "%s Failed to disable LPM for driver %s\n", __func__, driver->name); error = lpm_disable_error; goto err; } } /* Carry out a deferred switch to altsetting 0 */ if (intf->needs_altsetting0) { error = usb_set_interface(udev, intf->altsetting[0]. desc.bInterfaceNumber, 0); if (error < 0) goto err; intf->needs_altsetting0 = 0; } error = driver->probe(intf, id); if (error) goto err; intf->condition = USB_INTERFACE_BOUND; /* If the LPM disable succeeded, balance the ref counts. */ if (!lpm_disable_error) usb_unlocked_enable_lpm(udev); usb_autosuspend_device(udev); return error; err: usb_set_intfdata(intf, NULL); intf->needs_remote_wakeup = 0; intf->condition = USB_INTERFACE_UNBOUND; /* If the LPM disable succeeded, balance the ref counts. 
*/ if (!lpm_disable_error) usb_unlocked_enable_lpm(udev); /* Unbound interfaces are always runtime-PM-disabled and -suspended */ if (driver->supports_autosuspend) pm_runtime_disable(dev); pm_runtime_set_suspended(dev); usb_autosuspend_device(udev); return error; } /* called from driver core with dev locked */ static int usb_unbind_interface(struct device *dev) { struct usb_driver *driver = to_usb_driver(dev->driver); struct usb_interface *intf = to_usb_interface(dev); struct usb_host_endpoint *ep, **eps = NULL; struct usb_device *udev; int i, j, error, r; int lpm_disable_error = -ENODEV; intf->condition = USB_INTERFACE_UNBINDING; /* Autoresume for set_interface call below */ udev = interface_to_usbdev(intf); error = usb_autoresume_device(udev); /* If hub-initiated LPM policy may change, attempt to disable LPM until * the driver is unbound. If LPM isn't disabled, that's fine because it * wouldn't be enabled unless all the bound interfaces supported * hub-initiated LPM. */ if (driver->disable_hub_initiated_lpm) lpm_disable_error = usb_unlocked_disable_lpm(udev); /* * Terminate all URBs for this interface unless the driver * supports "soft" unbinding and the device is still present. */ if (!driver->soft_unbind || udev->state == USB_STATE_NOTATTACHED) usb_disable_interface(udev, intf, false); driver->disconnect(intf); /* Free streams */ for (i = 0, j = 0; i < intf->cur_altsetting->desc.bNumEndpoints; i++) { ep = &intf->cur_altsetting->endpoint[i]; if (ep->streams == 0) continue; if (j == 0) { eps = kmalloc_array(USB_MAXENDPOINTS, sizeof(void *), GFP_KERNEL); if (!eps) break; } eps[j++] = ep; } if (j) { usb_free_streams(intf, eps, j, GFP_KERNEL); kfree(eps); } /* Reset other interface state. * We cannot do a Set-Interface if the device is suspended or * if it is prepared for a system sleep (since installing a new * altsetting means creating new endpoint device entries). * When either of these happens, defer the Set-Interface. */ if (intf->cur_altsetting->desc.bAlternateSetting == 0) { /* Already in altsetting 0 so skip Set-Interface. * Just re-enable it without affecting the endpoint toggles. */ usb_enable_interface(udev, intf, false); } else if (!error && !intf->dev.power.is_prepared) { r = usb_set_interface(udev, intf->altsetting[0]. desc.bInterfaceNumber, 0); if (r < 0) intf->needs_altsetting0 = 1; } else { intf->needs_altsetting0 = 1; } usb_set_intfdata(intf, NULL); intf->condition = USB_INTERFACE_UNBOUND; intf->needs_remote_wakeup = 0; /* Attempt to re-enable USB3 LPM, if the disable succeeded. */ if (!lpm_disable_error) usb_unlocked_enable_lpm(udev); /* Unbound interfaces are always runtime-PM-disabled and -suspended */ if (driver->supports_autosuspend) pm_runtime_disable(dev); pm_runtime_set_suspended(dev); if (!error) usb_autosuspend_device(udev); return 0; } static void usb_shutdown_interface(struct device *dev) { struct usb_interface *intf = to_usb_interface(dev); struct usb_driver *driver; if (!dev->driver) return; driver = to_usb_driver(dev->driver); if (driver->shutdown) driver->shutdown(intf); } /** * usb_driver_claim_interface - bind a driver to an interface * @driver: the driver to be bound * @iface: the interface to which it will be bound; must be in the * usb device's active configuration * @data: driver data associated with that interface * * This is used by usb device drivers that need to claim more than one * interface on a device when probing (audio and acm are current examples). 
* No device driver should directly modify internal usb_interface or * usb_device structure members. * * Callers must own the device lock, so driver probe() entries don't need * extra locking, but other call contexts may need to explicitly claim that * lock. * * Return: 0 on success. */ int usb_driver_claim_interface(struct usb_driver *driver, struct usb_interface *iface, void *data) { struct device *dev; int retval = 0; if (!iface) return -ENODEV; dev = &iface->dev; if (dev->driver) return -EBUSY; /* reject claim if interface is not authorized */ if (!iface->authorized) return -ENODEV; dev->driver = &driver->driver; usb_set_intfdata(iface, data); iface->needs_binding = 0; iface->condition = USB_INTERFACE_BOUND; /* Claimed interfaces are initially inactive (suspended) and * runtime-PM-enabled, but only if the driver has autosuspend * support. Otherwise they are marked active, to prevent the * device from being autosuspended, but left disabled. In either * case they are sensitive to their children's power states. */ pm_suspend_ignore_children(dev, false); if (driver->supports_autosuspend) pm_runtime_enable(dev); else pm_runtime_set_active(dev); /* if interface was already added, bind now; else let * the future device_add() bind it, bypassing probe() */ if (device_is_registered(dev)) retval = device_bind_driver(dev); if (retval) { dev->driver = NULL; usb_set_intfdata(iface, NULL); iface->needs_remote_wakeup = 0; iface->condition = USB_INTERFACE_UNBOUND; /* * Unbound interfaces are always runtime-PM-disabled * and runtime-PM-suspended */ if (driver->supports_autosuspend) pm_runtime_disable(dev); pm_runtime_set_suspended(dev); } return retval; } EXPORT_SYMBOL_GPL(usb_driver_claim_interface); /** * usb_driver_release_interface - unbind a driver from an interface * @driver: the driver to be unbound * @iface: the interface from which it will be unbound * * This can be used by drivers to release an interface without waiting * for their disconnect() methods to be called. In typical cases this * also causes the driver disconnect() method to be called. * * This call is synchronous, and may not be used in an interrupt context. * Callers must own the device lock, so driver disconnect() entries don't * need extra locking, but other call contexts may need to explicitly claim * that lock. */ void usb_driver_release_interface(struct usb_driver *driver, struct usb_interface *iface) { struct device *dev = &iface->dev; /* this should never happen, don't release something that's not ours */ if (!dev->driver || dev->driver != &driver->driver) return; /* don't release from within disconnect() */ if (iface->condition != USB_INTERFACE_BOUND) return; iface->condition = USB_INTERFACE_UNBINDING; /* Release via the driver core only if the interface * has already been registered */ if (device_is_registered(dev)) { device_release_driver(dev); } else { device_lock(dev); usb_unbind_interface(dev); dev->driver = NULL; device_unlock(dev); } } EXPORT_SYMBOL_GPL(usb_driver_release_interface); /* returns 0 if no match, 1 if match */ int usb_match_device(struct usb_device *dev, const struct usb_device_id *id) { if ((id->match_flags & USB_DEVICE_ID_MATCH_VENDOR) && id->idVendor != le16_to_cpu(dev->descriptor.idVendor)) return 0; if ((id->match_flags & USB_DEVICE_ID_MATCH_PRODUCT) && id->idProduct != le16_to_cpu(dev->descriptor.idProduct)) return 0; /* No need to test id->bcdDevice_lo != 0, since 0 is never greater than any unsigned number. 
*/ if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_LO) && (id->bcdDevice_lo > le16_to_cpu(dev->descriptor.bcdDevice))) return 0; if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_HI) && (id->bcdDevice_hi < le16_to_cpu(dev->descriptor.bcdDevice))) return 0; if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_CLASS) && (id->bDeviceClass != dev->descriptor.bDeviceClass)) return 0; if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_SUBCLASS) && (id->bDeviceSubClass != dev->descriptor.bDeviceSubClass)) return 0; if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_PROTOCOL) && (id->bDeviceProtocol != dev->descriptor.bDeviceProtocol)) return 0; return 1; } /* returns 0 if no match, 1 if match */ int usb_match_one_id_intf(struct usb_device *dev, struct usb_host_interface *intf, const struct usb_device_id *id) { /* The interface class, subclass, protocol and number should never be * checked for a match if the device class is Vendor Specific, * unless the match record specifies the Vendor ID. */ if (dev->descriptor.bDeviceClass == USB_CLASS_VENDOR_SPEC && !(id->match_flags & USB_DEVICE_ID_MATCH_VENDOR) && (id->match_flags & (USB_DEVICE_ID_MATCH_INT_CLASS | USB_DEVICE_ID_MATCH_INT_SUBCLASS | USB_DEVICE_ID_MATCH_INT_PROTOCOL | USB_DEVICE_ID_MATCH_INT_NUMBER))) return 0; if ((id->match_flags & USB_DEVICE_ID_MATCH_INT_CLASS) && (id->bInterfaceClass != intf->desc.bInterfaceClass)) return 0; if ((id->match_flags & USB_DEVICE_ID_MATCH_INT_SUBCLASS) && (id->bInterfaceSubClass != intf->desc.bInterfaceSubClass)) return 0; if ((id->match_flags & USB_DEVICE_ID_MATCH_INT_PROTOCOL) && (id->bInterfaceProtocol != intf->desc.bInterfaceProtocol)) return 0; if ((id->match_flags & USB_DEVICE_ID_MATCH_INT_NUMBER) && (id->bInterfaceNumber != intf->desc.bInterfaceNumber)) return 0; return 1; } /* returns 0 if no match, 1 if match */ int usb_match_one_id(struct usb_interface *interface, const struct usb_device_id *id) { struct usb_host_interface *intf; struct usb_device *dev; /* proc_connectinfo in devio.c may call us with id == NULL. */ if (id == NULL) return 0; intf = interface->cur_altsetting; dev = interface_to_usbdev(interface); if (!usb_match_device(dev, id)) return 0; return usb_match_one_id_intf(dev, intf, id); } EXPORT_SYMBOL_GPL(usb_match_one_id); /** * usb_match_id - find first usb_device_id matching device or interface * @interface: the interface of interest * @id: array of usb_device_id structures, terminated by zero entry * * usb_match_id searches an array of usb_device_id's and returns * the first one matching the device or interface, or null. * This is used when binding (or rebinding) a driver to an interface. * Most USB device drivers will use this indirectly, through the usb core, * but some layered driver frameworks use it directly. * These device tables are exported with MODULE_DEVICE_TABLE, through * modutils, to support the driver loading functionality of USB hotplugging. * * Return: The first matching usb_device_id, or %NULL. * * What Matches: * * The "match_flags" element in a usb_device_id controls which * members are used. If the corresponding bit is set, the * value in the device_id must match its corresponding member * in the device or interface descriptor, or else the device_id * does not match. * * "driver_info" is normally used only by device drivers, * but you can create a wildcard "matches anything" usb_device_id * as a driver's "modules.usbmap" entry if you provide an id with * only a nonzero "driver_info" field. 
If you do this, the USB device * driver's probe() routine should use additional intelligence to * decide whether to bind to the specified interface. * * What Makes Good usb_device_id Tables: * * The match algorithm is very simple, so that intelligence in * driver selection must come from smart driver id records. * Unless you have good reasons to use another selection policy, * provide match elements only in related groups, and order match * specifiers from specific to general. Use the macros provided * for that purpose if you can. * * The most specific match specifiers use device descriptor * data. These are commonly used with product-specific matches; * the USB_DEVICE macro lets you provide vendor and product IDs, * and you can also match against ranges of product revisions. * These are widely used for devices with application or vendor * specific bDeviceClass values. * * Matches based on device class/subclass/protocol specifications * are slightly more general; use the USB_DEVICE_INFO macro, or * its siblings. These are used with single-function devices * where bDeviceClass doesn't specify that each interface has * its own class. * * Matches based on interface class/subclass/protocol are the * most general; they let drivers bind to any interface on a * multiple-function device. Use the USB_INTERFACE_INFO * macro, or its siblings, to match class-per-interface style * devices (as recorded in bInterfaceClass). * * Note that an entry created by USB_INTERFACE_INFO won't match * any interface if the device class is set to Vendor-Specific. * This is deliberate; according to the USB spec the meanings of * the interface class/subclass/protocol for these devices are also * vendor-specific, and hence matching against a standard product * class wouldn't work anyway. If you really want to use an * interface-based match for such a device, create a match record * that also specifies the vendor ID. (Unfortunately there isn't a * standard macro for creating records like this.) * * Within those groups, remember that not all combinations are * meaningful. For example, don't give a product version range * without vendor and product IDs; or specify a protocol without * its associated class and subclass. */ const struct usb_device_id *usb_match_id(struct usb_interface *interface, const struct usb_device_id *id) { /* proc_connectinfo in devio.c may call us with id == NULL. */ if (id == NULL) return NULL; /* It is important to check that id->driver_info is nonzero, since an entry that is all zeroes except for a nonzero id->driver_info is the way to create an entry that indicates that the driver wants to examine every device and interface.
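   For illustration only (a sketch, not an entry from any real driver),
   such a catch-all table would look like:

	static const struct usb_device_id catch_all_table[] = {
		{ .driver_info = 1 },
		{ }
	};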
*/ for (; id->idVendor || id->idProduct || id->bDeviceClass || id->bInterfaceClass || id->driver_info; id++) { if (usb_match_one_id(interface, id)) return id; } return NULL; } EXPORT_SYMBOL_GPL(usb_match_id); const struct usb_device_id *usb_device_match_id(struct usb_device *udev, const struct usb_device_id *id) { if (!id) return NULL; for (; id->idVendor || id->idProduct ; id++) { if (usb_match_device(udev, id)) return id; } return NULL; } EXPORT_SYMBOL_GPL(usb_device_match_id); bool usb_driver_applicable(struct usb_device *udev, const struct usb_device_driver *udrv) { if (udrv->id_table && udrv->match) return usb_device_match_id(udev, udrv->id_table) != NULL && udrv->match(udev); if (udrv->id_table) return usb_device_match_id(udev, udrv->id_table) != NULL; if (udrv->match) return udrv->match(udev); return false; } static int usb_device_match(struct device *dev, const struct device_driver *drv) { /* devices and interfaces are handled separately */ if (is_usb_device(dev)) { struct usb_device *udev; const struct usb_device_driver *udrv; /* interface drivers never match devices */ if (!is_usb_device_driver(drv)) return 0; udev = to_usb_device(dev); udrv = to_usb_device_driver(drv); /* If the device driver under consideration does not have a * id_table or a match function, then let the driver's probe * function decide. */ if (!udrv->id_table && !udrv->match) return 1; return usb_driver_applicable(udev, udrv); } else if (is_usb_interface(dev)) { struct usb_interface *intf; const struct usb_driver *usb_drv; const struct usb_device_id *id; /* device drivers never match interfaces */ if (is_usb_device_driver(drv)) return 0; intf = to_usb_interface(dev); usb_drv = to_usb_driver(drv); id = usb_match_id(intf, usb_drv->id_table); if (id) return 1; id = usb_match_dynamic_id(intf, usb_drv); if (id) return 1; } return 0; } static int usb_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct usb_device *usb_dev; if (is_usb_device(dev)) { usb_dev = to_usb_device(dev); } else if (is_usb_interface(dev)) { const struct usb_interface *intf = to_usb_interface(dev); usb_dev = interface_to_usbdev(intf); } else { return 0; } if (usb_dev->devnum < 0) { /* driver is often null here; dev_dbg() would oops */ pr_debug("usb %s: already deleted?\n", dev_name(dev)); return -ENODEV; } if (!usb_dev->bus) { pr_debug("usb %s: bus removed?\n", dev_name(dev)); return -ENODEV; } /* per-device configurations are common */ if (add_uevent_var(env, "PRODUCT=%x/%x/%x", le16_to_cpu(usb_dev->descriptor.idVendor), le16_to_cpu(usb_dev->descriptor.idProduct), le16_to_cpu(usb_dev->descriptor.bcdDevice))) return -ENOMEM; /* class-based driver binding models */ if (add_uevent_var(env, "TYPE=%d/%d/%d", usb_dev->descriptor.bDeviceClass, usb_dev->descriptor.bDeviceSubClass, usb_dev->descriptor.bDeviceProtocol)) return -ENOMEM; return 0; } static int __usb_bus_reprobe_drivers(struct device *dev, void *data) { struct usb_device_driver *new_udriver = data; struct usb_device *udev; int ret; /* Don't reprobe if current driver isn't usb_generic_driver */ if (dev->driver != &usb_generic_driver.driver) return 0; udev = to_usb_device(dev); if (!usb_driver_applicable(udev, new_udriver)) return 0; ret = device_reprobe(dev); if (ret && ret != -EPROBE_DEFER) dev_err(dev, "Failed to reprobe device (error %d)\n", ret); return 0; } bool is_usb_device_driver(const struct device_driver *drv) { return drv->probe == usb_probe_device; } /** * usb_register_device_driver - register a USB device (not interface) driver * @new_udriver: USB 
operations for the device driver * @owner: module owner of this driver. * * Registers a USB device driver with the USB core. The list of * unattached devices will be rescanned whenever a new driver is * added, allowing the new driver to attach to any recognized devices. * * Return: A negative error code on failure and 0 on success. */ int usb_register_device_driver(struct usb_device_driver *new_udriver, struct module *owner) { int retval = 0; if (usb_disabled()) return -ENODEV; new_udriver->driver.name = new_udriver->name; new_udriver->driver.bus = &usb_bus_type; new_udriver->driver.probe = usb_probe_device; new_udriver->driver.remove = usb_unbind_device; new_udriver->driver.owner = owner; new_udriver->driver.dev_groups = new_udriver->dev_groups; retval = driver_register(&new_udriver->driver); if (!retval) { pr_info("%s: registered new device driver %s\n", usbcore_name, new_udriver->name); /* * Check whether any device could be better served with * this new driver */ bus_for_each_dev(&usb_bus_type, NULL, new_udriver, __usb_bus_reprobe_drivers); } else { pr_err("%s: error %d registering device driver %s\n", usbcore_name, retval, new_udriver->name); } return retval; } EXPORT_SYMBOL_GPL(usb_register_device_driver); /** * usb_deregister_device_driver - unregister a USB device (not interface) driver * @udriver: USB operations of the device driver to unregister * Context: must be able to sleep * * Unlinks the specified driver from the internal USB driver list. */ void usb_deregister_device_driver(struct usb_device_driver *udriver) { pr_info("%s: deregistering device driver %s\n", usbcore_name, udriver->name); driver_unregister(&udriver->driver); } EXPORT_SYMBOL_GPL(usb_deregister_device_driver); /** * usb_register_driver - register a USB interface driver * @new_driver: USB operations for the interface driver * @owner: module owner of this driver. * @mod_name: module name string * * Registers a USB interface driver with the USB core. The list of * unattached interfaces will be rescanned whenever a new driver is * added, allowing the new driver to attach to any recognized interfaces. * * Return: A negative error code on failure and 0 on success. * * NOTE: if you want your driver to use the USB major number, you must call * usb_register_dev() to enable that functionality. This function no longer * takes care of that. 
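 *
 * Most drivers do not call this directly but go through the
 * usb_register()/module_usb_driver() helpers. A minimal sketch, with
 * skel_driver, skel_probe, skel_disconnect and skel_table as placeholder
 * names that are not defined in this file:
 *
 *	static struct usb_driver skel_driver = {
 *		.name       = "skeleton",
 *		.probe      = skel_probe,
 *		.disconnect = skel_disconnect,
 *		.id_table   = skel_table,
 *	};
 *	module_usb_driver(skel_driver);
 *
 * where module_usb_driver() expands to module_init()/module_exit()
 * wrappers around usb_register() and usb_deregister().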
*/ int usb_register_driver(struct usb_driver *new_driver, struct module *owner, const char *mod_name) { int retval = 0; if (usb_disabled()) return -ENODEV; new_driver->driver.name = new_driver->name; new_driver->driver.bus = &usb_bus_type; new_driver->driver.probe = usb_probe_interface; new_driver->driver.remove = usb_unbind_interface; new_driver->driver.shutdown = usb_shutdown_interface; new_driver->driver.owner = owner; new_driver->driver.mod_name = mod_name; new_driver->driver.dev_groups = new_driver->dev_groups; INIT_LIST_HEAD(&new_driver->dynids.list); retval = driver_register(&new_driver->driver); if (retval) goto out; retval = usb_create_newid_files(new_driver); if (retval) goto out_newid; pr_info("%s: registered new interface driver %s\n", usbcore_name, new_driver->name); return 0; out_newid: driver_unregister(&new_driver->driver); out: pr_err("%s: error %d registering interface driver %s\n", usbcore_name, retval, new_driver->name); return retval; } EXPORT_SYMBOL_GPL(usb_register_driver); /** * usb_deregister - unregister a USB interface driver * @driver: USB operations of the interface driver to unregister * Context: must be able to sleep * * Unlinks the specified driver from the internal USB driver list. * * NOTE: If you called usb_register_dev(), you still need to call * usb_deregister_dev() to clean up your driver's allocated minor numbers, * this * call will no longer do it for you. */ void usb_deregister(struct usb_driver *driver) { pr_info("%s: deregistering interface driver %s\n", usbcore_name, driver->name); usb_remove_newid_files(driver); driver_unregister(&driver->driver); usb_free_dynids(driver); } EXPORT_SYMBOL_GPL(usb_deregister); /* Forced unbinding of a USB interface driver, either because * it doesn't support pre_reset/post_reset/reset_resume or * because it doesn't support suspend/resume. * * The caller must hold @intf's device's lock, but not @intf's lock. */ void usb_forced_unbind_intf(struct usb_interface *intf) { struct usb_driver *driver = to_usb_driver(intf->dev.driver); dev_dbg(&intf->dev, "forced unbind\n"); usb_driver_release_interface(driver, intf); /* Mark the interface for later rebinding */ intf->needs_binding = 1; } /* * Unbind drivers for @udev's marked interfaces. These interfaces have * the needs_binding flag set, for example by usb_resume_interface(). * * The caller must hold @udev's device lock. */ static void unbind_marked_interfaces(struct usb_device *udev) { struct usb_host_config *config; int i; struct usb_interface *intf; config = udev->actconfig; if (config) { for (i = 0; i < config->desc.bNumInterfaces; ++i) { intf = config->interface[i]; if (intf->dev.driver && intf->needs_binding) usb_forced_unbind_intf(intf); } } } /* Delayed forced unbinding of a USB interface driver and scan * for rebinding. * * The caller must hold @intf's device's lock, but not @intf's lock. * * Note: Rebinds will be skipped if a system sleep transition is in * progress and the PM "complete" callback hasn't occurred yet. */ static void usb_rebind_intf(struct usb_interface *intf) { int rc; /* Delayed unbind of an existing driver */ if (intf->dev.driver) usb_forced_unbind_intf(intf); /* Try to rebind the interface */ if (!intf->dev.power.is_prepared) { intf->needs_binding = 0; rc = device_attach(&intf->dev); if (rc < 0 && rc != -EPROBE_DEFER) dev_warn(&intf->dev, "rebind failed: %d\n", rc); } } /* * Rebind drivers to @udev's marked interfaces. These interfaces have * the needs_binding flag set. * * The caller must hold @udev's device lock. 
*/ static void rebind_marked_interfaces(struct usb_device *udev) { struct usb_host_config *config; int i; struct usb_interface *intf; config = udev->actconfig; if (config) { for (i = 0; i < config->desc.bNumInterfaces; ++i) { intf = config->interface[i]; if (intf->needs_binding) usb_rebind_intf(intf); } } } /* * Unbind all of @udev's marked interfaces and then rebind all of them. * This ordering is necessary because some drivers claim several interfaces * when they are first probed. * * The caller must hold @udev's device lock. */ void usb_unbind_and_rebind_marked_interfaces(struct usb_device *udev) { unbind_marked_interfaces(udev); rebind_marked_interfaces(udev); } #ifdef CONFIG_PM /* Unbind drivers for @udev's interfaces that don't support suspend/resume * There is no check for reset_resume here because it can be determined * only during resume whether reset_resume is needed. * * The caller must hold @udev's device lock. */ static void unbind_no_pm_drivers_interfaces(struct usb_device *udev) { struct usb_host_config *config; int i; struct usb_interface *intf; struct usb_driver *drv; config = udev->actconfig; if (config) { for (i = 0; i < config->desc.bNumInterfaces; ++i) { intf = config->interface[i]; if (intf->dev.driver) { drv = to_usb_driver(intf->dev.driver); if (!drv->suspend || !drv->resume) usb_forced_unbind_intf(intf); } } } } static int usb_suspend_device(struct usb_device *udev, pm_message_t msg) { struct usb_device_driver *udriver; int status = 0; if (udev->state == USB_STATE_NOTATTACHED || udev->state == USB_STATE_SUSPENDED) goto done; /* For devices that don't have a driver, we do a generic suspend. */ if (udev->dev.driver) udriver = to_usb_device_driver(udev->dev.driver); else { udev->do_remote_wakeup = 0; udriver = &usb_generic_driver; } if (udriver->suspend) status = udriver->suspend(udev, msg); if (status == 0 && udriver->generic_subclass) status = usb_generic_driver_suspend(udev, msg); done: dev_vdbg(&udev->dev, "%s: status %d\n", __func__, status); return status; } static int usb_resume_device(struct usb_device *udev, pm_message_t msg) { struct usb_device_driver *udriver; int status = 0; if (udev->state == USB_STATE_NOTATTACHED) goto done; /* Can't resume it if it doesn't have a driver. */ if (udev->dev.driver == NULL) { status = -ENOTCONN; goto done; } /* Non-root devices on a full/low-speed bus must wait for their * companion high-speed root hub, in case a handoff is needed. 
*/ if (!PMSG_IS_AUTO(msg) && udev->parent && udev->bus->hs_companion) device_pm_wait_for_dev(&udev->dev, &udev->bus->hs_companion->root_hub->dev); if (udev->quirks & USB_QUIRK_RESET_RESUME) udev->reset_resume = 1; udriver = to_usb_device_driver(udev->dev.driver); if (udriver->generic_subclass) status = usb_generic_driver_resume(udev, msg); if (status == 0 && udriver->resume) status = udriver->resume(udev, msg); done: dev_vdbg(&udev->dev, "%s: status %d\n", __func__, status); return status; } static int usb_suspend_interface(struct usb_device *udev, struct usb_interface *intf, pm_message_t msg) { struct usb_driver *driver; int status = 0; if (udev->state == USB_STATE_NOTATTACHED || intf->condition == USB_INTERFACE_UNBOUND) goto done; driver = to_usb_driver(intf->dev.driver); /* at this time we know the driver supports suspend */ status = driver->suspend(intf, msg); if (status && !PMSG_IS_AUTO(msg)) dev_err(&intf->dev, "suspend error %d\n", status); done: dev_vdbg(&intf->dev, "%s: status %d\n", __func__, status); return status; } static int usb_resume_interface(struct usb_device *udev, struct usb_interface *intf, pm_message_t msg, int reset_resume) { struct usb_driver *driver; int status = 0; if (udev->state == USB_STATE_NOTATTACHED) goto done; /* Don't let autoresume interfere with unbinding */ if (intf->condition == USB_INTERFACE_UNBINDING) goto done; /* Can't resume it if it doesn't have a driver. */ if (intf->condition == USB_INTERFACE_UNBOUND) { /* Carry out a deferred switch to altsetting 0 */ if (intf->needs_altsetting0 && !intf->dev.power.is_prepared) { usb_set_interface(udev, intf->altsetting[0]. desc.bInterfaceNumber, 0); intf->needs_altsetting0 = 0; } goto done; } /* Don't resume if the interface is marked for rebinding */ if (intf->needs_binding) goto done; driver = to_usb_driver(intf->dev.driver); if (reset_resume) { if (driver->reset_resume) { status = driver->reset_resume(intf); if (status) dev_err(&intf->dev, "%s error %d\n", "reset_resume", status); } else { intf->needs_binding = 1; dev_dbg(&intf->dev, "no reset_resume for driver %s?\n", driver->name); } } else { status = driver->resume(intf); if (status) dev_err(&intf->dev, "resume error %d\n", status); } done: dev_vdbg(&intf->dev, "%s: status %d\n", __func__, status); /* Later we will unbind the driver and/or reprobe, if necessary */ return status; } /** * usb_suspend_both - suspend a USB device and its interfaces * @udev: the usb_device to suspend * @msg: Power Management message describing this state transition * * This is the central routine for suspending USB devices. It calls the * suspend methods for all the interface drivers in @udev and then calls * the suspend method for @udev itself. When the routine is called in * autosuspend, if an error occurs at any stage, all the interfaces * which were suspended are resumed so that they remain in the same * state as the device, but when called from system sleep, all error * from suspend methods of interfaces and the non-root-hub device itself * are simply ignored, so all suspended interfaces are only resumed * to the device's state when @udev is root-hub and its suspend method * returns failure. * * Autosuspend requests originating from a child device or an interface * driver may be made without the protection of @udev's device lock, but * all other suspend calls will hold the lock. Usbcore will insure that * method calls do not arrive during bind, unbind, or reset operations. * However drivers must be prepared to handle suspend calls arriving at * unpredictable times. 
* * This routine can run only in process context. * * Return: 0 if the suspend succeeded. */ static int usb_suspend_both(struct usb_device *udev, pm_message_t msg) { int status = 0; int i = 0, n = 0; struct usb_interface *intf; bool offload_active = false; if (udev->state == USB_STATE_NOTATTACHED || udev->state == USB_STATE_SUSPENDED) goto done; usb_offload_set_pm_locked(udev, true); if (msg.event == PM_EVENT_SUSPEND && usb_offload_check(udev)) { dev_dbg(&udev->dev, "device offloaded, skip suspend.\n"); offload_active = true; } /* Suspend all the interfaces and then udev itself */ if (udev->actconfig) { n = udev->actconfig->desc.bNumInterfaces; for (i = n - 1; i >= 0; --i) { intf = udev->actconfig->interface[i]; /* * Don't suspend interfaces with remote wakeup while * the controller is active. This preserves pending * interrupt urbs, allowing interrupt events to be * handled during system suspend. */ if (offload_active && intf->needs_remote_wakeup) { dev_dbg(&intf->dev, "device offloaded, skip suspend.\n"); continue; } status = usb_suspend_interface(udev, intf, msg); /* Ignore errors during system sleep transitions */ if (!PMSG_IS_AUTO(msg)) status = 0; if (status != 0) break; } } if (status == 0) { if (!offload_active) status = usb_suspend_device(udev, msg); /* * Ignore errors from non-root-hub devices during * system sleep transitions. For the most part, * these devices should go to low power anyway when * the entire bus is suspended. */ if (udev->parent && !PMSG_IS_AUTO(msg)) status = 0; /* * If the device is inaccessible, don't try to resume * suspended interfaces and just return the error. */ if (status && status != -EBUSY) { int err; u16 devstat; err = usb_get_std_status(udev, USB_RECIP_DEVICE, 0, &devstat); if (err) { dev_err(&udev->dev, "Failed to suspend device, error %d\n", status); goto done; } } } /* If the suspend failed, resume interfaces that did get suspended */ if (status != 0) { if (udev->actconfig) { msg.event ^= (PM_EVENT_SUSPEND | PM_EVENT_RESUME); while (++i < n) { intf = udev->actconfig->interface[i]; usb_resume_interface(udev, intf, msg, 0); } } /* If the suspend succeeded then prevent any more URB submissions * and flush any outstanding URBs. */ } else { udev->can_submit = 0; if (!offload_active) { for (i = 0; i < 16; ++i) { usb_hcd_flush_endpoint(udev, udev->ep_out[i]); usb_hcd_flush_endpoint(udev, udev->ep_in[i]); } } } done: if (status != 0) usb_offload_set_pm_locked(udev, false); dev_vdbg(&udev->dev, "%s: status %d\n", __func__, status); return status; } /** * usb_resume_both - resume a USB device and its interfaces * @udev: the usb_device to resume * @msg: Power Management message describing this state transition * * This is the central routine for resuming USB devices. It calls the * resume method for @udev and then calls the resume methods for all * the interface drivers in @udev. * * Autoresume requests originating from a child device or an interface * driver may be made without the protection of @udev's device lock, but * all other resume calls will hold the lock. Usbcore will insure that * method calls do not arrive during bind, unbind, or reset operations. * However drivers must be prepared to handle resume calls arriving at * unpredictable times. * * This routine can run only in process context. * * Return: 0 on success. 
*/ static int usb_resume_both(struct usb_device *udev, pm_message_t msg) { int status = 0; int i; struct usb_interface *intf; bool offload_active = false; if (udev->state == USB_STATE_NOTATTACHED) { status = -ENODEV; goto done; } udev->can_submit = 1; if (msg.event == PM_EVENT_RESUME) offload_active = usb_offload_check(udev); /* Resume the device */ if (udev->state == USB_STATE_SUSPENDED || udev->reset_resume) { if (!offload_active) status = usb_resume_device(udev, msg); else dev_dbg(&udev->dev, "device offloaded, skip resume.\n"); } /* Resume the interfaces */ if (status == 0 && udev->actconfig) { for (i = 0; i < udev->actconfig->desc.bNumInterfaces; i++) { intf = udev->actconfig->interface[i]; /* * Interfaces with remote wakeup aren't suspended * while the controller is active. This preserves * pending interrupt urbs, allowing interrupt events * to be handled during system suspend. */ if (offload_active && intf->needs_remote_wakeup) { dev_dbg(&intf->dev, "device offloaded, skip resume.\n"); continue; } usb_resume_interface(udev, intf, msg, udev->reset_resume); } } usb_mark_last_busy(udev); done: dev_vdbg(&udev->dev, "%s: status %d\n", __func__, status); usb_offload_set_pm_locked(udev, false); if (!status) udev->reset_resume = 0; return status; } static void choose_wakeup(struct usb_device *udev, pm_message_t msg) { int w; /* * For FREEZE/QUIESCE, disable remote wakeups so no interrupts get * generated. */ if (msg.event == PM_EVENT_FREEZE || msg.event == PM_EVENT_QUIESCE) { w = 0; } else { /* * Enable remote wakeup if it is allowed, even if no interface * drivers actually want it. */ w = device_may_wakeup(&udev->dev); } /* * If the device is autosuspended with the wrong wakeup setting, * autoresume now so the setting can be changed. */ if (udev->state == USB_STATE_SUSPENDED && w != udev->do_remote_wakeup) pm_runtime_resume(&udev->dev); udev->do_remote_wakeup = w; } /* The device lock is held by the PM core */ int usb_suspend(struct device *dev, pm_message_t msg) { struct usb_device *udev = to_usb_device(dev); int r; unbind_no_pm_drivers_interfaces(udev); /* From now on we are sure all drivers support suspend/resume * but not necessarily reset_resume() * so we may still need to unbind and rebind upon resume */ choose_wakeup(udev, msg); r = usb_suspend_both(udev, msg); if (r) return r; if (udev->quirks & USB_QUIRK_DISCONNECT_SUSPEND) usb_port_disable(udev); return 0; } /* The device lock is held by the PM core */ int usb_resume_complete(struct device *dev) { struct usb_device *udev = to_usb_device(dev); /* For PM complete calls, all we do is rebind interfaces * whose needs_binding flag is set */ if (udev->state != USB_STATE_NOTATTACHED) rebind_marked_interfaces(udev); return 0; } /* The device lock is held by the PM core */ int usb_resume(struct device *dev, pm_message_t msg) { struct usb_device *udev = to_usb_device(dev); int status; /* For all calls, take the device back to full power and * tell the PM core in case it was autosuspended previously. * Unbind the interfaces that will need rebinding later, * because they fail to support reset_resume. * (This can't be done in usb_resume_interface() * above because it doesn't own the right set of locks.) */ status = usb_resume_both(udev, msg); if (status == 0) { pm_runtime_disable(dev); pm_runtime_set_active(dev); pm_runtime_enable(dev); unbind_marked_interfaces(udev); } /* Avoid PM error messages for devices disconnected while suspended * as we'll display regular disconnect messages just a bit later. 
*/ if (status == -ENODEV || status == -ESHUTDOWN) status = 0; return status; } /** * usb_enable_autosuspend - allow a USB device to be autosuspended * @udev: the USB device which may be autosuspended * * This routine allows @udev to be autosuspended. An autosuspend won't * take place until the autosuspend_delay has elapsed and all the other * necessary conditions are satisfied. * * The caller must hold @udev's device lock. */ void usb_enable_autosuspend(struct usb_device *udev) { pm_runtime_allow(&udev->dev); } EXPORT_SYMBOL_GPL(usb_enable_autosuspend); /** * usb_disable_autosuspend - prevent a USB device from being autosuspended * @udev: the USB device which may not be autosuspended * * This routine prevents @udev from being autosuspended and wakes it up * if it is already autosuspended. * * The caller must hold @udev's device lock. */ void usb_disable_autosuspend(struct usb_device *udev) { pm_runtime_forbid(&udev->dev); } EXPORT_SYMBOL_GPL(usb_disable_autosuspend); /** * usb_autosuspend_device - delayed autosuspend of a USB device and its interfaces * @udev: the usb_device to autosuspend * * This routine should be called when a core subsystem is finished using * @udev and wants to allow it to autosuspend. Examples would be when * @udev's device file in usbfs is closed or after a configuration change. * * @udev's usage counter is decremented; if it drops to 0 and all the * interfaces are inactive then a delayed autosuspend will be attempted. * The attempt may fail (see autosuspend_check()). * * The caller must hold @udev's device lock. * * This routine can run only in process context. */ void usb_autosuspend_device(struct usb_device *udev) { int status; usb_mark_last_busy(udev); status = pm_runtime_put_sync_autosuspend(&udev->dev); dev_vdbg(&udev->dev, "%s: cnt %d -> %d\n", __func__, atomic_read(&udev->dev.power.usage_count), status); } /** * usb_autoresume_device - immediately autoresume a USB device and its interfaces * @udev: the usb_device to autoresume * * This routine should be called when a core subsystem wants to use @udev * and needs to guarantee that it is not suspended. No autosuspend will * occur until usb_autosuspend_device() is called. (Note that this will * not prevent suspend events originating in the PM core.) Examples would * be when @udev's device file in usbfs is opened or when a remote-wakeup * request is received. * * @udev's usage counter is incremented to prevent subsequent autosuspends. * However if the autoresume fails then the usage counter is re-decremented. * * The caller must hold @udev's device lock. * * This routine can run only in process context. * * Return: 0 on success. A negative error code otherwise. */ int usb_autoresume_device(struct usb_device *udev) { int status; status = pm_runtime_resume_and_get(&udev->dev); dev_vdbg(&udev->dev, "%s: cnt %d -> %d\n", __func__, atomic_read(&udev->dev.power.usage_count), status); return status; } /** * usb_autopm_put_interface - decrement a USB interface's PM-usage counter * @intf: the usb_interface whose counter should be decremented * * This routine should be called by an interface driver when it is * finished using @intf and wants to allow it to autosuspend. A typical * example would be a character-device driver when its device file is * closed. * * The routine decrements @intf's usage counter. When the counter reaches * 0, a delayed autosuspend request for @intf's device is attempted. The * attempt may fail (see autosuspend_check()). * * This routine can run only in process context. 
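 *
 * A sketched open()/release() pairing with usb_autopm_get_interface();
 * skel_open(), skel_release() and dev are hypothetical names used only
 * for illustration:
 *
 *	static int skel_open(struct inode *inode, struct file *file)
 *	{
 *		...
 *		return usb_autopm_get_interface(dev->interface);
 *	}
 *
 *	static int skel_release(struct inode *inode, struct file *file)
 *	{
 *		...
 *		usb_autopm_put_interface(dev->interface);
 *		return 0;
 *	}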
*/ void usb_autopm_put_interface(struct usb_interface *intf) { struct usb_device *udev = interface_to_usbdev(intf); int status; usb_mark_last_busy(udev); status = pm_runtime_put_sync(&intf->dev); dev_vdbg(&intf->dev, "%s: cnt %d -> %d\n", __func__, atomic_read(&intf->dev.power.usage_count), status); } EXPORT_SYMBOL_GPL(usb_autopm_put_interface); /** * usb_autopm_put_interface_async - decrement a USB interface's PM-usage counter * @intf: the usb_interface whose counter should be decremented * * This routine does much the same thing as usb_autopm_put_interface(): * It decrements @intf's usage counter and schedules a delayed * autosuspend request if the counter is <= 0. The difference is that it * does not perform any synchronization; callers should hold a private * lock and handle all synchronization issues themselves. * * Typically a driver would call this routine during an URB's completion * handler, if no more URBs were pending. * * This routine can run in atomic context. */ void usb_autopm_put_interface_async(struct usb_interface *intf) { struct usb_device *udev = interface_to_usbdev(intf); usb_mark_last_busy(udev); pm_runtime_put(&intf->dev); dev_vdbg(&intf->dev, "%s: cnt %d\n", __func__, atomic_read(&intf->dev.power.usage_count)); } EXPORT_SYMBOL_GPL(usb_autopm_put_interface_async); /** * usb_autopm_put_interface_no_suspend - decrement a USB interface's PM-usage counter * @intf: the usb_interface whose counter should be decremented * * This routine decrements @intf's usage counter but does not carry out an * autosuspend. * * This routine can run in atomic context. */ void usb_autopm_put_interface_no_suspend(struct usb_interface *intf) { struct usb_device *udev = interface_to_usbdev(intf); usb_mark_last_busy(udev); pm_runtime_put_noidle(&intf->dev); } EXPORT_SYMBOL_GPL(usb_autopm_put_interface_no_suspend); /** * usb_autopm_get_interface - increment a USB interface's PM-usage counter * @intf: the usb_interface whose counter should be incremented * * This routine should be called by an interface driver when it wants to * use @intf and needs to guarantee that it is not suspended. In addition, * the routine prevents @intf from being autosuspended subsequently. (Note * that this will not prevent suspend events originating in the PM core.) * This prevention will persist until usb_autopm_put_interface() is called * or @intf is unbound. A typical example would be a character-device * driver when its device file is opened. * * @intf's usage counter is incremented to prevent subsequent autosuspends. * However if the autoresume fails then the counter is re-decremented. * * This routine can run only in process context. * * Return: 0 on success. */ int usb_autopm_get_interface(struct usb_interface *intf) { int status; status = pm_runtime_resume_and_get(&intf->dev); dev_vdbg(&intf->dev, "%s: cnt %d -> %d\n", __func__, atomic_read(&intf->dev.power.usage_count), status); return status; } EXPORT_SYMBOL_GPL(usb_autopm_get_interface); /** * usb_autopm_get_interface_async - increment a USB interface's PM-usage counter * @intf: the usb_interface whose counter should be incremented * * This routine does much the same thing as * usb_autopm_get_interface(): It increments @intf's usage counter and * queues an autoresume request if the device is suspended. The * differences are that it does not perform any synchronization (callers * should hold a private lock and handle all synchronization issues * themselves), and it does not autoresume the device directly (it only * queues a request). 
After a successful call, the device may not yet be * resumed. * * This routine can run in atomic context. * * Return: 0 on success. A negative error code otherwise. */ int usb_autopm_get_interface_async(struct usb_interface *intf) { int status; status = pm_runtime_get(&intf->dev); if (status < 0 && status != -EINPROGRESS) pm_runtime_put_noidle(&intf->dev); dev_vdbg(&intf->dev, "%s: cnt %d -> %d\n", __func__, atomic_read(&intf->dev.power.usage_count), status); if (status > 0 || status == -EINPROGRESS) status = 0; return status; } EXPORT_SYMBOL_GPL(usb_autopm_get_interface_async); /** * usb_autopm_get_interface_no_resume - increment a USB interface's PM-usage counter * @intf: the usb_interface whose counter should be incremented * * This routine increments @intf's usage counter but does not carry out an * autoresume. * * This routine can run in atomic context. */ void usb_autopm_get_interface_no_resume(struct usb_interface *intf) { struct usb_device *udev = interface_to_usbdev(intf); usb_mark_last_busy(udev); pm_runtime_get_noresume(&intf->dev); } EXPORT_SYMBOL_GPL(usb_autopm_get_interface_no_resume); /* Internal routine to check whether we may autosuspend a device. */ static int autosuspend_check(struct usb_device *udev) { int w, i; struct usb_interface *intf; if (udev->state == USB_STATE_NOTATTACHED) return -ENODEV; /* Fail if autosuspend is disabled, or any interfaces are in use, or * any interface drivers require remote wakeup but it isn't available. */ w = 0; if (udev->actconfig) { for (i = 0; i < udev->actconfig->desc.bNumInterfaces; i++) { intf = udev->actconfig->interface[i]; /* We don't need to check interfaces that are * disabled for runtime PM. Either they are unbound * or else their drivers don't support autosuspend * and so they are permanently active. */ if (intf->dev.power.disable_depth) continue; if (atomic_read(&intf->dev.power.usage_count) > 0) return -EBUSY; w |= intf->needs_remote_wakeup; /* Don't allow autosuspend if the device will need * a reset-resume and any of its interface drivers * doesn't include support or needs remote wakeup. */ if (udev->quirks & USB_QUIRK_RESET_RESUME) { struct usb_driver *driver; driver = to_usb_driver(intf->dev.driver); if (!driver->reset_resume || intf->needs_remote_wakeup) return -EOPNOTSUPP; } } } if (w && !device_can_wakeup(&udev->dev)) { dev_dbg(&udev->dev, "remote wakeup needed for autosuspend\n"); return -EOPNOTSUPP; } /* * If the device is a direct child of the root hub and the HCD * doesn't handle wakeup requests, don't allow autosuspend when * wakeup is needed. */ if (w && udev->parent == udev->bus->root_hub && bus_to_hcd(udev->bus)->cant_recv_wakeups) { dev_dbg(&udev->dev, "HCD doesn't handle wakeup requests\n"); return -EOPNOTSUPP; } udev->do_remote_wakeup = w; return 0; } int usb_runtime_suspend(struct device *dev) { struct usb_device *udev = to_usb_device(dev); int status; /* A USB device can be suspended if it passes the various autosuspend * checks. Runtime suspend for a USB device means suspending all the * interfaces and then the device itself. */ if (autosuspend_check(udev) != 0) return -EAGAIN; status = usb_suspend_both(udev, PMSG_AUTO_SUSPEND); /* Allow a retry if autosuspend failed temporarily */ if (status == -EAGAIN || status == -EBUSY) usb_mark_last_busy(udev); /* * The PM core reacts badly unless the return code is 0, * -EAGAIN, or -EBUSY, so always return -EBUSY on an error * (except for root hubs, because they don't suspend through * an upstream port like other USB devices). 
*/ if (status != 0 && udev->parent) return -EBUSY; return status; } int usb_runtime_resume(struct device *dev) { struct usb_device *udev = to_usb_device(dev); int status; /* Runtime resume for a USB device means resuming both the device * and all its interfaces. */ status = usb_resume_both(udev, PMSG_AUTO_RESUME); return status; } int usb_runtime_idle(struct device *dev) { struct usb_device *udev = to_usb_device(dev); /* An idle USB device can be suspended if it passes the various * autosuspend checks. */ if (autosuspend_check(udev) == 0) pm_runtime_autosuspend(dev); /* Tell the core not to suspend it, though. */ return -EBUSY; } static int usb_set_usb2_hardware_lpm(struct usb_device *udev, int enable) { struct usb_hcd *hcd = bus_to_hcd(udev->bus); int ret = -EPERM; if (hcd->driver->set_usb2_hw_lpm) { ret = hcd->driver->set_usb2_hw_lpm(hcd, udev, enable); if (!ret) udev->usb2_hw_lpm_enabled = enable; } return ret; } int usb_enable_usb2_hardware_lpm(struct usb_device *udev) { if (!udev->usb2_hw_lpm_capable || !udev->usb2_hw_lpm_allowed || udev->usb2_hw_lpm_enabled) return 0; return usb_set_usb2_hardware_lpm(udev, 1); } int usb_disable_usb2_hardware_lpm(struct usb_device *udev) { if (!udev->usb2_hw_lpm_enabled) return 0; return usb_set_usb2_hardware_lpm(udev, 0); } #endif /* CONFIG_PM */ const struct bus_type usb_bus_type = { .name = "usb", .match = usb_device_match, .uevent = usb_uevent, .need_parent_lock = true, };
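/*
 * Illustration (not part of the file above): usb_device_match() and
 * usb_driver_applicable() let a USB *device* driver (as opposed to an
 * interface driver) select devices by id_table, by a match() callback, or
 * both; when both are given, both must accept the device. A sketch with
 * made-up names (my_match, my_ids, my_probe, my_disconnect):
 *
 *	static int my_match(struct usb_device *udev)
 *	{
 *		return udev->descriptor.bDeviceClass == USB_CLASS_HUB;
 *	}
 *
 *	static struct usb_device_driver my_device_driver = {
 *		.name       = "my-devdrv",
 *		.match      = my_match,
 *		.id_table   = my_ids,
 *		.probe      = my_probe,
 *		.disconnect = my_disconnect,
 *		.supports_autosuspend = 1,
 *	};
 *
 * registered with usb_register_device_driver(&my_device_driver, THIS_MODULE).
 */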
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ioctl.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/syscalls.h> #include <linux/mm.h> #include <linux/capability.h> #include <linux/compat.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/security.h> #include <linux/export.h> #include <linux/uaccess.h> #include <linux/writeback.h> #include <linux/buffer_head.h> #include <linux/falloc.h> #include <linux/sched/signal.h> #include <linux/fiemap.h> #include <linux/mount.h> #include <linux/fscrypt.h> #include <linux/fileattr.h> #include "internal.h" #include <asm/ioctls.h> /* So that the fiemap access checks can't overflow on 32 bit machines.
*/ #define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) /** * vfs_ioctl - call filesystem specific ioctl methods * @filp: open file to invoke ioctl method on * @cmd: ioctl command to execute * @arg: command-specific argument for ioctl * * Invokes filesystem specific ->unlocked_ioctl, if one exists; otherwise * returns -ENOTTY. * * Returns 0 on success, -errno on error. */ static int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int error = -ENOTTY; if (!filp->f_op->unlocked_ioctl) goto out; error = filp->f_op->unlocked_ioctl(filp, cmd, arg); if (error == -ENOIOCTLCMD) error = -ENOTTY; out: return error; } static int ioctl_fibmap(struct file *filp, int __user *p) { struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; int error, ur_block; sector_t block; if (!capable(CAP_SYS_RAWIO)) return -EPERM; error = get_user(ur_block, p); if (error) return error; if (ur_block < 0) return -EINVAL; block = ur_block; error = bmap(inode, &block); if (block > INT_MAX) { error = -ERANGE; pr_warn_ratelimited("[%s/%d] FS: %s File: %pD4 would truncate fibmap result\n", current->comm, task_pid_nr(current), sb->s_id, filp); } if (error) ur_block = 0; else ur_block = block; if (put_user(ur_block, p)) error = -EFAULT; return error; } /** * fiemap_fill_next_extent - Fiemap helper function * @fieinfo: Fiemap context passed into ->fiemap * @logical: Extent logical start offset, in bytes * @phys: Extent physical start offset, in bytes * @len: Extent length, in bytes * @flags: FIEMAP_EXTENT flags that describe this extent * * Called from file system ->fiemap callback. Will populate extent * info as passed in via arguments and copy to user memory. On * success, extent count on fieinfo is incremented. * * Returns 0 on success, -errno on error, 1 if this was the last * extent that will fit in user array. */ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, u64 phys, u64 len, u32 flags) { struct fiemap_extent extent; struct fiemap_extent __user *dest = fieinfo->fi_extents_start; /* only count the extents */ if (fieinfo->fi_extents_max == 0) { fieinfo->fi_extents_mapped++; return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; } if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max) return 1; #define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC) #define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED) #define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE) if (flags & SET_UNKNOWN_FLAGS) flags |= FIEMAP_EXTENT_UNKNOWN; if (flags & SET_NO_UNMOUNTED_IO_FLAGS) flags |= FIEMAP_EXTENT_ENCODED; if (flags & SET_NOT_ALIGNED_FLAGS) flags |= FIEMAP_EXTENT_NOT_ALIGNED; memset(&extent, 0, sizeof(extent)); extent.fe_logical = logical; extent.fe_physical = phys; extent.fe_length = len; extent.fe_flags = flags; dest += fieinfo->fi_extents_mapped; if (copy_to_user(dest, &extent, sizeof(extent))) return -EFAULT; fieinfo->fi_extents_mapped++; if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max) return 1; return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; } EXPORT_SYMBOL(fiemap_fill_next_extent); /** * fiemap_prep - check validity of requested flags for fiemap * @inode: Inode to operate on * @fieinfo: Fiemap context passed into ->fiemap * @start: Start of the mapped range * @len: Length of the mapped range, can be truncated by this function. 
* @supported_flags: Set of fiemap flags that the file system understands * * This function must be called from each ->fiemap instance to validate the * fiemap request against the file system parameters. * * Returns 0 on success, or a negative error on failure. */ int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 *len, u32 supported_flags) { u64 maxbytes = inode->i_sb->s_maxbytes; u32 incompat_flags; int ret = 0; if (*len == 0) return -EINVAL; if (start >= maxbytes) return -EFBIG; /* * Shrink request scope to what the fs can actually handle. */ if (*len > maxbytes || (maxbytes - *len) < start) *len = maxbytes - start; supported_flags |= FIEMAP_FLAG_SYNC; supported_flags &= FIEMAP_FLAGS_COMPAT; incompat_flags = fieinfo->fi_flags & ~supported_flags; if (incompat_flags) { fieinfo->fi_flags = incompat_flags; return -EBADR; } if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) ret = filemap_write_and_wait(inode->i_mapping); return ret; } EXPORT_SYMBOL(fiemap_prep); static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) { struct fiemap fiemap; struct fiemap_extent_info fieinfo = { 0, }; struct inode *inode = file_inode(filp); int error; if (!inode->i_op->fiemap) return -EOPNOTSUPP; if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap))) return -EFAULT; if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) return -EINVAL; fieinfo.fi_flags = fiemap.fm_flags; fieinfo.fi_extents_max = fiemap.fm_extent_count; fieinfo.fi_extents_start = ufiemap->fm_extents; error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, fiemap.fm_length); fiemap.fm_flags = fieinfo.fi_flags; fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) error = -EFAULT; return error; } static int ioctl_file_clone(struct file *dst_file, unsigned long srcfd, u64 off, u64 olen, u64 destoff) { CLASS(fd, src_file)(srcfd); loff_t cloned; int ret; if (fd_empty(src_file)) return -EBADF; cloned = vfs_clone_file_range(fd_file(src_file), off, dst_file, destoff, olen, 0); if (cloned < 0) ret = cloned; else if (olen && cloned != olen) ret = -EINVAL; else ret = 0; return ret; } static int ioctl_file_clone_range(struct file *file, struct file_clone_range __user *argp) { struct file_clone_range args; if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; return ioctl_file_clone(file, args.src_fd, args.src_offset, args.src_length, args.dest_offset); } /* * This provides compatibility with legacy XFS pre-allocation ioctls * which predate the fallocate syscall. * * Only the l_start, l_len and l_whence fields of the 'struct space_resv' * are used here, rest are ignored. 
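 *
 * As a hedged illustration of the mapping (the length value is made up):
 * an FS_IOC_RESVSP request with l_whence == SEEK_SET, l_start == 0 and
 * l_len == 1048576 ends up as roughly
 *
 *	vfs_fallocate(filp, FALLOC_FL_KEEP_SIZE, 0, 1048576);
 *
 * i.e. space is preallocated without changing the file size.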
*/ static int ioctl_preallocate(struct file *filp, int mode, void __user *argp) { struct inode *inode = file_inode(filp); struct space_resv sr; if (copy_from_user(&sr, argp, sizeof(sr))) return -EFAULT; switch (sr.l_whence) { case SEEK_SET: break; case SEEK_CUR: sr.l_start += filp->f_pos; break; case SEEK_END: sr.l_start += i_size_read(inode); break; default: return -EINVAL; } return vfs_fallocate(filp, mode | FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len); } /* on ia32 l_start is on a 32-bit boundary */ #if defined CONFIG_COMPAT && defined(CONFIG_X86_64) /* just account for different alignment */ static int compat_ioctl_preallocate(struct file *file, int mode, struct space_resv_32 __user *argp) { struct inode *inode = file_inode(file); struct space_resv_32 sr; if (copy_from_user(&sr, argp, sizeof(sr))) return -EFAULT; switch (sr.l_whence) { case SEEK_SET: break; case SEEK_CUR: sr.l_start += file->f_pos; break; case SEEK_END: sr.l_start += i_size_read(inode); break; default: return -EINVAL; } return vfs_fallocate(file, mode | FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len); } #endif static int file_ioctl(struct file *filp, unsigned int cmd, int __user *p) { switch (cmd) { case FIBMAP: return ioctl_fibmap(filp, p); case FS_IOC_RESVSP: case FS_IOC_RESVSP64: return ioctl_preallocate(filp, 0, p); case FS_IOC_UNRESVSP: case FS_IOC_UNRESVSP64: return ioctl_preallocate(filp, FALLOC_FL_PUNCH_HOLE, p); case FS_IOC_ZERO_RANGE: return ioctl_preallocate(filp, FALLOC_FL_ZERO_RANGE, p); } return -ENOIOCTLCMD; } static int ioctl_fionbio(struct file *filp, int __user *argp) { unsigned int flag; int on, error; error = get_user(on, argp); if (error) return error; flag = O_NONBLOCK; #ifdef __sparc__ /* SunOS compatibility item. */ if (O_NONBLOCK != O_NDELAY) flag |= O_NDELAY; #endif spin_lock(&filp->f_lock); if (on) filp->f_flags |= flag; else filp->f_flags &= ~flag; spin_unlock(&filp->f_lock); return error; } static int ioctl_fioasync(unsigned int fd, struct file *filp, int __user *argp) { unsigned int flag; int on, error; error = get_user(on, argp); if (error) return error; flag = on ? FASYNC : 0; /* Did FASYNC state change ? */ if ((flag ^ filp->f_flags) & FASYNC) { if (filp->f_op->fasync) /* fasync() adjusts filp->f_flags */ error = filp->f_op->fasync(fd, filp, on); else error = -ENOTTY; } return error < 0 ? error : 0; } static int ioctl_fsfreeze(struct file *filp) { struct super_block *sb = file_inode(filp)->i_sb; if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; /* If filesystem doesn't support freeze feature, return. 
*/ if (sb->s_op->freeze_fs == NULL && sb->s_op->freeze_super == NULL) return -EOPNOTSUPP; /* Freeze */ if (sb->s_op->freeze_super) return sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE, NULL); return freeze_super(sb, FREEZE_HOLDER_USERSPACE, NULL); } static int ioctl_fsthaw(struct file *filp) { struct super_block *sb = file_inode(filp)->i_sb; if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; /* Thaw */ if (sb->s_op->thaw_super) return sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE, NULL); return thaw_super(sb, FREEZE_HOLDER_USERSPACE, NULL); } static int ioctl_file_dedupe_range(struct file *file, struct file_dedupe_range __user *argp) { struct file_dedupe_range *same = NULL; int ret; unsigned long size; u16 count; if (get_user(count, &argp->dest_count)) { ret = -EFAULT; goto out; } size = struct_size(same, info, count); if (size > PAGE_SIZE) { ret = -ENOMEM; goto out; } same = memdup_user(argp, size); if (IS_ERR(same)) { ret = PTR_ERR(same); same = NULL; goto out; } same->dest_count = count; ret = vfs_dedupe_file_range(file, same); if (ret) goto out; ret = copy_to_user(argp, same, size); if (ret) ret = -EFAULT; out: kfree(same); return ret; } static int ioctl_getfsuuid(struct file *file, void __user *argp) { struct super_block *sb = file_inode(file)->i_sb; struct fsuuid2 u = { .len = sb->s_uuid_len, }; if (!sb->s_uuid_len) return -ENOTTY; memcpy(&u.uuid[0], &sb->s_uuid, sb->s_uuid_len); return copy_to_user(argp, &u, sizeof(u)) ? -EFAULT : 0; } static int ioctl_get_fs_sysfs_path(struct file *file, void __user *argp) { struct super_block *sb = file_inode(file)->i_sb; if (!strlen(sb->s_sysfs_name)) return -ENOTTY; struct fs_sysfs_path u = {}; u.len = scnprintf(u.name, sizeof(u.name), "%s/%s", sb->s_type->name, sb->s_sysfs_name); return copy_to_user(argp, &u, sizeof(u)) ? -EFAULT : 0; } /* * do_vfs_ioctl() is not for drivers and not intended to be EXPORT_SYMBOL()'d. * It's just a simple helper for sys_ioctl and compat_sys_ioctl. * * When you add any new common ioctls to the switches above and below, * please ensure they have compatible arguments in compat mode. * * The LSM mailing list should also be notified of any command additions or * changes, as specific LSMs may be affected. */ static int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; struct inode *inode = file_inode(filp); switch (cmd) { case FIOCLEX: set_close_on_exec(fd, 1); return 0; case FIONCLEX: set_close_on_exec(fd, 0); return 0; case FIONBIO: return ioctl_fionbio(filp, argp); case FIOASYNC: return ioctl_fioasync(fd, filp, argp); case FIOQSIZE: if (S_ISDIR(inode->i_mode) || (S_ISREG(inode->i_mode) && !IS_ANON_FILE(inode)) || S_ISLNK(inode->i_mode)) { loff_t res = inode_get_bytes(inode); return copy_to_user(argp, &res, sizeof(res)) ? 
-EFAULT : 0; } return -ENOTTY; case FIFREEZE: return ioctl_fsfreeze(filp); case FITHAW: return ioctl_fsthaw(filp); case FS_IOC_FIEMAP: return ioctl_fiemap(filp, argp); case FIGETBSZ: /* anon_bdev filesystems may not have a block size */ if (!inode->i_sb->s_blocksize) return -EINVAL; return put_user(inode->i_sb->s_blocksize, (int __user *)argp); case FICLONE: return ioctl_file_clone(filp, arg, 0, 0, 0); case FICLONERANGE: return ioctl_file_clone_range(filp, argp); case FIDEDUPERANGE: return ioctl_file_dedupe_range(filp, argp); case FIONREAD: if (!S_ISREG(inode->i_mode) || IS_ANON_FILE(inode)) return vfs_ioctl(filp, cmd, arg); return put_user(i_size_read(inode) - filp->f_pos, (int __user *)argp); case FS_IOC_GETFLAGS: return ioctl_getflags(filp, argp); case FS_IOC_SETFLAGS: return ioctl_setflags(filp, argp); case FS_IOC_FSGETXATTR: return ioctl_fsgetxattr(filp, argp); case FS_IOC_FSSETXATTR: return ioctl_fssetxattr(filp, argp); case FS_IOC_GETFSUUID: return ioctl_getfsuuid(filp, argp); case FS_IOC_GETFSSYSFSPATH: return ioctl_get_fs_sysfs_path(filp, argp); default: if (S_ISREG(inode->i_mode) && !IS_ANON_FILE(inode)) return file_ioctl(filp, cmd, argp); break; } return -ENOIOCTLCMD; } SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { CLASS(fd, f)(fd); int error; if (fd_empty(f)) return -EBADF; error = security_file_ioctl(fd_file(f), cmd, arg); if (error) return error; error = do_vfs_ioctl(fd_file(f), fd, cmd, arg); if (error == -ENOIOCTLCMD) error = vfs_ioctl(fd_file(f), cmd, arg); return error; } #ifdef CONFIG_COMPAT /** * compat_ptr_ioctl - generic implementation of .compat_ioctl file operation * @file: The file to operate on. * @cmd: The ioctl command number. * @arg: The argument to the ioctl. * * This is not normally called as a function, but instead set in struct * file_operations as * * .compat_ioctl = compat_ptr_ioctl, * * On most architectures, the compat_ptr_ioctl() just passes all arguments * to the corresponding ->ioctl handler. The exception is arch/s390, where * compat_ptr() clears the top bit of a 32-bit pointer value, so user space * pointers to the second 2GB alias the first 2GB, as is the case for * native 32-bit s390 user space. * * The compat_ptr_ioctl() function must therefore be used only with ioctl * functions that either ignore the argument or pass a pointer to a * compatible data type. * * If any ioctl command handled by fops->unlocked_ioctl passes a plain * integer instead of a pointer, or any of the passed data types * is incompatible between 32-bit and 64-bit architectures, a proper * handler is required instead of compat_ptr_ioctl. 
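 *
 * A sketch of that wiring (my_fops and my_unlocked_ioctl are hypothetical
 * names, not defined here):
 *
 *	static const struct file_operations my_fops = {
 *		.owner          = THIS_MODULE,
 *		.unlocked_ioctl = my_unlocked_ioctl,
 *		.compat_ioctl   = compat_ptr_ioctl,
 *	};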
*/ long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { if (!file->f_op->unlocked_ioctl) return -ENOIOCTLCMD; return file->f_op->unlocked_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); } EXPORT_SYMBOL(compat_ptr_ioctl); COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, compat_ulong_t, arg) { CLASS(fd, f)(fd); int error; if (fd_empty(f)) return -EBADF; error = security_file_ioctl_compat(fd_file(f), cmd, arg); if (error) return error; switch (cmd) { /* FICLONE takes an int argument, so don't use compat_ptr() */ case FICLONE: error = ioctl_file_clone(fd_file(f), arg, 0, 0, 0); break; #if defined(CONFIG_X86_64) /* these get messy on amd64 due to alignment differences */ case FS_IOC_RESVSP_32: case FS_IOC_RESVSP64_32: error = compat_ioctl_preallocate(fd_file(f), 0, compat_ptr(arg)); break; case FS_IOC_UNRESVSP_32: case FS_IOC_UNRESVSP64_32: error = compat_ioctl_preallocate(fd_file(f), FALLOC_FL_PUNCH_HOLE, compat_ptr(arg)); break; case FS_IOC_ZERO_RANGE_32: error = compat_ioctl_preallocate(fd_file(f), FALLOC_FL_ZERO_RANGE, compat_ptr(arg)); break; #endif /* * These access 32-bit values anyway so no further handling is * necessary. */ case FS_IOC32_GETFLAGS: case FS_IOC32_SETFLAGS: cmd = (cmd == FS_IOC32_GETFLAGS) ? FS_IOC_GETFLAGS : FS_IOC_SETFLAGS; fallthrough; /* * everything else in do_vfs_ioctl() takes either a compatible * pointer argument or no argument -- call it with a modified * argument. */ default: error = do_vfs_ioctl(fd_file(f), fd, cmd, (unsigned long)compat_ptr(arg)); if (error != -ENOIOCTLCMD) break; if (fd_file(f)->f_op->compat_ioctl) error = fd_file(f)->f_op->compat_ioctl(fd_file(f), cmd, arg); if (error == -ENOIOCTLCMD) error = -ENOTTY; break; } return error; } #endif |
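/*
 * A minimal, hypothetical sketch (not part of the kernel sources): the
 * kernel-doc above says a driver whose ioctl commands take either no
 * argument or a pointer to a 32/64-bit compatible structure can simply
 * wire up compat_ptr_ioctl() as its .compat_ioctl handler. The names
 * example_info, EXAMPLE_IOC_GET_INFO, example_dev_ioctl and example_fops
 * below are illustrative assumptions, not existing kernel symbols.
 */
#include <linux/compat.h>
#include <linux/fs.h>
#include <linux/ioctl.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/uaccess.h>

/* Fixed-layout argument: identical size and alignment on 32-bit and 64-bit. */
struct example_info {
	__u32 version;
	__u32 flags;
};

#define EXAMPLE_IOC_GET_INFO	_IOR('E', 1, struct example_info)

static long example_dev_ioctl(struct file *file, unsigned int cmd,
			      unsigned long arg)
{
	struct example_info info = { .version = 1, .flags = 0 };

	switch (cmd) {
	case EXAMPLE_IOC_GET_INFO:
		/* The argument is a plain pointer to a compat-safe struct. */
		if (copy_to_user((void __user *)arg, &info, sizeof(info)))
			return -EFAULT;
		return 0;
	default:
		return -ENOTTY;
	}
}

static const struct file_operations example_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= example_dev_ioctl,
	/* For 32-bit user space, compat_ptr_ioctl() applies compat_ptr() and forwards to .unlocked_ioctl. */
	.compat_ioctl	= compat_ptr_ioctl,
};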
// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Generic socket support routines. Memory allocators, socket lock/release * handler for protocols to use and generic option handler. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Florian La Roche, <flla@stud.uni-sb.de> * Alan Cox, <A.Cox@swansea.ac.uk> * * Fixes: * Alan Cox : Numerous verify_area() problems * Alan Cox : Connecting on a connecting socket * now returns an error for tcp. * Alan Cox : sock->protocol is set correctly. * and is not sometimes left as 0. * Alan Cox : connect handles icmp errors on a * connect properly. Unfortunately there * is a restart syscall nasty there. I * can't match BSD without hacking the C * library. Ideas urgently sought! * Alan Cox : Disallow bind() to addresses that are * not ours - especially broadcast ones!! * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, * instead they leave that for the DESTROY timer. * Alan Cox : Clean up error flag in accept * Alan Cox : TCP ack handling is buggy, the DESTROY timer * was buggy. Put a remove_sock() in the handler * for memory when we hit 0. Also altered the timer * code. The ACK stuff can wait and needs major * TCP layer surgery. * Alan Cox : Fixed TCP ack bug, removed remove sock * and fixed timer/inet_bh race. * Alan Cox : Added zapped flag for TCP * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... * Rick Sladkey : Relaxed UDP rules for matching packets. * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support * Pauline Middelink : identd support * Alan Cox : Fixed connect() taking signals I think. * Alan Cox : SO_LINGER supported * Alan Cox : Error reporting fixes * Anonymous : inet_create tidied up (sk->reuse setting) * Alan Cox : inet sockets don't set sk->type! * Alan Cox : Split socket option code * Alan Cox : Callbacks * Alan Cox : Nagle flag for Charles & Johannes stuff * Alex : Removed restriction on inet fioctl * Alan Cox : Splitting INET from NET core * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code * Alan Cox : Split IP from generic code * Alan Cox : New kfree_skbmem() * Alan Cox : Make SO_DEBUG superuser only. * Alan Cox : Allow anyone to clear SO_DEBUG * (compatibility fix) * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. * Alan Cox : Allocator for a socket is settable. * Alan Cox : SO_ERROR includes soft errors.
* Alan Cox : Allow NULL arguments on some SO_ opts * Alan Cox : Generic socket allocation to make hooks * easier (suggested by Craig Metz). * Michael Pall : SO_ERROR returns positive errno again * Steve Whitehouse: Added default destructor to free * protocol private data. * Steve Whitehouse: Added various other default routines * common to several socket families. * Chris Evans : Call suser() check last on F_SETOWN * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() * Andi Kleen : Fix write_space callback * Chris Evans : Security fixes - signedness again * Arnaldo C. Melo : cleanups, use skb_queue_purge * * To Fix: */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/unaligned.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/errqueue.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/poll.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/init.h> #include <linux/highmem.h> #include <linux/user_namespace.h> #include <linux/static_key.h> #include <linux/memcontrol.h> #include <linux/prefetch.h> #include <linux/compat.h> #include <linux/mroute.h> #include <linux/mroute6.h> #include <linux/icmpv6.h> #include <linux/uaccess.h> #include <linux/netdevice.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <linux/skbuff_ref.h> #include <net/net_namespace.h> #include <net/request_sock.h> #include <net/sock.h> #include <net/proto_memory.h> #include <linux/net_tstamp.h> #include <net/xfrm.h> #include <linux/ipsec.h> #include <net/cls_cgroup.h> #include <net/netprio_cgroup.h> #include <linux/sock_diag.h> #include <linux/filter.h> #include <net/sock_reuseport.h> #include <net/bpf_sk_storage.h> #include <trace/events/sock.h> #include <net/tcp.h> #include <net/busy_poll.h> #include <net/phonet/phonet.h> #include <linux/ethtool.h> #include <uapi/linux/pidfd.h> #include "dev.h" static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc); static void sock_def_write_space(struct sock *sk); /** * sk_ns_capable - General socket capability test * @sk: Socket to use a capability on or through * @user_ns: The user namespace of the capability to use * @cap: The capability to use * * Test to see if the opener of the socket had when the socket was * created and the current process has the capability @cap in the user * namespace @user_ns. */ bool sk_ns_capable(const struct sock *sk, struct user_namespace *user_ns, int cap) { return file_ns_capable(sk->sk_socket->file, user_ns, cap) && ns_capable(user_ns, cap); } EXPORT_SYMBOL(sk_ns_capable); /** * sk_capable - Socket global capability test * @sk: Socket to use a capability on or through * @cap: The global capability to use * * Test to see if the opener of the socket had when the socket was * created and the current process has the capability @cap in all user * namespaces. 
*/ bool sk_capable(const struct sock *sk, int cap) { return sk_ns_capable(sk, &init_user_ns, cap); } EXPORT_SYMBOL(sk_capable); /** * sk_net_capable - Network namespace socket capability test * @sk: Socket to use a capability on or through * @cap: The capability to use * * Test to see if the opener of the socket had when the socket was created * and the current process has the capability @cap over the network namespace * the socket is a member of. */ bool sk_net_capable(const struct sock *sk, int cap) { return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); } EXPORT_SYMBOL(sk_net_capable); /* * Each address family might have different locking rules, so we have * one slock key per address family and separate keys for internal and * userspace sockets. */ static struct lock_class_key af_family_keys[AF_MAX]; static struct lock_class_key af_family_kern_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; /* * Make lock validator output more readable. (we pre-construct these * strings build-time, so that runtime initialization of socket * locks is fast): */ #define _sock_locks(x) \ x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \ x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \ x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \ x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \ x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \ x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \ x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \ x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \ x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \ x "27" , x "28" , x "AF_CAN" , \ x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \ x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ x "AF_MCTP" , \ x "AF_MAX" static const char *const af_family_key_strings[AF_MAX+1] = { _sock_locks("sk_lock-") }; static const char *const af_family_slock_key_strings[AF_MAX+1] = { _sock_locks("slock-") }; static const char *const af_family_clock_key_strings[AF_MAX+1] = { _sock_locks("clock-") }; static const char *const af_family_kern_key_strings[AF_MAX+1] = { _sock_locks("k-sk_lock-") }; static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = { _sock_locks("k-slock-") }; static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = { _sock_locks("k-clock-") }; static const char *const af_family_rlock_key_strings[AF_MAX+1] = { _sock_locks("rlock-") }; static const char *const af_family_wlock_key_strings[AF_MAX+1] = { _sock_locks("wlock-") }; static const char *const af_family_elock_key_strings[AF_MAX+1] = { _sock_locks("elock-") }; /* * sk_callback_lock and sk queues locking rules are per-address-family, * so split the lock classes by using a per-AF key: */ static struct lock_class_key af_callback_keys[AF_MAX]; static struct lock_class_key af_rlock_keys[AF_MAX]; static struct lock_class_key af_wlock_keys[AF_MAX]; static struct lock_class_key af_elock_keys[AF_MAX]; static struct lock_class_key af_kern_callback_keys[AF_MAX]; /* Run time adjustable parameters. 
*/ __u32 sysctl_wmem_max __read_mostly = 4 << 20; EXPORT_SYMBOL(sysctl_wmem_max); __u32 sysctl_rmem_max __read_mostly = 4 << 20; EXPORT_SYMBOL(sysctl_rmem_max); __u32 sysctl_wmem_default __read_mostly = SK_WMEM_DEFAULT; __u32 sysctl_rmem_default __read_mostly = SK_RMEM_DEFAULT; DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); EXPORT_SYMBOL_GPL(memalloc_socks_key); /** * sk_set_memalloc - sets %SOCK_MEMALLOC * @sk: socket to set it on * * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. * It's the responsibility of the admin to adjust min_free_kbytes * to meet the requirements */ void sk_set_memalloc(struct sock *sk) { sock_set_flag(sk, SOCK_MEMALLOC); sk->sk_allocation |= __GFP_MEMALLOC; static_branch_inc(&memalloc_socks_key); } EXPORT_SYMBOL_GPL(sk_set_memalloc); void sk_clear_memalloc(struct sock *sk) { sock_reset_flag(sk, SOCK_MEMALLOC); sk->sk_allocation &= ~__GFP_MEMALLOC; static_branch_dec(&memalloc_socks_key); /* * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward * progress of swapping. SOCK_MEMALLOC may be cleared while * it has rmem allocations due to the last swapfile being deactivated * but there is a risk that the socket is unusable due to exceeding * the rmem limits. Reclaim the reserves and obey rmem limits again. */ sk_mem_reclaim(sk); } EXPORT_SYMBOL_GPL(sk_clear_memalloc); int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { int ret; unsigned int noreclaim_flag; /* these should have been dropped before queueing */ BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); noreclaim_flag = memalloc_noreclaim_save(); ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv, tcp_v6_do_rcv, tcp_v4_do_rcv, sk, skb); memalloc_noreclaim_restore(noreclaim_flag); return ret; } EXPORT_SYMBOL(__sk_backlog_rcv); void sk_error_report(struct sock *sk) { sk->sk_error_report(sk); switch (sk->sk_family) { case AF_INET: fallthrough; case AF_INET6: trace_inet_sk_error_report(sk); break; default: break; } } EXPORT_SYMBOL(sk_error_report); int sock_get_timeout(long timeo, void *optval, bool old_timeval) { struct __kernel_sock_timeval tv; if (timeo == MAX_SCHEDULE_TIMEOUT) { tv.tv_sec = 0; tv.tv_usec = 0; } else { tv.tv_sec = timeo / HZ; tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ; } if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec }; *(struct old_timeval32 *)optval = tv32; return sizeof(tv32); } if (old_timeval) { struct __kernel_old_timeval old_tv; old_tv.tv_sec = tv.tv_sec; old_tv.tv_usec = tv.tv_usec; *(struct __kernel_old_timeval *)optval = old_tv; return sizeof(old_tv); } *(struct __kernel_sock_timeval *)optval = tv; return sizeof(tv); } EXPORT_SYMBOL(sock_get_timeout); int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, sockptr_t optval, int optlen, bool old_timeval) { if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { struct old_timeval32 tv32; if (optlen < sizeof(tv32)) return -EINVAL; if (copy_from_sockptr(&tv32, optval, sizeof(tv32))) return -EFAULT; tv->tv_sec = tv32.tv_sec; tv->tv_usec = tv32.tv_usec; } else if (old_timeval) { struct __kernel_old_timeval old_tv; if (optlen < sizeof(old_tv)) return -EINVAL; if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv))) return -EFAULT; tv->tv_sec = old_tv.tv_sec; tv->tv_usec = old_tv.tv_usec; } else { if (optlen < sizeof(*tv)) return -EINVAL; if (copy_from_sockptr(tv, optval, sizeof(*tv))) return -EFAULT; } return 0; } EXPORT_SYMBOL(sock_copy_user_timeval); static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, bool 
old_timeval) { struct __kernel_sock_timeval tv; int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval); long val; if (err) return err; if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) return -EDOM; if (tv.tv_sec < 0) { static int warned __read_mostly; WRITE_ONCE(*timeo_p, 0); if (warned < 10 && net_ratelimit()) { warned++; pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", __func__, current->comm, task_pid_nr(current)); } return 0; } val = MAX_SCHEDULE_TIMEOUT; if ((tv.tv_sec || tv.tv_usec) && (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))) val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ); WRITE_ONCE(*timeo_p, val); return 0; } static bool sk_set_prio_allowed(const struct sock *sk, int val) { return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) || sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)); } static bool sock_needs_netstamp(const struct sock *sk) { switch (sk->sk_family) { case AF_UNSPEC: case AF_UNIX: return false; default: return true; } } static void sock_disable_timestamp(struct sock *sk, unsigned long flags) { if (sk->sk_flags & flags) { sk->sk_flags &= ~flags; if (sock_needs_netstamp(sk) && !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) net_disable_timestamp(); } } int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { sk_drops_inc(sk); trace_sock_rcvqueue_full(sk, skb); return -ENOMEM; } if (!sk_rmem_schedule(sk, skb, skb->truesize)) { sk_drops_inc(sk); return -ENOBUFS; } skb->dev = NULL; skb_set_owner_r(skb, sk); /* we escape from rcu protected region, make sure we dont leak * a norefcounted dst */ skb_dst_force(skb); spin_lock_irqsave(&list->lock, flags); sock_skb_set_dropcount(sk, skb); __skb_queue_tail(list, skb); spin_unlock_irqrestore(&list->lock, flags); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); return 0; } EXPORT_SYMBOL(__sock_queue_rcv_skb); enum skb_drop_reason sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason drop_reason; int err; drop_reason = sk_filter_reason(sk, skb); if (drop_reason) return drop_reason; err = __sock_queue_rcv_skb(sk, skb); switch (err) { case -ENOMEM: return SKB_DROP_REASON_SOCKET_RCVBUFF; case -ENOBUFS: return SKB_DROP_REASON_PROTO_MEM; } return SKB_NOT_DROPPED_YET; } EXPORT_SYMBOL(sock_queue_rcv_skb_reason); int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested, unsigned int trim_cap, bool refcounted) { enum skb_drop_reason reason; int rc = NET_RX_SUCCESS; int err; reason = sk_filter_trim_cap(sk, skb, trim_cap); if (reason) goto discard_and_relse; skb->dev = NULL; if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) { sk_drops_inc(sk); reason = SKB_DROP_REASON_SOCKET_RCVBUFF; goto discard_and_relse; } if (nested) bh_lock_sock_nested(sk); else bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { /* * trylock + unlock semantics: */ mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); rc = sk_backlog_rcv(sk, skb); mutex_release(&sk->sk_lock.dep_map, _RET_IP_); } else if ((err = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))) { bh_unlock_sock(sk); if (err == -ENOMEM) reason = SKB_DROP_REASON_PFMEMALLOC; if (err == -ENOBUFS) reason = SKB_DROP_REASON_SOCKET_BACKLOG; sk_drops_inc(sk); goto discard_and_relse; } bh_unlock_sock(sk); out: if (refcounted) sock_put(sk); return rc; discard_and_relse: 
sk_skb_reason_drop(sk, skb, reason); goto out; } EXPORT_SYMBOL(__sk_receive_skb); INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, u32)); INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, u32)); struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = __sk_dst_get(sk); if (dst && READ_ONCE(dst->obsolete) && INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie) == NULL) { sk_tx_queue_clear(sk); WRITE_ONCE(sk->sk_dst_pending_confirm, 0); RCU_INIT_POINTER(sk->sk_dst_cache, NULL); dst_release(dst); return NULL; } return dst; } EXPORT_SYMBOL(__sk_dst_check); struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = sk_dst_get(sk); if (dst && READ_ONCE(dst->obsolete) && INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie) == NULL) { sk_dst_reset(sk); dst_release(dst); return NULL; } return dst; } EXPORT_SYMBOL(sk_dst_check); static int sock_bindtoindex_locked(struct sock *sk, int ifindex) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES struct net *net = sock_net(sk); /* Sorry... */ ret = -EPERM; if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) goto out; ret = -EINVAL; if (ifindex < 0) goto out; /* Paired with all READ_ONCE() done locklessly. */ WRITE_ONCE(sk->sk_bound_dev_if, ifindex); if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); sk_dst_reset(sk); ret = 0; out: #endif return ret; } int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) { int ret; if (lock_sk) lock_sock(sk); ret = sock_bindtoindex_locked(sk, ifindex); if (lock_sk) release_sock(sk); return ret; } EXPORT_SYMBOL(sock_bindtoindex); static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES struct net *net = sock_net(sk); char devname[IFNAMSIZ]; int index; ret = -EINVAL; if (optlen < 0) goto out; /* Bind this socket to a particular device like "eth0", * as specified in the passed interface name. If the * name is "" or the option length is zero the socket * is not bound. */ if (optlen > IFNAMSIZ - 1) optlen = IFNAMSIZ - 1; memset(devname, 0, sizeof(devname)); ret = -EFAULT; if (copy_from_sockptr(devname, optval, optlen)) goto out; index = 0; if (devname[0] != '\0') { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_name_rcu(net, devname); if (dev) index = dev->ifindex; rcu_read_unlock(); ret = -ENODEV; if (!dev) goto out; } sockopt_lock_sock(sk); ret = sock_bindtoindex_locked(sk, index); sockopt_release_sock(sk); out: #endif return ret; } static int sock_getbindtodevice(struct sock *sk, sockptr_t optval, sockptr_t optlen, int len) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); struct net *net = sock_net(sk); char devname[IFNAMSIZ]; if (bound_dev_if == 0) { len = 0; goto zero; } ret = -EINVAL; if (len < IFNAMSIZ) goto out; ret = netdev_get_name(net, devname, bound_dev_if); if (ret) goto out; len = strlen(devname) + 1; ret = -EFAULT; if (copy_to_sockptr(optval, devname, len)) goto out; zero: ret = -EFAULT; if (copy_to_sockptr(optlen, &len, sizeof(int))) goto out; ret = 0; out: #endif return ret; } bool sk_mc_loop(const struct sock *sk) { if (dev_recursion_level()) return false; if (!sk) return true; /* IPV6_ADDRFORM can change sk->sk_family under us. 
*/ switch (READ_ONCE(sk->sk_family)) { case AF_INET: return inet_test_bit(MC_LOOP, sk); #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: return inet6_test_bit(MC6_LOOP, sk); #endif } WARN_ON_ONCE(1); return true; } EXPORT_SYMBOL(sk_mc_loop); void sock_set_reuseaddr(struct sock *sk) { lock_sock(sk); sk->sk_reuse = SK_CAN_REUSE; release_sock(sk); } EXPORT_SYMBOL(sock_set_reuseaddr); void sock_set_reuseport(struct sock *sk) { lock_sock(sk); sk->sk_reuseport = true; release_sock(sk); } EXPORT_SYMBOL(sock_set_reuseport); void sock_no_linger(struct sock *sk) { lock_sock(sk); WRITE_ONCE(sk->sk_lingertime, 0); sock_set_flag(sk, SOCK_LINGER); release_sock(sk); } EXPORT_SYMBOL(sock_no_linger); void sock_set_priority(struct sock *sk, u32 priority) { WRITE_ONCE(sk->sk_priority, priority); } EXPORT_SYMBOL(sock_set_priority); void sock_set_sndtimeo(struct sock *sk, s64 secs) { if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) WRITE_ONCE(sk->sk_sndtimeo, secs * HZ); else WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT); } EXPORT_SYMBOL(sock_set_sndtimeo); static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) { sock_valbool_flag(sk, SOCK_RCVTSTAMP, val); sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns); if (val) { sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); sock_enable_timestamp(sk, SOCK_TIMESTAMP); } } void sock_set_timestamp(struct sock *sk, int optname, bool valbool) { switch (optname) { case SO_TIMESTAMP_OLD: __sock_set_timestamps(sk, valbool, false, false); break; case SO_TIMESTAMP_NEW: __sock_set_timestamps(sk, valbool, true, false); break; case SO_TIMESTAMPNS_OLD: __sock_set_timestamps(sk, valbool, false, true); break; case SO_TIMESTAMPNS_NEW: __sock_set_timestamps(sk, valbool, true, true); break; } } static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) { struct net *net = sock_net(sk); struct net_device *dev = NULL; bool match = false; int *vclock_index; int i, num; if (sk->sk_bound_dev_if) dev = dev_get_by_index(net, sk->sk_bound_dev_if); if (!dev) { pr_err("%s: sock not bind to device\n", __func__); return -EOPNOTSUPP; } num = ethtool_get_phc_vclocks(dev, &vclock_index); dev_put(dev); for (i = 0; i < num; i++) { if (*(vclock_index + i) == phc_index) { match = true; break; } } if (num > 0) kfree(vclock_index); if (!match) return -EINVAL; WRITE_ONCE(sk->sk_bind_phc, phc_index); return 0; } int sock_set_timestamping(struct sock *sk, int optname, struct so_timestamping timestamping) { int val = timestamping.flags; int ret; if (val & ~SOF_TIMESTAMPING_MASK) return -EINVAL; if (val & SOF_TIMESTAMPING_OPT_ID_TCP && !(val & SOF_TIMESTAMPING_OPT_ID)) return -EINVAL; if (val & SOF_TIMESTAMPING_OPT_ID && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { if (sk_is_tcp(sk)) { if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) return -EINVAL; if (val & SOF_TIMESTAMPING_OPT_ID_TCP) atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq); else atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); } else { atomic_set(&sk->sk_tskey, 0); } } if (val & SOF_TIMESTAMPING_OPT_STATS && !(val & SOF_TIMESTAMPING_OPT_TSONLY)) return -EINVAL; if (val & SOF_TIMESTAMPING_BIND_PHC) { ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc); if (ret) return ret; } WRITE_ONCE(sk->sk_tsflags, val); sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY)); if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE); else sock_disable_timestamp(sk, (1UL << 
SOCK_TIMESTAMPING_RX_SOFTWARE)); return 0; } #if defined(CONFIG_CGROUP_BPF) void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op) { struct bpf_sock_ops_kern sock_ops; memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); sock_ops.op = op; sock_ops.is_fullsock = 1; sock_ops.sk = sk; bpf_skops_init_skb(&sock_ops, skb, 0); __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS); } #endif void sock_set_keepalive(struct sock *sk) { lock_sock(sk); if (sk->sk_prot->keepalive) sk->sk_prot->keepalive(sk, true); sock_valbool_flag(sk, SOCK_KEEPOPEN, true); release_sock(sk); } EXPORT_SYMBOL(sock_set_keepalive); static void __sock_set_rcvbuf(struct sock *sk, int val) { struct socket *sock = sk->sk_socket; /* Ensure val * 2 fits into an int, to prevent max_t() from treating it * as a negative value. */ val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_RCVBUF_LOCK; /* We double it on the way in to account for "struct sk_buff" etc. * overhead. Applications assume that the SO_RCVBUF setting they make * will allow that much actual data to be received on that socket. * * Applications are unaware that "struct sk_buff" and other overheads * allocate from the receive buffer during socket buffer allocation. * * And after considering the possible alternatives, returning the value * we actually used in getsockopt is the most desirable behavior. */ WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); if (sock) { const struct proto_ops *ops = READ_ONCE(sock->ops); if (ops->set_rcvbuf) ops->set_rcvbuf(sk, sk->sk_rcvbuf); } } void sock_set_rcvbuf(struct sock *sk, int val) { lock_sock(sk); __sock_set_rcvbuf(sk, val); release_sock(sk); } EXPORT_SYMBOL(sock_set_rcvbuf); static void __sock_set_mark(struct sock *sk, u32 val) { if (val != sk->sk_mark) { WRITE_ONCE(sk->sk_mark, val); sk_dst_reset(sk); } } void sock_set_mark(struct sock *sk, u32 val) { lock_sock(sk); __sock_set_mark(sk, val); release_sock(sk); } EXPORT_SYMBOL(sock_set_mark); static void sock_release_reserved_memory(struct sock *sk, int bytes) { /* Round down bytes to multiple of pages */ bytes = round_down(bytes, PAGE_SIZE); WARN_ON(bytes > sk->sk_reserved_mem); WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes); sk_mem_reclaim(sk); } static int sock_reserve_memory(struct sock *sk, int bytes) { long allocated; bool charged; int pages; if (!mem_cgroup_sk_enabled(sk) || !sk_has_account(sk)) return -EOPNOTSUPP; if (!bytes) return 0; pages = sk_mem_pages(bytes); /* pre-charge to memcg */ charged = mem_cgroup_sk_charge(sk, pages, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!charged) return -ENOMEM; if (sk->sk_bypass_prot_mem) goto success; /* pre-charge to forward_alloc */ sk_memory_allocated_add(sk, pages); allocated = sk_memory_allocated(sk); /* If the system goes into memory pressure with this * precharge, give up and return error. */ if (allocated > sk_prot_mem_limits(sk, 1)) { sk_memory_allocated_sub(sk, pages); mem_cgroup_sk_uncharge(sk, pages); return -ENOMEM; } success: sk_forward_alloc_add(sk, pages << PAGE_SHIFT); WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem + (pages << PAGE_SHIFT)); return 0; } #ifdef CONFIG_PAGE_POOL /* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED * in 1 syscall. The limit exists to limit the amount of memory the kernel * allocates to copy these tokens, and to prevent looping over the frags for * too long. 
*/ #define MAX_DONTNEED_TOKENS 128 #define MAX_DONTNEED_FRAGS 1024 static noinline_for_stack int sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen) { unsigned int num_tokens, i, j, k, netmem_num = 0; struct dmabuf_token *tokens; int ret = 0, num_frags = 0; netmem_ref netmems[16]; if (!sk_is_tcp(sk)) return -EBADF; if (optlen % sizeof(*tokens) || optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS) return -EINVAL; num_tokens = optlen / sizeof(*tokens); tokens = kvmalloc_objs(*tokens, num_tokens); if (!tokens) return -ENOMEM; if (copy_from_sockptr(tokens, optval, optlen)) { kvfree(tokens); return -EFAULT; } xa_lock_bh(&sk->sk_user_frags); for (i = 0; i < num_tokens; i++) { for (j = 0; j < tokens[i].token_count; j++) { if (++num_frags > MAX_DONTNEED_FRAGS) goto frag_limit_reached; netmem_ref netmem = (__force netmem_ref)__xa_erase( &sk->sk_user_frags, tokens[i].token_start + j); if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem))) continue; netmems[netmem_num++] = netmem; if (netmem_num == ARRAY_SIZE(netmems)) { xa_unlock_bh(&sk->sk_user_frags); for (k = 0; k < netmem_num; k++) WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); netmem_num = 0; xa_lock_bh(&sk->sk_user_frags); } ret++; } } frag_limit_reached: xa_unlock_bh(&sk->sk_user_frags); for (k = 0; k < netmem_num; k++) WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); kvfree(tokens); return ret; } #endif void sockopt_lock_sock(struct sock *sk) { /* When current->bpf_ctx is set, the setsockopt is called from * a bpf prog. bpf has ensured the sk lock has been * acquired before calling setsockopt(). */ if (has_current_bpf_ctx()) return; lock_sock(sk); } EXPORT_SYMBOL(sockopt_lock_sock); void sockopt_release_sock(struct sock *sk) { if (has_current_bpf_ctx()) return; release_sock(sk); } EXPORT_SYMBOL(sockopt_release_sock); bool sockopt_ns_capable(struct user_namespace *ns, int cap) { return has_current_bpf_ctx() || ns_capable(ns, cap); } EXPORT_SYMBOL(sockopt_ns_capable); bool sockopt_capable(int cap) { return has_current_bpf_ctx() || capable(cap); } EXPORT_SYMBOL(sockopt_capable); static int sockopt_validate_clockid(__kernel_clockid_t value) { switch (value) { case CLOCK_REALTIME: case CLOCK_MONOTONIC: case CLOCK_TAI: return 0; } return -EINVAL; } /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. */ int sk_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct so_timestamping timestamping; struct socket *sock = sk->sk_socket; struct sock_txtime sk_txtime; int val; int valbool; struct linger ling; int ret = 0; /* * Options without arguments */ if (optname == SO_BINDTODEVICE) return sock_setbindtodevice(sk, optval, optlen); if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; valbool = val ? 1 : 0; /* handle options which do not require locking the socket. 
*/ switch (optname) { case SO_PRIORITY: if (sk_set_prio_allowed(sk, val)) { sock_set_priority(sk, val); return 0; } return -EPERM; case SO_TYPE: case SO_PROTOCOL: case SO_DOMAIN: case SO_ERROR: return -ENOPROTOOPT; #ifdef CONFIG_NET_RX_BUSY_POLL case SO_BUSY_POLL: if (val < 0) return -EINVAL; WRITE_ONCE(sk->sk_ll_usec, val); return 0; case SO_PREFER_BUSY_POLL: if (valbool && !sockopt_capable(CAP_NET_ADMIN)) return -EPERM; WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); return 0; case SO_BUSY_POLL_BUDGET: if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) return -EPERM; if (val < 0 || val > U16_MAX) return -EINVAL; WRITE_ONCE(sk->sk_busy_poll_budget, val); return 0; #endif case SO_MAX_PACING_RATE: { unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val; unsigned long pacing_rate; if (sizeof(ulval) != sizeof(val) && optlen >= sizeof(ulval) && copy_from_sockptr(&ulval, optval, sizeof(ulval))) { return -EFAULT; } if (ulval != ~0UL) cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); /* Pairs with READ_ONCE() from sk_getsockopt() */ WRITE_ONCE(sk->sk_max_pacing_rate, ulval); pacing_rate = READ_ONCE(sk->sk_pacing_rate); if (ulval < pacing_rate) WRITE_ONCE(sk->sk_pacing_rate, ulval); return 0; } case SO_TXREHASH: if (!sk_is_tcp(sk)) return -EOPNOTSUPP; if (val < -1 || val > 1) return -EINVAL; if ((u8)val == SOCK_TXREHASH_DEFAULT) val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); /* Paired with READ_ONCE() in tcp_rtx_synack() * and sk_getsockopt(). */ WRITE_ONCE(sk->sk_txrehash, (u8)val); return 0; case SO_PEEK_OFF: { int (*set_peek_off)(struct sock *sk, int val); set_peek_off = READ_ONCE(sock->ops)->set_peek_off; if (set_peek_off) ret = set_peek_off(sk, val); else ret = -EOPNOTSUPP; return ret; } #ifdef CONFIG_PAGE_POOL case SO_DEVMEM_DONTNEED: return sock_devmem_dontneed(sk, optval, optlen); #endif case SO_SNDTIMEO_OLD: case SO_SNDTIMEO_NEW: return sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD); case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: return sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD); } sockopt_lock_sock(sk); switch (optname) { case SO_DEBUG: if (val && !sockopt_capable(CAP_NET_ADMIN)) ret = -EACCES; else sock_valbool_flag(sk, SOCK_DBG, valbool); break; case SO_REUSEADDR: sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); break; case SO_REUSEPORT: if (valbool && !sk_is_inet(sk)) ret = -EOPNOTSUPP; else sk->sk_reuseport = valbool; break; case SO_DONTROUTE: sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); sk_dst_reset(sk); break; case SO_BROADCAST: sock_valbool_flag(sk, SOCK_BROADCAST, valbool); break; case SO_SNDBUF: /* Don't error on this BSD doesn't and if you think * about it this is right. Otherwise apps have to * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ val = min_t(u32, val, READ_ONCE(sysctl_wmem_max)); set_sndbuf: /* Ensure val * 2 fits into an int, to prevent max_t() * from treating it as a negative value. */ val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_SNDBUF_LOCK; WRITE_ONCE(sk->sk_sndbuf, max_t(int, val * 2, SOCK_MIN_SNDBUF)); /* Wake up sending tasks if we upped the value. */ sk->sk_write_space(sk); break; case SO_SNDBUFFORCE: if (!sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; break; } /* No negative values (to prevent underflow, as val will be * multiplied by 2). */ if (val < 0) val = 0; goto set_sndbuf; case SO_RCVBUF: /* Don't error on this BSD doesn't and if you think * about it this is right. 
		 * Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
		break;
	case SO_RCVBUFFORCE:
		if (!sockopt_capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}

		/* No negative values (to prevent underflow, as val will be
		 * multiplied by 2).
		 */
		__sock_set_rcvbuf(sk, max(val, 0));
		break;
	case SO_KEEPALIVE:
		if (sk->sk_prot->keepalive)
			sk->sk_prot->keepalive(sk, valbool);
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;
	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;
	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;
	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff) {
			sock_reset_flag(sk, SOCK_LINGER);
		} else {
			unsigned long t_sec = ling.l_linger;

			if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
				WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
			else
				WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;
	case SO_BSDCOMPAT:
		break;
	case SO_TIMESTAMP_OLD:
	case SO_TIMESTAMP_NEW:
	case SO_TIMESTAMPNS_OLD:
	case SO_TIMESTAMPNS_NEW:
		sock_set_timestamp(sk, optname, valbool);
		break;
	case SO_TIMESTAMPING_NEW:
	case SO_TIMESTAMPING_OLD:
		if (optlen == sizeof(timestamping)) {
			if (copy_from_sockptr(&timestamping, optval,
					      sizeof(timestamping))) {
				ret = -EFAULT;
				break;
			}
		} else {
			memset(&timestamping, 0, sizeof(timestamping));
			timestamping.flags = val;
		}
		ret = sock_set_timestamping(sk, optname, timestamping);
		break;
	case SO_RCVLOWAT: {
		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;

		if (val < 0)
			val = INT_MAX;
		if (sock)
			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
		if (set_rcvlowat)
			ret = set_rcvlowat(sk, val);
		else
			WRITE_ONCE(sk->sk_rcvlowat, val ?
: 1); break; } case SO_ATTACH_FILTER: { struct sock_fprog fprog; ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); if (!ret) ret = sk_attach_filter(&fprog, sk); break; } case SO_ATTACH_BPF: ret = -EINVAL; if (optlen == sizeof(u32)) { u32 ufd; ret = -EFAULT; if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) break; ret = sk_attach_bpf(ufd, sk); } break; case SO_ATTACH_REUSEPORT_CBPF: { struct sock_fprog fprog; ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); if (!ret) ret = sk_reuseport_attach_filter(&fprog, sk); break; } case SO_ATTACH_REUSEPORT_EBPF: ret = -EINVAL; if (optlen == sizeof(u32)) { u32 ufd; ret = -EFAULT; if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) break; ret = sk_reuseport_attach_bpf(ufd, sk); } break; case SO_DETACH_REUSEPORT_BPF: ret = reuseport_detach_prog(sk); break; case SO_DETACH_FILTER: ret = sk_detach_filter(sk); break; case SO_LOCK_FILTER: if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) ret = -EPERM; else sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); break; case SO_MARK: if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } __sock_set_mark(sk, val); break; case SO_RCVMARK: sock_valbool_flag(sk, SOCK_RCVMARK, valbool); break; case SO_RCVPRIORITY: sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool); break; case SO_RXQ_OVFL: sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); break; case SO_WIFI_STATUS: sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); break; case SO_NOFCS: sock_valbool_flag(sk, SOCK_NOFCS, valbool); break; case SO_SELECT_ERR_QUEUE: sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); break; case SO_PASSCRED: if (sk_may_scm_recv(sk)) sk->sk_scm_credentials = valbool; else ret = -EOPNOTSUPP; break; case SO_PASSSEC: if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk)) sk->sk_scm_security = valbool; else ret = -EOPNOTSUPP; break; case SO_PASSPIDFD: if (sk_is_unix(sk)) sk->sk_scm_pidfd = valbool; else ret = -EOPNOTSUPP; break; case SO_PASSRIGHTS: if (sk_is_unix(sk)) sk->sk_scm_rights = valbool; else ret = -EOPNOTSUPP; break; case SO_INCOMING_CPU: reuseport_update_incoming_cpu(sk, val); break; case SO_CNX_ADVICE: if (val == 1) dst_negative_advice(sk); break; case SO_ZEROCOPY: if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { if (!(sk_is_tcp(sk) || (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP))) ret = -EOPNOTSUPP; } else if (sk->sk_family != PF_RDS) { ret = -EOPNOTSUPP; } if (!ret) { if (val < 0 || val > 1) ret = -EINVAL; else sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); } break; case SO_TXTIME: if (optlen != sizeof(struct sock_txtime)) { ret = -EINVAL; break; } else if (copy_from_sockptr(&sk_txtime, optval, sizeof(struct sock_txtime))) { ret = -EFAULT; break; } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { ret = -EINVAL; break; } /* CLOCK_MONOTONIC is only used by sch_fq, and this packet * scheduler has enough safe guards. 
*/ if (sk_txtime.clockid != CLOCK_MONOTONIC && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } ret = sockopt_validate_clockid(sk_txtime.clockid); if (ret) break; sock_valbool_flag(sk, SOCK_TXTIME, true); sk->sk_clockid = sk_txtime.clockid; sk->sk_txtime_deadline_mode = !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); sk->sk_txtime_report_errors = !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); break; case SO_BINDTOIFINDEX: ret = sock_bindtoindex_locked(sk, val); break; case SO_BUF_LOCK: if (val & ~SOCK_BUF_LOCK_MASK) { ret = -EINVAL; break; } sk->sk_userlocks = val | (sk->sk_userlocks & ~SOCK_BUF_LOCK_MASK); break; case SO_RESERVE_MEM: { int delta; if (val < 0) { ret = -EINVAL; break; } delta = val - sk->sk_reserved_mem; if (delta < 0) sock_release_reserved_memory(sk, -delta); else ret = sock_reserve_memory(sk, delta); break; } default: ret = -ENOPROTOOPT; break; } sockopt_release_sock(sk); return ret; } int sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { return sk_setsockopt(sock->sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_setsockopt); static const struct cred *sk_get_peer_cred(struct sock *sk) { const struct cred *cred; spin_lock(&sk->sk_peer_lock); cred = get_cred(sk->sk_peer_cred); spin_unlock(&sk->sk_peer_lock); return cred; } static void cred_to_ucred(struct pid *pid, const struct cred *cred, struct ucred *ucred) { ucred->pid = pid_vnr(pid); ucred->uid = ucred->gid = -1; if (cred) { struct user_namespace *current_ns = current_user_ns(); ucred->uid = from_kuid_munged(current_ns, cred->euid); ucred->gid = from_kgid_munged(current_ns, cred->egid); } } static int groups_to_user(sockptr_t dst, const struct group_info *src) { struct user_namespace *user_ns = current_user_ns(); int i; for (i = 0; i < src->ngroups; i++) { gid_t gid = from_kgid_munged(user_ns, src->gid[i]); if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid))) return -EFAULT; } return 0; } int sk_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen) { struct socket *sock = sk->sk_socket; union { int val; u64 val64; unsigned long ulval; struct linger ling; struct old_timeval32 tm32; struct __kernel_old_timeval tm; struct __kernel_sock_timeval stm; struct sock_txtime txtime; struct so_timestamping timestamping; } v; int lv = sizeof(int); int len; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0) return -EINVAL; memset(&v, 0, sizeof(v)); switch (optname) { case SO_DEBUG: v.val = sock_flag(sk, SOCK_DBG); break; case SO_DONTROUTE: v.val = sock_flag(sk, SOCK_LOCALROUTE); break; case SO_BROADCAST: v.val = sock_flag(sk, SOCK_BROADCAST); break; case SO_SNDBUF: v.val = READ_ONCE(sk->sk_sndbuf); break; case SO_RCVBUF: v.val = READ_ONCE(sk->sk_rcvbuf); break; case SO_REUSEADDR: v.val = sk->sk_reuse; break; case SO_REUSEPORT: v.val = sk->sk_reuseport; break; case SO_KEEPALIVE: v.val = sock_flag(sk, SOCK_KEEPOPEN); break; case SO_TYPE: v.val = sk->sk_type; break; case SO_PROTOCOL: v.val = sk->sk_protocol; break; case SO_DOMAIN: v.val = sk->sk_family; break; case SO_ERROR: v.val = -sock_error(sk); if (v.val == 0) v.val = xchg(&sk->sk_err_soft, 0); break; case SO_OOBINLINE: v.val = sock_flag(sk, SOCK_URGINLINE); break; case SO_NO_CHECK: v.val = sk->sk_no_check_tx; break; case SO_PRIORITY: v.val = READ_ONCE(sk->sk_priority); break; case SO_LINGER: lv = sizeof(v.ling); v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ; 
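		/* sk_lingertime is stored in jiffies; l_linger is reported in seconds. */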
break; case SO_BSDCOMPAT: break; case SO_TIMESTAMP_OLD: v.val = sock_flag(sk, SOCK_RCVTSTAMP) && !sock_flag(sk, SOCK_TSTAMP_NEW) && !sock_flag(sk, SOCK_RCVTSTAMPNS); break; case SO_TIMESTAMPNS_OLD: v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); break; case SO_TIMESTAMP_NEW: v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); break; case SO_TIMESTAMPNS_NEW: v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); break; case SO_TIMESTAMPING_OLD: case SO_TIMESTAMPING_NEW: lv = sizeof(v.timestamping); /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only * returning the flags when they were set through the same option. * Don't change the beviour for the old case SO_TIMESTAMPING_OLD. */ if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) { v.timestamping.flags = READ_ONCE(sk->sk_tsflags); v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc); } break; case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v, SO_RCVTIMEO_OLD == optname); break; case SO_SNDTIMEO_OLD: case SO_SNDTIMEO_NEW: lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v, SO_SNDTIMEO_OLD == optname); break; case SO_RCVLOWAT: v.val = READ_ONCE(sk->sk_rcvlowat); break; case SO_SNDLOWAT: v.val = 1; break; case SO_PASSCRED: if (!sk_may_scm_recv(sk)) return -EOPNOTSUPP; v.val = sk->sk_scm_credentials; break; case SO_PASSPIDFD: if (!sk_is_unix(sk)) return -EOPNOTSUPP; v.val = sk->sk_scm_pidfd; break; case SO_PASSRIGHTS: if (!sk_is_unix(sk)) return -EOPNOTSUPP; v.val = sk->sk_scm_rights; break; case SO_PEERCRED: { struct ucred peercred; if (len > sizeof(peercred)) len = sizeof(peercred); spin_lock(&sk->sk_peer_lock); cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); spin_unlock(&sk->sk_peer_lock); if (copy_to_sockptr(optval, &peercred, len)) return -EFAULT; goto lenout; } case SO_PEERPIDFD: { struct pid *peer_pid; struct file *pidfd_file = NULL; unsigned int flags = 0; int pidfd; if (len > sizeof(pidfd)) len = sizeof(pidfd); spin_lock(&sk->sk_peer_lock); peer_pid = get_pid(sk->sk_peer_pid); spin_unlock(&sk->sk_peer_lock); if (!peer_pid) return -ENODATA; /* The use of PIDFD_STALE requires stashing of struct pid * on pidfs with pidfs_register_pid() and only AF_UNIX * were prepared for this. */ if (sk->sk_family == AF_UNIX) flags = PIDFD_STALE; pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file); put_pid(peer_pid); if (pidfd < 0) return pidfd; if (copy_to_sockptr(optval, &pidfd, len) || copy_to_sockptr(optlen, &len, sizeof(int))) { put_unused_fd(pidfd); fput(pidfd_file); return -EFAULT; } fd_install(pidfd, pidfd_file); return 0; } case SO_PEERGROUPS: { const struct cred *cred; int ret, n; cred = sk_get_peer_cred(sk); if (!cred) return -ENODATA; n = cred->group_info->ngroups; if (len < n * sizeof(gid_t)) { len = n * sizeof(gid_t); put_cred(cred); return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE; } len = n * sizeof(gid_t); ret = groups_to_user(optval, cred->group_info); put_cred(cred); if (ret) return ret; goto lenout; } case SO_PEERNAME: { struct sockaddr_storage address; lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2); if (lv < 0) return -ENOTCONN; if (lv < len) return -EINVAL; if (copy_to_sockptr(optval, &address, len)) return -EFAULT; goto lenout; } /* Dubious BSD thing... Probably nobody even uses it, but * the UNIX standard wants it for whatever reason... 
-DaveM */ case SO_ACCEPTCONN: v.val = sk->sk_state == TCP_LISTEN; break; case SO_PASSSEC: if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk)) return -EOPNOTSUPP; v.val = sk->sk_scm_security; break; case SO_PEERSEC: return security_socket_getpeersec_stream(sock, optval, optlen, len); case SO_MARK: v.val = READ_ONCE(sk->sk_mark); break; case SO_RCVMARK: v.val = sock_flag(sk, SOCK_RCVMARK); break; case SO_RCVPRIORITY: v.val = sock_flag(sk, SOCK_RCVPRIORITY); break; case SO_RXQ_OVFL: v.val = sock_flag(sk, SOCK_RXQ_OVFL); break; case SO_WIFI_STATUS: v.val = sock_flag(sk, SOCK_WIFI_STATUS); break; case SO_PEEK_OFF: if (!READ_ONCE(sock->ops)->set_peek_off) return -EOPNOTSUPP; v.val = READ_ONCE(sk->sk_peek_off); break; case SO_NOFCS: v.val = sock_flag(sk, SOCK_NOFCS); break; case SO_BINDTODEVICE: return sock_getbindtodevice(sk, optval, optlen, len); case SO_GET_FILTER: len = sk_get_filter(sk, optval, len); if (len < 0) return len; goto lenout; case SO_LOCK_FILTER: v.val = sock_flag(sk, SOCK_FILTER_LOCKED); break; case SO_BPF_EXTENSIONS: v.val = bpf_tell_extensions(); break; case SO_SELECT_ERR_QUEUE: v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); break; #ifdef CONFIG_NET_RX_BUSY_POLL case SO_BUSY_POLL: v.val = READ_ONCE(sk->sk_ll_usec); break; case SO_PREFER_BUSY_POLL: v.val = READ_ONCE(sk->sk_prefer_busy_poll); break; #endif case SO_MAX_PACING_RATE: /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */ if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { lv = sizeof(v.ulval); v.ulval = READ_ONCE(sk->sk_max_pacing_rate); } else { /* 32bit version */ v.val = min_t(unsigned long, ~0U, READ_ONCE(sk->sk_max_pacing_rate)); } break; case SO_INCOMING_CPU: v.val = READ_ONCE(sk->sk_incoming_cpu); break; case SO_MEMINFO: { u32 meminfo[SK_MEMINFO_VARS]; sk_get_meminfo(sk, meminfo); len = min_t(unsigned int, len, sizeof(meminfo)); if (copy_to_sockptr(optval, &meminfo, len)) return -EFAULT; goto lenout; } #ifdef CONFIG_NET_RX_BUSY_POLL case SO_INCOMING_NAPI_ID: v.val = READ_ONCE(sk->sk_napi_id); /* aggregate non-NAPI IDs down to 0 */ if (!napi_id_valid(v.val)) v.val = 0; break; #endif case SO_COOKIE: lv = sizeof(u64); if (len < lv) return -EINVAL; v.val64 = sock_gen_cookie(sk); break; case SO_ZEROCOPY: v.val = sock_flag(sk, SOCK_ZEROCOPY); break; case SO_TXTIME: lv = sizeof(v.txtime); v.txtime.clockid = sk->sk_clockid; v.txtime.flags |= sk->sk_txtime_deadline_mode ? SOF_TXTIME_DEADLINE_MODE : 0; v.txtime.flags |= sk->sk_txtime_report_errors ? SOF_TXTIME_REPORT_ERRORS : 0; break; case SO_BINDTOIFINDEX: v.val = READ_ONCE(sk->sk_bound_dev_if); break; case SO_NETNS_COOKIE: lv = sizeof(u64); if (len != lv) return -EINVAL; v.val64 = sock_net(sk)->net_cookie; break; case SO_BUF_LOCK: v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK; break; case SO_RESERVE_MEM: v.val = READ_ONCE(sk->sk_reserved_mem); break; case SO_TXREHASH: if (!sk_is_tcp(sk)) return -EOPNOTSUPP; /* Paired with WRITE_ONCE() in sk_setsockopt() */ v.val = READ_ONCE(sk->sk_txrehash); break; default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). */ return -ENOPROTOOPT; } if (len > lv) len = lv; if (copy_to_sockptr(optval, &v, len)) return -EFAULT; lenout: if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; return 0; } /* * Initialize an sk_lock. * * (We also register the sk_lock with the lock validator.) 
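 * Kernel sockets (sk->sk_kern_sock) are given the af_family_kern_* key
 * classes so lockdep keeps them separate from user sockets of the same
 * family.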
*/ static inline void sock_lock_init(struct sock *sk) { sk_owner_clear(sk); if (sk->sk_kern_sock) sock_lock_init_class_and_name( sk, af_family_kern_slock_key_strings[sk->sk_family], af_family_kern_slock_keys + sk->sk_family, af_family_kern_key_strings[sk->sk_family], af_family_kern_keys + sk->sk_family); else sock_lock_init_class_and_name( sk, af_family_slock_key_strings[sk->sk_family], af_family_slock_keys + sk->sk_family, af_family_key_strings[sk->sk_family], af_family_keys + sk->sk_family); } /* * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, * even temporarily, because of RCU lookups. sk_node should also be left as is. * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end */ static void sock_copy(struct sock *nsk, const struct sock *osk) { const struct proto *prot = READ_ONCE(osk->sk_prot); #ifdef CONFIG_SECURITY_NETWORK void *sptr = nsk->sk_security; #endif /* If we move sk_tx_queue_mapping out of the private section, * we must check if sk_tx_queue_clear() is called after * sock_copy() in sk_clone_lock(). */ BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) < offsetof(struct sock, sk_dontcopy_begin) || offsetof(struct sock, sk_tx_queue_mapping) >= offsetof(struct sock, sk_dontcopy_end)); memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, prot->obj_size - offsetof(struct sock, sk_dontcopy_end), /* alloc is larger than struct, see sk_prot_alloc() */); #ifdef CONFIG_SECURITY_NETWORK nsk->sk_security = sptr; security_sk_clone(osk, nsk); #endif } static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, int family) { struct sock *sk; struct kmem_cache *slab; slab = prot->slab; if (slab != NULL) { sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); if (!sk) return sk; if (want_init_on_alloc(priority)) sk_prot_clear_nulls(sk, prot->obj_size); } else sk = kmalloc(prot->obj_size, priority); if (sk != NULL) { if (security_sk_alloc(sk, family, priority)) goto out_free; if (!try_module_get(prot->owner)) goto out_free_sec; } return sk; out_free_sec: security_sk_free(sk); out_free: if (slab != NULL) kmem_cache_free(slab, sk); else kfree(sk); return NULL; } static void sk_prot_free(struct proto *prot, struct sock *sk) { struct kmem_cache *slab; struct module *owner; owner = prot->owner; slab = prot->slab; cgroup_sk_free(&sk->sk_cgrp_data); mem_cgroup_sk_free(sk); security_sk_free(sk); sk_owner_put(sk); if (slab != NULL) kmem_cache_free(slab, sk); else kfree(sk); module_put(owner); } /** * sk_alloc - All socket objects are allocated here * @net: the applicable net namespace * @family: protocol family * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * @prot: struct proto associated with this new sock instance * @kern: is this to be a kernel socket? */ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern) { struct sock *sk; sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); if (sk) { sk->sk_family = family; /* * See comment in struct sock definition to understand * why we need sk_prot_creator -acme */ sk->sk_prot = sk->sk_prot_creator = prot; if (READ_ONCE(net->core.sysctl_bypass_prot_mem)) sk->sk_bypass_prot_mem = 1; sk->sk_kern_sock = kern; sock_lock_init(sk); sk->sk_net_refcnt = kern ? 
0 : 1; if (likely(sk->sk_net_refcnt)) { get_net_track(net, &sk->ns_tracker, priority); sock_inuse_add(net, 1); } else { net_passive_inc(net); __netns_tracker_alloc(net, &sk->ns_tracker, false, priority); } sock_net_set(sk, net); refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS); mem_cgroup_sk_alloc(sk); cgroup_sk_alloc(&sk->sk_cgrp_data); sock_update_classid(&sk->sk_cgrp_data); sock_update_netprioidx(&sk->sk_cgrp_data); sk_tx_queue_clear(sk); } return sk; } EXPORT_SYMBOL(sk_alloc); /* Sockets having SOCK_RCU_FREE will call this function after one RCU * grace period. This is the case for UDP sockets and TCP listeners. */ static void __sk_destruct(struct rcu_head *head) { struct sock *sk = container_of(head, struct sock, sk_rcu); struct net *net = sock_net(sk); struct sk_filter *filter; if (sk->sk_destruct) sk->sk_destruct(sk); filter = rcu_dereference_check(sk->sk_filter, refcount_read(&sk->sk_wmem_alloc) == 0); if (filter) { sk_filter_uncharge(sk, filter); RCU_INIT_POINTER(sk->sk_filter, NULL); } sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); #ifdef CONFIG_BPF_SYSCALL bpf_sk_storage_free(sk); #endif if (atomic_read(&sk->sk_omem_alloc)) pr_debug("%s: optmem leakage (%d bytes) detected\n", __func__, atomic_read(&sk->sk_omem_alloc)); if (sk->sk_frag.page) { put_page(sk->sk_frag.page); sk->sk_frag.page = NULL; } /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); if (likely(sk->sk_net_refcnt)) { put_net_track(net, &sk->ns_tracker); } else { __netns_tracker_free(net, &sk->ns_tracker, false); net_passive_dec(net); } sk_prot_free(sk->sk_prot_creator, sk); } void sk_net_refcnt_upgrade(struct sock *sk) { struct net *net = sock_net(sk); WARN_ON_ONCE(sk->sk_net_refcnt); __netns_tracker_free(net, &sk->ns_tracker, false); net_passive_dec(net); sk->sk_net_refcnt = 1; get_net_track(net, &sk->ns_tracker, GFP_KERNEL); sock_inuse_add(net, 1); } EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade); void sk_destruct(struct sock *sk) { bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); if (rcu_access_pointer(sk->sk_reuseport_cb)) { reuseport_detach_sock(sk); use_call_rcu = true; } if (use_call_rcu) call_rcu(&sk->sk_rcu, __sk_destruct); else __sk_destruct(&sk->sk_rcu); } static void __sk_free(struct sock *sk) { if (likely(sk->sk_net_refcnt)) sock_inuse_add(sock_net(sk), -1); if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) sock_diag_broadcast_destroy(sk); else sk_destruct(sk); } void sk_free(struct sock *sk) { /* * We subtract one from sk_wmem_alloc and can know if * some packets are still in some tx queue. 
* If not null, sock_wfree() will call __sk_free(sk) later */ if (refcount_dec_and_test(&sk->sk_wmem_alloc)) __sk_free(sk); } EXPORT_SYMBOL(sk_free); static void sk_init_common(struct sock *sk) { skb_queue_head_init(&sk->sk_receive_queue); skb_queue_head_init(&sk->sk_write_queue); skb_queue_head_init(&sk->sk_error_queue); rwlock_init(&sk->sk_callback_lock); lockdep_set_class_and_name(&sk->sk_receive_queue.lock, af_rlock_keys + sk->sk_family, af_family_rlock_key_strings[sk->sk_family]); lockdep_set_class_and_name(&sk->sk_write_queue.lock, af_wlock_keys + sk->sk_family, af_family_wlock_key_strings[sk->sk_family]); lockdep_set_class_and_name(&sk->sk_error_queue.lock, af_elock_keys + sk->sk_family, af_family_elock_key_strings[sk->sk_family]); if (sk->sk_kern_sock) lockdep_set_class_and_name(&sk->sk_callback_lock, af_kern_callback_keys + sk->sk_family, af_family_kern_clock_key_strings[sk->sk_family]); else lockdep_set_class_and_name(&sk->sk_callback_lock, af_callback_keys + sk->sk_family, af_family_clock_key_strings[sk->sk_family]); } /** * sk_clone - clone a socket * @sk: the socket to clone * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * @lock: if true, lock the cloned sk * * If @lock is true, the clone is locked by bh_lock_sock(), and * caller must unlock socket even in error path by bh_unlock_sock(). */ struct sock *sk_clone(const struct sock *sk, const gfp_t priority, bool lock) { struct proto *prot = READ_ONCE(sk->sk_prot); struct sk_filter *filter; bool is_charged = true; struct sock *newsk; newsk = sk_prot_alloc(prot, priority, sk->sk_family); if (!newsk) goto out; sock_copy(newsk, sk); newsk->sk_prot_creator = prot; /* SANITY */ if (likely(newsk->sk_net_refcnt)) { get_net_track(sock_net(newsk), &newsk->ns_tracker, priority); sock_inuse_add(sock_net(newsk), 1); } else { /* Kernel sockets are not elevating the struct net refcount. * Instead, use a tracker to more easily detect if a layer * is not properly dismantling its kernel sockets at netns * destroy time. */ net_passive_inc(sock_net(newsk)); __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, false, priority); } sk_node_init(&newsk->sk_node); sock_lock_init(newsk); if (lock) bh_lock_sock(newsk); newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; newsk->sk_backlog.len = 0; atomic_set(&newsk->sk_rmem_alloc, 0); refcount_set(&newsk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS); atomic_set(&newsk->sk_omem_alloc, 0); sk_init_common(newsk); newsk->sk_dst_cache = NULL; newsk->sk_dst_pending_confirm = 0; newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; newsk->sk_reserved_mem = 0; DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters); sk_drops_reset(newsk); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; atomic_set(&newsk->sk_zckey, 0); sock_reset_flag(newsk, SOCK_DONE); #ifdef CONFIG_MEMCG /* sk->sk_memcg will be populated at accept() time */ newsk->sk_memcg = NULL; #endif cgroup_sk_clone(&newsk->sk_cgrp_data); rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter != NULL) /* though it's an empty new sock, the charging may fail * if sysctl_optmem_max was changed between creation of * original socket and cloning */ is_charged = sk_filter_charge(newsk, filter); RCU_INIT_POINTER(newsk->sk_filter, filter); rcu_read_unlock(); if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { /* We need to make sure that we don't uncharge the new * socket if we couldn't charge it in the first place * as otherwise we uncharge the parent's filter. 
*/ if (!is_charged) RCU_INIT_POINTER(newsk->sk_filter, NULL); goto free; } RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); if (bpf_sk_storage_clone(sk, newsk)) goto free; /* Clear sk_user_data if parent had the pointer tagged * as not suitable for copying when cloning. */ if (sk_user_data_is_nocopy(newsk)) newsk->sk_user_data = NULL; newsk->sk_err = 0; newsk->sk_err_soft = 0; newsk->sk_priority = 0; newsk->sk_incoming_cpu = raw_smp_processor_id(); /* Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.rst for details) */ smp_wmb(); refcount_set(&newsk->sk_refcnt, 2); sk_set_socket(newsk, NULL); sk_tx_queue_clear(newsk); sk_rx_queue_clear(newsk); RCU_INIT_POINTER(newsk->sk_wq, NULL); if (newsk->sk_prot->sockets_allocated) sk_sockets_allocated_inc(newsk); if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP) net_enable_timestamp(); out: return newsk; free: /* It is still raw copy of parent, so invalidate * destructor and make plain sk_free() */ newsk->sk_destruct = NULL; if (lock) bh_unlock_sock(newsk); sk_free(newsk); newsk = NULL; goto out; } EXPORT_SYMBOL_GPL(sk_clone); static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev) { bool is_ipv6 = false; u32 max_size; #if IS_ENABLED(CONFIG_IPV6) is_ipv6 = (sk->sk_family == AF_INET6 && !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); #endif /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ max_size = is_ipv6 ? READ_ONCE(dev->gso_max_size) : READ_ONCE(dev->gso_ipv4_max_size); if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) max_size = GSO_LEGACY_MAX_SIZE; return max_size - (MAX_TCP_HEADER + 1); } void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { const struct net_device *dev; u32 max_segs = 1; rcu_read_lock(); dev = dst_dev_rcu(dst); sk->sk_route_caps = dev->features; if (sk_is_tcp(sk)) { struct inet_connection_sock *icsk = inet_csk(sk); sk->sk_route_caps |= NETIF_F_GSO; icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK); } if (sk->sk_route_caps & NETIF_F_GSO) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; if (unlikely(sk->sk_gso_disabled)) sk->sk_route_caps &= ~NETIF_F_GSO_MASK; if (sk_can_gso(sk)) { if (dst->header_len && !xfrm_dst_offload_ok(dst)) { sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1); } } sk->sk_gso_max_segs = max_segs; sk_dst_set(sk, dst); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(sk_setup_caps); /* * Simple resource managers for sockets. */ /* * Write buffer destructor automatically called from kfree_skb. 
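 * It uncharges skb->truesize from sk_wmem_alloc, wakes writers via
 * sk_write_space() (unless SOCK_USE_WRITE_QUEUE is set) and frees the
 * socket once the last in-flight byte has been released.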
*/ void sock_wfree(struct sk_buff *skb) { unsigned int len = skb->truesize; struct sock *sk = skb->sk; bool free; int old; if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { if (sock_flag(sk, SOCK_RCU_FREE) && sk->sk_write_space == sock_def_write_space) { rcu_read_lock(); free = __refcount_sub_and_test(len, &sk->sk_wmem_alloc, &old); sock_def_write_space_wfree(sk, old - len); rcu_read_unlock(); if (unlikely(free)) __sk_free(sk); return; } /* * Keep a reference on sk_wmem_alloc, this will be released * after sk_write_space() call */ WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); sk->sk_write_space(sk); len = 1; } /* * if sk_wmem_alloc reaches 0, we must finish what sk_free() * could not do because of in-flight packets */ if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) __sk_free(sk); } EXPORT_SYMBOL(sock_wfree); /* This variant of sock_wfree() is used by TCP, * since it sets SOCK_USE_WRITE_QUEUE. */ void __sock_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) __sk_free(sk); } void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) { int old_wmem; skb_orphan(skb); #ifdef CONFIG_INET if (unlikely(!sk_fullsock(sk))) return skb_set_owner_edemux(skb, sk); #endif skb->sk = sk; skb->destructor = sock_wfree; skb_set_hash_from_sk(skb, sk); /* * We used to take a refcount on sk, but following operation * is enough to guarantee sk_free() won't free this sock until * all in-flight packets are completed */ __refcount_add(skb->truesize, &sk->sk_wmem_alloc, &old_wmem); /* (old_wmem == SK_WMEM_ALLOC_BIAS) if no other TX packet for this socket * is in a host queue (qdisc, NIC queue). * Set skb->ooo_okay so that netdev_pick_tx() can choose a TX queue * based on XPS for better performance. * Otherwise clear ooo_okay to not risk Out Of Order delivery. */ skb->ooo_okay = (old_wmem == SK_WMEM_ALLOC_BIAS); } EXPORT_SYMBOL(skb_set_owner_w); static bool can_skb_orphan_partial(const struct sk_buff *skb) { /* Drivers depend on in-order delivery for crypto offload, * partial orphan breaks out-of-order-OK logic. */ if (skb_is_decrypted(skb)) return false; return (skb->destructor == sock_wfree || (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); } /* This helper is used by netem, as it can hold packets in its * delay queue. We want to allow the owner socket to send more * packets, as if they were already TX completed by a typical driver. * But we also want to keep skb->sk set because some packet schedulers * rely on it (sch_fq for example). */ void skb_orphan_partial(struct sk_buff *skb) { if (skb_is_tcp_pure_ack(skb)) return; if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk)) return; skb_orphan(skb); } EXPORT_SYMBOL(skb_orphan_partial); /* * Read buffer destructor automatically called from kfree_skb. */ void sock_rfree(struct sk_buff *skb) { struct sock *sk = skb->sk; unsigned int len = skb->truesize; atomic_sub(len, &sk->sk_rmem_alloc); sk_mem_uncharge(sk, len); } EXPORT_SYMBOL(sock_rfree); /* * Buffer destructor for skbs that are not used directly in read or write * path, e.g. for error handler skbs. Automatically called from kfree_skb. */ void sock_efree(struct sk_buff *skb) { sock_put(skb->sk); } EXPORT_SYMBOL(sock_efree); /* Buffer destructor for prefetch/receive path where reference count may * not be held, e.g. for listen sockets. 
*/ #ifdef CONFIG_INET void sock_pfree(struct sk_buff *skb) { struct sock *sk = skb->sk; if (!sk_is_refcounted(sk)) return; if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) { inet_reqsk(sk)->rsk_listener = NULL; reqsk_free(inet_reqsk(sk)); return; } sock_gen_put(sk); } EXPORT_SYMBOL(sock_pfree); #endif /* CONFIG_INET */ /* * Allocate a skb from the socket's send buffer. */ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority) { if (force || refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { struct sk_buff *skb = alloc_skb(size, priority); if (skb) { skb_set_owner_w(skb, sk); return skb; } } return NULL; } EXPORT_SYMBOL(sock_wmalloc); static void sock_ofree(struct sk_buff *skb) { struct sock *sk = skb->sk; atomic_sub(skb->truesize, &sk->sk_omem_alloc); } struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, gfp_t priority) { struct sk_buff *skb; /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) return NULL; skb = alloc_skb(size, priority); if (!skb) return NULL; atomic_add(skb->truesize, &sk->sk_omem_alloc); skb->sk = sk; skb->destructor = sock_ofree; return skb; } /* * Allocate a memory block from the socket's option memory buffer. */ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) { int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); if ((unsigned int)size <= optmem_max && atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { void *mem; /* First do the add, to avoid the race if kmalloc * might sleep. */ atomic_add(size, &sk->sk_omem_alloc); mem = kmalloc(size, priority); if (mem) return mem; atomic_sub(size, &sk->sk_omem_alloc); } return NULL; } EXPORT_SYMBOL(sock_kmalloc); /* * Duplicate the input "src" memory block using the socket's * option memory buffer. */ void *sock_kmemdup(struct sock *sk, const void *src, int size, gfp_t priority) { void *mem; mem = sock_kmalloc(sk, size, priority); if (mem) memcpy(mem, src, size); return mem; } EXPORT_SYMBOL(sock_kmemdup); /* Free an option memory block. Note, we actually want the inline * here as this allows gcc to detect the nullify and fold away the * condition entirely. */ static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, const bool nullify) { if (WARN_ON_ONCE(!mem)) return; if (nullify) kfree_sensitive(mem); else kfree(mem); atomic_sub(size, &sk->sk_omem_alloc); } void sock_kfree_s(struct sock *sk, void *mem, int size) { __sock_kfree_s(sk, mem, size, false); } EXPORT_SYMBOL(sock_kfree_s); void sock_kzfree_s(struct sock *sk, void *mem, int size) { __sock_kfree_s(sk, mem, size, true); } EXPORT_SYMBOL(sock_kzfree_s); /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. I think, these locks should be removed for datagram sockets. 
*/ static long sock_wait_for_wmem(struct sock *sk, long timeo) { DEFINE_WAIT(wait); sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); for (;;) { if (!timeo) break; if (signal_pending(current)) break; set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) break; if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) break; if (READ_ONCE(sk->sk_err)) break; timeo = schedule_timeout(timeo); } finish_wait(sk_sleep(sk), &wait); return timeo; } /* * Generic send/receive buffer handlers */ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, int *errcode, int max_page_order) { struct sk_buff *skb; long timeo; int err; timeo = sock_sndtimeo(sk, noblock); for (;;) { err = sock_error(sk); if (err != 0) goto failure; err = -EPIPE; if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) goto failure; if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) break; sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = -EAGAIN; if (!timeo) goto failure; if (signal_pending(current)) goto interrupted; timeo = sock_wait_for_wmem(sk, timeo); } skb = alloc_skb_with_frags(header_len, data_len, max_page_order, errcode, sk->sk_allocation); if (skb) skb_set_owner_w(skb, sk); return skb; interrupted: err = sock_intr_errno(timeo); failure: *errcode = err; return NULL; } EXPORT_SYMBOL(sock_alloc_send_pskb); int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, struct sockcm_cookie *sockc) { u32 tsflags; BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31)); switch (cmsg->cmsg_type) { case SO_MARK: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) return -EINVAL; sockc->mark = *(u32 *)CMSG_DATA(cmsg); break; case SO_TIMESTAMPING_OLD: case SO_TIMESTAMPING_NEW: if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) return -EINVAL; tsflags = *(u32 *)CMSG_DATA(cmsg); if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) return -EINVAL; sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; sockc->tsflags |= tsflags; break; case SCM_TXTIME: if (!sock_flag(sk, SOCK_TXTIME)) return -EINVAL; if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) return -EINVAL; sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); break; case SCM_TS_OPT_ID: if (sk_is_tcp(sk)) return -EINVAL; tsflags = READ_ONCE(sk->sk_tsflags); if (!(tsflags & SOF_TIMESTAMPING_OPT_ID)) return -EINVAL; if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) return -EINVAL; sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg); sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID; break; /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. 
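	 * They are accepted as no-ops here so a combined control message does
	 * not fail; af_unix processes them through its own scm handling.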
*/ case SCM_RIGHTS: case SCM_CREDENTIALS: break; case SO_PRIORITY: if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) return -EINVAL; if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg))) return -EPERM; sockc->priority = *(u32 *)CMSG_DATA(cmsg); break; case SCM_DEVMEM_DMABUF: if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) return -EINVAL; sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg); break; default: return -EINVAL; } return 0; } EXPORT_SYMBOL(__sock_cmsg_send); int sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct sockcm_cookie *sockc) { struct cmsghdr *cmsg; int ret; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_SOCKET) continue; ret = __sock_cmsg_send(sk, cmsg, sockc); if (ret) return ret; } return 0; } EXPORT_SYMBOL(sock_cmsg_send); static void sk_enter_memory_pressure(struct sock *sk) { if (!sk->sk_prot->enter_memory_pressure) return; sk->sk_prot->enter_memory_pressure(sk); } static void sk_leave_memory_pressure(struct sock *sk) { if (sk->sk_prot->leave_memory_pressure) { INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure, tcp_leave_memory_pressure, sk); } else { unsigned long *memory_pressure = sk->sk_prot->memory_pressure; if (memory_pressure && READ_ONCE(*memory_pressure)) WRITE_ONCE(*memory_pressure, 0); } } DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); /** * skb_page_frag_refill - check that a page_frag contains enough room * @sz: minimum size of the fragment we want to get * @pfrag: pointer to page_frag * @gfp: priority for memory allocation * * Note: While this allocator tries to use high order pages, there is * no guarantee that allocations succeed. Therefore, @sz MUST be * less or equal than PAGE_SIZE. */ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) { if (pfrag->page) { if (page_ref_count(pfrag->page) == 1) { pfrag->offset = 0; return true; } if (pfrag->offset + sz <= pfrag->size) return true; put_page(pfrag->page); } pfrag->offset = 0; if (SKB_FRAG_PAGE_ORDER && !static_branch_unlikely(&net_high_order_alloc_disable_key)) { /* Avoid direct reclaim but allow kswapd to wake */ pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY, SKB_FRAG_PAGE_ORDER); if (likely(pfrag->page)) { pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; return true; } } pfrag->page = alloc_page(gfp); if (likely(pfrag->page)) { pfrag->size = PAGE_SIZE; return true; } return false; } EXPORT_SYMBOL(skb_page_frag_refill); bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) { if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) return true; if (!sk->sk_bypass_prot_mem) sk_enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); return false; } EXPORT_SYMBOL(sk_page_frag_refill); static void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { DEFINE_WAIT(wait); for (;;) { prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, TASK_UNINTERRUPTIBLE); spin_unlock_bh(&sk->sk_lock.slock); schedule(); spin_lock_bh(&sk->sk_lock.slock); if (!sock_owned_by_user(sk)) break; } finish_wait(&sk->sk_lock.wq, &wait); } void __release_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { struct sk_buff *skb, *next; int nb = 0; while ((skb = sk->sk_backlog.head) != NULL) { sk->sk_backlog.head = sk->sk_backlog.tail = NULL; spin_unlock_bh(&sk->sk_lock.slock); while (1) { next = skb->next; prefetch(next); DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb)); skb_mark_not_on_list(skb); 
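			/* Process this skb while we still own the socket lock;
			 * cond_resched() below yields every 16 packets.
			 */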
sk_backlog_rcv(sk, skb); skb = next; if (!skb) break; if (!(++nb & 15)) cond_resched(); } spin_lock_bh(&sk->sk_lock.slock); } /* * Doing the zeroing here guarantee we can not loop forever * while a wild producer attempts to flood us. */ sk->sk_backlog.len = 0; } void __sk_flush_backlog(struct sock *sk) { spin_lock_bh(&sk->sk_lock.slock); __release_sock(sk); if (sk->sk_prot->release_cb) INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, tcp_release_cb, sk); spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL_GPL(__sk_flush_backlog); /** * sk_wait_data - wait for data to arrive at sk_receive_queue * @sk: sock to wait on * @timeo: for how long * @skb: last skb seen on sk_receive_queue * * Now socket state including sk->sk_err is changed only under lock, * hence we may omit checks after joining wait queue. * We check receive queue before schedule() only as optimization; * it is very likely that release_sock() added new data. */ int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) { DEFINE_WAIT_FUNC(wait, woken_wake_function); int rc; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); return rc; } EXPORT_SYMBOL(sk_wait_data); /** * __sk_mem_raise_allocated - increase memory_allocated * @sk: socket * @size: memory size to allocate * @amt: pages to allocate * @kind: allocation type * * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc. * * Unlike the globally shared limits among the sockets under same protocol, * consuming the budget of a memcg won't have direct effect on other ones. * So be optimistic about memcg's tolerance, and leave the callers to decide * whether or not to raise allocated through sk_under_memory_pressure() or * its variants. */ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { bool memcg_enabled = false, charged = false; struct proto *prot = sk->sk_prot; long allocated = 0; if (!sk->sk_bypass_prot_mem) { sk_memory_allocated_add(sk, amt); allocated = sk_memory_allocated(sk); } if (mem_cgroup_sk_enabled(sk)) { memcg_enabled = true; charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge()); if (!charged) goto suppress_allocation; } if (!allocated) return 1; /* Under limit. */ if (allocated <= sk_prot_mem_limits(sk, 0)) { sk_leave_memory_pressure(sk); return 1; } /* Under pressure. */ if (allocated > sk_prot_mem_limits(sk, 1)) sk_enter_memory_pressure(sk); /* Over hard limit. */ if (allocated > sk_prot_mem_limits(sk, 2)) goto suppress_allocation; /* Guarantee minimum buffer size under pressure (either global * or memcg) to make sure features described in RFC 7323 (TCP * Extensions for High Performance) work properly. * * This rule does NOT stand when exceeds global or memcg's hard * limit, or else a DoS attack can be taken place by spawning * lots of sockets whose usage are under minimum buffer size. */ if (kind == SK_MEM_RECV) { if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) return 1; } else { /* SK_MEM_SEND */ int wmem0 = sk_get_wmem0(sk, prot); if (sk->sk_type == SOCK_STREAM) { if (sk->sk_wmem_queued < wmem0) return 1; } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { return 1; } } if (sk_has_memory_pressure(sk)) { u64 alloc; /* The following 'average' heuristic is within the * scope of global accounting, so it only makes * sense for global memory pressure. 
*/ if (!sk_under_global_memory_pressure(sk)) return 1; /* Try to be fair among all the sockets under global * pressure by allowing the ones that below average * usage to raise. */ alloc = sk_sockets_allocated_read_positive(sk); if (sk_prot_mem_limits(sk, 2) > alloc * sk_mem_pages(sk->sk_wmem_queued + atomic_read(&sk->sk_rmem_alloc) + sk->sk_forward_alloc)) return 1; } suppress_allocation: if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { sk_stream_moderate_sndbuf(sk); /* Fail only if socket is _under_ its sndbuf. * In this case we cannot block, so that we have to fail. */ if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { /* Force charge with __GFP_NOFAIL */ if (memcg_enabled && !charged) mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL); return 1; } } trace_sock_exceed_buf_limit(sk, prot, allocated, kind); if (allocated) sk_memory_allocated_sub(sk, amt); if (charged) mem_cgroup_sk_uncharge(sk, amt); return 0; } /** * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated * @sk: socket * @size: memory size to allocate * @kind: allocation type * * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means * rmem allocation. This function assumes that protocols which have * memory_pressure use sk_wmem_queued as write buffer accounting. */ int __sk_mem_schedule(struct sock *sk, int size, int kind) { int ret, amt = sk_mem_pages(size); sk_forward_alloc_add(sk, amt << PAGE_SHIFT); ret = __sk_mem_raise_allocated(sk, size, amt, kind); if (!ret) sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT)); return ret; } EXPORT_SYMBOL(__sk_mem_schedule); /** * __sk_mem_reduce_allocated - reclaim memory_allocated * @sk: socket * @amount: number of quanta * * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc */ void __sk_mem_reduce_allocated(struct sock *sk, int amount) { if (mem_cgroup_sk_enabled(sk)) mem_cgroup_sk_uncharge(sk, amount); if (sk->sk_bypass_prot_mem) return; sk_memory_allocated_sub(sk, amount); if (sk_under_global_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) sk_leave_memory_pressure(sk); } /** * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated * @sk: socket * @amount: number of bytes (rounded down to a PAGE_SIZE multiple) */ void __sk_mem_reclaim(struct sock *sk, int amount) { amount >>= PAGE_SHIFT; sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT)); __sk_mem_reduce_allocated(sk, amount); } EXPORT_SYMBOL(__sk_mem_reclaim); void __sk_charge(struct sock *sk, gfp_t gfp) { int amt; gfp |= __GFP_NOFAIL; if (mem_cgroup_from_sk(sk)) { /* The socket has not been accepted yet, no need * to look at newsk->sk_wmem_queued. */ amt = sk_mem_pages(sk->sk_forward_alloc + atomic_read(&sk->sk_rmem_alloc)); if (amt) mem_cgroup_sk_charge(sk, amt, gfp); } kmem_cache_charge(sk, gfp); } int sk_set_peek_off(struct sock *sk, int val) { WRITE_ONCE(sk->sk_peek_off, val); return 0; } EXPORT_SYMBOL_GPL(sk_set_peek_off); /* * Set of default routines for initialising struct proto_ops when * the protocol does not support a particular function. In certain * cases where it makes no sense for a protocol to have a "do nothing" * function, some default processing is provided. 
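 * All of them return -EOPNOTSUPP except sock_no_mmap(), which mirrors the
 * -ENODEV of a missing file mmap method.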
*/ int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_bind); int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr, int len, int flags) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_connect); int sock_no_socketpair(struct socket *sock1, struct socket *sock2) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_socketpair); int sock_no_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_accept); int sock_no_getname(struct socket *sock, struct sockaddr *saddr, int peer) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_getname); int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_ioctl); int sock_no_listen(struct socket *sock, int backlog) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_listen); int sock_no_shutdown(struct socket *sock, int how) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_shutdown); int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_sendmsg); int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_sendmsg_locked); int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_recvmsg); int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { /* Mirror missing mmap method error code */ return -ENODEV; } EXPORT_SYMBOL(sock_no_mmap); /* * When a file is received (via SCM_RIGHTS, etc), we must bump the * various sock-based usage counts. */ void __receive_sock(struct file *file) { struct socket *sock; sock = sock_from_file(file); if (sock) { sock_update_netprioidx(&sock->sk->sk_cgrp_data); sock_update_classid(&sock->sk->sk_cgrp_data); } } /* * Default Socket Callbacks */ static void sock_def_wakeup(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_all(&wq->wait); rcu_read_unlock(); } static void sock_def_error_report(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, EPOLLERR); sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR); rcu_read_unlock(); } void sock_def_readable(struct sock *sk) { struct socket_wq *wq; trace_sk_data_ready(sk); rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | EPOLLRDNORM | EPOLLRDBAND); sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } static void sock_def_write_space(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); /* Do not wake up a writer until he can make "significant" * progress. --DaveM */ if (sock_writeable(sk)) { wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } /* An optimised version of sock_def_write_space(), should only be called * for SOCK_RCU_FREE sockets under RCU read section and after putting * ->sk_wmem_alloc. */ static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc) { /* Do not wake up a writer until he can make "significant" * progress. 
--DaveM */ if (__sock_writeable(sk, wmem_alloc)) { struct socket_wq *wq = rcu_dereference(sk->sk_wq); /* rely on refcount_sub from sock_wfree() */ smp_mb__after_atomic(); if (wq && waitqueue_active(&wq->wait)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } } static void sock_def_destruct(struct sock *sk) { } void sk_send_sigurg(struct sock *sk) { if (sk->sk_socket && sk->sk_socket->file) if (send_sigurg(sk->sk_socket->file)) sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); } EXPORT_SYMBOL(sk_send_sigurg); void sk_reset_timer(struct sock *sk, struct timer_list* timer, unsigned long expires) { if (!mod_timer(timer, expires)) sock_hold(sk); } EXPORT_SYMBOL(sk_reset_timer); void sk_stop_timer(struct sock *sk, struct timer_list* timer) { if (timer_delete(timer)) __sock_put(sk); } EXPORT_SYMBOL(sk_stop_timer); void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) { if (timer_delete_sync(timer)) __sock_put(sk); } EXPORT_SYMBOL(sk_stop_timer_sync); void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) { sk_init_common(sk); sk->sk_send_head = NULL; timer_setup(&sk->sk_timer, NULL, 0); sk->sk_allocation = GFP_KERNEL; sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); sk->sk_state = TCP_CLOSE; sk->sk_use_task_frag = true; sk_set_socket(sk, sock); sock_set_flag(sk, SOCK_ZAPPED); if (sock) { sk->sk_type = sock->type; RCU_INIT_POINTER(sk->sk_wq, &sock->wq); sock->sk = sk; } else { RCU_INIT_POINTER(sk->sk_wq, NULL); } sk->sk_uid = uid; sk->sk_state_change = sock_def_wakeup; sk->sk_data_ready = sock_def_readable; sk->sk_write_space = sock_def_write_space; sk->sk_error_report = sock_def_error_report; sk->sk_destruct = sock_def_destruct; sk->sk_frag.page = NULL; sk->sk_frag.offset = 0; sk->sk_peek_off = -1; sk->sk_peer_pid = NULL; sk->sk_peer_cred = NULL; spin_lock_init(&sk->sk_peer_lock); sk->sk_write_pending = 0; sk->sk_rcvlowat = 1; sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_stamp = SK_DEFAULT_STAMP; #if BITS_PER_LONG==32 seqlock_init(&sk->sk_stamp_seq); #endif atomic_set(&sk->sk_zckey, 0); #ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = 0; sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read); #endif sk->sk_max_pacing_rate = ~0UL; sk->sk_pacing_rate = ~0UL; WRITE_ONCE(sk->sk_pacing_shift, 10); sk->sk_incoming_cpu = -1; sk_rx_queue_clear(sk); /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.rst for details) */ smp_wmb(); refcount_set(&sk->sk_refcnt, 1); sk_drops_reset(sk); } EXPORT_SYMBOL(sock_init_data_uid); void sock_init_data(struct socket *sock, struct sock *sk) { kuid_t uid = sock ? SOCK_INODE(sock)->i_uid : make_kuid(sock_net(sk)->user_ns, 0); sock_init_data_uid(sock, sk, uid); } EXPORT_SYMBOL(sock_init_data); void noinline lock_sock_nested(struct sock *sk, int subclass) { /* The sk_lock has mutex_lock() semantics here. 
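	 * On 64-bit, the slock and the 'owned' flag share one word, so the
	 * uncontended case below is a single try_cmpxchg() instead of taking
	 * and releasing the spinlock.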
*/ mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); might_sleep(); #ifdef CONFIG_64BIT if (sizeof(struct slock_owned) == sizeof(long)) { socket_lock_t tmp = { .slock = __SPIN_LOCK_UNLOCKED(tmp.slock), .owned = 1, }; socket_lock_t old = { .slock = __SPIN_LOCK_UNLOCKED(old.slock), .owned = 0, }; if (likely(try_cmpxchg(&sk->sk_lock.combined, &old.combined, tmp.combined))) return; } #endif spin_lock_bh(&sk->sk_lock.slock); if (unlikely(sock_owned_by_user_nocheck(sk))) __lock_sock(sk); sk->sk_lock.owned = 1; spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL(lock_sock_nested); void release_sock(struct sock *sk) { spin_lock_bh(&sk->sk_lock.slock); if (unlikely(sk->sk_backlog.tail)) __release_sock(sk); if (sk->sk_prot->release_cb) { if (!tcp_release_cb_cond(sk)) sk->sk_prot->release_cb(sk); } sock_release_ownership(sk); if (unlikely(waitqueue_active(&sk->sk_lock.wq))) wake_up(&sk->sk_lock.wq); spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL(release_sock); bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) { might_sleep(); spin_lock_bh(&sk->sk_lock.slock); if (likely(!sock_owned_by_user_nocheck(sk))) { /* * Fast path return with bottom halves disabled and * sock::sk_lock.slock held. * * The 'mutex' is not contended and holding * sock::sk_lock.slock prevents all other lockers to * proceed so the corresponding unlock_sock_fast() can * avoid the slow path of release_sock() completely and * just release slock. * * From a semantical POV this is equivalent to 'acquiring' * the 'mutex', hence the corresponding lockdep * mutex_release() has to happen in the fast path of * unlock_sock_fast(). */ return false; } __lock_sock(sk); sk->sk_lock.owned = 1; __acquire(&sk->sk_lock.slock); spin_unlock_bh(&sk->sk_lock.slock); return true; } EXPORT_SYMBOL(__lock_sock_fast); int sock_gettstamp(struct socket *sock, void __user *userstamp, bool timeval, bool time32) { struct sock *sk = sock->sk; struct timespec64 ts; sock_enable_timestamp(sk, SOCK_TIMESTAMP); ts = ktime_to_timespec64(sock_read_timestamp(sk)); if (ts.tv_sec == -1) return -ENOENT; if (ts.tv_sec == 0) { ktime_t kt = ktime_get_real(); sock_write_timestamp(sk, kt); ts = ktime_to_timespec64(kt); } if (timeval) ts.tv_nsec /= 1000; #ifdef CONFIG_COMPAT_32BIT_TIME if (time32) return put_old_timespec32(&ts, userstamp); #endif #ifdef CONFIG_SPARC64 /* beware of padding in sparc64 timeval */ if (timeval && !in_compat_syscall()) { struct __kernel_old_timeval __user tv = { .tv_sec = ts.tv_sec, .tv_usec = ts.tv_nsec, }; if (copy_to_user(userstamp, &tv, sizeof(tv))) return -EFAULT; return 0; } #endif return put_timespec64(&ts, userstamp); } EXPORT_SYMBOL(sock_gettstamp); void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) { if (!sock_flag(sk, flag)) { unsigned long previous_flags = sk->sk_flags; sock_set_flag(sk, flag); /* * we just set one of the two flags which require net * time stamping, but time stamping might have been on * already because of the other one */ if (sock_needs_netstamp(sk) && !(previous_flags & SK_FLAGS_TIMESTAMP)) net_enable_timestamp(); } } int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type) { struct sock_extended_err ee; struct sk_buff *skb; int copied, err; err = -EAGAIN; skb = sock_dequeue_err_skb(sk); if (skb == NULL) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; sock_recv_timestamp(msg, sk, skb); /* We must use a bounce buffer for 
CONFIG_HARDENED_USERCOPY=y */ ee = SKB_EXT_ERR(skb)->ee; put_cmsg(msg, level, type, sizeof(ee), &ee); msg->msg_flags |= MSG_ERRQUEUE; err = copied; out_free_skb: kfree_skb(skb); out: return err; } EXPORT_SYMBOL(sock_recv_errqueue); /* * Get a socket option on an socket. * * FIX: POSIX 1003.1g is very ambiguous here. It states that * asynchronous errors should be reported by getsockopt. We assume * this means if you specify SO_ERROR (otherwise what is the point of it). */ int sock_common_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_common_getsockopt); int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; return sk->sk_prot->recvmsg(sk, msg, size, flags); } EXPORT_SYMBOL(sock_common_recvmsg); /* * Set socket options on an inet socket. */ int sock_common_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_common_setsockopt); void sk_common_release(struct sock *sk) { if (sk->sk_prot->destroy) sk->sk_prot->destroy(sk); /* * Observation: when sk_common_release is called, processes have * no access to socket. But net still has. * Step one, detach it from networking: * * A. Remove from hash tables. */ sk->sk_prot->unhash(sk); /* * In this point socket cannot receive new packets, but it is possible * that some packets are in flight because some CPU runs receiver and * did hash table lookup before we unhashed socket. They will achieve * receive queue and will be purged by socket destructor. * * Also we still have packets pending on receive queue and probably, * our own packets waiting in device queues. sock_destroy will drain * receive queue, but transmitted packets will delay socket destruction * until the last reference will be released. */ sock_orphan(sk); xfrm_sk_free_policy(sk); sock_put(sk); } EXPORT_SYMBOL(sk_common_release); void sk_get_meminfo(const struct sock *sk, u32 *mem) { memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc); mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); mem[SK_MEMINFO_DROPS] = sk_drops_read(sk); } #ifdef CONFIG_PROC_FS static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); int sock_prot_inuse_get(struct net *net, struct proto *prot) { int cpu, idx = prot->inuse_idx; int res = 0; for_each_possible_cpu(cpu) res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; return res >= 0 ? 
res : 0; } EXPORT_SYMBOL_GPL(sock_prot_inuse_get); int sock_inuse_get(struct net *net) { int cpu, res = 0; for_each_possible_cpu(cpu) res += per_cpu_ptr(net->core.prot_inuse, cpu)->all; return res; } EXPORT_SYMBOL_GPL(sock_inuse_get); static int __net_init sock_inuse_init_net(struct net *net) { net->core.prot_inuse = alloc_percpu(struct prot_inuse); if (net->core.prot_inuse == NULL) return -ENOMEM; return 0; } static void __net_exit sock_inuse_exit_net(struct net *net) { free_percpu(net->core.prot_inuse); } static struct pernet_operations net_inuse_ops = { .init = sock_inuse_init_net, .exit = sock_inuse_exit_net, }; static __init int net_inuse_init(void) { if (register_pernet_subsys(&net_inuse_ops)) panic("Cannot initialize net inuse counters"); return 0; } core_initcall(net_inuse_init); static int assign_proto_idx(struct proto *prot) { prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) { pr_err("PROTO_INUSE_NR exhausted\n"); return -ENOSPC; } set_bit(prot->inuse_idx, proto_inuse_idx); return 0; } static void release_proto_idx(struct proto *prot) { if (prot->inuse_idx != PROTO_INUSE_NR) clear_bit(prot->inuse_idx, proto_inuse_idx); } #else static inline int assign_proto_idx(struct proto *prot) { return 0; } static inline void release_proto_idx(struct proto *prot) { } #endif static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) { if (!twsk_prot) return; kfree(twsk_prot->twsk_slab_name); twsk_prot->twsk_slab_name = NULL; kmem_cache_destroy(twsk_prot->twsk_slab); twsk_prot->twsk_slab = NULL; } static int tw_prot_init(const struct proto *prot) { struct timewait_sock_ops *twsk_prot = prot->twsk_prot; if (!twsk_prot) return 0; twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); if (!twsk_prot->twsk_slab_name) return -ENOMEM; twsk_prot->twsk_slab = kmem_cache_create(twsk_prot->twsk_slab_name, twsk_prot->twsk_obj_size, 0, SLAB_ACCOUNT | prot->slab_flags, NULL); if (!twsk_prot->twsk_slab) { pr_crit("%s: Can't create timewait sock SLAB cache!\n", prot->name); return -ENOMEM; } return 0; } static void req_prot_cleanup(struct request_sock_ops *rsk_prot) { if (!rsk_prot) return; kfree(rsk_prot->slab_name); rsk_prot->slab_name = NULL; kmem_cache_destroy(rsk_prot->slab); rsk_prot->slab = NULL; } static int req_prot_init(const struct proto *prot) { struct request_sock_ops *rsk_prot = prot->rsk_prot; if (!rsk_prot) return 0; rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name); if (!rsk_prot->slab_name) return -ENOMEM; rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, rsk_prot->obj_size, 0, SLAB_ACCOUNT | prot->slab_flags, NULL); if (!rsk_prot->slab) { pr_crit("%s: Can't create request sock SLAB cache!\n", prot->name); return -ENOMEM; } return 0; } int proto_register(struct proto *prot, int alloc_slab) { int ret = -ENOBUFS; if (prot->memory_allocated && !prot->sysctl_mem) { pr_err("%s: missing sysctl_mem\n", prot->name); return -EINVAL; } if (prot->memory_allocated && !prot->per_cpu_fw_alloc) { pr_err("%s: missing per_cpu_fw_alloc\n", prot->name); return -EINVAL; } if (alloc_slab) { struct kmem_cache_args args = { .useroffset = prot->useroffset, .usersize = prot->usersize, .freeptr_offset = prot->freeptr_offset, .use_freeptr_offset = !!prot->freeptr_offset, }; prot->slab = kmem_cache_create(prot->name, prot->obj_size, &args, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | prot->slab_flags); if (prot->slab == NULL) { pr_crit("%s: Can't create sock SLAB cache!\n", prot->name); goto out; } 
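/*
 * The request_sock and timewait_sock caches created below are optional
 * per-protocol companions to the main slab; if either init step fails,
 * the labels at the end of this function unwind in reverse creation
 * order (timewait cache first, then the request cache and the main slab).
 */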
if (req_prot_init(prot)) goto out_free_request_sock_slab; if (tw_prot_init(prot)) goto out_free_timewait_sock_slab; } mutex_lock(&proto_list_mutex); ret = assign_proto_idx(prot); if (ret) { mutex_unlock(&proto_list_mutex); goto out_free_timewait_sock_slab; } list_add(&prot->node, &proto_list); mutex_unlock(&proto_list_mutex); return ret; out_free_timewait_sock_slab: if (alloc_slab) tw_prot_cleanup(prot->twsk_prot); out_free_request_sock_slab: if (alloc_slab) { req_prot_cleanup(prot->rsk_prot); kmem_cache_destroy(prot->slab); prot->slab = NULL; } out: return ret; } EXPORT_SYMBOL(proto_register); void proto_unregister(struct proto *prot) { mutex_lock(&proto_list_mutex); release_proto_idx(prot); list_del(&prot->node); mutex_unlock(&proto_list_mutex); kmem_cache_destroy(prot->slab); prot->slab = NULL; req_prot_cleanup(prot->rsk_prot); tw_prot_cleanup(prot->twsk_prot); } EXPORT_SYMBOL(proto_unregister); int sock_load_diag_module(int family, int protocol) { if (!protocol) { if (!sock_is_registered(family)) return -ENOENT; return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, NETLINK_SOCK_DIAG, family); } #ifdef CONFIG_INET if (family == AF_INET && protocol != IPPROTO_RAW && protocol < MAX_INET_PROTOS && !rcu_access_pointer(inet_protos[protocol])) return -ENOENT; #endif return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, NETLINK_SOCK_DIAG, family, protocol); } EXPORT_SYMBOL(sock_load_diag_module); #ifdef CONFIG_PROC_FS static void *proto_seq_start(struct seq_file *seq, loff_t *pos) __acquires(proto_list_mutex) { mutex_lock(&proto_list_mutex); return seq_list_start_head(&proto_list, *pos); } static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_list_next(v, &proto_list, pos); } static void proto_seq_stop(struct seq_file *seq, void *v) __releases(proto_list_mutex) { mutex_unlock(&proto_list_mutex); } static char proto_method_implemented(const void *method) { return method == NULL ? 'n' : 'y'; } static long sock_prot_memory_allocated(struct proto *proto) { return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; } static const char *sock_prot_memory_pressure(struct proto *proto) { return proto->memory_pressure != NULL ? proto_memory_pressure(proto) ? "yes" : "no" : "NI"; } static void proto_seq_printf(struct seq_file *seq, struct proto *proto) { seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", proto->name, proto->obj_size, sock_prot_inuse_get(seq_file_net(seq), proto), sock_prot_memory_allocated(proto), sock_prot_memory_pressure(proto), proto->max_header, proto->slab == NULL ? 
"no" : "yes", module_name(proto->owner), proto_method_implemented(proto->close), proto_method_implemented(proto->connect), proto_method_implemented(proto->disconnect), proto_method_implemented(proto->accept), proto_method_implemented(proto->ioctl), proto_method_implemented(proto->init), proto_method_implemented(proto->destroy), proto_method_implemented(proto->shutdown), proto_method_implemented(proto->setsockopt), proto_method_implemented(proto->getsockopt), proto_method_implemented(proto->sendmsg), proto_method_implemented(proto->recvmsg), proto_method_implemented(proto->bind), proto_method_implemented(proto->backlog_rcv), proto_method_implemented(proto->hash), proto_method_implemented(proto->unhash), proto_method_implemented(proto->get_port), proto_method_implemented(proto->enter_memory_pressure)); } static int proto_seq_show(struct seq_file *seq, void *v) { if (v == &proto_list) seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", "protocol", "size", "sockets", "memory", "press", "maxhdr", "slab", "module", "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n"); else proto_seq_printf(seq, list_entry(v, struct proto, node)); return 0; } static const struct seq_operations proto_seq_ops = { .start = proto_seq_start, .next = proto_seq_next, .stop = proto_seq_stop, .show = proto_seq_show, }; static __net_init int proto_init_net(struct net *net) { if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, sizeof(struct seq_net_private))) return -ENOMEM; return 0; } static __net_exit void proto_exit_net(struct net *net) { remove_proc_entry("protocols", net->proc_net); } static __net_initdata struct pernet_operations proto_net_ops = { .init = proto_init_net, .exit = proto_exit_net, }; static int __init proto_init(void) { return register_pernet_subsys(&proto_net_ops); } subsys_initcall(proto_init); #endif /* PROC_FS */ #ifdef CONFIG_NET_RX_BUSY_POLL bool sk_busy_loop_end(void *p, unsigned long start_time) { struct sock *sk = p; if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) return true; if (sk_is_udp(sk) && !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)) return true; return sk_busy_loop_timeout(sk, start_time); } EXPORT_SYMBOL(sk_busy_loop_end); #endif /* CONFIG_NET_RX_BUSY_POLL */ int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len) { if (!sk->sk_prot->bind_add) return -EOPNOTSUPP; return sk->sk_prot->bind_add(sk, addr, addr_len); } EXPORT_SYMBOL(sock_bind_add); /* Copy 'size' bytes from userspace and return `size` back to userspace */ int sock_ioctl_inout(struct sock *sk, unsigned int cmd, void __user *arg, void *karg, size_t size) { int ret; if (copy_from_user(karg, arg, size)) return -EFAULT; ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg); if (ret) return ret; if (copy_to_user(arg, karg, size)) return -EFAULT; return 0; } EXPORT_SYMBOL(sock_ioctl_inout); /* This is the most common ioctl prep function, where the result (4 bytes) is * copied back to userspace if the ioctl() returns successfully. No input is * copied from userspace as input argument. */ static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg) { int ret, karg = 0; ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg); if (ret) return ret; return put_user(karg, (int __user *)arg); } /* A wrapper around sock ioctls, which copies the data from userspace * (depending on the protocol/ioctl), and copies back the result to userspace. 
* The main motivation for this function is to pass kernel memory to the * protocol ioctl callbacks, instead of userspace memory. */ int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { int rc = 1; if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET) rc = ipmr_sk_ioctl(sk, cmd, arg); else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6) rc = ip6mr_sk_ioctl(sk, cmd, arg); else if (sk_is_phonet(sk)) rc = phonet_sk_ioctl(sk, cmd, arg); /* If ioctl was processed, returns its value */ if (rc <= 0) return rc; /* Otherwise call the default handler */ return sock_ioctl_out(sk, cmd, arg); } EXPORT_SYMBOL(sk_ioctl); static int __init sock_struct_check(void) { CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket); #ifdef CONFIG_MEMCG CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); #endif CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_err_soft); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_pending_confirm); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_status); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo); 
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_uid); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_protocol); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndbuf); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag); return 0; } core_initcall(sock_struct_check);
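/*
 * Illustrative sketch (not part of sock.c): proto_init_net() above registers
 * the "protocols" seq file under net->proc_net, so the table printed by
 * proto_seq_show() is visible to user space as /proc/net/protocols. A minimal
 * reader, assuming only standard C, could look like this:
 */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* seq file created per network namespace by proto_init_net() */
	FILE *f = fopen("/proc/net/protocols", "r");
	char line[512];

	if (!f) {
		perror("fopen /proc/net/protocols");
		return EXIT_FAILURE;
	}
	/* Each line follows the format emitted by proto_seq_printf()/proto_seq_show(). */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return EXIT_SUCCESS;
}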
// SPDX-License-Identifier: GPL-2.0+ /*****************************************************************************/ /* * devio.c -- User space communication with USB devices. * * Copyright (C) 1999-2000 Thomas Sailer (sailer@ife.ee.ethz.ch) * * This file implements the usbfs/x/y files, where * x is the bus number and y the device number. * * It allows user space programs/"drivers" to communicate directly * with USB devices without intervening kernel driver.
* * Revision history * 22.12.1999 0.1 Initial release (split from proc_usb.c) * 04.01.2000 0.2 Turned into its own filesystem * 30.09.2005 0.3 Fix user-triggerable oops in async URB delivery * (CAN-2005-3055) */ /*****************************************************************************/ #include <linux/fs.h> #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/signal.h> #include <linux/poll.h> #include <linux/module.h> #include <linux/string.h> #include <linux/usb.h> #include <linux/usbdevice_fs.h> #include <linux/usb/hcd.h> /* for usbcore internals */ #include <linux/usb/quirks.h> #include <linux/cdev.h> #include <linux/notifier.h> #include <linux/security.h> #include <linux/user_namespace.h> #include <linux/scatterlist.h> #include <linux/uaccess.h> #include <linux/dma-mapping.h> #include <asm/byteorder.h> #include <linux/moduleparam.h> #include "usb.h" #ifdef CONFIG_PM #define MAYBE_CAP_SUSPEND USBDEVFS_CAP_SUSPEND #else #define MAYBE_CAP_SUSPEND 0 #endif #define USB_MAXBUS 64 #define USB_DEVICE_MAX (USB_MAXBUS * 128) #define USB_SG_SIZE 16384 /* split-size for large txs */ /* Mutual exclusion for ps->list in resume vs. release and remove */ static DEFINE_MUTEX(usbfs_mutex); struct usb_dev_state { struct list_head list; /* state list */ struct usb_device *dev; struct file *file; spinlock_t lock; /* protects the async urb lists */ struct list_head async_pending; struct list_head async_completed; struct list_head memory_list; wait_queue_head_t wait; /* wake up if a request completed */ wait_queue_head_t wait_for_resume; /* wake up upon runtime resume */ unsigned int discsignr; struct pid *disc_pid; const struct cred *cred; sigval_t disccontext; unsigned long ifclaimed; u32 disabled_bulk_eps; unsigned long interface_allowed_mask; int not_yet_resumed; bool suspend_allowed; bool privileges_dropped; }; struct usb_memory { struct list_head memlist; int vma_use_count; int urb_use_count; u32 size; void *mem; dma_addr_t dma_handle; unsigned long vm_start; struct usb_dev_state *ps; }; struct async { struct list_head asynclist; struct usb_dev_state *ps; struct pid *pid; const struct cred *cred; unsigned int signr; unsigned int ifnum; void __user *userbuffer; void __user *userurb; sigval_t userurb_sigval; struct urb *urb; struct usb_memory *usbm; unsigned int mem_usage; int status; u8 bulk_addr; u8 bulk_status; }; static bool usbfs_snoop; module_param(usbfs_snoop, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(usbfs_snoop, "true to log all usbfs traffic"); static unsigned usbfs_snoop_max = 65536; module_param(usbfs_snoop_max, uint, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(usbfs_snoop_max, "maximum number of bytes to print while snooping"); #define snoop(dev, format, arg...) 
\ do { \ if (usbfs_snoop) \ dev_info(dev, format, ## arg); \ } while (0) enum snoop_when { SUBMIT, COMPLETE }; #define USB_DEVICE_DEV MKDEV(USB_DEVICE_MAJOR, 0) /* Limit on the total amount of memory we can allocate for transfers */ static u32 usbfs_memory_mb = 16; module_param(usbfs_memory_mb, uint, 0644); MODULE_PARM_DESC(usbfs_memory_mb, "maximum MB allowed for usbfs buffers (0 = no limit)"); /* Hard limit, necessary to avoid arithmetic overflow */ #define USBFS_XFER_MAX (UINT_MAX / 2 - 1000000) static DEFINE_SPINLOCK(usbfs_memory_usage_lock); static u64 usbfs_memory_usage; /* Total memory currently allocated */ /* Check whether it's okay to allocate more memory for a transfer */ static int usbfs_increase_memory_usage(u64 amount) { u64 lim, total_mem; unsigned long flags; int ret; lim = READ_ONCE(usbfs_memory_mb); lim <<= 20; ret = 0; spin_lock_irqsave(&usbfs_memory_usage_lock, flags); total_mem = usbfs_memory_usage + amount; if (lim > 0 && total_mem > lim) ret = -ENOMEM; else usbfs_memory_usage = total_mem; spin_unlock_irqrestore(&usbfs_memory_usage_lock, flags); return ret; } /* Memory for a transfer is being deallocated */ static void usbfs_decrease_memory_usage(u64 amount) { unsigned long flags; spin_lock_irqsave(&usbfs_memory_usage_lock, flags); if (amount > usbfs_memory_usage) usbfs_memory_usage = 0; else usbfs_memory_usage -= amount; spin_unlock_irqrestore(&usbfs_memory_usage_lock, flags); } static int connected(struct usb_dev_state *ps) { return (!list_empty(&ps->list) && ps->dev->state != USB_STATE_NOTATTACHED); } static void dec_usb_memory_use_count(struct usb_memory *usbm, int *count) { struct usb_dev_state *ps = usbm->ps; struct usb_hcd *hcd = bus_to_hcd(ps->dev->bus); unsigned long flags; spin_lock_irqsave(&ps->lock, flags); --*count; if (usbm->urb_use_count == 0 && usbm->vma_use_count == 0) { list_del(&usbm->memlist); spin_unlock_irqrestore(&ps->lock, flags); hcd_buffer_free_pages(hcd, usbm->size, usbm->mem, usbm->dma_handle); usbfs_decrease_memory_usage( usbm->size + sizeof(struct usb_memory)); kfree(usbm); } else { spin_unlock_irqrestore(&ps->lock, flags); } } static void usbdev_vm_open(struct vm_area_struct *vma) { struct usb_memory *usbm = vma->vm_private_data; unsigned long flags; spin_lock_irqsave(&usbm->ps->lock, flags); ++usbm->vma_use_count; spin_unlock_irqrestore(&usbm->ps->lock, flags); } static void usbdev_vm_close(struct vm_area_struct *vma) { struct usb_memory *usbm = vma->vm_private_data; dec_usb_memory_use_count(usbm, &usbm->vma_use_count); } static const struct vm_operations_struct usbdev_vm_ops = { .open = usbdev_vm_open, .close = usbdev_vm_close }; static int usbdev_mmap(struct file *file, struct vm_area_struct *vma) { struct usb_memory *usbm = NULL; struct usb_dev_state *ps = file->private_data; struct usb_hcd *hcd = bus_to_hcd(ps->dev->bus); size_t size = vma->vm_end - vma->vm_start; void *mem; unsigned long flags; dma_addr_t dma_handle = DMA_MAPPING_ERROR; int ret; if (!(file->f_mode & FMODE_WRITE)) return -EPERM; ret = usbfs_increase_memory_usage(size + sizeof(struct usb_memory)); if (ret) goto error; usbm = kzalloc_obj(struct usb_memory); if (!usbm) { ret = -ENOMEM; goto error_decrease_mem; } mem = hcd_buffer_alloc_pages(hcd, size, GFP_USER | __GFP_NOWARN, &dma_handle); if (!mem) { ret = -ENOMEM; goto error_free_usbm; } memset(mem, 0, size); usbm->mem = mem; usbm->dma_handle = dma_handle; usbm->size = size; usbm->ps = ps; usbm->vm_start = vma->vm_start; usbm->vma_use_count = 1; INIT_LIST_HEAD(&usbm->memlist); /* * In DMA-unavailable cases, 
hcd_buffer_alloc_pages allocates * normal pages and assigns DMA_MAPPING_ERROR to dma_handle. Check * whether we are in such cases, and then use remap_pfn_range (or * dma_mmap_coherent) to map normal (or DMA) pages into the user * space, respectively. */ if (dma_handle == DMA_MAPPING_ERROR) { if (remap_pfn_range(vma, vma->vm_start, virt_to_phys(usbm->mem) >> PAGE_SHIFT, size, vma->vm_page_prot) < 0) { dec_usb_memory_use_count(usbm, &usbm->vma_use_count); return -EAGAIN; } } else { if (dma_mmap_coherent(hcd->self.sysdev, vma, mem, dma_handle, size)) { dec_usb_memory_use_count(usbm, &usbm->vma_use_count); return -EAGAIN; } } vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &usbdev_vm_ops; vma->vm_private_data = usbm; spin_lock_irqsave(&ps->lock, flags); list_add_tail(&usbm->memlist, &ps->memory_list); spin_unlock_irqrestore(&ps->lock, flags); return 0; error_free_usbm: kfree(usbm); error_decrease_mem: usbfs_decrease_memory_usage(size + sizeof(struct usb_memory)); error: return ret; } static ssize_t usbdev_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { struct usb_dev_state *ps = file->private_data; struct usb_device *dev = ps->dev; ssize_t ret = 0; unsigned len; loff_t pos; int i; pos = *ppos; usb_lock_device(dev); if (!connected(ps)) { ret = -ENODEV; goto err; } else if (pos < 0) { ret = -EINVAL; goto err; } if (pos < sizeof(struct usb_device_descriptor)) { /* 18 bytes - fits on the stack */ struct usb_device_descriptor temp_desc; memcpy(&temp_desc, &dev->descriptor, sizeof(dev->descriptor)); le16_to_cpus(&temp_desc.bcdUSB); le16_to_cpus(&temp_desc.idVendor); le16_to_cpus(&temp_desc.idProduct); le16_to_cpus(&temp_desc.bcdDevice); len = sizeof(struct usb_device_descriptor) - pos; if (len > nbytes) len = nbytes; if (copy_to_user(buf, ((char *)&temp_desc) + pos, len)) { ret = -EFAULT; goto err; } *ppos += len; buf += len; nbytes -= len; ret += len; } pos = sizeof(struct usb_device_descriptor); for (i = 0; nbytes && i < dev->descriptor.bNumConfigurations; i++) { struct usb_config_descriptor *config = (struct usb_config_descriptor *)dev->rawdescriptors[i]; unsigned int length = le16_to_cpu(config->wTotalLength); if (*ppos < pos + length) { /* The descriptor may claim to be longer than it * really is. Here is the actual allocated length. 
*/ unsigned alloclen = le16_to_cpu(dev->config[i].desc.wTotalLength); len = length - (*ppos - pos); if (len > nbytes) len = nbytes; /* Simply don't write (skip over) unallocated parts */ if (alloclen > (*ppos - pos)) { alloclen -= (*ppos - pos); if (copy_to_user(buf, dev->rawdescriptors[i] + (*ppos - pos), min(len, alloclen))) { ret = -EFAULT; goto err; } } *ppos += len; buf += len; nbytes -= len; ret += len; } pos += length; } err: usb_unlock_device(dev); return ret; } /* * async list handling */ static struct async *alloc_async(unsigned int numisoframes) { struct async *as; as = kzalloc_obj(struct async); if (!as) return NULL; as->urb = usb_alloc_urb(numisoframes, GFP_KERNEL); if (!as->urb) { kfree(as); return NULL; } return as; } static void free_async(struct async *as) { int i; put_pid(as->pid); if (as->cred) put_cred(as->cred); for (i = 0; i < as->urb->num_sgs; i++) { if (sg_page(&as->urb->sg[i])) kfree(sg_virt(&as->urb->sg[i])); } kfree(as->urb->sg); if (as->usbm == NULL) kfree(as->urb->transfer_buffer); else dec_usb_memory_use_count(as->usbm, &as->usbm->urb_use_count); kfree(as->urb->setup_packet); usb_free_urb(as->urb); usbfs_decrease_memory_usage(as->mem_usage); kfree(as); } static void async_newpending(struct async *as) { struct usb_dev_state *ps = as->ps; unsigned long flags; spin_lock_irqsave(&ps->lock, flags); list_add_tail(&as->asynclist, &ps->async_pending); spin_unlock_irqrestore(&ps->lock, flags); } static void async_removepending(struct async *as) { struct usb_dev_state *ps = as->ps; unsigned long flags; spin_lock_irqsave(&ps->lock, flags); list_del_init(&as->asynclist); spin_unlock_irqrestore(&ps->lock, flags); } static struct async *async_getcompleted(struct usb_dev_state *ps) { unsigned long flags; struct async *as = NULL; spin_lock_irqsave(&ps->lock, flags); if (!list_empty(&ps->async_completed)) { as = list_entry(ps->async_completed.next, struct async, asynclist); list_del_init(&as->asynclist); } spin_unlock_irqrestore(&ps->lock, flags); return as; } static struct async *async_getpending(struct usb_dev_state *ps, void __user *userurb) { struct async *as; list_for_each_entry(as, &ps->async_pending, asynclist) if (as->userurb == userurb) { list_del_init(&as->asynclist); return as; } return NULL; } static void snoop_urb(struct usb_device *udev, void __user *userurb, int pipe, unsigned length, int timeout_or_status, enum snoop_when when, unsigned char *data, unsigned data_len) { static const char *types[] = {"isoc", "int", "ctrl", "bulk"}; static const char *dirs[] = {"out", "in"}; int ep; const char *t, *d; if (!usbfs_snoop) return; ep = usb_pipeendpoint(pipe); t = types[usb_pipetype(pipe)]; d = dirs[!!usb_pipein(pipe)]; if (userurb) { /* Async */ if (when == SUBMIT) dev_info(&udev->dev, "userurb %px, ep%d %s-%s, " "length %u\n", userurb, ep, t, d, length); else dev_info(&udev->dev, "userurb %px, ep%d %s-%s, " "actual_length %u status %d\n", userurb, ep, t, d, length, timeout_or_status); } else { if (when == SUBMIT) dev_info(&udev->dev, "ep%d %s-%s, length %u, " "timeout %d\n", ep, t, d, length, timeout_or_status); else dev_info(&udev->dev, "ep%d %s-%s, actual_length %u, " "status %d\n", ep, t, d, length, timeout_or_status); } data_len = min(data_len, usbfs_snoop_max); if (data && data_len > 0) { print_hex_dump(KERN_DEBUG, "data: ", DUMP_PREFIX_NONE, 32, 1, data, data_len, 1); } } static void snoop_urb_data(struct urb *urb, unsigned len) { int i, size; len = min(len, usbfs_snoop_max); if (!usbfs_snoop || len == 0) return; if (urb->num_sgs == 0) { 
print_hex_dump(KERN_DEBUG, "data: ", DUMP_PREFIX_NONE, 32, 1, urb->transfer_buffer, len, 1); return; } for (i = 0; i < urb->num_sgs && len; i++) { size = (len > USB_SG_SIZE) ? USB_SG_SIZE : len; print_hex_dump(KERN_DEBUG, "data: ", DUMP_PREFIX_NONE, 32, 1, sg_virt(&urb->sg[i]), size, 1); len -= size; } } static int copy_urb_data_to_user(u8 __user *userbuffer, struct urb *urb) { unsigned i, len, size; if (urb->number_of_packets > 0) /* Isochronous */ len = urb->transfer_buffer_length; else /* Non-Isoc */ len = urb->actual_length; if (urb->num_sgs == 0) { if (copy_to_user(userbuffer, urb->transfer_buffer, len)) return -EFAULT; return 0; } for (i = 0; i < urb->num_sgs && len; i++) { size = (len > USB_SG_SIZE) ? USB_SG_SIZE : len; if (copy_to_user(userbuffer, sg_virt(&urb->sg[i]), size)) return -EFAULT; userbuffer += size; len -= size; } return 0; } #define AS_CONTINUATION 1 #define AS_UNLINK 2 static void cancel_bulk_urbs(struct usb_dev_state *ps, unsigned bulk_addr) __releases(ps->lock) __acquires(ps->lock) { struct urb *urb; struct async *as; /* Mark all the pending URBs that match bulk_addr, up to but not * including the first one without AS_CONTINUATION. If such an * URB is encountered then a new transfer has already started so * the endpoint doesn't need to be disabled; otherwise it does. */ list_for_each_entry(as, &ps->async_pending, asynclist) { if (as->bulk_addr == bulk_addr) { if (as->bulk_status != AS_CONTINUATION) goto rescan; as->bulk_status = AS_UNLINK; as->bulk_addr = 0; } } ps->disabled_bulk_eps |= (1 << bulk_addr); /* Now carefully unlink all the marked pending URBs */ rescan: list_for_each_entry_reverse(as, &ps->async_pending, asynclist) { if (as->bulk_status == AS_UNLINK) { as->bulk_status = 0; /* Only once */ urb = as->urb; usb_get_urb(urb); spin_unlock(&ps->lock); /* Allow completions */ usb_unlink_urb(urb); usb_put_urb(urb); spin_lock(&ps->lock); goto rescan; } } } static void async_completed(struct urb *urb) { struct async *as = urb->context; struct usb_dev_state *ps = as->ps; struct pid *pid = NULL; const struct cred *cred = NULL; unsigned long flags; sigval_t addr; int signr, errno; spin_lock_irqsave(&ps->lock, flags); list_move_tail(&as->asynclist, &ps->async_completed); as->status = urb->status; signr = as->signr; if (signr) { errno = as->status; addr = as->userurb_sigval; pid = get_pid(as->pid); cred = get_cred(as->cred); } snoop(&urb->dev->dev, "urb complete\n"); snoop_urb(urb->dev, as->userurb, urb->pipe, urb->actual_length, as->status, COMPLETE, NULL, 0); if (usb_urb_dir_in(urb)) snoop_urb_data(urb, urb->actual_length); if (as->status < 0 && as->bulk_addr && as->status != -ECONNRESET && as->status != -ENOENT) cancel_bulk_urbs(ps, as->bulk_addr); wake_up(&ps->wait); spin_unlock_irqrestore(&ps->lock, flags); if (signr) { kill_pid_usb_asyncio(signr, errno, addr, pid, cred); put_pid(pid); put_cred(cred); } } static void destroy_async(struct usb_dev_state *ps, struct list_head *list) { struct urb *urb; struct async *as; unsigned long flags; spin_lock_irqsave(&ps->lock, flags); while (!list_empty(list)) { as = list_last_entry(list, struct async, asynclist); list_del_init(&as->asynclist); urb = as->urb; usb_get_urb(urb); /* drop the spinlock so the completion handler can run */ spin_unlock_irqrestore(&ps->lock, flags); usb_kill_urb(urb); usb_put_urb(urb); spin_lock_irqsave(&ps->lock, flags); } spin_unlock_irqrestore(&ps->lock, flags); } static void destroy_async_on_interface(struct usb_dev_state *ps, unsigned int ifnum) { struct list_head *p, *q, hitlist; unsigned long 
flags; INIT_LIST_HEAD(&hitlist); spin_lock_irqsave(&ps->lock, flags); list_for_each_safe(p, q, &ps->async_pending) if (ifnum == list_entry(p, struct async, asynclist)->ifnum) list_move_tail(p, &hitlist); spin_unlock_irqrestore(&ps->lock, flags); destroy_async(ps, &hitlist); } static void destroy_all_async(struct usb_dev_state *ps) { destroy_async(ps, &ps->async_pending); } /* * interface claims are made only at the request of user level code, * which can also release them (explicitly or by closing files). * they're also undone when devices disconnect. */ static int driver_probe(struct usb_interface *intf, const struct usb_device_id *id) { return -ENODEV; } static void driver_disconnect(struct usb_interface *intf) { struct usb_dev_state *ps = usb_get_intfdata(intf); unsigned int ifnum = intf->altsetting->desc.bInterfaceNumber; if (!ps) return; /* NOTE: this relies on usbcore having canceled and completed * all pending I/O requests; 2.6 does that. */ if (likely(ifnum < 8*sizeof(ps->ifclaimed))) clear_bit(ifnum, &ps->ifclaimed); else dev_warn(&intf->dev, "interface number %u out of range\n", ifnum); usb_set_intfdata(intf, NULL); /* force async requests to complete */ destroy_async_on_interface(ps, ifnum); } /* We don't care about suspend/resume of claimed interfaces */ static int driver_suspend(struct usb_interface *intf, pm_message_t msg) { return 0; } static int driver_resume(struct usb_interface *intf) { return 0; } #ifdef CONFIG_PM /* The following routines apply to the entire device, not interfaces */ void usbfs_notify_suspend(struct usb_device *udev) { /* We don't need to handle this */ } void usbfs_notify_resume(struct usb_device *udev) { struct usb_dev_state *ps; /* Protect against simultaneous remove or release */ mutex_lock(&usbfs_mutex); list_for_each_entry(ps, &udev->filelist, list) { WRITE_ONCE(ps->not_yet_resumed, 0); wake_up_all(&ps->wait_for_resume); } mutex_unlock(&usbfs_mutex); } #endif struct usb_driver usbfs_driver = { .name = "usbfs", .probe = driver_probe, .disconnect = driver_disconnect, .suspend = driver_suspend, .resume = driver_resume, .supports_autosuspend = 1, }; static int claimintf(struct usb_dev_state *ps, unsigned int ifnum) { struct usb_device *dev = ps->dev; struct usb_interface *intf; int err; if (ifnum >= 8*sizeof(ps->ifclaimed)) return -EINVAL; /* already claimed */ if (test_bit(ifnum, &ps->ifclaimed)) return 0; if (ps->privileges_dropped && !test_bit(ifnum, &ps->interface_allowed_mask)) return -EACCES; intf = usb_ifnum_to_if(dev, ifnum); if (!intf) err = -ENOENT; else { unsigned int old_suppress; /* suppress uevents while claiming interface */ old_suppress = dev_get_uevent_suppress(&intf->dev); dev_set_uevent_suppress(&intf->dev, 1); err = usb_driver_claim_interface(&usbfs_driver, intf, ps); dev_set_uevent_suppress(&intf->dev, old_suppress); } if (err == 0) set_bit(ifnum, &ps->ifclaimed); return err; } static int releaseintf(struct usb_dev_state *ps, unsigned int ifnum) { struct usb_device *dev; struct usb_interface *intf; int err; err = -EINVAL; if (ifnum >= 8*sizeof(ps->ifclaimed)) return err; dev = ps->dev; intf = usb_ifnum_to_if(dev, ifnum); if (!intf) err = -ENOENT; else if (test_and_clear_bit(ifnum, &ps->ifclaimed)) { unsigned int old_suppress; /* suppress uevents while releasing interface */ old_suppress = dev_get_uevent_suppress(&intf->dev); dev_set_uevent_suppress(&intf->dev, 1); usb_driver_release_interface(&usbfs_driver, intf); dev_set_uevent_suppress(&intf->dev, old_suppress); err = 0; } return err; } static int checkintf(struct usb_dev_state 
*ps, unsigned int ifnum) { if (ps->dev->state != USB_STATE_CONFIGURED) return -EHOSTUNREACH; if (ifnum >= 8*sizeof(ps->ifclaimed)) return -EINVAL; if (test_bit(ifnum, &ps->ifclaimed)) return 0; /* if not yet claimed, claim it for the driver */ dev_warn(&ps->dev->dev, "usbfs: process %d (%s) did not claim " "interface %u before use\n", task_pid_nr(current), current->comm, ifnum); return claimintf(ps, ifnum); } static int findintfep(struct usb_device *dev, unsigned int ep) { unsigned int i, j, e; struct usb_interface *intf; struct usb_host_interface *alts; struct usb_endpoint_descriptor *endpt; if (ep & ~(USB_DIR_IN|0xf)) return -EINVAL; if (!dev->actconfig) return -ESRCH; for (i = 0; i < dev->actconfig->desc.bNumInterfaces; i++) { intf = dev->actconfig->interface[i]; for (j = 0; j < intf->num_altsetting; j++) { alts = &intf->altsetting[j]; for (e = 0; e < alts->desc.bNumEndpoints; e++) { endpt = &alts->endpoint[e].desc; if (endpt->bEndpointAddress == ep) return alts->desc.bInterfaceNumber; } } } return -ENOENT; } static int check_ctrlrecip(struct usb_dev_state *ps, unsigned int requesttype, unsigned int request, unsigned int index) { int ret = 0; struct usb_host_interface *alt_setting; if (ps->dev->state != USB_STATE_UNAUTHENTICATED && ps->dev->state != USB_STATE_ADDRESS && ps->dev->state != USB_STATE_CONFIGURED) return -EHOSTUNREACH; if (USB_TYPE_VENDOR == (USB_TYPE_MASK & requesttype)) return 0; /* * check for the special corner case 'get_device_id' in the printer * class specification, which we always want to allow as it is used * to query things like ink level, etc. */ if (requesttype == 0xa1 && request == 0) { alt_setting = usb_find_alt_setting(ps->dev->actconfig, index >> 8, index & 0xff); if (alt_setting && alt_setting->desc.bInterfaceClass == USB_CLASS_PRINTER) return 0; } index &= 0xff; switch (requesttype & USB_RECIP_MASK) { case USB_RECIP_ENDPOINT: if ((index & ~USB_DIR_IN) == 0) return 0; ret = findintfep(ps->dev, index); if (ret < 0) { /* * Some not fully compliant Win apps seem to get * index wrong and have the endpoint number here * rather than the endpoint address (with the * correct direction). Win does let this through, * so we'll not reject it here but leave it to * the device to not break KVM. But we warn. 
*/ ret = findintfep(ps->dev, index ^ 0x80); if (ret >= 0) dev_info(&ps->dev->dev, "%s: process %i (%s) requesting ep %02x but needs %02x\n", __func__, task_pid_nr(current), current->comm, index, index ^ 0x80); } if (ret >= 0) ret = checkintf(ps, ret); break; case USB_RECIP_INTERFACE: ret = checkintf(ps, index); break; } return ret; } static struct usb_host_endpoint *ep_to_host_endpoint(struct usb_device *dev, unsigned char ep) { if (ep & USB_ENDPOINT_DIR_MASK) return dev->ep_in[ep & USB_ENDPOINT_NUMBER_MASK]; else return dev->ep_out[ep & USB_ENDPOINT_NUMBER_MASK]; } static int parse_usbdevfs_streams(struct usb_dev_state *ps, struct usbdevfs_streams __user *streams, unsigned int *num_streams_ret, unsigned int *num_eps_ret, struct usb_host_endpoint ***eps_ret, struct usb_interface **intf_ret) { unsigned int i, num_streams, num_eps; struct usb_host_endpoint **eps; struct usb_interface *intf = NULL; unsigned char ep; int ifnum, ret; if (get_user(num_streams, &streams->num_streams) || get_user(num_eps, &streams->num_eps)) return -EFAULT; if (num_eps < 1 || num_eps > USB_MAXENDPOINTS) return -EINVAL; /* The XHCI controller allows max 2 ^ 16 streams */ if (num_streams_ret && (num_streams < 2 || num_streams > 65536)) return -EINVAL; eps = kmalloc_objs(*eps, num_eps); if (!eps) return -ENOMEM; for (i = 0; i < num_eps; i++) { if (get_user(ep, &streams->eps[i])) { ret = -EFAULT; goto error; } eps[i] = ep_to_host_endpoint(ps->dev, ep); if (!eps[i]) { ret = -EINVAL; goto error; } /* usb_alloc/free_streams operate on an usb_interface */ ifnum = findintfep(ps->dev, ep); if (ifnum < 0) { ret = ifnum; goto error; } if (i == 0) { ret = checkintf(ps, ifnum); if (ret < 0) goto error; intf = usb_ifnum_to_if(ps->dev, ifnum); } else { /* Verify all eps belong to the same interface */ if (ifnum != intf->altsetting->desc.bInterfaceNumber) { ret = -EINVAL; goto error; } } } if (num_streams_ret) *num_streams_ret = num_streams; *num_eps_ret = num_eps; *eps_ret = eps; *intf_ret = intf; return 0; error: kfree(eps); return ret; } static struct usb_device *usbdev_lookup_by_devt(dev_t devt) { struct device *dev; dev = bus_find_device_by_devt(&usb_bus_type, devt); if (!dev) return NULL; return to_usb_device(dev); } /* * file operations */ static int usbdev_open(struct inode *inode, struct file *file) { struct usb_device *dev = NULL; struct usb_dev_state *ps; int ret; ret = -ENOMEM; ps = kzalloc_obj(struct usb_dev_state); if (!ps) goto out_free_ps; ret = -ENODEV; /* usbdev device-node */ if (imajor(inode) == USB_DEVICE_MAJOR) dev = usbdev_lookup_by_devt(inode->i_rdev); if (!dev) goto out_free_ps; usb_lock_device(dev); if (dev->state == USB_STATE_NOTATTACHED) goto out_unlock_device; ret = usb_autoresume_device(dev); if (ret) goto out_unlock_device; ps->dev = dev; ps->file = file; ps->interface_allowed_mask = 0xFFFFFFFF; /* 32 bits */ spin_lock_init(&ps->lock); INIT_LIST_HEAD(&ps->list); INIT_LIST_HEAD(&ps->async_pending); INIT_LIST_HEAD(&ps->async_completed); INIT_LIST_HEAD(&ps->memory_list); init_waitqueue_head(&ps->wait); init_waitqueue_head(&ps->wait_for_resume); ps->disc_pid = get_pid(task_pid(current)); ps->cred = get_current_cred(); smp_wmb(); /* Can't race with resume; the device is already active */ list_add_tail(&ps->list, &dev->filelist); file->private_data = ps; usb_unlock_device(dev); snoop(&dev->dev, "opened by process %d: %s\n", task_pid_nr(current), current->comm); return ret; out_unlock_device: usb_unlock_device(dev); usb_put_dev(dev); out_free_ps: kfree(ps); return ret; } static int usbdev_release(struct 
inode *inode, struct file *file) { struct usb_dev_state *ps = file->private_data; struct usb_device *dev = ps->dev; unsigned int ifnum; struct async *as; usb_lock_device(dev); usb_hub_release_all_ports(dev, ps); /* Protect against simultaneous resume */ mutex_lock(&usbfs_mutex); list_del_init(&ps->list); mutex_unlock(&usbfs_mutex); for (ifnum = 0; ps->ifclaimed && ifnum < 8*sizeof(ps->ifclaimed); ifnum++) { if (test_bit(ifnum, &ps->ifclaimed)) releaseintf(ps, ifnum); } destroy_all_async(ps); if (!ps->suspend_allowed) usb_autosuspend_device(dev); usb_unlock_device(dev); usb_put_dev(dev); put_pid(ps->disc_pid); put_cred(ps->cred); as = async_getcompleted(ps); while (as) { free_async(as); as = async_getcompleted(ps); } kfree(ps); return 0; } static void usbfs_blocking_completion(struct urb *urb) { complete((struct completion *) urb->context); } /* * Much like usb_start_wait_urb, but returns status separately from * actual_length and uses a killable wait. */ static int usbfs_start_wait_urb(struct urb *urb, int timeout, unsigned int *actlen) { DECLARE_COMPLETION_ONSTACK(ctx); unsigned long expire; int rc; urb->context = &ctx; urb->complete = usbfs_blocking_completion; *actlen = 0; rc = usb_submit_urb(urb, GFP_KERNEL); if (unlikely(rc)) return rc; expire = (timeout ? msecs_to_jiffies(timeout) : MAX_SCHEDULE_TIMEOUT); rc = wait_for_completion_killable_timeout(&ctx, expire); if (rc <= 0) { usb_kill_urb(urb); *actlen = urb->actual_length; if (urb->status != -ENOENT) ; /* Completed before it was killed */ else if (rc < 0) return -EINTR; else return -ETIMEDOUT; } *actlen = urb->actual_length; return urb->status; } static int do_proc_control(struct usb_dev_state *ps, struct usbdevfs_ctrltransfer *ctrl) { struct usb_device *dev = ps->dev; unsigned int tmo; unsigned char *tbuf; unsigned int wLength, actlen; int i, pipe, ret; struct urb *urb = NULL; struct usb_ctrlrequest *dr = NULL; ret = check_ctrlrecip(ps, ctrl->bRequestType, ctrl->bRequest, ctrl->wIndex); if (ret) return ret; wLength = ctrl->wLength; /* To suppress 64k PAGE_SIZE warning */ if (wLength > PAGE_SIZE) return -EINVAL; ret = usbfs_increase_memory_usage(PAGE_SIZE + sizeof(struct urb) + sizeof(struct usb_ctrlrequest)); if (ret) return ret; ret = -ENOMEM; tbuf = (unsigned char *)__get_free_page(GFP_KERNEL); if (!tbuf) goto done; urb = usb_alloc_urb(0, GFP_NOIO); if (!urb) goto done; dr = kmalloc_obj(struct usb_ctrlrequest, GFP_NOIO); if (!dr) goto done; dr->bRequestType = ctrl->bRequestType; dr->bRequest = ctrl->bRequest; dr->wValue = cpu_to_le16(ctrl->wValue); dr->wIndex = cpu_to_le16(ctrl->wIndex); dr->wLength = cpu_to_le16(ctrl->wLength); tmo = ctrl->timeout; snoop(&dev->dev, "control urb: bRequestType=%02x " "bRequest=%02x wValue=%04x " "wIndex=%04x wLength=%04x\n", ctrl->bRequestType, ctrl->bRequest, ctrl->wValue, ctrl->wIndex, ctrl->wLength); if ((ctrl->bRequestType & USB_DIR_IN) && wLength) { pipe = usb_rcvctrlpipe(dev, 0); usb_fill_control_urb(urb, dev, pipe, (unsigned char *) dr, tbuf, wLength, NULL, NULL); snoop_urb(dev, NULL, pipe, wLength, tmo, SUBMIT, NULL, 0); usb_unlock_device(dev); i = usbfs_start_wait_urb(urb, tmo, &actlen); /* Linger a bit, prior to the next control message. 
*/ if (dev->quirks & USB_QUIRK_DELAY_CTRL_MSG) msleep(200); usb_lock_device(dev); snoop_urb(dev, NULL, pipe, actlen, i, COMPLETE, tbuf, actlen); if (!i && actlen) { if (copy_to_user(ctrl->data, tbuf, actlen)) { ret = -EFAULT; goto done; } } } else { if (wLength) { if (copy_from_user(tbuf, ctrl->data, wLength)) { ret = -EFAULT; goto done; } } pipe = usb_sndctrlpipe(dev, 0); usb_fill_control_urb(urb, dev, pipe, (unsigned char *) dr, tbuf, wLength, NULL, NULL); snoop_urb(dev, NULL, pipe, wLength, tmo, SUBMIT, tbuf, wLength); usb_unlock_device(dev); i = usbfs_start_wait_urb(urb, tmo, &actlen); /* Linger a bit, prior to the next control message. */ if (dev->quirks & USB_QUIRK_DELAY_CTRL_MSG) msleep(200); usb_lock_device(dev); snoop_urb(dev, NULL, pipe, actlen, i, COMPLETE, NULL, 0); } if (i < 0 && i != -EPIPE) { dev_printk(KERN_DEBUG, &dev->dev, "usbfs: USBDEVFS_CONTROL " "failed cmd %s rqt %u rq %u len %u ret %d\n", current->comm, ctrl->bRequestType, ctrl->bRequest, ctrl->wLength, i); } ret = (i < 0 ? i : actlen); done: kfree(dr); usb_free_urb(urb); free_page((unsigned long) tbuf); usbfs_decrease_memory_usage(PAGE_SIZE + sizeof(struct urb) + sizeof(struct usb_ctrlrequest)); return ret; } static int proc_control(struct usb_dev_state *ps, void __user *arg) { struct usbdevfs_ctrltransfer ctrl; if (copy_from_user(&ctrl, arg, sizeof(ctrl))) return -EFAULT; return do_proc_control(ps, &ctrl); } static int do_proc_bulk(struct usb_dev_state *ps, struct usbdevfs_bulktransfer *bulk) { struct usb_device *dev = ps->dev; unsigned int tmo, len1, len2, pipe; unsigned char *tbuf; int i, ret; struct urb *urb = NULL; struct usb_host_endpoint *ep; ret = findintfep(ps->dev, bulk->ep); if (ret < 0) return ret; ret = checkintf(ps, ret); if (ret) return ret; len1 = bulk->len; if (len1 >= (INT_MAX - sizeof(struct urb))) return -EINVAL; if (bulk->ep & USB_DIR_IN) pipe = usb_rcvbulkpipe(dev, bulk->ep & 0x7f); else pipe = usb_sndbulkpipe(dev, bulk->ep & 0x7f); ep = usb_pipe_endpoint(dev, pipe); if (!ep || !usb_endpoint_maxp(&ep->desc)) return -EINVAL; ret = usbfs_increase_memory_usage(len1 + sizeof(struct urb)); if (ret) return ret; /* * len1 can be almost arbitrarily large. Don't WARN if it's * too big, just fail the request. */ ret = -ENOMEM; tbuf = kmalloc(len1, GFP_KERNEL | __GFP_NOWARN); if (!tbuf) goto done; urb = usb_alloc_urb(0, GFP_KERNEL); if (!urb) goto done; if ((ep->desc.bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_INT) { pipe = (pipe & ~(3 << 30)) | (PIPE_INTERRUPT << 30); usb_fill_int_urb(urb, dev, pipe, tbuf, len1, NULL, NULL, ep->desc.bInterval); } else { usb_fill_bulk_urb(urb, dev, pipe, tbuf, len1, NULL, NULL); } tmo = bulk->timeout; if (bulk->ep & 0x80) { snoop_urb(dev, NULL, pipe, len1, tmo, SUBMIT, NULL, 0); usb_unlock_device(dev); i = usbfs_start_wait_urb(urb, tmo, &len2); usb_lock_device(dev); snoop_urb(dev, NULL, pipe, len2, i, COMPLETE, tbuf, len2); if (!i && len2) { if (copy_to_user(bulk->data, tbuf, len2)) { ret = -EFAULT; goto done; } } } else { if (len1) { if (copy_from_user(tbuf, bulk->data, len1)) { ret = -EFAULT; goto done; } } snoop_urb(dev, NULL, pipe, len1, tmo, SUBMIT, tbuf, len1); usb_unlock_device(dev); i = usbfs_start_wait_urb(urb, tmo, &len2); usb_lock_device(dev); snoop_urb(dev, NULL, pipe, len2, i, COMPLETE, NULL, 0); } ret = (i < 0 ? 
i : len2); done: usb_free_urb(urb); kfree(tbuf); usbfs_decrease_memory_usage(len1 + sizeof(struct urb)); return ret; } static int proc_bulk(struct usb_dev_state *ps, void __user *arg) { struct usbdevfs_bulktransfer bulk; if (copy_from_user(&bulk, arg, sizeof(bulk))) return -EFAULT; return do_proc_bulk(ps, &bulk); } static void check_reset_of_active_ep(struct usb_device *udev, unsigned int epnum, char *ioctl_name) { struct usb_host_endpoint **eps; struct usb_host_endpoint *ep; eps = (epnum & USB_DIR_IN) ? udev->ep_in : udev->ep_out; ep = eps[epnum & 0x0f]; if (ep && !list_empty(&ep->urb_list)) dev_warn(&udev->dev, "Process %d (%s) called USBDEVFS_%s for active endpoint 0x%02x\n", task_pid_nr(current), current->comm, ioctl_name, epnum); } static int proc_resetep(struct usb_dev_state *ps, void __user *arg) { unsigned int ep; int ret; if (get_user(ep, (unsigned int __user *)arg)) return -EFAULT; ret = findintfep(ps->dev, ep); if (ret < 0) return ret; ret = checkintf(ps, ret); if (ret) return ret; check_reset_of_active_ep(ps->dev, ep, "RESETEP"); usb_reset_endpoint(ps->dev, ep); return 0; } static int proc_clearhalt(struct usb_dev_state *ps, void __user *arg) { unsigned int ep; int pipe; int ret; if (get_user(ep, (unsigned int __user *)arg)) return -EFAULT; ret = findintfep(ps->dev, ep); if (ret < 0) return ret; ret = checkintf(ps, ret); if (ret) return ret; check_reset_of_active_ep(ps->dev, ep, "CLEAR_HALT"); if (ep & USB_DIR_IN) pipe = usb_rcvbulkpipe(ps->dev, ep & 0x7f); else pipe = usb_sndbulkpipe(ps->dev, ep & 0x7f); return usb_clear_halt(ps->dev, pipe); } static int proc_getdriver(struct usb_dev_state *ps, void __user *arg) { struct usbdevfs_getdriver gd; struct usb_interface *intf; int ret; if (copy_from_user(&gd, arg, sizeof(gd))) return -EFAULT; intf = usb_ifnum_to_if(ps->dev, gd.interface); if (!intf || !intf->dev.driver) ret = -ENODATA; else { strscpy(gd.driver, intf->dev.driver->name, sizeof(gd.driver)); ret = (copy_to_user(arg, &gd, sizeof(gd)) ? -EFAULT : 0); } return ret; } static int proc_connectinfo(struct usb_dev_state *ps, void __user *arg) { struct usbdevfs_connectinfo ci; memset(&ci, 0, sizeof(ci)); ci.devnum = ps->dev->devnum; ci.slow = ps->dev->speed == USB_SPEED_LOW; if (copy_to_user(arg, &ci, sizeof(ci))) return -EFAULT; return 0; } static int proc_conninfo_ex(struct usb_dev_state *ps, void __user *arg, size_t size) { struct usbdevfs_conninfo_ex ci; struct usb_device *udev = ps->dev; if (size < sizeof(ci.size)) return -EINVAL; memset(&ci, 0, sizeof(ci)); ci.size = sizeof(ci); ci.busnum = udev->bus->busnum; ci.devnum = udev->devnum; ci.speed = udev->speed; while (udev && udev->portnum != 0) { if (++ci.num_ports <= ARRAY_SIZE(ci.ports)) ci.ports[ARRAY_SIZE(ci.ports) - ci.num_ports] = udev->portnum; udev = udev->parent; } if (ci.num_ports < ARRAY_SIZE(ci.ports)) memmove(&ci.ports[0], &ci.ports[ARRAY_SIZE(ci.ports) - ci.num_ports], ci.num_ports); if (copy_to_user(arg, &ci, min(sizeof(ci), size))) return -EFAULT; return 0; } static int proc_resetdevice(struct usb_dev_state *ps) { struct usb_host_config *actconfig = ps->dev->actconfig; struct usb_interface *interface; int i, number; /* Don't allow a device reset if the process has dropped the * privilege to do such things and any of the interfaces are * currently claimed. 
*/ if (ps->privileges_dropped && actconfig) { for (i = 0; i < actconfig->desc.bNumInterfaces; ++i) { interface = actconfig->interface[i]; number = interface->cur_altsetting->desc.bInterfaceNumber; if (usb_interface_claimed(interface) && !test_bit(number, &ps->ifclaimed)) { dev_warn(&ps->dev->dev, "usbfs: interface %d claimed by %s while '%s' resets device\n", number, interface->dev.driver->name, current->comm); return -EACCES; } } } return usb_reset_device(ps->dev); } static int proc_setintf(struct usb_dev_state *ps, void __user *arg) { struct usbdevfs_setinterface setintf; int ret; if (copy_from_user(&setintf, arg, sizeof(setintf))) return -EFAULT; ret = checkintf(ps, setintf.interface); if (ret) return ret; destroy_async_on_interface(ps, setintf.interface); return usb_set_interface(ps->dev, setintf.interface, setintf.altsetting); } static int proc_setconfig(struct usb_dev_state *ps, void __user *arg) { int u; int status = 0; struct usb_host_config *actconfig; if (get_user(u, (int __user *)arg)) return -EFAULT; actconfig = ps->dev->actconfig; /* Don't touch the device if any interfaces are claimed. * It could interfere with other drivers' operations, and if * an interface is claimed by usbfs it could easily deadlock. */ if (actconfig) { int i; for (i = 0; i < actconfig->desc.bNumInterfaces; ++i) { if (usb_interface_claimed(actconfig->interface[i])) { dev_warn(&ps->dev->dev, "usbfs: interface %d claimed by %s " "while '%s' sets config #%d\n", actconfig->interface[i] ->cur_altsetting ->desc.bInterfaceNumber, actconfig->interface[i] ->dev.driver->name, current->comm, u); status = -EBUSY; break; } } } /* SET_CONFIGURATION is often abused as a "cheap" driver reset, * so avoid usb_set_configuration()'s kick to sysfs */ if (status == 0) { if (actconfig && actconfig->desc.bConfigurationValue == u) status = usb_reset_configuration(ps->dev); else status = usb_set_configuration(ps->dev, u); } return status; } static struct usb_memory * find_memory_area(struct usb_dev_state *ps, const struct usbdevfs_urb *uurb) { struct usb_memory *usbm = NULL, *iter; unsigned long flags; unsigned long uurb_start = (unsigned long)uurb->buffer; spin_lock_irqsave(&ps->lock, flags); list_for_each_entry(iter, &ps->memory_list, memlist) { if (uurb_start >= iter->vm_start && uurb_start < iter->vm_start + iter->size) { if (uurb->buffer_length > iter->vm_start + iter->size - uurb_start) { usbm = ERR_PTR(-EINVAL); } else { usbm = iter; usbm->urb_use_count++; } break; } } spin_unlock_irqrestore(&ps->lock, flags); return usbm; } static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb, struct usbdevfs_iso_packet_desc __user *iso_frame_desc, void __user *arg, sigval_t userurb_sigval) { struct usbdevfs_iso_packet_desc *isopkt = NULL; struct usb_host_endpoint *ep; struct async *as = NULL; struct usb_ctrlrequest *dr = NULL; unsigned int u, totlen, isofrmlen; int i, ret, num_sgs = 0, ifnum = -1; int number_of_packets = 0; unsigned int stream_id = 0; void *buf; bool is_in; bool allow_short = false; bool allow_zero = false; unsigned long mask = USBDEVFS_URB_SHORT_NOT_OK | USBDEVFS_URB_BULK_CONTINUATION | USBDEVFS_URB_NO_FSBR | USBDEVFS_URB_ZERO_PACKET | USBDEVFS_URB_NO_INTERRUPT; /* USBDEVFS_URB_ISO_ASAP is a special case */ if (uurb->type == USBDEVFS_URB_TYPE_ISO) mask |= USBDEVFS_URB_ISO_ASAP; if (uurb->flags & ~mask) return -EINVAL; if ((unsigned int)uurb->buffer_length >= USBFS_XFER_MAX) return -EINVAL; if (uurb->buffer_length > 0 && !uurb->buffer) return -EINVAL; if (!(uurb->type == 
USBDEVFS_URB_TYPE_CONTROL && (uurb->endpoint & ~USB_ENDPOINT_DIR_MASK) == 0)) { ifnum = findintfep(ps->dev, uurb->endpoint); if (ifnum < 0) return ifnum; ret = checkintf(ps, ifnum); if (ret) return ret; } ep = ep_to_host_endpoint(ps->dev, uurb->endpoint); if (!ep) return -ENOENT; is_in = (uurb->endpoint & USB_ENDPOINT_DIR_MASK) != 0; u = 0; switch (uurb->type) { case USBDEVFS_URB_TYPE_CONTROL: if (!usb_endpoint_xfer_control(&ep->desc)) return -EINVAL; /* min 8 byte setup packet */ if (uurb->buffer_length < 8) return -EINVAL; dr = kmalloc(sizeof(struct usb_ctrlrequest), GFP_KERNEL); if (!dr) return -ENOMEM; if (copy_from_user(dr, uurb->buffer, 8)) { ret = -EFAULT; goto error; } if (uurb->buffer_length < (le16_to_cpu(dr->wLength) + 8)) { ret = -EINVAL; goto error; } ret = check_ctrlrecip(ps, dr->bRequestType, dr->bRequest, le16_to_cpu(dr->wIndex)); if (ret) goto error; uurb->buffer_length = le16_to_cpu(dr->wLength); uurb->buffer += 8; if ((dr->bRequestType & USB_DIR_IN) && uurb->buffer_length) { is_in = true; uurb->endpoint |= USB_DIR_IN; } else { is_in = false; uurb->endpoint &= ~USB_DIR_IN; } if (is_in) allow_short = true; snoop(&ps->dev->dev, "control urb: bRequestType=%02x " "bRequest=%02x wValue=%04x " "wIndex=%04x wLength=%04x\n", dr->bRequestType, dr->bRequest, __le16_to_cpu(dr->wValue), __le16_to_cpu(dr->wIndex), __le16_to_cpu(dr->wLength)); u = sizeof(struct usb_ctrlrequest); break; case USBDEVFS_URB_TYPE_BULK: if (!is_in) allow_zero = true; else allow_short = true; switch (usb_endpoint_type(&ep->desc)) { case USB_ENDPOINT_XFER_CONTROL: case USB_ENDPOINT_XFER_ISOC: return -EINVAL; case USB_ENDPOINT_XFER_INT: /* allow single-shot interrupt transfers */ uurb->type = USBDEVFS_URB_TYPE_INTERRUPT; goto interrupt_urb; } num_sgs = DIV_ROUND_UP(uurb->buffer_length, USB_SG_SIZE); if (num_sgs == 1 || num_sgs > ps->dev->bus->sg_tablesize) num_sgs = 0; if (ep->streams) stream_id = uurb->stream_id; break; case USBDEVFS_URB_TYPE_INTERRUPT: if (!usb_endpoint_xfer_int(&ep->desc)) return -EINVAL; interrupt_urb: if (!is_in) allow_zero = true; else allow_short = true; break; case USBDEVFS_URB_TYPE_ISO: /* arbitrary limit */ if (uurb->number_of_packets < 1 || uurb->number_of_packets > 128) return -EINVAL; if (!usb_endpoint_xfer_isoc(&ep->desc)) return -EINVAL; number_of_packets = uurb->number_of_packets; isofrmlen = sizeof(struct usbdevfs_iso_packet_desc) * number_of_packets; isopkt = memdup_user(iso_frame_desc, isofrmlen); if (IS_ERR(isopkt)) { ret = PTR_ERR(isopkt); isopkt = NULL; goto error; } for (totlen = u = 0; u < number_of_packets; u++) { /* * arbitrary limit need for USB 3.1 Gen2 * sizemax: 96 DPs at SSP, 96 * 1024 = 98304 */ if (isopkt[u].length > 98304) { ret = -EINVAL; goto error; } totlen += isopkt[u].length; } u *= sizeof(struct usb_iso_packet_descriptor); uurb->buffer_length = totlen; break; default: return -EINVAL; } if (uurb->buffer_length > 0 && !access_ok(uurb->buffer, uurb->buffer_length)) { ret = -EFAULT; goto error; } as = alloc_async(number_of_packets); if (!as) { ret = -ENOMEM; goto error; } as->usbm = find_memory_area(ps, uurb); if (IS_ERR(as->usbm)) { ret = PTR_ERR(as->usbm); as->usbm = NULL; goto error; } /* do not use SG buffers when memory mapped segments * are in use */ if (as->usbm) num_sgs = 0; u += sizeof(struct async) + sizeof(struct urb) + (as->usbm ?
0 : uurb->buffer_length) + num_sgs * sizeof(struct scatterlist); ret = usbfs_increase_memory_usage(u); if (ret) goto error; as->mem_usage = u; if (num_sgs) { as->urb->sg = kmalloc_array(num_sgs, sizeof(struct scatterlist), GFP_KERNEL | __GFP_NOWARN); if (!as->urb->sg) { ret = -ENOMEM; goto error; } as->urb->num_sgs = num_sgs; sg_init_table(as->urb->sg, as->urb->num_sgs); totlen = uurb->buffer_length; for (i = 0; i < as->urb->num_sgs; i++) { u = (totlen > USB_SG_SIZE) ? USB_SG_SIZE : totlen; buf = kmalloc(u, GFP_KERNEL); if (!buf) { ret = -ENOMEM; goto error; } sg_set_buf(&as->urb->sg[i], buf, u); if (!is_in) { if (copy_from_user(buf, uurb->buffer, u)) { ret = -EFAULT; goto error; } uurb->buffer += u; } totlen -= u; } } else if (uurb->buffer_length > 0) { if (as->usbm) { unsigned long uurb_start = (unsigned long)uurb->buffer; as->urb->transfer_buffer = as->usbm->mem + (uurb_start - as->usbm->vm_start); } else { as->urb->transfer_buffer = kmalloc(uurb->buffer_length, GFP_KERNEL | __GFP_NOWARN); if (!as->urb->transfer_buffer) { ret = -ENOMEM; goto error; } if (!is_in) { if (copy_from_user(as->urb->transfer_buffer, uurb->buffer, uurb->buffer_length)) { ret = -EFAULT; goto error; } } else if (uurb->type == USBDEVFS_URB_TYPE_ISO) { /* * Isochronous input data may end up being * discontiguous if some of the packets are * short. Clear the buffer so that the gaps * don't leak kernel data to userspace. */ memset(as->urb->transfer_buffer, 0, uurb->buffer_length); } } } as->urb->dev = ps->dev; as->urb->pipe = (uurb->type << 30) | __create_pipe(ps->dev, uurb->endpoint & 0xf) | (uurb->endpoint & USB_DIR_IN); /* This tedious sequence is necessary because the URB_* flags * are internal to the kernel and subject to change, whereas * the USBDEVFS_URB_* flags are a user API and must not be changed. */ u = (is_in ?
URB_DIR_IN : URB_DIR_OUT); if (uurb->flags & USBDEVFS_URB_ISO_ASAP) u |= URB_ISO_ASAP; if (allow_short && uurb->flags & USBDEVFS_URB_SHORT_NOT_OK) u |= URB_SHORT_NOT_OK; if (allow_zero && uurb->flags & USBDEVFS_URB_ZERO_PACKET) u |= URB_ZERO_PACKET; if (uurb->flags & USBDEVFS_URB_NO_INTERRUPT) u |= URB_NO_INTERRUPT; as->urb->transfer_flags = u; if (!allow_short && uurb->flags & USBDEVFS_URB_SHORT_NOT_OK) dev_warn(&ps->dev->dev, "Requested nonsensical USBDEVFS_URB_SHORT_NOT_OK.\n"); if (!allow_zero && uurb->flags & USBDEVFS_URB_ZERO_PACKET) dev_warn(&ps->dev->dev, "Requested nonsensical USBDEVFS_URB_ZERO_PACKET.\n"); as->urb->transfer_buffer_length = uurb->buffer_length; as->urb->setup_packet = (unsigned char *)dr; dr = NULL; as->urb->start_frame = uurb->start_frame; as->urb->number_of_packets = number_of_packets; as->urb->stream_id = stream_id; if (ep->desc.bInterval) { if (uurb->type == USBDEVFS_URB_TYPE_ISO || ps->dev->speed == USB_SPEED_HIGH || ps->dev->speed >= USB_SPEED_SUPER) as->urb->interval = 1 << min(15, ep->desc.bInterval - 1); else as->urb->interval = ep->desc.bInterval; } as->urb->context = as; as->urb->complete = async_completed; for (totlen = u = 0; u < number_of_packets; u++) { as->urb->iso_frame_desc[u].offset = totlen; as->urb->iso_frame_desc[u].length = isopkt[u].length; totlen += isopkt[u].length; } kfree(isopkt); isopkt = NULL; as->ps = ps; as->userurb = arg; as->userurb_sigval = userurb_sigval; if (as->usbm) { unsigned long uurb_start = (unsigned long)uurb->buffer; as->urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; as->urb->transfer_dma = as->usbm->dma_handle + (uurb_start - as->usbm->vm_start); } else if (is_in && uurb->buffer_length > 0) as->userbuffer = uurb->buffer; as->signr = uurb->signr; as->ifnum = ifnum; as->pid = get_pid(task_pid(current)); as->cred = get_current_cred(); snoop_urb(ps->dev, as->userurb, as->urb->pipe, as->urb->transfer_buffer_length, 0, SUBMIT, NULL, 0); if (!is_in) snoop_urb_data(as->urb, as->urb->transfer_buffer_length); async_newpending(as); if (usb_endpoint_xfer_bulk(&ep->desc)) { spin_lock_irq(&ps->lock); /* Not exactly the endpoint address; the direction bit is * shifted to the 0x10 position so that the value will be * between 0 and 31. */ as->bulk_addr = usb_endpoint_num(&ep->desc) | ((ep->desc.bEndpointAddress & USB_ENDPOINT_DIR_MASK) >> 3); /* If this bulk URB is the start of a new transfer, re-enable * the endpoint. Otherwise mark it as a continuation URB. */ if (uurb->flags & USBDEVFS_URB_BULK_CONTINUATION) as->bulk_status = AS_CONTINUATION; else ps->disabled_bulk_eps &= ~(1 << as->bulk_addr); /* Don't accept continuation URBs if the endpoint is * disabled because of an earlier error. 
*/ if (ps->disabled_bulk_eps & (1 << as->bulk_addr)) ret = -EREMOTEIO; else ret = usb_submit_urb(as->urb, GFP_ATOMIC); spin_unlock_irq(&ps->lock); } else { ret = usb_submit_urb(as->urb, GFP_KERNEL); } if (ret) { dev_printk(KERN_DEBUG, &ps->dev->dev, "usbfs: usb_submit_urb returned %d\n", ret); snoop_urb(ps->dev, as->userurb, as->urb->pipe, 0, ret, COMPLETE, NULL, 0); async_removepending(as); goto error; } return 0; error: kfree(isopkt); kfree(dr); if (as) free_async(as); return ret; } static int proc_submiturb(struct usb_dev_state *ps, void __user *arg) { struct usbdevfs_urb uurb; sigval_t userurb_sigval; if (copy_from_user(&uurb, arg, sizeof(uurb))) return -EFAULT; memset(&userurb_sigval, 0, sizeof(userurb_sigval)); userurb_sigval.sival_ptr = arg; return proc_do_submiturb(ps, &uurb, (((struct usbdevfs_urb __user *)arg)->iso_frame_desc), arg, userurb_sigval); } static int proc_unlinkurb(struct usb_dev_state *ps, void __user *arg) { struct urb *urb; struct async *as; unsigned long flags; spin_lock_irqsave(&ps->lock, flags); as = async_getpending(ps, arg); if (!as) { spin_unlock_irqrestore(&ps->lock, flags); return -EINVAL; } urb = as->urb; usb_get_urb(urb); spin_unlock_irqrestore(&ps->lock, flags); usb_kill_urb(urb); usb_put_urb(urb); return 0; } static void compute_isochronous_actual_length(struct urb *urb) { unsigned int i; if (urb->number_of_packets > 0) { urb->actual_length = 0; for (i = 0; i < urb->number_of_packets; i++) urb->actual_length += urb->iso_frame_desc[i].actual_length; } } static int processcompl(struct async *as, void __user * __user *arg) { struct urb *urb = as->urb; struct usbdevfs_urb __user *userurb = as->userurb; void __user *addr = as->userurb; unsigned int i; compute_isochronous_actual_length(urb); if (as->userbuffer && urb->actual_length) { if (copy_urb_data_to_user(as->userbuffer, urb)) goto err_out; } if (put_user(as->status, &userurb->status)) goto err_out; if (put_user(urb->actual_length, &userurb->actual_length)) goto err_out; if (put_user(urb->error_count, &userurb->error_count)) goto err_out; if (usb_endpoint_xfer_isoc(&urb->ep->desc)) { for (i = 0; i < urb->number_of_packets; i++) { if (put_user(urb->iso_frame_desc[i].actual_length, &userurb->iso_frame_desc[i].actual_length)) goto err_out; if (put_user(urb->iso_frame_desc[i].status, &userurb->iso_frame_desc[i].status)) goto err_out; } } if (put_user(addr, (void __user * __user *)arg)) return -EFAULT; return 0; err_out: return -EFAULT; } static struct async *reap_as(struct usb_dev_state *ps) { DECLARE_WAITQUEUE(wait, current); struct async *as = NULL; struct usb_device *dev = ps->dev; add_wait_queue(&ps->wait, &wait); for (;;) { __set_current_state(TASK_INTERRUPTIBLE); as = async_getcompleted(ps); if (as || !connected(ps)) break; if (signal_pending(current)) break; usb_unlock_device(dev); schedule(); usb_lock_device(dev); } remove_wait_queue(&ps->wait, &wait); set_current_state(TASK_RUNNING); return as; } static int proc_reapurb(struct usb_dev_state *ps, void __user *arg) { struct async *as = reap_as(ps); if (as) { int retval; snoop(&ps->dev->dev, "reap %px\n", as->userurb); retval = processcompl(as, (void __user * __user *)arg); free_async(as); return retval; } if (signal_pending(current)) return -EINTR; return -ENODEV; } static int proc_reapurbnonblock(struct usb_dev_state *ps, void __user *arg) { int retval; struct async *as; as = async_getcompleted(ps); if (as) { snoop(&ps->dev->dev, "reap %px\n", as->userurb); retval = processcompl(as, (void __user * __user *)arg); free_async(as); } else { retval = 
(connected(ps) ? -EAGAIN : -ENODEV); } return retval; } #ifdef CONFIG_COMPAT static int proc_control_compat(struct usb_dev_state *ps, struct usbdevfs_ctrltransfer32 __user *p32) { struct usbdevfs_ctrltransfer ctrl; u32 udata; if (copy_from_user(&ctrl, p32, sizeof(*p32) - sizeof(compat_caddr_t)) || get_user(udata, &p32->data)) return -EFAULT; ctrl.data = compat_ptr(udata); return do_proc_control(ps, &ctrl); } static int proc_bulk_compat(struct usb_dev_state *ps, struct usbdevfs_bulktransfer32 __user *p32) { struct usbdevfs_bulktransfer bulk; compat_caddr_t addr; if (get_user(bulk.ep, &p32->ep) || get_user(bulk.len, &p32->len) || get_user(bulk.timeout, &p32->timeout) || get_user(addr, &p32->data)) return -EFAULT; bulk.data = compat_ptr(addr); return do_proc_bulk(ps, &bulk); } static int proc_disconnectsignal_compat(struct usb_dev_state *ps, void __user *arg) { struct usbdevfs_disconnectsignal32 ds; if (copy_from_user(&ds, arg, sizeof(ds))) return -EFAULT; ps->discsignr = ds.signr; ps->disccontext.sival_int = ds.context; return 0; } static int get_urb32(struct usbdevfs_urb *kurb, struct usbdevfs_urb32 __user *uurb) { struct usbdevfs_urb32 urb32; if (copy_from_user(&urb32, uurb, sizeof(*uurb))) return -EFAULT; kurb->type = urb32.type; kurb->endpoint = urb32.endpoint; kurb->status = urb32.status; kurb->flags = urb32.flags; kurb->buffer = compat_ptr(urb32.buffer); kurb->buffer_length = urb32.buffer_length; kurb->actual_length = urb32.actual_length; kurb->start_frame = urb32.start_frame; kurb->number_of_packets = urb32.number_of_packets; kurb->error_count = urb32.error_count; kurb->signr = urb32.signr; kurb->usercontext = compat_ptr(urb32.usercontext); return 0; } static int proc_submiturb_compat(struct usb_dev_state *ps, void __user *arg) { struct usbdevfs_urb uurb; sigval_t userurb_sigval; if (get_urb32(&uurb, (struct usbdevfs_urb32 __user *)arg)) return -EFAULT; memset(&userurb_sigval, 0, sizeof(userurb_sigval)); userurb_sigval.sival_int = ptr_to_compat(arg); return proc_do_submiturb(ps, &uurb, ((struct usbdevfs_urb32 __user *)arg)->iso_frame_desc, arg, userurb_sigval); } static int processcompl_compat(struct async *as, void __user * __user *arg) { struct urb *urb = as->urb; struct usbdevfs_urb32 __user *userurb = as->userurb; void __user *addr = as->userurb; unsigned int i; compute_isochronous_actual_length(urb); if (as->userbuffer && urb->actual_length) { if (copy_urb_data_to_user(as->userbuffer, urb)) return -EFAULT; } if (put_user(as->status, &userurb->status)) return -EFAULT; if (put_user(urb->actual_length, &userurb->actual_length)) return -EFAULT; if (put_user(urb->error_count, &userurb->error_count)) return -EFAULT; if (usb_endpoint_xfer_isoc(&urb->ep->desc)) { for (i = 0; i < urb->number_of_packets; i++) { if (put_user(urb->iso_frame_desc[i].actual_length, &userurb->iso_frame_desc[i].actual_length)) return -EFAULT; if (put_user(urb->iso_frame_desc[i].status, &userurb->iso_frame_desc[i].status)) return -EFAULT; } } if (put_user(ptr_to_compat(addr), (u32 __user *)arg)) return -EFAULT; return 0; } static int proc_reapurb_compat(struct usb_dev_state *ps, void __user *arg) { struct async *as = reap_as(ps); if (as) { int retval; snoop(&ps->dev->dev, "reap %px\n", as->userurb); retval = processcompl_compat(as, (void __user * __user *)arg); free_async(as); return retval; } if (signal_pending(current)) return -EINTR; return -ENODEV; } static int proc_reapurbnonblock_compat(struct usb_dev_state *ps, void __user *arg) { int retval; struct async *as; as = async_getcompleted(ps); if (as) { 
snoop(&ps->dev->dev, "reap %px\n", as->userurb); retval = processcompl_compat(as, (void __user * __user *)arg); free_async(as); } else { retval = (connected(ps) ? -EAGAIN : -ENODEV); } return retval; } #endif static int proc_disconnectsignal(struct usb_dev_state *ps, void __user *arg) { struct usbdevfs_disconnectsignal ds; if (copy_from_user(&ds, arg, sizeof(ds))) return -EFAULT; ps->discsignr = ds.signr; ps->disccontext.sival_ptr = ds.context; return 0; } static int proc_claiminterface(struct usb_dev_state *ps, void __user *arg) { unsigned int ifnum; if (get_user(ifnum, (unsigned int __user *)arg)) return -EFAULT; return claimintf(ps, ifnum); } static int proc_releaseinterface(struct usb_dev_state *ps, void __user *arg) { unsigned int ifnum; int ret; if (get_user(ifnum, (unsigned int __user *)arg)) return -EFAULT; ret = releaseintf(ps, ifnum); if (ret < 0) return ret; destroy_async_on_interface(ps, ifnum); return 0; } static int proc_ioctl(struct usb_dev_state *ps, struct usbdevfs_ioctl *ctl) { int size; void *buf = NULL; int retval = 0; struct usb_interface *intf = NULL; struct usb_driver *driver = NULL; if (ps->privileges_dropped) return -EACCES; if (!connected(ps)) return -ENODEV; /* alloc buffer */ size = _IOC_SIZE(ctl->ioctl_code); if (size > 0) { buf = kmalloc(size, GFP_KERNEL); if (buf == NULL) return -ENOMEM; if ((_IOC_DIR(ctl->ioctl_code) & _IOC_WRITE)) { if (copy_from_user(buf, ctl->data, size)) { kfree(buf); return -EFAULT; } } else { memset(buf, 0, size); } } if (ps->dev->state != USB_STATE_CONFIGURED) retval = -EHOSTUNREACH; else if (!(intf = usb_ifnum_to_if(ps->dev, ctl->ifno))) retval = -EINVAL; else switch (ctl->ioctl_code) { /* disconnect kernel driver from interface */ case USBDEVFS_DISCONNECT: if (intf->dev.driver) { driver = to_usb_driver(intf->dev.driver); dev_dbg(&intf->dev, "disconnect by usbfs\n"); usb_driver_release_interface(driver, intf); } else retval = -ENODATA; break; /* let kernel drivers try to (re)bind to the interface */ case USBDEVFS_CONNECT: if (!intf->dev.driver) retval = device_attach(&intf->dev); else retval = -EBUSY; break; /* talk directly to the interface's driver */ default: if (intf->dev.driver) driver = to_usb_driver(intf->dev.driver); if (driver == NULL || driver->unlocked_ioctl == NULL) { retval = -ENOTTY; } else { retval = driver->unlocked_ioctl(intf, ctl->ioctl_code, buf); if (retval == -ENOIOCTLCMD) retval = -ENOTTY; } } /* cleanup and return */ if (retval >= 0 && (_IOC_DIR(ctl->ioctl_code) & _IOC_READ) != 0 && size > 0 && copy_to_user(ctl->data, buf, size) != 0) retval = -EFAULT; kfree(buf); return retval; } static int proc_ioctl_default(struct usb_dev_state *ps, void __user *arg) { struct usbdevfs_ioctl ctrl; if (copy_from_user(&ctrl, arg, sizeof(ctrl))) return -EFAULT; return proc_ioctl(ps, &ctrl); } #ifdef CONFIG_COMPAT static int proc_ioctl_compat(struct usb_dev_state *ps, compat_uptr_t arg) { struct usbdevfs_ioctl32 ioc32; struct usbdevfs_ioctl ctrl; if (copy_from_user(&ioc32, compat_ptr(arg), sizeof(ioc32))) return -EFAULT; ctrl.ifno = ioc32.ifno; ctrl.ioctl_code = ioc32.ioctl_code; ctrl.data = compat_ptr(ioc32.data); return proc_ioctl(ps, &ctrl); } #endif static int proc_claim_port(struct usb_dev_state *ps, void __user *arg) { unsigned portnum; int rc; if (get_user(portnum, (unsigned __user *) arg)) return -EFAULT; rc = usb_hub_claim_port(ps->dev, portnum, ps); if (rc == 0) snoop(&ps->dev->dev, "port %d claimed by process %d: %s\n", portnum, task_pid_nr(current), current->comm); return rc; } static int proc_release_port(struct 
usb_dev_state *ps, void __user *arg) { unsigned portnum; if (get_user(portnum, (unsigned __user *) arg)) return -EFAULT; return usb_hub_release_port(ps->dev, portnum, ps); } static int proc_get_capabilities(struct usb_dev_state *ps, void __user *arg) { __u32 caps; caps = USBDEVFS_CAP_ZERO_PACKET | USBDEVFS_CAP_NO_PACKET_SIZE_LIM | USBDEVFS_CAP_REAP_AFTER_DISCONNECT | USBDEVFS_CAP_MMAP | USBDEVFS_CAP_DROP_PRIVILEGES | USBDEVFS_CAP_CONNINFO_EX | MAYBE_CAP_SUSPEND; if (!ps->dev->bus->no_stop_on_short) caps |= USBDEVFS_CAP_BULK_CONTINUATION; if (ps->dev->bus->sg_tablesize) caps |= USBDEVFS_CAP_BULK_SCATTER_GATHER; if (put_user(caps, (__u32 __user *)arg)) return -EFAULT; return 0; } static int proc_disconnect_claim(struct usb_dev_state *ps, void __user *arg) { struct usbdevfs_disconnect_claim dc; struct usb_interface *intf; if (copy_from_user(&dc, arg, sizeof(dc))) return -EFAULT; intf = usb_ifnum_to_if(ps->dev, dc.interface); if (!intf) return -EINVAL; if (intf->dev.driver) { struct usb_driver *driver = to_usb_driver(intf->dev.driver); if (ps->privileges_dropped) return -EACCES; if ((dc.flags & USBDEVFS_DISCONNECT_CLAIM_IF_DRIVER) && strncmp(dc.driver, intf->dev.driver->name, sizeof(dc.driver)) != 0) return -EBUSY; if ((dc.flags & USBDEVFS_DISCONNECT_CLAIM_EXCEPT_DRIVER) && strncmp(dc.driver, intf->dev.driver->name, sizeof(dc.driver)) == 0) return -EBUSY; dev_dbg(&intf->dev, "disconnect by usbfs\n"); usb_driver_release_interface(driver, intf); } return claimintf(ps, dc.interface); } static int proc_alloc_streams(struct usb_dev_state *ps, void __user *arg) { unsigned num_streams, num_eps; struct usb_host_endpoint **eps; struct usb_interface *intf; int r; r = parse_usbdevfs_streams(ps, arg, &num_streams, &num_eps, &eps, &intf); if (r) return r; destroy_async_on_interface(ps, intf->altsetting[0].desc.bInterfaceNumber); r = usb_alloc_streams(intf, eps, num_eps, num_streams, GFP_KERNEL); kfree(eps); return r; } static int proc_free_streams(struct usb_dev_state *ps, void __user *arg) { unsigned num_eps; struct usb_host_endpoint **eps; struct usb_interface *intf; int r; r = parse_usbdevfs_streams(ps, arg, NULL, &num_eps, &eps, &intf); if (r) return r; destroy_async_on_interface(ps, intf->altsetting[0].desc.bInterfaceNumber); r = usb_free_streams(intf, eps, num_eps, GFP_KERNEL); kfree(eps); return r; } static int proc_drop_privileges(struct usb_dev_state *ps, void __user *arg) { u32 data; if (copy_from_user(&data, arg, sizeof(data))) return -EFAULT; /* This is a one way operation. Once privileges are * dropped, you cannot regain them. You may however reissue * this ioctl to shrink the allowed interfaces mask. 
*/ ps->interface_allowed_mask &= data; ps->privileges_dropped = true; return 0; } static int proc_forbid_suspend(struct usb_dev_state *ps) { int ret = 0; if (ps->suspend_allowed) { ret = usb_autoresume_device(ps->dev); if (ret == 0) ps->suspend_allowed = false; else if (ret != -ENODEV) ret = -EIO; } return ret; } static int proc_allow_suspend(struct usb_dev_state *ps) { if (!connected(ps)) return -ENODEV; WRITE_ONCE(ps->not_yet_resumed, 1); if (!ps->suspend_allowed) { usb_autosuspend_device(ps->dev); ps->suspend_allowed = true; } return 0; } static int proc_wait_for_resume(struct usb_dev_state *ps) { int ret; usb_unlock_device(ps->dev); ret = wait_event_interruptible(ps->wait_for_resume, READ_ONCE(ps->not_yet_resumed) == 0); usb_lock_device(ps->dev); if (ret != 0) return -EINTR; return proc_forbid_suspend(ps); } /* * NOTE: All requests here that have interface numbers as parameters * are assuming that somehow the configuration has been prevented from * changing. But there's no mechanism to ensure that... */ static long usbdev_do_ioctl(struct file *file, unsigned int cmd, void __user *p) { struct usb_dev_state *ps = file->private_data; struct inode *inode = file_inode(file); struct usb_device *dev = ps->dev; int ret = -ENOTTY; if (!(file->f_mode & FMODE_WRITE)) return -EPERM; usb_lock_device(dev); /* Reap operations are allowed even after disconnection */ switch (cmd) { case USBDEVFS_REAPURB: snoop(&dev->dev, "%s: REAPURB\n", __func__); ret = proc_reapurb(ps, p); goto done; case USBDEVFS_REAPURBNDELAY: snoop(&dev->dev, "%s: REAPURBNDELAY\n", __func__); ret = proc_reapurbnonblock(ps, p); goto done; #ifdef CONFIG_COMPAT case USBDEVFS_REAPURB32: snoop(&dev->dev, "%s: REAPURB32\n", __func__); ret = proc_reapurb_compat(ps, p); goto done; case USBDEVFS_REAPURBNDELAY32: snoop(&dev->dev, "%s: REAPURBNDELAY32\n", __func__); ret = proc_reapurbnonblock_compat(ps, p); goto done; #endif } if (!connected(ps)) { usb_unlock_device(dev); return -ENODEV; } switch (cmd) { case USBDEVFS_CONTROL: snoop(&dev->dev, "%s: CONTROL\n", __func__); ret = proc_control(ps, p); if (ret >= 0) inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); break; case USBDEVFS_BULK: snoop(&dev->dev, "%s: BULK\n", __func__); ret = proc_bulk(ps, p); if (ret >= 0) inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); break; case USBDEVFS_RESETEP: snoop(&dev->dev, "%s: RESETEP\n", __func__); ret = proc_resetep(ps, p); if (ret >= 0) inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); break; case USBDEVFS_RESET: snoop(&dev->dev, "%s: RESET\n", __func__); ret = proc_resetdevice(ps); break; case USBDEVFS_CLEAR_HALT: snoop(&dev->dev, "%s: CLEAR_HALT\n", __func__); ret = proc_clearhalt(ps, p); if (ret >= 0) inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); break; case USBDEVFS_GETDRIVER: snoop(&dev->dev, "%s: GETDRIVER\n", __func__); ret = proc_getdriver(ps, p); break; case USBDEVFS_CONNECTINFO: snoop(&dev->dev, "%s: CONNECTINFO\n", __func__); ret = proc_connectinfo(ps, p); break; case USBDEVFS_SETINTERFACE: snoop(&dev->dev, "%s: SETINTERFACE\n", __func__); ret = proc_setintf(ps, p); break; case USBDEVFS_SETCONFIGURATION: snoop(&dev->dev, "%s: SETCONFIGURATION\n", __func__); ret = proc_setconfig(ps, p); break; case USBDEVFS_SUBMITURB: snoop(&dev->dev, "%s: SUBMITURB\n", __func__); ret = proc_submiturb(ps, p); if (ret >= 0) inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); break; #ifdef CONFIG_COMPAT case USBDEVFS_CONTROL32: snoop(&dev->dev, "%s: CONTROL32\n", __func__); ret = 
proc_control_compat(ps, p); if (ret >= 0) inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); break; case USBDEVFS_BULK32: snoop(&dev->dev, "%s: BULK32\n", __func__); ret = proc_bulk_compat(ps, p); if (ret >= 0) inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); break; case USBDEVFS_DISCSIGNAL32: snoop(&dev->dev, "%s: DISCSIGNAL32\n", __func__); ret = proc_disconnectsignal_compat(ps, p); break; case USBDEVFS_SUBMITURB32: snoop(&dev->dev, "%s: SUBMITURB32\n", __func__); ret = proc_submiturb_compat(ps, p); if (ret >= 0) inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); break; case USBDEVFS_IOCTL32: snoop(&dev->dev, "%s: IOCTL32\n", __func__); ret = proc_ioctl_compat(ps, ptr_to_compat(p)); break; #endif case USBDEVFS_DISCARDURB: snoop(&dev->dev, "%s: DISCARDURB %px\n", __func__, p); ret = proc_unlinkurb(ps, p); break; case USBDEVFS_DISCSIGNAL: snoop(&dev->dev, "%s: DISCSIGNAL\n", __func__); ret = proc_disconnectsignal(ps, p); break; case USBDEVFS_CLAIMINTERFACE: snoop(&dev->dev, "%s: CLAIMINTERFACE\n", __func__); ret = proc_claiminterface(ps, p); break; case USBDEVFS_RELEASEINTERFACE: snoop(&dev->dev, "%s: RELEASEINTERFACE\n", __func__); ret = proc_releaseinterface(ps, p); break; case USBDEVFS_IOCTL: snoop(&dev->dev, "%s: IOCTL\n", __func__); ret = proc_ioctl_default(ps, p); break; case USBDEVFS_CLAIM_PORT: snoop(&dev->dev, "%s: CLAIM_PORT\n", __func__); ret = proc_claim_port(ps, p); break; case USBDEVFS_RELEASE_PORT: snoop(&dev->dev, "%s: RELEASE_PORT\n", __func__); ret = proc_release_port(ps, p); break; case USBDEVFS_GET_CAPABILITIES: ret = proc_get_capabilities(ps, p); break; case USBDEVFS_DISCONNECT_CLAIM: ret = proc_disconnect_claim(ps, p); break; case USBDEVFS_ALLOC_STREAMS: ret = proc_alloc_streams(ps, p); break; case USBDEVFS_FREE_STREAMS: ret = proc_free_streams(ps, p); break; case USBDEVFS_DROP_PRIVILEGES: ret = proc_drop_privileges(ps, p); break; case USBDEVFS_GET_SPEED: ret = ps->dev->speed; break; case USBDEVFS_FORBID_SUSPEND: ret = proc_forbid_suspend(ps); break; case USBDEVFS_ALLOW_SUSPEND: ret = proc_allow_suspend(ps); break; case USBDEVFS_WAIT_FOR_RESUME: ret = proc_wait_for_resume(ps); break; } /* Handle variable-length commands */ switch (cmd & ~IOCSIZE_MASK) { case USBDEVFS_CONNINFO_EX(0): ret = proc_conninfo_ex(ps, p, _IOC_SIZE(cmd)); break; } done: usb_unlock_device(dev); if (ret >= 0) inode_set_atime_to_ts(inode, current_time(inode)); return ret; } static long usbdev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { int ret; ret = usbdev_do_ioctl(file, cmd, (void __user *)arg); return ret; } /* No kernel lock - fine */ static __poll_t usbdev_poll(struct file *file, struct poll_table_struct *wait) { struct usb_dev_state *ps = file->private_data; __poll_t mask = 0; poll_wait(file, &ps->wait, wait); if (file->f_mode & FMODE_WRITE && !list_empty(&ps->async_completed)) mask |= EPOLLOUT | EPOLLWRNORM; if (!connected(ps)) mask |= EPOLLHUP; if (list_empty(&ps->list)) mask |= EPOLLERR; return mask; } const struct file_operations usbdev_file_operations = { .owner = THIS_MODULE, .llseek = no_seek_end_llseek, .read = usbdev_read, .poll = usbdev_poll, .unlocked_ioctl = usbdev_ioctl, .compat_ioctl = compat_ptr_ioctl, .mmap = usbdev_mmap, .open = usbdev_open, .release = usbdev_release, }; static void usbdev_remove(struct usb_device *udev) { struct usb_dev_state *ps; /* Protect against simultaneous resume */ mutex_lock(&usbfs_mutex); while (!list_empty(&udev->filelist)) { ps = list_entry(udev->filelist.next, struct usb_dev_state, list); 
destroy_all_async(ps); wake_up_all(&ps->wait); WRITE_ONCE(ps->not_yet_resumed, 0); wake_up_all(&ps->wait_for_resume); list_del_init(&ps->list); if (ps->discsignr) kill_pid_usb_asyncio(ps->discsignr, EPIPE, ps->disccontext, ps->disc_pid, ps->cred); } mutex_unlock(&usbfs_mutex); } static int usbdev_notify(struct notifier_block *self, unsigned long action, void *dev) { switch (action) { case USB_DEVICE_ADD: break; case USB_DEVICE_REMOVE: usbdev_remove(dev); break; } return NOTIFY_OK; } static struct notifier_block usbdev_nb = { .notifier_call = usbdev_notify, }; static struct cdev usb_device_cdev; int __init usb_devio_init(void) { int retval; retval = register_chrdev_region(USB_DEVICE_DEV, USB_DEVICE_MAX, "usb_device"); if (retval) { printk(KERN_ERR "Unable to register minors for usb_device\n"); goto out; } cdev_init(&usb_device_cdev, &usbdev_file_operations); retval = cdev_add(&usb_device_cdev, USB_DEVICE_DEV, USB_DEVICE_MAX); if (retval) { printk(KERN_ERR "Unable to get usb_device major %d\n", USB_DEVICE_MAJOR); goto error_cdev; } usb_register_notify(&usbdev_nb); out: return retval; error_cdev: unregister_chrdev_region(USB_DEVICE_DEV, USB_DEVICE_MAX); goto out; } void usb_devio_cleanup(void) { usb_unregister_notify(&usbdev_nb); cdev_del(&usb_device_cdev); unregister_chrdev_region(USB_DEVICE_DEV, USB_DEVICE_MAX); }
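/*
 * Illustrative sketch (not part of devio.c above): a minimal user-space
 * program exercising the usbdevfs ioctl interface implemented by this file.
 * It issues a standard GET_DESCRIPTOR control transfer through
 * USBDEVFS_CONTROL, which is dispatched to proc_control()/do_proc_control()
 * above.  The device node path is a placeholder; substitute a real
 * bus/device number from /dev/bus/usb.  The node must be opened read-write
 * because usbdev_do_ioctl() rejects read-only opens with -EPERM.
 */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/usbdevice_fs.h>

int main(void)
{
	struct usbdevfs_ctrltransfer ctrl;
	unsigned char desc[18];	/* standard USB device descriptor */
	int fd, ret;

	/* Placeholder node: /dev/bus/usb/<busnum>/<devnum> of a real device */
	fd = open("/dev/bus/usb/001/002", O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&ctrl, 0, sizeof(ctrl));
	ctrl.bRequestType = 0x80;	/* IN | standard | device recipient */
	ctrl.bRequest = 0x06;		/* GET_DESCRIPTOR */
	ctrl.wValue = 0x0100;		/* device descriptor, index 0 */
	ctrl.wIndex = 0;
	ctrl.wLength = sizeof(desc);
	ctrl.timeout = 1000;		/* milliseconds */
	ctrl.data = desc;

	/* Handled by the USBDEVFS_CONTROL case in usbdev_do_ioctl() above */
	ret = ioctl(fd, USBDEVFS_CONTROL, &ctrl);
	if (ret < 0) {
		perror("USBDEVFS_CONTROL");
		close(fd);
		return 1;
	}

	/* idVendor and idProduct are little-endian at offsets 8 and 10 */
	printf("got %d bytes: idVendor=%04x idProduct=%04x\n", ret,
	       desc[8] | (desc[9] << 8), desc[10] | (desc[11] << 8));
	close(fd);
	return 0;
}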
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __KERNEL_PRINTK__ #define __KERNEL_PRINTK__ #include <linux/stdarg.h> #include <linux/init.h> #include <linux/kern_levels.h> #include <linux/linkage.h> #include <linux/ratelimit_types.h> #include <linux/once_lite.h> struct console; extern const char linux_banner[]; extern
const char linux_proc_banner[]; extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ #define PRINTK_MAX_SINGLE_HEADER_LEN 2 static inline int printk_get_level(const char *buffer) { if (buffer[0] == KERN_SOH_ASCII && buffer[1]) { switch (buffer[1]) { case '0' ... '7': case 'c': /* KERN_CONT */ return buffer[1]; } } return 0; } static inline const char *printk_skip_level(const char *buffer) { if (printk_get_level(buffer)) return buffer + 2; return buffer; } static inline const char *printk_skip_headers(const char *buffer) { while (printk_get_level(buffer)) buffer = printk_skip_level(buffer); return buffer; } /* printk's without a loglevel use this.. */ #define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT /* We show everything that is MORE important than this.. */ #define CONSOLE_LOGLEVEL_SILENT 0 /* Mum's the word */ #define CONSOLE_LOGLEVEL_MIN 1 /* Minimum loglevel we let people use */ #define CONSOLE_LOGLEVEL_DEBUG 10 /* issue debug messages */ #define CONSOLE_LOGLEVEL_MOTORMOUTH 15 /* You can't shut this one up */ /* * Default used to be hard-coded at 7, quiet used to be hardcoded at 4, * we're now allowing both to be set from kernel config. */ #define CONSOLE_LOGLEVEL_DEFAULT CONFIG_CONSOLE_LOGLEVEL_DEFAULT #define CONSOLE_LOGLEVEL_QUIET CONFIG_CONSOLE_LOGLEVEL_QUIET int match_devname_and_update_preferred_console(const char *match, const char *name, const short idx); extern int console_printk[]; #define console_loglevel (console_printk[0]) #define default_message_loglevel (console_printk[1]) #define minimum_console_loglevel (console_printk[2]) #define default_console_loglevel (console_printk[3]) extern void console_verbose(void); /* strlen("ratelimit") + 1 */ #define DEVKMSG_STR_MAX_SIZE 10 extern char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE]; extern int suppress_printk; struct va_format { const char *fmt; va_list *va; }; /* * FW_BUG * Add this to a message where you are sure the firmware is buggy or behaves * really stupid or out of spec. Be aware that the responsible BIOS developer * should be able to fix this issue or at least get a concrete idea of the * problem by reading your message without the need of looking at the kernel * code. * * Use it for definite and high priority BIOS bugs. * * FW_WARN * Use it for not that clear (e.g. could the kernel messed up things already?) * and medium priority BIOS bugs. * * FW_INFO * Use this one if you want to tell the user or vendor about something * suspicious, but generally harmless related to the firmware. * * Use it for information or very low priority BIOS bugs. */ #define FW_BUG "[Firmware Bug]: " #define FW_WARN "[Firmware Warn]: " #define FW_INFO "[Firmware Info]: " /* * HW_ERR * Add this to a message for hardware errors, so that user can report * it to hardware vendor instead of LKML or software vendor. */ #define HW_ERR "[Hardware Error]: " /* * DEPRECATED * Add this to a message whenever you want to warn user space about the use * of a deprecated aspect of an API so they can stop using it */ #define DEPRECATED "[Deprecated]: " /* * Dummy printk for disabled debugging statements to use whilst maintaining * gcc's format checking. */ #define no_printk(fmt, ...) \ ({ \ if (0) \ _printk(fmt, ##__VA_ARGS__); \ 0; \ }) #ifdef CONFIG_EARLY_PRINTK extern asmlinkage __printf(1, 2) void early_printk(const char *fmt, ...); #else static inline __printf(1, 2) __cold void early_printk(const char *s, ...) 
{ } #endif struct dev_printk_info; #ifdef CONFIG_PRINTK asmlinkage __printf(4, 0) int vprintk_emit(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args); asmlinkage __printf(1, 0) int vprintk(const char *fmt, va_list args); __printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); asmlinkage __printf(1, 2) __cold int _printk(const char *fmt, ...); /* * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ ! */ __printf(1, 2) __cold int _printk_deferred(const char *fmt, ...); extern void __printk_deferred_enter(void); extern void __printk_deferred_exit(void); extern void printk_force_console_enter(void); extern void printk_force_console_exit(void); /* * The printk_deferred_enter/exit macros are available only as a hack for * some code paths that need to defer all printk console printing. Interrupts * must be disabled for the deferred duration. */ #define printk_deferred_enter() __printk_deferred_enter() #define printk_deferred_exit() __printk_deferred_exit() /* * Please don't use printk_ratelimit(), because it shares ratelimiting state * with all other unrelated printk_ratelimit() callsites. Instead use * printk_ratelimited() or plain old __ratelimit(). */ extern int __printk_ratelimit(const char *func); #define printk_ratelimit() __printk_ratelimit(__func__) extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msec); extern int printk_delay_msec; extern int dmesg_restrict; extern void wake_up_klogd(void); char *log_buf_addr_get(void); u32 log_buf_len_get(void); void log_buf_vmcoreinfo_setup(void); void __init setup_log_buf(int early); __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...); void dump_stack_print_info(const char *log_lvl); void show_regs_print_info(const char *log_lvl); extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold; extern asmlinkage void dump_stack(void) __cold; void printk_trigger_flush(void); void console_try_replay_all(void); void printk_legacy_allow_panic_sync(void); extern bool nbcon_device_try_acquire(struct console *con); extern void nbcon_device_release(struct console *con); void nbcon_atomic_flush_unsafe(void); bool pr_flush(int timeout_ms, bool reset_on_progress); #else static inline __printf(1, 0) int vprintk(const char *s, va_list args) { return 0; } static inline __printf(1, 0) int vprintk_deferred(const char *fmt, va_list args) { return 0; } static inline __printf(1, 2) __cold int _printk(const char *s, ...) { return 0; } static inline __printf(1, 2) __cold int _printk_deferred(const char *s, ...) { return 0; } static inline void printk_deferred_enter(void) { } static inline void printk_deferred_exit(void) { } static inline void printk_force_console_enter(void) { } static inline void printk_force_console_exit(void) { } static inline int printk_ratelimit(void) { return 0; } static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msec) { return false; } static inline void wake_up_klogd(void) { } static inline char *log_buf_addr_get(void) { return NULL; } static inline u32 log_buf_len_get(void) { return 0; } static inline void log_buf_vmcoreinfo_setup(void) { } static inline void setup_log_buf(int early) { } static inline __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...) 
{ } static inline void dump_stack_print_info(const char *log_lvl) { } static inline void show_regs_print_info(const char *log_lvl) { } static inline void dump_stack_lvl(const char *log_lvl) { } static inline void dump_stack(void) { } static inline void printk_trigger_flush(void) { } static inline void console_try_replay_all(void) { } static inline void printk_legacy_allow_panic_sync(void) { } static inline bool nbcon_device_try_acquire(struct console *con) { return false; } static inline void nbcon_device_release(struct console *con) { } static inline void nbcon_atomic_flush_unsafe(void) { } static inline bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; } #endif #ifdef CONFIG_SMP extern int __printk_cpu_sync_try_get(void); extern void __printk_cpu_sync_wait(void); extern void __printk_cpu_sync_put(void); #else #define __printk_cpu_sync_try_get() true #define __printk_cpu_sync_wait() #define __printk_cpu_sync_put() #endif /* CONFIG_SMP */ /** * printk_cpu_sync_get_irqsave() - Disable interrupts and acquire the printk * cpu-reentrant spinning lock. * @flags: Stack-allocated storage for saving local interrupt state, * to be passed to printk_cpu_sync_put_irqrestore(). * * If the lock is owned by another CPU, spin until it becomes available. * Interrupts are restored while spinning. * * CAUTION: This function must be used carefully. It does not behave like a * typical lock. Here are important things to watch out for... * * * This function is reentrant on the same CPU. Therefore the calling * code must not assume exclusive access to data if code accessing the * data can run reentrant or within NMI context on the same CPU. * * * If there exists usage of this function from NMI context, it becomes * unsafe to perform any type of locking or spinning to wait for other * CPUs after calling this function from any context. This includes * using spinlocks or any other busy-waiting synchronization methods. */ #define printk_cpu_sync_get_irqsave(flags) \ for (;;) { \ local_irq_save(flags); \ if (__printk_cpu_sync_try_get()) \ break; \ local_irq_restore(flags); \ __printk_cpu_sync_wait(); \ } /** * printk_cpu_sync_put_irqrestore() - Release the printk cpu-reentrant spinning * lock and restore interrupts. * @flags: Caller's saved interrupt state, from printk_cpu_sync_get_irqsave(). */ #define printk_cpu_sync_put_irqrestore(flags) \ do { \ __printk_cpu_sync_put(); \ local_irq_restore(flags); \ } while (0) extern int kptr_restrict; /** * pr_fmt - used by the pr_*() macros to generate the printk format string * @fmt: format string passed from a pr_*() macro * * This macro can be used to generate a unified format string for pr_*() * macros. A common use is to prefix all pr_*() messages in a file with a common * string. For example, defining this at the top of a source file: * * #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt * * would prefix all pr_info, pr_emerg... messages in the file with the module * name. */ #ifndef pr_fmt #define pr_fmt(fmt) fmt #endif struct module; #ifdef CONFIG_PRINTK_INDEX struct pi_entry { const char *fmt; const char *func; const char *file; unsigned int line; /* * While printk and pr_* have the level stored in the string at compile * time, some subsystems dynamically add it at runtime through the * format string. For these dynamic cases, we allow the subsystem to * tell us the level at compile time. * * NULL indicates that the level, if any, is stored in fmt. 
*/ const char *level; /* * The format string used by various subsystem specific printk() * wrappers to prefix the message. * * Note that the static prefix defined by the pr_fmt() macro is stored * directly in the message format (@fmt), not here. */ const char *subsys_fmt_prefix; } __packed; #define __printk_index_emit(_fmt, _level, _subsys_fmt_prefix) \ do { \ if (__builtin_constant_p(_fmt) && __builtin_constant_p(_level)) { \ /* * We check __builtin_constant_p multiple times here * for the same input because GCC will produce an error * if we try to assign a static variable to fmt if it * is not a constant, even with the outer if statement. */ \ static const struct pi_entry _entry \ __used = { \ .fmt = __builtin_constant_p(_fmt) ? (_fmt) : NULL, \ .func = __func__, \ .file = __FILE__, \ .line = __LINE__, \ .level = __builtin_constant_p(_level) ? (_level) : NULL, \ .subsys_fmt_prefix = _subsys_fmt_prefix,\ }; \ static const struct pi_entry *_entry_ptr \ __used __section(".printk_index") = &_entry; \ } \ } while (0) #else /* !CONFIG_PRINTK_INDEX */ #define __printk_index_emit(...) do {} while (0) #endif /* CONFIG_PRINTK_INDEX */ /* * Some subsystems have their own custom printk that applies a va_format to a * generic format, for example, to include a device number or other metadata * alongside the format supplied by the caller. * * In order to store these in the way they would be emitted by the printk * infrastructure, the subsystem provides us with the start, fixed string, and * any subsequent text in the format string. * * We take a variable argument list as pr_fmt/dev_fmt/etc are sometimes passed * as multiple arguments (eg: `"%s: ", "blah"`), and we must only take the * first one. * * subsys_fmt_prefix must be known at compile time, or compilation will fail * (since this is a mistake). If fmt or level is not known at compile time, no * index entry will be made (since this can legitimately happen). */ #define printk_index_subsys_emit(subsys_fmt_prefix, level, fmt, ...) \ __printk_index_emit(fmt, level, subsys_fmt_prefix) #define printk_index_wrap(_p_func, _fmt, ...) \ ({ \ __printk_index_emit(_fmt, NULL, NULL); \ _p_func(_fmt, ##__VA_ARGS__); \ }) /** * printk - print a kernel message * @fmt: format string * * This is printk(). It can be called from any context. We want it to work. * * If printk indexing is enabled, _printk() is called from printk_index_wrap. * Otherwise, printk is simply #defined to _printk. * * We try to grab the console_lock. If we succeed, it's easy - we log the * output and call the console drivers. If we fail to get the semaphore, we * place the output into the log buffer and return. The current holder of * the console_sem will notice the new output in console_unlock(); and will * send it to the consoles before releasing the lock. * * One effect of this deferred printing is that code which calls printk() and * then changes console_loglevel may break. This is because console_loglevel * is inspected when the actual printing occurs. * * See also: * printf(3) * * See the vsnprintf() documentation for format string extensions over C99. */ #define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__) #define printk_deferred(fmt, ...) \ printk_index_wrap(_printk_deferred, fmt, ##__VA_ARGS__) /** * pr_emerg - Print an emergency-level message * @fmt: format string * @...: arguments for the format string * * This macro expands to a printk with KERN_EMERG loglevel. It uses pr_fmt() to * generate the format string. */ #define pr_emerg(fmt, ...) 
\ printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) /** * pr_alert - Print an alert-level message * @fmt: format string * @...: arguments for the format string * * This macro expands to a printk with KERN_ALERT loglevel. It uses pr_fmt() to * generate the format string. */ #define pr_alert(fmt, ...) \ printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) /** * pr_crit - Print a critical-level message * @fmt: format string * @...: arguments for the format string * * This macro expands to a printk with KERN_CRIT loglevel. It uses pr_fmt() to * generate the format string. */ #define pr_crit(fmt, ...) \ printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) /** * pr_err - Print an error-level message * @fmt: format string * @...: arguments for the format string * * This macro expands to a printk with KERN_ERR loglevel. It uses pr_fmt() to * generate the format string. */ #define pr_err(fmt, ...) \ printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) /** * pr_warn - Print a warning-level message * @fmt: format string * @...: arguments for the format string * * This macro expands to a printk with KERN_WARNING loglevel. It uses pr_fmt() * to generate the format string. */ #define pr_warn(fmt, ...) \ printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) /** * pr_notice - Print a notice-level message * @fmt: format string * @...: arguments for the format string * * This macro expands to a printk with KERN_NOTICE loglevel. It uses pr_fmt() to * generate the format string. */ #define pr_notice(fmt, ...) \ printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) /** * pr_info - Print an info-level message * @fmt: format string * @...: arguments for the format string * * This macro expands to a printk with KERN_INFO loglevel. It uses pr_fmt() to * generate the format string. */ #define pr_info(fmt, ...) \ printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) /** * pr_cont - Continues a previous log message in the same line. * @fmt: format string * @...: arguments for the format string * * This macro expands to a printk with KERN_CONT loglevel. It should only be * used when continuing a log message with no newline ('\n') enclosed. Otherwise * it defaults back to KERN_DEFAULT loglevel. */ #define pr_cont(fmt, ...) \ printk(KERN_CONT fmt, ##__VA_ARGS__) /** * pr_devel - Print a debug-level message conditionally * @fmt: format string * @...: arguments for the format string * * This macro expands to a printk with KERN_DEBUG loglevel if DEBUG is * defined. Otherwise it does nothing. * * It uses pr_fmt() to generate the format string. */ #ifdef DEBUG #define pr_devel(fmt, ...) \ printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_devel(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif /* If you are writing a driver, please use dev_dbg instead */ #if defined(CONFIG_DYNAMIC_DEBUG) || \ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #include <linux/dynamic_debug.h> /** * pr_debug - Print a debug-level message conditionally * @fmt: format string * @...: arguments for the format string * * This macro expands to dynamic_pr_debug() if CONFIG_DYNAMIC_DEBUG is * set. Otherwise, if DEBUG is defined, it's equivalent to a printk with * KERN_DEBUG loglevel. If DEBUG is not defined it does nothing. * * It uses pr_fmt() to generate the format string (dynamic_pr_debug() uses * pr_fmt() internally). */ #define pr_debug(fmt, ...) \ dynamic_pr_debug(fmt, ##__VA_ARGS__) #elif defined(DEBUG) #define pr_debug(fmt, ...) \ printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_debug(fmt, ...) 
\ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif /* * Print a one-time message (analogous to WARN_ONCE() et al): */ #ifdef CONFIG_PRINTK #define printk_once(fmt, ...) \ DO_ONCE_LITE(printk, fmt, ##__VA_ARGS__) #define printk_deferred_once(fmt, ...) \ DO_ONCE_LITE(printk_deferred, fmt, ##__VA_ARGS__) #else #define printk_once(fmt, ...) \ no_printk(fmt, ##__VA_ARGS__) #define printk_deferred_once(fmt, ...) \ no_printk(fmt, ##__VA_ARGS__) #endif #define pr_emerg_once(fmt, ...) \ printk_once(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) #define pr_alert_once(fmt, ...) \ printk_once(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) #define pr_crit_once(fmt, ...) \ printk_once(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) #define pr_err_once(fmt, ...) \ printk_once(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) #define pr_warn_once(fmt, ...) \ printk_once(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) #define pr_notice_once(fmt, ...) \ printk_once(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) #define pr_info_once(fmt, ...) \ printk_once(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) /* no pr_cont_once, don't do that... */ #if defined(DEBUG) #define pr_devel_once(fmt, ...) \ printk_once(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_devel_once(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif /* If you are writing a driver, please use dev_dbg instead */ #if defined(DEBUG) #define pr_debug_once(fmt, ...) \ printk_once(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_debug_once(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif /* * ratelimited messages with local ratelimit_state, * no local ratelimit_state used in the !PRINTK case */ #ifdef CONFIG_PRINTK #define printk_ratelimited(fmt, ...) \ ({ \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ \ if (__ratelimit(&_rs)) \ printk(fmt, ##__VA_ARGS__); \ }) #else #define printk_ratelimited(fmt, ...) \ no_printk(fmt, ##__VA_ARGS__) #endif #define pr_emerg_ratelimited(fmt, ...) \ printk_ratelimited(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) #define pr_alert_ratelimited(fmt, ...) \ printk_ratelimited(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) #define pr_crit_ratelimited(fmt, ...) \ printk_ratelimited(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) #define pr_err_ratelimited(fmt, ...) \ printk_ratelimited(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) #define pr_warn_ratelimited(fmt, ...) \ printk_ratelimited(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) #define pr_notice_ratelimited(fmt, ...) \ printk_ratelimited(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) #define pr_info_ratelimited(fmt, ...) \ printk_ratelimited(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) /* no pr_cont_ratelimited, don't do that... */ #if defined(DEBUG) #define pr_devel_ratelimited(fmt, ...) \ printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_devel_ratelimited(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif /* If you are writing a driver, please use dev_dbg instead */ #if defined(CONFIG_DYNAMIC_DEBUG) || \ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define pr_debug_ratelimited(fmt, ...) 
\ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, pr_fmt(fmt)); \ if (DYNAMIC_DEBUG_BRANCH(descriptor) && \ __ratelimit(&_rs)) \ __dynamic_pr_debug(&descriptor, pr_fmt(fmt), ##__VA_ARGS__); \ } while (0) #elif defined(DEBUG) #define pr_debug_ratelimited(fmt, ...) \ printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_debug_ratelimited(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif extern const struct file_operations kmsg_fops; enum { DUMP_PREFIX_NONE, DUMP_PREFIX_ADDRESS, DUMP_PREFIX_OFFSET }; extern int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, int groupsize, char *linebuf, size_t linebuflen, bool ascii); #ifdef CONFIG_PRINTK extern void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii); #else static inline void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii) { } static inline void print_hex_dump_bytes(const char *prefix_str, int prefix_type, const void *buf, size_t len) { } #endif #if defined(CONFIG_DYNAMIC_DEBUG) || \ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define print_hex_dump_debug(prefix_str, prefix_type, rowsize, \ groupsize, buf, len, ascii) \ dynamic_hex_dump(prefix_str, prefix_type, rowsize, \ groupsize, buf, len, ascii) #elif defined(DEBUG) #define print_hex_dump_debug(prefix_str, prefix_type, rowsize, \ groupsize, buf, len, ascii) \ print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, rowsize, \ groupsize, buf, len, ascii) #else static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii) { } #endif #if defined(DEBUG) #define print_hex_dump_devel(prefix_str, prefix_type, rowsize, \ groupsize, buf, len, ascii) \ print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, rowsize, \ groupsize, buf, len, ascii) #else static inline void print_hex_dump_devel(const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii) { } #endif /** * print_hex_dump_bytes - shorthand form of print_hex_dump() with default params * @prefix_str: string to prefix each line with; * caller supplies trailing spaces for alignment if desired * @prefix_type: controls whether prefix of an offset, address, or none * is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE) * @buf: data blob to dump * @len: number of bytes in the @buf * * Calls print_hex_dump(), with log level of KERN_DEBUG, * rowsize of 16, groupsize of 1, and ASCII output included. */ #define print_hex_dump_bytes(prefix_str, prefix_type, buf, len) \ print_hex_dump_debug(prefix_str, prefix_type, 16, 1, buf, len, true) #endif |
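A short, hedged usage sketch may help tie the macros above together: it shows how a source file typically defines pr_fmt() before any includes so that every pr_*() call picks up a common prefix, and where the _once and _ratelimited variants fit. The "frobnicator" module and its probe routine are illustrative assumptions, not part of printk.h.

/* Hedged sketch only: the driver name and probe flow are invented for
 * illustration; the macros used are the printk API documented above. */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt	/* must come before any #include */

#include <linux/errno.h>
#include <linux/module.h>
#include <linux/printk.h>

static int frobnicator_probe(int hw_rev)
{
	pr_info("probing hardware revision %d\n", hw_rev);

	if (hw_rev < 2)
		/* Printed at most once, no matter how often probe reruns. */
		pr_warn_once("revision %d is deprecated\n", hw_rev);

	if (hw_rev > 7) {
		/* Rate limited, so a storm of failing probes cannot flood the log. */
		pr_err_ratelimited("unsupported revision %d\n", hw_rev);
		return -ENODEV;
	}

	/* Compiled out (or routed through dynamic debug) unless debugging is on. */
	pr_debug("revision %d looks usable\n", hw_rev);
	return 0;
}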
| 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 | // SPDX-License-Identifier: GPL-2.0 /* * xfrm6_input.c: based on net/ipv4/xfrm4_input.c * * Authors: * Mitsuru KANDA @USAGI * Kazunori MIYAZAWA @USAGI * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * YOSHIFUJI Hideaki @USAGI * IPv6 support */ #include <linux/module.h> #include <linux/string.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <net/ipv6.h> #include <net/xfrm.h> #include <net/protocol.h> #include <net/gro.h> int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, struct ip6_tnl *t) { XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; XFRM_SPI_SKB_CB(skb)->family = AF_INET6; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); return xfrm_input(skb, nexthdr, spi, 0); } EXPORT_SYMBOL(xfrm6_rcv_spi); static int xfrm6_transport_finish2(struct net *net, struct sock *sk, struct sk_buff *skb) { if (xfrm_trans_queue(skb, ip6_rcv_finish)) { kfree_skb(skb); return NET_RX_DROP; } return 0; } int xfrm6_transport_finish(struct sk_buff *skb, int async) { struct xfrm_offload *xo = xfrm_offload(skb); struct net_device *dev = skb->dev; int nhlen = -skb_network_offset(skb); skb_network_header(skb)[IP6CB(skb)->nhoff] = XFRM_MODE_SKB_CB(skb)->protocol; #ifndef CONFIG_NETFILTER if (!async) return 1; #endif __skb_push(skb, nhlen); ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_postpush_rcsum(skb, skb_network_header(skb), nhlen); if (xo && (xo->flags & XFRM_GRO)) { /* The full l2 header needs to be preserved so that re-injecting the packet at l2 * works correctly in the presence of vlan tags. */ skb_mac_header_rebuild_full(skb, xo->orig_mac_len); skb_reset_network_header(skb); skb_reset_transport_header(skb); return 0; } NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, dev_net(dev), NULL, skb, dev, NULL, xfrm6_transport_finish2); if (async) dev_put(dev); return 0; } static int __xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb, bool pull) { struct udp_sock *up = udp_sk(sk); struct udphdr *uh; struct ipv6hdr *ip6h; int len; int ip6hlen = sizeof(struct ipv6hdr); __u8 *udpdata; __be32 *udpdata32; u16 encap_type; encap_type = READ_ONCE(up->encap_type); /* if this is not encapsulated socket, then just return now */ if (!encap_type) return 1; /* If this is a paged skb, make sure we pull up * whatever data we need to look at. 
*/ len = skb->len - sizeof(struct udphdr); if (!pskb_may_pull(skb, sizeof(struct udphdr) + min(len, 8))) return 1; /* Now we can get the pointers */ uh = udp_hdr(skb); udpdata = (__u8 *)uh + sizeof(struct udphdr); udpdata32 = (__be32 *)udpdata; switch (encap_type) { default: case UDP_ENCAP_ESPINUDP: /* Check if this is a keepalive packet. If so, eat it. */ if (len == 1 && udpdata[0] == 0xff) { return -EINVAL; } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0) { /* ESP Packet without Non-ESP header */ len = sizeof(struct udphdr); } else /* Must be an IKE packet.. pass it through */ return 1; break; } /* At this point we are sure that this is an ESPinUDP packet, * so we need to remove 'len' bytes from the packet (the UDP * header and optional ESP marker bytes) and then modify the * protocol to ESP, and then call into the transform receiver. */ if (skb_unclone(skb, GFP_ATOMIC)) return -EINVAL; /* Now we can update and verify the packet length... */ ip6h = ipv6_hdr(skb); ip6h->payload_len = htons(ntohs(ip6h->payload_len) - len); if (skb->len < ip6hlen + len) { /* packet is too small!?! */ return -EINVAL; } /* pull the data buffer up to the ESP header and set the * transport header to point to ESP. Keep UDP on the stack * for later. */ if (pull) { __skb_pull(skb, len); skb_reset_transport_header(skb); } else { skb_set_transport_header(skb, len); } /* process ESP */ return 0; } /* If it's a keepalive packet, then just eat it. * If it's an encapsulated packet, then pass it to the * IPsec xfrm input. * Returns 0 if skb passed to xfrm or was dropped. * Returns >0 if skb should be passed to UDP. * Returns <0 if skb should be resubmitted (-ret is protocol) */ int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) { int ret; if (skb->protocol == htons(ETH_P_IP)) return xfrm4_udp_encap_rcv(sk, skb); ret = __xfrm6_udp_encap_rcv(sk, skb, true); if (!ret) return xfrm6_rcv_encap(skb, IPPROTO_ESP, 0, udp_sk(sk)->encap_type); if (ret < 0) { kfree_skb(skb); return 0; } return ret; } struct sk_buff *xfrm6_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, struct sk_buff *skb) { int offset = skb_gro_offset(skb); const struct net_offload *ops; struct sk_buff *pp = NULL; int len, dlen; __u8 *udpdata; __be32 *udpdata32; if (skb->protocol == htons(ETH_P_IP)) return xfrm4_gro_udp_encap_rcv(sk, head, skb); len = skb->len - offset; dlen = offset + min(len, 8); udpdata = skb_gro_header(skb, dlen, offset); udpdata32 = (__be32 *)udpdata; if (unlikely(!udpdata)) return NULL; rcu_read_lock(); ops = rcu_dereference(inet6_offloads[IPPROTO_ESP]); if (!ops || !ops->callbacks.gro_receive) goto out; /* check if it is a keepalive or IKE packet */ if (len <= sizeof(struct ip_esp_hdr) || udpdata32[0] == 0) goto out; /* set the transport header to ESP */ skb_set_transport_header(skb, offset); NAPI_GRO_CB(skb)->proto = IPPROTO_UDP; pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); rcu_read_unlock(); return pp; out: rcu_read_unlock(); NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 1; return NULL; } int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t) { return xfrm6_rcv_spi(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], 0, t); } EXPORT_SYMBOL(xfrm6_rcv_tnl); int xfrm6_rcv(struct sk_buff *skb) { return xfrm6_rcv_tnl(skb, NULL); } EXPORT_SYMBOL(xfrm6_rcv); int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto) { struct net *net = dev_net(skb->dev); struct xfrm_state *x = NULL; struct sec_path *sp; int i = 0; sp = secpath_set(skb); if 
(!sp) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR); goto drop; } if (1 + sp->len == XFRM_MAX_DEPTH) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); goto drop; } for (i = 0; i < 3; i++) { xfrm_address_t *dst, *src; switch (i) { case 0: dst = daddr; src = saddr; break; case 1: /* lookup state with wild-card source address */ dst = daddr; src = (xfrm_address_t *)&in6addr_any; break; default: /* lookup state with wild-card addresses */ dst = (xfrm_address_t *)&in6addr_any; src = (xfrm_address_t *)&in6addr_any; break; } x = xfrm_state_lookup_byaddr(net, skb->mark, dst, src, proto, AF_INET6); if (!x) continue; if (unlikely(x->dir && x->dir != XFRM_SA_DIR_IN)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEDIRERROR); xfrm_state_put(x); x = NULL; continue; } spin_lock(&x->lock); if ((!i || (x->props.flags & XFRM_STATE_WILDRECV)) && likely(x->km.state == XFRM_STATE_VALID) && !xfrm_state_check_expire(x)) { spin_unlock(&x->lock); if (x->type->input(x, skb) > 0) { /* found a valid state */ break; } } else spin_unlock(&x->lock); xfrm_state_put(x); x = NULL; } if (!x) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES); xfrm_audit_state_notfound_simple(skb, AF_INET6); goto drop; } sp->xvec[sp->len++] = x; spin_lock(&x->lock); x->curlft.bytes += skb->len; x->curlft.packets++; spin_unlock(&x->lock); return 1; drop: return -1; } EXPORT_SYMBOL(xfrm6_input_addr); |
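The return-value convention documented above (0 = consumed by xfrm or dropped, >0 = deliver to UDP, <0 = resubmit) only comes into play once userspace has enabled ESP-in-UDP decapsulation on its NAT-traversal socket, which is what causes the kernel to call xfrm6_udp_encap_rcv() for that socket. Below is a hedged userspace sketch of that setup, the way an IKE daemon typically does it; the function name is an assumption, and the UDP_ENCAP* constants are provided as fallbacks (their values come from the UAPI linux/udp.h) in case libc headers do not expose them.

/* Hedged userspace sketch: enable ESP-in-UDP decapsulation on an IPv6
 * IKE NAT-traversal socket so the kernel path above handles ESP frames. */
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef UDP_ENCAP
#define UDP_ENCAP		100	/* from uapi linux/udp.h */
#endif
#ifndef UDP_ENCAP_ESPINUDP
#define UDP_ENCAP_ESPINUDP	2
#endif

int open_nat_t_socket(void)
{
	struct sockaddr_in6 addr = {
		.sin6_family = AF_INET6,
		.sin6_port   = htons(4500),	/* IKE NAT-traversal port */
		.sin6_addr   = IN6ADDR_ANY_INIT,
	};
	int encap = UDP_ENCAP_ESPINUDP;
	int fd;

	fd = socket(AF_INET6, SOCK_DGRAM, 0);
	if (fd < 0)
		return -1;

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    setsockopt(fd, IPPROTO_UDP, UDP_ENCAP, &encap, sizeof(encap)) < 0) {
		close(fd);
		return -1;
	}

	/* From here on, ESP-in-UDP packets on this socket are consumed by the
	 * kernel xfrm input path; IKE packets still reach userspace (the
	 * ">0: pass to UDP" case described above). */
	return fd;
}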
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_SMP_H #define __LINUX_SMP_H /* * Generic SMP support * Alan Cox. <alan@redhat.com> */ #include <linux/errno.h> #include <linux/types.h> #include <linux/list.h> #include <linux/cpumask.h> #include <linux/init.h> #include <linux/smp_types.h> typedef void (*smp_call_func_t)(void *info); typedef bool (*smp_cond_func_t)(int cpu, void *info); /* * structure shares (partial) layout with struct irq_work */ struct __call_single_data { struct __call_single_node node; smp_call_func_t func; void *info; }; #define CSD_INIT(_func, _info) \ (struct __call_single_data){ .func = (_func), .info = (_info), } /* Use __aligned() to avoid to use 2 cache lines for 1 csd */ typedef struct __call_single_data call_single_data_t __aligned(sizeof(struct __call_single_data)); #define INIT_CSD(_csd, _func, _info) \ do { \ *(_csd) = CSD_INIT((_func), (_info)); \ } while (0) /* * Enqueue a llist_node on the call_single_queue; be very careful, read * flush_smp_call_function_queue() in detail. */ extern void __smp_call_single_queue(int cpu, struct llist_node *node); /* total number of cpus in this system (may exceed NR_CPUS) */ extern unsigned int total_cpus; int smp_call_function_single(int cpuid, smp_call_func_t func, void *info, int wait); void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, void *info, bool wait, const struct cpumask *mask); int smp_call_function_single_async(int cpu, call_single_data_t *csd); /* * Cpus stopping functions in panic. All have default weak definitions. * Architecture-dependent code may override them. */ void __noreturn panic_smp_self_stop(void); void __noreturn nmi_panic_self_stop(struct pt_regs *regs); void crash_smp_send_stop(void); int panic_smp_redirect_cpu(int target_cpu, void *msg); /* * Call a function on all processors */ static inline void on_each_cpu(smp_call_func_t func, void *info, int wait) { on_each_cpu_cond_mask(NULL, func, info, wait, cpu_online_mask); } /** * on_each_cpu_mask() - Run a function on processors specified by * cpumask, which may include the local processor. * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait (atomically) until function has completed * on other CPUs. 
* * If @wait is true, then returns once @func has returned. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. The * exception is that it may be used during early boot while * early_boot_irqs_disabled is set. */ static inline void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) { on_each_cpu_cond_mask(NULL, func, info, wait, mask); } /* * Call a function on each processor for which the supplied function * cond_func returns a positive value. This may include the local * processor. May be used during early boot while early_boot_irqs_disabled is * set. Use local_irq_save/restore() instead of local_irq_disable/enable(). */ static inline void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func, void *info, bool wait) { on_each_cpu_cond_mask(cond_func, func, info, wait, cpu_online_mask); } /* * Architecture specific boot CPU setup. Defined as empty weak function in * init/main.c. Architectures can override it. */ void __init smp_prepare_boot_cpu(void); #ifdef CONFIG_SMP #include <linux/preempt.h> #include <linux/compiler.h> #include <linux/thread_info.h> #include <asm/smp.h> /* * main cross-CPU interfaces, handles INIT, TLB flush, STOP, etc. * (defined in asm header): */ /* * stops all CPUs but the current one: */ extern void smp_send_stop(void); /* * sends a 'reschedule' event to another CPU: */ extern void arch_smp_send_reschedule(int cpu); /* * scheduler_ipi() is inline so can't be passed as callback reason, but the * callsite IP should be sufficient for root-causing IPIs sent from here. */ #define smp_send_reschedule(cpu) ({ \ trace_ipi_send_cpu(cpu, _RET_IP_, NULL); \ arch_smp_send_reschedule(cpu); \ }) /* * Prepare machine for booting other CPUs. 
*/ extern void smp_prepare_cpus(unsigned int max_cpus); /* * Bring a CPU up */ extern int __cpu_up(unsigned int cpunum, struct task_struct *tidle); /* * Final polishing of CPUs */ extern void smp_cpus_done(unsigned int max_cpus); /* * Call a function on all other processors */ void smp_call_function(smp_call_func_t func, void *info, int wait); void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait); int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait); void kick_all_cpus_sync(void); void wake_up_all_idle_cpus(void); bool cpus_peek_for_pending_ipi(const struct cpumask *mask); /* * Generic and arch helpers */ void __init call_function_init(void); void generic_smp_call_function_single_interrupt(void); #define generic_smp_call_function_interrupt \ generic_smp_call_function_single_interrupt extern unsigned int setup_max_cpus; extern void __init setup_nr_cpu_ids(void); extern void __init smp_init(void); extern int __boot_cpu_id; static inline int get_boot_cpu_id(void) { return __boot_cpu_id; } #else /* !SMP */ static inline void smp_send_stop(void) { } /* * These macros fold the SMP functionality into a single CPU system */ #define raw_smp_processor_id() 0 static inline void up_smp_call_function(smp_call_func_t func, void *info) { } #define smp_call_function(func, info, wait) \ (up_smp_call_function(func, info)) static inline void smp_send_reschedule(int cpu) { } #define smp_call_function_many(mask, func, info, wait) \ (up_smp_call_function(func, info)) static inline void call_function_init(void) { } static inline int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait) { return smp_call_function_single(0, func, info, wait); } static inline void kick_all_cpus_sync(void) { } static inline void wake_up_all_idle_cpus(void) { } static inline bool cpus_peek_for_pending_ipi(const struct cpumask *mask) { return false; } #define setup_max_cpus 0 #ifdef CONFIG_UP_LATE_INIT extern void __init up_late_init(void); static __always_inline void smp_init(void) { up_late_init(); } #else static inline void smp_init(void) { } #endif static inline int get_boot_cpu_id(void) { return 0; } #endif /* !SMP */ /* * raw_smp_processor_id() - get the current (unstable) CPU id * * raw_smp_processor_id() is arch-specific/arch-defined and * may be a macro or a static inline function. * * For when you know what you are doing and need an unstable * CPU id. */ /* * Allow the architecture to differentiate between a stable and unstable read. * For example, x86 uses an IRQ-safe asm-volatile read for the unstable but a * regular asm read for the stable. */ #ifndef __smp_processor_id #define __smp_processor_id() raw_smp_processor_id() #endif #ifdef CONFIG_DEBUG_PREEMPT extern unsigned int debug_smp_processor_id(void); # define smp_processor_id() debug_smp_processor_id() #else /** * smp_processor_id() - get the current (stable) CPU id * * This is the normal accessor to the CPU id and should be used * whenever possible. * * The CPU id is stable when: * * - IRQs are disabled; * - preemption is disabled; * - the task is CPU affine. * * When CONFIG_DEBUG_PREEMPT=y, we verify these assumptions and WARN * when smp_processor_id() is used when the CPU id is not stable. 
*/ # define smp_processor_id() __smp_processor_id() #endif #define get_cpu() ({ preempt_disable(); __smp_processor_id(); }) #define put_cpu() preempt_enable() /* * Callback to arch code if there's nosmp or maxcpus=0 on the * boot command line: */ extern void arch_disable_smp_support(void); extern void arch_thaw_secondary_cpus_begin(void); extern void arch_thaw_secondary_cpus_end(void); void smp_setup_processor_id(void); int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys); /* SMP core functions */ int smpcfd_prepare_cpu(unsigned int cpu); int smpcfd_dead_cpu(unsigned int cpu); int smpcfd_dying_cpu(unsigned int cpu); #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG bool csd_lock_is_stuck(void); #else static inline bool csd_lock_is_stuck(void) { return false; } #endif #endif /* __LINUX_SMP_H */ |
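A hedged usage sketch of the cross-call helpers declared above follows. The per-CPU counter and the "poke" callback are illustrative; the points it demonstrates are that IPI callbacks must be fast and non-blocking, and that get_cpu()/put_cpu() pin the task so the CPU id stays stable while it is used.

/* Hedged sketch only: names are invented, the API calls are those above. */
#include <linux/smp.h>
#include <linux/percpu.h>
#include <linux/printk.h>

static DEFINE_PER_CPU(unsigned long, poke_count);

/* Runs in IPI context on each target CPU: must be fast and non-blocking. */
static void poke_cpu(void *info)
{
	this_cpu_inc(poke_count);
}

static void poke_everybody(void)
{
	int cpu;

	/* Run on every online CPU, including this one, and wait for completion. */
	on_each_cpu(poke_cpu, NULL, 1);

	/* Run on one specific CPU only. */
	cpu = cpumask_first(cpu_online_mask);
	smp_call_function_single(cpu, poke_cpu, NULL, 1);

	/* Pin to the current CPU so the id printed below cannot change. */
	cpu = get_cpu();
	pr_info("poked from CPU %d, local count %lu\n",
		cpu, this_cpu_read(poke_count));
	put_cpu();
}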
| 61 70 29 29 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 | // SPDX-License-Identifier: GPL-2.0 #include <linux/compiler.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/fault-inject-usercopy.h> #include <linux/instrumented.h> #include <linux/kernel.h> #include <linux/nospec.h> #include <linux/string.h> #include <linux/uaccess.h> #include <linux/wordpart.h> /* out-of-line parts */ #if !defined(INLINE_COPY_FROM_USER) unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n) { return _inline_copy_from_user(to, from, n); } EXPORT_SYMBOL(_copy_from_user); #endif #if !defined(INLINE_COPY_TO_USER) unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) { return _inline_copy_to_user(to, from, n); } EXPORT_SYMBOL(_copy_to_user); #endif /** * check_zeroed_user: check if a userspace buffer only contains zero bytes * @from: Source address, in userspace. * @size: Size of buffer. * * This is effectively shorthand for "memchr_inv(from, 0, size) == NULL" for * userspace addresses (and is more efficient because we don't care where the * first non-zero byte is). * * Returns: * * 0: There were non-zero bytes present in the buffer. * * 1: The buffer was full of zero bytes. * * -EFAULT: access to userspace failed. */ int check_zeroed_user(const void __user *from, size_t size) { unsigned long val; uintptr_t align = (uintptr_t) from % sizeof(unsigned long); if (unlikely(size == 0)) return 1; from -= align; size += align; if (!user_read_access_begin(from, size)) return -EFAULT; unsafe_get_user(val, (unsigned long __user *) from, err_fault); if (align) val &= ~aligned_byte_mask(align); while (size > sizeof(unsigned long)) { if (unlikely(val)) goto done; from += sizeof(unsigned long); size -= sizeof(unsigned long); unsafe_get_user(val, (unsigned long __user *) from, err_fault); } if (size < sizeof(unsigned long)) val &= aligned_byte_mask(size); done: user_read_access_end(); return (val == 0); err_fault: user_read_access_end(); return -EFAULT; } EXPORT_SYMBOL(check_zeroed_user); |
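check_zeroed_user() is typically used to validate the unknown tail of an extensible syscall argument struct: older kernels must reject a larger struct from newer userspace unless every byte they do not understand is zero (the in-tree copy_struct_from_user() helper wraps this same pattern). The sketch below is hedged: "struct foo_args" and copy_foo_args() are illustrative assumptions.

/* Hedged sketch of the extensible-struct pattern built on check_zeroed_user(). */
#include <linux/errno.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/uaccess.h>

struct foo_args {
	__u64 flags;
	__u64 addr;
	__u64 len;		/* fields may be appended in later ABI versions */
};

static int copy_foo_args(struct foo_args *karg, const void __user *uarg,
			 size_t usize)
{
	int ret;

	if (usize < offsetofend(struct foo_args, flags))
		return -EINVAL;		/* too small to ever be valid */

	if (usize > sizeof(*karg)) {
		/* Newer userspace: the part we don't know about must be zero. */
		ret = check_zeroed_user(uarg + sizeof(*karg),
					usize - sizeof(*karg));
		if (ret < 0)
			return ret;	/* -EFAULT */
		if (ret == 0)
			return -E2BIG;	/* unknown fields were non-zero */
		usize = sizeof(*karg);
	}

	memset(karg, 0, sizeof(*karg));
	return copy_from_user(karg, uarg, usize) ? -EFAULT : 0;
}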
| 3 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TIME64_H #define _LINUX_TIME64_H #include <linux/math64.h> #include <vdso/time64.h> typedef __s64 time64_t; typedef __u64 timeu64_t; #include <uapi/linux/time.h> struct timespec64 { time64_t tv_sec; /* seconds */ long tv_nsec; /* nanoseconds */ }; struct itimerspec64 { struct timespec64 it_interval; struct timespec64 it_value; }; /* Parameters used to convert the timespec values: */ #define PSEC_PER_NSEC 1000L /* Located here for timespec[64]_valid_strict */ #define TIME64_MAX ((s64)~((u64)1 << 63)) #define TIME64_MIN (-TIME64_MAX - 1) #define KTIME_MAX ((s64)~((u64)1 << 63)) #define KTIME_MIN (-KTIME_MAX - 1) #define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) #define KTIME_SEC_MIN (KTIME_MIN / NSEC_PER_SEC) /* * Limits for settimeofday(): * * To prevent setting the time close to the wraparound point time setting * is limited so a reasonable uptime can be accomodated. Uptime of 30 years * should be really sufficient, which means the cutoff is 2232. At that * point the cutoff is just a small part of the larger problem. */ #define TIME_UPTIME_SEC_MAX (30LL * 365 * 24 *3600) #define TIME_SETTOD_SEC_MAX (KTIME_SEC_MAX - TIME_UPTIME_SEC_MAX) static inline int timespec64_equal(const struct timespec64 *a, const struct timespec64 *b) { return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec); } static inline bool timespec64_is_epoch(const struct timespec64 *ts) { return ts->tv_sec == 0 && ts->tv_nsec == 0; } /* * lhs < rhs: return <0 * lhs == rhs: return 0 * lhs > rhs: return >0 */ static inline int timespec64_compare(const struct timespec64 *lhs, const struct timespec64 *rhs) { if (lhs->tv_sec < rhs->tv_sec) return -1; if (lhs->tv_sec > rhs->tv_sec) return 1; return lhs->tv_nsec - rhs->tv_nsec; } extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec); static inline struct timespec64 timespec64_add(struct timespec64 lhs, struct timespec64 rhs) { struct timespec64 ts_delta; set_normalized_timespec64(&ts_delta, lhs.tv_sec + rhs.tv_sec, lhs.tv_nsec + rhs.tv_nsec); return ts_delta; } /* * sub = lhs - rhs, in normalized form */ static inline struct timespec64 timespec64_sub(struct timespec64 lhs, struct timespec64 rhs) { struct timespec64 ts_delta; set_normalized_timespec64(&ts_delta, lhs.tv_sec - rhs.tv_sec, lhs.tv_nsec - rhs.tv_nsec); return ts_delta; } /* * Returns true if the timespec64 is norm, false if denorm: */ static inline bool timespec64_valid(const struct timespec64 *ts) { /* Dates before 1970 are bogus */ if (ts->tv_sec < 0) return false; /* Can't have more nanoseconds then a second */ if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return false; return true; } static inline bool timespec64_valid_strict(const struct timespec64 *ts) { if (!timespec64_valid(ts)) return false; /* Disallow values that could overflow ktime_t */ if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) return false; return true; } static 
inline bool timespec64_valid_settod(const struct timespec64 *ts) { if (!timespec64_valid(ts)) return false; /* Disallow values which cause overflow issues vs. CLOCK_REALTIME */ if ((unsigned long long)ts->tv_sec >= TIME_SETTOD_SEC_MAX) return false; return true; } /** * timespec64_to_ns - Convert timespec64 to nanoseconds * @ts: pointer to the timespec64 variable to be converted * * Returns the scalar nanosecond representation of the timespec64 * parameter. */ static inline s64 timespec64_to_ns(const struct timespec64 *ts) { /* Prevent multiplication overflow / underflow */ if (ts->tv_sec >= KTIME_SEC_MAX) return KTIME_MAX; if (ts->tv_sec <= KTIME_SEC_MIN) return KTIME_MIN; return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; } /** * ns_to_timespec64 - Convert nanoseconds to timespec64 * @nsec: the nanoseconds value to be converted * * Returns the timespec64 representation of the nsec parameter. */ extern struct timespec64 ns_to_timespec64(s64 nsec); /** * timespec64_add_ns - Adds nanoseconds to a timespec64 * @a: pointer to timespec64 to be incremented * @ns: unsigned nanoseconds value to be added * * This must always be inlined because its used from the x86-64 vdso, * which cannot call other kernel functions. */ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns) { a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns); a->tv_nsec = ns; } /* * timespec64_add_safe assumes both values are positive and checks for * overflow. It will return TIME64_MAX in case of overflow. */ extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs, const struct timespec64 rhs); #endif /* _LINUX_TIME64_H */ |
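As a hedged illustration of the helpers above, the sketch below times a critical section with the monotonic clock and converts the normalized difference to nanoseconds. The wrapper function and the callback it times are assumptions; ktime_get_ts64() comes from linux/timekeeping.h.

/* Hedged sketch: measure an interval with timespec64 helpers. */
#include <linux/printk.h>
#include <linux/time64.h>
#include <linux/timekeeping.h>

static void time_some_work(void (*work)(void))
{
	struct timespec64 start, end, delta;

	ktime_get_ts64(&start);		/* monotonic clock as timespec64 */
	work();
	ktime_get_ts64(&end);

	/* Normalized subtraction: tv_nsec stays within [0, NSEC_PER_SEC). */
	delta = timespec64_sub(end, start);

	pr_info("work took %lld.%09ld s (%lld ns)\n",
		(long long)delta.tv_sec, delta.tv_nsec,
		(long long)timespec64_to_ns(&delta));
}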
| 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Definitions for key type implementations * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _LINUX_KEY_TYPE_H #define _LINUX_KEY_TYPE_H #include <linux/key.h> #include <linux/errno.h> #ifdef CONFIG_KEYS struct kernel_pkey_query; struct kernel_pkey_params; /* * Pre-parsed payload, used by key add, update and instantiate. * * This struct will be cleared and data and datalen will be set with the data * and length parameters from the caller and quotalen will be set from * def_datalen from the key type. Then if the preparse() op is provided by the * key type, that will be called. Then the struct will be passed to the * instantiate() or the update() op. * * If the preparse() op is given, the free_preparse() op will be called to * clear the contents. */ struct key_preparsed_payload { const char *orig_description; /* Actual or proposed description (maybe NULL) */ char *description; /* Proposed key description (or NULL) */ union key_payload payload; /* Proposed payload */ const void *data; /* Raw data */ size_t datalen; /* Raw datalen */ size_t quotalen; /* Quota length for proposed payload */ time64_t expiry; /* Expiry time of key */ } __randomize_layout; typedef int (*request_key_actor_t)(struct key *auth_key, void *aux); /* * Preparsed matching criterion. */ struct key_match_data { /* Comparison function, defaults to exact description match, but can be * overridden by type->match_preparse(). Should return true if a match * is found and false if not. */ bool (*cmp)(const struct key *key, const struct key_match_data *match_data); const void *raw_data; /* Raw match data */ void *preparsed; /* For ->match_preparse() to stash stuff */ unsigned lookup_type; /* Type of lookup for this search. */ #define KEYRING_SEARCH_LOOKUP_DIRECT 0x0000 /* Direct lookup by description. */ #define KEYRING_SEARCH_LOOKUP_ITERATE 0x0001 /* Iterative search. */ }; /* * kernel managed key type definition */ struct key_type { /* name of the type */ const char *name; /* default payload length for quota precalculation (optional) * - this can be used instead of calling key_payload_reserve(), that * function only needs to be called if the real datalen is different */ size_t def_datalen; unsigned int flags; #define KEY_TYPE_NET_DOMAIN 0x00000001 /* Keys of this type have a net namespace domain */ #define KEY_TYPE_INSTANT_REAP 0x00000002 /* Keys of this type don't have a delay after expiring */ /* vet a description */ int (*vet_description)(const char *description); /* Preparse the data blob from userspace that is to be the payload, * generating a proposed description and payload that will be handed to * the instantiate() and update() ops. */ int (*preparse)(struct key_preparsed_payload *prep); /* Free a preparse data structure. 
*/ void (*free_preparse)(struct key_preparsed_payload *prep); /* instantiate a key of this type * - this method should call key_payload_reserve() to determine if the * user's quota will hold the payload */ int (*instantiate)(struct key *key, struct key_preparsed_payload *prep); /* update a key of this type (optional) * - this method should call key_payload_reserve() to recalculate the * quota consumption * - the key must be locked against read when modifying */ int (*update)(struct key *key, struct key_preparsed_payload *prep); /* Preparse the data supplied to ->match() (optional). The * data to be preparsed can be found in match_data->raw_data. * The lookup type can also be set by this function. */ int (*match_preparse)(struct key_match_data *match_data); /* * Free preparsed match data (optional). This should be supplied if * ->match_preparse() is supplied. */ void (*match_free)(struct key_match_data *match_data); /* * Clear some of the data from a key on revocation (optional). * - the key's semaphore will be write-locked by the caller */ void (*revoke)(struct key *key); /* clear the data from a key (optional) */ void (*destroy)(struct key *key); /* describe a key */ void (*describe)(const struct key *key, struct seq_file *p); /* read a key's data (optional) * - permission checks will be done by the caller * - the key's semaphore will be readlocked by the caller * - should return the amount of data that could be read, no matter how * much is copied into the buffer * - shouldn't do the copy if the buffer is NULL */ long (*read)(const struct key *key, char *buffer, size_t buflen); /* handle request_key() for this type instead of invoking * /sbin/request-key (optional) * - key is the key to instantiate * - authkey is the authority to assume when instantiating this key * - op is the operation to be done, usually "create" * - the call must not return until the instantiation process has run * its course */ request_key_actor_t request_key; /* Look up a keyring access restriction (optional) * * - NULL is a valid return value (meaning the requested restriction * is known but will never block addition of a key) * - should return -EINVAL if the restriction is unknown */ struct key_restriction *(*lookup_restriction)(const char *params); /* Asymmetric key accessor functions. 
*/ int (*asym_query)(const struct kernel_pkey_params *params, struct kernel_pkey_query *info); int (*asym_eds_op)(struct kernel_pkey_params *params, const void *in, void *out); int (*asym_verify_signature)(struct kernel_pkey_params *params, const void *in, const void *in2); /* internal fields */ struct list_head link; /* link in types list */ struct lock_class_key lock_class; /* key->sem lock class */ } __randomize_layout; extern struct key_type key_type_keyring; extern int register_key_type(struct key_type *ktype); extern void unregister_key_type(struct key_type *ktype); extern int key_payload_reserve(struct key *key, size_t datalen); extern int key_instantiate_and_link(struct key *key, const void *data, size_t datalen, struct key *keyring, struct key *authkey); extern int key_reject_and_link(struct key *key, unsigned timeout, unsigned error, struct key *keyring, struct key *authkey); extern void complete_request_key(struct key *authkey, int error); static inline int key_negate_and_link(struct key *key, unsigned timeout, struct key *keyring, struct key *authkey) { return key_reject_and_link(key, timeout, ENOKEY, keyring, authkey); } extern int generic_key_instantiate(struct key *key, struct key_preparsed_payload *prep); #endif /* CONFIG_KEYS */ #endif /* _LINUX_KEY_TYPE_H */ |
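A hedged sketch of a minimal key type built on the ops above follows. The "blob" type simply copies the user-supplied payload into kernel memory; error handling and locking are trimmed to the essentials and all names are illustrative, but the instantiate/destroy/describe division of labour and the key_payload_reserve() quota call are the ones documented above.

/* Hedged sketch: a minimal key_type that stores an opaque payload. */
#include <linux/err.h>
#include <linux/key-type.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/string.h>

static int blob_instantiate(struct key *key, struct key_preparsed_payload *prep)
{
	void *copy;
	int ret;

	if (!prep->data || !prep->datalen)
		return -EINVAL;

	/* Charge the payload against the owner's key quota first. */
	ret = key_payload_reserve(key, prep->datalen);
	if (ret < 0)
		return ret;

	copy = kmemdup(prep->data, prep->datalen, GFP_KERNEL);
	if (!copy)
		return -ENOMEM;

	key->payload.data[0] = copy;
	return 0;
}

static void blob_destroy(struct key *key)
{
	kfree(key->payload.data[0]);
}

static void blob_describe(const struct key *key, struct seq_file *m)
{
	seq_puts(m, key->description);
}

static struct key_type key_type_blob = {
	.name		= "blob",
	.instantiate	= blob_instantiate,
	.destroy	= blob_destroy,
	.describe	= blob_describe,
};

/* register_key_type(&key_type_blob) at module init,
 * unregister_key_type(&key_type_blob) on exit. */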
| 3 3 3 3 3 3 3 3 3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 | // SPDX-License-Identifier: GPL-2.0-only /* * Link physical devices with ACPI devices support * * Copyright (c) 2005 David Shaohua Li <shaohua.li@intel.com> * Copyright (c) 2005 Intel Corp. */ #define pr_fmt(fmt) "ACPI: " fmt #include <linux/acpi_iort.h> #include <linux/export.h> #include <linux/init.h> #include <linux/list.h> #include <linux/device.h> #include <linux/slab.h> #include <linux/rwsem.h> #include <linux/acpi.h> #include <linux/dma-mapping.h> #include <linux/pci.h> #include <linux/pci-acpi.h> #include <linux/platform_device.h> #include "internal.h" static LIST_HEAD(bus_type_list); static DECLARE_RWSEM(bus_type_sem); #define PHYSICAL_NODE_STRING "physical_node" #define PHYSICAL_NODE_NAME_SIZE (sizeof(PHYSICAL_NODE_STRING) + 10) int register_acpi_bus_type(struct acpi_bus_type *type) { if (acpi_disabled) return -ENODEV; if (type && type->match && type->find_companion) { down_write(&bus_type_sem); list_add_tail(&type->list, &bus_type_list); up_write(&bus_type_sem); pr_info("bus type %s registered\n", type->name); return 0; } return -ENODEV; } EXPORT_SYMBOL_GPL(register_acpi_bus_type); int unregister_acpi_bus_type(struct acpi_bus_type *type) { if (acpi_disabled) return 0; if (type) { down_write(&bus_type_sem); list_del_init(&type->list); up_write(&bus_type_sem); pr_info("bus type %s unregistered\n", type->name); return 0; } return -ENODEV; } EXPORT_SYMBOL_GPL(unregister_acpi_bus_type); static struct acpi_bus_type *acpi_get_bus_type(struct device *dev) { struct acpi_bus_type *tmp, *ret = NULL; down_read(&bus_type_sem); list_for_each_entry(tmp, &bus_type_list, list) { if (tmp->match(dev)) { ret = tmp; break; } } up_read(&bus_type_sem); return ret; } #define FIND_CHILD_MIN_SCORE 1 #define FIND_CHILD_MID_SCORE 2 #define FIND_CHILD_MAX_SCORE 3 static int match_any(struct acpi_device *adev, void *not_used) { return 1; } static bool acpi_dev_has_children(struct acpi_device *adev) { return acpi_dev_for_each_child(adev, match_any, NULL) > 0; 
} static int find_child_checks(struct acpi_device *adev, bool check_children) { unsigned long long sta; acpi_status status; if (check_children && !acpi_dev_has_children(adev)) return -ENODEV; status = acpi_evaluate_integer(adev->handle, "_STA", NULL, &sta); if (status == AE_NOT_FOUND) { /* * Special case: backlight device objects without _STA are * preferred to other objects with the same _ADR value, because * it is more likely that they are actually useful. */ if (adev->pnp.type.backlight) return FIND_CHILD_MID_SCORE; return FIND_CHILD_MIN_SCORE; } if (ACPI_FAILURE(status) || !(sta & ACPI_STA_DEVICE_ENABLED)) return -ENODEV; /* * If the device has a _HID returning a valid ACPI/PNP device ID, it is * better to make it look less attractive here, so that the other device * with the same _ADR value (that may not have a valid device ID) can be * matched going forward. [This means a second spec violation in a row, * so whatever we do here is best effort anyway.] */ if (adev->pnp.type.platform_id) return FIND_CHILD_MIN_SCORE; return FIND_CHILD_MAX_SCORE; } struct find_child_walk_data { struct acpi_device *adev; u64 address; int score; bool check_sta; bool check_children; }; static int check_one_child(struct acpi_device *adev, void *data) { struct find_child_walk_data *wd = data; int score; if (!adev->pnp.type.bus_address || acpi_device_adr(adev) != wd->address) return 0; if (!wd->adev) { /* * This is the first matching object, so save it. If it is not * necessary to look for any other matching objects, stop the * search. */ wd->adev = adev; return !(wd->check_sta || wd->check_children); } /* * There is more than one matching device object with the same _ADR * value. That really is unexpected, so we are kind of beyond the scope * of the spec here. We have to choose which one to return, though. * * First, get the score for the previously found object and terminate * the walk if it is maximum. */ if (!wd->score) { score = find_child_checks(wd->adev, wd->check_children); if (score == FIND_CHILD_MAX_SCORE) return 1; wd->score = score; } /* * Second, if the object that has just been found has a better score, * replace the previously found one with it and terminate the walk if * the new score is maximum. */ score = find_child_checks(adev, wd->check_children); if (score > wd->score) { wd->adev = adev; if (score == FIND_CHILD_MAX_SCORE) return 1; wd->score = score; } /* Continue, because there may be better matches. 
*/ return 0; } static struct acpi_device *acpi_find_child(struct acpi_device *parent, u64 address, bool check_children, bool check_sta) { struct find_child_walk_data wd = { .address = address, .check_children = check_children, .check_sta = check_sta, .adev = NULL, .score = 0, }; if (parent) acpi_dev_for_each_child(parent, check_one_child, &wd); return wd.adev; } struct acpi_device *acpi_find_child_device(struct acpi_device *parent, u64 address, bool check_children) { return acpi_find_child(parent, address, check_children, true); } EXPORT_SYMBOL_GPL(acpi_find_child_device); struct acpi_device *acpi_find_child_by_adr(struct acpi_device *adev, acpi_bus_address adr) { return acpi_find_child(adev, adr, false, false); } EXPORT_SYMBOL_GPL(acpi_find_child_by_adr); static void acpi_physnode_link_name(char *buf, unsigned int node_id) { if (node_id > 0) snprintf(buf, PHYSICAL_NODE_NAME_SIZE, PHYSICAL_NODE_STRING "%u", node_id); else strcpy(buf, PHYSICAL_NODE_STRING); } int acpi_bind_one(struct device *dev, struct acpi_device *acpi_dev) { struct acpi_device_physical_node *physical_node, *pn; char physical_node_name[PHYSICAL_NODE_NAME_SIZE]; struct list_head *physnode_list; unsigned int node_id; int retval = -EINVAL; if (has_acpi_companion(dev)) { if (acpi_dev) { dev_warn(dev, "ACPI companion already set\n"); return -EINVAL; } else { acpi_dev = ACPI_COMPANION(dev); } } if (!acpi_dev) return -EINVAL; acpi_dev_get(acpi_dev); get_device(dev); physical_node = kzalloc_obj(*physical_node); if (!physical_node) { retval = -ENOMEM; goto err; } mutex_lock(&acpi_dev->physical_node_lock); /* * Keep the list sorted by node_id so that the IDs of removed nodes can * be recycled easily. */ physnode_list = &acpi_dev->physical_node_list; node_id = 0; list_for_each_entry(pn, &acpi_dev->physical_node_list, node) { /* Sanity check. 
*/ if (pn->dev == dev) { mutex_unlock(&acpi_dev->physical_node_lock); dev_warn(dev, "Already associated with ACPI node\n"); kfree(physical_node); if (ACPI_COMPANION(dev) != acpi_dev) goto err; put_device(dev); acpi_dev_put(acpi_dev); return 0; } if (pn->node_id == node_id) { physnode_list = &pn->node; node_id++; } } physical_node->node_id = node_id; physical_node->dev = dev; list_add(&physical_node->node, physnode_list); acpi_dev->physical_node_count++; if (!has_acpi_companion(dev)) ACPI_COMPANION_SET(dev, acpi_dev); acpi_physnode_link_name(physical_node_name, node_id); retval = sysfs_create_link(&acpi_dev->dev.kobj, &dev->kobj, physical_node_name); if (retval) dev_err(&acpi_dev->dev, "Failed to create link %s (%d)\n", physical_node_name, retval); retval = sysfs_create_link(&dev->kobj, &acpi_dev->dev.kobj, "firmware_node"); if (retval) dev_err(dev, "Failed to create link firmware_node (%d)\n", retval); mutex_unlock(&acpi_dev->physical_node_lock); if (acpi_dev->wakeup.flags.valid) device_set_wakeup_capable(dev, true); return 0; err: ACPI_COMPANION_SET(dev, NULL); put_device(dev); acpi_dev_put(acpi_dev); return retval; } EXPORT_SYMBOL_GPL(acpi_bind_one); int acpi_unbind_one(struct device *dev) { struct acpi_device *acpi_dev = ACPI_COMPANION(dev); struct acpi_device_physical_node *entry; if (!acpi_dev) return 0; mutex_lock(&acpi_dev->physical_node_lock); list_for_each_entry(entry, &acpi_dev->physical_node_list, node) if (entry->dev == dev) { char physnode_name[PHYSICAL_NODE_NAME_SIZE]; list_del(&entry->node); acpi_dev->physical_node_count--; acpi_physnode_link_name(physnode_name, entry->node_id); sysfs_remove_link(&acpi_dev->dev.kobj, physnode_name); sysfs_remove_link(&dev->kobj, "firmware_node"); ACPI_COMPANION_SET(dev, NULL); /* Drop references taken by acpi_bind_one(). */ put_device(dev); acpi_dev_put(acpi_dev); kfree(entry); break; } mutex_unlock(&acpi_dev->physical_node_lock); return 0; } EXPORT_SYMBOL_GPL(acpi_unbind_one); void acpi_device_notify(struct device *dev) { struct acpi_device *adev; int ret; ret = acpi_bind_one(dev, NULL); if (ret) { struct acpi_bus_type *type = acpi_get_bus_type(dev); if (!type) goto err; adev = type->find_companion(dev); if (!adev) { dev_dbg(dev, "ACPI companion not found\n"); goto err; } ret = acpi_bind_one(dev, adev); if (ret) goto err; if (type->setup) { type->setup(dev); goto done; } } else { adev = ACPI_COMPANION(dev); if (dev_is_pci(dev)) { pci_acpi_setup(dev, adev); goto done; } else if (dev_is_platform(dev)) { acpi_configure_pmsi_domain(dev); } } if (adev->handler && adev->handler->bind) adev->handler->bind(dev); done: acpi_handle_debug(ACPI_HANDLE(dev), "Bound to device %s\n", dev_name(dev)); return; err: dev_dbg(dev, "No ACPI support\n"); } void acpi_device_notify_remove(struct device *dev) { struct acpi_device *adev = ACPI_COMPANION(dev); if (!adev) return; if (dev_is_pci(dev)) pci_acpi_cleanup(dev, adev); else if (adev->handler && adev->handler->unbind) adev->handler->unbind(dev); acpi_unbind_one(dev); } |
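To show how a bus driver plugs into the glue layer above, here is a hedged sketch of registering an acpi_bus_type so that acpi_device_notify() can locate ACPI companions for its devices. The "foo" bus, its device wrapper and the _ADR-based lookup are illustrative assumptions; acpi_find_child_device() and register_acpi_bus_type() are the functions defined above.

/* Hedged sketch: wiring a hypothetical bus into register_acpi_bus_type(). */
#include <linux/acpi.h>
#include <linux/device.h>
#include <linux/init.h>
#include <linux/types.h>

extern struct bus_type foo_bus_type;		/* hypothetical bus */

struct foo_device {
	struct device dev;
	u64 adr;				/* matches the ACPI _ADR value */
};
#define to_foo_device(d) container_of(d, struct foo_device, dev)

static bool foo_acpi_match(struct device *dev)
{
	return dev->bus == &foo_bus_type;
}

static struct acpi_device *foo_acpi_find_companion(struct device *dev)
{
	struct acpi_device *parent = ACPI_COMPANION(dev->parent);

	/* Walk the parent's ACPI children for a matching _ADR value,
	 * which is exactly what acpi_find_child_device() above implements. */
	return acpi_find_child_device(parent, to_foo_device(dev)->adr, false);
}

static struct acpi_bus_type foo_acpi_bus = {
	.name		= "foo",
	.match		= foo_acpi_match,
	.find_companion	= foo_acpi_find_companion,
};

static int __init foo_acpi_init(void)
{
	return register_acpi_bus_type(&foo_acpi_bus);
}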
// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commits routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"
#include <linux/lockdep.h>
#include <linux/wait_bit.h>

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl; an illustrative walk of such a log is sketched right
 * after this tag list.) Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK	  - records directory entry unlink
 * - EXT4_FC_TAG_LINK	  - records directory entry link
 * - EXT4_FC_TAG_CREAT	  - records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE	  - record the inode that should be replayed
 *			    during recovery. Note that iblocks field is
 *			    not replayed and instead derived during
 *			    replay.
 */
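/*
 * Illustrative sketch only, added for this write-up; it is not part of the
 * ext4 driver and the helper name below is ours (it is never called). It
 * shows how the flat TLV stream described above is walked: a 2-byte tag, a
 * 2-byte length, then fc_len value bytes, repeated until the end of the
 * area. The real scan loop in ext4_fc_replay_scan() later in this file
 * follows the same pattern with full validation.
 */
static inline void ext4_fc_tlv_walk_example(u8 *start, u8 *end)
{
	struct ext4_fc_tl tl;
	u8 *cur = start;

	while (cur + EXT4_FC_TAG_BASE_LEN <= end) {
		/* Copy the fixed-size header, then skip over the value bytes. */
		memcpy(&tl, cur, EXT4_FC_TAG_BASE_LEN);
		if (le16_to_cpu(tl.fc_len) > end - cur - EXT4_FC_TAG_BASE_LEN)
			break;	/* truncated entry; the real scanner bails out too */
		cur += EXT4_FC_TAG_BASE_LEN + le16_to_cpu(tl.fc_len);
	}
}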
/*
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in-memory queue
 * of inodes. During the commit operation, we commit in the following order:
 *
 * [1] Prepare all the inodes to write out their data by setting
 *     "EXT4_STATE_FC_FLUSHING_DATA". This ensures that the inode cannot be
 *     deleted while it is being flushed.
 * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA"
 *     state.
 * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that
 *     all the existing handles finish and no new handles can start.
 * [4] Mark all the fast commit eligible inodes as undergoing fast commit
 *     by setting "EXT4_STATE_FC_COMMITTING" state.
 * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows
 *     starting of new handles. If new handles try to start an update on
 *     any of the inodes that are being committed, ext4_fc_track_inode()
 *     will block until those inodes have finished the fast commit.
 * [6] Commit all the directory entry updates in the fast commit space.
 * [7] Commit all the changed inodes in the fast commit space and clear
 *     "EXT4_STATE_FC_COMMITTING" for these inodes.
 * [8] Write tail tag (this tag ensures the atomicity, please read the
 *     following section for more details).
 *
 * All the inode updates must be enclosed within jbd2_journal_start()
 * and jbd2_journal_stop() similar to JBD2 journaling.
 *
 * Fast Commit Ineligibility
 * -------------------------
 *
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling
 * ext4_fc_mark_ineligible(): this makes the next fast commit operation fall
 * back to a full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
 * tail tag contains the CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least 1 valid tail present. For every fast
 * commit operation, there is 1 tail. This means we may end up with multiple
 * tails in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of the above operations would look like:
 *
 *  [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *  |<-------- Fast Commit 1 -------->|<-------- Fast Commit 2 --------->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commit tags are idempotent in nature provided the recovery code follows
 * certain rules. The guiding principle that the commit path follows while
 * committing is that it stores the result of a particular operation instead of
 * storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file system
 * state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how fast
 * commits make it idempotent. Consider the following sequence of operations:
 *
 *	rm A;    mv B A;    read A
 *	 (x)      (y)        (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as is, then the replay is not idempotent. Let's say
 * while in replay, we crash at (z). During the second replay, file A (which
 * was actually created as a result of the "mv B A" operation) would get
 * deleted. Thus, file named A would be absent when we try to read A. So, this
 * sequence of operations is not idempotent. However, as mentioned above,
 * instead of storing the procedure, fast commits store the outcome of each
 * procedure. Thus the fast commit log for the above procedure would be as
 * follows:
 *
 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 * inode 11 before the replay)
 *
 *	[Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 *	    (w)               (x)                (y)         (z)
 *
 * If we crash at (z), we will have file A linked to inode 11. During the
 * second replay, we will remove file A (inode 11). But we will create it back
 * and make it point to inode 11. We won't find B, so we'll just skip that
 * step. At this point, the refcount for inode 11 is not reliable, but that
 * gets fixed by the replay of the last inode 11 tag. Crashes at points (w),
 * (x) and (y) get handled similarly. Thus, by converting a non-idempotent
 * procedure into a series of idempotent outcomes, fast commits ensure
 * idempotence during replay.
 *
 * Locking
 * -------
 * sbi->s_fc_lock protects the fast commit inodes queue and the fast commit
 * dentry queue. ei->i_fc_lock protects the fast commit related info in a given
 * inode. Most of the code avoids acquiring both the locks, but if one must do
 * that then sbi->s_fc_lock must be acquired before ei->i_fc_lock (a minimal
 * sketch of this ordering follows this comment block).
 */
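/*
 * Illustrative sketch only, added for this write-up; it is not part of the
 * ext4 driver and the helper name is ours (it is never called). It simply
 * demonstrates the documented lock ordering when both locks must be held:
 * the per-sb fast commit lock (taken via ext4_fc_lock()) strictly before the
 * per-inode ei->i_fc_lock spinlock, mirroring how the tracking functions in
 * this file use the two locks.
 */
static inline void ext4_fc_lock_order_example(struct super_block *sb,
					      struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int alloc_ctx;

	alloc_ctx = ext4_fc_lock(sb);	/* outer: per-sb fast commit lock */
	spin_lock(&ei->i_fc_lock);	/* inner: per-inode fast commit lock */
	/* ... fast commit queues and ei->i_fc_* fields may be touched here ... */
	spin_unlock(&ei->i_fc_lock);
	ext4_fc_unlock(sb, alloc_ctx);
}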
/*
 * TODOs
 * -----
 *
 * 0) Fast commit replay path hardening: Fast commit replay code should use
 *    journal handles to make sure all the updates it does during the replay
 *    path are atomic. With that, if we crash during fast commit replay, after
 *    trying to do recovery again, we will find a file system where the fast
 *    commit area is invalid (because a new full commit would be found). In
 *    order to deal with that, fast commit replay code should ensure that the
 *    "FC_REPLAY" superblock state is persisted before starting the replay, so
 *    that after the crash, fast commit recovery code can look at that flag
 *    and perform fast commit recovery even if that area is invalidated by
 *    later full commits.
 *
 * 1) Handle more ineligible cases.
 *
 * 2) Change ext4_fc_commit() to lookup logical to physical mapping using
 *    extent status tree. This would get rid of the need to call
 *    ext4_fc_track_inode() before acquiring i_data_sem. To do that we would
 *    need to ensure that modified extents from the extent status tree are not
 *    evicted from memory.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	INIT_LIST_HEAD(&ei->i_fc_dilist);
}

static bool ext4_fc_disabled(struct super_block *sb)
{
	return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY));
}

static bool ext4_fc_eligible(struct super_block *sb)
{
	return !ext4_fc_disabled(sb) &&
		!(ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE));
}

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_fc_dentry_update *fc_dentry;
	wait_queue_head_t *wq;
	int alloc_ctx;

	if (ext4_fc_disabled(inode->i_sb))
		return;

	alloc_ctx = ext4_fc_lock(inode->i_sb);
	if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
		ext4_fc_unlock(inode->i_sb, alloc_ctx);
		return;
	}

	/*
	 * Since ext4_fc_del is called from ext4_evict_inode while having a
	 * handle open, there is no need for us to wait here even if a fast
	 * commit is going on. That is because, if this inode is being
	 * committed, ext4_mark_inode_dirty would have waited for inode commit
	 * operation to finish before we come here. So, by the time we come
	 * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So,
	 * we shouldn't see EXT4_STATE_FC_COMMITTING set on this inode here.
	 *
	 * We may come here without any handles open in the "no_delete" case of
	 * ext4_evict_inode as well. However, if that happens, we first mark the
	 * file system as fast commit ineligible anyway. So, even in that case,
	 * it is okay to remove the inode from the fc list.
*/ WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING) && !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)); while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { #if (BITS_PER_LONG < 64) DEFINE_WAIT_BIT(wait, &ei->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA); wq = bit_waitqueue(&ei->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA); #else DEFINE_WAIT_BIT(wait, &ei->i_flags, EXT4_STATE_FC_FLUSHING_DATA); wq = bit_waitqueue(&ei->i_flags, EXT4_STATE_FC_FLUSHING_DATA); #endif prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { ext4_fc_unlock(inode->i_sb, alloc_ctx); schedule(); alloc_ctx = ext4_fc_lock(inode->i_sb); } finish_wait(wq, &wait.wq_entry); } list_del_init(&ei->i_fc_list); /* * Since this inode is getting removed, let's also remove all FC * dentry create references, since it is not needed to log it anyways. */ if (list_empty(&ei->i_fc_dilist)) { ext4_fc_unlock(inode->i_sb, alloc_ctx); return; } fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist); WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT); list_del_init(&fc_dentry->fcd_list); list_del_init(&fc_dentry->fcd_dilist); WARN_ON(!list_empty(&ei->i_fc_dilist)); ext4_fc_unlock(inode->i_sb, alloc_ctx); release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); } /* * Mark file system as fast commit ineligible, and record latest * ineligible transaction tid. This means until the recorded * transaction, commit operation would result in a full jbd2 commit. */ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle) { struct ext4_sb_info *sbi = EXT4_SB(sb); tid_t tid; bool has_transaction = true; bool is_ineligible; int alloc_ctx; if (ext4_fc_disabled(sb)) return; if (!IS_ERR_OR_NULL(handle)) tid = handle->h_transaction->t_tid; else { read_lock(&sbi->s_journal->j_state_lock); if (sbi->s_journal->j_running_transaction) tid = sbi->s_journal->j_running_transaction->t_tid; else has_transaction = false; read_unlock(&sbi->s_journal->j_state_lock); } alloc_ctx = ext4_fc_lock(sb); is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid))) sbi->s_fc_ineligible_tid = tid; ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); ext4_fc_unlock(sb, alloc_ctx); WARN_ON(reason >= EXT4_FC_REASON_MAX); sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; } /* * Generic fast commit tracking function. If this is the first time this we are * called after a full commit, we initialize fast commit fields and then call * __fc_track_fn() with update = 0. If we have already been called after a full * commit, we pass update = 1. Based on that, the track function can determine * if it needs to track a field for the first time or if it needs to just * update the previously tracked value. * * If enqueue is set, this function enqueues the inode in fast commit list. 
*/ static int ext4_fc_track_template( handle_t *handle, struct inode *inode, int (*__fc_track_fn)(handle_t *handle, struct inode *, void *, bool), void *args, int enqueue) { bool update = false; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); tid_t tid = 0; int alloc_ctx; int ret; tid = handle->h_transaction->t_tid; spin_lock(&ei->i_fc_lock); if (tid == ei->i_sync_tid) { update = true; } else { ext4_fc_reset_inode(inode); ei->i_sync_tid = tid; } ret = __fc_track_fn(handle, inode, args, update); spin_unlock(&ei->i_fc_lock); if (!enqueue) return ret; alloc_ctx = ext4_fc_lock(inode->i_sb); if (list_empty(&EXT4_I(inode)->i_fc_list)) list_add_tail(&EXT4_I(inode)->i_fc_list, (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? &sbi->s_fc_q[FC_Q_STAGING] : &sbi->s_fc_q[FC_Q_MAIN]); ext4_fc_unlock(inode->i_sb, alloc_ctx); return ret; } struct __track_dentry_update_args { struct dentry *dentry; int op; }; /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */ static int __track_dentry_update(handle_t *handle, struct inode *inode, void *arg, bool update) { struct ext4_fc_dentry_update *node; struct ext4_inode_info *ei = EXT4_I(inode); struct __track_dentry_update_args *dentry_update = (struct __track_dentry_update_args *)arg; struct dentry *dentry = dentry_update->dentry; struct inode *dir = dentry->d_parent->d_inode; struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); int alloc_ctx; spin_unlock(&ei->i_fc_lock); if (IS_ENCRYPTED(dir)) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME, handle); spin_lock(&ei->i_fc_lock); return -EOPNOTSUPP; } node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle); spin_lock(&ei->i_fc_lock); return -ENOMEM; } node->fcd_op = dentry_update->op; node->fcd_parent = dir->i_ino; node->fcd_ino = inode->i_ino; take_dentry_name_snapshot(&node->fcd_name, dentry); INIT_LIST_HEAD(&node->fcd_dilist); INIT_LIST_HEAD(&node->fcd_list); alloc_ctx = ext4_fc_lock(sb); if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_STAGING]); else list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]); /* * This helps us keep a track of all fc_dentry updates which is part of * this ext4 inode. So in case the inode is getting unlinked, before * even we get a chance to fsync, we could remove all fc_dentry * references while evicting the inode in ext4_fc_del(). * Also with this, we don't need to loop over all the inodes in * sbi->s_fc_q to get the corresponding inode in * ext4_fc_commit_dentry_updates(). 
*/ if (dentry_update->op == EXT4_FC_TAG_CREAT) { WARN_ON(!list_empty(&ei->i_fc_dilist)); list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist); } ext4_fc_unlock(sb, alloc_ctx); spin_lock(&ei->i_fc_lock); return 0; } void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode, struct dentry *dentry) { struct __track_dentry_update_args args; int ret; args.dentry = dentry; args.op = EXT4_FC_TAG_UNLINK; ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_unlink(handle, inode, dentry, ret); } void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); if (ext4_fc_eligible(inode->i_sb)) __ext4_fc_track_unlink(handle, inode, dentry); } void __ext4_fc_track_link(handle_t *handle, struct inode *inode, struct dentry *dentry) { struct __track_dentry_update_args args; int ret; args.dentry = dentry; args.op = EXT4_FC_TAG_LINK; ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_link(handle, inode, dentry, ret); } void ext4_fc_track_link(handle_t *handle, struct inode *inode, struct dentry *dentry) { if (ext4_fc_eligible(inode->i_sb)) __ext4_fc_track_link(handle, inode, dentry); } void __ext4_fc_track_create(handle_t *handle, struct inode *inode, struct dentry *dentry) { struct __track_dentry_update_args args; int ret; args.dentry = dentry; args.op = EXT4_FC_TAG_CREAT; ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_create(handle, inode, dentry, ret); } void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); if (ext4_fc_eligible(inode->i_sb)) __ext4_fc_track_create(handle, inode, dentry); } /* __track_fn for inode tracking */ static int __track_inode(handle_t *handle, struct inode *inode, void *arg, bool update) { if (update) return -EEXIST; EXT4_I(inode)->i_fc_lblk_len = 0; return 0; } void ext4_fc_track_inode(handle_t *handle, struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); wait_queue_head_t *wq; int ret; if (S_ISDIR(inode->i_mode)) return; if (ext4_should_journal_data(inode)) { ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); return; } if (!ext4_fc_eligible(inode->i_sb)) return; /* * If we come here, we may sleep while waiting for the inode to * commit. We shouldn't be holding i_data_sem when we go to sleep since * the commit path needs to grab the lock while committing the inode. */ lockdep_assert_not_held(&ei->i_data_sem); while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { #if (BITS_PER_LONG < 64) DEFINE_WAIT_BIT(wait, &ei->i_state_flags, EXT4_STATE_FC_COMMITTING); wq = bit_waitqueue(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING); #else DEFINE_WAIT_BIT(wait, &ei->i_flags, EXT4_STATE_FC_COMMITTING); wq = bit_waitqueue(&ei->i_flags, EXT4_STATE_FC_COMMITTING); #endif prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) schedule(); finish_wait(wq, &wait.wq_entry); } /* * From this point on, this inode will not be committed either * by fast or full commit as long as the handle is open. 
*/ ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); trace_ext4_fc_track_inode(handle, inode, ret); } struct __track_range_args { ext4_lblk_t start, end; }; /* __track_fn for tracking data updates */ static int __track_range(handle_t *handle, struct inode *inode, void *arg, bool update) { struct ext4_inode_info *ei = EXT4_I(inode); ext4_lblk_t oldstart; struct __track_range_args *__arg = (struct __track_range_args *)arg; if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) { ext4_debug("Special inode %llu being modified\n", inode->i_ino); return -ECANCELED; } oldstart = ei->i_fc_lblk_start; if (update && ei->i_fc_lblk_len > 0) { ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start); ei->i_fc_lblk_len = max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) - ei->i_fc_lblk_start + 1; } else { ei->i_fc_lblk_start = __arg->start; ei->i_fc_lblk_len = __arg->end - __arg->start + 1; } return 0; } void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) { struct __track_range_args args; int ret; if (S_ISDIR(inode->i_mode)) return; if (!ext4_fc_eligible(inode->i_sb)) return; if (ext4_has_inline_data(inode)) { ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); return; } args.start = start; args.end = end; ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1); trace_ext4_fc_track_range(handle, inode, start, end, ret); } static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) { blk_opf_t write_flags = JBD2_JOURNAL_REQ_FLAGS; struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; /* Add REQ_FUA | REQ_PREFLUSH only its tail */ if (test_opt(sb, BARRIER) && is_tail) write_flags |= REQ_FUA | REQ_PREFLUSH; lock_buffer(bh); set_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = ext4_end_buffer_io_sync; submit_bh(REQ_OP_WRITE | write_flags, bh); EXT4_SB(sb)->s_fc_bh = NULL; } /* Ext4 commit path routines */ /* * Allocate len bytes on a fast commit buffer. * * During the commit time this function is used to manage fast commit * block space. We don't split a fast commit log onto different * blocks. So this function makes sure that if there's not enough space * on the current block, the remaining space in the current block is * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, * new block is from jbd2 and CRC is updated to reflect the padding * we added. */ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) { struct ext4_fc_tl tl; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh; int bsize = sbi->s_journal->j_blocksize; int ret, off = sbi->s_fc_bytes % bsize; int remaining; u8 *dst; /* * If 'len' is too long to fit in any block alongside a PAD tlv, then we * cannot fulfill the request. */ if (len > bsize - EXT4_FC_TAG_BASE_LEN) return NULL; if (!sbi->s_fc_bh) { ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); if (ret) return NULL; sbi->s_fc_bh = bh; } dst = sbi->s_fc_bh->b_data + off; /* * Allocate the bytes in the current block if we can do so while still * leaving enough space for a PAD tlv. */ remaining = bsize - EXT4_FC_TAG_BASE_LEN - off; if (len <= remaining) { sbi->s_fc_bytes += len; return dst; } /* * Else, terminate the current block with a PAD tlv, then allocate a new * block and allocate the bytes at the start of that new block. 
*/ tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); tl.fc_len = cpu_to_le16(remaining); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining); *crc = ext4_chksum(*crc, sbi->s_fc_bh->b_data, bsize); ext4_fc_submit_bh(sb, false); ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); if (ret) return NULL; sbi->s_fc_bh = bh; sbi->s_fc_bytes += bsize - off + len; return sbi->s_fc_bh->b_data; } /* * Complete a fast commit by writing tail tag. * * Writing tail tag marks the end of a fast commit. In order to guarantee * atomicity, after writing tail tag, even if there's space remaining * in the block, next commit shouldn't use it. That's why tail tag * has the length as that of the remaining space on the block. */ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_tl tl; struct ext4_fc_tail tail; int off, bsize = sbi->s_journal->j_blocksize; u8 *dst; /* * ext4_fc_reserve_space takes care of allocating an extra block if * there's no enough space on this block for accommodating this tail. */ dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc); if (!dst) return -ENOSPC; off = sbi->s_fc_bytes % bsize; tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL); tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail)); sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid)); dst += sizeof(tail.fc_tid); crc = ext4_chksum(crc, sbi->s_fc_bh->b_data, dst - (u8 *)sbi->s_fc_bh->b_data); tail.fc_crc = cpu_to_le32(crc); memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc)); dst += sizeof(tail.fc_crc); memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */ ext4_fc_submit_bh(sb, true); return 0; } /* * Adds tag, length, value and updates CRC. Returns true if tlv was added. * Returns false if there's not enough space. */ static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, u32 *crc) { struct ext4_fc_tl tl; u8 *dst; dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc); if (!dst) return false; tl.fc_tag = cpu_to_le16(tag); tl.fc_len = cpu_to_le16(len); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len); return true; } /* Same as above, but adds dentry tlv. */ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, struct ext4_fc_dentry_update *fc_dentry) { struct ext4_fc_dentry_info fcd; struct ext4_fc_tl tl; int dlen = fc_dentry->fcd_name.name.len; u8 *dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc); if (!dst) return false; fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent); fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino); tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op); tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; memcpy(dst, &fcd, sizeof(fcd)); dst += sizeof(fcd); memcpy(dst, fc_dentry->fcd_name.name.name, dlen); return true; } /* * Writes inode in the fast commit space under TLV with tag @tag. * Returns 0 on success, error on failure. 
*/ static int ext4_fc_write_inode(struct inode *inode, u32 *crc) { struct ext4_inode_info *ei = EXT4_I(inode); int inode_len = EXT4_GOOD_OLD_INODE_SIZE; int ret; struct ext4_iloc iloc; struct ext4_fc_inode fc_inode; struct ext4_fc_tl tl; u8 *dst; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) inode_len = EXT4_INODE_SIZE(inode->i_sb); else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) inode_len += ei->i_extra_isize; fc_inode.fc_ino = cpu_to_le32(inode->i_ino); tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE); tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino)); ret = -ECANCELED; dst = ext4_fc_reserve_space(inode->i_sb, EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc); if (!dst) goto err; memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; memcpy(dst, &fc_inode, sizeof(fc_inode)); dst += sizeof(fc_inode); memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len); ret = 0; err: brelse(iloc.bh); return ret; } /* * Writes updated data ranges for the inode in question. Updates CRC. * Returns 0 on success, error otherwise. */ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) { ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_map_blocks map; struct ext4_fc_add_range fc_ext; struct ext4_fc_del_range lrange; struct ext4_extent *ex; int ret; spin_lock(&ei->i_fc_lock); if (ei->i_fc_lblk_len == 0) { spin_unlock(&ei->i_fc_lock); return 0; } old_blk_size = ei->i_fc_lblk_start; new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; ei->i_fc_lblk_len = 0; spin_unlock(&ei->i_fc_lock); cur_lblk_off = old_blk_size; ext4_debug("will try writing %d to %d for inode %llu\n", cur_lblk_off, new_blk_size, inode->i_ino); while (cur_lblk_off <= new_blk_size) { map.m_lblk = cur_lblk_off; map.m_len = new_blk_size - cur_lblk_off + 1; ret = ext4_map_blocks(NULL, inode, &map, EXT4_GET_BLOCKS_IO_SUBMIT | EXT4_EX_NOCACHE); if (ret < 0) return -ECANCELED; if (map.m_len == 0) { cur_lblk_off++; continue; } if (ret == 0) { lrange.fc_ino = cpu_to_le32(inode->i_ino); lrange.fc_lblk = cpu_to_le32(map.m_lblk); lrange.fc_len = cpu_to_le32(map.m_len); if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE, sizeof(lrange), (u8 *)&lrange, crc)) return -ENOSPC; } else { unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ? EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN; /* Limit the number of blocks in one extent */ map.m_len = min(max, map.m_len); fc_ext.fc_ino = cpu_to_le32(inode->i_ino); ex = (struct ext4_extent *)&fc_ext.fc_ex; ex->ee_block = cpu_to_le32(map.m_lblk); ex->ee_len = cpu_to_le16(map.m_len); ext4_ext_store_pblock(ex, map.m_pblk); if (map.m_flags & EXT4_MAP_UNWRITTEN) ext4_ext_mark_unwritten(ex); else ext4_ext_mark_initialized(ex); if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE, sizeof(fc_ext), (u8 *)&fc_ext, crc)) return -ENOSPC; } cur_lblk_off += map.m_len; } return 0; } /* Flushes data of all the inodes in the commit queue. 
*/ static int ext4_fc_flush_data(journal_t *journal) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *ei; int ret = 0; list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ret = jbd2_submit_inode_data(journal, READ_ONCE(ei->jinode)); if (ret) return ret; } list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ret = jbd2_wait_inode_data(journal, READ_ONCE(ei->jinode)); if (ret) return ret; } return 0; } /* Commit all the directory entry updates */ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n; struct inode *inode; struct ext4_inode_info *ei; int ret; if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) return 0; list_for_each_entry_safe(fc_dentry, fc_dentry_n, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) return -ENOSPC; continue; } /* * With fcd_dilist we need not loop in sbi->s_fc_q to get the * corresponding inode. Also, the corresponding inode could have been * deleted, in which case, we don't need to do anything. */ if (list_empty(&fc_dentry->fcd_dilist)) continue; ei = list_first_entry(&fc_dentry->fcd_dilist, struct ext4_inode_info, i_fc_dilist); inode = &ei->vfs_inode; WARN_ON(inode->i_ino != fc_dentry->fcd_ino); /* * We first write the inode and then the create dirent. This * allows the recovery code to create an unnamed inode first * and then link it to a directory entry. This allows us * to use namei.c routines almost as is and simplifies * the recovery code. */ ret = ext4_fc_write_inode(inode, crc); if (ret) return ret; ret = ext4_fc_write_inode_data(inode, crc); if (ret) return ret; if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) return -ENOSPC; } return 0; } static int ext4_fc_perform_commit(journal_t *journal) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *iter; struct ext4_fc_head head; struct inode *inode; struct blk_plug plug; int ret = 0; u32 crc = 0; int alloc_ctx; /* * Step 1: Mark all inodes on s_fc_q[MAIN] with * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being * freed until the data flush is over. */ alloc_ctx = ext4_fc_lock(sb); list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_set_inode_state(&iter->vfs_inode, EXT4_STATE_FC_FLUSHING_DATA); } ext4_fc_unlock(sb, alloc_ctx); /* Step 2: Flush data for all the eligible inodes. */ ret = ext4_fc_flush_data(journal); /* * Step 3: Clear EXT4_STATE_FC_FLUSHING_DATA flag, before returning * any error from step 2. This ensures that waiters waiting on * EXT4_STATE_FC_FLUSHING_DATA can resume. */ alloc_ctx = ext4_fc_lock(sb); list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_clear_inode_state(&iter->vfs_inode, EXT4_STATE_FC_FLUSHING_DATA); #if (BITS_PER_LONG < 64) wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA); #else wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA); #endif } /* * Make sure clearing of EXT4_STATE_FC_FLUSHING_DATA is visible before * the waiter checks the bit. Pairs with implicit barrier in * prepare_to_wait() in ext4_fc_del(). */ smp_mb(); ext4_fc_unlock(sb, alloc_ctx); /* * If we encountered error in Step 2, return it now after clearing * EXT4_STATE_FC_FLUSHING_DATA bit. */ if (ret) return ret; /* Step 4: Mark all inodes as being committed. 
*/ jbd2_journal_lock_updates(journal); /* * The journal is now locked. No more handles can start and all the * previous handles are now drained. We now mark the inodes on the * commit queue as being committed. */ alloc_ctx = ext4_fc_lock(sb); list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_set_inode_state(&iter->vfs_inode, EXT4_STATE_FC_COMMITTING); } ext4_fc_unlock(sb, alloc_ctx); jbd2_journal_unlock_updates(journal); /* * Step 5: If file system device is different from journal device, * issue a cache flush before we start writing fast commit blocks. */ if (journal->j_fs_dev != journal->j_dev) blkdev_issue_flush(journal->j_fs_dev); blk_start_plug(&plug); alloc_ctx = ext4_fc_lock(sb); /* Step 6: Write fast commit blocks to disk. */ if (sbi->s_fc_bytes == 0) { /* * Step 6.1: Add a head tag only if this is the first fast * commit in this TID. */ head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); head.fc_tid = cpu_to_le32( sbi->s_journal->j_running_transaction->t_tid); if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), (u8 *)&head, &crc)) { ret = -ENOSPC; goto out; } } /* Step 6.2: Now write all the dentry updates. */ ret = ext4_fc_commit_dentry_updates(journal, &crc); if (ret) goto out; /* Step 6.3: Now write all the changed inodes to disk. */ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { inode = &iter->vfs_inode; if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) continue; ret = ext4_fc_write_inode_data(inode, &crc); if (ret) goto out; ret = ext4_fc_write_inode(inode, &crc); if (ret) goto out; } /* Step 6.4: Finally write tail tag to conclude this fast commit. */ ret = ext4_fc_write_tail(sb, crc); out: ext4_fc_unlock(sb, alloc_ctx); blk_finish_plug(&plug); return ret; } static void ext4_fc_update_stats(struct super_block *sb, int status, u64 commit_time, int nblks, tid_t commit_tid) { struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats; ext4_debug("Fast commit ended with status = %d for tid %u", status, commit_tid); if (status == EXT4_FC_STATUS_OK) { stats->fc_num_commits++; stats->fc_numblks += nblks; if (likely(stats->s_fc_avg_commit_time)) stats->s_fc_avg_commit_time = (commit_time + stats->s_fc_avg_commit_time * 3) / 4; else stats->s_fc_avg_commit_time = commit_time; } else if (status == EXT4_FC_STATUS_FAILED || status == EXT4_FC_STATUS_INELIGIBLE) { if (status == EXT4_FC_STATUS_FAILED) stats->fc_failed_commits++; stats->fc_ineligible_commits++; } else { stats->fc_skipped_commits++; } trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid); } /* * The main commit entry point. Performs a fast commit for transaction * commit_tid if needed. If it's not possible to perform a fast commit * due to various reasons, we fall back to full commit. Returns 0 * on success, error otherwise. 
*/ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); int nblks = 0, ret, bsize = journal->j_blocksize; int subtid = atomic_read(&sbi->s_fc_subtid); int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0; ktime_t start_time, commit_time; int old_ioprio, journal_ioprio; if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) return jbd2_complete_transaction(journal, commit_tid); trace_ext4_fc_commit_start(sb, commit_tid); start_time = ktime_get(); old_ioprio = get_current_ioprio(); restart_fc: ret = jbd2_fc_begin_commit(journal, commit_tid); if (ret == -EALREADY) { /* There was an ongoing commit, check if we need to restart */ if (atomic_read(&sbi->s_fc_subtid) <= subtid && tid_gt(commit_tid, journal->j_commit_sequence)) goto restart_fc; ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0, commit_tid); return 0; } else if (ret) { /* * Commit couldn't start. Just update stats and perform a * full commit. */ ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0, commit_tid); return jbd2_complete_transaction(journal, commit_tid); } /* * After establishing journal barrier via jbd2_fc_begin_commit(), check * if we are fast commit ineligible. */ if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) { status = EXT4_FC_STATUS_INELIGIBLE; goto fallback; } /* * Now that we know that this thread is going to do a fast commit, * elevate the priority to match that of the journal thread. */ if (journal->j_task->io_context) journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; else journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; set_task_ioprio(current, journal_ioprio); fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; ret = ext4_fc_perform_commit(journal); if (ret < 0) { status = EXT4_FC_STATUS_FAILED; goto fallback; } nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before; ret = jbd2_fc_wait_bufs(journal, nblks); if (ret < 0) { status = EXT4_FC_STATUS_FAILED; goto fallback; } atomic_inc(&sbi->s_fc_subtid); ret = jbd2_fc_end_commit(journal); set_task_ioprio(current, old_ioprio); /* * weight the commit time higher than the average time so we * don't react too strongly to vast changes in the commit time */ commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid); return ret; fallback: set_task_ioprio(current, old_ioprio); ret = jbd2_fc_end_commit_fallback(journal); ext4_fc_update_stats(sb, status, 0, 0, commit_tid); return ret; } /* * Fast commit cleanup routine. This is called after every fast commit and * full commit. full is true if we are called after a full commit. */ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *ei; struct ext4_fc_dentry_update *fc_dentry; int alloc_ctx; if (full && sbi->s_fc_bh) sbi->s_fc_bh = NULL; trace_ext4_fc_cleanup(journal, full, tid); jbd2_fc_release_bufs(journal); alloc_ctx = ext4_fc_lock(sb); while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) { ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN], struct ext4_inode_info, i_fc_list); list_del_init(&ei->i_fc_list); ext4_clear_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); if (tid_geq(tid, ei->i_sync_tid)) { ext4_fc_reset_inode(&ei->vfs_inode); } else if (full) { /* * We are called after a full commit, inode has been * modified while the commit was running. Re-enqueue * the inode into STAGING, which will then be splice * back into MAIN. 
This cannot happen during * fastcommit because the journal is locked all the * time in that case (and tid doesn't increase so * tid check above isn't reliable). */ list_add_tail(&ei->i_fc_list, &sbi->s_fc_q[FC_Q_STAGING]); } /* * Make sure clearing of EXT4_STATE_FC_COMMITTING is * visible before we send the wakeup. Pairs with implicit * barrier in prepare_to_wait() in ext4_fc_track_inode(). */ smp_mb(); #if (BITS_PER_LONG < 64) wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING); #else wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING); #endif } while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) { fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN], struct ext4_fc_dentry_update, fcd_list); list_del_init(&fc_dentry->fcd_list); list_del_init(&fc_dentry->fcd_dilist); release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); } list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], &sbi->s_fc_dentry_q[FC_Q_MAIN]); list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], &sbi->s_fc_q[FC_Q_MAIN]); if (tid_geq(tid, sbi->s_fc_ineligible_tid)) { sbi->s_fc_ineligible_tid = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); } if (full) sbi->s_fc_bytes = 0; ext4_fc_unlock(sb, alloc_ctx); trace_ext4_fc_stats(sb); } /* Ext4 Replay Path Routines */ /* Helper struct for dentry replay routines */ struct dentry_info_args { int parent_ino, dname_len, ino, inode_len; char *dname; }; /* Same as struct ext4_fc_tl, but uses native endianness fields */ struct ext4_fc_tl_mem { u16 fc_tag; u16 fc_len; }; static inline void tl_to_darg(struct dentry_info_args *darg, struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_dentry_info fcd; memcpy(&fcd, val, sizeof(fcd)); darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino); darg->ino = le32_to_cpu(fcd.fc_ino); darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname); darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info); } static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_tl tl_disk; memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN); tl->fc_len = le16_to_cpu(tl_disk.fc_len); tl->fc_tag = le16_to_cpu(tl_disk.fc_tag); } /* Unlink replay function */ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode, *old_parent; struct qstr entry; struct dentry_info_args darg; int ret = 0; tl_to_darg(&darg, tl, val); trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino, darg.parent_ino, darg.dname_len); entry.name = darg.dname; entry.len = darg.dname_len; inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode %d not found", darg.ino); return 0; } old_parent = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(old_parent)) { ext4_debug("Dir with inode %d not found", darg.parent_ino); iput(inode); return 0; } ret = __ext4_unlink(old_parent, &entry, inode, NULL); /* -ENOENT ok coz it might not exist anymore. 
*/ if (ret == -ENOENT) ret = 0; iput(old_parent); iput(inode); return ret; } static int ext4_fc_replay_link_internal(struct super_block *sb, struct dentry_info_args *darg, struct inode *inode) { struct inode *dir = NULL; struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len); int ret = 0; dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(dir)) { ext4_debug("Dir with inode %d not found.", darg->parent_ino); dir = NULL; goto out; } ret = __ext4_link(dir, inode, &qstr_dname, NULL); /* * It's possible that link already existed since data blocks * for the dir in question got persisted before we crashed OR * we replayed this tag and crashed before the entire replay * could complete. */ if (ret && ret != -EEXIST) { ext4_debug("Failed to link\n"); goto out; } ret = 0; out: if (dir) iput(dir); return ret; } /* Link replay function */ static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode; struct dentry_info_args darg; int ret = 0; tl_to_darg(&darg, tl, val); trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino, darg.parent_ino, darg.dname_len); inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode not found."); return 0; } ret = ext4_fc_replay_link_internal(sb, &darg, inode); iput(inode); return ret; } /* * Record all the modified inodes during replay. We use this later to setup * block bitmaps correctly. */ static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) { struct ext4_fc_replay_state *state; int i; state = &EXT4_SB(sb)->s_fc_replay_state; for (i = 0; i < state->fc_modified_inodes_used; i++) if (state->fc_modified_inodes[i] == ino) return 0; if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { int *fc_modified_inodes; fc_modified_inodes = krealloc(state->fc_modified_inodes, sizeof(int) * (state->fc_modified_inodes_size + EXT4_FC_REPLAY_REALLOC_INCREMENT), GFP_KERNEL); if (!fc_modified_inodes) return -ENOMEM; state->fc_modified_inodes = fc_modified_inodes; state->fc_modified_inodes_size += EXT4_FC_REPLAY_REALLOC_INCREMENT; } state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino; return 0; } /* * Inode replay function */ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_inode fc_inode; struct ext4_inode *raw_inode; struct ext4_inode *raw_fc_inode; struct inode *inode = NULL; struct ext4_iloc iloc; int inode_len, ino, ret, tag = tl->fc_tag; struct ext4_extent_header *eh; size_t off_gen = offsetof(struct ext4_inode, i_generation); memcpy(&fc_inode, val, sizeof(fc_inode)); ino = le32_to_cpu(fc_inode.fc_ino); trace_ext4_fc_replay(sb, tag, ino, 0, 0); inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (!IS_ERR(inode)) { ext4_ext_clear_bb(inode); iput(inode); } inode = NULL; ret = ext4_fc_record_modified_inode(sb, ino); if (ret) goto out; raw_fc_inode = (struct ext4_inode *) (val + offsetof(struct ext4_fc_inode, fc_raw_inode)); ret = ext4_get_fc_inode_loc(sb, ino, &iloc); if (ret) goto out; inode_len = tl->fc_len - sizeof(struct ext4_fc_inode); raw_inode = ext4_raw_inode(&iloc); memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block)); memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen, inode_len - off_gen); if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) { eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]); if (eh->eh_magic != EXT4_EXT_MAGIC) { memset(eh, 0, sizeof(*eh)); eh->eh_magic = EXT4_EXT_MAGIC; eh->eh_max = cpu_to_le16( 
(sizeof(raw_inode->i_block) - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent)); } } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) { memcpy(raw_inode->i_block, raw_fc_inode->i_block, sizeof(raw_inode->i_block)); } /* Immediately update the inode on disk. */ ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); if (ret) goto out_brelse; ret = sync_dirty_buffer(iloc.bh); if (ret) goto out_brelse; ret = ext4_mark_inode_used(sb, ino); if (ret) goto out_brelse; /* Given that we just wrote the inode on disk, this SHOULD succeed. */ inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode not found."); inode = NULL; ret = -EFSCORRUPTED; goto out_brelse; } /* * Our allocator could have made different decisions than before * crashing. This should be fixed but until then, we calculate * the number of blocks the inode. */ if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) ext4_ext_replay_set_iblocks(inode); inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation); ext4_reset_inode_seed(inode); ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode)); ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); sync_dirty_buffer(iloc.bh); out_brelse: brelse(iloc.bh); out: iput(inode); if (!ret) blkdev_issue_flush(sb->s_bdev); return ret; } /* * Dentry create replay function. * * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the * inode for which we are trying to create a dentry here, should already have * been replayed before we start here. */ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { int ret = 0; struct inode *inode = NULL; struct inode *dir = NULL; struct dentry_info_args darg; tl_to_darg(&darg, tl, val); trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino, darg.parent_ino, darg.dname_len); /* This takes care of update group descriptor and other metadata */ ret = ext4_mark_inode_used(sb, darg.ino); if (ret) goto out; inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("inode %d not found.", darg.ino); inode = NULL; ret = -EINVAL; goto out; } if (S_ISDIR(inode->i_mode)) { /* * If we are creating a directory, we need to make sure that the * dot and dot dot dirents are setup properly. */ dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(dir)) { ext4_debug("Dir %d not found.", darg.ino); goto out; } ret = ext4_init_new_dir(NULL, dir, inode); iput(dir); if (ret) { ret = 0; goto out; } } ret = ext4_fc_replay_link_internal(sb, &darg, inode); if (ret) goto out; set_nlink(inode, 1); ext4_mark_inode_dirty(NULL, inode); out: iput(inode); return ret; } /* * Record physical disk regions which are in use as per fast commit area, * and used by inodes during replay phase. Our simple replay phase * allocator excludes these regions from allocation. */ int ext4_fc_record_regions(struct super_block *sb, int ino, ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay) { struct ext4_fc_replay_state *state; struct ext4_fc_alloc_region *region; state = &EXT4_SB(sb)->s_fc_replay_state; /* * during replay phase, the fc_regions_valid may not same as * fc_regions_used, update it when do new additions. 
*/ if (replay && state->fc_regions_used != state->fc_regions_valid) state->fc_regions_used = state->fc_regions_valid; if (state->fc_regions_used == state->fc_regions_size) { struct ext4_fc_alloc_region *fc_regions; fc_regions = krealloc(state->fc_regions, sizeof(struct ext4_fc_alloc_region) * (state->fc_regions_size + EXT4_FC_REPLAY_REALLOC_INCREMENT), GFP_KERNEL); if (!fc_regions) return -ENOMEM; state->fc_regions_size += EXT4_FC_REPLAY_REALLOC_INCREMENT; state->fc_regions = fc_regions; } region = &state->fc_regions[state->fc_regions_used++]; region->ino = ino; region->lblk = lblk; region->pblk = pblk; region->len = len; if (replay) state->fc_regions_valid++; return 0; } /* Replay add range tag */ static int ext4_fc_replay_add_range(struct super_block *sb, u8 *val) { struct ext4_fc_add_range fc_add_ex; struct ext4_extent newex, *ex; struct inode *inode; ext4_lblk_t start, cur; int remaining, len; ext4_fsblk_t start_pblk; struct ext4_map_blocks map; struct ext4_ext_path *path = NULL; int ret; memcpy(&fc_add_ex, val, sizeof(fc_add_ex)); ex = (struct ext4_extent *)&fc_add_ex.fc_ex; trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE, le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block), ext4_ext_get_actual_len(ex)); inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode not found."); return 0; } ret = ext4_fc_record_modified_inode(sb, inode->i_ino); if (ret) goto out; start = le32_to_cpu(ex->ee_block); start_pblk = ext4_ext_pblock(ex); len = ext4_ext_get_actual_len(ex); cur = start; remaining = len; ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %llu\n", start, start_pblk, len, ext4_ext_is_unwritten(ex), inode->i_ino); while (remaining > 0) { map.m_lblk = cur; map.m_len = remaining; map.m_pblk = 0; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) goto out; if (ret == 0) { /* Range is not mapped */ path = ext4_find_extent(inode, cur, path, 0); if (IS_ERR(path)) goto out; memset(&newex, 0, sizeof(newex)); newex.ee_block = cpu_to_le32(cur); ext4_ext_store_pblock( &newex, start_pblk + cur - start); newex.ee_len = cpu_to_le16(map.m_len); if (ext4_ext_is_unwritten(ex)) ext4_ext_mark_unwritten(&newex); down_write(&EXT4_I(inode)->i_data_sem); path = ext4_ext_insert_extent(NULL, inode, path, &newex, 0); up_write((&EXT4_I(inode)->i_data_sem)); if (IS_ERR(path)) goto out; goto next; } if (start_pblk + cur - start != map.m_pblk) { /* * Logical to physical mapping changed. This can happen * if this range was removed and then reallocated to * map to new physical blocks during a fast commit. */ ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, ext4_ext_is_unwritten(ex), start_pblk + cur - start); if (ret) goto out; /* * Mark the old blocks as free since they aren't used * anymore. We maintain an array of all the modified * inodes. In case these blocks are still used at either * a different logical range in the same inode or in * some different inode, we will mark them as allocated * at the end of the FC replay using our array of * modified inodes. */ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); goto next; } /* Range is mapped and needs a state change */ ext4_debug("Converting from %ld to %d %lld", map.m_flags & EXT4_MAP_UNWRITTEN, ext4_ext_is_unwritten(ex), map.m_pblk); ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, ext4_ext_is_unwritten(ex), map.m_pblk); if (ret) goto out; /* * We may have split the extent tree while toggling the state. * Try to shrink the extent tree now. 
*/ ext4_ext_replay_shrink_inode(inode, start + len); next: cur += map.m_len; remaining -= map.m_len; } ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> sb->s_blocksize_bits); out: ext4_free_ext_path(path); iput(inode); return 0; } /* Replay DEL_RANGE tag */ static int ext4_fc_replay_del_range(struct super_block *sb, u8 *val) { struct inode *inode; struct ext4_fc_del_range lrange; struct ext4_map_blocks map; ext4_lblk_t cur, remaining; int ret; memcpy(&lrange, val, sizeof(lrange)); cur = le32_to_cpu(lrange.fc_lblk); remaining = le32_to_cpu(lrange.fc_len); trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE, le32_to_cpu(lrange.fc_ino), cur, remaining); inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino)); return 0; } ret = ext4_fc_record_modified_inode(sb, inode->i_ino); if (ret) goto out; ext4_debug("DEL_RANGE, inode %llu, lblk %d, len %d\n", inode->i_ino, le32_to_cpu(lrange.fc_lblk), le32_to_cpu(lrange.fc_len)); while (remaining > 0) { map.m_lblk = cur; map.m_len = remaining; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) goto out; if (ret > 0) { remaining -= ret; cur += ret; ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); } else { remaining -= map.m_len; cur += map.m_len; } } down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk), le32_to_cpu(lrange.fc_lblk) + le32_to_cpu(lrange.fc_len) - 1); up_write(&EXT4_I(inode)->i_data_sem); if (ret) goto out; ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> sb->s_blocksize_bits); ext4_mark_inode_dirty(NULL, inode); out: iput(inode); return 0; } static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) { struct ext4_fc_replay_state *state; struct inode *inode; struct ext4_ext_path *path = NULL; struct ext4_map_blocks map; int i, ret, j; ext4_lblk_t cur, end; state = &EXT4_SB(sb)->s_fc_replay_state; for (i = 0; i < state->fc_modified_inodes_used; i++) { inode = ext4_iget(sb, state->fc_modified_inodes[i], EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode %d not found.", state->fc_modified_inodes[i]); continue; } cur = 0; end = EXT_MAX_BLOCKS; if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) { iput(inode); continue; } while (cur < end) { map.m_lblk = cur; map.m_len = end - cur; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) break; if (ret > 0) { path = ext4_find_extent(inode, map.m_lblk, path, 0); if (!IS_ERR(path)) { for (j = 0; j < path->p_depth; j++) ext4_mb_mark_bb(inode->i_sb, path[j].p_block, 1, true); } else { path = NULL; } cur += ret; ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, true); } else { cur = cur + (map.m_len ? map.m_len : 1); } } iput(inode); } ext4_free_ext_path(path); } /* * Check if block is in excluded regions for block allocation. The simple * allocator that runs during replay phase is calls this function to see * if it is okay to use a block. 
*/ bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk) { int i; struct ext4_fc_replay_state *state; state = &EXT4_SB(sb)->s_fc_replay_state; for (i = 0; i < state->fc_regions_valid; i++) { if (state->fc_regions[i].ino == 0 || state->fc_regions[i].len == 0) continue; if (in_range(blk, state->fc_regions[i].pblk, state->fc_regions[i].len)) return true; } return false; } /* Cleanup function called after replay */ void ext4_fc_replay_cleanup(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); sbi->s_mount_state &= ~EXT4_FC_REPLAY; kfree(sbi->s_fc_replay_state.fc_regions); kfree(sbi->s_fc_replay_state.fc_modified_inodes); } static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi, int tag, int len) { switch (tag) { case EXT4_FC_TAG_ADD_RANGE: return len == sizeof(struct ext4_fc_add_range); case EXT4_FC_TAG_DEL_RANGE: return len == sizeof(struct ext4_fc_del_range); case EXT4_FC_TAG_CREAT: case EXT4_FC_TAG_LINK: case EXT4_FC_TAG_UNLINK: len -= sizeof(struct ext4_fc_dentry_info); return len >= 1 && len <= EXT4_NAME_LEN; case EXT4_FC_TAG_INODE: len -= sizeof(struct ext4_fc_inode); return len >= EXT4_GOOD_OLD_INODE_SIZE && len <= sbi->s_inode_size; case EXT4_FC_TAG_PAD: return true; /* padding can have any length */ case EXT4_FC_TAG_TAIL: return len >= sizeof(struct ext4_fc_tail); case EXT4_FC_TAG_HEAD: return len == sizeof(struct ext4_fc_head); } return false; } /* * Recovery Scan phase handler * * This function is called during the scan phase and is responsible * for doing following things: * - Make sure the fast commit area has valid tags for replay * - Count number of tags that need to be replayed by the replay handler * - Verify CRC * - Create a list of excluded blocks for allocation during replay phase * * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP * to indicate that scan has finished and JBD2 can now start replay phase. * It returns a negative error to indicate that there was an error. At the end * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set * to indicate the number of tags that need to replayed during the replay phase. */ static int ext4_fc_replay_scan(journal_t *journal, struct buffer_head *bh, int off, tid_t expected_tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_replay_state *state; int ret = JBD2_FC_REPLAY_CONTINUE; struct ext4_fc_add_range ext; struct ext4_fc_tl_mem tl; struct ext4_fc_tail tail; __u8 *start, *end, *cur, *val; struct ext4_fc_head head; struct ext4_extent *ex; state = &sbi->s_fc_replay_state; start = (u8 *)bh->b_data; end = start + journal->j_blocksize; if (state->fc_replay_expected_off == 0) { state->fc_cur_tag = 0; state->fc_replay_num_tags = 0; state->fc_crc = 0; state->fc_regions = NULL; state->fc_regions_valid = state->fc_regions_used = state->fc_regions_size = 0; /* Check if we can stop early */ if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag) != EXT4_FC_TAG_HEAD) return 0; } if (off != state->fc_replay_expected_off) { ret = -EFSCORRUPTED; goto out_err; } state->fc_replay_expected_off++; for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { ext4_fc_get_tl(&tl, cur); val = cur + EXT4_FC_TAG_BASE_LEN; if (tl.fc_len > end - val || !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) { ret = state->fc_replay_num_tags ? 
JBD2_FC_REPLAY_STOP : -ECANCELED; goto out_err; } ext4_debug("Scan phase, tag:%s, blk %lld\n", tag2str(tl.fc_tag), bh->b_blocknr); switch (tl.fc_tag) { case EXT4_FC_TAG_ADD_RANGE: memcpy(&ext, val, sizeof(ext)); ex = (struct ext4_extent *)&ext.fc_ex; ret = ext4_fc_record_regions(sb, le32_to_cpu(ext.fc_ino), le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), ext4_ext_get_actual_len(ex), 0); if (ret < 0) break; ret = JBD2_FC_REPLAY_CONTINUE; fallthrough; case EXT4_FC_TAG_DEL_RANGE: case EXT4_FC_TAG_LINK: case EXT4_FC_TAG_UNLINK: case EXT4_FC_TAG_CREAT: case EXT4_FC_TAG_INODE: case EXT4_FC_TAG_PAD: state->fc_cur_tag++; state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; case EXT4_FC_TAG_TAIL: state->fc_cur_tag++; memcpy(&tail, val, sizeof(tail)); state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + offsetof(struct ext4_fc_tail, fc_crc)); if (le32_to_cpu(tail.fc_tid) == expected_tid && le32_to_cpu(tail.fc_crc) == state->fc_crc) { state->fc_replay_num_tags = state->fc_cur_tag; state->fc_regions_valid = state->fc_regions_used; } else { ret = state->fc_replay_num_tags ? JBD2_FC_REPLAY_STOP : -EFSBADCRC; } state->fc_crc = 0; break; case EXT4_FC_TAG_HEAD: memcpy(&head, val, sizeof(head)); if (le32_to_cpu(head.fc_features) & ~EXT4_FC_SUPPORTED_FEATURES) { ret = -EOPNOTSUPP; break; } if (le32_to_cpu(head.fc_tid) != expected_tid) { ret = JBD2_FC_REPLAY_STOP; break; } state->fc_cur_tag++; state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; default: ret = state->fc_replay_num_tags ? JBD2_FC_REPLAY_STOP : -ECANCELED; } if (ret < 0 || ret == JBD2_FC_REPLAY_STOP) break; } out_err: trace_ext4_fc_replay_scan(sb, ret, off); return ret; } /* * Main recovery path entry point. * The meaning of return codes is similar as above. 
*/ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, enum passtype pass, int off, tid_t expected_tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_tl_mem tl; __u8 *start, *end, *cur, *val; int ret = JBD2_FC_REPLAY_CONTINUE; struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state; struct ext4_fc_tail tail; if (pass == PASS_SCAN) { state->fc_current_pass = PASS_SCAN; return ext4_fc_replay_scan(journal, bh, off, expected_tid); } if (state->fc_current_pass != pass) { state->fc_current_pass = pass; sbi->s_mount_state |= EXT4_FC_REPLAY; } if (!sbi->s_fc_replay_state.fc_replay_num_tags) { ext4_debug("Replay stops\n"); ext4_fc_set_bitmaps_and_counters(sb); return 0; } #ifdef CONFIG_EXT4_DEBUG if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) { pr_warn("Dropping fc block %d because max_replay set\n", off); return JBD2_FC_REPLAY_STOP; } #endif start = (u8 *)bh->b_data; end = start + journal->j_blocksize; for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { ext4_fc_get_tl(&tl, cur); val = cur + EXT4_FC_TAG_BASE_LEN; if (state->fc_replay_num_tags == 0) { ret = JBD2_FC_REPLAY_STOP; ext4_fc_set_bitmaps_and_counters(sb); break; } ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag)); state->fc_replay_num_tags--; switch (tl.fc_tag) { case EXT4_FC_TAG_LINK: ret = ext4_fc_replay_link(sb, &tl, val); break; case EXT4_FC_TAG_UNLINK: ret = ext4_fc_replay_unlink(sb, &tl, val); break; case EXT4_FC_TAG_ADD_RANGE: ret = ext4_fc_replay_add_range(sb, val); break; case EXT4_FC_TAG_CREAT: ret = ext4_fc_replay_create(sb, &tl, val); break; case EXT4_FC_TAG_DEL_RANGE: ret = ext4_fc_replay_del_range(sb, val); break; case EXT4_FC_TAG_INODE: ret = ext4_fc_replay_inode(sb, &tl, val); break; case EXT4_FC_TAG_PAD: trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0, tl.fc_len, 0); break; case EXT4_FC_TAG_TAIL: trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0, tl.fc_len, 0); memcpy(&tail, val, sizeof(tail)); WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid); break; case EXT4_FC_TAG_HEAD: break; default: trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0); ret = -ECANCELED; break; } if (ret < 0) break; ret = JBD2_FC_REPLAY_CONTINUE; } return ret; } void ext4_fc_init(struct super_block *sb, journal_t *journal) { /* * We set replay callback even if fast commit disabled because we may * could still have fast commit blocks that need to be replayed even if * fast commit has now been turned off. 
*/ journal->j_fc_replay_callback = ext4_fc_replay; if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) return; journal->j_fc_cleanup_callback = ext4_fc_cleanup; } static const char * const fc_ineligible_reasons[] = { [EXT4_FC_REASON_XATTR] = "Extended attributes changed", [EXT4_FC_REASON_CROSS_RENAME] = "Cross rename", [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed", [EXT4_FC_REASON_NOMEM] = "Insufficient memory", [EXT4_FC_REASON_SWAP_BOOT] = "Swap boot", [EXT4_FC_REASON_RESIZE] = "Resize", [EXT4_FC_REASON_RENAME_DIR] = "Dir renamed", [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op", [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling", [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename", [EXT4_FC_REASON_MIGRATE] = "Inode format migration", [EXT4_FC_REASON_VERITY] = "fs-verity enable", [EXT4_FC_REASON_MOVE_EXT] = "Move extents", }; int ext4_fc_info_show(struct seq_file *seq, void *v) { struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private); struct ext4_fc_stats *stats = &sbi->s_fc_stats; int i; if (v != SEQ_START_TOKEN) return 0; seq_printf(seq, "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n", stats->fc_num_commits, stats->fc_ineligible_commits, stats->fc_numblks, div_u64(stats->s_fc_avg_commit_time, 1000)); seq_puts(seq, "Ineligible reasons:\n"); for (i = 0; i < EXT4_FC_REASON_MAX; i++) seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i], stats->fc_ineligible_reason_count[i]); return 0; } int __init ext4_fc_init_dentry_cache(void) { ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update, SLAB_RECLAIM_ACCOUNT); if (ext4_fc_dentry_cachep == NULL) return -ENOMEM; return 0; } void ext4_fc_destroy_dentry_cache(void) { kmem_cache_destroy(ext4_fc_dentry_cachep); } |
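/*
 * Editor's illustration (not part of the ext4 sources): both
 * ext4_fc_replay_scan() and ext4_fc_replay() above walk the fast commit
 * area as a tag-length-value (TLV) stream: read a fixed-size header,
 * check that the payload fits inside the block, dispatch on the tag,
 * then advance by header size plus payload length. The standalone sketch
 * below shows only that walk pattern; struct fc_tlv and the FC_TAG_*
 * values are hypothetical stand-ins, not the on-disk ext4 fast commit
 * format.
 */
#include <stdint.h>
#include <stddef.h>
#include <string.h>

struct fc_tlv {			/* hypothetical on-wire header */
	uint16_t tag;
	uint16_t len;		/* payload length, header excluded */
};

enum { FC_TAG_PAD = 1, FC_TAG_TAIL = 2 };

static int walk_tlvs(const uint8_t *start, size_t blocksize)
{
	const uint8_t *end = start + blocksize;
	const uint8_t *cur = start;
	struct fc_tlv tl;

	while (cur + sizeof(tl) <= end) {
		const uint8_t *val;

		memcpy(&tl, cur, sizeof(tl));	/* unaligned-safe read */
		val = cur + sizeof(tl);
		if (tl.len > (size_t)(end - val))
			return -1;		/* payload overruns the block */
		if (tl.tag == FC_TAG_TAIL)
			return 0;		/* tail terminates the area */
		/* other tags: payload would be handled here, then skipped */
		cur = val + tl.len;
	}
	return 0;
}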
/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/include/linux/hfsplus_fs.h * * Copyright (C) 1999 * Brad Boyer (flar@pants.nu) * (C) 2003 Ardis Technologies <roman@ardistech.com> * */ #ifndef _LINUX_HFSPLUS_FS_H #define _LINUX_HFSPLUS_FS_H #include <linux/fs.h> #include <linux/mutex.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/fs_context.h> #include "hfsplus_raw.h" /* Runtime config options */ #define HFSPLUS_DEF_CR_TYPE 0x3F3F3F3F /* '????'
*/ #define HFSPLUS_TYPE_DATA 0x00 #define HFSPLUS_TYPE_RSRC 0xFF typedef int (*btree_keycmp)(const hfsplus_btree_key *, const hfsplus_btree_key *); #define NODE_HASH_SIZE 256 /* B-tree mutex nested subclasses */ enum hfsplus_btree_mutex_classes { CATALOG_BTREE_MUTEX, EXTENTS_BTREE_MUTEX, ATTR_BTREE_MUTEX, }; /* An HFS+ BTree held in memory */ struct hfs_btree { struct super_block *sb; struct inode *inode; btree_keycmp keycmp; u32 cnid; u32 root; u32 leaf_count; u32 leaf_head; u32 leaf_tail; u32 node_count; u32 free_nodes; u32 attributes; unsigned int node_size; unsigned int node_size_shift; unsigned int max_key_len; unsigned int depth; struct mutex tree_lock; unsigned int pages_per_bnode; spinlock_t hash_lock; struct hfs_bnode *node_hash[NODE_HASH_SIZE]; int node_hash_cnt; }; struct page; /* An HFS+ BTree node in memory */ struct hfs_bnode { struct hfs_btree *tree; u32 prev; u32 this; u32 next; u32 parent; u16 num_recs; u8 type; u8 height; struct hfs_bnode *next_hash; unsigned long flags; wait_queue_head_t lock_wq; atomic_t refcnt; unsigned int page_offset; struct page *page[]; }; #define HFS_BNODE_LOCK 0 #define HFS_BNODE_ERROR 1 #define HFS_BNODE_NEW 2 #define HFS_BNODE_DIRTY 3 #define HFS_BNODE_DELETED 4 /* * Attributes file states */ #define HFSPLUS_EMPTY_ATTR_TREE 0 #define HFSPLUS_CREATING_ATTR_TREE 1 #define HFSPLUS_VALID_ATTR_TREE 2 #define HFSPLUS_FAILED_ATTR_TREE 3 /* * HFS+ superblock info (built from Volume Header on disk) */ struct hfsplus_vh; struct hfs_btree; struct hfsplus_sb_info { void *s_vhdr_buf; struct hfsplus_vh *s_vhdr; void *s_backup_vhdr_buf; struct hfsplus_vh *s_backup_vhdr; struct hfs_btree *ext_tree; struct hfs_btree *cat_tree; struct hfs_btree *attr_tree; atomic_t attr_tree_state; struct inode *alloc_file; struct inode *hidden_dir; struct nls_table *nls; /* Runtime variables */ u32 blockoffset; u32 min_io_size; sector_t part_start; sector_t sect_count; int fs_shift; /* immutable data from the volume header */ u32 alloc_blksz; int alloc_blksz_shift; u32 total_blocks; u32 data_clump_blocks, rsrc_clump_blocks; /* mutable data from the volume header, protected by alloc_mutex */ u32 free_blocks; struct mutex alloc_mutex; /* mutable data from the volume header, protected by vh_mutex */ u32 next_cnid; u32 file_count; u32 folder_count; struct mutex vh_mutex; /* Config options */ u32 creator; u32 type; umode_t umask; kuid_t uid; kgid_t gid; int part, session; unsigned long flags; int work_queued; /* non-zero delayed work is queued */ struct delayed_work sync_work; /* FS sync delayed work */ spinlock_t work_lock; /* protects sync_work and work_queued */ struct rcu_head rcu; }; #define HFSPLUS_SB_WRITEBACKUP 0 #define HFSPLUS_SB_NODECOMPOSE 1 #define HFSPLUS_SB_FORCE 2 #define HFSPLUS_SB_HFSX 3 #define HFSPLUS_SB_CASEFOLD 4 #define HFSPLUS_SB_NOBARRIER 5 #define HFSPLUS_SB_UID 6 #define HFSPLUS_SB_GID 7 static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb) { return sb->s_fs_info; } struct hfsplus_inode_info { atomic_t opencnt; /* * Extent allocation information, protected by extents_lock. */ u32 first_blocks; u32 clump_blocks; u32 alloc_blocks; u32 cached_start; u32 cached_blocks; hfsplus_extent_rec first_extents; hfsplus_extent_rec cached_extents; unsigned int extent_state; struct mutex extents_lock; /* * Immutable data. */ struct inode *rsrc_inode; __be32 create_date; /* * Protected by sbi->vh_mutex. */ u32 linkid; /* * Accessed using atomic bitops. */ unsigned long flags; /* * Protected by i_mutex. 
*/ sector_t fs_blocks; u8 userflags; /* BSD user file flags */ u32 subfolders; /* Subfolder count (HFSX only) */ struct list_head open_dir_list; spinlock_t open_dir_lock; loff_t phys_size; struct inode vfs_inode; }; #define HFSPLUS_EXT_DIRTY 0x0001 #define HFSPLUS_EXT_NEW 0x0002 #define HFSPLUS_I_RSRC 0 /* represents a resource fork */ #define HFSPLUS_I_CAT_DIRTY 1 /* has changes in the catalog tree */ #define HFSPLUS_I_EXT_DIRTY 2 /* has changes in the extent tree */ #define HFSPLUS_I_ALLOC_DIRTY 3 /* has changes in the allocation file */ #define HFSPLUS_I_ATTR_DIRTY 4 /* has changes in the attributes tree */ #define HFSPLUS_IS_RSRC(inode) \ test_bit(HFSPLUS_I_RSRC, &HFSPLUS_I(inode)->flags) static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode) { return container_of(inode, struct hfsplus_inode_info, vfs_inode); } #define HFSPLUS_CAT_TREE_I(sb) \ HFSPLUS_SB(sb)->cat_tree->inode #define HFSPLUS_EXT_TREE_I(sb) \ HFSPLUS_SB(sb)->ext_tree->inode #define HFSPLUS_ATTR_TREE_I(sb) \ HFSPLUS_SB(sb)->attr_tree->inode /* * Mark an inode dirty, and also mark the btree in which the * specific type of metadata is stored. * For data or metadata that gets written back into the catalog btree * by hfsplus_write_inode a plain mark_inode_dirty call is enough. */ static inline void hfsplus_mark_inode_dirty(struct inode *inode, unsigned int flag) { set_bit(flag, &HFSPLUS_I(inode)->flags); mark_inode_dirty(inode); } struct hfs_find_data { /* filled by caller */ hfsplus_btree_key *search_key; hfsplus_btree_key *key; /* filled by find */ struct hfs_btree *tree; struct hfs_bnode *bnode; /* filled by findrec */ int record; int keyoffset, keylength; int entryoffset, entrylength; }; struct hfsplus_readdir_data { struct list_head list; struct file *file; struct hfsplus_cat_key key; }; /* * Find minimum acceptable I/O size for an hfsplus sb.
*/ static inline unsigned short hfsplus_min_io_size(struct super_block *sb) { return max_t(unsigned short, HFSPLUS_SB(sb)->min_io_size, HFSPLUS_SECTOR_SIZE); } #define hfs_btree_open hfsplus_btree_open #define hfs_btree_close hfsplus_btree_close #define hfs_btree_write hfsplus_btree_write #define hfs_bmap_reserve hfsplus_bmap_reserve #define hfs_bmap_alloc hfsplus_bmap_alloc #define hfs_bmap_free hfsplus_bmap_free #define hfs_bnode_read hfsplus_bnode_read #define hfs_bnode_read_u16 hfsplus_bnode_read_u16 #define hfs_bnode_read_u8 hfsplus_bnode_read_u8 #define hfs_bnode_read_key hfsplus_bnode_read_key #define hfs_bnode_write hfsplus_bnode_write #define hfs_bnode_write_u16 hfsplus_bnode_write_u16 #define hfs_bnode_clear hfsplus_bnode_clear #define hfs_bnode_copy hfsplus_bnode_copy #define hfs_bnode_move hfsplus_bnode_move #define hfs_bnode_dump hfsplus_bnode_dump #define hfs_bnode_unlink hfsplus_bnode_unlink #define hfs_bnode_findhash hfsplus_bnode_findhash #define hfs_bnode_find hfsplus_bnode_find #define hfs_bnode_unhash hfsplus_bnode_unhash #define hfs_bnode_free hfsplus_bnode_free #define hfs_bnode_create hfsplus_bnode_create #define hfs_bnode_get hfsplus_bnode_get #define hfs_bnode_put hfsplus_bnode_put #define hfs_brec_lenoff hfsplus_brec_lenoff #define hfs_brec_keylen hfsplus_brec_keylen #define hfs_brec_insert hfsplus_brec_insert #define hfs_brec_remove hfsplus_brec_remove #define hfs_find_init hfsplus_find_init #define hfs_find_exit hfsplus_find_exit #define __hfs_brec_find __hfsplus_brec_find #define hfs_brec_find hfsplus_brec_find #define hfs_brec_read hfsplus_brec_read #define hfs_brec_goto hfsplus_brec_goto #define hfs_part_find hfsplus_part_find /* * hfs+-specific ioctl for making the filesystem bootable */ #define HFSPLUS_IOC_BLESS _IO('h', 0x80) typedef int (*search_strategy_t)(struct hfs_bnode *, struct hfs_find_data *, int *, int *, int *); /* * Functions in any *.c used in other files */ /* attributes.c */ int __init hfsplus_create_attr_tree_cache(void); void hfsplus_destroy_attr_tree_cache(void); int hfsplus_attr_bin_cmp_key(const hfsplus_btree_key *k1, const hfsplus_btree_key *k2); int hfsplus_attr_build_key(struct super_block *sb, hfsplus_btree_key *key, u32 cnid, const char *name); hfsplus_attr_entry *hfsplus_alloc_attr_entry(void); void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry); int hfsplus_find_attr(struct super_block *sb, u32 cnid, const char *name, struct hfs_find_data *fd); int hfsplus_attr_exists(struct inode *inode, const char *name); int hfsplus_create_attr(struct inode *inode, const char *name, const void *value, size_t size); int hfsplus_delete_attr(struct inode *inode, const char *name); int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid); int hfsplus_replace_attr(struct inode *inode, const char *name, const void *value, size_t size); /* bitmap.c */ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *max); int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count); /* btree.c */ u32 hfsplus_calc_btree_clump_size(u32 block_size, u32 node_size, u64 sectors, int file_id); struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id); void hfs_btree_close(struct hfs_btree *tree); int hfs_btree_write(struct hfs_btree *tree); int hfs_bmap_reserve(struct hfs_btree *tree, u32 rsvd_nodes); struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree); void hfs_bmap_free(struct hfs_bnode *node); /* bnode.c */ void hfs_bnode_read(struct hfs_bnode *node, void *buf, u32 off, u32 len); u16 hfs_bnode_read_u16(struct 
hfs_bnode *node, u32 off); u8 hfs_bnode_read_u8(struct hfs_bnode *node, u32 off); void hfs_bnode_read_key(struct hfs_bnode *node, void *key, u32 off); void hfs_bnode_write(struct hfs_bnode *node, void *buf, u32 off, u32 len); void hfs_bnode_write_u16(struct hfs_bnode *node, u32 off, u16 data); void hfs_bnode_clear(struct hfs_bnode *node, u32 off, u32 len); void hfs_bnode_copy(struct hfs_bnode *dst_node, u32 dst, struct hfs_bnode *src_node, u32 src, u32 len); void hfs_bnode_move(struct hfs_bnode *node, u32 dst, u32 src, u32 len); void hfs_bnode_dump(struct hfs_bnode *node); void hfs_bnode_unlink(struct hfs_bnode *node); struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid); void hfs_bnode_unhash(struct hfs_bnode *node); struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num); void hfs_bnode_free(struct hfs_bnode *node); struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num); void hfs_bnode_get(struct hfs_bnode *node); void hfs_bnode_put(struct hfs_bnode *node); bool hfs_bnode_need_zeroout(struct hfs_btree *tree); /* brec.c */ u16 hfs_brec_lenoff(struct hfs_bnode *node, u16 rec, u16 *off); u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec); int hfs_brec_insert(struct hfs_find_data *fd, void *entry, u32 entry_len); int hfs_brec_remove(struct hfs_find_data *fd); /* bfind.c */ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd); void hfs_find_exit(struct hfs_find_data *fd); int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode, struct hfs_find_data *fd, int *begin, int *end, int *cur_rec); int hfs_find_rec_by_key(struct hfs_bnode *bnode, struct hfs_find_data *fd, int *begin, int *end, int *cur_rec); int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd, search_strategy_t rec_found); int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare); int hfs_brec_read(struct hfs_find_data *fd, void *rec, u32 rec_len); int hfs_brec_goto(struct hfs_find_data *fd, int cnt); /* catalog.c */ int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *k1, const hfsplus_btree_key *k2); int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *k1, const hfsplus_btree_key *k2); int hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key, u32 parent, const struct qstr *str); void hfsplus_cat_build_key_with_cnid(struct super_block *sb, hfsplus_btree_key *key, u32 parent); void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms); int hfsplus_find_cat(struct super_block *sb, u32 cnid, struct hfs_find_data *fd); int hfsplus_create_cat(u32 cnid, struct inode *dir, const struct qstr *str, struct inode *inode); int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str); int hfsplus_rename_cat(u32 cnid, struct inode *src_dir, const struct qstr *src_name, struct inode *dst_dir, const struct qstr *dst_name); /* dir.c */ extern const struct inode_operations hfsplus_dir_inode_operations; extern const struct file_operations hfsplus_dir_operations; /* extents.c */ int hfsplus_ext_cmp_key(const hfsplus_btree_key *k1, const hfsplus_btree_key *k2); int hfsplus_ext_write_extent(struct inode *inode); int hfsplus_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw *fork, int type); int hfsplus_file_extend(struct inode *inode, bool zeroout); void hfsplus_file_truncate(struct inode *inode); /* inode.c */ extern const struct address_space_operations hfsplus_aops; extern const struct 
address_space_operations hfsplus_btree_aops; extern const struct dentry_operations hfsplus_dentry_operations; int hfsplus_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata); struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir, umode_t mode); void hfsplus_delete_inode(struct inode *inode); void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork); void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork); int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd); int hfsplus_cat_write_inode(struct inode *inode); int hfsplus_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); int hfsplus_fileattr_get(struct dentry *dentry, struct file_kattr *fa); int hfsplus_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct file_kattr *fa); /* ioctl.c */ long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); /* options.c */ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts); int hfsplus_parse_param(struct fs_context *fc, struct fs_parameter *param); int hfsplus_show_options(struct seq_file *seq, struct dentry *root); /* part_tbl.c */ int hfs_part_find(struct super_block *sb, sector_t *part_start, sector_t *part_size); /* super.c */ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino); void hfsplus_mark_mdb_dirty(struct super_block *sb); void hfsplus_prepare_volume_header_for_commit(struct hfsplus_vh *vhdr); int hfsplus_commit_superblock(struct super_block *sb); /* tables.c */ extern u16 hfsplus_case_fold_table[]; extern u16 hfsplus_decompose_table[]; extern u16 hfsplus_compose_table[]; /* unicode.c */ int hfsplus_strcasecmp(const struct hfsplus_unistr *s1, const struct hfsplus_unistr *s2); int hfsplus_strcmp(const struct hfsplus_unistr *s1, const struct hfsplus_unistr *s2); int hfsplus_uni2asc_str(struct super_block *sb, const struct hfsplus_unistr *ustr, char *astr, int *len_p); int hfsplus_uni2asc_xattr_str(struct super_block *sb, const struct hfsplus_attr_unistr *ustr, char *astr, int *len_p); int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, int max_unistr_len, const char *astr, int len, int name_type); int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str); int hfsplus_compare_dentry(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); /* wrapper.c */ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, void *buf, void **data, blk_opf_t opf); int hfsplus_read_wrapper(struct super_block *sb); static inline u32 hfsplus_cat_thread_size(const struct hfsplus_cat_thread *thread) { return offsetof(struct hfsplus_cat_thread, nodeName) + offsetof(struct hfsplus_unistr, unicode) + be16_to_cpu(thread->nodeName.length) * sizeof(hfsplus_unichr); } int hfsplus_brec_read_cat(struct hfs_find_data *fd, hfsplus_cat_entry *entry); /* * time helpers: convert between 1904-base and 1970-base timestamps * * HFS+ implementations are highly inconsistent, this one matches the * traditional behavior of 64-bit Linux, giving the most useful * time range between 1970 and 2106, by treating any on-disk timestamp * under HFSPLUS_UTC_OFFSET (Jan 1 1970) as a time between 2040 and 2106. 
*/ #define HFSPLUS_UTC_OFFSET 2082844800U static inline time64_t __hfsp_mt2ut(__be32 mt) { time64_t ut = (u32)(be32_to_cpu(mt) - HFSPLUS_UTC_OFFSET); return ut; } static inline __be32 __hfsp_ut2mt(time64_t ut) { return cpu_to_be32(lower_32_bits(ut) + HFSPLUS_UTC_OFFSET); } static inline enum hfsplus_btree_mutex_classes hfsplus_btree_lock_class(struct hfs_btree *tree) { enum hfsplus_btree_mutex_classes class; switch (tree->cnid) { case HFSPLUS_CAT_CNID: class = CATALOG_BTREE_MUTEX; break; case HFSPLUS_EXT_CNID: class = EXTENTS_BTREE_MUTEX; break; case HFSPLUS_ATTR_CNID: class = ATTR_BTREE_MUTEX; break; default: BUG(); } return class; } static inline bool is_bnode_offset_valid(struct hfs_bnode *node, u32 off) { bool is_valid; if (!node || !node->tree) return false; is_valid = off < node->tree->node_size; if (!is_valid) { pr_err("requested invalid offset: " "NODE: id %u, type %#x, height %u, " "node_size %u, offset %u\n", node->this, node->type, node->height, node->tree->node_size, off); } return is_valid; } static inline u32 check_and_correct_requested_length(struct hfs_bnode *node, u32 off, u32 len) { unsigned int node_size; if (!is_bnode_offset_valid(node, off)) return 0; node_size = node->tree->node_size; if ((off + len) > node_size) { u32 new_len = node_size - off; pr_err("requested length has been corrected: " "NODE: id %u, type %#x, height %u, " "node_size %u, offset %u, " "requested_len %u, corrected_len %u\n", node->this, node->type, node->height, node->tree->node_size, off, len, new_len); return new_len; } return len; } /* compatibility */ #define hfsp_mt2ut(t) (struct timespec64){ .tv_sec = __hfsp_mt2ut(t) } #define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec) #define hfsp_now2mt() __hfsp_ut2mt(ktime_get_real_seconds()) #endif |
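/*
 * Editor's note (not part of the hfsplus header): the time helpers above
 * shift between the HFS+ 1904 epoch and the Unix 1970 epoch with plain
 * unsigned 32-bit arithmetic, so on-disk values smaller than
 * HFSPLUS_UTC_OFFSET wrap around into 2040..2106 instead of going
 * negative. The standalone snippet below mirrors __hfsp_mt2ut() with
 * userspace types to make that wraparound visible; mt2ut() is an
 * illustrative name, and the byte-swap of the big-endian on-disk value
 * is omitted.
 */
#include <stdint.h>
#include <stdio.h>

#define UTC_OFFSET 2082844800U	/* seconds from 1904-01-01 to 1970-01-01 */

static int64_t mt2ut(uint32_t mac_time)	/* mirrors __hfsp_mt2ut() */
{
	return (int64_t)(uint32_t)(mac_time - UTC_OFFSET);
}

int main(void)
{
	printf("%lld\n", (long long)mt2ut(UTC_OFFSET));		/* 0, i.e. 1970-01-01 */
	printf("%lld\n", (long long)mt2ut(0));			/* 2212122496, ~year 2040 */
	printf("%lld\n", (long long)mt2ut(UTC_OFFSET - 1));	/* 4294967295, ~year 2106 */
	return 0;
}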
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (C) 2001 Momchil Velikov * Portions Copyright (C) 2001 Christoph Hellwig * Copyright (C) 2006 Nick Piggin * Copyright (C) 2012 Konstantin Khlebnikov */ #ifndef _LINUX_RADIX_TREE_H #define _LINUX_RADIX_TREE_H #include <linux/bitops.h> #include <linux/gfp_types.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/math.h> #include <linux/percpu.h> #include <linux/preempt.h> #include <linux/rcupdate.h> #include <linux/spinlock.h> #include <linux/types.h> #include <linux/xarray.h> #include <linux/local_lock.h> /* Keep unconverted code working */ #define radix_tree_root xarray #define radix_tree_node xa_node struct radix_tree_preload { local_lock_t lock; unsigned nr; /* nodes->parent points to next preallocated node */ struct radix_tree_node *nodes; }; DECLARE_PER_CPU(struct radix_tree_preload, radix_tree_preloads); /* * The bottom two bits of the slot determine how the remaining bits in the * slot are interpreted: * * 00 - data pointer * 10 - internal entry * x1 - value entry * * The internal entry may be a pointer to the next level in the tree, a * sibling entry, or an indicator that the entry in this slot has been moved * to another location in the tree and the lookup should be restarted. While * NULL fits the 'data pointer' pattern, it means that there is no entry in * the tree for this index (no matter what level of the tree it is found at). * This means that storing a NULL entry in the tree is the same as deleting * the entry from the tree.
*/ #define RADIX_TREE_ENTRY_MASK 3UL #define RADIX_TREE_INTERNAL_NODE 2UL static inline bool radix_tree_is_internal_node(void *ptr) { return ((unsigned long)ptr & RADIX_TREE_ENTRY_MASK) == RADIX_TREE_INTERNAL_NODE; } /*** radix-tree API starts here ***/ #define RADIX_TREE_MAP_SHIFT XA_CHUNK_SHIFT #define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) #define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) #define RADIX_TREE_MAX_TAGS XA_MAX_MARKS #define RADIX_TREE_TAG_LONGS XA_MARK_LONGS #define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ RADIX_TREE_MAP_SHIFT)) /* The IDR tag is stored in the low bits of xa_flags */ #define ROOT_IS_IDR ((__force gfp_t)4) /* The top bits of xa_flags are used to store the root tags */ #define ROOT_TAG_SHIFT (__GFP_BITS_SHIFT) #define RADIX_TREE_INIT(name, mask) XARRAY_INIT(name, mask) #define RADIX_TREE(name, mask) \ struct radix_tree_root name = RADIX_TREE_INIT(name, mask) #define INIT_RADIX_TREE(root, mask) xa_init_flags(root, mask) static inline bool radix_tree_empty(const struct radix_tree_root *root) { return root->xa_head == NULL; } /** * struct radix_tree_iter - radix tree iterator state * * @index: index of current slot * @next_index: one beyond the last index for this chunk * @tags: bit-mask for tag-iterating * @node: node that contains current slot * * This radix tree iterator works in terms of "chunks" of slots. A chunk is a * subinterval of slots contained within one radix tree leaf node. It is * described by a pointer to its first slot and a struct radix_tree_iter * which holds the chunk's position in the tree and its size. For tagged * iteration radix_tree_iter also holds the slots' bit-mask for one chosen * radix tree tag. */ struct radix_tree_iter { unsigned long index; unsigned long next_index; unsigned long tags; struct radix_tree_node *node; }; /** * Radix-tree synchronization * * The radix-tree API requires that users provide all synchronisation (with * specific exceptions, noted below). * * Synchronization of access to the data items being stored in the tree, and * management of their lifetimes must be completely managed by API users. * * For API usage, in general, * - any function _modifying_ the tree or tags (inserting or deleting * items, setting or clearing tags) must exclude other modifications, and * exclude any functions reading the tree. * - any function _reading_ the tree or tags (looking up items or tags, * gang lookups) must exclude modifications to the tree, but may occur * concurrently with other readers. * * The notable exceptions to this rule are the following functions: * __radix_tree_lookup * radix_tree_lookup * radix_tree_lookup_slot * radix_tree_tag_get * radix_tree_gang_lookup * radix_tree_gang_lookup_tag * radix_tree_gang_lookup_tag_slot * radix_tree_tagged * * The first 7 functions are able to be called locklessly, using RCU. The * caller must ensure calls to these functions are made within rcu_read_lock() * regions. Other readers (lock-free or otherwise) and modifications may be * running concurrently. * * It is still required that the caller manage the synchronization and lifetimes * of the items. So if RCU lock-free lookups are used, typically this would mean * that the items have their own locks, or are amenable to lock-free access; and * that the items are freed by RCU (or only freed after having been deleted from * the radix tree *and* a synchronize_rcu() grace period). 
* * (Note, rcu_assign_pointer and rcu_dereference are not needed to control * access to data items when inserting into or looking up from the radix tree) * * Note that the value returned by radix_tree_tag_get() may not be relied upon * if only the RCU read lock is held. Functions to set/clear tags and to * delete nodes running concurrently with it may affect its result such that * two consecutive reads in the same locked section may return different * values. If reliability is required, modification functions must also be * excluded from concurrency. * * radix_tree_tagged is able to be called without locking or RCU. */ /** * radix_tree_deref_slot - dereference a slot * @slot: slot pointer, returned by radix_tree_lookup_slot * * For use with radix_tree_lookup_slot(). Caller must hold tree at least read * locked across slot lookup and dereference. Not required if write lock is * held (ie. items cannot be concurrently inserted). * * radix_tree_deref_retry must be used to confirm validity of the pointer if * only the read lock is held. * * Return: entry stored in that slot. */ static inline void *radix_tree_deref_slot(void __rcu **slot) { return rcu_dereference(*slot); } /** * radix_tree_deref_slot_protected - dereference a slot with tree lock held * @slot: slot pointer, returned by radix_tree_lookup_slot * * Similar to radix_tree_deref_slot. The caller does not hold the RCU read * lock but it must hold the tree lock to prevent parallel updates. * * Return: entry stored in that slot. */ static inline void *radix_tree_deref_slot_protected(void __rcu **slot, spinlock_t *treelock) { return rcu_dereference_protected(*slot, lockdep_is_held(treelock)); } /** * radix_tree_deref_retry - check radix_tree_deref_slot * @arg: pointer returned by radix_tree_deref_slot * Returns: 0 if retry is not required, otherwise retry is required * * radix_tree_deref_retry must be used with radix_tree_deref_slot. */ static inline int radix_tree_deref_retry(void *arg) { return unlikely(radix_tree_is_internal_node(arg)); } /** * radix_tree_exception - radix_tree_deref_slot returned either exception? * @arg: value returned by radix_tree_deref_slot * Returns: 0 if well-aligned pointer, non-0 if either kind of exception. 
*/ static inline int radix_tree_exception(void *arg) { return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK); } int radix_tree_insert(struct radix_tree_root *, unsigned long index, void *); void *__radix_tree_lookup(const struct radix_tree_root *, unsigned long index, struct radix_tree_node **nodep, void __rcu ***slotp); void *radix_tree_lookup(const struct radix_tree_root *, unsigned long); void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *, unsigned long index); void __radix_tree_replace(struct radix_tree_root *, struct radix_tree_node *, void __rcu **slot, void *entry); void radix_tree_iter_replace(struct radix_tree_root *, const struct radix_tree_iter *, void __rcu **slot, void *entry); void radix_tree_replace_slot(struct radix_tree_root *, void __rcu **slot, void *entry); void radix_tree_iter_delete(struct radix_tree_root *, struct radix_tree_iter *iter, void __rcu **slot); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); void *radix_tree_delete(struct radix_tree_root *, unsigned long); unsigned int radix_tree_gang_lookup(const struct radix_tree_root *, void **results, unsigned long first_index, unsigned int max_items); int radix_tree_preload(gfp_t gfp_mask); int radix_tree_maybe_preload(gfp_t gfp_mask); void radix_tree_init(void); void *radix_tree_tag_set(struct radix_tree_root *, unsigned long index, unsigned int tag); void *radix_tree_tag_clear(struct radix_tree_root *, unsigned long index, unsigned int tag); int radix_tree_tag_get(const struct radix_tree_root *, unsigned long index, unsigned int tag); void radix_tree_iter_tag_clear(struct radix_tree_root *, const struct radix_tree_iter *iter, unsigned int tag); unsigned int radix_tree_gang_lookup_tag(const struct radix_tree_root *, void **results, unsigned long first_index, unsigned int max_items, unsigned int tag); unsigned int radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *, void __rcu ***results, unsigned long first_index, unsigned int max_items, unsigned int tag); int radix_tree_tagged(const struct radix_tree_root *, unsigned int tag); static inline void radix_tree_preload_end(void) { local_unlock(&radix_tree_preloads.lock); } void __rcu **idr_get_free(struct radix_tree_root *root, struct radix_tree_iter *iter, gfp_t gfp, unsigned long max); enum { RADIX_TREE_ITER_TAG_MASK = 0x0f, /* tag index in lower nybble */ RADIX_TREE_ITER_TAGGED = 0x10, /* lookup tagged slots */ RADIX_TREE_ITER_CONTIG = 0x20, /* stop at first hole */ }; /** * radix_tree_iter_init - initialize radix tree iterator * * @iter: pointer to iterator state * @start: iteration starting index * Returns: NULL */ static __always_inline void __rcu ** radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start) { /* * Leave iter->tags uninitialized. radix_tree_next_chunk() will fill it * in the case of a successful tagged chunk lookup. If the lookup was * unsuccessful or non-tagged then nobody cares about ->tags. * * Set index to zero to bypass next_index overflow protection. * See the comment in radix_tree_next_chunk() for details. */ iter->index = 0; iter->next_index = start; return NULL; } /** * radix_tree_next_chunk - find next chunk of slots for iteration * * @root: radix tree root * @iter: iterator state * @flags: RADIX_TREE_ITER_* flags and tag index * Returns: pointer to chunk first slot, or NULL if there no more left * * This function looks up the next chunk in the radix tree starting from * @iter->next_index. It returns a pointer to the chunk's first slot. 
* Also it fills @iter with data about chunk: position in the tree (index), * its end (next_index), and constructs a bit mask for tagged iterating (tags). */ void __rcu **radix_tree_next_chunk(const struct radix_tree_root *, struct radix_tree_iter *iter, unsigned flags); /** * radix_tree_iter_lookup - look up an index in the radix tree * @root: radix tree root * @iter: iterator state * @index: key to look up * * If @index is present in the radix tree, this function returns the slot * containing it and updates @iter to describe the entry. If @index is not * present, it returns NULL. */ static inline void __rcu ** radix_tree_iter_lookup(const struct radix_tree_root *root, struct radix_tree_iter *iter, unsigned long index) { radix_tree_iter_init(iter, index); return radix_tree_next_chunk(root, iter, RADIX_TREE_ITER_CONTIG); } /** * radix_tree_iter_retry - retry this chunk of the iteration * @iter: iterator state * * If we iterate over a tree protected only by the RCU lock, a race * against deletion or creation may result in seeing a slot for which * radix_tree_deref_retry() returns true. If so, call this function * and continue the iteration. */ static inline __must_check void __rcu **radix_tree_iter_retry(struct radix_tree_iter *iter) { iter->next_index = iter->index; iter->tags = 0; return NULL; } static inline unsigned long __radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots) { return iter->index + slots; } /** * radix_tree_iter_resume - resume iterating when the chunk may be invalid * @slot: pointer to current slot * @iter: iterator state * Returns: New slot pointer * * If the iterator needs to release then reacquire a lock, the chunk may * have been invalidated by an insertion or deletion. Call this function * before releasing the lock to continue the iteration from the next index. */ void __rcu **__must_check radix_tree_iter_resume(void __rcu **slot, struct radix_tree_iter *iter); /** * radix_tree_chunk_size - get current chunk size * * @iter: pointer to radix tree iterator * Returns: current chunk size */ static __always_inline long radix_tree_chunk_size(struct radix_tree_iter *iter) { return iter->next_index - iter->index; } /** * radix_tree_next_slot - find next slot in chunk * * @slot: pointer to current slot * @iter: pointer to iterator state * @flags: RADIX_TREE_ITER_*, should be constant * Returns: pointer to next slot, or NULL if there no more left * * This function updates @iter->index in the case of a successful lookup. * For tagged lookup it also eats @iter->tags. * * There are several cases where 'slot' can be passed in as NULL to this * function. These cases result from the use of radix_tree_iter_resume() or * radix_tree_iter_retry(). In these cases we don't end up dereferencing * 'slot' because either: * a) we are doing tagged iteration and iter->tags has been set to 0, or * b) we are doing non-tagged iteration, and iter->index and iter->next_index * have been set up so that radix_tree_chunk_size() returns 1 or 0. 
*/ static __always_inline void __rcu **radix_tree_next_slot(void __rcu **slot, struct radix_tree_iter *iter, unsigned flags) { if (flags & RADIX_TREE_ITER_TAGGED) { iter->tags >>= 1; if (unlikely(!iter->tags)) return NULL; if (likely(iter->tags & 1ul)) { iter->index = __radix_tree_iter_add(iter, 1); slot++; goto found; } if (!(flags & RADIX_TREE_ITER_CONTIG)) { unsigned offset = __ffs(iter->tags); iter->tags >>= offset++; iter->index = __radix_tree_iter_add(iter, offset); slot += offset; goto found; } } else { long count = radix_tree_chunk_size(iter); while (--count > 0) { slot++; iter->index = __radix_tree_iter_add(iter, 1); if (likely(*slot)) goto found; if (flags & RADIX_TREE_ITER_CONTIG) { /* forbid switching to the next chunk */ iter->next_index = 0; break; } } } return NULL; found: return slot; } /** * radix_tree_for_each_slot - iterate over non-empty slots * * @slot: the void** variable for pointer to slot * @root: the struct radix_tree_root pointer * @iter: the struct radix_tree_iter pointer * @start: iteration starting index * * @slot points to radix tree slot, @iter->index contains its index. */ #define radix_tree_for_each_slot(slot, root, iter, start) \ for (slot = radix_tree_iter_init(iter, start) ; \ slot || (slot = radix_tree_next_chunk(root, iter, 0)) ; \ slot = radix_tree_next_slot(slot, iter, 0)) /** * radix_tree_for_each_tagged - iterate over tagged slots * * @slot: the void** variable for pointer to slot * @root: the struct radix_tree_root pointer * @iter: the struct radix_tree_iter pointer * @start: iteration starting index * @tag: tag index * * @slot points to radix tree slot, @iter->index contains its index. */ #define radix_tree_for_each_tagged(slot, root, iter, start, tag) \ for (slot = radix_tree_iter_init(iter, start) ; \ slot || (slot = radix_tree_next_chunk(root, iter, \ RADIX_TREE_ITER_TAGGED | tag)) ; \ slot = radix_tree_next_slot(slot, iter, \ RADIX_TREE_ITER_TAGGED | tag)) #endif /* _LINUX_RADIX_TREE_H */ |
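/*
 * Editor's illustration (not part of the radix tree header): a minimal
 * consumer of the locking rules documented above. Modifications are
 * serialized by the caller's own lock and preloaded outside it, while
 * lookups run under rcu_read_lock() and re-check every dereferenced slot
 * with radix_tree_deref_retry(). my_tree, my_lock and struct my_item are
 * illustrative names only.
 */
#include <linux/radix-tree.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>

struct my_item {
	unsigned long index;
};

static RADIX_TREE(my_tree, GFP_KERNEL);
static DEFINE_SPINLOCK(my_lock);

static int my_insert(unsigned long index, struct my_item *item)
{
	int err;

	err = radix_tree_preload(GFP_KERNEL);	/* preallocate nodes, may sleep */
	if (err)
		return err;
	spin_lock(&my_lock);
	err = radix_tree_insert(&my_tree, index, item);
	spin_unlock(&my_lock);
	radix_tree_preload_end();
	return err;
}

static unsigned long my_count(void)
{
	struct radix_tree_iter iter;
	void __rcu **slot;
	unsigned long count = 0;

	rcu_read_lock();
	radix_tree_for_each_slot(slot, &my_tree, &iter, 0) {
		struct my_item *item = radix_tree_deref_slot(slot);

		if (radix_tree_deref_retry(item)) {
			/* lost a race with a modifier; retry this chunk */
			slot = radix_tree_iter_retry(&iter);
			continue;
		}
		count++;	/* item is only stable inside this RCU read section */
	}
	rcu_read_unlock();
	return count;
}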
/* inflate.c -- zlib decompression * Copyright (C) 1995-2005 Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h * * Based on zlib 1.2.3 but modified for the Linux Kernel by * Richard Purdie <richard@openedhand.com> * * Changes mainly for static instead of dynamic memory allocation * */ #include <linux/zutil.h> #include
"inftrees.h" #include "inflate.h" #include "inffast.h" #include "infutil.h" /* architecture-specific bits */ #ifdef CONFIG_ZLIB_DFLTCC # include "../zlib_dfltcc/dfltcc_inflate.h" #else #define INFLATE_RESET_HOOK(strm) do {} while (0) #define INFLATE_TYPEDO_HOOK(strm, flush) do {} while (0) #define INFLATE_NEED_UPDATEWINDOW(strm) 1 #define INFLATE_NEED_CHECKSUM(strm) 1 #endif int zlib_inflate_workspacesize(void) { return sizeof(struct inflate_workspace); } int zlib_inflateReset(z_streamp strm) { struct inflate_state *state; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; strm->total_in = strm->total_out = state->total = 0; strm->msg = NULL; strm->adler = 1; /* to support ill-conceived Java test suite */ state->mode = HEAD; state->last = 0; state->havedict = 0; state->dmax = 32768U; state->hold = 0; state->bits = 0; state->lencode = state->distcode = state->next = state->codes; /* Initialise Window */ state->wsize = 1U << state->wbits; state->write = 0; state->whave = 0; INFLATE_RESET_HOOK(strm); return Z_OK; } int zlib_inflateInit2(z_streamp strm, int windowBits) { struct inflate_state *state; if (strm == NULL) return Z_STREAM_ERROR; strm->msg = NULL; /* in case we return an error */ state = &WS(strm)->inflate_state; strm->state = (struct internal_state *)state; if (windowBits < 0) { state->wrap = 0; windowBits = -windowBits; } else { state->wrap = (windowBits >> 4) + 1; } if (windowBits < 8 || windowBits > 15) { return Z_STREAM_ERROR; } state->wbits = (unsigned)windowBits; #ifdef CONFIG_ZLIB_DFLTCC /* * DFLTCC requires the window to be page aligned. * Thus, we overallocate and take the aligned portion of the buffer. */ state->window = PTR_ALIGN(&WS(strm)->working_window[0], PAGE_SIZE); #else state->window = &WS(strm)->working_window[0]; #endif return zlib_inflateReset(strm); } /* Return state with length and distance decoding tables and index sizes set to fixed code decoding. This returns fixed tables from inffixed.h. */ static void zlib_fixedtables(struct inflate_state *state) { # include "inffixed.h" state->lencode = lenfix; state->lenbits = 9; state->distcode = distfix; state->distbits = 5; } /* Update the window with the last wsize (normally 32K) bytes written before returning. This is only called when a window is already in use, or when output has been written during this inflate call, but the end of the deflate stream has not been reached yet. It is also called to window dictionary data when a dictionary is loaded. Providing output buffers larger than 32K to inflate() should provide a speed advantage, since only the last 32K of output is copied to the sliding window upon return from inflate(), and since all distances after the first 32K of output will fall in the output data, making match copies simpler and faster. The advantage may be dependent on the size of the processor's data caches. 
*/ static void zlib_updatewindow(z_streamp strm, unsigned out) { struct inflate_state *state; unsigned copy, dist; state = (struct inflate_state *)strm->state; /* copy state->wsize or less output bytes into the circular window */ copy = out - strm->avail_out; if (copy >= state->wsize) { memcpy(state->window, strm->next_out - state->wsize, state->wsize); state->write = 0; state->whave = state->wsize; } else { dist = state->wsize - state->write; if (dist > copy) dist = copy; memcpy(state->window + state->write, strm->next_out - copy, dist); copy -= dist; if (copy) { memcpy(state->window, strm->next_out - copy, copy); state->write = copy; state->whave = state->wsize; } else { state->write += dist; if (state->write == state->wsize) state->write = 0; if (state->whave < state->wsize) state->whave += dist; } } } /* * At the end of a Deflate-compressed PPP packet, we expect to have seen * a `stored' block type value but not the (zero) length bytes. */ /* Returns true if inflate is currently at the end of a block generated by Z_SYNC_FLUSH or Z_FULL_FLUSH. This function is used by one PPP implementation to provide an additional safety check. PPP uses Z_SYNC_FLUSH but removes the length bytes of the resulting empty stored block. When decompressing, PPP checks that at the end of input packet, inflate is waiting for these length bytes. */ static int zlib_inflateSyncPacket(z_streamp strm) { struct inflate_state *state; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; if (state->mode == STORED && state->bits == 0) { state->mode = TYPE; return Z_OK; } return Z_DATA_ERROR; } /* Macros for inflate(): */ /* check function to use adler32() for zlib or crc32() for gzip */ #define UPDATE(check, buf, len) zlib_adler32(check, buf, len) /* Load registers with state in inflate() for speed */ #define LOAD() \ do { \ put = strm->next_out; \ left = strm->avail_out; \ next = strm->next_in; \ have = strm->avail_in; \ hold = state->hold; \ bits = state->bits; \ } while (0) /* Restore state from registers in inflate() */ #define RESTORE() \ do { \ strm->next_out = put; \ strm->avail_out = left; \ strm->next_in = next; \ strm->avail_in = have; \ state->hold = hold; \ state->bits = bits; \ } while (0) /* Clear the input bit accumulator */ #define INITBITS() \ do { \ hold = 0; \ bits = 0; \ } while (0) /* Get a byte of input into the bit accumulator, or return from inflate() if there is no input available. */ #define PULLBYTE() \ do { \ if (have == 0) goto inf_leave; \ have--; \ hold += (unsigned long)(*next++) << bits; \ bits += 8; \ } while (0) /* Assure that there are at least n bits in the bit accumulator. If there is not enough available input to do that, then return from inflate(). */ #define NEEDBITS(n) \ do { \ while (bits < (unsigned)(n)) \ PULLBYTE(); \ } while (0) /* Return the low n bits of the bit accumulator (n < 16) */ #define BITS(n) \ ((unsigned)hold & ((1U << (n)) - 1)) /* Remove n bits from the bit accumulator */ #define DROPBITS(n) \ do { \ hold >>= (n); \ bits -= (unsigned)(n); \ } while (0) /* Remove zero to seven bits as needed to go to a byte boundary */ #define BYTEBITS() \ do { \ hold >>= bits & 7; \ bits -= bits & 7; \ } while (0) /* inflate() uses a state machine to process as much input data and generate as much output data as possible before returning. The state machine is structured roughly as follows: for (;;) switch (state) { ... case STATEn: if (not enough input data or output space to make progress) return; ... make progress ... 
state = STATEm; break; ... } so when inflate() is called again, the same case is attempted again, and if the appropriate resources are provided, the machine proceeds to the next state. The NEEDBITS() macro is usually the way the state evaluates whether it can proceed or should return. NEEDBITS() does the return if the requested bits are not available. The typical use of the BITS macros is: NEEDBITS(n); ... do something with BITS(n) ... DROPBITS(n); where NEEDBITS(n) either returns from inflate() if there isn't enough input left to load n bits into the accumulator, or it continues. BITS(n) gives the low n bits in the accumulator. When done, DROPBITS(n) drops the low n bits off the accumulator. INITBITS() clears the accumulator and sets the number of available bits to zero. BYTEBITS() discards just enough bits to put the accumulator on a byte boundary. After BYTEBITS() and a NEEDBITS(8), then BITS(8) would return the next byte in the stream. NEEDBITS(n) uses PULLBYTE() to get an available byte of input, or to return if there is no input available. The decoding of variable length codes uses PULLBYTE() directly in order to pull just enough bytes to decode the next code, and no more. Some states loop until they get enough input, making sure that enough state information is maintained to continue the loop where it left off if NEEDBITS() returns in the loop. For example, want, need, and keep would all have to actually be part of the saved state in case NEEDBITS() returns: case STATEw: while (want < need) { NEEDBITS(n); keep[want++] = BITS(n); DROPBITS(n); } state = STATEx; case STATEx: As shown above, if the next state is also the next case, then the break is omitted. A state may also return if there is not enough output space available to complete that state. Those states are copying stored data, writing a literal byte, and copying a matching string. When returning, a "goto inf_leave" is used to update the total counters, update the check value, and determine whether any progress has been made during that inflate() call in order to return the proper return code. Progress is defined as a change in either strm->avail_in or strm->avail_out. When there is a window, goto inf_leave will update the window with the last output written. If a goto inf_leave occurs in the middle of decompression and there is no window currently, goto inf_leave will create one and copy output to the window for the next call of inflate(). In this implementation, the flush parameter of inflate() only affects the return code (per zlib.h). inflate() always writes as much as possible to strm->next_out, given the space available and the provided input--the effect documented in zlib.h of Z_SYNC_FLUSH. Furthermore, inflate() always defers the allocation of and copying into a sliding window until necessary, which provides the effect documented in zlib.h for Z_FINISH when the entire input stream available. So the only thing the flush parameter actually does is: when flush is set to Z_FINISH, inflate() cannot return Z_OK. Instead it will return Z_BUF_ERROR if it has not reached the end of the stream. 
*/ int zlib_inflate(z_streamp strm, int flush) { struct inflate_state *state; const unsigned char *next; /* next input */ unsigned char *put; /* next output */ unsigned have, left; /* available input and output */ unsigned long hold; /* bit buffer */ unsigned bits; /* bits in bit buffer */ unsigned in, out; /* save starting available input and output */ unsigned copy; /* number of stored or match bytes to copy */ unsigned char *from; /* where to copy match bytes from */ code this; /* current decoding table entry */ code last; /* parent table entry */ unsigned len; /* length to copy for repeats, bits to drop */ int ret; /* return code */ static const unsigned short order[19] = /* permutation of code lengths */ {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; /* Do not check for strm->next_out == NULL here as ppc zImage inflates to strm->next_out = 0 */ if (strm == NULL || strm->state == NULL || (strm->next_in == NULL && strm->avail_in != 0)) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; if (state->mode == TYPE) state->mode = TYPEDO; /* skip check */ LOAD(); in = have; out = left; ret = Z_OK; for (;;) switch (state->mode) { case HEAD: if (state->wrap == 0) { state->mode = TYPEDO; break; } NEEDBITS(16); if ( ((BITS(8) << 8) + (hold >> 8)) % 31) { strm->msg = (char *)"incorrect header check"; state->mode = BAD; break; } if (BITS(4) != Z_DEFLATED) { strm->msg = (char *)"unknown compression method"; state->mode = BAD; break; } DROPBITS(4); len = BITS(4) + 8; if (len > state->wbits) { strm->msg = (char *)"invalid window size"; state->mode = BAD; break; } state->dmax = 1U << len; strm->adler = state->check = zlib_adler32(0L, NULL, 0); state->mode = hold & 0x200 ? DICTID : TYPE; INITBITS(); break; case DICTID: NEEDBITS(32); strm->adler = state->check = REVERSE(hold); INITBITS(); state->mode = DICT; fallthrough; case DICT: if (state->havedict == 0) { RESTORE(); return Z_NEED_DICT; } strm->adler = state->check = zlib_adler32(0L, NULL, 0); state->mode = TYPE; fallthrough; case TYPE: if (flush == Z_BLOCK) goto inf_leave; fallthrough; case TYPEDO: INFLATE_TYPEDO_HOOK(strm, flush); if (state->last) { BYTEBITS(); state->mode = CHECK; break; } NEEDBITS(3); state->last = BITS(1); DROPBITS(1); switch (BITS(2)) { case 0: /* stored block */ state->mode = STORED; break; case 1: /* fixed block */ zlib_fixedtables(state); state->mode = LEN; /* decode codes */ break; case 2: /* dynamic block */ state->mode = TABLE; break; case 3: strm->msg = (char *)"invalid block type"; state->mode = BAD; } DROPBITS(2); break; case STORED: BYTEBITS(); /* go to byte boundary */ NEEDBITS(32); if ((hold & 0xffff) != ((hold >> 16) ^ 0xffff)) { strm->msg = (char *)"invalid stored block lengths"; state->mode = BAD; break; } state->length = (unsigned)hold & 0xffff; INITBITS(); state->mode = COPY; fallthrough; case COPY: copy = state->length; if (copy) { if (copy > have) copy = have; if (copy > left) copy = left; if (copy == 0) goto inf_leave; memcpy(put, next, copy); have -= copy; next += copy; left -= copy; put += copy; state->length -= copy; break; } state->mode = TYPE; break; case TABLE: NEEDBITS(14); state->nlen = BITS(5) + 257; DROPBITS(5); state->ndist = BITS(5) + 1; DROPBITS(5); state->ncode = BITS(4) + 4; DROPBITS(4); #ifndef PKZIP_BUG_WORKAROUND if (state->nlen > 286 || state->ndist > 30) { strm->msg = (char *)"too many length or distance symbols"; state->mode = BAD; break; } #endif state->have = 0; state->mode = LENLENS; fallthrough; case LENLENS: while (state->have < state->ncode) { 
NEEDBITS(3); state->lens[order[state->have++]] = (unsigned short)BITS(3); DROPBITS(3); } while (state->have < 19) state->lens[order[state->have++]] = 0; state->next = state->codes; state->lencode = (code const *)(state->next); state->lenbits = 7; ret = zlib_inflate_table(CODES, state->lens, 19, &(state->next), &(state->lenbits), state->work); if (ret) { strm->msg = (char *)"invalid code lengths set"; state->mode = BAD; break; } state->have = 0; state->mode = CODELENS; fallthrough; case CODELENS: while (state->have < state->nlen + state->ndist) { for (;;) { this = state->lencode[BITS(state->lenbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if (this.val < 16) { NEEDBITS(this.bits); DROPBITS(this.bits); state->lens[state->have++] = this.val; } else { if (this.val == 16) { NEEDBITS(this.bits + 2); DROPBITS(this.bits); if (state->have == 0) { strm->msg = (char *)"invalid bit length repeat"; state->mode = BAD; break; } len = state->lens[state->have - 1]; copy = 3 + BITS(2); DROPBITS(2); } else if (this.val == 17) { NEEDBITS(this.bits + 3); DROPBITS(this.bits); len = 0; copy = 3 + BITS(3); DROPBITS(3); } else { NEEDBITS(this.bits + 7); DROPBITS(this.bits); len = 0; copy = 11 + BITS(7); DROPBITS(7); } if (state->have + copy > state->nlen + state->ndist) { strm->msg = (char *)"invalid bit length repeat"; state->mode = BAD; break; } while (copy--) state->lens[state->have++] = (unsigned short)len; } } /* handle error breaks in while */ if (state->mode == BAD) break; /* build code tables */ state->next = state->codes; state->lencode = (code const *)(state->next); state->lenbits = 9; ret = zlib_inflate_table(LENS, state->lens, state->nlen, &(state->next), &(state->lenbits), state->work); if (ret) { strm->msg = (char *)"invalid literal/lengths set"; state->mode = BAD; break; } state->distcode = (code const *)(state->next); state->distbits = 6; ret = zlib_inflate_table(DISTS, state->lens + state->nlen, state->ndist, &(state->next), &(state->distbits), state->work); if (ret) { strm->msg = (char *)"invalid distances set"; state->mode = BAD; break; } state->mode = LEN; fallthrough; case LEN: if (have >= 6 && left >= 258) { RESTORE(); inflate_fast(strm, out); LOAD(); break; } for (;;) { this = state->lencode[BITS(state->lenbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if (this.op && (this.op & 0xf0) == 0) { last = this; for (;;) { this = state->lencode[last.val + (BITS(last.bits + last.op) >> last.bits)]; if ((unsigned)(last.bits + this.bits) <= bits) break; PULLBYTE(); } DROPBITS(last.bits); } DROPBITS(this.bits); state->length = (unsigned)this.val; if ((int)(this.op) == 0) { state->mode = LIT; break; } if (this.op & 32) { state->mode = TYPE; break; } if (this.op & 64) { strm->msg = (char *)"invalid literal/length code"; state->mode = BAD; break; } state->extra = (unsigned)(this.op) & 15; state->mode = LENEXT; fallthrough; case LENEXT: if (state->extra) { NEEDBITS(state->extra); state->length += BITS(state->extra); DROPBITS(state->extra); } state->mode = DIST; fallthrough; case DIST: for (;;) { this = state->distcode[BITS(state->distbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if ((this.op & 0xf0) == 0) { last = this; for (;;) { this = state->distcode[last.val + (BITS(last.bits + last.op) >> last.bits)]; if ((unsigned)(last.bits + this.bits) <= bits) break; PULLBYTE(); } DROPBITS(last.bits); } DROPBITS(this.bits); if (this.op & 64) { strm->msg = (char *)"invalid distance code"; state->mode = BAD; break; } state->offset = (unsigned)this.val; state->extra 
= (unsigned)(this.op) & 15; state->mode = DISTEXT; fallthrough; case DISTEXT: if (state->extra) { NEEDBITS(state->extra); state->offset += BITS(state->extra); DROPBITS(state->extra); } #ifdef INFLATE_STRICT if (state->offset > state->dmax) { strm->msg = (char *)"invalid distance too far back"; state->mode = BAD; break; } #endif if (state->offset > state->whave + out - left) { strm->msg = (char *)"invalid distance too far back"; state->mode = BAD; break; } state->mode = MATCH; fallthrough; case MATCH: if (left == 0) goto inf_leave; copy = out - left; if (state->offset > copy) { /* copy from window */ copy = state->offset - copy; if (copy > state->write) { copy -= state->write; from = state->window + (state->wsize - copy); } else from = state->window + (state->write - copy); if (copy > state->length) copy = state->length; } else { /* copy from output */ from = put - state->offset; copy = state->length; } if (copy > left) copy = left; left -= copy; state->length -= copy; do { *put++ = *from++; } while (--copy); if (state->length == 0) state->mode = LEN; break; case LIT: if (left == 0) goto inf_leave; *put++ = (unsigned char)(state->length); left--; state->mode = LEN; break; case CHECK: if (state->wrap) { NEEDBITS(32); out -= left; strm->total_out += out; state->total += out; if (INFLATE_NEED_CHECKSUM(strm) && out) strm->adler = state->check = UPDATE(state->check, put - out, out); out = left; if (( REVERSE(hold)) != state->check) { strm->msg = (char *)"incorrect data check"; state->mode = BAD; break; } INITBITS(); } state->mode = DONE; fallthrough; case DONE: ret = Z_STREAM_END; goto inf_leave; case BAD: ret = Z_DATA_ERROR; goto inf_leave; case MEM: return Z_MEM_ERROR; case SYNC: default: return Z_STREAM_ERROR; } /* Return from inflate(), updating the total counts and the check value. If there was no progress during the inflate() call, return a buffer error. Call zlib_updatewindow() to create and/or update the window state. */ inf_leave: RESTORE(); if (INFLATE_NEED_UPDATEWINDOW(strm) && (state->wsize || (state->mode < CHECK && out != strm->avail_out))) zlib_updatewindow(strm, out); in -= strm->avail_in; out -= strm->avail_out; strm->total_in += in; strm->total_out += out; state->total += out; if (INFLATE_NEED_CHECKSUM(strm) && state->wrap && out) strm->adler = state->check = UPDATE(state->check, strm->next_out - out, out); strm->data_type = state->bits + (state->last ? 64 : 0) + (state->mode == TYPE ? 128 : 0); if (flush == Z_PACKET_FLUSH && ret == Z_OK && strm->avail_out != 0 && strm->avail_in == 0) return zlib_inflateSyncPacket(strm); if (((in == 0 && out == 0) || flush == Z_FINISH) && ret == Z_OK) ret = Z_BUF_ERROR; return ret; } int zlib_inflateEnd(z_streamp strm) { if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; return Z_OK; } /* * This subroutine adds the data at next_in/avail_in to the output history * without performing any output. The output buffer must be "caught up"; * i.e. no pending output but this should always be the case. The state must * be waiting on the start of a block (i.e. mode == TYPE or HEAD). On exit, * the output will also be caught up, and the checksum will have been updated * if need be. 
*/ int zlib_inflateIncomp(z_stream *z) { struct inflate_state *state = (struct inflate_state *)z->state; Byte *saved_no = z->next_out; uInt saved_ao = z->avail_out; if (state->mode != TYPE && state->mode != HEAD) return Z_DATA_ERROR; /* Setup some variables to allow misuse of updateWindow */ z->avail_out = 0; z->next_out = (unsigned char*)z->next_in + z->avail_in; zlib_updatewindow(z, z->avail_in); /* Restore saved variables */ z->avail_out = saved_ao; z->next_out = saved_no; z->adler = state->check = UPDATE(state->check, z->next_in, z->avail_in); z->total_out += z->avail_in; z->total_in += z->avail_in; z->next_in += z->avail_in; state->total += z->avail_in; z->avail_in = 0; return Z_OK; } |
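/*
 * Not part of the original file: a minimal sketch of how a kernel caller is
 * typically expected to drive the inflate routines above.  The function name
 * example_inflate() and the buffer names comp/comp_len/dst/dst_len are
 * hypothetical; zlib_inflate_workspacesize(), zlib_inflateInit2(),
 * zlib_inflate() and zlib_inflateEnd() are the entry points defined in this
 * file, and the z_stream workspace field comes from <linux/zlib.h>.
 */
#if 0	/* illustration only */
#include <linux/zlib.h>
#include <linux/vmalloc.h>
#include <linux/errno.h>

static int example_inflate(const u8 *comp, unsigned int comp_len,
			   u8 *dst, unsigned int dst_len)
{
	struct z_stream_s strm = {};
	int ret;

	/* The caller supplies the workspace; its size is reported by the library. */
	strm.workspace = vmalloc(zlib_inflate_workspacesize());
	if (!strm.workspace)
		return -ENOMEM;

	ret = zlib_inflateInit2(&strm, MAX_WBITS);	/* zlib-wrapped stream */
	if (ret != Z_OK)
		goto out;

	strm.next_in = comp;
	strm.avail_in = comp_len;
	strm.next_out = dst;
	strm.avail_out = dst_len;

	/* The whole input is available, so Z_FINISH is appropriate (see above). */
	ret = zlib_inflate(&strm, Z_FINISH);
	zlib_inflateEnd(&strm);
out:
	vfree(strm.workspace);
	return ret == Z_STREAM_END ? 0 : -EIO;
}
#endif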
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. 
* * Definitions for the IP module. * * Version: @(#)ip.h 1.0.2 05/07/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox, <gw4pts@gw4pts.ampr.org> * * Changes: * Mike McLagan : Routing by source */ #ifndef _IP_H #define _IP_H #include <linux/types.h> #include <linux/ip.h> #include <linux/in.h> #include <linux/skbuff.h> #include <linux/jhash.h> #include <linux/sockptr.h> #include <linux/static_key.h> #include <net/inet_sock.h> #include <net/route.h> #include <net/snmp.h> #include <net/flow.h> #include <net/flow_dissector.h> #include <net/netns/hash.h> #include <net/lwtunnel.h> #include <net/inet_dscp.h> #define IPV4_MAX_PMTU 65535U /* RFC 2675, Section 5.1 */ #define IPV4_MIN_MTU 68 /* RFC 791 */ extern unsigned int sysctl_fib_sync_mem; extern unsigned int sysctl_fib_sync_mem_min; extern unsigned int sysctl_fib_sync_mem_max; struct sock; struct inet_skb_parm { int iif; struct ip_options opt; /* Compiled IP options */ u16 flags; #define IPSKB_FORWARDED BIT(0) #define IPSKB_XFRM_TUNNEL_SIZE BIT(1) #define IPSKB_XFRM_TRANSFORMED BIT(2) #define IPSKB_FRAG_COMPLETE BIT(3) #define IPSKB_REROUTED BIT(4) #define IPSKB_DOREDIRECT BIT(5) #define IPSKB_FRAG_PMTU BIT(6) #define IPSKB_L3SLAVE BIT(7) #define IPSKB_NOPOLICY BIT(8) #define IPSKB_MULTIPATH BIT(9) #define IPSKB_MCROUTE BIT(10) u16 frag_max_size; }; static inline bool ipv4_l3mdev_skb(u16 flags) { return !!(flags & IPSKB_L3SLAVE); } static inline unsigned int ip_hdrlen(const struct sk_buff *skb) { return ip_hdr(skb)->ihl * 4; } struct ipcm_cookie { struct sockcm_cookie sockc; __be32 addr; int oif; struct ip_options_rcu *opt; __u8 protocol; __u8 ttl; __s16 tos; __u16 gso_size; }; static inline void ipcm_init(struct ipcm_cookie *ipcm) { *ipcm = (struct ipcm_cookie) { .tos = -1 }; } static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, const struct inet_sock *inet) { *ipcm = (struct ipcm_cookie) { .tos = READ_ONCE(inet->tos), }; sockcm_init(&ipcm->sockc, &inet->sk); ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if); ipcm->addr = inet->inet_saddr; ipcm->protocol = READ_ONCE(inet->inet_num); } #define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb)) #define PKTINFO_SKB_CB(skb) ((struct in_pktinfo *)((skb)->cb)) /* return enslaved device index if relevant */ static inline int inet_sdif(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (skb && ipv4_l3mdev_skb(IPCB(skb)->flags)) return IPCB(skb)->iif; #endif return 0; } /* Special input handler for packets caught by router alert option. They are selected only by protocol field, and then processed likely local ones; but only if someone wants them! Otherwise, router not running rsvpd will kill RSVP. It is user level problem, what it will make with them. I have no idea, how it will masquearde or NAT them (it is joke, joke :-)), but receiver should be enough clever f.e. to forward mtrace requests, sent to multicast group to reach destination designated router. */ struct ip_ra_chain { struct ip_ra_chain __rcu *next; struct sock *sk; union { void (*destructor)(struct sock *); struct sock *saved_sk; }; struct rcu_head rcu; }; /* IP flags. 
*/ #define IP_CE 0x8000 /* Flag: "Congestion" */ #define IP_DF 0x4000 /* Flag: "Don't Fragment" */ #define IP_MF 0x2000 /* Flag: "More Fragments" */ #define IP_OFFSET 0x1FFF /* "Fragment Offset" part */ #define IP_FRAG_TIME (30 * HZ) /* fragment lifetime */ struct msghdr; struct net_device; struct packet_type; struct rtable; struct sockaddr; int igmp_mc_init(void); /* * Functions provided by ip.c */ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, __be32 saddr, __be32 daddr, struct ip_options_rcu *opt, u8 tos); int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); void ip_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *orig_dev); int ip_local_deliver(struct sk_buff *skb); void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int proto); int ip_mr_input(struct sk_buff *skb); int ip_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)); struct ip_fraglist_iter { struct sk_buff *frag; struct iphdr *iph; int offset; unsigned int hlen; }; void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph, unsigned int hlen, struct ip_fraglist_iter *iter); void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter); static inline struct sk_buff *ip_fraglist_next(struct ip_fraglist_iter *iter) { struct sk_buff *skb = iter->frag; iter->frag = skb->next; skb_mark_not_on_list(skb); return skb; } struct ip_frag_state { bool DF; unsigned int hlen; unsigned int ll_rs; unsigned int mtu; unsigned int left; int offset; int ptr; __be16 not_last_frag; }; void ip_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int ll_rs, unsigned int mtu, bool DF, struct ip_frag_state *state); struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state); void ip_send_check(struct iphdr *ip); int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, __u8 tos); void ip_init(void); int ip_append_data(struct sock *sk, struct flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int len, int protolen, struct ipcm_cookie *ipc, struct rtable **rt, unsigned int flags); int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb); struct sk_buff *__ip_make_skb(struct sock *sk, struct flowi4 *fl4, struct sk_buff_head *queue, struct inet_cork *cork); int ip_send_skb(struct net *net, struct sk_buff *skb); int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4); void ip_flush_pending_frames(struct sock *sk); struct sk_buff *ip_make_skb(struct sock *sk, struct flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm_cookie *ipc, struct rtable **rtp, struct inet_cork *cork, unsigned int flags); int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl); static inline struct sk_buff *ip_finish_skb(struct sock *sk, struct flowi4 *fl4) { return __ip_make_skb(sk, fl4, &sk->sk_write_queue, &inet_sk(sk)->cork.base); } /* Get the route 
scope that should be used when sending a packet. */ static inline u8 ip_sendmsg_scope(const struct inet_sock *inet, const struct ipcm_cookie *ipc, const struct msghdr *msg) { if (sock_flag(&inet->sk, SOCK_LOCALROUTE) || msg->msg_flags & MSG_DONTROUTE || (ipc->opt && ipc->opt->opt.is_strictroute)) return RT_SCOPE_LINK; return RT_SCOPE_UNIVERSE; } /* datagram.c */ int __ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len); int ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len); void ip4_datagram_release_cb(struct sock *sk); struct ip_reply_arg { struct kvec iov[1]; int flags; __wsum csum; int csumoffset; /* u16 offset of csum in iov[0].iov_base */ /* -1 if not needed */ int bound_dev_if; u8 tos; kuid_t uid; }; #define IP_REPLY_ARG_NOSRCCHECK 1 static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg) { return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0; } void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, unsigned int len, u64 transmit_time, u32 txhash); #define IP_INC_STATS(net, field) SNMP_INC_STATS64((net)->mib.ip_statistics, field) #define __IP_INC_STATS(net, field) __SNMP_INC_STATS64((net)->mib.ip_statistics, field) #define IP_ADD_STATS(net, field, val) SNMP_ADD_STATS64((net)->mib.ip_statistics, field, val) #define __IP_ADD_STATS(net, field, val) __SNMP_ADD_STATS64((net)->mib.ip_statistics, field, val) #define IP_UPD_PO_STATS(net, field, val) SNMP_UPD_PO_STATS64((net)->mib.ip_statistics, field, val) #define __IP_UPD_PO_STATS(net, field, val) __SNMP_UPD_PO_STATS64((net)->mib.ip_statistics, field, val) #define NET_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.net_statistics, field) #define __NET_INC_STATS(net, field) __SNMP_INC_STATS((net)->mib.net_statistics, field) #define NET_ADD_STATS(net, field, adnd) SNMP_ADD_STATS((net)->mib.net_statistics, field, adnd) #define __NET_ADD_STATS(net, field, adnd) __SNMP_ADD_STATS((net)->mib.net_statistics, field, adnd) static inline u64 snmp_get_cpu_field(void __percpu *mib, int cpu, int offt) { return *(((unsigned long *)per_cpu_ptr(mib, cpu)) + offt); } unsigned long snmp_fold_field(void __percpu *mib, int offt); #if BITS_PER_LONG==32 u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offct, size_t syncp_offset); u64 snmp_fold_field64(void __percpu *mib, int offt, size_t sync_off); #else static inline u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offct, size_t syncp_offset) { return snmp_get_cpu_field(mib, cpu, offct); } static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_off) { return snmp_fold_field(mib, offt); } #endif #define snmp_get_cpu_field64_batch_cnt(buff64, stats_list, cnt, \ mib_statistic, offset) \ { \ int i, c; \ for_each_possible_cpu(c) { \ for (i = 0; i < cnt; i++) \ buff64[i] += snmp_get_cpu_field64( \ mib_statistic, \ c, stats_list[i].entry, \ offset); \ } \ } #define snmp_get_cpu_field_batch_cnt(buff, stats_list, cnt, mib_statistic) \ { \ int i, c; \ for_each_possible_cpu(c) { \ for (i = 0; i < cnt; i++) \ buff[i] += snmp_get_cpu_field( \ mib_statistic, \ c, stats_list[i].entry); \ } \ } static inline void inet_get_local_port_range(const struct net *net, int *low, int *high) { u32 range = READ_ONCE(net->ipv4.ip_local_ports.range); *low = range & 0xffff; *high = range >> 16; } bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high); 
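/*
 * Not part of the original header: a small sketch of the packed encoding read
 * by inet_get_local_port_range() above.  Both bounds of the ephemeral port
 * range live in one u32 (low port in bits 0-15, high port in bits 16-31), so
 * a reader always sees a consistent pair without taking a lock.  The range
 * 32768..60999 below is just the common default, used for illustration, and
 * example_port_range_encoding() is a hypothetical name.
 */
#if 0	/* illustration only */
static void example_port_range_encoding(void)
{
	u32 range = 32768U | (60999U << 16);	/* what the sysctl writer stores */
	int low  = range & 0xffff;		/* -> 32768 */
	int high = range >> 16;			/* -> 60999 */
}
#endif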
#ifdef CONFIG_SYSCTL static inline bool inet_is_local_reserved_port(const struct net *net, unsigned short port) { if (!net->ipv4.sysctl_local_reserved_ports) return false; return test_bit(port, net->ipv4.sysctl_local_reserved_ports); } static inline bool sysctl_dev_name_is_allowed(const char *name) { return strcmp(name, "default") != 0 && strcmp(name, "all") != 0; } static inline bool inet_port_requires_bind_service(struct net *net, unsigned short port) { return port < READ_ONCE(net->ipv4.sysctl_ip_prot_sock); } #else static inline bool inet_is_local_reserved_port(struct net *net, unsigned short port) { return false; } static inline bool inet_port_requires_bind_service(struct net *net, unsigned short port) { return port < PROT_SOCK; } #endif __be32 inet_current_timestamp(void); /* From inetpeer.c */ extern int inet_peer_threshold; extern int inet_peer_minttl; extern int inet_peer_maxttl; void ipfrag_init(void); void ip_static_sysctl_init(void); #define IP4_REPLY_MARK(net, mark) \ (READ_ONCE((net)->ipv4.sysctl_fwmark_reflect) ? (mark) : 0) static inline bool ip_is_fragment(const struct iphdr *iph) { return (iph->frag_off & htons(IP_MF | IP_OFFSET)) != 0; } #ifdef CONFIG_INET #include <net/dst.h> /* The function in 2.2 was invalid, producing wrong result for * check=0xFEFF. It was noticed by Arthur Skawina _year_ ago. --ANK(000625) */ static inline int ip_decrease_ttl(struct iphdr *iph) { u32 check = (__force u32)iph->check; check += (__force u32)htons(0x0100); iph->check = (__force __sum16)(check + (check>=0xFFFF)); return --iph->ttl; } static inline dscp_t ip4h_dscp(const struct iphdr *ip4h) { return inet_dsfield_to_dscp(ip4h->tos); } static inline int ip_mtu_locked(const struct dst_entry *dst) { const struct rtable *rt = dst_rtable(dst); return rt->rt_mtu_locked || dst_metric_locked(dst, RTAX_MTU); } static inline int ip_dont_fragment(const struct sock *sk, const struct dst_entry *dst) { u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc); return pmtudisc == IP_PMTUDISC_DO || (pmtudisc == IP_PMTUDISC_WANT && !ip_mtu_locked(dst)); } static inline bool ip_sk_accept_pmtu(const struct sock *sk) { u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc); return pmtudisc != IP_PMTUDISC_INTERFACE && pmtudisc != IP_PMTUDISC_OMIT; } static inline bool ip_sk_use_pmtu(const struct sock *sk) { return READ_ONCE(inet_sk(sk)->pmtudisc) < IP_PMTUDISC_PROBE; } static inline bool ip_sk_ignore_df(const struct sock *sk) { u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc); return pmtudisc < IP_PMTUDISC_DO || pmtudisc == IP_PMTUDISC_OMIT; } static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, bool forwarding) { const struct rtable *rt = dst_rtable(dst); const struct net_device *dev; unsigned int mtu, res; struct net *net; rcu_read_lock(); dev = dst_dev_rcu(dst); net = dev_net_rcu(dev); if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) || ip_mtu_locked(dst) || !forwarding) { mtu = rt->rt_pmtu; if (mtu && time_before(jiffies, READ_ONCE(rt->dst.expires))) goto out; } /* 'forwarding = true' case should always honour route mtu */ mtu = dst_metric_raw(dst, RTAX_MTU); if (mtu) goto out; mtu = READ_ONCE(dev->mtu); if (unlikely(ip_mtu_locked(dst))) { if (rt->rt_uses_gateway && mtu > 576) mtu = 576; } out: mtu = min_t(unsigned int, mtu, IP_MAX_MTU); res = mtu - lwtunnel_headroom(dst->lwtstate, mtu); rcu_read_unlock(); return res; } static inline unsigned int ip_skb_dst_mtu(struct sock *sk, const struct sk_buff *skb) { const struct dst_entry *dst = skb_dst(skb); unsigned int mtu; if (!sk || !sk_fullsock(sk) || 
ip_sk_use_pmtu(sk)) { bool forwarding = IPCB(skb)->flags & IPSKB_FORWARDED; return ip_dst_mtu_maybe_forward(dst, forwarding); } mtu = min(READ_ONCE(dst_dev(dst)->mtu), IP_MAX_MTU); return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx, int fc_mx_len, struct netlink_ext_ack *extack); static inline void ip_fib_metrics_put(struct dst_metrics *fib_metrics) { if (fib_metrics != &dst_default_metrics && refcount_dec_and_test(&fib_metrics->refcnt)) kfree(fib_metrics); } /* ipv4 and ipv6 both use refcounted metrics if it is not the default */ static inline void ip_dst_init_metrics(struct dst_entry *dst, struct dst_metrics *fib_metrics) { dst_init_metrics(dst, fib_metrics->metrics, true); if (fib_metrics != &dst_default_metrics) { dst->_metrics |= DST_METRICS_REFCOUNTED; refcount_inc(&fib_metrics->refcnt); } } static inline void ip_dst_metrics_put(struct dst_entry *dst) { struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt)) kfree(p); } void __ip_select_ident(struct net *net, struct iphdr *iph, int segs); static inline void ip_select_ident_segs(struct net *net, struct sk_buff *skb, struct sock *sk, int segs) { struct iphdr *iph = ip_hdr(skb); /* We had many attacks based on IPID, use the private * generator as much as we can. */ if (sk && inet_sk(sk)->inet_daddr) { int val; /* avoid atomic operations for TCP, * as we hold socket lock at this point. */ if (sk_is_tcp(sk)) { sock_owned_by_me(sk); val = atomic_read(&inet_sk(sk)->inet_id); atomic_set(&inet_sk(sk)->inet_id, val + segs); } else { val = atomic_add_return(segs, &inet_sk(sk)->inet_id); } iph->id = htons(val); return; } if ((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) { iph->id = 0; } else { /* Unfortunately we need the big hammer to get a suitable IPID */ __ip_select_ident(net, iph, segs); } } static inline void ip_select_ident(struct net *net, struct sk_buff *skb, struct sock *sk) { ip_select_ident_segs(net, skb, sk, 1); } static inline __wsum inet_compute_pseudo(struct sk_buff *skb, int proto) { return csum_tcpudp_nofold(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, skb->len, proto, 0); } /* copy IPv4 saddr & daddr to flow_keys, possibly using 64bit load/store * Equivalent to : flow->v4addrs.src = iph->saddr; * flow->v4addrs.dst = iph->daddr; */ static inline void iph_to_flow_copy_v4addrs(struct flow_keys *flow, const struct iphdr *iph) { BUILD_BUG_ON(offsetof(typeof(flow->addrs), v4addrs.dst) != offsetof(typeof(flow->addrs), v4addrs.src) + sizeof(flow->addrs.v4addrs.src)); memcpy(&flow->addrs.v4addrs, &iph->addrs, sizeof(flow->addrs.v4addrs)); flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } /* * Map a multicast IP onto multicast MAC for type ethernet. */ static inline void ip_eth_mc_map(__be32 naddr, char *buf) { __u32 addr=ntohl(naddr); buf[0]=0x01; buf[1]=0x00; buf[2]=0x5e; buf[5]=addr&0xFF; addr>>=8; buf[4]=addr&0xFF; addr>>=8; buf[3]=addr&0x7F; } /* * Map a multicast IP onto multicast MAC for type IP-over-InfiniBand. * Leave P_Key as 0 to be filled in by driver. 
*/ static inline void ip_ib_mc_map(__be32 naddr, const unsigned char *broadcast, char *buf) { __u32 addr; unsigned char scope = broadcast[5] & 0xF; buf[0] = 0; /* Reserved */ buf[1] = 0xff; /* Multicast QPN */ buf[2] = 0xff; buf[3] = 0xff; addr = ntohl(naddr); buf[4] = 0xff; buf[5] = 0x10 | scope; /* scope from broadcast address */ buf[6] = 0x40; /* IPv4 signature */ buf[7] = 0x1b; buf[8] = broadcast[8]; /* P_Key */ buf[9] = broadcast[9]; buf[10] = 0; buf[11] = 0; buf[12] = 0; buf[13] = 0; buf[14] = 0; buf[15] = 0; buf[19] = addr & 0xff; addr >>= 8; buf[18] = addr & 0xff; addr >>= 8; buf[17] = addr & 0xff; addr >>= 8; buf[16] = addr & 0x0f; } static inline void ip_ipgre_mc_map(__be32 naddr, const unsigned char *broadcast, char *buf) { if ((broadcast[0] | broadcast[1] | broadcast[2] | broadcast[3]) != 0) memcpy(buf, broadcast, 4); else memcpy(buf, &naddr, sizeof(naddr)); } #if IS_ENABLED(CONFIG_IPV6) #include <linux/ipv6.h> #endif static __inline__ void inet_reset_saddr(struct sock *sk) { inet_sk(sk)->inet_rcv_saddr = inet_sk(sk)->inet_saddr = 0; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); memset(&np->saddr, 0, sizeof(np->saddr)); memset(&sk->sk_v6_rcv_saddr, 0, sizeof(sk->sk_v6_rcv_saddr)); } #endif } #endif static inline unsigned int ipv4_addr_hash(__be32 ip) { return (__force unsigned int) ip; } static inline u32 __ipv4_addr_hash(const __be32 ip, const u32 initval) { return jhash_1word((__force u32)ip, initval); } static inline u32 ipv4_portaddr_hash(const struct net *net, __be32 saddr, unsigned int port) { return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port; } bool ip_call_ra_chain(struct sk_buff *skb); /* * Functions provided by ip_fragment.c */ enum ip_defrag_users { IP_DEFRAG_LOCAL_DELIVER, IP_DEFRAG_CALL_RA_CHAIN, IP_DEFRAG_CONNTRACK_IN, __IP_DEFRAG_CONNTRACK_IN_END = IP_DEFRAG_CONNTRACK_IN + USHRT_MAX, IP_DEFRAG_CONNTRACK_OUT, __IP_DEFRAG_CONNTRACK_OUT_END = IP_DEFRAG_CONNTRACK_OUT + USHRT_MAX, IP_DEFRAG_CONNTRACK_BRIDGE_IN, __IP_DEFRAG_CONNTRACK_BRIDGE_IN = IP_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX, IP_DEFRAG_VS_IN, IP_DEFRAG_VS_OUT, IP_DEFRAG_VS_FWD, IP_DEFRAG_AF_PACKET, IP_DEFRAG_MACVLAN, }; /* Return true if the value of 'user' is between 'lower_bond' * and 'upper_bond' inclusively. 
*/ static inline bool ip_defrag_user_in_between(u32 user, enum ip_defrag_users lower_bond, enum ip_defrag_users upper_bond) { return user >= lower_bond && user <= upper_bond; } int ip_defrag(struct net *net, struct sk_buff *skb, u32 user); #ifdef CONFIG_INET struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user); #else static inline struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) { return skb; } #endif /* * Functions provided by ip_forward.c */ int ip_forward(struct sk_buff *skb); /* * Functions provided by ip_options.c */ void ip_options_build(struct sk_buff *skb, struct ip_options *opt, __be32 daddr, struct rtable *rt); int __ip_options_echo(struct net *net, struct ip_options *dopt, struct sk_buff *skb, const struct ip_options *sopt); static inline int ip_options_echo(struct net *net, struct ip_options *dopt, struct sk_buff *skb) { return __ip_options_echo(net, dopt, skb, &IPCB(skb)->opt); } void ip_options_fragment(struct sk_buff *skb); int __ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb, __be32 *info); int ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb); int ip_options_get(struct net *net, struct ip_options_rcu **optp, sockptr_t data, int optlen); void ip_options_undo(struct ip_options *opt); void ip_forward_options(struct sk_buff *skb); int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev); /* * Functions provided by ip_sockglue.c */ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb, bool drop_dst); void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, int tlen, int offset); int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, bool allow_ipv6); DECLARE_STATIC_KEY_FALSE(ip4_min_ttl); int do_ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int do_ip_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen); int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *)); int ip_recv_error(struct sock *sk, struct msghdr *msg, int len); void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload); void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport, u32 info); static inline void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) { ip_cmsg_recv_offset(msg, skb->sk, skb, 0, 0); } bool icmp_global_allow(struct net *net); void icmp_global_consume(struct net *net); #ifdef CONFIG_PROC_FS int ip_misc_proc_init(void); #endif int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, u8 family, struct netlink_ext_ack *extack); static inline bool inetdev_valid_mtu(unsigned int mtu) { return likely(mtu >= IPV4_MIN_MTU); } void ip_sock_set_freebind(struct sock *sk); int ip_sock_set_mtu_discover(struct sock *sk, int val); void ip_sock_set_pktinfo(struct sock *sk); void ip_sock_set_recverr(struct sock *sk); void ip_sock_set_tos(struct sock *sk, int val); void __ip_sock_set_tos(struct sock *sk, int val); #endif /* _IP_H */ |
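/*
 * Not part of the original header: a worked example of the RFC 1112 mapping
 * performed by ip_eth_mc_map() above.  Only the low 23 bits of the group
 * address are copied into the MAC, so distinct groups such as 224.1.2.3 and
 * 225.129.2.3 both map to 01:00:5e:01:02:03.  example_mc_map() is a
 * hypothetical name.
 */
#if 0	/* illustration only */
static void example_mc_map(void)
{
	char mac[6];

	ip_eth_mc_map(htonl(0xE0010203), mac);	/* group 224.1.2.3 */
	/* mac[] now holds 01:00:5e:01:02:03 */
}
#endif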
// SPDX-License-Identifier: GPL-2.0-only /* * x86 APERF/MPERF KHz calculation for * /sys/.../cpufreq/scaling_cur_freq * * Copyright (C) 2017 Intel Corp. * Author: Len Brown <len.brown@intel.com> */ #include <linux/cpufreq.h> #include <linux/delay.h> #include <linux/ktime.h> #include <linux/math64.h> #include <linux/percpu.h> #include <linux/rcupdate.h> #include <linux/sched/isolation.h> #include <linux/sched/topology.h> #include <linux/smp.h> #include <linux/syscore_ops.h> #include <asm/cpu.h> #include <asm/cpu_device_id.h> #include <asm/intel-family.h> #include <asm/msr.h> #include "cpu.h" struct aperfmperf { seqcount_t seq; unsigned long last_update; u64 acnt; u64 mcnt; u64 aperf; u64 mperf; }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = { .seq = SEQCNT_ZERO(cpu_samples.seq) }; static void init_counter_refs(void *data) { u64 aperf, mperf; rdmsrq(MSR_IA32_APERF, aperf); rdmsrq(MSR_IA32_MPERF, mperf); this_cpu_write(cpu_samples.aperf, aperf); this_cpu_write(cpu_samples.mperf, mperf); } #if defined(CONFIG_X86_64) && defined(CONFIG_SMP) /* * APERF/MPERF frequency ratio computation. * * The scheduler wants to do frequency invariant accounting and needs a <1 * ratio to account for the 'current' frequency, corresponding to * freq_curr / freq_max. 
* * Since the frequency freq_curr on x86 is controlled by micro-controller and * our P-state setting is little more than a request/hint, we need to observe * the effective frequency 'BusyMHz', i.e. the average frequency over a time * interval after discarding idle time. This is given by: * * BusyMHz = delta_APERF / delta_MPERF * freq_base * * where freq_base is the max non-turbo P-state. * * The freq_max term has to be set to a somewhat arbitrary value, because we * can't know which turbo states will be available at a given point in time: * it all depends on the thermal headroom of the entire package. We set it to * the turbo level with 4 cores active. * * Benchmarks show that's a good compromise between the 1C turbo ratio * (freq_curr/freq_max would rarely reach 1) and something close to freq_base, * which would ignore the entire turbo range (a conspicuous part, making * freq_curr/freq_max always maxed out). * * An exception to the heuristic above is the Atom uarch, where we choose the * highest turbo level for freq_max since Atom's are generally oriented towards * power efficiency. * * Setting freq_max to anything less than the 1C turbo ratio makes the ratio * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1. */ DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key); static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE; static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE; void arch_set_max_freq_ratio(bool turbo_disabled) { arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE : arch_turbo_freq_ratio; } EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio); static bool __init turbo_disabled(void) { u64 misc_en; int err; err = rdmsrq_safe(MSR_IA32_MISC_ENABLE, &misc_en); if (err) return false; return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); } static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) { int err; err = rdmsrq_safe(MSR_ATOM_CORE_RATIOS, base_freq); if (err) return false; err = rdmsrq_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); if (err) return false; *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */ *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */ return true; } #define X86_MATCH(vfm) \ X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL) static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = { X86_MATCH(INTEL_XEON_PHI_KNL), X86_MATCH(INTEL_XEON_PHI_KNM), {} }; static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = { X86_MATCH(INTEL_SKYLAKE_X), {} }; static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = { X86_MATCH(INTEL_ATOM_GOLDMONT), X86_MATCH(INTEL_ATOM_GOLDMONT_D), X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS), {} }; static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int num_delta_fratio) { int fratio, delta_fratio, found; int err, i; u64 msr; err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr); if (err) return false; fratio = (msr >> 8) & 0xFF; i = 16; found = 0; do { if (found >= num_delta_fratio) { *turbo_freq = fratio; return true; } delta_fratio = (msr >> (i + 5)) & 0x7; if (delta_fratio) { found += 1; fratio -= delta_fratio; } i += 8; } while (i < 64); return true; } static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) { u64 ratios, counts; u32 group_size; int err, i; err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; *base_freq = (*base_freq >> 8) & 0xFF; /* max P state 
*/ err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &ratios); if (err) return false; err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT1, &counts); if (err) return false; for (i = 0; i < 64; i += 8) { group_size = (counts >> i) & 0xFF; if (group_size >= size) { *turbo_freq = (ratios >> i) & 0xFF; return true; } } return false; } static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) { u64 msr; int err; err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr); if (err) return false; *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */ /* The CPU may have less than 4 cores */ if (!*turbo_freq) *turbo_freq = msr & 0xFF; /* 1C turbo */ return true; } static bool __init intel_set_max_freq_ratio(void) { u64 base_freq, turbo_freq; u64 turbo_ratio; if (slv_set_max_freq_ratio(&base_freq, &turbo_freq)) goto out; if (x86_match_cpu(has_glm_turbo_ratio_limits) && skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) goto out; if (x86_match_cpu(has_knl_turbo_ratio_limits) && knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) goto out; if (x86_match_cpu(has_skx_turbo_ratio_limits) && skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4)) goto out; if (core_set_max_freq_ratio(&base_freq, &turbo_freq)) goto out; return false; out: /* * Some hypervisors advertise X86_FEATURE_APERFMPERF * but then fill all MSR's with zeroes. * Some CPUs have turbo boost but don't declare any turbo ratio * in MSR_TURBO_RATIO_LIMIT. */ if (!base_freq || !turbo_freq) { pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n"); return false; } turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq); if (!turbo_ratio) { pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n"); return false; } arch_turbo_freq_ratio = turbo_ratio; arch_set_max_freq_ratio(turbo_disabled()); return true; } #ifdef CONFIG_PM_SLEEP static const struct syscore_ops freq_invariance_syscore_ops = { .resume = init_counter_refs, }; static struct syscore freq_invariance_syscore = { .ops = &freq_invariance_syscore_ops, }; static void register_freq_invariance_syscore(void) { register_syscore(&freq_invariance_syscore); } #else static inline void register_freq_invariance_syscore(void) {} #endif static void freq_invariance_enable(void) { if (static_branch_unlikely(&arch_scale_freq_key)) { WARN_ON_ONCE(1); return; } static_branch_enable_cpuslocked(&arch_scale_freq_key); register_freq_invariance_syscore(); pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); } void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { arch_turbo_freq_ratio = ratio; arch_set_max_freq_ratio(turbo_disabled); freq_invariance_enable(); } static void __init bp_init_freq_invariance(void) { if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return; if (intel_set_max_freq_ratio()) { guard(cpus_read_lock)(); freq_invariance_enable(); } } static void disable_freq_invariance_workfn(struct work_struct *work) { int cpu; static_branch_disable(&arch_scale_freq_key); /* * Set arch_freq_scale to a default value on all cpus * This negates the effect of scaling */ for_each_possible_cpu(cpu) per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE; } static DECLARE_WORK(disable_freq_invariance_work, disable_freq_invariance_workfn); DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale); static 
DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key); struct arch_hybrid_cpu_scale { unsigned long capacity; unsigned long freq_ratio; }; static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale; /** * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling * * Allocate memory for per-CPU data used by hybrid CPU capacity scaling, * initialize it and set the static key controlling its code paths. * * Must be called before arch_set_cpu_capacity(). */ bool arch_enable_hybrid_capacity_scale(void) { int cpu; if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) { WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled"); return true; } arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale); if (!arch_cpu_scale) return false; for_each_possible_cpu(cpu) { per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE; per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio; } static_branch_enable(&arch_hybrid_cap_scale_key); pr_info("Hybrid CPU capacity scaling enabled\n"); return true; } /** * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU * @cpu: Target CPU. * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap. * @max_cap: System-wide maximum CPU capacity. * @cap_freq: Frequency of @cpu corresponding to @cap. * @base_freq: Frequency of @cpu at which MPERF counts. * * The units in which @cap and @max_cap are expressed do not matter, so long * as they are consistent, because the former is effectively divided by the * latter. Analogously for @cap_freq and @base_freq. * * After calling this function for all CPUs, call arch_rebuild_sched_domains() * to let the scheduler know that capacity-aware scheduling can be used going * forward. */ void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap, unsigned long cap_freq, unsigned long base_freq) { if (static_branch_likely(&arch_hybrid_cap_scale_key)) { WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity, div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap)); WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio, div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq)); } else { WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled"); } } unsigned long arch_scale_cpu_capacity(int cpu) { if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity); return SCHED_CAPACITY_SCALE; } EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity); static void scale_freq_tick(u64 acnt, u64 mcnt) { u64 freq_scale, freq_ratio; if (!arch_scale_freq_invariant()) return; if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) goto error; if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio); else freq_ratio = arch_max_freq_ratio; if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt) goto error; freq_scale = div64_u64(acnt, mcnt); if (!freq_scale) goto error; if (freq_scale > SCHED_CAPACITY_SCALE) freq_scale = SCHED_CAPACITY_SCALE; this_cpu_write(arch_freq_scale, freq_scale); return; error: pr_warn("Scheduler frequency invariance went wobbly, disabling!\n"); schedule_work(&disable_freq_invariance_work); } #else static inline void bp_init_freq_invariance(void) { } static inline void scale_freq_tick(u64 acnt, u64 mcnt) { } #endif /* CONFIG_X86_64 && CONFIG_SMP */ void arch_scale_freq_tick(void) { struct aperfmperf *s = this_cpu_ptr(&cpu_samples); u64 acnt, mcnt, aperf, mperf; if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) return; rdmsrq(MSR_IA32_APERF, aperf); 
rdmsrq(MSR_IA32_MPERF, mperf); acnt = aperf - s->aperf; mcnt = mperf - s->mperf; s->aperf = aperf; s->mperf = mperf; raw_write_seqcount_begin(&s->seq); s->last_update = jiffies; s->acnt = acnt; s->mcnt = mcnt; raw_write_seqcount_end(&s->seq); scale_freq_tick(acnt, mcnt); } /* * Discard samples older than the defined maximum sample age of 20ms. There * is no point in sending IPIs in such a case. If the scheduler tick was * not running then the CPU is either idle or isolated. */ #define MAX_SAMPLE_AGE ((unsigned long)HZ / 50) int arch_freq_get_on_cpu(int cpu) { struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu); unsigned int seq, freq; unsigned long last; u64 acnt, mcnt; if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) goto fallback; do { seq = raw_read_seqcount_begin(&s->seq); last = s->last_update; acnt = s->acnt; mcnt = s->mcnt; } while (read_seqcount_retry(&s->seq, seq)); /* * Bail on invalid count and when the last update was too long ago, * which covers idle and NOHZ full CPUs. */ if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE) goto fallback; return div64_u64((cpu_khz * acnt), mcnt); fallback: freq = cpufreq_quick_get(cpu); return freq ? freq : cpu_khz; } static int __init bp_init_aperfmperf(void) { if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) return 0; init_counter_refs(NULL); bp_init_freq_invariance(); return 0; } early_initcall(bp_init_aperfmperf); void ap_init_aperfmperf(void) { if (cpu_feature_enabled(X86_FEATURE_APERFMPERF)) init_counter_refs(NULL); }
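/*
 * Not part of the original file: a numeric sketch of the BusyMHz calculation
 * performed by arch_freq_get_on_cpu() above.  Assume cpu_khz = 2000000
 * (a 2 GHz base clock) and that in the last sample window APERF advanced by
 * 3000000 while MPERF advanced by 2000000: the core ran at 1.5x its base
 * clock whenever it was not idle, so 3000000 kHz (3 GHz) is reported.
 * example_busy_khz() and the counter values are hypothetical.
 */
#if 0	/* illustration only */
static void example_busy_khz(void)
{
	u64 acnt = 3000000, mcnt = 2000000;
	unsigned int khz = div64_u64(cpu_khz * acnt, mcnt);	/* -> 3000000 */
}
#endif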
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCATTERLIST_H #define _LINUX_SCATTERLIST_H #include <linux/string.h> #include <linux/types.h> #include <linux/bug.h> #include <linux/mm.h> #include <asm/io.h> struct scatterlist { unsigned long page_link; unsigned int offset; unsigned int length; dma_addr_t dma_address; #ifdef CONFIG_NEED_SG_DMA_LENGTH unsigned int dma_length; #endif #ifdef CONFIG_NEED_SG_DMA_FLAGS unsigned int dma_flags; #endif }; /* * These macros should be used after a dma_map_sg call has been done * to get bus addresses of each of the SG entries and their lengths. * You should only work with the number of sg entries dma_map_sg * returns, or alternatively stop on the first sg_dma_len(sg) which * is 0. 
*/ #define sg_dma_address(sg) ((sg)->dma_address) #ifdef CONFIG_NEED_SG_DMA_LENGTH #define sg_dma_len(sg) ((sg)->dma_length) #else #define sg_dma_len(sg) ((sg)->length) #endif struct sg_table { struct scatterlist *sgl; /* the list */ unsigned int nents; /* number of mapped entries */ unsigned int orig_nents; /* original size of list */ }; struct sg_append_table { struct sg_table sgt; /* The scatter list table */ struct scatterlist *prv; /* last populated sge in the table */ unsigned int total_nents; /* Total entries in the table */ }; /* * Notes on SG table design. * * We use the unsigned long page_link field in the scatterlist struct to place * the page pointer AND encode information about the sg table as well. The two * lower bits are reserved for this information. * * If bit 0 is set, then the page_link contains a pointer to the next sg * table list. Otherwise the next entry is at sg + 1. * * If bit 1 is set, then this sg entry is the last element in a list. * * See sg_next(). * */ #define SG_CHAIN 0x01UL #define SG_END 0x02UL /* * We overload the LSB of the page pointer to indicate whether it's * a valid sg entry, or whether it points to the start of a new scatterlist. * Those low bits are there for everyone! (thanks mason :-) */ #define SG_PAGE_LINK_MASK (SG_CHAIN | SG_END) static inline unsigned int __sg_flags(struct scatterlist *sg) { return sg->page_link & SG_PAGE_LINK_MASK; } static inline struct scatterlist *sg_chain_ptr(struct scatterlist *sg) { return (struct scatterlist *)(sg->page_link & ~SG_PAGE_LINK_MASK); } static inline bool sg_is_chain(struct scatterlist *sg) { return __sg_flags(sg) & SG_CHAIN; } static inline bool sg_is_last(struct scatterlist *sg) { return __sg_flags(sg) & SG_END; } /** * sg_next - return the next scatterlist entry in a list * @sg: The current sg entry * * Description: * Usually the next entry will be @sg + 1, but if this sg element is part * of a chained scatterlist, it could jump to the start of a new * scatterlist array. * **/ static inline struct scatterlist *sg_next(struct scatterlist *sg) { if (sg_is_last(sg)) return NULL; sg++; if (unlikely(sg_is_chain(sg))) sg = sg_chain_ptr(sg); return sg; } /** * sg_assign_page - Assign a given page to an SG entry * @sg: SG entry * @page: The page * * Description: * Assign page to sg entry. Also see sg_set_page(), the most commonly used * variant. * **/ static inline void sg_assign_page(struct scatterlist *sg, struct page *page) { unsigned long page_link = sg->page_link & (SG_CHAIN | SG_END); /* * In order for the low bit stealing approach to work, pages * must be aligned at a 32-bit boundary as a minimum. */ BUG_ON((unsigned long)page & SG_PAGE_LINK_MASK); #ifdef CONFIG_DEBUG_SG BUG_ON(sg_is_chain(sg)); #endif sg->page_link = page_link | (unsigned long) page; } /** * sg_set_page - Set sg entry to point at given page * @sg: SG entry * @page: The page * @len: Length of data * @offset: Offset into page * * Description: * Use this function to set an sg entry pointing at a page, never assign * the page directly. We encode sg table information in the lower bits * of the page pointer. See sg_page() for looking up the page belonging * to an sg entry. 
* **/ static inline void sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, unsigned int offset) { VM_WARN_ON_ONCE(!page_range_contiguous(page, ALIGN(len + offset, PAGE_SIZE) / PAGE_SIZE)); sg_assign_page(sg, page); sg->offset = offset; sg->length = len; } /** * sg_set_folio - Set sg entry to point at given folio * @sg: SG entry * @folio: The folio * @len: Length of data * @offset: Offset into folio * * Description: * Use this function to set an sg entry pointing at a folio, never assign * the folio directly. We encode sg table information in the lower bits * of the folio pointer. See sg_page() for looking up the page belonging * to an sg entry. * **/ static inline void sg_set_folio(struct scatterlist *sg, struct folio *folio, size_t len, size_t offset) { WARN_ON_ONCE(len > UINT_MAX); WARN_ON_ONCE(offset > UINT_MAX); sg_assign_page(sg, &folio->page); sg->offset = offset; sg->length = len; } static inline struct page *sg_page(struct scatterlist *sg) { #ifdef CONFIG_DEBUG_SG BUG_ON(sg_is_chain(sg)); #endif return (struct page *)((sg)->page_link & ~SG_PAGE_LINK_MASK); } /** * sg_set_buf - Set sg entry to point at given data * @sg: SG entry * @buf: Data * @buflen: Data length * **/ static inline void sg_set_buf(struct scatterlist *sg, const void *buf, unsigned int buflen) { #ifdef CONFIG_DEBUG_SG BUG_ON(!virt_addr_valid(buf)); #endif sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf)); } /* * Loop over each sg element, following the pointer to a new list if necessary */ #define for_each_sg(sglist, sg, nr, __i) \ for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg)) /* * Loop over each sg element in the given sg_table object. */ #define for_each_sgtable_sg(sgt, sg, i) \ for_each_sg((sgt)->sgl, sg, (sgt)->orig_nents, i) /* * Loop over each sg element in the given *DMA mapped* sg_table object. * Please use sg_dma_address(sg) and sg_dma_len(sg) to extract DMA addresses * of the each element. */ #define for_each_sgtable_dma_sg(sgt, sg, i) \ for_each_sg((sgt)->sgl, sg, (sgt)->nents, i) static inline void __sg_chain(struct scatterlist *chain_sg, struct scatterlist *sgl) { /* * offset and length are unused for chain entry. Clear them. */ chain_sg->offset = 0; chain_sg->length = 0; /* * Set lowest bit to indicate a link pointer, and make sure to clear * the termination bit if it happens to be set. */ chain_sg->page_link = ((unsigned long) sgl | SG_CHAIN) & ~SG_END; } /** * sg_chain - Chain two sglists together * @prv: First scatterlist * @prv_nents: Number of entries in prv * @sgl: Second scatterlist * * Description: * Links @prv and @sgl together, to form a longer scatterlist. * **/ static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents, struct scatterlist *sgl) { __sg_chain(&prv[prv_nents - 1], sgl); } /** * sg_mark_end - Mark the end of the scatterlist * @sg: SG entryScatterlist * * Description: * Marks the passed in sg entry as the termination point for the sg * table. A call to sg_next() on this entry will return NULL. * **/ static inline void sg_mark_end(struct scatterlist *sg) { /* * Set termination bit, clear potential chain bit */ sg->page_link |= SG_END; sg->page_link &= ~SG_CHAIN; } /** * sg_unmark_end - Undo setting the end of the scatterlist * @sg: SG entryScatterlist * * Description: * Removes the termination marker from the given entry of the scatterlist. 
* **/ static inline void sg_unmark_end(struct scatterlist *sg) { sg->page_link &= ~SG_END; } /* * On 64-bit architectures there is a 4-byte padding in struct scatterlist * (assuming also CONFIG_NEED_SG_DMA_LENGTH is set). Use this padding for DMA * flags bits to indicate when a specific dma address is a bus address or the * buffer may have been bounced via SWIOTLB. */ #ifdef CONFIG_NEED_SG_DMA_FLAGS #define SG_DMA_BUS_ADDRESS (1 << 0) #define SG_DMA_SWIOTLB (1 << 1) /** * sg_dma_is_bus_address - Return whether a given segment was marked * as a bus address * @sg: SG entry * * Description: * Returns true if sg_dma_mark_bus_address() has been called on * this segment. **/ static inline bool sg_dma_is_bus_address(struct scatterlist *sg) { return sg->dma_flags & SG_DMA_BUS_ADDRESS; } /** * sg_dma_mark_bus_address - Mark the scatterlist entry as a bus address * @sg: SG entry * * Description: * Marks the passed in sg entry to indicate that the dma_address is * a bus address and doesn't need to be unmapped. This should only be * used by dma_map_sg() implementations to mark bus addresses * so they can be properly cleaned up in dma_unmap_sg(). **/ static inline void sg_dma_mark_bus_address(struct scatterlist *sg) { sg->dma_flags |= SG_DMA_BUS_ADDRESS; } /** * sg_dma_unmark_bus_address - Unmark the scatterlist entry as a bus address * @sg: SG entry * * Description: * Clears the bus address mark. **/ static inline void sg_dma_unmark_bus_address(struct scatterlist *sg) { sg->dma_flags &= ~SG_DMA_BUS_ADDRESS; } /** * sg_dma_is_swiotlb - Return whether the scatterlist was marked for SWIOTLB * bouncing * @sg: SG entry * * Description: * Returns true if the scatterlist was marked for SWIOTLB bouncing. Not all * elements may have been bounced, so the caller would have to check * individual SG entries with swiotlb_find_pool(). */ static inline bool sg_dma_is_swiotlb(struct scatterlist *sg) { return sg->dma_flags & SG_DMA_SWIOTLB; } /** * sg_dma_mark_swiotlb - Mark the scatterlist for SWIOTLB bouncing * @sg: SG entry * * Description: * Marks a a scatterlist for SWIOTLB bounce. Not all SG entries may be * bounced. */ static inline void sg_dma_mark_swiotlb(struct scatterlist *sg) { sg->dma_flags |= SG_DMA_SWIOTLB; } #else static inline bool sg_dma_is_bus_address(struct scatterlist *sg) { return false; } static inline void sg_dma_mark_bus_address(struct scatterlist *sg) { } static inline void sg_dma_unmark_bus_address(struct scatterlist *sg) { } static inline bool sg_dma_is_swiotlb(struct scatterlist *sg) { return false; } static inline void sg_dma_mark_swiotlb(struct scatterlist *sg) { } #endif /* CONFIG_NEED_SG_DMA_FLAGS */ /** * sg_phys - Return physical address of an sg entry * @sg: SG entry * * Description: * This calls page_to_phys() on the page in this sg entry, and adds the * sg offset. The caller must know that it is legal to call page_to_phys() * on the sg page. * **/ static inline dma_addr_t sg_phys(struct scatterlist *sg) { return page_to_phys(sg_page(sg)) + sg->offset; } /** * sg_virt - Return virtual address of an sg entry * @sg: SG entry * * Description: * This calls page_address() on the page in this sg entry, and adds the * sg offset. The caller must know that the sg page has a valid virtual * mapping. 
* **/ static inline void *sg_virt(struct scatterlist *sg) { return page_address(sg_page(sg)) + sg->offset; } /** * sg_init_marker - Initialize markers in sg table * @sgl: The SG table * @nents: Number of entries in table * **/ static inline void sg_init_marker(struct scatterlist *sgl, unsigned int nents) { sg_mark_end(&sgl[nents - 1]); } int sg_nents(struct scatterlist *sg); int sg_nents_for_len(struct scatterlist *sg, u64 len); int sg_nents_for_dma(struct scatterlist *sgl, unsigned int sglen, size_t len); struct scatterlist *sg_last(struct scatterlist *s, unsigned int); void sg_init_table(struct scatterlist *, unsigned int); void sg_init_one(struct scatterlist *, const void *, unsigned int); int sg_split(struct scatterlist *in, const int in_mapped_nents, const off_t skip, const int nb_splits, const size_t *split_sizes, struct scatterlist **out, int *out_mapped_nents, gfp_t gfp_mask); typedef struct scatterlist *(sg_alloc_fn)(unsigned int, gfp_t); typedef void (sg_free_fn)(struct scatterlist *, unsigned int); void __sg_free_table(struct sg_table *, unsigned int, unsigned int, sg_free_fn *, unsigned int); void sg_free_table(struct sg_table *); void sg_free_append_table(struct sg_append_table *sgt); int __sg_alloc_table(struct sg_table *, unsigned int, unsigned int, struct scatterlist *, unsigned int, gfp_t, sg_alloc_fn *); int sg_alloc_table(struct sg_table *, unsigned int, gfp_t); int sg_alloc_append_table_from_pages(struct sg_append_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, unsigned int max_segment, unsigned int left_pages, gfp_t gfp_mask); int sg_alloc_table_from_pages_segment(struct sg_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, unsigned int max_segment, gfp_t gfp_mask); /** * sg_alloc_table_from_pages - Allocate and initialize an sg table from * an array of pages * @sgt: The sg table header to use * @pages: Pointer to an array of page pointers * @n_pages: Number of pages in the pages array * @offset: Offset from start of the first page to the start of a buffer * @size: Number of valid bytes in the buffer (after offset) * @gfp_mask: GFP allocation mask * * Description: * Allocate and initialize an sg table from a list of pages. Contiguous * ranges of the pages are squashed into a single scatterlist node. A user * may provide an offset at a start and a size of valid data in a buffer * specified by the page array. The returned sg table is released by * sg_free_table. 
* * Returns: * 0 on success, negative error on failure */ static inline int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, gfp_t gfp_mask) { return sg_alloc_table_from_pages_segment(sgt, pages, n_pages, offset, size, UINT_MAX, gfp_mask); } #ifdef CONFIG_SGL_ALLOC struct scatterlist *sgl_alloc_order(unsigned long long length, unsigned int order, bool chainable, gfp_t gfp, unsigned int *nent_p); struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp, unsigned int *nent_p); void sgl_free_n_order(struct scatterlist *sgl, int nents, int order); void sgl_free_order(struct scatterlist *sgl, int order); void sgl_free(struct scatterlist *sgl); #endif /* CONFIG_SGL_ALLOC */ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip, bool to_buffer); size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents, const void *buf, size_t buflen); size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen); size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents, const void *buf, size_t buflen, off_t skip); size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip); size_t sg_zero_buffer(struct scatterlist *sgl, unsigned int nents, size_t buflen, off_t skip); /* * Maximum number of entries that will be allocated in one piece, if * a list larger than this is required then chaining will be utilized. */ #define SG_MAX_SINGLE_ALLOC (PAGE_SIZE / sizeof(struct scatterlist)) /* * The maximum number of SG segments that we will put inside a * scatterlist (unless chaining is used). Should ideally fit inside a * single page, to avoid a higher order allocation. We could define this * to SG_MAX_SINGLE_ALLOC to pack correctly at the highest order. The * minimum value is 32 */ #define SG_CHUNK_SIZE 128 /* * Like SG_CHUNK_SIZE, but for archs that have sg chaining. This limit * is totally arbitrary, a setting of 2048 will get you at least 8mb ios. */ #ifdef CONFIG_ARCH_NO_SG_CHAIN #define SG_MAX_SEGMENTS SG_CHUNK_SIZE #else #define SG_MAX_SEGMENTS 2048 #endif #ifdef CONFIG_SG_POOL void sg_free_table_chained(struct sg_table *table, unsigned nents_first_chunk); int sg_alloc_table_chained(struct sg_table *table, int nents, struct scatterlist *first_chunk, unsigned nents_first_chunk); #endif /* * sg page iterator * * Iterates over sg entries page-by-page. On each successful iteration, you * can call sg_page_iter_page(@piter) to get the current page. * @piter->sg will point to the sg holding this page and @piter->sg_pgoffset to * the page's page offset within the sg. The iteration will stop either when a * maximum number of sg entries was reached or a terminating sg * (sg_last(sg) == true) was reached. */ struct sg_page_iter { struct scatterlist *sg; /* sg holding the page */ unsigned int sg_pgoffset; /* page offset within the sg */ /* these are internal states, keep away */ unsigned int __nents; /* remaining sg entries */ int __pg_advance; /* nr pages to advance at the * next step */ }; /* * sg page iterator for DMA addresses * * This is the same as sg_page_iter however you can call * sg_page_iter_dma_address(@dma_iter) to get the page's DMA * address. sg_page_iter_page() cannot be called on this iterator. 
*/ struct sg_dma_page_iter { struct sg_page_iter base; }; bool __sg_page_iter_next(struct sg_page_iter *piter); bool __sg_page_iter_dma_next(struct sg_dma_page_iter *dma_iter); void __sg_page_iter_start(struct sg_page_iter *piter, struct scatterlist *sglist, unsigned int nents, unsigned long pgoffset); /** * sg_page_iter_page - get the current page held by the page iterator * @piter: page iterator holding the page */ static inline struct page *sg_page_iter_page(struct sg_page_iter *piter) { return sg_page(piter->sg) + piter->sg_pgoffset; } /** * sg_page_iter_dma_address - get the dma address of the current page held by * the page iterator. * @dma_iter: page iterator holding the page */ static inline dma_addr_t sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter) { return sg_dma_address(dma_iter->base.sg) + (dma_iter->base.sg_pgoffset << PAGE_SHIFT); } /** * for_each_sg_page - iterate over the pages of the given sg list * @sglist: sglist to iterate over * @piter: page iterator to hold current page, sg, sg_pgoffset * @nents: maximum number of sg entries to iterate over * @pgoffset: starting page offset (in pages) * * Callers may use sg_page_iter_page() to get each page pointer. * In each loop it operates on PAGE_SIZE unit. */ #define for_each_sg_page(sglist, piter, nents, pgoffset) \ for (__sg_page_iter_start((piter), (sglist), (nents), (pgoffset)); \ __sg_page_iter_next(piter);) /** * for_each_sg_dma_page - iterate over the pages of the given sg list * @sglist: sglist to iterate over * @dma_iter: DMA page iterator to hold current page * @dma_nents: maximum number of sg entries to iterate over, this is the value * returned from dma_map_sg * @pgoffset: starting page offset (in pages) * * Callers may use sg_page_iter_dma_address() to get each page's DMA address. * In each loop it operates on PAGE_SIZE unit. */ #define for_each_sg_dma_page(sglist, dma_iter, dma_nents, pgoffset) \ for (__sg_page_iter_start(&(dma_iter)->base, sglist, dma_nents, \ pgoffset); \ __sg_page_iter_dma_next(dma_iter);) /** * for_each_sgtable_page - iterate over all pages in the sg_table object * @sgt: sg_table object to iterate over * @piter: page iterator to hold current page * @pgoffset: starting page offset (in pages) * * Iterates over the all memory pages in the buffer described by * a scatterlist stored in the given sg_table object. * See also for_each_sg_page(). In each loop it operates on PAGE_SIZE unit. */ #define for_each_sgtable_page(sgt, piter, pgoffset) \ for_each_sg_page((sgt)->sgl, piter, (sgt)->orig_nents, pgoffset) /** * for_each_sgtable_dma_page - iterate over the DMA mapped sg_table object * @sgt: sg_table object to iterate over * @dma_iter: DMA page iterator to hold current page * @pgoffset: starting page offset (in pages) * * Iterates over the all DMA mapped pages in the buffer described by * a scatterlist stored in the given sg_table object. * See also for_each_sg_dma_page(). In each loop it operates on PAGE_SIZE * unit. */ #define for_each_sgtable_dma_page(sgt, dma_iter, pgoffset) \ for_each_sg_dma_page((sgt)->sgl, dma_iter, (sgt)->nents, pgoffset) /* * Mapping sg iterator * * Iterates over sg entries mapping page-by-page. On each successful * iteration, @miter->page points to the mapped page and * @miter->length bytes of data can be accessed at @miter->addr. As * long as an iteration is enclosed between start and stop, the user * is free to choose control structure and when to stop. * * @miter->consumed is set to @miter->length on each iteration. 
It * can be adjusted if the user can't consume all the bytes in one go. * Also, a stopped iteration can be resumed by calling next on it. * This is useful when iteration needs to release all resources and * continue later (e.g. at the next interrupt). */ #define SG_MITER_ATOMIC (1 << 0) /* use kmap_atomic */ #define SG_MITER_TO_SG (1 << 1) /* flush back to phys on unmap */ #define SG_MITER_FROM_SG (1 << 2) /* nop */ #define SG_MITER_LOCAL (1 << 3) /* use kmap_local */ struct sg_mapping_iter { /* the following three fields can be accessed directly */ struct page *page; /* currently mapped page */ void *addr; /* pointer to the mapped area */ size_t length; /* length of the mapped area */ size_t consumed; /* number of consumed bytes */ struct sg_page_iter piter; /* page iterator */ /* these are internal states, keep away */ unsigned int __offset; /* offset within page */ unsigned int __remaining; /* remaining bytes on page */ unsigned int __flags; }; void sg_miter_start(struct sg_mapping_iter *miter, struct scatterlist *sgl, unsigned int nents, unsigned int flags); bool sg_miter_skip(struct sg_mapping_iter *miter, off_t offset); bool sg_miter_next(struct sg_mapping_iter *miter); void sg_miter_stop(struct sg_mapping_iter *miter); #endif /* _LINUX_SCATTERLIST_H */ |
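/*
 * Editor's illustration (not part of the header above): a hedged sketch of
 * typical scatterlist usage, built only from the API declared in this
 * header. The function name, buffer sizes and error handling are
 * hypothetical; a real driver would normally go on to dma_map_sg() the
 * table before handing it to hardware.
 */
#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>

static int scatterlist_usage_sketch(void)
{
	struct scatterlist sgl[2];
	struct scatterlist *sg;
	void *a, *b;
	int i;

	a = kmalloc(512, GFP_KERNEL);
	b = kmalloc(1024, GFP_KERNEL);
	if (!a || !b)
		goto nomem;

	/* Zeroes the array and marks the last entry with SG_END */
	sg_init_table(sgl, ARRAY_SIZE(sgl));
	sg_set_buf(&sgl[0], a, 512);
	sg_set_buf(&sgl[1], b, 1024);

	/* sg_next() inside for_each_sg() follows chain entries transparently */
	for_each_sg(sgl, sg, ARRAY_SIZE(sgl), i)
		pr_info("sg[%d]: %u bytes at %p\n", i, sg->length, sg_virt(sg));

	kfree(b);
	kfree(a);
	return 0;
nomem:
	kfree(b);
	kfree(a);
	return -ENOMEM;
}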
| 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM icmp #if !defined(_TRACE_ICMP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_ICMP_H #include <linux/icmp.h> #include <linux/tracepoint.h> TRACE_EVENT(icmp_send, TP_PROTO(const struct sk_buff *skb, int type, int code), TP_ARGS(skb, type, code), TP_STRUCT__entry( __field(const void *, skbaddr) __field(int, type) __field(int, code) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __field(__u16, sport) __field(__u16, dport) __field(unsigned short, ulen) ), TP_fast_assign( struct iphdr *iph = ip_hdr(skb); struct udphdr *uh = udp_hdr(skb); int proto_4 = iph->protocol; __be32 *p32; __entry->skbaddr = skb; __entry->type = type; __entry->code = code; if (proto_4 != IPPROTO_UDP || (u8 *)uh < skb->head || (u8 *)uh + sizeof(struct udphdr) > skb_tail_pointer(skb)) { __entry->sport = 0; __entry->dport = 0; __entry->ulen = 0; } else { __entry->sport = ntohs(uh->source); __entry->dport = ntohs(uh->dest); __entry->ulen = ntohs(uh->len); } p32 = (__be32 *) __entry->saddr; *p32 = iph->saddr; p32 = (__be32 *) __entry->daddr; *p32 = iph->daddr; ), TP_printk("icmp_send: type=%d, code=%d. From %pI4:%u to %pI4:%u ulen=%d skbaddr=%p", __entry->type, __entry->code, __entry->saddr, __entry->sport, __entry->daddr, __entry->dport, __entry->ulen, __entry->skbaddr) ); #endif /* _TRACE_ICMP_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
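/*
 * Editor's illustration (not part of the trace header above): a hedged
 * sketch of how a TRACE_EVENT like icmp_send is typically wired up. Exactly
 * one compilation unit defines the tracepoint bodies by setting
 * CREATE_TRACE_POINTS before including the trace header; any other caller
 * just includes the header and invokes trace_icmp_send(). The include path
 * follows the usual TRACE_SYSTEM convention and the surrounding function is
 * a hypothetical stand-in for the real ICMP error path.
 */
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/udp.h>

#define CREATE_TRACE_POINTS
#include <trace/events/icmp.h>

static void icmp_send_trace_sketch(const struct sk_buff *skb_in, int type, int code)
{
	/* Compiles to a static branch; fires only when icmp:icmp_send is enabled */
	trace_icmp_send(skb_in, type, code);
}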
5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 | // SPDX-License-Identifier: GPL-2.0 /* * Block multiqueue core code * * Copyright (C) 2013-2014 Jens Axboe * Copyright (C) 2013-2014 Christoph Hellwig */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/backing-dev.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/blk-integrity.h> #include <linux/kmemleak.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/workqueue.h> #include <linux/smp.h> #include <linux/interrupt.h> #include <linux/llist.h> #include <linux/cpu.h> #include <linux/cache.h> #include <linux/sched/topology.h> #include <linux/sched/signal.h> #include <linux/suspend.h> #include <linux/delay.h> #include <linux/crash_dump.h> #include <linux/prefetch.h> #include <linux/blk-crypto.h> #include <linux/part_stat.h> #include <linux/sched/isolation.h> #include <trace/events/block.h> #include <linux/t10-pi.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-pm.h" #include "blk-stat.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd); static DEFINE_MUTEX(blk_mq_cpuhp_lock); static void blk_mq_insert_request(struct request *rq, blk_insert_t flags); static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags); static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list); static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob, unsigned int flags); /* * Check if any of the ctx, dispatch list or elevator * have pending work in this hardware queue. 
*/ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { return !list_empty_careful(&hctx->dispatch) || sbitmap_any_bit_set(&hctx->ctx_map) || blk_mq_sched_has_work(hctx); } /* * Mark this ctx as having pending work in this hardware queue */ static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { const int bit = ctx->index_hw[hctx->type]; if (!sbitmap_test_bit(&hctx->ctx_map, bit)) sbitmap_set_bit(&hctx->ctx_map, bit); } static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { const int bit = ctx->index_hw[hctx->type]; sbitmap_clear_bit(&hctx->ctx_map, bit); } struct mq_inflight { struct block_device *part; unsigned int inflight[2]; }; static bool blk_mq_check_in_driver(struct request *rq, void *priv) { struct mq_inflight *mi = priv; if (rq->rq_flags & RQF_IO_STAT && (!bdev_is_partition(mi->part) || rq->part == mi->part) && blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) mi->inflight[rq_data_dir(rq)]++; return true; } void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2]) { struct mq_inflight mi = { .part = part }; blk_mq_queue_tag_busy_iter(bdev_get_queue(part), blk_mq_check_in_driver, &mi); inflight[READ] = mi.inflight[READ]; inflight[WRITE] = mi.inflight[WRITE]; } #ifdef CONFIG_LOCKDEP static bool blk_freeze_set_owner(struct request_queue *q, struct task_struct *owner) { if (!owner) return false; if (!q->mq_freeze_depth) { q->mq_freeze_owner = owner; q->mq_freeze_owner_depth = 1; q->mq_freeze_disk_dead = !q->disk || test_bit(GD_DEAD, &q->disk->state) || !blk_queue_registered(q); q->mq_freeze_queue_dying = blk_queue_dying(q); return true; } if (owner == q->mq_freeze_owner) q->mq_freeze_owner_depth += 1; return false; } /* verify the last unfreeze in owner context */ static bool blk_unfreeze_check_owner(struct request_queue *q) { if (q->mq_freeze_owner != current) return false; if (--q->mq_freeze_owner_depth == 0) { q->mq_freeze_owner = NULL; return true; } return false; } #else static bool blk_freeze_set_owner(struct request_queue *q, struct task_struct *owner) { return false; } static bool blk_unfreeze_check_owner(struct request_queue *q) { return false; } #endif bool __blk_freeze_queue_start(struct request_queue *q, struct task_struct *owner) { bool freeze; mutex_lock(&q->mq_freeze_lock); freeze = blk_freeze_set_owner(q, owner); if (++q->mq_freeze_depth == 1) { percpu_ref_kill(&q->q_usage_counter); mutex_unlock(&q->mq_freeze_lock); if (queue_is_mq(q)) blk_mq_run_hw_queues(q, false); } else { mutex_unlock(&q->mq_freeze_lock); } return freeze; } void blk_freeze_queue_start(struct request_queue *q) { if (__blk_freeze_queue_start(q, current)) blk_freeze_acquire_lock(q); } EXPORT_SYMBOL_GPL(blk_freeze_queue_start); void blk_mq_freeze_queue_wait(struct request_queue *q) { wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout) { return wait_event_timeout(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter), timeout); } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout); void blk_mq_freeze_queue_nomemsave(struct request_queue *q) { blk_freeze_queue_start(q); blk_mq_freeze_queue_wait(q); } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_nomemsave); bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) { bool unfreeze; mutex_lock(&q->mq_freeze_lock); if (force_atomic) q->q_usage_counter.data->force_atomic = true; q->mq_freeze_depth--; 
WARN_ON_ONCE(q->mq_freeze_depth < 0); if (!q->mq_freeze_depth) { percpu_ref_resurrect(&q->q_usage_counter); wake_up_all(&q->mq_freeze_wq); } unfreeze = blk_unfreeze_check_owner(q); mutex_unlock(&q->mq_freeze_lock); return unfreeze; } void blk_mq_unfreeze_queue_nomemrestore(struct request_queue *q) { if (__blk_mq_unfreeze_queue(q, false)) blk_unfreeze_release_lock(q); } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_nomemrestore); /* * non_owner variant of blk_freeze_queue_start * * Unlike blk_freeze_queue_start, the queue doesn't need to be unfrozen * by the same task. This is fragile and should not be used if at all * possible. */ void blk_freeze_queue_start_non_owner(struct request_queue *q) { __blk_freeze_queue_start(q, NULL); } EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner); /* non_owner variant of blk_mq_unfreeze_queue */ void blk_mq_unfreeze_queue_non_owner(struct request_queue *q) { __blk_mq_unfreeze_queue(q, false); } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_non_owner); /* * FIXME: replace the scsi_internal_device_*block_nowait() calls in the * mpt3sas driver such that this function can be removed. */ void blk_mq_quiesce_queue_nowait(struct request_queue *q) { unsigned long flags; spin_lock_irqsave(&q->queue_lock, flags); if (!q->quiesce_depth++) blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q); spin_unlock_irqrestore(&q->queue_lock, flags); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); /** * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done * @set: tag_set to wait on * * Note: it is driver's responsibility for making sure that quiesce has * been started on or more of the request_queues of the tag_set. This * function only waits for the quiesce on those request_queues that had * the quiesce flag set using blk_mq_quiesce_queue_nowait. */ void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set) { if (set->flags & BLK_MQ_F_BLOCKING) synchronize_srcu(set->srcu); else synchronize_rcu(); } EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done); /** * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished * @q: request queue. * * Note: this function does not prevent that the struct request end_io() * callback function is invoked. Once this function is returned, we make * sure no dispatch can happen until the queue is unquiesced via * blk_mq_unquiesce_queue(). */ void blk_mq_quiesce_queue(struct request_queue *q) { blk_mq_quiesce_queue_nowait(q); /* nothing to wait for non-mq queues */ if (queue_is_mq(q)) blk_mq_wait_quiesce_done(q->tag_set); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); /* * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue() * @q: request queue. * * This function recovers queue into the state before quiescing * which is done by blk_mq_quiesce_queue. 
*/ void blk_mq_unquiesce_queue(struct request_queue *q) { unsigned long flags; bool run_queue = false; spin_lock_irqsave(&q->queue_lock, flags); if (WARN_ON_ONCE(q->quiesce_depth <= 0)) { ; } else if (!--q->quiesce_depth) { blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q); run_queue = true; } spin_unlock_irqrestore(&q->queue_lock, flags); /* dispatch requests which are inserted during quiescing */ if (run_queue) blk_mq_run_hw_queues(q, true); } EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set) { struct request_queue *q; rcu_read_lock(); list_for_each_entry_rcu(q, &set->tag_list, tag_set_list) { if (!blk_queue_skip_tagset_quiesce(q)) blk_mq_quiesce_queue_nowait(q); } rcu_read_unlock(); blk_mq_wait_quiesce_done(set); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset); void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set) { struct request_queue *q; rcu_read_lock(); list_for_each_entry_rcu(q, &set->tag_list, tag_set_list) { if (!blk_queue_skip_tagset_quiesce(q)) blk_mq_unquiesce_queue(q); } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset); void blk_mq_wake_waiters(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_wakeup_all(hctx->tags, true); } void blk_rq_init(struct request_queue *q, struct request *rq) { memset(rq, 0, sizeof(*rq)); INIT_LIST_HEAD(&rq->queuelist); rq->q = q; rq->__sector = (sector_t) -1; rq->phys_gap_bit = 0; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = BLK_MQ_NO_TAG; rq->start_time_ns = blk_time_get_ns(); blk_crypto_rq_set_defaults(rq); } EXPORT_SYMBOL(blk_rq_init); /* Set start and alloc time when the allocated request is actually used */ static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) { #ifdef CONFIG_BLK_RQ_ALLOC_TIME if (blk_queue_rq_alloc_time(rq->q)) rq->alloc_time_ns = alloc_time_ns; else rq->alloc_time_ns = 0; #endif } static inline void blk_mq_bio_issue_init(struct request_queue *q, struct bio *bio) { #ifdef CONFIG_BLK_CGROUP if (test_bit(QUEUE_FLAG_BIO_ISSUE_TIME, &q->queue_flags)) bio->issue_time_ns = blk_time_get_ns(); #endif } static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, struct blk_mq_tags *tags, unsigned int tag) { struct blk_mq_ctx *ctx = data->ctx; struct blk_mq_hw_ctx *hctx = data->hctx; struct request_queue *q = data->q; struct request *rq = tags->static_rqs[tag]; rq->q = q; rq->mq_ctx = ctx; rq->mq_hctx = hctx; rq->cmd_flags = data->cmd_flags; if (data->flags & BLK_MQ_REQ_PM) data->rq_flags |= RQF_PM; rq->rq_flags = data->rq_flags; if (data->rq_flags & RQF_SCHED_TAGS) { rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = tag; } else { rq->tag = tag; rq->internal_tag = BLK_MQ_NO_TAG; } rq->timeout = 0; rq->part = NULL; rq->io_start_time_ns = 0; rq->stats_sectors = 0; rq->nr_phys_segments = 0; rq->nr_integrity_segments = 0; rq->end_io = NULL; rq->end_io_data = NULL; blk_crypto_rq_set_defaults(rq); INIT_LIST_HEAD(&rq->queuelist); /* tag was already set */ WRITE_ONCE(rq->deadline, 0); req_ref_set(rq, 1); if (rq->rq_flags & RQF_USE_SCHED) { struct elevator_queue *e = data->q->elevator; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); if (e->type->ops.prepare_request) e->type->ops.prepare_request(rq); } return rq; } static inline struct request * __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data) { unsigned int tag, tag_offset; struct blk_mq_tags *tags; struct request *rq; unsigned long 
tag_mask; int i, nr = 0; do { tag_mask = blk_mq_get_tags(data, data->nr_tags - nr, &tag_offset); if (unlikely(!tag_mask)) { if (nr == 0) return NULL; break; } tags = blk_mq_tags_from_data(data); for (i = 0; tag_mask; i++) { if (!(tag_mask & (1UL << i))) continue; tag = tag_offset + i; prefetch(tags->static_rqs[tag]); tag_mask &= ~(1UL << i); rq = blk_mq_rq_ctx_init(data, tags, tag); rq_list_add_head(data->cached_rqs, rq); nr++; } } while (data->nr_tags > nr); if (!(data->rq_flags & RQF_SCHED_TAGS)) blk_mq_add_active_requests(data->hctx, nr); /* caller already holds a reference, add for remainder */ percpu_ref_get_many(&data->q->q_usage_counter, nr - 1); data->nr_tags -= nr; return rq_list_pop(data->cached_rqs); } static void blk_mq_limit_depth(struct blk_mq_alloc_data *data) { struct elevator_mq_ops *ops; /* If no I/O scheduler has been configured, don't limit requests */ if (!data->q->elevator) { blk_mq_tag_busy(data->hctx); return; } /* * All requests use scheduler tags when an I/O scheduler is * enabled for the queue. */ data->rq_flags |= RQF_SCHED_TAGS; /* * Flush/passthrough requests are special and go directly to the * dispatch list, they are not subject to the async_depth limit. */ if ((data->cmd_flags & REQ_OP_MASK) == REQ_OP_FLUSH || blk_op_is_passthrough(data->cmd_flags)) return; WARN_ON_ONCE(data->flags & BLK_MQ_REQ_RESERVED); data->rq_flags |= RQF_USE_SCHED; /* * By default, sync requests have no limit, and async requests are * limited to async_depth. */ ops = &data->q->elevator->type->ops; if (ops->limit_depth) ops->limit_depth(data->cmd_flags, data); } static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) { struct request_queue *q = data->q; u64 alloc_time_ns = 0; struct request *rq; unsigned int tag; /* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q)) alloc_time_ns = blk_time_get_ns(); if (data->cmd_flags & REQ_NOWAIT) data->flags |= BLK_MQ_REQ_NOWAIT; retry: data->ctx = blk_mq_get_ctx(q); data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx); blk_mq_limit_depth(data); if (data->flags & BLK_MQ_REQ_RESERVED) data->rq_flags |= RQF_RESV; /* * Try batched alloc if we want more than 1 tag. */ if (data->nr_tags > 1) { rq = __blk_mq_alloc_requests_batch(data); if (rq) { blk_mq_rq_time_init(rq, alloc_time_ns); return rq; } data->nr_tags = 1; } /* * Waiting allocations only fail because of an inactive hctx. In that * case just retry the hctx assignment and tag allocation as CPU hotplug * should have migrated us to an online CPU by now. */ tag = blk_mq_get_tag(data); if (tag == BLK_MQ_NO_TAG) { if (data->flags & BLK_MQ_REQ_NOWAIT) return NULL; /* * Give up the CPU and sleep for a random short time to * ensure that thread using a realtime scheduling class * are migrated off the CPU, and thus off the hctx that * is going away. 
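 *
 * Retrying re-runs blk_mq_get_ctx() and blk_mq_map_queue() above, so after
 * the sleep the allocation is made against whichever hctx the current
 * (online) CPU maps to.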
*/ msleep(3); goto retry; } if (!(data->rq_flags & RQF_SCHED_TAGS)) blk_mq_inc_active_requests(data->hctx); rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag); blk_mq_rq_time_init(rq, alloc_time_ns); return rq; } static struct request *blk_mq_rq_cache_fill(struct request_queue *q, struct blk_plug *plug, blk_opf_t opf, blk_mq_req_flags_t flags) { struct blk_mq_alloc_data data = { .q = q, .flags = flags, .shallow_depth = 0, .cmd_flags = opf, .rq_flags = 0, .nr_tags = plug->nr_ios, .cached_rqs = &plug->cached_rqs, .ctx = NULL, .hctx = NULL }; struct request *rq; if (blk_queue_enter(q, flags)) return NULL; plug->nr_ios = 1; rq = __blk_mq_alloc_requests(&data); if (unlikely(!rq)) blk_queue_exit(q); return rq; } static struct request *blk_mq_alloc_cached_request(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags) { struct blk_plug *plug = current->plug; struct request *rq; if (!plug) return NULL; if (rq_list_empty(&plug->cached_rqs)) { if (plug->nr_ios == 1) return NULL; rq = blk_mq_rq_cache_fill(q, plug, opf, flags); if (!rq) return NULL; } else { rq = rq_list_peek(&plug->cached_rqs); if (!rq || rq->q != q) return NULL; if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type) return NULL; if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) return NULL; rq_list_pop(&plug->cached_rqs); blk_mq_rq_time_init(rq, blk_time_get_ns()); } rq->cmd_flags = opf; INIT_LIST_HEAD(&rq->queuelist); return rq; } struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags) { struct request *rq; rq = blk_mq_alloc_cached_request(q, opf, flags); if (!rq) { struct blk_mq_alloc_data data = { .q = q, .flags = flags, .shallow_depth = 0, .cmd_flags = opf, .rq_flags = 0, .nr_tags = 1, .cached_rqs = NULL, .ctx = NULL, .hctx = NULL }; int ret; ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); rq = __blk_mq_alloc_requests(&data); if (!rq) goto out_queue_exit; } rq->__data_len = 0; rq->phys_gap_bit = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; return rq; out_queue_exit: blk_queue_exit(q); return ERR_PTR(-EWOULDBLOCK); } EXPORT_SYMBOL(blk_mq_alloc_request); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags, unsigned int hctx_idx) { struct blk_mq_alloc_data data = { .q = q, .flags = flags, .shallow_depth = 0, .cmd_flags = opf, .rq_flags = 0, .nr_tags = 1, .cached_rqs = NULL, .ctx = NULL, .hctx = NULL }; u64 alloc_time_ns = 0; struct request *rq; unsigned int cpu; unsigned int tag; int ret; /* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q)) alloc_time_ns = blk_time_get_ns(); /* * If the tag allocator sleeps we could get an allocation for a * different hardware context. No need to complicate the low level * allocator for this for the rare use case of a command tied to * a specific queue. */ if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) || WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED))) return ERR_PTR(-EINVAL); if (hctx_idx >= q->nr_hw_queues) return ERR_PTR(-EIO); ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); /* * Check if the hardware context is actually mapped to anything. * If not tell the caller that it should skip this queue. 
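 * -EXDEV is the "skip this hardware queue" signal here: it is returned both
 * when the hctx is unmapped and when none of its CPUs are online.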
*/ ret = -EXDEV; data.hctx = q->queue_hw_ctx[hctx_idx]; if (!blk_mq_hw_queue_mapped(data.hctx)) goto out_queue_exit; cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); if (cpu >= nr_cpu_ids) goto out_queue_exit; data.ctx = __blk_mq_get_ctx(q, cpu); if (q->elevator) data.rq_flags |= RQF_SCHED_TAGS; else blk_mq_tag_busy(data.hctx); if (flags & BLK_MQ_REQ_RESERVED) data.rq_flags |= RQF_RESV; ret = -EWOULDBLOCK; tag = blk_mq_get_tag(&data); if (tag == BLK_MQ_NO_TAG) goto out_queue_exit; if (!(data.rq_flags & RQF_SCHED_TAGS)) blk_mq_inc_active_requests(data.hctx); rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag); blk_mq_rq_time_init(rq, alloc_time_ns); rq->__data_len = 0; rq->phys_gap_bit = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; return rq; out_queue_exit: blk_queue_exit(q); return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); static void blk_mq_finish_request(struct request *rq) { struct request_queue *q = rq->q; blk_zone_finish_request(rq); if (rq->rq_flags & RQF_USE_SCHED) { q->elevator->type->ops.finish_request(rq); /* * For postflush request that may need to be * completed twice, we should clear this flag * to avoid double finish_request() on the rq. */ rq->rq_flags &= ~RQF_USE_SCHED; } } static void __blk_mq_free_request(struct request *rq) { struct request_queue *q = rq->q; struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; const int sched_tag = rq->internal_tag; blk_crypto_free_request(rq); blk_pm_mark_last_busy(rq); rq->mq_hctx = NULL; if (rq->tag != BLK_MQ_NO_TAG) { blk_mq_dec_active_requests(hctx); blk_mq_put_tag(hctx->tags, ctx, rq->tag); } if (sched_tag != BLK_MQ_NO_TAG) blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag); blk_mq_sched_restart(hctx); blk_queue_exit(q); } void blk_mq_free_request(struct request *rq) { struct request_queue *q = rq->q; blk_mq_finish_request(rq); rq_qos_done(q, rq); WRITE_ONCE(rq->state, MQ_RQ_IDLE); if (req_ref_put_and_test(rq)) __blk_mq_free_request(rq); } EXPORT_SYMBOL_GPL(blk_mq_free_request); void blk_mq_free_plug_rqs(struct blk_plug *plug) { struct request *rq; while ((rq = rq_list_pop(&plug->cached_rqs)) != NULL) blk_mq_free_request(rq); } void blk_dump_rq_flags(struct request *rq, char *msg) { printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg, rq->q->disk ? rq->q->disk->disk_name : "?", (__force unsigned long long) rq->cmd_flags); printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); printk(KERN_INFO " bio %p, biotail %p, len %u\n", rq->bio, rq->biotail, blk_rq_bytes(rq)); } EXPORT_SYMBOL(blk_dump_rq_flags); static void blk_account_io_completion(struct request *req, unsigned int bytes) { if (req->rq_flags & RQF_IO_STAT) { const int sgrp = op_stat_group(req_op(req)); part_stat_lock(); part_stat_add(req->part, sectors[sgrp], bytes >> 9); part_stat_unlock(); } } static void blk_print_req_error(struct request *req, blk_status_t status) { printk_ratelimited(KERN_ERR "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " "phys_seg %u prio class %u\n", blk_status_to_str(status), req->q->disk ? req->q->disk->disk_name : "?", blk_rq_pos(req), (__force u32)req_op(req), blk_op_str(req_op(req)), (__force u32)(req->cmd_flags & ~REQ_OP_MASK), req->nr_phys_segments, IOPRIO_PRIO_CLASS(req_get_ioprio(req))); } /* * Fully end IO on a request. Does not support partial completions, or * errors. 
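 * The batched completion path (blk_mq_end_request_batch()) uses this instead
 * of the more general blk_update_request().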
*/ static void blk_complete_request(struct request *req) { const bool is_flush = (req->rq_flags & RQF_FLUSH_SEQ) != 0; int total_bytes = blk_rq_bytes(req); struct bio *bio = req->bio; trace_block_rq_complete(req, BLK_STS_OK, total_bytes); if (!bio) return; if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ) blk_integrity_complete(req, total_bytes); /* * Upper layers may call blk_crypto_evict_key() anytime after the last * bio_endio(). Therefore, the keyslot must be released before that. */ blk_crypto_rq_put_keyslot(req); blk_account_io_completion(req, total_bytes); do { struct bio *next = bio->bi_next; /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); if (blk_req_bio_is_zone_append(req, bio)) blk_zone_append_update_request_bio(req, bio); if (!is_flush) bio_endio(bio); bio = next; } while (bio); /* * Reset counters so that the request stacking driver * can find how many bytes remain in the request * later. */ if (!req->end_io) { req->bio = NULL; req->__data_len = 0; } } /** * blk_update_request - Complete multiple bytes without completing the request * @req: the request being processed * @error: block status code * @nr_bytes: number of bytes to complete for @req * * Description: * Ends I/O on a number of bytes attached to @req, but doesn't complete * the request structure even if @req doesn't have leftover. * If @req has leftover, sets it up for the next range of segments. * * Passing the result of blk_rq_bytes() as @nr_bytes guarantees * %false return from this function. * * Note: * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function * except in the consistency check at the end of this function. * * Return: * %false - this request doesn't have any more data * %true - this request has more data **/ bool blk_update_request(struct request *req, blk_status_t error, unsigned int nr_bytes) { bool is_flush = req->rq_flags & RQF_FLUSH_SEQ; bool quiet = req->rq_flags & RQF_QUIET; int total_bytes; trace_block_rq_complete(req, error, nr_bytes); if (!req->bio) return false; if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && error == BLK_STS_OK) blk_integrity_complete(req, nr_bytes); /* * Upper layers may call blk_crypto_evict_key() anytime after the last * bio_endio(). Therefore, the keyslot must be released before that. */ if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req)) __blk_crypto_rq_put_keyslot(req); if (unlikely(error && !blk_rq_is_passthrough(req) && !quiet) && !test_bit(GD_DEAD, &req->q->disk->state)) { blk_print_req_error(req, error); trace_block_rq_error(req, error, nr_bytes); } blk_account_io_completion(req, nr_bytes); total_bytes = 0; while (req->bio) { struct bio *bio = req->bio; unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); if (unlikely(error)) bio->bi_status = error; if (bio_bytes == bio->bi_iter.bi_size) { req->bio = bio->bi_next; } else if (bio_is_zone_append(bio) && error == BLK_STS_OK) { /* * Partial zone append completions cannot be supported * as the BIO fragments may end up not being written * sequentially. 
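 * Fail the whole bio with BLK_STS_IOERR rather than completing it partially.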
*/ bio->bi_status = BLK_STS_IOERR; } /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); if (unlikely(quiet)) bio_set_flag(bio, BIO_QUIET); bio_advance(bio, bio_bytes); /* Don't actually finish bio if it's part of flush sequence */ if (!bio->bi_iter.bi_size) { if (blk_req_bio_is_zone_append(req, bio)) blk_zone_append_update_request_bio(req, bio); if (!is_flush) bio_endio(bio); } total_bytes += bio_bytes; nr_bytes -= bio_bytes; if (!nr_bytes) break; } /* * completely done */ if (!req->bio) { /* * Reset counters so that the request stacking driver * can find how many bytes remain in the request * later. */ req->__data_len = 0; return false; } req->__data_len -= total_bytes; /* update sector only for requests with clear definition of sector */ if (!blk_rq_is_passthrough(req)) req->__sector += total_bytes >> 9; /* mixed attributes always follow the first bio */ if (req->rq_flags & RQF_MIXED_MERGE) { req->cmd_flags &= ~REQ_FAILFAST_MASK; req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK; } if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) { /* * If total number of sectors is less than the first segment * size, something has gone terribly wrong. */ if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { blk_dump_rq_flags(req, "request botched"); req->__data_len = blk_rq_cur_bytes(req); } /* recalculate the number of segments */ req->nr_phys_segments = blk_recalc_rq_segments(req); } return true; } EXPORT_SYMBOL_GPL(blk_update_request); static inline void blk_account_io_done(struct request *req, u64 now) { trace_block_io_done(req); /* * Account IO completion. flush_rq isn't accounted as a * normal IO on queueing nor completion. Accounting the * containing request is enough. */ if ((req->rq_flags & (RQF_IO_STAT|RQF_FLUSH_SEQ)) == RQF_IO_STAT) { const int sgrp = op_stat_group(req_op(req)); part_stat_lock(); update_io_ticks(req->part, jiffies, true); part_stat_inc(req->part, ios[sgrp]); part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); part_stat_local_dec(req->part, in_flight[op_is_write(req_op(req))]); part_stat_unlock(); } } static inline bool blk_rq_passthrough_stats(struct request *req) { struct bio *bio = req->bio; if (!blk_queue_passthrough_stat(req->q)) return false; /* Requests without a bio do not transfer data. */ if (!bio) return false; /* * Stats are accumulated in the bdev, so must have one attached to a * bio to track stats. Most drivers do not set the bdev for passthrough * requests, but nvme is one that will set it. */ if (!bio->bi_bdev) return false; /* * We don't know what a passthrough command does, but we know the * payload size and data direction. Ensuring the size is aligned to the * block size filters out most commands with payloads that don't * represent sector access. */ if (blk_rq_bytes(req) & (bdev_logical_block_size(bio->bi_bdev) - 1)) return false; return true; } static inline void blk_account_io_start(struct request *req) { trace_block_io_start(req); if (!blk_queue_io_stat(req->q)) return; if (blk_rq_is_passthrough(req) && !blk_rq_passthrough_stats(req)) return; req->rq_flags |= RQF_IO_STAT; req->start_time_ns = blk_time_get_ns(); /* * All non-passthrough requests are created from a bio with one * exception: when a flush command that is part of a flush sequence * generated by the state machine in blk-flush.c is cloned onto the * lower device by dm-multipath we can get here without a bio. 
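 * In that bio-less case the request is accounted against the whole-device
 * part0 below.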
*/ if (req->bio) req->part = req->bio->bi_bdev; else req->part = req->q->disk->part0; part_stat_lock(); update_io_ticks(req->part, jiffies, false); part_stat_local_inc(req->part, in_flight[op_is_write(req_op(req))]); part_stat_unlock(); } static inline void __blk_mq_end_request_acct(struct request *rq, u64 now) { if (rq->rq_flags & RQF_STATS) blk_stat_add(rq, now); blk_mq_sched_completed_request(rq, now); blk_account_io_done(rq, now); } inline void __blk_mq_end_request(struct request *rq, blk_status_t error) { if (blk_mq_need_time_stamp(rq)) __blk_mq_end_request_acct(rq, blk_time_get_ns()); blk_mq_finish_request(rq); if (rq->end_io) { rq_qos_done(rq->q, rq); if (rq->end_io(rq, error, NULL) == RQ_END_IO_FREE) blk_mq_free_request(rq); } else { blk_mq_free_request(rq); } } EXPORT_SYMBOL(__blk_mq_end_request); void blk_mq_end_request(struct request *rq, blk_status_t error) { if (blk_update_request(rq, error, blk_rq_bytes(rq))) BUG(); __blk_mq_end_request(rq, error); } EXPORT_SYMBOL(blk_mq_end_request); #define TAG_COMP_BATCH 32 static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx, int *tag_array, int nr_tags) { struct request_queue *q = hctx->queue; blk_mq_sub_active_requests(hctx, nr_tags); blk_mq_put_tags(hctx->tags, tag_array, nr_tags); percpu_ref_put_many(&q->q_usage_counter, nr_tags); } void blk_mq_end_request_batch(struct io_comp_batch *iob) { int tags[TAG_COMP_BATCH], nr_tags = 0; struct blk_mq_hw_ctx *cur_hctx = NULL; struct request *rq; u64 now = 0; if (iob->need_ts) now = blk_time_get_ns(); while ((rq = rq_list_pop(&iob->req_list)) != NULL) { prefetch(rq->bio); prefetch(rq->rq_next); blk_complete_request(rq); if (iob->need_ts) __blk_mq_end_request_acct(rq, now); blk_mq_finish_request(rq); rq_qos_done(rq->q, rq); /* * If end_io handler returns NONE, then it still has * ownership of the request. */ if (rq->end_io && rq->end_io(rq, 0, iob) == RQ_END_IO_NONE) continue; WRITE_ONCE(rq->state, MQ_RQ_IDLE); if (!req_ref_put_and_test(rq)) continue; blk_crypto_free_request(rq); blk_pm_mark_last_busy(rq); if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) { if (cur_hctx) blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags); nr_tags = 0; cur_hctx = rq->mq_hctx; } tags[nr_tags++] = rq->tag; } if (nr_tags) blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags); } EXPORT_SYMBOL_GPL(blk_mq_end_request_batch); static void blk_complete_reqs(struct llist_head *list) { struct llist_node *entry = llist_reverse_order(llist_del_all(list)); struct request *rq, *next; llist_for_each_entry_safe(rq, next, entry, ipi_list) rq->q->mq_ops->complete(rq); } static __latent_entropy void blk_done_softirq(void) { blk_complete_reqs(this_cpu_ptr(&blk_cpu_done)); } static int blk_softirq_cpu_dead(unsigned int cpu) { blk_complete_reqs(&per_cpu(blk_cpu_done, cpu)); return 0; } static void __blk_mq_complete_request_remote(void *data) { __raise_softirq_irqoff(BLOCK_SOFTIRQ); } static inline bool blk_mq_complete_need_ipi(struct request *rq) { int cpu = raw_smp_processor_id(); if (!IS_ENABLED(CONFIG_SMP) || !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) return false; /* * With force threaded interrupts enabled, raising softirq from an SMP * function call will always result in waking the ksoftirqd thread. * This is probably worse than completing the request on a different * cache domain. */ if (force_irqthreads()) return false; /* same CPU or cache domain and capacity? 
Complete locally */ if (cpu == rq->mq_ctx->cpu || (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) && cpus_share_cache(cpu, rq->mq_ctx->cpu) && cpus_equal_capacity(cpu, rq->mq_ctx->cpu))) return false; /* don't try to IPI to an offline CPU */ return cpu_online(rq->mq_ctx->cpu); } static void blk_mq_complete_send_ipi(struct request *rq) { unsigned int cpu; cpu = rq->mq_ctx->cpu; if (llist_add(&rq->ipi_list, &per_cpu(blk_cpu_done, cpu))) smp_call_function_single_async(cpu, &per_cpu(blk_cpu_csd, cpu)); } static void blk_mq_raise_softirq(struct request *rq) { struct llist_head *list; preempt_disable(); list = this_cpu_ptr(&blk_cpu_done); if (llist_add(&rq->ipi_list, list)) raise_softirq(BLOCK_SOFTIRQ); preempt_enable(); } bool blk_mq_complete_request_remote(struct request *rq) { WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); /* * For request which hctx has only one ctx mapping, * or a polled request, always complete locally, * it's pointless to redirect the completion. */ if ((rq->mq_hctx->nr_ctx == 1 && rq->mq_ctx->cpu == raw_smp_processor_id()) || rq->cmd_flags & REQ_POLLED) return false; if (blk_mq_complete_need_ipi(rq)) { blk_mq_complete_send_ipi(rq); return true; } if (rq->q->nr_hw_queues == 1) { blk_mq_raise_softirq(rq); return true; } return false; } EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote); /** * blk_mq_complete_request - end I/O on a request * @rq: the request being processed * * Description: * Complete a request by scheduling the ->complete_rq operation. **/ void blk_mq_complete_request(struct request *rq) { if (!blk_mq_complete_request_remote(rq)) rq->q->mq_ops->complete(rq); } EXPORT_SYMBOL(blk_mq_complete_request); /** * blk_mq_start_request - Start processing a request * @rq: Pointer to request to be started * * Function used by device drivers to notify the block layer that a request * is going to be processed now, so blk layer can do proper initializations * such as starting the timeout timer. */ void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; trace_block_rq_issue(rq); if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) && !blk_rq_is_passthrough(rq)) { rq->io_start_time_ns = blk_time_get_ns(); rq->stats_sectors = blk_rq_sectors(rq); rq->rq_flags |= RQF_STATS; rq_qos_issue(q, rq); } WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); blk_add_timer(rq); WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT); rq->mq_hctx->tags->rqs[rq->tag] = rq; if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) blk_integrity_prepare(rq); if (rq->bio && rq->bio->bi_opf & REQ_POLLED) WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num); } EXPORT_SYMBOL(blk_mq_start_request); /* * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple * queues. This is important for md arrays to benefit from merging * requests. 
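 *
 * blk_add_rq_to_plug() below flushes the plug once this limit is reached, so
 * the doubled limit simply gives multi-queue (stacked) setups a larger merge
 * window before the flush.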
*/ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) { if (plug->multiple_queues) return BLK_MAX_REQUEST_COUNT * 2; return BLK_MAX_REQUEST_COUNT; } static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) { struct request *last = rq_list_peek(&plug->mq_list); if (!plug->rq_count) { trace_block_plug(rq->q); } else if (plug->rq_count >= blk_plug_max_rq_count(plug) || (!blk_queue_nomerges(rq->q) && blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { blk_mq_flush_plug_list(plug, false); last = NULL; trace_block_plug(rq->q); } if (!plug->multiple_queues && last && last->q != rq->q) plug->multiple_queues = true; /* * Any request allocated from sched tags can't be issued to * ->queue_rqs() directly */ if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS)) plug->has_elevator = true; rq_list_add_tail(&plug->mq_list, rq); plug->rq_count++; } /** * blk_execute_rq_nowait - insert a request to I/O scheduler for execution * @rq: request to insert * @at_head: insert request at head or tail of queue * * Description: * Insert a fully prepared request at the back of the I/O scheduler queue * for execution. Don't wait for completion. * * Note: * This function will invoke @done directly if the queue is dead. */ void blk_execute_rq_nowait(struct request *rq, bool at_head) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; WARN_ON(irqs_disabled()); WARN_ON(!blk_rq_is_passthrough(rq)); blk_account_io_start(rq); if (current->plug && !at_head) { blk_add_rq_to_plug(current->plug, rq); return; } blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0); blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); } EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); struct blk_rq_wait { struct completion done; blk_status_t ret; }; static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret, const struct io_comp_batch *iob) { struct blk_rq_wait *wait = rq->end_io_data; wait->ret = ret; complete(&wait->done); return RQ_END_IO_NONE; } bool blk_rq_is_poll(struct request *rq) { if (!rq->mq_hctx) return false; if (rq->mq_hctx->type != HCTX_TYPE_POLL) return false; return true; } EXPORT_SYMBOL_GPL(blk_rq_is_poll); static void blk_rq_poll_completion(struct request *rq, struct completion *wait) { do { blk_hctx_poll(rq->q, rq->mq_hctx, NULL, BLK_POLL_ONESHOT); cond_resched(); } while (!completion_done(wait)); } /** * blk_execute_rq - insert a request into queue for execution * @rq: request to insert * @at_head: insert request at head or tail of queue * * Description: * Insert a fully prepared request at the back of the I/O scheduler queue * for execution and wait for completion. * Return: The blk_status_t result provided to blk_mq_end_request(). */ blk_status_t blk_execute_rq(struct request *rq, bool at_head) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct blk_rq_wait wait = { .done = COMPLETION_INITIALIZER_ONSTACK(wait.done), }; WARN_ON(irqs_disabled()); WARN_ON(!blk_rq_is_passthrough(rq)); rq->end_io_data = &wait; rq->end_io = blk_end_sync_rq; blk_account_io_start(rq); blk_mq_insert_request(rq, at_head ? 
BLK_MQ_INSERT_AT_HEAD : 0); blk_mq_run_hw_queue(hctx, false); if (blk_rq_is_poll(rq)) blk_rq_poll_completion(rq, &wait.done); else blk_wait_io(&wait.done); return wait.ret; } EXPORT_SYMBOL(blk_execute_rq); static void __blk_mq_requeue_request(struct request *rq) { struct request_queue *q = rq->q; blk_mq_put_driver_tag(rq); trace_block_rq_requeue(rq); rq_qos_requeue(q, rq); if (blk_mq_request_started(rq)) { WRITE_ONCE(rq->state, MQ_RQ_IDLE); rq->rq_flags &= ~RQF_TIMED_OUT; } } void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) { struct request_queue *q = rq->q; unsigned long flags; __blk_mq_requeue_request(rq); /* this request will be re-inserted to io scheduler queue */ blk_mq_sched_requeue_request(rq); spin_lock_irqsave(&q->requeue_lock, flags); list_add_tail(&rq->queuelist, &q->requeue_list); spin_unlock_irqrestore(&q->requeue_lock, flags); if (kick_requeue_list) blk_mq_kick_requeue_list(q); } EXPORT_SYMBOL(blk_mq_requeue_request); static void blk_mq_requeue_work(struct work_struct *work) { struct request_queue *q = container_of(work, struct request_queue, requeue_work.work); LIST_HEAD(rq_list); LIST_HEAD(flush_list); struct request *rq; spin_lock_irq(&q->requeue_lock); list_splice_init(&q->requeue_list, &rq_list); list_splice_init(&q->flush_list, &flush_list); spin_unlock_irq(&q->requeue_lock); while (!list_empty(&rq_list)) { rq = list_entry(rq_list.next, struct request, queuelist); list_del_init(&rq->queuelist); /* * If RQF_DONTPREP is set, the request has been started by the * driver already and might have driver-specific data allocated * already. Insert it into the hctx dispatch list to avoid * block layer merges for the request. */ if (rq->rq_flags & RQF_DONTPREP) blk_mq_request_bypass_insert(rq, 0); else blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD); } while (!list_empty(&flush_list)) { rq = list_entry(flush_list.next, struct request, queuelist); list_del_init(&rq->queuelist); blk_mq_insert_request(rq, 0); } blk_mq_run_hw_queues(q, false); } void blk_mq_kick_requeue_list(struct request_queue *q) { kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0); } EXPORT_SYMBOL(blk_mq_kick_requeue_list); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs) { kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, msecs_to_jiffies(msecs)); } EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); static bool blk_is_flush_data_rq(struct request *rq) { return (rq->rq_flags & RQF_FLUSH_SEQ) && !is_flush_rq(rq); } static bool blk_mq_rq_inflight(struct request *rq, void *priv) { /* * If we find a request that isn't idle we know the queue is busy * as it's checked in the iter. * Return false to stop the iteration. 
* * In case of queue quiesce, if one flush data request is completed, * don't count it as inflight given the flush sequence is suspended, * and the original flush data request is invisible to driver, just * like other pending requests because of quiesce */ if (blk_mq_request_started(rq) && !(blk_queue_quiesced(rq->q) && blk_is_flush_data_rq(rq) && blk_mq_request_completed(rq))) { bool *busy = priv; *busy = true; return false; } return true; } bool blk_mq_queue_inflight(struct request_queue *q) { bool busy = false; blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy); return busy; } EXPORT_SYMBOL_GPL(blk_mq_queue_inflight); static void blk_mq_rq_timed_out(struct request *req) { req->rq_flags |= RQF_TIMED_OUT; if (req->q->mq_ops->timeout) { enum blk_eh_timer_return ret; ret = req->q->mq_ops->timeout(req); if (ret == BLK_EH_DONE) return; WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER); } blk_add_timer(req); } struct blk_expired_data { bool has_timedout_rq; unsigned long next; unsigned long timeout_start; }; static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expired) { unsigned long deadline; if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT) return false; if (rq->rq_flags & RQF_TIMED_OUT) return false; deadline = READ_ONCE(rq->deadline); if (time_after_eq(expired->timeout_start, deadline)) return true; if (expired->next == 0) expired->next = deadline; else if (time_after(expired->next, deadline)) expired->next = deadline; return false; } void blk_mq_put_rq_ref(struct request *rq) { if (is_flush_rq(rq)) { if (rq->end_io(rq, 0, NULL) == RQ_END_IO_FREE) blk_mq_free_request(rq); } else if (req_ref_put_and_test(rq)) { __blk_mq_free_request(rq); } } static bool blk_mq_check_expired(struct request *rq, void *priv) { struct blk_expired_data *expired = priv; /* * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot * be reallocated underneath the timeout handler's processing, then * the expire check is reliable. If the request is not expired, then * it was completed and reallocated as a new request after returning * from blk_mq_check_expired(). */ if (blk_mq_req_expired(rq, expired)) { expired->has_timedout_rq = true; return false; } return true; } static bool blk_mq_handle_expired(struct request *rq, void *priv) { struct blk_expired_data *expired = priv; if (blk_mq_req_expired(rq, expired)) blk_mq_rq_timed_out(rq); return true; } static void blk_mq_timeout_work(struct work_struct *work) { struct request_queue *q = container_of(work, struct request_queue, timeout_work); struct blk_expired_data expired = { .timeout_start = jiffies, }; struct blk_mq_hw_ctx *hctx; unsigned long i; /* A deadlock might occur if a request is stuck requiring a * timeout at the same time a queue freeze is waiting * completion, since the timeout code would not be able to * acquire the queue reference here. * * That's why we don't use blk_queue_enter here; instead, we use * percpu_ref_tryget directly, because we need to be able to * obtain a reference even in the short window between the queue * starting to freeze, by dropping the first reference in * blk_freeze_queue_start, and the moment the last request is * consumed, marked by the instant q_usage_counter reaches * zero. */ if (!percpu_ref_tryget(&q->q_usage_counter)) return; /* check if there is any timed-out request */ blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &expired); if (expired.has_timedout_rq) { /* * Before walking tags, we must ensure any submit started * before the current time has finished. 
Since the submit * uses srcu or rcu, wait for a synchronization point to * ensure all running submits have finished */ blk_mq_wait_quiesce_done(q->tag_set); expired.next = 0; blk_mq_queue_tag_busy_iter(q, blk_mq_handle_expired, &expired); } if (expired.next != 0) { mod_timer(&q->timeout, expired.next); } else { /* * Request timeouts are handled as a forward rolling timer. If * we end up here it means that no requests are pending and * also that no request has been pending for a while. Mark * each hctx as idle. */ queue_for_each_hw_ctx(q, hctx, i) { /* the hctx may be unmapped, so check it here */ if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_idle(hctx); } } blk_queue_exit(q); } struct flush_busy_ctx_data { struct blk_mq_hw_ctx *hctx; struct list_head *list; }; static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) { struct flush_busy_ctx_data *flush_data = data; struct blk_mq_hw_ctx *hctx = flush_data->hctx; struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; enum hctx_type type = hctx->type; spin_lock(&ctx->lock); list_splice_tail_init(&ctx->rq_lists[type], flush_data->list); sbitmap_clear_bit(sb, bitnr); spin_unlock(&ctx->lock); return true; } /* * Process software queues that have been marked busy, splicing them * to the for-dispatch */ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) { struct flush_busy_ctx_data data = { .hctx = hctx, .list = list, }; sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); } struct dispatch_rq_data { struct blk_mq_hw_ctx *hctx; struct request *rq; }; static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) { struct dispatch_rq_data *dispatch_data = data; struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; enum hctx_type type = hctx->type; spin_lock(&ctx->lock); if (!list_empty(&ctx->rq_lists[type])) { dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next); list_del_init(&dispatch_data->rq->queuelist); if (list_empty(&ctx->rq_lists[type])) sbitmap_clear_bit(sb, bitnr); } spin_unlock(&ctx->lock); return !dispatch_data->rq; } struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start) { unsigned off = start ? start->index_hw[hctx->type] : 0; struct dispatch_rq_data data = { .hctx = hctx, .rq = NULL, }; __sbitmap_for_each_set(&hctx->ctx_map, off, dispatch_rq_from_ctx, &data); return data.rq; } bool __blk_mq_alloc_driver_tag(struct request *rq) { struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; int tag; blk_mq_tag_busy(rq->mq_hctx); if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { bt = &rq->mq_hctx->tags->breserved_tags; tag_offset = 0; } else { if (!hctx_may_queue(rq->mq_hctx, bt)) return false; } tag = __sbitmap_queue_get(bt); if (tag == BLK_MQ_NO_TAG) return false; rq->tag = tag + tag_offset; blk_mq_inc_active_requests(rq->mq_hctx); return true; } static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, void *key) { struct blk_mq_hw_ctx *hctx; hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); spin_lock(&hctx->dispatch_wait_lock); if (!list_empty(&wait->entry)) { struct sbitmap_queue *sbq; list_del_init(&wait->entry); sbq = &hctx->tags->bitmap_tags; atomic_dec(&sbq->ws_active); } spin_unlock(&hctx->dispatch_wait_lock); blk_mq_run_hw_queue(hctx, true); return 1; } /* * Mark us waiting for a tag. For shared tags, this involves hooking us into * the tag wakeups. 
For non-shared tags, we can simply mark us needing a * restart. For both cases, take care to check the condition again after * marking us as waiting. */ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, struct request *rq) { struct sbitmap_queue *sbq; struct wait_queue_head *wq; wait_queue_entry_t *wait; bool ret; if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) && !(blk_mq_is_shared_tags(hctx->flags))) { blk_mq_sched_mark_restart_hctx(hctx); /* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait * queue. * * Don't clear RESTART here, someone else could have set it. * At most this will cost an extra queue run. */ return blk_mq_get_driver_tag(rq); } wait = &hctx->dispatch_wait; if (!list_empty_careful(&wait->entry)) return false; if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) sbq = &hctx->tags->breserved_tags; else sbq = &hctx->tags->bitmap_tags; wq = &bt_wait_ptr(sbq, hctx)->wait; spin_lock_irq(&wq->lock); spin_lock(&hctx->dispatch_wait_lock); if (!list_empty(&wait->entry)) { spin_unlock(&hctx->dispatch_wait_lock); spin_unlock_irq(&wq->lock); return false; } atomic_inc(&sbq->ws_active); wait->flags &= ~WQ_FLAG_EXCLUSIVE; __add_wait_queue(wq, wait); /* * Add one explicit barrier since blk_mq_get_driver_tag() may * not imply barrier in case of failure. * * Order adding us to wait queue and allocating driver tag. * * The pair is the one implied in sbitmap_queue_wake_up() which * orders clearing sbitmap tag bits and waitqueue_active() in * __sbitmap_queue_wake_up(), since waitqueue_active() is lockless * * Otherwise, re-order of adding wait queue and getting driver tag * may cause __sbitmap_queue_wake_up() to wake up nothing because * the waitqueue_active() may not observe us in wait queue. */ smp_mb(); /* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait * queue. */ ret = blk_mq_get_driver_tag(rq); if (!ret) { spin_unlock(&hctx->dispatch_wait_lock); spin_unlock_irq(&wq->lock); return false; } /* * We got a tag, remove ourselves from the wait queue to ensure * someone else gets the wakeup. 
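 * This also drops the ws_active reference taken when the wait was armed.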
*/ list_del_init(&wait->entry); atomic_dec(&sbq->ws_active); spin_unlock(&hctx->dispatch_wait_lock); spin_unlock_irq(&wq->lock); return true; } #define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8 #define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4 /* * Update dispatch busy with the Exponential Weighted Moving Average(EWMA): * - EWMA is one simple way to compute running average value * - weight(7/8 and 1/8) is applied so that it can decrease exponentially * - take 4 as factor for avoiding to get too small(0) result, and this * factor doesn't matter because EWMA decreases exponentially */ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy) { unsigned int ewma; ewma = hctx->dispatch_busy; if (!ewma && !busy) return; ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1; if (busy) ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR; ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT; hctx->dispatch_busy = ewma; } #define BLK_MQ_RESOURCE_DELAY 3 /* ms units */ static void blk_mq_handle_dev_resource(struct request *rq, struct list_head *list) { list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); } enum prep_dispatch { PREP_DISPATCH_OK, PREP_DISPATCH_NO_TAG, PREP_DISPATCH_NO_BUDGET, }; static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq, bool need_budget) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; int budget_token = -1; if (need_budget) { budget_token = blk_mq_get_dispatch_budget(rq->q); if (budget_token < 0) { blk_mq_put_driver_tag(rq); return PREP_DISPATCH_NO_BUDGET; } blk_mq_set_rq_budget_token(rq, budget_token); } if (!blk_mq_get_driver_tag(rq)) { /* * The initial allocation attempt failed, so we need to * rerun the hardware queue when a tag is freed. The * waitqueue takes care of that. If the queue is run * before we add this entry back on the dispatch list, * we'll re-run it below. */ if (!blk_mq_mark_tag_wait(hctx, rq)) { /* * All budgets not got from this function will be put * together during handling partial dispatch */ if (need_budget) blk_mq_put_dispatch_budget(rq->q, budget_token); return PREP_DISPATCH_NO_TAG; } } return PREP_DISPATCH_OK; } /* release all allocated budgets before calling to blk_mq_dispatch_rq_list */ static void blk_mq_release_budgets(struct request_queue *q, struct list_head *list) { struct request *rq; list_for_each_entry(rq, list, queuelist) { int budget_token = blk_mq_get_rq_budget_token(rq); if (budget_token >= 0) blk_mq_put_dispatch_budget(q, budget_token); } } /* * blk_mq_commit_rqs will notify driver using bd->last that there is no * more requests. (See comment in struct blk_mq_ops for commit_rqs for * details) * Attention, we should explicitly call this in unusual cases: * 1) did not queue everything initially scheduled to queue * 2) the last attempt to queue a request failed */ static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued, bool from_schedule) { if (hctx->queue->mq_ops->commit_rqs && queued) { trace_block_unplug(hctx->queue, queued, !from_schedule); hctx->queue->mq_ops->commit_rqs(hctx); } } /* * Returns true if we did some work AND can potentially do more. */ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, bool get_budget) { enum prep_dispatch prep; struct request_queue *q = hctx->queue; struct request *rq; int queued; blk_status_t ret = BLK_STS_OK; bool needs_resource = false; if (list_empty(list)) return false; /* * Now process all the entries, sending them to the driver. 
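 *
 * The loop stops early if the driver returns BLK_STS_RESOURCE or
 * BLK_STS_DEV_RESOURCE; whatever is left on the list is put back on
 * hctx->dispatch further down.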
*/ queued = 0; do { struct blk_mq_queue_data bd; rq = list_first_entry(list, struct request, queuelist); WARN_ON_ONCE(hctx != rq->mq_hctx); prep = blk_mq_prep_dispatch_rq(rq, get_budget); if (prep != PREP_DISPATCH_OK) break; list_del_init(&rq->queuelist); bd.rq = rq; bd.last = list_empty(list); ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { case BLK_STS_OK: queued++; break; case BLK_STS_RESOURCE: needs_resource = true; fallthrough; case BLK_STS_DEV_RESOURCE: blk_mq_handle_dev_resource(rq, list); goto out; default: blk_mq_end_request(rq, ret); } } while (!list_empty(list)); out: /* If we didn't flush the entire list, we could have told the driver * there was more coming, but that turned out to be a lie. */ if (!list_empty(list) || ret != BLK_STS_OK) blk_mq_commit_rqs(hctx, queued, false); /* * Any items that need requeuing? Stuff them into hctx->dispatch, * that is where we will continue on next queue run. */ if (!list_empty(list)) { bool needs_restart; /* For non-shared tags, the RESTART check will suffice */ bool no_tag = prep == PREP_DISPATCH_NO_TAG && ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) || blk_mq_is_shared_tags(hctx->flags)); /* * If the caller allocated budgets, free the budgets of the * requests that have not yet been passed to the block driver. */ if (!get_budget) blk_mq_release_budgets(q, list); spin_lock(&hctx->lock); list_splice_tail_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); /* * Order adding requests to hctx->dispatch and checking * SCHED_RESTART flag. The pair of this smp_mb() is the one * in blk_mq_sched_restart(). Avoid restart code path to * miss the new added requests to hctx->dispatch, meantime * SCHED_RESTART is observed here. */ smp_mb(); /* * If SCHED_RESTART was set by the caller of this function and * it is no longer set that means that it was cleared by another * thread and hence that a queue rerun is needed. * * If 'no_tag' is set, that means that we failed getting * a driver tag with an I/O scheduler attached. If our dispatch * waitqueue is no longer active, ensure that we run the queue * AFTER adding our entries back to the list. * * If no I/O scheduler has been configured it is possible that * the hardware queue got stopped and restarted before requests * were pushed back onto the dispatch list. Rerun the queue to * avoid starvation. Notes: * - blk_mq_run_hw_queue() checks whether or not a queue has * been stopped before rerunning a queue. * - Some but not all block drivers stop a queue before * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq * and dm-rq. * * If driver returns BLK_STS_RESOURCE and SCHED_RESTART * bit is set, run queue after a delay to avoid IO stalls * that could otherwise occur if the queue is idle. We'll do * similar if we couldn't get budget or couldn't lock a zone * and SCHED_RESTART is set. 
*/ needs_restart = blk_mq_sched_needs_restart(hctx); if (prep == PREP_DISPATCH_NO_BUDGET) needs_resource = true; if (!needs_restart || (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); else if (needs_resource) blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); blk_mq_update_dispatch_busy(hctx, true); return false; } blk_mq_update_dispatch_busy(hctx, false); return true; } static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx) { int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask); if (cpu >= nr_cpu_ids) cpu = cpumask_first(hctx->cpumask); return cpu; } /* * ->next_cpu is always calculated from hctx->cpumask, so simply use * it for speeding up the check */ static bool blk_mq_hctx_empty_cpumask(struct blk_mq_hw_ctx *hctx) { return hctx->next_cpu >= nr_cpu_ids; } /* * It'd be great if the workqueue API had a way to pass * in a mask and had some smarts for more clever placement. * For now we just round-robin here, switching for every * BLK_MQ_CPU_WORK_BATCH queued items. */ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) { bool tried = false; int next_cpu = hctx->next_cpu; /* Switch to unbound if no allowable CPUs in this hctx */ if (hctx->queue->nr_hw_queues == 1 || blk_mq_hctx_empty_cpumask(hctx)) return WORK_CPU_UNBOUND; if (--hctx->next_cpu_batch <= 0) { select_cpu: next_cpu = cpumask_next_and(next_cpu, hctx->cpumask, cpu_online_mask); if (next_cpu >= nr_cpu_ids) next_cpu = blk_mq_first_mapped_cpu(hctx); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } /* * Do unbound schedule if we can't find a online CPU for this hctx, * and it should only happen in the path of handling CPU DEAD. */ if (!cpu_online(next_cpu)) { if (!tried) { tried = true; goto select_cpu; } /* * Make sure to re-select CPU next time once after CPUs * in hctx->cpumask become online again. */ hctx->next_cpu = next_cpu; hctx->next_cpu_batch = 1; return WORK_CPU_UNBOUND; } hctx->next_cpu = next_cpu; return next_cpu; } /** * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously. * @hctx: Pointer to the hardware queue to run. * @msecs: Milliseconds of delay to wait before running the queue. * * Run a hardware queue asynchronously with a delay of @msecs. */ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) { if (unlikely(blk_mq_hctx_stopped(hctx))) return; kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, msecs_to_jiffies(msecs)); } EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); static inline bool blk_mq_hw_queue_need_run(struct blk_mq_hw_ctx *hctx) { bool need_run; /* * When queue is quiesced, we may be switching io scheduler, or * updating nr_hw_queues, or other things, and we can't run queue * any more, even blk_mq_hctx_has_pending() can't be called safely. * * And queue will be rerun in blk_mq_unquiesce_queue() if it is * quiesced. */ __blk_mq_run_dispatch_ops(hctx->queue, false, need_run = !blk_queue_quiesced(hctx->queue) && blk_mq_hctx_has_pending(hctx)); return need_run; } /** * blk_mq_run_hw_queue - Start to run a hardware queue. * @hctx: Pointer to the hardware queue to run. * @async: If we want to run the queue asynchronously. * * Check if the request queue is not in a quiesced state and if there are * pending requests to be sent. If this is true, run the queue to send requests * to hardware. */ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { bool need_run; /* * We can't run the queue inline with interrupts disabled. 
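 * A synchronous run from interrupt context would trip the WARN_ON_ONCE()
 * below; such callers must request an async run instead.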
*/ WARN_ON_ONCE(!async && in_interrupt()); might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING); need_run = blk_mq_hw_queue_need_run(hctx); if (!need_run) { unsigned long flags; /* * Synchronize with blk_mq_unquiesce_queue(), because we check * if hw queue is quiesced locklessly above, we need the use * ->queue_lock to make sure we see the up-to-date status to * not miss rerunning the hw queue. */ spin_lock_irqsave(&hctx->queue->queue_lock, flags); need_run = blk_mq_hw_queue_need_run(hctx); spin_unlock_irqrestore(&hctx->queue->queue_lock, flags); if (!need_run) return; } if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { blk_mq_delay_run_hw_queue(hctx, 0); return; } blk_mq_run_dispatch_ops(hctx->queue, blk_mq_sched_dispatch_requests(hctx)); } EXPORT_SYMBOL(blk_mq_run_hw_queue); /* * Return prefered queue to dispatch from (if any) for non-mq aware IO * scheduler. */ static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q) { struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); /* * If the IO scheduler does not respect hardware queues when * dispatching, we just don't bother with multiple HW queues and * dispatch from hctx for the current CPU since running multiple queues * just causes lock contention inside the scheduler and pointless cache * bouncing. */ struct blk_mq_hw_ctx *hctx = ctx->hctxs[HCTX_TYPE_DEFAULT]; if (!blk_mq_hctx_stopped(hctx)) return hctx; return NULL; } /** * blk_mq_run_hw_queues - Run all hardware queues in a request queue. * @q: Pointer to the request queue to run. * @async: If we want to run the queue asynchronously. */ void blk_mq_run_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx, *sq_hctx; unsigned long i; sq_hctx = NULL; if (blk_queue_sq_sched(q)) sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; /* * Dispatch from this hctx either if there's no hctx preferred * by IO scheduler or if it has requests that bypass the * scheduler. */ if (!sq_hctx || sq_hctx == hctx || !list_empty_careful(&hctx->dispatch)) blk_mq_run_hw_queue(hctx, async); } } EXPORT_SYMBOL(blk_mq_run_hw_queues); /** * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously. * @q: Pointer to the request queue to run. * @msecs: Milliseconds of delay to wait before running the queues. */ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) { struct blk_mq_hw_ctx *hctx, *sq_hctx; unsigned long i; sq_hctx = NULL; if (blk_queue_sq_sched(q)) sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; /* * If there is already a run_work pending, leave the * pending delay untouched. Otherwise, a hctx can stall * if another hctx is re-delaying the other's work * before the work executes. */ if (delayed_work_pending(&hctx->run_work)) continue; /* * Dispatch from this hctx either if there's no hctx preferred * by IO scheduler or if it has requests that bypass the * scheduler. */ if (!sq_hctx || sq_hctx == hctx || !list_empty_careful(&hctx->dispatch)) blk_mq_delay_run_hw_queue(hctx, msecs); } } EXPORT_SYMBOL(blk_mq_delay_run_hw_queues); /* * This function is often used for pausing .queue_rq() by driver when * there isn't enough resource or some conditions aren't satisfied, and * BLK_STS_RESOURCE is usually returned. * * We do not guarantee that dispatch can be drained or blocked * after blk_mq_stop_hw_queue() returns. Please use * blk_mq_quiesce_queue() for that requirement. 
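 *
 * Stopping only prevents new queue runs from being scheduled; a dispatch
 * that is already in progress may still invoke ->queue_rq().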
*/ void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) { cancel_delayed_work(&hctx->run_work); set_bit(BLK_MQ_S_STOPPED, &hctx->state); } EXPORT_SYMBOL(blk_mq_stop_hw_queue); /* * This function is often used for pausing .queue_rq() by driver when * there isn't enough resource or some conditions aren't satisfied, and * BLK_STS_RESOURCE is usually returned. * * We do not guarantee that dispatch can be drained or blocked * after blk_mq_stop_hw_queues() returns. Please use * blk_mq_quiesce_queue() for that requirement. */ void blk_mq_stop_hw_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_stop_hw_queue(hctx); } EXPORT_SYMBOL(blk_mq_stop_hw_queues); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) { clear_bit(BLK_MQ_S_STOPPED, &hctx->state); blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); } EXPORT_SYMBOL(blk_mq_start_hw_queue); void blk_mq_start_hw_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_start_hw_queue(hctx); } EXPORT_SYMBOL(blk_mq_start_hw_queues); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { if (!blk_mq_hctx_stopped(hctx)) return; clear_bit(BLK_MQ_S_STOPPED, &hctx->state); /* * Pairs with the smp_mb() in blk_mq_hctx_stopped() to order the * clearing of BLK_MQ_S_STOPPED above and the checking of dispatch * list in the subsequent routine. */ smp_mb__after_atomic(); blk_mq_run_hw_queue(hctx, async); } EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_start_stopped_hw_queue(hctx, async || (hctx->flags & BLK_MQ_F_BLOCKING)); } EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); static void blk_mq_run_work_fn(struct work_struct *work) { struct blk_mq_hw_ctx *hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); blk_mq_run_dispatch_ops(hctx->queue, blk_mq_sched_dispatch_requests(hctx)); } /** * blk_mq_request_bypass_insert - Insert a request at dispatch list. * @rq: Pointer to request to be inserted. * @flags: BLK_MQ_INSERT_* * * Should only be used carefully, when the caller knows we want to * bypass a potential IO scheduler on the target device. */ static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; spin_lock(&hctx->lock); if (flags & BLK_MQ_INSERT_AT_HEAD) list_add(&rq->queuelist, &hctx->dispatch); else list_add_tail(&rq->queuelist, &hctx->dispatch); spin_unlock(&hctx->lock); } static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list, bool run_queue_async) { struct request *rq; enum hctx_type type = hctx->type; /* * Try to issue requests directly if the hw queue isn't busy to save an * extra enqueue & dequeue to the sw queue. 
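 *
 * Requests that could not be issued directly stay on the list and fall
 * through to the software-queue insertion below.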
*/ if (!hctx->dispatch_busy && !run_queue_async) { blk_mq_run_dispatch_ops(hctx->queue, blk_mq_try_issue_list_directly(hctx, list)); if (list_empty(list)) goto out; } /* * preemption doesn't flush plug list, so it's possible ctx->cpu is * offline now */ list_for_each_entry(rq, list, queuelist) { BUG_ON(rq->mq_ctx != ctx); trace_block_rq_insert(rq); if (rq->cmd_flags & REQ_NOWAIT) run_queue_async = true; } spin_lock(&ctx->lock); list_splice_tail_init(list, &ctx->rq_lists[type]); blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); out: blk_mq_run_hw_queue(hctx, run_queue_async); } static void blk_mq_insert_request(struct request *rq, blk_insert_t flags) { struct request_queue *q = rq->q; struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; if (blk_rq_is_passthrough(rq)) { /* * Passthrough request have to be added to hctx->dispatch * directly. The device may be in a situation where it can't * handle FS request, and always returns BLK_STS_RESOURCE for * them, which gets them added to hctx->dispatch. * * If a passthrough request is required to unblock the queues, * and it is added to the scheduler queue, there is no chance to * dispatch it given we prioritize requests in hctx->dispatch. */ blk_mq_request_bypass_insert(rq, flags); } else if (req_op(rq) == REQ_OP_FLUSH) { /* * Firstly normal IO request is inserted to scheduler queue or * sw queue, meantime we add flush request to dispatch queue( * hctx->dispatch) directly and there is at most one in-flight * flush request for each hw queue, so it doesn't matter to add * flush request to tail or front of the dispatch queue. * * Secondly in case of NCQ, flush request belongs to non-NCQ * command, and queueing it will fail when there is any * in-flight normal IO request(NCQ command). When adding flush * rq to the front of hctx->dispatch, it is easier to introduce * extra time to flush rq's latency because of S_SCHED_RESTART * compared with adding to the tail of dispatch queue, then * chance of flush merge is increased, and less flush requests * will be issued to controller. It is observed that ~10% time * is saved in blktests block/004 on disk attached to AHCI/NCQ * drive when adding flush rq to the front of hctx->dispatch. * * Simply queue flush rq to the front of hctx->dispatch so that * intensive flush workloads can benefit in case of NCQ HW. */ blk_mq_request_bypass_insert(rq, BLK_MQ_INSERT_AT_HEAD); } else if (q->elevator) { LIST_HEAD(list); WARN_ON_ONCE(rq->tag != BLK_MQ_NO_TAG); list_add(&rq->queuelist, &list); q->elevator->type->ops.insert_requests(hctx, &list, flags); } else { trace_block_rq_insert(rq); spin_lock(&ctx->lock); if (flags & BLK_MQ_INSERT_AT_HEAD) list_add(&rq->queuelist, &ctx->rq_lists[hctx->type]); else list_add_tail(&rq->queuelist, &ctx->rq_lists[hctx->type]); blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); } } static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, unsigned int nr_segs) { int err; if (bio->bi_opf & REQ_RAHEAD) rq->cmd_flags |= REQ_FAILFAST_MASK; rq->bio = rq->biotail = bio; rq->__sector = bio->bi_iter.bi_sector; rq->__data_len = bio->bi_iter.bi_size; rq->phys_gap_bit = bio->bi_bvec_gap_bit; rq->nr_phys_segments = nr_segs; if (bio_integrity(bio)) rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q, bio); /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. 
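 * The WARN_ON_ONCE() on its return value below should therefore never fire.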
*/ err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO); WARN_ON_ONCE(err); blk_account_io_start(rq); } static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, bool last) { struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { .rq = rq, .last = last, }; blk_status_t ret; /* * For OK queue, we are done. For error, caller may kill it. * Any other error (busy), just add it to our list as we * previously would have done. */ ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { case BLK_STS_OK: blk_mq_update_dispatch_busy(hctx, false); break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: blk_mq_update_dispatch_busy(hctx, true); __blk_mq_requeue_request(rq); break; default: blk_mq_update_dispatch_busy(hctx, false); break; } return ret; } static bool blk_mq_get_budget_and_tag(struct request *rq) { int budget_token; budget_token = blk_mq_get_dispatch_budget(rq->q); if (budget_token < 0) return false; blk_mq_set_rq_budget_token(rq, budget_token); if (!blk_mq_get_driver_tag(rq)) { blk_mq_put_dispatch_budget(rq->q, budget_token); return false; } return true; } /** * blk_mq_try_issue_directly - Try to send a request directly to device driver. * @hctx: Pointer of the associated hardware queue. * @rq: Pointer to request to be sent. * * If the device has enough resources to accept a new request now, send the * request directly to device driver. Else, insert at hctx->dispatch queue, so * we can try send it another time in the future. Requests inserted at this * queue have higher priority. */ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq) { blk_status_t ret; if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { blk_mq_insert_request(rq, 0); blk_mq_run_hw_queue(hctx, false); return; } if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) { blk_mq_insert_request(rq, 0); blk_mq_run_hw_queue(hctx, rq->cmd_flags & REQ_NOWAIT); return; } ret = __blk_mq_issue_directly(hctx, rq, true); switch (ret) { case BLK_STS_OK: break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: blk_mq_request_bypass_insert(rq, 0); blk_mq_run_hw_queue(hctx, false); break; default: blk_mq_end_request(rq, ret); break; } } static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { blk_mq_insert_request(rq, 0); blk_mq_run_hw_queue(hctx, false); return BLK_STS_OK; } if (!blk_mq_get_budget_and_tag(rq)) return BLK_STS_RESOURCE; return __blk_mq_issue_directly(hctx, rq, last); } static void blk_mq_issue_direct(struct rq_list *rqs) { struct blk_mq_hw_ctx *hctx = NULL; struct request *rq; int queued = 0; blk_status_t ret = BLK_STS_OK; while ((rq = rq_list_pop(rqs))) { bool last = rq_list_empty(rqs); if (hctx != rq->mq_hctx) { if (hctx) { blk_mq_commit_rqs(hctx, queued, false); queued = 0; } hctx = rq->mq_hctx; } ret = blk_mq_request_issue_directly(rq, last); switch (ret) { case BLK_STS_OK: queued++; break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: blk_mq_request_bypass_insert(rq, 0); blk_mq_run_hw_queue(hctx, false); goto out; default: blk_mq_end_request(rq, ret); break; } } out: if (ret != BLK_STS_OK) blk_mq_commit_rqs(hctx, queued, false); } static void __blk_mq_flush_list(struct request_queue *q, struct rq_list *rqs) { if (blk_queue_quiesced(q)) return; q->mq_ops->queue_rqs(rqs); } static unsigned blk_mq_extract_queue_requests(struct rq_list *rqs, struct rq_list *queue_rqs) { struct request *rq = 
rq_list_pop(rqs); struct request_queue *this_q = rq->q; struct request **prev = &rqs->head; struct rq_list matched_rqs = {}; struct request *last = NULL; unsigned depth = 1; rq_list_add_tail(&matched_rqs, rq); while ((rq = *prev)) { if (rq->q == this_q) { /* move rq from rqs to matched_rqs */ *prev = rq->rq_next; rq_list_add_tail(&matched_rqs, rq); depth++; } else { /* leave rq in rqs */ prev = &rq->rq_next; last = rq; } } rqs->tail = last; *queue_rqs = matched_rqs; return depth; } static void blk_mq_dispatch_queue_requests(struct rq_list *rqs, unsigned depth) { struct request_queue *q = rq_list_peek(rqs)->q; trace_block_unplug(q, depth, true); /* * Peek first request and see if we have a ->queue_rqs() hook. * If we do, we can dispatch the whole list in one go. * We already know at this point that all requests belong to the * same queue, caller must ensure that's the case. */ if (q->mq_ops->queue_rqs) { blk_mq_run_dispatch_ops(q, __blk_mq_flush_list(q, rqs)); if (rq_list_empty(rqs)) return; } blk_mq_run_dispatch_ops(q, blk_mq_issue_direct(rqs)); } static void blk_mq_dispatch_list(struct rq_list *rqs, bool from_sched) { struct blk_mq_hw_ctx *this_hctx = NULL; struct blk_mq_ctx *this_ctx = NULL; struct rq_list requeue_list = {}; unsigned int depth = 0; bool is_passthrough = false; LIST_HEAD(list); do { struct request *rq = rq_list_pop(rqs); if (!this_hctx) { this_hctx = rq->mq_hctx; this_ctx = rq->mq_ctx; is_passthrough = blk_rq_is_passthrough(rq); } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx || is_passthrough != blk_rq_is_passthrough(rq)) { rq_list_add_tail(&requeue_list, rq); continue; } list_add_tail(&rq->queuelist, &list); depth++; } while (!rq_list_empty(rqs)); *rqs = requeue_list; trace_block_unplug(this_hctx->queue, depth, !from_sched); percpu_ref_get(&this_hctx->queue->q_usage_counter); /* passthrough requests should never be issued to the I/O scheduler */ if (is_passthrough) { spin_lock(&this_hctx->lock); list_splice_tail_init(&list, &this_hctx->dispatch); spin_unlock(&this_hctx->lock); blk_mq_run_hw_queue(this_hctx, from_sched); } else if (this_hctx->queue->elevator) { this_hctx->queue->elevator->type->ops.insert_requests(this_hctx, &list, 0); blk_mq_run_hw_queue(this_hctx, from_sched); } else { blk_mq_insert_requests(this_hctx, this_ctx, &list, from_sched); } percpu_ref_put(&this_hctx->queue->q_usage_counter); } static void blk_mq_dispatch_multiple_queue_requests(struct rq_list *rqs) { do { struct rq_list queue_rqs; unsigned depth; depth = blk_mq_extract_queue_requests(rqs, &queue_rqs); blk_mq_dispatch_queue_requests(&queue_rqs, depth); while (!rq_list_empty(&queue_rqs)) blk_mq_dispatch_list(&queue_rqs, false); } while (!rq_list_empty(rqs)); } void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { unsigned int depth; /* * We may have been called recursively midway through handling * plug->mq_list via a schedule() in the driver's queue_rq() callback. * To avoid mq_list changing under our feet, clear rq_count early and * bail out specifically if rq_count is 0 rather than checking * whether the mq_list is empty. 
*/ if (plug->rq_count == 0) return; depth = plug->rq_count; plug->rq_count = 0; if (!plug->has_elevator && !from_schedule) { if (plug->multiple_queues) { blk_mq_dispatch_multiple_queue_requests(&plug->mq_list); return; } blk_mq_dispatch_queue_requests(&plug->mq_list, depth); if (rq_list_empty(&plug->mq_list)) return; } do { blk_mq_dispatch_list(&plug->mq_list, from_schedule); } while (!rq_list_empty(&plug->mq_list)); } static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list) { int queued = 0; blk_status_t ret = BLK_STS_OK; while (!list_empty(list)) { struct request *rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); ret = blk_mq_request_issue_directly(rq, list_empty(list)); switch (ret) { case BLK_STS_OK: queued++; break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: blk_mq_request_bypass_insert(rq, 0); if (list_empty(list)) blk_mq_run_hw_queue(hctx, false); goto out; default: blk_mq_end_request(rq, ret); break; } } out: if (ret != BLK_STS_OK) blk_mq_commit_rqs(hctx, queued, false); } static bool blk_mq_attempt_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { if (!blk_queue_nomerges(q) && bio_mergeable(bio)) { if (blk_attempt_plug_merge(q, bio, nr_segs)) return true; if (blk_mq_sched_bio_merge(q, bio, nr_segs)) return true; } return false; } static struct request *blk_mq_get_new_requests(struct request_queue *q, struct blk_plug *plug, struct bio *bio) { struct blk_mq_alloc_data data = { .q = q, .flags = 0, .shallow_depth = 0, .cmd_flags = bio->bi_opf, .rq_flags = 0, .nr_tags = 1, .cached_rqs = NULL, .ctx = NULL, .hctx = NULL }; struct request *rq; rq_qos_throttle(q, bio); if (plug) { data.nr_tags = plug->nr_ios; plug->nr_ios = 1; data.cached_rqs = &plug->cached_rqs; } rq = __blk_mq_alloc_requests(&data); if (unlikely(!rq)) rq_qos_cleanup(q, bio); return rq; } /* * Check if there is a suitable cached request and return it. */ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug, struct request_queue *q, blk_opf_t opf) { enum hctx_type type = blk_mq_get_hctx_type(opf); struct request *rq; if (!plug) return NULL; rq = rq_list_peek(&plug->cached_rqs); if (!rq || rq->q != q) return NULL; if (type != rq->mq_hctx->type && (type != HCTX_TYPE_READ || rq->mq_hctx->type != HCTX_TYPE_DEFAULT)) return NULL; if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) return NULL; return rq; } static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, struct bio *bio) { if (rq_list_pop(&plug->cached_rqs) != rq) WARN_ON_ONCE(1); /* * If any qos ->throttle() end up blocking, we will have flushed the * plug and hence killed the cached_rq list as well. Pop this entry * before we throttle. */ rq_qos_throttle(rq->q, bio); blk_mq_rq_time_init(rq, blk_time_get_ns()); rq->cmd_flags = bio->bi_opf; INIT_LIST_HEAD(&rq->queuelist); } static bool bio_unaligned(const struct bio *bio, struct request_queue *q) { unsigned int bs_mask = queue_logical_block_size(q) - 1; /* .bi_sector of any zero sized bio need to be initialized */ if ((bio->bi_iter.bi_size & bs_mask) || ((bio->bi_iter.bi_sector << SECTOR_SHIFT) & bs_mask)) return true; return false; } /** * blk_mq_submit_bio - Create and send a request to block device. * @bio: Bio pointer. * * Builds up a request structure from @q and @bio and send to the device. 
The * request may not be queued directly to hardware if: * * This request can be merged with another one * * We want to place request at plug queue for possible future merging * * There is an IO scheduler active at this queue * * It will not queue the request if there is an error with the bio, or at the * request creation. */ void blk_mq_submit_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct blk_plug *plug = current->plug; const int is_sync = op_is_sync(bio->bi_opf); unsigned int integrity_action; struct blk_mq_hw_ctx *hctx; unsigned int nr_segs; struct request *rq; blk_status_t ret; /* * If the plug has a cached request for this queue, try to use it. */ rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf); /* * A BIO that was released from a zone write plug has already been * through the preparation in this function, already holds a reference * on the queue usage counter, and is the only write BIO in-flight for * the target zone. Go straight to preparing a request for it. */ if (bio_zone_write_plugging(bio)) { nr_segs = bio->__bi_nr_segments; if (rq) blk_queue_exit(q); goto new_request; } /* * The cached request already holds a q_usage_counter reference and we * don't have to acquire a new one if we use it. */ if (!rq) { if (unlikely(bio_queue_enter(bio))) return; } /* * Device reconfiguration may change logical block size or reduce the * number of poll queues, so the checks for alignment and poll support * have to be done with queue usage counter held. */ if (unlikely(bio_unaligned(bio, q))) { bio_io_error(bio); goto queue_exit; } if ((bio->bi_opf & REQ_POLLED) && !blk_mq_can_poll(q)) { bio->bi_status = BLK_STS_NOTSUPP; bio_endio(bio); goto queue_exit; } bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); if (!bio) goto queue_exit; integrity_action = bio_integrity_action(bio); if (integrity_action) bio_integrity_prep(bio, integrity_action); blk_mq_bio_issue_init(q, bio); if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) goto queue_exit; if (bio_needs_zone_write_plugging(bio)) { if (blk_zone_plug_bio(bio, nr_segs)) goto queue_exit; } new_request: if (rq) { blk_mq_use_cached_rq(rq, plug, bio); } else { rq = blk_mq_get_new_requests(q, plug, bio); if (unlikely(!rq)) { if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); goto queue_exit; } } trace_block_getrq(bio); rq_qos_track(q, rq, bio); blk_mq_bio_to_request(rq, bio, nr_segs); ret = blk_crypto_rq_get_keyslot(rq); if (ret != BLK_STS_OK) { bio->bi_status = ret; bio_endio(bio); blk_mq_free_request(rq); return; } if (bio_zone_write_plugging(bio)) blk_zone_write_plug_init_request(rq); if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) return; if (plug) { blk_add_rq_to_plug(plug, rq); return; } hctx = rq->mq_hctx; if ((rq->rq_flags & RQF_USE_SCHED) || (hctx->dispatch_busy && (q->nr_hw_queues == 1 || !is_sync))) { blk_mq_insert_request(rq, 0); blk_mq_run_hw_queue(hctx, true); } else { blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq)); } return; queue_exit: /* * Don't drop the queue reference if we were trying to use a cached * request and thus didn't acquire one. 
*/ if (!rq) blk_queue_exit(q); } #ifdef CONFIG_BLK_MQ_STACKING /** * blk_insert_cloned_request - Helper for stacking drivers to submit a request * @rq: the request being queued */ blk_status_t blk_insert_cloned_request(struct request *rq) { struct request_queue *q = rq->q; unsigned int max_sectors = blk_queue_get_max_sectors(rq); unsigned int max_segments = blk_rq_get_max_segments(rq); blk_status_t ret; if (blk_rq_sectors(rq) > max_sectors) { /* * SCSI device does not have a good way to return if * Write Same/Zero is actually supported. If a device rejects * a non-read/write command (discard, write same,etc.) the * low-level device driver will set the relevant queue limit to * 0 to prevent blk-lib from issuing more of the offending * operations. Commands queued prior to the queue limit being * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O * errors being propagated to upper layers. */ if (max_sectors == 0) return BLK_STS_NOTSUPP; printk(KERN_ERR "%s: over max size limit. (%u > %u)\n", __func__, blk_rq_sectors(rq), max_sectors); return BLK_STS_IOERR; } /* * The queue settings related to segment counting may differ from the * original queue. */ rq->nr_phys_segments = blk_recalc_rq_segments(rq); if (rq->nr_phys_segments > max_segments) { printk(KERN_ERR "%s: over max segments limit. (%u > %u)\n", __func__, rq->nr_phys_segments, max_segments); return BLK_STS_IOERR; } if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; ret = blk_crypto_rq_get_keyslot(rq); if (ret != BLK_STS_OK) return ret; blk_account_io_start(rq); /* * Since we have a scheduler attached on the top device, * bypass a potential scheduler on the bottom device for * insert. */ blk_mq_run_dispatch_ops(q, ret = blk_mq_request_issue_directly(rq, true)); if (ret) blk_account_io_done(rq, blk_time_get_ns()); return ret; } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); /** * blk_rq_unprep_clone - Helper function to free all bios in a cloned request * @rq: the clone request to be cleaned up * * Description: * Free all bios in @rq for a cloned request. */ void blk_rq_unprep_clone(struct request *rq) { struct bio *bio; while ((bio = rq->bio) != NULL) { rq->bio = bio->bi_next; bio_put(bio); } } EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); /** * blk_rq_prep_clone - Helper function to setup clone request * @rq: the request to be setup * @rq_src: original request to be cloned * @bs: bio_set that bios for clone are allocated from * @gfp_mask: memory allocation mask for bio * @bio_ctr: setup function to be called for each clone bio. * Returns %0 for success, non %0 for failure. * @data: private data to be passed to @bio_ctr * * Description: * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. * Also, pages which the original bios are pointing to are not copied * and the cloned bios just point same pages. * So cloned bios must be completed before original bios, which means * the caller must complete @rq before @rq_src. 
*/ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, int (*bio_ctr)(struct bio *, struct bio *, void *), void *data) { struct bio *bio_src; if (!bs) bs = &fs_bio_set; __rq_for_each_bio(bio_src, rq_src) { struct bio *bio = bio_alloc_clone(rq->q->disk->part0, bio_src, gfp_mask, bs); if (!bio) goto free_and_out; if (bio_ctr && bio_ctr(bio, bio_src, data)) { bio_put(bio); goto free_and_out; } if (rq->bio) { rq->biotail->bi_next = bio; rq->biotail = bio; } else { rq->bio = rq->biotail = bio; } } /* Copy attributes of the original request to the clone request. */ rq->__sector = blk_rq_pos(rq_src); rq->__data_len = blk_rq_bytes(rq_src); if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) { rq->rq_flags |= RQF_SPECIAL_PAYLOAD; rq->special_vec = rq_src->special_vec; } rq->nr_phys_segments = rq_src->nr_phys_segments; rq->nr_integrity_segments = rq_src->nr_integrity_segments; rq->phys_gap_bit = rq_src->phys_gap_bit; if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) goto free_and_out; return 0; free_and_out: blk_rq_unprep_clone(rq); return -ENOMEM; } EXPORT_SYMBOL_GPL(blk_rq_prep_clone); #endif /* CONFIG_BLK_MQ_STACKING */ /* * Steal bios from a request and add them to a bio list. * The request must not have been partially completed before. */ void blk_steal_bios(struct bio_list *list, struct request *rq) { struct bio *bio; for (bio = rq->bio; bio; bio = bio->bi_next) { if (bio->bi_opf & REQ_POLLED) { bio->bi_opf &= ~REQ_POLLED; bio->bi_cookie = BLK_QC_T_NONE; } /* * The alternate request queue that we may end up submitting * the bio to may be frozen temporarily, in this case REQ_NOWAIT * will fail the I/O immediately with EAGAIN to the issuer. * We are not in the issuer context which cannot block. Clear * the flag to avoid spurious EAGAIN I/O failures. */ bio->bi_opf &= ~REQ_NOWAIT; bio_clear_flag(bio, BIO_QOS_THROTTLED); bio_clear_flag(bio, BIO_QOS_MERGED); } if (rq->bio) { if (list->tail) list->tail->bi_next = rq->bio; else list->head = rq->bio; list->tail = rq->biotail; rq->bio = NULL; rq->biotail = NULL; } rq->__data_len = 0; } EXPORT_SYMBOL_GPL(blk_steal_bios); static size_t order_to_size(unsigned int order) { return (size_t)PAGE_SIZE << order; } /* called before freeing request pool in @tags */ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, struct blk_mq_tags *tags) { struct page *page; /* * There is no need to clear mapping if driver tags is not initialized * or the mapping belongs to the driver tags. 
*/ if (!drv_tags || drv_tags == tags) return; list_for_each_entry(page, &tags->page_list, lru) { unsigned long start = (unsigned long)page_address(page); unsigned long end = start + order_to_size(page->private); int i; for (i = 0; i < drv_tags->nr_tags; i++) { struct request *rq = drv_tags->rqs[i]; unsigned long rq_addr = (unsigned long)rq; if (rq_addr >= start && rq_addr < end) { WARN_ON_ONCE(req_ref_read(rq) != 0); cmpxchg(&drv_tags->rqs[i], rq, NULL); } } } } void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { struct blk_mq_tags *drv_tags; if (list_empty(&tags->page_list)) return; if (blk_mq_is_shared_tags(set->flags)) drv_tags = set->shared_tags; else drv_tags = set->tags[hctx_idx]; if (tags->static_rqs && set->ops->exit_request) { int i; for (i = 0; i < tags->nr_tags; i++) { struct request *rq = tags->static_rqs[i]; if (!rq) continue; set->ops->exit_request(set, rq, hctx_idx); tags->static_rqs[i] = NULL; } } blk_mq_clear_rq_mapping(drv_tags, tags); /* * Free request pages in SRCU callback, which is called from * blk_mq_free_tags(). */ } void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags) { kfree(tags->rqs); tags->rqs = NULL; kfree(tags->static_rqs); tags->static_rqs = NULL; blk_mq_free_tags(set, tags); } static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set, unsigned int hctx_idx) { int i; for (i = 0; i < set->nr_maps; i++) { unsigned int start = set->map[i].queue_offset; unsigned int end = start + set->map[i].nr_queues; if (hctx_idx >= start && hctx_idx < end) break; } if (i >= set->nr_maps) i = HCTX_TYPE_DEFAULT; return i; } static int blk_mq_get_hctx_node(struct blk_mq_tag_set *set, unsigned int hctx_idx) { enum hctx_type type = hctx_idx_to_type(set, hctx_idx); return blk_mq_hw_queue_to_node(&set->map[type], hctx_idx); } static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int nr_tags, unsigned int reserved_tags) { int node = blk_mq_get_hctx_node(set, hctx_idx); struct blk_mq_tags *tags; if (node == NUMA_NO_NODE) node = set->numa_node; tags = blk_mq_init_tags(nr_tags, reserved_tags, set->flags, node); if (!tags) return NULL; tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); if (!tags->rqs) goto err_free_tags; tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); if (!tags->static_rqs) goto err_free_rqs; return tags; err_free_rqs: kfree(tags->rqs); err_free_tags: blk_mq_free_tags(set, tags); return NULL; } static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx, int node) { int ret; if (set->ops->init_request) { ret = set->ops->init_request(set, rq, hctx_idx, node); if (ret) return ret; } WRITE_ONCE(rq->state, MQ_RQ_IDLE); return 0; } static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx, unsigned int depth) { unsigned int i, j, entries_per_page, max_order = 4; int node = blk_mq_get_hctx_node(set, hctx_idx); size_t rq_size, left; if (node == NUMA_NO_NODE) node = set->numa_node; /* * rq_size is the size of the request plus driver payload, rounded * to the cacheline size */ rq_size = round_up(sizeof(struct request) + set->cmd_size, cache_line_size()); left = rq_size * depth; for (i = 0; i < depth; ) { int this_order = max_order; struct page *page; int to_do; void *p; while (this_order && left < order_to_size(this_order - 1)) 
this_order--; do { page = alloc_pages_node(node, GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, this_order); if (page) break; if (!this_order--) break; if (order_to_size(this_order) < rq_size) break; } while (1); if (!page) goto fail; page->private = this_order; list_add_tail(&page->lru, &tags->page_list); p = page_address(page); /* * Allow kmemleak to scan these pages as they contain pointers * to additional allocations like via ops->init_request(). */ kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO); entries_per_page = order_to_size(this_order) / rq_size; to_do = min(entries_per_page, depth - i); left -= to_do * rq_size; for (j = 0; j < to_do; j++) { struct request *rq = p; tags->static_rqs[i] = rq; if (blk_mq_init_request(set, rq, hctx_idx, node)) { tags->static_rqs[i] = NULL; goto fail; } p += rq_size; i++; } } return 0; fail: blk_mq_free_rqs(set, tags, hctx_idx); return -ENOMEM; } struct rq_iter_data { struct blk_mq_hw_ctx *hctx; bool has_rq; }; static bool blk_mq_has_request(struct request *rq, void *data) { struct rq_iter_data *iter_data = data; if (rq->mq_hctx != iter_data->hctx) return true; iter_data->has_rq = true; return false; } static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) { struct blk_mq_tags *tags = hctx->sched_tags ? hctx->sched_tags : hctx->tags; struct rq_iter_data data = { .hctx = hctx, }; int srcu_idx; srcu_idx = srcu_read_lock(&hctx->queue->tag_set->tags_srcu); blk_mq_all_tag_iter(tags, blk_mq_has_request, &data); srcu_read_unlock(&hctx->queue->tag_set->tags_srcu, srcu_idx); return data.has_rq; } static bool blk_mq_hctx_has_online_cpu(struct blk_mq_hw_ctx *hctx, unsigned int this_cpu) { enum hctx_type type = hctx->type; int cpu; /* * hctx->cpumask has to rule out isolated CPUs, but userspace still * might submit IOs on these isolated CPUs, so use the queue map to * check if all CPUs mapped to this hctx are offline */ for_each_online_cpu(cpu) { struct blk_mq_hw_ctx *h = blk_mq_map_queue_type(hctx->queue, type, cpu); if (h != hctx) continue; /* this hctx has at least one online CPU */ if (this_cpu != cpu) return true; } return false; } static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) { struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_online); int ret = 0; if (!hctx->nr_ctx || blk_mq_hctx_has_online_cpu(hctx, cpu)) return 0; /* * Prevent new request from being allocated on the current hctx. * * The smp_mb__after_atomic() Pairs with the implied barrier in * test_and_set_bit_lock in sbitmap_get(). Ensures the inactive flag is * seen once we return from the tag allocator. */ set_bit(BLK_MQ_S_INACTIVE, &hctx->state); smp_mb__after_atomic(); /* * Try to grab a reference to the queue and wait for any outstanding * requests. If we could not grab a reference the queue has been * frozen and there are no requests. */ if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) { while (blk_mq_hctx_has_requests(hctx)) { /* * The wakeup capable IRQ handler of block device is * not called during suspend. Skip the loop by checking * pm_wakeup_pending to prevent the deadlock and improve * suspend latency. */ if (pm_wakeup_pending()) { clear_bit(BLK_MQ_S_INACTIVE, &hctx->state); ret = -EBUSY; break; } msleep(5); } percpu_ref_put(&hctx->queue->q_usage_counter); } return ret; } /* * Check if one CPU is mapped to the specified hctx * * Isolated CPUs have been ruled out from hctx->cpumask, which is supposed * to be used for scheduling kworker only. 
For other usage, please call this * helper for checking if one CPU belongs to the specified hctx */ static bool blk_mq_cpu_mapped_to_hctx(unsigned int cpu, const struct blk_mq_hw_ctx *hctx) { struct blk_mq_hw_ctx *mapped_hctx = blk_mq_map_queue_type(hctx->queue, hctx->type, cpu); return mapped_hctx == hctx; } static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node) { struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_online); if (blk_mq_cpu_mapped_to_hctx(cpu, hctx)) clear_bit(BLK_MQ_S_INACTIVE, &hctx->state); return 0; } /* * 'cpu' is going away. splice any existing rq_list entries from this * software queue to the hw queue dispatch list, and ensure that it * gets run. */ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) { struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; LIST_HEAD(tmp); enum hctx_type type; hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); if (!blk_mq_cpu_mapped_to_hctx(cpu, hctx)) return 0; ctx = __blk_mq_get_ctx(hctx->queue, cpu); type = hctx->type; spin_lock(&ctx->lock); if (!list_empty(&ctx->rq_lists[type])) { list_splice_init(&ctx->rq_lists[type], &tmp); blk_mq_hctx_clear_pending(hctx, ctx); } spin_unlock(&ctx->lock); if (list_empty(&tmp)) return 0; spin_lock(&hctx->lock); list_splice_tail_init(&tmp, &hctx->dispatch); spin_unlock(&hctx->lock); blk_mq_run_hw_queue(hctx, true); return 0; } static void __blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) { lockdep_assert_held(&blk_mq_cpuhp_lock); if (!(hctx->flags & BLK_MQ_F_STACKING) && !hlist_unhashed(&hctx->cpuhp_online)) { cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, &hctx->cpuhp_online); INIT_HLIST_NODE(&hctx->cpuhp_online); } if (!hlist_unhashed(&hctx->cpuhp_dead)) { cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); INIT_HLIST_NODE(&hctx->cpuhp_dead); } } static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) { mutex_lock(&blk_mq_cpuhp_lock); __blk_mq_remove_cpuhp(hctx); mutex_unlock(&blk_mq_cpuhp_lock); } static void __blk_mq_add_cpuhp(struct blk_mq_hw_ctx *hctx) { lockdep_assert_held(&blk_mq_cpuhp_lock); if (!(hctx->flags & BLK_MQ_F_STACKING) && hlist_unhashed(&hctx->cpuhp_online)) cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, &hctx->cpuhp_online); if (hlist_unhashed(&hctx->cpuhp_dead)) cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); } static void __blk_mq_remove_cpuhp_list(struct list_head *head) { struct blk_mq_hw_ctx *hctx; lockdep_assert_held(&blk_mq_cpuhp_lock); list_for_each_entry(hctx, head, hctx_list) __blk_mq_remove_cpuhp(hctx); } /* * Unregister cpuhp callbacks from exited hw queues * * Safe to call if this `request_queue` is live */ static void blk_mq_remove_hw_queues_cpuhp(struct request_queue *q) { LIST_HEAD(hctx_list); spin_lock(&q->unused_hctx_lock); list_splice_init(&q->unused_hctx_list, &hctx_list); spin_unlock(&q->unused_hctx_lock); mutex_lock(&blk_mq_cpuhp_lock); __blk_mq_remove_cpuhp_list(&hctx_list); mutex_unlock(&blk_mq_cpuhp_lock); spin_lock(&q->unused_hctx_lock); list_splice(&hctx_list, &q->unused_hctx_list); spin_unlock(&q->unused_hctx_lock); } /* * Register cpuhp callbacks from all hw queues * * Safe to call if this `request_queue` is live */ static void blk_mq_add_hw_queues_cpuhp(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; mutex_lock(&blk_mq_cpuhp_lock); queue_for_each_hw_ctx(q, hctx, i) __blk_mq_add_cpuhp(hctx); mutex_unlock(&blk_mq_cpuhp_lock); } /* * Before freeing hw queue, clearing 
the flush request reference in * tags->rqs[] for avoiding potential UAF. */ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, unsigned int queue_depth, struct request *flush_rq) { int i; /* The hw queue may not be mapped yet */ if (!tags) return; WARN_ON_ONCE(req_ref_read(flush_rq) != 0); for (i = 0; i < queue_depth; i++) cmpxchg(&tags->rqs[i], flush_rq, NULL); } static void blk_free_flush_queue_callback(struct rcu_head *head) { struct blk_flush_queue *fq = container_of(head, struct blk_flush_queue, rcu_head); blk_free_flush_queue(fq); } /* hctx->ctxs will be freed in queue's release handler */ static void blk_mq_exit_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { struct request *flush_rq = hctx->fq->flush_rq; if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_idle(hctx); if (blk_queue_init_done(q)) blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx], set->queue_depth, flush_rq); if (set->ops->exit_request) set->ops->exit_request(set, flush_rq, hctx_idx); if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); call_srcu(&set->tags_srcu, &hctx->fq->rcu_head, blk_free_flush_queue_callback); hctx->fq = NULL; spin_lock(&q->unused_hctx_lock); list_add(&hctx->hctx_list, &q->unused_hctx_list); spin_unlock(&q->unused_hctx_lock); } static void blk_mq_exit_hw_queues(struct request_queue *q, struct blk_mq_tag_set *set, int nr_queue) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (i == nr_queue) break; blk_mq_remove_cpuhp(hctx); blk_mq_exit_hctx(q, set, hctx, i); } } static int blk_mq_init_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) { gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp); if (!hctx->fq) goto fail; hctx->queue_num = hctx_idx; hctx->tags = set->tags[hctx_idx]; if (set->ops->init_hctx && set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) goto fail_free_fq; if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, hctx->numa_node)) goto exit_hctx; return 0; exit_hctx: if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); fail_free_fq: blk_free_flush_queue(hctx->fq); hctx->fq = NULL; fail: return -1; } static struct blk_mq_hw_ctx * blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, int node) { struct blk_mq_hw_ctx *hctx; gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node); if (!hctx) goto fail_alloc_hctx; if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node)) goto free_hctx; atomic_set(&hctx->nr_active, 0); if (node == NUMA_NO_NODE) node = set->numa_node; hctx->numa_node = node; INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); INIT_HLIST_NODE(&hctx->cpuhp_dead); INIT_HLIST_NODE(&hctx->cpuhp_online); hctx->queue = q; hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED; INIT_LIST_HEAD(&hctx->hctx_list); /* * Allocate space for all possible cpus to avoid allocation at * runtime */ hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *), gfp, node); if (!hctx->ctxs) goto free_cpumask; if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), gfp, node, false, false)) goto free_ctxs; hctx->nr_ctx = 0; spin_lock_init(&hctx->dispatch_wait_lock); init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); INIT_LIST_HEAD(&hctx->dispatch_wait.entry); 
blk_mq_hctx_kobj_init(hctx); return hctx; free_ctxs: kfree(hctx->ctxs); free_cpumask: free_cpumask_var(hctx->cpumask); free_hctx: kfree(hctx); fail_alloc_hctx: return NULL; } static void blk_mq_init_cpu_queues(struct request_queue *q, unsigned int nr_hw_queues) { struct blk_mq_tag_set *set = q->tag_set; unsigned int i, j; for_each_possible_cpu(i) { struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); struct blk_mq_hw_ctx *hctx; int k; __ctx->cpu = i; spin_lock_init(&__ctx->lock); for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++) INIT_LIST_HEAD(&__ctx->rq_lists[k]); __ctx->queue = q; /* * Set local node, IFF we have more than one hw queue. If * not, we remain on the home node of the device */ for (j = 0; j < set->nr_maps; j++) { hctx = blk_mq_map_queue_type(q, j, i); if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) hctx->numa_node = cpu_to_node(i); } } } struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int depth) { struct blk_mq_tags *tags; int ret; tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags); if (!tags) return NULL; ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth); if (ret) { blk_mq_free_rq_map(set, tags); return NULL; } return tags; } static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, int hctx_idx) { if (blk_mq_is_shared_tags(set->flags)) { set->tags[hctx_idx] = set->shared_tags; return true; } set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx, set->queue_depth); return set->tags[hctx_idx]; } void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { if (tags) { blk_mq_free_rqs(set, tags, hctx_idx); blk_mq_free_rq_map(set, tags); } } static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx) { if (!blk_mq_is_shared_tags(set->flags)) blk_mq_free_map_and_rqs(set, set->tags[hctx_idx], hctx_idx); set->tags[hctx_idx] = NULL; } static void blk_mq_map_swqueue(struct request_queue *q) { unsigned int j, hctx_idx; unsigned long i; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; queue_for_each_hw_ctx(q, hctx, i) { cpumask_clear(hctx->cpumask); hctx->nr_ctx = 0; hctx->dispatch_from = NULL; } /* * Map software to hardware queues. * * If the cpu isn't present, the cpu is mapped to first hctx. */ for_each_possible_cpu(i) { ctx = per_cpu_ptr(q->queue_ctx, i); for (j = 0; j < set->nr_maps; j++) { if (!set->map[j].nr_queues) { ctx->hctxs[j] = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT, i); continue; } hctx_idx = set->map[j].mq_map[i]; /* unmapped hw queue can be remapped after CPU topo changed */ if (!set->tags[hctx_idx] && !__blk_mq_alloc_map_and_rqs(set, hctx_idx)) { /* * If tags initialization fail for some hctx, * that hctx won't be brought online. In this * case, remap the current ctx to hctx[0] which * is guaranteed to always have tags allocated */ set->map[j].mq_map[i] = 0; } hctx = blk_mq_map_queue_type(q, j, i); ctx->hctxs[j] = hctx; /* * If the CPU is already set in the mask, then we've * mapped this one already. This can happen if * devices share queues across queue maps. */ if (cpumask_test_cpu(i, hctx->cpumask)) continue; cpumask_set_cpu(i, hctx->cpumask); hctx->type = j; ctx->index_hw[hctx->type] = hctx->nr_ctx; hctx->ctxs[hctx->nr_ctx++] = ctx; /* * If the nr_ctx type overflows, we have exceeded the * amount of sw queues we can support. 
*/ BUG_ON(!hctx->nr_ctx); } for (; j < HCTX_MAX_TYPES; j++) ctx->hctxs[j] = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT, i); } queue_for_each_hw_ctx(q, hctx, i) { int cpu; /* * If no software queues are mapped to this hardware queue, * disable it and free the request entries. */ if (!hctx->nr_ctx) { /* Never unmap queue 0. We need it as a * fallback in case of a new remap fails * allocation */ if (i) __blk_mq_free_map_and_rqs(set, i); hctx->tags = NULL; continue; } hctx->tags = set->tags[i]; WARN_ON(!hctx->tags); /* * Set the map size to the number of mapped software queues. * This is more accurate and more efficient than looping * over all possibly mapped software queues. */ sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx); /* * Rule out isolated CPUs from hctx->cpumask to avoid * running block kworker on isolated CPUs. * FIXME: cpuset should propagate further changes to isolated CPUs * here. */ rcu_read_lock(); for_each_cpu(cpu, hctx->cpumask) { if (cpu_is_isolated(cpu)) cpumask_clear_cpu(cpu, hctx->cpumask); } rcu_read_unlock(); /* * Initialize batch roundrobin counts */ hctx->next_cpu = blk_mq_first_mapped_cpu(hctx); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } } /* * Caller needs to ensure that we're either frozen/quiesced, or that * the queue isn't live yet. */ static void queue_set_hctx_shared(struct request_queue *q, bool shared) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (shared) { hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; } else { blk_mq_tag_idle(hctx); hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; } } } static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set, bool shared) { struct request_queue *q; unsigned int memflags; lockdep_assert_held(&set->tag_list_lock); list_for_each_entry(q, &set->tag_list, tag_set_list) { memflags = blk_mq_freeze_queue(q); queue_set_hctx_shared(q, shared); blk_mq_unfreeze_queue(q, memflags); } } static void blk_mq_del_queue_tag_set(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; mutex_lock(&set->tag_list_lock); list_del_rcu(&q->tag_set_list); if (list_is_singular(&set->tag_list)) { /* just transitioned to unshared */ set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; /* update existing queue */ blk_mq_update_tag_set_shared(set, false); } mutex_unlock(&set->tag_list_lock); } static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, struct request_queue *q) { mutex_lock(&set->tag_list_lock); /* * Check to see if we're transitioning to shared (from 1 to 2 queues). 
*/ if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; /* update existing queue */ blk_mq_update_tag_set_shared(set, true); } if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED) queue_set_hctx_shared(q, true); list_add_tail_rcu(&q->tag_set_list, &set->tag_list); mutex_unlock(&set->tag_list_lock); } /* All allocations will be freed in release handler of q->mq_kobj */ static int blk_mq_alloc_ctxs(struct request_queue *q) { struct blk_mq_ctxs *ctxs; int cpu; ctxs = kzalloc_obj(*ctxs); if (!ctxs) return -ENOMEM; ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx); if (!ctxs->queue_ctx) goto fail; for_each_possible_cpu(cpu) { struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu); ctx->ctxs = ctxs; } q->mq_kobj = &ctxs->kobj; q->queue_ctx = ctxs->queue_ctx; return 0; fail: kfree(ctxs); return -ENOMEM; } /* * It is the actual release handler for mq, but we do it from * request queue's release handler for avoiding use-after-free * and headache because q->mq_kobj shouldn't have been introduced, * but we can't group ctx/kctx kobj without it. */ void blk_mq_release(struct request_queue *q) { struct blk_mq_hw_ctx *hctx, *next; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list)); /* all hctx are in .unused_hctx_list now */ list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) { list_del_init(&hctx->hctx_list); kobject_put(&hctx->kobj); } kfree(q->queue_hw_ctx); /* * release .mq_kobj and sw queue's kobject now because * both share lifetime with request queue. */ blk_mq_sysfs_deinit(q); } struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, struct queue_limits *lim, void *queuedata) { struct queue_limits default_lim = { }; struct request_queue *q; int ret; if (!lim) lim = &default_lim; lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; if (set->nr_maps > HCTX_TYPE_POLL) lim->features |= BLK_FEAT_POLL; q = blk_alloc_queue(lim, set->numa_node); if (IS_ERR(q)) return q; q->queuedata = queuedata; ret = blk_mq_init_allocated_queue(set, q); if (ret) { blk_put_queue(q); return ERR_PTR(ret); } return q; } EXPORT_SYMBOL(blk_mq_alloc_queue); /** * blk_mq_destroy_queue - shutdown a request queue * @q: request queue to shutdown * * This shuts down a request queue allocated by blk_mq_alloc_queue(). All future * requests will be failed with -ENODEV. The caller is responsible for dropping * the reference from blk_mq_alloc_queue() by calling blk_put_queue(). 
* * Context: can sleep */ void blk_mq_destroy_queue(struct request_queue *q) { WARN_ON_ONCE(!queue_is_mq(q)); WARN_ON_ONCE(blk_queue_registered(q)); might_sleep(); blk_queue_flag_set(QUEUE_FLAG_DYING, q); blk_queue_start_drain(q); blk_mq_freeze_queue_wait(q); blk_sync_queue(q); blk_mq_cancel_work_sync(q); blk_mq_exit_queue(q); } EXPORT_SYMBOL(blk_mq_destroy_queue); struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, struct queue_limits *lim, void *queuedata, struct lock_class_key *lkclass) { struct request_queue *q; struct gendisk *disk; q = blk_mq_alloc_queue(set, lim, queuedata); if (IS_ERR(q)) return ERR_CAST(q); disk = __alloc_disk_node(q, set->numa_node, lkclass); if (!disk) { blk_mq_destroy_queue(q); blk_put_queue(q); return ERR_PTR(-ENOMEM); } set_bit(GD_OWNS_QUEUE, &disk->state); return disk; } EXPORT_SYMBOL(__blk_mq_alloc_disk); struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, struct lock_class_key *lkclass) { struct gendisk *disk; if (!blk_get_queue(q)) return NULL; disk = __alloc_disk_node(q, NUMA_NO_NODE, lkclass); if (!disk) blk_put_queue(q); return disk; } EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue); /* * Only hctx removed from cpuhp list can be reused */ static bool blk_mq_hctx_is_reusable(struct blk_mq_hw_ctx *hctx) { return hlist_unhashed(&hctx->cpuhp_online) && hlist_unhashed(&hctx->cpuhp_dead); } static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( struct blk_mq_tag_set *set, struct request_queue *q, int hctx_idx, int node) { struct blk_mq_hw_ctx *hctx = NULL, *tmp; /* reuse dead hctx first */ spin_lock(&q->unused_hctx_lock); list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) { if (tmp->numa_node == node && blk_mq_hctx_is_reusable(tmp)) { hctx = tmp; break; } } if (hctx) list_del_init(&hctx->hctx_list); spin_unlock(&q->unused_hctx_lock); if (!hctx) hctx = blk_mq_alloc_hctx(q, set, node); if (!hctx) goto fail; if (blk_mq_init_hctx(q, set, hctx, hctx_idx)) goto free_hctx; return hctx; free_hctx: kobject_put(&hctx->kobj); fail: return NULL; } static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct request_queue *q) { int i, j, end; struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; if (q->nr_hw_queues < set->nr_hw_queues) { struct blk_mq_hw_ctx **new_hctxs; new_hctxs = kcalloc_node(set->nr_hw_queues, sizeof(*new_hctxs), GFP_KERNEL, set->numa_node); if (!new_hctxs) return; if (hctxs) memcpy(new_hctxs, hctxs, q->nr_hw_queues * sizeof(*hctxs)); rcu_assign_pointer(q->queue_hw_ctx, new_hctxs); /* * Make sure reading the old queue_hw_ctx from other * context concurrently won't trigger uaf. */ kfree_rcu_mightsleep(hctxs); hctxs = new_hctxs; } for (i = 0; i < set->nr_hw_queues; i++) { int old_node; int node = blk_mq_get_hctx_node(set, i); struct blk_mq_hw_ctx *old_hctx = hctxs[i]; if (old_hctx) { old_node = old_hctx->numa_node; blk_mq_exit_hctx(q, set, old_hctx, i); } hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i, node); if (!hctxs[i]) { if (!old_hctx) break; pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n", node, old_node); hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i, old_node); WARN_ON_ONCE(!hctxs[i]); } } /* * Increasing nr_hw_queues fails. Free the newly allocated * hctxs and keep the previous q->nr_hw_queues. 
*/ if (i != set->nr_hw_queues) { j = q->nr_hw_queues; end = i; } else { j = i; end = q->nr_hw_queues; q->nr_hw_queues = set->nr_hw_queues; } for (; j < end; j++) { struct blk_mq_hw_ctx *hctx = hctxs[j]; if (hctx) { blk_mq_exit_hctx(q, set, hctx, j); hctxs[j] = NULL; } } } static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct request_queue *q) { __blk_mq_realloc_hw_ctxs(set, q); /* unregister cpuhp callbacks for exited hctxs */ blk_mq_remove_hw_queues_cpuhp(q); /* register cpuhp for new initialized hctxs */ blk_mq_add_hw_queues_cpuhp(q); } int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q) { /* mark the queue as mq asap */ q->mq_ops = set->ops; /* * ->tag_set has to be setup before initialize hctx, which cpuphp * handler needs it for checking queue mapping */ q->tag_set = set; if (blk_mq_alloc_ctxs(q)) goto err_exit; /* init q->mq_kobj and sw queues' kobjects */ blk_mq_sysfs_init(q); INIT_LIST_HEAD(&q->unused_hctx_list); spin_lock_init(&q->unused_hctx_lock); blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) goto err_hctxs; INIT_WORK(&q->timeout_work, blk_mq_timeout_work); blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); INIT_LIST_HEAD(&q->flush_list); INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); q->nr_requests = set->queue_depth; q->async_depth = set->queue_depth; blk_mq_init_cpu_queues(q, set->nr_hw_queues); blk_mq_map_swqueue(q); blk_mq_add_queue_tag_set(set, q); return 0; err_hctxs: blk_mq_release(q); err_exit: q->mq_ops = NULL; return -ENOMEM; } EXPORT_SYMBOL(blk_mq_init_allocated_queue); /* tags can _not_ be used after returning from blk_mq_exit_queue */ void blk_mq_exit_queue(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; /* Checks hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED. */ blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); /* May clear BLK_MQ_F_TAG_QUEUE_SHARED in hctx->flags. */ blk_mq_del_queue_tag_set(q); } static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) { int i; if (blk_mq_is_shared_tags(set->flags)) { set->shared_tags = blk_mq_alloc_map_and_rqs(set, BLK_MQ_NO_HCTX_IDX, set->queue_depth); if (!set->shared_tags) return -ENOMEM; } for (i = 0; i < set->nr_hw_queues; i++) { if (!__blk_mq_alloc_map_and_rqs(set, i)) goto out_unwind; cond_resched(); } return 0; out_unwind: while (--i >= 0) __blk_mq_free_map_and_rqs(set, i); if (blk_mq_is_shared_tags(set->flags)) { blk_mq_free_map_and_rqs(set, set->shared_tags, BLK_MQ_NO_HCTX_IDX); } return -ENOMEM; } /* * Allocate the request maps associated with this tag_set. Note that this * may reduce the depth asked for, if memory is tight. set->queue_depth * will be updated to reflect the allocated depth. 
*/ static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set) { unsigned int depth; int err; depth = set->queue_depth; do { err = __blk_mq_alloc_rq_maps(set); if (!err) break; set->queue_depth >>= 1; if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) { err = -ENOMEM; break; } } while (set->queue_depth); if (!set->queue_depth || err) { pr_err("blk-mq: failed to allocate request map\n"); return -ENOMEM; } if (depth != set->queue_depth) pr_info("blk-mq: reduced tag depth (%u -> %u)\n", depth, set->queue_depth); return 0; } static void blk_mq_update_queue_map(struct blk_mq_tag_set *set) { /* * blk_mq_map_queues() and multiple .map_queues() implementations * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the * number of hardware queues. */ if (set->nr_maps == 1) set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues; if (set->ops->map_queues) { int i; /* * transport .map_queues is usually done in the following * way: * * for (queue = 0; queue < set->nr_hw_queues; queue++) { * mask = get_cpu_mask(queue) * for_each_cpu(cpu, mask) * set->map[x].mq_map[cpu] = queue; * } * * When we need to remap, the table has to be cleared for * killing stale mapping since one CPU may not be mapped * to any hw queue. */ for (i = 0; i < set->nr_maps; i++) blk_mq_clear_mq_map(&set->map[i]); set->ops->map_queues(set); } else { BUG_ON(set->nr_maps > 1); blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); } } static struct blk_mq_tags **blk_mq_prealloc_tag_set_tags( struct blk_mq_tag_set *set, int new_nr_hw_queues) { struct blk_mq_tags **new_tags; int i; if (set->nr_hw_queues >= new_nr_hw_queues) return NULL; new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!new_tags) return ERR_PTR(-ENOMEM); if (set->tags) memcpy(new_tags, set->tags, set->nr_hw_queues * sizeof(*set->tags)); for (i = set->nr_hw_queues; i < new_nr_hw_queues; i++) { if (blk_mq_is_shared_tags(set->flags)) { new_tags[i] = set->shared_tags; } else { new_tags[i] = blk_mq_alloc_map_and_rqs(set, i, set->queue_depth); if (!new_tags[i]) goto out_unwind; } cond_resched(); } return new_tags; out_unwind: while (--i >= set->nr_hw_queues) { if (!blk_mq_is_shared_tags(set->flags)) blk_mq_free_map_and_rqs(set, new_tags[i], i); } kfree(new_tags); return ERR_PTR(-ENOMEM); } /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the * requested depth down, if it's too large. In that case, the set * value will be stored in set->queue_depth. */ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) { int i, ret; BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); if (!set->nr_hw_queues) return -EINVAL; if (!set->queue_depth) return -EINVAL; if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) return -EINVAL; if (!set->ops->queue_rq) return -EINVAL; if (!set->ops->get_budget ^ !set->ops->put_budget) return -EINVAL; if (set->queue_depth > BLK_MQ_MAX_DEPTH) { pr_info("blk-mq: reduced tag depth to %u\n", BLK_MQ_MAX_DEPTH); set->queue_depth = BLK_MQ_MAX_DEPTH; } if (!set->nr_maps) set->nr_maps = 1; else if (set->nr_maps > HCTX_MAX_TYPES) return -EINVAL; /* * If a crashdump is active, then we are potentially in a very * memory constrained environment. Limit us to 64 tags to prevent * using too much memory. 
*/ if (is_kdump_kernel()) set->queue_depth = min(64U, set->queue_depth); /* * There is no use for more h/w queues than cpus if we just have * a single map */ if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) set->nr_hw_queues = nr_cpu_ids; if (set->flags & BLK_MQ_F_BLOCKING) { set->srcu = kmalloc_obj(*set->srcu); if (!set->srcu) return -ENOMEM; ret = init_srcu_struct(set->srcu); if (ret) goto out_free_srcu; } ret = init_srcu_struct(&set->tags_srcu); if (ret) goto out_cleanup_srcu; init_rwsem(&set->update_nr_hwq_lock); ret = -ENOMEM; set->tags = kcalloc_node(set->nr_hw_queues, sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!set->tags) goto out_cleanup_tags_srcu; for (i = 0; i < set->nr_maps; i++) { set->map[i].mq_map = kcalloc_node(nr_cpu_ids, sizeof(set->map[i].mq_map[0]), GFP_KERNEL, set->numa_node); if (!set->map[i].mq_map) goto out_free_mq_map; set->map[i].nr_queues = set->nr_hw_queues; } blk_mq_update_queue_map(set); ret = blk_mq_alloc_set_map_and_rqs(set); if (ret) goto out_free_mq_map; mutex_init(&set->tag_list_lock); INIT_LIST_HEAD(&set->tag_list); return 0; out_free_mq_map: for (i = 0; i < set->nr_maps; i++) { kfree(set->map[i].mq_map); set->map[i].mq_map = NULL; } kfree(set->tags); set->tags = NULL; out_cleanup_tags_srcu: cleanup_srcu_struct(&set->tags_srcu); out_cleanup_srcu: if (set->flags & BLK_MQ_F_BLOCKING) cleanup_srcu_struct(set->srcu); out_free_srcu: if (set->flags & BLK_MQ_F_BLOCKING) kfree(set->srcu); return ret; } EXPORT_SYMBOL(blk_mq_alloc_tag_set); /* allocate and initialize a tagset for a simple single-queue device */ int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, const struct blk_mq_ops *ops, unsigned int queue_depth, unsigned int set_flags) { memset(set, 0, sizeof(*set)); set->ops = ops; set->nr_hw_queues = 1; set->nr_maps = 1; set->queue_depth = queue_depth; set->numa_node = NUMA_NO_NODE; set->flags = set_flags; return blk_mq_alloc_tag_set(set); } EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set); void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i, j; for (i = 0; i < set->nr_hw_queues; i++) __blk_mq_free_map_and_rqs(set, i); if (blk_mq_is_shared_tags(set->flags)) { blk_mq_free_map_and_rqs(set, set->shared_tags, BLK_MQ_NO_HCTX_IDX); } for (j = 0; j < set->nr_maps; j++) { kfree(set->map[j].mq_map); set->map[j].mq_map = NULL; } kfree(set->tags); set->tags = NULL; srcu_barrier(&set->tags_srcu); cleanup_srcu_struct(&set->tags_srcu); if (set->flags & BLK_MQ_F_BLOCKING) { cleanup_srcu_struct(set->srcu); kfree(set->srcu); } } EXPORT_SYMBOL(blk_mq_free_tag_set); struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q, struct elevator_tags *et, unsigned int nr) { struct blk_mq_tag_set *set = q->tag_set; struct elevator_tags *old_et = NULL; struct blk_mq_hw_ctx *hctx; unsigned long i; blk_mq_quiesce_queue(q); if (blk_mq_is_shared_tags(set->flags)) { /* * Shared tags, for sched tags, we allocate max initially hence * tags can't grow, see blk_mq_alloc_sched_tags(). */ if (q->elevator) blk_mq_tag_update_sched_shared_tags(q, nr); else blk_mq_tag_resize_shared_tags(set, nr); } else if (!q->elevator) { /* * Non-shared hardware tags, nr is already checked from * queue_requests_store() and tags can't grow. */ queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->tags) continue; sbitmap_queue_resize(&hctx->tags->bitmap_tags, nr - hctx->tags->nr_reserved_tags); } } else if (nr <= q->elevator->et->nr_requests) { /* Non-shared sched tags, and tags don't grow. 
*/ queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->sched_tags) continue; sbitmap_queue_resize(&hctx->sched_tags->bitmap_tags, nr - hctx->sched_tags->nr_reserved_tags); } } else { /* Non-shared sched tags, and tags grow */ queue_for_each_hw_ctx(q, hctx, i) hctx->sched_tags = et->tags[i]; old_et = q->elevator->et; q->elevator->et = et; } /* * Preserve relative value, both nr and async_depth are at most 16 bit * value, no need to worry about overflow. */ q->async_depth = max(q->async_depth * nr / q->nr_requests, 1); q->nr_requests = nr; if (q->elevator && q->elevator->type->ops.depth_updated) q->elevator->type->ops.depth_updated(q); blk_mq_unquiesce_queue(q); return old_et; } /* * Switch back to the elevator type stored in the xarray. */ static void blk_mq_elv_switch_back(struct request_queue *q, struct xarray *elv_tbl) { struct elv_change_ctx *ctx = xa_load(elv_tbl, q->id); if (WARN_ON_ONCE(!ctx)) return; /* The elv_update_nr_hw_queues unfreezes the queue. */ elv_update_nr_hw_queues(q, ctx); /* Drop the reference acquired in blk_mq_elv_switch_none. */ if (ctx->type) elevator_put(ctx->type); } /* * Stores elevator name and type in ctx and set current elevator to none. */ static int blk_mq_elv_switch_none(struct request_queue *q, struct xarray *elv_tbl) { struct elv_change_ctx *ctx; lockdep_assert_held_write(&q->tag_set->update_nr_hwq_lock); /* * Accessing q->elevator without holding q->elevator_lock is safe here * because we're called from nr_hw_queue update which is protected by * set->update_nr_hwq_lock in the writer context. So, scheduler update/ * switch code (which acquires the same lock in the reader context) * can't run concurrently. */ if (q->elevator) { ctx = xa_load(elv_tbl, q->id); if (WARN_ON_ONCE(!ctx)) return -ENOENT; ctx->name = q->elevator->type->elevator_name; /* * Before we switch elevator to 'none', take a reference to * the elevator module so that while nr_hw_queue update is * running, no one can remove elevator module. We'd put the * reference to elevator module later when we switch back * elevator. */ __elevator_get(q->elevator->type); /* * Store elevator type so that we can release the reference * taken above later. */ ctx->type = q->elevator->type; elevator_set_none(q); } return 0; } static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) { struct request_queue *q; int prev_nr_hw_queues = set->nr_hw_queues; unsigned int memflags; int i; struct xarray elv_tbl; struct blk_mq_tags **new_tags; bool queues_frozen = false; lockdep_assert_held(&set->tag_list_lock); if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids) nr_hw_queues = nr_cpu_ids; if (nr_hw_queues < 1) return; if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues) return; memflags = memalloc_noio_save(); xa_init(&elv_tbl); if (blk_mq_alloc_sched_ctx_batch(&elv_tbl, set) < 0) goto out_free_ctx; if (blk_mq_alloc_sched_res_batch(&elv_tbl, set, nr_hw_queues) < 0) goto out_free_ctx; list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_debugfs_unregister_hctxs(q); blk_mq_sysfs_unregister_hctxs(q); } /* * Switch IO scheduler to 'none', cleaning up the data associated * with the previous scheduler. We will switch back once we are done * updating the new sw to hw queue mappings. 
*/ list_for_each_entry(q, &set->tag_list, tag_set_list) if (blk_mq_elv_switch_none(q, &elv_tbl)) goto switch_back; new_tags = blk_mq_prealloc_tag_set_tags(set, nr_hw_queues); if (IS_ERR(new_tags)) goto switch_back; list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_freeze_queue_nomemsave(q); queues_frozen = true; if (new_tags) { kfree(set->tags); set->tags = new_tags; } set->nr_hw_queues = nr_hw_queues; fallback: blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { __blk_mq_realloc_hw_ctxs(set, q); if (q->nr_hw_queues != set->nr_hw_queues) { int i = prev_nr_hw_queues; pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", nr_hw_queues, prev_nr_hw_queues); for (; i < set->nr_hw_queues; i++) __blk_mq_free_map_and_rqs(set, i); set->nr_hw_queues = prev_nr_hw_queues; goto fallback; } blk_mq_map_swqueue(q); } switch_back: /* The blk_mq_elv_switch_back unfreezes queue for us. */ list_for_each_entry(q, &set->tag_list, tag_set_list) { /* switch_back expects queue to be frozen */ if (!queues_frozen) blk_mq_freeze_queue_nomemsave(q); blk_mq_elv_switch_back(q, &elv_tbl); } list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_sysfs_register_hctxs(q); blk_mq_debugfs_register_hctxs(q); blk_mq_remove_hw_queues_cpuhp(q); blk_mq_add_hw_queues_cpuhp(q); } out_free_ctx: blk_mq_free_sched_ctx_batch(&elv_tbl); xa_destroy(&elv_tbl); memalloc_noio_restore(memflags); /* Free the excess tags when nr_hw_queues shrink. */ for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++) __blk_mq_free_map_and_rqs(set, i); } void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) { down_write(&set->update_nr_hwq_lock); mutex_lock(&set->tag_list_lock); __blk_mq_update_nr_hw_queues(set, nr_hw_queues); mutex_unlock(&set->tag_list_lock); up_write(&set->update_nr_hwq_lock); } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob, unsigned int flags) { int ret; do { ret = q->mq_ops->poll(hctx, iob); if (ret > 0) return ret; if (task_sigpending(current)) return 1; if (ret < 0 || (flags & BLK_POLL_ONESHOT)) break; cpu_relax(); } while (!need_resched()); return 0; } int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, unsigned int flags) { if (!blk_mq_can_poll(q)) return 0; return blk_hctx_poll(q, q->queue_hw_ctx[cookie], iob, flags); } int blk_rq_poll(struct request *rq, struct io_comp_batch *iob, unsigned int poll_flags) { struct request_queue *q = rq->q; int ret; if (!blk_rq_is_poll(rq)) return 0; if (!percpu_ref_tryget(&q->q_usage_counter)) return 0; ret = blk_hctx_poll(q, rq->mq_hctx, iob, poll_flags); blk_queue_exit(q); return ret; } EXPORT_SYMBOL_GPL(blk_rq_poll); unsigned int blk_mq_rq_cpu(struct request *rq) { return rq->mq_ctx->cpu; } EXPORT_SYMBOL(blk_mq_rq_cpu); void blk_mq_cancel_work_sync(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; cancel_delayed_work_sync(&q->requeue_work); queue_for_each_hw_ctx(q, hctx, i) cancel_delayed_work_sync(&hctx->run_work); } static int __init blk_mq_init(void) { int i; for_each_possible_cpu(i) init_llist_head(&per_cpu(blk_cpu_done, i)); for_each_possible_cpu(i) INIT_CSD(&per_cpu(blk_cpu_csd, i), __blk_mq_complete_request_remote, NULL); open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, "block/softirq:dead", NULL, blk_softirq_cpu_dead); cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, 
blk_mq_hctx_notify_dead); cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online", blk_mq_hctx_notify_online, blk_mq_hctx_notify_offline); return 0; } subsys_initcall(blk_mq_init);
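The exported helpers above (blk_mq_alloc_sq_tag_set(), blk_mq_alloc_tag_set(), blk_mq_free_tag_set()) form the driver-facing tag-set API. The following is a minimal sketch, not taken from the file above, of how a simple single-queue driver might allocate and tear down its tag set; the my_* identifiers are hypothetical.

#include <linux/blk-mq.h>

/*
 * Hypothetical driver callback: start the request, dispatch it to the
 * (imaginary) hardware, and complete it immediately.
 */
static blk_status_t my_queue_rq(struct blk_mq_hw_ctx *hctx,
				const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;

	blk_mq_start_request(rq);
	/* ... submit rq to hardware here ... */
	blk_mq_end_request(rq, BLK_STS_OK);
	return BLK_STS_OK;
}

static const struct blk_mq_ops my_mq_ops = {
	.queue_rq	= my_queue_rq,
};

static struct blk_mq_tag_set my_tag_set;

static int my_driver_init_tags(void)
{
	/* One hardware queue, queue depth of 64, no special flags. */
	return blk_mq_alloc_sq_tag_set(&my_tag_set, &my_mq_ops, 64, 0);
}

static void my_driver_exit_tags(void)
{
	/* Releases the tags, queue maps and optional SRCU state set up above. */
	blk_mq_free_tag_set(&my_tag_set);
}

A real driver would typically pass the initialized tag set to blk_mq_alloc_disk() (or otherwise attach it to a request queue) before submitting I/O; error handling is omitted for brevity.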
912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 | // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/domain.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/binfmts.h> #include <linux/slab.h> #include <linux/rculist.h> /* Variables definitions.*/ /* The initial domain. */ struct tomoyo_domain_info tomoyo_kernel_domain; /** * tomoyo_update_policy - Update an entry for exception policy. * * @new_entry: Pointer to "struct tomoyo_acl_info". * @size: Size of @new_entry in bytes. * @param: Pointer to "struct tomoyo_acl_param". * @check_duplicate: Callback function to find duplicated entry. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ int tomoyo_update_policy(struct tomoyo_acl_head *new_entry, const int size, struct tomoyo_acl_param *param, bool (*check_duplicate)(const struct tomoyo_acl_head *, const struct tomoyo_acl_head *)) { int error = param->is_delete ? -ENOENT : -ENOMEM; struct tomoyo_acl_head *entry; struct list_head *list = param->list; if (mutex_lock_interruptible(&tomoyo_policy_lock)) return -ENOMEM; list_for_each_entry_rcu(entry, list, list, srcu_read_lock_held(&tomoyo_ss)) { if (entry->is_deleted == TOMOYO_GC_IN_PROGRESS) continue; if (!check_duplicate(entry, new_entry)) continue; entry->is_deleted = param->is_delete; error = 0; break; } if (error && !param->is_delete) { entry = tomoyo_commit_ok(new_entry, size); if (entry) { list_add_tail_rcu(&entry->list, list); error = 0; } } mutex_unlock(&tomoyo_policy_lock); return error; } /** * tomoyo_same_acl_head - Check for duplicated "struct tomoyo_acl_info" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b, false otherwise. */ static inline bool tomoyo_same_acl_head(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { return a->type == b->type && a->cond == b->cond; } /** * tomoyo_update_domain - Update an entry for domain policy. * * @new_entry: Pointer to "struct tomoyo_acl_info". * @size: Size of @new_entry in bytes. * @param: Pointer to "struct tomoyo_acl_param". * @check_duplicate: Callback function to find duplicated entry. * @merge_duplicate: Callback function to merge duplicated entry. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ int tomoyo_update_domain(struct tomoyo_acl_info *new_entry, const int size, struct tomoyo_acl_param *param, bool (*check_duplicate)(const struct tomoyo_acl_info *, const struct tomoyo_acl_info *), bool (*merge_duplicate)(struct tomoyo_acl_info *, struct tomoyo_acl_info *, const bool)) { const bool is_delete = param->is_delete; int error = is_delete ? -ENOENT : -ENOMEM; struct tomoyo_acl_info *entry; struct list_head * const list = param->list; if (param->data[0]) { new_entry->cond = tomoyo_get_condition(param); if (!new_entry->cond) return -EINVAL; /* * Domain transition preference is allowed for only * "file execute" entries. 
*/ if (new_entry->cond->transit && !(new_entry->type == TOMOYO_TYPE_PATH_ACL && container_of(new_entry, struct tomoyo_path_acl, head) ->perm == 1 << TOMOYO_TYPE_EXECUTE)) goto out; } if (mutex_lock_interruptible(&tomoyo_policy_lock)) goto out; list_for_each_entry_rcu(entry, list, list, srcu_read_lock_held(&tomoyo_ss)) { if (entry->is_deleted == TOMOYO_GC_IN_PROGRESS) continue; if (!tomoyo_same_acl_head(entry, new_entry) || !check_duplicate(entry, new_entry)) continue; if (merge_duplicate) entry->is_deleted = merge_duplicate(entry, new_entry, is_delete); else entry->is_deleted = is_delete; error = 0; break; } if (error && !is_delete) { entry = tomoyo_commit_ok(new_entry, size); if (entry) { list_add_tail_rcu(&entry->list, list); error = 0; } } mutex_unlock(&tomoyo_policy_lock); out: tomoyo_put_condition(new_entry->cond); return error; } /** * tomoyo_check_acl - Do permission check. * * @r: Pointer to "struct tomoyo_request_info". * @check_entry: Callback function to check type specific parameters. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ void tomoyo_check_acl(struct tomoyo_request_info *r, bool (*check_entry)(struct tomoyo_request_info *, const struct tomoyo_acl_info *)) { const struct tomoyo_domain_info *domain = r->domain; struct tomoyo_acl_info *ptr; const struct list_head *list = &domain->acl_info_list; u16 i = 0; retry: list_for_each_entry_rcu(ptr, list, list, srcu_read_lock_held(&tomoyo_ss)) { if (ptr->is_deleted || ptr->type != r->param_type) continue; if (!check_entry(r, ptr)) continue; if (!tomoyo_condition(r, ptr->cond)) continue; r->matched_acl = ptr; r->granted = true; return; } for (; i < TOMOYO_MAX_ACL_GROUPS; i++) { if (!test_bit(i, domain->group)) continue; list = &domain->ns->acl_group[i++]; goto retry; } r->granted = false; } /* The list for "struct tomoyo_domain_info". */ LIST_HEAD(tomoyo_domain_list); /** * tomoyo_last_word - Get last component of a domainname. * * @name: Domainname to check. * * Returns the last word of @domainname. */ static const char *tomoyo_last_word(const char *name) { const char *cp = strrchr(name, ' '); if (cp) return cp + 1; return name; } /** * tomoyo_same_transition_control - Check for duplicated "struct tomoyo_transition_control" entry. * * @a: Pointer to "struct tomoyo_acl_head". * @b: Pointer to "struct tomoyo_acl_head". * * Returns true if @a == @b, false otherwise. */ static bool tomoyo_same_transition_control(const struct tomoyo_acl_head *a, const struct tomoyo_acl_head *b) { const struct tomoyo_transition_control *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_transition_control *p2 = container_of(b, typeof(*p2), head); return p1->type == p2->type && p1->is_last_name == p2->is_last_name && p1->domainname == p2->domainname && p1->program == p2->program; } /** * tomoyo_write_transition_control - Write "struct tomoyo_transition_control" list. * * @param: Pointer to "struct tomoyo_acl_param". * @type: Type of this entry. * * Returns 0 on success, negative value otherwise. */ int tomoyo_write_transition_control(struct tomoyo_acl_param *param, const u8 type) { struct tomoyo_transition_control e = { .type = type }; int error = param->is_delete ? 
-ENOENT : -ENOMEM; char *program = param->data; char *domainname = strstr(program, " from "); if (domainname) { *domainname = '\0'; domainname += 6; } else if (type == TOMOYO_TRANSITION_CONTROL_NO_KEEP || type == TOMOYO_TRANSITION_CONTROL_KEEP) { domainname = program; program = NULL; } if (program && strcmp(program, "any")) { if (!tomoyo_correct_path(program)) return -EINVAL; e.program = tomoyo_get_name(program); if (!e.program) goto out; } if (domainname && strcmp(domainname, "any")) { if (!tomoyo_correct_domain(domainname)) { if (!tomoyo_correct_path(domainname)) goto out; e.is_last_name = true; } e.domainname = tomoyo_get_name(domainname); if (!e.domainname) goto out; } param->list = ¶m->ns->policy_list[TOMOYO_ID_TRANSITION_CONTROL]; error = tomoyo_update_policy(&e.head, sizeof(e), param, tomoyo_same_transition_control); out: tomoyo_put_name(e.domainname); tomoyo_put_name(e.program); return error; } /** * tomoyo_scan_transition - Try to find specific domain transition type. * * @list: Pointer to "struct list_head". * @domainname: The name of current domain. * @program: The name of requested program. * @last_name: The last component of @domainname. * @type: One of values in "enum tomoyo_transition_type". * * Returns true if found one, false otherwise. * * Caller holds tomoyo_read_lock(). */ static inline bool tomoyo_scan_transition (const struct list_head *list, const struct tomoyo_path_info *domainname, const struct tomoyo_path_info *program, const char *last_name, const enum tomoyo_transition_type type) { const struct tomoyo_transition_control *ptr; list_for_each_entry_rcu(ptr, list, head.list, srcu_read_lock_held(&tomoyo_ss)) { if (ptr->head.is_deleted || ptr->type != type) continue; if (ptr->domainname) { if (!ptr->is_last_name) { if (ptr->domainname != domainname) continue; } else { /* * Use direct strcmp() since this is * unlikely used. */ if (strcmp(ptr->domainname->name, last_name)) continue; } } if (ptr->program && tomoyo_pathcmp(ptr->program, program)) continue; return true; } return false; } /** * tomoyo_transition_type - Get domain transition type. * * @ns: Pointer to "struct tomoyo_policy_namespace". * @domainname: The name of current domain. * @program: The name of requested program. * * Returns TOMOYO_TRANSITION_CONTROL_TRANSIT if executing @program causes * domain transition across namespaces, TOMOYO_TRANSITION_CONTROL_INITIALIZE if * executing @program reinitializes domain transition within that namespace, * TOMOYO_TRANSITION_CONTROL_KEEP if executing @program stays at @domainname , * others otherwise. * * Caller holds tomoyo_read_lock(). */ static enum tomoyo_transition_type tomoyo_transition_type (const struct tomoyo_policy_namespace *ns, const struct tomoyo_path_info *domainname, const struct tomoyo_path_info *program) { const char *last_name = tomoyo_last_word(domainname->name); enum tomoyo_transition_type type = TOMOYO_TRANSITION_CONTROL_NO_RESET; while (type < TOMOYO_MAX_TRANSITION_TYPE) { const struct list_head * const list = &ns->policy_list[TOMOYO_ID_TRANSITION_CONTROL]; if (!tomoyo_scan_transition(list, domainname, program, last_name, type)) { type++; continue; } if (type != TOMOYO_TRANSITION_CONTROL_NO_RESET && type != TOMOYO_TRANSITION_CONTROL_NO_INITIALIZE) break; /* * Do not check for reset_domain if no_reset_domain matched. * Do not check for initialize_domain if no_initialize_domain * matched. */ type++; type++; } return type; } /** * tomoyo_same_aggregator - Check for duplicated "struct tomoyo_aggregator" entry. 
* * @a: Pointer to "struct tomoyo_acl_head". * @b: Pointer to "struct tomoyo_acl_head". * * Returns true if @a == @b, false otherwise. */ static bool tomoyo_same_aggregator(const struct tomoyo_acl_head *a, const struct tomoyo_acl_head *b) { const struct tomoyo_aggregator *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_aggregator *p2 = container_of(b, typeof(*p2), head); return p1->original_name == p2->original_name && p1->aggregated_name == p2->aggregated_name; } /** * tomoyo_write_aggregator - Write "struct tomoyo_aggregator" list. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ int tomoyo_write_aggregator(struct tomoyo_acl_param *param) { struct tomoyo_aggregator e = { }; int error = param->is_delete ? -ENOENT : -ENOMEM; const char *original_name = tomoyo_read_token(param); const char *aggregated_name = tomoyo_read_token(param); if (!tomoyo_correct_word(original_name) || !tomoyo_correct_path(aggregated_name)) return -EINVAL; e.original_name = tomoyo_get_name(original_name); e.aggregated_name = tomoyo_get_name(aggregated_name); if (!e.original_name || !e.aggregated_name || e.aggregated_name->is_patterned) /* No patterns allowed. */ goto out; param->list = ¶m->ns->policy_list[TOMOYO_ID_AGGREGATOR]; error = tomoyo_update_policy(&e.head, sizeof(e), param, tomoyo_same_aggregator); out: tomoyo_put_name(e.original_name); tomoyo_put_name(e.aggregated_name); return error; } /** * tomoyo_find_namespace - Find specified namespace. * * @name: Name of namespace to find. * @len: Length of @name. * * Returns pointer to "struct tomoyo_policy_namespace" if found, * NULL otherwise. * * Caller holds tomoyo_read_lock(). */ static struct tomoyo_policy_namespace *tomoyo_find_namespace (const char *name, const unsigned int len) { struct tomoyo_policy_namespace *ns; list_for_each_entry(ns, &tomoyo_namespace_list, namespace_list) { if (strncmp(name, ns->name, len) || (name[len] && name[len] != ' ')) continue; return ns; } return NULL; } /** * tomoyo_assign_namespace - Create a new namespace. * * @domainname: Name of namespace to create. * * Returns pointer to "struct tomoyo_policy_namespace" on success, * NULL otherwise. * * Caller holds tomoyo_read_lock(). */ struct tomoyo_policy_namespace *tomoyo_assign_namespace(const char *domainname) { struct tomoyo_policy_namespace *ptr; struct tomoyo_policy_namespace *entry; const char *cp = domainname; unsigned int len = 0; while (*cp && *cp++ != ' ') len++; ptr = tomoyo_find_namespace(domainname, len); if (ptr) return ptr; if (len >= TOMOYO_EXEC_TMPSIZE - 10 || !tomoyo_domain_def(domainname)) return NULL; entry = kzalloc(sizeof(*entry) + len + 1, GFP_NOFS | __GFP_NOWARN); if (mutex_lock_interruptible(&tomoyo_policy_lock)) goto out; ptr = tomoyo_find_namespace(domainname, len); if (!ptr && tomoyo_memory_ok(entry)) { char *name = (char *) (entry + 1); ptr = entry; memmove(name, domainname, len); name[len] = '\0'; entry->name = name; tomoyo_init_policy_namespace(entry); entry = NULL; } mutex_unlock(&tomoyo_policy_lock); out: kfree(entry); return ptr; } /** * tomoyo_namespace_jump - Check for namespace jump. * * @domainname: Name of domain. * * Returns true if namespace differs, false otherwise. 
*/ static bool tomoyo_namespace_jump(const char *domainname) { const char *namespace = tomoyo_current_namespace()->name; const int len = strlen(namespace); return strncmp(domainname, namespace, len) || (domainname[len] && domainname[len] != ' '); } /** * tomoyo_assign_domain - Create a domain or a namespace. * * @domainname: The name of domain. * @transit: True if transit to domain found or created. * * Returns pointer to "struct tomoyo_domain_info" on success, NULL otherwise. * * Caller holds tomoyo_read_lock(). */ struct tomoyo_domain_info *tomoyo_assign_domain(const char *domainname, const bool transit) { struct tomoyo_domain_info e = { }; struct tomoyo_domain_info *entry = tomoyo_find_domain(domainname); bool created = false; if (entry) { if (transit) { /* * Since namespace is created at runtime, profiles may * not be created by the moment the process transits to * that domain. Do not perform domain transition if * profile for that domain is not yet created. */ if (tomoyo_policy_loaded && !entry->ns->profile_ptr[entry->profile]) return NULL; } return entry; } /* Requested domain does not exist. */ /* Don't create requested domain if domainname is invalid. */ if (strlen(domainname) >= TOMOYO_EXEC_TMPSIZE - 10 || !tomoyo_correct_domain(domainname)) return NULL; /* * Since definition of profiles and acl_groups may differ across * namespaces, do not inherit "use_profile" and "use_group" settings * by automatically creating requested domain upon domain transition. */ if (transit && tomoyo_namespace_jump(domainname)) return NULL; e.ns = tomoyo_assign_namespace(domainname); if (!e.ns) return NULL; /* * "use_profile" and "use_group" settings for automatically created * domains are inherited from current domain. These are 0 for manually * created domains. */ if (transit) { const struct tomoyo_domain_info *domain = tomoyo_domain(); e.profile = domain->profile; memcpy(e.group, domain->group, sizeof(e.group)); } e.domainname = tomoyo_get_name(domainname); if (!e.domainname) return NULL; if (mutex_lock_interruptible(&tomoyo_policy_lock)) goto out; entry = tomoyo_find_domain(domainname); if (!entry) { entry = tomoyo_commit_ok(&e, sizeof(e)); if (entry) { INIT_LIST_HEAD(&entry->acl_info_list); list_add_tail_rcu(&entry->list, &tomoyo_domain_list); created = true; } } mutex_unlock(&tomoyo_policy_lock); out: tomoyo_put_name(e.domainname); if (entry && transit) { if (created) { struct tomoyo_request_info r; int i; tomoyo_init_request_info(&r, entry, TOMOYO_MAC_FILE_EXECUTE); r.granted = false; tomoyo_write_log(&r, "use_profile %u\n", entry->profile); for (i = 0; i < TOMOYO_MAX_ACL_GROUPS; i++) if (test_bit(i, entry->group)) tomoyo_write_log(&r, "use_group %u\n", i); tomoyo_update_stat(TOMOYO_STAT_POLICY_UPDATES); } } return entry; } /** * tomoyo_environ - Check permission for environment variable names. * * @ee: Pointer to "struct tomoyo_execve". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_environ(struct tomoyo_execve *ee) __must_hold_shared(&tomoyo_ss) { struct tomoyo_request_info *r = &ee->r; struct linux_binprm *bprm = ee->bprm; /* env_page.data is allocated by tomoyo_dump_page(). 
*/ struct tomoyo_page_dump env_page = { }; char *arg_ptr; /* Size is TOMOYO_EXEC_TMPSIZE bytes */ int arg_len = 0; unsigned long pos = bprm->p; int offset = pos % PAGE_SIZE; int argv_count = bprm->argc; int envp_count = bprm->envc; int error = -ENOMEM; ee->r.type = TOMOYO_MAC_ENVIRON; ee->r.profile = r->domain->profile; ee->r.mode = tomoyo_get_mode(r->domain->ns, ee->r.profile, TOMOYO_MAC_ENVIRON); if (!r->mode || !envp_count) return 0; arg_ptr = kzalloc(TOMOYO_EXEC_TMPSIZE, GFP_NOFS); if (!arg_ptr) goto out; while (error == -ENOMEM) { if (!tomoyo_dump_page(bprm, pos, &env_page)) goto out; pos += PAGE_SIZE - offset; /* Read. */ while (argv_count && offset < PAGE_SIZE) { if (!env_page.data[offset++]) argv_count--; } if (argv_count) { offset = 0; continue; } while (offset < PAGE_SIZE) { const unsigned char c = env_page.data[offset++]; if (c && arg_len < TOMOYO_EXEC_TMPSIZE - 10) { if (c == '=') { arg_ptr[arg_len++] = '\0'; } else if (c == '\\') { arg_ptr[arg_len++] = '\\'; arg_ptr[arg_len++] = '\\'; } else if (c > ' ' && c < 127) { arg_ptr[arg_len++] = c; } else { arg_ptr[arg_len++] = '\\'; arg_ptr[arg_len++] = (c >> 6) + '0'; arg_ptr[arg_len++] = ((c >> 3) & 7) + '0'; arg_ptr[arg_len++] = (c & 7) + '0'; } } else { arg_ptr[arg_len] = '\0'; } if (c) continue; if (tomoyo_env_perm(r, arg_ptr)) { error = -EPERM; break; } if (!--envp_count) { error = 0; break; } arg_len = 0; } offset = 0; } out: if (r->mode != TOMOYO_CONFIG_ENFORCING) error = 0; kfree(env_page.data); kfree(arg_ptr); return error; } /** * tomoyo_find_next_domain - Find a domain. * * @bprm: Pointer to "struct linux_binprm". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ int tomoyo_find_next_domain(struct linux_binprm *bprm) { struct tomoyo_domain_info *old_domain = tomoyo_domain(); struct tomoyo_domain_info *domain = NULL; const char *original_name = bprm->filename; int retval = -ENOMEM; bool reject_on_transition_failure = false; const struct tomoyo_path_info *candidate; struct tomoyo_path_info exename; struct tomoyo_execve *ee = kzalloc_obj(*ee, GFP_NOFS); if (!ee) return -ENOMEM; ee->tmp = kzalloc(TOMOYO_EXEC_TMPSIZE, GFP_NOFS); if (!ee->tmp) { kfree(ee); return -ENOMEM; } /* ee->dump->data is allocated by tomoyo_dump_page(). */ tomoyo_init_request_info(&ee->r, NULL, TOMOYO_MAC_FILE_EXECUTE); ee->r.ee = ee; ee->bprm = bprm; ee->r.obj = &ee->obj; ee->obj.path1 = bprm->file->f_path; /* * Get symlink's pathname of program, but fallback to realpath if * symlink's pathname does not exist or symlink's pathname refers * to proc filesystem (e.g. /dev/fd/<num> or /proc/self/fd/<num> ). */ exename.name = tomoyo_realpath_nofollow(original_name); if (exename.name && !strncmp(exename.name, "proc:/", 6)) { kfree(exename.name); exename.name = NULL; } if (!exename.name) { exename.name = tomoyo_realpath_from_path(&bprm->file->f_path); if (!exename.name) goto out; } tomoyo_fill_path_info(&exename); retry: /* Check 'aggregator' directive. */ { struct tomoyo_aggregator *ptr; struct list_head *list = &old_domain->ns->policy_list[TOMOYO_ID_AGGREGATOR]; /* Check 'aggregator' directive. */ candidate = &exename; list_for_each_entry_rcu(ptr, list, head.list, srcu_read_lock_held(&tomoyo_ss)) { if (ptr->head.is_deleted || !tomoyo_path_matches_pattern(&exename, ptr->original_name)) continue; candidate = ptr->aggregated_name; break; } } /* Check execute permission. 
*/ retval = tomoyo_execute_permission(&ee->r, candidate); if (retval == TOMOYO_RETRY_REQUEST) goto retry; if (retval < 0) goto out; /* * To be able to specify domainnames with wildcards, use the * pathname specified in the policy (which may contain * wildcard) rather than the pathname passed to execve() * (which never contains wildcard). */ if (ee->r.param.path.matched_path) candidate = ee->r.param.path.matched_path; /* * Check for domain transition preference if "file execute" matched. * If preference is given, make execve() fail if domain transition * has failed, for domain transition preference should be used with * destination domain defined. */ if (ee->transition) { const char *domainname = ee->transition->name; reject_on_transition_failure = true; if (!strcmp(domainname, "keep")) goto force_keep_domain; if (!strcmp(domainname, "child")) goto force_child_domain; if (!strcmp(domainname, "reset")) goto force_reset_domain; if (!strcmp(domainname, "initialize")) goto force_initialize_domain; if (!strcmp(domainname, "parent")) { char *cp; strscpy(ee->tmp, old_domain->domainname->name, TOMOYO_EXEC_TMPSIZE); cp = strrchr(ee->tmp, ' '); if (cp) *cp = '\0'; } else if (*domainname == '<') strscpy(ee->tmp, domainname, TOMOYO_EXEC_TMPSIZE); else snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "%s %s", old_domain->domainname->name, domainname); goto force_jump_domain; } /* * No domain transition preference specified. * Calculate domain to transit to. */ switch (tomoyo_transition_type(old_domain->ns, old_domain->domainname, candidate)) { case TOMOYO_TRANSITION_CONTROL_RESET: force_reset_domain: /* Transit to the root of specified namespace. */ snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "<%s>", candidate->name); /* * Make execve() fail if domain transition across namespaces * has failed. */ reject_on_transition_failure = true; break; case TOMOYO_TRANSITION_CONTROL_INITIALIZE: force_initialize_domain: /* Transit to the child of current namespace's root. */ snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "%s %s", old_domain->ns->name, candidate->name); break; case TOMOYO_TRANSITION_CONTROL_KEEP: force_keep_domain: /* Keep current domain. */ domain = old_domain; break; default: if (old_domain == &tomoyo_kernel_domain && !tomoyo_policy_loaded) { /* * Needn't to transit from kernel domain before * starting /sbin/init. But transit from kernel domain * if executing initializers because they might start * before /sbin/init. */ domain = old_domain; break; } force_child_domain: /* Normal domain transition. */ snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "%s %s", old_domain->domainname->name, candidate->name); break; } force_jump_domain: if (!domain) domain = tomoyo_assign_domain(ee->tmp, true); if (domain) retval = 0; else if (reject_on_transition_failure) { pr_warn("ERROR: Domain '%s' not ready.\n", ee->tmp); retval = -ENOMEM; } else if (ee->r.mode == TOMOYO_CONFIG_ENFORCING) retval = -ENOMEM; else { retval = 0; if (!old_domain->flags[TOMOYO_DIF_TRANSITION_FAILED]) { old_domain->flags[TOMOYO_DIF_TRANSITION_FAILED] = true; ee->r.granted = false; tomoyo_write_log(&ee->r, "%s", tomoyo_dif [TOMOYO_DIF_TRANSITION_FAILED]); pr_warn("ERROR: Domain '%s' not defined.\n", ee->tmp); } } out: if (!domain) domain = old_domain; /* Update reference count on "struct tomoyo_domain_info". 
*/ { struct tomoyo_task *s = tomoyo_task(current); s->old_domain_info = s->domain_info; s->domain_info = domain; atomic_inc(&domain->users); } kfree(exename.name); if (!retval) { ee->r.domain = domain; retval = tomoyo_environ(ee); } kfree(ee->tmp); kfree(ee->dump.data); kfree(ee); return retval; } /** * tomoyo_dump_page - Dump a page to buffer. * * @bprm: Pointer to "struct linux_binprm". * @pos: Location to dump. * @dump: Pointer to "struct tomoyo_page_dump". * * Returns true on success, false otherwise. */ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, struct tomoyo_page_dump *dump) { struct page *page; #ifdef CONFIG_MMU int ret; #endif /* dump->data is released by tomoyo_find_next_domain(). */ if (!dump->data) { dump->data = kzalloc(PAGE_SIZE, GFP_NOFS); if (!dump->data) return false; } /* Same with get_arg_page(bprm, pos, 0) in fs/exec.c */ #ifdef CONFIG_MMU /* * This is called at execve() time in order to dig around * in the argv/environment of the new process * (represented by bprm). */ mmap_read_lock(bprm->mm); ret = get_user_pages_remote(bprm->mm, pos, 1, FOLL_FORCE, &page, NULL); mmap_read_unlock(bprm->mm); if (ret <= 0) return false; #else page = bprm->page[pos / PAGE_SIZE]; #endif if (page != dump->page) { const unsigned int offset = pos % PAGE_SIZE; char *kaddr = kmap_local_page(page); dump->page = page; memcpy(dump->data + offset, kaddr + offset, PAGE_SIZE - offset); kunmap_local(kaddr); } /* Same with put_arg_page(page) in fs/exec.c */ #ifdef CONFIG_MMU put_page(page); #endif return true; } |
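For illustration only (this is not part of domain.c), the domainname string handling used by tomoyo_find_next_domain() above can be sketched as a stand-alone C program: a normal transition appends the executed program to the current domainname, the "parent" preference strips the last space-separated component, and tomoyo_last_word() picks out that last component.

#include <stdio.h>
#include <string.h>

/* Mirrors tomoyo_last_word(): last space-separated component of a domainname. */
static const char *last_word(const char *name)
{
	const char *cp = strrchr(name, ' ');

	return cp ? cp + 1 : name;
}

int main(void)
{
	char domain[256];
	const char *current_domain = "<kernel> /usr/sbin/sshd";
	char *cp;

	/* Normal ("child") transition: "<current domain> <program>". */
	snprintf(domain, sizeof(domain), "%s %s", current_domain, "/bin/bash");
	printf("child : %s\n", domain);

	/* "parent" preference: drop the last component again. */
	cp = strrchr(domain, ' ');
	if (cp)
		*cp = '\0';
	printf("parent: %s\n", domain);

	printf("last  : %s\n", last_word(current_domain));
	return 0;
}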
// SPDX-License-Identifier: GPL-2.0-only /* * ACPI device specific properties support. * * Copyright (C) 2014 - 2023, Intel Corporation * All rights reserved. * * Authors: Mika Westerberg <mika.westerberg@linux.intel.com> * Darren Hart <dvhart@linux.intel.com> * Rafael J. Wysocki <rafael.j.wysocki@intel.com> * Sakari Ailus <sakari.ailus@linux.intel.com> */ #define pr_fmt(fmt) "ACPI: " fmt #include <linux/acpi.h> #include <linux/device.h> #include <linux/export.h> #include "internal.h" static int acpi_data_get_property_array(const struct acpi_device_data *data, const char *name, acpi_object_type type, const union acpi_object **obj); /* * The GUIDs here are made equivalent to each other in order to avoid extra * complexity in the properties handling code, with the caveat that the * kernel will accept certain combinations of GUID and properties that are * not defined without a warning. For instance if any of the properties * from different GUID appear in a property list of another, it will be * accepted by the kernel. Firmware validation tools should catch these. * * References: * * [1] UEFI DSD Guide.
* https://github.com/UEFI/DSD-Guide/blob/main/src/dsd-guide.adoc */ static const guid_t prp_guids[] = { /* ACPI _DSD device properties GUID [1]: daffd814-6eba-4d8c-8a91-bc9bbf4aa301 */ GUID_INIT(0xdaffd814, 0x6eba, 0x4d8c, 0x8a, 0x91, 0xbc, 0x9b, 0xbf, 0x4a, 0xa3, 0x01), /* Hotplug in D3 GUID: 6211e2c0-58a3-4af3-90e1-927a4e0c55a4 */ GUID_INIT(0x6211e2c0, 0x58a3, 0x4af3, 0x90, 0xe1, 0x92, 0x7a, 0x4e, 0x0c, 0x55, 0xa4), /* External facing port GUID: efcc06cc-73ac-4bc3-bff0-76143807c389 */ GUID_INIT(0xefcc06cc, 0x73ac, 0x4bc3, 0xbf, 0xf0, 0x76, 0x14, 0x38, 0x07, 0xc3, 0x89), /* Thunderbolt GUID for IMR_VALID: c44d002f-69f9-4e7d-a904-a7baabdf43f7 */ GUID_INIT(0xc44d002f, 0x69f9, 0x4e7d, 0xa9, 0x04, 0xa7, 0xba, 0xab, 0xdf, 0x43, 0xf7), /* Thunderbolt GUID for WAKE_SUPPORTED: 6c501103-c189-4296-ba72-9bf5a26ebe5d */ GUID_INIT(0x6c501103, 0xc189, 0x4296, 0xba, 0x72, 0x9b, 0xf5, 0xa2, 0x6e, 0xbe, 0x5d), /* Storage device needs D3 GUID: 5025030f-842f-4ab4-a561-99a5189762d0 */ GUID_INIT(0x5025030f, 0x842f, 0x4ab4, 0xa5, 0x61, 0x99, 0xa5, 0x18, 0x97, 0x62, 0xd0), }; /* ACPI _DSD data subnodes GUID [1]: dbb8e3e6-5886-4ba6-8795-1319f52a966b */ static const guid_t ads_guid = GUID_INIT(0xdbb8e3e6, 0x5886, 0x4ba6, 0x87, 0x95, 0x13, 0x19, 0xf5, 0x2a, 0x96, 0x6b); /* ACPI _DSD data buffer GUID [1]: edb12dd0-363d-4085-a3d2-49522ca160c4 */ static const guid_t buffer_prop_guid = GUID_INIT(0xedb12dd0, 0x363d, 0x4085, 0xa3, 0xd2, 0x49, 0x52, 0x2c, 0xa1, 0x60, 0xc4); static bool acpi_enumerate_nondev_subnodes(acpi_handle scope, union acpi_object *desc, struct acpi_device_data *data, struct fwnode_handle *parent); static bool acpi_extract_properties(acpi_handle handle, union acpi_object *desc, struct acpi_device_data *data); static bool acpi_nondev_subnode_extract(union acpi_object *desc, acpi_handle handle, const union acpi_object *link, struct list_head *list, struct fwnode_handle *parent) { struct acpi_data_node *dn; acpi_handle scope = NULL; bool result; if (acpi_graph_ignore_port(handle)) return false; dn = kzalloc_obj(*dn); if (!dn) return false; dn->name = link->package.elements[0].string.pointer; fwnode_init(&dn->fwnode, &acpi_data_fwnode_ops); dn->parent = parent; INIT_LIST_HEAD(&dn->data.properties); INIT_LIST_HEAD(&dn->data.subnodes); /* * The scope for the completion of relative pathname segments and * subnode object lookup is the one of the namespace node (device) * containing the object that has returned the package. That is, it's * the scope of that object's parent device. */ if (handle) acpi_get_parent(handle, &scope); /* * Extract properties from the _DSD-equivalent package pointed to by * desc and use scope (if not NULL) for the completion of relative * pathname segments. * * The extracted properties will be held in the new data node dn. */ result = acpi_extract_properties(scope, desc, &dn->data); /* * Look for subnodes in the _DSD-equivalent package pointed to by desc * and create child nodes of dn if there are any. */ if (acpi_enumerate_nondev_subnodes(scope, desc, &dn->data, &dn->fwnode)) result = true; if (!result) { kfree(dn); acpi_handle_debug(handle, "Invalid properties/subnodes data, skipping\n"); return false; } /* * This will be NULL if the desc package is embedded in an outer * _DSD-equivalent package and its scope cannot be determined. 
*/ dn->handle = handle; dn->data.pointer = desc; list_add_tail(&dn->sibling, list); return true; } static bool acpi_nondev_subnode_ok(acpi_handle scope, const union acpi_object *link, struct list_head *list, struct fwnode_handle *parent) { struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER }; acpi_handle handle; acpi_status status; /* * If the scope is unknown, the _DSD-equivalent package being parsed * was embedded in an outer _DSD-equivalent package as a result of * direct evaluation of an object pointed to by a reference. In that * case, using a pathname as the target object pointer is invalid. */ if (!scope) return false; status = acpi_get_handle(scope, link->package.elements[1].string.pointer, &handle); if (ACPI_FAILURE(status)) return false; status = acpi_evaluate_object_typed(handle, NULL, NULL, &buf, ACPI_TYPE_PACKAGE); if (ACPI_FAILURE(status)) return false; if (acpi_nondev_subnode_extract(buf.pointer, handle, link, list, parent)) return true; ACPI_FREE(buf.pointer); return false; } static bool acpi_add_nondev_subnodes(acpi_handle scope, union acpi_object *links, struct list_head *list, struct fwnode_handle *parent) { bool ret = false; int i; /* * Every element in the links package is expected to represent a link * to a non-device node in a tree containing device-specific data. */ for (i = 0; i < links->package.count; i++) { union acpi_object *link, *desc; bool result; link = &links->package.elements[i]; /* Only two elements allowed. */ if (link->package.count != 2) continue; /* The first one (the key) must be a string. */ if (link->package.elements[0].type != ACPI_TYPE_STRING) continue; /* The second one (the target) may be a string or a package. */ switch (link->package.elements[1].type) { case ACPI_TYPE_STRING: /* * The string is expected to be a full pathname or a * pathname segment relative to the given scope. That * pathname is expected to point to an object returning * a package that contains _DSD-equivalent information. */ result = acpi_nondev_subnode_ok(scope, link, list, parent); break; case ACPI_TYPE_PACKAGE: /* * This happens when a reference is used in AML to * point to the target. Since the target is expected * to be a named object, a reference to it will cause it * to be avaluated in place and its return package will * be embedded in the links package at the location of * the reference. * * The target package is expected to contain _DSD- * equivalent information, but the scope in which it * is located in the original AML is unknown. Thus * it cannot contain pathname segments represented as * strings because there is no way to build full * pathnames out of them. */ acpi_handle_debug(scope, "subnode %s: Unknown scope\n", link->package.elements[0].string.pointer); desc = &link->package.elements[1]; result = acpi_nondev_subnode_extract(desc, NULL, link, list, parent); break; case ACPI_TYPE_LOCAL_REFERENCE: /* * It is not expected to see any local references in * the links package because referencing a named object * should cause it to be evaluated in place. */ acpi_handle_info(scope, "subnode %s: Unexpected reference\n", link->package.elements[0].string.pointer); fallthrough; default: result = false; break; } ret = ret || result; } return ret; } static bool acpi_enumerate_nondev_subnodes(acpi_handle scope, union acpi_object *desc, struct acpi_device_data *data, struct fwnode_handle *parent) { int i; /* Look for the ACPI data subnodes GUID. 
*/ for (i = 0; i < desc->package.count; i += 2) { const union acpi_object *guid; union acpi_object *links; guid = &desc->package.elements[i]; links = &desc->package.elements[i + 1]; /* * The first element must be a GUID and the second one must be * a package. */ if (guid->type != ACPI_TYPE_BUFFER || guid->buffer.length != 16 || links->type != ACPI_TYPE_PACKAGE) break; if (!guid_equal((guid_t *)guid->buffer.pointer, &ads_guid)) continue; return acpi_add_nondev_subnodes(scope, links, &data->subnodes, parent); } return false; } static bool acpi_property_value_ok(const union acpi_object *value) { int j; /* * The value must be an integer, a string, a reference, or a package * whose every element must be an integer, a string, or a reference. */ switch (value->type) { case ACPI_TYPE_INTEGER: case ACPI_TYPE_STRING: case ACPI_TYPE_LOCAL_REFERENCE: return true; case ACPI_TYPE_PACKAGE: for (j = 0; j < value->package.count; j++) switch (value->package.elements[j].type) { case ACPI_TYPE_INTEGER: case ACPI_TYPE_STRING: case ACPI_TYPE_LOCAL_REFERENCE: continue; default: return false; } return true; } return false; } static bool acpi_properties_format_valid(const union acpi_object *properties) { int i; for (i = 0; i < properties->package.count; i++) { const union acpi_object *property; property = &properties->package.elements[i]; /* * Only two elements allowed, the first one must be a string and * the second one has to satisfy certain conditions. */ if (property->package.count != 2 || property->package.elements[0].type != ACPI_TYPE_STRING || !acpi_property_value_ok(&property->package.elements[1])) return false; } return true; } static void acpi_init_of_compatible(struct acpi_device *adev) { const union acpi_object *of_compatible; int ret; ret = acpi_data_get_property_array(&adev->data, "compatible", ACPI_TYPE_STRING, &of_compatible); if (ret) { ret = acpi_dev_get_property(adev, "compatible", ACPI_TYPE_STRING, &of_compatible); if (ret) { struct acpi_device *parent; parent = acpi_dev_parent(adev); if (parent && parent->flags.of_compatible_ok) goto out; return; } } adev->data.of_compatible = of_compatible; out: adev->flags.of_compatible_ok = 1; } static bool acpi_is_property_guid(const guid_t *guid) { int i; for (i = 0; i < ARRAY_SIZE(prp_guids); i++) { if (guid_equal(guid, &prp_guids[i])) return true; } return false; } struct acpi_device_properties * acpi_data_add_props(struct acpi_device_data *data, const guid_t *guid, union acpi_object *properties) { struct acpi_device_properties *props; props = kzalloc_obj(*props); if (props) { INIT_LIST_HEAD(&props->list); props->guid = guid; props->properties = properties; list_add_tail(&props->list, &data->properties); } return props; } static void acpi_nondev_subnode_tag(acpi_handle handle, void *context) { } static void acpi_untie_nondev_subnodes(struct acpi_device_data *data) { struct acpi_data_node *dn; list_for_each_entry(dn, &data->subnodes, sibling) { if (!dn->handle) continue; acpi_detach_data(dn->handle, acpi_nondev_subnode_tag); acpi_untie_nondev_subnodes(&dn->data); } } static bool acpi_tie_nondev_subnodes(struct acpi_device_data *data) { struct acpi_data_node *dn; list_for_each_entry(dn, &data->subnodes, sibling) { acpi_status status; bool ret; if (!dn->handle) continue; status = acpi_attach_data(dn->handle, acpi_nondev_subnode_tag, dn); if (ACPI_FAILURE(status) && status != AE_ALREADY_EXISTS) { acpi_handle_err(dn->handle, "Can't tag data node\n"); return false; } ret = acpi_tie_nondev_subnodes(&dn->data); if (!ret) return ret; } return true; } static void 
acpi_data_add_buffer_props(acpi_handle handle, struct acpi_device_data *data, union acpi_object *properties) { struct acpi_device_properties *props; union acpi_object *package; size_t alloc_size; unsigned int i; u32 *count; if (check_mul_overflow((size_t)properties->package.count, sizeof(*package) + sizeof(void *), &alloc_size) || check_add_overflow(sizeof(*props) + sizeof(*package), alloc_size, &alloc_size)) { acpi_handle_warn(handle, "can't allocate memory for %u buffer props", properties->package.count); return; } props = kvzalloc(alloc_size, GFP_KERNEL); if (!props) return; props->guid = &buffer_prop_guid; props->bufs = (void *)(props + 1); props->properties = (void *)(props->bufs + properties->package.count); /* Outer package */ package = props->properties; package->type = ACPI_TYPE_PACKAGE; package->package.elements = package + 1; count = &package->package.count; *count = 0; /* Inner packages */ package++; for (i = 0; i < properties->package.count; i++) { struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER }; union acpi_object *property = &properties->package.elements[i]; union acpi_object *prop, *obj, *buf_obj; acpi_status status; if (property->type != ACPI_TYPE_PACKAGE || property->package.count != 2) { acpi_handle_warn(handle, "buffer property %u has %u entries\n", i, property->package.count); continue; } prop = &property->package.elements[0]; obj = &property->package.elements[1]; if (prop->type != ACPI_TYPE_STRING || obj->type != ACPI_TYPE_STRING) { acpi_handle_warn(handle, "wrong object types %u and %u\n", prop->type, obj->type); continue; } status = acpi_evaluate_object_typed(handle, obj->string.pointer, NULL, &buf, ACPI_TYPE_BUFFER); if (ACPI_FAILURE(status)) { acpi_handle_warn(handle, "can't evaluate \"%*pE\" as buffer\n", obj->string.length, obj->string.pointer); continue; } package->type = ACPI_TYPE_PACKAGE; package->package.elements = prop; package->package.count = 2; buf_obj = buf.pointer; /* Replace the string object with a buffer object */ obj->type = ACPI_TYPE_BUFFER; obj->buffer.length = buf_obj->buffer.length; obj->buffer.pointer = buf_obj->buffer.pointer; props->bufs[i] = buf.pointer; package++; (*count)++; } if (*count) list_add(&props->list, &data->properties); else kvfree(props); } static bool acpi_extract_properties(acpi_handle scope, union acpi_object *desc, struct acpi_device_data *data) { int i; if (desc->package.count % 2) return false; /* Look for the device properties GUID. */ for (i = 0; i < desc->package.count; i += 2) { const union acpi_object *guid; union acpi_object *properties; guid = &desc->package.elements[i]; properties = &desc->package.elements[i + 1]; /* * The first element must be a GUID and the second one must be * a package. */ if (guid->type != ACPI_TYPE_BUFFER || guid->buffer.length != 16 || properties->type != ACPI_TYPE_PACKAGE) break; if (guid_equal((guid_t *)guid->buffer.pointer, &buffer_prop_guid)) { acpi_data_add_buffer_props(scope, data, properties); continue; } if (!acpi_is_property_guid((guid_t *)guid->buffer.pointer)) continue; /* * We found the matching GUID. Now validate the format of the * package immediately following it. 
*/ if (!acpi_properties_format_valid(properties)) continue; acpi_data_add_props(data, (const guid_t *)guid->buffer.pointer, properties); } return !list_empty(&data->properties); } void acpi_init_properties(struct acpi_device *adev) { struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER }; struct acpi_hardware_id *hwid; acpi_status status; bool acpi_of = false; INIT_LIST_HEAD(&adev->data.properties); INIT_LIST_HEAD(&adev->data.subnodes); if (!adev->handle) return; /* * Check if ACPI_DT_NAMESPACE_HID is present and inthat case we fill in * Device Tree compatible properties for this device. */ list_for_each_entry(hwid, &adev->pnp.ids, list) { if (!strcmp(hwid->id, ACPI_DT_NAMESPACE_HID)) { acpi_of = true; break; } } status = acpi_evaluate_object_typed(adev->handle, "_DSD", NULL, &buf, ACPI_TYPE_PACKAGE); if (ACPI_FAILURE(status)) goto out; if (acpi_extract_properties(adev->handle, buf.pointer, &adev->data)) { adev->data.pointer = buf.pointer; if (acpi_of) acpi_init_of_compatible(adev); } if (acpi_enumerate_nondev_subnodes(adev->handle, buf.pointer, &adev->data, acpi_fwnode_handle(adev))) adev->data.pointer = buf.pointer; if (!adev->data.pointer) { acpi_handle_debug(adev->handle, "Invalid _DSD data, skipping\n"); ACPI_FREE(buf.pointer); } else { if (!acpi_tie_nondev_subnodes(&adev->data)) acpi_untie_nondev_subnodes(&adev->data); } out: if (acpi_of && !adev->flags.of_compatible_ok) acpi_handle_info(adev->handle, ACPI_DT_NAMESPACE_HID " requires 'compatible' property\n"); if (!adev->data.pointer) acpi_extract_apple_properties(adev); } static void acpi_free_device_properties(struct list_head *list) { struct acpi_device_properties *props, *tmp; list_for_each_entry_safe(props, tmp, list, list) { u32 i; list_del(&props->list); /* Buffer data properties were separately allocated */ if (props->bufs) for (i = 0; i < props->properties->package.count; i++) ACPI_FREE(props->bufs[i]); kvfree(props); } } static void acpi_destroy_nondev_subnodes(struct list_head *list) { struct acpi_data_node *dn, *next; if (list_empty(list)) return; list_for_each_entry_safe_reverse(dn, next, list, sibling) { acpi_destroy_nondev_subnodes(&dn->data.subnodes); wait_for_completion(&dn->kobj_done); list_del(&dn->sibling); ACPI_FREE((void *)dn->data.pointer); acpi_free_device_properties(&dn->data.properties); kfree(dn); } } void acpi_free_properties(struct acpi_device *adev) { acpi_untie_nondev_subnodes(&adev->data); acpi_destroy_nondev_subnodes(&adev->data.subnodes); ACPI_FREE((void *)adev->data.pointer); adev->data.of_compatible = NULL; adev->data.pointer = NULL; acpi_free_device_properties(&adev->data.properties); } /** * acpi_data_get_property - return an ACPI property with given name * @data: ACPI device deta object to get the property from * @name: Name of the property * @type: Expected property type * @obj: Location to store the property value (if not %NULL) * * Look up a property with @name and store a pointer to the resulting ACPI * object at the location pointed to by @obj if found. * * Callers must not attempt to free the returned objects. These objects will be * freed by the ACPI core automatically during the removal of @data. * * Return: %0 if property with @name has been found (success), * %-EINVAL if the arguments are invalid, * %-EINVAL if the property doesn't exist, * %-EPROTO if the property value type doesn't match @type. 
*/ static int acpi_data_get_property(const struct acpi_device_data *data, const char *name, acpi_object_type type, const union acpi_object **obj) { const struct acpi_device_properties *props; if (!data || !name) return -EINVAL; if (!data->pointer || list_empty(&data->properties)) return -EINVAL; list_for_each_entry(props, &data->properties, list) { const union acpi_object *properties; unsigned int i; properties = props->properties; for (i = 0; i < properties->package.count; i++) { const union acpi_object *propname, *propvalue; const union acpi_object *property; property = &properties->package.elements[i]; propname = &property->package.elements[0]; propvalue = &property->package.elements[1]; if (!strcmp(name, propname->string.pointer)) { if (type != ACPI_TYPE_ANY && propvalue->type != type) return -EPROTO; if (obj) *obj = propvalue; return 0; } } } return -EINVAL; } /** * acpi_dev_get_property - return an ACPI property with given name. * @adev: ACPI device to get the property from. * @name: Name of the property. * @type: Expected property type. * @obj: Location to store the property value (if not %NULL). */ int acpi_dev_get_property(const struct acpi_device *adev, const char *name, acpi_object_type type, const union acpi_object **obj) { return adev ? acpi_data_get_property(&adev->data, name, type, obj) : -EINVAL; } EXPORT_SYMBOL_GPL(acpi_dev_get_property); static const struct acpi_device_data * acpi_device_data_of_node(const struct fwnode_handle *fwnode) { if (is_acpi_device_node(fwnode)) { const struct acpi_device *adev = to_acpi_device_node(fwnode); return &adev->data; } if (is_acpi_data_node(fwnode)) { const struct acpi_data_node *dn = to_acpi_data_node(fwnode); return &dn->data; } return NULL; } /** * acpi_node_prop_get - return an ACPI property with given name. * @fwnode: Firmware node to get the property from. * @propname: Name of the property. * @valptr: Location to store a pointer to the property value (if not %NULL). */ int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname, void **valptr) { return acpi_data_get_property(acpi_device_data_of_node(fwnode), propname, ACPI_TYPE_ANY, (const union acpi_object **)valptr); } /** * acpi_data_get_property_array - return an ACPI array property with given name * @data: ACPI data object to get the property from * @name: Name of the property * @type: Expected type of array elements * @obj: Location to store a pointer to the property value (if not NULL) * * Look up an array property with @name and store a pointer to the resulting * ACPI object at the location pointed to by @obj if found. * * Callers must not attempt to free the returned objects. Those objects will be * freed by the ACPI core automatically during the removal of @data. * * Return: %0 if array property (package) with @name has been found (success), * %-EINVAL if the arguments are invalid, * %-EINVAL if the property doesn't exist, * %-EPROTO if the property is not a package or the type of its elements * doesn't match @type. */ static int acpi_data_get_property_array(const struct acpi_device_data *data, const char *name, acpi_object_type type, const union acpi_object **obj) { const union acpi_object *prop; int ret, i; ret = acpi_data_get_property(data, name, ACPI_TYPE_PACKAGE, &prop); if (ret) return ret; if (type != ACPI_TYPE_ANY) { /* Check that all elements are of correct type. 
*/ for (i = 0; i < prop->package.count; i++) if (prop->package.elements[i].type != type) return -EPROTO; } if (obj) *obj = prop; return 0; } static struct fwnode_handle * acpi_fwnode_get_named_child_node(const struct fwnode_handle *fwnode, const char *childname) { struct fwnode_handle *child; fwnode_for_each_child_node(fwnode, child) { if (is_acpi_data_node(child)) { if (acpi_data_node_match(child, childname)) return child; continue; } if (!strncmp(acpi_device_bid(to_acpi_device_node(child)), childname, ACPI_NAMESEG_SIZE)) return child; } return NULL; } static unsigned int acpi_fwnode_get_args_count(struct fwnode_handle *fwnode, const char *nargs_prop) { const struct acpi_device_data *data; const union acpi_object *obj; int ret; data = acpi_device_data_of_node(fwnode); if (!data) return 0; ret = acpi_data_get_property(data, nargs_prop, ACPI_TYPE_INTEGER, &obj); if (ret) return 0; return obj->integer.value; } static int acpi_get_ref_args(struct fwnode_reference_args *args, struct fwnode_handle *ref_fwnode, const char *nargs_prop, const union acpi_object **element, const union acpi_object *end, size_t num_args) { u32 nargs = 0, i; if (nargs_prop) num_args = acpi_fwnode_get_args_count(ref_fwnode, nargs_prop); /* * Assume the following integer elements are all args. Stop counting on * the first reference (possibly represented as a string) or end of the * package arguments. In case of neither reference, nor integer, return * an error, we can't parse it. */ for (i = 0; (*element) + i < end && i < num_args; i++) { acpi_object_type type = (*element)[i].type; if (type == ACPI_TYPE_LOCAL_REFERENCE || type == ACPI_TYPE_STRING) break; if (type == ACPI_TYPE_INTEGER) nargs++; else return -EINVAL; } if (nargs > NR_FWNODE_REFERENCE_ARGS) return -EINVAL; if (args) { args->fwnode = ref_fwnode; args->nargs = nargs; for (i = 0; i < nargs; i++) args->args[i] = (*element)[i].integer.value; } (*element) += nargs; return 0; } static struct fwnode_handle *acpi_parse_string_ref(const struct fwnode_handle *fwnode, const char *refstring) { acpi_handle scope, handle; struct acpi_data_node *dn; struct acpi_device *device; acpi_status status; if (is_acpi_device_node(fwnode)) { scope = to_acpi_device_node(fwnode)->handle; } else if (is_acpi_data_node(fwnode)) { scope = to_acpi_data_node(fwnode)->handle; } else { pr_debug("Bad node type for node %pfw\n", fwnode); return NULL; } status = acpi_get_handle(scope, refstring, &handle); if (ACPI_FAILURE(status)) { acpi_handle_debug(scope, "Unable to get an ACPI handle for %s\n", refstring); return NULL; } device = acpi_fetch_acpi_dev(handle); if (device) return acpi_fwnode_handle(device); status = acpi_get_data_full(handle, acpi_nondev_subnode_tag, (void **)&dn, NULL); if (ACPI_FAILURE(status) || !dn) { acpi_handle_debug(handle, "Subnode not found\n"); return NULL; } return &dn->fwnode; } static int acpi_fwnode_get_reference_args(const struct fwnode_handle *fwnode, const char *propname, const char *nargs_prop, unsigned int args_count, unsigned int index, struct fwnode_reference_args *args) { const union acpi_object *element, *end; const union acpi_object *obj; const struct acpi_device_data *data; struct fwnode_handle *ref_fwnode; struct acpi_device *device; int ret, idx = 0; data = acpi_device_data_of_node(fwnode); if (!data) return -ENOENT; ret = acpi_data_get_property(data, propname, ACPI_TYPE_ANY, &obj); if (ret) return ret == -EINVAL ? -ENOENT : -EINVAL; switch (obj->type) { case ACPI_TYPE_LOCAL_REFERENCE: /* Plain single reference without arguments. 
*/ if (index) return -ENOENT; device = acpi_fetch_acpi_dev(obj->reference.handle); if (!device) return -EINVAL; if (!args) return 0; args->fwnode = acpi_fwnode_handle(device); args->nargs = 0; return 0; case ACPI_TYPE_STRING: if (index) return -ENOENT; ref_fwnode = acpi_parse_string_ref(fwnode, obj->string.pointer); if (!ref_fwnode) return -EINVAL; args->fwnode = ref_fwnode; args->nargs = 0; return 0; case ACPI_TYPE_PACKAGE: /* * If it is not a single reference, then it is a package of * references, followed by number of ints as follows: * * Package () { REF, INT, REF, INT, INT } * * Here, REF may be either a local reference or a string. The * index argument is then used to determine which reference the * caller wants (along with the arguments). */ break; default: return -EINVAL; } if (index >= obj->package.count) return -ENOENT; element = obj->package.elements; end = element + obj->package.count; while (element < end) { switch (element->type) { case ACPI_TYPE_LOCAL_REFERENCE: device = acpi_fetch_acpi_dev(element->reference.handle); if (!device) return -EINVAL; element++; ret = acpi_get_ref_args(idx == index ? args : NULL, acpi_fwnode_handle(device), nargs_prop, &element, end, args_count); if (ret < 0) return ret; if (idx == index) return 0; break; case ACPI_TYPE_STRING: ref_fwnode = acpi_parse_string_ref(fwnode, element->string.pointer); if (!ref_fwnode) return -EINVAL; element++; ret = acpi_get_ref_args(idx == index ? args : NULL, ref_fwnode, nargs_prop, &element, end, args_count); if (ret < 0) return ret; if (idx == index) return 0; break; case ACPI_TYPE_INTEGER: if (idx == index) return -ENOENT; element++; break; default: return -EINVAL; } idx++; } return -ENOENT; } /** * __acpi_node_get_property_reference - returns handle to the referenced object * @fwnode: Firmware node to get the property from * @propname: Name of the property * @index: Index of the reference to return * @num_args: Maximum number of arguments after each reference * @args: Location to store the returned reference with optional arguments * (may be NULL) * * Find property with @name, verifify that it is a package containing at least * one object reference and if so, store the ACPI device object pointer to the * target object in @args->adev. If the reference includes arguments, store * them in the @args->args[] array. * * If there's more than one reference in the property value package, @index is * used to select the one to return. * * It is possible to leave holes in the property value set like in the * example below: * * Package () { * "cs-gpios", * Package () { * ^GPIO, 19, 0, 0, * ^GPIO, 20, 0, 0, * 0, * ^GPIO, 21, 0, 0, * } * } * * Calling this function with index %2 or index %3 return %-ENOENT. If the * property does not contain any more values %-ENOENT is returned. The NULL * entry must be single integer and preferably contain value %0. * * Return: %0 on success, negative error code on failure. 
*/ int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode, const char *propname, size_t index, size_t num_args, struct fwnode_reference_args *args) { return acpi_fwnode_get_reference_args(fwnode, propname, NULL, num_args, index, args); } EXPORT_SYMBOL_GPL(__acpi_node_get_property_reference); static int acpi_data_prop_read_single(const struct acpi_device_data *data, const char *propname, enum dev_prop_type proptype, void *val) { const union acpi_object *obj; int ret = 0; if (proptype >= DEV_PROP_U8 && proptype <= DEV_PROP_U64) ret = acpi_data_get_property(data, propname, ACPI_TYPE_INTEGER, &obj); else if (proptype == DEV_PROP_STRING) ret = acpi_data_get_property(data, propname, ACPI_TYPE_STRING, &obj); if (ret) return ret; switch (proptype) { case DEV_PROP_U8: if (obj->integer.value > U8_MAX) return -EOVERFLOW; if (val) *(u8 *)val = obj->integer.value; break; case DEV_PROP_U16: if (obj->integer.value > U16_MAX) return -EOVERFLOW; if (val) *(u16 *)val = obj->integer.value; break; case DEV_PROP_U32: if (obj->integer.value > U32_MAX) return -EOVERFLOW; if (val) *(u32 *)val = obj->integer.value; break; case DEV_PROP_U64: if (val) *(u64 *)val = obj->integer.value; break; case DEV_PROP_STRING: if (val) *(char **)val = obj->string.pointer; return 1; default: return -EINVAL; } /* When no storage provided return number of available values */ return val ? 0 : 1; } #define acpi_copy_property_array_uint(items, val, nval) \ ({ \ typeof(items) __items = items; \ typeof(val) __val = val; \ typeof(nval) __nval = nval; \ size_t i; \ int ret = 0; \ \ for (i = 0; i < __nval; i++) { \ if (__items->type == ACPI_TYPE_BUFFER) { \ __val[i] = __items->buffer.pointer[i]; \ continue; \ } \ if (__items[i].type != ACPI_TYPE_INTEGER) { \ ret = -EPROTO; \ break; \ } \ if (__items[i].integer.value > _Generic(__val, \ u8 *: U8_MAX, \ u16 *: U16_MAX, \ u32 *: U32_MAX, \ u64 *: U64_MAX)) { \ ret = -EOVERFLOW; \ break; \ } \ \ __val[i] = __items[i].integer.value; \ } \ ret; \ }) static int acpi_copy_property_array_string(const union acpi_object *items, char **val, size_t nval) { int i; for (i = 0; i < nval; i++) { if (items[i].type != ACPI_TYPE_STRING) return -EPROTO; val[i] = items[i].string.pointer; } return nval; } static int acpi_data_prop_read(const struct acpi_device_data *data, const char *propname, enum dev_prop_type proptype, void *val, size_t nval) { const union acpi_object *obj; const union acpi_object *items; int ret; if (nval == 1 || !val) { ret = acpi_data_prop_read_single(data, propname, proptype, val); /* * The overflow error means that the property is there and it is * single-value, but its type does not match, so return. */ if (ret >= 0 || ret == -EOVERFLOW) return ret; /* * Reading this property as a single-value one failed, but its * value may still be represented as one-element array, so * continue. 
*/ } ret = acpi_data_get_property_array(data, propname, ACPI_TYPE_ANY, &obj); if (ret && proptype >= DEV_PROP_U8 && proptype <= DEV_PROP_U64) ret = acpi_data_get_property(data, propname, ACPI_TYPE_BUFFER, &obj); if (ret) return ret; if (!val) { if (obj->type == ACPI_TYPE_BUFFER) return obj->buffer.length; return obj->package.count; } switch (proptype) { case DEV_PROP_STRING: break; default: if (obj->type == ACPI_TYPE_BUFFER) { if (nval > obj->buffer.length) return -EOVERFLOW; } else { if (nval > obj->package.count) return -EOVERFLOW; } break; } if (obj->type == ACPI_TYPE_BUFFER) { if (proptype != DEV_PROP_U8) return -EPROTO; items = obj; } else { items = obj->package.elements; } switch (proptype) { case DEV_PROP_U8: ret = acpi_copy_property_array_uint(items, (u8 *)val, nval); break; case DEV_PROP_U16: ret = acpi_copy_property_array_uint(items, (u16 *)val, nval); break; case DEV_PROP_U32: ret = acpi_copy_property_array_uint(items, (u32 *)val, nval); break; case DEV_PROP_U64: ret = acpi_copy_property_array_uint(items, (u64 *)val, nval); break; case DEV_PROP_STRING: nval = min(nval, obj->package.count); if (nval == 0) return -ENODATA; ret = acpi_copy_property_array_string(items, (char **)val, nval); break; default: ret = -EINVAL; break; } return ret; } /** * acpi_node_prop_read - retrieve the value of an ACPI property with given name. * @fwnode: Firmware node to get the property from. * @propname: Name of the property. * @proptype: Expected property type. * @val: Location to store the property value (if not %NULL). * @nval: Size of the array pointed to by @val. * * If @val is %NULL, return the number of array elements comprising the value * of the property. Otherwise, read at most @nval values to the array at the * location pointed to by @val. */ static int acpi_node_prop_read(const struct fwnode_handle *fwnode, const char *propname, enum dev_prop_type proptype, void *val, size_t nval) { return acpi_data_prop_read(acpi_device_data_of_node(fwnode), propname, proptype, val, nval); } static int stop_on_next(struct acpi_device *adev, void *data) { struct acpi_device **ret_p = data; if (!*ret_p) { *ret_p = adev; return 1; } /* Skip until the "previous" object is found. */ if (*ret_p == adev) *ret_p = NULL; return 0; } /* * acpi_get_next_subnode - Return the next child node handle for a fwnode * @fwnode: Firmware node to find the next child node for. * @child: Handle to one of the device's child nodes or a null handle. */ static struct fwnode_handle * acpi_get_next_subnode(const struct fwnode_handle *fwnode, struct fwnode_handle *child) { struct acpi_device *adev = to_acpi_device_node(fwnode); if ((!child || is_acpi_device_node(child)) && adev) { struct acpi_device *child_adev = to_acpi_device_node(child); acpi_dev_for_each_child(adev, stop_on_next, &child_adev); if (child_adev) return acpi_fwnode_handle(child_adev); child = NULL; } if (!child || is_acpi_data_node(child)) { const struct acpi_data_node *data = to_acpi_data_node(fwnode); const struct list_head *head; struct list_head *next; struct acpi_data_node *dn; /* * We can have a combination of device and data nodes, e.g. with * hierarchical _DSD properties. Make sure the adev pointer is * restored before going through data nodes, otherwise we will * be looking for data_nodes below the last device found instead * of the common fwnode shared by device_nodes and data_nodes. 
*/ adev = to_acpi_device_node(fwnode); if (adev) head = &adev->data.subnodes; else if (data) head = &data->data.subnodes; else return NULL; if (list_empty(head)) return NULL; if (child) { dn = to_acpi_data_node(child); next = dn->sibling.next; if (next == head) return NULL; dn = list_entry(next, struct acpi_data_node, sibling); } else { dn = list_first_entry(head, struct acpi_data_node, sibling); } return &dn->fwnode; } return NULL; } /* * acpi_get_next_present_subnode - Return the next present child node handle * @fwnode: Firmware node to find the next child node for. * @child: Handle to one of the device's child nodes or a null handle. * * Like acpi_get_next_subnode(), but the device nodes returned by * acpi_get_next_present_subnode() are guaranteed to be present. * * Returns: The fwnode handle of the next present sub-node. */ static struct fwnode_handle * acpi_get_next_present_subnode(const struct fwnode_handle *fwnode, struct fwnode_handle *child) { do { child = acpi_get_next_subnode(fwnode, child); } while (is_acpi_device_node(child) && !acpi_device_is_present(to_acpi_device_node(child))); return child; } /** * acpi_node_get_parent - Return parent fwnode of this fwnode * @fwnode: Firmware node whose parent to get * * Returns parent node of an ACPI device or data firmware node or %NULL if * not available. */ static struct fwnode_handle * acpi_node_get_parent(const struct fwnode_handle *fwnode) { if (is_acpi_data_node(fwnode)) { /* All data nodes have parent pointer so just return that */ return to_acpi_data_node(fwnode)->parent; } if (is_acpi_device_node(fwnode)) { struct acpi_device *parent; parent = acpi_dev_parent(to_acpi_device_node(fwnode)); if (parent) return acpi_fwnode_handle(parent); } return NULL; } /* * Return true if the node is an ACPI graph node. Called on either ports * or endpoints. */ static bool is_acpi_graph_node(struct fwnode_handle *fwnode, const char *str) { unsigned int len = strlen(str); const char *name; if (!len || !is_acpi_data_node(fwnode)) return false; name = to_acpi_data_node(fwnode)->name; return (fwnode_property_present(fwnode, "reg") && !strncmp(name, str, len) && name[len] == '@') || fwnode_property_present(fwnode, str); } /** * acpi_graph_get_next_endpoint - Get next endpoint ACPI firmware node * @fwnode: Pointer to the parent firmware node * @prev: Previous endpoint node or %NULL to get the first * * Looks up next endpoint ACPI firmware node below a given @fwnode. Returns * %NULL if there is no next endpoint or in case of error. In case of success * the next endpoint is returned. */ static struct fwnode_handle *acpi_graph_get_next_endpoint( const struct fwnode_handle *fwnode, struct fwnode_handle *prev) { struct fwnode_handle *port = NULL; struct fwnode_handle *endpoint; if (!prev) { do { port = acpi_get_next_subnode(fwnode, port); /* * The names of the port nodes begin with "port@" * followed by the number of the port node and they also * have a "reg" property that also has the number of the * port node. For compatibility reasons a node is also * recognised as a port node from the "port" property. 
*/ if (is_acpi_graph_node(port, "port")) break; } while (port); } else { port = fwnode_get_parent(prev); } if (!port) return NULL; do { endpoint = acpi_get_next_subnode(port, prev); if (endpoint) break; prev = NULL; do { port = acpi_get_next_subnode(fwnode, port); } while (port && !is_acpi_graph_node(port, "port")); } while (port); /* * The names of the endpoint nodes begin with "endpoint@" followed by * the number of the endpoint node and they also have a "reg" property * that also has the number of the endpoint node. For compatibility * reasons a node is also recognised as an endpoint node from the * "endpoint" property. */ if (!is_acpi_graph_node(endpoint, "endpoint")) return NULL; return endpoint; } /** * acpi_graph_get_child_prop_value - Return a child with a given property value * @fwnode: device fwnode * @prop_name: The name of the property to look for * @val: the desired property value * * Return the port node corresponding to a given port number. Returns * the child node on success, NULL otherwise. */ static struct fwnode_handle *acpi_graph_get_child_prop_value( const struct fwnode_handle *fwnode, const char *prop_name, unsigned int val) { struct fwnode_handle *child; fwnode_for_each_child_node(fwnode, child) { u32 nr; if (fwnode_property_read_u32(child, prop_name, &nr)) continue; if (val == nr) return child; } return NULL; } /** * acpi_graph_get_remote_endpoint - Parses and returns remote end of an endpoint * @__fwnode: Endpoint firmware node pointing to a remote device * * Returns the remote endpoint corresponding to @__fwnode. NULL on error. */ static struct fwnode_handle * acpi_graph_get_remote_endpoint(const struct fwnode_handle *__fwnode) { struct fwnode_handle *fwnode; unsigned int port_nr, endpoint_nr; struct fwnode_reference_args args; int ret; memset(&args, 0, sizeof(args)); ret = acpi_node_get_property_reference(__fwnode, "remote-endpoint", 0, &args); if (ret) return NULL; /* Direct endpoint reference? */ if (!is_acpi_device_node(args.fwnode)) return args.nargs ? NULL : args.fwnode; /* * Always require two arguments with the reference: port and * endpoint indices. 
*/ if (args.nargs != 2) return NULL; fwnode = args.fwnode; port_nr = args.args[0]; endpoint_nr = args.args[1]; fwnode = acpi_graph_get_child_prop_value(fwnode, "port", port_nr); return acpi_graph_get_child_prop_value(fwnode, "endpoint", endpoint_nr); } static bool acpi_fwnode_device_is_available(const struct fwnode_handle *fwnode) { if (!is_acpi_device_node(fwnode)) return true; return acpi_device_is_present(to_acpi_device_node(fwnode)); } static const void * acpi_fwnode_device_get_match_data(const struct fwnode_handle *fwnode, const struct device *dev) { return acpi_device_get_match_data(dev); } static bool acpi_fwnode_device_dma_supported(const struct fwnode_handle *fwnode) { return acpi_dma_supported(to_acpi_device_node(fwnode)); } static enum dev_dma_attr acpi_fwnode_device_get_dma_attr(const struct fwnode_handle *fwnode) { return acpi_get_dma_attr(to_acpi_device_node(fwnode)); } static bool acpi_fwnode_property_present(const struct fwnode_handle *fwnode, const char *propname) { return !acpi_node_prop_get(fwnode, propname, NULL); } static int acpi_fwnode_property_read_int_array(const struct fwnode_handle *fwnode, const char *propname, unsigned int elem_size, void *val, size_t nval) { enum dev_prop_type type; switch (elem_size) { case sizeof(u8): type = DEV_PROP_U8; break; case sizeof(u16): type = DEV_PROP_U16; break; case sizeof(u32): type = DEV_PROP_U32; break; case sizeof(u64): type = DEV_PROP_U64; break; default: return -ENXIO; } return acpi_node_prop_read(fwnode, propname, type, val, nval); } static int acpi_fwnode_property_read_string_array(const struct fwnode_handle *fwnode, const char *propname, const char **val, size_t nval) { return acpi_node_prop_read(fwnode, propname, DEV_PROP_STRING, val, nval); } static const char *acpi_fwnode_get_name(const struct fwnode_handle *fwnode) { const struct acpi_device *adev; struct fwnode_handle *parent; /* Is this the root node? */ parent = fwnode_get_parent(fwnode); if (!parent) return "\\"; fwnode_handle_put(parent); if (is_acpi_data_node(fwnode)) { const struct acpi_data_node *dn = to_acpi_data_node(fwnode); return dn->name; } adev = to_acpi_device_node(fwnode); if (WARN_ON(!adev)) return NULL; return acpi_device_bid(adev); } static const char * acpi_fwnode_get_name_prefix(const struct fwnode_handle *fwnode) { struct fwnode_handle *parent; /* Is this the root node? */ parent = fwnode_get_parent(fwnode); if (!parent) return ""; /* Is this 2nd node from the root? */ parent = fwnode_get_next_parent(parent); if (!parent) return ""; fwnode_handle_put(parent); /* ACPI device or data node. 
*/ return "."; } static struct fwnode_handle * acpi_fwnode_get_parent(struct fwnode_handle *fwnode) { return acpi_node_get_parent(fwnode); } static int acpi_fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode, struct fwnode_endpoint *endpoint) { struct fwnode_handle *port_fwnode = fwnode_get_parent(fwnode); endpoint->local_fwnode = fwnode; if (fwnode_property_read_u32(port_fwnode, "reg", &endpoint->port)) fwnode_property_read_u32(port_fwnode, "port", &endpoint->port); if (fwnode_property_read_u32(fwnode, "reg", &endpoint->id)) fwnode_property_read_u32(fwnode, "endpoint", &endpoint->id); fwnode_handle_put(port_fwnode); return 0; } static int acpi_fwnode_irq_get(const struct fwnode_handle *fwnode, unsigned int index) { struct resource res; int ret; ret = acpi_irq_get(ACPI_HANDLE_FWNODE(fwnode), index, &res); if (ret) return ret; return res.start; } #define DECLARE_ACPI_FWNODE_OPS(ops) \ const struct fwnode_operations ops = { \ .device_is_available = acpi_fwnode_device_is_available, \ .device_get_match_data = acpi_fwnode_device_get_match_data, \ .device_dma_supported = \ acpi_fwnode_device_dma_supported, \ .device_get_dma_attr = acpi_fwnode_device_get_dma_attr, \ .property_present = acpi_fwnode_property_present, \ .property_read_bool = acpi_fwnode_property_present, \ .property_read_int_array = \ acpi_fwnode_property_read_int_array, \ .property_read_string_array = \ acpi_fwnode_property_read_string_array, \ .get_parent = acpi_node_get_parent, \ .get_next_child_node = acpi_get_next_present_subnode, \ .get_named_child_node = acpi_fwnode_get_named_child_node, \ .get_name = acpi_fwnode_get_name, \ .get_name_prefix = acpi_fwnode_get_name_prefix, \ .get_reference_args = acpi_fwnode_get_reference_args, \ .graph_get_next_endpoint = \ acpi_graph_get_next_endpoint, \ .graph_get_remote_endpoint = \ acpi_graph_get_remote_endpoint, \ .graph_get_port_parent = acpi_fwnode_get_parent, \ .graph_parse_endpoint = acpi_fwnode_graph_parse_endpoint, \ .irq_get = acpi_fwnode_irq_get, \ }; \ EXPORT_SYMBOL_GPL(ops) DECLARE_ACPI_FWNODE_OPS(acpi_device_fwnode_ops); DECLARE_ACPI_FWNODE_OPS(acpi_data_fwnode_ops); const struct fwnode_operations acpi_static_fwnode_ops; bool is_acpi_device_node(const struct fwnode_handle *fwnode) { return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &acpi_device_fwnode_ops; } EXPORT_SYMBOL(is_acpi_device_node); bool is_acpi_data_node(const struct fwnode_handle *fwnode) { return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &acpi_data_fwnode_ops; } EXPORT_SYMBOL(is_acpi_data_node); |
| 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __BEN_VLAN_802_1Q_INC__ #define __BEN_VLAN_802_1Q_INC__ #include <linux/if_vlan.h> #include <linux/u64_stats_sync.h> #include <linux/list.h> /* if this changes, algorithm will have to be reworked because this * depends on completely exhausting the VLAN identifier space. Thus * it gives constant time look-up, but in many cases it wastes memory. */ #define VLAN_GROUP_ARRAY_SPLIT_PARTS 8 #define VLAN_GROUP_ARRAY_PART_LEN (VLAN_N_VID/VLAN_GROUP_ARRAY_SPLIT_PARTS) enum vlan_protos { VLAN_PROTO_8021Q = 0, VLAN_PROTO_8021AD, VLAN_PROTO_NUM, }; struct vlan_group { unsigned int nr_vlan_devs; struct hlist_node hlist; /* linked list */ struct net_device **vlan_devices_arrays[VLAN_PROTO_NUM] [VLAN_GROUP_ARRAY_SPLIT_PARTS]; }; struct vlan_info { struct net_device *real_dev; /* The ethernet(like) device * the vlan is attached to. */ struct vlan_group grp; struct list_head vid_list; unsigned int nr_vids; bool auto_vid0; struct rcu_head rcu; }; static inline int vlan_proto_idx(__be16 proto) { switch (proto) { case htons(ETH_P_8021Q): return VLAN_PROTO_8021Q; case htons(ETH_P_8021AD): return VLAN_PROTO_8021AD; default: WARN(1, "invalid VLAN protocol: 0x%04x\n", ntohs(proto)); return -EINVAL; } } static inline struct net_device *__vlan_group_get_device(struct vlan_group *vg, unsigned int pidx, u16 vlan_id) { struct net_device **array; array = vg->vlan_devices_arrays[pidx] [vlan_id / VLAN_GROUP_ARRAY_PART_LEN]; /* paired with smp_wmb() in vlan_group_prealloc_vid() */ smp_rmb(); return array ? array[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] : NULL; } static inline struct net_device *vlan_group_get_device(struct vlan_group *vg, __be16 vlan_proto, u16 vlan_id) { int pidx = vlan_proto_idx(vlan_proto); if (pidx < 0) return NULL; return __vlan_group_get_device(vg, pidx, vlan_id); } static inline void vlan_group_set_device(struct vlan_group *vg, __be16 vlan_proto, u16 vlan_id, struct net_device *dev) { int pidx = vlan_proto_idx(vlan_proto); struct net_device **array; if (!vg || pidx < 0) return; array = vg->vlan_devices_arrays[pidx] [vlan_id / VLAN_GROUP_ARRAY_PART_LEN]; array[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] = dev; } /* Must be invoked with rcu_read_lock or with RTNL. 
*/ static inline struct net_device *vlan_find_dev(struct net_device *real_dev, __be16 vlan_proto, u16 vlan_id) { struct vlan_info *vlan_info = rcu_dereference_rtnl(real_dev->vlan_info); if (vlan_info) return vlan_group_get_device(&vlan_info->grp, vlan_proto, vlan_id); return NULL; } static inline netdev_features_t vlan_tnl_features(struct net_device *real_dev) { netdev_features_t ret; ret = real_dev->hw_enc_features & (NETIF_F_CSUM_MASK | NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL); if ((ret & NETIF_F_GSO_ENCAP_ALL) && (ret & NETIF_F_CSUM_MASK)) return (ret & ~NETIF_F_CSUM_MASK) | NETIF_F_HW_CSUM; return 0; } #define vlan_group_for_each_dev(grp, i, dev) \ for ((i) = 0; i < VLAN_PROTO_NUM * VLAN_N_VID; i++) \ if (((dev) = __vlan_group_get_device((grp), (i) / VLAN_N_VID, \ (i) % VLAN_N_VID))) int vlan_filter_push_vids(struct vlan_info *vlan_info, __be16 proto); void vlan_filter_drop_vids(struct vlan_info *vlan_info, __be16 proto); /* found in vlan_dev.c */ void vlan_dev_set_ingress_priority(const struct net_device *dev, u32 skb_prio, u16 vlan_prio); int vlan_dev_set_egress_priority(const struct net_device *dev, u32 skb_prio, u16 vlan_prio); void vlan_dev_free_egress_priority(const struct net_device *dev); int vlan_dev_change_flags(const struct net_device *dev, u32 flag, u32 mask); void vlan_dev_get_realdev_name(const struct net_device *dev, char *result, size_t size); int vlan_check_real_dev(struct net_device *real_dev, __be16 protocol, u16 vlan_id, struct netlink_ext_ack *extack); void vlan_setup(struct net_device *dev); int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack); void unregister_vlan_dev(struct net_device *dev, struct list_head *head); bool vlan_dev_inherit_address(struct net_device *dev, struct net_device *real_dev); static inline u32 vlan_get_ingress_priority(struct net_device *dev, u16 vlan_tci) { struct vlan_dev_priv *vip = vlan_dev_priv(dev); return vip->ingress_priority_map[(vlan_tci >> VLAN_PRIO_SHIFT) & 0x7]; } #ifdef CONFIG_VLAN_8021Q_GVRP int vlan_gvrp_request_join(const struct net_device *dev); void vlan_gvrp_request_leave(const struct net_device *dev); int vlan_gvrp_init_applicant(struct net_device *dev); void vlan_gvrp_uninit_applicant(struct net_device *dev); int vlan_gvrp_init(void); void vlan_gvrp_uninit(void); #else static inline int vlan_gvrp_request_join(const struct net_device *dev) { return 0; } static inline void vlan_gvrp_request_leave(const struct net_device *dev) {} static inline int vlan_gvrp_init_applicant(struct net_device *dev) { return 0; } static inline void vlan_gvrp_uninit_applicant(struct net_device *dev) {} static inline int vlan_gvrp_init(void) { return 0; } static inline void vlan_gvrp_uninit(void) {} #endif #ifdef CONFIG_VLAN_8021Q_MVRP int vlan_mvrp_request_join(const struct net_device *dev); void vlan_mvrp_request_leave(const struct net_device *dev); int vlan_mvrp_init_applicant(struct net_device *dev); void vlan_mvrp_uninit_applicant(struct net_device *dev); int vlan_mvrp_init(void); void vlan_mvrp_uninit(void); #else static inline int vlan_mvrp_request_join(const struct net_device *dev) { return 0; } static inline void vlan_mvrp_request_leave(const struct net_device *dev) {} static inline int vlan_mvrp_init_applicant(struct net_device *dev) { return 0; } static inline void vlan_mvrp_uninit_applicant(struct net_device *dev) {} static inline int vlan_mvrp_init(void) { return 0; } static inline void vlan_mvrp_uninit(void) {} #endif extern const char vlan_fullname[]; extern const char vlan_version[]; int 
vlan_netlink_init(void); void vlan_netlink_fini(void); extern struct rtnl_link_ops vlan_link_ops; extern unsigned int vlan_net_id; struct proc_dir_entry; struct vlan_net { /* /proc/net/vlan */ struct proc_dir_entry *proc_vlan_dir; /* /proc/net/vlan/config */ struct proc_dir_entry *proc_vlan_conf; /* Determines interface naming scheme. */ unsigned short name_type; }; #endif /* !(__BEN_VLAN_802_1Q_INC__) */
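As a usage illustration of the lookup helpers declared above, here is a minimal sketch of resolving a VLAN net_device from its real device under RCU. The wrapper function name is a hypothetical assumption; vlan_find_dev() and its locking requirement come from the header itself.

/*
 * Illustration only: a hypothetical caller resolving a VLAN device by
 * (protocol, VID) under rcu_read_lock(), as required by vlan_find_dev().
 */
static struct net_device *example_lookup(struct net_device *real_dev, u16 vid)
{
	struct net_device *vlan_dev;

	rcu_read_lock();
	vlan_dev = vlan_find_dev(real_dev, htons(ETH_P_8021Q), vid);
	if (vlan_dev)
		dev_hold(vlan_dev);	/* keep it alive past the RCU section */
	rcu_read_unlock();

	return vlan_dev;
}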
// SPDX-License-Identifier: GPL-2.0 /* * SafeSetID Linux Security Module * * Author: Micah Morton <mortonm@chromium.org> * * Copyright (C) 2018 The Chromium OS Authors. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, as * published by the Free Software Foundation. * */ #define pr_fmt(fmt) "SafeSetID: " fmt #include <linux/lsm_hooks.h> #include <linux/module.h> #include <linux/ptrace.h> #include <linux/sched/task_stack.h> #include <linux/security.h> #include <uapi/linux/lsm.h> #include "lsm.h" /* Flag indicating whether initialization completed */ int safesetid_initialized __initdata; struct setid_ruleset __rcu *safesetid_setuid_rules; struct setid_ruleset __rcu *safesetid_setgid_rules; /* Compute a decision for a transition from @src to @dst under @policy. */ enum sid_policy_type _setid_policy_lookup(struct setid_ruleset *policy, kid_t src, kid_t dst) { struct setid_rule *rule; enum sid_policy_type result = SIDPOL_DEFAULT; if (policy->type == UID) { hash_for_each_possible(policy->rules, rule, next, __kuid_val(src.uid)) { if (!uid_eq(rule->src_id.uid, src.uid)) continue; if (uid_eq(rule->dst_id.uid, dst.uid)) return SIDPOL_ALLOWED; result = SIDPOL_CONSTRAINED; } } else if (policy->type == GID) { hash_for_each_possible(policy->rules, rule, next, __kgid_val(src.gid)) { if (!gid_eq(rule->src_id.gid, src.gid)) continue; if (gid_eq(rule->dst_id.gid, dst.gid)) { return SIDPOL_ALLOWED; } result = SIDPOL_CONSTRAINED; } } else { /* Should not reach here, report the ID as constrained */ result = SIDPOL_CONSTRAINED; } return result; } /* * Compute a decision for a transition from @src to @dst under the active * policy. */ static enum sid_policy_type setid_policy_lookup(kid_t src, kid_t dst, enum setid_type new_type) { enum sid_policy_type result = SIDPOL_DEFAULT; struct setid_ruleset *pol; rcu_read_lock(); if (new_type == UID) pol = rcu_dereference(safesetid_setuid_rules); else if (new_type == GID) pol = rcu_dereference(safesetid_setgid_rules); else { /* Should not reach here */ result = SIDPOL_CONSTRAINED; rcu_read_unlock(); return result; } if (pol) { pol->type = new_type; result = _setid_policy_lookup(pol, src, dst); } rcu_read_unlock(); return result; } static int safesetid_security_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) { /* We're only interested in CAP_SETUID and CAP_SETGID.
*/ if (cap != CAP_SETUID && cap != CAP_SETGID) return 0; /* * If CAP_SET{U/G}ID is currently used for a setid or setgroups syscall, we * want to let it go through here; the real security check happens later, in * the task_fix_set{u/g}id or task_fix_setgroups hooks. */ if ((opts & CAP_OPT_INSETID) != 0) return 0; switch (cap) { case CAP_SETUID: /* * If no policy applies to this task, allow the use of CAP_SETUID for * other purposes. */ if (setid_policy_lookup((kid_t){.uid = cred->uid}, INVALID_ID, UID) == SIDPOL_DEFAULT) return 0; /* * Reject use of CAP_SETUID for functionality other than calling * set*uid() (e.g. setting up userns uid mappings). */ pr_warn("Operation requires CAP_SETUID, which is not available to UID %u for operations besides approved set*uid transitions\n", __kuid_val(cred->uid)); return -EPERM; case CAP_SETGID: /* * If no policy applies to this task, allow the use of CAP_SETGID for * other purposes. */ if (setid_policy_lookup((kid_t){.gid = cred->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT) return 0; /* * Reject use of CAP_SETGID for functionality other than calling * set*gid() (e.g. setting up userns gid mappings). */ pr_warn("Operation requires CAP_SETGID, which is not available to GID %u for operations besides approved set*gid transitions\n", __kgid_val(cred->gid)); return -EPERM; default: /* Error, the only capabilities we're checking for are CAP_SETUID/GID */ return 0; } return 0; } /* * Check whether a caller with old credentials @old is allowed to switch to * credentials that contain @new_id. */ static bool id_permitted_for_cred(const struct cred *old, kid_t new_id, enum setid_type new_type) { bool permitted; /* If our old creds already had this ID in it, it's fine. */ if (new_type == UID) { if (uid_eq(new_id.uid, old->uid) || uid_eq(new_id.uid, old->euid) || uid_eq(new_id.uid, old->suid)) return true; } else if (new_type == GID) { if (gid_eq(new_id.gid, old->gid) || gid_eq(new_id.gid, old->egid) || gid_eq(new_id.gid, old->sgid)) return true; } else /* Error, new_type is an invalid type */ return false; /* * Transitions to new UIDs require a check against the policy of the old * RUID. */ permitted = setid_policy_lookup((kid_t){.uid = old->uid}, new_id, new_type) != SIDPOL_CONSTRAINED; if (!permitted) { if (new_type == UID) { pr_warn("UID transition ((%d,%d,%d) -> %d) blocked\n", __kuid_val(old->uid), __kuid_val(old->euid), __kuid_val(old->suid), __kuid_val(new_id.uid)); } else if (new_type == GID) { pr_warn("GID transition ((%d,%d,%d) -> %d) blocked\n", __kgid_val(old->gid), __kgid_val(old->egid), __kgid_val(old->sgid), __kgid_val(new_id.gid)); } else /* Error, new_type is an invalid type */ return false; } return permitted; } /* * Check whether there is either an exception for user under old cred struct to * set*uid to user under new cred struct, or the UID transition is allowed (by * Linux set*uid rules) even without CAP_SETUID. */ static int safesetid_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { /* Do nothing if there are no setuid restrictions for our old RUID.
*/ if (setid_policy_lookup((kid_t){.uid = old->uid}, INVALID_ID, UID) == SIDPOL_DEFAULT) return 0; if (id_permitted_for_cred(old, (kid_t){.uid = new->uid}, UID) && id_permitted_for_cred(old, (kid_t){.uid = new->euid}, UID) && id_permitted_for_cred(old, (kid_t){.uid = new->suid}, UID) && id_permitted_for_cred(old, (kid_t){.uid = new->fsuid}, UID)) return 0; /* * Kill this process to avoid potential security vulnerabilities * that could arise from a missing allowlist entry preventing a * privileged process from dropping to a lesser-privileged one. */ force_sig(SIGKILL); return -EACCES; } static int safesetid_task_fix_setgid(struct cred *new, const struct cred *old, int flags) { /* Do nothing if there are no setgid restrictions for our old RGID. */ if (setid_policy_lookup((kid_t){.gid = old->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT) return 0; if (id_permitted_for_cred(old, (kid_t){.gid = new->gid}, GID) && id_permitted_for_cred(old, (kid_t){.gid = new->egid}, GID) && id_permitted_for_cred(old, (kid_t){.gid = new->sgid}, GID) && id_permitted_for_cred(old, (kid_t){.gid = new->fsgid}, GID)) return 0; /* * Kill this process to avoid potential security vulnerabilities * that could arise from a missing allowlist entry preventing a * privileged process from dropping to a lesser-privileged one. */ force_sig(SIGKILL); return -EACCES; } static int safesetid_task_fix_setgroups(struct cred *new, const struct cred *old) { int i; /* Do nothing if there are no setgid restrictions for our old RGID. */ if (setid_policy_lookup((kid_t){.gid = old->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT) return 0; get_group_info(new->group_info); for (i = 0; i < new->group_info->ngroups; i++) { if (!id_permitted_for_cred(old, (kid_t){.gid = new->group_info->gid[i]}, GID)) { put_group_info(new->group_info); /* * Kill this process to avoid potential security vulnerabilities * that could arise from a missing allowlist entry preventing a * privileged process from dropping to a lesser-privileged one. */ force_sig(SIGKILL); return -EACCES; } } put_group_info(new->group_info); return 0; } static const struct lsm_id safesetid_lsmid = { .name = "safesetid", .id = LSM_ID_SAFESETID, }; static struct security_hook_list safesetid_security_hooks[] = { LSM_HOOK_INIT(task_fix_setuid, safesetid_task_fix_setuid), LSM_HOOK_INIT(task_fix_setgid, safesetid_task_fix_setgid), LSM_HOOK_INIT(task_fix_setgroups, safesetid_task_fix_setgroups), LSM_HOOK_INIT(capable, safesetid_security_capable) }; static int __init safesetid_security_init(void) { security_add_hooks(safesetid_security_hooks, ARRAY_SIZE(safesetid_security_hooks), &safesetid_lsmid); /* Report that SafeSetID successfully initialized */ safesetid_initialized = 1; return 0; } DEFINE_LSM(safesetid_security_init) = { .id = &safesetid_lsmid, .init = safesetid_security_init, .initcall_fs = safesetid_init_securityfs, }; |
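To make the enforcement above concrete, a hedged user-space sketch of the two possible outcomes of the task_fix_setuid hook. The UIDs, the starting capabilities and the loaded allowlist entry are assumptions for the example; loading the policy itself happens through the companion policy.c / securityfs interface, which is not part of this listing.

/*
 * Illustration only: a hypothetical process starting as UID 1000 with
 * CAP_SETUID, under an assumed policy allowing only 1000 -> 1001.
 */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Transition covered by the example allowlist: permitted. */
	if (setuid(1001) != 0)
		perror("setuid(1001)");

	/* Transition not in the allowlist: the hook returns -EACCES and
	 * the process is killed with SIGKILL, so this line is unlikely
	 * to ever run. */
	if (setuid(0) != 0)
		perror("setuid(0)");

	return 0;
}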
| 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _XFRM_HASH_H #define _XFRM_HASH_H #include <linux/xfrm.h> #include <linux/socket.h> #include <linux/jhash.h> static inline unsigned int __xfrm4_addr_hash(const xfrm_address_t *addr) { return ntohl(addr->a4); } static inline unsigned int __xfrm6_addr_hash(const xfrm_address_t *addr) { return jhash2((__force u32 *)addr->a6, 4, 0); } static inline unsigned int __xfrm4_daddr_saddr_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr) { u32 sum = (__force u32)daddr->a4 + (__force u32)saddr->a4; return ntohl((__force __be32)sum); } static inline unsigned int __xfrm6_daddr_saddr_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr) { return __xfrm6_addr_hash(daddr) ^ __xfrm6_addr_hash(saddr); } static inline u32 __bits2mask32(__u8 bits) { u32 mask32 = 0xffffffff; if (bits == 0) mask32 = 0; else if (bits < 32) mask32 <<= (32 - bits); return mask32; } static inline unsigned int __xfrm4_dpref_spref_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr, __u8 dbits, __u8 sbits) { return jhash_2words(ntohl(daddr->a4) & __bits2mask32(dbits), ntohl(saddr->a4) & __bits2mask32(sbits), 0); } static inline unsigned int __xfrm6_pref_hash(const xfrm_address_t *addr, __u8 prefixlen) { unsigned int pdw; unsigned int pbi; u32 initval = 0; pdw = prefixlen >> 5; /* num of whole u32 in prefix */ pbi = prefixlen & 0x1f; /* num of bits in incomplete u32 in prefix */ if (pbi) { __be32 mask; mask = htonl((0xffffffff) << (32 - pbi)); initval = (__force u32)(addr->a6[pdw] & mask); } return jhash2((__force u32 *)addr->a6, pdw, initval); } static inline unsigned int __xfrm6_dpref_spref_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr, __u8 dbits, __u8 sbits) { return __xfrm6_pref_hash(daddr, dbits) ^ __xfrm6_pref_hash(saddr, sbits); } static inline unsigned int __xfrm_dst_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr, u32 reqid, unsigned short family, unsigned int hmask) { unsigned int h = family ^ reqid; switch (family) { case AF_INET: h ^= __xfrm4_daddr_saddr_hash(daddr, saddr); break; case AF_INET6: h ^= __xfrm6_daddr_saddr_hash(daddr, saddr); break; } return (h ^ (h >> 16)) & hmask; } static inline unsigned int __xfrm_src_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family, unsigned int hmask) { unsigned int h = family; switch (family) { case AF_INET: h ^= __xfrm4_daddr_saddr_hash(daddr, saddr); break; case AF_INET6: h ^= __xfrm6_daddr_saddr_hash(daddr, saddr); break; } return (h ^ (h >> 16)) & hmask; } static inline unsigned int __xfrm_spi_hash(const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family, unsigned int hmask) { unsigned int h = (__force u32)spi ^ proto; switch (family) { case AF_INET: h ^= __xfrm4_addr_hash(daddr); break; case AF_INET6: h ^= __xfrm6_addr_hash(daddr); 
break; } return (h ^ (h >> 10) ^ (h >> 20)) & hmask; } static inline unsigned int __xfrm_seq_hash(u32 seq, unsigned int hmask) { unsigned int h = seq; return (h ^ (h >> 10) ^ (h >> 20)) & hmask; } static inline unsigned int __idx_hash(u32 index, unsigned int hmask) { return (index ^ (index >> 8)) & hmask; } static inline unsigned int __sel_hash(const struct xfrm_selector *sel, unsigned short family, unsigned int hmask, u8 dbits, u8 sbits) { const xfrm_address_t *daddr = &sel->daddr; const xfrm_address_t *saddr = &sel->saddr; unsigned int h = 0; switch (family) { case AF_INET: if (sel->prefixlen_d < dbits || sel->prefixlen_s < sbits) return hmask + 1; h = __xfrm4_dpref_spref_hash(daddr, saddr, dbits, sbits); break; case AF_INET6: if (sel->prefixlen_d < dbits || sel->prefixlen_s < sbits) return hmask + 1; h = __xfrm6_dpref_spref_hash(daddr, saddr, dbits, sbits); break; } h ^= (h >> 16); return h & hmask; } static inline unsigned int __addr_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family, unsigned int hmask, u8 dbits, u8 sbits) { unsigned int h = 0; switch (family) { case AF_INET: h = __xfrm4_dpref_spref_hash(daddr, saddr, dbits, sbits); break; case AF_INET6: h = __xfrm6_dpref_spref_hash(daddr, saddr, dbits, sbits); break; } h ^= (h >> 16); return h & hmask; } struct hlist_head *xfrm_hash_alloc(unsigned int sz); void xfrm_hash_free(struct hlist_head *n, unsigned int sz); #endif /* _XFRM_HASH_H */ |
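A short worked sketch of the prefix-hash helpers defined above. The addresses, prefix lengths and wrapper name are example assumptions; __bits2mask32() and __xfrm4_dpref_spref_hash() come from the header itself.

/*
 * Illustration only: reducing an IPv4 (daddr, saddr) pair to a hash bucket.
 */
static unsigned int example_bucket(unsigned int hmask)
{
	xfrm_address_t daddr = { .a4 = htonl(0xc0a80105) };	/* 192.168.1.5 */
	xfrm_address_t saddr = { .a4 = htonl(0x0a000037) };	/* 10.0.0.55 */

	/*
	 * With dbits = 24 and sbits = 8 only the network parts survive the
	 * __bits2mask32() masking, so 192.168.1.0 and 10.0.0.0 get hashed.
	 */
	return __xfrm4_dpref_spref_hash(&daddr, &saddr, 24, 8) & hmask;
}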
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/brec.c * * Copyright (C) 2001 * Brad Boyer (flar@allandria.com) * (C) 2003 Ardis Technologies <roman@ardistech.com> * * Handle individual btree records */ #include "hfsplus_fs.h" #include "hfsplus_raw.h" static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd); static int hfs_brec_update_parent(struct hfs_find_data *fd); static int hfs_btree_inc_height(struct hfs_btree *); /* Get the length and offset of the given record in the given node */ u16 hfs_brec_lenoff(struct hfs_bnode *node, u16 rec, u16 *off) { __be16 retval[2]; u16 dataoff; dataoff = node->tree->node_size - (rec + 2) * 2; hfs_bnode_read(node, retval, dataoff, 4); *off = be16_to_cpu(retval[1]); return be16_to_cpu(retval[0]) - *off; } /* Get the length of the key from a keyed record */ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec) { u16 retval, recoff; if (node->type != HFS_NODE_INDEX && node->type != HFS_NODE_LEAF) return 0; if ((node->type == HFS_NODE_INDEX) && !(node->tree->attributes & HFS_TREE_VARIDXKEYS) && (node->tree->cnid != HFSPLUS_ATTR_CNID)) { retval = node->tree->max_key_len + 2; } else { recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2); if (!recoff) return 0; if (recoff > node->tree->node_size - 2) { pr_err("recoff %d too large\n", recoff); return 0; } retval = hfs_bnode_read_u16(node, recoff) + 2; if (retval > node->tree->max_key_len + 2) { pr_err("keylen %d too large\n", retval); retval =
0; } } return retval; } int hfs_brec_insert(struct hfs_find_data *fd, void *entry, u32 entry_len) { struct hfs_btree *tree; struct hfs_bnode *node, *new_node; int size, key_len, rec; int data_off, end_off; int idx_rec_off, data_rec_off, end_rec_off; __be32 cnid; tree = fd->tree; if (!fd->bnode) { if (!tree->root) hfs_btree_inc_height(tree); node = hfs_bnode_find(tree, tree->leaf_head); if (IS_ERR(node)) return PTR_ERR(node); fd->bnode = node; fd->record = -1; } new_node = NULL; key_len = be16_to_cpu(fd->search_key->key_len) + 2; again: /* new record idx and complete record size */ rec = fd->record + 1; size = key_len + entry_len; node = fd->bnode; hfs_bnode_dump(node); /* get last offset */ end_rec_off = tree->node_size - (node->num_recs + 1) * 2; end_off = hfs_bnode_read_u16(node, end_rec_off); end_rec_off -= 2; hfs_dbg("rec %d, size %d, end_off %d, end_rec_off %d\n", rec, size, end_off, end_rec_off); if (size > end_rec_off - end_off) { if (new_node) panic("not enough room!\n"); new_node = hfs_bnode_split(fd); if (IS_ERR(new_node)) return PTR_ERR(new_node); goto again; } if (node->type == HFS_NODE_LEAF) { tree->leaf_count++; mark_inode_dirty(tree->inode); } node->num_recs++; /* write new last offset */ hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs); hfs_bnode_write_u16(node, end_rec_off, end_off + size); data_off = end_off; data_rec_off = end_rec_off + 2; idx_rec_off = tree->node_size - (rec + 1) * 2; if (idx_rec_off == data_rec_off) goto skip; /* move all following entries */ do { data_off = hfs_bnode_read_u16(node, data_rec_off + 2); hfs_bnode_write_u16(node, data_rec_off, data_off + size); data_rec_off += 2; } while (data_rec_off < idx_rec_off); /* move data away */ hfs_bnode_move(node, data_off + size, data_off, end_off - data_off); skip: hfs_bnode_write(node, fd->search_key, data_off, key_len); hfs_bnode_write(node, entry, data_off + key_len, entry_len); hfs_bnode_dump(node); /* * update parent key if we inserted a key * at the start of the node and it is not the new node */ if (!rec && new_node != node) { hfs_bnode_read_key(node, fd->search_key, data_off + size); hfs_brec_update_parent(fd); } if (new_node) { hfs_bnode_put(fd->bnode); if (!new_node->parent) { hfs_btree_inc_height(tree); new_node->parent = tree->root; } fd->bnode = hfs_bnode_find(tree, new_node->parent); /* create index data entry */ cnid = cpu_to_be32(new_node->this); entry = &cnid; entry_len = sizeof(cnid); /* get index key */ hfs_bnode_read_key(new_node, fd->search_key, 14); __hfs_brec_find(fd->bnode, fd, hfs_find_rec_by_key); hfs_bnode_put(new_node); new_node = NULL; if ((tree->attributes & HFS_TREE_VARIDXKEYS) || (tree->cnid == HFSPLUS_ATTR_CNID)) key_len = be16_to_cpu(fd->search_key->key_len) + 2; else { fd->search_key->key_len = cpu_to_be16(tree->max_key_len); key_len = tree->max_key_len + 2; } goto again; } return 0; } int hfs_brec_remove(struct hfs_find_data *fd) { struct hfs_btree *tree; struct hfs_bnode *node, *parent; int end_off, rec_off, data_off, size; tree = fd->tree; node = fd->bnode; again: rec_off = tree->node_size - (fd->record + 2) * 2; end_off = tree->node_size - (node->num_recs + 1) * 2; if (node->type == HFS_NODE_LEAF) { tree->leaf_count--; mark_inode_dirty(tree->inode); } hfs_bnode_dump(node); hfs_dbg("rec %d, len %d\n", fd->record, fd->keylength + fd->entrylength); if (!--node->num_recs) { hfs_bnode_unlink(node); if (!node->parent) return 0; parent = hfs_bnode_find(tree, node->parent); if (IS_ERR(parent)) return PTR_ERR(parent); hfs_bnode_put(node); node 
= fd->bnode = parent; __hfs_brec_find(node, fd, hfs_find_rec_by_key); goto again; } hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs); if (rec_off == end_off) goto skip; size = fd->keylength + fd->entrylength; do { data_off = hfs_bnode_read_u16(node, rec_off); hfs_bnode_write_u16(node, rec_off + 2, data_off - size); rec_off -= 2; } while (rec_off >= end_off); /* fill hole */ hfs_bnode_move(node, fd->keyoffset, fd->keyoffset + size, data_off - fd->keyoffset - size); skip: hfs_bnode_dump(node); if (!fd->record) hfs_brec_update_parent(fd); return 0; } static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) { struct hfs_btree *tree; struct hfs_bnode *node, *new_node, *next_node; struct hfs_bnode_desc node_desc; int num_recs, new_rec_off, new_off, old_rec_off; int data_start, data_end, size; size_t rec_off_tbl_size; size_t node_desc_size = sizeof(struct hfs_bnode_desc); size_t rec_size = sizeof(__be16); tree = fd->tree; node = fd->bnode; new_node = hfs_bmap_alloc(tree); if (IS_ERR(new_node)) return new_node; hfs_bnode_get(node); hfs_dbg("this %d - new %d - next %d\n", node->this, new_node->this, node->next); new_node->next = node->next; new_node->prev = node->this; new_node->parent = node->parent; new_node->type = node->type; new_node->height = node->height; if (node->next) next_node = hfs_bnode_find(tree, node->next); else next_node = NULL; if (IS_ERR(next_node)) { hfs_bnode_put(node); hfs_bnode_put(new_node); return next_node; } rec_off_tbl_size = node->num_recs * rec_size; size = tree->node_size / 2; size -= node_desc_size; size -= rec_off_tbl_size; old_rec_off = tree->node_size - (2 * rec_size); num_recs = 1; for (;;) { data_start = hfs_bnode_read_u16(node, old_rec_off); if (data_start > size) break; old_rec_off -= rec_size; if (++num_recs < node->num_recs) continue; hfs_bnode_put(node); hfs_bnode_unlink(new_node); hfs_bnode_put(new_node); if (next_node) hfs_bnode_put(next_node); return ERR_PTR(-ENOSPC); } if (fd->record + 1 < num_recs) { /* new record is in the lower half, * so leave some more space there */ old_rec_off += rec_size; num_recs--; data_start = hfs_bnode_read_u16(node, old_rec_off); } else { hfs_bnode_put(node); hfs_bnode_get(new_node); fd->bnode = new_node; fd->record -= num_recs; fd->keyoffset -= data_start - node_desc_size; fd->entryoffset -= data_start - node_desc_size; } new_node->num_recs = node->num_recs - num_recs; node->num_recs = num_recs; new_rec_off = tree->node_size - rec_size; new_off = node_desc_size; size = data_start - new_off; num_recs = new_node->num_recs; data_end = data_start; while (num_recs) { hfs_bnode_write_u16(new_node, new_rec_off, new_off); old_rec_off -= rec_size; new_rec_off -= rec_size; data_end = hfs_bnode_read_u16(node, old_rec_off); new_off = data_end - size; num_recs--; } hfs_bnode_write_u16(new_node, new_rec_off, new_off); hfs_bnode_copy(new_node, node_desc_size, node, data_start, data_end - data_start); /* update new bnode header */ node_desc.next = cpu_to_be32(new_node->next); node_desc.prev = cpu_to_be32(new_node->prev); node_desc.type = new_node->type; node_desc.height = new_node->height; node_desc.num_recs = cpu_to_be16(new_node->num_recs); node_desc.reserved = 0; hfs_bnode_write(new_node, &node_desc, 0, sizeof(node_desc)); /* update previous bnode header */ node->next = new_node->this; hfs_bnode_read(node, &node_desc, 0, sizeof(node_desc)); node_desc.next = cpu_to_be32(node->next); node_desc.num_recs = cpu_to_be16(node->num_recs); hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc)); /* 
update next bnode header */ if (next_node) { next_node->prev = new_node->this; hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc)); node_desc.prev = cpu_to_be32(next_node->prev); hfs_bnode_write(next_node, &node_desc, 0, sizeof(node_desc)); hfs_bnode_put(next_node); } else if (node->this == tree->leaf_tail) { /* if there is no next node, this might be the new tail */ tree->leaf_tail = new_node->this; mark_inode_dirty(tree->inode); } hfs_bnode_dump(node); hfs_bnode_dump(new_node); hfs_bnode_put(node); return new_node; } static int hfs_brec_update_parent(struct hfs_find_data *fd) { struct hfs_btree *tree; struct hfs_bnode *node, *new_node, *parent; int newkeylen, diff; int rec, rec_off, end_rec_off; int start_off, end_off; tree = fd->tree; node = fd->bnode; new_node = NULL; if (!node->parent) return 0; again: parent = hfs_bnode_find(tree, node->parent); if (IS_ERR(parent)) return PTR_ERR(parent); __hfs_brec_find(parent, fd, hfs_find_rec_by_key); if (fd->record < 0) return -ENOENT; hfs_bnode_dump(parent); rec = fd->record; /* size difference between old and new key */ if ((tree->attributes & HFS_TREE_VARIDXKEYS) || (tree->cnid == HFSPLUS_ATTR_CNID)) newkeylen = hfs_bnode_read_u16(node, 14) + 2; else fd->keylength = newkeylen = tree->max_key_len + 2; hfs_dbg("rec %d, keylength %d, newkeylen %d\n", rec, fd->keylength, newkeylen); rec_off = tree->node_size - (rec + 2) * 2; end_rec_off = tree->node_size - (parent->num_recs + 1) * 2; diff = newkeylen - fd->keylength; if (!diff) goto skip; if (diff > 0) { end_off = hfs_bnode_read_u16(parent, end_rec_off); if (end_rec_off - end_off < diff) { hfs_dbg("splitting index node\n"); fd->bnode = parent; new_node = hfs_bnode_split(fd); if (IS_ERR(new_node)) return PTR_ERR(new_node); parent = fd->bnode; rec = fd->record; rec_off = tree->node_size - (rec + 2) * 2; end_rec_off = tree->node_size - (parent->num_recs + 1) * 2; } } end_off = start_off = hfs_bnode_read_u16(parent, rec_off); hfs_bnode_write_u16(parent, rec_off, start_off + diff); start_off -= 4; /* move previous cnid too */ while (rec_off > end_rec_off) { rec_off -= 2; end_off = hfs_bnode_read_u16(parent, rec_off); hfs_bnode_write_u16(parent, rec_off, end_off + diff); } hfs_bnode_move(parent, start_off + diff, start_off, end_off - start_off); skip: hfs_bnode_copy(parent, fd->keyoffset, node, 14, newkeylen); hfs_bnode_dump(parent); hfs_bnode_put(node); node = parent; if (new_node) { __be32 cnid; if (!new_node->parent) { hfs_btree_inc_height(tree); new_node->parent = tree->root; } fd->bnode = hfs_bnode_find(tree, new_node->parent); /* create index key and entry */ hfs_bnode_read_key(new_node, fd->search_key, 14); cnid = cpu_to_be32(new_node->this); __hfs_brec_find(fd->bnode, fd, hfs_find_rec_by_key); hfs_brec_insert(fd, &cnid, sizeof(cnid)); hfs_bnode_put(fd->bnode); hfs_bnode_put(new_node); if (!rec) { if (new_node == node) goto out; /* restore search_key */ hfs_bnode_read_key(node, fd->search_key, 14); } new_node = NULL; } if (!rec && node->parent) goto again; out: fd->bnode = node; return 0; } static int hfs_btree_inc_height(struct hfs_btree *tree) { struct hfs_bnode *node, *new_node; struct hfs_bnode_desc node_desc; int key_size, rec; __be32 cnid; node = NULL; if (tree->root) { node = hfs_bnode_find(tree, tree->root); if (IS_ERR(node)) return PTR_ERR(node); } new_node = hfs_bmap_alloc(tree); if (IS_ERR(new_node)) { hfs_bnode_put(node); return PTR_ERR(new_node); } tree->root = new_node->this; if (!tree->depth) { tree->leaf_head = tree->leaf_tail = new_node->this; new_node->type = 
HFS_NODE_LEAF; new_node->num_recs = 0; } else { new_node->type = HFS_NODE_INDEX; new_node->num_recs = 1; } new_node->parent = 0; new_node->next = 0; new_node->prev = 0; new_node->height = ++tree->depth; node_desc.next = cpu_to_be32(new_node->next); node_desc.prev = cpu_to_be32(new_node->prev); node_desc.type = new_node->type; node_desc.height = new_node->height; node_desc.num_recs = cpu_to_be16(new_node->num_recs); node_desc.reserved = 0; hfs_bnode_write(new_node, &node_desc, 0, sizeof(node_desc)); rec = tree->node_size - 2; hfs_bnode_write_u16(new_node, rec, 14); if (node) { /* insert old root idx into new root */ node->parent = tree->root; if (node->type == HFS_NODE_LEAF || tree->attributes & HFS_TREE_VARIDXKEYS || tree->cnid == HFSPLUS_ATTR_CNID) key_size = hfs_bnode_read_u16(node, 14) + 2; else key_size = tree->max_key_len + 2; hfs_bnode_copy(new_node, 14, node, 14, key_size); if (!(tree->attributes & HFS_TREE_VARIDXKEYS) && (tree->cnid != HFSPLUS_ATTR_CNID)) { key_size = tree->max_key_len + 2; hfs_bnode_write_u16(new_node, 14, tree->max_key_len); } cnid = cpu_to_be32(node->this); hfs_bnode_write(new_node, &cnid, 14 + key_size, 4); rec -= 2; hfs_bnode_write_u16(new_node, rec, 14 + key_size + 4); hfs_bnode_put(node); } hfs_bnode_put(new_node); mark_inode_dirty(tree->inode); return 0; } |
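Editorial note: the b-tree routines above keep the record offset table packed at the tail of each node, two bytes per slot, which is why expressions such as tree->node_size - (rec + 2) * 2 and tree->node_size - (parent->num_recs + 1) * 2 recur. The fragment below only restates that arithmetic with hypothetical helper names; it is an illustrative sketch, not part of the kernel source.

/* Sketch only: mirrors the offset arithmetic used by hfs_brec_update_parent()
 * and hfs_bnode_split() above. The sketch_* names are made up. */
static inline int sketch_rec_off(int node_size, int rec)
{
	/* position read as 'rec_off' for record 'rec' in the code above */
	return node_size - (rec + 2) * 2;
}

static inline int sketch_end_rec_off(int node_size, int num_recs)
{
	/* 'end_rec_off': lower bound of the offset-table walk above */
	return node_size - (num_recs + 1) * 2;
}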
// SPDX-License-Identifier: GPL-2.0 /* File: fs/ext4/xattr.h On-disk format of extended attributes for the ext4 filesystem. (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org> */ #include <linux/xattr.h> /* Magic value in attribute blocks */ #define EXT4_XATTR_MAGIC 0xEA020000 /* Maximum number of references to one attribute block */ #define EXT4_XATTR_REFCOUNT_MAX 1024 /* Name indexes */ #define EXT4_XATTR_INDEX_USER 1 #define EXT4_XATTR_INDEX_POSIX_ACL_ACCESS 2 #define EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT 3 #define EXT4_XATTR_INDEX_TRUSTED 4 #define EXT4_XATTR_INDEX_LUSTRE 5 #define EXT4_XATTR_INDEX_SECURITY 6 #define EXT4_XATTR_INDEX_SYSTEM 7 #define EXT4_XATTR_INDEX_RICHACL 8 #define EXT4_XATTR_INDEX_ENCRYPTION 9 #define EXT4_XATTR_INDEX_HURD 10 /* Reserved for Hurd */ struct ext4_xattr_header { __le32 h_magic; /* magic number for identification */ __le32 h_refcount; /* reference count */ __le32 h_blocks; /* number of disk blocks used */ __le32 h_hash; /* hash value of all attributes */ __le32 h_checksum; /* crc32c(uuid+blknum+xattrblock) */ __u32 h_reserved[3]; /* zero right now */ }; struct ext4_xattr_ibody_header { __le32 h_magic; /* magic number for identification */ }; struct ext4_xattr_entry { __u8 e_name_len; /* length of name */ __u8 e_name_index; /* attribute name index */ __le16 e_value_offs; /* offset in disk block of value */ __le32 e_value_inum; /* inode in which the value is stored */ __le32 e_value_size; /* size of attribute value */ __le32 e_hash; /* hash value of name and value */ char e_name[]; /* attribute name */ }; #define EXT4_XATTR_PAD_BITS 2 #define EXT4_XATTR_PAD (1<<EXT4_XATTR_PAD_BITS) #define EXT4_XATTR_ROUND (EXT4_XATTR_PAD-1) #define EXT4_XATTR_LEN(name_len) \ (((name_len) + EXT4_XATTR_ROUND + \ sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND) #define EXT4_XATTR_NEXT(entry) \ ((struct ext4_xattr_entry *)( \ (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len))) #define EXT4_XATTR_SIZE(size) \ (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND) #define IHDR(inode, raw_inode) \ ((struct ext4_xattr_ibody_header *) \ ((void *)raw_inode + \ EXT4_GOOD_OLD_INODE_SIZE + \ EXT4_I(inode)->i_extra_isize)) #define ITAIL(inode, raw_inode) \ ((void *)(raw_inode) + \ EXT4_SB((inode)->i_sb)->s_inode_size) #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) /* * XATTR_SIZE_MAX is currently 64k, but for the purposes of checking * for file system consistency errors, we use a somewhat bigger value. * This allows XATTR_SIZE_MAX to grow in the future, but by using this * instead of INT_MAX for certain consistency checks, we don't need to * worry about arithmetic overflows. 
(Actually XATTR_SIZE_MAX is * defined in include/uapi/linux/limits.h, so changing it is going * not going to be trivial....) */ #define EXT4_XATTR_SIZE_MAX (1 << 24) /* * The minimum size of EA value when you start storing it in an external inode * size of block - size of header - size of 1 entry - 4 null bytes */ #define EXT4_XATTR_MIN_LARGE_EA_SIZE(b) \ ((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4) #define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) #define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) #define BFIRST(bh) ENTRY(BHDR(bh)+1) #define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) #define EXT4_ZERO_XATTR_VALUE ((void *)-1) /* * If we want to add an xattr to the inode, we should make sure that * i_extra_isize is not 0 and that the inode size is not less than * EXT4_GOOD_OLD_INODE_SIZE + extra_isize + pad. * EXT4_GOOD_OLD_INODE_SIZE extra_isize header entry pad data * |--------------------------|------------|------|---------|---|-------| */ #define EXT4_INODE_HAS_XATTR_SPACE(inode) \ ((EXT4_I(inode)->i_extra_isize != 0) && \ (EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize + \ sizeof(struct ext4_xattr_ibody_header) + EXT4_XATTR_PAD <= \ EXT4_INODE_SIZE((inode)->i_sb))) struct ext4_xattr_info { const char *name; const void *value; size_t value_len; int name_index; int in_inode; }; struct ext4_xattr_search { struct ext4_xattr_entry *first; void *base; void *end; struct ext4_xattr_entry *here; int not_found; }; struct ext4_xattr_ibody_find { struct ext4_xattr_search s; struct ext4_iloc iloc; }; struct ext4_xattr_inode_array { unsigned int count; struct inode *inodes[] __counted_by(count); }; extern const struct xattr_handler ext4_xattr_user_handler; extern const struct xattr_handler ext4_xattr_trusted_handler; extern const struct xattr_handler ext4_xattr_security_handler; extern const struct xattr_handler ext4_xattr_hurd_handler; #define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c" /* * The EXT4_STATE_NO_EXPAND is overloaded and used for two purposes. * The first is to signal that there the inline xattrs and data are * taking up so much space that we might as well not keep trying to * expand it. The second is that xattr_sem is taken for writing, so * we shouldn't try to recurse into the inode expansion. For this * second case, we need to make sure that we take save and restore the * NO_EXPAND state flag appropriately. 
*/ static inline void ext4_write_lock_xattr(struct inode *inode, int *save) { down_write(&EXT4_I(inode)->xattr_sem); *save = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); } static inline int ext4_write_trylock_xattr(struct inode *inode, int *save) { if (down_write_trylock(&EXT4_I(inode)->xattr_sem) == 0) return 0; *save = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); return 1; } static inline void ext4_write_unlock_xattr(struct inode *inode, int *save) { if (*save == 0) ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); up_write(&EXT4_I(inode)->xattr_sem); } extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int); extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len, bool is_create, int *credits); extern int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, struct buffer_head *block_bh, size_t value_len, bool is_create); extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, struct ext4_xattr_inode_array **array, int extra_credits); extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array); extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); extern void ext4_evict_ea_inode(struct inode *inode); extern const struct xattr_handler * const ext4_xattr_handlers[]; extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is); extern int ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size); extern int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is); extern struct mb_cache *ext4_xattr_create_cache(void); extern void ext4_xattr_destroy_cache(struct mb_cache *); extern int __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, void *end, const char *function, unsigned int line); #define xattr_check_inode(inode, header, end) \ __xattr_check_inode((inode), (header), (end), __func__, __LINE__) #ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, const struct qstr *qstr); #else static inline int ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, const struct qstr *qstr) { return 0; } #endif #ifdef CONFIG_LOCKDEP extern void ext4_xattr_inode_set_class(struct inode *ea_inode); #else static inline void ext4_xattr_inode_set_class(struct inode *ea_inode) { } #endif extern int ext4_get_inode_usage(struct inode *inode, qsize_t *usage); |
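Editorial note: the padding macros above round every on-disk xattr entry and value up to a 4-byte boundary. The short userspace sketch below, which is not part of the header, reproduces that arithmetic with a local copy of the entry layout so the numbers can be checked directly; the sketch_* names are made up.

#include <stdint.h>
#include <stdio.h>

/* Local stand-in for struct ext4_xattr_entry above: same field widths,
 * so sizeof() is 16 bytes, which the length macro relies on. */
struct sketch_xattr_entry {
	uint8_t  e_name_len;
	uint8_t  e_name_index;
	uint16_t e_value_offs;
	uint32_t e_value_inum;
	uint32_t e_value_size;
	uint32_t e_hash;
	char     e_name[];
};

#define SKETCH_XATTR_ROUND 3	/* EXT4_XATTR_PAD - 1 */
#define SKETCH_XATTR_LEN(name_len) \
	(((name_len) + SKETCH_XATTR_ROUND + \
	  sizeof(struct sketch_xattr_entry)) & ~SKETCH_XATTR_ROUND)
#define SKETCH_XATTR_SIZE(size) \
	(((size) + SKETCH_XATTR_ROUND) & ~SKETCH_XATTR_ROUND)

int main(void)
{
	/* "user.foo": the name index encodes the "user." prefix, so only the
	 * 3-byte suffix is stored in e_name -> 16 + 3, rounded up to 20. */
	printf("EXT4_XATTR_LEN(3)   = %zu\n", SKETCH_XATTR_LEN(3));
	/* a 10-byte value occupies 12 bytes once padded */
	printf("EXT4_XATTR_SIZE(10) = %zu\n", (size_t)SKETCH_XATTR_SIZE(10));
	return 0;
}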
/* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/writeback.h */ #ifndef WRITEBACK_H #define WRITEBACK_H #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/fs.h> #include <linux/flex_proportions.h> #include <linux/backing-dev-defs.h> #include <linux/blk_types.h> #include <linux/folio_batch.h> struct bio; DECLARE_PER_CPU(int, dirty_throttle_leaks); /* * The global dirty threshold is normally equal to the global dirty limit, * except when the system suddenly allocates a lot of anonymous memory and * knocks down the global dirty threshold quickly, in which case the global * dirty limit will follow down slowly to prevent livelocking all dirtier tasks. */ #define DIRTY_SCOPE 8 struct backing_dev_info; /* * fs/fs-writeback.c */ enum writeback_sync_modes { WB_SYNC_NONE, /* Don't wait on anything */ WB_SYNC_ALL, /* Wait on every mapping */ }; /* * A control structure which tells the writeback code what to do. These are * always on the stack, and hence need no locking. They are always initialised * in a manner such that unspecified fields are set to zero. */ struct writeback_control { /* public fields that can be set and/or consumed by the caller: */ long nr_to_write; /* Write this many pages, and decrement this for each page written */ long pages_skipped; /* Pages which were not written */ /* * For a_ops->writepages(): if start or end are non-zero then this is * a hint that the filesystem need only write out the pages inside that * byterange. The byte at `end' is included in the writeout request. 
*/ loff_t range_start; loff_t range_end; enum writeback_sync_modes sync_mode; unsigned for_kupdate:1; /* A kupdate writeback */ unsigned for_background:1; /* A background writeback */ unsigned tagged_writepages:1; /* tag-and-write to avoid livelock */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ unsigned unpinned_netfs_wb:1; /* Cleared I_PINNING_NETFS_WB */ /* * When writeback IOs are bounced through async layers, only the * initial synchronous phase should be accounted towards inode * cgroup ownership arbitration to avoid confusion. Later stages * can set the following flag to disable the accounting. */ unsigned no_cgroup_owner:1; /* internal fields used by the ->writepages implementation: */ struct folio_batch fbatch; pgoff_t index; int saved_err; #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb; /* wb this writeback is issued under */ struct inode *inode; /* inode being written out */ /* foreign inode detection, see wbc_detach_inode() */ int wb_id; /* current wb id */ int wb_lcand_id; /* last foreign candidate wb id */ int wb_tcand_id; /* this foreign candidate wb id */ size_t wb_bytes; /* bytes written by current wb */ size_t wb_lcand_bytes; /* bytes written by last candidate */ size_t wb_tcand_bytes; /* bytes written by this candidate */ #endif }; static inline blk_opf_t wbc_to_write_flags(struct writeback_control *wbc) { blk_opf_t flags = 0; if (wbc->sync_mode == WB_SYNC_ALL) flags |= REQ_SYNC; else if (wbc->for_kupdate || wbc->for_background) flags |= REQ_BACKGROUND; return flags; } #ifdef CONFIG_CGROUP_WRITEBACK #define wbc_blkcg_css(wbc) \ ((wbc)->wb ? (wbc)->wb->blkcg_css : blkcg_root_css) #else #define wbc_blkcg_css(wbc) (blkcg_root_css) #endif /* CONFIG_CGROUP_WRITEBACK */ /* * A wb_domain represents a domain that wb's (bdi_writeback's) belong to * and are measured against each other in. There always is one global * domain, global_wb_domain, that every wb in the system is a member of. * This allows measuring the relative bandwidth of each wb to distribute * dirtyable memory accordingly. */ struct wb_domain { spinlock_t lock; /* * Scale the writeback cache size proportional to the relative * writeout speed. * * We do this by keeping a floating proportion between BDIs, based * on page writeback completions [end_page_writeback()]. Those * devices that write out pages fastest will get the larger share, * while the slower will get a smaller share. * * We use page writeout completions because we are interested in * getting rid of dirty pages. Having them written out is the * primary goal. * * We introduce a concept of time, a period over which we measure * these events, because demand can/will vary over time. The length * of this period itself is measured in page writeback completions. */ struct fprop_global completions; struct timer_list period_timer; /* timer for aging of completions */ unsigned long period_time; /* * The dirtyable memory and dirty threshold could be suddenly * knocked down by a large amount (eg. on the startup of KVM in a * swapless system). This may throw the system into deep dirty * exceeded state and throttle heavy/light dirtiers alike. To * retain good responsiveness, maintain global_dirty_limit for * tracking slowly down to the knocked down dirty threshold. * * Both fields are protected by ->lock. 
*/ unsigned long dirty_limit_tstamp; unsigned long dirty_limit; }; /** * wb_domain_size_changed - memory available to a wb_domain has changed * @dom: wb_domain of interest * * This function should be called when the amount of memory available to * @dom has changed. It resets @dom's dirty limit parameters to prevent * the past values which don't match the current configuration from skewing * dirty throttling. Without this, when memory size of a wb_domain is * greatly reduced, the dirty throttling logic may allow too many pages to * be dirtied leading to consecutive unnecessary OOMs and may get stuck in * that situation. */ static inline void wb_domain_size_changed(struct wb_domain *dom) { spin_lock(&dom->lock); dom->dirty_limit_tstamp = jiffies; dom->dirty_limit = 0; spin_unlock(&dom->lock); } /* * fs/fs-writeback.c */ struct bdi_writeback; void writeback_inodes_sb(struct super_block *, enum wb_reason reason); void writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason); void sync_inodes_sb(struct super_block *); void wakeup_flusher_threads(enum wb_reason reason); void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); void inode_io_list_del(struct inode *inode); static inline xa_mark_t wbc_to_tag(struct writeback_control *wbc) { if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) return PAGECACHE_TAG_TOWRITE; return PAGECACHE_TAG_DIRTY; } #ifdef CONFIG_CGROUP_WRITEBACK #include <linux/cgroup.h> #include <linux/bio.h> void __inode_attach_wb(struct inode *inode, struct folio *folio); void wbc_detach_inode(struct writeback_control *wbc); void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio, size_t bytes); int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, enum wb_reason reason, struct wb_completion *done); void cgroup_writeback_umount(struct super_block *sb); bool cleanup_offline_cgwb(struct bdi_writeback *wb); /** * inode_attach_wb - associate an inode with its wb * @inode: inode of interest * @folio: folio being dirtied (may be NULL) * * If @inode doesn't have its wb, associate it with the wb matching the * memcg of @folio or, if @folio is NULL, %current. May be called w/ or w/o * @inode->i_lock. */ static inline void inode_attach_wb(struct inode *inode, struct folio *folio) { if (!inode->i_wb) __inode_attach_wb(inode, folio); } /** * inode_detach_wb - disassociate an inode from its wb * @inode: inode of interest * * @inode is being freed. Detach from its wb. */ static inline void inode_detach_wb(struct inode *inode) { if (inode->i_wb) { WARN_ON_ONCE(!(inode_state_read_once(inode) & I_CLEAR)); wb_put(inode->i_wb); inode->i_wb = NULL; } } void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, struct inode *inode); /** * wbc_init_bio - writeback specific initializtion of bio * @wbc: writeback_control for the writeback in progress * @bio: bio to be initialized * * @bio is a part of the writeback in progress controlled by @wbc. Perform * writeback specific initialization. This is used to apply the cgroup * writeback context. Must be called after the bio has been associated with * a device. */ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) { /* * pageout() path doesn't attach @wbc to the inode being written * out. This is intentional as we don't want the function to block * behind a slow cgroup. 
Ultimately, we want pageout() to kick off * regular writeback instead of writing things out itself. */ if (wbc->wb) bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css); } void inode_switch_wbs_work_fn(struct work_struct *work); #else /* CONFIG_CGROUP_WRITEBACK */ static inline void inode_attach_wb(struct inode *inode, struct folio *folio) { } static inline void inode_detach_wb(struct inode *inode) { } static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, struct inode *inode) { } static inline void wbc_detach_inode(struct writeback_control *wbc) { } static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) { } static inline void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio, size_t bytes) { } static inline void cgroup_writeback_umount(struct super_block *sb) { } #endif /* CONFIG_CGROUP_WRITEBACK */ /* * mm/page-writeback.c */ /* consolidated parameters for balance_dirty_pages() and its subroutines */ struct dirty_throttle_control { #ifdef CONFIG_CGROUP_WRITEBACK struct wb_domain *dom; struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */ #endif struct bdi_writeback *wb; struct fprop_local_percpu *wb_completions; unsigned long avail; /* dirtyable */ unsigned long dirty; /* file_dirty + write + nfs */ unsigned long thresh; /* dirty threshold */ unsigned long bg_thresh; /* dirty background threshold */ unsigned long limit; /* hard dirty limit */ unsigned long wb_dirty; /* per-wb counterparts */ unsigned long wb_thresh; unsigned long wb_bg_thresh; unsigned long pos_ratio; bool freerun; bool dirty_exceeded; }; bool node_dirty_ok(struct pglist_data *pgdat); int wb_domain_init(struct wb_domain *dom, gfp_t gfp); #ifdef CONFIG_CGROUP_WRITEBACK void wb_domain_exit(struct wb_domain *dom); #endif extern struct wb_domain global_wb_domain; /* These are exported to sysctl. */ extern unsigned int dirty_writeback_interval; extern unsigned int dirty_expire_interval; void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); unsigned long cgwb_calc_thresh(struct bdi_writeback *wb); void wb_update_bandwidth(struct bdi_writeback *wb); /* Invoke balance dirty pages in async mode. */ #define BDP_ASYNC 0x0001 void balance_dirty_pages_ratelimited(struct address_space *mapping); int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, unsigned int flags); bool wb_over_bg_thresh(struct bdi_writeback *wb); struct folio *writeback_iter(struct address_space *mapping, struct writeback_control *wbc, struct folio *folio, int *error); int do_writepages(struct address_space *mapping, struct writeback_control *wbc); void writeback_set_ratelimit(void); void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end); bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio); bool folio_redirty_for_writepage(struct writeback_control *, struct folio *); bool redirty_page_for_writepage(struct writeback_control *, struct page *); void sb_mark_inode_writeback(struct inode *inode); void sb_clear_inode_writeback(struct inode *inode); /* * 4MB minimal write chunk size */ #define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10)) #endif /* WRITEBACK_H */ |
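Editorial note: the header above stresses that struct writeback_control lives on the caller's stack and that wbc_to_write_flags() and wbc_to_tag() derive behaviour from sync_mode. The fragment below is only an illustrative sketch of that pattern, not kernel code; sketch_sync_range() is a hypothetical caller.

/* Sketch only: a data-integrity caller setting up a wbc on the stack and
 * using the helpers declared in this header. */
static void sketch_sync_range(struct address_space *mapping,
			      loff_t start, loff_t end)
{
	struct writeback_control wbc = {
		.sync_mode	= WB_SYNC_ALL,	/* wait on every mapping */
		.nr_to_write	= LONG_MAX,
		.range_start	= start,
		.range_end	= end,		/* inclusive, per the comment above */
	};

	blk_opf_t opf = wbc_to_write_flags(&wbc);	/* REQ_SYNC for WB_SYNC_ALL */
	xa_mark_t tag = wbc_to_tag(&wbc);		/* PAGECACHE_TAG_TOWRITE here */

	(void)opf;
	(void)tag;
	do_writepages(mapping, &wbc);
}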
// SPDX-License-Identifier: GPL-2.0-or-later /* RxRPC key management * * 
Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * RxRPC keys should have a description of describing their purpose: * "afs@example.com" */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <crypto/skcipher.h> #include <linux/module.h> #include <linux/net.h> #include <linux/overflow.h> #include <linux/skbuff.h> #include <linux/key-type.h> #include <linux/ctype.h> #include <linux/slab.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include <keys/rxrpc-type.h> #include <keys/user-type.h> #include "ar-internal.h" static int rxrpc_preparse(struct key_preparsed_payload *); static void rxrpc_free_preparse(struct key_preparsed_payload *); static void rxrpc_destroy(struct key *); static void rxrpc_describe(const struct key *, struct seq_file *); static long rxrpc_read(const struct key *, char *, size_t); /* * rxrpc defined keys take an arbitrary string as the description and an * arbitrary blob of data as the payload */ struct key_type key_type_rxrpc = { .name = "rxrpc", .flags = KEY_TYPE_NET_DOMAIN, .preparse = rxrpc_preparse, .free_preparse = rxrpc_free_preparse, .instantiate = generic_key_instantiate, .destroy = rxrpc_destroy, .describe = rxrpc_describe, .read = rxrpc_read, }; EXPORT_SYMBOL(key_type_rxrpc); /* * parse an RxKAD type XDR format token * - the caller guarantees we have at least 4 words */ static int rxrpc_preparse_xdr_rxkad(struct key_preparsed_payload *prep, size_t datalen, const __be32 *xdr, unsigned int toklen) { struct rxrpc_key_token *token, **pptoken; time64_t expiry; size_t plen; u32 tktlen; _enter(",{%x,%x,%x,%x},%u", ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), toklen); if (toklen <= 8 * 4) return -EKEYREJECTED; tktlen = ntohl(xdr[7]); _debug("tktlen: %x", tktlen); if (tktlen > AFSTOKEN_RK_TIX_MAX) return -EKEYREJECTED; if (toklen < 8 * 4 + tktlen) return -EKEYREJECTED; plen = sizeof(*token) + sizeof(*token->kad) + tktlen; prep->quotalen += datalen + plen; plen -= sizeof(*token); token = kzalloc_obj(*token); if (!token) return -ENOMEM; token->kad = kzalloc(plen, GFP_KERNEL); if (!token->kad) { kfree(token); return -ENOMEM; } token->security_index = RXRPC_SECURITY_RXKAD; token->kad->ticket_len = tktlen; token->kad->vice_id = ntohl(xdr[0]); token->kad->kvno = ntohl(xdr[1]); token->kad->start = ntohl(xdr[4]); token->kad->expiry = ntohl(xdr[5]); token->kad->primary_flag = ntohl(xdr[6]); memcpy(&token->kad->session_key, &xdr[2], 8); memcpy(&token->kad->ticket, &xdr[8], tktlen); _debug("SCIX: %u", token->security_index); _debug("TLEN: %u", token->kad->ticket_len); _debug("EXPY: %x", token->kad->expiry); _debug("KVNO: %u", token->kad->kvno); _debug("PRIM: %u", token->kad->primary_flag); _debug("SKEY: %02x%02x%02x%02x%02x%02x%02x%02x", token->kad->session_key[0], token->kad->session_key[1], token->kad->session_key[2], token->kad->session_key[3], token->kad->session_key[4], token->kad->session_key[5], token->kad->session_key[6], token->kad->session_key[7]); if (token->kad->ticket_len >= 8) _debug("TCKT: %02x%02x%02x%02x%02x%02x%02x%02x", token->kad->ticket[0], token->kad->ticket[1], token->kad->ticket[2], token->kad->ticket[3], token->kad->ticket[4], token->kad->ticket[5], token->kad->ticket[6], token->kad->ticket[7]); /* count the number of tokens attached */ prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1); /* attach the data */ for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0]; *pptoken; pptoken = &(*pptoken)->next) continue; *pptoken = token; expiry = 
rxrpc_u32_to_time64(token->kad->expiry); if (expiry < prep->expiry) prep->expiry = expiry; _leave(" = 0"); return 0; } static u64 xdr_dec64(const __be32 *xdr) { return (u64)ntohl(xdr[0]) << 32 | (u64)ntohl(xdr[1]); } static time64_t rxrpc_s64_to_time64(s64 time_in_100ns) { bool neg = false; u64 tmp = time_in_100ns; if (time_in_100ns < 0) { tmp = -time_in_100ns; neg = true; } do_div(tmp, 10000000); return neg ? -tmp : tmp; } /* * Parse a YFS-RxGK type XDR format token * - the caller guarantees we have at least 4 words * * struct token_rxgk { * opr_time begintime; * opr_time endtime; * afs_int64 level; * afs_int64 lifetime; * afs_int64 bytelife; * afs_int64 enctype; * opaque key<>; * opaque ticket<>; * }; */ static int rxrpc_preparse_xdr_yfs_rxgk(struct key_preparsed_payload *prep, size_t datalen, const __be32 *xdr, unsigned int toklen) { struct rxrpc_key_token *token, **pptoken; time64_t expiry; size_t plen; const __be32 *ticket, *key; s64 tmp; size_t raw_keylen, raw_tktlen, keylen, tktlen; _enter(",{%x,%x,%x,%x},%x", ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), toklen); if (6 * 2 + 2 > toklen / 4) goto reject; key = xdr + (6 * 2 + 1); raw_keylen = ntohl(key[-1]); _debug("keylen: %zx", raw_keylen); if (raw_keylen > AFSTOKEN_GK_KEY_MAX) goto reject; keylen = round_up(raw_keylen, 4); if ((6 * 2 + 2) * 4 + keylen > toklen) goto reject; ticket = xdr + (6 * 2 + 1 + (keylen / 4) + 1); raw_tktlen = ntohl(ticket[-1]); _debug("tktlen: %zx", raw_tktlen); if (raw_tktlen > AFSTOKEN_GK_TOKEN_MAX) goto reject; tktlen = round_up(raw_tktlen, 4); if ((6 * 2 + 2) * 4 + keylen + tktlen != toklen) { kleave(" = -EKEYREJECTED [%zx!=%x, %zx,%zx]", (6 * 2 + 2) * 4 + keylen + tktlen, toklen, keylen, tktlen); goto reject; } plen = sizeof(*token) + sizeof(*token->rxgk) + tktlen + keylen; prep->quotalen += datalen + plen; plen -= sizeof(*token); token = kzalloc_obj(*token); if (!token) goto nomem; token->rxgk = kzalloc(struct_size_t(struct rxgk_key, _key, raw_keylen), GFP_KERNEL); if (!token->rxgk) goto nomem_token; token->security_index = RXRPC_SECURITY_YFS_RXGK; token->rxgk->begintime = xdr_dec64(xdr + 0 * 2); token->rxgk->endtime = xdr_dec64(xdr + 1 * 2); token->rxgk->level = tmp = xdr_dec64(xdr + 2 * 2); if (tmp < -1LL || tmp > RXRPC_SECURITY_ENCRYPT) goto reject_token; token->rxgk->lifetime = xdr_dec64(xdr + 3 * 2); token->rxgk->bytelife = xdr_dec64(xdr + 4 * 2); token->rxgk->enctype = tmp = xdr_dec64(xdr + 5 * 2); if (tmp < 0 || tmp > UINT_MAX) goto reject_token; token->rxgk->key.len = raw_keylen; token->rxgk->key.data = token->rxgk->_key; token->rxgk->ticket.len = raw_tktlen; if (token->rxgk->endtime != 0) { expiry = rxrpc_s64_to_time64(token->rxgk->endtime); if (expiry < 0) goto expired; if (expiry < prep->expiry) prep->expiry = expiry; } memcpy(token->rxgk->key.data, key, token->rxgk->key.len); /* Pad the ticket so that we can use it directly in XDR */ token->rxgk->ticket.data = kzalloc(tktlen, GFP_KERNEL); if (!token->rxgk->ticket.data) goto nomem_yrxgk; memcpy(token->rxgk->ticket.data, ticket, token->rxgk->ticket.len); _debug("SCIX: %u", token->security_index); _debug("EXPY: %llx", token->rxgk->endtime); _debug("LIFE: %llx", token->rxgk->lifetime); _debug("BYTE: %llx", token->rxgk->bytelife); _debug("ENC : %u", token->rxgk->enctype); _debug("LEVL: %u", token->rxgk->level); _debug("KLEN: %u", token->rxgk->key.len); _debug("TLEN: %u", token->rxgk->ticket.len); _debug("KEY0: %*phN", token->rxgk->key.len, token->rxgk->key.data); _debug("TICK: %*phN", min_t(u32, token->rxgk->ticket.len, 32), 
token->rxgk->ticket.data); /* count the number of tokens attached */ prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1); /* attach the data */ for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0]; *pptoken; pptoken = &(*pptoken)->next) continue; *pptoken = token; _leave(" = 0"); return 0; nomem_yrxgk: kfree(token->rxgk); nomem_token: kfree(token); nomem: return -ENOMEM; reject_token: kfree(token->rxgk); kfree(token); reject: return -EKEYREJECTED; expired: kfree(token->rxgk); kfree(token); return -EKEYEXPIRED; } /* * attempt to parse the data as the XDR format * - the caller guarantees we have more than 7 words */ static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep) { const __be32 *xdr = prep->data, *token, *p; const char *cp; unsigned int len, paddedlen, loop, ntoken, toklen, sec_ix; size_t datalen = prep->datalen; int ret, ret2; _enter(",{%x,%x,%x,%x},%zu", ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), prep->datalen); if (datalen > AFSTOKEN_LENGTH_MAX) goto not_xdr; /* XDR is an array of __be32's */ if (datalen & 3) goto not_xdr; /* the flags should be 0 (the setpag bit must be handled by * userspace) */ if (ntohl(*xdr++) != 0) goto not_xdr; datalen -= 4; /* check the cell name */ len = ntohl(*xdr++); if (len < 1 || len > AFSTOKEN_CELL_MAX) goto not_xdr; datalen -= 4; paddedlen = (len + 3) & ~3; if (paddedlen > datalen) goto not_xdr; cp = (const char *) xdr; for (loop = 0; loop < len; loop++) if (!isprint(cp[loop])) goto not_xdr; for (; loop < paddedlen; loop++) if (cp[loop]) goto not_xdr; _debug("cellname: [%u/%u] '%*.*s'", len, paddedlen, len, len, (const char *) xdr); datalen -= paddedlen; xdr += paddedlen >> 2; /* get the token count */ if (datalen < 12) goto not_xdr; ntoken = ntohl(*xdr++); datalen -= 4; _debug("ntoken: %x", ntoken); if (ntoken < 1 || ntoken > AFSTOKEN_MAX) goto not_xdr; /* check each token wrapper */ p = xdr; loop = ntoken; do { if (datalen < 8) goto not_xdr; toklen = ntohl(*p++); sec_ix = ntohl(*p); datalen -= 4; _debug("token: [%x/%zx] %x", toklen, datalen, sec_ix); paddedlen = (toklen + 3) & ~3; if (toklen < 20 || toklen > datalen || paddedlen > datalen) goto not_xdr; datalen -= paddedlen; p += paddedlen >> 2; } while (--loop > 0); _debug("remainder: %zu", datalen); if (datalen != 0) goto not_xdr; /* okay: we're going to assume it's valid XDR format * - we ignore the cellname, relying on the key to be correctly named */ ret = -EPROTONOSUPPORT; do { toklen = ntohl(*xdr++); token = xdr; xdr += (toklen + 3) / 4; sec_ix = ntohl(*token++); toklen -= 4; _debug("TOKEN type=%x len=%x", sec_ix, toklen); switch (sec_ix) { case RXRPC_SECURITY_RXKAD: ret2 = rxrpc_preparse_xdr_rxkad(prep, datalen, token, toklen); break; case RXRPC_SECURITY_YFS_RXGK: ret2 = rxrpc_preparse_xdr_yfs_rxgk(prep, datalen, token, toklen); break; default: ret2 = -EPROTONOSUPPORT; break; } switch (ret2) { case 0: ret = 0; break; case -EPROTONOSUPPORT: break; case -ENOPKG: if (ret != 0) ret = -ENOPKG; break; default: ret = ret2; goto error; } } while (--ntoken > 0); error: _leave(" = %d", ret); return ret; not_xdr: _leave(" = -EPROTO"); return -EPROTO; } /* * Preparse an rxrpc defined key. 
* * Data should be of the form: * OFFSET LEN CONTENT * 0 4 key interface version number * 4 2 security index (type) * 6 2 ticket length * 8 4 key expiry time (time_t) * 12 4 kvno * 16 8 session key * 24 [len] ticket * * if no data is provided, then a no-security key is made */ static int rxrpc_preparse(struct key_preparsed_payload *prep) { const struct rxrpc_key_data_v1 *v1; struct rxrpc_key_token *token, **pp; time64_t expiry; size_t plen; u32 kver; int ret; _enter("%zu", prep->datalen); /* handle a no-security key */ if (!prep->data && prep->datalen == 0) return 0; /* determine if the XDR payload format is being used */ if (prep->datalen > 7 * 4) { ret = rxrpc_preparse_xdr(prep); if (ret != -EPROTO) return ret; } /* get the key interface version number */ ret = -EINVAL; if (prep->datalen <= 4 || !prep->data) goto error; memcpy(&kver, prep->data, sizeof(kver)); prep->data += sizeof(kver); prep->datalen -= sizeof(kver); prep->quotalen = 0; _debug("KEY I/F VERSION: %u", kver); ret = -EKEYREJECTED; if (kver != 1) goto error; /* deal with a version 1 key */ ret = -EINVAL; if (prep->datalen < sizeof(*v1)) goto error; v1 = prep->data; if (prep->datalen != sizeof(*v1) + v1->ticket_length) goto error; _debug("SCIX: %u", v1->security_index); _debug("TLEN: %u", v1->ticket_length); _debug("EXPY: %x", v1->expiry); _debug("KVNO: %u", v1->kvno); _debug("SKEY: %02x%02x%02x%02x%02x%02x%02x%02x", v1->session_key[0], v1->session_key[1], v1->session_key[2], v1->session_key[3], v1->session_key[4], v1->session_key[5], v1->session_key[6], v1->session_key[7]); if (v1->ticket_length >= 8) _debug("TCKT: %02x%02x%02x%02x%02x%02x%02x%02x", v1->ticket[0], v1->ticket[1], v1->ticket[2], v1->ticket[3], v1->ticket[4], v1->ticket[5], v1->ticket[6], v1->ticket[7]); ret = -EPROTONOSUPPORT; if (v1->security_index != RXRPC_SECURITY_RXKAD) goto error; plen = sizeof(*token->kad) + v1->ticket_length; prep->quotalen += plen + sizeof(*token); ret = -ENOMEM; token = kzalloc_obj(*token); if (!token) goto error; token->kad = kzalloc(plen, GFP_KERNEL); if (!token->kad) goto error_free; token->security_index = RXRPC_SECURITY_RXKAD; token->kad->ticket_len = v1->ticket_length; token->kad->expiry = v1->expiry; token->kad->kvno = v1->kvno; memcpy(&token->kad->session_key, &v1->session_key, 8); memcpy(&token->kad->ticket, v1->ticket, v1->ticket_length); /* count the number of tokens attached */ prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1); /* attach the data */ pp = (struct rxrpc_key_token **)&prep->payload.data[0]; while (*pp) pp = &(*pp)->next; *pp = token; expiry = rxrpc_u32_to_time64(token->kad->expiry); if (expiry < prep->expiry) prep->expiry = expiry; token = NULL; ret = 0; error_free: kfree(token); error: return ret; } /* * Free token list. */ static void rxrpc_free_token_list(struct rxrpc_key_token *token) { struct rxrpc_key_token *next; for (; token; token = next) { next = token->next; switch (token->security_index) { case RXRPC_SECURITY_RXKAD: kfree(token->kad); break; case RXRPC_SECURITY_YFS_RXGK: kfree(token->rxgk->ticket.data); kfree(token->rxgk); break; default: pr_err("Unknown token type %x on rxrpc key\n", token->security_index); BUG(); } kfree(token); } } /* * Clean up preparse data. 
*/ static void rxrpc_free_preparse(struct key_preparsed_payload *prep) { rxrpc_free_token_list(prep->payload.data[0]); } /* * dispose of the data dangling from the corpse of a rxrpc key */ static void rxrpc_destroy(struct key *key) { rxrpc_free_token_list(key->payload.data[0]); } /* * describe the rxrpc key */ static void rxrpc_describe(const struct key *key, struct seq_file *m) { const struct rxrpc_key_token *token; const char *sep = ": "; seq_puts(m, key->description); for (token = key->payload.data[0]; token; token = token->next) { seq_puts(m, sep); switch (token->security_index) { case RXRPC_SECURITY_RXKAD: seq_puts(m, "ka"); break; case RXRPC_SECURITY_YFS_RXGK: seq_puts(m, "ygk"); break; default: /* we have a ticket we can't encode */ seq_printf(m, "%u", token->security_index); break; } sep = " "; } } /* * grab the security key for a socket */ int rxrpc_request_key(struct rxrpc_sock *rx, sockptr_t optval, int optlen) { struct key *key; char *description; _enter(""); if (optlen <= 0 || optlen > PAGE_SIZE - 1 || rx->key) return -EINVAL; description = memdup_sockptr_nul(optval, optlen); if (IS_ERR(description)) return PTR_ERR(description); key = request_key_net(&key_type_rxrpc, description, sock_net(&rx->sk), NULL); if (IS_ERR(key)) { kfree(description); _leave(" = %ld", PTR_ERR(key)); return PTR_ERR(key); } rx->key = key; kfree(description); _leave(" = 0 [key %x]", key->serial); return 0; } /* * generate a server data key */ int rxrpc_get_server_data_key(struct rxrpc_connection *conn, const void *session_key, time64_t expiry, u32 kvno) { const struct cred *cred = current_cred(); struct key *key; int ret; struct { u32 kver; struct rxrpc_key_data_v1 v1; } data; _enter(""); key = key_alloc(&key_type_rxrpc, "x", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, 0, KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(key)) { _leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key)); return -ENOMEM; } _debug("key %d", key_serial(key)); data.kver = 1; data.v1.security_index = RXRPC_SECURITY_RXKAD; data.v1.ticket_length = 0; data.v1.expiry = rxrpc_time64_to_u32(expiry); data.v1.kvno = 0; memcpy(&data.v1.session_key, session_key, sizeof(data.v1.session_key)); ret = key_instantiate_and_link(key, &data, sizeof(data), NULL, NULL); if (ret < 0) goto error; conn->key = key; _leave(" = 0 [%d]", key_serial(key)); return 0; error: key_revoke(key); key_put(key); _leave(" = -ENOMEM [ins %d]", ret); return -ENOMEM; } EXPORT_SYMBOL(rxrpc_get_server_data_key); /** * rxrpc_get_null_key - Generate a null RxRPC key * @keyname: The name to give the key. * * Generate a null RxRPC key that can be used to indicate anonymous security is * required for a particular domain. * * Return: The new key or a negative error code. 
*/ struct key *rxrpc_get_null_key(const char *keyname) { const struct cred *cred = current_cred(); struct key *key; int ret; key = key_alloc(&key_type_rxrpc, keyname, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(key)) return key; ret = key_instantiate_and_link(key, NULL, 0, NULL, NULL); if (ret < 0) { key_revoke(key); key_put(key); return ERR_PTR(ret); } return key; } EXPORT_SYMBOL(rxrpc_get_null_key); /* * read the contents of an rxrpc key * - this returns the result in XDR form */ static long rxrpc_read(const struct key *key, char *buffer, size_t buflen) { const struct rxrpc_key_token *token; size_t size; __be32 *xdr, *oldxdr; u32 cnlen, toksize, ntoks, tok, zero; u16 toksizes[AFSTOKEN_MAX]; _enter(""); /* we don't know what form we should return non-AFS keys in */ if (memcmp(key->description, "afs@", 4) != 0) return -EOPNOTSUPP; cnlen = strlen(key->description + 4); #define RND(X) (((X) + 3) & ~3) /* AFS keys we return in XDR form, so we need to work out the size of * the XDR */ size = 2 * 4; /* flags, cellname len */ size += RND(cnlen); /* cellname */ size += 1 * 4; /* token count */ ntoks = 0; for (token = key->payload.data[0]; token; token = token->next) { toksize = 4; /* sec index */ switch (token->security_index) { case RXRPC_SECURITY_RXKAD: toksize += 8 * 4; /* viceid, kvno, key*2, begin, * end, primary, tktlen */ if (!token->no_leak_key) toksize += RND(token->kad->ticket_len); break; case RXRPC_SECURITY_YFS_RXGK: toksize += 6 * 8 + 2 * 4; if (!token->no_leak_key) toksize += RND(token->rxgk->key.len); toksize += RND(token->rxgk->ticket.len); break; default: /* we have a ticket we can't encode */ pr_err("Unsupported key token type (%u)\n", token->security_index); return -ENOPKG; } _debug("token[%u]: toksize=%u", ntoks, toksize); if (WARN_ON(toksize > AFSTOKEN_LENGTH_MAX)) return -EIO; toksizes[ntoks++] = toksize; size += toksize + 4; /* each token has a length word */ } #undef RND if (!buffer || buflen < size) return size; xdr = (__be32 *)buffer; zero = 0; #define ENCODE(x) \ do { \ *xdr++ = htonl(x); \ } while(0) #define ENCODE_DATA(l, s) \ do { \ u32 _l = (l); \ ENCODE(l); \ memcpy(xdr, (s), _l); \ if (_l & 3) \ memcpy((u8 *)xdr + _l, &zero, 4 - (_l & 3)); \ xdr += (_l + 3) >> 2; \ } while(0) #define ENCODE_BYTES(l, s) \ do { \ u32 _l = (l); \ memcpy(xdr, (s), _l); \ if (_l & 3) \ memcpy((u8 *)xdr + _l, &zero, 4 - (_l & 3)); \ xdr += (_l + 3) >> 2; \ } while(0) #define ENCODE64(x) \ do { \ __be64 y = cpu_to_be64(x); \ memcpy(xdr, &y, 8); \ xdr += 8 >> 2; \ } while(0) #define ENCODE_STR(s) \ do { \ const char *_s = (s); \ ENCODE_DATA(strlen(_s), _s); \ } while(0) ENCODE(0); /* flags */ ENCODE_DATA(cnlen, key->description + 4); /* cellname */ ENCODE(ntoks); tok = 0; for (token = key->payload.data[0]; token; token = token->next) { toksize = toksizes[tok++]; ENCODE(toksize); oldxdr = xdr; ENCODE(token->security_index); switch (token->security_index) { case RXRPC_SECURITY_RXKAD: ENCODE(token->kad->vice_id); ENCODE(token->kad->kvno); ENCODE_BYTES(8, token->kad->session_key); ENCODE(token->kad->start); ENCODE(token->kad->expiry); ENCODE(token->kad->primary_flag); if (token->no_leak_key) ENCODE(0); else ENCODE_DATA(token->kad->ticket_len, token->kad->ticket); break; case RXRPC_SECURITY_YFS_RXGK: ENCODE64(token->rxgk->begintime); ENCODE64(token->rxgk->endtime); ENCODE64(token->rxgk->level); ENCODE64(token->rxgk->lifetime); ENCODE64(token->rxgk->bytelife); ENCODE64(token->rxgk->enctype); if (token->no_leak_key) ENCODE(0); else 
ENCODE_DATA(token->rxgk->key.len, token->rxgk->key.data); ENCODE_DATA(token->rxgk->ticket.len, token->rxgk->ticket.data); break; default: pr_err("Unsupported key token type (%u)\n", token->security_index); return -ENOPKG; } if (WARN_ON((unsigned long)xdr - (unsigned long)oldxdr != toksize)) return -EIO; } #undef ENCODE_STR #undef ENCODE_DATA #undef ENCODE64 #undef ENCODE if (WARN_ON(tok != ntoks)) return -EIO; if (WARN_ON((unsigned long)xdr - (unsigned long)buffer != size)) return -EIO; _leave(" = %zu", size); return size; } |
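Editorial note: the comment ahead of rxrpc_preparse() above documents the version-1, non-XDR payload layout (version word, security index, ticket length, expiry, kvno, session key, ticket). The userspace sketch below builds such a blob and loads it with add_key(2) from keyutils; the field values, the 8-byte dummy ticket and the cell name are invented, and RXRPC_SECURITY_RXKAD is assumed to be 2 as in the UAPI header. Link with -lkeyutils.

#include <keyutils.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Sketch of the documented layout: offsets 0/4/6/8/12/16/24. */
struct sketch_rxrpc_key_v1 {
	uint32_t kver;			/* offset 0: interface version, 1 */
	uint16_t security_index;	/* offset 4 */
	uint16_t ticket_length;		/* offset 6 */
	uint32_t expiry;		/* offset 8: time_t */
	uint32_t kvno;			/* offset 12 */
	uint8_t  session_key[8];	/* offset 16 */
	uint8_t  ticket[8];		/* offset 24: dummy ticket for illustration */
} __attribute__((packed));

int main(void)
{
	struct sketch_rxrpc_key_v1 blob = {
		.kver		= 1,
		.security_index	= 2,		/* assumed RXRPC_SECURITY_RXKAD */
		.ticket_length	= sizeof(blob.ticket),
		.expiry		= 0x7fffffff,	/* placeholder expiry */
		.kvno		= 1,
	};
	key_serial_t id;

	memset(blob.session_key, 0xaa, sizeof(blob.session_key));
	memset(blob.ticket, 0xbb, sizeof(blob.ticket));

	id = add_key("rxrpc", "afs@example.com", &blob, sizeof(blob),
		     KEY_SPEC_SESSION_KEYRING);
	if (id < 0) {
		perror("add_key");
		return 1;
	}
	printf("key %d\n", id);
	return 0;
}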
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_LIST_BL_H #define _LINUX_LIST_BL_H #include <linux/list.h> #include <linux/bit_spinlock.h> /* * Special version of lists, where head of the list has a lock in the lowest * bit. This is useful for scalable hash tables without increasing memory * footprint overhead. * * For modification operations, the 0 bit of hlist_bl_head->first * pointer must be set. * * With some small modifications, this can easily be adapted to store several * arbitrary bits (not just a single lock bit), if the need arises to store * some fast and compact auxiliary data. */ #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) #define LIST_BL_LOCKMASK 1UL #else #define LIST_BL_LOCKMASK 0UL #endif #ifdef CONFIG_DEBUG_LIST #define LIST_BL_BUG_ON(x) BUG_ON(x) #else #define LIST_BL_BUG_ON(x) #endif struct hlist_bl_head { struct hlist_bl_node *first; }; struct hlist_bl_node { struct hlist_bl_node *next, **pprev; }; #define INIT_HLIST_BL_HEAD(ptr) \ ((ptr)->first = NULL) static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h) { h->next = NULL; h->pprev = NULL; } #define hlist_bl_entry(ptr, type, member) container_of(ptr,type,member) static inline bool hlist_bl_unhashed(const struct hlist_bl_node *h) { return !h->pprev; } static inline struct hlist_bl_node *hlist_bl_first(struct hlist_bl_head *h) { return (struct hlist_bl_node *) ((unsigned long)h->first & ~LIST_BL_LOCKMASK); } static inline void hlist_bl_set_first(struct hlist_bl_head *h, struct hlist_bl_node *n) { LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) != LIST_BL_LOCKMASK); h->first = (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK); } static inline bool hlist_bl_empty(const struct hlist_bl_head *h) { return !((unsigned long)READ_ONCE(h->first) & ~LIST_BL_LOCKMASK); } static inline void hlist_bl_add_head(struct hlist_bl_node *n, struct hlist_bl_head *h) { struct hlist_bl_node *first = hlist_bl_first(h); n->next = first; if (first) first->pprev = &n->next; n->pprev = &h->first; hlist_bl_set_first(h, n); } static inline void hlist_bl_add_before(struct hlist_bl_node *n, struct hlist_bl_node *next) { struct hlist_bl_node **pprev = next->pprev; n->pprev = pprev; n->next = next; next->pprev = &n->next; /* pprev may be `first`, so be careful not to lose the lock bit */ WRITE_ONCE(*pprev, (struct hlist_bl_node *) ((uintptr_t)n | ((uintptr_t)*pprev & LIST_BL_LOCKMASK))); } static inline void hlist_bl_add_behind(struct hlist_bl_node *n, struct hlist_bl_node *prev) { n->next = prev->next; n->pprev = &prev->next; prev->next = n; if (n->next) n->next->pprev = &n->next; } static inline void __hlist_bl_del(struct hlist_bl_node *n) { struct hlist_bl_node *next = n->next; struct hlist_bl_node **pprev = n->pprev; LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); /* pprev may be 
`first`, so be careful not to lose the lock bit */ WRITE_ONCE(*pprev, (struct hlist_bl_node *) ((unsigned long)next | ((unsigned long)*pprev & LIST_BL_LOCKMASK))); if (next) next->pprev = pprev; } static inline void hlist_bl_del(struct hlist_bl_node *n) { __hlist_bl_del(n); n->next = LIST_POISON1; n->pprev = LIST_POISON2; } static inline void hlist_bl_del_init(struct hlist_bl_node *n) { if (!hlist_bl_unhashed(n)) { __hlist_bl_del(n); INIT_HLIST_BL_NODE(n); } } static inline void hlist_bl_lock(struct hlist_bl_head *b) __acquires(__bitlock(0, b)) { bit_spin_lock(0, (unsigned long *)b); } static inline void hlist_bl_unlock(struct hlist_bl_head *b) __releases(__bitlock(0, b)) { __bit_spin_unlock(0, (unsigned long *)b); } static inline bool hlist_bl_is_locked(struct hlist_bl_head *b) { return bit_spin_is_locked(0, (unsigned long *)b); } /** * hlist_bl_for_each_entry - iterate over list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * */ #define hlist_bl_for_each_entry(tpos, pos, head, member) \ for (pos = hlist_bl_first(head); \ pos && \ ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1;}); \ pos = pos->next) /** * hlist_bl_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @n: another &struct hlist_node to use as temporary storage * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_bl_for_each_entry_safe(tpos, pos, n, head, member) \ for (pos = hlist_bl_first(head); \ pos && ({ n = pos->next; 1; }) && \ ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1;}); \ pos = n) #endif |
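Editorial note: everything needed to use the bit-locked list is defined in the header above. The fragment below is a minimal, illustrative bucket built from those primitives; the sketch_* names are hypothetical and this is not part of the header.

/* Sketch only: one hash bucket protected by the lock bit in ->first. */
struct sketch_obj {
	unsigned long key;
	struct hlist_bl_node node;
};

static struct hlist_bl_head sketch_bucket;	/* zeroed, i.e. empty and unlocked */

static void sketch_insert(struct sketch_obj *obj)
{
	hlist_bl_lock(&sketch_bucket);		/* bit_spin_lock on bit 0 of ->first */
	hlist_bl_add_head(&obj->node, &sketch_bucket);
	hlist_bl_unlock(&sketch_bucket);
}

static struct sketch_obj *sketch_find(unsigned long key)
{
	struct sketch_obj *obj;
	struct hlist_bl_node *pos;

	hlist_bl_lock(&sketch_bucket);
	hlist_bl_for_each_entry(obj, pos, &sketch_bucket, node) {
		if (obj->key == key) {
			hlist_bl_unlock(&sketch_bucket);
			return obj;
		}
	}
	hlist_bl_unlock(&sketch_bucket);
	return NULL;
}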
905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 
1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PGTABLE_H #define _ASM_X86_PGTABLE_H #include <linux/mem_encrypt.h> #include <asm/page.h> #include <asm/pgtable_types.h> /* * Macro to mark a page protection value as UC- */ #define pgprot_noncached(prot) \ ((boot_cpu_data.x86 > 3) \ ? (__pgprot(pgprot_val(prot) | \ cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \ : (prot)) #ifndef __ASSEMBLER__ #include <linux/spinlock.h> #include <asm/x86_init.h> #include <asm/pkru.h> #include <asm/fpu/api.h> #include <asm/coco.h> #include <asm-generic/pgtable_uffd.h> #include <linux/page_table_check.h> extern pgd_t early_top_pgt[PTRS_PER_PGD]; bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd); struct seq_file; void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm); void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm, bool user); bool ptdump_walk_pgd_level_checkwx(void); #define ptdump_check_wx ptdump_walk_pgd_level_checkwx void ptdump_walk_user_pgd_level_checkwx(void); /* * Macros to add or remove encryption attribute */ #define pgprot_encrypted(prot) __pgprot(cc_mkenc(pgprot_val(prot))) #define pgprot_decrypted(prot) __pgprot(cc_mkdec(pgprot_val(prot))) #ifdef CONFIG_DEBUG_WX #define debug_checkwx_user() ptdump_walk_user_pgd_level_checkwx() #else #define debug_checkwx_user() do { } while (0) #endif extern spinlock_t pgd_lock; extern struct list_head pgd_list; extern struct mm_struct *pgd_page_get_mm(struct page *page); extern pmdval_t early_pmd_flags; #ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else /* !CONFIG_PARAVIRT_XXL */ #define set_pte(ptep, pte) native_set_pte(ptep, pte) #define set_pte_atomic(ptep, pte) \ native_set_pte_atomic(ptep, pte) #define set_pmd(pmdp, pmd) native_set_pmd(pmdp, pmd) #ifndef __PAGETABLE_P4D_FOLDED #define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd) #define pgd_clear(pgd) (pgtable_l5_enabled() ? 
native_pgd_clear(pgd) : 0) #endif #ifndef set_p4d # define set_p4d(p4dp, p4d) native_set_p4d(p4dp, p4d) #endif #ifndef __PAGETABLE_PUD_FOLDED #define p4d_clear(p4d) native_p4d_clear(p4d) #endif #ifndef set_pud # define set_pud(pudp, pud) native_set_pud(pudp, pud) #endif #ifndef __PAGETABLE_PUD_FOLDED #define pud_clear(pud) native_pud_clear(pud) #endif #define pte_clear(mm, addr, ptep) native_pte_clear(mm, addr, ptep) #define pmd_clear(pmd) native_pmd_clear(pmd) #define pgd_val(x) native_pgd_val(x) #define __pgd(x) native_make_pgd(x) #ifndef __PAGETABLE_P4D_FOLDED #define p4d_val(x) native_p4d_val(x) #define __p4d(x) native_make_p4d(x) #endif #ifndef __PAGETABLE_PUD_FOLDED #define pud_val(x) native_pud_val(x) #define __pud(x) native_make_pud(x) #endif #ifndef __PAGETABLE_PMD_FOLDED #define pmd_val(x) native_pmd_val(x) #define __pmd(x) native_make_pmd(x) #endif #define pte_val(x) native_pte_val(x) #define __pte(x) native_make_pte(x) #define arch_end_context_switch(prev) do {} while(0) static inline void arch_flush_lazy_mmu_mode(void) {} #endif /* CONFIG_PARAVIRT_XXL */ static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) { pmdval_t v = native_pmd_val(pmd); return native_make_pmd(v | set); } static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) { pmdval_t v = native_pmd_val(pmd); return native_make_pmd(v & ~clear); } static inline pud_t pud_set_flags(pud_t pud, pudval_t set) { pudval_t v = native_pud_val(pud); return native_make_pud(v | set); } static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) { pudval_t v = native_pud_val(pud); return native_make_pud(v & ~clear); } /* * The following only work if pte_present() is true. * Undefined behaviour if not.. */ static inline bool pte_dirty(pte_t pte) { return pte_flags(pte) & _PAGE_DIRTY_BITS; } static inline bool pte_shstk(pte_t pte) { return cpu_feature_enabled(X86_FEATURE_SHSTK) && (pte_flags(pte) & (_PAGE_RW | _PAGE_DIRTY)) == _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return pte_flags(pte) & _PAGE_ACCESSED; } static inline bool pte_decrypted(pte_t pte) { return cc_mkdec(pte_val(pte)) == pte_val(pte); } #define pmd_dirty pmd_dirty static inline bool pmd_dirty(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_DIRTY_BITS; } static inline bool pmd_shstk(pmd_t pmd) { return cpu_feature_enabled(X86_FEATURE_SHSTK) && (pmd_flags(pmd) & (_PAGE_RW | _PAGE_DIRTY | _PAGE_PSE)) == (_PAGE_DIRTY | _PAGE_PSE); } #define pmd_young pmd_young static inline int pmd_young(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_ACCESSED; } static inline bool pud_dirty(pud_t pud) { return pud_flags(pud) & _PAGE_DIRTY_BITS; } static inline int pud_young(pud_t pud) { return pud_flags(pud) & _PAGE_ACCESSED; } static inline bool pud_shstk(pud_t pud) { return cpu_feature_enabled(X86_FEATURE_SHSTK) && (pud_flags(pud) & (_PAGE_RW | _PAGE_DIRTY | _PAGE_PSE)) == (_PAGE_DIRTY | _PAGE_PSE); } static inline int pte_write(pte_t pte) { /* * Shadow stack pages are logically writable, but do not have * _PAGE_RW. Check for them separately from _PAGE_RW itself. */ return (pte_flags(pte) & _PAGE_RW) || pte_shstk(pte); } #define pmd_write pmd_write static inline int pmd_write(pmd_t pmd) { /* * Shadow stack pages are logically writable, but do not have * _PAGE_RW. Check for them separately from _PAGE_RW itself. 
*/ return (pmd_flags(pmd) & _PAGE_RW) || pmd_shstk(pmd); } #define pud_write pud_write static inline int pud_write(pud_t pud) { return pud_flags(pud) & _PAGE_RW; } static inline int pte_huge(pte_t pte) { return pte_flags(pte) & _PAGE_PSE; } static inline int pte_global(pte_t pte) { return pte_flags(pte) & _PAGE_GLOBAL; } static inline int pte_exec(pte_t pte) { return !(pte_flags(pte) & _PAGE_NX); } static inline int pte_special(pte_t pte) { return pte_flags(pte) & _PAGE_SPECIAL; } /* Entries that were set to PROT_NONE are inverted */ static inline u64 protnone_mask(u64 val); #define PFN_PTE_SHIFT PAGE_SHIFT static inline unsigned long pte_pfn(pte_t pte) { phys_addr_t pfn = pte_val(pte); pfn ^= protnone_mask(pfn); return (pfn & PTE_PFN_MASK) >> PAGE_SHIFT; } static inline unsigned long pmd_pfn(pmd_t pmd) { phys_addr_t pfn = pmd_val(pmd); pfn ^= protnone_mask(pfn); return (pfn & pmd_pfn_mask(pmd)) >> PAGE_SHIFT; } #define pud_pfn pud_pfn static inline unsigned long pud_pfn(pud_t pud) { phys_addr_t pfn = pud_val(pud); pfn ^= protnone_mask(pfn); return (pfn & pud_pfn_mask(pud)) >> PAGE_SHIFT; } static inline unsigned long p4d_pfn(p4d_t p4d) { return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT; } static inline unsigned long pgd_pfn(pgd_t pgd) { return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT; } #define pte_page(pte) pfn_to_page(pte_pfn(pte)) #define pmd_leaf pmd_leaf static inline bool pmd_leaf(pmd_t pte) { return pmd_flags(pte) & _PAGE_PSE; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline int pmd_trans_huge(pmd_t pmd) { return (pmd_val(pmd) & _PAGE_PSE) == _PAGE_PSE; } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static inline int pud_trans_huge(pud_t pud) { return (pud_val(pud) & _PAGE_PSE) == _PAGE_PSE; } #endif #define has_transparent_hugepage has_transparent_hugepage static inline int has_transparent_hugepage(void) { return boot_cpu_has(X86_FEATURE_PSE); } #ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP static inline bool pmd_special(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_SPECIAL; } static inline pmd_t pmd_mkspecial(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_SPECIAL); } #endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */ #ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP static inline bool pud_special(pud_t pud) { return pud_flags(pud) & _PAGE_SPECIAL; } static inline pud_t pud_mkspecial(pud_t pud) { return pud_set_flags(pud, _PAGE_SPECIAL); } #endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline pte_t pte_set_flags(pte_t pte, pteval_t set) { pteval_t v = native_pte_val(pte); return native_make_pte(v | set); } static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear) { pteval_t v = native_pte_val(pte); return native_make_pte(v & ~clear); } /* * Write protection operations can result in Dirty=1,Write=0 PTEs. But in the * case of X86_FEATURE_USER_SHSTK, these PTEs denote shadow stack memory. So * when creating dirty, write-protected memory, a software bit is used: * _PAGE_BIT_SAVED_DIRTY. The following functions take a PTE and transition the * Dirty bit to SavedDirty, and vice-vesra. * * This shifting is only done if needed. In the case of shifting * Dirty->SavedDirty, the condition is if the PTE is Write=0. In the case of * shifting SavedDirty->Dirty, the condition is Write=1. 
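 *
 * As an illustrative example (not in the original comment): starting from a
 * PTE with Write=0, Dirty=1, SavedDirty=0, mksaveddirty_shift() computes
 * cond == 1, so it copies Dirty into SavedDirty and clears Dirty, yielding
 * Write=0, Dirty=0, SavedDirty=1. For a Write=1 PTE, cond == 0 and the value
 * passes through unchanged. clear_saveddirty_shift() performs the reverse
 * transition for Write=1 PTEs.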
*/ static inline pgprotval_t mksaveddirty_shift(pgprotval_t v) { pgprotval_t cond = (~v >> _PAGE_BIT_RW) & 1; v |= ((v >> _PAGE_BIT_DIRTY) & cond) << _PAGE_BIT_SAVED_DIRTY; v &= ~(cond << _PAGE_BIT_DIRTY); return v; } static inline pgprotval_t clear_saveddirty_shift(pgprotval_t v) { pgprotval_t cond = (v >> _PAGE_BIT_RW) & 1; v |= ((v >> _PAGE_BIT_SAVED_DIRTY) & cond) << _PAGE_BIT_DIRTY; v &= ~(cond << _PAGE_BIT_SAVED_DIRTY); return v; } static inline pte_t pte_mksaveddirty(pte_t pte) { pteval_t v = native_pte_val(pte); v = mksaveddirty_shift(v); return native_make_pte(v); } static inline pte_t pte_clear_saveddirty(pte_t pte) { pteval_t v = native_pte_val(pte); v = clear_saveddirty_shift(v); return native_make_pte(v); } static inline pte_t pte_wrprotect(pte_t pte) { pte = pte_clear_flags(pte, _PAGE_RW); /* * Blindly clearing _PAGE_RW might accidentally create * a shadow stack PTE (Write=0,Dirty=1). Move the hardware * dirty value to the software bit, if present. */ return pte_mksaveddirty(pte); } #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP static inline int pte_uffd_wp(pte_t pte) { return pte_flags(pte) & _PAGE_UFFD_WP; } static inline pte_t pte_mkuffd_wp(pte_t pte) { return pte_wrprotect(pte_set_flags(pte, _PAGE_UFFD_WP)); } static inline pte_t pte_clear_uffd_wp(pte_t pte) { return pte_clear_flags(pte, _PAGE_UFFD_WP); } #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ static inline pte_t pte_mkclean(pte_t pte) { return pte_clear_flags(pte, _PAGE_DIRTY_BITS); } static inline pte_t pte_mkold(pte_t pte) { return pte_clear_flags(pte, _PAGE_ACCESSED); } static inline pte_t pte_mkexec(pte_t pte) { return pte_clear_flags(pte, _PAGE_NX); } static inline pte_t pte_mkdirty(pte_t pte) { pte = pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); return pte_mksaveddirty(pte); } static inline pte_t pte_mkwrite_shstk(pte_t pte) { pte = pte_clear_flags(pte, _PAGE_RW); return pte_set_flags(pte, _PAGE_DIRTY); } static inline pte_t pte_mkyoung(pte_t pte) { return pte_set_flags(pte, _PAGE_ACCESSED); } static inline pte_t pte_mkwrite_novma(pte_t pte) { return pte_set_flags(pte, _PAGE_RW); } struct vm_area_struct; pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma); #define pte_mkwrite pte_mkwrite static inline pte_t pte_mkhuge(pte_t pte) { return pte_set_flags(pte, _PAGE_PSE); } static inline pte_t pte_clrhuge(pte_t pte) { return pte_clear_flags(pte, _PAGE_PSE); } static inline pte_t pte_mkglobal(pte_t pte) { return pte_set_flags(pte, _PAGE_GLOBAL); } static inline pte_t pte_clrglobal(pte_t pte) { return pte_clear_flags(pte, _PAGE_GLOBAL); } static inline pte_t pte_mkspecial(pte_t pte) { return pte_set_flags(pte, _PAGE_SPECIAL); } /* See comments above mksaveddirty_shift() */ static inline pmd_t pmd_mksaveddirty(pmd_t pmd) { pmdval_t v = native_pmd_val(pmd); v = mksaveddirty_shift(v); return native_make_pmd(v); } /* See comments above mksaveddirty_shift() */ static inline pmd_t pmd_clear_saveddirty(pmd_t pmd) { pmdval_t v = native_pmd_val(pmd); v = clear_saveddirty_shift(v); return native_make_pmd(v); } static inline pmd_t pmd_wrprotect(pmd_t pmd) { pmd = pmd_clear_flags(pmd, _PAGE_RW); /* * Blindly clearing _PAGE_RW might accidentally create * a shadow stack PMD (RW=0, Dirty=1). Move the hardware * dirty value to the software bit. 
*/ return pmd_mksaveddirty(pmd); } #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP static inline int pmd_uffd_wp(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_UFFD_WP; } static inline pmd_t pmd_mkuffd_wp(pmd_t pmd) { return pmd_wrprotect(pmd_set_flags(pmd, _PAGE_UFFD_WP)); } static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd) { return pmd_clear_flags(pmd, _PAGE_UFFD_WP); } #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ static inline pmd_t pmd_mkold(pmd_t pmd) { return pmd_clear_flags(pmd, _PAGE_ACCESSED); } static inline pmd_t pmd_mkclean(pmd_t pmd) { return pmd_clear_flags(pmd, _PAGE_DIRTY_BITS); } static inline pmd_t pmd_mkdirty(pmd_t pmd) { pmd = pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); return pmd_mksaveddirty(pmd); } static inline pmd_t pmd_mkwrite_shstk(pmd_t pmd) { pmd = pmd_clear_flags(pmd, _PAGE_RW); return pmd_set_flags(pmd, _PAGE_DIRTY); } static inline pmd_t pmd_mkhuge(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_PSE); } static inline pmd_t pmd_mkyoung(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_ACCESSED); } static inline pmd_t pmd_mkwrite_novma(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_RW); } pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); #define pmd_mkwrite pmd_mkwrite /* See comments above mksaveddirty_shift() */ static inline pud_t pud_mksaveddirty(pud_t pud) { pudval_t v = native_pud_val(pud); v = mksaveddirty_shift(v); return native_make_pud(v); } /* See comments above mksaveddirty_shift() */ static inline pud_t pud_clear_saveddirty(pud_t pud) { pudval_t v = native_pud_val(pud); v = clear_saveddirty_shift(v); return native_make_pud(v); } static inline pud_t pud_mkold(pud_t pud) { return pud_clear_flags(pud, _PAGE_ACCESSED); } static inline pud_t pud_mkclean(pud_t pud) { return pud_clear_flags(pud, _PAGE_DIRTY_BITS); } static inline pud_t pud_wrprotect(pud_t pud) { pud = pud_clear_flags(pud, _PAGE_RW); /* * Blindly clearing _PAGE_RW might accidentally create * a shadow stack PUD (RW=0, Dirty=1). Move the hardware * dirty value to the software bit. */ return pud_mksaveddirty(pud); } static inline pud_t pud_mkdirty(pud_t pud) { pud = pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); return pud_mksaveddirty(pud); } static inline pud_t pud_mkhuge(pud_t pud) { return pud_set_flags(pud, _PAGE_PSE); } static inline pud_t pud_mkyoung(pud_t pud) { return pud_set_flags(pud, _PAGE_ACCESSED); } static inline pud_t pud_mkwrite(pud_t pud) { pud = pud_set_flags(pud, _PAGE_RW); return pud_clear_saveddirty(pud); } #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline int pte_soft_dirty(pte_t pte) { return pte_flags(pte) & _PAGE_SOFT_DIRTY; } static inline int pmd_soft_dirty(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_SOFT_DIRTY; } static inline int pud_soft_dirty(pud_t pud) { return pud_flags(pud) & _PAGE_SOFT_DIRTY; } static inline pte_t pte_mksoft_dirty(pte_t pte) { return pte_set_flags(pte, _PAGE_SOFT_DIRTY); } static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } static inline pud_t pud_mksoft_dirty(pud_t pud) { return pud_set_flags(pud, _PAGE_SOFT_DIRTY); } static inline pte_t pte_clear_soft_dirty(pte_t pte) { return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); } static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) { return pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY); } static inline pud_t pud_clear_soft_dirty(pud_t pud) { return pud_clear_flags(pud, _PAGE_SOFT_DIRTY); } #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ /* * Mask out unsupported bits in a present pgprot. Non-present pgprots * can use those bits for other purposes, so leave them be. 
*/ static inline pgprotval_t massage_pgprot(pgprot_t pgprot) { pgprotval_t protval = pgprot_val(pgprot); if (protval & _PAGE_PRESENT) protval &= __supported_pte_mask; return protval; } static inline pgprotval_t check_pgprot(pgprot_t pgprot) { pgprotval_t massaged_val = massage_pgprot(pgprot); /* mmdebug.h can not be included here because of dependencies */ #ifdef CONFIG_DEBUG_VM WARN_ONCE(pgprot_val(pgprot) != massaged_val, "attempted to set unsupported pgprot: %016llx " "bits: %016llx supported: %016llx\n", (u64)pgprot_val(pgprot), (u64)pgprot_val(pgprot) ^ massaged_val, (u64)__supported_pte_mask); #endif return massaged_val; } static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) { phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; /* This bit combination is used to mark shadow stacks */ WARN_ON_ONCE((pgprot_val(pgprot) & (_PAGE_DIRTY | _PAGE_RW)) == _PAGE_DIRTY); pfn ^= protnone_mask(pgprot_val(pgprot)); pfn &= PTE_PFN_MASK; return __pte(pfn | check_pgprot(pgprot)); } static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) { phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; pfn ^= protnone_mask(pgprot_val(pgprot)); pfn &= PHYSICAL_PMD_PAGE_MASK; return __pmd(pfn | check_pgprot(pgprot)); } static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot) { phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; pfn ^= protnone_mask(pgprot_val(pgprot)); pfn &= PHYSICAL_PUD_PAGE_MASK; return __pud(pfn | check_pgprot(pgprot)); } static inline pmd_t pmd_mkinvalid(pmd_t pmd) { return pfn_pmd(pmd_pfn(pmd), __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); } static inline pud_t pud_mkinvalid(pud_t pud) { return pfn_pud(pud_pfn(pud), __pgprot(pud_flags(pud) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); } static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask); static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { pteval_t val = pte_val(pte), oldval = val; pte_t pte_result; /* * Chop off the NX bit (if present), and add the NX portion of * the newprot (if present): */ val &= _PAGE_CHG_MASK; val |= check_pgprot(newprot) & ~_PAGE_CHG_MASK; val = flip_protnone_guard(oldval, val, PTE_PFN_MASK); pte_result = __pte(val); /* * To avoid creating Write=0,Dirty=1 PTEs, pte_modify() needs to avoid: * 1. Marking Write=0 PTEs Dirty=1 * 2. Marking Dirty=1 PTEs Write=0 * * The first case cannot happen because the _PAGE_CHG_MASK will filter * out any Dirty bit passed in newprot. Handle the second case by * going through the mksaveddirty exercise. Only do this if the old * value was Write=1 to avoid doing this on Shadow Stack PTEs. */ if (oldval & _PAGE_RW) pte_result = pte_mksaveddirty(pte_result); else pte_result = pte_clear_saveddirty(pte_result); return pte_result; } static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) { pmdval_t val = pmd_val(pmd), oldval = val; pmd_t pmd_result; val &= (_HPAGE_CHG_MASK & ~_PAGE_DIRTY); val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK; val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK); pmd_result = __pmd(val); /* * Avoid creating shadow stack PMD by accident. See comment in * pte_modify(). 
*/ if (oldval & _PAGE_RW) pmd_result = pmd_mksaveddirty(pmd_result); else pmd_result = pmd_clear_saveddirty(pmd_result); return pmd_result; } static inline pud_t pud_modify(pud_t pud, pgprot_t newprot) { pudval_t val = pud_val(pud), oldval = val; pud_t pud_result; val &= _HPAGE_CHG_MASK; val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK; val = flip_protnone_guard(oldval, val, PHYSICAL_PUD_PAGE_MASK); pud_result = __pud(val); /* * Avoid creating shadow stack PUD by accident. See comment in * pte_modify(). */ if (oldval & _PAGE_RW) pud_result = pud_mksaveddirty(pud_result); else pud_result = pud_clear_saveddirty(pud_result); return pud_result; } /* * mprotect needs to preserve PAT and encryption bits when updating * vm_page_prot */ #define pgprot_modify pgprot_modify static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) { pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK; pgprotval_t addbits = pgprot_val(newprot) & ~_PAGE_CHG_MASK; return __pgprot(preservebits | addbits); } #define pte_pgprot(x) __pgprot(pte_flags(x)) #define pmd_pgprot(x) __pgprot(pmd_flags(x)) #define pud_pgprot(x) __pgprot(pud_flags(x)) #define p4d_pgprot(x) __pgprot(p4d_flags(x)) #define canon_pgprot(p) __pgprot(massage_pgprot(p)) static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, enum page_cache_mode pcm, enum page_cache_mode new_pcm) { /* * PAT type is always WB for untracked ranges, so no need to check. */ if (x86_platform.is_untracked_pat_range(paddr, paddr + size)) return 1; /* * Certain new memtypes are not allowed with certain * requested memtype: * - request is uncached, return cannot be write-back * - request is write-combine, return cannot be write-back * - request is write-through, return cannot be write-back * - request is write-through, return cannot be write-combine */ if ((pcm == _PAGE_CACHE_MODE_UC_MINUS && new_pcm == _PAGE_CACHE_MODE_WB) || (pcm == _PAGE_CACHE_MODE_WC && new_pcm == _PAGE_CACHE_MODE_WB) || (pcm == _PAGE_CACHE_MODE_WT && new_pcm == _PAGE_CACHE_MODE_WB) || (pcm == _PAGE_CACHE_MODE_WT && new_pcm == _PAGE_CACHE_MODE_WC)) { return 0; } return 1; } pmd_t *populate_extra_pmd(unsigned long vaddr); pte_t *populate_extra_pte(unsigned long vaddr); #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd); /* * Take a PGD location (pgdp) and a pgd value that needs to be set there. * Populates the user and returns the resulting PGD that must be set in * the kernel copy of the page tables. 
*/ static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { if (!static_cpu_has(X86_FEATURE_PTI)) return pgd; return __pti_set_user_pgtbl(pgdp, pgd); } #else /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { return pgd; } #endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ #endif /* __ASSEMBLER__ */ #ifdef CONFIG_X86_32 # include <asm/pgtable_32.h> #else # include <asm/pgtable_64.h> #endif #ifndef __ASSEMBLER__ #include <linux/mm_types.h> #include <linux/mmdebug.h> #include <linux/log2.h> #include <asm/fixmap.h> static inline int pte_none(pte_t pte) { return !(pte.pte & ~(_PAGE_KNL_ERRATUM_MASK)); } #define __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t a, pte_t b) { return a.pte == b.pte; } static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { if (__pte_needs_invert(pte_val(pte))) return __pte(pte_val(pte) - (nr << PFN_PTE_SHIFT)); return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT)); } #define pte_advance_pfn pte_advance_pfn static inline int pte_present(pte_t a) { return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); } #define pte_accessible pte_accessible static inline bool pte_accessible(struct mm_struct *mm, pte_t a) { if (pte_flags(a) & _PAGE_PRESENT) return true; if ((pte_flags(a) & _PAGE_PROTNONE) && atomic_read(&mm->tlb_flush_pending)) return true; return false; } static inline int pmd_present(pmd_t pmd) { /* * Checking for _PAGE_PSE is needed too because * split_huge_page will temporarily clear the present bit (but * the _PAGE_PSE flag will remain set at all times while the * _PAGE_PRESENT bit is clear). */ return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); } #ifdef CONFIG_NUMA_BALANCING /* * These work without NUMA balancing but the kernel does not care. See the * comment in include/linux/pgtable.h */ static inline int pte_protnone(pte_t pte) { return (pte_flags(pte) & (_PAGE_PROTNONE | _PAGE_PRESENT)) == _PAGE_PROTNONE; } static inline int pmd_protnone(pmd_t pmd) { return (pmd_flags(pmd) & (_PAGE_PROTNONE | _PAGE_PRESENT)) == _PAGE_PROTNONE; } #endif /* CONFIG_NUMA_BALANCING */ static inline int pmd_none(pmd_t pmd) { /* Only check low word on 32-bit platforms, since it might be out of sync with upper half. 
*/ unsigned long val = native_pmd_val(pmd); return (val & ~_PAGE_KNL_ERRATUM_MASK) == 0; } static inline unsigned long pmd_page_vaddr(pmd_t pmd) { return (unsigned long)__va(pmd_val(pmd) & pmd_pfn_mask(pmd)); } /* * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ #define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) static inline int pmd_bad(pmd_t pmd) { return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) != (_KERNPG_TABLE & ~_PAGE_ACCESSED); } static inline unsigned long pages_to_mb(unsigned long npg) { return npg >> (20 - PAGE_SHIFT); } #if CONFIG_PGTABLE_LEVELS > 2 static inline int pud_none(pud_t pud) { return (native_pud_val(pud) & ~(_PAGE_KNL_ERRATUM_MASK)) == 0; } static inline int pud_present(pud_t pud) { return pud_flags(pud) & _PAGE_PRESENT; } static inline pmd_t *pud_pgtable(pud_t pud) { return (pmd_t *)__va(pud_val(pud) & pud_pfn_mask(pud)); } /* * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ #define pud_page(pud) pfn_to_page(pud_pfn(pud)) #define pud_leaf pud_leaf static inline bool pud_leaf(pud_t pud) { return pud_val(pud) & _PAGE_PSE; } static inline int pud_bad(pud_t pud) { return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; } #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 static inline int p4d_none(p4d_t p4d) { return (native_p4d_val(p4d) & ~(_PAGE_KNL_ERRATUM_MASK)) == 0; } static inline int p4d_present(p4d_t p4d) { return p4d_flags(p4d) & _PAGE_PRESENT; } static inline pud_t *p4d_pgtable(p4d_t p4d) { return (pud_t *)__va(p4d_val(p4d) & p4d_pfn_mask(p4d)); } /* * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ #define p4d_page(p4d) pfn_to_page(p4d_pfn(p4d)) static inline int p4d_bad(p4d_t p4d) { unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER; if (IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) ignore_flags |= _PAGE_NX; return (p4d_flags(p4d) & ~ignore_flags) != 0; } #endif /* CONFIG_PGTABLE_LEVELS > 3 */ static inline unsigned long p4d_index(unsigned long address) { return (address >> P4D_SHIFT) & (PTRS_PER_P4D - 1); } #if CONFIG_PGTABLE_LEVELS > 4 static inline int pgd_present(pgd_t pgd) { if (!pgtable_l5_enabled()) return 1; return pgd_flags(pgd) & _PAGE_PRESENT; } static inline unsigned long pgd_page_vaddr(pgd_t pgd) { return (unsigned long)__va((unsigned long)pgd_val(pgd) & PTE_PFN_MASK); } /* * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ #define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd)) /* to find an entry in a page-table-directory. */ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) { if (!pgtable_l5_enabled()) return (p4d_t *)pgd; return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address); } static inline int pgd_bad(pgd_t pgd) { unsigned long ignore_flags = _PAGE_USER; if (!pgtable_l5_enabled()) return 0; if (IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) ignore_flags |= _PAGE_NX; return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; } static inline int pgd_none(pgd_t pgd) { if (!pgtable_l5_enabled()) return 0; /* * There is no need to do a workaround for the KNL stray * A/D bit erratum here. PGDs only point to page tables * except on 32-bit non-PAE which is not supported on * KNL. 
*/ return !native_pgd_val(pgd); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* __ASSEMBLER__ */ #define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET) #define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY) #ifndef __ASSEMBLER__ extern int direct_gbpages; void init_mem_mapping(void); void early_alloc_pgt_buf(void); void __init poking_init(void); unsigned long init_memory_mapping(unsigned long start, unsigned long end, pgprot_t prot); #ifdef CONFIG_X86_64 extern pgd_t trampoline_pgd_entry; #endif /* local pte updates need not use xchg for locking */ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) { pte_t res = *ptep; /* Pure native function needs no input for mm, addr */ native_pte_clear(NULL, 0, ptep); return res; } static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp) { pmd_t res = *pmdp; native_pmd_clear(pmdp); return res; } static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) { pud_t res = *pudp; native_pud_clear(pudp); return res; } static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { page_table_check_pmd_set(mm, addr, pmdp, pmd); set_pmd(pmdp, pmd); } static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { page_table_check_pud_set(mm, addr, pudp, pud); native_set_pud(pudp, pud); } /* * We only update the dirty/accessed state if we set * the dirty bit by hand in the kernel, since the hardware * will do the accessed bit for us, and we don't want to * race with other CPU's that might be updating the dirty * bit at the same time. */ struct vm_area_struct; #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG bool ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH bool ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); #define __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = native_ptep_get_and_clear(ptep); page_table_check_pte_clear(mm, addr, pte); return pte; } #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full) { pte_t pte; if (full) { /* * Full address destruction in progress; paravirt does not * care about updates and native needs no locking */ pte = native_local_ptep_get_and_clear(ptep); page_table_check_pte_clear(mm, addr, pte); } else { pte = ptep_get_and_clear(mm, addr, ptep); } return pte; } #define __HAVE_ARCH_PTEP_SET_WRPROTECT static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { /* * Avoid accidentally creating shadow stack PTEs * (Write=0,Dirty=1). Use cmpxchg() to prevent races with * the hardware setting Dirty=1. 
*/ pte_t old_pte, new_pte; old_pte = READ_ONCE(*ptep); do { new_pte = pte_wrprotect(old_pte); } while (!try_cmpxchg((long *)&ptep->pte, (long *)&old_pte, *(long *)&new_pte)); } #define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0) #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty); extern int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty); #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG bool pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp); bool pudp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp); #define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH bool pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { pmd_t pmd = native_pmdp_get_and_clear(pmdp); page_table_check_pmd_clear(mm, addr, pmd); return pmd; } #define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pud_t *pudp) { pud_t pud = native_pudp_get_and_clear(pudp); page_table_check_pud_clear(mm, addr, pud); return pud; } #define __HAVE_ARCH_PMDP_SET_WRPROTECT static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { /* * Avoid accidentally creating shadow stack PTEs * (Write=0,Dirty=1). Use cmpxchg() to prevent races with * the hardware setting Dirty=1. */ pmd_t old_pmd, new_pmd; old_pmd = READ_ONCE(*pmdp); do { new_pmd = pmd_wrprotect(old_pmd); } while (!try_cmpxchg((long *)pmdp, (long *)&old_pmd, *(long *)&new_pmd)); } #ifndef pmdp_establish #define pmdp_establish pmdp_establish static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); if (IS_ENABLED(CONFIG_SMP)) { return xchg(pmdp, pmd); } else { pmd_t old = *pmdp; WRITE_ONCE(*pmdp, pmd); return old; } } #endif #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static inline pud_t pudp_establish(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t pud) { page_table_check_pud_set(vma->vm_mm, address, pudp, pud); if (IS_ENABLED(CONFIG_SMP)) { return xchg(pudp, pud); } else { pud_t old = *pudp; WRITE_ONCE(*pudp, pud); return old; } } #endif #define __HAVE_ARCH_PMDP_INVALIDATE_AD extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, pud_t *pudp); /* * Page table pages are page-aligned. The lower half of the top * level is used for userspace and the top half for the kernel. * * Returns true for parts of the PGD that map userspace and * false for the parts that map the kernel. */ static inline bool pgdp_maps_userspace(void *__ptr) { unsigned long ptr = (unsigned long)__ptr; return (((ptr & ~PAGE_MASK) / sizeof(pgd_t)) < PGD_KERNEL_START); } #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* * All top-level MITIGATION_PAGE_TABLE_ISOLATION page tables are order-1 pages * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and * the user one is in the last 4k. To switch between them, you * just need to flip the 12th bit in their addresses. 
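 *
 * Illustrative example (addresses are made up): if the 8k-aligned kernel
 * PGD page lives at 0xffff888000a00000, the user copy occupies the
 * following 4k page at 0xffff888000a01000, i.e. the same address with
 * bit PAGE_SHIFT (bit 12) set.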
*/ #define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT /* * This generates better code than the inline assembly in * __set_bit(). */ static inline void *ptr_set_bit(void *ptr, int bit) { unsigned long __ptr = (unsigned long)ptr; __ptr |= BIT(bit); return (void *)__ptr; } static inline void *ptr_clear_bit(void *ptr, int bit) { unsigned long __ptr = (unsigned long)ptr; __ptr &= ~BIT(bit); return (void *)__ptr; } static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp) { return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); } static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp) { return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); } static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp) { return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); } static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) { return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); } #endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); * * dst - pointer to pgd range anywhere on a pgd page * src - "" * count - the number of pgds to copy. * * dst and src can be on the same page, but the range must not overlap, * and must not cross a page boundary. */ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { memcpy(dst, src, count * sizeof(pgd_t)); #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION if (!static_cpu_has(X86_FEATURE_PTI)) return; /* Clone the user space pgd as well */ memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src), count * sizeof(pgd_t)); #endif } #define PTE_SHIFT ilog2(PTRS_PER_PTE) static inline int page_level_shift(enum pg_level level) { return (PAGE_SHIFT - PTE_SHIFT) + level * PTE_SHIFT; } static inline unsigned long page_level_size(enum pg_level level) { return 1UL << page_level_shift(level); } static inline unsigned long page_level_mask(enum pg_level level) { return ~(page_level_size(level) - 1); } /* * The x86 doesn't have any external MMU info: the kernel page * tables contain all the necessary information. 
*/ static inline void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { } static inline void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { } static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { } static inline void update_mmu_cache_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { } static inline pte_t pte_swp_mkexclusive(pte_t pte) { return pte_set_flags(pte, _PAGE_SWP_EXCLUSIVE); } static inline bool pte_swp_exclusive(pte_t pte) { return pte_flags(pte) & _PAGE_SWP_EXCLUSIVE; } static inline pte_t pte_swp_clear_exclusive(pte_t pte) { return pte_clear_flags(pte, _PAGE_SWP_EXCLUSIVE); } #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); } static inline int pte_swp_soft_dirty(pte_t pte) { return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; } static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); } #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_SWP_SOFT_DIRTY); } static inline int pmd_swp_soft_dirty(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_SWP_SOFT_DIRTY; } static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) { return pmd_clear_flags(pmd, _PAGE_SWP_SOFT_DIRTY); } #endif #endif #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP static inline pte_t pte_swp_mkuffd_wp(pte_t pte) { return pte_set_flags(pte, _PAGE_SWP_UFFD_WP); } static inline int pte_swp_uffd_wp(pte_t pte) { return pte_flags(pte) & _PAGE_SWP_UFFD_WP; } static inline pte_t pte_swp_clear_uffd_wp(pte_t pte) { return pte_clear_flags(pte, _PAGE_SWP_UFFD_WP); } static inline pmd_t pmd_swp_mkuffd_wp(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_SWP_UFFD_WP); } static inline int pmd_swp_uffd_wp(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_SWP_UFFD_WP; } static inline pmd_t pmd_swp_clear_uffd_wp(pmd_t pmd) { return pmd_clear_flags(pmd, _PAGE_SWP_UFFD_WP); } #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ static inline u16 pte_flags_pkey(unsigned long pte_flags) { #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS /* ifdef to avoid doing 59-bit shift on 32-bit values */ return (pte_flags & _PAGE_PKEY_MASK) >> _PAGE_BIT_PKEY_BIT0; #else return 0; #endif } static inline bool __pkru_allows_pkey(u16 pkey, bool write) { u32 pkru = read_pkru(); if (!__pkru_allows_read(pkru, pkey)) return false; if (write && !__pkru_allows_write(pkru, pkey)) return false; return true; } /* * 'pteval' can come from a PTE, PMD or PUD. We only check * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the * same value on all 3 types. */ static inline bool __pte_access_permitted(unsigned long pteval, bool write) { unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER; /* * Write=0,Dirty=1 PTEs are shadow stack, which the kernel * shouldn't generally allow access to, but since they * are already Write=0, the below logic covers both cases. 
*/ if (write) need_pte_bits |= _PAGE_RW; if ((pteval & need_pte_bits) != need_pte_bits) return 0; return __pkru_allows_pkey(pte_flags_pkey(pteval), write); } #define pte_access_permitted pte_access_permitted static inline bool pte_access_permitted(pte_t pte, bool write) { return __pte_access_permitted(pte_val(pte), write); } #define pmd_access_permitted pmd_access_permitted static inline bool pmd_access_permitted(pmd_t pmd, bool write) { return __pte_access_permitted(pmd_val(pmd), write); } #define pud_access_permitted pud_access_permitted static inline bool pud_access_permitted(pud_t pud, bool write) { return __pte_access_permitted(pud_val(pud), write); } #define __HAVE_ARCH_PFN_MODIFY_ALLOWED 1 extern bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot); static inline bool arch_has_pfn_modify_check(void) { return boot_cpu_has_bug(X86_BUG_L1TF); } #define arch_check_zapped_pte arch_check_zapped_pte void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte); #define arch_check_zapped_pmd arch_check_zapped_pmd void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd); #define arch_check_zapped_pud arch_check_zapped_pud void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud); #ifdef CONFIG_XEN_PV #define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young static inline bool arch_has_hw_nonleaf_pmd_young(void) { return !cpu_feature_enabled(X86_FEATURE_XENPV); } #endif #ifdef CONFIG_PAGE_TABLE_CHECK static inline bool pte_user_accessible_page(pte_t pte, unsigned long addr) { return (pte_val(pte) & _PAGE_PRESENT) && (pte_val(pte) & _PAGE_USER); } static inline bool pmd_user_accessible_page(pmd_t pmd, unsigned long addr) { return pmd_leaf(pmd) && (pmd_val(pmd) & _PAGE_PRESENT) && (pmd_val(pmd) & _PAGE_USER); } static inline bool pud_user_accessible_page(pud_t pud, unsigned long addr) { return pud_leaf(pud) && (pud_val(pud) & _PAGE_PRESENT) && (pud_val(pud) & _PAGE_USER); } #endif #ifdef CONFIG_X86_SGX int arch_memory_failure(unsigned long pfn, int flags); #define arch_memory_failure arch_memory_failure bool arch_is_platform_page(u64 paddr); #define arch_is_platform_page arch_is_platform_page #endif /* * Use set_p*_safe(), and elide TLB flushing, when confident that *no* * TLB flush will be required as a result of the "set". For example, use * in scenarios where it is known ahead of time that the routine is * setting non-present entries, or re-setting an existing entry to the * same value. Otherwise, use the typical "set" helpers and flush the * TLB. */ #define set_pte_safe(ptep, pte) \ ({ \ WARN_ON_ONCE(pte_present(*ptep) && !pte_same(*ptep, pte)); \ set_pte(ptep, pte); \ }) #define set_pmd_safe(pmdp, pmd) \ ({ \ WARN_ON_ONCE(pmd_present(*pmdp) && !pmd_same(*pmdp, pmd)); \ set_pmd(pmdp, pmd); \ }) #define set_pud_safe(pudp, pud) \ ({ \ WARN_ON_ONCE(pud_present(*pudp) && !pud_same(*pudp, pud)); \ set_pud(pudp, pud); \ }) #define set_p4d_safe(p4dp, p4d) \ ({ \ WARN_ON_ONCE(p4d_present(*p4dp) && !p4d_same(*p4dp, p4d)); \ set_p4d(p4dp, p4d); \ }) #define set_pgd_safe(pgdp, pgd) \ ({ \ WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \ set_pgd(pgdp, pgd); \ }) #endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_PGTABLE_H */ |
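/*
 * Illustrative sketch (not part of the original header): how the helpers
 * above avoid transiently creating a shadow-stack PTE (Write=0,Dirty=1).
 * "demo_wrprotect_dirty_pte" is a hypothetical function and assumes the
 * incoming PTE is present and writable.
 */
static inline pte_t demo_wrprotect_dirty_pte(pte_t pte)
{
	pte = pte_mkdirty(pte);		/* Write=1, so the hardware Dirty bit is set */
	pte = pte_wrprotect(pte);	/* clears Write and moves Dirty to SavedDirty */

	/*
	 * The combination Write=0,Dirty=1 never appears, so pte_shstk()
	 * remains false, while pte_dirty() still reports the page dirty
	 * because _PAGE_DIRTY_BITS is expected to include the SavedDirty
	 * software bit as well.
	 */
	WARN_ON_ONCE(pte_shstk(pte));
	return pte;
}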
| 26 19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 | /* SPDX-License-Identifier: GPL-2.0+ */ /* * Sleepable Read-Copy Update mechanism for mutual exclusion * * Copyright (C) IBM Corporation, 2006 * Copyright (C) Fujitsu, 2012 * * Author: Paul McKenney <paulmck@linux.ibm.com> * Lai Jiangshan <laijs@cn.fujitsu.com> * * For detailed explanation of Read-Copy Update mechanism see - * Documentation/RCU/ *.txt * */ #ifndef _LINUX_SRCU_H #define _LINUX_SRCU_H #include <linux/mutex.h> #include <linux/rcupdate.h> #include <linux/workqueue.h> #include <linux/rcu_segcblist.h> context_lock_struct(srcu_struct, __reentrant_ctx_lock); #ifdef CONFIG_DEBUG_LOCK_ALLOC int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key); #ifndef CONFIG_TINY_SRCU int __init_srcu_struct_fast(struct srcu_struct *ssp, const char *name, struct lock_class_key *key); int __init_srcu_struct_fast_updown(struct srcu_struct *ssp, const char *name, struct lock_class_key *key); #endif // #ifndef CONFIG_TINY_SRCU #define init_srcu_struct(ssp) \ ({ \ static struct lock_class_key __srcu_key; \ \ __init_srcu_struct((ssp), #ssp, 
&__srcu_key); \ }) #define init_srcu_struct_fast(ssp) \ ({ \ static struct lock_class_key __srcu_key; \ \ __init_srcu_struct_fast((ssp), #ssp, &__srcu_key); \ }) #define init_srcu_struct_fast_updown(ssp) \ ({ \ static struct lock_class_key __srcu_key; \ \ __init_srcu_struct_fast_updown((ssp), #ssp, &__srcu_key); \ }) #define __SRCU_DEP_MAP_INIT(srcu_name) .dep_map = { .name = #srcu_name }, #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ int init_srcu_struct(struct srcu_struct *ssp); #ifndef CONFIG_TINY_SRCU int init_srcu_struct_fast(struct srcu_struct *ssp); int init_srcu_struct_fast_updown(struct srcu_struct *ssp); #endif // #ifndef CONFIG_TINY_SRCU #define __SRCU_DEP_MAP_INIT(srcu_name) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* Values for SRCU Tree srcu_data ->srcu_reader_flavor, but also used by rcutorture. */ #define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock(). #define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). // 0x4 // SRCU-lite is no longer with us. #define SRCU_READ_FLAVOR_FAST 0x4 // srcu_read_lock_fast(), also NMI-safe. #define SRCU_READ_FLAVOR_FAST_UPDOWN 0x8 // srcu_read_lock_fast_updown(). #define SRCU_READ_FLAVOR_ALL (SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_NMI | \ SRCU_READ_FLAVOR_FAST | SRCU_READ_FLAVOR_FAST_UPDOWN) // All of the above. #define SRCU_READ_FLAVOR_SLOWGP (SRCU_READ_FLAVOR_FAST | SRCU_READ_FLAVOR_FAST_UPDOWN) // Flavors requiring synchronize_rcu() // instead of smp_mb(). void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases_shared(ssp); #ifdef CONFIG_TINY_SRCU #include <linux/srcutiny.h> #elif defined(CONFIG_TREE_SRCU) #include <linux/srcutree.h> #else #error "Unknown SRCU implementation specified to kernel configuration" #endif void call_srcu(struct srcu_struct *ssp, struct rcu_head *head, void (*func)(struct rcu_head *head)); void cleanup_srcu_struct(struct srcu_struct *ssp); void synchronize_srcu(struct srcu_struct *ssp); #define SRCU_GET_STATE_COMPLETED 0x1 /** * get_completed_synchronize_srcu - Return a pre-completed polled state cookie * * Returns a value that poll_state_synchronize_srcu() will always treat * as a cookie whose grace period has already completed. */ static inline unsigned long get_completed_synchronize_srcu(void) { return SRCU_GET_STATE_COMPLETED; } unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp); unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp); bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie); // Maximum number of unsigned long values corresponding to // not-yet-completed SRCU grace periods. #define NUM_ACTIVE_SRCU_POLL_OLDSTATE 2 /** * same_state_synchronize_srcu - Are two old-state values identical? * @oldstate1: First old-state value. * @oldstate2: Second old-state value. * * The two old-state values must have been obtained from either * get_state_synchronize_srcu(), start_poll_synchronize_srcu(), or * get_completed_synchronize_srcu(). Returns @true if the two values are * identical and @false otherwise. This allows structures whose lifetimes * are tracked by old-state values to push these values to a list header, * allowing those structures to be slightly smaller. 
*/ static inline bool same_state_synchronize_srcu(unsigned long oldstate1, unsigned long oldstate2) { return oldstate1 == oldstate2; } #ifdef CONFIG_NEED_SRCU_NMI_SAFE int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires_shared(ssp); void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases_shared(ssp); #else static inline int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires_shared(ssp) { return __srcu_read_lock(ssp); } static inline void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases_shared(ssp) { __srcu_read_unlock(ssp, idx); } #endif /* CONFIG_NEED_SRCU_NMI_SAFE */ void srcu_init(void); #ifdef CONFIG_DEBUG_LOCK_ALLOC /** * srcu_read_lock_held - might we be in SRCU read-side critical section? * @ssp: The srcu_struct structure to check * * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an SRCU * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, * this assumes we are in an SRCU read-side critical section unless it can * prove otherwise. * * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. * * Note that SRCU is based on its own statemachine and it doesn't * relies on normal RCU, it can be called from the CPU which * is in the idle loop from an RCU point of view or offline. */ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) { if (!debug_lockdep_rcu_enabled()) return 1; return lock_is_held(&ssp->dep_map); } /* * Annotations provide deadlock detection for SRCU. * * Similar to other lockdep annotations, except there is an additional * srcu_lock_sync(), which is basically an empty *write*-side critical section, * see lock_sync() for more information. */ /* Annotates a srcu_read_lock() */ static inline void srcu_lock_acquire(struct lockdep_map *map) { lock_map_acquire_read(map); } /* Annotates a srcu_read_lock() */ static inline void srcu_lock_release(struct lockdep_map *map) { lock_map_release(map); } /* Annotates a synchronize_srcu() */ static inline void srcu_lock_sync(struct lockdep_map *map) { lock_map_sync(map); } #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) { return 1; } #define srcu_lock_acquire(m) do { } while (0) #define srcu_lock_release(m) do { } while (0) #define srcu_lock_sync(m) do { } while (0) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* * No-op helper to denote that ssp must be held. Because SRCU-protected pointers * should still be marked with __rcu_guarded, and we do not want to mark them * with __guarded_by(ssp) as it would complicate annotations for writers, we * choose the following strategy: srcu_dereference_check() calls this helper * that checks that the passed ssp is held, and then fake-acquires 'RCU'. */ static inline void __srcu_read_lock_must_hold(const struct srcu_struct *ssp) __must_hold_shared(ssp) { } /** * srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing * @p: the pointer to fetch and protect for later dereferencing * @ssp: pointer to the srcu_struct, which is used to check that we * really are in an SRCU read-side critical section. * @c: condition to check for update-side use * * If PROVE_RCU is enabled, invoking this outside of an RCU read-side * critical section will result in an RCU-lockdep splat, unless @c evaluates * to 1. The @c argument will normally be a logical expression containing * lockdep_is_held() calls. 
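 *
 * Illustrative usage (hypothetical names, not from the original source):
 *
 *	p = srcu_dereference_check(gp->ptr, &gp->srcu,
 *				   lockdep_is_held(&gp->lock));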
*/ #define srcu_dereference_check(p, ssp, c) \ ({ \ __srcu_read_lock_must_hold(ssp); \ __acquire_shared_ctx_lock(RCU); \ __auto_type __v = __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ (c) || srcu_read_lock_held(ssp), __rcu); \ __release_shared_ctx_lock(RCU); \ __v; \ }) /** * srcu_dereference - fetch SRCU-protected pointer for later dereferencing * @p: the pointer to fetch and protect for later dereferencing * @ssp: pointer to the srcu_struct, which is used to check that we * really are in an SRCU read-side critical section. * * Makes rcu_dereference_check() do the dirty work. If PROVE_RCU * is enabled, invoking this outside of an RCU read-side critical * section will result in an RCU-lockdep splat. */ #define srcu_dereference(p, ssp) srcu_dereference_check((p), (ssp), 0) /** * srcu_dereference_notrace - no tracing and no lockdep calls from here * @p: the pointer to fetch and protect for later dereferencing * @ssp: pointer to the srcu_struct, which is used to check that we * really are in an SRCU read-side critical section. */ #define srcu_dereference_notrace(p, ssp) srcu_dereference_check((p), (ssp), 1) /** * srcu_read_lock - register a new reader for an SRCU-protected structure. * @ssp: srcu_struct in which to register the new reader. * * Enter an SRCU read-side critical section. Note that SRCU read-side * critical sections may be nested. However, it is illegal to * call anything that waits on an SRCU grace period for the same * srcu_struct, whether directly or indirectly. Please note that * one way to indirectly wait on an SRCU grace period is to acquire * a mutex that is held elsewhere while calling synchronize_srcu() or * synchronize_srcu_expedited(). * * The return value from srcu_read_lock() is guaranteed to be * non-negative. This value must be passed unaltered to the matching * srcu_read_unlock(). Note that srcu_read_lock() and the matching * srcu_read_unlock() must occur in the same context, for example, it is * illegal to invoke srcu_read_unlock() in an irq handler if the matching * srcu_read_lock() was invoked in process context. Or, for that matter to * invoke srcu_read_unlock() from one task and the matching srcu_read_lock() * from another. */ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires_shared(ssp) { int retval; srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); retval = __srcu_read_lock(ssp); srcu_lock_acquire(&ssp->dep_map); return retval; } /** * srcu_read_lock_fast - register a new reader for an SRCU-protected structure. * @ssp: srcu_struct in which to register the new reader. * * Enter an SRCU read-side critical section, but for a light-weight * smp_mb()-free reader. See srcu_read_lock() for more information. This * function is NMI-safe, in a manner similar to srcu_read_lock_nmisafe(). * * For srcu_read_lock_fast() to be used on an srcu_struct structure, * that structure must have been defined using either DEFINE_SRCU_FAST() * or DEFINE_STATIC_SRCU_FAST() on the one hand or initialized with * init_srcu_struct_fast() on the other. Such an srcu_struct structure * cannot be passed to any non-fast variant of srcu_read_{,un}lock() or * srcu_{down,up}_read(). In kernels built with CONFIG_PROVE_RCU=y, * __srcu_check_read_flavor() will complain bitterly if you ignore this * restriction. * * Grace-period auto-expediting is disabled for SRCU-fast srcu_struct * structures because SRCU-fast expedited grace periods invoke * synchronize_rcu_expedited(), IPIs and all. If you need expedited * SRCU-fast grace periods, use synchronize_srcu_expedited(). 
* * The srcu_read_lock_fast() function can be invoked only from those * contexts where RCU is watching, that is, from contexts where it would * be legal to invoke rcu_read_lock(). Otherwise, lockdep will complain. */ static inline struct srcu_ctr __percpu *srcu_read_lock_fast(struct srcu_struct *ssp) __acquires_shared(ssp) __acquires_shared(ssp) { struct srcu_ctr __percpu *retval; RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_fast()."); srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); retval = __srcu_read_lock_fast(ssp); rcu_try_lock_acquire(&ssp->dep_map); return retval; } /** * srcu_read_lock_fast_updown - register a new reader for an SRCU-fast-updown structure. * @ssp: srcu_struct in which to register the new reader. * * Enter an SRCU read-side critical section, but for a light-weight * smp_mb()-free reader. See srcu_read_lock() for more information. * This function is compatible with srcu_down_read_fast(), but is not * NMI-safe. * * For srcu_read_lock_fast_updown() to be used on an srcu_struct * structure, that structure must have been defined using either * DEFINE_SRCU_FAST_UPDOWN() or DEFINE_STATIC_SRCU_FAST_UPDOWN() on the one * hand or initialized with init_srcu_struct_fast_updown() on the other. * Such an srcu_struct structure cannot be passed to any non-fast-updown * variant of srcu_read_{,un}lock() or srcu_{down,up}_read(). In kernels * built with CONFIG_PROVE_RCU=y, __srcu_check_read_flavor() will complain * bitterly if you ignore this * restriction. * * Grace-period auto-expediting is disabled for SRCU-fast-updown * srcu_struct structures because SRCU-fast-updown expedited grace periods * invoke synchronize_rcu_expedited(), IPIs and all. If you need expedited * SRCU-fast-updown grace periods, use synchronize_srcu_expedited(). * * The srcu_read_lock_fast_updown() function can be invoked only from * those contexts where RCU is watching, that is, from contexts where * it would be legal to invoke rcu_read_lock(). Otherwise, lockdep will * complain. */ static inline struct srcu_ctr __percpu *srcu_read_lock_fast_updown(struct srcu_struct *ssp) __acquires_shared(ssp) { struct srcu_ctr __percpu *retval; RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_fast_updown()."); srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST_UPDOWN); retval = __srcu_read_lock_fast_updown(ssp); rcu_try_lock_acquire(&ssp->dep_map); return retval; } /* * Used by tracing, cannot be traced and cannot call lockdep. * See srcu_read_lock_fast() for more information. */ static inline struct srcu_ctr __percpu *srcu_read_lock_fast_notrace(struct srcu_struct *ssp) __acquires_shared(ssp) { struct srcu_ctr __percpu *retval; srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); retval = __srcu_read_lock_fast(ssp); return retval; } /** * srcu_down_read_fast - register a new reader for an SRCU-protected structure. * @ssp: srcu_struct in which to register the new reader. * * Enter a semaphore-like SRCU read-side critical section, but for * a light-weight smp_mb()-free reader. See srcu_read_lock_fast() and * srcu_down_read() for more information. * * The same srcu_struct may be used concurrently by srcu_down_read_fast() * and srcu_read_lock_fast(). However, the same definition/initialization * requirements called out for srcu_read_lock_safe() apply. 
*/ static inline struct srcu_ctr __percpu *srcu_down_read_fast(struct srcu_struct *ssp) __acquires_shared(ssp) { WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi()); RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_down_read_fast()."); srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST_UPDOWN); return __srcu_read_lock_fast_updown(ssp); } /** * srcu_read_lock_nmisafe - register a new reader for an SRCU-protected structure. * @ssp: srcu_struct in which to register the new reader. * * Enter an SRCU read-side critical section, but in an NMI-safe manner. * See srcu_read_lock() for more information. * * If srcu_read_lock_nmisafe() is ever used on an srcu_struct structure, * then none of the other flavors may be used, whether before, during, * or after. */ static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires_shared(ssp) { int retval; srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NMI); retval = __srcu_read_lock_nmisafe(ssp); rcu_try_lock_acquire(&ssp->dep_map); return retval; } /* Used by tracing, cannot be traced and cannot invoke lockdep. */ static inline notrace int srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires_shared(ssp) { int retval; srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); retval = __srcu_read_lock(ssp); return retval; } /** * srcu_down_read - register a new reader for an SRCU-protected structure. * @ssp: srcu_struct in which to register the new reader. * * Enter a semaphore-like SRCU read-side critical section. Note that * SRCU read-side critical sections may be nested. However, it is * illegal to call anything that waits on an SRCU grace period for the * same srcu_struct, whether directly or indirectly. Please note that * one way to indirectly wait on an SRCU grace period is to acquire * a mutex that is held elsewhere while calling synchronize_srcu() or * synchronize_srcu_expedited(). But if you want lockdep to help you * keep this stuff straight, you should instead use srcu_read_lock(). * * The semaphore-like nature of srcu_down_read() means that the matching * srcu_up_read() can be invoked from some other context, for example, * from some other task or from an irq handler. However, neither * srcu_down_read() nor srcu_up_read() may be invoked from an NMI handler. * * Calls to srcu_down_read() may be nested, similar to the manner in * which calls to down_read() may be nested. The same srcu_struct may be * used concurrently by srcu_down_read() and srcu_read_lock(). */ static inline int srcu_down_read(struct srcu_struct *ssp) __acquires_shared(ssp) { WARN_ON_ONCE(in_nmi()); srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); return __srcu_read_lock(ssp); } /** * srcu_read_unlock - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. * @idx: return value from corresponding srcu_read_lock(). * * Exit an SRCU read-side critical section. */ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases_shared(ssp) { WARN_ON_ONCE(idx & ~0x1); srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); srcu_lock_release(&ssp->dep_map); __srcu_read_unlock(ssp, idx); } /** * srcu_read_unlock_fast - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. * @scp: return value from corresponding srcu_read_lock_fast(). * * Exit a light-weight SRCU read-side critical section. 
*/ static inline void srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) __releases_shared(ssp) { srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); srcu_lock_release(&ssp->dep_map); __srcu_read_unlock_fast(ssp, scp); RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_fast()."); } /** * srcu_read_unlock_fast_updown - unregister a old reader from an SRCU-fast-updown structure. * @ssp: srcu_struct in which to unregister the old reader. * @scp: return value from corresponding srcu_read_lock_fast_updown(). * * Exit an SRCU-fast-updown read-side critical section. */ static inline void srcu_read_unlock_fast_updown(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) __releases_shared(ssp) { srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST_UPDOWN); srcu_lock_release(&ssp->dep_map); __srcu_read_unlock_fast_updown(ssp, scp); RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_fast_updown()."); } /* * Used by tracing, cannot be traced and cannot call lockdep. * See srcu_read_unlock_fast() for more information. */ static inline void srcu_read_unlock_fast_notrace(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) __releases_shared(ssp) { srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); __srcu_read_unlock_fast(ssp, scp); } /** * srcu_up_read_fast - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. * @scp: return value from corresponding srcu_read_lock_fast(). * * Exit an SRCU read-side critical section, but not necessarily from * the same context as the maching srcu_down_read_fast(). */ static inline void srcu_up_read_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) __releases_shared(ssp) { WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi()); srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST_UPDOWN); __srcu_read_unlock_fast_updown(ssp, scp); RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_up_read_fast_updown()."); } /** * srcu_read_unlock_nmisafe - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. * @idx: return value from corresponding srcu_read_lock_nmisafe(). * * Exit an SRCU read-side critical section, but in an NMI-safe manner. */ static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases_shared(ssp) { WARN_ON_ONCE(idx & ~0x1); srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NMI); rcu_lock_release(&ssp->dep_map); __srcu_read_unlock_nmisafe(ssp, idx); } /* Used by tracing, cannot be traced and cannot call lockdep. */ static inline notrace void srcu_read_unlock_notrace(struct srcu_struct *ssp, int idx) __releases_shared(ssp) { srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); __srcu_read_unlock(ssp, idx); } /** * srcu_up_read - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. * @idx: return value from corresponding srcu_read_lock(). * * Exit an SRCU read-side critical section, but not necessarily from * the same context as the maching srcu_down_read(). */ static inline void srcu_up_read(struct srcu_struct *ssp, int idx) __releases_shared(ssp) { WARN_ON_ONCE(idx & ~0x1); WARN_ON_ONCE(in_nmi()); srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); __srcu_read_unlock(ssp, idx); } /** * smp_mb__after_srcu_read_unlock - ensure full ordering after srcu_read_unlock * * Converts the preceding srcu_read_unlock into a two-way memory barrier. 
* * Call this after srcu_read_unlock, to guarantee that all memory operations * that occur after smp_mb__after_srcu_read_unlock will appear to happen after * the preceding srcu_read_unlock. */ static inline void smp_mb__after_srcu_read_unlock(void) { /* __srcu_read_unlock has smp_mb() internally so nothing to do here. */ } /** * smp_mb__after_srcu_read_lock - ensure full ordering after srcu_read_lock * * Converts the preceding srcu_read_lock into a two-way memory barrier. * * Call this after srcu_read_lock, to guarantee that all memory operations * that occur after smp_mb__after_srcu_read_lock will appear to happen after * the preceding srcu_read_lock. */ static inline void smp_mb__after_srcu_read_lock(void) { /* __srcu_read_lock has smp_mb() internally so nothing to do here. */ } DEFINE_LOCK_GUARD_1(srcu, struct srcu_struct, _T->idx = srcu_read_lock(_T->lock), srcu_read_unlock(_T->lock, _T->idx), int idx) DECLARE_LOCK_GUARD_1_ATTRS(srcu, __acquires_shared(_T), __releases_shared(*(struct srcu_struct **)_T)) #define class_srcu_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(srcu, _T) DEFINE_LOCK_GUARD_1(srcu_fast, struct srcu_struct, _T->scp = srcu_read_lock_fast(_T->lock), srcu_read_unlock_fast(_T->lock, _T->scp), struct srcu_ctr __percpu *scp) DECLARE_LOCK_GUARD_1_ATTRS(srcu_fast, __acquires_shared(_T), __releases_shared(*(struct srcu_struct **)_T)) #define class_srcu_fast_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(srcu_fast, _T) DEFINE_LOCK_GUARD_1(srcu_fast_notrace, struct srcu_struct, _T->scp = srcu_read_lock_fast_notrace(_T->lock), srcu_read_unlock_fast_notrace(_T->lock, _T->scp), struct srcu_ctr __percpu *scp) DECLARE_LOCK_GUARD_1_ATTRS(srcu_fast_notrace, __acquires_shared(_T), __releases_shared(*(struct srcu_struct **)_T)) #define class_srcu_fast_notrace_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(srcu_fast_notrace, _T) #endif |
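The guard classes above complete the SRCU API. As a quick orientation, here is a minimal usage sketch, not taken from the header itself: the struct, the cur_config/config_srcu variables and the two functions are invented for illustration. A reader brackets its access with srcu_read_lock()/srcu_read_unlock() (and may sleep in between), fetching the protected pointer with srcu_dereference(); the updater publishes a new version with rcu_assign_pointer() and waits for all pre-existing readers with synchronize_srcu() before freeing the old one.

#include <linux/srcu.h>
#include <linux/slab.h>

struct my_config {
	int threshold;
};

static struct my_config __rcu *cur_config;	/* hypothetical SRCU-protected pointer */
DEFINE_STATIC_SRCU(config_srcu);		/* SRCU domain guarding cur_config */

/* Reader: may block (e.g. in copy_to_user()) while the section is open. */
static int read_threshold(void)
{
	struct my_config *cfg;
	int idx, val = -1;

	idx = srcu_read_lock(&config_srcu);
	cfg = srcu_dereference(cur_config, &config_srcu);
	if (cfg)
		val = cfg->threshold;
	srcu_read_unlock(&config_srcu, idx);
	return val;
}

/* Updater: publish a new version, wait for readers, then free the old one. */
static int update_threshold(int new_threshold)
{
	struct my_config *new_cfg, *old_cfg;

	new_cfg = kzalloc(sizeof(*new_cfg), GFP_KERNEL);
	if (!new_cfg)
		return -ENOMEM;
	new_cfg->threshold = new_threshold;

	old_cfg = rcu_dereference_protected(cur_config, 1);	/* single updater assumed */
	rcu_assign_pointer(cur_config, new_cfg);
	synchronize_srcu(&config_srcu);	/* wait for every reader that might still see old_cfg */
	kfree(old_cfg);
	return 0;
}

With the guard classes defined at the end of the header, the reader could equivalently open the critical section with guard(srcu)(&config_srcu); and let the unlock be emitted automatically on scope exit.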
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2009 IBM Corporation
 * Author: Mimi Zohar <zohar@us.ibm.com>
 */

#ifndef _LINUX_INTEGRITY_H
#define _LINUX_INTEGRITY_H

#include <linux/fs.h>
#include <linux/iversion.h>

enum integrity_status {
	INTEGRITY_PASS = 0,
	INTEGRITY_PASS_IMMUTABLE,
	INTEGRITY_FAIL,
	INTEGRITY_FAIL_IMMUTABLE,
	INTEGRITY_NOLABEL,
	INTEGRITY_NOXATTRS,
	INTEGRITY_UNKNOWN,
};

#ifdef CONFIG_INTEGRITY
extern void __init integrity_load_keys(void);
#else
static inline void integrity_load_keys(void)
{
}
#endif /* CONFIG_INTEGRITY */

/* An inode's attributes for detection of changes */
struct integrity_inode_attributes {
	u64 version;		/* track inode changes */
	unsigned long ino;
	dev_t dev;
};

/*
 * On stacked filesystems the i_version alone is not enough to detect file data
 * or metadata change. Additional metadata is required.
 */
static inline void
integrity_inode_attrs_store(struct integrity_inode_attributes *attrs,
			    u64 i_version, const struct inode *inode)
{
	attrs->version = i_version;
	attrs->dev = inode->i_sb->s_dev;
	attrs->ino = inode->i_ino;
}

/*
 * On stacked filesystems detect whether the inode or its content has changed.
 */
static inline bool
integrity_inode_attrs_changed(const struct integrity_inode_attributes *attrs,
			      const struct inode *inode)
{
	return (inode->i_sb->s_dev != attrs->dev ||
		inode->i_ino != attrs->ino ||
		!inode_eq_iversion(inode, attrs->version));
}

#endif /* _LINUX_INTEGRITY_H */
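The two helpers above are meant to be used as a pair: take a snapshot of the inode's identity and i_version when a file is measured, and later compare against the live inode to decide whether re-measurement is needed. A small hypothetical caller follows; only the integrity_* helpers and inode_query_iversion() come from the kernel, the remaining names are invented for illustration.

#include <linux/fs.h>
#include <linux/iversion.h>
#include <linux/integrity.h>

struct my_measurement {
	struct integrity_inode_attributes attrs;	/* snapshot taken at measurement time */
};

static void my_measure(struct my_measurement *m, struct inode *inode)
{
	u64 i_version = inode_query_iversion(inode);

	/* Remember dev/ino/i_version so later changes can be detected. */
	integrity_inode_attrs_store(&m->attrs, i_version, inode);
}

/* Returns true if the file (or the inode backing it) may have changed. */
static bool my_needs_remeasure(const struct my_measurement *m,
			       const struct inode *inode)
{
	return integrity_inode_attrs_changed(&m->attrs, inode);
}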
909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 | // SPDX-License-Identifier: GPL-2.0 /* * drivers/base/power/wakeup.c - System wakeup events framework * * Copyright (c) 2010 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. */ #define pr_fmt(fmt) "PM: " fmt #include <linux/device.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/capability.h> #include <linux/export.h> #include <linux/suspend.h> #include <linux/seq_file.h> #include <linux/debugfs.h> #include <linux/pm_wakeirq.h> #include <trace/events/power.h> #include "power.h" #define list_for_each_entry_rcu_locked(pos, head, member) \ list_for_each_entry_rcu(pos, head, member, \ srcu_read_lock_held(&wakeup_srcu)) /* * If set, the suspend/hibernate code will abort transitions to a sleep state * if wakeup events are registered during or immediately before the transition. */ bool events_check_enabled __read_mostly; /* First wakeup IRQ seen by the kernel in the last cycle. */ static unsigned int wakeup_irq[2] __read_mostly; static DEFINE_RAW_SPINLOCK(wakeup_irq_lock); /* If greater than 0 and the system is suspending, terminate the suspend. */ static atomic_t pm_abort_suspend __read_mostly; /* * Combined counters of registered wakeup events and wakeup events in progress. * They need to be modified together atomically, so it's better to use one * atomic variable to hold them both. */ static atomic_t combined_event_count = ATOMIC_INIT(0); #define IN_PROGRESS_BITS (sizeof(int) * 4) #define MAX_IN_PROGRESS ((1 << IN_PROGRESS_BITS) - 1) static void split_counters(unsigned int *cnt, unsigned int *inpr) { unsigned int comb = atomic_read(&combined_event_count); *cnt = (comb >> IN_PROGRESS_BITS); *inpr = comb & MAX_IN_PROGRESS; } /* A preserved old value of the events counter. */ static unsigned int saved_count; static DEFINE_RAW_SPINLOCK(events_lock); static void pm_wakeup_timer_fn(struct timer_list *t); static LIST_HEAD(wakeup_sources); static DECLARE_WAIT_QUEUE_HEAD(wakeup_count_wait_queue); DEFINE_STATIC_SRCU(wakeup_srcu); static struct wakeup_source deleted_ws = { .name = "deleted", .lock = __SPIN_LOCK_UNLOCKED(deleted_ws.lock), }; static DEFINE_IDA(wakeup_ida); /** * wakeup_source_create - Create a struct wakeup_source object. * @name: Name of the new wakeup source. 
*/ static struct wakeup_source *wakeup_source_create(const char *name) { struct wakeup_source *ws; const char *ws_name; int id; ws = kzalloc_obj(*ws); if (!ws) goto err_ws; ws_name = kstrdup_const(name, GFP_KERNEL); if (!ws_name) goto err_name; ws->name = ws_name; id = ida_alloc(&wakeup_ida, GFP_KERNEL); if (id < 0) goto err_id; ws->id = id; return ws; err_id: kfree_const(ws->name); err_name: kfree(ws); err_ws: return NULL; } /* * Record wakeup_source statistics being deleted into a dummy wakeup_source. */ static void wakeup_source_record(struct wakeup_source *ws) { unsigned long flags; spin_lock_irqsave(&deleted_ws.lock, flags); if (ws->event_count) { deleted_ws.total_time = ktime_add(deleted_ws.total_time, ws->total_time); deleted_ws.prevent_sleep_time = ktime_add(deleted_ws.prevent_sleep_time, ws->prevent_sleep_time); deleted_ws.max_time = ktime_compare(deleted_ws.max_time, ws->max_time) > 0 ? deleted_ws.max_time : ws->max_time; deleted_ws.event_count += ws->event_count; deleted_ws.active_count += ws->active_count; deleted_ws.relax_count += ws->relax_count; deleted_ws.expire_count += ws->expire_count; deleted_ws.wakeup_count += ws->wakeup_count; } spin_unlock_irqrestore(&deleted_ws.lock, flags); } static void wakeup_source_free(struct wakeup_source *ws) { ida_free(&wakeup_ida, ws->id); kfree_const(ws->name); kfree(ws); } /** * wakeup_source_destroy - Destroy a struct wakeup_source object. * @ws: Wakeup source to destroy. * * Use only for wakeup source objects created with wakeup_source_create(). */ static void wakeup_source_destroy(struct wakeup_source *ws) { if (!ws) return; __pm_relax(ws); wakeup_source_record(ws); wakeup_source_free(ws); } /** * wakeup_source_add - Add given object to the list of wakeup sources. * @ws: Wakeup source object to add to the list. */ static void wakeup_source_add(struct wakeup_source *ws) { unsigned long flags; if (WARN_ON(!ws)) return; spin_lock_init(&ws->lock); timer_setup(&ws->timer, pm_wakeup_timer_fn, 0); ws->active = false; raw_spin_lock_irqsave(&events_lock, flags); list_add_rcu(&ws->entry, &wakeup_sources); raw_spin_unlock_irqrestore(&events_lock, flags); } /** * wakeup_source_remove - Remove given object from the wakeup sources list. * @ws: Wakeup source object to remove from the list. */ static void wakeup_source_remove(struct wakeup_source *ws) { unsigned long flags; if (WARN_ON(!ws)) return; /* * After shutting down the timer, wakeup_source_activate() will warn if * the given wakeup source is passed to it. */ timer_shutdown_sync(&ws->timer); raw_spin_lock_irqsave(&events_lock, flags); list_del_rcu(&ws->entry); raw_spin_unlock_irqrestore(&events_lock, flags); synchronize_srcu(&wakeup_srcu); } /** * wakeup_source_register - Create wakeup source and add it to the list. * @dev: Device this wakeup source is associated with (or NULL if virtual). * @name: Name of the wakeup source to register. */ struct wakeup_source *wakeup_source_register(struct device *dev, const char *name) { struct wakeup_source *ws; int ret; ws = wakeup_source_create(name); if (ws) { if (!dev || device_is_registered(dev)) { ret = wakeup_source_sysfs_add(dev, ws); if (ret) { wakeup_source_free(ws); return NULL; } } wakeup_source_add(ws); } return ws; } EXPORT_SYMBOL_GPL(wakeup_source_register); /** * wakeup_source_unregister - Remove wakeup source from the list and remove it. * @ws: Wakeup source object to unregister. 
*/ void wakeup_source_unregister(struct wakeup_source *ws) { if (ws) { wakeup_source_remove(ws); if (ws->dev) wakeup_source_sysfs_remove(ws); wakeup_source_destroy(ws); } } EXPORT_SYMBOL_GPL(wakeup_source_unregister); /** * wakeup_sources_read_lock - Lock wakeup source list for read. * * Returns an index of srcu lock for struct wakeup_srcu. * This index must be passed to the matching wakeup_sources_read_unlock(). */ int wakeup_sources_read_lock(void) { return srcu_read_lock(&wakeup_srcu); } EXPORT_SYMBOL_GPL(wakeup_sources_read_lock); /** * wakeup_sources_read_unlock - Unlock wakeup source list. * @idx: return value from corresponding wakeup_sources_read_lock() */ void wakeup_sources_read_unlock(int idx) { srcu_read_unlock(&wakeup_srcu, idx); } EXPORT_SYMBOL_GPL(wakeup_sources_read_unlock); /** * wakeup_sources_walk_start - Begin a walk on wakeup source list * * Returns first object of the list of wakeup sources. * * Note that to be safe, wakeup sources list needs to be locked by calling * wakeup_source_read_lock() for this. */ struct wakeup_source *wakeup_sources_walk_start(void) { return list_first_or_null_rcu(&wakeup_sources, struct wakeup_source, entry); } EXPORT_SYMBOL_GPL(wakeup_sources_walk_start); /** * wakeup_sources_walk_next - Get next wakeup source from the list * @ws: Previous wakeup source object * * Note that to be safe, wakeup sources list needs to be locked by calling * wakeup_source_read_lock() for this. */ struct wakeup_source *wakeup_sources_walk_next(struct wakeup_source *ws) { struct list_head *ws_head = &wakeup_sources; return list_next_or_null_rcu(ws_head, &ws->entry, struct wakeup_source, entry); } EXPORT_SYMBOL_GPL(wakeup_sources_walk_next); /** * device_wakeup_attach - Attach a wakeup source object to a device object. * @dev: Device to handle. * @ws: Wakeup source object to attach to @dev. * * This causes @dev to be treated as a wakeup device. */ static int device_wakeup_attach(struct device *dev, struct wakeup_source *ws) { spin_lock_irq(&dev->power.lock); if (dev->power.wakeup) { spin_unlock_irq(&dev->power.lock); return -EEXIST; } dev->power.wakeup = ws; if (dev->power.wakeirq) device_wakeup_attach_irq(dev, dev->power.wakeirq); spin_unlock_irq(&dev->power.lock); return 0; } /** * device_wakeup_enable - Enable given device to be a wakeup source. * @dev: Device to handle. * * Create a wakeup source object, register it and attach it to @dev. */ int device_wakeup_enable(struct device *dev) { struct wakeup_source *ws; int ret; if (!dev || !dev->power.can_wakeup) return -EINVAL; if (pm_sleep_transition_in_progress()) dev_dbg(dev, "Suspicious %s() during system transition!\n", __func__); ws = wakeup_source_register(dev, dev_name(dev)); if (!ws) return -ENOMEM; ret = device_wakeup_attach(dev, ws); if (ret) wakeup_source_unregister(ws); return ret; } EXPORT_SYMBOL_GPL(device_wakeup_enable); /** * device_wakeup_attach_irq - Attach a wakeirq to a wakeup source * @dev: Device to handle * @wakeirq: Device specific wakeirq entry * * Attach a device wakeirq to the wakeup source so the device * wake IRQ can be configured automatically for suspend and * resume. * * Call under the device's power.lock lock. 
*/ void device_wakeup_attach_irq(struct device *dev, struct wake_irq *wakeirq) { struct wakeup_source *ws; ws = dev->power.wakeup; if (!ws) return; if (ws->wakeirq) dev_err(dev, "Leftover wakeup IRQ found, overriding\n"); ws->wakeirq = wakeirq; } /** * device_wakeup_detach_irq - Detach a wakeirq from a wakeup source * @dev: Device to handle * * Removes a device wakeirq from the wakeup source. * * Call under the device's power.lock lock. */ void device_wakeup_detach_irq(struct device *dev) { struct wakeup_source *ws; ws = dev->power.wakeup; if (ws) ws->wakeirq = NULL; } /** * device_wakeup_arm_wake_irqs - * * Iterates over the list of device wakeirqs to arm them. */ void device_wakeup_arm_wake_irqs(void) { struct wakeup_source *ws; int srcuidx; srcuidx = srcu_read_lock(&wakeup_srcu); list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry) dev_pm_arm_wake_irq(ws->wakeirq); srcu_read_unlock(&wakeup_srcu, srcuidx); } /** * device_wakeup_disarm_wake_irqs - * * Iterates over the list of device wakeirqs to disarm them. */ void device_wakeup_disarm_wake_irqs(void) { struct wakeup_source *ws; int srcuidx; srcuidx = srcu_read_lock(&wakeup_srcu); list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry) dev_pm_disarm_wake_irq(ws->wakeirq); srcu_read_unlock(&wakeup_srcu, srcuidx); } /** * device_wakeup_detach - Detach a device's wakeup source object from it. * @dev: Device to detach the wakeup source object from. * * After it returns, @dev will not be treated as a wakeup device any more. */ static struct wakeup_source *device_wakeup_detach(struct device *dev) { struct wakeup_source *ws; spin_lock_irq(&dev->power.lock); ws = dev->power.wakeup; dev->power.wakeup = NULL; spin_unlock_irq(&dev->power.lock); return ws; } /** * device_wakeup_disable - Do not regard a device as a wakeup source any more. * @dev: Device to handle. * * Detach the @dev's wakeup source object from it, unregister this wakeup source * object and destroy it. */ void device_wakeup_disable(struct device *dev) { struct wakeup_source *ws; if (!dev || !dev->power.can_wakeup) return; ws = device_wakeup_detach(dev); wakeup_source_unregister(ws); } EXPORT_SYMBOL_GPL(device_wakeup_disable); /** * device_set_wakeup_capable - Set/reset device wakeup capability flag. * @dev: Device to handle. * @capable: Whether or not @dev is capable of waking up the system from sleep. * * If @capable is set, set the @dev's power.can_wakeup flag and add its * wakeup-related attributes to sysfs. Otherwise, unset the @dev's * power.can_wakeup flag and remove its wakeup-related attributes from sysfs. * * This function may sleep and it can't be called from any context where * sleeping is not allowed. */ void device_set_wakeup_capable(struct device *dev, bool capable) { if (!!dev->power.can_wakeup == !!capable) return; dev->power.can_wakeup = capable; if (device_is_registered(dev) && !list_empty(&dev->power.entry)) { if (capable) { int ret = wakeup_sysfs_add(dev); if (ret) dev_info(dev, "Wakeup sysfs attributes not added\n"); } else { wakeup_sysfs_remove(dev); } } } EXPORT_SYMBOL_GPL(device_set_wakeup_capable); /** * device_set_wakeup_enable - Enable or disable a device to wake up the system. * @dev: Device to handle. * @enable: enable/disable flag */ int device_set_wakeup_enable(struct device *dev, bool enable) { if (enable) return device_wakeup_enable(dev); device_wakeup_disable(dev); return 0; } EXPORT_SYMBOL_GPL(device_set_wakeup_enable); /** * wakeup_source_not_usable - validate the given wakeup source. * @ws: Wakeup source to be validated. 
*/ static bool wakeup_source_not_usable(struct wakeup_source *ws) { /* * Use the timer struct to check if the given wakeup source has been * initialized by wakeup_source_add() and it is not going away. */ return ws->timer.function != pm_wakeup_timer_fn; } /* * The functions below use the observation that each wakeup event starts a * period in which the system should not be suspended. The moment this period * will end depends on how the wakeup event is going to be processed after being * detected and all of the possible cases can be divided into two distinct * groups. * * First, a wakeup event may be detected by the same functional unit that will * carry out the entire processing of it and possibly will pass it to user space * for further processing. In that case the functional unit that has detected * the event may later "close" the "no suspend" period associated with it * directly as soon as it has been dealt with. The pair of pm_stay_awake() and * pm_relax(), balanced with each other, is supposed to be used in such * situations. * * Second, a wakeup event may be detected by one functional unit and processed * by another one. In that case the unit that has detected it cannot really * "close" the "no suspend" period associated with it, unless it knows in * advance what's going to happen to the event during processing. This * knowledge, however, may not be available to it, so it can simply specify time * to wait before the system can be suspended and pass it as the second * argument of pm_wakeup_event(). * * It is valid to call pm_relax() after pm_wakeup_event(), in which case the * "no suspend" period will be ended either by the pm_relax(), or by the timer * function executed when the timer expires, whichever comes first. */ /** * wakeup_source_activate - Mark given wakeup source as active. * @ws: Wakeup source to handle. * * Update the @ws' statistics and, if @ws has just been activated, notify the PM * core of the event by incrementing the counter of the wakeup events being * processed. */ static void wakeup_source_activate(struct wakeup_source *ws) { unsigned int cec; if (WARN_ONCE(wakeup_source_not_usable(ws), "unusable wakeup source\n")) return; ws->active = true; ws->active_count++; ws->last_time = ktime_get(); if (ws->autosleep_enabled) ws->start_prevent_time = ws->last_time; /* Increment the counter of events in progress. */ cec = atomic_inc_return(&combined_event_count); trace_wakeup_source_activate(ws->name, cec); } /** * wakeup_source_report_event - Report wakeup event using the given source. * @ws: Wakeup source to report the event for. * @hard: If set, abort suspends in progress and wake up from suspend-to-idle. */ static void wakeup_source_report_event(struct wakeup_source *ws, bool hard) { ws->event_count++; /* This is racy, but the counter is approximate anyway. */ if (events_check_enabled) ws->wakeup_count++; if (!ws->active) wakeup_source_activate(ws); if (hard) pm_system_wakeup(); } /** * __pm_stay_awake - Notify the PM core of a wakeup event. * @ws: Wakeup source object associated with the source of the event. * * It is safe to call this function from interrupt context. */ void __pm_stay_awake(struct wakeup_source *ws) { unsigned long flags; if (!ws) return; spin_lock_irqsave(&ws->lock, flags); wakeup_source_report_event(ws, false); timer_delete(&ws->timer); ws->timer_expires = 0; spin_unlock_irqrestore(&ws->lock, flags); } EXPORT_SYMBOL_GPL(__pm_stay_awake); /** * pm_stay_awake - Notify the PM core that a wakeup event is being processed. 
* @dev: Device the wakeup event is related to. * * Notify the PM core of a wakeup event (signaled by @dev) by calling * __pm_stay_awake for the @dev's wakeup source object. * * Call this function after detecting of a wakeup event if pm_relax() is going * to be called directly after processing the event (and possibly passing it to * user space for further processing). */ void pm_stay_awake(struct device *dev) { unsigned long flags; if (!dev) return; spin_lock_irqsave(&dev->power.lock, flags); __pm_stay_awake(dev->power.wakeup); spin_unlock_irqrestore(&dev->power.lock, flags); } EXPORT_SYMBOL_GPL(pm_stay_awake); #ifdef CONFIG_PM_AUTOSLEEP static void update_prevent_sleep_time(struct wakeup_source *ws, ktime_t now) { ktime_t delta = ktime_sub(now, ws->start_prevent_time); ws->prevent_sleep_time = ktime_add(ws->prevent_sleep_time, delta); } #else static inline void update_prevent_sleep_time(struct wakeup_source *ws, ktime_t now) {} #endif /** * wakeup_source_deactivate - Mark given wakeup source as inactive. * @ws: Wakeup source to handle. * * Update the @ws' statistics and notify the PM core that the wakeup source has * become inactive by decrementing the counter of wakeup events being processed * and incrementing the counter of registered wakeup events. */ static void wakeup_source_deactivate(struct wakeup_source *ws) { unsigned int cnt, inpr, cec; ktime_t duration; ktime_t now; ws->relax_count++; /* * __pm_relax() may be called directly or from a timer function. * If it is called directly right after the timer function has been * started, but before the timer function calls __pm_relax(), it is * possible that __pm_stay_awake() will be called in the meantime and * will set ws->active. Then, ws->active may be cleared immediately * by the __pm_relax() called from the timer function, but in such a * case ws->relax_count will be different from ws->active_count. */ if (ws->relax_count != ws->active_count) { ws->relax_count--; return; } ws->active = false; now = ktime_get(); duration = ktime_sub(now, ws->last_time); ws->total_time = ktime_add(ws->total_time, duration); if (ktime_to_ns(duration) > ktime_to_ns(ws->max_time)) ws->max_time = duration; ws->last_time = now; timer_delete(&ws->timer); ws->timer_expires = 0; if (ws->autosleep_enabled) update_prevent_sleep_time(ws, now); /* * Increment the counter of registered wakeup events and decrement the * counter of wakeup events in progress simultaneously. */ cec = atomic_add_return(MAX_IN_PROGRESS, &combined_event_count); trace_wakeup_source_deactivate(ws->name, cec); split_counters(&cnt, &inpr); if (!inpr && waitqueue_active(&wakeup_count_wait_queue)) wake_up(&wakeup_count_wait_queue); } /** * __pm_relax - Notify the PM core that processing of a wakeup event has ended. * @ws: Wakeup source object associated with the source of the event. * * Call this function for wakeup events whose processing started with calling * __pm_stay_awake(). * * It is safe to call it from interrupt context. */ void __pm_relax(struct wakeup_source *ws) { unsigned long flags; if (!ws) return; spin_lock_irqsave(&ws->lock, flags); if (ws->active) wakeup_source_deactivate(ws); spin_unlock_irqrestore(&ws->lock, flags); } EXPORT_SYMBOL_GPL(__pm_relax); /** * pm_relax - Notify the PM core that processing of a wakeup event has ended. * @dev: Device that signaled the event. * * Execute __pm_relax() for the @dev's wakeup source object. 
*/ void pm_relax(struct device *dev) { unsigned long flags; if (!dev) return; spin_lock_irqsave(&dev->power.lock, flags); __pm_relax(dev->power.wakeup); spin_unlock_irqrestore(&dev->power.lock, flags); } EXPORT_SYMBOL_GPL(pm_relax); /** * pm_wakeup_timer_fn - Delayed finalization of a wakeup event. * @t: timer list * * Call wakeup_source_deactivate() for the wakeup source whose address is stored * in @data if it is currently active and its timer has not been canceled and * the expiration time of the timer is not in future. */ static void pm_wakeup_timer_fn(struct timer_list *t) { struct wakeup_source *ws = timer_container_of(ws, t, timer); unsigned long flags; spin_lock_irqsave(&ws->lock, flags); if (ws->active && ws->timer_expires && time_after_eq(jiffies, ws->timer_expires)) { wakeup_source_deactivate(ws); ws->expire_count++; } spin_unlock_irqrestore(&ws->lock, flags); } /** * pm_wakeup_ws_event - Notify the PM core of a wakeup event. * @ws: Wakeup source object associated with the event source. * @msec: Anticipated event processing time (in milliseconds). * @hard: If set, abort suspends in progress and wake up from suspend-to-idle. * * Notify the PM core of a wakeup event whose source is @ws that will take * approximately @msec milliseconds to be processed by the kernel. If @ws is * not active, activate it. If @msec is nonzero, set up the @ws' timer to * execute pm_wakeup_timer_fn() in future. * * It is safe to call this function from interrupt context. */ void pm_wakeup_ws_event(struct wakeup_source *ws, unsigned int msec, bool hard) { unsigned long flags; unsigned long expires; if (!ws) return; spin_lock_irqsave(&ws->lock, flags); wakeup_source_report_event(ws, hard); if (!msec) { wakeup_source_deactivate(ws); goto unlock; } expires = jiffies + msecs_to_jiffies(msec); if (!expires) expires = 1; if (!ws->timer_expires || time_after(expires, ws->timer_expires)) { mod_timer(&ws->timer, expires); ws->timer_expires = expires; } unlock: spin_unlock_irqrestore(&ws->lock, flags); } EXPORT_SYMBOL_GPL(pm_wakeup_ws_event); /** * pm_wakeup_dev_event - Notify the PM core of a wakeup event. * @dev: Device the wakeup event is related to. * @msec: Anticipated event processing time (in milliseconds). * @hard: If set, abort suspends in progress and wake up from suspend-to-idle. * * Call pm_wakeup_ws_event() for the @dev's wakeup source object. */ void pm_wakeup_dev_event(struct device *dev, unsigned int msec, bool hard) { unsigned long flags; if (!dev) return; spin_lock_irqsave(&dev->power.lock, flags); pm_wakeup_ws_event(dev->power.wakeup, msec, hard); spin_unlock_irqrestore(&dev->power.lock, flags); } EXPORT_SYMBOL_GPL(pm_wakeup_dev_event); void pm_print_active_wakeup_sources(void) { struct wakeup_source *ws; int srcuidx, active = 0; struct wakeup_source *last_activity_ws = NULL; srcuidx = srcu_read_lock(&wakeup_srcu); list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry) { if (ws->active) { pm_pr_dbg("active wakeup source: %s\n", ws->name); active = 1; } else if (!active && (!last_activity_ws || ktime_to_ns(ws->last_time) > ktime_to_ns(last_activity_ws->last_time))) { last_activity_ws = ws; } } if (!active && last_activity_ws) pm_pr_dbg("last active wakeup source: %s\n", last_activity_ws->name); srcu_read_unlock(&wakeup_srcu, srcuidx); } EXPORT_SYMBOL_GPL(pm_print_active_wakeup_sources); /** * pm_wakeup_pending - Check if power transition in progress should be aborted. 
* * Compare the current number of registered wakeup events with its preserved * value from the past and return true if new wakeup events have been registered * since the old value was stored. Also return true if the current number of * wakeup events being processed is different from zero. */ bool pm_wakeup_pending(void) { unsigned long flags; bool ret = false; raw_spin_lock_irqsave(&events_lock, flags); if (events_check_enabled) { unsigned int cnt, inpr; split_counters(&cnt, &inpr); ret = (cnt != saved_count || inpr > 0); events_check_enabled = !ret; } raw_spin_unlock_irqrestore(&events_lock, flags); if (ret) { pm_pr_dbg("Wakeup pending, aborting suspend\n"); pm_print_active_wakeup_sources(); } return ret || atomic_read(&pm_abort_suspend) > 0; } EXPORT_SYMBOL_GPL(pm_wakeup_pending); void pm_system_wakeup(void) { atomic_inc(&pm_abort_suspend); s2idle_wake(); } EXPORT_SYMBOL_GPL(pm_system_wakeup); void pm_system_cancel_wakeup(void) { atomic_dec_if_positive(&pm_abort_suspend); } void pm_wakeup_clear(unsigned int irq_number) { raw_spin_lock_irq(&wakeup_irq_lock); if (irq_number && wakeup_irq[0] == irq_number) wakeup_irq[0] = wakeup_irq[1]; else wakeup_irq[0] = 0; wakeup_irq[1] = 0; raw_spin_unlock_irq(&wakeup_irq_lock); if (!irq_number) atomic_set(&pm_abort_suspend, 0); } void pm_system_irq_wakeup(unsigned int irq_number) { unsigned long flags; raw_spin_lock_irqsave(&wakeup_irq_lock, flags); if (wakeup_irq[0] == 0) wakeup_irq[0] = irq_number; else if (wakeup_irq[1] == 0) wakeup_irq[1] = irq_number; else irq_number = 0; pm_pr_dbg("Triggering wakeup from IRQ %d\n", irq_number); raw_spin_unlock_irqrestore(&wakeup_irq_lock, flags); if (irq_number) pm_system_wakeup(); } unsigned int pm_wakeup_irq(void) { return wakeup_irq[0]; } /** * pm_get_wakeup_count - Read the number of registered wakeup events. * @count: Address to store the value at. * @block: Whether or not to block. * * Store the number of registered wakeup events at the address in @count. If * @block is set, block until the current number of wakeup events being * processed is zero. * * Return 'false' if the current number of wakeup events being processed is * nonzero. Otherwise return 'true'. */ bool pm_get_wakeup_count(unsigned int *count, bool block) { unsigned int cnt, inpr; if (block) { DEFINE_WAIT(wait); for (;;) { prepare_to_wait(&wakeup_count_wait_queue, &wait, TASK_INTERRUPTIBLE); split_counters(&cnt, &inpr); if (inpr == 0 || signal_pending(current)) break; pm_print_active_wakeup_sources(); schedule(); } finish_wait(&wakeup_count_wait_queue, &wait); } split_counters(&cnt, &inpr); *count = cnt; return !inpr; } /** * pm_save_wakeup_count - Save the current number of registered wakeup events. * @count: Value to compare with the current number of registered wakeup events. * * If @count is equal to the current number of registered wakeup events and the * current number of wakeup events being processed is zero, store @count as the * old number of registered wakeup events for pm_check_wakeup_events(), enable * wakeup events detection and return 'true'. Otherwise disable wakeup events * detection and return 'false'. 
*/ bool pm_save_wakeup_count(unsigned int count) { unsigned int cnt, inpr; unsigned long flags; events_check_enabled = false; raw_spin_lock_irqsave(&events_lock, flags); split_counters(&cnt, &inpr); if (cnt == count && inpr == 0) { saved_count = count; events_check_enabled = true; } raw_spin_unlock_irqrestore(&events_lock, flags); return events_check_enabled; } #ifdef CONFIG_PM_AUTOSLEEP /** * pm_wakep_autosleep_enabled - Modify autosleep_enabled for all wakeup sources. * @set: Whether to set or to clear the autosleep_enabled flags. */ void pm_wakep_autosleep_enabled(bool set) { struct wakeup_source *ws; ktime_t now = ktime_get(); int srcuidx; srcuidx = srcu_read_lock(&wakeup_srcu); list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry) { spin_lock_irq(&ws->lock); if (ws->autosleep_enabled != set) { ws->autosleep_enabled = set; if (ws->active) { if (set) ws->start_prevent_time = now; else update_prevent_sleep_time(ws, now); } } spin_unlock_irq(&ws->lock); } srcu_read_unlock(&wakeup_srcu, srcuidx); } #endif /* CONFIG_PM_AUTOSLEEP */ /** * print_wakeup_source_stats - Print wakeup source statistics information. * @m: seq_file to print the statistics into. * @ws: Wakeup source object to print the statistics for. */ static int print_wakeup_source_stats(struct seq_file *m, struct wakeup_source *ws) { unsigned long flags; ktime_t total_time; ktime_t max_time; unsigned long active_count; ktime_t active_time; ktime_t prevent_sleep_time; spin_lock_irqsave(&ws->lock, flags); total_time = ws->total_time; max_time = ws->max_time; prevent_sleep_time = ws->prevent_sleep_time; active_count = ws->active_count; if (ws->active) { ktime_t now = ktime_get(); active_time = ktime_sub(now, ws->last_time); total_time = ktime_add(total_time, active_time); if (active_time > max_time) max_time = active_time; if (ws->autosleep_enabled) prevent_sleep_time = ktime_add(prevent_sleep_time, ktime_sub(now, ws->start_prevent_time)); } else { active_time = 0; } seq_printf(m, "%-12s\t%lu\t\t%lu\t\t%lu\t\t%lu\t\t%lld\t\t%lld\t\t%lld\t\t%lld\t\t%lld\n", ws->name, active_count, ws->event_count, ws->wakeup_count, ws->expire_count, ktime_to_ms(active_time), ktime_to_ms(total_time), ktime_to_ms(max_time), ktime_to_ms(ws->last_time), ktime_to_ms(prevent_sleep_time)); spin_unlock_irqrestore(&ws->lock, flags); return 0; } static void *wakeup_sources_stats_seq_start(struct seq_file *m, loff_t *pos) { struct wakeup_source *ws; loff_t n = *pos; int *srcuidx = m->private; if (n == 0) { seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t" "expire_count\tactive_since\ttotal_time\tmax_time\t" "last_change\tprevent_suspend_time\n"); } *srcuidx = srcu_read_lock(&wakeup_srcu); list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry) { if (n-- <= 0) return ws; } return NULL; } static void *wakeup_sources_stats_seq_next(struct seq_file *m, void *v, loff_t *pos) { struct wakeup_source *ws = v; struct wakeup_source *next_ws = NULL; ++(*pos); list_for_each_entry_continue_rcu(ws, &wakeup_sources, entry) { next_ws = ws; break; } if (!next_ws) print_wakeup_source_stats(m, &deleted_ws); return next_ws; } static void wakeup_sources_stats_seq_stop(struct seq_file *m, void *v) { int *srcuidx = m->private; srcu_read_unlock(&wakeup_srcu, *srcuidx); } /** * wakeup_sources_stats_seq_show - Print wakeup sources statistics information. * @m: seq_file to print the statistics into. 
* @v: wakeup_source of each iteration */ static int wakeup_sources_stats_seq_show(struct seq_file *m, void *v) { struct wakeup_source *ws = v; print_wakeup_source_stats(m, ws); return 0; } static const struct seq_operations wakeup_sources_stats_seq_ops = { .start = wakeup_sources_stats_seq_start, .next = wakeup_sources_stats_seq_next, .stop = wakeup_sources_stats_seq_stop, .show = wakeup_sources_stats_seq_show, }; static int wakeup_sources_stats_open(struct inode *inode, struct file *file) { return seq_open_private(file, &wakeup_sources_stats_seq_ops, sizeof(int)); } static const struct file_operations wakeup_sources_stats_fops = { .owner = THIS_MODULE, .open = wakeup_sources_stats_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_private, }; static int __init wakeup_sources_debugfs_init(void) { debugfs_create_file("wakeup_sources", 0444, NULL, NULL, &wakeup_sources_stats_fops); return 0; } postcore_initcall(wakeup_sources_debugfs_init); |
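For context, the typical consumer of this framework is a device driver rather than the PM core itself. Below is a hedged sketch: the device setup/teardown helpers, the IRQ handler and the 500 ms budget are invented for illustration, while device_set_wakeup_capable(), device_wakeup_enable(), device_wakeup_disable() and pm_wakeup_dev_event() are the interfaces implemented above. The driver marks its device wakeup-capable and registers a wakeup source at probe time, reports wakeup events from its interrupt handler so that an in-progress suspend is aborted and the system stays awake long enough for the event to be consumed, and tears the wakeup source down on removal.

#include <linux/device.h>
#include <linux/interrupt.h>
#include <linux/pm_wakeup.h>

static int my_probe_wakeup_setup(struct device *dev)
{
	device_set_wakeup_capable(dev, true);	/* expose wakeup attributes in sysfs */
	return device_wakeup_enable(dev);	/* create and attach a wakeup source */
}

static irqreturn_t my_wake_irq_handler(int irq, void *data)
{
	struct device *dev = data;

	/*
	 * Report the event and keep the system awake for up to 500 ms so
	 * that whoever processes it has a chance to run.  A driver that
	 * finishes the processing itself would instead bracket the work
	 * with pm_stay_awake(dev)/pm_relax(dev).
	 */
	pm_wakeup_dev_event(dev, 500, false);
	return IRQ_HANDLED;
}

static void my_remove_wakeup_teardown(struct device *dev)
{
	device_wakeup_disable(dev);	/* detach, unregister and destroy the source */
}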
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BITOPS_H #define _LINUX_BITOPS_H #include <asm/types.h> #include <linux/bits.h> #include <linux/typecheck.h> #include <uapi/linux/kernel.h> #define BITS_TO_LONGS(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(long)) #define BITS_TO_U64(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u64)) #define BITS_TO_U32(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u32)) #define BITS_TO_BYTES(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(char)) #define BYTES_TO_BITS(nb) ((nb) * BITS_PER_BYTE) extern unsigned int __sw_hweight8(unsigned int w); extern unsigned int __sw_hweight16(unsigned int w); extern unsigned int __sw_hweight32(unsigned int w); extern unsigned long __sw_hweight64(__u64 w); /* * Defined here because those may be needed by architecture-specific static * inlines. */ #include <asm-generic/bitops/generic-non-atomic.h> /* * Many architecture-specific non-atomic bitops contain inline asm code and due * to that the compiler can't optimize them to compile-time expressions or * constants. In contrary, generic_*() helpers are defined in pure C and * compilers optimize them just well. * Therefore, to make `unsigned long foo = 0; __set_bit(BAR, &foo)` effectively * equal to `unsigned long foo = BIT(BAR)`, pick the generic C alternative when * the arguments can be resolved at compile time. That expression itself is a * constant and doesn't bring any functional changes to the rest of cases. * The casts to `uintptr_t` are needed to mitigate `-Waddress` warnings when * passing a bitmap from .bss or .data (-> `!!addr` is always true). */ #define bitop(op, nr, addr) \ ((__builtin_constant_p(nr) && \ __builtin_constant_p((uintptr_t)(addr) != (uintptr_t)NULL) && \ (uintptr_t)(addr) != (uintptr_t)NULL && \ __builtin_constant_p(*(const unsigned long *)(addr))) ? \ const##op(nr, addr) : op(nr, addr)) /* * The following macros are non-atomic versions of their non-underscored * counterparts. 
*/ #define __set_bit(nr, addr) bitop(___set_bit, nr, addr) #define __clear_bit(nr, addr) bitop(___clear_bit, nr, addr) #define __change_bit(nr, addr) bitop(___change_bit, nr, addr) #define __test_and_set_bit(nr, addr) bitop(___test_and_set_bit, nr, addr) #define __test_and_clear_bit(nr, addr) bitop(___test_and_clear_bit, nr, addr) #define __test_and_change_bit(nr, addr) bitop(___test_and_change_bit, nr, addr) #define test_bit(nr, addr) bitop(_test_bit, nr, addr) #define test_bit_acquire(nr, addr) bitop(_test_bit_acquire, nr, addr) /* * Include this here because some architectures need generic_ffs/fls in * scope */ #include <asm/bitops.h> /* Check that the bitops prototypes are sane */ #define __check_bitop_pr(name) \ static_assert(__same_type(arch_##name, generic_##name) && \ __same_type(const_##name, generic_##name) && \ __same_type(_##name, generic_##name)) __check_bitop_pr(__set_bit); __check_bitop_pr(__clear_bit); __check_bitop_pr(__change_bit); __check_bitop_pr(__test_and_set_bit); __check_bitop_pr(__test_and_clear_bit); __check_bitop_pr(__test_and_change_bit); __check_bitop_pr(test_bit); __check_bitop_pr(test_bit_acquire); #undef __check_bitop_pr static inline int get_bitmask_order(unsigned int count) { int order; order = fls(count); return order; /* We could be slightly more clever with -1 here... */ } static __always_inline unsigned long hweight_long(unsigned long w) { return sizeof(w) == 4 ? hweight32(w) : hweight64((__u64)w); } /** * rol64 - rotate a 64-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u64 rol64(__u64 word, unsigned int shift) { return (word << (shift & 63)) | (word >> ((-shift) & 63)); } /** * ror64 - rotate a 64-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u64 ror64(__u64 word, unsigned int shift) { return (word >> (shift & 63)) | (word << ((-shift) & 63)); } /** * rol32 - rotate a 32-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u32 rol32(__u32 word, unsigned int shift) { return (word << (shift & 31)) | (word >> ((-shift) & 31)); } /** * ror32 - rotate a 32-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u32 ror32(__u32 word, unsigned int shift) { return (word >> (shift & 31)) | (word << ((-shift) & 31)); } /** * rol16 - rotate a 16-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u16 rol16(__u16 word, unsigned int shift) { return (word << (shift & 15)) | (word >> ((-shift) & 15)); } /** * ror16 - rotate a 16-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u16 ror16(__u16 word, unsigned int shift) { return (word >> (shift & 15)) | (word << ((-shift) & 15)); } /** * rol8 - rotate an 8-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u8 rol8(__u8 word, unsigned int shift) { return (word << (shift & 7)) | (word >> ((-shift) & 7)); } /** * ror8 - rotate an 8-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u8 ror8(__u8 word, unsigned int shift) { return (word >> (shift & 7)) | (word << ((-shift) & 7)); } /** * sign_extend32 - sign extend a 32-bit value using specified bit as sign-bit * @value: value to sign extend * @index: 0 based bit index (0 <= index < 32) to sign bit * * This is safe to use for 16- and 8-bit types as well. 
* * Return: 32-bit sign extended value */ static __always_inline __s32 sign_extend32(__u32 value, int index) { __u8 shift = 31 - index; return (__s32)(value << shift) >> shift; } /** * sign_extend64 - sign extend a 64-bit value using specified bit as sign-bit * @value: value to sign extend * @index: 0 based bit index (0 <= index < 64) to sign bit * * This is safe to use for 32-, 16- and 8-bit types as well. * * Return: 64-bit sign extended value */ static __always_inline __s64 sign_extend64(__u64 value, int index) { __u8 shift = 63 - index; return (__s64)(value << shift) >> shift; } static inline unsigned int fls_long(unsigned long l) { if (sizeof(l) == 4) return fls(l); return fls64(l); } static inline int get_count_order(unsigned int count) { if (count == 0) return -1; return fls(--count); } /** * get_count_order_long - get order after rounding @l up to power of 2 * @l: parameter * * it is same as get_count_order() but with long type parameter */ static inline int get_count_order_long(unsigned long l) { if (l == 0UL) return -1; return (int)fls_long(--l); } /** * parity8 - get the parity of an u8 value * @val: the value to be examined * * Determine the parity of the u8 argument. * * Returns: * 0 for even parity, 1 for odd parity * * Note: This function informs you about the current parity. Example to bail * out when parity is odd: * * if (parity8(val) == 1) * return -EBADMSG; * * If you need to calculate a parity bit, you need to draw the conclusion from * this result yourself. Example to enforce odd parity, parity bit is bit 7: * * if (parity8(val) == 0) * val ^= BIT(7); */ static inline int parity8(u8 val) { /* * One explanation of this algorithm: * https://funloop.org/codex/problem/parity/README.html */ val ^= val >> 4; return (0x6996 >> (val & 0xf)) & 1; } /** * __ffs64 - find first set bit in a 64 bit word * @word: The 64 bit word * * On 64 bit arches this is a synonym for __ffs * The result is not defined if no bits are set, so check that @word * is non-zero before calling this. */ static inline __attribute_const__ unsigned int __ffs64(u64 word) { #if BITS_PER_LONG == 32 if (((u32)word) == 0UL) return __ffs((u32)(word >> 32)) + 32; #elif BITS_PER_LONG != 64 #error BITS_PER_LONG not 32 or 64 #endif return __ffs((unsigned long)word); } /** * fns - find N'th set bit in a word * @word: The word to search * @n: Bit to find */ static inline unsigned int fns(unsigned long word, unsigned int n) { while (word && n--) word &= word - 1; return word ? __ffs(word) : BITS_PER_LONG; } /** * assign_bit - Assign value to a bit in memory * @nr: the bit to set * @addr: the address to start counting from * @value: the value to assign */ #define assign_bit(nr, addr, value) \ ((value) ? set_bit((nr), (addr)) : clear_bit((nr), (addr))) #define __assign_bit(nr, addr, value) \ ((value) ? 
__set_bit((nr), (addr)) : __clear_bit((nr), (addr))) /** * __ptr_set_bit - Set bit in a pointer's value * @nr: the bit to set * @addr: the address of the pointer variable * * Example: * void *p = foo(); * __ptr_set_bit(bit, &p); */ #define __ptr_set_bit(nr, addr) \ ({ \ typecheck_pointer(*(addr)); \ __set_bit(nr, (unsigned long *)(addr)); \ }) /** * __ptr_clear_bit - Clear bit in a pointer's value * @nr: the bit to clear * @addr: the address of the pointer variable * * Example: * void *p = foo(); * __ptr_clear_bit(bit, &p); */ #define __ptr_clear_bit(nr, addr) \ ({ \ typecheck_pointer(*(addr)); \ __clear_bit(nr, (unsigned long *)(addr)); \ }) /** * __ptr_test_bit - Test bit in a pointer's value * @nr: the bit to test * @addr: the address of the pointer variable * * Example: * void *p = foo(); * if (__ptr_test_bit(bit, &p)) { * ... * } else { * ... * } */ #define __ptr_test_bit(nr, addr) \ ({ \ typecheck_pointer(*(addr)); \ test_bit(nr, (unsigned long *)(addr)); \ }) #ifdef __KERNEL__ #ifndef set_mask_bits #define set_mask_bits(ptr, mask, bits) \ ({ \ const typeof(*(ptr)) mask__ = (mask), bits__ = (bits); \ typeof(*(ptr)) old__, new__; \ \ old__ = READ_ONCE(*(ptr)); \ do { \ new__ = (old__ & ~mask__) | bits__; \ } while (!try_cmpxchg(ptr, &old__, new__)); \ \ old__; \ }) #endif #ifndef bit_clear_unless #define bit_clear_unless(ptr, clear, test) \ ({ \ const typeof(*(ptr)) clear__ = (clear), test__ = (test);\ typeof(*(ptr)) old__, new__; \ \ old__ = READ_ONCE(*(ptr)); \ do { \ if (old__ & test__) \ break; \ new__ = old__ & ~clear__; \ } while (!try_cmpxchg(ptr, &old__, new__)); \ \ !(old__ & test__); \ }) #endif #endif /* __KERNEL__ */ #endif |
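/*
 * Editorial illustration (not part of either kernel header above or below):
 * a minimal userspace sketch that mirrors the expressions used by rol32(),
 * sign_extend32() and parity8() above, to show the results they produce.
 * The *_demo names are made up for this example.
 */
#include <assert.h>
#include <stdint.h>

static uint32_t rol32_demo(uint32_t word, unsigned int shift)
{
	/* same expression as rol32() above */
	return (word << (shift & 31)) | (word >> ((-shift) & 31));
}

static int32_t sign_extend32_demo(uint32_t value, int index)
{
	/* same expression as sign_extend32() above */
	uint8_t shift = 31 - index;
	return (int32_t)(value << shift) >> shift;
}

static int parity8_demo(uint8_t val)
{
	/* same 16-bit lookup trick as parity8() above */
	val ^= val >> 4;
	return (0x6996 >> (val & 0xf)) & 1;
}

int main(void)
{
	assert(rol32_demo(0x80000001u, 1) == 0x00000003u);
	assert(sign_extend32_demo(0x80, 7) == -128);
	assert(parity8_demo(0x03) == 0);	/* two bits set: even parity */
	assert(parity8_demo(0x07) == 1);	/* three bits set: odd parity */
	return 0;
}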
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FILELOCK_H #define _LINUX_FILELOCK_H #include <linux/fs.h> #define FL_POSIX 1 #define FL_FLOCK 2 #define FL_DELEG 4 /* NFSv4 delegation */ #define FL_ACCESS 8 /* not trying to lock, just looking */ #define FL_EXISTS 16 /* when unlocking, test for existence */ #define FL_LEASE 32 /* lease held on this file */ #define FL_CLOSE 64 /* unlock on close */ #define FL_SLEEP 128 /* A blocking lock */ #define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */ #define FL_UNLOCK_PENDING 512 /* Lease is being broken */ #define FL_OFDLCK 1024 /* lock is "owned" by struct file */ #define FL_LAYOUT 2048 /* outstanding pNFS layout */ #define FL_RECLAIM 4096 /* reclaiming from a reboot server */ #define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE) /* * Special return value from posix_lock_file() and vfs_lock_file() for * asynchronous locking. 
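 *
 * Illustrative caller pattern (a hedged sketch, not code from this header):
 *
 *	error = vfs_lock_file(filp, cmd, fl, NULL);
 *	if (error == FILE_LOCK_DEFERRED)
 *		error = wait_event_interruptible(fl->c.flc_wait,
 *						 !fl->c.flc_blocker);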
*/ #define FILE_LOCK_DEFERRED 1 struct file_lock; struct file_lease; struct file_lock_operations { void (*fl_copy_lock)(struct file_lock *, struct file_lock *); void (*fl_release_private)(struct file_lock *); }; struct lock_manager_operations { void *lm_mod_owner; fl_owner_t (*lm_get_owner)(fl_owner_t); void (*lm_put_owner)(fl_owner_t); void (*lm_notify)(struct file_lock *); /* unblock callback */ int (*lm_grant)(struct file_lock *, int); bool (*lm_lock_expirable)(struct file_lock *cfl); void (*lm_expire_lock)(void); }; struct lease_manager_operations { bool (*lm_break)(struct file_lease *); int (*lm_change)(struct file_lease *, int, struct list_head *); void (*lm_setup)(struct file_lease *, void **); bool (*lm_breaker_owns_lease)(struct file_lease *); int (*lm_open_conflict)(struct file *, int); bool (*lm_breaker_timedout)(struct file_lease *fl); }; struct lock_manager { struct list_head list; /* * NFSv4 and up also want opens blocked during the grace period; * NLM doesn't care: */ bool block_opens; }; struct net; void locks_start_grace(struct net *, struct lock_manager *); void locks_end_grace(struct lock_manager *); bool locks_in_grace(struct net *); bool opens_in_grace(struct net *); /* * struct file_lock has a union that some filesystems use to track * their own private info. The NFS side of things is defined here: */ #include <linux/nfs_fs_i.h> /* * struct file_lock represents a generic "file lock". It's used to represent * POSIX byte range locks, BSD (flock) locks, and leases. It's important to * note that the same struct is used to represent both a request for a lock and * the lock itself, but the same object is never used for both. * * FIXME: should we create a separate "struct lock_request" to help distinguish * these two uses? * * The varous i_flctx lists are ordered by: * * 1) lock owner * 2) lock range start * 3) lock range end * * Obviously, the last two criteria only matter for POSIX locks. */ struct file_lock_core { struct file_lock_core *flc_blocker; /* The lock that is blocking us */ struct list_head flc_list; /* link into file_lock_context */ struct hlist_node flc_link; /* node in global lists */ struct list_head flc_blocked_requests; /* list of requests with * ->fl_blocker pointing here */ struct list_head flc_blocked_member; /* node in * ->fl_blocker->fl_blocked_requests */ fl_owner_t flc_owner; unsigned int flc_flags; unsigned char flc_type; pid_t flc_pid; int flc_link_cpu; /* what cpu's list is this on? 
*/ wait_queue_head_t flc_wait; struct file *flc_file; }; struct file_lock { struct file_lock_core c; loff_t fl_start; loff_t fl_end; const struct file_lock_operations *fl_ops; /* Callbacks for filesystems */ const struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */ union { struct nfs_lock_info nfs_fl; struct nfs4_lock_info nfs4_fl; struct { struct list_head link; /* link in AFS vnode's pending_locks list */ int state; /* state of grant or error if -ve */ unsigned int debug_id; } afs; struct { struct inode *inode; } ceph; } fl_u; } __randomize_layout; struct file_lease { struct file_lock_core c; struct fasync_struct * fl_fasync; /* for lease break notifications */ /* for lease breaks: */ unsigned long fl_break_time; unsigned long fl_downgrade_time; const struct lease_manager_operations *fl_lmops; /* Callbacks for lease managers */ } __randomize_layout; struct file_lock_context { spinlock_t flc_lock; struct list_head flc_flock; struct list_head flc_posix; struct list_head flc_lease; }; #ifdef CONFIG_FILE_LOCKING int fcntl_getlk(struct file *, unsigned int, struct flock *); int fcntl_setlk(unsigned int, struct file *, unsigned int, struct flock *); #if BITS_PER_LONG == 32 int fcntl_getlk64(struct file *, unsigned int, struct flock64 *); int fcntl_setlk64(unsigned int, struct file *, unsigned int, struct flock64 *); #endif int fcntl_setlease(unsigned int fd, struct file *filp, int arg); int fcntl_getlease(struct file *filp); int fcntl_setdeleg(unsigned int fd, struct file *filp, struct delegation *deleg); int fcntl_getdeleg(struct file *filp, struct delegation *deleg); static inline bool lock_is_unlock(struct file_lock *fl) { return fl->c.flc_type == F_UNLCK; } static inline bool lock_is_read(struct file_lock *fl) { return fl->c.flc_type == F_RDLCK; } static inline bool lock_is_write(struct file_lock *fl) { return fl->c.flc_type == F_WRLCK; } static inline void locks_wake_up_waiter(struct file_lock_core *flc) { wake_up(&flc->flc_wait); } static inline void locks_wake_up(struct file_lock *fl) { locks_wake_up_waiter(&fl->c); } static inline bool locks_can_async_lock(const struct file_operations *fops) { return !fops->lock || fops->fop_flags & FOP_ASYNC_LOCK; } /* fs/locks.c */ void locks_free_lock_context(struct inode *inode); void locks_free_lock(struct file_lock *fl); void locks_init_lock(struct file_lock *); struct file_lock *locks_alloc_lock(void); void locks_copy_lock(struct file_lock *, struct file_lock *); void locks_copy_conflock(struct file_lock *, struct file_lock *); void locks_remove_posix(struct file *, fl_owner_t); void locks_remove_file(struct file *); void locks_release_private(struct file_lock *); void posix_test_lock(struct file *, struct file_lock *); int posix_lock_file(struct file *, struct file_lock *, struct file_lock *); int locks_delete_block(struct file_lock *); int vfs_test_lock(struct file *, struct file_lock *); int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *); int vfs_cancel_lock(struct file *filp, struct file_lock *fl); bool vfs_inode_has_locks(struct inode *inode); int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl); void locks_init_lease(struct file_lease *); void locks_free_lease(struct file_lease *fl); struct file_lease *locks_alloc_lease(void); #define LEASE_BREAK_LEASE BIT(0) // break leases and delegations #define LEASE_BREAK_DELEG BIT(1) // break delegations only #define LEASE_BREAK_LAYOUT BIT(2) // break layouts only #define LEASE_BREAK_NONBLOCK BIT(3) // non-blocking break 
#define LEASE_BREAK_OPEN_RDONLY BIT(4) // readonly open event int __break_lease(struct inode *inode, unsigned int flags); void lease_get_mtime(struct inode *, struct timespec64 *time); int generic_setlease(struct file *, int, struct file_lease **, void **priv); int kernel_setlease(struct file *, int, struct file_lease **, void **); int vfs_setlease(struct file *, int, struct file_lease **, void **); int lease_modify(struct file_lease *, int, struct list_head *); struct notifier_block; int lease_register_notifier(struct notifier_block *); void lease_unregister_notifier(struct notifier_block *); struct files_struct; void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files); bool locks_owner_has_blockers(struct file_lock_context *flctx, fl_owner_t owner); static inline struct file_lock_context * locks_inode_context(const struct inode *inode) { /* * Paired with smp_store_release in locks_get_lock_context(). * * Ensures ->i_flctx will be visible if we spotted the flag. */ if (likely(!(smp_load_acquire(&inode->i_opflags) & IOP_FLCTX))) return NULL; return READ_ONCE(inode->i_flctx); } #else /* !CONFIG_FILE_LOCKING */ static inline int fcntl_getlk(struct file *file, unsigned int cmd, struct flock __user *user) { return -EINVAL; } static inline int fcntl_setlk(unsigned int fd, struct file *file, unsigned int cmd, struct flock __user *user) { return -EACCES; } #if BITS_PER_LONG == 32 static inline int fcntl_getlk64(struct file *file, unsigned int cmd, struct flock64 *user) { return -EINVAL; } static inline int fcntl_setlk64(unsigned int fd, struct file *file, unsigned int cmd, struct flock64 *user) { return -EACCES; } #endif static inline int fcntl_setlease(unsigned int fd, struct file *filp, int arg) { return -EINVAL; } static inline int fcntl_getlease(struct file *filp) { return F_UNLCK; } static inline int fcntl_setdeleg(unsigned int fd, struct file *filp, struct delegation *deleg) { return -EINVAL; } static inline int fcntl_getdeleg(struct file *filp, struct delegation *deleg) { return -EINVAL; } static inline bool lock_is_unlock(struct file_lock *fl) { return false; } static inline bool lock_is_read(struct file_lock *fl) { return false; } static inline bool lock_is_write(struct file_lock *fl) { return false; } static inline void locks_wake_up(struct file_lock *fl) { } static inline void locks_free_lock_context(struct inode *inode) { } static inline void locks_init_lock(struct file_lock *fl) { return; } static inline void locks_init_lease(struct file_lease *fl) { return; } static inline void locks_copy_conflock(struct file_lock *new, struct file_lock *fl) { return; } static inline void locks_copy_lock(struct file_lock *new, struct file_lock *fl) { return; } static inline void locks_remove_posix(struct file *filp, fl_owner_t owner) { return; } static inline void locks_remove_file(struct file *filp) { return; } static inline void posix_test_lock(struct file *filp, struct file_lock *fl) { return; } static inline int posix_lock_file(struct file *filp, struct file_lock *fl, struct file_lock *conflock) { return -ENOLCK; } static inline int locks_delete_block(struct file_lock *waiter) { return -ENOENT; } static inline int vfs_test_lock(struct file *filp, struct file_lock *fl) { return 0; } static inline int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) { return -ENOLCK; } static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl) { return 0; } static inline bool vfs_inode_has_locks(struct inode *inode) { 
return false; } static inline int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl) { return -ENOLCK; } static inline int __break_lease(struct inode *inode, unsigned int flags) { return 0; } static inline void lease_get_mtime(struct inode *inode, struct timespec64 *time) { return; } static inline int generic_setlease(struct file *filp, int arg, struct file_lease **flp, void **priv) { return -EINVAL; } static inline int kernel_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv) { return -EINVAL; } static inline int vfs_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv) { return -EINVAL; } static inline int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose) { return -EINVAL; } struct files_struct; static inline void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files) {} static inline bool locks_owner_has_blockers(struct file_lock_context *flctx, fl_owner_t owner) { return false; } static inline struct file_lock_context * locks_inode_context(const struct inode *inode) { return NULL; } #endif /* !CONFIG_FILE_LOCKING */ /* for walking lists of file_locks linked by fl_list */ #define for_each_file_lock(_fl, _head) list_for_each_entry(_fl, _head, c.flc_list) static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl) { return locks_lock_inode_wait(file_inode(filp), fl); } #ifdef CONFIG_FILE_LOCKING static inline unsigned int openmode_to_lease_flags(unsigned int mode) { unsigned int flags = 0; if ((mode & O_ACCMODE) == O_RDONLY) flags |= LEASE_BREAK_OPEN_RDONLY; if (mode & O_NONBLOCK) flags |= LEASE_BREAK_NONBLOCK; return flags; } static inline int break_lease(struct inode *inode, unsigned int mode) { struct file_lock_context *flctx; /* * Since this check is lockless, we must ensure that any refcounts * taken are done before checking i_flctx->flc_lease. Otherwise, we * could end up racing with tasks trying to set a new lease on this * file. */ flctx = locks_inode_context(inode); if (!flctx) return 0; smp_mb(); if (!list_empty_careful(&flctx->flc_lease)) return __break_lease(inode, LEASE_BREAK_LEASE | openmode_to_lease_flags(mode)); return 0; } static inline int break_deleg(struct inode *inode, unsigned int flags) { struct file_lock_context *flctx; /* * Since this check is lockless, we must ensure that any refcounts * taken are done before checking i_flctx->flc_lease. Otherwise, we * could end up racing with tasks trying to set a new lease on this * file. 
*/ flctx = locks_inode_context(inode); if (!flctx) return 0; smp_mb(); if (!list_empty_careful(&flctx->flc_lease)) { flags |= LEASE_BREAK_DELEG; return __break_lease(inode, flags); } return 0; } struct delegated_inode { struct inode *di_inode; }; static inline bool is_delegated(struct delegated_inode *di) { return di->di_inode; } static inline int try_break_deleg(struct inode *inode, struct delegated_inode *di) { int ret; ret = break_deleg(inode, LEASE_BREAK_NONBLOCK); if (ret == -EWOULDBLOCK && di) { di->di_inode = inode; ihold(inode); } return ret; } static inline int break_deleg_wait(struct delegated_inode *di) { int ret; ret = break_deleg(di->di_inode, 0); iput(di->di_inode); di->di_inode = NULL; return ret; } static inline int break_layout(struct inode *inode, bool wait) { struct file_lock_context *flctx; smp_mb(); flctx = locks_inode_context(inode); if (flctx && !list_empty_careful(&flctx->flc_lease)) { unsigned int flags = LEASE_BREAK_LAYOUT; if (!wait) flags |= LEASE_BREAK_NONBLOCK; return __break_lease(inode, flags); } return 0; } #else /* !CONFIG_FILE_LOCKING */ struct delegated_inode { }; static inline bool is_delegated(struct delegated_inode *di) { return false; } static inline int break_lease(struct inode *inode, bool wait) { return 0; } static inline int break_deleg(struct inode *inode, unsigned int flags) { return 0; } static inline int try_break_deleg(struct inode *inode, struct delegated_inode *delegated_inode) { return 0; } static inline int break_deleg_wait(struct delegated_inode *delegated_inode) { BUG(); return 0; } static inline int break_layout(struct inode *inode, bool wait) { return 0; } #endif /* CONFIG_FILE_LOCKING */ #endif /* _LINUX_FILELOCK_H */ |
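/*
 * Editorial illustration (not part of the header above): a sketch of how a
 * filesystem might walk the POSIX locks on an inode using the helpers from
 * linux/filelock.h.  The function name count_posix_locks() is invented for
 * this example; the locking rule (hold flc_lock while walking flc_posix)
 * follows fs/locks.c.
 */
static int count_posix_locks(struct inode *inode)
{
	struct file_lock_context *flctx = locks_inode_context(inode);
	struct file_lock *fl;
	int n = 0;

	if (!flctx)
		return 0;

	spin_lock(&flctx->flc_lock);
	for_each_file_lock(fl, &flctx->flc_posix) {
		/* count both read and write POSIX locks */
		if (lock_is_read(fl) || lock_is_write(fl))
			n++;
	}
	spin_unlock(&flctx->flc_lock);
	return n;
}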
// SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/exec.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* * #!-checking implemented by tytso. */ /* * Demand-loading implemented 01.12.91 - no need to read anything but * the header into memory. The inode of the executable is put into * "current->executable", and page faults do the actual loading. Clean. * * Once more I can proudly say that linux stood up to being changed: it * was less than 2 hours work to get demand-loading completely implemented. * * Demand loading changed July 1993 by Eric Youngdale. Use mmap instead, * current->executable is only used by the procfs. This allows a dispatch * table to check for several different types of binary formats. We keep * trying until we recognize the file or we run out of supported binary * formats. 
*/ #include <linux/kernel_read_file.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/mm.h> #include <linux/stat.h> #include <linux/fcntl.h> #include <linux/swap.h> #include <linux/string.h> #include <linux/init.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/signal.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> #include <linux/pagemap.h> #include <linux/perf_event.h> #include <linux/highmem.h> #include <linux/spinlock.h> #include <linux/key.h> #include <linux/personality.h> #include <linux/binfmts.h> #include <linux/utsname.h> #include <linux/pid_namespace.h> #include <linux/module.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/audit.h> #include <linux/kmod.h> #include <linux/fsnotify.h> #include <linux/fs_struct.h> #include <linux/oom.h> #include <linux/compat.h> #include <linux/vmalloc.h> #include <linux/io_uring.h> #include <linux/syscall_user_dispatch.h> #include <linux/coredump.h> #include <linux/time_namespace.h> #include <linux/user_events.h> #include <linux/rseq.h> #include <linux/ksm.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> #include <asm/tlb.h> #include <trace/events/task.h> #include "internal.h" #include <trace/events/sched.h> /* For vma exec functions. */ #include "../mm/internal.h" static int bprm_creds_from_file(struct linux_binprm *bprm); int suid_dumpable = 0; static LIST_HEAD(formats); static DEFINE_RWLOCK(binfmt_lock); void __register_binfmt(struct linux_binfmt * fmt, int insert) { write_lock(&binfmt_lock); insert ? list_add(&fmt->lh, &formats) : list_add_tail(&fmt->lh, &formats); write_unlock(&binfmt_lock); } EXPORT_SYMBOL(__register_binfmt); void unregister_binfmt(struct linux_binfmt * fmt) { write_lock(&binfmt_lock); list_del(&fmt->lh); write_unlock(&binfmt_lock); } EXPORT_SYMBOL(unregister_binfmt); static inline void put_binfmt(struct linux_binfmt * fmt) { module_put(fmt->module); } bool path_noexec(const struct path *path) { /* If it's an anonymous inode make sure that we catch any shenanigans. */ VFS_WARN_ON_ONCE(IS_ANON_FILE(d_inode(path->dentry)) && !(path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC)); return (path->mnt->mnt_flags & MNT_NOEXEC) || (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC); } #ifdef CONFIG_MMU /* * The nascent bprm->mm is not visible until exec_mmap() but it can * use a lot of memory, account these pages in current->mm temporary * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we * change the counter back via acct_arg_size(0). */ static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { struct mm_struct *mm = current->mm; long diff = (long)(pages - bprm->vma_pages); if (!mm || !diff) return; bprm->vma_pages = pages; add_mm_counter(mm, MM_ANONPAGES, diff); } static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; struct vm_area_struct *vma = bprm->vma; struct mm_struct *mm = bprm->mm; int ret; /* * Avoid relying on expanding the stack down in GUP (which * does not work for STACK_GROWSUP anyway), and just do it * ahead of time. */ if (!mmap_read_lock_maybe_expand(mm, vma, pos, write)) return NULL; /* * We are doing an exec(). 'current' is the process * doing the exec and 'mm' is the new process's mm. */ ret = get_user_pages_remote(mm, pos, 1, write ? 
FOLL_WRITE : 0, &page, NULL); mmap_read_unlock(mm); if (ret <= 0) return NULL; if (write) acct_arg_size(bprm, vma_pages(vma)); return page; } static void put_arg_page(struct page *page) { put_page(page); } static void free_arg_pages(struct linux_binprm *bprm) { } static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, struct page *page) { flush_cache_page(bprm->vma, pos, page_to_pfn(page)); } static bool valid_arg_len(struct linux_binprm *bprm, long len) { return len <= MAX_ARG_STRLEN; } #else static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { } static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; page = bprm->page[pos / PAGE_SIZE]; if (!page && write) { page = alloc_page(GFP_HIGHUSER|__GFP_ZERO); if (!page) return NULL; bprm->page[pos / PAGE_SIZE] = page; } return page; } static void put_arg_page(struct page *page) { } static void free_arg_page(struct linux_binprm *bprm, int i) { if (bprm->page[i]) { __free_page(bprm->page[i]); bprm->page[i] = NULL; } } static void free_arg_pages(struct linux_binprm *bprm) { int i; for (i = 0; i < MAX_ARG_PAGES; i++) free_arg_page(bprm, i); } static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, struct page *page) { } static bool valid_arg_len(struct linux_binprm *bprm, long len) { return len <= bprm->p; } #endif /* CONFIG_MMU */ /* * Create a new mm_struct and populate it with a temporary stack * vm_area_struct. We don't have enough context at this point to set the stack * flags, permissions, and offset, so we use temporary values. We'll update * them later in setup_arg_pages(). */ static int bprm_mm_init(struct linux_binprm *bprm) { int err; struct mm_struct *mm = NULL; bprm->mm = mm = mm_alloc(); err = -ENOMEM; if (!mm) goto err; /* Save current stack limit for all calculations made during exec. */ task_lock(current->group_leader); bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK]; task_unlock(current->group_leader); #ifndef CONFIG_MMU bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *); #else err = create_init_stack_vma(bprm->mm, &bprm->vma, &bprm->p); if (err) goto err; #endif return 0; err: if (mm) { bprm->mm = NULL; mmdrop(mm); } return err; } struct user_arg_ptr { #ifdef CONFIG_COMPAT bool is_compat; #endif union { const char __user *const __user *native; #ifdef CONFIG_COMPAT const compat_uptr_t __user *compat; #endif } ptr; }; static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr) { const char __user *native; #ifdef CONFIG_COMPAT if (unlikely(argv.is_compat)) { compat_uptr_t compat; if (get_user(compat, argv.ptr.compat + nr)) return ERR_PTR(-EFAULT); return compat_ptr(compat); } #endif if (get_user(native, argv.ptr.native + nr)) return ERR_PTR(-EFAULT); return native; } /* * count() counts the number of strings in array ARGV. 
*/ static int count(struct user_arg_ptr argv, int max) { int i = 0; if (argv.ptr.native != NULL) { for (;;) { const char __user *p = get_user_arg_ptr(argv, i); if (!p) break; if (IS_ERR(p)) return -EFAULT; if (i >= max) return -E2BIG; ++i; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } } return i; } static int count_strings_kernel(const char *const *argv) { int i; if (!argv) return 0; for (i = 0; argv[i]; ++i) { if (i >= MAX_ARG_STRINGS) return -E2BIG; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } return i; } static inline int bprm_set_stack_limit(struct linux_binprm *bprm, unsigned long limit) { #ifdef CONFIG_MMU /* Avoid a pathological bprm->p. */ if (bprm->p < limit) return -E2BIG; bprm->argmin = bprm->p - limit; #endif return 0; } static inline bool bprm_hit_stack_limit(struct linux_binprm *bprm) { #ifdef CONFIG_MMU return bprm->p < bprm->argmin; #else return false; #endif } /* * Calculate bprm->argmin from: * - _STK_LIM * - ARG_MAX * - bprm->rlim_stack.rlim_cur * - bprm->argc * - bprm->envc * - bprm->p */ static int bprm_stack_limits(struct linux_binprm *bprm) { unsigned long limit, ptr_size; /* * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM * (whichever is smaller) for the argv+env strings. * This ensures that: * - the remaining binfmt code will not run out of stack space, * - the program will have a reasonable amount of stack left * to work from. */ limit = _STK_LIM / 4 * 3; limit = min(limit, bprm->rlim_stack.rlim_cur / 4); /* * We've historically supported up to 32 pages (ARG_MAX) * of argument strings even with small stacks */ limit = max_t(unsigned long, limit, ARG_MAX); /* Reject totally pathological counts. */ if (bprm->argc < 0 || bprm->envc < 0) return -E2BIG; /* * We must account for the size of all the argv and envp pointers to * the argv and envp strings, since they will also take up space in * the stack. They aren't stored until much later when we can't * signal to the parent that the child has run out of stack space. * Instead, calculate it here so it's possible to fail gracefully. * * In the case of argc = 0, make sure there is space for adding a * empty string (which will bump argc to 1), to ensure confused * userspace programs don't start processing from argv[1], thinking * argc can never be 0, to keep them from walking envp by accident. * See do_execveat_common(). */ if (check_add_overflow(max(bprm->argc, 1), bprm->envc, &ptr_size) || check_mul_overflow(ptr_size, sizeof(void *), &ptr_size)) return -E2BIG; if (limit <= ptr_size) return -E2BIG; limit -= ptr_size; return bprm_set_stack_limit(bprm, limit); } /* * 'copy_strings()' copies argument/environment strings from the old * processes's memory to the new process's stack. The call to get_user_pages() * ensures the destination page is created and not swapped out. */ static int copy_strings(int argc, struct user_arg_ptr argv, struct linux_binprm *bprm) { struct page *kmapped_page = NULL; char *kaddr = NULL; unsigned long kpos = 0; int ret; while (argc-- > 0) { const char __user *str; int len; unsigned long pos; ret = -EFAULT; str = get_user_arg_ptr(argv, argc); if (IS_ERR(str)) goto out; len = strnlen_user(str, MAX_ARG_STRLEN); if (!len) goto out; ret = -E2BIG; if (!valid_arg_len(bprm, len)) goto out; /* We're going to work our way backwards. 
*/ pos = bprm->p; str += len; bprm->p -= len; if (bprm_hit_stack_limit(bprm)) goto out; while (len > 0) { int offset, bytes_to_copy; if (fatal_signal_pending(current)) { ret = -ERESTARTNOHAND; goto out; } cond_resched(); offset = pos % PAGE_SIZE; if (offset == 0) offset = PAGE_SIZE; bytes_to_copy = offset; if (bytes_to_copy > len) bytes_to_copy = len; offset -= bytes_to_copy; pos -= bytes_to_copy; str -= bytes_to_copy; len -= bytes_to_copy; if (!kmapped_page || kpos != (pos & PAGE_MASK)) { struct page *page; page = get_arg_page(bprm, pos, 1); if (!page) { ret = -E2BIG; goto out; } if (kmapped_page) { flush_dcache_page(kmapped_page); kunmap_local(kaddr); put_arg_page(kmapped_page); } kmapped_page = page; kaddr = kmap_local_page(kmapped_page); kpos = pos & PAGE_MASK; flush_arg_page(bprm, kpos, kmapped_page); } if (copy_from_user(kaddr+offset, str, bytes_to_copy)) { ret = -EFAULT; goto out; } } } ret = 0; out: if (kmapped_page) { flush_dcache_page(kmapped_page); kunmap_local(kaddr); put_arg_page(kmapped_page); } return ret; } /* * Copy and argument/environment string from the kernel to the processes stack. */ int copy_string_kernel(const char *arg, struct linux_binprm *bprm) { int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */; unsigned long pos = bprm->p; if (len == 0) return -EFAULT; if (!valid_arg_len(bprm, len)) return -E2BIG; /* We're going to work our way backwards. */ arg += len; bprm->p -= len; if (bprm_hit_stack_limit(bprm)) return -E2BIG; while (len > 0) { unsigned int bytes_to_copy = min(len, min_not_zero(offset_in_page(pos), PAGE_SIZE)); struct page *page; pos -= bytes_to_copy; arg -= bytes_to_copy; len -= bytes_to_copy; page = get_arg_page(bprm, pos, 1); if (!page) return -E2BIG; flush_arg_page(bprm, pos & PAGE_MASK, page); memcpy_to_page(page, offset_in_page(pos), arg, bytes_to_copy); put_arg_page(page); } return 0; } EXPORT_SYMBOL(copy_string_kernel); static int copy_strings_kernel(int argc, const char *const *argv, struct linux_binprm *bprm) { while (argc-- > 0) { int ret = copy_string_kernel(argv[argc], bprm); if (ret < 0) return ret; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } return 0; } #ifdef CONFIG_MMU /* * Finalizes the stack vm_area_struct. The flags and permissions are updated, * the stack is optionally relocated, and some extra space is added. */ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack) { int ret; unsigned long stack_shift; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = bprm->vma; struct vm_area_struct *prev = NULL; vm_flags_t vm_flags; unsigned long stack_base; unsigned long stack_size; unsigned long stack_expand; unsigned long rlim_stack; struct mmu_gather tlb; struct vma_iterator vmi; #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ stack_base = bprm->rlim_stack.rlim_max; stack_base = calc_max_stack_size(stack_base); /* Add space for stack randomization. */ if (current->flags & PF_RANDOMIZE) stack_base += (STACK_RND_MASK << PAGE_SHIFT); /* Make sure we didn't let the argument array grow too large. 
*/ if (vma->vm_end - vma->vm_start > stack_base) return -ENOMEM; stack_base = PAGE_ALIGN(stack_top - stack_base); stack_shift = vma->vm_start - stack_base; mm->arg_start = bprm->p - stack_shift; bprm->p = vma->vm_end - stack_shift; #else stack_top = arch_align_stack(stack_top); stack_top = PAGE_ALIGN(stack_top); if (unlikely(stack_top < mmap_min_addr) || unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr)) return -ENOMEM; stack_shift = vma->vm_end - stack_top; bprm->p -= stack_shift; mm->arg_start = bprm->p; #endif bprm->exec -= stack_shift; if (mmap_write_lock_killable(mm)) return -EINTR; vm_flags = VM_STACK_FLAGS; /* * Adjust stack execute permissions; explicitly enable for * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone * (arch default) otherwise. */ if (unlikely(executable_stack == EXSTACK_ENABLE_X)) vm_flags |= VM_EXEC; else if (executable_stack == EXSTACK_DISABLE_X) vm_flags &= ~VM_EXEC; vm_flags |= mm->def_flags; vm_flags |= VM_STACK_INCOMPLETE_SETUP; vma_iter_init(&vmi, mm, vma->vm_start); tlb_gather_mmu(&tlb, mm); ret = mprotect_fixup(&vmi, &tlb, vma, &prev, vma->vm_start, vma->vm_end, vm_flags); tlb_finish_mmu(&tlb); if (ret) goto out_unlock; BUG_ON(prev != vma); if (unlikely(vm_flags & VM_EXEC)) { pr_warn_once("process '%pD4' started with executable stack\n", bprm->file); } /* Move stack pages down in memory. */ if (stack_shift) { /* * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once * the binfmt code determines where the new stack should reside, we shift it to * its final location. */ ret = relocate_vma_down(vma, stack_shift); if (ret) goto out_unlock; } /* mprotect_fixup is overkill to remove the temporary stack flags */ vm_flags_clear(vma, VM_STACK_INCOMPLETE_SETUP); stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ stack_size = vma->vm_end - vma->vm_start; /* * Align this down to a page boundary as expand_stack * will align it up. */ rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK; stack_expand = min(rlim_stack, stack_size + stack_expand); #ifdef CONFIG_STACK_GROWSUP stack_base = vma->vm_start + stack_expand; #else stack_base = vma->vm_end - stack_expand; #endif current->mm->start_stack = bprm->p; ret = expand_stack_locked(vma, stack_base); if (ret) ret = -EFAULT; out_unlock: mmap_write_unlock(mm); return ret; } EXPORT_SYMBOL(setup_arg_pages); #else /* * Transfer the program arguments and environment from the holding pages * onto the stack. The provided stack pointer is adjusted accordingly. */ int transfer_args_to_stack(struct linux_binprm *bprm, unsigned long *sp_location) { unsigned long index, stop, sp; int ret = 0; stop = bprm->p >> PAGE_SHIFT; sp = *sp_location; for (index = MAX_ARG_PAGES - 1; index >= stop; index--) { unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0; char *src = kmap_local_page(bprm->page[index]) + offset; sp -= PAGE_SIZE - offset; if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0) ret = -EFAULT; kunmap_local(src); if (ret) goto out; } bprm->exec += *sp_location - MAX_ARG_PAGES * PAGE_SIZE; *sp_location = sp; out: return ret; } EXPORT_SYMBOL(transfer_args_to_stack); #endif /* CONFIG_MMU */ /* * On success, caller must call do_close_execat() on the returned * struct file to close it. 
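 *
 * Illustrative pairing (condensed sketch; see alloc_bprm() and free_bprm()
 * below for the real callers):
 *
 *	file = do_open_execat(fd, name, flags);
 *	if (IS_ERR(file))
 *		return ERR_CAST(file);
 *	...
 *	do_close_execat(file);	(re-allows write access, then fput()s the file)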
*/ static struct file *do_open_execat(int fd, struct filename *name, int flags) { int err; struct file *file __free(fput) = NULL; struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, }; if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH | AT_EXECVE_CHECK)) != 0) return ERR_PTR(-EINVAL); if (flags & AT_SYMLINK_NOFOLLOW) open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW; file = do_file_open(fd, name, &open_exec_flags); if (IS_ERR(file)) return file; if (path_noexec(&file->f_path)) return ERR_PTR(-EACCES); /* * In the past the regular type check was here. It moved to may_open() in * 633fb6ac3980 ("exec: move S_ISREG() check earlier"). Since then it is * an invariant that all non-regular files error out before we get here. */ if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode))) return ERR_PTR(-EACCES); err = exe_file_deny_write_access(file); if (err) return ERR_PTR(err); return no_free_ptr(file); } /** * open_exec - Open a path name for execution * * @name: path name to open with the intent of executing it. * * Returns ERR_PTR on failure or allocated struct file on success. * * As this is a wrapper for the internal do_open_execat(), callers * must call exe_file_allow_write_access() before fput() on release. Also see * do_close_execat(). */ struct file *open_exec(const char *name) { CLASS(filename_kernel, filename)(name); return do_open_execat(AT_FDCWD, filename, 0); } EXPORT_SYMBOL(open_exec); #if defined(CONFIG_BINFMT_FLAT) || defined(CONFIG_BINFMT_ELF_FDPIC) ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) { ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); if (res > 0) flush_icache_user_range(addr, addr + len); return res; } EXPORT_SYMBOL(read_code); #endif /* * Maps the mm_struct mm into the current task struct. * On success, this function returns with exec_update_lock * held for writing. */ static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; struct mm_struct *old_mm, *active_mm; int ret; /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; exec_mm_release(tsk, old_mm); ret = down_write_killable(&tsk->signal->exec_update_lock); if (ret) return ret; if (old_mm) { /* * If there is a pending fatal signal perhaps a signal * whose default action is to create a coredump get * out and die instead of going through with the exec. */ ret = mmap_read_lock_killable(old_mm); if (ret) { up_write(&tsk->signal->exec_update_lock); return ret; } } task_lock(tsk); membarrier_exec_mmap(mm); local_irq_disable(); active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; mm_init_cid(mm, tsk); /* * This prevents preemption while active_mm is being loaded and * it and mm are being updated, which could cause problems for * lazy tlb mm refcounting when these are updated by context * switches. Not all architectures can handle irqs off over * activate_mm yet. 
*/ if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); activate_mm(active_mm, mm); if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); lru_gen_add_mm(mm); task_unlock(tsk); lru_gen_use_mm(mm); if (old_mm) { mmap_read_unlock(old_mm); BUG_ON(active_mm != old_mm); setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm); mm_update_next_owner(old_mm); mmput(old_mm); return 0; } mmdrop_lazy_tlb(active_mm); return 0; } static int de_thread(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; struct sighand_struct *oldsighand = tsk->sighand; spinlock_t *lock = &oldsighand->siglock; if (thread_group_empty(tsk)) goto no_thread_group; /* * Kill all other threads in the thread group. */ spin_lock_irq(lock); if ((sig->flags & SIGNAL_GROUP_EXIT) || sig->group_exec_task) { /* * Another group action in progress, just * return so that the signal is processed. */ spin_unlock_irq(lock); return -EAGAIN; } sig->group_exec_task = tsk; sig->notify_count = zap_other_threads(tsk); if (!thread_group_leader(tsk)) sig->notify_count--; while (sig->notify_count) { __set_current_state(TASK_KILLABLE); spin_unlock_irq(lock); schedule(); if (__fatal_signal_pending(tsk)) goto killed; spin_lock_irq(lock); } spin_unlock_irq(lock); /* * At this point all other threads have exited, all we have to * do is to wait for the thread group leader to become inactive, * and to assume its PID: */ if (!thread_group_leader(tsk)) { struct task_struct *leader = tsk->group_leader; for (;;) { cgroup_threadgroup_change_begin(tsk); write_lock_irq(&tasklist_lock); /* * Do this under tasklist_lock to ensure that * exit_notify() can't miss ->group_exec_task */ sig->notify_count = -1; if (likely(leader->exit_state)) break; __set_current_state(TASK_KILLABLE); write_unlock_irq(&tasklist_lock); cgroup_threadgroup_change_end(tsk); schedule(); if (__fatal_signal_pending(tsk)) goto killed; } /* * The only record we have of the real-time age of a * process, regardless of execs it's done, is start_time. * All the past CPU time is accumulated in signal_struct * from sister threads now dead. But in this non-leader * exec, nothing survives from the original leader thread, * whose birth marks the true age of this process now. * When we take on its identity by switching to its PID, we * also take its birthdate (always earlier than our own). */ tsk->start_time = leader->start_time; tsk->start_boottime = leader->start_boottime; BUG_ON(!same_thread_group(leader, tsk)); /* * An exec() starts a new thread group with the * TGID of the previous thread group. Rehash the * two threads with a switched PID, and release * the former thread group leader: */ /* Become a process group leader with the old leader's pid. * The old leader becomes a thread of the this thread group. */ exchange_tids(tsk, leader); transfer_pid(leader, tsk, PIDTYPE_TGID); transfer_pid(leader, tsk, PIDTYPE_PGID); transfer_pid(leader, tsk, PIDTYPE_SID); list_replace_rcu(&leader->tasks, &tsk->tasks); list_replace_init(&leader->sibling, &tsk->sibling); tsk->group_leader = tsk; leader->group_leader = tsk; tsk->exit_signal = SIGCHLD; leader->exit_signal = -1; BUG_ON(leader->exit_state != EXIT_ZOMBIE); leader->exit_state = EXIT_DEAD; /* * We are going to release_task()->ptrace_unlink() silently, * the tracer can sleep in do_wait(). EXIT_DEAD guarantees * the tracer won't block again waiting for this thread. 
*/ if (unlikely(leader->ptrace)) __wake_up_parent(leader, leader->parent); write_unlock_irq(&tasklist_lock); cgroup_threadgroup_change_end(tsk); release_task(leader); } sig->group_exec_task = NULL; sig->notify_count = 0; no_thread_group: /* we have changed execution domain */ tsk->exit_signal = SIGCHLD; BUG_ON(!thread_group_leader(tsk)); return 0; killed: /* protects against exit_notify() and __exit_signal() */ read_lock(&tasklist_lock); sig->group_exec_task = NULL; sig->notify_count = 0; read_unlock(&tasklist_lock); return -EAGAIN; } /* * This function makes sure the current process has its own signal table, * so that flush_signal_handlers can later reset the handlers without * disturbing other processes. (Other processes might share the signal * table via the CLONE_SIGHAND option to clone().) */ static int unshare_sighand(struct task_struct *me) { struct sighand_struct *oldsighand = me->sighand; if (refcount_read(&oldsighand->count) != 1) { struct sighand_struct *newsighand; /* * This ->sighand is shared with the CLONE_SIGHAND * but not CLONE_THREAD task, switch to the new one. */ newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); if (!newsighand) return -ENOMEM; refcount_set(&newsighand->count, 1); write_lock_irq(&tasklist_lock); spin_lock(&oldsighand->siglock); memcpy(newsighand->action, oldsighand->action, sizeof(newsighand->action)); rcu_assign_pointer(me->sighand, newsighand); spin_unlock(&oldsighand->siglock); write_unlock_irq(&tasklist_lock); __cleanup_sighand(oldsighand); } return 0; } /* * This is unlocked -- the string will always be NUL-terminated, but * may show overlapping contents if racing concurrent reads. */ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) { size_t len = strnlen(buf, sizeof(tsk->comm) - 1); trace_task_rename(tsk, buf); memcpy(tsk->comm, buf, len); memset(&tsk->comm[len], 0, sizeof(tsk->comm) - len); perf_event_comm(tsk, exec); } /* * Calling this is the point of no return. None of the failures will be * seen by userspace since either the process is already taking a fatal * signal (via de_thread() or coredump), or will have SEGV raised * (after exec_mmap()) by search_binary_handler (see below). */ int begin_new_exec(struct linux_binprm * bprm) { struct task_struct *me = current; int retval; /* Once we are committed compute the creds */ retval = bprm_creds_from_file(bprm); if (retval) return retval; /* * This tracepoint marks the point before flushing the old exec where * the current task is still unchanged, but errors are fatal (point of * no return). The later "sched_process_exec" tracepoint is called after * the current task has successfully switched to the new exec. */ trace_sched_prepare_exec(current, bprm); /* * Ensure all future errors are fatal. */ bprm->point_of_no_return = true; /* Make this the only thread in the thread group */ retval = de_thread(me); if (retval) goto out; /* see the comment in check_unsafe_exec() */ current->fs->in_exec = 0; /* * Cancel any io_uring activity across execve */ io_uring_task_cancel(); /* Ensure the files table is not shared. */ retval = unshare_files(); if (retval) goto out; /* * Must be called _before_ exec_mmap() as bprm->mm is * not visible until then. Doing it here also ensures * we don't race against replace_mm_exe_file(). 
*/ retval = set_mm_exe_file(bprm->mm, bprm->file); if (retval) goto out; /* If the binary is not readable then enforce mm->dumpable=0 */ would_dump(bprm, bprm->file); if (bprm->have_execfd) would_dump(bprm, bprm->executable); /* * Release all of the old mmap stuff */ acct_arg_size(bprm, 0); retval = exec_mmap(bprm->mm); if (retval) goto out; bprm->mm = NULL; retval = exec_task_namespaces(); if (retval) goto out_unlock; #ifdef CONFIG_POSIX_TIMERS spin_lock_irq(&me->sighand->siglock); posix_cpu_timers_exit(me); spin_unlock_irq(&me->sighand->siglock); exit_itimers(me); flush_itimer_signals(); #endif /* * Make the signal table private. */ retval = unshare_sighand(me); if (retval) goto out_unlock; me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); me->personality &= ~bprm->per_clear; clear_syscall_work_syscall_user_dispatch(me); /* * We have to apply CLOEXEC before we change whether the process is * dumpable (in setup_new_exec) to avoid a race with a process in userspace * trying to access the should-be-closed file descriptors of a process * undergoing exec(2). */ do_close_on_exec(me->files); if (bprm->secureexec) { /* Make sure parent cannot signal privileged process. */ me->pdeath_signal = 0; /* * For secureexec, reset the stack limit to sane default to * avoid bad behavior from the prior rlimits. This has to * happen before arch_pick_mmap_layout(), which examines * RLIMIT_STACK, but after the point of no return to avoid * needing to clean up the change on failure. */ if (bprm->rlim_stack.rlim_cur > _STK_LIM) bprm->rlim_stack.rlim_cur = _STK_LIM; } me->sas_ss_sp = me->sas_ss_size = 0; /* * Figure out dumpability. Note that this checking only of current * is wrong, but userspace depends on it. This should be testing * bprm->secureexec instead. */ if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || !(uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid()))) set_dumpable(current->mm, suid_dumpable); else set_dumpable(current->mm, SUID_DUMP_USER); perf_event_exec(); /* * If the original filename was empty, alloc_bprm() made up a path * that will probably not be useful to admins running ps or similar. * Let's fix it up to be something reasonable. */ if (bprm->comm_from_dentry) { /* * Hold RCU lock to keep the name from being freed behind our back. * Use acquire semantics to make sure the terminating NUL from * __d_alloc() is seen. * * Note, we're deliberately sloppy here. We don't need to care about * detecting a concurrent rename and just want a terminated name. */ rcu_read_lock(); __set_task_comm(me, smp_load_acquire(&bprm->file->f_path.dentry->d_name.name), true); rcu_read_unlock(); } else { __set_task_comm(me, kbasename(bprm->filename), true); } /* An exec changes our domain. We are no longer part of the thread group */ WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1); flush_signal_handlers(me, 0); retval = set_cred_ucounts(bprm->cred); if (retval < 0) goto out_unlock; /* * install the new credentials for this executable */ security_bprm_committing_creds(bprm); commit_creds(bprm->cred); bprm->cred = NULL; /* * Disable monitoring for regular users * when executing setuid binaries. Must * wait until new credentials are committed * by commit_creds() above */ if (get_dumpable(me->mm) != SUID_DUMP_USER) perf_event_exit_task(me); /* * cred_guard_mutex must be held at least to this point to prevent * ptrace_attach() from altering our determination of the task's * credentials; any time after this it may be unlocked. 
*/ security_bprm_committed_creds(bprm); /* Pass the opened binary to the interpreter. */ if (bprm->have_execfd) { retval = FD_ADD(0, bprm->executable); if (retval < 0) goto out_unlock; bprm->executable = NULL; bprm->execfd = retval; } return 0; out_unlock: up_write(&me->signal->exec_update_lock); if (!bprm->cred) mutex_unlock(&me->signal->cred_guard_mutex); out: return retval; } EXPORT_SYMBOL(begin_new_exec); void would_dump(struct linux_binprm *bprm, struct file *file) { struct inode *inode = file_inode(file); struct mnt_idmap *idmap = file_mnt_idmap(file); if (inode_permission(idmap, inode, MAY_READ) < 0) { struct user_namespace *old, *user_ns; bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; /* Ensure mm->user_ns contains the executable */ user_ns = old = bprm->mm->user_ns; while ((user_ns != &init_user_ns) && !privileged_wrt_inode_uidgid(user_ns, idmap, inode)) user_ns = user_ns->parent; if (old != user_ns) { bprm->mm->user_ns = get_user_ns(user_ns); put_user_ns(old); } } } EXPORT_SYMBOL(would_dump); void setup_new_exec(struct linux_binprm * bprm) { /* Setup things that can depend upon the personality */ struct task_struct *me = current; arch_pick_mmap_layout(me->mm, &bprm->rlim_stack); arch_setup_new_exec(); /* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on * some architectures like powerpc */ me->mm->task_size = TASK_SIZE; up_write(&me->signal->exec_update_lock); mutex_unlock(&me->signal->cred_guard_mutex); } EXPORT_SYMBOL(setup_new_exec); /* Runs immediately before start_thread() takes over. */ void finalize_exec(struct linux_binprm *bprm) { /* Store any stack rlimit changes before starting thread. */ task_lock(current->group_leader); current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack; task_unlock(current->group_leader); } EXPORT_SYMBOL(finalize_exec); /* * Prepare credentials and lock ->cred_guard_mutex. * setup_new_exec() commits the new creds and drops the lock. * Or, if exec fails before, free_bprm() should release ->cred * and unlock. */ static int prepare_bprm_creds(struct linux_binprm *bprm) { if (mutex_lock_interruptible(&current->signal->cred_guard_mutex)) return -ERESTARTNOINTR; bprm->cred = prepare_exec_creds(); if (likely(bprm->cred)) return 0; mutex_unlock(&current->signal->cred_guard_mutex); return -ENOMEM; } /* Matches do_open_execat() */ static void do_close_execat(struct file *file) { if (!file) return; exe_file_allow_write_access(file); fput(file); } static void free_bprm(struct linux_binprm *bprm) { if (bprm->mm) { acct_arg_size(bprm, 0); mmput(bprm->mm); } free_arg_pages(bprm); if (bprm->cred) { /* in case exec fails before de_thread() succeeds */ current->fs->in_exec = 0; mutex_unlock(&current->signal->cred_guard_mutex); abort_creds(bprm->cred); } do_close_execat(bprm->file); if (bprm->executable) fput(bprm->executable); /* If a binfmt changed the interp, free it.
*/ if (bprm->interp != bprm->filename) kfree(bprm->interp); kfree(bprm->fdpath); kfree(bprm); } static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags) { struct linux_binprm *bprm; struct file *file; int retval = -ENOMEM; file = do_open_execat(fd, filename, flags); if (IS_ERR(file)) return ERR_CAST(file); bprm = kzalloc_obj(*bprm); if (!bprm) { do_close_execat(file); return ERR_PTR(-ENOMEM); } bprm->file = file; if (fd == AT_FDCWD || filename->name[0] == '/') { bprm->filename = filename->name; } else { if (filename->name[0] == '\0') { bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd); bprm->comm_from_dentry = 1; } else { bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s", fd, filename->name); } if (!bprm->fdpath) goto out_free; /* * Record that a name derived from an O_CLOEXEC fd will be * inaccessible after exec. This allows the code in exec to * choose to fail when the executable is not mmaped into the * interpreter and an open file descriptor is not passed to * the interpreter. This makes for a better user experience * than having the interpreter start and then immediately fail * when it finds the executable is inaccessible. */ if (get_close_on_exec(fd)) bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; bprm->filename = bprm->fdpath; } bprm->interp = bprm->filename; /* * At this point, security_file_open() has already been called (with * __FMODE_EXEC) and access control checks for AT_EXECVE_CHECK will * stop just after the security_bprm_creds_for_exec() call in * bprm_execve(). Indeed, the kernel should not try to parse the * content of the file with exec_binprm() nor change the calling * thread, which means that the following security functions will not * be called: * - security_bprm_check() * - security_bprm_creds_from_file() * - security_bprm_committing_creds() * - security_bprm_committed_creds() */ bprm->is_check = !!(flags & AT_EXECVE_CHECK); retval = bprm_mm_init(bprm); if (!retval) return bprm; out_free: free_bprm(bprm); return ERR_PTR(retval); } DEFINE_CLASS(bprm, struct linux_binprm *, if (!IS_ERR(_T)) free_bprm(_T), alloc_bprm(fd, name, flags), int fd, struct filename *name, int flags) int bprm_change_interp(const char *interp, struct linux_binprm *bprm) { /* If a binfmt changed the interp, free it first. */ if (bprm->interp != bprm->filename) kfree(bprm->interp); bprm->interp = kstrdup(interp, GFP_KERNEL); if (!bprm->interp) return -ENOMEM; return 0; } EXPORT_SYMBOL(bprm_change_interp); /* * determine how safe it is to execute the proposed program * - the caller must hold ->cred_guard_mutex to protect against * PTRACE_ATTACH or seccomp thread-sync */ static void check_unsafe_exec(struct linux_binprm *bprm) { struct task_struct *p = current, *t; unsigned n_fs; if (p->ptrace) bprm->unsafe |= LSM_UNSAFE_PTRACE; /* * This isn't strictly necessary, but it makes it harder for LSMs to * mess up. */ if (task_no_new_privs(current)) bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; /* * If another task is sharing our fs, we cannot safely * suid exec because the differently privileged task * will be able to manipulate the current directory, etc. * It would be nice to force an unshare instead... * * Otherwise we set fs->in_exec = 1 to deny clone(CLONE_FS) * from another sub-thread until de_thread() succeeds, this * state is protected by cred_guard_mutex we hold. 
*/ n_fs = 1; read_seqlock_excl(&p->fs->seq); rcu_read_lock(); for_other_threads(p, t) { if (t->fs == p->fs) n_fs++; } rcu_read_unlock(); /* "users" and "in_exec" locked for copy_fs() */ if (p->fs->users > n_fs) bprm->unsafe |= LSM_UNSAFE_SHARE; else p->fs->in_exec = 1; read_sequnlock_excl(&p->fs->seq); } static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) { /* Handle suid and sgid on files */ struct mnt_idmap *idmap; struct inode *inode = file_inode(file); unsigned int mode; vfsuid_t vfsuid; vfsgid_t vfsgid; int err; if (!mnt_may_suid(file->f_path.mnt)) return; if (task_no_new_privs(current)) return; mode = READ_ONCE(inode->i_mode); if (!(mode & (S_ISUID|S_ISGID))) return; idmap = file_mnt_idmap(file); /* Be careful if suid/sgid is set */ inode_lock(inode); /* Atomically reload and check mode/uid/gid now that lock held. */ mode = inode->i_mode; vfsuid = i_uid_into_vfsuid(idmap, inode); vfsgid = i_gid_into_vfsgid(idmap, inode); err = inode_permission(idmap, inode, MAY_EXEC); inode_unlock(inode); /* Did the exec bit vanish out from under us? Give up. */ if (err) return; /* We ignore suid/sgid if there are no mappings for them in the ns */ if (!vfsuid_has_mapping(bprm->cred->user_ns, vfsuid) || !vfsgid_has_mapping(bprm->cred->user_ns, vfsgid)) return; if (mode & S_ISUID) { bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->euid = vfsuid_into_kuid(vfsuid); } if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->egid = vfsgid_into_kgid(vfsgid); } } /* * Compute brpm->cred based upon the final binary. */ static int bprm_creds_from_file(struct linux_binprm *bprm) { /* Compute creds based on which file? */ struct file *file = bprm->execfd_creds ? bprm->executable : bprm->file; bprm_fill_uid(bprm, file); return security_bprm_creds_from_file(bprm, file); } /* * Fill the binprm structure from the inode. * Read the first BINPRM_BUF_SIZE bytes * * This may be called multiple times for binary chains (scripts for example). */ static int prepare_binprm(struct linux_binprm *bprm) { loff_t pos = 0; memset(bprm->buf, 0, BINPRM_BUF_SIZE); return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos); } /* * Arguments are '\0' separated strings found at the location bprm->p * points to; chop off the first by relocating brpm->p to right after * the first '\0' encountered. 
*/ int remove_arg_zero(struct linux_binprm *bprm) { unsigned long offset; char *kaddr; struct page *page; if (!bprm->argc) return 0; do { offset = bprm->p & ~PAGE_MASK; page = get_arg_page(bprm, bprm->p, 0); if (!page) return -EFAULT; kaddr = kmap_local_page(page); for (; offset < PAGE_SIZE && kaddr[offset]; offset++, bprm->p++) ; kunmap_local(kaddr); put_arg_page(page); } while (offset == PAGE_SIZE); bprm->p++; bprm->argc--; return 0; } EXPORT_SYMBOL(remove_arg_zero); /* * cycle the list of binary formats handler, until one recognizes the image */ static int search_binary_handler(struct linux_binprm *bprm) { struct linux_binfmt *fmt; int retval; retval = prepare_binprm(bprm); if (retval < 0) return retval; retval = security_bprm_check(bprm); if (retval) return retval; read_lock(&binfmt_lock); list_for_each_entry(fmt, &formats, lh) { if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); retval = fmt->load_binary(bprm); read_lock(&binfmt_lock); put_binfmt(fmt); if (bprm->point_of_no_return || (retval != -ENOEXEC)) { read_unlock(&binfmt_lock); return retval; } } read_unlock(&binfmt_lock); return -ENOEXEC; } /* binfmt handlers will call back into begin_new_exec() on success. */ static int exec_binprm(struct linux_binprm *bprm) { pid_t old_pid, old_vpid; int ret, depth; /* Need to fetch pid before load_binary changes it */ old_pid = current->pid; rcu_read_lock(); old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent)); rcu_read_unlock(); /* This allows 4 levels of binfmt rewrites before failing hard. */ for (depth = 0;; depth++) { struct file *exec; if (depth > 5) return -ELOOP; ret = search_binary_handler(bprm); if (ret < 0) return ret; if (!bprm->interpreter) break; exec = bprm->file; bprm->file = bprm->interpreter; bprm->interpreter = NULL; exe_file_allow_write_access(exec); if (unlikely(bprm->have_execfd)) { if (bprm->executable) { fput(exec); return -ENOEXEC; } bprm->executable = exec; } else fput(exec); } audit_bprm(bprm); trace_sched_process_exec(current, old_pid, bprm); ptrace_event(PTRACE_EVENT_EXEC, old_vpid); proc_exec_connector(current); return 0; } static int bprm_execve(struct linux_binprm *bprm) { int retval; retval = prepare_bprm_creds(bprm); if (retval) return retval; /* * Check for unsafe execution states before exec_binprm(), which * will call back into begin_new_exec(), into bprm_creds_from_file(), * where setuid-ness is evaluated. */ check_unsafe_exec(bprm); current->in_execve = 1; sched_mm_cid_before_execve(current); sched_exec(); /* Set the unchanging part of bprm->cred */ retval = security_bprm_creds_for_exec(bprm); if (retval || bprm->is_check) goto out; retval = exec_binprm(bprm); if (retval < 0) goto out; sched_mm_cid_after_execve(current); rseq_execve(current); /* execve succeeded */ current->in_execve = 0; user_events_execve(current); acct_update_integrals(current); task_numa_free(current, false); return retval; out: /* * If past the point of no return ensure the code never * returns to the userspace process. Use an existing fatal * signal if present otherwise terminate the process with * SIGSEGV. 
*/ if (bprm->point_of_no_return && !fatal_signal_pending(current)) force_fatal_sig(SIGSEGV); sched_mm_cid_after_execve(current); rseq_force_update(); current->in_execve = 0; return retval; } static int do_execveat_common(int fd, struct filename *filename, struct user_arg_ptr argv, struct user_arg_ptr envp, int flags) { int retval; /* * We move the actual failure in case of RLIMIT_NPROC excess from * set*uid() to execve() because too many poorly written programs * don't check setuid() return code. Here we additionally recheck * whether NPROC limit is still exceeded. */ if ((current->flags & PF_NPROC_EXCEEDED) && is_rlimit_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) return -EAGAIN; /* We're below the limit (still or again), so we don't want to make * further execve() calls fail. */ current->flags &= ~PF_NPROC_EXCEEDED; CLASS(bprm, bprm)(fd, filename, flags); if (IS_ERR(bprm)) return PTR_ERR(bprm); retval = count(argv, MAX_ARG_STRINGS); if (retval < 0) return retval; bprm->argc = retval; retval = count(envp, MAX_ARG_STRINGS); if (retval < 0) return retval; bprm->envc = retval; retval = bprm_stack_limits(bprm); if (retval < 0) return retval; retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) return retval; bprm->exec = bprm->p; retval = copy_strings(bprm->envc, envp, bprm); if (retval < 0) return retval; retval = copy_strings(bprm->argc, argv, bprm); if (retval < 0) return retval; /* * When argv is empty, add an empty string ("") as argv[0] to * ensure confused userspace programs that start processing * from argv[1] won't end up walking envp. See also * bprm_stack_limits(). */ if (bprm->argc == 0) { retval = copy_string_kernel("", bprm); if (retval < 0) return retval; bprm->argc = 1; pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n", current->comm, bprm->filename); } return bprm_execve(bprm); } int kernel_execve(const char *kernel_filename, const char *const *argv, const char *const *envp) { int retval; /* It is non-sense for kernel threads to call execve */ if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) return -EINVAL; CLASS(filename_kernel, filename)(kernel_filename); CLASS(bprm, bprm)(AT_FDCWD, filename, 0); if (IS_ERR(bprm)) return PTR_ERR(bprm); retval = count_strings_kernel(argv); if (WARN_ON_ONCE(retval == 0)) return -EINVAL; if (retval < 0) return retval; bprm->argc = retval; retval = count_strings_kernel(envp); if (retval < 0) return retval; bprm->envc = retval; retval = bprm_stack_limits(bprm); if (retval < 0) return retval; retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) return retval; bprm->exec = bprm->p; retval = copy_strings_kernel(bprm->envc, envp, bprm); if (retval < 0) return retval; retval = copy_strings_kernel(bprm->argc, argv, bprm); if (retval < 0) return retval; return bprm_execve(bprm); } void set_binfmt(struct linux_binfmt *new) { struct mm_struct *mm = current->mm; if (mm->binfmt) module_put(mm->binfmt->module); mm->binfmt = new; if (new) __module_get(new->module); } EXPORT_SYMBOL(set_binfmt); /* * set_dumpable stores three-value SUID_DUMP_* into mm->flags. 
*/ void set_dumpable(struct mm_struct *mm, int value) { if (WARN_ON((unsigned)value > SUID_DUMP_ROOT)) return; __mm_flags_set_mask_dumpable(mm, value); } static inline struct user_arg_ptr native_arg(const char __user *const __user *p) { return (struct user_arg_ptr){.ptr.native = p}; } SYSCALL_DEFINE3(execve, const char __user *, filename, const char __user *const __user *, argv, const char __user *const __user *, envp) { CLASS(filename, name)(filename); return do_execveat_common(AT_FDCWD, name, native_arg(argv), native_arg(envp), 0); } SYSCALL_DEFINE5(execveat, int, fd, const char __user *, filename, const char __user *const __user *, argv, const char __user *const __user *, envp, int, flags) { CLASS(filename_uflags, name)(filename, flags); return do_execveat_common(fd, name, native_arg(argv), native_arg(envp), flags); } #ifdef CONFIG_COMPAT static inline struct user_arg_ptr compat_arg(const compat_uptr_t __user *p) { return (struct user_arg_ptr){.is_compat = true, .ptr.compat = p}; } COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, const compat_uptr_t __user *, argv, const compat_uptr_t __user *, envp) { CLASS(filename, name)(filename); return do_execveat_common(AT_FDCWD, name, compat_arg(argv), compat_arg(envp), 0); } COMPAT_SYSCALL_DEFINE5(execveat, int, fd, const char __user *, filename, const compat_uptr_t __user *, argv, const compat_uptr_t __user *, envp, int, flags) { CLASS(filename_uflags, name)(filename, flags); return do_execveat_common(fd, name, compat_arg(argv), compat_arg(envp), flags); } #endif #ifdef CONFIG_SYSCTL static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!error && write) validate_coredump_safety(); return error; } static const struct ctl_table fs_exec_sysctls[] = { { .procname = "suid_dumpable", .data = &suid_dumpable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax_coredump, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, }; static int __init init_fs_exec_sysctls(void) { register_sysctl_init("fs", fs_exec_sysctls); return 0; } fs_initcall(init_fs_exec_sysctls); #endif /* CONFIG_SYSCTL */ #ifdef CONFIG_EXEC_KUNIT_TEST #include "tests/exec_kunit.c" #endif |
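Editorial note: the exec path above is driven from userspace by execve(2) and execveat(2), whose entry points are the SYSCALL_DEFINE3/SYSCALL_DEFINE5 stubs just shown. The following is a minimal, hedged userspace sketch (not part of the kernel sources) exercising execveat() with an O_PATH descriptor and AT_EMPTY_PATH, the case for which alloc_bprm() synthesizes a "/dev/fd/<n>" name. It assumes a glibc new enough to provide the execveat() wrapper (roughly 2.34+; otherwise use syscall(SYS_execveat, ...)), and "/bin/echo" is only an example path.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char *argv[] = { "echo", "hello from execveat", NULL };
	char *envp[] = { NULL };

	/*
	 * Open the binary first; with an empty pathname and AT_EMPTY_PATH
	 * the kernel executes the file behind this fd and builds the
	 * "/dev/fd/<n>" fdpath seen in alloc_bprm(). Note the comment there
	 * about O_CLOEXEC descriptors and script interpreters.
	 */
	int fd = open("/bin/echo", O_PATH | O_CLOEXEC);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	execveat(fd, "", argv, envp, AT_EMPTY_PATH);
	perror("execveat");	/* only reached if the exec failed */
	return 1;
}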
| 27 32 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Generic Timer-queue * * Manages a simple queue of timers, ordered by expiration time. * Uses rbtrees for quick list adds and expiration. * * NOTE: All of the following functions need to be serialized * to avoid races. No locking is done by this library code. */ #include <linux/bug.h> #include <linux/timerqueue.h> #include <linux/rbtree.h> #include <linux/export.h> #define __node_2_tq(_n) \ rb_entry((_n), struct timerqueue_node, node) static inline bool __timerqueue_less(struct rb_node *a, const struct rb_node *b) { return __node_2_tq(a)->expires < __node_2_tq(b)->expires; } /** * timerqueue_add - Adds timer to timerqueue. * * @head: head of timerqueue * @node: timer node to be added * * Adds the timer node to the timerqueue, sorted by the node's expires * value. Returns true if the newly added timer is the first expiring timer in * the queue. */ bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) { /* Make sure we don't add nodes that are already added */ WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node)); return rb_add_cached(&node->node, &head->rb_root, __timerqueue_less); } EXPORT_SYMBOL_GPL(timerqueue_add); /** * timerqueue_del - Removes a timer from the timerqueue. * * @head: head of timerqueue * @node: timer node to be removed * * Removes the timer node from the timerqueue. Returns true if the queue is * not empty after the remove. */ bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) { WARN_ON_ONCE(RB_EMPTY_NODE(&node->node)); rb_erase_cached(&node->node, &head->rb_root); RB_CLEAR_NODE(&node->node); return !RB_EMPTY_ROOT(&head->rb_root.rb_root); } EXPORT_SYMBOL_GPL(timerqueue_del); /** * timerqueue_iterate_next - Returns the timer after the provided timer * * @node: Pointer to a timer. * * Provides the timer that is after the given node. This is used, when * necessary, to iterate through the list of timers in a timer list * without modifying the list. */ struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node) { struct rb_node *next; if (!node) return NULL; next = rb_next(&node->node); if (!next) return NULL; return container_of(next, struct timerqueue_node, node); } EXPORT_SYMBOL_GPL(timerqueue_iterate_next); #define __node_2_tq_linked(_n) \ container_of(rb_entry((_n), struct rb_node_linked, node), struct timerqueue_linked_node, node) static __always_inline bool __tq_linked_less(struct rb_node *a, const struct rb_node *b) { return __node_2_tq_linked(a)->expires < __node_2_tq_linked(b)->expires; } bool timerqueue_linked_add(struct timerqueue_linked_head *head, struct timerqueue_linked_node *node) { return rb_add_linked(&node->node, &head->rb_root, __tq_linked_less); } EXPORT_SYMBOL_GPL(timerqueue_linked_add); |
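Editorial note: the timerqueue library above does no locking of its own, so callers must serialize all operations themselves. The sketch below (an editorial illustration, not from the file) shows the intended usage pattern under a caller-provided spinlock; it assumes the usual helpers timerqueue_init_head(), timerqueue_init() and timerqueue_getnext() from <linux/timerqueue.h>, and the expiry values are arbitrary.

#include <linux/timerqueue.h>
#include <linux/spinlock.h>

static struct timerqueue_head example_queue;
static DEFINE_SPINLOCK(example_lock);

static void example_usage(void)
{
	static struct timerqueue_node a, b;
	struct timerqueue_node *next;

	timerqueue_init_head(&example_queue);
	timerqueue_init(&a);
	timerqueue_init(&b);
	a.expires = 200;	/* arbitrary expiry values */
	b.expires = 100;

	spin_lock(&example_lock);
	timerqueue_add(&example_queue, &a);	/* true: first entry in the queue */
	timerqueue_add(&example_queue, &b);	/* true again: b now expires first */

	next = timerqueue_getnext(&example_queue);	/* earliest expiry -> &b */
	if (next)
		timerqueue_del(&example_queue, next);
	spin_unlock(&example_lock);
}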
| 2 2 2 22 2 23 2 21 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM net #if !defined(_TRACE_NET_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_NET_H #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/if_vlan.h> #include <linux/ip.h> #include <linux/tracepoint.h> TRACE_EVENT(net_dev_start_xmit, TP_PROTO(const struct sk_buff *skb, const struct net_device *dev), TP_ARGS(skb, dev), TP_STRUCT__entry( __string( name, dev->name ) __field( u16, queue_mapping ) __field( const void *, skbaddr ) __field( bool, vlan_tagged ) __field( u16, vlan_proto ) __field( u16, vlan_tci ) __field( u16, protocol ) __field( u8, ip_summed ) __field( unsigned int, len ) __field( unsigned int, data_len ) __field( int, network_offset ) __field( bool, transport_offset_valid) __field( int, transport_offset) __field( u8, tx_flags ) __field( u16, gso_size ) __field( u16, gso_segs ) __field( u16, gso_type ) __field( u64, net_cookie ) ), TP_fast_assign( __assign_str(name); __entry->queue_mapping = skb->queue_mapping; __entry->skbaddr = skb; __entry->vlan_tagged = skb_vlan_tag_present(skb); __entry->vlan_proto = ntohs(skb->vlan_proto); __entry->vlan_tci = skb_vlan_tag_get(skb); __entry->protocol = ntohs(skb->protocol); __entry->ip_summed = skb->ip_summed; __entry->len = skb->len; __entry->data_len = skb->data_len; __entry->network_offset = skb_network_offset(skb); __entry->transport_offset_valid = skb_transport_header_was_set(skb); __entry->transport_offset = skb_transport_header_was_set(skb) ? 
skb_transport_offset(skb) : 0; __entry->tx_flags = skb_shinfo(skb)->tx_flags; __entry->gso_size = skb_shinfo(skb)->gso_size; __entry->gso_segs = skb_shinfo(skb)->gso_segs; __entry->gso_type = skb_shinfo(skb)->gso_type; __entry->net_cookie = dev_net(dev)->net_cookie; ), TP_printk("dev=%s queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x net_cookie=%llu", __get_str(name), __entry->queue_mapping, __entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci, __entry->protocol, __entry->ip_summed, __entry->len, __entry->data_len, __entry->network_offset, __entry->transport_offset_valid, __entry->transport_offset, __entry->tx_flags, __entry->gso_size, __entry->gso_segs, __entry->gso_type, __entry->net_cookie) ); TRACE_EVENT(net_dev_xmit, TP_PROTO(struct sk_buff *skb, int rc, struct net_device *dev, unsigned int skb_len), TP_ARGS(skb, rc, dev, skb_len), TP_STRUCT__entry( __field( void *, skbaddr ) __field( unsigned int, len ) __field( int, rc ) __string( name, dev->name ) __field( u64, net_cookie ) ), TP_fast_assign( __entry->skbaddr = skb; __entry->len = skb_len; __entry->rc = rc; __entry->net_cookie = dev_net(dev)->net_cookie; __assign_str(name); ), TP_printk("dev=%s skbaddr=%p len=%u rc=%d net_cookie=%llu", __get_str(name), __entry->skbaddr, __entry->len, __entry->rc, __entry->net_cookie) ); TRACE_EVENT(net_dev_xmit_timeout, TP_PROTO(struct net_device *dev, int queue_index), TP_ARGS(dev, queue_index), TP_STRUCT__entry( __string( name, dev->name ) __string( driver, netdev_drivername(dev)) __field( int, queue_index ) __field( u64, net_cookie ) ), TP_fast_assign( __assign_str(name); __assign_str(driver); __entry->queue_index = queue_index; __entry->net_cookie = dev_net(dev)->net_cookie; ), TP_printk("dev=%s driver=%s queue=%d net_cookie=%llu", __get_str(name), __get_str(driver), __entry->queue_index, __entry->net_cookie) ); DECLARE_EVENT_CLASS(net_dev_template, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb), TP_STRUCT__entry( __field( void *, skbaddr ) __field( unsigned int, len ) __string( name, skb->dev->name ) __field( u64, net_cookie ) ), TP_fast_assign( __entry->skbaddr = skb; __entry->len = skb->len; __entry->net_cookie = dev_net(skb->dev)->net_cookie; __assign_str(name); ), TP_printk("dev=%s skbaddr=%p len=%u net_cookie=%llu", __get_str(name), __entry->skbaddr, __entry->len, __entry->net_cookie) ) DEFINE_EVENT(net_dev_template, net_dev_queue, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_template, netif_receive_skb, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_template, netif_rx, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb) ); DECLARE_EVENT_CLASS(net_dev_rx_verbose_template, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb), TP_STRUCT__entry( __string( name, skb->dev->name ) __field( unsigned int, napi_id ) __field( u16, queue_mapping ) __field( const void *, skbaddr ) __field( bool, vlan_tagged ) __field( u16, vlan_proto ) __field( u16, vlan_tci ) __field( u16, protocol ) __field( u8, ip_summed ) __field( u32, hash ) __field( bool, l4_hash ) __field( unsigned int, len ) __field( unsigned int, data_len ) __field( unsigned int, truesize ) __field( bool, mac_header_valid) __field( int, mac_header ) __field( unsigned char, nr_frags ) __field( u16, gso_size ) __field( u16, gso_type ) __field( u64, net_cookie ) ), TP_fast_assign( __assign_str(name); 
#ifdef CONFIG_NET_RX_BUSY_POLL __entry->napi_id = skb->napi_id; #else __entry->napi_id = 0; #endif __entry->queue_mapping = skb->queue_mapping; __entry->skbaddr = skb; __entry->vlan_tagged = skb_vlan_tag_present(skb); __entry->vlan_proto = ntohs(skb->vlan_proto); __entry->vlan_tci = skb_vlan_tag_get(skb); __entry->protocol = ntohs(skb->protocol); __entry->ip_summed = skb->ip_summed; __entry->hash = skb->hash; __entry->l4_hash = skb->l4_hash; __entry->len = skb->len; __entry->data_len = skb->data_len; __entry->truesize = skb->truesize; __entry->mac_header_valid = skb_mac_header_was_set(skb); __entry->mac_header = skb_mac_header(skb) - skb->data; __entry->nr_frags = skb_shinfo(skb)->nr_frags; __entry->gso_size = skb_shinfo(skb)->gso_size; __entry->gso_type = skb_shinfo(skb)->gso_type; __entry->net_cookie = dev_net(skb->dev)->net_cookie; ), TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x net_cookie=%llu", __get_str(name), __entry->napi_id, __entry->queue_mapping, __entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci, __entry->protocol, __entry->ip_summed, __entry->hash, __entry->l4_hash, __entry->len, __entry->data_len, __entry->truesize, __entry->mac_header_valid, __entry->mac_header, __entry->nr_frags, __entry->gso_size, __entry->gso_type, __entry->net_cookie) ); DEFINE_EVENT(net_dev_rx_verbose_template, napi_gro_frags_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, napi_gro_receive_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_list_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DECLARE_EVENT_CLASS(net_dev_rx_exit_template, TP_PROTO(int ret), TP_ARGS(ret), TP_STRUCT__entry( __field(int, ret) ), TP_fast_assign( __entry->ret = ret; ), TP_printk("ret=%d", __entry->ret) ); DEFINE_EVENT(net_dev_rx_exit_template, napi_gro_frags_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, napi_gro_receive_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, netif_rx_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_list_exit, TP_PROTO(int ret), TP_ARGS(ret) ); #endif /* _TRACE_NET_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
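Editorial note: in practice these events are usually consumed through tracefs (for example, writing 1 to /sys/kernel/tracing/events/net/net_dev_xmit/enable). For in-kernel consumers, TRACE_EVENT(net_dev_xmit) also generates register_trace_net_dev_xmit()/unregister_trace_net_dev_xmit(). The module below is a hedged sketch of attaching a probe that way; it assumes the tracepoint is reachable from the calling code (out-of-tree modules additionally require the tracepoint symbol to be exported), and the probe body is purely illustrative.

#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <trace/events/net.h>

static void probe_net_dev_xmit(void *ignore, struct sk_buff *skb, int rc,
			       struct net_device *dev, unsigned int skb_len)
{
	/* Log only failed transmissions. */
	if (rc)
		pr_info("xmit failed on %s: len=%u rc=%d\n",
			dev->name, skb_len, rc);
}

static int __init xmit_probe_init(void)
{
	return register_trace_net_dev_xmit(probe_net_dev_xmit, NULL);
}

static void __exit xmit_probe_exit(void)
{
	unregister_trace_net_dev_xmit(probe_net_dev_xmit, NULL);
	tracepoint_synchronize_unregister();
}

module_init(xmit_probe_init);
module_exit(xmit_probe_exit);
MODULE_DESCRIPTION("Example net_dev_xmit tracepoint probe");
MODULE_LICENSE("GPL");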
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 | // SPDX-License-Identifier: GPL-2.0-or-later /* 32-bit compatibility syscall for 64-bit systems * * Copyright (C) 2004-5 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/syscalls.h> #include <linux/keyctl.h> #include <linux/compat.h> #include <linux/slab.h> #include "internal.h" /* * The key control system call, 32-bit compatibility version for 64-bit archs */ COMPAT_SYSCALL_DEFINE5(keyctl, u32, option, u32, arg2, u32, arg3, u32, arg4, u32, arg5) { switch (option) { case KEYCTL_GET_KEYRING_ID: return keyctl_get_keyring_ID(arg2, arg3); case KEYCTL_JOIN_SESSION_KEYRING: return keyctl_join_session_keyring(compat_ptr(arg2)); case KEYCTL_UPDATE: return keyctl_update_key(arg2, compat_ptr(arg3), arg4); case KEYCTL_REVOKE: return keyctl_revoke_key(arg2); case KEYCTL_DESCRIBE: return keyctl_describe_key(arg2, compat_ptr(arg3), arg4); case KEYCTL_CLEAR: return keyctl_keyring_clear(arg2); case KEYCTL_LINK: return keyctl_keyring_link(arg2, arg3); case KEYCTL_UNLINK: return keyctl_keyring_unlink(arg2, arg3); case KEYCTL_SEARCH: return keyctl_keyring_search(arg2, compat_ptr(arg3), compat_ptr(arg4), arg5); case KEYCTL_READ: return keyctl_read_key(arg2, compat_ptr(arg3), arg4); case KEYCTL_CHOWN: return keyctl_chown_key(arg2, arg3, arg4); case KEYCTL_SETPERM: return keyctl_setperm_key(arg2, arg3); case KEYCTL_INSTANTIATE: return keyctl_instantiate_key(arg2, compat_ptr(arg3), arg4, arg5); case KEYCTL_NEGATE: return keyctl_negate_key(arg2, arg3, arg4); case KEYCTL_SET_REQKEY_KEYRING: return keyctl_set_reqkey_keyring(arg2); case KEYCTL_SET_TIMEOUT: return keyctl_set_timeout(arg2, arg3); case KEYCTL_ASSUME_AUTHORITY: return keyctl_assume_authority(arg2); case KEYCTL_GET_SECURITY: return keyctl_get_security(arg2, compat_ptr(arg3), arg4); case KEYCTL_SESSION_TO_PARENT: return keyctl_session_to_parent(); case KEYCTL_REJECT: return keyctl_reject_key(arg2, arg3, arg4, arg5); case KEYCTL_INSTANTIATE_IOV: return keyctl_instantiate_key_iov(arg2, compat_ptr(arg3), arg4, arg5); case KEYCTL_INVALIDATE: return keyctl_invalidate_key(arg2); case KEYCTL_GET_PERSISTENT: return keyctl_get_persistent(arg2, arg3); case KEYCTL_DH_COMPUTE: return compat_keyctl_dh_compute(compat_ptr(arg2), compat_ptr(arg3), arg4, compat_ptr(arg5)); case KEYCTL_RESTRICT_KEYRING: return keyctl_restrict_keyring(arg2, compat_ptr(arg3), compat_ptr(arg4)); case KEYCTL_PKEY_QUERY: if (arg3 != 0) return -EINVAL; return keyctl_pkey_query(arg2, compat_ptr(arg4), compat_ptr(arg5)); case KEYCTL_PKEY_ENCRYPT: case KEYCTL_PKEY_DECRYPT: case KEYCTL_PKEY_SIGN: return keyctl_pkey_e_d_s(option, compat_ptr(arg2), compat_ptr(arg3), compat_ptr(arg4), compat_ptr(arg5)); case KEYCTL_PKEY_VERIFY: return keyctl_pkey_verify(compat_ptr(arg2), compat_ptr(arg3), compat_ptr(arg4), compat_ptr(arg5)); case KEYCTL_MOVE: return keyctl_keyring_move(arg2, arg3, arg4, arg5); case KEYCTL_CAPABILITIES: return keyctl_capabilities(compat_ptr(arg2), arg3); case KEYCTL_WATCH_KEY: return keyctl_watch_key(arg2, arg3, arg4); default: return -EOPNOTSUPP; } } |
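Editorial note: each case above simply forwards to the native keyctl implementation after converting 32-bit pointers with compat_ptr(). A minimal, hedged userspace sketch of the corresponding native call is shown below, invoking the raw syscall with no libkeyutils dependency; KEYCTL_GET_KEYRING_ID and KEY_SPEC_SESSION_KEYRING come from <linux/keyctl.h>.

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/keyctl.h>

int main(void)
{
	/*
	 * KEYCTL_GET_KEYRING_ID: resolve the caller's session keyring,
	 * creating it if necessary (third argument non-zero).
	 */
	long id = syscall(__NR_keyctl, KEYCTL_GET_KEYRING_ID,
			  KEY_SPEC_SESSION_KEYRING, 1);
	if (id < 0) {
		perror("keyctl");
		return 1;
	}
	printf("session keyring id: %ld\n", id);
	return 0;
}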
| 23 1 23 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 | /* SPDX-License-Identifier: GPL-2.0 */ /* rwsem.h: R/W semaphores, public interface * * Written by David Howells (dhowells@redhat.com). * Derived from asm-i386/semaphore.h */ #ifndef _LINUX_RWSEM_H #define _LINUX_RWSEM_H #include <linux/linkage.h> #include <linux/types.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/err.h> #include <linux/cleanup.h> #ifdef CONFIG_DEBUG_LOCK_ALLOC # define __RWSEM_DEP_MAP_INIT(lockname) \ .dep_map = { \ .name = #lockname, \ .wait_type_inner = LD_WAIT_SLEEP, \ }, #else # define __RWSEM_DEP_MAP_INIT(lockname) #endif #ifndef CONFIG_PREEMPT_RT #ifdef CONFIG_RWSEM_SPIN_ON_OWNER #include <linux/osq_lock.h> #endif /* * For an uncontended rwsem, count and owner are the only fields a task * needs to touch when acquiring the rwsem. So they are put next to each * other to increase the chance that they will share the same cacheline. * * In a contended rwsem, the owner is likely the most frequently accessed * field in the structure as the optimistic waiter that holds the osq lock * will spin on owner. For an embedded rwsem, other hot fields in the * containing structure should be moved further away from the rwsem to * reduce the chance that they will share the same cacheline causing * cacheline bouncing problem. */ context_lock_struct(rw_semaphore) { atomic_long_t count; /* * Write owner or one of the read owners as well flags regarding * the current state of the rwsem. Can be used as a speculative * check to see if the write owner is running on the cpu. 
*/ atomic_long_t owner; #ifdef CONFIG_RWSEM_SPIN_ON_OWNER struct optimistic_spin_queue osq; /* spinner MCS lock */ #endif raw_spinlock_t wait_lock; struct rwsem_waiter *first_waiter __guarded_by(&wait_lock); #ifdef CONFIG_DEBUG_RWSEMS void *magic; #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif }; #define RWSEM_UNLOCKED_VALUE 0UL #define RWSEM_WRITER_LOCKED (1UL << 0) #define __RWSEM_COUNT_INIT(name) .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE) static inline int rwsem_is_locked(struct rw_semaphore *sem) { return atomic_long_read(&sem->count) != RWSEM_UNLOCKED_VALUE; } static inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem) __assumes_ctx_lock(sem) { WARN_ON(atomic_long_read(&sem->count) == RWSEM_UNLOCKED_VALUE); } static inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem) __assumes_ctx_lock(sem) { WARN_ON(!(atomic_long_read(&sem->count) & RWSEM_WRITER_LOCKED)); } /* Common initializer macros and functions */ #ifdef CONFIG_DEBUG_RWSEMS # define __RWSEM_DEBUG_INIT(lockname) .magic = &lockname, #else # define __RWSEM_DEBUG_INIT(lockname) #endif #ifdef CONFIG_RWSEM_SPIN_ON_OWNER #define __RWSEM_OPT_INIT(lockname) .osq = OSQ_LOCK_UNLOCKED, #else #define __RWSEM_OPT_INIT(lockname) #endif #define __RWSEM_INITIALIZER(name) \ { __RWSEM_COUNT_INIT(name), \ .owner = ATOMIC_LONG_INIT(0), \ __RWSEM_OPT_INIT(name) \ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),\ .first_waiter = NULL, \ __RWSEM_DEBUG_INIT(name) \ __RWSEM_DEP_MAP_INIT(name) } #define DECLARE_RWSEM(name) \ struct rw_semaphore name = __RWSEM_INITIALIZER(name) extern void __init_rwsem(struct rw_semaphore *sem, const char *name, struct lock_class_key *key); #define init_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ __init_rwsem((sem), #sem, &__key); \ } while (0) /* * This is the same regardless of which rwsem implementation that is being used. * It is just a heuristic meant to be called by somebody already holding the * rwsem to see if somebody from an incompatible type is wanting access to the * lock. */ static inline bool rwsem_is_contended(struct rw_semaphore *sem) { return data_race(sem->first_waiter != NULL); } #if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER) /* * Return just the real task structure pointer of the owner */ extern struct task_struct *rwsem_owner(struct rw_semaphore *sem); /* * Return true if the rwsem is owned by a reader. 
*/ extern bool is_rwsem_reader_owned(struct rw_semaphore *sem); #endif #else /* !CONFIG_PREEMPT_RT */ #include <linux/rwbase_rt.h> context_lock_struct(rw_semaphore) { struct rwbase_rt rwbase; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif }; #define __RWSEM_INITIALIZER(name) \ { \ .rwbase = __RWBASE_INITIALIZER(name), \ __RWSEM_DEP_MAP_INIT(name) \ } #define DECLARE_RWSEM(lockname) \ struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname) extern void __init_rwsem(struct rw_semaphore *rwsem, const char *name, struct lock_class_key *key); #define init_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ __init_rwsem((sem), #sem, &__key); \ } while (0) static __always_inline int rwsem_is_locked(const struct rw_semaphore *sem) { return rw_base_is_locked(&sem->rwbase); } static __always_inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem) __assumes_ctx_lock(sem) { WARN_ON(!rwsem_is_locked(sem)); } static __always_inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem) __assumes_ctx_lock(sem) { WARN_ON(!rw_base_is_write_locked(&sem->rwbase)); } static __always_inline int rwsem_is_contended(struct rw_semaphore *sem) { return rw_base_is_contended(&sem->rwbase); } #endif /* CONFIG_PREEMPT_RT */ /* * The functions below are the same for all rwsem implementations including * the RT specific variant. */ static inline void rwsem_assert_held(const struct rw_semaphore *sem) __assumes_ctx_lock(sem) { if (IS_ENABLED(CONFIG_LOCKDEP)) lockdep_assert_held(sem); else rwsem_assert_held_nolockdep(sem); } static inline void rwsem_assert_held_write(const struct rw_semaphore *sem) __assumes_ctx_lock(sem) { if (IS_ENABLED(CONFIG_LOCKDEP)) lockdep_assert_held_write(sem); else rwsem_assert_held_write_nolockdep(sem); } /* * lock for reading */ extern void down_read(struct rw_semaphore *sem) __acquires_shared(sem); extern int __must_check down_read_interruptible(struct rw_semaphore *sem) __cond_acquires_shared(0, sem); extern int __must_check down_read_killable(struct rw_semaphore *sem) __cond_acquires_shared(0, sem); /* * trylock for reading -- returns 1 if successful, 0 if contention */ extern int down_read_trylock(struct rw_semaphore *sem) __cond_acquires_shared(true, sem); /* * lock for writing */ extern void down_write(struct rw_semaphore *sem) __acquires(sem); extern int __must_check down_write_killable(struct rw_semaphore *sem) __cond_acquires(0, sem); /* * trylock for writing -- returns 1 if successful, 0 if contention */ extern int down_write_trylock(struct rw_semaphore *sem) __cond_acquires(true, sem); /* * release a read lock */ extern void up_read(struct rw_semaphore *sem) __releases_shared(sem); /* * release a write lock */ extern void up_write(struct rw_semaphore *sem) __releases(sem); DEFINE_LOCK_GUARD_1(rwsem_read, struct rw_semaphore, down_read(_T->lock), up_read(_T->lock)) DEFINE_LOCK_GUARD_1_COND(rwsem_read, _try, down_read_trylock(_T->lock)) DEFINE_LOCK_GUARD_1_COND(rwsem_read, _intr, down_read_interruptible(_T->lock), _RET == 0) DECLARE_LOCK_GUARD_1_ATTRS(rwsem_read, __acquires_shared(_T), __releases_shared(*(struct rw_semaphore **)_T)) #define class_rwsem_read_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(rwsem_read, _T) DECLARE_LOCK_GUARD_1_ATTRS(rwsem_read_try, __acquires_shared(_T), __releases_shared(*(struct rw_semaphore **)_T)) #define class_rwsem_read_try_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(rwsem_read_try, _T) DECLARE_LOCK_GUARD_1_ATTRS(rwsem_read_intr, __acquires_shared(_T), __releases_shared(*(struct rw_semaphore **)_T)) 
#define class_rwsem_read_intr_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(rwsem_read_intr, _T) DEFINE_LOCK_GUARD_1(rwsem_write, struct rw_semaphore, down_write(_T->lock), up_write(_T->lock)) DEFINE_LOCK_GUARD_1_COND(rwsem_write, _try, down_write_trylock(_T->lock)) DEFINE_LOCK_GUARD_1_COND(rwsem_write, _kill, down_write_killable(_T->lock), _RET == 0) DECLARE_LOCK_GUARD_1_ATTRS(rwsem_write, __acquires(_T), __releases(*(struct rw_semaphore **)_T)) #define class_rwsem_write_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(rwsem_write, _T) DECLARE_LOCK_GUARD_1_ATTRS(rwsem_write_try, __acquires(_T), __releases(*(struct rw_semaphore **)_T)) #define class_rwsem_write_try_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(rwsem_write_try, _T) DECLARE_LOCK_GUARD_1_ATTRS(rwsem_write_kill, __acquires(_T), __releases(*(struct rw_semaphore **)_T)) #define class_rwsem_write_kill_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(rwsem_write_kill, _T) DEFINE_LOCK_GUARD_1(rwsem_init, struct rw_semaphore, init_rwsem(_T->lock), /* */) DECLARE_LOCK_GUARD_1_ATTRS(rwsem_init, __acquires(_T), __releases(*(struct rw_semaphore **)_T)) #define class_rwsem_init_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(rwsem_init, _T) /* * downgrade write lock to read lock */ extern void downgrade_write(struct rw_semaphore *sem) __releases(sem) __acquires_shared(sem); #ifdef CONFIG_DEBUG_LOCK_ALLOC /* * nested locking. NOTE: rwsems are not allowed to recurse * (which occurs if the same task tries to acquire the same * lock instance multiple times), but multiple locks of the * same lock class might be taken, if the order of the locks * is always the same. This ordering rule can be expressed * to lockdep via the _nested() APIs, but enumerating the * subclasses that are used. (If the nesting relationship is * static then another method for expressing nested locking is * the explicit definition of lock class keys and the use of * lockdep_set_class() at lock initialization time. * See Documentation/locking/lockdep-design.rst for more details.) */ extern void down_read_nested(struct rw_semaphore *sem, int subclass) __acquires_shared(sem); extern int __must_check down_read_killable_nested(struct rw_semaphore *sem, int subclass) __cond_acquires_shared(0, sem); extern void down_write_nested(struct rw_semaphore *sem, int subclass) __acquires(sem); extern int down_write_killable_nested(struct rw_semaphore *sem, int subclass) __cond_acquires(0, sem); extern void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest_lock) __acquires(sem); # define down_write_nest_lock(sem, nest_lock) \ do { \ typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ _down_write_nest_lock(sem, &(nest_lock)->dep_map); \ } while (0) /* * Take/release a lock when not the owner will release it. * * [ This API should be avoided as much as possible - the * proper abstraction for this case is completions. ] */ extern void down_read_non_owner(struct rw_semaphore *sem) __acquires_shared(sem); extern void up_read_non_owner(struct rw_semaphore *sem) __releases_shared(sem); #else # define down_read_nested(sem, subclass) down_read(sem) # define down_read_killable_nested(sem, subclass) down_read_killable(sem) # define down_write_nest_lock(sem, nest_lock) down_write(sem) # define down_write_nested(sem, subclass) down_write(sem) # define down_write_killable_nested(sem, subclass) down_write_killable(sem) # define down_read_non_owner(sem) down_read(sem) # define up_read_non_owner(sem) up_read(sem) #endif #endif /* _LINUX_RWSEM_H */ |
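Editorial note: a typical pairing of the reader and writer APIs declared above is sketched here; the protected list and the names are illustrative only. Readers may run concurrently with each other, while a writer excludes everyone; the guard classes defined in this header offer a scoped alternative, noted in the comment.

#include <linux/rwsem.h>
#include <linux/list.h>

static DECLARE_RWSEM(cfg_rwsem);
static LIST_HEAD(cfg_list);

struct cfg_entry {
	struct list_head node;
	int value;
};

/* Readers can run concurrently with each other. */
static int cfg_sum(void)
{
	struct cfg_entry *e;
	int sum = 0;

	down_read(&cfg_rwsem);
	list_for_each_entry(e, &cfg_list, node)
		sum += e->value;
	up_read(&cfg_rwsem);
	return sum;
}

/*
 * A writer excludes readers and other writers. The scoped equivalent using
 * the guards defined above would be: guard(rwsem_write)(&cfg_rwsem);
 */
static void cfg_add(struct cfg_entry *e)
{
	down_write(&cfg_rwsem);
	list_add_tail(&e->node, &cfg_list);
	up_write(&cfg_rwsem);
}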
913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> */ #ifndef _NET_IPV6_H #define _NET_IPV6_H #include <linux/ipv6.h> #include <linux/hardirq.h> #include <linux/jhash.h> #include <linux/refcount.h> #include <linux/jump_label_ratelimit.h> #include <net/if_inet6.h> #include <net/flow.h> #include <net/flow_dissector.h> #include <net/inet_dscp.h> #include <net/snmp.h> #include <net/netns/hash.h> struct ip_tunnel_info; #define SIN6_LEN_RFC2133 24 /* * NextHeader field of IPv6 header */ #define NEXTHDR_HOP 0 /* Hop-by-hop option header. */ #define NEXTHDR_IPV4 4 /* IPv4 in IPv6 */ #define NEXTHDR_TCP 6 /* TCP segment. */ #define NEXTHDR_UDP 17 /* UDP message. */ #define NEXTHDR_IPV6 41 /* IPv6 in IPv6 */ #define NEXTHDR_ROUTING 43 /* Routing header. */ #define NEXTHDR_FRAGMENT 44 /* Fragmentation/reassembly header. */ #define NEXTHDR_GRE 47 /* GRE header. */ #define NEXTHDR_ESP 50 /* Encapsulating security payload. */ #define NEXTHDR_AUTH 51 /* Authentication header. */ #define NEXTHDR_ICMP 58 /* ICMP for IPv6. */ #define NEXTHDR_NONE 59 /* No next header */ #define NEXTHDR_DEST 60 /* Destination options header. */ #define NEXTHDR_SCTP 132 /* SCTP message. */ #define NEXTHDR_MOBILITY 135 /* Mobility header. */ #define NEXTHDR_MAX 255 #define IPV6_DEFAULT_HOPLIMIT 64 #define IPV6_DEFAULT_MCASTHOPS 1 /* Limits on Hop-by-Hop and Destination options. * * Per RFC8200 there is no limit on the maximum number or lengths of options in * Hop-by-Hop or Destination options other then the packet must fit in an MTU. 
* We allow configurable limits in order to mitigate potential denial of * service attacks. * * There are three limits that may be set: * - Limit the number of options in a Hop-by-Hop or Destination options * extension header * - Limit the byte length of a Hop-by-Hop or Destination options extension * header * - Disallow unknown options * * The limits are expressed in corresponding sysctls: * * ipv6.sysctl.max_dst_opts_cnt * ipv6.sysctl.max_hbh_opts_cnt * ipv6.sysctl.max_dst_opts_len * ipv6.sysctl.max_hbh_opts_len * * max_*_opts_cnt is the number of TLVs that are allowed for Destination * options or Hop-by-Hop options. If the number is less than zero then unknown * TLVs are disallowed and the number of known options that are allowed is the * absolute value. Setting the value to INT_MAX indicates no limit. * * max_*_opts_len is the length limit in bytes of a Destination or * Hop-by-Hop options extension header. Setting the value to INT_MAX * indicates no length limit. * * If a limit is exceeded when processing an extension header the packet is * silently discarded. */ /* Default limits for Hop-by-Hop and Destination options */ #define IP6_DEFAULT_MAX_DST_OPTS_CNT 8 #define IP6_DEFAULT_MAX_HBH_OPTS_CNT 8 #define IP6_DEFAULT_MAX_DST_OPTS_LEN INT_MAX /* No limit */ #define IP6_DEFAULT_MAX_HBH_OPTS_LEN INT_MAX /* No limit */ /* * Addr type * * type - unicast | multicast * scope - local | site | global * v4 - compat * v4mapped * any * loopback */ #define IPV6_ADDR_ANY 0x0000U #define IPV6_ADDR_UNICAST 0x0001U #define IPV6_ADDR_MULTICAST 0x0002U #define IPV6_ADDR_LOOPBACK 0x0010U #define IPV6_ADDR_LINKLOCAL 0x0020U #define IPV6_ADDR_SITELOCAL 0x0040U #define IPV6_ADDR_COMPATv4 0x0080U #define IPV6_ADDR_SCOPE_MASK 0x00f0U #define IPV6_ADDR_MAPPED 0x1000U /* * Addr scopes */ #define IPV6_ADDR_MC_SCOPE(a) \ ((a)->s6_addr[1] & 0x0f) /* nonstandard */ #define __IPV6_ADDR_SCOPE_INVALID -1 #define IPV6_ADDR_SCOPE_NODELOCAL 0x01 #define IPV6_ADDR_SCOPE_LINKLOCAL 0x02 #define IPV6_ADDR_SCOPE_SITELOCAL 0x05 #define IPV6_ADDR_SCOPE_ORGLOCAL 0x08 #define IPV6_ADDR_SCOPE_GLOBAL 0x0e /* * Addr flags */ #define IPV6_ADDR_MC_FLAG_TRANSIENT(a) \ ((a)->s6_addr[1] & 0x10) #define IPV6_ADDR_MC_FLAG_PREFIX(a) \ ((a)->s6_addr[1] & 0x20) #define IPV6_ADDR_MC_FLAG_RENDEZVOUS(a) \ ((a)->s6_addr[1] & 0x40) /* * fragmentation header */ struct frag_hdr { __u8 nexthdr; __u8 reserved; __be16 frag_off; __be32 identification; }; #define IP6_MF 0x0001 #define IP6_OFFSET 0xFFF8 struct ip6_fraglist_iter { struct ipv6hdr *tmp_hdr; struct sk_buff *frag; int offset; unsigned int hlen; __be32 frag_id; u8 nexthdr; }; int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr, u8 nexthdr, __be32 frag_id, struct ip6_fraglist_iter *iter); void ip6_fraglist_prepare(struct sk_buff *skb, struct ip6_fraglist_iter *iter); static inline struct sk_buff *ip6_fraglist_next(struct ip6_fraglist_iter *iter) { struct sk_buff *skb = iter->frag; iter->frag = skb->next; skb_mark_not_on_list(skb); return skb; } struct ip6_frag_state { u8 *prevhdr; unsigned int hlen; unsigned int mtu; unsigned int left; int offset; int ptr; int hroom; int troom; __be32 frag_id; u8 nexthdr; }; void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu, unsigned short needed_tailroom, int hdr_room, u8 *prevhdr, u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state); struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state); #define IP6_REPLY_MARK(net, mark) \ ((net)->ipv6.sysctl.fwmark_reflect ? 
(mark) : 0) #include <net/sock.h> /* sysctls */ extern int sysctl_mld_max_msf; extern int sysctl_mld_qrv; #define _DEVINC(net, statname, mod, idev, field) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ mod##SNMP_INC_STATS64((_idev)->stats.statname, (field));\ mod##SNMP_INC_STATS64((net)->mib.statname##_statistics, (field));\ }) /* per device counters are atomic_long_t */ #define _DEVINCATOMIC(net, statname, mod, idev, field) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ SNMP_INC_STATS_ATOMIC_LONG((_idev)->stats.statname##dev, (field)); \ mod##SNMP_INC_STATS((net)->mib.statname##_statistics, (field));\ }) /* per device and per net counters are atomic_long_t */ #define _DEVINC_ATOMIC_ATOMIC(net, statname, idev, field) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ SNMP_INC_STATS_ATOMIC_LONG((_idev)->stats.statname##dev, (field)); \ SNMP_INC_STATS_ATOMIC_LONG((net)->mib.statname##_statistics, (field));\ }) #define _DEVADD(net, statname, mod, idev, field, val) \ ({ \ struct inet6_dev *_idev = (idev); \ unsigned long _field = (field); \ unsigned long _val = (val); \ if (likely(_idev != NULL)) \ mod##SNMP_ADD_STATS((_idev)->stats.statname, _field, _val); \ mod##SNMP_ADD_STATS((net)->mib.statname##_statistics, _field, _val);\ }) #define _DEVUPD(net, statname, mod, idev, field, val) \ ({ \ struct inet6_dev *_idev = (idev); \ unsigned long _val = (val); \ if (likely(_idev != NULL)) \ mod##SNMP_UPD_PO_STATS((_idev)->stats.statname, field, _val); \ mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, _val);\ }) /* MIBs */ #define IP6_INC_STATS(net, idev,field) \ _DEVINC(net, ipv6, , idev, field) #define __IP6_INC_STATS(net, idev,field) \ _DEVINC(net, ipv6, __, idev, field) #define IP6_ADD_STATS(net, idev,field,val) \ _DEVADD(net, ipv6, , idev, field, val) #define __IP6_ADD_STATS(net, idev,field,val) \ _DEVADD(net, ipv6, __, idev, field, val) #define IP6_UPD_PO_STATS(net, idev,field,val) \ _DEVUPD(net, ipv6, , idev, field, val) #define __IP6_UPD_PO_STATS(net, idev,field,val) \ _DEVUPD(net, ipv6, __, idev, field, val) #define ICMP6_INC_STATS(net, idev, field) \ _DEVINCATOMIC(net, icmpv6, , idev, field) #define __ICMP6_INC_STATS(net, idev, field) \ _DEVINCATOMIC(net, icmpv6, __, idev, field) #define ICMP6MSGOUT_INC_STATS(net, idev, field) \ _DEVINC_ATOMIC_ATOMIC(net, icmpv6msg, idev, field +256) #define ICMP6MSGIN_INC_STATS(net, idev, field) \ _DEVINC_ATOMIC_ATOMIC(net, icmpv6msg, idev, field) struct ip6_ra_chain { struct ip6_ra_chain *next; struct sock *sk; int sel; void (*destructor)(struct sock *); }; extern struct ip6_ra_chain *ip6_ra_chain; extern rwlock_t ip6_ra_lock; /* This structure is prepared by protocol, when parsing ancillary data and passed to IPv6. */ struct ipv6_txoptions { refcount_t refcnt; /* Length of this structure */ int tot_len; /* length of extension headers */ __u16 opt_flen; /* after fragment hdr */ __u16 opt_nflen; /* before fragment hdr */ struct ipv6_opt_hdr *hopopt; struct ipv6_opt_hdr *dst0opt; struct ipv6_rt_hdr *srcrt; /* Routing Header */ struct ipv6_opt_hdr *dst1opt; struct rcu_head rcu; /* Option buffer, as read by IPV6_PKTOPTIONS, starts here. 
*/ }; /* flowlabel_reflect sysctl values */ enum flowlabel_reflect { FLOWLABEL_REFLECT_ESTABLISHED = 1, FLOWLABEL_REFLECT_TCP_RESET = 2, FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES = 4, }; struct ip6_flowlabel { struct ip6_flowlabel __rcu *next; __be32 label; atomic_t users; struct in6_addr dst; struct ipv6_txoptions *opt; unsigned long linger; struct rcu_head rcu; u8 share; union { struct pid *pid; kuid_t uid; } owner; unsigned long lastuse; unsigned long expires; struct net *fl_net; }; #define IPV6_FLOWINFO_MASK cpu_to_be32(0x0FFFFFFF) #define IPV6_FLOWLABEL_MASK cpu_to_be32(0x000FFFFF) #define IPV6_FLOWLABEL_STATELESS_FLAG cpu_to_be32(0x00080000) #define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK) #define IPV6_TCLASS_SHIFT 20 struct ipv6_fl_socklist { struct ipv6_fl_socklist __rcu *next; struct ip6_flowlabel *fl; struct rcu_head rcu; }; struct ipcm6_cookie { struct sockcm_cookie sockc; __s16 hlimit; __s16 tclass; __u16 gso_size; __s8 dontfrag; struct ipv6_txoptions *opt; }; static inline void ipcm6_init_sk(struct ipcm6_cookie *ipc6, const struct sock *sk) { *ipc6 = (struct ipcm6_cookie) { .hlimit = -1, .tclass = inet6_sk(sk)->tclass, .dontfrag = inet6_test_bit(DONTFRAG, sk), }; sockcm_init(&ipc6->sockc, sk); } static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np) { struct ipv6_txoptions *opt; rcu_read_lock(); opt = rcu_dereference(np->opt); if (opt) { if (!refcount_inc_not_zero(&opt->refcnt)) opt = NULL; else opt = rcu_pointer_handoff(opt); } rcu_read_unlock(); return opt; } static inline void txopt_put(struct ipv6_txoptions *opt) { if (opt && refcount_dec_and_test(&opt->refcnt)) kfree_rcu(opt, rcu); } #if IS_ENABLED(CONFIG_IPV6) struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label); extern struct static_key_false_deferred ipv6_flowlabel_exclusive; static inline struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label) { if (static_branch_unlikely(&ipv6_flowlabel_exclusive.key) && READ_ONCE(sock_net(sk)->ipv6.flowlabel_has_excl)) return __fl6_sock_lookup(sk, label) ? 
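/*
 * Illustrative use of txopt_get()/txopt_put() defined above: the getter
 * takes a reference under rcu_read_lock() (and returns NULL if the options
 * are concurrently being torn down), the putter drops it and frees the
 * structure via RCU once the last user is gone.  This helper is
 * hypothetical and exists only to show the expected call pattern.
 */
static inline void txopt_usage_example(struct ipv6_pinfo *np)
{
	struct ipv6_txoptions *opt;

	opt = txopt_get(np);		/* may return NULL */
	if (opt) {
		/* opt->hopopt, opt->srcrt, opt->dst1opt ... are stable here */
	}
	txopt_put(opt);			/* NULL-safe */
}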
: ERR_PTR(-ENOENT); return NULL; } #endif struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space, struct ip6_flowlabel *fl, struct ipv6_txoptions *fopt); void fl6_free_socklist(struct sock *sk); int ipv6_flowlabel_opt(struct sock *sk, sockptr_t optval, int optlen); int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, int flags); int ip6_flowlabel_init(void); void ip6_flowlabel_cleanup(void); bool ip6_autoflowlabel(struct net *net, const struct sock *sk); static inline void fl6_sock_release(struct ip6_flowlabel *fl) { if (fl) atomic_dec(&fl->users); } enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info); void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, struct icmp6hdr *thdr, int len); int ip6_ra_control(struct sock *sk, int sel); int ipv6_parse_hopopts(struct sk_buff *skb); struct ipv6_txoptions *ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt); struct ipv6_txoptions *ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, int newtype, struct ipv6_opt_hdr *newopt); struct ipv6_txoptions *__ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt); static inline struct ipv6_txoptions * ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt) { if (!opt) return NULL; return __ipv6_fixup_options(opt_space, opt); } bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, const struct inet6_skb_parm *opt); struct ipv6_txoptions *ipv6_update_options(struct sock *sk, struct ipv6_txoptions *opt); static inline bool ipv6_accept_ra(const struct inet6_dev *idev) { s32 accept_ra = READ_ONCE(idev->cnf.accept_ra); /* If forwarding is enabled, RA are not accepted unless the special * hybrid mode (accept_ra=2) is enabled. */ return READ_ONCE(idev->cnf.forwarding) ? accept_ra == 2 : accept_ra; } #define IPV6_FRAG_HIGH_THRESH (4 * 1024*1024) /* 4194304 */ #define IPV6_FRAG_LOW_THRESH (3 * 1024*1024) /* 3145728 */ #define IPV6_FRAG_TIMEOUT (60 * HZ) /* 60 seconds */ int __ipv6_addr_type(const struct in6_addr *addr); static inline int ipv6_addr_type(const struct in6_addr *addr) { return __ipv6_addr_type(addr) & 0xffff; } static inline int ipv6_addr_scope(const struct in6_addr *addr) { return __ipv6_addr_type(addr) & IPV6_ADDR_SCOPE_MASK; } static inline int __ipv6_addr_src_scope(int type) { return (type == IPV6_ADDR_ANY) ? __IPV6_ADDR_SCOPE_INVALID : (type >> 16); } static inline int ipv6_addr_src_scope(const struct in6_addr *addr) { return __ipv6_addr_src_scope(__ipv6_addr_type(addr)); } static inline bool __ipv6_addr_needs_scope_id(int type) { return type & IPV6_ADDR_LINKLOCAL || (type & IPV6_ADDR_MULTICAST && (type & (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL))); } static inline __u32 ipv6_iface_scope_id(const struct in6_addr *addr, int iface) { return __ipv6_addr_needs_scope_id(__ipv6_addr_type(addr)) ? 
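/*
 * Illustrative classification helper built on ipv6_addr_type(); it is not
 * a kernel function, it simply demonstrates how the IPV6_ADDR_* flag bits
 * above are meant to be tested.
 */
static inline const char *ipv6_addr_kind_example(const struct in6_addr *addr)
{
	int type = ipv6_addr_type(addr);

	if (type == IPV6_ADDR_ANY)
		return "unspecified (::)";
	if (type & IPV6_ADDR_LOOPBACK)
		return "loopback (::1)";
	if (type & IPV6_ADDR_MAPPED)
		return "IPv4-mapped (::ffff:a.b.c.d)";
	if (type & IPV6_ADDR_MULTICAST)
		return "multicast";
	if (type & IPV6_ADDR_LINKLOCAL)
		return "link-local unicast";
	if (type & IPV6_ADDR_SITELOCAL)
		return "site-local unicast";
	return "global unicast";
}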
iface : 0; } static inline int ipv6_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2) { return memcmp(a1, a2, sizeof(struct in6_addr)); } static inline bool ipv6_masked_addr_cmp(const struct in6_addr *a1, const struct in6_addr *m, const struct in6_addr *a2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul1 = (const unsigned long *)a1; const unsigned long *ulm = (const unsigned long *)m; const unsigned long *ul2 = (const unsigned long *)a2; return !!(((ul1[0] ^ ul2[0]) & ulm[0]) | ((ul1[1] ^ ul2[1]) & ulm[1])); #else return !!(((a1->s6_addr32[0] ^ a2->s6_addr32[0]) & m->s6_addr32[0]) | ((a1->s6_addr32[1] ^ a2->s6_addr32[1]) & m->s6_addr32[1]) | ((a1->s6_addr32[2] ^ a2->s6_addr32[2]) & m->s6_addr32[2]) | ((a1->s6_addr32[3] ^ a2->s6_addr32[3]) & m->s6_addr32[3])); #endif } static inline void ipv6_addr_prefix(struct in6_addr *pfx, const struct in6_addr *addr, int plen) { /* caller must guarantee 0 <= plen <= 128 */ int o = plen >> 3, b = plen & 0x7; memset(pfx->s6_addr, 0, sizeof(pfx->s6_addr)); memcpy(pfx->s6_addr, addr, o); if (b != 0) pfx->s6_addr[o] = addr->s6_addr[o] & (0xff00 >> b); } static inline void ipv6_addr_prefix_copy(struct in6_addr *addr, const struct in6_addr *pfx, int plen) { /* caller must guarantee 0 <= plen <= 128 */ int o = plen >> 3, b = plen & 0x7; memcpy(addr->s6_addr, pfx, o); if (b != 0) { addr->s6_addr[o] &= ~(0xff00 >> b); addr->s6_addr[o] |= (pfx->s6_addr[o] & (0xff00 >> b)); } } static inline void __ipv6_addr_set_half(__be32 *addr, __be32 wh, __be32 wl) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 #if defined(__BIG_ENDIAN) if (__builtin_constant_p(wh) && __builtin_constant_p(wl)) { *(__force u64 *)addr = ((__force u64)(wh) << 32 | (__force u64)(wl)); return; } #elif defined(__LITTLE_ENDIAN) if (__builtin_constant_p(wl) && __builtin_constant_p(wh)) { *(__force u64 *)addr = ((__force u64)(wl) << 32 | (__force u64)(wh)); return; } #endif #endif addr[0] = wh; addr[1] = wl; } static inline void ipv6_addr_set(struct in6_addr *addr, __be32 w1, __be32 w2, __be32 w3, __be32 w4) { __ipv6_addr_set_half(&addr->s6_addr32[0], w1, w2); __ipv6_addr_set_half(&addr->s6_addr32[2], w3, w4); } static inline bool ipv6_addr_equal(const struct in6_addr *a1, const struct in6_addr *a2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul1 = (const unsigned long *)a1; const unsigned long *ul2 = (const unsigned long *)a2; return ((ul1[0] ^ ul2[0]) | (ul1[1] ^ ul2[1])) == 0UL; #else return ((a1->s6_addr32[0] ^ a2->s6_addr32[0]) | (a1->s6_addr32[1] ^ a2->s6_addr32[1]) | (a1->s6_addr32[2] ^ a2->s6_addr32[2]) | (a1->s6_addr32[3] ^ a2->s6_addr32[3])) == 0; #endif } #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 static inline bool __ipv6_prefix_equal64_half(const __be64 *a1, const __be64 *a2, unsigned int len) { if (len && ((*a1 ^ *a2) & cpu_to_be64((~0UL) << (64 - len)))) return false; return true; } static inline bool ipv6_prefix_equal(const struct in6_addr *addr1, const struct in6_addr *addr2, unsigned int prefixlen) { const __be64 *a1 = (const __be64 *)addr1; const __be64 *a2 = (const __be64 *)addr2; if (prefixlen >= 64) { if (a1[0] ^ a2[0]) return false; return __ipv6_prefix_equal64_half(a1 + 1, a2 + 1, prefixlen - 64); } return __ipv6_prefix_equal64_half(a1, a2, prefixlen); } #else static inline bool ipv6_prefix_equal(const struct in6_addr *addr1, const struct in6_addr *addr2, unsigned int prefixlen) { const __be32 *a1 = 
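/*
 * Worked example for ipv6_addr_prefix()/ipv6_addr_prefix_copy() above:
 * for plen = 61, o = 61 >> 3 = 7 whole bytes are copied and b = 61 & 7 = 5
 * bits remain, so the partial byte is masked with (0xff00 >> 5) & 0xff =
 * 0xf8, i.e. byte 7 keeps its five most significant bits and bytes 8..15
 * are cleared.  ipv6_addr_prefix_copy() uses the same mask the other way
 * round and overwrites only the prefix bits, leaving the interface
 * identifier of the destination address untouched.
 */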
addr1->s6_addr32; const __be32 *a2 = addr2->s6_addr32; unsigned int pdw, pbi; /* check complete u32 in prefix */ pdw = prefixlen >> 5; if (pdw && memcmp(a1, a2, pdw << 2)) return false; /* check incomplete u32 in prefix */ pbi = prefixlen & 0x1f; if (pbi && ((a1[pdw] ^ a2[pdw]) & htonl((0xffffffff) << (32 - pbi)))) return false; return true; } #endif static inline bool ipv6_addr_any(const struct in6_addr *a) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul = (const unsigned long *)a; return (ul[0] | ul[1]) == 0UL; #else return (a->s6_addr32[0] | a->s6_addr32[1] | a->s6_addr32[2] | a->s6_addr32[3]) == 0; #endif } static inline u32 ipv6_addr_hash(const struct in6_addr *a) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul = (const unsigned long *)a; unsigned long x = ul[0] ^ ul[1]; return (u32)(x ^ (x >> 32)); #else return (__force u32)(a->s6_addr32[0] ^ a->s6_addr32[1] ^ a->s6_addr32[2] ^ a->s6_addr32[3]); #endif } /* more secured version of ipv6_addr_hash() */ static inline u32 __ipv6_addr_jhash(const struct in6_addr *a, const u32 initval) { return jhash2((__force const u32 *)a->s6_addr32, ARRAY_SIZE(a->s6_addr32), initval); } static inline bool ipv6_addr_loopback(const struct in6_addr *a) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const __be64 *be = (const __be64 *)a; return (be[0] | (be[1] ^ cpu_to_be64(1))) == 0UL; #else return (a->s6_addr32[0] | a->s6_addr32[1] | a->s6_addr32[2] | (a->s6_addr32[3] ^ cpu_to_be32(1))) == 0; #endif } /* * Note that we must __force cast these to unsigned long to make sparse happy, * since all of the endian-annotated types are fixed size regardless of arch. */ static inline bool ipv6_addr_v4mapped(const struct in6_addr *a) { return ( #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 *(unsigned long *)a | #else (__force unsigned long)(a->s6_addr32[0] | a->s6_addr32[1]) | #endif (__force unsigned long)(a->s6_addr32[2] ^ cpu_to_be32(0x0000ffff))) == 0UL; } static inline bool ipv6_addr_v4mapped_loopback(const struct in6_addr *a) { return ipv6_addr_v4mapped(a) && ipv4_is_loopback(a->s6_addr32[3]); } static inline u32 ipv6_portaddr_hash(const struct net *net, const struct in6_addr *addr6, unsigned int port) { unsigned int hash, mix = net_hash_mix(net); if (ipv6_addr_any(addr6)) hash = jhash_1word(0, mix); else if (ipv6_addr_v4mapped(addr6)) hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix); else hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix); return hash ^ port; } /* * Check for a RFC 4843 ORCHID address * (Overlay Routable Cryptographic Hash Identifiers) */ static inline bool ipv6_addr_orchid(const struct in6_addr *a) { return (a->s6_addr32[0] & htonl(0xfffffff0)) == htonl(0x20010010); } static inline bool ipv6_addr_is_multicast(const struct in6_addr *addr) { return (addr->s6_addr32[0] & htonl(0xFF000000)) == htonl(0xFF000000); } static inline void ipv6_addr_set_v4mapped(const __be32 addr, struct in6_addr *v4mapped) { ipv6_addr_set(v4mapped, 0, 0, htonl(0x0000FFFF), addr); } /* * find the first different bit between two addresses * length of address must be a multiple of 32bits */ static inline int __ipv6_addr_diff32(const void *token1, const void *token2, int addrlen) { const __be32 *a1 = token1, *a2 = token2; int i; addrlen >>= 2; for (i = 0; i < addrlen; i++) { __be32 xb = a1[i] ^ a2[i]; if (xb) return i * 32 + 31 - __fls(ntohl(xb)); } /* * we should *never* get to this point since 
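/*
 * Illustrative round trip through the v4-mapped helpers above; this is a
 * hypothetical function shown only to make the ::ffff:a.b.c.d layout
 * concrete.
 */
static inline bool v4mapped_roundtrip_example(__be32 v4addr)
{
	struct in6_addr mapped;

	/* Words are 0, 0, htonl(0x0000ffff), a.b.c.d. */
	ipv6_addr_set_v4mapped(v4addr, &mapped);

	/* ipv6_addr_v4mapped() recognises exactly this layout. */
	return ipv6_addr_v4mapped(&mapped);
}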
that * would mean the addrs are equal * * However, we do get to it 8) And exactly, when * addresses are equal 8) * * ip route add 1111::/128 via ... * ip route add 1111::/64 via ... * and we are here. * * Ideally, this function should stop comparison * at prefix length. It does not, but it is still OK, * if returned value is greater than prefix length. * --ANK (980803) */ return addrlen << 5; } #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 static inline int __ipv6_addr_diff64(const void *token1, const void *token2, int addrlen) { const __be64 *a1 = token1, *a2 = token2; int i; addrlen >>= 3; for (i = 0; i < addrlen; i++) { __be64 xb = a1[i] ^ a2[i]; if (xb) return i * 64 + 63 - __fls(be64_to_cpu(xb)); } return addrlen << 6; } #endif static inline int __ipv6_addr_diff(const void *token1, const void *token2, int addrlen) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 if (__builtin_constant_p(addrlen) && !(addrlen & 7)) return __ipv6_addr_diff64(token1, token2, addrlen); #endif return __ipv6_addr_diff32(token1, token2, addrlen); } static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_addr *a2) { return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr)); } __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr); __be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb); int ip6_dst_hoplimit(struct dst_entry *dst); static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6, struct dst_entry *dst) { int hlimit; if (ipv6_addr_is_multicast(&fl6->daddr)) hlimit = READ_ONCE(np->mcast_hops); else hlimit = READ_ONCE(np->hop_limit); if (hlimit < 0) hlimit = ip6_dst_hoplimit(dst); return hlimit; } /* copy IPv6 saddr & daddr to flow_keys, possibly using 64bit load/store * Equivalent to : flow->v6addrs.src = iph->saddr; * flow->v6addrs.dst = iph->daddr; */ static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow, const struct ipv6hdr *iph) { BUILD_BUG_ON(offsetof(typeof(flow->addrs), v6addrs.dst) != offsetof(typeof(flow->addrs), v6addrs.src) + sizeof(flow->addrs.v6addrs.src)); memcpy(&flow->addrs.v6addrs, &iph->addrs, sizeof(flow->addrs.v6addrs)); flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } #if IS_ENABLED(CONFIG_IPV6) static inline bool ipv6_can_nonlocal_bind(const struct net *net, const struct inet_sock *inet) { return READ_ONCE(net->ipv6.sysctl.ip_nonlocal_bind) || test_bit(INET_FLAGS_FREEBIND, &inet->inet_flags) || test_bit(INET_FLAGS_TRANSPARENT, &inet->inet_flags); } /* Sysctl settings for net ipv6.auto_flowlabels */ #define IP6_AUTO_FLOW_LABEL_OFF 0 #define IP6_AUTO_FLOW_LABEL_OPTOUT 1 #define IP6_AUTO_FLOW_LABEL_OPTIN 2 #define IP6_AUTO_FLOW_LABEL_FORCED 3 #define IP6_AUTO_FLOW_LABEL_MAX IP6_AUTO_FLOW_LABEL_FORCED #define IP6_DEFAULT_AUTO_FLOW_LABELS IP6_AUTO_FLOW_LABEL_OPTOUT static inline __be32 ip6_make_flowlabel(const struct net *net, struct sk_buff *skb, __be32 flowlabel, bool autolabel, struct flowi6 *fl6) { u8 auto_flowlabels; u32 hash; /* @flowlabel may include more than a flow label, eg, the traffic class. * Here we want only the flow label value. 
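/*
 * Worked example for __ipv6_addr_diff()/ipv6_addr_diff() above: the result
 * is the index, counted from the most significant bit of the address, of
 * the first bit in which the two addresses differ.  2001:db8::1 and
 * 2001:db8::2 agree on bits 0..125 and first differ at bit 126, so
 * ipv6_addr_diff() returns 126; two identical addresses return 128, as the
 * historical comment in __ipv6_addr_diff32() explains.
 */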
*/ flowlabel &= IPV6_FLOWLABEL_MASK; if (flowlabel) return flowlabel; auto_flowlabels = READ_ONCE(net->ipv6.sysctl.auto_flowlabels); if (auto_flowlabels == IP6_AUTO_FLOW_LABEL_OFF || (!autolabel && auto_flowlabels != IP6_AUTO_FLOW_LABEL_FORCED)) return flowlabel; hash = skb_get_hash_flowi6(skb, fl6); /* Since this is being sent on the wire obfuscate hash a bit * to minimize possibility that any useful information to an * attacker is leaked. Only lower 20 bits are relevant. */ hash = rol32(hash, 16); flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK; if (READ_ONCE(net->ipv6.sysctl.flowlabel_state_ranges)) flowlabel |= IPV6_FLOWLABEL_STATELESS_FLAG; return flowlabel; } static inline int ip6_default_np_autolabel(const struct net *net) { switch (READ_ONCE(net->ipv6.sysctl.auto_flowlabels)) { case IP6_AUTO_FLOW_LABEL_OFF: case IP6_AUTO_FLOW_LABEL_OPTIN: default: return 0; case IP6_AUTO_FLOW_LABEL_OPTOUT: case IP6_AUTO_FLOW_LABEL_FORCED: return 1; } } #else static inline __be32 ip6_make_flowlabel(const struct net *net, struct sk_buff *skb, __be32 flowlabel, bool autolabel, struct flowi6 *fl6) { return flowlabel; } static inline int ip6_default_np_autolabel(const struct net *net) { return 0; } #endif #if IS_ENABLED(CONFIG_IPV6) static inline int ip6_multipath_hash_policy(const struct net *net) { return READ_ONCE(net->ipv6.sysctl.multipath_hash_policy); } static inline u32 ip6_multipath_hash_fields(const struct net *net) { return READ_ONCE(net->ipv6.sysctl.multipath_hash_fields); } #else static inline int ip6_multipath_hash_policy(const struct net *net) { return 0; } static inline u32 ip6_multipath_hash_fields(const struct net *net) { return 0; } #endif /* * Header manipulation */ static inline void ip6_flow_hdr(struct ipv6hdr *hdr, unsigned int tclass, __be32 flowlabel) { *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | flowlabel; } static inline __be32 ip6_flowinfo(const struct ipv6hdr *hdr) { return *(__be32 *)hdr & IPV6_FLOWINFO_MASK; } static inline __be32 ip6_flowlabel(const struct ipv6hdr *hdr) { return *(__be32 *)hdr & IPV6_FLOWLABEL_MASK; } static inline u8 ip6_tclass(__be32 flowinfo) { return ntohl(flowinfo & IPV6_TCLASS_MASK) >> IPV6_TCLASS_SHIFT; } static inline dscp_t ip6_dscp(__be32 flowinfo) { return inet_dsfield_to_dscp(ip6_tclass(flowinfo)); } static inline __be32 ip6_make_flowinfo(unsigned int tclass, __be32 flowlabel) { return htonl(tclass << IPV6_TCLASS_SHIFT) | flowlabel; } static inline __be32 flowi6_get_flowlabel(const struct flowi6 *fl6) { return fl6->flowlabel & IPV6_FLOWLABEL_MASK; } /* * Prototypes exported by ipv6 */ /* * rcv function (called from netdevice level) */ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *orig_dev); int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb); /* * upper-layer output functions */ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority); int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr); int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, size_t length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, struct rt6_info *rt, unsigned int flags); int ip6_push_pending_frames(struct sock *sk); void ip6_flush_pending_frames(struct sock *sk); int ip6_send_skb(struct sk_buff 
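/*
 * Worked example for the header-manipulation helpers above: the first
 * 32 bits of the IPv6 header are version (4 bits) | traffic class (8 bits)
 * | flow label (20 bits).  With tclass = 0xb8 (DSCP 46, i.e. EF) and flow
 * label 0x12345, ip6_flow_hdr() stores htonl(0x60000000 | (0xb8 << 20)) |
 * flowlabel, i.e. the word 0x6b812345 in network byte order; ip6_tclass()
 * then recovers 0xb8 and ip6_flowlabel() recovers the 20 label bits.
 */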
*skb); struct sk_buff *__ip6_make_skb(struct sock *sk, struct sk_buff_head *queue, struct inet_cork_full *cork); struct sk_buff *ip6_make_skb(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, size_t length, int transhdrlen, struct ipcm6_cookie *ipc6, struct rt6_info *rt, unsigned int flags, struct inet_cork_full *cork); static inline struct sk_buff *ip6_finish_skb(struct sock *sk) { return __ip6_make_skb(sk, &sk->sk_write_queue, &inet_sk(sk)->cork); } int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6); #if IS_ENABLED(CONFIG_IPV6) struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst); #else static inline struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst) { return ERR_PTR(-EAFNOSUPPORT); } #endif struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst, bool connected); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *orig_dst); /* * skb processing functions */ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip6_forward(struct sk_buff *skb); int ip6_input(struct sk_buff *skb); int ip6_mc_input(struct sk_buff *skb); void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr, bool have_final); int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); int ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); /* * Extension header (options) processing */ u8 ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 proto, struct in6_addr **daddr_p, struct in6_addr *saddr); u8 ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 proto); int ipv6_skip_exthdr(const struct sk_buff *, int start, u8 *nexthdrp, __be16 *frag_offp); bool ipv6_ext_hdr(u8 nexthdr); enum { IP6_FH_F_FRAG = (1 << 0), IP6_FH_F_AUTH = (1 << 1), IP6_FH_F_SKIP_RH = (1 << 2), }; /* find specified header and get offset to it */ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, int target, unsigned short *fragoff, int *fragflg); int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type); struct in6_addr *__fl6_update_dst(struct flowi6 *fl6, const struct ipv6_txoptions *opt, struct in6_addr *orig); static inline struct in6_addr * fl6_update_dst(struct flowi6 *fl6, const struct ipv6_txoptions *opt, struct in6_addr *orig) { if (likely(!opt)) return NULL; return __fl6_update_dst(fl6, opt, orig); } /* * socket options (ipv6_sockglue.c) */ DECLARE_STATIC_KEY_FALSE(ip6_min_hopcount); int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int do_ipv6_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen); int ipv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int __ip6_datagram_connect(struct sock *sk, struct sockaddr_unsized *addr, int addr_len); int ip6_datagram_connect(struct sock *sk, struct sockaddr_unsized *addr, int addr_len); int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr_unsized *addr, int addr_len); int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr); void ip6_datagram_release_cb(struct sock 
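/*
 * Illustrative use of ipv6_skip_exthdr() declared above (hypothetical
 * helper; real callers live in the transport and netfilter code): starting
 * right behind the fixed header, it walks any extension headers and, on
 * success, returns the offset of the upper-layer header while updating the
 * protocol number in *nexthdrp and the fragment offset in *frag_offp.
 */
static inline int transport_offset_example(const struct sk_buff *skb)
{
	u8 nexthdr = ipv6_hdr(skb)->nexthdr;
	__be16 frag_off;

	return ipv6_skip_exthdr(skb,
				skb_network_offset(skb) + sizeof(struct ipv6hdr),
				&nexthdr, &frag_off);	/* < 0 on error */
}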
*sk); int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len); int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len); void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload); void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info); void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu); void inet6_cleanup_sock(struct sock *sk); void inet6_sock_destruct(struct sock *sk); int inet6_release(struct socket *sock); int __inet6_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len, u32 flags); int inet6_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len); int inet6_bind_sk(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int peer); int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int inet6_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int inet6_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk); int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size); int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); /* * reassembly.c */ extern const struct proto_ops inet6_stream_ops; extern const struct proto_ops inet6_dgram_ops; extern const struct proto_ops inet6_sockraw_ops; struct group_source_req; struct group_filter; int ip6_mc_source(int add, int omode, struct sock *sk, struct group_source_req *pgsr); int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, struct sockaddr_storage *list); int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, sockptr_t optval, size_t ss_offset); #ifdef CONFIG_PROC_FS int ac6_proc_init(struct net *net); void ac6_proc_exit(struct net *net); int raw6_proc_init(void); void raw6_proc_exit(void); int tcp6_proc_init(struct net *net); void tcp6_proc_exit(struct net *net); int udp6_proc_init(struct net *net); void udp6_proc_exit(struct net *net); int ipv6_misc_proc_init(void); void ipv6_misc_proc_exit(void); int snmp6_register_dev(struct inet6_dev *idev); int snmp6_unregister_dev(struct inet6_dev *idev); #else static inline int ac6_proc_init(struct net *net) { return 0; } static inline void ac6_proc_exit(struct net *net) { } static inline int snmp6_register_dev(struct inet6_dev *idev) { return 0; } static inline int snmp6_unregister_dev(struct inet6_dev *idev) { return 0; } #endif #ifdef CONFIG_SYSCTL struct ctl_table *ipv6_icmp_sysctl_init(struct net *net); size_t ipv6_icmp_sysctl_table_size(void); struct ctl_table *ipv6_route_sysctl_init(struct net *net); size_t ipv6_route_sysctl_table_size(struct net *net); int ipv6_sysctl_register(void); void ipv6_sysctl_unregister(void); #endif int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr); int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex, const struct in6_addr *addr, unsigned int mode); int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); static inline int ip6_sock_set_v6only(struct sock *sk) { int ret = 0; lock_sock(sk); if (inet_sk(sk)->inet_num) ret = -EINVAL; else sk->sk_ipv6only = true; release_sock(sk); return ret; } static inline void ip6_sock_set_recverr(struct sock *sk) { inet6_set_bit(RECVERR6, sk); } #define IPV6_PREFER_SRC_MASK (IPV6_PREFER_SRC_TMP | IPV6_PREFER_SRC_PUBLIC | \ IPV6_PREFER_SRC_COA) static inline int ip6_sock_set_addr_preferences(struct sock *sk, int val) { unsigned int prefmask = 
~IPV6_PREFER_SRC_MASK; unsigned int pref = 0; /* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */ switch (val & (IPV6_PREFER_SRC_PUBLIC | IPV6_PREFER_SRC_TMP | IPV6_PREFER_SRC_PUBTMP_DEFAULT)) { case IPV6_PREFER_SRC_PUBLIC: pref |= IPV6_PREFER_SRC_PUBLIC; prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | IPV6_PREFER_SRC_TMP); break; case IPV6_PREFER_SRC_TMP: pref |= IPV6_PREFER_SRC_TMP; prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | IPV6_PREFER_SRC_TMP); break; case IPV6_PREFER_SRC_PUBTMP_DEFAULT: prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | IPV6_PREFER_SRC_TMP); break; case 0: break; default: return -EINVAL; } /* check HOME/COA conflicts */ switch (val & (IPV6_PREFER_SRC_HOME | IPV6_PREFER_SRC_COA)) { case IPV6_PREFER_SRC_HOME: prefmask &= ~IPV6_PREFER_SRC_COA; break; case IPV6_PREFER_SRC_COA: pref |= IPV6_PREFER_SRC_COA; break; case 0: break; default: return -EINVAL; } /* check CGA/NONCGA conflicts */ switch (val & (IPV6_PREFER_SRC_CGA|IPV6_PREFER_SRC_NONCGA)) { case IPV6_PREFER_SRC_CGA: case IPV6_PREFER_SRC_NONCGA: case 0: break; default: return -EINVAL; } WRITE_ONCE(inet6_sk(sk)->srcprefs, (READ_ONCE(inet6_sk(sk)->srcprefs) & prefmask) | pref); return 0; } static inline void ip6_sock_set_recvpktinfo(struct sock *sk) { lock_sock(sk); inet6_sk(sk)->rxopt.bits.rxinfo = true; release_sock(sk); } #define IPV6_ADDR_WORDS 4 static inline void ipv6_addr_cpu_to_be32(__be32 *dst, const u32 *src) { cpu_to_be32_array(dst, src, IPV6_ADDR_WORDS); } static inline void ipv6_addr_be32_to_cpu(u32 *dst, const __be32 *src) { be32_to_cpu_array(dst, src, IPV6_ADDR_WORDS); } #endif /* _NET_IPV6_H */ |
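/*
 * Illustrative userspace counterpart of ip6_sock_set_addr_preferences()
 * above: applications select the same RFC 5014 source-address preferences
 * through the IPV6_ADDR_PREFERENCES socket option.  Sketch only, with
 * error handling omitted; the IPV6_PREFER_SRC_* constants come from the
 * libc headers or <linux/in6.h>.
 */
#include <netinet/in.h>
#include <sys/socket.h>

static int prefer_temporary_source_example(int fd)
{
	int val = IPV6_PREFER_SRC_TMP;	/* prefer privacy (temporary) addresses */

	return setsockopt(fd, IPPROTO_IPV6, IPV6_ADDR_PREFERENCES,
			  &val, sizeof(val));
}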
// SPDX-License-Identifier: GPL-2.0
/*
 * drivers/base/power/main.c - Where the driver meets power management.
 *
 * Copyright (c) 2003 Patrick Mochel
 * Copyright (c) 2003 Open Source Development Lab
 *
 * The driver model core calls device_pm_add() when a device is registered.
 * This will initialize the embedded device_pm_info object in the device
 * and add it to the list of power-controlled devices. sysfs entries for
 * controlling device power management will also be added.
 *
 * A separate list is used for keeping track of power info, because the power
 * domain dependencies may differ from the ancestral dependencies that the
 * subsystem list maintains.
 */

#define pr_fmt(fmt) "PM: " fmt
#define dev_fmt pr_fmt

#include <linux/device.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/pm.h>
#include <linux/pm_runtime.h>
#include <linux/pm-trace.h>
#include <linux/pm_wakeirq.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/async.h>
#include <linux/suspend.h>
#include <trace/events/power.h>
#include <linux/cpufreq.h>
#include <linux/devfreq.h>
#include <linux/thermal.h>
#include <linux/timer.h>
#include <linux/nmi.h>

#include "../base.h"
#include "power.h"

typedef int (*pm_callback_t)(struct device *);

/*
 * The entries in the dpm_list list are in a depth first order, simply
 * because children are guaranteed to be discovered after parents, and
 * are inserted at the back of the list on discovery.
 *
 * Since device_pm_add() may be called with a device lock held,
 * we must never try to acquire a device lock while holding
 * dpm_list_mutex.
 */
LIST_HEAD(dpm_list);
static LIST_HEAD(dpm_prepared_list);
static LIST_HEAD(dpm_suspended_list);
static LIST_HEAD(dpm_late_early_list);
static LIST_HEAD(dpm_noirq_list);

static DEFINE_MUTEX(dpm_list_mtx);
static pm_message_t pm_transition;

static DEFINE_MUTEX(async_wip_mtx);
static int async_error;

/**
 * pm_hibernate_is_recovering - Report whether hibernation is recovering from an error.
 *
 * Used to query whether dev_pm_ops.thaw() is being called for the normal
 * hibernation case or in order to recover from an error.
 *
 * Return: true for the error-recovery case, false for the normal case.
 */
bool pm_hibernate_is_recovering(void)
{
	return pm_transition.event == PM_EVENT_RECOVER;
}
EXPORT_SYMBOL_GPL(pm_hibernate_is_recovering);

static const char *pm_verb(int event)
{
	switch (event) {
	case PM_EVENT_SUSPEND: return "suspend";
	case PM_EVENT_RESUME: return "resume";
	case PM_EVENT_FREEZE: return "freeze";
	case PM_EVENT_QUIESCE: return "quiesce";
	case PM_EVENT_HIBERNATE: return "hibernate";
	case PM_EVENT_THAW: return "thaw";
	case PM_EVENT_RESTORE: return "restore";
	case PM_EVENT_RECOVER: return "recover";
	case PM_EVENT_POWEROFF: return "poweroff";
	default: return "(unknown PM event)";
	}
}

/**
 * device_pm_sleep_init - Initialize system suspend-related device fields.
 * @dev: Device object being initialized.
 */
void device_pm_sleep_init(struct device *dev)
{
	dev->power.is_prepared = false;
	dev->power.is_suspended = false;
	dev->power.is_noirq_suspended = false;
	dev->power.is_late_suspended = false;
	init_completion(&dev->power.completion);
	complete_all(&dev->power.completion);
	dev->power.wakeup = NULL;
	INIT_LIST_HEAD(&dev->power.entry);
}

/**
 * device_pm_lock - Lock the list of active devices used by the PM core.
*/ void device_pm_lock(void) { mutex_lock(&dpm_list_mtx); } /** * device_pm_unlock - Unlock the list of active devices used by the PM core. */ void device_pm_unlock(void) { mutex_unlock(&dpm_list_mtx); } /** * device_pm_add - Add a device to the PM core's list of active devices. * @dev: Device to add to the list. */ void device_pm_add(struct device *dev) { /* Skip PM setup/initialization. */ if (device_pm_not_required(dev)) return; pr_debug("Adding info for %s:%s\n", dev->bus ? dev->bus->name : "No Bus", dev_name(dev)); device_pm_check_callbacks(dev); mutex_lock(&dpm_list_mtx); if (dev->parent && dev->parent->power.is_prepared) dev_warn(dev, "parent %s should not be sleeping\n", dev_name(dev->parent)); list_add_tail(&dev->power.entry, &dpm_list); dev->power.in_dpm_list = true; mutex_unlock(&dpm_list_mtx); } /** * device_pm_remove - Remove a device from the PM core's list of active devices. * @dev: Device to be removed from the list. */ void device_pm_remove(struct device *dev) { if (device_pm_not_required(dev)) return; pr_debug("Removing info for %s:%s\n", dev->bus ? dev->bus->name : "No Bus", dev_name(dev)); complete_all(&dev->power.completion); mutex_lock(&dpm_list_mtx); list_del_init(&dev->power.entry); dev->power.in_dpm_list = false; mutex_unlock(&dpm_list_mtx); device_wakeup_disable(dev); pm_runtime_remove(dev); device_pm_check_callbacks(dev); } /** * device_pm_move_before - Move device in the PM core's list of active devices. * @deva: Device to move in dpm_list. * @devb: Device @deva should come before. */ void device_pm_move_before(struct device *deva, struct device *devb) { pr_debug("Moving %s:%s before %s:%s\n", deva->bus ? deva->bus->name : "No Bus", dev_name(deva), devb->bus ? devb->bus->name : "No Bus", dev_name(devb)); /* Delete deva from dpm_list and reinsert before devb. */ list_move_tail(&deva->power.entry, &devb->power.entry); } /** * device_pm_move_after - Move device in the PM core's list of active devices. * @deva: Device to move in dpm_list. * @devb: Device @deva should come after. */ void device_pm_move_after(struct device *deva, struct device *devb) { pr_debug("Moving %s:%s after %s:%s\n", deva->bus ? deva->bus->name : "No Bus", dev_name(deva), devb->bus ? devb->bus->name : "No Bus", dev_name(devb)); /* Delete deva from dpm_list and reinsert after devb. */ list_move(&deva->power.entry, &devb->power.entry); } /** * device_pm_move_last - Move device to end of the PM core's list of devices. * @dev: Device to move in dpm_list. */ void device_pm_move_last(struct device *dev) { pr_debug("Moving %s:%s to end of list\n", dev->bus ? dev->bus->name : "No Bus", dev_name(dev)); list_move_tail(&dev->power.entry, &dpm_list); } static ktime_t initcall_debug_start(struct device *dev, void *cb) { if (!pm_print_times_enabled) return 0; dev_info(dev, "calling %ps @ %i, parent: %s\n", cb, task_pid_nr(current), dev->parent ? dev_name(dev->parent) : "none"); return ktime_get(); } static void initcall_debug_report(struct device *dev, ktime_t calltime, void *cb, int error) { ktime_t rettime; if (!pm_print_times_enabled) return; rettime = ktime_get(); dev_info(dev, "%ps returned %d after %Ld usecs\n", cb, error, (unsigned long long)ktime_us_delta(rettime, calltime)); } /** * dpm_wait - Wait for a PM operation to complete. * @dev: Device to wait for. * @async: If unset, wait only if the device's power.async_suspend flag is set. 
*/ static void dpm_wait(struct device *dev, bool async) { if (!dev) return; if (async || (pm_async_enabled && dev->power.async_suspend)) wait_for_completion(&dev->power.completion); } static int dpm_wait_fn(struct device *dev, void *async_ptr) { dpm_wait(dev, *((bool *)async_ptr)); return 0; } static void dpm_wait_for_children(struct device *dev, bool async) { device_for_each_child(dev, &async, dpm_wait_fn); } static void dpm_wait_for_suppliers(struct device *dev, bool async) { struct device_link *link; int idx; idx = device_links_read_lock(); /* * If the supplier goes away right after we've checked the link to it, * we'll wait for its completion to change the state, but that's fine, * because the only things that will block as a result are the SRCU * callbacks freeing the link objects for the links in the list we're * walking. */ dev_for_each_link_to_supplier(link, dev) if (READ_ONCE(link->status) != DL_STATE_DORMANT && !device_link_flag_is_sync_state_only(link->flags)) dpm_wait(link->supplier, async); device_links_read_unlock(idx); } static bool dpm_wait_for_superior(struct device *dev, bool async) { struct device *parent; /* * If the device is resumed asynchronously and the parent's callback * deletes both the device and the parent itself, the parent object may * be freed while this function is running, so avoid that by reference * counting the parent once more unless the device has been deleted * already (in which case return right away). */ mutex_lock(&dpm_list_mtx); if (!device_pm_initialized(dev)) { mutex_unlock(&dpm_list_mtx); return false; } parent = get_device(dev->parent); mutex_unlock(&dpm_list_mtx); dpm_wait(parent, async); put_device(parent); dpm_wait_for_suppliers(dev, async); /* * If the parent's callback has deleted the device, attempting to resume * it would be invalid, so avoid doing that then. */ return device_pm_initialized(dev); } static void dpm_wait_for_consumers(struct device *dev, bool async) { struct device_link *link; int idx; idx = device_links_read_lock(); /* * The status of a device link can only be changed from "dormant" by a * probe, but that cannot happen during system suspend/resume. In * theory it can change to "dormant" at that time, but then it is * reasonable to wait for the target device anyway (eg. if it goes * away, it's better to wait for it to go away completely and then * continue instead of trying to continue in parallel with its * unregistration). */ dev_for_each_link_to_consumer(link, dev) if (READ_ONCE(link->status) != DL_STATE_DORMANT && !device_link_flag_is_sync_state_only(link->flags)) dpm_wait(link->consumer, async); device_links_read_unlock(idx); } static void dpm_wait_for_subordinate(struct device *dev, bool async) { dpm_wait_for_children(dev, async); dpm_wait_for_consumers(dev, async); } /** * pm_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. * @state: PM transition of the system being carried out. 
*/ static pm_callback_t pm_op(const struct dev_pm_ops *ops, pm_message_t state) { switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: return ops->suspend; case PM_EVENT_RESUME: return ops->resume; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze; case PM_EVENT_POWEROFF: case PM_EVENT_HIBERNATE: return ops->poweroff; case PM_EVENT_THAW: case PM_EVENT_RECOVER: return ops->thaw; case PM_EVENT_RESTORE: return ops->restore; #endif /* CONFIG_HIBERNATE_CALLBACKS */ } return NULL; } /** * pm_late_early_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. * @state: PM transition of the system being carried out. * * Runtime PM is disabled for @dev while this function is being executed. */ static pm_callback_t pm_late_early_op(const struct dev_pm_ops *ops, pm_message_t state) { switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: return ops->suspend_late; case PM_EVENT_RESUME: return ops->resume_early; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze_late; case PM_EVENT_POWEROFF: case PM_EVENT_HIBERNATE: return ops->poweroff_late; case PM_EVENT_THAW: case PM_EVENT_RECOVER: return ops->thaw_early; case PM_EVENT_RESTORE: return ops->restore_early; #endif /* CONFIG_HIBERNATE_CALLBACKS */ } return NULL; } /** * pm_noirq_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. * @state: PM transition of the system being carried out. * * The driver of @dev will not receive interrupts while this function is being * executed. */ static pm_callback_t pm_noirq_op(const struct dev_pm_ops *ops, pm_message_t state) { switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: return ops->suspend_noirq; case PM_EVENT_RESUME: return ops->resume_noirq; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze_noirq; case PM_EVENT_POWEROFF: case PM_EVENT_HIBERNATE: return ops->poweroff_noirq; case PM_EVENT_THAW: case PM_EVENT_RECOVER: return ops->thaw_noirq; case PM_EVENT_RESTORE: return ops->restore_noirq; #endif /* CONFIG_HIBERNATE_CALLBACKS */ } return NULL; } static void pm_dev_dbg(struct device *dev, pm_message_t state, const char *info) { dev_dbg(dev, "%s%s%s driver flags: %x\n", info, pm_verb(state.event), ((state.event & PM_EVENT_SLEEP) && device_may_wakeup(dev)) ? ", may wakeup" : "", dev->power.driver_flags); } static void pm_dev_err(struct device *dev, pm_message_t state, const char *info, int error) { dev_err(dev, "failed to %s%s: error %d\n", pm_verb(state.event), info, error); } static void dpm_show_time(ktime_t starttime, pm_message_t state, int error, const char *info) { ktime_t calltime; u64 usecs64; int usecs; calltime = ktime_get(); usecs64 = ktime_to_ns(ktime_sub(calltime, starttime)); do_div(usecs64, NSEC_PER_USEC); usecs = usecs64; if (usecs == 0) usecs = 1; pm_pr_dbg("%s%s%s of devices %s after %ld.%03ld msecs\n", info ?: "", info ? " " : "", pm_verb(state.event), error ? 
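/*
 * Illustrative driver-side counterpart of the selectors above (pm_op(),
 * pm_late_early_op(), pm_noirq_op()): they merely pick the dev_pm_ops
 * member that matches the current phase and event.  The foo_* callbacks
 * below are hypothetical stubs, shown only to make the mapping explicit;
 * the late/early and hibernation members (.suspend_late, .freeze, ...)
 * are selected in exactly the same way.
 */
static int foo_suspend(struct device *dev) { return 0; }
static int foo_resume(struct device *dev) { return 0; }
static int foo_suspend_noirq(struct device *dev) { return 0; }
static int foo_resume_noirq(struct device *dev) { return 0; }

static const struct dev_pm_ops foo_pm_ops = {
	.suspend	= foo_suspend,		/* returned by pm_op() */
	.resume		= foo_resume,
	.suspend_noirq	= foo_suspend_noirq,	/* returned by pm_noirq_op() */
	.resume_noirq	= foo_resume_noirq,
};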
"aborted" : "complete", usecs / USEC_PER_MSEC, usecs % USEC_PER_MSEC); } static int dpm_run_callback(pm_callback_t cb, struct device *dev, pm_message_t state, const char *info) { ktime_t calltime; int error; if (!cb) return 0; calltime = initcall_debug_start(dev, cb); pm_dev_dbg(dev, state, info); trace_device_pm_callback_start(dev, info, state.event); error = cb(dev); trace_device_pm_callback_end(dev, error); suspend_report_result(dev, cb, error); initcall_debug_report(dev, calltime, cb, error); return error; } #ifdef CONFIG_DPM_WATCHDOG struct dpm_watchdog { struct device *dev; struct task_struct *tsk; struct timer_list timer; bool fatal; }; #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) \ struct dpm_watchdog wd static bool __read_mostly dpm_watchdog_all_cpu_backtrace; module_param(dpm_watchdog_all_cpu_backtrace, bool, 0644); MODULE_PARM_DESC(dpm_watchdog_all_cpu_backtrace, "Backtrace all CPUs on DPM watchdog timeout"); /** * dpm_watchdog_handler - Driver suspend / resume watchdog handler. * @t: The timer that PM watchdog depends on. * * Called when a driver has timed out suspending or resuming. * There's not much we can do here to recover so panic() to * capture a crash-dump in pstore. */ static void dpm_watchdog_handler(struct timer_list *t) { struct dpm_watchdog *wd = timer_container_of(wd, t, timer); struct timer_list *timer = &wd->timer; unsigned int time_left; if (wd->fatal) { unsigned int this_cpu = smp_processor_id(); dev_emerg(wd->dev, "**** DPM device timeout ****\n"); show_stack(wd->tsk, NULL, KERN_EMERG); if (dpm_watchdog_all_cpu_backtrace) trigger_allbutcpu_cpu_backtrace(this_cpu); panic("%s %s: unrecoverable failure\n", dev_driver_string(wd->dev), dev_name(wd->dev)); } time_left = CONFIG_DPM_WATCHDOG_TIMEOUT - CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT; dev_warn(wd->dev, "**** DPM device timeout after %u seconds; %u seconds until panic ****\n", CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT, time_left); show_stack(wd->tsk, NULL, KERN_WARNING); wd->fatal = true; mod_timer(timer, jiffies + HZ * time_left); } /** * dpm_watchdog_set - Enable pm watchdog for given device. * @wd: Watchdog. Must be allocated on the stack. * @dev: Device to handle. */ static void dpm_watchdog_set(struct dpm_watchdog *wd, struct device *dev) { struct timer_list *timer = &wd->timer; wd->dev = dev; wd->tsk = current; wd->fatal = CONFIG_DPM_WATCHDOG_TIMEOUT == CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT; timer_setup_on_stack(timer, dpm_watchdog_handler, 0); /* use same timeout value for both suspend and resume */ timer->expires = jiffies + HZ * CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT; add_timer(timer); } /** * dpm_watchdog_clear - Disable suspend/resume watchdog. * @wd: Watchdog to disable. */ static void dpm_watchdog_clear(struct dpm_watchdog *wd) { struct timer_list *timer = &wd->timer; timer_delete_sync(timer); timer_destroy_on_stack(timer); } #else #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) #define dpm_watchdog_set(x, y) #define dpm_watchdog_clear(x) #endif /*------------------------- Resume routines -------------------------*/ /** * dev_pm_skip_resume - System-wide device resume optimization check. * @dev: Target device. * * Return: * - %false if the transition under way is RESTORE. * - Return value of dev_pm_skip_suspend() if the transition under way is THAW. * - The logical negation of %power.must_resume otherwise (that is, when the * transition under way is RESUME). 
*/ bool dev_pm_skip_resume(struct device *dev) { if (pm_transition.event == PM_EVENT_RESTORE) return false; if (pm_transition.event == PM_EVENT_THAW) return dev_pm_skip_suspend(dev); return !dev->power.must_resume; } static bool is_async(struct device *dev) { return dev->power.async_suspend && pm_async_enabled && !pm_trace_is_enabled(); } static bool __dpm_async(struct device *dev, async_func_t func) { if (dev->power.work_in_progress) return true; if (!is_async(dev)) return false; dev->power.work_in_progress = true; get_device(dev); if (async_schedule_dev_nocall(func, dev)) return true; put_device(dev); return false; } static bool dpm_async_fn(struct device *dev, async_func_t func) { guard(mutex)(&async_wip_mtx); return __dpm_async(dev, func); } static int dpm_async_with_cleanup(struct device *dev, void *fn) { guard(mutex)(&async_wip_mtx); if (!__dpm_async(dev, fn)) dev->power.work_in_progress = false; return 0; } static void dpm_async_resume_children(struct device *dev, async_func_t func) { /* * Prevent racing with dpm_clear_async_state() during initial list * walks in dpm_noirq_resume_devices(), dpm_resume_early(), and * dpm_resume(). */ guard(mutex)(&dpm_list_mtx); /* * Start processing "async" children of the device unless it's been * started already for them. */ device_for_each_child(dev, func, dpm_async_with_cleanup); } static void dpm_async_resume_subordinate(struct device *dev, async_func_t func) { struct device_link *link; int idx; dpm_async_resume_children(dev, func); idx = device_links_read_lock(); /* Start processing the device's "async" consumers. */ dev_for_each_link_to_consumer(link, dev) if (READ_ONCE(link->status) != DL_STATE_DORMANT) dpm_async_with_cleanup(link->consumer, func); device_links_read_unlock(idx); } static void dpm_clear_async_state(struct device *dev) { reinit_completion(&dev->power.completion); dev->power.work_in_progress = false; } static bool dpm_root_device(struct device *dev) { lockdep_assert_held(&dpm_list_mtx); /* * Since this function is required to run under dpm_list_mtx, the * list_empty() below will only return true if the device's list of * consumers is actually empty before calling it. */ return !dev->parent && list_empty(&dev->links.suppliers); } static void async_resume_noirq(void *data, async_cookie_t cookie); /** * device_resume_noirq - Execute a "noirq resume" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being resumed asynchronously. * * The driver of @dev will not receive interrupts while this function is being * executed. */ static void device_resume_noirq(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; bool skip_resume; int error = 0; TRACE_DEVICE(dev); TRACE_RESUME(0); if (dev->power.syscore || dev->power.direct_complete) goto Out; if (!dev->power.is_noirq_suspended) { /* * This means that system suspend has been aborted in the noirq * phase before invoking the noirq suspend callback for the * device, so if device_suspend_late() has left it in suspend, * device_resume_early() should leave it in suspend either in * case the early resume of it depends on the noirq resume that * has not run. 
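/*
 * Illustrative opt-in for the asynchronous path handled by is_async() and
 * the dpm_async_* helpers above: a driver marks its device, typically from
 * probe, with device_enable_async_suspend(), and the PM core will then
 * suspend/resume it in an async thread, ordered only by parent/child and
 * device-link dependencies.  The global switch is /sys/power/pm_async.
 * foo_probe() is a hypothetical example, not a function in this file.
 */
static int foo_probe(struct device *dev)
{
	device_enable_async_suspend(dev);
	return 0;
}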
*/ if (dev_pm_skip_suspend(dev)) dev->power.must_resume = false; goto Out; } if (!dpm_wait_for_superior(dev, async)) goto Out; skip_resume = dev_pm_skip_resume(dev); /* * If the driver callback is skipped below or by the middle layer * callback and device_resume_early() also skips the driver callback for * this device later, it needs to appear as "suspended" to PM-runtime, * so change its status accordingly. * * Otherwise, the device is going to be resumed, so set its PM-runtime * status to "active" unless its power.smart_suspend flag is clear, in * which case it is not necessary to update its PM-runtime status. */ if (skip_resume) pm_runtime_set_suspended(dev); else if (dev_pm_smart_suspend(dev)) pm_runtime_set_active(dev); if (dev->pm_domain) { info = "noirq power domain "; callback = pm_noirq_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "noirq type "; callback = pm_noirq_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "noirq class "; callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "noirq bus "; callback = pm_noirq_op(dev->bus->pm, state); } if (callback) goto Run; if (skip_resume) goto Skip; if (dev->driver && dev->driver->pm) { info = "noirq driver "; callback = pm_noirq_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); Skip: dev->power.is_noirq_suspended = false; Out: complete_all(&dev->power.completion); TRACE_RESUME(error); if (error) { WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async noirq" : " noirq", error); } dpm_async_resume_subordinate(dev, async_resume_noirq); } static void async_resume_noirq(void *data, async_cookie_t cookie) { struct device *dev = data; device_resume_noirq(dev, pm_transition, true); put_device(dev); } static void dpm_noirq_resume_devices(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, true); async_error = 0; pm_transition = state; mutex_lock(&dpm_list_mtx); /* * Start processing "async" root devices upfront so they don't wait for * the "sync" devices they don't depend on. */ list_for_each_entry(dev, &dpm_noirq_list, power.entry) { dpm_clear_async_state(dev); if (dpm_root_device(dev)) dpm_async_with_cleanup(dev, async_resume_noirq); } while (!list_empty(&dpm_noirq_list)) { dev = to_device(dpm_noirq_list.next); list_move_tail(&dev->power.entry, &dpm_late_early_list); if (!dpm_async_fn(dev, async_resume_noirq)) { get_device(dev); mutex_unlock(&dpm_list_mtx); device_resume_noirq(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, "noirq"); if (READ_ONCE(async_error)) dpm_save_failed_step(SUSPEND_RESUME_NOIRQ); trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, false); } /** * dpm_resume_noirq - Execute "noirq resume" callbacks for all devices. * @state: PM transition of the system being carried out. * * Invoke the "noirq" resume callbacks for all devices in dpm_noirq_list and * allow device drivers' interrupt handlers to be called. */ void dpm_resume_noirq(pm_message_t state) { dpm_noirq_resume_devices(state); resume_device_irqs(); device_wakeup_disarm_wake_irqs(); } static void async_resume_early(void *data, async_cookie_t cookie); /** * device_resume_early - Execute an "early resume" callback for given device. * @dev: Device to handle. 
* @state: PM transition of the system being carried out. * @async: If true, the device is being resumed asynchronously. * * Runtime PM is disabled for @dev while this function is being executed. */ static void device_resume_early(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; TRACE_DEVICE(dev); TRACE_RESUME(0); if (dev->power.direct_complete) goto Out; if (!dev->power.is_late_suspended) goto Out; if (dev->power.syscore) goto Skip; if (!dpm_wait_for_superior(dev, async)) goto Out; if (dev->pm_domain) { info = "early power domain "; callback = pm_late_early_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "early type "; callback = pm_late_early_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "early class "; callback = pm_late_early_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "early bus "; callback = pm_late_early_op(dev->bus->pm, state); } if (callback) goto Run; if (dev_pm_skip_resume(dev)) goto Skip; if (dev->driver && dev->driver->pm) { info = "early driver "; callback = pm_late_early_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); Skip: dev->power.is_late_suspended = false; pm_runtime_enable(dev); Out: TRACE_RESUME(error); complete_all(&dev->power.completion); if (error) { WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async early" : " early", error); } dpm_async_resume_subordinate(dev, async_resume_early); } static void async_resume_early(void *data, async_cookie_t cookie) { struct device *dev = data; device_resume_early(dev, pm_transition, true); put_device(dev); } /** * dpm_resume_early - Execute "early resume" callbacks for all devices. * @state: PM transition of the system being carried out. */ void dpm_resume_early(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); trace_suspend_resume(TPS("dpm_resume_early"), state.event, true); async_error = 0; pm_transition = state; mutex_lock(&dpm_list_mtx); /* * Start processing "async" root devices upfront so they don't wait for * the "sync" devices they don't depend on. */ list_for_each_entry(dev, &dpm_late_early_list, power.entry) { dpm_clear_async_state(dev); if (dpm_root_device(dev)) dpm_async_with_cleanup(dev, async_resume_early); } while (!list_empty(&dpm_late_early_list)) { dev = to_device(dpm_late_early_list.next); list_move_tail(&dev->power.entry, &dpm_suspended_list); if (!dpm_async_fn(dev, async_resume_early)) { get_device(dev); mutex_unlock(&dpm_list_mtx); device_resume_early(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, "early"); if (READ_ONCE(async_error)) dpm_save_failed_step(SUSPEND_RESUME_EARLY); trace_suspend_resume(TPS("dpm_resume_early"), state.event, false); } /** * dpm_resume_start - Execute "noirq" and "early" device callbacks. * @state: PM transition of the system being carried out. */ void dpm_resume_start(pm_message_t state) { dpm_resume_noirq(state); dpm_resume_early(state); } EXPORT_SYMBOL_GPL(dpm_resume_start); static void async_resume(void *data, async_cookie_t cookie); /** * device_resume - Execute "resume" callbacks for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being resumed asynchronously. 
*/ static void device_resume(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; DECLARE_DPM_WATCHDOG_ON_STACK(wd); TRACE_DEVICE(dev); TRACE_RESUME(0); if (dev->power.syscore) goto Complete; if (!dev->power.is_suspended) goto Complete; dev->power.is_suspended = false; if (dev->power.direct_complete) { /* * Allow new children to be added under the device after this * point if it has no PM callbacks. */ if (dev->power.no_pm_callbacks) dev->power.is_prepared = false; /* Match the pm_runtime_disable() in device_suspend(). */ pm_runtime_enable(dev); goto Complete; } if (!dpm_wait_for_superior(dev, async)) goto Complete; dpm_watchdog_set(&wd, dev); device_lock(dev); /* * This is a fib. But we'll allow new children to be added below * a resumed device, even if the device hasn't been completed yet. */ dev->power.is_prepared = false; if (dev->pm_domain) { info = "power domain "; callback = pm_op(&dev->pm_domain->ops, state); goto Driver; } if (dev->type && dev->type->pm) { info = "type "; callback = pm_op(dev->type->pm, state); goto Driver; } if (dev->class && dev->class->pm) { info = "class "; callback = pm_op(dev->class->pm, state); goto Driver; } if (dev->bus) { if (dev->bus->pm) { info = "bus "; callback = pm_op(dev->bus->pm, state); } else if (dev->bus->resume) { info = "legacy bus "; callback = dev->bus->resume; goto End; } } Driver: if (!callback && dev->driver && dev->driver->pm) { info = "driver "; callback = pm_op(dev->driver->pm, state); } End: error = dpm_run_callback(callback, dev, state, info); device_unlock(dev); dpm_watchdog_clear(&wd); Complete: complete_all(&dev->power.completion); TRACE_RESUME(error); if (error) { WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async" : "", error); } dpm_async_resume_subordinate(dev, async_resume); } static void async_resume(void *data, async_cookie_t cookie) { struct device *dev = data; device_resume(dev, pm_transition, true); put_device(dev); } /** * dpm_resume - Execute "resume" callbacks for non-sysdev devices. * @state: PM transition of the system being carried out. * * Execute the appropriate "resume" callback for all devices whose status * indicates that they are suspended. */ void dpm_resume(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); trace_suspend_resume(TPS("dpm_resume"), state.event, true); pm_transition = state; async_error = 0; mutex_lock(&dpm_list_mtx); /* * Start processing "async" root devices upfront so they don't wait for * the "sync" devices they don't depend on. */ list_for_each_entry(dev, &dpm_suspended_list, power.entry) { dpm_clear_async_state(dev); if (dpm_root_device(dev)) dpm_async_with_cleanup(dev, async_resume); } while (!list_empty(&dpm_suspended_list)) { dev = to_device(dpm_suspended_list.next); list_move_tail(&dev->power.entry, &dpm_prepared_list); if (!dpm_async_fn(dev, async_resume)) { get_device(dev); mutex_unlock(&dpm_list_mtx); device_resume(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, NULL); if (READ_ONCE(async_error)) dpm_save_failed_step(SUSPEND_RESUME); cpufreq_resume(); devfreq_resume(); trace_suspend_resume(TPS("dpm_resume"), state.event, false); } /** * device_complete - Complete a PM transition for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. 
*/ static void device_complete(struct device *dev, pm_message_t state) { void (*callback)(struct device *) = NULL; const char *info = NULL; if (dev->power.syscore) goto out; device_lock(dev); if (dev->pm_domain) { info = "completing power domain "; callback = dev->pm_domain->ops.complete; } else if (dev->type && dev->type->pm) { info = "completing type "; callback = dev->type->pm->complete; } else if (dev->class && dev->class->pm) { info = "completing class "; callback = dev->class->pm->complete; } else if (dev->bus && dev->bus->pm) { info = "completing bus "; callback = dev->bus->pm->complete; } if (!callback && dev->driver && dev->driver->pm) { info = "completing driver "; callback = dev->driver->pm->complete; } if (callback) { pm_dev_dbg(dev, state, info); callback(dev); } device_unlock(dev); out: /* If enabling runtime PM for the device is blocked, unblock it. */ pm_runtime_unblock(dev); pm_runtime_put(dev); } /** * dpm_complete - Complete a PM transition for all non-sysdev devices. * @state: PM transition of the system being carried out. * * Execute the ->complete() callbacks for all devices whose PM status is not * DPM_ON (this allows new devices to be registered). */ void dpm_complete(pm_message_t state) { struct list_head list; trace_suspend_resume(TPS("dpm_complete"), state.event, true); INIT_LIST_HEAD(&list); mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_prepared_list)) { struct device *dev = to_device(dpm_prepared_list.prev); get_device(dev); dev->power.is_prepared = false; list_move(&dev->power.entry, &list); mutex_unlock(&dpm_list_mtx); trace_device_pm_callback_start(dev, "", state.event); device_complete(dev, state); trace_device_pm_callback_end(dev, 0); put_device(dev); mutex_lock(&dpm_list_mtx); } list_splice(&list, &dpm_list); mutex_unlock(&dpm_list_mtx); /* Start resuming thermal control */ thermal_pm_complete(); /* Allow device probing and trigger re-probing of deferred devices */ device_unblock_probing(); trace_suspend_resume(TPS("dpm_complete"), state.event, false); } /** * dpm_resume_end - Execute "resume" callbacks and complete system transition. * @state: PM transition of the system being carried out. * * Execute "resume" callbacks for all devices and complete the PM transition of * the system. */ void dpm_resume_end(pm_message_t state) { dpm_resume(state); pm_restore_gfp_mask(); dpm_complete(state); } EXPORT_SYMBOL_GPL(dpm_resume_end); /*------------------------- Suspend routines -------------------------*/ static bool dpm_leaf_device(struct device *dev) { struct device *child; lockdep_assert_held(&dpm_list_mtx); child = device_find_any_child(dev); if (child) { put_device(child); return false; } /* * Since this function is required to run under dpm_list_mtx, the * list_empty() below will only return true if the device's list of * consumers is actually empty before calling it. */ return list_empty(&dev->links.consumers); } static bool dpm_async_suspend_parent(struct device *dev, async_func_t func) { guard(mutex)(&dpm_list_mtx); /* * If the device is suspended asynchronously and the parent's callback * deletes both the device and the parent itself, the parent object may * be freed while this function is running, so avoid that by checking * if the device has been deleted already as the parent cannot be * deleted before it. */ if (!device_pm_initialized(dev)) return false; /* Start processing the device's parent if it is "async". 
*/ if (dev->parent) dpm_async_with_cleanup(dev->parent, func); return true; } static void dpm_async_suspend_superior(struct device *dev, async_func_t func) { struct device_link *link; int idx; if (!dpm_async_suspend_parent(dev, func)) return; idx = device_links_read_lock(); /* Start processing the device's "async" suppliers. */ dev_for_each_link_to_supplier(link, dev) if (READ_ONCE(link->status) != DL_STATE_DORMANT) dpm_async_with_cleanup(link->supplier, func); device_links_read_unlock(idx); } static void dpm_async_suspend_complete_all(struct list_head *device_list) { struct device *dev; guard(mutex)(&async_wip_mtx); list_for_each_entry_reverse(dev, device_list, power.entry) { /* * In case the device is being waited for and async processing * has not started for it yet, let the waiters make progress. */ if (!dev->power.work_in_progress) complete_all(&dev->power.completion); } } /** * resume_event - Return a "resume" message for given "suspend" sleep state. * @sleep_state: PM message representing a sleep state. * * Return a PM message representing the resume event corresponding to given * sleep state. */ static pm_message_t resume_event(pm_message_t sleep_state) { switch (sleep_state.event) { case PM_EVENT_SUSPEND: return PMSG_RESUME; case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return PMSG_RECOVER; case PM_EVENT_HIBERNATE: return PMSG_RESTORE; } return PMSG_ON; } static void dpm_superior_set_must_resume(struct device *dev) { struct device_link *link; int idx; if (dev->parent) dev->parent->power.must_resume = true; idx = device_links_read_lock(); dev_for_each_link_to_supplier(link, dev) link->supplier->power.must_resume = true; device_links_read_unlock(idx); } static void async_suspend_noirq(void *data, async_cookie_t cookie); /** * device_suspend_noirq - Execute a "noirq suspend" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. * * The driver of @dev will not receive interrupts while this function is being * executed. */ static void device_suspend_noirq(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; TRACE_DEVICE(dev); TRACE_SUSPEND(0); dpm_wait_for_subordinate(dev, async); if (READ_ONCE(async_error)) goto Complete; if (dev->power.syscore || dev->power.direct_complete) goto Complete; if (dev->pm_domain) { info = "noirq power domain "; callback = pm_noirq_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "noirq type "; callback = pm_noirq_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "noirq class "; callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "noirq bus "; callback = pm_noirq_op(dev->bus->pm, state); } if (callback) goto Run; if (dev_pm_skip_suspend(dev)) goto Skip; if (dev->driver && dev->driver->pm) { info = "noirq driver "; callback = pm_noirq_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); if (error) { WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? 
" async noirq" : " noirq", error); goto Complete; } Skip: dev->power.is_noirq_suspended = true; /* * Devices must be resumed unless they are explicitly allowed to be left * in suspend, but even in that case skipping the resume of devices that * were in use right before the system suspend (as indicated by their * runtime PM usage counters and child counters) would be suboptimal. */ if (!(dev_pm_test_driver_flags(dev, DPM_FLAG_MAY_SKIP_RESUME) && dev->power.may_skip_resume) || !pm_runtime_need_not_resume(dev)) dev->power.must_resume = true; if (dev->power.must_resume) dpm_superior_set_must_resume(dev); Complete: complete_all(&dev->power.completion); TRACE_SUSPEND(error); if (error || READ_ONCE(async_error)) return; dpm_async_suspend_superior(dev, async_suspend_noirq); } static void async_suspend_noirq(void *data, async_cookie_t cookie) { struct device *dev = data; device_suspend_noirq(dev, pm_transition, true); put_device(dev); } static int dpm_noirq_suspend_devices(pm_message_t state) { ktime_t starttime = ktime_get(); struct device *dev; int error; trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, true); pm_transition = state; async_error = 0; mutex_lock(&dpm_list_mtx); /* * Start processing "async" leaf devices upfront so they don't need to * wait for the "sync" devices they don't depend on. */ list_for_each_entry_reverse(dev, &dpm_late_early_list, power.entry) { dpm_clear_async_state(dev); if (dpm_leaf_device(dev)) dpm_async_with_cleanup(dev, async_suspend_noirq); } while (!list_empty(&dpm_late_early_list)) { dev = to_device(dpm_late_early_list.prev); list_move(&dev->power.entry, &dpm_noirq_list); if (dpm_async_fn(dev, async_suspend_noirq)) continue; get_device(dev); mutex_unlock(&dpm_list_mtx); device_suspend_noirq(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); if (READ_ONCE(async_error)) { dpm_async_suspend_complete_all(&dpm_late_early_list); /* * Move all devices to the target list to resume them * properly. */ list_splice_init(&dpm_late_early_list, &dpm_noirq_list); break; } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); error = READ_ONCE(async_error); if (error) dpm_save_failed_step(SUSPEND_SUSPEND_NOIRQ); dpm_show_time(starttime, state, error, "noirq"); trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, false); return error; } /** * dpm_suspend_noirq - Execute "noirq suspend" callbacks for all devices. * @state: PM transition of the system being carried out. * * Prevent device drivers' interrupt handlers from being called and invoke * "noirq" suspend callbacks for all non-sysdev devices. */ int dpm_suspend_noirq(pm_message_t state) { int ret; device_wakeup_arm_wake_irqs(); suspend_device_irqs(); ret = dpm_noirq_suspend_devices(state); if (ret) dpm_resume_noirq(resume_event(state)); return ret; } static void dpm_propagate_wakeup_to_parent(struct device *dev) { struct device *parent = dev->parent; if (!parent) return; spin_lock_irq(&parent->power.lock); if (device_wakeup_path(dev) && !parent->power.ignore_children) parent->power.wakeup_path = true; spin_unlock_irq(&parent->power.lock); } static void async_suspend_late(void *data, async_cookie_t cookie); /** * device_suspend_late - Execute a "late suspend" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. * * Runtime PM is disabled for @dev while this function is being executed. 
*/ static void device_suspend_late(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; TRACE_DEVICE(dev); TRACE_SUSPEND(0); dpm_wait_for_subordinate(dev, async); if (READ_ONCE(async_error)) goto Complete; if (pm_wakeup_pending()) { WRITE_ONCE(async_error, -EBUSY); goto Complete; } if (dev->power.direct_complete) goto Complete; /* * After this point, any runtime PM operations targeting the device * will fail until the corresponding pm_runtime_enable() call in * device_resume_early(). */ pm_runtime_disable(dev); if (dev->power.syscore) goto Skip; if (dev->pm_domain) { info = "late power domain "; callback = pm_late_early_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "late type "; callback = pm_late_early_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "late class "; callback = pm_late_early_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "late bus "; callback = pm_late_early_op(dev->bus->pm, state); } if (callback) goto Run; if (dev_pm_skip_suspend(dev)) goto Skip; if (dev->driver && dev->driver->pm) { info = "late driver "; callback = pm_late_early_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); if (error) { WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async late" : " late", error); pm_runtime_enable(dev); goto Complete; } dpm_propagate_wakeup_to_parent(dev); Skip: dev->power.is_late_suspended = true; Complete: TRACE_SUSPEND(error); complete_all(&dev->power.completion); if (error || READ_ONCE(async_error)) return; dpm_async_suspend_superior(dev, async_suspend_late); } static void async_suspend_late(void *data, async_cookie_t cookie) { struct device *dev = data; device_suspend_late(dev, pm_transition, true); put_device(dev); } /** * dpm_suspend_late - Execute "late suspend" callbacks for all devices. * @state: PM transition of the system being carried out. */ int dpm_suspend_late(pm_message_t state) { ktime_t starttime = ktime_get(); struct device *dev; int error; trace_suspend_resume(TPS("dpm_suspend_late"), state.event, true); pm_transition = state; async_error = 0; wake_up_all_idle_cpus(); mutex_lock(&dpm_list_mtx); /* * Start processing "async" leaf devices upfront so they don't need to * wait for the "sync" devices they don't depend on. */ list_for_each_entry_reverse(dev, &dpm_suspended_list, power.entry) { dpm_clear_async_state(dev); if (dpm_leaf_device(dev)) dpm_async_with_cleanup(dev, async_suspend_late); } while (!list_empty(&dpm_suspended_list)) { dev = to_device(dpm_suspended_list.prev); list_move(&dev->power.entry, &dpm_late_early_list); if (dpm_async_fn(dev, async_suspend_late)) continue; get_device(dev); mutex_unlock(&dpm_list_mtx); device_suspend_late(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); if (READ_ONCE(async_error)) { dpm_async_suspend_complete_all(&dpm_suspended_list); /* * Move all devices to the target list to resume them * properly. */ list_splice_init(&dpm_suspended_list, &dpm_late_early_list); break; } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); error = READ_ONCE(async_error); if (error) { dpm_save_failed_step(SUSPEND_SUSPEND_LATE); dpm_resume_early(resume_event(state)); } dpm_show_time(starttime, state, error, "late"); trace_suspend_resume(TPS("dpm_suspend_late"), state.event, false); return error; } /** * dpm_suspend_end - Execute "late" and "noirq" device suspend callbacks. 
* @state: PM transition of the system being carried out. */ int dpm_suspend_end(pm_message_t state) { ktime_t starttime = ktime_get(); int error; error = dpm_suspend_late(state); if (error) goto out; error = dpm_suspend_noirq(state); if (error) dpm_resume_early(resume_event(state)); out: dpm_show_time(starttime, state, error, "end"); return error; } EXPORT_SYMBOL_GPL(dpm_suspend_end); /** * legacy_suspend - Execute a legacy (bus or class) suspend callback for device. * @dev: Device to suspend. * @state: PM transition of the system being carried out. * @cb: Suspend callback to execute. * @info: string description of caller. */ static int legacy_suspend(struct device *dev, pm_message_t state, int (*cb)(struct device *dev, pm_message_t state), const char *info) { int error; ktime_t calltime; calltime = initcall_debug_start(dev, cb); trace_device_pm_callback_start(dev, info, state.event); error = cb(dev, state); trace_device_pm_callback_end(dev, error); suspend_report_result(dev, cb, error); initcall_debug_report(dev, calltime, cb, error); return error; } static void dpm_clear_superiors_direct_complete(struct device *dev) { struct device_link *link; int idx; if (dev->parent) { spin_lock_irq(&dev->parent->power.lock); dev->parent->power.direct_complete = false; spin_unlock_irq(&dev->parent->power.lock); } idx = device_links_read_lock(); dev_for_each_link_to_supplier(link, dev) { spin_lock_irq(&link->supplier->power.lock); link->supplier->power.direct_complete = false; spin_unlock_irq(&link->supplier->power.lock); } device_links_read_unlock(idx); } static void async_suspend(void *data, async_cookie_t cookie); /** * device_suspend - Execute "suspend" callbacks for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. */ static void device_suspend(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; DECLARE_DPM_WATCHDOG_ON_STACK(wd); TRACE_DEVICE(dev); TRACE_SUSPEND(0); dpm_wait_for_subordinate(dev, async); if (READ_ONCE(async_error)) { dev->power.direct_complete = false; goto Complete; } /* * Wait for possible runtime PM transitions of the device in progress * to complete and if there's a runtime resume request pending for it, * resume it before proceeding with invoking the system-wide suspend * callbacks for it. * * If the system-wide suspend callbacks below change the configuration * of the device, they must disable runtime PM for it or otherwise * ensure that its runtime-resume callbacks will not be confused by that * change in case they are invoked going forward. */ pm_runtime_barrier(dev); if (pm_wakeup_pending()) { dev->power.direct_complete = false; WRITE_ONCE(async_error, -EBUSY); goto Complete; } if (dev->power.syscore) goto Complete; /* Avoid direct_complete to let wakeup_path propagate. 
*/ if (device_may_wakeup(dev) || device_wakeup_path(dev)) dev->power.direct_complete = false; if (dev->power.direct_complete) { if (pm_runtime_status_suspended(dev)) { pm_runtime_disable(dev); if (pm_runtime_status_suspended(dev)) { pm_dev_dbg(dev, state, "direct-complete "); dev->power.is_suspended = true; goto Complete; } pm_runtime_enable(dev); } dev->power.direct_complete = false; } dev->power.may_skip_resume = true; dev->power.must_resume = !dev_pm_test_driver_flags(dev, DPM_FLAG_MAY_SKIP_RESUME); dpm_watchdog_set(&wd, dev); device_lock(dev); if (dev->pm_domain) { info = "power domain "; callback = pm_op(&dev->pm_domain->ops, state); goto Run; } if (dev->type && dev->type->pm) { info = "type "; callback = pm_op(dev->type->pm, state); goto Run; } if (dev->class && dev->class->pm) { info = "class "; callback = pm_op(dev->class->pm, state); goto Run; } if (dev->bus) { if (dev->bus->pm) { info = "bus "; callback = pm_op(dev->bus->pm, state); } else if (dev->bus->suspend) { pm_dev_dbg(dev, state, "legacy bus "); error = legacy_suspend(dev, state, dev->bus->suspend, "legacy bus "); goto End; } } Run: if (!callback && dev->driver && dev->driver->pm) { info = "driver "; callback = pm_op(dev->driver->pm, state); } error = dpm_run_callback(callback, dev, state, info); End: if (!error) { dev->power.is_suspended = true; if (device_may_wakeup(dev)) dev->power.wakeup_path = true; dpm_propagate_wakeup_to_parent(dev); dpm_clear_superiors_direct_complete(dev); } device_unlock(dev); dpm_watchdog_clear(&wd); Complete: if (error) { WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async" : "", error); } complete_all(&dev->power.completion); TRACE_SUSPEND(error); if (error || READ_ONCE(async_error)) return; dpm_async_suspend_superior(dev, async_suspend); } static void async_suspend(void *data, async_cookie_t cookie) { struct device *dev = data; device_suspend(dev, pm_transition, true); put_device(dev); } /** * dpm_suspend - Execute "suspend" callbacks for all non-sysdev devices. * @state: PM transition of the system being carried out. */ int dpm_suspend(pm_message_t state) { ktime_t starttime = ktime_get(); struct device *dev; int error; trace_suspend_resume(TPS("dpm_suspend"), state.event, true); might_sleep(); devfreq_suspend(); cpufreq_suspend(); pm_transition = state; async_error = 0; mutex_lock(&dpm_list_mtx); /* * Start processing "async" leaf devices upfront so they don't need to * wait for the "sync" devices they don't depend on. */ list_for_each_entry_reverse(dev, &dpm_prepared_list, power.entry) { dpm_clear_async_state(dev); if (dpm_leaf_device(dev)) dpm_async_with_cleanup(dev, async_suspend); } while (!list_empty(&dpm_prepared_list)) { dev = to_device(dpm_prepared_list.prev); list_move(&dev->power.entry, &dpm_suspended_list); if (dpm_async_fn(dev, async_suspend)) continue; get_device(dev); mutex_unlock(&dpm_list_mtx); device_suspend(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); if (READ_ONCE(async_error)) { dpm_async_suspend_complete_all(&dpm_prepared_list); /* * Move all devices to the target list to resume them * properly. 
*/ list_splice_init(&dpm_prepared_list, &dpm_suspended_list); break; } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); error = READ_ONCE(async_error); if (error) dpm_save_failed_step(SUSPEND_SUSPEND); dpm_show_time(starttime, state, error, NULL); trace_suspend_resume(TPS("dpm_suspend"), state.event, false); return error; } static bool device_prepare_smart_suspend(struct device *dev) { struct device_link *link; bool ret = true; int idx; /* * The "smart suspend" feature is enabled for devices whose drivers ask * for it and for devices without PM callbacks. * * However, if "smart suspend" is not enabled for the device's parent * or any of its suppliers that take runtime PM into account, it cannot * be enabled for the device either. */ if (!dev->power.no_pm_callbacks && !dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND)) return false; if (dev->parent && !dev_pm_smart_suspend(dev->parent) && !dev->parent->power.ignore_children && !pm_runtime_blocked(dev->parent)) return false; idx = device_links_read_lock(); dev_for_each_link_to_supplier(link, dev) { if (!device_link_test(link, DL_FLAG_PM_RUNTIME)) continue; if (!dev_pm_smart_suspend(link->supplier) && !pm_runtime_blocked(link->supplier)) { ret = false; break; } } device_links_read_unlock(idx); return ret; } /** * device_prepare - Prepare a device for system power transition. * @dev: Device to handle. * @state: PM transition of the system being carried out. * * Execute the ->prepare() callback(s) for given device. No new children of the * device may be registered after this function has returned. */ static int device_prepare(struct device *dev, pm_message_t state) { int (*callback)(struct device *) = NULL; bool smart_suspend; int ret = 0; /* * If a device's parent goes into runtime suspend at the wrong time, * it won't be possible to resume the device. To prevent this we * block runtime suspend here, during the prepare phase, and allow * it again during the complete phase. */ pm_runtime_get_noresume(dev); /* * If runtime PM is disabled for the device at this point and it has * never been enabled so far, it should not be enabled until this system * suspend-resume cycle is complete, so prepare to trigger a warning on * subsequent attempts to enable it. */ smart_suspend = !pm_runtime_block_if_disabled(dev); if (dev->power.syscore) return 0; device_lock(dev); dev->power.wakeup_path = false; dev->power.out_band_wakeup = false; if (dev->power.no_pm_callbacks) goto unlock; if (dev->pm_domain) callback = dev->pm_domain->ops.prepare; else if (dev->type && dev->type->pm) callback = dev->type->pm->prepare; else if (dev->class && dev->class->pm) callback = dev->class->pm->prepare; else if (dev->bus && dev->bus->pm) callback = dev->bus->pm->prepare; if (!callback && dev->driver && dev->driver->pm) callback = dev->driver->pm->prepare; if (callback) ret = callback(dev); unlock: device_unlock(dev); if (ret < 0) { suspend_report_result(dev, callback, ret); pm_runtime_put(dev); return ret; } /* Do not enable "smart suspend" for devices with disabled runtime PM. */ if (smart_suspend) smart_suspend = device_prepare_smart_suspend(dev); spin_lock_irq(&dev->power.lock); dev->power.smart_suspend = smart_suspend; /* * A positive return value from ->prepare() means "this device appears * to be runtime-suspended and its state is fine, so if it really is * runtime-suspended, you can leave it in that state provided that you * will do the same thing with all of its descendants". This only * applies to suspend transitions, however. 
*/ dev->power.direct_complete = state.event == PM_EVENT_SUSPEND && (ret > 0 || dev->power.no_pm_callbacks) && !dev_pm_test_driver_flags(dev, DPM_FLAG_NO_DIRECT_COMPLETE); spin_unlock_irq(&dev->power.lock); return 0; } /** * dpm_prepare - Prepare all non-sysdev devices for a system PM transition. * @state: PM transition of the system being carried out. * * Execute the ->prepare() callback(s) for all devices. */ int dpm_prepare(pm_message_t state) { int error = 0; trace_suspend_resume(TPS("dpm_prepare"), state.event, true); /* * Give a chance for the known devices to complete their probes, before * disable probing of devices. This sync point is important at least * at boot time + hibernation restore. */ wait_for_device_probe(); /* * It is unsafe if probing of devices will happen during suspend or * hibernation and system behavior will be unpredictable in this case. * So, let's prohibit device's probing here and defer their probes * instead. The normal behavior will be restored in dpm_complete(). */ device_block_probing(); /* Suspend thermal control. */ thermal_pm_prepare(); mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_list) && !error) { struct device *dev = to_device(dpm_list.next); get_device(dev); mutex_unlock(&dpm_list_mtx); trace_device_pm_callback_start(dev, "", state.event); error = device_prepare(dev, state); trace_device_pm_callback_end(dev, error); mutex_lock(&dpm_list_mtx); if (!error) { dev->power.is_prepared = true; if (!list_empty(&dev->power.entry)) list_move_tail(&dev->power.entry, &dpm_prepared_list); } else if (error == -EAGAIN) { error = 0; } else { dev_info(dev, "not prepared for power transition: code %d\n", error); } mutex_unlock(&dpm_list_mtx); put_device(dev); mutex_lock(&dpm_list_mtx); } mutex_unlock(&dpm_list_mtx); trace_suspend_resume(TPS("dpm_prepare"), state.event, false); return error; } /** * dpm_suspend_start - Prepare devices for PM transition and suspend them. * @state: PM transition of the system being carried out. * * Prepare all non-sysdev devices for system PM transition and execute "suspend" * callbacks for them. */ int dpm_suspend_start(pm_message_t state) { ktime_t starttime = ktime_get(); int error; error = dpm_prepare(state); if (error) dpm_save_failed_step(SUSPEND_PREPARE); else { pm_restrict_gfp_mask(); error = dpm_suspend(state); } dpm_show_time(starttime, state, error, "start"); return error; } EXPORT_SYMBOL_GPL(dpm_suspend_start); void __suspend_report_result(const char *function, struct device *dev, void *fn, int ret) { if (ret) dev_err(dev, "%s(): %ps returns %d\n", function, fn, ret); } EXPORT_SYMBOL_GPL(__suspend_report_result); /** * device_pm_wait_for_dev - Wait for suspend/resume of a device to complete. * @subordinate: Device that needs to wait for @dev. * @dev: Device to wait for. */ int device_pm_wait_for_dev(struct device *subordinate, struct device *dev) { dpm_wait(dev, subordinate->power.async_suspend); return async_error; } EXPORT_SYMBOL_GPL(device_pm_wait_for_dev); /** * dpm_for_each_dev - device iterator. * @data: data for the callback. * @fn: function to be called for each device. * * Iterate over devices in dpm_list, and call @fn for each device, * passing it @data. 
*/ void dpm_for_each_dev(void *data, void (*fn)(struct device *, void *)) { struct device *dev; if (!fn) return; device_pm_lock(); list_for_each_entry(dev, &dpm_list, power.entry) fn(dev, data); device_pm_unlock(); } EXPORT_SYMBOL_GPL(dpm_for_each_dev); static bool pm_ops_is_empty(const struct dev_pm_ops *ops) { if (!ops) return true; return !ops->prepare && !ops->suspend && !ops->suspend_late && !ops->suspend_noirq && !ops->resume_noirq && !ops->resume_early && !ops->resume && !ops->complete; } void device_pm_check_callbacks(struct device *dev) { unsigned long flags; spin_lock_irqsave(&dev->power.lock, flags); dev->power.no_pm_callbacks = (!dev->bus || (pm_ops_is_empty(dev->bus->pm) && !dev->bus->suspend && !dev->bus->resume)) && (!dev->class || pm_ops_is_empty(dev->class->pm)) && (!dev->type || pm_ops_is_empty(dev->type->pm)) && (!dev->pm_domain || pm_ops_is_empty(&dev->pm_domain->ops)) && (!dev->driver || (pm_ops_is_empty(dev->driver->pm) && !dev->driver->suspend && !dev->driver->resume)); spin_unlock_irqrestore(&dev->power.lock, flags); } bool dev_pm_skip_suspend(struct device *dev) { return dev_pm_smart_suspend(dev) && pm_runtime_status_suspended(dev); } |
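/*
 * Illustrative sketch, not part of the file above: how a driver might opt in
 * to the "smart suspend" and "may skip resume" optimizations that
 * dev_pm_skip_suspend() and dev_pm_skip_resume() implement. The foo_* names
 * and the platform-driver context are hypothetical; dev_pm_set_driver_flags()
 * and the DPM_FLAG_* constants are the driver-facing interface used here.
 */
static int foo_probe(struct platform_device *pdev)
{
	/*
	 * Let the PM core skip the system-wide suspend callbacks when the
	 * device is already runtime-suspended, and allow it to be left in
	 * suspend during resume if nothing depends on it being active.
	 */
	dev_pm_set_driver_flags(&pdev->dev,
				DPM_FLAG_SMART_SUSPEND |
				DPM_FLAG_MAY_SKIP_RESUME);

	return 0;
}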
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * pm_wakeup.h - Power management wakeup interface
 *
 * Copyright (C) 2008 Alan Stern
 * Copyright (C) 2010 Rafael J. Wysocki, Novell Inc.
 */

#ifndef _LINUX_PM_WAKEUP_H
#define _LINUX_PM_WAKEUP_H

#ifndef _DEVICE_H_
# error "Please do not include this file directly."
#endif

#include <linux/types.h>

struct wake_irq;

/**
 * struct wakeup_source - Representation of wakeup sources
 *
 * @name: Name of the wakeup source
 * @id: Wakeup source id
 * @entry: Wakeup source list entry
 * @lock: Wakeup source lock
 * @wakeirq: Optional device specific wakeirq
 * @timer: Wakeup timer list
 * @timer_expires: Wakeup timer expiration
 * @total_time: Total time this wakeup source has been active.
 * @max_time: Maximum time this wakeup source has been continuously active.
 * @last_time: Monotonic clock when the wakeup source's was touched last time.
 * @prevent_sleep_time: Total time this source has been preventing autosleep.
 * @event_count: Number of signaled wakeup events.
 * @active_count: Number of times the wakeup source was activated.
 * @relax_count: Number of times the wakeup source was deactivated.
 * @expire_count: Number of times the wakeup source's timeout has expired.
 * @wakeup_count: Number of times the wakeup source might abort suspend.
 * @dev: Struct device for sysfs statistics about the wakeup source.
 * @active: Status of the wakeup source.
 * @autosleep_enabled: Autosleep is active, so update @prevent_sleep_time.
 */
struct wakeup_source {
	const char		*name;
	int			id;
	struct list_head	entry;
	spinlock_t		lock;
	struct wake_irq		*wakeirq;
	struct timer_list	timer;
	unsigned long		timer_expires;
	ktime_t			total_time;
	ktime_t			max_time;
	ktime_t			last_time;
	ktime_t			start_prevent_time;
	ktime_t			prevent_sleep_time;
	unsigned long		event_count;
	unsigned long		active_count;
	unsigned long		relax_count;
	unsigned long		expire_count;
	unsigned long		wakeup_count;
	struct device		*dev;
	bool			active:1;
	bool			autosleep_enabled:1;
};

#define for_each_wakeup_source(ws) \
	for ((ws) = wakeup_sources_walk_start();	\
	     (ws);					\
	     (ws) = wakeup_sources_walk_next((ws)))

#ifdef CONFIG_PM_SLEEP

/*
 * Changes to device_may_wakeup take effect on the next pm state change.
*/ static inline bool device_can_wakeup(struct device *dev) { return dev->power.can_wakeup; } static inline bool device_may_wakeup(struct device *dev) { return dev->power.can_wakeup && !!dev->power.wakeup; } static inline bool device_wakeup_path(struct device *dev) { return dev->power.wakeup_path; } static inline void device_set_wakeup_path(struct device *dev) { dev->power.wakeup_path = true; } static inline void device_set_out_band_wakeup(struct device *dev) { dev->power.out_band_wakeup = true; } static inline bool device_out_band_wakeup(struct device *dev) { return dev->power.out_band_wakeup; } /* drivers/base/power/wakeup.c */ extern struct wakeup_source *wakeup_source_register(struct device *dev, const char *name); extern void wakeup_source_unregister(struct wakeup_source *ws); extern int wakeup_sources_read_lock(void); extern void wakeup_sources_read_unlock(int idx); extern struct wakeup_source *wakeup_sources_walk_start(void); extern struct wakeup_source *wakeup_sources_walk_next(struct wakeup_source *ws); extern int device_wakeup_enable(struct device *dev); extern void device_wakeup_disable(struct device *dev); extern void device_set_wakeup_capable(struct device *dev, bool capable); extern int device_set_wakeup_enable(struct device *dev, bool enable); extern void __pm_stay_awake(struct wakeup_source *ws); extern void pm_stay_awake(struct device *dev); extern void __pm_relax(struct wakeup_source *ws); extern void pm_relax(struct device *dev); extern void pm_wakeup_ws_event(struct wakeup_source *ws, unsigned int msec, bool hard); extern void pm_wakeup_dev_event(struct device *dev, unsigned int msec, bool hard); #else /* !CONFIG_PM_SLEEP */ static inline void device_set_wakeup_capable(struct device *dev, bool capable) { dev->power.can_wakeup = capable; } static inline bool device_can_wakeup(struct device *dev) { return dev->power.can_wakeup; } static inline struct wakeup_source *wakeup_source_register(struct device *dev, const char *name) { return NULL; } static inline void wakeup_source_unregister(struct wakeup_source *ws) {} static inline int device_wakeup_enable(struct device *dev) { dev->power.should_wakeup = true; return 0; } static inline void device_wakeup_disable(struct device *dev) { dev->power.should_wakeup = false; } static inline int device_set_wakeup_enable(struct device *dev, bool enable) { dev->power.should_wakeup = enable; return 0; } static inline bool device_may_wakeup(struct device *dev) { return dev->power.can_wakeup && dev->power.should_wakeup; } static inline bool device_wakeup_path(struct device *dev) { return false; } static inline void device_set_wakeup_path(struct device *dev) {} static inline void device_set_out_band_wakeup(struct device *dev) {} static inline bool device_out_band_wakeup(struct device *dev) { return false; } static inline void __pm_stay_awake(struct wakeup_source *ws) {} static inline void pm_stay_awake(struct device *dev) {} static inline void __pm_relax(struct wakeup_source *ws) {} static inline void pm_relax(struct device *dev) {} static inline void pm_wakeup_ws_event(struct wakeup_source *ws, unsigned int msec, bool hard) {} static inline void pm_wakeup_dev_event(struct device *dev, unsigned int msec, bool hard) {} #endif /* !CONFIG_PM_SLEEP */ static inline bool device_awake_path(struct device *dev) { return device_wakeup_path(dev); } static inline void device_set_awake_path(struct device *dev) { device_set_wakeup_path(dev); } static inline void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec) { pm_wakeup_ws_event(ws, msec, 
false); } static inline void pm_wakeup_event(struct device *dev, unsigned int msec) { pm_wakeup_dev_event(dev, msec, false); } static inline void pm_wakeup_hard_event(struct device *dev) { pm_wakeup_dev_event(dev, 0, true); } /** * device_init_wakeup - Device wakeup initialization. * @dev: Device to handle. * @enable: Whether or not to enable @dev as a wakeup device. * * By default, most devices should leave wakeup disabled. The exceptions are * devices that everyone expects to be wakeup sources: keyboards, power buttons, * possibly network interfaces, etc. Also, devices that don't generate their * own wakeup requests but merely forward requests from one bus to another * (like PCI bridges) should have wakeup enabled by default. */ static inline int device_init_wakeup(struct device *dev, bool enable) { if (enable) { device_set_wakeup_capable(dev, true); return device_wakeup_enable(dev); } device_wakeup_disable(dev); device_set_wakeup_capable(dev, false); return 0; } static void device_disable_wakeup(void *dev) { device_init_wakeup(dev, false); } /** * devm_device_init_wakeup - Resource managed device wakeup initialization. * @dev: Device to handle. * * This function is the devm managed version of device_init_wakeup(dev, true). */ static inline int devm_device_init_wakeup(struct device *dev) { device_init_wakeup(dev, true); return devm_add_action_or_reset(dev, device_disable_wakeup, dev); } #endif /* _LINUX_PM_WAKEUP_H */ |
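/*
 * Illustrative sketch, not part of the header above: typical driver usage of
 * the wakeup interface. The foo_* names are hypothetical and the interrupt
 * handler assumes <linux/interrupt.h>; devm_device_init_wakeup(),
 * device_may_wakeup() and pm_wakeup_event() are the calls declared above.
 */
static irqreturn_t foo_wake_irq_handler(int irq, void *data)
{
	struct device *dev = data;

	/* Report a wakeup event so that an in-progress suspend is aborted. */
	if (device_may_wakeup(dev))
		pm_wakeup_event(dev, 0);

	return IRQ_HANDLED;
}

static int foo_wakeup_probe(struct device *dev)
{
	/* Mark the device wakeup-capable and keep wakeup enabled until unbind. */
	return devm_device_init_wakeup(dev);
}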
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_RT_H
#define _LINUX_SCHED_RT_H

#include <linux/sched.h>

struct task_struct;

static inline bool rt_prio(int prio)
{
	return unlikely(prio < MAX_RT_PRIO && prio >= MAX_DL_PRIO);
}

static inline bool rt_or_dl_prio(int prio)
{
	return unlikely(prio < MAX_RT_PRIO);
}

/*
 * Returns true if a task has a priority that belongs to RT class. PI-boosted
 * tasks will return true. Use rt_policy() to ignore PI-boosted tasks.
 */
static inline bool rt_task(struct task_struct *p)
{
	return rt_prio(p->prio);
}

/*
 * Returns true if a task has a priority that belongs to RT or DL classes.
 * PI-boosted tasks will return true. Use rt_or_dl_task_policy() to ignore
 * PI-boosted tasks.
 */
static inline bool rt_or_dl_task(struct task_struct *p)
{
	return rt_or_dl_prio(p->prio);
}

/*
 * Returns true if a task has a policy that belongs to RT or DL classes.
 * PI-boosted tasks will return false.
 */
static inline bool rt_or_dl_task_policy(struct task_struct *tsk)
{
	int policy = tsk->policy;

	if (policy == SCHED_FIFO || policy == SCHED_RR)
		return true;
	if (policy == SCHED_DEADLINE)
		return true;
	return false;
}

#ifdef CONFIG_RT_MUTEXES
extern void rt_mutex_pre_schedule(void);
extern void rt_mutex_schedule(void);
extern void rt_mutex_post_schedule(void);

/*
 * Must hold either p->pi_lock or task_rq(p)->lock.
 */
static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p)
{
	return p->pi_top_task;
}
extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task);
extern void rt_mutex_adjust_pi(struct task_struct *p);
#else
static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
{
	return NULL;
}
# define rt_mutex_adjust_pi(p)		do { } while (0)
#endif

extern void normalize_rt_tasks(void);

/*
 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
 * Timeslices get refilled after they expire.
 */
#define RR_TIMESLICE		(100 * HZ / 1000)

#endif /* _LINUX_SCHED_RT_H */
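/*
 * Illustrative sketch, not part of the header above: the usual way these
 * helpers are combined. foo_throttle() is hypothetical; rt_task() also
 * matches PI-boosted tasks, while rt_or_dl_task_policy() only looks at the
 * task's own scheduling policy.
 */
static void foo_throttle(struct task_struct *p)
{
	if (rt_task(p))
		return;		/* priority is RT, possibly via PI boosting */

	if (rt_or_dl_task_policy(p))
		return;		/* SCHED_FIFO, SCHED_RR or SCHED_DEADLINE */

	/* ... throttle only ordinary fair-class tasks here ... */
}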
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra
 *
 * Provides a framework for enqueueing and running callbacks from hardirq
 * context. The enqueueing is NMI-safe.
 */

#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/irq_work.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/irqflags.h>
#include <linux/sched.h>
#include <linux/tick.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/smpboot.h>
#include <asm/processor.h>
#include <linux/kasan.h>

#include <trace/events/ipi.h>

static DEFINE_PER_CPU(struct llist_head, raised_list);
static DEFINE_PER_CPU(struct llist_head, lazy_list);
static DEFINE_PER_CPU(struct task_struct *, irq_workd);

static void wake_irq_workd(void)
{
	struct task_struct *tsk = __this_cpu_read(irq_workd);

	if (!llist_empty(this_cpu_ptr(&lazy_list)) && tsk)
		wake_up_process(tsk);
}

#ifdef CONFIG_SMP
static void irq_work_wake(struct irq_work *entry)
{
	wake_irq_workd();
}

static DEFINE_PER_CPU(struct irq_work, irq_work_wakeup) =
	IRQ_WORK_INIT_HARD(irq_work_wake);
#endif

static int irq_workd_should_run(unsigned int cpu)
{
	return !llist_empty(this_cpu_ptr(&lazy_list));
}

/*
 * Claim the entry so that no one else will poke at it.
 */
static bool irq_work_claim(struct irq_work *work)
{
	int oflags;

	oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->node.a_flags);
	/*
	 * If the work is already pending, no need to raise the IPI.
	 * The pairing smp_mb() in irq_work_single() makes sure
	 * everything we did before is visible.
	 */
	if (oflags & IRQ_WORK_PENDING)
		return false;

	return true;
}

void __weak arch_irq_work_raise(void)
{
	/*
	 * Lame architectures will get the timer tick callback
	 */
}

static __always_inline void irq_work_raise(struct irq_work *work)
{
	if (trace_ipi_send_cpu_enabled() && arch_irq_work_has_interrupt())
		trace_ipi_send_cpu(smp_processor_id(), _RET_IP_, work->func);

	arch_irq_work_raise();
}

/* Enqueue on current CPU, work must already be claimed and preempt disabled */
static void __irq_work_queue_local(struct irq_work *work)
{
	struct llist_head *list;
	bool rt_lazy_work = false;
	bool lazy_work = false;
	int work_flags;

	work_flags = atomic_read(&work->node.a_flags);
	if (work_flags & IRQ_WORK_LAZY)
		lazy_work = true;
	else if (IS_ENABLED(CONFIG_PREEMPT_RT) &&
		 !(work_flags & IRQ_WORK_HARD_IRQ))
		rt_lazy_work = true;

	if (lazy_work || rt_lazy_work)
		list = this_cpu_ptr(&lazy_list);
	else
		list = this_cpu_ptr(&raised_list);

	if (!llist_add(&work->node.llist, list))
		return;

	/* If the work is "lazy", handle it from next tick if any */
	if (!lazy_work || tick_nohz_tick_stopped())
		irq_work_raise(work);
}

/* Enqueue the irq work @work on the current CPU */
bool irq_work_queue(struct irq_work *work)
{
	/* Only queue if not already pending */
	if (!irq_work_claim(work))
		return false;

	/* Queue the entry and raise the IPI if needed. */
	preempt_disable();
	__irq_work_queue_local(work);
	preempt_enable();

	return true;
}
EXPORT_SYMBOL_GPL(irq_work_queue);

/*
 * Enqueue the irq_work @work on @cpu unless it's already pending
 * somewhere.
 *
 * Can be re-enqueued while the callback is still in progress.
 */
bool irq_work_queue_on(struct irq_work *work, int cpu)
{
#ifndef CONFIG_SMP
	return irq_work_queue(work);

#else /* CONFIG_SMP: */
	/* All work should have been flushed before going offline */
	WARN_ON_ONCE(cpu_is_offline(cpu));

	/* Only queue if not already pending */
	if (!irq_work_claim(work))
		return false;

	kasan_record_aux_stack(work);

	preempt_disable();
	if (cpu != smp_processor_id()) {
		/* Arch remote IPI send/receive backend aren't NMI safe */
		WARN_ON_ONCE(in_nmi());

		/*
		 * On PREEMPT_RT the items which are not marked as
		 * IRQ_WORK_HARD_IRQ are added to the lazy list and a HARD work
		 * item is used on the remote CPU to wake the thread.
		 */
		if (IS_ENABLED(CONFIG_PREEMPT_RT) &&
		    !(atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ)) {

			if (!llist_add(&work->node.llist, &per_cpu(lazy_list, cpu)))
				goto out;

			work = &per_cpu(irq_work_wakeup, cpu);
			if (!irq_work_claim(work))
				goto out;
		}

		__smp_call_single_queue(cpu, &work->node.llist);
	} else {
		__irq_work_queue_local(work);
	}
out:
	preempt_enable();

	return true;
#endif /* CONFIG_SMP */
}

bool irq_work_needs_cpu(void)
{
	struct llist_head *raised, *lazy;

	raised = this_cpu_ptr(&raised_list);
	lazy = this_cpu_ptr(&lazy_list);

	if (llist_empty(raised) || arch_irq_work_has_interrupt())
		if (llist_empty(lazy))
			return false;

	/* All work should have been flushed before going offline */
	WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));

	return true;
}

void irq_work_single(void *arg)
{
	struct irq_work *work = arg;
	int flags;

	/*
	 * Clear the PENDING bit, after this point the @work can be re-used.
	 * The PENDING bit acts as a lock, and we own it, so we can clear it
	 * without atomic ops.
	 */
	flags = atomic_read(&work->node.a_flags);
	flags &= ~IRQ_WORK_PENDING;
	atomic_set(&work->node.a_flags, flags);

	/*
	 * See irq_work_claim().
*/ smp_mb(); lockdep_irq_work_enter(flags); work->func(work); lockdep_irq_work_exit(flags); /* * Clear the BUSY bit, if set, and return to the free state if no-one * else claimed it meanwhile. */ (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY); if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || !arch_irq_work_has_interrupt()) rcuwait_wake_up(&work->irqwait); } static void irq_work_run_list(struct llist_head *list) { struct irq_work *work, *tmp; struct llist_node *llnode; /* * On PREEMPT_RT IRQ-work which is not marked as HARD will be processed * in a per-CPU thread in preemptible context. Only the items which are * marked as IRQ_WORK_HARD_IRQ will be processed in hardirq context. */ BUG_ON(!irqs_disabled() && !IS_ENABLED(CONFIG_PREEMPT_RT)); if (llist_empty(list)) return; llnode = llist_del_all(list); llist_for_each_entry_safe(work, tmp, llnode, node.llist) irq_work_single(work); } /* * hotplug calls this through: * hotplug_cfd() -> flush_smp_call_function_queue() */ void irq_work_run(void) { irq_work_run_list(this_cpu_ptr(&raised_list)); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) irq_work_run_list(this_cpu_ptr(&lazy_list)); else wake_irq_workd(); } EXPORT_SYMBOL_GPL(irq_work_run); void irq_work_tick(void) { struct llist_head *raised = this_cpu_ptr(&raised_list); if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) irq_work_run_list(raised); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) irq_work_run_list(this_cpu_ptr(&lazy_list)); else wake_irq_workd(); } /* * Synchronize against the irq_work @entry, ensures the entry is not * currently in use. */ void irq_work_sync(struct irq_work *work) { lockdep_assert_irqs_enabled(); might_sleep(); if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || !arch_irq_work_has_interrupt()) { rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work), TASK_UNINTERRUPTIBLE); return; } while (irq_work_is_busy(work)) cpu_relax(); } EXPORT_SYMBOL_GPL(irq_work_sync); static void run_irq_workd(unsigned int cpu) { irq_work_run_list(this_cpu_ptr(&lazy_list)); } static void irq_workd_setup(unsigned int cpu) { sched_set_fifo_low(current); } static struct smp_hotplug_thread irqwork_threads = { .store = &irq_workd, .setup = irq_workd_setup, .thread_should_run = irq_workd_should_run, .thread_fn = run_irq_workd, .thread_comm = "irq_work/%u", }; static __init int irq_work_init_threads(void) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) BUG_ON(smpboot_register_percpu_thread(&irqwork_threads)); return 0; } early_initcall(irq_work_init_threads); |
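/*
 * Illustrative sketch, not part of the file above: declaring and using an
 * irq_work item. The foo_* names are hypothetical; IRQ_WORK_INIT(),
 * irq_work_queue() and irq_work_sync() come from <linux/irq_work.h> and are
 * implemented above.
 */
static void foo_irq_work_fn(struct irq_work *work)
{
	/* Runs in hardirq context (or the per-CPU irq_work thread on PREEMPT_RT). */
	pr_info("deferred irq_work ran\n");
}

static struct irq_work foo_work = IRQ_WORK_INIT(foo_irq_work_fn);

static void foo_raise_from_nmi(void)
{
	/* NMI-safe: claims the item and raises a self-IPI only if needed. */
	irq_work_queue(&foo_work);
}

static void foo_teardown(void)
{
	/* Make sure a possibly running callback has finished before unload. */
	irq_work_sync(&foo_work);
}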
| 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Landlock LSM - Ruleset management * * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2018-2020 ANSSI */ #ifndef _SECURITY_LANDLOCK_RULESET_H #define _SECURITY_LANDLOCK_RULESET_H #include <linux/cleanup.h> #include <linux/err.h> #include <linux/mutex.h> #include <linux/rbtree.h> #include <linux/refcount.h> #include <linux/workqueue.h> #include "access.h" #include "limits.h" #include "object.h" struct landlock_hierarchy; /** * struct landlock_layer - Access rights for a given layer */ struct landlock_layer { /** * @level: Position of this layer in the layer stack. Starts from 1. */ u16 level; /** * @access: Bitfield of allowed actions on the kernel object. They are * relative to the object type (e.g. %LANDLOCK_ACTION_FS_READ). */ access_mask_t access; }; /** * union landlock_key - Key of a ruleset's red-black tree */ union landlock_key { /** * @object: Pointer to identify a kernel object (e.g. an inode). */ struct landlock_object *object; /** * @data: Raw data to identify an arbitrary 32-bit value * (e.g. a TCP port). */ uintptr_t data; }; /** * enum landlock_key_type - Type of &union landlock_key */ enum landlock_key_type { /** * @LANDLOCK_KEY_INODE: Type of &landlock_ruleset.root_inode's node * keys. */ LANDLOCK_KEY_INODE = 1, /** * @LANDLOCK_KEY_NET_PORT: Type of &landlock_ruleset.root_net_port's * node keys. */ LANDLOCK_KEY_NET_PORT, }; /** * struct landlock_id - Unique rule identifier for a ruleset */ struct landlock_id { /** * @key: Identifies either a kernel object (e.g. an inode) or * a raw value (e.g. a TCP port). */ union landlock_key key; /** * @type: Type of a landlock_ruleset's root tree. */ const enum landlock_key_type type; }; /** * struct landlock_rule - Access rights tied to an object */ struct landlock_rule { /** * @node: Node in the ruleset's red-black tree. */ struct rb_node node; /** * @key: A union to identify either a kernel object (e.g. an inode) or * a raw data value (e.g. a network socket port). This is used as a key * for this ruleset element. The pointer is set once and never * modified. It always points to an allocated object because each rule * increments the refcount of its object. */ union landlock_key key; /** * @num_layers: Number of entries in @layers. 
*/ u32 num_layers; /** * @layers: Stack of layers, from the latest to the newest, implemented * as a flexible array member (FAM). */ struct landlock_layer layers[] __counted_by(num_layers); }; /** * struct landlock_ruleset - Landlock ruleset * * This data structure must contain unique entries, be updatable, and quick to * match an object. */ struct landlock_ruleset { /** * @root_inode: Root of a red-black tree containing &struct * landlock_rule nodes with inode object. Once a ruleset is tied to a * process (i.e. as a domain), this tree is immutable until @usage * reaches zero. */ struct rb_root root_inode; #if IS_ENABLED(CONFIG_INET) /** * @root_net_port: Root of a red-black tree containing &struct * landlock_rule nodes with network port. Once a ruleset is tied to a * process (i.e. as a domain), this tree is immutable until @usage * reaches zero. */ struct rb_root root_net_port; #endif /* IS_ENABLED(CONFIG_INET) */ /** * @hierarchy: Enables hierarchy identification even when a parent * domain vanishes. This is needed for the ptrace protection. */ struct landlock_hierarchy *hierarchy; union { /** * @work_free: Enables to free a ruleset within a lockless * section. This is only used by * landlock_put_ruleset_deferred() when @usage reaches zero. * The fields @lock, @usage, @num_rules, @num_layers and * @access_masks are then unused. */ struct work_struct work_free; struct { /** * @lock: Protects against concurrent modifications of * @root, if @usage is greater than zero. */ struct mutex lock; /** * @usage: Number of processes (i.e. domains) or file * descriptors referencing this ruleset. */ refcount_t usage; /** * @num_rules: Number of non-overlapping (i.e. not for * the same object) rules in this ruleset. */ u32 num_rules; /** * @num_layers: Number of layers that are used in this * ruleset. This enables to check that all the layers * allow an access request. A value of 0 identifies a * non-merged ruleset (i.e. not a domain). */ u32 num_layers; /** * @access_masks: Contains the subset of filesystem and * network actions that are restricted by a ruleset. * A domain saves all layers of merged rulesets in a * stack (FAM), starting from the first layer to the * last one. These layers are used when merging * rulesets, for user space backward compatibility * (i.e. future-proof), and to properly handle merged * rulesets without overlapping access rights. These * layers are set once and never changed for the * lifetime of the ruleset. 
*/ struct access_masks access_masks[]; }; }; }; struct landlock_ruleset * landlock_create_ruleset(const access_mask_t access_mask_fs, const access_mask_t access_mask_net, const access_mask_t scope_mask); void landlock_put_ruleset(struct landlock_ruleset *const ruleset); void landlock_put_ruleset_deferred(struct landlock_ruleset *const ruleset); DEFINE_FREE(landlock_put_ruleset, struct landlock_ruleset *, if (!IS_ERR_OR_NULL(_T)) landlock_put_ruleset(_T)) int landlock_insert_rule(struct landlock_ruleset *const ruleset, const struct landlock_id id, const access_mask_t access); struct landlock_ruleset * landlock_merge_ruleset(struct landlock_ruleset *const parent, struct landlock_ruleset *const ruleset); const struct landlock_rule * landlock_find_rule(const struct landlock_ruleset *const ruleset, const struct landlock_id id); static inline void landlock_get_ruleset(struct landlock_ruleset *const ruleset) { if (ruleset) refcount_inc(&ruleset->usage); } /** * landlock_union_access_masks - Return all access rights handled in the * domain * * @domain: Landlock ruleset (used as a domain) * * Return: An access_masks result of the OR of all the domain's access masks. */ static inline struct access_masks landlock_union_access_masks(const struct landlock_ruleset *const domain) { union access_masks_all matches = {}; size_t layer_level; for (layer_level = 0; layer_level < domain->num_layers; layer_level++) { union access_masks_all layer = { .masks = domain->access_masks[layer_level], }; matches.all |= layer.all; } return matches.masks; } static inline void landlock_add_fs_access_mask(struct landlock_ruleset *const ruleset, const access_mask_t fs_access_mask, const u16 layer_level) { access_mask_t fs_mask = fs_access_mask & LANDLOCK_MASK_ACCESS_FS; /* Should already be checked in sys_landlock_create_ruleset(). */ WARN_ON_ONCE(fs_access_mask != fs_mask); ruleset->access_masks[layer_level].fs |= fs_mask; } static inline void landlock_add_net_access_mask(struct landlock_ruleset *const ruleset, const access_mask_t net_access_mask, const u16 layer_level) { access_mask_t net_mask = net_access_mask & LANDLOCK_MASK_ACCESS_NET; /* Should already be checked in sys_landlock_create_ruleset(). */ WARN_ON_ONCE(net_access_mask != net_mask); ruleset->access_masks[layer_level].net |= net_mask; } static inline void landlock_add_scope_mask(struct landlock_ruleset *const ruleset, const access_mask_t scope_mask, const u16 layer_level) { access_mask_t mask = scope_mask & LANDLOCK_MASK_SCOPE; /* Should already be checked in sys_landlock_create_ruleset(). */ WARN_ON_ONCE(scope_mask != mask); ruleset->access_masks[layer_level].scope |= mask; } static inline access_mask_t landlock_get_fs_access_mask(const struct landlock_ruleset *const ruleset, const u16 layer_level) { /* Handles all initially denied by default access rights. 
*/ return ruleset->access_masks[layer_level].fs | _LANDLOCK_ACCESS_FS_INITIALLY_DENIED; } static inline access_mask_t landlock_get_net_access_mask(const struct landlock_ruleset *const ruleset, const u16 layer_level) { return ruleset->access_masks[layer_level].net; } static inline access_mask_t landlock_get_scope_mask(const struct landlock_ruleset *const ruleset, const u16 layer_level) { return ruleset->access_masks[layer_level].scope; } bool landlock_unmask_layers(const struct landlock_rule *const rule, struct layer_access_masks *masks); access_mask_t landlock_init_layer_masks(const struct landlock_ruleset *const domain, const access_mask_t access_request, struct layer_access_masks *masks, const enum landlock_key_type key_type); #endif /* _SECURITY_LANDLOCK_RULESET_H */ |
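The layer and access-mask machinery above is populated through the userspace Landlock syscalls; each successful landlock_restrict_self() stacks one more layer onto the calling domain. As a hedged illustration only (assuming the uapi <linux/landlock.h> definitions and a filesystem-only ruleset; this is not kernel code and the function name is made up):

#include <fcntl.h>
#include <linux/landlock.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int allow_writes_only_under_tmp(void)
{
	struct landlock_ruleset_attr ruleset_attr = {
		/* Access rights handled by this layer; anything else stays allowed. */
		.handled_access_fs = LANDLOCK_ACCESS_FS_WRITE_FILE |
				     LANDLOCK_ACCESS_FS_MAKE_REG,
	};
	struct landlock_path_beneath_attr path_beneath = {
		.allowed_access = LANDLOCK_ACCESS_FS_WRITE_FILE,
	};
	int ruleset_fd;

	ruleset_fd = syscall(__NR_landlock_create_ruleset, &ruleset_attr,
			     sizeof(ruleset_attr), 0);
	if (ruleset_fd < 0)
		return -1;

	path_beneath.parent_fd = open("/tmp", O_PATH | O_CLOEXEC);
	if (path_beneath.parent_fd < 0)
		goto err;
	if (syscall(__NR_landlock_add_rule, ruleset_fd,
		    LANDLOCK_RULE_PATH_BENEATH, &path_beneath, 0))
		goto err;
	close(path_beneath.parent_fd);

	/* Required before enforcing a ruleset without CAP_SYS_ADMIN. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		goto err;
	/* Adds one more layer to the calling task's Landlock domain. */
	if (syscall(__NR_landlock_restrict_self, ruleset_fd, 0))
		goto err;

	close(ruleset_fd);
	return 0;
err:
	close(ruleset_fd);
	return -1;
}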
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
 * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * cache.c
 */

/*
 * Blocks in Squashfs are compressed. To avoid repeatedly decompressing
 * recently accessed data Squashfs uses two small metadata and fragment caches.
 *
 * This file implements a generic cache implementation used for both caches,
 * plus functions layered on top of the generic cache implementation to
 * access the metadata and fragment caches.
 *
 * To avoid out of memory and fragmentation issues with vmalloc the cache
 * uses sequences of kmalloced PAGE_SIZE buffers.
 *
 * It should be noted that the cache is not used for file datablocks; these
 * are decompressed and cached in the page-cache in the normal way. The
 * cache is only used to temporarily cache fragment and metadata blocks
 * which have been read as a result of a metadata (i.e. inode or
 * directory) or fragment access. Because metadata and fragments are packed
 * together into blocks (to gain greater compression) the read of a particular
 * piece of metadata or fragment will retrieve other metadata/fragments which
 * have been packed with it; because of locality-of-reference these may be read
 * in the near future. Temporarily caching them ensures they are available for
 * near future access without requiring an additional read and decompress.
*/ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/wait.h> #include <linux/pagemap.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs.h" #include "page_actor.h" /* * Look-up block in cache, and increment usage count. If not in cache, read * and decompress it from disk. */ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb, struct squashfs_cache *cache, u64 block, int length) { int i, n; struct squashfs_cache_entry *entry; spin_lock(&cache->lock); while (1) { for (i = cache->curr_blk, n = 0; n < cache->entries; n++) { if (cache->entry[i].block == block) { cache->curr_blk = i; break; } i = (i + 1) % cache->entries; } if (n == cache->entries) { /* * Block not in cache, if all cache entries are used * go to sleep waiting for one to become available. */ if (cache->unused == 0) { cache->num_waiters++; spin_unlock(&cache->lock); wait_event(cache->wait_queue, cache->unused); spin_lock(&cache->lock); cache->num_waiters--; continue; } /* * At least one unused cache entry. A simple * round-robin strategy is used to choose the entry to * be evicted from the cache. */ i = cache->next_blk; for (n = 0; n < cache->entries; n++) { if (cache->entry[i].refcount == 0) break; i = (i + 1) % cache->entries; } cache->next_blk = (i + 1) % cache->entries; entry = &cache->entry[i]; /* * Initialise chosen cache entry, and fill it in from * disk. */ cache->unused--; entry->block = block; entry->refcount = 1; entry->pending = 1; entry->num_waiters = 0; entry->error = 0; spin_unlock(&cache->lock); entry->length = squashfs_read_data(sb, block, length, &entry->next_index, entry->actor); spin_lock(&cache->lock); if (entry->length < 0) entry->error = entry->length; entry->pending = 0; /* * While filling this entry one or more other processes * have looked it up in the cache, and have slept * waiting for it to become available. */ if (entry->num_waiters) { spin_unlock(&cache->lock); wake_up_all(&entry->wait_queue); } else spin_unlock(&cache->lock); goto out; } /* * Block already in cache. Increment refcount so it doesn't * get reused until we're finished with it, if it was * previously unused there's one less cache entry available * for reuse. */ entry = &cache->entry[i]; if (entry->refcount == 0) cache->unused--; entry->refcount++; /* * If the entry is currently being filled in by another process * go to sleep waiting for it to become available. */ if (entry->pending) { entry->num_waiters++; spin_unlock(&cache->lock); wait_event(entry->wait_queue, !entry->pending); } else spin_unlock(&cache->lock); goto out; } out: TRACE("Got %s %d, start block %lld, refcount %d, error %d\n", cache->name, i, entry->block, entry->refcount, entry->error); if (entry->error) ERROR("Unable to read %s cache entry [%llx]\n", cache->name, block); return entry; } /* * Release cache entry, once usage count is zero it can be reused. */ void squashfs_cache_put(struct squashfs_cache_entry *entry) { struct squashfs_cache *cache = entry->cache; spin_lock(&cache->lock); entry->refcount--; if (entry->refcount == 0) { cache->unused++; /* * If there's any processes waiting for a block to become * available, wake one up. */ if (cache->num_waiters) { spin_unlock(&cache->lock); wake_up(&cache->wait_queue); return; } } spin_unlock(&cache->lock); } /* * Delete cache reclaiming all kmalloced buffers. 
*/ void squashfs_cache_delete(struct squashfs_cache *cache) { int i, j; if (IS_ERR(cache) || cache == NULL) return; for (i = 0; i < cache->entries; i++) { if (cache->entry[i].data) { for (j = 0; j < cache->pages; j++) kfree(cache->entry[i].data[j]); kfree(cache->entry[i].data); } kfree(cache->entry[i].actor); } kfree(cache->entry); kfree(cache); } /* * Initialise cache allocating the specified number of entries, each of * size block_size. To avoid vmalloc fragmentation issues each entry * is allocated as a sequence of kmalloced PAGE_SIZE buffers. */ struct squashfs_cache *squashfs_cache_init(char *name, int entries, int block_size) { int i, j; struct squashfs_cache *cache; if (entries == 0) return NULL; cache = kzalloc_obj(*cache); if (cache == NULL) { ERROR("Failed to allocate %s cache\n", name); return ERR_PTR(-ENOMEM); } cache->entry = kzalloc_objs(*(cache->entry), entries); if (cache->entry == NULL) { ERROR("Failed to allocate %s cache\n", name); goto cleanup; } cache->curr_blk = 0; cache->next_blk = 0; cache->unused = entries; cache->entries = entries; cache->block_size = block_size; cache->pages = block_size >> PAGE_SHIFT; cache->pages = cache->pages ? cache->pages : 1; cache->name = name; cache->num_waiters = 0; spin_lock_init(&cache->lock); init_waitqueue_head(&cache->wait_queue); for (i = 0; i < entries; i++) { struct squashfs_cache_entry *entry = &cache->entry[i]; init_waitqueue_head(&cache->entry[i].wait_queue); entry->cache = cache; entry->block = SQUASHFS_INVALID_BLK; entry->data = kcalloc(cache->pages, sizeof(void *), GFP_KERNEL); if (entry->data == NULL) { ERROR("Failed to allocate %s cache entry\n", name); goto cleanup; } for (j = 0; j < cache->pages; j++) { entry->data[j] = kmalloc(PAGE_SIZE, GFP_KERNEL); if (entry->data[j] == NULL) { ERROR("Failed to allocate %s buffer\n", name); goto cleanup; } } entry->actor = squashfs_page_actor_init(entry->data, cache->pages, 0); if (entry->actor == NULL) { ERROR("Failed to allocate %s cache entry\n", name); goto cleanup; } } return cache; cleanup: squashfs_cache_delete(cache); return ERR_PTR(-ENOMEM); } /* * Copy up to length bytes from cache entry to buffer starting at offset bytes * into the cache entry. If there's not length bytes then copy the number of * bytes available. In all cases return the number of bytes copied. */ int squashfs_copy_data(void *buffer, struct squashfs_cache_entry *entry, int offset, int length) { int remaining = length; if (length == 0) return 0; else if (buffer == NULL) return min(length, entry->length - offset); while (offset < entry->length) { void *buff = entry->data[offset / PAGE_SIZE] + (offset % PAGE_SIZE); int bytes = min_t(int, entry->length - offset, PAGE_SIZE - (offset % PAGE_SIZE)); if (bytes >= remaining) { memcpy(buffer, buff, remaining); remaining = 0; break; } memcpy(buffer, buff, bytes); buffer += bytes; remaining -= bytes; offset += bytes; } return length - remaining; } /* * Read length bytes from metadata position <block, offset> (block is the * start of the compressed block on disk, and offset is the offset into * the block once decompressed). Data is packed into consecutive blocks, * and length bytes may require reading more than one block. 
*/ int squashfs_read_metadata(struct super_block *sb, void *buffer, u64 *block, int *offset, int length) { struct squashfs_sb_info *msblk = sb->s_fs_info; int bytes, res = length; struct squashfs_cache_entry *entry; TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset); if (unlikely(length < 0)) return -EIO; if (unlikely(*offset < 0 || *offset >= SQUASHFS_METADATA_SIZE)) return -EIO; while (length) { entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0); if (entry->error) { res = entry->error; goto error; } else if (*offset >= entry->length) { res = -EIO; goto error; } bytes = squashfs_copy_data(buffer, entry, *offset, length); if (buffer) buffer += bytes; length -= bytes; *offset += bytes; if (*offset == entry->length) { *block = entry->next_index; *offset = 0; } squashfs_cache_put(entry); } return res; error: squashfs_cache_put(entry); return res; } /* * Look-up in the fragmment cache the fragment located at <start_block> in the * filesystem. If necessary read and decompress it from disk. */ struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *sb, u64 start_block, int length) { struct squashfs_sb_info *msblk = sb->s_fs_info; return squashfs_cache_get(sb, msblk->fragment_cache, start_block, length); } /* * Read and decompress the datablock located at <start_block> in the * filesystem. The cache is used here to avoid duplicating locking and * read/decompress code. */ struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *sb, u64 start_block, int length) { struct squashfs_sb_info *msblk = sb->s_fs_info; return squashfs_cache_get(sb, msblk->read_page, start_block, length); } /* * Read a filesystem table (uncompressed sequence of bytes) from disk */ void *squashfs_read_table(struct super_block *sb, u64 block, int length) { int pages = (length + PAGE_SIZE - 1) >> PAGE_SHIFT; int i, res; void *table, *buffer, **data; struct squashfs_page_actor *actor; table = buffer = kmalloc(length, GFP_KERNEL); if (table == NULL) return ERR_PTR(-ENOMEM); data = kcalloc(pages, sizeof(void *), GFP_KERNEL); if (data == NULL) { res = -ENOMEM; goto failed; } actor = squashfs_page_actor_init(data, pages, length); if (actor == NULL) { res = -ENOMEM; goto failed2; } for (i = 0; i < pages; i++, buffer += PAGE_SIZE) data[i] = buffer; res = squashfs_read_data(sb, block, length | SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, actor); kfree(data); kfree(actor); if (res < 0) goto failed; return table; failed2: kfree(data); failed: kfree(table); return ERR_PTR(res); } |
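As a standalone illustration (a hypothetical userspace helper, not taken from this file), the page-splitting arithmetic used by squashfs_copy_data() can be sketched in isolation: the decompressed block lives in an array of PAGE_SIZE buffers, so every copy has to be split at page boundaries.

#include <string.h>

#define PAGE_SIZE 4096	/* assumed page size for the illustration */

/* data[i] points to the i-th PAGE_SIZE chunk of the decompressed block. */
static int copy_from_paged_buffers(void *dst, void **data, int entry_length,
				   int offset, int length)
{
	int remaining = length;

	while (offset < entry_length && remaining > 0) {
		void *src = (char *)data[offset / PAGE_SIZE] +
			    (offset % PAGE_SIZE);
		int bytes = entry_length - offset;

		/* Never cross the end of the current PAGE_SIZE buffer... */
		if (bytes > PAGE_SIZE - (offset % PAGE_SIZE))
			bytes = PAGE_SIZE - (offset % PAGE_SIZE);
		/* ...and never copy more than the caller asked for. */
		if (bytes > remaining)
			bytes = remaining;

		memcpy(dst, src, bytes);
		dst = (char *)dst + bytes;
		offset += bytes;
		remaining -= bytes;
	}
	return length - remaining;	/* bytes actually copied */
}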
915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Dynamic loading of modules into the kernel. * * Rewritten by Richard Henderson <rth@tamu.edu> Dec 1996 * Rewritten again by Rusty Russell, 2002 */ #ifndef _LINUX_MODULE_H #define _LINUX_MODULE_H #include <linux/list.h> #include <linux/stat.h> #include <linux/buildid.h> #include <linux/compiler.h> #include <linux/cache.h> #include <linux/cleanup.h> #include <linux/kmod.h> #include <linux/init.h> #include <linux/elf.h> #include <linux/stringify.h> #include <linux/kobject.h> #include <linux/moduleparam.h> #include <linux/jump_label.h> #include <linux/export.h> #include <linux/rbtree_latch.h> #include <linux/error-injection.h> #include <linux/tracepoint-defs.h> #include <linux/srcu.h> #include <linux/static_call_types.h> #include <linux/dynamic_debug.h> #include <linux/percpu.h> #include <asm/module.h> #define MODULE_NAME_LEN __MODULE_NAME_LEN struct modversion_info { unsigned long crc; char name[MODULE_NAME_LEN]; }; struct module; struct exception_table_entry; struct module_kobject { struct kobject kobj; struct module *mod; struct kobject *drivers_dir; struct module_param_attrs *mp; struct completion *kobj_completion; } __randomize_layout; struct module_attribute { struct attribute attr; ssize_t (*show)(const struct module_attribute *, struct module_kobject *, char *); ssize_t (*store)(const struct module_attribute *, struct module_kobject *, const char *, size_t count); void (*setup)(struct module *, const char *); int (*test)(struct module *); void (*free)(struct module *); }; struct module_version_attribute { struct module_attribute mattr; const char *module_name; const char *version; }; extern ssize_t __modver_version_show(const struct module_attribute *, struct module_kobject *, char *); extern const struct module_attribute module_uevent; /* These are either module local, or the kernel's dummy ones. */ extern int init_module(void); extern void cleanup_module(void); #ifndef MODULE /** * module_init() - driver initialization entry point * @x: function to be run at kernel boot time or module insertion * * module_init() will either be called during do_initcalls() (if * builtin) or at module insertion time (if a module). There can only * be one per module. */ #define module_init(x) __initcall(x); /** * module_exit() - driver exit entry point * @x: function to be run when driver is removed * * module_exit() will wrap the driver clean-up code * with cleanup_module() when used with rmmod when * the driver is a module. If the driver is statically * compiled into the kernel, module_exit() has no effect. * There can only be one per module. */ #define module_exit(x) __exitcall(x); #else /* MODULE */ /* * In most cases loadable modules do not need custom * initcall levels. There are still some valid cases where * a driver may be needed early if built in, and does not * matter when built as a loadable module. Like bus * snooping debug drivers. 
*/ #define early_initcall(fn) module_init(fn) #define core_initcall(fn) module_init(fn) #define core_initcall_sync(fn) module_init(fn) #define postcore_initcall(fn) module_init(fn) #define postcore_initcall_sync(fn) module_init(fn) #define arch_initcall(fn) module_init(fn) #define subsys_initcall(fn) module_init(fn) #define subsys_initcall_sync(fn) module_init(fn) #define fs_initcall(fn) module_init(fn) #define fs_initcall_sync(fn) module_init(fn) #define rootfs_initcall(fn) module_init(fn) #define device_initcall(fn) module_init(fn) #define device_initcall_sync(fn) module_init(fn) #define late_initcall(fn) module_init(fn) #define late_initcall_sync(fn) module_init(fn) #define console_initcall(fn) module_init(fn) /* Each module must use one module_init(). */ #define module_init(initfn) \ static inline initcall_t __maybe_unused __inittest(void) \ { return initfn; } \ int init_module(void) __copy(initfn) \ __attribute__((alias(#initfn))); \ ___ADDRESSABLE(init_module, __initdata); /* This is only required if you want to be unloadable. */ #define module_exit(exitfn) \ static inline exitcall_t __maybe_unused __exittest(void) \ { return exitfn; } \ void cleanup_module(void) __copy(exitfn) \ __attribute__((alias(#exitfn))); \ ___ADDRESSABLE(cleanup_module, __exitdata); #endif /* This means "can be init if no module support, otherwise module load may call it." */ #ifdef CONFIG_MODULES #define __init_or_module #define __initdata_or_module #define __initconst_or_module #else #define __init_or_module __init #define __initdata_or_module __initdata #define __initconst_or_module __initconst #endif /*CONFIG_MODULES*/ struct module_kobject *lookup_or_create_module_kobject(const char *name); /* For userspace: you can also call me... */ #define MODULE_ALIAS(_alias) MODULE_INFO(alias, _alias) /* Soft module dependencies. See man modprobe.d for details. * Example: MODULE_SOFTDEP("pre: module-foo module-bar post: module-baz") */ #define MODULE_SOFTDEP(_softdep) MODULE_INFO(softdep, _softdep) /* * Weak module dependencies. See man modprobe.d for details. * Example: MODULE_WEAKDEP("module-foo") */ #define MODULE_WEAKDEP(_weakdep) MODULE_INFO(weakdep, _weakdep) /* * MODULE_FILE is used for generating modules.builtin * So, make it no-op when this is being built as a module */ #ifdef MODULE #define MODULE_FILE #else #define MODULE_FILE MODULE_INFO(file, KBUILD_MODFILE); #endif /* * The following license idents are currently accepted as indicating free * software modules * * "GPL" [GNU Public License v2] * "GPL v2" [GNU Public License v2] * "GPL and additional rights" [GNU Public License v2 rights and more] * "Dual BSD/GPL" [GNU Public License v2 * or BSD license choice] * "Dual MIT/GPL" [GNU Public License v2 * or MIT license choice] * "Dual MPL/GPL" [GNU Public License v2 * or Mozilla license choice] * * The following other idents are available * * "Proprietary" [Non free products] * * Both "GPL v2" and "GPL" (the latter also in dual licensed strings) are * merely stating that the module is licensed under the GPL v2, but are not * telling whether "GPL v2 only" or "GPL v2 or later". The reason why there * are two variants is a historic and failed attempt to convey more * information in the MODULE_LICENSE string. For module loading the * "only/or later" distinction is completely irrelevant and does neither * replace the proper license identifiers in the corresponding source file * nor amends them in any way. 
The sole purpose is to make the * 'Proprietary' flagging work and to refuse to bind symbols which are * exported with EXPORT_SYMBOL_GPL when a non free module is loaded. * * In the same way "BSD" is not a clear license information. It merely * states, that the module is licensed under one of the compatible BSD * license variants. The detailed and correct license information is again * to be found in the corresponding source files. * * There are dual licensed components, but when running with Linux it is the * GPL that is relevant so this is a non issue. Similarly LGPL linked with GPL * is a GPL combined work. * * This exists for several reasons * 1. So modinfo can show license info for users wanting to vet their setup * is free * 2. So the community can ignore bug reports including proprietary modules * 3. So vendors can do likewise based on their own policies */ #define MODULE_LICENSE(_license) MODULE_FILE MODULE_INFO(license, _license) /* * Author(s), use "Name <email>" or just "Name", for multiple * authors use multiple MODULE_AUTHOR() statements/lines. */ #define MODULE_AUTHOR(_author) MODULE_INFO(author, _author) /* What your module does. */ #define MODULE_DESCRIPTION(_description) MODULE_INFO(description, _description) /* * Format: __mod_device_table__kmod_<modname>__<type>__<name> * Parts of the string `__kmod_` and `__` are used as delimiters when parsing * a symbol in file2alias.c */ #define __mod_device_table(type, name) \ __PASTE(__mod_device_table__, \ __PASTE(kmod_, \ __PASTE(__KBUILD_MODNAME, \ __PASTE(__, \ __PASTE(type, \ __PASTE(__, name)))))) /* Creates an alias so file2alias.c can find device table. */ #define MODULE_DEVICE_TABLE(type, name) \ static typeof(name) __mod_device_table(type, name) \ __attribute__ ((used, alias(__stringify(name)))) /* Version of form [<epoch>:]<version>[-<extra-version>]. * Or for CVS/RCS ID version, everything but the number is stripped. * <epoch>: A (small) unsigned integer which allows you to start versions * anew. If not mentioned, it's zero. eg. "2:1.0" is after * "1:2.0". * <version>: The <version> may contain only alphanumerics and the * character `.'. Ordered by numeric sort for numeric parts, * ascii sort for ascii parts (as per RPM or DEB algorithm). * <extraversion>: Like <version>, but inserted for local * customizations, eg "rh3" or "rusty1". * Using this automatically adds a checksum of the .c files and the * local headers in "srcversion". */ #if defined(MODULE) || !defined(CONFIG_SYSFS) #define MODULE_VERSION(_version) MODULE_INFO(version, _version) #else #define MODULE_VERSION(_version) \ MODULE_INFO(version, _version); \ static const struct module_version_attribute __modver_attr \ __used __section("__modver") \ __aligned(__alignof__(struct module_version_attribute)) \ = { \ .mattr = { \ .attr = { \ .name = "version", \ .mode = S_IRUGO, \ }, \ .show = __modver_version_show, \ }, \ .module_name = KBUILD_MODNAME, \ .version = _version, \ } #endif /* Optional firmware file (or files) needed by the module * format is simply firmware file name. Multiple firmware * files require multiple MODULE_FIRMWARE() specifiers */ #define MODULE_FIRMWARE(_firmware) MODULE_INFO(firmware, _firmware) #define MODULE_IMPORT_NS(ns) MODULE_INFO(import_ns, ns) struct notifier_block; enum module_state { MODULE_STATE_LIVE, /* Normal state. */ MODULE_STATE_COMING, /* Full formed, running module_init. */ MODULE_STATE_GOING, /* Going away. */ MODULE_STATE_UNFORMED, /* Still setting it up. 
*/ }; struct mod_tree_node { struct module *mod; struct latch_tree_node node; }; enum mod_mem_type { MOD_TEXT = 0, MOD_DATA, MOD_RODATA, MOD_RO_AFTER_INIT, MOD_INIT_TEXT, MOD_INIT_DATA, MOD_INIT_RODATA, MOD_MEM_NUM_TYPES, MOD_INVALID = -1, }; #define mod_mem_type_is_init(type) \ ((type) == MOD_INIT_TEXT || \ (type) == MOD_INIT_DATA || \ (type) == MOD_INIT_RODATA) #define mod_mem_type_is_core(type) (!mod_mem_type_is_init(type)) #define mod_mem_type_is_text(type) \ ((type) == MOD_TEXT || \ (type) == MOD_INIT_TEXT) #define mod_mem_type_is_data(type) (!mod_mem_type_is_text(type)) #define mod_mem_type_is_core_data(type) \ (mod_mem_type_is_core(type) && \ mod_mem_type_is_data(type)) #define for_each_mod_mem_type(type) \ for (enum mod_mem_type (type) = 0; \ (type) < MOD_MEM_NUM_TYPES; (type)++) #define for_class_mod_mem_type(type, class) \ for_each_mod_mem_type(type) \ if (mod_mem_type_is_##class(type)) struct module_memory { void *base; bool is_rox; unsigned int size; #ifdef CONFIG_MODULES_TREE_LOOKUP struct mod_tree_node mtn; #endif }; #ifdef CONFIG_MODULES_TREE_LOOKUP /* Only touch one cacheline for common rbtree-for-core-layout case. */ #define __module_memory_align ____cacheline_aligned #else #define __module_memory_align #endif struct mod_kallsyms { Elf_Sym *symtab; unsigned int num_symtab; char *strtab; char *typetab; }; #ifdef CONFIG_LIVEPATCH /** * struct klp_modinfo - ELF information preserved from the livepatch module * * @hdr: ELF header * @sechdrs: Section header table * @secstrings: String table for the section headers * @symndx: The symbol table section index */ struct klp_modinfo { Elf_Ehdr hdr; Elf_Shdr *sechdrs; char *secstrings; unsigned int symndx; }; #endif struct module { enum module_state state; /* Member of list of modules */ struct list_head list; /* Unique handle for this module */ char name[MODULE_NAME_LEN]; #ifdef CONFIG_STACKTRACE_BUILD_ID /* Module build ID */ unsigned char build_id[BUILD_ID_SIZE_MAX]; #endif /* Sysfs stuff. */ struct module_kobject mkobj; struct module_attribute *modinfo_attrs; const char *version; const char *srcversion; const char *imported_namespaces; struct kobject *holders_dir; /* Exported symbols */ const struct kernel_symbol *syms; const u32 *crcs; const u8 *flagstab; unsigned int num_syms; #ifdef CONFIG_ARCH_USES_CFI_TRAPS s32 *kcfi_traps; s32 *kcfi_traps_end; #endif /* Kernel parameters. */ #ifdef CONFIG_SYSFS struct mutex param_lock; #endif struct kernel_param *kp; unsigned int num_kp; /* GPL-only exported symbols. */ bool using_gplonly_symbols; #ifdef CONFIG_MODULE_SIG /* Signature was verified. */ bool sig_ok; #endif bool async_probe_requested; /* Exception table */ unsigned int num_exentries; struct exception_table_entry *extable; /* Startup function. */ int (*init)(void); struct module_memory mem[MOD_MEM_NUM_TYPES] __module_memory_align; /* Arch-specific module values */ struct mod_arch_specific arch; unsigned long taints; /* same bits as kernel:taint_flags */ #ifdef CONFIG_GENERIC_BUG /* Support for BUG */ unsigned num_bugs; struct list_head bug_list; struct bug_entry *bug_table; #endif #ifdef CONFIG_KALLSYMS /* Protected by RCU and/or module_mutex: use rcu_dereference() */ struct mod_kallsyms __rcu *kallsyms; struct mod_kallsyms core_kallsyms; /* Section attributes */ struct module_sect_attrs *sect_attrs; /* Notes attributes */ struct module_notes_attrs *notes_attrs; #endif /* The command line arguments (may be mangled). People like keeping pointers to this stuff */ char *args; #ifdef CONFIG_SMP /* Per-cpu data. 
*/ void __percpu *percpu; unsigned int percpu_size; #endif void *noinstr_text_start; unsigned int noinstr_text_size; #ifdef CONFIG_TRACEPOINTS unsigned int num_tracepoints; tracepoint_ptr_t *tracepoints_ptrs; #endif #ifdef CONFIG_TREE_SRCU unsigned int num_srcu_structs; struct srcu_struct **srcu_struct_ptrs; #endif #ifdef CONFIG_BPF_EVENTS unsigned int num_bpf_raw_events; struct bpf_raw_event_map *bpf_raw_events; #endif #ifdef CONFIG_DEBUG_INFO_BTF_MODULES unsigned int btf_data_size; unsigned int btf_base_data_size; void *btf_data; void *btf_base_data; #endif #ifdef CONFIG_JUMP_LABEL struct jump_entry *jump_entries; unsigned int num_jump_entries; #endif #ifdef CONFIG_TRACING unsigned int num_trace_bprintk_fmt; const char **trace_bprintk_fmt_start; #endif #ifdef CONFIG_EVENT_TRACING struct trace_event_call **trace_events; unsigned int num_trace_events; struct trace_eval_map **trace_evals; unsigned int num_trace_evals; #endif #ifdef CONFIG_DYNAMIC_FTRACE unsigned int num_ftrace_callsites; unsigned long *ftrace_callsites; #endif #ifdef CONFIG_KPROBES void *kprobes_text_start; unsigned int kprobes_text_size; unsigned long *kprobe_blacklist; unsigned int num_kprobe_blacklist; #endif #ifdef CONFIG_HAVE_STATIC_CALL_INLINE int num_static_call_sites; struct static_call_site *static_call_sites; #endif #if IS_ENABLED(CONFIG_KUNIT) int num_kunit_init_suites; struct kunit_suite **kunit_init_suites; int num_kunit_suites; struct kunit_suite **kunit_suites; #endif #ifdef CONFIG_LIVEPATCH bool klp; /* Is this a livepatch module? */ bool klp_alive; /* ELF information */ struct klp_modinfo *klp_info; #endif #ifdef CONFIG_PRINTK_INDEX unsigned int printk_index_size; struct pi_entry **printk_index_start; #endif #ifdef CONFIG_MODULE_UNLOAD /* What modules depend on me? */ struct list_head source_list; /* What modules do I depend on? */ struct list_head target_list; /* Destruction function. */ void (*exit)(void); atomic_t refcnt; #endif #ifdef CONFIG_CONSTRUCTORS /* Constructor functions. */ ctor_fn_t *ctors; unsigned int num_ctors; #endif #ifdef CONFIG_FUNCTION_ERROR_INJECTION struct error_injection_entry *ei_funcs; unsigned int num_ei_funcs; #endif #ifdef CONFIG_DYNAMIC_DEBUG_CORE struct _ddebug_info dyndbg_info; #endif } ____cacheline_aligned __randomize_layout; #ifndef MODULE_ARCH_INIT #define MODULE_ARCH_INIT {} #endif #ifdef CONFIG_MODULES /* Get/put a kernel symbol (calls must be symmetric) */ void *__symbol_get(const char *symbol); void *__symbol_get_gpl(const char *symbol); #define symbol_get(x) ({ \ static const char __notrim[] \ __used __section(".no_trim_symbol") = __stringify(x); \ (typeof(&x))(__symbol_get(__stringify(x))); }) #ifndef HAVE_ARCH_KALLSYMS_SYMBOL_VALUE static inline unsigned long kallsyms_symbol_value(const Elf_Sym *sym) { return sym->st_value; } #endif /* FIXME: It'd be nice to isolate modules during init, too, so they aren't used before they (may) fail. 
But presently too much code (IDE & SCSI) require entry into the module during init.*/ static inline bool module_is_live(struct module *mod) { return mod->state != MODULE_STATE_GOING; } static inline bool module_is_coming(struct module *mod) { return mod->state == MODULE_STATE_COMING; } struct module *__module_text_address(unsigned long addr); struct module *__module_address(unsigned long addr); bool is_module_address(unsigned long addr); bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr); bool is_module_percpu_address(unsigned long addr); bool is_module_text_address(unsigned long addr); static inline bool within_module_mem_type(unsigned long addr, const struct module *mod, enum mod_mem_type type) { unsigned long base, size; base = (unsigned long)mod->mem[type].base; size = mod->mem[type].size; return addr - base < size; } static inline bool within_module_core(unsigned long addr, const struct module *mod) { for_class_mod_mem_type(type, core) { if (within_module_mem_type(addr, mod, type)) return true; } return false; } static inline bool within_module_init(unsigned long addr, const struct module *mod) { for_class_mod_mem_type(type, init) { if (within_module_mem_type(addr, mod, type)) return true; } return false; } static inline bool within_module(unsigned long addr, const struct module *mod) { return within_module_init(addr, mod) || within_module_core(addr, mod); } /* Search for module by name: must be in a RCU critical section. */ struct module *find_module(const char *name); extern void __noreturn __module_put_and_kthread_exit(struct module *mod, long code); #define module_put_and_kthread_exit(code) __module_put_and_kthread_exit(THIS_MODULE, code) #ifdef CONFIG_MODULE_UNLOAD int module_refcount(struct module *mod); void __symbol_put(const char *symbol); #define symbol_put(x) __symbol_put(__stringify(x)) void symbol_put_addr(void *addr); /* Sometimes we know we already have a refcount, and it's easier not to handle the error case (which only happens with rmmod --wait). */ extern void __module_get(struct module *module); /** * try_module_get() - take module refcount unless module is being removed * @module: the module we should check for * * Only try to get a module reference count if the module is not being removed. * This call will fail if the module is in the process of being removed. * * Care must also be taken to ensure the module exists and is alive prior to * usage of this call. This can be gauranteed through two means: * * 1) Direct protection: you know an earlier caller must have increased the * module reference through __module_get(). This can typically be achieved * by having another entity other than the module itself increment the * module reference count. * * 2) Implied protection: there is an implied protection against module * removal. An example of this is the implied protection used by kernfs / * sysfs. The sysfs store / read file operations are guaranteed to exist * through the use of kernfs's active reference (see kernfs_active()) and a * sysfs / kernfs file removal cannot happen unless the same file is not * active. Therefore, if a sysfs file is being read or written to the module * which created it must still exist. It is therefore safe to use * try_module_get() on module sysfs store / read ops. * * One of the real values to try_module_get() is the module_is_live() check * which ensures that the caller of try_module_get() can yield to userspace * module removal requests and gracefully fail if the module is on its way out. 
* * Returns true if the reference count was successfully incremented. */ extern bool try_module_get(struct module *module); /** * module_put() - release a reference count to a module * @module: the module we should release a reference count for * * If you successfully bump a reference count to a module with try_module_get(), * when you are finished you must call module_put() to release that reference * count. */ extern void module_put(struct module *module); #else /*!CONFIG_MODULE_UNLOAD*/ static inline bool try_module_get(struct module *module) { return !module || module_is_live(module); } static inline void module_put(struct module *module) { } static inline void __module_get(struct module *module) { } #define symbol_put(x) do { } while (0) #define symbol_put_addr(p) do { } while (0) #endif /* CONFIG_MODULE_UNLOAD */ /* This is a #define so the string doesn't get put in every .o file */ #define module_name(mod) \ ({ \ struct module *__mod = (mod); \ __mod ? __mod->name : "kernel"; \ }) static inline const unsigned char *module_buildid(struct module *mod) { #ifdef CONFIG_STACKTRACE_BUILD_ID return mod->build_id; #else return NULL; #endif } /* Dereference module function descriptor */ void *dereference_module_function_descriptor(struct module *mod, void *ptr); int register_module_notifier(struct notifier_block *nb); int unregister_module_notifier(struct notifier_block *nb); extern void print_modules(void); static inline bool module_requested_async_probing(struct module *module) { return module && module->async_probe_requested; } static inline bool is_livepatch_module(struct module *mod) { #ifdef CONFIG_LIVEPATCH return mod->klp; #else return false; #endif } void module_for_each_mod(int(*func)(struct module *mod, void *data), void *data); #else /* !CONFIG_MODULES... 
*/ static inline struct module *__module_address(unsigned long addr) { return NULL; } static inline struct module *__module_text_address(unsigned long addr) { return NULL; } static inline bool is_module_address(unsigned long addr) { return false; } static inline bool is_module_percpu_address(unsigned long addr) { return false; } static inline bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) { return false; } static inline bool is_module_text_address(unsigned long addr) { return false; } static inline bool within_module_core(unsigned long addr, const struct module *mod) { return false; } static inline bool within_module_init(unsigned long addr, const struct module *mod) { return false; } static inline bool within_module(unsigned long addr, const struct module *mod) { return false; } /* Get/put a kernel symbol (calls should be symmetric) */ #define symbol_get(x) ({ extern typeof(x) x __attribute__((weak,visibility("hidden"))); &(x); }) #define symbol_put(x) do { } while (0) #define symbol_put_addr(x) do { } while (0) static inline void __module_get(struct module *module) { } static inline bool try_module_get(struct module *module) { return true; } static inline void module_put(struct module *module) { } #define module_name(mod) "kernel" static inline int register_module_notifier(struct notifier_block *nb) { /* no events will happen anyway, so this can always succeed */ return 0; } static inline int unregister_module_notifier(struct notifier_block *nb) { return 0; } #define module_put_and_kthread_exit(code) kthread_exit(code) static inline void print_modules(void) { } static inline bool module_requested_async_probing(struct module *module) { return false; } /* Dereference module function descriptor */ static inline void *dereference_module_function_descriptor(struct module *mod, void *ptr) { return ptr; } static inline bool module_is_coming(struct module *mod) { return false; } static inline void module_for_each_mod(int(*func)(struct module *mod, void *data), void *data) { } #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS extern struct kset *module_kset; extern const struct kobj_type module_ktype; #endif /* CONFIG_SYSFS */ #define symbol_request(x) try_then_request_module(symbol_get(x), "symbol:" #x) /* BELOW HERE ALL THESE ARE OBSOLETE AND WILL VANISH */ #define __MODULE_STRING(x) __stringify(x) #ifdef CONFIG_GENERIC_BUG void module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *, struct module *); void module_bug_cleanup(struct module *); #else /* !CONFIG_GENERIC_BUG */ static inline void module_bug_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod) { } static inline void module_bug_cleanup(struct module *mod) {} #endif /* CONFIG_GENERIC_BUG */ #ifdef CONFIG_MITIGATION_RETPOLINE extern bool retpoline_module_ok(bool has_retpoline); #else static inline bool retpoline_module_ok(bool has_retpoline) { return true; } #endif #ifdef CONFIG_MODULE_SIG bool is_module_sig_enforced(void); void set_module_sig_enforced(void); static inline bool module_sig_ok(struct module *module) { return module->sig_ok; } #else /* !CONFIG_MODULE_SIG */ static inline bool is_module_sig_enforced(void) { return false; } static inline void set_module_sig_enforced(void) { } static inline bool module_sig_ok(struct module *module) { return true; } #endif /* CONFIG_MODULE_SIG */ #if defined(CONFIG_MODULES) && defined(CONFIG_KALLSYMS) int module_kallsyms_on_each_symbol(const char *modname, int (*fn)(void *, const char *, unsigned long), void *data); /* For kallsyms to ask for 
address resolution. namebuf should be at * least KSYM_NAME_LEN long: a pointer to namebuf is returned if * found, otherwise NULL. */ int module_address_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, const unsigned char **modbuildid, char *namebuf); int lookup_module_symbol_name(unsigned long addr, char *symname); int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name); /* Returns 0 and fills in value, defined and namebuf, or -ERANGE if * symnum out of range. */ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *name, char *module_name, int *exported); /* Look for this name: can be of form module:name. */ unsigned long module_kallsyms_lookup_name(const char *name); unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name); #else /* CONFIG_MODULES && CONFIG_KALLSYMS */ static inline int module_kallsyms_on_each_symbol(const char *modname, int (*fn)(void *, const char *, unsigned long), void *data) { return -EOPNOTSUPP; } /* For kallsyms to ask for address resolution. NULL means not found. */ static inline int module_address_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, const unsigned char **modbuildid, char *namebuf) { return 0; } static inline int lookup_module_symbol_name(unsigned long addr, char *symname) { return -ERANGE; } static inline int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *name, char *module_name, int *exported) { return -ERANGE; } static inline unsigned long module_kallsyms_lookup_name(const char *name) { return 0; } static inline unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name) { return 0; } #endif /* CONFIG_MODULES && CONFIG_KALLSYMS */ /* Define __free(module_put) macro for struct module *. */ DEFINE_FREE(module_put, struct module *, if (_T) module_put(_T)) #endif /* _LINUX_MODULE_H */ |
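To make the entry-point plumbing above concrete, here is a minimal, hypothetical out-of-tree module skeleton ("hello_mod" is a made-up name); when built into the kernel the same module_init() call degrades to an initcall, per the #ifndef MODULE branch above.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/printk.h>

static int __init hello_init(void)
{
	pr_info("hello_mod: loaded\n");
	return 0;	/* a non-zero return aborts the module load */
}

static void __exit hello_exit(void)
{
	pr_info("hello_mod: unloaded\n");
}

module_init(hello_init);
module_exit(hello_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Example Author <author@example.com>");
MODULE_DESCRIPTION("Minimal skeleton exercising module_init()/module_exit()");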
| 11 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Directory notifications for Linux. * * Copyright (C) 2000,2001,2002 Stephen Rothwell * * Copyright (C) 2009 Eric Paris <Red Hat Inc> * dnotify was largly rewritten to use the new fsnotify infrastructure */ #include <linux/fs.h> #include <linux/module.h> #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/dnotify.h> #include <linux/init.h> #include <linux/security.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/fsnotify_backend.h> static int dir_notify_enable __read_mostly = 1; #ifdef CONFIG_SYSCTL static const struct ctl_table dnotify_sysctls[] = { { .procname = "dir-notify-enable", .data = &dir_notify_enable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, }; static void __init dnotify_sysctl_init(void) { register_sysctl_init("fs", dnotify_sysctls); } #else #define dnotify_sysctl_init() do { } while (0) #endif static struct kmem_cache *dnotify_struct_cache __ro_after_init; static struct kmem_cache *dnotify_mark_cache __ro_after_init; static struct fsnotify_group *dnotify_group __ro_after_init; /* * dnotify will attach one of these to each inode (i_fsnotify_marks) which * is being watched by dnotify. If multiple userspace applications are watching * the same directory with dnotify their information is chained in dn */ struct dnotify_mark { struct fsnotify_mark fsn_mark; struct dnotify_struct *dn; }; /* * When a process starts or stops watching an inode the set of events which * dnotify cares about for that inode may change. This function runs the * list of everything receiving dnotify events about this directory and calculates * the set of all those events. After it updates what dnotify is interested in * it calls the fsnotify function so it can update the set of all events relevant * to this inode. 
*/ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) { __u32 new_mask = 0; struct dnotify_struct *dn; struct dnotify_mark *dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); assert_spin_locked(&fsn_mark->lock); for (dn = dn_mark->dn; dn != NULL; dn = dn->dn_next) new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT); if (fsn_mark->mask == new_mask) return; fsn_mark->mask = new_mask; fsnotify_recalc_mask(fsn_mark->connector); } /* * Mains fsnotify call where events are delivered to dnotify. * Find the dnotify mark on the relevant inode, run the list of dnotify structs * on that mark and determine which of them has expressed interest in receiving * events of this type. When found send the correct process and signal and * destroy the dnotify struct if it was not registered to receive multiple * events. */ static int dnotify_handle_event(struct fsnotify_mark *inode_mark, u32 mask, struct inode *inode, struct inode *dir, const struct qstr *name, u32 cookie) { struct dnotify_mark *dn_mark; struct dnotify_struct *dn; struct dnotify_struct **prev; struct fown_struct *fown; __u32 test_mask = mask & ~FS_EVENT_ON_CHILD; /* not a dir, dnotify doesn't care */ if (!dir && !(mask & FS_ISDIR)) return 0; dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark); spin_lock(&inode_mark->lock); prev = &dn_mark->dn; while ((dn = *prev) != NULL) { if ((dn->dn_mask & test_mask) == 0) { prev = &dn->dn_next; continue; } fown = file_f_owner(dn->dn_filp); send_sigio(fown, dn->dn_fd, POLL_MSG); if (dn->dn_mask & FS_DN_MULTISHOT) prev = &dn->dn_next; else { *prev = dn->dn_next; kmem_cache_free(dnotify_struct_cache, dn); dnotify_recalc_inode_mask(inode_mark); } } spin_unlock(&inode_mark->lock); return 0; } static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) { struct dnotify_mark *dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); BUG_ON(dn_mark->dn); kmem_cache_free(dnotify_mark_cache, dn_mark); } static const struct fsnotify_ops dnotify_fsnotify_ops = { .handle_inode_event = dnotify_handle_event, .free_mark = dnotify_free_mark, }; /* * Called every time a file is closed. Looks first for a dnotify mark on the * inode. If one is found run all of the ->dn structures attached to that * mark for one relevant to this process closing the file and remove that * dnotify_struct. If that was the last dnotify_struct also remove the * fsnotify_mark. 
*/ void dnotify_flush(struct file *filp, fl_owner_t id) { struct fsnotify_mark *fsn_mark; struct dnotify_mark *dn_mark; struct dnotify_struct *dn; struct dnotify_struct **prev; struct inode *inode; bool free = false; inode = file_inode(filp); if (!S_ISDIR(inode->i_mode)) return; fsn_mark = fsnotify_find_inode_mark(inode, dnotify_group); if (!fsn_mark) return; dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); fsnotify_group_lock(dnotify_group); spin_lock(&fsn_mark->lock); prev = &dn_mark->dn; while ((dn = *prev) != NULL) { if ((dn->dn_owner == id) && (dn->dn_filp == filp)) { *prev = dn->dn_next; kmem_cache_free(dnotify_struct_cache, dn); dnotify_recalc_inode_mask(fsn_mark); break; } prev = &dn->dn_next; } spin_unlock(&fsn_mark->lock); /* nothing else could have found us thanks to the dnotify_groups mark_mutex */ if (dn_mark->dn == NULL) { fsnotify_detach_mark(fsn_mark); free = true; } fsnotify_group_unlock(dnotify_group); if (free) fsnotify_free_mark(fsn_mark); fsnotify_put_mark(fsn_mark); } /* this conversion is done only at watch creation */ static __u32 convert_arg(unsigned int arg) { __u32 new_mask = FS_EVENT_ON_CHILD; if (arg & DN_MULTISHOT) new_mask |= FS_DN_MULTISHOT; if (arg & DN_DELETE) new_mask |= (FS_DELETE | FS_MOVED_FROM); if (arg & DN_MODIFY) new_mask |= FS_MODIFY; if (arg & DN_ACCESS) new_mask |= FS_ACCESS; if (arg & DN_ATTRIB) new_mask |= FS_ATTRIB; if (arg & DN_RENAME) new_mask |= FS_RENAME; if (arg & DN_CREATE) new_mask |= (FS_CREATE | FS_MOVED_TO); return new_mask; } /* * If multiple processes watch the same inode with dnotify there is only one * dnotify mark in inode->i_fsnotify_marks but we chain a dnotify_struct * onto that mark. This function either attaches the new dnotify_struct onto * that list, or it |= the mask onto an existing dnofiy_struct. */ static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark *dn_mark, fl_owner_t id, int fd, struct file *filp, __u32 mask) { struct dnotify_struct *odn; odn = dn_mark->dn; while (odn != NULL) { /* adding more events to existing dnofiy_struct? */ if ((odn->dn_owner == id) && (odn->dn_filp == filp)) { odn->dn_fd = fd; odn->dn_mask |= mask; return -EEXIST; } odn = odn->dn_next; } dn->dn_mask = mask; dn->dn_fd = fd; dn->dn_filp = filp; dn->dn_owner = id; dn->dn_next = dn_mark->dn; dn_mark->dn = dn; return 0; } /* * When a process calls fcntl to attach a dnotify watch to a directory it ends * up here. Allocate both a mark for fsnotify to add and a dnotify_struct to be * attached to the fsnotify_mark. 
*/ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) { struct dnotify_mark *new_dn_mark, *dn_mark; struct fsnotify_mark *new_fsn_mark, *fsn_mark; struct dnotify_struct *dn; struct inode *inode; fl_owner_t id = current->files; struct file *f = NULL; int destroy = 0, error = 0; __u32 mask; /* we use these to tell if we need to kfree */ new_fsn_mark = NULL; dn = NULL; if (!dir_notify_enable) { error = -EINVAL; goto out_err; } /* a 0 mask means we are explicitly removing the watch */ if ((arg & ~DN_MULTISHOT) == 0) { dnotify_flush(filp, id); error = 0; goto out_err; } /* dnotify only works on directories */ inode = file_inode(filp); if (!S_ISDIR(inode->i_mode)) { error = -ENOTDIR; goto out_err; } /* * convert the userspace DN_* "arg" to the internal FS_* * defined in fsnotify */ mask = convert_arg(arg); error = security_path_notify(&filp->f_path, mask, FSNOTIFY_OBJ_TYPE_INODE); if (error) goto out_err; /* expect most fcntl to add new rather than augment old */ dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL); if (!dn) { error = -ENOMEM; goto out_err; } error = file_f_owner_allocate(filp); if (error) goto out_err; /* new fsnotify mark, we expect most fcntl calls to add a new mark */ new_dn_mark = kmem_cache_alloc(dnotify_mark_cache, GFP_KERNEL); if (!new_dn_mark) { error = -ENOMEM; goto out_err; } /* set up the new_fsn_mark and new_dn_mark */ new_fsn_mark = &new_dn_mark->fsn_mark; fsnotify_init_mark(new_fsn_mark, dnotify_group); new_fsn_mark->mask = mask; new_dn_mark->dn = NULL; /* this is needed to prevent the fcntl/close race described below */ fsnotify_group_lock(dnotify_group); /* add the new_fsn_mark or find an old one. */ fsn_mark = fsnotify_find_inode_mark(inode, dnotify_group); if (fsn_mark) { dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); spin_lock(&fsn_mark->lock); } else { error = fsnotify_add_inode_mark_locked(new_fsn_mark, inode, 0); if (error) { fsnotify_group_unlock(dnotify_group); goto out_err; } spin_lock(&new_fsn_mark->lock); fsn_mark = new_fsn_mark; dn_mark = new_dn_mark; /* we used new_fsn_mark, so don't free it */ new_fsn_mark = NULL; } f = fget_raw(fd); /* if (f != filp) means that we lost a race and another task/thread * actually closed the fd we are still playing with before we grabbed * the dnotify_groups mark_mutex and fsn_mark->lock. Since closing the * fd is the only time we clean up the marks we need to get our mark * off the list. */ if (f != filp) { /* if we added ourselves, shoot ourselves, it's possible that * the flush actually did shoot this fsn_mark. That's fine too * since multiple calls to destroy_mark is perfectly safe, if * we found a dn_mark already attached to the inode, just sod * off silently as the flush at close time dealt with it. */ if (dn_mark == new_dn_mark) destroy = 1; error = 0; goto out; } __f_setown(filp, task_pid(current), PIDTYPE_TGID, 0); error = attach_dn(dn, dn_mark, id, fd, filp, mask); /* !error means that we attached the dn to the dn_mark, so don't free it */ if (!error) dn = NULL; /* -EEXIST means that we didn't add this new dn and used an old one. 
* that isn't an error (and the unused dn should be freed) */ else if (error == -EEXIST) error = 0; dnotify_recalc_inode_mask(fsn_mark); out: spin_unlock(&fsn_mark->lock); if (destroy) fsnotify_detach_mark(fsn_mark); fsnotify_group_unlock(dnotify_group); if (destroy) fsnotify_free_mark(fsn_mark); fsnotify_put_mark(fsn_mark); out_err: if (new_fsn_mark) fsnotify_put_mark(new_fsn_mark); if (dn) kmem_cache_free(dnotify_struct_cache, dn); if (f) fput(f); return error; } static int __init dnotify_init(void) { dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC|SLAB_ACCOUNT); dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops, 0); if (IS_ERR(dnotify_group)) panic("unable to allocate fsnotify group for dnotify\n"); dnotify_sysctl_init(); return 0; } module_init(dnotify_init) |
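The fcntl_dirnotify() path above is driven entirely from userspace through fcntl(2). Below is a minimal, hedged userspace sketch (not part of dnotify.c) showing how a process would typically arm such a watch. It uses only the documented F_SETSIG/F_NOTIFY commands and DN_* flags; the choice of SIGRTMIN and the handler/variable names are illustrative assumptions.

/*
 * Hedged userspace sketch: arm a dnotify watch on the current directory.
 * F_SETSIG selects a real-time signal so siginfo's si_fd reliably names
 * the watched directory; DN_MULTISHOT keeps the watch armed after the
 * first event (otherwise fcntl_dirnotify()'s one-shot behaviour applies).
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <unistd.h>

static void on_dir_event(int sig, siginfo_t *si, void *ucontext)
{
	(void)sig;
	(void)si;	/* si->si_fd identifies which watched directory changed */
	(void)ucontext;
	write(STDOUT_FILENO, "directory event\n", 16);	/* async-signal-safe */
}

int main(void)
{
	struct sigaction sa = { .sa_sigaction = on_dir_event, .sa_flags = SA_SIGINFO };
	int dirfd;

	sigaction(SIGRTMIN, &sa, NULL);

	dirfd = open(".", O_RDONLY | O_DIRECTORY);
	if (dirfd < 0)
		return 1;

	/* Deliver SIGRTMIN instead of the default SIGIO so si_fd is filled in. */
	fcntl(dirfd, F_SETSIG, SIGRTMIN);
	/* Watch creates, deletes and modifications; stay armed across events. */
	fcntl(dirfd, F_NOTIFY, DN_CREATE | DN_DELETE | DN_MODIFY | DN_MULTISHOT);

	pause();
	return 0;
}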
911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2005-2010 IBM Corporation * * Author: * Mimi Zohar <zohar@us.ibm.com> * Kylene Hall <kjhall@us.ibm.com> * * File: evm_main.c * implements evm_inode_setxattr, evm_inode_post_setxattr, * evm_inode_removexattr, evm_verifyxattr, and evm_inode_set_acl. 
*/ #define pr_fmt(fmt) "EVM: "fmt #include <linux/init.h> #include <linux/audit.h> #include <linux/xattr.h> #include <linux/integrity.h> #include <linux/evm.h> #include <linux/magic.h> #include <linux/posix_acl_xattr.h> #include <linux/lsm_hooks.h> #include <crypto/hash.h> #include <crypto/hash_info.h> #include <crypto/utils.h> #include "evm.h" int evm_initialized; static const char * const integrity_status_msg[] = { "pass", "pass_immutable", "fail", "fail_immutable", "no_label", "no_xattrs", "unknown" }; int evm_hmac_attrs; static struct xattr_list evm_config_default_xattrnames[] = { { .name = XATTR_NAME_SELINUX, .enabled = IS_ENABLED(CONFIG_SECURITY_SELINUX) }, { .name = XATTR_NAME_SMACK, .enabled = IS_ENABLED(CONFIG_SECURITY_SMACK) }, { .name = XATTR_NAME_SMACKEXEC, .enabled = IS_ENABLED(CONFIG_EVM_EXTRA_SMACK_XATTRS) }, { .name = XATTR_NAME_SMACKTRANSMUTE, .enabled = IS_ENABLED(CONFIG_EVM_EXTRA_SMACK_XATTRS) }, { .name = XATTR_NAME_SMACKMMAP, .enabled = IS_ENABLED(CONFIG_EVM_EXTRA_SMACK_XATTRS) }, { .name = XATTR_NAME_APPARMOR, .enabled = IS_ENABLED(CONFIG_SECURITY_APPARMOR) }, { .name = XATTR_NAME_IMA, .enabled = IS_ENABLED(CONFIG_IMA_APPRAISE) }, { .name = XATTR_NAME_CAPS, .enabled = true }, }; LIST_HEAD(evm_config_xattrnames); static char *evm_cmdline __initdata; core_param(evm, evm_cmdline, charp, 0); static int evm_fixmode __ro_after_init; static void __init evm_set_fixmode(void) { if (!evm_cmdline) return; if (strncmp(evm_cmdline, "fix", 3) == 0) { if (arch_get_secureboot()) { pr_info("Secure boot enabled: ignoring evm=fix"); return; } evm_fixmode = 1; } else { pr_err("invalid \"%s\" mode", evm_cmdline); } } static void __init evm_init_config(void) { int i, xattrs; xattrs = ARRAY_SIZE(evm_config_default_xattrnames); pr_info("Initialising EVM extended attributes:\n"); for (i = 0; i < xattrs; i++) { pr_info("%s%s\n", evm_config_default_xattrnames[i].name, !evm_config_default_xattrnames[i].enabled ? " (disabled)" : ""); list_add_tail(&evm_config_default_xattrnames[i].list, &evm_config_xattrnames); } #ifdef CONFIG_EVM_ATTR_FSUUID evm_hmac_attrs |= EVM_ATTR_FSUUID; #endif pr_info("HMAC attrs: 0x%x\n", evm_hmac_attrs); } static bool evm_key_loaded(void) { return (bool)(evm_initialized & EVM_KEY_MASK); } /* * This function determines whether or not it is safe to ignore verification * errors, based on the ability of EVM to calculate HMACs. If the HMAC key * is not loaded, and it cannot be loaded in the future due to the * EVM_SETUP_COMPLETE initialization flag, allowing an operation despite the * attrs/xattrs being found invalid will not make them valid. 
*/ static bool evm_hmac_disabled(void) { if (evm_initialized & EVM_INIT_HMAC) return false; if (!(evm_initialized & EVM_SETUP_COMPLETE)) return false; return true; } static bool evm_sigv3_required(void) { if (evm_initialized & EVM_SIGV3_REQUIRED) return true; return false; } static int evm_find_protected_xattrs(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); struct xattr_list *xattr; int error; int count = 0; if (!(inode->i_opflags & IOP_XATTR)) return -EOPNOTSUPP; list_for_each_entry_lockless(xattr, &evm_config_xattrnames, list) { error = __vfs_getxattr(dentry, inode, xattr->name, NULL, 0); if (error < 0) { if (error == -ENODATA) continue; return error; } count++; } return count; } static int is_unsupported_hmac_fs(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); if (inode->i_sb->s_iflags & SB_I_EVM_HMAC_UNSUPPORTED) { pr_info_once("%s not supported\n", inode->i_sb->s_type->name); return 1; } return 0; } /* * evm_verify_hmac - calculate and compare the HMAC with the EVM xattr * * Compute the HMAC on the dentry's protected set of extended attributes * and compare it against the stored security.evm xattr. * * For performance: * - use the previously retrieved xattr value and length to calculate the * HMAC.) * - cache the verification result in the iint, when available. * * Returns integrity status */ static enum integrity_status evm_verify_hmac(struct dentry *dentry, const char *xattr_name, char *xattr_value, size_t xattr_value_len) { struct evm_ima_xattr_data *xattr_data = NULL; struct signature_v2_hdr *hdr; enum integrity_status evm_status = INTEGRITY_PASS; struct evm_digest digest; struct inode *inode = d_backing_inode(dentry); struct evm_iint_cache *iint = evm_iint_inode(inode); int rc, xattr_len, evm_immutable = 0; if (iint && (iint->evm_status == INTEGRITY_PASS || iint->evm_status == INTEGRITY_PASS_IMMUTABLE)) return iint->evm_status; /* * On unsupported filesystems without EVM_INIT_X509 enabled, skip * signature verification. 
*/ if (!(evm_initialized & EVM_INIT_X509) && is_unsupported_hmac_fs(dentry)) return INTEGRITY_UNKNOWN; /* if status is not PASS, try to check again - against -ENOMEM */ /* first need to know the sig type */ rc = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, XATTR_NAME_EVM, (char **)&xattr_data, 0, GFP_NOFS); if (rc <= 0) { evm_status = INTEGRITY_FAIL; if (rc == -ENODATA) { rc = evm_find_protected_xattrs(dentry); if (rc > 0) evm_status = INTEGRITY_NOLABEL; else if (rc == 0) evm_status = INTEGRITY_NOXATTRS; /* new file */ } else if (rc == -EOPNOTSUPP) { evm_status = INTEGRITY_UNKNOWN; } goto out; } xattr_len = rc; /* check value type */ switch (xattr_data->type) { case EVM_XATTR_HMAC: if (xattr_len != sizeof(struct evm_xattr)) { evm_status = INTEGRITY_FAIL; goto out; } digest.hdr.algo = HASH_ALGO_SHA1; rc = evm_calc_hmac(dentry, xattr_name, xattr_value, xattr_value_len, &digest, iint); if (rc) break; rc = crypto_memneq(xattr_data->data, digest.digest, SHA1_DIGEST_SIZE); if (rc) rc = -EINVAL; break; case EVM_XATTR_PORTABLE_DIGSIG: evm_immutable = 1; fallthrough; case EVM_IMA_XATTR_DIGSIG: /* accept xattr with non-empty signature field */ if (xattr_len <= sizeof(struct signature_v2_hdr)) { evm_status = INTEGRITY_FAIL; goto out; } hdr = (struct signature_v2_hdr *)xattr_data; if (evm_sigv3_required() && hdr->version != 3) { evm_status = INTEGRITY_FAIL; goto out; } digest.hdr.algo = hdr->hash_algo; rc = evm_calc_hash(dentry, xattr_name, xattr_value, xattr_value_len, xattr_data->type, &digest, iint); if (rc) break; rc = integrity_digsig_verify(INTEGRITY_KEYRING_EVM, (const char *)xattr_data, xattr_len, digest.digest, digest.hdr.length, digest.hdr.algo); if (!rc) { if (xattr_data->type == EVM_XATTR_PORTABLE_DIGSIG) { if (iint) iint->flags |= EVM_IMMUTABLE_DIGSIG; evm_status = INTEGRITY_PASS_IMMUTABLE; } else if (!IS_RDONLY(inode) && !(inode->i_sb->s_readonly_remount) && !IS_IMMUTABLE(inode) && !is_unsupported_hmac_fs(dentry)) { evm_update_evmxattr(dentry, xattr_name, xattr_value, xattr_value_len); } } break; default: rc = -EINVAL; break; } if (rc) { if (rc == -ENODATA) evm_status = INTEGRITY_NOXATTRS; else if (evm_immutable) evm_status = INTEGRITY_FAIL_IMMUTABLE; else evm_status = INTEGRITY_FAIL; } pr_debug("digest: (%d) [%*phN]\n", digest.hdr.length, digest.hdr.length, digest.digest); out: if (iint) iint->evm_status = evm_status; kfree(xattr_data); return evm_status; } static int evm_protected_xattr_common(const char *req_xattr_name, bool all_xattrs) { int namelen; int found = 0; struct xattr_list *xattr; namelen = strlen(req_xattr_name); list_for_each_entry_lockless(xattr, &evm_config_xattrnames, list) { if (!all_xattrs && !xattr->enabled) continue; if ((strlen(xattr->name) == namelen) && (strncmp(req_xattr_name, xattr->name, namelen) == 0)) { found = 1; break; } if (strncmp(req_xattr_name, xattr->name + XATTR_SECURITY_PREFIX_LEN, strlen(req_xattr_name)) == 0) { found = 1; break; } } return found; } int evm_protected_xattr(const char *req_xattr_name) { return evm_protected_xattr_common(req_xattr_name, false); } int evm_protected_xattr_if_enabled(const char *req_xattr_name) { return evm_protected_xattr_common(req_xattr_name, true); } /** * evm_read_protected_xattrs - read EVM protected xattr names, lengths, values * @dentry: dentry of the read xattrs * @buffer: buffer xattr names, lengths or values are copied to * @buffer_size: size of buffer * @type: n: names, l: lengths, v: values * @canonical_fmt: data format (true: little endian, false: native format) * * Read protected xattr names (separated by 
|), lengths (u32) or values for a * given dentry and return the total size of copied data. If buffer is NULL, * just return the total size. * * Returns the total size on success, a negative value on error. */ int evm_read_protected_xattrs(struct dentry *dentry, u8 *buffer, int buffer_size, char type, bool canonical_fmt) { struct xattr_list *xattr; int rc, size, total_size = 0; list_for_each_entry_lockless(xattr, &evm_config_xattrnames, list) { rc = __vfs_getxattr(dentry, d_backing_inode(dentry), xattr->name, NULL, 0); if (rc < 0 && rc == -ENODATA) continue; else if (rc < 0) return rc; switch (type) { case 'n': size = strlen(xattr->name) + 1; if (buffer) { if (total_size) *(buffer + total_size - 1) = '|'; memcpy(buffer + total_size, xattr->name, size); } break; case 'l': size = sizeof(u32); if (buffer) { if (canonical_fmt) rc = (__force int)cpu_to_le32(rc); *(u32 *)(buffer + total_size) = rc; } break; case 'v': size = rc; if (buffer) { rc = __vfs_getxattr(dentry, d_backing_inode(dentry), xattr->name, buffer + total_size, buffer_size - total_size); if (rc < 0) return rc; } break; default: return -EINVAL; } total_size += size; } return total_size; } /** * evm_verifyxattr - verify the integrity of the requested xattr * @dentry: object of the verify xattr * @xattr_name: requested xattr * @xattr_value: requested xattr value * @xattr_value_len: requested xattr value length * * Calculate the HMAC for the given dentry and verify it against the stored * security.evm xattr. For performance, use the xattr value and length * previously retrieved to calculate the HMAC. * * Returns the xattr integrity status. * * This function requires the caller to lock the inode's i_mutex before it * is executed. */ enum integrity_status evm_verifyxattr(struct dentry *dentry, const char *xattr_name, void *xattr_value, size_t xattr_value_len) { if (!evm_key_loaded() || !evm_protected_xattr(xattr_name)) return INTEGRITY_UNKNOWN; return evm_verify_hmac(dentry, xattr_name, xattr_value, xattr_value_len); } EXPORT_SYMBOL_GPL(evm_verifyxattr); /* * evm_verify_current_integrity - verify the dentry's metadata integrity * @dentry: pointer to the affected dentry * * Verify and return the dentry's metadata integrity. The exceptions are * before EVM is initialized or in 'fix' mode. */ static enum integrity_status evm_verify_current_integrity(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); if (!evm_key_loaded() || !S_ISREG(inode->i_mode) || evm_fixmode) return INTEGRITY_PASS; return evm_verify_hmac(dentry, NULL, NULL, 0); } /* * evm_xattr_change - check if passed xattr value differs from current value * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @xattr_name: requested xattr * @xattr_value: requested xattr value * @xattr_value_len: requested xattr value length * * Check if passed xattr value differs from current value. * * Returns 1 if passed xattr value differs from current value, 0 otherwise. 
*/ static int evm_xattr_change(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len) { char *xattr_data = NULL; int rc = 0; rc = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, xattr_name, &xattr_data, 0, GFP_NOFS); if (rc < 0) { rc = 1; goto out; } if (rc == xattr_value_len) rc = !!memcmp(xattr_value, xattr_data, rc); else rc = 1; out: kfree(xattr_data); return rc; } /* * evm_protect_xattr - protect the EVM extended attribute * * Prevent security.evm from being modified or removed without the * necessary permissions or when the existing value is invalid. * * The posix xattr acls are 'system' prefixed, which normally would not * affect security.evm. An interesting side affect of writing posix xattr * acls is their modifying of the i_mode, which is included in security.evm. * For posix xattr acls only, permit security.evm, even if it currently * doesn't exist, to be updated unless the EVM signature is immutable. */ static int evm_protect_xattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len) { enum integrity_status evm_status; if (strcmp(xattr_name, XATTR_NAME_EVM) == 0) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (is_unsupported_hmac_fs(dentry)) return -EPERM; } else if (!evm_protected_xattr(xattr_name)) { if (!posix_xattr_acl(xattr_name)) return 0; if (is_unsupported_hmac_fs(dentry)) return 0; evm_status = evm_verify_current_integrity(dentry); if ((evm_status == INTEGRITY_PASS) || (evm_status == INTEGRITY_NOXATTRS)) return 0; goto out; } else if (is_unsupported_hmac_fs(dentry)) return 0; evm_status = evm_verify_current_integrity(dentry); if (evm_status == INTEGRITY_NOXATTRS) { struct evm_iint_cache *iint; /* Exception if the HMAC is not going to be calculated. */ if (evm_hmac_disabled()) return 0; iint = evm_iint_inode(d_backing_inode(dentry)); if (iint && (iint->flags & EVM_NEW_FILE)) return 0; /* exception for pseudo filesystems */ if (dentry->d_sb->s_magic == TMPFS_MAGIC || dentry->d_sb->s_magic == SYSFS_MAGIC) return 0; integrity_audit_msg(AUDIT_INTEGRITY_METADATA, dentry->d_inode, dentry->d_name.name, "update_metadata", integrity_status_msg[evm_status], -EPERM, 0); } out: /* Exception if the HMAC is not going to be calculated. */ if (evm_hmac_disabled() && (evm_status == INTEGRITY_NOLABEL || evm_status == INTEGRITY_UNKNOWN)) return 0; /* * Writing other xattrs is safe for portable signatures, as portable * signatures are immutable and can never be updated. */ if (evm_status == INTEGRITY_FAIL_IMMUTABLE) return 0; if (evm_status == INTEGRITY_PASS_IMMUTABLE && !evm_xattr_change(idmap, dentry, xattr_name, xattr_value, xattr_value_len)) return 0; if (evm_status != INTEGRITY_PASS && evm_status != INTEGRITY_PASS_IMMUTABLE) integrity_audit_msg(AUDIT_INTEGRITY_METADATA, d_backing_inode(dentry), dentry->d_name.name, "appraise_metadata", integrity_status_msg[evm_status], -EPERM, 0); return evm_status == INTEGRITY_PASS ? 0 : -EPERM; } /** * evm_inode_setxattr - protect the EVM extended attribute * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @xattr_name: pointer to the affected extended attribute name * @xattr_value: pointer to the new extended attribute value * @xattr_value_len: pointer to the new extended attribute value length * @flags: flags to pass into filesystem operations * * Before allowing the 'security.evm' protected xattr to be updated, * verify the existing value is valid. 
As only the kernel should have * access to the EVM encrypted key needed to calculate the HMAC, prevent * userspace from writing HMAC value. Writing 'security.evm' requires * requires CAP_SYS_ADMIN privileges. */ static int evm_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len, int flags) { const struct evm_ima_xattr_data *xattr_data = xattr_value; /* Policy permits modification of the protected xattrs even though * there's no HMAC key loaded */ if (evm_initialized & EVM_ALLOW_METADATA_WRITES) return 0; if (strcmp(xattr_name, XATTR_NAME_EVM) == 0) { if (!xattr_value_len) return -EINVAL; if (xattr_data->type != EVM_IMA_XATTR_DIGSIG && xattr_data->type != EVM_XATTR_PORTABLE_DIGSIG) return -EPERM; } return evm_protect_xattr(idmap, dentry, xattr_name, xattr_value, xattr_value_len); } /** * evm_inode_removexattr - protect the EVM extended attribute * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @xattr_name: pointer to the affected extended attribute name * * Removing 'security.evm' requires CAP_SYS_ADMIN privileges and that * the current value is valid. */ static int evm_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name) { /* Policy permits modification of the protected xattrs even though * there's no HMAC key loaded */ if (evm_initialized & EVM_ALLOW_METADATA_WRITES) return 0; return evm_protect_xattr(idmap, dentry, xattr_name, NULL, 0); } #ifdef CONFIG_FS_POSIX_ACL static int evm_inode_set_acl_change(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct posix_acl *kacl) { int rc; umode_t mode; struct inode *inode = d_backing_inode(dentry); if (!kacl) return 1; rc = posix_acl_update_mode(idmap, inode, &mode, &kacl); if (rc || (inode->i_mode != mode)) return 1; return 0; } #else static inline int evm_inode_set_acl_change(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct posix_acl *kacl) { return 0; } #endif /** * evm_inode_set_acl - protect the EVM extended attribute from posix acls * @idmap: idmap of the idmapped mount * @dentry: pointer to the affected dentry * @acl_name: name of the posix acl * @kacl: pointer to the posix acls * * Prevent modifying posix acls causing the EVM HMAC to be re-calculated * and 'security.evm' xattr updated, unless the existing 'security.evm' is * valid. * * Return: zero on success, -EPERM on failure. */ static int evm_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { enum integrity_status evm_status; /* Policy permits modification of the protected xattrs even though * there's no HMAC key loaded */ if (evm_initialized & EVM_ALLOW_METADATA_WRITES) return 0; evm_status = evm_verify_current_integrity(dentry); if ((evm_status == INTEGRITY_PASS) || (evm_status == INTEGRITY_NOXATTRS)) return 0; /* Exception if the HMAC is not going to be calculated. */ if (evm_hmac_disabled() && (evm_status == INTEGRITY_NOLABEL || evm_status == INTEGRITY_UNKNOWN)) return 0; /* * Writing other xattrs is safe for portable signatures, as portable * signatures are immutable and can never be updated. 
*/ if (evm_status == INTEGRITY_FAIL_IMMUTABLE) return 0; if (evm_status == INTEGRITY_PASS_IMMUTABLE && !evm_inode_set_acl_change(idmap, dentry, acl_name, kacl)) return 0; if (evm_status != INTEGRITY_PASS_IMMUTABLE) integrity_audit_msg(AUDIT_INTEGRITY_METADATA, d_backing_inode(dentry), dentry->d_name.name, "appraise_metadata", integrity_status_msg[evm_status], -EPERM, 0); return -EPERM; } /** * evm_inode_remove_acl - Protect the EVM extended attribute from posix acls * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @acl_name: name of the posix acl * * Prevent removing posix acls causing the EVM HMAC to be re-calculated * and 'security.evm' xattr updated, unless the existing 'security.evm' is * valid. * * Return: zero on success, -EPERM on failure. */ static int evm_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return evm_inode_set_acl(idmap, dentry, acl_name, NULL); } static void evm_reset_status(struct inode *inode) { struct evm_iint_cache *iint; iint = evm_iint_inode(inode); if (iint) iint->evm_status = INTEGRITY_UNKNOWN; } /** * evm_metadata_changed: Detect changes to the metadata * @inode: a file's inode * @metadata_inode: metadata inode * * On a stacked filesystem detect whether the metadata has changed. If this is * the case reset the evm_status associated with the inode that represents the * file. */ bool evm_metadata_changed(struct inode *inode, struct inode *metadata_inode) { struct evm_iint_cache *iint = evm_iint_inode(inode); bool ret = false; if (iint) { ret = (!IS_I_VERSION(metadata_inode) || integrity_inode_attrs_changed(&iint->metadata_inode, metadata_inode)); if (ret) iint->evm_status = INTEGRITY_UNKNOWN; } return ret; } /** * evm_revalidate_status - report whether EVM status re-validation is necessary * @xattr_name: pointer to the affected extended attribute name * * Report whether callers of evm_verifyxattr() should re-validate the * EVM status. * * Return true if re-validation is necessary, false otherwise. */ bool evm_revalidate_status(const char *xattr_name) { if (!evm_key_loaded()) return false; /* evm_inode_post_setattr() passes NULL */ if (!xattr_name) return true; if (!evm_protected_xattr(xattr_name) && !posix_xattr_acl(xattr_name) && strcmp(xattr_name, XATTR_NAME_EVM)) return false; return true; } /** * evm_fix_hmac - Calculate the HMAC and add it to security.evm for fix mode * @dentry: pointer to the affected dentry which doesn't yet have security.evm * xattr * @xattr_name: pointer to the affected extended attribute name * @xattr_value: pointer to the new extended attribute value * @xattr_value_len: pointer to the new extended attribute value length * * Expects to be called with i_mutex locked. 
* * Return: 0 on success, -EPERM/-ENOMEM/-EOPNOTSUPP on failure */ int evm_fix_hmac(struct dentry *dentry, const char *xattr_name, const char *xattr_value, size_t xattr_value_len) { if (!evm_fixmode || !evm_revalidate_status((xattr_name))) return -EPERM; if (!(evm_initialized & EVM_INIT_HMAC)) return -EPERM; if (is_unsupported_hmac_fs(dentry)) return -EOPNOTSUPP; return evm_update_evmxattr(dentry, xattr_name, xattr_value, xattr_value_len); } /** * evm_inode_post_setxattr - update 'security.evm' to reflect the changes * @dentry: pointer to the affected dentry * @xattr_name: pointer to the affected extended attribute name * @xattr_value: pointer to the new extended attribute value * @xattr_value_len: pointer to the new extended attribute value length * @flags: flags to pass into filesystem operations * * Update the HMAC stored in 'security.evm' to reflect the change. * * No need to take the i_mutex lock here, as this function is called from * __vfs_setxattr_noperm(). The caller of which has taken the inode's * i_mutex lock. */ static void evm_inode_post_setxattr(struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len, int flags) { if (!evm_revalidate_status(xattr_name)) return; evm_reset_status(dentry->d_inode); if (!strcmp(xattr_name, XATTR_NAME_EVM)) return; if (!(evm_initialized & EVM_INIT_HMAC)) return; if (is_unsupported_hmac_fs(dentry)) return; evm_update_evmxattr(dentry, xattr_name, xattr_value, xattr_value_len); } /** * evm_inode_post_set_acl - Update the EVM extended attribute from posix acls * @dentry: pointer to the affected dentry * @acl_name: name of the posix acl * @kacl: pointer to the posix acls * * Update the 'security.evm' xattr with the EVM HMAC re-calculated after setting * posix acls. */ static void evm_inode_post_set_acl(struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { return evm_inode_post_setxattr(dentry, acl_name, NULL, 0, 0); } /** * evm_inode_post_removexattr - update 'security.evm' after removing the xattr * @dentry: pointer to the affected dentry * @xattr_name: pointer to the affected extended attribute name * * Update the HMAC stored in 'security.evm' to reflect removal of the xattr. * * No need to take the i_mutex lock here, as this function is called from * vfs_removexattr() which takes the i_mutex. */ static void evm_inode_post_removexattr(struct dentry *dentry, const char *xattr_name) { if (!evm_revalidate_status(xattr_name)) return; evm_reset_status(dentry->d_inode); if (!strcmp(xattr_name, XATTR_NAME_EVM)) return; if (!(evm_initialized & EVM_INIT_HMAC)) return; evm_update_evmxattr(dentry, xattr_name, NULL, 0); } /** * evm_inode_post_remove_acl - Update the EVM extended attribute from posix acls * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @acl_name: name of the posix acl * * Update the 'security.evm' xattr with the EVM HMAC re-calculated after * removing posix acls. 
*/ static inline void evm_inode_post_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { evm_inode_post_removexattr(dentry, acl_name); } static int evm_attr_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_backing_inode(dentry); unsigned int ia_valid = attr->ia_valid; if (!i_uid_needs_update(idmap, attr, inode) && !i_gid_needs_update(idmap, attr, inode) && (!(ia_valid & ATTR_MODE) || attr->ia_mode == inode->i_mode)) return 0; return 1; } /** * evm_inode_setattr - prevent updating an invalid EVM extended attribute * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @attr: iattr structure containing the new file attributes * * Permit update of file attributes when files have a valid EVM signature, * except in the case of them having an immutable portable signature. */ static int evm_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; enum integrity_status evm_status; /* Policy permits modification of the protected attrs even though * there's no HMAC key loaded */ if (evm_initialized & EVM_ALLOW_METADATA_WRITES) return 0; if (is_unsupported_hmac_fs(dentry)) return 0; if (!(ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))) return 0; evm_status = evm_verify_current_integrity(dentry); /* * Writing attrs is safe for portable signatures, as portable signatures * are immutable and can never be updated. */ if ((evm_status == INTEGRITY_PASS) || (evm_status == INTEGRITY_NOXATTRS) || (evm_status == INTEGRITY_FAIL_IMMUTABLE) || (evm_hmac_disabled() && (evm_status == INTEGRITY_NOLABEL || evm_status == INTEGRITY_UNKNOWN))) return 0; if (evm_status == INTEGRITY_PASS_IMMUTABLE && !evm_attr_change(idmap, dentry, attr)) return 0; integrity_audit_msg(AUDIT_INTEGRITY_METADATA, d_backing_inode(dentry), dentry->d_name.name, "appraise_metadata", integrity_status_msg[evm_status], -EPERM, 0); return -EPERM; } /** * evm_inode_post_setattr - update 'security.evm' after modifying metadata * @idmap: idmap of the idmapped mount * @dentry: pointer to the affected dentry * @ia_valid: for the UID and GID status * * For now, update the HMAC stored in 'security.evm' to reflect UID/GID * changes. * * This function is called from notify_change(), which expects the caller * to lock the inode's i_mutex. 
*/ static void evm_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int ia_valid) { if (!evm_revalidate_status(NULL)) return; evm_reset_status(dentry->d_inode); if (!(evm_initialized & EVM_INIT_HMAC)) return; if (is_unsupported_hmac_fs(dentry)) return; if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) evm_update_evmxattr(dentry, NULL, NULL, 0); } static int evm_inode_copy_up_xattr(struct dentry *src, const char *name) { struct evm_ima_xattr_data *xattr_data = NULL; int rc; if (strcmp(name, XATTR_NAME_EVM) != 0) return -EOPNOTSUPP; /* first need to know the sig type */ rc = vfs_getxattr_alloc(&nop_mnt_idmap, src, XATTR_NAME_EVM, (char **)&xattr_data, 0, GFP_NOFS); if (rc <= 0) return -EPERM; if (rc < offsetof(struct evm_ima_xattr_data, type) + sizeof(xattr_data->type)) return -EPERM; switch (xattr_data->type) { case EVM_XATTR_PORTABLE_DIGSIG: rc = 0; /* allow copy-up */ break; case EVM_XATTR_HMAC: case EVM_IMA_XATTR_DIGSIG: default: rc = -ECANCELED; /* discard */ } kfree(xattr_data); return rc; } /* * evm_inode_init_security - initializes security.evm HMAC value */ int evm_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, struct xattr *xattrs, int *xattr_count) { struct evm_xattr *xattr_data; struct xattr *xattr, *evm_xattr; bool evm_protected_xattrs = false; int rc; if (!(evm_initialized & EVM_INIT_HMAC) || !xattrs) return 0; /* * security_inode_init_security() makes sure that the xattrs array is * contiguous, there is enough space for security.evm, and that there is * a terminator at the end of the array. */ for (xattr = xattrs; xattr->name; xattr++) { if (evm_protected_xattr(xattr->name)) evm_protected_xattrs = true; } /* EVM xattr not needed. */ if (!evm_protected_xattrs) return 0; evm_xattr = lsm_get_xattr_slot(xattrs, xattr_count); /* * Array terminator (xattr name = NULL) must be the first non-filled * xattr slot. */ WARN_ONCE(evm_xattr != xattr, "%s: xattrs terminator is not the first non-filled slot\n", __func__); xattr_data = kzalloc_obj(*xattr_data, GFP_NOFS); if (!xattr_data) return -ENOMEM; xattr_data->data.type = EVM_XATTR_HMAC; rc = evm_init_hmac(inode, xattrs, xattr_data->digest); if (rc < 0) goto out; evm_xattr->value = xattr_data; evm_xattr->value_len = sizeof(*xattr_data); evm_xattr->name = XATTR_EVM_SUFFIX; return 0; out: kfree(xattr_data); return rc; } EXPORT_SYMBOL_GPL(evm_inode_init_security); static int evm_inode_alloc_security(struct inode *inode) { struct evm_iint_cache *iint = evm_iint_inode(inode); /* Called by security_inode_alloc(), it cannot be NULL. 
*/ iint->flags = 0UL; iint->evm_status = INTEGRITY_UNKNOWN; return 0; } static void evm_file_release(struct file *file) { struct inode *inode = file_inode(file); struct evm_iint_cache *iint = evm_iint_inode(inode); fmode_t mode = file->f_mode; if (!S_ISREG(inode->i_mode) || !(mode & FMODE_WRITE)) return; if (iint && iint->flags & EVM_NEW_FILE && atomic_read(&inode->i_writecount) == 1) iint->flags &= ~EVM_NEW_FILE; } static void evm_post_path_mknod(struct mnt_idmap *idmap, struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); struct evm_iint_cache *iint = evm_iint_inode(inode); if (!S_ISREG(inode->i_mode)) return; if (iint) iint->flags |= EVM_NEW_FILE; } #ifdef CONFIG_EVM_LOAD_X509 void __init evm_load_x509(void) { int rc; rc = integrity_load_x509(INTEGRITY_KEYRING_EVM, CONFIG_EVM_X509_PATH); if (!rc) evm_initialized |= EVM_INIT_X509; } #endif static int __init init_evm(void) { int error; struct list_head *pos, *q; evm_init_config(); evm_set_fixmode(); error = integrity_init_keyring(INTEGRITY_KEYRING_EVM); if (error) goto error; error = evm_init_secfs(); if (error < 0) { pr_info("Error registering secfs\n"); goto error; } error: if (error != 0) { if (!list_empty(&evm_config_xattrnames)) { list_for_each_safe(pos, q, &evm_config_xattrnames) list_del(pos); } } return error; } static struct security_hook_list evm_hooks[] __ro_after_init = { LSM_HOOK_INIT(inode_setattr, evm_inode_setattr), LSM_HOOK_INIT(inode_post_setattr, evm_inode_post_setattr), LSM_HOOK_INIT(inode_copy_up_xattr, evm_inode_copy_up_xattr), LSM_HOOK_INIT(inode_setxattr, evm_inode_setxattr), LSM_HOOK_INIT(inode_post_setxattr, evm_inode_post_setxattr), LSM_HOOK_INIT(inode_set_acl, evm_inode_set_acl), LSM_HOOK_INIT(inode_post_set_acl, evm_inode_post_set_acl), LSM_HOOK_INIT(inode_remove_acl, evm_inode_remove_acl), LSM_HOOK_INIT(inode_post_remove_acl, evm_inode_post_remove_acl), LSM_HOOK_INIT(inode_removexattr, evm_inode_removexattr), LSM_HOOK_INIT(inode_post_removexattr, evm_inode_post_removexattr), LSM_HOOK_INIT(inode_init_security, evm_inode_init_security), LSM_HOOK_INIT(inode_alloc_security, evm_inode_alloc_security), LSM_HOOK_INIT(file_release, evm_file_release), LSM_HOOK_INIT(path_post_mknod, evm_post_path_mknod), }; static const struct lsm_id evm_lsmid = { .name = "evm", .id = LSM_ID_EVM, }; static int __init init_evm_lsm(void) { security_add_hooks(evm_hooks, ARRAY_SIZE(evm_hooks), &evm_lsmid); return 0; } struct lsm_blob_sizes evm_blob_sizes __ro_after_init = { .lbs_inode = sizeof(struct evm_iint_cache), .lbs_xattr_count = 1, }; DEFINE_LSM(evm) = { .id = &evm_lsmid, .init = init_evm_lsm, .order = LSM_ORDER_LAST, .blobs = &evm_blob_sizes, .initcall_late = init_evm, }; |
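The setxattr/removexattr hooks above gate userspace writes to the 'security.evm' xattr on its leading type byte. The following is a minimal, hedged userspace sketch (not part of evm_main.c) for inspecting that xattr on an existing file; it uses only the standard getxattr(2) call, and the reading of buf[0] as the type field mirrors the evm_ima_xattr_data layout checked by evm_inode_setxattr() above.

/*
 * Hedged userspace sketch: dump the type byte and payload size of the
 * 'security.evm' extended attribute for a given file.
 */
#include <stdio.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(int argc, char *argv[])
{
	unsigned char buf[1024];
	ssize_t len;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}

	len = getxattr(argv[1], "security.evm", buf, sizeof(buf));
	if (len <= 0) {
		perror("getxattr(security.evm)");
		return 1;
	}

	/* buf[0] is the type field; the remainder is the HMAC or signature. */
	printf("type=%u, payload=%zd bytes\n", (unsigned int)buf[0], len - 1);
	return 0;
}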
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CGROUP_NAMESPACE_H
#define _LINUX_CGROUP_NAMESPACE_H

#include <linux/ns_common.h>

struct cgroup_namespace {
	struct ns_common	ns;
	struct user_namespace	*user_ns;
	struct ucounts		*ucounts;
	struct css_set		*root_cset;
};

extern struct cgroup_namespace init_cgroup_ns;

#ifdef CONFIG_CGROUPS

static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
{
	return container_of(ns, struct cgroup_namespace, ns);
}

void free_cgroup_ns(struct cgroup_namespace *ns);

struct cgroup_namespace *copy_cgroup_ns(u64 flags,
					struct user_namespace *user_ns,
					struct cgroup_namespace *old_ns);

int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
		   struct cgroup_namespace *ns);

static inline void get_cgroup_ns(struct cgroup_namespace *ns)
{
	ns_ref_inc(ns);
}

static inline void put_cgroup_ns(struct cgroup_namespace *ns)
{
	if (ns_ref_put(ns))
		free_cgroup_ns(ns);
}

#else /* !CONFIG_CGROUPS */

static inline void free_cgroup_ns(struct cgroup_namespace *ns) { }

static inline struct cgroup_namespace *
copy_cgroup_ns(u64 flags, struct user_namespace *user_ns,
	       struct cgroup_namespace *old_ns)
{
	return old_ns;
}

static inline void get_cgroup_ns(struct cgroup_namespace *ns) { }
static inline void put_cgroup_ns(struct cgroup_namespace *ns) { }

#endif /* !CONFIG_CGROUPS */

#endif /* _LINUX_CGROUP_NAMESPACE_H */
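The get_cgroup_ns()/put_cgroup_ns() helpers above are plain reference counting on the namespace object. A hedged, hypothetical kernel-side sketch of the usual caller pattern follows; the foo_ctx structure, function names, and the include path are illustrative assumptions, not taken from this header.

/*
 * Hedged kernel-side sketch (hypothetical caller): pin the namespace while
 * a copy of the pointer is stored, and drop the reference on teardown,
 * letting put_cgroup_ns() free it when the last reference goes away.
 */
#include <linux/cgroup_namespace.h>	/* assumed include path for this header */

struct foo_ctx {
	struct cgroup_namespace *cgroup_ns;
};

static void foo_ctx_init(struct foo_ctx *ctx, struct cgroup_namespace *ns)
{
	get_cgroup_ns(ns);		/* hold a reference for ctx's lifetime */
	ctx->cgroup_ns = ns;
}

static void foo_ctx_release(struct foo_ctx *ctx)
{
	put_cgroup_ns(ctx->cgroup_ns);	/* may end up in free_cgroup_ns() */
	ctx->cgroup_ns = NULL;
}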
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_DMA_MAPPING_H #define _LINUX_DMA_MAPPING_H #include <linux/device.h> #include <linux/err.h> #include <linux/dma-direction.h> #include <linux/scatterlist.h> #include <linux/bug.h> #include <linux/cache.h> /* * List of possible attributes associated with a DMA mapping. The semantics * of each attribute should be defined in Documentation/core-api/dma-attributes.rst. 
*/ /* * DMA_ATTR_WEAK_ORDERING: Specifies that reads and writes to the mapping * may be weakly ordered, that is that reads and writes may pass each other. */ #define DMA_ATTR_WEAK_ORDERING (1UL << 1) /* * DMA_ATTR_WRITE_COMBINE: Specifies that writes to the mapping may be * buffered to improve performance. */ #define DMA_ATTR_WRITE_COMBINE (1UL << 2) /* * DMA_ATTR_NO_KERNEL_MAPPING: Lets the platform to avoid creating a kernel * virtual mapping for the allocated buffer. */ #define DMA_ATTR_NO_KERNEL_MAPPING (1UL << 4) /* * DMA_ATTR_SKIP_CPU_SYNC: Allows platform code to skip synchronization of * the CPU cache for the given buffer assuming that it has been already * transferred to 'device' domain. */ #define DMA_ATTR_SKIP_CPU_SYNC (1UL << 5) /* * DMA_ATTR_FORCE_CONTIGUOUS: Forces contiguous allocation of the buffer * in physical memory. */ #define DMA_ATTR_FORCE_CONTIGUOUS (1UL << 6) /* * DMA_ATTR_ALLOC_SINGLE_PAGES: This is a hint to the DMA-mapping subsystem * that it's probably not worth the time to try to allocate memory to in a way * that gives better TLB efficiency. */ #define DMA_ATTR_ALLOC_SINGLE_PAGES (1UL << 7) /* * DMA_ATTR_NO_WARN: This tells the DMA-mapping subsystem to suppress * allocation failure reports (similarly to __GFP_NOWARN). */ #define DMA_ATTR_NO_WARN (1UL << 8) /* * DMA_ATTR_PRIVILEGED: used to indicate that the buffer is fully * accessible at an elevated privilege level (and ideally inaccessible or * at least read-only at lesser-privileged levels). */ #define DMA_ATTR_PRIVILEGED (1UL << 9) /* * DMA_ATTR_MMIO - Indicates memory-mapped I/O (MMIO) region for DMA mapping * * This attribute indicates the physical address is not normal system * memory. It may not be used with kmap*()/phys_to_virt()/phys_to_page() * functions, it may not be cacheable, and access using CPU load/store * instructions may not be allowed. * * Usually this will be used to describe MMIO addresses, or other non-cacheable * register addresses. When DMA mapping this sort of address we call * the operation Peer to Peer as a one device is DMA'ing to another device. * For PCI devices the p2pdma APIs must be used to determine if DMA_ATTR_MMIO * is appropriate. * * For architectures that require cache flushing for DMA coherence * DMA_ATTR_MMIO will not perform any cache flushing. The address * provided must never be mapped cacheable into the CPU. */ #define DMA_ATTR_MMIO (1UL << 10) /* * DMA_ATTR_DEBUGGING_IGNORE_CACHELINES: Indicates the CPU cache line can be * overlapped. All mappings sharing a cacheline must have this attribute for * this to be considered safe. */ #define DMA_ATTR_DEBUGGING_IGNORE_CACHELINES (1UL << 11) /* * DMA_ATTR_REQUIRE_COHERENT: Indicates that DMA coherency is required. * All mappings that carry this attribute can't work with SWIOTLB and cache * flushing. */ #define DMA_ATTR_REQUIRE_COHERENT (1UL << 12) /* * DMA_ATTR_CC_SHARED: Indicates the DMA mapping is shared (decrypted) for * confidential computing guests. For normal system memory the caller must have * called set_memory_decrypted(), and pgprot_decrypted must be used when * creating CPU PTEs for the mapping. The same shared semantic may be passed * to the vIOMMU when it sets up the IOPTE. For MMIO use together with * DMA_ATTR_MMIO to indicate shared MMIO. Unless DMA_ATTR_MMIO is provided * a struct page is required. */ #define DMA_ATTR_CC_SHARED (1UL << 13) /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can * be given to a device to use as a DMA source or target. 
It is specific to a * given device and there may be a translation between the CPU physical address * space and the bus address space. * * DMA_MAPPING_ERROR is the magic error code if a mapping failed. It should not * be used directly in drivers, but checked for using dma_mapping_error() * instead. */ #define DMA_MAPPING_ERROR (~(dma_addr_t)0) #define DMA_BIT_MASK(n) GENMASK_ULL((n) - 1, 0) struct dma_iova_state { dma_addr_t addr; u64 __size; }; /* * Use the high bit to mark if we used swiotlb for one or more ranges. */ #define DMA_IOVA_USE_SWIOTLB (1ULL << 63) static inline size_t dma_iova_size(struct dma_iova_state *state) { /* Casting is needed for 32-bits systems */ return (size_t)(state->__size & ~DMA_IOVA_USE_SWIOTLB); } #ifdef CONFIG_DMA_API_DEBUG void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr); void debug_dma_map_single(struct device *dev, const void *addr, unsigned long len); #else static inline void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { } static inline void debug_dma_map_single(struct device *dev, const void *addr, unsigned long len) { } #endif /* CONFIG_DMA_API_DEBUG */ #ifdef CONFIG_HAS_DMA static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { debug_dma_mapping_error(dev, dma_addr); if (unlikely(dma_addr == DMA_MAPPING_ERROR)) return -ENOMEM; return 0; } dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, size_t offset, size_t size, enum dma_data_direction dir, unsigned long attrs); void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs); dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, enum dma_data_direction dir, unsigned long attrs); void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs); unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); int dma_map_sgtable(struct device *dev, struct sg_table *sgt, enum dma_data_direction dir, unsigned long attrs); dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs); void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs); void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs); void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_handle, unsigned long attrs); void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); void dmam_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle); int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); bool dma_can_mmap(struct device *dev); bool dma_pci_p2pdma_supported(struct device *dev); int dma_set_mask(struct device *dev, u64 mask); int dma_set_coherent_mask(struct device *dev, u64 mask); u64 dma_get_required_mask(struct device *dev); bool dma_addressing_limited(struct device *dev); size_t 
dma_max_mapping_size(struct device *dev); size_t dma_opt_mapping_size(struct device *dev); unsigned long dma_get_merge_boundary(struct device *dev); struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, enum dma_data_direction dir, gfp_t gfp, unsigned long attrs); void dma_free_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt, enum dma_data_direction dir); void *dma_vmap_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt); void dma_vunmap_noncontiguous(struct device *dev, void *vaddr); int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, size_t size, struct sg_table *sgt); #else /* CONFIG_HAS_DMA */ static inline dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, size_t offset, size_t size, enum dma_data_direction dir, unsigned long attrs) { return DMA_MAPPING_ERROR; } static inline void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { } static inline dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, enum dma_data_direction dir, unsigned long attrs) { return DMA_MAPPING_ERROR; } static inline void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { } static inline unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { return 0; } static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { } static inline int dma_map_sgtable(struct device *dev, struct sg_table *sgt, enum dma_data_direction dir, unsigned long attrs) { return -EOPNOTSUPP; } static inline dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { return DMA_MAPPING_ERROR; } static inline void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { } static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { return -ENOMEM; } static inline void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) { return NULL; } static inline void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_handle, unsigned long attrs) { } static inline void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { return NULL; } static inline void dmam_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle) { } static inline int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { return -ENXIO; } static inline int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { return -ENXIO; } static inline bool dma_can_mmap(struct device *dev) { return false; } static inline bool dma_pci_p2pdma_supported(struct device *dev) { return false; } static inline int dma_set_mask(struct device *dev, u64 mask) { return -EIO; } static inline int dma_set_coherent_mask(struct device *dev, u64 mask) { return -EIO; } static inline u64 dma_get_required_mask(struct device *dev) { return 0; } static inline bool dma_addressing_limited(struct device *dev) { return false; } static 
inline size_t dma_max_mapping_size(struct device *dev) { return 0; } static inline size_t dma_opt_mapping_size(struct device *dev) { return 0; } static inline unsigned long dma_get_merge_boundary(struct device *dev) { return 0; } static inline struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, enum dma_data_direction dir, gfp_t gfp, unsigned long attrs) { return NULL; } static inline void dma_free_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt, enum dma_data_direction dir) { } static inline void *dma_vmap_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt) { return NULL; } static inline void dma_vunmap_noncontiguous(struct device *dev, void *vaddr) { } static inline int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, size_t size, struct sg_table *sgt) { return -EINVAL; } #endif /* CONFIG_HAS_DMA */ #ifdef CONFIG_IOMMU_DMA /** * dma_use_iova - check if the IOVA API is used for this state * @state: IOVA state * * Return %true if the DMA transfers uses the dma_iova_*() calls or %false if * they can't be used. */ static inline bool dma_use_iova(struct dma_iova_state *state) { return state->__size != 0; } bool dma_iova_try_alloc(struct device *dev, struct dma_iova_state *state, phys_addr_t phys, size_t size); void dma_iova_free(struct device *dev, struct dma_iova_state *state); void dma_iova_destroy(struct device *dev, struct dma_iova_state *state, size_t mapped_len, enum dma_data_direction dir, unsigned long attrs); int dma_iova_sync(struct device *dev, struct dma_iova_state *state, size_t offset, size_t size); int dma_iova_link(struct device *dev, struct dma_iova_state *state, phys_addr_t phys, size_t offset, size_t size, enum dma_data_direction dir, unsigned long attrs); void dma_iova_unlink(struct device *dev, struct dma_iova_state *state, size_t offset, size_t size, enum dma_data_direction dir, unsigned long attrs); #else /* CONFIG_IOMMU_DMA */ static inline bool dma_use_iova(struct dma_iova_state *state) { return false; } static inline bool dma_iova_try_alloc(struct device *dev, struct dma_iova_state *state, phys_addr_t phys, size_t size) { return false; } static inline void dma_iova_free(struct device *dev, struct dma_iova_state *state) { } static inline void dma_iova_destroy(struct device *dev, struct dma_iova_state *state, size_t mapped_len, enum dma_data_direction dir, unsigned long attrs) { } static inline int dma_iova_sync(struct device *dev, struct dma_iova_state *state, size_t offset, size_t size) { return -EOPNOTSUPP; } static inline int dma_iova_link(struct device *dev, struct dma_iova_state *state, phys_addr_t phys, size_t offset, size_t size, enum dma_data_direction dir, unsigned long attrs) { return -EOPNOTSUPP; } static inline void dma_iova_unlink(struct device *dev, struct dma_iova_state *state, size_t offset, size_t size, enum dma_data_direction dir, unsigned long attrs) { } #endif /* CONFIG_IOMMU_DMA */ #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir); void __dma_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir); void __dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir); void __dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir); bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr); static inline bool 
dma_dev_need_sync(const struct device *dev) { /* Always call DMA sync operations when debugging is enabled */ return !dev->dma_skip_sync || IS_ENABLED(CONFIG_DMA_API_DEBUG); } static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { if (dma_dev_need_sync(dev)) __dma_sync_single_for_cpu(dev, addr, size, dir); } static inline void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { if (dma_dev_need_sync(dev)) __dma_sync_single_for_device(dev, addr, size, dir); } static inline void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir) { if (dma_dev_need_sync(dev)) __dma_sync_sg_for_cpu(dev, sg, nelems, dir); } static inline void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir) { if (dma_dev_need_sync(dev)) __dma_sync_sg_for_device(dev, sg, nelems, dir); } static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) { return dma_dev_need_sync(dev) ? __dma_need_sync(dev, dma_addr) : false; } bool dma_need_unmap(struct device *dev); #else /* !CONFIG_HAS_DMA || !CONFIG_DMA_NEED_SYNC */ static inline bool dma_dev_need_sync(const struct device *dev) { return false; } static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { } static inline void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { } static inline void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir) { } static inline void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir) { } static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) { return false; } static inline bool dma_need_unmap(struct device *dev) { return false; } #endif /* !CONFIG_HAS_DMA || !CONFIG_DMA_NEED_SYNC */ struct page *dma_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp); void dma_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_handle, enum dma_data_direction dir); int dma_mmap_pages(struct device *dev, struct vm_area_struct *vma, size_t size, struct page *page); static inline void *dma_alloc_noncoherent(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) { struct page *page = dma_alloc_pages(dev, size, dma_handle, dir, gfp); return page ? page_address(page) : NULL; } static inline void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, enum dma_data_direction dir) { dma_free_pages(dev, size, virt_to_page(vaddr), dma_handle, dir); } static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr, size_t size, enum dma_data_direction dir, unsigned long attrs) { /* DMA must never operate on areas that might be remapped. 
*/ if (dev_WARN_ONCE(dev, is_vmalloc_addr(ptr), "rejecting DMA map of vmalloc memory\n")) return DMA_MAPPING_ERROR; debug_dma_map_single(dev, ptr, size); return dma_map_page_attrs(dev, virt_to_page(ptr), offset_in_page(ptr), size, dir, attrs); } static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { return dma_unmap_page_attrs(dev, addr, size, dir, attrs); } static inline void dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir) { return dma_sync_single_for_cpu(dev, addr + offset, size, dir); } static inline void dma_sync_single_range_for_device(struct device *dev, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir) { return dma_sync_single_for_device(dev, addr + offset, size, dir); } /** * dma_unmap_sgtable - Unmap the given buffer for DMA * @dev: The device for which to perform the DMA operation * @sgt: The sg_table object describing the buffer * @dir: DMA direction * @attrs: Optional DMA attributes for the unmap operation * * Unmaps a buffer described by a scatterlist stored in the given sg_table * object for the @dir DMA operation by the @dev device. After this function * the ownership of the buffer is transferred back to the CPU domain. */ static inline void dma_unmap_sgtable(struct device *dev, struct sg_table *sgt, enum dma_data_direction dir, unsigned long attrs) { dma_unmap_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs); } /** * dma_sync_sgtable_for_cpu - Synchronize the given buffer for CPU access * @dev: The device for which to perform the DMA operation * @sgt: The sg_table object describing the buffer * @dir: DMA direction * * Performs the needed cache synchronization and moves the ownership of the * buffer back to the CPU domain, so it is safe to perform any access to it * by the CPU. Before doing any further DMA operations, one has to transfer * the ownership of the buffer back to the DMA domain by calling the * dma_sync_sgtable_for_device(). */ static inline void dma_sync_sgtable_for_cpu(struct device *dev, struct sg_table *sgt, enum dma_data_direction dir) { dma_sync_sg_for_cpu(dev, sgt->sgl, sgt->orig_nents, dir); } /** * dma_sync_sgtable_for_device - Synchronize the given buffer for DMA * @dev: The device for which to perform the DMA operation * @sgt: The sg_table object describing the buffer * @dir: DMA direction * * Performs the needed cache synchronization and moves the ownership of the * buffer back to the DMA domain, so it is safe to perform the DMA operation. * Once finished, one has to call dma_sync_sgtable_for_cpu() or * dma_unmap_sgtable(). 
*/ static inline void dma_sync_sgtable_for_device(struct device *dev, struct sg_table *sgt, enum dma_data_direction dir) { dma_sync_sg_for_device(dev, sgt->sgl, sgt->orig_nents, dir); } #define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, 0) #define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, 0) #define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, 0) #define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, 0) #define dma_map_page(d, p, o, s, r) dma_map_page_attrs(d, p, o, s, r, 0) #define dma_unmap_page(d, a, s, r) dma_unmap_page_attrs(d, a, s, r, 0) #define dma_get_sgtable(d, t, v, h, s) dma_get_sgtable_attrs(d, t, v, h, s, 0) #define dma_mmap_coherent(d, v, c, h, s) dma_mmap_attrs(d, v, c, h, s, 0) bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size); static inline void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { return dma_alloc_attrs(dev, size, dma_handle, gfp, (gfp & __GFP_NOWARN) ? DMA_ATTR_NO_WARN : 0); } static inline void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_handle) { return dma_free_attrs(dev, size, cpu_addr, dma_handle, 0); } static inline u64 dma_get_mask(struct device *dev) { if (dev->dma_mask && *dev->dma_mask) return *dev->dma_mask; return DMA_BIT_MASK(32); } /* * Set both the DMA mask and the coherent DMA mask to the same thing. * Note that we don't check the return value from dma_set_coherent_mask() * as the DMA API guarantees that the coherent DMA mask can be set to * the same or smaller than the streaming DMA mask. */ static inline int dma_set_mask_and_coherent(struct device *dev, u64 mask) { int rc = dma_set_mask(dev, mask); if (rc == 0) dma_set_coherent_mask(dev, mask); return rc; } /* * Similar to the above, except it deals with the case where the device * does not have dev->dma_mask appropriately setup. */ static inline int dma_coerce_mask_and_coherent(struct device *dev, u64 mask) { dev->dma_mask = &dev->coherent_dma_mask; return dma_set_mask_and_coherent(dev, mask); } static inline unsigned int dma_get_max_seg_size(struct device *dev) { if (dev->dma_parms && dev->dma_parms->max_segment_size) return dev->dma_parms->max_segment_size; return SZ_64K; } static inline void dma_set_max_seg_size(struct device *dev, unsigned int size) { if (WARN_ON_ONCE(!dev->dma_parms)) return; dev->dma_parms->max_segment_size = size; } static inline unsigned long dma_get_seg_boundary(struct device *dev) { if (dev->dma_parms && dev->dma_parms->segment_boundary_mask) return dev->dma_parms->segment_boundary_mask; return ULONG_MAX; } /** * dma_get_seg_boundary_nr_pages - return the segment boundary in "page" units * @dev: device to query the boundary for * @page_shift: ilog2() of the IOMMU page size * * Return the segment boundary in IOMMU page units (which may be different from * the CPU page size) for the passed in device. * * If @dev is NULL a boundary of U32_MAX is assumed, this case is just for * non-DMA API callers.
*/ static inline unsigned long dma_get_seg_boundary_nr_pages(struct device *dev, unsigned int page_shift) { if (!dev) return (U32_MAX >> page_shift) + 1; return (dma_get_seg_boundary(dev) >> page_shift) + 1; } static inline void dma_set_seg_boundary(struct device *dev, unsigned long mask) { if (WARN_ON_ONCE(!dev->dma_parms)) return; dev->dma_parms->segment_boundary_mask = mask; } static inline unsigned int dma_get_min_align_mask(struct device *dev) { if (dev->dma_parms) return dev->dma_parms->min_align_mask; return 0; } static inline void dma_set_min_align_mask(struct device *dev, unsigned int min_align_mask) { if (WARN_ON_ONCE(!dev->dma_parms)) return; dev->dma_parms->min_align_mask = min_align_mask; } #ifndef dma_get_cache_alignment static inline int dma_get_cache_alignment(void) { #ifdef ARCH_HAS_DMA_MINALIGN return ARCH_DMA_MINALIGN; #endif return 1; } #endif #ifdef ARCH_HAS_DMA_MINALIGN #define ____dma_from_device_aligned __aligned(ARCH_DMA_MINALIGN) #else #define ____dma_from_device_aligned #endif /* Mark start of DMA buffer */ #define __dma_from_device_group_begin(GROUP) \ __cacheline_group_begin(GROUP) ____dma_from_device_aligned /* Mark end of DMA buffer */ #define __dma_from_device_group_end(GROUP) \ __cacheline_group_end(GROUP) ____dma_from_device_aligned static inline void *dmam_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { return dmam_alloc_attrs(dev, size, dma_handle, gfp, (gfp & __GFP_NOWARN) ? DMA_ATTR_NO_WARN : 0); } static inline void *dma_alloc_wc(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t gfp) { unsigned long attrs = DMA_ATTR_WRITE_COMBINE; if (gfp & __GFP_NOWARN) attrs |= DMA_ATTR_NO_WARN; return dma_alloc_attrs(dev, size, dma_addr, gfp, attrs); } static inline void dma_free_wc(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr) { return dma_free_attrs(dev, size, cpu_addr, dma_addr, DMA_ATTR_WRITE_COMBINE); } static inline int dma_mmap_wc(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size) { return dma_mmap_attrs(dev, vma, cpu_addr, dma_addr, size, DMA_ATTR_WRITE_COMBINE); } #ifdef CONFIG_NEED_DMA_MAP_STATE #define DEFINE_DMA_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME #define DEFINE_DMA_UNMAP_LEN(LEN_NAME) __u32 LEN_NAME #define dma_unmap_addr(PTR, ADDR_NAME) ((PTR)->ADDR_NAME) #define dma_unmap_addr_set(PTR, ADDR_NAME, VAL) (((PTR)->ADDR_NAME) = (VAL)) #define dma_unmap_len(PTR, LEN_NAME) ((PTR)->LEN_NAME) #define dma_unmap_len_set(PTR, LEN_NAME, VAL) (((PTR)->LEN_NAME) = (VAL)) #else #define DEFINE_DMA_UNMAP_ADDR(ADDR_NAME) #define DEFINE_DMA_UNMAP_LEN(LEN_NAME) #define dma_unmap_addr(PTR, ADDR_NAME) \ ({ typeof(PTR) __p __maybe_unused = PTR; 0; }) #define dma_unmap_addr_set(PTR, ADDR_NAME, VAL) \ do { typeof(PTR) __p __maybe_unused = PTR; } while (0) #define dma_unmap_len(PTR, LEN_NAME) \ ({ typeof(PTR) __p __maybe_unused = PTR; 0; }) #define dma_unmap_len_set(PTR, LEN_NAME, VAL) \ do { typeof(PTR) __p __maybe_unused = PTR; } while (0) #endif #endif /* _LINUX_DMA_MAPPING_H */ |
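/*
 * Illustrative sketch (not part of the header above): the typical streaming
 * DMA pattern built from the APIs declared in this file. The device pointer,
 * buffer and length below are hypothetical placeholders supplied by a driver.
 */
#include <linux/dma-mapping.h>

static int example_dma_tx(struct device *dev, void *buf, size_t len)
{
	dma_addr_t dma;

	/* Declare the device's addressing capability before mapping. */
	if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32)))
		return -EIO;

	/* Map the buffer for device reads (CPU -> device). */
	dma = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
	if (dma_mapping_error(dev, dma))
		return -ENOMEM;

	/*
	 * ... hand 'dma' to the hardware and wait for the transfer to
	 * complete before touching the buffer again ...
	 */

	/* Give ownership of the buffer back to the CPU. */
	dma_unmap_single(dev, dma, len, DMA_TO_DEVICE);
	return 0;
}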
903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/export.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> #include <linux/highuid.h> #include <linux/cred.h> #include <linux/securebits.h> #include <linux/security.h> #include <linux/keyctl.h> #include <linux/key-type.h> #include <keys/user-type.h> #include <linux/seq_file.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/ctype.h> #include <linux/projid.h> #include <linux/fs_struct.h> #include <linux/bsearch.h> #include <linux/sort.h> #include <linux/nstree.h> static struct kmem_cache *user_ns_cachep __ro_after_init; static DEFINE_MUTEX(userns_state_mutex); static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); static void free_user_ns(struct work_struct *work); static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid) { return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES); } static void 
dec_user_namespaces(struct ucounts *ucounts) { return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES); } static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) { /* Start with the same capabilities as init but useless for doing * anything as the capabilities are bound to the new user namespace. */ cred->securebits = SECUREBITS_DEFAULT; cred->cap_inheritable = CAP_EMPTY_SET; cred->cap_permitted = CAP_FULL_SET; cred->cap_effective = CAP_FULL_SET; cred->cap_ambient = CAP_EMPTY_SET; cred->cap_bset = CAP_FULL_SET; #ifdef CONFIG_KEYS key_put(cred->request_key_auth); cred->request_key_auth = NULL; #endif /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ cred->user_ns = user_ns; } static unsigned long enforced_nproc_rlimit(void) { unsigned long limit = RLIM_INFINITY; /* Is RLIMIT_NPROC currently enforced? */ if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) || (current_user_ns() != &init_user_ns)) limit = rlimit(RLIMIT_NPROC); return limit; } /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the * new namespace. * * This is called by copy_creds(), which will finish setting the target task's * credentials. */ int create_user_ns(struct cred *new) { struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; struct ucounts *ucounts; int ret, i; ret = -ENOSPC; if (parent_ns->level > 32) goto fail; ucounts = inc_user_namespaces(parent_ns, owner); if (!ucounts) goto fail; /* * Verify that we can not violate the policy of which files * may be accessed that is specified by the root directory, * by verifying that the root directory is at the root of the * mount namespace which allows all files to be accessed. */ ret = -EPERM; if (current_chrooted()) goto fail_dec; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. */ ret = -EPERM; if (!kuid_has_mapping(parent_ns, owner) || !kgid_has_mapping(parent_ns, group)) goto fail_dec; ret = security_create_user_ns(new); if (ret < 0) goto fail_dec; ret = -ENOMEM; ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); if (!ns) goto fail_dec; ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); ret = ns_common_init(ns); if (ret) goto fail_free; /* Leave the new->user_ns reference with the new user namespace. 
*/ ns->parent = parent_ns; ns->level = parent_ns->level + 1; ns->owner = owner; ns->group = group; INIT_WORK(&ns->work, free_user_ns); for (i = 0; i < UCOUNT_COUNTS; i++) { ns->ucount_max[i] = INT_MAX; } set_userns_rlimit_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit()); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ mutex_lock(&userns_state_mutex); ns->flags = parent_ns->flags; mutex_unlock(&userns_state_mutex); #ifdef CONFIG_KEYS INIT_LIST_HEAD(&ns->keyring_name_list); init_rwsem(&ns->keyring_sem); #endif ret = -ENOMEM; if (!setup_userns_sysctls(ns)) goto fail_keyring; set_cred_user_ns(new, ns); ns_tree_add(ns); return 0; fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif ns_common_free(ns); fail_free: kmem_cache_free(user_ns_cachep, ns); fail_dec: dec_user_namespaces(ucounts); fail: return ret; } int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { struct cred *cred; int err = -ENOMEM; if (!(unshare_flags & CLONE_NEWUSER)) return 0; cred = prepare_creds(); if (cred) { err = create_user_ns(cred); if (err) put_cred(cred); else *new_cred = cred; } return err; } static void free_user_ns(struct work_struct *work) { struct user_namespace *parent, *ns = container_of(work, struct user_namespace, work); do { struct ucounts *ucounts = ns->ucounts; parent = ns->parent; ns_tree_remove(ns); if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->gid_map.forward); kfree(ns->gid_map.reverse); } if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->uid_map.forward); kfree(ns->uid_map.reverse); } if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->projid_map.forward); kfree(ns->projid_map.reverse); } #if IS_ENABLED(CONFIG_BINFMT_MISC) kfree(ns->binfmt_misc); #endif retire_userns_sysctls(ns); key_free_user_ns(ns); ns_common_free(ns); /* Concurrent nstree traversal depends on a grace period. */ kfree_rcu(ns, ns.ns_rcu); dec_user_namespaces(ucounts); ns = parent; } while (ns_ref_put(parent)); } void __put_user_ns(struct user_namespace *ns) { schedule_work(&ns->work); } EXPORT_SYMBOL(__put_user_ns); /* * struct idmap_key - holds the information necessary to find an idmapping in a * sorted idmap array. It is passed to cmp_map_id() as first argument. */ struct idmap_key { bool map_up; /* true -> id from kid; false -> kid from id */ u32 id; /* id to find */ u32 count; }; /* * cmp_map_id - Function to be passed to bsearch() to find the requested * idmapping. Expects struct idmap_key to be passed via @k. */ static int cmp_map_id(const void *k, const void *e) { u32 first, last, id2; const struct idmap_key *key = k; const struct uid_gid_extent *el = e; id2 = key->id + key->count - 1; /* handle map_id_{down,up}() */ if (key->map_up) first = el->lower_first; else first = el->first; last = first + el->count - 1; if (key->id >= first && key->id <= last && (id2 >= first && id2 <= last)) return 0; if (key->id < first || id2 < first) return -1; return 1; } /* * map_id_range_down_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. 
*/ static struct uid_gid_extent * map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { struct idmap_key key; key.map_up = false; key.count = count; key.id = id; return bsearch(&key, map->forward, extents, sizeof(struct uid_gid_extent), cmp_map_id); } /* * map_id_range_down_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { unsigned idx; u32 first, last, id2; id2 = id + count - 1; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last && (id2 >= first && id2 <= last)) return &map->extent[idx]; } return NULL; } static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_range_down_base(extents, map, id, count); else extent = map_id_range_down_max(extents, map, id, count); /* Map the id or note failure */ if (extent) id = (id - extent->first) + extent->lower_first; else id = (u32) -1; return id; } u32 map_id_down(struct uid_gid_map *map, u32 id) { return map_id_range_down(map, id, 1); } /* * map_id_up_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_up_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { unsigned idx; u32 first, last, id2; id2 = id + count - 1; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last && (id2 >= first && id2 <= last)) return &map->extent[idx]; } return NULL; } /* * map_id_up_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_up_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { struct idmap_key key; key.map_up = true; key.count = count; key.id = id; return bsearch(&key, map->reverse, extents, sizeof(struct uid_gid_extent), cmp_map_id); } u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_range_up_base(extents, map, id, count); else extent = map_id_range_up_max(extents, map, id, count); /* Map the id or note failure */ if (extent) id = (id - extent->lower_first) + extent->first; else id = (u32) -1; return id; } u32 map_id_up(struct uid_gid_map *map, u32 id) { return map_id_range_up(map, id, 1); } /** * make_kuid - Map a user-namespace uid pair into a kuid. * @ns: User namespace that the uid is in * @uid: User identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. * * When there is no mapping defined for the user-namespace uid * pair INVALID_UID is returned. Callers are expected to test * for and handle INVALID_UID being returned. INVALID_UID * may be tested for using uid_valid(). 
*/ kuid_t make_kuid(struct user_namespace *ns, uid_t uid) { /* Map the uid to a global kernel uid */ return KUIDT_INIT(map_id_down(&ns->uid_map, uid)); } EXPORT_SYMBOL(make_kuid); /** * from_kuid - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * If @kuid has no mapping in @targ (uid_t)-1 is returned. */ uid_t from_kuid(struct user_namespace *targ, kuid_t kuid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->uid_map, __kuid_val(kuid)); } EXPORT_SYMBOL(from_kuid); /** * from_kuid_munged - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kuid from_kuid_munged never fails and always * returns a valid uid. This makes from_kuid_munged appropriate * for use in syscalls like stat and getuid where failing the * system call and failing to provide a valid uid are not an * options. * * If @kuid has no mapping in @targ overflowuid is returned. */ uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid) { uid_t uid; uid = from_kuid(targ, kuid); if (uid == (uid_t) -1) uid = overflowuid; return uid; } EXPORT_SYMBOL(from_kuid_munged); /** * make_kgid - Map a user-namespace gid pair into a kgid. * @ns: User namespace that the gid is in * @gid: group identifier * * Maps a user-namespace gid pair into a kernel internal kgid, * and returns that kgid. * * When there is no mapping defined for the user-namespace gid * pair INVALID_GID is returned. Callers are expected to test * for and handle INVALID_GID being returned. INVALID_GID may be * tested for using gid_valid(). */ kgid_t make_kgid(struct user_namespace *ns, gid_t gid) { /* Map the gid to a global kernel gid */ return KGIDT_INIT(map_id_down(&ns->gid_map, gid)); } EXPORT_SYMBOL(make_kgid); /** * from_kgid - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * If @kgid has no mapping in @targ (gid_t)-1 is returned. */ gid_t from_kgid(struct user_namespace *targ, kgid_t kgid) { /* Map the gid from a global kernel gid */ return map_id_up(&targ->gid_map, __kgid_val(kgid)); } EXPORT_SYMBOL(from_kgid); /** * from_kgid_munged - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kgid from_kgid_munged never fails and always * returns a valid gid. This makes from_kgid_munged appropriate * for use in syscalls like stat and getgid where failing the * system call and failing to provide a valid gid are not options. * * If @kgid has no mapping in @targ overflowgid is returned. 
*/ gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) { gid_t gid; gid = from_kgid(targ, kgid); if (gid == (gid_t) -1) gid = overflowgid; return gid; } EXPORT_SYMBOL(from_kgid_munged); /** * make_kprojid - Map a user-namespace projid pair into a kprojid. * @ns: User namespace that the projid is in * @projid: Project identifier * * Maps a user-namespace projid pair into a kernel internal kprojid, * and returns that kprojid. * * When there is no mapping defined for the user-namespace projid * pair INVALID_PROJID is returned. Callers are expected to test * for and handle INVALID_PROJID being returned. INVALID_PROJID * may be tested for using projid_valid(). */ kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) { /* Map the projid to a global kernel projid */ return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid)); } EXPORT_SYMBOL(make_kprojid); /** * from_kprojid - Create a projid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal project identifier to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * If @kprojid has no mapping in @targ (projid_t)-1 is returned. */ projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid) { /* Map the projid from a global kernel projid */ return map_id_up(&targ->projid_map, __kprojid_val(kprojid)); } EXPORT_SYMBOL(from_kprojid); /** * from_kprojid_munged - Create a projid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal projid to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kprojid from_kprojid_munged never fails and always * returns a valid projid. This makes from_kprojid_munged * appropriate for use in syscalls like stat where * failing the system call and failing to provide a valid projid are * not options. * * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned.
*/ projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid) { projid_t projid; projid = from_kprojid(targ, kprojid); if (projid == (projid_t) -1) projid = OVERFLOW_PROJID; return projid; } EXPORT_SYMBOL(from_kprojid_munged); static int uid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; uid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int gid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; gid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int projid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; projid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) { loff_t pos = *ppos; unsigned extents = map->nr_extents; smp_rmb(); if (pos >= extents) return NULL; if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return &map->extent[pos]; return &map->forward[pos]; } static void *uid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->uid_map); } static void *gid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->gid_map); } static void *projid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->projid_map); } static void *m_next(struct seq_file *seq, void *v, loff_t *pos) { (*pos)++; return seq->op->start(seq, pos); } static void m_stop(struct seq_file *seq, void *v) { return; } const struct seq_operations proc_uid_seq_operations = { .start = uid_m_start, .stop = m_stop, .next = m_next, .show = uid_m_show, }; const struct seq_operations proc_gid_seq_operations = { .start = gid_m_start, .stop = m_stop, .next = m_next, .show = gid_m_show, }; const struct seq_operations proc_projid_seq_operations = { .start = projid_m_start, .stop = m_stop, .next = m_next, .show = projid_m_show, }; static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) { u32 upper_first, lower_first, upper_last, lower_last; unsigned idx; upper_first = extent->first; lower_first = extent->lower_first; upper_last = upper_first + extent->count - 1; lower_last = lower_first + extent->count - 1; for (idx = 0; idx < new_map->nr_extents; idx++) { u32 prev_upper_first, prev_lower_first; u32 prev_upper_last, prev_lower_last; struct uid_gid_extent *prev; if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) prev = &new_map->extent[idx]; else prev = &new_map->forward[idx]; prev_upper_first = prev->first; prev_lower_first = prev->lower_first; prev_upper_last = prev_upper_first + 
prev->count - 1; prev_lower_last = prev_lower_first + prev->count - 1; /* Does the upper range intersect a previous extent? */ if ((prev_upper_first <= upper_last) && (prev_upper_last >= upper_first)) return true; /* Does the lower range intersect a previous extent? */ if ((prev_lower_first <= lower_last) && (prev_lower_last >= lower_first)) return true; } return false; } /* * insert_extent - Safely insert a new idmap extent into struct uid_gid_map. * Takes care to allocate a 4K block of memory if the number of mappings exceeds * UID_GID_MAP_MAX_BASE_EXTENTS. */ static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) { struct uid_gid_extent *dest; if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) { struct uid_gid_extent *forward; /* Allocate memory for 340 mappings. */ forward = kmalloc_objs(struct uid_gid_extent, UID_GID_MAP_MAX_EXTENTS); if (!forward) return -ENOMEM; /* Copy over memory. Only set up memory for the forward pointer. * Defer the memory setup for the reverse pointer. */ memcpy(forward, map->extent, map->nr_extents * sizeof(map->extent[0])); map->forward = forward; map->reverse = NULL; } if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) dest = &map->extent[map->nr_extents]; else dest = &map->forward[map->nr_extents]; *dest = *extent; map->nr_extents++; return 0; } /* cmp function to sort() forward mappings */ static int cmp_extents_forward(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->first < e2->first) return -1; if (e1->first > e2->first) return 1; return 0; } /* cmp function to sort() reverse mappings */ static int cmp_extents_reverse(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->lower_first < e2->lower_first) return -1; if (e1->lower_first > e2->lower_first) return 1; return 0; } /* * sort_idmaps - Sorts an array of idmap entries. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static int sort_idmaps(struct uid_gid_map *map) { if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return 0; /* Sort forward array. */ sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_forward, NULL); /* Only copy the memory from forward we actually need. */ map->reverse = kmemdup_array(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), GFP_KERNEL); if (!map->reverse) return -ENOMEM; /* Sort reverse array. */ sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_reverse, NULL); return 0; } /** * verify_root_map() - check the uid 0 mapping * @file: idmapping file * @map_ns: user namespace of the target process * @new_map: requested idmap * * If a process requests mapping parent uid 0 into the new ns, verify that the * process writing the map had the CAP_SETFCAP capability as the target process * will be able to write fscaps that are valid in ancestor user namespaces. * * Return: true if the mapping is allowed, false if not. 
*/ static bool verify_root_map(const struct file *file, struct user_namespace *map_ns, struct uid_gid_map *new_map) { int idx; const struct user_namespace *file_ns = file->f_cred->user_ns; struct uid_gid_extent *extent0 = NULL; for (idx = 0; idx < new_map->nr_extents; idx++) { if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent0 = &new_map->extent[idx]; else extent0 = &new_map->forward[idx]; if (extent0->lower_first == 0) break; extent0 = NULL; } if (!extent0) return true; if (map_ns == file_ns) { /* The process unshared its ns and is writing to its own * /proc/self/uid_map. User already has full capabilites in * the new namespace. Verify that the parent had CAP_SETFCAP * when it unshared. * */ if (!file_ns->parent_could_setfcap) return false; } else { /* Process p1 is writing to uid_map of p2, who is in a child * user namespace to p1's. Verify that the opener of the map * file has CAP_SETFCAP against the parent of the new map * namespace */ if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP)) return false; } return true; } static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, struct uid_gid_map *map, struct uid_gid_map *parent_map) { struct seq_file *seq = file->private_data; struct user_namespace *map_ns = seq->private; struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent extent; char *kbuf, *pos, *next_line; ssize_t ret; /* Only allow < page size writes at the beginning of the file */ if ((*ppos != 0) || (count >= PAGE_SIZE)) return -EINVAL; /* Slurp in the user data */ kbuf = memdup_user_nul(buf, count); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); /* * The userns_state_mutex serializes all writes to any given map. * * Any map is only ever written once. * * An id map fits within 1 cache line on most architectures. * * On read nothing needs to be done unless you are on an * architecture with a crazy cache coherency model like alpha. * * There is a one time data dependency between reading the * count of the extents and the values of the extents. The * desired behavior is to see the values of the extents that * were written before the count of the extents. * * To achieve this smp_wmb() is used on guarantee the write * order and smp_rmb() is guaranteed that we don't have crazy * architectures returning stale data. */ mutex_lock(&userns_state_mutex); memset(&new_map, 0, sizeof(struct uid_gid_map)); ret = -EPERM; /* Only allow one successful write to the map */ if (map->nr_extents != 0) goto out; /* * Adjusting namespace settings requires capabilities on the target. 
*/ if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN)) goto out; /* Parse the user data */ ret = -EINVAL; pos = kbuf; for (; pos; pos = next_line) { /* Find the end of line and ensure I don't look past it */ next_line = strchr(pos, '\n'); if (next_line) { *next_line = '\0'; next_line++; if (*next_line == '\0') next_line = NULL; } pos = skip_spaces(pos); extent.first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.lower_first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.count = simple_strtoul(pos, &pos, 10); if (*pos && !isspace(*pos)) goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; /* Verify we have been given valid starting values */ if ((extent.first == (u32) -1) || (extent.lower_first == (u32) -1)) goto out; /* Verify count is not zero and does not cause the * extent to wrap */ if ((extent.first + extent.count) <= extent.first) goto out; if ((extent.lower_first + extent.count) <= extent.lower_first) goto out; /* Do the ranges in extent overlap any previous extents? */ if (mappings_overlap(&new_map, &extent)) goto out; if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS && (next_line != NULL)) goto out; ret = insert_extent(&new_map, &extent); if (ret < 0) goto out; ret = -EINVAL; } /* Be very certain the new map actually exists */ if (new_map.nr_extents == 0) goto out; ret = -EPERM; /* Validate the user is allowed to use user id's mapped to. */ if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map)) goto out; ret = -EPERM; /* Map the lower ids from the parent user namespace to the * kernel global id space. */ for (idx = 0; idx < new_map.nr_extents; idx++) { struct uid_gid_extent *e; u32 lower_first; if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) e = &new_map.extent[idx]; else e = &new_map.forward[idx]; lower_first = map_id_range_down(parent_map, e->lower_first, e->count); /* Fail if we can not map the specified extent to * the kernel global id space. */ if (lower_first == (u32) -1) goto out; e->lower_first = lower_first; } /* * If we want to use binary search for lookup, this clones the extent * array and sorts both copies. 
*/ ret = sort_idmaps(&new_map); if (ret < 0) goto out; /* Install the map */ if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { memcpy(map->extent, new_map.extent, new_map.nr_extents * sizeof(new_map.extent[0])); } else { map->forward = new_map.forward; map->reverse = new_map.reverse; } smp_wmb(); map->nr_extents = new_map.nr_extents; *ppos = count; ret = count; out: if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(new_map.forward); kfree(new_map.reverse); map->forward = NULL; map->reverse = NULL; map->nr_extents = 0; } mutex_unlock(&userns_state_mutex); kfree(kbuf); return ret; } ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETUID, &ns->uid_map, &ns->parent->uid_map); } ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETGID, &ns->gid_map, &ns->parent->gid_map); } ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; /* Anyone can set any valid project id no capability needed */ return map_write(file, buf, size, ppos, -1, &ns->projid_map, &ns->parent->projid_map); } static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { const struct cred *cred = file->f_cred; if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map)) return false; /* Don't allow mappings that would allow anything that wouldn't * be allowed without the establishment of unprivileged mappings. */ if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) && uid_eq(ns->owner, cred->euid)) { u32 id = new_map->extent[0].lower_first; if (cap_setid == CAP_SETUID) { kuid_t uid = make_kuid(ns->parent, id); if (uid_eq(uid, cred->euid)) return true; } else if (cap_setid == CAP_SETGID) { kgid_t gid = make_kgid(ns->parent, id); if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) && gid_eq(gid, cred->egid)) return true; } } /* Allow anyone to set a mapping that doesn't require privilege */ if (!cap_valid(cap_setid)) return true; /* Allow the specified ids if we have the appropriate capability * (CAP_SETUID or CAP_SETGID) over the parent user namespace. * And the opener of the id file also has the appropriate capability. */ if (ns_capable(ns->parent, cap_setid) && file_ns_capable(file, ns->parent, cap_setid)) return true; return false; } int proc_setgroups_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; unsigned long userns_flags = READ_ONCE(ns->flags); seq_printf(seq, "%s\n", (userns_flags & USERNS_SETGROUPS_ALLOWED) ? 
"allow" : "deny"); return 0; } ssize_t proc_setgroups_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; char kbuf[8], *pos; bool setgroups_allowed; ssize_t ret; /* Only allow a very narrow range of strings to be written */ ret = -EINVAL; if ((*ppos != 0) || (count >= sizeof(kbuf))) goto out; /* What was written? */ ret = -EFAULT; if (copy_from_user(kbuf, buf, count)) goto out; kbuf[count] = '\0'; pos = kbuf; /* What is being requested? */ ret = -EINVAL; if (strncmp(pos, "allow", 5) == 0) { pos += 5; setgroups_allowed = true; } else if (strncmp(pos, "deny", 4) == 0) { pos += 4; setgroups_allowed = false; } else goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; ret = -EPERM; mutex_lock(&userns_state_mutex); if (setgroups_allowed) { /* Enabling setgroups after setgroups has been disabled * is not allowed. */ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED)) goto out_unlock; } else { /* Permanently disabling setgroups after setgroups has * been enabled by writing the gid_map is not allowed. */ if (ns->gid_map.nr_extents != 0) goto out_unlock; ns->flags &= ~USERNS_SETGROUPS_ALLOWED; } mutex_unlock(&userns_state_mutex); /* Report a successful write */ *ppos = count; ret = count; out: return ret; out_unlock: mutex_unlock(&userns_state_mutex); goto out; } bool userns_may_setgroups(const struct user_namespace *ns) { bool allowed; mutex_lock(&userns_state_mutex); /* It is not safe to use setgroups until a gid mapping in * the user namespace has been established. */ allowed = ns->gid_map.nr_extents != 0; /* Is setgroups allowed? */ allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED); mutex_unlock(&userns_state_mutex); return allowed; } /* * Returns true if @child is the same namespace or a descendant of * @ancestor. */ bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child) { const struct user_namespace *ns; for (ns = child; ns->level > ancestor->level; ns = ns->parent) ; return (ns == ancestor); } bool current_in_userns(const struct user_namespace *target_ns) { return in_userns(target_ns, current_user_ns()); } EXPORT_SYMBOL(current_in_userns); static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; rcu_read_lock(); user_ns = get_user_ns(__task_cred(task)->user_ns); rcu_read_unlock(); return user_ns ? &user_ns->ns : NULL; } static void userns_put(struct ns_common *ns) { put_user_ns(to_user_ns(ns)); } static int userns_install(struct nsset *nsset, struct ns_common *ns) { struct user_namespace *user_ns = to_user_ns(ns); struct cred *cred; /* Don't allow gaining capabilities by reentering * the same user namespace. 
*/ if (user_ns == current_user_ns()) return -EINVAL; /* Tasks that share a thread group must share a user namespace */ if (!thread_group_empty(current)) return -EINVAL; if (current->fs->users != 1) return -EINVAL; if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; cred = nsset_cred(nsset); if (!cred) return -EINVAL; put_user_ns(cred->user_ns); set_cred_user_ns(cred, get_user_ns(user_ns)); if (set_cred_ucounts(cred) < 0) return -EINVAL; return 0; } struct ns_common *ns_get_owner(struct ns_common *ns) { struct user_namespace *my_user_ns = current_user_ns(); struct user_namespace *owner, *p; /* See if the owner is in the current user namespace */ owner = p = ns->ops->owner(ns); for (;;) { if (!p) return ERR_PTR(-EPERM); if (p == my_user_ns) break; p = p->parent; } return &get_user_ns(owner)->ns; } static struct user_namespace *userns_owner(struct ns_common *ns) { return to_user_ns(ns)->parent; } const struct proc_ns_operations userns_operations = { .name = "user", .get = userns_get, .put = userns_put, .install = userns_install, .owner = userns_owner, .get_parent = ns_get_owner, }; static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT); ns_tree_add(&init_user_ns); return 0; } subsys_initcall(user_namespaces_init); |
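/*
 * Illustrative userspace counterpart (not part of the kernel file above):
 * a hedged sketch of the single-extent, unprivileged idmap setup that
 * map_write() accepts. Each map line is "<first> <lower-first> <count>",
 * and an unprivileged writer must deny setgroups before writing gid_map.
 * Error handling is trimmed for brevity.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void write_file(const char *path, const char *buf)
{
	int fd = open(path, O_WRONLY);

	if (fd >= 0) {
		write(fd, buf, strlen(buf));
		close(fd);
	}
}

int main(void)
{
	uid_t uid = getuid();
	gid_t gid = getgid();
	char line[64];

	/* Enter a fresh user namespace; we are root but unmapped in it. */
	if (unshare(CLONE_NEWUSER))
		return 1;

	/* Map uid 0 in the new namespace onto our original uid. */
	snprintf(line, sizeof(line), "0 %u 1", (unsigned)uid);
	write_file("/proc/self/uid_map", line);

	/* setgroups must be denied before an unprivileged gid_map write. */
	write_file("/proc/self/setgroups", "deny");
	snprintf(line, sizeof(line), "0 %u 1", (unsigned)gid);
	write_file("/proc/self/gid_map", line);

	return 0;
}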
910 911 912 913 914 915 916 917 918 919 920 | // SPDX-License-Identifier: LGPL-2.1 /* * * Copyright IBM Corporation, 2012 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> * * Cgroup v2 * Copyright (C) 2019 Red Hat, Inc. * Author: Giuseppe Scrivano <gscrivan@redhat.com> * */ #include <linux/cgroup.h> #include <linux/page_counter.h> #include <linux/slab.h> #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) /* Use t->m[0] to encode the offset */ #define MEMFILE_OFFSET(t, m0) (((offsetof(t, m0) << 16) | sizeof_field(t, m0))) #define MEMFILE_OFFSET0(val) (((val) >> 16) & 0xffff) #define MEMFILE_FIELD_SIZE(val) ((val) & 0xffff) #define DFL_TMPL_SIZE ARRAY_SIZE(hugetlb_dfl_tmpl) #define LEGACY_TMPL_SIZE ARRAY_SIZE(hugetlb_legacy_tmpl) static struct hugetlb_cgroup *root_h_cgroup __read_mostly; static struct cftype *dfl_files; static struct cftype *legacy_files; static inline struct page_counter * __hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx, bool rsvd) { if (rsvd) return &h_cg->rsvd_hugepage[idx]; return &h_cg->hugepage[idx]; } static inline struct page_counter * hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx) { return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false); } static inline struct page_counter * hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx) { return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true); } static inline struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) { return s ? container_of(s, struct hugetlb_cgroup, css) : NULL; } static inline struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) { return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id)); } static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) { return (h_cg == root_h_cgroup); } static inline struct hugetlb_cgroup * parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg) { return hugetlb_cgroup_from_css(h_cg->css.parent); } static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) { struct hstate *h; for_each_hstate(h) { if (page_counter_read( hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h)))) return true; } return false; } static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, struct hugetlb_cgroup *parent_h_cgroup) { int idx; for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) { struct page_counter *fault, *fault_parent = NULL; struct page_counter *rsvd, *rsvd_parent = NULL; unsigned long limit; if (parent_h_cgroup) { fault_parent = hugetlb_cgroup_counter_from_cgroup( parent_h_cgroup, idx); rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd( parent_h_cgroup, idx); } fault = hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx); rsvd = hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx); page_counter_init(fault, fault_parent, false); page_counter_init(rsvd, rsvd_parent, false); if (!cgroup_subsys_on_dfl(hugetlb_cgrp_subsys)) { fault->track_failcnt = true; rsvd->track_failcnt = true; } limit = round_down(PAGE_COUNTER_MAX, pages_per_huge_page(&hstates[idx])); VM_BUG_ON(page_counter_set_max(fault, limit)); VM_BUG_ON(page_counter_set_max(rsvd, limit)); } } static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup) { int node; for_each_node(node) kfree(h_cgroup->nodeinfo[node]); kfree(h_cgroup); } static struct cgroup_subsys_state * hugetlb_cgroup_css_alloc(struct 
cgroup_subsys_state *parent_css) { struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css); struct hugetlb_cgroup *h_cgroup; int node; h_cgroup = kzalloc_flex(*h_cgroup, nodeinfo, nr_node_ids); if (!h_cgroup) return ERR_PTR(-ENOMEM); if (!parent_h_cgroup) root_h_cgroup = h_cgroup; /* * TODO: this routine can waste much memory for nodes which will * never be onlined. It's better to use memory hotplug callback * function. */ for_each_node(node) { /* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */ int node_to_alloc = node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE; h_cgroup->nodeinfo[node] = kzalloc_node(sizeof(struct hugetlb_cgroup_per_node), GFP_KERNEL, node_to_alloc); if (!h_cgroup->nodeinfo[node]) goto fail_alloc_nodeinfo; } hugetlb_cgroup_init(h_cgroup, parent_h_cgroup); return &h_cgroup->css; fail_alloc_nodeinfo: hugetlb_cgroup_free(h_cgroup); return ERR_PTR(-ENOMEM); } static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) { hugetlb_cgroup_free(hugetlb_cgroup_from_css(css)); } /* * Should be called with hugetlb_lock held. * Since we are holding hugetlb_lock, pages cannot get moved from * active list or uncharged from the cgroup, So no need to get * page reference and test for page active here. This function * cannot fail. */ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, struct folio *folio) { unsigned int nr_pages; struct page_counter *counter; struct hugetlb_cgroup *hcg; struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); hcg = hugetlb_cgroup_from_folio(folio); /* * We can have pages in active list without any cgroup * ie, hugepage with less than 3 pages. We can safely * ignore those pages. */ if (!hcg || hcg != h_cg) goto out; nr_pages = folio_nr_pages(folio); if (!parent) { parent = root_h_cgroup; /* root has no limit */ page_counter_charge(&parent->hugepage[idx], nr_pages); } counter = &h_cg->hugepage[idx]; /* Take the pages off the local counter */ page_counter_cancel(counter, nr_pages); set_hugetlb_cgroup(folio, parent); out: return; } /* * Force the hugetlb cgroup to empty the hugetlb resources by moving them to * the parent cgroup. 
*/ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) { struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); struct hstate *h; struct folio *folio; do { for_each_hstate(h) { spin_lock_irq(&hugetlb_lock); list_for_each_entry(folio, &h->hugepage_activelist, lru) hugetlb_cgroup_move_parent(hstate_index(h), h_cg, folio); spin_unlock_irq(&hugetlb_lock); } cond_resched(); } while (hugetlb_cgroup_have_usage(h_cg)); } static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx, enum hugetlb_memory_event event) { atomic_long_inc(&hugetlb->events_local[idx][event]); cgroup_file_notify(&hugetlb->events_local_file[idx]); do { atomic_long_inc(&hugetlb->events[idx][event]); cgroup_file_notify(&hugetlb->events_file[idx]); } while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) && !hugetlb_cgroup_is_root(hugetlb)); } static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr, bool rsvd) { int ret = 0; struct page_counter *counter; struct hugetlb_cgroup *h_cg = NULL; if (hugetlb_cgroup_disabled()) goto done; again: rcu_read_lock(); h_cg = hugetlb_cgroup_from_task(current); if (!css_tryget(&h_cg->css)) { rcu_read_unlock(); goto again; } rcu_read_unlock(); if (!page_counter_try_charge( __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), nr_pages, &counter)) { ret = -ENOMEM; hugetlb_event(h_cg, idx, HUGETLB_MAX); css_put(&h_cg->css); goto done; } /* Reservations take a reference to the css because they do not get * reparented. */ if (!rsvd) css_put(&h_cg->css); done: *ptr = h_cg; return ret; } int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false); } int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true); } /* Should be called with hugetlb_lock held */ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct folio *folio, bool rsvd) { if (hugetlb_cgroup_disabled() || !h_cg) return; lockdep_assert_held(&hugetlb_lock); __set_hugetlb_cgroup(folio, h_cg, rsvd); if (!rsvd) { unsigned long usage = h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; /* * This write is not atomic due to fetching usage and writing * to it, but that's fine because we call this with * hugetlb_lock held anyway. 
*/ WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], usage + nr_pages); } } void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct folio *folio) { __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false); } void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct folio *folio) { __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true); } /* * Should be called with hugetlb_lock held */ static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, struct folio *folio, bool rsvd) { struct hugetlb_cgroup *h_cg; if (hugetlb_cgroup_disabled()) return; lockdep_assert_held(&hugetlb_lock); h_cg = __hugetlb_cgroup_from_folio(folio, rsvd); if (unlikely(!h_cg)) return; __set_hugetlb_cgroup(folio, NULL, rsvd); page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), nr_pages); if (rsvd) css_put(&h_cg->css); else { unsigned long usage = h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; /* * This write is not atomic due to fetching usage and writing * to it, but that's fine because we call this with * hugetlb_lock held anyway. */ WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], usage - nr_pages); } } void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, struct folio *folio) { __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, false); } void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, struct folio *folio) { __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, true); } static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, bool rsvd) { if (hugetlb_cgroup_disabled() || !h_cg) return; page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), nr_pages); if (rsvd) css_put(&h_cg->css); } void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg) { __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false); } void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg) { __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true); } void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start, unsigned long end) { if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter || !resv->css) return; page_counter_uncharge(resv->reservation_counter, (end - start) * resv->pages_per_hpage); css_put(resv->css); } void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, struct file_region *rg, unsigned long nr_pages, bool region_del) { if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages) return; if (rg->reservation_counter && resv->pages_per_hpage && !resv->reservation_counter) { page_counter_uncharge(rg->reservation_counter, nr_pages * resv->pages_per_hpage); /* * Only do css_put(rg->css) when we delete the entire region * because one file_region must hold exactly one css reference. 
*/ if (region_del) css_put(rg->css); } } enum { RES_USAGE, RES_RSVD_USAGE, RES_LIMIT, RES_RSVD_LIMIT, RES_MAX_USAGE, RES_RSVD_MAX_USAGE, RES_FAILCNT, RES_RSVD_FAILCNT, }; static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy) { int nid; struct cftype *cft = seq_cft(seq); int idx = MEMFILE_IDX(cft->private); bool legacy = !cgroup_subsys_on_dfl(hugetlb_cgrp_subsys); struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); struct cgroup_subsys_state *css; unsigned long usage; if (legacy) { /* Add up usage across all nodes for the non-hierarchical total. */ usage = 0; for_each_node_state(nid, N_MEMORY) usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]); seq_printf(seq, "total=%lu", usage * PAGE_SIZE); /* Simply print the per-node usage for the non-hierarchical total. */ for_each_node_state(nid, N_MEMORY) seq_printf(seq, " N%d=%lu", nid, READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) * PAGE_SIZE); seq_putc(seq, '\n'); } /* * The hierarchical total is pretty much the value recorded by the * counter, so use that. */ seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "", page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE); /* * For each node, transverse the css tree to obtain the hierarchical * node usage. */ for_each_node_state(nid, N_MEMORY) { usage = 0; rcu_read_lock(); css_for_each_descendant_pre(css, &h_cg->css) { usage += READ_ONCE(hugetlb_cgroup_from_css(css) ->nodeinfo[nid] ->usage[idx]); } rcu_read_unlock(); seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE); } seq_putc(seq, '\n'); return 0; } static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { struct page_counter *counter; struct page_counter *rsvd_counter; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)]; rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)]; switch (MEMFILE_ATTR(cft->private)) { case RES_USAGE: return (u64)page_counter_read(counter) * PAGE_SIZE; case RES_RSVD_USAGE: return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE; case RES_LIMIT: return (u64)counter->max * PAGE_SIZE; case RES_RSVD_LIMIT: return (u64)rsvd_counter->max * PAGE_SIZE; case RES_MAX_USAGE: return (u64)counter->watermark * PAGE_SIZE; case RES_RSVD_MAX_USAGE: return (u64)rsvd_counter->watermark * PAGE_SIZE; case RES_FAILCNT: return counter->failcnt; case RES_RSVD_FAILCNT: return rsvd_counter->failcnt; default: BUG(); } } static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v) { int idx; u64 val; struct cftype *cft = seq_cft(seq); unsigned long limit; struct page_counter *counter; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); idx = MEMFILE_IDX(cft->private); counter = &h_cg->hugepage[idx]; limit = round_down(PAGE_COUNTER_MAX, pages_per_huge_page(&hstates[idx])); switch (MEMFILE_ATTR(cft->private)) { case RES_RSVD_USAGE: counter = &h_cg->rsvd_hugepage[idx]; fallthrough; case RES_USAGE: val = (u64)page_counter_read(counter); seq_printf(seq, "%llu\n", val * PAGE_SIZE); break; case RES_RSVD_LIMIT: counter = &h_cg->rsvd_hugepage[idx]; fallthrough; case RES_LIMIT: val = (u64)counter->max; if (val == limit) seq_puts(seq, "max\n"); else seq_printf(seq, "%llu\n", val * PAGE_SIZE); break; default: BUG(); } return 0; } static DEFINE_MUTEX(hugetlb_limit_mutex); static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, const char *max) { int ret, idx; unsigned long nr_pages; struct hugetlb_cgroup *h_cg = 
hugetlb_cgroup_from_css(of_css(of)); bool rsvd = false; if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */ return -EINVAL; buf = strstrip(buf); ret = page_counter_memparse(buf, max, &nr_pages); if (ret) return ret; idx = MEMFILE_IDX(of_cft(of)->private); nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx])); switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_RSVD_LIMIT: rsvd = true; fallthrough; case RES_LIMIT: mutex_lock(&hugetlb_limit_mutex); ret = page_counter_set_max( __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), nr_pages); mutex_unlock(&hugetlb_limit_mutex); break; default: ret = -EINVAL; break; } return ret ?: nbytes; } static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return hugetlb_cgroup_write(of, buf, nbytes, off, "-1"); } static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return hugetlb_cgroup_write(of, buf, nbytes, off, "max"); } static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { int ret = 0; struct page_counter *counter, *rsvd_counter; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)]; rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)]; switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_MAX_USAGE: page_counter_reset_watermark(counter); break; case RES_RSVD_MAX_USAGE: page_counter_reset_watermark(rsvd_counter); break; case RES_FAILCNT: counter->failcnt = 0; break; case RES_RSVD_FAILCNT: rsvd_counter->failcnt = 0; break; default: ret = -EINVAL; break; } return ret ?: nbytes; } static char *mem_fmt(char *buf, int size, unsigned long hsize) { if (hsize >= SZ_1G) snprintf(buf, size, "%luGB", hsize / SZ_1G); else if (hsize >= SZ_1M) snprintf(buf, size, "%luMB", hsize / SZ_1M); else snprintf(buf, size, "%luKB", hsize / SZ_1K); return buf; } static int __hugetlb_events_show(struct seq_file *seq, bool local) { int idx; long max; struct cftype *cft = seq_cft(seq); struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); idx = MEMFILE_IDX(cft->private); if (local) max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]); else max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]); seq_printf(seq, "max %lu\n", max); return 0; } static int hugetlb_events_show(struct seq_file *seq, void *v) { return __hugetlb_events_show(seq, false); } static int hugetlb_events_local_show(struct seq_file *seq, void *v) { return __hugetlb_events_show(seq, true); } static struct cftype hugetlb_dfl_tmpl[] = { { .name = "max", .private = RES_LIMIT, .seq_show = hugetlb_cgroup_read_u64_max, .write = hugetlb_cgroup_write_dfl, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "rsvd.max", .private = RES_RSVD_LIMIT, .seq_show = hugetlb_cgroup_read_u64_max, .write = hugetlb_cgroup_write_dfl, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "current", .private = RES_USAGE, .seq_show = hugetlb_cgroup_read_u64_max, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "rsvd.current", .private = RES_RSVD_USAGE, .seq_show = hugetlb_cgroup_read_u64_max, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "events", .seq_show = hugetlb_events_show, .file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_file[0]), .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "events.local", .seq_show = hugetlb_events_local_show, .file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_local_file[0]), .flags = CFTYPE_NOT_ON_ROOT, }, { .name = 
"numa_stat", .seq_show = hugetlb_cgroup_read_numa_stat, .flags = CFTYPE_NOT_ON_ROOT, }, /* don't need terminator here */ }; static struct cftype hugetlb_legacy_tmpl[] = { { .name = "limit_in_bytes", .private = RES_LIMIT, .read_u64 = hugetlb_cgroup_read_u64, .write = hugetlb_cgroup_write_legacy, }, { .name = "rsvd.limit_in_bytes", .private = RES_RSVD_LIMIT, .read_u64 = hugetlb_cgroup_read_u64, .write = hugetlb_cgroup_write_legacy, }, { .name = "usage_in_bytes", .private = RES_USAGE, .read_u64 = hugetlb_cgroup_read_u64, }, { .name = "rsvd.usage_in_bytes", .private = RES_RSVD_USAGE, .read_u64 = hugetlb_cgroup_read_u64, }, { .name = "max_usage_in_bytes", .private = RES_MAX_USAGE, .write = hugetlb_cgroup_reset, .read_u64 = hugetlb_cgroup_read_u64, }, { .name = "rsvd.max_usage_in_bytes", .private = RES_RSVD_MAX_USAGE, .write = hugetlb_cgroup_reset, .read_u64 = hugetlb_cgroup_read_u64, }, { .name = "failcnt", .private = RES_FAILCNT, .write = hugetlb_cgroup_reset, .read_u64 = hugetlb_cgroup_read_u64, }, { .name = "rsvd.failcnt", .private = RES_RSVD_FAILCNT, .write = hugetlb_cgroup_reset, .read_u64 = hugetlb_cgroup_read_u64, }, { .name = "numa_stat", .seq_show = hugetlb_cgroup_read_numa_stat, }, /* don't need terminator here */ }; static void __init hugetlb_cgroup_cfttypes_init(struct hstate *h, struct cftype *cft, struct cftype *tmpl, int tmpl_size) { char buf[32]; int i, idx = hstate_index(h); /* format the size */ mem_fmt(buf, sizeof(buf), huge_page_size(h)); for (i = 0; i < tmpl_size; cft++, tmpl++, i++) { *cft = *tmpl; /* rebuild the name */ scnprintf(cft->name, MAX_CFTYPE_NAME, "%s.%s", buf, tmpl->name); /* rebuild the private */ cft->private = MEMFILE_PRIVATE(idx, tmpl->private); /* rebuild the file_offset */ if (tmpl->file_offset) { unsigned int offset = tmpl->file_offset; cft->file_offset = MEMFILE_OFFSET0(offset) + MEMFILE_FIELD_SIZE(offset) * idx; } lockdep_register_key(&cft->lockdep_key); } } static void __init __hugetlb_cgroup_file_dfl_init(struct hstate *h) { int idx = hstate_index(h); hugetlb_cgroup_cfttypes_init(h, dfl_files + idx * DFL_TMPL_SIZE, hugetlb_dfl_tmpl, DFL_TMPL_SIZE); } static void __init __hugetlb_cgroup_file_legacy_init(struct hstate *h) { int idx = hstate_index(h); hugetlb_cgroup_cfttypes_init(h, legacy_files + idx * LEGACY_TMPL_SIZE, hugetlb_legacy_tmpl, LEGACY_TMPL_SIZE); } static void __init __hugetlb_cgroup_file_init(struct hstate *h) { __hugetlb_cgroup_file_dfl_init(h); __hugetlb_cgroup_file_legacy_init(h); } static void __init __hugetlb_cgroup_file_pre_init(void) { int cft_count; cft_count = hugetlb_max_hstate * DFL_TMPL_SIZE + 1; /* add terminator */ dfl_files = kzalloc_objs(struct cftype, cft_count); BUG_ON(!dfl_files); cft_count = hugetlb_max_hstate * LEGACY_TMPL_SIZE + 1; /* add terminator */ legacy_files = kzalloc_objs(struct cftype, cft_count); BUG_ON(!legacy_files); } static void __init __hugetlb_cgroup_file_post_init(void) { WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys, dfl_files)); WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys, legacy_files)); } void __init hugetlb_cgroup_file_init(void) { struct hstate *h; __hugetlb_cgroup_file_pre_init(); for_each_hstate(h) __hugetlb_cgroup_file_init(h); __hugetlb_cgroup_file_post_init(); } /* * hugetlb_lock will make sure a parallel cgroup rmdir won't happen * when we migrate hugepages */ void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio) { struct hugetlb_cgroup *h_cg; struct hugetlb_cgroup *h_cg_rsvd; struct hstate *h = folio_hstate(old_folio); if 
(hugetlb_cgroup_disabled()) return; spin_lock_irq(&hugetlb_lock); h_cg = hugetlb_cgroup_from_folio(old_folio); h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio); set_hugetlb_cgroup(old_folio, NULL); set_hugetlb_cgroup_rsvd(old_folio, NULL); /* move the h_cg details to new cgroup */ set_hugetlb_cgroup(new_folio, h_cg); set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd); list_move(&new_folio->lru, &h->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); } static struct cftype hugetlb_files[] = { {} /* terminate */ }; struct cgroup_subsys hugetlb_cgrp_subsys = { .css_alloc = hugetlb_cgroup_css_alloc, .css_offline = hugetlb_cgroup_css_offline, .css_free = hugetlb_cgroup_css_free, .dfl_cftypes = hugetlb_files, .legacy_cftypes = hugetlb_files, }; |
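For orientation, the control files generated from the templates above end up named "hugetlb.<size>.<field>", with the size string produced by mem_fmt(). The sketch below is illustrative userspace code, not part of the kernel source: it assumes a cgroup v2 hierarchy mounted at /sys/fs/cgroup, an already-created child group named "demo" with the hugetlb controller enabled, and 2MB huge pages.

#include <stdio.h>

int main(void)
{
	unsigned long long current_bytes;
	FILE *f;

	/* "hugetlb.2MB.max" = subsystem prefix + mem_fmt() size + the "max" template above. */
	f = fopen("/sys/fs/cgroup/demo/hugetlb.2MB.max", "w");
	if (!f)
		return 1;
	fprintf(f, "%llu\n", 64ULL << 20);	/* limit to 64MB; rounded down to huge page multiples */
	fclose(f);

	f = fopen("/sys/fs/cgroup/demo/hugetlb.2MB.current", "r");
	if (!f)
		return 1;
	if (fscanf(f, "%llu", &current_bytes) == 1)
		printf("hugetlb usage: %llu bytes\n", current_bytes);
	fclose(f);
	return 0;
}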
| 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM fib6 #if !defined(_TRACE_FIB6_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_FIB6_H #include <linux/in6.h> #include <net/flow.h> #include <net/ip6_fib.h> #include <linux/tracepoint.h> TRACE_EVENT(fib6_table_lookup, TP_PROTO(const struct net *net, const struct fib6_result *res, struct fib6_table *table, const struct flowi6 *flp), TP_ARGS(net, res, table, flp), TP_STRUCT__entry( __field( u32, tb_id ) __field( int, err ) __field( int, oif ) __field( int, iif ) __field( u32, flowlabel ) __field( __u8, tos ) __field( __u8, scope ) __field( __u8, flags ) __array( __u8, src, 16 ) __array( __u8, dst, 16 ) __field( u16, sport ) __field( u16, dport ) __field( u8, proto ) __field( u8, rt_type ) __array( char, name, IFNAMSIZ ) __array( __u8, gw, 16 ) ), TP_fast_assign( struct in6_addr *in6; __entry->tb_id = table->tb6_id; __entry->err = ip6_rt_type_to_error(res->fib6_type); __entry->oif = flp->flowi6_oif; __entry->iif = flp->flowi6_iif; __entry->flowlabel = ntohl(flowi6_get_flowlabel(flp)); __entry->tos = ip6_tclass(flp->flowlabel); __entry->scope = flp->flowi6_scope; __entry->flags = flp->flowi6_flags; in6 = (struct in6_addr *)__entry->src; *in6 = flp->saddr; in6 = (struct in6_addr *)__entry->dst; *in6 = flp->daddr; __entry->proto = flp->flowi6_proto; if (__entry->proto == IPPROTO_TCP || __entry->proto == IPPROTO_UDP) { __entry->sport = ntohs(flp->fl6_sport); __entry->dport = ntohs(flp->fl6_dport); } else { __entry->sport = 0; __entry->dport = 0; } if (res->nh && res->nh->fib_nh_dev) { strscpy(__entry->name, res->nh->fib_nh_dev->name, IFNAMSIZ); } else { strcpy(__entry->name, "-"); } if (res->f6i == net->ipv6.fib6_null_entry) { in6 = (struct in6_addr *)__entry->gw; *in6 = in6addr_any; } else if (res->nh) { in6 = (struct in6_addr *)__entry->gw; *in6 = res->nh->fib_nh_gw6; } ), TP_printk("table %3u oif %d iif %d proto %u %pI6c/%u -> %pI6c/%u flowlabel %#x tos %d scope %d flags %x ==> dev %s gw %pI6c err %d", __entry->tb_id, __entry->oif, __entry->iif, __entry->proto, __entry->src, __entry->sport, __entry->dst, __entry->dport, __entry->flowlabel, __entry->tos, __entry->scope, __entry->flags, __entry->name, __entry->gw, __entry->err) ); #endif /* _TRACE_FIB6_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
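As a hedged illustration of how this tracepoint is typically consumed (not part of the header above): with tracing and IPv6 enabled, the event appears under the "fib6" system in tracefs. The tracefs mount point and root privileges are assumptions.

#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f;

	/* Enable the event; the path follows TRACE_SYSTEM ("fib6") and the event name. */
	f = fopen("/sys/kernel/tracing/events/fib6/fib6_table_lookup/enable", "w");
	if (!f)
		return 1;
	fputs("1\n", f);
	fclose(f);

	/* Each emitted line follows the TP_printk() format shown above. */
	f = fopen("/sys/kernel/tracing/trace_pipe", "r");
	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}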
// SPDX-License-Identifier: GPL-2.0-only

#define pr_fmt(fmt) "%s: " fmt, __func__

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/percpu-refcount.h>

/*
 * Initially, a percpu refcount is just a set of percpu counters. Initially, we
 * don't try to detect the ref hitting 0 - which means that get/put can just
 * increment or decrement the local counter. Note that the counter on a
 * particular cpu can (and will) wrap - this is fine, when we go to shutdown the
 * percpu counters will all sum to the correct value
 *
 * (More precisely: because modular arithmetic is commutative the sum of all the
 * percpu_count vars will be equal to what it would have been if all the gets
 * and puts were done to a single integer, even if some of the percpu integers
 * overflow or underflow).
 *
 * The real trick to implementing percpu refcounts is shutdown. We can't detect
 * the ref hitting 0 on every put - this would require global synchronization
 * and defeat the whole purpose of using percpu refs.
 *
 * What we do is require the user to keep track of the initial refcount; we know
 * the ref can't hit 0 before the user drops the initial ref, so as long as we
 * convert to non percpu mode before the initial ref is dropped everything
 * works.
 *
 * Converting to non percpu mode is done with some RCUish stuff in
 * percpu_ref_kill. Additionally, we need a bias value so that the
 * atomic_long_t can't hit 0 before we've added up all the percpu refs.
*/ #define PERCPU_COUNT_BIAS (1LU << (BITS_PER_LONG - 1)) static DEFINE_SPINLOCK(percpu_ref_switch_lock); static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq); static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref) { return (unsigned long __percpu *) (ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC_DEAD); } /** * percpu_ref_init - initialize a percpu refcount * @ref: percpu_ref to initialize * @release: function which will be called when refcount hits 0 * @flags: PERCPU_REF_INIT_* flags * @gfp: allocation mask to use * * Initializes @ref. @ref starts out in percpu mode with a refcount of 1 unless * @flags contains PERCPU_REF_INIT_ATOMIC or PERCPU_REF_INIT_DEAD. These flags * change the start state to atomic with the latter setting the initial refcount * to 0. See the definitions of PERCPU_REF_INIT_* flags for flag behaviors. * * Note that @release must not sleep - it may potentially be called from RCU * callback context by percpu_ref_kill(). */ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, unsigned int flags, gfp_t gfp) { size_t align = max_t(size_t, 1 << __PERCPU_REF_FLAG_BITS, __alignof__(unsigned long)); unsigned long start_count = 0; struct percpu_ref_data *data; ref->percpu_count_ptr = (unsigned long) __alloc_percpu_gfp(sizeof(unsigned long), align, gfp); if (!ref->percpu_count_ptr) return -ENOMEM; data = kzalloc_obj(*ref->data, gfp); if (!data) { free_percpu((void __percpu *)ref->percpu_count_ptr); ref->percpu_count_ptr = 0; return -ENOMEM; } data->force_atomic = flags & PERCPU_REF_INIT_ATOMIC; data->allow_reinit = flags & PERCPU_REF_ALLOW_REINIT; if (flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD)) { ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; data->allow_reinit = true; } else { start_count += PERCPU_COUNT_BIAS; } if (flags & PERCPU_REF_INIT_DEAD) ref->percpu_count_ptr |= __PERCPU_REF_DEAD; else start_count++; atomic_long_set(&data->count, start_count); data->release = release; data->confirm_switch = NULL; data->ref = ref; ref->data = data; return 0; } EXPORT_SYMBOL_GPL(percpu_ref_init); static void __percpu_ref_exit(struct percpu_ref *ref) { unsigned long __percpu *percpu_count = percpu_count_ptr(ref); if (percpu_count) { /* non-NULL confirm_switch indicates switching in progress */ WARN_ON_ONCE(ref->data && ref->data->confirm_switch); free_percpu(percpu_count); ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD; } } /** * percpu_ref_exit - undo percpu_ref_init() * @ref: percpu_ref to exit * * This function exits @ref. The caller is responsible for ensuring that * @ref is no longer in active use. The usual places to invoke this * function from are the @ref->release() callback or in init failure path * where percpu_ref_init() succeeded but other parts of the initialization * of the embedding object failed. 
*/ void percpu_ref_exit(struct percpu_ref *ref) { struct percpu_ref_data *data = ref->data; unsigned long flags; __percpu_ref_exit(ref); if (!data) return; spin_lock_irqsave(&percpu_ref_switch_lock, flags); ref->percpu_count_ptr |= atomic_long_read(&ref->data->count) << __PERCPU_REF_FLAG_BITS; ref->data = NULL; spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); kfree(data); } EXPORT_SYMBOL_GPL(percpu_ref_exit); static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu) { struct percpu_ref_data *data = container_of(rcu, struct percpu_ref_data, rcu); struct percpu_ref *ref = data->ref; data->confirm_switch(ref); data->confirm_switch = NULL; wake_up_all(&percpu_ref_switch_waitq); if (!data->allow_reinit) __percpu_ref_exit(ref); /* drop ref from percpu_ref_switch_to_atomic() */ percpu_ref_put(ref); } static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu) { struct percpu_ref_data *data = container_of(rcu, struct percpu_ref_data, rcu); struct percpu_ref *ref = data->ref; unsigned long __percpu *percpu_count = percpu_count_ptr(ref); static atomic_t underflows; unsigned long count = 0; int cpu; for_each_possible_cpu(cpu) count += *per_cpu_ptr(percpu_count, cpu); pr_debug("global %lu percpu %lu\n", atomic_long_read(&data->count), count); /* * It's crucial that we sum the percpu counters _before_ adding the sum * to &ref->count; since gets could be happening on one cpu while puts * happen on another, adding a single cpu's count could cause * @ref->count to hit 0 before we've got a consistent value - but the * sum of all the counts will be consistent and correct. * * Subtracting the bias value then has to happen _after_ adding count to * &ref->count; we need the bias value to prevent &ref->count from * reaching 0 before we add the percpu counts. But doing it at the same * time is equivalent and saves us atomic operations: */ atomic_long_add((long)count - PERCPU_COUNT_BIAS, &data->count); if (WARN_ONCE(atomic_long_read(&data->count) <= 0, "percpu ref (%ps) <= 0 (%ld) after switching to atomic", data->release, atomic_long_read(&data->count)) && atomic_inc_return(&underflows) < 4) { pr_err("%s(): percpu_ref underflow", __func__); mem_dump_obj(data); } /* @ref is viewed as dead on all CPUs, send out switch confirmation */ percpu_ref_call_confirm_rcu(rcu); } static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref) { } static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) { if (ref->percpu_count_ptr & __PERCPU_REF_ATOMIC) { if (confirm_switch) confirm_switch(ref); return; } /* switching from percpu to atomic */ ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; /* * Non-NULL ->confirm_switch is used to indicate that switching is * in progress. Use noop one if unspecified. */ ref->data->confirm_switch = confirm_switch ?: percpu_ref_noop_confirm_switch; percpu_ref_get(ref); /* put after confirmation */ call_rcu_hurry(&ref->data->rcu, percpu_ref_switch_to_atomic_rcu); } static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) { unsigned long __percpu *percpu_count = percpu_count_ptr(ref); int cpu; BUG_ON(!percpu_count); if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) return; if (WARN_ON_ONCE(!ref->data->allow_reinit)) return; atomic_long_add(PERCPU_COUNT_BIAS, &ref->data->count); /* * Restore per-cpu operation. smp_store_release() is paired * with READ_ONCE() in __ref_is_percpu() and guarantees that the * zeroing is visible to all percpu accesses which can see the * following __PERCPU_REF_ATOMIC clearing. 
*/ for_each_possible_cpu(cpu) *per_cpu_ptr(percpu_count, cpu) = 0; smp_store_release(&ref->percpu_count_ptr, ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC); } static void __percpu_ref_switch_mode(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) { struct percpu_ref_data *data = ref->data; lockdep_assert_held(&percpu_ref_switch_lock); /* * If the previous ATOMIC switching hasn't finished yet, wait for * its completion. If the caller ensures that ATOMIC switching * isn't in progress, this function can be called from any context. */ wait_event_lock_irq(percpu_ref_switch_waitq, !data->confirm_switch, percpu_ref_switch_lock); if (data->force_atomic || percpu_ref_is_dying(ref)) __percpu_ref_switch_to_atomic(ref, confirm_switch); else __percpu_ref_switch_to_percpu(ref); } /** * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode * @ref: percpu_ref to switch to atomic mode * @confirm_switch: optional confirmation callback * * There's no reason to use this function for the usual reference counting. * Use percpu_ref_kill[_and_confirm](). * * Schedule switching of @ref to atomic mode. All its percpu counts will * be collected to the main atomic counter. On completion, when all CPUs * are guaraneed to be in atomic mode, @confirm_switch, which may not * block, is invoked. This function may be invoked concurrently with all * the get/put operations and can safely be mixed with kill and reinit * operations. Note that @ref will stay in atomic mode across kill/reinit * cycles until percpu_ref_switch_to_percpu() is called. * * This function may block if @ref is in the process of switching to atomic * mode. If the caller ensures that @ref is not in the process of * switching to atomic mode, this function can be called from any context. */ void percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) { unsigned long flags; spin_lock_irqsave(&percpu_ref_switch_lock, flags); ref->data->force_atomic = true; __percpu_ref_switch_mode(ref, confirm_switch); spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic); /** * percpu_ref_switch_to_atomic_sync - switch a percpu_ref to atomic mode * @ref: percpu_ref to switch to atomic mode * * Schedule switching the ref to atomic mode, and wait for the * switch to complete. Caller must ensure that no other thread * will switch back to percpu mode. */ void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref) { percpu_ref_switch_to_atomic(ref, NULL); wait_event(percpu_ref_switch_waitq, !ref->data->confirm_switch); } EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic_sync); /** * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode * @ref: percpu_ref to switch to percpu mode * * There's no reason to use this function for the usual reference counting. * To re-use an expired ref, use percpu_ref_reinit(). * * Switch @ref to percpu mode. This function may be invoked concurrently * with all the get/put operations and can safely be mixed with kill and * reinit operations. This function reverses the sticky atomic state set * by PERCPU_REF_INIT_ATOMIC or percpu_ref_switch_to_atomic(). If @ref is * dying or dead, the actual switching takes place on the following * percpu_ref_reinit(). * * This function may block if @ref is in the process of switching to atomic * mode. If the caller ensures that @ref is not in the process of * switching to atomic mode, this function can be called from any context. 
*/ void percpu_ref_switch_to_percpu(struct percpu_ref *ref) { unsigned long flags; spin_lock_irqsave(&percpu_ref_switch_lock, flags); ref->data->force_atomic = false; __percpu_ref_switch_mode(ref, NULL); spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_switch_to_percpu); /** * percpu_ref_kill_and_confirm - drop the initial ref and schedule confirmation * @ref: percpu_ref to kill * @confirm_kill: optional confirmation callback * * Equivalent to percpu_ref_kill() but also schedules kill confirmation if * @confirm_kill is not NULL. @confirm_kill, which may not block, will be * called after @ref is seen as dead from all CPUs at which point all * further invocations of percpu_ref_tryget_live() will fail. See * percpu_ref_tryget_live() for details. * * This function normally doesn't block and can be called from any context * but it may block if @confirm_kill is specified and @ref is in the * process of switching to atomic mode by percpu_ref_switch_to_atomic(). * * There are no implied RCU grace periods between kill and release. */ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill) { unsigned long flags; spin_lock_irqsave(&percpu_ref_switch_lock, flags); WARN_ONCE(percpu_ref_is_dying(ref), "%s called more than once on %ps!", __func__, ref->data->release); ref->percpu_count_ptr |= __PERCPU_REF_DEAD; __percpu_ref_switch_mode(ref, confirm_kill); percpu_ref_put(ref); spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); /** * percpu_ref_is_zero - test whether a percpu refcount reached zero * @ref: percpu_ref to test * * Returns %true if @ref reached zero. * * This function is safe to call as long as @ref is between init and exit. */ bool percpu_ref_is_zero(struct percpu_ref *ref) { unsigned long __percpu *percpu_count; unsigned long count, flags; if (__ref_is_percpu(ref, &percpu_count)) return false; /* protect us from being destroyed */ spin_lock_irqsave(&percpu_ref_switch_lock, flags); if (ref->data) count = atomic_long_read(&ref->data->count); else count = ref->percpu_count_ptr >> __PERCPU_REF_FLAG_BITS; spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); return count == 0; } EXPORT_SYMBOL_GPL(percpu_ref_is_zero); /** * percpu_ref_reinit - re-initialize a percpu refcount * @ref: perpcu_ref to re-initialize * * Re-initialize @ref so that it's in the same state as when it finished * percpu_ref_init() ignoring %PERCPU_REF_INIT_DEAD. @ref must have been * initialized successfully and reached 0 but not exited. * * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while * this function is in progress. */ void percpu_ref_reinit(struct percpu_ref *ref) { WARN_ON_ONCE(!percpu_ref_is_zero(ref)); percpu_ref_resurrect(ref); } EXPORT_SYMBOL_GPL(percpu_ref_reinit); /** * percpu_ref_resurrect - modify a percpu refcount from dead to live * @ref: perpcu_ref to resurrect * * Modify @ref so that it's in the same state as before percpu_ref_kill() was * called. @ref must be dead but must not yet have exited. * * If @ref->release() frees @ref then the caller is responsible for * guaranteeing that @ref->release() does not get called while this * function is in progress. * * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while * this function is in progress. 
*/ void percpu_ref_resurrect(struct percpu_ref *ref) { unsigned long __percpu *percpu_count; unsigned long flags; spin_lock_irqsave(&percpu_ref_switch_lock, flags); WARN_ON_ONCE(!percpu_ref_is_dying(ref)); WARN_ON_ONCE(__ref_is_percpu(ref, &percpu_count)); ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD; percpu_ref_get(ref); __percpu_ref_switch_mode(ref, NULL); spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_resurrect); |
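To make the init/kill/exit lifecycle above concrete, here is a minimal kernel-side sketch. The "gadget" structure and its helpers are hypothetical; only the percpu_ref calls come from the API documented above, and percpu_ref_exit() is placed in the release callback as the kerneldoc suggests.

#include <linux/container_of.h>
#include <linux/percpu-refcount.h>
#include <linux/slab.h>

struct gadget {
	struct percpu_ref ref;
	/* ... payload ... */
};

static void gadget_release(struct percpu_ref *ref)
{
	struct gadget *g = container_of(ref, struct gadget, ref);

	/* Runs once the count reaches zero after percpu_ref_kill(). */
	percpu_ref_exit(&g->ref);
	kfree(g);
}

static struct gadget *gadget_create(void)
{
	struct gadget *g = kzalloc(sizeof(*g), GFP_KERNEL);

	if (!g)
		return NULL;
	/* Starts in percpu mode holding the implicit initial reference. */
	if (percpu_ref_init(&g->ref, gadget_release, 0, GFP_KERNEL)) {
		kfree(g);
		return NULL;
	}
	return g;
}

static void gadget_destroy(struct gadget *g)
{
	/* Drops the initial ref and switches to atomic mode; gadget_release()
	 * runs when the last percpu_ref_put() brings the count to zero. */
	percpu_ref_kill(&g->ref);
}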
| 71 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM x86_fpu #if !defined(_TRACE_FPU_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_FPU_H #include <linux/tracepoint.h> DECLARE_EVENT_CLASS(x86_fpu, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu), TP_STRUCT__entry( __field(struct fpu *, fpu) __field(bool, load_fpu) __field(u64, xfeatures) __field(u64, xcomp_bv) ), TP_fast_assign( __entry->fpu = fpu; __entry->load_fpu = test_thread_flag(TIF_NEED_FPU_LOAD); if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { __entry->xfeatures = fpu->fpstate->regs.xsave.header.xfeatures; __entry->xcomp_bv = fpu->fpstate->regs.xsave.header.xcomp_bv; } ), TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx", __entry->fpu, __entry->load_fpu, __entry->xfeatures, __entry->xcomp_bv ) ); DEFINE_EVENT(x86_fpu, x86_fpu_before_save, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_after_save, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_regs_activated, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_regs_deactivated, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_dropped, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_copy_dst, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_xstate_check_failed, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH asm/trace/ #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE fpu #endif /* _TRACE_FPU_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
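For reference (illustrative only, not part of the header): each DEFINE_EVENT() above generates a trace_x86_fpu_<name>(struct fpu *) call, which the x86 FPU code emits at the matching point. The save_fpu_state() wrapper below is hypothetical.

#include <asm/trace/fpu.h>

static void save_fpu_state(struct fpu *fpu)
{
	/* Records the fpu pointer, TIF_NEED_FPU_LOAD and the xsave header
	 * fields captured by TP_fast_assign() above. */
	trace_x86_fpu_before_save(fpu);

	/* ... actual XSAVE/FXSAVE work elided ... */

	trace_x86_fpu_after_save(fpu);
}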
| 19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 | #ifndef _LINUX_SCHED_ISOLATION_H #define _LINUX_SCHED_ISOLATION_H #include <linux/cpumask.h> #include <linux/init.h> #include <linux/tick.h> enum hk_type { /* Inverse of boot-time isolcpus= argument */ HK_TYPE_DOMAIN_BOOT, /* * Same as HK_TYPE_DOMAIN_BOOT but also includes the * inverse of cpuset isolated partitions. As such it * is always a subset of HK_TYPE_DOMAIN_BOOT. */ HK_TYPE_DOMAIN, /* Inverse of boot-time isolcpus=managed_irq argument */ HK_TYPE_MANAGED_IRQ, /* Inverse of boot-time nohz_full= or isolcpus=nohz arguments */ HK_TYPE_KERNEL_NOISE, HK_TYPE_MAX, /* * The following housekeeping types are only set by the nohz_full * boot commandline option. So they can share the same value. */ HK_TYPE_TICK = HK_TYPE_KERNEL_NOISE, HK_TYPE_TIMER = HK_TYPE_KERNEL_NOISE, HK_TYPE_RCU = HK_TYPE_KERNEL_NOISE, HK_TYPE_MISC = HK_TYPE_KERNEL_NOISE, HK_TYPE_WQ = HK_TYPE_KERNEL_NOISE, HK_TYPE_KTHREAD = HK_TYPE_KERNEL_NOISE }; #ifdef CONFIG_CPU_ISOLATION DECLARE_STATIC_KEY_FALSE(housekeeping_overridden); extern int housekeeping_any_cpu(enum hk_type type); extern const struct cpumask *housekeeping_cpumask(enum hk_type type); extern bool housekeeping_enabled(enum hk_type type); extern void housekeeping_affine(struct task_struct *t, enum hk_type type); extern bool housekeeping_test_cpu(int cpu, enum hk_type type); extern int housekeeping_update(struct cpumask *isol_mask); extern void __init housekeeping_init(void); #else static inline int housekeeping_any_cpu(enum hk_type type) { return smp_processor_id(); } static inline const struct cpumask *housekeeping_cpumask(enum hk_type type) { return cpu_possible_mask; } static inline bool housekeeping_enabled(enum hk_type type) { return false; } static inline void housekeeping_affine(struct task_struct *t, enum hk_type type) { } static inline bool housekeeping_test_cpu(int cpu, enum hk_type type) { return true; } static inline int housekeeping_update(struct cpumask *isol_mask) { return 0; } static inline void housekeeping_init(void) { } #endif /* CONFIG_CPU_ISOLATION */ static inline bool housekeeping_cpu(int cpu, enum hk_type type) { #ifdef CONFIG_CPU_ISOLATION if (static_branch_unlikely(&housekeeping_overridden)) return housekeeping_test_cpu(cpu, type); #endif return true; } static inline bool cpu_is_isolated(int cpu) { return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN); } #endif /* _LINUX_SCHED_ISOLATION_H */ |
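A brief kernel-side sketch of the intended use (illustrative; pick_noise_cpu and confine_helper are made-up names): code that raises timers, IRQ work or kthread activity consults the housekeeping masks instead of cpu_online_mask, so isolated CPUs stay quiet.

#include <linux/sched.h>
#include <linux/sched/isolation.h>

/* Pick a CPU that is allowed to run "kernel noise" such as timers. */
static int pick_noise_cpu(void)
{
	return housekeeping_any_cpu(HK_TYPE_TIMER);
}

/* Keep a helper kthread off isolated CPUs. */
static void confine_helper(struct task_struct *t)
{
	housekeeping_affine(t, HK_TYPE_KTHREAD);
}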
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 | #ifndef __LINUX_SPINLOCK_API_SMP_H #define __LINUX_SPINLOCK_API_SMP_H #ifndef __LINUX_INSIDE_SPINLOCK_H # error "Please do not include this file directly." #endif /* * include/linux/spinlock_api_smp.h * * spinlock API declarations on SMP (and debug) * (implemented in kernel/spinlock.c) * * portions Copyright 2005, Red Hat, Inc., Ingo Molnar * Released under the General Public License (GPL). */ int in_lock_functions(unsigned long addr); #define assert_raw_spin_locked(x) BUG_ON(!raw_spin_is_locked(x)) void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) __acquires(lock); void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) __acquires(lock); void __lockfunc _raw_spin_lock_nest_lock(raw_spinlock_t *lock, struct lockdep_map *map) __acquires(lock); void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) __acquires(lock); void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) __acquires(lock); unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock) __acquires(lock); unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, int subclass) __acquires(lock); int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) __cond_acquires(true, lock); int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) __cond_acquires(true, lock); void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) __releases(lock); void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) __releases(lock); void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) __releases(lock); void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) __releases(lock); #ifdef CONFIG_INLINE_SPIN_LOCK #define _raw_spin_lock(lock) __raw_spin_lock(lock) #endif #ifdef CONFIG_INLINE_SPIN_LOCK_BH #define _raw_spin_lock_bh(lock) __raw_spin_lock_bh(lock) #endif #ifdef CONFIG_INLINE_SPIN_LOCK_IRQ #define _raw_spin_lock_irq(lock) __raw_spin_lock_irq(lock) #endif #ifdef CONFIG_INLINE_SPIN_LOCK_IRQSAVE #define _raw_spin_lock_irqsave(lock) __raw_spin_lock_irqsave(lock) #endif #ifdef CONFIG_INLINE_SPIN_TRYLOCK #define _raw_spin_trylock(lock) __raw_spin_trylock(lock) #endif #ifdef CONFIG_INLINE_SPIN_TRYLOCK_BH #define _raw_spin_trylock_bh(lock) __raw_spin_trylock_bh(lock) #endif #ifndef CONFIG_UNINLINE_SPIN_UNLOCK #define _raw_spin_unlock(lock) __raw_spin_unlock(lock) #endif #ifdef CONFIG_INLINE_SPIN_UNLOCK_BH #define _raw_spin_unlock_bh(lock) __raw_spin_unlock_bh(lock) #endif #ifdef CONFIG_INLINE_SPIN_UNLOCK_IRQ #define _raw_spin_unlock_irq(lock) __raw_spin_unlock_irq(lock) #endif #ifdef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE #define _raw_spin_unlock_irqrestore(lock, flags) __raw_spin_unlock_irqrestore(lock, flags) #endif static inline int __raw_spin_trylock(raw_spinlock_t *lock) __cond_acquires(true, lock) { preempt_disable(); if 
(do_raw_spin_trylock(lock)) { spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); return 1; } preempt_enable(); return 0; } static __always_inline bool _raw_spin_trylock_irq(raw_spinlock_t *lock) __cond_acquires(true, lock) { local_irq_disable(); if (_raw_spin_trylock(lock)) return true; local_irq_enable(); return false; } static __always_inline bool _raw_spin_trylock_irqsave(raw_spinlock_t *lock, unsigned long *flags) __cond_acquires(true, lock) { local_irq_save(*flags); if (_raw_spin_trylock(lock)) return true; local_irq_restore(*flags); return false; } /* * If lockdep is enabled then we use the non-preemption spin-ops * even on CONFIG_PREEMPTION, because lockdep assumes that interrupts are * not re-enabled during lock-acquire (which the preempt-spin-ops do): */ #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) static inline unsigned long __raw_spin_lock_irqsave(raw_spinlock_t *lock) __acquires(lock) __no_context_analysis { unsigned long flags; local_irq_save(flags); preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); return flags; } static inline void __raw_spin_lock_irq(raw_spinlock_t *lock) __acquires(lock) __no_context_analysis { local_irq_disable(); preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); } static inline void __raw_spin_lock_bh(raw_spinlock_t *lock) __acquires(lock) __no_context_analysis { __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); } static inline void __raw_spin_lock(raw_spinlock_t *lock) __acquires(lock) __no_context_analysis { preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); } #endif /* !CONFIG_GENERIC_LOCKBREAK || CONFIG_DEBUG_LOCK_ALLOC */ static inline void __raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) { spin_release(&lock->dep_map, _RET_IP_); do_raw_spin_unlock(lock); preempt_enable(); } static inline void __raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) __releases(lock) { spin_release(&lock->dep_map, _RET_IP_); do_raw_spin_unlock(lock); local_irq_restore(flags); preempt_enable(); } static inline void __raw_spin_unlock_irq(raw_spinlock_t *lock) __releases(lock) { spin_release(&lock->dep_map, _RET_IP_); do_raw_spin_unlock(lock); local_irq_enable(); preempt_enable(); } static inline void __raw_spin_unlock_bh(raw_spinlock_t *lock) __releases(lock) { spin_release(&lock->dep_map, _RET_IP_); do_raw_spin_unlock(lock); __local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); } static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock) __cond_acquires(true, lock) { __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); if (do_raw_spin_trylock(lock)) { spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); return 1; } __local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); return 0; } /* PREEMPT_RT has its own rwlock implementation */ #ifndef CONFIG_PREEMPT_RT #include <linux/rwlock_api_smp.h> #endif #endif /* __LINUX_SPINLOCK_API_SMP_H */ |
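As an illustration of the trylock-with-irqsave pattern implemented by _raw_spin_trylock_irqsave() above (my_lock and try_do_work are hypothetical): the public raw_spin_trylock_irqsave() helper restores the saved interrupt state itself when the lock is contended, so the caller only unlocks on the success path.

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(my_lock);

static bool try_do_work(void)
{
	unsigned long flags;

	/* Interrupt state is restored automatically if the trylock fails. */
	if (!raw_spin_trylock_irqsave(&my_lock, flags))
		return false;

	/* ... critical section ... */

	raw_spin_unlock_irqrestore(&my_lock, flags);
	return true;
}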
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
#include <linux/bpf.h>
#include <linux/bpf_verifier.h>
#include <linux/filter.h>
#include <linux/btf.h>

#define verbose(env, fmt, args...)
bpf_verifier_log_write(env, fmt, ##args) static int check_abnormal_return(struct bpf_verifier_env *env) { int i; for (i = 1; i < env->subprog_cnt; i++) { if (env->subprog_info[i].has_ld_abs) { verbose(env, "LD_ABS is not allowed in subprogs without BTF\n"); return -EINVAL; } if (env->subprog_info[i].has_tail_call) { verbose(env, "tail_call is not allowed in subprogs without BTF\n"); return -EINVAL; } } return 0; } /* The minimum supported BTF func info size */ #define MIN_BPF_FUNCINFO_SIZE 8 #define MAX_FUNCINFO_REC_SIZE 252 static int check_btf_func_early(struct bpf_verifier_env *env, const union bpf_attr *attr, bpfptr_t uattr) { u32 krec_size = sizeof(struct bpf_func_info); const struct btf_type *type, *func_proto; u32 i, nfuncs, urec_size, min_size; struct bpf_func_info *krecord; struct bpf_prog *prog; const struct btf *btf; u32 prev_offset = 0; bpfptr_t urecord; int ret = -ENOMEM; nfuncs = attr->func_info_cnt; if (!nfuncs) { if (check_abnormal_return(env)) return -EINVAL; return 0; } urec_size = attr->func_info_rec_size; if (urec_size < MIN_BPF_FUNCINFO_SIZE || urec_size > MAX_FUNCINFO_REC_SIZE || urec_size % sizeof(u32)) { verbose(env, "invalid func info rec size %u\n", urec_size); return -EINVAL; } prog = env->prog; btf = prog->aux->btf; urecord = make_bpfptr(attr->func_info, uattr.is_kernel); min_size = min_t(u32, krec_size, urec_size); krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!krecord) return -ENOMEM; for (i = 0; i < nfuncs; i++) { ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size); if (ret) { if (ret == -E2BIG) { verbose(env, "nonzero tailing record in func info"); /* set the size kernel expects so loader can zero * out the rest of the record. */ if (copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, func_info_rec_size), &min_size, sizeof(min_size))) ret = -EFAULT; } goto err_free; } if (copy_from_bpfptr(&krecord[i], urecord, min_size)) { ret = -EFAULT; goto err_free; } /* check insn_off */ ret = -EINVAL; if (i == 0) { if (krecord[i].insn_off) { verbose(env, "nonzero insn_off %u for the first func info record", krecord[i].insn_off); goto err_free; } } else if (krecord[i].insn_off <= prev_offset) { verbose(env, "same or smaller insn offset (%u) than previous func info record (%u)", krecord[i].insn_off, prev_offset); goto err_free; } /* check type_id */ type = btf_type_by_id(btf, krecord[i].type_id); if (!type || !btf_type_is_func(type)) { verbose(env, "invalid type id %d in func info", krecord[i].type_id); goto err_free; } func_proto = btf_type_by_id(btf, type->type); if (unlikely(!func_proto || !btf_type_is_func_proto(func_proto))) /* btf_func_check() already verified it during BTF load */ goto err_free; prev_offset = krecord[i].insn_off; bpfptr_add(&urecord, urec_size); } prog->aux->func_info = krecord; prog->aux->func_info_cnt = nfuncs; return 0; err_free: kvfree(krecord); return ret; } static int check_btf_func(struct bpf_verifier_env *env, const union bpf_attr *attr, bpfptr_t uattr) { const struct btf_type *type, *func_proto, *ret_type; u32 i, nfuncs, urec_size; struct bpf_func_info *krecord; struct bpf_func_info_aux *info_aux = NULL; struct bpf_prog *prog; const struct btf *btf; bpfptr_t urecord; bool scalar_return; int ret = -ENOMEM; nfuncs = attr->func_info_cnt; if (!nfuncs) { if (check_abnormal_return(env)) return -EINVAL; return 0; } if (nfuncs != env->subprog_cnt) { verbose(env, "number of funcs in func_info doesn't match number of subprogs\n"); return -EINVAL; } urec_size = attr->func_info_rec_size; prog = env->prog; 
btf = prog->aux->btf; urecord = make_bpfptr(attr->func_info, uattr.is_kernel); krecord = prog->aux->func_info; info_aux = kzalloc_objs(*info_aux, nfuncs, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!info_aux) return -ENOMEM; for (i = 0; i < nfuncs; i++) { /* check insn_off */ ret = -EINVAL; if (env->subprog_info[i].start != krecord[i].insn_off) { verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n"); goto err_free; } /* Already checked type_id */ type = btf_type_by_id(btf, krecord[i].type_id); info_aux[i].linkage = BTF_INFO_VLEN(type->info); /* Already checked func_proto */ func_proto = btf_type_by_id(btf, type->type); ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL); scalar_return = btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type); if (i && !scalar_return && env->subprog_info[i].has_ld_abs) { verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n"); goto err_free; } if (i && !scalar_return && env->subprog_info[i].has_tail_call) { verbose(env, "tail_call is only allowed in functions that return 'int'.\n"); goto err_free; } env->subprog_info[i].name = btf_name_by_offset(btf, type->name_off); bpfptr_add(&urecord, urec_size); } prog->aux->func_info_aux = info_aux; return 0; err_free: kfree(info_aux); return ret; } #define MIN_BPF_LINEINFO_SIZE offsetofend(struct bpf_line_info, line_col) #define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE static int check_btf_line(struct bpf_verifier_env *env, const union bpf_attr *attr, bpfptr_t uattr) { u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0; struct bpf_subprog_info *sub; struct bpf_line_info *linfo; struct bpf_prog *prog; const struct btf *btf; bpfptr_t ulinfo; int err; nr_linfo = attr->line_info_cnt; if (!nr_linfo) return 0; if (nr_linfo > INT_MAX / sizeof(struct bpf_line_info)) return -EINVAL; rec_size = attr->line_info_rec_size; if (rec_size < MIN_BPF_LINEINFO_SIZE || rec_size > MAX_LINEINFO_REC_SIZE || rec_size & (sizeof(u32) - 1)) return -EINVAL; /* Need to zero it in case the userspace may * pass in a smaller bpf_line_info object. */ linfo = kvzalloc_objs(struct bpf_line_info, nr_linfo, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!linfo) return -ENOMEM; prog = env->prog; btf = prog->aux->btf; s = 0; sub = env->subprog_info; ulinfo = make_bpfptr(attr->line_info, uattr.is_kernel); expected_size = sizeof(struct bpf_line_info); ncopy = min_t(u32, expected_size, rec_size); for (i = 0; i < nr_linfo; i++) { err = bpf_check_uarg_tail_zero(ulinfo, expected_size, rec_size); if (err) { if (err == -E2BIG) { verbose(env, "nonzero tailing record in line_info"); if (copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, line_info_rec_size), &expected_size, sizeof(expected_size))) err = -EFAULT; } goto err_free; } if (copy_from_bpfptr(&linfo[i], ulinfo, ncopy)) { err = -EFAULT; goto err_free; } /* * Check insn_off to ensure * 1) strictly increasing AND * 2) bounded by prog->len * * The linfo[0].insn_off == 0 check logically falls into * the later "missing bpf_line_info for func..." case * because the first linfo[0].insn_off must be the * first sub also and the first sub must have * subprog_info[0].start == 0. 
*/ if ((i && linfo[i].insn_off <= prev_offset) || linfo[i].insn_off >= prog->len) { verbose(env, "Invalid line_info[%u].insn_off:%u (prev_offset:%u prog->len:%u)\n", i, linfo[i].insn_off, prev_offset, prog->len); err = -EINVAL; goto err_free; } if (!prog->insnsi[linfo[i].insn_off].code) { verbose(env, "Invalid insn code at line_info[%u].insn_off\n", i); err = -EINVAL; goto err_free; } if (!btf_name_by_offset(btf, linfo[i].line_off) || !btf_name_by_offset(btf, linfo[i].file_name_off)) { verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i); err = -EINVAL; goto err_free; } if (s != env->subprog_cnt) { if (linfo[i].insn_off == sub[s].start) { sub[s].linfo_idx = i; s++; } else if (sub[s].start < linfo[i].insn_off) { verbose(env, "missing bpf_line_info for func#%u\n", s); err = -EINVAL; goto err_free; } } prev_offset = linfo[i].insn_off; bpfptr_add(&ulinfo, rec_size); } if (s != env->subprog_cnt) { verbose(env, "missing bpf_line_info for %u funcs starting from func#%u\n", env->subprog_cnt - s, s); err = -EINVAL; goto err_free; } prog->aux->linfo = linfo; prog->aux->nr_linfo = nr_linfo; return 0; err_free: kvfree(linfo); return err; } #define MIN_CORE_RELO_SIZE sizeof(struct bpf_core_relo) #define MAX_CORE_RELO_SIZE MAX_FUNCINFO_REC_SIZE static int check_core_relo(struct bpf_verifier_env *env, const union bpf_attr *attr, bpfptr_t uattr) { u32 i, nr_core_relo, ncopy, expected_size, rec_size; struct bpf_core_relo core_relo = {}; struct bpf_prog *prog = env->prog; const struct btf *btf = prog->aux->btf; struct bpf_core_ctx ctx = { .log = &env->log, .btf = btf, }; bpfptr_t u_core_relo; int err; nr_core_relo = attr->core_relo_cnt; if (!nr_core_relo) return 0; if (nr_core_relo > INT_MAX / sizeof(struct bpf_core_relo)) return -EINVAL; rec_size = attr->core_relo_rec_size; if (rec_size < MIN_CORE_RELO_SIZE || rec_size > MAX_CORE_RELO_SIZE || rec_size % sizeof(u32)) return -EINVAL; u_core_relo = make_bpfptr(attr->core_relos, uattr.is_kernel); expected_size = sizeof(struct bpf_core_relo); ncopy = min_t(u32, expected_size, rec_size); /* Unlike func_info and line_info, copy and apply each CO-RE * relocation record one at a time. 
*/ for (i = 0; i < nr_core_relo; i++) { /* future proofing when sizeof(bpf_core_relo) changes */ err = bpf_check_uarg_tail_zero(u_core_relo, expected_size, rec_size); if (err) { if (err == -E2BIG) { verbose(env, "nonzero tailing record in core_relo"); if (copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, core_relo_rec_size), &expected_size, sizeof(expected_size))) err = -EFAULT; } break; } if (copy_from_bpfptr(&core_relo, u_core_relo, ncopy)) { err = -EFAULT; break; } if (core_relo.insn_off % 8 || core_relo.insn_off / 8 >= prog->len) { verbose(env, "Invalid core_relo[%u].insn_off:%u prog->len:%u\n", i, core_relo.insn_off, prog->len); err = -EINVAL; break; } err = bpf_core_apply(&ctx, &core_relo, i, &prog->insnsi[core_relo.insn_off / 8]); if (err) break; bpfptr_add(&u_core_relo, rec_size); } return err; } int bpf_check_btf_info_early(struct bpf_verifier_env *env, const union bpf_attr *attr, bpfptr_t uattr) { struct btf *btf; int err; if (!attr->func_info_cnt && !attr->line_info_cnt) { if (check_abnormal_return(env)) return -EINVAL; return 0; } btf = btf_get_by_fd(attr->prog_btf_fd); if (IS_ERR(btf)) return PTR_ERR(btf); if (btf_is_kernel(btf)) { btf_put(btf); return -EACCES; } env->prog->aux->btf = btf; err = check_btf_func_early(env, attr, uattr); if (err) return err; return 0; } int bpf_check_btf_info(struct bpf_verifier_env *env, const union bpf_attr *attr, bpfptr_t uattr) { int err; if (!attr->func_info_cnt && !attr->line_info_cnt) { if (check_abnormal_return(env)) return -EINVAL; return 0; } err = check_btf_func(env, attr, uattr); if (err) return err; err = check_btf_line(env, attr, uattr); if (err) return err; err = check_core_relo(env, attr, uattr); if (err) return err; return 0; } |
11704 11705 11706 11707 11708 11709 11710 11711 11712 11713 11714 11715 11716 11717 11718 11719 11720 11721 11722 11723 11724 11725 11726 11727 11728 11729 11730 11731 11732 11733 11734 11735 11736 11737 11738 11739 11740 11741 11742 11743 11744 11745 11746 11747 11748 11749 11750 11751 11752 11753 11754 11755 11756 11757 11758 11759 11760 11761 11762 11763 11764 11765 11766 11767 11768 11769 11770 11771 11772 11773 11774 11775 11776 11777 11778 11779 11780 11781 11782 11783 11784 11785 11786 11787 11788 11789 11790 11791 11792 11793 11794 11795 11796 11797 11798 11799 11800 11801 11802 11803 11804 11805 11806 11807 11808 11809 11810 11811 11812 11813 11814 11815 11816 11817 11818 11819 11820 11821 11822 11823 11824 11825 11826 11827 11828 11829 11830 11831 11832 11833 11834 11835 11836 11837 11838 11839 11840 11841 11842 11843 11844 11845 11846 11847 11848 11849 11850 11851 11852 11853 11854 11855 11856 11857 11858 11859 11860 11861 11862 11863 11864 11865 11866 11867 11868 11869 11870 11871 11872 11873 11874 11875 11876 11877 11878 11879 11880 11881 11882 11883 11884 11885 11886 11887 11888 11889 11890 11891 11892 11893 11894 11895 11896 11897 11898 11899 11900 11901 11902 11903 11904 11905 11906 11907 11908 11909 11910 11911 11912 11913 11914 11915 11916 11917 11918 11919 11920 11921 11922 11923 11924 11925 11926 11927 11928 11929 11930 11931 11932 11933 11934 11935 11936 11937 11938 11939 11940 11941 11942 11943 11944 11945 11946 11947 11948 11949 11950 11951 11952 11953 11954 11955 11956 11957 11958 11959 11960 11961 11962 11963 11964 11965 11966 11967 11968 11969 11970 11971 11972 11973 11974 11975 11976 11977 11978 11979 11980 11981 11982 11983 11984 11985 11986 11987 11988 11989 11990 11991 11992 11993 11994 11995 11996 11997 11998 11999 12000 12001 12002 12003 12004 12005 12006 12007 12008 12009 12010 12011 12012 12013 12014 12015 12016 12017 12018 12019 12020 12021 12022 12023 12024 12025 12026 12027 12028 12029 12030 12031 12032 12033 12034 12035 12036 12037 12038 12039 12040 12041 12042 12043 12044 12045 12046 12047 12048 12049 12050 12051 12052 12053 12054 12055 12056 12057 12058 12059 12060 12061 12062 12063 12064 12065 12066 12067 12068 12069 12070 12071 12072 12073 12074 12075 12076 12077 12078 12079 12080 12081 12082 12083 12084 12085 12086 12087 12088 12089 12090 12091 12092 12093 12094 12095 12096 12097 12098 12099 12100 12101 12102 12103 12104 12105 12106 12107 12108 12109 12110 12111 12112 12113 12114 12115 12116 12117 12118 12119 12120 12121 12122 12123 12124 12125 12126 12127 12128 12129 12130 12131 12132 12133 12134 12135 12136 12137 12138 12139 12140 12141 12142 12143 12144 12145 12146 12147 12148 12149 12150 12151 12152 12153 12154 12155 12156 12157 12158 12159 12160 12161 12162 12163 12164 12165 12166 12167 12168 12169 12170 12171 12172 12173 12174 12175 12176 12177 12178 12179 12180 12181 12182 12183 12184 12185 12186 12187 12188 12189 12190 12191 12192 12193 12194 12195 12196 12197 12198 12199 12200 12201 12202 12203 12204 12205 12206 12207 12208 12209 12210 12211 12212 12213 12214 12215 12216 12217 12218 12219 12220 12221 12222 12223 12224 12225 12226 12227 12228 12229 12230 12231 12232 12233 12234 12235 12236 12237 12238 12239 12240 12241 12242 12243 12244 12245 12246 12247 12248 12249 12250 12251 12252 12253 12254 12255 12256 12257 12258 12259 12260 12261 12262 12263 12264 12265 12266 12267 12268 12269 12270 12271 12272 12273 12274 12275 12276 12277 12278 12279 12280 12281 12282 12283 12284 12285 12286 12287 12288 12289 12290 12291 12292 12293 12294 12295 
12296 12297 12298 12299 12300 12301 12302 12303 12304 12305 12306 12307 12308 12309 12310 12311 12312 12313 12314 12315 12316 12317 12318 12319 12320 12321 12322 12323 12324 12325 12326 12327 12328 12329 12330 12331 12332 12333 12334 12335 12336 12337 12338 12339 12340 12341 12342 12343 12344 12345 12346 12347 12348 12349 12350 12351 12352 12353 12354 12355 12356 12357 12358 12359 12360 12361 12362 12363 12364 12365 12366 12367 12368 12369 12370 12371 12372 12373 12374 12375 12376 12377 12378 12379 12380 12381 12382 12383 12384 12385 12386 12387 12388 12389 12390 12391 12392 12393 12394 12395 12396 12397 12398 12399 12400 12401 12402 12403 12404 12405 12406 12407 12408 12409 12410 12411 12412 12413 12414 12415 12416 12417 12418 12419 12420 12421 12422 12423 12424 12425 12426 12427 12428 12429 12430 12431 12432 12433 12434 12435 12436 12437 12438 12439 12440 12441 12442 12443 12444 12445 12446 12447 12448 12449 12450 12451 12452 12453 12454 12455 12456 12457 12458 12459 12460 12461 12462 12463 12464 12465 12466 12467 12468 12469 12470 12471 12472 12473 12474 12475 12476 12477 12478 12479 12480 12481 12482 12483 12484 12485 12486 12487 12488 12489 12490 12491 12492 12493 12494 12495 12496 12497 12498 12499 12500 12501 12502 12503 12504 12505 12506 12507 12508 12509 12510 12511 12512 12513 12514 12515 12516 12517 12518 12519 12520 12521 12522 12523 12524 12525 12526 12527 12528 12529 12530 12531 12532 12533 12534 12535 12536 12537 12538 12539 12540 12541 12542 12543 12544 12545 12546 12547 12548 12549 12550 12551 12552 12553 12554 12555 12556 12557 12558 12559 12560 12561 12562 12563 12564 12565 12566 12567 12568 12569 12570 12571 12572 12573 12574 12575 12576 12577 12578 12579 12580 12581 12582 12583 12584 12585 12586 12587 12588 12589 12590 12591 12592 12593 12594 12595 12596 12597 12598 12599 12600 12601 12602 12603 12604 12605 12606 12607 12608 12609 12610 12611 12612 12613 12614 12615 12616 12617 12618 12619 12620 12621 12622 12623 12624 12625 12626 12627 12628 12629 12630 12631 12632 12633 12634 12635 12636 12637 12638 12639 12640 12641 12642 12643 12644 12645 12646 12647 12648 12649 12650 12651 12652 12653 12654 12655 12656 12657 12658 12659 12660 12661 12662 12663 12664 12665 12666 12667 12668 12669 12670 12671 12672 12673 12674 12675 12676 12677 12678 12679 12680 12681 12682 12683 12684 12685 12686 12687 12688 12689 12690 12691 12692 12693 12694 12695 12696 12697 12698 12699 12700 12701 12702 12703 12704 12705 12706 12707 12708 12709 12710 12711 12712 12713 12714 12715 12716 12717 12718 12719 12720 12721 12722 12723 12724 12725 12726 12727 12728 12729 12730 12731 12732 12733 12734 12735 12736 12737 12738 12739 12740 12741 12742 12743 12744 12745 12746 12747 12748 12749 12750 12751 12752 12753 12754 12755 12756 12757 12758 12759 12760 12761 12762 12763 12764 12765 12766 12767 12768 12769 12770 12771 12772 12773 12774 12775 12776 12777 12778 12779 12780 12781 12782 12783 12784 12785 12786 12787 12788 12789 12790 12791 12792 12793 12794 12795 12796 12797 12798 12799 12800 12801 12802 12803 12804 12805 12806 12807 12808 12809 12810 12811 12812 12813 12814 12815 12816 12817 12818 12819 12820 12821 12822 12823 12824 12825 12826 12827 12828 12829 12830 12831 12832 12833 12834 12835 12836 12837 12838 12839 12840 12841 12842 12843 12844 12845 12846 12847 12848 12849 12850 12851 12852 12853 12854 12855 12856 12857 12858 12859 12860 12861 12862 12863 12864 12865 12866 12867 12868 12869 12870 12871 12872 12873 12874 12875 12876 12877 12878 12879 12880 12881 12882 12883 12884 12885 12886 12887 
12888 12889 12890 12891 12892 12893 12894 12895 12896 12897 12898 12899 12900 12901 12902 12903 12904 12905 12906 12907 12908 12909 12910 12911 12912 12913 12914 12915 12916 12917 12918 12919 12920 12921 12922 12923 12924 12925 12926 12927 12928 12929 12930 12931 12932 12933 12934 12935 12936 12937 12938 12939 12940 12941 12942 12943 12944 12945 12946 12947 12948 12949 12950 12951 12952 12953 12954 12955 12956 12957 12958 12959 12960 12961 12962 12963 12964 12965 12966 12967 12968 12969 12970 12971 12972 12973 12974 12975 12976 12977 12978 12979 12980 12981 12982 12983 12984 12985 12986 12987 12988 12989 12990 12991 12992 12993 12994 12995 12996 12997 12998 12999 13000 13001 13002 13003 13004 13005 13006 13007 13008 13009 13010 13011 13012 13013 13014 13015 13016 13017 13018 13019 13020 13021 13022 13023 13024 13025 13026 13027 13028 13029 13030 13031 13032 13033 13034 13035 13036 13037 13038 13039 13040 13041 13042 13043 13044 13045 13046 13047 13048 13049 13050 13051 13052 13053 13054 13055 13056 13057 13058 13059 13060 13061 13062 13063 13064 13065 13066 13067 13068 13069 13070 13071 13072 13073 13074 13075 13076 13077 13078 13079 13080 13081 13082 13083 13084 13085 13086 13087 13088 13089 13090 13091 13092 13093 13094 13095 13096 13097 13098 13099 13100 13101 13102 13103 13104 13105 13106 13107 13108 13109 13110 13111 13112 13113 13114 13115 13116 13117 13118 13119 13120 13121 13122 13123 13124 13125 13126 13127 13128 13129 13130 13131 13132 13133 13134 13135 13136 13137 13138 13139 13140 13141 13142 13143 13144 13145 13146 13147 13148 13149 13150 13151 13152 13153 13154 13155 13156 13157 13158 13159 13160 13161 13162 13163 13164 13165 13166 13167 13168 13169 13170 13171 13172 13173 13174 13175 13176 13177 13178 13179 13180 13181 13182 13183 13184 13185 13186 13187 13188 13189 13190 13191 13192 13193 13194 13195 13196 13197 13198 13199 13200 13201 13202 13203 13204 13205 13206 13207 13208 13209 13210 13211 13212 13213 13214 13215 13216 13217 13218 13219 13220 13221 13222 13223 13224 13225 13226 13227 13228 13229 13230 13231 13232 13233 13234 13235 13236 13237 13238 13239 13240 13241 13242 13243 13244 13245 13246 13247 13248 13249 13250 13251 13252 13253 13254 13255 13256 13257 13258 13259 13260 13261 13262 13263 13264 13265 13266 13267 13268 13269 13270 13271 13272 13273 13274 13275 13276 13277 13278 13279 13280 13281 13282 13283 13284 13285 13286 13287 13288 13289 13290 13291 13292 13293 13294 13295 13296 13297 13298 13299 13300 13301 13302 13303 13304 13305 13306 13307 13308 13309 13310 13311 13312 13313 13314 13315 13316 13317 13318 13319 13320 13321 13322 13323 13324 13325 13326 13327 13328 13329 13330 13331 13332 13333 13334 13335 13336 13337 13338 13339 13340 13341 13342 13343 13344 13345 13346 13347 13348 13349 13350 13351 13352 13353 13354 13355 13356 13357 13358 13359 13360 13361 13362 13363 13364 13365 13366 13367 13368 13369 13370 13371 13372 13373 13374 13375 13376 13377 13378 13379 13380 13381 13382 13383 13384 13385 13386 13387 13388 13389 13390 13391 13392 13393 13394 13395 13396 13397 13398 13399 13400 13401 13402 13403 13404 13405 13406 13407 13408 13409 13410 13411 13412 13413 13414 13415 13416 13417 13418 13419 13420 13421 13422 13423 13424 13425 13426 13427 13428 13429 13430 13431 13432 13433 13434 13435 13436 13437 13438 13439 13440 13441 13442 13443 13444 13445 13446 13447 13448 13449 13450 13451 13452 13453 13454 13455 13456 13457 13458 13459 13460 13461 13462 13463 13464 13465 13466 13467 13468 13469 13470 13471 13472 13473 13474 13475 13476 13477 13478 13479 
13480 13481 13482 13483 13484 13485 13486 13487 13488 13489 13490 13491 13492 13493 13494 13495 13496 13497 13498 13499 13500 13501 13502 13503 13504 13505 13506 13507 13508 13509 13510 13511 13512 13513 13514 13515 13516 13517 13518 13519 13520 13521 13522 13523 13524 13525 13526 13527 13528 13529 13530 13531 13532 13533 13534 13535 13536 13537 13538 13539 13540 13541 13542 13543 13544 13545 13546 13547 13548 13549 13550 13551 13552 13553 13554 13555 13556 13557 13558 13559 13560 13561 13562 13563 13564 13565 13566 13567 13568 13569 13570 13571 13572 13573 13574 13575 13576 13577 13578 13579 13580 13581 13582 13583 13584 13585 13586 13587 13588 13589 13590 13591 13592 13593 13594 13595 13596 13597 13598 13599 13600 13601 13602 13603 13604 13605 13606 13607 13608 13609 13610 13611 13612 13613 13614 13615 13616 13617 13618 13619 13620 13621 13622 13623 13624 13625 13626 13627 13628 13629 13630 13631 13632 13633 13634 13635 13636 13637 13638 13639 13640 13641 13642 13643 13644 13645 13646 13647 13648 13649 13650 13651 13652 13653 13654 13655 13656 13657 13658 13659 13660 13661 13662 13663 13664 13665 13666 13667 13668 13669 13670 13671 13672 13673 13674 13675 13676 13677 13678 13679 13680 13681 13682 13683 13684 13685 13686 13687 13688 13689 13690 13691 13692 13693 13694 13695 13696 13697 13698 13699 13700 13701 13702 13703 13704 13705 13706 13707 13708 13709 13710 13711 13712 13713 13714 13715 13716 13717 13718 13719 13720 13721 13722 13723 13724 13725 13726 13727 13728 13729 13730 13731 13732 13733 13734 13735 13736 13737 13738 13739 13740 13741 13742 13743 13744 13745 13746 13747 13748 13749 13750 13751 13752 13753 13754 13755 13756 13757 13758 13759 13760 13761 13762 13763 13764 13765 13766 13767 13768 13769 13770 13771 13772 13773 13774 13775 13776 13777 13778 13779 13780 13781 13782 13783 13784 13785 13786 13787 13788 13789 13790 13791 13792 13793 13794 13795 13796 13797 13798 13799 13800 13801 13802 13803 13804 13805 13806 13807 13808 13809 13810 13811 13812 13813 13814 13815 13816 13817 13818 13819 13820 13821 13822 13823 13824 13825 13826 13827 13828 13829 13830 13831 13832 13833 13834 13835 13836 13837 13838 13839 13840 13841 13842 13843 13844 13845 13846 13847 13848 13849 13850 13851 13852 13853 13854 13855 13856 13857 13858 13859 13860 13861 13862 13863 13864 13865 13866 13867 13868 13869 13870 13871 13872 13873 13874 13875 13876 13877 13878 13879 13880 13881 13882 13883 13884 13885 13886 13887 13888 13889 13890 13891 13892 13893 13894 13895 13896 13897 13898 13899 13900 13901 13902 13903 13904 13905 13906 13907 13908 13909 13910 13911 13912 13913 13914 13915 13916 13917 13918 13919 13920 13921 13922 13923 13924 13925 13926 13927 13928 13929 13930 13931 13932 13933 13934 13935 13936 13937 13938 13939 13940 13941 13942 13943 13944 13945 13946 13947 13948 13949 13950 13951 13952 13953 13954 13955 13956 13957 13958 13959 13960 13961 13962 13963 13964 13965 13966 13967 13968 13969 13970 13971 13972 13973 13974 13975 13976 13977 13978 13979 13980 13981 13982 13983 13984 13985 13986 13987 13988 13989 13990 13991 13992 13993 13994 13995 13996 13997 13998 13999 14000 14001 14002 14003 14004 14005 14006 14007 14008 14009 14010 14011 14012 14013 14014 14015 14016 14017 14018 14019 14020 14021 14022 14023 14024 14025 14026 14027 14028 14029 14030 14031 14032 14033 14034 14035 14036 14037 14038 14039 14040 14041 14042 14043 14044 14045 14046 14047 14048 14049 14050 14051 14052 14053 14054 14055 14056 14057 14058 14059 14060 14061 14062 14063 14064 14065 14066 14067 14068 14069 14070 14071 
14072 14073 14074 14075 14076 14077 14078 14079 14080 14081 14082 14083 14084 14085 14086 14087 14088 14089 14090 14091 14092 14093 14094 14095 14096 14097 14098 14099 14100 14101 14102 14103 14104 14105 14106 14107 14108 14109 14110 14111 14112 14113 14114 14115 14116 14117 14118 14119 14120 14121 14122 14123 14124 14125 14126 14127 14128 14129 14130 14131 14132 14133 14134 14135 14136 14137 14138 14139 14140 14141 14142 14143 14144 14145 14146 14147 14148 14149 14150 14151 14152 14153 14154 14155 14156 14157 14158 14159 14160 14161 14162 14163 14164 14165 14166 14167 14168 14169 14170 14171 14172 14173 14174 14175 14176 14177 14178 14179 14180 14181 14182 14183 14184 14185 14186 14187 14188 14189 14190 14191 14192 14193 14194 14195 14196 14197 14198 14199 14200 14201 14202 14203 14204 14205 14206 14207 14208 14209 14210 14211 14212 14213 14214 14215 14216 14217 14218 14219 14220 14221 14222 14223 14224 14225 14226 14227 14228 14229 14230 14231 14232 14233 14234 14235 14236 14237 14238 14239 14240 14241 14242 14243 14244 14245 14246 14247 14248 14249 14250 14251 14252 14253 14254 14255 14256 14257 14258 14259 14260 14261 14262 14263 14264 14265 14266 14267 14268 14269 14270 14271 14272 14273 14274 14275 14276 14277 14278 14279 14280 14281 14282 14283 14284 14285 14286 14287 14288 14289 14290 14291 14292 14293 14294 14295 14296 14297 14298 14299 14300 14301 14302 14303 14304 14305 14306 14307 14308 14309 14310 14311 14312 14313 14314 14315 14316 14317 14318 14319 14320 14321 14322 14323 14324 14325 14326 14327 14328 14329 14330 14331 14332 14333 14334 14335 14336 14337 14338 14339 14340 14341 14342 14343 14344 14345 14346 14347 14348 14349 14350 14351 14352 14353 14354 14355 14356 14357 14358 14359 14360 14361 14362 14363 14364 14365 14366 14367 14368 14369 14370 14371 14372 14373 14374 14375 14376 14377 14378 14379 14380 14381 14382 14383 14384 14385 14386 14387 14388 14389 14390 14391 14392 14393 14394 14395 14396 14397 14398 14399 14400 14401 14402 14403 14404 14405 14406 14407 14408 14409 14410 14411 14412 14413 14414 14415 14416 14417 14418 14419 14420 14421 14422 14423 14424 14425 14426 14427 14428 14429 14430 14431 14432 14433 14434 14435 14436 14437 14438 14439 14440 14441 14442 14443 14444 14445 14446 14447 14448 14449 14450 14451 14452 14453 14454 14455 14456 14457 14458 14459 14460 14461 14462 14463 14464 14465 14466 14467 14468 14469 14470 14471 14472 14473 14474 14475 14476 14477 14478 14479 14480 14481 14482 14483 14484 14485 14486 14487 14488 14489 14490 14491 14492 14493 14494 14495 14496 14497 14498 14499 14500 14501 14502 14503 14504 14505 14506 14507 14508 14509 14510 14511 14512 14513 14514 14515 14516 14517 14518 14519 14520 14521 14522 14523 14524 14525 14526 14527 14528 14529 14530 14531 14532 14533 14534 14535 14536 14537 14538 14539 14540 14541 14542 14543 14544 14545 14546 14547 14548 14549 14550 14551 14552 14553 14554 14555 14556 14557 14558 14559 14560 14561 14562 14563 14564 14565 14566 14567 14568 14569 14570 14571 14572 14573 14574 14575 14576 14577 14578 14579 14580 14581 14582 14583 14584 14585 14586 14587 14588 14589 14590 14591 14592 14593 14594 14595 14596 14597 14598 14599 14600 14601 14602 14603 14604 14605 14606 14607 14608 14609 14610 14611 14612 14613 14614 14615 14616 14617 14618 14619 14620 14621 14622 14623 14624 14625 14626 14627 14628 14629 14630 14631 14632 14633 14634 14635 14636 14637 14638 14639 14640 14641 14642 14643 14644 14645 14646 14647 14648 14649 14650 14651 14652 14653 14654 14655 14656 14657 14658 14659 14660 14661 14662 14663 
14664 14665 14666 14667 14668 14669 14670 14671 14672 14673 14674 14675 14676 14677 14678 14679 14680 14681 14682 14683 14684 14685 14686 14687 14688 14689 14690 14691 14692 14693 14694 14695 14696 14697 14698 14699 14700 14701 14702 14703 14704 14705 14706 14707 14708 14709 14710 14711 14712 14713 14714 14715 14716 14717 14718 14719 14720 14721 14722 14723 14724 14725 14726 14727 14728 14729 14730 14731 14732 14733 14734 14735 14736 14737 14738 14739 14740 14741 14742 14743 14744 14745 14746 14747 14748 14749 14750 14751 14752 14753 14754 14755 14756 14757 14758 14759 14760 14761 14762 14763 14764 14765 14766 14767 14768 14769 14770 14771 14772 14773 14774 14775 14776 14777 14778 14779 14780 14781 14782 14783 14784 14785 14786 14787 14788 14789 14790 14791 14792 14793 14794 14795 14796 14797 14798 14799 14800 14801 14802 14803 14804 14805 14806 14807 14808 14809 14810 14811 14812 14813 14814 14815 14816 14817 14818 14819 14820 14821 14822 14823 14824 14825 14826 14827 14828 14829 14830 14831 14832 14833 14834 14835 14836 14837 14838 14839 14840 14841 14842 14843 14844 14845 14846 14847 14848 14849 14850 14851 14852 14853 14854 14855 14856 14857 14858 14859 14860 14861 14862 14863 14864 14865 14866 14867 14868 14869 14870 14871 14872 14873 14874 14875 14876 14877 14878 14879 14880 14881 14882 14883 14884 14885 14886 14887 14888 14889 14890 14891 14892 14893 14894 14895 14896 14897 14898 14899 14900 14901 14902 14903 14904 14905 14906 14907 14908 14909 14910 14911 14912 14913 14914 14915 14916 14917 14918 14919 14920 14921 14922 14923 14924 14925 14926 14927 14928 14929 14930 14931 14932 14933 14934 14935 14936 14937 14938 14939 14940 14941 14942 14943 14944 14945 14946 14947 14948 14949 14950 14951 14952 14953 14954 14955 14956 14957 14958 14959 14960 14961 14962 14963 14964 14965 14966 14967 14968 14969 14970 14971 14972 14973 14974 14975 14976 14977 14978 14979 14980 14981 14982 14983 14984 14985 14986 14987 14988 14989 14990 14991 14992 14993 14994 14995 14996 14997 14998 14999 15000 15001 15002 15003 15004 15005 15006 15007 15008 15009 15010 15011 15012 15013 15014 15015 15016 15017 15018 15019 15020 15021 15022 15023 15024 15025 15026 15027 15028 15029 15030 15031 15032 15033 15034 15035 15036 15037 15038 15039 15040 15041 15042 15043 15044 15045 15046 15047 15048 15049 15050 15051 15052 15053 15054 15055 15056 15057 15058 15059 15060 15061 15062 15063 15064 15065 15066 15067 15068 15069 15070 15071 15072 15073 15074 15075 15076 15077 15078 15079 15080 15081 15082 15083 15084 15085 15086 15087 15088 15089 15090 15091 15092 15093 15094 15095 15096 15097 15098 15099 15100 15101 15102 15103 15104 15105 15106 15107 15108 15109 15110 15111 15112 15113 15114 15115 15116 15117 15118 15119 15120 15121 15122 15123 15124 15125 15126 15127 15128 15129 15130 15131 15132 15133 15134 15135 15136 15137 15138 15139 15140 15141 15142 15143 15144 15145 15146 15147 15148 15149 15150 15151 15152 15153 15154 15155 15156 15157 15158 15159 15160 15161 15162 15163 15164 15165 15166 15167 15168 15169 15170 15171 15172 15173 15174 15175 15176 15177 15178 15179 15180 15181 15182 15183 15184 15185 15186 15187 15188 15189 15190 15191 15192 15193 15194 15195 15196 15197 15198 15199 15200 15201 15202 15203 15204 15205 15206 15207 15208 15209 15210 15211 15212 15213 15214 15215 15216 15217 15218 15219 15220 15221 15222 15223 15224 15225 15226 15227 15228 15229 15230 15231 15232 15233 15234 15235 15236 15237 15238 15239 15240 15241 15242 15243 15244 15245 15246 15247 15248 15249 15250 15251 15252 15253 15254 15255 
15256 15257 15258 15259 15260 15261 15262 15263 15264 15265 15266 15267 15268 15269 15270 15271 15272 15273 15274 15275 15276 15277 15278 15279 15280 15281 15282 15283 15284 15285 15286 15287 15288 15289 15290 15291 15292 15293 15294 15295 15296 15297 15298 15299 15300 15301 15302 15303 15304 15305 15306 15307 15308 15309 15310 15311 15312 15313 15314 15315 15316 15317 15318 15319 15320 15321 15322 15323 15324 15325 15326 15327 15328 15329 15330 15331 15332 15333 15334 15335 15336 15337 15338 15339 15340 15341 15342 15343 15344 15345 15346 15347 15348 15349 15350 15351 15352 15353 15354 15355 15356 15357 15358 15359 15360 15361 15362 15363 15364 15365 15366 15367 15368 15369 15370 15371 15372 15373 15374 15375 15376 15377 15378 15379 15380 15381 15382 15383 15384 15385 15386 15387 15388 15389 15390 15391 15392 15393 15394 15395 15396 15397 15398 15399 15400 15401 15402 15403 15404 15405 15406 15407 15408 15409 15410 15411 15412 15413 15414 15415 15416 15417 15418 15419 15420 15421 15422 15423 15424 15425 15426 15427 15428 15429 15430 15431 15432 15433 15434 15435 15436 15437 15438 15439 15440 15441 15442 15443 15444 15445 15446 15447 15448 15449 15450 15451 15452 15453 15454 15455 15456 15457 15458 15459 15460 15461 15462 15463 15464 15465 15466 15467 15468 15469 15470 15471 15472 15473 15474 15475 15476 15477 15478 15479 15480 15481 15482 15483 15484 15485 15486 15487 15488 15489 15490 15491 15492 15493 15494 15495 15496 15497 15498 15499 15500 15501 15502 15503 15504 15505 15506 15507 15508 15509 15510 15511 15512 15513 15514 15515 15516 15517 15518 15519 15520 15521 15522 15523 15524 15525 15526 15527 15528 15529 15530 15531 15532 15533 15534 15535 15536 15537 15538 15539 15540 15541 15542 15543 15544 15545 15546 15547 15548 15549 15550 15551 15552 15553 15554 15555 15556 15557 15558 15559 15560 15561 15562 15563 15564 15565 15566 15567 15568 15569 15570 15571 15572 15573 15574 15575 15576 15577 15578 15579 15580 15581 15582 15583 15584 15585 15586 15587 15588 15589 15590 15591 15592 15593 15594 15595 15596 15597 15598 15599 15600 15601 15602 15603 15604 15605 15606 15607 15608 15609 15610 15611 15612 15613 15614 15615 15616 15617 15618 15619 15620 15621 15622 15623 15624 15625 15626 15627 15628 15629 15630 15631 15632 15633 15634 15635 15636 15637 15638 15639 15640 15641 15642 15643 15644 15645 15646 15647 15648 15649 15650 15651 15652 15653 15654 15655 15656 15657 15658 15659 15660 15661 15662 15663 15664 15665 15666 15667 15668 15669 15670 15671 15672 15673 15674 15675 15676 15677 15678 15679 15680 15681 15682 15683 15684 15685 15686 15687 15688 15689 15690 15691 15692 15693 15694 15695 15696 15697 15698 15699 15700 15701 15702 15703 15704 15705 15706 15707 15708 15709 15710 15711 15712 15713 15714 15715 15716 15717 15718 15719 15720 15721 15722 15723 15724 15725 15726 15727 15728 15729 15730 15731 15732 15733 15734 15735 15736 15737 15738 15739 15740 15741 15742 15743 15744 15745 15746 15747 15748 15749 15750 15751 15752 15753 15754 15755 15756 15757 15758 15759 15760 15761 15762 15763 15764 15765 15766 15767 15768 15769 15770 15771 15772 15773 15774 15775 15776 15777 15778 15779 15780 15781 15782 15783 15784 15785 15786 15787 15788 15789 15790 15791 15792 15793 15794 15795 15796 15797 15798 15799 15800 15801 15802 15803 15804 15805 15806 15807 15808 15809 15810 15811 15812 15813 15814 15815 15816 15817 15818 15819 15820 15821 15822 15823 15824 15825 15826 15827 15828 15829 15830 15831 15832 15833 15834 15835 15836 15837 15838 15839 15840 15841 15842 15843 15844 15845 15846 15847 
15848 15849 15850 15851 15852 15853 15854 15855 15856 15857 15858 15859 15860 15861 15862 15863 15864 15865 15866 15867 15868 15869 15870 15871 15872 15873 15874 15875 15876 15877 15878 15879 15880 15881 15882 15883 15884 15885 15886 15887 15888 15889 15890 15891 15892 15893 15894 15895 15896 15897 15898 15899 15900 15901 15902 15903 15904 15905 15906 15907 15908 15909 15910 15911 15912 15913 15914 15915 15916 15917 15918 15919 15920 15921 15922 15923 15924 15925 15926 15927 15928 15929 15930 15931 15932 15933 15934 15935 15936 15937 15938 15939 15940 15941 15942 15943 15944 15945 15946 15947 15948 15949 15950 15951 15952 15953 15954 15955 15956 15957 15958 15959 15960 15961 15962 15963 15964 15965 15966 15967 15968 15969 15970 15971 15972 15973 15974 15975 15976 15977 15978 15979 15980 15981 15982 15983 15984 15985 15986 15987 15988 15989 15990 15991 15992 15993 15994 15995 15996 15997 15998 15999 16000 16001 16002 16003 16004 16005 16006 16007 16008 16009 16010 16011 16012 16013 16014 16015 16016 16017 16018 16019 16020 16021 16022 16023 16024 16025 16026 16027 16028 16029 16030 16031 16032 16033 16034 16035 16036 16037 16038 16039 16040 16041 16042 16043 16044 16045 16046 16047 16048 16049 16050 16051 16052 16053 16054 16055 16056 16057 16058 16059 16060 16061 16062 16063 16064 16065 16066 16067 16068 16069 16070 16071 16072 16073 16074 16075 16076 16077 16078 16079 16080 16081 16082 16083 16084 16085 16086 16087 16088 16089 16090 16091 16092 16093 16094 16095 16096 16097 16098 16099 16100 16101 16102 16103 16104 16105 16106 16107 16108 16109 16110 16111 16112 16113 16114 16115 16116 16117 16118 16119 16120 16121 16122 16123 16124 16125 16126 16127 16128 16129 16130 16131 16132 16133 16134 16135 16136 16137 16138 16139 16140 16141 16142 16143 16144 16145 16146 16147 16148 16149 16150 16151 16152 16153 16154 16155 16156 16157 16158 16159 16160 16161 16162 16163 16164 16165 16166 16167 16168 16169 16170 16171 16172 16173 16174 16175 16176 16177 16178 16179 16180 16181 16182 16183 16184 16185 16186 16187 16188 16189 16190 16191 16192 16193 16194 16195 16196 16197 16198 16199 16200 16201 16202 16203 16204 16205 16206 16207 16208 16209 16210 16211 16212 16213 16214 16215 16216 16217 16218 16219 16220 16221 16222 16223 16224 16225 16226 16227 16228 16229 16230 16231 16232 16233 16234 16235 16236 16237 16238 16239 16240 16241 16242 16243 16244 16245 16246 16247 16248 16249 16250 16251 16252 16253 16254 16255 16256 16257 16258 16259 16260 16261 16262 16263 16264 16265 16266 16267 16268 16269 16270 16271 16272 16273 16274 16275 16276 16277 16278 16279 16280 16281 16282 16283 16284 16285 16286 16287 16288 16289 16290 16291 16292 16293 16294 16295 16296 16297 16298 16299 16300 16301 16302 16303 16304 16305 16306 16307 16308 16309 16310 16311 16312 16313 16314 16315 16316 16317 16318 16319 16320 16321 16322 16323 16324 16325 16326 16327 16328 16329 16330 16331 16332 16333 16334 16335 16336 16337 16338 16339 16340 16341 16342 16343 16344 16345 16346 16347 16348 16349 16350 16351 16352 16353 16354 16355 16356 16357 16358 16359 16360 16361 16362 16363 16364 16365 16366 16367 16368 16369 16370 16371 16372 16373 16374 16375 16376 16377 16378 16379 16380 16381 16382 16383 16384 16385 16386 16387 16388 16389 16390 16391 16392 16393 16394 16395 16396 16397 16398 16399 16400 16401 16402 16403 16404 16405 16406 16407 16408 16409 16410 16411 16412 16413 16414 16415 16416 16417 16418 16419 16420 16421 16422 16423 16424 16425 16426 16427 16428 16429 16430 16431 16432 16433 16434 16435 16436 16437 16438 16439 
16440 16441 16442 16443 16444 16445 16446 16447 16448 16449 16450 16451 16452 16453 16454 16455 16456 16457 16458 16459 16460 16461 16462 16463 16464 16465 16466 16467 16468 16469 16470 16471 16472 16473 16474 16475 16476 16477 16478 16479 16480 16481 16482 16483 16484 16485 16486 16487 16488 16489 16490 16491 16492 16493 16494 16495 16496 16497 16498 16499 16500 16501 16502 16503 16504 16505 16506 16507 16508 16509 16510 16511 16512 16513 16514 16515 16516 16517 16518 16519 16520 16521 16522 16523 16524 16525 16526 16527 16528 16529 16530 16531 16532 16533 16534 16535 16536 16537 16538 16539 16540 16541 16542 16543 16544 16545 16546 16547 16548 16549 16550 16551 16552 16553 16554 16555 16556 16557 16558 16559 16560 16561 16562 16563 16564 16565 16566 16567 16568 16569 16570 16571 16572 16573 16574 16575 16576 16577 16578 16579 16580 16581 16582 16583 16584 16585 16586 16587 16588 16589 16590 16591 16592 16593 16594 16595 16596 16597 16598 16599 16600 16601 16602 16603 16604 16605 16606 16607 16608 16609 16610 16611 16612 16613 16614 16615 16616 16617 16618 16619 16620 16621 16622 16623 16624 16625 16626 16627 16628 16629 16630 16631 16632 16633 16634 16635 16636 16637 16638 16639 16640 16641 16642 16643 16644 16645 16646 16647 16648 16649 16650 16651 16652 16653 16654 16655 16656 16657 16658 16659 16660 16661 16662 16663 16664 16665 16666 16667 16668 16669 16670 16671 16672 16673 16674 16675 16676 16677 16678 16679 16680 16681 16682 16683 16684 16685 16686 16687 16688 16689 16690 16691 16692 16693 16694 16695 16696 16697 16698 16699 16700 16701 16702 16703 16704 16705 16706 16707 16708 16709 16710 16711 16712 16713 16714 16715 16716 16717 16718 16719 16720 16721 16722 16723 16724 16725 16726 16727 16728 16729 16730 16731 16732 16733 16734 16735 16736 16737 16738 16739 16740 16741 16742 16743 16744 16745 16746 16747 16748 16749 16750 16751 16752 16753 16754 16755 16756 16757 16758 16759 16760 16761 16762 16763 16764 16765 16766 16767 16768 16769 16770 16771 16772 16773 16774 16775 16776 16777 16778 16779 16780 16781 16782 16783 16784 16785 16786 16787 16788 16789 16790 16791 16792 16793 16794 16795 16796 16797 16798 16799 16800 16801 16802 16803 16804 16805 16806 16807 16808 16809 16810 16811 16812 16813 16814 16815 16816 16817 16818 16819 16820 16821 16822 16823 16824 16825 16826 16827 16828 16829 16830 16831 16832 16833 16834 16835 16836 16837 16838 16839 16840 16841 16842 16843 16844 16845 16846 16847 16848 16849 16850 16851 16852 16853 16854 16855 16856 16857 16858 16859 16860 16861 16862 16863 16864 16865 16866 16867 16868 16869 16870 16871 16872 16873 16874 16875 16876 16877 16878 16879 16880 16881 16882 16883 16884 16885 16886 16887 16888 16889 16890 16891 16892 16893 16894 16895 16896 16897 16898 16899 16900 16901 16902 16903 16904 16905 16906 16907 16908 16909 16910 16911 16912 16913 16914 16915 16916 16917 16918 16919 16920 16921 16922 16923 16924 16925 16926 16927 16928 16929 16930 16931 16932 16933 16934 16935 16936 16937 16938 16939 16940 16941 16942 16943 16944 16945 16946 16947 16948 16949 16950 16951 16952 16953 16954 16955 16956 16957 16958 16959 16960 16961 16962 16963 16964 16965 16966 16967 16968 16969 16970 16971 16972 16973 16974 16975 16976 16977 16978 16979 16980 16981 16982 16983 16984 16985 16986 16987 16988 16989 16990 16991 16992 16993 16994 16995 16996 16997 16998 16999 17000 17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 17011 17012 17013 17014 17015 17016 17017 17018 17019 17020 17021 17022 17023 17024 17025 17026 17027 17028 17029 17030 17031 
17032 17033 17034 17035 17036 17037 17038 17039 17040 17041 17042 17043 17044 17045 17046 17047 17048 17049 17050 17051 17052 17053 17054 17055 17056 17057 17058 17059 17060 17061 17062 17063 17064 17065 17066 17067 17068 17069 17070 17071 17072 17073 17074 17075 17076 17077 17078 17079 17080 17081 17082 17083 17084 17085 17086 17087 17088 17089 17090 17091 17092 17093 17094 17095 17096 17097 17098 17099 17100 17101 17102 17103 17104 17105 17106 17107 17108 17109 17110 17111 17112 17113 17114 17115 17116 17117 17118 17119 17120 17121 17122 17123 17124 17125 17126 17127 17128 17129 17130 17131 17132 17133 17134 17135 17136 17137 17138 17139 17140 17141 17142 17143 17144 17145 17146 17147 17148 17149 17150 17151 17152 17153 17154 17155 17156 17157 17158 17159 17160 17161 17162 17163 17164 17165 17166 17167 17168 17169 17170 17171 17172 17173 17174 17175 17176 17177 17178 17179 17180 17181 17182 17183 17184 17185 17186 17187 17188 17189 17190 17191 17192 17193 17194 17195 17196 17197 17198 17199 17200 17201 17202 17203 17204 17205 17206 17207 17208 17209 17210 17211 17212 17213 17214 17215 17216 17217 17218 17219 17220 17221 17222 17223 17224 17225 17226 17227 17228 17229 17230 17231 17232 17233 17234 17235 17236 17237 17238 17239 17240 17241 17242 17243 17244 17245 17246 17247 17248 17249 17250 17251 17252 17253 17254 17255 17256 17257 17258 17259 17260 17261 17262 17263 17264 17265 17266 17267 17268 17269 17270 17271 17272 17273 17274 17275 17276 17277 17278 17279 17280 17281 17282 17283 17284 17285 17286 17287 17288 17289 17290 17291 17292 17293 17294 17295 17296 17297 17298 17299 17300 17301 17302 17303 17304 17305 17306 17307 17308 17309 17310 17311 17312 17313 17314 17315 17316 17317 17318 17319 17320 17321 17322 17323 17324 17325 17326 17327 17328 17329 17330 17331 17332 17333 17334 17335 17336 17337 17338 17339 17340 17341 17342 17343 17344 17345 17346 17347 17348 17349 17350 17351 17352 17353 17354 17355 17356 17357 17358 17359 17360 17361 17362 17363 17364 17365 17366 17367 17368 17369 17370 17371 17372 17373 17374 17375 17376 17377 17378 17379 17380 17381 17382 17383 17384 17385 17386 17387 17388 17389 17390 17391 17392 17393 17394 17395 17396 17397 17398 17399 17400 17401 17402 17403 17404 17405 17406 17407 17408 17409 17410 17411 17412 17413 17414 17415 17416 17417 17418 17419 17420 17421 17422 17423 17424 17425 17426 17427 17428 17429 17430 17431 17432 17433 17434 17435 17436 17437 17438 17439 17440 17441 17442 17443 17444 17445 17446 17447 17448 17449 17450 17451 17452 17453 17454 17455 17456 17457 17458 17459 17460 17461 17462 17463 17464 17465 17466 17467 17468 17469 17470 17471 17472 17473 17474 17475 17476 17477 17478 17479 17480 17481 17482 17483 17484 17485 17486 17487 17488 17489 17490 17491 17492 17493 17494 17495 17496 17497 17498 17499 17500 17501 17502 17503 17504 17505 17506 17507 17508 17509 17510 17511 17512 17513 17514 17515 17516 17517 17518 17519 17520 17521 17522 17523 17524 17525 17526 17527 17528 17529 17530 17531 17532 17533 17534 17535 17536 17537 17538 17539 17540 17541 17542 17543 17544 17545 17546 17547 17548 17549 17550 17551 17552 17553 17554 17555 17556 17557 17558 17559 17560 17561 17562 17563 17564 17565 17566 17567 17568 17569 17570 17571 17572 17573 17574 17575 17576 17577 17578 17579 17580 17581 17582 17583 17584 17585 17586 17587 17588 17589 17590 17591 17592 17593 17594 17595 17596 17597 17598 17599 17600 17601 17602 17603 17604 17605 17606 17607 17608 17609 17610 17611 17612 17613 17614 17615 17616 17617 17618 17619 17620 17621 17622 17623 
17624 17625 17626 17627 17628 17629 17630 17631 17632 17633 17634 17635 17636 17637 17638 17639 17640 17641 17642 17643 17644 17645 17646 17647 17648 17649 17650 17651 17652 17653 17654 17655 17656 17657 17658 17659 17660 17661 17662 17663 17664 17665 17666 17667 17668 17669 17670 17671 17672 17673 17674 17675 17676 17677 17678 17679 17680 17681 17682 17683 17684 17685 17686 17687 17688 17689 17690 17691 17692 17693 17694 17695 17696 17697 17698 17699 17700 17701 17702 17703 17704 17705 17706 17707 17708 17709 17710 17711 17712 17713 17714 17715 17716 17717 17718 17719 17720 17721 17722 17723 17724 17725 17726 17727 17728 17729 17730 17731 17732 17733 17734 17735 17736 17737 17738 17739 17740 17741 17742 17743 17744 17745 17746 17747 17748 17749 17750 17751 17752 17753 17754 17755 17756 17757 17758 17759 17760 17761 17762 17763 17764 17765 17766 17767 17768 17769 17770 17771 17772 17773 17774 17775 17776 17777 17778 17779 17780 17781 17782 17783 17784 17785 17786 17787 17788 17789 17790 17791 17792 17793 17794 17795 17796 17797 17798 17799 17800 17801 17802 17803 17804 17805 17806 17807 17808 17809 17810 17811 17812 17813 17814 17815 17816 17817 17818 17819 17820 17821 17822 17823 17824 17825 17826 17827 17828 17829 17830 17831 17832 17833 17834 17835 17836 17837 17838 17839 17840 17841 17842 17843 17844 17845 17846 17847 17848 17849 17850 17851 17852 17853 17854 17855 17856 17857 17858 17859 17860 17861 17862 17863 17864 17865 17866 17867 17868 17869 17870 17871 17872 17873 17874 17875 17876 17877 17878 17879 17880 17881 17882 17883 17884 17885 17886 17887 17888 17889 17890 17891 17892 17893 17894 17895 17896 17897 17898 17899 17900 17901 17902 17903 17904 17905 17906 17907 17908 17909 17910 17911 17912 17913 17914 17915 17916 17917 17918 17919 17920 17921 17922 17923 17924 17925 17926 17927 17928 17929 17930 17931 17932 17933 17934 17935 17936 17937 17938 17939 17940 17941 17942 17943 17944 17945 17946 17947 17948 17949 17950 17951 17952 17953 17954 17955 17956 17957 17958 17959 17960 17961 17962 17963 17964 17965 17966 17967 17968 17969 17970 17971 17972 17973 17974 17975 17976 17977 17978 17979 17980 17981 17982 17983 17984 17985 17986 17987 17988 17989 17990 17991 17992 17993 17994 17995 17996 17997 17998 17999 18000 18001 18002 18003 18004 18005 18006 18007 18008 18009 18010 18011 18012 18013 18014 18015 18016 18017 18018 18019 18020 18021 18022 18023 18024 18025 18026 18027 18028 18029 18030 18031 18032 18033 18034 18035 18036 18037 18038 18039 18040 18041 18042 18043 18044 18045 18046 18047 18048 18049 18050 18051 18052 18053 18054 18055 18056 18057 18058 18059 18060 18061 18062 18063 18064 18065 18066 18067 18068 18069 18070 18071 18072 18073 18074 18075 18076 18077 18078 18079 18080 18081 18082 18083 18084 18085 18086 18087 18088 18089 18090 18091 18092 18093 18094 18095 18096 18097 18098 18099 18100 18101 18102 18103 18104 18105 18106 18107 18108 18109 18110 18111 18112 18113 18114 18115 18116 18117 18118 18119 18120 18121 18122 18123 18124 18125 18126 18127 18128 18129 18130 18131 18132 18133 18134 18135 18136 18137 18138 18139 18140 18141 18142 18143 18144 18145 18146 18147 18148 18149 18150 18151 18152 18153 18154 18155 18156 18157 18158 18159 18160 18161 18162 18163 18164 18165 18166 18167 18168 18169 18170 18171 18172 18173 18174 18175 18176 18177 18178 18179 18180 18181 18182 18183 18184 18185 18186 18187 18188 18189 18190 18191 18192 18193 18194 18195 18196 18197 18198 18199 18200 18201 18202 18203 18204 18205 18206 18207 18208 18209 18210 18211 18212 18213 18214 18215 
18216 18217 18218 18219 18220 18221 18222 18223 18224 18225 18226 18227 18228 18229 18230 18231 18232 18233 18234 18235 18236 18237 18238 18239 18240 18241 18242 18243 18244 18245 18246 18247 18248 18249 18250 18251 18252 18253 18254 18255 18256 18257 18258 18259 18260 18261 18262 18263 18264 18265 18266 18267 18268 18269 18270 18271 18272 18273 18274 18275 18276 18277 18278 18279 18280 18281 18282 18283 18284 18285 18286 18287 18288 18289 18290 18291 18292 18293 18294 18295 18296 18297 18298 18299 18300 18301 18302 18303 18304 18305 18306 18307 18308 18309 18310 18311 18312 18313 18314 18315 18316 18317 18318 18319 18320 18321 18322 18323 18324 18325 18326 18327 18328 18329 18330 18331 18332 18333 18334 18335 18336 18337 18338 18339 18340 18341 18342 18343 18344 18345 18346 18347 18348 18349 18350 18351 18352 18353 18354 18355 18356 18357 18358 18359 18360 18361 18362 18363 18364 18365 18366 18367 18368 18369 18370 18371 18372 18373 18374 18375 18376 18377 18378 18379 18380 18381 18382 18383 18384 18385 18386 18387 18388 18389 18390 18391 18392 18393 18394 18395 18396 18397 18398 18399 18400 18401 18402 18403 18404 18405 18406 18407 18408 18409 18410 18411 18412 18413 18414 18415 18416 18417 18418 18419 18420 18421 18422 18423 18424 18425 18426 18427 18428 18429 18430 18431 18432 18433 18434 18435 18436 18437 18438 18439 18440 18441 18442 18443 18444 18445 18446 18447 18448 18449 18450 18451 18452 18453 18454 18455 18456 18457 18458 18459 18460 18461 18462 18463 18464 18465 18466 18467 18468 18469 18470 18471 18472 18473 18474 18475 18476 18477 18478 18479 18480 18481 18482 18483 18484 18485 18486 18487 18488 18489 18490 18491 18492 18493 18494 18495 18496 18497 18498 18499 18500 18501 18502 18503 18504 18505 18506 18507 18508 18509 18510 18511 18512 18513 18514 18515 18516 18517 18518 18519 18520 18521 18522 18523 18524 18525 18526 18527 18528 18529 18530 18531 18532 18533 18534 18535 18536 18537 18538 18539 18540 18541 18542 18543 18544 18545 18546 18547 18548 18549 18550 18551 18552 18553 18554 18555 18556 18557 18558 18559 18560 18561 18562 18563 18564 18565 18566 18567 18568 18569 18570 18571 18572 18573 18574 18575 18576 18577 18578 18579 18580 18581 18582 18583 18584 18585 18586 18587 18588 18589 18590 18591 18592 18593 18594 18595 18596 18597 18598 18599 18600 18601 18602 18603 18604 18605 18606 18607 18608 18609 18610 18611 18612 18613 18614 18615 18616 18617 18618 18619 18620 18621 18622 18623 18624 18625 18626 18627 18628 18629 18630 18631 18632 18633 18634 18635 18636 18637 18638 18639 18640 18641 18642 18643 18644 18645 18646 18647 18648 18649 18650 18651 18652 18653 18654 18655 18656 18657 18658 18659 18660 18661 18662 18663 18664 18665 18666 18667 18668 18669 18670 18671 18672 18673 18674 18675 18676 18677 18678 18679 18680 18681 18682 18683 18684 18685 18686 18687 18688 18689 18690 18691 18692 18693 18694 18695 18696 18697 18698 18699 18700 18701 18702 18703 18704 18705 18706 18707 18708 18709 18710 18711 18712 18713 18714 18715 18716 18717 18718 18719 18720 18721 18722 18723 18724 18725 18726 18727 18728 18729 18730 18731 18732 18733 18734 18735 18736 18737 18738 18739 18740 18741 18742 18743 18744 18745 18746 18747 18748 18749 18750 18751 18752 18753 18754 18755 18756 18757 18758 18759 18760 18761 18762 18763 18764 18765 18766 18767 18768 18769 18770 18771 18772 18773 18774 18775 18776 18777 18778 18779 18780 18781 18782 18783 18784 18785 18786 18787 18788 18789 18790 18791 18792 18793 18794 18795 18796 18797 18798 18799 18800 18801 18802 18803 18804 18805 18806 18807 
18808 18809 18810 18811 18812 18813 18814 18815 18816 18817 18818 18819 18820 18821 18822 18823 18824 18825 18826 18827 18828 18829 18830 18831 18832 18833 18834 18835 18836 18837 18838 18839 18840 18841 18842 18843 18844 18845 18846 18847 18848 18849 18850 18851 18852 18853 18854 18855 18856 18857 18858 18859 18860 18861 18862 18863 18864 18865 18866 18867 18868 18869 18870 18871 18872 18873 18874 18875 18876 18877 18878 18879 18880 18881 18882 18883 18884 18885 18886 18887 18888 18889 18890 18891 18892 18893 18894 18895 18896 18897 18898 18899 18900 18901 18902 18903 18904 18905 18906 18907 18908 18909 18910 18911 18912 18913 18914 18915 18916 18917 18918 18919 18920 18921 18922 18923 18924 18925 18926 18927 18928 18929 18930 18931 18932 18933 18934 18935 18936 18937 18938 18939 18940 18941 18942 18943 18944 18945 18946 18947 18948 18949 18950 18951 18952 18953 18954 18955 18956 18957 18958 18959 18960 18961 18962 18963 18964 18965 18966 18967 18968 18969 18970 18971 18972 18973 18974 18975 18976 18977 18978 18979 18980 18981 18982 18983 18984 18985 18986 18987 18988 18989 18990 18991 18992 18993 18994 18995 18996 18997 18998 18999 19000 19001 19002 19003 19004 19005 19006 19007 19008 19009 19010 19011 19012 19013 19014 19015 19016 19017 19018 19019 19020 19021 19022 19023 19024 19025 19026 19027 19028 19029 19030 19031 19032 19033 19034 19035 19036 19037 19038 19039 19040 19041 19042 19043 19044 19045 19046 19047 19048 19049 19050 19051 19052 19053 19054 19055 19056 19057 19058 19059 19060 19061 19062 19063 19064 19065 19066 19067 19068 19069 19070 19071 19072 19073 19074 19075 19076 19077 19078 19079 19080 19081 19082 19083 19084 19085 19086 19087 19088 19089 19090 19091 19092 19093 19094 19095 19096 19097 19098 19099 19100 19101 19102 19103 19104 19105 19106 19107 19108 19109 19110 19111 19112 19113 19114 19115 19116 19117 19118 19119 19120 19121 19122 19123 19124 19125 19126 19127 19128 19129 19130 19131 19132 19133 19134 19135 19136 19137 19138 19139 19140 19141 19142 19143 19144 19145 19146 19147 19148 19149 19150 19151 19152 19153 19154 19155 19156 19157 19158 19159 19160 19161 19162 19163 19164 19165 19166 19167 19168 19169 19170 19171 19172 19173 19174 19175 19176 19177 19178 19179 19180 19181 19182 19183 19184 19185 19186 19187 19188 19189 19190 19191 19192 19193 19194 19195 19196 19197 19198 19199 19200 19201 19202 19203 19204 19205 19206 19207 19208 19209 19210 19211 19212 19213 19214 19215 19216 19217 19218 19219 19220 19221 19222 19223 19224 19225 19226 19227 19228 19229 19230 19231 19232 19233 19234 19235 19236 19237 19238 19239 19240 19241 19242 19243 19244 19245 19246 19247 19248 19249 19250 19251 19252 19253 19254 19255 19256 19257 19258 19259 19260 19261 19262 19263 19264 19265 19266 19267 19268 19269 19270 19271 19272 19273 19274 19275 19276 19277 19278 19279 19280 19281 19282 19283 19284 19285 19286 19287 19288 19289 19290 19291 19292 19293 19294 19295 19296 19297 19298 19299 19300 19301 19302 19303 19304 19305 19306 19307 19308 19309 19310 19311 19312 19313 19314 19315 19316 19317 19318 19319 19320 19321 19322 19323 19324 19325 19326 19327 19328 19329 19330 19331 19332 19333 19334 19335 19336 19337 19338 19339 19340 19341 19342 19343 19344 19345 19346 19347 19348 19349 19350 19351 19352 19353 19354 19355 19356 19357 19358 19359 19360 19361 19362 19363 19364 19365 19366 19367 19368 19369 19370 19371 19372 19373 19374 19375 19376 19377 19378 19379 19380 19381 19382 19383 19384 19385 19386 19387 19388 19389 19390 19391 19392 19393 19394 19395 19396 19397 19398 19399 
19400 19401 19402 19403 19404 19405 19406 19407 19408 19409 19410 19411 19412 19413 19414 19415 19416 19417 19418 19419 19420 19421 19422 19423 19424 19425 19426 19427 19428 19429 19430 19431 19432 19433 19434 19435 19436 19437 19438 19439 19440 19441 19442 19443 19444 19445 19446 19447 19448 19449 19450 19451 19452 19453 19454 19455 19456 19457 19458 19459 19460 19461 19462 19463 19464 19465 19466 19467 19468 19469 19470 19471 19472 19473 19474 19475 19476 19477 19478 19479 19480 19481 19482 19483 19484 19485 19486 19487 19488 19489 19490 19491 19492 19493 19494 19495 19496 19497 19498 19499 19500 19501 19502 19503 19504 19505 19506 19507 19508 19509 19510 19511 19512 19513 19514 19515 19516 19517 19518 19519 19520 19521 19522 19523 19524 19525 19526 19527 19528 19529 19530 19531 19532 19533 19534 19535 19536 19537 19538 19539 19540 19541 19542 19543 19544 19545 19546 19547 19548 19549 19550 19551 19552 19553 19554 19555 19556 19557 19558 19559 19560 19561 19562 19563 19564 19565 19566 19567 19568 19569 19570 19571 19572 19573 19574 19575 19576 19577 19578 19579 19580 19581 19582 19583 19584 19585 19586 19587 19588 19589 19590 19591 19592 19593 19594 19595 19596 19597 19598 19599 19600 19601 19602 19603 19604 19605 19606 19607 19608 19609 19610 19611 19612 19613 19614 19615 19616 19617 19618 19619 19620 19621 19622 19623 19624 19625 19626 19627 19628 19629 19630 19631 19632 19633 19634 19635 19636 19637 19638 19639 19640 19641 19642 19643 19644 19645 19646 19647 19648 19649 19650 19651 19652 19653 19654 19655 19656 19657 19658 19659 19660 19661 19662 19663 19664 19665 19666 19667 19668 19669 19670 19671 19672 19673 19674 19675 19676 19677 19678 19679 19680 19681 19682 19683 19684 19685 19686 19687 19688 19689 19690 19691 19692 19693 19694 19695 19696 19697 19698 19699 19700 19701 19702 19703 19704 19705 19706 19707 19708 19709 19710 19711 19712 19713 19714 19715 19716 19717 19718 19719 19720 19721 19722 19723 19724 19725 19726 19727 19728 19729 19730 19731 19732 19733 19734 19735 19736 19737 19738 19739 19740 19741 19742 19743 19744 19745 19746 19747 19748 19749 19750 19751 19752 19753 19754 19755 19756 19757 19758 19759 19760 19761 19762 19763 19764 19765 19766 19767 19768 19769 19770 19771 19772 19773 19774 19775 19776 19777 19778 19779 19780 19781 19782 19783 19784 19785 19786 19787 19788 19789 19790 19791 19792 19793 19794 19795 19796 19797 19798 19799 19800 19801 19802 19803 19804 19805 19806 19807 19808 19809 19810 19811 19812 19813 19814 19815 19816 19817 19818 19819 19820 19821 19822 19823 19824 19825 19826 19827 19828 19829 19830 19831 19832 19833 19834 19835 19836 19837 19838 19839 19840 19841 19842 19843 19844 19845 19846 19847 19848 19849 19850 19851 19852 19853 19854 19855 19856 19857 19858 19859 19860 19861 19862 19863 19864 19865 19866 19867 19868 19869 19870 19871 19872 19873 19874 19875 19876 19877 19878 19879 19880 19881 19882 19883 19884 19885 19886 19887 19888 19889 19890 19891 19892 19893 19894 19895 19896 19897 19898 19899 19900 19901 19902 19903 19904 19905 19906 19907 19908 19909 19910 19911 19912 19913 19914 19915 19916 19917 19918 19919 19920 19921 19922 19923 19924 19925 19926 19927 19928 19929 19930 19931 19932 19933 19934 19935 19936 19937 19938 19939 19940 19941 19942 19943 19944 19945 19946 19947 19948 19949 19950 19951 19952 19953 19954 19955 19956 19957 19958 19959 19960 19961 19962 19963 19964 19965 19966 19967 19968 19969 19970 19971 19972 19973 19974 19975 19976 19977 19978 19979 19980 19981 19982 19983 19984 19985 19986 19987 19988 19989 19990 19991 
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
 */
#include <uapi/linux/btf.h>
#include <linux/bpf-cgroup.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/bpf_verifier.h>
#include <linux/filter.h>
#include <net/netlink.h>
#include <linux/file.h>
#include <linux/vmalloc.h>
#include <linux/stringify.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
#include <linux/perf_event.h>
#include <linux/ctype.h>
#include <linux/error-injection.h>
#include <linux/bpf_lsm.h>
#include <linux/btf_ids.h>
#include <linux/poison.h>
#include <linux/module.h>
#include <linux/cpumask.h>
#include <linux/bpf_mem_alloc.h>
#include <net/xdp.h>
#include <linux/trace_events.h>
#include <linux/kallsyms.h>

#include "disasm.h"

static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
	[_id] = & _name ## _verifier_ops,
#define BPF_MAP_TYPE(_id, _ops)
#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
};

enum bpf_features {
	BPF_FEAT_RDONLY_CAST_TO_VOID = 0,
	BPF_FEAT_STREAMS = 1,
	__MAX_BPF_FEAT,
};

struct bpf_mem_alloc bpf_global_percpu_ma;
static bool bpf_global_percpu_ma_set;

/* bpf_check() is a static code analyzer that walks eBPF program
 * instruction by instruction and updates register/stack state.
 * All paths of conditional branches are analyzed until 'bpf_exit' insn.
 *
 * The first pass is depth-first-search to check that the program is a DAG.
 * It rejects the following programs:
 * - larger than BPF_MAXINSNS insns
 * - if loop is present (detected via back-edge)
 * - unreachable insns exist (shouldn't be a forest. program = one function)
 * - out of bounds or malformed jumps
 * The second pass is all possible path descent from the 1st insn.
 * Since it's analyzing all paths through the program, the length of the
 * analysis is limited to 64k insn, which may be hit even if the total number
 * of insns is less than 4K, when there are too many branches that change
 * stack/regs.
 * Number of 'branches to be analyzed' is limited to 1k.
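 *
 * As a hedged illustration (not an excerpt from this file), the simplest
 * program shape both passes accept has no back-edges and every path ends
 * in bpf_exit:
 *    BPF_MOV64_IMM(BPF_REG_0, 0),
 *    BPF_EXIT_INSN(),
 * R0 is initialized before the exit, so the return value check can also be
 * satisfied for program types that accept a zero return.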
* Number of 'branches to be analyzed' is limited to 1k * * On entry to each instruction, each register has a type, and the instruction * changes the types of the registers depending on instruction semantics. * If instruction is BPF_MOV64_REG(BPF_REG_1, BPF_REG_5), then type of R5 is * copied to R1. * * All registers are 64-bit. * R0 - return register * R1-R5 argument passing registers * R6-R9 callee saved registers * R10 - frame pointer read-only * * At the start of BPF program the register R1 contains a pointer to bpf_context * and has type PTR_TO_CTX. * * Verifier tracks arithmetic operations on pointers in case: * BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), * BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -20), * 1st insn copies R10 (which has FRAME_PTR) type into R1 * and 2nd arithmetic instruction is pattern matched to recognize * that it wants to construct a pointer to some element within stack. * So after 2nd insn, the register R1 has type PTR_TO_STACK * (and -20 constant is saved for further stack bounds checking). * Meaning that this reg is a pointer to stack plus known immediate constant. * * Most of the time the registers have SCALAR_VALUE type, which * means the register has some value, but it's not a valid pointer. * (like pointer plus pointer becomes SCALAR_VALUE type) * * When verifier sees load or store instructions the type of base register * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK, PTR_TO_SOCKET. These are * four pointer types recognized by check_mem_access() function. * * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value' * and the range of [ptr, ptr + map's value_size) is accessible. * * registers used to pass values to function calls are checked against * function argument constraints. * * ARG_PTR_TO_MAP_KEY is one of such argument constraints. * It means that the register type passed to this function must be * PTR_TO_STACK and it will be used inside the function as * 'pointer to map element key' * * For example the argument constraints for bpf_map_lookup_elem(): * .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, * .arg1_type = ARG_CONST_MAP_PTR, * .arg2_type = ARG_PTR_TO_MAP_KEY, * * ret_type says that this function returns 'pointer to map elem value or null' * function expects 1st argument to be a const pointer to 'struct bpf_map' and * 2nd argument should be a pointer to stack, which will be used inside * the helper function as a pointer to map element key. * * On the kernel side the helper function looks like: * u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) * { * struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; * void *key = (void *) (unsigned long) r2; * void *value; * * here kernel can access 'key' and 'map' pointers safely, knowing that * [key, key + map->key_size) bytes are valid and were initialized on * the stack of eBPF program. 
* } * * Corresponding eBPF program may look like: * BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), // after this insn R2 type is FRAME_PTR * BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), // after this insn R2 type is PTR_TO_STACK * BPF_LD_MAP_FD(BPF_REG_1, map_fd), // after this insn R1 type is CONST_PTR_TO_MAP * BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), * here verifier looks at prototype of map_lookup_elem() and sees: * .arg1_type == ARG_CONST_MAP_PTR and R1->type == CONST_PTR_TO_MAP, which is ok, * Now verifier knows that this map has key of R1->map_ptr->key_size bytes * * Then .arg2_type == ARG_PTR_TO_MAP_KEY and R2->type == PTR_TO_STACK, ok so far, * Now verifier checks that [R2, R2 + map's key_size) are within stack limits * and were initialized prior to this call. * If it's ok, then verifier allows this BPF_CALL insn and looks at * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function * returns either pointer to map value or NULL. * * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off' * insn, the register holding that pointer in the true branch changes state to * PTR_TO_MAP_VALUE and the same register changes state to CONST_IMM in the false * branch. See check_cond_jmp_op(). * * After the call R0 is set to return type of the function and registers R1-R5 * are set to NOT_INIT to indicate that they are no longer readable. * * The following reference types represent a potential reference to a kernel * resource which, after first being allocated, must be checked and freed by * the BPF program: * - PTR_TO_SOCKET_OR_NULL, PTR_TO_SOCKET * * When the verifier sees a helper call return a reference type, it allocates a * pointer id for the reference and stores it in the current function state. * Similar to the way that PTR_TO_MAP_VALUE_OR_NULL is converted into * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type * passes through a NULL-check conditional. For the branch wherein the state is * changed to CONST_IMM, the verifier releases the reference. * * For each helper function that allocates a reference, such as * bpf_sk_lookup_tcp(), there is a corresponding release function, such as * bpf_sk_release(). When a reference type passes into the release function, * the verifier also releases the reference. If any unchecked or unreleased * reference remains at the end of the program, the verifier rejects it. 
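 *
 * To make the reference lifecycle concrete, a simplified, hypothetical
 * instruction sequence as the verifier would track it (not taken from any
 * particular program):
 *
 *   call bpf_sk_lookup_tcp    // R0 becomes PTR_TO_SOCKET_OR_NULL, ref id N acquired
 *   if r0 == 0 goto out       // taken branch: R0 is known NULL, ref id N is dropped
 *                             // fallthrough: R0 becomes PTR_TO_SOCKET, ref id N still held
 *   r1 = r0
 *   call bpf_sk_release       // ref id N released
 * out:
 *   exit                      // any ref id still held at exit => program is rejected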
*/ /* verifier_state + insn_idx are pushed to stack when branch is encountered */ struct bpf_verifier_stack_elem { /* verifier state is 'st' * before processing instruction 'insn_idx' * and after processing instruction 'prev_insn_idx' */ struct bpf_verifier_state st; int insn_idx; int prev_insn_idx; struct bpf_verifier_stack_elem *next; /* length of verifier log at the time this state was pushed on stack */ u32 log_pos; }; #define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192 #define BPF_COMPLEXITY_LIMIT_STATES 64 #define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512 #define BPF_PRIV_STACK_MIN_SIZE 64 static int acquire_reference(struct bpf_verifier_env *env, int insn_idx); static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id); static int release_reference(struct bpf_verifier_env *env, int ref_obj_id); static void invalidate_non_owning_refs(struct bpf_verifier_env *env); static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env); static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg); static bool is_trusted_reg(const struct bpf_reg_state *reg); static inline bool in_sleepable_context(struct bpf_verifier_env *env); static const char *non_sleepable_context_description(struct bpf_verifier_env *env); static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg); static void scalar_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg); static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux, struct bpf_map *map, bool unpriv, bool poison) { unpriv |= bpf_map_ptr_unpriv(aux); aux->map_ptr_state.unpriv = unpriv; aux->map_ptr_state.poison = poison; aux->map_ptr_state.map_ptr = map; } static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state) { bool poisoned = bpf_map_key_poisoned(aux); aux->map_key_state = state | BPF_MAP_KEY_SEEN | (poisoned ? BPF_MAP_KEY_POISON : 0ULL); } struct bpf_call_arg_meta { struct bpf_map_desc map; bool raw_mode; bool pkt_access; u8 release_regno; int regno; int access_size; int mem_size; u64 msize_max_value; int ref_obj_id; int dynptr_id; int func_id; struct btf *btf; u32 btf_id; struct btf *ret_btf; u32 ret_btf_id; u32 subprogno; struct btf_field *kptr_field; s64 const_map_key; }; struct bpf_kfunc_meta { struct btf *btf; const struct btf_type *proto; const char *name; const u32 *flags; s32 id; }; struct btf *btf_vmlinux; static const char *btf_type_name(const struct btf *btf, u32 id) { return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); } static DEFINE_MUTEX(bpf_verifier_lock); static DEFINE_MUTEX(bpf_percpu_ma_lock); __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) 
{ struct bpf_verifier_env *env = private_data; va_list args; if (!bpf_verifier_log_needed(&env->log)) return; va_start(args, fmt); bpf_verifier_vlog(&env->log, fmt, args); va_end(args); } static void verbose_invalid_scalar(struct bpf_verifier_env *env, struct bpf_reg_state *reg, struct bpf_retval_range range, const char *ctx, const char *reg_name) { bool unknown = true; verbose(env, "%s the register %s has", ctx, reg_name); if (reg->smin_value > S64_MIN) { verbose(env, " smin=%lld", reg->smin_value); unknown = false; } if (reg->smax_value < S64_MAX) { verbose(env, " smax=%lld", reg->smax_value); unknown = false; } if (unknown) verbose(env, " unknown scalar value"); verbose(env, " should have been in [%d, %d]\n", range.minval, range.maxval); } static bool reg_not_null(const struct bpf_reg_state *reg) { enum bpf_reg_type type; type = reg->type; if (type_may_be_null(type)) return false; type = base_type(type); return type == PTR_TO_SOCKET || type == PTR_TO_TCP_SOCK || type == PTR_TO_MAP_VALUE || type == PTR_TO_MAP_KEY || type == PTR_TO_SOCK_COMMON || (type == PTR_TO_BTF_ID && is_trusted_reg(reg)) || (type == PTR_TO_MEM && !(reg->type & PTR_UNTRUSTED)) || type == CONST_PTR_TO_MAP; } static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg) { struct btf_record *rec = NULL; struct btf_struct_meta *meta; if (reg->type == PTR_TO_MAP_VALUE) { rec = reg->map_ptr->record; } else if (type_is_ptr_alloc_obj(reg->type)) { meta = btf_find_struct_meta(reg->btf, reg->btf_id); if (meta) rec = meta->record; } return rec; } bool bpf_subprog_is_global(const struct bpf_verifier_env *env, int subprog) { struct bpf_func_info_aux *aux = env->prog->aux->func_info_aux; return aux && aux[subprog].linkage == BTF_FUNC_GLOBAL; } static bool subprog_returns_void(struct bpf_verifier_env *env, int subprog) { const struct btf_type *type, *func, *func_proto; const struct btf *btf = env->prog->aux->btf; u32 btf_id; btf_id = env->prog->aux->func_info[subprog].type_id; func = btf_type_by_id(btf, btf_id); if (verifier_bug_if(!func, env, "btf_id %u not found", btf_id)) return false; func_proto = btf_type_by_id(btf, func->type); if (!func_proto) return false; type = btf_type_skip_modifiers(btf, func_proto->type, NULL); if (!type) return false; return btf_type_is_void(type); } static const char *subprog_name(const struct bpf_verifier_env *env, int subprog) { struct bpf_func_info *info; if (!env->prog->aux->func_info) return ""; info = &env->prog->aux->func_info[subprog]; return btf_type_name(env->prog->aux->btf, info->type_id); } void bpf_mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog) { struct bpf_subprog_info *info = subprog_info(env, subprog); info->is_cb = true; info->is_async_cb = true; info->is_exception_cb = true; } static bool subprog_is_exc_cb(struct bpf_verifier_env *env, int subprog) { return subprog_info(env, subprog)->is_exception_cb; } static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK); } static bool type_is_rdonly_mem(u32 type) { return type & MEM_RDONLY; } static bool is_acquire_function(enum bpf_func_id func_id, const struct bpf_map *map) { enum bpf_map_type map_type = map ? 
map->map_type : BPF_MAP_TYPE_UNSPEC; if (func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || func_id == BPF_FUNC_skc_lookup_tcp || func_id == BPF_FUNC_ringbuf_reserve || func_id == BPF_FUNC_kptr_xchg) return true; if (func_id == BPF_FUNC_map_lookup_elem && (map_type == BPF_MAP_TYPE_SOCKMAP || map_type == BPF_MAP_TYPE_SOCKHASH)) return true; return false; } static bool is_ptr_cast_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_tcp_sock || func_id == BPF_FUNC_sk_fullsock || func_id == BPF_FUNC_skc_to_tcp_sock || func_id == BPF_FUNC_skc_to_tcp6_sock || func_id == BPF_FUNC_skc_to_udp6_sock || func_id == BPF_FUNC_skc_to_mptcp_sock || func_id == BPF_FUNC_skc_to_tcp_timewait_sock || func_id == BPF_FUNC_skc_to_tcp_request_sock; } static bool is_dynptr_ref_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_dynptr_data; } static bool is_sync_callback_calling_kfunc(u32 btf_id); static bool is_async_callback_calling_kfunc(u32 btf_id); static bool is_callback_calling_kfunc(u32 btf_id); static bool is_bpf_throw_kfunc(struct bpf_insn *insn); static bool is_bpf_wq_set_callback_kfunc(u32 btf_id); static bool is_task_work_add_kfunc(u32 func_id); static bool is_sync_callback_calling_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_for_each_map_elem || func_id == BPF_FUNC_find_vma || func_id == BPF_FUNC_loop || func_id == BPF_FUNC_user_ringbuf_drain; } static bool is_async_callback_calling_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_timer_set_callback; } static bool is_callback_calling_function(enum bpf_func_id func_id) { return is_sync_callback_calling_function(func_id) || is_async_callback_calling_function(func_id); } bool bpf_is_sync_callback_calling_insn(struct bpf_insn *insn) { return (bpf_helper_call(insn) && is_sync_callback_calling_function(insn->imm)) || (bpf_pseudo_kfunc_call(insn) && is_sync_callback_calling_kfunc(insn->imm)); } bool bpf_is_async_callback_calling_insn(struct bpf_insn *insn) { return (bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm)) || (bpf_pseudo_kfunc_call(insn) && is_async_callback_calling_kfunc(insn->imm)); } static bool is_async_cb_sleepable(struct bpf_verifier_env *env, struct bpf_insn *insn) { /* bpf_timer callbacks are never sleepable. */ if (bpf_helper_call(insn) && insn->imm == BPF_FUNC_timer_set_callback) return false; /* bpf_wq and bpf_task_work callbacks are always sleepable. */ if (bpf_pseudo_kfunc_call(insn) && insn->off == 0 && (is_bpf_wq_set_callback_kfunc(insn->imm) || is_task_work_add_kfunc(insn->imm))) return true; verifier_bug(env, "unhandled async callback in is_async_cb_sleepable"); return false; } bool bpf_is_may_goto_insn(struct bpf_insn *insn) { return insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO; } static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id, const struct bpf_map *map) { int ref_obj_uses = 0; if (is_ptr_cast_function(func_id)) ref_obj_uses++; if (is_acquire_function(func_id, map)) ref_obj_uses++; if (is_dynptr_ref_function(func_id)) ref_obj_uses++; return ref_obj_uses > 1; } static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots) { int allocated_slots = state->allocated_stack / BPF_REG_SIZE; /* We need to check that slots between [spi - nr_slots + 1, spi] are * within [0, allocated_stack). * * Please note that the spi grows downwards. For example, a dynptr * takes the size of two stack slots; the first slot will be at * spi and the second slot will be at spi - 1. 
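 *
 * Worked example (illustrative numbers): with allocated_stack = 32 there are
 * 4 slots, spi 0..3. A dynptr at spi = 1 with nr_slots = 2 occupies slots 1
 * and 0, so the check below is 1 - 2 + 1 >= 0 && 1 < 4 and passes; the same
 * dynptr at spi = 0 would need slot -1 and is rejected.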
*/ return spi - nr_slots + 1 >= 0 && spi < allocated_slots; } static int stack_slot_obj_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, const char *obj_kind, int nr_slots) { int off, spi; if (!tnum_is_const(reg->var_off)) { verbose(env, "%s has to be at a constant offset\n", obj_kind); return -EINVAL; } off = reg->var_off.value; if (off % BPF_REG_SIZE) { verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off); return -EINVAL; } spi = bpf_get_spi(off); if (spi + 1 < nr_slots) { verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off); return -EINVAL; } if (!is_spi_bounds_valid(bpf_func(env, reg), spi, nr_slots)) return -ERANGE; return spi; } static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { return stack_slot_obj_get_spi(env, reg, "dynptr", BPF_DYNPTR_NR_SLOTS); } static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int nr_slots) { return stack_slot_obj_get_spi(env, reg, "iter", nr_slots); } static int irq_flag_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { return stack_slot_obj_get_spi(env, reg, "irq_flag", 1); } static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) { switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { case DYNPTR_TYPE_LOCAL: return BPF_DYNPTR_TYPE_LOCAL; case DYNPTR_TYPE_RINGBUF: return BPF_DYNPTR_TYPE_RINGBUF; case DYNPTR_TYPE_SKB: return BPF_DYNPTR_TYPE_SKB; case DYNPTR_TYPE_XDP: return BPF_DYNPTR_TYPE_XDP; case DYNPTR_TYPE_SKB_META: return BPF_DYNPTR_TYPE_SKB_META; case DYNPTR_TYPE_FILE: return BPF_DYNPTR_TYPE_FILE; default: return BPF_DYNPTR_TYPE_INVALID; } } static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type) { switch (type) { case BPF_DYNPTR_TYPE_LOCAL: return DYNPTR_TYPE_LOCAL; case BPF_DYNPTR_TYPE_RINGBUF: return DYNPTR_TYPE_RINGBUF; case BPF_DYNPTR_TYPE_SKB: return DYNPTR_TYPE_SKB; case BPF_DYNPTR_TYPE_XDP: return DYNPTR_TYPE_XDP; case BPF_DYNPTR_TYPE_SKB_META: return DYNPTR_TYPE_SKB_META; case BPF_DYNPTR_TYPE_FILE: return DYNPTR_TYPE_FILE; default: return 0; } } static bool dynptr_type_refcounted(enum bpf_dynptr_type type) { return type == BPF_DYNPTR_TYPE_RINGBUF || type == BPF_DYNPTR_TYPE_FILE; } static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type, bool first_slot, int dynptr_id); static void mark_dynptr_stack_regs(struct bpf_verifier_env *env, struct bpf_reg_state *sreg1, struct bpf_reg_state *sreg2, enum bpf_dynptr_type type) { int id = ++env->id_gen; __mark_dynptr_reg(sreg1, type, true, id); __mark_dynptr_reg(sreg2, type, false, id); } static void mark_dynptr_cb_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, enum bpf_dynptr_type type) { __mark_dynptr_reg(reg, type, true, ++env->id_gen); } static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi); static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, enum bpf_arg_type arg_type, int insn_idx, int clone_ref_obj_id) { struct bpf_func_state *state = bpf_func(env, reg); enum bpf_dynptr_type type; int spi, i, err; spi = dynptr_get_spi(env, reg); if (spi < 0) return spi; /* We cannot assume both spi and spi - 1 belong to the same dynptr, * hence we need to call destroy_if_dynptr_stack_slot twice for both, * to ensure that for the following example: * [d1][d1][d2][d2] * spi 3 2 1 0 * So marking spi = 2 should lead to destruction of both d1 and d2. 
In * case they do belong to same dynptr, second call won't see slot_type * as STACK_DYNPTR and will simply skip destruction. */ err = destroy_if_dynptr_stack_slot(env, state, spi); if (err) return err; err = destroy_if_dynptr_stack_slot(env, state, spi - 1); if (err) return err; for (i = 0; i < BPF_REG_SIZE; i++) { state->stack[spi].slot_type[i] = STACK_DYNPTR; state->stack[spi - 1].slot_type[i] = STACK_DYNPTR; } type = arg_to_dynptr_type(arg_type); if (type == BPF_DYNPTR_TYPE_INVALID) return -EINVAL; mark_dynptr_stack_regs(env, &state->stack[spi].spilled_ptr, &state->stack[spi - 1].spilled_ptr, type); if (dynptr_type_refcounted(type)) { /* The id is used to track proper releasing */ int id; if (clone_ref_obj_id) id = clone_ref_obj_id; else id = acquire_reference(env, insn_idx); if (id < 0) return id; state->stack[spi].spilled_ptr.ref_obj_id = id; state->stack[spi - 1].spilled_ptr.ref_obj_id = id; } return 0; } static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi) { int i; for (i = 0; i < BPF_REG_SIZE; i++) { state->stack[spi].slot_type[i] = STACK_INVALID; state->stack[spi - 1].slot_type[i] = STACK_INVALID; } bpf_mark_reg_not_init(env, &state->stack[spi].spilled_ptr); bpf_mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); } static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = bpf_func(env, reg); int spi, ref_obj_id, i; /* * This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr * is safe to do directly. */ if (reg->type == CONST_PTR_TO_DYNPTR) { verifier_bug(env, "CONST_PTR_TO_DYNPTR cannot be released"); return -EFAULT; } spi = dynptr_get_spi(env, reg); if (spi < 0) return spi; if (!dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { invalidate_dynptr(env, state, spi); return 0; } ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id; /* If the dynptr has a ref_obj_id, then we need to invalidate * two things: * * 1) Any dynptrs with a matching ref_obj_id (clones) * 2) Any slices derived from this dynptr. */ /* Invalidate any slices associated with this dynptr */ WARN_ON_ONCE(release_reference(env, ref_obj_id)); /* Invalidate any dynptr clones */ for (i = 1; i < state->allocated_stack / BPF_REG_SIZE; i++) { if (state->stack[i].spilled_ptr.ref_obj_id != ref_obj_id) continue; /* it should always be the case that if the ref obj id * matches then the stack slot also belongs to a * dynptr */ if (state->stack[i].slot_type[0] != STACK_DYNPTR) { verifier_bug(env, "misconfigured ref_obj_id"); return -EFAULT; } if (state->stack[i].spilled_ptr.dynptr.first_slot) invalidate_dynptr(env, state, i); } return 0; } static void __mark_reg_unknown(const struct bpf_verifier_env *env, struct bpf_reg_state *reg); static void mark_reg_invalid(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) { if (!env->allow_ptr_leaks) bpf_mark_reg_not_init(env, reg); else __mark_reg_unknown(env, reg); } static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi) { struct bpf_func_state *fstate; struct bpf_reg_state *dreg; int i, dynptr_id; /* We always ensure that STACK_DYNPTR is never set partially, * hence just checking for slot_type[0] is enough. This is * different for STACK_SPILL, where it may be only set for * 1 byte, so code has to use is_spilled_reg. 
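 *
 * Illustrative contrast: a dynptr occupying spi 3 and 2 has all eight
 * slot_type bytes of both slots set to STACK_DYNPTR, so slot_type[0] is a
 * reliable tag on its own; a narrow register spill may mark only some of the
 * eight bytes as STACK_SPILL, which is why spills need is_spilled_reg.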
*/ if (state->stack[spi].slot_type[0] != STACK_DYNPTR) return 0; /* Reposition spi to first slot */ if (!state->stack[spi].spilled_ptr.dynptr.first_slot) spi = spi + 1; if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { int ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id; int ref_cnt = 0; /* * A referenced dynptr can be overwritten only if there is at * least one other dynptr sharing the same ref_obj_id, * ensuring the reference can still be properly released. */ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { if (state->stack[i].slot_type[0] != STACK_DYNPTR) continue; if (!state->stack[i].spilled_ptr.dynptr.first_slot) continue; if (state->stack[i].spilled_ptr.ref_obj_id == ref_obj_id) ref_cnt++; } if (ref_cnt <= 1) { verbose(env, "cannot overwrite referenced dynptr\n"); return -EINVAL; } } mark_stack_slot_scratched(env, spi); mark_stack_slot_scratched(env, spi - 1); /* Writing partially to one dynptr stack slot destroys both. */ for (i = 0; i < BPF_REG_SIZE; i++) { state->stack[spi].slot_type[i] = STACK_INVALID; state->stack[spi - 1].slot_type[i] = STACK_INVALID; } dynptr_id = state->stack[spi].spilled_ptr.id; /* Invalidate any slices associated with this dynptr */ bpf_for_each_reg_in_vstate(env->cur_state, fstate, dreg, ({ /* Dynptr slices are only PTR_TO_MEM_OR_NULL and PTR_TO_MEM */ if (dreg->type != (PTR_TO_MEM | PTR_MAYBE_NULL) && dreg->type != PTR_TO_MEM) continue; if (dreg->dynptr_id == dynptr_id) mark_reg_invalid(env, dreg); })); /* Do not release reference state, we are destroying dynptr on stack, * not using some helper to release it. Just reset register. */ bpf_mark_reg_not_init(env, &state->stack[spi].spilled_ptr); bpf_mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); return 0; } static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { int spi; if (reg->type == CONST_PTR_TO_DYNPTR) return false; spi = dynptr_get_spi(env, reg); /* -ERANGE (i.e. spi not falling into allocated stack slots) isn't an * error because this just means the stack state hasn't been updated yet. * We will do check_mem_access to check and update stack bounds later. */ if (spi < 0 && spi != -ERANGE) return false; /* We don't need to check if the stack slots are marked by previous * dynptr initializations because we allow overwriting existing unreferenced * STACK_DYNPTR slots, see mark_stack_slots_dynptr which calls * destroy_if_dynptr_stack_slot to ensure dynptr objects at the slots we are * touching are completely destructed before we reinitialize them for a new * one. For referenced ones, destroy_if_dynptr_stack_slot returns an error early * instead of delaying it until the end where the user will get "Unreleased * reference" error. */ return true; } static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = bpf_func(env, reg); int i, spi; /* This already represents first slot of initialized bpf_dynptr. * * CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to * check_func_arg_reg_off's logic, so we don't need to check its * offset and alignment. 
*/ if (reg->type == CONST_PTR_TO_DYNPTR) return true; spi = dynptr_get_spi(env, reg); if (spi < 0) return false; if (!state->stack[spi].spilled_ptr.dynptr.first_slot) return false; for (i = 0; i < BPF_REG_SIZE; i++) { if (state->stack[spi].slot_type[i] != STACK_DYNPTR || state->stack[spi - 1].slot_type[i] != STACK_DYNPTR) return false; } return true; } static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg_state *reg, enum bpf_arg_type arg_type) { struct bpf_func_state *state = bpf_func(env, reg); enum bpf_dynptr_type dynptr_type; int spi; /* ARG_PTR_TO_DYNPTR takes any type of dynptr */ if (arg_type == ARG_PTR_TO_DYNPTR) return true; dynptr_type = arg_to_dynptr_type(arg_type); if (reg->type == CONST_PTR_TO_DYNPTR) { return reg->dynptr.type == dynptr_type; } else { spi = dynptr_get_spi(env, reg); if (spi < 0) return false; return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type; } } static void __mark_reg_known_zero(struct bpf_reg_state *reg); static bool in_rcu_cs(struct bpf_verifier_env *env); static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta); static int mark_stack_slots_iter(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, struct bpf_reg_state *reg, int insn_idx, struct btf *btf, u32 btf_id, int nr_slots) { struct bpf_func_state *state = bpf_func(env, reg); int spi, i, j, id; spi = iter_get_spi(env, reg, nr_slots); if (spi < 0) return spi; id = acquire_reference(env, insn_idx); if (id < 0) return id; for (i = 0; i < nr_slots; i++) { struct bpf_stack_state *slot = &state->stack[spi - i]; struct bpf_reg_state *st = &slot->spilled_ptr; __mark_reg_known_zero(st); st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ if (is_kfunc_rcu_protected(meta)) { if (in_rcu_cs(env)) st->type |= MEM_RCU; else st->type |= PTR_UNTRUSTED; } st->ref_obj_id = i == 0 ? id : 0; st->iter.btf = btf; st->iter.btf_id = btf_id; st->iter.state = BPF_ITER_STATE_ACTIVE; st->iter.depth = 0; for (j = 0; j < BPF_REG_SIZE; j++) slot->slot_type[j] = STACK_ITER; mark_stack_slot_scratched(env, spi - i); } return 0; } static int unmark_stack_slots_iter(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int nr_slots) { struct bpf_func_state *state = bpf_func(env, reg); int spi, i, j; spi = iter_get_spi(env, reg, nr_slots); if (spi < 0) return spi; for (i = 0; i < nr_slots; i++) { struct bpf_stack_state *slot = &state->stack[spi - i]; struct bpf_reg_state *st = &slot->spilled_ptr; if (i == 0) WARN_ON_ONCE(release_reference(env, st->ref_obj_id)); bpf_mark_reg_not_init(env, st); for (j = 0; j < BPF_REG_SIZE; j++) slot->slot_type[j] = STACK_INVALID; mark_stack_slot_scratched(env, spi - i); } return 0; } static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int nr_slots) { struct bpf_func_state *state = bpf_func(env, reg); int spi, i, j; /* For -ERANGE (i.e. spi not falling into allocated stack slots), we * will do check_mem_access to check and update stack bounds later, so * return true for that case. 
*/ spi = iter_get_spi(env, reg, nr_slots); if (spi == -ERANGE) return true; if (spi < 0) return false; for (i = 0; i < nr_slots; i++) { struct bpf_stack_state *slot = &state->stack[spi - i]; for (j = 0; j < BPF_REG_SIZE; j++) if (slot->slot_type[j] == STACK_ITER) return false; } return true; } static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg, struct btf *btf, u32 btf_id, int nr_slots) { struct bpf_func_state *state = bpf_func(env, reg); int spi, i, j; spi = iter_get_spi(env, reg, nr_slots); if (spi < 0) return -EINVAL; for (i = 0; i < nr_slots; i++) { struct bpf_stack_state *slot = &state->stack[spi - i]; struct bpf_reg_state *st = &slot->spilled_ptr; if (st->type & PTR_UNTRUSTED) return -EPROTO; /* only main (first) slot has ref_obj_id set */ if (i == 0 && !st->ref_obj_id) return -EINVAL; if (i != 0 && st->ref_obj_id) return -EINVAL; if (st->iter.btf != btf || st->iter.btf_id != btf_id) return -EINVAL; for (j = 0; j < BPF_REG_SIZE; j++) if (slot->slot_type[j] != STACK_ITER) return -EINVAL; } return 0; } static int acquire_irq_state(struct bpf_verifier_env *env, int insn_idx); static int release_irq_state(struct bpf_verifier_state *state, int id); static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, struct bpf_reg_state *reg, int insn_idx, int kfunc_class) { struct bpf_func_state *state = bpf_func(env, reg); struct bpf_stack_state *slot; struct bpf_reg_state *st; int spi, i, id; spi = irq_flag_get_spi(env, reg); if (spi < 0) return spi; id = acquire_irq_state(env, insn_idx); if (id < 0) return id; slot = &state->stack[spi]; st = &slot->spilled_ptr; __mark_reg_known_zero(st); st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ st->ref_obj_id = id; st->irq.kfunc_class = kfunc_class; for (i = 0; i < BPF_REG_SIZE; i++) slot->slot_type[i] = STACK_IRQ_FLAG; mark_stack_slot_scratched(env, spi); return 0; } static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int kfunc_class) { struct bpf_func_state *state = bpf_func(env, reg); struct bpf_stack_state *slot; struct bpf_reg_state *st; int spi, i, err; spi = irq_flag_get_spi(env, reg); if (spi < 0) return spi; slot = &state->stack[spi]; st = &slot->spilled_ptr; if (st->irq.kfunc_class != kfunc_class) { const char *flag_kfunc = st->irq.kfunc_class == IRQ_NATIVE_KFUNC ? "native" : "lock"; const char *used_kfunc = kfunc_class == IRQ_NATIVE_KFUNC ? "native" : "lock"; verbose(env, "irq flag acquired by %s kfuncs cannot be restored with %s kfuncs\n", flag_kfunc, used_kfunc); return -EINVAL; } err = release_irq_state(env->cur_state, st->ref_obj_id); WARN_ON_ONCE(err && err != -EACCES); if (err) { int insn_idx = 0; for (int i = 0; i < env->cur_state->acquired_refs; i++) { if (env->cur_state->refs[i].id == env->cur_state->active_irq_id) { insn_idx = env->cur_state->refs[i].insn_idx; break; } } verbose(env, "cannot restore irq state out of order, expected id=%d acquired at insn_idx=%d\n", env->cur_state->active_irq_id, insn_idx); return err; } bpf_mark_reg_not_init(env, st); for (i = 0; i < BPF_REG_SIZE; i++) slot->slot_type[i] = STACK_INVALID; mark_stack_slot_scratched(env, spi); return 0; } static bool is_irq_flag_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = bpf_func(env, reg); struct bpf_stack_state *slot; int spi, i; /* For -ERANGE (i.e. 
spi not falling into allocated stack slots), we * will do check_mem_access to check and update stack bounds later, so * return true for that case. */ spi = irq_flag_get_spi(env, reg); if (spi == -ERANGE) return true; if (spi < 0) return false; slot = &state->stack[spi]; for (i = 0; i < BPF_REG_SIZE; i++) if (slot->slot_type[i] == STACK_IRQ_FLAG) return false; return true; } static int is_irq_flag_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = bpf_func(env, reg); struct bpf_stack_state *slot; struct bpf_reg_state *st; int spi, i; spi = irq_flag_get_spi(env, reg); if (spi < 0) return -EINVAL; slot = &state->stack[spi]; st = &slot->spilled_ptr; if (!st->ref_obj_id) return -EINVAL; for (i = 0; i < BPF_REG_SIZE; i++) if (slot->slot_type[i] != STACK_IRQ_FLAG) return -EINVAL; return 0; } /* Check if given stack slot is "special": * - spilled register state (STACK_SPILL); * - dynptr state (STACK_DYNPTR); * - iter state (STACK_ITER). * - irq flag state (STACK_IRQ_FLAG) */ static bool is_stack_slot_special(const struct bpf_stack_state *stack) { enum bpf_stack_slot_type type = stack->slot_type[BPF_REG_SIZE - 1]; switch (type) { case STACK_SPILL: case STACK_DYNPTR: case STACK_ITER: case STACK_IRQ_FLAG: return true; case STACK_INVALID: case STACK_POISON: case STACK_MISC: case STACK_ZERO: return false; default: WARN_ONCE(1, "unknown stack slot type %d\n", type); return true; } } /* The reg state of a pointer or a bounded scalar was saved when * it was spilled to the stack. */ /* * Mark stack slot as STACK_MISC, unless it is already: * - STACK_INVALID, in which case they are equivalent. * - STACK_ZERO, in which case we preserve more precise STACK_ZERO. * - STACK_POISON, which truly forbids access to the slot. * Regardless of allow_ptr_leaks setting (i.e., privileged or unprivileged * mode), we won't promote STACK_INVALID to STACK_MISC. In privileged case it is * unnecessary as both are considered equivalent when loading data and pruning, * in case of unprivileged mode it will be incorrect to allow reads of invalid * slots. */ static void mark_stack_slot_misc(struct bpf_verifier_env *env, u8 *stype) { if (*stype == STACK_ZERO) return; if (*stype == STACK_INVALID || *stype == STACK_POISON) return; *stype = STACK_MISC; } static void scrub_spilled_slot(u8 *stype) { if (*stype != STACK_INVALID && *stype != STACK_POISON) *stype = STACK_MISC; } /* copy array src of length n * size bytes to dst. dst is reallocated if it's too * small to hold src. This is different from krealloc since we don't want to preserve * the contents of dst. * * Leaves dst untouched if src is NULL or length is zero. Returns NULL if memory could * not be allocated. */ static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t flags) { size_t alloc_bytes; void *orig = dst; size_t bytes; if (ZERO_OR_NULL_PTR(src)) goto out; if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; alloc_bytes = max(ksize(orig), kmalloc_size_roundup(bytes)); dst = krealloc(orig, alloc_bytes, flags); if (!dst) { kfree(orig); return NULL; } memcpy(dst, src, bytes); out: return dst ? dst : ZERO_SIZE_PTR; } /* resize an array from old_n items to new_n items. the array is reallocated if it's too * small to hold new_n items. new items are zeroed out if the array grows. * * Contrary to krealloc_array, does not free arr if new_n is zero. 
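 *
 * Illustrative use (hypothetical sizes): realloc_array(refs, 2, 4,
 * sizeof(*refs)) keeps entries 0-1, returns entries 2-3 zeroed, and may hand
 * back a new pointer; calling it with new_n == 0 or new_n == old_n returns
 * the array untouched.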
*/ static void *realloc_array(void *arr, size_t old_n, size_t new_n, size_t size) { size_t alloc_size; void *new_arr; if (!new_n || old_n == new_n) goto out; alloc_size = kmalloc_size_roundup(size_mul(new_n, size)); new_arr = krealloc(arr, alloc_size, GFP_KERNEL_ACCOUNT); if (!new_arr) { kfree(arr); return NULL; } arr = new_arr; if (new_n > old_n) memset(arr + old_n * size, 0, (new_n - old_n) * size); out: return arr ? arr : ZERO_SIZE_PTR; } static int copy_reference_state(struct bpf_verifier_state *dst, const struct bpf_verifier_state *src) { dst->refs = copy_array(dst->refs, src->refs, src->acquired_refs, sizeof(struct bpf_reference_state), GFP_KERNEL_ACCOUNT); if (!dst->refs) return -ENOMEM; dst->acquired_refs = src->acquired_refs; dst->active_locks = src->active_locks; dst->active_preempt_locks = src->active_preempt_locks; dst->active_rcu_locks = src->active_rcu_locks; dst->active_irq_id = src->active_irq_id; dst->active_lock_id = src->active_lock_id; dst->active_lock_ptr = src->active_lock_ptr; return 0; } static int copy_stack_state(struct bpf_func_state *dst, const struct bpf_func_state *src) { size_t n = src->allocated_stack / BPF_REG_SIZE; dst->stack = copy_array(dst->stack, src->stack, n, sizeof(struct bpf_stack_state), GFP_KERNEL_ACCOUNT); if (!dst->stack) return -ENOMEM; dst->allocated_stack = src->allocated_stack; return 0; } static int resize_reference_state(struct bpf_verifier_state *state, size_t n) { state->refs = realloc_array(state->refs, state->acquired_refs, n, sizeof(struct bpf_reference_state)); if (!state->refs) return -ENOMEM; state->acquired_refs = n; return 0; } /* Possibly update state->allocated_stack to be at least size bytes. Also * possibly update the function's high-water mark in its bpf_subprog_info. */ static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state *state, int size) { size_t old_n = state->allocated_stack / BPF_REG_SIZE, n; /* The stack size is always a multiple of BPF_REG_SIZE. */ size = round_up(size, BPF_REG_SIZE); n = size / BPF_REG_SIZE; if (old_n >= n) return 0; state->stack = realloc_array(state->stack, old_n, n, sizeof(struct bpf_stack_state)); if (!state->stack) return -ENOMEM; state->allocated_stack = size; /* update known max for given subprogram */ if (env->subprog_info[state->subprogno].stack_depth < size) env->subprog_info[state->subprogno].stack_depth = size; return 0; } /* Acquire a pointer id from the env and update the state->refs to include * this new pointer reference. * On success, returns a valid pointer id to associate with the register * On failure, returns a negative errno. 
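 *
 * Illustrative bookkeeping (hypothetical ids): after two acquires the refs
 * array may hold {id=3, insn_idx=10}, {id=4, insn_idx=25}; releasing id 3
 * later shifts the remaining entry down rather than swapping, so the array
 * stays ordered by acquisition, which the IRQ-restore checks depend on.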
*/ static struct bpf_reference_state *acquire_reference_state(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state *state = env->cur_state; int new_ofs = state->acquired_refs; int err; err = resize_reference_state(state, state->acquired_refs + 1); if (err) return NULL; state->refs[new_ofs].insn_idx = insn_idx; return &state->refs[new_ofs]; } static int acquire_reference(struct bpf_verifier_env *env, int insn_idx) { struct bpf_reference_state *s; s = acquire_reference_state(env, insn_idx); if (!s) return -ENOMEM; s->type = REF_TYPE_PTR; s->id = ++env->id_gen; return s->id; } static int acquire_lock_state(struct bpf_verifier_env *env, int insn_idx, enum ref_state_type type, int id, void *ptr) { struct bpf_verifier_state *state = env->cur_state; struct bpf_reference_state *s; s = acquire_reference_state(env, insn_idx); if (!s) return -ENOMEM; s->type = type; s->id = id; s->ptr = ptr; state->active_locks++; state->active_lock_id = id; state->active_lock_ptr = ptr; return 0; } static int acquire_irq_state(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state *state = env->cur_state; struct bpf_reference_state *s; s = acquire_reference_state(env, insn_idx); if (!s) return -ENOMEM; s->type = REF_TYPE_IRQ; s->id = ++env->id_gen; state->active_irq_id = s->id; return s->id; } static void release_reference_state(struct bpf_verifier_state *state, int idx) { int last_idx; size_t rem; /* IRQ state requires the relative ordering of elements remaining the * same, since it relies on the refs array to behave as a stack, so that * it can detect out-of-order IRQ restore. Hence use memmove to shift * the array instead of swapping the final element into the deleted idx. */ last_idx = state->acquired_refs - 1; rem = state->acquired_refs - idx - 1; if (last_idx && idx != last_idx) memmove(&state->refs[idx], &state->refs[idx + 1], sizeof(*state->refs) * rem); memset(&state->refs[last_idx], 0, sizeof(*state->refs)); state->acquired_refs--; return; } static bool find_reference_state(struct bpf_verifier_state *state, int ptr_id) { int i; for (i = 0; i < state->acquired_refs; i++) if (state->refs[i].id == ptr_id) return true; return false; } static int release_lock_state(struct bpf_verifier_state *state, int type, int id, void *ptr) { void *prev_ptr = NULL; u32 prev_id = 0; int i; for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].type == type && state->refs[i].id == id && state->refs[i].ptr == ptr) { release_reference_state(state, i); state->active_locks--; /* Reassign active lock (id, ptr). 
*/ state->active_lock_id = prev_id; state->active_lock_ptr = prev_ptr; return 0; } if (state->refs[i].type & REF_TYPE_LOCK_MASK) { prev_id = state->refs[i].id; prev_ptr = state->refs[i].ptr; } } return -EINVAL; } static int release_irq_state(struct bpf_verifier_state *state, int id) { u32 prev_id = 0; int i; if (id != state->active_irq_id) return -EACCES; for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].type != REF_TYPE_IRQ) continue; if (state->refs[i].id == id) { release_reference_state(state, i); state->active_irq_id = prev_id; return 0; } else { prev_id = state->refs[i].id; } } return -EINVAL; } static struct bpf_reference_state *find_lock_state(struct bpf_verifier_state *state, enum ref_state_type type, int id, void *ptr) { int i; for (i = 0; i < state->acquired_refs; i++) { struct bpf_reference_state *s = &state->refs[i]; if (!(s->type & type)) continue; if (s->id == id && s->ptr == ptr) return s; } return NULL; } static void free_func_state(struct bpf_func_state *state) { if (!state) return; kfree(state->stack); kfree(state); } void bpf_clear_jmp_history(struct bpf_verifier_state *state) { kfree(state->jmp_history); state->jmp_history = NULL; state->jmp_history_cnt = 0; } void bpf_free_verifier_state(struct bpf_verifier_state *state, bool free_self) { int i; for (i = 0; i <= state->curframe; i++) { free_func_state(state->frame[i]); state->frame[i] = NULL; } kfree(state->refs); bpf_clear_jmp_history(state); if (free_self) kfree(state); } /* copy verifier state from src to dst growing dst stack space * when necessary to accommodate larger src stack */ static int copy_func_state(struct bpf_func_state *dst, const struct bpf_func_state *src) { memcpy(dst, src, offsetof(struct bpf_func_state, stack)); return copy_stack_state(dst, src); } int bpf_copy_verifier_state(struct bpf_verifier_state *dst_state, const struct bpf_verifier_state *src) { struct bpf_func_state *dst; int i, err; dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history, src->jmp_history_cnt, sizeof(*dst_state->jmp_history), GFP_KERNEL_ACCOUNT); if (!dst_state->jmp_history) return -ENOMEM; dst_state->jmp_history_cnt = src->jmp_history_cnt; /* if dst has more stack frames then src frame, free them, this is also * necessary in case of exceptional exits using bpf_throw. 
*/ for (i = src->curframe + 1; i <= dst_state->curframe; i++) { free_func_state(dst_state->frame[i]); dst_state->frame[i] = NULL; } err = copy_reference_state(dst_state, src); if (err) return err; dst_state->speculative = src->speculative; dst_state->in_sleepable = src->in_sleepable; dst_state->curframe = src->curframe; dst_state->branches = src->branches; dst_state->parent = src->parent; dst_state->first_insn_idx = src->first_insn_idx; dst_state->last_insn_idx = src->last_insn_idx; dst_state->dfs_depth = src->dfs_depth; dst_state->callback_unroll_depth = src->callback_unroll_depth; dst_state->may_goto_depth = src->may_goto_depth; dst_state->equal_state = src->equal_state; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { dst = kzalloc_obj(*dst, GFP_KERNEL_ACCOUNT); if (!dst) return -ENOMEM; dst_state->frame[i] = dst; } err = copy_func_state(dst, src->frame[i]); if (err) return err; } return 0; } static u32 state_htab_size(struct bpf_verifier_env *env) { return env->prog->len; } struct list_head *bpf_explored_state(struct bpf_verifier_env *env, int idx) { struct bpf_verifier_state *cur = env->cur_state; struct bpf_func_state *state = cur->frame[cur->curframe]; return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)]; } static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_state *b) { int fr; if (a->curframe != b->curframe) return false; for (fr = a->curframe; fr >= 0; fr--) if (a->frame[fr]->callsite != b->frame[fr]->callsite) return false; return true; } void bpf_free_backedges(struct bpf_scc_visit *visit) { struct bpf_scc_backedge *backedge, *next; for (backedge = visit->backedges; backedge; backedge = next) { bpf_free_verifier_state(&backedge->state, false); next = backedge->next; kfree(backedge); } visit->backedges = NULL; } static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, int *insn_idx, bool pop_log) { struct bpf_verifier_state *cur = env->cur_state; struct bpf_verifier_stack_elem *elem, *head = env->head; int err; if (env->head == NULL) return -ENOENT; if (cur) { err = bpf_copy_verifier_state(cur, &head->st); if (err) return err; } if (pop_log) bpf_vlog_reset(&env->log, head->log_pos); if (insn_idx) *insn_idx = head->insn_idx; if (prev_insn_idx) *prev_insn_idx = head->prev_insn_idx; elem = head->next; bpf_free_verifier_state(&head->st, false); kfree(head); env->head = elem; env->stack_size--; return 0; } static bool error_recoverable_with_nospec(int err) { /* Should only return true for non-fatal errors that are allowed to * occur during speculative verification. For these we can insert a * nospec and the program might still be accepted. Do not include * something like ENOMEM because it is likely to re-occur for the next * architectural path once it has been recovered-from in all speculative * paths. 
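 *
 * Example (illustrative): if a speculative path produces an out-of-bounds
 * access and the check fails with -EACCES, the offending instruction can be
 * guarded with a nospec barrier and verification of the architectural paths
 * continues; an -ENOMEM on that same path would abort verification instead.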
*/ return err == -EPERM || err == -EACCES || err == -EINVAL; } static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx, bool speculative) { struct bpf_verifier_state *cur = env->cur_state; struct bpf_verifier_stack_elem *elem; int err; elem = kzalloc_obj(struct bpf_verifier_stack_elem, GFP_KERNEL_ACCOUNT); if (!elem) return ERR_PTR(-ENOMEM); elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; elem->next = env->head; elem->log_pos = env->log.end_pos; env->head = elem; env->stack_size++; err = bpf_copy_verifier_state(&elem->st, cur); if (err) return ERR_PTR(-ENOMEM); elem->st.speculative |= speculative; if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) { verbose(env, "The sequence of %d jumps is too complex.\n", env->stack_size); return ERR_PTR(-E2BIG); } if (elem->st.parent) { ++elem->st.parent->branches; /* WARN_ON(branches > 2) technically makes sense here, * but * 1. speculative states will bump 'branches' for non-branch * instructions * 2. is_state_visited() heuristics may decide not to create * a new state for a sequence of branches and all such current * and cloned states will be pointing to a single parent state * which might have large 'branches' count. */ } return &elem->st; } static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; /* This helper doesn't clear reg->id */ static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm) { reg->var_off = tnum_const(imm); reg->smin_value = (s64)imm; reg->smax_value = (s64)imm; reg->umin_value = imm; reg->umax_value = imm; reg->s32_min_value = (s32)imm; reg->s32_max_value = (s32)imm; reg->u32_min_value = (u32)imm; reg->u32_max_value = (u32)imm; } /* Mark the unknown part of a register (variable offset or scalar value) as * known to have the value @imm. */ static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) { /* Clear off and union(map_ptr, range) */ memset(((u8 *)reg) + sizeof(reg->type), 0, offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type)); reg->id = 0; reg->ref_obj_id = 0; ___mark_reg_known(reg, imm); } static void __mark_reg32_known(struct bpf_reg_state *reg, u64 imm) { reg->var_off = tnum_const_subreg(reg->var_off, imm); reg->s32_min_value = (s32)imm; reg->s32_max_value = (s32)imm; reg->u32_min_value = (u32)imm; reg->u32_max_value = (u32)imm; } /* Mark the 'variable offset' part of a register as zero. This should be * used only on registers holding a pointer type. */ static void __mark_reg_known_zero(struct bpf_reg_state *reg) { __mark_reg_known(reg, 0); } static void __mark_reg_const_zero(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) { __mark_reg_known(reg, 0); reg->type = SCALAR_VALUE; /* all scalars are assumed imprecise initially (unless unprivileged, * in which case everything is forced to be precise) */ reg->precise = !env->bpf_capable; } static void mark_reg_known_zero(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno) { __mark_reg_known_zero(regs + regno); } static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type, bool first_slot, int dynptr_id) { /* reg->type has no meaning for STACK_DYNPTR, but when we set reg for * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply * set it unconditionally as it is ignored for STACK_DYNPTR anyway. */ __mark_reg_known_zero(reg); reg->type = CONST_PTR_TO_DYNPTR; /* Give each dynptr a unique id to uniquely associate slices to it. 
 */
	reg->id = dynptr_id;
	reg->dynptr.type = type;
	reg->dynptr.first_slot = first_slot;
}

static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
{
	if (base_type(reg->type) == PTR_TO_MAP_VALUE) {
		const struct bpf_map *map = reg->map_ptr;

		if (map->inner_map_meta) {
			reg->type = CONST_PTR_TO_MAP;
			reg->map_ptr = map->inner_map_meta;
			/* transfer reg's id which is unique for every map_lookup_elem
			 * as UID of the inner map.
			 */
			if (btf_record_has_field(map->inner_map_meta->record,
						 BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) {
				reg->map_uid = reg->id;
			}
		} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
			reg->type = PTR_TO_XDP_SOCK;
		} else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
			   map->map_type == BPF_MAP_TYPE_SOCKHASH) {
			reg->type = PTR_TO_SOCKET;
		} else {
			reg->type = PTR_TO_MAP_VALUE;
		}
		return;
	}
	reg->type &= ~PTR_MAYBE_NULL;
}

static void mark_reg_graph_node(struct bpf_reg_state *regs, u32 regno,
				struct btf_field_graph_root *ds_head)
{
	__mark_reg_known(&regs[regno], ds_head->node_offset);
	regs[regno].type = PTR_TO_BTF_ID | MEM_ALLOC;
	regs[regno].btf = ds_head->btf;
	regs[regno].btf_id = ds_head->value_btf_id;
}

static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
{
	return type_is_pkt_pointer(reg->type);
}

static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
{
	return reg_is_pkt_pointer(reg) ||
	       reg->type == PTR_TO_PACKET_END;
}

static bool reg_is_dynptr_slice_pkt(const struct bpf_reg_state *reg)
{
	return base_type(reg->type) == PTR_TO_MEM &&
	       (reg->type & (DYNPTR_TYPE_SKB | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META));
}

/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */
static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
				    enum bpf_reg_type which)
{
	/* The register can already have a range from prior markings.
	 * This is fine as long as it hasn't been advanced from its
	 * origin.
*/ return reg->type == which && reg->id == 0 && tnum_equals_const(reg->var_off, 0); } /* Reset the min/max bounds of a register */ static void __mark_reg_unbounded(struct bpf_reg_state *reg) { reg->smin_value = S64_MIN; reg->smax_value = S64_MAX; reg->umin_value = 0; reg->umax_value = U64_MAX; reg->s32_min_value = S32_MIN; reg->s32_max_value = S32_MAX; reg->u32_min_value = 0; reg->u32_max_value = U32_MAX; } static void __mark_reg64_unbounded(struct bpf_reg_state *reg) { reg->smin_value = S64_MIN; reg->smax_value = S64_MAX; reg->umin_value = 0; reg->umax_value = U64_MAX; } static void __mark_reg32_unbounded(struct bpf_reg_state *reg) { reg->s32_min_value = S32_MIN; reg->s32_max_value = S32_MAX; reg->u32_min_value = 0; reg->u32_max_value = U32_MAX; } static void reset_reg64_and_tnum(struct bpf_reg_state *reg) { __mark_reg64_unbounded(reg); reg->var_off = tnum_unknown; } static void reset_reg32_and_tnum(struct bpf_reg_state *reg) { __mark_reg32_unbounded(reg); reg->var_off = tnum_unknown; } static void __update_reg32_bounds(struct bpf_reg_state *reg) { struct tnum var32_off = tnum_subreg(reg->var_off); /* min signed is max(sign bit) | min(other bits) */ reg->s32_min_value = max_t(s32, reg->s32_min_value, var32_off.value | (var32_off.mask & S32_MIN)); /* max signed is min(sign bit) | max(other bits) */ reg->s32_max_value = min_t(s32, reg->s32_max_value, var32_off.value | (var32_off.mask & S32_MAX)); reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)var32_off.value); reg->u32_max_value = min(reg->u32_max_value, (u32)(var32_off.value | var32_off.mask)); } static void __update_reg64_bounds(struct bpf_reg_state *reg) { u64 tnum_next, tmax; bool umin_in_tnum; /* min signed is max(sign bit) | min(other bits) */ reg->smin_value = max_t(s64, reg->smin_value, reg->var_off.value | (reg->var_off.mask & S64_MIN)); /* max signed is min(sign bit) | max(other bits) */ reg->smax_value = min_t(s64, reg->smax_value, reg->var_off.value | (reg->var_off.mask & S64_MAX)); reg->umin_value = max(reg->umin_value, reg->var_off.value); reg->umax_value = min(reg->umax_value, reg->var_off.value | reg->var_off.mask); /* Check if u64 and tnum overlap in a single value */ tnum_next = tnum_step(reg->var_off, reg->umin_value); umin_in_tnum = (reg->umin_value & ~reg->var_off.mask) == reg->var_off.value; tmax = reg->var_off.value | reg->var_off.mask; if (umin_in_tnum && tnum_next > reg->umax_value) { /* The u64 range and the tnum only overlap in umin. * u64: ---[xxxxxx]----- * tnum: --xx----------x- */ ___mark_reg_known(reg, reg->umin_value); } else if (!umin_in_tnum && tnum_next == tmax) { /* The u64 range and the tnum only overlap in the maximum value * represented by the tnum, called tmax. * u64: ---[xxxxxx]----- * tnum: xx-----x-------- */ ___mark_reg_known(reg, tmax); } else if (!umin_in_tnum && tnum_next <= reg->umax_value && tnum_step(reg->var_off, tnum_next) > reg->umax_value) { /* The u64 range and the tnum only overlap in between umin * (excluded) and umax. * u64: ---[xxxxxx]----- * tnum: xx----x-------x- */ ___mark_reg_known(reg, tnum_next); } } static void __update_reg_bounds(struct bpf_reg_state *reg) { __update_reg32_bounds(reg); __update_reg64_bounds(reg); } /* Uses signed min/max values to inform unsigned, and vice-versa */ static void deduce_bounds_32_from_64(struct bpf_reg_state *reg) { /* If upper 32 bits of u64/s64 range don't change, we can use lower 32 * bits to improve our u32/s32 boundaries. 
* * E.g., the case where we have upper 32 bits as zero ([10, 20] in * u64) is pretty trivial, it's obvious that in u32 we'll also have * [10, 20] range. But this property holds for any 64-bit range as * long as upper 32 bits in that entire range of values stay the same. * * E.g., u64 range [0x10000000A, 0x10000000F] ([4294967306, 4294967311] * in decimal) has the same upper 32 bits throughout all the values in * that range. As such, lower 32 bits form a valid [0xA, 0xF] ([10, 15]) * range. * * Note also, that [0xA, 0xF] is a valid range both in u32 and in s32, * following the rules outlined below about u64/s64 correspondence * (which equally applies to u32 vs s32 correspondence). In general it * depends on actual hexadecimal values of 32-bit range. They can form * only valid u32, or only valid s32 ranges in some cases. * * So we use all these insights to derive bounds for subregisters here. */ if ((reg->umin_value >> 32) == (reg->umax_value >> 32)) { /* u64 to u32 casting preserves validity of low 32 bits as * a range, if upper 32 bits are the same */ reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->umin_value); reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->umax_value); if ((s32)reg->umin_value <= (s32)reg->umax_value) { reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value); reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value); } } if ((reg->smin_value >> 32) == (reg->smax_value >> 32)) { /* low 32 bits should form a proper u32 range */ if ((u32)reg->smin_value <= (u32)reg->smax_value) { reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->smin_value); reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->smax_value); } /* low 32 bits should form a proper s32 range */ if ((s32)reg->smin_value <= (s32)reg->smax_value) { reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value); reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value); } } /* Special case where upper bits form a small sequence of two * sequential numbers (in 32-bit unsigned space, so 0xffffffff to * 0x00000000 is also valid), while lower bits form a proper s32 range * going from negative numbers to positive numbers. E.g., let's say we * have s64 range [-1, 1] ([0xffffffffffffffff, 0x0000000000000001]). * Possible s64 values are {-1, 0, 1} ({0xffffffffffffffff, * 0x0000000000000000, 0x00000000000001}). Ignoring upper 32 bits, * we still get a valid s32 range [-1, 1] ([0xffffffff, 0x00000001]). * Note that it doesn't have to be 0xffffffff going to 0x00000000 in * upper 32 bits. As a random example, s64 range * [0xfffffff0fffffff0; 0xfffffff100000010], forms a valid s32 range * [-16, 16] ([0xfffffff0; 0x00000010]) in its 32 bit subregister. 
*/ if ((u32)(reg->umin_value >> 32) + 1 == (u32)(reg->umax_value >> 32) && (s32)reg->umin_value < 0 && (s32)reg->umax_value >= 0) { reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value); reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value); } if ((u32)(reg->smin_value >> 32) + 1 == (u32)(reg->smax_value >> 32) && (s32)reg->smin_value < 0 && (s32)reg->smax_value >= 0) { reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value); reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value); } } static void deduce_bounds_32_from_32(struct bpf_reg_state *reg) { /* if u32 range forms a valid s32 range (due to matching sign bit), * try to learn from that */ if ((s32)reg->u32_min_value <= (s32)reg->u32_max_value) { reg->s32_min_value = max_t(s32, reg->s32_min_value, reg->u32_min_value); reg->s32_max_value = min_t(s32, reg->s32_max_value, reg->u32_max_value); } /* If we cannot cross the sign boundary, then signed and unsigned bounds * are the same, so combine. This works even in the negative case, e.g. * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. */ if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) { reg->u32_min_value = max_t(u32, reg->s32_min_value, reg->u32_min_value); reg->u32_max_value = min_t(u32, reg->s32_max_value, reg->u32_max_value); } else { if (reg->u32_max_value < (u32)reg->s32_min_value) { /* See __reg64_deduce_bounds() for detailed explanation. * Refine ranges in the following situation: * * 0 U32_MAX * | [xxxxxxxxxxxxxx u32 range xxxxxxxxxxxxxx] | * |----------------------------|----------------------------| * |xxxxx s32 range xxxxxxxxx] [xxxxxxx| * 0 S32_MAX S32_MIN -1 */ reg->s32_min_value = (s32)reg->u32_min_value; reg->u32_max_value = min_t(u32, reg->u32_max_value, reg->s32_max_value); } else if ((u32)reg->s32_max_value < reg->u32_min_value) { /* * 0 U32_MAX * | [xxxxxxxxxxxxxx u32 range xxxxxxxxxxxxxx] | * |----------------------------|----------------------------| * |xxxxxxxxx] [xxxxxxxxxxxx s32 range | * 0 S32_MAX S32_MIN -1 */ reg->s32_max_value = (s32)reg->u32_max_value; reg->u32_min_value = max_t(u32, reg->u32_min_value, reg->s32_min_value); } } } static void deduce_bounds_64_from_64(struct bpf_reg_state *reg) { /* If u64 range forms a valid s64 range (due to matching sign bit), * try to learn from that. Let's do a bit of ASCII art to see when * this is happening. Let's take u64 range first: * * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX * |-------------------------------|--------------------------------| * * Valid u64 range is formed when umin and umax are anywhere in the * range [0, U64_MAX], and umin <= umax. u64 case is simple and * straightforward. Let's see how s64 range maps onto the same range * of values, annotated below the line for comparison: * * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX * |-------------------------------|--------------------------------| * 0 S64_MAX S64_MIN -1 * * So s64 values basically start in the middle and they are logically * contiguous to the right of it, wrapping around from -1 to 0, and * then finishing as S64_MAX (0x7fffffffffffffff) right before * S64_MIN. We can try drawing the continuity of u64 vs s64 values * more visually as mapped to sign-agnostic range of hex values. 
* * u64 start u64 end * _______________________________________________________________ * / \ * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX * |-------------------------------|--------------------------------| * 0 S64_MAX S64_MIN -1 * / \ * >------------------------------ -------------------------------> * s64 continues... s64 end s64 start s64 "midpoint" * * What this means is that, in general, we can't always derive * something new about u64 from any random s64 range, and vice versa. * * But we can do that in two particular cases. One is when entire * u64/s64 range is *entirely* contained within left half of the above * diagram or when it is *entirely* contained in the right half. I.e.: * * |-------------------------------|--------------------------------| * ^ ^ ^ ^ * A B C D * * [A, B] and [C, D] are contained entirely in their respective halves * and form valid contiguous ranges as both u64 and s64 values. [A, B] * will be non-negative both as u64 and s64 (and in fact it will be * identical ranges no matter the signedness). [C, D] treated as s64 * will be a range of negative values, while in u64 it will be * non-negative range of values larger than 0x8000000000000000. * * Now, any other range here can't be represented in both u64 and s64 * simultaneously. E.g., [A, C], [A, D], [B, C], [B, D] are valid * contiguous u64 ranges, but they are discontinuous in s64. [B, C] * in s64 would be properly presented as [S64_MIN, C] and [B, S64_MAX], * for example. Similarly, valid s64 range [D, A] (going from negative * to positive values), would be two separate [D, U64_MAX] and [0, A] * ranges as u64. Currently reg_state can't represent two segments per * numeric domain, so in such situations we can only derive maximal * possible range ([0, U64_MAX] for u64, and [S64_MIN, S64_MAX] for s64). * * So we use these facts to derive umin/umax from smin/smax and vice * versa only if they stay within the same "half". This is equivalent * to checking sign bit: lower half will have sign bit as zero, upper * half have sign bit 1. Below in code we simplify this by just * casting umin/umax as smin/smax and checking if they form valid * range, and vice versa. Those are equivalent checks. */ if ((s64)reg->umin_value <= (s64)reg->umax_value) { reg->smin_value = max_t(s64, reg->smin_value, reg->umin_value); reg->smax_value = min_t(s64, reg->smax_value, reg->umax_value); } /* If we cannot cross the sign boundary, then signed and unsigned bounds * are the same, so combine. This works even in the negative case, e.g. * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. */ if ((u64)reg->smin_value <= (u64)reg->smax_value) { reg->umin_value = max_t(u64, reg->smin_value, reg->umin_value); reg->umax_value = min_t(u64, reg->smax_value, reg->umax_value); } else { /* If the s64 range crosses the sign boundary, then it's split * between the beginning and end of the U64 domain. In that * case, we can derive new bounds if the u64 range overlaps * with only one end of the s64 range. * * In the following example, the u64 range overlaps only with * positive portion of the s64 range. * * 0 U64_MAX * | [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx] | * |----------------------------|----------------------------| * |xxxxx s64 range xxxxxxxxx] [xxxxxxx| * 0 S64_MAX S64_MIN -1 * * We can thus derive the following new s64 and u64 ranges. 
* * 0 U64_MAX * | [xxxxxx u64 range xxxxx] | * |----------------------------|----------------------------| * | [xxxxxx s64 range xxxxx] | * 0 S64_MAX S64_MIN -1 * * If they overlap in two places, we can't derive anything * because reg_state can't represent two ranges per numeric * domain. * * 0 U64_MAX * | [xxxxxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxxxxx] | * |----------------------------|----------------------------| * |xxxxx s64 range xxxxxxxxx] [xxxxxxxxxx| * 0 S64_MAX S64_MIN -1 * * The first condition below corresponds to the first diagram * above. */ if (reg->umax_value < (u64)reg->smin_value) { reg->smin_value = (s64)reg->umin_value; reg->umax_value = min_t(u64, reg->umax_value, reg->smax_value); } else if ((u64)reg->smax_value < reg->umin_value) { /* This second condition considers the case where the u64 range * overlaps with the negative portion of the s64 range: * * 0 U64_MAX * | [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx] | * |----------------------------|----------------------------| * |xxxxxxxxx] [xxxxxxxxxxxx s64 range | * 0 S64_MAX S64_MIN -1 */ reg->smax_value = (s64)reg->umax_value; reg->umin_value = max_t(u64, reg->umin_value, reg->smin_value); } } } static void deduce_bounds_64_from_32(struct bpf_reg_state *reg) { /* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit * values on both sides of 64-bit range in hope to have tighter range. * E.g., if r1 is [0x1'00000000, 0x3'80000000], and we learn from * 32-bit signed > 0 operation that s32 bounds are now [1; 0x7fffffff]. * With this, we can substitute 1 as low 32-bits of _low_ 64-bit bound * (0x100000000 -> 0x100000001) and 0x7fffffff as low 32-bits of * _high_ 64-bit bound (0x380000000 -> 0x37fffffff) and arrive at a * better overall bounds for r1 as [0x1'000000001; 0x3'7fffffff]. * We just need to make sure that derived bounds we are intersecting * with are well-formed ranges in respective s64 or u64 domain, just * like we do with similar kinds of 32-to-64 or 64-to-32 adjustments. */ __u64 new_umin, new_umax; __s64 new_smin, new_smax; /* u32 -> u64 tightening, it's always well-formed */ new_umin = (reg->umin_value & ~0xffffffffULL) | reg->u32_min_value; new_umax = (reg->umax_value & ~0xffffffffULL) | reg->u32_max_value; reg->umin_value = max_t(u64, reg->umin_value, new_umin); reg->umax_value = min_t(u64, reg->umax_value, new_umax); /* u32 -> s64 tightening, u32 range embedded into s64 preserves range validity */ new_smin = (reg->smin_value & ~0xffffffffULL) | reg->u32_min_value; new_smax = (reg->smax_value & ~0xffffffffULL) | reg->u32_max_value; reg->smin_value = max_t(s64, reg->smin_value, new_smin); reg->smax_value = min_t(s64, reg->smax_value, new_smax); /* Here we would like to handle a special case after sign extending load, * when upper bits for a 64-bit range are all 1s or all 0s. 
	 *
	 * Upper bits are all 1s when register is in a range:
	 * [0xffff_ffff_0000_0000, 0xffff_ffff_ffff_ffff]
	 * Upper bits are all 0s when register is in a range:
	 * [0x0000_0000_0000_0000, 0x0000_0000_ffff_ffff]
	 * Together this forms a continuous range:
	 * [0xffff_ffff_0000_0000, 0x0000_0000_ffff_ffff]
	 *
	 * Now, suppose that register range is in fact tighter:
	 * [0xffff_ffff_8000_0000, 0x0000_0000_ffff_ffff] (R)
	 * Also suppose that its 32-bit range is positive,
	 * meaning that lower 32-bits of the full 64-bit register
	 * are in the range:
	 * [0x0000_0000, 0x7fff_ffff] (W)
	 *
	 * If this happens, then any value in a range:
	 * [0xffff_ffff_0000_0000, 0xffff_ffff_7fff_ffff]
	 * is smaller than the lowest bound of the range (R):
	 * 0xffff_ffff_8000_0000
	 * which means that upper bits of the full 64-bit register
	 * can't be all 1s when lower bits are in range (W).
	 *
	 * Note that:
	 * - 0xffff_ffff_8000_0000 == (s64)S32_MIN
	 * - 0x0000_0000_7fff_ffff == (s64)S32_MAX
	 * These relations are used in the conditions below.
	 */
	if (reg->s32_min_value >= 0 && reg->smin_value >= S32_MIN && reg->smax_value <= S32_MAX) {
		reg->smin_value = reg->s32_min_value;
		reg->smax_value = reg->s32_max_value;
		reg->umin_value = reg->s32_min_value;
		reg->umax_value = reg->s32_max_value;
		reg->var_off = tnum_intersect(reg->var_off, tnum_range(reg->smin_value, reg->smax_value));
	}
}

static void __reg_deduce_bounds(struct bpf_reg_state *reg)
{
	deduce_bounds_64_from_64(reg);
	deduce_bounds_32_from_64(reg);
	deduce_bounds_32_from_32(reg);
	deduce_bounds_64_from_32(reg);
}

/* Attempts to improve var_off based on unsigned min/max information */
static void __reg_bound_offset(struct bpf_reg_state *reg)
{
	struct tnum var64_off = tnum_intersect(reg->var_off, tnum_range(reg->umin_value, reg->umax_value));
	struct tnum var32_off = tnum_intersect(tnum_subreg(var64_off), tnum_range(reg->u32_min_value, reg->u32_max_value));
	reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off);
}

static bool range_bounds_violation(struct bpf_reg_state *reg);

static void reg_bounds_sync(struct bpf_reg_state *reg)
{
	/* If the input reg_state is invalid, we can exit early */
	if (range_bounds_violation(reg))
		return;
	/* We might have learned new bounds from the var_off. */
	__update_reg_bounds(reg);
	/* We might have learned something about the sign bit. */
	__reg_deduce_bounds(reg);
	__reg_deduce_bounds(reg);
	/* We might have learned some bits from the bounds. */
	__reg_bound_offset(reg);
	/* Intersecting with the old var_off might have improved our bounds
	 * slightly, e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
	 * then new var_off is (0; 0x7f...fc) which improves our umax.
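	 * As one more worked example: with umin_value == 0x40 and umax_value ==
	 * 0x7f, tnum_range() yields (0x40; 0x3f), i.e. bit 6 known set, bits
	 * 0-5 unknown, all higher bits known zero; intersecting that with the
	 * previous var_off can in turn let the final __update_reg_bounds()
	 * call below tighten umin/umax once more.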
*/ __update_reg_bounds(reg); } static bool range_bounds_violation(struct bpf_reg_state *reg) { return (reg->umin_value > reg->umax_value || reg->smin_value > reg->smax_value || reg->u32_min_value > reg->u32_max_value || reg->s32_min_value > reg->s32_max_value); } static bool const_tnum_range_mismatch(struct bpf_reg_state *reg) { u64 uval = reg->var_off.value; s64 sval = (s64)uval; if (!tnum_is_const(reg->var_off)) return false; return reg->umin_value != uval || reg->umax_value != uval || reg->smin_value != sval || reg->smax_value != sval; } static bool const_tnum_range_mismatch_32(struct bpf_reg_state *reg) { u32 uval32 = tnum_subreg(reg->var_off).value; s32 sval32 = (s32)uval32; if (!tnum_subreg_is_const(reg->var_off)) return false; return reg->u32_min_value != uval32 || reg->u32_max_value != uval32 || reg->s32_min_value != sval32 || reg->s32_max_value != sval32; } static int reg_bounds_sanity_check(struct bpf_verifier_env *env, struct bpf_reg_state *reg, const char *ctx) { const char *msg; if (range_bounds_violation(reg)) { msg = "range bounds violation"; goto out; } if (const_tnum_range_mismatch(reg)) { msg = "const tnum out of sync with range bounds"; goto out; } if (const_tnum_range_mismatch_32(reg)) { msg = "const subreg tnum out of sync with range bounds"; goto out; } return 0; out: verifier_bug(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] " "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)", ctx, msg, reg->umin_value, reg->umax_value, reg->smin_value, reg->smax_value, reg->u32_min_value, reg->u32_max_value, reg->s32_min_value, reg->s32_max_value, reg->var_off.value, reg->var_off.mask); if (env->test_reg_invariants) return -EFAULT; __mark_reg_unbounded(reg); return 0; } static bool __reg32_bound_s64(s32 a) { return a >= 0 && a <= S32_MAX; } static void __reg_assign_32_into_64(struct bpf_reg_state *reg) { reg->umin_value = reg->u32_min_value; reg->umax_value = reg->u32_max_value; /* Attempt to pull 32-bit signed bounds into 64-bit bounds but must * be positive otherwise set to worse case bounds and refine later * from tnum. */ if (__reg32_bound_s64(reg->s32_min_value) && __reg32_bound_s64(reg->s32_max_value)) { reg->smin_value = reg->s32_min_value; reg->smax_value = reg->s32_max_value; } else { reg->smin_value = 0; reg->smax_value = U32_MAX; } } /* Mark a register as having a completely unknown (scalar) value. */ void bpf_mark_reg_unknown_imprecise(struct bpf_reg_state *reg) { /* * Clear type, off, and union(map_ptr, range) and * padding between 'type' and union */ memset(reg, 0, offsetof(struct bpf_reg_state, var_off)); reg->type = SCALAR_VALUE; reg->id = 0; reg->ref_obj_id = 0; reg->var_off = tnum_unknown; reg->frameno = 0; reg->precise = false; __mark_reg_unbounded(reg); } /* Mark a register as having a completely unknown (scalar) value, * initialize .precise as true when not bpf capable. 
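 * (Precision backtracking is skipped for programs loaded without
 * bpf_capable, so their scalars are conservatively treated as precise
 * from the start.)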
*/ static void __mark_reg_unknown(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) { bpf_mark_reg_unknown_imprecise(reg); reg->precise = !env->bpf_capable; } static void mark_reg_unknown(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno) { __mark_reg_unknown(env, regs + regno); } static int __mark_reg_s32_range(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno, s32 s32_min, s32 s32_max) { struct bpf_reg_state *reg = regs + regno; reg->s32_min_value = max_t(s32, reg->s32_min_value, s32_min); reg->s32_max_value = min_t(s32, reg->s32_max_value, s32_max); reg->smin_value = max_t(s64, reg->smin_value, s32_min); reg->smax_value = min_t(s64, reg->smax_value, s32_max); reg_bounds_sync(reg); return reg_bounds_sanity_check(env, reg, "s32_range"); } void bpf_mark_reg_not_init(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) { __mark_reg_unknown(env, reg); reg->type = NOT_INIT; } static int mark_btf_ld_reg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno, enum bpf_reg_type reg_type, struct btf *btf, u32 btf_id, enum bpf_type_flag flag) { switch (reg_type) { case SCALAR_VALUE: mark_reg_unknown(env, regs, regno); return 0; case PTR_TO_BTF_ID: mark_reg_known_zero(env, regs, regno); regs[regno].type = PTR_TO_BTF_ID | flag; regs[regno].btf = btf; regs[regno].btf_id = btf_id; if (type_may_be_null(flag)) regs[regno].id = ++env->id_gen; return 0; case PTR_TO_MEM: mark_reg_known_zero(env, regs, regno); regs[regno].type = PTR_TO_MEM | flag; regs[regno].mem_size = 0; return 0; default: verifier_bug(env, "unexpected reg_type %d in %s\n", reg_type, __func__); return -EFAULT; } } #define DEF_NOT_SUBREG (0) static void init_reg_state(struct bpf_verifier_env *env, struct bpf_func_state *state) { struct bpf_reg_state *regs = state->regs; int i; for (i = 0; i < MAX_BPF_REG; i++) { bpf_mark_reg_not_init(env, ®s[i]); regs[i].subreg_def = DEF_NOT_SUBREG; } /* frame pointer */ regs[BPF_REG_FP].type = PTR_TO_STACK; mark_reg_known_zero(env, regs, BPF_REG_FP); regs[BPF_REG_FP].frameno = state->frameno; } static struct bpf_retval_range retval_range(s32 minval, s32 maxval) { /* * return_32bit is set to false by default and set explicitly * by the caller when necessary. */ return (struct bpf_retval_range){ minval, maxval, false }; } static void init_func_state(struct bpf_verifier_env *env, struct bpf_func_state *state, int callsite, int frameno, int subprogno) { state->callsite = callsite; state->frameno = frameno; state->subprogno = subprogno; state->callback_ret_range = retval_range(0, 0); init_reg_state(env, state); mark_verifier_state_scratched(env); } /* Similar to push_stack(), but for async callbacks */ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx, int subprog, bool is_sleepable) { struct bpf_verifier_stack_elem *elem; struct bpf_func_state *frame; elem = kzalloc_obj(struct bpf_verifier_stack_elem, GFP_KERNEL_ACCOUNT); if (!elem) return ERR_PTR(-ENOMEM); elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; elem->next = env->head; elem->log_pos = env->log.end_pos; env->head = elem; env->stack_size++; if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) { verbose(env, "The sequence of %d jumps is too complex for async cb.\n", env->stack_size); return ERR_PTR(-E2BIG); } /* Unlike push_stack() do not bpf_copy_verifier_state(). * The caller state doesn't matter. * This is async callback. It starts in a fresh stack. * Initialize it similar to do_check_common(). 
*/ elem->st.branches = 1; elem->st.in_sleepable = is_sleepable; frame = kzalloc_obj(*frame, GFP_KERNEL_ACCOUNT); if (!frame) return ERR_PTR(-ENOMEM); init_func_state(env, frame, BPF_MAIN_FUNC /* callsite */, 0 /* frameno within this callchain */, subprog /* subprog number within this prog */); elem->st.frame[0] = frame; return &elem->st; } static int cmp_subprogs(const void *a, const void *b) { return ((struct bpf_subprog_info *)a)->start - ((struct bpf_subprog_info *)b)->start; } /* Find subprogram that contains instruction at 'off' */ struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *vals = env->subprog_info; int l, r, m; if (off >= env->prog->len || off < 0 || env->subprog_cnt == 0) return NULL; l = 0; r = env->subprog_cnt - 1; while (l < r) { m = l + (r - l + 1) / 2; if (vals[m].start <= off) l = m; else r = m - 1; } return &vals[l]; } /* Find subprogram that starts exactly at 'off' */ int bpf_find_subprog(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *p; p = bpf_find_containing_subprog(env, off); if (!p || p->start != off) return -ENOENT; return p - env->subprog_info; } static int add_subprog(struct bpf_verifier_env *env, int off) { int insn_cnt = env->prog->len; int ret; if (off >= insn_cnt || off < 0) { verbose(env, "call to invalid destination\n"); return -EINVAL; } ret = bpf_find_subprog(env, off); if (ret >= 0) return ret; if (env->subprog_cnt >= BPF_MAX_SUBPROGS) { verbose(env, "too many subprograms\n"); return -E2BIG; } /* determine subprog starts. The end is one before the next starts */ env->subprog_info[env->subprog_cnt++].start = off; sort(env->subprog_info, env->subprog_cnt, sizeof(env->subprog_info[0]), cmp_subprogs, NULL); return env->subprog_cnt - 1; } static int bpf_find_exception_callback_insn_off(struct bpf_verifier_env *env) { struct bpf_prog_aux *aux = env->prog->aux; struct btf *btf = aux->btf; const struct btf_type *t; u32 main_btf_id, id; const char *name; int ret, i; /* Non-zero func_info_cnt implies valid btf */ if (!aux->func_info_cnt) return 0; main_btf_id = aux->func_info[0].type_id; t = btf_type_by_id(btf, main_btf_id); if (!t) { verbose(env, "invalid btf id for main subprog in func_info\n"); return -EINVAL; } name = btf_find_decl_tag_value(btf, t, -1, "exception_callback:"); if (IS_ERR(name)) { ret = PTR_ERR(name); /* If there is no tag present, there is no exception callback */ if (ret == -ENOENT) ret = 0; else if (ret == -EEXIST) verbose(env, "multiple exception callback tags for main subprog\n"); return ret; } ret = btf_find_by_name_kind(btf, name, BTF_KIND_FUNC); if (ret < 0) { verbose(env, "exception callback '%s' could not be found in BTF\n", name); return ret; } id = ret; t = btf_type_by_id(btf, id); if (btf_func_linkage(t) != BTF_FUNC_GLOBAL) { verbose(env, "exception callback '%s' must have global linkage\n", name); return -EINVAL; } ret = 0; for (i = 0; i < aux->func_info_cnt; i++) { if (aux->func_info[i].type_id != id) continue; ret = aux->func_info[i].insn_off; /* Further func_info and subprog checks will also happen * later, so assume this is the right insn_off for now. 
*/ if (!ret) { verbose(env, "invalid exception callback insn_off in func_info: 0\n"); ret = -EINVAL; } } if (!ret) { verbose(env, "exception callback type id not found in func_info\n"); ret = -EINVAL; } return ret; } #define MAX_KFUNC_BTFS 256 struct bpf_kfunc_btf { struct btf *btf; struct module *module; u16 offset; }; struct bpf_kfunc_btf_tab { struct bpf_kfunc_btf descs[MAX_KFUNC_BTFS]; u32 nr_descs; }; static int kfunc_desc_cmp_by_id_off(const void *a, const void *b) { const struct bpf_kfunc_desc *d0 = a; const struct bpf_kfunc_desc *d1 = b; /* func_id is not greater than BTF_MAX_TYPE */ return d0->func_id - d1->func_id ?: d0->offset - d1->offset; } static int kfunc_btf_cmp_by_off(const void *a, const void *b) { const struct bpf_kfunc_btf *d0 = a; const struct bpf_kfunc_btf *d1 = b; return d0->offset - d1->offset; } static struct bpf_kfunc_desc * find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset) { struct bpf_kfunc_desc desc = { .func_id = func_id, .offset = offset, }; struct bpf_kfunc_desc_tab *tab; tab = prog->aux->kfunc_tab; return bsearch(&desc, tab->descs, tab->nr_descs, sizeof(tab->descs[0]), kfunc_desc_cmp_by_id_off); } int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id, u16 btf_fd_idx, u8 **func_addr) { const struct bpf_kfunc_desc *desc; desc = find_kfunc_desc(prog, func_id, btf_fd_idx); if (!desc) return -EFAULT; *func_addr = (u8 *)desc->addr; return 0; } static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env, s16 offset) { struct bpf_kfunc_btf kf_btf = { .offset = offset }; struct bpf_kfunc_btf_tab *tab; struct bpf_kfunc_btf *b; struct module *mod; struct btf *btf; int btf_fd; tab = env->prog->aux->kfunc_btf_tab; b = bsearch(&kf_btf, tab->descs, tab->nr_descs, sizeof(tab->descs[0]), kfunc_btf_cmp_by_off); if (!b) { if (tab->nr_descs == MAX_KFUNC_BTFS) { verbose(env, "too many different module BTFs\n"); return ERR_PTR(-E2BIG); } if (bpfptr_is_null(env->fd_array)) { verbose(env, "kfunc offset > 0 without fd_array is invalid\n"); return ERR_PTR(-EPROTO); } if (copy_from_bpfptr_offset(&btf_fd, env->fd_array, offset * sizeof(btf_fd), sizeof(btf_fd))) return ERR_PTR(-EFAULT); btf = btf_get_by_fd(btf_fd); if (IS_ERR(btf)) { verbose(env, "invalid module BTF fd specified\n"); return btf; } if (!btf_is_module(btf)) { verbose(env, "BTF fd for kfunc is not a module BTF\n"); btf_put(btf); return ERR_PTR(-EINVAL); } mod = btf_try_get_module(btf); if (!mod) { btf_put(btf); return ERR_PTR(-ENXIO); } b = &tab->descs[tab->nr_descs++]; b->btf = btf; b->module = mod; b->offset = offset; /* sort() reorders entries by value, so b may no longer point * to the right entry after this */ sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), kfunc_btf_cmp_by_off, NULL); } else { btf = b->btf; } return btf; } void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab) { if (!tab) return; while (tab->nr_descs--) { module_put(tab->descs[tab->nr_descs].module); btf_put(tab->descs[tab->nr_descs].btf); } kfree(tab); } static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, s16 offset) { if (offset) { if (offset < 0) { /* In the future, this can be allowed to increase limit * of fd index into fd_array, interpreted as u16. 
*/ verbose(env, "negative offset disallowed for kernel module function call\n"); return ERR_PTR(-EINVAL); } return __find_kfunc_desc_btf(env, offset); } return btf_vmlinux ?: ERR_PTR(-ENOENT); } #define KF_IMPL_SUFFIX "_impl" static const struct btf_type *find_kfunc_impl_proto(struct bpf_verifier_env *env, struct btf *btf, const char *func_name) { char *buf = env->tmp_str_buf; const struct btf_type *func; s32 impl_id; int len; len = snprintf(buf, TMP_STR_BUF_LEN, "%s%s", func_name, KF_IMPL_SUFFIX); if (len < 0 || len >= TMP_STR_BUF_LEN) { verbose(env, "function name %s%s is too long\n", func_name, KF_IMPL_SUFFIX); return NULL; } impl_id = btf_find_by_name_kind(btf, buf, BTF_KIND_FUNC); if (impl_id <= 0) { verbose(env, "cannot find function %s in BTF\n", buf); return NULL; } func = btf_type_by_id(btf, impl_id); return btf_type_by_id(btf, func->type); } static int fetch_kfunc_meta(struct bpf_verifier_env *env, s32 func_id, s16 offset, struct bpf_kfunc_meta *kfunc) { const struct btf_type *func, *func_proto; const char *func_name; u32 *kfunc_flags; struct btf *btf; if (func_id <= 0) { verbose(env, "invalid kernel function btf_id %d\n", func_id); return -EINVAL; } btf = find_kfunc_desc_btf(env, offset); if (IS_ERR(btf)) { verbose(env, "failed to find BTF for kernel function\n"); return PTR_ERR(btf); } /* * Note that kfunc_flags may be NULL at this point, which * means that we couldn't find func_id in any relevant * kfunc_id_set. This most likely indicates an invalid kfunc * call. However we don't fail with an error here, * and let the caller decide what to do with NULL kfunc->flags. */ kfunc_flags = btf_kfunc_flags(btf, func_id, env->prog); func = btf_type_by_id(btf, func_id); if (!func || !btf_type_is_func(func)) { verbose(env, "kernel btf_id %d is not a function\n", func_id); return -EINVAL; } func_name = btf_name_by_offset(btf, func->name_off); /* * An actual prototype of a kfunc with KF_IMPLICIT_ARGS flag * can be found through the counterpart _impl kfunc. 
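 * E.g., for a hypothetical kfunc bpf_foo() declared with implicit
 * arguments, the verifier would look up bpf_foo_impl() and use that
 * function's prototype instead.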
*/ if (kfunc_flags && (*kfunc_flags & KF_IMPLICIT_ARGS)) func_proto = find_kfunc_impl_proto(env, btf, func_name); else func_proto = btf_type_by_id(btf, func->type); if (!func_proto || !btf_type_is_func_proto(func_proto)) { verbose(env, "kernel function btf_id %d does not have a valid func_proto\n", func_id); return -EINVAL; } memset(kfunc, 0, sizeof(*kfunc)); kfunc->btf = btf; kfunc->id = func_id; kfunc->name = func_name; kfunc->proto = func_proto; kfunc->flags = kfunc_flags; return 0; } int bpf_add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, u16 offset) { struct bpf_kfunc_btf_tab *btf_tab; struct btf_func_model func_model; struct bpf_kfunc_desc_tab *tab; struct bpf_prog_aux *prog_aux; struct bpf_kfunc_meta kfunc; struct bpf_kfunc_desc *desc; unsigned long addr; int err; prog_aux = env->prog->aux; tab = prog_aux->kfunc_tab; btf_tab = prog_aux->kfunc_btf_tab; if (!tab) { if (!btf_vmlinux) { verbose(env, "calling kernel function is not supported without CONFIG_DEBUG_INFO_BTF\n"); return -ENOTSUPP; } if (!env->prog->jit_requested) { verbose(env, "JIT is required for calling kernel function\n"); return -ENOTSUPP; } if (!bpf_jit_supports_kfunc_call()) { verbose(env, "JIT does not support calling kernel function\n"); return -ENOTSUPP; } if (!env->prog->gpl_compatible) { verbose(env, "cannot call kernel function from non-GPL compatible program\n"); return -EINVAL; } tab = kzalloc_obj(*tab, GFP_KERNEL_ACCOUNT); if (!tab) return -ENOMEM; prog_aux->kfunc_tab = tab; } /* func_id == 0 is always invalid, but instead of returning an error, be * conservative and wait until the code elimination pass before returning * error, so that invalid calls that get pruned out can be in BPF programs * loaded from userspace. It is also required that offset be untouched * for such calls. */ if (!func_id && !offset) return 0; if (!btf_tab && offset) { btf_tab = kzalloc_obj(*btf_tab, GFP_KERNEL_ACCOUNT); if (!btf_tab) return -ENOMEM; prog_aux->kfunc_btf_tab = btf_tab; } if (find_kfunc_desc(env->prog, func_id, offset)) return 0; if (tab->nr_descs == MAX_KFUNC_DESCS) { verbose(env, "too many different kernel function calls\n"); return -E2BIG; } err = fetch_kfunc_meta(env, func_id, offset, &kfunc); if (err) return err; addr = kallsyms_lookup_name(kfunc.name); if (!addr) { verbose(env, "cannot find address for kernel function %s\n", kfunc.name); return -EINVAL; } if (bpf_dev_bound_kfunc_id(func_id)) { err = bpf_dev_bound_kfunc_check(&env->log, prog_aux); if (err) return err; } err = btf_distill_func_proto(&env->log, kfunc.btf, kfunc.proto, kfunc.name, &func_model); if (err) return err; desc = &tab->descs[tab->nr_descs++]; desc->func_id = func_id; desc->offset = offset; desc->addr = addr; desc->func_model = func_model; sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), kfunc_desc_cmp_by_id_off, NULL); return 0; } bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog) { return !!prog->aux->kfunc_tab; } static int add_subprog_and_kfunc(struct bpf_verifier_env *env) { struct bpf_subprog_info *subprog = env->subprog_info; int i, ret, insn_cnt = env->prog->len, ex_cb_insn; struct bpf_insn *insn = env->prog->insnsi; /* Add entry function. 
*/ ret = add_subprog(env, 0); if (ret) return ret; for (i = 0; i < insn_cnt; i++, insn++) { if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn) && !bpf_pseudo_kfunc_call(insn)) continue; if (!env->bpf_capable) { verbose(env, "loading/calling other bpf or kernel functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); return -EPERM; } if (bpf_pseudo_func(insn) || bpf_pseudo_call(insn)) ret = add_subprog(env, i + insn->imm + 1); else ret = bpf_add_kfunc_call(env, insn->imm, insn->off); if (ret < 0) return ret; } ret = bpf_find_exception_callback_insn_off(env); if (ret < 0) return ret; ex_cb_insn = ret; /* If ex_cb_insn > 0, this means that the main program has a subprog * marked using BTF decl tag to serve as the exception callback. */ if (ex_cb_insn) { ret = add_subprog(env, ex_cb_insn); if (ret < 0) return ret; for (i = 1; i < env->subprog_cnt; i++) { if (env->subprog_info[i].start != ex_cb_insn) continue; env->exception_callback_subprog = i; bpf_mark_subprog_exc_cb(env, i); break; } } /* Add a fake 'exit' subprog which could simplify subprog iteration * logic. 'subprog_cnt' should not be increased. */ subprog[env->subprog_cnt].start = insn_cnt; if (env->log.level & BPF_LOG_LEVEL2) for (i = 0; i < env->subprog_cnt; i++) verbose(env, "func#%d @%d\n", i, subprog[i].start); return 0; } static int check_subprogs(struct bpf_verifier_env *env) { int i, subprog_start, subprog_end, off, cur_subprog = 0; struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; /* now check that all jumps are within the same subprog */ subprog_start = subprog[cur_subprog].start; subprog_end = subprog[cur_subprog + 1].start; for (i = 0; i < insn_cnt; i++) { u8 code = insn[i].code; if (code == (BPF_JMP | BPF_CALL) && insn[i].src_reg == 0 && insn[i].imm == BPF_FUNC_tail_call) { subprog[cur_subprog].has_tail_call = true; subprog[cur_subprog].tail_call_reachable = true; } if (BPF_CLASS(code) == BPF_LD && (BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND)) subprog[cur_subprog].has_ld_abs = true; if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) goto next; if (BPF_OP(code) == BPF_CALL) goto next; if (BPF_OP(code) == BPF_EXIT) { subprog[cur_subprog].exit_idx = i; goto next; } off = i + bpf_jmp_offset(&insn[i]) + 1; if (off < subprog_start || off >= subprog_end) { verbose(env, "jump out of range from insn %d to %d\n", i, off); return -EINVAL; } next: if (i == subprog_end - 1) { /* to avoid fall-through from one subprog into another * the last insn of the subprog should be either exit * or unconditional jump back or bpf_throw call */ if (code != (BPF_JMP | BPF_EXIT) && code != (BPF_JMP32 | BPF_JA) && code != (BPF_JMP | BPF_JA)) { verbose(env, "last insn is not an exit or jmp\n"); return -EINVAL; } subprog_start = subprog_end; cur_subprog++; if (cur_subprog < env->subprog_cnt) subprog_end = subprog[cur_subprog + 1].start; } } return 0; } /* * Sort subprogs in topological order so that leaf subprogs come first and * their callers come later. This is a DFS post-order traversal of the call * graph. Scan only reachable instructions (those in the computed postorder) of * the current subprog to discover callees (direct subprogs and sync * callbacks). 
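 * For example, if main() calls subprogs a() and b(), and a() in turn
 * calls c(), one valid resulting order is: c, a, b, main (leaves before
 * their callers).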
*/ static int sort_subprogs_topo(struct bpf_verifier_env *env) { struct bpf_subprog_info *si = env->subprog_info; int *insn_postorder = env->cfg.insn_postorder; struct bpf_insn *insn = env->prog->insnsi; int cnt = env->subprog_cnt; int *dfs_stack = NULL; int top = 0, order = 0; int i, ret = 0; u8 *color = NULL; color = kvzalloc_objs(*color, cnt, GFP_KERNEL_ACCOUNT); dfs_stack = kvmalloc_objs(*dfs_stack, cnt, GFP_KERNEL_ACCOUNT); if (!color || !dfs_stack) { ret = -ENOMEM; goto out; } /* * DFS post-order traversal. * Color values: 0 = unvisited, 1 = on stack, 2 = done. */ for (i = 0; i < cnt; i++) { if (color[i]) continue; color[i] = 1; dfs_stack[top++] = i; while (top > 0) { int cur = dfs_stack[top - 1]; int po_start = si[cur].postorder_start; int po_end = si[cur + 1].postorder_start; bool pushed = false; int j; for (j = po_start; j < po_end; j++) { int idx = insn_postorder[j]; int callee; if (!bpf_pseudo_call(&insn[idx]) && !bpf_pseudo_func(&insn[idx])) continue; callee = bpf_find_subprog(env, idx + insn[idx].imm + 1); if (callee < 0) { ret = -EFAULT; goto out; } if (color[callee] == 2) continue; if (color[callee] == 1) { if (bpf_pseudo_func(&insn[idx])) continue; verbose(env, "recursive call from %s() to %s()\n", subprog_name(env, cur), subprog_name(env, callee)); ret = -EINVAL; goto out; } color[callee] = 1; dfs_stack[top++] = callee; pushed = true; break; } if (!pushed) { color[cur] = 2; env->subprog_topo_order[order++] = cur; top--; } } } if (env->log.level & BPF_LOG_LEVEL2) for (i = 0; i < cnt; i++) verbose(env, "topo_order[%d] = %s\n", i, subprog_name(env, env->subprog_topo_order[i])); out: kvfree(dfs_stack); kvfree(color); return ret; } static int mark_stack_slot_obj_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi, int nr_slots) { int i; for (i = 0; i < nr_slots; i++) mark_stack_slot_scratched(env, spi - i); return 0; } static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { int spi; /* For CONST_PTR_TO_DYNPTR, it must have already been done by * check_reg_arg in check_helper_call and mark_btf_func_reg_size in * check_kfunc_call. */ if (reg->type == CONST_PTR_TO_DYNPTR) return 0; spi = dynptr_get_spi(env, reg); if (spi < 0) return spi; /* Caller ensures dynptr is valid and initialized, which means spi is in * bounds and spi is the first dynptr slot. Simply mark stack slot as * read. */ return mark_stack_slot_obj_read(env, reg, spi, BPF_DYNPTR_NR_SLOTS); } static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi, int nr_slots) { return mark_stack_slot_obj_read(env, reg, spi, nr_slots); } static int mark_irq_flag_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { int spi; spi = irq_flag_get_spi(env, reg); if (spi < 0) return spi; return mark_stack_slot_obj_read(env, reg, spi, 1); } /* This function is supposed to be used by the following 32-bit optimization * code only. It returns TRUE if the source or destination register operates * on 64-bit, otherwise return FALSE. */ bool bpf_is_reg64(struct bpf_insn *insn, u32 regno, struct bpf_reg_state *reg, enum bpf_reg_arg_type t) { u8 code, class, op; code = insn->code; class = BPF_CLASS(code); op = BPF_OP(code); if (class == BPF_JMP) { /* BPF_EXIT for "main" will reach here. Return TRUE * conservatively. 
*/ if (op == BPF_EXIT) return true; if (op == BPF_CALL) { /* BPF to BPF call will reach here because of marking * caller saved clobber with DST_OP_NO_MARK for which we * don't care the register def because they are anyway * marked as NOT_INIT already. */ if (insn->src_reg == BPF_PSEUDO_CALL) return false; /* Helper call will reach here because of arg type * check, conservatively return TRUE. */ if (t == SRC_OP) return true; return false; } } if (class == BPF_ALU64 && op == BPF_END && (insn->imm == 16 || insn->imm == 32)) return false; if (class == BPF_ALU64 || class == BPF_JMP || (class == BPF_ALU && op == BPF_END && insn->imm == 64)) return true; if (class == BPF_ALU || class == BPF_JMP32) return false; if (class == BPF_LDX) { if (t != SRC_OP) return BPF_SIZE(code) == BPF_DW || BPF_MODE(code) == BPF_MEMSX; /* LDX source must be ptr. */ return true; } if (class == BPF_STX) { /* BPF_STX (including atomic variants) has one or more source * operands, one of which is a ptr. Check whether the caller is * asking about it. */ if (t == SRC_OP && reg->type != SCALAR_VALUE) return true; return BPF_SIZE(code) == BPF_DW; } if (class == BPF_LD) { u8 mode = BPF_MODE(code); /* LD_IMM64 */ if (mode == BPF_IMM) return true; /* Both LD_IND and LD_ABS return 32-bit data. */ if (t != SRC_OP) return false; /* Implicit ctx ptr. */ if (regno == BPF_REG_6) return true; /* Explicit source could be any width. */ return true; } if (class == BPF_ST) /* The only source register for BPF_ST is a ptr. */ return true; /* Conservatively return true at default. */ return true; } static void mark_insn_zext(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { s32 def_idx = reg->subreg_def; if (def_idx == DEF_NOT_SUBREG) return; env->insn_aux_data[def_idx - 1].zext_dst = true; /* The dst will be zero extended, so won't be sub-register anymore. */ reg->subreg_def = DEF_NOT_SUBREG; } static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno, enum bpf_reg_arg_type t) { struct bpf_insn *insn = env->prog->insnsi + env->insn_idx; struct bpf_reg_state *reg; bool rw64; mark_reg_scratched(env, regno); reg = ®s[regno]; rw64 = bpf_is_reg64(insn, regno, reg, t); if (t == SRC_OP) { /* check whether register used as source operand can be read */ if (reg->type == NOT_INIT) { verbose(env, "R%d !read_ok\n", regno); return -EACCES; } /* We don't need to worry about FP liveness because it's read-only */ if (regno == BPF_REG_FP) return 0; if (rw64) mark_insn_zext(env, reg); return 0; } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { verbose(env, "frame pointer is read only\n"); return -EACCES; } reg->subreg_def = rw64 ? 
DEF_NOT_SUBREG : env->insn_idx + 1; if (t == DST_OP) mark_reg_unknown(env, regs, regno); } return 0; } static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_reg_arg_type t) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; return __check_reg_arg(env, state->regs, regno, t); } static int insn_stack_access_flags(int frameno, int spi) { return INSN_F_STACK_ACCESS | (spi << INSN_F_SPI_SHIFT) | frameno; } static void mark_indirect_target(struct bpf_verifier_env *env, int idx) { env->insn_aux_data[idx].indirect_target = true; } #define LR_FRAMENO_BITS 3 #define LR_SPI_BITS 6 #define LR_ENTRY_BITS (LR_SPI_BITS + LR_FRAMENO_BITS + 1) #define LR_SIZE_BITS 4 #define LR_FRAMENO_MASK ((1ull << LR_FRAMENO_BITS) - 1) #define LR_SPI_MASK ((1ull << LR_SPI_BITS) - 1) #define LR_SIZE_MASK ((1ull << LR_SIZE_BITS) - 1) #define LR_SPI_OFF LR_FRAMENO_BITS #define LR_IS_REG_OFF (LR_SPI_BITS + LR_FRAMENO_BITS) #define LINKED_REGS_MAX 6 struct linked_reg { u8 frameno; union { u8 spi; u8 regno; }; bool is_reg; }; struct linked_regs { int cnt; struct linked_reg entries[LINKED_REGS_MAX]; }; static struct linked_reg *linked_regs_push(struct linked_regs *s) { if (s->cnt < LINKED_REGS_MAX) return &s->entries[s->cnt++]; return NULL; } /* Use u64 as a vector of 6 10-bit values, use first 4-bits to track * number of elements currently in stack. * Pack one history entry for linked registers as 10 bits in the following format: * - 3-bits frameno * - 6-bits spi_or_reg * - 1-bit is_reg */ static u64 linked_regs_pack(struct linked_regs *s) { u64 val = 0; int i; for (i = 0; i < s->cnt; ++i) { struct linked_reg *e = &s->entries[i]; u64 tmp = 0; tmp |= e->frameno; tmp |= e->spi << LR_SPI_OFF; tmp |= (e->is_reg ? 1 : 0) << LR_IS_REG_OFF; val <<= LR_ENTRY_BITS; val |= tmp; } val <<= LR_SIZE_BITS; val |= s->cnt; return val; } static void linked_regs_unpack(u64 val, struct linked_regs *s) { int i; s->cnt = val & LR_SIZE_MASK; val >>= LR_SIZE_BITS; for (i = 0; i < s->cnt; ++i) { struct linked_reg *e = &s->entries[i]; e->frameno = val & LR_FRAMENO_MASK; e->spi = (val >> LR_SPI_OFF) & LR_SPI_MASK; e->is_reg = (val >> LR_IS_REG_OFF) & 0x1; val >>= LR_ENTRY_BITS; } } static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) { const struct btf_type *func; struct btf *desc_btf; if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL) return NULL; desc_btf = find_kfunc_desc_btf(data, insn->off); if (IS_ERR(desc_btf)) return "<error>"; func = btf_type_by_id(desc_btf, insn->imm); return btf_name_by_offset(desc_btf, func->name_off); } void bpf_verbose_insn(struct bpf_verifier_env *env, struct bpf_insn *insn) { const struct bpf_insn_cbs cbs = { .cb_call = disasm_kfunc_name, .cb_print = verbose, .private_data = env, }; print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); } /* If any register R in hist->linked_regs is marked as precise in bt, * do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs. 
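 * E.g., if r6 and r7 were linked by an "r7 = r6" assignment recorded in
 * the jump history and backtracking later marks only r6 as precise, this
 * propagates the precision mark to r7 (and to any linked stack slots) as
 * well.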
*/ void bpf_bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist) { struct linked_regs linked_regs; bool some_precise = false; int i; if (!hist || hist->linked_regs == 0) return; linked_regs_unpack(hist->linked_regs, &linked_regs); for (i = 0; i < linked_regs.cnt; ++i) { struct linked_reg *e = &linked_regs.entries[i]; if ((e->is_reg && bt_is_frame_reg_set(bt, e->frameno, e->regno)) || (!e->is_reg && bt_is_frame_slot_set(bt, e->frameno, e->spi))) { some_precise = true; break; } } if (!some_precise) return; for (i = 0; i < linked_regs.cnt; ++i) { struct linked_reg *e = &linked_regs.entries[i]; if (e->is_reg) bpf_bt_set_frame_reg(bt, e->frameno, e->regno); else bpf_bt_set_frame_slot(bt, e->frameno, e->spi); } } int mark_chain_precision(struct bpf_verifier_env *env, int regno) { return bpf_mark_chain_precision(env, env->cur_state, regno, NULL); } /* mark_chain_precision_batch() assumes that env->bt is set in the caller to * desired reg and stack masks across all relevant frames */ static int mark_chain_precision_batch(struct bpf_verifier_env *env, struct bpf_verifier_state *starting_state) { return bpf_mark_chain_precision(env, starting_state, -1, NULL); } static bool is_spillable_regtype(enum bpf_reg_type type) { switch (base_type(type)) { case PTR_TO_MAP_VALUE: case PTR_TO_STACK: case PTR_TO_CTX: case PTR_TO_PACKET: case PTR_TO_PACKET_META: case PTR_TO_PACKET_END: case PTR_TO_FLOW_KEYS: case CONST_PTR_TO_MAP: case PTR_TO_SOCKET: case PTR_TO_SOCK_COMMON: case PTR_TO_TCP_SOCK: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: case PTR_TO_BUF: case PTR_TO_MEM: case PTR_TO_FUNC: case PTR_TO_MAP_KEY: case PTR_TO_ARENA: return true; default: return false; } } /* check if register is a constant scalar value */ static bool is_reg_const(struct bpf_reg_state *reg, bool subreg32) { return reg->type == SCALAR_VALUE && tnum_is_const(subreg32 ? tnum_subreg(reg->var_off) : reg->var_off); } /* assuming is_reg_const() is true, return constant value of a register */ static u64 reg_const_value(struct bpf_reg_state *reg, bool subreg32) { return subreg32 ? tnum_subreg(reg->var_off).value : reg->var_off.value; } static bool __is_pointer_value(bool allow_ptr_leaks, const struct bpf_reg_state *reg) { if (allow_ptr_leaks) return false; return reg->type != SCALAR_VALUE; } static void clear_scalar_id(struct bpf_reg_state *reg) { reg->id = 0; reg->delta = 0; } static void assign_scalar_id_before_mov(struct bpf_verifier_env *env, struct bpf_reg_state *src_reg) { if (src_reg->type != SCALAR_VALUE) return; /* * The verifier is processing rX = rY insn and * rY->id has special linked register already. * Cleared it, since multiple rX += const are not supported. */ if (src_reg->id & BPF_ADD_CONST) clear_scalar_id(src_reg); /* * Ensure that src_reg has a valid ID that will be copied to * dst_reg and then will be used by sync_linked_regs() to * propagate min/max range. 
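 * E.g., after "r2 = r1" both registers share one ID, so when a later
 * conditional jump narrows the range of r1, sync_linked_regs() can
 * narrow the range of r2 as well.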
*/ if (!src_reg->id && !tnum_is_const(src_reg->var_off)) src_reg->id = ++env->id_gen; } /* Copy src state preserving dst->parent and dst->live fields */ static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src) { *dst = *src; } static void save_register_state(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi, struct bpf_reg_state *reg, int size) { int i; copy_register_state(&state->stack[spi].spilled_ptr, reg); for (i = BPF_REG_SIZE; i > BPF_REG_SIZE - size; i--) state->stack[spi].slot_type[i - 1] = STACK_SPILL; /* size < 8 bytes spill */ for (; i; i--) mark_stack_slot_misc(env, &state->stack[spi].slot_type[i - 1]); } static bool is_bpf_st_mem(struct bpf_insn *insn) { return BPF_CLASS(insn->code) == BPF_ST && BPF_MODE(insn->code) == BPF_MEM; } static int get_reg_width(struct bpf_reg_state *reg) { return fls64(reg->umax_value); } /* See comment for mark_fastcall_pattern_for_call() */ static void check_fastcall_stack_contract(struct bpf_verifier_env *env, struct bpf_func_state *state, int insn_idx, int off) { struct bpf_subprog_info *subprog = &env->subprog_info[state->subprogno]; struct bpf_insn_aux_data *aux = env->insn_aux_data; int i; if (subprog->fastcall_stack_off <= off || aux[insn_idx].fastcall_pattern) return; /* access to the region [max_stack_depth .. fastcall_stack_off) * from something that is not a part of the fastcall pattern, * disable fastcall rewrites for current subprogram by setting * fastcall_stack_off to a value smaller than any possible offset. */ subprog->fastcall_stack_off = S16_MIN; /* reset fastcall aux flags within subprogram, * happens at most once per subprogram */ for (i = subprog->start; i < (subprog + 1)->start; ++i) { aux[i].fastcall_spills_num = 0; aux[i].fastcall_pattern = 0; } } static void scrub_special_slot(struct bpf_func_state *state, int spi) { int i; /* regular write of data into stack destroys any spilled ptr */ state->stack[spi].spilled_ptr.type = NOT_INIT; /* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. 
*/ if (is_stack_slot_special(&state->stack[spi])) for (i = 0; i < BPF_REG_SIZE; i++) scrub_spilled_slot(&state->stack[spi].slot_type[i]); } /* check_stack_{read,write}_fixed_off functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, /* stack frame we're writing to */ struct bpf_func_state *state, int off, int size, int value_regno, int insn_idx) { struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; struct bpf_reg_state *reg = NULL; int insn_flags = insn_stack_access_flags(state->frameno, spi); /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, * so it's aligned access and [off, off + size) are within stack limits */ if (!env->allow_ptr_leaks && bpf_is_spilled_reg(&state->stack[spi]) && !bpf_is_spilled_scalar_reg(&state->stack[spi]) && size != BPF_REG_SIZE) { verbose(env, "attempt to corrupt spilled pointer on stack\n"); return -EACCES; } cur = env->cur_state->frame[env->cur_state->curframe]; if (value_regno >= 0) reg = &cur->regs[value_regno]; if (!env->bypass_spec_v4) { bool sanitize = reg && is_spillable_regtype(reg->type); for (i = 0; i < size; i++) { u8 type = state->stack[spi].slot_type[i]; if (type != STACK_MISC && type != STACK_ZERO) { sanitize = true; break; } } if (sanitize) env->insn_aux_data[insn_idx].nospec_result = true; } err = destroy_if_dynptr_stack_slot(env, state, spi); if (err) return err; check_fastcall_stack_contract(env, state, insn_idx, off); mark_stack_slot_scratched(env, spi); if (reg && !(off % BPF_REG_SIZE) && reg->type == SCALAR_VALUE && env->bpf_capable) { bool reg_value_fits; reg_value_fits = get_reg_width(reg) <= BITS_PER_BYTE * size; /* Make sure that reg had an ID to build a relation on spill. */ if (reg_value_fits) assign_scalar_id_before_mov(env, reg); save_register_state(env, state, spi, reg, size); /* Break the relation on a narrowing spill. */ if (!reg_value_fits) state->stack[spi].spilled_ptr.id = 0; } else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) && env->bpf_capable) { struct bpf_reg_state *tmp_reg = &env->fake_reg[0]; memset(tmp_reg, 0, sizeof(*tmp_reg)); __mark_reg_known(tmp_reg, insn->imm); tmp_reg->type = SCALAR_VALUE; save_register_state(env, state, spi, tmp_reg, size); } else if (reg && is_spillable_regtype(reg->type)) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { verbose_linfo(env, insn_idx, "; "); verbose(env, "invalid size of register spill\n"); return -EACCES; } if (state != cur && reg->type == PTR_TO_STACK) { verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); return -EINVAL; } save_register_state(env, state, spi, reg, size); } else { u8 type = STACK_MISC; scrub_special_slot(state, spi); /* when we zero initialize stack slots mark them as such */ if ((reg && bpf_register_is_null(reg)) || (!reg && is_bpf_st_mem(insn) && insn->imm == 0)) { /* STACK_ZERO case happened because register spill * wasn't properly aligned at the stack slot boundary, * so it's not a register spill anymore; force * originating register to be precise to make * STACK_ZERO correct for subsequent states */ err = mark_chain_precision(env, value_regno); if (err) return err; type = STACK_ZERO; } /* Mark slots affected by this stack write. 
*/ for (i = 0; i < size; i++) state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = type; insn_flags = 0; /* not a register spill */ } if (insn_flags) return bpf_push_jmp_history(env, env->cur_state, insn_flags, 0); return 0; } /* Write the stack: 'stack[ptr_regno + off] = value_regno'. 'ptr_regno' is * known to contain a variable offset. * This function checks whether the write is permitted and conservatively * tracks the effects of the write, considering that each stack slot in the * dynamic range is potentially written to. * * 'value_regno' can be -1, meaning that an unknown value is being written to * the stack. * * Spilled pointers in range are not marked as written because we don't know * what's going to be actually written. This means that read propagation for * future reads cannot be terminated by this write. * * For privileged programs, uninitialized stack slots are considered * initialized by this write (even though we don't know exactly what offsets * are going to be written to). The idea is that we don't want the verifier to * reject future reads that access slots written to through variable offsets. */ static int check_stack_write_var_off(struct bpf_verifier_env *env, /* func where register points to */ struct bpf_func_state *state, int ptr_regno, int off, int size, int value_regno, int insn_idx) { struct bpf_func_state *cur; /* state of the current function */ int min_off, max_off; int i, err; struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL; struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; bool writing_zero = false; /* set if the fact that we're writing a zero is used to let any * stack slots remain STACK_ZERO */ bool zero_used = false; cur = env->cur_state->frame[env->cur_state->curframe]; ptr_reg = &cur->regs[ptr_regno]; min_off = ptr_reg->smin_value + off; max_off = ptr_reg->smax_value + off + size; if (value_regno >= 0) value_reg = &cur->regs[value_regno]; if ((value_reg && bpf_register_is_null(value_reg)) || (!value_reg && is_bpf_st_mem(insn) && insn->imm == 0)) writing_zero = true; for (i = min_off; i < max_off; i++) { int spi; spi = bpf_get_spi(i); err = destroy_if_dynptr_stack_slot(env, state, spi); if (err) return err; } check_fastcall_stack_contract(env, state, insn_idx, min_off); /* Variable offset writes destroy any spilled pointers in range. */ for (i = min_off; i < max_off; i++) { u8 new_type, *stype; int slot, spi; slot = -i - 1; spi = slot / BPF_REG_SIZE; stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; mark_stack_slot_scratched(env, spi); if (!env->allow_ptr_leaks && *stype != STACK_MISC && *stype != STACK_ZERO) { /* Reject the write if range we may write to has not * been initialized beforehand. If we didn't reject * here, the ptr status would be erased below (even * though not all slots are actually overwritten), * possibly opening the door to leaks. * * We do however catch STACK_INVALID case below, and * only allow reading possibly uninitialized memory * later for CAP_PERFMON, as the write may not happen to * that slot. */ verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d", insn_idx, i); return -EINVAL; } /* If writing_zero and the spi slot contains a spill of value 0, * maintain the spill type. 
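	 * E.g., a slot holding a spilled known-zero scalar (say, from
	 * "r0 = 0; *(u64 *)(r10 - 8) = r0") stays a spill instead of being
	 * degraded to STACK_MISC, since a zero write could only have stored
	 * the same value anyway.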
	 */
		if (writing_zero && *stype == STACK_SPILL &&
		    bpf_is_spilled_scalar_reg(&state->stack[spi])) {
			struct bpf_reg_state *spill_reg = &state->stack[spi].spilled_ptr;

			if (tnum_is_const(spill_reg->var_off) && spill_reg->var_off.value == 0) {
				zero_used = true;
				continue;
			}
		}
		/*
		 * Scrub slots if variable-offset stack write goes over spilled pointers.
		 * Otherwise bpf_is_spilled_reg() may == true && spilled_ptr.type == NOT_INIT
		 * and valid program is rejected by check_stack_read_fixed_off()
		 * with obscure "invalid size of register fill" message.
		 */
		scrub_special_slot(state, spi);
		/* Update the slot type. */
		new_type = STACK_MISC;
		if (writing_zero && *stype == STACK_ZERO) {
			new_type = STACK_ZERO;
			zero_used = true;
		}
		/* If the slot is STACK_INVALID, we check whether it's OK to
		 * pretend that it will be initialized by this write. The slot
		 * might not actually be written to, and so if we mark it as
		 * initialized, future reads might leak uninitialized memory.
		 * For privileged programs, we will accept such reads to slots
		 * that may or may not be written, because if we rejected them,
		 * the error would be too confusing.
		 * Conservatively, treat STACK_POISON in a similar way.
		 */
		if ((*stype == STACK_INVALID || *stype == STACK_POISON) &&
		    !env->allow_uninit_stack) {
			verbose(env, "uninit stack in range of var-offset write prohibited for !root; insn %d, off: %d",
				insn_idx, i);
			return -EINVAL;
		}
		*stype = new_type;
	}
	if (zero_used) {
		/* backtracking doesn't work for STACK_ZERO yet. */
		err = mark_chain_precision(env, value_regno);
		if (err)
			return err;
	}
	return 0;
}

/* When register 'dst_regno' is assigned some values from stack[min_off,
 * max_off), we set the register's type according to the types of the
 * respective stack slots. If all the stack values are known to be zeros, then
 * so is the destination reg. Otherwise, the register is considered to be
 * SCALAR. This function does not deal with register filling; the caller must
 * ensure that all spilled registers in the stack range have been marked as
 * read.
 */
static void mark_reg_stack_read(struct bpf_verifier_env *env,
				/* func where src register points to */
				struct bpf_func_state *ptr_state,
				int min_off, int max_off, int dst_regno)
{
	struct bpf_verifier_state *vstate = env->cur_state;
	struct bpf_func_state *state = vstate->frame[vstate->curframe];
	int i, slot, spi;
	u8 *stype;
	int zeros = 0;

	for (i = min_off; i < max_off; i++) {
		slot = -i - 1;
		spi = slot / BPF_REG_SIZE;
		mark_stack_slot_scratched(env, spi);
		stype = ptr_state->stack[spi].slot_type;
		if (stype[slot % BPF_REG_SIZE] != STACK_ZERO)
			break;
		zeros++;
	}
	if (zeros == max_off - min_off) {
		/* Any access_size read into register is zero extended,
		 * so the whole register == const_zero.
		 */
		__mark_reg_const_zero(env, &state->regs[dst_regno]);
	} else {
		/* have read misc data from the stack */
		mark_reg_unknown(env, state->regs, dst_regno);
	}
}

/* Read the stack at 'off' and put the results into the register indicated by
 * 'dst_regno'. It handles reg filling if the addressed stack slot is a
 * spilled reg.
 *
 * 'dst_regno' can be -1, meaning that the read value is not going to a
 * register.
 *
 * The access is assumed to be within the current stack bounds.
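 * E.g., an 8-byte read of a slot holding a fully spilled register
 * restores that register's state into 'dst_regno', while a narrower
 * read is only accepted for spilled scalars and produces a (possibly
 * narrowed) scalar value.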
*/ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, /* func where src register points to */ struct bpf_func_state *reg_state, int off, int size, int dst_regno) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; struct bpf_reg_state *reg; u8 *stype, type; int insn_flags = insn_stack_access_flags(reg_state->frameno, spi); stype = reg_state->stack[spi].slot_type; reg = ®_state->stack[spi].spilled_ptr; mark_stack_slot_scratched(env, spi); check_fastcall_stack_contract(env, state, env->insn_idx, off); if (bpf_is_spilled_reg(®_state->stack[spi])) { u8 spill_size = 1; for (i = BPF_REG_SIZE - 1; i > 0 && stype[i - 1] == STACK_SPILL; i--) spill_size++; if (size != BPF_REG_SIZE || spill_size != BPF_REG_SIZE) { if (reg->type != SCALAR_VALUE) { verbose_linfo(env, env->insn_idx, "; "); verbose(env, "invalid size of register fill\n"); return -EACCES; } if (dst_regno < 0) return 0; if (size <= spill_size && bpf_stack_narrow_access_ok(off, size, spill_size)) { /* The earlier check_reg_arg() has decided the * subreg_def for this insn. Save it first. */ s32 subreg_def = state->regs[dst_regno].subreg_def; if (env->bpf_capable && size == 4 && spill_size == 4 && get_reg_width(reg) <= 32) /* Ensure stack slot has an ID to build a relation * with the destination register on fill. */ assign_scalar_id_before_mov(env, reg); copy_register_state(&state->regs[dst_regno], reg); state->regs[dst_regno].subreg_def = subreg_def; /* Break the relation on a narrowing fill. * coerce_reg_to_size will adjust the boundaries. */ if (get_reg_width(reg) > size * BITS_PER_BYTE) clear_scalar_id(&state->regs[dst_regno]); } else { int spill_cnt = 0, zero_cnt = 0; for (i = 0; i < size; i++) { type = stype[(slot - i) % BPF_REG_SIZE]; if (type == STACK_SPILL) { spill_cnt++; continue; } if (type == STACK_MISC) continue; if (type == STACK_ZERO) { zero_cnt++; continue; } if (type == STACK_INVALID && env->allow_uninit_stack) continue; if (type == STACK_POISON) { verbose(env, "reading from stack off %d+%d size %d, slot poisoned by dead code elimination\n", off, i, size); } else { verbose(env, "invalid read from stack off %d+%d size %d\n", off, i, size); } return -EACCES; } if (spill_cnt == size && tnum_is_const(reg->var_off) && reg->var_off.value == 0) { __mark_reg_const_zero(env, &state->regs[dst_regno]); /* this IS register fill, so keep insn_flags */ } else if (zero_cnt == size) { /* similarly to mark_reg_stack_read(), preserve zeroes */ __mark_reg_const_zero(env, &state->regs[dst_regno]); insn_flags = 0; /* not restoring original register state */ } else { mark_reg_unknown(env, state->regs, dst_regno); insn_flags = 0; /* not restoring original register state */ } } } else if (dst_regno >= 0) { /* restore register state from stack */ if (env->bpf_capable) /* Ensure stack slot has an ID to build a relation * with the destination register on fill. */ assign_scalar_id_before_mov(env, reg); copy_register_state(&state->regs[dst_regno], reg); /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions */ } else if (__is_pointer_value(env->allow_ptr_leaks, reg)) { /* If dst_regno==-1, the caller is asking us whether * it is acceptable to use this value as a SCALAR_VALUE * (e.g. for XADD). * We must not allow unprivileged callers to do that * with spilled pointers. 
*/ verbose(env, "leaking pointer from stack off %d\n", off); return -EACCES; } } else { for (i = 0; i < size; i++) { type = stype[(slot - i) % BPF_REG_SIZE]; if (type == STACK_MISC) continue; if (type == STACK_ZERO) continue; if (type == STACK_INVALID && env->allow_uninit_stack) continue; if (type == STACK_POISON) { verbose(env, "reading from stack off %d+%d size %d, slot poisoned by dead code elimination\n", off, i, size); } else { verbose(env, "invalid read from stack off %d+%d size %d\n", off, i, size); } return -EACCES; } if (dst_regno >= 0) mark_reg_stack_read(env, reg_state, off, off + size, dst_regno); insn_flags = 0; /* we are not restoring spilled register */ } if (insn_flags) return bpf_push_jmp_history(env, env->cur_state, insn_flags, 0); return 0; } enum bpf_access_src { ACCESS_DIRECT = 1, /* the access is performed by an instruction */ ACCESS_HELPER = 2, /* the access is performed by a helper */ }; static int check_stack_range_initialized(struct bpf_verifier_env *env, int regno, int off, int access_size, bool zero_size_allowed, enum bpf_access_type type, struct bpf_call_arg_meta *meta); static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) { return cur_regs(env) + regno; } /* Read the stack at 'ptr_regno + off' and put the result into the register * 'dst_regno'. * 'off' includes the pointer register's fixed offset(i.e. 'ptr_regno.off'), * but not its variable offset. * 'size' is assumed to be <= reg size and the access is assumed to be aligned. * * As opposed to check_stack_read_fixed_off, this function doesn't deal with * filling registers (i.e. reads of spilled register cannot be detected when * the offset is not fixed). We conservatively mark 'dst_regno' as containing * SCALAR_VALUE. That's why we assert that the 'ptr_regno' has a variable * offset; for a fixed offset check_stack_read_fixed_off should be used * instead. */ static int check_stack_read_var_off(struct bpf_verifier_env *env, int ptr_regno, int off, int size, int dst_regno) { /* The state of the source register. */ struct bpf_reg_state *reg = reg_state(env, ptr_regno); struct bpf_func_state *ptr_state = bpf_func(env, reg); int err; int min_off, max_off; /* Note that we pass a NULL meta, so raw access will not be permitted. */ err = check_stack_range_initialized(env, ptr_regno, off, size, false, BPF_READ, NULL); if (err) return err; min_off = reg->smin_value + off; max_off = reg->smax_value + off; mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno); check_fastcall_stack_contract(env, ptr_state, env->insn_idx, min_off); return 0; } /* check_stack_read dispatches to check_stack_read_fixed_off or * check_stack_read_var_off. * * The caller must ensure that the offset falls within the allocated stack * bounds. * * 'dst_regno' is a register which will receive the value from the stack. It * can be -1, meaning that the read value is not going to a register. */ static int check_stack_read(struct bpf_verifier_env *env, int ptr_regno, int off, int size, int dst_regno) { struct bpf_reg_state *reg = reg_state(env, ptr_regno); struct bpf_func_state *state = bpf_func(env, reg); int err; /* Some accesses are only permitted with a static offset. */ bool var_off = !tnum_is_const(reg->var_off); /* The offset is required to be static when reads don't go to a * register, in order to not leak pointers (see * check_stack_read_fixed_off). 
*/ if (dst_regno < 0 && var_off) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, "variable offset stack pointer cannot be passed into helper function; var_off=%s off=%d size=%d\n", tn_buf, off, size); return -EACCES; } /* Variable offset is prohibited for unprivileged mode for simplicity * since it requires corresponding support in Spectre masking for stack * ALU. See also retrieve_ptr_limit(). The check in * check_stack_access_for_ptr_arithmetic() called by * adjust_ptr_min_max_vals() prevents users from creating stack pointers * with variable offsets, therefore no check is required here. Further, * just checking it here would be insufficient as speculative stack * writes could still lead to unsafe speculative behaviour. */ if (!var_off) { off += reg->var_off.value; err = check_stack_read_fixed_off(env, state, off, size, dst_regno); } else { /* Variable offset stack reads need more conservative handling * than fixed offset ones. Note that dst_regno >= 0 on this * branch. */ err = check_stack_read_var_off(env, ptr_regno, off, size, dst_regno); } return err; } /* check_stack_write dispatches to check_stack_write_fixed_off or * check_stack_write_var_off. * * 'ptr_regno' is the register used as a pointer into the stack. * 'value_regno' is the register whose value we're writing to the stack. It can * be -1, meaning that we're not writing from a register. * * The caller must ensure that the offset falls within the maximum stack size. */ static int check_stack_write(struct bpf_verifier_env *env, int ptr_regno, int off, int size, int value_regno, int insn_idx) { struct bpf_reg_state *reg = reg_state(env, ptr_regno); struct bpf_func_state *state = bpf_func(env, reg); int err; if (tnum_is_const(reg->var_off)) { off += reg->var_off.value; err = check_stack_write_fixed_off(env, state, off, size, value_regno, insn_idx); } else { /* Variable offset stack writes need more conservative handling * than fixed offset ones.
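 * Added note (not in the original comment): such writes are handled by
 * check_stack_write_var_off(), which marks the slots the write may touch as
 * STACK_MISC or STACK_ZERO rather than creating a precise spill.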
*/ err = check_stack_write_var_off(env, state, ptr_regno, off, size, value_regno, insn_idx); } return err; } static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, int off, int size, enum bpf_access_type type) { struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_map *map = reg->map_ptr; u32 cap = bpf_map_flags_to_cap(map); if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) { verbose(env, "write into map forbidden, value_size=%d off=%lld size=%d\n", map->value_size, reg->smin_value + off, size); return -EACCES; } if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) { verbose(env, "read from map forbidden, value_size=%d off=%lld size=%d\n", map->value_size, reg->smin_value + off, size); return -EACCES; } return 0; } /* check read/write into memory region (e.g., map value, ringbuf sample, etc) */ static int __check_mem_access(struct bpf_verifier_env *env, int regno, int off, int size, u32 mem_size, bool zero_size_allowed) { bool size_ok = size > 0 || (size == 0 && zero_size_allowed); struct bpf_reg_state *reg; if (off >= 0 && size_ok && (u64)off + size <= mem_size) return 0; reg = &cur_regs(env)[regno]; switch (reg->type) { case PTR_TO_MAP_KEY: verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n", mem_size, off, size); break; case PTR_TO_MAP_VALUE: verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", mem_size, off, size); break; case PTR_TO_PACKET: case PTR_TO_PACKET_META: case PTR_TO_PACKET_END: verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", off, size, regno, reg->id, off, mem_size); break; case PTR_TO_CTX: verbose(env, "invalid access to context, ctx_size=%d off=%d size=%d\n", mem_size, off, size); break; case PTR_TO_MEM: default: verbose(env, "invalid access to memory, mem_size=%u off=%d size=%d\n", mem_size, off, size); } return -EACCES; } /* check read/write into a memory region with possible variable offset */ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, int off, int size, u32 mem_size, bool zero_size_allowed) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; int err; /* We may have adjusted the register pointing to memory region, so we * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. * * The minimum value is only important with signed * comparisons where we can't assume the floor of a * value is 0. If we are using signed variables for our * index'es we need to make sure that whatever we use * will have a set floor within our range. */ if (reg->smin_value < 0 && (reg->smin_value == S64_MIN || (off + reg->smin_value != (s64)(s32)(off + reg->smin_value)) || reg->smin_value + off < 0)) { verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; } err = __check_mem_access(env, regno, reg->smin_value + off, size, mem_size, zero_size_allowed); if (err) { verbose(env, "R%d min value is outside of the allowed memory range\n", regno); return err; } /* If we haven't set a max value then we need to bail since we can't be * sure we won't do bad things. * If reg->umax_value + off could overflow, treat that as unbounded too. 
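 * For example (illustrative, not from the original comment): an index register that was
 * never bounds-checked still has umax_value == U64_MAX, so the check below reports
 * "unbounded memory access"; bounding it first (e.g. idx &= 0xff or if (idx > 100) goto out)
 * gives a finite umax_value that can be validated against mem_size.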
*/ if (reg->umax_value >= BPF_MAX_VAR_OFF) { verbose(env, "R%d unbounded memory access, make sure to bounds check any such access\n", regno); return -EACCES; } err = __check_mem_access(env, regno, reg->umax_value + off, size, mem_size, zero_size_allowed); if (err) { verbose(env, "R%d max value is outside of the allowed memory range\n", regno); return err; } return 0; } static int __check_ptr_off_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, bool fixed_off_ok) { /* Access to this pointer-typed register or passing it to a helper * is only allowed in its original, unmodified form. */ if (!tnum_is_const(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, "variable %s access var_off=%s disallowed\n", reg_type_str(env, reg->type), tn_buf); return -EACCES; } if (reg->smin_value < 0) { verbose(env, "negative offset %s ptr R%d off=%lld disallowed\n", reg_type_str(env, reg->type), regno, reg->var_off.value); return -EACCES; } if (!fixed_off_ok && reg->var_off.value != 0) { verbose(env, "dereference of modified %s ptr R%d off=%lld disallowed\n", reg_type_str(env, reg->type), regno, reg->var_off.value); return -EACCES; } return 0; } static int check_ptr_off_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno) { return __check_ptr_off_reg(env, reg, regno, false); } static int map_kptr_match_type(struct bpf_verifier_env *env, struct btf_field *kptr_field, struct bpf_reg_state *reg, u32 regno) { const char *targ_name = btf_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id); int perm_flags; const char *reg_name = ""; if (base_type(reg->type) != PTR_TO_BTF_ID) goto bad_type; if (btf_is_kernel(reg->btf)) { perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU; /* Only unreferenced case accepts untrusted pointers */ if (kptr_field->type == BPF_KPTR_UNREF) perm_flags |= PTR_UNTRUSTED; } else { perm_flags = PTR_MAYBE_NULL | MEM_ALLOC; if (kptr_field->type == BPF_KPTR_PERCPU) perm_flags |= MEM_PERCPU; } if (type_flag(reg->type) & ~perm_flags) goto bad_type; /* We need to verify reg->type and reg->btf, before accessing reg->btf */ reg_name = btf_type_name(reg->btf, reg->btf_id); /* For ref_ptr case, release function check should ensure we get one * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the * normal store of unreferenced kptr, we must ensure var_off is zero. * Since ref_ptr cannot be accessed directly by BPF insns, check for * reg->ref_obj_id is not needed here. */ if (__check_ptr_off_reg(env, reg, regno, true)) return -EACCES; /* A full type match is needed, as BTF can be vmlinux, module or prog BTF, and * we also need to take into account the reg->var_off. * * We want to support cases like: * * struct foo { * struct bar br; * struct baz bz; * }; * * struct foo *v; * v = func(); // PTR_TO_BTF_ID * val->foo = v; // reg->var_off is zero, btf and btf_id match type * val->bar = &v->br; // reg->var_off is still zero, but we need to retry with * // first member type of struct after comparison fails * val->baz = &v->bz; // reg->var_off is non-zero, so struct needs to be walked * // to match type * * In the kptr_ref case, check_func_arg_reg_off already ensures reg->var_off * is zero. We must also ensure that btf_struct_ids_match does not walk * the struct to match type against first member of struct, i.e. reject * second case from above. Hence, when type is BPF_KPTR_REF, we set * strict mode to true for type match. 
*/ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->var_off.value, kptr_field->kptr.btf, kptr_field->kptr.btf_id, kptr_field->type != BPF_KPTR_UNREF)) goto bad_type; return 0; bad_type: verbose(env, "invalid kptr access, R%d type=%s%s ", regno, reg_type_str(env, reg->type), reg_name); verbose(env, "expected=%s%s", reg_type_str(env, PTR_TO_BTF_ID), targ_name); if (kptr_field->type == BPF_KPTR_UNREF) verbose(env, " or %s%s\n", reg_type_str(env, PTR_TO_BTF_ID | PTR_UNTRUSTED), targ_name); else verbose(env, "\n"); return -EINVAL; } static bool in_sleepable(struct bpf_verifier_env *env) { return env->cur_state->in_sleepable; } /* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock() * can dereference RCU protected pointers and result is PTR_TRUSTED. */ static bool in_rcu_cs(struct bpf_verifier_env *env) { return env->cur_state->active_rcu_locks || env->cur_state->active_locks || !in_sleepable(env); } /* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */ BTF_SET_START(rcu_protected_types) #ifdef CONFIG_NET BTF_ID(struct, prog_test_ref_kfunc) #endif #ifdef CONFIG_CGROUPS BTF_ID(struct, cgroup) #endif #ifdef CONFIG_BPF_JIT BTF_ID(struct, bpf_cpumask) #endif BTF_ID(struct, task_struct) #ifdef CONFIG_CRYPTO BTF_ID(struct, bpf_crypto_ctx) #endif BTF_SET_END(rcu_protected_types) static bool rcu_protected_object(const struct btf *btf, u32 btf_id) { if (!btf_is_kernel(btf)) return true; return btf_id_set_contains(&rcu_protected_types, btf_id); } static struct btf_record *kptr_pointee_btf_record(struct btf_field *kptr_field) { struct btf_struct_meta *meta; if (btf_is_kernel(kptr_field->kptr.btf)) return NULL; meta = btf_find_struct_meta(kptr_field->kptr.btf, kptr_field->kptr.btf_id); return meta ? 
meta->record : NULL; } static bool rcu_safe_kptr(const struct btf_field *field) { const struct btf_field_kptr *kptr = &field->kptr; return field->type == BPF_KPTR_PERCPU || (field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id)); } static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr_field) { struct btf_record *rec; u32 ret; ret = PTR_MAYBE_NULL; if (rcu_safe_kptr(kptr_field) && in_rcu_cs(env)) { ret |= MEM_RCU; if (kptr_field->type == BPF_KPTR_PERCPU) ret |= MEM_PERCPU; else if (!btf_is_kernel(kptr_field->kptr.btf)) ret |= MEM_ALLOC; rec = kptr_pointee_btf_record(kptr_field); if (rec && btf_record_has_field(rec, BPF_GRAPH_NODE)) ret |= NON_OWN_REF; } else { ret |= PTR_UNTRUSTED; } return ret; } static int mark_uptr_ld_reg(struct bpf_verifier_env *env, u32 regno, struct btf_field *field) { struct bpf_reg_state *reg; const struct btf_type *t; t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id); mark_reg_known_zero(env, cur_regs(env), regno); reg = reg_state(env, regno); reg->type = PTR_TO_MEM | PTR_MAYBE_NULL; reg->mem_size = t->size; reg->id = ++env->id_gen; return 0; } static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, int value_regno, int insn_idx, struct btf_field *kptr_field) { struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; int class = BPF_CLASS(insn->code); struct bpf_reg_state *val_reg; int ret; /* Things we already checked for in check_map_access and caller: * - Reject cases where variable offset may touch kptr * - size of access (must be BPF_DW) * - tnum_is_const(reg->var_off) * - kptr_field->offset == off + reg->var_off.value */ /* Only BPF_[LDX,STX,ST] | BPF_MEM | BPF_DW is supported */ if (BPF_MODE(insn->code) != BPF_MEM) { verbose(env, "kptr in map can only be accessed using BPF_MEM instruction mode\n"); return -EACCES; } /* We only allow loading referenced kptr, since it will be marked as * untrusted, similar to unreferenced kptr. */ if (class != BPF_LDX && (kptr_field->type == BPF_KPTR_REF || kptr_field->type == BPF_KPTR_PERCPU)) { verbose(env, "store to referenced kptr disallowed\n"); return -EACCES; } if (class != BPF_LDX && kptr_field->type == BPF_UPTR) { verbose(env, "store to uptr disallowed\n"); return -EACCES; } if (class == BPF_LDX) { if (kptr_field->type == BPF_UPTR) return mark_uptr_ld_reg(env, value_regno, kptr_field); /* We can simply mark the value_regno receiving the pointer * value from map as PTR_TO_BTF_ID, with the correct type. */ ret = mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf, kptr_field->kptr.btf_id, btf_ld_kptr_type(env, kptr_field)); if (ret < 0) return ret; } else if (class == BPF_STX) { val_reg = reg_state(env, value_regno); if (!bpf_register_is_null(val_reg) && map_kptr_match_type(env, kptr_field, val_reg, value_regno)) return -EACCES; } else if (class == BPF_ST) { if (insn->imm) { verbose(env, "BPF_ST imm must be 0 when storing to kptr at off=%u\n", kptr_field->offset); return -EACCES; } } else { verbose(env, "kptr in map can only be accessed using BPF_LDX/BPF_STX/BPF_ST\n"); return -EACCES; } return 0; } /* * Return the size of the memory region accessible from a pointer to map value. * For INSN_ARRAY maps whole bpf_insn_array->ips array is accessible. 
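 * E.g. (illustrative): an INSN_ARRAY map with max_entries == 4 exposes 4 * sizeof(long)
 * bytes here, whereas other map types expose value_size bytes.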
*/ static u32 map_mem_size(const struct bpf_map *map) { if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) return map->max_entries * sizeof(long); return map->value_size; } /* check read/write into a map element with possible variable offset */ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed, enum bpf_access_src src) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; struct bpf_map *map = reg->map_ptr; u32 mem_size = map_mem_size(map); struct btf_record *rec; int err, i; err = check_mem_region_access(env, regno, off, size, mem_size, zero_size_allowed); if (err) return err; if (IS_ERR_OR_NULL(map->record)) return 0; rec = map->record; for (i = 0; i < rec->cnt; i++) { struct btf_field *field = &rec->fields[i]; u32 p = field->offset; /* If any part of a field can be touched by load/store, reject * this program. To check that [x1, x2) overlaps with [y1, y2), * it is sufficient to check x1 < y2 && y1 < x2. */ if (reg->smin_value + off < p + field->size && p < reg->umax_value + off + size) { switch (field->type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: if (src != ACCESS_DIRECT) { verbose(env, "%s cannot be accessed indirectly by helper\n", btf_field_type_name(field->type)); return -EACCES; } if (!tnum_is_const(reg->var_off)) { verbose(env, "%s access cannot have variable offset\n", btf_field_type_name(field->type)); return -EACCES; } if (p != off + reg->var_off.value) { verbose(env, "%s access misaligned expected=%u off=%llu\n", btf_field_type_name(field->type), p, off + reg->var_off.value); return -EACCES; } if (size != bpf_size_to_bytes(BPF_DW)) { verbose(env, "%s access size must be BPF_DW\n", btf_field_type_name(field->type)); return -EACCES; } break; default: verbose(env, "%s cannot be accessed directly by load/store\n", btf_field_type_name(field->type)); return -EACCES; } } } return 0; } static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, const struct bpf_call_arg_meta *meta, enum bpf_access_type t) { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); switch (prog_type) { /* Program types only with direct read access go here! */ case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_SEG6LOCAL: case BPF_PROG_TYPE_SK_REUSEPORT: case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_CGROUP_SKB: if (t == BPF_WRITE) return false; fallthrough; /* Program types with direct read + write access go here! */ case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: if (meta) return meta->pkt_access; env->seen_direct_write = true; return true; case BPF_PROG_TYPE_CGROUP_SOCKOPT: if (t == BPF_WRITE) env->seen_direct_write = true; return true; default: return false; } } static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) { struct bpf_reg_state *reg = reg_state(env, regno); int err; if (reg->range < 0) { verbose(env, "R%d offset is outside of the packet\n", regno); return -EINVAL; } err = check_mem_region_access(env, regno, off, size, reg->range, zero_size_allowed); if (err) return err; /* __check_mem_access has made sure "off + size - 1" is within u16. 
* reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff, * otherwise find_good_pkt_pointers would have refused to set range info * that __check_mem_access would have rejected this pkt access. * Therefore, "off + reg->umax_value + size - 1" won't overflow u32. */ env->prog->aux->max_pkt_offset = max_t(u32, env->prog->aux->max_pkt_offset, off + reg->umax_value + size - 1); return 0; } static bool is_var_ctx_off_allowed(struct bpf_prog *prog) { return resolve_prog_type(prog) == BPF_PROG_TYPE_SYSCALL; } /* check access to 'struct bpf_context' fields. Supports fixed offsets only */ static int __check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, struct bpf_insn_access_aux *info) { if (env->ops->is_valid_access && env->ops->is_valid_access(off, size, t, env->prog, info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower * access than actual ctx access size. A zero info.ctx_field_size * will only allow for whole field access and rejects any other * type of narrower access. */ if (base_type(info->reg_type) == PTR_TO_BTF_ID) { if (info->ref_obj_id && !find_reference_state(env->cur_state, info->ref_obj_id)) { verbose(env, "invalid bpf_context access off=%d. Reference may already be released\n", off); return -EACCES; } } else { env->insn_aux_data[insn_idx].ctx_field_size = info->ctx_field_size; } /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) env->prog->aux->max_ctx_offset = off + size; return 0; } verbose(env, "invalid bpf_context access off=%d size=%d\n", off, size); return -EACCES; } static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off, int access_size, enum bpf_access_type t, struct bpf_insn_access_aux *info) { /* * Program types that don't rewrite ctx accesses can safely * dereference ctx pointers with fixed offsets. 
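 * E.g. (illustrative, added note): BPF_PROG_TYPE_SYSCALL is the case where
 * is_var_ctx_off_allowed() returns true, so its ctx accesses are range-checked below as a
 * memory region instead of requiring a constant, unmodified ctx pointer.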
*/ bool var_off_ok = is_var_ctx_off_allowed(env->prog); bool fixed_off_ok = !env->ops->convert_ctx_access; struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; int err; if (var_off_ok) err = check_mem_region_access(env, regno, off, access_size, U16_MAX, false); else err = __check_ptr_off_reg(env, reg, regno, fixed_off_ok); if (err) return err; off += reg->umax_value; err = __check_ctx_access(env, insn_idx, off, access_size, t, info); if (err) verbose_linfo(env, insn_idx, "; "); return err; } static int check_flow_keys_access(struct bpf_verifier_env *env, int off, int size) { if (size < 0 || off < 0 || (u64)off + size > sizeof(struct bpf_flow_keys)) { verbose(env, "invalid access to flow keys off=%d size=%d\n", off, size); return -EACCES; } return 0; } static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off, int size, enum bpf_access_type t) { struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_insn_access_aux info = {}; bool valid; if (reg->smin_value < 0) { verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; } switch (reg->type) { case PTR_TO_SOCK_COMMON: valid = bpf_sock_common_is_valid_access(off, size, t, &info); break; case PTR_TO_SOCKET: valid = bpf_sock_is_valid_access(off, size, t, &info); break; case PTR_TO_TCP_SOCK: valid = bpf_tcp_sock_is_valid_access(off, size, t, &info); break; case PTR_TO_XDP_SOCK: valid = bpf_xdp_sock_is_valid_access(off, size, t, &info); break; default: valid = false; } if (valid) { env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; return 0; } verbose(env, "R%d invalid %s access off=%d size=%d\n", regno, reg_type_str(env, reg->type), off, size); return -EACCES; } static bool is_pointer_value(struct bpf_verifier_env *env, int regno) { return __is_pointer_value(env->allow_ptr_leaks, reg_state(env, regno)); } static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) { const struct bpf_reg_state *reg = reg_state(env, regno); return reg->type == PTR_TO_CTX; } static bool is_sk_reg(struct bpf_verifier_env *env, int regno) { const struct bpf_reg_state *reg = reg_state(env, regno); return type_is_sk_pointer(reg->type); } static bool is_pkt_reg(struct bpf_verifier_env *env, int regno) { const struct bpf_reg_state *reg = reg_state(env, regno); return type_is_pkt_pointer(reg->type); } static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno) { const struct bpf_reg_state *reg = reg_state(env, regno); /* Separate to is_ctx_reg() since we still want to allow BPF_ST here. */ return reg->type == PTR_TO_FLOW_KEYS; } static bool is_arena_reg(struct bpf_verifier_env *env, int regno) { const struct bpf_reg_state *reg = reg_state(env, regno); return reg->type == PTR_TO_ARENA; } /* Return false if @regno contains a pointer whose type isn't supported for * atomic instruction @insn. 
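 * E.g. (illustrative): lock *(u32 *)(r1 + 0) += r2 with r1 being the ctx pointer returns
 * false here, while the same op on a PTR_TO_ARENA pointer is allowed only when the JIT
 * supports that instruction (bpf_jit_supports_insn()).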
*/ static bool atomic_ptr_type_ok(struct bpf_verifier_env *env, int regno, struct bpf_insn *insn) { if (is_ctx_reg(env, regno)) return false; if (is_pkt_reg(env, regno)) return false; if (is_flow_key_reg(env, regno)) return false; if (is_sk_reg(env, regno)) return false; if (is_arena_reg(env, regno)) return bpf_jit_supports_insn(insn, true); return true; } static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { #ifdef CONFIG_NET [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP], #endif [CONST_PTR_TO_MAP] = btf_bpf_map_id, }; static bool is_trusted_reg(const struct bpf_reg_state *reg) { /* A referenced register is always trusted. */ if (reg->ref_obj_id) return true; /* Types listed in the reg2btf_ids are always trusted */ if (reg2btf_ids[base_type(reg->type)] && !bpf_type_has_unsafe_modifiers(reg->type)) return true; /* If a register is not referenced, it is trusted if it has the * MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. Some of the * other type modifiers may be safe, but we elect to take an opt-in * approach here as some (e.g. PTR_UNTRUSTED and PTR_MAYBE_NULL) are * not. * * Eventually, we should make PTR_TRUSTED the single source of truth * for whether a register is trusted. */ return type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS && !bpf_type_has_unsafe_modifiers(reg->type); } static bool is_rcu_reg(const struct bpf_reg_state *reg) { return reg->type & MEM_RCU; } static void clear_trusted_flags(enum bpf_type_flag *flag) { *flag &= ~(BPF_REG_TRUSTED_MODIFIERS | MEM_RCU); } static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int off, int size, bool strict) { struct tnum reg_off; int ip_align; /* Byte size accesses are always allowed. */ if (!strict || size == 1) return 0; /* For platforms that do not have a Kconfig enabling * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the value of * NET_IP_ALIGN is universally set to '2'. And on platforms * that do set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, we get * to this code only in strict mode where we want to emulate * the NET_IP_ALIGN==2 checking. Therefore use an * unconditional IP align value of '2'. */ ip_align = 2; reg_off = tnum_add(reg->var_off, tnum_const(ip_align + off)); if (!tnum_is_aligned(reg_off, size)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, "misaligned packet access off %d+%s+%d size %d\n", ip_align, tn_buf, off, size); return -EACCES; } return 0; } static int check_generic_ptr_alignment(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, const char *pointer_desc, int off, int size, bool strict) { struct tnum reg_off; /* Byte size accesses are always allowed. */ if (!strict || size == 1) return 0; reg_off = tnum_add(reg->var_off, tnum_const(off)); if (!tnum_is_aligned(reg_off, size)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, "misaligned %saccess off %s+%d size %d\n", pointer_desc, tn_buf, off, size); return -EACCES; } return 0; } static int check_ptr_alignment(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int off, int size, bool strict_alignment_once) { bool strict = env->strict_alignment || strict_alignment_once; const char *pointer_desc = ""; switch (reg->type) { case PTR_TO_PACKET: case PTR_TO_PACKET_META: /* Special case, because of NET_IP_ALIGN. Given metadata sits * right in front, treat it the very same way. 
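 * Illustration (added, not in the original comment): with the emulated NET_IP_ALIGN of 2,
 * a 4-byte access at pkt + 14 passes the strict check (2 + 14 is 4-byte aligned), while the
 * same access at pkt + 13 is reported as misaligned.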
*/ return check_pkt_ptr_alignment(env, reg, off, size, strict); case PTR_TO_FLOW_KEYS: pointer_desc = "flow keys "; break; case PTR_TO_MAP_KEY: pointer_desc = "key "; break; case PTR_TO_MAP_VALUE: pointer_desc = "value "; if (reg->map_ptr->map_type == BPF_MAP_TYPE_INSN_ARRAY) strict = true; break; case PTR_TO_CTX: pointer_desc = "context "; break; case PTR_TO_STACK: pointer_desc = "stack "; /* The stack spill tracking logic in check_stack_write_fixed_off() * and check_stack_read_fixed_off() relies on stack accesses being * aligned. */ strict = true; break; case PTR_TO_SOCKET: pointer_desc = "sock "; break; case PTR_TO_SOCK_COMMON: pointer_desc = "sock_common "; break; case PTR_TO_TCP_SOCK: pointer_desc = "tcp_sock "; break; case PTR_TO_XDP_SOCK: pointer_desc = "xdp_sock "; break; case PTR_TO_ARENA: return 0; default: break; } return check_generic_ptr_alignment(env, reg, pointer_desc, off, size, strict); } static enum priv_stack_mode bpf_enable_priv_stack(struct bpf_prog *prog) { if (!bpf_jit_supports_private_stack()) return NO_PRIV_STACK; /* bpf_prog_check_recur() checks all prog types that use bpf trampoline * while kprobe/tp/perf_event/raw_tp don't use trampoline hence checked * explicitly. */ switch (prog->type) { case BPF_PROG_TYPE_KPROBE: case BPF_PROG_TYPE_TRACEPOINT: case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_RAW_TRACEPOINT: return PRIV_STACK_ADAPTIVE; case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_STRUCT_OPS: if (prog->aux->priv_stack_requested || bpf_prog_check_recur(prog)) return PRIV_STACK_ADAPTIVE; fallthrough; default: break; } return NO_PRIV_STACK; } static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth) { if (env->prog->jit_requested) return round_up(stack_depth, 16); /* round up to 32-bytes, since this is granularity * of interpreter stack size */ return round_up(max_t(u32, stack_depth, 1), 32); } /* temporary state used for call frame depth calculation */ struct bpf_subprog_call_depth_info { int ret_insn; /* caller instruction where we return to. */ int caller; /* caller subprogram idx */ int frame; /* # of consecutive static call stack frames on top of stack */ }; /* starting from main bpf function walk all instructions of the function * and recursively walk all callees that given function can call. * Ignore jump and exit insns. */ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx, struct bpf_subprog_call_depth_info *dinfo, bool priv_stack_supported) { struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; int depth = 0, frame = 0, i, subprog_end, subprog_depth; bool tail_call_reachable = false; int total; int tmp; /* no caller idx */ dinfo[idx].caller = -1; i = subprog[idx].start; if (!priv_stack_supported) subprog[idx].priv_stack_mode = NO_PRIV_STACK; process_func: /* protect against potential stack overflow that might happen when * bpf2bpf calls get combined with tailcalls. Limit the caller's stack * depth for such case down to 256 so that the worst case scenario * would result in 8k stack size (32 which is tailcall limit * 256 = * 8k). 
* * To get the idea what might happen, see an example: * func1 -> sub rsp, 128 * subfunc1 -> sub rsp, 256 * tailcall1 -> add rsp, 256 * func2 -> sub rsp, 192 (total stack size = 128 + 192 = 320) * subfunc2 -> sub rsp, 64 * subfunc22 -> sub rsp, 128 * tailcall2 -> add rsp, 128 * func3 -> sub rsp, 32 (total stack size 128 + 192 + 64 + 32 = 416) * * tailcall will unwind the current stack frame but it will not get rid * of caller's stack as shown on the example above. */ if (idx && subprog[idx].has_tail_call && depth >= 256) { verbose(env, "tail_calls are not allowed when call stack of previous frames is %d bytes. Too large\n", depth); return -EACCES; } subprog_depth = round_up_stack_depth(env, subprog[idx].stack_depth); if (priv_stack_supported) { /* Request private stack support only if the subprog stack * depth is no less than BPF_PRIV_STACK_MIN_SIZE. This is to * avoid jit penalty if the stack usage is small. */ if (subprog[idx].priv_stack_mode == PRIV_STACK_UNKNOWN && subprog_depth >= BPF_PRIV_STACK_MIN_SIZE) subprog[idx].priv_stack_mode = PRIV_STACK_ADAPTIVE; } if (subprog[idx].priv_stack_mode == PRIV_STACK_ADAPTIVE) { if (subprog_depth > MAX_BPF_STACK) { verbose(env, "stack size of subprog %d is %d. Too large\n", idx, subprog_depth); return -EACCES; } } else { depth += subprog_depth; if (depth > MAX_BPF_STACK) { total = 0; for (tmp = idx; tmp >= 0; tmp = dinfo[tmp].caller) total++; verbose(env, "combined stack size of %d calls is %d. Too large\n", total, depth); return -EACCES; } } continue_func: subprog_end = subprog[idx + 1].start; for (; i < subprog_end; i++) { int next_insn, sidx; if (bpf_pseudo_kfunc_call(insn + i) && !insn[i].off) { bool err = false; if (!is_bpf_throw_kfunc(insn + i)) continue; for (tmp = idx; tmp >= 0 && !err; tmp = dinfo[tmp].caller) { if (subprog[tmp].is_cb) { err = true; break; } } if (!err) continue; verbose(env, "bpf_throw kfunc (insn %d) cannot be called from callback subprog %d\n", i, idx); return -EINVAL; } if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i)) continue; /* remember insn and function to return to */ /* find the callee */ next_insn = i + insn[i].imm + 1; sidx = bpf_find_subprog(env, next_insn); if (verifier_bug_if(sidx < 0, env, "callee not found at insn %d", next_insn)) return -EFAULT; if (subprog[sidx].is_async_cb) { if (subprog[sidx].has_tail_call) { verifier_bug(env, "subprog has tail_call and async cb"); return -EFAULT; } /* async callbacks don't increase bpf prog stack size unless called directly */ if (!bpf_pseudo_call(insn + i)) continue; if (subprog[sidx].is_exception_cb) { verbose(env, "insn %d cannot call exception cb directly", i); return -EINVAL; } } /* store caller info for after we return from callee */ dinfo[idx].frame = frame; dinfo[idx].ret_insn = i + 1; /* push caller idx into callee's dinfo */ dinfo[sidx].caller = idx; i = next_insn; idx = sidx; if (!priv_stack_supported) subprog[idx].priv_stack_mode = NO_PRIV_STACK; if (subprog[idx].has_tail_call) tail_call_reachable = true; frame = bpf_subprog_is_global(env, idx) ? 
0 : frame + 1; if (frame >= MAX_CALL_FRAMES) { verbose(env, "the call stack of %d frames is too deep !\n", frame); return -E2BIG; } goto process_func; } /* if tail call got detected across bpf2bpf calls then mark each of the * currently present subprog frames as tail call reachable subprogs; * this info will be utilized by JIT so that we will be preserving the * tail call counter throughout bpf2bpf calls combined with tailcalls */ if (tail_call_reachable) for (tmp = idx; tmp >= 0; tmp = dinfo[tmp].caller) { if (subprog[tmp].is_exception_cb) { verbose(env, "cannot tail call within exception cb\n"); return -EINVAL; } subprog[tmp].tail_call_reachable = true; } if (subprog[0].tail_call_reachable) env->prog->aux->tail_call_reachable = true; /* end of for() loop means the last insn of the 'subprog' * was reached. Doesn't matter whether it was JA or EXIT */ if (frame == 0 && dinfo[idx].caller < 0) return 0; if (subprog[idx].priv_stack_mode != PRIV_STACK_ADAPTIVE) depth -= round_up_stack_depth(env, subprog[idx].stack_depth); /* pop caller idx from callee */ idx = dinfo[idx].caller; /* retrieve caller state from its frame */ frame = dinfo[idx].frame; i = dinfo[idx].ret_insn; goto continue_func; } static int check_max_stack_depth(struct bpf_verifier_env *env) { enum priv_stack_mode priv_stack_mode = PRIV_STACK_UNKNOWN; struct bpf_subprog_call_depth_info *dinfo; struct bpf_subprog_info *si = env->subprog_info; bool priv_stack_supported; int ret; dinfo = kvcalloc(env->subprog_cnt, sizeof(*dinfo), GFP_KERNEL_ACCOUNT); if (!dinfo) return -ENOMEM; for (int i = 0; i < env->subprog_cnt; i++) { if (si[i].has_tail_call) { priv_stack_mode = NO_PRIV_STACK; break; } } if (priv_stack_mode == PRIV_STACK_UNKNOWN) priv_stack_mode = bpf_enable_priv_stack(env->prog); /* All async_cb subprogs use normal kernel stack. If a particular * subprog appears in both main prog and async_cb subtree, that * subprog will use normal kernel stack to avoid potential nesting. * The reverse subprog traversal ensures when main prog subtree is * checked, the subprogs appearing in async_cb subtrees are already * marked as using normal kernel stack, so stack size checking can * be done properly. 
*/ for (int i = env->subprog_cnt - 1; i >= 0; i--) { if (!i || si[i].is_async_cb) { priv_stack_supported = !i && priv_stack_mode == PRIV_STACK_ADAPTIVE; ret = check_max_stack_depth_subprog(env, i, dinfo, priv_stack_supported); if (ret < 0) { kvfree(dinfo); return ret; } } } for (int i = 0; i < env->subprog_cnt; i++) { if (si[i].priv_stack_mode == PRIV_STACK_ADAPTIVE) { env->prog->aux->jits_use_priv_stack = true; break; } } kvfree(dinfo); return 0; } static int __check_buffer_access(struct bpf_verifier_env *env, const char *buf_info, const struct bpf_reg_state *reg, int regno, int off, int size) { if (off < 0) { verbose(env, "R%d invalid %s buffer access: off=%d, size=%d\n", regno, buf_info, off, size); return -EACCES; } if (!tnum_is_const(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, "R%d invalid variable buffer offset: off=%d, var_off=%s\n", regno, off, tn_buf); return -EACCES; } return 0; } static int check_tp_buffer_access(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, int off, int size) { int err; err = __check_buffer_access(env, "tracepoint", reg, regno, off, size); if (err) return err; env->prog->aux->max_tp_access = max(reg->var_off.value + off + size, env->prog->aux->max_tp_access); return 0; } static int check_buffer_access(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, int off, int size, bool zero_size_allowed, u32 *max_access) { const char *buf_info = type_is_rdonly_mem(reg->type) ? "rdonly" : "rdwr"; int err; err = __check_buffer_access(env, buf_info, reg, regno, off, size); if (err) return err; *max_access = max(reg->var_off.value + off + size, *max_access); return 0; } /* BPF architecture zero extends alu32 ops into 64-bit registers */ static void zext_32_to_64(struct bpf_reg_state *reg) { reg->var_off = tnum_subreg(reg->var_off); __reg_assign_32_into_64(reg); } /* truncate register to smaller size (in bytes) * must be called with size < BPF_REG_SIZE */ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) { u64 mask; /* clear high bits in bit representation */ reg->var_off = tnum_cast(reg->var_off, size); /* fix arithmetic bounds */ mask = ((u64)1 << (size * 8)) - 1; if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) { reg->umin_value &= mask; reg->umax_value &= mask; } else { reg->umin_value = 0; reg->umax_value = mask; } reg->smin_value = reg->umin_value; reg->smax_value = reg->umax_value; /* If size is smaller than the 32-bit register, the 32-bit register * values are also truncated, so we push 64-bit bounds into * 32-bit bounds. Above were truncated < 32-bits already.
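 * Worked example (illustrative, added note): coercing a register with range [0x100, 0x1ff]
 * to 1 byte keeps only the low byte; the high bits of umin and umax agree, so the bounds
 * become [0x00, 0xff]. If the high bits differed (say [0xff, 0x100]), the bounds are
 * conservatively widened to the full [0, mask] range.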
*/ if (size < 4) __mark_reg32_unbounded(reg); reg_bounds_sync(reg); } static void set_sext64_default_val(struct bpf_reg_state *reg, int size) { if (size == 1) { reg->smin_value = reg->s32_min_value = S8_MIN; reg->smax_value = reg->s32_max_value = S8_MAX; } else if (size == 2) { reg->smin_value = reg->s32_min_value = S16_MIN; reg->smax_value = reg->s32_max_value = S16_MAX; } else { /* size == 4 */ reg->smin_value = reg->s32_min_value = S32_MIN; reg->smax_value = reg->s32_max_value = S32_MAX; } reg->umin_value = reg->u32_min_value = 0; reg->umax_value = U64_MAX; reg->u32_max_value = U32_MAX; reg->var_off = tnum_unknown; } static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size) { s64 init_s64_max, init_s64_min, s64_max, s64_min, u64_cval; u64 top_smax_value, top_smin_value; u64 num_bits = size * 8; if (tnum_is_const(reg->var_off)) { u64_cval = reg->var_off.value; if (size == 1) reg->var_off = tnum_const((s8)u64_cval); else if (size == 2) reg->var_off = tnum_const((s16)u64_cval); else /* size == 4 */ reg->var_off = tnum_const((s32)u64_cval); u64_cval = reg->var_off.value; reg->smax_value = reg->smin_value = u64_cval; reg->umax_value = reg->umin_value = u64_cval; reg->s32_max_value = reg->s32_min_value = u64_cval; reg->u32_max_value = reg->u32_min_value = u64_cval; return; } top_smax_value = ((u64)reg->smax_value >> num_bits) << num_bits; top_smin_value = ((u64)reg->smin_value >> num_bits) << num_bits; if (top_smax_value != top_smin_value) goto out; /* find the s64_min and s64_min after sign extension */ if (size == 1) { init_s64_max = (s8)reg->smax_value; init_s64_min = (s8)reg->smin_value; } else if (size == 2) { init_s64_max = (s16)reg->smax_value; init_s64_min = (s16)reg->smin_value; } else { init_s64_max = (s32)reg->smax_value; init_s64_min = (s32)reg->smin_value; } s64_max = max(init_s64_max, init_s64_min); s64_min = min(init_s64_max, init_s64_min); /* both of s64_max/s64_min positive or negative */ if ((s64_max >= 0) == (s64_min >= 0)) { reg->s32_min_value = reg->smin_value = s64_min; reg->s32_max_value = reg->smax_value = s64_max; reg->u32_min_value = reg->umin_value = s64_min; reg->u32_max_value = reg->umax_value = s64_max; reg->var_off = tnum_range(s64_min, s64_max); return; } out: set_sext64_default_val(reg, size); } static void set_sext32_default_val(struct bpf_reg_state *reg, int size) { if (size == 1) { reg->s32_min_value = S8_MIN; reg->s32_max_value = S8_MAX; } else { /* size == 2 */ reg->s32_min_value = S16_MIN; reg->s32_max_value = S16_MAX; } reg->u32_min_value = 0; reg->u32_max_value = U32_MAX; reg->var_off = tnum_subreg(tnum_unknown); } static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size) { s32 init_s32_max, init_s32_min, s32_max, s32_min, u32_val; u32 top_smax_value, top_smin_value; u32 num_bits = size * 8; if (tnum_is_const(reg->var_off)) { u32_val = reg->var_off.value; if (size == 1) reg->var_off = tnum_const((s8)u32_val); else reg->var_off = tnum_const((s16)u32_val); u32_val = reg->var_off.value; reg->s32_min_value = reg->s32_max_value = u32_val; reg->u32_min_value = reg->u32_max_value = u32_val; return; } top_smax_value = ((u32)reg->s32_max_value >> num_bits) << num_bits; top_smin_value = ((u32)reg->s32_min_value >> num_bits) << num_bits; if (top_smax_value != top_smin_value) goto out; /* find the s32_min and s32_min after sign extension */ if (size == 1) { init_s32_max = (s8)reg->s32_max_value; init_s32_min = (s8)reg->s32_min_value; } else { /* size == 2 */ init_s32_max = (s16)reg->s32_max_value; init_s32_min = 
(s16)reg->s32_min_value; } s32_max = max(init_s32_max, init_s32_min); s32_min = min(init_s32_max, init_s32_min); if ((s32_min >= 0) == (s32_max >= 0)) { reg->s32_min_value = s32_min; reg->s32_max_value = s32_max; reg->u32_min_value = (u32)s32_min; reg->u32_max_value = (u32)s32_max; reg->var_off = tnum_subreg(tnum_range(s32_min, s32_max)); return; } out: set_sext32_default_val(reg, size); } bool bpf_map_is_rdonly(const struct bpf_map *map) { /* A map is considered read-only if the following condition are true: * * 1) BPF program side cannot change any of the map content. The * BPF_F_RDONLY_PROG flag is throughout the lifetime of a map * and was set at map creation time. * 2) The map value(s) have been initialized from user space by a * loader and then "frozen", such that no new map update/delete * operations from syscall side are possible for the rest of * the map's lifetime from that point onwards. * 3) Any parallel/pending map update/delete operations from syscall * side have been completed. Only after that point, it's safe to * assume that map value(s) are immutable. */ return (map->map_flags & BPF_F_RDONLY_PROG) && READ_ONCE(map->frozen) && !bpf_map_write_active(map); } int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val, bool is_ldsx) { void *ptr; u64 addr; int err; err = map->ops->map_direct_value_addr(map, &addr, off); if (err) return err; ptr = (void *)(long)addr + off; switch (size) { case sizeof(u8): *val = is_ldsx ? (s64)*(s8 *)ptr : (u64)*(u8 *)ptr; break; case sizeof(u16): *val = is_ldsx ? (s64)*(s16 *)ptr : (u64)*(u16 *)ptr; break; case sizeof(u32): *val = is_ldsx ? (s64)*(s32 *)ptr : (u64)*(u32 *)ptr; break; case sizeof(u64): *val = *(u64 *)ptr; break; default: return -EINVAL; } return 0; } #define BTF_TYPE_SAFE_RCU(__type) __PASTE(__type, __safe_rcu) #define BTF_TYPE_SAFE_RCU_OR_NULL(__type) __PASTE(__type, __safe_rcu_or_null) #define BTF_TYPE_SAFE_TRUSTED(__type) __PASTE(__type, __safe_trusted) #define BTF_TYPE_SAFE_TRUSTED_OR_NULL(__type) __PASTE(__type, __safe_trusted_or_null) /* * Allow list few fields as RCU trusted or full trusted. * This logic doesn't allow mix tagging and will be removed once GCC supports * btf_type_tag. */ /* RCU trusted: these fields are trusted in RCU CS and never NULL */ BTF_TYPE_SAFE_RCU(struct task_struct) { const cpumask_t *cpus_ptr; struct css_set __rcu *cgroups; struct task_struct __rcu *real_parent; struct task_struct *group_leader; }; BTF_TYPE_SAFE_RCU(struct cgroup) { /* cgrp->kn is always accessible as documented in kernel/cgroup/cgroup.c */ struct kernfs_node *kn; }; BTF_TYPE_SAFE_RCU(struct css_set) { struct cgroup *dfl_cgrp; }; BTF_TYPE_SAFE_RCU(struct cgroup_subsys_state) { struct cgroup *cgroup; }; /* RCU trusted: these fields are trusted in RCU CS and can be NULL */ BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) { struct file __rcu *exe_file; #ifdef CONFIG_MEMCG struct task_struct __rcu *owner; #endif }; /* skb->sk, req->sk are not RCU protected, but we mark them as such * because bpf prog accessible sockets are SOCK_RCU_FREE. 
*/ BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff) { struct sock *sk; }; BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock) { struct sock *sk; }; /* full trusted: these fields are trusted even outside of RCU CS and never NULL */ BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta) { struct seq_file *seq; }; BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task) { struct bpf_iter_meta *meta; struct task_struct *task; }; BTF_TYPE_SAFE_TRUSTED(struct linux_binprm) { struct file *file; }; BTF_TYPE_SAFE_TRUSTED(struct file) { struct inode *f_inode; }; BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry) { struct inode *d_inode; }; BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket) { struct sock *sk; }; BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct) { struct mm_struct *vm_mm; struct file *vm_file; }; static bool type_is_rcu(struct bpf_verifier_env *env, struct bpf_reg_state *reg, const char *field_name, u32 btf_id) { BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup_subsys_state)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu"); } static bool type_is_rcu_or_null(struct bpf_verifier_env *env, struct bpf_reg_state *reg, const char *field_name, u32 btf_id) { BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu_or_null"); } static bool type_is_trusted(struct bpf_verifier_env *env, struct bpf_reg_state *reg, const char *field_name, u32 btf_id) { BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted"); } static bool type_is_trusted_or_null(struct bpf_verifier_env *env, struct bpf_reg_state *reg, const char *field_name, u32 btf_id) { BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted_or_null"); } static int check_ptr_to_btf_access(struct bpf_verifier_env *env, struct bpf_reg_state *regs, int regno, int off, int size, enum bpf_access_type atype, int value_regno) { struct bpf_reg_state *reg = regs + regno; const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id); const char *tname = btf_name_by_offset(reg->btf, t->name_off); const char *field_name = NULL; enum bpf_type_flag flag = 0; u32 btf_id = 0; int ret; if (!env->allow_ptr_leaks) { verbose(env, "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n", tname); return -EPERM; } if (!env->prog->gpl_compatible && btf_is_kernel(reg->btf)) { verbose(env, "Cannot access kernel 'struct %s' from non-GPL compatible program\n", tname); return -EINVAL; } if (!tnum_is_const(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, "R%d is ptr_%s invalid variable offset: off=%d, var_off=%s\n", regno, tname, off, tn_buf); return -EACCES; } off += reg->var_off.value; if (off < 0) { verbose(env, "R%d is ptr_%s invalid negative access: off=%d\n", regno, tname, off); return 
-EACCES; } if (reg->type & MEM_USER) { verbose(env, "R%d is ptr_%s access user memory: off=%d\n", regno, tname, off); return -EACCES; } if (reg->type & MEM_PERCPU) { verbose(env, "R%d is ptr_%s access percpu memory: off=%d\n", regno, tname, off); return -EACCES; } if (env->ops->btf_struct_access && !type_is_alloc(reg->type) && atype == BPF_WRITE) { if (!btf_is_kernel(reg->btf)) { verifier_bug(env, "reg->btf must be kernel btf"); return -EFAULT; } ret = env->ops->btf_struct_access(&env->log, reg, off, size); } else { /* Writes are permitted with default btf_struct_access for * program allocated objects (which always have ref_obj_id > 0), * but not for untrusted PTR_TO_BTF_ID | MEM_ALLOC. */ if (atype != BPF_READ && !type_is_ptr_alloc_obj(reg->type)) { verbose(env, "only read is supported\n"); return -EACCES; } if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) && !(reg->type & MEM_RCU) && !reg->ref_obj_id) { verifier_bug(env, "ref_obj_id for allocated object must be non-zero"); return -EFAULT; } ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag, &field_name); } if (ret < 0) return ret; if (ret != PTR_TO_BTF_ID) { /* just mark; */ } else if (type_flag(reg->type) & PTR_UNTRUSTED) { /* If this is an untrusted pointer, all pointers formed by walking it * also inherit the untrusted flag. */ flag = PTR_UNTRUSTED; } else if (is_trusted_reg(reg) || is_rcu_reg(reg)) { /* By default any pointer obtained from walking a trusted pointer is no * longer trusted, unless the field being accessed has explicitly been * marked as inheriting its parent's state of trust (either full or RCU). * For example: * 'cgroups' pointer is untrusted if task->cgroups dereference * happened in a sleepable program outside of bpf_rcu_read_lock() * section. In a non-sleepable program it's trusted while in RCU CS (aka MEM_RCU). * Note bpf_rcu_read_unlock() converts MEM_RCU pointers to PTR_UNTRUSTED. * * A regular RCU-protected pointer with __rcu tag can also be deemed * trusted if we are in an RCU CS. Such pointer can be NULL. */ if (type_is_trusted(env, reg, field_name, btf_id)) { flag |= PTR_TRUSTED; } else if (type_is_trusted_or_null(env, reg, field_name, btf_id)) { flag |= PTR_TRUSTED | PTR_MAYBE_NULL; } else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) { if (type_is_rcu(env, reg, field_name, btf_id)) { /* ignore __rcu tag and mark it MEM_RCU */ flag |= MEM_RCU; } else if (flag & MEM_RCU || type_is_rcu_or_null(env, reg, field_name, btf_id)) { /* __rcu tagged pointers can be NULL */ flag |= MEM_RCU | PTR_MAYBE_NULL; /* We always trust them */ if (type_is_rcu_or_null(env, reg, field_name, btf_id) && flag & PTR_UNTRUSTED) flag &= ~PTR_UNTRUSTED; } else if (flag & (MEM_PERCPU | MEM_USER)) { /* keep as-is */ } else { /* walking unknown pointers yields old deprecated PTR_TO_BTF_ID */ clear_trusted_flags(&flag); } } else { /* * If not in RCU CS or MEM_RCU pointer can be NULL then * aggressively mark as untrusted otherwise such * pointers will be plain PTR_TO_BTF_ID without flags * and will be allowed to be passed into helpers for * compat reasons. */ flag = PTR_UNTRUSTED; } } else { /* Old compat. 
Deprecated */ clear_trusted_flags(&flag); } if (atype == BPF_READ && value_regno >= 0) { ret = mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); if (ret < 0) return ret; } return 0; } static int check_ptr_to_map_access(struct bpf_verifier_env *env, struct bpf_reg_state *regs, int regno, int off, int size, enum bpf_access_type atype, int value_regno) { struct bpf_reg_state *reg = regs + regno; struct bpf_map *map = reg->map_ptr; struct bpf_reg_state map_reg; enum bpf_type_flag flag = 0; const struct btf_type *t; const char *tname; u32 btf_id; int ret; if (!btf_vmlinux) { verbose(env, "map_ptr access not supported without CONFIG_DEBUG_INFO_BTF\n"); return -ENOTSUPP; } if (!map->ops->map_btf_id || !*map->ops->map_btf_id) { verbose(env, "map_ptr access not supported for map type %d\n", map->map_type); return -ENOTSUPP; } t = btf_type_by_id(btf_vmlinux, *map->ops->map_btf_id); tname = btf_name_by_offset(btf_vmlinux, t->name_off); if (!env->allow_ptr_leaks) { verbose(env, "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n", tname); return -EPERM; } if (off < 0) { verbose(env, "R%d is %s invalid negative access: off=%d\n", regno, tname, off); return -EACCES; } if (atype != BPF_READ) { verbose(env, "only read from %s is supported\n", tname); return -EACCES; } /* Simulate access to a PTR_TO_BTF_ID */ memset(&map_reg, 0, sizeof(map_reg)); ret = mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0); if (ret < 0) return ret; ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag, NULL); if (ret < 0) return ret; if (value_regno >= 0) { ret = mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag); if (ret < 0) return ret; } return 0; } /* Check that the stack access at the given offset is within bounds. The * maximum valid offset is -1. * * The minimum valid offset is -MAX_BPF_STACK for writes, and * -state->allocated_stack for reads. */ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env, s64 off, struct bpf_func_state *state, enum bpf_access_type t) { int min_valid_off; if (t == BPF_WRITE || env->allow_uninit_stack) min_valid_off = -MAX_BPF_STACK; else min_valid_off = -state->allocated_stack; if (off < min_valid_off || off > -1) return -EACCES; return 0; } /* Check that the stack access at 'regno + off' falls within the maximum stack * bounds. * * 'off' includes `regno->offset`, but not its dynamic part (if any). */ static int check_stack_access_within_bounds( struct bpf_verifier_env *env, int regno, int off, int access_size, enum bpf_access_type type) { struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = bpf_func(env, reg); s64 min_off, max_off; int err; char *err_extra; if (type == BPF_READ) err_extra = " read from"; else err_extra = " write to"; if (tnum_is_const(reg->var_off)) { min_off = (s64)reg->var_off.value + off; max_off = min_off + access_size; } else { if (reg->smax_value >= BPF_MAX_VAR_OFF || reg->smin_value <= -BPF_MAX_VAR_OFF) { verbose(env, "invalid unbounded variable-offset%s stack R%d\n", err_extra, regno); return -EACCES; } min_off = reg->smin_value + off; max_off = reg->smax_value + off + access_size; } err = check_stack_slot_within_bounds(env, min_off, state, type); if (!err && max_off > 0) err = -EINVAL; /* out of stack access into non-negative offsets */ if (!err && access_size < 0) /* access_size should not be negative (or overflow an int); others checks * along the way should have prevented such an access. 
*/ err = -EFAULT; /* invalid negative access size; integer overflow? */ if (err) { if (tnum_is_const(reg->var_off)) { verbose(env, "invalid%s stack R%d off=%lld size=%d\n", err_extra, regno, min_off, access_size); } else { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, "invalid variable-offset%s stack R%d var_off=%s off=%d size=%d\n", err_extra, regno, tn_buf, off, access_size); } return err; } /* Note that there is no stack access with offset zero, so the needed stack * size is -min_off, not -min_off+1. */ return grow_stack_state(env, state, -min_off /* size */); } static bool get_func_retval_range(struct bpf_prog *prog, struct bpf_retval_range *range) { if (prog->type == BPF_PROG_TYPE_LSM && prog->expected_attach_type == BPF_LSM_MAC && !bpf_lsm_get_retval_range(prog, range)) { return true; } return false; } static void add_scalar_to_reg(struct bpf_reg_state *dst_reg, s64 val) { struct bpf_reg_state fake_reg; if (!val) return; fake_reg.type = SCALAR_VALUE; __mark_reg_known(&fake_reg, val); scalar32_min_max_add(dst_reg, &fake_reg); scalar_min_max_add(dst_reg, &fake_reg); dst_reg->var_off = tnum_add(dst_reg->var_off, fake_reg.var_off); reg_bounds_sync(dst_reg); } /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register whose value is stored into memory * if t==read, value_regno is a register which will receive the value from memory * if t==write && value_regno==-1, some unknown value is stored into memory * if t==read && value_regno==-1, don't care what we read from memory */ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off, int bpf_size, enum bpf_access_type t, int value_regno, bool strict_alignment_once, bool is_ldsx) { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; int size, err = 0; size = bpf_size_to_bytes(bpf_size); if (size < 0) return size; err = check_ptr_alignment(env, reg, off, size, strict_alignment_once); if (err) return err; if (reg->type == PTR_TO_MAP_KEY) { if (t == BPF_WRITE) { verbose(env, "write to change key R%d not allowed\n", regno); return -EACCES; } err = check_mem_region_access(env, regno, off, size, reg->map_ptr->key_size, false); if (err) return err; if (value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_MAP_VALUE) { struct btf_field *kptr_field = NULL; if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose(env, "R%d leaks addr into map\n", value_regno); return -EACCES; } err = check_map_access_type(env, regno, off, size, t); if (err) return err; err = check_map_access(env, regno, off, size, false, ACCESS_DIRECT); if (err) return err; if (tnum_is_const(reg->var_off)) kptr_field = btf_record_find(reg->map_ptr->record, off + reg->var_off.value, BPF_KPTR | BPF_UPTR); if (kptr_field) { err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field); } else if (t == BPF_READ && value_regno >= 0) { struct bpf_map *map = reg->map_ptr; /* * If map is read-only, track its contents as scalars, * unless it is an insn array (see the special case below) */ if (tnum_is_const(reg->var_off) && bpf_map_is_rdonly(map) && map->ops->map_direct_value_addr && map->map_type != BPF_MAP_TYPE_INSN_ARRAY) { int map_off = off + reg->var_off.value; u64 val = 0; err = bpf_map_direct_read(map, map_off, size, &val, is_ldsx); if (err) return err; regs[value_regno].type = SCALAR_VALUE; __mark_reg_known(&regs[value_regno], val); } else if
(map->map_type == BPF_MAP_TYPE_INSN_ARRAY) { if (bpf_size != BPF_DW) { verbose(env, "Invalid read of %d bytes from insn_array\n", size); return -EACCES; } copy_register_state(&regs[value_regno], reg); add_scalar_to_reg(&regs[value_regno], off); regs[value_regno].type = PTR_TO_INSN; } else { mark_reg_unknown(env, regs, value_regno); } } } else if (base_type(reg->type) == PTR_TO_MEM) { bool rdonly_mem = type_is_rdonly_mem(reg->type); bool rdonly_untrusted = rdonly_mem && (reg->type & PTR_UNTRUSTED); if (type_may_be_null(reg->type)) { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str(env, reg->type)); return -EACCES; } if (t == BPF_WRITE && rdonly_mem) { verbose(env, "R%d cannot write into %s\n", regno, reg_type_str(env, reg->type)); return -EACCES; } if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose(env, "R%d leaks addr into mem\n", value_regno); return -EACCES; } /* * Accesses to untrusted PTR_TO_MEM are done through probe * instructions, hence no need to check bounds in that case. */ if (!rdonly_untrusted) err = check_mem_region_access(env, regno, off, size, reg->mem_size, false); if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { struct bpf_insn_access_aux info = { .reg_type = SCALAR_VALUE, .is_ldsx = is_ldsx, .log = &env->log, }; struct bpf_retval_range range; if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose(env, "R%d leaks addr into ctx\n", value_regno); return -EACCES; } err = check_ctx_access(env, insn_idx, regno, off, size, t, &info); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a * PTR_TO_PACKET[_META,_END]. In the latter * case, we know the offset is zero. */ if (info.reg_type == SCALAR_VALUE) { if (info.is_retval && get_func_retval_range(env->prog, &range)) { err = __mark_reg_s32_range(env, regs, value_regno, range.minval, range.maxval); if (err) return err; } else { mark_reg_unknown(env, regs, value_regno); } } else { mark_reg_known_zero(env, regs, value_regno); if (type_may_be_null(info.reg_type)) regs[value_regno].id = ++env->id_gen; /* A load of ctx field could have different * actual load size with the one encoded in the * insn. When the dst is PTR, it is for sure not * a sub-register. */ regs[value_regno].subreg_def = DEF_NOT_SUBREG; if (base_type(info.reg_type) == PTR_TO_BTF_ID) { regs[value_regno].btf = info.btf; regs[value_regno].btf_id = info.btf_id; regs[value_regno].ref_obj_id = info.ref_obj_id; } } regs[value_regno].type = info.reg_type; } } else if (reg->type == PTR_TO_STACK) { /* Basic bounds checks.
*/ err = check_stack_access_within_bounds(env, regno, off, size, t); if (err) return err; if (t == BPF_READ) err = check_stack_read(env, regno, off, size, value_regno); else err = check_stack_write(env, regno, off, size, value_regno, insn_idx); } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose(env, "cannot write into packet\n"); return -EACCES; } if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose(env, "R%d leaks addr into packet\n", value_regno); return -EACCES; } err = check_packet_access(env, regno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_FLOW_KEYS) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose(env, "R%d leaks addr into flow keys\n", value_regno); return -EACCES; } err = check_flow_keys_access(env, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (type_is_sk_pointer(reg->type)) { if (t == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", regno, reg_type_str(env, reg->type)); return -EACCES; } err = check_sock_access(env, insn_idx, regno, off, size, t); if (!err && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_TP_BUFFER) { err = check_tp_buffer_access(env, reg, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (base_type(reg->type) == PTR_TO_BTF_ID && !type_may_be_null(reg->type)) { err = check_ptr_to_btf_access(env, regs, regno, off, size, t, value_regno); } else if (reg->type == CONST_PTR_TO_MAP) { err = check_ptr_to_map_access(env, regs, regno, off, size, t, value_regno); } else if (base_type(reg->type) == PTR_TO_BUF && !type_may_be_null(reg->type)) { bool rdonly_mem = type_is_rdonly_mem(reg->type); u32 *max_access; if (rdonly_mem) { if (t == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", regno, reg_type_str(env, reg->type)); return -EACCES; } max_access = &env->prog->aux->max_rdonly_access; } else { max_access = &env->prog->aux->max_rdwr_access; } err = check_buffer_access(env, reg, regno, off, size, false, max_access); if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ)) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_ARENA) { if (t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str(env, reg->type)); return -EACCES; } if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ && regs[value_regno].type == SCALAR_VALUE) { if (!is_ldsx) /* b/h/w load zero-extends, mark upper bits as known 0 */ coerce_reg_to_size(&regs[value_regno], size); else coerce_reg_to_size_sx(&regs[value_regno], size); } return err; } static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, bool allow_trust_mismatch); static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn, bool strict_alignment_once, bool is_ldsx, bool allow_trust_mismatch, const char *ctx) { struct bpf_reg_state *regs = cur_regs(env); enum bpf_reg_type src_reg_type; int err; /* check src operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; /* check dst operand */ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); if (err) return err; src_reg_type = regs[insn->src_reg].type; /* Check if (src_reg + off) is
readable. The state of dst_reg will be * updated by this call. */ err = check_mem_access(env, env->insn_idx, insn->src_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, insn->dst_reg, strict_alignment_once, is_ldsx); err = err ?: save_aux_ptr_type(env, src_reg_type, allow_trust_mismatch); err = err ?: reg_bounds_sanity_check(env, &regs[insn->dst_reg], ctx); return err; } static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn, bool strict_alignment_once) { struct bpf_reg_state *regs = cur_regs(env); enum bpf_reg_type dst_reg_type; int err; /* check src1 operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; /* check src2 operand */ err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) return err; dst_reg_type = regs[insn->dst_reg].type; /* Check if (dst_reg + off) is writeable. */ err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg, strict_alignment_once, false); err = err ?: save_aux_ptr_type(env, dst_reg_type, false); return err; } static int check_atomic_rmw(struct bpf_verifier_env *env, struct bpf_insn *insn) { int load_reg; int err; if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) { verbose(env, "invalid atomic operand size\n"); return -EINVAL; } /* check src1 operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; /* check src2 operand */ err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) return err; if (insn->imm == BPF_CMPXCHG) { /* Check comparison of R0 with memory location */ const u32 aux_reg = BPF_REG_0; err = check_reg_arg(env, aux_reg, SRC_OP); if (err) return err; if (is_pointer_value(env, aux_reg)) { verbose(env, "R%d leaks addr into mem\n", aux_reg); return -EACCES; } } if (is_pointer_value(env, insn->src_reg)) { verbose(env, "R%d leaks addr into mem\n", insn->src_reg); return -EACCES; } if (!atomic_ptr_type_ok(env, insn->dst_reg, insn)) { verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str(env, reg_state(env, insn->dst_reg)->type)); return -EACCES; } if (insn->imm & BPF_FETCH) { if (insn->imm == BPF_CMPXCHG) load_reg = BPF_REG_0; else load_reg = insn->src_reg; /* check and record load of old value */ err = check_reg_arg(env, load_reg, DST_OP); if (err) return err; } else { /* This instruction accesses a memory location but doesn't * actually load it into a register. */ load_reg = -1; } /* Check whether we can read the memory, with second call for fetch * case to simulate the register fill. */ err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1, true, false); if (!err && load_reg >= 0) err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, load_reg, true, false); if (err) return err; if (is_arena_reg(env, insn->dst_reg)) { err = save_aux_ptr_type(env, PTR_TO_ARENA, false); if (err) return err; } /* Check whether we can write into the same memory.
*/ err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, true, false); if (err) return err; return 0; } static int check_atomic_load(struct bpf_verifier_env *env, struct bpf_insn *insn) { int err; err = check_load_mem(env, insn, true, false, false, "atomic_load"); if (err) return err; if (!atomic_ptr_type_ok(env, insn->src_reg, insn)) { verbose(env, "BPF_ATOMIC loads from R%d %s is not allowed\n", insn->src_reg, reg_type_str(env, reg_state(env, insn->src_reg)->type)); return -EACCES; } return 0; } static int check_atomic_store(struct bpf_verifier_env *env, struct bpf_insn *insn) { int err; err = check_store_reg(env, insn, true); if (err) return err; if (!atomic_ptr_type_ok(env, insn->dst_reg, insn)) { verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str(env, reg_state(env, insn->dst_reg)->type)); return -EACCES; } return 0; } static int check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn) { switch (insn->imm) { case BPF_ADD: case BPF_ADD | BPF_FETCH: case BPF_AND: case BPF_AND | BPF_FETCH: case BPF_OR: case BPF_OR | BPF_FETCH: case BPF_XOR: case BPF_XOR | BPF_FETCH: case BPF_XCHG: case BPF_CMPXCHG: return check_atomic_rmw(env, insn); case BPF_LOAD_ACQ: if (BPF_SIZE(insn->code) == BPF_DW && BITS_PER_LONG != 64) { verbose(env, "64-bit load-acquires are only supported on 64-bit arches\n"); return -EOPNOTSUPP; } return check_atomic_load(env, insn); case BPF_STORE_REL: if (BPF_SIZE(insn->code) == BPF_DW && BITS_PER_LONG != 64) { verbose(env, "64-bit store-releases are only supported on 64-bit arches\n"); return -EOPNOTSUPP; } return check_atomic_store(env, insn); default: verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm); return -EINVAL; } } /* When register 'regno' is used to read the stack (either directly or through * a helper function) make sure that it's within stack boundary and, depending * on the access type and privileges, that all elements of the stack are * initialized. * * All registers that have been spilled on the stack in the slots within the * read offsets are marked as read. */ static int check_stack_range_initialized( struct bpf_verifier_env *env, int regno, int off, int access_size, bool zero_size_allowed, enum bpf_access_type type, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = bpf_func(env, reg); int err, min_off, max_off, i, j, slot, spi; /* Some accesses can write anything into the stack, others are * read-only. */ bool clobber = type == BPF_WRITE; /* * Negative access_size signals global subprog/kfunc arg check where * STACK_POISON slots are acceptable. static stack liveness * might have determined that subprog doesn't read them, * but BTF based global subprog validation isn't accurate enough. */ bool allow_poison = access_size < 0 || clobber; access_size = abs(access_size); if (access_size == 0 && !zero_size_allowed) { verbose(env, "invalid zero-sized read\n"); return -EACCES; } err = check_stack_access_within_bounds(env, regno, off, access_size, type); if (err) return err; if (tnum_is_const(reg->var_off)) { min_off = max_off = reg->var_off.value + off; } else { /* Variable offset is prohibited for unprivileged mode for * simplicity since it requires corresponding support in * Spectre masking for stack ALU. * See also retrieve_ptr_limit(). 
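 *
 * Illustration (assumed fragment, not from this file; get_unknown_scalar()
 * and read_helper() are hypothetical names):
 *
 *	char buf[64];
 *	u32 idx = get_unknown_scalar() & 63;
 *	read_helper(&buf[idx], 8);
 *
 * In unprivileged mode the non-constant reg->var_off on &buf[idx] is
 * rejected right here; with a constant idx the access continues to the
 * per-slot checks below.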
*/ if (!env->bypass_spec_v1) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n", regno, tn_buf); return -EACCES; } /* Only initialized buffer on stack is allowed to be accessed * with variable offset. With uninitialized buffer it's hard to * guarantee that whole memory is marked as initialized on * helper return since specific bounds are unknown, which may * cause uninitialized stack leaking. */ if (meta && meta->raw_mode) meta = NULL; min_off = reg->smin_value + off; max_off = reg->smax_value + off; } if (meta && meta->raw_mode) { /* Ensure we won't be overwriting dynptrs when simulating byte * by byte access in check_helper_call using meta.access_size. * This would be a problem if we have a helper in the future * which takes: * * helper(uninit_mem, len, dynptr) * * Now, uninit_mem may overlap with dynptr pointer. Hence, it * may end up writing to dynptr itself when touching memory from * arg 1. This can be relaxed on a case by case basis for known * safe cases, but reject due to the possibility of aliasing by * default. */ for (i = min_off; i < max_off + access_size; i++) { int stack_off = -i - 1; spi = bpf_get_spi(i); /* raw_mode may write past allocated_stack */ if (state->allocated_stack <= stack_off) continue; if (state->stack[spi].slot_type[stack_off % BPF_REG_SIZE] == STACK_DYNPTR) { verbose(env, "potential write to dynptr at off=%d disallowed\n", i); return -EACCES; } } meta->access_size = access_size; meta->regno = regno; return 0; } for (i = min_off; i < max_off + access_size; i++) { u8 *stype; slot = -i - 1; spi = slot / BPF_REG_SIZE; if (state->allocated_stack <= slot) { verbose(env, "allocated_stack too small\n"); return -EFAULT; } stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; if (*stype == STACK_MISC) goto mark; if ((*stype == STACK_ZERO) || (*stype == STACK_INVALID && env->allow_uninit_stack)) { if (clobber) { /* helper can write anything into the stack */ *stype = STACK_MISC; } goto mark; } if (bpf_is_spilled_reg(&state->stack[spi]) && (state->stack[spi].spilled_ptr.type == SCALAR_VALUE || env->allow_ptr_leaks)) { if (clobber) { __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); for (j = 0; j < BPF_REG_SIZE; j++) scrub_spilled_slot(&state->stack[spi].slot_type[j]); } goto mark; } if (*stype == STACK_POISON) { if (allow_poison) goto mark; verbose(env, "reading from stack R%d off %d+%d size %d, slot poisoned by dead code elimination\n", regno, min_off, i - min_off, access_size); } else if (tnum_is_const(reg->var_off)) { verbose(env, "invalid read from stack R%d off %d+%d size %d\n", regno, min_off, i - min_off, access_size); } else { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, "invalid read from stack R%d var_off %s+%d size %d\n", regno, tn_buf, i - min_off, access_size); } return -EACCES; mark: ; } return 0; } static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, int access_size, enum bpf_access_type access_type, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; u32 *max_access; switch (base_type(reg->type)) { case PTR_TO_PACKET: case PTR_TO_PACKET_META: return check_packet_access(env, regno, 0, access_size, zero_size_allowed); case PTR_TO_MAP_KEY: if (access_type == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", regno, reg_type_str(env, reg->type)); return -EACCES; } return check_mem_region_access(env, regno, 0,
access_size, reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: if (check_map_access_type(env, regno, 0, access_size, access_type)) return -EACCES; return check_map_access(env, regno, 0, access_size, zero_size_allowed, ACCESS_HELPER); case PTR_TO_MEM: if (type_is_rdonly_mem(reg->type)) { if (access_type == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", regno, reg_type_str(env, reg->type)); return -EACCES; } } return check_mem_region_access(env, regno, 0, access_size, reg->mem_size, zero_size_allowed); case PTR_TO_BUF: if (type_is_rdonly_mem(reg->type)) { if (access_type == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", regno, reg_type_str(env, reg->type)); return -EACCES; } max_access = &env->prog->aux->max_rdonly_access; } else { max_access = &env->prog->aux->max_rdwr_access; } return check_buffer_access(env, reg, regno, 0, access_size, zero_size_allowed, max_access); case PTR_TO_STACK: return check_stack_range_initialized( env, regno, 0, access_size, zero_size_allowed, access_type, meta); case PTR_TO_BTF_ID: return check_ptr_to_btf_access(env, regs, regno, 0, access_size, BPF_READ, -1); case PTR_TO_CTX: /* Only permit reading or writing syscall context using helper calls. */ if (is_var_ctx_off_allowed(env->prog)) { int err = check_mem_region_access(env, regno, 0, access_size, U16_MAX, zero_size_allowed); if (err) return err; if (env->prog->aux->max_ctx_offset < reg->umax_value + access_size) env->prog->aux->max_ctx_offset = reg->umax_value + access_size; return 0; } fallthrough; default: /* scalar_value or invalid ptr */ /* Allow zero-byte read from NULL, regardless of pointer type */ if (zero_size_allowed && access_size == 0 && bpf_register_is_null(reg)) return 0; verbose(env, "R%d type=%s ", regno, reg_type_str(env, reg->type)); verbose(env, "expected=%s\n", reg_type_str(env, PTR_TO_STACK)); return -EACCES; } } /* verify arguments to helpers or kfuncs consisting of a pointer and an access * size. * * @regno is the register containing the access size. regno-1 is the register * containing the pointer. */ static int check_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, enum bpf_access_type access_type, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { int err; /* This is used to refine r0 return value bounds for helpers * that enforce this value as an upper bound on return values. * See do_refine_retval_range() for helpers that can refine * the return value. C type of helper is u32 so we pull register * bound from umax_value however, if negative verifier errors * out. Only upper bounds can be learned because retval is an * int type and negative retvals are allowed. */ meta->msize_max_value = reg->umax_value; /* The register is SCALAR_VALUE; the access check happens using * its boundaries. For unprivileged variable accesses, disable * raw mode so that the program is required to initialize all * the memory that the helper could just partially fill up. 
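 *
 * Illustration (assumed fragment, not from this file; buf, n and src are
 * illustrative names):
 *
 *	char buf[32];
 *	u32 n = bpf_get_prandom_u32() & 31;
 *	bpf_probe_read_kernel(buf, n, src);
 *
 * The size register n is checked against its umax_value (31 here), so the
 * largest possible access must fit in buf; with a variable n the helper's
 * raw (uninitialized memory) mode is not honoured below, so buf must be
 * initialized before the call.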
*/ if (!tnum_is_const(reg->var_off)) meta = NULL; if (reg->smin_value < 0) { verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n", regno); return -EACCES; } if (reg->umin_value == 0 && !zero_size_allowed) { verbose(env, "R%d invalid zero-sized read: u64=[%lld,%lld]\n", regno, reg->umin_value, reg->umax_value); return -EACCES; } if (reg->umax_value >= BPF_MAX_VAR_SIZ) { verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", regno); return -EACCES; } err = check_helper_mem_access(env, regno - 1, reg->umax_value, access_type, zero_size_allowed, meta); if (!err) err = mark_chain_precision(env, regno); return err; } static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size) { bool may_be_null = type_may_be_null(reg->type); struct bpf_reg_state saved_reg; int err; if (bpf_register_is_null(reg)) return 0; /* Assuming that the register contains a value check if the memory * access is safe. Temporarily save and restore the register's state as * the conversion shouldn't be visible to a caller. */ if (may_be_null) { saved_reg = *reg; mark_ptr_not_null_reg(reg); } int size = base_type(reg->type) == PTR_TO_STACK ? -(int)mem_size : mem_size; err = check_helper_mem_access(env, regno, size, BPF_READ, true, NULL); err = err ?: check_helper_mem_access(env, regno, size, BPF_WRITE, true, NULL); if (may_be_null) *reg = saved_reg; return err; } static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno) { struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1]; bool may_be_null = type_may_be_null(mem_reg->type); struct bpf_reg_state saved_reg; struct bpf_call_arg_meta meta; int err; WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5); memset(&meta, 0, sizeof(meta)); if (may_be_null) { saved_reg = *mem_reg; mark_ptr_not_null_reg(mem_reg); } err = check_mem_size_reg(env, reg, regno, BPF_READ, true, &meta); err = err ?: check_mem_size_reg(env, reg, regno, BPF_WRITE, true, &meta); if (may_be_null) *mem_reg = saved_reg; return err; } enum { PROCESS_SPIN_LOCK = (1 << 0), PROCESS_RES_LOCK = (1 << 1), PROCESS_LOCK_IRQ = (1 << 2), }; /* Implementation details: * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL. * bpf_obj_new returns PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL. * Two bpf_map_lookups (even with the same key) will have different reg->id. * Two separate bpf_obj_new will also have different reg->id. * For traditional PTR_TO_MAP_VALUE or PTR_TO_BTF_ID | MEM_ALLOC, the verifier * clears reg->id after value_or_null->value transition, since the verifier only * cares about the range of access to valid map value pointer and doesn't care * about actual address of the map element. * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps * reg->id > 0 after value_or_null->value transition. By doing so * two bpf_map_lookups will be considered two different pointers that * point to different bpf_spin_locks. Likewise for pointers to allocated objects * returned from bpf_obj_new. * The verifier allows taking only one bpf_spin_lock at a time to avoid * dead-locks. * Since only one bpf_spin_lock is allowed the checks are simpler than * reg_is_refcounted() logic. The verifier needs to remember only * one spin_lock instead of array of acquired_refs. * env->cur_state->active_locks remembers which map value element or allocated * object got locked and clears it after bpf_spin_unlock. 
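 *
 * Illustration (assumed fragment, not from this file; the map, keys and
 * lock field names are illustrative) of why reg->id is preserved:
 *
 *	v1 = bpf_map_lookup_elem(&m, &k1);
 *	v2 = bpf_map_lookup_elem(&m, &k2);
 *	if (!v1 || !v2)
 *		return 0;
 *	bpf_spin_lock(&v1->lock);
 *	bpf_spin_unlock(&v2->lock);
 *
 * The two lookups carry distinct reg->id values, so the unlock through v2
 * while v1's lock is the recorded active lock is rejected as an unlock of
 * a different lock.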
*/ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) { bool is_lock = flags & PROCESS_SPIN_LOCK, is_res_lock = flags & PROCESS_RES_LOCK; const char *lock_str = is_res_lock ? "bpf_res_spin" : "bpf_spin"; struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_verifier_state *cur = env->cur_state; bool is_const = tnum_is_const(reg->var_off); bool is_irq = flags & PROCESS_LOCK_IRQ; u64 val = reg->var_off.value; struct bpf_map *map = NULL; struct btf *btf = NULL; struct btf_record *rec; u32 spin_lock_off; int err; if (!is_const) { verbose(env, "R%d doesn't have constant offset. %s_lock has to be at the constant offset\n", regno, lock_str); return -EINVAL; } if (reg->type == PTR_TO_MAP_VALUE) { map = reg->map_ptr; if (!map->btf) { verbose(env, "map '%s' has to have BTF in order to use %s_lock\n", map->name, lock_str); return -EINVAL; } } else { btf = reg->btf; } rec = reg_btf_record(reg); if (!btf_record_has_field(rec, is_res_lock ? BPF_RES_SPIN_LOCK : BPF_SPIN_LOCK)) { verbose(env, "%s '%s' has no valid %s_lock\n", map ? "map" : "local", map ? map->name : "kptr", lock_str); return -EINVAL; } spin_lock_off = is_res_lock ? rec->res_spin_lock_off : rec->spin_lock_off; if (spin_lock_off != val) { verbose(env, "off %lld doesn't point to 'struct %s_lock' that is at %d\n", val, lock_str, spin_lock_off); return -EINVAL; } if (is_lock) { void *ptr; int type; if (map) ptr = map; else ptr = btf; if (!is_res_lock && cur->active_locks) { if (find_lock_state(env->cur_state, REF_TYPE_LOCK, 0, NULL)) { verbose(env, "Locking two bpf_spin_locks are not allowed\n"); return -EINVAL; } } else if (is_res_lock && cur->active_locks) { if (find_lock_state(env->cur_state, REF_TYPE_RES_LOCK | REF_TYPE_RES_LOCK_IRQ, reg->id, ptr)) { verbose(env, "Acquiring the same lock again, AA deadlock detected\n"); return -EINVAL; } } if (is_res_lock && is_irq) type = REF_TYPE_RES_LOCK_IRQ; else if (is_res_lock) type = REF_TYPE_RES_LOCK; else type = REF_TYPE_LOCK; err = acquire_lock_state(env, env->insn_idx, type, reg->id, ptr); if (err < 0) { verbose(env, "Failed to acquire lock state\n"); return err; } } else { void *ptr; int type; if (map) ptr = map; else ptr = btf; if (!cur->active_locks) { verbose(env, "%s_unlock without taking a lock\n", lock_str); return -EINVAL; } if (is_res_lock && is_irq) type = REF_TYPE_RES_LOCK_IRQ; else if (is_res_lock) type = REF_TYPE_RES_LOCK; else type = REF_TYPE_LOCK; if (!find_lock_state(cur, type, reg->id, ptr)) { verbose(env, "%s_unlock of different lock\n", lock_str); return -EINVAL; } if (reg->id != cur->active_lock_id || ptr != cur->active_lock_ptr) { verbose(env, "%s_unlock cannot be out of order\n", lock_str); return -EINVAL; } if (release_lock_state(cur, type, reg->id, ptr)) { verbose(env, "%s_unlock of different lock\n", lock_str); return -EINVAL; } invalidate_non_owning_refs(env); } return 0; } /* Check if @regno is a pointer to a specific field in a map value */ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, enum btf_field_type field_type, struct bpf_map_desc *map_desc) { struct bpf_reg_state *reg = reg_state(env, regno); bool is_const = tnum_is_const(reg->var_off); struct bpf_map *map = reg->map_ptr; u64 val = reg->var_off.value; const char *struct_name = btf_field_type_name(field_type); int field_off = -1; if (!is_const) { verbose(env, "R%d doesn't have constant offset. 
%s has to be at the constant offset\n", regno, struct_name); return -EINVAL; } if (!map->btf) { verbose(env, "map '%s' has to have BTF in order to use %s\n", map->name, struct_name); return -EINVAL; } if (!btf_record_has_field(map->record, field_type)) { verbose(env, "map '%s' has no valid %s\n", map->name, struct_name); return -EINVAL; } switch (field_type) { case BPF_TIMER: field_off = map->record->timer_off; break; case BPF_TASK_WORK: field_off = map->record->task_work_off; break; case BPF_WORKQUEUE: field_off = map->record->wq_off; break; default: verifier_bug(env, "unsupported BTF field type: %s\n", struct_name); return -EINVAL; } if (field_off != val) { verbose(env, "off %lld doesn't point to 'struct %s' that is at %d\n", val, struct_name, field_off); return -EINVAL; } if (map_desc->ptr) { verifier_bug(env, "Two map pointers in a %s helper", struct_name); return -EFAULT; } map_desc->uid = reg->map_uid; map_desc->ptr = map; return 0; } static int process_timer_func(struct bpf_verifier_env *env, int regno, struct bpf_map_desc *map) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) { verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n"); return -EOPNOTSUPP; } return check_map_field_pointer(env, regno, BPF_TIMER, map); } static int process_timer_helper(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { return process_timer_func(env, regno, &meta->map); } static int process_timer_kfunc(struct bpf_verifier_env *env, int regno, struct bpf_kfunc_call_arg_meta *meta) { return process_timer_func(env, regno, &meta->map); } static int process_kptr_func(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *reg = reg_state(env, regno); struct btf_field *kptr_field; struct bpf_map *map_ptr; struct btf_record *rec; u32 kptr_off; if (type_is_ptr_alloc_obj(reg->type)) { rec = reg_btf_record(reg); } else { /* PTR_TO_MAP_VALUE */ map_ptr = reg->map_ptr; if (!map_ptr->btf) { verbose(env, "map '%s' has to have BTF in order to use bpf_kptr_xchg\n", map_ptr->name); return -EINVAL; } rec = map_ptr->record; meta->map.ptr = map_ptr; } if (!tnum_is_const(reg->var_off)) { verbose(env, "R%d doesn't have constant offset. kptr has to be at the constant offset\n", regno); return -EINVAL; } if (!btf_record_has_field(rec, BPF_KPTR)) { verbose(env, "R%d has no valid kptr\n", regno); return -EINVAL; } kptr_off = reg->var_off.value; kptr_field = btf_record_find(rec, kptr_off, BPF_KPTR); if (!kptr_field) { verbose(env, "off=%d doesn't point to kptr\n", kptr_off); return -EACCES; } if (kptr_field->type != BPF_KPTR_REF && kptr_field->type != BPF_KPTR_PERCPU) { verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off); return -EACCES; } meta->kptr_field = kptr_field; return 0; } /* There are two register types representing a bpf_dynptr, one is PTR_TO_STACK * which points to a stack slot, and the other is CONST_PTR_TO_DYNPTR. * * In both cases we deal with the first 8 bytes, but need to mark the next 8 * bytes as STACK_DYNPTR in case of PTR_TO_STACK. In case of * CONST_PTR_TO_DYNPTR, we are guaranteed to get the beginning of the object. * * Mutability of bpf_dynptr is at two levels, one is at the level of struct * bpf_dynptr itself, i.e. whether the helper is receiving a pointer to struct * bpf_dynptr or pointer to const struct bpf_dynptr. In the former case, it can * mutate the view of the dynptr and also possibly destroy it. In the latter * case, it cannot mutate the bpf_dynptr itself but it can still mutate the * memory that dynptr points to. 
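 *
 * A sketch of the two levels (the helpers named are existing ones, the
 * reading is illustrative): bpf_dynptr_write() takes
 * 'const struct bpf_dynptr *' and may only modify the memory the dynptr
 * points to, while bpf_dynptr_from_mem() takes a non-const
 * 'struct bpf_dynptr *' because it (re)initializes the dynptr object
 * itself.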
* * The verifier will keep track of both levels of mutation (bpf_dynptr's in * reg->type and the memory's in reg->dynptr.type), but there is no support for * readonly dynptr view yet, hence only the first case is tracked and checked. * * This is consistent with how C applies the const modifier to a struct object, * where the pointer itself inside bpf_dynptr becomes const but not what it * points to. * * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument * type, and declare it as 'const struct bpf_dynptr *' in their prototype. */ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx, enum bpf_arg_type arg_type, int clone_ref_obj_id) { struct bpf_reg_state *reg = reg_state(env, regno); int err; if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) { verbose(env, "arg#%d expected pointer to stack or const struct bpf_dynptr\n", regno - 1); return -EINVAL; } /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*): */ if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) { verifier_bug(env, "misconfigured dynptr helper type flags"); return -EFAULT; } /* MEM_UNINIT - Points to memory that is an appropriate candidate for * constructing a mutable bpf_dynptr object. * * Currently, this is only possible with PTR_TO_STACK * pointing to a region of at least 16 bytes which doesn't * contain an existing bpf_dynptr. * * MEM_RDONLY - Points to an initialized bpf_dynptr that will not be * mutated or destroyed. However, the memory it points to * may be mutated. * * None - Points to an initialized dynptr that can be mutated and * destroyed, including mutation of the memory it points * to. */ if (arg_type & MEM_UNINIT) { int i; if (!is_dynptr_reg_valid_uninit(env, reg)) { verbose(env, "Dynptr has to be an uninitialized dynptr\n"); return -EINVAL; } /* we write BPF_DW bits (8 bytes) at a time */ for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) { err = check_mem_access(env, insn_idx, regno, i, BPF_DW, BPF_WRITE, -1, false, false); if (err) return err; } err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, clone_ref_obj_id); } else /* MEM_RDONLY and None case from above */ { /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */ if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) { verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n"); return -EINVAL; } if (!is_dynptr_reg_valid_init(env, reg)) { verbose(env, "Expected an initialized dynptr as arg #%d\n", regno - 1); return -EINVAL; } /* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */ if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) { verbose(env, "Expected a dynptr of type %s as arg #%d\n", dynptr_type_str(arg_to_dynptr_type(arg_type)), regno - 1); return -EINVAL; } err = mark_dynptr_read(env, reg); } return err; } static u32 iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi) { struct bpf_func_state *state = bpf_func(env, reg); return state->stack[spi].spilled_ptr.ref_obj_id; } static bool is_iter_kfunc(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY); } static bool is_iter_new_kfunc(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_ITER_NEW; } static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_ITER_DESTROY; } static bool is_kfunc_arg_iter(struct
bpf_kfunc_call_arg_meta *meta, int arg_idx, const struct btf_param *arg) { /* btf_check_iter_kfuncs() guarantees that first argument of any iter * kfunc is iter state pointer */ if (is_iter_kfunc(meta)) return arg_idx == 0; /* iter passed as an argument to a generic kfunc */ return btf_param_match_suffix(meta->btf, arg, "__iter"); } static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx, struct bpf_kfunc_call_arg_meta *meta) { struct bpf_reg_state *reg = reg_state(env, regno); const struct btf_type *t; int spi, err, i, nr_slots, btf_id; if (reg->type != PTR_TO_STACK) { verbose(env, "arg#%d expected pointer to an iterator on stack\n", regno - 1); return -EINVAL; } /* For iter_{new,next,destroy} functions, btf_check_iter_kfuncs() * ensures struct convention, so we wouldn't need to do any BTF * validation here. But given iter state can be passed as a parameter * to any kfunc, if arg has "__iter" suffix, we need to be a bit more * conservative here. */ btf_id = btf_check_iter_arg(meta->btf, meta->func_proto, regno - 1); if (btf_id < 0) { verbose(env, "expected valid iter pointer as arg #%d\n", regno - 1); return -EINVAL; } t = btf_type_by_id(meta->btf, btf_id); nr_slots = t->size / BPF_REG_SIZE; if (is_iter_new_kfunc(meta)) { /* bpf_iter_<type>_new() expects pointer to uninit iter state */ if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) { verbose(env, "expected uninitialized iter_%s as arg #%d\n", iter_type_str(meta->btf, btf_id), regno - 1); return -EINVAL; } for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) { err = check_mem_access(env, insn_idx, regno, i, BPF_DW, BPF_WRITE, -1, false, false); if (err) return err; } err = mark_stack_slots_iter(env, meta, reg, insn_idx, meta->btf, btf_id, nr_slots); if (err) return err; } else { /* iter_next() or iter_destroy(), as well as any kfunc * accepting iter argument, expect initialized iter state */ err = is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots); switch (err) { case 0: break; case -EINVAL: verbose(env, "expected an initialized iter_%s as arg #%d\n", iter_type_str(meta->btf, btf_id), regno - 1); return err; case -EPROTO: verbose(env, "expected an RCU CS when using %s\n", meta->func_name); return err; default: return err; } spi = iter_get_spi(env, reg, nr_slots); if (spi < 0) return spi; err = mark_iter_read(env, reg, spi, nr_slots); if (err) return err; /* remember meta->iter info for process_iter_next_call() */ meta->iter.spi = spi; meta->iter.frameno = reg->frameno; meta->ref_obj_id = iter_ref_obj_id(env, reg, spi); if (is_iter_destroy_kfunc(meta)) { err = unmark_stack_slots_iter(env, reg, nr_slots); if (err) return err; } } return 0; } /* Look for a previous loop entry at insn_idx: nearest parent state * stopped at insn_idx with callsites matching those in cur->frame. */ static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, int insn_idx) { struct bpf_verifier_state_list *sl; struct bpf_verifier_state *st; struct list_head *pos, *head; /* Explored states are pushed in stack order, most recent states come first */ head = bpf_explored_state(env, insn_idx); list_for_each(pos, head) { sl = container_of(pos, struct bpf_verifier_state_list, node); /* If st->branches != 0 state is a part of current DFS verification path, * hence cur & st for a loop. 
*/ st = &sl->state; if (st->insn_idx == insn_idx && st->branches && same_callsites(st, cur) && st->dfs_depth < cur->dfs_depth) return st; } return NULL; } /* * Check if scalar registers are exact for the purpose of not widening. * More lenient than regs_exact() */ static bool scalars_exact_for_widen(const struct bpf_reg_state *rold, const struct bpf_reg_state *rcur) { return !memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)); } static void maybe_widen_reg(struct bpf_verifier_env *env, struct bpf_reg_state *rold, struct bpf_reg_state *rcur) { if (rold->type != SCALAR_VALUE) return; if (rold->type != rcur->type) return; if (rold->precise || rcur->precise || scalars_exact_for_widen(rold, rcur)) return; __mark_reg_unknown(env, rcur); } static int widen_imprecise_scalars(struct bpf_verifier_env *env, struct bpf_verifier_state *old, struct bpf_verifier_state *cur) { struct bpf_func_state *fold, *fcur; int i, fr, num_slots; for (fr = old->curframe; fr >= 0; fr--) { fold = old->frame[fr]; fcur = cur->frame[fr]; for (i = 0; i < MAX_BPF_REG; i++) maybe_widen_reg(env, &fold->regs[i], &fcur->regs[i]); num_slots = min(fold->allocated_stack / BPF_REG_SIZE, fcur->allocated_stack / BPF_REG_SIZE); for (i = 0; i < num_slots; i++) { if (!bpf_is_spilled_reg(&fold->stack[i]) || !bpf_is_spilled_reg(&fcur->stack[i])) continue; maybe_widen_reg(env, &fold->stack[i].spilled_ptr, &fcur->stack[i].spilled_ptr); } } return 0; } static struct bpf_reg_state *get_iter_from_state(struct bpf_verifier_state *cur_st, struct bpf_kfunc_call_arg_meta *meta) { int iter_frameno = meta->iter.frameno; int iter_spi = meta->iter.spi; return &cur_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr; } /* process_iter_next_call() is called when verifier gets to iterator's next * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer * to it as just "iter_next()" in comments below. * * BPF verifier relies on a crucial contract for any iter_next() * implementation: it should *eventually* return NULL, and once that happens * it should keep returning NULL. That is, once iterator exhausts elements to * iterate, it should never reset or spuriously return new elements. * * With the assumption of such contract, process_iter_next_call() simulates * a fork in the verifier state to validate loop logic correctness and safety * without having to simulate infinite amount of iterations. * * In current state, we first assume that iter_next() returned NULL and * iterator state is set to DRAINED (BPF_ITER_STATE_DRAINED). In such * conditions we should not form an infinite loop and should eventually reach * exit. * * Besides that, we also fork current state and enqueue it for later * verification. In a forked state we keep iterator state as ACTIVE * (BPF_ITER_STATE_ACTIVE) and assume non-NULL return from iter_next(). We * also bump iteration depth to prevent erroneous infinite loop detection * later on (see iter_active_depths_differ() comment for details). In this * state we assume that we'll eventually loop back to another iter_next() * calls (it could be in exactly same location or in some other instruction, * it doesn't matter, we don't make any unnecessary assumptions about this, * everything revolves around iterator state in a stack slot, not which * instruction is calling iter_next()). When that happens, we either will come * to iter_next() with equivalent state and can conclude that next iteration * will proceed in exactly the same way as we just verified, so it's safe to * assume that loop converges. 
If not, we'll go on another iteration * simulation with a different input state, until all possible starting states * are validated or we reach maximum number of instructions limit. * * This way, we will either exhaustively discover all possible input states * that iterator loop can start with and eventually will converge, or we'll * effectively regress into bounded loop simulation logic and either reach * maximum number of instructions if loop is not provably convergent, or there * is some statically known limit on number of iterations (e.g., if there is * an explicit `if n > 100 then break;` statement somewhere in the loop). * * Iteration convergence logic in is_state_visited() relies on exact * states comparison, which ignores read and precision marks. * This is necessary because read and precision marks are not finalized * while in the loop. Exact comparison might preclude convergence for * simple programs like below: * * i = 0; * while(iter_next(&it)) * i++; * * At each iteration step i++ would produce a new distinct state and * eventually instruction processing limit would be reached. * * To avoid such behavior speculatively forget (widen) range for * imprecise scalar registers, if those registers were not precise at the * end of the previous iteration and do not match exactly. * * This is a conservative heuristic that allows to verify wide range of programs, * however it precludes verification of programs that conjure an * imprecise value on the first loop iteration and use it as precise on a second. * For example, the following safe program would fail to verify: * * struct bpf_num_iter it; * int arr[10]; * int i = 0, a = 0; * bpf_iter_num_new(&it, 0, 10); * while (bpf_iter_num_next(&it)) { * if (a == 0) { * a = 1; * i = 7; // Because i changed, the verifier would forget * // its range on second loop entry. * } else { * arr[i] = 42; // This would fail to verify. * } * } * bpf_iter_num_destroy(&it); */ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, struct bpf_kfunc_call_arg_meta *meta) { struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st; struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr; struct bpf_reg_state *cur_iter, *queued_iter; BTF_TYPE_EMIT(struct bpf_iter); cur_iter = get_iter_from_state(cur_st, meta); if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE && cur_iter->iter.state != BPF_ITER_STATE_DRAINED) { verifier_bug(env, "unexpected iterator state %d (%s)", cur_iter->iter.state, iter_state_str(cur_iter->iter.state)); return -EFAULT; } if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) { /* Because iter_next() call is a checkpoint, is_state_visited() * should guarantee parent state with same call sites and insn_idx. */ if (!cur_st->parent || cur_st->parent->insn_idx != insn_idx || !same_callsites(cur_st->parent, cur_st)) { verifier_bug(env, "bad parent state for iter next call"); return -EFAULT; } /* Note cur_st->parent in the call below, it is necessary to skip * checkpoint created for cur_st by is_state_visited() * right at this instruction.
*/ prev_st = find_prev_entry(env, cur_st->parent, insn_idx); /* branch out active iter state */ queued_st = push_stack(env, insn_idx + 1, insn_idx, false); if (IS_ERR(queued_st)) return PTR_ERR(queued_st); queued_iter = get_iter_from_state(queued_st, meta); queued_iter->iter.state = BPF_ITER_STATE_ACTIVE; queued_iter->iter.depth++; if (prev_st) widen_imprecise_scalars(env, prev_st, queued_st); queued_fr = queued_st->frame[queued_st->curframe]; mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]); } /* switch to DRAINED state, but keep the depth unchanged */ /* mark current iter state as drained and assume returned NULL */ cur_iter->iter.state = BPF_ITER_STATE_DRAINED; __mark_reg_const_zero(env, &cur_fr->regs[BPF_REG_0]); return 0; } static bool arg_type_is_mem_size(enum bpf_arg_type type) { return type == ARG_CONST_SIZE || type == ARG_CONST_SIZE_OR_ZERO; } static bool arg_type_is_raw_mem(enum bpf_arg_type type) { return base_type(type) == ARG_PTR_TO_MEM && type & MEM_UNINIT; } static bool arg_type_is_release(enum bpf_arg_type type) { return type & OBJ_RELEASE; } static bool arg_type_is_dynptr(enum bpf_arg_type type) { return base_type(type) == ARG_PTR_TO_DYNPTR; } static int resolve_map_arg_type(struct bpf_verifier_env *env, const struct bpf_call_arg_meta *meta, enum bpf_arg_type *arg_type) { if (!meta->map.ptr) { /* kernel subsystem misconfigured verifier */ verifier_bug(env, "invalid map_ptr to access map->type"); return -EFAULT; } switch (meta->map.ptr->map_type) { case BPF_MAP_TYPE_SOCKMAP: case BPF_MAP_TYPE_SOCKHASH: if (*arg_type == ARG_PTR_TO_MAP_VALUE) { *arg_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON; } else { verbose(env, "invalid arg_type for sockmap/sockhash\n"); return -EINVAL; } break; case BPF_MAP_TYPE_BLOOM_FILTER: if (meta->func_id == BPF_FUNC_map_peek_elem) *arg_type = ARG_PTR_TO_MAP_VALUE; break; default: break; } return 0; } struct bpf_reg_types { const enum bpf_reg_type types[10]; u32 *btf_id; }; static const struct bpf_reg_types sock_types = { .types = { PTR_TO_SOCK_COMMON, PTR_TO_SOCKET, PTR_TO_TCP_SOCK, PTR_TO_XDP_SOCK, }, }; #ifdef CONFIG_NET static const struct bpf_reg_types btf_id_sock_common_types = { .types = { PTR_TO_SOCK_COMMON, PTR_TO_SOCKET, PTR_TO_TCP_SOCK, PTR_TO_XDP_SOCK, PTR_TO_BTF_ID, PTR_TO_BTF_ID | PTR_TRUSTED, }, .btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], }; #endif static const struct bpf_reg_types mem_types = { .types = { PTR_TO_STACK, PTR_TO_PACKET, PTR_TO_PACKET_META, PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, PTR_TO_MEM, PTR_TO_MEM | MEM_RINGBUF, PTR_TO_BUF, PTR_TO_BTF_ID | PTR_TRUSTED, PTR_TO_CTX, }, }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE, PTR_TO_BTF_ID | MEM_ALLOC, } }; static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } }; static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } }; static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } }; static const struct bpf_reg_types ringbuf_mem_types = { .types = { PTR_TO_MEM | MEM_RINGBUF } }; static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID, PTR_TO_BTF_ID | PTR_TRUSTED, PTR_TO_BTF_ID | MEM_RCU, }, }; static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_BTF_ID | MEM_PERCPU, PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU, PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED, } }; static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; static const struct 
bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types kptr_xchg_dest_types = { .types = { PTR_TO_MAP_VALUE, PTR_TO_BTF_ID | MEM_ALLOC, PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF, PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU, } }; static const struct bpf_reg_types dynptr_types = { .types = { PTR_TO_STACK, CONST_PTR_TO_DYNPTR, } }; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &mem_types, [ARG_PTR_TO_MAP_VALUE] = &mem_types, [ARG_CONST_SIZE] = &scalar_types, [ARG_CONST_SIZE_OR_ZERO] = &scalar_types, [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types, [ARG_CONST_MAP_PTR] = &const_map_ptr_types, [ARG_PTR_TO_CTX] = &context_types, [ARG_PTR_TO_SOCK_COMMON] = &sock_types, #ifdef CONFIG_NET [ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types, #endif [ARG_PTR_TO_SOCKET] = &fullsock_types, [ARG_PTR_TO_BTF_ID] = &btf_ptr_types, [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types, [ARG_PTR_TO_MEM] = &mem_types, [ARG_PTR_TO_RINGBUF_MEM] = &ringbuf_mem_types, [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, [ARG_PTR_TO_FUNC] = &func_ptr_types, [ARG_PTR_TO_STACK] = &stack_ptr_types, [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, [ARG_PTR_TO_TIMER] = &timer_types, [ARG_KPTR_XCHG_DEST] = &kptr_xchg_dest_types, [ARG_PTR_TO_DYNPTR] = &dynptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, const u32 *arg_btf_id, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *reg = reg_state(env, regno); enum bpf_reg_type expected, type = reg->type; const struct bpf_reg_types *compatible; int i, j, err; compatible = compatible_reg_types[base_type(arg_type)]; if (!compatible) { verifier_bug(env, "unsupported arg type %d", arg_type); return -EFAULT; } /* ARG_PTR_TO_MEM + RDONLY is compatible with PTR_TO_MEM and PTR_TO_MEM + RDONLY, * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM and NOT with PTR_TO_MEM + RDONLY * * Same for MAYBE_NULL: * * ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL, * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL * * ARG_PTR_TO_MEM is compatible with PTR_TO_MEM that is tagged with a dynptr type. * * Therefore we fold these flags depending on the arg_type before comparison. 
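 *
 * Worked example (illustrative): for an argument declared
 * ARG_PTR_TO_MEM | MEM_RDONLY, a register of type PTR_TO_MEM | MEM_RDONLY
 * has MEM_RDONLY cleared before the lookup and matches the plain
 * PTR_TO_MEM entry in mem_types; the same register passed to a plain
 * ARG_PTR_TO_MEM keeps MEM_RDONLY and fails the match.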
*/ if (arg_type & MEM_RDONLY) type &= ~MEM_RDONLY; if (arg_type & PTR_MAYBE_NULL) type &= ~PTR_MAYBE_NULL; if (base_type(arg_type) == ARG_PTR_TO_MEM) type &= ~DYNPTR_TYPE_FLAG_MASK; /* Local kptr types are allowed as the source argument of bpf_kptr_xchg */ if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type) && regno == BPF_REG_2) { type &= ~MEM_ALLOC; type &= ~MEM_PERCPU; } for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { expected = compatible->types[i]; if (expected == NOT_INIT) break; if (type == expected) goto found; } verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type)); for (j = 0; j + 1 < i; j++) verbose(env, "%s, ", reg_type_str(env, compatible->types[j])); verbose(env, "%s\n", reg_type_str(env, compatible->types[j])); return -EACCES; found: if (base_type(reg->type) != PTR_TO_BTF_ID) return 0; if (compatible == &mem_types) { if (!(arg_type & MEM_RDONLY)) { verbose(env, "%s() may write into memory pointed by R%d type=%s\n", func_id_name(meta->func_id), regno, reg_type_str(env, reg->type)); return -EACCES; } return 0; } switch ((int)reg->type) { case PTR_TO_BTF_ID: case PTR_TO_BTF_ID | PTR_TRUSTED: case PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL: case PTR_TO_BTF_ID | MEM_RCU: case PTR_TO_BTF_ID | PTR_MAYBE_NULL: case PTR_TO_BTF_ID | PTR_MAYBE_NULL | MEM_RCU: { /* For bpf_sk_release, it needs to match against first member * 'struct sock_common', hence make an exception for it. This * allows bpf_sk_release to work for multiple socket types. */ bool strict_type_match = arg_type_is_release(arg_type) && meta->func_id != BPF_FUNC_sk_release; if (type_may_be_null(reg->type) && (!type_may_be_null(arg_type) || arg_type_is_release(arg_type))) { verbose(env, "Possibly NULL pointer passed to helper arg%d\n", regno); return -EACCES; } if (!arg_btf_id) { if (!compatible->btf_id) { verifier_bug(env, "missing arg compatible BTF ID"); return -EFAULT; } arg_btf_id = compatible->btf_id; } if (meta->func_id == BPF_FUNC_kptr_xchg) { if (map_kptr_match_type(env, meta->kptr_field, reg, regno)) return -EACCES; } else { if (arg_btf_id == BPF_PTR_POISON) { verbose(env, "verifier internal error:"); verbose(env, "R%d has non-overwritten BPF_PTR_POISON type\n", regno); return -EACCES; } err = __check_ptr_off_reg(env, reg, regno, true); if (err) return err; if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->var_off.value, btf_vmlinux, *arg_btf_id, strict_type_match)) { verbose(env, "R%d is of type %s but %s is expected\n", regno, btf_type_name(reg->btf, reg->btf_id), btf_type_name(btf_vmlinux, *arg_btf_id)); return -EACCES; } } break; } case PTR_TO_BTF_ID | MEM_ALLOC: case PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC: case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF: case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU: if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock && meta->func_id != BPF_FUNC_kptr_xchg) { verifier_bug(env, "unimplemented handling of MEM_ALLOC"); return -EFAULT; } /* Check if local kptr in src arg matches kptr in dst arg */ if (meta->func_id == BPF_FUNC_kptr_xchg && regno == BPF_REG_2) { if (map_kptr_match_type(env, meta->kptr_field, reg, regno)) return -EACCES; } break; case PTR_TO_BTF_ID | MEM_PERCPU: case PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU: case PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED: /* Handled by helper specific checks */ break; default: verifier_bug(env, "invalid PTR_TO_BTF_ID register for type match"); return -EFAULT; } return 0; } static struct btf_field * reg_find_field_offset(const struct bpf_reg_state 
*reg, s32 off, u32 fields) { struct btf_field *field; struct btf_record *rec; rec = reg_btf_record(reg); if (!rec) return NULL; field = btf_record_find(rec, off, fields); if (!field) return NULL; return field; } static int check_func_arg_reg_off(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, enum bpf_arg_type arg_type) { u32 type = reg->type; /* When referenced register is passed to release function, its fixed * offset must be 0. * * We will check arg_type_is_release reg has ref_obj_id when storing * meta->release_regno. */ if (arg_type_is_release(arg_type)) { /* ARG_PTR_TO_DYNPTR with OBJ_RELEASE is a bit special, as it * may not directly point to the object being released, but to * dynptr pointing to such object, which might be at some offset * on the stack. In that case, we simply to fallback to the * default handling. */ if (arg_type_is_dynptr(arg_type) && type == PTR_TO_STACK) return 0; /* Doing check_ptr_off_reg check for the offset will catch this * because fixed_off_ok is false, but checking here allows us * to give the user a better error message. */ if (!tnum_is_const(reg->var_off) || reg->var_off.value != 0) { verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n", regno); return -EINVAL; } } switch (type) { /* Pointer types where both fixed and variable offset is explicitly allowed: */ case PTR_TO_STACK: case PTR_TO_PACKET: case PTR_TO_PACKET_META: case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: case PTR_TO_MEM: case PTR_TO_MEM | MEM_RDONLY: case PTR_TO_MEM | MEM_RINGBUF: case PTR_TO_BUF: case PTR_TO_BUF | MEM_RDONLY: case PTR_TO_ARENA: case SCALAR_VALUE: return 0; /* All the rest must be rejected, except PTR_TO_BTF_ID which allows * fixed offset. */ case PTR_TO_BTF_ID: case PTR_TO_BTF_ID | MEM_ALLOC: case PTR_TO_BTF_ID | PTR_TRUSTED: case PTR_TO_BTF_ID | MEM_RCU: case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF: case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU: /* When referenced PTR_TO_BTF_ID is passed to release function, * its fixed offset must be 0. In the other cases, fixed offset * can be non-zero. This was already checked above. So pass * fixed_off_ok as true to allow fixed offset for all other * cases. var_off always must be 0 for PTR_TO_BTF_ID, hence we * still need to do checks instead of returning. */ return __check_ptr_off_reg(env, reg, regno, true); case PTR_TO_CTX: /* * Allow fixed and variable offsets for syscall context, but * only when the argument is passed as memory, not ctx, * otherwise we may get modified ctx in tail called programs and * global subprogs (that may act as extension prog hooks). 
*/ if (arg_type != ARG_PTR_TO_CTX && is_var_ctx_off_allowed(env->prog)) return 0; fallthrough; default: return __check_ptr_off_reg(env, reg, regno, false); } } static struct bpf_reg_state *get_dynptr_arg_reg(struct bpf_verifier_env *env, const struct bpf_func_proto *fn, struct bpf_reg_state *regs) { struct bpf_reg_state *state = NULL; int i; for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) if (arg_type_is_dynptr(fn->arg_type[i])) { if (state) { verbose(env, "verifier internal error: multiple dynptr args\n"); return NULL; } state = ®s[BPF_REG_1 + i]; } if (!state) verbose(env, "verifier internal error: no dynptr arg found\n"); return state; } static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = bpf_func(env, reg); int spi; if (reg->type == CONST_PTR_TO_DYNPTR) return reg->id; spi = dynptr_get_spi(env, reg); if (spi < 0) return spi; return state->stack[spi].spilled_ptr.id; } static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = bpf_func(env, reg); int spi; if (reg->type == CONST_PTR_TO_DYNPTR) return reg->ref_obj_id; spi = dynptr_get_spi(env, reg); if (spi < 0) return spi; return state->stack[spi].spilled_ptr.ref_obj_id; } static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = bpf_func(env, reg); int spi; if (reg->type == CONST_PTR_TO_DYNPTR) return reg->dynptr.type; spi = bpf_get_spi(reg->var_off.value); if (spi < 0) { verbose(env, "verifier internal error: invalid spi when querying dynptr type\n"); return BPF_DYNPTR_TYPE_INVALID; } return state->stack[spi].spilled_ptr.dynptr.type; } static int check_reg_const_str(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno) { struct bpf_map *map = reg->map_ptr; int err; int map_off; u64 map_addr; char *str_ptr; if (reg->type != PTR_TO_MAP_VALUE) return -EINVAL; if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) { verbose(env, "R%d points to insn_array map which cannot be used as const string\n", regno); return -EACCES; } if (!bpf_map_is_rdonly(map)) { verbose(env, "R%d does not point to a readonly map'\n", regno); return -EACCES; } if (!tnum_is_const(reg->var_off)) { verbose(env, "R%d is not a constant address'\n", regno); return -EACCES; } if (!map->ops->map_direct_value_addr) { verbose(env, "no direct value access support for this map type\n"); return -EACCES; } err = check_map_access(env, regno, 0, map->value_size - reg->var_off.value, false, ACCESS_HELPER); if (err) return err; map_off = reg->var_off.value; err = map->ops->map_direct_value_addr(map, &map_addr, map_off); if (err) { verbose(env, "direct value access on string failed\n"); return err; } str_ptr = (char *)(long)(map_addr); if (!strnchr(str_ptr + map_off, map->value_size - map_off, 0)) { verbose(env, "string is not zero-terminated\n"); return -EINVAL; } return 0; } /* Returns constant key value in `value` if possible, else negative error */ static int get_constant_map_key(struct bpf_verifier_env *env, struct bpf_reg_state *key, u32 key_size, s64 *value) { struct bpf_func_state *state = bpf_func(env, key); struct bpf_reg_state *reg; int slot, spi, off; int spill_size = 0; int zero_size = 0; int stack_off; int i, err; u8 *stype; if (!env->bpf_capable) return -EOPNOTSUPP; if (key->type != PTR_TO_STACK) return -EOPNOTSUPP; if (!tnum_is_const(key->var_off)) return -EOPNOTSUPP; stack_off = key->var_off.value; slot = -stack_off - 1; spi = slot / BPF_REG_SIZE; off = slot % BPF_REG_SIZE; 
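	/* Worked example (illustrative): with stack_off == -8 (a 4-byte key at
	 * fp-8), slot == 7, spi == 0 and off == 7; the scans below then walk
	 * from byte 7 of that slot downwards and accept the key only if enough
	 * STACK_ZERO bytes or a scalar spill of exactly key_size bytes is found.
	 */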
stype = state->stack[spi].slot_type; /* First handle precisely tracked STACK_ZERO */ for (i = off; i >= 0 && stype[i] == STACK_ZERO; i--) zero_size++; if (zero_size >= key_size) { *value = 0; return 0; } /* Check that stack contains a scalar spill of expected size */ if (!bpf_is_spilled_scalar_reg(&state->stack[spi])) return -EOPNOTSUPP; for (i = off; i >= 0 && stype[i] == STACK_SPILL; i--) spill_size++; if (spill_size != key_size) return -EOPNOTSUPP; reg = &state->stack[spi].spilled_ptr; if (!tnum_is_const(reg->var_off)) /* Stack value not statically known */ return -EOPNOTSUPP; /* We are relying on a constant value. So mark as precise * to prevent pruning on it. */ bpf_bt_set_frame_slot(&env->bt, key->frameno, spi); err = mark_chain_precision_batch(env, env->cur_state); if (err < 0) return err; *value = reg->var_off.value; return 0; } static bool can_elide_value_nullness(enum bpf_map_type type); static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, const struct bpf_func_proto *fn, int insn_idx) { u32 regno = BPF_REG_1 + arg; struct bpf_reg_state *reg = reg_state(env, regno); enum bpf_arg_type arg_type = fn->arg_type[arg]; enum bpf_reg_type type = reg->type; u32 *arg_btf_id = NULL; u32 key_size; int err = 0; if (arg_type == ARG_DONTCARE) return 0; err = check_reg_arg(env, regno, SRC_OP); if (err) return err; if (arg_type == ARG_ANYTHING) { if (is_pointer_value(env, regno)) { verbose(env, "R%d leaks addr into helper function\n", regno); return -EACCES; } return 0; } if (type_is_pkt_pointer(type) && !may_access_direct_pkt_data(env, meta, BPF_READ)) { verbose(env, "helper access to the packet is not allowed\n"); return -EACCES; } if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE) { err = resolve_map_arg_type(env, meta, &arg_type); if (err) return err; } if (bpf_register_is_null(reg) && type_may_be_null(arg_type)) /* A NULL register has a SCALAR_VALUE type, so skip * type checking. */ goto skip_type_check; /* arg_btf_id and arg_size are in a union. */ if (base_type(arg_type) == ARG_PTR_TO_BTF_ID || base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK) arg_btf_id = fn->arg_btf_id[arg]; err = check_reg_type(env, regno, arg_type, arg_btf_id, meta); if (err) return err; err = check_func_arg_reg_off(env, reg, regno, arg_type); if (err) return err; skip_type_check: if (arg_type_is_release(arg_type)) { if (arg_type_is_dynptr(arg_type)) { struct bpf_func_state *state = bpf_func(env, reg); int spi; /* Only dynptr created on stack can be released, thus * the get_spi and stack state checks for spilled_ptr * should only be done before process_dynptr_func for * PTR_TO_STACK. 
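 *
 * Illustrative program-side pattern (assumed) that takes the PTR_TO_STACK
 * path below:
 *   struct bpf_dynptr ptr;
 *   bpf_ringbuf_reserve_dynptr(&rb, 8, 0, &ptr);
 *   ...
 *   bpf_ringbuf_submit_dynptr(&ptr, 0);
 * The submit releases the reference tracked in the dynptr's stack slots;
 * a CONST_PTR_TO_DYNPTR, such as the one passed to a user_ringbuf_drain()
 * callback, owns no reference and is rejected below.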
*/ if (reg->type == PTR_TO_STACK) { spi = dynptr_get_spi(env, reg); if (spi < 0 || !state->stack[spi].spilled_ptr.ref_obj_id) { verbose(env, "arg %d is an unacquired reference\n", regno); return -EINVAL; } } else { verbose(env, "cannot release unowned const bpf_dynptr\n"); return -EINVAL; } } else if (!reg->ref_obj_id && !bpf_register_is_null(reg)) { verbose(env, "R%d must be referenced when passed to release function\n", regno); return -EINVAL; } if (meta->release_regno) { verifier_bug(env, "more than one release argument"); return -EFAULT; } meta->release_regno = regno; } if (reg->ref_obj_id && base_type(arg_type) != ARG_KPTR_XCHG_DEST) { if (meta->ref_obj_id) { verbose(env, "more than one arg with ref_obj_id R%d %u %u", regno, reg->ref_obj_id, meta->ref_obj_id); return -EACCES; } meta->ref_obj_id = reg->ref_obj_id; } switch (base_type(arg_type)) { case ARG_CONST_MAP_PTR: /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ if (meta->map.ptr) { /* Use map_uid (which is unique id of inner map) to reject: * inner_map1 = bpf_map_lookup_elem(outer_map, key1) * inner_map2 = bpf_map_lookup_elem(outer_map, key2) * if (inner_map1 && inner_map2) { * timer = bpf_map_lookup_elem(inner_map1); * if (timer) * // mismatch would have been allowed * bpf_timer_init(timer, inner_map2); * } * * Comparing map_ptr is enough to distinguish normal and outer maps. */ if (meta->map.ptr != reg->map_ptr || meta->map.uid != reg->map_uid) { verbose(env, "timer pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n", meta->map.uid, reg->map_uid); return -EINVAL; } } meta->map.ptr = reg->map_ptr; meta->map.uid = reg->map_uid; break; case ARG_PTR_TO_MAP_KEY: /* bpf_map_xxx(..., map_ptr, ..., key) call: * check that [key, key + map->key_size) are within * stack limits and initialized */ if (!meta->map.ptr) { /* in function declaration map_ptr must come before * map_key, so that it's verified and known before * we have to check map_key here. Otherwise it means * that kernel subsystem misconfigured verifier */ verifier_bug(env, "invalid map_ptr to access map->key"); return -EFAULT; } key_size = meta->map.ptr->key_size; err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL); if (err) return err; if (can_elide_value_nullness(meta->map.ptr->map_type)) { err = get_constant_map_key(env, reg, key_size, &meta->const_map_key); if (err < 0) { meta->const_map_key = -1; if (err == -EOPNOTSUPP) err = 0; else return err; } } break; case ARG_PTR_TO_MAP_VALUE: if (type_may_be_null(arg_type) && bpf_register_is_null(reg)) return 0; /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity */ if (!meta->map.ptr) { /* kernel subsystem misconfigured verifier */ verifier_bug(env, "invalid map_ptr to access map->value"); return -EFAULT; } meta->raw_mode = arg_type & MEM_UNINIT; err = check_helper_mem_access(env, regno, meta->map.ptr->value_size, arg_type & MEM_WRITE ? 
BPF_WRITE : BPF_READ, false, meta); break; case ARG_PTR_TO_PERCPU_BTF_ID: if (!reg->btf_id) { verbose(env, "Helper has invalid btf_id in R%d\n", regno); return -EACCES; } meta->ret_btf = reg->btf; meta->ret_btf_id = reg->btf_id; break; case ARG_PTR_TO_SPIN_LOCK: if (in_rbtree_lock_required_cb(env)) { verbose(env, "can't spin_{lock,unlock} in rbtree cb\n"); return -EACCES; } if (meta->func_id == BPF_FUNC_spin_lock) { err = process_spin_lock(env, regno, PROCESS_SPIN_LOCK); if (err) return err; } else if (meta->func_id == BPF_FUNC_spin_unlock) { err = process_spin_lock(env, regno, 0); if (err) return err; } else { verifier_bug(env, "spin lock arg on unexpected helper"); return -EFAULT; } break; case ARG_PTR_TO_TIMER: err = process_timer_helper(env, regno, meta); if (err) return err; break; case ARG_PTR_TO_FUNC: meta->subprogno = reg->subprogno; break; case ARG_PTR_TO_MEM: /* The access to this pointer is only checked when we hit the * next is_mem_size argument below. */ meta->raw_mode = arg_type & MEM_UNINIT; if (arg_type & MEM_FIXED_SIZE) { err = check_helper_mem_access(env, regno, fn->arg_size[arg], arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ, false, meta); if (err) return err; if (arg_type & MEM_ALIGNED) err = check_ptr_alignment(env, reg, 0, fn->arg_size[arg], true); } break; case ARG_CONST_SIZE: err = check_mem_size_reg(env, reg, regno, fn->arg_type[arg - 1] & MEM_WRITE ? BPF_WRITE : BPF_READ, false, meta); break; case ARG_CONST_SIZE_OR_ZERO: err = check_mem_size_reg(env, reg, regno, fn->arg_type[arg - 1] & MEM_WRITE ? BPF_WRITE : BPF_READ, true, meta); break; case ARG_PTR_TO_DYNPTR: err = process_dynptr_func(env, regno, insn_idx, arg_type, 0); if (err) return err; break; case ARG_CONST_ALLOC_SIZE_OR_ZERO: if (!tnum_is_const(reg->var_off)) { verbose(env, "R%d is not a known constant'\n", regno); return -EACCES; } meta->mem_size = reg->var_off.value; err = mark_chain_precision(env, regno); if (err) return err; break; case ARG_PTR_TO_CONST_STR: { err = check_reg_const_str(env, reg, regno); if (err) return err; break; } case ARG_KPTR_XCHG_DEST: err = process_kptr_func(env, regno, meta); if (err) return err; break; } return err; } static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id) { enum bpf_attach_type eatype = env->prog->expected_attach_type; enum bpf_prog_type type = resolve_prog_type(env->prog); if (func_id != BPF_FUNC_map_update_elem && func_id != BPF_FUNC_map_delete_elem) return false; /* It's not possible to get access to a locked struct sock in these * contexts, so updating is safe. */ switch (type) { case BPF_PROG_TYPE_TRACING: if (eatype == BPF_TRACE_ITER) return true; break; case BPF_PROG_TYPE_SOCK_OPS: /* map_update allowed only via dedicated helpers with event type checks */ if (func_id == BPF_FUNC_map_delete_elem) return true; break; case BPF_PROG_TYPE_SOCKET_FILTER: case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: case BPF_PROG_TYPE_SK_REUSEPORT: case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_SK_LOOKUP: return true; default: break; } verbose(env, "cannot update sockmap in this context\n"); return false; } bool bpf_allow_tail_call_in_subprogs(struct bpf_verifier_env *env) { return env->prog->jit_requested && bpf_jit_supports_subprog_tailcalls(); } static int check_map_func_compatibility(struct bpf_verifier_env *env, struct bpf_map *map, int func_id) { if (!map) return 0; /* We need a two way check, first is from map perspective ... 
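 * (e.g., illustratively, a BPF_MAP_TYPE_PROG_ARRAY is only accepted by
 * bpf_tail_call() and bpf_tail_call() in turn only accepts a PROG_ARRAY;
 * passing, say, a ringbuf map to bpf_map_lookup_elem() fails below with
 * "cannot pass map_type ... into func ...")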
*/ switch (map->map_type) { case BPF_MAP_TYPE_PROG_ARRAY: if (func_id != BPF_FUNC_tail_call) goto error; break; case BPF_MAP_TYPE_PERF_EVENT_ARRAY: if (func_id != BPF_FUNC_perf_event_read && func_id != BPF_FUNC_perf_event_output && func_id != BPF_FUNC_skb_output && func_id != BPF_FUNC_perf_event_read_value && func_id != BPF_FUNC_xdp_output) goto error; break; case BPF_MAP_TYPE_RINGBUF: if (func_id != BPF_FUNC_ringbuf_output && func_id != BPF_FUNC_ringbuf_reserve && func_id != BPF_FUNC_ringbuf_query && func_id != BPF_FUNC_ringbuf_reserve_dynptr && func_id != BPF_FUNC_ringbuf_submit_dynptr && func_id != BPF_FUNC_ringbuf_discard_dynptr) goto error; break; case BPF_MAP_TYPE_USER_RINGBUF: if (func_id != BPF_FUNC_user_ringbuf_drain) goto error; break; case BPF_MAP_TYPE_STACK_TRACE: if (func_id != BPF_FUNC_get_stackid) goto error; break; case BPF_MAP_TYPE_CGROUP_ARRAY: if (func_id != BPF_FUNC_skb_under_cgroup && func_id != BPF_FUNC_current_task_under_cgroup) goto error; break; case BPF_MAP_TYPE_CGROUP_STORAGE: case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: if (func_id != BPF_FUNC_get_local_storage) goto error; break; case BPF_MAP_TYPE_DEVMAP: case BPF_MAP_TYPE_DEVMAP_HASH: if (func_id != BPF_FUNC_redirect_map && func_id != BPF_FUNC_map_lookup_elem) goto error; break; /* Restrict bpf side of cpumap and xskmap, open when use-cases * appear. */ case BPF_MAP_TYPE_CPUMAP: if (func_id != BPF_FUNC_redirect_map) goto error; break; case BPF_MAP_TYPE_XSKMAP: if (func_id != BPF_FUNC_redirect_map && func_id != BPF_FUNC_map_lookup_elem) goto error; break; case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: if (func_id != BPF_FUNC_map_lookup_elem) goto error; break; case BPF_MAP_TYPE_SOCKMAP: if (func_id != BPF_FUNC_sk_redirect_map && func_id != BPF_FUNC_sock_map_update && func_id != BPF_FUNC_msg_redirect_map && func_id != BPF_FUNC_sk_select_reuseport && func_id != BPF_FUNC_map_lookup_elem && !may_update_sockmap(env, func_id)) goto error; break; case BPF_MAP_TYPE_SOCKHASH: if (func_id != BPF_FUNC_sk_redirect_hash && func_id != BPF_FUNC_sock_hash_update && func_id != BPF_FUNC_msg_redirect_hash && func_id != BPF_FUNC_sk_select_reuseport && func_id != BPF_FUNC_map_lookup_elem && !may_update_sockmap(env, func_id)) goto error; break; case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: if (func_id != BPF_FUNC_sk_select_reuseport) goto error; break; case BPF_MAP_TYPE_QUEUE: case BPF_MAP_TYPE_STACK: if (func_id != BPF_FUNC_map_peek_elem && func_id != BPF_FUNC_map_pop_elem && func_id != BPF_FUNC_map_push_elem) goto error; break; case BPF_MAP_TYPE_SK_STORAGE: if (func_id != BPF_FUNC_sk_storage_get && func_id != BPF_FUNC_sk_storage_delete && func_id != BPF_FUNC_kptr_xchg) goto error; break; case BPF_MAP_TYPE_INODE_STORAGE: if (func_id != BPF_FUNC_inode_storage_get && func_id != BPF_FUNC_inode_storage_delete && func_id != BPF_FUNC_kptr_xchg) goto error; break; case BPF_MAP_TYPE_TASK_STORAGE: if (func_id != BPF_FUNC_task_storage_get && func_id != BPF_FUNC_task_storage_delete && func_id != BPF_FUNC_kptr_xchg) goto error; break; case BPF_MAP_TYPE_CGRP_STORAGE: if (func_id != BPF_FUNC_cgrp_storage_get && func_id != BPF_FUNC_cgrp_storage_delete && func_id != BPF_FUNC_kptr_xchg) goto error; break; case BPF_MAP_TYPE_BLOOM_FILTER: if (func_id != BPF_FUNC_map_peek_elem && func_id != BPF_FUNC_map_push_elem) goto error; break; case BPF_MAP_TYPE_INSN_ARRAY: goto error; default: break; } /* ... and second from the function itself. 
*/ switch (func_id) { case BPF_FUNC_tail_call: if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) goto error; if (env->subprog_cnt > 1 && !bpf_allow_tail_call_in_subprogs(env)) { verbose(env, "mixing of tail_calls and bpf-to-bpf calls is not supported\n"); return -EINVAL; } break; case BPF_FUNC_perf_event_read: case BPF_FUNC_perf_event_output: case BPF_FUNC_perf_event_read_value: case BPF_FUNC_skb_output: case BPF_FUNC_xdp_output: if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) goto error; break; case BPF_FUNC_ringbuf_output: case BPF_FUNC_ringbuf_reserve: case BPF_FUNC_ringbuf_query: case BPF_FUNC_ringbuf_reserve_dynptr: case BPF_FUNC_ringbuf_submit_dynptr: case BPF_FUNC_ringbuf_discard_dynptr: if (map->map_type != BPF_MAP_TYPE_RINGBUF) goto error; break; case BPF_FUNC_user_ringbuf_drain: if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF) goto error; break; case BPF_FUNC_get_stackid: if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) goto error; break; case BPF_FUNC_current_task_under_cgroup: case BPF_FUNC_skb_under_cgroup: if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) goto error; break; case BPF_FUNC_redirect_map: if (map->map_type != BPF_MAP_TYPE_DEVMAP && map->map_type != BPF_MAP_TYPE_DEVMAP_HASH && map->map_type != BPF_MAP_TYPE_CPUMAP && map->map_type != BPF_MAP_TYPE_XSKMAP) goto error; break; case BPF_FUNC_sk_redirect_map: case BPF_FUNC_msg_redirect_map: case BPF_FUNC_sock_map_update: if (map->map_type != BPF_MAP_TYPE_SOCKMAP) goto error; break; case BPF_FUNC_sk_redirect_hash: case BPF_FUNC_msg_redirect_hash: case BPF_FUNC_sock_hash_update: if (map->map_type != BPF_MAP_TYPE_SOCKHASH) goto error; break; case BPF_FUNC_get_local_storage: if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && map->map_type != BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) goto error; break; case BPF_FUNC_sk_select_reuseport: if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY && map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) goto error; break; case BPF_FUNC_map_pop_elem: if (map->map_type != BPF_MAP_TYPE_QUEUE && map->map_type != BPF_MAP_TYPE_STACK) goto error; break; case BPF_FUNC_map_peek_elem: case BPF_FUNC_map_push_elem: if (map->map_type != BPF_MAP_TYPE_QUEUE && map->map_type != BPF_MAP_TYPE_STACK && map->map_type != BPF_MAP_TYPE_BLOOM_FILTER) goto error; break; case BPF_FUNC_map_lookup_percpu_elem: if (map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY && map->map_type != BPF_MAP_TYPE_PERCPU_HASH && map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH) goto error; break; case BPF_FUNC_sk_storage_get: case BPF_FUNC_sk_storage_delete: if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) goto error; break; case BPF_FUNC_inode_storage_get: case BPF_FUNC_inode_storage_delete: if (map->map_type != BPF_MAP_TYPE_INODE_STORAGE) goto error; break; case BPF_FUNC_task_storage_get: case BPF_FUNC_task_storage_delete: if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) goto error; break; case BPF_FUNC_cgrp_storage_get: case BPF_FUNC_cgrp_storage_delete: if (map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) goto error; break; default: break; } return 0; error: verbose(env, "cannot pass map_type %d into func %s#%d\n", map->map_type, func_id_name(func_id), func_id); return -EINVAL; } static bool check_raw_mode_ok(const struct bpf_func_proto *fn) { int count = 0; if (arg_type_is_raw_mem(fn->arg1_type)) count++; if (arg_type_is_raw_mem(fn->arg2_type)) count++; if (arg_type_is_raw_mem(fn->arg3_type)) count++; if (arg_type_is_raw_mem(fn->arg4_type)) count++; if (arg_type_is_raw_mem(fn->arg5_type)) count++; /* We only support one arg 
being in raw mode at the moment, * which is sufficient for the helper functions we have * right now. */ return count <= 1; } static bool check_args_pair_invalid(const struct bpf_func_proto *fn, int arg) { bool is_fixed = fn->arg_type[arg] & MEM_FIXED_SIZE; bool has_size = fn->arg_size[arg] != 0; bool is_next_size = false; if (arg + 1 < ARRAY_SIZE(fn->arg_type)) is_next_size = arg_type_is_mem_size(fn->arg_type[arg + 1]); if (base_type(fn->arg_type[arg]) != ARG_PTR_TO_MEM) return is_next_size; return has_size == is_next_size || is_next_size == is_fixed; } static bool check_arg_pair_ok(const struct bpf_func_proto *fn) { /* bpf_xxx(..., buf, len) call will access 'len' * bytes from memory 'buf'. Both arg types need * to be paired, so make sure there's no buggy * helper function specification. */ if (arg_type_is_mem_size(fn->arg1_type) || check_args_pair_invalid(fn, 0) || check_args_pair_invalid(fn, 1) || check_args_pair_invalid(fn, 2) || check_args_pair_invalid(fn, 3) || check_args_pair_invalid(fn, 4)) return false; return true; } static bool check_btf_id_ok(const struct bpf_func_proto *fn) { int i; for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) { if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID) return !!fn->arg_btf_id[i]; if (base_type(fn->arg_type[i]) == ARG_PTR_TO_SPIN_LOCK) return fn->arg_btf_id[i] == BPF_PTR_POISON; if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i] && /* arg_btf_id and arg_size are in a union. */ (base_type(fn->arg_type[i]) != ARG_PTR_TO_MEM || !(fn->arg_type[i] & MEM_FIXED_SIZE))) return false; } return true; } static bool check_mem_arg_rw_flag_ok(const struct bpf_func_proto *fn) { int i; for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) { enum bpf_arg_type arg_type = fn->arg_type[i]; if (base_type(arg_type) != ARG_PTR_TO_MEM) continue; if (!(arg_type & (MEM_WRITE | MEM_RDONLY))) return false; } return true; } static int check_func_proto(const struct bpf_func_proto *fn) { return check_raw_mode_ok(fn) && check_arg_pair_ok(fn) && check_mem_arg_rw_flag_ok(fn) && check_btf_id_ok(fn) ? 0 : -EINVAL; } /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] * are now invalid, so turn them into unknown SCALAR_VALUE. * * This also applies to dynptr slices belonging to skb and xdp dynptrs, * since these slices point to packet data. */ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { struct bpf_func_state *state; struct bpf_reg_state *reg; bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ if (reg_is_pkt_pointer_any(reg) || reg_is_dynptr_slice_pkt(reg)) mark_reg_invalid(env, reg); })); } enum { AT_PKT_END = -1, BEYOND_PKT_END = -2, }; static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range_open) { struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regn]; if (reg->type != PTR_TO_PACKET) /* PTR_TO_PACKET_META is not supported yet */ return; /* The 'reg' is pkt > pkt_end or pkt >= pkt_end. * How far beyond pkt_end it goes is unknown. 
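 * A typical program-side shape that produces such a comparison
 * (illustrative):
 *   if (data + sizeof(struct ethhdr) > data_end)
 *           return XDP_DROP;
 * where 'data + sizeof(struct ethhdr)' is the PTR_TO_PACKET register
 * being marked here.
 *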
* if (!range_open) it's the case of pkt >= pkt_end * if (range_open) it's the case of pkt > pkt_end * hence this pointer is at least 1 byte bigger than pkt_end */ if (range_open) reg->range = BEYOND_PKT_END; else reg->range = AT_PKT_END; } static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id) { int i; for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].type != REF_TYPE_PTR) continue; if (state->refs[i].id == ref_obj_id) { release_reference_state(state, i); return 0; } } return -EINVAL; } /* The pointer with the specified id has released its reference to kernel * resources. Identify all copies of the same pointer and clear the reference. * * This is the release function corresponding to acquire_reference(). Idempotent. */ static int release_reference(struct bpf_verifier_env *env, int ref_obj_id) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state; struct bpf_reg_state *reg; int err; err = release_reference_nomark(vstate, ref_obj_id); if (err) return err; bpf_for_each_reg_in_vstate(vstate, state, reg, ({ if (reg->ref_obj_id == ref_obj_id) mark_reg_invalid(env, reg); })); return 0; } static void invalidate_non_owning_refs(struct bpf_verifier_env *env) { struct bpf_func_state *unused; struct bpf_reg_state *reg; bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({ if (type_is_non_owning_ref(reg->type)) mark_reg_invalid(env, reg); })); } static void clear_caller_saved_regs(struct bpf_verifier_env *env, struct bpf_reg_state *regs) { int i; /* after the call registers r0 - r5 were scratched */ for (i = 0; i < CALLER_SAVED_REGS; i++) { bpf_mark_reg_not_init(env, ®s[caller_saved[i]]); __check_reg_arg(env, regs, caller_saved[i], DST_OP_NO_MARK); } } typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx); static int set_callee_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx); static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int callsite, set_callee_state_fn set_callee_state_cb, struct bpf_verifier_state *state) { struct bpf_func_state *caller, *callee; int err; if (state->curframe + 1 >= MAX_CALL_FRAMES) { verbose(env, "the call stack of %d frames is too deep\n", state->curframe + 2); return -E2BIG; } if (state->frame[state->curframe + 1]) { verifier_bug(env, "Frame %d already allocated", state->curframe + 1); return -EFAULT; } caller = state->frame[state->curframe]; callee = kzalloc_obj(*callee, GFP_KERNEL_ACCOUNT); if (!callee) return -ENOMEM; state->frame[state->curframe + 1] = callee; /* callee cannot access r0, r6 - r9 for reading and has to write * into its own stack before reading from it. 
* callee can read/write into caller's stack */ init_func_state(env, callee, /* remember the callsite, it will be used by bpf_exit */ callsite, state->curframe + 1 /* frameno within this callchain */, subprog /* subprog number within this prog */); err = set_callee_state_cb(env, caller, callee, callsite); if (err) goto err_out; /* only increment it after check_reg_arg() finished */ state->curframe++; return 0; err_out: free_func_state(callee); state->frame[state->curframe + 1] = NULL; return err; } static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, const struct btf *btf, struct bpf_reg_state *regs) { struct bpf_subprog_info *sub = subprog_info(env, subprog); struct bpf_verifier_log *log = &env->log; u32 i; int ret; ret = btf_prepare_func_args(env, subprog); if (ret) return ret; /* check that BTF function arguments match actual types that the * verifier sees. */ for (i = 0; i < sub->arg_cnt; i++) { u32 regno = i + 1; struct bpf_reg_state *reg = ®s[regno]; struct bpf_subprog_arg_info *arg = &sub->args[i]; if (arg->arg_type == ARG_ANYTHING) { if (reg->type != SCALAR_VALUE) { bpf_log(log, "R%d is not a scalar\n", regno); return -EINVAL; } } else if (arg->arg_type & PTR_UNTRUSTED) { /* * Anything is allowed for untrusted arguments, as these are * read-only and probe read instructions would protect against * invalid memory access. */ } else if (arg->arg_type == ARG_PTR_TO_CTX) { ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_CTX); if (ret < 0) return ret; /* If function expects ctx type in BTF check that caller * is passing PTR_TO_CTX. */ if (reg->type != PTR_TO_CTX) { bpf_log(log, "arg#%d expects pointer to ctx\n", i); return -EINVAL; } } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) { ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE); if (ret < 0) return ret; if (check_mem_reg(env, reg, regno, arg->mem_size)) return -EINVAL; if (!(arg->arg_type & PTR_MAYBE_NULL) && (reg->type & PTR_MAYBE_NULL)) { bpf_log(log, "arg#%d is expected to be non-NULL\n", i); return -EINVAL; } } else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) { /* * Can pass any value and the kernel won't crash, but * only PTR_TO_ARENA or SCALAR make sense. Everything * else is a bug in the bpf program. Point it out to * the user at the verification time instead of * run-time debug nightmare. */ if (reg->type != PTR_TO_ARENA && reg->type != SCALAR_VALUE) { bpf_log(log, "R%d is not a pointer to arena or scalar.\n", regno); return -EINVAL; } } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) { ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_DYNPTR); if (ret) return ret; ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0); if (ret) return ret; } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { struct bpf_call_arg_meta meta; int err; if (bpf_register_is_null(reg) && type_may_be_null(arg->arg_type)) continue; memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */ err = check_reg_type(env, regno, arg->arg_type, &arg->btf_id, &meta); err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type); if (err) return err; } else { verifier_bug(env, "unrecognized arg#%d type %d", i, arg->arg_type); return -EFAULT; } } return 0; } /* Compare BTF of a function call with given bpf_reg_state. * Returns: * EFAULT - there is a verifier bug. Abort verification. * EINVAL - there is a type mismatch or BTF is not available. * 0 - BTF matches with what bpf_reg_state expects. * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. 
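 *
 * Illustrative (hypothetical) subprog whose call sites are checked here:
 *   __noinline int handle_pkt(struct xdp_md *ctx, int verdict);
 * At each call the verifier expects PTR_TO_CTX in R1 and a scalar in R2;
 * on a mismatch the subprog's BTF info is marked unreliable.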
*/ static int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs) { struct bpf_prog *prog = env->prog; struct btf *btf = prog->aux->btf; u32 btf_id; int err; if (!prog->aux->func_info) return -EINVAL; btf_id = prog->aux->func_info[subprog].type_id; if (!btf_id) return -EFAULT; if (prog->aux->func_info_aux[subprog].unreliable) return -EINVAL; err = btf_check_func_arg_match(env, subprog, btf, regs); /* Compiler optimizations can remove arguments from static functions * or mismatched type can be passed into a global function. * In such cases mark the function as unreliable from BTF point of view. */ if (err) prog->aux->func_info_aux[subprog].unreliable = true; return err; } static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int insn_idx, int subprog, set_callee_state_fn set_callee_state_cb) { struct bpf_verifier_state *state = env->cur_state, *callback_state; struct bpf_func_state *caller, *callee; int err; caller = state->frame[state->curframe]; err = btf_check_subprog_call(env, subprog, caller->regs); if (err == -EFAULT) return err; /* set_callee_state is used for direct subprog calls, but we are * interested in validating only BPF helpers that can call subprogs as * callbacks */ env->subprog_info[subprog].is_cb = true; if (bpf_pseudo_kfunc_call(insn) && !is_callback_calling_kfunc(insn->imm)) { verifier_bug(env, "kfunc %s#%d not marked as callback-calling", func_id_name(insn->imm), insn->imm); return -EFAULT; } else if (!bpf_pseudo_kfunc_call(insn) && !is_callback_calling_function(insn->imm)) { /* helper */ verifier_bug(env, "helper %s#%d not marked as callback-calling", func_id_name(insn->imm), insn->imm); return -EFAULT; } if (bpf_is_async_callback_calling_insn(insn)) { struct bpf_verifier_state *async_cb; /* there is no real recursion here. timer and workqueue callbacks are async */ env->subprog_info[subprog].is_async_cb = true; async_cb = push_async_cb(env, env->subprog_info[subprog].start, insn_idx, subprog, is_async_cb_sleepable(env, insn)); if (IS_ERR(async_cb)) return PTR_ERR(async_cb); callee = async_cb->frame[0]; callee->async_entry_cnt = caller->async_entry_cnt + 1; /* Convert bpf_timer_set_callback() args into timer callback args */ err = set_callee_state_cb(env, caller, callee, insn_idx); if (err) return err; return 0; } /* for callback functions enqueue entry to callback and * proceed with next instruction within current frame. 
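 *
 * e.g. for bpf_loop(16, cb, &ctx, 0) (illustrative) a separate state is
 * queued that begins at cb's first instruction, while the current state
 * continues at the instruction following the helper call.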
*/ callback_state = push_stack(env, env->subprog_info[subprog].start, insn_idx, false); if (IS_ERR(callback_state)) return PTR_ERR(callback_state); err = setup_func_entry(env, subprog, insn_idx, set_callee_state_cb, callback_state); if (err) return err; callback_state->callback_unroll_depth++; callback_state->frame[callback_state->curframe - 1]->callback_depth++; caller->callback_depth = 0; return 0; } static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; struct bpf_func_state *caller; int err, subprog, target_insn; target_insn = *insn_idx + insn->imm + 1; subprog = bpf_find_subprog(env, target_insn); if (verifier_bug_if(subprog < 0, env, "target of func call at insn %d is not a program", target_insn)) return -EFAULT; caller = state->frame[state->curframe]; err = btf_check_subprog_call(env, subprog, caller->regs); if (err == -EFAULT) return err; if (bpf_subprog_is_global(env, subprog)) { const char *sub_name = subprog_name(env, subprog); if (env->cur_state->active_locks) { verbose(env, "global function calls are not allowed while holding a lock,\n" "use static function instead\n"); return -EINVAL; } if (env->subprog_info[subprog].might_sleep && !in_sleepable_context(env)) { verbose(env, "sleepable global function %s() called in %s\n", sub_name, non_sleepable_context_description(env)); return -EINVAL; } if (err) { verbose(env, "Caller passes invalid args into func#%d ('%s')\n", subprog, sub_name); return err; } if (env->log.level & BPF_LOG_LEVEL) verbose(env, "Func#%d ('%s') is global and assumed valid.\n", subprog, sub_name); if (env->subprog_info[subprog].changes_pkt_data) clear_all_pkt_pointers(env); /* mark global subprog for verifying after main prog */ subprog_aux(env, subprog)->called = true; clear_caller_saved_regs(env, caller->regs); /* All non-void global functions return a 64-bit SCALAR_VALUE. */ if (!subprog_returns_void(env, subprog)) { mark_reg_unknown(env, caller->regs, BPF_REG_0); caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; } /* continue with next insn after call */ return 0; } /* for regular function entry setup new frame and continue * from that frame. 
*/ err = setup_func_entry(env, subprog, *insn_idx, set_callee_state, state); if (err) return err; clear_caller_saved_regs(env, caller->regs); /* and go analyze first insn of the callee */ *insn_idx = env->subprog_info[subprog].start - 1; if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "caller:\n"); print_verifier_state(env, state, caller->frameno, true); verbose(env, "callee:\n"); print_verifier_state(env, state, state->curframe, true); } return 0; } int map_set_for_each_callback_args(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee) { /* bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, * void *callback_ctx, u64 flags); * callback_fn(struct bpf_map *map, void *key, void *value, * void *callback_ctx); */ callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1]; callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY; __mark_reg_known_zero(&callee->regs[BPF_REG_2]); callee->regs[BPF_REG_2].map_ptr = caller->regs[BPF_REG_1].map_ptr; callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE; __mark_reg_known_zero(&callee->regs[BPF_REG_3]); callee->regs[BPF_REG_3].map_ptr = caller->regs[BPF_REG_1].map_ptr; /* pointer to stack or null */ callee->regs[BPF_REG_4] = caller->regs[BPF_REG_3]; /* unused */ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]); return 0; } static int set_callee_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx) { int i; /* copy r1 - r5 args that callee can access. The copy includes parent * pointers, which connects us up to the liveness chain */ for (i = BPF_REG_1; i <= BPF_REG_5; i++) callee->regs[i] = caller->regs[i]; return 0; } static int set_map_elem_callback_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx) { struct bpf_insn_aux_data *insn_aux = &env->insn_aux_data[insn_idx]; struct bpf_map *map; int err; /* valid map_ptr and poison value does not matter */ map = insn_aux->map_ptr_state.map_ptr; if (!map->ops->map_set_for_each_callback_args || !map->ops->map_for_each_callback) { verbose(env, "callback function not allowed for map\n"); return -ENOTSUPP; } err = map->ops->map_set_for_each_callback_args(env, caller, callee); if (err) return err; callee->in_callback_fn = true; callee->callback_ret_range = retval_range(0, 1); return 0; } static int set_loop_callback_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx) { /* bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx, * u64 flags); * callback_fn(u64 index, void *callback_ctx); */ callee->regs[BPF_REG_1].type = SCALAR_VALUE; callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; /* unused */ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_3]); bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]); bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; callee->callback_ret_range = retval_range(0, 1); return 0; } static int set_timer_callback_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx) { struct bpf_map *map_ptr = caller->regs[BPF_REG_1].map_ptr; /* bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn); * callback_fn(struct bpf_map *map, void *key, void *value); */ callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP; __mark_reg_known_zero(&callee->regs[BPF_REG_1]); callee->regs[BPF_REG_1].map_ptr = map_ptr; callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY; 
__mark_reg_known_zero(&callee->regs[BPF_REG_2]); callee->regs[BPF_REG_2].map_ptr = map_ptr; callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE; __mark_reg_known_zero(&callee->regs[BPF_REG_3]); callee->regs[BPF_REG_3].map_ptr = map_ptr; /* unused */ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]); bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_async_callback_fn = true; callee->callback_ret_range = retval_range(0, 0); return 0; } static int set_find_vma_callback_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx) { /* bpf_find_vma(struct task_struct *task, u64 addr, * void *callback_fn, void *callback_ctx, u64 flags) * (callback_fn)(struct task_struct *task, * struct vm_area_struct *vma, void *callback_ctx); */ callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1]; callee->regs[BPF_REG_2].type = PTR_TO_BTF_ID; __mark_reg_known_zero(&callee->regs[BPF_REG_2]); callee->regs[BPF_REG_2].btf = btf_vmlinux; callee->regs[BPF_REG_2].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA]; /* pointer to stack or null */ callee->regs[BPF_REG_3] = caller->regs[BPF_REG_4]; /* unused */ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]); bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; callee->callback_ret_range = retval_range(0, 1); return 0; } static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx) { /* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void * callback_ctx, u64 flags); * callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx); */ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_0]); mark_dynptr_cb_reg(env, &callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL); callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; /* unused */ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_3]); bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]); bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; callee->callback_ret_range = retval_range(0, 1); return 0; } static int set_rbtree_add_callback_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx) { /* void bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, * bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b)); * * 'struct bpf_rb_node *node' arg to bpf_rbtree_add_impl is the same PTR_TO_BTF_ID w/ offset * that 'less' callback args will be receiving. 
However, 'node' arg was release_reference'd * by this point, so look at 'root' */ struct btf_field *field; field = reg_find_field_offset(&caller->regs[BPF_REG_1], caller->regs[BPF_REG_1].var_off.value, BPF_RB_ROOT); if (!field || !field->graph_root.value_btf_id) return -EFAULT; mark_reg_graph_node(callee->regs, BPF_REG_1, &field->graph_root); ref_set_non_owning(env, &callee->regs[BPF_REG_1]); mark_reg_graph_node(callee->regs, BPF_REG_2, &field->graph_root); ref_set_non_owning(env, &callee->regs[BPF_REG_2]); bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_3]); bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]); bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; callee->callback_ret_range = retval_range(0, 1); return 0; } static int set_task_work_schedule_callback_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx) { struct bpf_map *map_ptr = caller->regs[BPF_REG_3].map_ptr; /* * callback_fn(struct bpf_map *map, void *key, void *value); */ callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP; __mark_reg_known_zero(&callee->regs[BPF_REG_1]); callee->regs[BPF_REG_1].map_ptr = map_ptr; callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY; __mark_reg_known_zero(&callee->regs[BPF_REG_2]); callee->regs[BPF_REG_2].map_ptr = map_ptr; callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE; __mark_reg_known_zero(&callee->regs[BPF_REG_3]); callee->regs[BPF_REG_3].map_ptr = map_ptr; /* unused */ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]); bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_async_callback_fn = true; callee->callback_ret_range = retval_range(S32_MIN, S32_MAX); return 0; } static bool is_rbtree_lock_required_kfunc(u32 btf_id); /* Are we currently verifying the callback for a rbtree helper that must * be called with lock held? If so, no need to complain about unreleased * lock */ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env) { struct bpf_verifier_state *state = env->cur_state; struct bpf_insn *insn = env->prog->insnsi; struct bpf_func_state *callee; int kfunc_btf_id; if (!state->curframe) return false; callee = state->frame[state->curframe]; if (!callee->in_callback_fn) return false; kfunc_btf_id = insn[callee->callsite].imm; return is_rbtree_lock_required_kfunc(kfunc_btf_id); } static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg) { if (range.return_32bit) return range.minval <= reg->s32_min_value && reg->s32_max_value <= range.maxval; else return range.minval <= reg->smin_value && reg->smax_value <= range.maxval; } static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state, *prev_st; struct bpf_func_state *caller, *callee; struct bpf_reg_state *r0; bool in_callback_fn; int err; callee = state->frame[state->curframe]; r0 = &callee->regs[BPF_REG_0]; if (r0->type == PTR_TO_STACK) { /* technically it's ok to return caller's stack pointer * (or caller's caller's pointer) back to the caller, * since these pointers are valid. 
Only current stack * pointer will be invalid as soon as function exits, * but let's be conservative */ verbose(env, "cannot return stack pointer to the caller\n"); return -EINVAL; } caller = state->frame[state->curframe - 1]; if (callee->in_callback_fn) { if (r0->type != SCALAR_VALUE) { verbose(env, "R0 not a scalar value\n"); return -EACCES; } /* we are going to rely on register's precise value */ err = mark_chain_precision(env, BPF_REG_0); if (err) return err; /* enforce R0 return value range, and bpf_callback_t returns 64bit */ if (!retval_range_within(callee->callback_ret_range, r0)) { verbose_invalid_scalar(env, r0, callee->callback_ret_range, "At callback return", "R0"); return -EINVAL; } if (!bpf_calls_callback(env, callee->callsite)) { verifier_bug(env, "in callback at %d, callsite %d !calls_callback", *insn_idx, callee->callsite); return -EFAULT; } } else { /* return to the caller whatever r0 had in the callee */ caller->regs[BPF_REG_0] = *r0; } /* for callbacks like bpf_loop or bpf_for_each_map_elem go back to callsite, * there function call logic would reschedule callback visit. If iteration * converges is_state_visited() would prune that visit eventually. */ in_callback_fn = callee->in_callback_fn; if (in_callback_fn) *insn_idx = callee->callsite; else *insn_idx = callee->callsite + 1; if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "returning from callee:\n"); print_verifier_state(env, state, callee->frameno, true); verbose(env, "to caller at %d:\n", *insn_idx); print_verifier_state(env, state, caller->frameno, true); } /* clear everything in the callee. In case of exceptional exits using * bpf_throw, this will be done by copy_verifier_state for extra frames. */ free_func_state(callee); state->frame[state->curframe--] = NULL; /* for callbacks widen imprecise scalars to make programs like below verify: * * struct ctx { int i; } * void cb(int idx, struct ctx *ctx) { ctx->i++; ... } * ... * struct ctx = { .i = 0; } * bpf_loop(100, cb, &ctx, 0); * * This is similar to what is done in process_iter_next_call() for open * coded iterators. */ prev_st = in_callback_fn ? 
find_prev_entry(env, state, *insn_idx) : NULL; if (prev_st) { err = widen_imprecise_scalars(env, prev_st, state); if (err) return err; } return 0; } static int do_refine_retval_range(struct bpf_verifier_env *env, struct bpf_reg_state *regs, int ret_type, int func_id, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *ret_reg = ®s[BPF_REG_0]; if (ret_type != RET_INTEGER) return 0; switch (func_id) { case BPF_FUNC_get_stack: case BPF_FUNC_get_task_stack: case BPF_FUNC_probe_read_str: case BPF_FUNC_probe_read_kernel_str: case BPF_FUNC_probe_read_user_str: ret_reg->smax_value = meta->msize_max_value; ret_reg->s32_max_value = meta->msize_max_value; ret_reg->smin_value = -MAX_ERRNO; ret_reg->s32_min_value = -MAX_ERRNO; reg_bounds_sync(ret_reg); break; case BPF_FUNC_get_smp_processor_id: ret_reg->umax_value = nr_cpu_ids - 1; ret_reg->u32_max_value = nr_cpu_ids - 1; ret_reg->smax_value = nr_cpu_ids - 1; ret_reg->s32_max_value = nr_cpu_ids - 1; ret_reg->umin_value = 0; ret_reg->u32_min_value = 0; ret_reg->smin_value = 0; ret_reg->s32_min_value = 0; reg_bounds_sync(ret_reg); break; } return reg_bounds_sanity_check(env, ret_reg, "retval"); } static int record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, int func_id, int insn_idx) { struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; struct bpf_map *map = meta->map.ptr; if (func_id != BPF_FUNC_tail_call && func_id != BPF_FUNC_map_lookup_elem && func_id != BPF_FUNC_map_update_elem && func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_map_push_elem && func_id != BPF_FUNC_map_pop_elem && func_id != BPF_FUNC_map_peek_elem && func_id != BPF_FUNC_for_each_map_elem && func_id != BPF_FUNC_redirect_map && func_id != BPF_FUNC_map_lookup_percpu_elem) return 0; if (map == NULL) { verifier_bug(env, "expected map for helper call"); return -EFAULT; } /* In case of read-only, some additional restrictions * need to be applied in order to prevent altering the * state of the map from program side. 
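 *
 * e.g. (illustrative) for a map created with BPF_F_RDONLY_PROG, a call
 * such as bpf_map_update_elem(&m, &key, &val, BPF_ANY) from the program
 * is rejected below with "write into map forbidden", while lookups remain
 * allowed.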
*/ if ((map->map_flags & BPF_F_RDONLY_PROG) && (func_id == BPF_FUNC_map_delete_elem || func_id == BPF_FUNC_map_update_elem || func_id == BPF_FUNC_map_push_elem || func_id == BPF_FUNC_map_pop_elem)) { verbose(env, "write into map forbidden\n"); return -EACCES; } if (!aux->map_ptr_state.map_ptr) bpf_map_ptr_store(aux, meta->map.ptr, !meta->map.ptr->bypass_spec_v1, false); else if (aux->map_ptr_state.map_ptr != meta->map.ptr) bpf_map_ptr_store(aux, meta->map.ptr, !meta->map.ptr->bypass_spec_v1, true); return 0; } static int record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, int func_id, int insn_idx) { struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; struct bpf_reg_state *reg; struct bpf_map *map = meta->map.ptr; u64 val, max; int err; if (func_id != BPF_FUNC_tail_call) return 0; if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) { verbose(env, "expected prog array map for tail call"); return -EINVAL; } reg = reg_state(env, BPF_REG_3); val = reg->var_off.value; max = map->max_entries; if (!(is_reg_const(reg, false) && val < max)) { bpf_map_key_store(aux, BPF_MAP_KEY_POISON); return 0; } err = mark_chain_precision(env, BPF_REG_3); if (err) return err; if (bpf_map_key_unseen(aux)) bpf_map_key_store(aux, val); else if (!bpf_map_key_poisoned(aux) && bpf_map_key_immediate(aux) != val) bpf_map_key_store(aux, BPF_MAP_KEY_POISON); return 0; } static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exit) { struct bpf_verifier_state *state = env->cur_state; enum bpf_prog_type type = resolve_prog_type(env->prog); struct bpf_reg_state *reg = reg_state(env, BPF_REG_0); bool refs_lingering = false; int i; if (!exception_exit && cur_func(env)->frameno) return 0; for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].type != REF_TYPE_PTR) continue; /* Allow struct_ops programs to return a referenced kptr back to * kernel. Type checks are performed later in check_return_code. */ if (type == BPF_PROG_TYPE_STRUCT_OPS && !exception_exit && reg->ref_obj_id == state->refs[i].id) continue; verbose(env, "Unreleased reference id=%d alloc_insn=%d\n", state->refs[i].id, state->refs[i].insn_idx); refs_lingering = true; } return refs_lingering ? 
-EINVAL : 0; } static int check_resource_leak(struct bpf_verifier_env *env, bool exception_exit, bool check_lock, const char *prefix) { int err; if (check_lock && env->cur_state->active_locks) { verbose(env, "%s cannot be used inside bpf_spin_lock-ed region\n", prefix); return -EINVAL; } err = check_reference_leak(env, exception_exit); if (err) { verbose(env, "%s would lead to reference leak\n", prefix); return err; } if (check_lock && env->cur_state->active_irq_id) { verbose(env, "%s cannot be used inside bpf_local_irq_save-ed region\n", prefix); return -EINVAL; } if (check_lock && env->cur_state->active_rcu_locks) { verbose(env, "%s cannot be used inside bpf_rcu_read_lock-ed region\n", prefix); return -EINVAL; } if (check_lock && env->cur_state->active_preempt_locks) { verbose(env, "%s cannot be used inside bpf_preempt_disable-ed region\n", prefix); return -EINVAL; } return 0; } static int check_bpf_snprintf_call(struct bpf_verifier_env *env, struct bpf_reg_state *regs) { struct bpf_reg_state *fmt_reg = &regs[BPF_REG_3]; struct bpf_reg_state *data_len_reg = &regs[BPF_REG_5]; struct bpf_map *fmt_map = fmt_reg->map_ptr; struct bpf_bprintf_data data = {}; int err, fmt_map_off, num_args; u64 fmt_addr; char *fmt; /* data must be an array of u64 */ if (data_len_reg->var_off.value % 8) return -EINVAL; num_args = data_len_reg->var_off.value / 8; /* fmt being ARG_PTR_TO_CONST_STR guarantees that var_off is const * and map_direct_value_addr is set. */ fmt_map_off = fmt_reg->off + fmt_reg->var_off.value; err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr, fmt_map_off); if (err) { verbose(env, "failed to retrieve map value address\n"); return -EFAULT; } fmt = (char *)(long)fmt_addr + fmt_map_off; /* We are also guaranteed that fmt+fmt_map_off is NULL terminated, we * can focus on validating the format specifiers. */ err = bpf_bprintf_prepare(fmt, UINT_MAX, NULL, num_args, &data); if (err < 0) verbose(env, "Invalid format string\n"); return err; } static int check_get_func_ip(struct bpf_verifier_env *env) { enum bpf_prog_type type = resolve_prog_type(env->prog); int func_id = BPF_FUNC_get_func_ip; if (type == BPF_PROG_TYPE_TRACING) { if (!bpf_prog_has_trampoline(env->prog)) { verbose(env, "func %s#%d supported only for fentry/fexit/fsession/fmod_ret programs\n", func_id_name(func_id), func_id); return -ENOTSUPP; } return 0; } else if (type == BPF_PROG_TYPE_KPROBE) { return 0; } verbose(env, "func %s#%d not supported for program type %d\n", func_id_name(func_id), func_id, type); return -ENOTSUPP; } static struct bpf_insn_aux_data *cur_aux(const struct bpf_verifier_env *env) { return &env->insn_aux_data[env->insn_idx]; } static bool loop_flag_is_zero(struct bpf_verifier_env *env) { struct bpf_reg_state *reg = reg_state(env, BPF_REG_4); bool reg_is_null = bpf_register_is_null(reg); if (reg_is_null) mark_chain_precision(env, BPF_REG_4); return reg_is_null; } static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno) { struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state; if (!state->initialized) { state->initialized = 1; state->fit_for_inline = loop_flag_is_zero(env); state->callback_subprogno = subprogno; return; } if (!state->fit_for_inline) return; state->fit_for_inline = (loop_flag_is_zero(env) && state->callback_subprogno == subprogno); } /* Returns whether or not the given map type can potentially elide * lookup return value nullness check. This is possible if the key * is statically known.
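 *
 * Illustrative (assumed) program-side pattern that benefits:
 *   u32 key = 0;
 *   struct counter *c = bpf_map_lookup_elem(&array_map, &key);
 *   c->hits++;
 * For a BPF_MAP_TYPE_ARRAY with a constant key below max_entries the
 * lookup cannot return NULL, so the usual NULL check can be elided; hash
 * maps never qualify.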
*/ static bool can_elide_value_nullness(enum bpf_map_type type) { switch (type) { case BPF_MAP_TYPE_ARRAY: case BPF_MAP_TYPE_PERCPU_ARRAY: return true; default: return false; } } int bpf_get_helper_proto(struct bpf_verifier_env *env, int func_id, const struct bpf_func_proto **ptr) { if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) return -ERANGE; if (!env->ops->get_func_proto) return -EINVAL; *ptr = env->ops->get_func_proto(func_id, env->prog); return *ptr && (*ptr)->func ? 0 : -EINVAL; } /* Check if we're in a sleepable context. */ static inline bool in_sleepable_context(struct bpf_verifier_env *env) { return !env->cur_state->active_rcu_locks && !env->cur_state->active_preempt_locks && !env->cur_state->active_locks && !env->cur_state->active_irq_id && in_sleepable(env); } static const char *non_sleepable_context_description(struct bpf_verifier_env *env) { if (env->cur_state->active_rcu_locks) return "rcu_read_lock region"; if (env->cur_state->active_preempt_locks) return "non-preemptible region"; if (env->cur_state->active_irq_id) return "IRQ-disabled region"; if (env->cur_state->active_locks) return "lock region"; return "non-sleepable prog"; } static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); bool returns_cpu_specific_alloc_ptr = false; const struct bpf_func_proto *fn = NULL; enum bpf_return_type ret_type; enum bpf_type_flag ret_flag; struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; int insn_idx = *insn_idx_p; bool changes_data; int i, err, func_id; /* find function prototype */ func_id = insn->imm; err = bpf_get_helper_proto(env, insn->imm, &fn); if (err == -ERANGE) { verbose(env, "invalid func %s#%d\n", func_id_name(func_id), func_id); return -EINVAL; } if (err) { verbose(env, "program of this type cannot use helper %s#%d\n", func_id_name(func_id), func_id); return err; } /* eBPF programs must be GPL compatible to use GPL-ed functions */ if (!env->prog->gpl_compatible && fn->gpl_only) { verbose(env, "cannot call GPL-restricted function from non-GPL compatible program\n"); return -EINVAL; } if (fn->allowed && !fn->allowed(env->prog)) { verbose(env, "helper call is not allowed in probe\n"); return -EINVAL; } /* With LD_ABS/IND some JITs save/restore skb from r1. */ changes_data = bpf_helper_changes_pkt_data(func_id); if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) { verifier_bug(env, "func %s#%d: r1 != ctx", func_id_name(func_id), func_id); return -EFAULT; } memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; err = check_func_proto(fn); if (err) { verifier_bug(env, "incorrect func proto %s#%d", func_id_name(func_id), func_id); return err; } if (fn->might_sleep && !in_sleepable_context(env)) { verbose(env, "sleepable helper %s#%d in %s\n", func_id_name(func_id), func_id, non_sleepable_context_description(env)); return -EINVAL; } /* Track non-sleepable context for helpers. */ if (!in_sleepable_context(env)) env->insn_aux_data[insn_idx].non_sleepable = true; meta.func_id = func_id; /* check args */ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { err = check_func_arg(env, i, &meta, fn, insn_idx); if (err) return err; } err = record_func_map(env, &meta, func_id, insn_idx); if (err) return err; err = record_func_key(env, &meta, func_id, insn_idx); if (err) return err; /* Mark slots with STACK_MISC in case of raw mode, stack offset * is inferred from register state. 
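 *
 * e.g. (illustrative):
 *   char comm[16];
 *   bpf_get_current_comm(comm, sizeof(comm));
 * The buffer may be passed uninitialized (raw mode), so its bytes are
 * marked as written here rather than being required to be initialized
 * before the call.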
*/ for (i = 0; i < meta.access_size; i++) { err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, BPF_WRITE, -1, false, false); if (err) return err; } regs = cur_regs(env); if (meta.release_regno) { err = -EINVAL; if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) { err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]); } else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj_id) { u32 ref_obj_id = meta.ref_obj_id; bool in_rcu = in_rcu_cs(env); struct bpf_func_state *state; struct bpf_reg_state *reg; err = release_reference_nomark(env->cur_state, ref_obj_id); if (!err) { bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ if (reg->ref_obj_id == ref_obj_id) { if (in_rcu && (reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU)) { reg->ref_obj_id = 0; reg->type &= ~MEM_ALLOC; reg->type |= MEM_RCU; } else { mark_reg_invalid(env, reg); } } })); } } else if (meta.ref_obj_id) { err = release_reference(env, meta.ref_obj_id); } else if (bpf_register_is_null(&regs[meta.release_regno])) { /* meta.ref_obj_id can only be 0 if register that is meant to be * released is NULL, which must be > R0. */ err = 0; } if (err) { verbose(env, "func %s#%d reference has not been acquired before\n", func_id_name(func_id), func_id); return err; } } switch (func_id) { case BPF_FUNC_tail_call: err = check_resource_leak(env, false, true, "tail_call"); if (err) return err; break; case BPF_FUNC_get_local_storage: /* check that flags argument in get_local_storage(map, flags) is 0, * this is required because get_local_storage() can't return an error. */ if (!bpf_register_is_null(&regs[BPF_REG_2])) { verbose(env, "get_local_storage() doesn't support non-zero flags\n"); return -EINVAL; } break; case BPF_FUNC_for_each_map_elem: err = push_callback_call(env, insn, insn_idx, meta.subprogno, set_map_elem_callback_state); break; case BPF_FUNC_timer_set_callback: err = push_callback_call(env, insn, insn_idx, meta.subprogno, set_timer_callback_state); break; case BPF_FUNC_find_vma: err = push_callback_call(env, insn, insn_idx, meta.subprogno, set_find_vma_callback_state); break; case BPF_FUNC_snprintf: err = check_bpf_snprintf_call(env, regs); break; case BPF_FUNC_loop: update_loop_inline_state(env, meta.subprogno); /* Verifier relies on R1 value to determine if bpf_loop() iteration * is finished, thus mark it precise. */ err = mark_chain_precision(env, BPF_REG_1); if (err) return err; if (cur_func(env)->callback_depth < regs[BPF_REG_1].umax_value) { err = push_callback_call(env, insn, insn_idx, meta.subprogno, set_loop_callback_state); } else { cur_func(env)->callback_depth = 0; if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "frame%d bpf_loop iteration limit reached\n", env->cur_state->curframe); } break; case BPF_FUNC_dynptr_from_mem: if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) { verbose(env, "Unsupported reg type %s for bpf_dynptr_from_mem data\n", reg_type_str(env, regs[BPF_REG_1].type)); return -EACCES; } break; case BPF_FUNC_set_retval: if (prog_type == BPF_PROG_TYPE_LSM && env->prog->expected_attach_type == BPF_LSM_CGROUP) { if (!env->prog->aux->attach_func_proto->type) { /* Make sure programs that attach to void * hooks don't try to modify return value.
*/ verbose(env, "BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); return -EINVAL; } } break; case BPF_FUNC_dynptr_data: { struct bpf_reg_state *reg; int id, ref_obj_id; reg = get_dynptr_arg_reg(env, fn, regs); if (!reg) return -EFAULT; if (meta.dynptr_id) { verifier_bug(env, "meta.dynptr_id already set"); return -EFAULT; } if (meta.ref_obj_id) { verifier_bug(env, "meta.ref_obj_id already set"); return -EFAULT; } id = dynptr_id(env, reg); if (id < 0) { verifier_bug(env, "failed to obtain dynptr id"); return id; } ref_obj_id = dynptr_ref_obj_id(env, reg); if (ref_obj_id < 0) { verifier_bug(env, "failed to obtain dynptr ref_obj_id"); return ref_obj_id; } meta.dynptr_id = id; meta.ref_obj_id = ref_obj_id; break; } case BPF_FUNC_dynptr_write: { enum bpf_dynptr_type dynptr_type; struct bpf_reg_state *reg; reg = get_dynptr_arg_reg(env, fn, regs); if (!reg) return -EFAULT; dynptr_type = dynptr_get_type(env, reg); if (dynptr_type == BPF_DYNPTR_TYPE_INVALID) return -EFAULT; if (dynptr_type == BPF_DYNPTR_TYPE_SKB || dynptr_type == BPF_DYNPTR_TYPE_SKB_META) /* this will trigger clear_all_pkt_pointers(), which will * invalidate all dynptr slices associated with the skb */ changes_data = true; break; } case BPF_FUNC_per_cpu_ptr: case BPF_FUNC_this_cpu_ptr: { struct bpf_reg_state *reg = ®s[BPF_REG_1]; const struct btf_type *type; if (reg->type & MEM_RCU) { type = btf_type_by_id(reg->btf, reg->btf_id); if (!type || !btf_type_is_struct(type)) { verbose(env, "Helper has invalid btf/btf_id in R1\n"); return -EFAULT; } returns_cpu_specific_alloc_ptr = true; env->insn_aux_data[insn_idx].call_with_percpu_alloc_ptr = true; } break; } case BPF_FUNC_user_ringbuf_drain: err = push_callback_call(env, insn, insn_idx, meta.subprogno, set_user_ringbuf_callback_state); break; } if (err) return err; /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { bpf_mark_reg_not_init(env, ®s[caller_saved[i]]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } /* helper call returns 64-bit value. 
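 * so R0 is treated as a full 64-bit definition here and no zero-extension
 * needs to be inserted for it.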
*/ regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; /* update return register (already marked as written above) */ ret_type = fn->ret_type; ret_flag = type_flag(ret_type); switch (base_type(ret_type)) { case RET_INTEGER: /* sets type to SCALAR_VALUE */ mark_reg_unknown(env, regs, BPF_REG_0); break; case RET_VOID: regs[BPF_REG_0].type = NOT_INIT; break; case RET_PTR_TO_MAP_VALUE: /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); /* remember map_ptr, so that check_map_access() * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() */ if (meta.map.ptr == NULL) { verifier_bug(env, "unexpected null map_ptr"); return -EFAULT; } if (func_id == BPF_FUNC_map_lookup_elem && can_elide_value_nullness(meta.map.ptr->map_type) && meta.const_map_key >= 0 && meta.const_map_key < meta.map.ptr->max_entries) ret_flag &= ~PTR_MAYBE_NULL; regs[BPF_REG_0].map_ptr = meta.map.ptr; regs[BPF_REG_0].map_uid = meta.map.uid; regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; if (!type_may_be_null(ret_flag) && btf_record_has_field(meta.map.ptr->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) { regs[BPF_REG_0].id = ++env->id_gen; } break; case RET_PTR_TO_SOCKET: mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag; break; case RET_PTR_TO_SOCK_COMMON: mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag; break; case RET_PTR_TO_TCP_SOCK: mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag; break; case RET_PTR_TO_MEM: mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = meta.mem_size; break; case RET_PTR_TO_MEM_OR_BTF_ID: { const struct btf_type *t; mark_reg_known_zero(env, regs, BPF_REG_0); t = btf_type_skip_modifiers(meta.ret_btf, meta.ret_btf_id, NULL); if (!btf_type_is_struct(t)) { u32 tsize; const struct btf_type *ret; const char *tname; /* resolve the type size of ksym. */ ret = btf_resolve_size(meta.ret_btf, t, &tsize); if (IS_ERR(ret)) { tname = btf_name_by_offset(meta.ret_btf, t->name_off); verbose(env, "unable to resolve the size of type '%s': %ld\n", tname, PTR_ERR(ret)); return -EINVAL; } regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = tsize; } else { if (returns_cpu_specific_alloc_ptr) { regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC | MEM_RCU; } else { /* MEM_RDONLY may be carried from ret_flag, but it * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise * it will confuse the check of PTR_TO_BTF_ID in * check_mem_access(). 
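 * (bpf_per_cpu_ptr() is one such helper: its ret_type carries MEM_RDONLY,
 * which is only meaningful for the PTR_TO_MEM case above.)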
*/ ret_flag &= ~MEM_RDONLY; regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; } regs[BPF_REG_0].btf = meta.ret_btf; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } break; } case RET_PTR_TO_BTF_ID: { struct btf *ret_btf; int ret_btf_id; mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; if (func_id == BPF_FUNC_kptr_xchg) { ret_btf = meta.kptr_field->kptr.btf; ret_btf_id = meta.kptr_field->kptr.btf_id; if (!btf_is_kernel(ret_btf)) { regs[BPF_REG_0].type |= MEM_ALLOC; if (meta.kptr_field->type == BPF_KPTR_PERCPU) regs[BPF_REG_0].type |= MEM_PERCPU; } } else { if (fn->ret_btf_id == BPF_PTR_POISON) { verifier_bug(env, "func %s has non-overwritten BPF_PTR_POISON return type", func_id_name(func_id)); return -EFAULT; } ret_btf = btf_vmlinux; ret_btf_id = *fn->ret_btf_id; } if (ret_btf_id == 0) { verbose(env, "invalid return type %u of func %s#%d\n", base_type(ret_type), func_id_name(func_id), func_id); return -EINVAL; } regs[BPF_REG_0].btf = ret_btf; regs[BPF_REG_0].btf_id = ret_btf_id; break; } default: verbose(env, "unknown return type %u of func %s#%d\n", base_type(ret_type), func_id_name(func_id), func_id); return -EINVAL; } if (type_may_be_null(regs[BPF_REG_0].type)) regs[BPF_REG_0].id = ++env->id_gen; if (helper_multiple_ref_obj_use(func_id, meta.map.ptr)) { verifier_bug(env, "func %s#%d sets ref_obj_id more than once", func_id_name(func_id), func_id); return -EFAULT; } if (is_dynptr_ref_function(func_id)) regs[BPF_REG_0].dynptr_id = meta.dynptr_id; if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; } else if (is_acquire_function(func_id, meta.map.ptr)) { int id = acquire_reference(env, insn_idx); if (id < 0) return id; /* For mark_ptr_or_null_reg() */ regs[BPF_REG_0].id = id; /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = id; } err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta); if (err) return err; err = check_map_func_compatibility(env, meta.map.ptr, func_id); if (err) return err; if ((func_id == BPF_FUNC_get_stack || func_id == BPF_FUNC_get_task_stack) && !env->prog->has_callchain_buf) { const char *err_str; #ifdef CONFIG_PERF_EVENTS err = get_callchain_buffers(sysctl_perf_event_max_stack); err_str = "cannot get callchain buffer for func %s#%d\n"; #else err = -ENOTSUPP; err_str = "func %s#%d not supported without CONFIG_PERF_EVENTS\n"; #endif if (err) { verbose(env, err_str, func_id_name(func_id), func_id); return err; } env->prog->has_callchain_buf = true; } if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack) env->prog->call_get_stack = true; if (func_id == BPF_FUNC_get_func_ip) { if (check_get_func_ip(env)) return -ENOTSUPP; env->prog->call_get_func_ip = true; } if (func_id == BPF_FUNC_tail_call) { if (env->cur_state->curframe) { struct bpf_verifier_state *branch; mark_reg_scratched(env, BPF_REG_0); branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); if (IS_ERR(branch)) return PTR_ERR(branch); clear_all_pkt_pointers(env); mark_reg_unknown(env, regs, BPF_REG_0); err = prepare_func_exit(env, &env->insn_idx); if (err) return err; env->insn_idx--; } else { changes_data = false; } } if (changes_data) clear_all_pkt_pointers(env); return 0; } /* mark_btf_func_reg_size() is used when the reg size is determined by * the BTF func_proto's return value size and argument. 
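 * e.g. a kfunc with a u32 return value leaves R0 defined as a 32-bit
 * sub-register, while passing a u64 argument may force the earlier 32-bit
 * definition of that register to be zero-extended (mark_insn_zext()).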
*/ static void __mark_btf_func_reg_size(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno, size_t reg_size) { struct bpf_reg_state *reg = ®s[regno]; if (regno == BPF_REG_0) { /* Function return value */ reg->subreg_def = reg_size == sizeof(u64) ? DEF_NOT_SUBREG : env->insn_idx + 1; } else if (reg_size == sizeof(u64)) { /* Function argument */ mark_insn_zext(env, reg); } } static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, size_t reg_size) { return __mark_btf_func_reg_size(env, cur_regs(env), regno, reg_size); } static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_ACQUIRE; } static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_RELEASE; } static bool is_kfunc_destructive(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_DESTRUCTIVE; } static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_RCU; } static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_RCU_PROTECTED; } static bool is_kfunc_arg_mem_size(const struct btf *btf, const struct btf_param *arg, const struct bpf_reg_state *reg) { const struct btf_type *t; t = btf_type_skip_modifiers(btf, arg->type, NULL); if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE) return false; return btf_param_match_suffix(btf, arg, "__sz"); } static bool is_kfunc_arg_const_mem_size(const struct btf *btf, const struct btf_param *arg, const struct bpf_reg_state *reg) { const struct btf_type *t; t = btf_type_skip_modifiers(btf, arg->type, NULL); if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE) return false; return btf_param_match_suffix(btf, arg, "__szk"); } static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg) { return btf_param_match_suffix(btf, arg, "__k"); } static bool is_kfunc_arg_ignore(const struct btf *btf, const struct btf_param *arg) { return btf_param_match_suffix(btf, arg, "__ign"); } static bool is_kfunc_arg_map(const struct btf *btf, const struct btf_param *arg) { return btf_param_match_suffix(btf, arg, "__map"); } static bool is_kfunc_arg_alloc_obj(const struct btf *btf, const struct btf_param *arg) { return btf_param_match_suffix(btf, arg, "__alloc"); } static bool is_kfunc_arg_uninit(const struct btf *btf, const struct btf_param *arg) { return btf_param_match_suffix(btf, arg, "__uninit"); } static bool is_kfunc_arg_refcounted_kptr(const struct btf *btf, const struct btf_param *arg) { return btf_param_match_suffix(btf, arg, "__refcounted_kptr"); } static bool is_kfunc_arg_nullable(const struct btf *btf, const struct btf_param *arg) { return btf_param_match_suffix(btf, arg, "__nullable"); } static bool is_kfunc_arg_const_str(const struct btf *btf, const struct btf_param *arg) { return btf_param_match_suffix(btf, arg, "__str"); } static bool is_kfunc_arg_irq_flag(const struct btf *btf, const struct btf_param *arg) { return btf_param_match_suffix(btf, arg, "__irq_flag"); } static bool is_kfunc_arg_scalar_with_name(const struct btf *btf, const struct btf_param *arg, const char *name) { int len, target_len = strlen(name); const char *param_name; param_name = btf_name_by_offset(btf, arg->name_off); if (str_is_empty(param_name)) return false; len = strlen(param_name); if (len != target_len) return false; if (strcmp(param_name, name)) return false; return true; } enum { KF_ARG_DYNPTR_ID, KF_ARG_LIST_HEAD_ID, KF_ARG_LIST_NODE_ID, KF_ARG_RB_ROOT_ID, KF_ARG_RB_NODE_ID, 
KF_ARG_WORKQUEUE_ID, KF_ARG_RES_SPIN_LOCK_ID, KF_ARG_TASK_WORK_ID, KF_ARG_PROG_AUX_ID, KF_ARG_TIMER_ID }; BTF_ID_LIST(kf_arg_btf_ids) BTF_ID(struct, bpf_dynptr) BTF_ID(struct, bpf_list_head) BTF_ID(struct, bpf_list_node) BTF_ID(struct, bpf_rb_root) BTF_ID(struct, bpf_rb_node) BTF_ID(struct, bpf_wq) BTF_ID(struct, bpf_res_spin_lock) BTF_ID(struct, bpf_task_work) BTF_ID(struct, bpf_prog_aux) BTF_ID(struct, bpf_timer) static bool __is_kfunc_ptr_arg_type(const struct btf *btf, const struct btf_param *arg, int type) { const struct btf_type *t; u32 res_id; t = btf_type_skip_modifiers(btf, arg->type, NULL); if (!t) return false; if (!btf_type_is_ptr(t)) return false; t = btf_type_skip_modifiers(btf, t->type, &res_id); if (!t) return false; return btf_types_are_same(btf, res_id, btf_vmlinux, kf_arg_btf_ids[type]); } static bool is_kfunc_arg_dynptr(const struct btf *btf, const struct btf_param *arg) { return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_DYNPTR_ID); } static bool is_kfunc_arg_list_head(const struct btf *btf, const struct btf_param *arg) { return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_HEAD_ID); } static bool is_kfunc_arg_list_node(const struct btf *btf, const struct btf_param *arg) { return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_NODE_ID); } static bool is_kfunc_arg_rbtree_root(const struct btf *btf, const struct btf_param *arg) { return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_ROOT_ID); } static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_param *arg) { return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID); } static bool is_kfunc_arg_timer(const struct btf *btf, const struct btf_param *arg) { return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_TIMER_ID); } static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) { return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); } static bool is_kfunc_arg_task_work(const struct btf *btf, const struct btf_param *arg) { return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_TASK_WORK_ID); } static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_param *arg) { return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RES_SPIN_LOCK_ID); } static bool is_rbtree_node_type(const struct btf_type *t) { return t == btf_type_by_id(btf_vmlinux, kf_arg_btf_ids[KF_ARG_RB_NODE_ID]); } static bool is_list_node_type(const struct btf_type *t) { return t == btf_type_by_id(btf_vmlinux, kf_arg_btf_ids[KF_ARG_LIST_NODE_ID]); } static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf, const struct btf_param *arg) { const struct btf_type *t; t = btf_type_resolve_func_ptr(btf, arg->type, NULL); if (!t) return false; return true; } static bool is_kfunc_arg_prog_aux(const struct btf *btf, const struct btf_param *arg) { return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_PROG_AUX_ID); } /* * A kfunc with KF_IMPLICIT_ARGS has two prototypes in BTF: * - the _impl prototype with full arg list (meta->func_proto) * - the BPF API prototype w/o implicit args (func->type in BTF) * To determine whether an argument is implicit, we compare its position * against the number of arguments in the prototype w/o implicit args. 
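 * For instance (hypothetical kfunc, names for illustration only):
 *
 *   u64 bpf_foo(int x);                                 - prototype w/o implicit args
 *   u64 bpf_foo_impl(int x, struct bpf_prog_aux *aux);  - _impl prototype
 *
 * The prototype without implicit args has vlen == 1, so the argument at
 * index 1 (aux) is treated as implicit rather than supplied by the program.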
*/ static bool is_kfunc_arg_implicit(const struct bpf_kfunc_call_arg_meta *meta, u32 arg_idx) { const struct btf_type *func, *func_proto; u32 argn; if (!(meta->kfunc_flags & KF_IMPLICIT_ARGS)) return false; func = btf_type_by_id(meta->btf, meta->func_id); func_proto = btf_type_by_id(meta->btf, func->type); argn = btf_type_vlen(func_proto); return argn <= arg_idx; } /* Returns true if struct is composed of scalars, 4 levels of nesting allowed */ static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env, const struct btf *btf, const struct btf_type *t, int rec) { const struct btf_type *member_type; const struct btf_member *member; u32 i; if (!btf_type_is_struct(t)) return false; for_each_member(i, t, member) { const struct btf_array *array; member_type = btf_type_skip_modifiers(btf, member->type, NULL); if (btf_type_is_struct(member_type)) { if (rec >= 3) { verbose(env, "max struct nesting depth exceeded\n"); return false; } if (!__btf_type_is_scalar_struct(env, btf, member_type, rec + 1)) return false; continue; } if (btf_type_is_array(member_type)) { array = btf_array(member_type); if (!array->nelems) return false; member_type = btf_type_skip_modifiers(btf, array->type, NULL); if (!btf_type_is_scalar(member_type)) return false; continue; } if (!btf_type_is_scalar(member_type)) return false; } return true; } enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_CTX, KF_ARG_PTR_TO_ALLOC_BTF_ID, /* Allocated object */ KF_ARG_PTR_TO_REFCOUNTED_KPTR, /* Refcounted local kptr */ KF_ARG_PTR_TO_DYNPTR, KF_ARG_PTR_TO_ITER, KF_ARG_PTR_TO_LIST_HEAD, KF_ARG_PTR_TO_LIST_NODE, KF_ARG_PTR_TO_BTF_ID, /* Also covers reg2btf_ids conversions */ KF_ARG_PTR_TO_MEM, KF_ARG_PTR_TO_MEM_SIZE, /* Size derived from next argument, skip it */ KF_ARG_PTR_TO_CALLBACK, KF_ARG_PTR_TO_RB_ROOT, KF_ARG_PTR_TO_RB_NODE, KF_ARG_PTR_TO_NULL, KF_ARG_PTR_TO_CONST_STR, KF_ARG_PTR_TO_MAP, KF_ARG_PTR_TO_TIMER, KF_ARG_PTR_TO_WORKQUEUE, KF_ARG_PTR_TO_IRQ_FLAG, KF_ARG_PTR_TO_RES_SPIN_LOCK, KF_ARG_PTR_TO_TASK_WORK, }; enum special_kfunc_type { KF_bpf_obj_new_impl, KF_bpf_obj_new, KF_bpf_obj_drop_impl, KF_bpf_obj_drop, KF_bpf_refcount_acquire_impl, KF_bpf_refcount_acquire, KF_bpf_list_push_front_impl, KF_bpf_list_push_front, KF_bpf_list_push_back_impl, KF_bpf_list_push_back, KF_bpf_list_pop_front, KF_bpf_list_pop_back, KF_bpf_list_front, KF_bpf_list_back, KF_bpf_cast_to_kern_ctx, KF_bpf_rdonly_cast, KF_bpf_rcu_read_lock, KF_bpf_rcu_read_unlock, KF_bpf_rbtree_remove, KF_bpf_rbtree_add_impl, KF_bpf_rbtree_add, KF_bpf_rbtree_first, KF_bpf_rbtree_root, KF_bpf_rbtree_left, KF_bpf_rbtree_right, KF_bpf_dynptr_from_skb, KF_bpf_dynptr_from_xdp, KF_bpf_dynptr_from_skb_meta, KF_bpf_xdp_pull_data, KF_bpf_dynptr_slice, KF_bpf_dynptr_slice_rdwr, KF_bpf_dynptr_clone, KF_bpf_percpu_obj_new_impl, KF_bpf_percpu_obj_new, KF_bpf_percpu_obj_drop_impl, KF_bpf_percpu_obj_drop, KF_bpf_throw, KF_bpf_wq_set_callback, KF_bpf_preempt_disable, KF_bpf_preempt_enable, KF_bpf_iter_css_task_new, KF_bpf_session_cookie, KF_bpf_get_kmem_cache, KF_bpf_local_irq_save, KF_bpf_local_irq_restore, KF_bpf_iter_num_new, KF_bpf_iter_num_next, KF_bpf_iter_num_destroy, KF_bpf_set_dentry_xattr, KF_bpf_remove_dentry_xattr, KF_bpf_res_spin_lock, KF_bpf_res_spin_unlock, KF_bpf_res_spin_lock_irqsave, KF_bpf_res_spin_unlock_irqrestore, KF_bpf_dynptr_from_file, KF_bpf_dynptr_file_discard, KF___bpf_trap, KF_bpf_task_work_schedule_signal, KF_bpf_task_work_schedule_resume, KF_bpf_arena_alloc_pages, KF_bpf_arena_free_pages, KF_bpf_arena_reserve_pages, KF_bpf_session_is_return, 
KF_bpf_stream_vprintk, KF_bpf_stream_print_stack, }; BTF_ID_LIST(special_kfunc_list) BTF_ID(func, bpf_obj_new_impl) BTF_ID(func, bpf_obj_new) BTF_ID(func, bpf_obj_drop_impl) BTF_ID(func, bpf_obj_drop) BTF_ID(func, bpf_refcount_acquire_impl) BTF_ID(func, bpf_refcount_acquire) BTF_ID(func, bpf_list_push_front_impl) BTF_ID(func, bpf_list_push_front) BTF_ID(func, bpf_list_push_back_impl) BTF_ID(func, bpf_list_push_back) BTF_ID(func, bpf_list_pop_front) BTF_ID(func, bpf_list_pop_back) BTF_ID(func, bpf_list_front) BTF_ID(func, bpf_list_back) BTF_ID(func, bpf_cast_to_kern_ctx) BTF_ID(func, bpf_rdonly_cast) BTF_ID(func, bpf_rcu_read_lock) BTF_ID(func, bpf_rcu_read_unlock) BTF_ID(func, bpf_rbtree_remove) BTF_ID(func, bpf_rbtree_add_impl) BTF_ID(func, bpf_rbtree_add) BTF_ID(func, bpf_rbtree_first) BTF_ID(func, bpf_rbtree_root) BTF_ID(func, bpf_rbtree_left) BTF_ID(func, bpf_rbtree_right) #ifdef CONFIG_NET BTF_ID(func, bpf_dynptr_from_skb) BTF_ID(func, bpf_dynptr_from_xdp) BTF_ID(func, bpf_dynptr_from_skb_meta) BTF_ID(func, bpf_xdp_pull_data) #else BTF_ID_UNUSED BTF_ID_UNUSED BTF_ID_UNUSED BTF_ID_UNUSED #endif BTF_ID(func, bpf_dynptr_slice) BTF_ID(func, bpf_dynptr_slice_rdwr) BTF_ID(func, bpf_dynptr_clone) BTF_ID(func, bpf_percpu_obj_new_impl) BTF_ID(func, bpf_percpu_obj_new) BTF_ID(func, bpf_percpu_obj_drop_impl) BTF_ID(func, bpf_percpu_obj_drop) BTF_ID(func, bpf_throw) BTF_ID(func, bpf_wq_set_callback) BTF_ID(func, bpf_preempt_disable) BTF_ID(func, bpf_preempt_enable) #ifdef CONFIG_CGROUPS BTF_ID(func, bpf_iter_css_task_new) #else BTF_ID_UNUSED #endif #ifdef CONFIG_BPF_EVENTS BTF_ID(func, bpf_session_cookie) #else BTF_ID_UNUSED #endif BTF_ID(func, bpf_get_kmem_cache) BTF_ID(func, bpf_local_irq_save) BTF_ID(func, bpf_local_irq_restore) BTF_ID(func, bpf_iter_num_new) BTF_ID(func, bpf_iter_num_next) BTF_ID(func, bpf_iter_num_destroy) #ifdef CONFIG_BPF_LSM BTF_ID(func, bpf_set_dentry_xattr) BTF_ID(func, bpf_remove_dentry_xattr) #else BTF_ID_UNUSED BTF_ID_UNUSED #endif BTF_ID(func, bpf_res_spin_lock) BTF_ID(func, bpf_res_spin_unlock) BTF_ID(func, bpf_res_spin_lock_irqsave) BTF_ID(func, bpf_res_spin_unlock_irqrestore) BTF_ID(func, bpf_dynptr_from_file) BTF_ID(func, bpf_dynptr_file_discard) BTF_ID(func, __bpf_trap) BTF_ID(func, bpf_task_work_schedule_signal) BTF_ID(func, bpf_task_work_schedule_resume) BTF_ID(func, bpf_arena_alloc_pages) BTF_ID(func, bpf_arena_free_pages) BTF_ID(func, bpf_arena_reserve_pages) BTF_ID(func, bpf_session_is_return) BTF_ID(func, bpf_stream_vprintk) BTF_ID(func, bpf_stream_print_stack) static bool is_bpf_obj_new_kfunc(u32 func_id) { return func_id == special_kfunc_list[KF_bpf_obj_new] || func_id == special_kfunc_list[KF_bpf_obj_new_impl]; } static bool is_bpf_percpu_obj_new_kfunc(u32 func_id) { return func_id == special_kfunc_list[KF_bpf_percpu_obj_new] || func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]; } static bool is_bpf_obj_drop_kfunc(u32 func_id) { return func_id == special_kfunc_list[KF_bpf_obj_drop] || func_id == special_kfunc_list[KF_bpf_obj_drop_impl]; } static bool is_bpf_percpu_obj_drop_kfunc(u32 func_id) { return func_id == special_kfunc_list[KF_bpf_percpu_obj_drop] || func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl]; } static bool is_bpf_refcount_acquire_kfunc(u32 func_id) { return func_id == special_kfunc_list[KF_bpf_refcount_acquire] || func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]; } static bool is_bpf_list_push_kfunc(u32 func_id) { return func_id == special_kfunc_list[KF_bpf_list_push_front] || func_id == 
special_kfunc_list[KF_bpf_list_push_front_impl] || func_id == special_kfunc_list[KF_bpf_list_push_back] || func_id == special_kfunc_list[KF_bpf_list_push_back_impl]; } static bool is_bpf_rbtree_add_kfunc(u32 func_id) { return func_id == special_kfunc_list[KF_bpf_rbtree_add] || func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]; } static bool is_task_work_add_kfunc(u32 func_id) { return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] || func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume]; } static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { if (is_bpf_refcount_acquire_kfunc(meta->func_id) && meta->arg_owning_ref) return false; return meta->kfunc_flags & KF_RET_NULL; } static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta) { return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_lock]; } static bool is_kfunc_bpf_rcu_read_unlock(struct bpf_kfunc_call_arg_meta *meta) { return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_unlock]; } static bool is_kfunc_bpf_preempt_disable(struct bpf_kfunc_call_arg_meta *meta) { return meta->func_id == special_kfunc_list[KF_bpf_preempt_disable]; } static bool is_kfunc_bpf_preempt_enable(struct bpf_kfunc_call_arg_meta *meta) { return meta->func_id == special_kfunc_list[KF_bpf_preempt_enable]; } bool bpf_is_kfunc_pkt_changing(struct bpf_kfunc_call_arg_meta *meta) { return meta->func_id == special_kfunc_list[KF_bpf_xdp_pull_data]; } static enum kfunc_ptr_arg_type get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, const struct btf_type *t, const struct btf_type *ref_t, const char *ref_tname, const struct btf_param *args, int argno, int nargs) { u32 regno = argno + 1; struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = ®s[regno]; bool arg_mem_size = false; if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] || meta->func_id == special_kfunc_list[KF_bpf_session_is_return] || meta->func_id == special_kfunc_list[KF_bpf_session_cookie]) return KF_ARG_PTR_TO_CTX; if (argno + 1 < nargs && (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]) || is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]))) arg_mem_size = true; /* In this function, we verify the kfunc's BTF as per the argument type, * leaving the rest of the verification with respect to the register * type to our caller. When a set of conditions hold in the BTF type of * arguments, we resolve it to a known kfunc_ptr_arg_type. 
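 * For example, a parameter pair declared as "void *data, u32 data__sz"
 * resolves data to KF_ARG_PTR_TO_MEM_SIZE, while a "struct bpf_dynptr *"
 * parameter resolves to KF_ARG_PTR_TO_DYNPTR.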
*/ if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno)) return KF_ARG_PTR_TO_CTX; if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && bpf_register_is_null(reg) && !arg_mem_size) return KF_ARG_PTR_TO_NULL; if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno])) return KF_ARG_PTR_TO_ALLOC_BTF_ID; if (is_kfunc_arg_refcounted_kptr(meta->btf, &args[argno])) return KF_ARG_PTR_TO_REFCOUNTED_KPTR; if (is_kfunc_arg_dynptr(meta->btf, &args[argno])) return KF_ARG_PTR_TO_DYNPTR; if (is_kfunc_arg_iter(meta, argno, &args[argno])) return KF_ARG_PTR_TO_ITER; if (is_kfunc_arg_list_head(meta->btf, &args[argno])) return KF_ARG_PTR_TO_LIST_HEAD; if (is_kfunc_arg_list_node(meta->btf, &args[argno])) return KF_ARG_PTR_TO_LIST_NODE; if (is_kfunc_arg_rbtree_root(meta->btf, &args[argno])) return KF_ARG_PTR_TO_RB_ROOT; if (is_kfunc_arg_rbtree_node(meta->btf, &args[argno])) return KF_ARG_PTR_TO_RB_NODE; if (is_kfunc_arg_const_str(meta->btf, &args[argno])) return KF_ARG_PTR_TO_CONST_STR; if (is_kfunc_arg_map(meta->btf, &args[argno])) return KF_ARG_PTR_TO_MAP; if (is_kfunc_arg_wq(meta->btf, &args[argno])) return KF_ARG_PTR_TO_WORKQUEUE; if (is_kfunc_arg_timer(meta->btf, &args[argno])) return KF_ARG_PTR_TO_TIMER; if (is_kfunc_arg_task_work(meta->btf, &args[argno])) return KF_ARG_PTR_TO_TASK_WORK; if (is_kfunc_arg_irq_flag(meta->btf, &args[argno])) return KF_ARG_PTR_TO_IRQ_FLAG; if (is_kfunc_arg_res_spin_lock(meta->btf, &args[argno])) return KF_ARG_PTR_TO_RES_SPIN_LOCK; if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { if (!btf_type_is_struct(ref_t)) { verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", meta->func_name, argno, btf_type_str(ref_t), ref_tname); return -EINVAL; } return KF_ARG_PTR_TO_BTF_ID; } if (is_kfunc_arg_callback(env, meta->btf, &args[argno])) return KF_ARG_PTR_TO_CALLBACK; /* This is the catch all argument type of register types supported by * check_helper_mem_access. However, we only allow when argument type is * pointer to scalar, or struct composed (recursively) of scalars. When * arg_mem_size is true, the pointer can be void *. */ if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(env, meta->btf, ref_t, 0) && (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) { verbose(env, "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n", argno, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : ""); return -EINVAL; } return arg_mem_size ? KF_ARG_PTR_TO_MEM_SIZE : KF_ARG_PTR_TO_MEM; } static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, const struct btf_type *ref_t, const char *ref_tname, u32 ref_id, struct bpf_kfunc_call_arg_meta *meta, int argno) { const struct btf_type *reg_ref_t; bool strict_type_match = false; const struct btf *reg_btf; const char *reg_ref_tname; bool taking_projection; bool struct_same; u32 reg_ref_id; if (base_type(reg->type) == PTR_TO_BTF_ID) { reg_btf = reg->btf; reg_ref_id = reg->btf_id; } else { reg_btf = btf_vmlinux; reg_ref_id = *reg2btf_ids[base_type(reg->type)]; } /* Enforce strict type matching for calls to kfuncs that are acquiring * or releasing a reference, or are no-cast aliases. We do _not_ * enforce strict matching for kfuncs by default, * as we want to enable BPF programs to pass types that are bitwise * equivalent without forcing them to explicitly cast with something * like bpf_cast_to_kern_ctx(). 
* * For example, say we had a type like the following: * * struct bpf_cpumask { * cpumask_t cpumask; * refcount_t usage; * }; * * Note that as specified in <linux/cpumask.h>, cpumask_t is typedef'ed * to a struct cpumask, so it would be safe to pass a struct * bpf_cpumask * to a kfunc expecting a struct cpumask *. * * The philosophy here is similar to how we allow scalars of different * types to be passed to kfuncs as long as the size is the same. The * only difference here is that we're simply allowing * btf_struct_ids_match() to walk the struct at the 0th offset, and * resolve types. */ if ((is_kfunc_release(meta) && reg->ref_obj_id) || btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id)) strict_type_match = true; WARN_ON_ONCE(is_kfunc_release(meta) && !tnum_is_const(reg->var_off)); reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, ®_ref_id); reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off); struct_same = btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->var_off.value, meta->btf, ref_id, strict_type_match); /* If kfunc is accepting a projection type (ie. __sk_buff), it cannot * actually use it -- it must cast to the underlying type. So we allow * caller to pass in the underlying type. */ taking_projection = btf_is_projection_of(ref_tname, reg_ref_tname); if (!taking_projection && !struct_same) { verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n", meta->func_name, argno, btf_type_str(ref_t), ref_tname, argno + 1, btf_type_str(reg_ref_t), reg_ref_tname); return -EINVAL; } return 0; } static int process_irq_flag(struct bpf_verifier_env *env, int regno, struct bpf_kfunc_call_arg_meta *meta) { struct bpf_reg_state *reg = reg_state(env, regno); int err, kfunc_class = IRQ_NATIVE_KFUNC; bool irq_save; if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save] || meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) { irq_save = true; if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) kfunc_class = IRQ_LOCK_KFUNC; } else if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_restore] || meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) { irq_save = false; if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) kfunc_class = IRQ_LOCK_KFUNC; } else { verifier_bug(env, "unknown irq flags kfunc"); return -EFAULT; } if (irq_save) { if (!is_irq_flag_reg_valid_uninit(env, reg)) { verbose(env, "expected uninitialized irq flag as arg#%d\n", regno - 1); return -EINVAL; } err = check_mem_access(env, env->insn_idx, regno, 0, BPF_DW, BPF_WRITE, -1, false, false); if (err) return err; err = mark_stack_slot_irq_flag(env, meta, reg, env->insn_idx, kfunc_class); if (err) return err; } else { err = is_irq_flag_reg_valid_init(env, reg); if (err) { verbose(env, "expected an initialized irq flag as arg#%d\n", regno - 1); return err; } err = mark_irq_flag_read(env, reg); if (err) return err; err = unmark_stack_slot_irq_flag(env, reg, kfunc_class); if (err) return err; } return 0; } static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct btf_record *rec = reg_btf_record(reg); if (!env->cur_state->active_locks) { verifier_bug(env, "%s w/o active lock", __func__); return -EFAULT; } if (type_flag(reg->type) & NON_OWN_REF) { verifier_bug(env, "NON_OWN_REF already set"); return -EFAULT; } reg->type |= NON_OWN_REF; if (rec->refcount_off >= 0) reg->type |= MEM_RCU; return 0; } static int 
ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_obj_id) { struct bpf_verifier_state *state = env->cur_state; struct bpf_func_state *unused; struct bpf_reg_state *reg; int i; if (!ref_obj_id) { verifier_bug(env, "ref_obj_id is zero for owning -> non-owning conversion"); return -EFAULT; } for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].id != ref_obj_id) continue; /* Clear ref_obj_id here so release_reference doesn't clobber * the whole reg */ bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({ if (reg->ref_obj_id == ref_obj_id) { reg->ref_obj_id = 0; ref_set_non_owning(env, reg); } })); return 0; } verifier_bug(env, "ref state missing for ref_obj_id"); return -EFAULT; } /* Implementation details: * * Each register points to some region of memory, which we define as an * allocation. Each allocation may embed a bpf_spin_lock which protects any * special BPF objects (bpf_list_head, bpf_rb_root, etc.) part of the same * allocation. The lock and the data it protects are colocated in the same * memory region. * * Hence, everytime a register holds a pointer value pointing to such * allocation, the verifier preserves a unique reg->id for it. * * The verifier remembers the lock 'ptr' and the lock 'id' whenever * bpf_spin_lock is called. * * To enable this, lock state in the verifier captures two values: * active_lock.ptr = Register's type specific pointer * active_lock.id = A unique ID for each register pointer value * * Currently, PTR_TO_MAP_VALUE and PTR_TO_BTF_ID | MEM_ALLOC are the two * supported register types. * * The active_lock.ptr in case of map values is the reg->map_ptr, and in case of * allocated objects is the reg->btf pointer. * * The active_lock.id is non-unique for maps supporting direct_value_addr, as we * can establish the provenance of the map value statically for each distinct * lookup into such maps. They always contain a single map value hence unique * IDs for each pseudo load pessimizes the algorithm and rejects valid programs. * * So, in case of global variables, they use array maps with max_entries = 1, * hence their active_lock.ptr becomes map_ptr and id = 0 (since they all point * into the same map value as max_entries is 1, as described above). * * In case of inner map lookups, the inner map pointer has same map_ptr as the * outer map pointer (in verifier context), but each lookup into an inner map * assigns a fresh reg->id to the lookup, so while lookups into distinct inner * maps from the same outer map share the same map_ptr as active_lock.ptr, they * will get different reg->id assigned to each lookup, hence different * active_lock.id. * * In case of allocated objects, active_lock.ptr is the reg->btf, and the * reg->id is a unique ID preserved after the NULL pointer check on the pointer * returned from bpf_obj_new. Each allocation receives a new reg->id. 
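 *
 * As a sketch from the BPF program side (illustrative pseudo-C only; struct
 * and map names are made up):
 *
 *   v = bpf_map_lookup_elem(&m, &key);          - v gets a fresh reg->id
 *   if (!v)
 *           return 0;
 *   bpf_spin_lock(&v->lock);                    - records (v's map_ptr, v's id)
 *   bpf_list_push_front(&v->head, &obj->node);  - same allocation: accepted
 *   bpf_spin_unlock(&v->lock);
 *
 * A push to a bpf_list_head living in a different map value (and hence a
 * different reg->id) would be rejected, since the held lock does not
 * protect that allocation.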
*/ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_reference_state *s; void *ptr; u32 id; switch ((int)reg->type) { case PTR_TO_MAP_VALUE: ptr = reg->map_ptr; break; case PTR_TO_BTF_ID | MEM_ALLOC: ptr = reg->btf; break; default: verifier_bug(env, "unknown reg type for lock check"); return -EFAULT; } id = reg->id; if (!env->cur_state->active_locks) return -EINVAL; s = find_lock_state(env->cur_state, REF_TYPE_LOCK_MASK, id, ptr); if (!s) { verbose(env, "held lock and object are not in the same allocation\n"); return -EINVAL; } return 0; } static bool is_bpf_list_api_kfunc(u32 btf_id) { return is_bpf_list_push_kfunc(btf_id) || btf_id == special_kfunc_list[KF_bpf_list_pop_front] || btf_id == special_kfunc_list[KF_bpf_list_pop_back] || btf_id == special_kfunc_list[KF_bpf_list_front] || btf_id == special_kfunc_list[KF_bpf_list_back]; } static bool is_bpf_rbtree_api_kfunc(u32 btf_id) { return is_bpf_rbtree_add_kfunc(btf_id) || btf_id == special_kfunc_list[KF_bpf_rbtree_remove] || btf_id == special_kfunc_list[KF_bpf_rbtree_first] || btf_id == special_kfunc_list[KF_bpf_rbtree_root] || btf_id == special_kfunc_list[KF_bpf_rbtree_left] || btf_id == special_kfunc_list[KF_bpf_rbtree_right]; } static bool is_bpf_iter_num_api_kfunc(u32 btf_id) { return btf_id == special_kfunc_list[KF_bpf_iter_num_new] || btf_id == special_kfunc_list[KF_bpf_iter_num_next] || btf_id == special_kfunc_list[KF_bpf_iter_num_destroy]; } static bool is_bpf_graph_api_kfunc(u32 btf_id) { return is_bpf_list_api_kfunc(btf_id) || is_bpf_rbtree_api_kfunc(btf_id) || is_bpf_refcount_acquire_kfunc(btf_id); } static bool is_bpf_res_spin_lock_kfunc(u32 btf_id) { return btf_id == special_kfunc_list[KF_bpf_res_spin_lock] || btf_id == special_kfunc_list[KF_bpf_res_spin_unlock] || btf_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] || btf_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]; } static bool is_bpf_arena_kfunc(u32 btf_id) { return btf_id == special_kfunc_list[KF_bpf_arena_alloc_pages] || btf_id == special_kfunc_list[KF_bpf_arena_free_pages] || btf_id == special_kfunc_list[KF_bpf_arena_reserve_pages]; } static bool is_bpf_stream_kfunc(u32 btf_id) { return btf_id == special_kfunc_list[KF_bpf_stream_vprintk] || btf_id == special_kfunc_list[KF_bpf_stream_print_stack]; } static bool kfunc_spin_allowed(u32 btf_id) { return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id) || is_bpf_res_spin_lock_kfunc(btf_id) || is_bpf_arena_kfunc(btf_id) || is_bpf_stream_kfunc(btf_id); } static bool is_sync_callback_calling_kfunc(u32 btf_id) { return is_bpf_rbtree_add_kfunc(btf_id); } static bool is_async_callback_calling_kfunc(u32 btf_id) { return is_bpf_wq_set_callback_kfunc(btf_id) || is_task_work_add_kfunc(btf_id); } static bool is_bpf_throw_kfunc(struct bpf_insn *insn) { return bpf_pseudo_kfunc_call(insn) && insn->off == 0 && insn->imm == special_kfunc_list[KF_bpf_throw]; } static bool is_bpf_wq_set_callback_kfunc(u32 btf_id) { return btf_id == special_kfunc_list[KF_bpf_wq_set_callback]; } static bool is_callback_calling_kfunc(u32 btf_id) { return is_sync_callback_calling_kfunc(btf_id) || is_async_callback_calling_kfunc(btf_id); } static bool is_rbtree_lock_required_kfunc(u32 btf_id) { return is_bpf_rbtree_api_kfunc(btf_id); } static bool check_kfunc_is_graph_root_api(struct bpf_verifier_env *env, enum btf_field_type head_field_type, u32 kfunc_btf_id) { bool ret; switch (head_field_type) { case BPF_LIST_HEAD: ret = is_bpf_list_api_kfunc(kfunc_btf_id); 
break; case BPF_RB_ROOT: ret = is_bpf_rbtree_api_kfunc(kfunc_btf_id); break; default: verbose(env, "verifier internal error: unexpected graph root argument type %s\n", btf_field_type_name(head_field_type)); return false; } if (!ret) verbose(env, "verifier internal error: %s head arg for unknown kfunc\n", btf_field_type_name(head_field_type)); return ret; } static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env, enum btf_field_type node_field_type, u32 kfunc_btf_id) { bool ret; switch (node_field_type) { case BPF_LIST_NODE: ret = is_bpf_list_push_kfunc(kfunc_btf_id); break; case BPF_RB_NODE: ret = (is_bpf_rbtree_add_kfunc(kfunc_btf_id) || kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] || kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_left] || kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_right]); break; default: verbose(env, "verifier internal error: unexpected graph node argument type %s\n", btf_field_type_name(node_field_type)); return false; } if (!ret) verbose(env, "verifier internal error: %s node arg for unknown kfunc\n", btf_field_type_name(node_field_type)); return ret; } static int __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, struct bpf_kfunc_call_arg_meta *meta, enum btf_field_type head_field_type, struct btf_field **head_field) { const char *head_type_name; struct btf_field *field; struct btf_record *rec; u32 head_off; if (meta->btf != btf_vmlinux) { verifier_bug(env, "unexpected btf mismatch in kfunc call"); return -EFAULT; } if (!check_kfunc_is_graph_root_api(env, head_field_type, meta->func_id)) return -EFAULT; head_type_name = btf_field_type_name(head_field_type); if (!tnum_is_const(reg->var_off)) { verbose(env, "R%d doesn't have constant offset. %s has to be at the constant offset\n", regno, head_type_name); return -EINVAL; } rec = reg_btf_record(reg); head_off = reg->var_off.value; field = btf_record_find(rec, head_off, head_field_type); if (!field) { verbose(env, "%s not found at offset=%u\n", head_type_name, head_off); return -EINVAL; } /* All functions require bpf_list_head to be protected using a bpf_spin_lock */ if (check_reg_allocation_locked(env, reg)) { verbose(env, "bpf_spin_lock at off=%d must be held for %s\n", rec->spin_lock_off, head_type_name); return -EINVAL; } if (*head_field) { verifier_bug(env, "repeating %s arg", head_type_name); return -EFAULT; } *head_field = field; return 0; } static int process_kf_arg_ptr_to_list_head(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, struct bpf_kfunc_call_arg_meta *meta) { return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_LIST_HEAD, &meta->arg_list_head.field); } static int process_kf_arg_ptr_to_rbtree_root(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, struct bpf_kfunc_call_arg_meta *meta) { return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_RB_ROOT, &meta->arg_rbtree_root.field); } static int __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, struct bpf_kfunc_call_arg_meta *meta, enum btf_field_type head_field_type, enum btf_field_type node_field_type, struct btf_field **node_field) { const char *node_type_name; const struct btf_type *et, *t; struct btf_field *field; u32 node_off; if (meta->btf != btf_vmlinux) { verifier_bug(env, "unexpected btf mismatch in kfunc call"); return -EFAULT; } if (!check_kfunc_is_graph_node_api(env, node_field_type, meta->func_id)) return -EFAULT; node_type_name = 
btf_field_type_name(node_field_type); if (!tnum_is_const(reg->var_off)) { verbose(env, "R%d doesn't have constant offset. %s has to be at the constant offset\n", regno, node_type_name); return -EINVAL; } node_off = reg->var_off.value; field = reg_find_field_offset(reg, node_off, node_field_type); if (!field) { verbose(env, "%s not found at offset=%u\n", node_type_name, node_off); return -EINVAL; } field = *node_field; et = btf_type_by_id(field->graph_root.btf, field->graph_root.value_btf_id); t = btf_type_by_id(reg->btf, reg->btf_id); if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, 0, field->graph_root.btf, field->graph_root.value_btf_id, true)) { verbose(env, "operation on %s expects arg#1 %s at offset=%d " "in struct %s, but arg is at offset=%d in struct %s\n", btf_field_type_name(head_field_type), btf_field_type_name(node_field_type), field->graph_root.node_offset, btf_name_by_offset(field->graph_root.btf, et->name_off), node_off, btf_name_by_offset(reg->btf, t->name_off)); return -EINVAL; } meta->arg_btf = reg->btf; meta->arg_btf_id = reg->btf_id; if (node_off != field->graph_root.node_offset) { verbose(env, "arg#1 offset=%d, but expected %s at offset=%d in struct %s\n", node_off, btf_field_type_name(node_field_type), field->graph_root.node_offset, btf_name_by_offset(field->graph_root.btf, et->name_off)); return -EINVAL; } return 0; } static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, struct bpf_kfunc_call_arg_meta *meta) { return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta, BPF_LIST_HEAD, BPF_LIST_NODE, &meta->arg_list_head.field); } static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, struct bpf_kfunc_call_arg_meta *meta) { return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta, BPF_RB_ROOT, BPF_RB_NODE, &meta->arg_rbtree_root.field); } /* * css_task iter allowlist is needed to avoid dead locking on css_set_lock. * LSM hooks and iters (both sleepable and non-sleepable) are safe. * Any sleepable progs are also safe since bpf_check_attach_target() enforce * them can only be attached to some specific hook points. */ static bool check_css_task_iter_allowlist(struct bpf_verifier_env *env) { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); switch (prog_type) { case BPF_PROG_TYPE_LSM: return true; case BPF_PROG_TYPE_TRACING: if (env->prog->expected_attach_type == BPF_TRACE_ITER) return true; fallthrough; default: return in_sleepable(env); } } static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, int insn_idx) { const char *func_name = meta->func_name, *ref_tname; const struct btf *btf = meta->btf; const struct btf_param *args; struct btf_record *rec; u32 i, nargs; int ret; args = (const struct btf_param *)(meta->func_proto + 1); nargs = btf_type_vlen(meta->func_proto); if (nargs > MAX_BPF_FUNC_REG_ARGS) { verbose(env, "Function %s has %d > %d args\n", func_name, nargs, MAX_BPF_FUNC_REG_ARGS); return -EINVAL; } /* Check that BTF function arguments match actual types that the * verifier sees. 
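 * The BTF parameter list is walked in lockstep with registers R1-R5, i.e.
 * BTF argument i is checked against the verifier's state of register i + 1.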
*/ for (i = 0; i < nargs; i++) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[i + 1]; const struct btf_type *t, *ref_t, *resolve_ret; enum bpf_arg_type arg_type = ARG_DONTCARE; u32 regno = i + 1, ref_id, type_size; bool is_ret_buf_sz = false; int kf_arg_type; if (is_kfunc_arg_prog_aux(btf, &args[i])) { /* Reject repeated use bpf_prog_aux */ if (meta->arg_prog) { verifier_bug(env, "Only 1 prog->aux argument supported per-kfunc"); return -EFAULT; } meta->arg_prog = true; cur_aux(env)->arg_prog = regno; continue; } if (is_kfunc_arg_ignore(btf, &args[i]) || is_kfunc_arg_implicit(meta, i)) continue; t = btf_type_skip_modifiers(btf, args[i].type, NULL); if (btf_type_is_scalar(t)) { if (reg->type != SCALAR_VALUE) { verbose(env, "R%d is not a scalar\n", regno); return -EINVAL; } if (is_kfunc_arg_constant(meta->btf, &args[i])) { if (meta->arg_constant.found) { verifier_bug(env, "only one constant argument permitted"); return -EFAULT; } if (!tnum_is_const(reg->var_off)) { verbose(env, "R%d must be a known constant\n", regno); return -EINVAL; } ret = mark_chain_precision(env, regno); if (ret < 0) return ret; meta->arg_constant.found = true; meta->arg_constant.value = reg->var_off.value; } else if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdonly_buf_size")) { meta->r0_rdonly = true; is_ret_buf_sz = true; } else if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdwr_buf_size")) { is_ret_buf_sz = true; } if (is_ret_buf_sz) { if (meta->r0_size) { verbose(env, "2 or more rdonly/rdwr_buf_size parameters for kfunc"); return -EINVAL; } if (!tnum_is_const(reg->var_off)) { verbose(env, "R%d is not a const\n", regno); return -EINVAL; } meta->r0_size = reg->var_off.value; ret = mark_chain_precision(env, regno); if (ret) return ret; } continue; } if (!btf_type_is_ptr(t)) { verbose(env, "Unrecognized arg#%d type %s\n", i, btf_type_str(t)); return -EINVAL; } if ((bpf_register_is_null(reg) || type_may_be_null(reg->type)) && !is_kfunc_arg_nullable(meta->btf, &args[i])) { verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); return -EACCES; } if (reg->ref_obj_id) { if (is_kfunc_release(meta) && meta->ref_obj_id) { verifier_bug(env, "more than one arg with ref_obj_id R%d %u %u", regno, reg->ref_obj_id, meta->ref_obj_id); return -EFAULT; } meta->ref_obj_id = reg->ref_obj_id; if (is_kfunc_release(meta)) meta->release_regno = regno; } ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs); if (kf_arg_type < 0) return kf_arg_type; switch (kf_arg_type) { case KF_ARG_PTR_TO_NULL: continue; case KF_ARG_PTR_TO_MAP: if (!reg->map_ptr) { verbose(env, "pointer in R%d isn't map pointer\n", regno); return -EINVAL; } if (meta->map.ptr && (reg->map_ptr->record->wq_off >= 0 || reg->map_ptr->record->task_work_off >= 0)) { /* Use map_uid (which is unique id of inner map) to reject: * inner_map1 = bpf_map_lookup_elem(outer_map, key1) * inner_map2 = bpf_map_lookup_elem(outer_map, key2) * if (inner_map1 && inner_map2) { * wq = bpf_map_lookup_elem(inner_map1); * if (wq) * // mismatch would have been allowed * bpf_wq_init(wq, inner_map2); * } * * Comparing map_ptr is enough to distinguish normal and outer maps. 
*/ if (meta->map.ptr != reg->map_ptr || meta->map.uid != reg->map_uid) { if (reg->map_ptr->record->task_work_off >= 0) { verbose(env, "bpf_task_work pointer in R2 map_uid=%d doesn't match map pointer in R3 map_uid=%d\n", meta->map.uid, reg->map_uid); return -EINVAL; } verbose(env, "workqueue pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n", meta->map.uid, reg->map_uid); return -EINVAL; } } meta->map.ptr = reg->map_ptr; meta->map.uid = reg->map_uid; fallthrough; case KF_ARG_PTR_TO_ALLOC_BTF_ID: case KF_ARG_PTR_TO_BTF_ID: if (!is_trusted_reg(reg)) { if (!is_kfunc_rcu(meta)) { verbose(env, "R%d must be referenced or trusted\n", regno); return -EINVAL; } if (!is_rcu_reg(reg)) { verbose(env, "R%d must be a rcu pointer\n", regno); return -EINVAL; } } fallthrough; case KF_ARG_PTR_TO_DYNPTR: case KF_ARG_PTR_TO_ITER: case KF_ARG_PTR_TO_LIST_HEAD: case KF_ARG_PTR_TO_LIST_NODE: case KF_ARG_PTR_TO_RB_ROOT: case KF_ARG_PTR_TO_RB_NODE: case KF_ARG_PTR_TO_MEM: case KF_ARG_PTR_TO_MEM_SIZE: case KF_ARG_PTR_TO_CALLBACK: case KF_ARG_PTR_TO_REFCOUNTED_KPTR: case KF_ARG_PTR_TO_CONST_STR: case KF_ARG_PTR_TO_WORKQUEUE: case KF_ARG_PTR_TO_TIMER: case KF_ARG_PTR_TO_TASK_WORK: case KF_ARG_PTR_TO_IRQ_FLAG: case KF_ARG_PTR_TO_RES_SPIN_LOCK: break; case KF_ARG_PTR_TO_CTX: arg_type = ARG_PTR_TO_CTX; break; default: verifier_bug(env, "unknown kfunc arg type %d", kf_arg_type); return -EFAULT; } if (is_kfunc_release(meta) && reg->ref_obj_id) arg_type |= OBJ_RELEASE; ret = check_func_arg_reg_off(env, reg, regno, arg_type); if (ret < 0) return ret; switch (kf_arg_type) { case KF_ARG_PTR_TO_CTX: if (reg->type != PTR_TO_CTX) { verbose(env, "arg#%d expected pointer to ctx, but got %s\n", i, reg_type_str(env, reg->type)); return -EINVAL; } if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) { ret = get_kern_ctx_btf_id(&env->log, resolve_prog_type(env->prog)); if (ret < 0) return -EINVAL; meta->ret_btf_id = ret; } break; case KF_ARG_PTR_TO_ALLOC_BTF_ID: if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC)) { if (!is_bpf_obj_drop_kfunc(meta->func_id)) { verbose(env, "arg#%d expected for bpf_obj_drop()\n", i); return -EINVAL; } } else if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC | MEM_PERCPU)) { if (!is_bpf_percpu_obj_drop_kfunc(meta->func_id)) { verbose(env, "arg#%d expected for bpf_percpu_obj_drop()\n", i); return -EINVAL; } } else { verbose(env, "arg#%d expected pointer to allocated object\n", i); return -EINVAL; } if (!reg->ref_obj_id) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } if (meta->btf == btf_vmlinux) { meta->arg_btf = reg->btf; meta->arg_btf_id = reg->btf_id; } break; case KF_ARG_PTR_TO_DYNPTR: { enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR; int clone_ref_obj_id = 0; if (reg->type == CONST_PTR_TO_DYNPTR) dynptr_arg_type |= MEM_RDONLY; if (is_kfunc_arg_uninit(btf, &args[i])) dynptr_arg_type |= MEM_UNINIT; if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) { dynptr_arg_type |= DYNPTR_TYPE_SKB; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp]) { dynptr_arg_type |= DYNPTR_TYPE_XDP; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb_meta]) { dynptr_arg_type |= DYNPTR_TYPE_SKB_META; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) { dynptr_arg_type |= DYNPTR_TYPE_FILE; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_file_discard]) { dynptr_arg_type |= DYNPTR_TYPE_FILE; meta->release_regno = regno; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] && 
(dynptr_arg_type & MEM_UNINIT)) { enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type; if (parent_type == BPF_DYNPTR_TYPE_INVALID) { verifier_bug(env, "no dynptr type for parent of clone"); return -EFAULT; } dynptr_arg_type |= (unsigned int)get_dynptr_type_flag(parent_type); clone_ref_obj_id = meta->initialized_dynptr.ref_obj_id; if (dynptr_type_refcounted(parent_type) && !clone_ref_obj_id) { verifier_bug(env, "missing ref obj id for parent of clone"); return -EFAULT; } } ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type, clone_ref_obj_id); if (ret < 0) return ret; if (!(dynptr_arg_type & MEM_UNINIT)) { int id = dynptr_id(env, reg); if (id < 0) { verifier_bug(env, "failed to obtain dynptr id"); return id; } meta->initialized_dynptr.id = id; meta->initialized_dynptr.type = dynptr_get_type(env, reg); meta->initialized_dynptr.ref_obj_id = dynptr_ref_obj_id(env, reg); } break; } case KF_ARG_PTR_TO_ITER: if (meta->func_id == special_kfunc_list[KF_bpf_iter_css_task_new]) { if (!check_css_task_iter_allowlist(env)) { verbose(env, "css_task_iter is only allowed in bpf_lsm, bpf_iter and sleepable progs\n"); return -EINVAL; } } ret = process_iter_arg(env, regno, insn_idx, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_LIST_HEAD: if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { verbose(env, "arg#%d expected pointer to map value or allocated object\n", i); return -EINVAL; } if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } ret = process_kf_arg_ptr_to_list_head(env, reg, regno, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_RB_ROOT: if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { verbose(env, "arg#%d expected pointer to map value or allocated object\n", i); return -EINVAL; } if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } ret = process_kf_arg_ptr_to_rbtree_root(env, reg, regno, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_LIST_NODE: if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { verbose(env, "arg#%d expected pointer to allocated object\n", i); return -EINVAL; } if (!reg->ref_obj_id) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } ret = process_kf_arg_ptr_to_list_node(env, reg, regno, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_RB_NODE: if (is_bpf_rbtree_add_kfunc(meta->func_id)) { if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { verbose(env, "arg#%d expected pointer to allocated object\n", i); return -EINVAL; } if (!reg->ref_obj_id) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } } else { if (!type_is_non_owning_ref(reg->type) && !reg->ref_obj_id) { verbose(env, "%s can only take non-owning or refcounted bpf_rb_node pointer\n", func_name); return -EINVAL; } if (in_rbtree_lock_required_cb(env)) { verbose(env, "%s not allowed in rbtree cb\n", func_name); return -EINVAL; } } ret = process_kf_arg_ptr_to_rbtree_node(env, reg, regno, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_MAP: /* If argument has '__map' suffix expect 'struct bpf_map *' */ ref_id = *reg2btf_ids[CONST_PTR_TO_MAP]; ref_t = btf_type_by_id(btf_vmlinux, ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); fallthrough; case KF_ARG_PTR_TO_BTF_ID: /* Only base_type is checked, further checks are done here */ if ((base_type(reg->type) != PTR_TO_BTF_ID 
|| (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) && !reg2btf_ids[base_type(reg->type)]) { verbose(env, "arg#%d is %s ", i, reg_type_str(env, reg->type)); verbose(env, "expected %s or socket\n", reg_type_str(env, base_type(reg->type) | (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS))); return -EINVAL; } ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_MEM: resolve_ret = btf_resolve_size(btf, ref_t, &type_size); if (IS_ERR(resolve_ret)) { verbose(env, "arg#%d reference type('%s %s') size cannot be determined: %ld\n", i, btf_type_str(ref_t), ref_tname, PTR_ERR(resolve_ret)); return -EINVAL; } ret = check_mem_reg(env, reg, regno, type_size); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_MEM_SIZE: { struct bpf_reg_state *buff_reg = ®s[regno]; const struct btf_param *buff_arg = &args[i]; struct bpf_reg_state *size_reg = ®s[regno + 1]; const struct btf_param *size_arg = &args[i + 1]; if (!bpf_register_is_null(buff_reg) || !is_kfunc_arg_nullable(meta->btf, buff_arg)) { ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1); if (ret < 0) { verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1); return ret; } } if (is_kfunc_arg_const_mem_size(meta->btf, size_arg, size_reg)) { if (meta->arg_constant.found) { verifier_bug(env, "only one constant argument permitted"); return -EFAULT; } if (!tnum_is_const(size_reg->var_off)) { verbose(env, "R%d must be a known constant\n", regno + 1); return -EINVAL; } meta->arg_constant.found = true; meta->arg_constant.value = size_reg->var_off.value; } /* Skip next '__sz' or '__szk' argument */ i++; break; } case KF_ARG_PTR_TO_CALLBACK: if (reg->type != PTR_TO_FUNC) { verbose(env, "arg%d expected pointer to func\n", i); return -EINVAL; } meta->subprogno = reg->subprogno; break; case KF_ARG_PTR_TO_REFCOUNTED_KPTR: if (!type_is_ptr_alloc_obj(reg->type)) { verbose(env, "arg#%d is neither owning or non-owning ref\n", i); return -EINVAL; } if (!type_is_non_owning_ref(reg->type)) meta->arg_owning_ref = true; rec = reg_btf_record(reg); if (!rec) { verifier_bug(env, "Couldn't find btf_record"); return -EFAULT; } if (rec->refcount_off < 0) { verbose(env, "arg#%d doesn't point to a type with bpf_refcount field\n", i); return -EINVAL; } meta->arg_btf = reg->btf; meta->arg_btf_id = reg->btf_id; break; case KF_ARG_PTR_TO_CONST_STR: if (reg->type != PTR_TO_MAP_VALUE) { verbose(env, "arg#%d doesn't point to a const string\n", i); return -EINVAL; } ret = check_reg_const_str(env, reg, regno); if (ret) return ret; break; case KF_ARG_PTR_TO_WORKQUEUE: if (reg->type != PTR_TO_MAP_VALUE) { verbose(env, "arg#%d doesn't point to a map value\n", i); return -EINVAL; } ret = check_map_field_pointer(env, regno, BPF_WORKQUEUE, &meta->map); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_TIMER: if (reg->type != PTR_TO_MAP_VALUE) { verbose(env, "arg#%d doesn't point to a map value\n", i); return -EINVAL; } ret = process_timer_kfunc(env, regno, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_TASK_WORK: if (reg->type != PTR_TO_MAP_VALUE) { verbose(env, "arg#%d doesn't point to a map value\n", i); return -EINVAL; } ret = check_map_field_pointer(env, regno, BPF_TASK_WORK, &meta->map); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_IRQ_FLAG: if (reg->type != PTR_TO_STACK) { verbose(env, "arg#%d doesn't point to an irq flag on stack\n", i); return -EINVAL; } ret = process_irq_flag(env, regno, meta); if (ret < 0) return ret; break; case 
KF_ARG_PTR_TO_RES_SPIN_LOCK: { int flags = PROCESS_RES_LOCK; if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { verbose(env, "arg#%d doesn't point to map value or allocated object\n", i); return -EINVAL; } if (!is_bpf_res_spin_lock_kfunc(meta->func_id)) return -EFAULT; if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock] || meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) flags |= PROCESS_SPIN_LOCK; if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] || meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) flags |= PROCESS_LOCK_IRQ; ret = process_spin_lock(env, regno, flags); if (ret < 0) return ret; break; } } } if (is_kfunc_release(meta) && !meta->release_regno) { verbose(env, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n", func_name); return -EINVAL; } return 0; } int bpf_fetch_kfunc_arg_meta(struct bpf_verifier_env *env, s32 func_id, s16 offset, struct bpf_kfunc_call_arg_meta *meta) { struct bpf_kfunc_meta kfunc; int err; err = fetch_kfunc_meta(env, func_id, offset, &kfunc); if (err) return err; memset(meta, 0, sizeof(*meta)); meta->btf = kfunc.btf; meta->func_id = kfunc.id; meta->func_proto = kfunc.proto; meta->func_name = kfunc.name; if (!kfunc.flags || !btf_kfunc_is_allowed(kfunc.btf, kfunc.id, env->prog)) return -EACCES; meta->kfunc_flags = *kfunc.flags; return 0; } /* * Determine how many bytes a helper accesses through a stack pointer at * argument position @arg (0-based, corresponding to R1-R5). * * Returns: * > 0 known read access size in bytes * 0 doesn't read anything directly * S64_MIN unknown * < 0 known write access of (-return) bytes */ s64 bpf_helper_stack_access_bytes(struct bpf_verifier_env *env, struct bpf_insn *insn, int arg, int insn_idx) { struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; const struct bpf_func_proto *fn; enum bpf_arg_type at; s64 size; if (bpf_get_helper_proto(env, insn->imm, &fn) < 0) return S64_MIN; at = fn->arg_type[arg]; switch (base_type(at)) { case ARG_PTR_TO_MAP_KEY: case ARG_PTR_TO_MAP_VALUE: { bool is_key = base_type(at) == ARG_PTR_TO_MAP_KEY; u64 val; int i, map_reg; for (i = 0; i < arg; i++) { if (base_type(fn->arg_type[i]) == ARG_CONST_MAP_PTR) break; } if (i >= arg) goto scan_all_maps; map_reg = BPF_REG_1 + i; if (!(aux->const_reg_map_mask & BIT(map_reg))) goto scan_all_maps; i = aux->const_reg_vals[map_reg]; if (i < env->used_map_cnt) { size = is_key ? env->used_maps[i]->key_size : env->used_maps[i]->value_size; goto out; } scan_all_maps: /* * Map pointer is not known at this call site (e.g. different * maps on merged paths). Conservatively return the largest * key_size or value_size across all maps used by the program. */ val = 0; for (i = 0; i < env->used_map_cnt; i++) { struct bpf_map *map = env->used_maps[i]; u32 sz = is_key ? map->key_size : map->value_size; if (sz > val) val = sz; if (map->inner_map_meta) { sz = is_key ? map->inner_map_meta->key_size : map->inner_map_meta->value_size; if (sz > val) val = sz; } } if (!val) return S64_MIN; size = val; goto out; } case ARG_PTR_TO_MEM: if (at & MEM_FIXED_SIZE) { size = fn->arg_size[arg]; goto out; } if (arg + 1 < ARRAY_SIZE(fn->arg_type) && arg_type_is_mem_size(fn->arg_type[arg + 1])) { int size_reg = BPF_REG_1 + arg + 1; if (aux->const_reg_mask & BIT(size_reg)) { size = (s64)aux->const_reg_vals[size_reg]; goto out; } /* * Size arg is const on each path but differs across merged * paths. MAX_BPF_STACK is a safe upper bound for reads. 
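 * For example, one path may reach this call with a constant size argument
 * of 8 and another with 16; no single constant describes the access, so
 * MAX_BPF_STACK is the conservative bound for how much the helper may read.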
*/ if (at & MEM_UNINIT) return 0; return MAX_BPF_STACK; } return S64_MIN; case ARG_PTR_TO_DYNPTR: size = BPF_DYNPTR_SIZE; break; case ARG_PTR_TO_STACK: /* * Only used by bpf_calls_callback() helpers. The helper itself * doesn't access stack. The callback subprog does and it's * analyzed separately. */ return 0; default: return S64_MIN; } out: /* * MEM_UNINIT args are write-only: the helper initializes the * buffer without reading it. */ if (at & MEM_UNINIT) return -size; return size; } /* * Determine how many bytes a kfunc accesses through a stack pointer at * argument position @arg (0-based, corresponding to R1-R5). * * Returns: * > 0 known read access size in bytes * 0 doesn't access memory through that argument (ex: not a pointer) * S64_MIN unknown * < 0 known write access of (-return) bytes */ s64 bpf_kfunc_stack_access_bytes(struct bpf_verifier_env *env, struct bpf_insn *insn, int arg, int insn_idx) { struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; struct bpf_kfunc_call_arg_meta meta; const struct btf_param *args; const struct btf_type *t, *ref_t; const struct btf *btf; u32 nargs, type_size; s64 size; if (bpf_fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta) < 0) return S64_MIN; btf = meta.btf; args = btf_params(meta.func_proto); nargs = btf_type_vlen(meta.func_proto); if (arg >= nargs) return 0; t = btf_type_skip_modifiers(btf, args[arg].type, NULL); if (!btf_type_is_ptr(t)) return 0; /* dynptr: fixed 16-byte on-stack representation */ if (is_kfunc_arg_dynptr(btf, &args[arg])) { size = BPF_DYNPTR_SIZE; goto out; } /* ptr + __sz/__szk pair: size is in the next register */ if (arg + 1 < nargs && (btf_param_match_suffix(btf, &args[arg + 1], "__sz") || btf_param_match_suffix(btf, &args[arg + 1], "__szk"))) { int size_reg = BPF_REG_1 + arg + 1; if (aux->const_reg_mask & BIT(size_reg)) { size = (s64)aux->const_reg_vals[size_reg]; goto out; } return MAX_BPF_STACK; } /* fixed-size pointed-to type: resolve via BTF */ ref_t = btf_type_skip_modifiers(btf, t->type, NULL); if (!IS_ERR(btf_resolve_size(btf, ref_t, &type_size))) { size = type_size; goto out; } return S64_MIN; out: /* KF_ITER_NEW kfuncs initialize the iterator state at arg 0 */ if (arg == 0 && meta.kfunc_flags & KF_ITER_NEW) return -size; if (is_kfunc_arg_uninit(btf, &args[arg])) return -size; return size; } /* check special kfuncs and return: * 1 - not fall-through to 'else' branch, continue verification * 0 - fall-through to 'else' branch * < 0 - not fall-through to 'else' branch, return error */ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, struct bpf_reg_state *regs, struct bpf_insn_aux_data *insn_aux, const struct btf_type *ptr_type, struct btf *desc_btf) { const struct btf_type *ret_t; int err = 0; if (meta->btf != btf_vmlinux) return 0; if (is_bpf_obj_new_kfunc(meta->func_id) || is_bpf_percpu_obj_new_kfunc(meta->func_id)) { struct btf_struct_meta *struct_meta; struct btf *ret_btf; u32 ret_btf_id; if (is_bpf_obj_new_kfunc(meta->func_id) && !bpf_global_ma_set) return -ENOMEM; if (((u64)(u32)meta->arg_constant.value) != meta->arg_constant.value) { verbose(env, "local type ID argument must be in range [0, U32_MAX]\n"); return -EINVAL; } ret_btf = env->prog->aux->btf; ret_btf_id = meta->arg_constant.value; /* This may be NULL due to user not supplying a BTF */ if (!ret_btf) { verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n"); return -EINVAL; } ret_t = btf_type_by_id(ret_btf, ret_btf_id); if (!ret_t || !__btf_type_is_struct(ret_t)) { 
verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n"); return -EINVAL; } if (is_bpf_percpu_obj_new_kfunc(meta->func_id)) { if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) { verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n", ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE); return -EINVAL; } if (!bpf_global_percpu_ma_set) { mutex_lock(&bpf_percpu_ma_lock); if (!bpf_global_percpu_ma_set) { /* Charge memory allocated with bpf_global_percpu_ma to * root memcg. The obj_cgroup for root memcg is NULL. */ err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL); if (!err) bpf_global_percpu_ma_set = true; } mutex_unlock(&bpf_percpu_ma_lock); if (err) return err; } mutex_lock(&bpf_percpu_ma_lock); err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size); mutex_unlock(&bpf_percpu_ma_lock); if (err) return err; } struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id); if (is_bpf_percpu_obj_new_kfunc(meta->func_id)) { if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) { verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n"); return -EINVAL; } if (struct_meta) { verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n"); return -EINVAL; } } mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; regs[BPF_REG_0].btf = ret_btf; regs[BPF_REG_0].btf_id = ret_btf_id; if (is_bpf_percpu_obj_new_kfunc(meta->func_id)) regs[BPF_REG_0].type |= MEM_PERCPU; insn_aux->obj_new_size = ret_t->size; insn_aux->kptr_struct_meta = struct_meta; } else if (is_bpf_refcount_acquire_kfunc(meta->func_id)) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; regs[BPF_REG_0].btf = meta->arg_btf; regs[BPF_REG_0].btf_id = meta->arg_btf_id; insn_aux->kptr_struct_meta = btf_find_struct_meta(meta->arg_btf, meta->arg_btf_id); } else if (is_list_node_type(ptr_type)) { struct btf_field *field = meta->arg_list_head.field; mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); } else if (is_rbtree_node_type(ptr_type)) { struct btf_field *field = meta->arg_rbtree_root.field; mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); } else if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED; regs[BPF_REG_0].btf = desc_btf; regs[BPF_REG_0].btf_id = meta->ret_btf_id; } else if (meta->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { ret_t = btf_type_by_id(desc_btf, meta->arg_constant.value); if (!ret_t) { verbose(env, "Unknown type ID %lld passed to kfunc bpf_rdonly_cast\n", meta->arg_constant.value); return -EINVAL; } else if (btf_type_is_struct(ret_t)) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED; regs[BPF_REG_0].btf = desc_btf; regs[BPF_REG_0].btf_id = meta->arg_constant.value; } else if (btf_type_is_void(ret_t)) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED; regs[BPF_REG_0].mem_size = 0; } else { verbose(env, "kfunc bpf_rdonly_cast type ID argument must be of a struct or void\n"); return -EINVAL; } } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice] || meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) { enum bpf_type_flag type_flag = get_dynptr_type_flag(meta->initialized_dynptr.type); mark_reg_known_zero(env, regs, BPF_REG_0); if (!meta->arg_constant.found) { verifier_bug(env, 
"bpf_dynptr_slice(_rdwr) no constant size"); return -EFAULT; } regs[BPF_REG_0].mem_size = meta->arg_constant.value; /* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */ regs[BPF_REG_0].type = PTR_TO_MEM | type_flag; if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice]) { regs[BPF_REG_0].type |= MEM_RDONLY; } else { /* this will set env->seen_direct_write to true */ if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) { verbose(env, "the prog does not allow writes to packet data\n"); return -EINVAL; } } if (!meta->initialized_dynptr.id) { verifier_bug(env, "no dynptr id"); return -EFAULT; } regs[BPF_REG_0].dynptr_id = meta->initialized_dynptr.id; /* we don't need to set BPF_REG_0's ref obj id * because packet slices are not refcounted (see * dynptr_type_refcounted) */ } else { return 0; } return 1; } static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name); static int process_bpf_exit_full(struct bpf_verifier_env *env, bool *do_print_state, bool exception_exit); static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { bool sleepable, rcu_lock, rcu_unlock, preempt_disable, preempt_enable; u32 i, nargs, ptr_type_id, release_ref_obj_id; struct bpf_reg_state *regs = cur_regs(env); const char *func_name, *ptr_type_name; const struct btf_type *t, *ptr_type; struct bpf_kfunc_call_arg_meta meta; struct bpf_insn_aux_data *insn_aux; int err, insn_idx = *insn_idx_p; const struct btf_param *args; struct btf *desc_btf; /* skip for now, but return error when we find this in fixup_kfunc_call */ if (!insn->imm) return 0; err = bpf_fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta); if (err == -EACCES && meta.func_name) verbose(env, "calling kernel function %s is not allowed\n", meta.func_name); if (err) return err; desc_btf = meta.btf; func_name = meta.func_name; insn_aux = &env->insn_aux_data[insn_idx]; insn_aux->is_iter_next = bpf_is_iter_next_kfunc(&meta); if (!insn->off && (insn->imm == special_kfunc_list[KF_bpf_res_spin_lock] || insn->imm == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])) { struct bpf_verifier_state *branch; struct bpf_reg_state *regs; branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); if (IS_ERR(branch)) { verbose(env, "failed to push state for failed lock acquisition\n"); return PTR_ERR(branch); } regs = branch->frame[branch->curframe]->regs; /* Clear r0-r5 registers in forked state */ for (i = 0; i < CALLER_SAVED_REGS; i++) bpf_mark_reg_not_init(env, ®s[caller_saved[i]]); mark_reg_unknown(env, regs, BPF_REG_0); err = __mark_reg_s32_range(env, regs, BPF_REG_0, -MAX_ERRNO, -1); if (err) { verbose(env, "failed to mark s32 range for retval in forked state for lock\n"); return err; } __mark_btf_func_reg_size(env, regs, BPF_REG_0, sizeof(u32)); } else if (!insn->off && insn->imm == special_kfunc_list[KF___bpf_trap]) { verbose(env, "unexpected __bpf_trap() due to uninitialized variable?\n"); return -EFAULT; } if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) { verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n"); return -EACCES; } sleepable = bpf_is_kfunc_sleepable(&meta); if (sleepable && !in_sleepable(env)) { verbose(env, "program must be sleepable to call sleepable kfunc %s\n", func_name); return -EACCES; } /* Track non-sleepable context for kfuncs, same as for helpers. 
*/ if (!in_sleepable_context(env)) insn_aux->non_sleepable = true; /* Check the arguments */ err = check_kfunc_args(env, &meta, insn_idx); if (err < 0) return err; if (is_bpf_rbtree_add_kfunc(meta.func_id)) { err = push_callback_call(env, insn, insn_idx, meta.subprogno, set_rbtree_add_callback_state); if (err) { verbose(env, "kfunc %s#%d failed callback verification\n", func_name, meta.func_id); return err; } } if (meta.func_id == special_kfunc_list[KF_bpf_session_cookie]) { meta.r0_size = sizeof(u64); meta.r0_rdonly = false; } if (is_bpf_wq_set_callback_kfunc(meta.func_id)) { err = push_callback_call(env, insn, insn_idx, meta.subprogno, set_timer_callback_state); if (err) { verbose(env, "kfunc %s#%d failed callback verification\n", func_name, meta.func_id); return err; } } if (is_task_work_add_kfunc(meta.func_id)) { err = push_callback_call(env, insn, insn_idx, meta.subprogno, set_task_work_schedule_callback_state); if (err) { verbose(env, "kfunc %s#%d failed callback verification\n", func_name, meta.func_id); return err; } } rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta); rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); preempt_disable = is_kfunc_bpf_preempt_disable(&meta); preempt_enable = is_kfunc_bpf_preempt_enable(&meta); if (rcu_lock) { env->cur_state->active_rcu_locks++; } else if (rcu_unlock) { struct bpf_func_state *state; struct bpf_reg_state *reg; u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER); if (env->cur_state->active_rcu_locks == 0) { verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name); return -EINVAL; } if (--env->cur_state->active_rcu_locks == 0) { bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, ({ if (reg->type & MEM_RCU) { reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL); reg->type |= PTR_UNTRUSTED; } })); } } else if (preempt_disable) { env->cur_state->active_preempt_locks++; } else if (preempt_enable) { if (env->cur_state->active_preempt_locks == 0) { verbose(env, "unmatched attempt to enable preemption (kernel function %s)\n", func_name); return -EINVAL; } env->cur_state->active_preempt_locks--; } if (sleepable && !in_sleepable_context(env)) { verbose(env, "kernel func %s is sleepable within %s\n", func_name, non_sleepable_context_description(env)); return -EACCES; } if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) { verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n"); return -EACCES; } if (is_kfunc_rcu_protected(&meta) && !in_rcu_cs(env)) { verbose(env, "kernel func %s requires RCU critical section protection\n", func_name); return -EACCES; } /* In case of release function, we get register number of refcounted * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now. 
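 * E.g. bpf_obj_drop() is a KF_RELEASE kfunc, so the register holding the
 * object obtained from bpf_obj_new() is recorded in meta.release_regno and
 * its acquired reference is dropped here; for a refcounted dynptr argument
 * the backing stack slots are invalidated instead.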
*/ if (meta.release_regno) { struct bpf_reg_state *reg = ®s[meta.release_regno]; if (meta.initialized_dynptr.ref_obj_id) { err = unmark_stack_slots_dynptr(env, reg); } else { err = release_reference(env, reg->ref_obj_id); if (err) verbose(env, "kfunc %s#%d reference has not been acquired before\n", func_name, meta.func_id); } if (err) return err; } if (is_bpf_list_push_kfunc(meta.func_id) || is_bpf_rbtree_add_kfunc(meta.func_id)) { release_ref_obj_id = regs[BPF_REG_2].ref_obj_id; insn_aux->insert_off = regs[BPF_REG_2].var_off.value; insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id); err = ref_convert_owning_non_owning(env, release_ref_obj_id); if (err) { verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n", func_name, meta.func_id); return err; } err = release_reference(env, release_ref_obj_id); if (err) { verbose(env, "kfunc %s#%d reference has not been acquired before\n", func_name, meta.func_id); return err; } } if (meta.func_id == special_kfunc_list[KF_bpf_throw]) { if (!bpf_jit_supports_exceptions()) { verbose(env, "JIT does not support calling kfunc %s#%d\n", func_name, meta.func_id); return -ENOTSUPP; } env->seen_exception = true; /* In the case of the default callback, the cookie value passed * to bpf_throw becomes the return value of the program. */ if (!env->exception_callback_subprog) { err = check_return_code(env, BPF_REG_1, "R1"); if (err < 0) return err; } } for (i = 0; i < CALLER_SAVED_REGS; i++) { u32 regno = caller_saved[i]; bpf_mark_reg_not_init(env, ®s[regno]); regs[regno].subreg_def = DEF_NOT_SUBREG; } /* Check return type */ t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL); if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) { if (meta.btf != btf_vmlinux || (!is_bpf_obj_new_kfunc(meta.func_id) && !is_bpf_percpu_obj_new_kfunc(meta.func_id) && !is_bpf_refcount_acquire_kfunc(meta.func_id))) { verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n"); return -EINVAL; } } if (btf_type_is_scalar(t)) { mark_reg_unknown(env, regs, BPF_REG_0); if (meta.btf == btf_vmlinux && (meta.func_id == special_kfunc_list[KF_bpf_res_spin_lock] || meta.func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])) __mark_reg_const_zero(env, ®s[BPF_REG_0]); mark_btf_func_reg_size(env, BPF_REG_0, t->size); } else if (btf_type_is_ptr(t)) { ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id); err = check_special_kfunc(env, &meta, regs, insn_aux, ptr_type, desc_btf); if (err) { if (err < 0) return err; } else if (btf_type_is_void(ptr_type)) { /* kfunc returning 'void *' is equivalent to returning scalar */ mark_reg_unknown(env, regs, BPF_REG_0); } else if (!__btf_type_is_struct(ptr_type)) { if (!meta.r0_size) { __u32 sz; if (!IS_ERR(btf_resolve_size(desc_btf, ptr_type, &sz))) { meta.r0_size = sz; meta.r0_rdonly = true; } } if (!meta.r0_size) { ptr_type_name = btf_name_by_offset(desc_btf, ptr_type->name_off); verbose(env, "kernel function %s returns pointer type %s %s is not supported\n", func_name, btf_type_str(ptr_type), ptr_type_name); return -EINVAL; } mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_MEM; regs[BPF_REG_0].mem_size = meta.r0_size; if (meta.r0_rdonly) regs[BPF_REG_0].type |= MEM_RDONLY; /* Ensures we don't access the memory after a release_reference() */ if (meta.ref_obj_id) regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; if (is_kfunc_rcu_protected(&meta)) regs[BPF_REG_0].type |= MEM_RCU; } else { enum bpf_reg_type type = PTR_TO_BTF_ID; if 
(meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache]) type |= PTR_UNTRUSTED; else if (is_kfunc_rcu_protected(&meta) || (bpf_is_iter_next_kfunc(&meta) && (get_iter_from_state(env->cur_state, &meta) ->type & MEM_RCU))) { /* * If the iterator's constructor (the _new * function e.g., bpf_iter_task_new) has been * annotated with BPF kfunc flag * KF_RCU_PROTECTED and was called within a RCU * read-side critical section, also propagate * the MEM_RCU flag to the pointer returned from * the iterator's next function (e.g., * bpf_iter_task_next). */ type |= MEM_RCU; } else { /* * Any PTR_TO_BTF_ID that is returned from a BPF * kfunc should by default be treated as * implicitly trusted. */ type |= PTR_TRUSTED; } mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].btf = desc_btf; regs[BPF_REG_0].type = type; regs[BPF_REG_0].btf_id = ptr_type_id; } if (is_kfunc_ret_null(&meta)) { regs[BPF_REG_0].type |= PTR_MAYBE_NULL; /* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */ regs[BPF_REG_0].id = ++env->id_gen; } mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *)); if (is_kfunc_acquire(&meta)) { int id = acquire_reference(env, insn_idx); if (id < 0) return id; if (is_kfunc_ret_null(&meta)) regs[BPF_REG_0].id = id; regs[BPF_REG_0].ref_obj_id = id; } else if (is_rbtree_node_type(ptr_type) || is_list_node_type(ptr_type)) { ref_set_non_owning(env, ®s[BPF_REG_0]); } if (reg_may_point_to_spin_lock(®s[BPF_REG_0]) && !regs[BPF_REG_0].id) regs[BPF_REG_0].id = ++env->id_gen; } else if (btf_type_is_void(t)) { if (meta.btf == btf_vmlinux) { if (is_bpf_obj_drop_kfunc(meta.func_id) || is_bpf_percpu_obj_drop_kfunc(meta.func_id)) { insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id); } } } if (bpf_is_kfunc_pkt_changing(&meta)) clear_all_pkt_pointers(env); nargs = btf_type_vlen(meta.func_proto); args = (const struct btf_param *)(meta.func_proto + 1); for (i = 0; i < nargs; i++) { u32 regno = i + 1; t = btf_type_skip_modifiers(desc_btf, args[i].type, NULL); if (btf_type_is_ptr(t)) mark_btf_func_reg_size(env, regno, sizeof(void *)); else /* scalar. 
ensured by check_kfunc_args() */ mark_btf_func_reg_size(env, regno, t->size); } if (bpf_is_iter_next_kfunc(&meta)) { err = process_iter_next_call(env, insn_idx, &meta); if (err) return err; } if (meta.func_id == special_kfunc_list[KF_bpf_session_cookie]) env->prog->call_session_cookie = true; if (is_bpf_throw_kfunc(insn)) return process_bpf_exit_full(env, NULL, true); return 0; } static bool check_reg_sane_offset_scalar(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, enum bpf_reg_type type) { bool known = tnum_is_const(reg->var_off); s64 val = reg->var_off.value; s64 smin = reg->smin_value; if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { verbose(env, "math between %s pointer and %lld is not allowed\n", reg_type_str(env, type), val); return false; } if (smin == S64_MIN) { verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n", reg_type_str(env, type)); return false; } if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) { verbose(env, "value %lld makes %s pointer be out of bounds\n", smin, reg_type_str(env, type)); return false; } return true; } static bool check_reg_sane_offset_ptr(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, enum bpf_reg_type type) { bool known = tnum_is_const(reg->var_off); s64 val = reg->var_off.value; s64 smin = reg->smin_value; if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { verbose(env, "%s pointer offset %lld is not allowed\n", reg_type_str(env, type), val); return false; } if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) { verbose(env, "%s pointer offset %lld is not allowed\n", reg_type_str(env, type), smin); return false; } return true; } enum { REASON_BOUNDS = -1, REASON_TYPE = -2, REASON_PATHS = -3, REASON_LIMIT = -4, REASON_STACK = -5, }; static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, u32 *alu_limit, bool mask_to_left) { u32 max = 0, ptr_limit = 0; switch (ptr_reg->type) { case PTR_TO_STACK: /* Offset 0 is out-of-bounds, but acceptable start for the * left direction, see BPF_REG_FP. Also, unknown scalar * offset where we would need to deal with min/max bounds is * currently prohibited for unprivileged. */ max = MAX_BPF_STACK + mask_to_left; ptr_limit = -ptr_reg->var_off.value; break; case PTR_TO_MAP_VALUE: max = ptr_reg->map_ptr->value_size; ptr_limit = mask_to_left ? ptr_reg->smin_value : ptr_reg->umax_value; break; default: return REASON_TYPE; } if (ptr_limit >= max) return REASON_LIMIT; *alu_limit = ptr_limit; return 0; } static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, const struct bpf_insn *insn) { return env->bypass_spec_v1 || BPF_SRC(insn->code) == BPF_K || cur_aux(env)->nospec; } static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, u32 alu_state, u32 alu_limit) { /* If we arrived here from different branches with different * state or limits to sanitize, then this won't work. */ if (aux->alu_state && (aux->alu_state != alu_state || aux->alu_limit != alu_limit)) return REASON_PATHS; /* Corresponding fixup done in do_misc_fixups(). 
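 * There the recorded alu_state and alu_limit drive insertion of a masking
 * sequence around the pointer ALU so that, even under misspeculation, the
 * scalar offset cannot move the access outside the object's bounds.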
*/ aux->alu_state = alu_state; aux->alu_limit = alu_limit; return 0; } static int sanitize_val_alu(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_insn_aux_data *aux = cur_aux(env); if (can_skip_alu_sanitation(env, insn)) return 0; return update_alu_sanitation_state(aux, BPF_ALU_NON_POINTER, 0); } static bool sanitize_needed(u8 opcode) { return opcode == BPF_ADD || opcode == BPF_SUB; } struct bpf_sanitize_info { struct bpf_insn_aux_data aux; bool mask_to_left; }; static int sanitize_speculative_path(struct bpf_verifier_env *env, const struct bpf_insn *insn, u32 next_idx, u32 curr_idx) { struct bpf_verifier_state *branch; struct bpf_reg_state *regs; branch = push_stack(env, next_idx, curr_idx, true); if (!IS_ERR(branch) && insn) { regs = branch->frame[branch->curframe]->regs; if (BPF_SRC(insn->code) == BPF_K) { mark_reg_unknown(env, regs, insn->dst_reg); } else if (BPF_SRC(insn->code) == BPF_X) { mark_reg_unknown(env, regs, insn->dst_reg); mark_reg_unknown(env, regs, insn->src_reg); } } return PTR_ERR_OR_ZERO(branch); } static int sanitize_ptr_alu(struct bpf_verifier_env *env, struct bpf_insn *insn, const struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg, struct bpf_reg_state *dst_reg, struct bpf_sanitize_info *info, const bool commit_window) { struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : &info->aux; struct bpf_verifier_state *vstate = env->cur_state; bool off_is_imm = tnum_is_const(off_reg->var_off); bool off_is_neg = off_reg->smin_value < 0; bool ptr_is_dst_reg = ptr_reg == dst_reg; u8 opcode = BPF_OP(insn->code); u32 alu_state, alu_limit; struct bpf_reg_state tmp; int err; if (can_skip_alu_sanitation(env, insn)) return 0; /* We already marked aux for masking from non-speculative * paths, thus we got here in the first place. We only care * to explore bad access from here. */ if (vstate->speculative) goto do_sim; if (!commit_window) { if (!tnum_is_const(off_reg->var_off) && (off_reg->smin_value < 0) != (off_reg->smax_value < 0)) return REASON_BOUNDS; info->mask_to_left = (opcode == BPF_ADD && off_is_neg) || (opcode == BPF_SUB && !off_is_neg); } err = retrieve_ptr_limit(ptr_reg, &alu_limit, info->mask_to_left); if (err < 0) return err; if (commit_window) { /* In commit phase we narrow the masking window based on * the observed pointer move after the simulated operation. */ alu_state = info->aux.alu_state; alu_limit = abs(info->aux.alu_limit - alu_limit); } else { alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0; alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0; alu_state |= ptr_is_dst_reg ? BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; /* Limit pruning on unknown scalars to enable deep search for * potential masking differences from other program paths. */ if (!off_is_imm) env->explore_alu_limits = true; } err = update_alu_sanitation_state(aux, alu_state, alu_limit); if (err < 0) return err; do_sim: /* If we're in commit phase, we're done here given we already * pushed the truncated dst_reg into the speculative verification * stack. * * Also, when register is a known constant, we rewrite register-based * operation to immediate-based, and thus do not need masking (and as * a consequence, do not need to simulate the zero-truncation either). */ if (commit_window || off_is_imm) return 0; /* Simulate and find potential out-of-bounds access under * speculative execution from truncation as a result of * masking when off was not within expected range. If off * sits in dst, then we temporarily need to move ptr there * to simulate dst (== 0) +/-= ptr. 
Needed, for example, * for cases where we use K-based arithmetic in one direction * and truncated reg-based in the other in order to explore * bad access. */ if (!ptr_is_dst_reg) { tmp = *dst_reg; copy_register_state(dst_reg, ptr_reg); } err = sanitize_speculative_path(env, NULL, env->insn_idx + 1, env->insn_idx); if (err < 0) return REASON_STACK; if (!ptr_is_dst_reg) *dst_reg = tmp; return 0; } static void sanitize_mark_insn_seen(struct bpf_verifier_env *env) { struct bpf_verifier_state *vstate = env->cur_state; /* If we simulate paths under speculation, we don't update the * insn as 'seen' such that when we verify unreachable paths in * the non-speculative domain, sanitize_dead_code() can still * rewrite/sanitize them. */ if (!vstate->speculative) env->insn_aux_data[env->insn_idx].seen = env->pass_cnt; } static int sanitize_err(struct bpf_verifier_env *env, const struct bpf_insn *insn, int reason, const struct bpf_reg_state *off_reg, const struct bpf_reg_state *dst_reg) { static const char *err = "pointer arithmetic with it prohibited for !root"; const char *op = BPF_OP(insn->code) == BPF_ADD ? "add" : "sub"; u32 dst = insn->dst_reg, src = insn->src_reg; switch (reason) { case REASON_BOUNDS: verbose(env, "R%d has unknown scalar with mixed signed bounds, %s\n", off_reg == dst_reg ? dst : src, err); break; case REASON_TYPE: verbose(env, "R%d has pointer with unsupported alu operation, %s\n", off_reg == dst_reg ? src : dst, err); break; case REASON_PATHS: verbose(env, "R%d tried to %s from different maps, paths or scalars, %s\n", dst, op, err); break; case REASON_LIMIT: verbose(env, "R%d tried to %s beyond pointer bounds, %s\n", dst, op, err); break; case REASON_STACK: verbose(env, "R%d could not be pushed for speculative verification, %s\n", dst, err); return -ENOMEM; default: verifier_bug(env, "unknown reason (%d)", reason); break; } return -EACCES; } /* check that stack access falls within stack limits and that 'reg' doesn't * have a variable offset. * * Variable offset is prohibited for unprivileged mode for simplicity since it * requires corresponding support in Spectre masking for stack ALU. See also * retrieve_ptr_limit(). */ static int check_stack_access_for_ptr_arithmetic( struct bpf_verifier_env *env, int regno, const struct bpf_reg_state *reg, int off) { if (!tnum_is_const(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, "R%d variable stack access prohibited for !root, var_off=%s off=%d\n", regno, tn_buf, off); return -EACCES; } if (off >= 0 || off < -MAX_BPF_STACK) { verbose(env, "R%d stack pointer arithmetic goes out of range, " "prohibited for !root; off=%d\n", regno, off); return -EACCES; } return 0; } static int sanitize_check_bounds(struct bpf_verifier_env *env, const struct bpf_insn *insn, const struct bpf_reg_state *dst_reg) { u32 dst = insn->dst_reg; /* For unprivileged we require that resulting offset must be in bounds * in order to be able to sanitize access later on. */ if (env->bypass_spec_v1) return 0; switch (dst_reg->type) { case PTR_TO_STACK: if (check_stack_access_for_ptr_arithmetic(env, dst, dst_reg, dst_reg->var_off.value)) return -EACCES; break; case PTR_TO_MAP_VALUE: if (check_map_access(env, dst, 0, 1, false, ACCESS_HELPER)) { verbose(env, "R%d pointer arithmetic of map value goes out of range, " "prohibited for !root\n", dst); return -EACCES; } break; default: return -EOPNOTSUPP; } return 0; } /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. 
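 * E.g. "r2 = r1 + r3" with r1 = PTR_TO_MAP_VALUE and r3 a bounded scalar leaves
 * r2 a map-value pointer: the fixed offset is copied from r1 while r3's range
 * is folded into the variable offset tracked below.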
* Caller should also handle BPF_MOV case separately. * If we return -EACCES, caller may want to try again treating pointer as a * scalar. So we only emit a diagnostic if !env->allow_ptr_leaks. */ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn, const struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs, *dst_reg; bool known = tnum_is_const(off_reg->var_off); s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value, smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value, umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; struct bpf_sanitize_info info = {}; u8 opcode = BPF_OP(insn->code); u32 dst = insn->dst_reg; int ret, bounds_ret; dst_reg = ®s[dst]; if ((known && (smin_val != smax_val || umin_val != umax_val)) || smin_val > smax_val || umin_val > umax_val) { /* Taint dst register if offset had invalid bounds derived from * e.g. dead branches. */ __mark_reg_unknown(env, dst_reg); return 0; } if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops on pointers produce (meaningless) scalars */ if (opcode == BPF_SUB && env->allow_ptr_leaks) { __mark_reg_unknown(env, dst_reg); return 0; } verbose(env, "R%d 32-bit pointer arithmetic prohibited\n", dst); return -EACCES; } if (ptr_reg->type & PTR_MAYBE_NULL) { verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n", dst, reg_type_str(env, ptr_reg->type)); return -EACCES; } /* * Accesses to untrusted PTR_TO_MEM are done through probe * instructions, hence no need to track offsets. */ if (base_type(ptr_reg->type) == PTR_TO_MEM && (ptr_reg->type & PTR_UNTRUSTED)) return 0; switch (base_type(ptr_reg->type)) { case PTR_TO_CTX: case PTR_TO_MAP_VALUE: case PTR_TO_MAP_KEY: case PTR_TO_STACK: case PTR_TO_PACKET_META: case PTR_TO_PACKET: case PTR_TO_TP_BUFFER: case PTR_TO_BTF_ID: case PTR_TO_MEM: case PTR_TO_BUF: case PTR_TO_FUNC: case CONST_PTR_TO_DYNPTR: break; case PTR_TO_FLOW_KEYS: if (known) break; fallthrough; case CONST_PTR_TO_MAP: /* smin_val represents the known value */ if (known && smin_val == 0 && opcode == BPF_ADD) break; fallthrough; default: verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str(env, ptr_reg->type)); return -EACCES; } /* In case of 'scalar += pointer', dst_reg inherits pointer type and id. * The id may be overwritten later if we create a new variable offset. */ dst_reg->type = ptr_reg->type; dst_reg->id = ptr_reg->id; if (!check_reg_sane_offset_scalar(env, off_reg, ptr_reg->type) || !check_reg_sane_offset_ptr(env, ptr_reg, ptr_reg->type)) return -EINVAL; /* pointer types do not carry 32-bit bounds at the moment. */ __mark_reg32_unbounded(dst_reg); if (sanitize_needed(opcode)) { ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg, &info, false); if (ret < 0) return sanitize_err(env, insn, ret, off_reg, dst_reg); } switch (opcode) { case BPF_ADD: /* * dst_reg gets the pointer type and since some positive * integer value was added to the pointer, give it a new 'id' * if it's a PTR_TO_PACKET. * this creates a new 'base' pointer, off_reg (variable) gets * added into the variable offset, and we copy the fixed offset * from ptr_reg. 
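 * E.g. with off_reg bounds [0, 16], each of dst's smin/smax and umin/umax grows
 * by [0, 16] (falling back to unbounded if the addition overflows) and var_off
 * becomes tnum_add(ptr_reg->var_off, off_reg->var_off).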
*/ if (check_add_overflow(smin_ptr, smin_val, &dst_reg->smin_value) || check_add_overflow(smax_ptr, smax_val, &dst_reg->smax_value)) { dst_reg->smin_value = S64_MIN; dst_reg->smax_value = S64_MAX; } if (check_add_overflow(umin_ptr, umin_val, &dst_reg->umin_value) || check_add_overflow(umax_ptr, umax_val, &dst_reg->umax_value)) { dst_reg->umin_value = 0; dst_reg->umax_value = U64_MAX; } dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); dst_reg->raw = ptr_reg->raw; if (reg_is_pkt_pointer(ptr_reg)) { if (!known) dst_reg->id = ++env->id_gen; /* * Clear range for unknown addends since we can't know * where the pkt pointer ended up. Also clear AT_PKT_END / * BEYOND_PKT_END from prior comparison as any pointer * arithmetic invalidates them. */ if (!known || dst_reg->range < 0) memset(&dst_reg->raw, 0, sizeof(dst_reg->raw)); } break; case BPF_SUB: if (dst_reg == off_reg) { /* scalar -= pointer. Creates an unknown scalar */ verbose(env, "R%d tried to subtract pointer from scalar\n", dst); return -EACCES; } /* We don't allow subtraction from FP, because (according to * test_verifier.c test "invalid fp arithmetic", JITs might not * be able to deal with it. */ if (ptr_reg->type == PTR_TO_STACK) { verbose(env, "R%d subtraction from stack pointer prohibited\n", dst); return -EACCES; } /* A new variable offset is created. If the subtrahend is known * nonnegative, then any reg->range we had before is still good. */ if (check_sub_overflow(smin_ptr, smax_val, &dst_reg->smin_value) || check_sub_overflow(smax_ptr, smin_val, &dst_reg->smax_value)) { /* Overflow possible, we know nothing */ dst_reg->smin_value = S64_MIN; dst_reg->smax_value = S64_MAX; } if (umin_ptr < umax_val) { /* Overflow possible, we know nothing */ dst_reg->umin_value = 0; dst_reg->umax_value = U64_MAX; } else { /* Cannot overflow (as long as bounds are consistent) */ dst_reg->umin_value = umin_ptr - umax_val; dst_reg->umax_value = umax_ptr - umin_val; } dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); dst_reg->raw = ptr_reg->raw; if (reg_is_pkt_pointer(ptr_reg)) { if (!known) dst_reg->id = ++env->id_gen; /* * Clear range if the subtrahend may be negative since * pkt pointer could move past its bounds. A positive * subtrahend moves it backwards keeping positive range * intact. Also clear AT_PKT_END / BEYOND_PKT_END from * prior comparison as arithmetic invalidates them. */ if ((!known && smin_val < 0) || dst_reg->range < 0) memset(&dst_reg->raw, 0, sizeof(dst_reg->raw)); } break; case BPF_AND: case BPF_OR: case BPF_XOR: /* bitwise ops on pointers are troublesome, prohibit. */ verbose(env, "R%d bitwise operator %s on pointer prohibited\n", dst, bpf_alu_string[opcode >> 4]); return -EACCES; default: /* other operators (e.g. 
MUL,LSH) produce non-pointer results */ verbose(env, "R%d pointer arithmetic with %s operator prohibited\n", dst, bpf_alu_string[opcode >> 4]); return -EACCES; } if (!check_reg_sane_offset_ptr(env, dst_reg, ptr_reg->type)) return -EINVAL; reg_bounds_sync(dst_reg); bounds_ret = sanitize_check_bounds(env, insn, dst_reg); if (bounds_ret == -EACCES) return bounds_ret; if (sanitize_needed(opcode)) { ret = sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg, &info, true); if (verifier_bug_if(!can_skip_alu_sanitation(env, insn) && !env->cur_state->speculative && bounds_ret && !ret, env, "Pointer type unsupported by sanitize_check_bounds() not rejected by retrieve_ptr_limit() as required")) { return -EFAULT; } if (ret < 0) return sanitize_err(env, insn, ret, off_reg, dst_reg); } return 0; } static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { s32 *dst_smin = &dst_reg->s32_min_value; s32 *dst_smax = &dst_reg->s32_max_value; u32 *dst_umin = &dst_reg->u32_min_value; u32 *dst_umax = &dst_reg->u32_max_value; u32 umin_val = src_reg->u32_min_value; u32 umax_val = src_reg->u32_max_value; bool min_overflow, max_overflow; if (check_add_overflow(*dst_smin, src_reg->s32_min_value, dst_smin) || check_add_overflow(*dst_smax, src_reg->s32_max_value, dst_smax)) { *dst_smin = S32_MIN; *dst_smax = S32_MAX; } /* If either all additions overflow or no additions overflow, then * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax = * dst_umax + src_umax. Otherwise (some additions overflow), set * the output bounds to unbounded. */ min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin); max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax); if (!min_overflow && max_overflow) { *dst_umin = 0; *dst_umax = U32_MAX; } } static void scalar_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { s64 *dst_smin = &dst_reg->smin_value; s64 *dst_smax = &dst_reg->smax_value; u64 *dst_umin = &dst_reg->umin_value; u64 *dst_umax = &dst_reg->umax_value; u64 umin_val = src_reg->umin_value; u64 umax_val = src_reg->umax_value; bool min_overflow, max_overflow; if (check_add_overflow(*dst_smin, src_reg->smin_value, dst_smin) || check_add_overflow(*dst_smax, src_reg->smax_value, dst_smax)) { *dst_smin = S64_MIN; *dst_smax = S64_MAX; } /* If either all additions overflow or no additions overflow, then * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax = * dst_umax + src_umax. Otherwise (some additions overflow), set * the output bounds to unbounded. */ min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin); max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax); if (!min_overflow && max_overflow) { *dst_umin = 0; *dst_umax = U64_MAX; } } static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { s32 *dst_smin = &dst_reg->s32_min_value; s32 *dst_smax = &dst_reg->s32_max_value; u32 *dst_umin = &dst_reg->u32_min_value; u32 *dst_umax = &dst_reg->u32_max_value; u32 umin_val = src_reg->u32_min_value; u32 umax_val = src_reg->u32_max_value; bool min_underflow, max_underflow; if (check_sub_overflow(*dst_smin, src_reg->s32_max_value, dst_smin) || check_sub_overflow(*dst_smax, src_reg->s32_min_value, dst_smax)) { /* Overflow possible, we know nothing */ *dst_smin = S32_MIN; *dst_smax = S32_MAX; } /* If either all subtractions underflow or no subtractions * underflow, it is okay to set: dst_umin = dst_umin - src_umax, * dst_umax = dst_umax - src_umin. 
Otherwise (some subtractions * underflow), set the output bounds to unbounded. */ min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin); max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax); if (min_underflow && !max_underflow) { *dst_umin = 0; *dst_umax = U32_MAX; } } static void scalar_min_max_sub(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { s64 *dst_smin = &dst_reg->smin_value; s64 *dst_smax = &dst_reg->smax_value; u64 *dst_umin = &dst_reg->umin_value; u64 *dst_umax = &dst_reg->umax_value; u64 umin_val = src_reg->umin_value; u64 umax_val = src_reg->umax_value; bool min_underflow, max_underflow; if (check_sub_overflow(*dst_smin, src_reg->smax_value, dst_smin) || check_sub_overflow(*dst_smax, src_reg->smin_value, dst_smax)) { /* Overflow possible, we know nothing */ *dst_smin = S64_MIN; *dst_smax = S64_MAX; } /* If either all subtractions underflow or no subtractions * underflow, it is okay to set: dst_umin = dst_umin - src_umax, * dst_umax = dst_umax - src_umin. Otherwise (some subtractions * underflow), set the output bounds to unbounded. */ min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin); max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax); if (min_underflow && !max_underflow) { *dst_umin = 0; *dst_umax = U64_MAX; } } static void scalar32_min_max_mul(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { s32 *dst_smin = &dst_reg->s32_min_value; s32 *dst_smax = &dst_reg->s32_max_value; u32 *dst_umin = &dst_reg->u32_min_value; u32 *dst_umax = &dst_reg->u32_max_value; s32 tmp_prod[4]; if (check_mul_overflow(*dst_umax, src_reg->u32_max_value, dst_umax) || check_mul_overflow(*dst_umin, src_reg->u32_min_value, dst_umin)) { /* Overflow possible, we know nothing */ *dst_umin = 0; *dst_umax = U32_MAX; } if (check_mul_overflow(*dst_smin, src_reg->s32_min_value, &tmp_prod[0]) || check_mul_overflow(*dst_smin, src_reg->s32_max_value, &tmp_prod[1]) || check_mul_overflow(*dst_smax, src_reg->s32_min_value, &tmp_prod[2]) || check_mul_overflow(*dst_smax, src_reg->s32_max_value, &tmp_prod[3])) { /* Overflow possible, we know nothing */ *dst_smin = S32_MIN; *dst_smax = S32_MAX; } else { *dst_smin = min_array(tmp_prod, 4); *dst_smax = max_array(tmp_prod, 4); } } static void scalar_min_max_mul(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { s64 *dst_smin = &dst_reg->smin_value; s64 *dst_smax = &dst_reg->smax_value; u64 *dst_umin = &dst_reg->umin_value; u64 *dst_umax = &dst_reg->umax_value; s64 tmp_prod[4]; if (check_mul_overflow(*dst_umax, src_reg->umax_value, dst_umax) || check_mul_overflow(*dst_umin, src_reg->umin_value, dst_umin)) { /* Overflow possible, we know nothing */ *dst_umin = 0; *dst_umax = U64_MAX; } if (check_mul_overflow(*dst_smin, src_reg->smin_value, &tmp_prod[0]) || check_mul_overflow(*dst_smin, src_reg->smax_value, &tmp_prod[1]) || check_mul_overflow(*dst_smax, src_reg->smin_value, &tmp_prod[2]) || check_mul_overflow(*dst_smax, src_reg->smax_value, &tmp_prod[3])) { /* Overflow possible, we know nothing */ *dst_smin = S64_MIN; *dst_smax = S64_MAX; } else { *dst_smin = min_array(tmp_prod, 4); *dst_smax = max_array(tmp_prod, 4); } } static void scalar32_min_max_udiv(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { u32 *dst_umin = &dst_reg->u32_min_value; u32 *dst_umax = &dst_reg->u32_max_value; u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */ *dst_umin = *dst_umin / src_val; *dst_umax = *dst_umax / src_val; /* Reset other ranges/tnum to unbounded/unknown. 
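 * E.g. a u32 dividend range [8, 40] divided by a constant 8 yields [1, 5]; the
 * signed and 64-bit views are left for the later bounds deduction to re-derive.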
*/ dst_reg->s32_min_value = S32_MIN; dst_reg->s32_max_value = S32_MAX; reset_reg64_and_tnum(dst_reg); } static void scalar_min_max_udiv(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { u64 *dst_umin = &dst_reg->umin_value; u64 *dst_umax = &dst_reg->umax_value; u64 src_val = src_reg->umin_value; /* non-zero, const divisor */ *dst_umin = div64_u64(*dst_umin, src_val); *dst_umax = div64_u64(*dst_umax, src_val); /* Reset other ranges/tnum to unbounded/unknown. */ dst_reg->smin_value = S64_MIN; dst_reg->smax_value = S64_MAX; reset_reg32_and_tnum(dst_reg); } static void scalar32_min_max_sdiv(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { s32 *dst_smin = &dst_reg->s32_min_value; s32 *dst_smax = &dst_reg->s32_max_value; s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */ s32 res1, res2; /* BPF div specification: S32_MIN / -1 = S32_MIN */ if (*dst_smin == S32_MIN && src_val == -1) { /* * If the dividend range contains more than just S32_MIN, * we cannot precisely track the result, so it becomes unbounded. * e.g., [S32_MIN, S32_MIN+10]/(-1), * = {S32_MIN} U [-(S32_MIN+10), -(S32_MIN+1)] * = {S32_MIN} U [S32_MAX-9, S32_MAX] = [S32_MIN, S32_MAX] * Otherwise (if dividend is exactly S32_MIN), result remains S32_MIN. */ if (*dst_smax != S32_MIN) { *dst_smin = S32_MIN; *dst_smax = S32_MAX; } goto reset; } res1 = *dst_smin / src_val; res2 = *dst_smax / src_val; *dst_smin = min(res1, res2); *dst_smax = max(res1, res2); reset: /* Reset other ranges/tnum to unbounded/unknown. */ dst_reg->u32_min_value = 0; dst_reg->u32_max_value = U32_MAX; reset_reg64_and_tnum(dst_reg); } static void scalar_min_max_sdiv(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { s64 *dst_smin = &dst_reg->smin_value; s64 *dst_smax = &dst_reg->smax_value; s64 src_val = src_reg->smin_value; /* non-zero, const divisor */ s64 res1, res2; /* BPF div specification: S64_MIN / -1 = S64_MIN */ if (*dst_smin == S64_MIN && src_val == -1) { /* * If the dividend range contains more than just S64_MIN, * we cannot precisely track the result, so it becomes unbounded. * e.g., [S64_MIN, S64_MIN+10]/(-1), * = {S64_MIN} U [-(S64_MIN+10), -(S64_MIN+1)] * = {S64_MIN} U [S64_MAX-9, S64_MAX] = [S64_MIN, S64_MAX] * Otherwise (if dividend is exactly S64_MIN), result remains S64_MIN. */ if (*dst_smax != S64_MIN) { *dst_smin = S64_MIN; *dst_smax = S64_MAX; } goto reset; } res1 = div64_s64(*dst_smin, src_val); res2 = div64_s64(*dst_smax, src_val); *dst_smin = min(res1, res2); *dst_smax = max(res1, res2); reset: /* Reset other ranges/tnum to unbounded/unknown. */ dst_reg->umin_value = 0; dst_reg->umax_value = U64_MAX; reset_reg32_and_tnum(dst_reg); } static void scalar32_min_max_umod(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { u32 *dst_umin = &dst_reg->u32_min_value; u32 *dst_umax = &dst_reg->u32_max_value; u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */ u32 res_max = src_val - 1; /* * If dst_umax <= res_max, the result remains unchanged. * e.g., [2, 5] % 10 = [2, 5]. */ if (*dst_umax <= res_max) return; *dst_umin = 0; *dst_umax = min(*dst_umax, res_max); /* Reset other ranges/tnum to unbounded/unknown. 
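 * E.g. [3, 27] % 8: res_max is 7 and dst_umax (27) exceeds it, so the result
 * range becomes [0, 7].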
*/ dst_reg->s32_min_value = S32_MIN; dst_reg->s32_max_value = S32_MAX; reset_reg64_and_tnum(dst_reg); } static void scalar_min_max_umod(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { u64 *dst_umin = &dst_reg->umin_value; u64 *dst_umax = &dst_reg->umax_value; u64 src_val = src_reg->umin_value; /* non-zero, const divisor */ u64 res_max = src_val - 1; /* * If dst_umax <= res_max, the result remains unchanged. * e.g., [2, 5] % 10 = [2, 5]. */ if (*dst_umax <= res_max) return; *dst_umin = 0; *dst_umax = min(*dst_umax, res_max); /* Reset other ranges/tnum to unbounded/unknown. */ dst_reg->smin_value = S64_MIN; dst_reg->smax_value = S64_MAX; reset_reg32_and_tnum(dst_reg); } static void scalar32_min_max_smod(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { s32 *dst_smin = &dst_reg->s32_min_value; s32 *dst_smax = &dst_reg->s32_max_value; s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */ /* * Safe absolute value calculation: * If src_val == S32_MIN (-2147483648), src_abs becomes 2147483648. * Here use unsigned integer to avoid overflow. */ u32 src_abs = (src_val > 0) ? (u32)src_val : -(u32)src_val; /* * Calculate the maximum possible absolute value of the result. * Even if src_abs is 2147483648 (S32_MIN), subtracting 1 gives * 2147483647 (S32_MAX), which fits perfectly in s32. */ s32 res_max_abs = src_abs - 1; /* * If the dividend is already within the result range, * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5]. */ if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs) return; /* General case: result has the same sign as the dividend. */ if (*dst_smin >= 0) { *dst_smin = 0; *dst_smax = min(*dst_smax, res_max_abs); } else if (*dst_smax <= 0) { *dst_smax = 0; *dst_smin = max(*dst_smin, -res_max_abs); } else { *dst_smin = -res_max_abs; *dst_smax = res_max_abs; } /* Reset other ranges/tnum to unbounded/unknown. */ dst_reg->u32_min_value = 0; dst_reg->u32_max_value = U32_MAX; reset_reg64_and_tnum(dst_reg); } static void scalar_min_max_smod(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { s64 *dst_smin = &dst_reg->smin_value; s64 *dst_smax = &dst_reg->smax_value; s64 src_val = src_reg->smin_value; /* non-zero, const divisor */ /* * Safe absolute value calculation: * If src_val == S64_MIN (-2^63), src_abs becomes 2^63. * Here use unsigned integer to avoid overflow. */ u64 src_abs = (src_val > 0) ? (u64)src_val : -(u64)src_val; /* * Calculate the maximum possible absolute value of the result. * Even if src_abs is 2^63 (S64_MIN), subtracting 1 gives * 2^63 - 1 (S64_MAX), which fits perfectly in s64. */ s64 res_max_abs = src_abs - 1; /* * If the dividend is already within the result range, * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5]. */ if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs) return; /* General case: result has the same sign as the dividend. */ if (*dst_smin >= 0) { *dst_smin = 0; *dst_smax = min(*dst_smax, res_max_abs); } else if (*dst_smax <= 0) { *dst_smax = 0; *dst_smin = max(*dst_smin, -res_max_abs); } else { *dst_smin = -res_max_abs; *dst_smax = res_max_abs; } /* Reset other ranges/tnum to unbounded/unknown. 
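 * E.g. [-20, 3] smod 7: res_max_abs is 6 and the dividend range crosses zero,
 * so the signed result range becomes [-6, 6].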
*/ dst_reg->umin_value = 0; dst_reg->umax_value = U64_MAX; reset_reg32_and_tnum(dst_reg); } static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); u32 umax_val = src_reg->u32_max_value; if (src_known && dst_known) { __mark_reg32_known(dst_reg, var32_off.value); return; } /* We get our minimum from the var_off, since that's inherently * bitwise. Our maximum is the minimum of the operands' maxima. */ dst_reg->u32_min_value = var32_off.value; dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val); /* Safe to set s32 bounds by casting u32 result into s32 when u32 * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. */ if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { dst_reg->s32_min_value = dst_reg->u32_min_value; dst_reg->s32_max_value = dst_reg->u32_max_value; } else { dst_reg->s32_min_value = S32_MIN; dst_reg->s32_max_value = S32_MAX; } } static void scalar_min_max_and(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { bool src_known = tnum_is_const(src_reg->var_off); bool dst_known = tnum_is_const(dst_reg->var_off); u64 umax_val = src_reg->umax_value; if (src_known && dst_known) { __mark_reg_known(dst_reg, dst_reg->var_off.value); return; } /* We get our minimum from the var_off, since that's inherently * bitwise. Our maximum is the minimum of the operands' maxima. */ dst_reg->umin_value = dst_reg->var_off.value; dst_reg->umax_value = min(dst_reg->umax_value, umax_val); /* Safe to set s64 bounds by casting u64 result into s64 when u64 * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. */ if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { dst_reg->smin_value = dst_reg->umin_value; dst_reg->smax_value = dst_reg->umax_value; } else { dst_reg->smin_value = S64_MIN; dst_reg->smax_value = S64_MAX; } /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); } static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); u32 umin_val = src_reg->u32_min_value; if (src_known && dst_known) { __mark_reg32_known(dst_reg, var32_off.value); return; } /* We get our maximum from the var_off, and our minimum is the * maximum of the operands' minima */ dst_reg->u32_min_value = max(dst_reg->u32_min_value, umin_val); dst_reg->u32_max_value = var32_off.value | var32_off.mask; /* Safe to set s32 bounds by casting u32 result into s32 when u32 * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. 
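 * E.g. u32 bounds [1, 0x7fffffff] carry over to s32 unchanged, while
 * [1, 0x80000001] crosses the sign boundary ((s32)0x80000001 is negative), so
 * the s32 bounds stay unbounded.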
*/ if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { dst_reg->s32_min_value = dst_reg->u32_min_value; dst_reg->s32_max_value = dst_reg->u32_max_value; } else { dst_reg->s32_min_value = S32_MIN; dst_reg->s32_max_value = S32_MAX; } } static void scalar_min_max_or(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { bool src_known = tnum_is_const(src_reg->var_off); bool dst_known = tnum_is_const(dst_reg->var_off); u64 umin_val = src_reg->umin_value; if (src_known && dst_known) { __mark_reg_known(dst_reg, dst_reg->var_off.value); return; } /* We get our maximum from the var_off, and our minimum is the * maximum of the operands' minima */ dst_reg->umin_value = max(dst_reg->umin_value, umin_val); dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask; /* Safe to set s64 bounds by casting u64 result into s64 when u64 * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. */ if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { dst_reg->smin_value = dst_reg->umin_value; dst_reg->smax_value = dst_reg->umax_value; } else { dst_reg->smin_value = S64_MIN; dst_reg->smax_value = S64_MAX; } /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); } static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); if (src_known && dst_known) { __mark_reg32_known(dst_reg, var32_off.value); return; } /* We get both minimum and maximum from the var32_off. */ dst_reg->u32_min_value = var32_off.value; dst_reg->u32_max_value = var32_off.value | var32_off.mask; /* Safe to set s32 bounds by casting u32 result into s32 when u32 * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. */ if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { dst_reg->s32_min_value = dst_reg->u32_min_value; dst_reg->s32_max_value = dst_reg->u32_max_value; } else { dst_reg->s32_min_value = S32_MIN; dst_reg->s32_max_value = S32_MAX; } } static void scalar_min_max_xor(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { bool src_known = tnum_is_const(src_reg->var_off); bool dst_known = tnum_is_const(dst_reg->var_off); if (src_known && dst_known) { /* dst_reg->var_off.value has been updated earlier */ __mark_reg_known(dst_reg, dst_reg->var_off.value); return; } /* We get both minimum and maximum from the var_off. */ dst_reg->umin_value = dst_reg->var_off.value; dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask; /* Safe to set s64 bounds by casting u64 result into s64 when u64 * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. 
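 * E.g. a var_off of (value 0x4, mask 0x3) gave u64 bounds [0x4, 0x7] above;
 * both fit in s64, so they carry over directly as the signed bounds.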
*/ if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { dst_reg->smin_value = dst_reg->umin_value; dst_reg->smax_value = dst_reg->umax_value; } else { dst_reg->smin_value = S64_MIN; dst_reg->smax_value = S64_MAX; } __update_reg_bounds(dst_reg); } static void __scalar32_min_max_lsh(struct bpf_reg_state *dst_reg, u64 umin_val, u64 umax_val) { /* We lose all sign bit information (except what we can pick * up from var_off) */ dst_reg->s32_min_value = S32_MIN; dst_reg->s32_max_value = S32_MAX; /* If we might shift our top bit out, then we know nothing */ if (umax_val > 31 || dst_reg->u32_max_value > 1ULL << (31 - umax_val)) { dst_reg->u32_min_value = 0; dst_reg->u32_max_value = U32_MAX; } else { dst_reg->u32_min_value <<= umin_val; dst_reg->u32_max_value <<= umax_val; } } static void scalar32_min_max_lsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { u32 umax_val = src_reg->u32_max_value; u32 umin_val = src_reg->u32_min_value; /* u32 alu operation will zext upper bits */ struct tnum subreg = tnum_subreg(dst_reg->var_off); __scalar32_min_max_lsh(dst_reg, umin_val, umax_val); dst_reg->var_off = tnum_subreg(tnum_lshift(subreg, umin_val)); /* Not required but being careful mark reg64 bounds as unknown so * that we are forced to pick them up from tnum and zext later and * if some path skips this step we are still safe. */ __mark_reg64_unbounded(dst_reg); __update_reg32_bounds(dst_reg); } static void __scalar64_min_max_lsh(struct bpf_reg_state *dst_reg, u64 umin_val, u64 umax_val) { /* Special case <<32 because it is a common compiler pattern to sign * extend subreg by doing <<32 s>>32. smin/smax assignments are correct * because s32 bounds don't flip sign when shifting to the left by * 32bits. */ if (umin_val == 32 && umax_val == 32) { dst_reg->smax_value = (s64)dst_reg->s32_max_value << 32; dst_reg->smin_value = (s64)dst_reg->s32_min_value << 32; } else { dst_reg->smax_value = S64_MAX; dst_reg->smin_value = S64_MIN; } /* If we might shift our top bit out, then we know nothing */ if (dst_reg->umax_value > 1ULL << (63 - umax_val)) { dst_reg->umin_value = 0; dst_reg->umax_value = U64_MAX; } else { dst_reg->umin_value <<= umin_val; dst_reg->umax_value <<= umax_val; } } static void scalar_min_max_lsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { u64 umax_val = src_reg->umax_value; u64 umin_val = src_reg->umin_value; /* scalar64 calc uses 32bit unshifted bounds so must be called first */ __scalar64_min_max_lsh(dst_reg, umin_val, umax_val); __scalar32_min_max_lsh(dst_reg, umin_val, umax_val); dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val); /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); } static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { struct tnum subreg = tnum_subreg(dst_reg->var_off); u32 umax_val = src_reg->u32_max_value; u32 umin_val = src_reg->u32_min_value; /* BPF_RSH is an unsigned shift. If the value in dst_reg might * be negative, then either: * 1) src_reg might be zero, so the sign bit of the result is * unknown, so we lose our signed bounds * 2) it's known negative, thus the unsigned bounds capture the * signed bounds * 3) the signed bounds cross zero, so they tell us nothing * about the result * If the value in dst_reg is known nonnegative, then again the * unsigned bounds capture the signed bounds. * Thus, in all cases it suffices to blow away our signed bounds * and rely on inferring new ones from the unsigned bounds and * var_off of the result. 
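	 * E.g. (illustrative values): with dst_reg in u32 range
	 * [0xfffffffe, 0xffffffff] and a shift amount in [0, 1], the result
	 * may be 0x7fffffff or 0xffffffff, so the old s32 bounds say nothing,
	 * while the new u32 bounds [0xfffffffe >> 1, 0xffffffff >> 0] remain
	 * sound.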
	 */
	dst_reg->s32_min_value = S32_MIN;
	dst_reg->s32_max_value = S32_MAX;
	dst_reg->var_off = tnum_rshift(subreg, umin_val);
	dst_reg->u32_min_value >>= umax_val;
	dst_reg->u32_max_value >>= umin_val;
	__mark_reg64_unbounded(dst_reg);
	__update_reg32_bounds(dst_reg);
}

static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg,
			       struct bpf_reg_state *src_reg)
{
	u64 umax_val = src_reg->umax_value;
	u64 umin_val = src_reg->umin_value;

	/* BPF_RSH is an unsigned shift. If the value in dst_reg might
	 * be negative, then either:
	 * 1) src_reg might be zero, so the sign bit of the result is
	 *    unknown, so we lose our signed bounds
	 * 2) it's known negative, thus the unsigned bounds capture the
	 *    signed bounds
	 * 3) the signed bounds cross zero, so they tell us nothing
	 *    about the result
	 * If the value in dst_reg is known nonnegative, then again the
	 * unsigned bounds capture the signed bounds.
	 * Thus, in all cases it suffices to blow away our signed bounds
	 * and rely on inferring new ones from the unsigned bounds and
	 * var_off of the result.
	 */
	dst_reg->smin_value = S64_MIN;
	dst_reg->smax_value = S64_MAX;
	dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val);
	dst_reg->umin_value >>= umax_val;
	dst_reg->umax_value >>= umin_val;

	/* It's not easy to operate on alu32 bounds here because it depends
	 * on bits being shifted in. Take easy way out and mark unbounded
	 * so we can recalculate later from tnum.
	 */
	__mark_reg32_unbounded(dst_reg);
	__update_reg_bounds(dst_reg);
}

static void scalar32_min_max_arsh(struct bpf_reg_state *dst_reg,
				  struct bpf_reg_state *src_reg)
{
	u64 umin_val = src_reg->u32_min_value;

	/* Upon reaching here, src_known is true and
	 * umax_val is equal to umin_val.
	 */
	dst_reg->s32_min_value = (u32)(((s32)dst_reg->s32_min_value) >> umin_val);
	dst_reg->s32_max_value = (u32)(((s32)dst_reg->s32_max_value) >> umin_val);

	dst_reg->var_off = tnum_arshift(tnum_subreg(dst_reg->var_off), umin_val, 32);

	/* blow away the dst_reg umin_value/umax_value and rely on
	 * dst_reg var_off to refine the result.
	 */
	dst_reg->u32_min_value = 0;
	dst_reg->u32_max_value = U32_MAX;
	__mark_reg64_unbounded(dst_reg);
	__update_reg32_bounds(dst_reg);
}

static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg,
				struct bpf_reg_state *src_reg)
{
	u64 umin_val = src_reg->umin_value;

	/* Upon reaching here, src_known is true and umax_val is equal
	 * to umin_val.
	 */
	dst_reg->smin_value >>= umin_val;
	dst_reg->smax_value >>= umin_val;

	dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val, 64);

	/* blow away the dst_reg umin_value/umax_value and rely on
	 * dst_reg var_off to refine the result.
	 */
	dst_reg->umin_value = 0;
	dst_reg->umax_value = U64_MAX;

	/* It's not easy to operate on alu32 bounds here because it depends
	 * on bits being shifted in from upper 32-bits. Take easy way out
	 * and mark unbounded so we can recalculate later from tnum.
	 */
	__mark_reg32_unbounded(dst_reg);
	__update_reg_bounds(dst_reg);
}

static void scalar_byte_swap(struct bpf_reg_state *dst_reg, struct bpf_insn *insn)
{
	/*
	 * Byte swap operation - update var_off using tnum_bswap.
	 * Three cases:
	 * 1. bswap(16|32|64): opcode=0xd7 (BPF_END | BPF_ALU64 | BPF_TO_LE)
	 *    unconditional swap
	 * 2. to_le(16|32|64): opcode=0xd4 (BPF_END | BPF_ALU | BPF_TO_LE)
	 *    swap on big-endian, truncation or no-op on little-endian
	 * 3.
to_be(16|32|64): opcode=0xdc (BPF_END | BPF_ALU | BPF_TO_BE) * swap on little-endian, truncation or no-op on big-endian */ bool alu64 = BPF_CLASS(insn->code) == BPF_ALU64; bool to_le = BPF_SRC(insn->code) == BPF_TO_LE; bool is_big_endian; #ifdef CONFIG_CPU_BIG_ENDIAN is_big_endian = true; #else is_big_endian = false; #endif /* Apply bswap if alu64 or switch between big-endian and little-endian machines */ bool need_bswap = alu64 || (to_le == is_big_endian); /* * If the register is mutated, manually reset its scalar ID to break * any existing ties and avoid incorrect bounds propagation. */ if (need_bswap || insn->imm == 16 || insn->imm == 32) clear_scalar_id(dst_reg); if (need_bswap) { if (insn->imm == 16) dst_reg->var_off = tnum_bswap16(dst_reg->var_off); else if (insn->imm == 32) dst_reg->var_off = tnum_bswap32(dst_reg->var_off); else if (insn->imm == 64) dst_reg->var_off = tnum_bswap64(dst_reg->var_off); /* * Byteswap scrambles the range, so we must reset bounds. * Bounds will be re-derived from the new tnum later. */ __mark_reg_unbounded(dst_reg); } /* For bswap16/32, truncate dst register to match the swapped size */ if (insn->imm == 16 || insn->imm == 32) coerce_reg_to_size(dst_reg, insn->imm / 8); } static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, const struct bpf_reg_state *src_reg) { bool src_is_const = false; u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; if (insn_bitness == 32) { if (tnum_subreg_is_const(src_reg->var_off) && src_reg->s32_min_value == src_reg->s32_max_value && src_reg->u32_min_value == src_reg->u32_max_value) src_is_const = true; } else { if (tnum_is_const(src_reg->var_off) && src_reg->smin_value == src_reg->smax_value && src_reg->umin_value == src_reg->umax_value) src_is_const = true; } switch (BPF_OP(insn->code)) { case BPF_ADD: case BPF_SUB: case BPF_NEG: case BPF_AND: case BPF_XOR: case BPF_OR: case BPF_MUL: case BPF_END: return true; /* * Division and modulo operators range is only safe to compute when the * divisor is a constant. */ case BPF_DIV: case BPF_MOD: return src_is_const; /* Shift operators range is only computable if shift dimension operand * is a constant. Shifts greater than 31 or 63 are undefined. This * includes shifts by a negative number. */ case BPF_LSH: case BPF_RSH: case BPF_ARSH: return (src_is_const && src_reg->umax_value < insn_bitness); default: return false; } } static int maybe_fork_scalars(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_reg_state *dst_reg) { struct bpf_verifier_state *branch; struct bpf_reg_state *regs; bool alu32; if (dst_reg->smin_value == -1 && dst_reg->smax_value == 0) alu32 = false; else if (dst_reg->s32_min_value == -1 && dst_reg->s32_max_value == 0) alu32 = true; else return 0; branch = push_stack(env, env->insn_idx, env->insn_idx, false); if (IS_ERR(branch)) return PTR_ERR(branch); regs = branch->frame[branch->curframe]->regs; if (alu32) { __mark_reg32_known(®s[insn->dst_reg], 0); __mark_reg32_known(dst_reg, -1ull); } else { __mark_reg_known(®s[insn->dst_reg], 0); __mark_reg_known(dst_reg, -1ull); } return 0; } /* WARNING: This function does calculations on 64-bit values, but the actual * execution may occur on 32-bit values. Therefore, things like bitshifts * need extra checks in the 32-bit case. 
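 * E.g. a shift amount of 40 is fine for a 64-bit ALU op but undefined for
 * a 32-bit one; is_safe_to_compute_dst_reg_range() therefore requires the
 * shift amount to be a constant below insn_bitness before shifts are
 * tracked at all.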
*/ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_reg_state *dst_reg, struct bpf_reg_state src_reg) { u8 opcode = BPF_OP(insn->code); s16 off = insn->off; bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64); int ret; if (!is_safe_to_compute_dst_reg_range(insn, &src_reg)) { __mark_reg_unknown(env, dst_reg); return 0; } if (sanitize_needed(opcode)) { ret = sanitize_val_alu(env, insn); if (ret < 0) return sanitize_err(env, insn, ret, NULL, NULL); } /* Calculate sign/unsigned bounds and tnum for alu32 and alu64 bit ops. * There are two classes of instructions: The first class we track both * alu32 and alu64 sign/unsigned bounds independently this provides the * greatest amount of precision when alu operations are mixed with jmp32 * operations. These operations are BPF_ADD, BPF_SUB, BPF_MUL, BPF_ADD, * and BPF_OR. This is possible because these ops have fairly easy to * understand and calculate behavior in both 32-bit and 64-bit alu ops. * See alu32 verifier tests for examples. The second class of * operations, BPF_LSH, BPF_RSH, and BPF_ARSH, however are not so easy * with regards to tracking sign/unsigned bounds because the bits may * cross subreg boundaries in the alu64 case. When this happens we mark * the reg unbounded in the subreg bound space and use the resulting * tnum to calculate an approximation of the sign/unsigned bounds. */ switch (opcode) { case BPF_ADD: scalar32_min_max_add(dst_reg, &src_reg); scalar_min_max_add(dst_reg, &src_reg); dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off); break; case BPF_SUB: scalar32_min_max_sub(dst_reg, &src_reg); scalar_min_max_sub(dst_reg, &src_reg); dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off); break; case BPF_NEG: env->fake_reg[0] = *dst_reg; __mark_reg_known(dst_reg, 0); scalar32_min_max_sub(dst_reg, &env->fake_reg[0]); scalar_min_max_sub(dst_reg, &env->fake_reg[0]); dst_reg->var_off = tnum_neg(env->fake_reg[0].var_off); break; case BPF_MUL: dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off); scalar32_min_max_mul(dst_reg, &src_reg); scalar_min_max_mul(dst_reg, &src_reg); break; case BPF_DIV: /* BPF div specification: x / 0 = 0 */ if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0)) { ___mark_reg_known(dst_reg, 0); break; } if (alu32) if (off == 1) scalar32_min_max_sdiv(dst_reg, &src_reg); else scalar32_min_max_udiv(dst_reg, &src_reg); else if (off == 1) scalar_min_max_sdiv(dst_reg, &src_reg); else scalar_min_max_udiv(dst_reg, &src_reg); break; case BPF_MOD: /* BPF mod specification: x % 0 = x */ if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0)) break; if (alu32) if (off == 1) scalar32_min_max_smod(dst_reg, &src_reg); else scalar32_min_max_umod(dst_reg, &src_reg); else if (off == 1) scalar_min_max_smod(dst_reg, &src_reg); else scalar_min_max_umod(dst_reg, &src_reg); break; case BPF_AND: if (tnum_is_const(src_reg.var_off)) { ret = maybe_fork_scalars(env, insn, dst_reg); if (ret) return ret; } dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off); scalar32_min_max_and(dst_reg, &src_reg); scalar_min_max_and(dst_reg, &src_reg); break; case BPF_OR: if (tnum_is_const(src_reg.var_off)) { ret = maybe_fork_scalars(env, insn, dst_reg); if (ret) return ret; } dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off); scalar32_min_max_or(dst_reg, &src_reg); scalar_min_max_or(dst_reg, &src_reg); break; case BPF_XOR: dst_reg->var_off = tnum_xor(dst_reg->var_off, src_reg.var_off); 
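		/* Keep this ordering: scalar{32,}_min_max_xor() below derive
		 * their unsigned bounds directly from the var_off that was
		 * just computed by tnum_xor().
		 */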
scalar32_min_max_xor(dst_reg, &src_reg); scalar_min_max_xor(dst_reg, &src_reg); break; case BPF_LSH: if (alu32) scalar32_min_max_lsh(dst_reg, &src_reg); else scalar_min_max_lsh(dst_reg, &src_reg); break; case BPF_RSH: if (alu32) scalar32_min_max_rsh(dst_reg, &src_reg); else scalar_min_max_rsh(dst_reg, &src_reg); break; case BPF_ARSH: if (alu32) scalar32_min_max_arsh(dst_reg, &src_reg); else scalar_min_max_arsh(dst_reg, &src_reg); break; case BPF_END: scalar_byte_swap(dst_reg, insn); break; default: break; } /* * ALU32 ops are zero extended into 64bit register. * * BPF_END is already handled inside the helper (truncation), * so skip zext here to avoid unexpected zero extension. * e.g., le64: opcode=(BPF_END|BPF_ALU|BPF_TO_LE), imm=0x40 * This is a 64bit byte swap operation with alu32==true, * but we should not zero extend the result. */ if (alu32 && opcode != BPF_END) zext_32_to_64(dst_reg); reg_bounds_sync(dst_reg); return 0; } /* Handles ALU ops other than BPF_END, BPF_NEG and BPF_MOV: computes new min/max * and var_off. */ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg; struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64); u8 opcode = BPF_OP(insn->code); int err; dst_reg = ®s[insn->dst_reg]; if (BPF_SRC(insn->code) == BPF_X) src_reg = ®s[insn->src_reg]; else src_reg = NULL; /* Case where at least one operand is an arena. */ if (dst_reg->type == PTR_TO_ARENA || (src_reg && src_reg->type == PTR_TO_ARENA)) { struct bpf_insn_aux_data *aux = cur_aux(env); if (dst_reg->type != PTR_TO_ARENA) *dst_reg = *src_reg; dst_reg->subreg_def = env->insn_idx + 1; if (BPF_CLASS(insn->code) == BPF_ALU64) /* * 32-bit operations zero upper bits automatically. * 64-bit operations need to be converted to 32. */ aux->needs_zext = true; /* Any arithmetic operations are allowed on arena pointers */ return 0; } if (dst_reg->type != SCALAR_VALUE) ptr_reg = dst_reg; if (BPF_SRC(insn->code) == BPF_X) { if (src_reg->type != SCALAR_VALUE) { if (dst_reg->type != SCALAR_VALUE) { /* Combining two pointers by any ALU op yields * an arbitrary scalar. Disallow all math except * pointer subtraction */ if (opcode == BPF_SUB && env->allow_ptr_leaks) { mark_reg_unknown(env, regs, insn->dst_reg); return 0; } verbose(env, "R%d pointer %s pointer prohibited\n", insn->dst_reg, bpf_alu_string[opcode >> 4]); return -EACCES; } else { /* scalar += pointer * This is legal, but we have to reverse our * src/dest handling in computing the range */ err = mark_chain_precision(env, insn->dst_reg); if (err) return err; return adjust_ptr_min_max_vals(env, insn, src_reg, dst_reg); } } else if (ptr_reg) { /* pointer += scalar */ err = mark_chain_precision(env, insn->src_reg); if (err) return err; return adjust_ptr_min_max_vals(env, insn, dst_reg, src_reg); } else if (dst_reg->precise) { /* if dst_reg is precise, src_reg should be precise as well */ err = mark_chain_precision(env, insn->src_reg); if (err) return err; } } else { /* Pretend the src is a reg with a known value, since we only * need to be able to read from this state. 
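		 * E.g. for BPF_ADD|BPF_K with imm=42, off_reg becomes a scalar
		 * known to be exactly 42 and is then fed through the same
		 * paths as a real register source.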
*/ off_reg.type = SCALAR_VALUE; __mark_reg_known(&off_reg, insn->imm); src_reg = &off_reg; if (ptr_reg) /* pointer += K */ return adjust_ptr_min_max_vals(env, insn, ptr_reg, src_reg); } /* Got here implies adding two SCALAR_VALUEs */ if (WARN_ON_ONCE(ptr_reg)) { print_verifier_state(env, vstate, vstate->curframe, true); verbose(env, "verifier internal error: unexpected ptr_reg\n"); return -EFAULT; } if (WARN_ON(!src_reg)) { print_verifier_state(env, vstate, vstate->curframe, true); verbose(env, "verifier internal error: no src_reg\n"); return -EFAULT; } /* * For alu32 linked register tracking, we need to check dst_reg's * umax_value before the ALU operation. After adjust_scalar_min_max_vals(), * alu32 ops will have zero-extended the result, making umax_value <= U32_MAX. */ u64 dst_umax = dst_reg->umax_value; err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); if (err) return err; /* * Compilers can generate the code * r1 = r2 * r1 += 0x1 * if r2 < 1000 goto ... * use r1 in memory access * So remember constant delta between r2 and r1 and update r1 after * 'if' condition. */ if (env->bpf_capable && (BPF_OP(insn->code) == BPF_ADD || BPF_OP(insn->code) == BPF_SUB) && dst_reg->id && is_reg_const(src_reg, alu32) && !(BPF_SRC(insn->code) == BPF_X && insn->src_reg == insn->dst_reg)) { u64 val = reg_const_value(src_reg, alu32); s32 off; if (!alu32 && ((s64)val < S32_MIN || (s64)val > S32_MAX)) goto clear_id; if (alu32 && (dst_umax > U32_MAX)) goto clear_id; off = (s32)val; if (BPF_OP(insn->code) == BPF_SUB) { /* Negating S32_MIN would overflow */ if (off == S32_MIN) goto clear_id; off = -off; } if (dst_reg->id & BPF_ADD_CONST) { /* * If the register already went through rX += val * we cannot accumulate another val into rx->off. */ clear_id: clear_scalar_id(dst_reg); } else { if (alu32) dst_reg->id |= BPF_ADD_CONST32; else dst_reg->id |= BPF_ADD_CONST64; dst_reg->delta = off; } } else { /* * Make sure ID is cleared otherwise dst_reg min/max could be * incorrectly propagated into other registers by sync_linked_regs() */ clear_scalar_id(dst_reg); } return 0; } /* check validity of 32-bit and 64-bit arithmetic operations */ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_reg_state *regs = cur_regs(env); u8 opcode = BPF_OP(insn->code); int err; if (opcode == BPF_END || opcode == BPF_NEG) { /* check src operand */ err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) return err; if (is_pointer_value(env, insn->dst_reg)) { verbose(env, "R%d pointer arithmetic prohibited\n", insn->dst_reg); return -EACCES; } /* check dest operand */ if (regs[insn->dst_reg].type == SCALAR_VALUE) { err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); err = err ?: adjust_scalar_min_max_vals(env, insn, ®s[insn->dst_reg], regs[insn->dst_reg]); } else { err = check_reg_arg(env, insn->dst_reg, DST_OP); } if (err) return err; } else if (opcode == BPF_MOV) { if (BPF_SRC(insn->code) == BPF_X) { if (insn->off == BPF_ADDR_SPACE_CAST) { if (!env->prog->aux->arena) { verbose(env, "addr_space_cast insn can only be used in a program that has an associated arena\n"); return -EINVAL; } } /* check src operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; } /* check dest operand, mark as required later */ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); if (err) return err; if (BPF_SRC(insn->code) == BPF_X) { struct bpf_reg_state *src_reg = regs + insn->src_reg; struct bpf_reg_state *dst_reg = regs + insn->dst_reg; if (BPF_CLASS(insn->code) == 
BPF_ALU64) { if (insn->imm) { /* off == BPF_ADDR_SPACE_CAST */ mark_reg_unknown(env, regs, insn->dst_reg); if (insn->imm == 1) { /* cast from as(1) to as(0) */ dst_reg->type = PTR_TO_ARENA; /* PTR_TO_ARENA is 32-bit */ dst_reg->subreg_def = env->insn_idx + 1; } } else if (insn->off == 0) { /* case: R1 = R2 * copy register state to dest reg */ assign_scalar_id_before_mov(env, src_reg); copy_register_state(dst_reg, src_reg); dst_reg->subreg_def = DEF_NOT_SUBREG; } else { /* case: R1 = (s8, s16 s32)R2 */ if (is_pointer_value(env, insn->src_reg)) { verbose(env, "R%d sign-extension part of pointer\n", insn->src_reg); return -EACCES; } else if (src_reg->type == SCALAR_VALUE) { bool no_sext; no_sext = src_reg->umax_value < (1ULL << (insn->off - 1)); if (no_sext) assign_scalar_id_before_mov(env, src_reg); copy_register_state(dst_reg, src_reg); if (!no_sext) clear_scalar_id(dst_reg); coerce_reg_to_size_sx(dst_reg, insn->off >> 3); dst_reg->subreg_def = DEF_NOT_SUBREG; } else { mark_reg_unknown(env, regs, insn->dst_reg); } } } else { /* R1 = (u32) R2 */ if (is_pointer_value(env, insn->src_reg)) { verbose(env, "R%d partial copy of pointer\n", insn->src_reg); return -EACCES; } else if (src_reg->type == SCALAR_VALUE) { if (insn->off == 0) { bool is_src_reg_u32 = get_reg_width(src_reg) <= 32; if (is_src_reg_u32) assign_scalar_id_before_mov(env, src_reg); copy_register_state(dst_reg, src_reg); /* Make sure ID is cleared if src_reg is not in u32 * range otherwise dst_reg min/max could be incorrectly * propagated into src_reg by sync_linked_regs() */ if (!is_src_reg_u32) clear_scalar_id(dst_reg); dst_reg->subreg_def = env->insn_idx + 1; } else { /* case: W1 = (s8, s16)W2 */ bool no_sext = src_reg->umax_value < (1ULL << (insn->off - 1)); if (no_sext) assign_scalar_id_before_mov(env, src_reg); copy_register_state(dst_reg, src_reg); if (!no_sext) clear_scalar_id(dst_reg); dst_reg->subreg_def = env->insn_idx + 1; coerce_subreg_to_size_sx(dst_reg, insn->off >> 3); } } else { mark_reg_unknown(env, regs, insn->dst_reg); } zext_32_to_64(dst_reg); reg_bounds_sync(dst_reg); } } else { /* case: R = imm * remember the value we stored into this reg */ /* clear any state __mark_reg_known doesn't set */ mark_reg_unknown(env, regs, insn->dst_reg); regs[insn->dst_reg].type = SCALAR_VALUE; if (BPF_CLASS(insn->code) == BPF_ALU64) { __mark_reg_known(regs + insn->dst_reg, insn->imm); } else { __mark_reg_known(regs + insn->dst_reg, (u32)insn->imm); } } } else { /* all other ALU ops: and, sub, xor, add, ... */ if (BPF_SRC(insn->code) == BPF_X) { /* check src1 operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; } /* check src2 operand */ err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) return err; if ((opcode == BPF_MOD || opcode == BPF_DIV) && BPF_SRC(insn->code) == BPF_K && insn->imm == 0) { verbose(env, "div by zero\n"); return -EINVAL; } if ((opcode == BPF_LSH || opcode == BPF_RSH || opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 
64 : 32; if (insn->imm < 0 || insn->imm >= size) { verbose(env, "invalid shift %d\n", insn->imm); return -EINVAL; } } /* check dest operand */ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); err = err ?: adjust_reg_min_max_vals(env, insn); if (err) return err; } return reg_bounds_sanity_check(env, ®s[insn->dst_reg], "alu"); } static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, struct bpf_reg_state *dst_reg, enum bpf_reg_type type, bool range_right_open) { struct bpf_func_state *state; struct bpf_reg_state *reg; int new_range; if (dst_reg->umax_value == 0 && range_right_open) /* This doesn't give us any range */ return; if (dst_reg->umax_value > MAX_PACKET_OFF) /* Risk of overflow. For instance, ptr + (1<<63) may be less * than pkt_end, but that's because it's also less than pkt. */ return; new_range = dst_reg->umax_value; if (range_right_open) new_range++; /* Examples for register markings: * * pkt_data in dst register: * * r2 = r3; * r2 += 8; * if (r2 > pkt_end) goto <handle exception> * <access okay> * * r2 = r3; * r2 += 8; * if (r2 < pkt_end) goto <access okay> * <handle exception> * * Where: * r2 == dst_reg, pkt_end == src_reg * r2=pkt(id=n,off=8,r=0) * r3=pkt(id=n,off=0,r=0) * * pkt_data in src register: * * r2 = r3; * r2 += 8; * if (pkt_end >= r2) goto <access okay> * <handle exception> * * r2 = r3; * r2 += 8; * if (pkt_end <= r2) goto <handle exception> * <access okay> * * Where: * pkt_end == dst_reg, r2 == src_reg * r2=pkt(id=n,off=8,r=0) * r3=pkt(id=n,off=0,r=0) * * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) * or r3=pkt(id=n,off=0,r=8-1), so that range of bytes [r3, r3 + 8) * and [r3, r3 + 8-1) respectively is safe to access depending on * the check. */ /* If our ids match, then we must have the same max_value. And we * don't care about the other reg's fixed offset, since if it's too big * the range won't allow anything. * dst_reg->umax_value is known < MAX_PACKET_OFF, therefore it fits in a u16. */ bpf_for_each_reg_in_vstate(vstate, state, reg, ({ if (reg->type == type && reg->id == dst_reg->id) /* keep the maximum range already checked */ reg->range = max(reg->range, new_range); })); } static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, u8 opcode, bool is_jmp32); static u8 rev_opcode(u8 opcode); /* * Learn more information about live branches by simulating refinement on both branches. * regs_refine_cond_op() is sound, so producing ill-formed register bounds for the branch means * that branch is dead. */ static int simulate_both_branches_taken(struct bpf_verifier_env *env, u8 opcode, bool is_jmp32) { /* Fallthrough (FALSE) branch */ regs_refine_cond_op(&env->false_reg1, &env->false_reg2, rev_opcode(opcode), is_jmp32); reg_bounds_sync(&env->false_reg1); reg_bounds_sync(&env->false_reg2); /* * If there is a range bounds violation in *any* of the abstract values in either * reg_states in the FALSE branch (i.e. reg1, reg2), the FALSE branch must be dead. Only * TRUE branch will be taken. */ if (range_bounds_violation(&env->false_reg1) || range_bounds_violation(&env->false_reg2)) return 1; /* Jump (TRUE) branch */ regs_refine_cond_op(&env->true_reg1, &env->true_reg2, opcode, is_jmp32); reg_bounds_sync(&env->true_reg1); reg_bounds_sync(&env->true_reg2); /* * If there is a range bounds violation in *any* of the abstract values in either * reg_states in the TRUE branch (i.e. true_reg1, true_reg2), the TRUE branch must be dead. * Only FALSE branch will be taken. 
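 * (Return convention: 1 means only the TRUE branch is live, 0 means only
 * the FALSE branch is live, -1 means both branches must be explored.)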
*/ if (range_bounds_violation(&env->true_reg1) || range_bounds_violation(&env->true_reg2)) return 0; /* Both branches are possible, we can't determine which one will be taken. */ return -1; } /* * <reg1> <op> <reg2>, currently assuming reg2 is a constant */ static int is_scalar_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, u8 opcode, bool is_jmp32) { struct tnum t1 = is_jmp32 ? tnum_subreg(reg1->var_off) : reg1->var_off; struct tnum t2 = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off; u64 umin1 = is_jmp32 ? (u64)reg1->u32_min_value : reg1->umin_value; u64 umax1 = is_jmp32 ? (u64)reg1->u32_max_value : reg1->umax_value; s64 smin1 = is_jmp32 ? (s64)reg1->s32_min_value : reg1->smin_value; s64 smax1 = is_jmp32 ? (s64)reg1->s32_max_value : reg1->smax_value; u64 umin2 = is_jmp32 ? (u64)reg2->u32_min_value : reg2->umin_value; u64 umax2 = is_jmp32 ? (u64)reg2->u32_max_value : reg2->umax_value; s64 smin2 = is_jmp32 ? (s64)reg2->s32_min_value : reg2->smin_value; s64 smax2 = is_jmp32 ? (s64)reg2->s32_max_value : reg2->smax_value; if (reg1 == reg2) { switch (opcode) { case BPF_JGE: case BPF_JLE: case BPF_JSGE: case BPF_JSLE: case BPF_JEQ: return 1; case BPF_JGT: case BPF_JLT: case BPF_JSGT: case BPF_JSLT: case BPF_JNE: return 0; case BPF_JSET: if (tnum_is_const(t1)) return t1.value != 0; else return (smin1 <= 0 && smax1 >= 0) ? -1 : 1; default: return -1; } } switch (opcode) { case BPF_JEQ: /* constants, umin/umax and smin/smax checks would be * redundant in this case because they all should match */ if (tnum_is_const(t1) && tnum_is_const(t2)) return t1.value == t2.value; if (!tnum_overlap(t1, t2)) return 0; /* non-overlapping ranges */ if (umin1 > umax2 || umax1 < umin2) return 0; if (smin1 > smax2 || smax1 < smin2) return 0; if (!is_jmp32) { /* if 64-bit ranges are inconclusive, see if we can * utilize 32-bit subrange knowledge to eliminate * branches that can't be taken a priori */ if (reg1->u32_min_value > reg2->u32_max_value || reg1->u32_max_value < reg2->u32_min_value) return 0; if (reg1->s32_min_value > reg2->s32_max_value || reg1->s32_max_value < reg2->s32_min_value) return 0; } break; case BPF_JNE: /* constants, umin/umax and smin/smax checks would be * redundant in this case because they all should match */ if (tnum_is_const(t1) && tnum_is_const(t2)) return t1.value != t2.value; if (!tnum_overlap(t1, t2)) return 1; /* non-overlapping ranges */ if (umin1 > umax2 || umax1 < umin2) return 1; if (smin1 > smax2 || smax1 < smin2) return 1; if (!is_jmp32) { /* if 64-bit ranges are inconclusive, see if we can * utilize 32-bit subrange knowledge to eliminate * branches that can't be taken a priori */ if (reg1->u32_min_value > reg2->u32_max_value || reg1->u32_max_value < reg2->u32_min_value) return 1; if (reg1->s32_min_value > reg2->s32_max_value || reg1->s32_max_value < reg2->s32_min_value) return 1; } break; case BPF_JSET: if (!is_reg_const(reg2, is_jmp32)) { swap(reg1, reg2); swap(t1, t2); } if (!is_reg_const(reg2, is_jmp32)) return -1; if ((~t1.mask & t1.value) & t2.value) return 1; if (!((t1.mask | t1.value) & t2.value)) return 0; break; case BPF_JGT: if (umin1 > umax2) return 1; else if (umax1 <= umin2) return 0; break; case BPF_JSGT: if (smin1 > smax2) return 1; else if (smax1 <= smin2) return 0; break; case BPF_JLT: if (umax1 < umin2) return 1; else if (umin1 >= umax2) return 0; break; case BPF_JSLT: if (smax1 < smin2) return 1; else if (smin1 >= smax2) return 0; break; case BPF_JGE: if (umin1 >= umax2) return 1; else if (umax1 < 
umin2) return 0; break; case BPF_JSGE: if (smin1 >= smax2) return 1; else if (smax1 < smin2) return 0; break; case BPF_JLE: if (umax1 <= umin2) return 1; else if (umin1 > umax2) return 0; break; case BPF_JSLE: if (smax1 <= smin2) return 1; else if (smin1 > smax2) return 0; break; } return simulate_both_branches_taken(env, opcode, is_jmp32); } static int flip_opcode(u32 opcode) { /* How can we transform "a <op> b" into "b <op> a"? */ static const u8 opcode_flip[16] = { /* these stay the same */ [BPF_JEQ >> 4] = BPF_JEQ, [BPF_JNE >> 4] = BPF_JNE, [BPF_JSET >> 4] = BPF_JSET, /* these swap "lesser" and "greater" (L and G in the opcodes) */ [BPF_JGE >> 4] = BPF_JLE, [BPF_JGT >> 4] = BPF_JLT, [BPF_JLE >> 4] = BPF_JGE, [BPF_JLT >> 4] = BPF_JGT, [BPF_JSGE >> 4] = BPF_JSLE, [BPF_JSGT >> 4] = BPF_JSLT, [BPF_JSLE >> 4] = BPF_JSGE, [BPF_JSLT >> 4] = BPF_JSGT }; return opcode_flip[opcode >> 4]; } static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg, u8 opcode) { struct bpf_reg_state *pkt; if (src_reg->type == PTR_TO_PACKET_END) { pkt = dst_reg; } else if (dst_reg->type == PTR_TO_PACKET_END) { pkt = src_reg; opcode = flip_opcode(opcode); } else { return -1; } if (pkt->range >= 0) return -1; switch (opcode) { case BPF_JLE: /* pkt <= pkt_end */ fallthrough; case BPF_JGT: /* pkt > pkt_end */ if (pkt->range == BEYOND_PKT_END) /* pkt has at last one extra byte beyond pkt_end */ return opcode == BPF_JGT; break; case BPF_JLT: /* pkt < pkt_end */ fallthrough; case BPF_JGE: /* pkt >= pkt_end */ if (pkt->range == BEYOND_PKT_END || pkt->range == AT_PKT_END) return opcode == BPF_JGE; break; } return -1; } /* compute branch direction of the expression "if (<reg1> opcode <reg2>) goto target;" * and return: * 1 - branch will be taken and "goto target" will be executed * 0 - branch will not be taken and fall-through to next insn * -1 - unknown. Example: "if (reg1 < 5)" is unknown when register value * range [0,10] */ static int is_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, u8 opcode, bool is_jmp32) { if (reg_is_pkt_pointer_any(reg1) && reg_is_pkt_pointer_any(reg2) && !is_jmp32) return is_pkt_ptr_branch_taken(reg1, reg2, opcode); if (__is_pointer_value(false, reg1) || __is_pointer_value(false, reg2)) { u64 val; /* arrange that reg2 is a scalar, and reg1 is a pointer */ if (!is_reg_const(reg2, is_jmp32)) { opcode = flip_opcode(opcode); swap(reg1, reg2); } /* and ensure that reg2 is a constant */ if (!is_reg_const(reg2, is_jmp32)) return -1; if (!reg_not_null(reg1)) return -1; /* If pointer is valid tests against zero will fail so we can * use this to direct branch taken. */ val = reg_const_value(reg2, is_jmp32); if (val != 0) return -1; switch (opcode) { case BPF_JEQ: return 0; case BPF_JNE: return 1; default: return -1; } } /* now deal with two scalars, but not necessarily constants */ return is_scalar_branch_taken(env, reg1, reg2, opcode, is_jmp32); } /* Opcode that corresponds to a *false* branch condition. 
* E.g., if r1 < r2, then reverse (false) condition is r1 >= r2 */ static u8 rev_opcode(u8 opcode) { switch (opcode) { case BPF_JEQ: return BPF_JNE; case BPF_JNE: return BPF_JEQ; /* JSET doesn't have it's reverse opcode in BPF, so add * BPF_X flag to denote the reverse of that operation */ case BPF_JSET: return BPF_JSET | BPF_X; case BPF_JSET | BPF_X: return BPF_JSET; case BPF_JGE: return BPF_JLT; case BPF_JGT: return BPF_JLE; case BPF_JLE: return BPF_JGT; case BPF_JLT: return BPF_JGE; case BPF_JSGE: return BPF_JSLT; case BPF_JSGT: return BPF_JSLE; case BPF_JSLE: return BPF_JSGT; case BPF_JSLT: return BPF_JSGE; default: return 0; } } /* Refine range knowledge for <reg1> <op> <reg>2 conditional operation. */ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, u8 opcode, bool is_jmp32) { struct tnum t; u64 val; /* In case of GE/GT/SGE/JST, reuse LE/LT/SLE/SLT logic from below */ switch (opcode) { case BPF_JGE: case BPF_JGT: case BPF_JSGE: case BPF_JSGT: opcode = flip_opcode(opcode); swap(reg1, reg2); break; default: break; } switch (opcode) { case BPF_JEQ: if (is_jmp32) { reg1->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value); reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value); reg1->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value); reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value); reg2->u32_min_value = reg1->u32_min_value; reg2->u32_max_value = reg1->u32_max_value; reg2->s32_min_value = reg1->s32_min_value; reg2->s32_max_value = reg1->s32_max_value; t = tnum_intersect(tnum_subreg(reg1->var_off), tnum_subreg(reg2->var_off)); reg1->var_off = tnum_with_subreg(reg1->var_off, t); reg2->var_off = tnum_with_subreg(reg2->var_off, t); } else { reg1->umin_value = max(reg1->umin_value, reg2->umin_value); reg1->umax_value = min(reg1->umax_value, reg2->umax_value); reg1->smin_value = max(reg1->smin_value, reg2->smin_value); reg1->smax_value = min(reg1->smax_value, reg2->smax_value); reg2->umin_value = reg1->umin_value; reg2->umax_value = reg1->umax_value; reg2->smin_value = reg1->smin_value; reg2->smax_value = reg1->smax_value; reg1->var_off = tnum_intersect(reg1->var_off, reg2->var_off); reg2->var_off = reg1->var_off; } break; case BPF_JNE: if (!is_reg_const(reg2, is_jmp32)) swap(reg1, reg2); if (!is_reg_const(reg2, is_jmp32)) break; /* try to recompute the bound of reg1 if reg2 is a const and * is exactly the edge of reg1. */ val = reg_const_value(reg2, is_jmp32); if (is_jmp32) { /* u32_min_value is not equal to 0xffffffff at this point, * because otherwise u32_max_value is 0xffffffff as well, * in such a case both reg1 and reg2 would be constants, * jump would be predicted and regs_refine_cond_op() * wouldn't be called. * * Same reasoning works for all {u,s}{min,max}{32,64} cases * below. */ if (reg1->u32_min_value == (u32)val) reg1->u32_min_value++; if (reg1->u32_max_value == (u32)val) reg1->u32_max_value--; if (reg1->s32_min_value == (s32)val) reg1->s32_min_value++; if (reg1->s32_max_value == (s32)val) reg1->s32_max_value--; } else { if (reg1->umin_value == (u64)val) reg1->umin_value++; if (reg1->umax_value == (u64)val) reg1->umax_value--; if (reg1->smin_value == (s64)val) reg1->smin_value++; if (reg1->smax_value == (s64)val) reg1->smax_value--; } break; case BPF_JSET: if (!is_reg_const(reg2, is_jmp32)) swap(reg1, reg2); if (!is_reg_const(reg2, is_jmp32)) break; val = reg_const_value(reg2, is_jmp32); /* BPF_JSET (i.e., TRUE branch, *not* BPF_JSET | BPF_X) * requires single bit to learn something useful. 
E.g., if we * know that `r1 & 0x3` is true, then which bits (0, 1, or both) * are actually set? We can learn something definite only if * it's a single-bit value to begin with. * * BPF_JSET | BPF_X (i.e., negation of BPF_JSET) doesn't have * this restriction. I.e., !(r1 & 0x3) means neither bit 0 nor * bit 1 is set, which we can readily use in adjustments. */ if (!is_power_of_2(val)) break; if (is_jmp32) { t = tnum_or(tnum_subreg(reg1->var_off), tnum_const(val)); reg1->var_off = tnum_with_subreg(reg1->var_off, t); } else { reg1->var_off = tnum_or(reg1->var_off, tnum_const(val)); } break; case BPF_JSET | BPF_X: /* reverse of BPF_JSET, see rev_opcode() */ if (!is_reg_const(reg2, is_jmp32)) swap(reg1, reg2); if (!is_reg_const(reg2, is_jmp32)) break; val = reg_const_value(reg2, is_jmp32); /* Forget the ranges before narrowing tnums, to avoid invariant * violations if we're on a dead branch. */ __mark_reg_unbounded(reg1); if (is_jmp32) { t = tnum_and(tnum_subreg(reg1->var_off), tnum_const(~val)); reg1->var_off = tnum_with_subreg(reg1->var_off, t); } else { reg1->var_off = tnum_and(reg1->var_off, tnum_const(~val)); } break; case BPF_JLE: if (is_jmp32) { reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value); reg2->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value); } else { reg1->umax_value = min(reg1->umax_value, reg2->umax_value); reg2->umin_value = max(reg1->umin_value, reg2->umin_value); } break; case BPF_JLT: if (is_jmp32) { reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value - 1); reg2->u32_min_value = max(reg1->u32_min_value + 1, reg2->u32_min_value); } else { reg1->umax_value = min(reg1->umax_value, reg2->umax_value - 1); reg2->umin_value = max(reg1->umin_value + 1, reg2->umin_value); } break; case BPF_JSLE: if (is_jmp32) { reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value); reg2->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value); } else { reg1->smax_value = min(reg1->smax_value, reg2->smax_value); reg2->smin_value = max(reg1->smin_value, reg2->smin_value); } break; case BPF_JSLT: if (is_jmp32) { reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value - 1); reg2->s32_min_value = max(reg1->s32_min_value + 1, reg2->s32_min_value); } else { reg1->smax_value = min(reg1->smax_value, reg2->smax_value - 1); reg2->smin_value = max(reg1->smin_value + 1, reg2->smin_value); } break; default: return; } } /* Check for invariant violations on the registers for both branches of a condition */ static int regs_bounds_sanity_check_branches(struct bpf_verifier_env *env) { int err; err = reg_bounds_sanity_check(env, &env->true_reg1, "true_reg1"); err = err ?: reg_bounds_sanity_check(env, &env->true_reg2, "true_reg2"); err = err ?: reg_bounds_sanity_check(env, &env->false_reg1, "false_reg1"); err = err ?: reg_bounds_sanity_check(env, &env->false_reg2, "false_reg2"); return err; } static void mark_ptr_or_null_reg(struct bpf_func_state *state, struct bpf_reg_state *reg, u32 id, bool is_null) { if (type_may_be_null(reg->type) && reg->id == id && (is_rcu_reg(reg) || !WARN_ON_ONCE(!reg->id))) { /* Old offset should have been known-zero, because we don't * allow pointer arithmetic on pointers that might be NULL. * If we see this happening, don't convert the register. * * But in some cases, some helpers that return local kptrs * advance offset for the returned pointer. In those cases, * it is fine to expect to see reg->var_off. 
*/ if (!(type_is_ptr_alloc_obj(reg->type) || type_is_non_owning_ref(reg->type)) && WARN_ON_ONCE(!tnum_equals_const(reg->var_off, 0))) return; if (is_null) { /* We don't need id and ref_obj_id from this point * onwards anymore, thus we should better reset it, * so that state pruning has chances to take effect. */ __mark_reg_known_zero(reg); reg->type = SCALAR_VALUE; return; } mark_ptr_not_null_reg(reg); if (!reg_may_point_to_spin_lock(reg)) { /* For not-NULL ptr, reg->ref_obj_id will be reset * in release_reference(). * * reg->id is still used by spin_lock ptr. Other * than spin_lock ptr type, reg->id can be reset. */ reg->id = 0; } } } /* The logic is similar to find_good_pkt_pointers(), both could eventually * be folded together at some point. */ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, bool is_null) { struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs, *reg; u32 ref_obj_id = regs[regno].ref_obj_id; u32 id = regs[regno].id; if (ref_obj_id && ref_obj_id == id && is_null) /* regs[regno] is in the " == NULL" branch. * No one could have freed the reference state before * doing the NULL check. */ WARN_ON_ONCE(release_reference_nomark(vstate, id)); bpf_for_each_reg_in_vstate(vstate, state, reg, ({ mark_ptr_or_null_reg(state, reg, id, is_null); })); } static bool try_match_pkt_pointers(const struct bpf_insn *insn, struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg, struct bpf_verifier_state *this_branch, struct bpf_verifier_state *other_branch) { if (BPF_SRC(insn->code) != BPF_X) return false; /* Pointers are always 64-bit. */ if (BPF_CLASS(insn->code) == BPF_JMP32) return false; switch (BPF_OP(insn->code)) { case BPF_JGT: if ((dst_reg->type == PTR_TO_PACKET && src_reg->type == PTR_TO_PACKET_END) || (dst_reg->type == PTR_TO_PACKET_META && reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { /* pkt_data' > pkt_end, pkt_meta' > pkt_data */ find_good_pkt_pointers(this_branch, dst_reg, dst_reg->type, false); mark_pkt_end(other_branch, insn->dst_reg, true); } else if ((dst_reg->type == PTR_TO_PACKET_END && src_reg->type == PTR_TO_PACKET) || (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && src_reg->type == PTR_TO_PACKET_META)) { /* pkt_end > pkt_data', pkt_data > pkt_meta' */ find_good_pkt_pointers(other_branch, src_reg, src_reg->type, true); mark_pkt_end(this_branch, insn->src_reg, false); } else { return false; } break; case BPF_JLT: if ((dst_reg->type == PTR_TO_PACKET && src_reg->type == PTR_TO_PACKET_END) || (dst_reg->type == PTR_TO_PACKET_META && reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { /* pkt_data' < pkt_end, pkt_meta' < pkt_data */ find_good_pkt_pointers(other_branch, dst_reg, dst_reg->type, true); mark_pkt_end(this_branch, insn->dst_reg, false); } else if ((dst_reg->type == PTR_TO_PACKET_END && src_reg->type == PTR_TO_PACKET) || (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && src_reg->type == PTR_TO_PACKET_META)) { /* pkt_end < pkt_data', pkt_data > pkt_meta' */ find_good_pkt_pointers(this_branch, src_reg, src_reg->type, false); mark_pkt_end(other_branch, insn->src_reg, true); } else { return false; } break; case BPF_JGE: if ((dst_reg->type == PTR_TO_PACKET && src_reg->type == PTR_TO_PACKET_END) || (dst_reg->type == PTR_TO_PACKET_META && reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { /* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */ find_good_pkt_pointers(this_branch, dst_reg, dst_reg->type, true); mark_pkt_end(other_branch, insn->dst_reg, false); } else if 
((dst_reg->type == PTR_TO_PACKET_END && src_reg->type == PTR_TO_PACKET) || (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && src_reg->type == PTR_TO_PACKET_META)) { /* pkt_end >= pkt_data', pkt_data >= pkt_meta' */ find_good_pkt_pointers(other_branch, src_reg, src_reg->type, false); mark_pkt_end(this_branch, insn->src_reg, true); } else { return false; } break; case BPF_JLE: if ((dst_reg->type == PTR_TO_PACKET && src_reg->type == PTR_TO_PACKET_END) || (dst_reg->type == PTR_TO_PACKET_META && reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { /* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */ find_good_pkt_pointers(other_branch, dst_reg, dst_reg->type, false); mark_pkt_end(this_branch, insn->dst_reg, true); } else if ((dst_reg->type == PTR_TO_PACKET_END && src_reg->type == PTR_TO_PACKET) || (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && src_reg->type == PTR_TO_PACKET_META)) { /* pkt_end <= pkt_data', pkt_data <= pkt_meta' */ find_good_pkt_pointers(this_branch, src_reg, src_reg->type, true); mark_pkt_end(other_branch, insn->src_reg, false); } else { return false; } break; default: return false; } return true; } static void __collect_linked_regs(struct linked_regs *reg_set, struct bpf_reg_state *reg, u32 id, u32 frameno, u32 spi_or_reg, bool is_reg) { struct linked_reg *e; if (reg->type != SCALAR_VALUE || (reg->id & ~BPF_ADD_CONST) != id) return; e = linked_regs_push(reg_set); if (e) { e->frameno = frameno; e->is_reg = is_reg; e->regno = spi_or_reg; } else { clear_scalar_id(reg); } } /* For all R being scalar registers or spilled scalar registers * in verifier state, save R in linked_regs if R->id == id. * If there are too many Rs sharing same id, reset id for leftover Rs. */ static void collect_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_state *vstate, u32 id, struct linked_regs *linked_regs) { struct bpf_insn_aux_data *aux = env->insn_aux_data; struct bpf_func_state *func; struct bpf_reg_state *reg; u16 live_regs; int i, j; id = id & ~BPF_ADD_CONST; for (i = vstate->curframe; i >= 0; i--) { live_regs = aux[bpf_frame_insn_idx(vstate, i)].live_regs_before; func = vstate->frame[i]; for (j = 0; j < BPF_REG_FP; j++) { if (!(live_regs & BIT(j))) continue; reg = &func->regs[j]; __collect_linked_regs(linked_regs, reg, id, i, j, true); } for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { if (!bpf_is_spilled_reg(&func->stack[j])) continue; reg = &func->stack[j].spilled_ptr; __collect_linked_regs(linked_regs, reg, id, i, j, false); } } } /* For all R in linked_regs, copy known_reg range into R * if R->id == known_reg->id. */ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_state *vstate, struct bpf_reg_state *known_reg, struct linked_regs *linked_regs) { struct bpf_reg_state fake_reg; struct bpf_reg_state *reg; struct linked_reg *e; int i; for (i = 0; i < linked_regs->cnt; ++i) { e = &linked_regs->entries[i]; reg = e->is_reg ? &vstate->frame[e->frameno]->regs[e->regno] : &vstate->frame[e->frameno]->stack[e->spi].spilled_ptr; if (reg->type != SCALAR_VALUE || reg == known_reg) continue; if ((reg->id & ~BPF_ADD_CONST) != (known_reg->id & ~BPF_ADD_CONST)) continue; /* * Skip mixed 32/64-bit links: the delta relationship doesn't * hold across different ALU widths. 
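		 * E.g. a register linked through a 64-bit 'r1 += 1'
		 * (BPF_ADD_CONST64) must not be synced with one linked through
		 * a 32-bit 'w1 += 1' (BPF_ADD_CONST32).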
*/ if (((reg->id ^ known_reg->id) & BPF_ADD_CONST) == BPF_ADD_CONST) continue; if ((!(reg->id & BPF_ADD_CONST) && !(known_reg->id & BPF_ADD_CONST)) || reg->delta == known_reg->delta) { s32 saved_subreg_def = reg->subreg_def; copy_register_state(reg, known_reg); reg->subreg_def = saved_subreg_def; } else { s32 saved_subreg_def = reg->subreg_def; s32 saved_off = reg->delta; u32 saved_id = reg->id; fake_reg.type = SCALAR_VALUE; __mark_reg_known(&fake_reg, (s64)reg->delta - (s64)known_reg->delta); /* reg = known_reg; reg += delta */ copy_register_state(reg, known_reg); /* * Must preserve off, id and subreg_def flag, * otherwise another sync_linked_regs() will be incorrect. */ reg->delta = saved_off; reg->id = saved_id; reg->subreg_def = saved_subreg_def; scalar32_min_max_add(reg, &fake_reg); scalar_min_max_add(reg, &fake_reg); reg->var_off = tnum_add(reg->var_off, fake_reg.var_off); if ((reg->id | known_reg->id) & BPF_ADD_CONST32) zext_32_to_64(reg); reg_bounds_sync(reg); } if (e->is_reg) mark_reg_scratched(env, e->regno); else mark_stack_slot_scratched(env, e->spi); } } static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { struct bpf_verifier_state *this_branch = env->cur_state; struct bpf_verifier_state *other_branch; struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL; struct bpf_reg_state *eq_branch_regs; struct linked_regs linked_regs = {}; u8 opcode = BPF_OP(insn->code); int insn_flags = 0; bool is_jmp32; int pred = -1; int err; /* Only conditional jumps are expected to reach here. */ if (opcode == BPF_JA || opcode > BPF_JCOND) { verbose(env, "invalid BPF_JMP/JMP32 opcode %x\n", opcode); return -EINVAL; } if (opcode == BPF_JCOND) { struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st; int idx = *insn_idx; prev_st = find_prev_entry(env, cur_st->parent, idx); /* branch out 'fallthrough' insn as a new state to explore */ queued_st = push_stack(env, idx + 1, idx, false); if (IS_ERR(queued_st)) return PTR_ERR(queued_st); queued_st->may_goto_depth++; if (prev_st) widen_imprecise_scalars(env, prev_st, queued_st); *insn_idx += insn->off; return 0; } /* check src2 operand */ err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) return err; dst_reg = ®s[insn->dst_reg]; if (BPF_SRC(insn->code) == BPF_X) { /* check src1 operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; src_reg = ®s[insn->src_reg]; if (!(reg_is_pkt_pointer_any(dst_reg) && reg_is_pkt_pointer_any(src_reg)) && is_pointer_value(env, insn->src_reg)) { verbose(env, "R%d pointer comparison prohibited\n", insn->src_reg); return -EACCES; } if (src_reg->type == PTR_TO_STACK) insn_flags |= INSN_F_SRC_REG_STACK; if (dst_reg->type == PTR_TO_STACK) insn_flags |= INSN_F_DST_REG_STACK; } else { src_reg = &env->fake_reg[0]; memset(src_reg, 0, sizeof(*src_reg)); src_reg->type = SCALAR_VALUE; __mark_reg_known(src_reg, insn->imm); if (dst_reg->type == PTR_TO_STACK) insn_flags |= INSN_F_DST_REG_STACK; } if (insn_flags) { err = bpf_push_jmp_history(env, this_branch, insn_flags, 0); if (err) return err; } is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; copy_register_state(&env->false_reg1, dst_reg); copy_register_state(&env->false_reg2, src_reg); copy_register_state(&env->true_reg1, dst_reg); copy_register_state(&env->true_reg2, src_reg); pred = is_branch_taken(env, dst_reg, src_reg, opcode, is_jmp32); if (pred >= 0) { /* If we get here with a dst_reg pointer type it 
is because * above is_branch_taken() special cased the 0 comparison. */ if (!__is_pointer_value(false, dst_reg)) err = mark_chain_precision(env, insn->dst_reg); if (BPF_SRC(insn->code) == BPF_X && !err && !__is_pointer_value(false, src_reg)) err = mark_chain_precision(env, insn->src_reg); if (err) return err; } if (pred == 1) { /* Only follow the goto, ignore fall-through. If needed, push * the fall-through branch for simulation under speculative * execution. */ if (!env->bypass_spec_v1) { err = sanitize_speculative_path(env, insn, *insn_idx + 1, *insn_idx); if (err < 0) return err; } if (env->log.level & BPF_LOG_LEVEL) print_insn_state(env, this_branch, this_branch->curframe); *insn_idx += insn->off; return 0; } else if (pred == 0) { /* Only follow the fall-through branch, since that's where the * program will go. If needed, push the goto branch for * simulation under speculative execution. */ if (!env->bypass_spec_v1) { err = sanitize_speculative_path(env, insn, *insn_idx + insn->off + 1, *insn_idx); if (err < 0) return err; } if (env->log.level & BPF_LOG_LEVEL) print_insn_state(env, this_branch, this_branch->curframe); return 0; } /* Push scalar registers sharing same ID to jump history, * do this before creating 'other_branch', so that both * 'this_branch' and 'other_branch' share this history * if parent state is created. */ if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id) collect_linked_regs(env, this_branch, src_reg->id, &linked_regs); if (dst_reg->type == SCALAR_VALUE && dst_reg->id) collect_linked_regs(env, this_branch, dst_reg->id, &linked_regs); if (linked_regs.cnt > 1) { err = bpf_push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); if (err) return err; } other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, false); if (IS_ERR(other_branch)) return PTR_ERR(other_branch); other_branch_regs = other_branch->frame[other_branch->curframe]->regs; err = regs_bounds_sanity_check_branches(env); if (err) return err; copy_register_state(dst_reg, &env->false_reg1); copy_register_state(src_reg, &env->false_reg2); copy_register_state(&other_branch_regs[insn->dst_reg], &env->true_reg1); if (BPF_SRC(insn->code) == BPF_X) copy_register_state(&other_branch_regs[insn->src_reg], &env->true_reg2); if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id && !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) { sync_linked_regs(env, this_branch, src_reg, &linked_regs); sync_linked_regs(env, other_branch, &other_branch_regs[insn->src_reg], &linked_regs); } if (dst_reg->type == SCALAR_VALUE && dst_reg->id && !WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) { sync_linked_regs(env, this_branch, dst_reg, &linked_regs); sync_linked_regs(env, other_branch, &other_branch_regs[insn->dst_reg], &linked_regs); } /* if one pointer register is compared to another pointer * register check if PTR_MAYBE_NULL could be lifted. * E.g. register A - maybe null * register B - not null * for JNE A, B, ... - A is not null in the false branch; * for JEQ A, B, ... - A is not null in the true branch. * * Since PTR_TO_BTF_ID points to a kernel struct that does * not need to be null checked by the BPF program, i.e., * could be null even without PTR_MAYBE_NULL marking, so * only propagate nullness when neither reg is that type. 
*/ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_X && __is_pointer_value(false, src_reg) && __is_pointer_value(false, dst_reg) && type_may_be_null(src_reg->type) != type_may_be_null(dst_reg->type) && base_type(src_reg->type) != PTR_TO_BTF_ID && base_type(dst_reg->type) != PTR_TO_BTF_ID) { eq_branch_regs = NULL; switch (opcode) { case BPF_JEQ: eq_branch_regs = other_branch_regs; break; case BPF_JNE: eq_branch_regs = regs; break; default: /* do nothing */ break; } if (eq_branch_regs) { if (type_may_be_null(src_reg->type)) mark_ptr_not_null_reg(&eq_branch_regs[insn->src_reg]); else mark_ptr_not_null_reg(&eq_branch_regs[insn->dst_reg]); } } /* detect if R == 0 where R is returned from bpf_map_lookup_elem(). * Also does the same detection for a register whose the value is * known to be 0. * NOTE: these optimizations below are related with pointer comparison * which will never be JMP32. */ if (!is_jmp32 && (opcode == BPF_JEQ || opcode == BPF_JNE) && type_may_be_null(dst_reg->type) && ((BPF_SRC(insn->code) == BPF_K && insn->imm == 0) || (BPF_SRC(insn->code) == BPF_X && bpf_register_is_null(src_reg)))) { /* Mark all identical registers in each branch as either * safe or unknown depending R == 0 or R != 0 conditional. */ mark_ptr_or_null_regs(this_branch, insn->dst_reg, opcode == BPF_JNE); mark_ptr_or_null_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ); } else if (!try_match_pkt_pointers(insn, dst_reg, ®s[insn->src_reg], this_branch, other_branch) && is_pointer_value(env, insn->dst_reg)) { verbose(env, "R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; } if (env->log.level & BPF_LOG_LEVEL) print_insn_state(env, this_branch, this_branch->curframe); return 0; } /* verify BPF_LD_IMM64 instruction */ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_insn_aux_data *aux = cur_aux(env); struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *dst_reg; struct bpf_map *map; int err; if (BPF_SIZE(insn->code) != BPF_DW) { verbose(env, "invalid BPF_LD_IMM insn\n"); return -EINVAL; } err = check_reg_arg(env, insn->dst_reg, DST_OP); if (err) return err; dst_reg = ®s[insn->dst_reg]; if (insn->src_reg == 0) { u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; dst_reg->type = SCALAR_VALUE; __mark_reg_known(®s[insn->dst_reg], imm); return 0; } /* All special src_reg cases are listed below. From this point onwards * we either succeed and assign a corresponding dst_reg->type after * zeroing the offset, or fail and reject the program. 
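	 * (Those cases are BPF_PSEUDO_BTF_ID, BPF_PSEUDO_FUNC,
	 * BPF_PSEUDO_MAP_{VALUE,IDX_VALUE} and BPF_PSEUDO_MAP_{FD,IDX}; any
	 * other src_reg value is reported as a verifier bug.)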
*/ mark_reg_known_zero(env, regs, insn->dst_reg); if (insn->src_reg == BPF_PSEUDO_BTF_ID) { dst_reg->type = aux->btf_var.reg_type; switch (base_type(dst_reg->type)) { case PTR_TO_MEM: dst_reg->mem_size = aux->btf_var.mem_size; break; case PTR_TO_BTF_ID: dst_reg->btf = aux->btf_var.btf; dst_reg->btf_id = aux->btf_var.btf_id; break; default: verifier_bug(env, "pseudo btf id: unexpected dst reg type"); return -EFAULT; } return 0; } if (insn->src_reg == BPF_PSEUDO_FUNC) { struct bpf_prog_aux *aux = env->prog->aux; u32 subprogno = bpf_find_subprog(env, env->insn_idx + insn->imm + 1); if (!aux->func_info) { verbose(env, "missing btf func_info\n"); return -EINVAL; } if (aux->func_info_aux[subprogno].linkage != BTF_FUNC_STATIC) { verbose(env, "callback function not static\n"); return -EINVAL; } dst_reg->type = PTR_TO_FUNC; dst_reg->subprogno = subprogno; return 0; } map = env->used_maps[aux->map_index]; if (insn->src_reg == BPF_PSEUDO_MAP_VALUE || insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) { if (map->map_type == BPF_MAP_TYPE_ARENA) { __mark_reg_unknown(env, dst_reg); dst_reg->map_ptr = map; return 0; } __mark_reg_known(dst_reg, aux->map_off); dst_reg->type = PTR_TO_MAP_VALUE; dst_reg->map_ptr = map; WARN_ON_ONCE(map->map_type != BPF_MAP_TYPE_INSN_ARRAY && map->max_entries != 1); /* We want reg->id to be same (0) as map_value is not distinct */ } else if (insn->src_reg == BPF_PSEUDO_MAP_FD || insn->src_reg == BPF_PSEUDO_MAP_IDX) { dst_reg->type = CONST_PTR_TO_MAP; dst_reg->map_ptr = map; } else { verifier_bug(env, "unexpected src reg value for ldimm64"); return -EFAULT; } return 0; } static bool may_access_skb(enum bpf_prog_type type) { switch (type) { case BPF_PROG_TYPE_SOCKET_FILTER: case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: return true; default: return false; } } /* verify safety of LD_ABS|LD_IND instructions: * - they can only appear in the programs where ctx == skb * - since they are wrappers of function calls, they scratch R1-R5 registers, * preserve R6-R9, and store return value into R0 * * Implicit input: * ctx == skb == R6 == CTX * * Explicit input: * SRC == any register * IMM == 32-bit immediate * * Output: * R0 - 8/16/32-bit skb data converted to cpu endianness */ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_reg_state *regs = cur_regs(env); static const int ctx_reg = BPF_REG_6; u8 mode = BPF_MODE(insn->code); int i, err; if (!may_access_skb(resolve_prog_type(env->prog))) { verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); return -EINVAL; } if (!env->ops->gen_ld_abs) { verifier_bug(env, "gen_ld_abs is null"); return -EFAULT; } /* check whether implicit source operand (register R6) is readable */ err = check_reg_arg(env, ctx_reg, SRC_OP); if (err) return err; /* Disallow usage of BPF_LD_[ABS|IND] with reference tracking, as * gen_ld_abs() may terminate the program at runtime, leading to * reference leak. 
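 *
 * E.g. (illustrative sequence, not from a real program; bpf_sk_lookup_tcp()
 * is just one example of a reference-acquiring helper):
 *
 *   r6 = ctx
 *   r0 = bpf_sk_lookup_tcp(...)   ; acquires a reference
 *   r7 = r0                       ; reference now held in r7
 *   r0 = *(u16 *)skb[12]          ; LD_ABS: may terminate the prog right
 *                                 ; here, so bpf_sk_release(r7) later in
 *                                 ; the program would never run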
*/ err = check_resource_leak(env, false, true, "BPF_LD_[ABS|IND]"); if (err) return err; if (regs[ctx_reg].type != PTR_TO_CTX) { verbose(env, "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); return -EINVAL; } if (mode == BPF_IND) { /* check explicit source operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; } err = check_ptr_off_reg(env, &regs[ctx_reg], ctx_reg); if (err < 0) return err; /* reset caller saved regs to unreadable */ for (i = 0; i < CALLER_SAVED_REGS; i++) { bpf_mark_reg_not_init(env, &regs[caller_saved[i]]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } /* mark destination R0 register as readable, since it contains * the value fetched from the packet. * Already marked as written above. */ mark_reg_unknown(env, regs, BPF_REG_0); /* ld_abs load up to 32-bit skb data. */ regs[BPF_REG_0].subreg_def = env->insn_idx + 1; /* * See bpf_gen_ld_abs() which emits a hidden BPF_EXIT with r0=0 * which must be explored by the verifier when in a subprog. */ if (env->cur_state->curframe) { struct bpf_verifier_state *branch; mark_reg_scratched(env, BPF_REG_0); branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); if (IS_ERR(branch)) return PTR_ERR(branch); mark_reg_known_zero(env, regs, BPF_REG_0); err = prepare_func_exit(env, &env->insn_idx); if (err) return err; env->insn_idx--; } return 0; } static bool return_retval_range(struct bpf_verifier_env *env, struct bpf_retval_range *range) { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); /* Default return value range. */ *range = retval_range(0, 1); switch (prog_type) { case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: switch (env->prog->expected_attach_type) { case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UNIX_GETSOCKNAME: *range = retval_range(1, 1); break; case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: *range = retval_range(0, 3); break; default: break; } break; case BPF_PROG_TYPE_CGROUP_SKB: if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) *range = retval_range(0, 3); break; case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_CGROUP_SOCKOPT: break; case BPF_PROG_TYPE_RAW_TRACEPOINT: if (!env->prog->aux->attach_btf_id) return false; *range = retval_range(0, 0); break; case BPF_PROG_TYPE_TRACING: switch (env->prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_TRACE_FSESSION: *range = retval_range(0, 0); break; case BPF_TRACE_RAW_TP: case BPF_MODIFY_RETURN: return false; case BPF_TRACE_ITER: default: break; } break; case BPF_PROG_TYPE_KPROBE: switch (env->prog->expected_attach_type) { case BPF_TRACE_KPROBE_SESSION: case BPF_TRACE_UPROBE_SESSION: break; default: return false; } break; case BPF_PROG_TYPE_SK_LOOKUP: *range = retval_range(SK_DROP, SK_PASS); break; case BPF_PROG_TYPE_LSM: if (env->prog->expected_attach_type != BPF_LSM_CGROUP) { /* no range found, any return value is allowed */ if (!get_func_retval_range(env->prog, range)) return false; /* no restricted range, any return value is allowed */ if (range->minval == S32_MIN && range->maxval == S32_MAX) return false; range->return_32bit = true; } else if (!env->prog->aux->attach_func_proto->type) { /* Make sure programs that attach to void * hooks don't try
to modify return value. */ *range = retval_range(1, 1); } break; case BPF_PROG_TYPE_NETFILTER: *range = retval_range(NF_DROP, NF_ACCEPT); break; case BPF_PROG_TYPE_STRUCT_OPS: *range = retval_range(0, 0); break; case BPF_PROG_TYPE_EXT: /* freplace program can return anything as its return value * depends on the to-be-replaced kernel func or bpf program. */ default: return false; } /* Continue calculating. */ return true; } static bool program_returns_void(struct bpf_verifier_env *env) { const struct bpf_prog *prog = env->prog; enum bpf_prog_type prog_type = prog->type; switch (prog_type) { case BPF_PROG_TYPE_LSM: /* See return_retval_range, for BPF_LSM_CGROUP can be 0 or 0-1 depending on hook. */ if (prog->expected_attach_type != BPF_LSM_CGROUP && !prog->aux->attach_func_proto->type) return true; break; case BPF_PROG_TYPE_STRUCT_OPS: if (!prog->aux->attach_func_proto->type) return true; break; case BPF_PROG_TYPE_EXT: /* * If the actual program is an extension, let it * return void - attaching will succeed only if the * program being replaced also returns void, and since * it has passed verification its actual type doesn't matter. */ if (subprog_returns_void(env, 0)) return true; break; default: break; } return false; } static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name) { const char *exit_ctx = "At program exit"; struct tnum enforce_attach_type_range = tnum_unknown; const struct bpf_prog *prog = env->prog; struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_retval_range range = retval_range(0, 1); enum bpf_prog_type prog_type = resolve_prog_type(env->prog); struct bpf_func_state *frame = env->cur_state->frame[0]; const struct btf_type *reg_type, *ret_type = NULL; int err; /* LSM and struct_ops func-ptr's return type could be "void" */ if (!frame->in_async_callback_fn && program_returns_void(env)) return 0; if (prog_type == BPF_PROG_TYPE_STRUCT_OPS) { /* Allow a struct_ops program to return a referenced kptr if it * matches the operator's return type and is in its unmodified * form. A scalar zero (i.e., a null pointer) is also allowed. */ reg_type = reg->btf ? btf_type_by_id(reg->btf, reg->btf_id) : NULL; ret_type = btf_type_resolve_ptr(prog->aux->attach_btf, prog->aux->attach_func_proto->type, NULL); if (ret_type && ret_type == reg_type && reg->ref_obj_id) return __check_ptr_off_reg(env, reg, regno, false); } /* eBPF calling convention is such that R0 is used * to return the value from eBPF program. 
* Make sure that it's readable at this time * of bpf_exit, which means that program wrote * something into it earlier */ err = check_reg_arg(env, regno, SRC_OP); if (err) return err; if (is_pointer_value(env, regno)) { verbose(env, "R%d leaks addr as return value\n", regno); return -EACCES; } if (frame->in_async_callback_fn) { exit_ctx = "At async callback return"; range = frame->callback_ret_range; goto enforce_retval; } if (prog_type == BPF_PROG_TYPE_STRUCT_OPS && !ret_type) return 0; if (prog_type == BPF_PROG_TYPE_CGROUP_SKB && (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS)) enforce_attach_type_range = tnum_range(2, 3); if (!return_retval_range(env, &range)) return 0; enforce_retval: if (reg->type != SCALAR_VALUE) { verbose(env, "%s the register R%d is not a known value (%s)\n", exit_ctx, regno, reg_type_str(env, reg->type)); return -EINVAL; } err = mark_chain_precision(env, regno); if (err) return err; if (!retval_range_within(range, reg)) { verbose_invalid_scalar(env, reg, range, exit_ctx, reg_name); if (prog->expected_attach_type == BPF_LSM_CGROUP && prog_type == BPF_PROG_TYPE_LSM && !prog->aux->attach_func_proto->type) verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); return -EINVAL; } if (!tnum_is_unknown(enforce_attach_type_range) && tnum_in(enforce_attach_type_range, reg->var_off)) env->prog->enforce_expected_attach_type = 1; return 0; } static int check_global_subprog_return_code(struct bpf_verifier_env *env) { struct bpf_reg_state *reg = reg_state(env, BPF_REG_0); struct bpf_func_state *cur_frame = cur_func(env); int err; if (subprog_returns_void(env, cur_frame->subprogno)) return 0; err = check_reg_arg(env, BPF_REG_0, SRC_OP); if (err) return err; if (is_pointer_value(env, BPF_REG_0)) { verbose(env, "R%d leaks addr as return value\n", BPF_REG_0); return -EACCES; } if (reg->type != SCALAR_VALUE) { verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n", reg_type_str(env, reg->type)); return -EINVAL; } return 0; } /* Bitmask with 1s for all caller saved registers */ #define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1) /* True if do_misc_fixups() replaces calls to helper number 'imm', * replacement patch is presumed to follow bpf_fastcall contract * (see mark_fastcall_pattern_for_call() below). */ bool bpf_verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) { switch (imm) { #ifdef CONFIG_X86_64 case BPF_FUNC_get_smp_processor_id: #ifdef CONFIG_SMP case BPF_FUNC_get_current_task_btf: case BPF_FUNC_get_current_task: #endif return env->prog->jit_requested && bpf_jit_supports_percpu_insn(); #endif default: return false; } } /* If @call is a kfunc or helper call, fills @cs and returns true, * otherwise returns false. 
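 *
 * For instance (illustrative): for the bpf_map_lookup_elem() helper the
 * summary would have num_params = 2 (map, key), is_void = false, and
 * fastcall set only if the proto allows it and the call is inlined by
 * the verifier or the JIT.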
*/ bool bpf_get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call, struct bpf_call_summary *cs) { struct bpf_kfunc_call_arg_meta meta; const struct bpf_func_proto *fn; int i; if (bpf_helper_call(call)) { if (bpf_get_helper_proto(env, call->imm, &fn) < 0) /* error would be reported later */ return false; cs->fastcall = fn->allow_fastcall && (bpf_verifier_inlines_helper_call(env, call->imm) || bpf_jit_inlines_helper_call(call->imm)); cs->is_void = fn->ret_type == RET_VOID; cs->num_params = 0; for (i = 0; i < ARRAY_SIZE(fn->arg_type); ++i) { if (fn->arg_type[i] == ARG_DONTCARE) break; cs->num_params++; } return true; } if (bpf_pseudo_kfunc_call(call)) { int err; err = bpf_fetch_kfunc_arg_meta(env, call->imm, call->off, &meta); if (err < 0) /* error would be reported later */ return false; cs->num_params = btf_type_vlen(meta.func_proto); cs->fastcall = meta.kfunc_flags & KF_FASTCALL; cs->is_void = btf_type_is_void(btf_type_by_id(meta.btf, meta.func_proto->type)); return true; } return false; } /* LLVM define a bpf_fastcall function attribute. * This attribute means that function scratches only some of * the caller saved registers defined by ABI. * For BPF the set of such registers could be defined as follows: * - R0 is scratched only if function is non-void; * - R1-R5 are scratched only if corresponding parameter type is defined * in the function prototype. * * The contract between kernel and clang allows to simultaneously use * such functions and maintain backwards compatibility with old * kernels that don't understand bpf_fastcall calls: * * - for bpf_fastcall calls clang allocates registers as-if relevant r0-r5 * registers are not scratched by the call; * * - as a post-processing step, clang visits each bpf_fastcall call and adds * spill/fill for every live r0-r5; * * - stack offsets used for the spill/fill are allocated as lowest * stack offsets in whole function and are not used for any other * purposes; * * - when kernel loads a program, it looks for such patterns * (bpf_fastcall function surrounded by spills/fills) and checks if * spill/fill stack offsets are used exclusively in fastcall patterns; * * - if so, and if verifier or current JIT inlines the call to the * bpf_fastcall function (e.g. a helper call), kernel removes unnecessary * spill/fill pairs; * * - when old kernel loads a program, presence of spill/fill pairs * keeps BPF program valid, albeit slightly less efficient. * * For example: * * r1 = 1; * r2 = 2; * *(u64 *)(r10 - 8) = r1; r1 = 1; * *(u64 *)(r10 - 16) = r2; r2 = 2; * call %[to_be_inlined] --> call %[to_be_inlined] * r2 = *(u64 *)(r10 - 16); r0 = r1; * r1 = *(u64 *)(r10 - 8); r0 += r2; * r0 = r1; exit; * r0 += r2; * exit; * * The purpose of mark_fastcall_pattern_for_call is to: * - look for such patterns; * - mark spill and fill instructions in env->insn_aux_data[*].fastcall_pattern; * - mark set env->insn_aux_data[*].fastcall_spills_num for call instruction; * - update env->subprog_info[*]->fastcall_stack_off to find an offset * at which bpf_fastcall spill/fill stack slots start; * - update env->subprog_info[*]->keep_fastcall_stack. * * The .fastcall_pattern and .fastcall_stack_off are used by * check_fastcall_stack_contract() to check if every stack access to * fastcall spill/fill stack slot originates from spill/fill * instructions, members of fastcall patterns. * * If such condition holds true for a subprogram, fastcall patterns could * be rewritten by remove_fastcall_spills_fills(). 
* Otherwise bpf_fastcall patterns are not changed in the subprogram * (code, presumably, generated by an older clang version). * * For example, it is *not* safe to remove spill/fill below: * * r1 = 1; * *(u64 *)(r10 - 8) = r1; r1 = 1; * call %[to_be_inlined] --> call %[to_be_inlined] * r1 = *(u64 *)(r10 - 8); r0 = *(u64 *)(r10 - 8); <---- wrong !!! * r0 = *(u64 *)(r10 - 8); r0 += r1; * r0 += r1; exit; * exit; */ static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env, struct bpf_subprog_info *subprog, int insn_idx, s16 lowest_off) { struct bpf_insn *insns = env->prog->insnsi, *stx, *ldx; struct bpf_insn *call = &env->prog->insnsi[insn_idx]; u32 clobbered_regs_mask; struct bpf_call_summary cs; u32 expected_regs_mask; s16 off; int i; if (!bpf_get_call_summary(env, call, &cs)) return; /* A bitmask specifying which caller saved registers are clobbered * by a call to a helper/kfunc *as if* this helper/kfunc follows * bpf_fastcall contract: * - includes R0 if function is non-void; * - includes R1-R5 if corresponding parameter has is described * in the function prototype. */ clobbered_regs_mask = GENMASK(cs.num_params, cs.is_void ? 1 : 0); /* e.g. if helper call clobbers r{0,1}, expect r{2,3,4,5} in the pattern */ expected_regs_mask = ~clobbered_regs_mask & ALL_CALLER_SAVED_REGS; /* match pairs of form: * * *(u64 *)(r10 - Y) = rX (where Y % 8 == 0) * ... * call %[to_be_inlined] * ... * rX = *(u64 *)(r10 - Y) */ for (i = 1, off = lowest_off; i <= ARRAY_SIZE(caller_saved); ++i, off += BPF_REG_SIZE) { if (insn_idx - i < 0 || insn_idx + i >= env->prog->len) break; stx = &insns[insn_idx - i]; ldx = &insns[insn_idx + i]; /* must be a stack spill/fill pair */ if (stx->code != (BPF_STX | BPF_MEM | BPF_DW) || ldx->code != (BPF_LDX | BPF_MEM | BPF_DW) || stx->dst_reg != BPF_REG_10 || ldx->src_reg != BPF_REG_10) break; /* must be a spill/fill for the same reg */ if (stx->src_reg != ldx->dst_reg) break; /* must be one of the previously unseen registers */ if ((BIT(stx->src_reg) & expected_regs_mask) == 0) break; /* must be a spill/fill for the same expected offset, * no need to check offset alignment, BPF_DW stack access * is always 8-byte aligned. */ if (stx->off != off || ldx->off != off) break; expected_regs_mask &= ~BIT(stx->src_reg); env->insn_aux_data[insn_idx - i].fastcall_pattern = 1; env->insn_aux_data[insn_idx + i].fastcall_pattern = 1; } if (i == 1) return; /* Conditionally set 'fastcall_spills_num' to allow forward * compatibility when more helper functions are marked as * bpf_fastcall at compile time than current kernel supports, e.g: * * 1: *(u64 *)(r10 - 8) = r1 * 2: call A ;; assume A is bpf_fastcall for current kernel * 3: r1 = *(u64 *)(r10 - 8) * 4: *(u64 *)(r10 - 8) = r1 * 5: call B ;; assume B is not bpf_fastcall for current kernel * 6: r1 = *(u64 *)(r10 - 8) * * There is no need to block bpf_fastcall rewrite for such program. * Set 'fastcall_pattern' for both calls to keep check_fastcall_stack_contract() happy, * don't set 'fastcall_spills_num' for call B so that remove_fastcall_spills_fills() * does not remove spill/fill pair {4,6}. 
*/ if (cs.fastcall) env->insn_aux_data[insn_idx].fastcall_spills_num = i - 1; else subprog->keep_fastcall_stack = 1; subprog->fastcall_stack_off = min(subprog->fastcall_stack_off, off); } static int mark_fastcall_patterns(struct bpf_verifier_env *env) { struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn; s16 lowest_off; int s, i; for (s = 0; s < env->subprog_cnt; ++s, ++subprog) { /* find lowest stack spill offset used in this subprog */ lowest_off = 0; for (i = subprog->start; i < (subprog + 1)->start; ++i) { insn = env->prog->insnsi + i; if (insn->code != (BPF_STX | BPF_MEM | BPF_DW) || insn->dst_reg != BPF_REG_10) continue; lowest_off = min(lowest_off, insn->off); } /* use this offset to find fastcall patterns */ for (i = subprog->start; i < (subprog + 1)->start; ++i) { insn = env->prog->insnsi + i; if (insn->code != (BPF_JMP | BPF_CALL)) continue; mark_fastcall_pattern_for_call(env, subprog, i, lowest_off); } } return 0; } static void adjust_btf_func(struct bpf_verifier_env *env) { struct bpf_prog_aux *aux = env->prog->aux; int i; if (!aux->func_info) return; /* func_info is not available for hidden subprogs */ for (i = 0; i < env->subprog_cnt - env->hidden_subprog_cnt; i++) aux->func_info[i].insn_off = env->subprog_info[i].start; } /* Find id in idset and increment its count, or add new entry */ static void idset_cnt_inc(struct bpf_idset *idset, u32 id) { u32 i; for (i = 0; i < idset->num_ids; i++) { if (idset->entries[i].id == id) { idset->entries[i].cnt++; return; } } /* New id */ if (idset->num_ids < BPF_ID_MAP_SIZE) { idset->entries[idset->num_ids].id = id; idset->entries[idset->num_ids].cnt = 1; idset->num_ids++; } } /* Find id in idset and return its count, or 0 if not found */ static u32 idset_cnt_get(struct bpf_idset *idset, u32 id) { u32 i; for (i = 0; i < idset->num_ids; i++) { if (idset->entries[i].id == id) return idset->entries[i].cnt; } return 0; } /* * Clear singular scalar ids in a state. * A register with a non-zero id is called singular if no other register shares * the same base id. Such registers can be treated as independent (id=0). */ void bpf_clear_singular_ids(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { struct bpf_idset *idset = &env->idset_scratch; struct bpf_func_state *func; struct bpf_reg_state *reg; idset->num_ids = 0; bpf_for_each_reg_in_vstate(st, func, reg, ({ if (reg->type != SCALAR_VALUE) continue; if (!reg->id) continue; idset_cnt_inc(idset, reg->id & ~BPF_ADD_CONST); })); bpf_for_each_reg_in_vstate(st, func, reg, ({ if (reg->type != SCALAR_VALUE) continue; if (!reg->id) continue; if (idset_cnt_get(idset, reg->id & ~BPF_ADD_CONST) == 1) clear_scalar_id(reg); })); } /* Return true if it's OK to have the same insn return a different type. */ static bool reg_type_mismatch_ok(enum bpf_reg_type type) { switch (base_type(type)) { case PTR_TO_CTX: case PTR_TO_SOCKET: case PTR_TO_SOCK_COMMON: case PTR_TO_TCP_SOCK: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: case PTR_TO_ARENA: return false; default: return true; } } /* If an instruction was previously used with particular pointer types, then we * need to be careful to avoid cases such as the below, where it may be ok * for one branch accessing the pointer, but not ok for the other branch: * * R1 = sock_ptr * goto X; * ... * R1 = some_other_valid_ptr; * goto X; * ... 
* R2 = *(u32 *)(R1 + 0); */ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) { return src != prev && (!reg_type_mismatch_ok(src) || !reg_type_mismatch_ok(prev)); } static bool is_ptr_to_mem_or_btf_id(enum bpf_reg_type type) { switch (base_type(type)) { case PTR_TO_MEM: case PTR_TO_BTF_ID: return true; default: return false; } } static bool is_ptr_to_mem(enum bpf_reg_type type) { return base_type(type) == PTR_TO_MEM; } static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, bool allow_trust_mismatch) { enum bpf_reg_type *prev_type = &env->insn_aux_data[env->insn_idx].ptr_type; enum bpf_reg_type merged_type; if (*prev_type == NOT_INIT) { /* Saw a valid insn * dst_reg = *(u32 *)(src_reg + off) * save type to validate intersecting paths */ *prev_type = type; } else if (reg_type_mismatch(type, *prev_type)) { /* Abuser program is trying to use the same insn * dst_reg = *(u32*) (src_reg + off) * with different pointer types: * src_reg == ctx in one branch and * src_reg == stack|map in some other branch. * Reject it. */ if (allow_trust_mismatch && is_ptr_to_mem_or_btf_id(type) && is_ptr_to_mem_or_btf_id(*prev_type)) { /* * Have to support a use case when one path through * the program yields TRUSTED pointer while another * is UNTRUSTED. Fallback to UNTRUSTED to generate * BPF_PROBE_MEM/BPF_PROBE_MEMSX. * Same behavior of MEM_RDONLY flag. */ if (is_ptr_to_mem(type) || is_ptr_to_mem(*prev_type)) merged_type = PTR_TO_MEM; else merged_type = PTR_TO_BTF_ID; if ((type & PTR_UNTRUSTED) || (*prev_type & PTR_UNTRUSTED)) merged_type |= PTR_UNTRUSTED; if ((type & MEM_RDONLY) || (*prev_type & MEM_RDONLY)) merged_type |= MEM_RDONLY; *prev_type = merged_type; } else { verbose(env, "same insn cannot be used with different pointers\n"); return -EINVAL; } } return 0; } enum { PROCESS_BPF_EXIT = 1, INSN_IDX_UPDATED = 2, }; static int process_bpf_exit_full(struct bpf_verifier_env *env, bool *do_print_state, bool exception_exit) { struct bpf_func_state *cur_frame = cur_func(env); /* We must do check_reference_leak here before * prepare_func_exit to handle the case when * state->curframe > 0, it may be a callback function, * for which reference_state must match caller reference * state when it exits. */ int err = check_resource_leak(env, exception_exit, exception_exit || !env->cur_state->curframe, exception_exit ? "bpf_throw" : "BPF_EXIT instruction in main prog"); if (err) return err; /* The side effect of the prepare_func_exit which is * being skipped is that it frees bpf_func_state. * Typically, process_bpf_exit will only be hit with * outermost exit. copy_verifier_state in pop_stack will * handle freeing of any extra bpf_func_state left over * from not processing all nested function exits. We * also skip return code checks as they are not needed * for exceptional exits. */ if (exception_exit) return PROCESS_BPF_EXIT; if (env->cur_state->curframe) { /* exit from nested function */ err = prepare_func_exit(env, &env->insn_idx); if (err) return err; *do_print_state = true; return INSN_IDX_UPDATED; } /* * Return from a regular global subprogram differs from return * from the main program or async/exception callback. * Main program exit implies return code restrictions * that depend on program type. * Exit from exception callback is equivalent to main program exit. * Exit from async callback implies return code restrictions * that depend on async scheduling mechanism. 
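 *
 * In other words (summarizing the branch below): a regular global
 * subprog exit goes through check_global_subprog_return_code(), while
 * main program, exception callback and async callback exits go through
 * check_return_code() with the range appropriate for the program type
 * or callback.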
*/ if (cur_frame->subprogno && !cur_frame->in_async_callback_fn && !cur_frame->in_exception_callback_fn) err = check_global_subprog_return_code(env); else err = check_return_code(env, BPF_REG_0, "R0"); if (err) return err; return PROCESS_BPF_EXIT; } static int indirect_jump_min_max_index(struct bpf_verifier_env *env, int regno, struct bpf_map *map, u32 *pmin_index, u32 *pmax_index) { struct bpf_reg_state *reg = reg_state(env, regno); u64 min_index = reg->umin_value; u64 max_index = reg->umax_value; const u32 size = 8; if (min_index > (u64) U32_MAX * size) { verbose(env, "the sum of R%u umin_value %llu is too big\n", regno, reg->umin_value); return -ERANGE; } if (max_index > (u64) U32_MAX * size) { verbose(env, "the sum of R%u umax_value %llu is too big\n", regno, reg->umax_value); return -ERANGE; } min_index /= size; max_index /= size; if (max_index >= map->max_entries) { verbose(env, "R%u points to outside of jump table: [%llu,%llu] max_entries %u\n", regno, min_index, max_index, map->max_entries); return -EINVAL; } *pmin_index = min_index; *pmax_index = max_index; return 0; } /* gotox *dst_reg */ static int check_indirect_jump(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_verifier_state *other_branch; struct bpf_reg_state *dst_reg; struct bpf_map *map; u32 min_index, max_index; int err = 0; int n; int i; dst_reg = reg_state(env, insn->dst_reg); if (dst_reg->type != PTR_TO_INSN) { verbose(env, "R%d has type %s, expected PTR_TO_INSN\n", insn->dst_reg, reg_type_str(env, dst_reg->type)); return -EINVAL; } map = dst_reg->map_ptr; if (verifier_bug_if(!map, env, "R%d has an empty map pointer", insn->dst_reg)) return -EFAULT; if (verifier_bug_if(map->map_type != BPF_MAP_TYPE_INSN_ARRAY, env, "R%d has incorrect map type %d", insn->dst_reg, map->map_type)) return -EFAULT; err = indirect_jump_min_max_index(env, insn->dst_reg, map, &min_index, &max_index); if (err) return err; /* Ensure that the buffer is large enough */ if (!env->gotox_tmp_buf || env->gotox_tmp_buf->cnt < max_index - min_index + 1) { env->gotox_tmp_buf = bpf_iarray_realloc(env->gotox_tmp_buf, max_index - min_index + 1); if (!env->gotox_tmp_buf) return -ENOMEM; } n = bpf_copy_insn_array_uniq(map, min_index, max_index, env->gotox_tmp_buf->items); if (n < 0) return n; if (n == 0) { verbose(env, "register R%d doesn't point to any offset in map id=%d\n", insn->dst_reg, map->id); return -EINVAL; } for (i = 0; i < n - 1; i++) { mark_indirect_target(env, env->gotox_tmp_buf->items[i]); other_branch = push_stack(env, env->gotox_tmp_buf->items[i], env->insn_idx, env->cur_state->speculative); if (IS_ERR(other_branch)) return PTR_ERR(other_branch); } env->insn_idx = env->gotox_tmp_buf->items[n-1]; mark_indirect_target(env, env->insn_idx); return INSN_IDX_UPDATED; } static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) { int err; struct bpf_insn *insn = &env->prog->insnsi[env->insn_idx]; u8 class = BPF_CLASS(insn->code); switch (class) { case BPF_ALU: case BPF_ALU64: return check_alu_op(env, insn); case BPF_LDX: return check_load_mem(env, insn, false, BPF_MODE(insn->code) == BPF_MEMSX, true, "ldx"); case BPF_STX: if (BPF_MODE(insn->code) == BPF_ATOMIC) return check_atomic(env, insn); return check_store_reg(env, insn, false); case BPF_ST: { enum bpf_reg_type dst_reg_type; err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) return err; dst_reg_type = cur_regs(env)[insn->dst_reg].type; err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, false, 
false); if (err) return err; return save_aux_ptr_type(env, dst_reg_type, false); } case BPF_JMP: case BPF_JMP32: { u8 opcode = BPF_OP(insn->code); env->jmps_processed++; if (opcode == BPF_CALL) { if (env->cur_state->active_locks) { if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock && insn->imm != BPF_FUNC_kptr_xchg) || (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && (insn->off != 0 || !kfunc_spin_allowed(insn->imm)))) { verbose(env, "function calls are not allowed while holding a lock\n"); return -EINVAL; } } mark_reg_scratched(env, BPF_REG_0); if (insn->src_reg == BPF_PSEUDO_CALL) return check_func_call(env, insn, &env->insn_idx); if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) return check_kfunc_call(env, insn, &env->insn_idx); return check_helper_call(env, insn, &env->insn_idx); } else if (opcode == BPF_JA) { if (BPF_SRC(insn->code) == BPF_X) return check_indirect_jump(env, insn); if (class == BPF_JMP) env->insn_idx += insn->off + 1; else env->insn_idx += insn->imm + 1; return INSN_IDX_UPDATED; } else if (opcode == BPF_EXIT) { return process_bpf_exit_full(env, do_print_state, false); } return check_cond_jmp_op(env, insn, &env->insn_idx); } case BPF_LD: { u8 mode = BPF_MODE(insn->code); if (mode == BPF_ABS || mode == BPF_IND) return check_ld_abs(env, insn); if (mode == BPF_IMM) { err = check_ld_imm(env, insn); if (err) return err; env->insn_idx++; sanitize_mark_insn_seen(env); } return 0; } } /* all class values are handled above. silence compiler warning */ return -EFAULT; } static int do_check(struct bpf_verifier_env *env) { bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_verifier_state *state = env->cur_state; struct bpf_insn *insns = env->prog->insnsi; int insn_cnt = env->prog->len; bool do_print_state = false; int prev_insn_idx = -1; for (;;) { struct bpf_insn *insn; struct bpf_insn_aux_data *insn_aux; int err; /* reset current history entry on each new instruction */ env->cur_hist_ent = NULL; env->prev_insn_idx = prev_insn_idx; if (env->insn_idx >= insn_cnt) { verbose(env, "invalid insn idx %d insn_cnt %d\n", env->insn_idx, insn_cnt); return -EFAULT; } insn = &insns[env->insn_idx]; insn_aux = &env->insn_aux_data[env->insn_idx]; if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { verbose(env, "BPF program is too large. Processed %d insn\n", env->insn_processed); return -E2BIG; } state->last_insn_idx = env->prev_insn_idx; state->insn_idx = env->insn_idx; if (bpf_is_prune_point(env, env->insn_idx)) { err = bpf_is_state_visited(env, env->insn_idx); if (err < 0) return err; if (err == 1) { /* found equivalent state, can prune the search */ if (env->log.level & BPF_LOG_LEVEL) { if (do_print_state) verbose(env, "\nfrom %d to %d%s: safe\n", env->prev_insn_idx, env->insn_idx, env->cur_state->speculative ? " (speculative execution)" : ""); else verbose(env, "%d: safe\n", env->insn_idx); } goto process_bpf_exit; } } if (bpf_is_jmp_point(env, env->insn_idx)) { err = bpf_push_jmp_history(env, state, 0, 0); if (err) return err; } if (signal_pending(current)) return -EAGAIN; if (need_resched()) cond_resched(); if (env->log.level & BPF_LOG_LEVEL2 && do_print_state) { verbose(env, "\nfrom %d to %d%s:", env->prev_insn_idx, env->insn_idx, env->cur_state->speculative ? 
" (speculative execution)" : ""); print_verifier_state(env, state, state->curframe, true); do_print_state = false; } if (env->log.level & BPF_LOG_LEVEL) { if (verifier_state_scratched(env)) print_insn_state(env, state, state->curframe); verbose_linfo(env, env->insn_idx, "; "); env->prev_log_pos = env->log.end_pos; verbose(env, "%d: ", env->insn_idx); bpf_verbose_insn(env, insn); env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos; env->prev_log_pos = env->log.end_pos; } if (bpf_prog_is_offloaded(env->prog->aux)) { err = bpf_prog_offload_verify_insn(env, env->insn_idx, env->prev_insn_idx); if (err) return err; } sanitize_mark_insn_seen(env); prev_insn_idx = env->insn_idx; /* Sanity check: precomputed constants must match verifier state */ if (!state->speculative && insn_aux->const_reg_mask) { struct bpf_reg_state *regs = cur_regs(env); u16 mask = insn_aux->const_reg_mask; for (int r = 0; r < ARRAY_SIZE(insn_aux->const_reg_vals); r++) { u32 cval = insn_aux->const_reg_vals[r]; if (!(mask & BIT(r))) continue; if (regs[r].type != SCALAR_VALUE) continue; if (!tnum_is_const(regs[r].var_off)) continue; if (verifier_bug_if((u32)regs[r].var_off.value != cval, env, "const R%d: %u != %llu", r, cval, regs[r].var_off.value)) return -EFAULT; } } /* Reduce verification complexity by stopping speculative path * verification when a nospec is encountered. */ if (state->speculative && insn_aux->nospec) goto process_bpf_exit; err = do_check_insn(env, &do_print_state); if (error_recoverable_with_nospec(err) && state->speculative) { /* Prevent this speculative path from ever reaching the * insn that would have been unsafe to execute. */ insn_aux->nospec = true; /* If it was an ADD/SUB insn, potentially remove any * markings for alu sanitization. */ insn_aux->alu_state = 0; goto process_bpf_exit; } else if (err < 0) { return err; } else if (err == PROCESS_BPF_EXIT) { goto process_bpf_exit; } else if (err == INSN_IDX_UPDATED) { } else if (err == 0) { env->insn_idx++; } if (state->speculative && insn_aux->nospec_result) { /* If we are on a path that performed a jump-op, this * may skip a nospec patched-in after the jump. This can * currently never happen because nospec_result is only * used for the write-ops * `*(size*)(dst_reg+off)=src_reg|imm32` and helper * calls. These must never skip the following insn * (i.e., bpf_insn_successors()'s opcode_info.can_jump * is false). Still, add a warning to document this in * case nospec_result is used elsewhere in the future. * * All non-branch instructions have a single * fall-through edge. For these, nospec_result should * already work. */ if (verifier_bug_if((BPF_CLASS(insn->code) == BPF_JMP || BPF_CLASS(insn->code) == BPF_JMP32) && BPF_OP(insn->code) != BPF_CALL, env, "speculation barrier after jump instruction may not have the desired effect")) return -EFAULT; process_bpf_exit: mark_verifier_state_scratched(env); err = bpf_update_branch_counts(env, env->cur_state); if (err) return err; err = pop_stack(env, &prev_insn_idx, &env->insn_idx, pop_log); if (err < 0) { if (err != -ENOENT) return err; break; } else { do_print_state = true; continue; } } } return 0; } static int find_btf_percpu_datasec(struct btf *btf) { const struct btf_type *t; const char *tname; int i, n; /* * Both vmlinux and module each have their own ".data..percpu" * DATASECs in BTF. So for module's case, we need to skip vmlinux BTF * types to look at only module's own BTF types. 
*/ n = btf_nr_types(btf); for (i = btf_named_start_id(btf, true); i < n; i++) { t = btf_type_by_id(btf, i); if (BTF_INFO_KIND(t->info) != BTF_KIND_DATASEC) continue; tname = btf_name_by_offset(btf, t->name_off); if (!strcmp(tname, ".data..percpu")) return i; } return -ENOENT; } /* * Add btf to the env->used_btfs array. If needed, refcount the * corresponding kernel module. To simplify caller's logic * in case of error or if btf was added before the function * decreases the btf refcount. */ static int __add_used_btf(struct bpf_verifier_env *env, struct btf *btf) { struct btf_mod_pair *btf_mod; int ret = 0; int i; /* check whether we recorded this BTF (and maybe module) already */ for (i = 0; i < env->used_btf_cnt; i++) if (env->used_btfs[i].btf == btf) goto ret_put; if (env->used_btf_cnt >= MAX_USED_BTFS) { verbose(env, "The total number of btfs per program has reached the limit of %u\n", MAX_USED_BTFS); ret = -E2BIG; goto ret_put; } btf_mod = &env->used_btfs[env->used_btf_cnt]; btf_mod->btf = btf; btf_mod->module = NULL; /* if we reference variables from kernel module, bump its refcount */ if (btf_is_module(btf)) { btf_mod->module = btf_try_get_module(btf); if (!btf_mod->module) { ret = -ENXIO; goto ret_put; } } env->used_btf_cnt++; return 0; ret_put: /* Either error or this BTF was already added */ btf_put(btf); return ret; } /* replace pseudo btf_id with kernel symbol address */ static int __check_pseudo_btf_id(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn_aux_data *aux, struct btf *btf) { const struct btf_var_secinfo *vsi; const struct btf_type *datasec; const struct btf_type *t; const char *sym_name; bool percpu = false; u32 type, id = insn->imm; s32 datasec_id; u64 addr; int i; t = btf_type_by_id(btf, id); if (!t) { verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id); return -ENOENT; } if (!btf_type_is_var(t) && !btf_type_is_func(t)) { verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR or KIND_FUNC\n", id); return -EINVAL; } sym_name = btf_name_by_offset(btf, t->name_off); addr = kallsyms_lookup_name(sym_name); if (!addr) { verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n", sym_name); return -ENOENT; } insn[0].imm = (u32)addr; insn[1].imm = addr >> 32; if (btf_type_is_func(t)) { aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY; aux->btf_var.mem_size = 0; return 0; } datasec_id = find_btf_percpu_datasec(btf); if (datasec_id > 0) { datasec = btf_type_by_id(btf, datasec_id); for_each_vsi(i, datasec, vsi) { if (vsi->type == id) { percpu = true; break; } } } type = t->type; t = btf_type_skip_modifiers(btf, type, NULL); if (percpu) { aux->btf_var.reg_type = PTR_TO_BTF_ID | MEM_PERCPU; aux->btf_var.btf = btf; aux->btf_var.btf_id = type; } else if (!btf_type_is_struct(t)) { const struct btf_type *ret; const char *tname; u32 tsize; /* resolve the type size of ksym. 
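 * E.g. (illustrative) a ksym declared as 'u32' resolves to tsize = 4 and
 * the register becomes a read-only PTR_TO_MEM of that size below.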
*/ ret = btf_resolve_size(btf, t, &tsize); if (IS_ERR(ret)) { tname = btf_name_by_offset(btf, t->name_off); verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n", tname, PTR_ERR(ret)); return -EINVAL; } aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY; aux->btf_var.mem_size = tsize; } else { aux->btf_var.reg_type = PTR_TO_BTF_ID; aux->btf_var.btf = btf; aux->btf_var.btf_id = type; } return 0; } static int check_pseudo_btf_id(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn_aux_data *aux) { struct btf *btf; int btf_fd; int err; btf_fd = insn[1].imm; if (btf_fd) { btf = btf_get_by_fd(btf_fd); if (IS_ERR(btf)) { verbose(env, "invalid module BTF object FD specified.\n"); return -EINVAL; } } else { if (!btf_vmlinux) { verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); return -EINVAL; } btf_get(btf_vmlinux); btf = btf_vmlinux; } err = __check_pseudo_btf_id(env, insn, aux, btf); if (err) { btf_put(btf); return err; } return __add_used_btf(env, btf); } static bool is_tracing_prog_type(enum bpf_prog_type type) { switch (type) { case BPF_PROG_TYPE_KPROBE: case BPF_PROG_TYPE_TRACEPOINT: case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: return true; default: return false; } } static bool bpf_map_is_cgroup_storage(struct bpf_map *map) { return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); } static int check_map_prog_compatibility(struct bpf_verifier_env *env, struct bpf_map *map, struct bpf_prog *prog) { enum bpf_prog_type prog_type = resolve_prog_type(prog); if (map->excl_prog_sha && memcmp(map->excl_prog_sha, prog->digest, SHA256_DIGEST_SIZE)) { verbose(env, "program's hash doesn't match map's excl_prog_hash\n"); return -EACCES; } if (btf_record_has_field(map->record, BPF_LIST_HEAD) || btf_record_has_field(map->record, BPF_RB_ROOT)) { if (is_tracing_prog_type(prog_type)) { verbose(env, "tracing progs cannot use bpf_{list_head,rb_root} yet\n"); return -EINVAL; } } if (btf_record_has_field(map->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) { if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n"); return -EINVAL; } if (is_tracing_prog_type(prog_type)) { verbose(env, "tracing progs cannot use bpf_spin_lock yet\n"); return -EINVAL; } } if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) && !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); return -EINVAL; } if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { verbose(env, "bpf_struct_ops map cannot be used in prog\n"); return -EINVAL; } if (prog->sleepable) switch (map->map_type) { case BPF_MAP_TYPE_HASH: case BPF_MAP_TYPE_LRU_HASH: case BPF_MAP_TYPE_ARRAY: case BPF_MAP_TYPE_PERCPU_HASH: case BPF_MAP_TYPE_PERCPU_ARRAY: case BPF_MAP_TYPE_LRU_PERCPU_HASH: case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: case BPF_MAP_TYPE_RINGBUF: case BPF_MAP_TYPE_USER_RINGBUF: case BPF_MAP_TYPE_INODE_STORAGE: case BPF_MAP_TYPE_SK_STORAGE: case BPF_MAP_TYPE_TASK_STORAGE: case BPF_MAP_TYPE_CGRP_STORAGE: case BPF_MAP_TYPE_QUEUE: case BPF_MAP_TYPE_STACK: case BPF_MAP_TYPE_ARENA: case BPF_MAP_TYPE_INSN_ARRAY: case BPF_MAP_TYPE_PROG_ARRAY: break; default: verbose(env, "Sleepable programs can only use array, hash, ringbuf and local storage maps\n"); return -EINVAL; } if (bpf_map_is_cgroup_storage(map) && 
bpf_cgroup_storage_assign(env->prog->aux, map)) { verbose(env, "only one cgroup storage of each type is allowed\n"); return -EBUSY; } if (map->map_type == BPF_MAP_TYPE_ARENA) { if (env->prog->aux->arena) { verbose(env, "Only one arena per program\n"); return -EBUSY; } if (!env->allow_ptr_leaks || !env->bpf_capable) { verbose(env, "CAP_BPF and CAP_PERFMON are required to use arena\n"); return -EPERM; } if (!env->prog->jit_requested) { verbose(env, "JIT is required to use arena\n"); return -EOPNOTSUPP; } if (!bpf_jit_supports_arena()) { verbose(env, "JIT doesn't support arena\n"); return -EOPNOTSUPP; } env->prog->aux->arena = (void *)map; if (!bpf_arena_get_user_vm_start(env->prog->aux->arena)) { verbose(env, "arena's user address must be set via map_extra or mmap()\n"); return -EINVAL; } } return 0; } static int __add_used_map(struct bpf_verifier_env *env, struct bpf_map *map) { int i, err; /* check whether we recorded this map already */ for (i = 0; i < env->used_map_cnt; i++) if (env->used_maps[i] == map) return i; if (env->used_map_cnt >= MAX_USED_MAPS) { verbose(env, "The total number of maps per program has reached the limit of %u\n", MAX_USED_MAPS); return -E2BIG; } err = check_map_prog_compatibility(env, map, env->prog); if (err) return err; if (env->prog->sleepable) atomic64_inc(&map->sleepable_refcnt); /* hold the map. If the program is rejected by verifier, * the map will be released by release_maps() or it * will be used by the valid program until it's unloaded * and all maps are released in bpf_free_used_maps() */ bpf_map_inc(map); env->used_maps[env->used_map_cnt++] = map; if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) { err = bpf_insn_array_init(map, env->prog); if (err) { verbose(env, "Failed to properly initialize insn array\n"); return err; } env->insn_array_maps[env->insn_array_map_cnt++] = map; } return env->used_map_cnt - 1; } /* Add map behind fd to used maps list, if it's not already there, and return * its index. * Returns <0 on error, or >= 0 index, on success. 
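 *
 * Illustrative use (as done by the ld_imm64 resolution code below):
 *
 *   map_idx = add_used_map(env, fd);
 *   if (map_idx < 0)
 *           return map_idx;
 *   map = env->used_maps[map_idx];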
*/ static int add_used_map(struct bpf_verifier_env *env, int fd) { struct bpf_map *map; CLASS(fd, f)(fd); map = __bpf_map_get(f); if (IS_ERR(map)) { verbose(env, "fd %d is not pointing to valid bpf_map\n", fd); return PTR_ERR(map); } return __add_used_map(env, map); } static int check_alu_fields(struct bpf_verifier_env *env, struct bpf_insn *insn) { u8 class = BPF_CLASS(insn->code); u8 opcode = BPF_OP(insn->code); switch (opcode) { case BPF_NEG: if (BPF_SRC(insn->code) != BPF_K || insn->src_reg != BPF_REG_0 || insn->off != 0 || insn->imm != 0) { verbose(env, "BPF_NEG uses reserved fields\n"); return -EINVAL; } return 0; case BPF_END: if (insn->src_reg != BPF_REG_0 || insn->off != 0 || (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) || (class == BPF_ALU64 && BPF_SRC(insn->code) != BPF_TO_LE)) { verbose(env, "BPF_END uses reserved fields\n"); return -EINVAL; } return 0; case BPF_MOV: if (BPF_SRC(insn->code) == BPF_X) { if (class == BPF_ALU) { if ((insn->off != 0 && insn->off != 8 && insn->off != 16) || insn->imm) { verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } } else if (insn->off == BPF_ADDR_SPACE_CAST) { if (insn->imm != 1 && insn->imm != 1u << 16) { verbose(env, "addr_space_cast insn can only convert between address space 1 and 0\n"); return -EINVAL; } } else if ((insn->off != 0 && insn->off != 8 && insn->off != 16 && insn->off != 32) || insn->imm) { verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } } else if (insn->src_reg != BPF_REG_0 || insn->off != 0) { verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } return 0; case BPF_ADD: case BPF_SUB: case BPF_AND: case BPF_OR: case BPF_XOR: case BPF_LSH: case BPF_RSH: case BPF_ARSH: case BPF_MUL: case BPF_DIV: case BPF_MOD: if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0 || (insn->off != 0 && insn->off != 1) || (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) { verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; } } else if (insn->src_reg != BPF_REG_0 || (insn->off != 0 && insn->off != 1) || (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) { verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; } return 0; default: verbose(env, "invalid BPF_ALU opcode %x\n", opcode); return -EINVAL; } } static int check_jmp_fields(struct bpf_verifier_env *env, struct bpf_insn *insn) { u8 class = BPF_CLASS(insn->code); u8 opcode = BPF_OP(insn->code); switch (opcode) { case BPF_CALL: if (BPF_SRC(insn->code) != BPF_K || (insn->src_reg != BPF_PSEUDO_KFUNC_CALL && insn->off != 0) || (insn->src_reg != BPF_REG_0 && insn->src_reg != BPF_PSEUDO_CALL && insn->src_reg != BPF_PSEUDO_KFUNC_CALL) || insn->dst_reg != BPF_REG_0 || class == BPF_JMP32) { verbose(env, "BPF_CALL uses reserved fields\n"); return -EINVAL; } return 0; case BPF_JA: if (BPF_SRC(insn->code) == BPF_X) { if (insn->src_reg != BPF_REG_0 || insn->imm != 0 || insn->off != 0) { verbose(env, "BPF_JA|BPF_X uses reserved fields\n"); return -EINVAL; } } else if (insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0 || (class == BPF_JMP && insn->imm != 0) || (class == BPF_JMP32 && insn->off != 0)) { verbose(env, "BPF_JA uses reserved fields\n"); return -EINVAL; } return 0; case BPF_EXIT: if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0 || class == BPF_JMP32) { verbose(env, "BPF_EXIT uses reserved fields\n"); return -EINVAL; } return 0; case BPF_JCOND: if (insn->code != (BPF_JMP | BPF_JCOND) || insn->src_reg != BPF_MAY_GOTO || insn->dst_reg || 
insn->imm) { verbose(env, "invalid may_goto imm %d\n", insn->imm); return -EINVAL; } return 0; default: if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0) { verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); return -EINVAL; } } else if (insn->src_reg != BPF_REG_0) { verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); return -EINVAL; } return 0; } } static int check_insn_fields(struct bpf_verifier_env *env, struct bpf_insn *insn) { switch (BPF_CLASS(insn->code)) { case BPF_ALU: case BPF_ALU64: return check_alu_fields(env, insn); case BPF_LDX: if ((BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) || insn->imm != 0) { verbose(env, "BPF_LDX uses reserved fields\n"); return -EINVAL; } return 0; case BPF_STX: if (BPF_MODE(insn->code) == BPF_ATOMIC) return 0; if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) { verbose(env, "BPF_STX uses reserved fields\n"); return -EINVAL; } return 0; case BPF_ST: if (BPF_MODE(insn->code) != BPF_MEM || insn->src_reg != BPF_REG_0) { verbose(env, "BPF_ST uses reserved fields\n"); return -EINVAL; } return 0; case BPF_JMP: case BPF_JMP32: return check_jmp_fields(env, insn); case BPF_LD: { u8 mode = BPF_MODE(insn->code); if (mode == BPF_ABS || mode == BPF_IND) { if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n"); return -EINVAL; } } else if (mode != BPF_IMM) { verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; } return 0; } default: verbose(env, "unknown insn class %d\n", BPF_CLASS(insn->code)); return -EINVAL; } } /* * Check that insns are sane and rewrite pseudo imm in ld_imm64 instructions: * * 1. if it accesses map FD, replace it with actual map pointer. * 2. if it accesses btf_id of a VAR, replace it with pointer to the var. * * NOTE: btf_vmlinux is required for converting pseudo btf_id. */ static int check_and_resolve_insns(struct bpf_verifier_env *env) { struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; int i, err; err = bpf_prog_calc_tag(env->prog); if (err) return err; for (i = 0; i < insn_cnt; i++, insn++) { if (insn->dst_reg >= MAX_BPF_REG) { verbose(env, "R%d is invalid\n", insn->dst_reg); return -EINVAL; } if (insn->src_reg >= MAX_BPF_REG) { verbose(env, "R%d is invalid\n", insn->src_reg); return -EINVAL; } if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { struct bpf_insn_aux_data *aux; struct bpf_map *map; int map_idx; u64 addr; u32 fd; if (i == insn_cnt - 1 || insn[1].code != 0 || insn[1].dst_reg != 0 || insn[1].src_reg != 0 || insn[1].off != 0) { verbose(env, "invalid bpf_ld_imm64 insn\n"); return -EINVAL; } if (insn[0].off != 0) { verbose(env, "BPF_LD_IMM64 uses reserved fields\n"); return -EINVAL; } if (insn[0].src_reg == 0) /* valid generic load 64-bit imm */ goto next_insn; if (insn[0].src_reg == BPF_PSEUDO_BTF_ID) { aux = &env->insn_aux_data[i]; err = check_pseudo_btf_id(env, insn, aux); if (err) return err; goto next_insn; } if (insn[0].src_reg == BPF_PSEUDO_FUNC) { aux = &env->insn_aux_data[i]; aux->ptr_type = PTR_TO_FUNC; goto next_insn; } /* In final convert_pseudo_ld_imm64() step, this is * converted into regular 64-bit imm load insn. 
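 *
 * I.e. after the fixups below insn[0].imm/insn[1].imm hold the low/high
 * 32 bits of the resolved address, and convert_pseudo_ld_imm64() later
 * clears src_reg so the JIT sees a plain 64-bit immediate load.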
*/ switch (insn[0].src_reg) { case BPF_PSEUDO_MAP_VALUE: case BPF_PSEUDO_MAP_IDX_VALUE: break; case BPF_PSEUDO_MAP_FD: case BPF_PSEUDO_MAP_IDX: if (insn[1].imm == 0) break; fallthrough; default: verbose(env, "unrecognized bpf_ld_imm64 insn\n"); return -EINVAL; } switch (insn[0].src_reg) { case BPF_PSEUDO_MAP_IDX_VALUE: case BPF_PSEUDO_MAP_IDX: if (bpfptr_is_null(env->fd_array)) { verbose(env, "fd_idx without fd_array is invalid\n"); return -EPROTO; } if (copy_from_bpfptr_offset(&fd, env->fd_array, insn[0].imm * sizeof(fd), sizeof(fd))) return -EFAULT; break; default: fd = insn[0].imm; break; } map_idx = add_used_map(env, fd); if (map_idx < 0) return map_idx; map = env->used_maps[map_idx]; aux = &env->insn_aux_data[i]; aux->map_index = map_idx; if (insn[0].src_reg == BPF_PSEUDO_MAP_FD || insn[0].src_reg == BPF_PSEUDO_MAP_IDX) { addr = (unsigned long)map; } else { u32 off = insn[1].imm; if (!map->ops->map_direct_value_addr) { verbose(env, "no direct value access support for this map type\n"); return -EINVAL; } err = map->ops->map_direct_value_addr(map, &addr, off); if (err) { verbose(env, "invalid access to map value pointer, value_size=%u off=%u\n", map->value_size, off); return err; } aux->map_off = off; addr += off; } insn[0].imm = (u32)addr; insn[1].imm = addr >> 32; next_insn: insn++; i++; continue; } /* Basic sanity check before we invest more work here. */ if (!bpf_opcode_in_insntable(insn->code)) { verbose(env, "unknown opcode %02x\n", insn->code); return -EINVAL; } err = check_insn_fields(env, insn); if (err) return err; } /* now all pseudo BPF_LD_IMM64 instructions load valid * 'struct bpf_map *' into a register instead of user map_fd. * These pointers will be used later by verifier to validate map access. */ return 0; } /* drop refcnt of maps used by the rejected program */ static void release_maps(struct bpf_verifier_env *env) { __bpf_free_used_maps(env->prog->aux, env->used_maps, env->used_map_cnt); } /* drop refcnt of maps used by the rejected program */ static void release_btfs(struct bpf_verifier_env *env) { __bpf_free_used_btfs(env->used_btfs, env->used_btf_cnt); } /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) { struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; int i; for (i = 0; i < insn_cnt; i++, insn++) { if (insn->code != (BPF_LD | BPF_IMM | BPF_DW)) continue; if (insn->src_reg == BPF_PSEUDO_FUNC) continue; insn->src_reg = 0; } } static void release_insn_arrays(struct bpf_verifier_env *env) { int i; for (i = 0; i < env->insn_array_map_cnt; i++) bpf_insn_array_release(env->insn_array_maps[i]); } /* The verifier does more data flow analysis than llvm and will not * explore branches that are dead at run time. Malicious programs can * have dead code too. Therefore replace all dead at-run-time code * with 'ja -1'. * * Just nops are not optimal, e.g. if they would sit at the end of the * program and through another bug we would manage to jump there, then * we'd execute beyond program memory otherwise. Returning exception * code also wouldn't work since we can have subprogs where the dead * code could be located. 
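 *
 * Note that with off == -1 the trap instruction jumps to itself
 * (target = pc + off + 1 = pc), so even if it were ever reached through
 * a bug it would spin in place rather than fall through into unrelated
 * instructions.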
*/ static void sanitize_dead_code(struct bpf_verifier_env *env) { struct bpf_insn_aux_data *aux_data = env->insn_aux_data; struct bpf_insn trap = BPF_JMP_IMM(BPF_JA, 0, 0, -1); struct bpf_insn *insn = env->prog->insnsi; const int insn_cnt = env->prog->len; int i; for (i = 0; i < insn_cnt; i++) { if (aux_data[i].seen) continue; memcpy(insn + i, &trap, sizeof(trap)); aux_data[i].zext_dst = false; } } static void free_states(struct bpf_verifier_env *env) { struct bpf_verifier_state_list *sl; struct list_head *head, *pos, *tmp; struct bpf_scc_info *info; int i, j; bpf_free_verifier_state(env->cur_state, true); env->cur_state = NULL; while (!pop_stack(env, NULL, NULL, false)); list_for_each_safe(pos, tmp, &env->free_list) { sl = container_of(pos, struct bpf_verifier_state_list, node); bpf_free_verifier_state(&sl->state, false); kfree(sl); } INIT_LIST_HEAD(&env->free_list); for (i = 0; i < env->scc_cnt; ++i) { info = env->scc_info[i]; if (!info) continue; for (j = 0; j < info->num_visits; j++) bpf_free_backedges(&info->visits[j]); kvfree(info); env->scc_info[i] = NULL; } if (!env->explored_states) return; for (i = 0; i < state_htab_size(env); i++) { head = &env->explored_states[i]; list_for_each_safe(pos, tmp, head) { sl = container_of(pos, struct bpf_verifier_state_list, node); bpf_free_verifier_state(&sl->state, false); kfree(sl); } INIT_LIST_HEAD(&env->explored_states[i]); } } static int do_check_common(struct bpf_verifier_env *env, int subprog) { bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_subprog_info *sub = subprog_info(env, subprog); struct bpf_prog_aux *aux = env->prog->aux; struct bpf_verifier_state *state; struct bpf_reg_state *regs; int ret, i; env->prev_linfo = NULL; env->pass_cnt++; state = kzalloc_obj(struct bpf_verifier_state, GFP_KERNEL_ACCOUNT); if (!state) return -ENOMEM; state->curframe = 0; state->speculative = false; state->branches = 1; state->in_sleepable = env->prog->sleepable; state->frame[0] = kzalloc_obj(struct bpf_func_state, GFP_KERNEL_ACCOUNT); if (!state->frame[0]) { kfree(state); return -ENOMEM; } env->cur_state = state; init_func_state(env, state->frame[0], BPF_MAIN_FUNC /* callsite */, 0 /* frameno */, subprog); state->first_insn_idx = env->subprog_info[subprog].start; state->last_insn_idx = -1; regs = state->frame[state->curframe]->regs; if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) { const char *sub_name = subprog_name(env, subprog); struct bpf_subprog_arg_info *arg; struct bpf_reg_state *reg; if (env->log.level & BPF_LOG_LEVEL) verbose(env, "Validating %s() func#%d...\n", sub_name, subprog); ret = btf_prepare_func_args(env, subprog); if (ret) goto out; if (subprog_is_exc_cb(env, subprog)) { state->frame[0]->in_exception_callback_fn = true; /* * Global functions are scalar or void, make sure * we return a scalar. */ if (subprog_returns_void(env, subprog)) { verbose(env, "exception cb cannot return void\n"); ret = -EINVAL; goto out; } /* Also ensure the callback only has a single scalar argument. 
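 * I.e. the expected shape is a single ARG_ANYTHING (scalar) parameter,
 * which carries the cookie value passed to bpf_throw().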
*/ if (sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_ANYTHING) { verbose(env, "exception cb only supports single integer argument\n"); ret = -EINVAL; goto out; } } for (i = BPF_REG_1; i <= sub->arg_cnt; i++) { arg = &sub->args[i - BPF_REG_1]; reg = &regs[i]; if (arg->arg_type == ARG_PTR_TO_CTX) { reg->type = PTR_TO_CTX; mark_reg_known_zero(env, regs, i); } else if (arg->arg_type == ARG_ANYTHING) { reg->type = SCALAR_VALUE; mark_reg_unknown(env, regs, i); } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) { /* assume unspecial LOCAL dynptr type */ __mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen); } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) { reg->type = PTR_TO_MEM; reg->type |= arg->arg_type & (PTR_MAYBE_NULL | PTR_UNTRUSTED | MEM_RDONLY); mark_reg_known_zero(env, regs, i); reg->mem_size = arg->mem_size; if (arg->arg_type & PTR_MAYBE_NULL) reg->id = ++env->id_gen; } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { reg->type = PTR_TO_BTF_ID; if (arg->arg_type & PTR_MAYBE_NULL) reg->type |= PTR_MAYBE_NULL; if (arg->arg_type & PTR_UNTRUSTED) reg->type |= PTR_UNTRUSTED; if (arg->arg_type & PTR_TRUSTED) reg->type |= PTR_TRUSTED; mark_reg_known_zero(env, regs, i); reg->btf = bpf_get_btf_vmlinux(); /* can't fail at this point */ reg->btf_id = arg->btf_id; reg->id = ++env->id_gen; } else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) { /* caller can pass either PTR_TO_ARENA or SCALAR */ mark_reg_unknown(env, regs, i); } else { verifier_bug(env, "unhandled arg#%d type %d", i - BPF_REG_1, arg->arg_type); ret = -EFAULT; goto out; } } } else { /* if main BPF program has associated BTF info, validate that * it's matching expected signature, and otherwise mark BTF * info for main program as unreliable */ if (env->prog->aux->func_info_aux) { ret = btf_prepare_func_args(env, 0); if (ret || sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_PTR_TO_CTX) env->prog->aux->func_info_aux[0].unreliable = true; } /* 1st arg to a function */ regs[BPF_REG_1].type = PTR_TO_CTX; mark_reg_known_zero(env, regs, BPF_REG_1); } /* Acquire references for struct_ops program arguments tagged with "__ref" */ if (!subprog && env->prog->type == BPF_PROG_TYPE_STRUCT_OPS) { for (i = 0; i < aux->ctx_arg_info_size; i++) aux->ctx_arg_info[i].ref_obj_id = aux->ctx_arg_info[i].refcounted ? acquire_reference(env, 0) : 0; } ret = do_check(env); out: if (!ret && pop_log) bpf_vlog_reset(&env->log, 0); free_states(env); return ret; } /* Lazily verify all global functions based on their BTF, if they are called * from main BPF program or any of subprograms transitively. * BPF global subprogs called from dead code are not validated. * All callable global functions must pass verification. * Otherwise the whole program is rejected. * Consider: * int bar(int); * int foo(int f) * { * return bar(f); * } * int bar(int b) * { * ... * } * foo() will be verified first for R1=any_scalar_value. During verification it * will be assumed that bar() already verified successfully and call to bar() * from foo() will be checked for type match only. Later bar() will be verified * independently to check that it's safe for R1=any_scalar_value.
*/ static int do_check_subprogs(struct bpf_verifier_env *env) { struct bpf_prog_aux *aux = env->prog->aux; struct bpf_func_info_aux *sub_aux; int i, ret, new_cnt; if (!aux->func_info) return 0; /* exception callback is presumed to be always called */ if (env->exception_callback_subprog) subprog_aux(env, env->exception_callback_subprog)->called = true; again: new_cnt = 0; for (i = 1; i < env->subprog_cnt; i++) { if (!bpf_subprog_is_global(env, i)) continue; sub_aux = subprog_aux(env, i); if (!sub_aux->called || sub_aux->verified) continue; env->insn_idx = env->subprog_info[i].start; WARN_ON_ONCE(env->insn_idx == 0); ret = do_check_common(env, i); if (ret) { return ret; } else if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "Func#%d ('%s') is safe for any args that match its prototype\n", i, subprog_name(env, i)); } /* We verified new global subprog, it might have called some * more global subprogs that we haven't verified yet, so we * need to do another pass over subprogs to verify those. */ sub_aux->verified = true; new_cnt++; } /* We can't loop forever as we verify at least one global subprog on * each pass. */ if (new_cnt) goto again; return 0; } static int do_check_main(struct bpf_verifier_env *env) { int ret; env->insn_idx = 0; ret = do_check_common(env, 0); if (!ret) env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; return ret; } static void print_verification_stats(struct bpf_verifier_env *env) { int i; if (env->log.level & BPF_LOG_STATS) { verbose(env, "verification time %lld usec\n", div_u64(env->verification_time, 1000)); verbose(env, "stack depth "); for (i = 0; i < env->subprog_cnt; i++) { u32 depth = env->subprog_info[i].stack_depth; verbose(env, "%d", depth); if (i + 1 < env->subprog_cnt) verbose(env, "+"); } verbose(env, "\n"); } verbose(env, "processed %d insns (limit %d) max_states_per_insn %d " "total_states %d peak_states %d mark_read %d\n", env->insn_processed, BPF_COMPLEXITY_LIMIT_INSNS, env->max_states_per_insn, env->total_states, env->peak_states, env->longest_mark_read_walk); } int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog, const struct bpf_ctx_arg_aux *info, u32 cnt) { prog->aux->ctx_arg_info = kmemdup_array(info, cnt, sizeof(*info), GFP_KERNEL_ACCOUNT); prog->aux->ctx_arg_info_size = cnt; return prog->aux->ctx_arg_info ? 
0 : -ENOMEM; } static int check_struct_ops_btf_id(struct bpf_verifier_env *env) { const struct btf_type *t, *func_proto; const struct bpf_struct_ops_desc *st_ops_desc; const struct bpf_struct_ops *st_ops; const struct btf_member *member; struct bpf_prog *prog = env->prog; bool has_refcounted_arg = false; u32 btf_id, member_idx, member_off; struct btf *btf; const char *mname; int i, err; if (!prog->gpl_compatible) { verbose(env, "struct ops programs must have a GPL compatible license\n"); return -EINVAL; } if (!prog->aux->attach_btf_id) return -ENOTSUPP; btf = prog->aux->attach_btf; if (btf_is_module(btf)) { /* Make sure st_ops is valid through the lifetime of env */ env->attach_btf_mod = btf_try_get_module(btf); if (!env->attach_btf_mod) { verbose(env, "struct_ops module %s is not found\n", btf_get_name(btf)); return -ENOTSUPP; } } btf_id = prog->aux->attach_btf_id; st_ops_desc = bpf_struct_ops_find(btf, btf_id); if (!st_ops_desc) { verbose(env, "attach_btf_id %u is not a supported struct\n", btf_id); return -ENOTSUPP; } st_ops = st_ops_desc->st_ops; t = st_ops_desc->type; member_idx = prog->expected_attach_type; if (member_idx >= btf_type_vlen(t)) { verbose(env, "attach to invalid member idx %u of struct %s\n", member_idx, st_ops->name); return -EINVAL; } member = &btf_type_member(t)[member_idx]; mname = btf_name_by_offset(btf, member->name_off); func_proto = btf_type_resolve_func_ptr(btf, member->type, NULL); if (!func_proto) { verbose(env, "attach to invalid member %s(@idx %u) of struct %s\n", mname, member_idx, st_ops->name); return -EINVAL; } member_off = __btf_member_bit_offset(t, member) / 8; err = bpf_struct_ops_supported(st_ops, member_off); if (err) { verbose(env, "attach to unsupported member %s of struct %s\n", mname, st_ops->name); return err; } if (st_ops->check_member) { err = st_ops->check_member(t, member, prog); if (err) { verbose(env, "attach to unsupported member %s of struct %s\n", mname, st_ops->name); return err; } } if (prog->aux->priv_stack_requested && !bpf_jit_supports_private_stack()) { verbose(env, "Private stack not supported by jit\n"); return -EACCES; } for (i = 0; i < st_ops_desc->arg_info[member_idx].cnt; i++) { if (st_ops_desc->arg_info[member_idx].info[i].refcounted) { has_refcounted_arg = true; break; } } /* Tail call is not allowed for programs with refcounted arguments since we * cannot guarantee that valid refcounted kptrs will be passed to the callee. */ for (i = 0; i < env->subprog_cnt; i++) { if (has_refcounted_arg && env->subprog_info[i].has_tail_call) { verbose(env, "program with __ref argument cannot tail call\n"); return -EINVAL; } } prog->aux->st_ops = st_ops; prog->aux->attach_st_ops_member_off = member_off; prog->aux->attach_func_proto = func_proto; prog->aux->attach_func_name = mname; env->ops = st_ops->verifier_ops; return bpf_prog_ctx_arg_info_init(prog, st_ops_desc->arg_info[member_idx].info, st_ops_desc->arg_info[member_idx].cnt); } #define SECURITY_PREFIX "security_" #ifdef CONFIG_FUNCTION_ERROR_INJECTION /* list of non-sleepable functions that are otherwise on * ALLOW_ERROR_INJECTION list */ BTF_SET_START(btf_non_sleepable_error_inject) /* Three functions below can be called from sleepable and non-sleepable context. * Assume non-sleepable from bpf safety point of view. 
*/ BTF_ID(func, __filemap_add_folio) #ifdef CONFIG_FAIL_PAGE_ALLOC BTF_ID(func, should_fail_alloc_page) #endif #ifdef CONFIG_FAILSLAB BTF_ID(func, should_failslab) #endif BTF_SET_END(btf_non_sleepable_error_inject) static int check_non_sleepable_error_inject(u32 btf_id) { return btf_id_set_contains(&btf_non_sleepable_error_inject, btf_id); } static int check_attach_sleepable(u32 btf_id, unsigned long addr, const char *func_name) { /* fentry/fexit/fmod_ret progs can be sleepable if they are * attached to ALLOW_ERROR_INJECTION and are not in denylist. */ if (!check_non_sleepable_error_inject(btf_id) && within_error_injection_list(addr)) return 0; return -EINVAL; } static int check_attach_modify_return(unsigned long addr, const char *func_name) { if (within_error_injection_list(addr) || !strncmp(SECURITY_PREFIX, func_name, sizeof(SECURITY_PREFIX) - 1)) return 0; return -EINVAL; } #else /* Unfortunately, the arch-specific prefixes are hard-coded in arch syscall code * so we need to hard-code them, too. Ftrace has arch_syscall_match_sym_name() * but that just compares two concrete function names. */ static bool has_arch_syscall_prefix(const char *func_name) { #if defined(__x86_64__) return !strncmp(func_name, "__x64_", 6); #elif defined(__i386__) return !strncmp(func_name, "__ia32_", 7); #elif defined(__s390x__) return !strncmp(func_name, "__s390x_", 8); #elif defined(__aarch64__) return !strncmp(func_name, "__arm64_", 8); #elif defined(__riscv) return !strncmp(func_name, "__riscv_", 8); #elif defined(__powerpc__) || defined(__powerpc64__) return !strncmp(func_name, "sys_", 4); #elif defined(__loongarch__) return !strncmp(func_name, "sys_", 4); #else return false; #endif } /* Without error injection, allow sleepable and fmod_ret progs on syscalls. */ static int check_attach_sleepable(u32 btf_id, unsigned long addr, const char *func_name) { if (has_arch_syscall_prefix(func_name)) return 0; return -EINVAL; } static int check_attach_modify_return(unsigned long addr, const char *func_name) { if (has_arch_syscall_prefix(func_name) || !strncmp(SECURITY_PREFIX, func_name, sizeof(SECURITY_PREFIX) - 1)) return 0; return -EINVAL; } #endif /* CONFIG_FUNCTION_ERROR_INJECTION */ int bpf_check_attach_target(struct bpf_verifier_log *log, const struct bpf_prog *prog, const struct bpf_prog *tgt_prog, u32 btf_id, struct bpf_attach_target_info *tgt_info) { bool prog_extension = prog->type == BPF_PROG_TYPE_EXT; bool prog_tracing = prog->type == BPF_PROG_TYPE_TRACING; char trace_symbol[KSYM_SYMBOL_LEN]; const char prefix[] = "btf_trace_"; struct bpf_raw_event_map *btp; int ret = 0, subprog = -1, i; const struct btf_type *t; bool conservative = true; const char *tname, *fname; struct btf *btf; long addr = 0; struct module *mod = NULL; if (!btf_id) { bpf_log(log, "Tracing programs must provide btf_id\n"); return -EINVAL; } btf = tgt_prog ? 
tgt_prog->aux->btf : prog->aux->attach_btf; if (!btf) { bpf_log(log, "Tracing program can only be attached to another program annotated with BTF\n"); return -EINVAL; } t = btf_type_by_id(btf, btf_id); if (!t) { bpf_log(log, "attach_btf_id %u is invalid\n", btf_id); return -EINVAL; } tname = btf_name_by_offset(btf, t->name_off); if (!tname) { bpf_log(log, "attach_btf_id %u doesn't have a name\n", btf_id); return -EINVAL; } if (tgt_prog) { struct bpf_prog_aux *aux = tgt_prog->aux; bool tgt_changes_pkt_data; bool tgt_might_sleep; if (bpf_prog_is_dev_bound(prog->aux) && !bpf_prog_dev_bound_match(prog, tgt_prog)) { bpf_log(log, "Target program bound device mismatch"); return -EINVAL; } for (i = 0; i < aux->func_info_cnt; i++) if (aux->func_info[i].type_id == btf_id) { subprog = i; break; } if (subprog == -1) { bpf_log(log, "Subprog %s doesn't exist\n", tname); return -EINVAL; } if (aux->func && aux->func[subprog]->aux->exception_cb) { bpf_log(log, "%s programs cannot attach to exception callback\n", prog_extension ? "Extension" : "Tracing"); return -EINVAL; } conservative = aux->func_info_aux[subprog].unreliable; if (prog_extension) { if (conservative) { bpf_log(log, "Cannot replace static functions\n"); return -EINVAL; } if (!prog->jit_requested) { bpf_log(log, "Extension programs should be JITed\n"); return -EINVAL; } tgt_changes_pkt_data = aux->func ? aux->func[subprog]->aux->changes_pkt_data : aux->changes_pkt_data; if (prog->aux->changes_pkt_data && !tgt_changes_pkt_data) { bpf_log(log, "Extension program changes packet data, while original does not\n"); return -EINVAL; } tgt_might_sleep = aux->func ? aux->func[subprog]->aux->might_sleep : aux->might_sleep; if (prog->aux->might_sleep && !tgt_might_sleep) { bpf_log(log, "Extension program may sleep, while original does not\n"); return -EINVAL; } } if (!tgt_prog->jited) { bpf_log(log, "Can attach to only JITed progs\n"); return -EINVAL; } if (prog_tracing) { if (aux->attach_tracing_prog) { /* * Target program is an fentry/fexit which is already attached * to another tracing program. More levels of nesting * attachment are not allowed. */ bpf_log(log, "Cannot nest tracing program attach more than once\n"); return -EINVAL; } } else if (tgt_prog->type == prog->type) { /* * To avoid potential call chain cycles, prevent attaching of a * program extension to another extension. It's ok to attach * fentry/fexit to extension program. */ bpf_log(log, "Cannot recursively attach\n"); return -EINVAL; } if (tgt_prog->type == BPF_PROG_TYPE_TRACING && prog_extension && (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY || tgt_prog->expected_attach_type == BPF_TRACE_FEXIT || tgt_prog->expected_attach_type == BPF_TRACE_FSESSION)) { /* Program extensions can extend all program types * except fentry/fexit. The reason is the following. * The fentry/fexit programs are used for performance * analysis, stats and can be attached to any program * type. When extension program is replacing XDP function * it is necessary to allow performance analysis of all * functions. Both original XDP program and its program * extension. Hence attaching fentry/fexit to * BPF_PROG_TYPE_EXT is allowed. If extending of * fentry/fexit was allowed it would be possible to create * long call chain fentry->extension->fentry->extension * beyond reasonable stack size. Hence extending fentry * is not allowed. 
*/ bpf_log(log, "Cannot extend fentry/fexit/fsession\n"); return -EINVAL; } } else { if (prog_extension) { bpf_log(log, "Cannot replace kernel functions\n"); return -EINVAL; } } switch (prog->expected_attach_type) { case BPF_TRACE_RAW_TP: if (tgt_prog) { bpf_log(log, "Only FENTRY/FEXIT/FSESSION progs are attachable to another BPF prog\n"); return -EINVAL; } if (!btf_type_is_typedef(t)) { bpf_log(log, "attach_btf_id %u is not a typedef\n", btf_id); return -EINVAL; } if (strncmp(prefix, tname, sizeof(prefix) - 1)) { bpf_log(log, "attach_btf_id %u points to wrong type name %s\n", btf_id, tname); return -EINVAL; } tname += sizeof(prefix) - 1; /* The func_proto of "btf_trace_##tname" is generated from typedef without argument * names. Thus using bpf_raw_event_map to get argument names. */ btp = bpf_get_raw_tracepoint(tname); if (!btp) return -EINVAL; fname = kallsyms_lookup((unsigned long)btp->bpf_func, NULL, NULL, NULL, trace_symbol); bpf_put_raw_tracepoint(btp); if (fname) ret = btf_find_by_name_kind(btf, fname, BTF_KIND_FUNC); if (!fname || ret < 0) { bpf_log(log, "Cannot find btf of tracepoint template, fall back to %s%s.\n", prefix, tname); t = btf_type_by_id(btf, t->type); if (!btf_type_is_ptr(t)) /* should never happen in valid vmlinux build */ return -EINVAL; } else { t = btf_type_by_id(btf, ret); if (!btf_type_is_func(t)) /* should never happen in valid vmlinux build */ return -EINVAL; } t = btf_type_by_id(btf, t->type); if (!btf_type_is_func_proto(t)) /* should never happen in valid vmlinux build */ return -EINVAL; break; case BPF_TRACE_ITER: if (!btf_type_is_func(t)) { bpf_log(log, "attach_btf_id %u is not a function\n", btf_id); return -EINVAL; } t = btf_type_by_id(btf, t->type); if (!btf_type_is_func_proto(t)) return -EINVAL; ret = btf_distill_func_proto(log, btf, t, tname, &tgt_info->fmodel); if (ret) return ret; break; default: if (!prog_extension) return -EINVAL; fallthrough; case BPF_MODIFY_RETURN: case BPF_LSM_MAC: case BPF_LSM_CGROUP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_TRACE_FSESSION: if (prog->expected_attach_type == BPF_TRACE_FSESSION && !bpf_jit_supports_fsession()) { bpf_log(log, "JIT does not support fsession\n"); return -EOPNOTSUPP; } if (!btf_type_is_func(t)) { bpf_log(log, "attach_btf_id %u is not a function\n", btf_id); return -EINVAL; } if (prog_extension && btf_check_type_match(log, prog, btf, t)) return -EINVAL; t = btf_type_by_id(btf, t->type); if (!btf_type_is_func_proto(t)) return -EINVAL; if ((prog->aux->saved_dst_prog_type || prog->aux->saved_dst_attach_type) && (!tgt_prog || prog->aux->saved_dst_prog_type != tgt_prog->type || prog->aux->saved_dst_attach_type != tgt_prog->expected_attach_type)) return -EINVAL; if (tgt_prog && conservative) t = NULL; ret = btf_distill_func_proto(log, btf, t, tname, &tgt_info->fmodel); if (ret < 0) return ret; if (tgt_prog) { if (subprog == 0) addr = (long) tgt_prog->bpf_func; else addr = (long) tgt_prog->aux->func[subprog]->bpf_func; } else { if (btf_is_module(btf)) { mod = btf_try_get_module(btf); if (mod) addr = find_kallsyms_symbol_value(mod, tname); else addr = 0; } else { addr = kallsyms_lookup_name(tname); } if (!addr) { module_put(mod); bpf_log(log, "The address of function %s cannot be found\n", tname); return -ENOENT; } } if (prog->sleepable) { ret = -EINVAL; switch (prog->type) { case BPF_PROG_TYPE_TRACING: if (!check_attach_sleepable(btf_id, addr, tname)) ret = 0; /* fentry/fexit/fmod_ret progs can also be sleepable if they are * in the fmodret id set with the KF_SLEEPABLE flag. 
*/ else { u32 *flags = btf_kfunc_is_modify_return(btf, btf_id, prog); if (flags && (*flags & KF_SLEEPABLE)) ret = 0; } break; case BPF_PROG_TYPE_LSM: /* LSM progs check that they are attached to bpf_lsm_*() funcs. * Only some of them are sleepable. */ if (bpf_lsm_is_sleepable_hook(btf_id)) ret = 0; break; default: break; } if (ret) { module_put(mod); bpf_log(log, "%s is not sleepable\n", tname); return ret; } } else if (prog->expected_attach_type == BPF_MODIFY_RETURN) { if (tgt_prog) { module_put(mod); bpf_log(log, "can't modify return codes of BPF programs\n"); return -EINVAL; } ret = -EINVAL; if (btf_kfunc_is_modify_return(btf, btf_id, prog) || !check_attach_modify_return(addr, tname)) ret = 0; if (ret) { module_put(mod); bpf_log(log, "%s() is not modifiable\n", tname); return ret; } } break; } tgt_info->tgt_addr = addr; tgt_info->tgt_name = tname; tgt_info->tgt_type = t; tgt_info->tgt_mod = mod; return 0; } BTF_SET_START(btf_id_deny) BTF_ID_UNUSED #ifdef CONFIG_SMP BTF_ID(func, ___migrate_enable) BTF_ID(func, migrate_disable) BTF_ID(func, migrate_enable) #endif #if !defined CONFIG_PREEMPT_RCU && !defined CONFIG_TINY_RCU BTF_ID(func, rcu_read_unlock_strict) #endif #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE) BTF_ID(func, preempt_count_add) BTF_ID(func, preempt_count_sub) #endif #ifdef CONFIG_PREEMPT_RCU BTF_ID(func, __rcu_read_lock) BTF_ID(func, __rcu_read_unlock) #endif BTF_SET_END(btf_id_deny) /* fexit and fmod_ret can't be used to attach to __noreturn functions. * Currently, we must manually list all __noreturn functions here. Once a more * robust solution is implemented, this workaround can be removed. */ BTF_SET_START(noreturn_deny) #ifdef CONFIG_IA32_EMULATION BTF_ID(func, __ia32_sys_exit) BTF_ID(func, __ia32_sys_exit_group) #endif #ifdef CONFIG_KUNIT BTF_ID(func, __kunit_abort) BTF_ID(func, kunit_try_catch_throw) #endif #ifdef CONFIG_MODULES BTF_ID(func, __module_put_and_kthread_exit) #endif #ifdef CONFIG_X86_64 BTF_ID(func, __x64_sys_exit) BTF_ID(func, __x64_sys_exit_group) #endif BTF_ID(func, do_exit) BTF_ID(func, do_group_exit) BTF_ID(func, kthread_complete_and_exit) BTF_ID(func, make_task_dead) BTF_SET_END(noreturn_deny) static bool can_be_sleepable(struct bpf_prog *prog) { if (prog->type == BPF_PROG_TYPE_TRACING) { switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: case BPF_TRACE_ITER: case BPF_TRACE_FSESSION: return true; default: return false; } } return prog->type == BPF_PROG_TYPE_LSM || prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ || prog->type == BPF_PROG_TYPE_STRUCT_OPS; } static int check_attach_btf_id(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; struct bpf_prog *tgt_prog = prog->aux->dst_prog; struct bpf_attach_target_info tgt_info = {}; u32 btf_id = prog->aux->attach_btf_id; struct bpf_trampoline *tr; int ret; u64 key; if (prog->type == BPF_PROG_TYPE_SYSCALL) { if (prog->sleepable) /* attach_btf_id checked to be zero already */ return 0; verbose(env, "Syscall programs can only be sleepable\n"); return -EINVAL; } if (prog->sleepable && !can_be_sleepable(prog)) { verbose(env, "Only fentry/fexit/fsession/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n"); return -EINVAL; } if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) return check_struct_ops_btf_id(env); if (prog->type != BPF_PROG_TYPE_TRACING && prog->type != BPF_PROG_TYPE_LSM && prog->type != BPF_PROG_TYPE_EXT) return 0; ret = bpf_check_attach_target(&env->log, prog, tgt_prog, 
btf_id, &tgt_info); if (ret) return ret; if (tgt_prog && prog->type == BPF_PROG_TYPE_EXT) { /* to make freplace equivalent to their targets, they need to * inherit env->ops and expected_attach_type for the rest of the * verification */ env->ops = bpf_verifier_ops[tgt_prog->type]; prog->expected_attach_type = tgt_prog->expected_attach_type; } /* store info about the attachment target that will be used later */ prog->aux->attach_func_proto = tgt_info.tgt_type; prog->aux->attach_func_name = tgt_info.tgt_name; prog->aux->mod = tgt_info.tgt_mod; if (tgt_prog) { prog->aux->saved_dst_prog_type = tgt_prog->type; prog->aux->saved_dst_attach_type = tgt_prog->expected_attach_type; } if (prog->expected_attach_type == BPF_TRACE_RAW_TP) { prog->aux->attach_btf_trace = true; return 0; } else if (prog->expected_attach_type == BPF_TRACE_ITER) { return bpf_iter_prog_supported(prog); } if (prog->type == BPF_PROG_TYPE_LSM) { ret = bpf_lsm_verify_prog(&env->log, prog); if (ret < 0) return ret; } else if (prog->type == BPF_PROG_TYPE_TRACING && btf_id_set_contains(&btf_id_deny, btf_id)) { verbose(env, "Attaching tracing programs to function '%s' is rejected.\n", tgt_info.tgt_name); return -EINVAL; } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT || prog->expected_attach_type == BPF_TRACE_FSESSION || prog->expected_attach_type == BPF_MODIFY_RETURN) && btf_id_set_contains(&noreturn_deny, btf_id)) { verbose(env, "Attaching fexit/fsession/fmod_ret to __noreturn function '%s' is rejected.\n", tgt_info.tgt_name); return -EINVAL; } key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id); tr = bpf_trampoline_get(key, &tgt_info); if (!tr) return -ENOMEM; if (tgt_prog && tgt_prog->aux->tail_call_reachable) tr->flags = BPF_TRAMP_F_TAIL_CALL_CTX; prog->aux->dst_trampoline = tr; return 0; } struct btf *bpf_get_btf_vmlinux(void) { if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { mutex_lock(&bpf_verifier_lock); if (!btf_vmlinux) btf_vmlinux = btf_parse_vmlinux(); mutex_unlock(&bpf_verifier_lock); } return btf_vmlinux; } /* * The add_fd_from_fd_array() is executed only if fd_array_cnt is non-zero. In * this case expect that every file descriptor in the array is either a map or * a BTF. Everything else is considered to be trash. 
*/ static int add_fd_from_fd_array(struct bpf_verifier_env *env, int fd) { struct bpf_map *map; struct btf *btf; CLASS(fd, f)(fd); int err; map = __bpf_map_get(f); if (!IS_ERR(map)) { err = __add_used_map(env, map); if (err < 0) return err; return 0; } btf = __btf_get_by_fd(f); if (!IS_ERR(btf)) { btf_get(btf); return __add_used_btf(env, btf); } verbose(env, "fd %d is not pointing to valid bpf_map or btf\n", fd); return PTR_ERR(map); } static int process_fd_array(struct bpf_verifier_env *env, union bpf_attr *attr, bpfptr_t uattr) { size_t size = sizeof(int); int ret; int fd; u32 i; env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel); /* * The only difference between old (no fd_array_cnt is given) and new * APIs is that in the latter case the fd_array is expected to be * continuous and is scanned for map fds right away */ if (!attr->fd_array_cnt) return 0; /* Check for integer overflow */ if (attr->fd_array_cnt >= (U32_MAX / size)) { verbose(env, "fd_array_cnt is too big (%u)\n", attr->fd_array_cnt); return -EINVAL; } for (i = 0; i < attr->fd_array_cnt; i++) { if (copy_from_bpfptr_offset(&fd, env->fd_array, i * size, size)) return -EFAULT; ret = add_fd_from_fd_array(env, fd); if (ret) return ret; } return 0; } /* replace a generic kfunc with a specialized version if necessary */ static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc, int insn_idx) { struct bpf_prog *prog = env->prog; bool seen_direct_write; void *xdp_kfunc; bool is_rdonly; u32 func_id = desc->func_id; u16 offset = desc->offset; unsigned long addr = desc->addr; if (offset) /* return if module BTF is used */ return 0; if (bpf_dev_bound_kfunc_id(func_id)) { xdp_kfunc = bpf_dev_bound_resolve_kfunc(prog, func_id); if (xdp_kfunc) addr = (unsigned long)xdp_kfunc; /* fallback to default kfunc when not supported by netdev */ } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) { seen_direct_write = env->seen_direct_write; is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE); if (is_rdonly) addr = (unsigned long)bpf_dynptr_from_skb_rdonly; /* restore env->seen_direct_write to its original value, since * may_access_direct_pkt_data mutates it */ env->seen_direct_write = seen_direct_write; } else if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr]) { if (bpf_lsm_has_d_inode_locked(prog)) addr = (unsigned long)bpf_set_dentry_xattr_locked; } else if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr]) { if (bpf_lsm_has_d_inode_locked(prog)) addr = (unsigned long)bpf_remove_dentry_xattr_locked; } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) { if (!env->insn_aux_data[insn_idx].non_sleepable) addr = (unsigned long)bpf_dynptr_from_file_sleepable; } else if (func_id == special_kfunc_list[KF_bpf_arena_alloc_pages]) { if (env->insn_aux_data[insn_idx].non_sleepable) addr = (unsigned long)bpf_arena_alloc_pages_non_sleepable; } else if (func_id == special_kfunc_list[KF_bpf_arena_free_pages]) { if (env->insn_aux_data[insn_idx].non_sleepable) addr = (unsigned long)bpf_arena_free_pages_non_sleepable; } desc->addr = addr; return 0; } static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux, u16 struct_meta_reg, u16 node_offset_reg, struct bpf_insn *insn, struct bpf_insn *insn_buf, int *cnt) { struct btf_struct_meta *kptr_struct_meta = insn_aux->kptr_struct_meta; struct bpf_insn addr[2] = { BPF_LD_IMM64(struct_meta_reg, (long)kptr_struct_meta) }; insn_buf[0] = addr[0]; insn_buf[1] = addr[1]; insn_buf[2] = BPF_MOV64_IMM(node_offset_reg, 
insn_aux->insert_off); insn_buf[3] = *insn; *cnt = 4; } int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn *insn_buf, int insn_idx, int *cnt) { struct bpf_kfunc_desc *desc; int err; if (!insn->imm) { verbose(env, "invalid kernel function call not eliminated in verifier pass\n"); return -EINVAL; } *cnt = 0; /* insn->imm has the btf func_id. Replace it with an offset relative to * __bpf_call_base, unless the JIT needs to call functions that are * further than 32 bits away (bpf_jit_supports_far_kfunc_call()). */ desc = find_kfunc_desc(env->prog, insn->imm, insn->off); if (!desc) { verifier_bug(env, "kernel function descriptor not found for func_id %u", insn->imm); return -EFAULT; } err = specialize_kfunc(env, desc, insn_idx); if (err) return err; if (!bpf_jit_supports_far_kfunc_call()) insn->imm = BPF_CALL_IMM(desc->addr); if (is_bpf_obj_new_kfunc(desc->func_id) || is_bpf_percpu_obj_new_kfunc(desc->func_id)) { struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) }; u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size; if (is_bpf_percpu_obj_new_kfunc(desc->func_id) && kptr_struct_meta) { verifier_bug(env, "NULL kptr_struct_meta expected at insn_idx %d", insn_idx); return -EFAULT; } insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size); insn_buf[1] = addr[0]; insn_buf[2] = addr[1]; insn_buf[3] = *insn; *cnt = 4; } else if (is_bpf_obj_drop_kfunc(desc->func_id) || is_bpf_percpu_obj_drop_kfunc(desc->func_id) || is_bpf_refcount_acquire_kfunc(desc->func_id)) { struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) }; if (is_bpf_percpu_obj_drop_kfunc(desc->func_id) && kptr_struct_meta) { verifier_bug(env, "NULL kptr_struct_meta expected at insn_idx %d", insn_idx); return -EFAULT; } if (is_bpf_refcount_acquire_kfunc(desc->func_id) && !kptr_struct_meta) { verifier_bug(env, "kptr_struct_meta expected at insn_idx %d", insn_idx); return -EFAULT; } insn_buf[0] = addr[0]; insn_buf[1] = addr[1]; insn_buf[2] = *insn; *cnt = 3; } else if (is_bpf_list_push_kfunc(desc->func_id) || is_bpf_rbtree_add_kfunc(desc->func_id)) { struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; int struct_meta_reg = BPF_REG_3; int node_offset_reg = BPF_REG_4; /* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */ if (is_bpf_rbtree_add_kfunc(desc->func_id)) { struct_meta_reg = BPF_REG_4; node_offset_reg = BPF_REG_5; } if (!kptr_struct_meta) { verifier_bug(env, "kptr_struct_meta expected at insn_idx %d", insn_idx); return -EFAULT; } __fixup_collection_insert_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg, node_offset_reg, insn, insn_buf, cnt); } else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] || desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); *cnt = 1; } else if (desc->func_id == special_kfunc_list[KF_bpf_session_is_return] && env->prog->expected_attach_type == BPF_TRACE_FSESSION) { /* * inline the bpf_session_is_return() for fsession: * bool bpf_session_is_return(void *ctx) * { * return (((u64 *)ctx)[-1] >> BPF_TRAMP_IS_RETURN_SHIFT) & 1; * } */ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_IS_RETURN_SHIFT); insn_buf[2] = BPF_ALU64_IMM(BPF_AND, 
BPF_REG_0, 1); *cnt = 3; } else if (desc->func_id == special_kfunc_list[KF_bpf_session_cookie] && env->prog->expected_attach_type == BPF_TRACE_FSESSION) { /* * inline bpf_session_cookie() for fsession: * __u64 *bpf_session_cookie(void *ctx) * { * u64 off = (((u64 *)ctx)[-1] >> BPF_TRAMP_COOKIE_INDEX_SHIFT) & 0xFF; * return &((u64 *)ctx)[-off]; * } */ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_COOKIE_INDEX_SHIFT); insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF); insn_buf[3] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); insn_buf[4] = BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1); insn_buf[5] = BPF_ALU64_IMM(BPF_NEG, BPF_REG_0, 0); *cnt = 6; } if (env->insn_aux_data[insn_idx].arg_prog) { u32 regno = env->insn_aux_data[insn_idx].arg_prog; struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(regno, (long)env->prog->aux) }; int idx = *cnt; insn_buf[idx++] = ld_addrs[0]; insn_buf[idx++] = ld_addrs[1]; insn_buf[idx++] = *insn; *cnt = idx; } return 0; } int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) { u64 start_time = ktime_get_ns(); struct bpf_verifier_env *env; int i, len, ret = -EINVAL, err; u32 log_true_size; bool is_priv; BTF_TYPE_EMIT(enum bpf_features); /* no program is valid */ if (ARRAY_SIZE(bpf_verifier_ops) == 0) return -EINVAL; /* 'struct bpf_verifier_env' can be global, but since it's not small, * allocate/free it every time bpf_check() is called */ env = kvzalloc_obj(struct bpf_verifier_env, GFP_KERNEL_ACCOUNT); if (!env) return -ENOMEM; env->bt.env = env; len = (*prog)->len; env->insn_aux_data = vzalloc(array_size(sizeof(struct bpf_insn_aux_data), len)); ret = -ENOMEM; if (!env->insn_aux_data) goto err_free_env; for (i = 0; i < len; i++) env->insn_aux_data[i].orig_idx = i; env->succ = bpf_iarray_realloc(NULL, 2); if (!env->succ) goto err_free_env; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; env->allow_ptr_leaks = bpf_allow_ptr_leaks(env->prog->aux->token); env->allow_uninit_stack = bpf_allow_uninit_stack(env->prog->aux->token); env->bypass_spec_v1 = bpf_bypass_spec_v1(env->prog->aux->token); env->bypass_spec_v4 = bpf_bypass_spec_v4(env->prog->aux->token); env->bpf_capable = is_priv = bpf_token_capable(env->prog->aux->token, CAP_BPF); bpf_get_btf_vmlinux(); /* grab the mutex to protect few globals used by verifier */ if (!is_priv) mutex_lock(&bpf_verifier_lock); /* user could have requested verbose verifier output * and supplied buffer to store the verification trace */ ret = bpf_vlog_init(&env->log, attr->log_level, (char __user *) (unsigned long) attr->log_buf, attr->log_size); if (ret) goto err_unlock; ret = process_fd_array(env, attr, uattr); if (ret) goto skip_full_check; mark_verifier_state_clean(env); if (IS_ERR(btf_vmlinux)) { /* Either gcc or pahole or kernel are broken. 
*/ verbose(env, "in-kernel BTF is malformed\n"); ret = PTR_ERR(btf_vmlinux); goto skip_full_check; } env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) env->strict_alignment = false; if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS; env->explored_states = kvzalloc_objs(struct list_head, state_htab_size(env), GFP_KERNEL_ACCOUNT); ret = -ENOMEM; if (!env->explored_states) goto skip_full_check; for (i = 0; i < state_htab_size(env); i++) INIT_LIST_HEAD(&env->explored_states[i]); INIT_LIST_HEAD(&env->free_list); ret = bpf_check_btf_info_early(env, attr, uattr); if (ret < 0) goto skip_full_check; ret = add_subprog_and_kfunc(env); if (ret < 0) goto skip_full_check; ret = check_subprogs(env); if (ret < 0) goto skip_full_check; ret = bpf_check_btf_info(env, attr, uattr); if (ret < 0) goto skip_full_check; ret = check_and_resolve_insns(env); if (ret < 0) goto skip_full_check; if (bpf_prog_is_offloaded(env->prog->aux)) { ret = bpf_prog_offload_verifier_prep(env->prog); if (ret) goto skip_full_check; } ret = bpf_check_cfg(env); if (ret < 0) goto skip_full_check; ret = bpf_compute_postorder(env); if (ret < 0) goto skip_full_check; ret = bpf_stack_liveness_init(env); if (ret) goto skip_full_check; ret = check_attach_btf_id(env); if (ret) goto skip_full_check; ret = bpf_compute_const_regs(env); if (ret < 0) goto skip_full_check; ret = bpf_prune_dead_branches(env); if (ret < 0) goto skip_full_check; ret = sort_subprogs_topo(env); if (ret < 0) goto skip_full_check; ret = bpf_compute_scc(env); if (ret < 0) goto skip_full_check; ret = bpf_compute_live_registers(env); if (ret < 0) goto skip_full_check; ret = mark_fastcall_patterns(env); if (ret < 0) goto skip_full_check; ret = do_check_main(env); ret = ret ?: do_check_subprogs(env); if (ret == 0 && bpf_prog_is_offloaded(env->prog->aux)) ret = bpf_prog_offload_finalize(env); skip_full_check: kvfree(env->explored_states); /* might decrease stack depth, keep it before passes that * allocate additional slots. */ if (ret == 0) ret = bpf_remove_fastcall_spills_fills(env); if (ret == 0) ret = check_max_stack_depth(env); /* instruction rewrites happen after this point */ if (ret == 0) ret = bpf_optimize_bpf_loop(env); if (is_priv) { if (ret == 0) bpf_opt_hard_wire_dead_code_branches(env); if (ret == 0) ret = bpf_opt_remove_dead_code(env); if (ret == 0) ret = bpf_opt_remove_nops(env); } else { if (ret == 0) sanitize_dead_code(env); } if (ret == 0) /* program is valid, convert *(u32*)(ctx + off) accesses */ ret = bpf_convert_ctx_accesses(env); if (ret == 0) ret = bpf_do_misc_fixups(env); /* do 32-bit optimization after insn patching has done so those patched * insns could be handled correctly. */ if (ret == 0 && !bpf_prog_is_offloaded(env->prog->aux)) { ret = bpf_opt_subreg_zext_lo32_rnd_hi32(env, attr); env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? 
!ret : false; } if (ret == 0) ret = bpf_fixup_call_args(env); env->verification_time = ktime_get_ns() - start_time; print_verification_stats(env); env->prog->aux->verified_insns = env->insn_processed; /* preserve original error even if log finalization is successful */ err = bpf_vlog_finalize(&env->log, &log_true_size); if (err) ret = err; if (uattr_size >= offsetofend(union bpf_attr, log_true_size) && copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size), &log_true_size, sizeof(log_true_size))) { ret = -EFAULT; goto err_release_maps; } if (ret) goto err_release_maps; if (env->used_map_cnt) { /* if program passed verifier, update used_maps in bpf_prog_info */ env->prog->aux->used_maps = kmalloc_objs(env->used_maps[0], env->used_map_cnt, GFP_KERNEL_ACCOUNT); if (!env->prog->aux->used_maps) { ret = -ENOMEM; goto err_release_maps; } memcpy(env->prog->aux->used_maps, env->used_maps, sizeof(env->used_maps[0]) * env->used_map_cnt); env->prog->aux->used_map_cnt = env->used_map_cnt; } if (env->used_btf_cnt) { /* if program passed verifier, update used_btfs in bpf_prog_aux */ env->prog->aux->used_btfs = kmalloc_objs(env->used_btfs[0], env->used_btf_cnt, GFP_KERNEL_ACCOUNT); if (!env->prog->aux->used_btfs) { ret = -ENOMEM; goto err_release_maps; } memcpy(env->prog->aux->used_btfs, env->used_btfs, sizeof(env->used_btfs[0]) * env->used_btf_cnt); env->prog->aux->used_btf_cnt = env->used_btf_cnt; } if (env->used_map_cnt || env->used_btf_cnt) { /* program is valid. Convert pseudo bpf_ld_imm64 into generic * bpf_ld_imm64 instructions */ convert_pseudo_ld_imm64(env); } adjust_btf_func(env); /* extension progs temporarily inherit the attach_type of their targets for verification purposes, so set it back to zero before returning */ if (env->prog->type == BPF_PROG_TYPE_EXT) env->prog->expected_attach_type = 0; env->prog = __bpf_prog_select_runtime(env, env->prog, &ret); err_release_maps: if (ret) release_insn_arrays(env); if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_used_maps() will release them. */ release_maps(env); if (!env->prog->aux->used_btfs) release_btfs(env); *prog = env->prog; module_put(env->attach_btf_mod); err_unlock: if (!is_priv) mutex_unlock(&bpf_verifier_lock); bpf_clear_insn_aux_data(env, 0, env->prog->len); vfree(env->insn_aux_data); err_free_env: bpf_stack_liveness_free(env); kvfree(env->cfg.insn_postorder); kvfree(env->scc_info); kvfree(env->succ); kvfree(env->gotox_tmp_buf); kvfree(env); return ret; } |
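/*
 * Illustrative sketch (not part of verifier.c above): a minimal user-space
 * program load that reaches bpf_check() via the bpf(2) syscall.  The verifier
 * log produced through the env->log machinery comes back in attr.log_buf.
 * Assumptions not taken from the source: a 64-bit host, sufficient privileges
 * (CAP_BPF/CAP_SYS_ADMIN, or unprivileged socket filters enabled), and the
 * arbitrary choice of BPF_PROG_TYPE_SOCKET_FILTER and buffer size.
 */
#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

static char log_buf[1 << 16];

int main(void)
{
	/* "r0 = 0; exit" -- the smallest program the verifier accepts */
	struct bpf_insn insns[] = {
		{ .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0, .imm = 0 },
		{ .code = BPF_JMP | BPF_EXIT },
	};
	union bpf_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
	attr.insns     = (__u64)(unsigned long)insns;
	attr.insn_cnt  = sizeof(insns) / sizeof(insns[0]);
	attr.license   = (__u64)(unsigned long)"GPL";
	attr.log_buf   = (__u64)(unsigned long)log_buf;
	attr.log_size  = sizeof(log_buf);
	attr.log_level = 1;	/* request the verifier trace */

	fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
	if (fd < 0) {
		fprintf(stderr, "verifier rejected program:\n%s\n", log_buf);
		return 1;
	}
	printf("program loaded, fd=%d\n", fd);
	return 0;
}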
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 | /* * linux/fs/hfs/hfs_fs.h * * Copyright (C) 1995-1997 Paul H. Hargrove * (C) 2003 Ardis Technologies <roman@ardistech.com> * This file may be distributed under the terms of the GNU General Public License. */ #ifndef _LINUX_HFS_FS_H #define _LINUX_HFS_FS_H #include <linux/slab.h> #include <linux/types.h> #include <linux/mutex.h> #include <linux/buffer_head.h> #include <linux/fs.h> #include <linux/workqueue.h> #include <asm/byteorder.h> #include <linux/uaccess.h> #include "hfs.h" /* * struct hfs_inode_info * * The HFS-specific part of a Linux (struct inode) */ struct hfs_inode_info { atomic_t opencnt; unsigned int flags; /* to deal with localtime ugliness */ int tz_secondswest; struct hfs_cat_key cat_key; struct list_head open_dir_list; spinlock_t open_dir_lock; struct inode *rsrc_inode; struct mutex extents_lock; u16 alloc_blocks, clump_blocks; sector_t fs_blocks; /* Allocation extents from catlog record or volume header */ hfs_extent_rec first_extents; u16 first_blocks; hfs_extent_rec cached_extents; u16 cached_start, cached_blocks; loff_t phys_size; struct inode vfs_inode; }; #define HFS_FLG_RSRC 0x0001 #define HFS_FLG_EXT_DIRTY 0x0002 #define HFS_FLG_EXT_NEW 0x0004 #define HFS_IS_RSRC(inode) (HFS_I(inode)->flags & HFS_FLG_RSRC) /* * struct hfs_sb_info * * The HFS-specific part of a Linux (struct super_block) */ struct hfs_sb_info { struct buffer_head *mdb_bh; /* The hfs_buffer holding the real superblock (aka VIB or MDB) */ struct hfs_mdb *mdb; struct buffer_head *alt_mdb_bh; /* The hfs_buffer holding the alternate superblock */ struct hfs_mdb *alt_mdb; __be32 *bitmap; /* The page holding the allocation bitmap */ struct hfs_btree *ext_tree; /* Information about the extents b-tree */ struct hfs_btree *cat_tree; /* Information about the catalog b-tree */ atomic64_t file_count; /* The number of regular files in the filesystem */ atomic64_t folder_count; /* The number of directories in the filesystem */ atomic64_t next_id; /* The next available file id number */ u32 clumpablks; /* The number of allocation blocks to try to add when extending a file */ u32 fs_start; /* The first 512-byte block represented in the bitmap */ u32 part_start; u16 root_files; /* The number of regular (non-directory) files in the root directory */ u16 root_dirs; /* The number of directories in the root directory */ u16 fs_ablocks; /* The number of allocation blocks in the filesystem */ u16 free_ablocks; /* the number of unused allocation blocks in the filesystem */ u32 
alloc_blksz; /* The size of an "allocation block" */ int s_quiet; /* Silent failure when changing owner or mode? */ __be32 s_type; /* Type for new files */ __be32 s_creator; /* Creator for new files */ umode_t s_file_umask; /* The umask applied to the permissions on all files */ umode_t s_dir_umask; /* The umask applied to the permissions on all dirs */ kuid_t s_uid; /* The uid of all files */ kgid_t s_gid; /* The gid of all files */ int session, part; struct nls_table *nls_io, *nls_disk; struct mutex bitmap_lock; unsigned long flags; u16 blockoffset; int fs_div; struct super_block *sb; int work_queued; /* non-zero delayed work is queued */ struct delayed_work mdb_work; /* MDB flush delayed work */ spinlock_t work_lock; /* protects mdb_work and work_queued */ }; #define HFS_FLG_BITMAP_DIRTY 0 #define HFS_FLG_MDB_DIRTY 1 #define HFS_FLG_ALT_MDB_DIRTY 2 /* bitmap.c */ extern u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits); extern int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count); /* catalog.c */ extern int hfs_cat_keycmp(const btree_key *key1, const btree_key *key2); struct hfs_find_data; extern int hfs_cat_find_brec(struct super_block *sb, u32 cnid, struct hfs_find_data *fd); extern int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct inode *inode); extern int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str); extern int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name, struct inode *dst_dir, const struct qstr *dst_name); extern void hfs_cat_build_key(struct super_block *sb, btree_key *key, u32 parent, const struct qstr *name); /* dir.c */ extern const struct file_operations hfs_dir_operations; extern const struct inode_operations hfs_dir_inode_operations; /* extent.c */ extern int hfs_ext_keycmp(const btree_key *key1, const btree_key *key2); extern u16 hfs_ext_find_block(struct hfs_extent *ext, u16 off); extern int hfs_free_fork(struct super_block *sb, struct hfs_cat_file *file, int type); extern int hfs_ext_write_extent(struct inode *inode); extern int hfs_extend_file(struct inode *inode); extern void hfs_file_truncate(struct inode *inode); extern int hfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create); /* inode.c */ extern const struct address_space_operations hfs_aops; extern const struct address_space_operations hfs_btree_aops; int hfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned int len, struct folio **foliop, void **fsdata); extern struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t mode); extern void hfs_inode_write_fork(struct inode *inode, struct hfs_extent *ext, __be32 *log_size, __be32 *phys_size); extern int hfs_write_inode(struct inode *inode, struct writeback_control *wbc); extern int hfs_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, __be32 __log_size, __be32 phys_size, u32 clump_size); extern struct inode *hfs_iget(struct super_block *sb, struct hfs_cat_key *key, hfs_cat_rec *rec); extern void hfs_evict_inode(struct inode *inode); extern void hfs_delete_inode(struct inode *inode); /* attr.c */ extern const struct xattr_handler * const hfs_xattr_handlers[]; /* mdb.c */ extern bool is_hfs_cnid_counts_valid(struct super_block *sb); extern int hfs_mdb_get(struct super_block *sb); extern void hfs_mdb_commit(struct super_block *sb); extern void 
hfs_mdb_close(struct super_block *sb); extern void hfs_mdb_put(struct super_block *sb); /* part_tbl.c */ extern int hfs_part_find(struct super_block *sb, sector_t *part_start, sector_t *part_size); /* string.c */ extern const struct dentry_operations hfs_dentry_operations; extern int hfs_hash_dentry(const struct dentry *dentry, struct qstr *this); extern int hfs_strcmp(const unsigned char *s1, unsigned int len1, const unsigned char *s2, unsigned int len2); extern int hfs_compare_dentry(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); /* trans.c */ extern void hfs_asc2mac(struct super_block *sb, struct hfs_name *out, const struct qstr *in); extern int hfs_mac2asc(struct super_block *sb, char *out, const struct hfs_name *in); /* super.c */ extern void hfs_mark_mdb_dirty(struct super_block *sb); /* * There are two time systems. Both are based on seconds since * a particular time/date. * Unix: signed little-endian since 00:00 GMT, Jan. 1, 1970 * mac: unsigned big-endian since 00:00 GMT, Jan. 1, 1904 * * HFS implementations are highly inconsistent, this one matches the * traditional behavior of 64-bit Linux, giving the most useful * time range between 1970 and 2106, by treating any on-disk timestamp * under HFS_UTC_OFFSET (Jan 1 1970) as a time between 2040 and 2106. */ #define HFS_UTC_OFFSET 2082844800U static inline time64_t __hfs_m_to_utime(__be32 mt) { time64_t ut = (u32)(be32_to_cpu(mt) - HFS_UTC_OFFSET); return ut + sys_tz.tz_minuteswest * 60; } static inline __be32 __hfs_u_to_mtime(time64_t ut) { ut -= sys_tz.tz_minuteswest * 60; return cpu_to_be32(lower_32_bits(ut) + HFS_UTC_OFFSET); } #define HFS_I(inode) (container_of(inode, struct hfs_inode_info, vfs_inode)) #define HFS_SB(sb) ((struct hfs_sb_info *)(sb)->s_fs_info) #define hfs_m_to_utime(time) (struct timespec64){ .tv_sec = __hfs_m_to_utime(time) } #define hfs_u_to_mtime(time) __hfs_u_to_mtime((time).tv_sec) #define hfs_mtime() __hfs_u_to_mtime(ktime_get_real_seconds()) static inline const char *hfs_mdb_name(struct super_block *sb) { return sb->s_id; } static inline void hfs_bitmap_dirty(struct super_block *sb) { set_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags); hfs_mark_mdb_dirty(sb); } #define sb_bread512(sb, sec, data) ({ \ struct buffer_head *__bh; \ sector_t __block; \ loff_t __start; \ int __offset; \ \ __start = (loff_t)(sec) << HFS_SECTOR_SIZE_BITS;\ __block = __start >> (sb)->s_blocksize_bits; \ __offset = __start & ((sb)->s_blocksize - 1); \ __bh = sb_bread((sb), __block); \ if (likely(__bh != NULL)) \ data = (void *)(__bh->b_data + __offset);\ else \ data = NULL; \ __bh; \ }) #endif |
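/*
 * Illustrative sketch (not part of hfs_fs.h above): the HFS_UTC_OFFSET
 * arithmetic in isolation.  HFS stores an unsigned 32-bit count of seconds
 * since 1904-01-01; 2082844800 is exactly the 24107 days (66 years including
 * 17 leap days) between the Mac and Unix epochs.  Doing the subtraction in
 * 32 bits, as __hfs_m_to_utime() does, makes pre-1970 Mac dates wrap into the
 * 2040-2106 range described by the comment in the header.  This is a
 * standalone user-space illustration and deliberately ignores the sys_tz
 * adjustment.
 */
#include <stdint.h>
#include <stdio.h>

#define HFS_UTC_OFFSET 2082844800U	/* seconds from 1904-01-01 to 1970-01-01 */

/* Mirrors __hfs_m_to_utime() without the timezone correction. */
static int64_t mac_to_unix(uint32_t mac_secs)
{
	/* 32-bit subtraction on purpose: underflow wraps pre-1970 dates forward */
	return (int64_t)(uint32_t)(mac_secs - HFS_UTC_OFFSET);
}

int main(void)
{
	printf("%lld\n", (long long)mac_to_unix(HFS_UTC_OFFSET));     /* 1970-01-01 -> 0 */
	printf("%lld\n", (long long)mac_to_unix(HFS_UTC_OFFSET - 1)); /* one second earlier -> 4294967295 (~2106) */
	printf("%lld\n", (long long)mac_to_unix(0));                  /* 1904-01-01 -> 2212122496 (~2040) */
	return 0;
}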
| 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 | /* * linux/fs/hfs/super.c * * Copyright (C) 1995-1997 Paul H. Hargrove * (C) 2003 Ardis Technologies <roman@ardistech.com> * This file may be distributed under the terms of the GNU General Public License. * * This file contains hfs_read_super(), some of the super_ops and * init_hfs_fs() and exit_hfs_fs(). The remaining super_ops are in * inode.c since they deal with inodes. * * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds */ #include <linux/module.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/mount.h> #include <linux/init.h> #include <linux/nls.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/vfs.h> #include "hfs_fs.h" #include "btree.h" static struct kmem_cache *hfs_inode_cachep; MODULE_DESCRIPTION("Apple Macintosh file system support"); MODULE_LICENSE("GPL"); static int hfs_sync_fs(struct super_block *sb, int wait) { is_hfs_cnid_counts_valid(sb); hfs_mdb_commit(sb); return 0; } /* * hfs_put_super() * * This is the put_super() entry in the super_operations structure for * HFS filesystems. The purpose is to release the resources * associated with the superblock sb. 
*/ static void hfs_put_super(struct super_block *sb) { cancel_delayed_work_sync(&HFS_SB(sb)->mdb_work); hfs_mdb_close(sb); /* release the MDB's resources */ hfs_mdb_put(sb); } static void flush_mdb(struct work_struct *work) { struct hfs_sb_info *sbi; struct super_block *sb; sbi = container_of(work, struct hfs_sb_info, mdb_work.work); sb = sbi->sb; spin_lock(&sbi->work_lock); sbi->work_queued = 0; spin_unlock(&sbi->work_lock); is_hfs_cnid_counts_valid(sb); hfs_mdb_commit(sb); } void hfs_mark_mdb_dirty(struct super_block *sb) { struct hfs_sb_info *sbi = HFS_SB(sb); unsigned long delay; if (sb_rdonly(sb)) return; spin_lock(&sbi->work_lock); if (!sbi->work_queued) { delay = msecs_to_jiffies(dirty_writeback_interval * 10); queue_delayed_work(system_long_wq, &sbi->mdb_work, delay); sbi->work_queued = 1; } spin_unlock(&sbi->work_lock); } /* * hfs_statfs() * * This is the statfs() entry in the super_operations structure for * HFS filesystems. The purpose is to return various data about the * filesystem. * * changed f_files/f_ffree to reflect the fs_ablock/free_ablocks. */ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = HFS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = (u32)HFS_SB(sb)->fs_ablocks * HFS_SB(sb)->fs_div; buf->f_bfree = (u32)HFS_SB(sb)->free_ablocks * HFS_SB(sb)->fs_div; buf->f_bavail = buf->f_bfree; buf->f_files = HFS_SB(sb)->fs_ablocks; buf->f_ffree = HFS_SB(sb)->free_ablocks; buf->f_fsid = u64_to_fsid(id); buf->f_namelen = HFS_NAMELEN; return 0; } static int hfs_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; sync_filesystem(sb); fc->sb_flags |= SB_NODIRATIME; if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb)) return 0; if (!(fc->sb_flags & SB_RDONLY)) { if (!(HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. leaving read-only.\n"); sb->s_flags |= SB_RDONLY; fc->sb_flags |= SB_RDONLY; } else if (HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_SLOCK)) { pr_warn("filesystem is marked locked, leaving read-only.\n"); sb->s_flags |= SB_RDONLY; fc->sb_flags |= SB_RDONLY; } } return 0; } static int hfs_show_options(struct seq_file *seq, struct dentry *root) { struct hfs_sb_info *sbi = HFS_SB(root->d_sb); if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f)) seq_show_option_n(seq, "creator", (char *)&sbi->s_creator, 4); if (sbi->s_type != cpu_to_be32(0x3f3f3f3f)) seq_show_option_n(seq, "type", (char *)&sbi->s_type, 4); seq_printf(seq, ",uid=%u,gid=%u", from_kuid_munged(&init_user_ns, sbi->s_uid), from_kgid_munged(&init_user_ns, sbi->s_gid)); if (sbi->s_file_umask != 0133) seq_printf(seq, ",file_umask=%o", sbi->s_file_umask); if (sbi->s_dir_umask != 0022) seq_printf(seq, ",dir_umask=%o", sbi->s_dir_umask); if (sbi->part >= 0) seq_printf(seq, ",part=%u", sbi->part); if (sbi->session >= 0) seq_printf(seq, ",session=%u", sbi->session); if (sbi->nls_disk) seq_printf(seq, ",codepage=%s", sbi->nls_disk->charset); if (sbi->nls_io) seq_printf(seq, ",iocharset=%s", sbi->nls_io->charset); if (sbi->s_quiet) seq_printf(seq, ",quiet"); return 0; } static struct inode *hfs_alloc_inode(struct super_block *sb) { struct hfs_inode_info *i; i = alloc_inode_sb(sb, hfs_inode_cachep, GFP_KERNEL); return i ? 
&i->vfs_inode : NULL; } static void hfs_free_inode(struct inode *inode) { kmem_cache_free(hfs_inode_cachep, HFS_I(inode)); } static const struct super_operations hfs_super_operations = { .alloc_inode = hfs_alloc_inode, .free_inode = hfs_free_inode, .write_inode = hfs_write_inode, .evict_inode = hfs_evict_inode, .put_super = hfs_put_super, .sync_fs = hfs_sync_fs, .statfs = hfs_statfs, .show_options = hfs_show_options, }; enum { opt_uid, opt_gid, opt_umask, opt_file_umask, opt_dir_umask, opt_part, opt_session, opt_type, opt_creator, opt_quiet, opt_codepage, opt_iocharset, }; static const struct fs_parameter_spec hfs_param_spec[] = { fsparam_u32 ("uid", opt_uid), fsparam_u32 ("gid", opt_gid), fsparam_u32oct ("umask", opt_umask), fsparam_u32oct ("file_umask", opt_file_umask), fsparam_u32oct ("dir_umask", opt_dir_umask), fsparam_u32 ("part", opt_part), fsparam_u32 ("session", opt_session), fsparam_string ("type", opt_type), fsparam_string ("creator", opt_creator), fsparam_flag ("quiet", opt_quiet), fsparam_string ("codepage", opt_codepage), fsparam_string ("iocharset", opt_iocharset), {} }; /* * hfs_parse_param() * * This function is called by the vfs to parse the mount options. */ static int hfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct hfs_sb_info *hsb = fc->s_fs_info; struct fs_parse_result result; int opt; /* hfs does not honor any fs-specific options on remount */ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) return 0; opt = fs_parse(fc, hfs_param_spec, param, &result); if (opt < 0) return opt; switch (opt) { case opt_uid: hsb->s_uid = result.uid; break; case opt_gid: hsb->s_gid = result.gid; break; case opt_umask: hsb->s_file_umask = (umode_t)result.uint_32; hsb->s_dir_umask = (umode_t)result.uint_32; break; case opt_file_umask: hsb->s_file_umask = (umode_t)result.uint_32; break; case opt_dir_umask: hsb->s_dir_umask = (umode_t)result.uint_32; break; case opt_part: hsb->part = result.uint_32; break; case opt_session: hsb->session = result.uint_32; break; case opt_type: if (strlen(param->string) != 4) { pr_err("type requires a 4 character value\n"); return -EINVAL; } memcpy(&hsb->s_type, param->string, 4); break; case opt_creator: if (strlen(param->string) != 4) { pr_err("creator requires a 4 character value\n"); return -EINVAL; } memcpy(&hsb->s_creator, param->string, 4); break; case opt_quiet: hsb->s_quiet = 1; break; case opt_codepage: if (hsb->nls_disk) { pr_err("unable to change codepage\n"); return -EINVAL; } hsb->nls_disk = load_nls(param->string); if (!hsb->nls_disk) { pr_err("unable to load codepage \"%s\"\n", param->string); return -EINVAL; } break; case opt_iocharset: if (hsb->nls_io) { pr_err("unable to change iocharset\n"); return -EINVAL; } hsb->nls_io = load_nls(param->string); if (!hsb->nls_io) { pr_err("unable to load iocharset \"%s\"\n", param->string); return -EINVAL; } break; default: return -EINVAL; } return 0; } /* * hfs_read_super() * * This is the function that is responsible for mounting an HFS * filesystem. It performs all the tasks necessary to get enough data * from the disk to read the root inode. This includes parsing the * mount options, dealing with Macintosh partitions, reading the * superblock and the allocation bitmap blocks, calling * hfs_btree_init() to get the necessary data about the extents and * catalog B-trees and, finally, reading the root inode into memory. 
*/ static int hfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct hfs_sb_info *sbi = HFS_SB(sb); struct hfs_find_data fd; hfs_cat_rec rec; struct inode *root_inode; int silent = fc->sb_flags & SB_SILENT; int res; atomic64_set(&sbi->file_count, 0); atomic64_set(&sbi->folder_count, 0); atomic64_set(&sbi->next_id, 0); /* load_nls_default does not fail */ if (sbi->nls_disk && !sbi->nls_io) sbi->nls_io = load_nls_default(); sbi->s_dir_umask &= 0777; sbi->s_file_umask &= 0577; spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->mdb_work, flush_mdb); sbi->sb = sb; sb->s_op = &hfs_super_operations; sb->s_xattr = hfs_xattr_handlers; sb->s_flags |= SB_NODIRATIME; mutex_init(&sbi->bitmap_lock); res = hfs_mdb_get(sb); if (res) { if (!silent) pr_warn("can't find a HFS filesystem on dev %s\n", hfs_mdb_name(sb)); res = -EINVAL; goto bail; } /* try to get the root inode */ res = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); if (res) goto bail_no_root; res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd); if (!res) { if (fd.entrylength != sizeof(rec.dir)) { res = -EIO; goto bail_hfs_find; } hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength); if (rec.type != HFS_CDR_DIR) res = -EIO; } if (res) goto bail_hfs_find; res = -EINVAL; root_inode = hfs_iget(sb, &fd.search_key->cat, &rec); hfs_find_exit(&fd); if (!root_inode) goto bail_no_root; set_default_d_op(sb, &hfs_dentry_operations); res = -ENOMEM; sb->s_root = d_make_root(root_inode); if (!sb->s_root) goto bail_no_root; /* everything's okay */ return 0; bail_hfs_find: hfs_find_exit(&fd); bail_no_root: pr_err("get root inode failed\n"); bail: hfs_mdb_put(sb); return res; } static int hfs_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, hfs_fill_super); } static void hfs_free_fc(struct fs_context *fc) { kfree(fc->s_fs_info); } static const struct fs_context_operations hfs_context_ops = { .parse_param = hfs_parse_param, .get_tree = hfs_get_tree, .reconfigure = hfs_reconfigure, .free = hfs_free_fc, }; static int hfs_init_fs_context(struct fs_context *fc) { struct hfs_sb_info *hsb; hsb = kzalloc_obj(struct hfs_sb_info); if (!hsb) return -ENOMEM; fc->s_fs_info = hsb; fc->ops = &hfs_context_ops; if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) { /* initialize options with defaults */ hsb->s_uid = current_uid(); hsb->s_gid = current_gid(); hsb->s_file_umask = 0133; hsb->s_dir_umask = 0022; hsb->s_type = cpu_to_be32(0x3f3f3f3f); /* == '????' */ hsb->s_creator = cpu_to_be32(0x3f3f3f3f); /* == '????' */ hsb->s_quiet = 0; hsb->part = -1; hsb->session = -1; } return 0; } static void hfs_kill_super(struct super_block *sb) { struct hfs_sb_info *hsb = HFS_SB(sb); kill_block_super(sb); kfree(hsb); } static struct file_system_type hfs_fs_type = { .owner = THIS_MODULE, .name = "hfs", .kill_sb = hfs_kill_super, .fs_flags = FS_REQUIRES_DEV, .init_fs_context = hfs_init_fs_context, }; MODULE_ALIAS_FS("hfs"); static void hfs_init_once(void *p) { struct hfs_inode_info *i = p; inode_init_once(&i->vfs_inode); } static int __init init_hfs_fs(void) { int err; hfs_inode_cachep = kmem_cache_create("hfs_inode_cache", sizeof(struct hfs_inode_info), 0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, hfs_init_once); if (!hfs_inode_cachep) return -ENOMEM; err = register_filesystem(&hfs_fs_type); if (err) kmem_cache_destroy(hfs_inode_cachep); return err; } static void __exit exit_hfs_fs(void) { unregister_filesystem(&hfs_fs_type); /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. 
*/ rcu_barrier(); kmem_cache_destroy(hfs_inode_cachep); } module_init(init_hfs_fs) module_exit(exit_hfs_fs)
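/*
 * A minimal sketch, not part of the original fs/hfs code, showing the same
 * new-mount-API pattern hfs_parse_param() uses above: a fs_parameter_spec
 * table is matched by fs_parse(), which fills a typed fs_parse_result that
 * the filesystem copies into its private fs_context info.  All "examplefs"
 * names here are hypothetical; with this table, mounting with
 * "-o bsize=4096,rohint" would deliver two calls to ->parse_param().
 */
#include <linux/fs_context.h>
#include <linux/fs_parser.h>

struct examplefs_info {
	u32	blocksize;
	bool	readonly_hint;
};

enum { exopt_bsize, exopt_rohint };

static const struct fs_parameter_spec examplefs_param_spec[] = {
	fsparam_u32	("bsize",	exopt_bsize),
	fsparam_flag	("rohint",	exopt_rohint),
	{}
};

/* Called by the VFS once for each "name" or "name=value" mount option. */
static int examplefs_parse_param(struct fs_context *fc,
				 struct fs_parameter *param)
{
	struct examplefs_info *info = fc->s_fs_info;
	struct fs_parse_result result;
	int opt;

	opt = fs_parse(fc, examplefs_param_spec, param, &result);
	if (opt < 0)
		return opt;	/* unknown option or malformed value */

	switch (opt) {
	case exopt_bsize:
		info->blocksize = result.uint_32;
		break;
	case exopt_rohint:
		info->readonly_hint = true;
		break;
	}
	return 0;
}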
// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2001 Momchil Velikov * Portions Copyright (C) 2001 Christoph Hellwig * Copyright (C) 2005 SGI, Christoph Lameter * Copyright (C) 2006 Nick Piggin * Copyright (C) 2012 Konstantin Khlebnikov * Copyright (C) 2016 Intel, Matthew Wilcox * Copyright (C) 2016 Intel, Ross Zwisler */ #include <linux/bitmap.h> #include <linux/bitops.h> #include <linux/bug.h> #include <linux/cpu.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/idr.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/kmemleak.h> #include <linux/percpu.h> #include <linux/preempt.h> /* in_interrupt() */ #include <linux/radix-tree.h> #include <linux/rcupdate.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/xarray.h> #include "radix-tree.h" /* * Radix tree node cache. */ struct kmem_cache *radix_tree_node_cachep; /* * The radix tree is variable-height, so an insert operation not only has * to build the branch to its corresponding item, it also has to build the * branch to existing items if the size has to be increased (by * radix_tree_extend). * * The worst case is a zero height tree with just a single item at index 0, * and then inserting an item at index ULONG_MAX. This requires 2 new branches * of RADIX_TREE_MAX_PATH size to be created, with only the root node shared. * Hence: */ #define RADIX_TREE_PRELOAD_SIZE (RADIX_TREE_MAX_PATH * 2 - 1) /* * The IDR does not have to be as high as the radix tree since it uses * signed integers, not unsigned longs. */ #define IDR_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(int) - 1) #define IDR_MAX_PATH (DIV_ROUND_UP(IDR_INDEX_BITS, \ RADIX_TREE_MAP_SHIFT)) #define IDR_PRELOAD_SIZE (IDR_MAX_PATH * 2 - 1) /* * Per-cpu pool of preloaded nodes */ DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { .lock = INIT_LOCAL_LOCK(lock), }; EXPORT_PER_CPU_SYMBOL_GPL(radix_tree_preloads); static inline struct radix_tree_node *entry_to_node(void *ptr) { return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); } static inline void *node_to_entry(void *ptr) { return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); } #define RADIX_TREE_RETRY XA_RETRY_ENTRY static inline unsigned long get_slot_offset(const struct radix_tree_node *parent, void __rcu **slot) { return parent ?
slot - parent->slots : 0; } static unsigned int radix_tree_descend(const struct radix_tree_node *parent, struct radix_tree_node **nodep, unsigned long index) { unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK; void __rcu **entry = rcu_dereference_raw(parent->slots[offset]); *nodep = (void *)entry; return offset; } static inline gfp_t root_gfp_mask(const struct radix_tree_root *root) { return root->xa_flags & (__GFP_BITS_MASK & ~GFP_ZONEMASK); } static inline void tag_set(struct radix_tree_node *node, unsigned int tag, int offset) { __set_bit(offset, node->tags[tag]); } static inline void tag_clear(struct radix_tree_node *node, unsigned int tag, int offset) { __clear_bit(offset, node->tags[tag]); } static inline int tag_get(const struct radix_tree_node *node, unsigned int tag, int offset) { return test_bit(offset, node->tags[tag]); } static inline void root_tag_set(struct radix_tree_root *root, unsigned tag) { root->xa_flags |= (__force gfp_t)(1 << (tag + ROOT_TAG_SHIFT)); } static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag) { root->xa_flags &= (__force gfp_t)~(1 << (tag + ROOT_TAG_SHIFT)); } static inline void root_tag_clear_all(struct radix_tree_root *root) { root->xa_flags &= (__force gfp_t)((1 << ROOT_TAG_SHIFT) - 1); } static inline int root_tag_get(const struct radix_tree_root *root, unsigned tag) { return (__force int)root->xa_flags & (1 << (tag + ROOT_TAG_SHIFT)); } static inline unsigned root_tags_get(const struct radix_tree_root *root) { return (__force unsigned)root->xa_flags >> ROOT_TAG_SHIFT; } static inline bool is_idr(const struct radix_tree_root *root) { return !!(root->xa_flags & ROOT_IS_IDR); } /* * Returns 1 if any slot in the node has this tag set. * Otherwise returns 0. */ static inline int any_tag_set(const struct radix_tree_node *node, unsigned int tag) { unsigned idx; for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { if (node->tags[tag][idx]) return 1; } return 0; } static inline void all_tag_set(struct radix_tree_node *node, unsigned int tag) { bitmap_fill(node->tags[tag], RADIX_TREE_MAP_SIZE); } /** * radix_tree_find_next_bit - find the next set bit in a memory region * * @node: where to begin the search * @tag: the tag index * @offset: the bitnumber to start searching at * * Unrollable variant of find_next_bit() for constant size arrays. * Tail bits starting from size to roundup(size, BITS_PER_LONG) must be zero. * Returns next bit offset, or size if nothing found. 
*/ static __always_inline unsigned long radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag, unsigned long offset) { const unsigned long *addr = node->tags[tag]; if (offset < RADIX_TREE_MAP_SIZE) { unsigned long tmp; addr += offset / BITS_PER_LONG; tmp = *addr >> (offset % BITS_PER_LONG); if (tmp) return __ffs(tmp) + offset; offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1); while (offset < RADIX_TREE_MAP_SIZE) { tmp = *++addr; if (tmp) return __ffs(tmp) + offset; offset += BITS_PER_LONG; } } return RADIX_TREE_MAP_SIZE; } static unsigned int iter_offset(const struct radix_tree_iter *iter) { return iter->index & RADIX_TREE_MAP_MASK; } /* * The maximum index which can be stored in a radix tree */ static inline unsigned long shift_maxindex(unsigned int shift) { return (RADIX_TREE_MAP_SIZE << shift) - 1; } static inline unsigned long node_maxindex(const struct radix_tree_node *node) { return shift_maxindex(node->shift); } static unsigned long next_index(unsigned long index, const struct radix_tree_node *node, unsigned long offset) { return (index & ~node_maxindex(node)) + (offset << node->shift); } /* * This assumes that the caller has performed appropriate preallocation, and * that the caller has pinned this thread of control to the current CPU. */ static struct radix_tree_node * radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent, struct radix_tree_root *root, unsigned int shift, unsigned int offset, unsigned int count, unsigned int nr_values) { struct radix_tree_node *ret = NULL; /* * Preload code isn't irq safe and it doesn't make sense to use * preloading during an interrupt anyway as all the allocations have * to be atomic. So just do normal allocation when in interrupt. */ if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) { struct radix_tree_preload *rtp; /* * Even if the caller has preloaded, try to allocate from the * cache first for the new node to get accounted to the memory * cgroup. */ ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask | __GFP_NOWARN); if (ret) goto out; /* * Provided the caller has preloaded here, we will always * succeed in getting a node here (and never reach * kmem_cache_alloc) */ rtp = this_cpu_ptr(&radix_tree_preloads); if (rtp->nr) { ret = rtp->nodes; rtp->nodes = ret->parent; rtp->nr--; } /* * Update the allocation stack trace as this is more useful * for debugging. */ kmemleak_update_trace(ret); goto out; } ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); out: BUG_ON(radix_tree_is_internal_node(ret)); if (ret) { ret->shift = shift; ret->offset = offset; ret->count = count; ret->nr_values = nr_values; ret->parent = parent; ret->array = root; } return ret; } void radix_tree_node_rcu_free(struct rcu_head *head) { struct radix_tree_node *node = container_of(head, struct radix_tree_node, rcu_head); /* * Must only free zeroed nodes into the slab. We can be left with * non-NULL entries by radix_tree_free_nodes, so clear the entries * and tags here. */ memset(node->slots, 0, sizeof(node->slots)); memset(node->tags, 0, sizeof(node->tags)); INIT_LIST_HEAD(&node->private_list); kmem_cache_free(radix_tree_node_cachep, node); } static inline void radix_tree_node_free(struct radix_tree_node *node) { call_rcu(&node->rcu_head, radix_tree_node_rcu_free); } /* * Load up this CPU's radix_tree_node buffer with sufficient objects to * ensure that the addition of a single element in the tree cannot fail. On * success, return zero, with preemption disabled. 
On error, return -ENOMEM * with preemption not disabled. * * To make use of this facility, the radix tree must be initialised without * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). */ static __must_check int __radix_tree_preload(gfp_t gfp_mask, unsigned nr) { struct radix_tree_preload *rtp; struct radix_tree_node *node; int ret = -ENOMEM; /* * Nodes preloaded by one cgroup can be used by another cgroup, so * they should never be accounted to any particular memory cgroup. */ gfp_mask &= ~__GFP_ACCOUNT; local_lock(&radix_tree_preloads.lock); rtp = this_cpu_ptr(&radix_tree_preloads); while (rtp->nr < nr) { local_unlock(&radix_tree_preloads.lock); node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); if (node == NULL) goto out; local_lock(&radix_tree_preloads.lock); rtp = this_cpu_ptr(&radix_tree_preloads); if (rtp->nr < nr) { node->parent = rtp->nodes; rtp->nodes = node; rtp->nr++; } else { kmem_cache_free(radix_tree_node_cachep, node); } } ret = 0; out: return ret; } /* * Load up this CPU's radix_tree_node buffer with sufficient objects to * ensure that the addition of a single element in the tree cannot fail. On * success, return zero, with preemption disabled. On error, return -ENOMEM * with preemption not disabled. * * To make use of this facility, the radix tree must be initialised without * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). */ int radix_tree_preload(gfp_t gfp_mask) { /* Warn on non-sensical use... */ WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask)); return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE); } EXPORT_SYMBOL(radix_tree_preload); /* * The same as above function, except we don't guarantee preloading happens. * We do it, if we decide it helps. On success, return zero with preemption * disabled. On error, return -ENOMEM with preemption not disabled. */ int radix_tree_maybe_preload(gfp_t gfp_mask) { if (gfpflags_allow_blocking(gfp_mask)) return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE); /* Preloading doesn't help anything with this gfp mask, skip it */ local_lock(&radix_tree_preloads.lock); return 0; } EXPORT_SYMBOL(radix_tree_maybe_preload); static unsigned radix_tree_load_root(const struct radix_tree_root *root, struct radix_tree_node **nodep, unsigned long *maxindex) { struct radix_tree_node *node = rcu_dereference_raw(root->xa_head); *nodep = node; if (likely(radix_tree_is_internal_node(node))) { node = entry_to_node(node); *maxindex = node_maxindex(node); return node->shift + RADIX_TREE_MAP_SHIFT; } *maxindex = 0; return 0; } /* * Extend a radix tree so it can store key @index. */ static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp, unsigned long index, unsigned int shift) { void *entry; unsigned int maxshift; int tag; /* Figure out what the shift should be. 
*/ maxshift = shift; while (index > shift_maxindex(maxshift)) maxshift += RADIX_TREE_MAP_SHIFT; entry = rcu_dereference_raw(root->xa_head); if (!entry && (!is_idr(root) || root_tag_get(root, IDR_FREE))) goto out; do { struct radix_tree_node *node = radix_tree_node_alloc(gfp, NULL, root, shift, 0, 1, 0); if (!node) return -ENOMEM; if (is_idr(root)) { all_tag_set(node, IDR_FREE); if (!root_tag_get(root, IDR_FREE)) { tag_clear(node, IDR_FREE, 0); root_tag_set(root, IDR_FREE); } } else { /* Propagate the aggregated tag info to the new child */ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { if (root_tag_get(root, tag)) tag_set(node, tag, 0); } } BUG_ON(shift > BITS_PER_LONG); if (radix_tree_is_internal_node(entry)) { entry_to_node(entry)->parent = node; } else if (xa_is_value(entry)) { /* Moving a value entry root->xa_head to a node */ node->nr_values = 1; } /* * entry was already in the radix tree, so we do not need * rcu_assign_pointer here */ node->slots[0] = (void __rcu *)entry; entry = node_to_entry(node); rcu_assign_pointer(root->xa_head, entry); shift += RADIX_TREE_MAP_SHIFT; } while (shift <= maxshift); out: return maxshift + RADIX_TREE_MAP_SHIFT; } /** * radix_tree_shrink - shrink radix tree to minimum height * @root: radix tree root */ static inline bool radix_tree_shrink(struct radix_tree_root *root) { bool shrunk = false; for (;;) { struct radix_tree_node *node = rcu_dereference_raw(root->xa_head); struct radix_tree_node *child; if (!radix_tree_is_internal_node(node)) break; node = entry_to_node(node); /* * The candidate node has more than one child, or its child * is not at the leftmost slot, we cannot shrink. */ if (node->count != 1) break; child = rcu_dereference_raw(node->slots[0]); if (!child) break; /* * For an IDR, we must not shrink entry 0 into the root in * case somebody calls idr_replace() with a pointer that * appears to be an internal entry */ if (!node->shift && is_idr(root)) break; if (radix_tree_is_internal_node(child)) entry_to_node(child)->parent = NULL; /* * We don't need rcu_assign_pointer(), since we are simply * moving the node from one part of the tree to another: if it * was safe to dereference the old pointer to it * (node->slots[0]), it will be safe to dereference the new * one (root->xa_head) as far as dependent read barriers go. */ root->xa_head = (void __rcu *)child; if (is_idr(root) && !tag_get(node, IDR_FREE, 0)) root_tag_clear(root, IDR_FREE); /* * We have a dilemma here. The node's slot[0] must not be * NULLed in case there are concurrent lookups expecting to * find the item. However if this was a bottom-level node, * then it may be subject to the slot pointer being visible * to callers dereferencing it. If item corresponding to * slot[0] is subsequently deleted, these callers would expect * their slot to become empty sooner or later. * * For example, lockless pagecache will look up a slot, deref * the page pointer, and if the page has 0 refcount it means it * was concurrently deleted from pagecache so try the deref * again. Fortunately there is already a requirement for logic * to retry the entire slot lookup -- the indirect pointer * problem (replacing direct root node with an indirect pointer * also results in a stale slot). So tag the slot as indirect * to force callers to retry. 
*/ node->count = 0; if (!radix_tree_is_internal_node(child)) { node->slots[0] = (void __rcu *)RADIX_TREE_RETRY; } WARN_ON_ONCE(!list_empty(&node->private_list)); radix_tree_node_free(node); shrunk = true; } return shrunk; } static bool delete_node(struct radix_tree_root *root, struct radix_tree_node *node) { bool deleted = false; do { struct radix_tree_node *parent; if (node->count) { if (node_to_entry(node) == rcu_dereference_raw(root->xa_head)) deleted |= radix_tree_shrink(root); return deleted; } parent = node->parent; if (parent) { parent->slots[node->offset] = NULL; parent->count--; } else { /* * Shouldn't the tags already have all been cleared * by the caller? */ if (!is_idr(root)) root_tag_clear_all(root); root->xa_head = NULL; } WARN_ON_ONCE(!list_empty(&node->private_list)); radix_tree_node_free(node); deleted = true; node = parent; } while (node); return deleted; } /** * __radix_tree_create - create a slot in a radix tree * @root: radix tree root * @index: index key * @nodep: returns node * @slotp: returns slot * * Create, if necessary, and return the node and slot for an item * at position @index in the radix tree @root. * * Until there is more than one item in the tree, no nodes are * allocated and @root->xa_head is used as a direct slot instead of * pointing to a node, in which case *@nodep will be NULL. * * Returns -ENOMEM, or 0 for success. */ static int __radix_tree_create(struct radix_tree_root *root, unsigned long index, struct radix_tree_node **nodep, void __rcu ***slotp) { struct radix_tree_node *node = NULL, *child; void __rcu **slot = (void __rcu **)&root->xa_head; unsigned long maxindex; unsigned int shift, offset = 0; unsigned long max = index; gfp_t gfp = root_gfp_mask(root); shift = radix_tree_load_root(root, &child, &maxindex); /* Make sure the tree is high enough. */ if (max > maxindex) { int error = radix_tree_extend(root, gfp, max, shift); if (error < 0) return error; shift = error; child = rcu_dereference_raw(root->xa_head); } while (shift > 0) { shift -= RADIX_TREE_MAP_SHIFT; if (child == NULL) { /* Have to add a child node. */ child = radix_tree_node_alloc(gfp, node, root, shift, offset, 0, 0); if (!child) return -ENOMEM; rcu_assign_pointer(*slot, node_to_entry(child)); if (node) node->count++; } else if (!radix_tree_is_internal_node(child)) break; /* Go a level down */ node = entry_to_node(child); offset = radix_tree_descend(node, &child, index); slot = &node->slots[offset]; } if (nodep) *nodep = node; if (slotp) *slotp = slot; return 0; } /* * Free any nodes below this node. The tree is presumed to not need * shrinking, and any user data in the tree is presumed to not need a * destructor called on it. If we need to add a destructor, we can * add that functionality later. Note that we may not clear tags or * slots from the tree as an RCU walker may still have a pointer into * this subtree. We could replace the entries with RADIX_TREE_RETRY, * but we'll still have to clear those in rcu_free. 
*/ static void radix_tree_free_nodes(struct radix_tree_node *node) { unsigned offset = 0; struct radix_tree_node *child = entry_to_node(node); for (;;) { void *entry = rcu_dereference_raw(child->slots[offset]); if (xa_is_node(entry) && child->shift) { child = entry_to_node(entry); offset = 0; continue; } offset++; while (offset == RADIX_TREE_MAP_SIZE) { struct radix_tree_node *old = child; offset = child->offset + 1; child = child->parent; WARN_ON_ONCE(!list_empty(&old->private_list)); radix_tree_node_free(old); if (old == entry_to_node(node)) return; } } } static inline int insert_entries(struct radix_tree_node *node, void __rcu **slot, void *item) { if (*slot) return -EEXIST; rcu_assign_pointer(*slot, item); if (node) { node->count++; if (xa_is_value(item)) node->nr_values++; } return 1; } /** * radix_tree_insert - insert into a radix tree * @root: radix tree root * @index: index key * @item: item to insert * * Insert an item into the radix tree at position @index. */ int radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item) { struct radix_tree_node *node; void __rcu **slot; int error; BUG_ON(radix_tree_is_internal_node(item)); error = __radix_tree_create(root, index, &node, &slot); if (error) return error; error = insert_entries(node, slot, item); if (error < 0) return error; if (node) { unsigned offset = get_slot_offset(node, slot); BUG_ON(tag_get(node, 0, offset)); BUG_ON(tag_get(node, 1, offset)); BUG_ON(tag_get(node, 2, offset)); } else { BUG_ON(root_tags_get(root)); } return 0; } EXPORT_SYMBOL(radix_tree_insert); /** * __radix_tree_lookup - lookup an item in a radix tree * @root: radix tree root * @index: index key * @nodep: returns node * @slotp: returns slot * * Lookup and return the item at position @index in the radix * tree @root. * * Until there is more than one item in the tree, no nodes are * allocated and @root->xa_head is used as a direct slot instead of * pointing to a node, in which case *@nodep will be NULL. */ void *__radix_tree_lookup(const struct radix_tree_root *root, unsigned long index, struct radix_tree_node **nodep, void __rcu ***slotp) { struct radix_tree_node *node, *parent; unsigned long maxindex; void __rcu **slot; restart: parent = NULL; slot = (void __rcu **)&root->xa_head; radix_tree_load_root(root, &node, &maxindex); if (index > maxindex) return NULL; while (radix_tree_is_internal_node(node)) { unsigned offset; parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, index); slot = parent->slots + offset; if (node == RADIX_TREE_RETRY) goto restart; if (parent->shift == 0) break; } if (nodep) *nodep = parent; if (slotp) *slotp = slot; return node; } /** * radix_tree_lookup_slot - lookup a slot in a radix tree * @root: radix tree root * @index: index key * * Returns: the slot corresponding to the position @index in the * radix tree @root. This is useful for update-if-exists operations. * * This function can be called under rcu_read_lock iff the slot is not * modified by radix_tree_replace_slot, otherwise it must be called * exclusive from other writers. Any dereference of the slot must be done * using radix_tree_deref_slot. 
*/ void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *root, unsigned long index) { void __rcu **slot; if (!__radix_tree_lookup(root, index, NULL, &slot)) return NULL; return slot; } EXPORT_SYMBOL(radix_tree_lookup_slot); /** * radix_tree_lookup - perform lookup operation on a radix tree * @root: radix tree root * @index: index key * * Lookup the item at the position @index in the radix tree @root. * * This function can be called under rcu_read_lock, however the caller * must manage lifetimes of leaf nodes (eg. RCU may also be used to free * them safely). No RCU barriers are required to access or modify the * returned item, however. */ void *radix_tree_lookup(const struct radix_tree_root *root, unsigned long index) { return __radix_tree_lookup(root, index, NULL, NULL); } EXPORT_SYMBOL(radix_tree_lookup); static void replace_slot(void __rcu **slot, void *item, struct radix_tree_node *node, int count, int values) { if (node && (count || values)) { node->count += count; node->nr_values += values; } rcu_assign_pointer(*slot, item); } static bool node_tag_get(const struct radix_tree_root *root, const struct radix_tree_node *node, unsigned int tag, unsigned int offset) { if (node) return tag_get(node, tag, offset); return root_tag_get(root, tag); } /* * IDR users want to be able to store NULL in the tree, so if the slot isn't * free, don't adjust the count, even if it's transitioning between NULL and * non-NULL. For the IDA, we mark slots as being IDR_FREE while they still * have empty bits, but it only stores NULL in slots when they're being * deleted. */ static int calculate_count(struct radix_tree_root *root, struct radix_tree_node *node, void __rcu **slot, void *item, void *old) { if (is_idr(root)) { unsigned offset = get_slot_offset(node, slot); bool free = node_tag_get(root, node, IDR_FREE, offset); if (!free) return 0; if (!old) return 1; } return !!item - !!old; } /** * __radix_tree_replace - replace item in a slot * @root: radix tree root * @node: pointer to tree node * @slot: pointer to slot in @node * @item: new item to store in the slot. * * For use with __radix_tree_lookup(). Caller must hold tree write locked * across slot lookup and replacement. */ void __radix_tree_replace(struct radix_tree_root *root, struct radix_tree_node *node, void __rcu **slot, void *item) { void *old = rcu_dereference_raw(*slot); int values = !!xa_is_value(item) - !!xa_is_value(old); int count = calculate_count(root, node, slot, item, old); /* * This function supports replacing value entries and * deleting entries, but that needs accounting against the * node unless the slot is root->xa_head. */ WARN_ON_ONCE(!node && (slot != (void __rcu **)&root->xa_head) && (count || values)); replace_slot(slot, item, node, count, values); if (!node) return; delete_node(root, node); } /** * radix_tree_replace_slot - replace item in a slot * @root: radix tree root * @slot: pointer to slot * @item: new item to store in the slot. * * For use with radix_tree_lookup_slot() and * radix_tree_gang_lookup_tag_slot(). Caller must hold tree write locked * across slot lookup and replacement. * * NOTE: This cannot be used to switch between non-entries (empty slots), * regular entries, and value entries, as that requires accounting * inside the radix tree node. When switching from one type of entry or * deleting, use __radix_tree_lookup() and __radix_tree_replace() or * radix_tree_iter_replace(). 
*/ void radix_tree_replace_slot(struct radix_tree_root *root, void __rcu **slot, void *item) { __radix_tree_replace(root, NULL, slot, item); } EXPORT_SYMBOL(radix_tree_replace_slot); /** * radix_tree_iter_replace - replace item in a slot * @root: radix tree root * @iter: iterator state * @slot: pointer to slot * @item: new item to store in the slot. * * For use with radix_tree_for_each_slot(). * Caller must hold tree write locked. */ void radix_tree_iter_replace(struct radix_tree_root *root, const struct radix_tree_iter *iter, void __rcu **slot, void *item) { __radix_tree_replace(root, iter->node, slot, item); } static void node_tag_set(struct radix_tree_root *root, struct radix_tree_node *node, unsigned int tag, unsigned int offset) { while (node) { if (tag_get(node, tag, offset)) return; tag_set(node, tag, offset); offset = node->offset; node = node->parent; } if (!root_tag_get(root, tag)) root_tag_set(root, tag); } /** * radix_tree_tag_set - set a tag on a radix tree node * @root: radix tree root * @index: index key * @tag: tag index * * Set the search tag (which must be < RADIX_TREE_MAX_TAGS) * corresponding to @index in the radix tree. From * the root all the way down to the leaf node. * * Returns the address of the tagged item. Setting a tag on a not-present * item is a bug. */ void *radix_tree_tag_set(struct radix_tree_root *root, unsigned long index, unsigned int tag) { struct radix_tree_node *node, *parent; unsigned long maxindex; radix_tree_load_root(root, &node, &maxindex); BUG_ON(index > maxindex); while (radix_tree_is_internal_node(node)) { unsigned offset; parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, index); BUG_ON(!node); if (!tag_get(parent, tag, offset)) tag_set(parent, tag, offset); } /* set the root's tag bit */ if (!root_tag_get(root, tag)) root_tag_set(root, tag); return node; } EXPORT_SYMBOL(radix_tree_tag_set); static void node_tag_clear(struct radix_tree_root *root, struct radix_tree_node *node, unsigned int tag, unsigned int offset) { while (node) { if (!tag_get(node, tag, offset)) return; tag_clear(node, tag, offset); if (any_tag_set(node, tag)) return; offset = node->offset; node = node->parent; } /* clear the root's tag bit */ if (root_tag_get(root, tag)) root_tag_clear(root, tag); } /** * radix_tree_tag_clear - clear a tag on a radix tree node * @root: radix tree root * @index: index key * @tag: tag index * * Clear the search tag (which must be < RADIX_TREE_MAX_TAGS) * corresponding to @index in the radix tree. If this causes * the leaf node to have no tags set then clear the tag in the * next-to-leaf node, etc. * * Returns the address of the tagged item on success, else NULL. ie: * has the same return value and semantics as radix_tree_lookup(). 
*/ void *radix_tree_tag_clear(struct radix_tree_root *root, unsigned long index, unsigned int tag) { struct radix_tree_node *node, *parent; unsigned long maxindex; int offset = 0; radix_tree_load_root(root, &node, &maxindex); if (index > maxindex) return NULL; parent = NULL; while (radix_tree_is_internal_node(node)) { parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, index); } if (node) node_tag_clear(root, parent, tag, offset); return node; } EXPORT_SYMBOL(radix_tree_tag_clear); /** * radix_tree_iter_tag_clear - clear a tag on the current iterator entry * @root: radix tree root * @iter: iterator state * @tag: tag to clear */ void radix_tree_iter_tag_clear(struct radix_tree_root *root, const struct radix_tree_iter *iter, unsigned int tag) { node_tag_clear(root, iter->node, tag, iter_offset(iter)); } /** * radix_tree_tag_get - get a tag on a radix tree node * @root: radix tree root * @index: index key * @tag: tag index (< RADIX_TREE_MAX_TAGS) * * Return values: * * 0: tag not present or not set * 1: tag set * * Note that the return value of this function may not be relied on, even if * the RCU lock is held, unless tag modification and node deletion are excluded * from concurrency. */ int radix_tree_tag_get(const struct radix_tree_root *root, unsigned long index, unsigned int tag) { struct radix_tree_node *node, *parent; unsigned long maxindex; if (!root_tag_get(root, tag)) return 0; radix_tree_load_root(root, &node, &maxindex); if (index > maxindex) return 0; while (radix_tree_is_internal_node(node)) { unsigned offset; parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, index); if (!tag_get(parent, tag, offset)) return 0; if (node == RADIX_TREE_RETRY) break; } return 1; } EXPORT_SYMBOL(radix_tree_tag_get); /* Construct iter->tags bit-mask from node->tags[tag] array */ static void set_iter_tags(struct radix_tree_iter *iter, struct radix_tree_node *node, unsigned offset, unsigned tag) { unsigned tag_long = offset / BITS_PER_LONG; unsigned tag_bit = offset % BITS_PER_LONG; if (!node) { iter->tags = 1; return; } iter->tags = node->tags[tag][tag_long] >> tag_bit; /* This never happens if RADIX_TREE_TAG_LONGS == 1 */ if (tag_long < RADIX_TREE_TAG_LONGS - 1) { /* Pick tags from next element */ if (tag_bit) iter->tags |= node->tags[tag][tag_long + 1] << (BITS_PER_LONG - tag_bit); /* Clip chunk size, here only BITS_PER_LONG tags */ iter->next_index = __radix_tree_iter_add(iter, BITS_PER_LONG); } } void __rcu **radix_tree_iter_resume(void __rcu **slot, struct radix_tree_iter *iter) { iter->index = __radix_tree_iter_add(iter, 1); iter->next_index = iter->index; iter->tags = 0; return NULL; } EXPORT_SYMBOL(radix_tree_iter_resume); /** * radix_tree_next_chunk - find next chunk of slots for iteration * * @root: radix tree root * @iter: iterator state * @flags: RADIX_TREE_ITER_* flags and tag index * Returns: pointer to chunk first slot, or NULL if iteration is over */ void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root, struct radix_tree_iter *iter, unsigned flags) { unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK; struct radix_tree_node *node, *child; unsigned long index, offset, maxindex; if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag)) return NULL; /* * Catch next_index overflow after ~0UL. iter->index never overflows * during iterating; it can be zero only at the beginning. * And we cannot overflow iter->next_index in a single step, * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG. 
* * This condition also used by radix_tree_next_slot() to stop * contiguous iterating, and forbid switching to the next chunk. */ index = iter->next_index; if (!index && iter->index) return NULL; restart: radix_tree_load_root(root, &child, &maxindex); if (index > maxindex) return NULL; if (!child) return NULL; if (!radix_tree_is_internal_node(child)) { /* Single-slot tree */ iter->index = index; iter->next_index = maxindex + 1; iter->tags = 1; iter->node = NULL; return (void __rcu **)&root->xa_head; } do { node = entry_to_node(child); offset = radix_tree_descend(node, &child, index); if ((flags & RADIX_TREE_ITER_TAGGED) ? !tag_get(node, tag, offset) : !child) { /* Hole detected */ if (flags & RADIX_TREE_ITER_CONTIG) return NULL; if (flags & RADIX_TREE_ITER_TAGGED) offset = radix_tree_find_next_bit(node, tag, offset + 1); else while (++offset < RADIX_TREE_MAP_SIZE) { void *slot = rcu_dereference_raw( node->slots[offset]); if (slot) break; } index &= ~node_maxindex(node); index += offset << node->shift; /* Overflow after ~0UL */ if (!index) return NULL; if (offset == RADIX_TREE_MAP_SIZE) goto restart; child = rcu_dereference_raw(node->slots[offset]); } if (!child) goto restart; if (child == RADIX_TREE_RETRY) break; } while (node->shift && radix_tree_is_internal_node(child)); /* Update the iterator state */ iter->index = (index &~ node_maxindex(node)) | offset; iter->next_index = (index | node_maxindex(node)) + 1; iter->node = node; if (flags & RADIX_TREE_ITER_TAGGED) set_iter_tags(iter, node, offset, tag); return node->slots + offset; } EXPORT_SYMBOL(radix_tree_next_chunk); /** * radix_tree_gang_lookup - perform multiple lookup on a radix tree * @root: radix tree root * @results: where the results of the lookup are placed * @first_index: start the lookup from this key * @max_items: place up to this many items at *results * * Performs an index-ascending scan of the tree for present items. Places * them at *@results and returns the number of items which were placed at * *@results. * * The implementation is naive. * * Like radix_tree_lookup, radix_tree_gang_lookup may be called under * rcu_read_lock. In this case, rather than the returned results being * an atomic snapshot of the tree at a single point in time, the * semantics of an RCU protected gang lookup are as though multiple * radix_tree_lookups have been issued in individual locks, and results * stored in 'results'. */ unsigned int radix_tree_gang_lookup(const struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items) { struct radix_tree_iter iter; void __rcu **slot; unsigned int ret = 0; if (unlikely(!max_items)) return 0; radix_tree_for_each_slot(slot, root, &iter, first_index) { results[ret] = rcu_dereference_raw(*slot); if (!results[ret]) continue; if (radix_tree_is_internal_node(results[ret])) { slot = radix_tree_iter_retry(&iter); continue; } if (++ret == max_items) break; } return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup); /** * radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree * based on a tag * @root: radix tree root * @results: where the results of the lookup are placed * @first_index: start the lookup from this key * @max_items: place up to this many items at *results * @tag: the tag index (< RADIX_TREE_MAX_TAGS) * * Performs an index-ascending scan of the tree for present items which * have the tag indexed by @tag set. Places the items at *@results and * returns the number of items which were placed at *@results. 
*/ unsigned int radix_tree_gang_lookup_tag(const struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items, unsigned int tag) { struct radix_tree_iter iter; void __rcu **slot; unsigned int ret = 0; if (unlikely(!max_items)) return 0; radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) { results[ret] = rcu_dereference_raw(*slot); if (!results[ret]) continue; if (radix_tree_is_internal_node(results[ret])) { slot = radix_tree_iter_retry(&iter); continue; } if (++ret == max_items) break; } return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup_tag); /** * radix_tree_gang_lookup_tag_slot - perform multiple slot lookup on a * radix tree based on a tag * @root: radix tree root * @results: where the results of the lookup are placed * @first_index: start the lookup from this key * @max_items: place up to this many items at *results * @tag: the tag index (< RADIX_TREE_MAX_TAGS) * * Performs an index-ascending scan of the tree for present items which * have the tag indexed by @tag set. Places the slots at *@results and * returns the number of slots which were placed at *@results. */ unsigned int radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *root, void __rcu ***results, unsigned long first_index, unsigned int max_items, unsigned int tag) { struct radix_tree_iter iter; void __rcu **slot; unsigned int ret = 0; if (unlikely(!max_items)) return 0; radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) { results[ret] = slot; if (++ret == max_items) break; } return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); static bool __radix_tree_delete(struct radix_tree_root *root, struct radix_tree_node *node, void __rcu **slot) { void *old = rcu_dereference_raw(*slot); int values = xa_is_value(old) ? -1 : 0; unsigned offset = get_slot_offset(node, slot); int tag; if (is_idr(root)) node_tag_set(root, node, IDR_FREE, offset); else for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) node_tag_clear(root, node, tag, offset); replace_slot(slot, NULL, node, -1, values); return node && delete_node(root, node); } /** * radix_tree_iter_delete - delete the entry at this iterator position * @root: radix tree root * @iter: iterator state * @slot: pointer to slot * * Delete the entry at the position currently pointed to by the iterator. * This may result in the current node being freed; if it is, the iterator * is advanced so that it will not reference the freed memory. This * function may be called without any locking if there are no other threads * which can access this tree. */ void radix_tree_iter_delete(struct radix_tree_root *root, struct radix_tree_iter *iter, void __rcu **slot) { if (__radix_tree_delete(root, iter->node, slot)) iter->index = iter->next_index; } EXPORT_SYMBOL(radix_tree_iter_delete); /** * radix_tree_delete_item - delete an item from a radix tree * @root: radix tree root * @index: index key * @item: expected item * * Remove @item at @index from the radix tree rooted at @root. * * Return: the deleted entry, or %NULL if it was not present * or the entry at the given @index was not @item. 
*/ void *radix_tree_delete_item(struct radix_tree_root *root, unsigned long index, void *item) { struct radix_tree_node *node = NULL; void __rcu **slot = NULL; void *entry; entry = __radix_tree_lookup(root, index, &node, &slot); if (!slot) return NULL; if (!entry && (!is_idr(root) || node_tag_get(root, node, IDR_FREE, get_slot_offset(node, slot)))) return NULL; if (item && entry != item) return NULL; __radix_tree_delete(root, node, slot); return entry; } EXPORT_SYMBOL(radix_tree_delete_item); /** * radix_tree_delete - delete an entry from a radix tree * @root: radix tree root * @index: index key * * Remove the entry at @index from the radix tree rooted at @root. * * Return: The deleted entry, or %NULL if it was not present. */ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) { return radix_tree_delete_item(root, index, NULL); } EXPORT_SYMBOL(radix_tree_delete); /** * radix_tree_tagged - test whether any items in the tree are tagged * @root: radix tree root * @tag: tag to test */ int radix_tree_tagged(const struct radix_tree_root *root, unsigned int tag) { return root_tag_get(root, tag); } EXPORT_SYMBOL(radix_tree_tagged); /** * idr_preload - preload for idr_alloc() * @gfp_mask: allocation mask to use for preloading * * Preallocate memory to use for the next call to idr_alloc(). This function * returns with preemption disabled. It will be enabled by idr_preload_end(). */ void idr_preload(gfp_t gfp_mask) { if (__radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE)) local_lock(&radix_tree_preloads.lock); } EXPORT_SYMBOL(idr_preload); void __rcu **idr_get_free(struct radix_tree_root *root, struct radix_tree_iter *iter, gfp_t gfp, unsigned long max) { struct radix_tree_node *node = NULL, *child; void __rcu **slot = (void __rcu **)&root->xa_head; unsigned long maxindex, start = iter->next_index; unsigned int shift, offset = 0; grow: shift = radix_tree_load_root(root, &child, &maxindex); if (!radix_tree_tagged(root, IDR_FREE)) start = max(start, maxindex + 1); if (start > max) return ERR_PTR(-ENOSPC); if (start > maxindex) { int error = radix_tree_extend(root, gfp, start, shift); if (error < 0) return ERR_PTR(error); shift = error; child = rcu_dereference_raw(root->xa_head); } if (start == 0 && shift == 0) shift = RADIX_TREE_MAP_SHIFT; while (shift) { shift -= RADIX_TREE_MAP_SHIFT; if (child == NULL) { /* Have to add a child node. */ child = radix_tree_node_alloc(gfp, node, root, shift, offset, 0, 0); if (!child) return ERR_PTR(-ENOMEM); all_tag_set(child, IDR_FREE); rcu_assign_pointer(*slot, node_to_entry(child)); if (node) node->count++; } else if (!radix_tree_is_internal_node(child)) break; node = entry_to_node(child); offset = radix_tree_descend(node, &child, start); if (!tag_get(node, IDR_FREE, offset)) { offset = radix_tree_find_next_bit(node, IDR_FREE, offset + 1); start = next_index(start, node, offset); if (start > max || start == 0) return ERR_PTR(-ENOSPC); while (offset == RADIX_TREE_MAP_SIZE) { offset = node->offset + 1; node = node->parent; if (!node) goto grow; shift = node->shift; } child = rcu_dereference_raw(node->slots[offset]); } slot = &node->slots[offset]; } iter->index = start; if (node) iter->next_index = 1 + min(max, (start | node_maxindex(node))); else iter->next_index = 1; iter->node = node; set_iter_tags(iter, node, offset, IDR_FREE); return slot; } /** * idr_destroy - release all internal memory from an IDR * @idr: idr handle * * After this function is called, the IDR is empty, and may be reused or * the data structure containing it may be freed. 
* * A typical clean-up sequence for objects stored in an idr tree will use * idr_for_each() to free all objects, if necessary, then idr_destroy() to * free the memory used to keep track of those objects. */ void idr_destroy(struct idr *idr) { struct radix_tree_node *node = rcu_dereference_raw(idr->idr_rt.xa_head); if (radix_tree_is_internal_node(node)) radix_tree_free_nodes(node); idr->idr_rt.xa_head = NULL; root_tag_set(&idr->idr_rt, IDR_FREE); } EXPORT_SYMBOL(idr_destroy); static void radix_tree_node_ctor(void *arg) { struct radix_tree_node *node = arg; memset(node, 0, sizeof(*node)); INIT_LIST_HEAD(&node->private_list); } static int radix_tree_cpu_dead(unsigned int cpu) { struct radix_tree_preload *rtp; struct radix_tree_node *node; /* Free per-cpu pool of preloaded nodes */ rtp = &per_cpu(radix_tree_preloads, cpu); while (rtp->nr) { node = rtp->nodes; rtp->nodes = node->parent; kmem_cache_free(radix_tree_node_cachep, node); rtp->nr--; } return 0; } void __init radix_tree_init(void) { int ret; BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32); BUILD_BUG_ON(ROOT_IS_IDR & ~GFP_ZONEMASK); BUILD_BUG_ON(XA_CHUNK_SIZE > 255); radix_tree_node_cachep = kmem_cache_create("radix_tree_node", sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, radix_tree_node_ctor); ret = cpuhp_setup_state_nocalls(CPUHP_RADIX_DEAD, "lib/radix:dead", NULL, radix_tree_cpu_dead); WARN_ON(ret < 0); }
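/*
 * A minimal usage sketch, not part of the original lib/radix-tree.c, tying
 * together the preload/insert/tag/lookup/delete API documented above.  The
 * tree, lock and function names are hypothetical, and error handling is
 * reduced to the essentials.
 */
#include <linux/radix-tree.h>
#include <linux/spinlock.h>

/* GFP_ATOMIC lacks __GFP_DIRECT_RECLAIM, so preloading is usable here. */
static RADIX_TREE(example_tree, GFP_ATOMIC);
static DEFINE_SPINLOCK(example_lock);

static int example_store(unsigned long index, void *item)
{
	int err;

	/* Preallocate nodes; on success this returns with preemption disabled. */
	err = radix_tree_preload(GFP_KERNEL);
	if (err)
		return err;

	spin_lock(&example_lock);
	err = radix_tree_insert(&example_tree, index, item);
	if (!err)
		radix_tree_tag_set(&example_tree, index, 0);	/* tag the now-present item */
	spin_unlock(&example_lock);

	radix_tree_preload_end();	/* re-enables preemption */
	return err;
}

static void *example_fetch_and_remove(unsigned long index)
{
	void *item;

	spin_lock(&example_lock);
	item = radix_tree_lookup(&example_tree, index);
	if (item)
		radix_tree_delete(&example_tree, index);
	spin_unlock(&example_lock);

	return item;
}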
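/*
 * A minimal sketch, not part of the original header, previewing the XArray
 * API declared below: plain pointers, integer value entries (bit 0 set) and
 * marks.  The array name and index used here are hypothetical.
 */
#include <linux/printk.h>
#include <linux/xarray.h>

static DEFINE_XARRAY(example_xa);

static int example_xarray_use(void)
{
	void *entry;
	int err;

	/* Store the integer 42 as a value entry rather than a pointer. */
	err = xa_err(xa_store(&example_xa, 5, xa_mk_value(42), GFP_KERNEL));
	if (err)
		return err;

	/* Mark the entry, then read it back and decode the value. */
	xa_set_mark(&example_xa, 5, XA_MARK_0);
	entry = xa_load(&example_xa, 5);
	if (xa_is_value(entry))
		pr_info("stored value %lu\n", xa_to_value(entry));

	xa_erase(&example_xa, 5);	/* remove the entry again */
	return 0;
}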
/* SPDX-License-Identifier: GPL-2.0+ */ #ifndef _LINUX_XARRAY_H #define _LINUX_XARRAY_H /* * eXtensible Arrays * Copyright (c) 2017 Microsoft Corporation * Author: Matthew Wilcox <willy@infradead.org> * * See Documentation/core-api/xarray.rst for how to use the XArray. */ #include <linux/bitmap.h> #include <linux/bug.h> #include <linux/compiler.h> #include <linux/err.h> #include <linux/gfp.h> #include <linux/kconfig.h> #include <linux/limits.h> #include <linux/lockdep.h> #include <linux/rcupdate.h> #include <linux/sched/mm.h> #include <linux/spinlock.h> #include <linux/types.h> struct list_lru; /* * The bottom two bits of the entry determine how the XArray interprets * the contents: * * 00: Pointer entry * 10: Internal entry * x1: Value entry or tagged pointer * * Attempting to store internal entries in the XArray is a bug. * * Most internal entries are pointers to the next node in the tree. * The following internal entries have a special meaning: * * 0-62: Sibling entries * 256: Retry entry * 257: Zero entry * * Errors are also represented as internal entries, but use the negative * space (-4094 to -2). They're never stored in the slots array; only * returned by the normal API. */ #define BITS_PER_XA_VALUE (BITS_PER_LONG - 1) /** * xa_mk_value() - Create an XArray entry from an integer. * @v: Value to store in XArray. * * Context: Any context. * Return: An entry suitable for storing in the XArray. */ static inline void *xa_mk_value(unsigned long v) { WARN_ON((long)v < 0); return (void *)((v << 1) | 1); } /** * xa_to_value() - Get value stored in an XArray entry. * @entry: XArray entry. * * Context: Any context. * Return: The value stored in the XArray entry. */ static inline unsigned long xa_to_value(const void *entry) { return (unsigned long)entry >> 1; } /** * xa_is_value() - Determine if an entry is a value. * @entry: XArray entry. * * Context: Any context. * Return: True if the entry is a value, false if it is a pointer. */ static inline bool xa_is_value(const void *entry) { return (unsigned long)entry & 1; } /** * xa_tag_pointer() - Create an XArray entry for a tagged pointer.
* @p: Plain pointer. * @tag: Tag value (0, 1 or 3). * * If the user of the XArray prefers, they can tag their pointers instead * of storing value entries. Three tags are available (0, 1 and 3). * These are distinct from the xa_mark_t as they are not replicated up * through the array and cannot be searched for. * * Context: Any context. * Return: An XArray entry. */ static inline void *xa_tag_pointer(void *p, unsigned long tag) { return (void *)((unsigned long)p | tag); } /** * xa_untag_pointer() - Turn an XArray entry into a plain pointer. * @entry: XArray entry. * * If you have stored a tagged pointer in the XArray, call this function * to get the untagged version of the pointer. * * Context: Any context. * Return: A pointer. */ static inline void *xa_untag_pointer(void *entry) { return (void *)((unsigned long)entry & ~3UL); } /** * xa_pointer_tag() - Get the tag stored in an XArray entry. * @entry: XArray entry. * * If you have stored a tagged pointer in the XArray, call this function * to get the tag of that pointer. * * Context: Any context. * Return: A tag. */ static inline unsigned int xa_pointer_tag(void *entry) { return (unsigned long)entry & 3UL; } /* * xa_mk_internal() - Create an internal entry. * @v: Value to turn into an internal entry. * * Internal entries are used for a number of purposes. Entries 0-255 are * used for sibling entries (only 0-62 are used by the current code). 256 * is used for the retry entry. 257 is used for the reserved / zero entry. * Negative internal entries are used to represent errnos. Node pointers * are also tagged as internal entries in some situations. * * Context: Any context. * Return: An XArray internal entry corresponding to this value. */ static inline void *xa_mk_internal(unsigned long v) { return (void *)((v << 2) | 2); } /* * xa_to_internal() - Extract the value from an internal entry. * @entry: XArray entry. * * Context: Any context. * Return: The value which was stored in the internal entry. */ static inline unsigned long xa_to_internal(const void *entry) { return (unsigned long)entry >> 2; } /* * xa_is_internal() - Is the entry an internal entry? * @entry: XArray entry. * * Context: Any context. * Return: %true if the entry is an internal entry. */ static inline bool xa_is_internal(const void *entry) { return ((unsigned long)entry & 3) == 2; } #define XA_ZERO_ENTRY xa_mk_internal(257) /** * xa_is_zero() - Is the entry a zero entry? * @entry: Entry retrieved from the XArray * * The normal API will return NULL as the contents of a slot containing * a zero entry. You can only see zero entries by using the advanced API. * * Return: %true if the entry is a zero entry. */ static inline bool xa_is_zero(const void *entry) { return unlikely(entry == XA_ZERO_ENTRY); } /** * xa_is_err() - Report whether an XArray operation returned an error * @entry: Result from calling an XArray function * * If an XArray operation cannot complete an operation, it will return * a special value indicating an error. This function tells you * whether an error occurred; xa_err() tells you which error occurred. * * Context: Any context. * Return: %true if the entry indicates an error. */ static inline bool xa_is_err(const void *entry) { return unlikely(xa_is_internal(entry) && entry >= xa_mk_internal(-MAX_ERRNO)); } /** * xa_err() - Turn an XArray result into an errno. * @entry: Result from calling an XArray function. * * If an XArray operation cannot complete an operation, it will return * a special pointer value which encodes an errno. 
This function extracts * the errno from the pointer value, or returns 0 if the pointer does not * represent an errno. * * Context: Any context. * Return: A negative errno or 0. */ static inline int xa_err(void *entry) { /* xa_to_internal() would not do sign extension. */ if (xa_is_err(entry)) return (long)entry >> 2; return 0; } /** * struct xa_limit - Represents a range of IDs. * @min: The lowest ID to allocate (inclusive). * @max: The maximum ID to allocate (inclusive). * * This structure is used either directly or via the XA_LIMIT() macro * to communicate the range of IDs that are valid for allocation. * Three common ranges are predefined for you: * * xa_limit_32b - [0 - UINT_MAX] * * xa_limit_31b - [0 - INT_MAX] * * xa_limit_16b - [0 - USHRT_MAX] */ struct xa_limit { u32 max; u32 min; }; #define XA_LIMIT(_min, _max) (struct xa_limit) { .min = _min, .max = _max } #define xa_limit_32b XA_LIMIT(0, UINT_MAX) #define xa_limit_31b XA_LIMIT(0, INT_MAX) #define xa_limit_16b XA_LIMIT(0, USHRT_MAX) typedef unsigned __bitwise xa_mark_t; #define XA_MARK_0 ((__force xa_mark_t)0U) #define XA_MARK_1 ((__force xa_mark_t)1U) #define XA_MARK_2 ((__force xa_mark_t)2U) #define XA_PRESENT ((__force xa_mark_t)8U) #define XA_MARK_MAX XA_MARK_2 #define XA_FREE_MARK XA_MARK_0 enum xa_lock_type { XA_LOCK_IRQ = 1, XA_LOCK_BH = 2, }; /* * Values for xa_flags. The radix tree stores its GFP flags in the xa_flags, * and we remain compatible with that. */ #define XA_FLAGS_LOCK_IRQ ((__force gfp_t)XA_LOCK_IRQ) #define XA_FLAGS_LOCK_BH ((__force gfp_t)XA_LOCK_BH) #define XA_FLAGS_TRACK_FREE ((__force gfp_t)4U) #define XA_FLAGS_ZERO_BUSY ((__force gfp_t)8U) #define XA_FLAGS_ALLOC_WRAPPED ((__force gfp_t)16U) #define XA_FLAGS_ACCOUNT ((__force gfp_t)32U) #define XA_FLAGS_MARK(mark) ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \ (__force unsigned)(mark))) /* ALLOC is for a normal 0-based alloc. ALLOC1 is for an 1-based alloc */ #define XA_FLAGS_ALLOC (XA_FLAGS_TRACK_FREE | XA_FLAGS_MARK(XA_FREE_MARK)) #define XA_FLAGS_ALLOC1 (XA_FLAGS_TRACK_FREE | XA_FLAGS_ZERO_BUSY) /** * struct xarray - The anchor of the XArray. * @xa_lock: Lock that protects the contents of the XArray. * * To use the xarray, define it statically or embed it in your data structure. * It is a very small data structure, so it does not usually make sense to * allocate it separately and keep a pointer to it in your data structure. * * You may use the xa_lock to protect your own data structures as well. */ /* * If all of the entries in the array are NULL, @xa_head is a NULL pointer. * If the only non-NULL entry in the array is at index 0, @xa_head is that * entry. If any other entry in the array is non-NULL, @xa_head points * to an @xa_node. */ struct xarray { spinlock_t xa_lock; /* private: The rest of the data structure is not to be used directly. */ gfp_t xa_flags; void __rcu * xa_head; }; #define XARRAY_INIT(name, flags) { \ .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \ .xa_flags = flags, \ .xa_head = NULL, \ } /** * DEFINE_XARRAY_FLAGS() - Define an XArray with custom flags. * @name: A string that names your XArray. * @flags: XA_FLAG values. * * This is intended for file scope definitions of XArrays. It declares * and initialises an empty XArray with the chosen name and flags. It is * equivalent to calling xa_init_flags() on the array, but it does the * initialisation at compiletime instead of runtime. */ #define DEFINE_XARRAY_FLAGS(name, flags) \ struct xarray name = XARRAY_INIT(name, flags) /** * DEFINE_XARRAY() - Define an XArray. 
* @name: A string that names your XArray. * * This is intended for file scope definitions of XArrays. It declares * and initialises an empty XArray with the chosen name. It is equivalent * to calling xa_init() on the array, but it does the initialisation at * compiletime instead of runtime. */ #define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0) /** * DEFINE_XARRAY_ALLOC() - Define an XArray which allocates IDs starting at 0. * @name: A string that names your XArray. * * This is intended for file scope definitions of allocating XArrays. * See also DEFINE_XARRAY(). */ #define DEFINE_XARRAY_ALLOC(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC) /** * DEFINE_XARRAY_ALLOC1() - Define an XArray which allocates IDs starting at 1. * @name: A string that names your XArray. * * This is intended for file scope definitions of allocating XArrays. * See also DEFINE_XARRAY(). */ #define DEFINE_XARRAY_ALLOC1(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC1) void *xa_load(struct xarray *, unsigned long index); void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void *xa_erase(struct xarray *, unsigned long index); void *xa_store_range(struct xarray *, unsigned long first, unsigned long last, void *entry, gfp_t); bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); void *xa_find(struct xarray *xa, unsigned long *index, unsigned long max, xa_mark_t) __attribute__((nonnull(2))); void *xa_find_after(struct xarray *xa, unsigned long *index, unsigned long max, xa_mark_t) __attribute__((nonnull(2))); unsigned int xa_extract(struct xarray *, void **dst, unsigned long start, unsigned long max, unsigned int n, xa_mark_t); void xa_destroy(struct xarray *); /** * xa_init_flags() - Initialise an empty XArray with flags. * @xa: XArray. * @flags: XA_FLAG values. * * If you need to initialise an XArray with special flags (eg you need * to take the lock from interrupt context), use this function instead * of xa_init(). * * Context: Any context. */ static inline void xa_init_flags(struct xarray *xa, gfp_t flags) { spin_lock_init(&xa->xa_lock); xa->xa_flags = flags; xa->xa_head = NULL; } /** * xa_init() - Initialise an empty XArray. * @xa: XArray. * * An empty XArray is full of NULL entries. * * Context: Any context. */ static inline void xa_init(struct xarray *xa) { xa_init_flags(xa, 0); } /** * xa_empty() - Determine if an array has any present entries. * @xa: XArray. * * Context: Any context. * Return: %true if the array contains only NULL pointers. */ static inline bool xa_empty(const struct xarray *xa) { return xa->xa_head == NULL; } /** * xa_marked() - Inquire whether any entry in this array has a mark set * @xa: Array * @mark: Mark value * * Context: Any context. * Return: %true if any entry has this mark set. */ static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark) { return xa->xa_flags & XA_FLAGS_MARK(mark); } /** * xa_for_each_range() - Iterate over a portion of an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @start: First index to retrieve from array. * @last: Last index to retrieve from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you * want to skip or reprocess indices. It is safe to modify the array * during the iteration. 
At the end of the iteration, @entry will be set * to NULL and @index will have a value less than or equal to max. * * xa_for_each_range() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). * xa_for_each_range() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each() iterator instead. * The xas_for_each() iterator will expand into more inline code than * xa_for_each_range(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_range(xa, index, entry, start, last) \ for (index = start, \ entry = xa_find(xa, &index, last, XA_PRESENT); \ entry; \ entry = xa_find_after(xa, &index, last, XA_PRESENT)) /** * xa_for_each_start() - Iterate over a portion of an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @start: First index to retrieve from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you * want to skip or reprocess indices. It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set * to NULL and @index will have a value less than or equal to max. * * xa_for_each_start() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). * xa_for_each_start() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each() iterator instead. * The xas_for_each() iterator will expand into more inline code than * xa_for_each_start(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_start(xa, index, entry, start) \ xa_for_each_range(xa, index, entry, start, ULONG_MAX) /** * xa_for_each() - Iterate over present entries in an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you want * to skip or reprocess indices. It is safe to modify the array during the * iteration. At the end of the iteration, @entry will be set to NULL and * @index will have a value less than or equal to max. * * xa_for_each() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). xa_for_each() * will spin if it hits a retry entry; if you intend to see retry entries, * you should use the xas_for_each() iterator instead. The xas_for_each() * iterator will expand into more inline code than xa_for_each(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each(xa, index, entry) \ xa_for_each_start(xa, index, entry, 0) /** * xa_for_each_marked() - Iterate over marked entries in an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @filter: Selection criterion. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. The iteration will skip all entries in the array * which do not match @filter. You may modify @index during the iteration * if you want to skip or reprocess indices. 
It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set to * NULL and @index will have a value less than or equal to max. * * xa_for_each_marked() is O(n.log(n)) while xas_for_each_marked() is O(n). * You have to handle your own locking with xas_for_each(), and if you have * to unlock after each iteration, it will also end up being O(n.log(n)). * xa_for_each_marked() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each_marked() iterator * instead. The xas_for_each_marked() iterator will expand into more inline * code than xa_for_each_marked(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_marked(xa, index, entry, filter) \ for (index = 0, entry = xa_find(xa, &index, ULONG_MAX, filter); \ entry; entry = xa_find_after(xa, &index, ULONG_MAX, filter)) #define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) #define xa_lock(xa) spin_lock(&(xa)->xa_lock) #define xa_unlock(xa) spin_unlock(&(xa)->xa_lock) #define xa_lock_bh(xa) spin_lock_bh(&(xa)->xa_lock) #define xa_unlock_bh(xa) spin_unlock_bh(&(xa)->xa_lock) #define xa_lock_irq(xa) spin_lock_irq(&(xa)->xa_lock) #define xa_unlock_irq(xa) spin_unlock_irq(&(xa)->xa_lock) #define xa_lock_irqsave(xa, flags) \ spin_lock_irqsave(&(xa)->xa_lock, flags) #define xa_unlock_irqrestore(xa, flags) \ spin_unlock_irqrestore(&(xa)->xa_lock, flags) #define xa_lock_nested(xa, subclass) \ spin_lock_nested(&(xa)->xa_lock, subclass) #define xa_lock_bh_nested(xa, subclass) \ spin_lock_bh_nested(&(xa)->xa_lock, subclass) #define xa_lock_irq_nested(xa, subclass) \ spin_lock_irq_nested(&(xa)->xa_lock, subclass) #define xa_lock_irqsave_nested(xa, flags, subclass) \ spin_lock_irqsave_nested(&(xa)->xa_lock, flags, subclass) /* * Versions of the normal API which require the caller to hold the * xa_lock. If the GFP flags allow it, they will drop the lock to * allocate memory, then reacquire it afterwards. These functions * may also re-enable interrupts if the XArray flags indicate the * locking should be interrupt safe. */ void *__xa_erase(struct xarray *, unsigned long index); void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old, void *entry, gfp_t); int __must_check __xa_insert(struct xarray *, unsigned long index, void *entry, gfp_t); int __must_check __xa_alloc(struct xarray *, u32 *id, void *entry, struct xa_limit, gfp_t); int __must_check __xa_alloc_cyclic(struct xarray *, u32 *id, void *entry, struct xa_limit, u32 *next, gfp_t); void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); /** * xa_store_bh() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * This function is like calling xa_store() except it disables softirqs * while holding the array lock. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_bh(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_bh(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock_bh(xa); return curr; } /** * xa_store_irq() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. 
* * This function is like calling xa_store() except it disables interrupts * while holding the array lock. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_irq(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_irq(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock_irq(xa); return curr; } /** * xa_erase_bh() - Erase this entry from the XArray. * @xa: XArray. * @index: Index of entry. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: The entry which used to be at this index. */ static inline void *xa_erase_bh(struct xarray *xa, unsigned long index) { void *entry; xa_lock_bh(xa); entry = __xa_erase(xa, index); xa_unlock_bh(xa); return entry; } /** * xa_erase_irq() - Erase this entry from the XArray. * @xa: XArray. * @index: Index of entry. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: The entry which used to be at this index. */ static inline void *xa_erase_irq(struct xarray *xa, unsigned long index) { void *entry; xa_lock_irq(xa); entry = __xa_erase(xa, index); xa_unlock_irq(xa); return entry; } /** * xa_cmpxchg() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * If the entry at @index is the same as @old, replace it with @entry. * If the return value is equal to @old, then the exchange was successful. * * Context: Any context. Takes and releases the xa_lock. May sleep * if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock(xa); return curr; } /** * xa_cmpxchg_bh() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * This function is like calling xa_cmpxchg() except it disables softirqs * while holding the array lock. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg_bh(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_bh(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock_bh(xa); return curr; } /** * xa_cmpxchg_irq() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. 
* * This function is like calling xa_cmpxchg() except it disables interrupts * while holding the array lock. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg_irq(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_irq(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock_irq(xa); return curr; } /** * xa_insert() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; might_alloc(gfp); xa_lock(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock(xa); return err; } /** * xa_insert_bh() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert_bh(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_bh(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock_bh(xa); return err; } /** * xa_insert_irq() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert_irq(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_irq(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock_irq(xa); return err; } /** * xa_alloc() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. 
* * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline __must_check int xa_alloc(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; might_alloc(gfp); xa_lock(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock(xa); return err; } /** * xa_alloc_bh() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline int __must_check xa_alloc_bh(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_bh(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock_bh(xa); return err; } /** * xa_alloc_irq() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_irq(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock_irq(xa); return err; } /** * xa_alloc_cyclic() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Note that callers interested in whether wrapping has occurred should * use __xa_alloc_cyclic() instead. * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 if the allocation succeeded, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. 
*/ static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; might_alloc(gfp); xa_lock(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock(xa); return err < 0 ? err : 0; } /** * xa_alloc_cyclic_bh() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Note that callers interested in whether wrapping has occurred should * use __xa_alloc_cyclic() instead. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 if the allocation succeeded, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_bh(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_bh(xa); return err < 0 ? err : 0; } /** * xa_alloc_cyclic_irq() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Note that callers interested in whether wrapping has occurred should * use __xa_alloc_cyclic() instead. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 if the allocation succeeded, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_irq(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_irq(xa); return err < 0 ? err : 0; } /** * xa_reserve() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * Ensures there is somewhere to store an entry at @index in the array. * If there is already something stored at @index, this function does * nothing. If there was nothing there, the entry is marked as reserved. * Loading from a reserved entry returns a %NULL pointer. * * If you do not use the entry that you have reserved, call xa_release() * or xa_erase() to free any unnecessary memory. * * Context: Any context. Takes and releases the xa_lock. * May sleep if the @gfp flags permit. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. 
*/ static inline __must_check int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_reserve_bh() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * A softirq-disabling version of xa_reserve(). * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve_bh(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg_bh(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_reserve_irq() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * An interrupt-disabling version of xa_reserve(). * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve_irq(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg_irq(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_release() - Release a reserved entry. * @xa: XArray. * @index: Index of entry. * * After calling xa_reserve(), you can call this function to release the * reservation. If the entry at @index has been stored to, this function * will do nothing. */ static inline void xa_release(struct xarray *xa, unsigned long index) { xa_cmpxchg(xa, index, XA_ZERO_ENTRY, NULL, 0); } /* Everything below here is the Advanced API. Proceed with caution. */ /* * The xarray is constructed out of a set of 'chunks' of pointers. Choosing * the best chunk size requires some tradeoffs. A power of two recommends * itself so that we can walk the tree based purely on shifts and masks. * Generally, the larger the better; as the number of slots per level of the * tree increases, the less tall the tree needs to be. But that needs to be * balanced against the memory consumption of each node. On a 64-bit system, * xa_node is currently 576 bytes, and we get 7 of them per 4kB page. If we * doubled the number of slots per node, we'd get only 3 nodes per 4kB page. */ #ifndef XA_CHUNK_SHIFT #define XA_CHUNK_SHIFT (IS_ENABLED(CONFIG_BASE_SMALL) ? 4 : 6) #endif #define XA_CHUNK_SIZE (1UL << XA_CHUNK_SHIFT) #define XA_CHUNK_MASK (XA_CHUNK_SIZE - 1) #define XA_MAX_MARKS 3 #define XA_MARK_LONGS BITS_TO_LONGS(XA_CHUNK_SIZE) /* * @count is the count of every non-NULL element in the ->slots array * whether that is a value entry, a retry entry, a user pointer, * a sibling entry or a pointer to the next level of the tree. * @nr_values is the count of every element in ->slots which is * either a value entry or a sibling of a value entry. 
*/ struct xa_node { unsigned char shift; /* Bits remaining in each slot */ unsigned char offset; /* Slot offset in parent */ unsigned char count; /* Total entry count */ unsigned char nr_values; /* Value entry count */ struct xa_node __rcu *parent; /* NULL at top of tree */ struct xarray *array; /* The array we belong to */ union { struct list_head private_list; /* For tree user */ struct rcu_head rcu_head; /* Used when freeing node */ }; void __rcu *slots[XA_CHUNK_SIZE]; union { unsigned long tags[XA_MAX_MARKS][XA_MARK_LONGS]; unsigned long marks[XA_MAX_MARKS][XA_MARK_LONGS]; }; }; void xa_dump(const struct xarray *); void xa_dump_node(const struct xa_node *); #ifdef XA_DEBUG #define XA_BUG_ON(xa, x) do { \ if (x) { \ xa_dump(xa); \ BUG(); \ } \ } while (0) #define XA_NODE_BUG_ON(node, x) do { \ if (x) { \ if (node) xa_dump_node(node); \ BUG(); \ } \ } while (0) #else #define XA_BUG_ON(xa, x) do { } while (0) #define XA_NODE_BUG_ON(node, x) do { } while (0) #endif /* Private */ static inline void *xa_head(const struct xarray *xa) { return rcu_dereference_check(xa->xa_head, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_head_locked(const struct xarray *xa) { return rcu_dereference_protected(xa->xa_head, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_entry(const struct xarray *xa, const struct xa_node *node, unsigned int offset) { XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); return rcu_dereference_check(node->slots[offset], lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_entry_locked(const struct xarray *xa, const struct xa_node *node, unsigned int offset) { XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); return rcu_dereference_protected(node->slots[offset], lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline struct xa_node *xa_parent(const struct xarray *xa, const struct xa_node *node) { return rcu_dereference_check(node->parent, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline struct xa_node *xa_parent_locked(const struct xarray *xa, const struct xa_node *node) { return rcu_dereference_protected(node->parent, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_mk_node(const struct xa_node *node) { return (void *)((unsigned long)node | 2); } /* Private */ static inline struct xa_node *xa_to_node(const void *entry) { return (struct xa_node *)((unsigned long)entry - 2); } /* Private */ static inline bool xa_is_node(const void *entry) { return xa_is_internal(entry) && (unsigned long)entry > 4096; } /* Private */ static inline void *xa_mk_sibling(unsigned int offset) { return xa_mk_internal(offset); } /* Private */ static inline unsigned long xa_to_sibling(const void *entry) { return xa_to_internal(entry); } /** * xa_is_sibling() - Is the entry a sibling entry? * @entry: Entry retrieved from the XArray * * Return: %true if the entry is a sibling entry. */ static inline bool xa_is_sibling(const void *entry) { return IS_ENABLED(CONFIG_XARRAY_MULTI) && xa_is_internal(entry) && (entry < xa_mk_sibling(XA_CHUNK_SIZE - 1)); } #define XA_RETRY_ENTRY xa_mk_internal(256) /** * xa_is_retry() - Is the entry a retry entry? * @entry: Entry retrieved from the XArray * * Return: %true if the entry is a retry entry. */ static inline bool xa_is_retry(const void *entry) { return unlikely(entry == XA_RETRY_ENTRY); } /** * xa_is_advanced() - Is the entry only permitted for the advanced API? * @entry: Entry to be stored in the XArray. * * Return: %true if the entry cannot be stored by the normal API. 
*/ static inline bool xa_is_advanced(const void *entry) { return xa_is_internal(entry) && (entry <= XA_RETRY_ENTRY); } /** * typedef xa_update_node_t - A callback function from the XArray. * @node: The node which is being processed * * This function is called every time the XArray updates the count of * present and value entries in a node. It allows advanced users to * maintain the private_list in the node. * * Context: The xa_lock is held and interrupts may be disabled. * Implementations should not drop the xa_lock, nor re-enable * interrupts. */ typedef void (*xa_update_node_t)(struct xa_node *node); void xa_delete_node(struct xa_node *, xa_update_node_t); /* * The xa_state is opaque to its users. It contains various different pieces * of state involved in the current operation on the XArray. It should be * declared on the stack and passed between the various internal routines. * The various elements in it should not be accessed directly, but only * through the provided accessor functions. The below documentation is for * the benefit of those working on the code, not for users of the XArray. * * @xa_node usually points to the xa_node containing the slot we're operating * on (and @xa_offset is the offset in the slots array). If there is a * single entry in the array at index 0, there are no allocated xa_nodes to * point to, and so we store %NULL in @xa_node. @xa_node is set to * the value %XAS_RESTART if the xa_state is not walked to the correct * position in the tree of nodes for this operation. If an error occurs * during an operation, it is set to an %XAS_ERROR value. If we run off the * end of the allocated nodes, it is set to %XAS_BOUNDS. */ struct xa_state { struct xarray *xa; unsigned long xa_index; unsigned char xa_shift; unsigned char xa_sibs; unsigned char xa_offset; unsigned char xa_pad; /* Helps gcc generate better code */ struct xa_node *xa_node; struct xa_node *xa_alloc; xa_update_node_t xa_update; struct list_lru *xa_lru; }; /* * We encode errnos in the xas->xa_node. If an error has happened, we need to * drop the lock to fix it, and once we've done so the xa_state is invalid. */ #define XA_ERROR(errno) ((struct xa_node *)(((unsigned long)errno << 2) | 2UL)) #define XAS_BOUNDS ((struct xa_node *)1UL) #define XAS_RESTART ((struct xa_node *)3UL) #define __XA_STATE(array, index, shift, sibs) { \ .xa = array, \ .xa_index = index, \ .xa_shift = shift, \ .xa_sibs = sibs, \ .xa_offset = 0, \ .xa_pad = 0, \ .xa_node = XAS_RESTART, \ .xa_alloc = NULL, \ .xa_update = NULL, \ .xa_lru = NULL, \ } /** * XA_STATE() - Declare an XArray operation state. * @name: Name of this operation state (usually xas). * @array: Array to operate on. * @index: Initial index of interest. * * Declare and initialise an xa_state on the stack. */ #define XA_STATE(name, array, index) \ struct xa_state name = __XA_STATE(array, index, 0, 0) /** * XA_STATE_ORDER() - Declare an XArray operation state. * @name: Name of this operation state (usually xas). * @array: Array to operate on. * @index: Initial index of interest. * @order: Order of entry. * * Declare and initialise an xa_state on the stack. 
This variant of * XA_STATE() allows you to specify the 'order' of the element you * want to operate on.` */ #define XA_STATE_ORDER(name, array, index, order) \ struct xa_state name = __XA_STATE(array, \ (index >> order) << order, \ order - (order % XA_CHUNK_SHIFT), \ (1U << (order % XA_CHUNK_SHIFT)) - 1) #define xas_marked(xas, mark) xa_marked((xas)->xa, (mark)) #define xas_trylock(xas) xa_trylock((xas)->xa) #define xas_lock(xas) xa_lock((xas)->xa) #define xas_unlock(xas) xa_unlock((xas)->xa) #define xas_lock_bh(xas) xa_lock_bh((xas)->xa) #define xas_unlock_bh(xas) xa_unlock_bh((xas)->xa) #define xas_lock_irq(xas) xa_lock_irq((xas)->xa) #define xas_unlock_irq(xas) xa_unlock_irq((xas)->xa) #define xas_lock_irqsave(xas, flags) \ xa_lock_irqsave((xas)->xa, flags) #define xas_unlock_irqrestore(xas, flags) \ xa_unlock_irqrestore((xas)->xa, flags) /** * xas_error() - Return an errno stored in the xa_state. * @xas: XArray operation state. * * Return: 0 if no error has been noted. A negative errno if one has. */ static inline int xas_error(const struct xa_state *xas) { return xa_err(xas->xa_node); } /** * xas_set_err() - Note an error in the xa_state. * @xas: XArray operation state. * @err: Negative error number. * * Only call this function with a negative @err; zero or positive errors * will probably not behave the way you think they should. If you want * to clear the error from an xa_state, use xas_reset(). */ static inline void xas_set_err(struct xa_state *xas, long err) { xas->xa_node = XA_ERROR(err); } /** * xas_invalid() - Is the xas in a retry or error state? * @xas: XArray operation state. * * Return: %true if the xas cannot be used for operations. */ static inline bool xas_invalid(const struct xa_state *xas) { return (unsigned long)xas->xa_node & 3; } /** * xas_valid() - Is the xas a valid cursor into the array? * @xas: XArray operation state. * * Return: %true if the xas can be used for operations. */ static inline bool xas_valid(const struct xa_state *xas) { return !xas_invalid(xas); } /** * xas_is_node() - Does the xas point to a node? * @xas: XArray operation state. * * Return: %true if the xas currently references a node. */ static inline bool xas_is_node(const struct xa_state *xas) { return xas_valid(xas) && xas->xa_node; } /* True if the pointer is something other than a node */ static inline bool xas_not_node(struct xa_node *node) { return ((unsigned long)node & 3) || !node; } /* True if the node represents RESTART or an error */ static inline bool xas_frozen(struct xa_node *node) { return (unsigned long)node & 2; } /* True if the node represents head-of-tree, RESTART or BOUNDS */ static inline bool xas_top(struct xa_node *node) { return node <= XAS_RESTART; } /** * xas_reset() - Reset an XArray operation state. * @xas: XArray operation state. * * Resets the error or walk state of the @xas so future walks of the * array will start from the root. Use this if you have dropped the * xarray lock and want to reuse the xa_state. * * Context: Any context. */ static inline void xas_reset(struct xa_state *xas) { xas->xa_node = XAS_RESTART; } /** * xas_retry() - Retry the operation if appropriate. * @xas: XArray operation state. * @entry: Entry from xarray. * * The advanced functions may sometimes return an internal entry, such as * a retry entry or a zero entry. This function sets up the @xas to restart * the walk from the head of the array if needed. * * Context: Any context. * Return: true if the operation needs to be retried. 
*/ static inline bool xas_retry(struct xa_state *xas, const void *entry) { if (xa_is_zero(entry)) return true; if (!xa_is_retry(entry)) return false; xas_reset(xas); return true; } void *xas_load(struct xa_state *); void *xas_store(struct xa_state *, void *entry); void *xas_find(struct xa_state *, unsigned long max); void *xas_find_conflict(struct xa_state *); bool xas_get_mark(const struct xa_state *, xa_mark_t); void xas_set_mark(const struct xa_state *, xa_mark_t); void xas_clear_mark(const struct xa_state *, xa_mark_t); void *xas_find_marked(struct xa_state *, unsigned long max, xa_mark_t); void xas_init_marks(const struct xa_state *); bool xas_nomem(struct xa_state *, gfp_t); void xas_destroy(struct xa_state *); void xas_pause(struct xa_state *); void xas_create_range(struct xa_state *); #ifdef CONFIG_XARRAY_MULTI int xa_get_order(struct xarray *, unsigned long index); int xas_get_order(struct xa_state *xas); void xas_split(struct xa_state *, void *entry, unsigned int order); void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t); void xas_try_split(struct xa_state *xas, void *entry, unsigned int order); unsigned int xas_try_split_min_order(unsigned int order); #else static inline int xa_get_order(struct xarray *xa, unsigned long index) { return 0; } static inline int xas_get_order(struct xa_state *xas) { return 0; } static inline void xas_split(struct xa_state *xas, void *entry, unsigned int order) { xas_store(xas, entry); } static inline void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order, gfp_t gfp) { } static inline void xas_try_split(struct xa_state *xas, void *entry, unsigned int order) { } static inline unsigned int xas_try_split_min_order(unsigned int order) { return 0; } #endif /** * xas_reload() - Refetch an entry from the xarray. * @xas: XArray operation state. * * Use this function to check that a previously loaded entry still has * the same value. This is useful for the lockless pagecache lookup where * we walk the array with only the RCU lock to protect us, lock the page, * then check that the page hasn't moved since we looked it up. * * The caller guarantees that @xas is still valid. If it may be in an * error or restart state, call xas_load() instead. * * Return: The entry at this location in the xarray. */ static inline void *xas_reload(struct xa_state *xas) { struct xa_node *node = xas->xa_node; void *entry; char offset; if (!node) return xa_head(xas->xa); if (IS_ENABLED(CONFIG_XARRAY_MULTI)) { offset = (xas->xa_index >> node->shift) & XA_CHUNK_MASK; entry = xa_entry(xas->xa, node, offset); if (!xa_is_sibling(entry)) return entry; offset = xa_to_sibling(entry); } else { offset = xas->xa_offset; } return xa_entry(xas->xa, node, offset); } /** * xas_set() - Set up XArray operation state for a different index. * @xas: XArray operation state. * @index: New index into the XArray. * * Move the operation state to refer to a different index. This will * have the effect of starting a walk from the top; see xas_next() * to move to an adjacent index. */ static inline void xas_set(struct xa_state *xas, unsigned long index) { xas->xa_index = index; xas->xa_node = XAS_RESTART; } /** * xas_advance() - Skip over sibling entries. * @xas: XArray operation state. * @index: Index of last sibling entry. * * Move the operation state to refer to the last sibling entry. * This is useful for loops that normally want to see sibling * entries but sometimes want to skip them. 
Use xas_set() if you * want to move to an index which is not part of this entry. */ static inline void xas_advance(struct xa_state *xas, unsigned long index) { unsigned char shift = xas_is_node(xas) ? xas->xa_node->shift : 0; xas->xa_index = index; xas->xa_offset = (index >> shift) & XA_CHUNK_MASK; } /** * xas_set_order() - Set up XArray operation state for a multislot entry. * @xas: XArray operation state. * @index: Target of the operation. * @order: Entry occupies 2^@order indices. */ static inline void xas_set_order(struct xa_state *xas, unsigned long index, unsigned int order) { #ifdef CONFIG_XARRAY_MULTI xas->xa_index = order < BITS_PER_LONG ? (index >> order) << order : 0; xas->xa_shift = order - (order % XA_CHUNK_SHIFT); xas->xa_sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; xas->xa_node = XAS_RESTART; #else BUG_ON(order > 0); xas_set(xas, index); #endif } /** * xas_set_update() - Set up XArray operation state for a callback. * @xas: XArray operation state. * @update: Function to call when updating a node. * * The XArray can notify a caller after it has updated an xa_node. * This is advanced functionality and is only needed by the page * cache and swap cache. */ static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update) { xas->xa_update = update; } static inline void xas_set_lru(struct xa_state *xas, struct list_lru *lru) { xas->xa_lru = lru; } /** * xas_next_entry() - Advance iterator to next present entry. * @xas: XArray operation state. * @max: Highest index to return. * * xas_next_entry() is an inline function to optimise xarray traversal for * speed. It is equivalent to calling xas_find(), and will call xas_find() * for all the hard cases. * * Return: The next present entry after the one currently referred to by @xas. */ static inline void *xas_next_entry(struct xa_state *xas, unsigned long max) { struct xa_node *node = xas->xa_node; void *entry; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK))) return xas_find(xas, max); do { if (unlikely(xas->xa_index >= max)) return xas_find(xas, max); if (unlikely(xas->xa_offset == XA_CHUNK_MASK)) return xas_find(xas, max); entry = xa_entry(xas->xa, node, xas->xa_offset + 1); if (unlikely(xa_is_internal(entry))) return xas_find(xas, max); xas->xa_offset++; xas->xa_index++; } while (!entry); return entry; } /* Private */ static inline unsigned int xas_find_chunk(struct xa_state *xas, bool advance, xa_mark_t mark) { unsigned long *addr = xas->xa_node->marks[(__force unsigned)mark]; unsigned int offset = xas->xa_offset; if (advance) offset++; if (XA_CHUNK_SIZE == BITS_PER_LONG) { if (offset < XA_CHUNK_SIZE) { unsigned long data = *addr & (~0UL << offset); if (data) return __ffs(data); } return XA_CHUNK_SIZE; } return find_next_bit(addr, XA_CHUNK_SIZE, offset); } /** * xas_next_marked() - Advance iterator to next marked entry. * @xas: XArray operation state. * @max: Highest index to return. * @mark: Mark to search for. * * xas_next_marked() is an inline function to optimise xarray traversal for * speed. It is equivalent to calling xas_find_marked(), and will call * xas_find_marked() for all the hard cases. * * Return: The next marked entry after the one currently referred to by @xas. 
*/ static inline void *xas_next_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark) { struct xa_node *node = xas->xa_node; void *entry; unsigned int offset; if (unlikely(xas_not_node(node) || node->shift)) return xas_find_marked(xas, max, mark); offset = xas_find_chunk(xas, true, mark); xas->xa_offset = offset; xas->xa_index = (xas->xa_index & ~XA_CHUNK_MASK) + offset; if (xas->xa_index > max) return NULL; if (offset == XA_CHUNK_SIZE) return xas_find_marked(xas, max, mark); entry = xa_entry(xas->xa, node, offset); if (!entry) return xas_find_marked(xas, max, mark); return entry; } /* * If iterating while holding a lock, drop the lock and reschedule * every %XA_CHECK_SCHED loops. */ enum { XA_CHECK_SCHED = 4096, }; /** * xas_for_each() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * @max: Maximum index to retrieve from array. * * The loop body will be executed for each entry present in the xarray * between the current xas position and @max. @entry will be set to * the entry retrieved from the xarray. It is safe to delete entries * from the array in the loop body. You should hold either the RCU lock * or the xa_lock while iterating. If you need to drop the lock, call * xas_pause() first. */ #define xas_for_each(xas, entry, max) \ for (entry = xas_find(xas, max); entry; \ entry = xas_next_entry(xas, max)) /** * xas_for_each_marked() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * @max: Maximum index to retrieve from array. * @mark: Mark to search for. * * The loop body will be executed for each marked entry in the xarray * between the current xas position and @max. @entry will be set to * the entry retrieved from the xarray. It is safe to delete entries * from the array in the loop body. You should hold either the RCU lock * or the xa_lock while iterating. If you need to drop the lock, call * xas_pause() first. */ #define xas_for_each_marked(xas, entry, max, mark) \ for (entry = xas_find_marked(xas, max, mark); entry; \ entry = xas_next_marked(xas, max, mark)) /** * xas_for_each_conflict() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * * The loop body will be executed for each entry in the XArray that * lies within the range specified by @xas. If the loop terminates * normally, @entry will be %NULL. The user may break out of the loop, * which will leave @entry set to the conflicting entry. The caller * may also call xa_set_err() to exit the loop while setting an error * to record the reason. */ #define xas_for_each_conflict(xas, entry) \ while ((entry = xas_find_conflict(xas))) void *__xas_next(struct xa_state *); void *__xas_prev(struct xa_state *); /** * xas_prev() - Move iterator to previous index. * @xas: XArray operation state. * * If the @xas was in an error state, it will remain in an error state * and this function will return %NULL. If the @xas has never been walked, * it will have the effect of calling xas_load(). Otherwise one will be * subtracted from the index and the state will be walked to the correct * location in the array for the next operation. * * If the iterator was referencing index 0, this function wraps * around to %ULONG_MAX. * * Return: The entry at the new index. This may be %NULL or an internal * entry. 
*/ static inline void *xas_prev(struct xa_state *xas) { struct xa_node *node = xas->xa_node; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset == 0)) return __xas_prev(xas); xas->xa_index--; xas->xa_offset--; return xa_entry(xas->xa, node, xas->xa_offset); } /** * xas_next() - Move state to next index. * @xas: XArray operation state. * * If the @xas was in an error state, it will remain in an error state * and this function will return %NULL. If the @xas has never been walked, * it will have the effect of calling xas_load(). Otherwise one will be * added to the index and the state will be walked to the correct * location in the array for the next operation. * * If the iterator was referencing index %ULONG_MAX, this function wraps * around to 0. * * Return: The entry at the new index. This may be %NULL or an internal * entry. */ static inline void *xas_next(struct xa_state *xas) { struct xa_node *node = xas->xa_node; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset == XA_CHUNK_MASK)) return __xas_next(xas); xas->xa_index++; xas->xa_offset++; return xa_entry(xas->xa, node, xas->xa_offset); } #endif /* _LINUX_XARRAY_H */ |
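/*
 * Editor's note: the block below is not part of xarray.h.  It is a minimal
 * usage sketch of the normal (self-locking) XArray API declared above.  The
 * names struct foo, foo_array and the foo_*() helpers are hypothetical and
 * only illustrate the calling conventions: xa_store() returns the previous
 * entry or an xa_err()-encoded error, xa_load() may be called from any
 * context, and xa_for_each() takes and releases the RCU lock internally and
 * tolerates erasing entries during the walk.
 */
#if 0	/* illustration only, not compiled as part of this header */
#include <linux/xarray.h>
#include <linux/slab.h>

struct foo {				/* hypothetical payload */
	unsigned long id;
	int value;
};

static DEFINE_XARRAY(foo_array);	/* compile-time initialisation */

static int foo_add(struct foo *f)
{
	/* xa_store() returns the old entry at this index or an error entry. */
	void *old = xa_store(&foo_array, f->id, f, GFP_KERNEL);

	return xa_err(old);		/* 0 unless the store itself failed */
}

static struct foo *foo_lookup(unsigned long id)
{
	/* Safe in any context; takes and releases the RCU read lock. */
	return xa_load(&foo_array, id);
}

static void foo_remove_all(void)
{
	struct foo *f;
	unsigned long index;

	/* Modifying the array during iteration is explicitly permitted. */
	xa_for_each(&foo_array, index, f) {
		xa_erase(&foo_array, index);
		kfree(f);
	}
}
#endif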
/* SPDX-License-Identifier: GPL-2.0-only */
/* A pointer that can point to either kernel or userspace memory. */
#ifndef _LINUX_BPFPTR_H
#define _LINUX_BPFPTR_H

#include <linux/mm.h>
#include <linux/sockptr.h>

typedef sockptr_t bpfptr_t;

static inline bool bpfptr_is_kernel(bpfptr_t bpfptr)
{
	return bpfptr.is_kernel;
}

static inline bpfptr_t KERNEL_BPFPTR(void *p)
{
	return (bpfptr_t) { .kernel = p, .is_kernel = true };
}

static inline bpfptr_t USER_BPFPTR(void __user *p)
{
	return (bpfptr_t) { .user = p };
}

static inline bpfptr_t make_bpfptr(u64 addr, bool is_kernel)
{
	if (is_kernel)
		return KERNEL_BPFPTR((void *)(uintptr_t)addr);
	else
		return USER_BPFPTR(u64_to_user_ptr(addr));
}

static inline bool bpfptr_is_null(bpfptr_t bpfptr)
{
	if (bpfptr_is_kernel(bpfptr))
		return !bpfptr.kernel;
	return !bpfptr.user;
}

static inline void bpfptr_add(bpfptr_t *bpfptr, size_t val)
{
	if (bpfptr_is_kernel(*bpfptr))
		bpfptr->kernel += val;
	else
		bpfptr->user += val;
}

static inline int copy_from_bpfptr_offset(void *dst, bpfptr_t src,
					  size_t offset, size_t size)
{
	if (!bpfptr_is_kernel(src))
		return copy_from_user(dst, src.user + offset, size);
	return copy_from_kernel_nofault(dst, src.kernel + offset, size);
}

static inline int copy_from_bpfptr(void *dst, bpfptr_t src, size_t size)
{
	return copy_from_bpfptr_offset(dst, src, 0, size);
}

static inline int copy_to_bpfptr_offset(bpfptr_t dst, size_t offset,
					const void *src, size_t size)
{
	return copy_to_sockptr_offset((sockptr_t) dst, offset, src, size);
}

static inline void *kvmemdup_bpfptr_noprof(bpfptr_t src, size_t len)
{
	void *p = kvmalloc_node_align_noprof(len, 1, GFP_USER | __GFP_NOWARN,
					     NUMA_NO_NODE);

	if (!p)
		return ERR_PTR(-ENOMEM);
	if (copy_from_bpfptr(p, src, len)) {
		kvfree(p);
		return ERR_PTR(-EFAULT);
	}
	return p;
}

#define kvmemdup_bpfptr(...)	alloc_hooks(kvmemdup_bpfptr_noprof(__VA_ARGS__))

static inline long strncpy_from_bpfptr(char *dst, bpfptr_t src, size_t count)
{
	if (bpfptr_is_kernel(src))
		return strncpy_from_kernel_nofault(dst, src.kernel, count);
	return strncpy_from_user(dst, src.user, count);
}

#endif /* _LINUX_BPFPTR_H */
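/*
 * Editor's note: illustrative sketch only, not part of bpfptr.h.  It shows
 * the intent of bpfptr_t: code on a syscall-style path can accept either a
 * user pointer (from the bpf() syscall) or a kernel pointer (from an
 * in-kernel caller) and copy from it through one helper.  The struct and
 * function names below are hypothetical.
 */
#if 0	/* illustration only */
#include <linux/bpfptr.h>

struct demo_attr {		/* hypothetical attribute block */
	u32 flags;
	u64 data;
};

static int demo_copy_attr(struct demo_attr *dst, bpfptr_t uattr)
{
	/*
	 * copy_from_bpfptr() dispatches to copy_from_user() or
	 * copy_from_kernel_nofault() depending on how uattr was built.
	 */
	if (copy_from_bpfptr(dst, uattr, sizeof(*dst)))
		return -EFAULT;
	return 0;
}

static int demo_call(u64 addr, bool from_kernel)
{
	struct demo_attr attr;
	/* make_bpfptr() tags the raw address with its origin. */
	bpfptr_t uattr = make_bpfptr(addr, from_kernel);

	return demo_copy_attr(&attr, uattr);
}
#endif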
// SPDX-License-Identifier: GPL-2.0
/*
 * All the USB notify logic
 *
 * (C) Copyright 2005 Greg Kroah-Hartman <gregkh@suse.de>
 *
 * notifier functions originally based on those in kernel/sys.c
 * but fixed up to not be so broken.
 *
 * Released under the GPLv2 only.
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/notifier.h>
#include <linux/usb.h>
#include <linux/mutex.h>
#include "usb.h"

static BLOCKING_NOTIFIER_HEAD(usb_notifier_list);

/**
 * usb_register_notify - register a notifier callback whenever a usb change happens
 * @nb: pointer to the notifier block for the callback events.
 *
 * These changes are either USB devices or busses being added or removed.
 */
void usb_register_notify(struct notifier_block *nb)
{
	blocking_notifier_chain_register(&usb_notifier_list, nb);
}
EXPORT_SYMBOL_GPL(usb_register_notify);

/**
 * usb_unregister_notify - unregister a notifier callback
 * @nb: pointer to the notifier block for the callback events.
 *
 * usb_register_notify() must have been previously called for this function
 * to work properly.
 */
void usb_unregister_notify(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&usb_notifier_list, nb);
}
EXPORT_SYMBOL_GPL(usb_unregister_notify);

void usb_notify_add_device(struct usb_device *udev)
{
	blocking_notifier_call_chain(&usb_notifier_list, USB_DEVICE_ADD, udev);
}

void usb_notify_remove_device(struct usb_device *udev)
{
	blocking_notifier_call_chain(&usb_notifier_list,
			USB_DEVICE_REMOVE, udev);
}

void usb_notify_add_bus(struct usb_bus *ubus)
{
	blocking_notifier_call_chain(&usb_notifier_list, USB_BUS_ADD, ubus);
}

void usb_notify_remove_bus(struct usb_bus *ubus)
{
	blocking_notifier_call_chain(&usb_notifier_list, USB_BUS_REMOVE, ubus);
}
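/*
 * Illustrative sketch, not part of notify.c: a typical consumer of the
 * notifier chain maintained above. The callback and notifier_block names
 * are hypothetical; registration and unregistration are shown in the
 * trailing comment.
 */
static int example_usb_notify(struct notifier_block *nb, unsigned long action,
			      void *data)
{
	switch (action) {
	case USB_DEVICE_ADD:
	case USB_DEVICE_REMOVE:
		/* @data is the struct usb_device being added or removed */
		break;
	case USB_BUS_ADD:
	case USB_BUS_REMOVE:
		/* @data is the struct usb_bus being added or removed */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block example_usb_nb = {
	.notifier_call = example_usb_notify,
};

/* usb_register_notify(&example_usb_nb); ... usb_unregister_notify(&example_usb_nb); */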
/* SPDX-License-Identifier: GPL-2.0 */
/* Freezer declarations */

#ifndef FREEZER_H_INCLUDED
#define FREEZER_H_INCLUDED

#include <linux/debug_locks.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/atomic.h>
#include <linux/jump_label.h>

#ifdef CONFIG_FREEZER
DECLARE_STATIC_KEY_FALSE(freezer_active);

extern bool pm_freezing;		/* PM freezing in effect */
extern bool pm_nosig_freezing;		/* PM nosig freezing in effect */

/*
 * Timeout for stopping processes
 */
extern unsigned int freeze_timeout_msecs;

/*
 * Check if a process has been frozen for PM or cgroup1 freezer. Note that
 * cgroup2 freezer uses the job control mechanism and does not interact with
 * the PM freezer.
 */
extern bool frozen(struct task_struct *p);

extern bool freezing_slow_path(struct task_struct *p);

/*
 * Check if there is a request to freeze a task from PM or cgroup1 freezer.
 * Note that cgroup2 freezer uses the job control mechanism and does not
 * interact with the PM freezer.
 */
static inline bool freezing(struct task_struct *p)
{
	if (static_branch_unlikely(&freezer_active))
		return freezing_slow_path(p);

	return false;
}

/* Takes and releases task alloc lock using task_lock() */
extern void __thaw_task(struct task_struct *t);

extern bool __refrigerator(bool check_kthr_stop);
extern int freeze_processes(void);
extern int freeze_kernel_threads(void);
extern void thaw_processes(void);
extern void thaw_kernel_threads(void);
extern void thaw_process(struct task_struct *p);

static inline bool try_to_freeze(void)
{
	might_sleep();
	if (likely(!freezing(current)))
		return false;
	if (!(current->flags & PF_NOFREEZE))
		debug_check_no_locks_held();
	return __refrigerator(false);
}

extern bool freeze_task(struct task_struct *p);
extern bool set_freezable(void);

#ifdef CONFIG_CGROUP_FREEZER
extern bool cgroup1_freezing(struct task_struct *task);
#else /* !CONFIG_CGROUP_FREEZER */
static inline bool cgroup1_freezing(struct task_struct *task) { return false; }
#endif /* !CONFIG_CGROUP_FREEZER */

#else /* !CONFIG_FREEZER */
static inline bool frozen(struct task_struct *p) { return false; }
static inline bool freezing(struct task_struct *p) { return false; }
static inline void __thaw_task(struct task_struct *t) {}

static inline bool __refrigerator(bool check_kthr_stop) { return false; }
static inline int freeze_processes(void) { return -ENOSYS; }
static inline int freeze_kernel_threads(void) { return -ENOSYS; }
static inline void thaw_processes(void) {}
static inline void thaw_kernel_threads(void) {}
static inline void thaw_process(struct task_struct *p) {}

static inline bool try_to_freeze(void) { return false; }

/* Stub made to match the bool return type declared under CONFIG_FREEZER */
static inline bool set_freezable(void) { return false; }

#endif /* !CONFIG_FREEZER */

#endif	/* FREEZER_H_INCLUDED */
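/*
 * Illustrative sketch, not part of freezer.h: the usual shape of a freezable
 * kernel thread built on the declarations above. set_freezable() opts the
 * kthread into freezing and try_to_freeze() parks it while the system
 * suspends. example_do_work() is a hypothetical work function, and the
 * kthread_should_stop() helper comes from <linux/kthread.h>.
 */
static int example_freezable_thread(void *data)
{
	set_freezable();

	while (!kthread_should_stop()) {
		try_to_freeze();
		example_do_work(data);		/* hypothetical */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}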
| 4 3 4 4 4 1 1 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 | // SPDX-License-Identifier: GPL-2.0 /* * fs/sysfs/symlink.c - sysfs symlink implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * * Please see Documentation/filesystems/sysfs.rst for more information. */ #include <linux/fs.h> #include <linux/module.h> #include <linux/kobject.h> #include <linux/mutex.h> #include <linux/security.h> #include "sysfs.h" static int sysfs_do_create_link_sd(struct kernfs_node *parent, struct kobject *target_kobj, const char *name, int warn) { struct kernfs_node *kn, *target = NULL; if (WARN_ON(!name || !parent)) return -EINVAL; /* * We don't own @target_kobj and it may be removed at any time. * Synchronize using sysfs_symlink_target_lock. See * sysfs_remove_dir() for details. */ spin_lock(&sysfs_symlink_target_lock); if (target_kobj->sd) { target = target_kobj->sd; kernfs_get(target); } spin_unlock(&sysfs_symlink_target_lock); if (!target) return -ENOENT; kn = kernfs_create_link(parent, name, target); kernfs_put(target); if (!IS_ERR(kn)) return 0; if (warn && PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, name); return PTR_ERR(kn); } /** * sysfs_create_link_sd - create symlink to a given object. * @kn: directory we're creating the link in. * @target: object we're pointing to. * @name: name of the symlink. */ int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target, const char *name) { return sysfs_do_create_link_sd(kn, target, name, 1); } static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, const char *name, int warn) { struct kernfs_node *parent = NULL; if (!kobj) parent = sysfs_root_kn; else parent = kobj->sd; if (!parent) return -EFAULT; return sysfs_do_create_link_sd(parent, target, name, warn); } /** * sysfs_create_link - create symlink between two objects. * @kobj: object whose directory we're creating the link in. * @target: object we're pointing to. * @name: name of the symlink. */ int sysfs_create_link(struct kobject *kobj, struct kobject *target, const char *name) { return sysfs_do_create_link(kobj, target, name, 1); } EXPORT_SYMBOL_GPL(sysfs_create_link); /** * sysfs_create_link_nowarn - create symlink between two objects. * @kobj: object whose directory we're creating the link in. * @target: object we're pointing to. * @name: name of the symlink. * * This function does the same as sysfs_create_link(), but it * doesn't warn if the link already exists. */ int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, const char *name) { return sysfs_do_create_link(kobj, target, name, 0); } EXPORT_SYMBOL_GPL(sysfs_create_link_nowarn); /** * sysfs_delete_link - remove symlink in object's directory. * @kobj: object we're acting for. * @targ: object we're pointing to. * @name: name of the symlink to remove. 
* * Unlike sysfs_remove_link sysfs_delete_link has enough information * to successfully delete symlinks in tagged directories. */ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, const char *name) { const struct ns_common *ns = NULL; /* * We don't own @target and it may be removed at any time. * Synchronize using sysfs_symlink_target_lock. See * sysfs_remove_dir() for details. */ spin_lock(&sysfs_symlink_target_lock); if (targ->sd && kernfs_ns_enabled(kobj->sd)) ns = targ->sd->ns; spin_unlock(&sysfs_symlink_target_lock); kernfs_remove_by_name_ns(kobj->sd, name, ns); } /** * sysfs_remove_link - remove symlink in object's directory. * @kobj: object we're acting for. * @name: name of the symlink to remove. */ void sysfs_remove_link(struct kobject *kobj, const char *name) { struct kernfs_node *parent = NULL; if (!kobj) parent = sysfs_root_kn; else parent = kobj->sd; kernfs_remove_by_name(parent, name); } EXPORT_SYMBOL_GPL(sysfs_remove_link); /** * sysfs_rename_link_ns - rename symlink in object's directory. * @kobj: object we're acting for. * @targ: object we're pointing to. * @old: previous name of the symlink. * @new: new name of the symlink. * @new_ns: new namespace of the symlink. * * A helper function for the common rename symlink idiom. */ int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ, const char *old, const char *new, const struct ns_common *new_ns) { struct kernfs_node *parent, *kn = NULL; const struct ns_common *old_ns = NULL; int result; if (!kobj) parent = sysfs_root_kn; else parent = kobj->sd; if (targ->sd) old_ns = targ->sd->ns; result = -ENOENT; kn = kernfs_find_and_get_ns(parent, old, old_ns); if (!kn) goto out; result = -EINVAL; if (kernfs_type(kn) != KERNFS_LINK) goto out; if (kn->symlink.target_kn->priv != targ) goto out; result = kernfs_rename_ns(kn, parent, new, new_ns); out: kernfs_put(kn); return result; } EXPORT_SYMBOL_GPL(sysfs_rename_link_ns); |
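/*
 * Illustrative sketch, not part of symlink.c: the common driver-side use of
 * the helpers above - publish a "peer" symlink in one kobject's sysfs
 * directory pointing at another object, and drop it again on teardown.
 * Names are hypothetical.
 */
static int example_link_peer(struct kobject *kobj, struct kobject *peer)
{
	return sysfs_create_link(kobj, peer, "peer");
}

static void example_unlink_peer(struct kobject *kobj)
{
	sysfs_remove_link(kobj, "peer");
}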
| 8 11 3 1 1 3 2 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 | // SPDX-License-Identifier: GPL-2.0 /* * Kernel internal schedule timeout and sleeping functions */ #include <linux/delay.h> #include <linux/jiffies.h> #include <linux/timer.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> #include "tick-internal.h" /* * Since schedule_timeout()'s timer is defined on the stack, it must store * the target task on the stack as well. */ struct process_timer { struct timer_list timer; struct task_struct *task; }; static void process_timeout(struct timer_list *t) { struct process_timer *timeout = timer_container_of(timeout, t, timer); wake_up_process(timeout->task); } /** * schedule_timeout - sleep until timeout * @timeout: timeout value in jiffies * * Make the current task sleep until @timeout jiffies have elapsed. * The function behavior depends on the current task state * (see also set_current_state() description): * * %TASK_RUNNING - the scheduler is called, but the task does not sleep * at all. That happens because sched_submit_work() does nothing for * tasks in %TASK_RUNNING state. * * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to * pass before the routine returns unless the current task is explicitly * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is * delivered to the current task or the current task is explicitly woken * up. * * The current task state is guaranteed to be %TASK_RUNNING when this * routine returns. * * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule * the CPU away without a bound on the timeout. In this case the return * value will be %MAX_SCHEDULE_TIMEOUT. * * Returns: 0 when the timer has expired otherwise the remaining time in * jiffies will be returned. In all cases the return value is guaranteed * to be non-negative. */ signed long __sched schedule_timeout(signed long timeout) { struct process_timer timer; unsigned long expire; switch (timeout) { case MAX_SCHEDULE_TIMEOUT: /* * These two special cases are useful to be comfortable * in the caller. Nothing more. 
We could take * MAX_SCHEDULE_TIMEOUT from one of the negative value * but I' d like to return a valid offset (>=0) to allow * the caller to do everything it want with the retval. */ schedule(); goto out; default: /* * Another bit of PARANOID. Note that the retval will be * 0 since no piece of kernel is supposed to do a check * for a negative retval of schedule_timeout() (since it * should never happens anyway). You just have the printk() * that will tell you if something is gone wrong and where. */ if (timeout < 0) { pr_err("%s: wrong timeout value %lx\n", __func__, timeout); dump_stack(); __set_current_state(TASK_RUNNING); goto out; } } expire = timeout + jiffies; timer.task = current; timer_setup_on_stack(&timer.timer, process_timeout, 0); timer.timer.expires = expire; add_timer(&timer.timer); schedule(); timer_delete_sync(&timer.timer); /* Remove the timer from the object tracker */ timer_destroy_on_stack(&timer.timer); timeout = expire - jiffies; out: return timeout < 0 ? 0 : timeout; } EXPORT_SYMBOL(schedule_timeout); /* * __set_current_state() can be used in schedule_timeout_*() functions, because * schedule_timeout() calls schedule() unconditionally. */ /** * schedule_timeout_interruptible - sleep until timeout (interruptible) * @timeout: timeout value in jiffies * * See schedule_timeout() for details. * * Task state is set to TASK_INTERRUPTIBLE before starting the timeout. */ signed long __sched schedule_timeout_interruptible(signed long timeout) { __set_current_state(TASK_INTERRUPTIBLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_interruptible); /** * schedule_timeout_killable - sleep until timeout (killable) * @timeout: timeout value in jiffies * * See schedule_timeout() for details. * * Task state is set to TASK_KILLABLE before starting the timeout. */ signed long __sched schedule_timeout_killable(signed long timeout) { __set_current_state(TASK_KILLABLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_killable); /** * schedule_timeout_uninterruptible - sleep until timeout (uninterruptible) * @timeout: timeout value in jiffies * * See schedule_timeout() for details. * * Task state is set to TASK_UNINTERRUPTIBLE before starting the timeout. */ signed long __sched schedule_timeout_uninterruptible(signed long timeout) { __set_current_state(TASK_UNINTERRUPTIBLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_uninterruptible); /** * schedule_timeout_idle - sleep until timeout (idle) * @timeout: timeout value in jiffies * * See schedule_timeout() for details. * * Task state is set to TASK_IDLE before starting the timeout. It is similar to * schedule_timeout_uninterruptible(), except this task will not contribute to * load average. */ signed long __sched schedule_timeout_idle(signed long timeout) { __set_current_state(TASK_IDLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_idle); /** * schedule_hrtimeout_range_clock - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) * @mode: timer mode * @clock_id: timer clock to be used * * Details are explained in schedule_hrtimeout_range() function description as * this function is commonly used. */ int __sched schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, const enum hrtimer_mode mode, clockid_t clock_id) { struct hrtimer_sleeper t; /* * Optimize when a zero timeout value is given. It does not * matter whether this is an absolute or a relative time. 
*/ if (expires && *expires == 0) { __set_current_state(TASK_RUNNING); return 0; } /* * A NULL parameter means "infinite" */ if (!expires) { schedule(); return -EINTR; } hrtimer_setup_sleeper_on_stack(&t, clock_id, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); hrtimer_sleeper_start_expires(&t, mode); if (likely(t.task)) schedule(); hrtimer_cancel(&t.timer); destroy_hrtimer_on_stack(&t.timer); __set_current_state(TASK_RUNNING); return !t.task ? 0 : -EINTR; } EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock); /** * schedule_hrtimeout_range - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) * @mode: timer mode * * Make the current task sleep until the given expiry time has * elapsed. The routine will return immediately unless * the current task state has been set (see set_current_state()). * * The @delta argument gives the kernel the freedom to schedule the * actual wakeup to a time that is both power and performance friendly * for regular (non RT/DL) tasks. * The kernel give the normal best effort behavior for "@expires+@delta", * but may decide to fire the timer earlier, but no earlier than @expires. * * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to * pass before the routine returns unless the current task is explicitly * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is * delivered to the current task or the current task is explicitly woken * up. * * The current task state is guaranteed to be TASK_RUNNING when this * routine returns. * * Returns: 0 when the timer has expired. If the task was woken before the * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or * by an explicit wakeup, it returns -EINTR. */ int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode) { return schedule_hrtimeout_range_clock(expires, delta, mode, CLOCK_MONOTONIC); } EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); /** * schedule_hrtimeout - sleep until timeout * @expires: timeout value (ktime_t) * @mode: timer mode * * See schedule_hrtimeout_range() for details. @delta argument of * schedule_hrtimeout_range() is set to 0 and has therefore no impact. */ int __sched schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode) { return schedule_hrtimeout_range(expires, 0, mode); } EXPORT_SYMBOL_GPL(schedule_hrtimeout); /** * msleep - sleep safely even with waitqueue interruptions * @msecs: Requested sleep duration in milliseconds * * msleep() uses jiffy based timeouts for the sleep duration. Because of the * design of the timer wheel, the maximum additional percentage delay (slack) is * 12.5%. This is only valid for timers which will end up in level 1 or a higher * level of the timer wheel. For explanation of those 12.5% please check the * detailed description about the basics of the timer wheel. * * The slack of timers which will end up in level 0 depends on sleep duration * (msecs) and HZ configuration and can be calculated in the following way (with * the timer wheel design restriction that the slack is not less than 12.5%): * * ``slack = MSECS_PER_TICK / msecs`` * * When the allowed slack of the callsite is known, the calculation could be * turned around to find the minimal allowed sleep duration to meet the * constraints. 
For example: * * * ``HZ=1000`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 1 / (1/4) = 4``: * all sleep durations greater or equal 4ms will meet the constraints. * * ``HZ=1000`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 1 / (1/8) = 8``: * all sleep durations greater or equal 8ms will meet the constraints. * * ``HZ=250`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 4 / (1/4) = 16``: * all sleep durations greater or equal 16ms will meet the constraints. * * ``HZ=250`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 4 / (1/8) = 32``: * all sleep durations greater or equal 32ms will meet the constraints. * * See also the signal aware variant msleep_interruptible(). */ void msleep(unsigned int msecs) { unsigned long timeout = msecs_to_jiffies(msecs); while (timeout) timeout = schedule_timeout_uninterruptible(timeout); } EXPORT_SYMBOL(msleep); /** * msleep_interruptible - sleep waiting for signals * @msecs: Requested sleep duration in milliseconds * * See msleep() for some basic information. * * The difference between msleep() and msleep_interruptible() is that the sleep * could be interrupted by a signal delivery and then returns early. * * Returns: The remaining time of the sleep duration transformed to msecs (see * schedule_timeout() for details). */ unsigned long msleep_interruptible(unsigned int msecs) { unsigned long timeout = msecs_to_jiffies(msecs); while (timeout && !signal_pending(current)) timeout = schedule_timeout_interruptible(timeout); return jiffies_to_msecs(timeout); } EXPORT_SYMBOL(msleep_interruptible); /** * usleep_range_state - Sleep for an approximate time in a given state * @min: Minimum time in usecs to sleep * @max: Maximum time in usecs to sleep * @state: State of the current task that will be while sleeping * * usleep_range_state() sleeps at least for the minimum specified time but not * longer than the maximum specified amount of time. The range might reduce * power usage by allowing hrtimers to coalesce an already scheduled interrupt * with this hrtimer. In the worst case, an interrupt is scheduled for the upper * bound. * * The sleeping task is set to the specified state before starting the sleep. * * In non-atomic context where the exact wakeup time is flexible, use * usleep_range() or its variants instead of udelay(). The sleep improves * responsiveness by avoiding the CPU-hogging busy-wait of udelay(). */ void __sched usleep_range_state(unsigned long min, unsigned long max, unsigned int state) { ktime_t exp = ktime_add_us(ktime_get(), min); u64 delta = (u64)(max - min) * NSEC_PER_USEC; if (WARN_ON_ONCE(max < min)) delta = 0; for (;;) { __set_current_state(state); /* Do not return before the requested sleep time has elapsed */ if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) break; } } EXPORT_SYMBOL(usleep_range_state); |
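/*
 * Illustrative sketch, not part of sleep_timeout.c: choosing between the
 * primitives documented above. The 20ms and 5ms figures are made-up example
 * requirements, used only to contrast the timer-wheel slack of msleep()
 * with the hrtimer-backed usleep_range().
 */
static void example_delays(void)
{
	/* ~20ms where the documented ~12.5% slack is acceptable */
	msleep(20);

	/* a tighter 5ms..6ms window, better served by usleep_range() */
	usleep_range(5000, 6000);
}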
| 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 | // SPDX-License-Identifier: GPL-2.0-or-later /* Task credentials management - see Documentation/security/credentials.rst * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) "CRED: " fmt #include <linux/export.h> #include <linux/cred.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/coredump.h> #include <linux/key.h> #include <linux/keyctl.h> #include <linux/init_task.h> #include <linux/security.h> #include <linux/binfmts.h> #include <linux/cn_proc.h> #include <linux/uidgid.h> #if 0 #define kdebug(FMT, ...) \ printk("[%-5.5s%5u] " FMT "\n", \ current->comm, current->pid, ##__VA_ARGS__) #else #define kdebug(FMT, ...) 
\ do { \ if (0) \ no_printk("[%-5.5s%5u] " FMT "\n", \ current->comm, current->pid, ##__VA_ARGS__); \ } while (0) #endif static struct kmem_cache *cred_jar; /* * The RCU callback to actually dispose of a set of credentials */ static void put_cred_rcu(struct rcu_head *rcu) { struct cred *cred = container_of(rcu, struct cred, rcu); kdebug("put_cred_rcu(%p)", cred); if (atomic_long_read(&cred->usage) != 0) panic("CRED: put_cred_rcu() sees %p with usage %ld\n", cred, atomic_long_read(&cred->usage)); security_cred_free(cred); key_put(cred->session_keyring); key_put(cred->process_keyring); key_put(cred->thread_keyring); key_put(cred->request_key_auth); if (cred->group_info) put_group_info(cred->group_info); free_uid(cred->user); if (cred->ucounts) put_ucounts(cred->ucounts); put_user_ns(cred->user_ns); kmem_cache_free(cred_jar, cred); } /** * __put_cred - Destroy a set of credentials * @cred: The record to release * * Destroy a set of credentials on which no references remain. */ void __put_cred(struct cred *cred) { kdebug("__put_cred(%p{%ld})", cred, atomic_long_read(&cred->usage)); BUG_ON(atomic_long_read(&cred->usage) != 0); BUG_ON(cred == current->cred); BUG_ON(cred == current->real_cred); if (cred->non_rcu) put_cred_rcu(&cred->rcu); else call_rcu(&cred->rcu, put_cred_rcu); } EXPORT_SYMBOL(__put_cred); /* * Clean up a task's credentials when it exits */ void exit_creds(struct task_struct *tsk) { struct cred *real_cred, *cred; kdebug("exit_creds(%u,%p,%p,{%ld})", tsk->pid, tsk->real_cred, tsk->cred, atomic_long_read(&tsk->cred->usage)); real_cred = (struct cred *) tsk->real_cred; tsk->real_cred = NULL; cred = (struct cred *) tsk->cred; tsk->cred = NULL; if (real_cred == cred) { put_cred_many(cred, 2); } else { put_cred(real_cred); put_cred(cred); } #ifdef CONFIG_KEYS_REQUEST_CACHE key_put(tsk->cached_requested_key); tsk->cached_requested_key = NULL; #endif } /** * get_task_cred - Get another task's objective credentials * @task: The task to query * * Get the objective credentials of a task, pinning them so that they can't go * away. Accessing a task's credentials directly is not permitted. * * The caller must also make sure task doesn't get deleted, either by holding a * ref on task or by holding tasklist_lock to prevent it from being unlinked. */ const struct cred *get_task_cred(struct task_struct *task) { const struct cred *cred; rcu_read_lock(); do { cred = __task_cred((task)); BUG_ON(!cred); } while (!get_cred_rcu(cred)); rcu_read_unlock(); return cred; } EXPORT_SYMBOL(get_task_cred); /* * Allocate blank credentials, such that the credentials can be filled in at a * later date without risk of ENOMEM. */ struct cred *cred_alloc_blank(void) { struct cred *new; new = kmem_cache_zalloc(cred_jar, GFP_KERNEL); if (!new) return NULL; atomic_long_set(&new->usage, 1); if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0) goto error; return new; error: abort_creds(new); return NULL; } /** * prepare_creds - Prepare a new set of credentials for modification * * Prepare a new set of task credentials for modification. A task's creds * shouldn't generally be modified directly, therefore this function is used to * prepare a new copy, which the caller then modifies and then commits by * calling commit_creds(). * * Preparation involves making a copy of the objective creds for modification. * * Returns a pointer to the new creds-to-be if successful, NULL otherwise. * * Call commit_creds() or abort_creds() to clean up. 
*/ struct cred *prepare_creds(void) { struct task_struct *task = current; const struct cred *old; struct cred *new; new = kmem_cache_alloc(cred_jar, GFP_KERNEL); if (!new) return NULL; kdebug("prepare_creds() alloc %p", new); old = task->cred; memcpy(new, old, sizeof(struct cred)); new->non_rcu = 0; atomic_long_set(&new->usage, 1); get_group_info(new->group_info); get_uid(new->user); get_user_ns(new->user_ns); #ifdef CONFIG_KEYS key_get(new->session_keyring); key_get(new->process_keyring); key_get(new->thread_keyring); key_get(new->request_key_auth); #endif #ifdef CONFIG_SECURITY new->security = NULL; #endif new->ucounts = get_ucounts(new->ucounts); if (!new->ucounts) goto error; if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; return new; error: abort_creds(new); return NULL; } EXPORT_SYMBOL(prepare_creds); /* * Prepare credentials for current to perform an execve() * - The caller must hold ->cred_guard_mutex */ struct cred *prepare_exec_creds(void) { struct cred *new; new = prepare_creds(); if (!new) return new; #ifdef CONFIG_KEYS /* newly exec'd tasks don't get a thread keyring */ key_put(new->thread_keyring); new->thread_keyring = NULL; /* inherit the session keyring; new process keyring */ key_put(new->process_keyring); new->process_keyring = NULL; #endif new->suid = new->fsuid = new->euid; new->sgid = new->fsgid = new->egid; return new; } /* * Copy credentials for the new process created by fork() * * We share if we can, but under some circumstances we have to generate a new * set. * * The new process gets the current process's subjective credentials as its * objective and subjective credentials */ int copy_creds(struct task_struct *p, u64 clone_flags) { struct cred *new; int ret; #ifdef CONFIG_KEYS_REQUEST_CACHE p->cached_requested_key = NULL; #endif if ( #ifdef CONFIG_KEYS !p->cred->thread_keyring && #endif clone_flags & CLONE_THREAD ) { p->real_cred = get_cred_many(p->cred, 2); kdebug("share_creds(%p{%ld})", p->cred, atomic_long_read(&p->cred->usage)); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); get_cred_namespaces(p); return 0; } new = prepare_creds(); if (!new) return -ENOMEM; if (clone_flags & CLONE_NEWUSER) { ret = create_user_ns(new); if (ret < 0) goto error_put; ret = set_cred_ucounts(new); if (ret < 0) goto error_put; } #ifdef CONFIG_KEYS /* new threads get their own thread keyrings if their parent already * had one */ if (new->thread_keyring) { key_put(new->thread_keyring); new->thread_keyring = NULL; if (clone_flags & CLONE_THREAD) install_thread_keyring_to_cred(new); } /* The process keyring is only shared between the threads in a process; * anything outside of those threads doesn't inherit. */ if (!(clone_flags & CLONE_THREAD)) { key_put(new->process_keyring); new->process_keyring = NULL; } #endif p->cred = p->real_cred = get_cred(new); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); get_cred_namespaces(p); return 0; error_put: put_cred(new); return ret; } static bool cred_cap_issubset(const struct cred *set, const struct cred *subset) { const struct user_namespace *set_ns = set->user_ns; const struct user_namespace *subset_ns = subset->user_ns; /* If the two credentials are in the same user namespace see if * the capabilities of subset are a subset of set. 
*/ if (set_ns == subset_ns) return cap_issubset(subset->cap_permitted, set->cap_permitted); /* The credentials are in a different user namespaces * therefore one is a subset of the other only if a set is an * ancestor of subset and set->euid is owner of subset or one * of subsets ancestors. */ for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) { if ((set_ns == subset_ns->parent) && uid_eq(subset_ns->owner, set->euid)) return true; } return false; } /** * commit_creds - Install new credentials upon the current task * @new: The credentials to be assigned * * Install a new set of credentials to the current task, using RCU to replace * the old set. Both the objective and the subjective credentials pointers are * updated. This function may not be called if the subjective credentials are * in an overridden state. * * This function eats the caller's reference to the new credentials. * * Always returns 0 thus allowing this function to be tail-called at the end * of, say, sys_setgid(). */ int commit_creds(struct cred *new) { struct task_struct *task = current; const struct cred *old = task->real_cred; kdebug("commit_creds(%p{%ld})", new, atomic_long_read(&new->usage)); BUG_ON(task->cred != old); BUG_ON(atomic_long_read(&new->usage) < 1); get_cred(new); /* we will require a ref for the subj creds too */ /* dumpability changes */ if (!uid_eq(old->euid, new->euid) || !gid_eq(old->egid, new->egid) || !uid_eq(old->fsuid, new->fsuid) || !gid_eq(old->fsgid, new->fsgid) || !cred_cap_issubset(old, new)) { if (task->mm) set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; /* * If a task drops privileges and becomes nondumpable, * the dumpability change must become visible before * the credential change; otherwise, a __ptrace_may_access() * racing with this change may be able to attach to a task it * shouldn't be able to attach to (as if the task had dropped * privileges without becoming nondumpable). * Pairs with a read barrier in __ptrace_may_access(). */ smp_wmb(); } /* alter the thread keyring */ if (!uid_eq(new->fsuid, old->fsuid)) key_fsuid_changed(new); if (!gid_eq(new->fsgid, old->fsgid)) key_fsgid_changed(new); /* do it * RLIMIT_NPROC limits on user->processes have already been checked * in set_user(). */ if (new->user != old->user || new->user_ns != old->user_ns) inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1); rcu_assign_pointer(task->real_cred, new); rcu_assign_pointer(task->cred, new); if (new->user != old->user || new->user_ns != old->user_ns) dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1); if (new->user_ns != old->user_ns) switch_cred_namespaces(old, new); /* send notifications */ if (!uid_eq(new->uid, old->uid) || !uid_eq(new->euid, old->euid) || !uid_eq(new->suid, old->suid) || !uid_eq(new->fsuid, old->fsuid)) proc_id_connector(task, PROC_EVENT_UID); if (!gid_eq(new->gid, old->gid) || !gid_eq(new->egid, old->egid) || !gid_eq(new->sgid, old->sgid) || !gid_eq(new->fsgid, old->fsgid)) proc_id_connector(task, PROC_EVENT_GID); /* release the old obj and subj refs both */ put_cred_many(old, 2); return 0; } EXPORT_SYMBOL(commit_creds); /** * abort_creds - Discard a set of credentials and unlock the current task * @new: The credentials that were going to be applied * * Discard a set of credentials that were under construction and unlock the * current task. 
*/ void abort_creds(struct cred *new) { kdebug("abort_creds(%p{%ld})", new, atomic_long_read(&new->usage)); BUG_ON(atomic_long_read(&new->usage) < 1); put_cred(new); } EXPORT_SYMBOL(abort_creds); /** * cred_fscmp - Compare two credentials with respect to filesystem access. * @a: The first credential * @b: The second credential * * cred_cmp() will return zero if both credentials have the same * fsuid, fsgid, and supplementary groups. That is, if they will both * provide the same access to files based on mode/uid/gid. * If the credentials are different, then either -1 or 1 will * be returned depending on whether @a comes before or after @b * respectively in an arbitrary, but stable, ordering of credentials. * * Return: -1, 0, or 1 depending on comparison */ int cred_fscmp(const struct cred *a, const struct cred *b) { struct group_info *ga, *gb; int g; if (a == b) return 0; if (uid_lt(a->fsuid, b->fsuid)) return -1; if (uid_gt(a->fsuid, b->fsuid)) return 1; if (gid_lt(a->fsgid, b->fsgid)) return -1; if (gid_gt(a->fsgid, b->fsgid)) return 1; ga = a->group_info; gb = b->group_info; if (ga == gb) return 0; if (ga == NULL) return -1; if (gb == NULL) return 1; if (ga->ngroups < gb->ngroups) return -1; if (ga->ngroups > gb->ngroups) return 1; for (g = 0; g < ga->ngroups; g++) { if (gid_lt(ga->gid[g], gb->gid[g])) return -1; if (gid_gt(ga->gid[g], gb->gid[g])) return 1; } return 0; } EXPORT_SYMBOL(cred_fscmp); int set_cred_ucounts(struct cred *new) { struct ucounts *new_ucounts, *old_ucounts = new->ucounts; /* * This optimization is needed because alloc_ucounts() uses locks * for table lookups. */ if (old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->uid)) return 0; if (!(new_ucounts = alloc_ucounts(new->user_ns, new->uid))) return -EAGAIN; new->ucounts = new_ucounts; put_ucounts(old_ucounts); return 0; } /* * initialise the credentials stuff */ void __init cred_init(void) { /* allocate a slab in which we can store credentials */ cred_jar = KMEM_CACHE(cred, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); } /** * prepare_kernel_cred - Prepare a set of credentials for a kernel service * @daemon: A userspace daemon to be used as a reference * * Prepare a set of credentials for a kernel service. This can then be used to * override a task's own credentials so that work can be done on behalf of that * task that requires a different subjective context. * * @daemon is used to provide a base cred, with the security data derived from * that; if this is "&init_task", they'll be set to 0, no groups, full * capabilities, and no keys. * * The caller may change these controls afterwards if desired. * * Returns the new credentials or NULL if out of memory. 
*/ struct cred *prepare_kernel_cred(struct task_struct *daemon) { const struct cred *old; struct cred *new; if (WARN_ON_ONCE(!daemon)) return NULL; new = kmem_cache_alloc(cred_jar, GFP_KERNEL); if (!new) return NULL; kdebug("prepare_kernel_cred() alloc %p", new); old = get_task_cred(daemon); *new = *old; new->non_rcu = 0; atomic_long_set(&new->usage, 1); get_uid(new->user); get_user_ns(new->user_ns); get_group_info(new->group_info); #ifdef CONFIG_KEYS new->session_keyring = NULL; new->process_keyring = NULL; new->thread_keyring = NULL; new->request_key_auth = NULL; new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; #endif #ifdef CONFIG_SECURITY new->security = NULL; #endif new->ucounts = get_ucounts(new->ucounts); if (!new->ucounts) goto error; if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; put_cred(old); return new; error: put_cred(new); put_cred(old); return NULL; } EXPORT_SYMBOL(prepare_kernel_cred); /** * set_security_override - Set the security ID in a set of credentials * @new: The credentials to alter * @secid: The LSM security ID to set * * Set the LSM security ID in a set of credentials so that the subjective * security is overridden when an alternative set of credentials is used. */ int set_security_override(struct cred *new, u32 secid) { return security_kernel_act_as(new, secid); } EXPORT_SYMBOL(set_security_override); /** * set_create_files_as - Set the LSM file create context in a set of credentials * @new: The credentials to alter * @inode: The inode to take the context from * * Change the LSM file creation context in a set of credentials to be the same * as the object context of the specified inode, so that the new inodes have * the same MAC context as that inode. */ int set_create_files_as(struct cred *new, struct inode *inode) { if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid)) return -EINVAL; new->fsuid = inode->i_uid; new->fsgid = inode->i_gid; return security_kernel_create_files_as(new, inode); } EXPORT_SYMBOL(set_create_files_as); |
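/*
 * Illustrative sketch, not part of cred.c: the prepare/modify/commit pattern
 * that the functions above implement, shown here changing the filesystem UID
 * of the current task. The function name is hypothetical.
 */
static int example_set_fsuid(kuid_t uid)
{
	struct cred *new = prepare_creds();

	if (!new)
		return -ENOMEM;
	new->fsuid = uid;
	return commit_creds(new);	/* commit_creds() consumes @new */
}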
916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_COMPAT_H #define _LINUX_COMPAT_H /* * These are the type definitions for the architecture specific * syscall compatibility layer. */ #include <linux/types.h> #include <linux/time.h> #include <linux/stat.h> #include <linux/param.h> /* for HZ */ #include <linux/sem.h> #include <linux/socket.h> #include <linux/if.h> #include <linux/fs.h> #include <linux/aio_abi.h> /* for aio_context_t */ #include <linux/uaccess.h> #include <linux/unistd.h> #include <asm/compat.h> #include <asm/siginfo.h> #include <asm/signal.h> #ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER /* * It may be useful for an architecture to override the definitions of the * COMPAT_SYSCALL_DEFINE0 and COMPAT_SYSCALL_DEFINEx() macros, in particular * to use a different calling convention for syscalls. To allow for that, + the prototypes for the compat_sys_*() functions below will *not* be included * if CONFIG_ARCH_HAS_SYSCALL_WRAPPER is enabled. */ #include <asm/syscall_wrapper.h> #endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */ #ifndef COMPAT_USE_64BIT_TIME #define COMPAT_USE_64BIT_TIME 0 #endif #ifndef __SC_DELOUSE #define __SC_DELOUSE(t,v) ((__force t)(unsigned long)(v)) #endif #ifndef COMPAT_SYSCALL_DEFINE0 #define COMPAT_SYSCALL_DEFINE0(name) \ asmlinkage long compat_sys_##name(void); \ ALLOW_ERROR_INJECTION(compat_sys_##name, ERRNO); \ asmlinkage long compat_sys_##name(void) #endif /* COMPAT_SYSCALL_DEFINE0 */ #define COMPAT_SYSCALL_DEFINE1(name, ...) \ COMPAT_SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) #define COMPAT_SYSCALL_DEFINE2(name, ...) \ COMPAT_SYSCALL_DEFINEx(2, _##name, __VA_ARGS__) #define COMPAT_SYSCALL_DEFINE3(name, ...) \ COMPAT_SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) #define COMPAT_SYSCALL_DEFINE4(name, ...) \ COMPAT_SYSCALL_DEFINEx(4, _##name, __VA_ARGS__) #define COMPAT_SYSCALL_DEFINE5(name, ...) \ COMPAT_SYSCALL_DEFINEx(5, _##name, __VA_ARGS__) #define COMPAT_SYSCALL_DEFINE6(name, ...) \ COMPAT_SYSCALL_DEFINEx(6, _##name, __VA_ARGS__) /* * The asmlinkage stub is aliased to a function named __se_compat_sys_*() which * sign-extends 32-bit ints to longs whenever needed. The actual work is * done within __do_compat_sys_*(). */ #ifndef COMPAT_SYSCALL_DEFINEx #define COMPAT_SYSCALL_DEFINEx(x, name, ...) 
\ __diag_push(); \ __diag_ignore(GCC, 8, "-Wattribute-alias", \ "Type aliasing is used to sanitize syscall arguments");\ asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ __attribute__((alias(__stringify(__se_compat_sys##name)))); \ ALLOW_ERROR_INJECTION(compat_sys##name, ERRNO); \ static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ asmlinkage long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ asmlinkage long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ { \ long ret = __do_compat_sys##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__));\ __MAP(x,__SC_TEST,__VA_ARGS__); \ return ret; \ } \ __diag_pop(); \ static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) #endif /* COMPAT_SYSCALL_DEFINEx */ struct compat_iovec { compat_uptr_t iov_base; compat_size_t iov_len; }; #ifndef compat_user_stack_pointer #define compat_user_stack_pointer() current_user_stack_pointer() #endif #ifndef compat_sigaltstack /* we'll need that for MIPS */ typedef struct compat_sigaltstack { compat_uptr_t ss_sp; int ss_flags; compat_size_t ss_size; } compat_stack_t; #endif #ifndef COMPAT_MINSIGSTKSZ #define COMPAT_MINSIGSTKSZ MINSIGSTKSZ #endif #define compat_jiffies_to_clock_t(x) \ (((unsigned long)(x) * COMPAT_USER_HZ) / HZ) typedef __compat_uid32_t compat_uid_t; typedef __compat_gid32_t compat_gid_t; struct compat_sel_arg_struct; struct rusage; struct old_itimerval32; struct compat_tms { compat_clock_t tms_utime; compat_clock_t tms_stime; compat_clock_t tms_cutime; compat_clock_t tms_cstime; }; #define _COMPAT_NSIG_WORDS (_COMPAT_NSIG / _COMPAT_NSIG_BPW) typedef struct { compat_sigset_word sig[_COMPAT_NSIG_WORDS]; } compat_sigset_t; int set_compat_user_sigmask(const compat_sigset_t __user *umask, size_t sigsetsize); struct compat_sigaction { #ifndef __ARCH_HAS_IRIX_SIGACTION compat_uptr_t sa_handler; compat_ulong_t sa_flags; #else compat_uint_t sa_flags; compat_uptr_t sa_handler; #endif #ifdef __ARCH_HAS_SA_RESTORER compat_uptr_t sa_restorer; #endif compat_sigset_t sa_mask __packed; }; typedef union compat_sigval { compat_int_t sival_int; compat_uptr_t sival_ptr; } compat_sigval_t; typedef struct compat_siginfo { int si_signo; #ifndef __ARCH_HAS_SWAPPED_SIGINFO int si_errno; int si_code; #else int si_code; int si_errno; #endif union { int _pad[128/sizeof(int) - 3]; /* kill() */ struct { compat_pid_t _pid; /* sender's pid */ __compat_uid32_t _uid; /* sender's uid */ } _kill; /* POSIX.1b timers */ struct { compat_timer_t _tid; /* timer id */ int _overrun; /* overrun count */ compat_sigval_t _sigval; /* same as below */ } _timer; /* POSIX.1b signals */ struct { compat_pid_t _pid; /* sender's pid */ __compat_uid32_t _uid; /* sender's uid */ compat_sigval_t _sigval; } _rt; /* SIGCHLD */ struct { compat_pid_t _pid; /* which child */ __compat_uid32_t _uid; /* sender's uid */ int _status; /* exit code */ compat_clock_t _utime; compat_clock_t _stime; } _sigchld; #ifdef CONFIG_X86_X32_ABI /* SIGCHLD (x32 version) */ struct { compat_pid_t _pid; /* which child */ __compat_uid32_t _uid; /* sender's uid */ int _status; /* exit code */ compat_s64 _utime; compat_s64 _stime; } _sigchld_x32; #endif /* SIGILL, SIGFPE, SIGSEGV, SIGBUS, SIGTRAP, SIGEMT */ struct { compat_uptr_t _addr; /* faulting insn/memory ref. */ #define __COMPAT_ADDR_BND_PKEY_PAD (__alignof__(compat_uptr_t) < sizeof(short) ? 
\ sizeof(short) : __alignof__(compat_uptr_t)) union { /* used on alpha and sparc */ int _trapno; /* TRAP # which caused the signal */ /* * used when si_code=BUS_MCEERR_AR or * used when si_code=BUS_MCEERR_AO */ short int _addr_lsb; /* Valid LSB of the reported address. */ /* used when si_code=SEGV_BNDERR */ struct { char _dummy_bnd[__COMPAT_ADDR_BND_PKEY_PAD]; compat_uptr_t _lower; compat_uptr_t _upper; } _addr_bnd; /* used when si_code=SEGV_PKUERR */ struct { char _dummy_pkey[__COMPAT_ADDR_BND_PKEY_PAD]; u32 _pkey; } _addr_pkey; /* used when si_code=TRAP_PERF */ struct { compat_ulong_t _data; u32 _type; u32 _flags; } _perf; }; } _sigfault; /* SIGPOLL */ struct { compat_long_t _band; /* POLL_IN, POLL_OUT, POLL_MSG */ int _fd; } _sigpoll; struct { compat_uptr_t _call_addr; /* calling user insn */ int _syscall; /* triggering system call number */ unsigned int _arch; /* AUDIT_ARCH_* of syscall */ } _sigsys; } _sifields; } compat_siginfo_t; struct compat_rlimit { compat_ulong_t rlim_cur; compat_ulong_t rlim_max; }; #ifdef __ARCH_NEED_COMPAT_FLOCK64_PACKED #define __ARCH_COMPAT_FLOCK64_PACK __attribute__((packed)) #else #define __ARCH_COMPAT_FLOCK64_PACK #endif struct compat_flock { short l_type; short l_whence; compat_off_t l_start; compat_off_t l_len; #ifdef __ARCH_COMPAT_FLOCK_EXTRA_SYSID __ARCH_COMPAT_FLOCK_EXTRA_SYSID #endif compat_pid_t l_pid; #ifdef __ARCH_COMPAT_FLOCK_PAD __ARCH_COMPAT_FLOCK_PAD #endif }; struct compat_flock64 { short l_type; short l_whence; compat_loff_t l_start; compat_loff_t l_len; compat_pid_t l_pid; #ifdef __ARCH_COMPAT_FLOCK64_PAD __ARCH_COMPAT_FLOCK64_PAD #endif } __ARCH_COMPAT_FLOCK64_PACK; struct compat_rusage { struct old_timeval32 ru_utime; struct old_timeval32 ru_stime; compat_long_t ru_maxrss; compat_long_t ru_ixrss; compat_long_t ru_idrss; compat_long_t ru_isrss; compat_long_t ru_minflt; compat_long_t ru_majflt; compat_long_t ru_nswap; compat_long_t ru_inblock; compat_long_t ru_oublock; compat_long_t ru_msgsnd; compat_long_t ru_msgrcv; compat_long_t ru_nsignals; compat_long_t ru_nvcsw; compat_long_t ru_nivcsw; }; extern int put_compat_rusage(const struct rusage *, struct compat_rusage __user *); struct compat_siginfo; struct __compat_aio_sigset; struct compat_dirent { u32 d_ino; compat_off_t d_off; u16 d_reclen; char d_name[256]; }; struct compat_ustat { compat_daddr_t f_tfree; compat_ino_t f_tinode; char f_fname[6]; char f_fpack[6]; }; #define COMPAT_SIGEV_PAD_SIZE ((SIGEV_MAX_SIZE/sizeof(int)) - 3) typedef struct compat_sigevent { compat_sigval_t sigev_value; compat_int_t sigev_signo; compat_int_t sigev_notify; union { compat_int_t _pad[COMPAT_SIGEV_PAD_SIZE]; compat_int_t _tid; struct { compat_uptr_t _function; compat_uptr_t _attribute; } _sigev_thread; } _sigev_un; } compat_sigevent_t; struct compat_ifmap { compat_ulong_t mem_start; compat_ulong_t mem_end; unsigned short base_addr; unsigned char irq; unsigned char dma; unsigned char port; }; struct compat_if_settings { unsigned int type; /* Type of physical device or protocol */ unsigned int size; /* Size of the data allocated by the caller */ compat_uptr_t ifs_ifsu; /* union of pointers */ }; struct compat_ifreq { union { char ifrn_name[IFNAMSIZ]; /* if name, e.g. 
"en0" */ } ifr_ifrn; union { struct sockaddr ifru_addr; struct sockaddr ifru_dstaddr; struct sockaddr ifru_broadaddr; struct sockaddr ifru_netmask; struct sockaddr ifru_hwaddr; short ifru_flags; compat_int_t ifru_ivalue; compat_int_t ifru_mtu; struct compat_ifmap ifru_map; char ifru_slave[IFNAMSIZ]; /* Just fits the size */ char ifru_newname[IFNAMSIZ]; compat_caddr_t ifru_data; struct compat_if_settings ifru_settings; } ifr_ifru; }; struct compat_ifconf { compat_int_t ifc_len; /* size of buffer */ compat_caddr_t ifcbuf; }; struct compat_robust_list { compat_uptr_t next; }; struct compat_robust_list_head { struct compat_robust_list list; compat_long_t futex_offset; compat_uptr_t list_op_pending; }; #ifdef CONFIG_COMPAT_OLD_SIGACTION struct compat_old_sigaction { compat_uptr_t sa_handler; compat_old_sigset_t sa_mask; compat_ulong_t sa_flags; compat_uptr_t sa_restorer; }; #endif struct compat_keyctl_kdf_params { compat_uptr_t hashname; compat_uptr_t otherinfo; __u32 otherinfolen; __u32 __spare[8]; }; struct compat_stat; struct compat_statfs; struct compat_statfs64; struct compat_old_linux_dirent; struct compat_linux_dirent; struct linux_dirent64; struct compat_msghdr; struct compat_mmsghdr; struct compat_sysinfo; struct compat_sysctl_args; struct compat_kexec_segment; struct compat_mq_attr; struct compat_msgbuf; void copy_siginfo_to_external32(struct compat_siginfo *to, const struct kernel_siginfo *from); int copy_siginfo_from_user32(kernel_siginfo_t *to, const struct compat_siginfo __user *from); int __copy_siginfo_to_user32(struct compat_siginfo __user *to, const kernel_siginfo_t *from); #ifndef copy_siginfo_to_user32 #define copy_siginfo_to_user32 __copy_siginfo_to_user32 #endif int get_compat_sigevent(struct sigevent *event, const struct compat_sigevent __user *u_event); extern int get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat); /* * Defined inline such that size can be compile time constant, which avoids * CONFIG_HARDENED_USERCOPY complaining about copies from task_struct */ static inline int put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set, unsigned int size) { /* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */ #if defined(__BIG_ENDIAN) && defined(CONFIG_64BIT) compat_sigset_t v; switch (_NSIG_WORDS) { case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3]; fallthrough; case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2]; fallthrough; case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1]; fallthrough; case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0]; } return copy_to_user(compat, &v, size) ? -EFAULT : 0; #else return copy_to_user(compat, set, size) ? 
-EFAULT : 0; #endif } #ifdef CONFIG_CPU_BIG_ENDIAN #define unsafe_put_compat_sigset(compat, set, label) do { \ compat_sigset_t __user *__c = compat; \ const sigset_t *__s = set; \ \ switch (_NSIG_WORDS) { \ case 4: \ unsafe_put_user(__s->sig[3] >> 32, &__c->sig[7], label); \ unsafe_put_user(__s->sig[3], &__c->sig[6], label); \ fallthrough; \ case 3: \ unsafe_put_user(__s->sig[2] >> 32, &__c->sig[5], label); \ unsafe_put_user(__s->sig[2], &__c->sig[4], label); \ fallthrough; \ case 2: \ unsafe_put_user(__s->sig[1] >> 32, &__c->sig[3], label); \ unsafe_put_user(__s->sig[1], &__c->sig[2], label); \ fallthrough; \ case 1: \ unsafe_put_user(__s->sig[0] >> 32, &__c->sig[1], label); \ unsafe_put_user(__s->sig[0], &__c->sig[0], label); \ } \ } while (0) #define unsafe_get_compat_sigset(set, compat, label) do { \ const compat_sigset_t __user *__c = compat; \ compat_sigset_word hi, lo; \ sigset_t *__s = set; \ \ switch (_NSIG_WORDS) { \ case 4: \ unsafe_get_user(lo, &__c->sig[7], label); \ unsafe_get_user(hi, &__c->sig[6], label); \ __s->sig[3] = hi | (((long)lo) << 32); \ fallthrough; \ case 3: \ unsafe_get_user(lo, &__c->sig[5], label); \ unsafe_get_user(hi, &__c->sig[4], label); \ __s->sig[2] = hi | (((long)lo) << 32); \ fallthrough; \ case 2: \ unsafe_get_user(lo, &__c->sig[3], label); \ unsafe_get_user(hi, &__c->sig[2], label); \ __s->sig[1] = hi | (((long)lo) << 32); \ fallthrough; \ case 1: \ unsafe_get_user(lo, &__c->sig[1], label); \ unsafe_get_user(hi, &__c->sig[0], label); \ __s->sig[0] = hi | (((long)lo) << 32); \ } \ } while (0) #else #define unsafe_put_compat_sigset(compat, set, label) do { \ compat_sigset_t __user *__c = compat; \ const sigset_t *__s = set; \ \ unsafe_copy_to_user(__c, __s, sizeof(*__c), label); \ } while (0) #define unsafe_get_compat_sigset(set, compat, label) do { \ const compat_sigset_t __user *__c = compat; \ sigset_t *__s = set; \ \ unsafe_copy_from_user(__s, __c, sizeof(*__c), label); \ } while (0) #endif extern int compat_ptrace_request(struct task_struct *child, compat_long_t request, compat_ulong_t addr, compat_ulong_t data); extern long compat_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t addr, compat_ulong_t data); struct epoll_event; /* fortunately, this one is fixed-layout */ int compat_restore_altstack(const compat_stack_t __user *uss); int __compat_save_altstack(compat_stack_t __user *, unsigned long); #define unsafe_compat_save_altstack(uss, sp, label) do { \ compat_stack_t __user *__uss = uss; \ struct task_struct *t = current; \ unsafe_put_user(ptr_to_compat((void __user *)t->sas_ss_sp), \ &__uss->ss_sp, label); \ unsafe_put_user(t->sas_ss_flags, &__uss->ss_flags, label); \ unsafe_put_user(t->sas_ss_size, &__uss->ss_size, label); \ } while (0); /* * These syscall function prototypes are kept in the same order as * include/uapi/asm-generic/unistd.h. Deprecated or obsolete system calls * go below. * * Please note that these prototypes here are only provided for information * purposes, for static analysis, and for linking from the syscall table. * These functions should not be called elsewhere from kernel code. * * As the syscall calling convention may be different from the default * for architectures overriding the syscall calling convention, do not * include the prototypes if CONFIG_ARCH_HAS_SYSCALL_WRAPPER is enabled. 
*/ #ifndef CONFIG_ARCH_HAS_SYSCALL_WRAPPER asmlinkage long compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p); asmlinkage long compat_sys_io_submit(compat_aio_context_t ctx_id, int nr, u32 __user *iocb); asmlinkage long compat_sys_io_pgetevents(compat_aio_context_t ctx_id, compat_long_t min_nr, compat_long_t nr, struct io_event __user *events, struct old_timespec32 __user *timeout, const struct __compat_aio_sigset __user *usig); asmlinkage long compat_sys_io_pgetevents_time64(compat_aio_context_t ctx_id, compat_long_t min_nr, compat_long_t nr, struct io_event __user *events, struct __kernel_timespec __user *timeout, const struct __compat_aio_sigset __user *usig); asmlinkage long compat_sys_epoll_pwait(int epfd, struct epoll_event __user *events, int maxevents, int timeout, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize); asmlinkage long compat_sys_epoll_pwait2(int epfd, struct epoll_event __user *events, int maxevents, const struct __kernel_timespec __user *timeout, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize); asmlinkage long compat_sys_fcntl(unsigned int fd, unsigned int cmd, compat_ulong_t arg); asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd, compat_ulong_t arg); asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, compat_ulong_t arg); asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf); asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf); asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf); asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf); asmlinkage long compat_sys_truncate(const char __user *, compat_off_t); asmlinkage long compat_sys_ftruncate(unsigned int, compat_off_t); /* No generic prototype for truncate64, ftruncate64, fallocate */ asmlinkage long compat_sys_openat(int dfd, const char __user *filename, int flags, umode_t mode); asmlinkage long compat_sys_getdents(unsigned int fd, struct compat_linux_dirent __user *dirent, unsigned int count); asmlinkage long compat_sys_lseek(unsigned int, compat_off_t, unsigned int); /* No generic prototype for pread64 and pwrite64 */ asmlinkage ssize_t compat_sys_preadv(compat_ulong_t fd, const struct iovec __user *vec, compat_ulong_t vlen, u32 pos_low, u32 pos_high); asmlinkage ssize_t compat_sys_pwritev(compat_ulong_t fd, const struct iovec __user *vec, compat_ulong_t vlen, u32 pos_low, u32 pos_high); #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 asmlinkage long compat_sys_preadv64(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos); #endif #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64 asmlinkage long compat_sys_pwritev64(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos); #endif asmlinkage long compat_sys_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, compat_size_t count); asmlinkage long compat_sys_sendfile64(int out_fd, int in_fd, compat_loff_t __user *offset, compat_size_t count); asmlinkage long compat_sys_pselect6_time32(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct old_timespec32 __user *tsp, void __user *sig); asmlinkage long compat_sys_pselect6_time64(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct __kernel_timespec __user *tsp, void __user *sig); asmlinkage long 
compat_sys_ppoll_time32(struct pollfd __user *ufds, unsigned int nfds, struct old_timespec32 __user *tsp, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize); asmlinkage long compat_sys_ppoll_time64(struct pollfd __user *ufds, unsigned int nfds, struct __kernel_timespec __user *tsp, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize); asmlinkage long compat_sys_signalfd4(int ufd, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize, int flags); asmlinkage long compat_sys_newfstatat(unsigned int dfd, const char __user *filename, struct compat_stat __user *statbuf, int flag); asmlinkage long compat_sys_newfstat(unsigned int fd, struct compat_stat __user *statbuf); /* No generic prototype for sync_file_range and sync_file_range2 */ asmlinkage long compat_sys_waitid(int, compat_pid_t, struct compat_siginfo __user *, int, struct compat_rusage __user *); asmlinkage long compat_sys_set_robust_list(struct compat_robust_list_head __user *head, compat_size_t len); asmlinkage long compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, compat_size_t __user *len_ptr); asmlinkage long compat_sys_getitimer(int which, struct old_itimerval32 __user *it); asmlinkage long compat_sys_setitimer(int which, struct old_itimerval32 __user *in, struct old_itimerval32 __user *out); asmlinkage long compat_sys_kexec_load(compat_ulong_t entry, compat_ulong_t nr_segments, struct compat_kexec_segment __user *, compat_ulong_t flags); asmlinkage long compat_sys_timer_create(clockid_t which_clock, struct compat_sigevent __user *timer_event_spec, timer_t __user *created_timer_id); asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, compat_long_t addr, compat_long_t data); asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, unsigned int len, compat_ulong_t __user *user_mask_ptr); asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, compat_ulong_t __user *user_mask_ptr); asmlinkage long compat_sys_sigaltstack(const compat_stack_t __user *uss_ptr, compat_stack_t __user *uoss_ptr); asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize); #ifndef CONFIG_ODD_RT_SIGACTION asmlinkage long compat_sys_rt_sigaction(int, const struct compat_sigaction __user *, struct compat_sigaction __user *, compat_size_t); #endif asmlinkage long compat_sys_rt_sigprocmask(int how, compat_sigset_t __user *set, compat_sigset_t __user *oset, compat_size_t sigsetsize); asmlinkage long compat_sys_rt_sigpending(compat_sigset_t __user *uset, compat_size_t sigsetsize); asmlinkage long compat_sys_rt_sigtimedwait_time32(compat_sigset_t __user *uthese, struct compat_siginfo __user *uinfo, struct old_timespec32 __user *uts, compat_size_t sigsetsize); asmlinkage long compat_sys_rt_sigtimedwait_time64(compat_sigset_t __user *uthese, struct compat_siginfo __user *uinfo, struct __kernel_timespec __user *uts, compat_size_t sigsetsize); asmlinkage long compat_sys_rt_sigqueueinfo(compat_pid_t pid, int sig, struct compat_siginfo __user *uinfo); /* No generic prototype for rt_sigreturn */ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf); asmlinkage long compat_sys_getrlimit(unsigned int resource, struct compat_rlimit __user *rlim); asmlinkage long compat_sys_setrlimit(unsigned int resource, struct compat_rlimit __user *rlim); asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru); asmlinkage long compat_sys_gettimeofday(struct old_timeval32 __user *tv, struct timezone __user 
*tz); asmlinkage long compat_sys_settimeofday(struct old_timeval32 __user *tv, struct timezone __user *tz); asmlinkage long compat_sys_sysinfo(struct compat_sysinfo __user *info); asmlinkage long compat_sys_mq_open(const char __user *u_name, int oflag, compat_mode_t mode, struct compat_mq_attr __user *u_attr); asmlinkage long compat_sys_mq_notify(mqd_t mqdes, const struct compat_sigevent __user *u_notification); asmlinkage long compat_sys_mq_getsetattr(mqd_t mqdes, const struct compat_mq_attr __user *u_mqstat, struct compat_mq_attr __user *u_omqstat); asmlinkage long compat_sys_msgctl(int first, int second, void __user *uptr); asmlinkage long compat_sys_msgrcv(int msqid, compat_uptr_t msgp, compat_ssize_t msgsz, compat_long_t msgtyp, int msgflg); asmlinkage long compat_sys_msgsnd(int msqid, compat_uptr_t msgp, compat_ssize_t msgsz, int msgflg); asmlinkage long compat_sys_semctl(int semid, int semnum, int cmd, int arg); asmlinkage long compat_sys_shmctl(int first, int second, void __user *uptr); asmlinkage long compat_sys_shmat(int shmid, compat_uptr_t shmaddr, int shmflg); asmlinkage long compat_sys_recvfrom(int fd, void __user *buf, compat_size_t len, unsigned flags, struct sockaddr __user *addr, int __user *addrlen); asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, unsigned flags); asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, unsigned int flags); /* No generic prototype for readahead */ asmlinkage long compat_sys_keyctl(u32 option, u32 arg2, u32 arg3, u32 arg4, u32 arg5); asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr_t __user *argv, const compat_uptr_t __user *envp); /* No generic prototype for fadvise64_64 */ /* CONFIG_MMU only */ asmlinkage long compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, struct compat_siginfo __user *uinfo); asmlinkage long compat_sys_recvmmsg_time64(int fd, struct compat_mmsghdr __user *mmsg, unsigned vlen, unsigned int flags, struct __kernel_timespec __user *timeout); asmlinkage long compat_sys_recvmmsg_time32(int fd, struct compat_mmsghdr __user *mmsg, unsigned vlen, unsigned int flags, struct old_timespec32 __user *timeout); asmlinkage long compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options, struct compat_rusage __user *ru); asmlinkage long compat_sys_fanotify_mark(int, unsigned int, __u32, __u32, int, const char __user *); asmlinkage long compat_sys_open_by_handle_at(int mountdirfd, struct file_handle __user *handle, int flags); asmlinkage long compat_sys_sendmmsg(int fd, struct compat_mmsghdr __user *mmsg, unsigned vlen, unsigned int flags); asmlinkage long compat_sys_execveat(int dfd, const char __user *filename, const compat_uptr_t __user *argv, const compat_uptr_t __user *envp, int flags); asmlinkage ssize_t compat_sys_preadv2(compat_ulong_t fd, const struct iovec __user *vec, compat_ulong_t vlen, u32 pos_low, u32 pos_high, rwf_t flags); asmlinkage ssize_t compat_sys_pwritev2(compat_ulong_t fd, const struct iovec __user *vec, compat_ulong_t vlen, u32 pos_low, u32 pos_high, rwf_t flags); #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2 asmlinkage long compat_sys_preadv64v2(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags); #endif #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 asmlinkage long compat_sys_pwritev64v2(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags); #endif /* * Deprecated system calls which are still defined in * 
include/uapi/asm-generic/unistd.h and wanted by >= 1 arch */ /* __ARCH_WANT_SYSCALL_NO_AT */ asmlinkage long compat_sys_open(const char __user *filename, int flags, umode_t mode); /* __ARCH_WANT_SYSCALL_NO_FLAGS */ asmlinkage long compat_sys_signalfd(int ufd, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize); /* __ARCH_WANT_SYSCALL_OFF_T */ asmlinkage long compat_sys_newstat(const char __user *filename, struct compat_stat __user *statbuf); asmlinkage long compat_sys_newlstat(const char __user *filename, struct compat_stat __user *statbuf); /* __ARCH_WANT_SYSCALL_DEPRECATED */ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct old_timeval32 __user *tvp); asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u32); asmlinkage long compat_sys_recv(int fd, void __user *buf, compat_size_t len, unsigned flags); /* obsolete */ asmlinkage long compat_sys_old_readdir(unsigned int fd, struct compat_old_linux_dirent __user *, unsigned int count); /* obsolete */ asmlinkage long compat_sys_old_select(struct compat_sel_arg_struct __user *arg); /* obsolete */ asmlinkage long compat_sys_ipc(u32, int, int, u32, compat_uptr_t, u32); /* obsolete */ #ifdef __ARCH_WANT_SYS_SIGPENDING asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set); #endif #ifdef __ARCH_WANT_SYS_SIGPROCMASK asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *nset, compat_old_sigset_t __user *oset); #endif #ifdef CONFIG_COMPAT_OLD_SIGACTION asmlinkage long compat_sys_sigaction(int sig, const struct compat_old_sigaction __user *act, struct compat_old_sigaction __user *oact); #endif /* obsolete */ asmlinkage long compat_sys_socketcall(int call, u32 __user *args); #ifdef __ARCH_WANT_COMPAT_TRUNCATE64 asmlinkage long compat_sys_truncate64(const char __user *pathname, compat_arg_u64(len)); #endif #ifdef __ARCH_WANT_COMPAT_FTRUNCATE64 asmlinkage long compat_sys_ftruncate64(unsigned int fd, compat_arg_u64(len)); #endif #ifdef __ARCH_WANT_COMPAT_FALLOCATE asmlinkage long compat_sys_fallocate(int fd, int mode, compat_arg_u64(offset), compat_arg_u64(len)); #endif #ifdef __ARCH_WANT_COMPAT_PREAD64 asmlinkage long compat_sys_pread64(unsigned int fd, char __user *buf, size_t count, compat_arg_u64(pos)); #endif #ifdef __ARCH_WANT_COMPAT_PWRITE64 asmlinkage long compat_sys_pwrite64(unsigned int fd, const char __user *buf, size_t count, compat_arg_u64(pos)); #endif #ifdef __ARCH_WANT_COMPAT_SYNC_FILE_RANGE asmlinkage long compat_sys_sync_file_range(int fd, compat_arg_u64(pos), compat_arg_u64(nbytes), unsigned int flags); #endif #ifdef __ARCH_WANT_COMPAT_FADVISE64_64 asmlinkage long compat_sys_fadvise64_64(int fd, compat_arg_u64(pos), compat_arg_u64(len), int advice); #endif #ifdef __ARCH_WANT_COMPAT_READAHEAD asmlinkage long compat_sys_readahead(int fd, compat_arg_u64(offset), size_t count); #endif #endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */ /** * ns_to_old_timeval32 - Compat version of ns_to_timeval * @nsec: the nanoseconds value to be converted * * Returns the old_timeval32 representation of the nsec parameter. */ static inline struct old_timeval32 ns_to_old_timeval32(s64 nsec) { struct __kernel_old_timeval tv; struct old_timeval32 ctv; tv = ns_to_kernel_old_timeval(nsec); ctv.tv_sec = tv.tv_sec; ctv.tv_usec = tv.tv_usec; return ctv; } /* * Kernel code should not call compat syscalls (i.e., compat_sys_xyzyyz()) * directly. 
Instead, use one of the functions which work equivalently, such * as the kcompat_sys_xyzyyz() functions prototyped below. */ int kcompat_sys_statfs64(const char __user * pathname, compat_size_t sz, struct compat_statfs64 __user * buf); int kcompat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user * buf); #ifdef CONFIG_COMPAT /* * For most but not all architectures, "am I in a compat syscall?" and * "am I a compat task?" are the same question. For architectures on which * they aren't the same question, arch code can override in_compat_syscall. */ #ifndef in_compat_syscall static inline bool in_compat_syscall(void) { return is_compat_task(); } #endif #else /* !CONFIG_COMPAT */ #define is_compat_task() (0) /* Ensure no one redefines in_compat_syscall() under !CONFIG_COMPAT */ #define in_compat_syscall in_compat_syscall static inline bool in_compat_syscall(void) { return false; } #endif /* CONFIG_COMPAT */ #define BITS_PER_COMPAT_LONG (8*sizeof(compat_long_t)) #define BITS_TO_COMPAT_LONGS(bits) DIV_ROUND_UP(bits, BITS_PER_COMPAT_LONG) long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, unsigned long bitmap_size); long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, unsigned long bitmap_size); /* * Some legacy ABIs like the i386 one use less than natural alignment for 64-bit * types, and will need special compat treatment for that. Most architectures * don't need that special handling even for compat syscalls. */ #ifndef compat_need_64bit_alignment_fixup #define compat_need_64bit_alignment_fixup() false #endif /* * A pointer passed in from user mode. This should not * be used for syscall parameters, just declare them * as pointers because the syscall entry code will have * appropriately converted them already. */ #ifndef compat_ptr static inline void __user *compat_ptr(compat_uptr_t uptr) { return (void __user *)(unsigned long)uptr; } #endif static inline compat_uptr_t ptr_to_compat(void __user *uptr) { return (u32)(unsigned long)uptr; } #endif /* _LINUX_COMPAT_H */ |
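/*
 * Illustrative sketch, not part of the header above: one common way a
 * driver's ioctl path consumes a structure whose layout differs for 32-bit
 * callers, using in_compat_syscall(), compat_ptr() and compat_uptr_t as
 * declared above.  "struct foo_args", "struct compat_foo_args" and
 * foo_get_user_buf() are hypothetical names used only for this example.
 */
#include <linux/compat.h>
#include <linux/uaccess.h>

struct foo_args {			/* native 64-bit layout */
	__u64 buf;			/* user pointer passed as u64 */
	__u32 len;
	__u32 flags;
};

struct compat_foo_args {		/* 32-bit layout */
	compat_uptr_t buf;		/* 32-bit user pointer */
	__u32 len;
	__u32 flags;
};

static long foo_get_user_buf(void __user *arg, void __user **buf,
			     u32 *len, u32 *flags)
{
	if (in_compat_syscall()) {
		struct compat_foo_args c;

		if (copy_from_user(&c, arg, sizeof(c)))
			return -EFAULT;
		*buf = compat_ptr(c.buf);	/* widen the 32-bit pointer */
		*len = c.len;
		*flags = c.flags;
	} else {
		struct foo_args a;

		if (copy_from_user(&a, arg, sizeof(a)))
			return -EFAULT;
		*buf = u64_to_user_ptr(a.buf);
		*len = a.len;
		*flags = a.flags;
	}
	return 0;
}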
// SPDX-License-Identifier: GPL-2.0
/*
 * module.c - module sysfs fun for drivers
 */
#include <linux/device.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>
#include "base.h"

static char *make_driver_name(const struct device_driver *drv)
{
	char *driver_name;

	driver_name = kasprintf(GFP_KERNEL, "%s:%s", drv->bus->name, drv->name);
	if (!driver_name)
		return NULL;

	return driver_name;
}

static void module_create_drivers_dir(struct module_kobject *mk)
{
	static DEFINE_MUTEX(drivers_dir_mutex);

	mutex_lock(&drivers_dir_mutex);
	if (mk && !mk->drivers_dir)
		mk->drivers_dir = kobject_create_and_add("drivers", &mk->kobj);
	mutex_unlock(&drivers_dir_mutex);
}

int module_add_driver(struct module *mod, const struct device_driver *drv)
{
	char *driver_name;
	struct module_kobject *mk = NULL;
	int ret;

	if (!drv)
		return 0;

	if (mod)
		mk = &mod->mkobj;
	else if (drv->mod_name) {
		/* Lookup or create built-in module entry in /sys/modules */
		mk = lookup_or_create_module_kobject(drv->mod_name);
		if (mk) {
			/* remember our module structure */
			drv->p->mkobj = mk;
			/* lookup_or_create_module_kobject took a reference */
			kobject_put(&mk->kobj);
		}
	}

	if (!mk)
		return 0;

	ret = sysfs_create_link(&drv->p->kobj, &mk->kobj, "module");
	if (ret)
		return ret;

	driver_name = make_driver_name(drv);
	if (!driver_name) {
		ret = -ENOMEM;
		goto out_remove_kobj;
	}

	module_create_drivers_dir(mk);
	if (!mk->drivers_dir) {
		ret = -EINVAL;
		goto out_free_driver_name;
	}

	ret = sysfs_create_link(mk->drivers_dir, &drv->p->kobj, driver_name);
	if (ret)
		goto out_remove_drivers_dir;

	kfree(driver_name);

	return 0;

out_remove_drivers_dir:
	sysfs_remove_link(mk->drivers_dir, driver_name);

out_free_driver_name:
	kfree(driver_name);

out_remove_kobj:
	sysfs_remove_link(&drv->p->kobj, "module");
	return ret;
}

void module_remove_driver(const struct device_driver *drv)
{
	struct module_kobject *mk = NULL;
	char *driver_name;

	if (!drv)
		return;

	sysfs_remove_link(&drv->p->kobj, "module");

	if (drv->owner)
		mk = &drv->owner->mkobj;
	else if (drv->p->mkobj)
		mk = drv->p->mkobj;

	if (mk && mk->drivers_dir) {
		driver_name = make_driver_name(drv);
		if (driver_name) {
			sysfs_remove_link(mk->drivers_dir, driver_name);
			kfree(driver_name);
		}
	}
}
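/*
 * Illustrative sketch, not part of module.c above: a minimal driver whose
 * registration reaches module_add_driver() via bus_add_driver().  Assuming
 * the object is built as foo_demo.ko, the resulting links are roughly
 *
 *   /sys/module/foo_demo/drivers/platform:foo-demo -> .../bus/platform/drivers/foo-demo
 *   /sys/bus/platform/drivers/foo-demo/module      -> /sys/module/foo_demo
 *
 * where "platform:foo-demo" is the "<bus>:<driver>" string produced by
 * make_driver_name().  The driver and module names are made up for the
 * example.
 */
#include <linux/module.h>
#include <linux/platform_device.h>

static int foo_demo_probe(struct platform_device *pdev)
{
	return 0;
}

static struct platform_driver foo_demo_driver = {
	.probe = foo_demo_probe,
	.driver = {
		.name = "foo-demo",
	},
};
module_platform_driver(foo_demo_driver);

MODULE_DESCRIPTION("Example driver used to illustrate the sysfs links above");
MODULE_LICENSE("GPL");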
/*
 * linux/fs/hfs/inode.c
 *
 * Copyright (C) 1995-1997 Paul H. Hargrove
 * (C) 2003 Ardis Technologies <roman@ardistech.com>
 * This file may be distributed under the terms of the GNU General Public License.
 *
 * This file contains inode-related functions which do not depend on
 * which scheme is being used to represent forks.
* * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds */ #include <linux/pagemap.h> #include <linux/mpage.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/uio.h> #include <linux/xattr.h> #include <linux/blkdev.h> #include "hfs_fs.h" #include "btree.h" static const struct file_operations hfs_file_operations; static const struct inode_operations hfs_file_inode_operations; /*================ Variable-like macros ================*/ #define HFS_VALID_MODE_BITS (S_IFREG | S_IFDIR | S_IRWXUGO) static int hfs_read_folio(struct file *file, struct folio *folio) { return block_read_full_folio(folio, hfs_get_block); } static void hfs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; if (to > inode->i_size) { truncate_pagecache(inode, inode->i_size); hfs_file_truncate(inode); } } int hfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned int len, struct folio **foliop, void **fsdata) { int ret; ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata, hfs_get_block, &HFS_I(mapping->host)->phys_size); if (unlikely(ret)) hfs_write_failed(mapping, pos + len); return ret; } static sector_t hfs_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping, block, hfs_get_block); } static bool hfs_release_folio(struct folio *folio, gfp_t mask) { struct inode *inode = folio->mapping->host; struct super_block *sb = inode->i_sb; struct hfs_btree *tree; struct hfs_bnode *node; u32 nidx; int i; bool res = true; switch (inode->i_ino) { case HFS_EXT_CNID: tree = HFS_SB(sb)->ext_tree; break; case HFS_CAT_CNID: tree = HFS_SB(sb)->cat_tree; break; default: BUG(); return false; } if (!tree) return false; if (tree->node_size >= PAGE_SIZE) { nidx = folio->index >> (tree->node_size_shift - PAGE_SHIFT); spin_lock(&tree->hash_lock); node = hfs_bnode_findhash(tree, nidx); if (!node) ; else if (atomic_read(&node->refcnt)) res = false; if (res && node) { hfs_bnode_unhash(node); hfs_bnode_free(node); } spin_unlock(&tree->hash_lock); } else { nidx = folio->index << (PAGE_SHIFT - tree->node_size_shift); i = 1 << (PAGE_SHIFT - tree->node_size_shift); spin_lock(&tree->hash_lock); do { node = hfs_bnode_findhash(tree, nidx++); if (!node) continue; if (atomic_read(&node->refcnt)) { res = false; break; } hfs_bnode_unhash(node); hfs_bnode_free(node); } while (--i && nidx < tree->node_count); spin_unlock(&tree->hash_lock); } return res ? try_to_free_buffers(folio) : false; } static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); ssize_t ret; ret = blockdev_direct_IO(iocb, inode, iter, hfs_get_block); /* * In case of error extending write may have instantiated a few * blocks outside i_size. Trim these off again. 
*/ if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { loff_t isize = i_size_read(inode); loff_t end = iocb->ki_pos + count; if (end > isize) hfs_write_failed(mapping, end); } return ret; } static int hfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { return mpage_writepages(mapping, wbc, hfs_get_block); } const struct address_space_operations hfs_btree_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = hfs_read_folio, .writepages = hfs_writepages, .write_begin = hfs_write_begin, .write_end = generic_write_end, .migrate_folio = buffer_migrate_folio, .bmap = hfs_bmap, .release_folio = hfs_release_folio, }; const struct address_space_operations hfs_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = hfs_read_folio, .write_begin = hfs_write_begin, .write_end = generic_write_end, .bmap = hfs_bmap, .direct_IO = hfs_direct_IO, .writepages = hfs_writepages, .migrate_folio = buffer_migrate_folio, }; /* * hfs_new_inode */ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t mode) { struct super_block *sb = dir->i_sb; struct inode *inode = new_inode(sb); s64 next_id; s64 file_count; s64 folder_count; int err = -ENOMEM; if (!inode) goto out_err; err = -ERANGE; mutex_init(&HFS_I(inode)->extents_lock); INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); spin_lock_init(&HFS_I(inode)->open_dir_lock); hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name); next_id = atomic64_inc_return(&HFS_SB(sb)->next_id); if (next_id > U32_MAX) { atomic64_dec(&HFS_SB(sb)->next_id); pr_err("cannot create new inode: next CNID exceeds limit\n"); goto out_discard; } inode->i_ino = (u32)next_id; inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); set_nlink(inode, 1); simple_inode_init_ts(inode); HFS_I(inode)->flags = 0; HFS_I(inode)->rsrc_inode = NULL; HFS_I(inode)->fs_blocks = 0; HFS_I(inode)->tz_secondswest = sys_tz.tz_minuteswest * 60; if (S_ISDIR(mode)) { inode->i_size = 2; folder_count = atomic64_inc_return(&HFS_SB(sb)->folder_count); if (folder_count> U32_MAX) { atomic64_dec(&HFS_SB(sb)->folder_count); pr_err("cannot create new inode: folder count exceeds limit\n"); goto out_discard; } if (dir->i_ino == HFS_ROOT_CNID) HFS_SB(sb)->root_dirs++; inode->i_op = &hfs_dir_inode_operations; inode->i_fop = &hfs_dir_operations; inode->i_mode |= S_IRWXUGO; inode->i_mode &= ~HFS_SB(inode->i_sb)->s_dir_umask; } else if (S_ISREG(mode)) { HFS_I(inode)->clump_blocks = HFS_SB(sb)->clumpablks; file_count = atomic64_inc_return(&HFS_SB(sb)->file_count); if (file_count > U32_MAX) { atomic64_dec(&HFS_SB(sb)->file_count); pr_err("cannot create new inode: file count exceeds limit\n"); goto out_discard; } if (dir->i_ino == HFS_ROOT_CNID) HFS_SB(sb)->root_files++; inode->i_op = &hfs_file_inode_operations; inode->i_fop = &hfs_file_operations; inode->i_mapping->a_ops = &hfs_aops; inode->i_mode |= S_IRUGO|S_IXUGO; if (mode & S_IWUSR) inode->i_mode |= S_IWUGO; inode->i_mode &= ~HFS_SB(inode->i_sb)->s_file_umask; HFS_I(inode)->phys_size = 0; HFS_I(inode)->alloc_blocks = 0; HFS_I(inode)->first_blocks = 0; HFS_I(inode)->cached_start = 0; HFS_I(inode)->cached_blocks = 0; memset(HFS_I(inode)->first_extents, 0, sizeof(hfs_extent_rec)); memset(HFS_I(inode)->cached_extents, 0, sizeof(hfs_extent_rec)); } insert_inode_hash(inode); mark_inode_dirty(inode); set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); hfs_mark_mdb_dirty(sb); return inode; out_discard: 
iput(inode); out_err: return ERR_PTR(err); } void hfs_delete_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; hfs_dbg("ino %llu\n", inode->i_ino); if (S_ISDIR(inode->i_mode)) { atomic64_dec(&HFS_SB(sb)->folder_count); if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID)) HFS_SB(sb)->root_dirs--; set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); hfs_mark_mdb_dirty(sb); return; } atomic64_dec(&HFS_SB(sb)->file_count); if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID)) HFS_SB(sb)->root_files--; if (S_ISREG(inode->i_mode)) { if (!inode->i_nlink) { inode->i_size = 0; hfs_file_truncate(inode); } } set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); hfs_mark_mdb_dirty(sb); } void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, __be32 __log_size, __be32 phys_size, u32 clump_size) { struct super_block *sb = inode->i_sb; u32 log_size = be32_to_cpu(__log_size); u16 count; int i; memcpy(HFS_I(inode)->first_extents, ext, sizeof(hfs_extent_rec)); for (count = 0, i = 0; i < 3; i++) count += be16_to_cpu(ext[i].count); HFS_I(inode)->first_blocks = count; HFS_I(inode)->cached_start = 0; HFS_I(inode)->cached_blocks = 0; inode->i_size = HFS_I(inode)->phys_size = log_size; HFS_I(inode)->fs_blocks = (log_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; inode_set_bytes(inode, HFS_I(inode)->fs_blocks << sb->s_blocksize_bits); HFS_I(inode)->alloc_blocks = be32_to_cpu(phys_size) / HFS_SB(sb)->alloc_blksz; HFS_I(inode)->clump_blocks = clump_size / HFS_SB(sb)->alloc_blksz; if (!HFS_I(inode)->clump_blocks) HFS_I(inode)->clump_blocks = HFS_SB(sb)->clumpablks; } struct hfs_iget_data { struct hfs_cat_key *key; hfs_cat_rec *rec; }; static int hfs_test_inode(struct inode *inode, void *data) { struct hfs_iget_data *idata = data; hfs_cat_rec *rec; rec = idata->rec; switch (rec->type) { case HFS_CDR_DIR: return inode->i_ino == be32_to_cpu(rec->dir.DirID); case HFS_CDR_FIL: return inode->i_ino == be32_to_cpu(rec->file.FlNum); default: BUG(); return 1; } } /* * hfs_read_inode */ static int hfs_read_inode(struct inode *inode, void *data) { struct hfs_iget_data *idata = data; struct hfs_sb_info *hsb = HFS_SB(inode->i_sb); hfs_cat_rec *rec; HFS_I(inode)->flags = 0; HFS_I(inode)->rsrc_inode = NULL; mutex_init(&HFS_I(inode)->extents_lock); INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); spin_lock_init(&HFS_I(inode)->open_dir_lock); /* Initialize the inode */ inode->i_uid = hsb->s_uid; inode->i_gid = hsb->s_gid; set_nlink(inode, 1); if (idata->key) HFS_I(inode)->cat_key = *idata->key; else HFS_I(inode)->flags |= HFS_FLG_RSRC; HFS_I(inode)->tz_secondswest = sys_tz.tz_minuteswest * 60; rec = idata->rec; switch (rec->type) { case HFS_CDR_FIL: if (!HFS_IS_RSRC(inode)) { hfs_inode_read_fork(inode, rec->file.ExtRec, rec->file.LgLen, rec->file.PyLen, be16_to_cpu(rec->file.ClpSize)); } else { hfs_inode_read_fork(inode, rec->file.RExtRec, rec->file.RLgLen, rec->file.RPyLen, be16_to_cpu(rec->file.ClpSize)); } inode->i_ino = be32_to_cpu(rec->file.FlNum); inode->i_mode = S_IRUGO | S_IXUGO; if (!(rec->file.Flags & HFS_FIL_LOCK)) inode->i_mode |= S_IWUGO; inode->i_mode &= ~hsb->s_file_umask; inode->i_mode |= S_IFREG; inode_set_mtime_to_ts(inode, inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, hfs_m_to_utime(rec->file.MdDat)))); inode->i_op = &hfs_file_inode_operations; inode->i_fop = &hfs_file_operations; inode->i_mapping->a_ops = &hfs_aops; break; case HFS_CDR_DIR: inode->i_ino = be32_to_cpu(rec->dir.DirID); inode->i_size = be16_to_cpu(rec->dir.Val) + 2; HFS_I(inode)->fs_blocks = 0; 
inode->i_mode = S_IFDIR | (S_IRWXUGO & ~hsb->s_dir_umask); inode_set_mtime_to_ts(inode, inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, hfs_m_to_utime(rec->dir.MdDat)))); inode->i_op = &hfs_dir_inode_operations; inode->i_fop = &hfs_dir_operations; break; default: make_bad_inode(inode); } return 0; } /* * __hfs_iget() * * Given the MDB for a HFS filesystem, a 'key' and an 'entry' in * the catalog B-tree and the 'type' of the desired file return the * inode for that file/directory or NULL. Note that 'type' indicates * whether we want the actual file or directory, or the corresponding * metadata (AppleDouble header file or CAP metadata file). */ struct inode *hfs_iget(struct super_block *sb, struct hfs_cat_key *key, hfs_cat_rec *rec) { struct hfs_iget_data data = { key, rec }; struct inode *inode; u32 cnid; switch (rec->type) { case HFS_CDR_DIR: cnid = be32_to_cpu(rec->dir.DirID); break; case HFS_CDR_FIL: cnid = be32_to_cpu(rec->file.FlNum); break; default: return NULL; } inode = iget5_locked(sb, cnid, hfs_test_inode, hfs_read_inode, &data); if (inode && (inode_state_read_once(inode) & I_NEW)) unlock_new_inode(inode); return inode; } void hfs_inode_write_fork(struct inode *inode, struct hfs_extent *ext, __be32 *log_size, __be32 *phys_size) { memcpy(ext, HFS_I(inode)->first_extents, sizeof(hfs_extent_rec)); if (log_size) *log_size = cpu_to_be32(inode->i_size); if (phys_size) *phys_size = cpu_to_be32(HFS_I(inode)->alloc_blocks * HFS_SB(inode->i_sb)->alloc_blksz); } int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct inode *main_inode = inode; struct hfs_find_data fd; hfs_cat_rec rec; int res; hfs_dbg("ino %llu\n", inode->i_ino); res = hfs_ext_write_extent(inode); if (res) return res; if (inode->i_ino < HFS_FIRSTUSER_CNID) { switch (inode->i_ino) { case HFS_ROOT_CNID: break; case HFS_EXT_CNID: hfs_btree_write(HFS_SB(inode->i_sb)->ext_tree); return 0; case HFS_CAT_CNID: hfs_btree_write(HFS_SB(inode->i_sb)->cat_tree); return 0; default: BUG(); return -EIO; } } if (HFS_IS_RSRC(inode)) main_inode = HFS_I(inode)->rsrc_inode; if (!main_inode->i_nlink) return 0; if (hfs_find_init(HFS_SB(main_inode->i_sb)->cat_tree, &fd)) /* panic? 
*/ return -EIO; res = -EIO; if (HFS_I(main_inode)->cat_key.CName.len > HFS_NAMELEN) goto out; fd.search_key->cat = HFS_I(main_inode)->cat_key; if (hfs_brec_find(&fd)) goto out; if (S_ISDIR(main_inode->i_mode)) { if (fd.entrylength < sizeof(struct hfs_cat_dir)) goto out; hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_dir)); if (rec.type != HFS_CDR_DIR || be32_to_cpu(rec.dir.DirID) != inode->i_ino) { } rec.dir.MdDat = hfs_u_to_mtime(inode_get_mtime(inode)); rec.dir.Val = cpu_to_be16(inode->i_size - 2); hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_dir)); } else if (HFS_IS_RSRC(inode)) { if (fd.entrylength < sizeof(struct hfs_cat_file)) goto out; hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_file)); hfs_inode_write_fork(inode, rec.file.RExtRec, &rec.file.RLgLen, &rec.file.RPyLen); hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_file)); } else { if (fd.entrylength < sizeof(struct hfs_cat_file)) goto out; hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_file)); if (rec.type != HFS_CDR_FIL || be32_to_cpu(rec.file.FlNum) != inode->i_ino) { } if (inode->i_mode & S_IWUSR) rec.file.Flags &= ~HFS_FIL_LOCK; else rec.file.Flags |= HFS_FIL_LOCK; hfs_inode_write_fork(inode, rec.file.ExtRec, &rec.file.LgLen, &rec.file.PyLen); rec.file.MdDat = hfs_u_to_mtime(inode_get_mtime(inode)); hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_file)); } res = 0; out: hfs_find_exit(&fd); return res; } static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode = NULL; hfs_cat_rec rec; struct hfs_find_data fd; int res; if (HFS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc")) goto out; inode = HFS_I(dir)->rsrc_inode; if (inode) goto out; inode = new_inode(dir->i_sb); if (!inode) return ERR_PTR(-ENOMEM); res = hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); if (res) { iput(inode); return ERR_PTR(res); } fd.search_key->cat = HFS_I(dir)->cat_key; res = hfs_brec_read(&fd, &rec, sizeof(rec)); if (!res) { struct hfs_iget_data idata = { NULL, &rec }; hfs_read_inode(inode, &idata); } hfs_find_exit(&fd); if (res) { iput(inode); return ERR_PTR(res); } HFS_I(inode)->rsrc_inode = dir; HFS_I(dir)->rsrc_inode = inode; igrab(dir); inode_fake_hash(inode); mark_inode_dirty(inode); dont_mount(dentry); out: return d_splice_alias(inode, dentry); } void hfs_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) { HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL; iput(HFS_I(inode)->rsrc_inode); } } static int hfs_file_open(struct inode *inode, struct file *file) { if (HFS_IS_RSRC(inode)) inode = HFS_I(inode)->rsrc_inode; atomic_inc(&HFS_I(inode)->opencnt); return 0; } static int hfs_file_release(struct inode *inode, struct file *file) { //struct super_block *sb = inode->i_sb; if (HFS_IS_RSRC(inode)) inode = HFS_I(inode)->rsrc_inode; if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) { inode_lock(inode); hfs_file_truncate(inode); //if (inode->i_flags & S_DEAD) { // hfs_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL); // hfs_delete_inode(inode); //} inode_unlock(inode); } return 0; } int hfs_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct hfs_sb_info *hsb = HFS_SB(inode->i_sb); int error; error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return 
error; /* no uig/gid changes and limit which mode bits can be set */ if (((attr->ia_valid & ATTR_UID) && (!uid_eq(attr->ia_uid, hsb->s_uid))) || ((attr->ia_valid & ATTR_GID) && (!gid_eq(attr->ia_gid, hsb->s_gid))) || ((attr->ia_valid & ATTR_MODE) && ((S_ISDIR(inode->i_mode) && (attr->ia_mode != inode->i_mode)) || (attr->ia_mode & ~HFS_VALID_MODE_BITS)))) { return hsb->s_quiet ? 0 : error; } /* map file permissions to the closest allowable permissions in HFS */ if (attr->ia_valid & ATTR_MODE) { /* Only the 'w' bits can ever change and only all together. */ if (attr->ia_mode & S_IWUSR) attr->ia_mode = inode->i_mode | S_IWUGO; else attr->ia_mode = inode->i_mode & ~S_IWUGO; attr->ia_mode &= S_ISDIR(inode->i_mode) ? ~hsb->s_dir_umask: ~hsb->s_file_umask; } if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { inode_dio_wait(inode); error = inode_newsize_ok(inode, attr->ia_size); if (error) return error; truncate_setsize(inode, attr->ia_size); hfs_file_truncate(inode); simple_inode_init_ts(inode); } setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *inode = filp->f_mapping->host; struct super_block * sb; int ret, err; ret = file_write_and_wait_range(filp, start, end); if (ret) return ret; inode_lock(inode); /* sync the inode to buffers */ ret = write_inode_now(inode, 0); /* sync the superblock to buffers */ sb = inode->i_sb; flush_delayed_work(&HFS_SB(sb)->mdb_work); /* .. finally sync the buffers to disk */ err = sync_blockdev(sb->s_bdev); if (!ret) ret = err; inode_unlock(inode); return ret; } static const struct file_operations hfs_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .mmap_prepare = generic_file_mmap_prepare, .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .fsync = hfs_file_fsync, .open = hfs_file_open, .release = hfs_file_release, }; static const struct inode_operations hfs_file_inode_operations = { .lookup = hfs_file_lookup, .setattr = hfs_inode_setattr, .listxattr = generic_listxattr, }; |
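/*
 * Illustrative sketch (userspace, not part of inode.c above): as the
 * comment in hfs_inode_setattr() says, only the 'w' permission bits can
 * change, and only all together, so a chmod that keeps any write bit ends
 * up setting all of them (masked by the mount's file umask).  The mount
 * point and file name are made up for the example.
 */
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	struct stat st;
	const char *path = "/mnt/hfs/Foo";	/* a file on an hfs mount */

	/* Ask for rw-r--r--; since S_IWUSR is set, hfs turns on all write
	 * bits, so the result is roughly rw-rw-rw- & ~file_umask. */
	if (chmod(path, 0644) || stat(path, &st)) {
		perror(path);
		return 1;
	}
	printf("mode now %04o\n", st.st_mode & 07777);
	return 0;
}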
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013
 * Phillip Lougher <phillip@squashfs.org.uk>
 */

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include "squashfs_fs_sb.h"
#include "decompressor.h"
#include "page_actor.h"

/*
 * This file contains implementations of page_actor for decompressing into
 * an intermediate buffer, and for decompressing directly into the
 * page cache.
 *
 * Calling code should avoid sleeping between calls to squashfs_first_page()
 * and squashfs_finish_page().
 */

/* Implementation of page_actor for decompressing into intermediate buffer */
static void *cache_first_page(struct squashfs_page_actor *actor)
{
	actor->next_page = 1;
	return actor->buffer[0];
}

static void *cache_next_page(struct squashfs_page_actor *actor)
{
	if (actor->next_page == actor->pages)
		return NULL;

	return actor->buffer[actor->next_page++];
}

static void cache_finish_page(struct squashfs_page_actor *actor)
{
	/* empty */
}

struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, int pages,
	int length)
{
	struct squashfs_page_actor *actor = kmalloc_obj(*actor);

	if (actor == NULL)
		return NULL;

	actor->length = length ? : pages * PAGE_SIZE;
	actor->buffer = buffer;
	actor->pages = pages;
	actor->next_page = 0;
	actor->tmp_buffer = NULL;
	actor->squashfs_first_page = cache_first_page;
	actor->squashfs_next_page = cache_next_page;
	actor->squashfs_finish_page = cache_finish_page;
	return actor;
}

/* Implementation of page_actor for decompressing directly into page cache. */
static loff_t page_next_index(struct squashfs_page_actor *actor)
{
	return page_folio(actor->page[actor->next_page])->index;
}

static void *handle_next_page(struct squashfs_page_actor *actor)
{
	int max_pages = (actor->length + PAGE_SIZE - 1) >> PAGE_SHIFT;

	if (actor->returned_pages == max_pages)
		return NULL;

	if ((actor->next_page == actor->pages) ||
			(actor->next_index != page_next_index(actor))) {
		actor->next_index++;
		actor->returned_pages++;
		actor->last_page = NULL;
		return actor->alloc_buffer ?
actor->tmp_buffer : ERR_PTR(-ENOMEM); } actor->next_index++; actor->returned_pages++; actor->last_page = actor->page[actor->next_page]; return actor->pageaddr = kmap_local_page(actor->page[actor->next_page++]); } static void *direct_first_page(struct squashfs_page_actor *actor) { return handle_next_page(actor); } static void *direct_next_page(struct squashfs_page_actor *actor) { if (actor->pageaddr) { kunmap_local(actor->pageaddr); actor->pageaddr = NULL; } return handle_next_page(actor); } static void direct_finish_page(struct squashfs_page_actor *actor) { if (actor->pageaddr) kunmap_local(actor->pageaddr); } struct squashfs_page_actor *squashfs_page_actor_init_special(struct squashfs_sb_info *msblk, struct page **page, int pages, int length, loff_t start_index) { struct squashfs_page_actor *actor = kmalloc_obj(*actor); if (actor == NULL) return NULL; if (msblk->decompressor->alloc_buffer) { actor->tmp_buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); if (actor->tmp_buffer == NULL) { kfree(actor); return NULL; } } else actor->tmp_buffer = NULL; actor->length = length ? : pages * PAGE_SIZE; actor->page = page; actor->pages = pages; actor->next_page = 0; actor->returned_pages = 0; actor->next_index = start_index >> PAGE_SHIFT; actor->pageaddr = NULL; actor->last_page = NULL; actor->alloc_buffer = msblk->decompressor->alloc_buffer; actor->squashfs_first_page = direct_first_page; actor->squashfs_next_page = direct_next_page; actor->squashfs_finish_page = direct_finish_page; return actor; } |
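/*
 * Illustrative sketch, not part of page_actor.c above: the shape of the
 * loop a decompressor backend uses to feed its output into an actor via
 * the squashfs_first_page()/squashfs_next_page()/squashfs_finish_page()
 * wrappers (defined in page_actor.h).  copy_chunk_out() and the "src"
 * buffer are made-up names; a real backend decompresses in place of the
 * memcpy().  It assumes the same kernel context and headers as the file
 * above, and it must not sleep between first_page and finish_page.
 */
static int copy_chunk_out(struct squashfs_page_actor *actor,
			  const char *src, int length)
{
	int bytes = length;
	void *dst = squashfs_first_page(actor);

	while (dst) {
		/* NULL or ERR_PTR() means "skip this destination page" */
		if (bytes <= PAGE_SIZE) {
			if (!IS_ERR(dst))
				memcpy(dst, src, bytes);
			break;
		}
		if (!IS_ERR(dst))
			memcpy(dst, src, PAGE_SIZE);
		src += PAGE_SIZE;
		bytes -= PAGE_SIZE;
		dst = squashfs_next_page(actor);
	}

	squashfs_finish_page(actor);	/* drops the last kmap, if any */
	return length;
}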
// SPDX-License-Identifier: GPL-2.0
/*
 * device_cgroup.c - device cgroup subsystem
 *
 * Copyright 2007 IBM Corp
 */

#include <linux/bpf-cgroup.h>
#include
<linux/device_cgroup.h> #include <linux/cgroup.h> #include <linux/ctype.h> #include <linux/list.h> #include <linux/uaccess.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/rcupdate.h> #include <linux/mutex.h> #ifdef CONFIG_CGROUP_DEVICE static DEFINE_MUTEX(devcgroup_mutex); enum devcg_behavior { DEVCG_DEFAULT_NONE, DEVCG_DEFAULT_ALLOW, DEVCG_DEFAULT_DENY, }; /* * exception list locking rules: * hold devcgroup_mutex for update/read. * hold rcu_read_lock() for read. */ struct dev_exception_item { u32 major, minor; short type; short access; struct list_head list; struct rcu_head rcu; }; struct dev_cgroup { struct cgroup_subsys_state css; struct list_head exceptions; enum devcg_behavior behavior; }; static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s) { return s ? container_of(s, struct dev_cgroup, css) : NULL; } static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) { return css_to_devcgroup(task_css(task, devices_cgrp_id)); } /* * called under devcgroup_mutex */ static int dev_exceptions_copy(struct list_head *dest, struct list_head *orig) { struct dev_exception_item *ex, *tmp, *new; lockdep_assert_held(&devcgroup_mutex); list_for_each_entry(ex, orig, list) { new = kmemdup(ex, sizeof(*ex), GFP_KERNEL); if (!new) goto free_and_exit; list_add_tail(&new->list, dest); } return 0; free_and_exit: list_for_each_entry_safe(ex, tmp, dest, list) { list_del(&ex->list); kfree(ex); } return -ENOMEM; } static void dev_exceptions_move(struct list_head *dest, struct list_head *orig) { struct dev_exception_item *ex, *tmp; lockdep_assert_held(&devcgroup_mutex); list_for_each_entry_safe(ex, tmp, orig, list) { list_move_tail(&ex->list, dest); } } /* * called under devcgroup_mutex */ static int dev_exception_add(struct dev_cgroup *dev_cgroup, struct dev_exception_item *ex) { struct dev_exception_item *excopy, *walk; lockdep_assert_held(&devcgroup_mutex); excopy = kmemdup(ex, sizeof(*ex), GFP_KERNEL); if (!excopy) return -ENOMEM; list_for_each_entry(walk, &dev_cgroup->exceptions, list) { if (walk->type != ex->type) continue; if (walk->major != ex->major) continue; if (walk->minor != ex->minor) continue; walk->access |= ex->access; kfree(excopy); excopy = NULL; } if (excopy != NULL) list_add_tail_rcu(&excopy->list, &dev_cgroup->exceptions); return 0; } /* * called under devcgroup_mutex */ static void dev_exception_rm(struct dev_cgroup *dev_cgroup, struct dev_exception_item *ex) { struct dev_exception_item *walk, *tmp; lockdep_assert_held(&devcgroup_mutex); list_for_each_entry_safe(walk, tmp, &dev_cgroup->exceptions, list) { if (walk->type != ex->type) continue; if (walk->major != ex->major) continue; if (walk->minor != ex->minor) continue; walk->access &= ~ex->access; if (!walk->access) { list_del_rcu(&walk->list); kfree_rcu(walk, rcu); } } } static void __dev_exception_clean(struct dev_cgroup *dev_cgroup) { struct dev_exception_item *ex, *tmp; list_for_each_entry_safe(ex, tmp, &dev_cgroup->exceptions, list) { list_del_rcu(&ex->list); kfree_rcu(ex, rcu); } } /** * dev_exception_clean - frees all entries of the exception list * @dev_cgroup: dev_cgroup with the exception list to be cleaned * * called under devcgroup_mutex */ static void dev_exception_clean(struct dev_cgroup *dev_cgroup) { lockdep_assert_held(&devcgroup_mutex); __dev_exception_clean(dev_cgroup); } static inline bool is_devcg_online(const struct dev_cgroup *devcg) { return (devcg->behavior != DEVCG_DEFAULT_NONE); } /** * devcgroup_online - initializes devcgroup's behavior and 
exceptions based on * parent's * @css: css getting online * returns 0 in case of success, error code otherwise */ static int devcgroup_online(struct cgroup_subsys_state *css) { struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); struct dev_cgroup *parent_dev_cgroup = css_to_devcgroup(css->parent); int ret = 0; mutex_lock(&devcgroup_mutex); if (parent_dev_cgroup == NULL) dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW; else { ret = dev_exceptions_copy(&dev_cgroup->exceptions, &parent_dev_cgroup->exceptions); if (!ret) dev_cgroup->behavior = parent_dev_cgroup->behavior; } mutex_unlock(&devcgroup_mutex); return ret; } static void devcgroup_offline(struct cgroup_subsys_state *css) { struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); mutex_lock(&devcgroup_mutex); dev_cgroup->behavior = DEVCG_DEFAULT_NONE; mutex_unlock(&devcgroup_mutex); } /* * called from kernel/cgroup/cgroup.c with cgroup_lock() held. */ static struct cgroup_subsys_state * devcgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct dev_cgroup *dev_cgroup; dev_cgroup = kzalloc_obj(*dev_cgroup); if (!dev_cgroup) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&dev_cgroup->exceptions); dev_cgroup->behavior = DEVCG_DEFAULT_NONE; return &dev_cgroup->css; } static void devcgroup_css_free(struct cgroup_subsys_state *css) { struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); __dev_exception_clean(dev_cgroup); kfree(dev_cgroup); } #define DEVCG_ALLOW 1 #define DEVCG_DENY 2 #define DEVCG_LIST 3 static void seq_putaccess(struct seq_file *m, short access) { if (access & DEVCG_ACC_READ) seq_putc(m, 'r'); if (access & DEVCG_ACC_WRITE) seq_putc(m, 'w'); if (access & DEVCG_ACC_MKNOD) seq_putc(m, 'm'); } static void seq_puttype(struct seq_file *m, short type) { if (type == DEVCG_DEV_ALL) seq_putc(m, 'a'); else if (type == DEVCG_DEV_CHAR) seq_putc(m, 'c'); else if (type == DEVCG_DEV_BLOCK) seq_putc(m, 'b'); else seq_putc(m, 'X'); } static void seq_putversion(struct seq_file *m, unsigned int version) { if (version == ~0) seq_putc(m, '*'); else seq_printf(m, "%u", version); } static int devcgroup_seq_show(struct seq_file *m, void *v) { struct dev_cgroup *devcgroup = css_to_devcgroup(seq_css(m)); struct dev_exception_item *ex; rcu_read_lock(); /* * To preserve the compatibility: * - Only show the "all devices" when the default policy is to allow * - List the exceptions in case the default policy is to deny * This way, the file remains as a "whitelist of devices" */ if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) { seq_puts(m, "a *:* rwm\n"); } else { list_for_each_entry_rcu(ex, &devcgroup->exceptions, list) { seq_puttype(m, ex->type); seq_putc(m, ' '); seq_putversion(m, ex->major); seq_putc(m, ':'); seq_putversion(m, ex->minor); seq_putc(m, ' '); seq_putaccess(m, ex->access); seq_putc(m, '\n'); } } rcu_read_unlock(); return 0; } /** * match_exception - iterates the exception list trying to find a complete match * @exceptions: list of exceptions * @type: device type (DEVCG_DEV_BLOCK or DEVCG_DEV_CHAR) * @major: device file major number, ~0 to match all * @minor: device file minor number, ~0 to match all * @access: permission mask (DEVCG_ACC_READ, DEVCG_ACC_WRITE, DEVCG_ACC_MKNOD) * * It is considered a complete match if an exception is found that will * contain the entire range of provided parameters. 
* * Return: true in case it matches an exception completely */ static bool match_exception(struct list_head *exceptions, short type, u32 major, u32 minor, short access) { struct dev_exception_item *ex; list_for_each_entry_rcu(ex, exceptions, list) { if ((type & DEVCG_DEV_BLOCK) && !(ex->type & DEVCG_DEV_BLOCK)) continue; if ((type & DEVCG_DEV_CHAR) && !(ex->type & DEVCG_DEV_CHAR)) continue; if (ex->major != ~0 && ex->major != major) continue; if (ex->minor != ~0 && ex->minor != minor) continue; /* provided access cannot have more than the exception rule */ if (access & (~ex->access)) continue; return true; } return false; } /** * match_exception_partial - iterates the exception list trying to find a partial match * @exceptions: list of exceptions * @type: device type (DEVCG_DEV_BLOCK or DEVCG_DEV_CHAR) * @major: device file major number, ~0 to match all * @minor: device file minor number, ~0 to match all * @access: permission mask (DEVCG_ACC_READ, DEVCG_ACC_WRITE, DEVCG_ACC_MKNOD) * * It is considered a partial match if an exception's range is found to * contain *any* of the devices specified by provided parameters. This is * used to make sure no extra access is being granted that is forbidden by * any of the exception list. * * Return: true in case the provided range mat matches an exception completely */ static bool match_exception_partial(struct list_head *exceptions, short type, u32 major, u32 minor, short access) { struct dev_exception_item *ex; list_for_each_entry_rcu(ex, exceptions, list, lockdep_is_held(&devcgroup_mutex)) { if ((type & DEVCG_DEV_BLOCK) && !(ex->type & DEVCG_DEV_BLOCK)) continue; if ((type & DEVCG_DEV_CHAR) && !(ex->type & DEVCG_DEV_CHAR)) continue; /* * We must be sure that both the exception and the provided * range aren't masking all devices */ if (ex->major != ~0 && major != ~0 && ex->major != major) continue; if (ex->minor != ~0 && minor != ~0 && ex->minor != minor) continue; /* * In order to make sure the provided range isn't matching * an exception, all its access bits shouldn't match the * exception's access bits */ if (!(access & ex->access)) continue; return true; } return false; } /** * verify_new_ex - verifies if a new exception is allowed by parent cgroup's permissions * @dev_cgroup: dev cgroup to be tested against * @refex: new exception * @behavior: behavior of the exception's dev_cgroup * * This is used to make sure a child cgroup won't have more privileges * than its parent */ static bool verify_new_ex(struct dev_cgroup *dev_cgroup, struct dev_exception_item *refex, enum devcg_behavior behavior) { bool match = false; RCU_LOCKDEP_WARN(!rcu_read_lock_held() && !lockdep_is_held(&devcgroup_mutex), "device_cgroup:verify_new_ex called without proper synchronization"); if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) { if (behavior == DEVCG_DEFAULT_ALLOW) { /* * new exception in the child doesn't matter, only * adding extra restrictions */ return true; } else { /* * new exception in the child will add more devices * that can be accessed, so it can't match any of * parent's exceptions, even slightly */ match = match_exception_partial(&dev_cgroup->exceptions, refex->type, refex->major, refex->minor, refex->access); if (match) return false; return true; } } else { /* * Only behavior == DEVCG_DEFAULT_DENY allowed here, therefore * the new exception will add access to more devices and must * be contained completely in an parent's exception to be * allowed */ match = match_exception(&dev_cgroup->exceptions, refex->type, refex->major, refex->minor, 
refex->access); if (match) /* parent has an exception that matches the proposed */ return true; else return false; } return false; } /* * parent_has_perm: * when adding a new allow rule to a device exception list, the rule * must be allowed in the parent device */ static int parent_has_perm(struct dev_cgroup *childcg, struct dev_exception_item *ex) { struct dev_cgroup *parent = css_to_devcgroup(childcg->css.parent); if (!parent) return 1; return verify_new_ex(parent, ex, childcg->behavior); } /** * parent_allows_removal - verify if it's ok to remove an exception * @childcg: child cgroup from where the exception will be removed * @ex: exception being removed * * When removing an exception in cgroups with default ALLOW policy, it must * be checked if removing it will give the child cgroup more access than the * parent. * * Return: true if it's ok to remove exception, false otherwise */ static bool parent_allows_removal(struct dev_cgroup *childcg, struct dev_exception_item *ex) { struct dev_cgroup *parent = css_to_devcgroup(childcg->css.parent); if (!parent) return true; /* It's always allowed to remove access to devices */ if (childcg->behavior == DEVCG_DEFAULT_DENY) return true; /* * Make sure you're not removing part or a whole exception existing in * the parent cgroup */ return !match_exception_partial(&parent->exceptions, ex->type, ex->major, ex->minor, ex->access); } /** * may_allow_all - checks if it's possible to change the behavior to * allow based on parent's rules. * @parent: device cgroup's parent * returns: != 0 in case it's allowed, 0 otherwise */ static inline int may_allow_all(struct dev_cgroup *parent) { if (!parent) return 1; return parent->behavior == DEVCG_DEFAULT_ALLOW; } /** * revalidate_active_exceptions - walks through the active exception list and * revalidates the exceptions based on parent's * behavior and exceptions. The exceptions that * are no longer valid will be removed. * Called with devcgroup_mutex held. * @devcg: cgroup which exceptions will be checked * * This is one of the three key functions for hierarchy implementation. * This function is responsible for re-evaluating all the cgroup's active * exceptions due to a parent's exception change. * Refer to Documentation/admin-guide/cgroup-v1/devices.rst for more details. */ static void revalidate_active_exceptions(struct dev_cgroup *devcg) { struct dev_exception_item *ex; struct list_head *this, *tmp; list_for_each_safe(this, tmp, &devcg->exceptions) { ex = container_of(this, struct dev_exception_item, list); if (!parent_has_perm(devcg, ex)) dev_exception_rm(devcg, ex); } } /** * propagate_exception - propagates a new exception to the children * @devcg_root: device cgroup that added a new exception * @ex: new exception to be propagated * * returns: 0 in case of success, != 0 in case of error */ static int propagate_exception(struct dev_cgroup *devcg_root, struct dev_exception_item *ex) { struct cgroup_subsys_state *pos; int rc = 0; rcu_read_lock(); css_for_each_descendant_pre(pos, &devcg_root->css) { struct dev_cgroup *devcg = css_to_devcgroup(pos); /* * Because devcgroup_mutex is held, no devcg will become * online or offline during the tree walk (see on/offline * methods), and online ones are safe to access outside RCU * read lock without bumping refcnt. 
*/ if (pos == &devcg_root->css || !is_devcg_online(devcg)) continue; rcu_read_unlock(); /* * in case both root's behavior and devcg is allow, a new * restriction means adding to the exception list */ if (devcg_root->behavior == DEVCG_DEFAULT_ALLOW && devcg->behavior == DEVCG_DEFAULT_ALLOW) { rc = dev_exception_add(devcg, ex); if (rc) return rc; } else { /* * in the other possible cases: * root's behavior: allow, devcg's: deny * root's behavior: deny, devcg's: deny * the exception will be removed */ dev_exception_rm(devcg, ex); } revalidate_active_exceptions(devcg); rcu_read_lock(); } rcu_read_unlock(); return rc; } /* * Modify the exception list using allow/deny rules. * CAP_SYS_ADMIN is needed for this. It's at least separate from CAP_MKNOD * so we can give a container CAP_MKNOD to let it create devices but not * modify the exception list. * It seems likely we'll want to add a CAP_CONTAINER capability to allow * us to also grant CAP_SYS_ADMIN to containers without giving away the * device exception list controls, but for now we'll stick with CAP_SYS_ADMIN * * Taking rules away is always allowed (given CAP_SYS_ADMIN). Granting * new access is only allowed if you're in the top-level cgroup, or your * parent cgroup has the access you're asking for. */ static int devcgroup_update_access(struct dev_cgroup *devcgroup, int filetype, char *buffer) { const char *b; char temp[12]; /* 11 + 1 characters needed for a u32 */ int count, rc = 0; struct dev_exception_item ex; struct dev_cgroup *parent = css_to_devcgroup(devcgroup->css.parent); struct dev_cgroup tmp_devcgrp; if (!capable(CAP_SYS_ADMIN)) return -EPERM; memset(&ex, 0, sizeof(ex)); memset(&tmp_devcgrp, 0, sizeof(tmp_devcgrp)); b = buffer; switch (*b) { case 'a': switch (filetype) { case DEVCG_ALLOW: if (css_has_online_children(&devcgroup->css)) return -EINVAL; if (!may_allow_all(parent)) return -EPERM; if (!parent) { devcgroup->behavior = DEVCG_DEFAULT_ALLOW; dev_exception_clean(devcgroup); break; } INIT_LIST_HEAD(&tmp_devcgrp.exceptions); rc = dev_exceptions_copy(&tmp_devcgrp.exceptions, &devcgroup->exceptions); if (rc) return rc; dev_exception_clean(devcgroup); rc = dev_exceptions_copy(&devcgroup->exceptions, &parent->exceptions); if (rc) { dev_exceptions_move(&devcgroup->exceptions, &tmp_devcgrp.exceptions); return rc; } devcgroup->behavior = DEVCG_DEFAULT_ALLOW; dev_exception_clean(&tmp_devcgrp); break; case DEVCG_DENY: if (css_has_online_children(&devcgroup->css)) return -EINVAL; dev_exception_clean(devcgroup); devcgroup->behavior = DEVCG_DEFAULT_DENY; break; default: return -EINVAL; } return 0; case 'b': ex.type = DEVCG_DEV_BLOCK; break; case 'c': ex.type = DEVCG_DEV_CHAR; break; default: return -EINVAL; } b++; if (!isspace(*b)) return -EINVAL; b++; if (*b == '*') { ex.major = ~0; b++; } else if (isdigit(*b)) { memset(temp, 0, sizeof(temp)); for (count = 0; count < sizeof(temp) - 1; count++) { temp[count] = *b; b++; if (!isdigit(*b)) break; } rc = kstrtou32(temp, 10, &ex.major); if (rc) return -EINVAL; } else { return -EINVAL; } if (*b != ':') return -EINVAL; b++; /* read minor */ if (*b == '*') { ex.minor = ~0; b++; } else if (isdigit(*b)) { memset(temp, 0, sizeof(temp)); for (count = 0; count < sizeof(temp) - 1; count++) { temp[count] = *b; b++; if (!isdigit(*b)) break; } rc = kstrtou32(temp, 10, &ex.minor); if (rc) return -EINVAL; } else { return -EINVAL; } if (!isspace(*b)) return -EINVAL; for (b++, count = 0; count < 3; count++, b++) { switch (*b) { case 'r': ex.access |= DEVCG_ACC_READ; break; case 'w': ex.access |= 
DEVCG_ACC_WRITE; break; case 'm': ex.access |= DEVCG_ACC_MKNOD; break; case '\n': case '\0': count = 3; break; default: return -EINVAL; } } switch (filetype) { case DEVCG_ALLOW: /* * If the default policy is to allow by default, try to remove * an matching exception instead. And be silent about it: we * don't want to break compatibility */ if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) { /* Check if the parent allows removing it first */ if (!parent_allows_removal(devcgroup, &ex)) return -EPERM; dev_exception_rm(devcgroup, &ex); break; } if (!parent_has_perm(devcgroup, &ex)) return -EPERM; rc = dev_exception_add(devcgroup, &ex); break; case DEVCG_DENY: /* * If the default policy is to deny by default, try to remove * an matching exception instead. And be silent about it: we * don't want to break compatibility */ if (devcgroup->behavior == DEVCG_DEFAULT_DENY) dev_exception_rm(devcgroup, &ex); else rc = dev_exception_add(devcgroup, &ex); if (rc) break; /* we only propagate new restrictions */ rc = propagate_exception(devcgroup, &ex); break; default: rc = -EINVAL; } return rc; } static ssize_t devcgroup_access_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { int retval; mutex_lock(&devcgroup_mutex); retval = devcgroup_update_access(css_to_devcgroup(of_css(of)), of_cft(of)->private, strstrip(buf)); mutex_unlock(&devcgroup_mutex); return retval ?: nbytes; } static struct cftype dev_cgroup_files[] = { { .name = "allow", .write = devcgroup_access_write, .private = DEVCG_ALLOW, }, { .name = "deny", .write = devcgroup_access_write, .private = DEVCG_DENY, }, { .name = "list", .seq_show = devcgroup_seq_show, .private = DEVCG_LIST, }, { } /* terminate */ }; struct cgroup_subsys devices_cgrp_subsys = { .css_alloc = devcgroup_css_alloc, .css_free = devcgroup_css_free, .css_online = devcgroup_online, .css_offline = devcgroup_offline, .legacy_cftypes = dev_cgroup_files, }; /** * devcgroup_legacy_check_permission - checks if an inode operation is permitted * @type: device type * @major: device major number * @minor: device minor number * @access: combination of DEVCG_ACC_WRITE, DEVCG_ACC_READ and DEVCG_ACC_MKNOD * * returns 0 on success, -EPERM case the operation is not permitted */ static int devcgroup_legacy_check_permission(short type, u32 major, u32 minor, short access) { struct dev_cgroup *dev_cgroup; bool rc; rcu_read_lock(); dev_cgroup = task_devcgroup(current); if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) /* Can't match any of the exceptions, even partially */ rc = !match_exception_partial(&dev_cgroup->exceptions, type, major, minor, access); else /* Need to match completely one exception to be allowed */ rc = match_exception(&dev_cgroup->exceptions, type, major, minor, access); rcu_read_unlock(); if (!rc) return -EPERM; return 0; } #endif /* CONFIG_CGROUP_DEVICE */ #if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) int devcgroup_check_permission(short type, u32 major, u32 minor, short access) { int rc = BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access); if (rc) return rc; #ifdef CONFIG_CGROUP_DEVICE return devcgroup_legacy_check_permission(type, major, minor, access); #else /* CONFIG_CGROUP_DEVICE */ return 0; #endif /* CONFIG_CGROUP_DEVICE */ } EXPORT_SYMBOL(devcgroup_check_permission); #endif /* defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) */ |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SEGMENT_H #define _ASM_X86_SEGMENT_H #include <linux/const.h> #include <asm/alternative.h> #include <asm/ibt.h> /* * Constructor for a conventional segment GDT (or LDT) entry. * This is a macro so it can be used in initializers. */ #define GDT_ENTRY(flags, base, limit) \ ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \ (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \ (((limit) & _AC(0x000f0000,ULL)) << (48-16)) | \ (((base) & _AC(0x00ffffff,ULL)) << 16) | \ (((limit) & _AC(0x0000ffff,ULL)))) /* Simple and small GDT entries for booting only: */ #define GDT_ENTRY_BOOT_CS 2 #define GDT_ENTRY_BOOT_DS 3 #define GDT_ENTRY_BOOT_TSS 4 #define __BOOT_CS (GDT_ENTRY_BOOT_CS*8) #define __BOOT_DS (GDT_ENTRY_BOOT_DS*8) #define __BOOT_TSS (GDT_ENTRY_BOOT_TSS*8) /* * Bottom two bits of selector give the ring * privilege level */ #define SEGMENT_RPL_MASK 0x3 /* * When running on Xen PV, the actual privilege level of the kernel is 1, * not 0. Testing the Requested Privilege Level in a segment selector to * determine whether the context is user mode or kernel mode with * SEGMENT_RPL_MASK is wrong because the PV kernel's privilege level * matches the 0x3 mask. * * Testing with USER_SEGMENT_RPL_MASK is valid for both native and Xen PV * kernels because privilege level 2 is never used. */ #define USER_SEGMENT_RPL_MASK 0x2 /* User mode is privilege level 3: */ #define USER_RPL 0x3 /* Bit 2 is Table Indicator (TI): selects between LDT or GDT */ #define SEGMENT_TI_MASK 0x4 /* LDT segment has TI set ... */ #define SEGMENT_LDT 0x4 /* ... 
GDT has it cleared */ #define SEGMENT_GDT 0x0 #define GDT_ENTRY_INVALID_SEG 0 #if defined(CONFIG_X86_32) && !defined(BUILD_VDSO32_64) /* * The layout of the per-CPU GDT under Linux: * * 0 - null <=== cacheline #1 * 1 - reserved * 2 - reserved * 3 - reserved * * 4 - unused <=== cacheline #2 * 5 - unused * * ------- start of TLS (Thread-Local Storage) segments: * * 6 - TLS segment #1 [ glibc's TLS segment ] * 7 - TLS segment #2 [ Wine's %fs Win32 segment ] * 8 - TLS segment #3 <=== cacheline #3 * 9 - reserved * 10 - reserved * 11 - reserved * * ------- start of kernel segments: * * 12 - kernel code segment <=== cacheline #4 * 13 - kernel data segment * 14 - default user CS * 15 - default user DS * 16 - TSS <=== cacheline #5 * 17 - LDT * 18 - PNPBIOS support (16->32 gate) * 19 - PNPBIOS support * 20 - PNPBIOS support <=== cacheline #6 * 21 - PNPBIOS support * 22 - PNPBIOS support * 23 - APM BIOS support * 24 - APM BIOS support <=== cacheline #7 * 25 - APM BIOS support * * 26 - ESPFIX small SS * 27 - per-cpu [ offset to per-cpu data area ] * 28 - VDSO getcpu * 29 - unused * 30 - unused * 31 - TSS for double fault handler */ #define GDT_ENTRY_TLS_MIN 6 #define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) #define GDT_ENTRY_KERNEL_CS 12 #define GDT_ENTRY_KERNEL_DS 13 #define GDT_ENTRY_DEFAULT_USER_CS 14 #define GDT_ENTRY_DEFAULT_USER_DS 15 #define GDT_ENTRY_TSS 16 #define GDT_ENTRY_LDT 17 #define GDT_ENTRY_PNPBIOS_CS32 18 #define GDT_ENTRY_PNPBIOS_CS16 19 #define GDT_ENTRY_PNPBIOS_DS 20 #define GDT_ENTRY_PNPBIOS_TS1 21 #define GDT_ENTRY_PNPBIOS_TS2 22 #define GDT_ENTRY_APMBIOS_BASE 23 #define GDT_ENTRY_ESPFIX_SS 26 #define GDT_ENTRY_PERCPU 27 #define GDT_ENTRY_CPUNODE 28 #define GDT_ENTRY_DOUBLEFAULT_TSS 31 /* * Number of entries in the GDT table: */ #define GDT_ENTRIES 32 /* * Segment selector values corresponding to the above entries: */ #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) #define __USER32_CS __USER_CS #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8) /* segment for calling fn: */ #define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32*8) /* code segment for BIOS: */ #define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16*8) /* "Is this PNP code selector (PNP_CS32 or PNP_CS16)?" */ #define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == PNP_CS32) /* data segment for BIOS: */ #define PNP_DS (GDT_ENTRY_PNPBIOS_DS*8) /* transfer data segment: */ #define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1*8) /* another data segment: */ #define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2*8) #ifdef CONFIG_SMP # define __KERNEL_PERCPU (GDT_ENTRY_PERCPU*8) #else # define __KERNEL_PERCPU 0 #endif #define __CPUNODE_SEG (GDT_ENTRY_CPUNODE*8 + 3) #else /* 64-bit: */ #include <asm/cache.h> #define GDT_ENTRY_KERNEL32_CS 1 #define GDT_ENTRY_KERNEL_CS 2 #define GDT_ENTRY_KERNEL_DS 3 /* * We cannot use the same code segment descriptor for user and kernel mode, * not even in long flat mode, because of different DPL. * * GDT layout to get 64-bit SYSCALL/SYSRET support right. 
SYSRET hardcodes * selectors: * * if returning to 32-bit userspace: cs = STAR.SYSRET_CS, * if returning to 64-bit userspace: cs = STAR.SYSRET_CS+16, * * ss = STAR.SYSRET_CS+8 (in either case) * * thus USER_DS should be between 32-bit and 64-bit code selectors: */ #define GDT_ENTRY_DEFAULT_USER32_CS 4 #define GDT_ENTRY_DEFAULT_USER_DS 5 #define GDT_ENTRY_DEFAULT_USER_CS 6 /* Needs two entries */ #define GDT_ENTRY_TSS 8 /* Needs two entries */ #define GDT_ENTRY_LDT 10 #define GDT_ENTRY_TLS_MIN 12 #define GDT_ENTRY_TLS_MAX 14 #define GDT_ENTRY_CPUNODE 15 /* * Number of entries in the GDT table: */ #define GDT_ENTRIES 16 /* * Segment selector values corresponding to the above entries: * * Note, selectors also need to have a correct RPL, * expressed with the +3 value for user-space selectors: */ #define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS*8) #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) #define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3) #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) #define __CPUNODE_SEG (GDT_ENTRY_CPUNODE*8 + 3) #endif #define IDT_ENTRIES 256 #define NUM_EXCEPTION_VECTORS 32 /* Bitmask of exception vectors which push an error code on the stack: */ #define EXCEPTION_ERRCODE_MASK 0x20027d00 #define GDT_SIZE (GDT_ENTRIES*8) #define GDT_ENTRY_TLS_ENTRIES 3 #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8) /* Bit size and mask of CPU number stored in the per CPU data (and TSC_AUX) */ #define VDSO_CPUNODE_BITS 12 #define VDSO_CPUNODE_MASK 0xfff #ifndef __ASSEMBLER__ /* Helper functions to store/load CPU and node numbers */ static inline unsigned long vdso_encode_cpunode(int cpu, unsigned long node) { return (node << VDSO_CPUNODE_BITS) | cpu; } static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node) { unsigned long p; /* * Load CPU and node number from the GDT. LSL is faster than RDTSCP * and works on all CPUs. This is volatile so that it orders * correctly with respect to barrier() and to keep GCC from cleverly * hoisting it out of the calling function. * * If RDPID is available, use it. */ alternative_io ("lsl %[seg],%k[p]", "rdpid %[p]", X86_FEATURE_RDPID, [p] "=r" (p), [seg] "r" (__CPUNODE_SEG)); if (cpu) *cpu = (p & VDSO_CPUNODE_MASK); if (node) *node = (p >> VDSO_CPUNODE_BITS); } #endif /* !__ASSEMBLER__ */ #ifdef __KERNEL__ /* * early_idt_handler_array is an array of entry points referenced in the * early IDT. For simplicity, it's a real array with one entry point * every nine bytes. That leaves room for an optional 'push $0' if the * vector has no error code (two bytes), a 'push $vector_number' (two * bytes), and a jump to the common entry code (up to five bytes). */ #define EARLY_IDT_HANDLER_SIZE (9 + ENDBR_INSN_SIZE) /* * xen_early_idt_handler_array is for Xen pv guests: for each entry in * early_idt_handler_array it contains a prequel in the form of * pop %rcx; pop %r11; jmp early_idt_handler_array[i]; summing up to * max 8 bytes. */ #define XEN_EARLY_IDT_HANDLER_SIZE (8 + ENDBR_INSN_SIZE) #ifndef __ASSEMBLER__ extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE]; extern void early_ignore_irq(void); #ifdef CONFIG_XEN_PV extern const char xen_early_idt_handler_array[NUM_EXCEPTION_VECTORS][XEN_EARLY_IDT_HANDLER_SIZE]; #endif /* * Load a segment. Fall back on loading the zero segment if something goes * wrong. This variant assumes that loading zero fully clears the segment. 
* This is always the case on Intel CPUs and, even on 64-bit AMD CPUs, any * failure to fully clear the cached descriptor is only observable for * FS and GS. */ #define LOAD_SEGMENT(seg) \ static inline void __loadsegment_##seg(u16 value) \ { \ asm volatile("1: movl %k0,%%" #seg "\n" \ _ASM_EXTABLE_TYPE_REG(1b, 1b, EX_TYPE_ZERO_REG, %k0)\ : "+r" (value) : : "memory"); \ } LOAD_SEGMENT(ss) LOAD_SEGMENT(ds) LOAD_SEGMENT(es) #ifdef CONFIG_X86_32 /* * On 32-bit systems, the hidden parts of FS and GS are unobservable if * the selector is NULL, so there's no funny business here. */ LOAD_SEGMENT(fs) LOAD_SEGMENT(gs) #else static inline void __loadsegment_fs(u16 value) { asm volatile("1: movw %0, %%fs\n" "2:\n" _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_CLEAR_FS) : : ASM_INPUT_RM (value) : "memory"); } /* __loadsegment_gs is intentionally undefined. Use load_gs_index instead. */ #endif #undef LOAD_SEGMENT #define loadsegment(seg, val) __loadsegment_##seg(val) /* * Save a segment register away: */ #define SAVE_SEGMENT(seg) \ static inline unsigned long __savesegment_##seg(void) \ { \ unsigned long v; \ asm volatile("movl %%" #seg ",%k0" : "=r" (v)); \ return v; \ } SAVE_SEGMENT(cs) SAVE_SEGMENT(ss) SAVE_SEGMENT(ds) SAVE_SEGMENT(es) SAVE_SEGMENT(fs) SAVE_SEGMENT(gs) #undef SAVE_SEGMENT #define savesegment(seg, var) ((var) = __savesegment_##seg()) #endif /* !__ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_X86_SEGMENT_H */ |
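The selector constants in this header are simply (GDT index << 3) | table-indicator | RPL. A small standalone sketch that splits a selector back into those fields; decode_selector() is illustrative only, and SEL_RPL_MASK / SEL_TI_MASK are local copies of the header's SEGMENT_RPL_MASK / SEGMENT_TI_MASK values.

/* Illustrative only: decode an x86 segment selector into its index,
 * table indicator and requested privilege level. */
#include <stdio.h>

#define SEL_RPL_MASK 0x3	/* bits 0-1: requested privilege level */
#define SEL_TI_MASK  0x4	/* bit 2: 0 = GDT, 1 = LDT */

static void decode_selector(unsigned short sel)
{
	printf("selector 0x%04x: index=%u table=%s rpl=%u\n",
	       sel, sel >> 3, (sel & SEL_TI_MASK) ? "LDT" : "GDT",
	       sel & SEL_RPL_MASK);
}

int main(void)
{
	decode_selector(6 * 8 + 3);	/* 64-bit __USER_CS: GDT entry 6, RPL 3 */
	decode_selector(2 * 8);		/* 64-bit __KERNEL_CS: GDT entry 2, RPL 0 */
	return 0;
}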
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BLOCK_BLK_PM_H_
#define _BLOCK_BLK_PM_H_

#include <linux/pm_runtime.h>

#ifdef CONFIG_PM
static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q)
{
	if (!q->dev || !blk_queue_pm_only(q))
		return 1;	/* Nothing to do */
	if (pm && q->rpm_status != RPM_SUSPENDED)
		return 1;	/* Request allowed */
	pm_request_resume(q->dev);
	return 0;
}

static inline void blk_pm_mark_last_busy(struct request *rq)
{
	if (rq->q->dev && !(rq->rq_flags & RQF_PM))
		pm_runtime_mark_last_busy(rq->q->dev);
}
#else
static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q)
{
	return 1;
}

static inline void blk_pm_mark_last_busy(struct request *rq)
{
}
#endif

#endif /* _BLOCK_BLK_PM_H_ */
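blk_pm_resume_queue() returns nonzero when a request may be dispatched and zero after kicking off an asynchronous runtime resume. A standalone model of that three-way decision; the struct and enum here are simplified stand-ins for the kernel's types, not the real ones.

/* Standalone model of the blk_pm_resume_queue() decision above: which
 * requests may be dispatched while a queue's device is runtime-suspended. */
#include <stdbool.h>
#include <stdio.h>

enum rpm_status { RPM_ACTIVE, RPM_SUSPENDING, RPM_SUSPENDED };

struct queue_model {
	bool has_dev;		/* q->dev != NULL */
	bool pm_only;		/* blk_queue_pm_only(q) */
	enum rpm_status rpm;	/* q->rpm_status */
};

/* Returns true if the request may be dispatched now; false means the
 * caller must wait for an (asynchronous) runtime resume. */
static bool may_dispatch(bool pm_request, const struct queue_model *q)
{
	if (!q->has_dev || !q->pm_only)
		return true;			/* nothing to do */
	if (pm_request && q->rpm != RPM_SUSPENDED)
		return true;			/* PM requests pass through */
	return false;				/* would trigger pm_request_resume() */
}

int main(void)
{
	struct queue_model q = { .has_dev = true, .pm_only = true,
				 .rpm = RPM_SUSPENDING };

	printf("normal rq allowed: %d, PM rq allowed: %d\n",
	       may_dispatch(false, &q), may_dispatch(true, &q));
	return 0;
}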
| 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Host Side support for RNDIS Networking Links * Copyright (C) 2005 by David Brownell */ #include <linux/module.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/workqueue.h> #include <linux/slab.h> #include <linux/mii.h> #include <linux/usb.h> #include <linux/usb/cdc.h> #include <linux/usb/usbnet.h> #include <linux/usb/rndis_host.h> /* * RNDIS is NDIS remoted over USB. It's a MSFT variant of CDC ACM ... of * course ACM was intended for modems, not Ethernet links! USB's standard * for Ethernet links is "CDC Ethernet", which is significantly simpler. * * NOTE that Microsoft's "RNDIS 1.0" specification is incomplete. 
Issues * include: * - Power management in particular relies on information that's scattered * through other documentation, and which is incomplete or incorrect even * there. * - There are various undocumented protocol requirements, such as the * need to send unused garbage in control-OUT messages. * - In some cases, MS-Windows will emit undocumented requests; this * matters more to peripheral implementations than host ones. * * Moreover there's a no-open-specs variant of RNDIS called "ActiveSync". * * For these reasons and others, ** USE OF RNDIS IS STRONGLY DISCOURAGED ** in * favor of such non-proprietary alternatives as CDC Ethernet or the newer (and * currently rare) "Ethernet Emulation Model" (EEM). */ /* * RNDIS notifications from device: command completion; "reverse" * keepalives; etc */ void rndis_status(struct usbnet *dev, struct urb *urb) { netdev_dbg(dev->net, "rndis status urb, len %d stat %d\n", urb->actual_length, urb->status); // FIXME for keepalives, respond immediately (asynchronously) // if not an RNDIS status, do like cdc_status(dev,urb) does } EXPORT_SYMBOL_GPL(rndis_status); /* * RNDIS indicate messages. */ static void rndis_msg_indicate(struct usbnet *dev, struct rndis_indicate *msg, int buflen) { struct cdc_state *info = (void *)&dev->data; struct device *udev = &info->control->dev; if (dev->driver_info->indication) { dev->driver_info->indication(dev, msg, buflen); } else { u32 status = le32_to_cpu(msg->status); switch (status) { case RNDIS_STATUS_MEDIA_CONNECT: dev_info(udev, "rndis media connect\n"); break; case RNDIS_STATUS_MEDIA_DISCONNECT: dev_info(udev, "rndis media disconnect\n"); break; default: dev_info(udev, "rndis indication: 0x%08x\n", status); } } } /* * RPC done RNDIS-style. Caller guarantees: * - message is properly byteswapped * - there's no other request pending * - buf can hold up to 1KB response (required by RNDIS spec) * On return, the first few entries are already byteswapped. * * Call context is likely probe(), before interface name is known, * which is why we won't try to use it in the diagnostics. */ int rndis_command(struct usbnet *dev, struct rndis_msg_hdr *buf, int buflen) { struct cdc_state *info = (void *) &dev->data; struct usb_cdc_notification notification; int master_ifnum; int retval; int partial; unsigned count; u32 xid = 0, msg_len, request_id, msg_type, rsp, status; /* REVISIT when this gets called from contexts other than probe() or * disconnect(): either serialize, or dispatch responses on xid */ msg_type = le32_to_cpu(buf->msg_type); /* Issue the request; xid is unique, don't bother byteswapping it */ if (likely(msg_type != RNDIS_MSG_HALT && msg_type != RNDIS_MSG_RESET)) { xid = dev->xid++; if (!xid) xid = dev->xid++; buf->request_id = (__force __le32) xid; } master_ifnum = info->control->cur_altsetting->desc.bInterfaceNumber; retval = usb_control_msg(dev->udev, usb_sndctrlpipe(dev->udev, 0), USB_CDC_SEND_ENCAPSULATED_COMMAND, USB_TYPE_CLASS | USB_RECIP_INTERFACE, 0, master_ifnum, buf, le32_to_cpu(buf->msg_len), RNDIS_CONTROL_TIMEOUT_MS); if (unlikely(retval < 0 || xid == 0)) return retval; /* Some devices don't respond on the control channel until * polled on the status channel, so do that first. 
*/ if (dev->driver_info->data & RNDIS_DRIVER_DATA_POLL_STATUS) { retval = usb_interrupt_msg( dev->udev, usb_rcvintpipe(dev->udev, dev->status->desc.bEndpointAddress), ¬ification, sizeof(notification), &partial, RNDIS_CONTROL_TIMEOUT_MS); if (unlikely(retval < 0)) return retval; } /* Poll the control channel; the request probably completed immediately */ rsp = le32_to_cpu(buf->msg_type) | RNDIS_MSG_COMPLETION; for (count = 0; count < 10; count++) { memset(buf, 0, CONTROL_BUFFER_SIZE); retval = usb_control_msg(dev->udev, usb_rcvctrlpipe(dev->udev, 0), USB_CDC_GET_ENCAPSULATED_RESPONSE, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE, 0, master_ifnum, buf, buflen, RNDIS_CONTROL_TIMEOUT_MS); if (likely(retval >= 8)) { msg_type = le32_to_cpu(buf->msg_type); msg_len = le32_to_cpu(buf->msg_len); status = le32_to_cpu(buf->status); request_id = (__force u32) buf->request_id; if (likely(msg_type == rsp)) { if (likely(request_id == xid)) { if (unlikely(rsp == RNDIS_MSG_RESET_C)) return 0; if (likely(RNDIS_STATUS_SUCCESS == status)) return 0; dev_dbg(&info->control->dev, "rndis reply status %08x\n", status); return -EL3RST; } dev_dbg(&info->control->dev, "rndis reply id %d expected %d\n", request_id, xid); /* then likely retry */ } else switch (msg_type) { case RNDIS_MSG_INDICATE: /* fault/event */ rndis_msg_indicate(dev, (void *)buf, buflen); break; case RNDIS_MSG_KEEPALIVE: { /* ping */ struct rndis_keepalive_c *msg = (void *)buf; msg->msg_type = cpu_to_le32(RNDIS_MSG_KEEPALIVE_C); msg->msg_len = cpu_to_le32(sizeof *msg); msg->status = cpu_to_le32(RNDIS_STATUS_SUCCESS); retval = usb_control_msg(dev->udev, usb_sndctrlpipe(dev->udev, 0), USB_CDC_SEND_ENCAPSULATED_COMMAND, USB_TYPE_CLASS | USB_RECIP_INTERFACE, 0, master_ifnum, msg, sizeof *msg, RNDIS_CONTROL_TIMEOUT_MS); if (unlikely(retval < 0)) dev_dbg(&info->control->dev, "rndis keepalive err %d\n", retval); } break; default: dev_dbg(&info->control->dev, "unexpected rndis msg %08x len %d\n", le32_to_cpu(buf->msg_type), msg_len); } } else { /* device probably issued a protocol stall; ignore */ dev_dbg(&info->control->dev, "rndis response error, code %d\n", retval); } msleep(40); } dev_dbg(&info->control->dev, "rndis response timeout\n"); return -ETIMEDOUT; } EXPORT_SYMBOL_GPL(rndis_command); /* * rndis_query: * * Performs a query for @oid along with 0 or more bytes of payload as * specified by @in_len. If @reply_len is not set to -1 then the reply * length is checked against this value, resulting in an error if it * doesn't match. * * NOTE: Adding a payload exactly or greater than the size of the expected * response payload is an evident requirement MSFT added for ActiveSync. * * The only exception is for OIDs that return a variably sized response, * in which case no payload should be added. This undocumented (and * nonsensical!) issue was found by sniffing protocol requests from the * ActiveSync 4.1 Windows driver. 
*/ static int rndis_query(struct usbnet *dev, struct usb_interface *intf, void *buf, u32 oid, u32 in_len, void **reply, int *reply_len) { int retval; union { void *buf; struct rndis_msg_hdr *header; struct rndis_query *get; struct rndis_query_c *get_c; } u; u32 off, len; u.buf = buf; memset(u.get, 0, sizeof *u.get + in_len); u.get->msg_type = cpu_to_le32(RNDIS_MSG_QUERY); u.get->msg_len = cpu_to_le32(sizeof *u.get + in_len); u.get->oid = cpu_to_le32(oid); u.get->len = cpu_to_le32(in_len); u.get->offset = cpu_to_le32(20); retval = rndis_command(dev, u.header, CONTROL_BUFFER_SIZE); if (unlikely(retval < 0)) { dev_err(&intf->dev, "RNDIS_MSG_QUERY(0x%08x) failed, %d\n", oid, retval); return retval; } off = le32_to_cpu(u.get_c->offset); len = le32_to_cpu(u.get_c->len); if (unlikely((off > CONTROL_BUFFER_SIZE - 8) || (len > CONTROL_BUFFER_SIZE - 8 - off))) goto response_error; if (*reply_len != -1 && len != *reply_len) goto response_error; *reply = (unsigned char *) &u.get_c->request_id + off; *reply_len = len; return retval; response_error: dev_err(&intf->dev, "RNDIS_MSG_QUERY(0x%08x) " "invalid response - off %d len %d\n", oid, off, len); return -EDOM; } /* same as usbnet_netdev_ops but MTU change not allowed */ static const struct net_device_ops rndis_netdev_ops = { .ndo_open = usbnet_open, .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_get_stats64 = dev_get_tstats64, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; int generic_rndis_bind(struct usbnet *dev, struct usb_interface *intf, int flags) { int retval; struct net_device *net = dev->net; struct cdc_state *info = (void *) &dev->data; union { void *buf; struct rndis_msg_hdr *header; struct rndis_init *init; struct rndis_init_c *init_c; struct rndis_query *get; struct rndis_query_c *get_c; struct rndis_set *set; struct rndis_set_c *set_c; struct rndis_halt *halt; } u; u32 tmp; __le32 phym_unspec, *phym; int reply_len; unsigned char *bp; /* we can't rely on i/o from stack working, or stack allocation */ u.buf = kmalloc(CONTROL_BUFFER_SIZE, GFP_KERNEL); if (!u.buf) return -ENOMEM; retval = usbnet_generic_cdc_bind(dev, intf); if (retval < 0) goto fail; u.init->msg_type = cpu_to_le32(RNDIS_MSG_INIT); u.init->msg_len = cpu_to_le32(sizeof *u.init); u.init->major_version = cpu_to_le32(1); u.init->minor_version = cpu_to_le32(0); /* max transfer (in spec) is 0x4000 at full speed, but for * TX we'll stick to one Ethernet packet plus RNDIS framing. * For RX we handle drivers that zero-pad to end-of-packet. * Don't let userspace change these settings. * * NOTE: there still seems to be weirdness here, as if we need * to do some more things to make sure WinCE targets accept this. * They default to jumbograms of 8KB or 16KB, which is absurd * for such low data rates and which is also more than Linux * can usually expect to allocate for SKB data... */ net->hard_header_len += sizeof (struct rndis_data_hdr); dev->hard_mtu = net->mtu + net->hard_header_len; dev->maxpacket = usb_maxpacket(dev->udev, dev->out); if (dev->maxpacket == 0) { netif_dbg(dev, probe, dev->net, "dev->maxpacket can't be 0\n"); retval = -EINVAL; goto fail_and_release; } dev->rx_urb_size = dev->hard_mtu + (dev->maxpacket + 1); dev->rx_urb_size &= ~(dev->maxpacket - 1); u.init->max_transfer_size = cpu_to_le32(dev->rx_urb_size); net->netdev_ops = &rndis_netdev_ops; retval = rndis_command(dev, u.header, CONTROL_BUFFER_SIZE); if (unlikely(retval < 0)) { /* it might not even be an RNDIS device!! 
*/ dev_err(&intf->dev, "RNDIS init failed, %d\n", retval); goto fail_and_release; } tmp = le32_to_cpu(u.init_c->max_transfer_size); if (tmp < dev->hard_mtu) { if (tmp <= net->hard_header_len) { dev_err(&intf->dev, "dev can't take %u byte packets (max %u)\n", dev->hard_mtu, tmp); retval = -EINVAL; goto halt_fail_and_release; } dev_warn(&intf->dev, "dev can't take %u byte packets (max %u), " "adjusting MTU to %u\n", dev->hard_mtu, tmp, tmp - net->hard_header_len); dev->hard_mtu = tmp; net->mtu = dev->hard_mtu - net->hard_header_len; } /* REVISIT: peripheral "alignment" request is ignored ... */ dev_dbg(&intf->dev, "hard mtu %u (%u from dev), rx buflen %zu, align %d\n", dev->hard_mtu, tmp, dev->rx_urb_size, 1 << le32_to_cpu(u.init_c->packet_alignment)); /* module has some device initialization code needs to be done right * after RNDIS_INIT */ if (dev->driver_info->early_init && dev->driver_info->early_init(dev) != 0) goto halt_fail_and_release; /* Check physical medium */ phym = NULL; reply_len = sizeof *phym; retval = rndis_query(dev, intf, u.buf, RNDIS_OID_GEN_PHYSICAL_MEDIUM, reply_len, (void **)&phym, &reply_len); if (retval != 0 || !phym) { /* OID is optional so don't fail here. */ phym_unspec = cpu_to_le32(RNDIS_PHYSICAL_MEDIUM_UNSPECIFIED); phym = &phym_unspec; } if ((flags & FLAG_RNDIS_PHYM_WIRELESS) && le32_to_cpup(phym) != RNDIS_PHYSICAL_MEDIUM_WIRELESS_LAN) { netif_dbg(dev, probe, dev->net, "driver requires wireless physical medium, but device is not\n"); retval = -ENODEV; goto halt_fail_and_release; } if ((flags & FLAG_RNDIS_PHYM_NOT_WIRELESS) && le32_to_cpup(phym) == RNDIS_PHYSICAL_MEDIUM_WIRELESS_LAN) { netif_dbg(dev, probe, dev->net, "driver requires non-wireless physical medium, but device is wireless.\n"); retval = -ENODEV; goto halt_fail_and_release; } /* Get designated host ethernet address */ reply_len = ETH_ALEN; retval = rndis_query(dev, intf, u.buf, RNDIS_OID_802_3_PERMANENT_ADDRESS, 48, (void **) &bp, &reply_len); if (unlikely(retval< 0)) { dev_err(&intf->dev, "rndis get ethaddr, %d\n", retval); goto halt_fail_and_release; } eth_hw_addr_set(net, bp); /* set a nonzero filter to enable data transfers */ memset(u.set, 0, sizeof *u.set); u.set->msg_type = cpu_to_le32(RNDIS_MSG_SET); u.set->msg_len = cpu_to_le32(4 + sizeof *u.set); u.set->oid = cpu_to_le32(RNDIS_OID_GEN_CURRENT_PACKET_FILTER); u.set->len = cpu_to_le32(4); u.set->offset = cpu_to_le32((sizeof *u.set) - 8); *(__le32 *)(u.buf + sizeof *u.set) = cpu_to_le32(RNDIS_DEFAULT_FILTER); retval = rndis_command(dev, u.header, CONTROL_BUFFER_SIZE); if (unlikely(retval < 0)) { dev_err(&intf->dev, "rndis set packet filter, %d\n", retval); goto halt_fail_and_release; } retval = 0; kfree(u.buf); return retval; halt_fail_and_release: memset(u.halt, 0, sizeof *u.halt); u.halt->msg_type = cpu_to_le32(RNDIS_MSG_HALT); u.halt->msg_len = cpu_to_le32(sizeof *u.halt); (void) rndis_command(dev, (void *)u.halt, CONTROL_BUFFER_SIZE); fail_and_release: usb_set_intfdata(info->data, NULL); usb_driver_release_interface(driver_of(intf), info->data); info->data = NULL; fail: kfree(u.buf); return retval; } EXPORT_SYMBOL_GPL(generic_rndis_bind); static int rndis_bind(struct usbnet *dev, struct usb_interface *intf) { return generic_rndis_bind(dev, intf, FLAG_RNDIS_PHYM_NOT_WIRELESS); } static int zte_rndis_bind(struct usbnet *dev, struct usb_interface *intf) { int status = rndis_bind(dev, intf); if (!status && (dev->net->dev_addr[0] & 0x02)) eth_hw_addr_random(dev->net); return status; } void rndis_unbind(struct usbnet *dev, struct usb_interface 
*intf) { struct rndis_halt *halt; /* try to clear any rndis state/activity (no i/o from stack!) */ halt = kzalloc(CONTROL_BUFFER_SIZE, GFP_KERNEL); if (halt) { halt->msg_type = cpu_to_le32(RNDIS_MSG_HALT); halt->msg_len = cpu_to_le32(sizeof *halt); (void) rndis_command(dev, (void *)halt, CONTROL_BUFFER_SIZE); kfree(halt); } usbnet_cdc_unbind(dev, intf); } EXPORT_SYMBOL_GPL(rndis_unbind); /* * DATA -- host must not write zlps */ int rndis_rx_fixup(struct usbnet *dev, struct sk_buff *skb) { bool dst_mac_fixup; /* This check is no longer done by usbnet */ if (skb->len < dev->net->hard_header_len) return 0; dst_mac_fixup = !!(dev->driver_info->data & RNDIS_DRIVER_DATA_DST_MAC_FIXUP); /* peripheral may have batched packets to us... */ while (likely(skb->len)) { struct rndis_data_hdr *hdr = (void *)skb->data; struct sk_buff *skb2; u32 msg_type, msg_len, data_offset, data_len; msg_type = le32_to_cpu(hdr->msg_type); msg_len = le32_to_cpu(hdr->msg_len); data_offset = le32_to_cpu(hdr->data_offset); data_len = le32_to_cpu(hdr->data_len); /* don't choke if we see oob, per-packet data, etc */ if (unlikely(msg_type != RNDIS_MSG_PACKET || skb->len < msg_len || (data_offset + data_len + 8) > msg_len)) { dev->net->stats.rx_frame_errors++; netdev_dbg(dev->net, "bad rndis message %d/%d/%d/%d, len %d\n", le32_to_cpu(hdr->msg_type), msg_len, data_offset, data_len, skb->len); return 0; } skb_pull(skb, 8 + data_offset); /* at most one packet left? */ if (likely((data_len - skb->len) <= sizeof *hdr)) { skb_trim(skb, data_len); break; } /* try to return all the packets in the batch */ skb2 = skb_clone(skb, GFP_ATOMIC); if (unlikely(!skb2)) break; skb_pull(skb, msg_len - sizeof *hdr); skb_trim(skb2, data_len); if (unlikely(dst_mac_fixup)) usbnet_cdc_zte_rx_fixup(dev, skb2); usbnet_skb_return(dev, skb2); } /* caller will usbnet_skb_return the remaining packet */ if (unlikely(dst_mac_fixup)) usbnet_cdc_zte_rx_fixup(dev, skb); return 1; } EXPORT_SYMBOL_GPL(rndis_rx_fixup); struct sk_buff * rndis_tx_fixup(struct usbnet *dev, struct sk_buff *skb, gfp_t flags) { struct rndis_data_hdr *hdr; struct sk_buff *skb2; unsigned len = skb->len; if (likely(!skb_cloned(skb))) { int room = skb_headroom(skb); /* enough head room as-is? */ if (unlikely((sizeof *hdr) <= room)) goto fill; /* enough room, but needs to be readjusted? */ room += skb_tailroom(skb); if (likely((sizeof *hdr) <= room)) { skb->data = memmove(skb->head + sizeof *hdr, skb->data, len); skb_set_tail_pointer(skb, len); goto fill; } } /* create a new skb, with the correct size (and tailpad) */ skb2 = skb_copy_expand(skb, sizeof *hdr, 1, flags); dev_kfree_skb_any(skb); if (unlikely(!skb2)) return skb2; skb = skb2; /* fill out the RNDIS header. we won't bother trying to batch * packets; Linux minimizes wasted bandwidth through tx queues. */ fill: hdr = __skb_push(skb, sizeof *hdr); memset(hdr, 0, sizeof *hdr); hdr->msg_type = cpu_to_le32(RNDIS_MSG_PACKET); hdr->msg_len = cpu_to_le32(skb->len); hdr->data_offset = cpu_to_le32(sizeof(*hdr) - 8); hdr->data_len = cpu_to_le32(len); /* FIXME make the last packet always be short ... 
*/ return skb; } EXPORT_SYMBOL_GPL(rndis_tx_fixup); static const struct driver_info rndis_info = { .description = "RNDIS device", .flags = FLAG_ETHER | FLAG_POINTTOPOINT | FLAG_FRAMING_RN | FLAG_NO_SETINT, .bind = rndis_bind, .unbind = rndis_unbind, .status = rndis_status, .rx_fixup = rndis_rx_fixup, .tx_fixup = rndis_tx_fixup, }; static const struct driver_info rndis_poll_status_info = { .description = "RNDIS device (poll status before control)", .flags = FLAG_ETHER | FLAG_POINTTOPOINT | FLAG_FRAMING_RN | FLAG_NO_SETINT, .data = RNDIS_DRIVER_DATA_POLL_STATUS, .bind = rndis_bind, .unbind = rndis_unbind, .status = rndis_status, .rx_fixup = rndis_rx_fixup, .tx_fixup = rndis_tx_fixup, }; static const struct driver_info zte_rndis_info = { .description = "ZTE RNDIS device", .flags = FLAG_ETHER | FLAG_POINTTOPOINT | FLAG_FRAMING_RN | FLAG_NO_SETINT, .data = RNDIS_DRIVER_DATA_DST_MAC_FIXUP, .bind = zte_rndis_bind, .unbind = rndis_unbind, .status = rndis_status, .rx_fixup = rndis_rx_fixup, .tx_fixup = rndis_tx_fixup, }; /*-------------------------------------------------------------------------*/ static const struct usb_device_id products [] = { { /* 2Wire HomePortal 1000SW */ USB_DEVICE_AND_INTERFACE_INFO(0x1630, 0x0042, USB_CLASS_COMM, 2 /* ACM */, 0x0ff), .driver_info = (unsigned long) &rndis_poll_status_info, }, { /* Hytera Communications DMR radios' "Radio to PC Network" */ USB_VENDOR_AND_INTERFACE_INFO(0x238b, USB_CLASS_COMM, 2 /* ACM */, 0x0ff), .driver_info = (unsigned long)&rndis_info, }, { /* ZTE WWAN modules */ USB_VENDOR_AND_INTERFACE_INFO(0x19d2, USB_CLASS_WIRELESS_CONTROLLER, 1, 3), .driver_info = (unsigned long)&zte_rndis_info, }, { /* ZTE WWAN modules, ACM flavour */ USB_VENDOR_AND_INTERFACE_INFO(0x19d2, USB_CLASS_COMM, 2 /* ACM */, 0x0ff), .driver_info = (unsigned long)&zte_rndis_info, }, { /* RNDIS is MSFT's un-official variant of CDC ACM */ USB_INTERFACE_INFO(USB_CLASS_COMM, 2 /* ACM */, 0x0ff), .driver_info = (unsigned long) &rndis_info, }, { /* "ActiveSync" is an undocumented variant of RNDIS, used in WM5 */ USB_INTERFACE_INFO(USB_CLASS_MISC, 1, 1), .driver_info = (unsigned long) &rndis_poll_status_info, }, { /* RNDIS for tethering */ USB_INTERFACE_INFO(USB_CLASS_WIRELESS_CONTROLLER, 1, 3), .driver_info = (unsigned long) &rndis_info, }, { /* Novatel Verizon USB730L */ USB_INTERFACE_INFO(USB_CLASS_MISC, 4, 1), .driver_info = (unsigned long) &rndis_info, }, { }, // END }; MODULE_DEVICE_TABLE(usb, products); static struct usb_driver rndis_driver = { .name = "rndis_host", .id_table = products, .probe = usbnet_probe, .disconnect = usbnet_disconnect, .suspend = usbnet_suspend, .resume = usbnet_resume, .disable_hub_initiated_lpm = 1, }; module_usb_driver(rndis_driver); MODULE_AUTHOR("David Brownell"); MODULE_DESCRIPTION("USB Host side RNDIS driver"); MODULE_LICENSE("GPL"); |
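rndis_tx_fixup() prepends a struct rndis_data_hdr whose data_offset is measured from byte 8 of the message (just past msg_type and msg_len), hence the "- 8" in the driver. A standalone sketch of that framing for one Ethernet payload; the struct below is a simplified stand-in for rndis_data_hdr (eleven 32-bit fields, 44 bytes), and byte-order handling is omitted (a little-endian host is assumed).

/* Standalone sketch of the framing built by rndis_tx_fixup(): one
 * RNDIS_MSG_PACKET header in front of an Ethernet frame. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define RNDIS_MSG_PACKET 0x00000001

struct data_hdr {		/* simplified stand-in for rndis_data_hdr */
	uint32_t msg_type;
	uint32_t msg_len;	/* header + payload */
	uint32_t data_offset;	/* measured from byte 8 of the message */
	uint32_t data_len;
	uint32_t reserved[7];	/* OOB / per-packet fields, unused here */
};

static size_t rndis_frame(uint8_t *out, const uint8_t *eth, size_t len)
{
	struct data_hdr hdr = {
		.msg_type    = RNDIS_MSG_PACKET,
		.msg_len     = (uint32_t)(sizeof(hdr) + len),
		.data_offset = sizeof(hdr) - 8,	/* same "- 8" as the driver */
		.data_len    = (uint32_t)len,
	};

	memcpy(out, &hdr, sizeof(hdr));
	memcpy(out + sizeof(hdr), eth, len);
	return sizeof(hdr) + len;
}

int main(void)
{
	uint8_t frame[64] = { 0xff, 0xff };	/* dummy Ethernet bytes */
	uint8_t out[128];
	size_t n = rndis_frame(out, frame, sizeof(frame));

	printf("RNDIS packet: %zu bytes (44-byte header + %zu payload)\n",
	       n, sizeof(frame));
	return 0;
}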
1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs_platform.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_dir2.h" #include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_rtalloc.h" #include "xfs_bmap.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_fsops.h" #include "xfs_icache.h" #include "xfs_sysfs.h" #include "xfs_rmap_btree.h" #include "xfs_refcount_btree.h" #include "xfs_reflink.h" #include "xfs_extent_busy.h" #include "xfs_health.h" #include "xfs_trace.h" #include "xfs_ag.h" #include "xfs_rtbitmap.h" #include "xfs_metafile.h" #include "xfs_rtgroup.h" #include "xfs_rtrmap_btree.h" #include "xfs_rtrefcount_btree.h" #include "scrub/stats.h" #include "xfs_zone_alloc.h" #include "xfs_healthmon.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); static DEFINE_XARRAY_ALLOC(xfs_uuid_table); static uuid_t * xfs_uuid_search( uuid_t *new_uuid) { unsigned long index = 0; uuid_t *uuid; xa_for_each(&xfs_uuid_table, index, uuid) { if (uuid_equal(uuid, new_uuid)) return uuid; } return NULL; } static void xfs_uuid_delete( uuid_t *uuid, unsigned int index) { ASSERT(uuid_equal(xa_load(&xfs_uuid_table, index), uuid)); xa_erase(&xfs_uuid_table, index); } void xfs_uuid_table_free(void) { ASSERT(xa_empty(&xfs_uuid_table)); xa_destroy(&xfs_uuid_table); } /* * See if the UUID is unique among mounted XFS filesystems. * Mount fails if UUID is nil or a FS with the same UUID is already mounted. */ STATIC int xfs_uuid_mount( struct xfs_mount *mp) { uuid_t *uuid = &mp->m_sb.sb_uuid; int ret; /* Publish UUID in struct super_block */ super_set_uuid(mp->m_super, uuid->b, sizeof(*uuid)); if (xfs_has_nouuid(mp)) return 0; if (uuid_is_null(uuid)) { xfs_warn(mp, "Filesystem has null UUID - can't mount"); return -EINVAL; } mutex_lock(&xfs_uuid_table_mutex); if (unlikely(xfs_uuid_search(uuid))) { xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid); mutex_unlock(&xfs_uuid_table_mutex); return -EINVAL; } ret = xa_alloc(&xfs_uuid_table, &mp->m_uuid_table_index, uuid, xa_limit_32b, GFP_KERNEL); mutex_unlock(&xfs_uuid_table_mutex); return ret; } STATIC void xfs_uuid_unmount( struct xfs_mount *mp) { uuid_t *uuid = &mp->m_sb.sb_uuid; if (xfs_has_nouuid(mp)) return; mutex_lock(&xfs_uuid_table_mutex); xfs_uuid_delete(uuid, mp->m_uuid_table_index); mutex_unlock(&xfs_uuid_table_mutex); } /* * Check size of device based on the (data/realtime) block count. * Note: this check is used by the growfs code as well as mount. */ int xfs_sb_validate_fsb_count( xfs_sb_t *sbp, uint64_t nblocks) { uint64_t max_bytes; ASSERT(sbp->sb_blocklog >= BBSHIFT); if (check_shl_overflow(nblocks, sbp->sb_blocklog, &max_bytes)) return -EFBIG; /* Limited by ULONG_MAX of page cache index */ if (max_bytes >> PAGE_SHIFT > ULONG_MAX) return -EFBIG; return 0; } /* * xfs_readsb * * Does the initial read of the superblock. 
*/ int xfs_readsb( struct xfs_mount *mp, int flags) { unsigned int sector_size; struct xfs_buf *bp; struct xfs_sb *sbp = &mp->m_sb; int error; int loud = !(flags & XFS_MFSI_QUIET); const struct xfs_buf_ops *buf_ops; ASSERT(mp->m_sb_bp == NULL); ASSERT(mp->m_ddev_targp != NULL); /* * In the first pass, use the device sector size to just read enough * of the superblock to extract the XFS sector size. * * The device sector size must be smaller than or equal to the XFS * sector size and thus we can always read the superblock. Once we know * the XFS sector size, re-read it and run the buffer verifier. */ sector_size = mp->m_ddev_targp->bt_logical_sectorsize; buf_ops = NULL; reread: error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), &bp, buf_ops); if (error) { if (loud) xfs_warn(mp, "SB validate failed with error %d.", error); /* bad CRC means corrupted metadata */ if (error == -EFSBADCRC) error = -EFSCORRUPTED; return error; } /* * Initialize the mount structure from the superblock. */ xfs_sb_from_disk(sbp, bp->b_addr); /* * If we haven't validated the superblock, do so now before we try * to check the sector size and reread the superblock appropriately. */ if (sbp->sb_magicnum != XFS_SB_MAGIC) { if (loud) xfs_warn(mp, "Invalid superblock magic number"); error = -EINVAL; goto release_buf; } /* * We must be able to do sector-sized and sector-aligned IO. */ if (sector_size > sbp->sb_sectsize) { if (loud) xfs_warn(mp, "device supports %u byte sectors (not %u)", sector_size, sbp->sb_sectsize); error = -ENOSYS; goto release_buf; } if (buf_ops == NULL) { /* * Re-read the superblock so the buffer is correctly sized, * and properly verified. */ xfs_buf_relse(bp); sector_size = sbp->sb_sectsize; buf_ops = loud ? &xfs_sb_buf_ops : &xfs_sb_quiet_buf_ops; goto reread; } mp->m_features |= xfs_sb_version_to_features(sbp); xfs_reinit_percpu_counters(mp); /* * If logged xattrs are enabled after log recovery finishes, then set * the opstate so that log recovery will work properly. */ if (xfs_sb_version_haslogxattrs(&mp->m_sb)) xfs_set_using_logged_xattrs(mp); /* no need to be quiet anymore, so reset the buf ops */ bp->b_ops = &xfs_sb_buf_ops; /* * Keep a pointer of the sb buffer around instead of caching it in the * buffer cache because we access it frequently. */ mp->m_sb_bp = bp; xfs_buf_unlock(bp); return 0; release_buf: xfs_buf_relse(bp); return error; } /* * If the sunit/swidth change would move the precomputed root inode value, we * must reject the ondisk change because repair will stumble over that. * However, we allow the mount to proceed because we never rejected this * combination before. Returns true to update the sb, false otherwise. */ static inline int xfs_check_new_dalign( struct xfs_mount *mp, int new_dalign, bool *update_sb) { struct xfs_sb *sbp = &mp->m_sb; xfs_ino_t calc_ino; calc_ino = xfs_ialloc_calc_rootino(mp, new_dalign); trace_xfs_check_new_dalign(mp, new_dalign, calc_ino); if (sbp->sb_rootino == calc_ino) { *update_sb = true; return 0; } xfs_warn(mp, "Cannot change stripe alignment; would require moving root inode."); /* * XXX: Next time we add a new incompat feature, this should start * returning -EINVAL to fail the mount. Until then, spit out a warning * that we're ignoring the administrator's instructions. 
*/ xfs_warn(mp, "Skipping superblock stripe alignment update."); *update_sb = false; return 0; } /* * If we were provided with new sunit/swidth values as mount options, make sure * that they pass basic alignment and superblock feature checks, and convert * them into the same units (FSB) that everything else expects. This step * /must/ be done before computing the inode geometry. */ STATIC int xfs_validate_new_dalign( struct xfs_mount *mp) { if (mp->m_dalign == 0) return 0; /* * If stripe unit and stripe width are not multiples * of the fs blocksize turn off alignment. */ if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || (BBTOB(mp->m_swidth) & mp->m_blockmask)) { xfs_warn(mp, "alignment check failed: sunit/swidth vs. blocksize(%d)", mp->m_sb.sb_blocksize); return -EINVAL; } /* * Convert the stripe unit and width to FSBs. */ mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) { xfs_warn(mp, "alignment check failed: sunit/swidth vs. agsize(%d)", mp->m_sb.sb_agblocks); return -EINVAL; } if (!mp->m_dalign) { xfs_warn(mp, "alignment check failed: sunit(%d) less than bsize(%d)", mp->m_dalign, mp->m_sb.sb_blocksize); return -EINVAL; } mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); if (!xfs_has_dalign(mp)) { xfs_warn(mp, "cannot change alignment: superblock does not support data alignment"); return -EINVAL; } return 0; } /* Update alignment values based on mount options and sb values. */ STATIC int xfs_update_alignment( struct xfs_mount *mp) { struct xfs_sb *sbp = &mp->m_sb; if (mp->m_dalign) { bool update_sb; int error; if (sbp->sb_unit == mp->m_dalign && sbp->sb_width == mp->m_swidth) return 0; error = xfs_check_new_dalign(mp, mp->m_dalign, &update_sb); if (error || !update_sb) return error; sbp->sb_unit = mp->m_dalign; sbp->sb_width = mp->m_swidth; mp->m_update_sb = true; } else if (!xfs_has_noalign(mp) && xfs_has_dalign(mp)) { mp->m_dalign = sbp->sb_unit; mp->m_swidth = sbp->sb_width; } return 0; } /* * precalculate the low space thresholds for dynamic speculative preallocation. */ void xfs_set_low_space_thresholds( struct xfs_mount *mp) { uint64_t dblocks = mp->m_sb.sb_dblocks; uint64_t rtexts = mp->m_sb.sb_rextents; int i; do_div(dblocks, 100); do_div(rtexts, 100); for (i = 0; i < XFS_LOWSP_MAX; i++) { mp->m_low_space[i] = dblocks * (i + 1); mp->m_low_rtexts[i] = rtexts * (i + 1); } } /* * Check that the data (and log if separate) is an ok size. */ STATIC int xfs_check_sizes( struct xfs_mount *mp) { struct xfs_buf *bp; xfs_daddr_t d; int error; d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { xfs_warn(mp, "filesystem size mismatch detected"); return -EFBIG; } error = xfs_buf_read_uncached(mp->m_ddev_targp, d - XFS_FSS_TO_BB(mp, 1), XFS_FSS_TO_BB(mp, 1), &bp, NULL); if (error) { xfs_warn(mp, "last sector read failed"); return error; } xfs_buf_relse(bp); if (mp->m_logdev_targp == mp->m_ddev_targp) return 0; d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { xfs_warn(mp, "log size mismatch detected"); return -EFBIG; } error = xfs_buf_read_uncached(mp->m_logdev_targp, d - XFS_FSB_TO_BB(mp, 1), XFS_FSB_TO_BB(mp, 1), &bp, NULL); if (error) { xfs_warn(mp, "log device read failed"); return error; } xfs_buf_relse(bp); return 0; } /* * Clear the quotaflags in memory and in the superblock. */ int xfs_mount_reset_sbqflags( struct xfs_mount *mp) { mp->m_qflags = 0; /* It is OK to look at sb_qflags in the mount path without m_sb_lock. 
*/ if (mp->m_sb.sb_qflags == 0) return 0; spin_lock(&mp->m_sb_lock); mp->m_sb.sb_qflags = 0; spin_unlock(&mp->m_sb_lock); if (!xfs_fs_writable(mp, SB_FREEZE_WRITE)) return 0; return xfs_sync_sb(mp, false); } static const char *const xfs_free_pool_name[] = { [XC_FREE_BLOCKS] = "free blocks", [XC_FREE_RTEXTENTS] = "free rt extents", [XC_FREE_RTAVAILABLE] = "available rt extents", }; uint64_t xfs_default_resblks( struct xfs_mount *mp, enum xfs_free_counter ctr) { switch (ctr) { case XC_FREE_BLOCKS: /* * Default to 5% or 8192 FSBs of space reserved, whichever is * smaller. * * This is intended to cover concurrent allocation transactions * when we initially hit ENOSPC. These each require a 4 block * reservation. Hence by default we cover roughly 2000 * concurrent allocation reservations. */ return min(div_u64(mp->m_sb.sb_dblocks, 20), 8192ULL); case XC_FREE_RTEXTENTS: case XC_FREE_RTAVAILABLE: if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp)) return xfs_zoned_default_resblks(mp, ctr); return 0; default: ASSERT(0); return 0; } } /* Ensure the summary counts are correct. */ STATIC int xfs_check_summary_counts( struct xfs_mount *mp) { int error = 0; /* * The AG0 superblock verifier rejects in-progress filesystems, * so we should never see the flag set this far into mounting. */ if (mp->m_sb.sb_inprogress) { xfs_err(mp, "sb_inprogress set after log recovery??"); WARN_ON(1); return -EFSCORRUPTED; } /* * Now the log is mounted, we know if it was an unclean shutdown or * not. If it was, with the first phase of recovery has completed, we * have consistent AG blocks on disk. We have not recovered EFIs yet, * but they are recovered transactionally in the second recovery phase * later. * * If the log was clean when we mounted, we can check the summary * counters. If any of them are obviously incorrect, we can recompute * them from the AGF headers in the next step. */ if (xfs_is_clean(mp) && (mp->m_sb.sb_fdblocks > mp->m_sb.sb_dblocks || !xfs_verify_icount(mp, mp->m_sb.sb_icount) || mp->m_sb.sb_ifree > mp->m_sb.sb_icount)) xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS); /* * We can safely re-initialise incore superblock counters from the * per-ag data. These may not be correct if the filesystem was not * cleanly unmounted, so we waited for recovery to finish before doing * this. * * If the filesystem was cleanly unmounted or the previous check did * not flag anything weird, then we can trust the values in the * superblock to be correct and we don't need to do anything here. * Otherwise, recalculate the summary counters. */ if ((xfs_has_lazysbcount(mp) && !xfs_is_clean(mp)) || xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) { error = xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount); if (error) return error; } /* * Older kernels misused sb_frextents to reflect both incore * reservations made by running transactions and the actual count of * free rt extents in the ondisk metadata. Transactions committed * during runtime can therefore contain a superblock update that * undercounts the number of free rt extents tracked in the rt bitmap. * A clean unmount record will have the correct frextents value since * there can be no other transactions running at that point. * * If we're mounting the rt volume after recovering the log, recompute * frextents from the rtbitmap file to fix the inconsistency. 
*/ if (xfs_has_realtime(mp) && !xfs_has_zoned(mp) && !xfs_is_clean(mp)) { error = xfs_rtalloc_reinit_frextents(mp); if (error) return error; } return 0; } static void xfs_unmount_check( struct xfs_mount *mp) { if (xfs_is_shutdown(mp)) return; if (percpu_counter_sum(&mp->m_ifree) > percpu_counter_sum(&mp->m_icount)) { xfs_alert(mp, "ifree/icount mismatch at unmount"); xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS); } } /* * Flush and reclaim dirty inodes in preparation for unmount. Inodes and * internal inode structures can be sitting in the CIL and AIL at this point, * so we need to unpin them, write them back and/or reclaim them before unmount * can proceed. In other words, callers are required to have inactivated all * inodes. * * An inode cluster that has been freed can have its buffer still pinned in * memory because the transaction is still sitting in a iclog. The stale inodes * on that buffer will be pinned to the buffer until the transaction hits the * disk and the callbacks run. Pushing the AIL will skip the stale inodes and * may never see the pinned buffer, so nothing will push out the iclog and * unpin the buffer. * * Hence we need to force the log to unpin everything first. However, log * forces don't wait for the discards they issue to complete, so we have to * explicitly wait for them to complete here as well. * * Then we can tell the world we are unmounting so that error handling knows * that the filesystem is going away and we should error out anything that we * have been retrying in the background. This will prevent never-ending * retries in AIL pushing from hanging the unmount. * * Stop inodegc and background reclaim before pushing the AIL so that they * are not running while the AIL is being flushed. Then push the AIL to * clean all the remaining dirty objects and reclaim the remaining inodes. */ static void xfs_unmount_flush_inodes( struct xfs_mount *mp) { xfs_log_force(mp, XFS_LOG_SYNC); xfs_extent_busy_wait_all(mp); flush_workqueue(xfs_discard_wq); xfs_set_unmounting(mp); xfs_inodegc_stop(mp); cancel_delayed_work_sync(&mp->m_reclaim_work); xfs_ail_push_all_sync(mp->m_ail); xfs_reclaim_inodes(mp); xfs_health_unmount(mp); xfs_healthmon_unmount(mp); } static void xfs_mount_setup_inode_geom( struct xfs_mount *mp) { struct xfs_ino_geometry *igeo = M_IGEO(mp); igeo->attr_fork_offset = xfs_bmap_compute_attr_offset(mp); ASSERT(igeo->attr_fork_offset < XFS_LITINO(mp)); xfs_ialloc_setup_geometry(mp); } /* Mount the metadata directory tree root. */ STATIC int xfs_mount_setup_metadir( struct xfs_mount *mp) { int error; /* Load the metadata directory root inode into memory. */ error = xfs_metafile_iget(mp, mp->m_sb.sb_metadirino, XFS_METAFILE_DIR, &mp->m_metadirip); if (error) xfs_warn(mp, "Failed to load metadir root directory, error %d", error); return error; } /* Compute maximum possible height for per-AG btree types for this fs. */ static inline void xfs_agbtree_compute_maxlevels( struct xfs_mount *mp) { unsigned int levels; levels = max(mp->m_alloc_maxlevels, M_IGEO(mp)->inobt_maxlevels); levels = max(levels, mp->m_rmap_maxlevels); mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels); } /* Maximum atomic write IO size that the kernel allows. 
*/ static inline xfs_extlen_t xfs_calc_atomic_write_max(struct xfs_mount *mp) { return rounddown_pow_of_two(XFS_B_TO_FSB(mp, MAX_RW_COUNT)); } /* * If the underlying device advertises atomic write support, limit the size of * atomic writes to the greatest power-of-two factor of the group size so * that every atomic write unit aligns with the start of every group. This is * required so that the allocations for an atomic write will always be * aligned compatibly with the alignment requirements of the storage. * * If the device doesn't advertise atomic writes, then there are no alignment * restrictions and the largest out-of-place write we can do ourselves is the * number of blocks that user files can allocate from any group. */ static xfs_extlen_t xfs_calc_group_awu_max( struct xfs_mount *mp, enum xfs_group_type type) { struct xfs_groups *g = &mp->m_groups[type]; struct xfs_buftarg *btp = xfs_group_type_buftarg(mp, type); if (g->blocks == 0) return 0; if (btp && btp->bt_awu_min > 0) return max_pow_of_two_factor(g->blocks); return rounddown_pow_of_two(g->blocks); } /* Compute the maximum atomic write unit size for each section. */ static inline void xfs_calc_atomic_write_unit_max( struct xfs_mount *mp, enum xfs_group_type type) { struct xfs_groups *g = &mp->m_groups[type]; const xfs_extlen_t max_write = xfs_calc_atomic_write_max(mp); const xfs_extlen_t max_ioend = xfs_reflink_max_atomic_cow(mp); const xfs_extlen_t max_gsize = xfs_calc_group_awu_max(mp, type); g->awu_max = min3(max_write, max_ioend, max_gsize); trace_xfs_calc_atomic_write_unit_max(mp, type, max_write, max_ioend, max_gsize, g->awu_max); } /* * Try to set the atomic write maximum to a new value that we got from * userspace via mount option. */ int xfs_set_max_atomic_write_opt( struct xfs_mount *mp, unsigned long long new_max_bytes) { const xfs_filblks_t new_max_fsbs = XFS_B_TO_FSBT(mp, new_max_bytes); const xfs_extlen_t max_write = xfs_calc_atomic_write_max(mp); const xfs_extlen_t max_group = max(mp->m_groups[XG_TYPE_AG].blocks, mp->m_groups[XG_TYPE_RTG].blocks); const xfs_extlen_t max_group_write = max(xfs_calc_group_awu_max(mp, XG_TYPE_AG), xfs_calc_group_awu_max(mp, XG_TYPE_RTG)); int error; if (new_max_bytes == 0) goto set_limit; ASSERT(max_write <= U32_MAX); /* generic_atomic_write_valid enforces power of two length */ if (!is_power_of_2(new_max_bytes)) { xfs_warn(mp, "max atomic write size of %llu bytes is not a power of 2", new_max_bytes); return -EINVAL; } if (new_max_bytes & mp->m_blockmask) { xfs_warn(mp, "max atomic write size of %llu bytes not aligned with fsblock", new_max_bytes); return -EINVAL; } if (new_max_fsbs > max_write) { xfs_warn(mp, "max atomic write size of %lluk cannot be larger than max write size %lluk", new_max_bytes >> 10, XFS_FSB_TO_B(mp, max_write) >> 10); return -EINVAL; } if (new_max_fsbs > max_group) { xfs_warn(mp, "max atomic write size of %lluk cannot be larger than allocation group size %lluk", new_max_bytes >> 10, XFS_FSB_TO_B(mp, max_group) >> 10); return -EINVAL; } if (new_max_fsbs > max_group_write) { xfs_warn(mp, "max atomic write size of %lluk cannot be larger than max allocation group write size %lluk", new_max_bytes >> 10, XFS_FSB_TO_B(mp, max_group_write) >> 10); return -EINVAL; } if (xfs_has_reflink(mp)) goto set_limit; if (new_max_fsbs == 1) { if (mp->m_ddev_targp->bt_awu_max || (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_awu_max)) { } else { xfs_warn(mp, "cannot support atomic writes of size %lluk with no reflink or HW support", new_max_bytes >> 10); return -EINVAL; } } else { 
xfs_warn(mp, "cannot support atomic writes of size %lluk with no reflink support", new_max_bytes >> 10); return -EINVAL; } set_limit: error = xfs_calc_atomic_write_reservation(mp, new_max_fsbs); if (error) { xfs_warn(mp, "cannot support completing atomic writes of %lluk", new_max_bytes >> 10); return error; } xfs_calc_atomic_write_unit_max(mp, XG_TYPE_AG); xfs_calc_atomic_write_unit_max(mp, XG_TYPE_RTG); mp->m_awu_max_bytes = new_max_bytes; return 0; } /* Compute maximum possible height for realtime btree types for this fs. */ static inline void xfs_rtbtree_compute_maxlevels( struct xfs_mount *mp) { mp->m_rtbtree_maxlevels = max(mp->m_rtrmap_maxlevels, mp->m_rtrefc_maxlevels); } /* * This function does the following on an initial mount of a file system: * - reads the superblock from disk and init the mount struct * - if we're a 32-bit kernel, do a size check on the superblock * so we don't mount terabyte filesystems * - init mount struct realtime fields * - allocate inode hash table for fs * - init directory manager * - perform recovery and init the log manager */ int xfs_mountfs( struct xfs_mount *mp) { struct xfs_sb *sbp = &(mp->m_sb); struct xfs_inode *rip; struct xfs_ino_geometry *igeo = M_IGEO(mp); uint quotamount = 0; uint quotaflags = 0; int error = 0; int i; xfs_sb_mount_common(mp, sbp); /* * Check for a mismatched features2 values. Older kernels read & wrote * into the wrong sb offset for sb_features2 on some platforms due to * xfs_sb_t not being 64bit size aligned when sb_features2 was added, * which made older superblock reading/writing routines swap it as a * 64-bit value. * * For backwards compatibility, we make both slots equal. * * If we detect a mismatched field, we OR the set bits into the existing * features2 field in case it has already been modified; we don't want * to lose any features. We then update the bad location with the ORed * value so that older kernels will see any features2 flags. The * superblock writeback code ensures the new sb_features2 is copied to * sb_bad_features2 before it is logged or written to disk. */ if (xfs_sb_has_mismatched_features2(sbp)) { xfs_warn(mp, "correcting sb_features alignment problem"); sbp->sb_features2 |= sbp->sb_bad_features2; mp->m_update_sb = true; } /* always use v2 inodes by default now */ if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) { mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT; mp->m_features |= XFS_FEAT_NLINK; mp->m_update_sb = true; } /* * If we were given new sunit/swidth options, do some basic validation * checks and convert the incore dalign and swidth values to the * same units (FSB) that everything else uses. This /must/ happen * before computing the inode geometry. */ error = xfs_validate_new_dalign(mp); if (error) goto out; xfs_alloc_compute_maxlevels(mp); xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK); xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK); xfs_mount_setup_inode_geom(mp); xfs_rmapbt_compute_maxlevels(mp); xfs_rtrmapbt_compute_maxlevels(mp); xfs_refcountbt_compute_maxlevels(mp); xfs_rtrefcountbt_compute_maxlevels(mp); xfs_agbtree_compute_maxlevels(mp); xfs_rtbtree_compute_maxlevels(mp); /* * Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks * is NOT aligned turn off m_dalign since allocator alignment is within * an ag, therefore ag has to be aligned at stripe boundary. Note that * we must compute the free space and rmap btree geometry before doing * this. 
*/ error = xfs_update_alignment(mp); if (error) goto out; /* enable fail_at_unmount as default */ mp->m_fail_unmount = true; error = xfs_mount_sysfs_init(mp); if (error) goto out_remove_scrub_stats; xchk_stats_register(mp->m_scrub_stats, mp->m_debugfs); error = xfs_errortag_init(mp); if (error) goto out_remove_sysfs; error = xfs_uuid_mount(mp); if (error) goto out_remove_errortag; /* * Update the preferred write size based on the information from the * on-disk superblock. */ mp->m_allocsize_log = max_t(uint32_t, sbp->sb_blocklog, mp->m_allocsize_log); mp->m_allocsize_blocks = 1U << (mp->m_allocsize_log - sbp->sb_blocklog); /* set the low space thresholds for dynamic preallocation */ xfs_set_low_space_thresholds(mp); /* * If enabled, sparse inode chunk alignment is expected to match the * cluster size. Full inode chunk alignment must match the chunk size, * but that is checked on sb read verification... */ if (xfs_has_sparseinodes(mp) && mp->m_sb.sb_spino_align != XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw)) { xfs_warn(mp, "Sparse inode block alignment (%u) must match cluster size (%llu).", mp->m_sb.sb_spino_align, XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw)); error = -EINVAL; goto out_remove_uuid; } /* * Check that the data (and log if separate) is an ok size. */ error = xfs_check_sizes(mp); if (error) goto out_remove_uuid; /* * Initialize realtime fields in the mount structure */ error = xfs_rtmount_init(mp); if (error) { xfs_warn(mp, "RT mount failed"); goto out_remove_uuid; } /* * Copies the low order bits of the timestamp and the randomly * set "sequence" number out of a UUID. */ mp->m_fixedfsid[0] = (get_unaligned_be16(&sbp->sb_uuid.b[8]) << 16) | get_unaligned_be16(&sbp->sb_uuid.b[4]); mp->m_fixedfsid[1] = get_unaligned_be32(&sbp->sb_uuid.b[0]); error = xfs_da_mount(mp); if (error) { xfs_warn(mp, "Failed dir/attr init: %d", error); goto out_remove_uuid; } /* * Initialize the precomputed transaction reservations values. */ xfs_trans_init(mp); /* * Allocate and initialize the per-ag data. */ error = xfs_initialize_perag(mp, 0, sbp->sb_agcount, mp->m_sb.sb_dblocks, &mp->m_maxagi); if (error) { xfs_warn(mp, "Failed per-ag init: %d", error); goto out_free_dir; } error = xfs_initialize_rtgroups(mp, 0, sbp->sb_rgcount, mp->m_sb.sb_rextents); if (error) { xfs_warn(mp, "Failed rtgroup init: %d", error); goto out_free_perag; } if (XFS_IS_CORRUPT(mp, !sbp->sb_logblocks)) { xfs_warn(mp, "no log defined"); error = -EFSCORRUPTED; goto out_free_rtgroup; } error = xfs_inodegc_register_shrinker(mp); if (error) goto out_fail_wait; /* * If we're resuming quota status, pick up the preliminary qflags from * the ondisk superblock so that we know if we should recover dquots. */ if (xfs_is_resuming_quotaon(mp)) xfs_qm_resume_quotaon(mp); /* * Log's mount-time initialization. The first part of recovery can place * some items on the AIL, to be handled when recovery is finished or * cancelled. */ error = xfs_log_mount(mp, mp->m_logdev_targp, XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); if (error) { xfs_warn(mp, "log mount failed"); goto out_inodegc_shrinker; } /* * If we're resuming quota status and recovered the log, re-sample the * qflags from the ondisk superblock now that we've recovered it, just * in case someone shut down enforcement just before a crash. */ if (xfs_clear_resuming_quotaon(mp) && xlog_recovery_needed(mp->m_log)) xfs_qm_resume_quotaon(mp); /* * If logged xattrs are still enabled after log recovery finishes, then * they'll be available until unmount. 
Otherwise, turn them off. */ if (xfs_sb_version_haslogxattrs(&mp->m_sb)) xfs_set_using_logged_xattrs(mp); else xfs_clear_using_logged_xattrs(mp); /* Enable background inode inactivation workers. */ xfs_inodegc_start(mp); xfs_blockgc_start(mp); if (xfs_has_metadir(mp)) { error = xfs_mount_setup_metadir(mp); if (error) goto out_free_metadir; } /* * Get and sanity-check the root inode. * Save the pointer to it in the mount structure. */ error = xfs_iget(mp, NULL, sbp->sb_rootino, XFS_IGET_UNTRUSTED, XFS_ILOCK_EXCL, &rip); if (error) { xfs_warn(mp, "Failed to read root inode 0x%llx, error %d", sbp->sb_rootino, -error); goto out_free_metadir; } ASSERT(rip != NULL); if (XFS_IS_CORRUPT(mp, !S_ISDIR(VFS_I(rip)->i_mode))) { xfs_warn(mp, "corrupted root inode %llu: not a directory", (unsigned long long)rip->i_ino); xfs_iunlock(rip, XFS_ILOCK_EXCL); error = -EFSCORRUPTED; goto out_rele_rip; } mp->m_rootip = rip; /* save it */ xfs_iunlock(rip, XFS_ILOCK_EXCL); /* * Initialize realtime inode pointers in the mount structure */ error = xfs_rtmount_inodes(mp); if (error) { /* * Free up the root inode. */ xfs_warn(mp, "failed to read RT inodes"); goto out_rele_rip; } /* Make sure the summary counts are ok. */ error = xfs_check_summary_counts(mp); if (error) goto out_rtunmount; /* * If this is a read-only mount defer the superblock updates until * the next remount into writeable mode. Otherwise we would never * perform the update e.g. for the root filesystem. */ if (mp->m_update_sb && !xfs_is_readonly(mp)) { error = xfs_sync_sb(mp, false); if (error) { xfs_warn(mp, "failed to write sb changes"); goto out_rtunmount; } } /* * Initialise the XFS quota management subsystem for this mount */ if (XFS_IS_QUOTA_ON(mp)) { error = xfs_qm_newmount(mp, "amount, "aflags); if (error) goto out_rtunmount; } else { /* * If a file system had quotas running earlier, but decided to * mount without -o uquota/pquota/gquota options, revoke the * quotachecked license. */ if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) { xfs_notice(mp, "resetting quota flags"); error = xfs_mount_reset_sbqflags(mp); if (error) goto out_rtunmount; } } /* * Finish recovering the file system. This part needed to be delayed * until after the root and real-time bitmap inodes were consistently * read in. Temporarily create per-AG space reservations for metadata * btree shape changes because space freeing transactions (for inode * inactivation) require the per-AG reservation in lieu of reserving * blocks. */ error = xfs_fs_reserve_ag_blocks(mp); if (error && error == -ENOSPC) xfs_warn(mp, "ENOSPC reserving per-AG metadata pool, log recovery may fail."); error = xfs_log_mount_finish(mp); xfs_fs_unreserve_ag_blocks(mp); if (error) { xfs_warn(mp, "log mount finish failed"); goto out_rtunmount; } /* * Now the log is fully replayed, we can transition to full read-only * mode for read-only mounts. This will sync all the metadata and clean * the log so that the recovery we just performed does not have to be * replayed again on the next mount. * * We use the same quiesce mechanism as the rw->ro remount, as they are * semantically identical operations. */ if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp)) xfs_log_clean(mp); if (xfs_has_zoned(mp)) { error = xfs_mount_zones(mp); if (error) goto out_rtunmount; } /* * Complete the quota initialisation, post-log-replay component. 
*/ if (quotamount) { ASSERT(mp->m_qflags == 0); mp->m_qflags = quotaflags; xfs_qm_mount_quotas(mp); } /* * Now we are mounted, reserve a small amount of unused space for * privileged transactions. This is needed so that transaction * space required for critical operations can dip into this pool * when at ENOSPC. This is needed for operations like create with * attr, unwritten extent conversion at ENOSPC, garbage collection * etc. Data allocations are not allowed to use this reserved space. * * This may drive us straight to ENOSPC on mount, but that implies * we were already there on the last unmount. Warn if this occurs. */ if (!xfs_is_readonly(mp)) { for (i = 0; i < XC_FREE_NR; i++) { error = xfs_reserve_blocks(mp, i, xfs_default_resblks(mp, i)); if (error) xfs_warn(mp, "Unable to allocate reserve blocks. Continuing without reserve pool for %s.", xfs_free_pool_name[i]); } /* Reserve AG blocks for future btree expansion. */ error = xfs_fs_reserve_ag_blocks(mp); if (error && error != -ENOSPC) goto out_agresv; xfs_zone_gc_start(mp); } /* * Pre-calculate atomic write unit max. This involves computations * derived from transaction reservations, so we must do this after the * log is fully initialized. */ error = xfs_set_max_atomic_write_opt(mp, mp->m_awu_max_bytes); if (error) goto out_agresv; return 0; out_agresv: xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); if (xfs_has_zoned(mp)) xfs_unmount_zones(mp); out_rtunmount: xfs_rtunmount_inodes(mp); out_rele_rip: xfs_irele(rip); /* Clean out dquots that might be in memory after quotacheck. */ xfs_qm_unmount(mp); out_free_metadir: if (mp->m_metadirip) xfs_irele(mp->m_metadirip); /* * Inactivate all inodes that might still be in memory after a log * intent recovery failure so that reclaim can free them. Metadata * inodes and the root directory shouldn't need inactivation, but the * mount failed for some reason, so pull down all the state and flee. */ xfs_inodegc_flush(mp); /* * Flush all inode reclamation work and flush the log. * We have to do this /after/ rtunmount and qm_unmount because those * two will have scheduled delayed reclaim for the rt/quota inodes. * * This is slightly different from the unmountfs call sequence * because we could be tearing down a partially set up mount. In * particular, if log_mount_finish fails we bail out without calling * qm_unmount_quotas and therefore rely on qm_unmount to release the * quota inodes. */ xfs_unmount_flush_inodes(mp); xfs_log_mount_cancel(mp); out_inodegc_shrinker: shrinker_free(mp->m_inodegc_shrinker); out_fail_wait: if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) xfs_buftarg_drain(mp->m_logdev_targp); xfs_buftarg_drain(mp->m_ddev_targp); out_free_rtgroup: xfs_free_rtgroups(mp, 0, mp->m_sb.sb_rgcount); out_free_perag: xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount); out_free_dir: xfs_da_unmount(mp); out_remove_uuid: xfs_uuid_unmount(mp); out_remove_errortag: xfs_errortag_del(mp); out_remove_sysfs: xfs_mount_sysfs_del(mp); out_remove_scrub_stats: xchk_stats_unregister(mp->m_scrub_stats); out: return error; } /* * This flushes out the inodes,dquots and the superblock, unmounts the * log and makes sure that incore structures are freed. */ void xfs_unmountfs( struct xfs_mount *mp) { int error; /* * Perform all on-disk metadata updates required to inactivate inodes * that the VFS evicted earlier in the unmount process. 
Freeing inodes * and discarding CoW fork preallocations can cause shape changes to * the free inode and refcount btrees, respectively, so we must finish * this before we discard the metadata space reservations. Metadata * inodes and the root directory do not require inactivation. */ xfs_inodegc_flush(mp); xfs_blockgc_stop(mp); if (!test_bit(XFS_OPSTATE_READONLY, &mp->m_opstate)) xfs_zone_gc_stop(mp); xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); if (xfs_has_zoned(mp)) xfs_unmount_zones(mp); xfs_rtunmount_inodes(mp); xfs_irele(mp->m_rootip); if (mp->m_metadirip) xfs_irele(mp->m_metadirip); xfs_unmount_flush_inodes(mp); xfs_qm_unmount(mp); /* * Unreserve any blocks we have so that when we unmount we don't account * the reserved free space as used. This is really only necessary for * lazy superblock counting because it trusts the incore superblock * counters to be absolutely correct on clean unmount. * * We don't bother correcting this elsewhere for lazy superblock * counting because on mount of an unclean filesystem we reconstruct the * correct counter value and this is irrelevant. * * For non-lazy counter filesystems, this doesn't matter at all because * we only every apply deltas to the superblock and hence the incore * value does not matter.... */ error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, 0); if (error) xfs_warn(mp, "Unable to free reserved block pool. " "Freespace may not be correct on next mount."); xfs_unmount_check(mp); /* * Indicate that it's ok to clear log incompat bits before cleaning * the log and writing the unmount record. */ xfs_set_done_with_log_incompat(mp); xfs_log_unmount(mp); xfs_da_unmount(mp); xfs_uuid_unmount(mp); #if defined(DEBUG) xfs_errortag_clearall(mp); #endif shrinker_free(mp->m_inodegc_shrinker); xfs_free_rtgroups(mp, 0, mp->m_sb.sb_rgcount); xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount); xfs_errortag_del(mp); xchk_stats_unregister(mp->m_scrub_stats); xfs_mount_sysfs_del(mp); } /* * Determine whether modifications can proceed. The caller specifies the minimum * freeze level for which modifications should not be allowed. This allows * certain operations to proceed while the freeze sequence is in progress, if * necessary. */ bool xfs_fs_writable( struct xfs_mount *mp, int level) { ASSERT(level > SB_UNFROZEN); if ((mp->m_super->s_writers.frozen >= level) || xfs_is_shutdown(mp) || xfs_is_readonly(mp)) return false; return true; } /* * Estimate the amount of free space that is not available to userspace and is * not explicitly reserved from the incore fdblocks. This includes: * * - The minimum number of blocks needed to support splitting a bmap btree * - The blocks currently in use by the freespace btrees because they record * the actual blocks that will fill per-AG metadata space reservations */ uint64_t xfs_freecounter_unavailable( struct xfs_mount *mp, enum xfs_free_counter ctr) { if (ctr != XC_FREE_BLOCKS) return 0; return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); } void xfs_add_freecounter( struct xfs_mount *mp, enum xfs_free_counter ctr, uint64_t delta) { struct xfs_freecounter *counter = &mp->m_free[ctr]; uint64_t res_used; /* * If the reserve pool is depleted, put blocks back into it first. * Most of the time the pool is full. 
*/ if (likely(counter->res_avail == counter->res_total)) { percpu_counter_add(&counter->count, delta); return; } spin_lock(&mp->m_sb_lock); res_used = counter->res_total - counter->res_avail; if (res_used > delta) { counter->res_avail += delta; } else { delta -= res_used; counter->res_avail = counter->res_total; percpu_counter_add(&counter->count, delta); } spin_unlock(&mp->m_sb_lock); } /* Adjust in-core free blocks or RT extents. */ int xfs_dec_freecounter( struct xfs_mount *mp, enum xfs_free_counter ctr, uint64_t delta, bool rsvd) { struct xfs_freecounter *counter = &mp->m_free[ctr]; s32 batch; ASSERT(ctr < XC_FREE_NR); /* * Taking blocks away, need to be more accurate the closer we * are to zero. * * If the counter has a value of less than 2 * max batch size, * then make everything serialise as we are real close to * ENOSPC. */ if (__percpu_counter_compare(&counter->count, 2 * XFS_FDBLOCKS_BATCH, XFS_FDBLOCKS_BATCH) < 0) batch = 1; else batch = XFS_FDBLOCKS_BATCH; /* * Set aside allocbt blocks because these blocks are tracked as free * space but not available for allocation. Technically this means that a * single reservation cannot consume all remaining free space, but the * ratio of allocbt blocks to usable free blocks should be rather small. * The tradeoff without this is that filesystems that maintain high * perag block reservations can over reserve physical block availability * and fail physical allocation, which leads to much more serious * problems (i.e. transaction abort, pagecache discards, etc.) than * slightly premature -ENOSPC. */ percpu_counter_add_batch(&counter->count, -((int64_t)delta), batch); if (__percpu_counter_compare(&counter->count, xfs_freecounter_unavailable(mp, ctr), XFS_FDBLOCKS_BATCH) < 0) { /* * Lock up the sb for dipping into reserves before releasing the * space that took us to ENOSPC. */ spin_lock(&mp->m_sb_lock); percpu_counter_add(&counter->count, delta); if (!rsvd) goto fdblocks_enospc; if (delta > counter->res_avail) { if (ctr == XC_FREE_BLOCKS) xfs_warn_once(mp, "Reserve blocks depleted! Consider increasing reserve pool size."); goto fdblocks_enospc; } counter->res_avail -= delta; trace_xfs_freecounter_reserved(mp, ctr, delta, _RET_IP_); spin_unlock(&mp->m_sb_lock); } /* we had space! */ return 0; fdblocks_enospc: trace_xfs_freecounter_enospc(mp, ctr, delta, _RET_IP_); spin_unlock(&mp->m_sb_lock); return -ENOSPC; } /* * Used to free the superblock along various error paths. */ void xfs_freesb( struct xfs_mount *mp) { struct xfs_buf *bp = mp->m_sb_bp; xfs_buf_lock(bp); mp->m_sb_bp = NULL; xfs_buf_relse(bp); } /* * If the underlying (data/log/rt) device is readonly, there are some * operations that cannot proceed. */ int xfs_dev_is_read_only( struct xfs_mount *mp, char *message) { if (xfs_readonly_buftarg(mp->m_ddev_targp) || xfs_readonly_buftarg(mp->m_logdev_targp) || (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { xfs_notice(mp, "%s required on read-only device.", message); xfs_notice(mp, "write access unavailable, cannot proceed."); return -EROFS; } return 0; } /* Force the summary counters to be recalculated at next mount. */ void xfs_force_summary_recalc( struct xfs_mount *mp) { if (!xfs_has_lazysbcount(mp)) return; xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS); } /* * Enable a log incompat feature flag in the primary superblock. The caller * cannot have any other transactions in progress. 
*/ int xfs_add_incompat_log_feature( struct xfs_mount *mp, uint32_t feature) { struct xfs_dsb *dsb; int error; ASSERT(hweight32(feature) == 1); ASSERT(!(feature & XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)); /* * Force the log to disk and kick the background AIL thread to reduce * the chances that the bwrite will stall waiting for the AIL to unpin * the primary superblock buffer. This isn't a data integrity * operation, so we don't need a synchronous push. */ error = xfs_log_force(mp, XFS_LOG_SYNC); if (error) return error; xfs_ail_push_all(mp->m_ail); /* * Lock the primary superblock buffer to serialize all callers that * are trying to set feature bits. */ xfs_buf_lock(mp->m_sb_bp); xfs_buf_hold(mp->m_sb_bp); if (xfs_is_shutdown(mp)) { error = -EIO; goto rele; } if (xfs_sb_has_incompat_log_feature(&mp->m_sb, feature)) goto rele; /* * Write the primary superblock to disk immediately, because we need * the log_incompat bit to be set in the primary super now to protect * the log items that we're going to commit later. */ dsb = mp->m_sb_bp->b_addr; xfs_sb_to_disk(dsb, &mp->m_sb); dsb->sb_features_log_incompat |= cpu_to_be32(feature); error = xfs_bwrite(mp->m_sb_bp); if (error) goto shutdown; /* * Add the feature bits to the incore superblock before we unlock the * buffer. */ xfs_sb_add_incompat_log_features(&mp->m_sb, feature); xfs_buf_relse(mp->m_sb_bp); /* Log the superblock to disk. */ return xfs_sync_sb(mp, false); shutdown: xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); rele: xfs_buf_relse(mp->m_sb_bp); return error; } /* * Clear all the log incompat flags from the superblock. * * The caller cannot be in a transaction, must ensure that the log does not * contain any log items protected by any log incompat bit, and must ensure * that there are no other threads that depend on the state of the log incompat * feature flags in the primary super. * * Returns true if the superblock is dirty. */ bool xfs_clear_incompat_log_features( struct xfs_mount *mp) { bool ret = false; if (!xfs_has_crc(mp) || !xfs_sb_has_incompat_log_feature(&mp->m_sb, XFS_SB_FEAT_INCOMPAT_LOG_ALL) || xfs_is_shutdown(mp) || !xfs_is_done_with_log_incompat(mp)) return false; /* * Update the incore superblock. We synchronize on the primary super * buffer lock to be consistent with the add function, though at least * in theory this shouldn't be necessary. */ xfs_buf_lock(mp->m_sb_bp); xfs_buf_hold(mp->m_sb_bp); if (xfs_sb_has_incompat_log_feature(&mp->m_sb, XFS_SB_FEAT_INCOMPAT_LOG_ALL)) { xfs_sb_remove_incompat_log_features(&mp->m_sb); ret = true; } xfs_buf_relse(mp->m_sb_bp); return ret; } /* * Update the in-core delayed block counter. * * We prefer to update the counter without having to take a spinlock for every * counter update (i.e. batching). Each change to delayed allocation * reservations can change can easily exceed the default percpu counter * batching, so we use a larger batch factor here. * * Note that we don't currently have any callers requiring fast summation * (e.g. percpu_counter_read) so we can use a big batch value here. */ #define XFS_DELALLOC_BATCH (4096) void xfs_mod_delalloc( struct xfs_inode *ip, int64_t data_delta, int64_t ind_delta) { struct xfs_mount *mp = ip->i_mount; if (XFS_IS_REALTIME_INODE(ip)) { percpu_counter_add_batch(&mp->m_delalloc_rtextents, xfs_blen_to_rtbxlen(mp, data_delta), XFS_DELALLOC_BATCH); if (!ind_delta) return; data_delta = 0; } percpu_counter_add_batch(&mp->m_delalloc_blks, data_delta + ind_delta, XFS_DELALLOC_BATCH); } |
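/*
 * Editor's illustrative note (not part of the original source): a worked
 * example of the XC_FREE_BLOCKS default computed by xfs_default_resblks()
 * above, which reserves min(sb_dblocks / 20, 8192) filesystem blocks.
 * Assuming a hypothetical 1 TiB data device with 4 KiB blocks:
 *
 *	sb_dblocks      = 1 TiB / 4 KiB = 268435456 blocks
 *	sb_dblocks / 20 = 13421772 blocks (5%)
 *	min(..., 8192)  = 8192 blocks  ->  32 MiB reserve pool
 *
 * So on all but very small filesystems the default pool is capped at 8192
 * blocks, enough for roughly 2000 concurrent 4-block allocation
 * reservations, as the comment in xfs_default_resblks() notes.
 */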
// SPDX-License-Identifier: GPL-2.0-only
/*
 * This file contains functions assisting in mapping VFS to 9P2000
 *
 * Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com>
 * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/cred.h>
#include <linux/fs_parser.h>
#include <linux/fs_context.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
#include <net/9p/transport.h>

#include "v9fs.h"
#include "v9fs_vfs.h" #include "cache.h" static DEFINE_SPINLOCK(v9fs_sessionlist_lock); static LIST_HEAD(v9fs_sessionlist); struct kmem_cache *v9fs_inode_cache; /* * Option Parsing (code inspired by NFS code) * NOTE: each transport will parse its own options */ enum { /* Mount-point source, we need to handle this explicitly because * the code below accepts unknown args and the vfs layer only handles * source if we rejected it as EINVAL */ Opt_source, /* Options that take integer arguments */ Opt_debug, Opt_dfltuid, Opt_dfltgid, Opt_afid, /* String options */ Opt_uname, Opt_remotename, Opt_cache, Opt_cachetag, /* Options that take no arguments */ Opt_nodevmap, Opt_noxattr, Opt_directio, Opt_ignoreqv, /* Access options */ Opt_access, Opt_posixacl, /* Lock timeout option */ Opt_locktimeout, /* Client options */ Opt_msize, Opt_trans, Opt_legacy, Opt_version, /* fd transport options */ /* Options that take integer arguments */ Opt_rfdno, Opt_wfdno, /* Options that take no arguments */ /* rdma transport options */ /* Options that take integer arguments */ Opt_rq_depth, Opt_sq_depth, Opt_timeout, /* Options for both fd and rdma transports */ Opt_port, Opt_privport, }; static const struct constant_table p9_versions[] = { { "9p2000", p9_proto_legacy }, { "9p2000.u", p9_proto_2000u }, { "9p2000.L", p9_proto_2000L }, {} }; /* * This structure contains all parameters used for the core code, * the client, and all the transports. */ const struct fs_parameter_spec v9fs_param_spec[] = { fsparam_string ("source", Opt_source), fsparam_u32hex ("debug", Opt_debug), fsparam_uid ("dfltuid", Opt_dfltuid), fsparam_gid ("dfltgid", Opt_dfltgid), fsparam_u32 ("afid", Opt_afid), fsparam_string ("uname", Opt_uname), fsparam_string ("aname", Opt_remotename), fsparam_flag ("nodevmap", Opt_nodevmap), fsparam_flag ("noxattr", Opt_noxattr), fsparam_flag ("directio", Opt_directio), fsparam_flag ("ignoreqv", Opt_ignoreqv), fsparam_string ("cache", Opt_cache), fsparam_string ("cachetag", Opt_cachetag), fsparam_string ("access", Opt_access), fsparam_flag ("posixacl", Opt_posixacl), fsparam_u32 ("locktimeout", Opt_locktimeout), /* client options */ fsparam_u32 ("msize", Opt_msize), fsparam_flag ("noextend", Opt_legacy), fsparam_string ("trans", Opt_trans), fsparam_enum ("version", Opt_version, p9_versions), /* fd transport options */ fsparam_u32 ("rfdno", Opt_rfdno), fsparam_u32 ("wfdno", Opt_wfdno), /* rdma transport options */ fsparam_u32 ("sq", Opt_sq_depth), fsparam_u32 ("rq", Opt_rq_depth), fsparam_u32 ("timeout", Opt_timeout), /* fd and rdma transprt options */ fsparam_u32 ("port", Opt_port), fsparam_flag ("privport", Opt_privport), {} }; /* Interpret mount options for cache mode */ static int get_cache_mode(char *s) { int version = -EINVAL; if (!strcmp(s, "loose")) { version = CACHE_SC_LOOSE; p9_debug(P9_DEBUG_9P, "Cache mode: loose\n"); } else if (!strcmp(s, "fscache")) { version = CACHE_SC_FSCACHE; p9_debug(P9_DEBUG_9P, "Cache mode: fscache\n"); } else if (!strcmp(s, "mmap")) { version = CACHE_SC_MMAP; p9_debug(P9_DEBUG_9P, "Cache mode: mmap\n"); } else if (!strcmp(s, "readahead")) { version = CACHE_SC_READAHEAD; p9_debug(P9_DEBUG_9P, "Cache mode: readahead\n"); } else if (!strcmp(s, "none")) { version = CACHE_SC_NONE; p9_debug(P9_DEBUG_9P, "Cache mode: none\n"); } else if (kstrtoint(s, 0, &version) != 0) { version = -EINVAL; pr_info("Unknown Cache mode or invalid value %s\n", s); } return version; } /* * Display the mount options in /proc/mounts. 
*/ int v9fs_show_options(struct seq_file *m, struct dentry *root) { struct v9fs_session_info *v9ses = root->d_sb->s_fs_info; if (v9ses->debug) seq_printf(m, ",debug=%#x", v9ses->debug); if (!uid_eq(v9ses->dfltuid, V9FS_DEFUID)) seq_printf(m, ",dfltuid=%u", from_kuid_munged(&init_user_ns, v9ses->dfltuid)); if (!gid_eq(v9ses->dfltgid, V9FS_DEFGID)) seq_printf(m, ",dfltgid=%u", from_kgid_munged(&init_user_ns, v9ses->dfltgid)); if (v9ses->afid != ~0) seq_printf(m, ",afid=%u", v9ses->afid); if (strcmp(v9ses->uname, V9FS_DEFUSER) != 0) seq_printf(m, ",uname=%s", v9ses->uname); if (strcmp(v9ses->aname, V9FS_DEFANAME) != 0) seq_printf(m, ",aname=%s", v9ses->aname); if (v9ses->nodev) seq_puts(m, ",nodevmap"); if (v9ses->cache) seq_printf(m, ",cache=%#x", v9ses->cache); #ifdef CONFIG_9P_FSCACHE if (v9ses->cachetag && (v9ses->cache & CACHE_FSCACHE)) seq_printf(m, ",cachetag=%s", v9ses->cachetag); #endif switch (v9ses->flags & V9FS_ACCESS_MASK) { case V9FS_ACCESS_USER: seq_puts(m, ",access=user"); break; case V9FS_ACCESS_ANY: seq_puts(m, ",access=any"); break; case V9FS_ACCESS_CLIENT: seq_puts(m, ",access=client"); break; case V9FS_ACCESS_SINGLE: seq_printf(m, ",access=%u", from_kuid_munged(&init_user_ns, v9ses->uid)); break; } if (v9ses->flags & V9FS_IGNORE_QV) seq_puts(m, ",ignoreqv"); if (v9ses->flags & V9FS_DIRECT_IO) seq_puts(m, ",directio"); if (v9ses->flags & V9FS_POSIX_ACL) seq_puts(m, ",posixacl"); if (v9ses->flags & V9FS_NO_XATTR) seq_puts(m, ",noxattr"); return p9_show_client_options(m, v9ses->clnt); } /** * v9fs_parse_param - parse a mount option into the filesystem context * @fc: the filesystem context * @param: the parameter to parse * * Return 0 upon success, -ERRNO upon failure. */ int v9fs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct v9fs_context *ctx = fc->fs_private; struct fs_parse_result result; char *s; int r; int opt; struct p9_client_opts *clnt = &ctx->client_opts; struct p9_fd_opts *fd_opts = &ctx->fd_opts; struct p9_rdma_opts *rdma_opts = &ctx->rdma_opts; struct p9_session_opts *session_opts = &ctx->session_opts; opt = fs_parse(fc, v9fs_param_spec, param, &result); if (opt < 0) { /* * We might like to report bad mount options here, but * traditionally 9p has ignored unknown mount options */ if (opt == -ENOPARAM) return 0; return opt; } switch (opt) { case Opt_source: if (fc->source) { pr_info("p9: multiple sources not supported\n"); return -EINVAL; } fc->source = param->string; param->string = NULL; break; case Opt_debug: session_opts->debug = result.uint_32; #ifdef CONFIG_NET_9P_DEBUG p9_debug_level = result.uint_32; #endif break; case Opt_dfltuid: session_opts->dfltuid = result.uid; break; case Opt_dfltgid: session_opts->dfltgid = result.gid; break; case Opt_afid: session_opts->afid = result.uint_32; break; case Opt_uname: kfree(session_opts->uname); session_opts->uname = param->string; param->string = NULL; break; case Opt_remotename: kfree(session_opts->aname); session_opts->aname = param->string; param->string = NULL; break; case Opt_nodevmap: session_opts->nodev = 1; break; case Opt_noxattr: session_opts->flags |= V9FS_NO_XATTR; break; case Opt_directio: session_opts->flags |= V9FS_DIRECT_IO; break; case Opt_ignoreqv: session_opts->flags |= V9FS_IGNORE_QV; break; case Opt_cachetag: #ifdef CONFIG_9P_FSCACHE kfree(session_opts->cachetag); session_opts->cachetag = param->string; param->string = NULL; #endif break; case Opt_cache: r = get_cache_mode(param->string); if (r < 0) return r; session_opts->cache = r; break; case Opt_access: s = 
param->string; session_opts->flags &= ~V9FS_ACCESS_MASK; if (strcmp(s, "user") == 0) { session_opts->flags |= V9FS_ACCESS_USER; } else if (strcmp(s, "any") == 0) { session_opts->flags |= V9FS_ACCESS_ANY; } else if (strcmp(s, "client") == 0) { session_opts->flags |= V9FS_ACCESS_CLIENT; } else { uid_t uid; session_opts->flags |= V9FS_ACCESS_SINGLE; r = kstrtouint(s, 10, &uid); if (r) { pr_info("Unknown access argument %s: %d\n", param->string, r); return r; } session_opts->uid = make_kuid(current_user_ns(), uid); if (!uid_valid(session_opts->uid)) { pr_info("Unknown uid %s\n", s); return -EINVAL; } } break; case Opt_posixacl: #ifdef CONFIG_9P_FS_POSIX_ACL session_opts->flags |= V9FS_POSIX_ACL; #else p9_debug(P9_DEBUG_ERROR, "Not defined CONFIG_9P_FS_POSIX_ACL. Ignoring posixacl option\n"); #endif break; case Opt_locktimeout: if (result.uint_32 < 1) { p9_debug(P9_DEBUG_ERROR, "locktimeout must be a greater than zero integer.\n"); return -EINVAL; } session_opts->session_lock_timeout = (long)result.uint_32 * HZ; break; /* Options for client */ case Opt_msize: if (result.uint_32 < 4096) { p9_debug(P9_DEBUG_ERROR, "msize should be at least 4k\n"); return -EINVAL; } if (result.uint_32 > INT_MAX) { p9_debug(P9_DEBUG_ERROR, "msize too big\n"); return -EINVAL; } clnt->msize = result.uint_32; break; case Opt_trans: v9fs_put_trans(clnt->trans_mod); clnt->trans_mod = v9fs_get_trans_by_name(param->string); if (!clnt->trans_mod) { pr_info("Could not find request transport: %s\n", param->string); return -EINVAL; } break; case Opt_legacy: clnt->proto_version = p9_proto_legacy; break; case Opt_version: clnt->proto_version = result.uint_32; p9_debug(P9_DEBUG_9P, "Protocol version: %s\n", param->string); break; /* Options for fd transport */ case Opt_rfdno: fd_opts->rfd = result.uint_32; break; case Opt_wfdno: fd_opts->wfd = result.uint_32; break; /* Options for rdma transport */ case Opt_sq_depth: rdma_opts->sq_depth = result.uint_32; break; case Opt_rq_depth: rdma_opts->rq_depth = result.uint_32; break; case Opt_timeout: rdma_opts->timeout = result.uint_32; break; /* Options for both fd and rdma transports */ case Opt_port: fd_opts->port = result.uint_32; rdma_opts->port = result.uint_32; break; case Opt_privport: fd_opts->privport = true; rdma_opts->port = true; break; } return 0; } static void v9fs_apply_options(struct v9fs_session_info *v9ses, struct fs_context *fc) { struct v9fs_context *ctx = fc->fs_private; v9ses->debug = ctx->session_opts.debug; v9ses->dfltuid = ctx->session_opts.dfltuid; v9ses->dfltgid = ctx->session_opts.dfltgid; v9ses->afid = ctx->session_opts.afid; v9ses->uname = ctx->session_opts.uname; ctx->session_opts.uname = NULL; v9ses->aname = ctx->session_opts.aname; ctx->session_opts.aname = NULL; v9ses->nodev = ctx->session_opts.nodev; /* * Note that we must |= flags here as session_init already * set basic flags. This adds in flags from parsed options. 
*/ v9ses->flags |= ctx->session_opts.flags; #ifdef CONFIG_9P_FSCACHE v9ses->cachetag = ctx->session_opts.cachetag; ctx->session_opts.cachetag = NULL; #endif v9ses->cache = ctx->session_opts.cache; v9ses->uid = ctx->session_opts.uid; v9ses->session_lock_timeout = ctx->session_opts.session_lock_timeout; } /** * v9fs_session_init - initialize session * @v9ses: session information structure * @fc: the filesystem mount context * */ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, struct fs_context *fc) { struct p9_fid *fid; int rc = -ENOMEM; init_rwsem(&v9ses->rename_sem); v9ses->clnt = p9_client_create(fc); if (IS_ERR(v9ses->clnt)) { rc = PTR_ERR(v9ses->clnt); p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n"); goto err_names; } /* * Initialize flags on the real v9ses. v9fs_apply_options below * will |= the additional flags from parsed options. */ v9ses->flags = V9FS_ACCESS_USER; if (p9_is_proto_dotl(v9ses->clnt)) { v9ses->flags = V9FS_ACCESS_CLIENT; v9ses->flags |= V9FS_PROTO_2000L; } else if (p9_is_proto_dotu(v9ses->clnt)) { v9ses->flags |= V9FS_PROTO_2000U; } v9fs_apply_options(v9ses, fc); v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; if (!v9fs_proto_dotl(v9ses) && ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) { /* * We support ACCESS_CLIENT only for dotl. * Fall back to ACCESS_USER */ v9ses->flags &= ~V9FS_ACCESS_MASK; v9ses->flags |= V9FS_ACCESS_USER; } /* FIXME: for legacy mode, fall back to V9FS_ACCESS_ANY */ if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) && ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) { v9ses->flags &= ~V9FS_ACCESS_MASK; v9ses->flags |= V9FS_ACCESS_ANY; v9ses->uid = INVALID_UID; } if (!v9fs_proto_dotl(v9ses) || !((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) { /* * We support ACL checks on client only if the protocol is * 9P2000.L and access is V9FS_ACCESS_CLIENT. */ v9ses->flags &= ~V9FS_ACL_MASK; } fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, INVALID_UID, v9ses->aname); if (IS_ERR(fid)) { rc = PTR_ERR(fid); p9_debug(P9_DEBUG_ERROR, "cannot attach\n"); goto err_clnt; } if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_SINGLE) fid->uid = v9ses->uid; else fid->uid = INVALID_UID; #ifdef CONFIG_9P_FSCACHE /* register the session for caching */ if (v9ses->cache & CACHE_FSCACHE) { rc = v9fs_cache_session_get_cookie(v9ses, fc->source); if (rc < 0) goto err_clnt; } #endif spin_lock(&v9fs_sessionlist_lock); list_add(&v9ses->slist, &v9fs_sessionlist); spin_unlock(&v9fs_sessionlist_lock); return fid; err_clnt: #ifdef CONFIG_9P_FSCACHE kfree(v9ses->cachetag); #endif p9_client_destroy(v9ses->clnt); err_names: kfree(v9ses->uname); kfree(v9ses->aname); return ERR_PTR(rc); } /** * v9fs_session_close - shutdown a session * @v9ses: session information structure * */ void v9fs_session_close(struct v9fs_session_info *v9ses) { if (v9ses->clnt) { p9_client_destroy(v9ses->clnt); v9ses->clnt = NULL; } #ifdef CONFIG_9P_FSCACHE fscache_relinquish_volume(v9fs_session_cache(v9ses), NULL, false); kfree(v9ses->cachetag); #endif kfree(v9ses->uname); kfree(v9ses->aname); spin_lock(&v9fs_sessionlist_lock); list_del(&v9ses->slist); spin_unlock(&v9fs_sessionlist_lock); } /** * v9fs_session_cancel - terminate a session * @v9ses: session to terminate * * mark transport as disconnected and cancel all pending requests. 
*/ void v9fs_session_cancel(struct v9fs_session_info *v9ses) { p9_debug(P9_DEBUG_ERROR, "cancel session %p\n", v9ses); p9_client_disconnect(v9ses->clnt); } /** * v9fs_session_begin_cancel - Begin terminate of a session * @v9ses: session to terminate * * After this call we don't allow any request other than clunk. */ void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses) { p9_debug(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses); p9_client_begin_disconnect(v9ses->clnt); } static struct kobject *v9fs_kobj; #ifdef CONFIG_9P_FSCACHE /* * List caches associated with a session */ static ssize_t caches_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t n = 0, count = 0, limit = PAGE_SIZE; struct v9fs_session_info *v9ses; spin_lock(&v9fs_sessionlist_lock); list_for_each_entry(v9ses, &v9fs_sessionlist, slist) { if (v9ses->cachetag) { n = snprintf(buf + count, limit, "%s\n", v9ses->cachetag); if (n < 0) { count = n; break; } count += n; limit -= n; } } spin_unlock(&v9fs_sessionlist_lock); return count; } static struct kobj_attribute v9fs_attr_cache = __ATTR_RO(caches); #endif /* CONFIG_9P_FSCACHE */ static struct attribute *v9fs_attrs[] = { #ifdef CONFIG_9P_FSCACHE &v9fs_attr_cache.attr, #endif NULL, }; static const struct attribute_group v9fs_attr_group = { .attrs = v9fs_attrs, }; /** * v9fs_sysfs_init - Initialize the v9fs sysfs interface * */ static int __init v9fs_sysfs_init(void) { int ret; v9fs_kobj = kobject_create_and_add("9p", fs_kobj); if (!v9fs_kobj) return -ENOMEM; ret = sysfs_create_group(v9fs_kobj, &v9fs_attr_group); if (ret) { kobject_put(v9fs_kobj); return ret; } return 0; } /** * v9fs_sysfs_cleanup - Unregister the v9fs sysfs interface * */ static void v9fs_sysfs_cleanup(void) { sysfs_remove_group(v9fs_kobj, &v9fs_attr_group); kobject_put(v9fs_kobj); } static void v9fs_inode_init_once(void *foo) { struct v9fs_inode *v9inode = (struct v9fs_inode *)foo; memset(&v9inode->qid, 0, sizeof(v9inode->qid)); inode_init_once(&v9inode->netfs.inode); } /** * v9fs_init_inode_cache - initialize a cache for 9P * Returns 0 on success. */ static int v9fs_init_inode_cache(void) { v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache", sizeof(struct v9fs_inode), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_ACCOUNT), v9fs_inode_init_once); if (!v9fs_inode_cache) return -ENOMEM; return 0; } /** * v9fs_destroy_inode_cache - destroy the cache of 9P inode * */ static void v9fs_destroy_inode_cache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. 
	 */
	rcu_barrier();
	kmem_cache_destroy(v9fs_inode_cache);
}

/**
 * init_v9fs - Initialize module
 *
 */
static int __init init_v9fs(void)
{
	int err;

	pr_info("Installing v9fs 9p2000 file system support\n");
	/* TODO: Setup list of registered transport modules */

	err = v9fs_init_inode_cache();
	if (err < 0) {
		pr_err("Failed to register v9fs for caching\n");
		return err;
	}

	err = v9fs_sysfs_init();
	if (err < 0) {
		pr_err("Failed to register with sysfs\n");
		goto out_cache;
	}

	err = register_filesystem(&v9fs_fs_type);
	if (err < 0) {
		pr_err("Failed to register filesystem\n");
		goto out_sysfs_cleanup;
	}

	return 0;

out_sysfs_cleanup:
	v9fs_sysfs_cleanup();

out_cache:
	v9fs_destroy_inode_cache();

	return err;
}

/**
 * exit_v9fs - shutdown module
 *
 */
static void __exit exit_v9fs(void)
{
	v9fs_sysfs_cleanup();
	v9fs_destroy_inode_cache();
	unregister_filesystem(&v9fs_fs_type);
}

module_init(init_v9fs)
module_exit(exit_v9fs)

MODULE_AUTHOR("Latchesar Ionkov <lucho@ionkov.net>");
MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>");
MODULE_AUTHOR("Ron Minnich <rminnich@lanl.gov>");
MODULE_DESCRIPTION("9P Client File System");
MODULE_LICENSE("GPL");
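/*
 * Illustrative sketch (not part of fs/9p/v9fs.c): the same kobject +
 * attribute_group pattern used by v9fs_sysfs_init() and caches_show()
 * above, reduced to a minimal standalone module.  All "example_*" names
 * are hypothetical and exist only for this sketch; the sysfs calls
 * (kobject_create_and_add(), sysfs_create_group(), sysfs_emit()) are the
 * standard kernel APIs exercised by the v9fs code above.
 */
#include <linux/fs.h>
#include <linux/kobject.h>
#include <linux/module.h>
#include <linux/sysfs.h>

static struct kobject *example_kobj;

/* Read-only attribute: "cat /sys/fs/example/status" would return "ok". */
static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,
			   char *buf)
{
	return sysfs_emit(buf, "ok\n");
}
static struct kobj_attribute example_attr_status = __ATTR_RO(status);

static struct attribute *example_attrs[] = {
	&example_attr_status.attr,
	NULL,
};
static const struct attribute_group example_attr_group = {
	.attrs = example_attrs,
};

static int __init example_sysfs_init(void)
{
	int ret;

	/* Creates /sys/fs/example, mirroring the "9p" directory above. */
	example_kobj = kobject_create_and_add("example", fs_kobj);
	if (!example_kobj)
		return -ENOMEM;

	ret = sysfs_create_group(example_kobj, &example_attr_group);
	if (ret)
		kobject_put(example_kobj);
	return ret;
}

static void __exit example_sysfs_exit(void)
{
	sysfs_remove_group(example_kobj, &example_attr_group);
	kobject_put(example_kobj);
}

module_init(example_sysfs_init);
module_exit(example_sysfs_exit);
MODULE_LICENSE("GPL");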
// SPDX-License-Identifier: GPL-2.0
/*
 * device.h - generic, centralized driver model
 *
 * Copyright (c) 2001-2003 Patrick Mochel <mochel@osdl.org>
 * Copyright (c) 2004-2009 Greg Kroah-Hartman <gregkh@suse.de>
 * Copyright (c) 2008-2009 Novell Inc.
 *
 * See Documentation/driver-api/driver-model/ for more information.
*/ #ifndef _DEVICE_H_ #define _DEVICE_H_ #include <linux/dev_printk.h> #include <linux/energy_model.h> #include <linux/ioport.h> #include <linux/kobject.h> #include <linux/klist.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/compiler.h> #include <linux/types.h> #include <linux/mutex.h> #include <linux/pm.h> #include <linux/atomic.h> #include <linux/uidgid.h> #include <linux/gfp.h> #include <linux/device/bus.h> #include <linux/device/class.h> #include <linux/device/devres.h> #include <linux/device/driver.h> #include <linux/cleanup.h> #include <asm/device.h> struct device; struct device_private; struct device_driver; struct driver_private; struct module; struct class; struct subsys_private; struct device_node; struct fwnode_handle; struct iommu_group; struct dev_pin_info; struct dev_iommu; struct msi_device_data; /** * struct subsys_interface - interfaces to device functions * @name: name of the device function * @subsys: subsystem of the devices to attach to * @node: the list of functions registered at the subsystem * @add_dev: device hookup to device function handler * @remove_dev: device hookup to device function handler * * Simple interfaces attached to a subsystem. Multiple interfaces can * attach to a subsystem and its devices. Unlike drivers, they do not * exclusively claim or control devices. Interfaces usually represent * a specific functionality of a subsystem/class of devices. */ struct subsys_interface { const char *name; const struct bus_type *subsys; struct list_head node; int (*add_dev)(struct device *dev, struct subsys_interface *sif); void (*remove_dev)(struct device *dev, struct subsys_interface *sif); }; int subsys_interface_register(struct subsys_interface *sif); void subsys_interface_unregister(struct subsys_interface *sif); int subsys_system_register(const struct bus_type *subsys, const struct attribute_group **groups); int subsys_virtual_register(const struct bus_type *subsys, const struct attribute_group **groups); /* * The type of device, "struct device" is embedded in. A class * or bus can contain devices of different types * like "partitions" and "disks", "mouse" and "event". * This identifies the device type and carries type-specific * information, equivalent to the kobj_type of a kobject. * If "name" is specified, the uevent will contain it in * the DEVTYPE variable. */ struct device_type { const char *name; const struct attribute_group **groups; int (*uevent)(const struct device *dev, struct kobj_uevent_env *env); char *(*devnode)(const struct device *dev, umode_t *mode, kuid_t *uid, kgid_t *gid); void (*release)(struct device *dev); const struct dev_pm_ops *pm; }; /** * struct device_attribute - Interface for exporting device attributes. * @attr: sysfs attribute definition. * @show: Show handler. * @store: Store handler. */ struct device_attribute { struct attribute attr; ssize_t (*show)(struct device *dev, struct device_attribute *attr, char *buf); ssize_t (*store)(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); }; /** * struct dev_ext_attribute - Exported device attribute with extra context. * @attr: Exported device attribute. * @var: Pointer to context. 
*/ struct dev_ext_attribute { struct device_attribute attr; void *var; }; ssize_t device_show_ulong(struct device *dev, struct device_attribute *attr, char *buf); ssize_t device_store_ulong(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); ssize_t device_show_int(struct device *dev, struct device_attribute *attr, char *buf); ssize_t device_store_int(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); ssize_t device_show_bool(struct device *dev, struct device_attribute *attr, char *buf); ssize_t device_store_bool(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); ssize_t device_show_string(struct device *dev, struct device_attribute *attr, char *buf); /** * DEVICE_ATTR - Define a device attribute. * @_name: Attribute name. * @_mode: File mode. * @_show: Show handler. Optional, but mandatory if attribute is readable. * @_store: Store handler. Optional, but mandatory if attribute is writable. * * Convenience macro for defining a struct device_attribute. * * For example, ``DEVICE_ATTR(foo, 0644, foo_show, foo_store);`` expands to: * * .. code-block:: c * * struct device_attribute dev_attr_foo = { * .attr = { .name = "foo", .mode = 0644 }, * .show = foo_show, * .store = foo_store, * }; */ #define DEVICE_ATTR(_name, _mode, _show, _store) \ struct device_attribute dev_attr_##_name = __ATTR(_name, _mode, _show, _store) /** * DEVICE_ATTR_PREALLOC - Define a preallocated device attribute. * @_name: Attribute name. * @_mode: File mode. * @_show: Show handler. Optional, but mandatory if attribute is readable. * @_store: Store handler. Optional, but mandatory if attribute is writable. * * Like DEVICE_ATTR(), but ``SYSFS_PREALLOC`` is set on @_mode. */ #define DEVICE_ATTR_PREALLOC(_name, _mode, _show, _store) \ struct device_attribute dev_attr_##_name = \ __ATTR_PREALLOC(_name, _mode, _show, _store) /** * DEVICE_ATTR_RW - Define a read-write device attribute. * @_name: Attribute name. * * Like DEVICE_ATTR(), but @_mode is 0644, @_show is <_name>_show, * and @_store is <_name>_store. */ #define DEVICE_ATTR_RW(_name) \ struct device_attribute dev_attr_##_name = __ATTR_RW(_name) /** * DEVICE_ATTR_ADMIN_RW - Define an admin-only read-write device attribute. * @_name: Attribute name. * * Like DEVICE_ATTR_RW(), but @_mode is 0600. */ #define DEVICE_ATTR_ADMIN_RW(_name) \ struct device_attribute dev_attr_##_name = __ATTR_RW_MODE(_name, 0600) /** * DEVICE_ATTR_RW_NAMED - Define a read-write device attribute with a sysfs name * that differs from the function name. * @_name: Attribute function preface * @_attrname: Attribute name as it wil be exposed in the sysfs. * * Like DEVICE_ATTR_RW(), but allows for reusing names under separate paths in * the same driver. */ #define DEVICE_ATTR_RW_NAMED(_name, _attrname) \ struct device_attribute dev_attr_##_name = { \ .attr = { .name = _attrname, .mode = 0644 }, \ .show = _name##_show, \ .store = _name##_store, \ } /** * DEVICE_ATTR_RO - Define a readable device attribute. * @_name: Attribute name. * * Like DEVICE_ATTR(), but @_mode is 0444 and @_show is <_name>_show. */ #define DEVICE_ATTR_RO(_name) \ struct device_attribute dev_attr_##_name = __ATTR_RO(_name) /** * DEVICE_ATTR_ADMIN_RO - Define an admin-only readable device attribute. * @_name: Attribute name. * * Like DEVICE_ATTR_RO(), but @_mode is 0400. 
*/ #define DEVICE_ATTR_ADMIN_RO(_name) \ struct device_attribute dev_attr_##_name = __ATTR_RO_MODE(_name, 0400) /** * DEVICE_ATTR_RO_NAMED - Define a read-only device attribute with a sysfs name * that differs from the function name. * @_name: Attribute function preface * @_attrname: Attribute name as it wil be exposed in the sysfs. * * Like DEVICE_ATTR_RO(), but allows for reusing names under separate paths in * the same driver. */ #define DEVICE_ATTR_RO_NAMED(_name, _attrname) \ struct device_attribute dev_attr_##_name = { \ .attr = { .name = _attrname, .mode = 0444 }, \ .show = _name##_show, \ } /** * DEVICE_ATTR_WO - Define an admin-only writable device attribute. * @_name: Attribute name. * * Like DEVICE_ATTR(), but @_mode is 0200 and @_store is <_name>_store. */ #define DEVICE_ATTR_WO(_name) \ struct device_attribute dev_attr_##_name = __ATTR_WO(_name) /** * DEVICE_ATTR_WO_NAMED - Define a read-only device attribute with a sysfs name * that differs from the function name. * @_name: Attribute function preface * @_attrname: Attribute name as it wil be exposed in the sysfs. * * Like DEVICE_ATTR_WO(), but allows for reusing names under separate paths in * the same driver. */ #define DEVICE_ATTR_WO_NAMED(_name, _attrname) \ struct device_attribute dev_attr_##_name = { \ .attr = { .name = _attrname, .mode = 0200 }, \ .store = _name##_store, \ } /** * DEVICE_ULONG_ATTR - Define a device attribute backed by an unsigned long. * @_name: Attribute name. * @_mode: File mode. * @_var: Identifier of unsigned long. * * Like DEVICE_ATTR(), but @_show and @_store are automatically provided * such that reads and writes to the attribute from userspace affect @_var. */ #define DEVICE_ULONG_ATTR(_name, _mode, _var) \ struct dev_ext_attribute dev_attr_##_name = \ { __ATTR(_name, _mode, device_show_ulong, device_store_ulong), &(_var) } /** * DEVICE_INT_ATTR - Define a device attribute backed by an int. * @_name: Attribute name. * @_mode: File mode. * @_var: Identifier of int. * * Like DEVICE_ULONG_ATTR(), but @_var is an int. */ #define DEVICE_INT_ATTR(_name, _mode, _var) \ struct dev_ext_attribute dev_attr_##_name = \ { __ATTR(_name, _mode, device_show_int, device_store_int), &(_var) } /** * DEVICE_BOOL_ATTR - Define a device attribute backed by a bool. * @_name: Attribute name. * @_mode: File mode. * @_var: Identifier of bool. * * Like DEVICE_ULONG_ATTR(), but @_var is a bool. */ #define DEVICE_BOOL_ATTR(_name, _mode, _var) \ struct dev_ext_attribute dev_attr_##_name = \ { __ATTR(_name, _mode, device_show_bool, device_store_bool), &(_var) } /** * DEVICE_STRING_ATTR_RO - Define a device attribute backed by a r/o string. * @_name: Attribute name. * @_mode: File mode. * @_var: Identifier of string. * * Like DEVICE_ULONG_ATTR(), but @_var is a string. Because the length of the * string allocation is unknown, the attribute must be read-only. 
*/ #define DEVICE_STRING_ATTR_RO(_name, _mode, _var) \ struct dev_ext_attribute dev_attr_##_name = \ { __ATTR(_name, (_mode) & ~0222, device_show_string, NULL), (_var) } #define DEVICE_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) \ struct device_attribute dev_attr_##_name = \ __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) int device_create_file(struct device *device, const struct device_attribute *entry); void device_remove_file(struct device *dev, const struct device_attribute *attr); bool device_remove_file_self(struct device *dev, const struct device_attribute *attr); int __must_check device_create_bin_file(struct device *dev, const struct bin_attribute *attr); void device_remove_bin_file(struct device *dev, const struct bin_attribute *attr); struct device_dma_parameters { /* * a low level driver may set these to teach IOMMU code about * sg limitations. */ unsigned int max_segment_size; unsigned int min_align_mask; unsigned long segment_boundary_mask; }; /** * enum device_link_state - Device link states. * @DL_STATE_NONE: The presence of the drivers is not being tracked. * @DL_STATE_DORMANT: None of the supplier/consumer drivers is present. * @DL_STATE_AVAILABLE: The supplier driver is present, but the consumer is not. * @DL_STATE_CONSUMER_PROBE: The consumer is probing (supplier driver present). * @DL_STATE_ACTIVE: Both the supplier and consumer drivers are present. * @DL_STATE_SUPPLIER_UNBIND: The supplier driver is unbinding. */ enum device_link_state { DL_STATE_NONE = -1, DL_STATE_DORMANT = 0, DL_STATE_AVAILABLE, DL_STATE_CONSUMER_PROBE, DL_STATE_ACTIVE, DL_STATE_SUPPLIER_UNBIND, }; /* * Device link flags. * * STATELESS: The core will not remove this link automatically. * AUTOREMOVE_CONSUMER: Remove the link automatically on consumer driver unbind. * PM_RUNTIME: If set, the runtime PM framework will use this link. * RPM_ACTIVE: Run pm_runtime_get_sync() on the supplier during link creation. * AUTOREMOVE_SUPPLIER: Remove the link automatically on supplier driver unbind. * AUTOPROBE_CONSUMER: Probe consumer driver automatically after supplier binds. * MANAGED: The core tracks presence of supplier/consumer drivers (internal). * SYNC_STATE_ONLY: Link only affects sync_state() behavior. * INFERRED: Inferred from data (eg: firmware) and not from driver actions. */ #define DL_FLAG_STATELESS BIT(0) #define DL_FLAG_AUTOREMOVE_CONSUMER BIT(1) #define DL_FLAG_PM_RUNTIME BIT(2) #define DL_FLAG_RPM_ACTIVE BIT(3) #define DL_FLAG_AUTOREMOVE_SUPPLIER BIT(4) #define DL_FLAG_AUTOPROBE_CONSUMER BIT(5) #define DL_FLAG_MANAGED BIT(6) #define DL_FLAG_SYNC_STATE_ONLY BIT(7) #define DL_FLAG_INFERRED BIT(8) #define DL_FLAG_CYCLE BIT(9) /** * enum dl_dev_state - Device driver presence tracking information. * @DL_DEV_NO_DRIVER: There is no driver attached to the device. * @DL_DEV_PROBING: A driver is probing. * @DL_DEV_DRIVER_BOUND: The driver has been bound to the device. * @DL_DEV_UNBINDING: The driver is unbinding from the device. */ enum dl_dev_state { DL_DEV_NO_DRIVER = 0, DL_DEV_PROBING, DL_DEV_DRIVER_BOUND, DL_DEV_UNBINDING, }; /** * enum device_removable - Whether the device is removable. The criteria for a * device to be classified as removable is determined by its subsystem or bus. * @DEVICE_REMOVABLE_NOT_SUPPORTED: This attribute is not supported for this * device (default). * @DEVICE_REMOVABLE_UNKNOWN: Device location is Unknown. * @DEVICE_FIXED: Device is not removable by the user. * @DEVICE_REMOVABLE: Device is removable by the user. 
*/ enum device_removable { DEVICE_REMOVABLE_NOT_SUPPORTED = 0, /* must be 0 */ DEVICE_REMOVABLE_UNKNOWN, DEVICE_FIXED, DEVICE_REMOVABLE, }; /** * struct dev_links_info - Device data related to device links. * @suppliers: List of links to supplier devices. * @consumers: List of links to consumer devices. * @defer_sync: Hook to global list of devices that have deferred sync_state. * @status: Driver status information. */ struct dev_links_info { struct list_head suppliers; struct list_head consumers; struct list_head defer_sync; enum dl_dev_state status; }; /** * struct dev_msi_info - Device data related to MSI * @domain: The MSI interrupt domain associated to the device * @data: Pointer to MSI device data */ struct dev_msi_info { #ifdef CONFIG_GENERIC_MSI_IRQ struct irq_domain *domain; struct msi_device_data *data; #endif }; /** * enum device_physical_location_panel - Describes which panel surface of the * system's housing the device connection point resides on. * @DEVICE_PANEL_TOP: Device connection point is on the top panel. * @DEVICE_PANEL_BOTTOM: Device connection point is on the bottom panel. * @DEVICE_PANEL_LEFT: Device connection point is on the left panel. * @DEVICE_PANEL_RIGHT: Device connection point is on the right panel. * @DEVICE_PANEL_FRONT: Device connection point is on the front panel. * @DEVICE_PANEL_BACK: Device connection point is on the back panel. * @DEVICE_PANEL_UNKNOWN: The panel with device connection point is unknown. */ enum device_physical_location_panel { DEVICE_PANEL_TOP, DEVICE_PANEL_BOTTOM, DEVICE_PANEL_LEFT, DEVICE_PANEL_RIGHT, DEVICE_PANEL_FRONT, DEVICE_PANEL_BACK, DEVICE_PANEL_UNKNOWN, }; /** * enum device_physical_location_vertical_position - Describes vertical * position of the device connection point on the panel surface. * @DEVICE_VERT_POS_UPPER: Device connection point is at upper part of panel. * @DEVICE_VERT_POS_CENTER: Device connection point is at center part of panel. * @DEVICE_VERT_POS_LOWER: Device connection point is at lower part of panel. */ enum device_physical_location_vertical_position { DEVICE_VERT_POS_UPPER, DEVICE_VERT_POS_CENTER, DEVICE_VERT_POS_LOWER, }; /** * enum device_physical_location_horizontal_position - Describes horizontal * position of the device connection point on the panel surface. * @DEVICE_HORI_POS_LEFT: Device connection point is at left part of panel. * @DEVICE_HORI_POS_CENTER: Device connection point is at center part of panel. * @DEVICE_HORI_POS_RIGHT: Device connection point is at right part of panel. */ enum device_physical_location_horizontal_position { DEVICE_HORI_POS_LEFT, DEVICE_HORI_POS_CENTER, DEVICE_HORI_POS_RIGHT, }; /** * struct device_physical_location - Device data related to physical location * of the device connection point. * @panel: Panel surface of the system's housing that the device connection * point resides on. * @vertical_position: Vertical position of the device connection point within * the panel. * @horizontal_position: Horizontal position of the device connection point * within the panel. * @dock: Set if the device connection point resides in a docking station or * port replicator. * @lid: Set if this device connection point resides on the lid of laptop * system. 
*/ struct device_physical_location { enum device_physical_location_panel panel; enum device_physical_location_vertical_position vertical_position; enum device_physical_location_horizontal_position horizontal_position; bool dock; bool lid; }; /** * enum struct_device_flags - Flags in struct device * * Each flag should have a set of accessor functions created via * __create_dev_flag_accessors() for each access. * * @DEV_FLAG_READY_TO_PROBE: If set then device_add() has finished enough * initialization that probe could be called. * @DEV_FLAG_COUNT: Number of defined struct_device_flags. */ enum struct_device_flags { DEV_FLAG_READY_TO_PROBE = 0, DEV_FLAG_COUNT }; /** * struct device - The basic device structure * @parent: The device's "parent" device, the device to which it is attached. * In most cases, a parent device is some sort of bus or host * controller. If parent is NULL, the device, is a top-level device, * which is not usually what you want. * @p: Holds the private data of the driver core portions of the device. * See the comment of the struct device_private for detail. * @kobj: A top-level, abstract class from which other classes are derived. * @init_name: Initial name of the device. * @type: The type of device. * This identifies the device type and carries type-specific * information. * @mutex: Mutex to synchronize calls to its driver. * @bus: Type of bus device is on. * @driver: Which driver has allocated this * @platform_data: Platform data specific to the device. * Example: For devices on custom boards, as typical of embedded * and SOC based hardware, Linux often uses platform_data to point * to board-specific structures describing devices and how they * are wired. That can include what ports are available, chip * variants, which GPIO pins act in what additional roles, and so * on. This shrinks the "Board Support Packages" (BSPs) and * minimizes board-specific #ifdefs in drivers. * @driver_data: Private pointer for driver specific info. * @driver_override: Driver name to force a match. Do not touch directly; use * device_set_driver_override() instead. * @links: Links to suppliers and consumers of this device. * @power: For device power management. * See Documentation/driver-api/pm/devices.rst for details. * @pm_domain: Provide callbacks that are executed during system suspend, * hibernation, system resume and during runtime PM transitions * along with subsystem-level and driver-level callbacks. * @em_pd: device's energy model performance domain * @pins: For device pin management. * See Documentation/driver-api/pin-control.rst for details. * @msi: MSI related data * @numa_node: NUMA node this device is close to. * @dma_ops: DMA mapping operations for this device. * @dma_mask: Dma mask (if dma'ble device). * @coherent_dma_mask: Like dma_mask, but for alloc_coherent mapping as not all * hardware supports 64-bit addresses for consistent allocations * such descriptors. * @bus_dma_limit: Limit of an upstream bridge or bus which imposes a smaller * DMA limit than the device itself supports. * @dma_range_map: map for DMA memory ranges relative to that of RAM * @dma_parms: A low level driver may set these to teach IOMMU code about * segment limitations. * @dma_pools: Dma pools (if dma'ble device). * @dma_mem: Internal for coherent mem override. * @cma_area: Contiguous memory area for dma allocations * @dma_io_tlb_mem: Software IO TLB allocator. Not for driver use. * @dma_io_tlb_pools: List of transient swiotlb memory pools. * @dma_io_tlb_lock: Protects changes to the list of active pools. 
* @dma_uses_io_tlb: %true if device has used the software IO TLB. * @archdata: For arch-specific additions. * @of_node: Associated device tree node. * @fwnode: Associated device node supplied by platform firmware. * @devt: For creating the sysfs "dev". * @id: device instance * @devres_lock: Spinlock to protect the resource of the device. * @devres_head: The resources list of the device. * @class: The class of the device. * @groups: Optional attribute groups. * @release: Callback to free the device after all references have * gone away. This should be set by the allocator of the * device (i.e. the bus driver that discovered the device). * @iommu_group: IOMMU group the device belongs to. * @iommu: Per device generic IOMMU runtime data * @physical_location: Describes physical location of the device connection * point in the system housing. * @removable: Whether the device can be removed from the system. This * should be set by the subsystem / bus driver that discovered * the device. * * @offline_disabled: If set, the device is permanently online. * @offline: Set after successful invocation of bus type's .offline(). * @of_node_reused: Set if the device-tree node is shared with an ancestor * device. * @state_synced: The hardware state of this device has been synced to match * the software state of this device by calling the driver/bus * sync_state() callback. * @can_match: The device has matched with a driver at least once or it is in * a bus (like AMBA) which can't check for matching drivers until * other devices probe successfully. * @dma_coherent: this particular device is dma coherent, even if the * architecture supports non-coherent devices. * @dma_ops_bypass: If set to %true then the dma_ops are bypassed for the * streaming DMA operations (->map_* / ->unmap_* / ->sync_*), * and optionall (if the coherent mask is large enough) also * for dma allocations. This flag is managed by the dma ops * instance from ->dma_supported. * @dma_skip_sync: DMA sync operations can be skipped for coherent buffers. * @dma_iommu: Device is using default IOMMU implementation for DMA and * doesn't rely on dma_ops structure. * @flags: DEV_FLAG_XXX flags. Use atomic bitfield operations to modify. * * At the lowest level, every device in a Linux system is represented by an * instance of struct device. The device structure contains the information * that the device model core needs to model the system. Most subsystems, * however, track additional information about the devices they host. As a * result, it is rare for devices to be represented by bare device structures; * instead, that structure, like kobject structures, is usually embedded within * a higher-level representation of the device. */ struct device { struct kobject kobj; struct device *parent; struct device_private *p; const char *init_name; /* initial name of the device */ const struct device_type *type; const struct bus_type *bus; /* type of bus device is on */ struct device_driver *driver; /* which driver has allocated this device */ void *platform_data; /* Platform specific data, device core doesn't touch it */ void *driver_data; /* Driver data, set and get with dev_set_drvdata/dev_get_drvdata */ struct { const char *name; spinlock_t lock; } driver_override; struct mutex mutex; /* mutex to synchronize calls to * its driver. 
*/ struct dev_links_info links; struct dev_pm_info power; struct dev_pm_domain *pm_domain; #ifdef CONFIG_ENERGY_MODEL struct em_perf_domain *em_pd; #endif #ifdef CONFIG_PINCTRL struct dev_pin_info *pins; #endif struct dev_msi_info msi; #ifdef CONFIG_ARCH_HAS_DMA_OPS const struct dma_map_ops *dma_ops; #endif u64 *dma_mask; /* dma mask (if dma'able device) */ u64 coherent_dma_mask;/* Like dma_mask, but for alloc_coherent mappings as not all hardware supports 64 bit addresses for consistent allocations such descriptors. */ u64 bus_dma_limit; /* upstream dma constraint */ const struct bus_dma_region *dma_range_map; struct device_dma_parameters *dma_parms; struct list_head dma_pools; /* dma pools (if dma'ble) */ #ifdef CONFIG_DMA_DECLARE_COHERENT struct dma_coherent_mem *dma_mem; /* internal for coherent mem override */ #endif #ifdef CONFIG_DMA_CMA struct cma *cma_area; /* contiguous memory area for dma allocations */ #endif #ifdef CONFIG_SWIOTLB struct io_tlb_mem *dma_io_tlb_mem; #endif #ifdef CONFIG_SWIOTLB_DYNAMIC struct list_head dma_io_tlb_pools; spinlock_t dma_io_tlb_lock; bool dma_uses_io_tlb; #endif /* arch specific additions */ struct dev_archdata archdata; struct device_node *of_node; /* associated device tree node */ struct fwnode_handle *fwnode; /* firmware device node */ #ifdef CONFIG_NUMA int numa_node; /* NUMA node this device is close to */ #endif dev_t devt; /* dev_t, creates the sysfs "dev" */ u32 id; /* device instance */ spinlock_t devres_lock; struct list_head devres_head; const struct class *class; const struct attribute_group **groups; /* optional groups */ void (*release)(struct device *dev); struct iommu_group *iommu_group; struct dev_iommu *iommu; struct device_physical_location *physical_location; enum device_removable removable; bool offline_disabled:1; bool offline:1; bool of_node_reused:1; bool state_synced:1; bool can_match:1; #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) bool dma_coherent:1; #endif #ifdef CONFIG_DMA_OPS_BYPASS bool dma_ops_bypass : 1; #endif #ifdef CONFIG_DMA_NEED_SYNC bool dma_skip_sync:1; #endif #ifdef CONFIG_IOMMU_DMA bool dma_iommu:1; #endif DECLARE_BITMAP(flags, DEV_FLAG_COUNT); }; #define __create_dev_flag_accessors(accessor_name, flag_name) \ static inline bool dev_##accessor_name(const struct device *dev) \ { \ return test_bit(flag_name, dev->flags); \ } \ static inline void dev_set_##accessor_name(struct device *dev) \ { \ set_bit(flag_name, dev->flags); \ } \ static inline void dev_clear_##accessor_name(struct device *dev) \ { \ clear_bit(flag_name, dev->flags); \ } \ static inline void dev_assign_##accessor_name(struct device *dev, bool value) \ { \ assign_bit(flag_name, dev->flags, value); \ } \ static inline bool dev_test_and_set_##accessor_name(struct device *dev) \ { \ return test_and_set_bit(flag_name, dev->flags); \ } __create_dev_flag_accessors(ready_to_probe, DEV_FLAG_READY_TO_PROBE); #undef __create_dev_flag_accessors /** * struct device_link - Device link representation. * @supplier: The device on the supplier end of the link. * @s_node: Hook to the supplier device's list of links to consumers. * @consumer: The device on the consumer end of the link. * @c_node: Hook to the consumer device's list of links to suppliers. * @link_dev: device used to expose link details in sysfs * @status: The state of the link (with respect to the presence of drivers). * @flags: Link flags. 
* @rpm_active: Whether or not the consumer device is runtime-PM-active. * @kref: Count repeated addition of the same link. * @rm_work: Work structure used for removing the link. * @supplier_preactivated: Supplier has been made active before consumer probe. */ struct device_link { struct device *supplier; struct list_head s_node; struct device *consumer; struct list_head c_node; struct device link_dev; enum device_link_state status; u32 flags; refcount_t rpm_active; struct kref kref; struct work_struct rm_work; bool supplier_preactivated; /* Owned by consumer probe. */ }; #define kobj_to_dev(__kobj) container_of_const(__kobj, struct device, kobj) int __device_set_driver_override(struct device *dev, const char *s, size_t len); /** * device_set_driver_override() - Helper to set or clear driver override. * @dev: Device to change * @s: NUL-terminated string, new driver name to force a match, pass empty * string to clear it ("" or "\n", where the latter is only for sysfs * interface). * * Helper to set or clear driver override of a device. * * Returns: 0 on success or a negative error code on failure. */ static inline int device_set_driver_override(struct device *dev, const char *s) { return __device_set_driver_override(dev, s, s ? strlen(s) : 0); } /** * device_has_driver_override() - Check if a driver override has been set. * @dev: device to check * * Returns true if a driver override has been set for this device. */ static inline bool device_has_driver_override(struct device *dev) { guard(spinlock)(&dev->driver_override.lock); return !!dev->driver_override.name; } /** * device_match_driver_override() - Match a driver against the device's driver_override. * @dev: device to check * @drv: driver to match against * * Returns > 0 if a driver override is set and matches the given driver, 0 if a * driver override is set but does not match, or < 0 if a driver override is not * set at all. */ static inline int device_match_driver_override(struct device *dev, const struct device_driver *drv) { guard(spinlock)(&dev->driver_override.lock); if (dev->driver_override.name) return !strcmp(dev->driver_override.name, drv->name); return -1; } /** * device_iommu_mapped - Returns true when the device DMA is translated * by an IOMMU * @dev: Device to perform the check on */ static inline bool device_iommu_mapped(struct device *dev) { return (dev->iommu_group != NULL); } /* Get the wakeup routines, which depend on struct device */ #include <linux/pm_wakeup.h> /** * dev_name - Return a device's name. * @dev: Device with name to get. * Return: The kobject name of the device, or its initial name if unavailable. */ static inline const char *dev_name(const struct device *dev) { /* Use the init name until the kobject becomes available */ if (dev->init_name) return dev->init_name; return kobject_name(&dev->kobj); } /** * dev_bus_name - Return a device's bus/class name, if at all possible * @dev: struct device to get the bus/class name of * * Will return the name of the bus/class the device is attached to. If it is * not attached to a bus/class, an empty string will be returned. */ static inline const char *dev_bus_name(const struct device *dev) { return dev->bus ? dev->bus->name : (dev->class ? 
dev->class->name : ""); } __printf(2, 3) int dev_set_name(struct device *dev, const char *name, ...); #ifdef CONFIG_NUMA static inline int dev_to_node(struct device *dev) { return dev->numa_node; } static inline void set_dev_node(struct device *dev, int node) { dev->numa_node = node; } #else static inline int dev_to_node(struct device *dev) { return NUMA_NO_NODE; } static inline void set_dev_node(struct device *dev, int node) { } #endif static inline struct irq_domain *dev_get_msi_domain(const struct device *dev) { #ifdef CONFIG_GENERIC_MSI_IRQ return dev->msi.domain; #else return NULL; #endif } static inline void dev_set_msi_domain(struct device *dev, struct irq_domain *d) { #ifdef CONFIG_GENERIC_MSI_IRQ dev->msi.domain = d; #endif } static inline void *dev_get_drvdata(const struct device *dev) { return dev->driver_data; } static inline void dev_set_drvdata(struct device *dev, void *data) { dev->driver_data = data; } static inline struct pm_subsys_data *dev_to_psd(struct device *dev) { return dev ? dev->power.subsys_data : NULL; } static inline unsigned int dev_get_uevent_suppress(const struct device *dev) { return dev->kobj.uevent_suppress; } static inline void dev_set_uevent_suppress(struct device *dev, int val) { dev->kobj.uevent_suppress = val; } static inline int device_is_registered(struct device *dev) { return dev->kobj.state_in_sysfs; } static inline void device_enable_async_suspend(struct device *dev) { if (!dev->power.is_prepared) dev->power.async_suspend = true; } static inline void device_disable_async_suspend(struct device *dev) { if (!dev->power.is_prepared) dev->power.async_suspend = false; } static inline bool device_async_suspend_enabled(struct device *dev) { return !!dev->power.async_suspend; } static inline bool device_pm_not_required(struct device *dev) { return dev->power.no_pm; } static inline void device_set_pm_not_required(struct device *dev) { dev->power.no_pm = true; #ifdef CONFIG_PM dev->power.no_callbacks = true; #endif } static inline void dev_pm_syscore_device(struct device *dev, bool val) { #ifdef CONFIG_PM_SLEEP dev->power.syscore = val; #endif } static inline void dev_pm_set_driver_flags(struct device *dev, u32 flags) { dev->power.driver_flags = flags; } static inline bool dev_pm_test_driver_flags(struct device *dev, u32 flags) { return !!(dev->power.driver_flags & flags); } static inline bool dev_pm_smart_suspend(struct device *dev) { #ifdef CONFIG_PM_SLEEP return dev->power.smart_suspend; #else return false; #endif } /* * dev_pm_set_strict_midlayer - Update the device's power.strict_midlayer flag * @dev: Target device. * @val: New flag value. * * When set, power.strict_midlayer means that the middle layer power management * code (typically, a bus type or a PM domain) does not expect its runtime PM * suspend callback to be invoked at all during system-wide PM transitions and * it does not expect its runtime PM resume callback to be invoked at any point * when runtime PM is disabled for the device during system-wide PM transitions. 
*/ static inline void dev_pm_set_strict_midlayer(struct device *dev, bool val) { #ifdef CONFIG_PM_SLEEP dev->power.strict_midlayer = val; #endif } static inline bool dev_pm_strict_midlayer_is_set(struct device *dev) { #ifdef CONFIG_PM_SLEEP return dev->power.strict_midlayer; #else return false; #endif } static inline void device_lock(struct device *dev) { mutex_lock(&dev->mutex); } static inline int device_lock_interruptible(struct device *dev) { return mutex_lock_interruptible(&dev->mutex); } static inline int device_trylock(struct device *dev) { return mutex_trylock(&dev->mutex); } static inline void device_unlock(struct device *dev) { mutex_unlock(&dev->mutex); } DEFINE_GUARD(device, struct device *, device_lock(_T), device_unlock(_T)) DEFINE_GUARD_COND(device, _intr, device_lock_interruptible(_T), _RET == 0) static inline void device_lock_assert(struct device *dev) { lockdep_assert_held(&dev->mutex); } static inline bool dev_has_sync_state(struct device *dev) { if (!dev) return false; if (dev->driver && dev->driver->sync_state) return true; if (dev->bus && dev->bus->sync_state) return true; return false; } static inline int dev_set_drv_sync_state(struct device *dev, void (*fn)(struct device *dev)) { if (!dev || !dev->driver) return 0; if (dev->driver->sync_state && dev->driver->sync_state != fn) return -EBUSY; if (!dev->driver->sync_state) dev->driver->sync_state = fn; return 0; } static inline void dev_set_removable(struct device *dev, enum device_removable removable) { dev->removable = removable; } static inline bool dev_is_removable(struct device *dev) { return dev->removable == DEVICE_REMOVABLE; } static inline bool dev_removable_is_valid(struct device *dev) { return dev->removable != DEVICE_REMOVABLE_NOT_SUPPORTED; } /* * High level routines for use by the bus drivers */ int __must_check device_register(struct device *dev); void device_unregister(struct device *dev); void device_initialize(struct device *dev); int __must_check device_add(struct device *dev); void device_del(struct device *dev); DEFINE_FREE(device_del, struct device *, if (_T) device_del(_T)) int device_for_each_child(struct device *parent, void *data, device_iter_t fn); int device_for_each_child_reverse(struct device *parent, void *data, device_iter_t fn); int device_for_each_child_reverse_from(struct device *parent, struct device *from, void *data, device_iter_t fn); struct device *device_find_child(struct device *parent, const void *data, device_match_t match); /** * device_find_child_by_name - device iterator for locating a child device. * @parent: parent struct device * @name: name of the child device * * This is similar to the device_find_child() function above, but it * returns a reference to a device that has the name @name. * * NOTE: you will need to drop the reference with put_device() after use. */ static inline struct device *device_find_child_by_name(struct device *parent, const char *name) { return device_find_child(parent, name, device_match_name); } /** * device_find_any_child - device iterator for locating a child device, if any. * @parent: parent struct device * * This is similar to the device_find_child() function above, but it * returns a reference to a child device, if any. * * NOTE: you will need to drop the reference with put_device() after use. 
*/ static inline struct device *device_find_any_child(struct device *parent) { return device_find_child(parent, NULL, device_match_any); } int device_rename(struct device *dev, const char *new_name); int device_move(struct device *dev, struct device *new_parent, enum dpm_order dpm_order); int device_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid); static inline bool device_supports_offline(struct device *dev) { return dev->bus && dev->bus->offline && dev->bus->online; } #define __device_lock_set_class(dev, name, key) \ do { \ struct device *__d2 __maybe_unused = dev; \ lock_set_class(&__d2->mutex.dep_map, name, key, 0, _THIS_IP_); \ } while (0) /** * device_lock_set_class - Specify a temporary lock class while a device * is attached to a driver * @dev: device to modify * @key: lock class key data * * This must be called with the device_lock() already held, for example * from driver ->probe(). Take care to only override the default * lockdep_no_validate class. */ #ifdef CONFIG_LOCKDEP #define device_lock_set_class(dev, key) \ do { \ struct device *__d = dev; \ dev_WARN_ONCE(__d, !lockdep_match_class(&__d->mutex, \ &__lockdep_no_validate__), \ "overriding existing custom lock class\n"); \ __device_lock_set_class(__d, #key, key); \ } while (0) #else #define device_lock_set_class(dev, key) __device_lock_set_class(dev, #key, key) #endif /** * device_lock_reset_class - Return a device to the default lockdep novalidate state * @dev: device to modify * * This must be called with the device_lock() already held, for example * from driver ->remove(). */ #define device_lock_reset_class(dev) \ do { \ struct device *__d __maybe_unused = dev; \ lock_set_novalidate_class(&__d->mutex.dep_map, "&dev->mutex", \ _THIS_IP_); \ } while (0) void lock_device_hotplug(void); void unlock_device_hotplug(void); int lock_device_hotplug_sysfs(void); int device_offline(struct device *dev); int device_online(struct device *dev); void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode); void set_secondary_fwnode(struct device *dev, struct fwnode_handle *fwnode); void device_set_node(struct device *dev, struct fwnode_handle *fwnode); int device_add_of_node(struct device *dev, struct device_node *of_node); void device_remove_of_node(struct device *dev); void device_set_of_node_from_dev(struct device *dev, const struct device *dev2); struct device *get_dev_from_fwnode(struct fwnode_handle *fwnode); static inline struct device_node *dev_of_node(struct device *dev) { if (!IS_ENABLED(CONFIG_OF) || !dev) return NULL; return dev->of_node; } static inline int dev_num_vf(struct device *dev) { if (dev->bus && dev->bus->num_vf) return dev->bus->num_vf(dev); return 0; } /* * Root device objects for grouping under /sys/devices */ struct device *__root_device_register(const char *name, struct module *owner); /* This is a macro to avoid include problems with THIS_MODULE */ #define root_device_register(name) \ __root_device_register(name, THIS_MODULE) void root_device_unregister(struct device *root); static inline void *dev_get_platdata(const struct device *dev) { return dev->platform_data; } /* * Manual binding of a device to driver. See drivers/base/bus.c * for information on use. 
*/ int __must_check device_driver_attach(const struct device_driver *drv, struct device *dev); int __must_check device_bind_driver(struct device *dev); void device_release_driver(struct device *dev); int __must_check device_attach(struct device *dev); int __must_check driver_attach(const struct device_driver *drv); void device_initial_probe(struct device *dev); int __must_check device_reprobe(struct device *dev); bool device_is_bound(struct device *dev); /* * Easy functions for dynamically creating devices on the fly */ __printf(5, 6) struct device * device_create(const struct class *cls, struct device *parent, dev_t devt, void *drvdata, const char *fmt, ...); __printf(6, 7) struct device * device_create_with_groups(const struct class *cls, struct device *parent, dev_t devt, void *drvdata, const struct attribute_group **groups, const char *fmt, ...); void device_destroy(const struct class *cls, dev_t devt); int __must_check device_add_groups(struct device *dev, const struct attribute_group *const *groups); void device_remove_groups(struct device *dev, const struct attribute_group *const *groups); static inline int __must_check device_add_group(struct device *dev, const struct attribute_group *grp) { const struct attribute_group *groups[] = { grp, NULL }; return device_add_groups(dev, groups); } static inline void device_remove_group(struct device *dev, const struct attribute_group *grp) { const struct attribute_group *groups[] = { grp, NULL }; device_remove_groups(dev, groups); } int __must_check devm_device_add_group(struct device *dev, const struct attribute_group *grp); /* * get_device - atomically increment the reference count for the device. * */ struct device *get_device(struct device *dev); void put_device(struct device *dev); DEFINE_FREE(put_device, struct device *, if (_T) put_device(_T)) bool kill_device(struct device *dev); #ifdef CONFIG_DEVTMPFS int devtmpfs_mount(void); #else static inline int devtmpfs_mount(void) { return 0; } #endif /* drivers/base/power/shutdown.c */ void device_shutdown(void); /* debugging and troubleshooting/diagnostic helpers. */ const char *dev_driver_string(const struct device *dev); /* Device links interface. */ struct device_link *device_link_add(struct device *consumer, struct device *supplier, u32 flags); void device_link_del(struct device_link *link); void device_link_remove(void *consumer, struct device *supplier); void device_links_supplier_sync_state_pause(void); void device_links_supplier_sync_state_resume(void); void device_link_wait_removal(void); static inline bool device_link_test(const struct device_link *link, u32 flags) { return !!(link->flags & flags); } /* Create alias, so I can be autoloaded. */ #define MODULE_ALIAS_CHARDEV(major,minor) \ MODULE_ALIAS("char-major-" __stringify(major) "-" __stringify(minor)) #define MODULE_ALIAS_CHARDEV_MAJOR(major) \ MODULE_ALIAS("char-major-" __stringify(major) "-*") #endif /* _DEVICE_H_ */ |
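/*
 * Illustrative sketch (not part of device.h): how a driver typically uses
 * the DEVICE_ATTR_RW() helper together with device_create_file() and
 * device_remove_file() declared above.  The "example_*" names and the
 * unsigned int backing variable are hypothetical; the attribute appears
 * as a "level" file in the device's sysfs directory.
 */
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/sysfs.h>

static unsigned int example_level;

static ssize_t level_show(struct device *dev, struct device_attribute *attr,
			  char *buf)
{
	return sysfs_emit(buf, "%u\n", example_level);
}

static ssize_t level_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	unsigned int val;
	int ret;

	ret = kstrtouint(buf, 0, &val);
	if (ret)
		return ret;
	example_level = val;
	return count;
}

/* Expands to "struct device_attribute dev_attr_level" with mode 0644. */
static DEVICE_ATTR_RW(level);

/* Typically called from the driver's probe() routine. */
static int example_add_level_attr(struct device *dev)
{
	return device_create_file(dev, &dev_attr_level);
}

/* And the matching cleanup from remove(). */
static void example_remove_level_attr(struct device *dev)
{
	device_remove_file(dev, &dev_attr_level);
}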
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/kernel/capability.c
 *
 * Copyright (C) 1997 Andrew Main <zefram@fysh.org>
 *
 * Integrated into 2.1.97+, Andrew G. Morgan <morgan@kernel.org>
 * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/uaccess.h>

int file_caps_enabled = 1;

static int __init file_caps_disable(char *str)
{
	file_caps_enabled = 0;
	return 1;
}
__setup("no_file_caps", file_caps_disable);

#ifdef CONFIG_MULTIUSER
/*
 * More recent versions of libcap are available from:
 *
 *   http://www.kernel.org/pub/linux/libs/security/linux-privs/
 */

static void warn_legacy_capability_use(void)
{
	pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n",
		     current->comm);
}

/*
 * Version 2 capabilities worked fine, but the linux/capability.h file
 * that accompanied their introduction encouraged their use without
 * the necessary user-space source code changes. As such, we have
 * created a version 3 with equivalent functionality to version 2, but
 * with a header change to protect legacy source code from using
 * version 2 when it wanted to use version 1. If your system has code
 * that trips the following warning, it is using version 2 specific
 * capabilities and may be doing so insecurely.
* * The remedy is to either upgrade your version of libcap (to 2.10+, * if the application is linked against it), or recompile your * application with modern kernel headers and this warning will go * away. */ static void warn_deprecated_v2(void) { pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n", current->comm); } /* * Version check. Return the number of u32s in each capability flag * array, or a negative value on error. */ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) { __u32 version; if (get_user(version, &header->version)) return -EFAULT; switch (version) { case _LINUX_CAPABILITY_VERSION_1: warn_legacy_capability_use(); *tocopy = _LINUX_CAPABILITY_U32S_1; break; case _LINUX_CAPABILITY_VERSION_2: warn_deprecated_v2(); fallthrough; /* v3 is otherwise equivalent to v2 */ case _LINUX_CAPABILITY_VERSION_3: *tocopy = _LINUX_CAPABILITY_U32S_3; break; default: if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version)) return -EFAULT; return -EINVAL; } return 0; } /* * The only thing that can change the capabilities of the current * process is the current process. As such, we can't be in this code * at the same time as we are in the process of setting capabilities * in this process. The net result is that we can limit our use of * locks to when we are reading the caps of another process. */ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, kernel_cap_t *pIp, kernel_cap_t *pPp) { int ret; if (pid && (pid != task_pid_vnr(current))) { const struct task_struct *target; rcu_read_lock(); target = find_task_by_vpid(pid); if (!target) ret = -ESRCH; else ret = security_capget(target, pEp, pIp, pPp); rcu_read_unlock(); } else ret = security_capget(current, pEp, pIp, pPp); return ret; } /** * sys_capget - get the capabilities of a given process. * @header: pointer to struct that contains capability version and * target pid data * @dataptr: pointer to struct that contains the effective, permitted, * and inheritable capabilities that are returned * * Returns 0 on success and < 0 on error. */ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) { int ret = 0; pid_t pid; unsigned tocopy; kernel_cap_t pE, pI, pP; struct __user_cap_data_struct kdata[2]; ret = cap_validate_magic(header, &tocopy); if ((dataptr == NULL) || (ret != 0)) return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret; if (get_user(pid, &header->pid)) return -EFAULT; if (pid < 0) return -EINVAL; ret = cap_get_target_pid(pid, &pE, &pI, &pP); if (ret) return ret; /* * Annoying legacy format with 64-bit capabilities exposed * as two sets of 32-bit fields, so we need to split the * capability values up. */ kdata[0].effective = pE.val; kdata[1].effective = pE.val >> 32; kdata[0].permitted = pP.val; kdata[1].permitted = pP.val >> 32; kdata[0].inheritable = pI.val; kdata[1].inheritable = pI.val >> 32; /* * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S, * we silently drop the upper capabilities here. This * has the effect of making older libcap * implementations implicitly drop upper capability * bits when they perform a: capget/modify/capset * sequence. * * This behavior is considered fail-safe * behavior. Upgrading the application to a newer * version of libcap will enable access to the newer * capabilities. * * An alternative would be to return an error here * (-ERANGE), but that causes legacy applications to * unexpectedly fail; the capget/modify/capset aborts * before modification is attempted and the application * fails. 
*/ if (copy_to_user(dataptr, kdata, tocopy * sizeof(kdata[0]))) return -EFAULT; return 0; } static kernel_cap_t mk_kernel_cap(u32 low, u32 high) { return (kernel_cap_t) { (low | ((u64)high << 32)) & CAP_VALID_MASK }; } /** * sys_capset - set capabilities for a process or (*) a group of processes * @header: pointer to struct that contains capability version and * target pid data * @data: pointer to struct that contains the effective, permitted, * and inheritable capabilities * * Set capabilities for the current process only. The ability to any other * process(es) has been deprecated and removed. * * The restrictions on setting capabilities are specified as: * * I: any raised capabilities must be a subset of the old permitted * P: any raised capabilities must be a subset of the old permitted * E: must be set to a subset of new permitted * * Returns 0 on success and < 0 on error. */ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) { struct __user_cap_data_struct kdata[2] = { { 0, }, }; unsigned tocopy, copybytes; kernel_cap_t inheritable, permitted, effective; struct cred *new; int ret; pid_t pid; ret = cap_validate_magic(header, &tocopy); if (ret != 0) return ret; if (get_user(pid, &header->pid)) return -EFAULT; /* may only affect current now */ if (pid != 0 && pid != task_pid_vnr(current)) return -EPERM; copybytes = tocopy * sizeof(struct __user_cap_data_struct); if (copybytes > sizeof(kdata)) return -EFAULT; if (copy_from_user(&kdata, data, copybytes)) return -EFAULT; effective = mk_kernel_cap(kdata[0].effective, kdata[1].effective); permitted = mk_kernel_cap(kdata[0].permitted, kdata[1].permitted); inheritable = mk_kernel_cap(kdata[0].inheritable, kdata[1].inheritable); new = prepare_creds(); if (!new) return -ENOMEM; ret = security_capset(new, current_cred(), &effective, &inheritable, &permitted); if (ret < 0) goto error; audit_log_capset(new, current_cred()); return commit_creds(new); error: abort_creds(new); return ret; } /** * has_ns_capability - Does a task have a capability in a specific user ns * @t: The task in question * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to the specified user namespace, false if not. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap) { int ret; rcu_read_lock(); ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE); rcu_read_unlock(); return (ret == 0); } /** * has_ns_capability_noaudit - Does a task have a capability (unaudited) * in a specific user ns. * @t: The task in question * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to the specified user namespace, false if not. * Do not write an audit message for the check. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap) { int ret; rcu_read_lock(); ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); } /** * has_capability_noaudit - Does a task have a capability (unaudited) in the * initial user ns * @t: The task in question * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to init_user_ns, false if not. 
Don't write an * audit message for the check. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_capability_noaudit(struct task_struct *t, int cap) { return has_ns_capability_noaudit(t, &init_user_ns, cap); } EXPORT_SYMBOL(has_capability_noaudit); static bool ns_capable_common(struct user_namespace *ns, int cap, unsigned int opts) { int capable; if (unlikely(!cap_valid(cap))) { pr_crit("capable() called with invalid cap=%u\n", cap); BUG(); } capable = security_capable(current_cred(), ns, cap, opts); if (capable == 0) { current->flags |= PF_SUPERPRIV; return true; } return false; } /** * ns_capable - Determine if the current task has a superior capability in effect * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool ns_capable(struct user_namespace *ns, int cap) { return ns_capable_common(ns, cap, CAP_OPT_NONE); } EXPORT_SYMBOL(ns_capable); /** * ns_capable_noaudit - Determine if the current task has a superior capability * (unaudited) in effect * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool ns_capable_noaudit(struct user_namespace *ns, int cap) { return ns_capable_common(ns, cap, CAP_OPT_NOAUDIT); } EXPORT_SYMBOL(ns_capable_noaudit); /** * ns_capable_setid - Determine if the current task has a superior capability * in effect, while signalling that this check is being done from within a * setid or setgroups syscall. * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool ns_capable_setid(struct user_namespace *ns, int cap) { return ns_capable_common(ns, cap, CAP_OPT_INSETID); } EXPORT_SYMBOL(ns_capable_setid); /** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool capable(int cap) { return ns_capable(&init_user_ns, cap); } EXPORT_SYMBOL(capable); #endif /* CONFIG_MULTIUSER */ /** * file_ns_capable - Determine if the file's opener had a capability in effect * @file: The file we want to check * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if task that opened the file had a capability in effect * when the file was opened. * * This does not set PF_SUPERPRIV because the caller may not * actually be privileged. 
*/ bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) { if (WARN_ON_ONCE(!cap_valid(cap))) return false; if (security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0) return true; return false; } EXPORT_SYMBOL(file_ns_capable); /** * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode? * @ns: The user namespace in question * @idmap: idmap of the mount @inode was found from * @inode: The inode in question * * Return true if the inode uid and gid are within the namespace. */ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, struct mnt_idmap *idmap, const struct inode *inode) { return vfsuid_has_mapping(ns, i_uid_into_vfsuid(idmap, inode)) && vfsgid_has_mapping(ns, i_gid_into_vfsgid(idmap, inode)); } /** * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped * @idmap: idmap of the mount @inode was found from * @inode: The inode in question * @cap: The capability in question * * Return true if the current task has the given capability targeted at * its own user namespace and that the given inode's uid and gid are * mapped into the current user namespace. */ bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap, const struct inode *inode, int cap) { struct user_namespace *ns = current_user_ns(); return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, idmap, inode); } EXPORT_SYMBOL(capable_wrt_inode_uidgid); /** * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace * @tsk: The task that may be ptraced * @ns: The user namespace to search for CAP_SYS_PTRACE in * * Return true if the task that is ptracing the current task had CAP_SYS_PTRACE * in the specified user namespace. */ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) { int ret = 0; /* An absent tracer adds no restrictions */ const struct cred *cred; rcu_read_lock(); cred = rcu_dereference(tsk->ptracer_cred); if (cred) ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); }
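For reference, the capget()/capset() interface implemented above is consumed from userspace with the v3 header and a two-element data array. The following standalone sketch is illustrative only (it is not part of the kernel sources above); it uses the raw syscall via <sys/syscall.h> and reassembles the two 32-bit halves that sys_capget() splits apart, as described in the comment in that function.

/* Illustrative userspace sketch (not part of the kernel tree above). */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/capability.h>

int main(void)
{
	struct __user_cap_header_struct hdr = {
		.version = _LINUX_CAPABILITY_VERSION_3,
		.pid = 0,		/* 0 means "current process", as in cap_get_target_pid() */
	};
	struct __user_cap_data_struct data[2];	/* _LINUX_CAPABILITY_U32S_3 == 2 */

	if (syscall(SYS_capget, &hdr, data) != 0) {
		perror("capget");
		return 1;
	}

	/* Reassemble the two 32-bit halves that sys_capget() split apart. */
	unsigned long long effective =
		((unsigned long long)data[1].effective << 32) | data[0].effective;
	unsigned long long permitted =
		((unsigned long long)data[1].permitted << 32) | data[0].permitted;
	unsigned long long inheritable =
		((unsigned long long)data[1].inheritable << 32) | data[0].inheritable;

	printf("pE=%#llx pP=%#llx pI=%#llx\n", effective, permitted, inheritable);
	return 0;
}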
// SPDX-License-Identifier: GPL-2.0 #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <net/gro_cells.h> #include <net/hotdata.h> struct gro_cell { struct sk_buff_head napi_skbs; struct napi_struct napi; local_lock_t bh_lock; }; int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) { struct net_device *dev = skb->dev; bool have_bh_lock = false; struct gro_cell *cell; int res; rcu_read_lock(); if (unlikely(!(dev->flags & IFF_UP))) goto drop; if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(dev)) { res = netif_rx(skb); goto unlock; } local_lock_nested_bh(&gcells->cells->bh_lock); have_bh_lock = true; cell = this_cpu_ptr(gcells->cells); if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(net_hotdata.max_backlog)) { drop: dev_core_stats_rx_dropped_inc(dev); kfree_skb(skb); res = NET_RX_DROP; goto unlock; } __skb_queue_tail(&cell->napi_skbs, skb); if (skb_queue_len(&cell->napi_skbs) == 1) napi_schedule(&cell->napi); res = NET_RX_SUCCESS; unlock: if (have_bh_lock) local_unlock_nested_bh(&gcells->cells->bh_lock); rcu_read_unlock(); return res; } EXPORT_SYMBOL(gro_cells_receive); /* called under BH context */ static int gro_cell_poll(struct napi_struct *napi, int budget) { struct gro_cell *cell = container_of(napi, struct gro_cell, napi); struct sk_buff *skb; int work_done = 0; while (work_done < budget) { __local_lock_nested_bh(&cell->bh_lock); skb = __skb_dequeue(&cell->napi_skbs); __local_unlock_nested_bh(&cell->bh_lock); if (!skb) break; napi_gro_receive(napi, skb); work_done++; } if (work_done < budget) napi_complete_done(napi, work_done); return work_done; } int gro_cells_init(struct gro_cells *gcells, struct net_device *dev) { int i; gcells->cells = alloc_percpu(struct gro_cell); if (!gcells->cells) return -ENOMEM; for_each_possible_cpu(i) { struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); __skb_queue_head_init(&cell->napi_skbs); local_lock_init(&cell->bh_lock); set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state); netif_napi_add(dev, &cell->napi, gro_cell_poll); napi_enable(&cell->napi); } return 0; } EXPORT_SYMBOL(gro_cells_init); struct percpu_free_defer { struct rcu_head rcu; void __percpu *ptr; }; static void percpu_free_defer_callback(struct rcu_head *head) { struct percpu_free_defer *defer; defer = container_of(head, struct percpu_free_defer, rcu); free_percpu(defer->ptr); kfree(defer); } void gro_cells_destroy(struct gro_cells *gcells) { struct percpu_free_defer *defer; int i; if (!gcells->cells) return; for_each_possible_cpu(i) { struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); napi_disable(&cell->napi); __netif_napi_del(&cell->napi); __skb_queue_purge(&cell->napi_skbs); } /* We need to observe an rcu grace period before freeing ->cells, * because netpoll could access dev->napi_list under rcu protection. * Try hard using call_rcu() instead of synchronize_rcu(), * because we might be called from cleanup_net(), and we * definitely do not want to block this critical task.
*/ defer = kmalloc_obj(*defer, GFP_KERNEL | __GFP_NOWARN); if (likely(defer)) { defer->ptr = gcells->cells; call_rcu(&defer->rcu, percpu_free_defer_callback); } else { /* We do not hold RTNL at this point, synchronize_net() * would not be able to expedite this sync. */ synchronize_rcu_expedited(); free_percpu(gcells->cells); } gcells->cells = NULL; } EXPORT_SYMBOL(gro_cells_destroy);
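The gro_cells helpers above are meant to be embedded in a virtual or tunnel device driver. The sketch below is illustrative only; the "mytun" driver and its private struct are hypothetical, but the wiring follows the pattern these helpers expect: gro_cells_init() at device init, gro_cells_receive() in place of netif_rx() on the decapsulated receive path, and gro_cells_destroy() at teardown.

/* Illustrative sketch of a hypothetical consumer of the helpers above. */
#include <linux/netdevice.h>
#include <net/gro_cells.h>

struct mytun_priv {
	struct gro_cells gro_cells;
};

static int mytun_ndo_init(struct net_device *dev)
{
	struct mytun_priv *priv = netdev_priv(dev);

	/* allocates one gro_cell (skb queue + NAPI context) per possible CPU */
	return gro_cells_init(&priv->gro_cells, dev);
}

static void mytun_ndo_uninit(struct net_device *dev)
{
	struct mytun_priv *priv = netdev_priv(dev);

	gro_cells_destroy(&priv->gro_cells);
}

/* decapsulated rx path: feed packets to per-CPU GRO instead of netif_rx() */
static int mytun_rx(struct net_device *dev, struct sk_buff *skb)
{
	struct mytun_priv *priv = netdev_priv(dev);

	skb->dev = dev;
	return gro_cells_receive(&priv->gro_cells, skb);
}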
// SPDX-License-Identifier: GPL-2.0-or-later /* * USB Network driver infrastructure * Copyright (C) 2000-2005 by David Brownell * Copyright (C) 2003-2005 David Hollis <dhollis@davehollis.com> */ /* * This is a generic "USB networking" framework that works with
several * kinds of full and high speed networking devices: host-to-host cables, * smart usb peripherals, and actual Ethernet adapters. * * These devices usually differ in terms of control protocols (if they * even have one!) and sometimes they define new framing to wrap or batch * Ethernet packets. Otherwise, they talk to USB pretty much the same, * so interface (un)binding, endpoint I/O queues, fault handling, and other * issues can usefully be addressed by this framework. */ #include <linux/module.h> #include <linux/hex.h> #include <linux/init.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ctype.h> #include <linux/ethtool.h> #include <linux/workqueue.h> #include <linux/mii.h> #include <linux/usb.h> #include <linux/usb/usbnet.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/pm_runtime.h> /*-------------------------------------------------------------------------*/ /* * Nineteen USB 1.1 max size bulk transactions per frame (ms), max. * Several dozen bytes of IPv4 data can fit in two such transactions. * One maximum size Ethernet packet takes twenty four of them. * For high speed, each frame comfortably fits almost 36 max size * Ethernet packets (so queues should be bigger). * * The goal is to let the USB host controller be busy for 5msec or * more before an irq is required, under load. Jumbograms change * the equation. */ #define MAX_QUEUE_MEMORY (60 * 1518) #define RX_QLEN(dev) ((dev)->rx_qlen) #define TX_QLEN(dev) ((dev)->tx_qlen) // reawaken network queue this soon after stopping; else watchdog barks #define TX_TIMEOUT_JIFFIES (5*HZ) /* throttle rx/tx briefly after some faults, so hub_wq might disconnect() * us (it polls at HZ/4 usually) before we report too many false errors. */ #define THROTTLE_JIFFIES (HZ/8) // between wakeups #define UNLINK_TIMEOUT_MS 3 /*-------------------------------------------------------------------------*/ /* use ethtool to change the level for any given device */ static int msg_level = -1; module_param (msg_level, int, 0); MODULE_PARM_DESC (msg_level, "Override default message level"); /*-------------------------------------------------------------------------*/ static const char * const usbnet_event_names[] = { [EVENT_TX_HALT] = "EVENT_TX_HALT", [EVENT_RX_HALT] = "EVENT_RX_HALT", [EVENT_RX_MEMORY] = "EVENT_RX_MEMORY", [EVENT_STS_SPLIT] = "EVENT_STS_SPLIT", [EVENT_LINK_RESET] = "EVENT_LINK_RESET", [EVENT_RX_PAUSED] = "EVENT_RX_PAUSED", [EVENT_DEV_ASLEEP] = "EVENT_DEV_ASLEEP", [EVENT_DEV_OPEN] = "EVENT_DEV_OPEN", [EVENT_DEVICE_REPORT_IDLE] = "EVENT_DEVICE_REPORT_IDLE", [EVENT_NO_RUNTIME_PM] = "EVENT_NO_RUNTIME_PM", [EVENT_RX_KILL] = "EVENT_RX_KILL", [EVENT_LINK_CHANGE] = "EVENT_LINK_CHANGE", [EVENT_SET_RX_MODE] = "EVENT_SET_RX_MODE", [EVENT_NO_IP_ALIGN] = "EVENT_NO_IP_ALIGN", }; /* handles CDC Ethernet and many other network "bulk data" interfaces */ int usbnet_get_endpoints(struct usbnet *dev, struct usb_interface *intf) { int tmp; struct usb_host_interface *alt = NULL; struct usb_host_endpoint *in = NULL, *out = NULL; struct usb_host_endpoint *status = NULL; for (tmp = 0; tmp < intf->num_altsetting; tmp++) { unsigned ep; in = out = status = NULL; alt = intf->altsetting + tmp; /* take the first altsetting with in-bulk + out-bulk; * remember any status endpoint, just in case; * ignore other endpoints and altsettings. 
*/ for (ep = 0; ep < alt->desc.bNumEndpoints; ep++) { struct usb_host_endpoint *e; int intr = 0; e = alt->endpoint + ep; /* ignore endpoints which cannot transfer data */ if (!usb_endpoint_maxp(&e->desc)) continue; switch (e->desc.bmAttributes) { case USB_ENDPOINT_XFER_INT: if (!usb_endpoint_dir_in(&e->desc)) continue; intr = 1; fallthrough; case USB_ENDPOINT_XFER_BULK: break; default: continue; } if (usb_endpoint_dir_in(&e->desc)) { if (!intr && !in) in = e; else if (intr && !status) status = e; } else { if (!out) out = e; } } if (in && out) break; } if (!alt || !in || !out) return -EINVAL; if (alt->desc.bAlternateSetting != 0 || !(dev->driver_info->flags & FLAG_NO_SETINT)) { tmp = usb_set_interface(dev->udev, alt->desc.bInterfaceNumber, alt->desc.bAlternateSetting); if (tmp < 0) return tmp; } dev->in = usb_rcvbulkpipe(dev->udev, in->desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); dev->out = usb_sndbulkpipe(dev->udev, out->desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); dev->status = status; return 0; } EXPORT_SYMBOL_GPL(usbnet_get_endpoints); int usbnet_get_ethernet_addr(struct usbnet *dev, int iMACAddress) { u8 addr[ETH_ALEN]; int tmp = -1, ret; unsigned char buf [13]; ret = usb_string(dev->udev, iMACAddress, buf, sizeof(buf)); if (ret == 12) tmp = hex2bin(addr, buf, 6); if (tmp < 0) { dev_dbg(&dev->udev->dev, "bad MAC string %d fetch, %d\n", iMACAddress, tmp); if (ret >= 0) ret = -EINVAL; return ret; } eth_hw_addr_set(dev->net, addr); return 0; } EXPORT_SYMBOL_GPL(usbnet_get_ethernet_addr); static bool usbnet_needs_usb_name_format(struct usbnet *dev, struct net_device *net) { /* Point to point devices which don't have a real MAC address * (or report a fake local one) have historically used the usb%d * naming. Preserve this.. */ return (dev->driver_info->flags & FLAG_POINTTOPOINT) != 0 && (is_zero_ether_addr(net->dev_addr) || is_local_ether_addr(net->dev_addr)); } static void intr_complete(struct urb *urb) { struct usbnet *dev = urb->context; int status = urb->status; switch (status) { /* success */ case 0: dev->driver_info->status(dev, urb); break; /* software-driven interface shutdown */ case -ENOENT: /* urb killed */ case -ESHUTDOWN: /* hardware gone */ netif_dbg(dev, ifdown, dev->net, "intr shutdown, code %d\n", status); return; /* NOTE: not throttling like RX/TX, since this endpoint * already polls infrequently */ default: netdev_dbg(dev->net, "intr status %d\n", status); break; } status = usb_submit_urb(urb, GFP_ATOMIC); if (status != 0) netif_err(dev, timer, dev->net, "intr resubmit --> %d\n", status); } static int init_status(struct usbnet *dev, struct usb_interface *intf) { char *buf = NULL; unsigned pipe = 0; unsigned maxp; unsigned period; if (!dev->driver_info->status) return 0; pipe = usb_rcvintpipe(dev->udev, dev->status->desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); maxp = usb_maxpacket(dev->udev, pipe); /* avoid 1 msec chatter: min 8 msec poll rate */ period = max ((int) dev->status->desc.bInterval, (dev->udev->speed == USB_SPEED_HIGH) ? 
7 : 3); buf = kmalloc(maxp, GFP_KERNEL); if (buf) { dev->interrupt = usb_alloc_urb(0, GFP_KERNEL); if (!dev->interrupt) { kfree(buf); return -ENOMEM; } else { usb_fill_int_urb(dev->interrupt, dev->udev, pipe, buf, maxp, intr_complete, dev, period); dev->interrupt->transfer_flags |= URB_FREE_BUFFER; dev_dbg(&intf->dev, "status ep%din, %d bytes period %d\n", usb_pipeendpoint(pipe), maxp, period); } } return 0; } /* Submit the interrupt URB if not previously submitted, increasing refcount */ int usbnet_status_start(struct usbnet *dev, gfp_t mem_flags) { int ret = 0; WARN_ON_ONCE(dev->interrupt == NULL); if (dev->interrupt) { mutex_lock(&dev->interrupt_mutex); if (++dev->interrupt_count == 1) ret = usb_submit_urb(dev->interrupt, mem_flags); dev_dbg(&dev->udev->dev, "incremented interrupt URB count to %d\n", dev->interrupt_count); mutex_unlock(&dev->interrupt_mutex); } return ret; } EXPORT_SYMBOL_GPL(usbnet_status_start); /* For resume; submit interrupt URB if previously submitted */ static int __usbnet_status_start_force(struct usbnet *dev, gfp_t mem_flags) { int ret = 0; mutex_lock(&dev->interrupt_mutex); if (dev->interrupt_count) { ret = usb_submit_urb(dev->interrupt, mem_flags); dev_dbg(&dev->udev->dev, "submitted interrupt URB for resume\n"); } mutex_unlock(&dev->interrupt_mutex); return ret; } /* Kill the interrupt URB if all submitters want it killed */ void usbnet_status_stop(struct usbnet *dev) { if (dev->interrupt) { mutex_lock(&dev->interrupt_mutex); WARN_ON(dev->interrupt_count == 0); if (dev->interrupt_count && --dev->interrupt_count == 0) usb_kill_urb(dev->interrupt); dev_dbg(&dev->udev->dev, "decremented interrupt URB count to %d\n", dev->interrupt_count); mutex_unlock(&dev->interrupt_mutex); } } EXPORT_SYMBOL_GPL(usbnet_status_stop); /* For suspend; always kill interrupt URB */ static void __usbnet_status_stop_force(struct usbnet *dev) { if (dev->interrupt) { mutex_lock(&dev->interrupt_mutex); usb_kill_urb(dev->interrupt); dev_dbg(&dev->udev->dev, "killed interrupt URB for suspend\n"); mutex_unlock(&dev->interrupt_mutex); } } /* Passes this packet up the stack, updating its accounting. * Some link protocols batch packets, so their rx_fixup paths * can return clones as well as just modify the original skb. 
*/ void usbnet_skb_return(struct usbnet *dev, struct sk_buff *skb) { struct pcpu_sw_netstats *stats64 = this_cpu_ptr(dev->net->tstats); unsigned long flags; int status; if (test_bit(EVENT_RX_PAUSED, &dev->flags)) { skb_queue_tail(&dev->rxq_pause, skb); return; } /* only update if unset to allow minidriver rx_fixup override */ if (skb->protocol == 0) skb->protocol = eth_type_trans(skb, dev->net); flags = u64_stats_update_begin_irqsave(&stats64->syncp); u64_stats_inc(&stats64->rx_packets); u64_stats_add(&stats64->rx_bytes, skb->len); u64_stats_update_end_irqrestore(&stats64->syncp, flags); netif_dbg(dev, rx_status, dev->net, "< rx, len %zu, type 0x%x\n", skb->len + sizeof(struct ethhdr), skb->protocol); memset(skb->cb, 0, sizeof(struct skb_data)); if (skb_defer_rx_timestamp(skb)) return; status = netif_rx (skb); if (status != NET_RX_SUCCESS) netif_dbg(dev, rx_err, dev->net, "netif_rx status %d\n", status); } EXPORT_SYMBOL_GPL(usbnet_skb_return); /* must be called if hard_mtu or rx_urb_size changed */ void usbnet_update_max_qlen(struct usbnet *dev) { enum usb_device_speed speed = dev->udev->speed; if (!dev->rx_urb_size || !dev->hard_mtu) goto insanity; switch (speed) { case USB_SPEED_HIGH: dev->rx_qlen = MAX_QUEUE_MEMORY / dev->rx_urb_size; dev->tx_qlen = MAX_QUEUE_MEMORY / dev->hard_mtu; break; case USB_SPEED_SUPER: case USB_SPEED_SUPER_PLUS: /* * Not take default 5ms qlen for super speed HC to * save memory, and iperf tests show 2.5ms qlen can * work well */ dev->rx_qlen = 5 * MAX_QUEUE_MEMORY / dev->rx_urb_size; dev->tx_qlen = 5 * MAX_QUEUE_MEMORY / dev->hard_mtu; break; default: insanity: dev->rx_qlen = dev->tx_qlen = 4; } } EXPORT_SYMBOL_GPL(usbnet_update_max_qlen); /*------------------------------------------------------------------------- * * Network Device Driver (peer link to "Host Device", from USB host) * *-------------------------------------------------------------------------*/ int usbnet_change_mtu(struct net_device *net, int new_mtu) { struct usbnet *dev = netdev_priv(net); int ll_mtu = new_mtu + net->hard_header_len; int old_hard_mtu = dev->hard_mtu; int old_rx_urb_size = dev->rx_urb_size; // no second zero-length packet read wanted after mtu-sized packets if ((ll_mtu % dev->maxpacket) == 0) return -EDOM; WRITE_ONCE(net->mtu, new_mtu); dev->hard_mtu = net->mtu + net->hard_header_len; if (dev->rx_urb_size == old_hard_mtu) { dev->rx_urb_size = dev->hard_mtu; if (dev->rx_urb_size > old_rx_urb_size) { usbnet_pause_rx(dev); usbnet_unlink_rx_urbs(dev); usbnet_resume_rx(dev); } } /* max qlen depend on hard_mtu and rx_urb_size */ usbnet_update_max_qlen(dev); return 0; } EXPORT_SYMBOL_GPL(usbnet_change_mtu); /* The caller must hold list->lock */ static void __usbnet_queue_skb(struct sk_buff_head *list, struct sk_buff *newsk, enum skb_state state) { struct skb_data *entry = (struct skb_data *) newsk->cb; __skb_queue_tail(list, newsk); entry->state = state; } /*-------------------------------------------------------------------------*/ /* some LK 2.4 HCDs oopsed if we freed or resubmitted urbs from * completion callbacks. 2.5 should have fixed those bugs... */ static enum skb_state defer_bh(struct usbnet *dev, struct sk_buff *skb, struct sk_buff_head *list, enum skb_state state) { unsigned long flags; enum skb_state old_state; struct skb_data *entry = (struct skb_data *) skb->cb; spin_lock_irqsave(&list->lock, flags); old_state = entry->state; entry->state = state; __skb_unlink(skb, list); /* defer_bh() is never called with list == &dev->done. 
* spin_lock_nested() tells lockdep that it is OK to take * dev->done.lock here with list->lock held. */ spin_lock_nested(&dev->done.lock, SINGLE_DEPTH_NESTING); __skb_queue_tail(&dev->done, skb); if (dev->done.qlen == 1) queue_work(system_bh_wq, &dev->bh_work); spin_unlock(&dev->done.lock); spin_unlock_irqrestore(&list->lock, flags); return old_state; } /* some work can't be done in tasklets, so we use keventd * * NOTE: annoying asymmetry: if it's active, schedule_work() fails, * but tasklet_schedule() doesn't. hope the failure is rare. */ void usbnet_defer_kevent(struct usbnet *dev, int work) { set_bit (work, &dev->flags); if (!usbnet_going_away(dev)) { if (!schedule_work(&dev->kevent)) netdev_dbg(dev->net, "kevent %s may have been dropped\n", usbnet_event_names[work]); else netdev_dbg(dev->net, "kevent %s scheduled\n", usbnet_event_names[work]); } } EXPORT_SYMBOL_GPL(usbnet_defer_kevent); /*-------------------------------------------------------------------------*/ static void rx_complete(struct urb *urb); static int rx_submit(struct usbnet *dev, struct urb *urb, gfp_t flags) { struct sk_buff *skb; struct skb_data *entry; int retval = 0; unsigned long lockflags; size_t size = dev->rx_urb_size; /* prevent rx skb allocation when error ratio is high */ if (test_bit(EVENT_RX_KILL, &dev->flags)) { usb_free_urb(urb); return -ENOLINK; } if (test_bit(EVENT_NO_IP_ALIGN, &dev->flags)) skb = __netdev_alloc_skb(dev->net, size, flags); else skb = __netdev_alloc_skb_ip_align(dev->net, size, flags); if (!skb) { netif_dbg(dev, rx_err, dev->net, "no rx skb\n"); usbnet_defer_kevent(dev, EVENT_RX_MEMORY); usb_free_urb(urb); return -ENOMEM; } entry = (struct skb_data *) skb->cb; entry->urb = urb; entry->dev = dev; entry->length = 0; usb_fill_bulk_urb(urb, dev->udev, dev->in, skb->data, size, rx_complete, skb); spin_lock_irqsave(&dev->rxq.lock, lockflags); if (netif_running(dev->net) && netif_device_present(dev->net) && test_bit(EVENT_DEV_OPEN, &dev->flags) && !test_bit(EVENT_RX_HALT, &dev->flags) && !test_bit(EVENT_DEV_ASLEEP, &dev->flags) && !usbnet_going_away(dev)) { switch (retval = usb_submit_urb(urb, GFP_ATOMIC)) { case -EPIPE: usbnet_defer_kevent(dev, EVENT_RX_HALT); break; case -ENOMEM: usbnet_defer_kevent(dev, EVENT_RX_MEMORY); break; case -ENODEV: netif_dbg(dev, ifdown, dev->net, "device gone\n"); netif_device_detach(dev->net); break; case -EHOSTUNREACH: retval = -ENOLINK; break; default: netif_dbg(dev, rx_err, dev->net, "rx submit, %d\n", retval); queue_work(system_bh_wq, &dev->bh_work); break; case 0: __usbnet_queue_skb(&dev->rxq, skb, rx_start); } } else { netif_dbg(dev, ifdown, dev->net, "rx: stopped\n"); retval = -ENOLINK; } spin_unlock_irqrestore(&dev->rxq.lock, lockflags); if (retval) { dev_kfree_skb_any(skb); usb_free_urb(urb); } return retval; } /*-------------------------------------------------------------------------*/ static inline int rx_process(struct usbnet *dev, struct sk_buff *skb) { if (dev->driver_info->rx_fixup && !dev->driver_info->rx_fixup(dev, skb)) { /* With RX_ASSEMBLE, rx_fixup() must update counters */ if (!(dev->driver_info->flags & FLAG_RX_ASSEMBLE)) dev->net->stats.rx_errors++; return -EPROTO; } // else network stack removes extra byte if we forced a short packet /* all data was already cloned from skb inside the driver */ if (dev->driver_info->flags & FLAG_MULTI_PACKET) return -EALREADY; if (skb->len < ETH_HLEN) { dev->net->stats.rx_errors++; dev->net->stats.rx_length_errors++; netif_dbg(dev, rx_err, dev->net, "rx length %d\n", skb->len); return -EPROTO; } 
usbnet_skb_return(dev, skb); return 0; } /*-------------------------------------------------------------------------*/ static void rx_complete(struct urb *urb) { struct sk_buff *skb = (struct sk_buff *) urb->context; struct skb_data *entry = (struct skb_data *) skb->cb; struct usbnet *dev = entry->dev; int urb_status = urb->status; enum skb_state state; skb_put(skb, urb->actual_length); state = rx_done; entry->urb = NULL; switch (urb_status) { /* success */ case 0: break; /* stalls need manual reset. this is rare ... except that * when going through USB 2.0 TTs, unplug appears this way. * we avoid the highspeed version of the ETIMEDOUT/EILSEQ * storm, recovering as needed. */ case -EPIPE: dev->net->stats.rx_errors++; usbnet_defer_kevent(dev, EVENT_RX_HALT); fallthrough; /* software-driven interface shutdown */ case -ECONNRESET: /* async unlink */ case -ESHUTDOWN: /* hardware gone */ netif_dbg(dev, ifdown, dev->net, "rx shutdown, code %d\n", urb_status); goto block; /* we get controller i/o faults during hub_wq disconnect() delays. * throttle down resubmits, to avoid log floods; just temporarily, * so we still recover when the fault isn't a hub_wq delay. */ case -EPROTO: case -ETIME: case -EILSEQ: dev->net->stats.rx_errors++; if (!timer_pending(&dev->delay)) { mod_timer(&dev->delay, jiffies + THROTTLE_JIFFIES); netif_dbg(dev, link, dev->net, "rx throttle %d\n", urb_status); } block: state = rx_cleanup; entry->urb = urb; urb = NULL; break; /* data overrun ... flush fifo? */ case -EOVERFLOW: dev->net->stats.rx_over_errors++; fallthrough; default: state = rx_cleanup; dev->net->stats.rx_errors++; netif_dbg(dev, rx_err, dev->net, "rx status %d\n", urb_status); break; } /* stop rx if packet error rate is high */ if (++dev->pkt_cnt > 30) { dev->pkt_cnt = 0; dev->pkt_err = 0; } else { if (state == rx_cleanup) dev->pkt_err++; if (dev->pkt_err > 20) set_bit(EVENT_RX_KILL, &dev->flags); } state = defer_bh(dev, skb, &dev->rxq, state); if (urb) { if (netif_running(dev->net) && !test_bit(EVENT_RX_HALT, &dev->flags) && state != unlink_start) { rx_submit(dev, urb, GFP_ATOMIC); usb_mark_last_busy(dev->udev); return; } usb_free_urb(urb); } netif_dbg(dev, rx_err, dev->net, "no read resubmitted\n"); } /*-------------------------------------------------------------------------*/ void usbnet_pause_rx(struct usbnet *dev) { set_bit(EVENT_RX_PAUSED, &dev->flags); netif_dbg(dev, rx_status, dev->net, "paused rx queue enabled\n"); } EXPORT_SYMBOL_GPL(usbnet_pause_rx); void usbnet_resume_rx(struct usbnet *dev) { struct sk_buff *skb; int num = 0; local_bh_disable(); clear_bit(EVENT_RX_PAUSED, &dev->flags); while ((skb = skb_dequeue(&dev->rxq_pause)) != NULL) { usbnet_skb_return(dev, skb); num++; } queue_work(system_bh_wq, &dev->bh_work); local_bh_enable(); netif_dbg(dev, rx_status, dev->net, "paused rx queue disabled, %d skbs requeued\n", num); } EXPORT_SYMBOL_GPL(usbnet_resume_rx); void usbnet_purge_paused_rxq(struct usbnet *dev) { skb_queue_purge(&dev->rxq_pause); } EXPORT_SYMBOL_GPL(usbnet_purge_paused_rxq); /*-------------------------------------------------------------------------*/ // unlink pending rx/tx; completion handlers do all other cleanup static int unlink_urbs(struct usbnet *dev, struct sk_buff_head *q) { unsigned long flags; struct sk_buff *skb; int count = 0; spin_lock_irqsave (&q->lock, flags); while (!skb_queue_empty(q)) { struct skb_data *entry; struct urb *urb; int retval; skb_queue_walk(q, skb) { entry = (struct skb_data *) skb->cb; if (entry->state != unlink_start) goto found; } break; found: 
entry->state = unlink_start; urb = entry->urb; /* * Get reference count of the URB to avoid it to be * freed during usb_unlink_urb, which may trigger * use-after-free problem inside usb_unlink_urb since * usb_unlink_urb is always racing with .complete * handler(include defer_bh). */ usb_get_urb(urb); spin_unlock_irqrestore(&q->lock, flags); // during some PM-driven resume scenarios, // these (async) unlinks complete immediately retval = usb_unlink_urb(urb); if (retval != -EINPROGRESS && retval != 0) netdev_dbg(dev->net, "unlink urb err, %d\n", retval); else count++; usb_put_urb(urb); spin_lock_irqsave(&q->lock, flags); } spin_unlock_irqrestore(&q->lock, flags); return count; } // Flush all pending rx urbs // minidrivers may need to do this when the MTU changes void usbnet_unlink_rx_urbs(struct usbnet *dev) { if (netif_running(dev->net)) { (void) unlink_urbs (dev, &dev->rxq); queue_work(system_bh_wq, &dev->bh_work); } } EXPORT_SYMBOL_GPL(usbnet_unlink_rx_urbs); /*-------------------------------------------------------------------------*/ static void wait_skb_queue_empty(struct sk_buff_head *q) { unsigned long flags; spin_lock_irqsave(&q->lock, flags); while (!skb_queue_empty(q)) { spin_unlock_irqrestore(&q->lock, flags); schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); set_current_state(TASK_UNINTERRUPTIBLE); spin_lock_irqsave(&q->lock, flags); } spin_unlock_irqrestore(&q->lock, flags); } // precondition: never called in_interrupt static void usbnet_terminate_urbs(struct usbnet *dev) { DECLARE_WAITQUEUE(wait, current); int temp; /* ensure there are no more active urbs */ add_wait_queue(&dev->wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); temp = unlink_urbs(dev, &dev->txq) + unlink_urbs(dev, &dev->rxq); /* maybe wait for deletions to finish. */ wait_skb_queue_empty(&dev->rxq); wait_skb_queue_empty(&dev->txq); wait_skb_queue_empty(&dev->done); netif_dbg(dev, ifdown, dev->net, "waited for %d urb completions\n", temp); set_current_state(TASK_RUNNING); remove_wait_queue(&dev->wait, &wait); } int usbnet_stop(struct net_device *net) { struct usbnet *dev = netdev_priv(net); const struct driver_info *info = dev->driver_info; int retval, pm, mpn; clear_bit(EVENT_DEV_OPEN, &dev->flags); netif_stop_queue(net); netif_info(dev, ifdown, dev->net, "stop stats: rx/tx %lu/%lu, errs %lu/%lu\n", net->stats.rx_packets, net->stats.tx_packets, net->stats.rx_errors, net->stats.tx_errors); /* to not race resume */ pm = usb_autopm_get_interface(dev->intf); /* allow minidriver to stop correctly (wireless devices to turn off * radio etc) */ if (info->stop) { retval = info->stop(dev); if (retval < 0) netif_info(dev, ifdown, dev->net, "stop fail (%d) usbnet usb-%s-%s, %s\n", retval, dev->udev->bus->bus_name, dev->udev->devpath, info->description); } if (!(info->flags & FLAG_AVOID_UNLINK_URBS)) usbnet_terminate_urbs(dev); usbnet_status_stop(dev); usbnet_purge_paused_rxq(dev); mpn = !test_and_clear_bit(EVENT_NO_RUNTIME_PM, &dev->flags); /* deferred work (timer, softirq, task) must also stop */ dev->flags = 0; timer_delete_sync(&dev->delay); cancel_work_sync(&dev->bh_work); cancel_work_sync(&dev->kevent); /* We have cyclic dependencies. Those calls are needed * to break a cycle. 
We cannot fall into the gaps because * we have a flag */ cancel_work_sync(&dev->bh_work); timer_delete_sync(&dev->delay); cancel_work_sync(&dev->kevent); netdev_reset_queue(net); if (!pm) usb_autopm_put_interface(dev->intf); if (info->manage_power && mpn) info->manage_power(dev, 0); else usb_autopm_put_interface(dev->intf); return 0; } EXPORT_SYMBOL_GPL(usbnet_stop); /*-------------------------------------------------------------------------*/ // posts reads, and enables write queuing // precondition: never called in_interrupt int usbnet_open(struct net_device *net) { struct usbnet *dev = netdev_priv(net); int retval; const struct driver_info *info = dev->driver_info; if ((retval = usb_autopm_get_interface(dev->intf)) < 0) { netif_info(dev, ifup, dev->net, "resumption fail (%d) usbnet usb-%s-%s, %s\n", retval, dev->udev->bus->bus_name, dev->udev->devpath, info->description); goto done_nopm; } // put into "known safe" state if (info->reset) { retval = info->reset(dev); if (retval < 0) { netif_info(dev, ifup, dev->net, "open reset fail (%d) usbnet usb-%s-%s, %s\n", retval, dev->udev->bus->bus_name, dev->udev->devpath, info->description); goto done; } } /* hard_mtu or rx_urb_size may change in reset() */ usbnet_update_max_qlen(dev); // insist peer be connected if (info->check_connect) { retval = info->check_connect(dev); if (retval < 0) { netif_err(dev, ifup, dev->net, "can't open; %d\n", retval); goto done; } } /* start any status interrupt transfer */ if (dev->interrupt) { retval = usbnet_status_start(dev, GFP_KERNEL); if (retval < 0) { netif_err(dev, ifup, dev->net, "intr submit %d\n", retval); goto done; } } set_bit(EVENT_DEV_OPEN, &dev->flags); netdev_reset_queue(net); netif_start_queue (net); netif_info(dev, ifup, dev->net, "open: enable queueing (rx %d, tx %d) mtu %d %s framing\n", (int)RX_QLEN(dev), (int)TX_QLEN(dev), dev->net->mtu, (dev->driver_info->flags & FLAG_FRAMING_NC) ? "NetChip" : (dev->driver_info->flags & FLAG_FRAMING_GL) ? "GeneSys" : (dev->driver_info->flags & FLAG_FRAMING_Z) ? "Zaurus" : (dev->driver_info->flags & FLAG_FRAMING_RN) ? "RNDIS" : (dev->driver_info->flags & FLAG_FRAMING_AX) ? "ASIX" : "simple"); /* reset rx error state */ dev->pkt_cnt = 0; dev->pkt_err = 0; clear_bit(EVENT_RX_KILL, &dev->flags); // delay posting reads until we're fully open queue_work(system_bh_wq, &dev->bh_work); if (info->manage_power) { retval = info->manage_power(dev, 1); if (retval < 0) { retval = 0; set_bit(EVENT_NO_RUNTIME_PM, &dev->flags); } else { usb_autopm_put_interface(dev->intf); } } return retval; done: usb_autopm_put_interface(dev->intf); done_nopm: return retval; } EXPORT_SYMBOL_GPL(usbnet_open); /*-------------------------------------------------------------------------*/ /* ethtool methods; minidrivers may need to add some more, but * they'll probably want to use this base set. */ /* These methods are written on the assumption that the device * uses MII */ int usbnet_get_link_ksettings_mii(struct net_device *net, struct ethtool_link_ksettings *cmd) { struct usbnet *dev = netdev_priv(net); if (!dev->mii.mdio_read) return -EOPNOTSUPP; mii_ethtool_get_link_ksettings(&dev->mii, cmd); return 0; } EXPORT_SYMBOL_GPL(usbnet_get_link_ksettings_mii); int usbnet_get_link_ksettings_internal(struct net_device *net, struct ethtool_link_ksettings *cmd) { struct usbnet *dev = netdev_priv(net); /* the assumption that speed is equal on tx and rx * is deeply engrained into the networking layer. * For wireless stuff it is not true. * We assume that rx_speed matters more. 
*/ if (dev->rx_speed != SPEED_UNSET) cmd->base.speed = dev->rx_speed / 1000000; else if (dev->tx_speed != SPEED_UNSET) cmd->base.speed = dev->tx_speed / 1000000; else cmd->base.speed = SPEED_UNKNOWN; /* The standard "Universal Serial Bus Class Definitions * for Communications Devices v1.2" does not specify * anything about duplex status. * So set it DUPLEX_UNKNOWN instead of default DUPLEX_HALF. */ cmd->base.duplex = DUPLEX_UNKNOWN; return 0; } EXPORT_SYMBOL_GPL(usbnet_get_link_ksettings_internal); int usbnet_set_link_ksettings_mii(struct net_device *net, const struct ethtool_link_ksettings *cmd) { struct usbnet *dev = netdev_priv(net); int retval; if (!dev->mii.mdio_write) return -EOPNOTSUPP; retval = mii_ethtool_set_link_ksettings(&dev->mii, cmd); /* link speed/duplex might have changed */ if (dev->driver_info->link_reset) dev->driver_info->link_reset(dev); /* hard_mtu or rx_urb_size may change in link_reset() */ usbnet_update_max_qlen(dev); return retval; } EXPORT_SYMBOL_GPL(usbnet_set_link_ksettings_mii); u32 usbnet_get_link(struct net_device *net) { struct usbnet *dev = netdev_priv(net); /* If a check_connect is defined, return its result */ if (dev->driver_info->check_connect) return dev->driver_info->check_connect(dev) == 0; /* if the device has mii operations, use those */ if (dev->mii.mdio_read) return mii_link_ok(&dev->mii); /* Otherwise, dtrt for drivers calling netif_carrier_{on,off} */ return ethtool_op_get_link(net); } EXPORT_SYMBOL_GPL(usbnet_get_link); int usbnet_nway_reset(struct net_device *net) { struct usbnet *dev = netdev_priv(net); if (!dev->mii.mdio_write) return -EOPNOTSUPP; return mii_nway_restart(&dev->mii); } EXPORT_SYMBOL_GPL(usbnet_nway_reset); int usbnet_mii_ioctl(struct net_device *net, struct ifreq *rq, int cmd) { struct usbnet *dev = netdev_priv(net); return generic_mii_ioctl(&dev->mii, if_mii(rq), cmd, NULL); } EXPORT_SYMBOL_GPL(usbnet_mii_ioctl); void usbnet_get_drvinfo(struct net_device *net, struct ethtool_drvinfo *info) { struct usbnet *dev = netdev_priv(net); strscpy(info->driver, dev->driver_name, sizeof(info->driver)); strscpy(info->fw_version, dev->driver_info->description, sizeof(info->fw_version)); usb_make_path(dev->udev, info->bus_info, sizeof(info->bus_info)); } EXPORT_SYMBOL_GPL(usbnet_get_drvinfo); u32 usbnet_get_msglevel(struct net_device *net) { struct usbnet *dev = netdev_priv(net); return dev->msg_enable; } EXPORT_SYMBOL_GPL(usbnet_get_msglevel); void usbnet_set_msglevel(struct net_device *net, u32 level) { struct usbnet *dev = netdev_priv(net); dev->msg_enable = level; } EXPORT_SYMBOL_GPL(usbnet_set_msglevel); /* drivers may override default ethtool_ops in their bind() routine */ static const struct ethtool_ops usbnet_ethtool_ops = { .get_link = usbnet_get_link, .nway_reset = usbnet_nway_reset, .get_drvinfo = usbnet_get_drvinfo, .get_msglevel = usbnet_get_msglevel, .set_msglevel = usbnet_set_msglevel, .get_ts_info = ethtool_op_get_ts_info, .get_link_ksettings = usbnet_get_link_ksettings_mii, .set_link_ksettings = usbnet_set_link_ksettings_mii, }; /*-------------------------------------------------------------------------*/ static void __handle_link_change(struct usbnet *dev) { if (!test_bit(EVENT_DEV_OPEN, &dev->flags)) return; if (test_and_clear_bit(EVENT_LINK_CARRIER_ON, &dev->flags)) netif_carrier_on(dev->net); if (!netif_carrier_ok(dev->net)) { /* kill URBs for reading packets to save bus bandwidth */ unlink_urbs(dev, &dev->rxq); /* * tx_timeout will unlink URBs for sending packets and * tx queue is stopped by netcore after link 
becomes off */ } else { /* submitting URBs for reading packets */ queue_work(system_bh_wq, &dev->bh_work); } /* hard_mtu or rx_urb_size may change during link change */ usbnet_update_max_qlen(dev); clear_bit(EVENT_LINK_CHANGE, &dev->flags); } void usbnet_set_rx_mode(struct net_device *net) { struct usbnet *dev = netdev_priv(net); usbnet_defer_kevent(dev, EVENT_SET_RX_MODE); } EXPORT_SYMBOL_GPL(usbnet_set_rx_mode); static void __handle_set_rx_mode(struct usbnet *dev) { if (dev->driver_info->set_rx_mode) (dev->driver_info->set_rx_mode)(dev); clear_bit(EVENT_SET_RX_MODE, &dev->flags); } /* work that cannot be done in interrupt context uses keventd. * * NOTE: with 2.5 we could do more of this using completion callbacks, * especially now that control transfers can be queued. */ static void usbnet_deferred_kevent(struct work_struct *work) { struct usbnet *dev = container_of(work, struct usbnet, kevent); int status; /* usb_clear_halt() needs a thread context */ if (test_bit(EVENT_TX_HALT, &dev->flags)) { unlink_urbs(dev, &dev->txq); status = usb_autopm_get_interface(dev->intf); if (status < 0) goto fail_pipe; status = usb_clear_halt(dev->udev, dev->out); usb_autopm_put_interface(dev->intf); if (status < 0 && status != -EPIPE && status != -ESHUTDOWN) { if (netif_msg_tx_err(dev)) fail_pipe: netdev_err(dev->net, "can't clear tx halt, status %d\n", status); } else { clear_bit(EVENT_TX_HALT, &dev->flags); if (status != -ESHUTDOWN) netif_wake_queue(dev->net); } } if (test_bit(EVENT_RX_HALT, &dev->flags)) { unlink_urbs(dev, &dev->rxq); status = usb_autopm_get_interface(dev->intf); if (status < 0) goto fail_halt; status = usb_clear_halt(dev->udev, dev->in); usb_autopm_put_interface(dev->intf); if (status < 0 && status != -EPIPE && status != -ESHUTDOWN) { if (netif_msg_rx_err(dev)) fail_halt: netdev_err(dev->net, "can't clear rx halt, status %d\n", status); } else { clear_bit(EVENT_RX_HALT, &dev->flags); if (!usbnet_going_away(dev)) queue_work(system_bh_wq, &dev->bh_work); } } /* work could resubmit itself forever if memory is tight */ if (test_bit(EVENT_RX_MEMORY, &dev->flags)) { struct urb *urb = NULL; int resched = 1; if (netif_running(dev->net)) urb = usb_alloc_urb(0, GFP_KERNEL); else clear_bit(EVENT_RX_MEMORY, &dev->flags); if (urb != NULL) { clear_bit(EVENT_RX_MEMORY, &dev->flags); status = usb_autopm_get_interface(dev->intf); if (status < 0) { usb_free_urb(urb); goto fail_lowmem; } if (rx_submit(dev, urb, GFP_KERNEL) == -ENOLINK) resched = 0; usb_autopm_put_interface(dev->intf); fail_lowmem: if (resched) if (!usbnet_going_away(dev)) queue_work(system_bh_wq, &dev->bh_work); } } if (test_bit (EVENT_LINK_RESET, &dev->flags)) { const struct driver_info *info = dev->driver_info; int retval = 0; clear_bit(EVENT_LINK_RESET, &dev->flags); status = usb_autopm_get_interface(dev->intf); if (status < 0) goto skip_reset; if(info->link_reset && (retval = info->link_reset(dev)) < 0) { usb_autopm_put_interface(dev->intf); skip_reset: netdev_info(dev->net, "link reset failed (%d) usbnet usb-%s-%s, %s\n", retval, dev->udev->bus->bus_name, dev->udev->devpath, info->description); } else { usb_autopm_put_interface(dev->intf); } /* handle link change from link resetting */ __handle_link_change(dev); } if (test_bit(EVENT_LINK_CHANGE, &dev->flags)) __handle_link_change(dev); if (test_bit(EVENT_SET_RX_MODE, &dev->flags)) __handle_set_rx_mode(dev); if (dev->flags) netdev_dbg(dev->net, "kevent done, flags = 0x%lx\n", dev->flags); } /*-------------------------------------------------------------------------*/ static void 
tx_complete(struct urb *urb) { struct sk_buff *skb = (struct sk_buff *) urb->context; struct skb_data *entry = (struct skb_data *) skb->cb; struct usbnet *dev = entry->dev; if (urb->status == 0) { struct pcpu_sw_netstats *stats64 = this_cpu_ptr(dev->net->tstats); unsigned long flags; flags = u64_stats_update_begin_irqsave(&stats64->syncp); u64_stats_add(&stats64->tx_packets, entry->packets); u64_stats_add(&stats64->tx_bytes, entry->length); u64_stats_update_end_irqrestore(&stats64->syncp, flags); } else { dev->net->stats.tx_errors++; switch (urb->status) { case -EPIPE: usbnet_defer_kevent(dev, EVENT_TX_HALT); break; /* software-driven interface shutdown */ case -ECONNRESET: // async unlink case -ESHUTDOWN: // hardware gone break; /* like rx, tx gets controller i/o faults during hub_wq * delays and so it uses the same throttling mechanism. */ case -EPROTO: case -ETIME: case -EILSEQ: usb_mark_last_busy(dev->udev); if (!timer_pending(&dev->delay)) { mod_timer(&dev->delay, jiffies + THROTTLE_JIFFIES); netif_dbg(dev, link, dev->net, "tx throttle %d\n", urb->status); } netif_stop_queue(dev->net); break; default: netif_dbg(dev, tx_err, dev->net, "tx err %d\n", entry->urb->status); break; } } usb_autopm_put_interface_async(dev->intf); (void) defer_bh(dev, skb, &dev->txq, tx_done); } /*-------------------------------------------------------------------------*/ void usbnet_tx_timeout(struct net_device *net, unsigned int txqueue) { struct usbnet *dev = netdev_priv(net); unlink_urbs(dev, &dev->txq); queue_work(system_bh_wq, &dev->bh_work); /* this needs to be handled individually because the generic layer * doesn't know what is sufficient and could not restore private * information if a remedy of an unconditional reset were used. */ if (dev->driver_info->recover) (dev->driver_info->recover)(dev); } EXPORT_SYMBOL_GPL(usbnet_tx_timeout); /*-------------------------------------------------------------------------*/ static int build_dma_sg(const struct sk_buff *skb, struct urb *urb) { unsigned num_sgs, total_len = 0; int i, s = 0; num_sgs = skb_shinfo(skb)->nr_frags + 1; if (num_sgs == 1) return 0; /* reserve one for zero packet */ urb->sg = kmalloc_objs(struct scatterlist, num_sgs + 1, GFP_ATOMIC); if (!urb->sg) return -ENOMEM; urb->num_sgs = num_sgs; sg_init_table(urb->sg, urb->num_sgs + 1); sg_set_buf(&urb->sg[s++], skb->data, skb_headlen(skb)); total_len += skb_headlen(skb); for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *f = &skb_shinfo(skb)->frags[i]; total_len += skb_frag_size(f); sg_set_page(&urb->sg[i + s], skb_frag_page(f), skb_frag_size(f), skb_frag_off(f)); } urb->transfer_buffer_length = total_len; return 1; } netdev_tx_t usbnet_start_xmit(struct sk_buff *skb, struct net_device *net) { struct usbnet *dev = netdev_priv(net); unsigned int length; struct urb *urb = NULL; struct skb_data *entry; const struct driver_info *info = dev->driver_info; unsigned long flags; int retval; if (skb) skb_tx_timestamp(skb); // some devices want funky USB-level framing, for // win32 driver (usually) and/or hardware quirks if (info->tx_fixup) { skb = info->tx_fixup(dev, skb, GFP_ATOMIC); if (!skb) { /* packet collected; minidriver waiting for more */ if (info->flags & FLAG_MULTI_PACKET) goto not_drop; netif_dbg(dev, tx_err, dev->net, "can't tx_fixup skb\n"); goto drop; } } urb = usb_alloc_urb(0, GFP_ATOMIC); if (!urb) { netif_dbg(dev, tx_err, dev->net, "no urb\n"); goto drop; } entry = (struct skb_data *) skb->cb; entry->urb = urb; entry->dev = dev; usb_fill_bulk_urb(urb, dev->udev, dev->out, 
skb->data, skb->len, tx_complete, skb); if (dev->can_dma_sg) { if (build_dma_sg(skb, urb) < 0) goto drop; } length = urb->transfer_buffer_length; /* don't assume the hardware handles USB_ZERO_PACKET * NOTE: strictly conforming cdc-ether devices should expect * the ZLP here, but ignore the one-byte packet. * NOTE2: CDC NCM specification is different from CDC ECM when * handling ZLP/short packets, so cdc_ncm driver will make short * packet itself if needed. */ if (length % dev->maxpacket == 0) { if (!(info->flags & FLAG_SEND_ZLP)) { if (!(info->flags & FLAG_MULTI_PACKET)) { length++; if (skb_tailroom(skb) && !urb->num_sgs) { skb->data[skb->len] = 0; __skb_put(skb, 1); } else if (urb->num_sgs) sg_set_buf(&urb->sg[urb->num_sgs++], dev->padding_pkt, 1); } } else urb->transfer_flags |= URB_ZERO_PACKET; } urb->transfer_buffer_length = length; if (info->flags & FLAG_MULTI_PACKET) { /* Driver has set number of packets and a length delta. * Calculate the complete length and ensure that it's * positive. */ entry->length += length; if (WARN_ON_ONCE(entry->length <= 0)) entry->length = length; } else { usbnet_set_skb_tx_stats(skb, 1, length); } spin_lock_irqsave(&dev->txq.lock, flags); retval = usb_autopm_get_interface_async(dev->intf); if (retval < 0) { spin_unlock_irqrestore(&dev->txq.lock, flags); goto drop; } if (netif_queue_stopped(net)) { usb_autopm_put_interface_async(dev->intf); spin_unlock_irqrestore(&dev->txq.lock, flags); goto drop; } #ifdef CONFIG_PM /* if this triggers the device is still a sleep */ if (test_bit(EVENT_DEV_ASLEEP, &dev->flags)) { /* transmission will be done in resume */ usb_anchor_urb(urb, &dev->deferred); /* no use to process more packets */ netif_stop_queue(net); usb_put_urb(urb); spin_unlock_irqrestore(&dev->txq.lock, flags); netdev_dbg(dev->net, "Delaying transmission for resumption\n"); goto deferred; } #endif switch ((retval = usb_submit_urb (urb, GFP_ATOMIC))) { case -EPIPE: netif_stop_queue(net); usbnet_defer_kevent(dev, EVENT_TX_HALT); usb_autopm_put_interface_async(dev->intf); break; default: usb_autopm_put_interface_async(dev->intf); netif_dbg(dev, tx_err, dev->net, "tx: submit urb err %d\n", retval); break; case 0: netif_trans_update(net); __usbnet_queue_skb(&dev->txq, skb, tx_start); netdev_sent_queue(net, skb->len); if (dev->txq.qlen >= TX_QLEN (dev)) netif_stop_queue (net); } spin_unlock_irqrestore(&dev->txq.lock, flags); if (retval) { netif_dbg(dev, tx_err, dev->net, "drop, code %d\n", retval); drop: dev->net->stats.tx_dropped++; not_drop: if (skb) dev_kfree_skb_any(skb); if (urb) { kfree(urb->sg); usb_free_urb(urb); } } else netif_dbg(dev, tx_queued, dev->net, "> tx, len %u, type 0x%x\n", length, skb->protocol); #ifdef CONFIG_PM deferred: #endif return NETDEV_TX_OK; } EXPORT_SYMBOL_GPL(usbnet_start_xmit); static int rx_alloc_submit(struct usbnet *dev, gfp_t flags) { struct urb *urb; int i; int ret = 0; /* don't refill the queue all at once */ for (i = 0; i < 10 && dev->rxq.qlen < RX_QLEN(dev); i++) { urb = usb_alloc_urb(0, flags); if (urb != NULL) { ret = rx_submit(dev, urb, flags); if (ret) goto err; } else { ret = -ENOMEM; goto err; } } err: return ret; } static inline void usb_free_skb(struct sk_buff *skb) { struct skb_data *entry = (struct skb_data *)skb->cb; usb_free_urb(entry->urb); dev_kfree_skb(skb); } /*-------------------------------------------------------------------------*/ // work (work deferred from completions, in_irq) or timer static void usbnet_bh(struct timer_list *t) { struct usbnet *dev = timer_container_of(dev, t, delay); unsigned int 
bytes_compl = 0, pkts_compl = 0; struct sk_buff *skb; struct skb_data *entry; while ((skb = skb_dequeue (&dev->done))) { entry = (struct skb_data *) skb->cb; switch (entry->state) { case rx_done: if (rx_process(dev, skb)) usb_free_skb(skb); continue; case tx_done: bytes_compl += skb->len; pkts_compl++; kfree(entry->urb->sg); fallthrough; case rx_cleanup: usb_free_skb(skb); continue; default: netdev_dbg(dev->net, "bogus skb state %d\n", entry->state); } } spin_lock_bh(&dev->bql_spinlock); netdev_completed_queue(dev->net, pkts_compl, bytes_compl); spin_unlock_bh(&dev->bql_spinlock); /* restart RX again after disabling due to high error rate */ clear_bit(EVENT_RX_KILL, &dev->flags); /* waiting for all pending urbs to complete? * only then can we forgo submitting anew */ if (waitqueue_active(&dev->wait)) { if (dev->txq.qlen + dev->rxq.qlen + dev->done.qlen == 0) wake_up_all(&dev->wait); // or are we maybe short a few urbs? } else if (netif_running (dev->net) && netif_device_present (dev->net) && netif_carrier_ok(dev->net) && !usbnet_going_away(dev) && !timer_pending(&dev->delay) && !test_bit(EVENT_RX_PAUSED, &dev->flags) && !test_bit(EVENT_RX_HALT, &dev->flags)) { int temp = dev->rxq.qlen; if (temp < RX_QLEN(dev)) { if (rx_alloc_submit(dev, GFP_ATOMIC) == -ENOLINK) return; if (temp != dev->rxq.qlen) netif_dbg(dev, link, dev->net, "rxqlen %d --> %d\n", temp, dev->rxq.qlen); if (dev->rxq.qlen < RX_QLEN(dev)) queue_work(system_bh_wq, &dev->bh_work); } if (dev->txq.qlen < TX_QLEN (dev)) netif_wake_queue(dev->net); } } static void usbnet_bh_work(struct work_struct *work) { struct usbnet *dev = from_work(dev, work, bh_work); usbnet_bh(&dev->delay); } /*------------------------------------------------------------------------- * * USB Device Driver support * *-------------------------------------------------------------------------*/ // precondition: never called in_interrupt void usbnet_disconnect(struct usb_interface *intf) { struct usbnet *dev; struct usb_device *xdev; struct net_device *net; struct urb *urb; dev = usb_get_intfdata(intf); usb_set_intfdata(intf, NULL); if (!dev) return; usbnet_mark_going_away(dev); xdev = interface_to_usbdev(intf); netif_info(dev, probe, dev->net, "unregister '%s' usb-%s-%s, %s\n", intf->dev.driver->name, xdev->bus->bus_name, xdev->devpath, dev->driver_info->description); net = dev->net; unregister_netdev(net); cancel_work_sync(&dev->kevent); while ((urb = usb_get_from_anchor(&dev->deferred))) { dev_kfree_skb(urb->context); kfree(urb->sg); usb_free_urb(urb); } if (dev->driver_info->unbind) dev->driver_info->unbind(dev, intf); usb_kill_urb(dev->interrupt); usb_free_urb(dev->interrupt); kfree(dev->padding_pkt); free_netdev(net); } EXPORT_SYMBOL_GPL(usbnet_disconnect); static const struct net_device_ops usbnet_netdev_ops = { .ndo_open = usbnet_open, .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_set_rx_mode = usbnet_set_rx_mode, .ndo_change_mtu = usbnet_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; /*-------------------------------------------------------------------------*/ // precondition: never called in_interrupt static const struct device_type wlan_type = { .name = "wlan", }; static const struct device_type wwan_type = { .name = "wwan", }; int usbnet_probe(struct usb_interface *udev, const struct usb_device_id *prod) { struct usbnet *dev; struct net_device *net; struct usb_host_interface *interface; const struct driver_info *info; struct usb_device *xdev; int 
status; const char *name; struct usb_driver *driver = to_usb_driver(udev->dev.driver); /* usbnet already took usb runtime pm, so have to enable the feature * for usb interface, otherwise usb_autopm_get_interface may return * failure if RUNTIME_PM is enabled. */ if (!driver->supports_autosuspend) { driver->supports_autosuspend = 1; pm_runtime_enable(&udev->dev); } name = udev->dev.driver->name; info = (const struct driver_info *) prod->driver_info; if (!info) { dev_dbg (&udev->dev, "blacklisted by %s\n", name); return -ENODEV; } xdev = interface_to_usbdev(udev); interface = udev->cur_altsetting; status = -ENOMEM; // set up our own records net = alloc_etherdev(sizeof(*dev)); if (!net) goto out; /* netdev_printk() needs this so do it as early as possible */ SET_NETDEV_DEV(net, &udev->dev); dev = netdev_priv(net); dev->udev = xdev; dev->intf = udev; dev->driver_info = info; dev->driver_name = name; dev->rx_speed = SPEED_UNSET; dev->tx_speed = SPEED_UNSET; dev->msg_enable = netif_msg_init (msg_level, NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK); init_waitqueue_head(&dev->wait); skb_queue_head_init (&dev->rxq); skb_queue_head_init (&dev->txq); skb_queue_head_init (&dev->done); skb_queue_head_init(&dev->rxq_pause); spin_lock_init(&dev->bql_spinlock); INIT_WORK(&dev->bh_work, usbnet_bh_work); INIT_WORK(&dev->kevent, usbnet_deferred_kevent); init_usb_anchor(&dev->deferred); timer_setup(&dev->delay, usbnet_bh, 0); mutex_init(&dev->phy_mutex); mutex_init(&dev->interrupt_mutex); dev->interrupt_count = 0; dev->net = net; strscpy(net->name, "usb%d", sizeof(net->name)); /* rx and tx sides can use different message sizes; * bind() should set rx_urb_size in that case. */ dev->hard_mtu = net->mtu + net->hard_header_len; net->min_mtu = 0; net->max_mtu = ETH_MAX_MTU; net->netdev_ops = &usbnet_netdev_ops; net->watchdog_timeo = TX_TIMEOUT_JIFFIES; net->ethtool_ops = &usbnet_ethtool_ops; net->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; // allow device-specific bind/init procedures // NOTE net->name still not usable ... 
if (info->bind) { status = info->bind(dev, udev); if (status < 0) goto out1; /* heuristic: rename to "eth%d" if we are not sure this link * is two-host (these links keep "usb%d") */ if ((dev->driver_info->flags & FLAG_ETHER) != 0 && !usbnet_needs_usb_name_format(dev, net)) strscpy(net->name, "eth%d", sizeof(net->name)); /* WLAN devices should always be named "wlan%d" */ if ((dev->driver_info->flags & FLAG_WLAN) != 0) strscpy(net->name, "wlan%d", sizeof(net->name)); /* WWAN devices should always be named "wwan%d" */ if ((dev->driver_info->flags & FLAG_WWAN) != 0) strscpy(net->name, "wwan%d", sizeof(net->name)); /* devices that cannot do ARP */ if ((dev->driver_info->flags & FLAG_NOARP) != 0) net->flags |= IFF_NOARP; if ((dev->driver_info->flags & FLAG_NOMAXMTU) == 0 && net->max_mtu > (dev->hard_mtu - net->hard_header_len)) net->max_mtu = dev->hard_mtu - net->hard_header_len; if (net->mtu > (dev->hard_mtu - net->hard_header_len)) net->mtu = dev->hard_mtu - net->hard_header_len; } else if (!info->in || !info->out) status = usbnet_get_endpoints(dev, udev); else { u8 ep_addrs[3] = { info->in + USB_DIR_IN, info->out + USB_DIR_OUT, 0 }; dev->in = usb_rcvbulkpipe(xdev, info->in); dev->out = usb_sndbulkpipe(xdev, info->out); if (!(info->flags & FLAG_NO_SETINT)) status = usb_set_interface(xdev, interface->desc.bInterfaceNumber, interface->desc.bAlternateSetting); else status = 0; if (status == 0 && !usb_check_bulk_endpoints(udev, ep_addrs)) status = -EINVAL; } if (status >= 0 && dev->status) status = init_status(dev, udev); if (status < 0) goto out3; if (!dev->rx_urb_size) dev->rx_urb_size = dev->hard_mtu; dev->maxpacket = usb_maxpacket(dev->udev, dev->out); if (dev->maxpacket == 0) { /* that is a broken device */ status = -ENODEV; goto out4; } /* this flags the device for user space */ if (!is_valid_ether_addr(net->dev_addr)) eth_hw_addr_random(net); if ((dev->driver_info->flags & FLAG_WLAN) != 0) SET_NETDEV_DEVTYPE(net, &wlan_type); if ((dev->driver_info->flags & FLAG_WWAN) != 0) SET_NETDEV_DEVTYPE(net, &wwan_type); /* initialize max rx_qlen and tx_qlen */ usbnet_update_max_qlen(dev); if (dev->can_dma_sg && !(info->flags & FLAG_SEND_ZLP) && !(info->flags & FLAG_MULTI_PACKET)) { dev->padding_pkt = kzalloc(1, GFP_KERNEL); if (!dev->padding_pkt) { status = -ENOMEM; goto out4; } } status = register_netdev(net); if (status) goto out5; netif_info(dev, probe, dev->net, "register '%s' at usb-%s-%s, %s, %pM\n", udev->dev.driver->name, xdev->bus->bus_name, xdev->devpath, dev->driver_info->description, net->dev_addr); // ok, it's ready to go. usb_set_intfdata(udev, dev); netif_device_attach(net); if (dev->driver_info->flags & FLAG_LINK_INTR) usbnet_link_change(dev, 0, 0); return 0; out5: kfree(dev->padding_pkt); out4: usb_free_urb(dev->interrupt); out3: if (info->unbind) info->unbind(dev, udev); out1: /* subdrivers must undo all they did in bind() if they * fail it, but we may fail later and a deferred kevent * may trigger an error resubmitting itself and, worse, * schedule a timer. So we kill it all just in case. 
*/ usbnet_mark_going_away(dev); cancel_work_sync(&dev->kevent); timer_delete_sync(&dev->delay); free_netdev(net); out: return status; } EXPORT_SYMBOL_GPL(usbnet_probe); /*-------------------------------------------------------------------------*/ /* * suspend the whole driver as soon as the first interface is suspended * resume only when the last interface is resumed */ int usbnet_suspend(struct usb_interface *intf, pm_message_t message) { struct usbnet *dev = usb_get_intfdata(intf); if (!dev->suspend_count++) { spin_lock_irq(&dev->txq.lock); /* don't autosuspend while transmitting */ if (dev->txq.qlen && PMSG_IS_AUTO(message)) { dev->suspend_count--; spin_unlock_irq(&dev->txq.lock); return -EBUSY; } else { set_bit(EVENT_DEV_ASLEEP, &dev->flags); spin_unlock_irq(&dev->txq.lock); } /* * accelerate emptying of the rx and tx queues, to avoid * having everything error out. */ netif_device_detach(dev->net); usbnet_terminate_urbs(dev); __usbnet_status_stop_force(dev); /* * reattach so runtime management can use and * wake the device */ netif_device_attach(dev->net); } return 0; } EXPORT_SYMBOL_GPL(usbnet_suspend); int usbnet_resume(struct usb_interface *intf) { struct usbnet *dev = usb_get_intfdata(intf); struct sk_buff *skb; struct urb *res; int retval; if (!--dev->suspend_count) { /* resume interrupt URB if it was previously submitted */ __usbnet_status_start_force(dev, GFP_NOIO); spin_lock_irq(&dev->txq.lock); while ((res = usb_get_from_anchor(&dev->deferred))) { skb = (struct sk_buff *)res->context; retval = usb_submit_urb(res, GFP_ATOMIC); if (retval < 0) { dev_kfree_skb_any(skb); kfree(res->sg); usb_free_urb(res); usb_autopm_put_interface_async(dev->intf); } else { netif_trans_update(dev->net); __skb_queue_tail(&dev->txq, skb); netdev_sent_queue(dev->net, skb->len); } } smp_mb(); clear_bit(EVENT_DEV_ASLEEP, &dev->flags); spin_unlock_irq(&dev->txq.lock); if (test_bit(EVENT_DEV_OPEN, &dev->flags)) { /* handle remote wakeup ASAP * we cannot race against stop */ if (netif_device_present(dev->net) && !timer_pending(&dev->delay) && !test_bit(EVENT_RX_HALT, &dev->flags)) rx_alloc_submit(dev, GFP_NOIO); if (!(dev->txq.qlen >= TX_QLEN(dev))) netif_tx_wake_all_queues(dev->net); queue_work(system_bh_wq, &dev->bh_work); } } if (test_and_clear_bit(EVENT_DEVICE_REPORT_IDLE, &dev->flags)) usb_autopm_get_interface_no_resume(intf); return 0; } EXPORT_SYMBOL_GPL(usbnet_resume); /* * Either a subdriver implements manage_power, in which case it is assumed to * always be ready to be suspended, or it reports its readiness to be suspended * explicitly */ void usbnet_device_suggests_idle(struct usbnet *dev) { if (!test_and_set_bit(EVENT_DEVICE_REPORT_IDLE, &dev->flags)) { dev->intf->needs_remote_wakeup = 1; usb_autopm_put_interface_async(dev->intf); } } EXPORT_SYMBOL(usbnet_device_suggests_idle); /* * For devices that can do without special commands */ int usbnet_manage_power(struct usbnet *dev, int on) { dev->intf->needs_remote_wakeup = on; return 0; } EXPORT_SYMBOL(usbnet_manage_power); void usbnet_link_change(struct usbnet *dev, bool link, bool need_reset) { /* update link after link is reset */ if (link && !need_reset) { set_bit(EVENT_LINK_CARRIER_ON, &dev->flags); } else { clear_bit(EVENT_LINK_CARRIER_ON, &dev->flags); netif_carrier_off(dev->net); } if (need_reset && link) usbnet_defer_kevent(dev, EVENT_LINK_RESET); else usbnet_defer_kevent(dev, EVENT_LINK_CHANGE); } EXPORT_SYMBOL(usbnet_link_change); /*-------------------------------------------------------------------------*/ static int __usbnet_read_cmd(struct
usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, void *data, u16 size) { void *buf = NULL; int err = -ENOMEM; netdev_dbg(dev->net, "usbnet_read_cmd cmd=0x%02x reqtype=%02x" " value=0x%04x index=0x%04x size=%d\n", cmd, reqtype, value, index, size); if (size) { buf = kmalloc(size, GFP_NOIO); if (!buf) goto out; } err = usb_control_msg(dev->udev, usb_rcvctrlpipe(dev->udev, 0), cmd, reqtype, value, index, buf, size, USB_CTRL_GET_TIMEOUT); if (err > 0 && err <= size) { if (data) memcpy(data, buf, err); else netdev_dbg(dev->net, "Huh? Data requested but thrown away.\n"); } kfree(buf); out: return err; } static int __usbnet_write_cmd(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, const void *data, u16 size) { void *buf = NULL; int err = -ENOMEM; netdev_dbg(dev->net, "usbnet_write_cmd cmd=0x%02x reqtype=%02x" " value=0x%04x index=0x%04x size=%d\n", cmd, reqtype, value, index, size); if (data) { buf = kmemdup(data, size, GFP_NOIO); if (!buf) goto out; } else { if (size) { WARN_ON_ONCE(1); err = -EINVAL; goto out; } } err = usb_control_msg(dev->udev, usb_sndctrlpipe(dev->udev, 0), cmd, reqtype, value, index, buf, size, USB_CTRL_SET_TIMEOUT); kfree(buf); out: return err; } /* * The function can't be called inside suspend/resume callback, * otherwise deadlock will be caused. */ int usbnet_read_cmd(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, void *data, u16 size) { int ret; if (usb_autopm_get_interface(dev->intf) < 0) return -ENODEV; ret = __usbnet_read_cmd(dev, cmd, reqtype, value, index, data, size); usb_autopm_put_interface(dev->intf); return ret; } EXPORT_SYMBOL_GPL(usbnet_read_cmd); /* * The function can't be called inside suspend/resume callback, * otherwise deadlock will be caused. */ int usbnet_write_cmd(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, const void *data, u16 size) { int ret; if (usb_autopm_get_interface(dev->intf) < 0) return -ENODEV; ret = __usbnet_write_cmd(dev, cmd, reqtype, value, index, data, size); usb_autopm_put_interface(dev->intf); return ret; } EXPORT_SYMBOL_GPL(usbnet_write_cmd); /* * The function can be called inside suspend/resume callback safely * and should only be called by suspend/resume callback generally. */ int usbnet_read_cmd_nopm(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, void *data, u16 size) { return __usbnet_read_cmd(dev, cmd, reqtype, value, index, data, size); } EXPORT_SYMBOL_GPL(usbnet_read_cmd_nopm); /* * The function can be called inside suspend/resume callback safely * and should only be called by suspend/resume callback generally. */ int usbnet_write_cmd_nopm(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, const void *data, u16 size) { return __usbnet_write_cmd(dev, cmd, reqtype, value, index, data, size); } EXPORT_SYMBOL_GPL(usbnet_write_cmd_nopm); static void usbnet_async_cmd_cb(struct urb *urb) { struct usb_ctrlrequest *req = (struct usb_ctrlrequest *)urb->context; int status = urb->status; if (status < 0) dev_dbg(&urb->dev->dev, "%s failed with %d", __func__, status); kfree(req); usb_free_urb(urb); } /* * The caller must make sure that device can't be put into suspend * state until the control URB completes. 
*/ int usbnet_write_cmd_async(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, const void *data, u16 size) { struct usb_ctrlrequest *req; struct urb *urb; int err = -ENOMEM; void *buf = NULL; netdev_dbg(dev->net, "usbnet_write_cmd cmd=0x%02x reqtype=%02x" " value=0x%04x index=0x%04x size=%d\n", cmd, reqtype, value, index, size); urb = usb_alloc_urb(0, GFP_ATOMIC); if (!urb) goto fail; if (data) { buf = kmemdup(data, size, GFP_ATOMIC); if (!buf) { netdev_err(dev->net, "Error allocating buffer" " in %s!\n", __func__); goto fail_free_urb; } } req = kmalloc_obj(struct usb_ctrlrequest, GFP_ATOMIC); if (!req) goto fail_free_buf; req->bRequestType = reqtype; req->bRequest = cmd; req->wValue = cpu_to_le16(value); req->wIndex = cpu_to_le16(index); req->wLength = cpu_to_le16(size); usb_fill_control_urb(urb, dev->udev, usb_sndctrlpipe(dev->udev, 0), (void *)req, buf, size, usbnet_async_cmd_cb, req); urb->transfer_flags |= URB_FREE_BUFFER; err = usb_submit_urb(urb, GFP_ATOMIC); if (err < 0) { netdev_err(dev->net, "Error submitting the control" " message: status=%d\n", err); goto fail_free_all; } return 0; fail_free_all: kfree(req); fail_free_buf: kfree(buf); /* * avoid a double free * needed because the flag can be set only * after filling the URB */ urb->transfer_flags = 0; fail_free_urb: usb_free_urb(urb); fail: return err; } EXPORT_SYMBOL_GPL(usbnet_write_cmd_async); /*-------------------------------------------------------------------------*/ static int __init usbnet_init(void) { /* Compiler should optimize this out. */ BUILD_BUG_ON( sizeof_field(struct sk_buff, cb) < sizeof(struct skb_data)); return 0; } module_init(usbnet_init); static void __exit usbnet_exit(void) { } module_exit(usbnet_exit); MODULE_AUTHOR("David Brownell"); MODULE_DESCRIPTION("USB network driver framework"); MODULE_LICENSE("GPL");
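/*
 * Illustrative sketch (not part of the driver above): how a usbnet minidriver
 * might use the control helpers exported above. Only usbnet_read_cmd() and the
 * standard USB request-type constants from <linux/usb/ch9.h> are taken from
 * the surrounding code; the vendor request number (0x01) and its "read station
 * address" meaning are hypothetical. Note that usbnet_read_cmd() and
 * usbnet_write_cmd() take the interface out of autosuspend and therefore must
 * not be called from suspend/resume paths; the *_nopm variants exist for that.
 */
#if 0	/* example only, never compiled */
static int example_read_mac(struct usbnet *dev, u8 mac[ETH_ALEN])
{
	/* hypothetical vendor request 0x01: read the 6-byte station address */
	int ret = usbnet_read_cmd(dev, 0x01,
				  USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
				  0, 0, mac, ETH_ALEN);

	/* usbnet_read_cmd() returns the number of bytes actually transferred */
	if (ret < 0)
		return ret;
	return (ret == ETH_ALEN) ? 0 : -EIO;
}
#endif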
// SPDX-License-Identifier: GPL-2.0 /* * mm/rmap.c - physical to virtual reverse mappings * * Copyright 2001, Rik van Riel <riel@conectiva.com.br> * * Simple, low overhead reverse mapping scheme. * Please try to keep this thing as modular as possible. * * Provides methods for unmapping each kind of mapped page: * the anon methods track anonymous pages, and * the file methods track pages belonging to an inode. * * Original design by Rik van Riel <riel@conectiva.com.br> 2001 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004 * Contributions by Hugh Dickins 2003, 2004 */ /* * Lock ordering in mm: * * inode->i_rwsem (while writing or truncating, not reading or faulting) * mm->mmap_lock * mapping->invalidate_lock (in filemap_fault) * folio_lock * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below) * vma_start_write * mapping->i_mmap_rwsem * anon_vma->rwsem * mm->page_table_lock or pte_lock * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in block_dirty_folio) * i_pages lock (widely used) * lruvec->lru_lock (in folio_lruvec_lock_irq) * inode->i_lock (in set_page_dirty's __mark_inode_dirty) * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) * i_pages lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, * within bdi.wb->list_lock in __sync_single_inode) * * anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon) * ->tasklist_lock * pte map lock * * hugetlbfs PageHuge() take locks in this order: * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) * vma_lock (hugetlb specific lock for pmd_sharing) * mapping->i_mmap_rwsem (also used for hugetlb pmd sharing) * folio_lock */ #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/leafops.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/rcupdate.h> #include <linux/export.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> #include <linux/migrate.h> #include <linux/hugetlb.h> #include <linux/huge_mm.h> #include <linux/backing-dev.h> #include <linux/page_idle.h> #include <linux/memremap.h> #include <linux/userfaultfd_k.h> #include <linux/mm_inline.h> #include <linux/oom.h> #include <asm/tlb.h> #define CREATE_TRACE_POINTS #include <trace/events/migrate.h> #include "internal.h" #include "swap.h" static struct kmem_cache *anon_vma_cachep; static struct kmem_cache *anon_vma_chain_cachep; static inline struct anon_vma *anon_vma_alloc(void) { struct anon_vma *anon_vma; anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); if (anon_vma) { atomic_set(&anon_vma->refcount, 1); anon_vma->num_children = 0; anon_vma->num_active_vmas = 0; anon_vma->parent = anon_vma; /* * Initialise the anon_vma root to point to itself.
If called * from fork, the root will be reset to the parents anon_vma. */ anon_vma->root = anon_vma; } return anon_vma; } static inline void anon_vma_free(struct anon_vma *anon_vma) { VM_BUG_ON(atomic_read(&anon_vma->refcount)); /* * Synchronize against folio_lock_anon_vma_read() such that * we can safely hold the lock without the anon_vma getting * freed. * * Relies on the full mb implied by the atomic_dec_and_test() from * put_anon_vma() against the acquire barrier implied by * down_read_trylock() from folio_lock_anon_vma_read(). This orders: * * folio_lock_anon_vma_read() VS put_anon_vma() * down_read_trylock() atomic_dec_and_test() * LOCK MB * atomic_read() rwsem_is_locked() * * LOCK should suffice since the actual taking of the lock must * happen _before_ what follows. */ might_sleep(); if (rwsem_is_locked(&anon_vma->root->rwsem)) { anon_vma_lock_write(anon_vma); anon_vma_unlock_write(anon_vma); } kmem_cache_free(anon_vma_cachep, anon_vma); } static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp) { return kmem_cache_alloc(anon_vma_chain_cachep, gfp); } static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) { kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); } static void anon_vma_chain_assign(struct vm_area_struct *vma, struct anon_vma_chain *avc, struct anon_vma *anon_vma) { avc->vma = vma; avc->anon_vma = anon_vma; list_add(&avc->same_vma, &vma->anon_vma_chain); } /** * __anon_vma_prepare - attach an anon_vma to a memory region * @vma: the memory region in question * * This makes sure the memory mapping described by 'vma' has * an 'anon_vma' attached to it, so that we can associate the * anonymous pages mapped into it with that anon_vma. * * The common case will be that we already have one, which * is handled inline by anon_vma_prepare(). But if * not we either need to find an adjacent mapping that we * can re-use the anon_vma from (very common when the only * reason for splitting a vma has been mprotect()), or we * allocate a new one. * * Anon-vma allocations are very subtle, because we may have * optimistically looked up an anon_vma in folio_lock_anon_vma_read() * and that may actually touch the rwsem even in the newly * allocated vma (it depends on RCU to make sure that the * anon_vma isn't actually destroyed). * * As a result, we need to do proper anon_vma locking even * for the new allocation. At the same time, we do not want * to do any locking for the common case of already having * an anon_vma. 
*/ int __anon_vma_prepare(struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; struct anon_vma *anon_vma, *allocated; struct anon_vma_chain *avc; mmap_assert_locked(mm); might_sleep(); avc = anon_vma_chain_alloc(GFP_KERNEL); if (!avc) goto out_enomem; anon_vma = find_mergeable_anon_vma(vma); allocated = NULL; if (!anon_vma) { anon_vma = anon_vma_alloc(); if (unlikely(!anon_vma)) goto out_enomem_free_avc; anon_vma->num_children++; /* self-parent link for new root */ allocated = anon_vma; } anon_vma_lock_write(anon_vma); /* page_table_lock to protect against threads */ spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { vma->anon_vma = anon_vma; anon_vma_chain_assign(vma, avc, anon_vma); anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); anon_vma->num_active_vmas++; allocated = NULL; avc = NULL; } spin_unlock(&mm->page_table_lock); anon_vma_unlock_write(anon_vma); if (unlikely(allocated)) put_anon_vma(allocated); if (unlikely(avc)) anon_vma_chain_free(avc); return 0; out_enomem_free_avc: anon_vma_chain_free(avc); out_enomem: return -ENOMEM; } static void check_anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src, enum vma_operation operation) { /* The write lock must be held. */ mmap_assert_write_locked(src->vm_mm); /* If not a fork then must be on same mm. */ VM_WARN_ON_ONCE(operation != VMA_OP_FORK && dst->vm_mm != src->vm_mm); /* If we have anything to do src->anon_vma must be provided. */ VM_WARN_ON_ONCE(!src->anon_vma && !list_empty(&src->anon_vma_chain)); VM_WARN_ON_ONCE(!src->anon_vma && dst->anon_vma); /* We are establishing a new anon_vma_chain. */ VM_WARN_ON_ONCE(!list_empty(&dst->anon_vma_chain)); /* * On fork, dst->anon_vma is set NULL (temporarily). Otherwise, anon_vma * must be the same across dst and src. */ VM_WARN_ON_ONCE(dst->anon_vma && dst->anon_vma != src->anon_vma); /* * Essentially equivalent to above - if not a no-op, we should expect * dst->anon_vma to be set for everything except a fork. */ VM_WARN_ON_ONCE(operation != VMA_OP_FORK && src->anon_vma && !dst->anon_vma); /* For the anon_vma to be compatible, it can only be singular. */ VM_WARN_ON_ONCE(operation == VMA_OP_MERGE_UNFAULTED && !list_is_singular(&src->anon_vma_chain)); #ifdef CONFIG_PER_VMA_LOCK /* Only merging an unfaulted VMA leaves the destination attached. */ VM_WARN_ON_ONCE(operation != VMA_OP_MERGE_UNFAULTED && vma_is_attached(dst)); #endif } static void maybe_reuse_anon_vma(struct vm_area_struct *dst, struct anon_vma *anon_vma) { /* If already populated, nothing to do.*/ if (dst->anon_vma) return; /* * We reuse an anon_vma if any linking VMAs were unmapped and it has * only a single child at most. */ if (anon_vma->num_active_vmas > 0) return; if (anon_vma->num_children > 1) return; dst->anon_vma = anon_vma; anon_vma->num_active_vmas++; } static void cleanup_partial_anon_vmas(struct vm_area_struct *vma); /** * anon_vma_clone - Establishes new anon_vma_chain objects in @dst linking to * all of the anon_vma objects contained within @src anon_vma_chain's. * @dst: The destination VMA with an empty anon_vma_chain. * @src: The source VMA we wish to duplicate. * @operation: The type of operation which resulted in the clone. * * This is the heart of the VMA side of the anon_vma implementation - we invoke * this function whenever we need to set up a new VMA's anon_vma state. * * This is invoked for: * * - VMA Merge, but only when @dst is unfaulted and @src is faulted - meaning we * clone @src into @dst. * - VMA split. * - VMA (m)remap. * - Fork of faulted VMA. 
* * In all cases other than fork this is simply a duplication. Fork additionally * adds a new active anon_vma. * * ONLY in the case of fork do we try to 'reuse' existing anon_vma's in an * anon_vma hierarchy, reusing anon_vma's which have no VMA associated with them * but do have a single child. This is to avoid waste of memory when repeatedly * forking. * * Returns: 0 on success, -ENOMEM on failure. */ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src, enum vma_operation operation) { struct anon_vma_chain *avc, *pavc; struct anon_vma *active_anon_vma = src->anon_vma; check_anon_vma_clone(dst, src, operation); if (!active_anon_vma) return 0; /* * Allocate AVCs. We don't need an anon_vma lock for this as we * are not updating the anon_vma rbtree nor are we changing * anon_vma statistics. * * Either src, dst have the same mm for which we hold an exclusive mmap * write lock, or we are forking and we hold it on src->vm_mm and dst is * not yet accessible to other threads so there's no possibliity of the * unlinked AVC's being observed yet. */ list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) { avc = anon_vma_chain_alloc(GFP_KERNEL); if (!avc) goto enomem_failure; anon_vma_chain_assign(dst, avc, pavc->anon_vma); } /* * Now link the anon_vma's back to the newly inserted AVCs. * Note that all anon_vma's share the same root. */ anon_vma_lock_write(src->anon_vma); list_for_each_entry_reverse(avc, &dst->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); if (operation == VMA_OP_FORK) maybe_reuse_anon_vma(dst, anon_vma); } if (operation != VMA_OP_FORK) dst->anon_vma->num_active_vmas++; anon_vma_unlock_write(active_anon_vma); return 0; enomem_failure: cleanup_partial_anon_vmas(dst); return -ENOMEM; } /* * Attach vma to its own anon_vma, as well as to the anon_vmas that * the corresponding VMA in the parent process is attached to. * Returns 0 on success, non-zero on failure. */ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) { struct anon_vma_chain *avc; struct anon_vma *anon_vma; int rc; /* Don't bother if the parent process has no anon_vma here. */ if (!pvma->anon_vma) return 0; /* Drop inherited anon_vma, we'll reuse existing or allocate new. */ vma->anon_vma = NULL; anon_vma = anon_vma_alloc(); if (!anon_vma) return -ENOMEM; avc = anon_vma_chain_alloc(GFP_KERNEL); if (!avc) { put_anon_vma(anon_vma); return -ENOMEM; } /* * First, attach the new VMA to the parent VMA's anon_vmas, * so rmap can find non-COWed pages in child processes. */ rc = anon_vma_clone(vma, pvma, VMA_OP_FORK); /* An error arose or an existing anon_vma was reused, all done then. */ if (rc || vma->anon_vma) { put_anon_vma(anon_vma); anon_vma_chain_free(avc); return rc; } /* * OK no reuse, so add our own anon_vma. * * Since it is not linked anywhere we can safely manipulate anon_vma * fields without a lock. */ anon_vma->num_active_vmas = 1; /* * The root anon_vma's rwsem is the lock actually used when we * lock any of the anon_vmas in this anon_vma tree. */ anon_vma->root = pvma->anon_vma->root; anon_vma->parent = pvma->anon_vma; /* * With refcounts, an anon_vma can stay around longer than the * process it belongs to. The root anon_vma needs to be pinned until * this anon_vma is freed, because the lock lives in the root. */ get_anon_vma(anon_vma->root); /* Mark this anon_vma as the one where our new (COWed) pages go. 
*/ vma->anon_vma = anon_vma; anon_vma_chain_assign(vma, avc, anon_vma); /* Now let rmap see it. */ anon_vma_lock_write(anon_vma); anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); anon_vma->parent->num_children++; anon_vma_unlock_write(anon_vma); return 0; } /* * In the unfortunate case of anon_vma_clone() failing to allocate memory we * have to clean things up. * * Since we allocate anon_vma_chain's before we insert them into the interval * trees, we simply have to free up the AVC's and remove the entries from the * VMA's anon_vma_chain. */ static void cleanup_partial_anon_vmas(struct vm_area_struct *vma) { struct anon_vma_chain *avc, *next; list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { list_del(&avc->same_vma); anon_vma_chain_free(avc); } /* * The anon_vma assigned to this VMA is no longer valid, as we were not * able to correctly clone AVC state. Avoid inconsistent anon_vma tree * state by resetting. */ vma->anon_vma = NULL; } /** * unlink_anon_vmas() - remove all links between a VMA and anon_vma's, freeing * anon_vma_chain objects. * @vma: The VMA whose links to anon_vma objects is to be severed. * * As part of the process anon_vma_chain's are freed, * anon_vma->num_children,num_active_vmas is updated as required and, if the * relevant anon_vma references no further VMAs, its reference count is * decremented. */ void unlink_anon_vmas(struct vm_area_struct *vma) { struct anon_vma_chain *avc, *next; struct anon_vma *active_anon_vma = vma->anon_vma; /* Always hold mmap lock, read-lock on unmap possibly. */ mmap_assert_locked(vma->vm_mm); /* Unfaulted is a no-op. */ if (!active_anon_vma) { VM_WARN_ON_ONCE(!list_empty(&vma->anon_vma_chain)); return; } anon_vma_lock_write(active_anon_vma); /* * Unlink each anon_vma chained to the VMA. This list is ordered * from newest to oldest, ensuring the root anon_vma gets freed last. */ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; anon_vma_interval_tree_remove(avc, &anon_vma->rb_root); /* * Leave empty anon_vmas on the list - we'll need * to free them outside the lock. */ if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) { anon_vma->parent->num_children--; continue; } list_del(&avc->same_vma); anon_vma_chain_free(avc); } active_anon_vma->num_active_vmas--; /* * vma would still be needed after unlink, and anon_vma will be prepared * when handle fault. */ vma->anon_vma = NULL; anon_vma_unlock_write(active_anon_vma); /* * Iterate the list once more, it now only contains empty and unlinked * anon_vmas, destroy them. Could not do before due to __put_anon_vma() * needing to write-acquire the anon_vma->root->rwsem. */ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; VM_WARN_ON(anon_vma->num_children); VM_WARN_ON(anon_vma->num_active_vmas); put_anon_vma(anon_vma); list_del(&avc->same_vma); anon_vma_chain_free(avc); } } static void anon_vma_ctor(void *data) { struct anon_vma *anon_vma = data; init_rwsem(&anon_vma->rwsem); atomic_set(&anon_vma->refcount, 0); anon_vma->rb_root = RB_ROOT_CACHED; } void __init anon_vma_init(void) { anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), 0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, anon_vma_ctor); anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC|SLAB_ACCOUNT); } /* * Getting a lock on a stable anon_vma from a page off the LRU is tricky! 
* * Since there is no serialization what so ever against folio_remove_rmap_*() * the best this function can do is return a refcount increased anon_vma * that might have been relevant to this page. * * The page might have been remapped to a different anon_vma or the anon_vma * returned may already be freed (and even reused). * * In case it was remapped to a different anon_vma, the new anon_vma will be a * child of the old anon_vma, and the anon_vma lifetime rules will therefore * ensure that any anon_vma obtained from the page will still be valid for as * long as we observe page_mapped() [ hence all those page_mapped() tests ]. * * All users of this function must be very careful when walking the anon_vma * chain and verify that the page in question is indeed mapped in it * [ something equivalent to page_mapped_in_vma() ]. * * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from * folio_remove_rmap_*() that the anon_vma pointer from page->mapping is valid * if there is a mapcount, we can dereference the anon_vma after observing * those. * * NOTE: the caller should hold folio lock when calling this. */ struct anon_vma *folio_get_anon_vma(const struct folio *folio) { struct anon_vma *anon_vma = NULL; unsigned long anon_mapping; VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); rcu_read_lock(); anon_mapping = (unsigned long)READ_ONCE(folio->mapping); if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON) goto out; if (!folio_mapped(folio)) goto out; anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON); if (!atomic_inc_not_zero(&anon_vma->refcount)) { anon_vma = NULL; goto out; } /* * If this folio is still mapped, then its anon_vma cannot have been * freed. But if it has been unmapped, we have no security against the * anon_vma structure being freed and reused (for another anon_vma: * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero() * above cannot corrupt). */ if (!folio_mapped(folio)) { rcu_read_unlock(); put_anon_vma(anon_vma); return NULL; } out: rcu_read_unlock(); return anon_vma; } /* * Similar to folio_get_anon_vma() except it locks the anon_vma. * * Its a little more complex as it tries to keep the fast path to a single * atomic op -- the trylock. If we fail the trylock, we fall back to getting a * reference like with folio_get_anon_vma() and then block on the mutex * on !rwc->try_lock case. */ struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio, struct rmap_walk_control *rwc) { struct anon_vma *anon_vma = NULL; struct anon_vma *root_anon_vma; unsigned long anon_mapping; VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); rcu_read_lock(); anon_mapping = (unsigned long)READ_ONCE(folio->mapping); if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON) goto out; if (!folio_mapped(folio)) goto out; anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON); root_anon_vma = READ_ONCE(anon_vma->root); if (down_read_trylock(&root_anon_vma->rwsem)) { /* * If the folio is still mapped, then this anon_vma is still * its anon_vma, and holding the mutex ensures that it will * not go away, see anon_vma_free(). 
*/ if (!folio_mapped(folio)) { up_read(&root_anon_vma->rwsem); anon_vma = NULL; } goto out; } if (rwc && rwc->try_lock) { anon_vma = NULL; rwc->contended = true; goto out; } /* trylock failed, we got to sleep */ if (!atomic_inc_not_zero(&anon_vma->refcount)) { anon_vma = NULL; goto out; } if (!folio_mapped(folio)) { rcu_read_unlock(); put_anon_vma(anon_vma); return NULL; } /* we pinned the anon_vma, its safe to sleep */ rcu_read_unlock(); anon_vma_lock_read(anon_vma); if (atomic_dec_and_test(&anon_vma->refcount)) { /* * Oops, we held the last refcount, release the lock * and bail -- can't simply use put_anon_vma() because * we'll deadlock on the anon_vma_lock_write() recursion. */ anon_vma_unlock_read(anon_vma); __put_anon_vma(anon_vma); anon_vma = NULL; } return anon_vma; out: rcu_read_unlock(); return anon_vma; } #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* * Flush TLB entries for recently unmapped pages from remote CPUs. It is * important if a PTE was dirty when it was unmapped that it's flushed * before any IO is initiated on the page to prevent lost writes. Similarly, * it must be flushed before freeing to prevent data leakage. */ void try_to_unmap_flush(void) { struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; if (!tlb_ubc->flush_required) return; arch_tlbbatch_flush(&tlb_ubc->arch); tlb_ubc->flush_required = false; tlb_ubc->writable = false; } /* Flush iff there are potentially writable TLB entries that can race with IO */ void try_to_unmap_flush_dirty(void) { struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; if (tlb_ubc->writable) try_to_unmap_flush(); } /* * Bits 0-14 of mm->tlb_flush_batched record pending generations. * Bits 16-30 of mm->tlb_flush_batched bit record flushed generations. */ #define TLB_FLUSH_BATCH_FLUSHED_SHIFT 16 #define TLB_FLUSH_BATCH_PENDING_MASK \ ((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1) #define TLB_FLUSH_BATCH_PENDING_LARGE \ (TLB_FLUSH_BATCH_PENDING_MASK / 2) static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval, unsigned long start, unsigned long end) { struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; int batch; bool writable = pte_dirty(pteval); if (!pte_accessible(mm, pteval)) return; arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, start, end); tlb_ubc->flush_required = true; /* * Ensure compiler does not re-order the setting of tlb_flush_batched * before the PTE is cleared. */ barrier(); batch = atomic_read(&mm->tlb_flush_batched); retry: if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) { /* * Prevent `pending' from catching up with `flushed' because of * overflow. Reset `pending' and `flushed' to be 1 and 0 if * `pending' becomes large. */ if (!atomic_try_cmpxchg(&mm->tlb_flush_batched, &batch, 1)) goto retry; } else { atomic_inc(&mm->tlb_flush_batched); } /* * If the PTE was dirty then it's best to assume it's writable. The * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush() * before the page is queued for IO. */ if (writable) tlb_ubc->writable = true; } /* * Returns true if the TLB flush should be deferred to the end of a batch of * unmap operations to reduce IPIs. */ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) { if (!(flags & TTU_BATCH_FLUSH)) return false; return arch_tlbbatch_should_defer(mm); } /* * Reclaim unmaps pages under the PTL but do not flush the TLB prior to * releasing the PTL if TLB flushes are batched. 
It's possible for a parallel * operation such as mprotect or munmap to race between reclaim unmapping * the page and flushing the page. If this race occurs, it potentially allows * access to data via a stale TLB entry. Tracking all mm's that have TLB * batching in flight would be expensive during reclaim, so instead track * whether TLB batching occurred in the past and, if so, do a flush here * if required. This will cost one additional flush per reclaim cycle, paid * by the first operation at risk such as mprotect and munmap. * * This must be called under the PTL so that an access to tlb_flush_batched * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise * via the PTL. */ void flush_tlb_batched_pending(struct mm_struct *mm) { int batch = atomic_read(&mm->tlb_flush_batched); int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK; int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT; if (pending != flushed) { flush_tlb_mm(mm); /* * If a new TLB flush became pending while we were flushing, leave * mm->tlb_flush_batched as is, to avoid losing that flush. */ atomic_cmpxchg(&mm->tlb_flush_batched, batch, pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT)); } } #else static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval, unsigned long start, unsigned long end) { } static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) { return false; } #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ /** * page_address_in_vma - The virtual address of a page in this VMA. * @folio: The folio containing the page. * @page: The page within the folio. * @vma: The VMA we need to know the address in. * * Calculates the user virtual address of this page in the specified VMA. * It is the caller's responsibility to check the page is actually * within the VMA. There may not currently be a PTE pointing at this * page, but if a page fault occurs at this address, this is the page * which will be accessed. * * Context: Caller should hold a reference to the folio. Caller should * hold a lock (eg the i_mmap_lock or the mmap_lock) which keeps the * VMA from being altered. * * Return: The virtual address corresponding to this page in the VMA. */ unsigned long page_address_in_vma(const struct folio *folio, const struct page *page, const struct vm_area_struct *vma) { if (folio_test_anon(folio)) { struct anon_vma *anon_vma = folio_anon_vma(folio); /* * Note: swapoff's unuse_vma() is more efficient with this * check, and needs it to match anon_vma when KSM is active. */ if (!vma->anon_vma || !anon_vma || vma->anon_vma->root != anon_vma->root) return -EFAULT; } else if (!vma->vm_file) { return -EFAULT; } else if (vma->vm_file->f_mapping != folio->mapping) { return -EFAULT; } /* KSM folios don't reach here because of the !anon_vma check */ return vma_address(vma, page_pgoff(folio, page), 1); } /* * Returns the actual pmd_t* where we expect 'address' to be mapped from, or * NULL if it doesn't exist. No guarantees / checks on what the pmd_t* * represents.
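 *
 * A minimal usage sketch (hypothetical caller; names are illustrative and
 * the returned pmd still has to be checked and locked by the caller):
 *
 *	pmd_t *pmd = mm_find_pmd(mm, addr);
 *
 *	if (pmd) {
 *		spinlock_t *ptl = pmd_lock(mm, pmd);
 *
 *		... inspect or modify *pmd under the lock ...
 *		spin_unlock(ptl);
 *	}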
*/ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd = NULL; pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) goto out; p4d = p4d_offset(pgd, address); if (!p4d_present(*p4d)) goto out; pud = pud_offset(p4d, address); if (!pud_present(*pud)) goto out; pmd = pmd_offset(pud, address); out: return pmd; } struct folio_referenced_arg { int mapcount; int referenced; vm_flags_t vm_flags; struct mem_cgroup *memcg; }; /* * arg: folio_referenced_arg will be passed */ static bool folio_referenced_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { struct folio_referenced_arg *pra = arg; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); int ptes = 0, referenced = 0; unsigned int nr; while (page_vma_mapped_walk(&pvmw)) { address = pvmw.address; nr = 1; if (vma->vm_flags & VM_LOCKED) { ptes++; pra->mapcount--; /* Only mlock fully mapped pages */ if (pvmw.pte && ptes != pvmw.nr_pages) continue; /* * All PTEs must be protected by the page table lock in * order to mlock the page. * * If the page table boundary has been crossed, the current ptl * only protects part of the ptes. */ if (pvmw.flags & PVMW_PGTABLE_CROSSED) continue; /* Restore the mlock which got missed */ mlock_vma_folio(folio, vma); page_vma_mapped_walk_done(&pvmw); pra->vm_flags |= VM_LOCKED; return false; /* To break the loop */ } /* * Skip the non-shared swapbacked folio mapped solely by * the exiting or OOM-reaped process. This avoids redundant * swap-out followed by an immediate unmap. */ if ((!atomic_read(&vma->vm_mm->mm_users) || check_stable_address_space(vma->vm_mm)) && folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_maybe_mapped_shared(folio)) { pra->referenced = -1; page_vma_mapped_walk_done(&pvmw); return false; } if (pvmw.pte && folio_test_large(folio)) { const unsigned long end_addr = pmd_addr_end(address, vma->vm_end); const unsigned int max_nr = (end_addr - address) >> PAGE_SHIFT; pte_t pteval = ptep_get(pvmw.pte); nr = folio_pte_batch(folio, pvmw.pte, pteval, max_nr); } /* * When the LRU is switching, we don't know where the surrounding folios * are: they could be on active/inactive lists or on MGLRU. So the * simplest approach is to disable this look-around optimization. */ if (lru_gen_enabled() && !lru_gen_switching() && pvmw.pte) { if (lru_gen_look_around(&pvmw, nr)) referenced++; } else if (pvmw.pte) { if (clear_flush_young_ptes_notify(vma, address, pvmw.pte, nr)) referenced++; } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (pmdp_clear_flush_young_notify(vma, address, pvmw.pmd)) referenced++; } else { /* unexpected pmd-mapped folio? */ WARN_ON_ONCE(1); } ptes += nr; pra->mapcount -= nr; /* * If we are sure that we batched the entire folio, * we can just optimize and stop right here. */ if (ptes == pvmw.nr_pages) { page_vma_mapped_walk_done(&pvmw); break; } /* Skip the batched PTEs */ pvmw.pte += nr - 1; pvmw.address += (nr - 1) * PAGE_SIZE; } if (referenced) folio_clear_idle(folio); if (folio_test_clear_young(folio)) referenced++; if (referenced) { pra->referenced++; pra->vm_flags |= vma->vm_flags & ~VM_LOCKED; } if (!pra->mapcount) return false; /* To break the loop */ return true; } static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg) { struct folio_referenced_arg *pra = arg; struct mem_cgroup *memcg = pra->memcg; /* * Ignore references from this mapping if it has no recency.
If the * folio has been used in another mapping, we will catch it; if this * other mapping is already gone, the unmap path will have set the * referenced flag or activated the folio in zap_pte_range(). */ if (!vma_has_recency(vma)) return true; /* * If we are reclaiming on behalf of a cgroup, skip counting on behalf * of references from different cgroups. */ if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) return true; return false; } /** * folio_referenced() - Test if the folio was referenced. * @folio: The folio to test. * @is_locked: Caller holds lock on the folio. * @memcg: target memory cgroup * @vm_flags: A combination of all the vma->vm_flags which referenced the folio. * * Quick test_and_clear_referenced for all mappings of a folio, * * Return: The number of mappings which referenced the folio. Return -1 if * the function bailed out due to rmap lock contention. */ int folio_referenced(struct folio *folio, int is_locked, struct mem_cgroup *memcg, vm_flags_t *vm_flags) { bool we_locked = false; struct folio_referenced_arg pra = { .mapcount = folio_mapcount(folio), .memcg = memcg, }; struct rmap_walk_control rwc = { .rmap_one = folio_referenced_one, .arg = (void *)&pra, .anon_lock = folio_lock_anon_vma_read, .try_lock = true, .invalid_vma = invalid_folio_referenced_vma, }; VM_WARN_ON_ONCE_FOLIO(folio_is_zone_device(folio), folio); *vm_flags = 0; if (!pra.mapcount) return 0; if (!folio_raw_mapping(folio)) return 0; if (!is_locked) { we_locked = folio_trylock(folio); if (!we_locked) return 1; } rmap_walk(folio, &rwc); *vm_flags = pra.vm_flags; if (we_locked) folio_unlock(folio); return rwc.contended ? -1 : pra.referenced; } static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) { int cleaned = 0; struct vm_area_struct *vma = pvmw->vma; struct mmu_notifier_range range; unsigned long address = pvmw->address; /* * We have to assume the worse case ie pmd for invalidation. Note that * the folio can not be freed from this function. */ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, vma->vm_mm, address, vma_address_end(pvmw)); mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(pvmw)) { int ret = 0; address = pvmw->address; if (pvmw->pte) { pte_t *pte = pvmw->pte; pte_t entry = ptep_get(pte); /* * PFN swap PTEs, such as device-exclusive ones, that * actually map pages are clean and not writable from a * CPU perspective. The MMU notifier takes care of any * device aspects. */ if (!pte_present(entry)) continue; if (!pte_dirty(entry) && !pte_write(entry)) continue; flush_cache_page(vma, address, pte_pfn(entry)); entry = ptep_clear_flush(vma, address, pte); entry = pte_wrprotect(entry); entry = pte_mkclean(entry); set_pte_at(vma->vm_mm, address, pte, entry); ret = 1; } else { #ifdef CONFIG_TRANSPARENT_HUGEPAGE pmd_t *pmd = pvmw->pmd; pmd_t entry = pmdp_get(pmd); /* * Please see the comment above (!pte_present). * A non present PMD is not writable from a CPU * perspective. */ if (!pmd_present(entry)) continue; if (!pmd_dirty(entry) && !pmd_write(entry)) continue; flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); entry = pmdp_invalidate(vma, address, pmd); entry = pmd_wrprotect(entry); entry = pmd_mkclean(entry); set_pmd_at(vma->vm_mm, address, pmd, entry); ret = 1; #else /* unexpected pmd-mapped folio? 
*/ WARN_ON_ONCE(1); #endif } if (ret) cleaned++; } mmu_notifier_invalidate_range_end(&range); return cleaned; } static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC); int *cleaned = arg; *cleaned += page_vma_mkclean_one(&pvmw); return true; } static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) { if (vma->vm_flags & VM_SHARED) return false; return true; } int folio_mkclean(struct folio *folio) { int cleaned = 0; struct address_space *mapping; struct rmap_walk_control rwc = { .arg = (void *)&cleaned, .rmap_one = page_mkclean_one, .invalid_vma = invalid_mkclean_vma, }; BUG_ON(!folio_test_locked(folio)); if (!folio_mapped(folio)) return 0; mapping = folio_mapping(folio); if (!mapping) return 0; rmap_walk(folio, &rwc); return cleaned; } EXPORT_SYMBOL_GPL(folio_mkclean); struct wrprotect_file_state { int cleaned; pgoff_t pgoff; unsigned long pfn; unsigned long nr_pages; }; static bool mapping_wrprotect_range_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { struct wrprotect_file_state *state = (struct wrprotect_file_state *)arg; struct page_vma_mapped_walk pvmw = { .pfn = state->pfn, .nr_pages = state->nr_pages, .pgoff = state->pgoff, .vma = vma, .address = address, .flags = PVMW_SYNC, }; state->cleaned += page_vma_mkclean_one(&pvmw); return true; } static void __rmap_walk_file(struct folio *folio, struct address_space *mapping, pgoff_t pgoff_start, unsigned long nr_pages, struct rmap_walk_control *rwc, bool locked); /** * mapping_wrprotect_range() - Write-protect all mappings in a specified range. * * @mapping: The mapping whose reverse mapping should be traversed. * @pgoff: The page offset at which @pfn is mapped within @mapping. * @pfn: The PFN of the page mapped in @mapping at @pgoff. * @nr_pages: The number of physically contiguous base pages spanned. * * Traverses the reverse mapping, finding all VMAs which contain a shared * mapping of the pages in the specified range in @mapping, and write-protects * them (that is, updates the page tables to mark the mappings read-only such * that a write protection fault arises when the mappings are written to). * * The @pfn value need not refer to a folio, but rather can reference a kernel * allocation which is mapped into userland. We therefore do not require that * the page maps to a folio with a valid mapping or index field, rather the * caller specifies these in @mapping and @pgoff. * * Return: the number of write-protected PTEs, or an error. */ int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff, unsigned long pfn, unsigned long nr_pages) { struct wrprotect_file_state state = { .cleaned = 0, .pgoff = pgoff, .pfn = pfn, .nr_pages = nr_pages, }; struct rmap_walk_control rwc = { .arg = (void *)&state, .rmap_one = mapping_wrprotect_range_one, .invalid_vma = invalid_mkclean_vma, }; if (!mapping) return 0; __rmap_walk_file(/* folio = */NULL, mapping, pgoff, nr_pages, &rwc, /* locked = */false); return state.cleaned; } EXPORT_SYMBOL_GPL(mapping_wrprotect_range); /** * pfn_mkclean_range - Cleans the PTEs (including PMDs) mapped with range of * [@pfn, @pfn + @nr_pages) at the specific offset (@pgoff) * within the @vma of shared mappings. And since clean PTEs * should also be readonly, write protects them too. * @pfn: start pfn. * @nr_pages: number of physically contiguous pages srarting with @pfn. * @pgoff: page offset that the @pfn mapped with. 
* @vma: vma that @pfn mapped within. * * Returns the number of cleaned PTEs (including PMDs). */ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, struct vm_area_struct *vma) { struct page_vma_mapped_walk pvmw = { .pfn = pfn, .nr_pages = nr_pages, .pgoff = pgoff, .vma = vma, .flags = PVMW_SYNC, }; if (invalid_mkclean_vma(vma, NULL)) return 0; pvmw.address = vma_address(vma, pgoff, nr_pages); VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma); return page_vma_mkclean_one(&pvmw); } static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped) { int idx; if (nr) { idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED; lruvec_stat_mod_folio(folio, idx, nr); } if (nr_pmdmapped) { if (folio_test_anon(folio)) { idx = NR_ANON_THPS; lruvec_stat_mod_folio(folio, idx, nr_pmdmapped); } else { /* NR_*_PMDMAPPED are not maintained per-memcg */ idx = folio_test_swapbacked(folio) ? NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED; __mod_node_page_state(folio_pgdat(folio), idx, nr_pmdmapped); } } } static __always_inline void __folio_add_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, enum pgtable_level level) { atomic_t *mapped = &folio->_nr_pages_mapped; const int orig_nr_pages = nr_pages; int first = 0, nr = 0, nr_pmdmapped = 0; __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { case PGTABLE_LEVEL_PTE: if (!folio_test_large(folio)) { nr = atomic_inc_and_test(&folio->_mapcount); break; } if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { nr = folio_add_return_large_mapcount(folio, orig_nr_pages, vma); if (nr == orig_nr_pages) /* Was completely unmapped. */ nr = folio_large_nr_pages(folio); else nr = 0; break; } do { first += atomic_inc_and_test(&page->_mapcount); } while (page++, --nr_pages > 0); if (first && atomic_add_return_relaxed(first, mapped) < ENTIRELY_MAPPED) nr = first; folio_add_large_mapcount(folio, orig_nr_pages, vma); break; case PGTABLE_LEVEL_PMD: case PGTABLE_LEVEL_PUD: first = atomic_inc_and_test(&folio->_entire_mapcount); if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { if (level == PGTABLE_LEVEL_PMD && first) nr_pmdmapped = folio_large_nr_pages(folio); nr = folio_inc_return_large_mapcount(folio, vma); if (nr == 1) /* Was completely unmapped. */ nr = folio_large_nr_pages(folio); else nr = 0; break; } if (first) { nr = atomic_add_return_relaxed(ENTIRELY_MAPPED, mapped); if (likely(nr < ENTIRELY_MAPPED + ENTIRELY_MAPPED)) { nr_pages = folio_large_nr_pages(folio); /* * We only track PMD mappings of PMD-sized * folios separately. */ if (level == PGTABLE_LEVEL_PMD) nr_pmdmapped = nr_pages; nr = nr_pages - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of a remove and another add? */ if (unlikely(nr < 0)) nr = 0; } else { /* Raced ahead of a remove of ENTIRELY_MAPPED */ nr = 0; } } folio_inc_large_mapcount(folio, vma); break; default: BUILD_BUG(); } __folio_mod_stat(folio, nr, nr_pmdmapped); } /** * folio_move_anon_rmap - move a folio to our anon_vma * @folio: The folio to move to our anon_vma * @vma: The vma the folio belongs to * * When a folio belongs exclusively to one process after a COW event, * that folio can be moved into the anon_vma that belongs to just that * process, so the rmap code will not search the parent or sibling processes. 
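 *
 * A hedged sketch of the intended call pattern (loosely modelled on the
 * write-fault reuse path; the locals and the surrounding checks are
 * illustrative only, not taken from a specific caller):
 *
 *	if (folio_test_anon(folio) && !folio_maybe_mapped_shared(folio)) {
 *		folio_lock(folio);
 *		folio_move_anon_rmap(folio, vma);
 *		folio_unlock(folio);
 *	}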
*/ void folio_move_anon_rmap(struct folio *folio, struct vm_area_struct *vma) { void *anon_vma = vma->anon_vma; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_VMA(!anon_vma, vma); anon_vma += FOLIO_MAPPING_ANON; /* * Ensure that anon_vma and the FOLIO_MAPPING_ANON bit are written * simultaneously, so a concurrent reader (eg folio_referenced()'s * folio_test_anon()) will not see one without the other. */ WRITE_ONCE(folio->mapping, anon_vma); } /** * __folio_set_anon - set up a new anonymous rmap for a folio * @folio: The folio to set up the new anonymous rmap for. * @vma: VM area to add the folio to. * @address: User virtual address of the mapping * @exclusive: Whether the folio is exclusive to the process. */ static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma, unsigned long address, bool exclusive) { struct anon_vma *anon_vma = vma->anon_vma; BUG_ON(!anon_vma); /* * If the folio isn't exclusive to this vma, we must use the _oldest_ * possible anon_vma for the folio mapping! */ if (!exclusive) anon_vma = anon_vma->root; /* * page_idle does a lockless/optimistic rmap scan on folio->mapping. * Make sure the compiler doesn't split the stores of anon_vma and * the FOLIO_MAPPING_ANON type identifier, otherwise the rmap code * could mistake the mapping for a struct address_space and crash. */ anon_vma = (void *) anon_vma + FOLIO_MAPPING_ANON; WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma); folio->index = linear_page_index(vma, address); } /** * __page_check_anon_rmap - sanity check anonymous rmap addition * @folio: The folio containing @page. * @page: the page to check the mapping of * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped */ static void __page_check_anon_rmap(const struct folio *folio, const struct page *page, struct vm_area_struct *vma, unsigned long address) { /* * The page's anon-rmap details (mapping and index) are guaranteed to * be set up correctly at this point. * * We have exclusion against folio_add_anon_rmap_*() because the caller * always holds the page locked. * * We have exclusion against folio_add_new_anon_rmap because those pages * are initially only visible via the pagetables, and the pte is locked * over the call to folio_add_new_anon_rmap. */ VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root, folio); VM_BUG_ON_PAGE(page_pgoff(folio, page) != linear_page_index(vma, address), page); } static __always_inline void __folio_add_anon_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, unsigned long address, rmap_t flags, enum pgtable_level level) { int i; VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); __folio_add_rmap(folio, page, nr_pages, vma, level); if (likely(!folio_test_ksm(folio))) __page_check_anon_rmap(folio, page, vma, address); if (flags & RMAP_EXCLUSIVE) { switch (level) { case PGTABLE_LEVEL_PTE: for (i = 0; i < nr_pages; i++) SetPageAnonExclusive(page + i); break; case PGTABLE_LEVEL_PMD: SetPageAnonExclusive(page); break; case PGTABLE_LEVEL_PUD: /* * Keep the compiler happy, we don't support anonymous * PUD mappings. 
*/ WARN_ON_ONCE(1); break; default: BUILD_BUG(); } } VM_WARN_ON_FOLIO(!folio_test_large(folio) && PageAnonExclusive(page) && atomic_read(&folio->_mapcount) > 0, folio); for (i = 0; i < nr_pages; i++) { struct page *cur_page = page + i; VM_WARN_ON_FOLIO(folio_test_large(folio) && folio_entire_mapcount(folio) > 1 && PageAnonExclusive(cur_page), folio); if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) continue; /* * While PTE-mapping a THP we have a PMD and a PTE * mapping. */ VM_WARN_ON_FOLIO(atomic_read(&cur_page->_mapcount) > 0 && PageAnonExclusive(cur_page), folio); } /* * Only mlock it if the folio is fully mapped to the VMA. * * Partially mapped folios can be split on reclaim and part outside * of mlocked VMA can be evicted or freed. */ if (folio_nr_pages(folio) == nr_pages) mlock_vma_folio(folio, vma); } /** * folio_add_anon_rmap_ptes - add PTE mappings to a page range of an anon folio * @folio: The folio to add the mappings to * @page: The first page to add * @nr_pages: The number of pages which will be mapped * @vma: The vm area in which the mappings are added * @address: The user virtual address of the first page to map * @flags: The rmap flags * * The page range of folio is defined by [first_page, first_page + nr_pages) * * The caller needs to hold the page table lock, and the page must be locked in * the anon_vma case: to serialize mapping,index checking after setting, * and to ensure that an anon folio is not being upgraded racily to a KSM folio * (but KSM folios are never downgraded). */ void folio_add_anon_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { __folio_add_anon_rmap(folio, page, nr_pages, vma, address, flags, PGTABLE_LEVEL_PTE); } /** * folio_add_anon_rmap_pmd - add a PMD mapping to a page range of an anon folio * @folio: The folio to add the mapping to * @page: The first page to add * @vma: The vm area in which the mapping is added * @address: The user virtual address of the first page to map * @flags: The rmap flags * * The page range of folio is defined by [first_page, first_page + HPAGE_PMD_NR) * * The caller needs to hold the page table lock, and the page must be locked in * the anon_vma case: to serialize mapping,index checking after setting. */ void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE __folio_add_anon_rmap(folio, page, HPAGE_PMD_NR, vma, address, flags, PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); #endif } /** * folio_add_new_anon_rmap - Add mapping to a new anonymous folio. * @folio: The folio to add the mapping to. * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped * @flags: The rmap flags * * Like folio_add_anon_rmap_*() but must only be called on *new* folios. * This means the inc-and-test can be bypassed. * The folio doesn't necessarily need to be locked while it's exclusive * unless two threads map it concurrently. However, the folio must be * locked if it's shared. * * If the folio is pmd-mappable, it is accounted as a THP. 
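 *
 * A hedged sketch of a typical caller (loosely modelled on the anonymous
 * fault path; the construction of @entry, memcg charging and the PTE lock
 * are assumed to be handled by the surrounding code):
 *
 *	folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
 *	folio_add_lru_vma(folio, vma);
 *	set_pte_at(vma->vm_mm, addr, pte, entry);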
*/ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { const bool exclusive = flags & RMAP_EXCLUSIVE; int nr = 1, nr_pmdmapped = 0; VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); VM_WARN_ON_FOLIO(!exclusive && !folio_test_locked(folio), folio); /* * VM_DROPPABLE mappings don't swap; instead they're just dropped when * under memory pressure. */ if (!folio_test_swapbacked(folio) && !(vma->vm_flags & VM_DROPPABLE)) __folio_set_swapbacked(folio); __folio_set_anon(folio, vma, address, exclusive); if (likely(!folio_test_large(folio))) { /* increment count (starts at -1) */ atomic_set(&folio->_mapcount, 0); if (exclusive) SetPageAnonExclusive(&folio->page); } else if (!folio_test_pmd_mappable(folio)) { int i; nr = folio_large_nr_pages(folio); for (i = 0; i < nr; i++) { struct page *page = folio_page(folio, i); if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) /* increment count (starts at -1) */ atomic_set(&page->_mapcount, 0); if (exclusive) SetPageAnonExclusive(page); } folio_set_large_mapcount(folio, nr, vma); if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) atomic_set(&folio->_nr_pages_mapped, nr); } else { nr = folio_large_nr_pages(folio); /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); folio_set_large_mapcount(folio, 1, vma); if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED); if (exclusive) SetPageAnonExclusive(&folio->page); nr_pmdmapped = nr; } VM_WARN_ON_ONCE(address < vma->vm_start || address + (nr << PAGE_SHIFT) > vma->vm_end); __folio_mod_stat(folio, nr, nr_pmdmapped); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1); } static __always_inline void __folio_add_file_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, enum pgtable_level level) { VM_WARN_ON_FOLIO(folio_test_anon(folio), folio); __folio_add_rmap(folio, page, nr_pages, vma, level); /* * Only mlock it if the folio is fully mapped to the VMA. * * Partially mapped folios can be split on reclaim and part outside * of mlocked VMA can be evicted or freed. */ if (folio_nr_pages(folio) == nr_pages) mlock_vma_folio(folio, vma); } /** * folio_add_file_rmap_ptes - add PTE mappings to a page range of a folio * @folio: The folio to add the mappings to * @page: The first page to add * @nr_pages: The number of pages that will be mapped using PTEs * @vma: The vm area in which the mappings are added * * The page range of the folio is defined by [page, page + nr_pages) * * The caller needs to hold the page table lock. */ void folio_add_file_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma) { __folio_add_file_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE); } /** * folio_add_file_rmap_pmd - add a PMD mapping to a page range of a folio * @folio: The folio to add the mapping to * @page: The first page to add * @vma: The vm area in which the mapping is added * * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) * * The caller needs to hold the page table lock. 
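 *
 * Hedged sketch of a caller (loosely modelled on a huge file fault; the
 * construction of @entry and the PMD lock are assumed to be handled by the
 * surrounding fault code, and the names are illustrative only):
 *
 *	folio_add_file_rmap_pmd(folio, page, vma);
 *	set_pmd_at(vma->vm_mm, haddr, pmd, entry);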
*/ void folio_add_file_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); #endif } /** * folio_add_file_rmap_pud - add a PUD mapping to a page range of a folio * @folio: The folio to add the mapping to * @page: The first page to add * @vma: The vm area in which the mapping is added * * The page range of the folio is defined by [page, page + HPAGE_PUD_NR) * * The caller needs to hold the page table lock. */ void folio_add_file_rmap_pud(struct folio *folio, struct page *page, struct vm_area_struct *vma) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD); #else WARN_ON_ONCE(true); #endif } static __always_inline void __folio_remove_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, enum pgtable_level level) { atomic_t *mapped = &folio->_nr_pages_mapped; int last = 0, nr = 0, nr_pmdmapped = 0; bool partially_mapped = false; __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { case PGTABLE_LEVEL_PTE: if (!folio_test_large(folio)) { nr = atomic_add_negative(-1, &folio->_mapcount); break; } if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { nr = folio_sub_return_large_mapcount(folio, nr_pages, vma); if (!nr) { /* Now completely unmapped. */ nr = folio_large_nr_pages(folio); } else { partially_mapped = nr < folio_large_nr_pages(folio) && !folio_entire_mapcount(folio); nr = 0; } break; } folio_sub_large_mapcount(folio, nr_pages, vma); do { last += atomic_add_negative(-1, &page->_mapcount); } while (page++, --nr_pages > 0); if (last && atomic_sub_return_relaxed(last, mapped) < ENTIRELY_MAPPED) nr = last; partially_mapped = nr && atomic_read(mapped); break; case PGTABLE_LEVEL_PMD: case PGTABLE_LEVEL_PUD: if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { last = atomic_add_negative(-1, &folio->_entire_mapcount); if (level == PGTABLE_LEVEL_PMD && last) nr_pmdmapped = folio_large_nr_pages(folio); nr = folio_dec_return_large_mapcount(folio, vma); if (!nr) { /* Now completely unmapped. */ nr = folio_large_nr_pages(folio); } else { partially_mapped = last && nr < folio_large_nr_pages(folio); nr = 0; } break; } folio_dec_large_mapcount(folio, vma); last = atomic_add_negative(-1, &folio->_entire_mapcount); if (last) { nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped); if (likely(nr < ENTIRELY_MAPPED)) { nr_pages = folio_large_nr_pages(folio); if (level == PGTABLE_LEVEL_PMD) nr_pmdmapped = nr_pages; nr = nr_pages - nr; /* Raced ahead of another remove and an add? */ if (unlikely(nr < 0)) nr = 0; } else { /* An add of ENTIRELY_MAPPED raced ahead */ nr = 0; } } partially_mapped = nr && nr < nr_pmdmapped; break; default: BUILD_BUG(); } /* * Queue anon large folio for deferred split if at least one page of * the folio is unmapped and at least one page is still mapped. * * Check partially_mapped first to ensure it is a large folio. * * Device private folios do not support deferred splitting and * shrinker based scanning of the folios to free. 
*/ if (partially_mapped && folio_test_anon(folio) && !folio_test_partially_mapped(folio) && !folio_is_device_private(folio)) deferred_split_folio(folio, true); __folio_mod_stat(folio, -nr, -nr_pmdmapped); /* * It would be tidy to reset folio_test_anon mapping when fully * unmapped, but that might overwrite a racing folio_add_anon_rmap_*() * which increments mapcount after us but sets mapping before us: * so leave the reset to free_pages_prepare, and remember that * it's only reliable while mapped. */ munlock_vma_folio(folio, vma); } /** * folio_remove_rmap_ptes - remove PTE mappings from a page range of a folio * @folio: The folio to remove the mappings from * @page: The first page to remove * @nr_pages: The number of pages that will be removed from the mapping * @vma: The vm area from which the mappings are removed * * The page range of the folio is defined by [page, page + nr_pages) * * The caller needs to hold the page table lock. */ void folio_remove_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma) { __folio_remove_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE); } /** * folio_remove_rmap_pmd - remove a PMD mapping from a page range of a folio * @folio: The folio to remove the mapping from * @page: The first page to remove * @vma: The vm area from which the mapping is removed * * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) * * The caller needs to hold the page table lock. */ void folio_remove_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); #endif } /** * folio_remove_rmap_pud - remove a PUD mapping from a page range of a folio * @folio: The folio to remove the mapping from * @page: The first page to remove * @vma: The vm area from which the mapping is removed * * The page range of the folio is defined by [page, page + HPAGE_PUD_NR) * * The caller needs to hold the page table lock. */ void folio_remove_rmap_pud(struct folio *folio, struct page *page, struct vm_area_struct *vma) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD); #else WARN_ON_ONCE(true); #endif } static inline unsigned int folio_unmap_pte_batch(struct folio *folio, struct page_vma_mapped_walk *pvmw, enum ttu_flags flags, pte_t pte) { unsigned long end_addr, addr = pvmw->address; struct vm_area_struct *vma = pvmw->vma; unsigned int max_nr; if (flags & TTU_HWPOISON) return 1; if (!folio_test_large(folio)) return 1; /* We may only batch within a single VMA and a single page table. */ end_addr = pmd_addr_end(addr, vma->vm_end); max_nr = (end_addr - addr) >> PAGE_SHIFT; /* We only support lazyfree or file folios batching for now ... */ if (folio_test_anon(folio) && folio_test_swapbacked(folio)) return 1; if (pte_unused(pte)) return 1; if (userfaultfd_wp(vma)) return 1; /* * If unmap fails, we need to restore the ptes. To avoid accidentally * upgrading write permissions for ptes that were not originally * writable, and to avoid losing the soft-dirty bit, use the * appropriate FPB flags. 
*/ return folio_pte_batch_flags(folio, vma, pvmw->pte, &pte, max_nr, FPB_RESPECT_WRITE | FPB_RESPECT_SOFT_DIRTY); } /* * @arg: enum ttu_flags will be passed to this argument */ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); bool anon_exclusive, ret = true; pte_t pteval; struct page *subpage; struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; unsigned long nr_pages = 1, end_addr; unsigned long pfn; unsigned long hsz = 0; int ptes = 0; /* * When racing against e.g. zap_pte_range() on another cpu, * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(), * try_to_unmap() may return before page_mapped() has become false, * if page table locking is skipped: use TTU_SYNC to wait for that. */ if (flags & TTU_SYNC) pvmw.flags = PVMW_SYNC; /* * For THP, we have to assume the worst case, i.e. pmd, for invalidation. * For hugetlb, it could be much worse if we need to do pud * invalidation in the case of pmd sharing. * * Note that the folio cannot be freed in this function, as the caller of * try_to_unmap() must hold a reference on the folio. */ range.end = vma_address_end(&pvmw); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, range.end); if (folio_test_hugetlb(folio)) { /* * If sharing is possible, start and end will be adjusted * accordingly. */ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); /* We need the huge page size for set_huge_pte_at() */ hsz = huge_page_size(hstate_vma(vma)); } mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { /* * If the folio is in an mlock()d vma, we must not swap it out. */ if (!(flags & TTU_IGNORE_MLOCK) && (vma->vm_flags & VM_LOCKED)) { ptes++; /* * Set 'ret' to indicate the page cannot be unmapped. * * Do not jump to walk_abort immediately, as additional * iterations might be required to detect a fully mapped * folio and mlock it. */ ret = false; /* Only mlock fully mapped pages */ if (pvmw.pte && ptes != pvmw.nr_pages) continue; /* * All PTEs must be protected by the page table lock in * order to mlock the page. * * If the page table boundary has been crossed, the current ptl * only protects part of the ptes. */ if (pvmw.flags & PVMW_PGTABLE_CROSSED) goto walk_done; /* Restore the mlock which got missed */ mlock_vma_folio(folio, vma); goto walk_done; } if (!pvmw.pte) { if (folio_test_lazyfree(folio)) { if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, folio)) goto walk_done; /* * unmap_huge_pmd_locked has either already marked * the folio as swap-backed or decided to retain it * due to GUP or speculative references. */ goto walk_abort; } if (flags & TTU_SPLIT_HUGE_PMD) { /* * We temporarily have to drop the PTL and * restart so we can process the PTE-mapped THP. */ split_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, false); flags &= ~TTU_SPLIT_HUGE_PMD; page_vma_mapped_walk_restart(&pvmw); continue; } } /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); /* * Handle PFN swap PTEs, such as device-exclusive ones, that * actually map pages.
*/ pteval = ptep_get(pvmw.pte); if (likely(pte_present(pteval))) { pfn = pte_pfn(pteval); } else { const softleaf_t entry = softleaf_from_pte(pteval); pfn = softleaf_to_pfn(entry); VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); } subpage = folio_page(folio, pfn - folio_pfn(folio)); address = pvmw.address; anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(subpage); if (folio_test_hugetlb(folio)) { bool anon = folio_test_anon(folio); /* * The try_to_unmap() is only passed a hugetlb page * in the case where the hugetlb page is poisoned. */ VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage); /* * huge_pmd_unshare may unmap an entire PMD page. * There is no way of knowing exactly which PMDs may * be cached for this mm, so we must flush them all. * start/end were already adjusted above to cover this * range. */ flush_cache_range(vma, range.start, range.end); /* * To call huge_pmd_unshare, i_mmap_rwsem must be * held in write mode. Caller needs to explicitly * do this outside rmap routines. * * We also must hold hugetlb vma_lock in write mode. * Lock order dictates acquiring vma_lock BEFORE * i_mmap_rwsem. We can only try lock here and fail * if unsuccessful. */ if (!anon) { struct mmu_gather tlb; VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); if (!hugetlb_vma_trylock_write(vma)) goto walk_abort; tlb_gather_mmu_vma(&tlb, vma); if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) { hugetlb_vma_unlock_write(vma); huge_pmd_unshare_flush(&tlb, vma); tlb_finish_mmu(&tlb); /* * The PMD table was unmapped, * consequently unmapping the folio. */ goto walk_done; } hugetlb_vma_unlock_write(vma); tlb_finish_mmu(&tlb); } pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); if (pte_dirty(pteval)) folio_mark_dirty(folio); } else if (likely(pte_present(pteval))) { nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval); end_addr = address + nr_pages * PAGE_SIZE; flush_cache_range(vma, address, end_addr); /* Nuke the page table entry. */ pteval = get_and_clear_ptes(mm, address, pvmw.pte, nr_pages); /* * We clear the PTE but do not flush so potentially * a remote CPU could still be writing to the folio. * If the entry was previously clean then the * architecture must guarantee that a clear->dirty * transition on a cached TLB entry is written through * and traps if the PTE is unmapped. */ if (should_defer_flush(mm, flags)) set_tlb_ubc_flush_pending(mm, pteval, address, end_addr); else flush_tlb_range(vma, address, end_addr); if (pte_dirty(pteval)) folio_mark_dirty(folio); } else { pte_clear(mm, address, pvmw.pte); } /* * Now the pte is cleared. If this pte was uffd-wp armed, * we may want to replace a none pte with a marker pte if * it's file-backed, so we don't lose the tracking info. */ pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval); /* Update high watermark before we lower rss */ update_hiwater_rss(mm); if (PageHWPoison(subpage) && (flags & TTU_HWPOISON)) { pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (folio_test_hugetlb(folio)) { hugetlb_count_sub(folio_nr_pages(folio), mm); set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); } else { dec_mm_counter(mm, mm_counter(folio)); set_pte_at(mm, address, pvmw.pte, pteval); } } else if (likely(pte_present(pteval)) && pte_unused(pteval) && !userfaultfd_armed(vma)) { /* * The guest indicated that the page content is of no * interest anymore. Simply discard the pte, vmscan * will take care of the rest. * A future reference will then fault in a new zero * page. 
When userfaultfd is active, we must not drop * this page though, as its main user (postcopy * migration) will not expect userfaults on already * copied pages. */ dec_mm_counter(mm, mm_counter(folio)); } else if (folio_test_anon(folio)) { swp_entry_t entry = page_swap_entry(subpage); pte_t swp_pte; /* * Store the swap location in the pte. * See handle_pte_fault() ... */ if (unlikely(folio_test_swapbacked(folio) != folio_test_swapcache(folio))) { WARN_ON_ONCE(1); goto walk_abort; } /* MADV_FREE page check */ if (!folio_test_swapbacked(folio)) { int ref_count, map_count; /* * Synchronize with gup_pte_range(): * - clear PTE; barrier; read refcount * - inc refcount; barrier; read PTE */ smp_mb(); ref_count = folio_ref_count(folio); map_count = folio_mapcount(folio); /* * Order reads for page refcount and dirty flag * (see comments in __remove_mapping()). */ smp_rmb(); if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) { /* * redirtied either using the page table or a previously * obtained GUP reference. */ set_ptes(mm, address, pvmw.pte, pteval, nr_pages); folio_set_swapbacked(folio); goto walk_abort; } else if (ref_count != 1 + map_count) { /* * Additional reference. Could be a GUP reference or any * speculative reference. GUP users must mark the folio * dirty if there was a modification. This folio cannot be * reclaimed right now either way, so act just like nothing * happened. * We'll come back here later and detect if the folio was * dirtied when the additional reference is gone. */ set_ptes(mm, address, pvmw.pte, pteval, nr_pages); goto walk_abort; } add_mm_counter(mm, MM_ANONPAGES, -nr_pages); goto discard; } if (folio_dup_swap(folio, subpage) < 0) { set_pte_at(mm, address, pvmw.pte, pteval); goto walk_abort; } /* * arch_unmap_one() is expected to be a NOP on * architectures where we could have PFN swap PTEs, * so we'll not check/care. */ if (arch_unmap_one(mm, vma, address, pteval) < 0) { folio_put_swap(folio, subpage); set_pte_at(mm, address, pvmw.pte, pteval); goto walk_abort; } /* See folio_try_share_anon_rmap(): clear PTE first. */ if (anon_exclusive && folio_try_share_anon_rmap_pte(folio, subpage)) { folio_put_swap(folio, subpage); set_pte_at(mm, address, pvmw.pte, pteval); goto walk_abort; } if (list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); if (list_empty(&mm->mmlist)) list_add(&mm->mmlist, &init_mm.mmlist); spin_unlock(&mmlist_lock); } dec_mm_counter(mm, MM_ANONPAGES); inc_mm_counter(mm, MM_SWAPENTS); swp_pte = swp_entry_to_pte(entry); if (anon_exclusive) swp_pte = pte_swp_mkexclusive(swp_pte); if (likely(pte_present(pteval))) { if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); } else { if (pte_swp_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_swp_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); } set_pte_at(mm, address, pvmw.pte, swp_pte); } else { /* * This is a locked file-backed folio, * so it cannot be removed from the page * cache and replaced by a new folio before * mmu_notifier_invalidate_range_end, so no * concurrent thread might update its page table * to point at a new folio while a device is * still using this folio. 
* * See Documentation/mm/mmu_notifier.rst */ add_mm_counter(mm, mm_counter_file(folio), -nr_pages); } discard: if (unlikely(folio_test_hugetlb(folio))) { hugetlb_remove_rmap(folio); } else { folio_remove_rmap_ptes(folio, subpage, nr_pages, vma); } if (vma->vm_flags & VM_LOCKED) mlock_drain_local(); folio_put_refs(folio, nr_pages); /* * If we are sure that we batched the entire folio and cleared * all PTEs, we can just optimize and stop right here. */ if (nr_pages == folio_nr_pages(folio)) goto walk_done; continue; walk_abort: ret = false; walk_done: page_vma_mapped_walk_done(&pvmw); break; } mmu_notifier_invalidate_range_end(&range); return ret; } static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) { return vma_is_temporary_stack(vma); } static int folio_not_mapped(struct folio *folio) { return !folio_mapped(folio); } /** * try_to_unmap - Try to remove all page table mappings to a folio. * @folio: The folio to unmap. * @flags: action and flags * * Tries to remove all the page table entries which are mapping this * folio. It is the caller's responsibility to check if the folio is * still mapped if needed (use TTU_SYNC to prevent accounting races). * * Context: Caller must hold the folio lock. */ void try_to_unmap(struct folio *folio, enum ttu_flags flags) { struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, .arg = (void *)flags, .done = folio_not_mapped, .anon_lock = folio_lock_anon_vma_read, }; if (flags & TTU_RMAP_LOCKED) rmap_walk_locked(folio, &rwc); else rmap_walk(folio, &rwc); } /* * @arg: enum ttu_flags will be passed to this argument. * * If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs * containing migration entries. */ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); bool anon_exclusive, writable, ret = true; pte_t pteval; struct page *subpage; struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; unsigned long pfn; unsigned long hsz = 0; /* * When racing against e.g. zap_pte_range() on another cpu, * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(), * try_to_migrate() may return before page_mapped() has become false, * if page table locking is skipped: use TTU_SYNC to wait for that. */ if (flags & TTU_SYNC) pvmw.flags = PVMW_SYNC; /* * For THP, we have to assume the worse case ie pmd for invalidation. * For hugetlb, it could be much worse if we need to do pud * invalidation in the case of pmd sharing. * * Note that the page can not be free in this function as call of * try_to_unmap() must hold a reference on the page. */ range.end = vma_address_end(&pvmw); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, range.end); if (folio_test_hugetlb(folio)) { /* * If sharing is possible, start and end will be adjusted * accordingly. */ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); /* We need the huge page size for set_huge_pte_at() */ hsz = huge_page_size(hstate_vma(vma)); } mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { /* PMD-mapped THP migration entry */ if (!pvmw.pte) { __maybe_unused unsigned long pfn; __maybe_unused pmd_t pmdval; if (flags & TTU_SPLIT_HUGE_PMD) { /* * split_huge_pmd_locked() might leave the * folio mapped through PTEs. Retry the walk * so we can detect this scenario and properly * abort the walk. 
*/ split_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, true); flags &= ~TTU_SPLIT_HUGE_PMD; page_vma_mapped_walk_restart(&pvmw); continue; } #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION pmdval = pmdp_get(pvmw.pmd); if (likely(pmd_present(pmdval))) pfn = pmd_pfn(pmdval); else pfn = softleaf_to_pfn(softleaf_from_pmd(pmdval)); subpage = folio_page(folio, pfn - folio_pfn(folio)); VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) || !folio_test_pmd_mappable(folio), folio); if (set_pmd_migration_entry(&pvmw, subpage)) { ret = false; page_vma_mapped_walk_done(&pvmw); break; } continue; #endif } /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); /* * Handle PFN swap PTEs, such as device-exclusive ones, that * actually map pages. */ pteval = ptep_get(pvmw.pte); if (likely(pte_present(pteval))) { pfn = pte_pfn(pteval); } else { const softleaf_t entry = softleaf_from_pte(pteval); pfn = softleaf_to_pfn(entry); VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); } subpage = folio_page(folio, pfn - folio_pfn(folio)); address = pvmw.address; anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(subpage); if (folio_test_hugetlb(folio)) { bool anon = folio_test_anon(folio); /* * huge_pmd_unshare may unmap an entire PMD page. * There is no way of knowing exactly which PMDs may * be cached for this mm, so we must flush them all. * start/end were already adjusted above to cover this * range. */ flush_cache_range(vma, range.start, range.end); /* * To call huge_pmd_unshare, i_mmap_rwsem must be * held in write mode. Caller needs to explicitly * do this outside rmap routines. * * We also must hold hugetlb vma_lock in write mode. * Lock order dictates acquiring vma_lock BEFORE * i_mmap_rwsem. We can only try lock here and * fail if unsuccessful. */ if (!anon) { struct mmu_gather tlb; VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); if (!hugetlb_vma_trylock_write(vma)) { page_vma_mapped_walk_done(&pvmw); ret = false; break; } tlb_gather_mmu_vma(&tlb, vma); if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) { hugetlb_vma_unlock_write(vma); huge_pmd_unshare_flush(&tlb, vma); tlb_finish_mmu(&tlb); /* * The PMD table was unmapped, * consequently unmapping the folio. */ page_vma_mapped_walk_done(&pvmw); break; } hugetlb_vma_unlock_write(vma); tlb_finish_mmu(&tlb); } /* Nuke the hugetlb page table entry */ pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); if (pte_dirty(pteval)) folio_mark_dirty(folio); writable = pte_write(pteval); } else if (likely(pte_present(pteval))) { flush_cache_page(vma, address, pfn); /* Nuke the page table entry. */ if (should_defer_flush(mm, flags)) { /* * We clear the PTE but do not flush so potentially * a remote CPU could still be writing to the folio. * If the entry was previously clean then the * architecture must guarantee that a clear->dirty * transition on a cached TLB entry is written through * and traps if the PTE is unmapped. 
*/ pteval = ptep_get_and_clear(mm, address, pvmw.pte); set_tlb_ubc_flush_pending(mm, pteval, address, address + PAGE_SIZE); } else { pteval = ptep_clear_flush(vma, address, pvmw.pte); } if (pte_dirty(pteval)) folio_mark_dirty(folio); writable = pte_write(pteval); } else { const softleaf_t entry = softleaf_from_pte(pteval); pte_clear(mm, address, pvmw.pte); writable = softleaf_is_device_private_write(entry); } VM_WARN_ON_FOLIO(writable && folio_test_anon(folio) && !anon_exclusive, folio); /* Update high watermark before we lower rss */ update_hiwater_rss(mm); if (PageHWPoison(subpage)) { VM_WARN_ON_FOLIO(folio_is_device_private(folio), folio); pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (folio_test_hugetlb(folio)) { hugetlb_count_sub(folio_nr_pages(folio), mm); set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); } else { dec_mm_counter(mm, mm_counter(folio)); set_pte_at(mm, address, pvmw.pte, pteval); } } else if (likely(pte_present(pteval)) && pte_unused(pteval) && !userfaultfd_armed(vma)) { /* * The guest indicated that the page content is of no * interest anymore. Simply discard the pte, vmscan * will take care of the rest. * A future reference will then fault in a new zero * page. When userfaultfd is active, we must not drop * this page though, as its main user (postcopy * migration) will not expect userfaults on already * copied pages. */ dec_mm_counter(mm, mm_counter(folio)); } else { swp_entry_t entry; pte_t swp_pte; /* * arch_unmap_one() is expected to be a NOP on * architectures where we could have PFN swap PTEs, * so we'll not check/care. */ if (arch_unmap_one(mm, vma, address, pteval) < 0) { if (folio_test_hugetlb(folio)) set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); else set_pte_at(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); break; } /* See folio_try_share_anon_rmap_pte(): clear PTE first. */ if (folio_test_hugetlb(folio)) { if (anon_exclusive && hugetlb_try_share_anon_rmap(folio)) { set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); ret = false; page_vma_mapped_walk_done(&pvmw); break; } } else if (anon_exclusive && folio_try_share_anon_rmap_pte(folio, subpage)) { set_pte_at(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); break; } /* * Store the pfn of the page in a special migration * pte. do_swap_page() will wait until the migration * pte is removed and then restart fault handling. */ if (writable) entry = make_writable_migration_entry( page_to_pfn(subpage)); else if (anon_exclusive) entry = make_readable_exclusive_migration_entry( page_to_pfn(subpage)); else entry = make_readable_migration_entry( page_to_pfn(subpage)); if (likely(pte_present(pteval))) { if (pte_young(pteval)) entry = make_migration_entry_young(entry); if (pte_dirty(pteval)) entry = make_migration_entry_dirty(entry); swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); } else { swp_pte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_swp_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); } if (folio_test_hugetlb(folio)) set_huge_pte_at(mm, address, pvmw.pte, swp_pte, hsz); else set_pte_at(mm, address, pvmw.pte, swp_pte); trace_set_migration_pte(address, pte_val(swp_pte), folio_order(folio)); /* * No need to invalidate here it will synchronize on * against the special swap migration pte. 
*/ } if (unlikely(folio_test_hugetlb(folio))) hugetlb_remove_rmap(folio); else folio_remove_rmap_pte(folio, subpage, vma); if (vma->vm_flags & VM_LOCKED) mlock_drain_local(); folio_put(folio); } mmu_notifier_invalidate_range_end(&range); return ret; } /** * try_to_migrate - try to replace all page table mappings with swap entries * @folio: the folio to replace page table entries for * @flags: action and flags * * Tries to remove all the page table entries which are mapping this folio and * replace them with special swap entries. Caller must hold the folio lock. */ void try_to_migrate(struct folio *folio, enum ttu_flags flags) { struct rmap_walk_control rwc = { .rmap_one = try_to_migrate_one, .arg = (void *)flags, .done = folio_not_mapped, .anon_lock = folio_lock_anon_vma_read, }; /* * Migration always ignores mlock and only supports TTU_RMAP_LOCKED and * TTU_SPLIT_HUGE_PMD, TTU_SYNC, and TTU_BATCH_FLUSH flags. */ if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | TTU_SYNC | TTU_BATCH_FLUSH))) return; if (folio_is_zone_device(folio) && (!folio_is_device_private(folio) && !folio_is_device_coherent(folio))) return; /* * During exec, a temporary VMA is setup and later moved. * The VMA is moved under the anon_vma lock but not the * page tables leading to a race where migration cannot * find the migration ptes. Rather than increasing the * locking requirements of exec(), migration skips * temporary VMAs until after exec() completes. */ if (!folio_test_ksm(folio) && folio_test_anon(folio)) rwc.invalid_vma = invalid_migration_vma; if (flags & TTU_RMAP_LOCKED) rmap_walk_locked(folio, &rwc); else rmap_walk(folio, &rwc); } #ifdef CONFIG_DEVICE_PRIVATE /** * make_device_exclusive() - Mark a page for exclusive use by a device * @mm: mm_struct of associated target process * @addr: the virtual address to mark for exclusive device access * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering * @foliop: folio pointer will be stored here on success. * * This function looks up the page mapped at the given address, grabs a * folio reference, locks the folio and replaces the PTE with special * device-exclusive PFN swap entry, preventing access through the process * page tables. The function will return with the folio locked and referenced. * * On fault, the device-exclusive entries are replaced with the original PTE * under folio lock, after calling MMU notifiers. * * Only anonymous non-hugetlb folios are supported and the VMA must have * write permissions such that we can fault in the anonymous page writable * in order to mark it exclusive. The caller must hold the mmap_lock in read * mode. * * A driver using this to program access from a device must use a mmu notifier * critical section to hold a device specific lock during programming. Once * programming is complete it should drop the folio lock and reference after * which point CPU access to the page will revoke the exclusive access. * * Notes: * #. This function always operates on individual PTEs mapping individual * pages. PMD-sized THPs are first remapped to be mapped by PTEs before * the conversion happens on a single PTE corresponding to @addr. * #. While concurrent access through the process page tables is prevented, * concurrent access through other page references (e.g., earlier GUP * invocation) is not handled and not supported. * #. device-exclusive entries are considered "clean" and "old" by core-mm. * Device drivers must update the folio state when informed by MMU * notifiers. 
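 *
 * A hedged driver-side usage sketch (the device programming step and the
 * owner cookie are placeholders; error handling and retries are elided):
 *
 *	mmap_read_lock(mm);
 *	page = make_device_exclusive(mm, addr, my_owner_cookie, &folio);
 *	mmap_read_unlock(mm);
 *	if (!IS_ERR(page)) {
 *		... program device access to the page under the driver's
 *		    mmu notifier lock ...
 *		folio_unlock(folio);
 *		folio_put(folio);
 *	}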
* * Returns: pointer to mapped page on success, otherwise a negative error. */ struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr, void *owner, struct folio **foliop) { struct mmu_notifier_range range; struct folio *folio, *fw_folio; struct vm_area_struct *vma; struct folio_walk fw; struct page *page; swp_entry_t entry; pte_t swp_pte; int ret; mmap_assert_locked(mm); addr = PAGE_ALIGN_DOWN(addr); /* * Fault in the page writable and try to lock it; note that if the * address would already be marked for exclusive use by a device, * the GUP call would undo that first by triggering a fault. * * If any other device would already map this page exclusively, the * fault will trigger a conversion to an ordinary * (non-device-exclusive) PTE and issue a MMU_NOTIFY_EXCLUSIVE. */ retry: page = get_user_page_vma_remote(mm, addr, FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD, &vma); if (IS_ERR(page)) return page; folio = page_folio(page); if (!folio_test_anon(folio) || folio_test_hugetlb(folio)) { folio_put(folio); return ERR_PTR(-EOPNOTSUPP); } ret = folio_lock_killable(folio); if (ret) { folio_put(folio); return ERR_PTR(ret); } /* * Inform secondary MMUs that we are going to convert this PTE to * device-exclusive, such that they unmap it now. Note that the * caller must filter this event out to prevent livelocks. */ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, mm, addr, addr + PAGE_SIZE, owner); mmu_notifier_invalidate_range_start(&range); /* * Let's do a second walk and make sure we still find the same page * mapped writable. Note that any page of an anonymous folio can * only be mapped writable using exactly one PTE ("exclusive"), so * there cannot be other mappings. */ fw_folio = folio_walk_start(&fw, vma, addr, 0); if (fw_folio != folio || fw.page != page || fw.level != FW_LEVEL_PTE || !pte_write(fw.pte)) { if (fw_folio) folio_walk_end(&fw, vma); mmu_notifier_invalidate_range_end(&range); folio_unlock(folio); folio_put(folio); goto retry; } /* Nuke the page table entry so we get the uptodate dirty bit. */ flush_cache_page(vma, addr, page_to_pfn(page)); fw.pte = ptep_clear_flush(vma, addr, fw.ptep); /* Set the dirty flag on the folio now the PTE is gone. */ if (pte_dirty(fw.pte)) folio_mark_dirty(folio); /* * Store the pfn of the page in a special device-exclusive PFN swap PTE. * do_swap_page() will trigger the conversion back while holding the * folio lock. */ entry = make_device_exclusive_entry(page_to_pfn(page)); swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(fw.pte)) swp_pte = pte_swp_mksoft_dirty(swp_pte); /* The pte is writable, uffd-wp does not apply. */ set_pte_at(mm, addr, fw.ptep, swp_pte); folio_walk_end(&fw, vma); mmu_notifier_invalidate_range_end(&range); *foliop = folio; return page; } EXPORT_SYMBOL_GPL(make_device_exclusive); #endif void __put_anon_vma(struct anon_vma *anon_vma) { struct anon_vma *root = anon_vma->root; anon_vma_free(anon_vma); if (root != anon_vma && atomic_dec_and_test(&root->refcount)) anon_vma_free(root); } static struct anon_vma *rmap_walk_anon_lock(const struct folio *folio, struct rmap_walk_control *rwc) { struct anon_vma *anon_vma; if (rwc->anon_lock) return rwc->anon_lock(folio, rwc); /* * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read() * because that depends on page_mapped(); but not all its usages * are holding mmap_lock. 
Users without mmap_lock are required to * take a reference count to prevent the anon_vma disappearing */ anon_vma = folio_anon_vma(folio); if (!anon_vma) return NULL; if (anon_vma_trylock_read(anon_vma)) goto out; if (rwc->try_lock) { anon_vma = NULL; rwc->contended = true; goto out; } anon_vma_lock_read(anon_vma); out: return anon_vma; } /* * rmap_walk_anon - do something to anonymous page using the object-based * rmap method * @folio: the folio to be handled * @rwc: control variable according to each walk type * @locked: caller holds relevant rmap lock * * Find all the mappings of a folio using the mapping pointer and the vma * chains contained in the anon_vma struct it points to. */ static void rmap_walk_anon(struct folio *folio, struct rmap_walk_control *rwc, bool locked) { struct anon_vma *anon_vma; pgoff_t pgoff_start, pgoff_end; struct anon_vma_chain *avc; /* * The folio lock ensures that folio->mapping can't be changed under us * to an anon_vma with different root. */ VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); if (locked) { anon_vma = folio_anon_vma(folio); /* anon_vma disappear under us? */ VM_BUG_ON_FOLIO(!anon_vma, folio); } else { anon_vma = rmap_walk_anon_lock(folio, rwc); } if (!anon_vma) return; pgoff_start = folio_pgoff(folio); pgoff_end = pgoff_start + folio_nr_pages(folio) - 1; anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff_start, pgoff_end) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(vma, pgoff_start, folio_nr_pages(folio)); VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; if (!rwc->rmap_one(folio, vma, address, rwc->arg)) break; if (rwc->done && rwc->done(folio)) break; } if (!locked) anon_vma_unlock_read(anon_vma); } /** * __rmap_walk_file() - Traverse the reverse mapping for a file-backed mapping * of a page mapped within a specified page cache object at a specified offset. * * @folio: Either the folio whose mappings to traverse, or if NULL, * the callbacks specified in @rwc will be configured such * as to be able to look up mappings correctly. * @mapping: The page cache object whose mapping VMAs we intend to * traverse. If @folio is non-NULL, this should be equal to * folio_mapping(folio). * @pgoff_start: The offset within @mapping of the page which we are * looking up. If @folio is non-NULL, this should be equal * to folio_pgoff(folio). * @nr_pages: The number of pages mapped by the mapping. If @folio is * non-NULL, this should be equal to folio_nr_pages(folio). * @rwc: The reverse mapping walk control object describing how * the traversal should proceed. * @locked: Is the @mapping already locked? If not, we acquire the * lock. 
*/ static void __rmap_walk_file(struct folio *folio, struct address_space *mapping, pgoff_t pgoff_start, unsigned long nr_pages, struct rmap_walk_control *rwc, bool locked) { pgoff_t pgoff_end = pgoff_start + nr_pages - 1; struct vm_area_struct *vma; VM_WARN_ON_FOLIO(folio && mapping != folio_mapping(folio), folio); VM_WARN_ON_FOLIO(folio && pgoff_start != folio_pgoff(folio), folio); VM_WARN_ON_FOLIO(folio && nr_pages != folio_nr_pages(folio), folio); if (!locked) { if (i_mmap_trylock_read(mapping)) goto lookup; if (rwc->try_lock) { rwc->contended = true; return; } i_mmap_lock_read(mapping); } lookup: vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff_start, pgoff_end) { unsigned long address = vma_address(vma, pgoff_start, nr_pages); VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; if (!rwc->rmap_one(folio, vma, address, rwc->arg)) goto done; if (rwc->done && rwc->done(folio)) goto done; } done: if (!locked) i_mmap_unlock_read(mapping); } /* * rmap_walk_file - do something to file page using the object-based rmap method * @folio: the folio to be handled * @rwc: control variable according to each walk type * @locked: caller holds relevant rmap lock * * Find all the mappings of a folio using the mapping pointer and the vma chains * contained in the address_space struct it points to. */ static void rmap_walk_file(struct folio *folio, struct rmap_walk_control *rwc, bool locked) { /* * The folio lock not only makes sure that folio->mapping cannot * suddenly be NULLified by truncation, it makes sure that the structure * at mapping cannot be freed and reused yet, so we can safely take * mapping->i_mmap_rwsem. */ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (!folio->mapping) return; __rmap_walk_file(folio, folio->mapping, folio->index, folio_nr_pages(folio), rwc, locked); } void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc) { if (unlikely(folio_test_ksm(folio))) rmap_walk_ksm(folio, rwc); else if (folio_test_anon(folio)) rmap_walk_anon(folio, rwc, false); else rmap_walk_file(folio, rwc, false); } /* Like rmap_walk, but caller holds relevant rmap lock */ void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc) { /* no ksm support for now */ VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio); if (folio_test_anon(folio)) rmap_walk_anon(folio, rwc, true); else rmap_walk_file(folio, rwc, true); } #ifdef CONFIG_HUGETLB_PAGE /* * The following two functions are for anonymous (private mapped) hugepages. * Unlike common anonymous pages, anonymous hugepages have no accounting code * and no lru code, because we handle hugepages differently from common pages. 
*/ void hugetlb_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); atomic_inc(&folio->_entire_mapcount); atomic_inc(&folio->_large_mapcount); if (flags & RMAP_EXCLUSIVE) SetPageAnonExclusive(&folio->page); VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > 1 && PageAnonExclusive(&folio->page), folio); } void hugetlb_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address) { VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); BUG_ON(address < vma->vm_start || address >= vma->vm_end); /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); atomic_set(&folio->_large_mapcount, 0); folio_clear_hugetlb_restore_reserve(folio); __folio_set_anon(folio, vma, address, true); SetPageAnonExclusive(&folio->page); } #endif /* CONFIG_HUGETLB_PAGE */ |
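/*
 * Illustrative sketch, not part of rmap.c: how a caller typically wires up
 * rmap_walk() with a custom callback, following the same pattern
 * try_to_migrate() uses above.  count_one_mapping()/count_folio_mappings()
 * are hypothetical helpers; the rmap_walk_control fields (.rmap_one, .arg,
 * .anon_lock) and the "return true to keep walking" contract are the ones
 * this file actually uses.
 */
struct count_mappings_arg {
	unsigned int nr_vmas;
};

static bool count_one_mapping(struct folio *folio, struct vm_area_struct *vma,
			      unsigned long address, void *arg)
{
	struct count_mappings_arg *cma = arg;

	cma->nr_vmas++;
	return true;	/* keep walking the remaining VMAs */
}

static unsigned int count_folio_mappings(struct folio *folio)
{
	struct count_mappings_arg cma = { .nr_vmas = 0 };
	struct rmap_walk_control rwc = {
		.rmap_one  = count_one_mapping,
		.arg       = &cma,
		.anon_lock = folio_lock_anon_vma_read,
	};

	/* The caller must hold the folio lock, as rmap_walk_anon() asserts. */
	rmap_walk(folio, &rwc);
	return cma.nr_vmas;
}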
| 1 3 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/bad_inode.c * * Copyright (C) 1997, Stephen Tweedie * * Provide stub functions for unreadable inodes * * Fabian Frederick : August 2003 - All file operations assigned to EIO */ #include <linux/fs.h> #include <linux/export.h> #include <linux/stat.h> #include <linux/time.h> #include <linux/namei.h> #include <linux/poll.h> #include <linux/fiemap.h> static int bad_file_open(struct inode *inode, struct file *filp) { return -EIO; } static const struct file_operations bad_file_ops = { .open = bad_file_open, }; static int bad_inode_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return -EIO; } static struct dentry *bad_inode_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return ERR_PTR(-EIO); } static int bad_inode_link (struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_unlink(struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { return -EIO; } static struct dentry *bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { return ERR_PTR(-EIO); } static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { return -EIO; } static int bad_inode_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { return -EIO; } static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, int buflen) { return -EIO; } static int bad_inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { return -EIO; } static int bad_inode_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { return -EIO; } static int bad_inode_setattr(struct mnt_idmap *idmap, struct dentry *direntry, struct iattr *attrs) { return -EIO; } static ssize_t bad_inode_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { return -EIO; } static const char *bad_inode_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { return ERR_PTR(-EIO); } static struct posix_acl *bad_inode_get_acl(struct inode *inode, int type, bool rcu) { return ERR_PTR(-EIO); } static int bad_inode_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 
u64 start, u64 len) { return -EIO; } static int bad_inode_update_time(struct inode *inode, enum fs_update_time type, unsigned int flags) { return -EIO; } static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry, struct file *file, unsigned int open_flag, umode_t create_mode) { return -EIO; } static int bad_inode_tmpfile(struct mnt_idmap *idmap, struct inode *inode, struct file *file, umode_t mode) { return -EIO; } static int bad_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { return -EIO; } static const struct inode_operations bad_inode_ops = { .create = bad_inode_create, .lookup = bad_inode_lookup, .link = bad_inode_link, .unlink = bad_inode_unlink, .symlink = bad_inode_symlink, .mkdir = bad_inode_mkdir, .rmdir = bad_inode_rmdir, .mknod = bad_inode_mknod, .rename = bad_inode_rename2, .readlink = bad_inode_readlink, .permission = bad_inode_permission, .getattr = bad_inode_getattr, .setattr = bad_inode_setattr, .listxattr = bad_inode_listxattr, .get_link = bad_inode_get_link, .get_inode_acl = bad_inode_get_acl, .fiemap = bad_inode_fiemap, .update_time = bad_inode_update_time, .atomic_open = bad_inode_atomic_open, .tmpfile = bad_inode_tmpfile, .set_acl = bad_inode_set_acl, }; /* * When a filesystem is unable to read an inode due to an I/O error in * its read_inode() function, it can call make_bad_inode() to return a * set of stubs which will return EIO errors as required. * * We only need to do limited initialisation: all other fields are * preinitialised to zero automatically. */ /** * make_bad_inode - mark an inode bad due to an I/O error * @inode: Inode to mark bad * * When an inode cannot be read due to a media or remote network * failure this function makes the inode "bad" and causes I/O operations * on it to fail from this point on. */ void make_bad_inode(struct inode *inode) { remove_inode_hash(inode); inode->i_mode = S_IFREG; simple_inode_init_ts(inode); inode->i_op = &bad_inode_ops; inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &bad_file_ops; } EXPORT_SYMBOL(make_bad_inode); /* * This tests whether an inode has been flagged as bad. The test uses * &bad_inode_ops to cover the case of invalidated inodes as well as * those created by make_bad_inode() above. */ /** * is_bad_inode - is an inode errored * @inode: inode to test * * Returns true if the inode in question has been marked as bad. */ bool is_bad_inode(struct inode *inode) { return (inode->i_op == &bad_inode_ops); } EXPORT_SYMBOL(is_bad_inode); /** * iget_failed - Mark an under-construction inode as dead and release it * @inode: The inode to discard * * Mark an under-construction inode as dead and release it. */ void iget_failed(struct inode *inode) { make_bad_inode(inode); unlock_new_inode(inode); iput(inode); } EXPORT_SYMBOL(iget_failed); |
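/*
 * Illustrative sketch, not part of bad_inode.c: the usual way a filesystem's
 * inode lookup path consumes iget_failed() above.  myfs_iget() and
 * myfs_read_inode_from_disk() are hypothetical stand-ins for a filesystem's
 * own code; iget_locked(), the I_NEW check, unlock_new_inode() and
 * iget_failed() are the real VFS calls.
 */
static struct inode *myfs_iget(struct super_block *sb, unsigned long ino)
{
	struct inode *inode;
	int err;

	inode = iget_locked(sb, ino);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;		/* already in the inode cache */

	err = myfs_read_inode_from_disk(inode);	/* hypothetical helper */
	if (err) {
		/* Marks the inode bad, unlocks it and drops it, as documented above. */
		iget_failed(inode);
		return ERR_PTR(err);
	}

	unlock_new_inode(inode);
	return inode;
}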
| 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 * Phillip Lougher <phillip@squashfs.org.uk> * * block.c */ /* * This file implements the low-level routines to read and decompress * datablocks and metadata blocks. */ #include <linux/blkdev.h> #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/string.h> #include <linux/bio.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs.h" #include "decompressor.h" #include "page_actor.h" /* * Returns the amount of bytes copied to the page actor. 
*/ static int copy_bio_to_actor(struct bio *bio, struct squashfs_page_actor *actor, int offset, int req_length) { void *actor_addr; struct bvec_iter_all iter_all = {}; struct bio_vec *bvec = bvec_init_iter_all(&iter_all); int copied_bytes = 0; int actor_offset = 0; squashfs_actor_nobuff(actor); actor_addr = squashfs_first_page(actor); if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) return 0; while (copied_bytes < req_length) { int bytes_to_copy = min_t(int, bvec->bv_len - offset, PAGE_SIZE - actor_offset); bytes_to_copy = min_t(int, bytes_to_copy, req_length - copied_bytes); if (!IS_ERR(actor_addr)) memcpy(actor_addr + actor_offset, bvec_virt(bvec) + offset, bytes_to_copy); actor_offset += bytes_to_copy; copied_bytes += bytes_to_copy; offset += bytes_to_copy; if (actor_offset >= PAGE_SIZE) { actor_addr = squashfs_next_page(actor); if (!actor_addr) break; actor_offset = 0; } if (offset >= bvec->bv_len) { if (!bio_next_segment(bio, &iter_all)) break; offset = 0; } } squashfs_finish_page(actor); return copied_bytes; } static int squashfs_bio_read_cached(struct bio *fullbio, struct address_space *cache_mapping, u64 index, int length, u64 read_start, u64 read_end, int page_count) { struct folio *head_to_cache = NULL, *tail_to_cache = NULL; struct block_device *bdev = fullbio->bi_bdev; int start_idx = 0, end_idx = 0; struct folio_iter fi; struct bio *bio = NULL; int idx = 0; int err = 0; #ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL struct folio **cache_folios = kmalloc_objs(*cache_folios, page_count, GFP_KERNEL | __GFP_ZERO); #endif bio_for_each_folio_all(fi, fullbio) { struct folio *folio = fi.folio; if (folio->mapping == cache_mapping) { idx++; continue; } /* * We only use this when the device block size is the same as * the page size, so read_start and read_end cover full pages. * * Compare these to the original required index and length to * only cache pages which were requested partially, since these * are the ones which are likely to be needed when reading * adjacent blocks. 
*/ if (idx == 0 && index != read_start) head_to_cache = folio; else if (idx == page_count - 1 && index + length != read_end) tail_to_cache = folio; #ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL /* Cache all pages in the BIO for repeated reads */ else if (cache_folios) cache_folios[idx] = folio; #endif if (!bio || idx != end_idx) { struct bio *new = bio_alloc_clone(bdev, fullbio, GFP_NOIO, &fs_bio_set); if (bio) { bio_trim(bio, start_idx * PAGE_SECTORS, (end_idx - start_idx) * PAGE_SECTORS); bio_chain(bio, new); submit_bio(bio); } bio = new; start_idx = idx; } idx++; end_idx = idx; } if (bio) { bio_trim(bio, start_idx * PAGE_SECTORS, (end_idx - start_idx) * PAGE_SECTORS); err = submit_bio_wait(bio); bio_put(bio); } if (err) return err; if (head_to_cache) { int ret = filemap_add_folio(cache_mapping, head_to_cache, read_start >> PAGE_SHIFT, GFP_NOIO); if (!ret) { folio_mark_uptodate(head_to_cache); folio_unlock(head_to_cache); } } if (tail_to_cache) { int ret = filemap_add_folio(cache_mapping, tail_to_cache, (read_end >> PAGE_SHIFT) - 1, GFP_NOIO); if (!ret) { folio_mark_uptodate(tail_to_cache); folio_unlock(tail_to_cache); } } #ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL if (!cache_folios) goto out; for (idx = 0; idx < page_count; idx++) { if (!cache_folios[idx]) continue; int ret = filemap_add_folio(cache_mapping, cache_folios[idx], (read_start >> PAGE_SHIFT) + idx, GFP_NOIO); if (!ret) { folio_mark_uptodate(cache_folios[idx]); folio_unlock(cache_folios[idx]); } } kfree(cache_folios); out: #endif return 0; } static struct page *squashfs_get_cache_page(struct address_space *mapping, pgoff_t index) { struct page *page; if (!mapping) return NULL; page = find_get_page(mapping, index); if (!page) return NULL; if (!PageUptodate(page)) { put_page(page); return NULL; } return page; } static int squashfs_bio_read(struct super_block *sb, u64 index, int length, struct bio **biop, int *block_offset) { struct squashfs_sb_info *msblk = sb->s_fs_info; struct address_space *cache_mapping = msblk->cache_mapping; const u64 read_start = round_down(index, msblk->devblksize); const sector_t block = read_start >> msblk->devblksize_log2; const u64 read_end = round_up(index + length, msblk->devblksize); const sector_t block_end = read_end >> msblk->devblksize_log2; int offset = read_start - round_down(index, PAGE_SIZE); int total_len = (block_end - block) << msblk->devblksize_log2; const int page_count = DIV_ROUND_UP(total_len + offset, PAGE_SIZE); int error, i; struct bio *bio; bio = bio_kmalloc(page_count, GFP_NOIO); if (!bio) return -ENOMEM; bio_init_inline(bio, sb->s_bdev, page_count, REQ_OP_READ); bio->bi_iter.bi_sector = block * (msblk->devblksize >> SECTOR_SHIFT); for (i = 0; i < page_count; ++i) { unsigned int len = min_t(unsigned int, PAGE_SIZE - offset, total_len); pgoff_t index = (read_start >> PAGE_SHIFT) + i; struct page *page; page = squashfs_get_cache_page(cache_mapping, index); if (!page) page = alloc_page(GFP_NOIO); if (!page) { error = -ENOMEM; goto out_free_bio; } /* * Use the __ version to avoid merging since we need each page * to be separate when we check for and avoid cached pages. 
*/ __bio_add_page(bio, page, len, offset); offset = 0; total_len -= len; } if (cache_mapping) error = squashfs_bio_read_cached(bio, cache_mapping, index, length, read_start, read_end, page_count); else error = submit_bio_wait(bio); if (error) goto out_free_bio; *biop = bio; *block_offset = index & ((1 << msblk->devblksize_log2) - 1); return 0; out_free_bio: bio_free_pages(bio); bio_uninit(bio); kfree(bio); return error; } /* * Read and decompress a metadata block or datablock. Length is non-zero * if a datablock is being read (the size is stored elsewhere in the * filesystem), otherwise the length is obtained from the first two bytes of * the metadata block. A bit in the length field indicates if the block * is stored uncompressed in the filesystem (usually because compression * generated a larger block - this does occasionally happen with compression * algorithms). */ int squashfs_read_data(struct super_block *sb, u64 index, int length, u64 *next_index, struct squashfs_page_actor *output) { struct squashfs_sb_info *msblk = sb->s_fs_info; struct bio *bio = NULL; int compressed; int res; int offset; if (length) { /* * Datablock. */ compressed = SQUASHFS_COMPRESSED_BLOCK(length); length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length); TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n", index, compressed ? "" : "un", length, output->length); } else { /* * Metadata block. */ const u8 *data; struct bvec_iter_all iter_all = {}; struct bio_vec *bvec = bvec_init_iter_all(&iter_all); if (index + 2 > msblk->bytes_used) { res = -EIO; goto out; } res = squashfs_bio_read(sb, index, 2, &bio, &offset); if (res) goto out; if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) { res = -EIO; goto out_free_bio; } /* Extract the length of the metadata block */ data = bvec_virt(bvec); length = data[offset]; if (offset < bvec->bv_len - 1) { length |= data[offset + 1] << 8; } else { if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) { res = -EIO; goto out_free_bio; } data = bvec_virt(bvec); length |= data[0] << 8; } bio_free_pages(bio); bio_uninit(bio); kfree(bio); compressed = SQUASHFS_COMPRESSED(length); length = SQUASHFS_COMPRESSED_SIZE(length); index += 2; TRACE("Block @ 0x%llx, %scompressed size %d\n", index - 2, compressed ? "" : "un", length); } if (length <= 0 || length > output->length || (index + length) > msblk->bytes_used) { res = -EIO; goto out; } if (next_index) *next_index = index + length; res = squashfs_bio_read(sb, index, length, &bio, &offset); if (res) goto out; if (compressed) { if (!msblk->stream) { res = -EIO; goto out_free_bio; } res = msblk->thread_ops->decompress(msblk, bio, offset, length, output); } else { res = copy_bio_to_actor(bio, output, offset, length); } out_free_bio: bio_free_pages(bio); bio_uninit(bio); kfree(bio); out: if (res < 0) { ERROR("Failed to read block 0x%llx: %d\n", index, res); if (msblk->panic_on_errors) panic("squashfs read failed"); } return res; } |
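/*
 * Illustrative, self-contained sketch (not part of block.c): decoding the
 * two-byte metadata block header the same way squashfs_read_data() does
 * above.  The header is little-endian and one bit flags a block stored
 * uncompressed; using bit 15 for that flag mirrors the SQUASHFS_COMPRESSED*
 * macros in squashfs_fs.h and is stated here as an assumption.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define META_UNCOMPRESSED_BIT	(1u << 15)	/* assumed flag bit, see above */

struct meta_header {
	bool compressed;
	unsigned int length;	/* on-disk (possibly compressed) size */
};

static struct meta_header decode_meta_header(const uint8_t *data)
{
	/* Same little-endian assembly as the metadata branch in block.c. */
	unsigned int raw = data[0] | (data[1] << 8);

	return (struct meta_header) {
		.compressed = !(raw & META_UNCOMPRESSED_BIT),
		.length     = raw & ~META_UNCOMPRESSED_BIT,
	};
}

int main(void)
{
	const uint8_t hdr[2] = { 0x34, 0x12 };	/* raw 0x1234: compressed block */
	struct meta_header h = decode_meta_header(hdr);

	printf("compressed=%d length=%u\n", h.compressed, h.length);
	return 0;
}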
| 82 1 46 17 17 17 54 19 23 17 19 55 23 23 53 52 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * AppArmor security module * * This file contains AppArmor contexts used to associate "labels" to objects. * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2010 Canonical Ltd. */ #ifndef __AA_CONTEXT_H #define __AA_CONTEXT_H #include <linux/cred.h> #include <linux/slab.h> #include <linux/sched.h> #include "label.h" #include "policy_ns.h" #include "task.h" static inline struct aa_label *cred_label(const struct cred *cred) { struct aa_label **blob = cred->security + apparmor_blob_sizes.lbs_cred; AA_BUG(!blob); return *blob; } static inline void set_cred_label(const struct cred *cred, struct aa_label *label) { struct aa_label **blob = cred->security + apparmor_blob_sizes.lbs_cred; AA_BUG(!blob); *blob = label; } /** * aa_get_newest_cred_label - obtain the newest label on a cred * @cred: cred to obtain label from (NOT NULL) * * Returns: newest version of confining label */ static inline struct aa_label *aa_get_newest_cred_label(const struct cred *cred) { return aa_get_newest_label(cred_label(cred)); } static inline struct aa_label *aa_get_newest_cred_label_condref(const struct cred *cred, bool *needput) { struct aa_label *l = cred_label(cred); if (unlikely(label_is_stale(l))) { *needput = true; return aa_get_newest_label(l); } *needput = false; return l; } static inline void aa_put_label_condref(struct aa_label *l, bool needput) { if (unlikely(needput)) aa_put_label(l); } /** * aa_current_raw_label - find the current tasks confining label * * Returns: up to date confining label or the ns unconfined label (NOT NULL) * * This fn will not update the tasks cred to the most up to date version * of the label so it is safe to call when inside of locks. */ static inline struct aa_label *aa_current_raw_label(void) { return cred_label(current_cred()); } /** * aa_get_current_label - get the newest version of the current tasks label * * Returns: newest version of confining label (NOT NULL) * * This fn will not update the tasks cred, so it is safe inside of locks * * The returned reference must be put with aa_put_label() */ static inline struct aa_label *aa_get_current_label(void) { struct aa_label *l = aa_current_raw_label(); if (label_is_stale(l)) return aa_get_newest_label(l); return aa_get_label(l); } /** * __end_cred_crit_section - end crit section begun with __begin_... 
* @label: label obtained from __begin_cred_crit_section * @needput: output: bool set by __begin_cred_crit_section * * While the cred passed to __begin is guaranteed to not change * and the cred and label could be passed here instead of needput * using needput with a local var makes it easier for the compiler * and processor to optimize and speculatively execute the comparison * than chasing a pointer in the cred struct. */ static inline void __end_cred_crit_section(struct aa_label *label, bool needput) { if (unlikely(needput)) aa_put_label(label); } /** * __begin_cred_crit_section - @cred's confining label * @cred: current's cred to start a crit section on its label * @needput: store whether the label needs to be put when ending crit section * * Returns: up to date confining label or the ns unconfined label (NOT NULL) * * safe to call inside locks * * The returned reference must be put with __end_cred_crit_section() * This must NOT be used if the task cred could be updated within the * critical section between * __begin_cred_crit_section() .. __end_cred_crit_section() * * The crit section is an optimization to avoid having to get and put * the newest version of the label. While the cred won't change and * hence the label it contains won't change, the newest version of the * label can. During the crit section the newest versions of the label * will be used until the end of the crit section. * * If the label has not been updated at the start of the crit section * no refcount is taken, the cred's refcount is enough to hold the * label for the duration of the crit section. * * If the label has been updated then a refcount will be taken and the * newest version of the label will be returned. While the cred label * and the returned label could be compared at the end of the crit * section, needput is used because it allows better optimization by * the compiler and the processor's speculative execution. */ static inline struct aa_label *__begin_cred_crit_section(const struct cred *cred, bool *needput) { struct aa_label *label = cred_label(cred); if (label_is_stale(label)) { *needput = true; return aa_get_newest_label(label); } *needput = false; return label; } /** * __end_current_label_crit_section - end crit section begun with __begin_... * @label: label obtained from __begin_current_label_crit_section * @needput: output: bool set by __begin_current_label_crit_section * * wrapper around __end_cred_crit_section() to pair nicely with * __begin_current_label_crit_section() */ static inline void __end_current_label_crit_section(struct aa_label *label, bool needput) { __end_cred_crit_section(label, needput); } /** * end_current_label_crit_section - put a reference found with begin_current_label.. * @label: label reference to put * * Should only be used with a reference obtained with * begin_current_label_crit_section and never used in situations where the * task cred may be updated */ static inline void end_current_label_crit_section(struct aa_label *label) { if (label != aa_current_raw_label()) aa_put_label(label); } /** * __begin_current_label_crit_section - current's confining label * @needput: store whether the label needs to be put when ending crit section * * Returns: up to date confining label or the ns unconfined label (NOT NULL) * * safe to call inside locks * * The returned reference must be put with __end_current_label_crit_section() * This must NOT be used if the task cred could be updated within the * critical section between __begin_current_label_crit_section() .. 
* __end_current_label_crit_section() */ static inline struct aa_label *__begin_current_label_crit_section(bool *needput) { return __begin_cred_crit_section(current_cred(), needput); } /** * begin_current_label_crit_section - current's confining label and update it * * Returns: up to date confining label or the ns unconfined label (NOT NULL) * * Not safe to call inside locks * * The returned reference must be put with end_current_label_crit_section() * This must NOT be used if the task cred could be updated within the * critical section between begin_current_label_crit_section() .. * end_current_label_crit_section() */ static inline struct aa_label *begin_current_label_crit_section(void) { struct aa_label *label = aa_current_raw_label(); might_sleep(); if (label_is_stale(label)) { label = aa_get_newest_label(label); if (aa_replace_current_label(label) == 0) /* task cred will keep the reference */ aa_put_label(label); } return label; } static inline struct aa_ns *aa_get_current_ns(void) { struct aa_label *label; struct aa_ns *ns; bool needput; label = __begin_current_label_crit_section(&needput); ns = aa_get_ns(labels_ns(label)); __end_current_label_crit_section(label, needput); return ns; } #endif /* __AA_CONTEXT_H */ |
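/*
 * Illustrative sketch, not part of this header: the usual consumer pattern
 * for the crit-section helpers above.  my_check_something() and
 * my_profile_check() are hypothetical stand-ins for a real permission
 * check; the begin/end pairing and the needput rules are the contract
 * documented above.
 */
static int my_check_something(void)
{
	struct aa_label *label;
	bool needput;
	int error;

	/* Safe inside locks: the task cred is not updated in this section. */
	label = __begin_current_label_crit_section(&needput);
	error = my_profile_check(label);	/* hypothetical check */
	__end_current_label_crit_section(label, needput);

	return error;
}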
| 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Authors: Lotsa people, from code originally in tcp */ #ifndef _INET_HASHTABLES_H #define _INET_HASHTABLES_H #include <linux/interrupt.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/spinlock.h> #include <linux/types.h> #include <linux/wait.h> #include <net/inet_connection_sock.h> #include <net/inet_sock.h> #include <net/ip.h> #include <net/sock.h> #include <net/route.h> #include <net/tcp_states.h> #include <net/netns/hash.h> #include <linux/refcount.h> #include <asm/byteorder.h> /* This is for all connections with a full identity, no wildcards. * The 'e' prefix stands for Establish, but we really put all sockets * but LISTEN ones. */ struct inet_ehash_bucket { struct hlist_nulls_head chain; }; /* There are a few simple rules, which allow for local port reuse by * an application. In essence: * * 1) Sockets bound to different interfaces may share a local port. * Failing that, goto test 2. * 2) If all sockets have sk->sk_reuse set, and none of them are in * TCP_LISTEN state, the port may be shared. * Failing that, goto test 3. 
* 3) If all sockets are bound to a specific inet_sk(sk)->rcv_saddr local * address, and none of them are the same, the port may be * shared. * Failing this, the port cannot be shared. * * The interesting point, is test #2. This is what an FTP server does * all day. To optimize this case we use a specific flag bit defined * below. As we add sockets to a bind bucket list, we perform a * check of: (newsk->sk_reuse && (newsk->sk_state != TCP_LISTEN)) * As long as all sockets added to a bind bucket pass this test, * the flag bit will be set. * The resulting situation is that tcp_v[46]_verify_bind() can just check * for this flag bit, if it is set and the socket trying to bind has * sk->sk_reuse set, we don't even have to walk the owners list at all, * we return that it is ok to bind this socket to the requested local port. * * Sounds like a lot of work, but it is worth it. In a more naive * implementation (ie. current FreeBSD etc.) the entire list of ports * must be walked for each data port opened by an ftp server. Needless * to say, this does not scale at all. With a couple thousand FTP * users logged onto your box, isn't it nice to know that new data * ports are created in O(1) time? I thought so. ;-) -DaveM */ #define FASTREUSEPORT_ANY 1 #define FASTREUSEPORT_STRICT 2 struct inet_bind_bucket { possible_net_t ib_net; int l3mdev; unsigned short port; signed char fastreuse; signed char fastreuseport; kuid_t fastuid; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr fast_v6_rcv_saddr; #endif __be32 fast_rcv_saddr; unsigned short fast_sk_family; bool fast_ipv6_only; struct hlist_node node; struct hlist_head bhash2; struct rcu_head rcu; }; struct inet_bind2_bucket { possible_net_t ib_net; int l3mdev; unsigned short port; #if IS_ENABLED(CONFIG_IPV6) unsigned short addr_type; struct in6_addr v6_rcv_saddr; #define rcv_saddr v6_rcv_saddr.s6_addr32[3] #else __be32 rcv_saddr; #endif /* Node in the bhash2 inet_bind_hashbucket chain */ struct hlist_node node; struct hlist_node bhash_node; /* List of sockets hashed to this bucket */ struct hlist_head owners; signed char fastreuse; signed char fastreuseport; }; static inline struct net *ib_net(const struct inet_bind_bucket *ib) { return read_pnet(&ib->ib_net); } static inline struct net *ib2_net(const struct inet_bind2_bucket *ib) { return read_pnet(&ib->ib_net); } #define inet_bind_bucket_for_each(tb, head) \ hlist_for_each_entry(tb, head, node) struct inet_bind_hashbucket { spinlock_t lock; struct hlist_head chain; }; /* Sockets can be hashed in established or listening table. * We must use different 'nulls' end-of-chain value for all hash buckets : * A socket might transition from ESTABLISH to LISTEN state without * RCU grace period. A lookup in ehash table needs to handle this case. */ #define LISTENING_NULLS_BASE (1U << 29) struct inet_listen_hashbucket { spinlock_t lock; struct hlist_nulls_head nulls_head; }; /* This is for listening sockets, thus all sockets which possess wildcards. */ #define INET_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */ struct inet_hashinfo { /* This is for sockets with full identity only. Sockets here will * always be without wildcards and will have the following invariant: * * TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE * */ struct inet_ehash_bucket *ehash; spinlock_t *ehash_locks; unsigned int ehash_mask; unsigned int ehash_locks_mask; /* Ok, let's try this, I give up, we do need a local binding * TCP hash as well as the others for fast bind/connect. 
*/ struct kmem_cache *bind_bucket_cachep; /* This bind table is hashed by local port */ struct inet_bind_hashbucket *bhash; struct kmem_cache *bind2_bucket_cachep; /* This bind table is hashed by local port and sk->sk_rcv_saddr (ipv4) * or sk->sk_v6_rcv_saddr (ipv6). This 2nd bind table is used * primarily for expediting bind conflict resolution. */ struct inet_bind_hashbucket *bhash2; unsigned int bhash_size; /* The 2nd listener table hashed by local port and address */ unsigned int lhash2_mask; struct inet_listen_hashbucket *lhash2; bool pernet; } ____cacheline_aligned_in_smp; static inline struct inet_hashinfo *tcp_get_hashinfo(const struct sock *sk) { return sock_net(sk)->ipv4.tcp_death_row.hashinfo; } static inline struct inet_listen_hashbucket * inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash) { return &h->lhash2[hash & h->lhash2_mask]; } static inline struct inet_ehash_bucket *inet_ehash_bucket( struct inet_hashinfo *hashinfo, unsigned int hash) { return &hashinfo->ehash[hash & hashinfo->ehash_mask]; } static inline spinlock_t *inet_ehash_lockp( struct inet_hashinfo *hashinfo, unsigned int hash) { return &hashinfo->ehash_locks[hash & hashinfo->ehash_locks_mask]; } int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo); static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) { kvfree(hashinfo->ehash_locks); hashinfo->ehash_locks = NULL; } struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, unsigned int ehash_entries); void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo); struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, const unsigned short snum, int l3mdev); void inet_bind_bucket_destroy(struct inet_bind_bucket *tb); bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, unsigned short port, int l3mdev); struct inet_bind2_bucket * inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, struct inet_bind_bucket *tb, const struct sock *sk); void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb); struct inet_bind2_bucket * inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk); bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk); static inline u32 inet_bhashfn(const struct net *net, const __u16 lport, const u32 bhash_size) { return (lport + net_hash_mix(net)) & (bhash_size - 1); } static inline struct inet_bind_hashbucket * inet_bhashfn_portaddr(const struct inet_hashinfo *hinfo, const struct sock *sk, const struct net *net, unsigned short port) { u32 hash; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) hash = ipv6_portaddr_hash(net, &sk->sk_v6_rcv_saddr, port); else #endif hash = ipv4_portaddr_hash(net, sk->sk_rcv_saddr, port); return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; } static inline bool inet_use_hash2_on_bind(const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) return false; if (!ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) return true; } #endif return sk->sk_rcv_saddr != htonl(INADDR_ANY); } struct inet_bind_hashbucket * inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port); /* This should be called whenever a 
socket's sk_rcv_saddr (ipv4) or * sk_v6_rcv_saddr (ipv6) changes after it has been binded. The socket's * rcv_saddr field should already have been updated when this is called. */ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family); void inet_bhash2_reset_saddr(struct sock *sk); void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, struct inet_bind2_bucket *tb2, unsigned short port); /* Caller must disable local BH processing. */ int __inet_inherit_port(const struct sock *sk, struct sock *child); void inet_put_port(struct sock *sk); void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, unsigned long numentries, int scale, unsigned long low_limit, unsigned long high_limit); bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk); bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk); int inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); struct sock *__inet_lookup_listener(const struct net *net, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif); static inline struct sock *inet_lookup_listener(struct net *net, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif, int sdif) { return __inet_lookup_listener(net, skb, doff, saddr, sport, daddr, ntohs(dport), dif, sdif); } /* Socket demux engine toys. */ /* What happens here is ugly; there's a pair of adjacent fields in struct inet_sock; __be16 dport followed by __u16 num. We want to search by pair, so we combine the keys into a single 32bit value and compare with 32bit value read from &...->dport. Let's at least make sure that it's not mixed with anything else... On 64bit targets we combine comparisons with pair of adjacent __be32 fields in the same way. */ #ifdef __BIG_ENDIAN #define INET_COMBINED_PORTS(__sport, __dport) \ ((__force __portpair)(((__force __u32)(__be16)(__sport) << 16) | (__u32)(__dport))) #else /* __LITTLE_ENDIAN */ #define INET_COMBINED_PORTS(__sport, __dport) \ ((__force __portpair)(((__u32)(__dport) << 16) | (__force __u32)(__be16)(__sport))) #endif #ifdef __BIG_ENDIAN #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ const __addrpair __name = (__force __addrpair) ( \ (((__force __u64)(__be32)(__saddr)) << 32) | \ ((__force __u64)(__be32)(__daddr))) #else /* __LITTLE_ENDIAN */ #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ const __addrpair __name = (__force __addrpair) ( \ (((__force __u64)(__be32)(__daddr)) << 32) | \ ((__force __u64)(__be32)(__saddr))) #endif /* __BIG_ENDIAN */ static inline bool inet_match(const struct net *net, const struct sock *sk, const __addrpair cookie, const __portpair ports, int dif, int sdif) { if (!net_eq(sock_net(sk), net) || READ_ONCE(sk->sk_portpair) != ports || sk->sk_addrpair != cookie) return false; /* READ_ONCE() paired with WRITE_ONCE() in sock_bindtoindex_locked() */ return inet_sk_bound_dev_eq(net, READ_ONCE(sk->sk_bound_dev_if), dif, sdif); } /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need * not check it for lookups anymore, thanks Alexey. 
-DaveM */ struct sock *__inet_lookup_established(const struct net *net, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 hnum, const int dif, const int sdif); typedef u32 (inet_ehashfn_t)(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport); inet_ehashfn_t inet_ehashfn; INDIRECT_CALLABLE_DECLARE(inet_ehashfn_t udp_ehashfn); struct sock *inet_lookup_reuseport(const struct net *net, struct sock *sk, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, unsigned short hnum, inet_ehashfn_t *ehashfn); struct sock *inet_lookup_run_sk_lookup(const struct net *net, int protocol, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, u16 hnum, const int dif, inet_ehashfn_t *ehashfn); static inline struct sock *inet_lookup_established(struct net *net, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif) { return __inet_lookup_established(net, saddr, sport, daddr, ntohs(dport), dif, 0); } static inline struct sock *__inet_lookup(struct net *net, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif, const int sdif, bool *refcounted) { u16 hnum = ntohs(dport); struct sock *sk; sk = __inet_lookup_established(net, saddr, sport, daddr, hnum, dif, sdif); *refcounted = true; if (sk) return sk; *refcounted = false; return __inet_lookup_listener(net, skb, doff, saddr, sport, daddr, hnum, dif, sdif); } static inline struct sock *inet_lookup(struct net *net, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif) { struct sock *sk; bool refcounted; sk = __inet_lookup(net, skb, doff, saddr, sport, daddr, dport, dif, 0, &refcounted); if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; } static inline struct sock *inet_steal_sock(struct net *net, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, bool *refcounted, inet_ehashfn_t *ehashfn) { struct sock *sk, *reuse_sk; bool prefetched; sk = skb_steal_sock(skb, refcounted, &prefetched); if (!sk) return NULL; if (!prefetched || !sk_fullsock(sk)) return sk; if (sk->sk_protocol == IPPROTO_TCP) { if (sk->sk_state != TCP_LISTEN) return sk; } else if (sk->sk_protocol == IPPROTO_UDP) { if (sk->sk_state != TCP_CLOSE) return sk; } else { return sk; } reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, ntohs(dport), ehashfn); if (!reuse_sk) return sk; /* We've chosen a new reuseport sock which is never refcounted. This * implies that sk also isn't refcounted. 
*/ WARN_ON_ONCE(*refcounted); return reuse_sk; } static inline struct sock *__inet_lookup_skb(struct sk_buff *skb, int doff, const __be16 sport, const __be16 dport, const int sdif, bool *refcounted) { struct net *net = skb_dst_dev_net_rcu(skb); const struct iphdr *iph = ip_hdr(skb); struct sock *sk; sk = inet_steal_sock(net, skb, doff, iph->saddr, sport, iph->daddr, dport, refcounted, inet_ehashfn); if (IS_ERR(sk)) return NULL; if (sk) return sk; return __inet_lookup(net, skb, doff, iph->saddr, sport, iph->daddr, dport, inet_iif(skb), sdif, refcounted); } static inline void sk_daddr_set(struct sock *sk, __be32 addr) { sk->sk_daddr = addr; /* alias of inet_daddr */ #if IS_ENABLED(CONFIG_IPV6) ipv6_addr_set_v4mapped(addr, &sk->sk_v6_daddr); #endif } static inline void sk_rcv_saddr_set(struct sock *sk, __be32 addr) { sk->sk_rcv_saddr = addr; /* alias of inet_rcv_saddr */ #if IS_ENABLED(CONFIG_IPV6) ipv6_addr_set_v4mapped(addr, &sk->sk_v6_rcv_saddr); #endif } int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u64 port_offset, u32 hash_port0, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, struct inet_timewait_sock **, bool rcu_lookup, u32 hash)); int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk); #endif /* _INET_HASHTABLES_H */ |
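/*
 * Illustrative, self-contained sketch (not from this header): the idea
 * behind INET_COMBINED_PORTS above is to pack the {source, destination}
 * port pair into a single 32-bit word so established-socket lookup can
 * compare both ports with one load and compare.  This standalone demo
 * mirrors the little-endian layout only; the kernel macros additionally
 * carry the __be16/__portpair sparse annotations.
 */
#include <stdint.h>
#include <stdio.h>

/* Little-endian layout: dport in the high half, sport in the low half. */
static uint32_t combine_ports_le(uint16_t sport, uint16_t dport)
{
	return ((uint32_t)dport << 16) | sport;
}

int main(void)
{
	/* Two flows that differ only in source port get different keys. */
	uint32_t a = combine_ports_le(1234, 80);
	uint32_t b = combine_ports_le(1235, 80);

	printf("a=%#x b=%#x equal=%d\n", a, b, a == b);
	return 0;
}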
| 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 | /* SPDX-License-Identifier: GPL-2.0 */ /* * bvec iterator * * Copyright (C) 2001 Ming Lei <ming.lei@canonical.com> */ #ifndef __LINUX_BVEC_H #define __LINUX_BVEC_H #include <linux/highmem.h> #include <linux/bug.h> #include <linux/errno.h> #include <linux/limits.h> #include <linux/minmax.h> #include <linux/types.h> struct page; /** * struct bio_vec - a contiguous range of physical memory addresses * @bv_page: First page associated with the address range. * @bv_len: Number of bytes in the address range. * @bv_offset: Start of the address range relative to the start of @bv_page. * * All pages within a bio_vec starting from @bv_page are contiguous and * can simply be iterated (see bvec_advance()). */ struct bio_vec { struct page *bv_page; unsigned int bv_len; unsigned int bv_offset; }; /** * bvec_set_page - initialize a bvec based off a struct page * @bv: bvec to initialize * @page: page the bvec should point to * @len: length of the bvec * @offset: offset into the page */ static inline void bvec_set_page(struct bio_vec *bv, struct page *page, unsigned int len, unsigned int offset) { bv->bv_page = page; bv->bv_len = len; bv->bv_offset = offset; } /** * bvec_set_folio - initialize a bvec based off a struct folio * @bv: bvec to initialize * @folio: folio the bvec should point to * @len: length of the bvec * @offset: offset into the folio */ static inline void bvec_set_folio(struct bio_vec *bv, struct folio *folio, size_t len, size_t offset) { unsigned long nr = offset / PAGE_SIZE; WARN_ON_ONCE(len > UINT_MAX); bvec_set_page(bv, folio_page(folio, nr), len, offset % PAGE_SIZE); } /** * bvec_set_virt - initialize a bvec based on a virtual address * @bv: bvec to initialize * @vaddr: virtual address to set the bvec to * @len: length of the bvec */ static inline void bvec_set_virt(struct bio_vec *bv, void *vaddr, unsigned int len) { bvec_set_page(bv, virt_to_page(vaddr), len, offset_in_page(vaddr)); } struct bvec_iter { /* * Current device address in 512 byte sectors. Only updated by the bio * iter wrappers and not the bvec iterator helpers themselves. */ sector_t bi_sector; /* * Remaining size in bytes. */ unsigned int bi_size; /* * Current index into the bvec array. This indexes into `bi_io_vec` when * iterating a bvec array that is part of a `bio`. */ unsigned int bi_idx; /* * Current offset in the bvec entry pointed to by `bi_idx`. 
*/ unsigned int bi_bvec_done; } __packed __aligned(4); struct bvec_iter_all { struct bio_vec bv; int idx; unsigned done; }; /* * various member access, note that bio_data should of course not be used * on highmem page vectors */ #define __bvec_iter_bvec(bvec, iter) (&(bvec)[(iter).bi_idx]) /* multi-page (mp_bvec) helpers */ #define mp_bvec_iter_page(bvec, iter) \ (__bvec_iter_bvec((bvec), (iter))->bv_page) #define mp_bvec_iter_len(bvec, iter) \ min((iter).bi_size, \ __bvec_iter_bvec((bvec), (iter))->bv_len - (iter).bi_bvec_done) #define mp_bvec_iter_offset(bvec, iter) \ (__bvec_iter_bvec((bvec), (iter))->bv_offset + (iter).bi_bvec_done) #define mp_bvec_iter_page_idx(bvec, iter) \ (mp_bvec_iter_offset((bvec), (iter)) / PAGE_SIZE) #define mp_bvec_iter_bvec(bvec, iter) \ ((struct bio_vec) { \ .bv_page = mp_bvec_iter_page((bvec), (iter)), \ .bv_len = mp_bvec_iter_len((bvec), (iter)), \ .bv_offset = mp_bvec_iter_offset((bvec), (iter)), \ }) /* For building single-page bvec in flight */ #define bvec_iter_offset(bvec, iter) \ (mp_bvec_iter_offset((bvec), (iter)) % PAGE_SIZE) #define bvec_iter_len(bvec, iter) \ min_t(unsigned, mp_bvec_iter_len((bvec), (iter)), \ PAGE_SIZE - bvec_iter_offset((bvec), (iter))) #define bvec_iter_page(bvec, iter) \ (mp_bvec_iter_page((bvec), (iter)) + \ mp_bvec_iter_page_idx((bvec), (iter))) #define bvec_iter_bvec(bvec, iter) \ ((struct bio_vec) { \ .bv_page = bvec_iter_page((bvec), (iter)), \ .bv_len = bvec_iter_len((bvec), (iter)), \ .bv_offset = bvec_iter_offset((bvec), (iter)), \ }) static inline bool bvec_iter_advance(const struct bio_vec *bv, struct bvec_iter *iter, unsigned bytes) { unsigned int idx = iter->bi_idx; if (WARN_ONCE(bytes > iter->bi_size, "Attempted to advance past end of bvec iter\n")) { iter->bi_size = 0; return false; } iter->bi_size -= bytes; bytes += iter->bi_bvec_done; while (bytes && bytes >= bv[idx].bv_len) { bytes -= bv[idx].bv_len; idx++; } iter->bi_idx = idx; iter->bi_bvec_done = bytes; return true; } /* * A simpler version of bvec_iter_advance(), @bytes should not span * across multiple bvec entries, i.e. 
bytes <= bv[i->bi_idx].bv_len */ static inline void bvec_iter_advance_single(const struct bio_vec *bv, struct bvec_iter *iter, unsigned int bytes) { unsigned int done = iter->bi_bvec_done + bytes; if (done == bv[iter->bi_idx].bv_len) { done = 0; iter->bi_idx++; } iter->bi_bvec_done = done; iter->bi_size -= bytes; } #define for_each_bvec(bvl, bio_vec, iter, start) \ for (iter = (start); \ (iter).bi_size && \ ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \ bvec_iter_advance_single((bio_vec), &(iter), (bvl).bv_len)) #define for_each_mp_bvec(bvl, bio_vec, iter, start) \ for (iter = (start); \ (iter).bi_size && \ ((bvl = mp_bvec_iter_bvec((bio_vec), (iter))), 1); \ bvec_iter_advance_single((bio_vec), &(iter), (bvl).bv_len)) static inline struct bio_vec *bvec_init_iter_all(struct bvec_iter_all *iter_all) { iter_all->done = 0; iter_all->idx = 0; return &iter_all->bv; } static inline void bvec_advance(const struct bio_vec *bvec, struct bvec_iter_all *iter_all) { struct bio_vec *bv = &iter_all->bv; if (iter_all->done) { bv->bv_page++; bv->bv_offset = 0; } else { bv->bv_page = bvec->bv_page + (bvec->bv_offset >> PAGE_SHIFT); bv->bv_offset = bvec->bv_offset & ~PAGE_MASK; } bv->bv_len = min_t(unsigned int, PAGE_SIZE - bv->bv_offset, bvec->bv_len - iter_all->done); iter_all->done += bv->bv_len; if (iter_all->done == bvec->bv_len) { iter_all->idx++; iter_all->done = 0; } } /** * bvec_kmap_local - map a bvec into the kernel virtual address space * @bvec: bvec to map * * Must be called on single-page bvecs only. Call kunmap_local on the returned * address to unmap. */ static inline void *bvec_kmap_local(struct bio_vec *bvec) { return kmap_local_page(bvec->bv_page) + bvec->bv_offset; } /** * memcpy_from_bvec - copy data from a bvec * @bvec: bvec to copy from * * Must be called on single-page bvecs only. */ static inline void memcpy_from_bvec(char *to, struct bio_vec *bvec) { memcpy_from_page(to, bvec->bv_page, bvec->bv_offset, bvec->bv_len); } /** * memcpy_to_bvec - copy data to a bvec * @bvec: bvec to copy to * * Must be called on single-page bvecs only. */ static inline void memcpy_to_bvec(struct bio_vec *bvec, const char *from) { memcpy_to_page(bvec->bv_page, bvec->bv_offset, from, bvec->bv_len); } /** * memzero_bvec - zero all data in a bvec * @bvec: bvec to zero * * Must be called on single-page bvecs only. */ static inline void memzero_bvec(struct bio_vec *bvec) { memzero_page(bvec->bv_page, bvec->bv_offset, bvec->bv_len); } /** * bvec_virt - return the virtual address for a bvec * @bvec: bvec to return the virtual address for * * Note: the caller must ensure that @bvec->bv_page is not a highmem page. */ static inline void *bvec_virt(struct bio_vec *bvec) { WARN_ON_ONCE(PageHighMem(bvec->bv_page)); return page_address(bvec->bv_page) + bvec->bv_offset; } /** * bvec_phys - return the physical address for a bvec * @bvec: bvec to return the physical address for */ static inline phys_addr_t bvec_phys(const struct bio_vec *bvec) { return page_to_phys(bvec->bv_page) + bvec->bv_offset; } #endif /* __LINUX_BVEC_H */ |
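/*
 * Illustrative sketch, not part of this header: walking a bvec array with
 * for_each_bvec().  count_bvec_bytes() is a hypothetical helper; the
 * iterator setup (bi_size covering the whole payload, bi_idx and
 * bi_bvec_done starting at zero) follows the struct bvec_iter fields
 * documented above, and each step yields a single-page bvec as built by
 * bvec_iter_bvec().
 */
static size_t count_bvec_bytes(const struct bio_vec *bvecs, unsigned int total_len)
{
	struct bvec_iter start = { .bi_size = total_len };
	struct bvec_iter iter;
	struct bio_vec bv;
	size_t bytes = 0;

	for_each_bvec(bv, bvecs, iter, start)
		bytes += bv.bv_len;

	return bytes;
}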
// SPDX-License-Identifier: GPL-2.0+ /* * NILFS checkpoint file. * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * * Written by Koji Sato. */ #include <linux/kernel.h> #include <linux/fs.h> #include <linux/string.h> #include <linux/buffer_head.h> #include <linux/errno.h> #include "mdt.h" #include "cpfile.h" static inline unsigned long nilfs_cpfile_checkpoints_per_block(const struct inode *cpfile) { return NILFS_MDT(cpfile)->mi_entries_per_block; } /* block number from the beginning of the file */ static unsigned long nilfs_cpfile_get_blkoff(const struct inode *cpfile, __u64 cno) { __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1; tcno = div64_ul(tcno, nilfs_cpfile_checkpoints_per_block(cpfile)); return (unsigned long)tcno; } /* offset in block */ static unsigned long nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 cno) { __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1; return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile)); } static __u64 nilfs_cpfile_first_checkpoint_in_block(const struct inode *cpfile, unsigned long blkoff) { return (__u64)nilfs_cpfile_checkpoints_per_block(cpfile) * blkoff + 1 - NILFS_MDT(cpfile)->mi_first_entry_offset; } static unsigned long nilfs_cpfile_checkpoints_in_block(const struct inode *cpfile, __u64 curr, __u64 max) { return min_t(__u64, nilfs_cpfile_checkpoints_per_block(cpfile) - nilfs_cpfile_get_offset(cpfile, curr), max - curr); } static inline int nilfs_cpfile_is_in_first(const struct inode *cpfile, __u64 cno) { return nilfs_cpfile_get_blkoff(cpfile, cno) == 0; } static unsigned int nilfs_cpfile_block_add_valid_checkpoints(const struct inode *cpfile, struct buffer_head *bh, unsigned int n) { struct nilfs_checkpoint *cp; unsigned int count; cp = kmap_local_folio(bh->b_folio, offset_in_folio(bh->b_folio, bh->b_data)); count = le32_to_cpu(cp->cp_checkpoints_count) + n; cp->cp_checkpoints_count = cpu_to_le32(count); kunmap_local(cp); return count; } static unsigned int nilfs_cpfile_block_sub_valid_checkpoints(const struct inode *cpfile, struct buffer_head *bh, unsigned int n) { struct nilfs_checkpoint *cp; unsigned int count; cp = kmap_local_folio(bh->b_folio, offset_in_folio(bh->b_folio, bh->b_data)); WARN_ON(le32_to_cpu(cp->cp_checkpoints_count) < n); count =
le32_to_cpu(cp->cp_checkpoints_count) - n; cp->cp_checkpoints_count = cpu_to_le32(count); kunmap_local(cp); return count; } static void nilfs_cpfile_block_init(struct inode *cpfile, struct buffer_head *bh, void *from) { struct nilfs_checkpoint *cp = from; size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; int n = nilfs_cpfile_checkpoints_per_block(cpfile); while (n-- > 0) { nilfs_checkpoint_set_invalid(cp); cp = (void *)cp + cpsz; } } /** * nilfs_cpfile_checkpoint_offset - calculate the byte offset of a checkpoint * entry in the folio containing it * @cpfile: checkpoint file inode * @cno: checkpoint number * @bh: buffer head of block containing checkpoint indexed by @cno * * Return: Byte offset in the folio of the checkpoint specified by @cno. */ static size_t nilfs_cpfile_checkpoint_offset(const struct inode *cpfile, __u64 cno, struct buffer_head *bh) { return offset_in_folio(bh->b_folio, bh->b_data) + nilfs_cpfile_get_offset(cpfile, cno) * NILFS_MDT(cpfile)->mi_entry_size; } /** * nilfs_cpfile_cp_snapshot_list_offset - calculate the byte offset of a * checkpoint snapshot list in the folio * containing it * @cpfile: checkpoint file inode * @cno: checkpoint number * @bh: buffer head of block containing checkpoint indexed by @cno * * Return: Byte offset in the folio of the checkpoint snapshot list specified * by @cno. */ static size_t nilfs_cpfile_cp_snapshot_list_offset(const struct inode *cpfile, __u64 cno, struct buffer_head *bh) { return nilfs_cpfile_checkpoint_offset(cpfile, cno, bh) + offsetof(struct nilfs_checkpoint, cp_snapshot_list); } /** * nilfs_cpfile_ch_snapshot_list_offset - calculate the byte offset of the * snapshot list in the header * * Return: Byte offset in the folio of the checkpoint snapshot list */ static size_t nilfs_cpfile_ch_snapshot_list_offset(void) { return offsetof(struct nilfs_cpfile_header, ch_snapshot_list); } static int nilfs_cpfile_get_header_block(struct inode *cpfile, struct buffer_head **bhp) { int err = nilfs_mdt_get_block(cpfile, 0, 0, NULL, bhp); if (unlikely(err == -ENOENT)) { nilfs_error(cpfile->i_sb, "missing header block in checkpoint metadata"); err = -EIO; } return err; } static inline int nilfs_cpfile_get_checkpoint_block(struct inode *cpfile, __u64 cno, int create, struct buffer_head **bhp) { return nilfs_mdt_get_block(cpfile, nilfs_cpfile_get_blkoff(cpfile, cno), create, nilfs_cpfile_block_init, bhp); } /** * nilfs_cpfile_find_checkpoint_block - find and get a buffer on cpfile * @cpfile: inode of cpfile * @start_cno: start checkpoint number (inclusive) * @end_cno: end checkpoint number (inclusive) * @cnop: place to store the next checkpoint number * @bhp: place to store a pointer to buffer_head struct * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOENT - no block exists in the range. * * %-ENOMEM - Insufficient memory available. */ static int nilfs_cpfile_find_checkpoint_block(struct inode *cpfile, __u64 start_cno, __u64 end_cno, __u64 *cnop, struct buffer_head **bhp) { unsigned long start, end, blkoff; int ret; if (unlikely(start_cno > end_cno)) return -ENOENT; start = nilfs_cpfile_get_blkoff(cpfile, start_cno); end = nilfs_cpfile_get_blkoff(cpfile, end_cno); ret = nilfs_mdt_find_block(cpfile, start, end, &blkoff, bhp); if (!ret) *cnop = (blkoff == start) ? 
start_cno : nilfs_cpfile_first_checkpoint_in_block(cpfile, blkoff); return ret; } static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile, __u64 cno) { return nilfs_mdt_delete_block(cpfile, nilfs_cpfile_get_blkoff(cpfile, cno)); } /** * nilfs_cpfile_read_checkpoint - read a checkpoint entry in cpfile * @cpfile: checkpoint file inode * @cno: number of checkpoint entry to read * @root: nilfs root object * @ifile: ifile's inode to read and attach to @root * * This function imports checkpoint information from the checkpoint file and * stores it to the inode file given by @ifile and the nilfs root object * given by @root. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - Invalid checkpoint. * * %-ENOMEM - Insufficient memory available. * * %-EIO - I/O error (including metadata corruption). */ int nilfs_cpfile_read_checkpoint(struct inode *cpfile, __u64 cno, struct nilfs_root *root, struct inode *ifile) { struct buffer_head *cp_bh; struct nilfs_checkpoint *cp; size_t offset; int ret; if (cno < 1 || cno > nilfs_mdt_cno(cpfile)) return -EINVAL; down_read(&NILFS_MDT(cpfile)->mi_sem); ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); if (unlikely(ret < 0)) { if (ret == -ENOENT) ret = -EINVAL; goto out_sem; } offset = nilfs_cpfile_checkpoint_offset(cpfile, cno, cp_bh); cp = kmap_local_folio(cp_bh->b_folio, offset); if (nilfs_checkpoint_invalid(cp)) { ret = -EINVAL; goto put_cp; } ret = nilfs_read_inode_common(ifile, &cp->cp_ifile_inode); if (unlikely(ret)) { /* * Since this inode is on a checkpoint entry, treat errors * as metadata corruption. */ nilfs_err(cpfile->i_sb, "ifile inode (checkpoint number=%llu) corrupted", (unsigned long long)cno); ret = -EIO; goto put_cp; } /* Configure the nilfs root object */ atomic64_set(&root->inodes_count, le64_to_cpu(cp->cp_inodes_count)); atomic64_set(&root->blocks_count, le64_to_cpu(cp->cp_blocks_count)); root->ifile = ifile; put_cp: kunmap_local(cp); brelse(cp_bh); out_sem: up_read(&NILFS_MDT(cpfile)->mi_sem); return ret; } /** * nilfs_cpfile_create_checkpoint - create a checkpoint entry on cpfile * @cpfile: checkpoint file inode * @cno: number of checkpoint to set up * * This function creates a checkpoint with the number specified by @cno on * cpfile. If the specified checkpoint entry already exists due to a past * failure, it will be reused without returning an error. * In either case, the buffer of the block containing the checkpoint entry * and the cpfile inode are made dirty for inclusion in the write log. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-ENOMEM - Insufficient memory available. * * %-EIO - I/O error (including metadata corruption). 
* * %-EROFS - Read only filesystem */ int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno) { struct buffer_head *header_bh, *cp_bh; struct nilfs_cpfile_header *header; struct nilfs_checkpoint *cp; size_t offset; int ret; if (WARN_ON_ONCE(cno < 1)) return -EIO; down_write(&NILFS_MDT(cpfile)->mi_sem); ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); if (unlikely(ret < 0)) goto out_sem; ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 1, &cp_bh); if (unlikely(ret < 0)) goto out_header; offset = nilfs_cpfile_checkpoint_offset(cpfile, cno, cp_bh); cp = kmap_local_folio(cp_bh->b_folio, offset); if (nilfs_checkpoint_invalid(cp)) { /* a newly-created checkpoint */ nilfs_checkpoint_clear_invalid(cp); kunmap_local(cp); if (!nilfs_cpfile_is_in_first(cpfile, cno)) nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh, 1); header = kmap_local_folio(header_bh->b_folio, 0); le64_add_cpu(&header->ch_ncheckpoints, 1); kunmap_local(header); mark_buffer_dirty(header_bh); } else { kunmap_local(cp); } /* Force the buffer and the inode to become dirty */ mark_buffer_dirty(cp_bh); brelse(cp_bh); nilfs_mdt_mark_dirty(cpfile); out_header: brelse(header_bh); out_sem: up_write(&NILFS_MDT(cpfile)->mi_sem); return ret; } /** * nilfs_cpfile_finalize_checkpoint - fill in a checkpoint entry in cpfile * @cpfile: checkpoint file inode * @cno: checkpoint number * @root: nilfs root object * @blkinc: number of blocks added by this checkpoint * @ctime: checkpoint creation time * @minor: minor checkpoint flag * * This function completes the checkpoint entry numbered by @cno in the * cpfile with the data given by the arguments @root, @blkinc, @ctime, and * @minor. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-ENOMEM - Insufficient memory available. * * %-EIO - I/O error (including metadata corruption). */ int nilfs_cpfile_finalize_checkpoint(struct inode *cpfile, __u64 cno, struct nilfs_root *root, __u64 blkinc, time64_t ctime, bool minor) { struct buffer_head *cp_bh; struct nilfs_checkpoint *cp; size_t offset; int ret; if (WARN_ON_ONCE(cno < 1)) return -EIO; down_write(&NILFS_MDT(cpfile)->mi_sem); ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); if (unlikely(ret < 0)) { if (ret == -ENOENT) goto error; goto out_sem; } offset = nilfs_cpfile_checkpoint_offset(cpfile, cno, cp_bh); cp = kmap_local_folio(cp_bh->b_folio, offset); if (unlikely(nilfs_checkpoint_invalid(cp))) { kunmap_local(cp); brelse(cp_bh); goto error; } cp->cp_snapshot_list.ssl_next = 0; cp->cp_snapshot_list.ssl_prev = 0; cp->cp_inodes_count = cpu_to_le64(atomic64_read(&root->inodes_count)); cp->cp_blocks_count = cpu_to_le64(atomic64_read(&root->blocks_count)); cp->cp_nblk_inc = cpu_to_le64(blkinc); cp->cp_create = cpu_to_le64(ctime); cp->cp_cno = cpu_to_le64(cno); if (minor) nilfs_checkpoint_set_minor(cp); else nilfs_checkpoint_clear_minor(cp); nilfs_write_inode_common(root->ifile, &cp->cp_ifile_inode); nilfs_bmap_write(NILFS_I(root->ifile)->i_bmap, &cp->cp_ifile_inode); kunmap_local(cp); brelse(cp_bh); out_sem: up_write(&NILFS_MDT(cpfile)->mi_sem); return ret; error: nilfs_error(cpfile->i_sb, "checkpoint finalization failed due to metadata corruption."); ret = -EIO; goto out_sem; } /** * nilfs_cpfile_delete_checkpoints - delete checkpoints * @cpfile: inode of checkpoint file * @start: start checkpoint number * @end: end checkpoint number * * Description: nilfs_cpfile_delete_checkpoints() deletes the checkpoints in * the period from @start to @end, excluding @end itself. 
The checkpoints * which have been already deleted are ignored. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - Invalid checkpoints. * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. */ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, __u64 start, __u64 end) { struct buffer_head *header_bh, *cp_bh; struct nilfs_cpfile_header *header; struct nilfs_checkpoint *cp; size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; __u64 cno; size_t offset; void *kaddr; unsigned long tnicps; int ret, ncps, nicps, nss, count, i; if (unlikely(start == 0 || start > end)) { nilfs_err(cpfile->i_sb, "cannot delete checkpoints: invalid range [%llu, %llu)", (unsigned long long)start, (unsigned long long)end); return -EINVAL; } down_write(&NILFS_MDT(cpfile)->mi_sem); ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); if (ret < 0) goto out_sem; tnicps = 0; nss = 0; for (cno = start; cno < end; cno += ncps) { ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, end); ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); if (ret < 0) { if (ret != -ENOENT) break; /* skip hole */ ret = 0; continue; } offset = nilfs_cpfile_checkpoint_offset(cpfile, cno, cp_bh); cp = kaddr = kmap_local_folio(cp_bh->b_folio, offset); nicps = 0; for (i = 0; i < ncps; i++, cp = (void *)cp + cpsz) { if (nilfs_checkpoint_snapshot(cp)) { nss++; } else if (!nilfs_checkpoint_invalid(cp)) { nilfs_checkpoint_set_invalid(cp); nicps++; } } kunmap_local(kaddr); if (nicps <= 0) { brelse(cp_bh); continue; } tnicps += nicps; mark_buffer_dirty(cp_bh); nilfs_mdt_mark_dirty(cpfile); if (nilfs_cpfile_is_in_first(cpfile, cno)) { brelse(cp_bh); continue; } count = nilfs_cpfile_block_sub_valid_checkpoints(cpfile, cp_bh, nicps); brelse(cp_bh); if (count) continue; /* Delete the block if there are no more valid checkpoints */ ret = nilfs_cpfile_delete_checkpoint_block(cpfile, cno); if (unlikely(ret)) { nilfs_err(cpfile->i_sb, "error %d deleting checkpoint block", ret); break; } } if (tnicps > 0) { header = kmap_local_folio(header_bh->b_folio, 0); le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps); mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(cpfile); kunmap_local(header); } brelse(header_bh); if (nss > 0) ret = -EBUSY; out_sem: up_write(&NILFS_MDT(cpfile)->mi_sem); return ret; } static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile, struct nilfs_checkpoint *cp, struct nilfs_cpinfo *ci) { ci->ci_flags = le32_to_cpu(cp->cp_flags); ci->ci_cno = le64_to_cpu(cp->cp_cno); ci->ci_create = le64_to_cpu(cp->cp_create); ci->ci_nblk_inc = le64_to_cpu(cp->cp_nblk_inc); ci->ci_inodes_count = le64_to_cpu(cp->cp_inodes_count); ci->ci_blocks_count = le64_to_cpu(cp->cp_blocks_count); ci->ci_next = le64_to_cpu(cp->cp_snapshot_list.ssl_next); } static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, void *buf, unsigned int cisz, size_t nci) { struct nilfs_checkpoint *cp; struct nilfs_cpinfo *ci = buf; struct buffer_head *bh; size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; __u64 cur_cno = nilfs_mdt_cno(cpfile), cno = *cnop; size_t offset; void *kaddr; int n, ret; int ncps, i; if (cno == 0) return -ENOENT; /* checkpoint number 0 is invalid */ down_read(&NILFS_MDT(cpfile)->mi_sem); for (n = 0; n < nci; cno += ncps) { ret = nilfs_cpfile_find_checkpoint_block( cpfile, cno, cur_cno - 1, &cno, &bh); if (ret < 0) { if (likely(ret == -ENOENT)) break; goto out; } ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno); 
offset = nilfs_cpfile_checkpoint_offset(cpfile, cno, bh); cp = kaddr = kmap_local_folio(bh->b_folio, offset); for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) { if (!nilfs_checkpoint_invalid(cp)) { nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, ci); ci = (void *)ci + cisz; n++; } } kunmap_local(kaddr); brelse(bh); } ret = n; if (n > 0) { ci = (void *)ci - cisz; *cnop = ci->ci_cno + 1; } out: up_read(&NILFS_MDT(cpfile)->mi_sem); return ret; } static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, void *buf, unsigned int cisz, size_t nci) { struct buffer_head *bh; struct nilfs_cpfile_header *header; struct nilfs_checkpoint *cp; struct nilfs_cpinfo *ci = buf; __u64 curr = *cnop, next; unsigned long curr_blkoff, next_blkoff; size_t offset; int n = 0, ret; down_read(&NILFS_MDT(cpfile)->mi_sem); if (curr == 0) { ret = nilfs_cpfile_get_header_block(cpfile, &bh); if (ret < 0) goto out; header = kmap_local_folio(bh->b_folio, 0); curr = le64_to_cpu(header->ch_snapshot_list.ssl_next); kunmap_local(header); brelse(bh); if (curr == 0) { ret = 0; goto out; } } else if (unlikely(curr == ~(__u64)0)) { ret = 0; goto out; } curr_blkoff = nilfs_cpfile_get_blkoff(cpfile, curr); ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, 0, &bh); if (unlikely(ret < 0)) { if (ret == -ENOENT) ret = 0; /* No snapshots (started from a hole block) */ goto out; } offset = nilfs_cpfile_checkpoint_offset(cpfile, curr, bh); cp = kmap_local_folio(bh->b_folio, offset); while (n < nci) { curr = ~(__u64)0; /* Terminator */ if (unlikely(nilfs_checkpoint_invalid(cp) || !nilfs_checkpoint_snapshot(cp))) break; nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, ci); ci = (void *)ci + cisz; n++; next = le64_to_cpu(cp->cp_snapshot_list.ssl_next); if (next == 0) break; /* reach end of the snapshot list */ kunmap_local(cp); next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next); if (curr_blkoff != next_blkoff) { brelse(bh); ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, 0, &bh); if (unlikely(ret < 0)) { WARN_ON(ret == -ENOENT); goto out; } } offset = nilfs_cpfile_checkpoint_offset(cpfile, next, bh); cp = kmap_local_folio(bh->b_folio, offset); curr = next; curr_blkoff = next_blkoff; } kunmap_local(cp); brelse(bh); *cnop = curr; ret = n; out: up_read(&NILFS_MDT(cpfile)->mi_sem); return ret; } /** * nilfs_cpfile_get_cpinfo - get information on checkpoints * @cpfile: checkpoint file inode * @cnop: place to pass a starting checkpoint number and receive a * checkpoint number to continue the search * @mode: mode of checkpoints that the caller wants to retrieve * @buf: buffer for storing checkpoints' information * @cisz: byte size of one checkpoint info item in array * @nci: number of checkpoint info items to retrieve * * nilfs_cpfile_get_cpinfo() searches for checkpoints in @mode state * starting from the checkpoint number stored in @cnop, and stores * information about found checkpoints in @buf. * The buffer pointed to by @buf must be large enough to store information * for @nci checkpoints. If at least one checkpoint information is * successfully retrieved, @cnop is updated to point to the checkpoint * number to continue searching. * * Return: Count of checkpoint info items stored in the output buffer on * success, or one of the following negative error codes on failure: * * %-EINVAL - Invalid checkpoint mode. * * %-ENOMEM - Insufficient memory available. * * %-EIO - I/O error (including metadata corruption). * * %-ENOENT - Invalid checkpoint number specified. 
*/ ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode, void *buf, unsigned int cisz, size_t nci) { switch (mode) { case NILFS_CHECKPOINT: return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, buf, cisz, nci); case NILFS_SNAPSHOT: return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, buf, cisz, nci); default: return -EINVAL; } } /** * nilfs_cpfile_delete_checkpoint - delete a checkpoint * @cpfile: checkpoint file inode * @cno: checkpoint number to delete * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EBUSY - Checkpoint in use (snapshot specified). * * %-EIO - I/O error (including metadata corruption). * * %-ENOENT - No valid checkpoint found. * * %-ENOMEM - Insufficient memory available. */ int nilfs_cpfile_delete_checkpoint(struct inode *cpfile, __u64 cno) { struct nilfs_cpinfo ci; __u64 tcno = cno; ssize_t nci; nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, sizeof(ci), 1); if (nci < 0) return nci; else if (nci == 0 || ci.ci_cno != cno) return -ENOENT; else if (nilfs_cpinfo_snapshot(&ci)) return -EBUSY; return nilfs_cpfile_delete_checkpoints(cpfile, cno, cno + 1); } static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) { struct buffer_head *header_bh, *curr_bh, *prev_bh, *cp_bh; struct nilfs_cpfile_header *header; struct nilfs_checkpoint *cp; struct nilfs_snapshot_list *list; __u64 curr, prev; unsigned long curr_blkoff, prev_blkoff; size_t offset, curr_list_offset, prev_list_offset; int ret; if (cno == 0) return -ENOENT; /* checkpoint number 0 is invalid */ down_write(&NILFS_MDT(cpfile)->mi_sem); ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); if (unlikely(ret < 0)) goto out_sem; ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); if (ret < 0) goto out_header; offset = nilfs_cpfile_checkpoint_offset(cpfile, cno, cp_bh); cp = kmap_local_folio(cp_bh->b_folio, offset); if (nilfs_checkpoint_invalid(cp)) { ret = -ENOENT; kunmap_local(cp); goto out_cp; } if (nilfs_checkpoint_snapshot(cp)) { ret = 0; kunmap_local(cp); goto out_cp; } kunmap_local(cp); /* * Find the last snapshot before the checkpoint being changed to * snapshot mode by going backwards through the snapshot list. * Set "prev" to its checkpoint number, or 0 if not found. 
*/ header = kmap_local_folio(header_bh->b_folio, 0); list = &header->ch_snapshot_list; curr_bh = header_bh; get_bh(curr_bh); curr = 0; curr_blkoff = 0; curr_list_offset = nilfs_cpfile_ch_snapshot_list_offset(); prev = le64_to_cpu(list->ssl_prev); while (prev > cno) { prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev); curr = prev; kunmap_local(list); if (curr_blkoff != prev_blkoff) { brelse(curr_bh); ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, 0, &curr_bh); if (unlikely(ret < 0)) goto out_cp; } curr_list_offset = nilfs_cpfile_cp_snapshot_list_offset( cpfile, curr, curr_bh); list = kmap_local_folio(curr_bh->b_folio, curr_list_offset); curr_blkoff = prev_blkoff; prev = le64_to_cpu(list->ssl_prev); } kunmap_local(list); if (prev != 0) { ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0, &prev_bh); if (ret < 0) goto out_curr; prev_list_offset = nilfs_cpfile_cp_snapshot_list_offset( cpfile, prev, prev_bh); } else { prev_bh = header_bh; get_bh(prev_bh); prev_list_offset = nilfs_cpfile_ch_snapshot_list_offset(); } /* Update the list entry for the next snapshot */ list = kmap_local_folio(curr_bh->b_folio, curr_list_offset); list->ssl_prev = cpu_to_le64(cno); kunmap_local(list); /* Update the checkpoint being changed to a snapshot */ offset = nilfs_cpfile_checkpoint_offset(cpfile, cno, cp_bh); cp = kmap_local_folio(cp_bh->b_folio, offset); cp->cp_snapshot_list.ssl_next = cpu_to_le64(curr); cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev); nilfs_checkpoint_set_snapshot(cp); kunmap_local(cp); /* Update the list entry for the previous snapshot */ list = kmap_local_folio(prev_bh->b_folio, prev_list_offset); list->ssl_next = cpu_to_le64(cno); kunmap_local(list); /* Update the statistics in the header */ header = kmap_local_folio(header_bh->b_folio, 0); le64_add_cpu(&header->ch_nsnapshots, 1); kunmap_local(header); mark_buffer_dirty(prev_bh); mark_buffer_dirty(curr_bh); mark_buffer_dirty(cp_bh); mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(cpfile); brelse(prev_bh); out_curr: brelse(curr_bh); out_cp: brelse(cp_bh); out_header: brelse(header_bh); out_sem: up_write(&NILFS_MDT(cpfile)->mi_sem); return ret; } static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno) { struct buffer_head *header_bh, *next_bh, *prev_bh, *cp_bh; struct nilfs_cpfile_header *header; struct nilfs_checkpoint *cp; struct nilfs_snapshot_list *list; __u64 next, prev; size_t offset, next_list_offset, prev_list_offset; int ret; if (cno == 0) return -ENOENT; /* checkpoint number 0 is invalid */ down_write(&NILFS_MDT(cpfile)->mi_sem); ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); if (unlikely(ret < 0)) goto out_sem; ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); if (ret < 0) goto out_header; offset = nilfs_cpfile_checkpoint_offset(cpfile, cno, cp_bh); cp = kmap_local_folio(cp_bh->b_folio, offset); if (nilfs_checkpoint_invalid(cp)) { ret = -ENOENT; kunmap_local(cp); goto out_cp; } if (!nilfs_checkpoint_snapshot(cp)) { ret = 0; kunmap_local(cp); goto out_cp; } list = &cp->cp_snapshot_list; next = le64_to_cpu(list->ssl_next); prev = le64_to_cpu(list->ssl_prev); kunmap_local(cp); if (next != 0) { ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, 0, &next_bh); if (ret < 0) goto out_cp; next_list_offset = nilfs_cpfile_cp_snapshot_list_offset( cpfile, next, next_bh); } else { next_bh = header_bh; get_bh(next_bh); next_list_offset = nilfs_cpfile_ch_snapshot_list_offset(); } if (prev != 0) { ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0, &prev_bh); if (ret < 0) goto 
out_next; prev_list_offset = nilfs_cpfile_cp_snapshot_list_offset( cpfile, prev, prev_bh); } else { prev_bh = header_bh; get_bh(prev_bh); prev_list_offset = nilfs_cpfile_ch_snapshot_list_offset(); } /* Update the list entry for the next snapshot */ list = kmap_local_folio(next_bh->b_folio, next_list_offset); list->ssl_prev = cpu_to_le64(prev); kunmap_local(list); /* Update the list entry for the previous snapshot */ list = kmap_local_folio(prev_bh->b_folio, prev_list_offset); list->ssl_next = cpu_to_le64(next); kunmap_local(list); /* Update the snapshot being changed back to a plain checkpoint */ cp = kmap_local_folio(cp_bh->b_folio, offset); cp->cp_snapshot_list.ssl_next = cpu_to_le64(0); cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0); nilfs_checkpoint_clear_snapshot(cp); kunmap_local(cp); /* Update the statistics in the header */ header = kmap_local_folio(header_bh->b_folio, 0); le64_add_cpu(&header->ch_nsnapshots, -1); kunmap_local(header); mark_buffer_dirty(next_bh); mark_buffer_dirty(prev_bh); mark_buffer_dirty(cp_bh); mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(cpfile); brelse(prev_bh); out_next: brelse(next_bh); out_cp: brelse(cp_bh); out_header: brelse(header_bh); out_sem: up_write(&NILFS_MDT(cpfile)->mi_sem); return ret; } /** * nilfs_cpfile_is_snapshot - determine if checkpoint is a snapshot * @cpfile: inode of checkpoint file * @cno: checkpoint number * * Return: 1 if the checkpoint specified by @cno is a snapshot, 0 if not, or * one of the following negative error codes on failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOENT - No such checkpoint. * * %-ENOMEM - Insufficient memory available. */ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) { struct buffer_head *bh; struct nilfs_checkpoint *cp; size_t offset; int ret; /* * CP number is invalid if it's zero or larger than the * largest existing one. */ if (cno == 0 || cno >= nilfs_mdt_cno(cpfile)) return -ENOENT; down_read(&NILFS_MDT(cpfile)->mi_sem); ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); if (ret < 0) goto out; offset = nilfs_cpfile_checkpoint_offset(cpfile, cno, bh); cp = kmap_local_folio(bh->b_folio, offset); if (nilfs_checkpoint_invalid(cp)) ret = -ENOENT; else ret = nilfs_checkpoint_snapshot(cp); kunmap_local(cp); brelse(bh); out: up_read(&NILFS_MDT(cpfile)->mi_sem); return ret; } /** * nilfs_cpfile_change_cpmode - change checkpoint mode * @cpfile: inode of checkpoint file * @cno: checkpoint number * @mode: mode of checkpoint * * Description: nilfs_change_cpmode() changes the mode of the checkpoint * specified by @cno. The mode @mode is NILFS_CHECKPOINT or NILFS_SNAPSHOT. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOENT - No such checkpoint. * * %-ENOMEM - Insufficient memory available. */ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) { int ret; switch (mode) { case NILFS_CHECKPOINT: if (nilfs_checkpoint_is_mounted(cpfile->i_sb, cno)) /* * Current implementation does not have to protect * plain read-only mounts since they are exclusive * with a read/write mount and are protected from the * cleaner. 
*/ ret = -EBUSY; else ret = nilfs_cpfile_clear_snapshot(cpfile, cno); return ret; case NILFS_SNAPSHOT: return nilfs_cpfile_set_snapshot(cpfile, cno); default: return -EINVAL; } } /** * nilfs_cpfile_get_stat - get checkpoint statistics * @cpfile: inode of checkpoint file * @cpstat: pointer to a structure of checkpoint statistics * * Description: nilfs_cpfile_get_stat() returns information about checkpoints. * The checkpoint statistics are stored in the location pointed to by @cpstat. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. */ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat) { struct buffer_head *bh; struct nilfs_cpfile_header *header; int ret; down_read(&NILFS_MDT(cpfile)->mi_sem); ret = nilfs_cpfile_get_header_block(cpfile, &bh); if (ret < 0) goto out_sem; header = kmap_local_folio(bh->b_folio, 0); cpstat->cs_cno = nilfs_mdt_cno(cpfile); cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints); cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots); kunmap_local(header); brelse(bh); out_sem: up_read(&NILFS_MDT(cpfile)->mi_sem); return ret; } /** * nilfs_cpfile_read - read or get cpfile inode * @sb: super block instance * @cpsize: size of a checkpoint entry * @raw_inode: on-disk cpfile inode * @inodep: buffer to store the inode * * Return: 0 on success, or a negative error code on failure. */ int nilfs_cpfile_read(struct super_block *sb, size_t cpsize, struct nilfs_inode *raw_inode, struct inode **inodep) { struct inode *cpfile; int err; if (cpsize > sb->s_blocksize) { nilfs_err(sb, "too large checkpoint size: %zu bytes", cpsize); return -EINVAL; } else if (cpsize < NILFS_MIN_CHECKPOINT_SIZE) { nilfs_err(sb, "too small checkpoint size: %zu bytes", cpsize); return -EINVAL; } cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO); if (unlikely(!cpfile)) return -ENOMEM; if (!(inode_state_read_once(cpfile) & I_NEW)) goto out; err = nilfs_mdt_init(cpfile, NILFS_MDT_GFP, 0); if (err) goto failed; nilfs_mdt_set_entry_size(cpfile, cpsize, sizeof(struct nilfs_cpfile_header)); err = nilfs_read_inode_common(cpfile, raw_inode); if (err) goto failed; unlock_new_inode(cpfile); out: *inodep = cpfile; return 0; failed: iget_failed(cpfile); return err; } |
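A minimal usage sketch (not part of cpfile.c): the helper below shows how a caller that already holds the cpfile inode might page through valid checkpoints with nilfs_cpfile_get_cpinfo(), resuming each call from the checkpoint number returned through *cnop. The function name, the small on-stack result buffer, and the counting logic are illustrative assumptions; in the kernel this interface is exercised by the NILFS ioctl layer.

static int example_count_checkpoints(struct inode *cpfile, __u64 *ncheckpoints)
{
	struct nilfs_cpinfo ci[8];
	__u64 cno = 1;	/* checkpoint numbers start at 1; 0 is invalid */
	ssize_t n;

	*ncheckpoints = 0;
	do {
		/*
		 * Fills up to ARRAY_SIZE(ci) entries and advances cno past the
		 * last checkpoint reported, so the next call resumes there.
		 */
		n = nilfs_cpfile_get_cpinfo(cpfile, &cno, NILFS_CHECKPOINT,
					    ci, sizeof(ci[0]), ARRAY_SIZE(ci));
		if (n < 0)
			return n;
		*ncheckpoints += n;
	} while (n == (ssize_t)ARRAY_SIZE(ci));	/* a short read means no more */

	return 0;
}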
/* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update mechanism for mutual exclusion * * Copyright IBM Corporation, 2001 * * Author: Dipankar Sarma <dipankar@in.ibm.com> * * Based on the original work by Paul McKenney <paulmck@vnet.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * Papers: * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) * * For detailed explanation of Read-Copy Update mechanism see - * http://lse.sourceforge.net/locking/rcupdate.html * */ #ifndef __LINUX_RCUPDATE_H #define __LINUX_RCUPDATE_H #include <linux/types.h> #include <linux/compiler.h> #include <linux/atomic.h> #include <linux/irqflags.h> #include <linux/sched.h> #include <linux/bottom_half.h> #include <linux/lockdep.h> #include <linux/cleanup.h> #include <asm/processor.h> #include <linux/context_tracking_irq.h> token_context_lock(RCU, __reentrant_ctx_lock); token_context_lock_instance(RCU, RCU_SCHED); token_context_lock_instance(RCU, RCU_BH); /* * A convenience macro that can be used for RCU-protected globals or struct * members; adds type qualifier __rcu, and also enforces __guarded_by(RCU). */ #define __rcu_guarded __rcu __guarded_by(RCU) #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) #define RCU_SEQ_CTR_SHIFT 2 #define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1) /* Exported common interfaces */ void call_rcu(struct rcu_head *head, rcu_callback_t func); void rcu_barrier_tasks(void); void synchronize_rcu(void); struct rcu_gp_oldstate; unsigned long get_completed_synchronize_rcu(void); void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); // Maximum number of unsigned long values corresponding to // not-yet-completed RCU grace periods. #define NUM_ACTIVE_RCU_POLL_OLDSTATE 2 /** * same_state_synchronize_rcu - Are two old-state values identical? * @oldstate1: First old-state value. * @oldstate2: Second old-state value. * * The two old-state values must have been obtained from either * get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or * get_completed_synchronize_rcu(). Returns @true if the two values are * identical and @false otherwise.
This allows structures whose lifetimes * are tracked by old-state values to push these values to a list header, * allowing those structures to be slightly smaller. */ static inline bool same_state_synchronize_rcu(unsigned long oldstate1, unsigned long oldstate2) { return oldstate1 == oldstate2; } #ifdef CONFIG_PREEMPT_RCU void __rcu_read_lock(void); void __rcu_read_unlock(void); /* * Defined as a macro as it is a very low level header included from * areas that don't even know about current. This gives the rcu_read_lock() * nesting depth, but makes sense only if CONFIG_PREEMPT_RCU -- in other * types of kernel builds, the rcu_read_lock() nesting depth is unknowable. */ #define rcu_preempt_depth() READ_ONCE(current->rcu_read_lock_nesting) #else /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TINY_RCU #define rcu_read_unlock_strict() do { } while (0) #else void rcu_read_unlock_strict(void); #endif static inline void __rcu_read_lock(void) { preempt_disable(); } static inline void __rcu_read_unlock(void) { if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) rcu_read_unlock_strict(); preempt_enable(); } static inline int rcu_preempt_depth(void) { return 0; } #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_RCU_LAZY void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func); #else static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) { call_rcu(head, func); } #endif /* Internal to kernel */ void rcu_init(void); extern int rcu_scheduler_active; void rcu_sched_clock_irq(int user); #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); void rcu_sysrq_end(void); #else /* #ifdef CONFIG_RCU_STALL_COMMON */ static inline void rcu_sysrq_start(void) { } static inline void rcu_sysrq_end(void) { } #endif /* #else #ifdef CONFIG_RCU_STALL_COMMON */ #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_VIRT_XFER_TO_GUEST_WORK)) void rcu_irq_work_resched(void); #else static __always_inline void rcu_irq_work_resched(void) { } #endif #ifdef CONFIG_RCU_NOCB_CPU void rcu_init_nohz(void); int rcu_nocb_cpu_offload(int cpu); int rcu_nocb_cpu_deoffload(int cpu); void rcu_nocb_flush_deferred_wakeup(void); #define RCU_NOCB_LOCKDEP_WARN(c, s) RCU_LOCKDEP_WARN(c, s) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ static inline void rcu_init_nohz(void) { } static inline int rcu_nocb_cpu_offload(int cpu) { return -EINVAL; } static inline int rcu_nocb_cpu_deoffload(int cpu) { return 0; } static inline void rcu_nocb_flush_deferred_wakeup(void) { } #define RCU_NOCB_LOCKDEP_WARN(c, s) #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ /* * Note a quasi-voluntary context switch for RCU-tasks's benefit. * This is a macro rather than an inline function to avoid #include hell. 
*/ #ifdef CONFIG_TASKS_RCU_GENERIC # ifdef CONFIG_TASKS_RCU # define rcu_tasks_classic_qs(t, preempt) \ do { \ if (!(preempt) && READ_ONCE((t)->rcu_tasks_holdout)) \ WRITE_ONCE((t)->rcu_tasks_holdout, false); \ } while (0) void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks(void); void rcu_tasks_torture_stats_print(char *tt, char *tf); # else # define rcu_tasks_classic_qs(t, preempt) do { } while (0) # define call_rcu_tasks call_rcu # define synchronize_rcu_tasks synchronize_rcu # endif #define rcu_tasks_qs(t, preempt) rcu_tasks_classic_qs((t), (preempt)) # ifdef CONFIG_TASKS_RUDE_RCU void synchronize_rcu_tasks_rude(void); void rcu_tasks_rude_torture_stats_print(char *tt, char *tf); # endif #define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false) void exit_tasks_rcu_start(void); void exit_tasks_rcu_finish(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ #define rcu_tasks_classic_qs(t, preempt) do { } while (0) #define rcu_tasks_qs(t, preempt) do { } while (0) #define rcu_note_voluntary_context_switch(t) do { } while (0) #define call_rcu_tasks call_rcu #define synchronize_rcu_tasks synchronize_rcu static inline void exit_tasks_rcu_start(void) { } static inline void exit_tasks_rcu_finish(void) { } #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ /** * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU * * This macro resembles cond_resched(), except that it is defined to * report potential quiescent states to RCU-tasks even if the cond_resched() * machinery were to be shut off, as some advocate for PREEMPTION kernels. */ #define cond_resched_tasks_rcu_qs() \ do { \ rcu_tasks_qs(current, false); \ cond_resched(); \ } while (0) /** * rcu_softirq_qs_periodic - Report RCU and RCU-Tasks quiescent states * @old_ts: jiffies at start of processing. * * This helper is for long-running softirq handlers, such as NAPI threads in * networking. The caller should initialize the variable passed in as @old_ts * at the beginning of the softirq handler. When invoked frequently, this macro * will invoke rcu_softirq_qs() every 100 milliseconds thereafter, which will * provide both RCU and RCU-Tasks quiescent states. Note that this macro * modifies its old_ts argument. * * Because regions of code that have disabled softirq act as RCU read-side * critical sections, this macro should be invoked with softirq (and * preemption) enabled. * * The macro is not needed when CONFIG_PREEMPT_RT is defined. RT kernels would * have more chance to invoke schedule() calls and provide necessary quiescent * states. As a contrast, calling cond_resched() only won't achieve the same * effect because cond_resched() does not provide RCU-Tasks quiescent states. */ #define rcu_softirq_qs_periodic(old_ts) \ do { \ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && \ time_after(jiffies, (old_ts) + HZ / 10)) { \ preempt_disable(); \ rcu_softirq_qs(); \ preempt_enable(); \ (old_ts) = jiffies; \ } \ } while (0) /* * Infrastructure to implement the synchronize_() primitives in * TREE_RCU and rcu_barrier_() primitives in TINY_RCU. 
*/ #if defined(CONFIG_TREE_RCU) #include <linux/rcutree.h> #elif defined(CONFIG_TINY_RCU) #include <linux/rcutiny.h> #else #error "Unknown RCU implementation specified to kernel configuration" #endif /* * The init_rcu_head_on_stack() and destroy_rcu_head_on_stack() calls * are needed for dynamic initialization and destruction of rcu_head * on the stack, and init_rcu_head()/destroy_rcu_head() are needed for * dynamic initialization and destruction of statically allocated rcu_head * structures. However, rcu_head structures allocated dynamically in the * heap don't need any initialization. */ #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD void init_rcu_head(struct rcu_head *head); void destroy_rcu_head(struct rcu_head *head); void init_rcu_head_on_stack(struct rcu_head *head); void destroy_rcu_head_on_stack(struct rcu_head *head); #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ static inline void init_rcu_head(struct rcu_head *head) { } static inline void destroy_rcu_head(struct rcu_head *head) { } static inline void init_rcu_head_on_stack(struct rcu_head *head) { } static inline void destroy_rcu_head_on_stack(struct rcu_head *head) { } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) bool rcu_lockdep_current_cpu_online(void); #else /* #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ static inline bool rcu_lockdep_current_cpu_online(void) { return true; } #endif /* #else #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ extern struct lockdep_map rcu_lock_map; extern struct lockdep_map rcu_bh_lock_map; extern struct lockdep_map rcu_sched_lock_map; extern struct lockdep_map rcu_callback_map; #ifdef CONFIG_DEBUG_LOCK_ALLOC static inline void rcu_lock_acquire(struct lockdep_map *map) { lock_acquire(map, 0, 0, 2, 0, NULL, _THIS_IP_); } static inline void rcu_try_lock_acquire(struct lockdep_map *map) { lock_acquire(map, 0, 1, 2, 0, NULL, _THIS_IP_); } static inline void rcu_lock_release(struct lockdep_map *map) { lock_release(map, _THIS_IP_); } int debug_lockdep_rcu_enabled(void); int rcu_read_lock_held(void); int rcu_read_lock_bh_held(void); int rcu_read_lock_sched_held(void); int rcu_read_lock_any_held(void); #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ # define rcu_lock_acquire(a) do { } while (0) # define rcu_try_lock_acquire(a) do { } while (0) # define rcu_lock_release(a) do { } while (0) static inline int rcu_read_lock_held(void) { return 1; } static inline int rcu_read_lock_bh_held(void) { return 1; } static inline int rcu_read_lock_sched_held(void) { return !preemptible(); } static inline int rcu_read_lock_any_held(void) { return !preemptible(); } static inline int debug_lockdep_rcu_enabled(void) { return 0; } #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #ifdef CONFIG_PROVE_RCU /** * RCU_LOCKDEP_WARN - emit lockdep splat if specified condition is met * @c: condition to check * @s: informative message * * This checks debug_lockdep_rcu_enabled() before checking (c) to * prevent early boot splats due to lockdep not yet being initialized, * and rechecks it after checking (c) to prevent false-positive splats * due to races with lockdep being disabled. See commit 3066820034b5dd * ("rcu: Reject RCU_LOCKDEP_WARN() false positives") for more detail. 
*/ #define RCU_LOCKDEP_WARN(c, s) \ do { \ static bool __section(".data..unlikely") __warned; \ if (debug_lockdep_rcu_enabled() && (c) && \ debug_lockdep_rcu_enabled() && !__warned) { \ __warned = true; \ lockdep_rcu_suspicious(__FILE__, __LINE__, s); \ } \ } while (0) #ifndef CONFIG_PREEMPT_RCU static inline void rcu_preempt_sleep_check(void) { RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map), "Illegal context switch in RCU read-side critical section"); } #else // #ifndef CONFIG_PREEMPT_RCU static inline void rcu_preempt_sleep_check(void) { } #endif // #else // #ifndef CONFIG_PREEMPT_RCU #define rcu_sleep_check() \ do { \ rcu_preempt_sleep_check(); \ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), \ "Illegal context switch in RCU-bh read-side critical section"); \ RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), \ "Illegal context switch in RCU-sched read-side critical section"); \ } while (0) // See RCU_LOCKDEP_WARN() for an explanation of the double call to // debug_lockdep_rcu_enabled(). static __always_inline bool lockdep_assert_rcu_helper(bool c, const struct __ctx_lock_RCU *ctx) __assumes_shared_ctx_lock(RCU) __assumes_shared_ctx_lock(ctx) { return debug_lockdep_rcu_enabled() && (c || !rcu_is_watching() || !rcu_lockdep_current_cpu_online()) && debug_lockdep_rcu_enabled(); } /** * lockdep_assert_in_rcu_read_lock - WARN if not protected by rcu_read_lock() * * Splats if lockdep is enabled and there is no rcu_read_lock() in effect. */ #define lockdep_assert_in_rcu_read_lock() \ WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_lock_map), RCU)) /** * lockdep_assert_in_rcu_read_lock_bh - WARN if not protected by rcu_read_lock_bh() * * Splats if lockdep is enabled and there is no rcu_read_lock_bh() in effect. * Note that local_bh_disable() and friends do not suffice here, instead an * actual rcu_read_lock_bh() is required. */ #define lockdep_assert_in_rcu_read_lock_bh() \ WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_bh_lock_map), RCU_BH)) /** * lockdep_assert_in_rcu_read_lock_sched - WARN if not protected by rcu_read_lock_sched() * * Splats if lockdep is enabled and there is no rcu_read_lock_sched() * in effect. Note that preempt_disable() and friends do not suffice here, * instead an actual rcu_read_lock_sched() is required. */ #define lockdep_assert_in_rcu_read_lock_sched() \ WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_sched_lock_map), RCU_SCHED)) /** * lockdep_assert_in_rcu_reader - WARN if not within some type of RCU reader * * Splats if lockdep is enabled and there is no RCU reader of any * type in effect. Note that regions of code protected by things like * preempt_disable, local_bh_disable(), and local_irq_disable() all qualify * as RCU readers. * * Note that this will never trigger in PREEMPT_NONE or PREEMPT_VOLUNTARY * kernels that are not also built with PREEMPT_COUNT. But if you have * lockdep enabled, you might as well also enable PREEMPT_COUNT. 
*/ #define lockdep_assert_in_rcu_reader() \ WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_lock_map) && \ !lock_is_held(&rcu_bh_lock_map) && \ !lock_is_held(&rcu_sched_lock_map) && \ preemptible(), RCU)) #else /* #ifdef CONFIG_PROVE_RCU */ #define RCU_LOCKDEP_WARN(c, s) do { } while (0 && (c)) #define rcu_sleep_check() do { } while (0) #define lockdep_assert_in_rcu_read_lock() __assume_shared_ctx_lock(RCU) #define lockdep_assert_in_rcu_read_lock_bh() __assume_shared_ctx_lock(RCU_BH) #define lockdep_assert_in_rcu_read_lock_sched() __assume_shared_ctx_lock(RCU_SCHED) #define lockdep_assert_in_rcu_reader() __assume_shared_ctx_lock(RCU) #endif /* #else #ifdef CONFIG_PROVE_RCU */ /* * Helper functions for rcu_dereference_check(), rcu_dereference_protected() * and rcu_assign_pointer(). Some of these could be folded into their * callers, but they are left separate in order to ease introduction of * multiple pointers markings to match different RCU implementations * (e.g., __srcu), should this make sense in the future. */ #ifdef __CHECKER__ #define rcu_check_sparse(p, space) \ ((void)(((typeof(*p) space *)p) == p)) #else /* #ifdef __CHECKER__ */ #define rcu_check_sparse(p, space) #endif /* #else #ifdef __CHECKER__ */ #define __unrcu_pointer(p, local) \ context_unsafe( \ typeof(*p) *local = (typeof(*p) *__force)(p); \ rcu_check_sparse(p, __rcu); \ ((typeof(*p) __force __kernel *)(local)) \ ) /** * unrcu_pointer - mark a pointer as not being RCU protected * @p: pointer needing to lose its __rcu property * * Converts @p from an __rcu pointer to a __kernel pointer. * This allows an __rcu pointer to be used with xchg() and friends. */ #define unrcu_pointer(p) __unrcu_pointer(p, __UNIQUE_ID(rcu)) #define __rcu_access_pointer(p, local, space) \ ({ \ typeof(*p) *local = (typeof(*p) *__force)READ_ONCE(p); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(local)); \ }) #define __rcu_dereference_check(p, local, c, space) \ ({ \ /* Dependency order vs. p above. */ \ typeof(*p) *local = (typeof(*p) *__force)READ_ONCE(p); \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(local)); \ }) #define __rcu_dereference_protected(p, local, c, space) \ ({ \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_protected() usage"); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(p)); \ }) #define __rcu_dereference_raw(p, local) \ ({ \ /* Dependency order vs. p above. */ \ typeof(p) local = READ_ONCE(p); \ ((typeof(*p) __force __kernel *)(local)); \ }) #define rcu_dereference_raw(p) __rcu_dereference_raw(p, __UNIQUE_ID(rcu)) /** * RCU_INITIALIZER() - statically initialize an RCU-protected global variable * @v: The value to statically initialize with. */ #define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v) /** * rcu_assign_pointer() - assign to RCU-protected pointer * @p: pointer to assign to * @v: value to assign (publish) * * Assigns the specified value to the specified RCU-protected * pointer, ensuring that any concurrent RCU readers will see * any prior initialization. * * Inserts memory barriers on architectures that require them * (which is most of them), and also prevents the compiler from * reordering the code that initializes the structure after the pointer * assignment. More importantly, this call documents which pointers * will be dereferenced by RCU read-side code. * * In some special cases, you may use RCU_INIT_POINTER() instead * of rcu_assign_pointer(). 
RCU_INIT_POINTER() is a bit faster due * to the fact that it does not constrain either the CPU or the compiler. * That said, using RCU_INIT_POINTER() when you should have used * rcu_assign_pointer() is a very bad thing that results in * impossible-to-diagnose memory corruption. So please be careful. * See the RCU_INIT_POINTER() comment header for details. * * Note that rcu_assign_pointer() evaluates each of its arguments only * once, appearances notwithstanding. One of the "extra" evaluations * is in typeof() and the other visible only to sparse (__CHECKER__), * neither of which actually execute the argument. As with most cpp * macros, this execute-arguments-only-once property is important, so * please be careful when making changes to rcu_assign_pointer() and the * other macros that it invokes. */ #define rcu_assign_pointer(p, v) \ context_unsafe( \ uintptr_t _r_a_p__v = (uintptr_t)(v); \ rcu_check_sparse(p, __rcu); \ \ if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \ WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \ else \ smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \ ) /** * rcu_replace_pointer() - replace an RCU pointer, returning its old value * @rcu_ptr: RCU pointer, whose old value is returned * @ptr: regular pointer * @c: the lockdep conditions under which the dereference will take place * * Perform a replacement, where @rcu_ptr is an RCU-annotated * pointer and @c is the lockdep argument that is passed to the * rcu_dereference_protected() call used to read that pointer. The old * value of @rcu_ptr is returned, and @rcu_ptr is set to @ptr. */ #define rcu_replace_pointer(rcu_ptr, ptr, c) \ ({ \ typeof(ptr) __tmp = rcu_dereference_protected((rcu_ptr), (c)); \ rcu_assign_pointer((rcu_ptr), (ptr)); \ __tmp; \ }) /** * rcu_access_pointer() - fetch RCU pointer with no dereferencing * @p: The pointer to read * * Return the value of the specified RCU-protected pointer, but omit the * lockdep checks for being in an RCU read-side critical section. This is * useful when the value of this pointer is accessed, but the pointer is * not dereferenced, for example, when testing an RCU-protected pointer * against NULL. Although rcu_access_pointer() may also be used in cases * where update-side locks prevent the value of the pointer from changing, * you should instead use rcu_dereference_protected() for this use case. * Within an RCU read-side critical section, there is little reason to * use rcu_access_pointer(). * * It is usually best to test the rcu_access_pointer() return value * directly in order to avoid accidental dereferences being introduced * by later inattentive changes. In other words, assigning the * rcu_access_pointer() return value to a local variable results in an * accident waiting to happen. * * It is also permissible to use rcu_access_pointer() when read-side * access to the pointer was removed at least one grace period ago, as is * the case in the context of the RCU callback that is freeing up the data, * or after a synchronize_rcu() returns. This can be useful when tearing * down multi-linked structures after a grace period has elapsed. However, * rcu_dereference_protected() is normally preferred for this use case. 
*/ #define rcu_access_pointer(p) __rcu_access_pointer((p), __UNIQUE_ID(rcu), __rcu) /** * rcu_dereference_check() - rcu_dereference with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * Do an rcu_dereference(), but check that the conditions under which the * dereference will take place are correct. Typically the conditions * indicate the various locking conditions that should be held at that * point. The check should return true if the conditions are satisfied. * An implicit check for being in an RCU read-side critical section * (rcu_read_lock()) is included. * * For example: * * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock)); * * could be used to indicate to lockdep that foo->bar may only be dereferenced * if either rcu_read_lock() is held, or that the lock required to replace * the bar struct at foo->bar is held. * * Note that the list of conditions may also include indications of when a lock * need not be held, for example during initialisation or destruction of the * target struct: * * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock) || * atomic_read(&foo->usage) == 0); * * Inserts memory barriers on architectures that require them * (currently only the Alpha), prevents the compiler from refetching * (and from merging fetches), and, more importantly, documents exactly * which pointers are protected by RCU and checks that the pointer is * annotated as __rcu. */ #define rcu_dereference_check(p, c) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ (c) || rcu_read_lock_held(), __rcu) /** * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * This is the RCU-bh counterpart to rcu_dereference_check(). However, * please note that starting in v5.0 kernels, vanilla RCU grace periods * wait for local_bh_disable() regions of code in addition to regions of * code demarked by rcu_read_lock() and rcu_read_unlock(). This means * that synchronize_rcu(), call_rcu, and friends all take not only * rcu_read_lock() but also rcu_read_lock_bh() into account. */ #define rcu_dereference_bh_check(p, c) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ (c) || rcu_read_lock_bh_held(), __rcu) /** * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * This is the RCU-sched counterpart to rcu_dereference_check(). * However, please note that starting in v5.0 kernels, vanilla RCU grace * periods wait for preempt_disable() regions of code in addition to * regions of code demarked by rcu_read_lock() and rcu_read_unlock(). * This means that synchronize_rcu(), call_rcu, and friends all take not * only rcu_read_lock() but also rcu_read_lock_sched() into account. */ #define rcu_dereference_sched_check(p, c) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ (c) || rcu_read_lock_sched_held(), \ __rcu) /** * rcu_dereference_all_check() - rcu_dereference_all with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * This is similar to rcu_dereference_check(), but allows protection * by all forms of vanilla RCU readers, including preemption disabled, * bh-disabled, and interrupt-disabled regions of code. 
Note that "vanilla * RCU" excludes SRCU and the various Tasks RCU flavors. Please note * that this macro should not be backported to any Linux-kernel version * preceding v5.0 due to changes in synchronize_rcu() semantics prior * to that version. */ #define rcu_dereference_all_check(p, c) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ (c) || rcu_read_lock_any_held(), \ __rcu) /* * The tracing infrastructure traces RCU (we want that), but unfortunately * some of the RCU checks causes tracing to lock up the system. * * The no-tracing version of rcu_dereference_raw() must not call * rcu_read_lock_held(). */ #define rcu_dereference_raw_check(p) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), 1, __rcu) /** * rcu_dereference_protected() - fetch RCU pointer when updates prevented * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * Return the value of the specified RCU-protected pointer, but omit * the READ_ONCE(). This is useful in cases where update-side locks * prevent the value of the pointer from changing. Please note that this * primitive does *not* prevent the compiler from repeating this reference * or combining it with other references, so it should not be used without * protection of appropriate locks. * * This function is only for update-side use. Using this function * when protected only by rcu_read_lock() will result in infrequent * but very ugly failures. */ #define rcu_dereference_protected(p, c) \ __rcu_dereference_protected((p), __UNIQUE_ID(rcu), (c), __rcu) /** * rcu_dereference() - fetch RCU-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * This is a simple wrapper around rcu_dereference_check(). */ #define rcu_dereference(p) rcu_dereference_check(p, 0) /** * rcu_dereference_bh() - fetch an RCU-bh-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * Makes rcu_dereference_check() do the dirty work. */ #define rcu_dereference_bh(p) rcu_dereference_bh_check(p, 0) /** * rcu_dereference_sched() - fetch RCU-sched-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * Makes rcu_dereference_check() do the dirty work. */ #define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0) /** * rcu_dereference_all() - fetch RCU-all-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * Makes rcu_dereference_check() do the dirty work. */ #define rcu_dereference_all(p) rcu_dereference_all_check(p, 0) /** * rcu_pointer_handoff() - Hand off a pointer from RCU to other mechanism * @p: The pointer to hand off * * This is simply an identity function, but it documents where a pointer * is handed off from RCU to some other synchronization mechanism, for * example, reference counting or locking. In C11, it would map to * kill_dependency(). It could be used as follows:: * * rcu_read_lock(); * p = rcu_dereference(gp); * long_lived = is_long_lived(p); * if (long_lived) { * if (!atomic_inc_not_zero(p->refcnt)) * long_lived = false; * else * p = rcu_pointer_handoff(p); * } * rcu_read_unlock(); */ #define rcu_pointer_handoff(p) (p) /** * rcu_read_lock() - mark the beginning of an RCU read-side critical section * * When synchronize_rcu() is invoked on one CPU while other CPUs * are within RCU read-side critical sections, then the * synchronize_rcu() is guaranteed to block until after all the other * CPUs exit their critical sections. 
Similarly, if call_rcu() is invoked * on one CPU while other CPUs are within RCU read-side critical * sections, invocation of the corresponding RCU callback is deferred * until after the all the other CPUs exit their critical sections. * * Both synchronize_rcu() and call_rcu() also wait for regions of code * with preemption disabled, including regions of code with interrupts or * softirqs disabled. * * Note, however, that RCU callbacks are permitted to run concurrently * with new RCU read-side critical sections. One way that this can happen * is via the following sequence of events: (1) CPU 0 enters an RCU * read-side critical section, (2) CPU 1 invokes call_rcu() to register * an RCU callback, (3) CPU 0 exits the RCU read-side critical section, * (4) CPU 2 enters a RCU read-side critical section, (5) the RCU * callback is invoked. This is legal, because the RCU read-side critical * section that was running concurrently with the call_rcu() (and which * therefore might be referencing something that the corresponding RCU * callback would free up) has completed before the corresponding * RCU callback is invoked. * * RCU read-side critical sections may be nested. Any deferred actions * will be deferred until the outermost RCU read-side critical section * completes. * * You can avoid reading and understanding the next paragraph by * following this rule: don't put anything in an rcu_read_lock() RCU * read-side critical section that would block in a !PREEMPTION kernel. * But if you want the full story, read on! * * In non-preemptible RCU implementations (pure TREE_RCU and TINY_RCU), * it is illegal to block while in an RCU read-side critical section. * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPTION * kernel builds, RCU read-side critical sections may be preempted, * but explicit blocking is illegal. Finally, in preemptible RCU * implementations in real-time (with -rt patchset) kernel builds, RCU * read-side critical sections may be preempted and they may also block, but * only when acquiring spinlocks that are subject to priority inheritance. */ static __always_inline void rcu_read_lock(void) __acquires_shared(RCU) { __rcu_read_lock(); __acquire_shared(RCU); rcu_lock_acquire(&rcu_lock_map); RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_lock() used illegally while idle"); } /* * So where is rcu_write_lock()? It does not exist, as there is no * way for writers to lock out RCU readers. This is a feature, not * a bug -- this property is what provides RCU's performance benefits. * Of course, writers must coordinate with each other. The normal * spinlock primitives work well for this, but any other technique may be * used as well. RCU does not care how the writers keep out of each * others' way, as long as they do so. */ /** * rcu_read_unlock() - marks the end of an RCU read-side critical section. * * In almost all situations, rcu_read_unlock() is immune from deadlock. * This deadlock immunity also extends to the scheduler's runqueue * and priority-inheritance spinlocks, courtesy of the quiescent-state * deferral that is carried out when rcu_read_unlock() is invoked with * interrupts disabled. * * See rcu_read_lock() for more information. */ static inline void rcu_read_unlock(void) __releases_shared(RCU) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_unlock() used illegally while idle"); rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. 
*/ __release_shared(RCU); __rcu_read_unlock(); } /** * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section * * This is equivalent to rcu_read_lock(), but also disables softirqs. * Note that anything else that disables softirqs can also serve as an RCU * read-side critical section. However, please note that this equivalence * applies only to v5.0 and later. Before v5.0, rcu_read_lock() and * rcu_read_lock_bh() were unrelated. * * Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh() * must occur in the same context, for example, it is illegal to invoke * rcu_read_unlock_bh() from one task if the matching rcu_read_lock_bh() * was invoked from some other task. */ static inline void rcu_read_lock_bh(void) __acquires_shared(RCU) __acquires_shared(RCU_BH) { local_bh_disable(); __acquire_shared(RCU); __acquire_shared(RCU_BH); rcu_lock_acquire(&rcu_bh_lock_map); RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_lock_bh() used illegally while idle"); } /** * rcu_read_unlock_bh() - marks the end of a softirq-only RCU critical section * * See rcu_read_lock_bh() for more information. */ static inline void rcu_read_unlock_bh(void) __releases_shared(RCU) __releases_shared(RCU_BH) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_unlock_bh() used illegally while idle"); rcu_lock_release(&rcu_bh_lock_map); __release_shared(RCU_BH); __release_shared(RCU); local_bh_enable(); } /** * rcu_read_lock_sched() - mark the beginning of a RCU-sched critical section * * This is equivalent to rcu_read_lock(), but also disables preemption. * Read-side critical sections can also be introduced by anything else that * disables preemption, including local_irq_disable() and friends. However, * please note that the equivalence to rcu_read_lock() applies only to * v5.0 and later. Before v5.0, rcu_read_lock() and rcu_read_lock_sched() * were unrelated. * * Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched() * must occur in the same context, for example, it is illegal to invoke * rcu_read_unlock_sched() from process context if the matching * rcu_read_lock_sched() was invoked from an NMI handler. */ static inline void rcu_read_lock_sched(void) __acquires_shared(RCU) __acquires_shared(RCU_SCHED) { preempt_disable(); __acquire_shared(RCU); __acquire_shared(RCU_SCHED); rcu_lock_acquire(&rcu_sched_lock_map); RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_lock_sched() used illegally while idle"); } /* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */ static inline notrace void rcu_read_lock_sched_notrace(void) __acquires_shared(RCU) __acquires_shared(RCU_SCHED) { preempt_disable_notrace(); __acquire_shared(RCU); __acquire_shared(RCU_SCHED); } /** * rcu_read_unlock_sched() - marks the end of a RCU-classic critical section * * See rcu_read_lock_sched() for more information. */ static inline void rcu_read_unlock_sched(void) __releases_shared(RCU) __releases_shared(RCU_SCHED) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_unlock_sched() used illegally while idle"); rcu_lock_release(&rcu_sched_lock_map); __release_shared(RCU_SCHED); __release_shared(RCU); preempt_enable(); } /* Used by lockdep and tracing: cannot be traced, cannot call lockdep. 
*/ static inline notrace void rcu_read_unlock_sched_notrace(void) __releases_shared(RCU) __releases_shared(RCU_SCHED) { __release_shared(RCU_SCHED); __release_shared(RCU); preempt_enable_notrace(); } static __always_inline void rcu_read_lock_dont_migrate(void) __acquires_shared(RCU) { if (IS_ENABLED(CONFIG_PREEMPT_RCU)) migrate_disable(); rcu_read_lock(); } static inline void rcu_read_unlock_migrate(void) __releases_shared(RCU) { rcu_read_unlock(); if (IS_ENABLED(CONFIG_PREEMPT_RCU)) migrate_enable(); } /** * RCU_INIT_POINTER() - initialize an RCU protected pointer * @p: The pointer to be initialized. * @v: The value to initialized the pointer to. * * Initialize an RCU-protected pointer in special cases where readers * do not need ordering constraints on the CPU or the compiler. These * special cases are: * * 1. This use of RCU_INIT_POINTER() is NULLing out the pointer *or* * 2. The caller has taken whatever steps are required to prevent * RCU readers from concurrently accessing this pointer *or* * 3. The referenced data structure has already been exposed to * readers either at compile time or via rcu_assign_pointer() *and* * * a. You have not made *any* reader-visible changes to * this structure since then *or* * b. It is OK for readers accessing this structure from its * new location to see the old state of the structure. (For * example, the changes were to statistical counters or to * other state where exact synchronization is not required.) * * Failure to follow these rules governing use of RCU_INIT_POINTER() will * result in impossible-to-diagnose memory corruption. As in the structures * will look OK in crash dumps, but any concurrent RCU readers might * see pre-initialized values of the referenced data structure. So * please be very careful how you use RCU_INIT_POINTER()!!! * * If you are creating an RCU-protected linked structure that is accessed * by a single external-to-structure RCU-protected pointer, then you may * use RCU_INIT_POINTER() to initialize the internal RCU-protected * pointers, but you must use rcu_assign_pointer() to initialize the * external-to-structure pointer *after* you have completely initialized * the reader-accessible portions of the linked structure. * * Note that unlike rcu_assign_pointer(), RCU_INIT_POINTER() provides no * ordering guarantees for either the CPU or the compiler. */ #define RCU_INIT_POINTER(p, v) \ context_unsafe( \ rcu_check_sparse(p, __rcu); \ WRITE_ONCE(p, RCU_INITIALIZER(v)); \ ) /** * RCU_POINTER_INITIALIZER() - statically initialize an RCU protected pointer * @p: The pointer to be initialized. * @v: The value to initialized the pointer to. * * GCC-style initialization for an RCU-protected pointer in a structure field. */ #define RCU_POINTER_INITIALIZER(p, v) \ .p = RCU_INITIALIZER(v) /** * kfree_rcu() - kfree an object after a grace period. * @ptr: pointer to kfree for double-argument invocations. * @rhf: the name of the struct rcu_head within the type of @ptr. * * Many rcu callbacks functions just call kfree() on the base structure. * These functions are trivial, but their size adds up, and furthermore * when they are used in a kernel module, that module must invoke the * high-latency rcu_barrier() function at module-unload time. * * The kfree_rcu() function handles this issue. In order to have a universal * callback function handling different offsets of rcu_head, the callback needs * to determine the starting address of the freed object, which can be a large * kmalloc or vmalloc allocation. 
To allow simply aligning the pointer down to * page boundary for those, only offsets up to 4095 bytes can be accommodated. * If the offset is larger than 4095 bytes, a compile-time error will * be generated in kvfree_rcu_arg_2(). If this error is triggered, you can * either fall back to use of call_rcu() or rearrange the structure to * position the rcu_head structure into the first 4096 bytes. * * The object to be freed can be allocated either by kmalloc(), * kmalloc_nolock(), or kmem_cache_alloc(). * * Note that the allowable offset might decrease in the future. * * The BUILD_BUG_ON check must not involve any function calls, hence the * checks are done in macros here. */ #define kfree_rcu(ptr, rhf) kvfree_rcu_arg_2(ptr, rhf) #define kvfree_rcu(ptr, rhf) kvfree_rcu_arg_2(ptr, rhf) /** * kfree_rcu_mightsleep() - kfree an object after a grace period. * @ptr: pointer to kfree for single-argument invocations. * * When it comes to head-less variant, only one argument * is passed and that is just a pointer which has to be * freed after a grace period. Therefore the semantic is * * kfree_rcu_mightsleep(ptr); * * where @ptr is the pointer to be freed by kvfree(). * * Please note, head-less way of freeing is permitted to * use from a context that has to follow might_sleep() * annotation. Otherwise, please switch and embed the * rcu_head structure within the type of @ptr. */ #define kfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) #define kvfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) /* * In mm/slab_common.c, no suitable header to include here. */ void kvfree_call_rcu(struct rcu_head *head, void *ptr); /* * The BUILD_BUG_ON() makes sure the rcu_head offset can be handled. See the * comment of kfree_rcu() for details. */ #define kvfree_rcu_arg_2(ptr, rhf) \ do { \ typeof (ptr) ___p = (ptr); \ \ if (___p) { \ BUILD_BUG_ON(offsetof(typeof(*(ptr)), rhf) >= 4096); \ kvfree_call_rcu(&((___p)->rhf), (void *) (___p)); \ } \ } while (0) #define kvfree_rcu_arg_1(ptr) \ do { \ typeof(ptr) ___p = (ptr); \ \ if (___p) \ kvfree_call_rcu(NULL, (void *) (___p)); \ } while (0) /* * Place this after a lock-acquisition primitive to guarantee that * an UNLOCK+LOCK pair acts as a full barrier. This guarantee applies * if the UNLOCK and LOCK are executed by the same CPU or if the * UNLOCK and LOCK operate on the same lock variable. */ #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE #define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ #else /* #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ #define smp_mb__after_unlock_lock() do { } while (0) #endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ /* Has the specified rcu_head structure been handed to call_rcu()? */ /** * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu() * @rhp: The rcu_head structure to initialize. * * If you intend to invoke rcu_head_after_call_rcu() to test whether a * given rcu_head structure has already been passed to call_rcu(), then * you must also invoke this rcu_head_init() function on it just after * allocating that structure. Calls to this function must not race with * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation. */ static inline void rcu_head_init(struct rcu_head *rhp) { rhp->func = (rcu_callback_t)~0L; } /** * rcu_head_after_call_rcu() - Has this rcu_head been passed to call_rcu()? * @rhp: The rcu_head structure to test. * @f: The function passed to call_rcu() along with @rhp. * * Returns @true if the @rhp has been passed to call_rcu() with @func, * and @false otherwise. 
Emits a warning in any other case, including * the case where @rhp has already been invoked after a grace period. * Calls to this function must not race with callback invocation. One way * to avoid such races is to enclose the call to rcu_head_after_call_rcu() * in an RCU read-side critical section that includes a read-side fetch * of the pointer to the structure containing @rhp. */ static inline bool rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f) { rcu_callback_t func = READ_ONCE(rhp->func); if (func == f) return true; WARN_ON_ONCE(func != (rcu_callback_t)~0L); return false; } /* kernel/ksysfs.c definitions */ extern int rcu_expedited; extern int rcu_normal; DEFINE_LOCK_GUARD_0(rcu, rcu_read_lock(), rcu_read_unlock()) DECLARE_LOCK_GUARD_0_ATTRS(rcu, __acquires_shared(RCU), __releases_shared(RCU)) #endif /* __LINUX_RCUPDATE_H */ |
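/*
 * Illustrative sketch, not part of rcupdate.h: a minimal publish/read/
 * reclaim pattern tying together rcu_assign_pointer(), rcu_dereference(),
 * rcu_read_lock()/rcu_read_unlock(), and kfree_rcu() as documented above.
 * struct my_config, cur_config, and config_lock are hypothetical names
 * introduced only for this example.
 */
struct my_config {
	int threshold;
	struct rcu_head rcu;			/* for kfree_rcu() */
};

static struct my_config __rcu *cur_config;
static DEFINE_SPINLOCK(config_lock);		/* serializes updaters */

/* Reader: runs under rcu_read_lock() and must not block. */
static int my_config_read_threshold(void)
{
	struct my_config *cfg;
	int ret = -1;

	rcu_read_lock();
	cfg = rcu_dereference(cur_config);
	if (cfg)
		ret = cfg->threshold;
	rcu_read_unlock();
	return ret;
}

/* Updater: publishes a new version, then frees the old one after a grace period. */
static int my_config_set_threshold(int val)
{
	struct my_config *new_cfg, *old_cfg;

	new_cfg = kmalloc(sizeof(*new_cfg), GFP_KERNEL);
	if (!new_cfg)
		return -ENOMEM;
	new_cfg->threshold = val;

	spin_lock(&config_lock);
	old_cfg = rcu_dereference_protected(cur_config,
					    lockdep_is_held(&config_lock));
	rcu_assign_pointer(cur_config, new_cfg);
	spin_unlock(&config_lock);

	if (old_cfg)
		kfree_rcu(old_cfg, rcu);	/* reclaimed once readers are done */
	return 0;
}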
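/*
 * Illustrative sketch, not part of rcupdate.h: building a small linked
 * structure privately with RCU_INIT_POINTER() and publishing it with a
 * single rcu_assign_pointer(), following the rules in the RCU_INIT_POINTER()
 * comment above.  struct my_item, my_list, and my_list_build() are
 * hypothetical; update-side serialization (e.g., a lock) is assumed and
 * not shown.
 */
struct my_item {
	int val;
	struct my_item __rcu *next;
};

static struct my_item __rcu *my_list;

static int my_list_build(void)
{
	struct my_item *a, *b;

	a = kmalloc(sizeof(*a), GFP_KERNEL);
	b = kmalloc(sizeof(*b), GFP_KERNEL);
	if (!a || !b) {
		kfree(a);
		kfree(b);
		return -ENOMEM;
	}
	a->val = 1;
	b->val = 2;

	/*
	 * Neither node is reader-visible yet, so the internal pointers can
	 * be set with RCU_INIT_POINTER(), which imposes no ordering.
	 */
	RCU_INIT_POINTER(a->next, b);
	RCU_INIT_POINTER(b->next, NULL);

	/*
	 * The external pointer must be set with rcu_assign_pointer() so that
	 * readers traversing my_list see only fully initialized nodes.
	 */
	rcu_assign_pointer(my_list, a);
	return 0;
}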
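/*
 * Illustrative sketch, not part of rcupdate.h: using rcu_head_init() and
 * rcu_head_after_call_rcu() to test whether an object has already been
 * handed to call_rcu(), per the comments above.  struct my_obj,
 * my_obj_free_rcu(), and the callers are hypothetical; the caller is
 * assumed to prevent races with callback invocation, for example by
 * holding a reference or the update-side lock.
 */
struct my_obj {
	int data;
	struct rcu_head rcu;
};

static void my_obj_free_rcu(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct my_obj, rcu));
}

static struct my_obj *my_obj_alloc(void)
{
	struct my_obj *p = kmalloc(sizeof(*p), GFP_KERNEL);

	if (p)
		rcu_head_init(&p->rcu);	/* required for the check below */
	return p;
}

static void my_obj_maybe_free(struct my_obj *p)
{
	/* Queue the callback only if it has not already been queued. */
	if (!rcu_head_after_call_rcu(&p->rcu, my_obj_free_rcu))
		call_rcu(&p->rcu, my_obj_free_rcu);
}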
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/bitmap.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) */ #include <linux/buffer_head.h> #include "ext4.h" unsigned int ext4_count_free(char *bitmap, unsigned int numchars) { return numchars * BITS_PER_BYTE - memweight(bitmap, numchars); } int ext4_inode_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh) { __u32 hi; __u32 provided, calculated; struct ext4_sb_info *sbi = EXT4_SB(sb); int sz; if (!ext4_has_feature_metadata_csum(sb)) return 1; sz = EXT4_INODES_PER_GROUP(sb) >> 3; provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo); calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) { hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi); provided |= (hi << 16); } else calculated &= 0xFFFF; return provided == calculated; } void ext4_inode_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh) { __u32 csum; struct ext4_sb_info *sbi = EXT4_SB(sb); int sz; if (!ext4_has_feature_metadata_csum(sb)) return; sz = EXT4_INODES_PER_GROUP(sb) >> 3; csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16); } int ext4_block_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh) { __u32 hi; __u32 provided, calculated; struct ext4_sb_info *sbi = EXT4_SB(sb); int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; if (!ext4_has_feature_metadata_csum(sb)) return 1; provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo); calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) { hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi); provided |= (hi << 16); } else calculated &= 0xFFFF; return provided == calculated; } void ext4_block_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh) { int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; __u32 csum; struct ext4_sb_info *sbi = EXT4_SB(sb); if (!ext4_has_feature_metadata_csum(sb)) return; csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16); }
/* * include/net/tipc.h: Include file for TIPC message header routines * * Copyright (c) 2017 Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _TIPC_HDR_H #define _TIPC_HDR_H #include <linux/random.h> #define KEEPALIVE_MSG_MASK 0x0e080000 /* LINK_PROTOCOL + MSG_IS_KEEPALIVE */ struct tipc_basic_hdr { __be32 w[4]; }; static inline __be32 tipc_hdr_rps_key(struct tipc_basic_hdr *hdr) { u32 w0 = ntohl(hdr->w[0]); bool keepalive_msg = (w0 & KEEPALIVE_MSG_MASK) == KEEPALIVE_MSG_MASK; __be32 key; /* Return source node identity as key */ if (likely(!keepalive_msg)) return hdr->w[3]; /* Spread PROBE/PROBE_REPLY messages across the cores */ get_random_bytes(&key, sizeof(key)); return key; } #endif
// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/compat.c * * Kernel compatibility routines for e.g. 32 bit syscall support * on 64 bit kernels. * * Copyright (C) 2002-2003 Stephen Rothwell, IBM Corporation */ #include <linux/linkage.h> #include <linux/compat.h> #include <linux/errno.h> #include <linux/time.h> #include <linux/signal.h> #include <linux/sched.h> /* for MAX_SCHEDULE_TIMEOUT */ #include <linux/syscalls.h> #include <linux/unistd.h> #include <linux/security.h> #include <linux/export.h> #include <linux/migrate.h> #include <linux/posix-timers.h> #include <linux/times.h> #include <linux/ptrace.h> #include <linux/gfp.h> #include <linux/uaccess.h> #ifdef __ARCH_WANT_SYS_SIGPROCMASK /* * sys_sigprocmask SIG_SETMASK sets the first (compat) word of the * blocked set of signals to the supplied signal set */ static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set) { memcpy(blocked->sig, &set, sizeof(set)); } COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how, compat_old_sigset_t __user *, nset, compat_old_sigset_t __user *, oset) { old_sigset_t old_set, new_set; sigset_t new_blocked; old_set = current->blocked.sig[0]; if (nset) { if (get_user(new_set, nset)) return -EFAULT; new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); new_blocked = current->blocked; switch (how) { case SIG_BLOCK: sigaddsetmask(&new_blocked, new_set); break; case SIG_UNBLOCK: sigdelsetmask(&new_blocked, new_set); break; case SIG_SETMASK: compat_sig_setmask(&new_blocked, new_set); break; default: return -EINVAL; } set_current_blocked(&new_blocked); } if (oset) { if (put_user(old_set, oset)) return -EFAULT; } return 0; } #endif int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru) { struct compat_rusage r32; memset(&r32, 0, sizeof(r32)); r32.ru_utime.tv_sec = r->ru_utime.tv_sec; r32.ru_utime.tv_usec = r->ru_utime.tv_usec; r32.ru_stime.tv_sec = r->ru_stime.tv_sec; r32.ru_stime.tv_usec = r->ru_stime.tv_usec; r32.ru_maxrss = r->ru_maxrss; r32.ru_ixrss = r->ru_ixrss; r32.ru_idrss = r->ru_idrss; r32.ru_isrss = r->ru_isrss; r32.ru_minflt = r->ru_minflt; r32.ru_majflt = r->ru_majflt; r32.ru_nswap = r->ru_nswap; r32.ru_inblock = r->ru_inblock; r32.ru_oublock = r->ru_oublock; r32.ru_msgsnd = r->ru_msgsnd; r32.ru_msgrcv = r->ru_msgrcv; r32.ru_nsignals = r->ru_nsignals; r32.ru_nvcsw = r->ru_nvcsw; r32.ru_nivcsw = r->ru_nivcsw; if (copy_to_user(ru, &r32, sizeof(r32))) return -EFAULT; return 0; } static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, unsigned len, struct cpumask *new_mask) {
unsigned long *k; if (len < cpumask_size()) memset(new_mask, 0, cpumask_size()); else if (len > cpumask_size()) len = cpumask_size(); k = cpumask_bits(new_mask); return compat_get_bitmap(k, user_mask_ptr, len * 8); } COMPAT_SYSCALL_DEFINE3(sched_setaffinity, compat_pid_t, pid, unsigned int, len, compat_ulong_t __user *, user_mask_ptr) { cpumask_var_t new_mask; int retval; if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) return -ENOMEM; retval = compat_get_user_cpu_mask(user_mask_ptr, len, new_mask); if (retval) goto out; retval = sched_setaffinity(pid, new_mask); out: free_cpumask_var(new_mask); return retval; } COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len, compat_ulong_t __user *, user_mask_ptr) { int ret; cpumask_var_t mask; if ((len * BITS_PER_BYTE) < nr_cpu_ids) return -EINVAL; if (len & (sizeof(compat_ulong_t)-1)) return -EINVAL; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; ret = sched_getaffinity(pid, mask); if (ret == 0) { unsigned int retlen = min(len, cpumask_size()); if (compat_put_bitmap(user_mask_ptr, cpumask_bits(mask), retlen * 8)) ret = -EFAULT; else ret = retlen; } free_cpumask_var(mask); return ret; } /* * We currently only need the following fields from the sigevent * structure: sigev_value, sigev_signo, sig_notify and (sometimes * sigev_notify_thread_id). The others are handled in user mode. * We also assume that copying sigev_value.sival_int is sufficient * to keep all the bits of sigev_value.sival_ptr intact. */ int get_compat_sigevent(struct sigevent *event, const struct compat_sigevent __user *u_event) { memset(event, 0, sizeof(*event)); return (!access_ok(u_event, sizeof(*u_event)) || __get_user(event->sigev_value.sival_int, &u_event->sigev_value.sival_int) || __get_user(event->sigev_signo, &u_event->sigev_signo) || __get_user(event->sigev_notify, &u_event->sigev_notify) || __get_user(event->sigev_notify_thread_id, &u_event->sigev_notify_thread_id)) ? 
-EFAULT : 0; } long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, unsigned long bitmap_size) { unsigned long nr_compat_longs; /* align bitmap up to nearest compat_long_t boundary */ bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); if (!user_read_access_begin(umask, bitmap_size / 8)) return -EFAULT; while (nr_compat_longs > 1) { compat_ulong_t l1, l2; unsafe_get_user(l1, umask++, Efault); unsafe_get_user(l2, umask++, Efault); *mask++ = ((unsigned long)l2 << BITS_PER_COMPAT_LONG) | l1; nr_compat_longs -= 2; } if (nr_compat_longs) unsafe_get_user(*mask, umask++, Efault); user_read_access_end(); return 0; Efault: user_read_access_end(); return -EFAULT; } long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, unsigned long bitmap_size) { unsigned long nr_compat_longs; /* align bitmap up to nearest compat_long_t boundary */ bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); if (!user_write_access_begin(umask, bitmap_size / 8)) return -EFAULT; while (nr_compat_longs > 1) { unsigned long m = *mask++; unsafe_put_user((compat_ulong_t)m, umask++, Efault); unsafe_put_user(m >> BITS_PER_COMPAT_LONG, umask++, Efault); nr_compat_longs -= 2; } if (nr_compat_longs) unsafe_put_user((compat_ulong_t)*mask, umask++, Efault); user_write_access_end(); return 0; Efault: user_write_access_end(); return -EFAULT; } int get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) { #ifdef __BIG_ENDIAN compat_sigset_t v; if (copy_from_user(&v, compat, sizeof(compat_sigset_t))) return -EFAULT; switch (_NSIG_WORDS) { case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 ); fallthrough; case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 ); fallthrough; case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 ); fallthrough; case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 ); } #else if (copy_from_user(set, compat, sizeof(compat_sigset_t))) return -EFAULT; #endif return 0; } EXPORT_SYMBOL_GPL(get_compat_sigset); |
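/*
 * Illustrative user-space sketch, not kernel code: the word layout that
 * compat_get_bitmap()/compat_put_bitmap() above translate between, assuming
 * a 64-bit kernel (unsigned long is 64 bits) and a 32-bit compat ABI
 * (compat_ulong_t is 32 bits).  Each native long is stored as two compat
 * words, least-significant word first.
 */
#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t native = 0x0123456789abcdefULL;	/* one native bitmap word */
	uint32_t compat[2];
	uint64_t roundtrip;

	/* put: split into low and high 32-bit halves, low word first */
	compat[0] = (uint32_t)native;
	compat[1] = (uint32_t)(native >> 32);

	/* get: reassemble the native word from the two compat words */
	roundtrip = ((uint64_t)compat[1] << 32) | compat[0];

	assert(roundtrip == native);
	return 0;
}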
// SPDX-License-Identifier: GPL-2.0-or-later /* * * Robert Olsson
<robert.olsson@its.uu.se> Uppsala Universitet * & Swedish University of Agricultural Sciences. * * Jens Laas <jens.laas@data.slu.se> Swedish University of * Agricultural Sciences. * * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet * * This work is based on the LPC-trie which is originally described in: * * An experimental study of compression methods for dynamic tries * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. * https://www.csc.kth.se/~snilsson/software/dyntrie2/ * * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 * * Code from fib_hash has been reused which includes the following header: * * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IPv4 FIB: lookup engine and maintenance routines. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Substantial contributions to this work comes from: * * David S. Miller, <davem@davemloft.net> * Stephen Hemminger <shemminger@osdl.org> * Paul E. McKenney <paulmck@us.ibm.com> * Patrick McHardy <kaber@trash.net> */ #include <linux/cache.h> #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/rcupdate_wait.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/notifier.h> #include <net/net_namespace.h> #include <net/inet_dscp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/tcp.h> #include <net/sock.h> #include <net/ip_fib.h> #include <net/fib_notifier.h> #include <trace/events/fib.h> #include "fib_lookup.h" static int call_fib_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, u32 dst, int dst_len, struct fib_alias *fa, struct netlink_ext_ack *extack) { struct fib_entry_notifier_info info = { .info.extack = extack, .dst = dst, .dst_len = dst_len, .fi = fa->fa_info, .dscp = fa->fa_dscp, .type = fa->fa_type, .tb_id = fa->tb_id, }; return call_fib4_notifier(nb, event_type, &info.info); } static int call_fib_entry_notifiers(struct net *net, enum fib_event_type event_type, u32 dst, int dst_len, struct fib_alias *fa, struct netlink_ext_ack *extack) { struct fib_entry_notifier_info info = { .info.extack = extack, .dst = dst, .dst_len = dst_len, .fi = fa->fa_info, .dscp = fa->fa_dscp, .type = fa->fa_type, .tb_id = fa->tb_id, }; return call_fib4_notifiers(net, event_type, &info.info); } #define MAX_STAT_DEPTH 32 #define KEYLENGTH (8*sizeof(t_key)) #define KEY_MAX ((t_key)~0) typedef unsigned int t_key; #define IS_TRIE(n) ((n)->pos >= KEYLENGTH) #define IS_TNODE(n) ((n)->bits) #define IS_LEAF(n) (!(n)->bits) struct key_vector { t_key key; unsigned char pos; /* 2log(KEYLENGTH) bits needed */ unsigned char bits; /* 2log(KEYLENGTH) bits needed */ unsigned char slen; union { /* This list pointer if valid if (pos | bits) == 0 (LEAF) */ struct hlist_head leaf; /* This array is valid if (pos | bits) > 0 (TNODE) */ 
DECLARE_FLEX_ARRAY(struct key_vector __rcu *, tnode); }; }; struct tnode { struct rcu_head rcu; t_key empty_children; /* KEYLENGTH bits needed */ t_key full_children; /* KEYLENGTH bits needed */ struct key_vector __rcu *parent; struct key_vector kv[1]; #define tn_bits kv[0].bits }; #define TNODE_SIZE(n) offsetof(struct tnode, kv[0].tnode[n]) #define LEAF_SIZE TNODE_SIZE(1) #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats { unsigned int gets; unsigned int backtrack; unsigned int semantic_match_passed; unsigned int semantic_match_miss; unsigned int null_node_hit; unsigned int resize_node_skipped; }; #endif struct trie_stat { unsigned int totdepth; unsigned int maxdepth; unsigned int tnodes; unsigned int leaves; unsigned int nullpointers; unsigned int prefixes; unsigned int nodesizes[MAX_STAT_DEPTH]; }; struct trie { struct key_vector kv[1]; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats __percpu *stats; #endif }; static struct key_vector *resize(struct trie *t, struct key_vector *tn); static unsigned int tnode_free_size; /* * synchronize_rcu after call_rcu for outstanding dirty memory; it should be * especially useful before resizing the root node with PREEMPT_NONE configs; * the value was obtained experimentally, aiming to avoid visible slowdown. */ unsigned int sysctl_fib_sync_mem = 512 * 1024; unsigned int sysctl_fib_sync_mem_min = 64 * 1024; unsigned int sysctl_fib_sync_mem_max = 64 * 1024 * 1024; static struct kmem_cache *fn_alias_kmem __ro_after_init; static struct kmem_cache *trie_leaf_kmem __ro_after_init; static inline struct tnode *tn_info(struct key_vector *kv) { return container_of(kv, struct tnode, kv[0]); } /* caller must hold RTNL */ #define node_parent(tn) rtnl_dereference(tn_info(tn)->parent) #define get_child(tn, i) rtnl_dereference((tn)->tnode[i]) /* caller must hold RCU read lock or RTNL */ #define node_parent_rcu(tn) rcu_dereference_rtnl(tn_info(tn)->parent) #define get_child_rcu(tn, i) rcu_dereference_rtnl((tn)->tnode[i]) /* wrapper for rcu_assign_pointer */ static inline void node_set_parent(struct key_vector *n, struct key_vector *tp) { if (n) rcu_assign_pointer(tn_info(n)->parent, tp); } #define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER(tn_info(n)->parent, p) /* This provides us with the number of children in this node, in the case of a * leaf this will return 0 meaning none of the children are accessible. */ static inline unsigned long child_length(const struct key_vector *tn) { return (1ul << tn->bits) & ~(1ul); } #define get_cindex(key, kv) (((key) ^ (kv)->key) >> (kv)->pos) static inline unsigned long get_index(t_key key, struct key_vector *kv) { unsigned long index = key ^ kv->key; if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos)) return 0; return index >> kv->pos; } /* To understand this stuff, an understanding of keys and all their bits is * necessary. Every node in the trie has a key associated with it, but not * all of the bits in that key are significant. * * Consider a node 'n' and its parent 'tp'. * * If n is a leaf, every bit in its key is significant. Its presence is * necessitated by path compression, since during a tree traversal (when * searching for a leaf - unless we are doing an insertion) we will completely * ignore all skipped bits we encounter. Thus we need to verify, at the end of * a potentially successful search, that we have indeed been walking the * correct key path. * * Note that we can never "miss" the correct key in the tree if present by * following the wrong path. 
 * Path compression ensures that segments of the key that are the same for
 * all keys with a given prefix are skipped, but the skipped part *is*
 * identical for each node in the subtrie below the skipped bit!
 * trie_insert() in this implementation takes care of that.
 *
 * If n is an internal node - a 'tnode' here - the various parts of its key
 * have many different meanings.
 *
 * Example:
 * _________________________________________________________________
 * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
 * -----------------------------------------------------------------
 *  31  30  29  28  27  26  25  24  23  22  21  20  19  18  17  16
 *
 * _________________________________________________________________
 * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
 * -----------------------------------------------------------------
 *  15  14  13  12  11  10   9   8   7   6   5   4   3   2   1   0
 *
 * tp->pos = 22
 * tp->bits = 3
 * n->pos = 13
 * n->bits = 4
 *
 * First, let's just ignore the bits that come before the parent tp, that is
 * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this
 * point we do not use them for anything.
 *
 * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
 * index into the parent's child array. That is, they will be used to find
 * 'n' among tp's children.
 *
 * The bits from (n->pos + n->bits) to (tp->pos - 1) - "S" - are skipped bits
 * for the node n.
 *
 * All the bits we have seen so far are significant to the node n. The rest
 * of the bits are really not needed or indeed known in n->key.
 *
 * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
 * n's child array, and will of course be different for each child.
 *
 * The rest of the bits, from 0 to (n->pos - 1) - "u" - are completely unknown
 * at this point.
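 *
 * To make the example above concrete (a purely illustrative recap using the
 * values already shown, not an additional invariant): with tp->pos = 22 and
 * tp->bits = 3, the index of n in tp's child array is ((key ^ tp->key) >> 22)
 * and may take the values 0..7. With n->pos = 13 and n->bits = 4, bits 16..13
 * of a lookup key select one of n's 16 children, while bits 21..17 are the
 * skipped bits that must still match n->key at the end of a search.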
*/ static const int halve_threshold = 25; static const int inflate_threshold = 50; static const int halve_threshold_root = 15; static const int inflate_threshold_root = 30; static inline void alias_free_mem_rcu(struct fib_alias *fa) { kfree_rcu(fa, rcu); } #define TNODE_VMALLOC_MAX \ ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *)) static void __node_free_rcu(struct rcu_head *head) { struct tnode *n = container_of(head, struct tnode, rcu); if (!n->tn_bits) kmem_cache_free(trie_leaf_kmem, n); else kvfree(n); } #define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu) static struct tnode *tnode_alloc(int bits) { size_t size; /* verify bits is within bounds */ if (bits > TNODE_VMALLOC_MAX) return NULL; /* determine size and verify it is non-zero and didn't overflow */ size = TNODE_SIZE(1ul << bits); if (size <= PAGE_SIZE) return kzalloc(size, GFP_KERNEL); else return vzalloc(size); } static inline void empty_child_inc(struct key_vector *n) { tn_info(n)->empty_children++; if (!tn_info(n)->empty_children) tn_info(n)->full_children++; } static inline void empty_child_dec(struct key_vector *n) { if (!tn_info(n)->empty_children) tn_info(n)->full_children--; tn_info(n)->empty_children--; } static struct key_vector *leaf_new(t_key key, struct fib_alias *fa) { struct key_vector *l; struct tnode *kv; kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); if (!kv) return NULL; /* initialize key vector */ l = kv->kv; l->key = key; l->pos = 0; l->bits = 0; l->slen = fa->fa_slen; /* link leaf to fib alias */ INIT_HLIST_HEAD(&l->leaf); hlist_add_head(&fa->fa_list, &l->leaf); return l; } static struct key_vector *tnode_new(t_key key, int pos, int bits) { unsigned int shift = pos + bits; struct key_vector *tn; struct tnode *tnode; /* verify bits and pos their msb bits clear and values are valid */ BUG_ON(!bits || (shift > KEYLENGTH)); tnode = tnode_alloc(bits); if (!tnode) return NULL; pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0), sizeof(struct key_vector *) << bits); if (bits == KEYLENGTH) tnode->full_children = 1; else tnode->empty_children = 1ul << bits; tn = tnode->kv; tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0; tn->pos = pos; tn->bits = bits; tn->slen = pos; return tn; } /* Check whether a tnode 'n' is "full", i.e. it is an internal node * and no bits are skipped. See discussion in dyntree paper p. 6 */ static inline int tnode_full(struct key_vector *tn, struct key_vector *n) { return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n); } /* Add a child at position i overwriting the old value. * Update the value of full_children and empty_children. */ static void put_child(struct key_vector *tn, unsigned long i, struct key_vector *n) { struct key_vector *chi = get_child(tn, i); int isfull, wasfull; BUG_ON(i >= child_length(tn)); /* update emptyChildren, overflow into fullChildren */ if (!n && chi) empty_child_inc(tn); if (n && !chi) empty_child_dec(tn); /* update fullChildren */ wasfull = tnode_full(tn, chi); isfull = tnode_full(tn, n); if (wasfull && !isfull) tn_info(tn)->full_children--; else if (!wasfull && isfull) tn_info(tn)->full_children++; if (n && (tn->slen < n->slen)) tn->slen = n->slen; rcu_assign_pointer(tn->tnode[i], n); } static void update_children(struct key_vector *tn) { unsigned long i; /* update all of the child parent pointers */ for (i = child_length(tn); i;) { struct key_vector *inode = get_child(tn, --i); if (!inode) continue; /* Either update the children of a tnode that * already belongs to us or update the child * to point to ourselves. 
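 * (A child whose parent pointer already references tn is one of the tnodes
 * freshly created while assembling tn, so we recurse to fix up the
 * grandchildren it inherited; any other child was taken over from the old
 * node and only needs its parent pointer rewritten.)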
*/ if (node_parent(inode) == tn) update_children(inode); else node_set_parent(inode, tn); } } static inline void put_child_root(struct key_vector *tp, t_key key, struct key_vector *n) { if (IS_TRIE(tp)) rcu_assign_pointer(tp->tnode[0], n); else put_child(tp, get_index(key, tp), n); } static inline void tnode_free_init(struct key_vector *tn) { tn_info(tn)->rcu.next = NULL; } static inline void tnode_free_append(struct key_vector *tn, struct key_vector *n) { tn_info(n)->rcu.next = tn_info(tn)->rcu.next; tn_info(tn)->rcu.next = &tn_info(n)->rcu; } static void tnode_free(struct key_vector *tn) { struct callback_head *head = &tn_info(tn)->rcu; while (head) { head = head->next; tnode_free_size += TNODE_SIZE(1ul << tn->bits); node_free(tn); tn = container_of(head, struct tnode, rcu)->kv; } if (tnode_free_size >= READ_ONCE(sysctl_fib_sync_mem)) { tnode_free_size = 0; synchronize_net(); } } static struct key_vector *replace(struct trie *t, struct key_vector *oldtnode, struct key_vector *tn) { struct key_vector *tp = node_parent(oldtnode); unsigned long i; /* setup the parent pointer out of and back into this node */ NODE_INIT_PARENT(tn, tp); put_child_root(tp, tn->key, tn); /* update all of the child parent pointers */ update_children(tn); /* all pointers should be clean so we are done */ tnode_free(oldtnode); /* resize children now that oldtnode is freed */ for (i = child_length(tn); i;) { struct key_vector *inode = get_child(tn, --i); /* resize child node */ if (tnode_full(tn, inode)) tn = resize(t, inode); } return tp; } static struct key_vector *inflate(struct trie *t, struct key_vector *oldtnode) { struct key_vector *tn; unsigned long i; t_key m; pr_debug("In inflate\n"); tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1); if (!tn) goto notnode; /* prepare oldtnode to be freed */ tnode_free_init(oldtnode); /* Assemble all of the pointers in our cluster, in this case that * represents all of the pointers out of our allocated nodes that * point to existing tnodes and the links between our allocated * nodes. */ for (i = child_length(oldtnode), m = 1u << tn->pos; i;) { struct key_vector *inode = get_child(oldtnode, --i); struct key_vector *node0, *node1; unsigned long j, k; /* An empty child */ if (!inode) continue; /* A leaf or an internal node with skipped bits */ if (!tnode_full(oldtnode, inode)) { put_child(tn, get_index(inode->key, tn), inode); continue; } /* drop the node in the old tnode free list */ tnode_free_append(oldtnode, inode); /* An internal node with two children */ if (inode->bits == 1) { put_child(tn, 2 * i + 1, get_child(inode, 1)); put_child(tn, 2 * i, get_child(inode, 0)); continue; } /* We will replace this node 'inode' with two new * ones, 'node0' and 'node1', each with half of the * original children. The two new nodes will have * a position one bit further down the key and this * means that the "significant" part of their keys * (see the discussion near the top of this file) * will differ by one bit, which will be "0" in * node0's key and "1" in node1's key. Since we are * moving the key position by one step, the bit that * we are moving away from - the bit at position * (tn->pos) - is the one that will differ between * node0 and node1. So... we synthesize that bit in the * two new keys. 
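 *
 * As a purely illustrative example (the numbers are made up and not tied to
 * any real table): suppose tn->pos = 12, so m = 1u << 12 = 0x1000, and inode
 * has key 0x2000, pos 10 and bits 3 (a full child of the old node, since
 * 10 + 3 == oldtnode->pos == 13). Then node0 is created with key 0x2000 and
 * node1 with key 0x2000 | 0x1000 = 0x3000, both with pos 10 and bits 2;
 * node0 takes over inode's children 0..3 and node1 its children 4..7.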
*/ node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1); if (!node1) goto nomem; node0 = tnode_new(inode->key, inode->pos, inode->bits - 1); tnode_free_append(tn, node1); if (!node0) goto nomem; tnode_free_append(tn, node0); /* populate child pointers in new nodes */ for (k = child_length(inode), j = k / 2; j;) { put_child(node1, --j, get_child(inode, --k)); put_child(node0, j, get_child(inode, j)); put_child(node1, --j, get_child(inode, --k)); put_child(node0, j, get_child(inode, j)); } /* link new nodes to parent */ NODE_INIT_PARENT(node1, tn); NODE_INIT_PARENT(node0, tn); /* link parent to nodes */ put_child(tn, 2 * i + 1, node1); put_child(tn, 2 * i, node0); } /* setup the parent pointers into and out of this node */ return replace(t, oldtnode, tn); nomem: /* all pointers should be clean so we are done */ tnode_free(tn); notnode: return NULL; } static struct key_vector *halve(struct trie *t, struct key_vector *oldtnode) { struct key_vector *tn; unsigned long i; pr_debug("In halve\n"); tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1); if (!tn) goto notnode; /* prepare oldtnode to be freed */ tnode_free_init(oldtnode); /* Assemble all of the pointers in our cluster, in this case that * represents all of the pointers out of our allocated nodes that * point to existing tnodes and the links between our allocated * nodes. */ for (i = child_length(oldtnode); i;) { struct key_vector *node1 = get_child(oldtnode, --i); struct key_vector *node0 = get_child(oldtnode, --i); struct key_vector *inode; /* At least one of the children is empty */ if (!node1 || !node0) { put_child(tn, i / 2, node1 ? : node0); continue; } /* Two nonempty children */ inode = tnode_new(node0->key, oldtnode->pos, 1); if (!inode) goto nomem; tnode_free_append(tn, inode); /* initialize pointers out of node */ put_child(inode, 1, node1); put_child(inode, 0, node0); NODE_INIT_PARENT(inode, tn); /* link parent to node */ put_child(tn, i / 2, inode); } /* setup the parent pointers into and out of this node */ return replace(t, oldtnode, tn); nomem: /* all pointers should be clean so we are done */ tnode_free(tn); notnode: return NULL; } static struct key_vector *collapse(struct trie *t, struct key_vector *oldtnode) { struct key_vector *n, *tp; unsigned long i; /* scan the tnode looking for that one child that might still exist */ for (n = NULL, i = child_length(oldtnode); !n && i;) n = get_child(oldtnode, --i); /* compress one level */ tp = node_parent(oldtnode); put_child_root(tp, oldtnode->key, n); node_set_parent(n, tp); /* drop dead node */ node_free(oldtnode); return tp; } static unsigned char update_suffix(struct key_vector *tn) { unsigned char slen = tn->pos; unsigned long stride, i; unsigned char slen_max; /* only vector 0 can have a suffix length greater than or equal to * tn->pos + tn->bits, the second highest node will have a suffix * length at most of tn->pos + tn->bits - 1 */ slen_max = min_t(unsigned char, tn->pos + tn->bits - 1, tn->slen); /* search though the list of children looking for nodes that might * have a suffix greater than the one we currently have. 
This is * why we start with a stride of 2 since a stride of 1 would * represent the nodes with suffix length equal to tn->pos */ for (i = 0, stride = 0x2ul ; i < child_length(tn); i += stride) { struct key_vector *n = get_child(tn, i); if (!n || (n->slen <= slen)) continue; /* update stride and slen based on new value */ stride <<= (n->slen - slen); slen = n->slen; i &= ~(stride - 1); /* stop searching if we have hit the maximum possible value */ if (slen >= slen_max) break; } tn->slen = slen; return slen; } /* From "Implementing a dynamic compressed trie" by Stefan Nilsson of * the Helsinki University of Technology and Matti Tikkanen of Nokia * Telecommunications, page 6: * "A node is doubled if the ratio of non-empty children to all * children in the *doubled* node is at least 'high'." * * 'high' in this instance is the variable 'inflate_threshold'. It * is expressed as a percentage, so we multiply it with * child_length() and instead of multiplying by 2 (since the * child array will be doubled by inflate()) and multiplying * the left-hand side by 100 (to handle the percentage thing) we * multiply the left-hand side by 50. * * The left-hand side may look a bit weird: child_length(tn) * - tn->empty_children is of course the number of non-null children * in the current node. tn->full_children is the number of "full" * children, that is non-null tnodes with a skip value of 0. * All of those will be doubled in the resulting inflated tnode, so * we just count them one extra time here. * * A clearer way to write this would be: * * to_be_doubled = tn->full_children; * not_to_be_doubled = child_length(tn) - tn->empty_children - * tn->full_children; * * new_child_length = child_length(tn) * 2; * * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) / * new_child_length; * if (new_fill_factor >= inflate_threshold) * * ...and so on, tho it would mess up the while () loop. * * anyway, * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >= * inflate_threshold * * avoid a division: * 100 * (not_to_be_doubled + 2*to_be_doubled) >= * inflate_threshold * new_child_length * * expand not_to_be_doubled and to_be_doubled, and shorten: * 100 * (child_length(tn) - tn->empty_children + * tn->full_children) >= inflate_threshold * new_child_length * * expand new_child_length: * 100 * (child_length(tn) - tn->empty_children + * tn->full_children) >= * inflate_threshold * child_length(tn) * 2 * * shorten again: * 50 * (tn->full_children + child_length(tn) - * tn->empty_children) >= inflate_threshold * * child_length(tn) * */ static inline bool should_inflate(struct key_vector *tp, struct key_vector *tn) { unsigned long used = child_length(tn); unsigned long threshold = used; /* Keep root node larger */ threshold *= IS_TRIE(tp) ? inflate_threshold_root : inflate_threshold; used -= tn_info(tn)->empty_children; used += tn_info(tn)->full_children; /* if bits == KEYLENGTH then pos = 0, and will fail below */ return (used > 1) && tn->pos && ((50 * used) >= threshold); } static inline bool should_halve(struct key_vector *tp, struct key_vector *tn) { unsigned long used = child_length(tn); unsigned long threshold = used; /* Keep root node larger */ threshold *= IS_TRIE(tp) ? 
halve_threshold_root : halve_threshold; used -= tn_info(tn)->empty_children; /* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */ return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold); } static inline bool should_collapse(struct key_vector *tn) { unsigned long used = child_length(tn); used -= tn_info(tn)->empty_children; /* account for bits == KEYLENGTH case */ if ((tn->bits == KEYLENGTH) && tn_info(tn)->full_children) used -= KEY_MAX; /* One child or none, time to drop us from the trie */ return used < 2; } #define MAX_WORK 10 static struct key_vector *resize(struct trie *t, struct key_vector *tn) { #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats __percpu *stats = t->stats; #endif struct key_vector *tp = node_parent(tn); unsigned long cindex = get_index(tn->key, tp); int max_work = MAX_WORK; pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n", tn, inflate_threshold, halve_threshold); /* track the tnode via the pointer from the parent instead of * doing it ourselves. This way we can let RCU fully do its * thing without us interfering */ BUG_ON(tn != get_child(tp, cindex)); /* Double as long as the resulting node has a number of * nonempty nodes that are above the threshold. */ while (should_inflate(tp, tn) && max_work) { tp = inflate(t, tn); if (!tp) { #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->resize_node_skipped); #endif break; } max_work--; tn = get_child(tp, cindex); } /* update parent in case inflate failed */ tp = node_parent(tn); /* Return if at least one inflate is run */ if (max_work != MAX_WORK) return tp; /* Halve as long as the number of empty children in this * node is above threshold. */ while (should_halve(tp, tn) && max_work) { tp = halve(t, tn); if (!tp) { #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->resize_node_skipped); #endif break; } max_work--; tn = get_child(tp, cindex); } /* Only one child remains */ if (should_collapse(tn)) return collapse(t, tn); /* update parent in case halve failed */ return node_parent(tn); } static void node_pull_suffix(struct key_vector *tn, unsigned char slen) { unsigned char node_slen = tn->slen; while ((node_slen > tn->pos) && (node_slen > slen)) { slen = update_suffix(tn); if (node_slen == slen) break; tn = node_parent(tn); node_slen = tn->slen; } } static void node_push_suffix(struct key_vector *tn, unsigned char slen) { while (tn->slen < slen) { tn->slen = slen; tn = node_parent(tn); } } /* rcu_read_lock needs to be hold by caller from readside */ static struct key_vector *fib_find_node(struct trie *t, struct key_vector **tp, u32 key) { struct key_vector *pn, *n = t->kv; unsigned long index = 0; do { pn = n; n = get_child_rcu(n, index); if (!n) break; index = get_cindex(key, n); /* This bit of code is a bit tricky but it combines multiple * checks into a single check. The prefix consists of the * prefix plus zeros for the bits in the cindex. The index * is the difference between the key and this value. From * this we can actually derive several pieces of data. * if (index >= (1ul << bits)) * we have a mismatch in skip bits and failed * else * we know the value is cindex * * This check is safe even if bits == KEYLENGTH due to the * fact that we can only allocate a node with 32 bits if a * long is greater than 32 bits. */ if (index >= (1ul << n->bits)) { n = NULL; break; } /* keep searching until we find a perfect match leaf or NULL */ } while (IS_TNODE(n)); *tp = pn; return n; } /* Return the first fib alias matching DSCP with * priority less than or equal to PRIO. 
* If 'find_first' is set, return the first matching * fib alias, regardless of DSCP and priority. */ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen, dscp_t dscp, u32 prio, u32 tb_id, bool find_first) { struct fib_alias *fa; if (!fah) return NULL; hlist_for_each_entry(fa, fah, fa_list) { /* Avoid Sparse warning when using dscp_t in inequalities */ u8 __fa_dscp = inet_dscp_to_dsfield(fa->fa_dscp); u8 __dscp = inet_dscp_to_dsfield(dscp); if (fa->fa_slen < slen) continue; if (fa->fa_slen != slen) break; if (fa->tb_id > tb_id) continue; if (fa->tb_id != tb_id) break; if (find_first) return fa; if (__fa_dscp > __dscp) continue; if (fa->fa_info->fib_priority >= prio || __fa_dscp < __dscp) return fa; } return NULL; } static struct fib_alias * fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri) { u8 slen = KEYLENGTH - fri->dst_len; struct key_vector *l, *tp; struct fib_table *tb; struct fib_alias *fa; struct trie *t; tb = fib_get_table(net, fri->tb_id); if (!tb) return NULL; t = (struct trie *)tb->tb_data; l = fib_find_node(t, &tp, be32_to_cpu(fri->dst)); if (!l) return NULL; hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { if (fa->fa_slen == slen && fa->tb_id == fri->tb_id && fa->fa_dscp == fri->dscp && fa->fa_info == fri->fi && fa->fa_type == fri->type) return fa; } return NULL; } void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri) { u8 fib_notify_on_flag_change; struct fib_alias *fa_match; struct sk_buff *skb; int err; rcu_read_lock(); fa_match = fib_find_matching_alias(net, fri); if (!fa_match) goto out; /* These are paired with the WRITE_ONCE() happening in this function. * The reason is that we are only protected by RCU at this point. */ if (READ_ONCE(fa_match->offload) == fri->offload && READ_ONCE(fa_match->trap) == fri->trap && READ_ONCE(fa_match->offload_failed) == fri->offload_failed) goto out; WRITE_ONCE(fa_match->offload, fri->offload); WRITE_ONCE(fa_match->trap, fri->trap); fib_notify_on_flag_change = READ_ONCE(net->ipv4.sysctl_fib_notify_on_flag_change); /* 2 means send notifications only if offload_failed was changed. */ if (fib_notify_on_flag_change == 2 && READ_ONCE(fa_match->offload_failed) == fri->offload_failed) goto out; WRITE_ONCE(fa_match->offload_failed, fri->offload_failed); if (!fib_notify_on_flag_change) goto out; skb = nlmsg_new(fib_nlmsg_size(fa_match->fa_info), GFP_ATOMIC); if (!skb) { err = -ENOBUFS; goto errout; } err = fib_dump_info(skb, 0, 0, RTM_NEWROUTE, fri, 0); if (err < 0) { /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_IPV4_ROUTE, NULL, GFP_ATOMIC); goto out; errout: rtnl_set_sk_err(net, RTNLGRP_IPV4_ROUTE, err); out: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(fib_alias_hw_flags_set); static void trie_rebalance(struct trie *t, struct key_vector *tn) { while (!IS_TRIE(tn)) tn = resize(t, tn); } static int fib_insert_node(struct trie *t, struct key_vector *tp, struct fib_alias *new, t_key key) { struct key_vector *n, *l; l = leaf_new(key, new); if (!l) goto noleaf; /* retrieve child from parent node */ n = get_child(tp, get_index(key, tp)); /* Case 2: n is a LEAF or a TNODE and the key doesn't match. 
* * Add a new tnode here * first tnode need some special handling * leaves us in position for handling as case 3 */ if (n) { struct key_vector *tn; tn = tnode_new(key, __fls(key ^ n->key), 1); if (!tn) goto notnode; /* initialize routes out of node */ NODE_INIT_PARENT(tn, tp); put_child(tn, get_index(key, tn) ^ 1, n); /* start adding routes into the node */ put_child_root(tp, key, tn); node_set_parent(n, tn); /* parent now has a NULL spot where the leaf can go */ tp = tn; } /* Case 3: n is NULL, and will just insert a new leaf */ node_push_suffix(tp, new->fa_slen); NODE_INIT_PARENT(l, tp); put_child_root(tp, key, l); trie_rebalance(t, tp); return 0; notnode: node_free(l); noleaf: return -ENOMEM; } static int fib_insert_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *new, struct fib_alias *fa, t_key key) { if (!l) return fib_insert_node(t, tp, new, key); if (fa) { hlist_add_before_rcu(&new->fa_list, &fa->fa_list); } else { struct fib_alias *last; hlist_for_each_entry(last, &l->leaf, fa_list) { if (new->fa_slen < last->fa_slen) break; if ((new->fa_slen == last->fa_slen) && (new->tb_id > last->tb_id)) break; fa = last; } if (fa) hlist_add_behind_rcu(&new->fa_list, &fa->fa_list); else hlist_add_head_rcu(&new->fa_list, &l->leaf); } /* if we added to the tail node then we need to update slen */ if (l->slen < new->fa_slen) { l->slen = new->fa_slen; node_push_suffix(tp, new->fa_slen); } return 0; } static void fib_remove_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *old); /* Caller must hold RTNL. */ int fib_table_insert(struct net *net, struct fib_table *tb, struct fib_config *cfg, struct netlink_ext_ack *extack) { struct trie *t = (struct trie *)tb->tb_data; struct fib_alias *fa, *new_fa; struct key_vector *l, *tp; u16 nlflags = NLM_F_EXCL; struct fib_info *fi; u8 plen = cfg->fc_dst_len; u8 slen = KEYLENGTH - plen; dscp_t dscp; u32 key; int err; key = ntohl(cfg->fc_dst); pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); fi = fib_create_info(cfg, extack); if (IS_ERR(fi)) { err = PTR_ERR(fi); goto err; } dscp = cfg->fc_dscp; l = fib_find_node(t, &tp, key); fa = l ? fib_find_alias(&l->leaf, slen, dscp, fi->fib_priority, tb->tb_id, false) : NULL; /* Now fa, if non-NULL, points to the first fib alias * with the same keys [prefix,dscp,priority], if such key already * exists or to the node before which we will insert new one. * * If fa is NULL, we will need to allocate a new one and * insert to the tail of the section matching the suffix length * of the new alias. */ if (fa && fa->fa_dscp == dscp && fa->fa_info->fib_priority == fi->fib_priority) { struct fib_alias *fa_first, *fa_match; err = -EEXIST; if (cfg->fc_nlflags & NLM_F_EXCL) goto out; nlflags &= ~NLM_F_EXCL; /* We have 2 goals: * 1. Find exact match for type, scope, fib_info to avoid * duplicate routes * 2. 
Find next 'fa' (or head), NLM_F_APPEND inserts before it */ fa_match = NULL; fa_first = fa; hlist_for_each_entry_from(fa, fa_list) { if ((fa->fa_slen != slen) || (fa->tb_id != tb->tb_id) || (fa->fa_dscp != dscp)) break; if (fa->fa_info->fib_priority != fi->fib_priority) break; if (fa->fa_type == cfg->fc_type && fa->fa_info == fi) { fa_match = fa; break; } } if (cfg->fc_nlflags & NLM_F_REPLACE) { struct fib_info *fi_drop; u8 state; nlflags |= NLM_F_REPLACE; fa = fa_first; if (fa_match) { if (fa == fa_match) err = 0; goto out; } err = -ENOBUFS; new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); if (!new_fa) goto out; fi_drop = fa->fa_info; new_fa->fa_dscp = fa->fa_dscp; new_fa->fa_info = fi; new_fa->fa_type = cfg->fc_type; state = READ_ONCE(fa->fa_state); new_fa->fa_state = state & ~FA_S_ACCESSED; new_fa->fa_slen = fa->fa_slen; new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; new_fa->offload = 0; new_fa->trap = 0; new_fa->offload_failed = 0; hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); if (fib_find_alias(&l->leaf, fa->fa_slen, 0, 0, tb->tb_id, true) == new_fa) { enum fib_event_type fib_event; fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib_entry_notifiers(net, fib_event, key, plen, new_fa, extack); if (err) { hlist_replace_rcu(&new_fa->fa_list, &fa->fa_list); goto out_free_new_fa; } } rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, nlflags); alias_free_mem_rcu(fa); fib_release_info(fi_drop); if (state & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net); goto succeeded; } /* Error if we find a perfect match which * uses the same scope, type, and nexthop * information. */ if (fa_match) goto out; if (cfg->fc_nlflags & NLM_F_APPEND) nlflags |= NLM_F_APPEND; else fa = fa_first; } err = -ENOENT; if (!(cfg->fc_nlflags & NLM_F_CREATE)) goto out; nlflags |= NLM_F_CREATE; err = -ENOBUFS; new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); if (!new_fa) goto out; new_fa->fa_info = fi; new_fa->fa_dscp = dscp; new_fa->fa_type = cfg->fc_type; new_fa->fa_state = 0; new_fa->fa_slen = slen; new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; new_fa->offload = 0; new_fa->trap = 0; new_fa->offload_failed = 0; /* Insert new entry to the list. */ err = fib_insert_alias(t, tp, l, new_fa, fa, key); if (err) goto out_free_new_fa; /* The alias was already inserted, so the node must exist. */ l = l ? 
l : fib_find_node(t, &tp, key); if (WARN_ON_ONCE(!l)) { err = -ENOENT; goto out_free_new_fa; } if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) == new_fa) { enum fib_event_type fib_event; fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib_entry_notifiers(net, fib_event, key, plen, new_fa, extack); if (err) goto out_remove_new_fa; } if (!plen) tb->tb_num_default++; rt_cache_flush(cfg->fc_nlinfo.nl_net); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, &cfg->fc_nlinfo, nlflags); succeeded: return 0; out_remove_new_fa: fib_remove_alias(t, tp, l, new_fa); out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: fib_release_info(fi); err: return err; } static inline t_key prefix_mismatch(t_key key, struct key_vector *n) { t_key prefix = n->key; return (key ^ prefix) & (prefix | -prefix); } bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags, const struct flowi4 *flp) { if (nhc->nhc_flags & RTNH_F_DEAD) return false; if (ip_ignore_linkdown(nhc->nhc_dev) && nhc->nhc_flags & RTNH_F_LINKDOWN && !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) return false; if (flp->flowi4_oif && flp->flowi4_oif != nhc->nhc_oif) return false; return true; } /* should be called with rcu_read_lock */ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, struct fib_result *res, int fib_flags) { struct trie *t = (struct trie *) tb->tb_data; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats __percpu *stats = t->stats; #endif const t_key key = ntohl(flp->daddr); struct key_vector *n, *pn; struct fib_alias *fa; unsigned long index; t_key cindex; pn = t->kv; cindex = 0; n = get_child_rcu(pn, cindex); if (!n) { trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN); return -EAGAIN; } #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->gets); #endif /* Step 1: Travel to the longest prefix match in the trie */ for (;;) { index = get_cindex(key, n); /* This bit of code is a bit tricky but it combines multiple * checks into a single check. The prefix consists of the * prefix plus zeros for the "bits" in the prefix. The index * is the difference between the key and this value. From * this we can actually derive several pieces of data. * if (index >= (1ul << bits)) * we have a mismatch in skip bits and failed * else * we know the value is cindex * * This check is safe even if bits == KEYLENGTH due to the * fact that we can only allocate a node with 32 bits if a * long is greater than 32 bits. */ if (index >= (1ul << n->bits)) break; /* we have found a leaf. Prefixes have already been compared */ if (IS_LEAF(n)) goto found; /* only record pn and cindex if we are going to be chopping * bits later. Otherwise we are just wasting cycles. */ if (n->slen > n->pos) { pn = n; cindex = index; } n = get_child_rcu(n, index); if (unlikely(!n)) goto backtrace; } /* Step 2: Sort out leaves and begin backtracing for longest prefix */ for (;;) { /* record the pointer where our next node pointer is stored */ struct key_vector __rcu **cptr = n->tnode; /* This test verifies that none of the bits that differ * between the key and the prefix exist in the region of * the lsb and higher in the prefix. */ if (unlikely(prefix_mismatch(key, n)) || (n->slen == n->pos)) goto backtrace; /* exit out and process leaf */ if (unlikely(IS_LEAF(n))) break; /* Don't bother recording parent info. 
Since we are in * prefix match mode we will have to come back to wherever * we started this traversal anyway */ while ((n = rcu_dereference(*cptr)) == NULL) { backtrace: #ifdef CONFIG_IP_FIB_TRIE_STATS if (!n) this_cpu_inc(stats->null_node_hit); #endif /* If we are at cindex 0 there are no more bits for * us to strip at this level so we must ascend back * up one level to see if there are any more bits to * be stripped there. */ while (!cindex) { t_key pkey = pn->key; /* If we don't have a parent then there is * nothing for us to do as we do not have any * further nodes to parse. */ if (IS_TRIE(pn)) { trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN); return -EAGAIN; } #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->backtrack); #endif /* Get Child's index */ pn = node_parent_rcu(pn); cindex = get_index(pkey, pn); } /* strip the least significant bit from the cindex */ cindex &= cindex - 1; /* grab pointer for next child node */ cptr = &pn->tnode[cindex]; } } found: /* this line carries forward the xor from earlier in the function */ index = key ^ n->key; /* Step 3: Process the leaf, if that fails fall back to backtracing */ hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) { struct fib_info *fi = fa->fa_info; struct fib_nh_common *nhc; int nhsel, err; if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) { if (index >= (1ul << fa->fa_slen)) continue; } if (fa->fa_dscp && !fib_dscp_masked_match(fa->fa_dscp, flp)) continue; /* Paired with WRITE_ONCE() in fib_release_info() */ if (READ_ONCE(fi->fib_dead)) continue; if (fa->fa_info->fib_scope < flp->flowi4_scope) continue; fib_alias_accessed(fa); err = fib_props[fa->fa_type].error; if (unlikely(err < 0)) { out_reject: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_passed); #endif trace_fib_table_lookup(tb->tb_id, flp, NULL, err); return err; } if (fi->fib_flags & RTNH_F_DEAD) continue; if (unlikely(fi->nh)) { if (nexthop_is_blackhole(fi->nh)) { err = fib_props[RTN_BLACKHOLE].error; goto out_reject; } nhc = nexthop_get_nhc_lookup(fi->nh, fib_flags, flp, &nhsel); if (nhc) goto set_result; goto miss; } for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) { nhc = fib_info_nhc(fi, nhsel); if (!fib_lookup_good_nhc(nhc, fib_flags, flp)) continue; set_result: if (!(fib_flags & FIB_LOOKUP_NOREF)) refcount_inc(&fi->fib_clntref); res->prefix = htonl(n->key); res->prefixlen = KEYLENGTH - fa->fa_slen; res->nh_sel = nhsel; res->nhc = nhc; res->type = fa->fa_type; res->scope = fi->fib_scope; res->dscp = fa->fa_dscp; res->fi = fi; res->table = tb; res->fa_head = &n->leaf; #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_passed); #endif trace_fib_table_lookup(tb->tb_id, flp, nhc, err); return err; } } miss: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_miss); #endif goto backtrace; } EXPORT_SYMBOL_GPL(fib_table_lookup); static void fib_remove_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *old) { /* record the location of the previous list_info entry */ struct hlist_node **pprev = old->fa_list.pprev; struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next); /* remove the fib_alias from the list */ hlist_del_rcu(&old->fa_list); /* if we emptied the list this leaf will be freed and we can sort * out parent suffix lengths as a part of trie_rebalance */ if (hlist_empty(&l->leaf)) { if (tp->slen == l->slen) node_pull_suffix(tp, tp->pos); put_child_root(tp, l->key, NULL); node_free(l); trie_rebalance(t, tp); return; } /* only access fa if it is 
pointing at the last valid hlist_node */ if (*pprev) return; /* update the trie with the latest suffix length */ l->slen = fa->fa_slen; node_pull_suffix(tp, fa->fa_slen); } static void fib_notify_alias_delete(struct net *net, u32 key, struct hlist_head *fah, struct fib_alias *fa_to_delete, struct netlink_ext_ack *extack) { struct fib_alias *fa_next, *fa_to_notify; u32 tb_id = fa_to_delete->tb_id; u8 slen = fa_to_delete->fa_slen; enum fib_event_type fib_event; /* Do not notify if we do not care about the route. */ if (fib_find_alias(fah, slen, 0, 0, tb_id, true) != fa_to_delete) return; /* Determine if the route should be replaced by the next route in the * list. */ fa_next = hlist_entry_safe(fa_to_delete->fa_list.next, struct fib_alias, fa_list); if (fa_next && fa_next->fa_slen == slen && fa_next->tb_id == tb_id) { fib_event = FIB_EVENT_ENTRY_REPLACE; fa_to_notify = fa_next; } else { fib_event = FIB_EVENT_ENTRY_DEL; fa_to_notify = fa_to_delete; } call_fib_entry_notifiers(net, fib_event, key, KEYLENGTH - slen, fa_to_notify, extack); } /* Caller must hold RTNL. */ int fib_table_delete(struct net *net, struct fib_table *tb, struct fib_config *cfg, struct netlink_ext_ack *extack) { struct trie *t = (struct trie *) tb->tb_data; struct fib_alias *fa, *fa_to_delete; struct key_vector *l, *tp; u8 plen = cfg->fc_dst_len; u8 slen = KEYLENGTH - plen; dscp_t dscp; u32 key; key = ntohl(cfg->fc_dst); l = fib_find_node(t, &tp, key); if (!l) return -ESRCH; dscp = cfg->fc_dscp; fa = fib_find_alias(&l->leaf, slen, dscp, 0, tb->tb_id, false); if (!fa) return -ESRCH; pr_debug("Deleting %08x/%d dsfield=0x%02x t=%p\n", key, plen, inet_dscp_to_dsfield(dscp), t); fa_to_delete = NULL; hlist_for_each_entry_from(fa, fa_list) { struct fib_info *fi = fa->fa_info; if ((fa->fa_slen != slen) || (fa->tb_id != tb->tb_id) || (fa->fa_dscp != dscp)) break; if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) && (cfg->fc_scope == RT_SCOPE_NOWHERE || fa->fa_info->fib_scope == cfg->fc_scope) && (!cfg->fc_prefsrc || fi->fib_prefsrc == cfg->fc_prefsrc) && (!cfg->fc_protocol || fi->fib_protocol == cfg->fc_protocol) && fib_nh_match(net, cfg, fi, extack) == 0 && fib_metrics_match(cfg, fi)) { fa_to_delete = fa; break; } } if (!fa_to_delete) return -ESRCH; fib_notify_alias_delete(net, key, &l->leaf, fa_to_delete, extack); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); if (!plen) tb->tb_num_default--; fib_remove_alias(t, tp, l, fa_to_delete); if (READ_ONCE(fa_to_delete->fa_state) & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net); fib_release_info(fa_to_delete->fa_info); alias_free_mem_rcu(fa_to_delete); return 0; } /* Scan for the next leaf starting at the provided key value */ static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key) { struct key_vector *pn, *n = *tn; unsigned long cindex; /* this loop is meant to try and find the key in the trie */ do { /* record parent and next child index */ pn = n; cindex = (key > pn->key) ? 
get_index(key, pn) : 0; if (cindex >> pn->bits) break; /* descend into the next child */ n = get_child_rcu(pn, cindex++); if (!n) break; /* guarantee forward progress on the keys */ if (IS_LEAF(n) && (n->key >= key)) goto found; } while (IS_TNODE(n)); /* this loop will search for the next leaf with a greater key */ while (!IS_TRIE(pn)) { /* if we exhausted the parent node we will need to climb */ if (cindex >= (1ul << pn->bits)) { t_key pkey = pn->key; pn = node_parent_rcu(pn); cindex = get_index(pkey, pn) + 1; continue; } /* grab the next available node */ n = get_child_rcu(pn, cindex++); if (!n) continue; /* no need to compare keys since we bumped the index */ if (IS_LEAF(n)) goto found; /* Rescan start scanning in new node */ pn = n; cindex = 0; } *tn = pn; return NULL; /* Root of trie */ found: /* if we are at the limit for keys just return NULL for the tnode */ *tn = pn; return n; } static void fib_trie_free(struct fib_table *tb) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *pn = t->kv; unsigned long cindex = 1; struct hlist_node *tmp; struct fib_alias *fa; /* walk trie in reverse order and free everything */ for (;;) { struct key_vector *n; if (!(cindex--)) { t_key pkey = pn->key; if (IS_TRIE(pn)) break; n = pn; pn = node_parent(pn); /* drop emptied tnode */ put_child_root(pn, n->key, NULL); node_free(n); cindex = get_index(pkey, pn); continue; } /* grab the next available node */ n = get_child(pn, cindex); if (!n) continue; if (IS_TNODE(n)) { /* record pn and cindex for leaf walking */ pn = n; cindex = 1ul << n->bits; continue; } hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { hlist_del_rcu(&fa->fa_list); alias_free_mem_rcu(fa); } put_child_root(pn, n->key, NULL); node_free(n); } #ifdef CONFIG_IP_FIB_TRIE_STATS free_percpu(t->stats); #endif kfree(tb); } struct fib_table *fib_trie_unmerge(struct fib_table *oldtb) { struct trie *ot = (struct trie *)oldtb->tb_data; struct key_vector *l, *tp = ot->kv; struct fib_table *local_tb; struct fib_alias *fa; struct trie *lt; t_key key = 0; if (oldtb->tb_data == oldtb->__data) return oldtb; local_tb = fib_trie_table(RT_TABLE_LOCAL, NULL); if (!local_tb) return NULL; lt = (struct trie *)local_tb->tb_data; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { struct key_vector *local_l = NULL, *local_tp; hlist_for_each_entry(fa, &l->leaf, fa_list) { struct fib_alias *new_fa; if (local_tb->tb_id != fa->tb_id) continue; /* clone fa for new local table */ new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); if (!new_fa) goto out; memcpy(new_fa, fa, sizeof(*fa)); /* insert clone into table */ if (!local_l) local_l = fib_find_node(lt, &local_tp, l->key); if (fib_insert_alias(lt, local_tp, local_l, new_fa, NULL, l->key)) { kmem_cache_free(fn_alias_kmem, new_fa); goto out; } } /* stop loop if key wrapped back to 0 */ key = l->key + 1; if (key < l->key) break; } return local_tb; out: fib_trie_free(local_tb); return NULL; } /* Caller must hold RTNL */ void fib_table_flush_external(struct fib_table *tb) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *pn = t->kv; unsigned long cindex = 1; struct hlist_node *tmp; struct fib_alias *fa; /* walk trie in reverse order */ for (;;) { unsigned char slen = 0; struct key_vector *n; if (!(cindex--)) { t_key pkey = pn->key; /* cannot resize the trie vector */ if (IS_TRIE(pn)) break; /* update the suffix to address pulled leaves */ if (pn->slen > pn->pos) update_suffix(pn); /* resize completed node */ pn = resize(t, pn); cindex = get_index(pkey, pn); continue; } /* grab the next 
available node */ n = get_child(pn, cindex); if (!n) continue; if (IS_TNODE(n)) { /* record pn and cindex for leaf walking */ pn = n; cindex = 1ul << n->bits; continue; } hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { /* if alias was cloned to local then we just * need to remove the local copy from main */ if (tb->tb_id != fa->tb_id) { hlist_del_rcu(&fa->fa_list); alias_free_mem_rcu(fa); continue; } /* record local slen */ slen = fa->fa_slen; } /* update leaf slen */ n->slen = slen; if (hlist_empty(&n->leaf)) { put_child_root(pn, n->key, NULL); node_free(n); } } } /* Caller must hold RTNL. */ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all) { struct trie *t = (struct trie *)tb->tb_data; struct nl_info info = { .nl_net = net }; struct key_vector *pn = t->kv; unsigned long cindex = 1; struct hlist_node *tmp; struct fib_alias *fa; int found = 0; /* walk trie in reverse order */ for (;;) { unsigned char slen = 0; struct key_vector *n; if (!(cindex--)) { t_key pkey = pn->key; /* cannot resize the trie vector */ if (IS_TRIE(pn)) break; /* update the suffix to address pulled leaves */ if (pn->slen > pn->pos) update_suffix(pn); /* resize completed node */ pn = resize(t, pn); cindex = get_index(pkey, pn); continue; } /* grab the next available node */ n = get_child(pn, cindex); if (!n) continue; if (IS_TNODE(n)) { /* record pn and cindex for leaf walking */ pn = n; cindex = 1ul << n->bits; continue; } hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { struct fib_info *fi = fa->fa_info; if (!fi || tb->tb_id != fa->tb_id || (!(fi->fib_flags & RTNH_F_DEAD) && !fib_props[fa->fa_type].error)) { slen = fa->fa_slen; continue; } /* When not flushing the entire table, skip error * routes that are not marked for deletion. */ if (!flush_all && fib_props[fa->fa_type].error && !(fi->fib_flags & RTNH_F_DEAD)) { slen = fa->fa_slen; continue; } fib_notify_alias_delete(net, n->key, &n->leaf, fa, NULL); if (fi->pfsrc_removed) rtmsg_fib(RTM_DELROUTE, htonl(n->key), fa, KEYLENGTH - fa->fa_slen, tb->tb_id, &info, 0); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); found++; } /* update leaf slen */ n->slen = slen; if (hlist_empty(&n->leaf)) { put_child_root(pn, n->key, NULL); node_free(n); } } pr_debug("trie_flush found=%d\n", found); return found; } /* derived from fib_trie_free */ static void __fib_info_notify_update(struct net *net, struct fib_table *tb, struct nl_info *info) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *pn = t->kv; unsigned long cindex = 1; struct fib_alias *fa; for (;;) { struct key_vector *n; if (!(cindex--)) { t_key pkey = pn->key; if (IS_TRIE(pn)) break; pn = node_parent(pn); cindex = get_index(pkey, pn); continue; } /* grab the next available node */ n = get_child(pn, cindex); if (!n) continue; if (IS_TNODE(n)) { /* record pn and cindex for leaf walking */ pn = n; cindex = 1ul << n->bits; continue; } hlist_for_each_entry(fa, &n->leaf, fa_list) { struct fib_info *fi = fa->fa_info; if (!fi || !fi->nh_updated || fa->tb_id != tb->tb_id) continue; rtmsg_fib(RTM_NEWROUTE, htonl(n->key), fa, KEYLENGTH - fa->fa_slen, tb->tb_id, info, NLM_F_REPLACE); } } } void fib_info_notify_update(struct net *net, struct nl_info *info) { unsigned int h; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist, lockdep_rtnl_is_held()) __fib_info_notify_update(net, tb, info); } } static int fib_leaf_notify(struct 
key_vector *l, struct fib_table *tb, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct fib_alias *fa; int last_slen = -1; int err; hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { struct fib_info *fi = fa->fa_info; if (!fi) continue; /* local and main table can share the same trie, * so don't notify twice for the same entry. */ if (tb->tb_id != fa->tb_id) continue; if (fa->fa_slen == last_slen) continue; last_slen = fa->fa_slen; err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_REPLACE, l->key, KEYLENGTH - fa->fa_slen, fa, extack); if (err) return err; } return 0; } static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *l, *tp = t->kv; t_key key = 0; int err; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { err = fib_leaf_notify(l, tb, nb, extack); if (err) return err; key = l->key + 1; /* stop in case of wrap around */ if (key < l->key) break; } return 0; } int fib_notify(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { unsigned int h; int err; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist) { err = fib_table_notify(tb, nb, extack); if (err) return err; } } return 0; } static void __trie_free_rcu(struct rcu_head *head) { struct fib_table *tb = container_of(head, struct fib_table, rcu); #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie *t = (struct trie *)tb->tb_data; if (tb->tb_data == tb->__data) free_percpu(t->stats); #endif /* CONFIG_IP_FIB_TRIE_STATS */ kfree(tb); } void fib_free_table(struct fib_table *tb) { call_rcu(&tb->rcu, __trie_free_rcu); } static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb, struct fib_dump_filter *filter) { unsigned int flags = NLM_F_MULTI; __be32 xkey = htonl(l->key); int i, s_i, i_fa, s_fa, err; struct fib_alias *fa; if (filter->filter_set || !filter->dump_exceptions || !filter->dump_routes) flags |= NLM_F_DUMP_FILTERED; s_i = cb->args[4]; s_fa = cb->args[5]; i = 0; /* rcu_read_lock is hold by caller */ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { struct fib_info *fi = fa->fa_info; if (i < s_i) goto next; i_fa = 0; if (tb->tb_id != fa->tb_id) goto next; if (filter->filter_set) { if (filter->rt_type && fa->fa_type != filter->rt_type) goto next; if ((filter->protocol && fi->fib_protocol != filter->protocol)) goto next; if (filter->dev && !fib_info_nh_uses_dev(fi, filter->dev)) goto next; } if (filter->dump_routes) { if (!s_fa) { struct fib_rt_info fri; fri.fi = fi; fri.tb_id = tb->tb_id; fri.dst = xkey; fri.dst_len = KEYLENGTH - fa->fa_slen; fri.dscp = fa->fa_dscp; fri.type = fa->fa_type; fri.offload = READ_ONCE(fa->offload); fri.trap = READ_ONCE(fa->trap); fri.offload_failed = READ_ONCE(fa->offload_failed); err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, &fri, flags); if (err < 0) goto stop; } i_fa++; } if (filter->dump_exceptions) { err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi, &i_fa, s_fa, flags); if (err < 0) goto stop; } next: i++; } cb->args[4] = i; return skb->len; stop: cb->args[4] = i; cb->args[5] = i_fa; return err; } /* rcu_read_lock needs to be hold by caller from readside */ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb, struct fib_dump_filter *filter) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector 
*l, *tp = t->kv; /* Dump starting at last key. * Note: 0.0.0.0/0 (ie default) is first key. */ int count = cb->args[2]; t_key key = cb->args[3]; /* First time here, count and key are both always 0. Count > 0 * and key == 0 means the dump has wrapped around and we are done. */ if (count && !key) return 0; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { int err; err = fn_trie_dump_leaf(l, tb, skb, cb, filter); if (err < 0) { cb->args[3] = key; cb->args[2] = count; return err; } ++count; key = l->key + 1; memset(&cb->args[4], 0, sizeof(cb->args) - 4*sizeof(cb->args[0])); /* stop loop if key wrapped back to 0 */ if (key < l->key) break; } cb->args[3] = key; cb->args[2] = count; return 0; } void __init fib_trie_init(void) { fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), 0, SLAB_PANIC | SLAB_ACCOUNT, NULL); trie_leaf_kmem = kmem_cache_create("ip_fib_trie", LEAF_SIZE, 0, SLAB_PANIC | SLAB_ACCOUNT, NULL); } struct fib_table *fib_trie_table(u32 id, struct fib_table *alias) { struct fib_table *tb; struct trie *t; size_t sz = sizeof(*tb); if (!alias) sz += sizeof(struct trie); tb = kzalloc(sz, GFP_KERNEL); if (!tb) return NULL; tb->tb_id = id; tb->tb_num_default = 0; tb->tb_data = (alias ? alias->__data : tb->__data); if (alias) return tb; t = (struct trie *) tb->tb_data; t->kv[0].pos = KEYLENGTH; t->kv[0].slen = KEYLENGTH; #ifdef CONFIG_IP_FIB_TRIE_STATS t->stats = alloc_percpu(struct trie_use_stats); if (!t->stats) { kfree(tb); tb = NULL; } #endif return tb; } #ifdef CONFIG_PROC_FS /* Depth first Trie walk iterator */ struct fib_trie_iter { struct seq_net_private p; struct fib_table *tb; struct key_vector *tnode; unsigned int index; unsigned int depth; }; static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter) { unsigned long cindex = iter->index; struct key_vector *pn = iter->tnode; t_key pkey; pr_debug("get_next iter={node=%p index=%d depth=%d}\n", iter->tnode, iter->index, iter->depth); while (!IS_TRIE(pn)) { while (cindex < child_length(pn)) { struct key_vector *n = get_child_rcu(pn, cindex++); if (!n) continue; if (IS_LEAF(n)) { iter->tnode = pn; iter->index = cindex; } else { /* push down one level */ iter->tnode = n; iter->index = 0; ++iter->depth; } return n; } /* Current node exhausted, pop back up */ pkey = pn->key; pn = node_parent_rcu(pn); cindex = get_index(pkey, pn) + 1; --iter->depth; } /* record root node so further searches know we are done */ iter->tnode = pn; iter->index = 0; return NULL; } static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter, struct trie *t) { struct key_vector *n, *pn; if (!t) return NULL; pn = t->kv; n = rcu_dereference(pn->tnode[0]); if (!n) return NULL; if (IS_TNODE(n)) { iter->tnode = n; iter->index = 0; iter->depth = 1; } else { iter->tnode = pn; iter->index = 0; iter->depth = 0; } return n; } static void trie_collect_stats(struct trie *t, struct trie_stat *s) { struct key_vector *n; struct fib_trie_iter iter; memset(s, 0, sizeof(*s)); rcu_read_lock(); for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) { if (IS_LEAF(n)) { struct fib_alias *fa; s->leaves++; s->totdepth += iter.depth; if (iter.depth > s->maxdepth) s->maxdepth = iter.depth; hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) ++s->prefixes; } else { s->tnodes++; if (n->bits < MAX_STAT_DEPTH) s->nodesizes[n->bits]++; s->nullpointers += tn_info(n)->empty_children; } } rcu_read_unlock(); } /* * This outputs /proc/net/fib_triestats */ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) { unsigned 
int i, max, pointers, bytes, avdepth; if (stat->leaves) avdepth = stat->totdepth*100 / stat->leaves; else avdepth = 0; seq_printf(seq, "\tAver depth: %u.%02d\n", avdepth / 100, avdepth % 100); seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth); seq_printf(seq, "\tLeaves: %u\n", stat->leaves); bytes = LEAF_SIZE * stat->leaves; seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes); bytes += sizeof(struct fib_alias) * stat->prefixes; seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes); bytes += TNODE_SIZE(0) * stat->tnodes; max = MAX_STAT_DEPTH; while (max > 0 && stat->nodesizes[max-1] == 0) max--; pointers = 0; for (i = 1; i < max; i++) if (stat->nodesizes[i] != 0) { seq_printf(seq, " %u: %u", i, stat->nodesizes[i]); pointers += (1<<i) * stat->nodesizes[i]; } seq_putc(seq, '\n'); seq_printf(seq, "\tPointers: %u\n", pointers); bytes += sizeof(struct key_vector *) * pointers; seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); } #ifdef CONFIG_IP_FIB_TRIE_STATS static void trie_show_usage(struct seq_file *seq, const struct trie_use_stats __percpu *stats) { struct trie_use_stats s = { 0 }; int cpu; /* loop through all of the CPUs and gather up the stats */ for_each_possible_cpu(cpu) { const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu); s.gets += pcpu->gets; s.backtrack += pcpu->backtrack; s.semantic_match_passed += pcpu->semantic_match_passed; s.semantic_match_miss += pcpu->semantic_match_miss; s.null_node_hit += pcpu->null_node_hit; s.resize_node_skipped += pcpu->resize_node_skipped; } seq_printf(seq, "\nCounters:\n---------\n"); seq_printf(seq, "gets = %u\n", s.gets); seq_printf(seq, "backtracks = %u\n", s.backtrack); seq_printf(seq, "semantic match passed = %u\n", s.semantic_match_passed); seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss); seq_printf(seq, "null node hit= %u\n", s.null_node_hit); seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped); } #endif /* CONFIG_IP_FIB_TRIE_STATS */ static void fib_table_print(struct seq_file *seq, struct fib_table *tb) { if (tb->tb_id == RT_TABLE_LOCAL) seq_puts(seq, "Local:\n"); else if (tb->tb_id == RT_TABLE_MAIN) seq_puts(seq, "Main:\n"); else seq_printf(seq, "Id %d:\n", tb->tb_id); } static int fib_triestat_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; unsigned int h; seq_printf(seq, "Basic info: size of leaf:" " %zd bytes, size of tnode: %zd bytes.\n", LEAF_SIZE, TNODE_SIZE(0)); rcu_read_lock(); for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist) { struct trie *t = (struct trie *) tb->tb_data; struct trie_stat stat; if (!t) continue; fib_table_print(seq, tb); trie_collect_stats(t, &stat); trie_show_stats(seq, &stat); #ifdef CONFIG_IP_FIB_TRIE_STATS trie_show_usage(seq, t->stats); #endif } cond_resched_rcu(); } rcu_read_unlock(); return 0; } static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos) { struct fib_trie_iter *iter = seq->private; struct net *net = seq_file_net(seq); loff_t idx = 0; unsigned int h; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist) { struct key_vector *n; for (n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); n; n = fib_trie_get_next(iter)) if (pos == idx++) { iter->tb = tb; return n; } } } return NULL; } static void 
*fib_trie_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return fib_trie_get_idx(seq, *pos); } static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct fib_trie_iter *iter = seq->private; struct net *net = seq_file_net(seq); struct fib_table *tb = iter->tb; struct hlist_node *tb_node; unsigned int h; struct key_vector *n; ++*pos; /* next node in same table */ n = fib_trie_get_next(iter); if (n) return n; /* walk rest of this hash chain */ h = tb->tb_id & (FIB_TABLE_HASHSZ - 1); while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) { tb = hlist_entry(tb_node, struct fib_table, tb_hlist); n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); if (n) goto found; } /* new hash chain */ while (++h < FIB_TABLE_HASHSZ) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb_hlist) { n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); if (n) goto found; } } return NULL; found: iter->tb = tb; return n; } static void fib_trie_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static void seq_indent(struct seq_file *seq, int n) { while (n-- > 0) seq_puts(seq, " "); } static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s) { switch (s) { case RT_SCOPE_UNIVERSE: return "universe"; case RT_SCOPE_SITE: return "site"; case RT_SCOPE_LINK: return "link"; case RT_SCOPE_HOST: return "host"; case RT_SCOPE_NOWHERE: return "nowhere"; default: snprintf(buf, len, "scope=%d", s); return buf; } } static const char *const rtn_type_names[__RTN_MAX] = { [RTN_UNSPEC] = "UNSPEC", [RTN_UNICAST] = "UNICAST", [RTN_LOCAL] = "LOCAL", [RTN_BROADCAST] = "BROADCAST", [RTN_ANYCAST] = "ANYCAST", [RTN_MULTICAST] = "MULTICAST", [RTN_BLACKHOLE] = "BLACKHOLE", [RTN_UNREACHABLE] = "UNREACHABLE", [RTN_PROHIBIT] = "PROHIBIT", [RTN_THROW] = "THROW", [RTN_NAT] = "NAT", [RTN_XRESOLVE] = "XRESOLVE", }; static inline const char *rtn_type(char *buf, size_t len, unsigned int t) { if (t < __RTN_MAX && rtn_type_names[t]) return rtn_type_names[t]; snprintf(buf, len, "type %u", t); return buf; } /* Pretty print the trie */ static int fib_trie_seq_show(struct seq_file *seq, void *v) { const struct fib_trie_iter *iter = seq->private; struct key_vector *n = v; if (IS_TRIE(node_parent_rcu(n))) fib_table_print(seq, iter->tb); if (IS_TNODE(n)) { __be32 prf = htonl(n->key); seq_indent(seq, iter->depth-1); seq_printf(seq, " +-- %pI4/%zu %u %u %u\n", &prf, KEYLENGTH - n->pos - n->bits, n->bits, tn_info(n)->full_children, tn_info(n)->empty_children); } else { __be32 val = htonl(n->key); struct fib_alias *fa; seq_indent(seq, iter->depth); seq_printf(seq, " |-- %pI4\n", &val); hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) { char buf1[32], buf2[32]; seq_indent(seq, iter->depth + 1); seq_printf(seq, " /%zu %s %s", KEYLENGTH - fa->fa_slen, rtn_scope(buf1, sizeof(buf1), fa->fa_info->fib_scope), rtn_type(buf2, sizeof(buf2), fa->fa_type)); if (fa->fa_dscp) seq_printf(seq, " tos=%d", inet_dscp_to_dsfield(fa->fa_dscp)); seq_putc(seq, '\n'); } } return 0; } static const struct seq_operations fib_trie_seq_ops = { .start = fib_trie_seq_start, .next = fib_trie_seq_next, .stop = fib_trie_seq_stop, .show = fib_trie_seq_show, }; struct fib_route_iter { struct seq_net_private p; struct fib_table *main_tb; struct key_vector *tnode; loff_t pos; t_key key; }; static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos) { struct key_vector *l, **tp = &iter->tnode; t_key key; /* use 
cached location of previously found key */ if (iter->pos > 0 && pos >= iter->pos) { key = iter->key; } else { iter->pos = 1; key = 0; } pos -= iter->pos; while ((l = leaf_walk_rcu(tp, key)) && (pos-- > 0)) { key = l->key + 1; iter->pos++; l = NULL; /* handle unlikely case of a key wrap */ if (!key) break; } if (l) iter->key = l->key; /* remember it */ else iter->pos = 0; /* forget it */ return l; } static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct fib_route_iter *iter = seq->private; struct fib_table *tb; struct trie *t; rcu_read_lock(); tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN); if (!tb) return NULL; iter->main_tb = tb; t = (struct trie *)tb->tb_data; iter->tnode = t->kv; if (*pos != 0) return fib_route_get_idx(iter, *pos); iter->pos = 0; iter->key = KEY_MAX; return SEQ_START_TOKEN; } static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct fib_route_iter *iter = seq->private; struct key_vector *l = NULL; t_key key = iter->key + 1; ++*pos; /* only allow key of 0 for start of sequence */ if ((v == SEQ_START_TOKEN) || key) l = leaf_walk_rcu(&iter->tnode, key); if (l) { iter->key = l->key; iter->pos++; } else { iter->pos = 0; } return l; } static void fib_route_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static unsigned int fib_flag_trans(int type, __be32 mask, struct fib_info *fi) { unsigned int flags = 0; if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT) flags = RTF_REJECT; if (fi) { const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); if (nhc->nhc_gw.ipv4) flags |= RTF_GATEWAY; } if (mask == htonl(0xFFFFFFFF)) flags |= RTF_HOST; flags |= RTF_UP; return flags; } /* * This outputs /proc/net/route. * The format of the file is not supposed to be changed * and needs to be same as fib_hash output to avoid breaking * legacy utilities */ static int fib_route_seq_show(struct seq_file *seq, void *v) { struct fib_route_iter *iter = seq->private; struct fib_table *tb = iter->main_tb; struct fib_alias *fa; struct key_vector *l = v; __be32 prefix; if (v == SEQ_START_TOKEN) { seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway " "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU" "\tWindow\tIRTT"); return 0; } prefix = htonl(l->key); hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { struct fib_info *fi = fa->fa_info; __be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen); unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); if ((fa->fa_type == RTN_BROADCAST) || (fa->fa_type == RTN_MULTICAST)) continue; if (fa->tb_id != tb->tb_id) continue; seq_setwidth(seq, 127); if (fi) { struct fib_nh_common *nhc = fib_info_nhc(fi, 0); __be32 gw = 0; if (nhc->nhc_gw_family == AF_INET) gw = nhc->nhc_gw.ipv4; seq_printf(seq, "%s\t%08X\t%08X\t%04X\t%d\t%u\t" "%u\t%08X\t%d\t%u\t%u", nhc->nhc_dev ? nhc->nhc_dev->name : "*", prefix, gw, flags, 0, 0, fi->fib_priority, mask, (fi->fib_advmss ? 
fi->fib_advmss + 40 : 0), fi->fib_window, fi->fib_rtt >> 3); } else { seq_printf(seq, "*\t%08X\t%08X\t%04X\t%d\t%u\t" "%u\t%08X\t%d\t%u\t%u", prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0); } seq_pad(seq, '\n'); } return 0; } static const struct seq_operations fib_route_seq_ops = { .start = fib_route_seq_start, .next = fib_route_seq_next, .stop = fib_route_seq_stop, .show = fib_route_seq_show, }; int __net_init fib_proc_init(struct net *net) { if (!proc_create_net("fib_trie", 0444, net->proc_net, &fib_trie_seq_ops, sizeof(struct fib_trie_iter))) goto out1; if (!proc_create_net_single("fib_triestat", 0444, net->proc_net, fib_triestat_seq_show, NULL)) goto out2; if (!proc_create_net("route", 0444, net->proc_net, &fib_route_seq_ops, sizeof(struct fib_route_iter))) goto out3; return 0; out3: remove_proc_entry("fib_triestat", net->proc_net); out2: remove_proc_entry("fib_trie", net->proc_net); out1: return -ENOMEM; } void __net_exit fib_proc_exit(struct net *net) { remove_proc_entry("fib_trie", net->proc_net); remove_proc_entry("fib_triestat", net->proc_net); remove_proc_entry("route", net->proc_net); } #endif /* CONFIG_PROC_FS */ |
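/*
 * Illustrative sketch, not part of net/ipv4/fib_trie.c: the three files
 * registered above (fib_trie, fib_triestat, route) all follow the same
 * recipe -- proc_create_net()/proc_create_net_single() in the pernet init
 * hook, remove_proc_entry() in the exit hook, and an RCU read-side section
 * around the table walk in ->show().  The "example_stat" file name and the
 * example_* symbols below are hypothetical; the sketch assumes the same
 * includes as the file above.
 */
static int example_stat_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = seq->private;	/* set up by proc_create_net_single() */
	unsigned int h, tables = 0;

	rcu_read_lock();
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, &net->ipv4.fib_table_hash[h],
					 tb_hlist)
			tables++;
	}
	rcu_read_unlock();

	seq_printf(seq, "FIB tables in this netns: %u\n", tables);
	return 0;
}

static int __net_init example_proc_init(struct net *net)
{
	if (!proc_create_net_single("example_stat", 0444, net->proc_net,
				    example_stat_seq_show, NULL))
		return -ENOMEM;
	return 0;
}

static void __net_exit example_proc_exit(struct net *net)
{
	remove_proc_entry("example_stat", net->proc_net);
}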
// SPDX-License-Identifier: GPL-2.0 /* * USB Serial Converter stuff * * Copyright (C) 1999 - 2012 * Greg Kroah-Hartman (greg@kroah.com) */ #ifndef __LINUX_USB_SERIAL_H #define __LINUX_USB_SERIAL_H #include <linux/kref.h> #include <linux/mutex.h> #include <linux/serial.h> #include <linux/kfifo.h> /* The maximum number of ports one device can grab at once */ #define MAX_NUM_PORTS 16 /* USB serial flags */ #define USB_SERIAL_WRITE_BUSY 0 #define USB_SERIAL_THROTTLED 1 /** * usb_serial_port: structure for the specific ports of a device. * @serial: pointer back to the struct usb_serial owner of this port. * @port: pointer to the corresponding tty_port for this port. * @lock: spinlock to grab when updating portions of this structure. * @minor: the minor number of the port * @port_number: the struct usb_serial port number of this port (starts at 0) * @interrupt_in_buffer: pointer to the interrupt in buffer for this port. * @interrupt_in_urb: pointer to the interrupt in struct urb for this port. * @interrupt_in_endpointAddress: endpoint address for the interrupt in pipe * for this port. * @interrupt_out_buffer: pointer to the interrupt out buffer for this port. * @interrupt_out_size: the size of the interrupt_out_buffer, in bytes. * @interrupt_out_urb: pointer to the interrupt out struct urb for this port. * @interrupt_out_endpointAddress: endpoint address for the interrupt out pipe * for this port. * @bulk_in_buffer: pointer to the bulk in buffer for this port. * @bulk_in_size: the size of the bulk_in_buffer, in bytes. * @read_urb: pointer to the bulk in struct urb for this port. * @bulk_in_endpointAddress: endpoint address for the bulk in pipe for this * port.
* @bulk_in_buffers: pointers to the bulk in buffers for this port * @read_urbs: pointers to the bulk in urbs for this port * @read_urbs_free: status bitmap the for bulk in urbs * @bulk_out_buffer: pointer to the bulk out buffer for this port. * @bulk_out_size: the size of the bulk_out_buffer, in bytes. * @write_urb: pointer to the bulk out struct urb for this port. * @write_fifo: kfifo used to buffer outgoing data * @bulk_out_buffers: pointers to the bulk out buffers for this port * @write_urbs: pointers to the bulk out urbs for this port * @write_urbs_free: status bitmap the for bulk out urbs * @icount: interrupt counters * @tx_bytes: number of bytes currently in host stack queues * @bulk_out_endpointAddress: endpoint address for the bulk out pipe for this * port. * @flags: usb serial port flags * @work: work queue entry for the line discipline waking up. * @dev: pointer to the serial device * * This structure is used by the usb-serial core and drivers for the specific * ports of a device. */ struct usb_serial_port { struct usb_serial *serial; struct tty_port port; spinlock_t lock; u32 minor; u8 port_number; unsigned char *interrupt_in_buffer; struct urb *interrupt_in_urb; __u8 interrupt_in_endpointAddress; unsigned char *interrupt_out_buffer; int interrupt_out_size; struct urb *interrupt_out_urb; __u8 interrupt_out_endpointAddress; unsigned char *bulk_in_buffer; int bulk_in_size; struct urb *read_urb; __u8 bulk_in_endpointAddress; unsigned char *bulk_in_buffers[2]; struct urb *read_urbs[2]; unsigned long read_urbs_free; unsigned char *bulk_out_buffer; int bulk_out_size; struct urb *write_urb; struct kfifo write_fifo; unsigned char *bulk_out_buffers[2]; struct urb *write_urbs[2]; unsigned long write_urbs_free; __u8 bulk_out_endpointAddress; struct async_icount icount; int tx_bytes; unsigned long flags; struct work_struct work; unsigned long sysrq; /* sysrq timeout */ struct device dev; }; #define to_usb_serial_port(d) container_of(d, struct usb_serial_port, dev) /* get and set the port private data pointer helper functions */ static inline void *usb_get_serial_port_data(struct usb_serial_port *port) { return dev_get_drvdata(&port->dev); } static inline void usb_set_serial_port_data(struct usb_serial_port *port, void *data) { dev_set_drvdata(&port->dev, data); } /** * usb_serial - structure used by the usb-serial core for a device * @dev: pointer to the struct usb_device for this device * @type: pointer to the struct usb_serial_driver for this device * @interface: pointer to the struct usb_interface for this device * @sibling: pointer to the struct usb_interface of any sibling interface * @suspend_count: number of suspended (sibling) interfaces * @num_ports: the number of ports this device has * @num_interrupt_in: number of interrupt in endpoints we have * @num_interrupt_out: number of interrupt out endpoints we have * @num_bulk_in: number of bulk in endpoints we have * @num_bulk_out: number of bulk out endpoints we have * @port: array of struct usb_serial_port structures for the different ports. * @private: place to put any driver specific information that is needed. The * usb-serial driver is required to manage this data, the usb-serial core * will not touch this. Use usb_get_serial_data() and * usb_set_serial_data() to access this. 
*/ struct usb_serial { struct usb_device *dev; struct usb_serial_driver *type; struct usb_interface *interface; struct usb_interface *sibling; unsigned int suspend_count; unsigned char disconnected:1; unsigned char attached:1; unsigned char minors_reserved:1; unsigned char num_ports; unsigned char num_port_pointers; unsigned char num_interrupt_in; unsigned char num_interrupt_out; unsigned char num_bulk_in; unsigned char num_bulk_out; struct usb_serial_port *port[MAX_NUM_PORTS]; struct kref kref; struct mutex disc_mutex; void *private; }; #define to_usb_serial(d) container_of(d, struct usb_serial, kref) /* get and set the serial private data pointer helper functions */ static inline void *usb_get_serial_data(struct usb_serial *serial) { return serial->private; } static inline void usb_set_serial_data(struct usb_serial *serial, void *data) { serial->private = data; } struct usb_serial_endpoints { unsigned char num_bulk_in; unsigned char num_bulk_out; unsigned char num_interrupt_in; unsigned char num_interrupt_out; struct usb_endpoint_descriptor *bulk_in[MAX_NUM_PORTS]; struct usb_endpoint_descriptor *bulk_out[MAX_NUM_PORTS]; struct usb_endpoint_descriptor *interrupt_in[MAX_NUM_PORTS]; struct usb_endpoint_descriptor *interrupt_out[MAX_NUM_PORTS]; }; /** * usb_serial_driver - describes a usb serial driver * @description: pointer to a string that describes this driver. This string * used in the syslog messages when a device is inserted or removed. * @id_table: pointer to a list of usb_device_id structures that define all * of the devices this structure can support. * @num_ports: the number of different ports this device will have. * @num_bulk_in: minimum number of bulk-in endpoints * @num_bulk_out: minimum number of bulk-out endpoints * @num_interrupt_in: minimum number of interrupt-in endpoints * @num_interrupt_out: minimum number of interrupt-out endpoints * @bulk_in_size: minimum number of bytes to allocate for bulk-in buffer * (0 = end-point size) * @bulk_out_size: bytes to allocate for bulk-out buffer (0 = end-point size) * @calc_num_ports: pointer to a function to determine how many ports this * device has dynamically. It can also be used to verify the number of * endpoints or to modify the port-endpoint mapping. It will be called * after the probe() callback is called, but before attach(). * @probe: pointer to the driver's probe function. * This will be called when the device is inserted into the system, * but before the device has been fully initialized by the usb_serial * subsystem. Use this function to download any firmware to the device, * or any other early initialization that might be needed. * Return 0 to continue on with the initialization sequence. Anything * else will abort it. * @attach: pointer to the driver's attach function. * This will be called when the struct usb_serial structure is fully * set up. Do any local initialization of the device, or any private * memory structure allocation at this point in time. * @disconnect: pointer to the driver's disconnect function. This will be * called when the device is unplugged or unbound from the driver. * @release: pointer to the driver's release function. This will be called * when the usb_serial data structure is about to be destroyed. * @usb_driver: pointer to the struct usb_driver that controls this * device. This is necessary to allow dynamic ids to be added to * the driver from sysfs. * * This structure is defines a USB Serial driver. It provides all of * the information that the USB serial core code needs. 
If the function * pointers are defined, then the USB serial core code will call them when * the corresponding tty port functions are called. If they are not * called, the generic serial function will be used instead. * * The driver.owner field should be set to the module owner of this driver. * The driver.name field should be set to the name of this driver (remember * it will show up in sysfs, so it needs to be short and to the point. * Using the module name is a good idea.) */ struct usb_serial_driver { const char *description; const struct usb_device_id *id_table; struct list_head driver_list; struct device_driver driver; struct usb_driver *usb_driver; struct usb_dynids dynids; unsigned char num_ports; unsigned char num_bulk_in; unsigned char num_bulk_out; unsigned char num_interrupt_in; unsigned char num_interrupt_out; size_t bulk_in_size; size_t bulk_out_size; int (*probe)(struct usb_serial *serial, const struct usb_device_id *id); int (*attach)(struct usb_serial *serial); int (*calc_num_ports)(struct usb_serial *serial, struct usb_serial_endpoints *epds); void (*disconnect)(struct usb_serial *serial); void (*release)(struct usb_serial *serial); int (*port_probe)(struct usb_serial_port *port); void (*port_remove)(struct usb_serial_port *port); int (*suspend)(struct usb_serial *serial, pm_message_t message); int (*resume)(struct usb_serial *serial); int (*reset_resume)(struct usb_serial *serial); /* serial function calls */ /* Called by console and by the tty layer */ int (*open)(struct tty_struct *tty, struct usb_serial_port *port); void (*close)(struct usb_serial_port *port); int (*write)(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count); /* Called only by the tty layer */ unsigned int (*write_room)(struct tty_struct *tty); int (*ioctl)(struct tty_struct *tty, unsigned int cmd, unsigned long arg); void (*get_serial)(struct tty_struct *tty, struct serial_struct *ss); int (*set_serial)(struct tty_struct *tty, struct serial_struct *ss); void (*set_termios)(struct tty_struct *tty, struct usb_serial_port *port, const struct ktermios *old); int (*break_ctl)(struct tty_struct *tty, int break_state); unsigned int (*chars_in_buffer)(struct tty_struct *tty); void (*wait_until_sent)(struct tty_struct *tty, long timeout); bool (*tx_empty)(struct usb_serial_port *port); void (*throttle)(struct tty_struct *tty); void (*unthrottle)(struct tty_struct *tty); int (*tiocmget)(struct tty_struct *tty); int (*tiocmset)(struct tty_struct *tty, unsigned int set, unsigned int clear); int (*tiocmiwait)(struct tty_struct *tty, unsigned long arg); int (*get_icount)(struct tty_struct *tty, struct serial_icounter_struct *icount); /* Called by the tty layer for port level work. 
There may or may not be an attached tty at this point */ void (*dtr_rts)(struct usb_serial_port *port, int on); int (*carrier_raised)(struct usb_serial_port *port); /* Called by the usb serial hooks to allow the user to rework the termios state */ void (*init_termios)(struct tty_struct *tty); /* USB events */ void (*read_int_callback)(struct urb *urb); void (*write_int_callback)(struct urb *urb); void (*read_bulk_callback)(struct urb *urb); void (*write_bulk_callback)(struct urb *urb); /* Called by the generic read bulk callback */ void (*process_read_urb)(struct urb *urb); /* Called by the generic write implementation */ int (*prepare_write_buffer)(struct usb_serial_port *port, void *dest, size_t size); }; #define to_usb_serial_driver(d) \ container_of(d, struct usb_serial_driver, driver) #define usb_serial_register_drivers(serial_drivers, name, id_table) \ __usb_serial_register_drivers(serial_drivers, THIS_MODULE, name, id_table) int __usb_serial_register_drivers(struct usb_serial_driver *const serial_drivers[], struct module *owner, const char *name, const struct usb_device_id *id_table); void usb_serial_deregister_drivers(struct usb_serial_driver *const serial_drivers[]); void usb_serial_port_softint(struct usb_serial_port *port); int usb_serial_suspend(struct usb_interface *intf, pm_message_t message); int usb_serial_resume(struct usb_interface *intf); /* USB Serial console functions */ #ifdef CONFIG_USB_SERIAL_CONSOLE void usb_serial_console_init(int minor); void usb_serial_console_exit(void); void usb_serial_console_disconnect(struct usb_serial *serial); #else static inline void usb_serial_console_init(int minor) { } static inline void usb_serial_console_exit(void) { } static inline void usb_serial_console_disconnect(struct usb_serial *serial) {} #endif /* Functions needed by other parts of the usbserial core */ struct usb_serial_port *usb_serial_port_get_by_minor(unsigned int minor); void usb_serial_put(struct usb_serial *serial); int usb_serial_claim_interface(struct usb_serial *serial, struct usb_interface *intf); int usb_serial_generic_open(struct tty_struct *tty, struct usb_serial_port *port); int usb_serial_generic_write_start(struct usb_serial_port *port, gfp_t mem_flags); int usb_serial_generic_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count); void usb_serial_generic_close(struct usb_serial_port *port); int usb_serial_generic_resume(struct usb_serial *serial); unsigned int usb_serial_generic_write_room(struct tty_struct *tty); unsigned int usb_serial_generic_chars_in_buffer(struct tty_struct *tty); void usb_serial_generic_wait_until_sent(struct tty_struct *tty, long timeout); void usb_serial_generic_read_bulk_callback(struct urb *urb); void usb_serial_generic_write_bulk_callback(struct urb *urb); void usb_serial_generic_throttle(struct tty_struct *tty); void usb_serial_generic_unthrottle(struct tty_struct *tty); int usb_serial_generic_tiocmiwait(struct tty_struct *tty, unsigned long arg); int usb_serial_generic_get_icount(struct tty_struct *tty, struct serial_icounter_struct *icount); int usb_serial_generic_register(void); void usb_serial_generic_deregister(void); int usb_serial_generic_submit_read_urbs(struct usb_serial_port *port, gfp_t mem_flags); void usb_serial_generic_process_read_urb(struct urb *urb); int usb_serial_generic_prepare_write_buffer(struct usb_serial_port *port, void *dest, size_t size); #if defined(CONFIG_USB_SERIAL_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ) int usb_serial_handle_sysrq_char(struct 
usb_serial_port *port, unsigned int ch); int usb_serial_handle_break(struct usb_serial_port *port); #else static inline int usb_serial_handle_sysrq_char(struct usb_serial_port *port, unsigned int ch) { return 0; } static inline int usb_serial_handle_break(struct usb_serial_port *port) { return 0; } #endif void usb_serial_handle_dcd_change(struct usb_serial_port *usb_port, struct tty_struct *tty, unsigned int status); int usb_serial_bus_register(struct usb_serial_driver *device); void usb_serial_bus_deregister(struct usb_serial_driver *device); extern const struct bus_type usb_serial_bus_type; extern struct tty_driver *usb_serial_tty_driver; static inline void usb_serial_debug_data(struct device *dev, const char *function, int size, const unsigned char *data) { dev_dbg(dev, "%s - length = %d, data = %*ph\n", function, size, size, data); } /* * Macro for reporting errors in write path to avoid infinite loop * when port is used as a console. */ #define dev_err_console(usport, fmt, ...) \ do { \ static bool __print_once; \ struct usb_serial_port *__port = (usport); \ \ if (!__port->port.console || !__print_once) { \ __print_once = true; \ dev_err(&__port->dev, fmt, ##__VA_ARGS__); \ } \ } while (0) /* * module_usb_serial_driver() - Helper macro for registering a USB Serial driver * @__serial_drivers: list of usb_serial drivers to register * @__ids: all device ids that @__serial_drivers bind to * * Helper macro for USB serial drivers which do not do anything special * in module init/exit. This eliminates a lot of boilerplate. Each * module may only use this macro once, and calling it replaces * module_init() and module_exit() * */ #define usb_serial_module_driver(__name, __serial_drivers, __ids) \ static int __init usb_serial_module_init(void) \ { \ return usb_serial_register_drivers(__serial_drivers, \ __name, __ids); \ } \ module_init(usb_serial_module_init); \ static void __exit usb_serial_module_exit(void) \ { \ usb_serial_deregister_drivers(__serial_drivers); \ } \ module_exit(usb_serial_module_exit); #define module_usb_serial_driver(__serial_drivers, __ids) \ usb_serial_module_driver(KBUILD_MODNAME, __serial_drivers, __ids) #endif /* __LINUX_USB_SERIAL_H */ |
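/*
 * Illustrative sketch, not part of the original header: a minimal USB
 * serial driver built only from the hooks declared above.  Any operation
 * left unset in struct usb_serial_driver falls back to the generic
 * usb_serial_generic_* implementation.  The vendor/product IDs and all
 * example_* names are placeholders, not a real device.
 */
#include <linux/module.h>
#include <linux/usb.h>
#include <linux/usb/serial.h>

static const struct usb_device_id example_id_table[] = {
	{ USB_DEVICE(0x1234, 0x5678) },	/* placeholder VID/PID */
	{ }				/* terminating entry */
};
MODULE_DEVICE_TABLE(usb, example_id_table);

static struct usb_serial_driver example_device = {
	.driver		= {
		.name	= "example_serial",
	},
	.id_table	= example_id_table,
	.num_ports	= 1,
};

static struct usb_serial_driver * const example_drivers[] = {
	&example_device, NULL
};

/* Replaces module_init()/module_exit(), as documented above. */
module_usb_serial_driver(example_drivers, example_id_table);

MODULE_DESCRIPTION("Example USB serial driver sketch");
MODULE_LICENSE("GPL");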
// SPDX-License-Identifier: GPL-2.0-or-later /* * lib/plist.c * * Descending-priority-sorted double-linked list * * (C) 2002-2003 Intel Corp * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>. * * 2001-2005 (c) MontaVista Software, Inc. * Daniel Walker <dwalker@mvista.com> * * (C) 2005 Linutronix GmbH, Thomas Gleixner <tglx@kernel.org> * * Simplifications of the original code by * Oleg Nesterov <oleg@tv-sign.ru> * * Based on simple lists (include/linux/list.h). * * This file contains the add / del functions which are considered to * be too large to inline. See include/linux/plist.h for further * information.
*/ #include <linux/bug.h> #include <linux/plist.h> #ifdef CONFIG_DEBUG_PLIST static struct plist_head test_head; static void plist_check_prev_next(struct list_head *t, struct list_head *p, struct list_head *n) { WARN(n->prev != p || p->next != n, "top: %p, n: %p, p: %p\n" "prev: %p, n: %p, p: %p\n" "next: %p, n: %p, p: %p\n", t, t->next, t->prev, p, p->next, p->prev, n, n->next, n->prev); } static void plist_check_list(struct list_head *top) { struct list_head *prev = top, *next = top->next; plist_check_prev_next(top, prev, next); while (next != top) { prev = next; next = prev->next; plist_check_prev_next(top, prev, next); } } static void plist_check_head(struct plist_head *head) { if (!plist_head_empty(head)) plist_check_list(&plist_first(head)->prio_list); plist_check_list(&head->node_list); } #else # define plist_check_head(h) do { } while (0) #endif /** * plist_add - add @node to @head * * @node: &struct plist_node pointer * @head: &struct plist_head pointer */ void plist_add(struct plist_node *node, struct plist_head *head) { struct plist_node *first, *iter, *prev = NULL, *last, *reverse_iter; struct list_head *node_next = &head->node_list; plist_check_head(head); WARN_ON(!plist_node_empty(node)); WARN_ON(!list_empty(&node->prio_list)); if (plist_head_empty(head)) goto ins_node; first = iter = plist_first(head); last = reverse_iter = list_entry(first->prio_list.prev, struct plist_node, prio_list); do { if (node->prio < iter->prio) { node_next = &iter->node_list; break; } else if (node->prio >= reverse_iter->prio) { prev = reverse_iter; iter = list_entry(reverse_iter->prio_list.next, struct plist_node, prio_list); if (likely(reverse_iter != last)) node_next = &iter->node_list; break; } prev = iter; iter = list_entry(iter->prio_list.next, struct plist_node, prio_list); reverse_iter = list_entry(reverse_iter->prio_list.prev, struct plist_node, prio_list); } while (iter != first); if (!prev || prev->prio != node->prio) list_add_tail(&node->prio_list, &iter->prio_list); ins_node: list_add_tail(&node->node_list, node_next); plist_check_head(head); } /** * plist_del - Remove a @node from plist. * * @node: &struct plist_node pointer - entry to be removed * @head: &struct plist_head pointer - list head */ void plist_del(struct plist_node *node, struct plist_head *head) { plist_check_head(head); if (!list_empty(&node->prio_list)) { if (node->node_list.next != &head->node_list) { struct plist_node *next; next = list_entry(node->node_list.next, struct plist_node, node_list); /* add the next plist_node into prio_list */ if (list_empty(&next->prio_list)) list_add(&next->prio_list, &node->prio_list); } list_del_init(&node->prio_list); } list_del_init(&node->node_list); plist_check_head(head); } /** * plist_requeue - Requeue @node at end of same-prio entries. * * This is essentially an optimized plist_del() followed by * plist_add(). It moves an entry already in the plist to * after any other same-priority entries. * * @node: &struct plist_node pointer - entry to be moved * @head: &struct plist_head pointer - list head */ void plist_requeue(struct plist_node *node, struct plist_head *head) { struct plist_node *iter; struct list_head *node_next = &head->node_list; plist_check_head(head); BUG_ON(plist_head_empty(head)); BUG_ON(plist_node_empty(node)); if (node == plist_last(head)) return; iter = plist_next(node); if (node->prio != iter->prio) return; plist_del(node, head); /* * After plist_del(), iter is the replacement of the node. 
If the node * was on prio_list, take shortcut to find node_next instead of looping. */ if (!list_empty(&iter->prio_list)) { iter = list_entry(iter->prio_list.next, struct plist_node, prio_list); node_next = &iter->node_list; goto queue; } plist_for_each_continue(iter, head) { if (node->prio != iter->prio) { node_next = &iter->node_list; break; } } queue: list_add_tail(&node->node_list, node_next); plist_check_head(head); } #ifdef CONFIG_DEBUG_PLIST #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/module.h> #include <linux/init.h> static struct plist_node __initdata test_node[241]; static void __init plist_test_check(int nr_expect) { struct plist_node *first, *prio_pos, *node_pos; if (plist_head_empty(&test_head)) { BUG_ON(nr_expect != 0); return; } prio_pos = first = plist_first(&test_head); plist_for_each(node_pos, &test_head) { if (nr_expect-- < 0) break; if (node_pos == first) continue; if (node_pos->prio == prio_pos->prio) { BUG_ON(!list_empty(&node_pos->prio_list)); continue; } BUG_ON(prio_pos->prio > node_pos->prio); BUG_ON(prio_pos->prio_list.next != &node_pos->prio_list); prio_pos = node_pos; } BUG_ON(nr_expect != 0); BUG_ON(prio_pos->prio_list.next != &first->prio_list); } static void __init plist_test_requeue(struct plist_node *node) { plist_requeue(node, &test_head); if (node != plist_last(&test_head)) BUG_ON(node->prio == plist_next(node)->prio); } static int __init plist_test(void) { int nr_expect = 0, i, loop; unsigned int r = local_clock(); printk(KERN_DEBUG "start plist test\n"); plist_head_init(&test_head); for (i = 0; i < ARRAY_SIZE(test_node); i++) plist_node_init(test_node + i, 0); for (loop = 0; loop < 1000; loop++) { r = r * 193939 % 47629; i = r % ARRAY_SIZE(test_node); if (plist_node_empty(test_node + i)) { r = r * 193939 % 47629; test_node[i].prio = r % 99; plist_add(test_node + i, &test_head); nr_expect++; } else { plist_del(test_node + i, &test_head); nr_expect--; } plist_test_check(nr_expect); if (!plist_node_empty(test_node + i)) { plist_test_requeue(test_node + i); plist_test_check(nr_expect); } } for (i = 0; i < ARRAY_SIZE(test_node); i++) { if (plist_node_empty(test_node + i)) continue; plist_del(test_node + i, &test_head); nr_expect--; plist_test_check(nr_expect); } printk(KERN_DEBUG "end plist test\n"); /* Worst case test for plist_add() */ unsigned int test_data[241]; for (i = 0; i < ARRAY_SIZE(test_data); i++) test_data[i] = i; ktime_t start, end, time_elapsed = 0; plist_head_init(&test_head); for (i = 0; i < ARRAY_SIZE(test_node); i++) { plist_node_init(test_node + i, 0); test_node[i].prio = test_data[i]; } for (i = 0; i < ARRAY_SIZE(test_node); i++) { if (plist_node_empty(test_node + i)) { start = ktime_get(); plist_add(test_node + i, &test_head); end = ktime_get(); time_elapsed += (end - start); } } pr_debug("plist_add worst case test time elapsed %lld\n", time_elapsed); return 0; } module_init(plist_test); #endif |
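/*
 * Illustrative sketch, not part of lib/plist.c: how a client of the plist
 * API exercised by the self-test above keeps items sorted by priority.
 * struct pending_work, example_queue and the helper names are made up for
 * the example; entries with the smallest ->prio value sort to the front,
 * which is what plist_first() returns.
 */
#include <linux/container_of.h>
#include <linux/plist.h>

struct pending_work {
	struct plist_node node;
	const char *name;
};

static PLIST_HEAD(example_queue);

static void example_queue_add(struct pending_work *w, int prio)
{
	plist_node_init(&w->node, prio);
	plist_add(&w->node, &example_queue);	/* insertion keeps prio order */
}

static struct pending_work *example_queue_pop(void)
{
	struct pending_work *w;

	if (plist_head_empty(&example_queue))
		return NULL;

	/* first entry == smallest ->prio value, per plist_add() above */
	w = container_of(plist_first(&example_queue), struct pending_work, node);
	plist_del(&w->node, &example_queue);
	return w;
}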
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_SIGNAL_H #define _LINUX_SCHED_SIGNAL_H #include <linux/rculist.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/sched/jobctl.h> #include <linux/sched/task.h> #include <linux/cred.h> #include <linux/refcount.h> #include <linux/pid.h> #include <linux/posix-timers.h> #include <linux/mm_types.h> #include <asm/ptrace.h> /* * Types defining task->signal and task->sighand and APIs using them: */ struct sighand_struct { spinlock_t siglock; refcount_t
count; wait_queue_head_t signalfd_wqh; struct k_sigaction action[_NSIG]; }; /* * Per-process accounting stats: */ struct pacct_struct { int ac_flag; long ac_exitcode; unsigned long ac_mem; u64 ac_utime, ac_stime; unsigned long ac_minflt, ac_majflt; }; struct cpu_itimer { u64 expires; u64 incr; }; /* * This is the atomic variant of task_cputime, which can be used for * storing and updating task_cputime statistics without locking. */ struct task_cputime_atomic { atomic64_t utime; atomic64_t stime; atomic64_t sum_exec_runtime; }; #define INIT_CPUTIME_ATOMIC \ (struct task_cputime_atomic) { \ .utime = ATOMIC64_INIT(0), \ .stime = ATOMIC64_INIT(0), \ .sum_exec_runtime = ATOMIC64_INIT(0), \ } /** * struct thread_group_cputimer - thread group interval timer counts * @cputime_atomic: atomic thread group interval timers. * * This structure contains the version of task_cputime, above, that is * used for thread group CPU timer calculations. */ struct thread_group_cputimer { struct task_cputime_atomic cputime_atomic; }; struct multiprocess_signals { sigset_t signal; struct hlist_node node; }; struct core_thread { struct task_struct *task; struct core_thread *next; }; struct core_state { atomic_t nr_threads; struct core_thread dumper; struct completion startup; }; /* * NOTE! "signal_struct" does not have its own * locking, because a shared signal_struct always * implies a shared sighand_struct, so locking * sighand_struct is always a proper superset of * the locking of signal_struct. */ struct signal_struct { refcount_t sigcnt; atomic_t live; int nr_threads; int quick_threads; struct list_head thread_head; wait_queue_head_t wait_chldexit; /* for wait4() */ /* current thread group signal load-balancing target: */ struct task_struct *curr_target; /* shared signal handling: */ struct sigpending shared_pending; /* For collecting multiprocess signals during fork */ struct hlist_head multiprocess; /* thread group exit support */ int group_exit_code; /* notify group_exec_task when notify_count is less or equal to 0 */ int notify_count; struct task_struct *group_exec_task; /* thread group stop support, overloads group_exit_code too */ int group_stop_count; unsigned int flags; /* see SIGNAL_* flags below */ struct core_state *core_state; /* coredumping support */ /* * PR_SET_CHILD_SUBREAPER marks a process, like a service * manager, to re-parent orphan (double-forking) child processes * to this process instead of 'init'. The service manager is * able to receive SIGCHLD signals and is able to investigate * the process until it calls wait(). All children of this * process will inherit a flag if they should look for a * child_subreaper process at exit. */ unsigned int is_child_subreaper:1; unsigned int has_child_subreaper:1; unsigned int autoreap:1; #ifdef CONFIG_POSIX_TIMERS /* POSIX.1b Interval Timers */ unsigned int timer_create_restore_ids:1; atomic_t next_posix_timer_id; struct hlist_head posix_timers; struct hlist_head ignored_posix_timers; /* ITIMER_REAL timer for the process */ struct hrtimer real_timer; ktime_t it_real_incr; /* * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these * values are defined to 0 and 1 respectively */ struct cpu_itimer it[2]; /* * Thread group totals for process CPU timers. * See thread_group_cputimer(), et al, for details. */ struct thread_group_cputimer cputimer; #endif /* Empty if CONFIG_POSIX_TIMERS=n */ struct posix_cputimers posix_cputimers; /* PID/PID hash table linkage. 
*/ struct pid *pids[PIDTYPE_MAX]; #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; #endif struct pid *tty_old_pgrp; /* boolean value for session group leader */ int leader; struct tty_struct *tty; /* NULL if no tty */ #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup *autogroup; #endif /* * Cumulative resource counters for dead threads in the group, * and for reaped dead child processes forked by this group. * Live threads maintain their own counters and add to these * in __exit_signal, except for the group leader. */ seqlock_t stats_lock; u64 utime, stime, cutime, cstime; u64 gtime; u64 cgtime; struct prev_cputime prev_cputime; unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; unsigned long inblock, oublock, cinblock, coublock; unsigned long maxrss, cmaxrss; struct task_io_accounting ioac; /* * Cumulative ns of schedule CPU time fo dead threads in the * group, not including a zombie group leader, (This only differs * from jiffies_to_ns(utime + stime) if sched_clock uses something * other than jiffies.) */ unsigned long long sum_sched_runtime; /* * We don't bother to synchronize most readers of this at all, * because there is no reader checking a limit that actually needs * to get both rlim_cur and rlim_max atomically, and either one * alone is a single word that can safely be read normally. * getrlimit/setrlimit use task_lock(current->group_leader) to * protect this instead of the siglock, because they really * have no need to disable irqs. */ struct rlimit rlim[RLIM_NLIMITS]; #ifdef CONFIG_BSD_PROCESS_ACCT struct pacct_struct pacct; /* per-process accounting information */ #endif #ifdef CONFIG_TASKSTATS struct taskstats *stats; #endif #ifdef CONFIG_AUDIT unsigned audit_tty; struct tty_audit_buf *tty_audit_buf; #endif #ifdef CONFIG_CGROUPS struct rw_semaphore cgroup_threadgroup_rwsem; #endif /* * Thread is the potential origin of an oom condition; kill first on * oom */ bool oom_flag_origin; short oom_score_adj; /* OOM kill score adjustment */ short oom_score_adj_min; /* OOM kill score adjustment min value. * Only settable by CAP_SYS_RESOURCE. */ struct mm_struct *oom_mm; /* recorded mm when the thread group got * killed by the oom killer */ struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations * (notably. ptrace) * Deprecated do not use in new code. * Use exec_update_lock instead. */ struct rw_semaphore exec_update_lock; /* Held while task_struct is * being updated during exec, * and may have inconsistent * permissions. */ } __randomize_layout; /* * Bits in flags field of signal_struct. */ #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ #define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */ #define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */ /* * Pending notifications to parent. 
*/ #define SIGNAL_CLD_STOPPED 0x00000010 #define SIGNAL_CLD_CONTINUED 0x00000020 #define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED) #define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */ #define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \ SIGNAL_STOP_CONTINUED) static inline void signal_set_stop_flags(struct signal_struct *sig, unsigned int flags) { WARN_ON(sig->flags & SIGNAL_GROUP_EXIT); sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags; } extern void flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); extern void flush_signal_handlers(struct task_struct *, int force_default); extern int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type); static inline int kernel_dequeue_signal(void) { struct task_struct *task = current; kernel_siginfo_t __info; enum pid_type __type; int ret; spin_lock_irq(&task->sighand->siglock); ret = dequeue_signal(&task->blocked, &__info, &__type); spin_unlock_irq(&task->sighand->siglock); return ret; } static inline void kernel_signal_stop(void) { spin_lock_irq(¤t->sighand->siglock); if (current->jobctl & JOBCTL_STOP_DEQUEUED) { current->jobctl |= JOBCTL_STOPPED; set_special_state(TASK_STOPPED); } spin_unlock_irq(¤t->sighand->siglock); schedule(); } int force_sig_fault_to_task(int sig, int code, void __user *addr, struct task_struct *t); int force_sig_fault(int sig, int code, void __user *addr); int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t); int force_sig_mceerr(int code, void __user *, short); int send_sig_mceerr(int code, void __user *, short, struct task_struct *); int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper); int force_sig_pkuerr(void __user *addr, u32 pkey); int send_sig_perf(void __user *addr, u32 type, u64 sig_data); int force_sig_ptrace_errno_trap(int errno, void __user *addr); int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno); int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno, struct task_struct *t); int force_sig_seccomp(int syscall, int reason, bool force_coredump); extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *); extern void force_sigsegv(int sig); extern int force_sig_info(struct kernel_siginfo *); extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp); extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid); extern int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, struct pid *, const struct cred *); extern int kill_pgrp(struct pid *pid, int sig, int priv); extern int kill_pid(struct pid *pid, int sig, int priv); extern __must_check bool do_notify_parent(struct task_struct *, int); extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); extern void force_sig(int); extern void force_fatal_sig(int); extern void force_exit_sig(int); extern int send_sig(int, struct task_struct *, int); extern int zap_other_threads(struct task_struct *p); extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); static inline void clear_notify_signal(void) { clear_thread_flag(TIF_NOTIFY_SIGNAL); smp_mb__after_atomic(); } /* * Returns 'true' if kick_process() is needed to force a transition from * user -> kernel to guarantee expedient run of TWA_SIGNAL based task_work. 
*/ static inline bool __set_notify_signal(struct task_struct *task) { return !test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && !wake_up_state(task, TASK_INTERRUPTIBLE); } /* * Called to break out of interruptible wait loops, and enter the * exit_to_user_mode_loop(). */ static inline void set_notify_signal(struct task_struct *task) { if (__set_notify_signal(task)) kick_process(task); } static inline int restart_syscall(void) { set_tsk_thread_flag(current, TIF_SIGPENDING); return -ERESTARTNOINTR; } static inline int task_sigpending(struct task_struct *p) { return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); } static inline int signal_pending(struct task_struct *p) { /* * TIF_NOTIFY_SIGNAL isn't really a signal, but it requires the same * behavior in terms of ensuring that we break out of wait loops * so that notify signal callbacks can be processed. */ if (unlikely(test_tsk_thread_flag(p, TIF_NOTIFY_SIGNAL))) return 1; return task_sigpending(p); } static inline int __fatal_signal_pending(struct task_struct *p) { return unlikely(sigismember(&p->pending.signal, SIGKILL)); } static inline int fatal_signal_pending(struct task_struct *p) { return task_sigpending(p) && __fatal_signal_pending(p); } static inline int signal_pending_state(unsigned int state, struct task_struct *p) { if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) return 0; if (!signal_pending(p)) return 0; return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); } /* * This should only be used in fault handlers to decide whether we * should stop the current fault routine to handle the signals * instead, especially with the case where we've got interrupted with * a VM_FAULT_RETRY. */ static inline bool fault_signal_pending(vm_fault_t fault_flags, struct pt_regs *regs) { return unlikely((fault_flags & VM_FAULT_RETRY) && (fatal_signal_pending(current) || (user_mode(regs) && signal_pending(current)))); } /* * Reevaluate whether the task has signals pending delivery. * Wake the task if so. * This is required every time the blocked sigset_t changes. * callers must hold sighand->siglock. */ extern void recalc_sigpending(void); extern void calculate_sigpending(void); extern void signal_wake_up_state(struct task_struct *t, unsigned int state); static inline void signal_wake_up(struct task_struct *t, bool fatal) { unsigned int state = 0; if (fatal && !(t->jobctl & JOBCTL_PTRACE_FROZEN)) { t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED); state = TASK_WAKEKILL | __TASK_TRACED; } signal_wake_up_state(t, state); } static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) { unsigned int state = 0; if (resume) { t->jobctl &= ~JOBCTL_TRACED; state = __TASK_TRACED; } signal_wake_up_state(t, state); } void task_join_group_stop(struct task_struct *task); #ifdef TIF_RESTORE_SIGMASK /* * Legacy restore_sigmask accessors. These are inefficient on * SMP architectures because they require atomic operations. */ /** * set_restore_sigmask() - make sure saved_sigmask processing gets done * * This sets TIF_RESTORE_SIGMASK and ensures that the arch signal code * will run before returning to user mode, to process the flag. For * all callers, TIF_SIGPENDING is already set or it's no harm to set * it. TIF_RESTORE_SIGMASK need not be in the set of bits that the * arch code will notice on return to user mode, in case those bits * are scarce. We set TIF_SIGPENDING here to ensure that the arch * signal code always gets run when TIF_RESTORE_SIGMASK is set. 
*/ static inline void set_restore_sigmask(void) { set_thread_flag(TIF_RESTORE_SIGMASK); } static inline void clear_tsk_restore_sigmask(struct task_struct *task) { clear_tsk_thread_flag(task, TIF_RESTORE_SIGMASK); } static inline void clear_restore_sigmask(void) { clear_thread_flag(TIF_RESTORE_SIGMASK); } static inline bool test_tsk_restore_sigmask(struct task_struct *task) { return test_tsk_thread_flag(task, TIF_RESTORE_SIGMASK); } static inline bool test_restore_sigmask(void) { return test_thread_flag(TIF_RESTORE_SIGMASK); } static inline bool test_and_clear_restore_sigmask(void) { return test_and_clear_thread_flag(TIF_RESTORE_SIGMASK); } #else /* TIF_RESTORE_SIGMASK */ /* Higher-quality implementation, used if TIF_RESTORE_SIGMASK doesn't exist. */ static inline void set_restore_sigmask(void) { current->restore_sigmask = true; } static inline void clear_tsk_restore_sigmask(struct task_struct *task) { task->restore_sigmask = false; } static inline void clear_restore_sigmask(void) { current->restore_sigmask = false; } static inline bool test_restore_sigmask(void) { return current->restore_sigmask; } static inline bool test_tsk_restore_sigmask(struct task_struct *task) { return task->restore_sigmask; } static inline bool test_and_clear_restore_sigmask(void) { if (!current->restore_sigmask) return false; current->restore_sigmask = false; return true; } #endif static inline void restore_saved_sigmask(void) { if (test_and_clear_restore_sigmask()) __set_current_blocked(¤t->saved_sigmask); } extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize); static inline void restore_saved_sigmask_unless(bool interrupted) { if (interrupted) WARN_ON(!signal_pending(current)); else restore_saved_sigmask(); } static inline sigset_t *sigmask_to_save(void) { sigset_t *res = ¤t->blocked; if (unlikely(test_restore_sigmask())) res = ¤t->saved_sigmask; return res; } static inline int kill_cad_pid(int sig, int priv) { return kill_pid(cad_pid, sig, priv); } /* These can be the second arg to send_sig_info/send_group_sig_info. */ #define SEND_SIG_NOINFO ((struct kernel_siginfo *) 0) #define SEND_SIG_PRIV ((struct kernel_siginfo *) 1) static inline int __on_sig_stack(unsigned long sp) { #ifdef CONFIG_STACK_GROWSUP return sp >= current->sas_ss_sp && sp - current->sas_ss_sp < current->sas_ss_size; #else return sp > current->sas_ss_sp && sp - current->sas_ss_sp <= current->sas_ss_size; #endif } /* * True if we are on the alternate signal stack. */ static inline int on_sig_stack(unsigned long sp) { /* * If the signal stack is SS_AUTODISARM then, by construction, we * can't be on the signal stack unless user code deliberately set * SS_AUTODISARM when we were already on it. * * This improves reliability: if user state gets corrupted such that * the stack pointer points very close to the end of the signal stack, * then this check will enable the signal to be handled anyway. */ if (current->sas_ss_flags & SS_AUTODISARM) return 0; return __on_sig_stack(sp); } static inline int sas_ss_flags(unsigned long sp) { if (!current->sas_ss_size) return SS_DISABLE; return on_sig_stack(sp) ? SS_ONSTACK : 0; } static inline void sas_ss_reset(struct task_struct *p) { p->sas_ss_sp = 0; p->sas_ss_size = 0; p->sas_ss_flags = SS_DISABLE; } static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig) { if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! 
sas_ss_flags(sp)) #ifdef CONFIG_STACK_GROWSUP return current->sas_ss_sp; #else return current->sas_ss_sp + current->sas_ss_size; #endif return sp; } extern void __cleanup_sighand(struct sighand_struct *); extern void flush_itimer_signals(void); #define tasklist_empty() \ list_empty(&init_task.tasks) #define next_task(p) \ list_entry_rcu((p)->tasks.next, struct task_struct, tasks) #define for_each_process(p) \ for (p = &init_task ; (p = next_task(p)) != &init_task ; ) extern bool current_is_single_threaded(void); /* * Without tasklist/siglock it is only rcu-safe if g can't exit/exec, * otherwise next_thread(t) will never reach g after list_del_rcu(g). */ #define while_each_thread(g, t) \ while ((t = next_thread(t)) != g) #define for_other_threads(p, t) \ for (t = p; (t = next_thread(t)) != p; ) #define __for_each_thread(signal, t) \ list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \ lockdep_is_held(&tasklist_lock)) #define for_each_thread(p, t) \ __for_each_thread((p)->signal, t) /* Careful: this is a double loop, 'break' won't work as expected. */ #define for_each_process_thread(p, t) \ for_each_process(p) for_each_thread(p, t) typedef int (*proc_visitor)(struct task_struct *p, void *data); void walk_process_tree(struct task_struct *top, proc_visitor, void *); static inline struct pid *task_pid_type(struct task_struct *task, enum pid_type type) { struct pid *pid; if (type == PIDTYPE_PID) pid = task_pid(task); else pid = task->signal->pids[type]; return pid; } static inline struct pid *task_tgid(struct task_struct *task) { return task->signal->pids[PIDTYPE_TGID]; } /* * Without tasklist or RCU lock it is not safe to dereference * the result of task_pgrp/task_session even if task == current, * we can race with another thread doing sys_setsid/sys_setpgid. 
*/ static inline struct pid *task_pgrp(struct task_struct *task) { return task->signal->pids[PIDTYPE_PGID]; } static inline struct pid *task_session(struct task_struct *task) { return task->signal->pids[PIDTYPE_SID]; } static inline int get_nr_threads(struct task_struct *task) { return task->signal->nr_threads; } static inline bool thread_group_leader(struct task_struct *p) { return p->exit_signal >= 0; } static inline bool same_thread_group(struct task_struct *p1, struct task_struct *p2) { return p1->signal == p2->signal; } /* * returns NULL if p is the last thread in the thread group */ static inline struct task_struct *__next_thread(struct task_struct *p) { return list_next_or_null_rcu(&p->signal->thread_head, &p->thread_node, struct task_struct, thread_node); } static inline struct task_struct *next_thread(struct task_struct *p) { return __next_thread(p) ?: p->group_leader; } static inline int thread_group_empty(struct task_struct *p) { return thread_group_leader(p) && list_is_last(&p->thread_node, &p->signal->thread_head); } #define delay_group_leader(p) \ (thread_group_leader(p) && !thread_group_empty(p)) extern struct sighand_struct *lock_task_sighand(struct task_struct *task, unsigned long *flags) __cond_acquires(nonnull, &task->sighand->siglock); static inline void unlock_task_sighand(struct task_struct *task, unsigned long *flags) __releases(&task->sighand->siglock) { spin_unlock_irqrestore(&task->sighand->siglock, *flags); } #ifdef CONFIG_LOCKDEP extern void lockdep_assert_task_sighand_held(struct task_struct *task); #else static inline void lockdep_assert_task_sighand_held(struct task_struct *task) { } #endif static inline unsigned long task_rlimit(const struct task_struct *task, unsigned int limit) { return READ_ONCE(task->signal->rlim[limit].rlim_cur); } static inline unsigned long task_rlimit_max(const struct task_struct *task, unsigned int limit) { return READ_ONCE(task->signal->rlim[limit].rlim_max); } static inline unsigned long rlimit(unsigned int limit) { return task_rlimit(current, limit); } static inline unsigned long rlimit_max(unsigned int limit) { return task_rlimit_max(current, limit); } #endif /* _LINUX_SCHED_SIGNAL_H */ |
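/*
 * Illustrative sketch, not part of the original header: the usual way
 * kernel code consumes signal_pending()/fatal_signal_pending() above is
 * inside an interruptible wait or poll loop, returning -ERESTARTSYS so the
 * task drops back toward user space, the signal is delivered, and the
 * syscall is restarted if the handler allows it.  example_poll_until() and
 * its callback are hypothetical; real code would normally prefer the
 * wait_event_interruptible() family over an open-coded poll like this.
 */
#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/sched/signal.h>

static int example_poll_until(bool (*done)(void *data), void *data)
{
	while (!done(data)) {
		/* SIGKILL can never be handled, give up immediately */
		if (fatal_signal_pending(current))
			return -EINTR;
		/* any other pending signal: let it be delivered first */
		if (signal_pending(current))
			return -ERESTARTSYS;
		/* sleep ~10ms in TASK_INTERRUPTIBLE; a signal wakes us early */
		schedule_timeout_interruptible(msecs_to_jiffies(10));
	}
	return 0;
}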
// SPDX-License-Identifier: GPL-2.0 #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/if_vlan.h> #include <linux/netpoll.h> #include <linux/export.h> #include <net/gro.h> #include "vlan.h" bool vlan_do_receive(struct sk_buff **skbp) { struct sk_buff *skb = *skbp; __be16 vlan_proto = skb->vlan_proto; u16 vlan_id = skb_vlan_tag_get_id(skb); struct net_device *vlan_dev; struct vlan_pcpu_stats *rx_stats; vlan_dev = vlan_find_dev(skb->dev, vlan_proto, vlan_id); if (!vlan_dev) return false; skb = *skbp = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) return false; if (unlikely(!(vlan_dev->flags & IFF_UP))) { kfree_skb(skb); *skbp = NULL; return false; } skb->dev = vlan_dev; if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) { /* Our lower layer thinks this is not local, let's make sure. * This allows the VLAN to have a different MAC than the * underlying device, and still route correctly. */ if (ether_addr_equal_64bits(eth_hdr(skb)->h_dest, vlan_dev->dev_addr)) skb->pkt_type = PACKET_HOST; } if (!(vlan_dev_priv(vlan_dev)->flags & VLAN_FLAG_REORDER_HDR) && !netif_is_macvlan_port(vlan_dev) && !netif_is_bridge_port(vlan_dev)) { unsigned int offset = skb->data - skb_mac_header(skb); /* * vlan_insert_tag expects skb->data pointing to mac header.
* So change skb->data before calling it and change back to * original position later */ skb_push(skb, offset); skb = *skbp = vlan_insert_inner_tag(skb, skb->vlan_proto, skb->vlan_tci, skb->mac_len); if (!skb) return false; skb_pull(skb, offset + VLAN_HLEN); skb_reset_mac_len(skb); } skb->priority = vlan_get_ingress_priority(vlan_dev, skb->vlan_tci); __vlan_hwaccel_clear_tag(skb); rx_stats = this_cpu_ptr(vlan_dev_priv(vlan_dev)->vlan_pcpu_stats); u64_stats_update_begin(&rx_stats->syncp); u64_stats_inc(&rx_stats->rx_packets); u64_stats_add(&rx_stats->rx_bytes, skb->len); if (skb->pkt_type == PACKET_MULTICAST) u64_stats_inc(&rx_stats->rx_multicast); u64_stats_update_end(&rx_stats->syncp); return true; } /* Must be invoked with rcu_read_lock. */ struct net_device *__vlan_find_dev_deep_rcu(struct net_device *dev, __be16 vlan_proto, u16 vlan_id) { struct vlan_info *vlan_info = rcu_dereference(dev->vlan_info); if (vlan_info) { return vlan_group_get_device(&vlan_info->grp, vlan_proto, vlan_id); } else { /* * Lower devices of master uppers (bonding, team) do not have * grp assigned to themselves. Grp is assigned to upper device * instead. */ struct net_device *upper_dev; upper_dev = netdev_master_upper_dev_get_rcu(dev); if (upper_dev) return __vlan_find_dev_deep_rcu(upper_dev, vlan_proto, vlan_id); } return NULL; } EXPORT_SYMBOL(__vlan_find_dev_deep_rcu); struct net_device *vlan_dev_real_dev(const struct net_device *dev) { struct net_device *ret = vlan_dev_priv(dev)->real_dev; while (is_vlan_dev(ret)) ret = vlan_dev_priv(ret)->real_dev; return ret; } EXPORT_SYMBOL(vlan_dev_real_dev); u16 vlan_dev_vlan_id(const struct net_device *dev) { return vlan_dev_priv(dev)->vlan_id; } EXPORT_SYMBOL(vlan_dev_vlan_id); __be16 vlan_dev_vlan_proto(const struct net_device *dev) { return vlan_dev_priv(dev)->vlan_proto; } EXPORT_SYMBOL(vlan_dev_vlan_proto); /* * vlan info and vid list */ static void vlan_group_free(struct vlan_group *grp) { int i, j; for (i = 0; i < VLAN_PROTO_NUM; i++) for (j = 0; j < VLAN_GROUP_ARRAY_SPLIT_PARTS; j++) kfree(grp->vlan_devices_arrays[i][j]); } static void vlan_info_free(struct vlan_info *vlan_info) { vlan_group_free(&vlan_info->grp); kfree(vlan_info); } static void vlan_info_rcu_free(struct rcu_head *rcu) { vlan_info_free(container_of(rcu, struct vlan_info, rcu)); } static struct vlan_info *vlan_info_alloc(struct net_device *dev) { struct vlan_info *vlan_info; vlan_info = kzalloc_obj(struct vlan_info); if (!vlan_info) return NULL; vlan_info->real_dev = dev; INIT_LIST_HEAD(&vlan_info->vid_list); return vlan_info; } struct vlan_vid_info { struct list_head list; __be16 proto; u16 vid; int refcount; }; static bool vlan_hw_filter_capable(const struct net_device *dev, __be16 proto) { if (proto == htons(ETH_P_8021Q) && dev->features & NETIF_F_HW_VLAN_CTAG_FILTER) return true; if (proto == htons(ETH_P_8021AD) && dev->features & NETIF_F_HW_VLAN_STAG_FILTER) return true; return false; } static struct vlan_vid_info *vlan_vid_info_get(struct vlan_info *vlan_info, __be16 proto, u16 vid) { struct vlan_vid_info *vid_info; list_for_each_entry(vid_info, &vlan_info->vid_list, list) { if (vid_info->proto == proto && vid_info->vid == vid) return vid_info; } return NULL; } static struct vlan_vid_info *vlan_vid_info_alloc(__be16 proto, u16 vid) { struct vlan_vid_info *vid_info; vid_info = kzalloc_obj(struct vlan_vid_info); if (!vid_info) return NULL; vid_info->proto = proto; vid_info->vid = vid; return vid_info; } static int vlan_add_rx_filter_info(struct net_device *dev, __be16 proto, u16 vid) { if 
(!vlan_hw_filter_capable(dev, proto)) return 0; if (netif_device_present(dev)) return dev->netdev_ops->ndo_vlan_rx_add_vid(dev, proto, vid); else return -ENODEV; } static int vlan_kill_rx_filter_info(struct net_device *dev, __be16 proto, u16 vid) { if (!vlan_hw_filter_capable(dev, proto)) return 0; if (netif_device_present(dev)) return dev->netdev_ops->ndo_vlan_rx_kill_vid(dev, proto, vid); else return -ENODEV; } int vlan_for_each(struct net_device *dev, int (*action)(struct net_device *dev, int vid, void *arg), void *arg) { struct vlan_vid_info *vid_info; struct vlan_info *vlan_info; struct net_device *vdev; int ret; ASSERT_RTNL(); vlan_info = rtnl_dereference(dev->vlan_info); if (!vlan_info) return 0; list_for_each_entry(vid_info, &vlan_info->vid_list, list) { vdev = vlan_group_get_device(&vlan_info->grp, vid_info->proto, vid_info->vid); ret = action(vdev, vid_info->vid, arg); if (ret) return ret; } return 0; } EXPORT_SYMBOL(vlan_for_each); int vlan_filter_push_vids(struct vlan_info *vlan_info, __be16 proto) { struct net_device *real_dev = vlan_info->real_dev; struct vlan_vid_info *vlan_vid_info; int err; list_for_each_entry(vlan_vid_info, &vlan_info->vid_list, list) { if (vlan_vid_info->proto == proto) { err = vlan_add_rx_filter_info(real_dev, proto, vlan_vid_info->vid); if (err) goto unwind; } } return 0; unwind: list_for_each_entry_continue_reverse(vlan_vid_info, &vlan_info->vid_list, list) { if (vlan_vid_info->proto == proto) vlan_kill_rx_filter_info(real_dev, proto, vlan_vid_info->vid); } return err; } EXPORT_SYMBOL(vlan_filter_push_vids); void vlan_filter_drop_vids(struct vlan_info *vlan_info, __be16 proto) { struct vlan_vid_info *vlan_vid_info; list_for_each_entry(vlan_vid_info, &vlan_info->vid_list, list) if (vlan_vid_info->proto == proto) vlan_kill_rx_filter_info(vlan_info->real_dev, vlan_vid_info->proto, vlan_vid_info->vid); } EXPORT_SYMBOL(vlan_filter_drop_vids); static int __vlan_vid_add(struct vlan_info *vlan_info, __be16 proto, u16 vid, struct vlan_vid_info **pvid_info) { struct net_device *dev = vlan_info->real_dev; struct vlan_vid_info *vid_info; int err; vid_info = vlan_vid_info_alloc(proto, vid); if (!vid_info) return -ENOMEM; err = vlan_add_rx_filter_info(dev, proto, vid); if (err) { kfree(vid_info); return err; } list_add(&vid_info->list, &vlan_info->vid_list); vlan_info->nr_vids++; *pvid_info = vid_info; return 0; } int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid) { struct vlan_info *vlan_info; struct vlan_vid_info *vid_info; bool vlan_info_created = false; int err; ASSERT_RTNL(); vlan_info = rtnl_dereference(dev->vlan_info); if (!vlan_info) { vlan_info = vlan_info_alloc(dev); if (!vlan_info) return -ENOMEM; vlan_info_created = true; } vid_info = vlan_vid_info_get(vlan_info, proto, vid); if (!vid_info) { err = __vlan_vid_add(vlan_info, proto, vid, &vid_info); if (err) goto out_free_vlan_info; } vid_info->refcount++; if (vlan_info_created) rcu_assign_pointer(dev->vlan_info, vlan_info); return 0; out_free_vlan_info: if (vlan_info_created) kfree(vlan_info); return err; } EXPORT_SYMBOL(vlan_vid_add); static void __vlan_vid_del(struct vlan_info *vlan_info, struct vlan_vid_info *vid_info) { struct net_device *dev = vlan_info->real_dev; __be16 proto = vid_info->proto; u16 vid = vid_info->vid; int err; err = vlan_kill_rx_filter_info(dev, proto, vid); if (err && dev->reg_state != NETREG_UNREGISTERING) netdev_warn(dev, "failed to kill vid %04x/%d\n", proto, vid); list_del(&vid_info->list); kfree(vid_info); vlan_info->nr_vids--; } void vlan_vid_del(struct 
net_device *dev, __be16 proto, u16 vid) { struct vlan_info *vlan_info; struct vlan_vid_info *vid_info; ASSERT_RTNL(); vlan_info = rtnl_dereference(dev->vlan_info); if (!vlan_info) return; vid_info = vlan_vid_info_get(vlan_info, proto, vid); if (!vid_info) return; vid_info->refcount--; if (vid_info->refcount == 0) { __vlan_vid_del(vlan_info, vid_info); if (vlan_info->nr_vids == 0) { RCU_INIT_POINTER(dev->vlan_info, NULL); call_rcu(&vlan_info->rcu, vlan_info_rcu_free); } } } EXPORT_SYMBOL(vlan_vid_del); int vlan_vids_add_by_dev(struct net_device *dev, const struct net_device *by_dev) { struct vlan_vid_info *vid_info; struct vlan_info *vlan_info; int err; ASSERT_RTNL(); vlan_info = rtnl_dereference(by_dev->vlan_info); if (!vlan_info) return 0; list_for_each_entry(vid_info, &vlan_info->vid_list, list) { if (!vlan_hw_filter_capable(by_dev, vid_info->proto)) continue; err = vlan_vid_add(dev, vid_info->proto, vid_info->vid); if (err) goto unwind; } return 0; unwind: list_for_each_entry_continue_reverse(vid_info, &vlan_info->vid_list, list) { if (!vlan_hw_filter_capable(by_dev, vid_info->proto)) continue; vlan_vid_del(dev, vid_info->proto, vid_info->vid); } return err; } EXPORT_SYMBOL(vlan_vids_add_by_dev); void vlan_vids_del_by_dev(struct net_device *dev, const struct net_device *by_dev) { struct vlan_vid_info *vid_info; struct vlan_info *vlan_info; ASSERT_RTNL(); vlan_info = rtnl_dereference(by_dev->vlan_info); if (!vlan_info) return; list_for_each_entry(vid_info, &vlan_info->vid_list, list) { if (!vlan_hw_filter_capable(by_dev, vid_info->proto)) continue; vlan_vid_del(dev, vid_info->proto, vid_info->vid); } } EXPORT_SYMBOL(vlan_vids_del_by_dev); bool vlan_uses_dev(const struct net_device *dev) { struct vlan_info *vlan_info; ASSERT_RTNL(); vlan_info = rtnl_dereference(dev->vlan_info); if (!vlan_info) return false; return vlan_info->grp.nr_vlan_devs ? 
true : false; } EXPORT_SYMBOL(vlan_uses_dev); static struct sk_buff *vlan_gro_receive(struct list_head *head, struct sk_buff *skb) { const struct packet_offload *ptype; unsigned int hlen, off_vlan; struct sk_buff *pp = NULL; struct vlan_hdr *vhdr; struct sk_buff *p; __be16 type; int flush = 1; off_vlan = skb_gro_offset(skb); hlen = off_vlan + sizeof(*vhdr); vhdr = skb_gro_header(skb, hlen, off_vlan); if (unlikely(!vhdr)) goto out; NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = hlen; type = vhdr->h_vlan_encapsulated_proto; ptype = gro_find_receive_by_type(type); if (!ptype) goto out; flush = 0; list_for_each_entry(p, head, list) { struct vlan_hdr *vhdr2; if (!NAPI_GRO_CB(p)->same_flow) continue; vhdr2 = (struct vlan_hdr *)(p->data + off_vlan); if (compare_vlan_header(vhdr, vhdr2)) NAPI_GRO_CB(p)->same_flow = 0; } skb_gro_pull(skb, sizeof(*vhdr)); skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr)); pp = indirect_call_gro_receive_inet(ptype->callbacks.gro_receive, ipv6_gro_receive, inet_gro_receive, head, skb); out: skb_gro_flush_final(skb, pp, flush); return pp; } static int vlan_gro_complete(struct sk_buff *skb, int nhoff) { struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + nhoff); __be16 type = vhdr->h_vlan_encapsulated_proto; struct packet_offload *ptype; int err = -ENOENT; ptype = gro_find_complete_by_type(type); if (ptype) err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete, ipv6_gro_complete, inet_gro_complete, skb, nhoff + sizeof(*vhdr)); return err; } static struct packet_offload vlan_packet_offloads[] __read_mostly = { { .type = cpu_to_be16(ETH_P_8021Q), .priority = 10, .callbacks = { .gro_receive = vlan_gro_receive, .gro_complete = vlan_gro_complete, }, }, { .type = cpu_to_be16(ETH_P_8021AD), .priority = 10, .callbacks = { .gro_receive = vlan_gro_receive, .gro_complete = vlan_gro_complete, }, }, }; static int __init vlan_offload_init(void) { unsigned int i; for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++) dev_add_offload(&vlan_packet_offloads[i]); return 0; } fs_initcall(vlan_offload_init); |
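vlan_vids_add_by_dev()/vlan_vids_del_by_dev() exist so that master drivers (bonding, team) can mirror their VLAN filter entries onto a lower device. A hedged sketch of that pattern, with hypothetical function names, assuming the caller already holds RTNL:

#include <linux/if_vlan.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

/* Hypothetical enslave path of a bonding-style master device. */
static int example_enslave(struct net_device *master, struct net_device *slave)
{
	int err;

	ASSERT_RTNL();

	/* Propagate every VID configured on the master to the new slave. */
	err = vlan_vids_add_by_dev(slave, master);
	if (err)
		return err;

	/* ... remaining enslave steps would go here ... */
	return 0;
}

/* Hypothetical release path: drop the VIDs added above. */
static void example_release(struct net_device *master, struct net_device *slave)
{
	ASSERT_RTNL();
	vlan_vids_del_by_dev(slave, master);
}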
| 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 | // SPDX-License-Identifier: GPL-2.0 /* * Functions related to io context handling */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/sched/task.h> #include "blk.h" #include "blk-mq-sched.h" /* * For io context allocations */ static struct kmem_cache *iocontext_cachep; #ifdef CONFIG_BLK_ICQ /** * get_io_context - increment reference count to io_context * @ioc: io_context to get * * Increment reference count to @ioc. */ static void get_io_context(struct io_context *ioc) { BUG_ON(atomic_long_read(&ioc->refcount) <= 0); atomic_long_inc(&ioc->refcount); } /* * Exit an icq. Called with ioc locked for blk-mq, and with both ioc * and queue locked for legacy. */ static void ioc_exit_icq(struct io_cq *icq) { struct elevator_type *et = icq->q->elevator->type; if (icq->flags & ICQ_EXITED) return; if (et->ops.exit_icq) et->ops.exit_icq(icq); icq->flags |= ICQ_EXITED; } static void ioc_exit_icqs(struct io_context *ioc) { struct io_cq *icq; spin_lock_irq(&ioc->lock); hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) ioc_exit_icq(icq); spin_unlock_irq(&ioc->lock); } /* * Release an icq. Called with ioc locked for blk-mq, and with both ioc * and queue locked for legacy. */ static void ioc_destroy_icq(struct io_cq *icq) { struct io_context *ioc = icq->ioc; struct request_queue *q = icq->q; struct elevator_type *et = q->elevator->type; lockdep_assert_held(&ioc->lock); lockdep_assert_held(&q->queue_lock); if (icq->flags & ICQ_DESTROYED) return; radix_tree_delete(&ioc->icq_tree, icq->q->id); hlist_del_init(&icq->ioc_node); list_del_init(&icq->q_node); /* * Both setting lookup hint to and clearing it from @icq are done * under queue_lock. If it's not pointing to @icq now, it never * will. 
Hint assignment itself can race safely. */ if (rcu_access_pointer(ioc->icq_hint) == icq) rcu_assign_pointer(ioc->icq_hint, NULL); ioc_exit_icq(icq); /* * @icq->q might have gone away by the time RCU callback runs * making it impossible to determine icq_cache. Record it in @icq. */ icq->__rcu_icq_cache = et->icq_cache; icq->flags |= ICQ_DESTROYED; kfree_rcu(icq, __rcu_head); } /* * Slow path for ioc release in put_io_context(). Performs double-lock * dancing to unlink all icq's and then frees ioc. */ static void ioc_release_fn(struct work_struct *work) { struct io_context *ioc = container_of(work, struct io_context, release_work); spin_lock_irq(&ioc->lock); while (!hlist_empty(&ioc->icq_list)) { struct io_cq *icq = hlist_entry(ioc->icq_list.first, struct io_cq, ioc_node); struct request_queue *q = icq->q; if (spin_trylock(&q->queue_lock)) { ioc_destroy_icq(icq); spin_unlock(&q->queue_lock); } else { /* Make sure q and icq cannot be freed. */ rcu_read_lock(); /* Re-acquire the locks in the correct order. */ spin_unlock(&ioc->lock); spin_lock(&q->queue_lock); spin_lock(&ioc->lock); ioc_destroy_icq(icq); spin_unlock(&q->queue_lock); rcu_read_unlock(); } } spin_unlock_irq(&ioc->lock); kmem_cache_free(iocontext_cachep, ioc); } /* * Releasing icqs requires reverse order double locking and we may already be * holding a queue_lock. Do it asynchronously from a workqueue. */ static bool ioc_delay_free(struct io_context *ioc) { unsigned long flags; spin_lock_irqsave(&ioc->lock, flags); if (!hlist_empty(&ioc->icq_list)) { queue_work(system_power_efficient_wq, &ioc->release_work); spin_unlock_irqrestore(&ioc->lock, flags); return true; } spin_unlock_irqrestore(&ioc->lock, flags); return false; } /** * ioc_clear_queue - break any ioc association with the specified queue * @q: request_queue being cleared * * Walk @q->icq_list and exit all io_cq's. */ void ioc_clear_queue(struct request_queue *q) { spin_lock_irq(&q->queue_lock); while (!list_empty(&q->icq_list)) { struct io_cq *icq = list_first_entry(&q->icq_list, struct io_cq, q_node); /* * Other context won't hold ioc lock to wait for queue_lock, see * details in ioc_release_fn(). */ spin_lock(&icq->ioc->lock); ioc_destroy_icq(icq); spin_unlock(&icq->ioc->lock); } spin_unlock_irq(&q->queue_lock); } #else /* CONFIG_BLK_ICQ */ static inline void ioc_exit_icqs(struct io_context *ioc) { } static inline bool ioc_delay_free(struct io_context *ioc) { return false; } #endif /* CONFIG_BLK_ICQ */ /** * put_io_context - put a reference of io_context * @ioc: io_context to put * * Decrement reference count of @ioc and release it if the count reaches * zero. 
*/ void put_io_context(struct io_context *ioc) { BUG_ON(atomic_long_read(&ioc->refcount) <= 0); if (atomic_long_dec_and_test(&ioc->refcount) && !ioc_delay_free(ioc)) kmem_cache_free(iocontext_cachep, ioc); } EXPORT_SYMBOL_GPL(put_io_context); /* Called by the exiting task */ void exit_io_context(struct task_struct *task) { struct io_context *ioc; task_lock(task); ioc = task->io_context; task->io_context = NULL; task_unlock(task); if (atomic_dec_and_test(&ioc->active_ref)) { ioc_exit_icqs(ioc); put_io_context(ioc); } } static struct io_context *alloc_io_context(gfp_t gfp_flags, int node) { struct io_context *ioc; ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO, node); if (unlikely(!ioc)) return NULL; atomic_long_set(&ioc->refcount, 1); atomic_set(&ioc->active_ref, 1); #ifdef CONFIG_BLK_ICQ spin_lock_init(&ioc->lock); INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC); INIT_HLIST_HEAD(&ioc->icq_list); INIT_WORK(&ioc->release_work, ioc_release_fn); #endif ioc->ioprio = IOPRIO_DEFAULT; return ioc; } int set_task_ioprio(struct task_struct *task, int ioprio) { int err; const struct cred *cred = current_cred(), *tcred; rcu_read_lock(); tcred = __task_cred(task); if (!uid_eq(tcred->uid, cred->euid) && !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) { rcu_read_unlock(); return -EPERM; } rcu_read_unlock(); err = security_task_setioprio(task, ioprio); if (err) return err; task_lock(task); if (unlikely(!task->io_context)) { struct io_context *ioc; task_unlock(task); ioc = alloc_io_context(GFP_ATOMIC, NUMA_NO_NODE); if (!ioc) return -ENOMEM; task_lock(task); if (task->flags & PF_EXITING) { kmem_cache_free(iocontext_cachep, ioc); goto out; } if (task->io_context) kmem_cache_free(iocontext_cachep, ioc); else task->io_context = ioc; } task->io_context->ioprio = ioprio; out: task_unlock(task); return 0; } EXPORT_SYMBOL_GPL(set_task_ioprio); int __copy_io(u64 clone_flags, struct task_struct *tsk) { struct io_context *ioc = current->io_context; /* * Share io context with parent, if CLONE_IO is set */ if (clone_flags & CLONE_IO) { atomic_inc(&ioc->active_ref); tsk->io_context = ioc; } else if (ioprio_valid(ioc->ioprio)) { tsk->io_context = alloc_io_context(GFP_KERNEL, NUMA_NO_NODE); if (!tsk->io_context) return -ENOMEM; tsk->io_context->ioprio = ioc->ioprio; } return 0; } #ifdef CONFIG_BLK_ICQ /** * ioc_lookup_icq - lookup io_cq from ioc in io issue path * @q: the associated request_queue * * Look up io_cq associated with @ioc - @q pair from @ioc. Must be called * from io issue path, either return NULL if current issue io to @q for the * first time, or return a valid icq. */ struct io_cq *ioc_lookup_icq(struct request_queue *q) { struct io_context *ioc = current->io_context; struct io_cq *icq; /* * icq's are indexed from @ioc using radix tree and hint pointer, * both of which are protected with RCU, io issue path ensures that * both request_queue and current task are valid, the found icq * is guaranteed to be valid until the io is done. */ rcu_read_lock(); icq = rcu_dereference(ioc->icq_hint); if (icq && icq->q == q) goto out; icq = radix_tree_lookup(&ioc->icq_tree, q->id); if (icq && icq->q == q) rcu_assign_pointer(ioc->icq_hint, icq); /* allowed to race */ else icq = NULL; out: rcu_read_unlock(); return icq; } EXPORT_SYMBOL(ioc_lookup_icq); /** * ioc_create_icq - create and link io_cq * @q: request_queue of interest * * Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they * will be created using @gfp_mask. 
* * The caller is responsible for ensuring @ioc won't go away and @q is * alive and will stay alive until this function returns. */ static struct io_cq *ioc_create_icq(struct request_queue *q) { struct io_context *ioc = current->io_context; struct elevator_type *et = q->elevator->type; struct io_cq *icq; /* allocate stuff */ icq = kmem_cache_alloc_node(et->icq_cache, GFP_ATOMIC | __GFP_ZERO, q->node); if (!icq) return NULL; if (radix_tree_maybe_preload(GFP_ATOMIC) < 0) { kmem_cache_free(et->icq_cache, icq); return NULL; } icq->ioc = ioc; icq->q = q; INIT_LIST_HEAD(&icq->q_node); INIT_HLIST_NODE(&icq->ioc_node); /* lock both q and ioc and try to link @icq */ spin_lock_irq(&q->queue_lock); spin_lock(&ioc->lock); if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) { hlist_add_head(&icq->ioc_node, &ioc->icq_list); list_add(&icq->q_node, &q->icq_list); if (et->ops.init_icq) et->ops.init_icq(icq); } else { kmem_cache_free(et->icq_cache, icq); icq = ioc_lookup_icq(q); if (!icq) printk(KERN_ERR "cfq: icq link failed!\n"); } spin_unlock(&ioc->lock); spin_unlock_irq(&q->queue_lock); radix_tree_preload_end(); return icq; } struct io_cq *ioc_find_get_icq(struct request_queue *q) { struct io_context *ioc = current->io_context; struct io_cq *icq = NULL; if (unlikely(!ioc)) { ioc = alloc_io_context(GFP_ATOMIC, q->node); if (!ioc) return NULL; task_lock(current); if (current->io_context) { kmem_cache_free(iocontext_cachep, ioc); ioc = current->io_context; } else { current->io_context = ioc; } get_io_context(ioc); task_unlock(current); } else { get_io_context(ioc); icq = ioc_lookup_icq(q); } if (!icq) { icq = ioc_create_icq(q); if (!icq) { put_io_context(ioc); return NULL; } } return icq; } EXPORT_SYMBOL_GPL(ioc_find_get_icq); #endif /* CONFIG_BLK_ICQ */ static int __init blk_ioc_init(void) { iocontext_cachep = kmem_cache_create("blkdev_ioc", sizeof(struct io_context), 0, SLAB_PANIC, NULL); return 0; } subsys_initcall(blk_ioc_init); |
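set_task_ioprio() above allocates the io_context lazily, so an in-kernel caller only needs to supply the encoded priority. A minimal sketch, assuming <linux/ioprio.h> for the IOPRIO_* helpers; the function name is hypothetical:

#include <linux/ioprio.h>
#include <linux/sched.h>

/* Hypothetical helper: demote a task to the lowest best-effort I/O level. */
static int example_lower_io_priority(struct task_struct *task)
{
	int ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 7);

	/* Allocates task->io_context on demand and stores the new priority. */
	return set_task_ioprio(task, ioprio);
}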
| 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 * Phillip Lougher <phillip@squashfs.org.uk> * * export.c */ /* * This file implements code to make Squashfs filesystems exportable (NFS etc.) * * The export code uses an inode lookup table to map inode numbers passed in * filehandles to an inode location on disk. This table is stored compressed * into metadata blocks. A second index table is used to locate these. This * second index table for speed of access (and because it is small) is read at * mount time and cached in memory. * * The inode lookup table is used only by the export code, inode disk * locations are directly encoded in directories, enabling direct access * without an intermediate lookup for all operations except the export ops. */ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/dcache.h> #include <linux/exportfs.h> #include <linux/slab.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" /* * Look-up inode number (ino) in table, returning the inode location. 
*/ static long long squashfs_inode_lookup(struct super_block *sb, int ino_num) { struct squashfs_sb_info *msblk = sb->s_fs_info; int blk = SQUASHFS_LOOKUP_BLOCK(ino_num - 1); int offset = SQUASHFS_LOOKUP_BLOCK_OFFSET(ino_num - 1); u64 start; __le64 ino; int err; TRACE("Entered squashfs_inode_lookup, inode_number = %d\n", ino_num); if (ino_num == 0 || (ino_num - 1) >= msblk->inodes) return -EINVAL; start = le64_to_cpu(msblk->inode_lookup_table[blk]); err = squashfs_read_metadata(sb, &ino, &start, &offset, sizeof(ino)); if (err < 0) return err; TRACE("squashfs_inode_lookup, inode = 0x%llx\n", (u64) le64_to_cpu(ino)); return le64_to_cpu(ino); } static struct dentry *squashfs_export_iget(struct super_block *sb, unsigned int ino_num) { long long ino; struct dentry *dentry = ERR_PTR(-ENOENT); TRACE("Entered squashfs_export_iget\n"); ino = squashfs_inode_lookup(sb, ino_num); if (ino >= 0) dentry = d_obtain_alias(squashfs_iget(sb, ino, ino_num)); return dentry; } static struct dentry *squashfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { if ((fh_type != FILEID_INO32_GEN && fh_type != FILEID_INO32_GEN_PARENT) || fh_len < 2) return NULL; return squashfs_export_iget(sb, fid->i32.ino); } static struct dentry *squashfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { if (fh_type != FILEID_INO32_GEN_PARENT || fh_len < 4) return NULL; return squashfs_export_iget(sb, fid->i32.parent_ino); } static struct dentry *squashfs_get_parent(struct dentry *child) { struct inode *inode = d_inode(child); unsigned int parent_ino = squashfs_i(inode)->parent; return squashfs_export_iget(inode->i_sb, parent_ino); } /* * Read uncompressed inode lookup table indexes off disk into memory */ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb, u64 lookup_table_start, u64 next_table, unsigned int inodes) { unsigned int length = SQUASHFS_LOOKUP_BLOCK_BYTES(inodes); unsigned int indexes = SQUASHFS_LOOKUP_BLOCKS(inodes); int n; __le64 *table; u64 start, end; TRACE("In read_inode_lookup_table, length %d\n", length); /* Sanity check values */ /* there should always be at least one inode */ if (inodes == 0) return ERR_PTR(-EINVAL); /* * The computed size of the lookup table (length bytes) should exactly * match the table start and end points */ if (length != (next_table - lookup_table_start)) return ERR_PTR(-EINVAL); table = squashfs_read_table(sb, lookup_table_start, length); if (IS_ERR(table)) return table; /* * table0], table[1], ... table[indexes - 1] store the locations * of the compressed inode lookup blocks. Each entry should be * less than the next (i.e. table[0] < table[1]), and the difference * between them should be SQUASHFS_METADATA_SIZE or less. * table[indexes - 1] should be less than lookup_table_start, and * again the difference should be SQUASHFS_METADATA_SIZE or less */ for (n = 0; n < (indexes - 1); n++) { start = le64_to_cpu(table[n]); end = le64_to_cpu(table[n + 1]); if (start >= end || (end - start) > (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) { kfree(table); return ERR_PTR(-EINVAL); } } start = le64_to_cpu(table[indexes - 1]); if (start >= lookup_table_start || (lookup_table_start - start) > (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) { kfree(table); return ERR_PTR(-EINVAL); } return table; } const struct export_operations squashfs_export_ops = { .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = squashfs_fh_to_dentry, .fh_to_parent = squashfs_fh_to_parent, .get_parent = squashfs_get_parent }; |
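The SQUASHFS_LOOKUP_BLOCK()/SQUASHFS_LOOKUP_BLOCK_OFFSET() indexing used in squashfs_inode_lookup() is plain integer arithmetic over metadata blocks of 8-byte entries. A small userspace sketch of that arithmetic (the 8 KiB block size is an assumption standing in for SQUASHFS_METADATA_SIZE, not copied from squashfs_fs.h):

#include <stdint.h>
#include <stdio.h>

#define METADATA_SIZE	8192u	/* assumed SQUASHFS_METADATA_SIZE */
#define ENTRIES_PER_BLK	(METADATA_SIZE / sizeof(uint64_t))	/* 1024 */

int main(void)
{
	unsigned int ino_num = 3000;	/* example inode number (1-based) */
	unsigned int idx = ino_num - 1;	/* lookup table is 0-based */

	/* Which index-table block holds the entry, and where inside it. */
	printf("block %zu, offset %zu bytes\n",
	       idx / ENTRIES_PER_BLK,
	       (idx % ENTRIES_PER_BLK) * sizeof(uint64_t));
	return 0;
}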
| 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 | // SPDX-License-Identifier: GPL-2.0 /* * Wakeup statistics in sysfs * * Copyright (c) 2019 Linux Foundation * Copyright (c) 2019 Greg Kroah-Hartman <gregkh@linuxfoundation.org> * Copyright (c) 2019 Google Inc. */ #include <linux/device.h> #include <linux/idr.h> #include <linux/init.h> #include <linux/kdev_t.h> #include <linux/kernel.h> #include <linux/kobject.h> #include <linux/slab.h> #include <linux/timekeeping.h> #include "power.h" static struct class *wakeup_class; #define wakeup_attr(_name) \ static ssize_t _name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct wakeup_source *ws = dev_get_drvdata(dev); \ \ return sysfs_emit(buf, "%lu\n", ws->_name); \ } \ static DEVICE_ATTR_RO(_name) wakeup_attr(active_count); wakeup_attr(event_count); wakeup_attr(wakeup_count); wakeup_attr(expire_count); wakeup_attr(relax_count); static ssize_t active_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t active_time = ws->active ? 
ktime_sub(ktime_get(), ws->last_time) : 0; return sysfs_emit(buf, "%lld\n", ktime_to_ms(active_time)); } static DEVICE_ATTR_RO(active_time_ms); static ssize_t total_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t active_time; ktime_t total_time = ws->total_time; if (ws->active) { active_time = ktime_sub(ktime_get(), ws->last_time); total_time = ktime_add(total_time, active_time); } return sysfs_emit(buf, "%lld\n", ktime_to_ms(total_time)); } static DEVICE_ATTR_RO(total_time_ms); static ssize_t max_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t active_time; ktime_t max_time = ws->max_time; if (ws->active) { active_time = ktime_sub(ktime_get(), ws->last_time); if (active_time > max_time) max_time = active_time; } return sysfs_emit(buf, "%lld\n", ktime_to_ms(max_time)); } static DEVICE_ATTR_RO(max_time_ms); static ssize_t last_change_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); return sysfs_emit(buf, "%lld\n", ktime_to_ms(ws->last_time)); } static DEVICE_ATTR_RO(last_change_ms); static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); return sysfs_emit(buf, "%s\n", ws->name); } static DEVICE_ATTR_RO(name); static ssize_t prevent_suspend_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t prevent_sleep_time = ws->prevent_sleep_time; if (ws->active && ws->autosleep_enabled) { prevent_sleep_time = ktime_add(prevent_sleep_time, ktime_sub(ktime_get(), ws->start_prevent_time)); } return sysfs_emit(buf, "%lld\n", ktime_to_ms(prevent_sleep_time)); } static DEVICE_ATTR_RO(prevent_suspend_time_ms); static struct attribute *wakeup_source_attrs[] = { &dev_attr_name.attr, &dev_attr_active_count.attr, &dev_attr_event_count.attr, &dev_attr_wakeup_count.attr, &dev_attr_expire_count.attr, &dev_attr_relax_count.attr, &dev_attr_active_time_ms.attr, &dev_attr_total_time_ms.attr, &dev_attr_max_time_ms.attr, &dev_attr_last_change_ms.attr, &dev_attr_prevent_suspend_time_ms.attr, NULL, }; ATTRIBUTE_GROUPS(wakeup_source); static void device_create_release(struct device *dev) { kfree(dev); } static struct device *wakeup_source_device_create(struct device *parent, struct wakeup_source *ws) { struct device *dev = NULL; int retval; dev = kzalloc_obj(*dev); if (!dev) { retval = -ENOMEM; goto error; } device_initialize(dev); dev->devt = MKDEV(0, 0); dev->class = wakeup_class; dev->parent = parent; dev->groups = wakeup_source_groups; dev->release = device_create_release; dev_set_drvdata(dev, ws); device_set_pm_not_required(dev); retval = dev_set_name(dev, "wakeup%d", ws->id); if (retval) goto error; retval = device_add(dev); if (retval) goto error; return dev; error: put_device(dev); return ERR_PTR(retval); } /** * wakeup_source_sysfs_add - Add wakeup_source attributes to sysfs. * @parent: Device given wakeup source is associated with (or NULL if virtual). * @ws: Wakeup source to be added in sysfs. */ int wakeup_source_sysfs_add(struct device *parent, struct wakeup_source *ws) { struct device *dev; dev = wakeup_source_device_create(parent, ws); if (IS_ERR(dev)) return PTR_ERR(dev); ws->dev = dev; return 0; } /** * pm_wakeup_source_sysfs_add - Add wakeup_source attributes to sysfs * for a device if they're missing. 
* @parent: Device given wakeup source is associated with */ int pm_wakeup_source_sysfs_add(struct device *parent) { if (!parent->power.wakeup || parent->power.wakeup->dev) return 0; return wakeup_source_sysfs_add(parent, parent->power.wakeup); } /** * wakeup_source_sysfs_remove - Remove wakeup_source attributes from sysfs. * @ws: Wakeup source to be removed from sysfs. */ void wakeup_source_sysfs_remove(struct wakeup_source *ws) { device_unregister(ws->dev); } static int __init wakeup_sources_sysfs_init(void) { wakeup_class = class_create("wakeup"); return PTR_ERR_OR_ZERO(wakeup_class); } postcore_initcall(wakeup_sources_sysfs_init); |
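For reference, the wakeup_attr() macro near the top of this file expands, for active_count, into an ordinary show routine plus a read-only device attribute, roughly:

/* Approximate expansion of wakeup_attr(active_count). */
static ssize_t active_count_show(struct device *dev,
				 struct device_attribute *attr, char *buf)
{
	struct wakeup_source *ws = dev_get_drvdata(dev);

	return sysfs_emit(buf, "%lu\n", ws->active_count);
}
static DEVICE_ATTR_RO(active_count);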
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CURRENT_H
#define _ASM_X86_CURRENT_H

#include <linux/build_bug.h>
#include <linux/compiler.h>

#ifndef __ASSEMBLER__

#include <linux/cache.h>
#include <asm/percpu.h>

struct task_struct;

DECLARE_PER_CPU_CACHE_HOT(struct task_struct *, current_task);

/* const-qualified alias provided by the linker. */
DECLARE_PER_CPU_CACHE_HOT(struct task_struct * const __percpu_seg_override,
			  const_current_task);

static __always_inline struct task_struct *get_current(void)
{
	if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT))
		return this_cpu_read_const(const_current_task);

	return this_cpu_read_stable(current_task);
}

#define current get_current()

#endif /* __ASSEMBLER__ */

#endif /* _ASM_X86_CURRENT_H */
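get_current() and the current macro above resolve to a single per-CPU load; any kernel code can use them directly. A trivial, hypothetical usage sketch:

#include <linux/printk.h>
#include <linux/sched.h>

/* Hypothetical helper: log which task is executing this code. */
static void example_log_current(void)
{
	pr_info("running in %s (pid %d)\n",
		current->comm, task_pid_nr(current));
}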
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 | /* SPDX-License-Identifier: GPL-2.0+ */ /* * NILFS meta data file prototype and definitions * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * * Written by Ryusuke Konishi. */ #ifndef _NILFS_MDT_H #define _NILFS_MDT_H #include <linux/buffer_head.h> #include <linux/blockgroup_lock.h> #include "nilfs.h" #include "page.h" /** * struct nilfs_shadow_map - shadow mapping of meta data file * @bmap_store: shadow copy of bmap state * @inode: holder of page caches used in shadow mapping * @frozen_buffers: list of frozen buffers */ struct nilfs_shadow_map { struct nilfs_bmap_store bmap_store; struct inode *inode; struct list_head frozen_buffers; }; /** * struct nilfs_mdt_info - on-memory private data of meta data files * @mi_sem: reader/writer semaphore for meta data operations * @mi_bgl: per-blockgroup locking * @mi_entry_size: size of an entry * @mi_first_entry_offset: offset to the first entry * @mi_entries_per_block: number of entries in a block * @mi_palloc_cache: persistent object allocator cache * @mi_shadow: shadow of bmap and page caches * @mi_blocks_per_group: number of blocks in a group * @mi_blocks_per_desc_block: number of blocks per descriptor block */ struct nilfs_mdt_info { struct rw_semaphore mi_sem; struct blockgroup_lock *mi_bgl; unsigned int mi_entry_size; unsigned int mi_first_entry_offset; unsigned long mi_entries_per_block; struct nilfs_palloc_cache *mi_palloc_cache; struct nilfs_shadow_map *mi_shadow; unsigned long mi_blocks_per_group; unsigned long mi_blocks_per_desc_block; }; static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode) { return inode->i_private; } static inline int nilfs_is_metadata_file_inode(const struct inode *inode) { return inode->i_private != NULL; } /* Default GFP flags using highmem */ #define NILFS_MDT_GFP (__GFP_RECLAIM | __GFP_IO | __GFP_HIGHMEM) int nilfs_mdt_get_block(struct inode *, unsigned long, int, void (*init_block)(struct inode *, struct buffer_head *, void *), struct buffer_head **); int nilfs_mdt_find_block(struct inode *inode, unsigned long start, unsigned long end, unsigned long *blkoff, struct buffer_head **out_bh); int nilfs_mdt_delete_block(struct inode *, unsigned long); int nilfs_mdt_forget_block(struct inode *, unsigned long); int nilfs_mdt_fetch_dirty(struct inode *); int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz); void nilfs_mdt_clear(struct inode *inode); void nilfs_mdt_destroy(struct inode *inode); void nilfs_mdt_set_entry_size(struct inode *, unsigned int, unsigned int); int nilfs_mdt_setup_shadow_map(struct inode *inode, struct nilfs_shadow_map *shadow); int nilfs_mdt_save_to_shadow_map(struct inode *inode); void nilfs_mdt_restore_from_shadow_map(struct inode *inode); void nilfs_mdt_clear_shadow_map(struct inode *inode); int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh); struct buffer_head *nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh); static inline void nilfs_mdt_mark_dirty(struct inode *inode) { if (!test_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state)) set_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state); } static inline void nilfs_mdt_clear_dirty(struct inode *inode) { 
	clear_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state);
}

static inline __u64 nilfs_mdt_cno(struct inode *inode)
{
	return ((struct the_nilfs *)inode->i_sb->s_fs_info)->ns_cno;
}

static inline spinlock_t *
nilfs_mdt_bgl_lock(struct inode *inode, unsigned int block_group)
{
	return bgl_lock_ptr(NILFS_MDT(inode)->mi_bgl, block_group);
}

#endif /* _NILFS_MDT_H */
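A hedged sketch of how a metadata file might read one of its blocks through the nilfs_mdt_get_block() prototype declared above (hypothetical function, error handling trimmed; assumes the usual mdt.h/buffer_head.h includes are in scope):

#include <linux/buffer_head.h>

/* Hypothetical reader: look up (but do not create) block @blkoff of @mdt_inode. */
static int example_read_mdt_block(struct inode *mdt_inode, unsigned long blkoff)
{
	struct buffer_head *bh;
	int err;

	err = nilfs_mdt_get_block(mdt_inode, blkoff, 0 /* !create */, NULL, &bh);
	if (err)
		return err;

	/* ... inspect bh->b_data here ... */

	brelse(bh);
	return 0;
}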
| 33 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM percpu #if !defined(_TRACE_PERCPU_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_PERCPU_H #include <linux/tracepoint.h> #include <trace/events/mmflags.h> TRACE_EVENT(percpu_alloc_percpu, TP_PROTO(unsigned long call_site, bool reserved, bool is_atomic, size_t size, size_t align, void *base_addr, int off, void __percpu *ptr, size_t bytes_alloc, gfp_t gfp_flags), TP_ARGS(call_site, reserved, is_atomic, size, align, base_addr, off, ptr, bytes_alloc, gfp_flags), TP_STRUCT__entry( __field( unsigned long, call_site ) __field( bool, reserved ) __field( bool, is_atomic ) __field( size_t, size ) __field( size_t, align ) __field( void *, base_addr ) __field( int, off ) __field( void __percpu *, ptr ) __field( size_t, bytes_alloc ) __field( unsigned long, gfp_flags ) ), TP_fast_assign( __entry->call_site = call_site; __entry->reserved = reserved; __entry->is_atomic = is_atomic; __entry->size = size; __entry->align = align; __entry->base_addr = base_addr; __entry->off = off; __entry->ptr = ptr; __entry->bytes_alloc = bytes_alloc; __entry->gfp_flags = (__force unsigned long)gfp_flags; ), TP_printk("call_site=%pS reserved=%d is_atomic=%d size=%zu align=%zu base_addr=%p off=%d ptr=%p bytes_alloc=%zu gfp_flags=%s", (void *)__entry->call_site, __entry->reserved, __entry->is_atomic, __entry->size, __entry->align, __entry->base_addr, __entry->off, __entry->ptr, __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags)) ); TRACE_EVENT(percpu_free_percpu, TP_PROTO(void *base_addr, int off, void __percpu *ptr), TP_ARGS(base_addr, off, ptr), TP_STRUCT__entry( __field( void *, base_addr ) __field( int, off ) __field( void __percpu *, ptr ) ), TP_fast_assign( __entry->base_addr = base_addr; __entry->off = off; __entry->ptr = ptr; ), TP_printk("base_addr=%p off=%d ptr=%p", __entry->base_addr, __entry->off, __entry->ptr) ); TRACE_EVENT(percpu_alloc_percpu_fail, TP_PROTO(bool reserved, bool is_atomic, size_t size, size_t align), TP_ARGS(reserved, is_atomic, size, align), TP_STRUCT__entry( __field( bool, reserved ) __field( bool, is_atomic ) __field( size_t, size ) __field( size_t, align ) ), TP_fast_assign( __entry->reserved = reserved; __entry->is_atomic = is_atomic; __entry->size = size; __entry->align = align; ), TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu", __entry->reserved, __entry->is_atomic, __entry->size, __entry->align) ); TRACE_EVENT(percpu_create_chunk, TP_PROTO(void *base_addr), TP_ARGS(base_addr), TP_STRUCT__entry( __field( void *, base_addr ) ), TP_fast_assign( __entry->base_addr = base_addr; ), TP_printk("base_addr=%p", __entry->base_addr) ); TRACE_EVENT(percpu_destroy_chunk, TP_PROTO(void *base_addr), TP_ARGS(base_addr), TP_STRUCT__entry( __field( void *, base_addr ) ), TP_fast_assign( __entry->base_addr = base_addr; ), TP_printk("base_addr=%p", __entry->base_addr) ); #endif /* _TRACE_PERCPU_H */ #include <trace/define_trace.h> |
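Each TRACE_EVENT() above also generates register/unregister helpers for attaching probes. A hedged sketch of hooking percpu_create_chunk (this assumes the tracepoint is reachable from the caller, e.g. built-in code, since it may not be exported to modules):

#include <linux/printk.h>
#include <linux/tracepoint.h>
#include <trace/events/percpu.h>

/* Probe signature: the void *data cookie first, then the TP_PROTO arguments. */
static void example_probe_create_chunk(void *data, void *base_addr)
{
	pr_info("new percpu chunk at %p\n", base_addr);
}

static int example_attach_probe(void)
{
	return register_trace_percpu_create_chunk(example_probe_create_chunk, NULL);
}

static void example_detach_probe(void)
{
	unregister_trace_percpu_create_chunk(example_probe_create_chunk, NULL);
	tracepoint_synchronize_unregister();
}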
| 17 16 17 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 | // SPDX-License-Identifier: GPL-2.0-or-later /* Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com> */ #include <net/ip.h> #include "ipvlan.h" static unsigned int ipvlan_netid __read_mostly; struct ipvlan_netns { unsigned int ipvl_nf_hook_refcnt; }; static struct ipvl_addr *ipvlan_skb_to_addr(struct sk_buff *skb, struct net_device *dev) { struct ipvl_addr *addr = NULL; struct ipvl_port *port; int addr_type; void *lyr3h; if (!dev || !netif_is_ipvlan_port(dev)) goto out; port = ipvlan_port_get_rcu(dev); if (!port || port->mode != IPVLAN_MODE_L3S) goto out; lyr3h = ipvlan_get_L3_hdr(port, skb, &addr_type); if (!lyr3h) goto out; addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true); out: return addr; } static struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, struct sk_buff *skb, u16 proto) { struct ipvl_addr *addr; struct net_device *sdev; addr = ipvlan_skb_to_addr(skb, dev); if (!addr) goto out; sdev = addr->master->dev; switch (proto) { case AF_INET: { const struct iphdr *ip4h = ip_hdr(skb); int err; err = ip_route_input_noref(skb, ip4h->daddr, ip4h->saddr, ip4h_dscp(ip4h), sdev); if (unlikely(err)) goto out; break; } #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { struct dst_entry *dst; struct ipv6hdr *ip6h = ipv6_hdr(skb); int flags = RT6_LOOKUP_F_HAS_SADDR; struct flowi6 fl6 = { .flowi6_iif = sdev->ifindex, .daddr = ip6h->daddr, .saddr = ip6h->saddr, .flowlabel = ip6_flowinfo(ip6h), .flowi6_mark = skb->mark, .flowi6_proto = ip6h->nexthdr, }; skb_dst_drop(skb); dst = ip6_route_input_lookup(dev_net(sdev), sdev, &fl6, skb, flags); skb_dst_set(skb, dst); break; } #endif default: break; } out: return skb; } static const struct l3mdev_ops ipvl_l3mdev_ops = { .l3mdev_l3_rcv = ipvlan_l3_rcv, }; static unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct ipvl_addr *addr; unsigned int len; addr = ipvlan_skb_to_addr(skb, skb->dev); if (!addr) goto out; skb->dev = addr->master->dev; skb->skb_iif = skb->dev->ifindex; #if IS_ENABLED(CONFIG_IPV6) if (addr->atype == IPVL_IPV6) IP6CB(skb)->iif = skb->dev->ifindex; #endif len = skb->len + ETH_HLEN; ipvlan_count_rx(addr->master, len, true, false); out: return NF_ACCEPT; } static const struct nf_hook_ops ipvl_nfops[] = { { .hook = ipvlan_nf_input, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = INT_MAX, }, #if IS_ENABLED(CONFIG_IPV6) { .hook = ipvlan_nf_input, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = INT_MAX, }, #endif }; static int ipvlan_register_nf_hook(struct net *net) { struct ipvlan_netns *vnet = net_generic(net, ipvlan_netid); int err = 0; if (!vnet->ipvl_nf_hook_refcnt) { err = nf_register_net_hooks(net, ipvl_nfops, ARRAY_SIZE(ipvl_nfops)); if (!err) 
vnet->ipvl_nf_hook_refcnt = 1; } else { vnet->ipvl_nf_hook_refcnt++; } return err; } static void ipvlan_unregister_nf_hook(struct net *net) { struct ipvlan_netns *vnet = net_generic(net, ipvlan_netid); if (WARN_ON(!vnet->ipvl_nf_hook_refcnt)) return; vnet->ipvl_nf_hook_refcnt--; if (!vnet->ipvl_nf_hook_refcnt) nf_unregister_net_hooks(net, ipvl_nfops, ARRAY_SIZE(ipvl_nfops)); } void ipvlan_migrate_l3s_hook(struct net *oldnet, struct net *newnet) { struct ipvlan_netns *old_vnet; ASSERT_RTNL(); old_vnet = net_generic(oldnet, ipvlan_netid); if (!old_vnet->ipvl_nf_hook_refcnt) return; ipvlan_register_nf_hook(newnet); ipvlan_unregister_nf_hook(oldnet); } static void ipvlan_ns_exit(struct net *net) { struct ipvlan_netns *vnet = net_generic(net, ipvlan_netid); if (WARN_ON_ONCE(vnet->ipvl_nf_hook_refcnt)) { vnet->ipvl_nf_hook_refcnt = 0; nf_unregister_net_hooks(net, ipvl_nfops, ARRAY_SIZE(ipvl_nfops)); } } static struct pernet_operations ipvlan_net_ops = { .id = &ipvlan_netid, .size = sizeof(struct ipvlan_netns), .exit = ipvlan_ns_exit, }; int ipvlan_l3s_init(void) { return register_pernet_subsys(&ipvlan_net_ops); } void ipvlan_l3s_cleanup(void) { unregister_pernet_subsys(&ipvlan_net_ops); } int ipvlan_l3s_register(struct ipvl_port *port) { struct net_device *dev = port->dev; int ret; ASSERT_RTNL(); ret = ipvlan_register_nf_hook(read_pnet(&port->pnet)); if (!ret) { dev->l3mdev_ops = &ipvl_l3mdev_ops; dev->priv_flags |= IFF_L3MDEV_RX_HANDLER; } return ret; } void ipvlan_l3s_unregister(struct ipvl_port *port) { struct net_device *dev = port->dev; ASSERT_RTNL(); dev->priv_flags &= ~IFF_L3MDEV_RX_HANDLER; ipvlan_unregister_nf_hook(read_pnet(&port->pnet)); } |
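ipvlan_l3s_register()/ipvlan_l3s_unregister() are meant to be called when a port switches into or out of L3S mode. A hedged sketch of that call pattern (hypothetical function mirroring how the main ipvlan driver is expected to use these helpers; assumes "ipvlan.h" is in scope):

#include "ipvlan.h"

/* Hypothetical mode switch, called under RTNL. */
static int example_set_port_mode(struct ipvl_port *port, u16 nval)
{
	int err = 0;

	ASSERT_RTNL();

	if (port->mode == nval)
		return 0;

	if (nval == IPVLAN_MODE_L3S)
		err = ipvlan_l3s_register(port);	/* adds the LOCAL_IN hooks */
	else if (port->mode == IPVLAN_MODE_L3S)
		ipvlan_l3s_unregister(port);		/* drops the hook refcount */

	if (!err)
		port->mode = nval;
	return err;
}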
/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM tcp #if !defined(_TRACE_TCP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_TCP_H #include <linux/ipv6.h> #include <linux/tcp.h> #include <linux/tracepoint.h> #include <net/ipv6.h> #include <net/tcp.h> #include
<linux/sock_diag.h> #include <net/rstreason.h> TRACE_EVENT(tcp_retransmit_skb, TP_PROTO(const struct sock *sk, const struct sk_buff *skb, int err), TP_ARGS(sk, skb, err), TP_STRUCT__entry( __field(const void *, skbaddr) __field(const void *, skaddr) __field(int, state) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) __field(int, err) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->skbaddr = skb; __entry->skaddr = sk; __entry->state = sk->sk_state; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); __entry->err = err; ), TP_printk("skbaddr=%p skaddr=%p family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s err=%d", __entry->skbaddr, __entry->skaddr, show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, show_tcp_state_name(__entry->state), __entry->err) ); #undef FN #define FN(reason) TRACE_DEFINE_ENUM(SK_RST_REASON_##reason); DEFINE_RST_REASON(FN, FN) #undef FN #undef FNe #define FN(reason) { SK_RST_REASON_##reason, #reason }, #define FNe(reason) { SK_RST_REASON_##reason, #reason } /* * skb of trace_tcp_send_reset is the skb that caused RST. In case of * active reset, skb should be NULL */ TRACE_EVENT(tcp_send_reset, TP_PROTO(const struct sock *sk, const struct sk_buff *skb__nullable, const enum sk_rst_reason reason), TP_ARGS(sk, skb__nullable, reason), TP_STRUCT__entry( __field(const void *, skbaddr) __field(const void *, skaddr) __field(int, state) __field(enum sk_rst_reason, reason) __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->skbaddr = skb__nullable; __entry->skaddr = sk; /* Zero means unknown state. */ __entry->state = sk ? sk->sk_state : 0; memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); if (sk && sk_fullsock(sk)) { const struct inet_sock *inet = inet_sk(sk); TP_STORE_ADDR_PORTS(__entry, inet, sk); } else if (skb__nullable) { const struct tcphdr *th = (const struct tcphdr *)skb__nullable->data; /* * We should reverse the 4-tuple of skb, so later * it can print the right flow direction of rst. */ TP_STORE_ADDR_PORTS_SKB(skb__nullable, th, entry->daddr, entry->saddr); } __entry->reason = reason; ), TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s reason=%s", __entry->skbaddr, __entry->skaddr, __entry->saddr, __entry->daddr, __entry->state ? show_tcp_state_name(__entry->state) : "UNKNOWN", __print_symbolic(__entry->reason, DEFINE_RST_REASON(FN, FNe))) ); #undef FN #undef FNe /* * tcp event with arguments sk * * Note: this class requires a valid sk pointer. 
*/ DECLARE_EVENT_CLASS(tcp_event_sk, TP_PROTO(struct sock *sk), TP_ARGS(sk), TP_STRUCT__entry( __field(const void *, skaddr) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) __field(__u64, sock_cookie) ), TP_fast_assign( struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->skaddr = sk; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); __entry->sock_cookie = sock_gen_cookie(sk); ), TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c sock_cookie=%llx", show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, __entry->sock_cookie) ); DEFINE_EVENT(tcp_event_sk, tcp_receive_reset, TP_PROTO(struct sock *sk), TP_ARGS(sk) ); DEFINE_EVENT(tcp_event_sk, tcp_destroy_sock, TP_PROTO(struct sock *sk), TP_ARGS(sk) ); DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust, TP_PROTO(struct sock *sk), TP_ARGS(sk) ); TRACE_EVENT(tcp_rcvbuf_grow, TP_PROTO(struct sock *sk, int time), TP_ARGS(sk, time), TP_STRUCT__entry( __field(int, time) __field(__u32, rtt_us) __field(__u32, copied) __field(__u32, inq) __field(__u32, space) __field(__u32, ooo_space) __field(__u32, rcvbuf) __field(__u32, rcv_ssthresh) __field(__u32, window_clamp) __field(__u32, rcv_wnd) __field(__u8, scaling_ratio) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) __field(const void *, skaddr) __field(__u64, sock_cookie) ), TP_fast_assign( struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); __be32 *p32; __entry->time = time; __entry->rtt_us = tp->rcv_rtt_est.rtt_us >> 3; __entry->copied = tp->copied_seq - tp->rcvq_space.seq; __entry->inq = tp->rcv_nxt - tp->copied_seq; __entry->space = tp->rcvq_space.space; __entry->ooo_space = RB_EMPTY_ROOT(&tp->out_of_order_queue) ? 
0 : TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt; __entry->rcvbuf = sk->sk_rcvbuf; __entry->rcv_ssthresh = tp->rcv_ssthresh; __entry->window_clamp = tp->window_clamp; __entry->rcv_wnd = tp->rcv_wnd; __entry->scaling_ratio = tp->scaling_ratio; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); __entry->skaddr = sk; __entry->sock_cookie = sock_gen_cookie(sk); ), TP_printk("time=%u rtt_us=%u copied=%u inq=%u space=%u ooo=%u scaling_ratio=%u rcvbuf=%u " "rcv_ssthresh=%u window_clamp=%u rcv_wnd=%u " "family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 " "saddrv6=%pI6c daddrv6=%pI6c skaddr=%p sock_cookie=%llx", __entry->time, __entry->rtt_us, __entry->copied, __entry->inq, __entry->space, __entry->ooo_space, __entry->scaling_ratio, __entry->rcvbuf, __entry->rcv_ssthresh, __entry->window_clamp, __entry->rcv_wnd, show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, __entry->skaddr, __entry->sock_cookie) ); TRACE_EVENT(tcp_retransmit_synack, TP_PROTO(const struct sock *sk, const struct request_sock *req), TP_ARGS(sk, req), TP_STRUCT__entry( __field(const void *, skaddr) __field(const void *, req) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( const struct inet_request_sock *ireq = inet_rsk(req); __be32 *p32; __entry->skaddr = sk; __entry->req = req; __entry->sport = ireq->ir_num; __entry->dport = ntohs(ireq->ir_rmt_port); __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = ireq->ir_loc_addr; p32 = (__be32 *) __entry->daddr; *p32 = ireq->ir_rmt_addr; TP_STORE_ADDRS(__entry, ireq->ir_loc_addr, ireq->ir_rmt_addr, ireq->ir_v6_loc_addr, ireq->ir_v6_rmt_addr); ), TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c", show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6) ); TRACE_EVENT(tcp_sendmsg_locked, TP_PROTO(const struct sock *sk, const struct msghdr *msg, const struct sk_buff *skb, int size_goal), TP_ARGS(sk, msg, skb, size_goal), TP_STRUCT__entry( __field(const void *, skb_addr) __field(int, skb_len) __field(int, msg_left) __field(int, size_goal) ), TP_fast_assign( __entry->skb_addr = skb; __entry->skb_len = skb ? 
skb->len : 0; __entry->msg_left = msg_data_left(msg); __entry->size_goal = size_goal; ), TP_printk("skb_addr %p skb_len %d msg_left %d size_goal %d", __entry->skb_addr, __entry->skb_len, __entry->msg_left, __entry->size_goal)); DECLARE_TRACE(tcp_cwnd_reduction, TP_PROTO(const struct sock *sk, int newly_acked_sacked, int newly_lost, int flag), TP_ARGS(sk, newly_acked_sacked, newly_lost, flag) ); #include <trace/events/net_probe_common.h> TRACE_EVENT(tcp_probe, TP_PROTO(struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( /* sockaddr_in6 is always bigger than sockaddr_in */ __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u32, mark) __field(__u16, data_len) __field(__u32, snd_nxt) __field(__u32, snd_una) __field(__u32, snd_cwnd) __field(__u32, ssthresh) __field(__u32, snd_wnd) __field(__u32, srtt) __field(__u32, rcv_wnd) __field(__u64, sock_cookie) __field(const void *, skbaddr) __field(const void *, skaddr) ), TP_fast_assign( const struct tcphdr *th = (const struct tcphdr *)skb->data; const struct inet_sock *inet = inet_sk(sk); const struct tcp_sock *tp = tcp_sk(sk); memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS(__entry, inet, sk); /* For filtering use */ __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->mark = skb->mark; __entry->family = sk->sk_family; __entry->data_len = skb->len - __tcp_hdrlen(th); __entry->snd_nxt = tp->snd_nxt; __entry->snd_una = tp->snd_una; __entry->snd_cwnd = tcp_snd_cwnd(tp); __entry->snd_wnd = tp->snd_wnd; __entry->rcv_wnd = tp->rcv_wnd; __entry->ssthresh = tcp_current_ssthresh(sk); __entry->srtt = tp->srtt_us >> 3; __entry->sock_cookie = sock_gen_cookie(sk); __entry->skbaddr = skb; __entry->skaddr = sk; ), TP_printk("family=%s src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx skbaddr=%p skaddr=%p", show_family_name(__entry->family), __entry->saddr, __entry->daddr, __entry->mark, __entry->data_len, __entry->snd_nxt, __entry->snd_una, __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd, __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie, __entry->skbaddr, __entry->skaddr) ); /* * tcp event with only skb */ DECLARE_EVENT_CLASS(tcp_event_skb, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb), TP_STRUCT__entry( __field(const void *, skbaddr) __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( const struct tcphdr *th = (const struct tcphdr *)skb->data; __entry->skbaddr = skb; memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS_SKB(skb, th, __entry->saddr, __entry->daddr); ), TP_printk("skbaddr=%p src=%pISpc dest=%pISpc", __entry->skbaddr, __entry->saddr, __entry->daddr) ); DEFINE_EVENT(tcp_event_skb, tcp_bad_csum, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); TRACE_EVENT(tcp_cong_state_set, TP_PROTO(struct sock *sk, const u8 ca_state), TP_ARGS(sk, ca_state), TP_STRUCT__entry( __field(const void *, skaddr) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) __field(__u8, cong_state) ), TP_fast_assign( struct inet_sock *inet = 
inet_sk(sk); __be32 *p32; __entry->skaddr = sk; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); __entry->cong_state = ca_state; ), TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c cong_state=%u", show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, __entry->cong_state) ); DECLARE_EVENT_CLASS(tcp_hash_event, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( __field(__u64, net_cookie) __field(const void *, skbaddr) __field(const void *, skaddr) __field(int, state) /* sockaddr_in6 is always bigger than sockaddr_in */ __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) __field(int, l3index) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(bool, fin) __field(bool, syn) __field(bool, rst) __field(bool, psh) __field(bool, ack) ), TP_fast_assign( const struct tcphdr *th = (const struct tcphdr *)skb->data; __entry->net_cookie = sock_net(sk)->net_cookie; __entry->skbaddr = skb; __entry->skaddr = sk; __entry->state = sk->sk_state; memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS_SKB(skb, th, __entry->saddr, __entry->daddr); __entry->l3index = inet_sdif(skb) ? inet_iif(skb) : 0; /* For filtering use */ __entry->sport = ntohs(th->source); __entry->dport = ntohs(th->dest); __entry->family = sk->sk_family; __entry->fin = th->fin; __entry->syn = th->syn; __entry->rst = th->rst; __entry->psh = th->psh; __entry->ack = th->ack; ), TP_printk("net=%llu state=%s family=%s src=%pISpc dest=%pISpc L3index=%d [%c%c%c%c%c]", __entry->net_cookie, show_tcp_state_name(__entry->state), show_family_name(__entry->family), __entry->saddr, __entry->daddr, __entry->l3index, __entry->fin ? 'F' : ' ', __entry->syn ? 'S' : ' ', __entry->rst ? 'R' : ' ', __entry->psh ? 'P' : ' ', __entry->ack ? '.' 
: ' ') ); DEFINE_EVENT(tcp_hash_event, tcp_hash_bad_header, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb) ); DEFINE_EVENT(tcp_hash_event, tcp_hash_md5_required, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb) ); DEFINE_EVENT(tcp_hash_event, tcp_hash_md5_unexpected, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb) ); DEFINE_EVENT(tcp_hash_event, tcp_hash_md5_mismatch, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb) ); DEFINE_EVENT(tcp_hash_event, tcp_hash_ao_required, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb) ); DECLARE_EVENT_CLASS(tcp_ao_event, TP_PROTO(const struct sock *sk, const struct sk_buff *skb, const __u8 keyid, const __u8 rnext, const __u8 maclen), TP_ARGS(sk, skb, keyid, rnext, maclen), TP_STRUCT__entry( __field(__u64, net_cookie) __field(const void *, skbaddr) __field(const void *, skaddr) __field(int, state) /* sockaddr_in6 is always bigger than sockaddr_in */ __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) __field(int, l3index) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(bool, fin) __field(bool, syn) __field(bool, rst) __field(bool, psh) __field(bool, ack) __field(__u8, keyid) __field(__u8, rnext) __field(__u8, maclen) ), TP_fast_assign( const struct tcphdr *th = (const struct tcphdr *)skb->data; __entry->net_cookie = sock_net(sk)->net_cookie; __entry->skbaddr = skb; __entry->skaddr = sk; __entry->state = sk->sk_state; memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS_SKB(skb, th, __entry->saddr, __entry->daddr); __entry->l3index = inet_sdif(skb) ? inet_iif(skb) : 0; /* For filtering use */ __entry->sport = ntohs(th->source); __entry->dport = ntohs(th->dest); __entry->family = sk->sk_family; __entry->fin = th->fin; __entry->syn = th->syn; __entry->rst = th->rst; __entry->psh = th->psh; __entry->ack = th->ack; __entry->keyid = keyid; __entry->rnext = rnext; __entry->maclen = maclen; ), TP_printk("net=%llu state=%s family=%s src=%pISpc dest=%pISpc L3index=%d [%c%c%c%c%c] keyid=%u rnext=%u maclen=%u", __entry->net_cookie, show_tcp_state_name(__entry->state), show_family_name(__entry->family), __entry->saddr, __entry->daddr, __entry->l3index, __entry->fin ? 'F' : ' ', __entry->syn ? 'S' : ' ', __entry->rst ? 'R' : ' ', __entry->psh ? 'P' : ' ', __entry->ack ? '.' 
: ' ', __entry->keyid, __entry->rnext, __entry->maclen) ); DEFINE_EVENT(tcp_ao_event, tcp_ao_handshake_failure, TP_PROTO(const struct sock *sk, const struct sk_buff *skb, const __u8 keyid, const __u8 rnext, const __u8 maclen), TP_ARGS(sk, skb, keyid, rnext, maclen) ); #ifdef CONFIG_TCP_AO DEFINE_EVENT(tcp_ao_event, tcp_ao_wrong_maclen, TP_PROTO(const struct sock *sk, const struct sk_buff *skb, const __u8 keyid, const __u8 rnext, const __u8 maclen), TP_ARGS(sk, skb, keyid, rnext, maclen) ); DEFINE_EVENT(tcp_ao_event, tcp_ao_mismatch, TP_PROTO(const struct sock *sk, const struct sk_buff *skb, const __u8 keyid, const __u8 rnext, const __u8 maclen), TP_ARGS(sk, skb, keyid, rnext, maclen) ); DEFINE_EVENT(tcp_ao_event, tcp_ao_key_not_found, TP_PROTO(const struct sock *sk, const struct sk_buff *skb, const __u8 keyid, const __u8 rnext, const __u8 maclen), TP_ARGS(sk, skb, keyid, rnext, maclen) ); DEFINE_EVENT(tcp_ao_event, tcp_ao_rnext_request, TP_PROTO(const struct sock *sk, const struct sk_buff *skb, const __u8 keyid, const __u8 rnext, const __u8 maclen), TP_ARGS(sk, skb, keyid, rnext, maclen) ); DECLARE_EVENT_CLASS(tcp_ao_event_sk, TP_PROTO(const struct sock *sk, const __u8 keyid, const __u8 rnext), TP_ARGS(sk, keyid, rnext), TP_STRUCT__entry( __field(__u64, net_cookie) __field(const void *, skaddr) __field(int, state) /* sockaddr_in6 is always bigger than sockaddr_in */ __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u8, keyid) __field(__u8, rnext) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); __entry->net_cookie = sock_net(sk)->net_cookie; __entry->skaddr = sk; __entry->state = sk->sk_state; memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS(__entry, inet, sk); /* For filtering use */ __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->family = sk->sk_family; __entry->keyid = keyid; __entry->rnext = rnext; ), TP_printk("net=%llu state=%s family=%s src=%pISpc dest=%pISpc keyid=%u rnext=%u", __entry->net_cookie, show_tcp_state_name(__entry->state), show_family_name(__entry->family), __entry->saddr, __entry->daddr, __entry->keyid, __entry->rnext) ); DEFINE_EVENT(tcp_ao_event_sk, tcp_ao_synack_no_key, TP_PROTO(const struct sock *sk, const __u8 keyid, const __u8 rnext), TP_ARGS(sk, keyid, rnext) ); DECLARE_EVENT_CLASS(tcp_ao_event_sne, TP_PROTO(const struct sock *sk, __u32 new_sne), TP_ARGS(sk, new_sne), TP_STRUCT__entry( __field(__u64, net_cookie) __field(const void *, skaddr) __field(int, state) /* sockaddr_in6 is always bigger than sockaddr_in */ __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u32, new_sne) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); __entry->net_cookie = sock_net(sk)->net_cookie; __entry->skaddr = sk; __entry->state = sk->sk_state; memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS(__entry, inet, sk); /* For filtering use */ __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->family = sk->sk_family; __entry->new_sne = new_sne; ), TP_printk("net=%llu state=%s family=%s src=%pISpc dest=%pISpc sne=%u", __entry->net_cookie, 
show_tcp_state_name(__entry->state), show_family_name(__entry->family), __entry->saddr, __entry->daddr, __entry->new_sne) ); DEFINE_EVENT(tcp_ao_event_sne, tcp_ao_snd_sne_update, TP_PROTO(const struct sock *sk, __u32 new_sne), TP_ARGS(sk, new_sne) ); DEFINE_EVENT(tcp_ao_event_sne, tcp_ao_rcv_sne_update, TP_PROTO(const struct sock *sk, __u32 new_sne), TP_ARGS(sk, new_sne) ); #endif /* CONFIG_TCP_AO */ #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
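Illustrative usage note (not part of the trace header above): every TRACE_EVENT()/DEFINE_EVENT() definition in this file generates a static inline trace_<name>() helper that the TCP code calls at the matching point, and the call is effectively a no-op unless the tracepoint is enabled. The sketch below only shows the calling convention; tcp_example_destroy() is a hypothetical function, not part of the kernel.

/* Sketch only: how a TCP path would fire the tcp_destroy_sock event defined above. */
static inline void tcp_example_destroy(struct sock *sk)
{
	/* No-op unless the tracepoint is enabled via tracefs, perf, or a BPF program. */
	trace_tcp_destroy_sock(sk);

	/*
	 * Events with extra arguments follow their TP_PROTO() signature, e.g.
	 * trace_tcp_rcvbuf_grow(sk, time) for the tcp_rcvbuf_grow event above.
	 */
}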
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_CPUMASK_H #define __LINUX_CPUMASK_H /* * Cpumasks provide a bitmap suitable for representing the * set of CPUs in a system, one bit position per CPU number. In general, * only nr_cpu_ids (<= NR_CPUS) bits are valid. */ #include <linux/atomic.h> #include <linux/bitmap.h> #include <linux/cleanup.h> #include <linux/cpumask_types.h> #include <linux/gfp_types.h> #include <linux/numa.h> #include <linux/threads.h> #include <linux/types.h> #include <asm/bug.h> /** * cpumask_pr_args - printf args to output a cpumask * @maskp: cpumask to be printed * * Can be used to provide arguments for '%*pb[l]' when printing a cpumask.
*/ #define cpumask_pr_args(maskp) nr_cpu_ids, cpumask_bits(maskp) #if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS) #define nr_cpu_ids ((unsigned int)NR_CPUS) #else extern unsigned int nr_cpu_ids; #endif static __always_inline void set_nr_cpu_ids(unsigned int nr) { #if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS) WARN_ON(nr != nr_cpu_ids); #else nr_cpu_ids = nr; #endif } /* * We have several different "preferred sizes" for the cpumask * operations, depending on operation. * * For example, the bitmap scanning and operating operations have * optimized routines that work for the single-word case, but only when * the size is constant. So if NR_CPUS fits in one single word, we are * better off using that small constant, in order to trigger the * optimized bit finding. That is 'small_cpumask_size'. * * The clearing and copying operations will similarly perform better * with a constant size, but we limit that size arbitrarily to four * words. We call this 'large_cpumask_size'. * * Finally, some operations just want the exact limit, either because * they set bits or just don't have any faster fixed-sized versions. We * call this just 'nr_cpumask_bits'. * * Note that these optional constants are always guaranteed to be at * least as big as 'nr_cpu_ids' itself is, and all our cpumask * allocations are at least that size (see cpumask_size()). The * optimization comes from being able to potentially use a compile-time * constant instead of a run-time generated exact number of CPUs. */ #if NR_CPUS <= BITS_PER_LONG #define small_cpumask_bits ((unsigned int)NR_CPUS) #define large_cpumask_bits ((unsigned int)NR_CPUS) #elif NR_CPUS <= 4*BITS_PER_LONG #define small_cpumask_bits nr_cpu_ids #define large_cpumask_bits ((unsigned int)NR_CPUS) #else #define small_cpumask_bits nr_cpu_ids #define large_cpumask_bits nr_cpu_ids #endif #define nr_cpumask_bits nr_cpu_ids /* * The following particular system cpumasks and operations manage * possible, present, active and online cpus. * * cpu_possible_mask- has bit 'cpu' set iff cpu is populatable * cpu_present_mask - has bit 'cpu' set iff cpu is populated * cpu_enabled_mask - has bit 'cpu' set iff cpu can be brought online * cpu_online_mask - has bit 'cpu' set iff cpu available to scheduler * cpu_active_mask - has bit 'cpu' set iff cpu available to migration * * If !CONFIG_HOTPLUG_CPU, present == possible, and active == online. * * The cpu_possible_mask is fixed at boot time, as the set of CPU IDs * that it is possible might ever be plugged in at anytime during the * life of that system boot. The cpu_present_mask is dynamic(*), * representing which CPUs are currently plugged in. And * cpu_online_mask is the dynamic subset of cpu_present_mask, * indicating those CPUs available for scheduling. * * If HOTPLUG is enabled, then cpu_present_mask varies dynamically, * depending on what ACPI reports as currently plugged in, otherwise * cpu_present_mask is just a copy of cpu_possible_mask. * * (*) Well, cpu_present_mask is dynamic in the hotplug case. If not * hotplug, it's a copy of cpu_possible_mask, hence fixed at boot. * * Subtleties: * 1) UP ARCHes (NR_CPUS == 1, CONFIG_SMP not defined) hardcode * assumption that their single CPU is online. The UP * cpu_{online,possible,present}_masks are placebos. Changing them * will have no useful affect on the following num_*_cpus() * and cpu_*() macros in the UP case. 
This ugliness is a UP * optimization - don't waste any instructions or memory references * asking if you're online or how many CPUs there are if there is * only one CPU. */ extern struct cpumask __cpu_possible_mask; extern struct cpumask __cpu_online_mask; extern struct cpumask __cpu_enabled_mask; extern struct cpumask __cpu_present_mask; extern struct cpumask __cpu_active_mask; extern struct cpumask __cpu_dying_mask; #define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask) #define cpu_online_mask ((const struct cpumask *)&__cpu_online_mask) #define cpu_enabled_mask ((const struct cpumask *)&__cpu_enabled_mask) #define cpu_present_mask ((const struct cpumask *)&__cpu_present_mask) #define cpu_active_mask ((const struct cpumask *)&__cpu_active_mask) #define cpu_dying_mask ((const struct cpumask *)&__cpu_dying_mask) extern atomic_t __num_online_cpus; extern unsigned int __num_possible_cpus; extern cpumask_t cpus_booted_once_mask; static __always_inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits) { #ifdef CONFIG_DEBUG_PER_CPU_MAPS WARN_ON_ONCE(cpu >= bits); #endif /* CONFIG_DEBUG_PER_CPU_MAPS */ } /* verify cpu argument to cpumask_* operators */ static __always_inline unsigned int cpumask_check(unsigned int cpu) { cpu_max_bits_warn(cpu, small_cpumask_bits); return cpu; } /** * cpumask_first - get the first cpu in a cpumask * @srcp: the cpumask pointer * * Return: >= nr_cpu_ids if no cpus set. */ static __always_inline unsigned int cpumask_first(const struct cpumask *srcp) { return find_first_bit(cpumask_bits(srcp), small_cpumask_bits); } /** * cpumask_first_zero - get the first unset cpu in a cpumask * @srcp: the cpumask pointer * * Return: >= nr_cpu_ids if all cpus are set. */ static __always_inline unsigned int cpumask_first_zero(const struct cpumask *srcp) { return find_first_zero_bit(cpumask_bits(srcp), small_cpumask_bits); } /** * cpumask_first_and - return the first cpu from *srcp1 & *srcp2 * @srcp1: the first input * @srcp2: the second input * * Return: >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and(). */ static __always_inline unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2) { return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); } /** * cpumask_first_andnot - return the first cpu from *srcp1 & ~*srcp2 * @srcp1: the first input * @srcp2: the second input * * Return: >= nr_cpu_ids if no such cpu found. */ static __always_inline unsigned int cpumask_first_andnot(const struct cpumask *srcp1, const struct cpumask *srcp2) { return find_first_andnot_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); } /** * cpumask_first_and_and - return the first cpu from *srcp1 & *srcp2 & *srcp3 * @srcp1: the first input * @srcp2: the second input * @srcp3: the third input * * Return: >= nr_cpu_ids if no cpus set in all. */ static __always_inline unsigned int cpumask_first_and_and(const struct cpumask *srcp1, const struct cpumask *srcp2, const struct cpumask *srcp3) { return find_first_and_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), cpumask_bits(srcp3), small_cpumask_bits); } /** * cpumask_last - get the last CPU in a cpumask * @srcp: - the cpumask pointer * * Return: >= nr_cpumask_bits if no CPUs set. */ static __always_inline unsigned int cpumask_last(const struct cpumask *srcp) { return find_last_bit(cpumask_bits(srcp), small_cpumask_bits); } /** * cpumask_next - get the next cpu in a cpumask * @n: the cpu prior to the place to search (i.e. 
return will be > @n) * @srcp: the cpumask pointer * * Return: >= nr_cpu_ids if no further cpus set. */ static __always_inline unsigned int cpumask_next(int n, const struct cpumask *srcp) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_bit(cpumask_bits(srcp), small_cpumask_bits, n + 1); } /** * cpumask_next_zero - get the next unset cpu in a cpumask * @n: the cpu prior to the place to search (i.e. return will be > @n) * @srcp: the cpumask pointer * * Return: >= nr_cpu_ids if no further cpus unset. */ static __always_inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_zero_bit(cpumask_bits(srcp), small_cpumask_bits, n+1); } #if NR_CPUS == 1 /* Uniprocessor: there is only one valid CPU */ static __always_inline unsigned int cpumask_local_spread(unsigned int i, int node) { return 0; } static __always_inline unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p) { return cpumask_first_and(src1p, src2p); } static __always_inline unsigned int cpumask_any_distribute(const struct cpumask *srcp) { return cpumask_first(srcp); } #else unsigned int cpumask_local_spread(unsigned int i, int node); unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p); unsigned int cpumask_any_distribute(const struct cpumask *srcp); #endif /* NR_CPUS */ /** * cpumask_next_and - get the next cpu in *src1p & *src2p * @n: the cpu prior to the place to search (i.e. return will be > @n) * @src1p: the first cpumask pointer * @src2p: the second cpumask pointer * * Return: >= nr_cpu_ids if no further cpus set in both. */ static __always_inline unsigned int cpumask_next_and(int n, const struct cpumask *src1p, const struct cpumask *src2p) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits, n + 1); } /** * cpumask_next_andnot - get the next cpu in *src1p & ~*src2p * @n: the cpu prior to the place to search (i.e. return will be > @n) * @src1p: the first cpumask pointer * @src2p: the second cpumask pointer * * Return: >= nr_cpu_ids if no further cpus set in both. */ static __always_inline unsigned int cpumask_next_andnot(int n, const struct cpumask *src1p, const struct cpumask *src2p) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_andnot_bit(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits, n + 1); } /** * cpumask_next_and_wrap - get the next cpu in *src1p & *src2p, starting from * @n+1. If nothing found, wrap around and start from * the beginning * @n: the cpu prior to the place to search (i.e. search starts from @n+1) * @src1p: the first cpumask pointer * @src2p: the second cpumask pointer * * Return: next set bit, wrapped if needed, or >= nr_cpu_ids if @src1p & @src2p is empty. */ static __always_inline unsigned int cpumask_next_and_wrap(int n, const struct cpumask *src1p, const struct cpumask *src2p) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_and_bit_wrap(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits, n + 1); } /** * cpumask_next_wrap - get the next cpu in *src, starting from @n+1. If nothing * found, wrap around and start from the beginning * @n: the cpu prior to the place to search (i.e. search starts from @n+1) * @src: cpumask pointer * * Return: next set bit, wrapped if needed, or >= nr_cpu_ids if @src is empty. 
*/ static __always_inline unsigned int cpumask_next_wrap(int n, const struct cpumask *src) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_bit_wrap(cpumask_bits(src), small_cpumask_bits, n + 1); } /** * cpumask_random - get random cpu in *src. * @src: cpumask pointer * * Return: random set bit, or >= nr_cpu_ids if @src is empty. */ static __always_inline unsigned int cpumask_random(const struct cpumask *src) { return find_random_bit(cpumask_bits(src), nr_cpu_ids); } /** * for_each_cpu - iterate over every cpu in a mask * @cpu: the (optionally unsigned) integer iterator * @mask: the cpumask pointer * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu(cpu, mask) \ for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits) /** * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location * @cpu: the (optionally unsigned) integer iterator * @mask: the cpumask pointer * @start: the start location * * The implementation does not assume any bit in @mask is set (including @start). * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_wrap(cpu, mask, start) \ for_each_set_bit_wrap(cpu, cpumask_bits(mask), small_cpumask_bits, start) /** * for_each_cpu_and - iterate over every cpu in both masks * @cpu: the (optionally unsigned) integer iterator * @mask1: the first cpumask pointer * @mask2: the second cpumask pointer * * This saves a temporary CPU mask in many places. It is equivalent to: * struct cpumask tmp; * cpumask_and(&tmp, &mask1, &mask2); * for_each_cpu(cpu, &tmp) * ... * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_and(cpu, mask1, mask2) \ for_each_and_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits) /** * for_each_cpu_andnot - iterate over every cpu present in one mask, excluding * those present in another. * @cpu: the (optionally unsigned) integer iterator * @mask1: the first cpumask pointer * @mask2: the second cpumask pointer * * This saves a temporary CPU mask in many places. It is equivalent to: * struct cpumask tmp; * cpumask_andnot(&tmp, &mask1, &mask2); * for_each_cpu(cpu, &tmp) * ... * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_andnot(cpu, mask1, mask2) \ for_each_andnot_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits) /** * for_each_cpu_or - iterate over every cpu present in either mask * @cpu: the (optionally unsigned) integer iterator * @mask1: the first cpumask pointer * @mask2: the second cpumask pointer * * This saves a temporary CPU mask in many places. It is equivalent to: * struct cpumask tmp; * cpumask_or(&tmp, &mask1, &mask2); * for_each_cpu(cpu, &tmp) * ... * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_or(cpu, mask1, mask2) \ for_each_or_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits) /** * for_each_cpu_from - iterate over CPUs present in @mask, from @cpu to the end of @mask. * @cpu: the (optionally unsigned) integer iterator * @mask: the cpumask pointer * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_from(cpu, mask) \ for_each_set_bit_from(cpu, cpumask_bits(mask), small_cpumask_bits) /** * cpumask_any_but - return an arbitrary cpu in a cpumask, but not this one. * @mask: the cpumask to search * @cpu: the cpu to ignore. * * Often used to find any cpu but smp_processor_id() in a mask. * If @cpu == -1, the function is equivalent to cpumask_any(). * Return: >= nr_cpu_ids if no cpus set. 
*/ static __always_inline unsigned int cpumask_any_but(const struct cpumask *mask, int cpu) { unsigned int i; /* -1 is a legal arg here. */ if (cpu != -1) cpumask_check(cpu); for_each_cpu(i, mask) if (i != cpu) break; return i; } /** * cpumask_any_and_but - pick an arbitrary cpu from *mask1 & *mask2, but not this one. * @mask1: the first input cpumask * @mask2: the second input cpumask * @cpu: the cpu to ignore * * If @cpu == -1, the function is equivalent to cpumask_any_and(). * Returns >= nr_cpu_ids if no cpus set. */ static __always_inline unsigned int cpumask_any_and_but(const struct cpumask *mask1, const struct cpumask *mask2, int cpu) { unsigned int i; /* -1 is a legal arg here. */ if (cpu != -1) cpumask_check(cpu); i = cpumask_first_and(mask1, mask2); if (i != cpu) return i; return cpumask_next_and(cpu, mask1, mask2); } /** * cpumask_any_andnot_but - pick an arbitrary cpu from *mask1 & ~*mask2, but not this one. * @mask1: the first input cpumask * @mask2: the second input cpumask * @cpu: the cpu to ignore * * If @cpu == -1, the function returns the first matching cpu. * Returns >= nr_cpu_ids if no cpus set. */ static __always_inline unsigned int cpumask_any_andnot_but(const struct cpumask *mask1, const struct cpumask *mask2, int cpu) { unsigned int i; /* -1 is a legal arg here. */ if (cpu != -1) cpumask_check(cpu); i = cpumask_first_andnot(mask1, mask2); if (i != cpu) return i; return cpumask_next_andnot(cpu, mask1, mask2); } /** * cpumask_nth - get the Nth cpu in a cpumask * @srcp: the cpumask pointer * @cpu: the Nth cpu to find, starting from 0 * * Return: >= nr_cpu_ids if such cpu doesn't exist. */ static __always_inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp) { return find_nth_bit(cpumask_bits(srcp), small_cpumask_bits, cpumask_check(cpu)); } /** * cpumask_nth_and - get the Nth cpu in 2 cpumasks * @srcp1: the cpumask pointer * @srcp2: the cpumask pointer * @cpu: the Nth cpu to find, starting from 0 * * Return: >= nr_cpu_ids if such cpu doesn't exist. */ static __always_inline unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1, const struct cpumask *srcp2) { return find_nth_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits, cpumask_check(cpu)); } /** * cpumask_nth_and_andnot - get the Nth cpu set in 1st and 2nd cpumask, and clear in 3rd. * @srcp1: the cpumask pointer * @srcp2: the cpumask pointer * @srcp3: the cpumask pointer * @cpu: the Nth cpu to find, starting from 0 * * Return: >= nr_cpu_ids if such cpu doesn't exist. */ static __always_inline unsigned int cpumask_nth_and_andnot(unsigned int cpu, const struct cpumask *srcp1, const struct cpumask *srcp2, const struct cpumask *srcp3) { return find_nth_and_andnot_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), cpumask_bits(srcp3), small_cpumask_bits, cpumask_check(cpu)); } #define CPU_BITS_NONE \ { \ [0 ... 
BITS_TO_LONGS(NR_CPUS)-1] = 0UL \ } #define CPU_BITS_CPU0 \ { \ [0] = 1UL \ } /** * cpumask_set_cpu - set a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @dstp: the cpumask pointer */ static __always_inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) { set_bit(cpumask_check(cpu), cpumask_bits(dstp)); } static __always_inline void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) { __set_bit(cpumask_check(cpu), cpumask_bits(dstp)); } /** * cpumask_clear_cpus - clear cpus in a cpumask * @dstp: the cpumask pointer * @cpu: cpu number (< nr_cpu_ids) * @ncpus: number of cpus to clear (< nr_cpu_ids) */ static __always_inline void cpumask_clear_cpus(struct cpumask *dstp, unsigned int cpu, unsigned int ncpus) { cpumask_check(cpu + ncpus - 1); bitmap_clear(cpumask_bits(dstp), cpumask_check(cpu), ncpus); } /** * cpumask_clear_cpu - clear a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @dstp: the cpumask pointer */ static __always_inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp) { clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); } static __always_inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp) { __clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); } /** * cpumask_test_cpu - test for a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @cpumask: the cpumask pointer * * Return: true if @cpu is set in @cpumask, else returns false */ static __always_inline bool cpumask_test_cpu(int cpu, const struct cpumask *cpumask) { return test_bit(cpumask_check(cpu), cpumask_bits((cpumask))); } /** * cpumask_test_and_set_cpu - atomically test and set a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @cpumask: the cpumask pointer * * test_and_set_bit wrapper for cpumasks. * * Return: true if @cpu is set in old bitmap of @cpumask, else returns false */ static __always_inline bool cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask) { return test_and_set_bit(cpumask_check(cpu), cpumask_bits(cpumask)); } /** * cpumask_test_and_clear_cpu - atomically test and clear a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @cpumask: the cpumask pointer * * test_and_clear_bit wrapper for cpumasks. 
* * Return: true if @cpu is set in old bitmap of @cpumask, else returns false */ static __always_inline bool cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask) { return test_and_clear_bit(cpumask_check(cpu), cpumask_bits(cpumask)); } /** * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask * @dstp: the cpumask pointer */ static __always_inline void cpumask_setall(struct cpumask *dstp) { if (small_const_nbits(small_cpumask_bits)) { cpumask_bits(dstp)[0] = BITMAP_LAST_WORD_MASK(nr_cpumask_bits); return; } bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_clear - clear all cpus (< nr_cpu_ids) in a cpumask * @dstp: the cpumask pointer */ static __always_inline void cpumask_clear(struct cpumask *dstp) { bitmap_zero(cpumask_bits(dstp), large_cpumask_bits); } /** * cpumask_and - *dstp = *src1p & *src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input * * Return: false if *@dstp is empty, else returns true */ static __always_inline bool cpumask_and(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_and(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_or - *dstp = *src1p | *src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input */ static __always_inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { bitmap_or(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_weighted_or - *dstp = *src1p | *src2p and return the weight of the result * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input * * Return: The number of bits set in the resulting cpumask @dstp */ static __always_inline unsigned int cpumask_weighted_or(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_weighted_or(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_xor - *dstp = *src1p ^ *src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input */ static __always_inline void cpumask_xor(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { bitmap_xor(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_andnot - *dstp = *src1p & ~*src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input * * Return: false if *@dstp is empty, else returns true */ static __always_inline bool cpumask_andnot(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_andnot(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_equal - *src1p == *src2p * @src1p: the first input * @src2p: the second input * * Return: true if the cpumasks are equal, false if not */ static __always_inline bool cpumask_equal(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_equal(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_or_equal - *src1p | *src2p == *src3p * @src1p: the first input * @src2p: the second input * @src3p: the third input * * Return: true if first cpumask ORed with second cpumask == third cpumask, * otherwise false */ static __always_inline bool cpumask_or_equal(const struct cpumask *src1p, const struct cpumask *src2p, const struct cpumask *src3p) { return bitmap_or_equal(cpumask_bits(src1p), 
cpumask_bits(src2p), cpumask_bits(src3p), small_cpumask_bits); } /** * cpumask_intersects - (*src1p & *src2p) != 0 * @src1p: the first input * @src2p: the second input * * Return: true if first cpumask ANDed with second cpumask is non-empty, * otherwise false */ static __always_inline bool cpumask_intersects(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_intersects(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_subset - (*src1p & ~*src2p) == 0 * @src1p: the first input * @src2p: the second input * * Return: true if *@src1p is a subset of *@src2p, else returns false */ static __always_inline bool cpumask_subset(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_subset(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_empty - *srcp == 0 * @srcp: the cpumask to that all cpus < nr_cpu_ids are clear. * * Return: true if srcp is empty (has no bits set), else false */ static __always_inline bool cpumask_empty(const struct cpumask *srcp) { return bitmap_empty(cpumask_bits(srcp), small_cpumask_bits); } /** * cpumask_full - *srcp == 0xFFFFFFFF... * @srcp: the cpumask to that all cpus < nr_cpu_ids are set. * * Return: true if srcp is full (has all bits set), else false */ static __always_inline bool cpumask_full(const struct cpumask *srcp) { return bitmap_full(cpumask_bits(srcp), nr_cpumask_bits); } /** * cpumask_weight - Count of bits in *srcp * @srcp: the cpumask to count bits (< nr_cpu_ids) in. * * Return: count of bits set in *srcp */ static __always_inline unsigned int cpumask_weight(const struct cpumask *srcp) { return bitmap_weight(cpumask_bits(srcp), small_cpumask_bits); } /** * cpumask_weight_and - Count of bits in (*srcp1 & *srcp2) * @srcp1: the cpumask to count bits (< nr_cpu_ids) in. * @srcp2: the cpumask to count bits (< nr_cpu_ids) in. * * Return: count of bits set in both *srcp1 and *srcp2 */ static __always_inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, const struct cpumask *srcp2) { return bitmap_weight_and(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); } /** * cpumask_weight_andnot - Count of bits in (*srcp1 & ~*srcp2) * @srcp1: the cpumask to count bits (< nr_cpu_ids) in. * @srcp2: the cpumask to count bits (< nr_cpu_ids) in. 
* * Return: count of bits set in *srcp1 and clear in *srcp2 */ static __always_inline unsigned int cpumask_weight_andnot(const struct cpumask *srcp1, const struct cpumask *srcp2) { return bitmap_weight_andnot(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); } /** * cpumask_shift_right - *dstp = *srcp >> n * @dstp: the cpumask result * @srcp: the input to shift * @n: the number of bits to shift by */ static __always_inline void cpumask_shift_right(struct cpumask *dstp, const struct cpumask *srcp, int n) { bitmap_shift_right(cpumask_bits(dstp), cpumask_bits(srcp), n, small_cpumask_bits); } /** * cpumask_shift_left - *dstp = *srcp << n * @dstp: the cpumask result * @srcp: the input to shift * @n: the number of bits to shift by */ static __always_inline void cpumask_shift_left(struct cpumask *dstp, const struct cpumask *srcp, int n) { bitmap_shift_left(cpumask_bits(dstp), cpumask_bits(srcp), n, nr_cpumask_bits); } /** * cpumask_copy - *dstp = *srcp * @dstp: the result * @srcp: the input cpumask */ static __always_inline void cpumask_copy(struct cpumask *dstp, const struct cpumask *srcp) { bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), large_cpumask_bits); } /** * cpumask_any - pick an arbitrary cpu from *srcp * @srcp: the input cpumask * * Return: >= nr_cpu_ids if no cpus set. */ #define cpumask_any(srcp) cpumask_first(srcp) /** * cpumask_any_and - pick an arbitrary cpu from *mask1 & *mask2 * @mask1: the first input cpumask * @mask2: the second input cpumask * * Return: >= nr_cpu_ids if no cpus set. */ #define cpumask_any_and(mask1, mask2) cpumask_first_and((mask1), (mask2)) /** * cpumask_of - the cpumask containing just a given cpu * @cpu: the cpu (<= nr_cpu_ids) */ #define cpumask_of(cpu) (get_cpu_mask(cpu)) /** * cpumask_parse_user - extract a cpumask from a user string * @buf: the buffer to extract from * @len: the length of the buffer * @dstp: the cpumask to set. * * Return: -errno, or 0 for success. */ static __always_inline int cpumask_parse_user(const char __user *buf, int len, struct cpumask *dstp) { return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_parselist_user - extract a cpumask from a user string * @buf: the buffer to extract from * @len: the length of the buffer * @dstp: the cpumask to set. * * Return: -errno, or 0 for success. */ static __always_inline int cpumask_parselist_user(const char __user *buf, int len, struct cpumask *dstp) { return bitmap_parselist_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_parse - extract a cpumask from a string * @buf: the buffer to extract from * @dstp: the cpumask to set. * * Return: -errno, or 0 for success. */ static __always_inline int cpumask_parse(const char *buf, struct cpumask *dstp) { return bitmap_parse(buf, UINT_MAX, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpulist_parse - extract a cpumask from a user string of ranges * @buf: the buffer to extract from * @dstp: the cpumask to set. * * Return: -errno, or 0 for success.
*/ static __always_inline int cpulist_parse(const char *buf, struct cpumask *dstp) { return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_size - calculate size to allocate for a 'struct cpumask' in bytes * * Return: size to allocate for a &struct cpumask in bytes */ static __always_inline unsigned int cpumask_size(void) { return bitmap_size(large_cpumask_bits); } #ifdef CONFIG_CPUMASK_OFFSTACK #define this_cpu_cpumask_var_ptr(x) this_cpu_read(x) #define __cpumask_var_read_mostly __read_mostly #define CPUMASK_VAR_NULL NULL bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); static __always_inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { return alloc_cpumask_var_node(mask, flags | __GFP_ZERO, node); } /** * alloc_cpumask_var - allocate a struct cpumask * @mask: pointer to cpumask_var_t where the cpumask is returned * @flags: GFP_ flags * * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is * a nop returning a constant 1 (in <linux/cpumask.h>). * * See alloc_cpumask_var_node. * * Return: %true if allocation succeeded, %false if not */ static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return alloc_cpumask_var_node(mask, flags, NUMA_NO_NODE); } static __always_inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return alloc_cpumask_var(mask, flags | __GFP_ZERO); } void alloc_bootmem_cpumask_var(cpumask_var_t *mask); void free_cpumask_var(cpumask_var_t mask); void free_bootmem_cpumask_var(cpumask_var_t mask); static __always_inline bool cpumask_available(cpumask_var_t mask) { return mask != NULL; } #else #define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x) #define __cpumask_var_read_mostly #define CPUMASK_VAR_NULL {} static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return true; } static __always_inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { return true; } static __always_inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { cpumask_clear(*mask); return true; } static __always_inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { cpumask_clear(*mask); return true; } static __always_inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) { } static __always_inline void free_cpumask_var(cpumask_var_t mask) { } static __always_inline void free_bootmem_cpumask_var(cpumask_var_t mask) { } static __always_inline bool cpumask_available(cpumask_var_t mask) { return true; } #endif /* CONFIG_CPUMASK_OFFSTACK */ DEFINE_FREE(free_cpumask_var, struct cpumask *, if (_T) free_cpumask_var(_T)); /* It's common to want to use cpu_all_mask in struct member initializers, * so it has to refer to an address rather than a pointer. */ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); #define cpu_all_mask to_cpumask(cpu_all_bits) /* First bits of cpu_bit_bitmap are in fact unset. 
*/ #define cpu_none_mask to_cpumask(cpu_bit_bitmap[0]) #if NR_CPUS == 1 /* Uniprocessor: the possible/online/present masks are always "1" */ #define for_each_possible_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_online_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_present_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_possible_cpu_wrap(cpu, start) \ for ((void)(start), (cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_online_cpu_wrap(cpu, start) \ for ((void)(start), (cpu) = 0; (cpu) < 1; (cpu)++) #else #define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask) #define for_each_online_cpu(cpu) for_each_cpu((cpu), cpu_online_mask) #define for_each_enabled_cpu(cpu) for_each_cpu((cpu), cpu_enabled_mask) #define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask) #define for_each_possible_cpu_wrap(cpu, start) \ for_each_cpu_wrap((cpu), cpu_possible_mask, (start)) #define for_each_online_cpu_wrap(cpu, start) \ for_each_cpu_wrap((cpu), cpu_online_mask, (start)) #endif /* Wrappers for arch boot code to manipulate normally-constant masks */ void init_cpu_present(const struct cpumask *src); void init_cpu_possible(const struct cpumask *src); #define assign_cpu(cpu, mask, val) \ assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val)) #define __assign_cpu(cpu, mask, val) \ __assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val)) #define set_cpu_enabled(cpu, enabled) assign_cpu((cpu), &__cpu_enabled_mask, (enabled)) #define set_cpu_present(cpu, present) assign_cpu((cpu), &__cpu_present_mask, (present)) #define set_cpu_active(cpu, active) assign_cpu((cpu), &__cpu_active_mask, (active)) #define set_cpu_dying(cpu, dying) assign_cpu((cpu), &__cpu_dying_mask, (dying)) void set_cpu_online(unsigned int cpu, bool online); void set_cpu_possible(unsigned int cpu, bool possible); /** * to_cpumask - convert a NR_CPUS bitmap to a struct cpumask * * @bitmap: the bitmap * * There are a few places where cpumask_var_t isn't appropriate and * static cpumasks must be used (eg. very early boot), yet we don't * expose the definition of 'struct cpumask'. * * This does the conversion, and can be used as a constant initializer. */ #define to_cpumask(bitmap) \ ((struct cpumask *)(1 ? (bitmap) \ : (void *)sizeof(__check_is_bitmap(bitmap)))) static __always_inline int __check_is_bitmap(const unsigned long *bitmap) { return 1; } /* * Special-case data structure for "single bit set only" constant CPU masks. * * We pre-generate all the 64 (or 32) possible bit positions, with enough * padding to the left and the right, and return the constant pointer * appropriately offset. */ extern const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)]; static __always_inline const struct cpumask *get_cpu_mask(unsigned int cpu) { const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG]; p -= cpu / BITS_PER_LONG; return to_cpumask(p); } #if NR_CPUS > 1 /** * num_online_cpus() - Read the number of online CPUs * * Despite the fact that __num_online_cpus is of type atomic_t, this * interface gives only a momentary snapshot and is not protected against * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held * region. 
* * Return: momentary snapshot of the number of online CPUs */ static __always_inline unsigned int num_online_cpus(void) { return raw_atomic_read(&__num_online_cpus); } static __always_inline unsigned int num_possible_cpus(void) { return __num_possible_cpus; } #define num_enabled_cpus() cpumask_weight(cpu_enabled_mask) #define num_present_cpus() cpumask_weight(cpu_present_mask) #define num_active_cpus() cpumask_weight(cpu_active_mask) static __always_inline bool cpu_online(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_online_mask); } static __always_inline bool cpu_enabled(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_enabled_mask); } static __always_inline bool cpu_possible(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_possible_mask); } static __always_inline bool cpu_present(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_present_mask); } static __always_inline bool cpu_active(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_active_mask); } static __always_inline bool cpu_dying(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_dying_mask); } #else #define num_online_cpus() 1U #define num_possible_cpus() 1U #define num_enabled_cpus() 1U #define num_present_cpus() 1U #define num_active_cpus() 1U static __always_inline bool cpu_online(unsigned int cpu) { return cpu == 0; } static __always_inline bool cpu_possible(unsigned int cpu) { return cpu == 0; } static __always_inline bool cpu_enabled(unsigned int cpu) { return cpu == 0; } static __always_inline bool cpu_present(unsigned int cpu) { return cpu == 0; } static __always_inline bool cpu_active(unsigned int cpu) { return cpu == 0; } static __always_inline bool cpu_dying(unsigned int cpu) { return false; } #endif /* NR_CPUS > 1 */ #define cpu_is_offline(cpu) unlikely(!cpu_online(cpu)) #if NR_CPUS <= BITS_PER_LONG #define CPU_BITS_ALL \ { \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } #else /* NR_CPUS > BITS_PER_LONG */ #define CPU_BITS_ALL \ { \ [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL, \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } #endif /* NR_CPUS > BITS_PER_LONG */ /** * cpumap_print_to_pagebuf - copies the cpumask into the buffer either * as comma-separated list of cpus or hex values of cpumask * @list: indicates whether the cpumap must be list * @mask: the cpumask to copy * @buf: the buffer to copy into * * Return: the length of the (null-terminated) @buf string, zero if * nothing is copied. */ static __always_inline ssize_t cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask) { return bitmap_print_to_pagebuf(list, buf, cpumask_bits(mask), nr_cpu_ids); } /** * cpumap_print_bitmask_to_buf - copies the cpumask into the buffer as * hex values of cpumask * * @buf: the buffer to copy into * @mask: the cpumask to copy * @off: in the string from which we are copying, we copy to @buf * @count: the maximum number of bytes to print * * The function prints the cpumask into the buffer as hex values of * cpumask; Typically used by bin_attribute to export cpumask bitmask * ABI. * * Return: the length of how many bytes have been copied, excluding * terminating '\0'. 
*/ static __always_inline ssize_t cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask, loff_t off, size_t count) { return bitmap_print_bitmask_to_buf(buf, cpumask_bits(mask), nr_cpu_ids, off, count) - 1; } /** * cpumap_print_list_to_buf - copies the cpumask into the buffer as * comma-separated list of cpus * @buf: the buffer to copy into * @mask: the cpumask to copy * @off: in the string from which we are copying, we copy to @buf * @count: the maximum number of bytes to print * * Everything is the same as the above cpumap_print_bitmask_to_buf() * except the print format. * * Return: the length of how many bytes have been copied, excluding * terminating '\0'. */ static __always_inline ssize_t cpumap_print_list_to_buf(char *buf, const struct cpumask *mask, loff_t off, size_t count) { return bitmap_print_list_to_buf(buf, cpumask_bits(mask), nr_cpu_ids, off, count) - 1; } #if NR_CPUS <= BITS_PER_LONG #define CPU_MASK_ALL \ (cpumask_t) { { \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } } #else #define CPU_MASK_ALL \ (cpumask_t) { { \ [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL, \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } } #endif /* NR_CPUS > BITS_PER_LONG */ #define CPU_MASK_NONE \ (cpumask_t) { { \ [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \ } } #define CPU_MASK_CPU0 \ (cpumask_t) { { \ [0] = 1UL \ } } /* * Provide a valid theoretical max size for cpumap and cpulist sysfs files * to avoid breaking userspace which may allocate a buffer based on the size * reported by e.g. fstat. * * for cpumap NR_CPUS * 9/32 - 1 should be an exact length. * * For cpulist 7 is (ceil(log10(NR_CPUS)) + 1) allowing for NR_CPUS to be up * to 2 orders of magnitude larger than 8192. And then we divide by 2 to * cover a worst-case of every other cpu being on one of two nodes for a * very large NR_CPUS. * * Use PAGE_SIZE as a minimum for smaller configurations while avoiding * unsigned comparison to -1. */ #define CPUMAP_FILE_MAX_BYTES (((NR_CPUS * 9)/32 > PAGE_SIZE) \ ? (NR_CPUS * 9)/32 - 1 : PAGE_SIZE) #define CPULIST_FILE_MAX_BYTES (((NR_CPUS * 7)/2 > PAGE_SIZE) ? (NR_CPUS * 7)/2 : PAGE_SIZE) #endif /* __LINUX_CPUMASK_H */
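Illustrative usage note (not part of the header above): a common pattern with this API is to allocate a cpumask_var_t, combine system masks with the cpumask_*() operations, and walk the result with for_each_cpu(). The helper below is a sketch for illustration only; example_count_online_in() is a hypothetical function, not kernel code.

/* Sketch only: count the online CPUs that also appear in @allowed. */
static unsigned int example_count_online_in(const struct cpumask *allowed)
{
	cpumask_var_t tmp;
	unsigned int cpu, n = 0;

	/* Allocates on the heap only when CONFIG_CPUMASK_OFFSTACK=y; zeroed either way. */
	if (!zalloc_cpumask_var(&tmp, GFP_KERNEL))
		return 0;

	/* tmp = *allowed & *cpu_online_mask; an empty result simply skips the loop. */
	cpumask_and(tmp, allowed, cpu_online_mask);
	for_each_cpu(cpu, tmp)
		n++;

	/*
	 * The temporary mask can be avoided entirely with
	 * for_each_cpu_and(cpu, allowed, cpu_online_mask) or
	 * cpumask_weight_and(allowed, cpu_online_mask).
	 */
	free_cpumask_var(tmp);
	return n;
}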
/* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/buffer_head.h * * Everything to do with buffer_heads.
*/ #ifndef _LINUX_BUFFER_HEAD_H #define _LINUX_BUFFER_HEAD_H #include <linux/types.h> #include <linux/blk_types.h> #include <linux/fs.h> #include <linux/linkage.h> #include <linux/pagemap.h> #include <linux/wait.h> #include <linux/atomic.h> enum bh_state_bits { BH_Uptodate, /* Contains valid data */ BH_Dirty, /* Is dirty */ BH_Lock, /* Is locked */ BH_Req, /* Has been submitted for I/O */ BH_Mapped, /* Has a disk mapping */ BH_New, /* Disk mapping was newly created by get_block */ BH_Async_Read, /* Is under end_buffer_async_read I/O */ BH_Async_Write, /* Is under end_buffer_async_write I/O */ BH_Delay, /* Buffer is not yet allocated on disk */ BH_Boundary, /* Block is followed by a discontiguity */ BH_Write_EIO, /* I/O error on write */ BH_Unwritten, /* Buffer is allocated on disk but not written */ BH_Quiet, /* Buffer error printks to be quiet */ BH_Meta, /* Buffer contains metadata */ BH_Prio, /* Buffer should be submitted with REQ_PRIO */ BH_Defer_Completion, /* Defer AIO completion to workqueue */ BH_Migrate, /* Buffer is being migrated (norefs) */ BH_PrivateStart,/* not a state bit, but the first bit available * for private allocation by other entities */ }; #define MAX_BUF_PER_PAGE (PAGE_SIZE / 512) struct page; struct buffer_head; struct address_space; typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); /* * Historically, a buffer_head was used to map a single block * within a page, and of course as the unit of I/O through the * filesystem and block layers. Nowadays the basic I/O unit * is the bio, and buffer_heads are used for extracting block * mappings (via a get_block_t call), for tracking state within * a folio (via a folio_mapping) and for wrapping bio submission * for backward compatibility reasons (e.g. submit_bh). */ struct buffer_head { unsigned long b_state; /* buffer state bitmap (see above) */ struct buffer_head *b_this_page;/* circular list of page's buffers */ union { struct page *b_page; /* the page this bh is mapped to */ struct folio *b_folio; /* the folio this bh is mapped to */ }; sector_t b_blocknr; /* start block number */ size_t b_size; /* size of mapping */ char *b_data; /* pointer to data within the page */ struct block_device *b_bdev; bh_end_io_t *b_end_io; /* I/O completion */ void *b_private; /* reserved for b_end_io */ struct list_head b_assoc_buffers; /* associated with another mapping */ struct mapping_metadata_bhs *b_mmb; /* head of the list of metadata bhs * this buffer is associated with */ atomic_t b_count; /* users using this buffer_head */ spinlock_t b_uptodate_lock; /* Used by the first bh in a page, to * serialise IO completion of other * buffers in the page */ }; /* * macro tricks to expand the set_buffer_foo(), clear_buffer_foo() * and buffer_foo() functions. * To avoid resetting buffer flags that are already set, because that causes * a costly cache line transition, check the flag first.
*/ #define BUFFER_FNS(bit, name) \ static __always_inline void set_buffer_##name(struct buffer_head *bh) \ { \ if (!test_bit(BH_##bit, &(bh)->b_state)) \ set_bit(BH_##bit, &(bh)->b_state); \ } \ static __always_inline void clear_buffer_##name(struct buffer_head *bh) \ { \ clear_bit(BH_##bit, &(bh)->b_state); \ } \ static __always_inline int buffer_##name(const struct buffer_head *bh) \ { \ return test_bit(BH_##bit, &(bh)->b_state); \ } /* * test_set_buffer_foo() and test_clear_buffer_foo() */ #define TAS_BUFFER_FNS(bit, name) \ static __always_inline int test_set_buffer_##name(struct buffer_head *bh) \ { \ return test_and_set_bit(BH_##bit, &(bh)->b_state); \ } \ static __always_inline int test_clear_buffer_##name(struct buffer_head *bh) \ { \ return test_and_clear_bit(BH_##bit, &(bh)->b_state); \ } \ /* * Emit the buffer bitops functions. Note that there are also functions * of the form "mark_buffer_foo()". These are higher-level functions which * do something in addition to setting a b_state bit. */ BUFFER_FNS(Dirty, dirty) TAS_BUFFER_FNS(Dirty, dirty) BUFFER_FNS(Lock, locked) BUFFER_FNS(Req, req) TAS_BUFFER_FNS(Req, req) BUFFER_FNS(Mapped, mapped) BUFFER_FNS(New, new) BUFFER_FNS(Async_Read, async_read) BUFFER_FNS(Async_Write, async_write) BUFFER_FNS(Delay, delay) BUFFER_FNS(Boundary, boundary) BUFFER_FNS(Write_EIO, write_io_error) BUFFER_FNS(Unwritten, unwritten) BUFFER_FNS(Meta, meta) BUFFER_FNS(Prio, prio) BUFFER_FNS(Defer_Completion, defer_completion) static __always_inline void set_buffer_uptodate(struct buffer_head *bh) { /* * If somebody else already set this uptodate, they will * have done the memory barrier, and a reader will thus * see *some* valid buffer state. * * Any other serialization (with IO errors or whatever that * might clear the bit) has to come from other state (eg BH_Lock). 
*/ if (test_bit(BH_Uptodate, &bh->b_state)) return; /* * make it consistent with folio_mark_uptodate * pairs with smp_load_acquire in buffer_uptodate */ smp_mb__before_atomic(); set_bit(BH_Uptodate, &bh->b_state); } static __always_inline void clear_buffer_uptodate(struct buffer_head *bh) { clear_bit(BH_Uptodate, &bh->b_state); } static __always_inline int buffer_uptodate(const struct buffer_head *bh) { /* * make it consistent with folio_test_uptodate * pairs with smp_mb__before_atomic in set_buffer_uptodate */ return test_bit_acquire(BH_Uptodate, &bh->b_state); } static inline unsigned long bh_offset(const struct buffer_head *bh) { return (unsigned long)(bh)->b_data & (page_size(bh->b_page) - 1); } /* If we *know* page->private refers to buffer_heads */ #define page_buffers(page) \ ({ \ BUG_ON(!PagePrivate(page)); \ ((struct buffer_head *)page_private(page)); \ }) #define folio_buffers(folio) folio_get_private(folio) void buffer_check_dirty_writeback(struct folio *folio, bool *dirty, bool *writeback); /* * Declarations */ void mark_buffer_dirty(struct buffer_head *bh); void mark_buffer_write_io_error(struct buffer_head *bh); void touch_buffer(struct buffer_head *bh); void folio_set_bh(struct buffer_head *bh, struct folio *folio, unsigned long offset); struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, gfp_t gfp); struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size); struct buffer_head *create_empty_buffers(struct folio *folio, unsigned long blocksize, unsigned long b_state); void end_buffer_read_sync(struct buffer_head *bh, int uptodate); void end_buffer_write_sync(struct buffer_head *bh, int uptodate); /* Things to do with metadata buffers list */ void mmb_mark_buffer_dirty(struct buffer_head *bh, struct mapping_metadata_bhs *mmb); int mmb_fsync_noflush(struct file *file, struct mapping_metadata_bhs *mmb, loff_t start, loff_t end, bool datasync); int mmb_fsync(struct file *file, struct mapping_metadata_bhs *mmb, loff_t start, loff_t end, bool datasync); void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len); static inline void clean_bdev_bh_alias(struct buffer_head *bh) { clean_bdev_aliases(bh->b_bdev, bh->b_blocknr, 1); } void mark_buffer_async_write(struct buffer_head *bh); void __wait_on_buffer(struct buffer_head *); wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block, unsigned size); struct buffer_head *__find_get_block_nonatomic(struct block_device *bdev, sector_t block, unsigned size); struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp); void __brelse(struct buffer_head *); void __bforget(struct buffer_head *); void __breadahead(struct block_device *, sector_t block, unsigned int size); struct buffer_head *__bread_gfp(struct block_device *, sector_t block, unsigned size, gfp_t gfp); struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); void free_buffer_head(struct buffer_head * bh); void unlock_buffer(struct buffer_head *bh); void __lock_buffer(struct buffer_head *bh); int sync_dirty_buffer(struct buffer_head *bh); int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags); void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags); void submit_bh(blk_opf_t, struct buffer_head *); void write_boundary_block(struct block_device *bdev, sector_t bblock, unsigned blocksize); int bh_uptodate_or_lock(struct buffer_head *bh); int __bh_read(struct buffer_head 
*bh, blk_opf_t op_flags, bool wait); void __bh_read_batch(int nr, struct buffer_head *bhs[], blk_opf_t op_flags, bool force_lock); /* * Generic address_space_operations implementations for buffer_head-backed * address_spaces. */ void block_invalidate_folio(struct folio *folio, size_t offset, size_t length); int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, void *get_block); int __block_write_full_folio(struct inode *inode, struct folio *folio, get_block_t *get_block, struct writeback_control *wbc); int block_read_full_folio(struct folio *, get_block_t *); bool block_is_partially_uptodate(struct folio *, size_t from, size_t count); int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, get_block_t *get_block); int __block_write_begin(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block); int block_write_end(loff_t pos, unsigned len, unsigned copied, struct folio *); int generic_write_end(const struct kiocb *, struct address_space *, loff_t, unsigned len, unsigned copied, struct folio *, void *); void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to); int cont_write_begin(const struct kiocb *, struct address_space *, loff_t, unsigned, struct folio **, void **, get_block_t *, loff_t *); int generic_cont_expand_simple(struct inode *inode, loff_t size); void block_commit_write(struct folio *folio, size_t from, size_t to); int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); int block_truncate_page(struct address_space *, loff_t, get_block_t *); #ifdef CONFIG_MIGRATION extern int buffer_migrate_folio(struct address_space *, struct folio *dst, struct folio *src, enum migrate_mode); extern int buffer_migrate_folio_norefs(struct address_space *, struct folio *dst, struct folio *src, enum migrate_mode); #else #define buffer_migrate_folio NULL #define buffer_migrate_folio_norefs NULL #endif /* * inline definitions */ static inline void get_bh(struct buffer_head *bh) { atomic_inc(&bh->b_count); } static inline void put_bh(struct buffer_head *bh) { smp_mb__before_atomic(); atomic_dec(&bh->b_count); } /** * brelse - Release a buffer. * @bh: The buffer to release. * * Decrement a buffer_head's reference count. If @bh is NULL, this * function is a no-op. * * If all buffers on a folio have zero reference count, are clean * and unlocked, and if the folio is unlocked and not under writeback * then try_to_free_buffers() may strip the buffers from the folio in * preparation for freeing it (sometimes, rarely, buffers are removed * from a folio but it ends up not being freed, and buffers may later * be reattached). * * Context: Any context. */ static inline void brelse(struct buffer_head *bh) { if (bh) __brelse(bh); } /** * bforget - Discard any dirty data in a buffer. * @bh: The buffer to forget. * * Call this function instead of brelse() if the data written to a buffer * no longer needs to be written back. It will clear the buffer's dirty * flag so writeback of this buffer will be skipped. * * Context: Any context. 
*/ static inline void bforget(struct buffer_head *bh) { if (bh) __bforget(bh); } static inline struct buffer_head * sb_bread(struct super_block *sb, sector_t block) { return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE); } static inline struct buffer_head * sb_bread_unmovable(struct super_block *sb, sector_t block) { return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, 0); } static inline void sb_breadahead(struct super_block *sb, sector_t block) { __breadahead(sb->s_bdev, block, sb->s_blocksize); } static inline struct buffer_head *getblk_unmovable(struct block_device *bdev, sector_t block, unsigned size) { gfp_t gfp; gfp = mapping_gfp_constraint(bdev->bd_mapping, ~__GFP_FS); gfp |= __GFP_NOFAIL; return bdev_getblk(bdev, block, size, gfp); } static inline struct buffer_head *__getblk(struct block_device *bdev, sector_t block, unsigned size) { gfp_t gfp; gfp = mapping_gfp_constraint(bdev->bd_mapping, ~__GFP_FS); gfp |= __GFP_MOVABLE | __GFP_NOFAIL; return bdev_getblk(bdev, block, size, gfp); } static inline struct buffer_head *sb_getblk(struct super_block *sb, sector_t block) { return __getblk(sb->s_bdev, block, sb->s_blocksize); } static inline struct buffer_head *sb_getblk_gfp(struct super_block *sb, sector_t block, gfp_t gfp) { return bdev_getblk(sb->s_bdev, block, sb->s_blocksize, gfp); } static inline struct buffer_head * sb_find_get_block(struct super_block *sb, sector_t block) { return __find_get_block(sb->s_bdev, block, sb->s_blocksize); } static inline struct buffer_head * sb_find_get_block_nonatomic(struct super_block *sb, sector_t block) { return __find_get_block_nonatomic(sb->s_bdev, block, sb->s_blocksize); } static inline void map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block) { set_buffer_mapped(bh); bh->b_bdev = sb->s_bdev; bh->b_blocknr = block; bh->b_size = sb->s_blocksize; } static inline void wait_on_buffer(struct buffer_head *bh) { might_sleep(); if (buffer_locked(bh)) __wait_on_buffer(bh); } static inline int trylock_buffer(struct buffer_head *bh) { return likely(!test_and_set_bit_lock(BH_Lock, &bh->b_state)); } static inline void lock_buffer(struct buffer_head *bh) { might_sleep(); if (!trylock_buffer(bh)) __lock_buffer(bh); } static inline void bh_readahead(struct buffer_head *bh, blk_opf_t op_flags) { if (!buffer_uptodate(bh) && trylock_buffer(bh)) { if (!buffer_uptodate(bh)) __bh_read(bh, op_flags, false); else unlock_buffer(bh); } } static inline void bh_read_nowait(struct buffer_head *bh, blk_opf_t op_flags) { if (!bh_uptodate_or_lock(bh)) __bh_read(bh, op_flags, false); } /* Returns 1 if buffer uptodated, 0 on success, and -EIO on error. */ static inline int bh_read(struct buffer_head *bh, blk_opf_t op_flags) { if (bh_uptodate_or_lock(bh)) return 1; return __bh_read(bh, op_flags, true); } static inline void bh_read_batch(int nr, struct buffer_head *bhs[]) { __bh_read_batch(nr, bhs, 0, true); } static inline void bh_readahead_batch(int nr, struct buffer_head *bhs[], blk_opf_t op_flags) { __bh_read_batch(nr, bhs, op_flags, false); } /** * __bread() - Read a block. * @bdev: The block device to read from. * @block: Block number in units of block size. * @size: The block size of this device in bytes. * * Read a specified block, and return the buffer head that refers * to it. The memory is allocated from the movable area so that it can * be migrated. The returned buffer head has its refcount increased. * The caller should call brelse() when it has finished with the buffer. * * Context: May sleep waiting for I/O. 
* Return: NULL if the block was unreadable. */ static inline struct buffer_head *__bread(struct block_device *bdev, sector_t block, unsigned size) { return __bread_gfp(bdev, block, size, __GFP_MOVABLE); } /** * get_nth_bh - Get a reference on the n'th buffer after this one. * @bh: The buffer to start counting from. * @count: How many buffers to skip. * * This is primarily useful for finding the nth buffer in a folio; in * that case you pass the head buffer and the byte offset in the folio * divided by the block size. It can be used for other purposes, but * it will wrap at the end of the folio rather than returning NULL or * proceeding to the next folio for you. * * Return: The requested buffer with an elevated refcount. */ static inline __must_check struct buffer_head *get_nth_bh(struct buffer_head *bh, unsigned int count) { while (count--) bh = bh->b_this_page; get_bh(bh); return bh; } bool block_dirty_folio(struct address_space *mapping, struct folio *folio); #ifdef CONFIG_BUFFER_HEAD void buffer_init(void); bool try_to_free_buffers(struct folio *folio); void mmb_init(struct mapping_metadata_bhs *mmb, struct address_space *mapping); bool mmb_has_buffers(struct mapping_metadata_bhs *mmb); void mmb_invalidate(struct mapping_metadata_bhs *mmb); int mmb_sync(struct mapping_metadata_bhs *mmb); void invalidate_bh_lrus(void); void invalidate_bh_lrus_cpu(void); bool has_bh_in_lru(int cpu, void *dummy); extern int buffer_heads_over_limit; #else /* CONFIG_BUFFER_HEAD */ static inline void buffer_init(void) {} static inline bool try_to_free_buffers(struct folio *folio) { return true; } static inline int mmb_sync(struct mapping_metadata_bhs *mmb) { return 0; } static inline void invalidate_bh_lrus(void) {} static inline void invalidate_bh_lrus_cpu(void) {} static inline bool has_bh_in_lru(int cpu, void *dummy) { return false; } #define buffer_heads_over_limit 0 #endif /* CONFIG_BUFFER_HEAD */ #endif /* _LINUX_BUFFER_HEAD_H */ |
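/*
 * Illustrative usage sketch (not part of the header above): a typical
 * read-modify-write of one on-disk block through the buffer_head API
 * declared above.  "example_update_block" and its parameters are
 * hypothetical, and error handling is reduced to the essentials.
 */
static int example_update_block(struct super_block *sb, sector_t blocknr,
				unsigned int offset, u8 value)
{
	struct buffer_head *bh;

	if (offset >= sb->s_blocksize)
		return -EINVAL;

	/* Returns an uptodate, referenced buffer head or NULL on I/O error. */
	bh = sb_bread(sb, blocknr);
	if (!bh)
		return -EIO;

	lock_buffer(bh);
	((u8 *)bh->b_data)[offset] = value;
	mark_buffer_dirty(bh);
	unlock_buffer(bh);

	/* Optionally write the block out synchronously and check the result. */
	if (sync_dirty_buffer(bh)) {
		brelse(bh);
		return -EIO;
	}

	/* Drop the reference taken by sb_bread(). */
	brelse(bh);
	return 0;
}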
/* * include/linux/topology.h * * Written by: Matthew Dobson, IBM Corporation * * Copyright (C) 2002, IBM Corp. * * All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or * NON INFRINGEMENT. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * Send feedback to <colpatch@us.ibm.com> */ #ifndef _LINUX_TOPOLOGY_H #define _LINUX_TOPOLOGY_H #include <linux/arch_topology.h> #include <linux/cpumask.h> #include <linux/nodemask.h> #include <linux/bitops.h> #include <linux/mmzone.h> #include <linux/smp.h> #include <linux/percpu.h> #include <asm/topology.h> #ifndef nr_cpus_node #define nr_cpus_node(node) cpumask_weight(cpumask_of_node(node)) #endif int arch_update_cpu_topology(void); /* Conform to ACPI 2.0 SLIT distance definitions */ #define LOCAL_DISTANCE 10 #define REMOTE_DISTANCE 20 #define DISTANCE_BITS 8 #ifndef node_distance #define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE) #endif #ifndef RECLAIM_DISTANCE /* * If the distance between nodes in a system is larger than RECLAIM_DISTANCE * (in whatever arch specific measurement units returned by node_distance()) * and node_reclaim_mode is enabled then the VM will only call node_reclaim() * on nodes within this distance. */ #define RECLAIM_DISTANCE 30 #endif /* * The following tunable allows platforms to override the default node * reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are * sufficiently fast that the default value actually hurts * performance.
* * AMD EPYC machines use this because even though the 2-hop distance * is 32 (3.2x slower than a local memory access) performance actually * *improves* if allowed to reclaim memory and load balance tasks * between NUMA nodes 2-hops apart. */ extern int __read_mostly node_reclaim_distance; #ifndef PENALTY_FOR_NODE_WITH_CPUS #define PENALTY_FOR_NODE_WITH_CPUS (1) #endif #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DECLARE_PER_CPU(int, numa_node); #ifndef numa_node_id /* Returns the number of the current Node. */ static inline int numa_node_id(void) { return raw_cpu_read(numa_node); } #endif #ifndef cpu_to_node static inline int cpu_to_node(int cpu) { return per_cpu(numa_node, cpu); } #endif #ifndef set_numa_node static inline void set_numa_node(int node) { this_cpu_write(numa_node, node); } #endif #ifndef set_cpu_numa_node static inline void set_cpu_numa_node(int cpu, int node) { per_cpu(numa_node, cpu) = node; } #endif #else /* !CONFIG_USE_PERCPU_NUMA_NODE_ID */ /* Returns the number of the current Node. */ #ifndef numa_node_id static inline int numa_node_id(void) { return cpu_to_node(raw_smp_processor_id()); } #endif #endif /* [!]CONFIG_USE_PERCPU_NUMA_NODE_ID */ #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem(). */ DECLARE_PER_CPU(int, _numa_mem_); #ifndef set_numa_mem static inline void set_numa_mem(int node) { this_cpu_write(_numa_mem_, node); } #endif #ifndef numa_mem_id /* Returns the number of the nearest Node with memory */ static inline int numa_mem_id(void) { return raw_cpu_read(_numa_mem_); } #endif #ifndef cpu_to_mem static inline int cpu_to_mem(int cpu) { return per_cpu(_numa_mem_, cpu); } #endif #ifndef set_cpu_numa_mem static inline void set_cpu_numa_mem(int cpu, int node) { per_cpu(_numa_mem_, cpu) = node; } #endif #else /* !CONFIG_HAVE_MEMORYLESS_NODES */ #ifndef numa_mem_id /* Returns the number of the nearest Node with memory */ static inline int numa_mem_id(void) { return numa_node_id(); } #endif #ifndef cpu_to_mem static inline int cpu_to_mem(int cpu) { return cpu_to_node(cpu); } #endif #endif /* [!]CONFIG_HAVE_MEMORYLESS_NODES */ #if defined(topology_die_id) && defined(topology_die_cpumask) #define TOPOLOGY_DIE_SYSFS #endif #if defined(topology_cluster_id) && defined(topology_cluster_cpumask) #define TOPOLOGY_CLUSTER_SYSFS #endif #if defined(topology_book_id) && defined(topology_book_cpumask) #define TOPOLOGY_BOOK_SYSFS #endif #if defined(topology_drawer_id) && defined(topology_drawer_cpumask) #define TOPOLOGY_DRAWER_SYSFS #endif #ifndef topology_physical_package_id #define topology_physical_package_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_die_id #define topology_die_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_cluster_id #define topology_cluster_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_core_id #define topology_core_id(cpu) ((void)(cpu), 0) #endif #ifndef topology_book_id #define topology_book_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_drawer_id #define topology_drawer_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_ppin #define topology_ppin(cpu) ((void)(cpu), 0ull) #endif #ifndef topology_sibling_cpumask #define topology_sibling_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_core_cpumask #define topology_core_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_cluster_cpumask #define topology_cluster_cpumask(cpu) cpumask_of(cpu) 
#endif #ifndef topology_die_cpumask #define topology_die_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_book_cpumask #define topology_book_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_drawer_cpumask #define topology_drawer_cpumask(cpu) cpumask_of(cpu) #endif #if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask) static inline const struct cpumask *cpu_smt_mask(int cpu) { return topology_sibling_cpumask(cpu); } #endif #ifndef topology_is_primary_thread static inline bool topology_is_primary_thread(unsigned int cpu) { /* * When disabling SMT, the primary thread of the SMT will remain * enabled/active. Architectures that have a special primary thread * (e.g. x86) need to override this function. Otherwise the first * thread in the SMT can be made the primary thread. * * The sibling cpumask of an offline CPU always contains the CPU * itself on architectures using the implementation of * CONFIG_GENERIC_ARCH_TOPOLOGY for building their topology. * Other architectures not using CONFIG_GENERIC_ARCH_TOPOLOGY for * building their topology have to check whether to use this default * implementation or to override it. */ return cpu == cpumask_first(topology_sibling_cpumask(cpu)); } #define topology_is_primary_thread topology_is_primary_thread #endif static inline const struct cpumask *cpu_node_mask(int cpu) { return cpumask_of_node(cpu_to_node(cpu)); } #ifdef CONFIG_NUMA int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node); extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops); #else static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node) { return cpumask_nth_and(cpu, cpus, cpu_online_mask); } static inline const struct cpumask * sched_numa_hop_mask(unsigned int node, unsigned int hops) { return ERR_PTR(-EOPNOTSUPP); } #endif /* CONFIG_NUMA */ /** * for_each_node_numadist() - iterate over nodes in increasing distance * order, starting from a given node * @node: the iteration variable and the starting node. * @unvisited: a nodemask to keep track of the unvisited nodes. * * This macro iterates over NUMA node IDs in increasing distance from the * starting @node and yields MAX_NUMNODES when all the nodes have been * visited. * * Note that by the time the loop completes, the @unvisited nodemask will * be fully cleared, unless the loop exits early. * * The difference between for_each_node() and for_each_node_numadist() is * that the former allows to iterate over nodes in numerical order, whereas * the latter iterates over nodes in increasing order of distance. * * This complexity of this iterator is O(N^2), where N represents the * number of nodes, as each iteration involves scanning all nodes to * find the one with the shortest distance. * * Requires rcu_lock to be held. */ #define for_each_node_numadist(node, unvisited) \ for (int __start = (node), \ (node) = nearest_node_nodemask((__start), &(unvisited)); \ (node) < MAX_NUMNODES; \ node_clear((node), (unvisited)), \ (node) = nearest_node_nodemask((__start), &(unvisited))) /** * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance * from a given node. * @mask: the iteration variable. * @node: the NUMA node to start the search from. * * Requires rcu_lock to be held. * * Yields cpu_online_mask for @node == NUMA_NO_NODE. */ #define for_each_numa_hop_mask(mask, node) \ for (unsigned int __hops = 0; \ mask = (node != NUMA_NO_NODE || __hops) ? 
\ sched_numa_hop_mask(node, __hops) : \ cpu_online_mask, \ !IS_ERR_OR_NULL(mask); \ __hops++) DECLARE_PER_CPU(unsigned long, cpu_scale); static inline unsigned long topology_get_cpu_scale(int cpu) { return per_cpu(cpu_scale, cpu); } void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity); #endif /* _LINUX_TOPOLOGY_H */
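/*
 * Illustrative usage sketch (not part of the header above): choosing an
 * online CPU close to the caller with the NUMA helpers declared above.
 * "example_pick_local_cpu" is a hypothetical name.
 */
static int example_pick_local_cpu(void)
{
	int node = numa_node_id();	/* NUMA node of the current CPU */
	unsigned int cpu;

	/* Prefer an online CPU on the local node... */
	cpu = cpumask_first_and(cpumask_of_node(node), cpu_online_mask);
	if (cpu < nr_cpu_ids)
		return cpu;

	/* ...otherwise fall back to any online CPU. */
	return cpumask_first(cpu_online_mask);
}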
// SPDX-License-Identifier: GPL-2.0 /* * ext4.h * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/include/linux/minix_fs.h * * Copyright (C) 1991, 1992 Linus Torvalds */ #ifndef _EXT4_H #define _EXT4_H #include <linux/refcount.h> #include <linux/types.h> #include <linux/blkdev.h> #include <linux/magic.h> #include <linux/jbd2.h> #include <linux/quota.h> #include <linux/rwsem.h> #include <linux/rbtree.h> #include <linux/seqlock.h> #include <linux/mutex.h> #include <linux/timer.h> #include <linux/sched/signal.h> #include <linux/blockgroup_lock.h> #include <linux/percpu_counter.h> #include <linux/ratelimit.h> #include <linux/crc32c.h> #include <linux/falloc.h> #include <linux/percpu-rwsem.h> #include <linux/fiemap.h> #ifdef __KERNEL__ #include <linux/compat.h> #endif #include <uapi/linux/ext4.h> #include <linux/fscrypt.h> #include <linux/fsverity.h> #include <linux/compiler.h> /* * The fourth extended filesystem constants/structures */ /* * with AGGRESSIVE_CHECK allocator runs consistency checks over * structures. these checks slow things down a lot */ #define AGGRESSIVE_CHECK__ /* * with DOUBLE_CHECK defined mballoc creates persistent in-core * bitmaps, maintains and uses them to check for double allocations */ #define DOUBLE_CHECK__ /* * Define EXT4FS_DEBUG to produce debug messages */ #undef EXT4FS_DEBUG /* * Debug code */ #ifdef EXT4FS_DEBUG #define ext4_debug(f, a...) \ do { \ printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ __FILE__, __LINE__, __func__); \ printk(KERN_DEBUG f, ## a); \ } while (0) #else #define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif /* * Turn on EXT_DEBUG to enable ext4_ext_show_path/leaf/move in extents.c */ #define EXT_DEBUG__ /* * Dynamic printk for controlled extents debugging. */ #ifdef CONFIG_EXT4_DEBUG #define ext_debug(ino, fmt, ...) \ pr_debug("[%s/%d] EXT4-fs (%s): ino %llu: (%s, %d): %s:" fmt, \ current->comm, task_pid_nr(current), \ ino->i_sb->s_id, ino->i_ino, __FILE__, __LINE__, \ __func__, ##__VA_ARGS__) #else #define ext_debug(ino, fmt, ...)
no_printk(fmt, ##__VA_ARGS__) #endif #define ASSERT(assert) \ do { \ if (unlikely(!(assert))) { \ printk(KERN_EMERG \ "Assertion failure in %s() at %s:%d: '%s'\n", \ __func__, __FILE__, __LINE__, #assert); \ BUG(); \ } \ } while (0) /* data type for block offset of block group */ typedef int ext4_grpblk_t; /* data type for filesystem-wide blocks number */ typedef unsigned long long ext4_fsblk_t; /* data type for file logical block number */ typedef __u32 ext4_lblk_t; /* data type for block group number */ typedef unsigned int ext4_group_t; enum SHIFT_DIRECTION { SHIFT_LEFT = 0, SHIFT_RIGHT, }; /* * For each criterion, mballoc has a slightly different way of finding * the required blocks and, usually, the higher the criterion the slower * the allocation. We start at the lower criteria and keep falling back * to higher ones if we are not able to find any blocks. Lower (earlier) * criteria are faster. */ enum criteria { /* * Used when number of blocks needed is a power of 2. This * doesn't trigger any disk IO except prefetch and is the * fastest criterion. */ CR_POWER2_ALIGNED, /* * Tries to lookup in-memory data structures to find the most * suitable group that satisfies goal request. No disk IO * except block prefetch. */ CR_GOAL_LEN_FAST, /* * Same as CR_GOAL_LEN_FAST but is allowed to reduce the goal * length to the best available length for faster allocation. */ CR_BEST_AVAIL_LEN, /* * Reads each block group sequentially, performing disk IO if * necessary, to find suitable block group. Tries to * allocate goal length but might trim the request if nothing * is found after enough tries. */ CR_GOAL_LEN_SLOW, /* * Finds the first free set of blocks and allocates * those. This is only used in rare cases when * CR_GOAL_LEN_SLOW also fails to allocate anything. */ CR_ANY_FREE, /* * Number of criteria defined. */ EXT4_MB_NUM_CRS }; /* * Flags used in mballoc's allocation_context flags field. * * Also used to show what's going on for debugging purposes when the * flag field is exported via the tracepoint interface */ /* prefer goal again. length */ #define EXT4_MB_HINT_MERGE 0x0001 /* first blocks in the file */ #define EXT4_MB_HINT_FIRST 0x0008 /* data is being allocated */ #define EXT4_MB_HINT_DATA 0x0020 /* don't preallocate (for tails) */ #define EXT4_MB_HINT_NOPREALLOC 0x0040 /* allocate for locality group */ #define EXT4_MB_HINT_GROUP_ALLOC 0x0080 /* allocate goal blocks or none */ #define EXT4_MB_HINT_GOAL_ONLY 0x0100 /* goal is meaningful */ #define EXT4_MB_HINT_TRY_GOAL 0x0200 /* blocks already pre-reserved by delayed allocation */ #define EXT4_MB_DELALLOC_RESERVED 0x0400 /* We are doing stream allocation */ #define EXT4_MB_STREAM_ALLOC 0x0800 /* Use reserved root blocks if needed */ #define EXT4_MB_USE_ROOT_BLOCKS 0x1000 /* Use blocks from reserved pool */ #define EXT4_MB_USE_RESERVED 0x2000 /* Do strict check for free blocks while retrying block allocation */ #define EXT4_MB_STRICT_CHECK 0x4000 struct ext4_allocation_request { /* target inode for block we're allocating */ struct inode *inode; /* how many blocks we want to allocate */ unsigned int len; /* logical block in target inode */ ext4_lblk_t logical; /* the closest logical allocated block to the left */ ext4_lblk_t lleft; /* the closest logical allocated block to the right */ ext4_lblk_t lright; /* phys. target (a hint) */ ext4_fsblk_t goal; /* phys. block for the closest logical allocated block to the left */ ext4_fsblk_t pleft; /* phys. block for the closest logical allocated block to the right */ ext4_fsblk_t pright; /* flags.
see above EXT4_MB_HINT_* */ unsigned int flags; }; /* * Logical to physical block mapping, used by ext4_map_blocks() * * This structure is used to pass requests into ext4_map_blocks() as * well as to store the information returned by ext4_map_blocks(). It * takes less room on the stack than a struct buffer_head. */ #define EXT4_MAP_NEW BIT(BH_New) #define EXT4_MAP_MAPPED BIT(BH_Mapped) #define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten) #define EXT4_MAP_BOUNDARY BIT(BH_Boundary) #define EXT4_MAP_DELAYED BIT(BH_Delay) /* * This is for use in ext4_map_query_blocks() for a special case where we can * have a physically and logically contiguous blocks split across two leaf * nodes instead of a single extent. This is required in case of atomic writes * to know whether the returned extent is last in leaf. If yes, then lookup for * next in leaf block in ext4_map_query_blocks_next_in_leaf(). * - This is never going to be added to any buffer head state. * - We use the next available bit after BH_BITMAP_UPTODATE. */ #define EXT4_MAP_QUERY_LAST_IN_LEAF BIT(BH_BITMAP_UPTODATE + 1) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ EXT4_MAP_DELAYED | EXT4_MAP_QUERY_LAST_IN_LEAF) struct ext4_map_blocks { ext4_fsblk_t m_pblk; ext4_lblk_t m_lblk; unsigned int m_len; unsigned int m_flags; u64 m_seq; }; /* * Block validity checking, system zone rbtree. */ struct ext4_system_blocks { struct rb_root root; struct rcu_head rcu; }; /* * Flags for ext4_io_end->flags */ #define EXT4_IO_END_UNWRITTEN 0x0001 #define EXT4_IO_END_FAILED 0x0002 #define EXT4_IO_END_DEFER_COMPLETION (EXT4_IO_END_UNWRITTEN | EXT4_IO_END_FAILED) struct ext4_io_end_vec { struct list_head list; /* list of io_end_vec */ loff_t offset; /* offset in the file */ ssize_t size; /* size of the extent */ }; /* * For converting unwritten extents on a work queue. 'handle' is used for * buffered writeback. 
*/ typedef struct ext4_io_end { struct list_head list; /* per-file finished IO list */ handle_t *handle; /* handle reserved for extent * conversion */ struct inode *inode; /* file being written to */ struct bio *bio; /* Linked list of completed * bios covering the extent */ unsigned int flag; /* unwritten or not */ refcount_t count; /* reference counter */ struct list_head list_vec; /* list of ext4_io_end_vec */ } ext4_io_end_t; struct ext4_io_submit { struct writeback_control *io_wbc; struct bio *io_bio; ext4_io_end_t *io_end; sector_t io_next_block; }; /* * Special inodes numbers */ #define EXT4_BAD_INO 1 /* Bad blocks inode */ #define EXT4_ROOT_INO 2 /* Root inode */ #define EXT4_USR_QUOTA_INO 3 /* User quota inode */ #define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */ #define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ #define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ #define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ #define EXT4_JOURNAL_INO 8 /* Journal inode */ /* First non-reserved inode for old ext4 filesystems */ #define EXT4_GOOD_OLD_FIRST_INO 11 /* * Maximal count of links to a file */ #define EXT4_LINK_MAX 65000 /* * Macro-instructions used to manage several block sizes */ #define EXT4_MIN_BLOCK_SIZE 1024 #define EXT4_MAX_BLOCK_SIZE 65536 #define EXT4_MIN_BLOCK_LOG_SIZE 10 #define EXT4_MAX_BLOCK_LOG_SIZE 16 #define EXT4_MAX_CLUSTER_LOG_SIZE 30 #ifdef __KERNEL__ # define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) #else # define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) #endif #define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) #define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \ EXT4_SB(s)->s_cluster_bits) #ifdef __KERNEL__ # define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) # define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits) #else # define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) #endif #ifdef __KERNEL__ #define EXT4_ADDR_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_addr_per_block_bits) #define EXT4_INODE_SIZE(s) (EXT4_SB(s)->s_inode_size) #define EXT4_FIRST_INO(s) (EXT4_SB(s)->s_first_ino) #else #define EXT4_INODE_SIZE(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ EXT4_GOOD_OLD_INODE_SIZE : \ (s)->s_inode_size) #define EXT4_FIRST_INO(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? 
\ EXT4_GOOD_OLD_FIRST_INO : \ (s)->s_first_ino) #endif #define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) #define EXT4_MAX_BLOCKS(size, offset, blkbits) \ ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \ blkbits)) #define EXT4_B_TO_LBLK(inode, offset) \ (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits) #define EXT4_LBLK_TO_B(inode, lblk) ((loff_t)(lblk) << (inode)->i_blkbits) /* Translate a block number to a page index */ #define EXT4_LBLK_TO_PG(inode, lblk) (EXT4_LBLK_TO_B((inode), (lblk)) >> \ PAGE_SHIFT) /* Translate a page index to a block number */ #define EXT4_PG_TO_LBLK(inode, pnum) (((loff_t)(pnum) << PAGE_SHIFT) >> \ (inode)->i_blkbits) /* Translate a block number to a cluster number */ #define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) /* Translate a cluster number to a block number */ #define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits) /* Translate # of blks to # of clusters */ #define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ (sbi)->s_cluster_bits) /* Mask out the low bits to get the starting block of the cluster */ #define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \ ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) #define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) /* Fill in the low bits to get the last block of the cluster */ #define EXT4_LBLK_CFILL(sbi, lblk) ((lblk) | \ ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1)) /* Get the cluster offset */ #define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) #define EXT4_LBLK_COFF(s, lblk) ((lblk) & \ ((ext4_lblk_t) (s)->s_cluster_ratio - 1)) /* * Structure of a blocks group descriptor */ struct ext4_group_desc { __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ __le32 bg_inode_table_lo; /* Inodes table block */ __le16 bg_free_blocks_count_lo;/* Free blocks count */ __le16 bg_free_inodes_count_lo;/* Free inodes count */ __le16 bg_used_dirs_count_lo; /* Directories count */ __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ __le32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ __le16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */ __le16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */ __le16 bg_itable_unused_lo; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ __le32 bg_inode_table_hi; /* Inodes table block MSB */ __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ __le16 bg_used_dirs_count_hi; /* Directories count MSB */ __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ __le32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ __le16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */ __le16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */ __u32 bg_reserved; }; #define EXT4_BG_INODE_BITMAP_CSUM_HI_END \ (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \ sizeof(__le16)) #define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END \ (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \ sizeof(__le16)) /* * Structure of a flex block group info */ struct flex_groups { atomic64_t free_clusters; atomic_t free_inodes; atomic_t used_dirs; }; #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ #define EXT4_BG_BLOCK_UNINIT 
0x0002 /* Block bitmap not in use */ #define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ /* * Macro-instructions used to manage group descriptors */ #define EXT4_MIN_DESC_SIZE 32 #define EXT4_MIN_DESC_SIZE_64BIT 64 #define EXT4_MAX_DESC_SIZE EXT4_MIN_BLOCK_SIZE #define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size) #ifdef __KERNEL__ # define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) # define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group) # define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) # define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) # define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) #else # define EXT4_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) # define EXT4_DESC_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s)) # define EXT4_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) #endif /* * Constants relative to the data blocks */ #define EXT4_NDIR_BLOCKS 12 #define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS #define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1) #define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1) #define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1) /* * Inode flags */ #define EXT4_SECRM_FL 0x00000001 /* Secure deletion */ #define EXT4_UNRM_FL 0x00000002 /* Undelete */ #define EXT4_COMPR_FL 0x00000004 /* Compress file */ #define EXT4_SYNC_FL 0x00000008 /* Synchronous updates */ #define EXT4_IMMUTABLE_FL 0x00000010 /* Immutable file */ #define EXT4_APPEND_FL 0x00000020 /* writes to file may only append */ #define EXT4_NODUMP_FL 0x00000040 /* do not dump file */ #define EXT4_NOATIME_FL 0x00000080 /* do not update atime */ /* Reserved for compression usage... */ #define EXT4_DIRTY_FL 0x00000100 #define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ #define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ /* nb: was previously EXT2_ECOMPR_FL */ #define EXT4_ENCRYPT_FL 0x00000800 /* encrypted file */ /* End compression flags --- maybe not all used */ #define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ #define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ #define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ #define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */ #define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ #define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */ #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ /* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */ #define EXT4_DAX_FL 0x02000000 /* Inode is DAX */ #define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. 
*/ #define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded directory */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ /* User modifiable flags */ #define EXT4_FL_USER_MODIFIABLE (EXT4_SECRM_FL | \ EXT4_UNRM_FL | \ EXT4_COMPR_FL | \ EXT4_SYNC_FL | \ EXT4_IMMUTABLE_FL | \ EXT4_APPEND_FL | \ EXT4_NODUMP_FL | \ EXT4_NOATIME_FL | \ EXT4_JOURNAL_DATA_FL | \ EXT4_NOTAIL_FL | \ EXT4_DIRSYNC_FL | \ EXT4_TOPDIR_FL | \ EXT4_EXTENTS_FL | \ 0x00400000 /* EXT4_EOFBLOCKS_FL */ | \ EXT4_DAX_FL | \ EXT4_PROJINHERIT_FL | \ EXT4_CASEFOLD_FL) /* User visible flags */ #define EXT4_FL_USER_VISIBLE (EXT4_FL_USER_MODIFIABLE | \ EXT4_DIRTY_FL | \ EXT4_COMPRBLK_FL | \ EXT4_NOCOMPR_FL | \ EXT4_ENCRYPT_FL | \ EXT4_INDEX_FL | \ EXT4_VERITY_FL | \ EXT4_INLINE_DATA_FL) /* Flags that should be inherited by new inodes from their parent. */ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\ EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL |\ EXT4_DAX_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). */ #define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\ EXT4_PROJINHERIT_FL)) /* Flags that are appropriate for non-directories/regular files. */ #define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) /* The only flags that should be swapped */ #define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL) /* Flags which are mutually exclusive to DAX */ #define EXT4_DAX_MUT_EXCL (EXT4_VERITY_FL | EXT4_ENCRYPT_FL |\ EXT4_JOURNAL_DATA_FL | EXT4_INLINE_DATA_FL) /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) { if (S_ISDIR(mode)) return flags; else if (S_ISREG(mode)) return flags & EXT4_REG_FLMASK; else return flags & EXT4_OTHER_FLMASK; } /* * Inode flags used for atomic set/get */ enum { EXT4_INODE_SECRM = 0, /* Secure deletion */ EXT4_INODE_UNRM = 1, /* Undelete */ EXT4_INODE_COMPR = 2, /* Compress file */ EXT4_INODE_SYNC = 3, /* Synchronous updates */ EXT4_INODE_IMMUTABLE = 4, /* Immutable file */ EXT4_INODE_APPEND = 5, /* writes to file may only append */ EXT4_INODE_NODUMP = 6, /* do not dump file */ EXT4_INODE_NOATIME = 7, /* do not update atime */ /* Reserved for compression usage... */ EXT4_INODE_DIRTY = 8, EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ EXT4_INODE_NOCOMPR = 10, /* Don't compress */ EXT4_INODE_ENCRYPT = 11, /* Encrypted file */ /* End compression flags --- maybe not all used */ EXT4_INODE_INDEX = 12, /* hash-indexed directory */ EXT4_INODE_IMAGIC = 13, /* AFS directory */ EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */ EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */ EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */ EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ EXT4_INODE_VERITY = 20, /* Verity protected inode */ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ /* 22 was formerly EXT4_INODE_EOFBLOCKS */ EXT4_INODE_DAX = 25, /* Inode is DAX */ EXT4_INODE_INLINE_DATA = 28, /* Data in inode. 
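 * Illustratively, a small file or directory whose contents fit in
 * i_block[] (and, if needed, in the in-inode extended attribute space)
 * can be stored without allocating separate data blocks.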
*/ EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */ EXT4_INODE_CASEFOLD = 30, /* Casefolded directory */ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ }; /* * Since it's pretty easy to mix up bit numbers and hex values, we use a * build-time check to make sure that EXT4_XXX_FL is consistent with respect to * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost * any extra space in the compiled kernel image, otherwise, the build will fail. * It's important that these values are the same, since we are using * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk * values found in ext2, ext3 and ext4 filesystems, and of course the values * defined in e2fsprogs. * * It's not paranoia if the Murphy's Law really *is* out to get you. :-) */ #define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1U << EXT4_INODE_##FLAG)) #define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) static inline void ext4_check_flag_values(void) { CHECK_FLAG_VALUE(SECRM); CHECK_FLAG_VALUE(UNRM); CHECK_FLAG_VALUE(COMPR); CHECK_FLAG_VALUE(SYNC); CHECK_FLAG_VALUE(IMMUTABLE); CHECK_FLAG_VALUE(APPEND); CHECK_FLAG_VALUE(NODUMP); CHECK_FLAG_VALUE(NOATIME); CHECK_FLAG_VALUE(DIRTY); CHECK_FLAG_VALUE(COMPRBLK); CHECK_FLAG_VALUE(NOCOMPR); CHECK_FLAG_VALUE(ENCRYPT); CHECK_FLAG_VALUE(INDEX); CHECK_FLAG_VALUE(IMAGIC); CHECK_FLAG_VALUE(JOURNAL_DATA); CHECK_FLAG_VALUE(NOTAIL); CHECK_FLAG_VALUE(DIRSYNC); CHECK_FLAG_VALUE(TOPDIR); CHECK_FLAG_VALUE(HUGE_FILE); CHECK_FLAG_VALUE(EXTENTS); CHECK_FLAG_VALUE(VERITY); CHECK_FLAG_VALUE(EA_INODE); CHECK_FLAG_VALUE(INLINE_DATA); CHECK_FLAG_VALUE(PROJINHERIT); CHECK_FLAG_VALUE(CASEFOLD); CHECK_FLAG_VALUE(RESERVED); } #if defined(__KERNEL__) && defined(CONFIG_COMPAT) struct compat_ext4_new_group_input { u32 group; compat_u64 block_bitmap; compat_u64 inode_bitmap; compat_u64 inode_table; u32 blocks_count; u16 reserved_blocks; u16 unused; }; #endif /* The struct ext4_new_group_input in kernel space, with free_blocks_count */ struct ext4_new_group_data { __u32 group; __u64 block_bitmap; __u64 inode_bitmap; __u64 inode_table; __u32 blocks_count; __u16 reserved_blocks; __u16 mdata_blocks; __u32 free_clusters_count; }; /* Indexes used to index group tables in ext4_new_group_data */ enum { BLOCK_BITMAP = 0, /* block bitmap */ INODE_BITMAP, /* inode bitmap */ INODE_TABLE, /* inode tables */ GROUP_TABLE_COUNT, }; /* * Flags used by ext4_map_blocks() */ /* Allocate any needed blocks and/or convert an unwritten extent to be an initialized ext4 */ #define EXT4_GET_BLOCKS_CREATE 0x0001 /* Request the creation of an unwritten extent */ #define EXT4_GET_BLOCKS_UNWRIT_EXT 0x0002 #define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT (EXT4_GET_BLOCKS_UNWRIT_EXT|\ EXT4_GET_BLOCKS_CREATE) /* Caller is from the delayed allocation writeout path * finally doing the actual allocation of delayed blocks */ #define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 /* * This means that we cannot merge newly allocated extents, and if we * found an unwritten extent, we need to split it. */ #define EXT4_GET_BLOCKS_SPLIT_NOMERGE 0x0008 /* Convert unwritten extent to initialized. 
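 *
 * Illustrative use of the EXT4_GET_BLOCKS_* flags (a simplified sketch,
 * not a specific in-tree call site): a fallocate-style allocation would
 * typically fill a struct ext4_map_blocks with the logical range and call
 *
 *	map.m_lblk = lblk;
 *	map.m_len = len;
 *	ret = ext4_map_blocks(handle, inode, &map,
 *			      EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |
 *			      EXT4_GET_BLOCKS_NO_NORMALIZE);
 *
 * On success m_pblk/m_len/m_flags describe the created mapping, with
 * EXT4_MAP_UNWRITTEN set in m_flags for the unwritten case.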
 */
#define EXT4_GET_BLOCKS_CONVERT			0x0010
	/* Eventual metadata allocation (due to growing extent tree)
	 * should not fail, so try to use reserved blocks for that. */
#define EXT4_GET_BLOCKS_METADATA_NOFAIL		0x0020
	/* Don't normalize allocation size (used for fallocate) */
#define EXT4_GET_BLOCKS_NO_NORMALIZE		0x0040
	/* Convert written extents to unwritten */
#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN	0x0100
	/* Write zeros to newly created written extents */
#define EXT4_GET_BLOCKS_ZERO			0x0200
#define EXT4_GET_BLOCKS_CREATE_ZERO		(EXT4_GET_BLOCKS_CREATE |\
					EXT4_GET_BLOCKS_ZERO)
	/* Caller is in the context of data submission, such as writeback,
	 * fsync, etc. In particular, in the generic writeback path the
	 * caller will submit data before dropping the transaction handle.
	 * This allows jbd2 to avoid submitting data before commit. */
#define EXT4_GET_BLOCKS_IO_SUBMIT		0x0400
	/* Convert extent to initialized after IO complete */
#define EXT4_GET_BLOCKS_IO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT |\
					 EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |\
					 EXT4_GET_BLOCKS_IO_SUBMIT)
	/* Caller is in an atomic context; find the extent only if it has
	 * already been cached */
#define EXT4_GET_BLOCKS_CACHED_NOWAIT		0x0800
/*
 * An atomic write caller needs this to query in the slow path of the mixed
 * mapping case, when a contiguous extent can be split across two adjacent
 * leaf nodes. See EXT4_MAP_QUERY_LAST_IN_LEAF.
 */
#define EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF	0x1000

/*
 * The bit position of these flags must not overlap with any of the
 * EXT4_GET_BLOCKS_*. They are used by ext4_find_extent(),
 * read_extent_tree_block(), ext4_split_extent_at(),
 * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf().
 * EXT4_EX_NOCACHE is used to indicate that we shouldn't be
 * caching the extents when reading from the extent tree while a
 * truncate or punch hole operation is in progress.
 */
#define EXT4_EX_NOCACHE				0x40000000
#define EXT4_EX_FORCE_CACHE			0x20000000
#define EXT4_EX_NOFAIL				0x10000000

/*
 * ext4_map_query_blocks() uses this filter mask to select the flags that
 * should be passed down when looking up / querying the on-disk extent tree.
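 *
 * For example, a query issued with (EXT4_EX_NOCACHE |
 * EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF | EXT4_GET_BLOCKS_CREATE) keeps only
 * EXT4_EX_NOCACHE and EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF after the
 * "& EXT4_EX_QUERY_FILTER"; allocation flags such as
 * EXT4_GET_BLOCKS_CREATE are masked off before the on-disk lookup.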
*/ #define EXT4_EX_QUERY_FILTER (EXT4_EX_NOCACHE | EXT4_EX_FORCE_CACHE |\ EXT4_EX_NOFAIL |\ EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) /* * Flags used by ext4_free_blocks */ #define EXT4_FREE_BLOCKS_METADATA 0x0001 #define EXT4_FREE_BLOCKS_FORGET 0x0002 #define EXT4_FREE_BLOCKS_VALIDATED 0x0004 #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 #define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040 #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* * ioctl commands in 32 bit emulation */ #define EXT4_IOC32_GETVERSION _IOR('f', 3, int) #define EXT4_IOC32_SETVERSION _IOW('f', 4, int) #define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) #define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) #define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) #define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) #define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION #endif /* Max physical block we can address w/o extents */ #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF /* Max logical block we can support */ #define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFE /* * Structure of an inode on the disk */ struct ext4_inode { __le16 i_mode; /* File mode */ __le16 i_uid; /* Low 16 bits of Owner Uid */ __le32 i_size_lo; /* Size in bytes */ __le32 i_atime; /* Access time */ __le32 i_ctime; /* Inode Change time */ __le32 i_mtime; /* Modification time */ __le32 i_dtime; /* Deletion Time */ __le16 i_gid; /* Low 16 bits of Group Id */ __le16 i_links_count; /* Links count */ __le32 i_blocks_lo; /* Blocks count */ __le32 i_flags; /* File flags */ union { struct { __le32 l_i_version; } linux1; struct { __u32 h_i_translator; } hurd1; struct { __u32 m_i_reserved1; } masix1; } osd1; /* OS dependent 1 */ __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */ __le32 i_generation; /* File version (for NFS) */ __le32 i_file_acl_lo; /* File ACL */ __le32 i_size_high; __le32 i_obso_faddr; /* Obsoleted fragment address */ union { struct { __le16 l_i_blocks_high; /* were l_i_reserved1 */ __le16 l_i_file_acl_high; __le16 l_i_uid_high; /* these 2 fields */ __le16 l_i_gid_high; /* were reserved2[0] */ __le16 l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */ __le16 l_i_reserved; } linux2; struct { __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ __u16 h_i_mode_high; __u16 h_i_uid_high; __u16 h_i_gid_high; __u32 h_i_author; } hurd2; struct { __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ __le16 m_i_file_acl_high; __u32 m_i_reserved2[2]; } masix2; } osd2; /* OS dependent 2 */ __le16 i_extra_isize; __le16 i_checksum_hi; /* crc32c(uuid+inum+inode) BE */ __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ __le32 i_crtime; /* File Creation time */ __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ __le32 i_version_hi; /* high 32 bits for 64-bit version */ __le32 i_projid; /* Project ID */ }; #define EXT4_EPOCH_BITS 2 #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) #define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) /* * Extended fields will fit into an inode if the filesystem was formatted * with large inodes (-I 256 or larger) and there are not currently any EAs * consuming all of the available space. 
For new inodes we always reserve * enough space for the kernel's known extended fields, but for inodes * created with an old kernel this might not have been the case. None of * the extended inode fields is critical for correct filesystem operation. * This macro checks if a certain field fits in the inode. Note that * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize */ #define EXT4_FITS_IN_INODE(ext4_inode, einode, field) \ ((offsetof(typeof(*ext4_inode), field) + \ sizeof((ext4_inode)->field)) \ <= (EXT4_GOOD_OLD_INODE_SIZE + \ (einode)->i_extra_isize)) \ /* * We use an encoding that preserves the times for extra epoch "00": * * extra msb of adjust for signed * epoch 32-bit 32-bit tv_sec to * bits time decoded 64-bit tv_sec 64-bit tv_sec valid time range * 0 0 1 -0x80000000..-0x00000001 0x000000000 1901-12-13..1969-12-31 * 0 0 0 0x000000000..0x07fffffff 0x000000000 1970-01-01..2038-01-19 * 0 1 1 0x080000000..0x0ffffffff 0x100000000 2038-01-19..2106-02-07 * 0 1 0 0x100000000..0x17fffffff 0x100000000 2106-02-07..2174-02-25 * 1 0 1 0x180000000..0x1ffffffff 0x200000000 2174-02-25..2242-03-16 * 1 0 0 0x200000000..0x27fffffff 0x200000000 2242-03-16..2310-04-04 * 1 1 1 0x280000000..0x2ffffffff 0x300000000 2310-04-04..2378-04-22 * 1 1 0 0x300000000..0x37fffffff 0x300000000 2378-04-22..2446-05-10 * * Note that previous versions of the kernel on 64-bit systems would * incorrectly use extra epoch bits 1,1 for dates between 1901 and * 1970. e2fsck will correct this, assuming that it is run on the * affected filesystem before 2242. */ static inline __le32 ext4_encode_extra_time(struct timespec64 ts) { u32 extra = ((ts.tv_sec - (s32)ts.tv_sec) >> 32) & EXT4_EPOCH_MASK; return cpu_to_le32(extra | (ts.tv_nsec << EXT4_EPOCH_BITS)); } static inline struct timespec64 ext4_decode_extra_time(__le32 base, __le32 extra) { struct timespec64 ts = { .tv_sec = (signed)le32_to_cpu(base) }; if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) ts.tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; ts.tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; return ts; } #define EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, ts) \ do { \ if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \ (raw_inode)->xtime = cpu_to_le32((ts).tv_sec); \ (raw_inode)->xtime ## _extra = ext4_encode_extra_time(ts); \ } else \ (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (ts).tv_sec, S32_MIN, S32_MAX)); \ } while (0) #define EXT4_INODE_SET_ATIME(inode, raw_inode) \ EXT4_INODE_SET_XTIME_VAL(i_atime, inode, raw_inode, inode_get_atime(inode)) #define EXT4_INODE_SET_MTIME(inode, raw_inode) \ EXT4_INODE_SET_XTIME_VAL(i_mtime, inode, raw_inode, inode_get_mtime(inode)) #define EXT4_INODE_SET_CTIME(inode, raw_inode) \ EXT4_INODE_SET_XTIME_VAL(i_ctime, inode, raw_inode, inode_get_ctime(inode)) #define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ EXT4_INODE_SET_XTIME_VAL(xtime, &((einode)->vfs_inode), \ raw_inode, (einode)->xtime) #define EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode) \ (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra) ? 
\ ext4_decode_extra_time((raw_inode)->xtime, \ (raw_inode)->xtime ## _extra) : \ (struct timespec64) { \ .tv_sec = (signed)le32_to_cpu((raw_inode)->xtime) \ }) #define EXT4_INODE_GET_ATIME(inode, raw_inode) \ do { \ inode_set_atime_to_ts(inode, \ EXT4_INODE_GET_XTIME_VAL(i_atime, inode, raw_inode)); \ } while (0) #define EXT4_INODE_GET_MTIME(inode, raw_inode) \ do { \ inode_set_mtime_to_ts(inode, \ EXT4_INODE_GET_XTIME_VAL(i_mtime, inode, raw_inode)); \ } while (0) #define EXT4_INODE_GET_CTIME(inode, raw_inode) \ do { \ inode_set_ctime_to_ts(inode, \ EXT4_INODE_GET_XTIME_VAL(i_ctime, inode, raw_inode)); \ } while (0) #define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ do { \ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ (einode)->xtime = \ EXT4_INODE_GET_XTIME_VAL(xtime, &(einode->vfs_inode), \ raw_inode); \ else \ (einode)->xtime = (struct timespec64){0, 0}; \ } while (0) #define i_disk_version osd1.linux1.l_i_version #if defined(__KERNEL__) || defined(__linux__) #define i_reserved1 osd1.linux1.l_i_reserved1 #define i_file_acl_high osd2.linux2.l_i_file_acl_high #define i_blocks_high osd2.linux2.l_i_blocks_high #define i_uid_low i_uid #define i_gid_low i_gid #define i_uid_high osd2.linux2.l_i_uid_high #define i_gid_high osd2.linux2.l_i_gid_high #define i_checksum_lo osd2.linux2.l_i_checksum_lo #elif defined(__GNU__) #define i_translator osd1.hurd1.h_i_translator #define i_uid_high osd2.hurd2.h_i_uid_high #define i_gid_high osd2.hurd2.h_i_gid_high #define i_author osd2.hurd2.h_i_author #elif defined(__masix__) #define i_reserved1 osd1.masix1.m_i_reserved1 #define i_file_acl_high osd2.masix2.m_i_file_acl_high #define i_reserved2 osd2.masix2.m_i_reserved2 #endif /* defined(__KERNEL__) || defined(__linux__) */ #include "extents_status.h" #include "fast_commit.h" /* * Lock subclasses for i_data_sem in the ext4_inode_info structure. * * These are needed to avoid lockdep false positives when we need to * allocate blocks to the quota inode during ext4_map_blocks(), while * holding i_data_sem for a normal (non-quota) inode. Since we don't * do quota tracking for the quota inode, this avoids deadlock (as * well as infinite recursion, since it isn't turtles all the way * down...) * * I_DATA_SEM_NORMAL - Used for most inodes * I_DATA_SEM_OTHER - Used by move_inode.c for the second normal inode * where the second inode has larger inode number * than the first * I_DATA_SEM_QUOTA - Used for quota inodes only * I_DATA_SEM_EA - Used for ea_inodes only */ enum { I_DATA_SEM_NORMAL = 0, I_DATA_SEM_OTHER, I_DATA_SEM_QUOTA, I_DATA_SEM_EA }; /* * fourth extended file system inode data in memory */ struct ext4_inode_info { __le32 i_data[15]; /* unconverted */ __u32 i_dtime; ext4_fsblk_t i_file_acl; /* * i_block_group is the number of the block group which contains * this file's inode. Constant across the lifetime of the inode, * it is used for making block allocation decisions - we try to * place a file's data blocks near its inode block, and new inodes * near to their parent directory's inode. */ ext4_group_t i_block_group; ext4_lblk_t i_dir_start_lookup; #if (BITS_PER_LONG < 64) unsigned long i_state_flags; /* Dynamic state flags */ #endif unsigned long i_flags; /* * Extended attributes can be read independently of the main file * data. Taking i_rwsem even when reading would cause contention * between readers of EAs and writers of regular file data, so * instead we synchronize on xattr_sem when reading or changing * EAs. 
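 *
 * Illustratively (a sketch, not a specific in-tree call site): a reader
 * of EAs brackets the lookup with down_read(&EXT4_I(inode)->xattr_sem) /
 * up_read(), while code that creates or modifies EAs takes the semaphore
 * for write with down_write()/up_write().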
*/ struct rw_semaphore xattr_sem; /* * Inodes with EXT4_STATE_ORPHAN_FILE use i_orphan_idx. Otherwise * i_orphan is used. */ union { struct list_head i_orphan; /* unlinked but open inodes */ unsigned int i_orphan_idx; /* Index in orphan file */ }; /* Fast commit related info */ /* For tracking dentry create updates */ struct list_head i_fc_dilist; struct list_head i_fc_list; /* * inodes that need fast commit * protected by sbi->s_fc_lock. */ /* Start of lblk range that needs to be committed in this fast commit */ ext4_lblk_t i_fc_lblk_start; /* End of lblk range that needs to be committed in this fast commit */ ext4_lblk_t i_fc_lblk_len; spinlock_t i_raw_lock; /* protects updates to the raw inode */ /* * Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len * and inode's EXT4_FC_STATE_COMMITTING state bit. */ spinlock_t i_fc_lock; /* * i_disksize keeps track of what the inode size is ON DISK, not * in memory. During truncate, i_size is set to the new size by * the VFS prior to calling ext4_truncate(), but the filesystem won't * set i_disksize to 0 until the truncate is actually under way. * * The intent is that i_disksize always represents the blocks which * are used by this file. This allows recovery to restart truncate * on orphans if we crash during truncate. We actually write i_disksize * into the on-disk inode when writing inodes out, instead of i_size. * * The only time when i_disksize and i_size may be different is when * a truncate is in progress. The only things which change i_disksize * are ext4_get_block (growth) and ext4_truncate (shrinkth). */ loff_t i_disksize; /* * i_data_sem is for serialising ext4_truncate() against * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's * data tree are chopped off during truncate. We can't do that in * ext4 because whenever we perform intermediate commits during * truncate, the inode and all the metadata blocks *must* be in a * consistent state which allows truncation of the orphans to restart * during recovery. Hence we must fix the get_block-vs-truncate race * by other means, so we have i_data_sem. */ struct rw_semaphore i_data_sem; struct inode vfs_inode; struct jbd2_inode *jinode; struct mapping_metadata_bhs i_metadata_bhs; /* * File creation time. Its function is same as that of * struct timespec64 i_{a,c,m}time in the generic inode. */ struct timespec64 i_crtime; /* mballoc */ atomic_t i_prealloc_active; /* allocation reservation info for delalloc */ /* In case of bigalloc, this refer to clusters rather than blocks */ unsigned int i_reserved_data_blocks; struct rb_root i_prealloc_node; rwlock_t i_prealloc_lock; /* extents status tree */ struct ext4_es_tree i_es_tree; rwlock_t i_es_lock; struct list_head i_es_list; unsigned int i_es_all_nr; /* protected by i_es_lock */ unsigned int i_es_shk_nr; /* protected by i_es_lock */ ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for extents to shrink. Protected by i_es_lock */ u64 i_es_seq; /* Change counter for extents. Protected by i_es_lock */ /* ialloc */ ext4_group_t i_last_alloc_group; /* pending cluster reservations for bigalloc file systems */ struct ext4_pending_tree i_pending_tree; /* on-disk additional length */ __u16 i_extra_isize; /* Indicate the inline data space. 
*/ u16 i_inline_off; u16 i_inline_size; #ifdef CONFIG_QUOTA /* quota space reservation, managed internally by quota code */ qsize_t i_reserved_quota; #endif spinlock_t i_block_reservation_lock; /* Lock protecting lists below */ spinlock_t i_completed_io_lock; /* * Completed IOs that need unwritten extents handling and have * transaction reserved */ struct list_head i_rsv_conversion_list; struct work_struct i_rsv_conversion_work; /* * Transactions that contain inode's metadata needed to complete * fsync and fdatasync, respectively. */ tid_t i_sync_tid; tid_t i_datasync_tid; #ifdef CONFIG_QUOTA struct dquot __rcu *i_dquot[MAXQUOTAS]; #endif /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ __u32 i_csum_seed; kprojid_t i_projid; #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_inode_info *i_crypt_info; #endif }; /* * File system states */ #define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ #define EXT4_ERROR_FS 0x0002 /* Errors detected */ #define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */ #define EXT4_FC_REPLAY 0x0020 /* Fast commit replay ongoing */ /* * Misc. filesystem flags */ #define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ #define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ #define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ /* * Mount flags set via mount options or defaults */ #define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */ #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ #define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ #define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ #define EXT4_MOUNT_ERRORS_MASK 0x00070 #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ #ifdef CONFIG_FS_DAX #define EXT4_MOUNT_DAX_ALWAYS 0x00200 /* Direct Access */ #else #define EXT4_MOUNT_DAX_ALWAYS 0 #endif #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ #define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ #define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ #define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ #define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ #define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ #define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ #define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ #define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ #define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ #define EXT4_MOUNT_QUOTA 0x40000 /* Some quota option set */ #define EXT4_MOUNT_USRQUOTA 0x80000 /* "old" user quota, * enable enforcement for hidden * quota files */ #define EXT4_MOUNT_GRPQUOTA 0x100000 /* "old" group quota, enable * enforcement for hidden quota * files */ #define EXT4_MOUNT_PRJQUOTA 0x200000 /* Enable project quota * enforcement */ #define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */ #define EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS 0x4000000 #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define 
EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ #define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ /* * Mount flags set either automatically (could not be set by mount option) * based on per file system feature or property or in special cases such as * distinguishing between explicit mount option definition and default. */ #define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly specified delalloc */ #define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group size of blocksize * 8 blocks */ #define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated file systems */ #define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly specified journal checksum */ #define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */ #define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */ #define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */ #define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group * scanning in mballoc */ #define EXT4_MOUNT2_ABORT 0x00000100 /* Abort filesystem */ #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt #define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ EXT4_MOUNT_##opt #define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ EXT4_MOUNT_##opt) #define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \ ~EXT4_MOUNT2_##opt #define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \ EXT4_MOUNT2_##opt #define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ EXT4_MOUNT2_##opt) #define ext4_test_and_set_bit __test_and_set_bit_le #define ext4_set_bit __set_bit_le #define ext4_test_and_clear_bit __test_and_clear_bit_le #define ext4_clear_bit __clear_bit_le #define ext4_test_bit test_bit_le #define ext4_find_next_zero_bit find_next_zero_bit_le #define ext4_find_next_bit find_next_bit_le extern void mb_set_bits(void *bm, int cur, int len); /* * Maximal mount counts between two filesystem checks */ #define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ #define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ /* * Behaviour when detecting errors */ #define EXT4_ERRORS_CONTINUE 1 /* Continue execution */ #define EXT4_ERRORS_RO 2 /* Remount fs read-only */ #define EXT4_ERRORS_PANIC 3 /* Panic */ #define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE /* Metadata checksum algorithm codes */ #define EXT4_CRC32C_CHKSUM 1 #define EXT4_LABEL_MAX 16 /* * Structure of the super block */ struct ext4_super_block { /*00*/ __le32 s_inodes_count; /* Inodes count */ __le32 s_blocks_count_lo; /* Blocks count */ __le32 s_r_blocks_count_lo; /* Reserved blocks count */ __le32 s_free_blocks_count_lo; /* Free blocks count */ /*10*/ __le32 s_free_inodes_count; /* Free inodes count */ __le32 s_first_data_block; /* First Data Block */ __le32 s_log_block_size; /* Block size */ __le32 s_log_cluster_size; /* Allocation cluster size */ /*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ __le32 s_clusters_per_group; /* # Clusters per group */ __le32 s_inodes_per_group; /* # Inodes per group */ __le32 s_mtime; /* Mount time */ /*30*/ __le32 s_wtime; /* Write time */ __le16 s_mnt_count; /* Mount count */ __le16 s_max_mnt_count; /* Maximal mount count */ __le16 s_magic; /* Magic signature */ __le16 s_state; /* File system state */ __le16 s_errors; /* Behaviour when detecting errors */ __le16 
s_minor_rev_level; /* minor revision level */ /*40*/ __le32 s_lastcheck; /* time of last check */ __le32 s_checkinterval; /* max. time between checks */ __le32 s_creator_os; /* OS */ __le32 s_rev_level; /* Revision level */ /*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ __le16 s_def_resgid; /* Default gid for reserved blocks */ /* * These fields are for EXT4_DYNAMIC_REV superblocks only. * * Note: the difference between the compatible feature set and * the incompatible feature set is that if there is a bit set * in the incompatible feature set that the kernel doesn't * know about, it should refuse to mount the filesystem. * * e2fsck's requirements are more strict; if it doesn't know * about a feature in either the compatible or incompatible * feature set, it must abort and not try to meddle with * things it doesn't understand... */ __le32 s_first_ino; /* First non-reserved inode */ __le16 s_inode_size; /* size of inode structure */ __le16 s_block_group_nr; /* block group # of this superblock */ __le32 s_feature_compat; /* compatible feature set */ /*60*/ __le32 s_feature_incompat; /* incompatible feature set */ __le32 s_feature_ro_compat; /* readonly-compatible feature set */ /*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ /*78*/ char s_volume_name[EXT4_LABEL_MAX] __nonstring; /* volume name */ /*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */ /*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ /* * Performance hints. Directory preallocation should only * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. */ __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ /* * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set. 
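 * For an internal journal, s_journal_inum normally refers to the reserved
 * journal inode (EXT4_JOURNAL_INO); an external journal is identified by
 * s_journal_dev instead.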
*/ /*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ /*E0*/ __le32 s_journal_inum; /* inode number of journal file */ __le32 s_journal_dev; /* device number of journal file */ __le32 s_last_orphan; /* start of list of inodes to delete */ __le32 s_hash_seed[4]; /* HTREE hash seed */ __u8 s_def_hash_version; /* Default hash version to use */ __u8 s_jnl_backup_type; __le16 s_desc_size; /* size of group descriptor */ /*100*/ __le32 s_default_mount_opts; __le32 s_first_meta_bg; /* First metablock block group */ __le32 s_mkfs_time; /* When the filesystem was created */ __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ /* 64bit support valid if EXT4_FEATURE_INCOMPAT_64BIT */ /*150*/ __le32 s_blocks_count_hi; /* Blocks count */ __le32 s_r_blocks_count_hi; /* Reserved blocks count */ __le32 s_free_blocks_count_hi; /* Free blocks count */ __le16 s_min_extra_isize; /* All inodes have at least # bytes */ __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ __le32 s_flags; /* Miscellaneous flags */ __le16 s_raid_stride; /* RAID stride */ __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ __le64 s_mmp_block; /* Block for multi-mount protection */ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ __u8 s_log_groups_per_flex; /* FLEX_BG group size */ __u8 s_checksum_type; /* metadata checksum algorithm used */ __u8 s_encryption_level; /* versioning level for encryption */ __u8 s_reserved_pad; /* Padding to next 32bits */ __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ __le32 s_snapshot_inum; /* Inode number of active snapshot */ __le32 s_snapshot_id; /* sequential ID of active snapshot */ __le64 s_snapshot_r_blocks_count; /* reserved blocks for active snapshot's future use */ __le32 s_snapshot_list; /* inode number of the head of the on-disk snapshot list */ #define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count) __le32 s_error_count; /* number of fs errors */ __le32 s_first_error_time; /* first time an error happened */ __le32 s_first_error_ino; /* inode involved in first error */ __le64 s_first_error_block; /* block involved of first error */ __u8 s_first_error_func[32] __nonstring; /* function where the error happened */ __le32 s_first_error_line; /* line number where error happened */ __le32 s_last_error_time; /* most recent time of an error */ __le32 s_last_error_ino; /* inode involved in last error */ __le32 s_last_error_line; /* line number where error happened */ __le64 s_last_error_block; /* block involved of last error */ __u8 s_last_error_func[32] __nonstring; /* function where the error happened */ #define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) __u8 s_mount_opts[64]; __le32 s_usr_quota_inum; /* inode for tracking user quota */ __le32 s_grp_quota_inum; /* inode for tracking group quota */ __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */ __u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ __le32 s_lpf_ino; /* Location of the lost+found inode */ __le32 s_prj_quota_inum; /* inode for tracking project quota */ __le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */ __u8 s_wtime_hi; __u8 s_mtime_hi; __u8 s_mkfs_time_hi; __u8 s_lastcheck_hi; __u8 s_first_error_time_hi; __u8 s_last_error_time_hi; __u8 s_first_error_errcode; __u8 s_last_error_errcode; __le16 s_encoding; /* Filename charset encoding */ __le16 
s_encoding_flags; /* Filename charset encoding flags */ __le32 s_orphan_file_inum; /* Inode for tracking orphan inodes */ __le16 s_def_resuid_hi; __le16 s_def_resgid_hi; __le32 s_reserved[93]; /* Padding to the end of the block */ __le32 s_checksum; /* crc32c(superblock) */ }; #define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) #ifdef __KERNEL__ /* Number of quota types we support */ #define EXT4_MAXQUOTAS 3 #define EXT4_ENC_UTF8_12_1 1 /* Types of ext4 journal triggers */ enum ext4_journal_trigger_type { EXT4_JTR_ORPHAN_FILE, EXT4_JTR_NONE /* This must be the last entry for indexing to work! */ }; #define EXT4_JOURNAL_TRIGGER_COUNT EXT4_JTR_NONE struct ext4_journal_trigger { struct jbd2_buffer_trigger_type tr_triggers; struct super_block *sb; }; static inline struct ext4_journal_trigger *EXT4_TRIGGER( struct jbd2_buffer_trigger_type *trigger) { return container_of(trigger, struct ext4_journal_trigger, tr_triggers); } #define EXT4_ORPHAN_BLOCK_MAGIC 0x0b10ca04 /* Structure at the tail of orphan block */ struct ext4_orphan_block_tail { __le32 ob_magic; __le32 ob_checksum; }; static inline int ext4_inodes_per_orphan_block(struct super_block *sb) { return (sb->s_blocksize - sizeof(struct ext4_orphan_block_tail)) / sizeof(u32); } struct ext4_orphan_block { atomic_t ob_free_entries; /* Number of free orphan entries in block */ struct buffer_head *ob_bh; /* Buffer for orphan block */ }; /* * Info about orphan file. */ struct ext4_orphan_info { int of_blocks; /* Number of orphan blocks in a file */ __u32 of_csum_seed; /* Checksum seed for orphan file */ struct ext4_orphan_block *of_binfo; /* Array with info about orphan * file blocks */ }; /* * fourth extended-fs super-block data in memory */ struct ext4_sb_info { unsigned long s_desc_size; /* Size of a group descriptor in bytes */ unsigned long s_inodes_per_block;/* Number of inodes per block */ unsigned long s_blocks_per_group;/* Number of blocks in a group */ unsigned long s_clusters_per_group; /* Number of clusters in a group */ unsigned long s_inodes_per_group;/* Number of inodes in a group */ unsigned long s_itb_per_group; /* Number of inode table blocks per group */ unsigned long s_gdb_count; /* Number of group descriptor blocks */ unsigned long s_desc_per_block; /* Number of group descriptors per block */ ext4_group_t s_groups_count; /* Number of groups in the fs */ ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ unsigned long s_overhead; /* # of fs overhead clusters */ unsigned int s_cluster_ratio; /* Number of blocks per cluster */ unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */ loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ struct buffer_head * s_sbh; /* Buffer containing the super block */ struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ /* Array of bh's for the block group descriptors */ struct buffer_head * __rcu *s_group_desc; unsigned int s_mount_opt; unsigned int s_mount_opt2; unsigned long s_mount_flags; unsigned int s_def_mount_opt; unsigned int s_def_mount_opt2; ext4_fsblk_t s_sb_block; atomic64_t s_resv_clusters; kuid_t s_resuid; kgid_t s_resgid; unsigned short s_mount_state; unsigned short s_pad; int s_addr_per_block_bits; int s_desc_per_block_bits; int s_inode_size; int s_first_ino; unsigned int s_inode_readahead_blks; unsigned int s_inode_goal; u32 s_hash_seed[4]; int s_def_hash_version; int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */ struct percpu_counter s_freeclusters_counter; struct percpu_counter 
s_freeinodes_counter; struct percpu_counter s_dirs_counter; struct percpu_counter s_dirtyclusters_counter; struct percpu_counter s_sra_exceeded_retry_limit; struct blockgroup_lock *s_blockgroup_lock; struct proc_dir_entry *s_proc; struct kobject s_kobj; struct completion s_kobj_unregister; struct mutex s_error_notify_mutex; /* protects sysfs_notify vs kobject_del */ struct super_block *s_sb; struct buffer_head *s_mmp_bh; /* Journaling */ struct journal_s *s_journal; unsigned long s_ext4_flags; /* Ext4 superblock flags */ struct mutex s_orphan_lock; /* Protects on disk list changes */ struct list_head s_orphan; /* List of orphaned inodes in on disk list */ struct ext4_orphan_info s_orphan_info; unsigned long s_commit_interval; u32 s_max_batch_time; u32 s_min_batch_time; struct file *s_journal_bdev_file; #ifdef CONFIG_QUOTA /* Names of quota files with journalled quota */ char __rcu *s_qf_names[EXT4_MAXQUOTAS]; int s_jquota_fmt; /* Format of quota to use */ #endif unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ struct ext4_system_blocks __rcu *s_system_blks; #ifdef EXTENTS_STATS /* ext4 extents stats */ unsigned long s_ext_min; unsigned long s_ext_max; unsigned long s_depth_max; spinlock_t s_ext_stats_lock; unsigned long s_ext_blocks; unsigned long s_ext_extents; #endif /* for buddy allocator */ struct ext4_group_info ** __rcu *s_group_info; struct inode *s_buddy_cache; spinlock_t s_md_lock; unsigned short *s_mb_offsets; unsigned int *s_mb_maxs; unsigned int s_group_info_size; atomic_t s_mb_free_pending; struct list_head s_freed_data_list[2]; /* List of blocks to be freed after commit completed */ struct list_head s_discard_list; struct work_struct s_discard_work; atomic_t s_retry_alloc_pending; struct xarray *s_mb_avg_fragment_size; struct xarray *s_mb_largest_free_orders; /* tunables */ unsigned long s_stripe; unsigned int s_mb_max_linear_groups; unsigned int s_mb_stream_request; unsigned int s_mb_max_to_scan; unsigned int s_mb_min_to_scan; unsigned int s_mb_stats; unsigned int s_mb_order2_reqs; unsigned int s_mb_group_prealloc; unsigned int s_max_dir_size_kb; unsigned int s_mb_prefetch; unsigned int s_mb_prefetch_limit; unsigned int s_mb_best_avail_max_trim_order; unsigned int s_sb_update_sec; unsigned int s_sb_update_kb; /* where last allocation was done - for stream allocation */ ext4_group_t *s_mb_last_groups; unsigned int s_mb_nr_global_goals; /* stats for buddy allocator */ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ atomic_t s_bal_success; /* we found long enough chunks */ atomic_t s_bal_allocated; /* in blocks */ atomic_t s_bal_ex_scanned; /* total extents scanned */ atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS]; /* total extents scanned */ atomic_t s_bal_groups_scanned; /* number of groups scanned */ atomic_t s_bal_goals; /* goal hits */ atomic_t s_bal_stream_goals; /* stream allocation global goal hits */ atomic_t s_bal_len_goals; /* len goal hits */ atomic_t s_bal_breaks; /* too long searches */ atomic_t s_bal_2orders; /* 2^order hits */ atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS]; atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS]; atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */ atomic_t s_mb_buddies_generated; /* number of buddies generated */ atomic64_t s_mb_generation_time; atomic_t s_mb_lost_chunks; atomic_t s_mb_preallocated; atomic_t s_mb_discarded; atomic_t s_lock_busy; /* locality groups */ struct ext4_locality_group __percpu *s_locality_groups; /* for write statistics */ unsigned long 
s_sectors_written_start; u64 s_kbytes_written; /* the size of zero-out chunk */ unsigned int s_extent_max_zeroout_kb; unsigned int s_log_groups_per_flex; struct flex_groups * __rcu *s_flex_groups; ext4_group_t s_flex_groups_allocated; /* workqueue for reserved extent conversions (buffered io) */ struct workqueue_struct *rsv_conversion_wq; /* timer for periodic error stats printing */ struct timer_list s_err_report; /* timeout in seconds for s_err_report; 0 disables the timer. */ unsigned long s_err_report_sec; /* Lazy inode table initialization info */ struct ext4_li_request *s_li_request; /* Wait multiplier for lazy initialization thread */ unsigned int s_li_wait_mult; /* Kernel thread for multiple mount protection */ struct task_struct *s_mmp_tsk; /* record the last minlen when FITRIM is called. */ unsigned long s_last_trim_minblks; /* minimum folio order of a page cache allocation */ u16 s_min_folio_order; /* supported maximum folio order, 0 means not supported */ u16 s_max_folio_order; /* Precomputed FS UUID checksum for seeding other checksums */ __u32 s_csum_seed; /* Reclaim extents from extent status tree */ struct shrinker *s_es_shrinker; struct list_head s_es_list; /* List of inodes with reclaimable extents */ long s_es_nr_inode; struct ext4_es_stats s_es_stats; struct mb_cache *s_ea_block_cache; struct mb_cache *s_ea_inode_cache; spinlock_t s_es_lock ____cacheline_aligned_in_smp; /* Journal triggers for checksum computation */ struct ext4_journal_trigger s_journal_triggers[EXT4_JOURNAL_TRIGGER_COUNT]; /* Ratelimit ext4 messages. */ struct ratelimit_state s_err_ratelimit_state; struct ratelimit_state s_warning_ratelimit_state; struct ratelimit_state s_msg_ratelimit_state; atomic_t s_warning_count; atomic_t s_msg_count; /* Encryption policy for '-o test_dummy_encryption' */ struct fscrypt_dummy_policy s_dummy_enc_policy; /* * Barrier between writepages ops and changing any inode's JOURNAL_DATA * or EXTENTS flag or between writepages ops and changing DELALLOC or * DIOREAD_NOLOCK mount options on remount. */ struct percpu_rw_semaphore s_writepages_rwsem; struct dax_device *s_daxdev; u64 s_dax_part_off; #ifdef CONFIG_EXT4_DEBUG unsigned long s_simulate_fail; #endif /* Record the errseq of the backing block device */ errseq_t s_bdev_wb_err; spinlock_t s_bdev_wb_lock; /* Information about errors that happened during this mount */ spinlock_t s_error_lock; int s_add_error_count; int s_first_error_code; __u32 s_first_error_line; __u32 s_first_error_ino; __u64 s_first_error_block; const char *s_first_error_func; time64_t s_first_error_time; int s_last_error_code; __u32 s_last_error_line; __u32 s_last_error_ino; __u64 s_last_error_block; const char *s_last_error_func; time64_t s_last_error_time; /* * If we are in a context where we cannot update the on-disk * superblock, we queue the work here. This is used to update * the error information in the superblock, and for periodic * updates of the superblock called from the commit callback * function. */ struct work_struct s_sb_upd_work; /* Atomic write unit values in bytes */ unsigned int s_awu_min; unsigned int s_awu_max; /* Ext4 fast commit sub transaction ID */ atomic_t s_fc_subtid; /* * After commit starts, the main queue gets locked, and the further * updates get added in the staging queue. */ #define FC_Q_MAIN 0 #define FC_Q_STAGING 1 struct list_head s_fc_q[2]; /* Inodes staged for fast commit * that have data changes in them. 
*/ struct list_head s_fc_dentry_q[2]; /* directory entry updates */ unsigned int s_fc_bytes; /* * Main fast commit lock. This lock protects accesses to the * following fields: * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh. * * s_fc_lock can be taken from reclaim context (inode eviction) and is * thus reclaim unsafe. Use ext4_fc_lock()/ext4_fc_unlock() helpers * when acquiring / releasing the lock. */ struct mutex s_fc_lock; struct buffer_head *s_fc_bh; struct ext4_fc_stats s_fc_stats; tid_t s_fc_ineligible_tid; #ifdef CONFIG_EXT4_DEBUG int s_fc_debug_max_replay; #endif struct ext4_fc_replay_state s_fc_replay_state; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) { return sb->s_fs_info; } static inline struct ext4_inode_info *EXT4_I(struct inode *inode) { return container_of(inode, struct ext4_inode_info, vfs_inode); } static inline int ext4_writepages_down_read(struct super_block *sb) { percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem); return memalloc_nofs_save(); } static inline void ext4_writepages_up_read(struct super_block *sb, int ctx) { memalloc_nofs_restore(ctx); percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem); } static inline int ext4_writepages_down_write(struct super_block *sb) { percpu_down_write(&EXT4_SB(sb)->s_writepages_rwsem); return memalloc_nofs_save(); } static inline void ext4_writepages_up_write(struct super_block *sb, int ctx) { memalloc_nofs_restore(ctx); percpu_up_write(&EXT4_SB(sb)->s_writepages_rwsem); } static inline int ext4_fc_lock(struct super_block *sb) { mutex_lock(&EXT4_SB(sb)->s_fc_lock); return memalloc_nofs_save(); } static inline void ext4_fc_unlock(struct super_block *sb, int ctx) { memalloc_nofs_restore(ctx); mutex_unlock(&EXT4_SB(sb)->s_fc_lock); } static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) { return ino == EXT4_ROOT_INO || (ino >= EXT4_FIRST_INO(sb) && ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } static inline int ext4_get_resuid(struct ext4_super_block *es) { return le16_to_cpu(es->s_def_resuid) | le16_to_cpu(es->s_def_resuid_hi) << 16; } static inline int ext4_get_resgid(struct ext4_super_block *es) { return le16_to_cpu(es->s_def_resgid) | le16_to_cpu(es->s_def_resgid_hi) << 16; } /* * Returns: sbi->field[index] * Used to access an array element from the following sbi fields which require * rcu protection to avoid dereferencing an invalid pointer due to reassignment * - s_group_desc * - s_group_info * - s_flex_group */ #define sbi_array_rcu_deref(sbi, field, index) \ ({ \ typeof(*((sbi)->field)) _v; \ rcu_read_lock(); \ _v = ((typeof(_v)*)rcu_dereference((sbi)->field))[index]; \ rcu_read_unlock(); \ _v; \ }) /* * run-time mount flags */ enum { EXT4_MF_MNTDIR_SAMPLED, EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */ EXT4_MF_JOURNAL_DESTROY /* Journal is in process of destroying */ }; static inline void ext4_set_mount_flag(struct super_block *sb, int bit) { set_bit(bit, &EXT4_SB(sb)->s_mount_flags); } static inline void ext4_clear_mount_flag(struct super_block *sb, int bit) { clear_bit(bit, &EXT4_SB(sb)->s_mount_flags); } static inline int ext4_test_mount_flag(struct super_block *sb, int bit) { return test_bit(bit, &EXT4_SB(sb)->s_mount_flags); } /* * Simulate_fail codes */ #define EXT4_SIM_BBITMAP_EIO 1 #define EXT4_SIM_BBITMAP_CRC 2 #define EXT4_SIM_IBITMAP_EIO 3 #define EXT4_SIM_IBITMAP_CRC 4 #define EXT4_SIM_INODE_EIO 5 #define EXT4_SIM_INODE_CRC 6 #define EXT4_SIM_DIRBLOCK_EIO 7 #define EXT4_SIM_DIRBLOCK_CRC 8 static inline bool ext4_simulate_fail(struct 
super_block *sb, unsigned long code) { #ifdef CONFIG_EXT4_DEBUG struct ext4_sb_info *sbi = EXT4_SB(sb); if (unlikely(sbi->s_simulate_fail == code)) { sbi->s_simulate_fail = 0; return true; } #endif return false; } /* * Error number codes for s_{first,last}_error_errno * * Linux errno numbers are architecture specific, so we need to translate * them into something which is architecture independent. We don't define * codes for all errno's; just the ones which are most likely to be the cause * of an ext4_error() call. */ #define EXT4_ERR_UNKNOWN 1 #define EXT4_ERR_EIO 2 #define EXT4_ERR_ENOMEM 3 #define EXT4_ERR_EFSBADCRC 4 #define EXT4_ERR_EFSCORRUPTED 5 #define EXT4_ERR_ENOSPC 6 #define EXT4_ERR_ENOKEY 7 #define EXT4_ERR_EROFS 8 #define EXT4_ERR_EFBIG 9 #define EXT4_ERR_EEXIST 10 #define EXT4_ERR_ERANGE 11 #define EXT4_ERR_EOVERFLOW 12 #define EXT4_ERR_EBUSY 13 #define EXT4_ERR_ENOTDIR 14 #define EXT4_ERR_ENOTEMPTY 15 #define EXT4_ERR_ESHUTDOWN 16 #define EXT4_ERR_EFAULT 17 /* * Inode dynamic state flags */ enum { EXT4_STATE_NEW, /* inode is newly created */ EXT4_STATE_XATTR, /* has in-inode xattrs */ EXT4_STATE_NO_EXPAND, /* No space for expansion */ EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ EXT4_STATE_NEWENTRY, /* File just added to dir */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */ EXT4_STATE_FC_FLUSHING_DATA, /* Fast commit flushing data */ EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ { \ return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } \ static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ { \ set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } \ static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ { \ clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } /* Add these declarations here only so that these functions can be * found by name. Otherwise, they are very hard to locate. */ static inline int ext4_test_inode_flag(struct inode *inode, int bit); static inline void ext4_set_inode_flag(struct inode *inode, int bit); static inline void ext4_clear_inode_flag(struct inode *inode, int bit); EXT4_INODE_BIT_FNS(flag, flags, 0) /* Add these declarations here only so that these functions can be * found by name. Otherwise, they are very hard to locate. */ static inline int ext4_test_inode_state(struct inode *inode, int bit); static inline void ext4_set_inode_state(struct inode *inode, int bit); static inline void ext4_clear_inode_state(struct inode *inode, int bit); #if (BITS_PER_LONG < 64) EXT4_INODE_BIT_FNS(state, state_flags, 0) static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) { (ei)->i_state_flags = 0; } #else EXT4_INODE_BIT_FNS(state, flags, 32) static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) { /* We depend on the fact that callers will set i_flags */ } #endif #else /* Assume that user mode programs are passing in an ext4fs superblock, not * a kernel struct super_block. This will allow us to call the feature-test * macros from user land. 
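 *
 * For instance, with EXT4_SB() collapsing to the superblock pointer itself
 * (as defined just below), a tool that has read the on-disk
 * struct ext4_super_block can use macros such as EXT4_BLOCK_SIZE()
 * directly: with s_log_block_size == 2 this evaluates to
 * 1024 << 2 == 4096 bytes.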
*/ #define EXT4_SB(sb) (sb) #endif static inline bool ext4_verity_in_progress(struct inode *inode) { return IS_ENABLED(CONFIG_FS_VERITY) && ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); } #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime /* * Check whether the inode is tracked as orphan (either in orphan file or * orphan list). */ static inline bool ext4_inode_orphan_tracked(struct inode *inode) { return ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) || !list_empty(&EXT4_I(inode)->i_orphan); } /* * Codes for operating systems */ #define EXT4_OS_LINUX 0 #define EXT4_OS_HURD 1 #define EXT4_OS_MASIX 2 #define EXT4_OS_FREEBSD 3 #define EXT4_OS_LITES 4 /* * Revision levels */ #define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ #define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ #define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV #define EXT4_GOOD_OLD_INODE_SIZE 128 #define EXT4_EXTRA_TIMESTAMP_MAX (((s64)1 << 34) - 1 + S32_MIN) #define EXT4_NON_EXTRA_TIMESTAMP_MAX S32_MAX #define EXT4_TIMESTAMP_MIN S32_MIN /* * Feature set definitions */ #define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 #define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 #define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 #define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 #define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 #define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 #define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200 /* * The reason why "FAST_COMMIT" is a compat feature is that, FS becomes * incompatible only if fast commit blocks are present in the FS. Since we * clear the journal (and thus the fast commit blocks), we don't mark FS as * incompatible. We also have a JBD2 incompat feature, which gets set when * there are fast commit blocks present in the journal. */ #define EXT4_FEATURE_COMPAT_FAST_COMMIT 0x0400 #define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800 #define EXT4_FEATURE_COMPAT_ORPHAN_FILE 0x1000 /* Orphan file exists */ #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 #define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 #define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008 #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 #define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 #define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 /* * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM). When * METADATA_CSUM is set, group descriptor checksums use the same algorithm as * all other data structures' checksums. However, the METADATA_CSUM and * GDT_CSUM bits are mutually exclusive. 
*/ #define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 #define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000 #define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000 #define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000 #define EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT 0x10000 /* Orphan file may be non-empty */ #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 #define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ #define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ #define EXT4_FEATURE_INCOMPAT_META_BG 0x0010 #define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ #define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 #define EXT4_FEATURE_INCOMPAT_MMP 0x0100 #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ #define EXT4_FEATURE_INCOMPAT_CSUM_SEED 0x2000 #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ #define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ #define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 #define EXT4_FEATURE_INCOMPAT_CASEFOLD 0x20000 extern void ext4_update_dynamic_rev(struct super_block *sb); #define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \ static inline bool ext4_has_feature_##name(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_compat & \ cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname)) != 0); \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_compat |= \ cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ } \ static inline void ext4_clear_feature_##name(struct super_block *sb) \ { \ EXT4_SB(sb)->s_es->s_feature_compat &= \ ~cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ } #define EXT4_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ static inline bool ext4_has_feature_##name(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname)) != 0); \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_ro_compat |= \ cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ } \ static inline void ext4_clear_feature_##name(struct super_block *sb) \ { \ EXT4_SB(sb)->s_es->s_feature_ro_compat &= \ ~cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ } #define EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname) \ static inline bool ext4_has_feature_##name(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname)) != 0); \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_incompat |= \ cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ } \ static inline void ext4_clear_feature_##name(struct super_block *sb) \ { \ EXT4_SB(sb)->s_es->s_feature_incompat &= \ ~cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ } EXT4_FEATURE_COMPAT_FUNCS(dir_prealloc, DIR_PREALLOC) EXT4_FEATURE_COMPAT_FUNCS(imagic_inodes, IMAGIC_INODES) EXT4_FEATURE_COMPAT_FUNCS(journal, HAS_JOURNAL) EXT4_FEATURE_COMPAT_FUNCS(xattr, EXT_ATTR) EXT4_FEATURE_COMPAT_FUNCS(resize_inode, RESIZE_INODE) EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX) EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2) EXT4_FEATURE_COMPAT_FUNCS(fast_commit, FAST_COMMIT) EXT4_FEATURE_COMPAT_FUNCS(stable_inodes, STABLE_INODES) EXT4_FEATURE_COMPAT_FUNCS(orphan_file, 
ORPHAN_FILE) EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER) EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE) EXT4_FEATURE_RO_COMPAT_FUNCS(btree_dir, BTREE_DIR) EXT4_FEATURE_RO_COMPAT_FUNCS(huge_file, HUGE_FILE) EXT4_FEATURE_RO_COMPAT_FUNCS(gdt_csum, GDT_CSUM) EXT4_FEATURE_RO_COMPAT_FUNCS(dir_nlink, DIR_NLINK) EXT4_FEATURE_RO_COMPAT_FUNCS(extra_isize, EXTRA_ISIZE) EXT4_FEATURE_RO_COMPAT_FUNCS(quota, QUOTA) EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc, BIGALLOC) EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM) EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY) EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT) EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY) EXT4_FEATURE_RO_COMPAT_FUNCS(orphan_present, ORPHAN_PRESENT) EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION) EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE) EXT4_FEATURE_INCOMPAT_FUNCS(journal_needs_recovery, RECOVER) EXT4_FEATURE_INCOMPAT_FUNCS(journal_dev, JOURNAL_DEV) EXT4_FEATURE_INCOMPAT_FUNCS(meta_bg, META_BG) EXT4_FEATURE_INCOMPAT_FUNCS(extents, EXTENTS) EXT4_FEATURE_INCOMPAT_FUNCS(64bit, 64BIT) EXT4_FEATURE_INCOMPAT_FUNCS(mmp, MMP) EXT4_FEATURE_INCOMPAT_FUNCS(flex_bg, FLEX_BG) EXT4_FEATURE_INCOMPAT_FUNCS(ea_inode, EA_INODE) EXT4_FEATURE_INCOMPAT_FUNCS(dirdata, DIRDATA) EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed, CSUM_SEED) EXT4_FEATURE_INCOMPAT_FUNCS(largedir, LARGEDIR) EXT4_FEATURE_INCOMPAT_FUNCS(inline_data, INLINE_DATA) EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD) #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ EXT4_FEATURE_INCOMPAT_META_BG) #define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR) #define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ EXT4_FEATURE_INCOMPAT_RECOVER| \ EXT4_FEATURE_INCOMPAT_META_BG) #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR) #define EXT4_FEATURE_COMPAT_SUPP (EXT4_FEATURE_COMPAT_EXT_ATTR| \ EXT4_FEATURE_COMPAT_ORPHAN_FILE) #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ EXT4_FEATURE_INCOMPAT_RECOVER| \ EXT4_FEATURE_INCOMPAT_META_BG| \ EXT4_FEATURE_INCOMPAT_EXTENTS| \ EXT4_FEATURE_INCOMPAT_64BIT| \ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ EXT4_FEATURE_INCOMPAT_EA_INODE| \ EXT4_FEATURE_INCOMPAT_MMP | \ EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ EXT4_FEATURE_INCOMPAT_ENCRYPT | \ EXT4_FEATURE_INCOMPAT_CASEFOLD | \ EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ EXT4_FEATURE_INCOMPAT_LARGEDIR) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ EXT4_FEATURE_RO_COMPAT_QUOTA |\ EXT4_FEATURE_RO_COMPAT_PROJECT |\ EXT4_FEATURE_RO_COMPAT_VERITY |\ EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT) #define EXTN_FEATURE_FUNCS(ver) \ static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_compat & \ cpu_to_le32(~EXT##ver##_FEATURE_COMPAT_SUPP)) != 0); \ } \ static inline bool ext4_has_unknown_ext##ver##_ro_compat_features(struct 
super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ cpu_to_le32(~EXT##ver##_FEATURE_RO_COMPAT_SUPP)) != 0); \ } \ static inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ cpu_to_le32(~EXT##ver##_FEATURE_INCOMPAT_SUPP)) != 0); \ } EXTN_FEATURE_FUNCS(2) EXTN_FEATURE_FUNCS(3) EXTN_FEATURE_FUNCS(4) static inline bool ext4_has_compat_features(struct super_block *sb) { return (EXT4_SB(sb)->s_es->s_feature_compat != 0); } static inline bool ext4_has_ro_compat_features(struct super_block *sb) { return (EXT4_SB(sb)->s_es->s_feature_ro_compat != 0); } static inline bool ext4_has_incompat_features(struct super_block *sb) { return (EXT4_SB(sb)->s_es->s_feature_incompat != 0); } extern int ext4_feature_set_ok(struct super_block *sb, int readonly); /* * Superblock flags */ enum { EXT4_FLAGS_RESIZING, /* Avoid superblock update and resize race */ EXT4_FLAGS_SHUTDOWN, /* Prevent access to the file system */ EXT4_FLAGS_BDEV_IS_DAX, /* Current block device support DAX */ EXT4_FLAGS_EMERGENCY_RO,/* Emergency read-only due to fs errors */ }; static inline int ext4_forced_shutdown(struct super_block *sb) { return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); } static inline int ext4_emergency_ro(struct super_block *sb) { return test_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags); } static inline int ext4_emergency_state(struct super_block *sb) { if (unlikely(ext4_forced_shutdown(sb))) return -EIO; if (unlikely(ext4_emergency_ro(sb))) return -EROFS; return 0; } /* * Default values for user and/or group using reserved blocks */ #define EXT4_DEF_RESUID 0 #define EXT4_DEF_RESGID 0 /* * Default project ID */ #define EXT4_DEF_PROJID 0 #define EXT4_DEF_INODE_READAHEAD_BLKS 32 /* * Default mount options */ #define EXT4_DEFM_DEBUG 0x0001 #define EXT4_DEFM_BSDGROUPS 0x0002 #define EXT4_DEFM_XATTR_USER 0x0004 #define EXT4_DEFM_ACL 0x0008 #define EXT4_DEFM_UID16 0x0010 #define EXT4_DEFM_JMODE 0x0060 #define EXT4_DEFM_JMODE_DATA 0x0020 #define EXT4_DEFM_JMODE_ORDERED 0x0040 #define EXT4_DEFM_JMODE_WBACK 0x0060 #define EXT4_DEFM_NOBARRIER 0x0100 #define EXT4_DEFM_BLOCK_VALIDITY 0x0200 #define EXT4_DEFM_DISCARD 0x0400 #define EXT4_DEFM_NODELALLOC 0x0800 /* * Default journal batch times and ioprio. */ #define EXT4_DEF_MIN_BATCH_TIME 0 #define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ #define EXT4_DEF_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) /* * Default values for superblock update */ #define EXT4_DEF_SB_UPDATE_INTERVAL_SEC (3600) /* seconds (1 hour) */ #define EXT4_DEF_SB_UPDATE_INTERVAL_KB (16384) /* kilobytes (16MB) */ /* * Minimum number of groups in a flexgroup before we separate out * directories into the first block group of a flexgroup */ #define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4 /* * Structure of a directory entry */ #define EXT4_NAME_LEN 255 /* * Base length of the ext4 directory entry excluding the name length */ #define EXT4_BASE_DIR_LEN (sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN) struct ext4_dir_entry { __le32 inode; /* Inode number */ __le16 rec_len; /* Directory entry length */ __le16 name_len; /* Name length */ char name[EXT4_NAME_LEN]; /* File name */ }; /* * Encrypted Casefolded entries require saving the hash on disk. This structure * followed ext4_dir_entry_2's name[name_len] at the next 4 byte aligned * boundary. */ struct ext4_dir_entry_hash { __le32 hash; __le32 minor_hash; }; /* * The new version of the directory entry. 
Since EXT4 structures are * stored in intel byte order, and the name_len field could never be * bigger than 255 chars, it's safe to reclaim the extra byte for the * file_type field. */ struct ext4_dir_entry_2 { __le32 inode; /* Inode number */ __le16 rec_len; /* Directory entry length */ __u8 name_len; /* Name length */ __u8 file_type; /* See file type macros EXT4_FT_* below */ char name[EXT4_NAME_LEN]; /* File name */ }; /* * Access the hashes at the end of ext4_dir_entry_2 */ #define EXT4_DIRENT_HASHES(entry) \ ((struct ext4_dir_entry_hash *) \ (((void *)(entry)) + \ ((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND))) #define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(entry)->hash) #define EXT4_DIRENT_MINOR_HASH(entry) \ le32_to_cpu(EXT4_DIRENT_HASHES(entry)->minor_hash) static inline bool ext4_hash_in_dirent(const struct inode *inode) { return IS_CASEFOLDED(inode) && IS_ENCRYPTED(inode); } /* * This is a bogus directory entry at the end of each leaf block that * records checksums. */ struct ext4_dir_entry_tail { __le32 det_reserved_zero1; /* Pretend to be unused */ __le16 det_rec_len; /* 12 */ __u8 det_reserved_zero2; /* Zero name length */ __u8 det_reserved_ft; /* 0xDE, fake file type */ __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ }; #define EXT4_DIRENT_TAIL(block, blocksize) \ ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ ((blocksize) - \ sizeof(struct ext4_dir_entry_tail)))) /* * Ext4 directory file types. Only the low 3 bits are used. The * other bits are reserved for now. */ #define EXT4_FT_UNKNOWN 0 #define EXT4_FT_REG_FILE 1 #define EXT4_FT_DIR 2 #define EXT4_FT_CHRDEV 3 #define EXT4_FT_BLKDEV 4 #define EXT4_FT_FIFO 5 #define EXT4_FT_SOCK 6 #define EXT4_FT_SYMLINK 7 #define EXT4_FT_MAX 8 #define EXT4_FT_DIR_CSUM 0xDE /* * EXT4_DIR_PAD defines the directory entries boundaries * * NOTE: It must be a multiple of 4 */ #define EXT4_DIR_PAD 4 #define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) #define EXT4_MAX_REC_LEN ((1<<16)-1) /* * The rec_len is dependent on the type of directory. Directories that are * casefolded and encrypted need to store the hash as well, so we add room for * ext4_extended_dir_entry_2. For all entries related to '.' or '..' you should * pass NULL for dir, as those entries do not use the extra fields. 
*/ static inline unsigned int ext4_dir_rec_len(__u8 name_len, const struct inode *dir) { int rec_len = (name_len + 8 + EXT4_DIR_ROUND); if (dir && ext4_hash_in_dirent(dir)) rec_len += sizeof(struct ext4_dir_entry_hash); return (rec_len & ~EXT4_DIR_ROUND); } static inline unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) { unsigned len = le16_to_cpu(dlen); if (len == EXT4_MAX_REC_LEN || len == 0) return blocksize; return (len & 65532) | ((len & 3) << 16); } static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) { BUG_ON((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)); if (len < 65536) return cpu_to_le16(len); if (len == blocksize) { if (blocksize == 65536) return cpu_to_le16(EXT4_MAX_REC_LEN); else return cpu_to_le16(0); } return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); } /* * Hash Tree Directory indexing * (c) Daniel Phillips, 2001 */ #define is_dx(dir) (ext4_has_feature_dir_index((dir)->i_sb) && \ ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) #define EXT4_DIR_LINK_MAX(dir) unlikely((dir)->i_nlink >= EXT4_LINK_MAX && \ !(ext4_has_feature_dir_nlink((dir)->i_sb) && is_dx(dir))) #define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) /* Legal values for the dx_root hash_version field: */ #define DX_HASH_LEGACY 0 #define DX_HASH_HALF_MD4 1 #define DX_HASH_TEA 2 #define DX_HASH_LEGACY_UNSIGNED 3 #define DX_HASH_HALF_MD4_UNSIGNED 4 #define DX_HASH_TEA_UNSIGNED 5 #define DX_HASH_SIPHASH 6 #define DX_HASH_LAST DX_HASH_SIPHASH static inline u32 ext4_chksum(u32 crc, const void *address, unsigned int length) { return crc32c(crc, address, length); } #ifdef __KERNEL__ /* hash info structure used by the directory hash */ struct dx_hash_info { u32 hash; u32 minor_hash; int hash_version; u32 *seed; }; /* 32 and 64 bit signed EOF for dx directories */ #define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) #define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) /* * Control parameters used by ext4_htree_next_block */ #define HASH_NB_ALWAYS 1 struct ext4_filename { const struct qstr *usr_fname; struct fscrypt_str disk_name; struct dx_hash_info hinfo; #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_str crypto_buf; #endif #if IS_ENABLED(CONFIG_UNICODE) struct qstr cf_name; #endif }; #define fname_name(p) ((p)->disk_name.name) #define fname_usr_name(p) ((p)->usr_fname->name) #define fname_len(p) ((p)->disk_name.len) /* * Describe an inode's exact location on disk and in memory */ struct ext4_iloc { struct buffer_head *bh; unsigned long offset; ext4_group_t block_group; }; static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) { return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); } static inline bool ext4_is_quota_file(struct inode *inode) { return IS_NOQUOTA(inode) && !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL); } /* * This structure is stuffed into the struct file's private_data field * for directories. It is where we put information so that we can do * readdir operations in hash tree order. 
*/ struct dir_private_info { struct rb_root root; struct rb_node *curr_node; struct fname *extra_fname; loff_t last_pos; __u32 curr_hash; __u32 curr_minor_hash; __u32 next_hash; u64 cookie; bool initialized; }; /* calculate the first block number of the group */ static inline ext4_fsblk_t ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) { return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); } /* * Special error return code only used by dx_probe() and its callers. */ #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) /* htree levels for ext4 */ #define EXT4_HTREE_LEVEL_COMPAT 2 #define EXT4_HTREE_LEVEL 3 static inline int ext4_dir_htree_level(struct super_block *sb) { return ext4_has_feature_largedir(sb) ? EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; } /* * Timeout and state flag for lazy initialization inode thread. */ #define EXT4_DEF_LI_WAIT_MULT 10 #define EXT4_DEF_LI_MAX_START_DELAY 5 #define EXT4_LAZYINIT_QUIT 0x0001 #define EXT4_LAZYINIT_RUNNING 0x0002 /* * Lazy inode table initialization info */ struct ext4_lazy_init { unsigned long li_state; struct list_head li_request_list; struct mutex li_list_mtx; }; enum ext4_li_mode { EXT4_LI_MODE_PREFETCH_BBITMAP, EXT4_LI_MODE_ITABLE, }; struct ext4_li_request { struct super_block *lr_super; enum ext4_li_mode lr_mode; ext4_group_t lr_first_not_zeroed; ext4_group_t lr_next_group; struct list_head lr_request; unsigned long lr_next_sched; unsigned long lr_timeout; }; struct ext4_features { struct kobject f_kobj; struct completion f_kobj_unregister; }; /* * This structure will be used for multiple mount protection. It will be * written into the block number saved in the s_mmp_block field in the * superblock. Programs that check MMP should assume that if * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe * to use the filesystem, regardless of how old the timestamp is. */ #define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ #define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ #define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ #define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ struct mmp_struct { __le32 mmp_magic; /* Magic number for MMP */ __le32 mmp_seq; /* Sequence no. updated periodically */ /* * mmp_time, mmp_nodename & mmp_bdevname are only used for information * purposes and do not affect the correctness of the algorithm */ __le64 mmp_time; /* Time last updated */ char mmp_nodename[64]; /* Node which last updated MMP block */ char mmp_bdevname[32]; /* Bdev which last updated MMP block */ /* * mmp_check_interval is used to verify if the MMP block has been * updated on the block device. The value is updated based on the * maximum time to write the MMP block during an update cycle. */ __le16 mmp_check_interval; __le16 mmp_pad1; __le32 mmp_pad2[226]; __le32 mmp_checksum; /* crc32c(uuid+mmp_block) */ }; /* arguments passed to the mmp thread */ struct mmpd_data { struct buffer_head *bh; /* bh from initial read_mmp_block() */ struct super_block *sb; /* super block of the fs */ }; /* * Check interval multiplier * The MMP block is written every update interval and initially checked every * update interval x the multiplier (the value is then adapted based on the * write latency). The reason is that writes can be delayed under load and we * don't want readers to incorrectly assume that the filesystem is no longer * in use. 
*/ #define EXT4_MMP_CHECK_MULT 2UL /* * Minimum interval for MMP checking in seconds. */ #define EXT4_MMP_MIN_CHECK_INTERVAL 5UL /* * Maximum interval for MMP checking in seconds. */ #define EXT4_MMP_MAX_CHECK_INTERVAL 300UL /* * Function prototypes */ /* * Ok, these declarations are also in <linux/kernel.h> but none of the * ext4 source programs needs to include it so they are duplicated here. */ # define NORET_TYPE /**/ # define ATTRIB_NORET __attribute__((noreturn)) # define NORET_AND noreturn, /* bitmap.c */ extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); void ext4_inode_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh); int ext4_inode_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh); void ext4_block_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh); int ext4_block_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh); /* balloc.c */ extern void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); extern ext4_group_t ext4_get_group_number(struct super_block *sb, ext4_fsblk_t block); extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); extern unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group); extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned int flags, unsigned long *count, int *errp); extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags); extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, struct buffer_head ** bh); extern struct ext4_group_info *ext4_get_group_info(struct super_block *sb, ext4_group_t group); extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, bool ignore_locked); extern int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, struct buffer_head *bh); extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group); extern unsigned ext4_free_clusters_after_init(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); #if IS_ENABLED(CONFIG_UNICODE) extern int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, struct ext4_filename *fname); static inline void ext4_fname_free_ci_filename(struct ext4_filename *fname) { kfree(fname->cf_name.name); fname->cf_name.name = NULL; } #else static inline int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, struct ext4_filename *fname) { return 0; } static inline void ext4_fname_free_ci_filename(struct ext4_filename *fname) { } #endif /* ext4 encryption related stuff goes here crypto.c */ #ifdef CONFIG_FS_ENCRYPTION extern const struct fscrypt_operations ext4_cryptops; int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct ext4_filename *fname); int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, struct ext4_filename *fname); void ext4_fname_free_filename(struct ext4_filename *fname); int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void 
__user *arg); #else /* !CONFIG_FS_ENCRYPTION */ static inline int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct ext4_filename *fname) { fname->usr_fname = iname; fname->disk_name.name = (unsigned char *) iname->name; fname->disk_name.len = iname->len; return ext4_fname_setup_ci_filename(dir, iname, fname); } static inline int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, struct ext4_filename *fname) { return ext4_fname_setup_filename(dir, &dentry->d_name, 1, fname); } static inline void ext4_fname_free_filename(struct ext4_filename *fname) { ext4_fname_free_ci_filename(fname); } static inline int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } #endif /* !CONFIG_FS_ENCRYPTION */ /* dir.c */ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, struct file *, struct ext4_dir_entry_2 *, struct buffer_head *, char *, int, unsigned int); #define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ (de), (bh), (buf), (size), (offset))) extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent, struct fscrypt_str *ent_name); extern void ext4_htree_free_dir_info(struct dir_private_info *p); extern int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size, struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de); void ext4_insert_dentry(struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 *de, int buf_size, struct ext4_filename *fname); static inline void ext4_update_dx_flag(struct inode *inode) { if (!ext4_has_feature_dir_index(inode->i_sb) && ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { /* ext4_iget() should have caught this... 
*/ WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb)); ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); } } static const unsigned char ext4_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; static inline unsigned char get_dtype(struct super_block *sb, int filetype) { if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX) return DT_UNKNOWN; return ext4_filetype_table[filetype]; } extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size); /* fsync.c */ extern int ext4_sync_file(struct file *, loff_t, loff_t, int); /* hash.c */ extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len, struct dx_hash_info *hinfo); /* ialloc.c */ extern int ext4_mark_inode_used(struct super_block *sb, int ino); extern struct inode *__ext4_new_inode(struct mnt_idmap *, handle_t *, struct inode *, umode_t, const struct qstr *qstr, __u32 goal, uid_t *owner, __u32 i_flags, int handle_type, unsigned int line_no, int nblocks); #define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \ __ext4_new_inode(&nop_mnt_idmap, (handle), (dir), (mode), (qstr), \ (goal), (owner), i_flags, 0, 0, 0) #define ext4_new_inode_start_handle(idmap, dir, mode, qstr, goal, owner, \ type, nblocks) \ __ext4_new_inode((idmap), NULL, (dir), (mode), (qstr), (goal), (owner), \ 0, (type), __LINE__, (nblocks)) extern void ext4_free_inode(handle_t *, struct inode *); extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); extern unsigned long ext4_count_dirs(struct super_block *); extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier); extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); /* fast_commit.c */ int ext4_fc_info_show(struct seq_file *seq, void *v); void ext4_fc_init(struct super_block *sb, journal_t *journal); void ext4_fc_init_inode(struct inode *inode); void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end); void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode, struct dentry *dentry); void __ext4_fc_track_link(handle_t *handle, struct inode *inode, struct dentry *dentry); void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry); void ext4_fc_track_link(handle_t *handle, struct inode *inode, struct dentry *dentry); void __ext4_fc_track_create(handle_t *handle, struct inode *inode, struct dentry *dentry); void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); void ext4_fc_track_inode(handle_t *handle, struct inode *inode); void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle); void ext4_fc_del(struct inode *inode); bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block); void ext4_fc_replay_cleanup(struct super_block *sb); int ext4_fc_commit(journal_t *journal, tid_t commit_tid); int __init ext4_fc_init_dentry_cache(void); void ext4_fc_destroy_dentry_cache(void); int ext4_fc_record_regions(struct super_block *sb, int ino, ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay); /* mballoc.c */ extern const struct seq_operations ext4_mb_seq_groups_ops; extern const struct seq_operations ext4_mb_seq_structs_summary_ops; extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset); extern int ext4_mb_init(struct super_block *); extern void 
ext4_mb_release(struct super_block *); extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, struct ext4_allocation_request *, int *); extern void ext4_discard_preallocations(struct inode *); extern int __init ext4_init_mballoc(void); extern void ext4_exit_mballoc(void); extern ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, unsigned int nr, int *cnt); extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, unsigned int nr); extern void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block, unsigned long count, int flags); extern int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count); extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, int len, bool state); static inline bool ext4_mb_cr_expensive(enum criteria cr) { return cr >= CR_GOAL_LEN_SLOW; } /* inode.c */ void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei); int ext4_inode_is_fast_symlink(struct inode *inode); void ext4_check_map_extents_env(struct inode *inode); struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, bool wait, struct buffer_head **bhs); int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create); int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, struct buffer_head *head, unsigned from, unsigned to, int *partial, int (*fn)(handle_t *handle, struct inode *inode, struct buffer_head *bh)); int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh); void ext4_set_inode_mapping_order(struct inode *inode); #define FALL_BACK_TO_NONDELALLOC 1 #define CONVERT_INLINE_DATA 2 typedef enum { EXT4_IGET_NORMAL = 0, EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */ EXT4_IGET_HANDLE = 0x0002, /* Inode # is from a handle */ EXT4_IGET_BAD = 0x0004, /* Allow to iget a bad inode */ EXT4_IGET_EA_INODE = 0x0008 /* Inode should contain an EA value */ } ext4_iget_flags; extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, unsigned int line); #define ext4_iget(sb, ino, flags) \ __ext4_iget((sb), (ino), (flags), __func__, __LINE__) extern int ext4_write_inode(struct inode *, struct writeback_control *); extern int ext4_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern u32 ext4_dio_alignment(struct inode *inode); extern int ext4_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern void ext4_evict_inode(struct inode *); extern void ext4_clear_inode(struct inode *); extern int ext4_file_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern void ext4_dirty_inode(struct 
inode *, int); extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); extern int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino, struct ext4_iloc *iloc); extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); extern int ext4_truncate_page_cache_block_range(struct inode *inode, loff_t start, loff_t end); extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_chunk_trans_extent(struct inode *inode, int nrblocks); extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents); extern int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end); extern int ext4_zero_partial_blocks(struct inode *inode, loff_t lstart, loff_t length, bool *did_zero); extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); extern void ext4_da_release_space(struct inode *inode, int to_free); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len); static inline bool is_special_ino(struct super_block *sb, unsigned long ino) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; return (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || ino == le32_to_cpu(es->s_usr_quota_inum) || ino == le32_to_cpu(es->s_grp_quota_inum) || ino == le32_to_cpu(es->s_prj_quota_inum) || ino == le32_to_cpu(es->s_orphan_file_inum); } /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); extern void ext4_ind_truncate(handle_t *, struct inode *inode); extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); int ext4_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct file_kattr *fa); int ext4_fileattr_get(struct dentry *dentry, struct file_kattr *fa); extern void ext4_reset_inode_seed(struct inode *inode); int ext4_update_overhead(struct super_block *sb, bool force); int ext4_force_shutdown(struct super_block *sb, u32 flags); /* migrate.c */ extern int ext4_ext_migrate(struct inode *); extern int ext4_ind_migrate(struct inode *inode); /* namei.c */ extern int ext4_init_new_dir(handle_t *handle, struct inode *dir, struct inode *inode); extern int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash); extern int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, struct inode *dir, struct ext4_filename *fname, unsigned int offset, struct ext4_dir_entry_2 **res_dir); 
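/*
 * Editor's note -- illustrative sketch only, not upstream ext4 code. The
 * helpers declared above can be combined when an inode number arrives from
 * an untrusted source (for example a file handle): ext4_valid_inum()
 * bounds-checks the number, is_special_ino() filters out reserved and
 * internal inodes (root excepted, plus the quota and orphan-file inodes),
 * and ext4_iget() with EXT4_IGET_HANDLE performs the actual lookup. A
 * caller could guard the lookup roughly like this:
 *
 *	if (!ext4_valid_inum(sb, ino) || is_special_ino(sb, ino))
 *		return ERR_PTR(-ESTALE);
 *	inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE);
 */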
extern int ext4_generic_delete_entry(struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, void *entry_buf, int buf_size, int csum_size); extern bool ext4_empty_dir(struct inode *inode); /* resize.c */ extern void ext4_kvfree_array_rcu(void *to_free); extern int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input); extern int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t n_blocks_count); extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); extern unsigned int ext4_list_backups(struct super_block *sb, unsigned int *three, unsigned int *five, unsigned int *seven); /* super.c */ extern struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, blk_opf_t op_flags); extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block); extern struct buffer_head *ext4_sb_bread_nofail(struct super_block *sb, sector_t block); extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io, bool simu_fail); extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io, bool simu_fail); extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait); extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block); extern int ext4_seq_options_show(struct seq_file *seq, void *offset); extern int ext4_calculate_overhead(struct super_block *sb); extern __le32 ext4_superblock_csum(struct ext4_super_block *es); extern void ext4_superblock_csum_set(struct super_block *sb); extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); extern const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]); extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, ext4_group_t block_group, unsigned int flags); extern unsigned int ext4_num_base_meta_blocks(struct super_block *sb, ext4_group_t block_group); extern void print_daily_error_info(struct timer_list *t); extern __printf(7, 8) void __ext4_error(struct super_block *, const char *, unsigned int, bool, int, __u64, const char *, ...); extern __printf(6, 7) void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, int, const char *, ...); extern __printf(5, 6) void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, const char *, ...); extern void __ext4_std_error(struct super_block *, const char *, unsigned int, int); extern __printf(4, 5) void __ext4_warning(struct super_block *, const char *, unsigned int, const char *, ...); extern __printf(4, 5) void __ext4_warning_inode(const struct inode *inode, const char *function, unsigned int line, const char *fmt, ...); extern __printf(3, 4) void __ext4_msg(struct super_block *, const char *, const char *, ...); extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, const char *, unsigned int, const char *); extern __printf(7, 8) void __ext4_grp_locked_error(const char *, unsigned int, struct super_block *, ext4_group_t, u64, ext4_fsblk_t, const char *, ...); #define EXT4_ERROR_INODE(inode, fmt, a...) \ ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) #define EXT4_ERROR_INODE_ERR(inode, err, fmt, a...) \ __ext4_error_inode((inode), __func__, __LINE__, 0, (err), (fmt), ## a) #define ext4_error_inode_block(inode, block, err, fmt, a...) 
\ __ext4_error_inode((inode), __func__, __LINE__, (block), (err), \ (fmt), ## a) #define EXT4_ERROR_FILE(file, block, fmt, a...) \ ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) #define ext4_abort(sb, err, fmt, a...) \ __ext4_error((sb), __func__, __LINE__, true, (err), 0, (fmt), ## a) #ifdef CONFIG_PRINTK #define ext4_error_inode(inode, func, line, block, fmt, ...) \ __ext4_error_inode(inode, func, line, block, 0, fmt, ##__VA_ARGS__) #define ext4_error_inode_err(inode, func, line, block, err, fmt, ...) \ __ext4_error_inode((inode), (func), (line), (block), \ (err), (fmt), ##__VA_ARGS__) #define ext4_error_file(file, func, line, block, fmt, ...) \ __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) #define ext4_error(sb, fmt, ...) \ __ext4_error((sb), __func__, __LINE__, false, 0, 0, (fmt), \ ##__VA_ARGS__) #define ext4_error_err(sb, err, fmt, ...) \ __ext4_error((sb), __func__, __LINE__, false, (err), 0, (fmt), \ ##__VA_ARGS__) #define ext4_warning(sb, fmt, ...) \ __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) #define ext4_warning_inode(inode, fmt, ...) \ __ext4_warning_inode(inode, __func__, __LINE__, fmt, ##__VA_ARGS__) #define ext4_msg(sb, level, fmt, ...) \ __ext4_msg(sb, level, fmt, ##__VA_ARGS__) #define dump_mmp_msg(sb, mmp, msg) \ __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg) #define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \ fmt, ##__VA_ARGS__) #else #define ext4_error_inode(inode, func, line, block, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_error_inode(inode, "", 0, block, 0, " "); \ } while (0) #define ext4_error_inode_err(inode, func, line, block, err, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_error_inode(inode, "", 0, block, err, " "); \ } while (0) #define ext4_error_file(file, func, line, block, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_error_file(file, "", 0, block, " "); \ } while (0) #define ext4_error(sb, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_error(sb, "", 0, false, 0, 0, " "); \ } while (0) #define ext4_error_err(sb, err, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_error(sb, "", 0, false, err, 0, " "); \ } while (0) #define ext4_warning(sb, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_warning(sb, "", 0, " "); \ } while (0) #define ext4_warning_inode(inode, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_warning_inode(inode, "", 0, " "); \ } while (0) #define ext4_msg(sb, level, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_msg(sb, "", " "); \ } while (0) #define dump_mmp_msg(sb, mmp, msg) \ __dump_mmp_msg(sb, mmp, "", 0, "") #define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) 
\ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \ } while (0) #endif extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg); extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, struct ext4_group_desc *bg); extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, struct ext4_group_desc *bg); extern __u32 ext4_free_group_clusters(struct super_block *sb, struct ext4_group_desc *bg); extern __u32 ext4_free_inodes_count(struct super_block *sb, struct ext4_group_desc *bg); extern __u32 ext4_used_dirs_count(struct super_block *sb, struct ext4_group_desc *bg); extern __u32 ext4_itable_unused_count(struct super_block *sb, struct ext4_group_desc *bg); extern void ext4_block_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); extern void ext4_inode_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); extern void ext4_inode_table_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); extern void ext4_free_group_clusters_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern void ext4_free_inodes_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern void ext4_used_dirs_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern void ext4_itable_unused_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group, struct ext4_group_desc *gdp); extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, struct ext4_group_desc *gdp); extern int ext4_register_li_request(struct super_block *sb, ext4_group_t first_not_zeroed); static inline int ext4_has_group_desc_csum(struct super_block *sb) { return ext4_has_feature_gdt_csum(sb) || ext4_has_feature_metadata_csum(sb); } #define ext4_read_incompat_64bit_val(es, name) \ (((es)->s_feature_incompat & cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT) \ ? 
(ext4_fsblk_t)le32_to_cpu(es->name##_hi) << 32 : 0) | \ le32_to_cpu(es->name##_lo)) static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) { return ext4_read_incompat_64bit_val(es, s_blocks_count); } static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) { return ext4_read_incompat_64bit_val(es, s_r_blocks_count); } static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es) { return ext4_read_incompat_64bit_val(es, s_free_blocks_count); } static inline void ext4_blocks_count_set(struct ext4_super_block *es, ext4_fsblk_t blk) { es->s_blocks_count_lo = cpu_to_le32((u32)blk); es->s_blocks_count_hi = cpu_to_le32(blk >> 32); } static inline void ext4_free_blocks_count_set(struct ext4_super_block *es, ext4_fsblk_t blk) { es->s_free_blocks_count_lo = cpu_to_le32((u32)blk); es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32); } static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, ext4_fsblk_t blk) { es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); } static inline loff_t ext4_isize(struct super_block *sb, struct ext4_inode *raw_inode) { if (ext4_has_feature_largedir(sb) || S_ISREG(le16_to_cpu(raw_inode->i_mode))) return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | le32_to_cpu(raw_inode->i_size_lo); return (loff_t) le32_to_cpu(raw_inode->i_size_lo); } static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) { raw_inode->i_size_lo = cpu_to_le32(i_size); raw_inode->i_size_high = cpu_to_le32(i_size >> 32); } /* * Reading s_groups_count requires using smp_rmb() afterwards. See * the locking protocol documented in the comments of ext4_group_add() * in resize.c */ static inline ext4_group_t ext4_get_groups_count(struct super_block *sb) { ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; smp_rmb(); return ngroups; } static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, ext4_group_t block_group) { return block_group >> sbi->s_log_groups_per_flex; } static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) { return 1 << sbi->s_log_groups_per_flex; } static inline loff_t ext4_get_maxbytes(struct inode *inode) { if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return inode->i_sb->s_maxbytes; return EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; } #define ext4_std_error(sb, errno) \ do { \ if ((errno)) \ __ext4_std_error((sb), __func__, __LINE__, (errno)); \ } while (0) #ifdef CONFIG_SMP /* Each CPU can accumulate percpu_counter_batch clusters in their local * counters. So we need to make sure we have free clusters more * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. */ #define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) #else #define EXT4_FREECLUSTERS_WATERMARK 0 #endif /* Update i_disksize. Requires i_rwsem to avoid races with truncate */ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) { WARN_ON_ONCE(S_ISREG(inode->i_mode) && !inode_is_locked(inode)); down_write(&EXT4_I(inode)->i_data_sem); if (newsize > EXT4_I(inode)->i_disksize) WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize); up_write(&EXT4_I(inode)->i_data_sem); } /* Update i_size, i_disksize. 
Requires i_rwsem to avoid races with truncate */ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) { int changed = 0; if (newsize > inode->i_size) { i_size_write(inode, newsize); changed = 1; } if (newsize > EXT4_I(inode)->i_disksize) { ext4_update_i_disksize(inode, newsize); changed |= 2; } return changed; } int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, loff_t len); struct ext4_group_info { unsigned long bb_state; #ifdef AGGRESSIVE_CHECK unsigned long bb_check_counter; #endif struct rb_root bb_free_root; ext4_grpblk_t bb_first_free; /* first free block */ ext4_grpblk_t bb_free; /* total free blocks */ ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ int bb_avg_fragment_size_order; /* order of average fragment in BG */ ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ ext4_group_t bb_group; /* Group number */ struct list_head bb_prealloc_list; #ifdef DOUBLE_CHECK void *bb_bitmap; #endif struct rw_semaphore alloc_sem; ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block * regions, index is order. * bb_counters[3] = 5 means * 5 free 8-block regions. */ }; #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 #define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 #define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 #define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 #define EXT4_GROUP_INFO_BBITMAP_CORRUPT \ (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT) #define EXT4_GROUP_INFO_IBITMAP_CORRUPT \ (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT) #define EXT4_GROUP_INFO_BBITMAP_READ_BIT 4 #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \ (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \ (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_WAS_TRIMMED(grp) \ (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_SET_TRIMMED(grp) \ (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_TEST_AND_SET_READ(grp) \ (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state))) #define EXT4_MAX_CONTENTION 8 #define EXT4_CONTENTION_THRESHOLD 2 static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, ext4_group_t group) { return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); } /* * Returns true if the filesystem is busy enough that attempts to * access the block group locks has run into contention. */ static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) { return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); } static inline bool ext4_try_lock_group(struct super_block *sb, ext4_group_t group) { if (!spin_trylock(ext4_group_lock_ptr(sb, group))) return false; /* * We're able to grab the lock right away, so drop the lock * contention counter. */ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); return true; } static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) { if (!ext4_try_lock_group(sb, group)) { /* * The lock is busy, so bump the contention counter, * and then wait on the spin lock. 
*/ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, EXT4_MAX_CONTENTION); spin_lock(ext4_group_lock_ptr(sb, group)); } } static inline void ext4_unlock_group(struct super_block *sb, ext4_group_t group) { spin_unlock(ext4_group_lock_ptr(sb, group)); } #ifdef CONFIG_QUOTA static inline bool ext4_quota_capable(struct super_block *sb) { return (test_opt(sb, QUOTA) || ext4_has_feature_quota(sb)); } static inline bool ext4_is_quota_journalled(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); return (ext4_has_feature_quota(sb) || sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]); } int ext4_enable_quotas(struct super_block *sb); #endif /* * Block validity checking */ #define ext4_check_indirect_blockref(inode, bh) \ ext4_check_blockref(__func__, __LINE__, inode, \ (__le32 *)(bh)->b_data, \ EXT4_ADDR_PER_BLOCK((inode)->i_sb)) #define ext4_ind_check_inode(inode) \ ext4_check_blockref(__func__, __LINE__, inode, \ EXT4_I(inode)->i_data, \ EXT4_NDIR_BLOCKS) /* * Inodes and files operations */ /* dir.c */ extern const struct file_operations ext4_dir_operations; /* file.c */ extern const struct inode_operations ext4_file_inode_operations; extern const struct file_operations ext4_file_operations; extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); /* inline.c */ extern int ext4_get_max_inline_size(struct inode *inode); extern int ext4_find_inline_data_nolock(struct inode *inode); extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); extern void ext4_update_final_de(void *de_buf, int old_size, int new_size); int ext4_readpage_inline(struct inode *inode, struct folio *folio); extern int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, struct folio **foliop); int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct folio *folio); extern int ext4_generic_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, struct folio **foliop, void **fsdata, bool da); extern int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode); extern int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, struct inode *inode); extern int ext4_read_inline_dir(struct file *filp, struct dir_context *ctx, int *has_inline_data); extern int ext4_inlinedir_to_tree(struct file *dir_file, struct inode *dir, ext4_lblk_t block, struct dx_hash_info *hinfo, __u32 start_hash, __u32 start_minor_hash, int *has_inline_data); extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir, int *has_inline_data); extern int ext4_delete_inline_entry(handle_t *handle, struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, int *has_inline_data); extern bool empty_inline_dir(struct inode *dir, int *has_inline_data); extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, struct ext4_dir_entry_2 **parent_de, int *retval); extern void *ext4_read_inline_link(struct inode *inode); struct iomap; extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap); extern int ext4_inline_data_truncate(struct inode *inode, int *has_inline); extern int ext4_convert_inline_data(struct inode *inode); static inline int ext4_has_inline_data(struct inode *inode) { return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && EXT4_I(inode)->i_inline_off; } /* namei.c */ extern 
const struct inode_operations ext4_dir_inode_operations; extern const struct inode_operations ext4_special_inode_operations; extern struct dentry *ext4_get_parent(struct dentry *child); extern int ext4_init_dirblock(handle_t *handle, struct inode *inode, struct buffer_head *dir_block, unsigned int parent_ino, void *inline_buf, int inline_size); extern void ext4_initialize_dirent_tail(struct buffer_head *bh, unsigned int blocksize); extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, struct buffer_head *bh); extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name, struct inode *inode, struct dentry *dentry); extern int __ext4_link(struct inode *dir, struct inode *inode, const struct qstr *d_name, struct dentry *dentry); #define S_SHIFT 12 static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = { [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, }; static inline void ext4_set_de_type(struct super_block *sb, struct ext4_dir_entry_2 *de, umode_t mode) { if (ext4_has_feature_filetype(sb)) de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; } /* readpages.c */ int ext4_read_folio(struct file *file, struct folio *folio); void ext4_readahead(struct readahead_control *rac); extern int __init ext4_init_post_read_processing(void); extern void ext4_exit_post_read_processing(void); /* symlink.c */ extern const struct inode_operations ext4_encrypted_symlink_inode_operations; extern const struct inode_operations ext4_symlink_inode_operations; extern const struct inode_operations ext4_fast_symlink_inode_operations; /* sysfs.c */ extern void ext4_notify_error_sysfs(struct ext4_sb_info *sbi); extern int ext4_register_sysfs(struct super_block *sb); extern void ext4_unregister_sysfs(struct super_block *sb); extern int __init ext4_init_sysfs(void); extern void ext4_exit_sysfs(void); /* block_validity */ extern void ext4_release_system_zone(struct super_block *sb); extern int ext4_setup_system_zone(struct super_block *sb); extern int __init ext4_init_system_zone(void); extern void ext4_exit_system_zone(void); extern int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk, unsigned int count); extern int ext4_check_blockref(const char *, unsigned int, struct inode *, __le32 *, unsigned int); extern int ext4_sb_block_valid(struct super_block *sb, struct inode *inode, ext4_fsblk_t start_blk, unsigned int count); /* extents.c */ struct ext4_ext_path; struct ext4_extent; /* * Maximum number of logical blocks in a file; ext4_extent's ee_block is * __le32. 
*/ #define EXT_MAX_BLOCKS 0xffffffff extern void ext4_ext_tree_init(handle_t *handle, struct inode *inode); extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_ext_truncate(handle_t *, struct inode *); extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len); extern int ext4_convert_unwritten_extents_atomic(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len); extern int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_map_query_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_map_create_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, struct ext4_ext_path *path); extern struct ext4_ext_path *ext4_ext_insert_extent( handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *newext, int gb_flags); extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, struct ext4_ext_path *, int flags); extern void ext4_free_ext_path(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); extern int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); extern int ext4_ext_precache(struct inode *inode); extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, ext4_lblk_t count, int mark_unwritten,int *err); extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu); extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode, int check_cred, int restart_cred, int revoke_cred); extern void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end); extern int ext4_ext_replay_set_iblocks(struct inode *inode); extern int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, int len, int unwritten, ext4_fsblk_t pblk); extern int ext4_ext_clear_bb(struct inode *inode); /* move_extent.c */ extern void ext4_double_down_write_data_sem(struct inode *first, struct inode *second); extern void ext4_double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode); extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, __u64 len, __u64 *moved_len); /* page-io.c */ extern int __init ext4_init_pageio(void); extern void ext4_exit_pageio(void); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); extern int ext4_put_io_end(ext4_io_end_t *io_end); extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); extern void ext4_io_submit_init(struct ext4_io_submit *io, struct 
writeback_control *wbc); extern void ext4_end_io_rsv_work(struct work_struct *work); extern void ext4_io_submit(struct ext4_io_submit *io); int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page, size_t len); extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end); extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end); /* mmp.c */ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); /* mmp.c */ extern void ext4_stop_mmpd(struct ext4_sb_info *sbi); /* verity.c */ extern const struct fsverity_operations ext4_verityops; /* orphan.c */ extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es); extern void ext4_release_orphan_info(struct super_block *sb); extern int ext4_init_orphan_info(struct super_block *sb); extern int ext4_orphan_file_empty(struct super_block *sb); extern void ext4_orphan_file_block_trigger( struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh, void *data, size_t size); /* * Add new method to test whether block and inode bitmaps are properly * initialized. With uninit_bg reading the block from disk is not enough * to mark the bitmap uptodate. We need to also zero-out the bitmap */ #define BH_BITMAP_UPTODATE BH_JBDPrivateStart static inline int bitmap_uptodate(struct buffer_head *bh) { return (buffer_uptodate(bh) && test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state)); } static inline void set_bitmap_uptodate(struct buffer_head *bh) { set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); } extern int ext4_resize_begin(struct super_block *sb); extern int ext4_resize_end(struct super_block *sb, bool update_backups); static inline void ext4_set_io_unwritten_flag(struct ext4_io_end *io_end) { if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) io_end->flag |= EXT4_IO_END_UNWRITTEN; } static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) { if (io_end->flag & EXT4_IO_END_UNWRITTEN) io_end->flag &= ~EXT4_IO_END_UNWRITTEN; } extern const struct iomap_ops ext4_iomap_ops; extern const struct iomap_ops ext4_iomap_report_ops; static inline int ext4_buffer_uptodate(struct buffer_head *bh) { /* * If the buffer has the write error flag, we have failed * to write out data in the block. In this case, we don't * have to read the block because we may read the old data * successfully. */ if (buffer_write_io_error(bh)) set_buffer_uptodate(bh); return buffer_uptodate(bh); } static inline bool ext4_inode_can_atomic_write(struct inode *inode) { return S_ISREG(inode->i_mode) && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && EXT4_SB(inode->i_sb)->s_awu_min > 0; } extern int ext4_block_write_begin(handle_t *handle, struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block); #if IS_ENABLED(CONFIG_EXT4_KUNIT_TESTS) #define EXPORT_SYMBOL_FOR_EXT4_TEST(sym) \ EXPORT_SYMBOL_FOR_MODULES(sym, "ext4-test") #endif #endif /* __KERNEL__ */ #endif /* _EXT4_H */ |
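/*
 * Editor's note: a minimal usage sketch, not part of ext4.h above, showing
 * how the per-group spinlock declared earlier is typically paired around a
 * bitmap update. The function name and arguments are hypothetical.
 */
static void example_mark_block_in_use(struct super_block *sb,
                                      ext4_group_t group, int bit,
                                      struct buffer_head *bitmap_bh)
{
        ext4_lock_group(sb, group);
        /* bitmap (and group descriptor) updates are done under the group lock */
        ext4_set_bit(bit, bitmap_bh->b_data);
        ext4_unlock_group(sb, group);
}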
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MEMREMAP_H_ #define _LINUX_MEMREMAP_H_ #include <linux/mmzone.h> #include <linux/range.h> #include <linux/ioport.h> #include <linux/percpu-refcount.h> struct resource; struct device; /** * struct vmem_altmap - pre-allocated storage for vmemmap_populate * @base_pfn: base of the entire dev_pagemap mapping * @reserve: pages mapped, but reserved for driver use (relative to @base) * @free: free pages set aside in the mapping for memmap storage * @align: pages reserved to meet allocation alignments * @alloc: track pages consumed, private to vmemmap_populate() */ struct vmem_altmap { unsigned long base_pfn; const unsigned long end_pfn; const unsigned long reserve; unsigned long free; unsigned long align; unsigned long alloc; }; /* * Specialize ZONE_DEVICE memory into multiple types, each of which has a * different usage. * * MEMORY_DEVICE_PRIVATE: * Device memory that is not directly addressable by the CPU: CPU can neither * read nor write private memory. In this case, we do still have struct pages * backing the device memory. Doing so simplifies the implementation, but it is * important to remember that there are certain points at which the struct page * must be treated as an opaque object, rather than a "normal" struct page. * * A more complete discussion of unaddressable memory may be found in * include/linux/hmm.h and Documentation/mm/hmm.rst. * * MEMORY_DEVICE_COHERENT: * Device memory that is cache coherent from device and CPU point of view. This * is used on platforms that have an advanced system bus (like CAPI or CXL). A * driver can hotplug the device memory using ZONE_DEVICE and with that memory * type. Any page of a process can be migrated to such memory. However no one * should be allowed to pin such memory so that it can always be evicted. * * MEMORY_DEVICE_FS_DAX: * Host memory that has similar access semantics as System RAM i.e. DMA * coherent and supports page pinning. In support of coordinating page * pinning vs other operations MEMORY_DEVICE_FS_DAX arranges for a * wakeup event whenever a page is unpinned and becomes idle. This * wakeup is used to coordinate physical address space management (ex: * fs truncate/hole punch) vs pinned pages (ex: device dma). * * MEMORY_DEVICE_GENERIC: * Host memory that has similar access semantics as System RAM i.e. DMA * coherent and supports page pinning.
This is for example used by DAX devices * that expose memory using a character device. * * MEMORY_DEVICE_PCI_P2PDMA: * Device memory residing in a PCI BAR intended for use with Peer-to-Peer * transactions. */ enum memory_type { /* 0 is reserved to catch uninitialized type fields */ MEMORY_DEVICE_PRIVATE = 1, MEMORY_DEVICE_COHERENT, MEMORY_DEVICE_FS_DAX, MEMORY_DEVICE_GENERIC, MEMORY_DEVICE_PCI_P2PDMA, }; struct dev_pagemap_ops { /* * Called once the folio refcount reaches 0. The reference count will be * reset to one by the core code after the method is called to prepare * for handing out the folio again. */ void (*folio_free)(struct folio *folio); /* * Used for private (un-addressable) device memory only. Must migrate * the page back to a CPU accessible page. */ vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf); /* * Handle a memory failure that happens on a range of pfns. Notify the * processes who are using these pfns, and try to recover the data on * them if necessary. The mf_flags is finally passed to the recovery * function through the whole notify routine. * * When this is not implemented, or it returns -EOPNOTSUPP, the caller * will fall back to a common handler called mf_generic_kill_procs(). */ int (*memory_failure)(struct dev_pagemap *pgmap, unsigned long pfn, unsigned long nr_pages, int mf_flags); /* * Used for private (un-addressable) device memory only. * This callback is used when a folio is split into * smaller folios. */ void (*folio_split)(struct folio *head, struct folio *tail); }; #define PGMAP_ALTMAP_VALID (1 << 0) /** * struct dev_pagemap - metadata for ZONE_DEVICE mappings * @altmap: pre-allocated/reserved memory for vmemmap allocations * @ref: reference count that pins the devm_memremap_pages() mapping * @done: completion for @ref * @type: memory type: see MEMORY_* above in memremap.h * @flags: PGMAP_* flags to specify detailed behavior * @vmemmap_shift: structural definition of how the vmemmap page metadata * is populated, specifically the metadata page order. * A zero value (default) uses base pages as the vmemmap metadata * representation. A bigger value will set up compound struct pages * of the requested order value. * @ops: method table * @owner: an opaque pointer identifying the entity that manages this * instance. Used by various helpers to make sure that no * foreign ZONE_DEVICE memory is accessed.
* @nr_range: number of ranges to be mapped * @range: range to be mapped when nr_range == 1 * @ranges: array of ranges to be mapped when nr_range > 1 */ struct dev_pagemap { struct vmem_altmap altmap; struct percpu_ref ref; struct completion done; enum memory_type type; unsigned int flags; unsigned long vmemmap_shift; const struct dev_pagemap_ops *ops; void *owner; int nr_range; union { struct range range; DECLARE_FLEX_ARRAY(struct range, ranges); }; }; static inline bool pgmap_has_memory_failure(struct dev_pagemap *pgmap) { return pgmap->ops && pgmap->ops->memory_failure; } static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap) { if (pgmap->flags & PGMAP_ALTMAP_VALID) return &pgmap->altmap; return NULL; } static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap) { return 1 << pgmap->vmemmap_shift; } static inline bool folio_is_device_private(const struct folio *folio) { return IS_ENABLED(CONFIG_DEVICE_PRIVATE) && folio_is_zone_device(folio) && folio->pgmap->type == MEMORY_DEVICE_PRIVATE; } static inline bool is_device_private_page(const struct page *page) { return IS_ENABLED(CONFIG_DEVICE_PRIVATE) && folio_is_device_private(page_folio(page)); } static inline bool folio_is_pci_p2pdma(const struct folio *folio) { return IS_ENABLED(CONFIG_PCI_P2PDMA) && folio_is_zone_device(folio) && folio->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; } static inline void *folio_zone_device_data(const struct folio *folio) { VM_WARN_ON_FOLIO(!folio_is_device_private(folio), folio); return folio->page.zone_device_data; } static inline void folio_set_zone_device_data(struct folio *folio, void *data) { VM_WARN_ON_FOLIO(!folio_is_device_private(folio), folio); folio->page.zone_device_data = data; } static inline bool is_pci_p2pdma_page(const struct page *page) { return IS_ENABLED(CONFIG_PCI_P2PDMA) && folio_is_pci_p2pdma(page_folio(page)); } static inline bool folio_is_device_coherent(const struct folio *folio) { return folio_is_zone_device(folio) && folio->pgmap->type == MEMORY_DEVICE_COHERENT; } static inline bool is_device_coherent_page(const struct page *page) { return folio_is_device_coherent(page_folio(page)); } static inline bool folio_is_fsdax(const struct folio *folio) { return folio_is_zone_device(folio) && folio->pgmap->type == MEMORY_DEVICE_FS_DAX; } static inline bool is_fsdax_page(const struct page *page) { return folio_is_fsdax(page_folio(page)); } #ifdef CONFIG_ZONE_DEVICE void zone_device_page_init(struct page *page, struct dev_pagemap *pgmap, unsigned int order); void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap); struct dev_pagemap *get_dev_pagemap(unsigned long pfn); bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn); unsigned long memremap_compat_align(void); static inline void zone_device_folio_init(struct folio *folio, struct dev_pagemap *pgmap, unsigned int order) { zone_device_page_init(&folio->page, pgmap, order); if (order) folio_set_large_rmappable(folio); } static inline void zone_device_private_split_cb(struct folio *original_folio, struct folio *new_folio) { if (folio_is_device_private(original_folio)) { if (!original_folio->pgmap->ops->folio_split) { if (new_folio) { new_folio->pgmap = original_folio->pgmap; new_folio->page.mapping = original_folio->page.mapping; } } else { original_folio->pgmap->ops->folio_split(original_folio, new_folio); 
} } } #else static inline void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { /* * Fail attempts to call devm_memremap_pages() without * ZONE_DEVICE support enabled, this requires callers to fall * back to plain devm_memremap() based on config */ WARN_ON_ONCE(1); return ERR_PTR(-ENXIO); } static inline void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap) { } static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn) { return NULL; } static inline bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn) { return false; } /* when memremap_pages() is disabled all archs can remap a single page */ static inline unsigned long memremap_compat_align(void) { return PAGE_SIZE; } static inline void zone_device_private_split_cb(struct folio *original_folio, struct folio *new_folio) { } #endif /* CONFIG_ZONE_DEVICE */ static inline void put_dev_pagemap(struct dev_pagemap *pgmap) { if (pgmap) percpu_ref_put(&pgmap->ref); } #endif /* _LINUX_MEMREMAP_H_ */ |
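/*
 * Editor's note: a minimal sketch, not part of memremap.h above, of how a
 * driver might register device-private memory with memremap_pages(). The
 * physical range, the function names and the (absent) teardown handling are
 * hypothetical.
 */
static void example_folio_free(struct folio *folio)
{
        /* hand the backing device page back to the driver's allocator */
}

static vm_fault_t example_migrate_to_ram(struct vm_fault *vmf)
{
        /* migrate the faulting device-private page back to system RAM */
        return VM_FAULT_SIGBUS;         /* placeholder only */
}

static const struct dev_pagemap_ops example_pgmap_ops = {
        .folio_free     = example_folio_free,
        .migrate_to_ram = example_migrate_to_ram,
};

static struct dev_pagemap example_pgmap;

static int example_register(phys_addr_t base, resource_size_t size)
{
        void *vaddr;

        example_pgmap.type = MEMORY_DEVICE_PRIVATE;
        example_pgmap.ops = &example_pgmap_ops;
        example_pgmap.owner = &example_pgmap;   /* opaque owner cookie */
        example_pgmap.nr_range = 1;
        example_pgmap.range.start = base;
        example_pgmap.range.end = base + size - 1;

        /* allocates struct pages for the range; undo with memunmap_pages() */
        vaddr = memremap_pages(&example_pgmap, NUMA_NO_NODE);
        if (IS_ERR(vaddr))
                return PTR_ERR(vaddr);
        return 0;
}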
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE * Copyright (C) 2016 - 2020 Christoph Hellwig */ #include <linux/init.h> #include <linux/mm.h> #include <linux/blkdev.h> #include <linux/blk-integrity.h> #include <linux/buffer_head.h> #include <linux/mpage.h> #include <linux/uio.h> #include <linux/namei.h> #include <linux/task_io_accounting_ops.h> #include <linux/falloc.h> #include <linux/suspend.h> #include <linux/fs.h> #include <linux/iomap.h> #include <linux/module.h> #include <linux/io_uring/cmd.h> #include "blk.h" static inline struct inode *bdev_file_inode(struct file *file) { return file->f_mapping->host; } static blk_opf_t dio_bio_write_op(struct kiocb *iocb) { blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; /* avoid the need for an I/O completion work item */ if (iocb_is_dsync(iocb)) opf |= REQ_FUA; return opf; } static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb, struct iov_iter *iter) { return (iocb->ki_pos | iov_iter_count(iter)) & (bdev_logical_block_size(bdev) - 1); } static inline int blkdev_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, struct block_device *bdev) { return bio_iov_iter_get_pages(bio, iter, bdev_logical_block_size(bdev) - 1); } #define DIO_INLINE_BIO_VECS 4 static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, struct block_device *bdev, unsigned int nr_pages) { struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs; loff_t pos = iocb->ki_pos; bool should_dirty = false; struct bio bio; ssize_t ret; if (nr_pages <= DIO_INLINE_BIO_VECS) vecs = inline_vecs; else { vecs = kmalloc_objs(struct bio_vec, nr_pages); if (!vecs) return -ENOMEM; } if (iov_iter_rw(iter) == READ) { bio_init(&bio, bdev, vecs, nr_pages, REQ_OP_READ); if (user_backed_iter(iter)) should_dirty = true; } else { bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb)); } bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; bio.bi_write_stream = iocb->ki_write_stream; bio.bi_ioprio = iocb->ki_ioprio; if (iocb->ki_flags & IOCB_ATOMIC) bio.bi_opf |= REQ_ATOMIC; ret = blkdev_iov_iter_get_pages(&bio, iter, bdev); if (unlikely(ret)) goto out; ret = bio.bi_iter.bi_size; if (iov_iter_rw(iter) == WRITE) task_io_account_write(ret); if (iocb->ki_flags & IOCB_NOWAIT) bio.bi_opf |= REQ_NOWAIT; submit_bio_wait(&bio); bio_release_pages(&bio, should_dirty); if (unlikely(bio.bi_status)) ret = blk_status_to_errno(bio.bi_status); out: if (vecs != inline_vecs) kfree(vecs); bio_uninit(&bio); return ret; } enum { DIO_SHOULD_DIRTY = 1, DIO_IS_SYNC = 2, }; struct blkdev_dio { union { struct kiocb *iocb; struct task_struct *waiter; }; size_t size; atomic_t ref; unsigned int flags; struct bio bio ____cacheline_aligned_in_smp; }; static struct bio_set blkdev_dio_pool; static void blkdev_bio_end_io(struct bio *bio) { struct blkdev_dio *dio = bio->bi_private; bool should_dirty = dio->flags & DIO_SHOULD_DIRTY; bool is_sync = dio->flags & DIO_IS_SYNC; if (bio->bi_status && !dio->bio.bi_status) dio->bio.bi_status = bio->bi_status; if (bio_integrity(bio)) bio_integrity_unmap_user(bio); if
(atomic_dec_and_test(&dio->ref)) { if (!is_sync) { struct kiocb *iocb = dio->iocb; ssize_t ret; WRITE_ONCE(iocb->private, NULL); if (likely(!dio->bio.bi_status)) { ret = dio->size; iocb->ki_pos += ret; } else { ret = blk_status_to_errno(dio->bio.bi_status); } dio->iocb->ki_complete(iocb, ret); bio_put(&dio->bio); } else { struct task_struct *waiter = dio->waiter; WRITE_ONCE(dio->waiter, NULL); blk_wake_io_task(waiter); } } if (should_dirty) { bio_check_pages_dirty(bio); } else { bio_release_pages(bio, false); bio_put(bio); } } static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, struct block_device *bdev, unsigned int nr_pages) { struct blk_plug plug; struct blkdev_dio *dio; struct bio *bio; bool is_read = (iov_iter_rw(iter) == READ), is_sync; blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); loff_t pos = iocb->ki_pos; int ret = 0; bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); atomic_set(&dio->ref, 1); /* * Grab an extra reference to ensure the dio structure which is embedded * into the first bio stays around. */ bio_get(bio); is_sync = is_sync_kiocb(iocb); if (is_sync) { dio->flags = DIO_IS_SYNC; dio->waiter = current; } else { dio->flags = 0; dio->iocb = iocb; } dio->size = 0; if (is_read && user_backed_iter(iter)) dio->flags |= DIO_SHOULD_DIRTY; blk_start_plug(&plug); for (;;) { bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; bio->bi_write_stream = iocb->ki_write_stream; bio->bi_private = dio; bio->bi_end_io = blkdev_bio_end_io; bio->bi_ioprio = iocb->ki_ioprio; ret = blkdev_iov_iter_get_pages(bio, iter, bdev); if (unlikely(ret)) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); break; } if (iocb->ki_flags & IOCB_NOWAIT) { /* * This is nonblocking IO, and we need to allocate * another bio if we have data left to map. As we * cannot guarantee that one of the sub bios will not * fail getting issued FOR NOWAIT and as error results * are coalesced across all of them, be safe and ask for * a retry of this from blocking context. 
*/ if (unlikely(iov_iter_count(iter))) { ret = -EAGAIN; goto fail; } bio->bi_opf |= REQ_NOWAIT; } if (iocb->ki_flags & IOCB_HAS_METADATA) { ret = bio_integrity_map_iter(bio, iocb->private); if (unlikely(ret)) goto fail; } if (is_read) { if (dio->flags & DIO_SHOULD_DIRTY) bio_set_pages_dirty(bio); } else { task_io_account_write(bio->bi_iter.bi_size); } dio->size += bio->bi_iter.bi_size; pos += bio->bi_iter.bi_size; nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS); if (!nr_pages) { submit_bio(bio); break; } atomic_inc(&dio->ref); submit_bio(bio); bio = bio_alloc(bdev, nr_pages, opf, GFP_KERNEL); } blk_finish_plug(&plug); if (!is_sync) return -EIOCBQUEUED; for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (!READ_ONCE(dio->waiter)) break; blk_io_schedule(); } __set_current_state(TASK_RUNNING); if (!ret) ret = blk_status_to_errno(dio->bio.bi_status); if (likely(!ret)) ret = dio->size; bio_put(&dio->bio); return ret; fail: bio_release_pages(bio, false); bio_clear_flag(bio, BIO_REFFED); bio_put(bio); blk_finish_plug(&plug); return ret; } static void blkdev_bio_end_io_async(struct bio *bio) { struct blkdev_dio *dio = container_of(bio, struct blkdev_dio, bio); struct kiocb *iocb = dio->iocb; ssize_t ret; WRITE_ONCE(iocb->private, NULL); if (likely(!bio->bi_status)) { ret = dio->size; iocb->ki_pos += ret; } else { ret = blk_status_to_errno(bio->bi_status); } if (bio_integrity(bio)) bio_integrity_unmap_user(bio); iocb->ki_complete(iocb, ret); if (dio->flags & DIO_SHOULD_DIRTY) { bio_check_pages_dirty(bio); } else { bio_release_pages(bio, false); bio_put(bio); } } static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, struct iov_iter *iter, struct block_device *bdev, unsigned int nr_pages) { bool is_read = iov_iter_rw(iter) == READ; blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); struct blkdev_dio *dio; struct bio *bio; loff_t pos = iocb->ki_pos; int ret = 0; bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); dio->flags = 0; dio->iocb = iocb; bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; bio->bi_write_stream = iocb->ki_write_stream; bio->bi_end_io = blkdev_bio_end_io_async; bio->bi_ioprio = iocb->ki_ioprio; if (iov_iter_is_bvec(iter)) { /* * Users don't rely on the iterator being in any particular * state for async I/O returning -EIOCBQUEUED, hence we can * avoid expensive iov_iter_advance(). Bypass * bio_iov_iter_get_pages() and set the bvec directly. 
*/ bio_iov_bvec_set(bio, iter); } else { ret = blkdev_iov_iter_get_pages(bio, iter, bdev); if (unlikely(ret)) goto out_bio_put; } dio->size = bio->bi_iter.bi_size; if (is_read) { if (user_backed_iter(iter)) { dio->flags |= DIO_SHOULD_DIRTY; bio_set_pages_dirty(bio); } } else { task_io_account_write(bio->bi_iter.bi_size); } if (iocb->ki_flags & IOCB_HAS_METADATA) { ret = bio_integrity_map_iter(bio, iocb->private); WRITE_ONCE(iocb->private, NULL); if (unlikely(ret)) goto out_bio_put; } if (iocb->ki_flags & IOCB_ATOMIC) bio->bi_opf |= REQ_ATOMIC; if (iocb->ki_flags & IOCB_NOWAIT) bio->bi_opf |= REQ_NOWAIT; if (iocb->ki_flags & IOCB_HIPRI) { bio->bi_opf |= REQ_POLLED; submit_bio(bio); WRITE_ONCE(iocb->private, bio); } else { submit_bio(bio); } return -EIOCBQUEUED; out_bio_put: bio_put(bio); return ret; } static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); unsigned int nr_pages; if (!iov_iter_count(iter)) return 0; if (blkdev_dio_invalid(bdev, iocb, iter)) return -EINVAL; if (iov_iter_rw(iter) == WRITE) { u16 max_write_streams = bdev_max_write_streams(bdev); if (iocb->ki_write_stream) { if (iocb->ki_write_stream > max_write_streams) return -EINVAL; } else if (max_write_streams) { enum rw_hint write_hint = file_inode(iocb->ki_filp)->i_write_hint; /* * Just use the write hint as write stream for block * device writes. This assumes no file system is * mounted that would use the streams differently. */ if (write_hint <= max_write_streams) iocb->ki_write_stream = write_hint; } } nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); if (likely(nr_pages <= BIO_MAX_VECS && !(iocb->ki_flags & IOCB_HAS_METADATA))) { if (is_sync_kiocb(iocb)) return __blkdev_direct_IO_simple(iocb, iter, bdev, nr_pages); return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages); } else if (iocb->ki_flags & IOCB_ATOMIC) { return -EINVAL; } return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages)); } static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { struct block_device *bdev = I_BDEV(inode); loff_t isize = i_size_read(inode); if (offset >= isize) return -EIO; iomap->bdev = bdev; iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev)); iomap->type = IOMAP_MAPPED; iomap->addr = iomap->offset; iomap->length = isize - iomap->offset; iomap->flags |= IOMAP_F_BUFFER_HEAD; /* noop for !CONFIG_BUFFER_HEAD */ return 0; } static const struct iomap_ops blkdev_iomap_ops = { .iomap_begin = blkdev_iomap_begin, }; #ifdef CONFIG_BUFFER_HEAD static int blkdev_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { bh->b_bdev = I_BDEV(inode); bh->b_blocknr = iblock; set_buffer_mapped(bh); return 0; } /* * We cannot call mpage_writepages() as it does not take the buffer lock. * We must use block_write_full_folio() directly which holds the buffer * lock. The buffer lock provides the synchronisation with writeback * that filesystems rely on when they use the blockdev's mapping. 
*/ static int blkdev_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct folio *folio = NULL; struct blk_plug plug; int err; blk_start_plug(&plug); while ((folio = writeback_iter(mapping, wbc, folio, &err))) err = block_write_full_folio(folio, wbc, blkdev_get_block); blk_finish_plug(&plug); return err; } static int blkdev_read_folio(struct file *file, struct folio *folio) { return block_read_full_folio(folio, blkdev_get_block); } static void blkdev_readahead(struct readahead_control *rac) { mpage_readahead(rac, blkdev_get_block); } static int blkdev_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { return block_write_begin(mapping, pos, len, foliop, blkdev_get_block); } static int blkdev_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { int ret; ret = block_write_end(pos, len, copied, folio); folio_unlock(folio); folio_put(folio); return ret; } const struct address_space_operations def_blk_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = blkdev_read_folio, .readahead = blkdev_readahead, .writepages = blkdev_writepages, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, .migrate_folio = buffer_migrate_folio_norefs, .is_dirty_writeback = buffer_check_dirty_writeback, }; #else /* CONFIG_BUFFER_HEAD */ static int blkdev_read_folio(struct file *file, struct folio *folio) { iomap_bio_read_folio(folio, &blkdev_iomap_ops); return 0; } static void blkdev_readahead(struct readahead_control *rac) { iomap_bio_readahead(rac, &blkdev_iomap_ops); } static ssize_t blkdev_writeback_range(struct iomap_writepage_ctx *wpc, struct folio *folio, u64 offset, unsigned int len, u64 end_pos) { loff_t isize = i_size_read(wpc->inode); if (WARN_ON_ONCE(offset >= isize)) return -EIO; if (offset < wpc->iomap.offset || offset >= wpc->iomap.offset + wpc->iomap.length) { int error; error = blkdev_iomap_begin(wpc->inode, offset, isize - offset, IOMAP_WRITE, &wpc->iomap, NULL); if (error) return error; } return iomap_add_to_ioend(wpc, folio, offset, end_pos, len); } static const struct iomap_writeback_ops blkdev_writeback_ops = { .writeback_range = blkdev_writeback_range, .writeback_submit = iomap_ioend_writeback_submit, }; static int blkdev_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct iomap_writepage_ctx wpc = { .inode = mapping->host, .wbc = wbc, .ops = &blkdev_writeback_ops }; return iomap_writepages(&wpc); } const struct address_space_operations def_blk_aops = { .dirty_folio = filemap_dirty_folio, .release_folio = iomap_release_folio, .invalidate_folio = iomap_invalidate_folio, .read_folio = blkdev_read_folio, .readahead = blkdev_readahead, .writepages = blkdev_writepages, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, .migrate_folio = filemap_migrate_folio, }; #endif /* CONFIG_BUFFER_HEAD */ /* * for a block special file file_inode(file)->i_size is zero * so we compute the size by hand (just as in block_read/write above) */ static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence) { struct inode *bd_inode = bdev_file_inode(file); loff_t retval; inode_lock(bd_inode); retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode)); inode_unlock(bd_inode); return retval; } static int blkdev_fsync(struct file *filp, loff_t start, 
loff_t end, int datasync) { struct block_device *bdev = I_BDEV(filp->f_mapping->host); int error; error = file_write_and_wait_range(filp, start, end); if (error) return error; /* * There is no need to serialise calls to blkdev_issue_flush with * i_mutex and doing so causes performance issues with concurrent * O_SYNC writers to a block device. */ error = blkdev_issue_flush(bdev); if (error == -EOPNOTSUPP) error = 0; return error; } /** * file_to_blk_mode - get block open flags from file flags * @file: file whose open flags should be converted * * Look at file open flags and generate corresponding block open flags from * them. The function works both for file just being open (e.g. during ->open * callback) and for file that is already open. This is actually non-trivial * (see comment in the function). */ blk_mode_t file_to_blk_mode(struct file *file) { blk_mode_t mode = 0; if (file->f_mode & FMODE_READ) mode |= BLK_OPEN_READ; if (file->f_mode & FMODE_WRITE) mode |= BLK_OPEN_WRITE; /* * do_dentry_open() clears O_EXCL from f_flags, use file->private_data * to determine whether the open was exclusive for already open files. */ if (file->private_data) mode |= BLK_OPEN_EXCL; else if (file->f_flags & O_EXCL) mode |= BLK_OPEN_EXCL; if (file->f_flags & O_NDELAY) mode |= BLK_OPEN_NDELAY; /* * If all bits in O_ACCMODE set (aka O_RDWR | O_WRONLY), the floppy * driver has historically allowed ioctls as if the file was opened for * writing, but does not allow and actual reads or writes. */ if ((file->f_flags & O_ACCMODE) == (O_RDWR | O_WRONLY)) mode |= BLK_OPEN_WRITE_IOCTL; return mode; } static int blkdev_open(struct inode *inode, struct file *filp) { struct block_device *bdev; blk_mode_t mode; int ret; mode = file_to_blk_mode(filp); /* Use the file as the holder. */ if (mode & BLK_OPEN_EXCL) filp->private_data = filp; ret = bdev_permission(inode->i_rdev, mode, filp->private_data); if (ret) return ret; bdev = blkdev_get_no_open(inode->i_rdev, true); if (!bdev) return -ENXIO; if (bdev_can_atomic_write(bdev)) filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; if (blk_get_integrity(bdev->bd_disk)) filp->f_mode |= FMODE_HAS_METADATA; ret = bdev_open(bdev, mode, filp->private_data, NULL, filp); if (ret) blkdev_put_no_open(bdev); return ret; } static int blkdev_release(struct inode *inode, struct file *filp) { bdev_release(filp); return 0; } static ssize_t blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from) { size_t count = iov_iter_count(from); ssize_t written; written = kiocb_invalidate_pages(iocb, count); if (written) { if (written == -EBUSY) return 0; return written; } written = blkdev_direct_IO(iocb, from); if (written > 0) { kiocb_invalidate_post_direct_write(iocb, count); iocb->ki_pos += written; count -= written; } if (written != -EIOCBQUEUED) iov_iter_revert(from, count - iov_iter_count(from)); return written; } static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from) { return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops, NULL, NULL); } /* * Write data to the block device. Only intended for the block device itself * and the raw driver which basically is a fake block device. * * Does not take i_mutex for the write and thus is not for general purpose * use. 
*/ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *bd_inode = bdev_file_inode(file); struct block_device *bdev = I_BDEV(bd_inode); bool atomic = iocb->ki_flags & IOCB_ATOMIC; loff_t size = bdev_nr_bytes(bdev); size_t shorted = 0; ssize_t ret; if (bdev_read_only(bdev)) return -EPERM; if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev)) return -ETXTBSY; if (!iov_iter_count(from)) return 0; if (iocb->ki_pos >= size) return -ENOSPC; if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) return -EOPNOTSUPP; if (atomic) { ret = generic_atomic_write_valid(iocb, from); if (ret) return ret; } size -= iocb->ki_pos; if (iov_iter_count(from) > size) { if (atomic) return -EINVAL; shorted = iov_iter_count(from) - size; iov_iter_truncate(from, size); } ret = file_update_time(file); if (ret) return ret; if (iocb->ki_flags & IOCB_DIRECT) { ret = blkdev_direct_write(iocb, from); if (ret >= 0 && iov_iter_count(from)) ret = direct_write_fallback(iocb, from, ret, blkdev_buffered_write(iocb, from)); } else { /* * Take i_rwsem and invalidate_lock to avoid racing with * set_blocksize changing i_blkbits/folio order and punching * out the pagecache. */ inode_lock_shared(bd_inode); ret = blkdev_buffered_write(iocb, from); inode_unlock_shared(bd_inode); } if (ret > 0) ret = generic_write_sync(iocb, ret); iov_iter_reexpand(from, iov_iter_count(from) + shorted); return ret; } static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *bd_inode = bdev_file_inode(iocb->ki_filp); struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); loff_t size = bdev_nr_bytes(bdev); loff_t pos = iocb->ki_pos; size_t shorted = 0; ssize_t ret = 0; size_t count; if (unlikely(pos + iov_iter_count(to) > size)) { if (pos >= size) return 0; size -= pos; shorted = iov_iter_count(to) - size; iov_iter_truncate(to, size); } count = iov_iter_count(to); if (!count) goto reexpand; /* skip atime */ if (iocb->ki_flags & IOCB_DIRECT) { ret = kiocb_write_and_wait(iocb, count); if (ret < 0) goto reexpand; file_accessed(iocb->ki_filp); ret = blkdev_direct_IO(iocb, to); if (ret > 0) { iocb->ki_pos += ret; count -= ret; } if (ret != -EIOCBQUEUED) iov_iter_revert(to, count - iov_iter_count(to)); if (ret < 0 || !count) goto reexpand; } /* * Take i_rwsem and invalidate_lock to avoid racing with set_blocksize * changing i_blkbits/folio order and punching out the pagecache. */ inode_lock_shared(bd_inode); ret = filemap_read(iocb, to, ret); inode_unlock_shared(bd_inode); reexpand: if (unlikely(shorted)) iov_iter_reexpand(to, iov_iter_count(to) + shorted); return ret; } #define BLKDEV_FALLOC_FL_SUPPORTED \ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ FALLOC_FL_ZERO_RANGE | FALLOC_FL_WRITE_ZEROES) static long blkdev_fallocate(struct file *file, int mode, loff_t start, loff_t len) { struct inode *inode = bdev_file_inode(file); struct block_device *bdev = I_BDEV(inode); loff_t end = start + len - 1; loff_t isize; unsigned int flags; int error; /* Fail if we don't recognize the flags. */ if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED) return -EOPNOTSUPP; /* * Don't allow writing zeroes if the device does not enable the * unmap write zeroes operation. */ if ((mode & FALLOC_FL_WRITE_ZEROES) && !bdev_write_zeroes_unmap_sectors(bdev)) return -EOPNOTSUPP; /* Don't go off the end of the device. 
*/ isize = bdev_nr_bytes(bdev); if (start >= isize) return -EINVAL; if (end >= isize) { if (mode & FALLOC_FL_KEEP_SIZE) { len = isize - start; end = start + len - 1; } else return -EINVAL; } /* * Don't allow IO that isn't aligned to logical block size. */ if ((start | len) & (bdev_logical_block_size(bdev) - 1)) return -EINVAL; inode_lock(inode); filemap_invalidate_lock(inode->i_mapping); switch (mode) { case FALLOC_FL_ZERO_RANGE: case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: flags = BLKDEV_ZERO_NOUNMAP; break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: flags = BLKDEV_ZERO_NOFALLBACK; break; case FALLOC_FL_WRITE_ZEROES: flags = 0; break; default: error = -EOPNOTSUPP; goto fail; } /* * Invalidate the page cache, including dirty pages, for valid * de-allocate mode calls to fallocate(). */ error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end); if (error) goto fail; error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT, len >> SECTOR_SHIFT, GFP_KERNEL, flags); fail: filemap_invalidate_unlock(inode->i_mapping); inode_unlock(inode); return error; } static int blkdev_mmap_prepare(struct vm_area_desc *desc) { struct file *file = desc->file; if (bdev_read_only(I_BDEV(bdev_file_inode(file)))) return generic_file_readonly_mmap_prepare(desc); return generic_file_mmap_prepare(desc); } const struct file_operations def_blk_fops = { .open = blkdev_open, .release = blkdev_release, .llseek = blkdev_llseek, .read_iter = blkdev_read_iter, .write_iter = blkdev_write_iter, .iopoll = iocb_bio_iopoll, .mmap_prepare = blkdev_mmap_prepare, .fsync = blkdev_fsync, .unlocked_ioctl = blkdev_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_blkdev_ioctl, #endif .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .fallocate = blkdev_fallocate, .uring_cmd = blkdev_uring_cmd, .fop_flags = FOP_BUFFER_RASYNC, }; static __init int blkdev_init(void) { return bioset_init(&blkdev_dio_pool, 4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE); } module_init(blkdev_init); |
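/*
 * Editor's note: a small userspace sketch, not part of the file above,
 * illustrating the alignment rule that blkdev_dio_invalid() enforces for
 * O_DIRECT block-device I/O: the file offset and transfer length must be
 * multiples of the device's logical block size (and the buffer should be
 * similarly aligned). The device path and the 4096-byte block size are
 * hypothetical.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

int example_read_first_block(void)
{
        int fd = open("/dev/sdX", O_RDONLY | O_DIRECT);
        void *buf;
        ssize_t n = -1;

        if (fd < 0)
                return -1;
        /* offset 0, length 4096 and a 4096-aligned buffer satisfy the check */
        if (posix_memalign(&buf, 4096, 4096) == 0) {
                n = pread(fd, buf, 4096, 0);
                free(buf);
        }
        close(fd);
        return n == 4096 ? 0 : -1;
}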
// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/fs/locks.c
 *
 * We implement four types of file locks: BSD locks, posix locks, open
 * file description locks, and leases. For details about BSD locks,
 * see the flock(2) man page; for details about the other three, see
 * fcntl(2).
 *
 * Locking conflicts and dependencies:
 * If multiple threads attempt to lock the same byte (or flock the same file)
 * only one can be granted the lock, and the others must wait their turn.
 * The first lock has been "applied" or "granted", the others are "waiting"
 * and are "blocked" by the "applied" lock.
 *
 * Waiting and applied locks are all kept in trees whose properties are:
 *
 *	- the root of a tree may be an applied or waiting lock.
 *	- every other node in the tree is a waiting lock that
 *	  conflicts with every ancestor of that node.
 *
 * Every such tree begins life as a waiting singleton which obviously
 * satisfies the above properties.
 *
 * The only ways we modify trees preserve these properties:
 *
 *	1. We may add a new leaf node, but only after first verifying that it
 *	   conflicts with all of its ancestors.
 *	2. We may remove the root of a tree, creating a new singleton
 *	   tree from the root and N new trees rooted in the immediate
 *	   children.
 *	3. If the root of a tree is not currently an applied lock, we may
 *	   apply it (if possible).
 *	4. We may upgrade the root of the tree (either extend its range,
 *	   or upgrade its entire range from read to write).
 *
 * When an applied lock is modified in a way that reduces or downgrades any
 * part of its range, we remove all its children (2 above). This particularly
 * happens when a lock is unlocked.
 *
 * For each of those child trees we "wake up" the thread which is
 * waiting for the lock so it can continue handling as follows: if the
 * root of the tree applies, we do so (3). If it doesn't, it must
 * conflict with some applied lock. We remove (wake up) all of its children
 * (2), and add it as a new leaf to the tree rooted in the applied
 * lock (1). We then repeat the process recursively with those
 * children.
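 *
 * As a minimal userspace illustration of the "waiting" state described above
 * (a sketch, not part of this file; "fd" is assumed to be an open descriptor
 * on a regular file):
 *
 *	struct flock fl = {
 *		.l_type   = F_WRLCK,
 *		.l_whence = SEEK_SET,
 *		.l_start  = 0,
 *		.l_len    = 1,
 *	};
 *
 *	// In one process, fcntl(fd, F_SETLKW, &fl) is granted ("applied").
 *	// The same call in a second process blocks, and its request becomes
 *	// a "waiting" node in the tree rooted at the applied lock.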
* */ #include <linux/capability.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/filelock.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/security.h> #include <linux/slab.h> #include <linux/syscalls.h> #include <linux/time.h> #include <linux/rcupdate.h> #include <linux/pid_namespace.h> #include <linux/hashtable.h> #include <linux/percpu.h> #include <linux/sysctl.h> #define CREATE_TRACE_POINTS #include <trace/events/filelock.h> #include <linux/uaccess.h> static struct file_lock *file_lock(struct file_lock_core *flc) { return container_of(flc, struct file_lock, c); } static struct file_lease *file_lease(struct file_lock_core *flc) { return container_of(flc, struct file_lease, c); } static bool lease_breaking(struct file_lease *fl) { return fl->c.flc_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING); } static int target_leasetype(struct file_lease *fl) { if (fl->c.flc_flags & FL_UNLOCK_PENDING) return F_UNLCK; if (fl->c.flc_flags & FL_DOWNGRADE_PENDING) return F_RDLCK; return fl->c.flc_type; } static int leases_enable = 1; static int lease_break_time = 45; #ifdef CONFIG_SYSCTL static const struct ctl_table locks_sysctls[] = { { .procname = "leases-enable", .data = &leases_enable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, #ifdef CONFIG_MMU { .procname = "lease-break-time", .data = &lease_break_time, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, #endif /* CONFIG_MMU */ }; static int __init init_fs_locks_sysctls(void) { register_sysctl_init("fs", locks_sysctls); return 0; } early_initcall(init_fs_locks_sysctls); #endif /* CONFIG_SYSCTL */ /* * The global file_lock_list is only used for displaying /proc/locks, so we * keep a list on each CPU, with each list protected by its own spinlock. * Global serialization is done using file_rwsem. * * Note that alterations to the list also require that the relevant flc_lock is * held. */ struct file_lock_list_struct { spinlock_t lock; struct hlist_head hlist; }; static DEFINE_PER_CPU(struct file_lock_list_struct, file_lock_list); DEFINE_STATIC_PERCPU_RWSEM(file_rwsem); /* * The blocked_hash is used to find POSIX lock loops for deadlock detection. * It is protected by blocked_lock_lock. * * We hash locks by lockowner in order to optimize searching for the lock a * particular lockowner is waiting on. * * FIXME: make this value scale via some heuristic? We generally will want more * buckets when we have more lockowners holding locks, but that's a little * difficult to determine without knowing what the workload will look like. */ #define BLOCKED_HASH_BITS 7 static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS); /* * This lock protects the blocked_hash. Generally, if you're accessing it, you * want to be holding this lock. * * In addition, it also protects the fl->fl_blocked_requests list, and the * fl->fl_blocker pointer for file_lock structures that are acting as lock * requests (in contrast to those that are acting as records of acquired locks). * * Note that when we acquire this lock in order to change the above fields, * we often hold the flc_lock as well. In certain cases, when reading the fields * protected by this lock, we can skip acquiring it iff we already hold the * flc_lock. 
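 *
 * Illustrative nesting when all three are needed, as in posix_lock_inode()
 * and locks_insert_block() below (a sketch of the ordering only):
 *
 *	percpu_down_read(&file_rwsem);
 *	spin_lock(&ctx->flc_lock);
 *	spin_lock(&blocked_lock_lock);
 *	...
 *	spin_unlock(&blocked_lock_lock);
 *	spin_unlock(&ctx->flc_lock);
 *	percpu_up_read(&file_rwsem);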
*/ static DEFINE_SPINLOCK(blocked_lock_lock); static struct kmem_cache *flctx_cache __ro_after_init; static struct kmem_cache *filelock_cache __ro_after_init; static struct kmem_cache *filelease_cache __ro_after_init; static struct file_lock_context * locks_get_lock_context(struct inode *inode, int type) { struct file_lock_context *ctx; ctx = locks_inode_context(inode); if (likely(ctx) || type == F_UNLCK) goto out; ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL); if (!ctx) goto out; spin_lock_init(&ctx->flc_lock); INIT_LIST_HEAD(&ctx->flc_flock); INIT_LIST_HEAD(&ctx->flc_posix); INIT_LIST_HEAD(&ctx->flc_lease); /* * Assign the pointer if it's not already assigned. If it is, then * free the context we just allocated. */ spin_lock(&inode->i_lock); if (!(inode->i_opflags & IOP_FLCTX)) { VFS_BUG_ON_INODE(inode->i_flctx, inode); WRITE_ONCE(inode->i_flctx, ctx); /* * Paired with locks_inode_context(). */ smp_store_release(&inode->i_opflags, inode->i_opflags | IOP_FLCTX); spin_unlock(&inode->i_lock); } else { VFS_BUG_ON_INODE(!inode->i_flctx, inode); spin_unlock(&inode->i_lock); kmem_cache_free(flctx_cache, ctx); ctx = locks_inode_context(inode); } out: trace_locks_get_lock_context(inode, type, ctx); return ctx; } static void locks_dump_ctx_list(struct list_head *list, char *list_type) { struct file_lock_core *flc; list_for_each_entry(flc, list, flc_list) pr_warn("%s: fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n", list_type, flc->flc_owner, flc->flc_flags, flc->flc_type, flc->flc_pid); } static void locks_check_ctx_lists(struct inode *inode) { struct file_lock_context *ctx = inode->i_flctx; if (unlikely(!list_empty(&ctx->flc_flock) || !list_empty(&ctx->flc_posix) || !list_empty(&ctx->flc_lease))) { pr_warn("Leaked locks on dev=0x%x:0x%x ino=0x%llx:\n", MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino); locks_dump_ctx_list(&ctx->flc_flock, "FLOCK"); locks_dump_ctx_list(&ctx->flc_posix, "POSIX"); locks_dump_ctx_list(&ctx->flc_lease, "LEASE"); } } static void locks_check_ctx_file_list(struct file *filp, struct list_head *list, char *list_type) { struct file_lock_core *flc; struct inode *inode = file_inode(filp); list_for_each_entry(flc, list, flc_list) if (flc->flc_file == filp) pr_warn("Leaked %s lock on dev=0x%x:0x%x ino=0x%llx " " fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n", list_type, MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino, flc->flc_owner, flc->flc_flags, flc->flc_type, flc->flc_pid); } void locks_free_lock_context(struct inode *inode) { struct file_lock_context *ctx = locks_inode_context(inode); if (unlikely(ctx)) { locks_check_ctx_lists(inode); kmem_cache_free(flctx_cache, ctx); } } static void locks_init_lock_heads(struct file_lock_core *flc) { INIT_HLIST_NODE(&flc->flc_link); INIT_LIST_HEAD(&flc->flc_list); INIT_LIST_HEAD(&flc->flc_blocked_requests); INIT_LIST_HEAD(&flc->flc_blocked_member); init_waitqueue_head(&flc->flc_wait); } /* Allocate an empty lock structure. */ struct file_lock *locks_alloc_lock(void) { struct file_lock *fl = kmem_cache_zalloc(filelock_cache, GFP_KERNEL); if (fl) locks_init_lock_heads(&fl->c); return fl; } EXPORT_SYMBOL_GPL(locks_alloc_lock); /* Allocate an empty lock structure. 
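 *
 * Illustrative in-kernel usage (this is essentially what lease_alloc()
 * below does):
 *
 *	struct file_lease *fl = locks_alloc_lease();
 *
 *	if (!fl)
 *		return ERR_PTR(-ENOMEM);
 *	...
 *	locks_free_lease(fl);	// on error, or once the lease is torn down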
*/ struct file_lease *locks_alloc_lease(void) { struct file_lease *fl = kmem_cache_zalloc(filelease_cache, GFP_KERNEL); if (fl) locks_init_lock_heads(&fl->c); return fl; } EXPORT_SYMBOL_GPL(locks_alloc_lease); void locks_release_private(struct file_lock *fl) { struct file_lock_core *flc = &fl->c; BUG_ON(waitqueue_active(&flc->flc_wait)); BUG_ON(!list_empty(&flc->flc_list)); BUG_ON(!list_empty(&flc->flc_blocked_requests)); BUG_ON(!list_empty(&flc->flc_blocked_member)); BUG_ON(!hlist_unhashed(&flc->flc_link)); if (fl->fl_ops) { if (fl->fl_ops->fl_release_private) fl->fl_ops->fl_release_private(fl); fl->fl_ops = NULL; } if (fl->fl_lmops) { if (fl->fl_lmops->lm_put_owner) { fl->fl_lmops->lm_put_owner(flc->flc_owner); flc->flc_owner = NULL; } fl->fl_lmops = NULL; } } EXPORT_SYMBOL_GPL(locks_release_private); /** * locks_owner_has_blockers - Check for blocking lock requests * @flctx: file lock context * @owner: lock owner * * Return values: * %true: @owner has at least one blocker * %false: @owner has no blockers */ bool locks_owner_has_blockers(struct file_lock_context *flctx, fl_owner_t owner) { struct file_lock_core *flc; spin_lock(&flctx->flc_lock); list_for_each_entry(flc, &flctx->flc_posix, flc_list) { if (flc->flc_owner != owner) continue; if (!list_empty(&flc->flc_blocked_requests)) { spin_unlock(&flctx->flc_lock); return true; } } spin_unlock(&flctx->flc_lock); return false; } EXPORT_SYMBOL_GPL(locks_owner_has_blockers); /* Free a lock which is not in use. */ void locks_free_lock(struct file_lock *fl) { locks_release_private(fl); kmem_cache_free(filelock_cache, fl); } EXPORT_SYMBOL(locks_free_lock); /* Free a lease which is not in use. */ void locks_free_lease(struct file_lease *fl) { kmem_cache_free(filelease_cache, fl); } EXPORT_SYMBOL(locks_free_lease); static void locks_dispose_list(struct list_head *dispose) { struct file_lock_core *flc; while (!list_empty(dispose)) { flc = list_first_entry(dispose, struct file_lock_core, flc_list); list_del_init(&flc->flc_list); locks_free_lock(file_lock(flc)); } } static void lease_dispose_list(struct list_head *dispose) { struct file_lock_core *flc; while (!list_empty(dispose)) { flc = list_first_entry(dispose, struct file_lock_core, flc_list); list_del_init(&flc->flc_list); locks_free_lease(file_lease(flc)); } } void locks_init_lock(struct file_lock *fl) { memset(fl, 0, sizeof(struct file_lock)); locks_init_lock_heads(&fl->c); } EXPORT_SYMBOL(locks_init_lock); void locks_init_lease(struct file_lease *fl) { memset(fl, 0, sizeof(*fl)); locks_init_lock_heads(&fl->c); } EXPORT_SYMBOL(locks_init_lease); /* * Initialize a new lock from an existing file_lock structure. 
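 *
 * Illustrative use (this is what posix_test_lock() does when reporting the
 * conflicting lock back for F_GETLK); note that the copy gets flc_file
 * cleared and, via lm_get_owner, its own reference on the owner:
 *
 *	locks_copy_conflock(fl, cfl);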
*/ void locks_copy_conflock(struct file_lock *new, struct file_lock *fl) { new->c.flc_owner = fl->c.flc_owner; new->c.flc_pid = fl->c.flc_pid; new->c.flc_file = NULL; new->c.flc_flags = fl->c.flc_flags; new->c.flc_type = fl->c.flc_type; new->fl_start = fl->fl_start; new->fl_end = fl->fl_end; new->fl_lmops = fl->fl_lmops; new->fl_ops = NULL; if (fl->fl_lmops) { if (fl->fl_lmops->lm_get_owner) fl->fl_lmops->lm_get_owner(fl->c.flc_owner); } } EXPORT_SYMBOL(locks_copy_conflock); void locks_copy_lock(struct file_lock *new, struct file_lock *fl) { /* "new" must be a freshly-initialized lock */ WARN_ON_ONCE(new->fl_ops); locks_copy_conflock(new, fl); new->c.flc_file = fl->c.flc_file; new->fl_ops = fl->fl_ops; if (fl->fl_ops) { if (fl->fl_ops->fl_copy_lock) fl->fl_ops->fl_copy_lock(new, fl); } } EXPORT_SYMBOL(locks_copy_lock); static void locks_move_blocks(struct file_lock *new, struct file_lock *fl) { struct file_lock *f; /* * As ctx->flc_lock is held, new requests cannot be added to * ->flc_blocked_requests, so we don't need a lock to check if it * is empty. */ if (list_empty(&fl->c.flc_blocked_requests)) return; spin_lock(&blocked_lock_lock); list_splice_init(&fl->c.flc_blocked_requests, &new->c.flc_blocked_requests); list_for_each_entry(f, &new->c.flc_blocked_requests, c.flc_blocked_member) f->c.flc_blocker = &new->c; spin_unlock(&blocked_lock_lock); } static inline int flock_translate_cmd(int cmd) { switch (cmd) { case LOCK_SH: return F_RDLCK; case LOCK_EX: return F_WRLCK; case LOCK_UN: return F_UNLCK; } return -EINVAL; } /* Fill in a file_lock structure with an appropriate FLOCK lock. */ static void flock_make_lock(struct file *filp, struct file_lock *fl, int type) { locks_init_lock(fl); fl->c.flc_file = filp; fl->c.flc_owner = filp; fl->c.flc_pid = current->tgid; fl->c.flc_flags = FL_FLOCK; fl->c.flc_type = type; fl->fl_end = OFFSET_MAX; } static int assign_type(struct file_lock_core *flc, int type) { switch (type) { case F_RDLCK: case F_WRLCK: case F_UNLCK: flc->flc_type = type; break; default: return -EINVAL; } return 0; } static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, struct flock64 *l) { switch (l->l_whence) { case SEEK_SET: fl->fl_start = 0; break; case SEEK_CUR: fl->fl_start = filp->f_pos; break; case SEEK_END: fl->fl_start = i_size_read(file_inode(filp)); break; default: return -EINVAL; } if (l->l_start > OFFSET_MAX - fl->fl_start) return -EOVERFLOW; fl->fl_start += l->l_start; if (fl->fl_start < 0) return -EINVAL; /* POSIX-1996 leaves the case l->l_len < 0 undefined; POSIX-2001 defines it. */ if (l->l_len > 0) { if (l->l_len - 1 > OFFSET_MAX - fl->fl_start) return -EOVERFLOW; fl->fl_end = fl->fl_start + (l->l_len - 1); } else if (l->l_len < 0) { if (fl->fl_start + l->l_len < 0) return -EINVAL; fl->fl_end = fl->fl_start - 1; fl->fl_start += l->l_len; } else fl->fl_end = OFFSET_MAX; fl->c.flc_owner = current->files; fl->c.flc_pid = current->tgid; fl->c.flc_file = filp; fl->c.flc_flags = FL_POSIX; fl->fl_ops = NULL; fl->fl_lmops = NULL; return assign_type(&fl->c, l->l_type); } /* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX * style lock. 
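 *
 * Illustrative mappings (assuming l_whence == SEEK_SET):
 *
 *	l_start = 100, l_len = 10   ->  fl_start = 100, fl_end = 109
 *	l_start = 100, l_len = 0    ->  fl_start = 100, fl_end = OFFSET_MAX
 *	l_start = 100, l_len = -10  ->  fl_start = 90,  fl_end = 99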
*/ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl, struct flock *l) { struct flock64 ll = { .l_type = l->l_type, .l_whence = l->l_whence, .l_start = l->l_start, .l_len = l->l_len, }; return flock64_to_posix_lock(filp, fl, &ll); } /* default lease lock manager operations */ static bool lease_break_callback(struct file_lease *fl) { kill_fasync(&fl->fl_fasync, SIGIO, POLL_MSG); return false; } static void lease_setup(struct file_lease *fl, void **priv) { struct file *filp = fl->c.flc_file; struct fasync_struct *fa = *priv; /* * fasync_insert_entry() returns the old entry if any. If there was no * old entry, then it used "priv" and inserted it into the fasync list. * Clear the pointer to indicate that it shouldn't be freed. */ if (!fasync_insert_entry(fa->fa_fd, filp, &fl->fl_fasync, fa)) *priv = NULL; __f_setown(filp, task_pid(current), PIDTYPE_TGID, 0); } /** * lease_open_conflict - see if the given file points to an inode that has * an existing open that would conflict with the * desired lease. * @filp: file to check * @arg: type of lease that we're trying to acquire * * Check to see if there's an existing open fd on this file that would * conflict with the lease we're trying to set. */ static int lease_open_conflict(struct file *filp, const int arg) { struct inode *inode = file_inode(filp); int self_wcount = 0, self_rcount = 0; if (arg == F_RDLCK) return inode_is_open_for_write(inode) ? -EAGAIN : 0; else if (arg != F_WRLCK) return 0; /* * Make sure that only read/write count is from lease requestor. * Note that this will result in denying write leases when i_writecount * is negative, which is what we want. (We shouldn't grant write leases * on files open for execution.) */ if (filp->f_mode & FMODE_WRITE) self_wcount = 1; else if (filp->f_mode & FMODE_READ) self_rcount = 1; if (atomic_read(&inode->i_writecount) != self_wcount || atomic_read(&inode->i_readcount) != self_rcount) return -EAGAIN; return 0; } static const struct lease_manager_operations lease_manager_ops = { .lm_break = lease_break_callback, .lm_change = lease_modify, .lm_setup = lease_setup, .lm_open_conflict = lease_open_conflict, }; /* * Initialize a lease, use the default lock manager operations */ static int lease_init(struct file *filp, unsigned int flags, int type, struct file_lease *fl) { if (assign_type(&fl->c, type) != 0) return -EINVAL; fl->c.flc_owner = filp; fl->c.flc_pid = current->tgid; fl->c.flc_file = filp; fl->c.flc_flags = flags; fl->fl_lmops = &lease_manager_ops; return 0; } /* Allocate a file_lock initialised to this type of lease */ static struct file_lease *lease_alloc(struct file *filp, unsigned int flags, int type) { struct file_lease *fl = locks_alloc_lease(); int error = -ENOMEM; if (fl == NULL) return ERR_PTR(error); error = lease_init(filp, flags, type, fl); if (error) { locks_free_lease(fl); return ERR_PTR(error); } return fl; } /* Check if two locks overlap each other. */ static inline int locks_overlap(struct file_lock *fl1, struct file_lock *fl2) { return ((fl1->fl_end >= fl2->fl_start) && (fl2->fl_end >= fl1->fl_start)); } /* * Check whether two locks have the same owner. */ static int posix_same_owner(struct file_lock_core *fl1, struct file_lock_core *fl2) { return fl1->flc_owner == fl2->flc_owner; } /* Must be called with the flc_lock held! 
*/ static void locks_insert_global_locks(struct file_lock_core *flc) { struct file_lock_list_struct *fll = this_cpu_ptr(&file_lock_list); percpu_rwsem_assert_held(&file_rwsem); spin_lock(&fll->lock); flc->flc_link_cpu = smp_processor_id(); hlist_add_head(&flc->flc_link, &fll->hlist); spin_unlock(&fll->lock); } /* Must be called with the flc_lock held! */ static void locks_delete_global_locks(struct file_lock_core *flc) { struct file_lock_list_struct *fll; percpu_rwsem_assert_held(&file_rwsem); /* * Avoid taking lock if already unhashed. This is safe since this check * is done while holding the flc_lock, and new insertions into the list * also require that it be held. */ if (hlist_unhashed(&flc->flc_link)) return; fll = per_cpu_ptr(&file_lock_list, flc->flc_link_cpu); spin_lock(&fll->lock); hlist_del_init(&flc->flc_link); spin_unlock(&fll->lock); } static unsigned long posix_owner_key(struct file_lock_core *flc) { return (unsigned long) flc->flc_owner; } static void locks_insert_global_blocked(struct file_lock_core *waiter) { lockdep_assert_held(&blocked_lock_lock); hash_add(blocked_hash, &waiter->flc_link, posix_owner_key(waiter)); } static void locks_delete_global_blocked(struct file_lock_core *waiter) { lockdep_assert_held(&blocked_lock_lock); hash_del(&waiter->flc_link); } /* Remove waiter from blocker's block list. * When blocker ends up pointing to itself then the list is empty. * * Must be called with blocked_lock_lock held. */ static void __locks_unlink_block(struct file_lock_core *waiter) { locks_delete_global_blocked(waiter); list_del_init(&waiter->flc_blocked_member); } static void __locks_wake_up_blocks(struct file_lock_core *blocker) { while (!list_empty(&blocker->flc_blocked_requests)) { struct file_lock_core *waiter; struct file_lock *fl; waiter = list_first_entry(&blocker->flc_blocked_requests, struct file_lock_core, flc_blocked_member); fl = file_lock(waiter); __locks_unlink_block(waiter); if ((waiter->flc_flags & (FL_POSIX | FL_FLOCK)) && fl->fl_lmops && fl->fl_lmops->lm_notify) fl->fl_lmops->lm_notify(fl); else locks_wake_up_waiter(waiter); /* * The setting of flc_blocker to NULL marks the "done" * point in deleting a block. Paired with acquire at the top * of locks_delete_block(). */ smp_store_release(&waiter->flc_blocker, NULL); } } static int __locks_delete_block(struct file_lock_core *waiter) { int status = -ENOENT; /* * If fl_blocker is NULL, it won't be set again as this thread "owns" * the lock and is the only one that might try to claim the lock. * * We use acquire/release to manage fl_blocker so that we can * optimize away taking the blocked_lock_lock in many cases. * * The smp_load_acquire guarantees two things: * * 1/ that fl_blocked_requests can be tested locklessly. If something * was recently added to that list it must have been in a locked region * *before* the locked region when fl_blocker was set to NULL. * * 2/ that no other thread is accessing 'waiter', so it is safe to free * it. __locks_wake_up_blocks is careful not to touch waiter after * fl_blocker is released. * * If a lockless check of fl_blocker shows it to be NULL, we know that * no new locks can be inserted into its fl_blocked_requests list, and * can avoid doing anything further if the list is empty. 
*/ if (!smp_load_acquire(&waiter->flc_blocker) && list_empty(&waiter->flc_blocked_requests)) return status; spin_lock(&blocked_lock_lock); if (waiter->flc_blocker) status = 0; __locks_wake_up_blocks(waiter); __locks_unlink_block(waiter); /* * The setting of fl_blocker to NULL marks the "done" point in deleting * a block. Paired with acquire at the top of this function. */ smp_store_release(&waiter->flc_blocker, NULL); spin_unlock(&blocked_lock_lock); return status; } /** * locks_delete_block - stop waiting for a file lock * @waiter: the lock which was waiting * * lockd/nfsd need to disconnect the lock while working on it. */ int locks_delete_block(struct file_lock *waiter) { return __locks_delete_block(&waiter->c); } EXPORT_SYMBOL(locks_delete_block); /* Insert waiter into blocker's block list. * We use a circular list so that processes can be easily woken up in * the order they blocked. The documentation doesn't require this but * it seems like the reasonable thing to do. * * Must be called with both the flc_lock and blocked_lock_lock held. The * fl_blocked_requests list itself is protected by the blocked_lock_lock, * but by ensuring that the flc_lock is also held on insertions we can avoid * taking the blocked_lock_lock in some cases when we see that the * fl_blocked_requests list is empty. * * Rather than just adding to the list, we check for conflicts with any existing * waiters, and add beneath any waiter that blocks the new waiter. * Thus wakeups don't happen until needed. */ static void __locks_insert_block(struct file_lock_core *blocker, struct file_lock_core *waiter, bool conflict(struct file_lock_core *, struct file_lock_core *)) { struct file_lock_core *flc; BUG_ON(!list_empty(&waiter->flc_blocked_member)); new_blocker: list_for_each_entry(flc, &blocker->flc_blocked_requests, flc_blocked_member) if (conflict(flc, waiter)) { blocker = flc; goto new_blocker; } waiter->flc_blocker = blocker; list_add_tail(&waiter->flc_blocked_member, &blocker->flc_blocked_requests); if ((blocker->flc_flags & (FL_POSIX|FL_OFDLCK)) == FL_POSIX) locks_insert_global_blocked(waiter); /* The requests in waiter->flc_blocked are known to conflict with * waiter, but might not conflict with blocker, or the requests * and lock which block it. So they all need to be woken. */ __locks_wake_up_blocks(waiter); } /* Must be called with flc_lock held. */ static void locks_insert_block(struct file_lock_core *blocker, struct file_lock_core *waiter, bool conflict(struct file_lock_core *, struct file_lock_core *)) { spin_lock(&blocked_lock_lock); __locks_insert_block(blocker, waiter, conflict); spin_unlock(&blocked_lock_lock); } /* * Wake up processes blocked waiting for blocker. * * Must be called with the inode->flc_lock held! */ static void locks_wake_up_blocks(struct file_lock_core *blocker) { /* * Avoid taking global lock if list is empty. This is safe since new * blocked requests are only added to the list under the flc_lock, and * the flc_lock is always held here. Note that removal from the * fl_blocked_requests list does not require the flc_lock, so we must * recheck list_empty() after acquiring the blocked_lock_lock. 
*/ if (list_empty(&blocker->flc_blocked_requests)) return; spin_lock(&blocked_lock_lock); __locks_wake_up_blocks(blocker); spin_unlock(&blocked_lock_lock); } static void locks_insert_lock_ctx(struct file_lock_core *fl, struct list_head *before) { list_add_tail(&fl->flc_list, before); locks_insert_global_locks(fl); } static void locks_unlink_lock_ctx(struct file_lock_core *fl) { locks_delete_global_locks(fl); list_del_init(&fl->flc_list); locks_wake_up_blocks(fl); } static void locks_delete_lock_ctx(struct file_lock_core *fl, struct list_head *dispose) { locks_unlink_lock_ctx(fl); if (dispose) list_add(&fl->flc_list, dispose); else locks_free_lock(file_lock(fl)); } /* Determine if lock sys_fl blocks lock caller_fl. Common functionality * checks for shared/exclusive status of overlapping locks. */ static bool locks_conflict(struct file_lock_core *caller_flc, struct file_lock_core *sys_flc) { if (sys_flc->flc_type == F_WRLCK) return true; if (caller_flc->flc_type == F_WRLCK) return true; return false; } /* Determine if lock sys_fl blocks lock caller_fl. POSIX specific * checking before calling the locks_conflict(). */ static bool posix_locks_conflict(struct file_lock_core *caller_flc, struct file_lock_core *sys_flc) { struct file_lock *caller_fl = file_lock(caller_flc); struct file_lock *sys_fl = file_lock(sys_flc); /* POSIX locks owned by the same process do not conflict with * each other. */ if (posix_same_owner(caller_flc, sys_flc)) return false; /* Check whether they overlap */ if (!locks_overlap(caller_fl, sys_fl)) return false; return locks_conflict(caller_flc, sys_flc); } /* Determine if lock sys_fl blocks lock caller_fl. Used on xx_GETLK * path so checks for additional GETLK-specific things like F_UNLCK. */ static bool posix_test_locks_conflict(struct file_lock *caller_fl, struct file_lock *sys_fl) { struct file_lock_core *caller = &caller_fl->c; struct file_lock_core *sys = &sys_fl->c; /* F_UNLCK checks any locks on the same fd. */ if (lock_is_unlock(caller_fl)) { if (!posix_same_owner(caller, sys)) return false; return locks_overlap(caller_fl, sys_fl); } return posix_locks_conflict(caller, sys); } /* Determine if lock sys_fl blocks lock caller_fl. FLOCK specific * checking before calling the locks_conflict(). */ static bool flock_locks_conflict(struct file_lock_core *caller_flc, struct file_lock_core *sys_flc) { /* FLOCK locks referring to the same filp do not conflict with * each other. */ if (caller_flc->flc_file == sys_flc->flc_file) return false; return locks_conflict(caller_flc, sys_flc); } void posix_test_lock(struct file *filp, struct file_lock *fl) { struct file_lock *cfl; struct file_lock_context *ctx; struct inode *inode = file_inode(filp); void *owner; void (*func)(void); ctx = locks_inode_context(inode); if (!ctx || list_empty_careful(&ctx->flc_posix)) { fl->c.flc_type = F_UNLCK; return; } retry: spin_lock(&ctx->flc_lock); list_for_each_entry(cfl, &ctx->flc_posix, c.flc_list) { if (!posix_test_locks_conflict(fl, cfl)) continue; if (cfl->fl_lmops && cfl->fl_lmops->lm_lock_expirable && (*cfl->fl_lmops->lm_lock_expirable)(cfl)) { owner = cfl->fl_lmops->lm_mod_owner; func = cfl->fl_lmops->lm_expire_lock; __module_get(owner); spin_unlock(&ctx->flc_lock); (*func)(); module_put(owner); goto retry; } locks_copy_conflock(fl, cfl); goto out; } fl->c.flc_type = F_UNLCK; out: spin_unlock(&ctx->flc_lock); return; } EXPORT_SYMBOL(posix_test_lock); /* * Deadlock detection: * * We attempt to detect deadlocks that are due purely to posix file * locks. 
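 *
 * A minimal illustration of the kind of cycle this catches (hypothetical
 * userspace, two processes, F_SETLKW on the same file):
 *
 *	process A                        process B
 *	lock byte 0   (granted)          lock byte 1   (granted)
 *	lock byte 1   (blocks on B)      lock byte 0   -> fails with EDEADLK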
* * We assume that a task can be waiting for at most one lock at a time. * So for any acquired lock, the process holding that lock may be * waiting on at most one other lock. That lock in turns may be held by * someone waiting for at most one other lock. Given a requested lock * caller_fl which is about to wait for a conflicting lock block_fl, we * follow this chain of waiters to ensure we are not about to create a * cycle. * * Since we do this before we ever put a process to sleep on a lock, we * are ensured that there is never a cycle; that is what guarantees that * the while() loop in posix_locks_deadlock() eventually completes. * * Note: the above assumption may not be true when handling lock * requests from a broken NFS client. It may also fail in the presence * of tasks (such as posix threads) sharing the same open file table. * To handle those cases, we just bail out after a few iterations. * * For FL_OFDLCK locks, the owner is the filp, not the files_struct. * Because the owner is not even nominally tied to a thread of * execution, the deadlock detection below can't reasonably work well. Just * skip it for those. * * In principle, we could do a more limited deadlock detection on FL_OFDLCK * locks that just checks for the case where two tasks are attempting to * upgrade from read to write locks on the same inode. */ #define MAX_DEADLK_ITERATIONS 10 /* Find a lock that the owner of the given @blocker is blocking on. */ static struct file_lock_core *what_owner_is_waiting_for(struct file_lock_core *blocker) { struct file_lock_core *flc; hash_for_each_possible(blocked_hash, flc, flc_link, posix_owner_key(blocker)) { if (posix_same_owner(flc, blocker)) { while (flc->flc_blocker) flc = flc->flc_blocker; return flc; } } return NULL; } /* Must be called with the blocked_lock_lock held! */ static bool posix_locks_deadlock(struct file_lock *caller_fl, struct file_lock *block_fl) { struct file_lock_core *caller = &caller_fl->c; struct file_lock_core *blocker = &block_fl->c; int i = 0; lockdep_assert_held(&blocked_lock_lock); /* * This deadlock detector can't reasonably detect deadlocks with * FL_OFDLCK locks, since they aren't owned by a process, per-se. */ if (caller->flc_flags & FL_OFDLCK) return false; while ((blocker = what_owner_is_waiting_for(blocker))) { if (i++ > MAX_DEADLK_ITERATIONS) return false; if (posix_same_owner(caller, blocker)) return true; } return false; } /* Try to create a FLOCK lock on filp. We always insert new FLOCK locks * after any leases, but before any posix locks. * * Note that if called with an FL_EXISTS argument, the caller may determine * whether or not a lock was successfully freed by testing the return * value for -ENOENT. */ static int flock_lock_inode(struct inode *inode, struct file_lock *request) { struct file_lock *new_fl = NULL; struct file_lock *fl; struct file_lock_context *ctx; int error = 0; bool found = false; LIST_HEAD(dispose); ctx = locks_get_lock_context(inode, request->c.flc_type); if (!ctx) { if (request->c.flc_type != F_UNLCK) return -ENOMEM; return (request->c.flc_flags & FL_EXISTS) ? 
-ENOENT : 0; } if (!(request->c.flc_flags & FL_ACCESS) && (request->c.flc_type != F_UNLCK)) { new_fl = locks_alloc_lock(); if (!new_fl) return -ENOMEM; } percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); if (request->c.flc_flags & FL_ACCESS) goto find_conflict; list_for_each_entry(fl, &ctx->flc_flock, c.flc_list) { if (request->c.flc_file != fl->c.flc_file) continue; if (request->c.flc_type == fl->c.flc_type) goto out; found = true; locks_delete_lock_ctx(&fl->c, &dispose); break; } if (lock_is_unlock(request)) { if ((request->c.flc_flags & FL_EXISTS) && !found) error = -ENOENT; goto out; } find_conflict: list_for_each_entry(fl, &ctx->flc_flock, c.flc_list) { if (!flock_locks_conflict(&request->c, &fl->c)) continue; error = -EAGAIN; if (!(request->c.flc_flags & FL_SLEEP)) goto out; error = FILE_LOCK_DEFERRED; locks_insert_block(&fl->c, &request->c, flock_locks_conflict); goto out; } if (request->c.flc_flags & FL_ACCESS) goto out; locks_copy_lock(new_fl, request); locks_move_blocks(new_fl, request); locks_insert_lock_ctx(&new_fl->c, &ctx->flc_flock); new_fl = NULL; error = 0; out: spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); if (new_fl) locks_free_lock(new_fl); locks_dispose_list(&dispose); trace_flock_lock_inode(inode, request, error); return error; } static int posix_lock_inode(struct inode *inode, struct file_lock *request, struct file_lock *conflock) { struct file_lock *fl, *tmp; struct file_lock *new_fl = NULL; struct file_lock *new_fl2 = NULL; struct file_lock *left = NULL; struct file_lock *right = NULL; struct file_lock_context *ctx; int error; bool added = false; LIST_HEAD(dispose); void *owner; void (*func)(void); ctx = locks_get_lock_context(inode, request->c.flc_type); if (!ctx) return lock_is_unlock(request) ? 0 : -ENOMEM; /* * We may need two file_lock structures for this operation, * so we get them in advance to avoid races. * * In some cases we can be sure, that no new locks will be needed */ if (!(request->c.flc_flags & FL_ACCESS) && (request->c.flc_type != F_UNLCK || request->fl_start != 0 || request->fl_end != OFFSET_MAX)) { new_fl = locks_alloc_lock(); new_fl2 = locks_alloc_lock(); } retry: percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); /* * New lock request. Walk all POSIX locks and look for conflicts. If * there are any, either return error or put the request on the * blocker's list of waiters and the global blocked_hash. */ if (request->c.flc_type != F_UNLCK) { list_for_each_entry(fl, &ctx->flc_posix, c.flc_list) { if (!posix_locks_conflict(&request->c, &fl->c)) continue; if (fl->fl_lmops && fl->fl_lmops->lm_lock_expirable && (*fl->fl_lmops->lm_lock_expirable)(fl)) { owner = fl->fl_lmops->lm_mod_owner; func = fl->fl_lmops->lm_expire_lock; __module_get(owner); spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); (*func)(); module_put(owner); goto retry; } if (conflock) locks_copy_conflock(conflock, fl); error = -EAGAIN; if (!(request->c.flc_flags & FL_SLEEP)) goto out; /* * Deadlock detection and insertion into the blocked * locks list must be done while holding the same lock! */ error = -EDEADLK; spin_lock(&blocked_lock_lock); /* * Ensure that we don't find any locks blocked on this * request during deadlock detection. */ __locks_wake_up_blocks(&request->c); if (likely(!posix_locks_deadlock(request, fl))) { error = FILE_LOCK_DEFERRED; __locks_insert_block(&fl->c, &request->c, posix_locks_conflict); } spin_unlock(&blocked_lock_lock); goto out; } } /* If we're just looking for a conflict, we're done. 
*/ error = 0; if (request->c.flc_flags & FL_ACCESS) goto out; /* Find the first old lock with the same owner as the new lock */ list_for_each_entry(fl, &ctx->flc_posix, c.flc_list) { if (posix_same_owner(&request->c, &fl->c)) break; } /* Process locks with this owner. */ list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, c.flc_list) { if (!posix_same_owner(&request->c, &fl->c)) break; /* Detect adjacent or overlapping regions (if same lock type) */ if (request->c.flc_type == fl->c.flc_type) { /* In all comparisons of start vs end, use * "start - 1" rather than "end + 1". If end * is OFFSET_MAX, end + 1 will become negative. */ if (fl->fl_end < request->fl_start - 1) continue; /* If the next lock in the list has entirely bigger * addresses than the new one, insert the lock here. */ if (fl->fl_start - 1 > request->fl_end) break; /* If we come here, the new and old lock are of the * same type and adjacent or overlapping. Make one * lock yielding from the lower start address of both * locks to the higher end address. */ if (fl->fl_start > request->fl_start) fl->fl_start = request->fl_start; else request->fl_start = fl->fl_start; if (fl->fl_end < request->fl_end) fl->fl_end = request->fl_end; else request->fl_end = fl->fl_end; if (added) { locks_delete_lock_ctx(&fl->c, &dispose); continue; } request = fl; added = true; } else { /* Processing for different lock types is a bit * more complex. */ if (fl->fl_end < request->fl_start) continue; if (fl->fl_start > request->fl_end) break; if (lock_is_unlock(request)) added = true; if (fl->fl_start < request->fl_start) left = fl; /* If the next lock in the list has a higher end * address than the new one, insert the new one here. */ if (fl->fl_end > request->fl_end) { right = fl; break; } if (fl->fl_start >= request->fl_start) { /* The new lock completely replaces an old * one (This may happen several times). */ if (added) { locks_delete_lock_ctx(&fl->c, &dispose); continue; } /* * Replace the old lock with new_fl, and * remove the old one. It's safe to do the * insert here since we know that we won't be * using new_fl later, and that the lock is * just replacing an existing lock. */ error = -ENOLCK; if (!new_fl) goto out; locks_copy_lock(new_fl, request); locks_move_blocks(new_fl, request); request = new_fl; new_fl = NULL; locks_insert_lock_ctx(&request->c, &fl->c.flc_list); locks_delete_lock_ctx(&fl->c, &dispose); added = true; } } } /* * The above code only modifies existing locks in case of merging or * replacing. If new lock(s) need to be inserted all modifications are * done below this, so it's safe yet to bail out. */ error = -ENOLCK; /* "no luck" */ if (right && left == right && !new_fl2) goto out; error = 0; if (!added) { if (lock_is_unlock(request)) { if (request->c.flc_flags & FL_EXISTS) error = -ENOENT; goto out; } if (!new_fl) { error = -ENOLCK; goto out; } locks_copy_lock(new_fl, request); locks_move_blocks(new_fl, request); locks_insert_lock_ctx(&new_fl->c, &fl->c.flc_list); fl = new_fl; new_fl = NULL; } if (right) { if (left == right) { /* The new lock breaks the old one in two pieces, * so we have to use the second new lock. */ left = new_fl2; new_fl2 = NULL; locks_copy_lock(left, right); locks_insert_lock_ctx(&left->c, &fl->c.flc_list); } right->fl_start = request->fl_end + 1; locks_wake_up_blocks(&right->c); } if (left) { left->fl_end = request->fl_start - 1; locks_wake_up_blocks(&left->c); } out: trace_posix_lock_inode(inode, request, error); spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); /* * Free any unused locks. 
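 * Illustrative: unlocking bytes [40,59] out of an existing lock covering
 * [0,99] is the case that needs new_fl2 above: the old lock is trimmed to
 * [60,99] and new_fl2 takes over [0,39].  A request that simply merges into
 * an existing same-owner lock consumes neither preallocated structure, and
 * both are released here.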
*/ if (new_fl) locks_free_lock(new_fl); if (new_fl2) locks_free_lock(new_fl2); locks_dispose_list(&dispose); return error; } /** * posix_lock_file - Apply a POSIX-style lock to a file * @filp: The file to apply the lock to * @fl: The lock to be applied * @conflock: Place to return a copy of the conflicting lock, if found. * * Add a POSIX style lock to a file. * We merge adjacent & overlapping locks whenever possible. * POSIX locks are sorted by owner task, then by starting address * * Note that if called with an FL_EXISTS argument, the caller may determine * whether or not a lock was successfully freed by testing the return * value for -ENOENT. */ int posix_lock_file(struct file *filp, struct file_lock *fl, struct file_lock *conflock) { return posix_lock_inode(file_inode(filp), fl, conflock); } EXPORT_SYMBOL(posix_lock_file); /** * posix_lock_inode_wait - Apply a POSIX-style lock to a file * @inode: inode of file to which lock request should be applied * @fl: The lock to be applied * * Apply a POSIX style lock request to an inode. */ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) { int error; might_sleep (); for (;;) { error = posix_lock_inode(inode, fl, NULL); if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl->c.flc_wait, list_empty(&fl->c.flc_blocked_member)); if (error) break; } locks_delete_block(fl); return error; } static void lease_clear_pending(struct file_lease *fl, int arg) { switch (arg) { case F_UNLCK: fl->c.flc_flags &= ~FL_UNLOCK_PENDING; fallthrough; case F_RDLCK: fl->c.flc_flags &= ~FL_DOWNGRADE_PENDING; } } /* We already had a lease on this file; just change its type */ int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose) { int error = assign_type(&fl->c, arg); if (error) return error; lease_clear_pending(fl, arg); locks_wake_up_blocks(&fl->c); if (arg == F_UNLCK) { struct file *filp = fl->c.flc_file; f_delown(filp); file_f_owner(filp)->signum = 0; fasync_helper(0, fl->c.flc_file, 0, &fl->fl_fasync); if (fl->fl_fasync != NULL) { printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); fl->fl_fasync = NULL; } locks_delete_lock_ctx(&fl->c, dispose); } return 0; } EXPORT_SYMBOL(lease_modify); static bool past_time(unsigned long then) { if (!then) /* 0 is a special value meaning "this never expires": */ return false; return time_after(jiffies, then); } static void time_out_leases(struct inode *inode, struct list_head *dispose) { struct file_lock_context *ctx = inode->i_flctx; struct file_lease *fl, *tmp; bool remove; lockdep_assert_held(&ctx->flc_lock); list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list) { trace_time_out_leases(inode, fl); if (past_time(fl->fl_downgrade_time)) lease_modify(fl, F_RDLCK, dispose); remove = true; if (past_time(fl->fl_break_time)) { /* * Consult the lease manager when a lease break times * out to determine whether the lease should be disposed * of. 
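 * (The break timer itself is armed in __break_lease() below: with the
 * default lease_break_time of 45 seconds, fl_break_time is set to
 * jiffies + 45*HZ; a value of 0 is the special "never expires" case
 * handled by past_time() above.)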
*/ if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) remove = fl->fl_lmops->lm_breaker_timedout(fl); if (remove) lease_modify(fl, F_UNLCK, dispose); } } } static bool leases_conflict(struct file_lock_core *lc, struct file_lock_core *bc) { bool rc; struct file_lease *lease = file_lease(lc); struct file_lease *breaker = file_lease(bc); if (lease->fl_lmops->lm_breaker_owns_lease && lease->fl_lmops->lm_breaker_owns_lease(lease)) return false; if ((bc->flc_flags & FL_LAYOUT) != (lc->flc_flags & FL_LAYOUT)) { rc = false; goto trace; } if ((bc->flc_flags & FL_DELEG) && (lc->flc_flags & FL_LEASE)) { rc = false; goto trace; } rc = locks_conflict(bc, lc); trace: trace_leases_conflict(rc, lease, breaker); return rc; } static bool any_leases_conflict(struct inode *inode, struct file_lease *breaker) { struct file_lock_context *ctx = inode->i_flctx; struct file_lock_core *flc; lockdep_assert_held(&ctx->flc_lock); list_for_each_entry(flc, &ctx->flc_lease, flc_list) { if (leases_conflict(flc, &breaker->c)) return true; } return false; } /** * __break_lease - revoke all outstanding leases on file * @inode: the inode of the file to return * @flags: LEASE_BREAK_* flags * * break_lease (inlined for speed) has checked there already is at least * some kind of lock (maybe a lease) on this file. Leases are broken on * a call to open() or truncate(). This function can block waiting for the * lease break unless you specify LEASE_BREAK_NONBLOCK. */ int __break_lease(struct inode *inode, unsigned int flags) { struct file_lease *new_fl, *fl, *tmp; struct file_lock_context *ctx; unsigned long break_time; unsigned int type; LIST_HEAD(dispose); bool want_write = !(flags & LEASE_BREAK_OPEN_RDONLY); int error = 0; if (flags & LEASE_BREAK_LEASE) type = FL_LEASE; else if (flags & LEASE_BREAK_DELEG) type = FL_DELEG; else if (flags & LEASE_BREAK_LAYOUT) type = FL_LAYOUT; else return -EINVAL; new_fl = lease_alloc(NULL, type, want_write ? 
F_WRLCK : F_RDLCK); if (IS_ERR(new_fl)) return PTR_ERR(new_fl); /* typically we will check that ctx is non-NULL before calling */ ctx = locks_inode_context(inode); if (!ctx) { WARN_ON_ONCE(1); goto free_lock; } percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); if (!any_leases_conflict(inode, new_fl)) goto out; break_time = 0; if (lease_break_time > 0) { break_time = jiffies + lease_break_time * HZ; if (break_time == 0) break_time++; /* so that 0 means no break time */ } list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list) { if (!leases_conflict(&fl->c, &new_fl->c)) continue; if (want_write) { if (fl->c.flc_flags & FL_UNLOCK_PENDING) continue; fl->c.flc_flags |= FL_UNLOCK_PENDING; fl->fl_break_time = break_time; } else { if (lease_breaking(fl)) continue; fl->c.flc_flags |= FL_DOWNGRADE_PENDING; fl->fl_downgrade_time = break_time; } if (fl->fl_lmops->lm_break(fl)) locks_delete_lock_ctx(&fl->c, &dispose); } if (list_empty(&ctx->flc_lease)) goto out; if (flags & LEASE_BREAK_NONBLOCK) { trace_break_lease_noblock(inode, new_fl); error = -EWOULDBLOCK; goto out; } restart: fl = list_first_entry(&ctx->flc_lease, struct file_lease, c.flc_list); break_time = fl->fl_break_time; if (break_time != 0) { if (time_after(jiffies, break_time)) { fl->fl_break_time = jiffies + lease_break_time * HZ; break_time = lease_break_time * HZ; } else break_time -= jiffies; } else break_time++; locks_insert_block(&fl->c, &new_fl->c, leases_conflict); trace_break_lease_block(inode, new_fl); spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); lease_dispose_list(&dispose); error = wait_event_interruptible_timeout(new_fl->c.flc_wait, list_empty(&new_fl->c.flc_blocked_member), break_time); percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); trace_break_lease_unblock(inode, new_fl); __locks_delete_block(&new_fl->c); if (error >= 0) { /* * Wait for the next conflicting lease that has not been * broken yet */ if (error == 0) time_out_leases(inode, &dispose); if (any_leases_conflict(inode, new_fl)) goto restart; error = 0; } out: spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); lease_dispose_list(&dispose); free_lock: locks_free_lease(new_fl); return error; } EXPORT_SYMBOL(__break_lease); /** * lease_get_mtime - update modified time of an inode with exclusive lease * @inode: the inode * @time: pointer to a timespec which contains the last modified time * * This is to force NFS clients to flush their caches for files with * exclusive leases. The justification is that if someone has an * exclusive lease, then they could be modifying it. */ void lease_get_mtime(struct inode *inode, struct timespec64 *time) { bool has_lease = false; struct file_lock_context *ctx; struct file_lock_core *flc; ctx = locks_inode_context(inode); if (ctx && !list_empty_careful(&ctx->flc_lease)) { spin_lock(&ctx->flc_lock); flc = list_first_entry_or_null(&ctx->flc_lease, struct file_lock_core, flc_list); if (flc && flc->flc_type == F_WRLCK) has_lease = true; spin_unlock(&ctx->flc_lock); } if (has_lease) *time = current_time(inode); } EXPORT_SYMBOL(lease_get_mtime); /** * __fcntl_getlease - Enquire what lease is currently active * @filp: the file * @flavor: type of lease flags to check * * The value returned by this function will be one of * (if no lease break is pending): * * %F_RDLCK to indicate a shared lease is held. * * %F_WRLCK to indicate an exclusive lease is held. * * %F_UNLCK to indicate no lease is held. 
* * (if a lease break is pending): * * %F_RDLCK to indicate an exclusive lease needs to be * changed to a shared lease (or removed). * * %F_UNLCK to indicate the lease needs to be removed. * * XXX: sfr & willy disagree over whether F_INPROGRESS * should be returned to userspace. */ static int __fcntl_getlease(struct file *filp, unsigned int flavor) { struct file_lease *fl; struct inode *inode = file_inode(filp); struct file_lock_context *ctx; int type = F_UNLCK; LIST_HEAD(dispose); ctx = locks_inode_context(inode); if (ctx && !list_empty_careful(&ctx->flc_lease)) { percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) { if (fl->c.flc_file != filp) continue; if (fl->c.flc_flags & flavor) type = target_leasetype(fl); break; } spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); lease_dispose_list(&dispose); } return type; } int fcntl_getlease(struct file *filp) { return __fcntl_getlease(filp, FL_LEASE); } int fcntl_getdeleg(struct file *filp, struct delegation *deleg) { if (deleg->d_flags != 0 || deleg->__pad != 0) return -EINVAL; deleg->d_type = __fcntl_getlease(filp, FL_DELEG); return 0; } static int generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **priv) { struct file_lease *fl, *my_fl = NULL, *lease; struct inode *inode = file_inode(filp); struct file_lock_context *ctx; bool is_deleg = (*flp)->c.flc_flags & FL_DELEG; int error; LIST_HEAD(dispose); lease = *flp; trace_generic_add_lease(inode, lease); error = file_f_owner_allocate(filp); if (error) return error; /* Note that arg is never F_UNLCK here */ ctx = locks_get_lock_context(inode, arg); if (!ctx) return -ENOMEM; /* * In the delegation case we need mutual exclusion with * a number of operations that take the i_rwsem. We trylock * because delegations are an optional optimization, and if * there's some chance of a conflict--we'd rather not * bother, maybe that's a sign this just isn't a good file to * hand out a delegation on. */ if (is_deleg && !inode_trylock(inode)) return -EAGAIN; percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); error = lease->fl_lmops->lm_open_conflict(filp, arg); if (error) goto out; /* * At this point, we know that if there is an exclusive * lease on this file, then we hold it on this filp * (otherwise our open of this file would have blocked). * And if we are trying to acquire an exclusive lease, * then the file is not open by anyone (including us) * except for this filp. */ error = -EAGAIN; list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) { if (fl->c.flc_file == filp && fl->c.flc_owner == lease->c.flc_owner) { my_fl = fl; continue; } /* * No exclusive leases if someone else has a lease on * this file: */ if (arg == F_WRLCK) goto out; /* * Modifying our existing lease is OK, but no getting a * new lease if someone else is opening for write: */ if (fl->c.flc_flags & FL_UNLOCK_PENDING) goto out; } if (my_fl != NULL) { lease = my_fl; error = lease->fl_lmops->lm_change(lease, arg, &dispose); if (error) goto out; goto out_setup; } error = -EINVAL; if (!leases_enable) goto out; locks_insert_lock_ctx(&lease->c, &ctx->flc_lease); /* * The check in break_lease() is lockless. It's possible for another * open to race in after we did the earlier check for a conflicting * open but before the lease was inserted. Check again for a * conflicting open and cancel the lease if there is one. 
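 *
 * An illustrative interleaving that the recheck closes (hypothetical):
 *
 *	task A: F_SETLEASE, F_WRLCK         task B: open() for write
 *	lm_open_conflict(): no other opens
 *	                                    break_lease(): flc_lease is still
 *	                                    empty, nothing to break, the open
 *	                                    completes
 *	insert lease into ctx->flc_lease
 *	recheck lm_open_conflict(): sees
 *	B's open, unlink the lease, -EAGAIN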
* * We also add a barrier here to ensure that the insertion of the lock * precedes these checks. */ smp_mb(); error = lease->fl_lmops->lm_open_conflict(filp, arg); if (error) { locks_unlink_lock_ctx(&lease->c); goto out; } out_setup: if (lease->fl_lmops->lm_setup) lease->fl_lmops->lm_setup(lease, priv); out: spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); lease_dispose_list(&dispose); if (is_deleg) inode_unlock(inode); if (!error && !my_fl) *flp = NULL; return error; } static int generic_delete_lease(struct file *filp, void *owner) { int error = -EAGAIN; struct file_lease *fl, *victim = NULL; struct inode *inode = file_inode(filp); struct file_lock_context *ctx; LIST_HEAD(dispose); ctx = locks_inode_context(inode); if (!ctx) { trace_generic_delete_lease(inode, NULL); return error; } percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) { if (fl->c.flc_file == filp && fl->c.flc_owner == owner) { victim = fl; break; } } trace_generic_delete_lease(inode, victim); if (victim) error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose); spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); lease_dispose_list(&dispose); return error; } /** * generic_setlease - sets a lease on an open file * @filp: file pointer * @arg: type of lease to obtain * @flp: input - file_lock to use, output - file_lock inserted * @priv: private data for lm_setup (may be NULL if lm_setup * doesn't require it) * * The (input) flp->fl_lmops->lm_break function is required * by break_lease(). */ int generic_setlease(struct file *filp, int arg, struct file_lease **flp, void **priv) { struct inode *inode = file_inode(filp); if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) return -EINVAL; switch (arg) { case F_UNLCK: return generic_delete_lease(filp, *priv); case F_WRLCK: if (S_ISDIR(inode->i_mode)) return -EINVAL; fallthrough; case F_RDLCK: if (!(*flp)->fl_lmops->lm_break) { WARN_ON_ONCE(1); return -ENOLCK; } return generic_add_lease(filp, arg, flp, priv); default: return -EINVAL; } } EXPORT_SYMBOL(generic_setlease); /* * Kernel subsystems can register to be notified on any attempt to set * a new lease with the lease_notifier_chain. This is used by (e.g.) nfsd * to close files that it may have cached when there is an attempt to set a * conflicting lease. */ static struct srcu_notifier_head lease_notifier_chain; static inline void lease_notifier_chain_init(void) { srcu_init_notifier_head(&lease_notifier_chain); } static inline void setlease_notifier(int arg, struct file_lease *lease) { if (arg != F_UNLCK) srcu_notifier_call_chain(&lease_notifier_chain, arg, lease); } int lease_register_notifier(struct notifier_block *nb) { return srcu_notifier_chain_register(&lease_notifier_chain, nb); } EXPORT_SYMBOL_GPL(lease_register_notifier); void lease_unregister_notifier(struct notifier_block *nb) { srcu_notifier_chain_unregister(&lease_notifier_chain, nb); } EXPORT_SYMBOL_GPL(lease_unregister_notifier); int kernel_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv) { if (lease) setlease_notifier(arg, *lease); if (filp->f_op->setlease) return filp->f_op->setlease(filp, arg, lease, priv); return -EINVAL; } EXPORT_SYMBOL_GPL(kernel_setlease); /** * vfs_setlease - sets a lease on an open file * @filp: file pointer * @arg: type of lease to obtain * @lease: file_lock to use when adding a lease * @priv: private info for lm_setup when adding a lease (may be * NULL if lm_setup doesn't require it) * * Call this to establish a lease on the file. 
The "lease" argument is not * used for F_UNLCK requests and may be NULL. For commands that set or alter * an existing lease, the ``(*lease)->fl_lmops->lm_break`` operation must be * set; if not, this function will return -ENOLCK (and generate a scary-looking * stack trace). * * The "priv" pointer is passed directly to the lm_setup function as-is. It * may be NULL if the lm_setup operation doesn't require it. */ int vfs_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv) { struct inode *inode = file_inode(filp); vfsuid_t vfsuid = i_uid_into_vfsuid(file_mnt_idmap(filp), inode); int error; if ((!vfsuid_eq_kuid(vfsuid, current_fsuid())) && !capable(CAP_LEASE)) return -EACCES; error = security_file_lock(filp, arg); if (error) return error; return kernel_setlease(filp, arg, lease, priv); } EXPORT_SYMBOL_GPL(vfs_setlease); static int do_fcntl_add_lease(unsigned int fd, struct file *filp, unsigned int flavor, int arg) { struct file_lease *fl; struct fasync_struct *new; int error; fl = lease_alloc(filp, flavor, arg); if (IS_ERR(fl)) return PTR_ERR(fl); new = fasync_alloc(); if (!new) { locks_free_lease(fl); return -ENOMEM; } new->fa_fd = fd; error = vfs_setlease(filp, arg, &fl, (void **)&new); if (fl) locks_free_lease(fl); if (new) fasync_free(new); return error; } /** * fcntl_setlease - sets a lease on an open file * @fd: open file descriptor * @filp: file pointer * @arg: type of lease to obtain * * Call this fcntl to establish a lease on the file. * Note that you also need to call %F_SETSIG to * receive a signal when the lease is broken. */ int fcntl_setlease(unsigned int fd, struct file *filp, int arg) { if (S_ISDIR(file_inode(filp)->i_mode)) return -EINVAL; if (arg == F_UNLCK) return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp); return do_fcntl_add_lease(fd, filp, FL_LEASE, arg); } /** * fcntl_setdeleg - sets a delegation on an open file * @fd: open file descriptor * @filp: file pointer * @deleg: delegation request from userland * * Call this fcntl to establish a delegation on the file. * Note that you also need to call %F_SETSIG to * receive a signal when the lease is broken. */ int fcntl_setdeleg(unsigned int fd, struct file *filp, struct delegation *deleg) { /* For now, no flags are supported */ if (deleg->d_flags != 0 || deleg->__pad != 0) return -EINVAL; if (deleg->d_type == F_UNLCK) return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp); return do_fcntl_add_lease(fd, filp, FL_DELEG, deleg->d_type); } /** * flock_lock_inode_wait - Apply a FLOCK-style lock to a file * @inode: inode of the file to apply to * @fl: The lock to be applied * * Apply a FLOCK style lock request to an inode. */ static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl) { int error; might_sleep(); for (;;) { error = flock_lock_inode(inode, fl); if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl->c.flc_wait, list_empty(&fl->c.flc_blocked_member)); if (error) break; } locks_delete_block(fl); return error; } /** * locks_lock_inode_wait - Apply a lock to an inode * @inode: inode of the file to apply to * @fl: The lock to be applied * * Apply a POSIX or FLOCK style lock request to an inode. 
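 *
 * Illustrative caller (a hypothetical filesystem ->flock method; real
 * callers normally go through the locks_lock_file_wait() wrapper):
 *
 *	static int myfs_flock(struct file *filp, int cmd, struct file_lock *fl)
 *	{
 *		// hypothetical: apply the lock locally
 *		return locks_lock_inode_wait(file_inode(filp), fl);
 *	}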
*/ int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl) { int res = 0; switch (fl->c.flc_flags & (FL_POSIX|FL_FLOCK)) { case FL_POSIX: res = posix_lock_inode_wait(inode, fl); break; case FL_FLOCK: res = flock_lock_inode_wait(inode, fl); break; default: BUG(); } return res; } EXPORT_SYMBOL(locks_lock_inode_wait); /** * sys_flock: - flock() system call. * @fd: the file descriptor to lock. * @cmd: the type of lock to apply. * * Apply a %FL_FLOCK style lock to an open file descriptor. * The @cmd can be one of: * * - %LOCK_SH -- a shared lock. * - %LOCK_EX -- an exclusive lock. * - %LOCK_UN -- remove an existing lock. * - %LOCK_MAND -- a 'mandatory' flock. (DEPRECATED) * * %LOCK_MAND support has been removed from the kernel. */ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) { int can_sleep, error, type; struct file_lock fl; /* * LOCK_MAND locks were broken for a long time in that they never * conflicted with one another and didn't prevent any sort of open, * read or write activity. * * Just ignore these requests now, to preserve legacy behavior, but * throw a warning to let people know that they don't actually work. */ if (cmd & LOCK_MAND) { pr_warn_once("%s(%d): Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n", current->comm, current->pid); return 0; } type = flock_translate_cmd(cmd & ~LOCK_NB); if (type < 0) return type; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; if (type != F_UNLCK && !(fd_file(f)->f_mode & (FMODE_READ | FMODE_WRITE))) return -EBADF; flock_make_lock(fd_file(f), &fl, type); error = security_file_lock(fd_file(f), fl.c.flc_type); if (error) return error; can_sleep = !(cmd & LOCK_NB); if (can_sleep) fl.c.flc_flags |= FL_SLEEP; if (fd_file(f)->f_op->flock) error = fd_file(f)->f_op->flock(fd_file(f), (can_sleep) ? F_SETLKW : F_SETLK, &fl); else error = locks_lock_file_wait(fd_file(f), &fl); locks_release_private(&fl); return error; } /** * vfs_test_lock - test file byte range lock * @filp: The file to test lock for * @fl: The byte-range in the file to test; also used to hold result * * On entry, @fl does not contain a lock, but identifies a range (fl_start, fl_end) * in the file (c.flc_file), and an owner (c.flc_owner) for whom existing locks * should be ignored. c.flc_type and c.flc_flags are ignored. * Both fl_lmops and fl_ops in @fl must be NULL. * Returns -ERRNO on failure. Indicates presence of conflicting lock by * setting fl->fl_type to something other than F_UNLCK. * * If vfs_test_lock() does find a lock and return it, the caller must * use locks_free_lock() or locks_release_private() on the returned lock. */ int vfs_test_lock(struct file *filp, struct file_lock *fl) { int error = 0; WARN_ON_ONCE(fl->fl_ops || fl->fl_lmops); WARN_ON_ONCE(filp != fl->c.flc_file); if (filp->f_op->lock) error = filp->f_op->lock(filp, F_GETLK, fl); else posix_test_lock(filp, fl); /* * We don't expect FILE_LOCK_DEFERRED and callers cannot * handle it. 
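 *
 * For reference, the usual userspace trigger for this path is F_GETLK
 * (illustrative; "fd" is assumed to be an open descriptor):
 *
 *	struct flock fl = {
 *		.l_type   = F_WRLCK,
 *		.l_whence = SEEK_SET,
 *		.l_start  = 0,
 *		.l_len    = 0,		// whole file
 *	};
 *
 *	fcntl(fd, F_GETLK, &fl);
 *	// fl.l_type is F_UNLCK if nothing conflicts, otherwise fl describes
 *	// the first conflicting lock found.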
*/ if (WARN_ON_ONCE(error == FILE_LOCK_DEFERRED)) error = -EIO; return error; } EXPORT_SYMBOL_GPL(vfs_test_lock); /** * locks_translate_pid - translate a file_lock's fl_pid number into a namespace * @fl: The file_lock who's fl_pid should be translated * @ns: The namespace into which the pid should be translated * * Used to translate a fl_pid into a namespace virtual pid number */ static pid_t locks_translate_pid(struct file_lock_core *fl, struct pid_namespace *ns) { pid_t vnr; struct pid *pid; if (fl->flc_flags & FL_OFDLCK) return -1; /* Remote locks report a negative pid value */ if (fl->flc_pid <= 0) return fl->flc_pid; /* * If the flock owner process is dead and its pid has been already * freed, the translation below won't work, but we still want to show * flock owner pid number in init pidns. */ if (ns == &init_pid_ns) return (pid_t) fl->flc_pid; rcu_read_lock(); pid = find_pid_ns(fl->flc_pid, &init_pid_ns); vnr = pid_nr_ns(pid, ns); rcu_read_unlock(); return vnr; } static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) { flock->l_pid = locks_translate_pid(&fl->c, task_active_pid_ns(current)); #if BITS_PER_LONG == 32 /* * Make sure we can represent the posix lock via * legacy 32bit flock. */ if (fl->fl_start > OFFT_OFFSET_MAX) return -EOVERFLOW; if (fl->fl_end != OFFSET_MAX && fl->fl_end > OFFT_OFFSET_MAX) return -EOVERFLOW; #endif flock->l_start = fl->fl_start; flock->l_len = fl->fl_end == OFFSET_MAX ? 0 : fl->fl_end - fl->fl_start + 1; flock->l_whence = 0; flock->l_type = fl->c.flc_type; return 0; } #if BITS_PER_LONG == 32 static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) { flock->l_pid = locks_translate_pid(&fl->c, task_active_pid_ns(current)); flock->l_start = fl->fl_start; flock->l_len = fl->fl_end == OFFSET_MAX ? 0 : fl->fl_end - fl->fl_start + 1; flock->l_whence = 0; flock->l_type = fl->c.flc_type; } #endif /* Report the first existing lock that would conflict with l. * This implements the F_GETLK command of fcntl(). */ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock) { struct file_lock *fl; int error; fl = locks_alloc_lock(); if (fl == NULL) return -ENOMEM; error = -EINVAL; if (cmd != F_OFD_GETLK && flock->l_type != F_RDLCK && flock->l_type != F_WRLCK) goto out; error = flock_to_posix_lock(filp, fl, flock); if (error) goto out; if (cmd == F_OFD_GETLK) { error = -EINVAL; if (flock->l_pid != 0) goto out; fl->c.flc_flags |= FL_OFDLCK; fl->c.flc_owner = filp; } error = vfs_test_lock(filp, fl); if (error) goto out; flock->l_type = fl->c.flc_type; if (fl->c.flc_type != F_UNLCK) { error = posix_lock_to_flock(flock, fl); if (error) goto out; } out: locks_free_lock(fl); return error; } /** * vfs_lock_file - file byte range lock * @filp: The file to apply the lock to * @cmd: type of locking operation (F_SETLK, F_GETLK, etc.) * @fl: The lock to be applied * @conf: Place to return a copy of the conflicting lock, if found. * * A caller that doesn't care about the conflicting lock may pass NULL * as the final argument. * * If the filesystem defines a private ->lock() method, then @conf will * be left unchanged; so a caller that cares should initialize it to * some acceptable default. * * To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX * locks, the ->lock() interface may return asynchronously, before the lock has * been granted or denied by the underlying filesystem, if (and only if) * lm_grant is set. Additionally FOP_ASYNC_LOCK in file_operations fop_flags * need to be set. 
* * Callers expecting ->lock() to return asynchronously will only use F_SETLK, * not F_SETLKW; they will set FL_SLEEP if (and only if) the request is for a * blocking lock. When ->lock() does return asynchronously, it must return * FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock request completes. * If the request is for non-blocking lock the file system should return * FILE_LOCK_DEFERRED then try to get the lock and call the callback routine * with the result. If the request timed out the callback routine will return a * nonzero return code and the file system should release the lock. The file * system is also responsible to keep a corresponding posix lock when it * grants a lock so the VFS can find out which locks are locally held and do * the correct lock cleanup when required. * The underlying filesystem must not drop the kernel lock or call * ->lm_grant() before returning to the caller with a FILE_LOCK_DEFERRED * return code. */ int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) { WARN_ON_ONCE(filp != fl->c.flc_file); if (filp->f_op->lock) return filp->f_op->lock(filp, cmd, fl); else return posix_lock_file(filp, fl, conf); } EXPORT_SYMBOL_GPL(vfs_lock_file); static int do_lock_file_wait(struct file *filp, unsigned int cmd, struct file_lock *fl) { int error; error = security_file_lock(filp, fl->c.flc_type); if (error) return error; for (;;) { error = vfs_lock_file(filp, cmd, fl, NULL); if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl->c.flc_wait, list_empty(&fl->c.flc_blocked_member)); if (error) break; } locks_delete_block(fl); return error; } /* Ensure that fl->fl_file has compatible f_mode for F_SETLK calls */ static int check_fmode_for_setlk(struct file_lock *fl) { switch (fl->c.flc_type) { case F_RDLCK: if (!(fl->c.flc_file->f_mode & FMODE_READ)) return -EBADF; break; case F_WRLCK: if (!(fl->c.flc_file->f_mode & FMODE_WRITE)) return -EBADF; } return 0; } /* Apply the lock described by l to an open file descriptor. * This implements both the F_SETLK and F_SETLKW commands of fcntl(). */ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, struct flock *flock) { struct file_lock *file_lock = locks_alloc_lock(); struct inode *inode = file_inode(filp); struct file *f; int error; if (file_lock == NULL) return -ENOLCK; error = flock_to_posix_lock(filp, file_lock, flock); if (error) goto out; error = check_fmode_for_setlk(file_lock); if (error) goto out; /* * If the cmd is requesting file-private locks, then set the * FL_OFDLCK flag and override the owner. */ switch (cmd) { case F_OFD_SETLK: error = -EINVAL; if (flock->l_pid != 0) goto out; cmd = F_SETLK; file_lock->c.flc_flags |= FL_OFDLCK; file_lock->c.flc_owner = filp; break; case F_OFD_SETLKW: error = -EINVAL; if (flock->l_pid != 0) goto out; cmd = F_SETLKW; file_lock->c.flc_flags |= FL_OFDLCK; file_lock->c.flc_owner = filp; fallthrough; case F_SETLKW: file_lock->c.flc_flags |= FL_SLEEP; } error = do_lock_file_wait(filp, cmd, file_lock); /* * Detect close/fcntl races and recover by zapping all POSIX locks * associated with this file and our files_struct, just like on * filp_flush(). There is no need to do that when we're * unlocking though, or for OFD locks. */ if (!error && file_lock->c.flc_type != F_UNLCK && !(file_lock->c.flc_flags & FL_OFDLCK)) { struct files_struct *files = current->files; /* * We need that spin_lock here - it prevents reordering between * update of i_flctx->flc_posix and check for it done in * close(). 
rcu_read_lock() wouldn't do. */ spin_lock(&files->file_lock); f = files_lookup_fd_locked(files, fd); spin_unlock(&files->file_lock); if (f != filp) { locks_remove_posix(filp, files); error = -EBADF; } } out: trace_fcntl_setlk(inode, file_lock, error); locks_free_lock(file_lock); return error; } #if BITS_PER_LONG == 32 /* Report the first existing lock that would conflict with l. * This implements the F_GETLK command of fcntl(). */ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock) { struct file_lock *fl; int error; fl = locks_alloc_lock(); if (fl == NULL) return -ENOMEM; error = -EINVAL; if (cmd != F_OFD_GETLK && flock->l_type != F_RDLCK && flock->l_type != F_WRLCK) goto out; error = flock64_to_posix_lock(filp, fl, flock); if (error) goto out; if (cmd == F_OFD_GETLK) { error = -EINVAL; if (flock->l_pid != 0) goto out; fl->c.flc_flags |= FL_OFDLCK; fl->c.flc_owner = filp; } error = vfs_test_lock(filp, fl); if (error) goto out; flock->l_type = fl->c.flc_type; if (fl->c.flc_type != F_UNLCK) posix_lock_to_flock64(flock, fl); out: locks_free_lock(fl); return error; } /* Apply the lock described by l to an open file descriptor. * This implements both the F_SETLK and F_SETLKW commands of fcntl(). */ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, struct flock64 *flock) { struct file_lock *file_lock = locks_alloc_lock(); struct file *f; int error; if (file_lock == NULL) return -ENOLCK; error = flock64_to_posix_lock(filp, file_lock, flock); if (error) goto out; error = check_fmode_for_setlk(file_lock); if (error) goto out; /* * If the cmd is requesting file-private locks, then set the * FL_OFDLCK flag and override the owner. */ switch (cmd) { case F_OFD_SETLK: error = -EINVAL; if (flock->l_pid != 0) goto out; cmd = F_SETLK64; file_lock->c.flc_flags |= FL_OFDLCK; file_lock->c.flc_owner = filp; break; case F_OFD_SETLKW: error = -EINVAL; if (flock->l_pid != 0) goto out; cmd = F_SETLKW64; file_lock->c.flc_flags |= FL_OFDLCK; file_lock->c.flc_owner = filp; fallthrough; case F_SETLKW64: file_lock->c.flc_flags |= FL_SLEEP; } error = do_lock_file_wait(filp, cmd, file_lock); /* * Detect close/fcntl races and recover by zapping all POSIX locks * associated with this file and our files_struct, just like on * filp_flush(). There is no need to do that when we're * unlocking though, or for OFD locks. */ if (!error && file_lock->c.flc_type != F_UNLCK && !(file_lock->c.flc_flags & FL_OFDLCK)) { struct files_struct *files = current->files; /* * We need that spin_lock here - it prevents reordering between * update of i_flctx->flc_posix and check for it done in * close(). rcu_read_lock() wouldn't do. */ spin_lock(&files->file_lock); f = files_lookup_fd_locked(files, fd); spin_unlock(&files->file_lock); if (f != filp) { locks_remove_posix(filp, files); error = -EBADF; } } out: locks_free_lock(file_lock); return error; } #endif /* BITS_PER_LONG == 32 */ /* * This function is called when the file is being removed * from the task's fd array. POSIX locks belonging to this task * are deleted at this time. */ void locks_remove_posix(struct file *filp, fl_owner_t owner) { int error; struct inode *inode = file_inode(filp); struct file_lock lock; struct file_lock_context *ctx; /* * If there are no locks held on this file, we don't need to call * posix_lock_file(). Another process could be setting a lock on this * file at the same time, but we wouldn't remove that lock anyway. 
*/ ctx = locks_inode_context(inode); if (!ctx || list_empty(&ctx->flc_posix)) return; locks_init_lock(&lock); lock.c.flc_type = F_UNLCK; lock.c.flc_flags = FL_POSIX | FL_CLOSE; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; lock.c.flc_owner = owner; lock.c.flc_pid = current->tgid; lock.c.flc_file = filp; lock.fl_ops = NULL; lock.fl_lmops = NULL; error = vfs_lock_file(filp, F_SETLK, &lock, NULL); if (lock.fl_ops && lock.fl_ops->fl_release_private) lock.fl_ops->fl_release_private(&lock); trace_locks_remove_posix(inode, &lock, error); } EXPORT_SYMBOL(locks_remove_posix); /* The i_flctx must be valid when calling into here */ static void locks_remove_flock(struct file *filp, struct file_lock_context *flctx) { struct file_lock fl; struct inode *inode = file_inode(filp); if (list_empty(&flctx->flc_flock)) return; flock_make_lock(filp, &fl, F_UNLCK); fl.c.flc_flags |= FL_CLOSE; if (filp->f_op->flock) filp->f_op->flock(filp, F_SETLKW, &fl); else flock_lock_inode(inode, &fl); if (fl.fl_ops && fl.fl_ops->fl_release_private) fl.fl_ops->fl_release_private(&fl); } /* The i_flctx must be valid when calling into here */ static void locks_remove_lease(struct file *filp, struct file_lock_context *ctx) { struct file_lease *fl, *tmp; LIST_HEAD(dispose); if (list_empty(&ctx->flc_lease)) return; percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list) if (filp == fl->c.flc_file) lease_modify(fl, F_UNLCK, &dispose); spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); lease_dispose_list(&dispose); } /* * This function is called on the last close of an open file. */ void locks_remove_file(struct file *filp) { struct file_lock_context *ctx; ctx = locks_inode_context(file_inode(filp)); if (!ctx) return; /* remove any OFD locks */ locks_remove_posix(filp, filp); /* remove flock locks */ locks_remove_flock(filp, ctx); /* remove any leases */ locks_remove_lease(filp, ctx); spin_lock(&ctx->flc_lock); locks_check_ctx_file_list(filp, &ctx->flc_posix, "POSIX"); locks_check_ctx_file_list(filp, &ctx->flc_flock, "FLOCK"); locks_check_ctx_file_list(filp, &ctx->flc_lease, "LEASE"); spin_unlock(&ctx->flc_lock); } /** * vfs_cancel_lock - file byte range unblock lock * @filp: The file to apply the unblock to * @fl: The lock to be unblocked * * Used by lock managers to cancel blocked requests */ int vfs_cancel_lock(struct file *filp, struct file_lock *fl) { WARN_ON_ONCE(filp != fl->c.flc_file); if (filp->f_op->lock) return filp->f_op->lock(filp, F_CANCELLK, fl); return 0; } EXPORT_SYMBOL_GPL(vfs_cancel_lock); /** * vfs_inode_has_locks - are any file locks held on @inode? * @inode: inode to check for locks * * Return true if there are any FL_POSIX or FL_FLOCK locks currently * set on @inode. 
*/ bool vfs_inode_has_locks(struct inode *inode) { struct file_lock_context *ctx; bool ret; ctx = locks_inode_context(inode); if (!ctx) return false; spin_lock(&ctx->flc_lock); ret = !list_empty(&ctx->flc_posix) || !list_empty(&ctx->flc_flock); spin_unlock(&ctx->flc_lock); return ret; } EXPORT_SYMBOL_GPL(vfs_inode_has_locks); #ifdef CONFIG_PROC_FS #include <linux/proc_fs.h> #include <linux/seq_file.h> struct locks_iterator { int li_cpu; loff_t li_pos; }; static void lock_get_status(struct seq_file *f, struct file_lock_core *flc, loff_t id, char *pfx, int repeat) { struct inode *inode = NULL; unsigned int pid; struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb); int type = flc->flc_type; struct file_lock *fl = file_lock(flc); pid = locks_translate_pid(flc, proc_pidns); /* * If lock owner is dead (and pid is freed) or not visible in current * pidns, zero is shown as a pid value. Check lock info from * init_pid_ns to get saved lock pid value. */ if (flc->flc_file != NULL) inode = file_inode(flc->flc_file); seq_printf(f, "%lld: ", id); if (repeat) seq_printf(f, "%*s", repeat - 1 + (int)strlen(pfx), pfx); if (flc->flc_flags & FL_POSIX) { if (flc->flc_flags & FL_ACCESS) seq_puts(f, "ACCESS"); else if (flc->flc_flags & FL_OFDLCK) seq_puts(f, "OFDLCK"); else seq_puts(f, "POSIX "); seq_printf(f, " %s ", (inode == NULL) ? "*NOINODE*" : "ADVISORY "); } else if (flc->flc_flags & FL_FLOCK) { seq_puts(f, "FLOCK ADVISORY "); } else if (flc->flc_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) { struct file_lease *lease = file_lease(flc); type = target_leasetype(lease); if (flc->flc_flags & FL_DELEG) seq_puts(f, "DELEG "); else seq_puts(f, "LEASE "); if (lease_breaking(lease)) seq_puts(f, "BREAKING "); else if (flc->flc_file) seq_puts(f, "ACTIVE "); else seq_puts(f, "BREAKER "); } else { seq_puts(f, "UNKNOWN UNKNOWN "); } seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" : (type == F_RDLCK) ? 
"READ" : "UNLCK"); if (inode) { /* userspace relies on this representation of dev_t */ seq_printf(f, "%d %02x:%02x:%llu ", pid, MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino); } else { seq_printf(f, "%d <none>:0 ", pid); } if (flc->flc_flags & FL_POSIX) { if (fl->fl_end == OFFSET_MAX) seq_printf(f, "%Ld EOF\n", fl->fl_start); else seq_printf(f, "%Ld %Ld\n", fl->fl_start, fl->fl_end); } else { seq_puts(f, "0 EOF\n"); } } static struct file_lock_core *get_next_blocked_member(struct file_lock_core *node) { struct file_lock_core *tmp; /* NULL node or root node */ if (node == NULL || node->flc_blocker == NULL) return NULL; /* Next member in the linked list could be itself */ tmp = list_next_entry(node, flc_blocked_member); if (list_entry_is_head(tmp, &node->flc_blocker->flc_blocked_requests, flc_blocked_member) || tmp == node) { return NULL; } return tmp; } static int locks_show(struct seq_file *f, void *v) { struct locks_iterator *iter = f->private; struct file_lock_core *cur, *tmp; struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb); int level = 0; cur = hlist_entry(v, struct file_lock_core, flc_link); if (locks_translate_pid(cur, proc_pidns) == 0) return 0; /* View this crossed linked list as a binary tree, the first member of flc_blocked_requests * is the left child of current node, the next silibing in flc_blocked_member is the * right child, we can alse get the parent of current node from flc_blocker, so this * question becomes traversal of a binary tree */ while (cur != NULL) { if (level) lock_get_status(f, cur, iter->li_pos, "-> ", level); else lock_get_status(f, cur, iter->li_pos, "", level); if (!list_empty(&cur->flc_blocked_requests)) { /* Turn left */ cur = list_first_entry_or_null(&cur->flc_blocked_requests, struct file_lock_core, flc_blocked_member); level++; } else { /* Turn right */ tmp = get_next_blocked_member(cur); /* Fall back to parent node */ while (tmp == NULL && cur->flc_blocker != NULL) { cur = cur->flc_blocker; level--; tmp = get_next_blocked_member(cur); } cur = tmp; } } return 0; } static void __show_fd_locks(struct seq_file *f, struct list_head *head, int *id, struct file *filp, struct files_struct *files) { struct file_lock_core *fl; list_for_each_entry(fl, head, flc_list) { if (filp != fl->flc_file) continue; if (fl->flc_owner != files && fl->flc_owner != filp) continue; (*id)++; seq_puts(f, "lock:\t"); lock_get_status(f, fl, *id, "", 0); } } void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files) { struct inode *inode = file_inode(filp); struct file_lock_context *ctx; int id = 0; ctx = locks_inode_context(inode); if (!ctx) return; spin_lock(&ctx->flc_lock); __show_fd_locks(f, &ctx->flc_flock, &id, filp, files); __show_fd_locks(f, &ctx->flc_posix, &id, filp, files); __show_fd_locks(f, &ctx->flc_lease, &id, filp, files); spin_unlock(&ctx->flc_lock); } static void *locks_start(struct seq_file *f, loff_t *pos) __acquires(&blocked_lock_lock) { struct locks_iterator *iter = f->private; iter->li_pos = *pos + 1; percpu_down_write(&file_rwsem); spin_lock(&blocked_lock_lock); return seq_hlist_start_percpu(&file_lock_list.hlist, &iter->li_cpu, *pos); } static void *locks_next(struct seq_file *f, void *v, loff_t *pos) { struct locks_iterator *iter = f->private; ++iter->li_pos; return seq_hlist_next_percpu(v, &file_lock_list.hlist, &iter->li_cpu, pos); } static void locks_stop(struct seq_file *f, void *v) __releases(&blocked_lock_lock) { spin_unlock(&blocked_lock_lock); percpu_up_write(&file_rwsem); } 
static const struct seq_operations locks_seq_operations = { .start = locks_start, .next = locks_next, .stop = locks_stop, .show = locks_show, }; static int __init proc_locks_init(void) { proc_create_seq_private("locks", 0, NULL, &locks_seq_operations, sizeof(struct locks_iterator), NULL); return 0; } fs_initcall(proc_locks_init); #endif static int __init filelock_init(void) { int i; flctx_cache = kmem_cache_create("file_lock_ctx", sizeof(struct file_lock_context), 0, SLAB_PANIC, NULL); filelock_cache = kmem_cache_create("file_lock_cache", sizeof(struct file_lock), 0, SLAB_PANIC, NULL); filelease_cache = kmem_cache_create("file_lease_cache", sizeof(struct file_lease), 0, SLAB_PANIC, NULL); for_each_possible_cpu(i) { struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i); spin_lock_init(&fll->lock); INIT_HLIST_HEAD(&fll->hlist); } lease_notifier_chain_init(); return 0; } core_initcall(filelock_init);
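/*
 * Illustrative note (not part of the original source): with the seq_file
 * machinery above, entries in /proc/locks produced by lock_get_status()
 * look roughly like the lines below.  The pid, device and inode numbers
 * are invented for the example:
 *
 *	1: POSIX  ADVISORY  WRITE 1234 08:02:131090 0 EOF
 *	1: -> POSIX  ADVISORY  WRITE 1235 08:02:131090 0 EOF
 *	2: FLOCK  ADVISORY  WRITE 2345 08:02:131091 0 EOF
 *
 * The "-> " prefix marks a blocked request printed by locks_show() while
 * walking the blocked-lock tree; the same per-lock format, prefixed with
 * "lock:\t", is emitted into /proc/<pid>/fdinfo/<fd> by show_fd_locks().
 */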
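/*
 * Illustrative userspace sketch (not part of the kernel source): how the
 * flock() and OFD-lock fcntl() paths implemented above are typically
 * driven.  The path "/tmp/lockdemo" is an arbitrary name for the example.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/file.h>
#include <unistd.h>

int main(void)
{
	struct flock fl;
	int fd = open("/tmp/lockdemo", O_RDWR | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Whole-file lock; serviced by sys_flock() above. */
	if (flock(fd, LOCK_EX | LOCK_NB) == -1)
		perror("flock");

	/*
	 * Byte-range OFD lock; serviced by fcntl_setlk() above.  l_pid must
	 * be zero for F_OFD_SETLK, as the kernel code enforces.
	 */
	memset(&fl, 0, sizeof(fl));
	fl.l_type = F_WRLCK;
	fl.l_whence = SEEK_SET;
	fl.l_start = 0;
	fl.l_len = 0;			/* zero length means "to EOF" */
	if (fcntl(fd, F_OFD_SETLK, &fl) == -1)
		perror("F_OFD_SETLK");

	/*
	 * Conflict query; serviced by fcntl_getlk() above.  Our own OFD lock
	 * does not conflict with this file description, so l_type is
	 * expected to come back as F_UNLCK.
	 */
	fl.l_type = F_RDLCK;
	if (fcntl(fd, F_OFD_GETLK, &fl) == 0)
		printf("l_type after F_OFD_GETLK: %d\n", fl.l_type);

	close(fd);
	return 0;
}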
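/*
 * Illustrative userspace sketch (not part of the kernel source): taking a
 * read lease through the fcntl_setlease()/vfs_setlease() path above.  The
 * caller must own the file or hold CAP_LEASE, leases cannot be set on
 * directories here, and a read lease requires a read-only descriptor.  The
 * path "/tmp/leasedemo" and the use of SIGRTMIN are arbitrary choices for
 * the example; SIGIO is the default lease-break signal.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void on_lease_break(int sig)
{
	/* Empty handler: we only need pause() below to return. */
	(void)sig;
}

int main(void)
{
	int fd = open("/tmp/leasedemo", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	signal(SIGRTMIN, on_lease_break);

	/* Deliver lease-break notifications as SIGRTMIN instead of SIGIO. */
	if (fcntl(fd, F_SETSIG, SIGRTMIN) == -1)
		perror("F_SETSIG");

	/* Read lease; broken when another process opens the file for write. */
	if (fcntl(fd, F_SETLEASE, F_RDLCK) == -1)
		perror("F_SETLEASE");

	pause();	/* wait for the lease-break signal */

	/* Release within /proc/sys/fs/lease-break-time seconds. */
	fcntl(fd, F_SETLEASE, F_UNLCK);
	close(fd);
	return 0;
}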
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/namei.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* * Some corrections by tytso. */ /* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname * lookup logic.
*/ /* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture. */ #include <linux/init.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/wordpart.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/namei.h> #include <linux/pagemap.h> #include <linux/sched/mm.h> #include <linux/fsnotify.h> #include <linux/personality.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/mount.h> #include <linux/audit.h> #include <linux/capability.h> #include <linux/file.h> #include <linux/fcntl.h> #include <linux/device_cgroup.h> #include <linux/fs_struct.h> #include <linux/posix_acl.h> #include <linux/hash.h> #include <linux/bitops.h> #include <linux/init_task.h> #include <linux/uaccess.h> #include <asm/runtime-const.h> #include "internal.h" #include "mount.h" /* [Feb-1997 T. Schoebel-Theuer] * Fundamental changes in the pathname lookup mechanisms (namei) * were necessary because of omirr. The reason is that omirr needs * to know the _real_ pathname, not the user-supplied one, in case * of symlinks (and also when transname replacements occur). * * The new code replaces the old recursive symlink resolution with * an iterative one (in case of non-nested symlink chains). It does * this with calls to <fs>_follow_link(). * As a side effect, dir_namei(), _namei() and follow_link() are now * replaced with a single function lookup_dentry() that can handle all * the special cases of the former code. * * With the new dcache, the pathname is stored at each inode, at least as * long as the refcount of the inode is positive. As a side effect, the * size of the dcache depends on the inode cache and thus is dynamic. * * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink * resolution to correspond with current state of the code. * * Note that the symlink resolution is not *completely* iterative. * There is still a significant amount of tail- and mid- recursion in * the algorithm. Also, note that <fs>_readlink() is not used in * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink() * may return different results than <fs>_follow_link(). Many virtual * filesystems (including /proc) exhibit this behavior. */ /* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation: * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL * and the name already exists in form of a symlink, try to create the new * name indicated by the symlink. The old code always complained that the * name already exists, due to not following the symlink even if its target * is nonexistent. The new semantics affects also mknod() and link() when * the name is a symlink pointing to a non-existent name. * * I don't know which semantics is the right one, since I have no access * to standards. But I found by trial that HP-UX 9.0 has the full "new" * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the * "old" one. Personally, I think the new semantics is much more logical. * Note that "ln old new" where "new" is a symlink pointing to a non-existing * file does succeed in both HP-UX and SunOs, but not in Solaris * and in the old Linux semantics. */ /* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink * semantics. See the comments in "open_namei" and "do_link" below. * * [10-Sep-98 Alan Modra] Another symlink change. */ /* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks: * inside the path - always follow. * in the last component in creation/removal/renaming - never follow. 
* if LOOKUP_FOLLOW passed - follow. * if the pathname has trailing slashes - follow. * otherwise - don't follow. * (applied in that order). * * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT * restored for 2.4. This is the last surviving part of old 4.2BSD bug. * During the 2.4 we need to fix the userland stuff depending on it - * hopefully we will be able to get rid of that wart in 2.5. So far only * XEmacs seems to be relying on it... */ /* * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland) * implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives * any extra contention... */ /* In order to reduce some races, while at the same time doing additional * checking and hopefully speeding things up, we copy filenames to the * kernel data space before using them.. * * POSIX.1 2.4: an empty pathname is invalid (ENOENT). * PATH_MAX includes the nul terminator --RR. */ /* SLAB cache for struct filename instances */ static struct kmem_cache *__names_cache __ro_after_init; #define names_cache runtime_const_ptr(__names_cache) void __init filename_init(void) { __names_cache = kmem_cache_create_usercopy("names_cache", sizeof(struct filename), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, offsetof(struct filename, iname), EMBEDDED_NAME_MAX, NULL); runtime_const_init(ptr, __names_cache); } static inline struct filename *alloc_filename(void) { return kmem_cache_alloc(names_cache, GFP_KERNEL); } static inline void free_filename(struct filename *p) { kmem_cache_free(names_cache, p); } static inline void initname(struct filename *name) { name->aname = NULL; name->refcnt = 1; } static int getname_long(struct filename *name, const char __user *filename) { int len; char *p __free(kfree) = kmalloc(PATH_MAX, GFP_KERNEL); if (unlikely(!p)) return -ENOMEM; memcpy(p, &name->iname, EMBEDDED_NAME_MAX); len = strncpy_from_user(p + EMBEDDED_NAME_MAX, filename + EMBEDDED_NAME_MAX, PATH_MAX - EMBEDDED_NAME_MAX); if (unlikely(len < 0)) return len; if (unlikely(len == PATH_MAX - EMBEDDED_NAME_MAX)) return -ENAMETOOLONG; name->name = no_free_ptr(p); return 0; } static struct filename * do_getname(const char __user *filename, int flags, bool incomplete) { struct filename *result; char *kname; int len; result = alloc_filename(); if (unlikely(!result)) return ERR_PTR(-ENOMEM); /* * First, try to embed the struct filename inside the names_cache * allocation */ kname = (char *)result->iname; result->name = kname; len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX); /* * Handle both empty path and copy failure in one go. */ if (unlikely(len <= 0)) { /* The empty path is special. */ if (!len && !(flags & LOOKUP_EMPTY)) len = -ENOENT; } /* * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a * separate struct filename so we can dedicate the entire * names_cache allocation for the pathname, and re-do the copy from * userland. */ if (unlikely(len == EMBEDDED_NAME_MAX)) len = getname_long(result, filename); if (unlikely(len < 0)) { free_filename(result); return ERR_PTR(len); } initname(result); if (likely(!incomplete)) audit_getname(result); return result; } struct filename * getname_flags(const char __user *filename, int flags) { return do_getname(filename, flags, false); } struct filename *getname_uflags(const char __user *filename, int uflags) { int flags = (uflags & AT_EMPTY_PATH) ? 
LOOKUP_EMPTY : 0; return getname_flags(filename, flags); } struct filename *__getname_maybe_null(const char __user *pathname) { char c; /* try to save on allocations; loss on um, though */ if (get_user(c, pathname)) return ERR_PTR(-EFAULT); if (!c) return NULL; CLASS(filename_flags, name)(pathname, LOOKUP_EMPTY); /* empty pathname translates to NULL */ if (!IS_ERR(name) && !(name->name[0])) return NULL; return no_free_ptr(name); } static struct filename *do_getname_kernel(const char *filename, bool incomplete) { struct filename *result; int len = strlen(filename) + 1; char *p; if (unlikely(len > PATH_MAX)) return ERR_PTR(-ENAMETOOLONG); result = alloc_filename(); if (unlikely(!result)) return ERR_PTR(-ENOMEM); if (len <= EMBEDDED_NAME_MAX) { p = (char *)result->iname; memcpy(p, filename, len); } else { p = kmemdup(filename, len, GFP_KERNEL); if (unlikely(!p)) { free_filename(result); return ERR_PTR(-ENOMEM); } } result->name = p; initname(result); if (likely(!incomplete)) audit_getname(result); return result; } struct filename *getname_kernel(const char *filename) { return do_getname_kernel(filename, false); } EXPORT_SYMBOL(getname_kernel); void putname(struct filename *name) { int refcnt; if (IS_ERR_OR_NULL(name)) return; refcnt = name->refcnt; if (unlikely(refcnt != 1)) { if (WARN_ON_ONCE(!refcnt)) return; name->refcnt--; return; } if (unlikely(name->name != name->iname)) kfree(name->name); free_filename(name); } EXPORT_SYMBOL(putname); static inline int __delayed_getname(struct delayed_filename *v, const char __user *string, int flags) { v->__incomplete_filename = do_getname(string, flags, true); return PTR_ERR_OR_ZERO(v->__incomplete_filename); } int delayed_getname(struct delayed_filename *v, const char __user *string) { return __delayed_getname(v, string, 0); } int delayed_getname_uflags(struct delayed_filename *v, const char __user *string, int uflags) { int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; return __delayed_getname(v, string, flags); } int putname_to_delayed(struct delayed_filename *v, struct filename *name) { if (likely(name->refcnt == 1)) { v->__incomplete_filename = name; return 0; } name->refcnt--; v->__incomplete_filename = do_getname_kernel(name->name, true); return PTR_ERR_OR_ZERO(v->__incomplete_filename); } void dismiss_delayed_filename(struct delayed_filename *v) { putname(no_free_ptr(v->__incomplete_filename)); } struct filename *complete_getname(struct delayed_filename *v) { struct filename *res = no_free_ptr(v->__incomplete_filename); if (!IS_ERR(res)) audit_getname(res); return res; } /** * check_acl - perform ACL permission checking * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * * This function performs the ACL permission checking. Since this function * retrieve POSIX acls it needs to know whether it is called from a blocking or * non-blocking context and thus cares about the MAY_NOT_BLOCK bit. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. 
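 *
 * For illustration only: check_acl() is reached via generic_permission(),
 * which filesystems without a ->permission() hook use directly and which a
 * custom hook typically falls back to.  A minimal sketch of such a hook
 * (hypothetical "myfs", not part of this file) might look like:
 *
 *	static int myfs_permission(struct mnt_idmap *idmap,
 *				   struct inode *inode, int mask)
 *	{
 *		if (mask & MAY_NOT_BLOCK)
 *			return -ECHILD;	// defer any blocking work to ref-walk
 *		return generic_permission(idmap, inode, mask);
 *	}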
*/ static int check_acl(struct mnt_idmap *idmap, struct inode *inode, int mask) { #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *acl; if (mask & MAY_NOT_BLOCK) { acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS); if (!acl) return -EAGAIN; /* no ->get_inode_acl() calls in RCU mode... */ if (is_uncached_acl(acl)) return -ECHILD; return posix_acl_permission(idmap, inode, acl, mask); } acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { int error = posix_acl_permission(idmap, inode, acl, mask); posix_acl_release(acl); return error; } #endif return -EAGAIN; } /* * Very quick optimistic "we know we have no ACL's" check. * * Note that this is purely for ACL_TYPE_ACCESS, and purely * for the "we have cached that there are no ACLs" case. * * If this returns true, we know there are no ACLs. But if * it returns false, we might still not have ACLs (it could * be the is_uncached_acl() case). */ static inline bool no_acl_inode(struct inode *inode) { #ifdef CONFIG_FS_POSIX_ACL return likely(!READ_ONCE(inode->i_acl)); #else return true; #endif } /** * acl_permission_check - perform basic UNIX permission checking * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * * This function performs the basic UNIX permission checking. Since this * function may retrieve POSIX acls it needs to know whether it is called from a * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ static int acl_permission_check(struct mnt_idmap *idmap, struct inode *inode, int mask) { unsigned int mode = inode->i_mode; vfsuid_t vfsuid; /* * Common cheap case: everybody has the requested * rights, and there are no ACLs to check. No need * to do any owner/group checks in that case. * * - 'mask&7' is the requested permission bit set * - multiplying by 0111 spreads them out to all of ugo * - '& ~mode' looks for missing inode permission bits * - the '!' is for "no missing permissions" * * After that, we just need to check that there are no * ACL's on the inode - do the 'IS_POSIXACL()' check last * because it will dereference the ->i_sb pointer and we * want to avoid that if at all possible. */ if (!((mask & 7) * 0111 & ~mode)) { if (no_acl_inode(inode)) return 0; if (!IS_POSIXACL(inode)) return 0; } /* Are we the owner? If so, ACL's don't matter */ vfsuid = i_uid_into_vfsuid(idmap, inode); if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) { mask &= 7; mode >>= 6; return (mask & ~mode) ? -EACCES : 0; } /* Do we have ACL's? */ if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { int error = check_acl(idmap, inode, mask); if (error != -EAGAIN) return error; } /* Only RWX matters for group/other mode bits */ mask &= 7; /* * Are the group permissions different from * the other permissions in the bits we care * about? Need to check group ownership if so. */ if (mask & (mode ^ (mode >> 3))) { vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); if (vfsgid_in_group_p(vfsgid)) mode >>= 3; } /* Bits in 'mode' clear that we require? */ return (mask & ~mode) ? 
-EACCES : 0; } /** * generic_permission - check for access rights on a Posix-like filesystem * @idmap: idmap of the mount the inode was found from * @inode: inode to check access rights for * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, * %MAY_NOT_BLOCK ...) * * Used to check for read/write/execute permissions on a file. * We use "fsuid" for this, letting us set arbitrary permissions * for filesystem access without changing the "normal" uids which * are used for other things. * * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk * request cannot be satisfied (e.g. requires blocking or too much complexity). * It would then be called again in ref-walk mode. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int generic_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int ret; /* * Do the basic permission checks. */ ret = acl_permission_check(idmap, inode, mask); if (ret != -EACCES) return ret; if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ if (!(mask & MAY_WRITE)) if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_READ_SEARCH)) return 0; if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_OVERRIDE)) return 0; return -EACCES; } /* * Searching includes executable on directories, else just read. */ mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ) if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_READ_SEARCH)) return 0; /* * Read/write DACs are always overridable. * Executable DACs are overridable when there is * at least one exec bit set. */ if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_OVERRIDE)) return 0; return -EACCES; } EXPORT_SYMBOL(generic_permission); /** * do_inode_permission - UNIX permission checking * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * * We _really_ want to just do "generic_permission()" without * even looking at the inode->i_op values. So we keep a cache * flag in inode->i_opflags, that says "this has no special * permission function, use the fast case". */ static inline int do_inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { if (likely(inode->i_op->permission)) return inode->i_op->permission(idmap, inode, mask); /* This gets set once for the inode lifetime */ spin_lock(&inode->i_lock); inode->i_opflags |= IOP_FASTPERM; spin_unlock(&inode->i_lock); } return generic_permission(idmap, inode, mask); } /** * sb_permission - Check superblock-level permissions * @sb: Superblock of inode to check permission on * @inode: Inode to check permission on * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Separate out file-system wide checks from inode-specific permission checks. * * Note: lookup_inode_permission_may_exec() does not call into this function. * If you add MAY_EXEC checks here, adjust that helper as well. */ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) { if (mask & MAY_WRITE) { umode_t mode = inode->i_mode; /* Nobody gets write access to a read-only fs.
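 * (Note: as the mode test below shows, this only applies to regular files,
 *  directories and symlinks; whether e.g. a device node is writable does not
 *  depend on the filesystem being mounted read-write.)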
*/ if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) return -EROFS; } return 0; } /** * inode_permission - Check for access rights to a given inode * @idmap: idmap of the mount the inode was found from * @inode: Inode to check permission on * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Check for read/write/execute permissions on an inode. We use fs[ug]id for * this, letting us set arbitrary permissions for filesystem access without * changing the "normal" UIDs which are used for other things. * * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. */ int inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int retval; retval = sb_permission(inode->i_sb, inode, mask); if (unlikely(retval)) return retval; if (mask & MAY_WRITE) { /* * Nobody gets write access to an immutable file. */ if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; /* * Updating mtime will likely cause i_uid and i_gid to be * written back improperly if their true value is unknown * to the vfs. */ if (unlikely(HAS_UNMAPPED_ID(idmap, inode))) return -EACCES; } retval = do_inode_permission(idmap, inode, mask); if (unlikely(retval)) return retval; retval = devcgroup_inode_permission(inode, mask); if (unlikely(retval)) return retval; return security_inode_permission(inode, mask); } EXPORT_SYMBOL(inode_permission); /* * lookup_inode_permission_may_exec - Check traversal right for given inode * * This is a special case routine for may_lookup() making assumptions specific * to path traversal. Use inode_permission() if you are doing something else. * * Work is shaved off compared to inode_permission() as follows: * - we know for a fact there is no MAY_WRITE to worry about * - it is an invariant the inode is a directory * * Since majority of real-world traversal happens on inodes which grant it for * everyone, we check it upfront and only resort to more expensive work if it * fails. * * Filesystems which have their own ->permission hook and consequently miss out * on IOP_FASTPERM can still get the optimization if they set IOP_FASTPERM_MAY_EXEC * on their directory inodes. */ static __always_inline int lookup_inode_permission_may_exec(struct mnt_idmap *idmap, struct inode *inode, int mask) { /* Lookup already checked this to return -ENOTDIR */ VFS_BUG_ON_INODE(!S_ISDIR(inode->i_mode), inode); VFS_BUG_ON((mask & ~MAY_NOT_BLOCK) != 0); mask |= MAY_EXEC; if (unlikely(!(inode->i_opflags & (IOP_FASTPERM | IOP_FASTPERM_MAY_EXEC)))) return inode_permission(idmap, inode, mask); if (unlikely(((inode->i_mode & 0111) != 0111) || !no_acl_inode(inode))) return inode_permission(idmap, inode, mask); return security_inode_permission(inode, mask); } /** * path_get - get a reference to a path * @path: path to get the reference to * * Given a path increment the reference count to the dentry and the vfsmount. */ void path_get(const struct path *path) { mntget(path->mnt); dget(path->dentry); } EXPORT_SYMBOL(path_get); /** * path_put - put a reference to a path * @path: path to put the reference to * * Given a path decrement the reference count to the dentry and the vfsmount. 
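 *
 * A minimal illustrative pairing with a lookup helper (sketch only;
 * "/some/path" is a placeholder):
 *
 *	struct path p;
 *
 *	if (!kern_path("/some/path", LOOKUP_FOLLOW, &p)) {
 *		// ... use p.mnt and p.dentry ...
 *		path_put(&p);
 *	}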
*/ void path_put(const struct path *path) { dput(path->dentry); mntput(path->mnt); } EXPORT_SYMBOL(path_put); #define EMBEDDED_LEVELS 2 struct nameidata { struct path path; struct qstr last; struct path root; struct inode *inode; /* path.dentry.d_inode */ unsigned int flags, state; unsigned seq, next_seq, m_seq, r_seq; int last_type; unsigned depth; int total_link_count; struct saved { struct path link; struct delayed_call done; const char *name; unsigned seq; } *stack, internal[EMBEDDED_LEVELS]; struct filename *name; const char *pathname; struct nameidata *saved; unsigned root_seq; int dfd; vfsuid_t dir_vfsuid; umode_t dir_mode; } __randomize_layout; #define ND_ROOT_PRESET 1 #define ND_ROOT_GRABBED 2 #define ND_JUMPED 4 static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name) { struct nameidata *old = current->nameidata; p->stack = p->internal; p->depth = 0; p->dfd = dfd; p->name = name; p->pathname = likely(name) ? name->name : ""; p->path.mnt = NULL; p->path.dentry = NULL; p->total_link_count = old ? old->total_link_count : 0; p->saved = old; current->nameidata = p; } static inline void set_nameidata(struct nameidata *p, int dfd, struct filename *name, const struct path *root) { __set_nameidata(p, dfd, name); p->state = 0; if (unlikely(root)) { p->state = ND_ROOT_PRESET; p->root = *root; } } static void restore_nameidata(void) { struct nameidata *now = current->nameidata, *old = now->saved; current->nameidata = old; if (old) old->total_link_count = now->total_link_count; if (now->stack != now->internal) kfree(now->stack); } static bool nd_alloc_stack(struct nameidata *nd) { struct saved *p; p= kmalloc_objs(struct saved, MAXSYMLINKS, nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL); if (unlikely(!p)) return false; memcpy(p, nd->internal, sizeof(nd->internal)); nd->stack = p; return true; } /** * path_connected - Verify that a dentry is below mnt.mnt_root * @mnt: The mountpoint to check. * @dentry: The dentry to check. * * Rename can sometimes move a file or directory outside of a bind * mount, path_connected allows those cases to be detected. 
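 *
 * For example, if a directory inside a bind mount is concurrently renamed to
 * a location outside of it, a ".." walk could otherwise step to a dentry
 * that is no longer beneath mnt->mnt_root; the is_subdir() check below
 * catches that case.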
*/ static bool path_connected(struct vfsmount *mnt, struct dentry *dentry) { struct super_block *sb = mnt->mnt_sb; /* Bind mounts can have disconnected paths */ if (mnt->mnt_root == sb->s_root) return true; return is_subdir(dentry, mnt->mnt_root); } static void drop_links(struct nameidata *nd) { int i = nd->depth; while (i--) { struct saved *last = nd->stack + i; do_delayed_call(&last->done); clear_delayed_call(&last->done); } } static void leave_rcu(struct nameidata *nd) { nd->flags &= ~LOOKUP_RCU; nd->seq = nd->next_seq = 0; rcu_read_unlock(); } static void terminate_walk(struct nameidata *nd) { if (unlikely(nd->depth)) drop_links(nd); if (!(nd->flags & LOOKUP_RCU)) { int i; path_put(&nd->path); for (i = 0; i < nd->depth; i++) path_put(&nd->stack[i].link); if (nd->state & ND_ROOT_GRABBED) { path_put(&nd->root); nd->state &= ~ND_ROOT_GRABBED; } } else { leave_rcu(nd); } nd->depth = 0; nd->path.mnt = NULL; nd->path.dentry = NULL; } /* path_put is needed afterwards regardless of success or failure */ static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq) { int res = __legitimize_mnt(path->mnt, mseq); if (unlikely(res)) { if (res > 0) path->mnt = NULL; path->dentry = NULL; return false; } if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) { path->dentry = NULL; return false; } return !read_seqcount_retry(&path->dentry->d_seq, seq); } static inline bool legitimize_path(struct nameidata *nd, struct path *path, unsigned seq) { return __legitimize_path(path, seq, nd->m_seq); } static bool legitimize_links(struct nameidata *nd) { int i; VFS_BUG_ON(nd->flags & LOOKUP_CACHED); for (i = 0; i < nd->depth; i++) { struct saved *last = nd->stack + i; if (unlikely(!legitimize_path(nd, &last->link, last->seq))) { drop_links(nd); nd->depth = i + 1; return false; } } return true; } static bool legitimize_root(struct nameidata *nd) { /* Nothing to do if nd->root is zero or is managed by the VFS user. */ if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET)) return true; nd->state |= ND_ROOT_GRABBED; return legitimize_path(nd, &nd->root, nd->root_seq); } /* * Path walking has 2 modes, rcu-walk and ref-walk (see * Documentation/filesystems/path-lookup.txt). In situations when we can't * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab * normal reference counts on dentries and vfsmounts to transition to ref-walk * mode. Refcounts are grabbed at the last known good point before rcu-walk * got stuck, so ref-walk may continue from there. If this is not successful * (eg. a seqcount has changed), then failure is returned and it's up to caller * to restart the path walk from the beginning in ref-walk mode. */ /** * try_to_unlazy - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * Returns: true on success, false on failure * * try_to_unlazy attempts to legitimize the current nd->path and nd->root * for ref-walk mode. * Must be called from rcu-walk context. * Nothing should touch nameidata between try_to_unlazy() failure and * terminate_walk(). 
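 *
 * The typical caller pattern (compare complete_walk() and may_lookup()
 * below) is simply:
 *
 *	if (nd->flags & LOOKUP_RCU) {
 *		if (!try_to_unlazy(nd))
 *			return -ECHILD;	// caller restarts in ref-walk mode
 *	}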
*/ static bool try_to_unlazy(struct nameidata *nd) { struct dentry *parent = nd->path.dentry; VFS_BUG_ON(!(nd->flags & LOOKUP_RCU)); if (unlikely(nd->flags & LOOKUP_CACHED)) { drop_links(nd); nd->depth = 0; goto out1; } if (unlikely(nd->depth && !legitimize_links(nd))) goto out1; if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) goto out; if (unlikely(!legitimize_root(nd))) goto out; leave_rcu(nd); BUG_ON(nd->inode != parent->d_inode); return true; out1: nd->path.mnt = NULL; nd->path.dentry = NULL; out: leave_rcu(nd); return false; } /** * try_to_unlazy_next - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * @dentry: next dentry to step into * Returns: true on success, false on failure * * Similar to try_to_unlazy(), but here we have the next dentry already * picked by rcu-walk and want to legitimize that in addition to the current * nd->path and nd->root for ref-walk mode. Must be called from rcu-walk context. * Nothing should touch nameidata between try_to_unlazy_next() failure and * terminate_walk(). */ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry) { int res; VFS_BUG_ON(!(nd->flags & LOOKUP_RCU)); if (unlikely(nd->flags & LOOKUP_CACHED)) { drop_links(nd); nd->depth = 0; goto out2; } if (unlikely(nd->depth && !legitimize_links(nd))) goto out2; res = __legitimize_mnt(nd->path.mnt, nd->m_seq); if (unlikely(res)) { if (res > 0) goto out2; goto out1; } if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref))) goto out1; /* * We need to move both the parent and the dentry from the RCU domain * to be properly refcounted. And the sequence number in the dentry * validates *both* dentry counters, since we checked the sequence * number of the parent after we got the child sequence number. So we * know the parent must still be valid if the child sequence number is * still valid. */ if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) goto out; if (read_seqcount_retry(&dentry->d_seq, nd->next_seq)) goto out_dput; /* * Sequence counts matched. Now make sure that the root is * still valid and get it if required. */ if (unlikely(!legitimize_root(nd))) goto out_dput; leave_rcu(nd); return true; out2: nd->path.mnt = NULL; out1: nd->path.dentry = NULL; out: leave_rcu(nd); return false; out_dput: leave_rcu(nd); dput(dentry); return false; } static inline int d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) return dentry->d_op->d_revalidate(dir, name, dentry, flags); else return 1; } /** * complete_walk - successful completion of path walk * @nd: pointer to nameidata * * If we had been in RCU mode, drop out of it and legitimize nd->path. * Revalidate the final result, unless we'd already done that during * the path walk or the filesystem doesn't ask for it. Return 0 on * success, -error on failure. In case of failure the caller does not * need to drop nd->path. */ static int complete_walk(struct nameidata *nd) { struct dentry *dentry = nd->path.dentry; int status; if (nd->flags & LOOKUP_RCU) { /* * We don't want to zero nd->root for scoped-lookups or * externally-managed nd->root.
*/ if (likely(!(nd->state & ND_ROOT_PRESET))) if (likely(!(nd->flags & LOOKUP_IS_SCOPED))) nd->root.mnt = NULL; nd->flags &= ~LOOKUP_CACHED; if (!try_to_unlazy(nd)) return -ECHILD; } if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { /* * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't * ever step outside the root during lookup" and should already * be guaranteed by the rest of namei, we want to avoid a namei * BUG resulting in userspace being given a path that was not * scoped within the root at some point during the lookup. * * So, do a final sanity-check to make sure that in the * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED) * we won't silently return an fd completely outside of the * requested root to userspace. * * Userspace could move the path outside the root after this * check, but as discussed elsewhere this is not a concern (the * resolved file was inside the root at some point). */ if (!path_is_under(&nd->path, &nd->root)) return -EXDEV; } if (likely(!(nd->state & ND_JUMPED))) return 0; if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE))) return 0; status = dentry->d_op->d_weak_revalidate(dentry, nd->flags); if (status > 0) return 0; if (!status) status = -ESTALE; return status; } static int set_root(struct nameidata *nd) { struct fs_struct *fs = current->fs; /* * Jumping to the real root in a scoped-lookup is a BUG in namei, but we * still have to ensure it doesn't happen because it will cause a breakout * from the dirfd. */ if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED)) return -ENOTRECOVERABLE; if (nd->flags & LOOKUP_RCU) { unsigned seq; do { seq = read_seqbegin(&fs->seq); nd->root = fs->root; nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq); } while (read_seqretry(&fs->seq, seq)); } else { get_fs_root(fs, &nd->root); nd->state |= ND_ROOT_GRABBED; } return 0; } static int nd_jump_root(struct nameidata *nd) { if (unlikely(nd->flags & LOOKUP_BENEATH)) return -EXDEV; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) { /* Absolute path arguments to path_init() are allowed. */ if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt) return -EXDEV; } if (!nd->root.mnt) { int error = set_root(nd); if (unlikely(error)) return error; } if (nd->flags & LOOKUP_RCU) { struct dentry *d; nd->path = nd->root; d = nd->path.dentry; nd->inode = d->d_inode; nd->seq = nd->root_seq; if (read_seqcount_retry(&d->d_seq, nd->seq)) return -ECHILD; } else { path_put(&nd->path); nd->path = nd->root; path_get(&nd->path); nd->inode = nd->path.dentry->d_inode; } nd->state |= ND_JUMPED; return 0; } /* * Helper to directly jump to a known parsed path from ->get_link, * caller must have taken a reference to path beforehand. */ int nd_jump_link(const struct path *path) { int error = -ELOOP; struct nameidata *nd = current->nameidata; if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS)) goto err; error = -EXDEV; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) { if (nd->path.mnt != path->mnt) goto err; } /* Not currently safe for scoped-lookups. 
*/ if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) goto err; path_put(&nd->path); nd->path = *path; nd->inode = nd->path.dentry->d_inode; nd->state |= ND_JUMPED; return 0; err: path_put(path); return error; } static inline void put_link(struct nameidata *nd) { struct saved *last = nd->stack + --nd->depth; do_delayed_call(&last->done); if (!(nd->flags & LOOKUP_RCU)) path_put(&last->link); } static int sysctl_protected_symlinks __read_mostly; static int sysctl_protected_hardlinks __read_mostly; static int sysctl_protected_fifos __read_mostly; static int sysctl_protected_regular __read_mostly; #ifdef CONFIG_SYSCTL static const struct ctl_table namei_sysctls[] = { { .procname = "protected_symlinks", .data = &sysctl_protected_symlinks, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "protected_hardlinks", .data = &sysctl_protected_hardlinks, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "protected_fifos", .data = &sysctl_protected_fifos, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "protected_regular", .data = &sysctl_protected_regular, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, }; static int __init init_fs_namei_sysctls(void) { register_sysctl_init("fs", namei_sysctls); return 0; } fs_initcall(init_fs_namei_sysctls); #endif /* CONFIG_SYSCTL */ /** * may_follow_link - Check symlink following for unsafe situations * @nd: nameidata pathwalk data * @inode: Used for idmapping. * * In the case of the sysctl_protected_symlinks sysctl being enabled, * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is * in a sticky world-writable directory. This is to protect privileged * processes from failing races against path names that may change out * from under them by way of other users creating malicious symlinks. * It will permit symlinks to be followed only when outside a sticky * world-writable directory, or when the uid of the symlink and follower * match, or when the directory owner matches the symlink's owner. * * Returns 0 if following the symlink is allowed, -ve on error. */ static inline int may_follow_link(struct nameidata *nd, const struct inode *inode) { struct mnt_idmap *idmap; vfsuid_t vfsuid; if (!sysctl_protected_symlinks) return 0; idmap = mnt_idmap(nd->path.mnt); vfsuid = i_uid_into_vfsuid(idmap, inode); /* Allowed if owner and follower match. */ if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return 0; /* Allowed if parent directory not sticky and world-writable. */ if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) return 0; /* Allowed if parent directory and link owner match. */ if (vfsuid_valid(nd->dir_vfsuid) && vfsuid_eq(nd->dir_vfsuid, vfsuid)) return 0; if (nd->flags & LOOKUP_RCU) return -ECHILD; audit_inode(nd->name, nd->stack[0].link.dentry, 0); audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link"); return -EACCES; } /** * safe_hardlink_source - Check for safe hardlink conditions * @idmap: idmap of the mount the inode was found from * @inode: the source inode to hardlink from * * Return false if at least one of the following conditions: * - inode is not a regular file * - inode is setuid * - inode is setgid and group-exec * - access failure for read and write * * Otherwise returns true. 
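 *
 * For example, when the protected_hardlinks sysctl is enabled, may_linkat()
 * below uses this to stop an unprivileged user from hardlinking another
 * user's setuid binary (or any file they cannot both read and write) into a
 * directory they control.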
*/ static bool safe_hardlink_source(struct mnt_idmap *idmap, struct inode *inode) { umode_t mode = inode->i_mode; /* Special files should not get pinned to the filesystem. */ if (!S_ISREG(mode)) return false; /* Setuid files should not get pinned to the filesystem. */ if (mode & S_ISUID) return false; /* Executable setgid files should not get pinned to the filesystem. */ if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) return false; /* Hardlinking to unreadable or unwritable sources is dangerous. */ if (inode_permission(idmap, inode, MAY_READ | MAY_WRITE)) return false; return true; } /** * may_linkat - Check permissions for creating a hardlink * @idmap: idmap of the mount the inode was found from * @link: the source to hardlink from * * Block hardlink when all of: * - sysctl_protected_hardlinks enabled * - fsuid does not match inode * - hardlink source is unsafe (see safe_hardlink_source() above) * - not CAP_FOWNER in a namespace with the inode owner uid mapped * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. * * Returns 0 if successful, -ve on error. */ int may_linkat(struct mnt_idmap *idmap, const struct path *link) { struct inode *inode = link->dentry->d_inode; /* Inode writeback is not safe when the uid or gid are invalid. */ if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; if (!sysctl_protected_hardlinks) return 0; /* Source inode owner (or CAP_FOWNER) can hardlink all they like, * otherwise, it must be a safe source. */ if (safe_hardlink_source(idmap, inode) || inode_owner_or_capable(idmap, inode)) return 0; audit_log_path_denied(AUDIT_ANOM_LINK, "linkat"); return -EPERM; } /** * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory * should be allowed, or not, on files that already * exist. * @idmap: idmap of the mount the inode was found from * @nd: nameidata pathwalk data * @inode: the inode of the file to open * * Block an O_CREAT open of a FIFO (or a regular file) when: * - sysctl_protected_fifos (or sysctl_protected_regular) is enabled * - the file already exists * - we are in a sticky directory * - we don't own the file * - the owner of the directory doesn't own the file * - the directory is world writable * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2 * the directory doesn't have to be world writable: being group writable will * be enough. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. * * Returns 0 if the open is allowed, -ve on error. 
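 *
 * For example, with the protected_regular sysctl set, an O_CREAT open
 * (without O_EXCL) of an existing regular file owned by a third user in a
 * world-writable sticky directory such as /tmp fails with -EACCES.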
*/ static int may_create_in_sticky(struct mnt_idmap *idmap, struct nameidata *nd, struct inode *const inode) { umode_t dir_mode = nd->dir_mode; vfsuid_t dir_vfsuid = nd->dir_vfsuid, i_vfsuid; if (likely(!(dir_mode & S_ISVTX))) return 0; if (S_ISREG(inode->i_mode) && !sysctl_protected_regular) return 0; if (S_ISFIFO(inode->i_mode) && !sysctl_protected_fifos) return 0; i_vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq(i_vfsuid, dir_vfsuid)) return 0; if (vfsuid_eq_kuid(i_vfsuid, current_fsuid())) return 0; if (likely(dir_mode & 0002)) { audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create"); return -EACCES; } if (dir_mode & 0020) { if (sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) { audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create_fifo"); return -EACCES; } if (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode)) { audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create_regular"); return -EACCES; } } return 0; } /* * follow_up - Find the mountpoint of path's vfsmount * * Given a path, find the mountpoint of its source file system. * Replace @path with the path of the mountpoint in the parent mount. * Up is towards /. * * Return 1 if we went up a level and 0 if we were already at the * root. */ int follow_up(struct path *path) { struct mount *mnt = real_mount(path->mnt); struct mount *parent; struct dentry *mountpoint; read_seqlock_excl(&mount_lock); parent = mnt->mnt_parent; if (parent == mnt) { read_sequnlock_excl(&mount_lock); return 0; } mntget(&parent->mnt); mountpoint = dget(mnt->mnt_mountpoint); read_sequnlock_excl(&mount_lock); dput(path->dentry); path->dentry = mountpoint; mntput(path->mnt); path->mnt = &parent->mnt; return 1; } EXPORT_SYMBOL(follow_up); static bool choose_mountpoint_rcu(struct mount *m, const struct path *root, struct path *path, unsigned *seqp) { while (mnt_has_parent(m)) { struct dentry *mountpoint = m->mnt_mountpoint; m = m->mnt_parent; if (unlikely(root->dentry == mountpoint && root->mnt == &m->mnt)) break; if (mountpoint != m->mnt.mnt_root) { path->mnt = &m->mnt; path->dentry = mountpoint; *seqp = read_seqcount_begin(&mountpoint->d_seq); return true; } } return false; } static bool choose_mountpoint(struct mount *m, const struct path *root, struct path *path) { bool found; rcu_read_lock(); while (1) { unsigned seq, mseq = read_seqbegin(&mount_lock); found = choose_mountpoint_rcu(m, root, path, &seq); if (unlikely(!found)) { if (!read_seqretry(&mount_lock, mseq)) break; } else { if (likely(__legitimize_path(path, seq, mseq))) break; rcu_read_unlock(); path_put(path); rcu_read_lock(); } } rcu_read_unlock(); return found; } /* * Perform an automount * - return -EISDIR to tell follow_managed() to stop and return the path we * were called with. */ static int follow_automount(struct path *path, int *count, unsigned lookup_flags) { struct dentry *dentry = path->dentry; /* We don't want to mount if someone's just doing a stat - * unless they're stat'ing a directory and appended a '/' to * the name. * * We do, however, want to mount if someone wants to open or * create a file of any type under the mountpoint, wants to * traverse through the mountpoint or wants to open the * mounted directory. Also, autofs may mark negative dentries * as being automount points. These will need the attentions * of the daemon to instantiate them before they can be used. 
*/ if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && dentry->d_inode) return -EISDIR; /* No need to trigger automounts if mountpoint crossing is disabled. */ if (lookup_flags & LOOKUP_NO_XDEV) return -EXDEV; if (count && (*count)++ >= MAXSYMLINKS) return -ELOOP; return finish_automount(dentry->d_op->d_automount(path), path); } /* * mount traversal - out-of-line part. One note on ->d_flags accesses - * dentries are pinned but not locked here, so negative dentry can go * positive right under us. Use of smp_load_acquire() provides a barrier * sufficient for ->d_inode and ->d_flags consistency. */ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped, int *count, unsigned lookup_flags) { struct vfsmount *mnt = path->mnt; bool need_mntput = false; int ret = 0; while (flags & DCACHE_MANAGED_DENTRY) { /* Allow the filesystem to manage the transit without i_rwsem * being held. */ if (flags & DCACHE_MANAGE_TRANSIT) { if (lookup_flags & LOOKUP_NO_XDEV) { ret = -EXDEV; break; } ret = path->dentry->d_op->d_manage(path, false); flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) break; } if (flags & DCACHE_MOUNTED) { // something's mounted on it.. struct vfsmount *mounted = lookup_mnt(path); if (mounted) { // ... in our namespace dput(path->dentry); if (need_mntput) mntput(path->mnt); path->mnt = mounted; path->dentry = dget(mounted->mnt_root); // here we know it's positive flags = path->dentry->d_flags; need_mntput = true; if (unlikely(lookup_flags & LOOKUP_NO_XDEV)) { ret = -EXDEV; break; } continue; } } if (!(flags & DCACHE_NEED_AUTOMOUNT)) break; // uncovered automount point ret = follow_automount(path, count, lookup_flags); flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) break; } if (ret == -EISDIR) ret = 0; // possible if you race with several mount --move if (need_mntput && path->mnt == mnt) mntput(path->mnt); if (!ret && unlikely(d_flags_negative(flags))) ret = -ENOENT; *jumped = need_mntput; return ret; } static inline int traverse_mounts(struct path *path, bool *jumped, int *count, unsigned lookup_flags) { unsigned flags = smp_load_acquire(&path->dentry->d_flags); /* fastpath */ if (likely(!(flags & DCACHE_MANAGED_DENTRY))) { *jumped = false; if (unlikely(d_flags_negative(flags))) return -ENOENT; return 0; } return __traverse_mounts(path, flags, jumped, count, lookup_flags); } int follow_down_one(struct path *path) { struct vfsmount *mounted; mounted = lookup_mnt(path); if (mounted) { dput(path->dentry); mntput(path->mnt); path->mnt = mounted; path->dentry = dget(mounted->mnt_root); return 1; } return 0; } EXPORT_SYMBOL(follow_down_one); /* * Follow down to the covering mount currently visible to userspace. At each * point, the filesystem owning that dentry may be queried as to whether the * caller is permitted to proceed or not. */ int follow_down(struct path *path, unsigned int flags) { struct vfsmount *mnt = path->mnt; bool jumped; int ret = traverse_mounts(path, &jumped, NULL, flags); if (path->mnt != mnt) mntput(mnt); return ret; } EXPORT_SYMBOL(follow_down); /* * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if * we meet a managed dentry that would need blocking. 
*/ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path) { struct dentry *dentry = path->dentry; unsigned int flags = dentry->d_flags; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return false; for (;;) { /* * Don't forget we might have a non-mountpoint managed dentry * that wants to block transit. */ if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) { int res = dentry->d_op->d_manage(path, true); if (res) return res == -EISDIR; flags = dentry->d_flags; } if (flags & DCACHE_MOUNTED) { struct mount *mounted = __lookup_mnt(path->mnt, dentry); if (mounted) { path->mnt = &mounted->mnt; dentry = path->dentry = mounted->mnt.mnt_root; nd->state |= ND_JUMPED; nd->next_seq = read_seqcount_begin(&dentry->d_seq); flags = dentry->d_flags; // makes sure that non-RCU pathwalk could reach // this state. if (read_seqretry(&mount_lock, nd->m_seq)) return false; continue; } if (read_seqretry(&mount_lock, nd->m_seq)) return false; } return !(flags & DCACHE_NEED_AUTOMOUNT); } } static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, struct path *path) { bool jumped; int ret; path->mnt = nd->path.mnt; path->dentry = dentry; if (nd->flags & LOOKUP_RCU) { unsigned int seq = nd->next_seq; if (likely(!d_managed(dentry))) return 0; if (likely(__follow_mount_rcu(nd, path))) return 0; // *path and nd->next_seq might've been clobbered path->mnt = nd->path.mnt; path->dentry = dentry; nd->next_seq = seq; if (unlikely(!try_to_unlazy_next(nd, dentry))) return -ECHILD; } ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); if (jumped) nd->state |= ND_JUMPED; if (unlikely(ret)) { dput(path->dentry); if (path->mnt != nd->path.mnt) mntput(path->mnt); } return ret; } /* * This looks up the name in dcache and possibly revalidates the found dentry. * NULL is returned if the dentry does not exist in the cache. */ static struct dentry *lookup_dcache(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct dentry *dentry = d_lookup(dir, name); if (dentry) { int error = d_revalidate(dir->d_inode, name, dentry, flags); if (unlikely(error <= 0)) { if (!error) d_invalidate(dentry); dput(dentry); return ERR_PTR(error); } } return dentry; } /* * Parent directory has inode locked exclusive. This is one * and only case when ->lookup() gets called on non in-lookup * dentries - as the matter of fact, this only gets called * when directory is guaranteed to have no in-lookup children * at all. * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed. * Will return -EEXIST if name is found and LOOKUP_EXCL was passed. */ static struct dentry *lookup_one_qstr_excl(const struct qstr *name, struct dentry *base, unsigned int flags) { struct dentry *dentry; struct dentry *old; struct inode *dir; dentry = lookup_dcache(name, base, flags); if (dentry) goto found; /* Don't create child dentry for a dead directory. 
*/ dir = base->d_inode; if (unlikely(IS_DEADDIR(dir))) return ERR_PTR(-ENOENT); dentry = d_alloc(base, name); if (unlikely(!dentry)) return ERR_PTR(-ENOMEM); old = dir->i_op->lookup(dir, dentry, flags); if (unlikely(old)) { dput(dentry); dentry = old; } found: if (IS_ERR(dentry)) return dentry; if (d_is_negative(dentry) && !(flags & LOOKUP_CREATE)) { dput(dentry); return ERR_PTR(-ENOENT); } if (d_is_positive(dentry) && (flags & LOOKUP_EXCL)) { dput(dentry); return ERR_PTR(-EEXIST); } return dentry; } /** * lookup_fast - do fast lockless (but racy) lookup of a dentry * @nd: current nameidata * * Do a fast, but racy lookup in the dcache for the given dentry, and * revalidate it. Returns a valid dentry pointer or NULL if one wasn't * found. On error, an ERR_PTR will be returned. * * If this function returns a valid dentry and the walk is no longer * lazy, the dentry will carry a reference that must later be put. If * RCU mode is still in force, then this is not the case and the dentry * must be legitimized before use. If this returns NULL, then the walk * will no longer be in RCU mode. */ static struct dentry *lookup_fast(struct nameidata *nd) { struct dentry *dentry, *parent = nd->path.dentry; int status = 1; /* * Rename seqlock is not required here because in the off chance * of a false negative due to a concurrent rename, the caller is * going to fall back to non-racy lookup. */ if (nd->flags & LOOKUP_RCU) { dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq); if (unlikely(!dentry)) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); return NULL; } /* * This sequence count validates that the parent had no * changes while we did the lookup of the dentry above. */ if (read_seqcount_retry(&parent->d_seq, nd->seq)) return ERR_PTR(-ECHILD); status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags); if (likely(status > 0)) return dentry; if (!try_to_unlazy_next(nd, dentry)) return ERR_PTR(-ECHILD); if (status == -ECHILD) /* we'd been told to redo it in non-rcu mode */ status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags); } else { dentry = __d_lookup(parent, &nd->last); if (unlikely(!dentry)) return NULL; status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags); } if (unlikely(status <= 0)) { if (!status) d_invalidate(dentry); dput(dentry); return ERR_PTR(status); } return dentry; } /* Fast lookup failed, do it the slow way */ static struct dentry *__lookup_slow(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct dentry *dentry, *old; struct inode *inode = dir->d_inode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); /* Don't go there if it's already dead */ if (unlikely(IS_DEADDIR(inode))) return ERR_PTR(-ENOENT); again: dentry = d_alloc_parallel(dir, name, &wq); if (IS_ERR(dentry)) return dentry; if (unlikely(!d_in_lookup(dentry))) { int error = d_revalidate(inode, name, dentry, flags); if (unlikely(error <= 0)) { if (!error) { d_invalidate(dentry); dput(dentry); goto again; } dput(dentry); dentry = ERR_PTR(error); } } else { old = inode->i_op->lookup(inode, dentry, flags); d_lookup_done(dentry); if (unlikely(old)) { dput(dentry); dentry = old; } } return dentry; } static noinline struct dentry *lookup_slow(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct inode *inode = dir->d_inode; struct dentry *res; inode_lock_shared(inode); res = __lookup_slow(name, dir, flags); inode_unlock_shared(inode); return res; } static struct dentry *lookup_slow_killable(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct inode 
*inode = dir->d_inode; struct dentry *res; if (inode_lock_shared_killable(inode)) return ERR_PTR(-EINTR); res = __lookup_slow(name, dir, flags); inode_unlock_shared(inode); return res; } static inline int may_lookup(struct mnt_idmap *idmap, struct nameidata *restrict nd) { int err, mask; mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0; err = lookup_inode_permission_may_exec(idmap, nd->inode, mask); if (likely(!err)) return 0; // If we failed, and we weren't in LOOKUP_RCU, it's final if (!(nd->flags & LOOKUP_RCU)) return err; // Drop out of RCU mode to make sure it wasn't transient if (!try_to_unlazy(nd)) return -ECHILD; // redo it all non-lazy if (err != -ECHILD) // hard error return err; return lookup_inode_permission_may_exec(idmap, nd->inode, 0); } static int reserve_stack(struct nameidata *nd, struct path *link) { if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) return -ELOOP; if (likely(nd->depth != EMBEDDED_LEVELS)) return 0; if (likely(nd->stack != nd->internal)) return 0; if (likely(nd_alloc_stack(nd))) return 0; if (nd->flags & LOOKUP_RCU) { // we need to grab link before we do unlazy. And we can't skip // unlazy even if we fail to grab the link - cleanup needs it bool grabbed_link = legitimize_path(nd, link, nd->next_seq); if (!try_to_unlazy(nd) || !grabbed_link) return -ECHILD; if (nd_alloc_stack(nd)) return 0; } return -ENOMEM; } enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; static noinline const char *pick_link(struct nameidata *nd, struct path *link, struct inode *inode, int flags) { struct saved *last; const char *res; int error; if (nd->flags & LOOKUP_RCU) { /* make sure that d_is_symlink from step_into_slowpath() matches the inode */ if (read_seqcount_retry(&link->dentry->d_seq, nd->next_seq)) return ERR_PTR(-ECHILD); } else { if (link->mnt == nd->path.mnt) mntget(link->mnt); } error = reserve_stack(nd, link); if (unlikely(error)) { if (!(nd->flags & LOOKUP_RCU)) path_put(link); return ERR_PTR(error); } last = nd->stack + nd->depth++; last->link = *link; clear_delayed_call(&last->done); last->seq = nd->next_seq; if (flags & WALK_TRAILING) { error = may_follow_link(nd, inode); if (unlikely(error)) return ERR_PTR(error); } if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) || unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW)) return ERR_PTR(-ELOOP); if (unlikely(atime_needs_update(&last->link, inode))) { if (nd->flags & LOOKUP_RCU) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); } touch_atime(&last->link); cond_resched(); } error = security_inode_follow_link(link->dentry, inode, nd->flags & LOOKUP_RCU); if (unlikely(error)) return ERR_PTR(error); res = READ_ONCE(inode->i_link); if (!res) { const char * (*get)(struct dentry *, struct inode *, struct delayed_call *); get = inode->i_op->get_link; if (nd->flags & LOOKUP_RCU) { res = get(NULL, inode, &last->done); if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd)) res = get(link->dentry, inode, &last->done); } else { res = get(link->dentry, inode, &last->done); } if (!res) goto all_done; if (IS_ERR(res)) return res; } if (*res == '/') { error = nd_jump_root(nd); if (unlikely(error)) return ERR_PTR(error); while (unlikely(*++res == '/')) ; } if (*res) return res; all_done: // pure jump put_link(nd); return NULL; } /* * Do we need to follow links? We _really_ want to be able * to do this check without having to look at inode->i_op, * so we keep a cache of "no, this doesn't need follow_link" * for the common case. * * NOTE: dentry must be what nd->next_seq had been sampled from. 
*/ static noinline const char *step_into_slowpath(struct nameidata *nd, int flags, struct dentry *dentry) { struct path path; struct inode *inode; int err; err = handle_mounts(nd, dentry, &path); if (unlikely(err < 0)) return ERR_PTR(err); inode = path.dentry->d_inode; if (likely(!d_is_symlink(path.dentry)) || ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) || (flags & WALK_NOFOLLOW)) { /* not a symlink or should not follow */ if (nd->flags & LOOKUP_RCU) { if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) return ERR_PTR(-ECHILD); if (unlikely(!inode)) return ERR_PTR(-ENOENT); } else { dput(nd->path.dentry); if (nd->path.mnt != path.mnt) mntput(nd->path.mnt); } nd->path = path; nd->inode = inode; nd->seq = nd->next_seq; return NULL; } return pick_link(nd, &path, inode, flags); } static __always_inline const char *step_into(struct nameidata *nd, int flags, struct dentry *dentry) { /* * In the common case we are in rcu-walk and traversing over a non-mounted on * directory (as opposed to e.g., a symlink). * * We can handle that and negative entries with the checks below. */ if (likely((nd->flags & LOOKUP_RCU) && !d_managed(dentry) && !d_is_symlink(dentry))) { struct inode *inode = dentry->d_inode; if (read_seqcount_retry(&dentry->d_seq, nd->next_seq)) return ERR_PTR(-ECHILD); if (unlikely(!inode)) return ERR_PTR(-ENOENT); nd->path.dentry = dentry; /* nd->path.mnt remains unchanged as no mount point was crossed */ nd->inode = inode; nd->seq = nd->next_seq; return NULL; } return step_into_slowpath(nd, flags, dentry); } static struct dentry *follow_dotdot_rcu(struct nameidata *nd) { struct dentry *parent, *old; if (path_equal(&nd->path, &nd->root)) goto in_root; if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { struct path path; unsigned seq; if (!choose_mountpoint_rcu(real_mount(nd->path.mnt), &nd->root, &path, &seq)) goto in_root; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return ERR_PTR(-ECHILD); nd->path = path; nd->inode = path.dentry->d_inode; nd->seq = seq; // makes sure that non-RCU pathwalk could reach this state if (read_seqretry(&mount_lock, nd->m_seq)) return ERR_PTR(-ECHILD); /* we know that mountpoint was pinned */ } old = nd->path.dentry; parent = old->d_parent; nd->next_seq = read_seqcount_begin(&parent->d_seq); // makes sure that non-RCU pathwalk could reach this state if (read_seqcount_retry(&old->d_seq, nd->seq)) return ERR_PTR(-ECHILD); if (unlikely(!path_connected(nd->path.mnt, parent))) return ERR_PTR(-ECHILD); return parent; in_root: if (read_seqretry(&mount_lock, nd->m_seq)) return ERR_PTR(-ECHILD); if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-ECHILD); nd->next_seq = nd->seq; return nd->path.dentry; } static struct dentry *follow_dotdot(struct nameidata *nd) { struct dentry *parent; if (path_equal(&nd->path, &nd->root)) goto in_root; if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { struct path path; if (!choose_mountpoint(real_mount(nd->path.mnt), &nd->root, &path)) goto in_root; path_put(&nd->path); nd->path = path; nd->inode = path.dentry->d_inode; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return ERR_PTR(-EXDEV); } /* rare case of legitimate dget_parent()... 
*/ parent = dget_parent(nd->path.dentry); if (unlikely(!path_connected(nd->path.mnt, parent))) { dput(parent); return ERR_PTR(-ENOENT); } return parent; in_root: if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-EXDEV); return dget(nd->path.dentry); } static const char *handle_dots(struct nameidata *nd, int type) { if (type == LAST_DOTDOT) { const char *error = NULL; struct dentry *parent; if (!nd->root.mnt) { error = ERR_PTR(set_root(nd)); if (unlikely(error)) return error; } if (nd->flags & LOOKUP_RCU) parent = follow_dotdot_rcu(nd); else parent = follow_dotdot(nd); if (IS_ERR(parent)) return ERR_CAST(parent); error = step_into(nd, WALK_NOFOLLOW, parent); if (unlikely(error)) return error; if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { /* * If there was a racing rename or mount along our * path, then we can't be sure that ".." hasn't jumped * above nd->root (and so userspace should retry or use * some fallback). */ smp_rmb(); if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)) return ERR_PTR(-EAGAIN); if (__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)) return ERR_PTR(-EAGAIN); } } return NULL; } static __always_inline const char *walk_component(struct nameidata *nd, int flags) { struct dentry *dentry; /* * "." and ".." are special - ".." especially so because it has * to be able to know about the current root directory and * parent relationships. */ if (unlikely(nd->last_type != LAST_NORM)) { if (unlikely(nd->depth) && !(flags & WALK_MORE)) put_link(nd); return handle_dots(nd, nd->last_type); } dentry = lookup_fast(nd); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (unlikely(!dentry)) { dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags); if (IS_ERR(dentry)) return ERR_CAST(dentry); } if (unlikely(nd->depth) && !(flags & WALK_MORE)) put_link(nd); return step_into(nd, flags, dentry); } /* * We can do the critical dentry name comparison and hashing * operations one word at a time, but we are limited to: * * - Architectures with fast unaligned word accesses. We could * do a "get_unaligned()" if this helps and is sufficiently * fast. * * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we * do not trap on the (extremely unlikely) case of a page * crossing operation. * * - Furthermore, we need an efficient 64-bit compile for the * 64-bit case in order to generate the "number of bytes in * the final mask". Again, that could be replaced with a * efficient population count instruction or similar. */ #ifdef CONFIG_DCACHE_WORD_ACCESS #include <asm/word-at-a-time.h> #ifdef HASH_MIX /* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */ #elif defined(CONFIG_64BIT) /* * Register pressure in the mixing function is an issue, particularly * on 32-bit x86, but almost any function requires one state value and * one temporary. Instead, use a function designed for two state values * and no temporaries. * * This function cannot create a collision in only two iterations, so * we have two iterations to achieve avalanche. In those two iterations, * we have six layers of mixing, which is enough to spread one bit's * influence out to 2^6 = 64 state bits. * * Rotate constants are scored by considering either 64 one-bit input * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the * probability of that delta causing a change to each of the 128 output * bits, using a sample of random initial states. * * The Shannon entropy of the computed probabilities is then summed * to produce a score. 
Ideally, any input change has a 50% chance of * toggling any given output bit. * * Mixing scores (in bits) for (12,45): * Input delta: 1-bit 2-bit * 1 round: 713.3 42542.6 * 2 rounds: 2753.7 140389.8 * 3 rounds: 5954.1 233458.2 * 4 rounds: 7862.6 256672.2 * Perfect: 8192 258048 * (64*128) (64*63/2 * 128) */ #define HASH_MIX(x, y, a) \ ( x ^= (a), \ y ^= x, x = rol64(x,12),\ x += y, y = rol64(y,45),\ y *= 9 ) /* * Fold two longs into one 32-bit hash value. This must be fast, but * latency isn't quite as critical, as there is a fair bit of additional * work done before the hash value is used. */ static inline unsigned int fold_hash(unsigned long x, unsigned long y) { y ^= x * GOLDEN_RATIO_64; y *= GOLDEN_RATIO_64; return y >> 32; } #else /* 32-bit case */ /* * Mixing scores (in bits) for (7,20): * Input delta: 1-bit 2-bit * 1 round: 330.3 9201.6 * 2 rounds: 1246.4 25475.4 * 3 rounds: 1907.1 31295.1 * 4 rounds: 2042.3 31718.6 * Perfect: 2048 31744 * (32*64) (32*31/2 * 64) */ #define HASH_MIX(x, y, a) \ ( x ^= (a), \ y ^= x, x = rol32(x, 7),\ x += y, y = rol32(y,20),\ y *= 9 ) static inline unsigned int fold_hash(unsigned long x, unsigned long y) { /* Use arch-optimized multiply if one exists */ return __hash_32(y ^ __hash_32(x)); } #endif /* * Return the hash of a string of known length. This is carfully * designed to match hash_name(), which is the more critical function. * In particular, we must end by hashing a final word containing 0..7 * payload bytes, to match the way that hash_name() iterates until it * finds the delimiter after the name. */ unsigned int full_name_hash(const void *salt, const char *name, unsigned int len) { unsigned long a, x = 0, y = (unsigned long)salt; for (;;) { if (!len) goto done; a = load_unaligned_zeropad(name); if (len < sizeof(unsigned long)) break; HASH_MIX(x, y, a); name += sizeof(unsigned long); len -= sizeof(unsigned long); } x ^= a & bytemask_from_count(len); done: return fold_hash(x, y); } EXPORT_SYMBOL(full_name_hash); /* Return the "hash_len" (hash and length) of a null-terminated string */ u64 hashlen_string(const void *salt, const char *name) { unsigned long a = 0, x = 0, y = (unsigned long)salt; unsigned long adata, mask, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; len = 0; goto inside; do { HASH_MIX(x, y, a); len += sizeof(unsigned long); inside: a = load_unaligned_zeropad(name+len); } while (!has_zero(a, &adata, &constants)); adata = prep_zero_mask(a, adata, &constants); mask = create_zero_mask(adata); x ^= a & zero_bytemask(mask); return hashlen_create(fold_hash(x, y), len + find_zero(mask)); } EXPORT_SYMBOL(hashlen_string); /* * hash_name - Calculate the length and hash of the path component * @nd: the path resolution state * @name: the pathname to read the component from * @lastword: if the component fits in a single word, LAST_WORD_IS_DOT, * LAST_WORD_IS_DOTDOT, or some other value depending on whether the * component is '.', '..', or something else. Otherwise, @lastword is 0. * * Returns: a pointer to the terminating '/' or NUL character in @name. */ static inline const char *hash_name(struct nameidata *nd, const char *name, unsigned long *lastword) { unsigned long a, b, x, y = (unsigned long)nd->path.dentry; unsigned long adata, bdata, mask, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; /* * The first iteration is special, because it can result in * '.' and '..' and has no mixing other than the final fold. 
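 *
 * For example, on little-endian the masked first word for the component "."
 * is 0x2e and for ".." it is 0x2e2e, matching LAST_WORD_IS_DOT and
 * LAST_WORD_IS_DOTDOT below (the big-endian definitions place the same
 * bytes at the top of the word instead).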
*/ a = load_unaligned_zeropad(name); b = a ^ REPEAT_BYTE('/'); if (has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)) { adata = prep_zero_mask(a, adata, &constants); bdata = prep_zero_mask(b, bdata, &constants); mask = create_zero_mask(adata | bdata); a &= zero_bytemask(mask); *lastword = a; len = find_zero(mask); nd->last.hash = fold_hash(a, y); nd->last.len = len; return name + len; } len = 0; x = 0; do { HASH_MIX(x, y, a); len += sizeof(unsigned long); a = load_unaligned_zeropad(name+len); b = a ^ REPEAT_BYTE('/'); } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants))); adata = prep_zero_mask(a, adata, &constants); bdata = prep_zero_mask(b, bdata, &constants); mask = create_zero_mask(adata | bdata); a &= zero_bytemask(mask); x ^= a; len += find_zero(mask); *lastword = 0; // Multi-word components cannot be DOT or DOTDOT nd->last.hash = fold_hash(x, y); nd->last.len = len; return name + len; } /* * Note that the 'last' word is always zero-masked, but * was loaded as a possibly big-endian word. */ #ifdef __BIG_ENDIAN #define LAST_WORD_IS_DOT (0x2eul << (BITS_PER_LONG-8)) #define LAST_WORD_IS_DOTDOT (0x2e2eul << (BITS_PER_LONG-16)) #endif #else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */ /* Return the hash of a string of known length */ unsigned int full_name_hash(const void *salt, const char *name, unsigned int len) { unsigned long hash = init_name_hash(salt); while (len--) hash = partial_name_hash((unsigned char)*name++, hash); return end_name_hash(hash); } EXPORT_SYMBOL(full_name_hash); /* Return the "hash_len" (hash and length) of a null-terminated string */ u64 hashlen_string(const void *salt, const char *name) { unsigned long hash = init_name_hash(salt); unsigned long len = 0, c; c = (unsigned char)*name; while (c) { len++; hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } return hashlen_create(end_name_hash(hash), len); } EXPORT_SYMBOL(hashlen_string); /* * We know there's a real path component here of at least * one character. */ static inline const char *hash_name(struct nameidata *nd, const char *name, unsigned long *lastword) { unsigned long hash = init_name_hash(nd->path.dentry); unsigned long len = 0, c, last = 0; c = (unsigned char)*name; do { last = (last << 8) + c; len++; hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } while (c && c != '/'); // This is reliable for DOT or DOTDOT, since the component // cannot contain NUL characters - top bits being zero means // we cannot have had any other pathnames. *lastword = last; nd->last.hash = end_name_hash(hash); nd->last.len = len; return name + len; } #endif #ifndef LAST_WORD_IS_DOT #define LAST_WORD_IS_DOT 0x2e #define LAST_WORD_IS_DOTDOT 0x2e2e #endif /* * Name resolution. * This is the basic name resolution function, turning a pathname into * the final dentry. We expect 'base' to be positive and a directory. * * Returns 0 and nd will have valid dentry and mnt on success. * Returns error and drops reference to input namei data on failure. */ static int link_path_walk(const char *name, struct nameidata *nd) { int depth = 0; // depth <= nd->depth int err; nd->last_type = LAST_ROOT; nd->flags |= LOOKUP_PARENT; if (IS_ERR(name)) return PTR_ERR(name); if (*name == '/') { do { name++; } while (unlikely(*name == '/')); } if (unlikely(!*name)) { nd->dir_mode = 0; // short-circuit the 'hardening' idiocy return 0; } /* At this point we know we have a real path component. 
*/ for(;;) { struct mnt_idmap *idmap; const char *link; unsigned long lastword; idmap = mnt_idmap(nd->path.mnt); err = may_lookup(idmap, nd); if (unlikely(err)) return err; nd->last.name = name; name = hash_name(nd, name, &lastword); switch(lastword) { case LAST_WORD_IS_DOTDOT: nd->last_type = LAST_DOTDOT; nd->state |= ND_JUMPED; break; case LAST_WORD_IS_DOT: nd->last_type = LAST_DOT; break; default: nd->last_type = LAST_NORM; nd->state &= ~ND_JUMPED; struct dentry *parent = nd->path.dentry; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { err = parent->d_op->d_hash(parent, &nd->last); if (err < 0) return err; } } if (!*name) goto OK; /* * If it wasn't NUL, we know it was '/'. Skip that * slash, and continue until no more slashes. */ do { name++; } while (unlikely(*name == '/')); if (unlikely(!*name)) { OK: /* pathname or trailing symlink, done */ if (likely(!depth)) { nd->dir_vfsuid = i_uid_into_vfsuid(idmap, nd->inode); nd->dir_mode = nd->inode->i_mode; nd->flags &= ~LOOKUP_PARENT; return 0; } /* last component of nested symlink */ name = nd->stack[--depth].name; link = walk_component(nd, 0); } else { /* not the last component */ link = walk_component(nd, WALK_MORE); } if (unlikely(link)) { if (IS_ERR(link)) return PTR_ERR(link); /* a symlink to follow */ nd->stack[depth++].name = name; name = link; continue; } if (unlikely(!d_can_lookup(nd->path.dentry))) { if (nd->flags & LOOKUP_RCU) { if (!try_to_unlazy(nd)) return -ECHILD; } return -ENOTDIR; } } } /* must be paired with terminate_walk() */ static const char *path_init(struct nameidata *nd, unsigned flags) { int error; const char *s = nd->pathname; /* LOOKUP_CACHED requires RCU, ask caller to retry */ if (unlikely((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)) return ERR_PTR(-EAGAIN); if (unlikely(!*s)) flags &= ~LOOKUP_RCU; if (flags & LOOKUP_RCU) rcu_read_lock(); else nd->seq = nd->next_seq = 0; nd->flags = flags; nd->state |= ND_JUMPED; nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount); nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount); smp_rmb(); if (unlikely(nd->state & ND_ROOT_PRESET)) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; if (*s && unlikely(!d_can_lookup(root))) return ERR_PTR(-ENOTDIR); nd->path = nd->root; nd->inode = inode; if (flags & LOOKUP_RCU) { nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); nd->root_seq = nd->seq; } else { path_get(&nd->path); } return s; } nd->root.mnt = NULL; /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */ if (*s == '/' && likely(!(flags & LOOKUP_IN_ROOT))) { error = nd_jump_root(nd); if (unlikely(error)) return ERR_PTR(error); return s; } /* Relative pathname -- get the starting-point it is relative to. 
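 *
 * Illustrative note (summarising the branches below): a lookup such as
 * openat(AT_FDCWD, "a/b", ...) starts from current->fs->pwd, while
 * openat(dirfd, "a/b", ...) starts from the f_path of the open file behind
 * dirfd, which must be a directory whenever the remaining pathname is
 * non-empty; absolute names were already handled above via nd_jump_root().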
*/ if (nd->dfd == AT_FDCWD) { if (flags & LOOKUP_RCU) { struct fs_struct *fs = current->fs; unsigned seq; do { seq = read_seqbegin(&fs->seq); nd->path = fs->pwd; nd->inode = nd->path.dentry->d_inode; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); } while (read_seqretry(&fs->seq, seq)); } else { get_fs_pwd(current->fs, &nd->path); nd->inode = nd->path.dentry->d_inode; } } else { /* Caller must check execute permissions on the starting path component */ CLASS(fd_raw, f)(nd->dfd); struct dentry *dentry; if (fd_empty(f)) return ERR_PTR(-EBADF); if (flags & LOOKUP_LINKAT_EMPTY) { if (fd_file(f)->f_cred != current_cred() && !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH)) return ERR_PTR(-ENOENT); } dentry = fd_file(f)->f_path.dentry; if (*s && unlikely(!d_can_lookup(dentry))) return ERR_PTR(-ENOTDIR); nd->path = fd_file(f)->f_path; if (flags & LOOKUP_RCU) { nd->inode = nd->path.dentry->d_inode; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } else { path_get(&nd->path); nd->inode = nd->path.dentry->d_inode; } } /* For scoped-lookups we need to set the root to the dirfd as well. */ if (unlikely(flags & LOOKUP_IS_SCOPED)) { nd->root = nd->path; if (flags & LOOKUP_RCU) { nd->root_seq = nd->seq; } else { path_get(&nd->root); nd->state |= ND_ROOT_GRABBED; } } return s; } static inline const char *lookup_last(struct nameidata *nd) { if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; return walk_component(nd, WALK_TRAILING); } static int handle_lookup_down(struct nameidata *nd) { if (!(nd->flags & LOOKUP_RCU)) dget(nd->path.dentry); nd->next_seq = nd->seq; return PTR_ERR(step_into(nd, WALK_NOFOLLOW, nd->path.dentry)); } /* Returns 0 and nd will be valid on success; Returns error, otherwise. */ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path) { const char *s = path_init(nd, flags); int err; if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) { err = handle_lookup_down(nd); if (unlikely(err < 0)) s = ERR_PTR(err); } while (!(err = link_path_walk(s, nd)) && (s = lookup_last(nd)) != NULL) ; if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) { err = handle_lookup_down(nd); nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please... } if (!err) err = complete_walk(nd); if (!err && nd->flags & LOOKUP_DIRECTORY) if (!d_can_lookup(nd->path.dentry)) err = -ENOTDIR; if (!err) { *path = nd->path; nd->path.mnt = NULL; nd->path.dentry = NULL; } terminate_walk(nd); return err; } int filename_lookup(int dfd, struct filename *name, unsigned flags, struct path *path, const struct path *root) { int retval; struct nameidata nd; if (IS_ERR(name)) return PTR_ERR(name); set_nameidata(&nd, dfd, name, root); retval = path_lookupat(&nd, flags | LOOKUP_RCU, path); if (unlikely(retval == -ECHILD)) retval = path_lookupat(&nd, flags, path); if (unlikely(retval == -ESTALE)) retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path); if (likely(!retval)) audit_inode(name, path->dentry, flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0); restore_nameidata(); return retval; } /* Returns 0 and nd will be valid on success; Returns error, otherwise. 
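 *
 * As with path_lookupat() above, callers normally drive this through a
 * retry ladder rather than calling it once, e.g. (sketch of the pattern
 * used by filename_lookup() and __filename_parentat()):
 *
 *	retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
 *	if (retval == -ECHILD)
 *		retval = path_parentat(&nd, flags, parent);
 *	if (retval == -ESTALE)
 *		retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);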
*/ static int path_parentat(struct nameidata *nd, unsigned flags, struct path *parent) { const char *s = path_init(nd, flags); int err = link_path_walk(s, nd); if (!err) err = complete_walk(nd); if (!err) { *parent = nd->path; nd->path.mnt = NULL; nd->path.dentry = NULL; } terminate_walk(nd); return err; }
/* Note: this does not consume "name" */ static int __filename_parentat(int dfd, struct filename *name, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root) { int retval; struct nameidata nd; if (IS_ERR(name)) return PTR_ERR(name); set_nameidata(&nd, dfd, name, root); retval = path_parentat(&nd, flags | LOOKUP_RCU, parent); if (unlikely(retval == -ECHILD)) retval = path_parentat(&nd, flags, parent); if (unlikely(retval == -ESTALE)) retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent); if (likely(!retval)) { *last = nd.last; *type = nd.last_type; audit_inode(name, parent->dentry, AUDIT_INODE_PARENT); } restore_nameidata(); return retval; }
static int filename_parentat(int dfd, struct filename *name, unsigned int flags, struct path *parent, struct qstr *last, int *type) { return __filename_parentat(dfd, name, flags, parent, last, type, NULL); }
static struct dentry *__start_dirop(struct dentry *parent, struct qstr *name, unsigned int lookup_flags, unsigned int state) { struct dentry *dentry; struct inode *dir = d_inode(parent); if (state == TASK_KILLABLE) { int ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); if (ret) return ERR_PTR(ret); } else { inode_lock_nested(dir, I_MUTEX_PARENT); } dentry = lookup_one_qstr_excl(name, parent, lookup_flags); if (IS_ERR(dentry)) inode_unlock(dir); return dentry; }
/** * start_dirop - begin a create or remove dirop, performing locking and lookup * @parent: the dentry of the parent in which the operation will occur * @name: a qstr holding the name within that parent * @lookup_flags: intent and other lookup flags. * * The lookup is performed and necessary locks are taken so that, on success, * the returned dentry can be operated on safely. * The qstr must already have the hash value calculated. * * Returns: a locked dentry, or an error. * */ struct dentry *start_dirop(struct dentry *parent, struct qstr *name, unsigned int lookup_flags) { return __start_dirop(parent, name, lookup_flags, TASK_NORMAL); }
/** * end_dirop - signal completion of a dirop * @de: the dentry which was returned by start_dirop or similar. * * If @de is an error, nothing happens. Otherwise any lock taken to * protect the dentry is dropped and the dentry itself is released (dput()).
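 *
 * Illustrative sketch of the intended pairing (hypothetical caller, error
 * handling abridged):
 *
 *	struct dentry *victim = start_dirop(parent, &name, 0);
 *
 *	if (!IS_ERR(victim)) {
 *		... operate on victim with the parent still locked ...
 *		end_dirop(victim);	// unlocks the parent and drops the ref
 *	}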
*/ void end_dirop(struct dentry *de) { if (!IS_ERR(de)) { inode_unlock(de->d_parent->d_inode); dput(de); } } EXPORT_SYMBOL(end_dirop);
/* does lookup, returns the object with parent locked */ static struct dentry *__start_removing_path(int dfd, struct filename *name, struct path *path) { struct path parent_path __free(path_put) = {}; struct dentry *d; struct qstr last; int type, error; error = filename_parentat(dfd, name, 0, &parent_path, &last, &type); if (error) return ERR_PTR(error); if (unlikely(type != LAST_NORM)) return ERR_PTR(-EINVAL); /* don't fail immediately if it's r/o, at least try to report other errors */ error = mnt_want_write(parent_path.mnt); d = start_dirop(parent_path.dentry, &last, 0); if (IS_ERR(d)) goto drop; if (error) goto fail; path->dentry = no_free_ptr(parent_path.dentry); path->mnt = no_free_ptr(parent_path.mnt); return d; fail: end_dirop(d); d = ERR_PTR(error); drop: if (!error) mnt_drop_write(parent_path.mnt); return d; }
/** * kern_path_parent - lookup path returning parent and target * @name: path name * @path: path to store parent in * * The path @name should end with a normal component, not "." or ".." or "/". * A lookup is performed and if successful the parent information * is stored in @path and the dentry is returned. * * The dentry may be negative, the parent will be positive. * * Returns: dentry or error. */ struct dentry *kern_path_parent(const char *name, struct path *path) { struct path parent_path __free(path_put) = {}; CLASS(filename_kernel, filename)(name); struct dentry *d; struct qstr last; int type, error; error = filename_parentat(AT_FDCWD, filename, 0, &parent_path, &last, &type); if (error) return ERR_PTR(error); if (unlikely(type != LAST_NORM)) return ERR_PTR(-EINVAL); d = lookup_noperm_unlocked(&last, parent_path.dentry); if (IS_ERR(d)) return d; path->dentry = no_free_ptr(parent_path.dentry); path->mnt = no_free_ptr(parent_path.mnt); return d; }
struct dentry *start_removing_path(const char *name, struct path *path) { CLASS(filename_kernel, filename)(name); return __start_removing_path(AT_FDCWD, filename, path); }
struct dentry *start_removing_user_path_at(int dfd, const char __user *name, struct path *path) { CLASS(filename, filename)(name); return __start_removing_path(dfd, filename, path); } EXPORT_SYMBOL(start_removing_user_path_at);
int kern_path(const char *name, unsigned int flags, struct path *path) { CLASS(filename_kernel, filename)(name); return filename_lookup(AT_FDCWD, filename, flags, path, NULL); } EXPORT_SYMBOL(kern_path);
/** * vfs_path_parent_lookup - lookup a parent path relative to a dentry-vfsmount pair * @filename: filename structure * @flags: lookup flags * @parent: pointer to struct path to fill * @last: last component * @type: type of the last component * @root: pointer to struct path of the base directory */ int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root) { return __filename_parentat(AT_FDCWD, filename, flags, parent, last, type, root); } EXPORT_SYMBOL(vfs_path_parent_lookup);
/** * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair * @dentry: pointer to dentry of the base directory * @mnt: pointer to vfs mount of the base directory * @name: pointer to file name * @flags: lookup flags * @path: pointer to struct path to fill */ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, const char *name, unsigned int flags, struct path *path) { CLASS(filename_kernel, filename)(name);
struct path root = {.mnt = mnt, .dentry = dentry}; /* the first argument of filename_lookup() is ignored with root */ return filename_lookup(AT_FDCWD, filename, flags, path, &root); } EXPORT_SYMBOL(vfs_path_lookup); int lookup_noperm_common(struct qstr *qname, struct dentry *base) { const char *name = qname->name; u32 len = qname->len; qname->hash = full_name_hash(base, name, len); if (!len) return -EACCES; if (name_is_dot_dotdot(name, len)) return -EACCES; while (len--) { unsigned int c = *(const unsigned char *)name++; if (c == '/' || c == '\0') return -EACCES; } /* * See if the low-level filesystem might want * to use its own hash.. */ if (base->d_flags & DCACHE_OP_HASH) { int err = base->d_op->d_hash(base, qname); if (err < 0) return err; } return 0; } static int lookup_one_common(struct mnt_idmap *idmap, struct qstr *qname, struct dentry *base) { int err; err = lookup_noperm_common(qname, base); if (err < 0) return err; return inode_permission(idmap, base->d_inode, MAY_EXEC); } /** * try_lookup_noperm - filesystem helper to lookup single pathname component * @name: qstr storing pathname component to lookup * @base: base directory to lookup from * * Look up a dentry by name in the dcache, returning NULL if it does not * currently exist or an error if there is a problem with the name. * The function does not try to create a dentry and if one * is found it doesn't try to revalidate it. * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. It does no permission checking. * * No locks need be held - only a counted reference to @base is needed. * * Returns: * - ref-counted dentry on success, or * - %NULL if name could not be found, or * - ERR_PTR(-EACCES) if name is dot or dotdot or contains a slash or nul, or * - ERR_PTR() if fs provide ->d_hash, and this returned an error. */ struct dentry *try_lookup_noperm(struct qstr *name, struct dentry *base) { int err; err = lookup_noperm_common(name, base); if (err) return ERR_PTR(err); return d_lookup(base, name); } EXPORT_SYMBOL(try_lookup_noperm); /** * lookup_noperm - filesystem helper to lookup single pathname component * @name: qstr storing pathname component to lookup * @base: base directory to lookup from * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. It does no permission checking. * * The caller must hold base->i_rwsem. */ struct dentry *lookup_noperm(struct qstr *name, struct dentry *base) { struct dentry *dentry; int err; WARN_ON_ONCE(!inode_is_locked(base->d_inode)); err = lookup_noperm_common(name, base); if (err) return ERR_PTR(err); dentry = lookup_dcache(name, base, 0); return dentry ? dentry : __lookup_slow(name, base, 0); } EXPORT_SYMBOL(lookup_noperm); /** * lookup_one - lookup single pathname component * @idmap: idmap of the mount the lookup is performed from * @name: qstr holding pathname component to lookup * @base: base directory to lookup from * * This can be used for in-kernel filesystem clients such as file servers. * * The caller must hold base->i_rwsem. */ struct dentry *lookup_one(struct mnt_idmap *idmap, struct qstr *name, struct dentry *base) { struct dentry *dentry; int err; WARN_ON_ONCE(!inode_is_locked(base->d_inode)); err = lookup_one_common(idmap, name, base); if (err) return ERR_PTR(err); dentry = lookup_dcache(name, base, 0); return dentry ? 
dentry : __lookup_slow(name, base, 0); } EXPORT_SYMBOL(lookup_one);
/** * lookup_one_unlocked - lookup single pathname component * @idmap: idmap of the mount the lookup is performed from * @name: qstr holding pathname component to lookup * @base: base directory to lookup from * * This can be used for in-kernel filesystem clients such as file servers. * * Unlike lookup_one, it should be called without the parent * i_rwsem held, and will take the i_rwsem itself if necessary. * * Returns: - A dentry, possibly negative, or * - same errors as try_lookup_noperm() or * - ERR_PTR(-ENOENT) if parent has been removed, or * - ERR_PTR(-EACCES) if parent directory is not searchable. */ struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct qstr *name, struct dentry *base) { int err; struct dentry *ret; err = lookup_one_common(idmap, name, base); if (err) return ERR_PTR(err); ret = lookup_dcache(name, base, 0); if (!ret) ret = lookup_slow(name, base, 0); return ret; } EXPORT_SYMBOL(lookup_one_unlocked);
/** * lookup_one_positive_killable - lookup single pathname component * @idmap: idmap of the mount the lookup is performed from * @name: qstr holding pathname component to lookup * @base: base directory to lookup from * * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns * known positive or ERR_PTR(). This is what most of the users want. * * Note that pinned negative with unlocked parent _can_ become positive at any * time, so callers of lookup_one_unlocked() need to be very careful; pinned * positives have ->d_inode stable, so this one avoids such problems. * * This can be used for in-kernel filesystem clients such as file servers. * * It should be called without the parent i_rwsem held, and will take * the i_rwsem itself if necessary. If a fatal signal is pending or * delivered, it will return %-EINTR if the lock is needed. * * Returns: A positive dentry, or * - ERR_PTR(-ENOENT) if the name could not be found, or * - same errors as lookup_one_unlocked() or * - ERR_PTR(-EINTR) if a fatal signal is pending. */ struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap, struct qstr *name, struct dentry *base) { int err; struct dentry *ret; err = lookup_one_common(idmap, name, base); if (err) return ERR_PTR(err); ret = lookup_dcache(name, base, 0); if (!ret) ret = lookup_slow_killable(name, base, 0); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { dput(ret); ret = ERR_PTR(-ENOENT); } return ret; } EXPORT_SYMBOL(lookup_one_positive_killable);
/** * lookup_one_positive_unlocked - lookup single pathname component * @idmap: idmap of the mount the lookup is performed from * @name: qstr holding pathname component to lookup * @base: base directory to lookup from * * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns * known positive or ERR_PTR(). This is what most of the users want. * * Note that pinned negative with unlocked parent _can_ become positive at any * time, so callers of lookup_one_unlocked() need to be very careful; pinned * positives have ->d_inode stable, so this one avoids such problems. * * This can be used for in-kernel filesystem clients such as file servers. * * The helper should be called without i_rwsem held. * * Returns: A positive dentry, or * - ERR_PTR(-ENOENT) if the name could not be found, or * - same errors as lookup_one_unlocked().
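 *
 * Illustrative sketch (hypothetical in-kernel caller such as a file server):
 *
 *	struct dentry *d = lookup_one_positive_unlocked(idmap, &name, base);
 *
 *	if (IS_ERR(d))
 *		return PTR_ERR(d);
 *	// d is pinned and known positive, so d->d_inode is stable here
 *	dput(d);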
*/ struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, struct qstr *name, struct dentry *base) { struct dentry *ret = lookup_one_unlocked(idmap, name, base); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { dput(ret); ret = ERR_PTR(-ENOENT); } return ret; } EXPORT_SYMBOL(lookup_one_positive_unlocked);
/** * lookup_noperm_unlocked - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. It does no permission checking. * * Unlike lookup_noperm(), it should be called without the parent * i_rwsem held, and will take the i_rwsem itself if necessary. * * Unlike try_lookup_noperm() it *does* revalidate the dentry if it already * existed. * * Returns: A dentry, possibly negative, or * - ERR_PTR(-ENOENT) if parent has been removed, or * - same errors as try_lookup_noperm() */ struct dentry *lookup_noperm_unlocked(struct qstr *name, struct dentry *base) { struct dentry *ret; int err; err = lookup_noperm_common(name, base); if (err) return ERR_PTR(err); ret = lookup_dcache(name, base, 0); if (!ret) ret = lookup_slow(name, base, 0); return ret; } EXPORT_SYMBOL(lookup_noperm_unlocked);
/* * Like lookup_noperm_unlocked(), except that it yields ERR_PTR(-ENOENT) * on negatives. Returns known positive or ERR_PTR(); that's what * most of the users want. Note that pinned negative with unlocked parent * _can_ become positive at any time, so callers of lookup_noperm_unlocked() * need to be very careful; pinned positives have ->d_inode stable, so * this one avoids such problems. * * Returns: A positive dentry, or * - ERR_PTR(-ENOENT) if name cannot be found or parent has been removed, or * - same errors as try_lookup_noperm() */ struct dentry *lookup_noperm_positive_unlocked(struct qstr *name, struct dentry *base) { struct dentry *ret; ret = lookup_noperm_unlocked(name, base); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { dput(ret); ret = ERR_PTR(-ENOENT); } return ret; } EXPORT_SYMBOL(lookup_noperm_positive_unlocked);
/** * start_creating - prepare to create a given name with permission checking * @idmap: idmap of the mount * @parent: directory in which to prepare to create the name * @name: the name to be created * * Locks are taken and a lookup is performed prior to creating * an object in a directory. Permission checking (MAY_EXEC) is performed * against @idmap. * * If the name already exists, a positive dentry is returned, so * behaviour is similar to O_CREAT without O_EXCL, which doesn't fail * with -EEXIST. * * Returns: a negative or positive dentry, or an error. */ struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent, struct qstr *name) { int err = lookup_one_common(idmap, name, parent); if (err) return ERR_PTR(err); return start_dirop(parent, name, LOOKUP_CREATE); } EXPORT_SYMBOL(start_creating);
/** * start_removing - prepare to remove a given name with permission checking * @idmap: idmap of the mount * @parent: directory in which to find the name * @name: the name to be removed * * Locks are taken and a lookup is performed prior to removing * an object from a directory. Permission checking (MAY_EXEC) is performed * against @idmap. * * If the name doesn't exist, an error is returned. * * end_removing() should be called when removal is complete, or aborted. * * Returns: a positive dentry, or an error.
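 *
 * Illustrative sketch (hypothetical caller; the actual removal call and its
 * arguments are abridged):
 *
 *	struct dentry *victim = start_removing(idmap, parent, &name);
 *
 *	if (IS_ERR(victim))
 *		return PTR_ERR(victim);
 *	error = vfs_unlink(...);	// or vfs_rmdir(); arguments elided
 *	end_removing(victim);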
*/ struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent, struct qstr *name) { int err = lookup_one_common(idmap, name, parent); if (err) return ERR_PTR(err); return start_dirop(parent, name, 0); } EXPORT_SYMBOL(start_removing);
/** * start_creating_killable - prepare to create a given name with permission checking * @idmap: idmap of the mount * @parent: directory in which to prepare to create the name * @name: the name to be created * * Locks are taken and a lookup is performed prior to creating * an object in a directory. Permission checking (MAY_EXEC) is performed * against @idmap. * * If the name already exists, a positive dentry is returned. * * If a signal is received or was already pending, the function aborts * with -EINTR. * * Returns: a negative or positive dentry, or an error. */ struct dentry *start_creating_killable(struct mnt_idmap *idmap, struct dentry *parent, struct qstr *name) { int err = lookup_one_common(idmap, name, parent); if (err) return ERR_PTR(err); return __start_dirop(parent, name, LOOKUP_CREATE, TASK_KILLABLE); } EXPORT_SYMBOL(start_creating_killable);
/** * start_removing_killable - prepare to remove a given name with permission checking * @idmap: idmap of the mount * @parent: directory in which to find the name * @name: the name to be removed * * Locks are taken and a lookup is performed prior to removing * an object from a directory. Permission checking (MAY_EXEC) is performed * against @idmap. * * If the name doesn't exist, an error is returned. * * end_removing() should be called when removal is complete, or aborted. * * If a signal is received or was already pending, the function aborts * with -EINTR. * * Returns: a positive dentry, or an error. */ struct dentry *start_removing_killable(struct mnt_idmap *idmap, struct dentry *parent, struct qstr *name) { int err = lookup_one_common(idmap, name, parent); if (err) return ERR_PTR(err); return __start_dirop(parent, name, 0, TASK_KILLABLE); } EXPORT_SYMBOL(start_removing_killable);
/** * start_creating_noperm - prepare to create a given name without permission checking * @parent: directory in which to prepare to create the name * @name: the name to be created * * Locks are taken and a lookup is performed prior to creating * an object in a directory. * * If the name already exists, a positive dentry is returned. * * Returns: a negative or positive dentry, or an error. */ struct dentry *start_creating_noperm(struct dentry *parent, struct qstr *name) { int err = lookup_noperm_common(name, parent); if (err) return ERR_PTR(err); return start_dirop(parent, name, LOOKUP_CREATE); } EXPORT_SYMBOL(start_creating_noperm);
/** * start_removing_noperm - prepare to remove a given name without permission checking * @parent: directory in which to find the name * @name: the name to be removed * * Locks are taken and a lookup is performed prior to removing * an object from a directory. * * If the name doesn't exist, an error is returned. * * end_removing() should be called when removal is complete, or aborted. * * Returns: a positive dentry, or an error.
*/ struct dentry *start_removing_noperm(struct dentry *parent, struct qstr *name) { int err = lookup_noperm_common(name, parent); if (err) return ERR_PTR(err); return start_dirop(parent, name, 0); } EXPORT_SYMBOL(start_removing_noperm);
/** * start_creating_dentry - prepare to create a given dentry * @parent: directory in which the dentry should be created * @child: the dentry to be created * * A lock is taken to protect the dentry against other dirops and * the validity of the dentry is checked: correct parent and still hashed. * * If the dentry is valid and negative a reference is taken and * returned. If not an error is returned. * * end_creating() should be called when creation is complete, or aborted. * * Returns: the valid dentry, or an error. */ struct dentry *start_creating_dentry(struct dentry *parent, struct dentry *child) { inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); if (unlikely(IS_DEADDIR(parent->d_inode) || child->d_parent != parent || d_unhashed(child))) { inode_unlock(parent->d_inode); return ERR_PTR(-EINVAL); } if (d_is_positive(child)) { inode_unlock(parent->d_inode); return ERR_PTR(-EEXIST); } return dget(child); } EXPORT_SYMBOL(start_creating_dentry);
/** * start_removing_dentry - prepare to remove a given dentry * @parent: directory from which dentry should be removed * @child: the dentry to be removed * * A lock is taken to protect the dentry against other dirops and * the validity of the dentry is checked: correct parent and still hashed. * * If the dentry is valid and positive, a reference is taken and * returned. If not an error is returned. * * end_removing() should be called when removal is complete, or aborted. * * Returns: the valid dentry, or an error. */ struct dentry *start_removing_dentry(struct dentry *parent, struct dentry *child) { inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); if (unlikely(IS_DEADDIR(parent->d_inode) || child->d_parent != parent || d_unhashed(child))) { inode_unlock(parent->d_inode); return ERR_PTR(-EINVAL); } if (d_is_negative(child)) { inode_unlock(parent->d_inode); return ERR_PTR(-ENOENT); } return dget(child); } EXPORT_SYMBOL(start_removing_dentry);
#ifdef CONFIG_UNIX98_PTYS int path_pts(struct path *path) { /* Find something mounted on "pts" in the same directory as * the input path. */ struct dentry *parent = dget_parent(path->dentry); struct dentry *child; struct qstr this = QSTR_INIT("pts", 3); if (unlikely(!path_connected(path->mnt, parent))) { dput(parent); return -ENOENT; } dput(path->dentry); path->dentry = parent; child = d_hash_and_lookup(parent, &this); if (IS_ERR_OR_NULL(child)) return -ENOENT; path->dentry = child; dput(parent); follow_down(path, 0); return 0; } #endif
int user_path_at(int dfd, const char __user *name, unsigned flags, struct path *path) { CLASS(filename_flags, filename)(name, flags); return filename_lookup(dfd, filename, flags, path, NULL); } EXPORT_SYMBOL(user_path_at);
int __check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode) { kuid_t fsuid = current_fsuid(); if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), fsuid)) return 0; if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, dir), fsuid)) return 0; return !capable_wrt_inode_uidgid(idmap, inode, CAP_FOWNER); } EXPORT_SYMBOL(__check_sticky);
/* * Check whether we can remove a link victim from directory dir, check * whether the type of victim is right. * 1. We can't do it if dir is read-only (done in permission()) * 2. We should have write and exec permissions on dir * 3. We can't remove anything from append-only dir * 4.
We can't do anything with immutable dir (done in permission()) * 5. If the sticky bit on dir is set we should either * a. be owner of dir, or * b. be owner of victim, or * c. have CAP_FOWNER capability * 6. If the victim is append-only or immutable we can't do antyhing with * links pointing to it. * 7. If the victim has an unknown uid or gid we can't change the inode. * 8. If we were asked to remove a directory and victim isn't one - ENOTDIR. * 9. If we were asked to remove a non-directory and victim isn't one - EISDIR. * 10. We can't remove a root or mountpoint. * 11. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). */ int may_delete_dentry(struct mnt_idmap *idmap, struct inode *dir, struct dentry *victim, bool isdir) { struct inode *inode = d_backing_inode(victim); int error; if (d_is_negative(victim)) return -ENOENT; BUG_ON(!inode); BUG_ON(victim->d_parent->d_inode != dir); /* Inode writeback is not safe when the uid or gid are invalid. */ if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) return -EPERM; if (check_sticky(idmap, dir, inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; if (isdir) { if (!d_is_dir(victim)) return -ENOTDIR; if (IS_ROOT(victim)) return -EBUSY; } else if (d_is_dir(victim)) return -EISDIR; if (IS_DEADDIR(dir)) return -ENOENT; if (victim->d_flags & DCACHE_NFSFS_RENAMED) return -EBUSY; return 0; } EXPORT_SYMBOL(may_delete_dentry); /* Check whether we can create an object with dentry child in directory * dir. * 1. We can't do it if child already exists (open has special treatment for * this case, but since we are inlined it's OK) * 2. We can't do it if dir is read-only (done in permission()) * 3. We can't do it if the fs can't represent the fsuid or fsgid. * 4. We should have write and exec permissions on dir * 5. 
We can't do it if dir is immutable (done in permission()) */ int may_create_dentry(struct mnt_idmap *idmap, struct inode *dir, struct dentry *child) { audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); if (child->d_inode) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; if (!fsuidgid_has_mapping(dir->i_sb, idmap)) return -EOVERFLOW; return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); } EXPORT_SYMBOL(may_create_dentry); // p1 != p2, both are on the same filesystem, ->s_vfs_rename_mutex is held static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2) { struct dentry *p = p1, *q = p2, *r; while ((r = p->d_parent) != p2 && r != p) p = r; if (r == p2) { // p is a child of p2 and an ancestor of p1 or p1 itself inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); inode_lock_nested(p1->d_inode, I_MUTEX_PARENT2); return p; } // p is the root of connected component that contains p1 // p2 does not occur on the path from p to p1 while ((r = q->d_parent) != p1 && r != p && r != q) q = r; if (r == p1) { // q is a child of p1 and an ancestor of p2 or p2 itself inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); return q; } else if (likely(r == p)) { // both p2 and p1 are descendents of p inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); return NULL; } else { // no common ancestor at the time we'd been called mutex_unlock(&p1->d_sb->s_vfs_rename_mutex); return ERR_PTR(-EXDEV); } } /* * p1 and p2 should be directories on the same fs. */ static struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) { if (p1 == p2) { inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); return NULL; } mutex_lock(&p1->d_sb->s_vfs_rename_mutex); return lock_two_directories(p1, p2); } /* * c1 and p2 should be on the same fs. */ static struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2) { if (READ_ONCE(c1->d_parent) == p2) { /* * hopefully won't need to touch ->s_vfs_rename_mutex at all. */ inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); /* * now that p2 is locked, nobody can move in or out of it, * so the test below is safe. */ if (likely(c1->d_parent == p2)) return NULL; /* * c1 got moved out of p2 while we'd been taking locks; * unlock and fall back to slow case. */ inode_unlock(p2->d_inode); } mutex_lock(&c1->d_sb->s_vfs_rename_mutex); /* * nobody can move out of any directories on this fs. */ if (likely(c1->d_parent != p2)) return lock_two_directories(c1->d_parent, p2); /* * c1 got moved into p2 while we were taking locks; * we need p2 locked and ->s_vfs_rename_mutex unlocked, * for consistency with lock_rename(). */ inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); mutex_unlock(&c1->d_sb->s_vfs_rename_mutex); return NULL; } static void unlock_rename(struct dentry *p1, struct dentry *p2) { inode_unlock(p1->d_inode); if (p1 != p2) { inode_unlock(p2->d_inode); mutex_unlock(&p1->d_sb->s_vfs_rename_mutex); } } /** * __start_renaming - lookup and lock names for rename * @rd: rename data containing parents and flags, and * for receiving found dentries * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL, * LOOKUP_NO_SYMLINKS etc). * @old_last: name of object in @rd.old_parent * @new_last: name of object in @rd.new_parent * * Look up two names and ensure locks are in place for * rename. * * On success the found dentries are stored in @rd.old_dentry, * @rd.new_dentry and an extra ref is taken on @rd.old_parent. 
* These references and the lock are dropped by end_renaming(). * * The passed in qstrs must have the hash calculated, and no permission * checking is performed. * * Returns: zero or an error. */ static int __start_renaming(struct renamedata *rd, int lookup_flags, struct qstr *old_last, struct qstr *new_last) { struct dentry *trap; struct dentry *d1, *d2; int target_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE; int err; if (rd->flags & RENAME_EXCHANGE) target_flags = 0; if (rd->flags & RENAME_NOREPLACE) target_flags |= LOOKUP_EXCL; trap = lock_rename(rd->old_parent, rd->new_parent); if (IS_ERR(trap)) return PTR_ERR(trap); d1 = lookup_one_qstr_excl(old_last, rd->old_parent, lookup_flags); err = PTR_ERR(d1); if (IS_ERR(d1)) goto out_unlock; d2 = lookup_one_qstr_excl(new_last, rd->new_parent, lookup_flags | target_flags); err = PTR_ERR(d2); if (IS_ERR(d2)) goto out_dput_d1; if (d1 == trap) { /* source is an ancestor of target */ err = -EINVAL; goto out_dput_d2; } if (d2 == trap) { /* target is an ancestor of source */ if (rd->flags & RENAME_EXCHANGE) err = -EINVAL; else err = -ENOTEMPTY; goto out_dput_d2; } rd->old_dentry = d1; rd->new_dentry = d2; dget(rd->old_parent); return 0; out_dput_d2: dput(d2); out_dput_d1: dput(d1); out_unlock: unlock_rename(rd->old_parent, rd->new_parent); return err; } /** * start_renaming - lookup and lock names for rename with permission checking * @rd: rename data containing parents and flags, and * for receiving found dentries * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL, * LOOKUP_NO_SYMLINKS etc). * @old_last: name of object in @rd.old_parent * @new_last: name of object in @rd.new_parent * * Look up two names and ensure locks are in place for * rename. * * On success the found dentries are stored in @rd.old_dentry, * @rd.new_dentry. Also the refcount on @rd->old_parent is increased. * These references and the lock are dropped by end_renaming(). * * The passed in qstrs need not have the hash calculated, and basic * eXecute permission checking is performed against @rd.mnt_idmap. * * Returns: zero or an error. 
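 *
 * Illustrative sketch (hypothetical caller; it is assumed here that the
 * actual rename is then performed via vfs_rename(), taking the same
 * struct renamedata):
 *
 *	err = start_renaming(&rd, 0, &old_name, &new_name);
 *	if (err)
 *		return err;
 *	err = vfs_rename(&rd);
 *	end_renaming(&rd);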
*/ int start_renaming(struct renamedata *rd, int lookup_flags, struct qstr *old_last, struct qstr *new_last) { int err; err = lookup_one_common(rd->mnt_idmap, old_last, rd->old_parent); if (err) return err; err = lookup_one_common(rd->mnt_idmap, new_last, rd->new_parent); if (err) return err; return __start_renaming(rd, lookup_flags, old_last, new_last); } EXPORT_SYMBOL(start_renaming); static int __start_renaming_dentry(struct renamedata *rd, int lookup_flags, struct dentry *old_dentry, struct qstr *new_last) { struct dentry *trap; struct dentry *d2; int target_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE; int err; if (rd->flags & RENAME_EXCHANGE) target_flags = 0; if (rd->flags & RENAME_NOREPLACE) target_flags |= LOOKUP_EXCL; /* Already have the dentry - need to be sure to lock the correct parent */ trap = lock_rename_child(old_dentry, rd->new_parent); if (IS_ERR(trap)) return PTR_ERR(trap); if (d_unhashed(old_dentry) || (rd->old_parent && rd->old_parent != old_dentry->d_parent)) { /* dentry was removed, or moved and explicit parent requested */ err = -EINVAL; goto out_unlock; } d2 = lookup_one_qstr_excl(new_last, rd->new_parent, lookup_flags | target_flags); err = PTR_ERR(d2); if (IS_ERR(d2)) goto out_unlock; if (old_dentry == trap) { /* source is an ancestor of target */ err = -EINVAL; goto out_dput_d2; } if (d2 == trap) { /* target is an ancestor of source */ if (rd->flags & RENAME_EXCHANGE) err = -EINVAL; else err = -ENOTEMPTY; goto out_dput_d2; } rd->old_dentry = dget(old_dentry); rd->new_dentry = d2; rd->old_parent = dget(old_dentry->d_parent); return 0; out_dput_d2: dput(d2); out_unlock: unlock_rename(old_dentry->d_parent, rd->new_parent); return err; } /** * start_renaming_dentry - lookup and lock name for rename with permission checking * @rd: rename data containing parents and flags, and * for receiving found dentries * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL, * LOOKUP_NO_SYMLINKS etc). * @old_dentry: dentry of name to move * @new_last: name of target in @rd.new_parent * * Look up target name and ensure locks are in place for * rename. * * On success the found dentry is stored in @rd.new_dentry and * @rd.old_parent is confirmed to be the parent of @old_dentry. If it * was originally %NULL, it is set. In either case a reference is taken * so that end_renaming() can have a stable reference to unlock. * * References and the lock can be dropped with end_renaming() * * The passed in qstr need not have the hash calculated, and basic * eXecute permission checking is performed against @rd.mnt_idmap. * * Returns: zero or an error. */ int start_renaming_dentry(struct renamedata *rd, int lookup_flags, struct dentry *old_dentry, struct qstr *new_last) { int err; err = lookup_one_common(rd->mnt_idmap, new_last, rd->new_parent); if (err) return err; return __start_renaming_dentry(rd, lookup_flags, old_dentry, new_last); } EXPORT_SYMBOL(start_renaming_dentry); /** * start_renaming_two_dentries - Lock to dentries in given parents for rename * @rd: rename data containing parent * @old_dentry: dentry of name to move * @new_dentry: dentry to move to * * Ensure locks are in place for rename and check parentage is still correct. * * On success the two dentries are stored in @rd.old_dentry and * @rd.new_dentry and @rd.old_parent and @rd.new_parent are confirmed to * be the parents of the dentries. * * References and the lock can be dropped with end_renaming() * * Returns: zero or an error. 
*/ int start_renaming_two_dentries(struct renamedata *rd, struct dentry *old_dentry, struct dentry *new_dentry) { struct dentry *trap; int err; /* Already have the dentry - need to be sure to lock the correct parent */ trap = lock_rename_child(old_dentry, rd->new_parent); if (IS_ERR(trap)) return PTR_ERR(trap); err = -EINVAL; if (d_unhashed(old_dentry) || (rd->old_parent && rd->old_parent != old_dentry->d_parent)) /* old_dentry was removed, or moved and explicit parent requested */ goto out_unlock; if (d_unhashed(new_dentry) || rd->new_parent != new_dentry->d_parent) /* new_dentry was removed or moved */ goto out_unlock; if (old_dentry == trap) /* source is an ancestor of target */ goto out_unlock; if (new_dentry == trap) { /* target is an ancestor of source */ if (rd->flags & RENAME_EXCHANGE) err = -EINVAL; else err = -ENOTEMPTY; goto out_unlock; } err = -EEXIST; if (d_is_positive(new_dentry) && (rd->flags & RENAME_NOREPLACE)) goto out_unlock; rd->old_dentry = dget(old_dentry); rd->new_dentry = dget(new_dentry); rd->old_parent = dget(old_dentry->d_parent); return 0; out_unlock: unlock_rename(old_dentry->d_parent, rd->new_parent); return err; } EXPORT_SYMBOL(start_renaming_two_dentries); void end_renaming(struct renamedata *rd) { unlock_rename(rd->old_parent, rd->new_parent); dput(rd->old_dentry); dput(rd->new_dentry); dput(rd->old_parent); } EXPORT_SYMBOL(end_renaming); /** * vfs_prepare_mode - prepare the mode to be used for a new inode * @idmap: idmap of the mount the inode was found from * @dir: parent directory of the new inode * @mode: mode of the new inode * @mask_perms: allowed permission by the vfs * @type: type of file to be created * * This helper consolidates and enforces vfs restrictions on the @mode of a new * object to be created. * * Umask stripping depends on whether the filesystem supports POSIX ACLs (see * the kernel documentation for mode_strip_umask()). Moving umask stripping * after setgid stripping allows the same ordering for both non-POSIX ACL and * POSIX ACL supporting filesystems. * * Note that it's currently valid for @type to be 0 if a directory is created. * Filesystems raise that flag individually and we need to check whether each * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a * non-zero type. * * Returns: mode to be passed to the filesystem */ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode, umode_t mask_perms, umode_t type) { mode = mode_strip_sgid(idmap, dir, mode); mode = mode_strip_umask(dir, mode); /* * Apply the vfs mandated allowed permission mask and set the type of * file to be created before we call into the filesystem. */ mode &= (mask_perms & ~S_IFMT); mode |= (type & S_IFMT); return mode; } /** * vfs_create - create new file * @idmap: idmap of the mount the inode was found from * @dentry: dentry of the child file * @mode: mode of the child file * @di: returns parent inode, if the inode is delegated. * * Create a new file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. 
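 *
 * Illustrative sketch of a typical call sequence (hypothetical caller;
 * @di-style delegation handling and error handling are abridged):
 *
 *	struct dentry *dentry = start_creating(idmap, parent, &name);
 *
 *	if (!IS_ERR(dentry)) {
 *		error = vfs_create(idmap, dentry, mode, &di);
 *		end_dirop(dentry);
 *	}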
*/ int vfs_create(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode, struct delegated_inode *di) { struct inode *dir = d_inode(dentry->d_parent); int error; error = may_create_dentry(idmap, dir, dentry); if (error) return error; if (!dir->i_op->create) return -EACCES; /* shouldn't it be ENOSYS? */ mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG); error = security_inode_create(dir, dentry, mode); if (error) return error; error = try_break_deleg(dir, di); if (error) return error; error = dir->i_op->create(idmap, dir, dentry, mode, true); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_create); int vfs_mkobj(struct dentry *dentry, umode_t mode, int (*f)(struct dentry *, umode_t, void *), void *arg) { struct inode *dir = dentry->d_parent->d_inode; int error = may_create_dentry(&nop_mnt_idmap, dir, dentry); if (error) return error; mode &= S_IALLUGO; mode |= S_IFREG; error = security_inode_create(dir, dentry, mode); if (error) return error; error = f(dentry, mode, arg); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_mkobj); bool may_open_dev(const struct path *path) { return !(path->mnt->mnt_flags & MNT_NODEV) && !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); } static int may_open(struct mnt_idmap *idmap, const struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; struct inode *inode = dentry->d_inode; int error; if (!inode) return -ENOENT; switch (inode->i_mode & S_IFMT) { case S_IFLNK: return -ELOOP; case S_IFDIR: if (acc_mode & MAY_WRITE) return -EISDIR; if (acc_mode & MAY_EXEC) return -EACCES; break; case S_IFBLK: case S_IFCHR: if (!may_open_dev(path)) return -EACCES; fallthrough; case S_IFIFO: case S_IFSOCK: if (acc_mode & MAY_EXEC) return -EACCES; flag &= ~O_TRUNC; break; case S_IFREG: if ((acc_mode & MAY_EXEC) && path_noexec(path)) return -EACCES; break; default: VFS_BUG_ON_INODE(!IS_ANON_FILE(inode), inode); } error = inode_permission(idmap, inode, MAY_OPEN | acc_mode); if (error) return error; /* * An append-only file must be opened in append mode for writing. */ if (IS_APPEND(inode)) { if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND)) return -EPERM; if (flag & O_TRUNC) return -EPERM; } /* O_NOATIME can only be set by the owner or superuser */ if (flag & O_NOATIME && !inode_owner_or_capable(idmap, inode)) return -EPERM; return 0; } static int handle_truncate(struct mnt_idmap *idmap, struct file *filp) { const struct path *path = &filp->f_path; struct inode *inode = path->dentry->d_inode; int error = get_write_access(inode); if (error) return error; error = security_file_truncate(filp); if (!error) { error = do_truncate(idmap, path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, filp); } put_write_access(inode); return error; } static inline int open_to_namei_flags(int flag) { if ((flag & O_ACCMODE) == 3) flag--; return flag; } static int may_o_create(struct mnt_idmap *idmap, const struct path *dir, struct dentry *dentry, umode_t mode) { int error = security_path_mknod(dir, dentry, mode, 0); if (error) return error; if (!fsuidgid_has_mapping(dir->dentry->d_sb, idmap)) return -EOVERFLOW; error = inode_permission(idmap, dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); if (error) return error; return security_inode_create(dir->dentry->d_inode, dentry, mode); } /* * Attempt to atomically look up, create and open a file from a negative * dentry. * * Returns 0 if successful. The file will have been created and attached to * @file by the filesystem calling finish_open(). 
 * * If the file was looked up only or didn't need creating, FMODE_OPENED won't * be set. The caller will need to perform the open themselves. @path will * have been updated to point to the new dentry. This may be negative. * * Returns an error code otherwise. */ static struct dentry *atomic_open(const struct path *path, struct dentry *dentry, struct file *file, int open_flag, umode_t mode) { struct dentry *const DENTRY_NOT_SET = (void *) -1UL; struct inode *dir = path->dentry->d_inode; int error; file->__f_path.dentry = DENTRY_NOT_SET; file->__f_path.mnt = path->mnt; error = dir->i_op->atomic_open(dir, dentry, file, open_to_namei_flags(open_flag), mode); d_lookup_done(dentry); if (!error) { if (file->f_mode & FMODE_OPENED) { if (unlikely(dentry != file->f_path.dentry)) { dput(dentry); dentry = dget(file->f_path.dentry); } } else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) { error = -EIO; } else { if (file->f_path.dentry) { dput(dentry); dentry = file->f_path.dentry; } if (unlikely(d_is_negative(dentry))) error = -ENOENT; } } if (error) { dput(dentry); dentry = ERR_PTR(error); } return dentry; }
/* * Look up and maybe create and open the last component. * * Must be called with parent locked (exclusive in O_CREAT case). * * Returns a dentry on success, that is, if * the file was successfully atomically created (if necessary) and opened, or * the file was not completely opened at this time, though lookups and * creations were performed. * These cases are distinguished by presence of FMODE_OPENED on file->f_mode. * In the latter case the dentry returned might be negative if O_CREAT * hadn't been specified. * * An ERR_PTR() is returned on failure. */ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, const struct open_flags *op, bool got_write, struct delegated_inode *delegated_inode) { struct mnt_idmap *idmap; struct dentry *dir = nd->path.dentry; struct inode *dir_inode = dir->d_inode; int open_flag = op->open_flag; struct dentry *dentry; int error, create_error = 0; umode_t mode = op->mode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); if (unlikely(IS_DEADDIR(dir_inode))) return ERR_PTR(-ENOENT); file->f_mode &= ~FMODE_CREATED; dentry = d_lookup(dir, &nd->last); for (;;) { if (!dentry) { dentry = d_alloc_parallel(dir, &nd->last, &wq); if (IS_ERR(dentry)) return dentry; } if (d_in_lookup(dentry)) break; error = d_revalidate(dir_inode, &nd->last, dentry, nd->flags); if (likely(error > 0)) break; if (error) goto out_dput; d_invalidate(dentry); dput(dentry); dentry = NULL; } if (dentry->d_inode) { /* Cached positive dentry: will open in f_op->open */ return dentry; } if (open_flag & O_CREAT) audit_inode(nd->name, dir, AUDIT_INODE_PARENT); /* * Checking write permission is tricky, because we don't know if we are * going to actually need it: O_CREAT opens should work as long as the * file exists. But checking existence breaks atomicity. The trick is * to check access and if not granted clear O_CREAT from the flags. * * Another problem is returning the "right" error value (e.g. for an * O_EXCL open we want to return EEXIST not EROFS).
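 *
 * Concretely (illustrative): an O_CREAT|O_RDONLY open of an already existing
 * file on a read-only mount must succeed, so rather than failing with -EROFS
 * up front we clear O_CREAT here and stash the would-be error in
 * create_error; it is only reported later if the file turns out not to exist.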
*/ if (unlikely(!got_write)) open_flag &= ~O_TRUNC; idmap = mnt_idmap(nd->path.mnt); if (open_flag & O_CREAT) { if (open_flag & O_EXCL) open_flag &= ~O_TRUNC; mode = vfs_prepare_mode(idmap, dir->d_inode, mode, mode, mode); if (likely(got_write)) create_error = may_o_create(idmap, &nd->path, dentry, mode); else create_error = -EROFS; } if (create_error) open_flag &= ~O_CREAT; if (dir_inode->i_op->atomic_open) { if (nd->flags & LOOKUP_DIRECTORY) open_flag |= O_DIRECTORY; dentry = atomic_open(&nd->path, dentry, file, open_flag, mode); if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT)) dentry = ERR_PTR(create_error); return dentry; } if (d_in_lookup(dentry)) { struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry, nd->flags); d_lookup_done(dentry); if (unlikely(res)) { if (IS_ERR(res)) { error = PTR_ERR(res); goto out_dput; } dput(dentry); dentry = res; } } /* Negative dentry, just create the file */ if (!dentry->d_inode && (open_flag & O_CREAT)) { /* but break the directory lease first! */ error = try_break_deleg(dir_inode, delegated_inode); if (error) goto out_dput; file->f_mode |= FMODE_CREATED; audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE); if (!dir_inode->i_op->create) { error = -EACCES; goto out_dput; } error = dir_inode->i_op->create(idmap, dir_inode, dentry, mode, open_flag & O_EXCL); if (error) goto out_dput; } if (unlikely(create_error) && !dentry->d_inode) { error = create_error; goto out_dput; } return dentry; out_dput: dput(dentry); return ERR_PTR(error); } static inline bool trailing_slashes(struct nameidata *nd) { return (bool)nd->last.name[nd->last.len]; } static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag) { struct dentry *dentry; if (open_flag & O_CREAT) { if (trailing_slashes(nd)) return ERR_PTR(-EISDIR); /* Don't bother on an O_EXCL create */ if (open_flag & O_EXCL) return NULL; } if (trailing_slashes(nd)) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; dentry = lookup_fast(nd); if (IS_ERR_OR_NULL(dentry)) return dentry; if (open_flag & O_CREAT) { /* Discard negative dentries. Need inode_lock to do the create */ if (!dentry->d_inode) { if (!(nd->flags & LOOKUP_RCU)) dput(dentry); dentry = NULL; } } return dentry; } static const char *open_last_lookups(struct nameidata *nd, struct file *file, const struct open_flags *op) { struct delegated_inode delegated_inode = { }; struct dentry *dir = nd->path.dentry; int open_flag = op->open_flag; bool got_write = false; struct dentry *dentry; const char *res; nd->flags |= op->intent; if (nd->last_type != LAST_NORM) { if (nd->depth) put_link(nd); return handle_dots(nd, nd->last_type); } /* We _can_ be in RCU mode here */ dentry = lookup_fast_for_open(nd, open_flag); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (likely(dentry)) goto finish_lookup; if (!(open_flag & O_CREAT)) { if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU)) return ERR_PTR(-ECHILD); } else { if (nd->flags & LOOKUP_RCU) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); } } retry: if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { got_write = !mnt_want_write(nd->path.mnt); /* * do _not_ fail yet - we might not need that or fail with * a different error; let lookup_open() decide; we'll be * dropping this one anyway. 
*/ } if (open_flag & O_CREAT) inode_lock(dir->d_inode); else inode_lock_shared(dir->d_inode); dentry = lookup_open(nd, file, op, got_write, &delegated_inode); if (!IS_ERR(dentry)) { if (file->f_mode & FMODE_CREATED) fsnotify_create(dir->d_inode, dentry); if (file->f_mode & FMODE_OPENED) fsnotify_open(file); } if (open_flag & O_CREAT) inode_unlock(dir->d_inode); else inode_unlock_shared(dir->d_inode); if (got_write) mnt_drop_write(nd->path.mnt); if (IS_ERR(dentry)) { if (is_delegated(&delegated_inode)) { int error = break_deleg_wait(&delegated_inode); if (!error) goto retry; return ERR_PTR(error); } return ERR_CAST(dentry); } if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) { dput(nd->path.dentry); nd->path.dentry = dentry; return NULL; } finish_lookup: if (nd->depth) put_link(nd); res = step_into(nd, WALK_TRAILING, dentry); if (unlikely(res)) nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); return res; } /* * Handle the last step of open() */ static int do_open(struct nameidata *nd, struct file *file, const struct open_flags *op) { struct mnt_idmap *idmap; int open_flag = op->open_flag; bool do_truncate; int acc_mode; int error; if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) { error = complete_walk(nd); if (error) return error; } if (!(file->f_mode & FMODE_CREATED)) audit_inode(nd->name, nd->path.dentry, 0); idmap = mnt_idmap(nd->path.mnt); if (open_flag & O_CREAT) { if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED)) return -EEXIST; if (d_is_dir(nd->path.dentry)) return -EISDIR; error = may_create_in_sticky(idmap, nd, d_backing_inode(nd->path.dentry)); if (unlikely(error)) return error; } if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) return -ENOTDIR; do_truncate = false; acc_mode = op->acc_mode; if (file->f_mode & FMODE_CREATED) { /* Don't check for write permission, don't truncate */ open_flag &= ~O_TRUNC; acc_mode = 0; } else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) { error = mnt_want_write(nd->path.mnt); if (error) return error; do_truncate = true; } error = may_open(idmap, &nd->path, acc_mode, open_flag); if (!error && !(file->f_mode & FMODE_OPENED)) error = vfs_open(&nd->path, file); if (!error) error = security_file_post_open(file, op->acc_mode); if (!error && do_truncate) error = handle_truncate(idmap, file); if (unlikely(error > 0)) { WARN_ON(1); error = -EINVAL; } if (do_truncate) mnt_drop_write(nd->path.mnt); return error; } /** * vfs_tmpfile - create tmpfile * @idmap: idmap of the mount the inode was found from * @parentpath: pointer to the path of the base directory * @file: file descriptor of the new tmpfile * @mode: mode of the new tmpfile * * Create a temporary file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. 
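 *
 * Illustrative sketch (hypothetical in-kernel caller; see also
 * kernel_tmpfile_open() below, which wraps this for kernel-internal use):
 *
 *	file = kernel_tmpfile_open(idmap, &parent_path, 0600,
 *				   O_RDWR | O_EXCL, current_cred());
 *	if (IS_ERR(file))
 *		return PTR_ERR(file);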
*/ int vfs_tmpfile(struct mnt_idmap *idmap, const struct path *parentpath, struct file *file, umode_t mode) { struct dentry *child; struct inode *dir = d_inode(parentpath->dentry); struct inode *inode; int error; int open_flag = file->f_flags; /* we want directory to be writable */ error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (!dir->i_op->tmpfile) return -EOPNOTSUPP; child = d_alloc(parentpath->dentry, &slash_name); if (unlikely(!child)) return -ENOMEM; file->__f_path.mnt = parentpath->mnt; file->__f_path.dentry = child; mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); error = dir->i_op->tmpfile(idmap, dir, file, mode); dput(child); if (file->f_mode & FMODE_OPENED) fsnotify_open(file); if (error) return error; /* Don't check for other permissions, the inode was just created */ error = may_open(idmap, &file->f_path, 0, file->f_flags); if (error) return error; inode = file_inode(file); if (!(open_flag & O_EXCL)) { spin_lock(&inode->i_lock); inode_state_set(inode, I_LINKABLE); spin_unlock(&inode->i_lock); } security_inode_post_create_tmpfile(idmap, inode); return 0; } /** * kernel_tmpfile_open - open a tmpfile for kernel internal use * @idmap: idmap of the mount the inode was found from * @parentpath: path of the base directory * @mode: mode of the new tmpfile * @open_flag: flags * @cred: credentials for open * * Create and open a temporary file. The file is not accounted in nr_files, * hence this is only for kernel internal use, and must not be installed into * file tables or such. */ struct file *kernel_tmpfile_open(struct mnt_idmap *idmap, const struct path *parentpath, umode_t mode, int open_flag, const struct cred *cred) { struct file *file; int error; file = alloc_empty_file_noaccount(open_flag, cred); if (IS_ERR(file)) return file; error = vfs_tmpfile(idmap, parentpath, file, mode); if (error) { fput(file); file = ERR_PTR(error); } return file; } EXPORT_SYMBOL(kernel_tmpfile_open); static int do_tmpfile(struct nameidata *nd, unsigned flags, const struct open_flags *op, struct file *file) { struct path path; int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path); if (unlikely(error)) return error; error = mnt_want_write(path.mnt); if (unlikely(error)) goto out; error = vfs_tmpfile(mnt_idmap(path.mnt), &path, file, op->mode); if (error) goto out2; audit_inode(nd->name, file->f_path.dentry, 0); out2: mnt_drop_write(path.mnt); out: path_put(&path); return error; } static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file) { struct path path; int error = path_lookupat(nd, flags, &path); if (!error) { audit_inode(nd->name, path.dentry, 0); error = vfs_open(&path, file); path_put(&path); } return error; } static struct file *path_openat(struct nameidata *nd, const struct open_flags *op, unsigned flags) { struct file *file; int error; file = alloc_empty_file(op->open_flag, current_cred()); if (IS_ERR(file)) return file; if (unlikely(file->f_flags & __O_TMPFILE)) { error = do_tmpfile(nd, flags, op, file); } else if (unlikely(file->f_flags & O_PATH)) { error = do_o_path(nd, flags, file); } else { const char *s = path_init(nd, flags); while (!(error = link_path_walk(s, nd)) && (s = open_last_lookups(nd, file, op)) != NULL) ; if (!error) error = do_open(nd, file, op); terminate_walk(nd); } if (likely(!error)) { if (likely(file->f_mode & FMODE_OPENED)) return file; WARN_ON(1); error = -EINVAL; } fput_close(file); if (error == -EOPENSTALE) { if (flags & LOOKUP_RCU) error = -ECHILD; else error = -ESTALE; } return 
ERR_PTR(error); } struct file *do_file_open(int dfd, struct filename *pathname, const struct open_flags *op) { struct nameidata nd; int flags = op->lookup_flags; struct file *filp; if (IS_ERR(pathname)) return ERR_CAST(pathname); set_nameidata(&nd, dfd, pathname, NULL); filp = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(filp == ERR_PTR(-ECHILD))) filp = path_openat(&nd, op, flags); if (unlikely(filp == ERR_PTR(-ESTALE))) filp = path_openat(&nd, op, flags | LOOKUP_REVAL); restore_nameidata(); return filp; } struct file *do_file_open_root(const struct path *root, const char *name, const struct open_flags *op) { struct nameidata nd; struct file *file; int flags = op->lookup_flags; if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); CLASS(filename_kernel, filename)(name); if (IS_ERR(filename)) return ERR_CAST(filename); set_nameidata(&nd, -1, filename, root); file = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(file == ERR_PTR(-ECHILD))) file = path_openat(&nd, op, flags); if (unlikely(file == ERR_PTR(-ESTALE))) file = path_openat(&nd, op, flags | LOOKUP_REVAL); restore_nameidata(); return file; } static struct dentry *filename_create(int dfd, struct filename *name, struct path *path, unsigned int lookup_flags) { struct dentry *dentry = ERR_PTR(-EEXIST); struct qstr last; bool want_dir = lookup_flags & LOOKUP_DIRECTORY; unsigned int reval_flag = lookup_flags & LOOKUP_REVAL; unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL; int type; int error; error = filename_parentat(dfd, name, reval_flag, path, &last, &type); if (error) return ERR_PTR(error); /* * Yucky last component or no last component at all? * (foo/., foo/.., /////) */ if (unlikely(type != LAST_NORM)) goto out; /* don't fail immediately if it's r/o, at least try to report other errors */ error = mnt_want_write(path->mnt); /* * Do the final lookup. Suppress 'create' if there is a trailing * '/', and a directory wasn't requested. */ if (last.name[last.len] && !want_dir) create_flags &= ~LOOKUP_CREATE; dentry = start_dirop(path->dentry, &last, reval_flag | create_flags); if (IS_ERR(dentry)) goto out_drop_write; if (unlikely(error)) goto fail; return dentry; fail: end_dirop(dentry); dentry = ERR_PTR(error); out_drop_write: if (!error) mnt_drop_write(path->mnt); out: path_put(path); return dentry; } struct dentry *start_creating_path(int dfd, const char *pathname, struct path *path, unsigned int lookup_flags) { CLASS(filename_kernel, filename)(pathname); return filename_create(dfd, filename, path, lookup_flags); } EXPORT_SYMBOL(start_creating_path); /** * end_creating_path - finish a code section started by start_creating_path() * @path: the path instantiated by start_creating_path() * @dentry: the dentry returned by start_creating_path() * * end_creating_path() will unlock any locks taken by start_creating_path() * and drop any references that were taken. It should only be called * if start_creating_path() returned a non-error. * If vfs_mkdir() was called and it returned an error, that error *should* * be passed to end_creating_path() together with the path. 
*/ void end_creating_path(const struct path *path, struct dentry *dentry) { end_creating(dentry); mnt_drop_write(path->mnt); path_put(path); } EXPORT_SYMBOL(end_creating_path); inline struct dentry *start_creating_user_path( int dfd, const char __user *pathname, struct path *path, unsigned int lookup_flags) { CLASS(filename, filename)(pathname); return filename_create(dfd, filename, path, lookup_flags); } EXPORT_SYMBOL(start_creating_user_path); /** * dentry_create - Create and open a file * @path: path to create * @flags: O\_ flags * @mode: mode bits for new file * @cred: credentials to use * * Caller must hold the parent directory's lock, and have prepared * a negative dentry, placed in @path->dentry, for the new file. * * Caller sets @path->mnt to the vfsmount of the filesystem where * the new file is to be created. The parent directory and the * negative dentry must reside on the same filesystem instance. * * On success, returns a ``struct file *``. Otherwise an ERR_PTR * is returned. */ struct file *dentry_create(struct path *path, int flags, umode_t mode, const struct cred *cred) { struct file *file __free(fput) = NULL; struct dentry *dentry = path->dentry; struct dentry *dir = dentry->d_parent; struct inode *dir_inode = d_inode(dir); struct mnt_idmap *idmap; int error, create_error; file = alloc_empty_file(flags, cred); if (IS_ERR(file)) return file; idmap = mnt_idmap(path->mnt); if (dir_inode->i_op->atomic_open) { path->dentry = dir; mode = vfs_prepare_mode(idmap, dir_inode, mode, S_IALLUGO, S_IFREG); create_error = may_o_create(idmap, path, dentry, mode); if (create_error) flags &= ~O_CREAT; dentry = atomic_open(path, dentry, file, flags, mode); error = PTR_ERR_OR_ZERO(dentry); if (unlikely(create_error) && error == -ENOENT) error = create_error; if (!error) { if (file->f_mode & FMODE_CREATED) fsnotify_create(dir->d_inode, dentry); if (file->f_mode & FMODE_OPENED) fsnotify_open(file); } path->dentry = dentry; } else { error = vfs_create(mnt_idmap(path->mnt), path->dentry, mode, NULL); if (!error) error = vfs_open(path, file); } if (unlikely(error)) return ERR_PTR(error); return no_free_ptr(file); } EXPORT_SYMBOL(dentry_create); /** * vfs_mknod - create device node or file * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child device node * @mode: mode of the child device node * @dev: device number of device to create * @delegated_inode: returns parent inode, if the inode is delegated. * * Create a device node or file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. 
*/ int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev, struct delegated_inode *delegated_inode) { bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV; int error = may_create_dentry(idmap, dir, dentry); if (error) return error; if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout && !capable(CAP_MKNOD)) return -EPERM; if (!dir->i_op->mknod) return -EPERM; mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); error = devcgroup_inode_mknod(mode, dev); if (error) return error; error = security_inode_mknod(dir, dentry, mode, dev); if (error) return error; error = try_break_deleg(dir, delegated_inode); if (error) return error; error = dir->i_op->mknod(idmap, dir, dentry, mode, dev); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_mknod); static int may_mknod(umode_t mode) { switch (mode & S_IFMT) { case S_IFREG: case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: case 0: /* zero mode translates to S_IFREG */ return 0; case S_IFDIR: return -EPERM; default: return -EINVAL; } } int filename_mknodat(int dfd, struct filename *name, umode_t mode, unsigned int dev) { struct delegated_inode di = { }; struct mnt_idmap *idmap; struct dentry *dentry; struct path path; int error; unsigned int lookup_flags = 0; error = may_mknod(mode); if (error) return error; retry: dentry = filename_create(dfd, name, &path, lookup_flags); if (IS_ERR(dentry)) return PTR_ERR(dentry); error = security_path_mknod(&path, dentry, mode_strip_umask(path.dentry->d_inode, mode), dev); if (error) goto out2; idmap = mnt_idmap(path.mnt); switch (mode & S_IFMT) { case 0: case S_IFREG: error = vfs_create(idmap, dentry, mode, &di); if (!error) security_path_post_mknod(idmap, dentry); break; case S_IFCHR: case S_IFBLK: error = vfs_mknod(idmap, path.dentry->d_inode, dentry, mode, new_decode_dev(dev), &di); break; case S_IFIFO: case S_IFSOCK: error = vfs_mknod(idmap, path.dentry->d_inode, dentry, mode, 0, &di); break; } out2: end_creating_path(&path, dentry); if (is_delegated(&di)) { error = break_deleg_wait(&di); if (!error) goto retry; } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, unsigned int, dev) { CLASS(filename, name)(filename); return filename_mknodat(dfd, name, mode, dev); } SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev) { CLASS(filename, name)(filename); return filename_mknodat(AT_FDCWD, name, mode, dev); } /** * vfs_mkdir - create directory returning correct dentry if possible * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child directory * @mode: mode of the child directory * @delegated_inode: returns parent inode, if the inode is delegated. * * Create a directory. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. * * In the event that the filesystem does not use the *@dentry but leaves it * negative or unhashes it and possibly splices a different one returning it, * the original dentry is dput() and the alternate is returned. * * In case of an error the dentry is dput() and an ERR_PTR() is returned. 
*/ struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, struct delegated_inode *delegated_inode) { int error; unsigned max_links = dir->i_sb->s_max_links; struct dentry *de; error = may_create_dentry(idmap, dir, dentry); if (error) goto err; error = -EPERM; if (!dir->i_op->mkdir) goto err; mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0); error = security_inode_mkdir(dir, dentry, mode); if (error) goto err; error = -EMLINK; if (max_links && dir->i_nlink >= max_links) goto err; error = try_break_deleg(dir, delegated_inode); if (error) goto err; de = dir->i_op->mkdir(idmap, dir, dentry, mode); error = PTR_ERR(de); if (IS_ERR(de)) goto err; if (de) { dput(dentry); dentry = de; } fsnotify_mkdir(dir, dentry); return dentry; err: end_creating(dentry); return ERR_PTR(error); } EXPORT_SYMBOL(vfs_mkdir); int filename_mkdirat(int dfd, struct filename *name, umode_t mode) { struct dentry *dentry; struct path path; int error; unsigned int lookup_flags = LOOKUP_DIRECTORY; struct delegated_inode delegated_inode = { }; retry: dentry = filename_create(dfd, name, &path, lookup_flags); if (IS_ERR(dentry)) return PTR_ERR(dentry); error = security_path_mkdir(&path, dentry, mode_strip_umask(path.dentry->d_inode, mode)); if (!error) { dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, mode, &delegated_inode); if (IS_ERR(dentry)) error = PTR_ERR(dentry); } end_creating_path(&path, dentry); if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry; } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) { CLASS(filename, name)(pathname); return filename_mkdirat(dfd, name, mode); } SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) { CLASS(filename, name)(pathname); return filename_mkdirat(AT_FDCWD, name, mode); } /** * vfs_rmdir - remove directory * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child directory * @delegated_inode: returns parent inode, if it's delegated. * * Remove a directory. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. 
*/ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct delegated_inode *delegated_inode) { int error = may_delete_dentry(idmap, dir, dentry, true); if (error) return error; if (!dir->i_op->rmdir) return -EPERM; dget(dentry); inode_lock(dentry->d_inode); error = -EBUSY; if (is_local_mountpoint(dentry) || (dentry->d_inode->i_flags & S_KERNEL_FILE)) goto out; error = security_inode_rmdir(dir, dentry); if (error) goto out; error = try_break_deleg(dir, delegated_inode); if (error) goto out; error = dir->i_op->rmdir(dir, dentry); if (error) goto out; shrink_dcache_parent(dentry); dentry->d_inode->i_flags |= S_DEAD; dont_mount(dentry); detach_mounts(dentry); out: inode_unlock(dentry->d_inode); dput(dentry); if (!error) d_delete_notify(dir, dentry); return error; } EXPORT_SYMBOL(vfs_rmdir); int filename_rmdir(int dfd, struct filename *name) { int error; struct dentry *dentry; struct path path; struct qstr last; int type; unsigned int lookup_flags = 0; struct delegated_inode delegated_inode = { }; retry: error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); if (error) return error; switch (type) { case LAST_DOTDOT: error = -ENOTEMPTY; goto exit2; case LAST_DOT: error = -EINVAL; goto exit2; case LAST_ROOT: error = -EBUSY; goto exit2; } error = mnt_want_write(path.mnt); if (error) goto exit2; dentry = start_dirop(path.dentry, &last, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto exit3; error = security_path_rmdir(&path, dentry); if (error) goto exit4; error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, &delegated_inode); exit4: end_dirop(dentry); exit3: mnt_drop_write(path.mnt); exit2: path_put(&path); if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry; } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE1(rmdir, const char __user *, pathname) { CLASS(filename, name)(pathname); return filename_rmdir(AT_FDCWD, name); } /** * vfs_unlink - unlink a filesystem object * @idmap: idmap of the mount the inode was found from * @dir: parent directory * @dentry: victim * @delegated_inode: returns victim inode, if the inode is delegated. * * The caller must hold dir->i_rwsem exclusively. * * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and * return a reference to the inode in delegated_inode. The caller * should then break the delegation on that inode and retry. Because * breaking a delegation may take a long time, the caller should drop * dir->i_rwsem before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. 
*/ int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct delegated_inode *delegated_inode) { struct inode *target = dentry->d_inode; int error = may_delete_dentry(idmap, dir, dentry, false); if (error) return error; if (!dir->i_op->unlink) return -EPERM; inode_lock(target); if (IS_SWAPFILE(target)) error = -EPERM; else if (is_local_mountpoint(dentry)) error = -EBUSY; else { error = security_inode_unlink(dir, dentry); if (!error) { error = try_break_deleg(dir, delegated_inode); if (error) goto out; error = try_break_deleg(target, delegated_inode); if (error) goto out; error = dir->i_op->unlink(dir, dentry); if (!error) { dont_mount(dentry); detach_mounts(dentry); } } } out: inode_unlock(target); /* We don't d_delete() NFS sillyrenamed files--they still exist. */ if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) { fsnotify_unlink(dir, dentry); } else if (!error) { fsnotify_link_count(target); d_delete_notify(dir, dentry); } return error; } EXPORT_SYMBOL(vfs_unlink); /* * Make sure that the actual truncation of the file will occur outside its * directory's i_rwsem. Truncate can take a long time if there is a lot of * writeout happening, and we don't want to prevent access to the directory * while waiting on the I/O. */ int filename_unlinkat(int dfd, struct filename *name) { int error; struct dentry *dentry; struct path path; struct qstr last; int type; struct inode *inode; struct delegated_inode delegated_inode = { }; unsigned int lookup_flags = 0; retry: error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); if (error) return error; error = -EISDIR; if (type != LAST_NORM) goto exit_path_put; error = mnt_want_write(path.mnt); if (error) goto exit_path_put; retry_deleg: dentry = start_dirop(path.dentry, &last, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto exit_drop_write; /* Why not before? Because we want correct error value */ if (unlikely(last.name[last.len])) { if (d_is_dir(dentry)) error = -EISDIR; else error = -ENOTDIR; end_dirop(dentry); goto exit_drop_write; } inode = dentry->d_inode; ihold(inode); error = security_path_unlink(&path, dentry); if (error) goto exit_end_dirop; error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, &delegated_inode); exit_end_dirop: end_dirop(dentry); iput(inode); /* truncate the inode here */ if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } exit_drop_write: mnt_drop_write(path.mnt); exit_path_put: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag) { if ((flag & ~AT_REMOVEDIR) != 0) return -EINVAL; CLASS(filename, name)(pathname); if (flag & AT_REMOVEDIR) return filename_rmdir(dfd, name); return filename_unlinkat(dfd, name); } SYSCALL_DEFINE1(unlink, const char __user *, pathname) { CLASS(filename, name)(pathname); return filename_unlinkat(AT_FDCWD, name); } /** * vfs_symlink - create symlink * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child symlink file * @oldname: name of the file to link to * @delegated_inode: returns victim inode, if the inode is delegated. * * Create a symlink. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. 
This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *oldname, struct delegated_inode *delegated_inode) { int error; error = may_create_dentry(idmap, dir, dentry); if (error) return error; if (!dir->i_op->symlink) return -EPERM; error = security_inode_symlink(dir, dentry, oldname); if (error) return error; error = try_break_deleg(dir, delegated_inode); if (error) return error; error = dir->i_op->symlink(idmap, dir, dentry, oldname); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_symlink); int filename_symlinkat(struct filename *from, int newdfd, struct filename *to) { int error; struct dentry *dentry; struct path path; unsigned int lookup_flags = 0; struct delegated_inode delegated_inode = { }; if (IS_ERR(from)) return PTR_ERR(from); retry: dentry = filename_create(newdfd, to, &path, lookup_flags); if (IS_ERR(dentry)) return PTR_ERR(dentry); error = security_path_symlink(&path, dentry, from->name); if (!error) error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, from->name, &delegated_inode); end_creating_path(&path, dentry); if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry; } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, int, newdfd, const char __user *, newname) { CLASS(filename, old)(oldname); CLASS(filename, new)(newname); return filename_symlinkat(old, newdfd, new); } SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname) { CLASS(filename, old)(oldname); CLASS(filename, new)(newname); return filename_symlinkat(old, AT_FDCWD, new); } /** * vfs_link - create a new link * @old_dentry: object to be linked * @idmap: idmap of the mount * @dir: new parent * @new_dentry: where to create the new link * @delegated_inode: returns inode needing a delegation break * * The caller must hold dir->i_rwsem exclusively. * * If vfs_link discovers a delegation on the to-be-linked file in need * of breaking, it will return -EWOULDBLOCK and return a reference to the * inode in delegated_inode. The caller should then break the delegation * and retry. Because breaking a delegation may take a long time, the * caller should drop the i_rwsem before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, struct inode *dir, struct dentry *new_dentry, struct delegated_inode *delegated_inode) { struct inode *inode = old_dentry->d_inode; unsigned max_links = dir->i_sb->s_max_links; int error; if (!inode) return -ENOENT; error = may_create_dentry(idmap, dir, new_dentry); if (error) return error; if (dir->i_sb != inode->i_sb) return -EXDEV; /* * A link to an append-only or immutable file cannot be created. 
*/ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return -EPERM; /* * Updating the link count will likely cause i_uid and i_gid to * be written back improperly if their true value is unknown to * the vfs. */ if (HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; if (!dir->i_op->link) return -EPERM; if (S_ISDIR(inode->i_mode)) return -EPERM; error = security_inode_link(old_dentry, dir, new_dentry); if (error) return error; inode_lock(inode); /* Make sure we don't allow creating hardlink to an unlinked file */ if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE)) error = -ENOENT; else if (max_links && inode->i_nlink >= max_links) error = -EMLINK; else { error = try_break_deleg(dir, delegated_inode); if (!error) error = try_break_deleg(inode, delegated_inode); if (!error) error = dir->i_op->link(old_dentry, dir, new_dentry); } if (!error && (inode_state_read_once(inode) & I_LINKABLE)) { spin_lock(&inode->i_lock); inode_state_clear(inode, I_LINKABLE); spin_unlock(&inode->i_lock); } inode_unlock(inode); if (!error) fsnotify_link(dir, inode, new_dentry); return error; } EXPORT_SYMBOL(vfs_link); /* * Hardlinks are often used in delicate situations. We avoid * security-related surprises by not following symlinks on the * newname. --KAB * * We don't follow them on the oldname either to be compatible * with linux 2.0, and to avoid hard-linking to directories * and other special files. --ADM */ int filename_linkat(int olddfd, struct filename *old, int newdfd, struct filename *new, int flags) { struct mnt_idmap *idmap; struct dentry *new_dentry; struct path old_path, new_path; struct delegated_inode delegated_inode = { }; int how = 0; int error; if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; /* * To use null names we require CAP_DAC_READ_SEARCH or * that the open-time creds of the dfd matches current. * This ensures that not everyone will be able to create * a hardlink using the passed file descriptor. 
*/ if (flags & AT_EMPTY_PATH) how |= LOOKUP_LINKAT_EMPTY; if (flags & AT_SYMLINK_FOLLOW) how |= LOOKUP_FOLLOW; retry: error = filename_lookup(olddfd, old, how, &old_path, NULL); if (error) return error; new_dentry = filename_create(newdfd, new, &new_path, (how & LOOKUP_REVAL)); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto out_putpath; error = -EXDEV; if (old_path.mnt != new_path.mnt) goto out_dput; idmap = mnt_idmap(new_path.mnt); error = may_linkat(idmap, &old_path); if (unlikely(error)) goto out_dput; error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, new_dentry, &delegated_inode); out_dput: end_creating_path(&new_path, new_dentry); if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) { path_put(&old_path); goto retry; } } if (retry_estale(error, how)) { path_put(&old_path); how |= LOOKUP_REVAL; goto retry; } out_putpath: path_put(&old_path); return error; } SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, int, flags) { CLASS(filename_uflags, old)(oldname, flags); CLASS(filename, new)(newname); return filename_linkat(olddfd, old, newdfd, new, flags); } SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname) { CLASS(filename, old)(oldname); CLASS(filename, new)(newname); return filename_linkat(AT_FDCWD, old, AT_FDCWD, new, 0); } /** * vfs_rename - rename a filesystem object * @rd: pointer to &struct renamedata info * * The caller must hold multiple mutexes--see lock_rename()). * * If vfs_rename discovers a delegation in need of breaking at either * the source or destination, it will return -EWOULDBLOCK and return a * reference to the inode in delegated_inode. The caller should then * break the delegation and retry. Because breaking a delegation may * take a long time, the caller should drop all locks before doing * so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * * The worst of all namespace operations - renaming directory. "Perverted" * doesn't even start to describe it. Somebody in UCB had a heck of a trip... * Problems: * * a) we can get into loop creation. * b) race potential - two innocent renames can create a loop together. * That's where 4.4BSD screws up. Current fix: serialization on * sb->s_vfs_rename_mutex. We might be more accurate, but that's another * story. * c) we may have to lock up to _four_ objects - parents and victim (if it exists), * and source (if it's a non-directory or a subdirectory that moves to * different parent). * And that - after we got ->i_rwsem on parents (until then we don't know * whether the target exists). Solution: try to be smart with locking * order for inodes. We rely on the fact that tree topology may change * only under ->s_vfs_rename_mutex _and_ that parent of the object we * move will be locked. Thus we can rank directories by the tree * (ancestors first) and rank all non-directories after them. * That works since everybody except rename does "lock parent, lookup, * lock child" and rename is under ->s_vfs_rename_mutex. * HOWEVER, it relies on the assumption that any object with ->lookup() * has no more than 1 dentry. If "hybrid" objects will ever appear, * we'd better make sure that there's no link(2) for them. 
* d) conversion from fhandle to dentry may come in the wrong moment - when * we are removing the target. Solution: we will have to grab ->i_rwsem * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on * ->i_rwsem on parents, which works but leads to some truly excessive * locking]. */ int vfs_rename(struct renamedata *rd) { int error; struct inode *old_dir = d_inode(rd->old_parent); struct inode *new_dir = d_inode(rd->new_parent); struct dentry *old_dentry = rd->old_dentry; struct dentry *new_dentry = rd->new_dentry; struct delegated_inode *delegated_inode = rd->delegated_inode; unsigned int flags = rd->flags; bool is_dir = d_is_dir(old_dentry); struct inode *source = old_dentry->d_inode; struct inode *target = new_dentry->d_inode; bool new_is_dir = false; unsigned max_links = new_dir->i_sb->s_max_links; struct name_snapshot old_name; bool lock_old_subdir, lock_new_subdir; if (source == target) return 0; error = may_delete_dentry(rd->mnt_idmap, old_dir, old_dentry, is_dir); if (error) return error; if (!target) { error = may_create_dentry(rd->mnt_idmap, new_dir, new_dentry); } else { new_is_dir = d_is_dir(new_dentry); if (!(flags & RENAME_EXCHANGE)) error = may_delete_dentry(rd->mnt_idmap, new_dir, new_dentry, is_dir); else error = may_delete_dentry(rd->mnt_idmap, new_dir, new_dentry, new_is_dir); } if (error) return error; if (!old_dir->i_op->rename) return -EPERM; /* * If we are going to change the parent - check write permissions, * we'll need to flip '..'. */ if (new_dir != old_dir) { if (is_dir) { error = inode_permission(rd->mnt_idmap, source, MAY_WRITE); if (error) return error; } if ((flags & RENAME_EXCHANGE) && new_is_dir) { error = inode_permission(rd->mnt_idmap, target, MAY_WRITE); if (error) return error; } } error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry, flags); if (error) return error; take_dentry_name_snapshot(&old_name, old_dentry); dget(new_dentry); /* * Lock children. * The source subdirectory needs to be locked on cross-directory * rename or cross-directory exchange since its parent changes. * The target subdirectory needs to be locked on cross-directory * exchange due to parent change and on any rename due to becoming * a victim. * Non-directories need locking in all cases (for NFS reasons); * they get locked after any subdirectories (in inode address order). * * NOTE: WE ONLY LOCK UNRELATED DIRECTORIES IN CROSS-DIRECTORY CASE. * NEVER, EVER DO THAT WITHOUT ->s_vfs_rename_mutex. 
*/ lock_old_subdir = new_dir != old_dir; lock_new_subdir = new_dir != old_dir || !(flags & RENAME_EXCHANGE); if (is_dir) { if (lock_old_subdir) inode_lock_nested(source, I_MUTEX_CHILD); if (target && (!new_is_dir || lock_new_subdir)) inode_lock(target); } else if (new_is_dir) { if (lock_new_subdir) inode_lock_nested(target, I_MUTEX_CHILD); inode_lock(source); } else { lock_two_nondirectories(source, target); } error = -EPERM; if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target))) goto out; error = -EBUSY; if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry)) goto out; if (max_links && new_dir != old_dir) { error = -EMLINK; if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links) goto out; if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir && old_dir->i_nlink >= max_links) goto out; } error = try_break_deleg(old_dir, delegated_inode); if (error) goto out; if (new_dir != old_dir) { error = try_break_deleg(new_dir, delegated_inode); if (error) goto out; } if (!is_dir) { error = try_break_deleg(source, delegated_inode); if (error) goto out; } if (target && !new_is_dir) { error = try_break_deleg(target, delegated_inode); if (error) goto out; } error = old_dir->i_op->rename(rd->mnt_idmap, old_dir, old_dentry, new_dir, new_dentry, flags); if (error) goto out; if (!(flags & RENAME_EXCHANGE) && target) { if (is_dir) { shrink_dcache_parent(new_dentry); target->i_flags |= S_DEAD; } dont_mount(new_dentry); detach_mounts(new_dentry); } if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) { if (!(flags & RENAME_EXCHANGE)) d_move(old_dentry, new_dentry); else d_exchange(old_dentry, new_dentry); } out: if (!is_dir || lock_old_subdir) inode_unlock(source); if (target && (!new_is_dir || lock_new_subdir)) inode_unlock(target); dput(new_dentry); if (!error) { fsnotify_move(old_dir, new_dir, &old_name.name, is_dir, !(flags & RENAME_EXCHANGE) ? 
target : NULL, old_dentry); if (flags & RENAME_EXCHANGE) { fsnotify_move(new_dir, old_dir, &old_dentry->d_name, new_is_dir, NULL, new_dentry); } } release_dentry_name_snapshot(&old_name); return error; } EXPORT_SYMBOL(vfs_rename); int filename_renameat2(int olddfd, struct filename *from, int newdfd, struct filename *to, unsigned int flags) { struct renamedata rd; struct path old_path, new_path; struct qstr old_last, new_last; int old_type, new_type; struct delegated_inode delegated_inode = { }; unsigned int lookup_flags = 0; bool should_retry = false; int error; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && (flags & RENAME_EXCHANGE)) return -EINVAL; retry: error = filename_parentat(olddfd, from, lookup_flags, &old_path, &old_last, &old_type); if (error) return error; error = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last, &new_type); if (error) goto exit1; error = -EXDEV; if (old_path.mnt != new_path.mnt) goto exit2; error = -EBUSY; if (old_type != LAST_NORM) goto exit2; if (flags & RENAME_NOREPLACE) error = -EEXIST; if (new_type != LAST_NORM) goto exit2; error = mnt_want_write(old_path.mnt); if (error) goto exit2; retry_deleg: rd.old_parent = old_path.dentry; rd.mnt_idmap = mnt_idmap(old_path.mnt); rd.new_parent = new_path.dentry; rd.delegated_inode = &delegated_inode; rd.flags = flags; error = __start_renaming(&rd, lookup_flags, &old_last, &new_last); if (error) goto exit_lock_rename; if (flags & RENAME_EXCHANGE) { if (!d_is_dir(rd.new_dentry)) { error = -ENOTDIR; if (new_last.name[new_last.len]) goto exit_unlock; } } /* unless the source is a directory trailing slashes give -ENOTDIR */ if (!d_is_dir(rd.old_dentry)) { error = -ENOTDIR; if (old_last.name[old_last.len]) goto exit_unlock; if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len]) goto exit_unlock; } error = security_path_rename(&old_path, rd.old_dentry, &new_path, rd.new_dentry, flags); if (error) goto exit_unlock; error = vfs_rename(&rd); exit_unlock: end_renaming(&rd); exit_lock_rename: if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } mnt_drop_write(old_path.mnt); exit2: if (retry_estale(error, lookup_flags)) should_retry = true; path_put(&new_path); exit1: path_put(&old_path); if (should_retry) { should_retry = false; lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, unsigned int, flags) { CLASS(filename, old)(oldname); CLASS(filename, new)(newname); return filename_renameat2(olddfd, old, newdfd, new, flags); } SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname) { CLASS(filename, old)(oldname); CLASS(filename, new)(newname); return filename_renameat2(olddfd, old, newdfd, new, 0); } SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) { CLASS(filename, old)(oldname); CLASS(filename, new)(newname); return filename_renameat2(AT_FDCWD, old, AT_FDCWD, new, 0); } int readlink_copy(char __user *buffer, int buflen, const char *link, int linklen) { int copylen; copylen = linklen; if (unlikely(copylen > (unsigned) buflen)) copylen = buflen; if (copy_to_user(buffer, link, copylen)) copylen = -EFAULT; return copylen; } /** * vfs_readlink - copy symlink body into userspace buffer * @dentry: dentry on which to get symbolic link * @buffer: user 
memory pointer * @buflen: size of buffer * * Does not touch atime. That's up to the caller if necessary * * Does not call security hook. */ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct inode *inode = d_inode(dentry); DEFINE_DELAYED_CALL(done); const char *link; int res; if (inode->i_opflags & IOP_CACHED_LINK) return readlink_copy(buffer, buflen, inode->i_link, inode->i_linklen); if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) { if (unlikely(inode->i_op->readlink)) return inode->i_op->readlink(dentry, buffer, buflen); if (!d_is_symlink(dentry)) return -EINVAL; spin_lock(&inode->i_lock); inode->i_opflags |= IOP_DEFAULT_READLINK; spin_unlock(&inode->i_lock); } link = READ_ONCE(inode->i_link); if (!link) { link = inode->i_op->get_link(dentry, inode, &done); if (IS_ERR(link)) return PTR_ERR(link); } res = readlink_copy(buffer, buflen, link, strlen(link)); do_delayed_call(&done); return res; } EXPORT_SYMBOL(vfs_readlink); /** * vfs_get_link - get symlink body * @dentry: dentry on which to get symbolic link * @done: caller needs to free returned data with this * * Calls security hook and i_op->get_link() on the supplied inode. * * It does not touch atime. That's up to the caller if necessary. * * Does not work on "special" symlinks like /proc/$$/fd/N */ const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done) { const char *res = ERR_PTR(-EINVAL); struct inode *inode = d_inode(dentry); if (d_is_symlink(dentry)) { res = ERR_PTR(security_inode_readlink(dentry)); if (!res) res = inode->i_op->get_link(dentry, inode, done); } return res; } EXPORT_SYMBOL(vfs_get_link); /* get the link contents into pagecache */ static char *__page_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) { struct folio *folio; struct address_space *mapping = inode->i_mapping; if (!dentry) { folio = filemap_get_folio(mapping, 0); if (IS_ERR(folio)) return ERR_PTR(-ECHILD); if (!folio_test_uptodate(folio)) { folio_put(folio); return ERR_PTR(-ECHILD); } } else { folio = read_mapping_folio(mapping, 0, NULL); if (IS_ERR(folio)) return ERR_CAST(folio); } set_delayed_call(callback, page_put_link, folio); BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM); return folio_address(folio); } const char *page_get_link_raw(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) { return __page_get_link(dentry, inode, callback); } EXPORT_SYMBOL_GPL(page_get_link_raw); /** * page_get_link() - An implementation of the get_link inode_operation. * @dentry: The directory entry which is the symlink. * @inode: The inode for the symlink. * @callback: Used to drop the reference to the symlink. * * Filesystems which store their symlinks in the page cache should use * this to implement the get_link() member of their inode_operations. * * Return: A pointer to the NUL-terminated symlink. */ const char *page_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) { char *kaddr = __page_get_link(dentry, inode, callback); if (!IS_ERR(kaddr)) nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1); return kaddr; } EXPORT_SYMBOL(page_get_link); /** * page_put_link() - Drop the reference to the symlink. * @arg: The folio which contains the symlink. * * This is used internally by page_get_link(). It is exported for use * by filesystems which need to implement a variant of page_get_link() * themselves. Despite the apparent symmetry, filesystems which use * page_get_link() do not need to call page_put_link(). 
* * The argument, while it has a void pointer type, must be a pointer to * the folio which was retrieved from the page cache. The delayed_call * infrastructure is used to drop the reference count once the caller * is done with the symlink. */ void page_put_link(void *arg) { folio_put(arg); } EXPORT_SYMBOL(page_put_link); int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) { const char *link; int res; DEFINE_DELAYED_CALL(done); link = page_get_link(dentry, d_inode(dentry), &done); res = PTR_ERR(link); if (!IS_ERR(link)) res = readlink_copy(buffer, buflen, link, strlen(link)); do_delayed_call(&done); return res; } EXPORT_SYMBOL(page_readlink); int page_symlink(struct inode *inode, const char *symname, int len) { struct address_space *mapping = inode->i_mapping; const struct address_space_operations *aops = mapping->a_ops; bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS); struct folio *folio; void *fsdata = NULL; int err; unsigned int flags; retry: if (nofs) flags = memalloc_nofs_save(); err = aops->write_begin(NULL, mapping, 0, len-1, &folio, &fsdata); if (nofs) memalloc_nofs_restore(flags); if (err) goto fail; memcpy(folio_address(folio), symname, len - 1); err = aops->write_end(NULL, mapping, 0, len - 1, len - 1, folio, fsdata); if (err < 0) goto fail; if (err < len-1) goto retry; mark_inode_dirty(inode); return 0; fail: return err; } EXPORT_SYMBOL(page_symlink); const struct inode_operations page_symlink_inode_operations = { .get_link = page_get_link, }; EXPORT_SYMBOL(page_symlink_inode_operations);
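/*
 * Editor's illustrative sketch (not part of the original file): a minimal
 * example of how an in-kernel caller might use the creation-path helpers
 * exported above, following the same pattern as filename_mkdirat(). The
 * function name example_kernel_mkdir() is hypothetical, and the sketch
 * ignores delegation retries for brevity; real callers should loop on
 * break_deleg_wait() as the syscall paths above do.
 */
static int example_kernel_mkdir(const char *pathname, umode_t mode)
{
	struct delegated_inode di = { };
	struct path path;
	struct dentry *dentry;
	int error = 0;

	/* Looks up the parent, takes write access and the parent lock. */
	dentry = start_creating_path(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	/* vfs_mkdir() consumes the dentry and may return a different one. */
	dentry = vfs_mkdir(mnt_idmap(path.mnt), d_inode(path.dentry),
			   dentry, mode, &di);
	if (IS_ERR(dentry))
		error = PTR_ERR(dentry);

	/* Unlocks the parent, drops write access and the path reference. */
	end_creating_path(&path, dentry);
	return error;
}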
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_JUMP_LABEL_H #define _LINUX_JUMP_LABEL_H /* * Jump label support * * Copyright (C) 2009-2012 Jason Baron <jbaron@redhat.com> * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra * * DEPRECATED API: * * The use of 'struct static_key' directly is now DEPRECATED. In addition * static_key_{true,false}() is also DEPRECATED. IE DO NOT use the following: * * struct static_key false = STATIC_KEY_INIT_FALSE; * struct static_key true = STATIC_KEY_INIT_TRUE; * static_key_true() * static_key_false() * * The updated API replacements are: * * DEFINE_STATIC_KEY_TRUE(key); * DEFINE_STATIC_KEY_FALSE(key); * DEFINE_STATIC_KEY_ARRAY_TRUE(keys, count); * DEFINE_STATIC_KEY_ARRAY_FALSE(keys, count); * static_branch_likely() * static_branch_unlikely() * * Jump labels provide an interface to generate dynamic branches using * self-modifying code. Assuming toolchain and architecture support, if we * define a "key" that is initially false via "DEFINE_STATIC_KEY_FALSE(key)", * an "if (static_branch_unlikely(&key))" statement is an unconditional branch * (which defaults to false - and the true block is placed out of line). * Similarly, we can define an initially true key via * "DEFINE_STATIC_KEY_TRUE(key)", and use it in the same * "if (static_branch_unlikely(&key))", in which case we will generate an * unconditional branch to the out-of-line true branch. Keys that are * initially true or false can be used in both static_branch_unlikely() * and static_branch_likely() statements. 
* * At runtime we can change the branch target by setting the key * to true via a call to static_branch_enable(), or false using * static_branch_disable(). If the direction of the branch is switched by * these calls then we run-time modify the branch target via a * no-op -> jump or jump -> no-op conversion. For example, for an * initially false key that is used in an "if (static_branch_unlikely(&key))" * statement, setting the key to true requires us to patch in a jump * to the out-of-line of true branch. * * In addition to static_branch_{enable,disable}, we can also reference count * the key or branch direction via static_branch_{inc,dec}. Thus, * static_branch_inc() can be thought of as a 'make more true' and * static_branch_dec() as a 'make more false'. * * Since this relies on modifying code, the branch modifying functions * must be considered absolute slow paths (machine wide synchronization etc.). * OTOH, since the affected branches are unconditional, their runtime overhead * will be absolutely minimal, esp. in the default (off) case where the total * effect is a single NOP of appropriate size. The on case will patch in a jump * to the out-of-line block. * * When the control is directly exposed to userspace, it is prudent to delay the * decrement to avoid high frequency code modifications which can (and do) * cause significant performance degradation. Struct static_key_deferred and * static_key_slow_dec_deferred() provide for this. * * Lacking toolchain and or architecture support, static keys fall back to a * simple conditional branch. * * Additional babbling in: Documentation/staging/static-keys.rst */ #ifndef __ASSEMBLY__ #include <linux/types.h> #include <linux/compiler.h> #include <linux/cleanup.h> extern bool static_key_initialized; #define STATIC_KEY_CHECK_USE(key) WARN(!static_key_initialized, \ "%s(): static key '%pS' used before call to jump_label_init()", \ __func__, (key)) struct static_key { atomic_t enabled; #ifdef CONFIG_JUMP_LABEL /* * bit 0 => 1 if key is initially true * 0 if initially false * bit 1 => 1 if points to struct static_key_mod * 0 if points to struct jump_entry */ union { unsigned long type; struct jump_entry *entries; struct static_key_mod *next; }; #endif /* CONFIG_JUMP_LABEL */ }; #endif /* __ASSEMBLY__ */ #ifdef CONFIG_JUMP_LABEL #include <asm/jump_label.h> #ifndef __ASSEMBLY__ #ifdef CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE struct jump_entry { s32 code; s32 target; long key; // key may be far away from the core kernel under KASLR }; static inline unsigned long jump_entry_code(const struct jump_entry *entry) { return (unsigned long)&entry->code + entry->code; } static inline unsigned long jump_entry_target(const struct jump_entry *entry) { return (unsigned long)&entry->target + entry->target; } static inline struct static_key *jump_entry_key(const struct jump_entry *entry) { long offset = entry->key & ~3L; return (struct static_key *)((unsigned long)&entry->key + offset); } #else static inline unsigned long jump_entry_code(const struct jump_entry *entry) { return entry->code; } static inline unsigned long jump_entry_target(const struct jump_entry *entry) { return entry->target; } static inline struct static_key *jump_entry_key(const struct jump_entry *entry) { return (struct static_key *)((unsigned long)entry->key & ~3UL); } #endif static inline bool jump_entry_is_branch(const struct jump_entry *entry) { return (unsigned long)entry->key & 1UL; } static inline bool jump_entry_is_init(const struct jump_entry *entry) { return (unsigned long)entry->key & 
2UL; } static inline void jump_entry_set_init(struct jump_entry *entry, bool set) { if (set) entry->key |= 2; else entry->key &= ~2; } static inline int jump_entry_size(struct jump_entry *entry) { #ifdef JUMP_LABEL_NOP_SIZE return JUMP_LABEL_NOP_SIZE; #else return arch_jump_entry_size(entry); #endif } #endif #endif #ifndef __ASSEMBLY__ enum jump_label_type { JUMP_LABEL_NOP = 0, JUMP_LABEL_JMP, }; struct module; #ifdef CONFIG_JUMP_LABEL #define JUMP_TYPE_FALSE 0UL #define JUMP_TYPE_TRUE 1UL #define JUMP_TYPE_LINKED 2UL #define JUMP_TYPE_MASK 3UL static __always_inline bool static_key_false(struct static_key *key) { return arch_static_branch(key, false); } static __always_inline bool static_key_true(struct static_key *key) { return !arch_static_branch(key, true); } extern struct jump_entry __start___jump_table[]; extern struct jump_entry __stop___jump_table[]; extern void jump_label_init(void); extern void jump_label_init_ro(void); extern void jump_label_lock(void); extern void jump_label_unlock(void); extern void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type); extern bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type); extern void arch_jump_label_transform_apply(void); extern int jump_label_text_reserved(void *start, void *end); extern bool static_key_slow_inc(struct static_key *key); extern bool static_key_fast_inc_not_disabled(struct static_key *key); extern void static_key_slow_dec(struct static_key *key); extern bool static_key_slow_inc_cpuslocked(struct static_key *key); extern void static_key_slow_dec_cpuslocked(struct static_key *key); extern int static_key_count(struct static_key *key); extern void static_key_enable(struct static_key *key); extern void static_key_disable(struct static_key *key); extern void static_key_enable_cpuslocked(struct static_key *key); extern void static_key_disable_cpuslocked(struct static_key *key); extern enum jump_label_type jump_label_init_type(struct jump_entry *entry); #define STATIC_KEY_INIT_TRUE \ { .enabled = ATOMIC_INIT(1), \ .type = JUMP_TYPE_TRUE } #define STATIC_KEY_INIT_FALSE \ { .enabled = ATOMIC_INIT(0), \ .type = JUMP_TYPE_FALSE } #else /* !CONFIG_JUMP_LABEL */ #include <linux/atomic.h> #include <linux/bug.h> static __always_inline int static_key_count(struct static_key *key) { return raw_atomic_read(&key->enabled); } static __always_inline void jump_label_init(void) { static_key_initialized = true; } static __always_inline void jump_label_init_ro(void) { } static __always_inline bool static_key_false(struct static_key *key) { if (unlikely_notrace(static_key_count(key) > 0)) return true; return false; } static __always_inline bool static_key_true(struct static_key *key) { if (likely_notrace(static_key_count(key) > 0)) return true; return false; } static inline bool static_key_fast_inc_not_disabled(struct static_key *key) { int v; STATIC_KEY_CHECK_USE(key); /* * Prevent key->enabled getting negative to follow the same semantics * as for CONFIG_JUMP_LABEL=y, see kernel/jump_label.c comment. 
*/ v = atomic_read(&key->enabled); do { if (v < 0 || (v + 1) < 0) return false; } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1))); return true; } #define static_key_slow_inc(key) static_key_fast_inc_not_disabled(key) static inline void static_key_slow_dec(struct static_key *key) { STATIC_KEY_CHECK_USE(key); atomic_dec(&key->enabled); } #define static_key_slow_inc_cpuslocked(key) static_key_slow_inc(key) #define static_key_slow_dec_cpuslocked(key) static_key_slow_dec(key) static inline int jump_label_text_reserved(void *start, void *end) { return 0; } static inline void jump_label_lock(void) {} static inline void jump_label_unlock(void) {} static inline void static_key_enable(struct static_key *key) { STATIC_KEY_CHECK_USE(key); if (atomic_read(&key->enabled) != 0) { WARN_ON_ONCE(atomic_read(&key->enabled) != 1); return; } atomic_set(&key->enabled, 1); } static inline void static_key_disable(struct static_key *key) { STATIC_KEY_CHECK_USE(key); if (atomic_read(&key->enabled) != 1) { WARN_ON_ONCE(atomic_read(&key->enabled) != 0); return; } atomic_set(&key->enabled, 0); } #define static_key_enable_cpuslocked(k) static_key_enable((k)) #define static_key_disable_cpuslocked(k) static_key_disable((k)) #define STATIC_KEY_INIT_TRUE { .enabled = ATOMIC_INIT(1) } #define STATIC_KEY_INIT_FALSE { .enabled = ATOMIC_INIT(0) } #endif /* CONFIG_JUMP_LABEL */ DEFINE_LOCK_GUARD_0(jump_label_lock, jump_label_lock(), jump_label_unlock()) #define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE #define jump_label_enabled static_key_enabled /* -------------------------------------------------------------------------- */ /* * Two type wrappers around static_key, such that we can use compile time * type differentiation to emit the right code. * * All the below code is macros in order to play type games. */ struct static_key_true { struct static_key key; }; struct static_key_false { struct static_key key; }; #define STATIC_KEY_TRUE_INIT (struct static_key_true) { .key = STATIC_KEY_INIT_TRUE, } #define STATIC_KEY_FALSE_INIT (struct static_key_false){ .key = STATIC_KEY_INIT_FALSE, } #define DEFINE_STATIC_KEY_TRUE(name) \ struct static_key_true name = STATIC_KEY_TRUE_INIT #define DEFINE_STATIC_KEY_TRUE_RO(name) \ struct static_key_true name __ro_after_init = STATIC_KEY_TRUE_INIT #define DECLARE_STATIC_KEY_TRUE(name) \ extern struct static_key_true name #define DEFINE_STATIC_KEY_FALSE(name) \ struct static_key_false name = STATIC_KEY_FALSE_INIT #define DEFINE_STATIC_KEY_FALSE_RO(name) \ struct static_key_false name __ro_after_init = STATIC_KEY_FALSE_INIT #define DECLARE_STATIC_KEY_FALSE(name) \ extern struct static_key_false name #define DEFINE_STATIC_KEY_ARRAY_TRUE(name, count) \ struct static_key_true name[count] = { \ [0 ... (count) - 1] = STATIC_KEY_TRUE_INIT, \ } #define DEFINE_STATIC_KEY_ARRAY_FALSE(name, count) \ struct static_key_false name[count] = { \ [0 ... 
(count) - 1] = STATIC_KEY_FALSE_INIT, \ } #define _DEFINE_STATIC_KEY_1(name) DEFINE_STATIC_KEY_TRUE(name) #define _DEFINE_STATIC_KEY_0(name) DEFINE_STATIC_KEY_FALSE(name) #define DEFINE_STATIC_KEY_MAYBE(cfg, name) \ __PASTE(_DEFINE_STATIC_KEY_, IS_ENABLED(cfg))(name) #define _DEFINE_STATIC_KEY_RO_1(name) DEFINE_STATIC_KEY_TRUE_RO(name) #define _DEFINE_STATIC_KEY_RO_0(name) DEFINE_STATIC_KEY_FALSE_RO(name) #define DEFINE_STATIC_KEY_MAYBE_RO(cfg, name) \ __PASTE(_DEFINE_STATIC_KEY_RO_, IS_ENABLED(cfg))(name) #define _DECLARE_STATIC_KEY_1(name) DECLARE_STATIC_KEY_TRUE(name) #define _DECLARE_STATIC_KEY_0(name) DECLARE_STATIC_KEY_FALSE(name) #define DECLARE_STATIC_KEY_MAYBE(cfg, name) \ __PASTE(_DECLARE_STATIC_KEY_, IS_ENABLED(cfg))(name) extern bool ____wrong_branch_error(void); #define static_key_enabled(x) \ ({ \ if (!__builtin_types_compatible_p(typeof(*x), struct static_key) && \ !__builtin_types_compatible_p(typeof(*x), struct static_key_true) &&\ !__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \ ____wrong_branch_error(); \ static_key_count((struct static_key *)x) > 0; \ }) #ifdef CONFIG_JUMP_LABEL /* * Combine the right initial value (type) with the right branch order * to generate the desired result. * * * type\branch| likely (1) | unlikely (0) * -----------+-----------------------+------------------ * | | * true (1) | ... | ... * | NOP | JMP L * | <br-stmts> | 1: ... * | L: ... | * | | * | | L: <br-stmts> * | | jmp 1b * | | * -----------+-----------------------+------------------ * | | * false (0) | ... | ... * | JMP L | NOP * | <br-stmts> | 1: ... * | L: ... | * | | * | | L: <br-stmts> * | | jmp 1b * | | * -----------+-----------------------+------------------ * * The initial value is encoded in the LSB of static_key::entries, * type: 0 = false, 1 = true. * * The branch type is encoded in the LSB of jump_entry::key, * branch: 0 = unlikely, 1 = likely. * * This gives the following logic table: * * enabled type branch instuction * -----------------------------+----------- * 0 0 0 | NOP * 0 0 1 | JMP * 0 1 0 | NOP * 0 1 1 | JMP * * 1 0 0 | JMP * 1 0 1 | NOP * 1 1 0 | JMP * 1 1 1 | NOP * * Which gives the following functions: * * dynamic: instruction = enabled ^ branch * static: instruction = type ^ branch * * See jump_label_type() / jump_label_init_type(). */ #define static_branch_likely(x) \ ({ \ bool branch; \ if (__builtin_types_compatible_p(typeof(*x), struct static_key_true)) \ branch = !arch_static_branch(&(x)->key, true); \ else if (__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \ branch = !arch_static_branch_jump(&(x)->key, true); \ else \ branch = ____wrong_branch_error(); \ likely_notrace(branch); \ }) #define static_branch_unlikely(x) \ ({ \ bool branch; \ if (__builtin_types_compatible_p(typeof(*x), struct static_key_true)) \ branch = arch_static_branch_jump(&(x)->key, false); \ else if (__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \ branch = arch_static_branch(&(x)->key, false); \ else \ branch = ____wrong_branch_error(); \ unlikely_notrace(branch); \ }) #else /* !CONFIG_JUMP_LABEL */ #define static_branch_likely(x) likely_notrace(static_key_enabled(&(x)->key)) #define static_branch_unlikely(x) unlikely_notrace(static_key_enabled(&(x)->key)) #endif /* CONFIG_JUMP_LABEL */ #define static_branch_maybe(config, x) \ (IS_ENABLED(config) ? 
static_branch_likely(x) \ : static_branch_unlikely(x)) /* * Advanced usage; refcount, branch is enabled when: count != 0 */ #define static_branch_inc(x) static_key_slow_inc(&(x)->key) #define static_branch_dec(x) static_key_slow_dec(&(x)->key) #define static_branch_inc_cpuslocked(x) static_key_slow_inc_cpuslocked(&(x)->key) #define static_branch_dec_cpuslocked(x) static_key_slow_dec_cpuslocked(&(x)->key) /* * Normal usage; boolean enable/disable. */ #define static_branch_enable(x) static_key_enable(&(x)->key) #define static_branch_disable(x) static_key_disable(&(x)->key) #define static_branch_enable_cpuslocked(x) static_key_enable_cpuslocked(&(x)->key) #define static_branch_disable_cpuslocked(x) static_key_disable_cpuslocked(&(x)->key) #endif /* __ASSEMBLY__ */ #endif /* _LINUX_JUMP_LABEL_H */
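A minimal usage sketch may help tie these macros together. Everything below is hypothetical (the key name, the functions, and the debug hook are not part of this header): the key is declared default-false, the hot path tests it with static_branch_unlikely() so the disabled case executes a patched-in NOP rather than a load-and-test, and a slow path flips it with static_branch_enable()/static_branch_disable().

/* Illustrative only: a hypothetical feature flag built on this API. */
#include <linux/jump_label.h>
#include <linux/printk.h>

/* Off by default; the hot-path branch is a NOP until the key is enabled. */
static DEFINE_STATIC_KEY_FALSE(my_debug_key);

/* Hot path: no load/compare of a global variable when the key is disabled. */
static void my_hot_path(int value)
{
        if (static_branch_unlikely(&my_debug_key))
                pr_info("debug: value=%d\n", value);
}

/* Slow path: runtime toggle, e.g. driven from a sysfs/debugfs handler. */
static void my_set_debug(bool on)
{
        if (on)
                static_branch_enable(&my_debug_key);
        else
                static_branch_disable(&my_debug_key);
}

If several independent users need to keep the branch enabled, the same pattern would use the reference-counted static_branch_inc()/static_branch_dec() calls instead of the boolean enable/disable pair.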
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Definitions for the 'struct ptr_ring' datastructure. * * Author: * Michael S. Tsirkin <mst@redhat.com> * * Copyright (C) 2016 Red Hat, Inc. * * This is a limited-size FIFO maintaining pointers in FIFO order, with * one CPU producing entries and another consuming entries from a FIFO. * * This implementation tries to minimize cache-contention when there is a * single producer and a single consumer CPU.
*/ #ifndef _LINUX_PTR_RING_H #define _LINUX_PTR_RING_H 1 #ifdef __KERNEL__ #include <linux/spinlock.h> #include <linux/cache.h> #include <linux/types.h> #include <linux/compiler.h> #include <linux/slab.h> #include <linux/mm.h> #include <asm/errno.h> #endif struct ptr_ring { int producer ____cacheline_aligned_in_smp; spinlock_t producer_lock; int consumer_head ____cacheline_aligned_in_smp; /* next valid entry */ int consumer_tail; /* next entry to invalidate */ spinlock_t consumer_lock; /* Shared consumer/producer data */ /* Read-only by both the producer and the consumer */ int size ____cacheline_aligned_in_smp; /* max entries in queue */ int batch; /* number of entries to consume in a batch */ void **queue; }; /* Note: callers invoking this in a loop must use a compiler barrier, * for example cpu_relax(). * * NB: this is unlike __ptr_ring_empty in that callers must hold producer_lock: * see e.g. ptr_ring_full. */ static inline bool __ptr_ring_full(struct ptr_ring *r) { return data_race(r->queue[r->producer]); } static inline bool ptr_ring_full(struct ptr_ring *r) { bool ret; spin_lock(&r->producer_lock); ret = __ptr_ring_full(r); spin_unlock(&r->producer_lock); return ret; } static inline bool ptr_ring_full_irq(struct ptr_ring *r) { bool ret; spin_lock_irq(&r->producer_lock); ret = __ptr_ring_full(r); spin_unlock_irq(&r->producer_lock); return ret; } static inline bool ptr_ring_full_any(struct ptr_ring *r) { unsigned long flags; bool ret; spin_lock_irqsave(&r->producer_lock, flags); ret = __ptr_ring_full(r); spin_unlock_irqrestore(&r->producer_lock, flags); return ret; } static inline bool ptr_ring_full_bh(struct ptr_ring *r) { bool ret; spin_lock_bh(&r->producer_lock); ret = __ptr_ring_full(r); spin_unlock_bh(&r->producer_lock); return ret; } /* Note: callers invoking this in a loop must use a compiler barrier, * for example cpu_relax(). Callers must hold producer_lock. * Callers are responsible for making sure pointer that is being queued * points to a valid data. */ static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr) { if (unlikely(!r->size) || data_race(r->queue[r->producer])) return -ENOSPC; /* Make sure the pointer we are storing points to a valid data. */ /* Pairs with the dependency ordering in __ptr_ring_consume. */ smp_wmb(); WRITE_ONCE(r->queue[r->producer++], ptr); if (unlikely(r->producer >= r->size)) r->producer = 0; return 0; } /* * Note: resize (below) nests producer lock within consumer lock, so if you * consume in interrupt or BH context, you must disable interrupts/BH when * calling this. 
*/ static inline int ptr_ring_produce(struct ptr_ring *r, void *ptr) { int ret; spin_lock(&r->producer_lock); ret = __ptr_ring_produce(r, ptr); spin_unlock(&r->producer_lock); return ret; } static inline int ptr_ring_produce_irq(struct ptr_ring *r, void *ptr) { int ret; spin_lock_irq(&r->producer_lock); ret = __ptr_ring_produce(r, ptr); spin_unlock_irq(&r->producer_lock); return ret; } static inline int ptr_ring_produce_any(struct ptr_ring *r, void *ptr) { unsigned long flags; int ret; spin_lock_irqsave(&r->producer_lock, flags); ret = __ptr_ring_produce(r, ptr); spin_unlock_irqrestore(&r->producer_lock, flags); return ret; } static inline int ptr_ring_produce_bh(struct ptr_ring *r, void *ptr) { int ret; spin_lock_bh(&r->producer_lock); ret = __ptr_ring_produce(r, ptr); spin_unlock_bh(&r->producer_lock); return ret; } static inline void *__ptr_ring_peek(struct ptr_ring *r) { if (likely(r->size)) return READ_ONCE(r->queue[r->consumer_head]); return NULL; } /* * Test ring empty status without taking any locks. * * NB: This is only safe to call if ring is never resized. * * However, if some other CPU consumes ring entries at the same time, the value * returned is not guaranteed to be correct. * * In this case - to avoid incorrectly detecting the ring * as empty - the CPU consuming the ring entries is responsible * for either consuming all ring entries until the ring is empty, * or synchronizing with some other CPU and causing it to * re-test __ptr_ring_empty and/or consume the ring enteries * after the synchronization point. * * Note: callers invoking this in a loop must use a compiler barrier, * for example cpu_relax(). */ static inline bool __ptr_ring_empty(struct ptr_ring *r) { if (likely(r->size)) return !data_race(r->queue[READ_ONCE(r->consumer_head)]); return true; } static inline bool ptr_ring_empty(struct ptr_ring *r) { bool ret; spin_lock(&r->consumer_lock); ret = __ptr_ring_empty(r); spin_unlock(&r->consumer_lock); return ret; } static inline bool ptr_ring_empty_irq(struct ptr_ring *r) { bool ret; spin_lock_irq(&r->consumer_lock); ret = __ptr_ring_empty(r); spin_unlock_irq(&r->consumer_lock); return ret; } static inline bool ptr_ring_empty_any(struct ptr_ring *r) { unsigned long flags; bool ret; spin_lock_irqsave(&r->consumer_lock, flags); ret = __ptr_ring_empty(r); spin_unlock_irqrestore(&r->consumer_lock, flags); return ret; } static inline bool ptr_ring_empty_bh(struct ptr_ring *r) { bool ret; spin_lock_bh(&r->consumer_lock); ret = __ptr_ring_empty(r); spin_unlock_bh(&r->consumer_lock); return ret; } /* Zero entries from tail to specified head. * NB: if consumer_head can be >= r->size need to fixup tail later. */ static inline void __ptr_ring_zero_tail(struct ptr_ring *r, int consumer_head) { int head = consumer_head; /* Zero out entries in the reverse order: this way we touch the * cache line that producer might currently be reading the last; * producer won't make progress and touch other cache lines * besides the first one until we write out all entries. */ while (likely(head > r->consumer_tail)) data_race(r->queue[--head] = NULL); r->consumer_tail = consumer_head; } /* Must only be called after __ptr_ring_peek returned !NULL */ static inline void __ptr_ring_discard_one(struct ptr_ring *r) { /* Fundamentally, what we want to do is update consumer * index and zero out the entry so producer can reuse it. 
* Doing it naively at each consume would be as simple as: * consumer = r->consumer; * r->queue[consumer++] = NULL; * if (unlikely(consumer >= r->size)) * consumer = 0; * r->consumer = consumer; * but that is suboptimal when the ring is full as producer is writing * out new entries in the same cache line. Defer these updates until a * batch of entries has been consumed. */ /* Note: we must keep consumer_head valid at all times for __ptr_ring_empty * to work correctly. */ int consumer_head = r->consumer_head + 1; /* Once we have processed enough entries invalidate them in * the ring all at once so producer can reuse their space in the ring. * We also do this when we reach end of the ring - not mandatory * but helps keep the implementation simple. */ if (unlikely(consumer_head - r->consumer_tail >= r->batch || consumer_head >= r->size)) __ptr_ring_zero_tail(r, consumer_head); if (unlikely(consumer_head >= r->size)) { consumer_head = 0; r->consumer_tail = 0; } /* matching READ_ONCE in __ptr_ring_empty for lockless tests */ WRITE_ONCE(r->consumer_head, consumer_head); } static inline void *__ptr_ring_consume(struct ptr_ring *r) { void *ptr; /* The READ_ONCE in __ptr_ring_peek guarantees that anyone * accessing data through the pointer is up to date. Pairs * with smp_wmb in __ptr_ring_produce. */ ptr = __ptr_ring_peek(r); if (ptr) __ptr_ring_discard_one(r); return ptr; } static inline int __ptr_ring_consume_batched(struct ptr_ring *r, void **array, int n) { void *ptr; int i; for (i = 0; i < n; i++) { ptr = __ptr_ring_consume(r); if (!ptr) break; array[i] = ptr; } return i; } /* * Note: resize (below) nests producer lock within consumer lock, so if you * call this in interrupt or BH context, you must disable interrupts/BH when * producing. */ static inline void *ptr_ring_consume(struct ptr_ring *r) { void *ptr; spin_lock(&r->consumer_lock); ptr = __ptr_ring_consume(r); spin_unlock(&r->consumer_lock); return ptr; } static inline void *ptr_ring_consume_irq(struct ptr_ring *r) { void *ptr; spin_lock_irq(&r->consumer_lock); ptr = __ptr_ring_consume(r); spin_unlock_irq(&r->consumer_lock); return ptr; } static inline void *ptr_ring_consume_any(struct ptr_ring *r) { unsigned long flags; void *ptr; spin_lock_irqsave(&r->consumer_lock, flags); ptr = __ptr_ring_consume(r); spin_unlock_irqrestore(&r->consumer_lock, flags); return ptr; } static inline void *ptr_ring_consume_bh(struct ptr_ring *r) { void *ptr; spin_lock_bh(&r->consumer_lock); ptr = __ptr_ring_consume(r); spin_unlock_bh(&r->consumer_lock); return ptr; } static inline int ptr_ring_consume_batched(struct ptr_ring *r, void **array, int n) { int ret; spin_lock(&r->consumer_lock); ret = __ptr_ring_consume_batched(r, array, n); spin_unlock(&r->consumer_lock); return ret; } static inline int ptr_ring_consume_batched_irq(struct ptr_ring *r, void **array, int n) { int ret; spin_lock_irq(&r->consumer_lock); ret = __ptr_ring_consume_batched(r, array, n); spin_unlock_irq(&r->consumer_lock); return ret; } static inline int ptr_ring_consume_batched_any(struct ptr_ring *r, void **array, int n) { unsigned long flags; int ret; spin_lock_irqsave(&r->consumer_lock, flags); ret = __ptr_ring_consume_batched(r, array, n); spin_unlock_irqrestore(&r->consumer_lock, flags); return ret; } static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r, void **array, int n) { int ret; spin_lock_bh(&r->consumer_lock); ret = __ptr_ring_consume_batched(r, array, n); spin_unlock_bh(&r->consumer_lock); return ret; } /* Cast to structure type and call a function without 
discarding from FIFO. * Function must return a value. * Callers must take consumer_lock. */ #define __PTR_RING_PEEK_CALL(r, f) ((f)(__ptr_ring_peek(r))) #define PTR_RING_PEEK_CALL(r, f) ({ \ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ \ spin_lock(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ spin_unlock(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v; \ }) #define PTR_RING_PEEK_CALL_IRQ(r, f) ({ \ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ \ spin_lock_irq(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ spin_unlock_irq(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v; \ }) #define PTR_RING_PEEK_CALL_BH(r, f) ({ \ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ \ spin_lock_bh(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ spin_unlock_bh(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v; \ }) #define PTR_RING_PEEK_CALL_ANY(r, f) ({ \ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ unsigned long __PTR_RING_PEEK_CALL_f;\ \ spin_lock_irqsave(&(r)->consumer_lock, __PTR_RING_PEEK_CALL_f); \ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ spin_unlock_irqrestore(&(r)->consumer_lock, __PTR_RING_PEEK_CALL_f); \ __PTR_RING_PEEK_CALL_v; \ }) /* Not all gfp_t flags (besides GFP_KERNEL) are allowed. See * documentation for vmalloc for which of them are legal. */ static inline void **__ptr_ring_init_queue_alloc_noprof(unsigned int size, gfp_t gfp) { if (size > KMALLOC_MAX_SIZE / sizeof(void *)) return NULL; return kvmalloc_array_noprof(size, sizeof(void *), gfp | __GFP_ZERO); } static inline void __ptr_ring_set_size(struct ptr_ring *r, int size) { r->size = size; r->batch = SMP_CACHE_BYTES * 2 / sizeof(*(r->queue)); /* We need to set batch at least to 1 to make logic * in __ptr_ring_discard_one work correctly. * Batching too much (because ring is small) would cause a lot of * burstiness. Needs tuning, for now disable batching. */ if (r->batch > r->size / 2 || !r->batch) r->batch = 1; } static inline int ptr_ring_init_noprof(struct ptr_ring *r, int size, gfp_t gfp) { r->queue = __ptr_ring_init_queue_alloc_noprof(size, gfp); if (!r->queue) return -ENOMEM; __ptr_ring_set_size(r, size); r->producer = r->consumer_head = r->consumer_tail = 0; spin_lock_init(&r->producer_lock); spin_lock_init(&r->consumer_lock); return 0; } #define ptr_ring_init(...) alloc_hooks(ptr_ring_init_noprof(__VA_ARGS__)) /* * Return entries into ring. Destroy entries that don't fit. * * Note: this is expected to be a rare slow path operation. * * Note: producer lock is nested within consumer lock, so if you * resize you must make sure all uses nest correctly. * In particular if you consume ring in interrupt or BH context, you must * disable interrupts/BH when doing so. */ static inline void ptr_ring_unconsume(struct ptr_ring *r, void **batch, int n, void (*destroy)(void *)) { unsigned long flags; spin_lock_irqsave(&r->consumer_lock, flags); spin_lock(&r->producer_lock); if (!r->size) goto done; /* * Clean out buffered entries (for simplicity). This way following code * can test entries for NULL and if not assume they are valid. */ __ptr_ring_zero_tail(r, r->consumer_head); /* * Go over entries in batch, start moving head back and copy entries. * Stop when we run into previously unconsumed entries. */ while (n) { int head = r->consumer_head - 1; if (head < 0) head = r->size - 1; if (r->queue[head]) { /* This batch entry will have to be destroyed. 
*/ goto done; } r->queue[head] = batch[--n]; r->consumer_tail = head; /* matching READ_ONCE in __ptr_ring_empty for lockless tests */ WRITE_ONCE(r->consumer_head, head); } done: /* Destroy all entries left in the batch. */ while (n) destroy(batch[--n]); spin_unlock(&r->producer_lock); spin_unlock_irqrestore(&r->consumer_lock, flags); } static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue, int size, gfp_t gfp, void (*destroy)(void *)) { int producer = 0; void **old; void *ptr; while ((ptr = __ptr_ring_consume(r))) if (producer < size) queue[producer++] = ptr; else if (destroy) destroy(ptr); if (producer >= size) producer = 0; __ptr_ring_set_size(r, size); r->producer = producer; r->consumer_head = 0; r->consumer_tail = 0; old = r->queue; r->queue = queue; return old; } /* * Note: producer lock is nested within consumer lock, so if you * resize you must make sure all uses nest correctly. * In particular if you consume ring in interrupt or BH context, you must * disable interrupts/BH when doing so. */ static inline int ptr_ring_resize_noprof(struct ptr_ring *r, int size, gfp_t gfp, void (*destroy)(void *)) { unsigned long flags; void **queue = __ptr_ring_init_queue_alloc_noprof(size, gfp); void **old; if (!queue) return -ENOMEM; spin_lock_irqsave(&(r)->consumer_lock, flags); spin_lock(&(r)->producer_lock); old = __ptr_ring_swap_queue(r, queue, size, gfp, destroy); spin_unlock(&(r)->producer_lock); spin_unlock_irqrestore(&(r)->consumer_lock, flags); kvfree(old); return 0; } #define ptr_ring_resize(...) alloc_hooks(ptr_ring_resize_noprof(__VA_ARGS__)) /* * Note: producer lock is nested within consumer lock, so if you * resize you must make sure all uses nest correctly. * In particular if you consume ring in BH context, you must * disable BH when doing so. */ static inline int ptr_ring_resize_multiple_bh_noprof(struct ptr_ring **rings, unsigned int nrings, int size, gfp_t gfp, void (*destroy)(void *)) { void ***queues; int i; queues = kmalloc_array_noprof(nrings, sizeof(*queues), gfp); if (!queues) goto noqueues; for (i = 0; i < nrings; ++i) { queues[i] = __ptr_ring_init_queue_alloc_noprof(size, gfp); if (!queues[i]) goto nomem; } for (i = 0; i < nrings; ++i) { spin_lock_bh(&(rings[i])->consumer_lock); spin_lock(&(rings[i])->producer_lock); queues[i] = __ptr_ring_swap_queue(rings[i], queues[i], size, gfp, destroy); spin_unlock(&(rings[i])->producer_lock); spin_unlock_bh(&(rings[i])->consumer_lock); } for (i = 0; i < nrings; ++i) kvfree(queues[i]); kfree(queues); return 0; nomem: while (--i >= 0) kvfree(queues[i]); kfree(queues); noqueues: return -ENOMEM; } #define ptr_ring_resize_multiple_bh(...) \ alloc_hooks(ptr_ring_resize_multiple_bh_noprof(__VA_ARGS__)) static inline void ptr_ring_cleanup(struct ptr_ring *r, void (*destroy)(void *)) { void *ptr; if (destroy) while ((ptr = ptr_ring_consume(r))) destroy(ptr); kvfree(r->queue); } #endif /* _LINUX_PTR_RING_H */ |
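As a rough sketch of the intended single-producer/single-consumer usage (the item type and all function names below are invented for illustration, not part of this header): the ring is sized at init time, the producer checks the -ENOSPC return of ptr_ring_produce(), the consumer drains with ptr_ring_consume(), and ptr_ring_cleanup() is handed a destructor for anything still queued.

/* Illustrative only: a hypothetical single-producer/single-consumer queue. */
#include <linux/ptr_ring.h>
#include <linux/slab.h>

struct my_item {
        int payload;
};

static struct ptr_ring my_ring;

static void my_item_free(void *ptr)
{
        kfree(ptr);
}

static int my_queue_setup(void)
{
        /* 128 slots; GFP_KERNEL allocation, so this may sleep. */
        return ptr_ring_init(&my_ring, 128, GFP_KERNEL);
}

/* Producer context. */
static int my_enqueue(struct my_item *item)
{
        /* Returns -ENOSPC when the ring is full; the caller decides what to do. */
        return ptr_ring_produce(&my_ring, item);
}

/* Consumer context. */
static struct my_item *my_dequeue(void)
{
        return ptr_ring_consume(&my_ring);
}

static void my_queue_teardown(void)
{
        /* Frees any entries still in the ring, then the ring array itself. */
        ptr_ring_cleanup(&my_ring, my_item_free);
}

Note the locking caveat restated from the comments above: because resize nests the producer lock inside the consumer lock, a ring that is consumed from BH or interrupt context must disable BH/interrupts when producing (for example by using the _bh, _irq, or _any producer variants).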
// SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/isofs/inode.c * * (C) 1991 Linus Torvalds - minix filesystem * 1992, 1993, 1994 Eric Youngdale Modified for ISO 9660 filesystem. * 1994 Eberhard Mönkeberg - multi session handling. * 1995 Mark Dobie - allow mounting of some weird VideoCDs and PhotoCDs. * 1997 Gordon Chaffee - Joliet CDs * 1998 Eric Lammerts - ISO 9660 Level 3 * 2004 Paul Serice - Inode Support pushed out from 4GB to 128GB * 2004 Paul Serice - NFS Export Operations */ #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/nls.h> #include <linux/ctype.h> #include <linux/statfs.h> #include <linux/cdrom.h> #include <linux/mpage.h> #include <linux/user_namespace.h> #include <linux/seq_file.h> #include <linux/blkdev.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include "isofs.h" #include "zisofs.h" /* max tz offset is 13 hours */ #define MAX_TZ_OFFSET (52*15*60) #define BEQUIET static int isofs_hashi(const struct dentry *parent, struct qstr *qstr); static int isofs_dentry_cmpi(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); #ifdef CONFIG_JOLIET static int isofs_hashi_ms(const struct dentry *parent, struct qstr *qstr); static int isofs_hash_ms(const struct dentry *parent, struct qstr *qstr); static int isofs_dentry_cmpi_ms(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); static int isofs_dentry_cmp_ms(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); #endif static void isofs_put_super(struct super_block *sb) { struct isofs_sb_info *sbi = ISOFS_SB(sb); #ifdef CONFIG_JOLIET unload_nls(sbi->s_nls_iocharset); #endif kfree(sbi); sb->s_fs_info = NULL; return; } static int isofs_read_inode(struct inode *, int relocated); static int isofs_statfs (struct dentry *, struct kstatfs *); static int isofs_show_options(struct seq_file *, struct dentry *); static struct kmem_cache *isofs_inode_cachep; static struct inode *isofs_alloc_inode(struct super_block *sb) { struct iso_inode_info *ei; ei = alloc_inode_sb(sb, isofs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; } static void isofs_free_inode(struct inode *inode) { kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode)); } static void init_once(void *foo) { struct iso_inode_info *ei = foo; inode_init_once(&ei->vfs_inode); } static int __init init_inodecache(void) { isofs_inode_cachep = kmem_cache_create("isofs_inode_cache", sizeof(struct iso_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_ACCOUNT), init_once); if (!isofs_inode_cachep) return -ENOMEM; return 0; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache.
*/ rcu_barrier(); kmem_cache_destroy(isofs_inode_cachep); } static int isofs_reconfigure(struct fs_context *fc) { sync_filesystem(fc->root->d_sb); if (!(fc->sb_flags & SB_RDONLY)) return -EROFS; return 0; } static const struct super_operations isofs_sops = { .alloc_inode = isofs_alloc_inode, .free_inode = isofs_free_inode, .put_super = isofs_put_super, .statfs = isofs_statfs, .show_options = isofs_show_options, }; static const struct dentry_operations isofs_dentry_ops[] = { { .d_hash = isofs_hashi, .d_compare = isofs_dentry_cmpi, }, #ifdef CONFIG_JOLIET { .d_hash = isofs_hash_ms, .d_compare = isofs_dentry_cmp_ms, }, { .d_hash = isofs_hashi_ms, .d_compare = isofs_dentry_cmpi_ms, }, #endif }; struct isofs_options{ unsigned int rock:1; unsigned int joliet:1; unsigned int cruft:1; unsigned int hide:1; unsigned int showassoc:1; unsigned int nocompress:1; unsigned int overriderockperm:1; unsigned int uid_set:1; unsigned int gid_set:1; unsigned char map; unsigned char check; unsigned int blocksize; umode_t fmode; umode_t dmode; kgid_t gid; kuid_t uid; char *iocharset; /* LVE */ s32 session; s32 sbsector; }; /* * Compute the hash for the isofs name corresponding to the dentry. */ static int isofs_hashi_common(const struct dentry *dentry, struct qstr *qstr, int ms) { const char *name; int len; char c; unsigned long hash; len = qstr->len; name = qstr->name; if (ms) { while (len && name[len-1] == '.') len--; } hash = init_name_hash(dentry); while (len--) { c = tolower(*name++); hash = partial_name_hash(c, hash); } qstr->hash = end_name_hash(hash); return 0; } /* * Compare of two isofs names. */ static int isofs_dentry_cmp_common( unsigned int len, const char *str, const struct qstr *name, int ms, int ci) { int alen, blen; /* A filename cannot end in '.' or we treat it like it has none */ alen = name->len; blen = len; if (ms) { while (alen && name->name[alen-1] == '.') alen--; while (blen && str[blen-1] == '.') blen--; } if (alen == blen) { if (ci) { if (strncasecmp(name->name, str, alen) == 0) return 0; } else { if (strncmp(name->name, str, alen) == 0) return 0; } } return 1; } static int isofs_hashi(const struct dentry *dentry, struct qstr *qstr) { return isofs_hashi_common(dentry, qstr, 0); } static int isofs_dentry_cmpi(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { return isofs_dentry_cmp_common(len, str, name, 0, 1); } #ifdef CONFIG_JOLIET /* * Compute the hash for the isofs name corresponding to the dentry. 
*/ static int isofs_hash_common(const struct dentry *dentry, struct qstr *qstr, int ms) { const char *name; int len; len = qstr->len; name = qstr->name; if (ms) { while (len && name[len-1] == '.') len--; } qstr->hash = full_name_hash(dentry, name, len); return 0; } static int isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr) { return isofs_hash_common(dentry, qstr, 1); } static int isofs_hashi_ms(const struct dentry *dentry, struct qstr *qstr) { return isofs_hashi_common(dentry, qstr, 1); } static int isofs_dentry_cmp_ms(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { return isofs_dentry_cmp_common(len, str, name, 1, 0); } static int isofs_dentry_cmpi_ms(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { return isofs_dentry_cmp_common(len, str, name, 1, 1); } #endif enum { Opt_block, Opt_check, Opt_cruft, Opt_gid, Opt_ignore, Opt_iocharset, Opt_map, Opt_mode, Opt_nojoliet, Opt_norock, Opt_sb, Opt_session, Opt_uid, Opt_unhide, Opt_utf8, Opt_err, Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode, Opt_overriderockperm, }; static const struct constant_table isofs_param_map[] = { {"acorn", 'a'}, {"a", 'a'}, {"normal", 'n'}, {"n", 'n'}, {"off", 'o'}, {"o", 'o'}, {} }; static const struct constant_table isofs_param_check[] = { {"relaxed", 'r'}, {"r", 'r'}, {"strict", 's'}, {"s", 's'}, {} }; static const struct fs_parameter_spec isofs_param_spec[] = { fsparam_flag ("norock", Opt_norock), fsparam_flag ("nojoliet", Opt_nojoliet), fsparam_flag ("unhide", Opt_unhide), fsparam_flag ("hide", Opt_hide), fsparam_flag ("showassoc", Opt_showassoc), fsparam_flag ("cruft", Opt_cruft), fsparam_flag ("utf8", Opt_utf8), fsparam_string ("iocharset", Opt_iocharset), fsparam_enum ("map", Opt_map, isofs_param_map), fsparam_u32 ("session", Opt_session), fsparam_u32 ("sbsector", Opt_sb), fsparam_enum ("check", Opt_check, isofs_param_check), fsparam_uid ("uid", Opt_uid), fsparam_gid ("gid", Opt_gid), /* Note: mode/dmode historically accepted %u not strictly %o */ fsparam_u32 ("mode", Opt_mode), fsparam_u32 ("dmode", Opt_dmode), fsparam_flag ("overriderockperm", Opt_overriderockperm), fsparam_u32 ("block", Opt_block), fsparam_string ("conv", Opt_ignore), fsparam_flag ("nocompress", Opt_nocompress), {} }; static int isofs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct isofs_options *popt = fc->fs_private; struct fs_parse_result result; int opt; unsigned int n; /* There are no remountable options */ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) return 0; opt = fs_parse(fc, isofs_param_spec, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_norock: popt->rock = 0; break; case Opt_nojoliet: popt->joliet = 0; break; case Opt_hide: popt->hide = 1; break; case Opt_unhide: case Opt_showassoc: popt->showassoc = 1; break; case Opt_cruft: popt->cruft = 1; break; #ifdef CONFIG_JOLIET case Opt_utf8: kfree(popt->iocharset); popt->iocharset = kstrdup("utf8", GFP_KERNEL); if (!popt->iocharset) return -ENOMEM; break; case Opt_iocharset: kfree(popt->iocharset); popt->iocharset = kstrdup(param->string, GFP_KERNEL); if (!popt->iocharset) return -ENOMEM; break; #endif case Opt_map: popt->map = result.uint_32; break; case Opt_session: n = result.uint_32; /* * Track numbers are supposed to be in range 1-99, the * mount option starts indexing at 0. 
*/ if (n >= 99) return -EINVAL; popt->session = n + 1; break; case Opt_sb: popt->sbsector = result.uint_32; break; case Opt_check: popt->check = result.uint_32; break; case Opt_ignore: break; case Opt_uid: popt->uid = result.uid; popt->uid_set = 1; break; case Opt_gid: popt->gid = result.gid; popt->gid_set = 1; break; case Opt_mode: popt->fmode = result.uint_32; break; case Opt_dmode: popt->dmode = result.uint_32; break; case Opt_overriderockperm: popt->overriderockperm = 1; break; case Opt_block: n = result.uint_32; if (n != 512 && n != 1024 && n != 2048) return -EINVAL; popt->blocksize = n; break; case Opt_nocompress: popt->nocompress = 1; break; default: return -EINVAL; } return 0; } /* * Display the mount options in /proc/mounts. */ static int isofs_show_options(struct seq_file *m, struct dentry *root) { struct isofs_sb_info *sbi = ISOFS_SB(root->d_sb); if (!sbi->s_rock) seq_puts(m, ",norock"); else if (!sbi->s_joliet_level) seq_puts(m, ",nojoliet"); if (sbi->s_cruft) seq_puts(m, ",cruft"); if (sbi->s_hide) seq_puts(m, ",hide"); if (sbi->s_nocompress) seq_puts(m, ",nocompress"); if (sbi->s_overriderockperm) seq_puts(m, ",overriderockperm"); if (sbi->s_showassoc) seq_puts(m, ",showassoc"); if (sbi->s_check) seq_printf(m, ",check=%c", sbi->s_check); if (sbi->s_mapping) seq_printf(m, ",map=%c", sbi->s_mapping); if (sbi->s_session != 255) seq_printf(m, ",session=%u", sbi->s_session - 1); if (sbi->s_sbsector != -1) seq_printf(m, ",sbsector=%u", sbi->s_sbsector); if (root->d_sb->s_blocksize != 1024) seq_printf(m, ",blocksize=%lu", root->d_sb->s_blocksize); if (sbi->s_uid_set) seq_printf(m, ",uid=%u", from_kuid_munged(&init_user_ns, sbi->s_uid)); if (sbi->s_gid_set) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, sbi->s_gid)); if (sbi->s_dmode != ISOFS_INVALID_MODE) seq_printf(m, ",dmode=%o", sbi->s_dmode); if (sbi->s_fmode != ISOFS_INVALID_MODE) seq_printf(m, ",fmode=%o", sbi->s_fmode); #ifdef CONFIG_JOLIET if (sbi->s_nls_iocharset) seq_printf(m, ",iocharset=%s", sbi->s_nls_iocharset->charset); else seq_puts(m, ",iocharset=utf8"); #endif return 0; } /* * look if the driver can tell the multi session redirection value * * don't change this if you don't know what you do, please! * Multisession is legal only with XA disks. * A non-XA disk with more than one volume descriptor may do it right, but * usually is written in a nowhere standardized "multi-partition" manner. * Multisession uses absolute addressing (solely the first frame of the whole * track is #0), multi-partition uses relative addressing (each first frame of * each track is #0), and a track is not a session. * * A broken CDwriter software or drive firmware does not set new standards, * at least not if conflicting with the existing ones. 
* * emoenke@gwdg.de */ #define WE_OBEY_THE_WRITTEN_STANDARDS 1 static unsigned int isofs_get_last_session(struct super_block *sb, s32 session) { struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk); unsigned int vol_desc_start = 0; if (session > 0) { struct cdrom_tocentry te; if (!cdi) return 0; te.cdte_track = session; te.cdte_format = CDROM_LBA; if (cdrom_read_tocentry(cdi, &te) == 0) { printk(KERN_DEBUG "ISOFS: Session %d start %d type %d\n", session, te.cdte_addr.lba, te.cdte_ctrl & CDROM_DATA_TRACK); if ((te.cdte_ctrl & CDROM_DATA_TRACK) == 4) return te.cdte_addr.lba; } printk(KERN_ERR "ISOFS: Invalid session number or type of track\n"); } if (cdi) { struct cdrom_multisession ms_info; ms_info.addr_format = CDROM_LBA; if (cdrom_multisession(cdi, &ms_info) == 0) { #if WE_OBEY_THE_WRITTEN_STANDARDS /* necessary for a valid ms_info.addr */ if (ms_info.xa_flag) #endif vol_desc_start = ms_info.addr.lba; } } return vol_desc_start; } /* * Check if root directory is empty (has less than 3 files). * * Used to detect broken CDs where ISO root directory is empty but Joliet root * directory is OK. If such CD has Rock Ridge extensions, they will be disabled * (and Joliet used instead) or else no files would be visible. */ static bool rootdir_empty(struct super_block *sb, unsigned long block) { int offset = 0, files = 0, de_len; struct iso_directory_record *de; struct buffer_head *bh; bh = sb_bread(sb, block); if (!bh) return true; while (files < 3) { de = (struct iso_directory_record *) (bh->b_data + offset); de_len = *(unsigned char *) de; if (de_len == 0) break; files++; offset += de_len; } brelse(bh); return files < 3; } /* * Initialize the superblock and read the root inode. */ static int isofs_fill_super(struct super_block *s, struct fs_context *fc) { struct buffer_head *bh = NULL, *pri_bh = NULL; struct hs_primary_descriptor *h_pri = NULL; struct iso_primary_descriptor *pri = NULL; struct iso_supplementary_descriptor *sec = NULL; struct iso_directory_record *rootp; struct inode *inode; struct isofs_options *opt = fc->fs_private; struct isofs_sb_info *sbi; unsigned long first_data_zone; int joliet_level = 0; int iso_blknum, block; int orig_zonesize; int table, error = -EINVAL; unsigned int vol_desc_start; int silent = fc->sb_flags & SB_SILENT; sbi = kzalloc_obj(*sbi); if (!sbi) return -ENOMEM; s->s_fs_info = sbi; /* * First of all, get the hardware blocksize for this device. * If we don't know what it is, or the hardware blocksize is * larger than the blocksize the user specified, then use * that value. */ /* * What if bugger tells us to go beyond page size? */ if (bdev_logical_block_size(s->s_bdev) > 2048) { printk(KERN_WARNING "ISOFS: unsupported/invalid hardware sector size %d\n", bdev_logical_block_size(s->s_bdev)); goto out_freesbi; } opt->blocksize = sb_min_blocksize(s, opt->blocksize); if (!opt->blocksize) { printk(KERN_ERR "ISOFS: unable to set blocksize\n"); goto out_freesbi; } sbi->s_high_sierra = 0; /* default is iso9660 */ sbi->s_session = opt->session; sbi->s_sbsector = opt->sbsector; vol_desc_start = (opt->sbsector != -1) ? 
opt->sbsector : isofs_get_last_session(s, opt->session); for (iso_blknum = vol_desc_start+16; iso_blknum < vol_desc_start+100; iso_blknum++) { struct hs_volume_descriptor *hdp; struct iso_volume_descriptor *vdp; block = iso_blknum << (ISOFS_BLOCK_BITS - s->s_blocksize_bits); if (!(bh = sb_bread(s, block))) goto out_no_read; vdp = (struct iso_volume_descriptor *)bh->b_data; hdp = (struct hs_volume_descriptor *)bh->b_data; /* * Due to the overlapping physical location of the descriptors, * ISO CDs can match hdp->id==HS_STANDARD_ID as well. To ensure * proper identification in this case, we first check for ISO. */ if (strncmp (vdp->id, ISO_STANDARD_ID, sizeof vdp->id) == 0) { if (isonum_711(vdp->type) == ISO_VD_END) break; if (isonum_711(vdp->type) == ISO_VD_PRIMARY) { if (!pri) { pri = (struct iso_primary_descriptor *)vdp; /* Save the buffer in case we need it ... */ pri_bh = bh; bh = NULL; } } #ifdef CONFIG_JOLIET else if (isonum_711(vdp->type) == ISO_VD_SUPPLEMENTARY) { sec = (struct iso_supplementary_descriptor *)vdp; if (sec->escape[0] == 0x25 && sec->escape[1] == 0x2f) { if (opt->joliet) { if (sec->escape[2] == 0x40) joliet_level = 1; else if (sec->escape[2] == 0x43) joliet_level = 2; else if (sec->escape[2] == 0x45) joliet_level = 3; printk(KERN_DEBUG "ISO 9660 Extensions: " "Microsoft Joliet Level %d\n", joliet_level); } goto root_found; } else { /* Unknown supplementary volume descriptor */ sec = NULL; } } #endif } else { if (strncmp (hdp->id, HS_STANDARD_ID, sizeof hdp->id) == 0) { if (isonum_711(hdp->type) != ISO_VD_PRIMARY) goto out_freebh; sbi->s_high_sierra = 1; opt->rock = 0; h_pri = (struct hs_primary_descriptor *)vdp; goto root_found; } } /* Just skip any volume descriptors we don't recognize */ brelse(bh); bh = NULL; } /* * If we fall through, either no volume descriptor was found, * or else we passed a primary descriptor looking for others. */ if (!pri) goto out_unknown_format; brelse(bh); bh = pri_bh; pri_bh = NULL; root_found: /* We don't support read-write mounts */ if (!sb_rdonly(s)) { error = -EACCES; goto out_freebh; } if (joliet_level && (!pri || !opt->rock)) { /* This is the case of Joliet with the norock mount flag. * A disc with both Joliet and Rock Ridge is handled later */ pri = (struct iso_primary_descriptor *) sec; } if(sbi->s_high_sierra){ rootp = (struct iso_directory_record *) h_pri->root_directory_record; sbi->s_nzones = isonum_733(h_pri->volume_space_size); sbi->s_log_zone_size = isonum_723(h_pri->logical_block_size); sbi->s_max_size = isonum_733(h_pri->volume_space_size); } else { if (!pri) goto out_freebh; rootp = (struct iso_directory_record *) pri->root_directory_record; sbi->s_nzones = isonum_733(pri->volume_space_size); sbi->s_log_zone_size = isonum_723(pri->logical_block_size); sbi->s_max_size = isonum_733(pri->volume_space_size); } sbi->s_ninodes = 0; /* No way to figure this out easily */ orig_zonesize = sbi->s_log_zone_size; /* * If the zone size is smaller than the hardware sector size, * this is a fatal error. This would occur if the disc drive * had sectors that were 2048 bytes, but the filesystem had * blocks that were 512 bytes (which should only very rarely * happen.) 
*/ if (orig_zonesize < opt->blocksize) goto out_bad_size; /* RDE: convert log zone size to bit shift */ switch (sbi->s_log_zone_size) { case 512: sbi->s_log_zone_size = 9; break; case 1024: sbi->s_log_zone_size = 10; break; case 2048: sbi->s_log_zone_size = 11; break; default: goto out_bad_zone_size; } s->s_magic = ISOFS_SUPER_MAGIC; /* * With multi-extent files, file size is only limited by the maximum * size of a file system, which is 8 TB. */ s->s_maxbytes = 0x80000000000LL; /* ECMA-119 timestamp from 1900/1/1 with tz offset */ s->s_time_min = mktime64(1900, 1, 1, 0, 0, 0) - MAX_TZ_OFFSET; s->s_time_max = mktime64(U8_MAX+1900, 12, 31, 23, 59, 59) + MAX_TZ_OFFSET; /* Set this for reference. Its not currently used except on write which we don't have .. */ first_data_zone = isonum_733(rootp->extent) + isonum_711(rootp->ext_attr_length); sbi->s_firstdatazone = first_data_zone; #ifndef BEQUIET printk(KERN_DEBUG "ISOFS: Max size:%ld Log zone size:%ld\n", sbi->s_max_size, 1UL << sbi->s_log_zone_size); printk(KERN_DEBUG "ISOFS: First datazone:%ld\n", sbi->s_firstdatazone); if(sbi->s_high_sierra) printk(KERN_DEBUG "ISOFS: Disc in High Sierra format.\n"); #endif /* * If the Joliet level is set, we _may_ decide to use the * secondary descriptor, but can't be sure until after we * read the root inode. But before reading the root inode * we may need to change the device blocksize, and would * rather release the old buffer first. So, we cache the * first_data_zone value from the secondary descriptor. */ if (joliet_level) { pri = (struct iso_primary_descriptor *) sec; rootp = (struct iso_directory_record *) pri->root_directory_record; first_data_zone = isonum_733(rootp->extent) + isonum_711(rootp->ext_attr_length); } /* * We're all done using the volume descriptor, and may need * to change the device blocksize, so release the buffer now. */ brelse(pri_bh); brelse(bh); /* * Force the blocksize to 512 for 512 byte sectors. The file * read primitives really get it wrong in a bad way if we don't * do this. * * Note - we should never be setting the blocksize to something * less than the hardware sector size for the device. If we * do, we would end up having to read larger buffers and split * out portions to satisfy requests. * * Note2- the idea here is that we want to deal with the optimal * zonesize in the filesystem. If we have it set to something less, * then we have horrible problems with trying to piece together * bits of adjacent blocks in order to properly read directory * entries. By forcing the blocksize in this way, we ensure * that we will never be required to do this. */ sb_set_blocksize(s, orig_zonesize); sbi->s_nls_iocharset = NULL; #ifdef CONFIG_JOLIET if (joliet_level) { char *p = opt->iocharset ? opt->iocharset : CONFIG_NLS_DEFAULT; if (strcmp(p, "utf8") != 0) { sbi->s_nls_iocharset = opt->iocharset ? load_nls(opt->iocharset) : load_nls_default(); if (!sbi->s_nls_iocharset) goto out_freesbi; } } #endif s->s_op = &isofs_sops; s->s_export_op = &isofs_export_ops; sbi->s_mapping = opt->map; sbi->s_rock = (opt->rock ? 
2 : 0); sbi->s_rock_offset = -1; /* initial offset, will guess until SP is found*/ sbi->s_cruft = opt->cruft; sbi->s_hide = opt->hide; sbi->s_showassoc = opt->showassoc; sbi->s_uid = opt->uid; sbi->s_gid = opt->gid; sbi->s_uid_set = opt->uid_set; sbi->s_gid_set = opt->gid_set; sbi->s_nocompress = opt->nocompress; sbi->s_overriderockperm = opt->overriderockperm; /* * It would be incredibly stupid to allow people to mark every file * on the disk as suid, so we merely allow them to set the default * permissions. */ if (opt->fmode != ISOFS_INVALID_MODE) sbi->s_fmode = opt->fmode & 0777; else sbi->s_fmode = ISOFS_INVALID_MODE; if (opt->dmode != ISOFS_INVALID_MODE) sbi->s_dmode = opt->dmode & 0777; else sbi->s_dmode = ISOFS_INVALID_MODE; /* * Read the root inode, which _may_ result in changing * the s_rock flag. Once we have the final s_rock value, * we then decide whether to use the Joliet descriptor. */ inode = isofs_iget(s, sbi->s_firstdatazone, 0); /* * Fix for broken CDs with a corrupt root inode but a correct Joliet * root directory. */ if (IS_ERR(inode)) { if (joliet_level && sbi->s_firstdatazone != first_data_zone) { printk(KERN_NOTICE "ISOFS: root inode is unusable. " "Disabling Rock Ridge and switching to Joliet."); sbi->s_rock = 0; inode = NULL; } else { goto out_no_root; } } /* * Fix for broken CDs with Rock Ridge and empty ISO root directory but * correct Joliet root directory. */ if (sbi->s_rock == 1 && joliet_level && rootdir_empty(s, sbi->s_firstdatazone)) { printk(KERN_NOTICE "ISOFS: primary root directory is empty. " "Disabling Rock Ridge and switching to Joliet."); sbi->s_rock = 0; } /* * If this disk has both Rock Ridge and Joliet on it, then we * want to use Rock Ridge by default. This can be overridden * by using the norock mount option. There is still one other * possibility that is not taken into account: a Rock Ridge * CD with Unicode names. Until someone sees such a beast, it * will not be supported. */ if (sbi->s_rock == 1) { joliet_level = 0; } else if (joliet_level) { sbi->s_rock = 0; if (sbi->s_firstdatazone != first_data_zone) { sbi->s_firstdatazone = first_data_zone; printk(KERN_DEBUG "ISOFS: changing to secondary root\n"); iput(inode); inode = isofs_iget(s, sbi->s_firstdatazone, 0); if (IS_ERR(inode)) goto out_no_root; } } if (opt->check == 'u') { /* Only Joliet is case insensitive by default */ if (joliet_level) opt->check = 'r'; else opt->check = 's'; } sbi->s_joliet_level = joliet_level; /* Make sure the root inode is a directory */ if (!S_ISDIR(inode->i_mode)) { printk(KERN_WARNING "isofs_fill_super: root inode is not a directory. " "Corrupted media?\n"); goto out_iput; } table = 0; if (joliet_level) table += 2; if (opt->check == 'r') table++; sbi->s_check = opt->check; if (table) set_default_d_op(s, &isofs_dentry_ops[table - 1]); /* get the root dentry */ s->s_root = d_make_root(inode); if (!(s->s_root)) { error = -ENOMEM; goto out_no_inode; } return 0; /* * Display error messages and free resources. 
*/ out_iput: iput(inode); goto out_no_inode; out_no_root: error = PTR_ERR(inode); if (error != -ENOMEM) printk(KERN_WARNING "%s: get root inode failed\n", __func__); out_no_inode: #ifdef CONFIG_JOLIET unload_nls(sbi->s_nls_iocharset); #endif goto out_freesbi; out_no_read: printk(KERN_WARNING "%s: bread failed, dev=%s, iso_blknum=%d, block=%d\n", __func__, s->s_id, iso_blknum, block); goto out_freebh; out_bad_zone_size: printk(KERN_WARNING "ISOFS: Bad logical zone size %ld\n", sbi->s_log_zone_size); goto out_freebh; out_bad_size: printk(KERN_WARNING "ISOFS: Logical zone size(%d) < hardware blocksize(%u)\n", orig_zonesize, opt->blocksize); goto out_freebh; out_unknown_format: if (!silent) printk(KERN_WARNING "ISOFS: Unable to identify CD-ROM format.\n"); out_freebh: brelse(bh); brelse(pri_bh); out_freesbi: kfree(sbi); s->s_fs_info = NULL; return error; } static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = ISOFS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = (ISOFS_SB(sb)->s_nzones << (ISOFS_SB(sb)->s_log_zone_size - sb->s_blocksize_bits)); buf->f_bfree = 0; buf->f_bavail = 0; buf->f_files = ISOFS_SB(sb)->s_ninodes; buf->f_ffree = 0; buf->f_fsid = u64_to_fsid(id); buf->f_namelen = NAME_MAX; return 0; } /* * Get a set of blocks; filling in buffer_heads if already allocated * or getblk() if they are not. Returns the number of blocks inserted * (-ve == error.) */ int isofs_get_blocks(struct inode *inode, sector_t iblock, struct buffer_head **bh, unsigned long nblocks) { unsigned long b_off = iblock; unsigned offset, sect_size; unsigned int firstext; unsigned long nextblk, nextoff; int section, rv, error; struct iso_inode_info *ei = ISOFS_I(inode); error = -EIO; rv = 0; if (iblock != b_off) { printk(KERN_DEBUG "%s: block number too large\n", __func__); goto abort; } offset = 0; firstext = ei->i_first_extent; sect_size = ei->i_section_size >> ISOFS_BUFFER_BITS(inode); nextblk = ei->i_next_section_block; nextoff = ei->i_next_section_offset; section = 0; while (nblocks) { /* If we are *way* beyond the end of the file, print a message. * Access beyond the end of the file up to the next page boundary * is normal, however because of the way the page cache works. * In this case, we just return 0 so that we can properly fill * the page with useless information without generating any * I/O errors. */ if (b_off > ((inode->i_size + PAGE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) { printk(KERN_DEBUG "%s: block >= EOF (%lu, %llu)\n", __func__, b_off, (unsigned long long)inode->i_size); goto abort; } /* On the last section, nextblk == 0, section size is likely to * exceed sect_size by a partial block, and access beyond the * end of the file will reach beyond the section size, too. */ while (nextblk && (b_off >= (offset + sect_size))) { struct inode *ninode; offset += sect_size; ninode = isofs_iget(inode->i_sb, nextblk, nextoff); if (IS_ERR(ninode)) { error = PTR_ERR(ninode); goto abort; } firstext = ISOFS_I(ninode)->i_first_extent; sect_size = ISOFS_I(ninode)->i_section_size >> ISOFS_BUFFER_BITS(ninode); nextblk = ISOFS_I(ninode)->i_next_section_block; nextoff = ISOFS_I(ninode)->i_next_section_offset; iput(ninode); if (++section > 100) { printk(KERN_DEBUG "%s: More than 100 file sections ?!?" 
" aborting...\n", __func__); printk(KERN_DEBUG "%s: block=%lu firstext=%u sect_size=%u " "nextblk=%lu nextoff=%lu\n", __func__, b_off, firstext, (unsigned) sect_size, nextblk, nextoff); goto abort; } } if (*bh) { map_bh(*bh, inode->i_sb, firstext + b_off - offset); } else { *bh = sb_getblk(inode->i_sb, firstext+b_off-offset); if (!*bh) goto abort; } bh++; /* Next buffer head */ b_off++; /* Next buffer offset */ nblocks--; rv++; } error = 0; abort: return rv != 0 ? rv : error; } /* * Used by the standard interfaces. */ static int isofs_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int ret; if (create) { printk(KERN_DEBUG "%s: Kernel tries to allocate a block\n", __func__); return -EROFS; } ret = isofs_get_blocks(inode, iblock, &bh_result, 1); return ret < 0 ? ret : 0; } static int isofs_bmap(struct inode *inode, sector_t block) { struct buffer_head dummy; int error; dummy.b_state = 0; dummy.b_blocknr = -1000; error = isofs_get_block(inode, block, &dummy, 0); if (!error) return dummy.b_blocknr; return 0; } struct buffer_head *isofs_bread(struct inode *inode, sector_t block) { sector_t blknr = isofs_bmap(inode, block); if (!blknr) return NULL; return sb_bread(inode->i_sb, blknr); } static int isofs_read_folio(struct file *file, struct folio *folio) { return mpage_read_folio(folio, isofs_get_block); } static void isofs_readahead(struct readahead_control *rac) { mpage_readahead(rac, isofs_get_block); } static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping,block,isofs_get_block); } static const struct address_space_operations isofs_aops = { .read_folio = isofs_read_folio, .readahead = isofs_readahead, .bmap = _isofs_bmap }; static int isofs_read_level3_size(struct inode *inode) { unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); int high_sierra = ISOFS_SB(inode->i_sb)->s_high_sierra; struct buffer_head *bh = NULL; unsigned long block, offset, block_saved, offset_saved; int i = 0; int more_entries = 0; struct iso_directory_record *tmpde = NULL; struct iso_inode_info *ei = ISOFS_I(inode); inode->i_size = 0; /* The first 16 blocks are reserved as the System Area. Thus, * no inodes can appear in block 0. We use this to flag that * this is the last section. 
*/ ei->i_next_section_block = 0; ei->i_next_section_offset = 0; block = ei->i_iget5_block; offset = ei->i_iget5_offset; do { struct iso_directory_record *de; unsigned int de_len; if (!bh) { bh = sb_bread(inode->i_sb, block); if (!bh) goto out_noread; } de = (struct iso_directory_record *) (bh->b_data + offset); de_len = *(unsigned char *) de; if (de_len == 0) { brelse(bh); bh = NULL; ++block; offset = 0; continue; } block_saved = block; offset_saved = offset; offset += de_len; /* Make sure we have a full directory entry */ if (offset >= bufsize) { int slop = bufsize - offset + de_len; if (!tmpde) { tmpde = kmalloc(256, GFP_KERNEL); if (!tmpde) goto out_nomem; } memcpy(tmpde, de, slop); offset &= bufsize - 1; block++; brelse(bh); bh = NULL; if (offset) { bh = sb_bread(inode->i_sb, block); if (!bh) goto out_noread; memcpy((void *)tmpde+slop, bh->b_data, offset); } de = tmpde; } inode->i_size += isonum_733(de->size); if (i == 1) { ei->i_next_section_block = block_saved; ei->i_next_section_offset = offset_saved; } more_entries = de->flags[-high_sierra] & 0x80; i++; if (i > 100) goto out_toomany; } while (more_entries); out: kfree(tmpde); brelse(bh); return 0; out_nomem: brelse(bh); return -ENOMEM; out_noread: printk(KERN_INFO "ISOFS: unable to read i-node block %lu\n", block); kfree(tmpde); return -EIO; out_toomany: printk(KERN_INFO "%s: More than 100 file sections ?!?, aborting...\n" "isofs_read_level3_size: inode=%llu\n", __func__, inode->i_ino); goto out; } static int isofs_read_inode(struct inode *inode, int relocated) { struct super_block *sb = inode->i_sb; struct isofs_sb_info *sbi = ISOFS_SB(sb); unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); unsigned long block; int high_sierra = sbi->s_high_sierra; struct buffer_head *bh; struct iso_directory_record *de; struct iso_directory_record *tmpde = NULL; unsigned int de_len; unsigned long offset; struct iso_inode_info *ei = ISOFS_I(inode); int ret = -EIO; struct timespec64 ts; block = ei->i_iget5_block; bh = sb_bread(inode->i_sb, block); if (!bh) goto out_badread; offset = ei->i_iget5_offset; de = (struct iso_directory_record *) (bh->b_data + offset); de_len = *(unsigned char *) de; if (de_len < sizeof(struct iso_directory_record)) goto fail; if (offset + de_len > bufsize) { int frag1 = bufsize - offset; tmpde = kmalloc(de_len, GFP_KERNEL); if (!tmpde) { ret = -ENOMEM; goto fail; } memcpy(tmpde, bh->b_data + offset, frag1); brelse(bh); bh = sb_bread(inode->i_sb, ++block); if (!bh) goto out_badread; memcpy((char *)tmpde+frag1, bh->b_data, de_len - frag1); de = tmpde; } inode->i_ino = isofs_get_ino(ei->i_iget5_block, ei->i_iget5_offset, ISOFS_BUFFER_BITS(inode)); /* Assume it is a normal-format file unless told otherwise */ ei->i_file_format = isofs_file_normal; if (de->flags[-high_sierra] & 2) { if (sbi->s_dmode != ISOFS_INVALID_MODE) inode->i_mode = S_IFDIR | sbi->s_dmode; else inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; set_nlink(inode, 1); /* * Set to 1. We know there are 2, but * the find utility tries to optimize * if it is 2, and it screws up. It is * easier to give 1 which tells find to * do it the hard way. */ } else { if (sbi->s_fmode != ISOFS_INVALID_MODE) { inode->i_mode = S_IFREG | sbi->s_fmode; } else { /* * Set default permissions: r-x for all. The disc * could be shared with DOS machines so virtually * anything could be a valid executable. 
*/ inode->i_mode = S_IFREG | S_IRUGO | S_IXUGO; } set_nlink(inode, 1); } inode->i_uid = sbi->s_uid; inode->i_gid = sbi->s_gid; inode->i_blocks = 0; ei->i_format_parm[0] = 0; ei->i_format_parm[1] = 0; ei->i_format_parm[2] = 0; ei->i_section_size = isonum_733(de->size); if (de->flags[-high_sierra] & 0x80) { ret = isofs_read_level3_size(inode); if (ret < 0) goto fail; ret = -EIO; } else { ei->i_next_section_block = 0; ei->i_next_section_offset = 0; inode->i_size = isonum_733(de->size); } /* * Some dipshit decided to store some other bit of information * in the high byte of the file length. Truncate size in case * this CDROM was mounted with the cruft option. */ if (sbi->s_cruft) inode->i_size &= 0x00ffffff; if (de->interleave[0]) { printk(KERN_DEBUG "ISOFS: Interleaved files not (yet) supported.\n"); inode->i_size = 0; } /* I have no idea what file_unit_size is used for, so we will flag it for now */ if (de->file_unit_size[0] != 0) { printk(KERN_DEBUG "ISOFS: File unit size != 0 for ISO file (%llu).\n", inode->i_ino); } /* I have no idea what other flag bits are used for, so we will flag it for now */ #ifdef DEBUG if((de->flags[-high_sierra] & ~2)!= 0){ printk(KERN_DEBUG "ISOFS: Unusual flag settings for ISO file " "(%llu %x).\n", inode->i_ino, de->flags[-high_sierra]); } #endif ts = iso_date(de->date, high_sierra ? ISO_DATE_HIGH_SIERRA : 0); inode_set_ctime_to_ts(inode, ts); inode_set_atime_to_ts(inode, ts); inode_set_mtime_to_ts(inode, ts); ei->i_first_extent = (isonum_733(de->extent) + isonum_711(de->ext_attr_length)); /* Set the number of blocks for stat() - should be done before RR */ inode->i_blocks = (inode->i_size + 511) >> 9; /* * Now test for possible Rock Ridge extensions which will override * some of these numbers in the inode structure. */ if (!high_sierra) { parse_rock_ridge_inode(de, inode, relocated); /* if we want uid/gid set, override the rock ridge setting */ if (sbi->s_uid_set) inode->i_uid = sbi->s_uid; if (sbi->s_gid_set) inode->i_gid = sbi->s_gid; } /* Now set final access rights if overriding rock ridge setting */ if (S_ISDIR(inode->i_mode) && sbi->s_overriderockperm && sbi->s_dmode != ISOFS_INVALID_MODE) inode->i_mode = S_IFDIR | sbi->s_dmode; if (S_ISREG(inode->i_mode) && sbi->s_overriderockperm && sbi->s_fmode != ISOFS_INVALID_MODE) inode->i_mode = S_IFREG | sbi->s_fmode; /* Install the inode operations vector */ if (S_ISREG(inode->i_mode)) { inode->i_fop = &generic_ro_fops; switch (ei->i_file_format) { #ifdef CONFIG_ZISOFS case isofs_file_compressed: inode->i_data.a_ops = &zisofs_aops; break; #endif default: inode->i_data.a_ops = &isofs_aops; break; } } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &isofs_dir_inode_operations; inode->i_fop = &isofs_dir_operations; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &page_symlink_inode_operations; inode_nohighmem(inode); inode->i_data.a_ops = &isofs_symlink_aops; } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { /* XXX - parse_rock_ridge_inode() had already set i_rdev. 
*/ init_special_inode(inode, inode->i_mode, inode->i_rdev); } else { printk(KERN_DEBUG "ISOFS: Invalid file type 0%04o for inode %llu.\n", inode->i_mode, inode->i_ino); ret = -EIO; goto fail; } ret = 0; out: kfree(tmpde); brelse(bh); return ret; out_badread: printk(KERN_WARNING "ISOFS: unable to read i-node block\n"); fail: goto out; } struct isofs_iget5_callback_data { unsigned long block; unsigned long offset; }; static int isofs_iget5_test(struct inode *ino, void *data) { struct iso_inode_info *i = ISOFS_I(ino); struct isofs_iget5_callback_data *d = (struct isofs_iget5_callback_data*)data; return (i->i_iget5_block == d->block) && (i->i_iget5_offset == d->offset); } static int isofs_iget5_set(struct inode *ino, void *data) { struct iso_inode_info *i = ISOFS_I(ino); struct isofs_iget5_callback_data *d = (struct isofs_iget5_callback_data*)data; i->i_iget5_block = d->block; i->i_iget5_offset = d->offset; return 0; } /* Store, in the inode's containing structure, the block and block * offset that point to the underlying meta-data for the inode. The * code below is otherwise similar to the iget() code in * include/linux/fs.h */ struct inode *__isofs_iget(struct super_block *sb, unsigned long block, unsigned long offset, int relocated) { unsigned long hashval; struct inode *inode; struct isofs_iget5_callback_data data; long ret; if (offset >= 1ul << sb->s_blocksize_bits) return ERR_PTR(-EINVAL); data.block = block; data.offset = offset; hashval = (block << sb->s_blocksize_bits) | offset; inode = iget5_locked(sb, hashval, &isofs_iget5_test, &isofs_iget5_set, &data); if (!inode) return ERR_PTR(-ENOMEM); if (inode_state_read_once(inode) & I_NEW) { ret = isofs_read_inode(inode, relocated); if (ret < 0) { iget_failed(inode); inode = ERR_PTR(ret); } else { unlock_new_inode(inode); } } return inode; } static int isofs_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, isofs_fill_super); } static void isofs_free_fc(struct fs_context *fc) { struct isofs_options *opt = fc->fs_private; kfree(opt->iocharset); kfree(opt); } static const struct fs_context_operations isofs_context_ops = { .parse_param = isofs_parse_param, .get_tree = isofs_get_tree, .reconfigure = isofs_reconfigure, .free = isofs_free_fc, }; static int isofs_init_fs_context(struct fs_context *fc) { struct isofs_options *opt; opt = kzalloc_obj(*opt); if (!opt) return -ENOMEM; opt->map = 'n'; opt->rock = 1; opt->joliet = 1; opt->cruft = 0; opt->hide = 0; opt->showassoc = 0; opt->check = 'u'; /* unset */ opt->nocompress = 0; opt->blocksize = 1024; opt->fmode = opt->dmode = ISOFS_INVALID_MODE; opt->uid_set = 0; opt->gid_set = 0; opt->gid = GLOBAL_ROOT_GID; opt->uid = GLOBAL_ROOT_UID; opt->iocharset = NULL; opt->overriderockperm = 0; opt->session = -1; opt->sbsector = -1; fc->fs_private = opt; fc->ops = &isofs_context_ops; return 0; } static struct file_system_type iso9660_fs_type = { .owner = THIS_MODULE, .name = "iso9660", .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, .init_fs_context = isofs_init_fs_context, .parameters = isofs_param_spec, }; MODULE_ALIAS_FS("iso9660"); MODULE_ALIAS("iso9660"); static int __init init_iso9660_fs(void) { int err = init_inodecache(); if (err) goto out; #ifdef CONFIG_ZISOFS err = zisofs_init(); if (err) goto out1; #endif err = register_filesystem(&iso9660_fs_type); if (err) goto out2; return 0; out2: #ifdef CONFIG_ZISOFS zisofs_cleanup(); out1: #endif destroy_inodecache(); out: return err; } static void __exit exit_iso9660_fs(void) { unregister_filesystem(&iso9660_fs_type); #ifdef CONFIG_ZISOFS 
zisofs_cleanup(); #endif destroy_inodecache(); } module_init(init_iso9660_fs) module_exit(exit_iso9660_fs) MODULE_DESCRIPTION("ISO 9660 CDROM file system support"); MODULE_LICENSE("GPL");
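/*
 * Illustrative sketch (not part of the kernel sources): __isofs_iget() above
 * refuses any offset that does not fit below the block size and keys the
 * inode cache with hashval = (block << s_blocksize_bits) | offset.  The
 * standalone program below replays that packing in user space; the names
 * pack_location and ISO_BLOCKSIZE_BITS are made up for the example.  With
 * the range check in place, distinct (block, offset) pairs always produce
 * distinct packed values.
 */
#include <stdio.h>
#include <stdlib.h>

#define ISO_BLOCKSIZE_BITS 11	/* assume a 2048-byte logical block */

/* Pack (block, offset) the same way __isofs_iget() builds its hash value. */
static unsigned long pack_location(unsigned long block, unsigned long offset)
{
	if (offset >= 1UL << ISO_BLOCKSIZE_BITS) {
		fprintf(stderr, "offset %lu does not fit in one block\n", offset);
		exit(EXIT_FAILURE);
	}
	return (block << ISO_BLOCKSIZE_BITS) | offset;
}

int main(void)
{
	/* Two directory records in the same block keep distinct keys... */
	printf("block 16, offset 0  -> %#lx\n", pack_location(16, 0));
	printf("block 16, offset 68 -> %#lx\n", pack_location(16, 68));
	/* ...and a record in the next block cannot alias either of them. */
	printf("block 17, offset 0  -> %#lx\n", pack_location(17, 0));
	return 0;
}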
// SPDX-License-Identifier: LGPL-2.1 /* * Copyright (c) 2012 Taobao. * Written by Tao Ma <boyu.mt@taobao.com> */ #include <linux/iomap.h> #include <linux/fiemap.h> #include <linux/namei.h> #include <linux/iversion.h> #include <linux/sched/mm.h> #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" #include "truncate.h" #define EXT4_XATTR_SYSTEM_DATA "data" #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) #define EXT4_INLINE_DOTDOT_OFFSET 2 #define EXT4_INLINE_DOTDOT_SIZE 4 static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, struct inode *inode, void **fsdata); static int ext4_get_inline_size(struct inode *inode) { if (EXT4_I(inode)->i_inline_off) return EXT4_I(inode)->i_inline_size; return 0; } static int get_max_inline_xattr_value_size(struct inode *inode, struct ext4_iloc *iloc) { struct ext4_xattr_ibody_header *header; struct ext4_xattr_entry *entry; struct ext4_inode *raw_inode; void *end; int free, min_offs; if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) return 0; min_offs = EXT4_SB(inode->i_sb)->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE - EXT4_I(inode)->i_extra_isize - sizeof(struct ext4_xattr_ibody_header); /* * We need to subtract another sizeof(__u32) since an in-inode xattr * needs an empty 4 bytes to indicate the gap between the xattr entry * and the name/value pair. */ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) return EXT4_XATTR_SIZE(min_offs - EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) - EXT4_XATTR_ROUND - sizeof(__u32)); raw_inode = ext4_raw_inode(iloc); header = IHDR(inode, raw_inode); entry = IFIRST(header); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; /* Compute min_offs.
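 * Walk every xattr entry and take the smallest value offset still in use; the space available for the inline-data value ends there.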
*/ while (!IS_LAST_ENTRY(entry)) { void *next = EXT4_XATTR_NEXT(entry); if (next >= end) { EXT4_ERROR_INODE(inode, "corrupt xattr in inline inode"); return 0; } if (!entry->e_value_inum && entry->e_value_size) { size_t offs = le16_to_cpu(entry->e_value_offs); if (offs < min_offs) min_offs = offs; } entry = next; } free = min_offs - ((void *)entry - (void *)IFIRST(header)) - sizeof(__u32); if (EXT4_I(inode)->i_inline_off) { entry = (struct ext4_xattr_entry *) ((void *)raw_inode + EXT4_I(inode)->i_inline_off); free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)); goto out; } free -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)); if (free > EXT4_XATTR_ROUND) free = EXT4_XATTR_SIZE(free - EXT4_XATTR_ROUND); else free = 0; out: return free; } /* * Get the maximum size we now can store in an inode. * If we can't find the space for a xattr entry, don't use the space * of the extents since we have no space to indicate the inline data. */ int ext4_get_max_inline_size(struct inode *inode) { int error, max_inline_size; struct ext4_iloc iloc; if (EXT4_I(inode)->i_extra_isize == 0) return 0; error = ext4_get_inode_loc(inode, &iloc); if (error) { ext4_error_inode_err(inode, __func__, __LINE__, 0, -error, "can't get inode location %llu", inode->i_ino); return 0; } down_read(&EXT4_I(inode)->xattr_sem); max_inline_size = get_max_inline_xattr_value_size(inode, &iloc); up_read(&EXT4_I(inode)->xattr_sem); brelse(iloc.bh); if (!max_inline_size) return 0; return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE; } /* * this function does not take xattr_sem, which is OK because it is * currently only used in a code path coming form ext4_iget, before * the new inode has been unlocked */ int ext4_find_inline_data_nolock(struct inode *inode) { struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, }; int error; if (EXT4_I(inode)->i_extra_isize == 0) return 0; error = ext4_get_inode_loc(inode, &is.iloc); if (error) return error; error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto out; if (!is.s.not_found) { if (is.s.here->e_value_inum) { EXT4_ERROR_INODE(inode, "inline data xattr refers " "to an external xattr inode"); error = -EFSCORRUPTED; goto out; } EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - (void *)ext4_raw_inode(&is.iloc)); EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + le32_to_cpu(is.s.here->e_value_size); } out: brelse(is.iloc.bh); return error; } static int ext4_read_inline_data(struct inode *inode, void *buffer, unsigned int len, struct ext4_iloc *iloc) { struct ext4_xattr_entry *entry; struct ext4_xattr_ibody_header *header; int cp_len = 0; struct ext4_inode *raw_inode; if (!len) return 0; BUG_ON(len > EXT4_I(inode)->i_inline_size); cp_len = min_t(unsigned int, len, EXT4_MIN_INLINE_DATA_SIZE); raw_inode = ext4_raw_inode(iloc); memcpy(buffer, (void *)(raw_inode->i_block), cp_len); len -= cp_len; buffer += cp_len; if (!len) goto out; header = IHDR(inode, raw_inode); entry = (struct ext4_xattr_entry *)((void *)raw_inode + EXT4_I(inode)->i_inline_off); len = min_t(unsigned int, len, (unsigned int)le32_to_cpu(entry->e_value_size)); memcpy(buffer, (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len); cp_len += len; out: return cp_len; } /* * write the buffer to the inline inode. * If 'create' is set, we don't need to do the extra copy in the xattr * value since it is already handled by ext4_xattr_ibody_set. * That saves us one memcpy. 
*/ static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, void *buffer, loff_t pos, unsigned int len) { struct ext4_xattr_entry *entry; struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; int cp_len = 0; if (unlikely(ext4_emergency_state(inode->i_sb))) return; BUG_ON(!EXT4_I(inode)->i_inline_off); BUG_ON(pos + len > EXT4_I(inode)->i_inline_size); raw_inode = ext4_raw_inode(iloc); buffer += pos; if (pos < EXT4_MIN_INLINE_DATA_SIZE) { cp_len = pos + len > EXT4_MIN_INLINE_DATA_SIZE ? EXT4_MIN_INLINE_DATA_SIZE - pos : len; memcpy((void *)raw_inode->i_block + pos, buffer, cp_len); len -= cp_len; buffer += cp_len; pos += cp_len; } if (!len) return; pos -= EXT4_MIN_INLINE_DATA_SIZE; header = IHDR(inode, raw_inode); entry = (struct ext4_xattr_entry *)((void *)raw_inode + EXT4_I(inode)->i_inline_off); memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos, buffer, len); } static int ext4_create_inline_data(handle_t *handle, struct inode *inode, unsigned len) { int error; void *value = NULL; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, }; error = ext4_get_inode_loc(inode, &is.iloc); if (error) return error; BUFFER_TRACE(is.iloc.bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh, EXT4_JTR_NONE); if (error) goto out; if (len > EXT4_MIN_INLINE_DATA_SIZE) { value = EXT4_ZERO_XATTR_VALUE; len -= EXT4_MIN_INLINE_DATA_SIZE; } else { value = ""; len = 0; } /* Insert the xttr entry. */ i.value = value; i.value_len = len; error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto out; if (!is.s.not_found) { EXT4_ERROR_INODE(inode, "unexpected inline data xattr"); error = -EFSCORRUPTED; goto out; } error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (error) { if (error == -ENOSPC) ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); goto out; } memset((void *)ext4_raw_inode(&is.iloc)->i_block, 0, EXT4_MIN_INLINE_DATA_SIZE); EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - (void *)ext4_raw_inode(&is.iloc)); EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE; ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); out: brelse(is.iloc.bh); return error; } static int ext4_update_inline_data(handle_t *handle, struct inode *inode, unsigned int len) { int error; void *value = NULL; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, }; /* If the old space is ok, write the data directly. */ if (len <= EXT4_I(inode)->i_inline_size) return 0; error = ext4_get_inode_loc(inode, &is.iloc); if (error) return error; error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto out; if (is.s.not_found) { EXT4_ERROR_INODE(inode, "missing inline data xattr"); error = -EFSCORRUPTED; goto out; } len -= EXT4_MIN_INLINE_DATA_SIZE; value = kzalloc(len, GFP_NOFS); if (!value) { error = -ENOMEM; goto out; } error = ext4_xattr_ibody_get(inode, i.name_index, i.name, value, len); if (error < 0) goto out; BUFFER_TRACE(is.iloc.bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh, EXT4_JTR_NONE); if (error) goto out; /* Update the xattr entry. 
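 * The buffer passed in as the value already holds the old inline bytes read back above, padded out to the new length, so resizing the entry preserves the existing data.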
*/ i.value = value; i.value_len = len; error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (error) goto out; EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - (void *)ext4_raw_inode(&is.iloc)); EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + le32_to_cpu(is.s.here->e_value_size); ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); out: kfree(value); brelse(is.iloc.bh); return error; } static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, loff_t len) { int ret, size, no_expand; struct ext4_inode_info *ei = EXT4_I(inode); if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) return -ENOSPC; size = ext4_get_max_inline_size(inode); if (size < len) return -ENOSPC; ext4_write_lock_xattr(inode, &no_expand); /* * ei->i_inline_size may have changed since the initial check * if other xattrs were added. Recalculate to ensure * ext4_update_inline_data() validates against current capacity. */ (void) ext4_find_inline_data_nolock(inode); if (ei->i_inline_off) ret = ext4_update_inline_data(handle, inode, len); else ret = ext4_create_inline_data(handle, inode, len); ext4_write_unlock_xattr(inode, &no_expand); return ret; } static int ext4_destroy_inline_data_nolock(handle_t *handle, struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_xattr_ibody_find is = { .s = { .not_found = 0, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, .value = NULL, .value_len = 0, }; int error; if (!ei->i_inline_off) return 0; down_write(&ei->i_data_sem); error = ext4_get_inode_loc(inode, &is.iloc); if (error) { up_write(&ei->i_data_sem); return error; } error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto out; BUFFER_TRACE(is.iloc.bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh, EXT4_JTR_NONE); if (error) goto out; error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (error) goto out; memset((void *)ext4_raw_inode(&is.iloc)->i_block, 0, EXT4_MIN_INLINE_DATA_SIZE); memset(ei->i_data, 0, EXT4_MIN_INLINE_DATA_SIZE); if (ext4_has_feature_extents(inode->i_sb)) { if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) { ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_ext_tree_init(handle, inode); } } ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); EXT4_I(inode)->i_inline_off = 0; EXT4_I(inode)->i_inline_size = 0; ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); out: brelse(is.iloc.bh); if (error == -ENODATA) error = 0; up_write(&ei->i_data_sem); return error; } static int ext4_read_inline_folio(struct inode *inode, struct folio *folio) { void *kaddr; int ret = 0; size_t len; struct ext4_iloc iloc; BUG_ON(!folio_test_locked(folio)); BUG_ON(!ext4_has_inline_data(inode)); BUG_ON(folio->index); if (!EXT4_I(inode)->i_inline_off) { ext4_warning(inode->i_sb, "inode %llu doesn't have inline data.", inode->i_ino); goto out; } ret = ext4_get_inode_loc(inode, &iloc); if (ret) goto out; len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode)); if (len > PAGE_SIZE) { ext4_error_inode(inode, __func__, __LINE__, 0, "inline size %zu exceeds PAGE_SIZE", len); ret = -EFSCORRUPTED; brelse(iloc.bh); goto out; } kaddr = kmap_local_folio(folio, 0); ret = ext4_read_inline_data(inode, kaddr, len, &iloc); kaddr = folio_zero_tail(folio, len, kaddr + len); 
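/* The bytes past the inline data were zeroed just above, so the whole folio is valid and can be marked uptodate once the mapping is dropped. */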
kunmap_local(kaddr); folio_mark_uptodate(folio); brelse(iloc.bh); out: return ret; } int ext4_readpage_inline(struct inode *inode, struct folio *folio) { int ret = 0; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { up_read(&EXT4_I(inode)->xattr_sem); return -EAGAIN; } /* * Current inline data can only exist in the 1st page, * So for all the other pages, just set them uptodate. */ if (!folio->index) ret = ext4_read_inline_folio(inode, folio); else if (!folio_test_uptodate(folio)) { folio_zero_segment(folio, 0, folio_size(folio)); folio_mark_uptodate(folio); } up_read(&EXT4_I(inode)->xattr_sem); folio_unlock(folio); return ret >= 0 ? 0 : ret; } static int ext4_convert_inline_data_to_extent(struct address_space *mapping, struct inode *inode) { int ret, needed_blocks, no_expand; handle_t *handle = NULL; int retries = 0, sem_held = 0; struct folio *folio = NULL; unsigned from, to; struct ext4_iloc iloc; if (!ext4_has_inline_data(inode)) { /* * clear the flag so that no new write * will trap here again. */ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); return 0; } needed_blocks = ext4_chunk_trans_extent(inode, 1); ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; retry: handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); handle = NULL; goto out; } /* We cannot recurse into the filesystem as the transaction is already * started */ folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) { ret = PTR_ERR(folio); goto out_nofolio; } ext4_write_lock_xattr(inode, &no_expand); sem_held = 1; /* If some one has already done this for us, just exit. */ if (!ext4_has_inline_data(inode)) { ret = 0; goto out; } from = 0; to = ext4_get_inline_size(inode); if (!folio_test_uptodate(folio)) { ret = ext4_read_inline_folio(inode, folio); if (ret < 0) goto out; } ext4_fc_track_inode(handle, inode); ret = ext4_destroy_inline_data_nolock(handle, inode); if (ret) goto out; if (ext4_should_dioread_nolock(inode)) { ret = ext4_block_write_begin(handle, folio, from, to, ext4_get_block_unwritten); } else ret = ext4_block_write_begin(handle, folio, from, to, ext4_get_block); clear_buffer_new(folio_buffers(folio)); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, to, NULL, do_journal_get_write_access); } if (ret) { folio_unlock(folio); folio_put(folio); folio = NULL; ext4_orphan_add(handle, inode); ext4_write_unlock_xattr(inode, &no_expand); sem_held = 0; ext4_journal_stop(handle); handle = NULL; ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might * still be on the orphan list; we need to * make sure the inode is removed from the * orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; if (folio) block_commit_write(folio, from, to); out: if (folio) { folio_unlock(folio); folio_put(folio); } out_nofolio: if (sem_held) ext4_write_unlock_xattr(inode, &no_expand); if (handle) ext4_journal_stop(handle); brelse(iloc.bh); return ret; } /* * Prepare the write for the inline data. * If the data can be written into the inode, we just read * the page and make it uptodate, and start the journal. * Otherwise read the page, makes it dirty so that it can be * handle in writepages(the i_disksize update is left to the * normal ext4_da_write_end). 
*/ int ext4_generic_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, struct folio **foliop, void **fsdata, bool da) { int ret; handle_t *handle; struct folio *folio; struct ext4_iloc iloc; int retries = 0; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; retry_journal: handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out_release_bh; } ret = ext4_prepare_inline_data(handle, inode, pos + len); if (ret && ret != -ENOSPC) goto out_stop_journal; if (ret == -ENOSPC) { ext4_journal_stop(handle); if (!da) { brelse(iloc.bh); /* Retry inside */ return ext4_convert_inline_data_to_extent(mapping, inode); } ret = ext4_da_convert_inline_data_to_extent(mapping, inode, fsdata); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_journal; goto out_release_bh; } folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) { ret = PTR_ERR(folio); goto out_stop_journal; } down_read(&EXT4_I(inode)->xattr_sem); /* Someone else had converted it to extent */ if (!ext4_has_inline_data(inode)) { ret = 0; goto out_release_folio; } if (!folio_test_uptodate(folio)) { ret = ext4_read_inline_folio(inode, folio); if (ret < 0) goto out_release_folio; } ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, EXT4_JTR_NONE); if (ret) goto out_release_folio; *foliop = folio; up_read(&EXT4_I(inode)->xattr_sem); brelse(iloc.bh); return 1; out_release_folio: up_read(&EXT4_I(inode)->xattr_sem); folio_unlock(folio); folio_put(folio); out_stop_journal: ext4_journal_stop(handle); out_release_bh: brelse(iloc.bh); return ret; } /* * Try to write data in the inode. * If the inode has inline data, check whether the new write can be * in the inode also. If not, create the page the handle, move the data * to the page make it update and let the later codes create extent for it. */ int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, struct folio **foliop) { if (pos + len > ext4_get_max_inline_size(inode)) return ext4_convert_inline_data_to_extent(mapping, inode); return ext4_generic_write_inline_data(mapping, inode, pos, len, foliop, NULL, false); } int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct folio *folio) { handle_t *handle = ext4_journal_current_handle(); int no_expand; void *kaddr; struct ext4_iloc iloc; int ret = 0, ret2; if (unlikely(copied < len) && !folio_test_uptodate(folio)) copied = 0; if (likely(copied)) { ret = ext4_get_inode_loc(inode, &iloc); if (ret) { folio_unlock(folio); folio_put(folio); ext4_std_error(inode->i_sb, ret); goto out; } ext4_write_lock_xattr(inode, &no_expand); BUG_ON(!ext4_has_inline_data(inode)); /* * ei->i_inline_off may have changed since * ext4_write_begin() called * ext4_try_to_write_inline_data() */ (void) ext4_find_inline_data_nolock(inode); kaddr = kmap_local_folio(folio, 0); ext4_write_inline_data(inode, &iloc, kaddr, pos, copied); kunmap_local(kaddr); folio_mark_uptodate(folio); /* clear dirty flag so that writepages wouldn't work for us. */ folio_clear_dirty(folio); ext4_write_unlock_xattr(inode, &no_expand); brelse(iloc.bh); /* * It's important to update i_size while still holding folio * lock: page writeout could otherwise come in and zero * beyond i_size. 
*/ ext4_update_inode_size(inode, pos + copied); } folio_unlock(folio); folio_put(folio); /* * Don't mark the inode dirty under folio lock. First, it unnecessarily * makes the holding time of folio lock longer. Second, it forces lock * ordering of folio lock and transaction start for journaling * filesystems. */ if (likely(copied)) mark_inode_dirty(inode); out: /* * If we didn't copy as much data as expected, we need to trim back * size of xattr containing inline data. */ if (pos + len > inode->i_size && ext4_can_truncate(inode)) ext4_orphan_add(handle, inode); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; if (pos + len > inode->i_size) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } return ret ? ret : copied; } /* * Try to make the page cache and handle ready for the inline data case. * We can call this function in 2 cases: * 1. The inode is created and the first write exceeds inline size. We can * clear the inode state safely. * 2. The inode has inline data, then we need to read the data, make it * update and dirty so that ext4_da_writepages can handle it. We don't * need to start the journal since the file's metadata isn't changed now. */ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, struct inode *inode, void **fsdata) { int ret = 0, inline_size; struct folio *folio; folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); goto out; } inline_size = ext4_get_inline_size(inode); if (!folio_test_uptodate(folio)) { ret = ext4_read_inline_folio(inode, folio); if (ret < 0) goto out; } ret = ext4_block_write_begin(NULL, folio, 0, inline_size, ext4_da_get_block_prep); if (ret) { up_read(&EXT4_I(inode)->xattr_sem); folio_unlock(folio); folio_put(folio); ext4_truncate_failed_write(inode); return ret; } clear_buffer_new(folio_buffers(folio)); folio_mark_dirty(folio); folio_mark_uptodate(folio); ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); *fsdata = (void *)CONVERT_INLINE_DATA; out: up_read(&EXT4_I(inode)->xattr_sem); if (folio) { folio_unlock(folio); folio_put(folio); } return ret; } #ifdef INLINE_DIR_DEBUG void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, void *inline_start, int inline_size) { int offset; unsigned short de_len; struct ext4_dir_entry_2 *de = inline_start; void *dlimit = inline_start + inline_size; trace_printk("inode %llu\n", dir->i_ino); offset = 0; while ((void *)de < dlimit) { de_len = ext4_rec_len_from_disk(de->rec_len, inline_size); trace_printk("de: off %u rlen %u name %.*s nlen %u ino %u\n", offset, de_len, de->name_len, de->name, de->name_len, le32_to_cpu(de->inode)); if (ext4_check_dir_entry(dir, NULL, de, bh, inline_start, inline_size, offset)) BUG(); offset += de_len; de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); } } #else #define ext4_show_inline_dir(dir, bh, inline_start, inline_size) #endif /* * Add a new entry into a inline dir. * It will return -ENOSPC if no space is available, and -EIO * and -EEXIST if directory entry already exists. 
*/ static int ext4_add_dirent_to_inline(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode, struct ext4_iloc *iloc, void *inline_start, int inline_size) { int err; struct ext4_dir_entry_2 *de; err = ext4_find_dest_de(dir, iloc->bh, inline_start, inline_size, fname, &de); if (err) return err; BUFFER_TRACE(iloc->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, iloc->bh, EXT4_JTR_NONE); if (err) return err; ext4_insert_dentry(dir, inode, de, inline_size, fname); ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); /* * XXX shouldn't update any times until successful * completion of syscall, but too many callers depend * on this. * * XXX similarly, too many callers depend on * ext4_new_inode() setting the times, but error * recovery deletes the inode, so the worst that can * happen is that the times are slightly out of date * and/or different from the directory change time. */ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); ext4_update_dx_flag(dir); inode_inc_iversion(dir); return 1; } static void *ext4_get_inline_xattr_pos(struct inode *inode, struct ext4_iloc *iloc) { struct ext4_xattr_entry *entry; struct ext4_xattr_ibody_header *header; BUG_ON(!EXT4_I(inode)->i_inline_off); header = IHDR(inode, ext4_raw_inode(iloc)); entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) + EXT4_I(inode)->i_inline_off); return (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs); } /* Set the final de to cover the whole block. */ void ext4_update_final_de(void *de_buf, int old_size, int new_size) { struct ext4_dir_entry_2 *de, *prev_de; void *limit; int de_len; de = de_buf; if (old_size) { limit = de_buf + old_size; do { prev_de = de; de_len = ext4_rec_len_from_disk(de->rec_len, old_size); de_buf += de_len; de = de_buf; } while (de_buf < limit); prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size - old_size, new_size); } else { /* this is just created, so create an empty entry. */ de->inode = 0; de->rec_len = ext4_rec_len_to_disk(new_size, new_size); } } static int ext4_update_inline_dir(handle_t *handle, struct inode *dir, struct ext4_iloc *iloc) { int ret; int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; int new_size = get_max_inline_xattr_value_size(dir, iloc); if (new_size - old_size <= ext4_dir_rec_len(1, NULL)) return -ENOSPC; ret = ext4_update_inline_data(handle, dir, new_size + EXT4_MIN_INLINE_DATA_SIZE); if (ret) return ret; ext4_update_final_de(ext4_get_inline_xattr_pos(dir, iloc), old_size, EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE); dir->i_size = EXT4_I(dir)->i_disksize = EXT4_I(dir)->i_inline_size; return 0; } static void ext4_restore_inline_data(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc, void *buf, int inline_size) { int ret; ret = ext4_create_inline_data(handle, inode, inline_size); if (ret) { ext4_msg(inode->i_sb, KERN_EMERG, "error restoring inline_data for inode -- potential data loss! 
(inode %llu, error %d)", inode->i_ino, ret); return; } ext4_write_inline_data(inode, iloc, buf, 0, inline_size); ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); } static int ext4_convert_inline_data_nolock(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { int error; void *buf = NULL; struct buffer_head *data_bh = NULL; struct ext4_map_blocks map; int inline_size; inline_size = ext4_get_inline_size(inode); buf = kmalloc(inline_size, GFP_NOFS); if (!buf) { error = -ENOMEM; goto out; } error = ext4_read_inline_data(inode, buf, inline_size, iloc); if (error < 0) goto out; /* * Make sure the inline directory entries pass checks before we try to * convert them, so that we avoid touching stuff that needs fsck. */ if (S_ISDIR(inode->i_mode)) { error = ext4_check_all_de(inode, iloc->bh, buf + EXT4_INLINE_DOTDOT_SIZE, inline_size - EXT4_INLINE_DOTDOT_SIZE); if (error) goto out; } error = ext4_destroy_inline_data_nolock(handle, inode); if (error) goto out; map.m_lblk = 0; map.m_len = 1; map.m_flags = 0; error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE); if (error < 0) goto out_restore; if (!(map.m_flags & EXT4_MAP_MAPPED)) { error = -EIO; goto out_restore; } data_bh = sb_getblk(inode->i_sb, map.m_pblk); if (!data_bh) { error = -ENOMEM; goto out_restore; } lock_buffer(data_bh); error = ext4_journal_get_create_access(handle, inode->i_sb, data_bh, EXT4_JTR_NONE); if (error) { unlock_buffer(data_bh); error = -EIO; goto out_restore; } memset(data_bh->b_data, 0, inode->i_sb->s_blocksize); if (!S_ISDIR(inode->i_mode)) { memcpy(data_bh->b_data, buf, inline_size); set_buffer_uptodate(data_bh); unlock_buffer(data_bh); error = ext4_handle_dirty_metadata(handle, inode, data_bh); } else { unlock_buffer(data_bh); inode->i_size = inode->i_sb->s_blocksize; i_size_write(inode, inode->i_sb->s_blocksize); EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; error = ext4_init_dirblock(handle, inode, data_bh, le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), buf + EXT4_INLINE_DOTDOT_SIZE, inline_size - EXT4_INLINE_DOTDOT_SIZE); if (!error) error = ext4_mark_inode_dirty(handle, inode); } out_restore: if (error) ext4_restore_inline_data(handle, inode, iloc, buf, inline_size); out: brelse(data_bh); kfree(buf); return error; } /* * Try to add the new entry to the inline data. * If succeeds, return 0. If not, extended the inline dir and copied data to * the new created block. */ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode) { int ret, ret2, inline_size, no_expand; void *inline_start; struct ext4_iloc iloc; ret = ext4_get_inode_loc(dir, &iloc); if (ret) return ret; ext4_write_lock_xattr(dir, &no_expand); if (!ext4_has_inline_data(dir)) goto out; inline_start = (void *)ext4_raw_inode(&iloc)->i_block + EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc, inline_start, inline_size); if (ret != -ENOSPC) goto out; /* check whether it can be inserted to inline xattr space. 
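 * If the xattr part is still empty, try to grow the inline area first; if there is (or now is) room there, retry the insert, and only convert to a regular directory block as a last resort.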
*/ inline_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; if (!inline_size) { /* Try to use the xattr space.*/ ret = ext4_update_inline_dir(handle, dir, &iloc); if (ret && ret != -ENOSPC) goto out; inline_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; } if (inline_size) { inline_start = ext4_get_inline_xattr_pos(dir, &iloc); ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc, inline_start, inline_size); if (ret != -ENOSPC) goto out; } /* * The inline space is filled up, so create a new block for it. * As the extent tree will be created, we have to save the inline * dir first. */ ret = ext4_convert_inline_data_nolock(handle, dir, &iloc); out: ext4_write_unlock_xattr(dir, &no_expand); ret2 = ext4_mark_inode_dirty(handle, dir); if (unlikely(ret2 && !ret)) ret = ret2; brelse(iloc.bh); return ret; } /* * This function fills a red-black tree with information from an * inlined dir. It returns the number directory entries loaded * into the tree. If there is an error it is returned in err. */ int ext4_inlinedir_to_tree(struct file *dir_file, struct inode *dir, ext4_lblk_t block, struct dx_hash_info *hinfo, __u32 start_hash, __u32 start_minor_hash, int *has_inline_data) { int err = 0, count = 0; unsigned int parent_ino; int pos; struct ext4_dir_entry_2 *de; struct inode *inode = file_inode(dir_file); int ret, inline_size = 0; struct ext4_iloc iloc; void *dir_buf = NULL; struct ext4_dir_entry_2 fake; struct fscrypt_str tmp_str; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { up_read(&EXT4_I(inode)->xattr_sem); *has_inline_data = 0; goto out; } inline_size = ext4_get_inline_size(inode); dir_buf = kmalloc(inline_size, GFP_NOFS); if (!dir_buf) { ret = -ENOMEM; up_read(&EXT4_I(inode)->xattr_sem); goto out; } ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); up_read(&EXT4_I(inode)->xattr_sem); if (ret < 0) goto out; pos = 0; parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); while (pos < inline_size) { /* * As inlined dir doesn't store any information about '.' and * only the inode number of '..' is stored, we have to handle * them differently. 
*/ if (pos == 0) { fake.inode = cpu_to_le32(inode->i_ino); fake.name_len = 1; memcpy(fake.name, ".", 2); fake.rec_len = ext4_rec_len_to_disk( ext4_dir_rec_len(fake.name_len, NULL), inline_size); ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); de = &fake; pos = EXT4_INLINE_DOTDOT_OFFSET; } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) { fake.inode = cpu_to_le32(parent_ino); fake.name_len = 2; memcpy(fake.name, "..", 3); fake.rec_len = ext4_rec_len_to_disk( ext4_dir_rec_len(fake.name_len, NULL), inline_size); ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); de = &fake; pos = EXT4_INLINE_DOTDOT_SIZE; } else { de = (struct ext4_dir_entry_2 *)(dir_buf + pos); pos += ext4_rec_len_from_disk(de->rec_len, inline_size); if (ext4_check_dir_entry(inode, dir_file, de, iloc.bh, dir_buf, inline_size, pos)) { ret = count; goto out; } } if (ext4_hash_in_dirent(dir)) { hinfo->hash = EXT4_DIRENT_HASH(de); hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de); } else { err = ext4fs_dirhash(dir, de->name, de->name_len, hinfo); if (err) { ret = err; goto out; } } if ((hinfo->hash < start_hash) || ((hinfo->hash == start_hash) && (hinfo->minor_hash < start_minor_hash))) continue; if (de->inode == 0) continue; tmp_str.name = de->name; tmp_str.len = de->name_len; err = ext4_htree_store_dirent(dir_file, hinfo->hash, hinfo->minor_hash, de, &tmp_str); if (err) { ret = err; goto out; } count++; } ret = count; out: kfree(dir_buf); brelse(iloc.bh); return ret; } /* * So this function is called when the volume is mkfsed with * dir_index disabled. In order to keep f_pos persistent * after we convert from an inlined dir to a blocked based, * we just pretend that we are a normal dir and return the * offset as if '.' and '..' really take place. * */ int ext4_read_inline_dir(struct file *file, struct dir_context *ctx, int *has_inline_data) { unsigned int offset, parent_ino; int i; struct ext4_dir_entry_2 *de; struct super_block *sb; struct inode *inode = file_inode(file); int ret, inline_size = 0; struct ext4_iloc iloc; void *dir_buf = NULL; int dotdot_offset, dotdot_size, extra_offset, extra_size; struct dir_private_info *info = file->private_data; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { up_read(&EXT4_I(inode)->xattr_sem); *has_inline_data = 0; goto out; } inline_size = ext4_get_inline_size(inode); dir_buf = kmalloc(inline_size, GFP_NOFS); if (!dir_buf) { ret = -ENOMEM; up_read(&EXT4_I(inode)->xattr_sem); goto out; } ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); up_read(&EXT4_I(inode)->xattr_sem); if (ret < 0) goto out; ret = 0; sb = inode->i_sb; parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); offset = ctx->pos; /* * dotdot_offset and dotdot_size is the real offset and * size for ".." and "." if the dir is block based while * the real size for them are only EXT4_INLINE_DOTDOT_SIZE. * So we will use extra_offset and extra_size to indicate them * during the inline dir iteration. */ dotdot_offset = ext4_dir_rec_len(1, NULL); dotdot_size = dotdot_offset + ext4_dir_rec_len(2, NULL); extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE; extra_size = extra_offset + inline_size; /* * If the cookie has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the inline * dir to make sure. */ if (!inode_eq_iversion(inode, info->cookie)) { for (i = 0; i < extra_size && i < offset;) { /* * "." is with offset 0 and * ".." is dotdot_offset. 
*/ if (!i) { i = dotdot_offset; continue; } else if (i == dotdot_offset) { i = dotdot_size; continue; } /* for other entry, the real offset in * the buf has to be tuned accordingly. */ de = (struct ext4_dir_entry_2 *) (dir_buf + i - extra_offset); /* It's too expensive to do a full * dirent test each time round this * loop, but we do have to test at * least that it is non-zero. A * failure will be detected in the * dirent test below. */ if (ext4_rec_len_from_disk(de->rec_len, extra_size) < ext4_dir_rec_len(1, NULL)) break; i += ext4_rec_len_from_disk(de->rec_len, extra_size); } offset = i; ctx->pos = offset; info->cookie = inode_query_iversion(inode); } while (ctx->pos < extra_size) { if (ctx->pos == 0) { if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR)) goto out; ctx->pos = dotdot_offset; continue; } if (ctx->pos == dotdot_offset) { if (!dir_emit(ctx, "..", 2, parent_ino, DT_DIR)) goto out; ctx->pos = dotdot_size; continue; } de = (struct ext4_dir_entry_2 *) (dir_buf + ctx->pos - extra_offset); if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf, extra_size, ctx->pos)) goto out; if (le32_to_cpu(de->inode)) { if (!dir_emit(ctx, de->name, de->name_len, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto out; } ctx->pos += ext4_rec_len_from_disk(de->rec_len, extra_size); } out: kfree(dir_buf); brelse(iloc.bh); return ret; } void *ext4_read_inline_link(struct inode *inode) { struct ext4_iloc iloc; int ret, inline_size; void *link; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ERR_PTR(ret); ret = -ENOMEM; inline_size = ext4_get_inline_size(inode); link = kmalloc(inline_size + 1, GFP_NOFS); if (!link) goto out; ret = ext4_read_inline_data(inode, link, inline_size, &iloc); if (ret < 0) { kfree(link); goto out; } nd_terminate_link(link, inode->i_size, ret); out: if (ret < 0) link = ERR_PTR(ret); brelse(iloc.bh); return link; } struct buffer_head *ext4_get_first_inline_block(struct inode *inode, struct ext4_dir_entry_2 **parent_de, int *retval) { struct ext4_iloc iloc; *retval = ext4_get_inode_loc(inode, &iloc); if (*retval) return NULL; *parent_de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; return iloc.bh; } /* * Try to create the inline data for the new dir. * If it succeeds, return 0, otherwise return the error. * In case of ENOSPC, the caller should create the normal disk layout dir. */ int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, struct inode *inode) { int ret, inline_size = EXT4_MIN_INLINE_DATA_SIZE; struct ext4_iloc iloc; struct ext4_dir_entry_2 *de; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; ret = ext4_prepare_inline_data(handle, inode, inline_size); if (ret) goto out; /* * For inline dir, we only save the inode information for the ".." * and create a fake dentry to cover the left space. 
*/ de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; de->inode = cpu_to_le32(parent->i_ino); de = (struct ext4_dir_entry_2 *)((void *)de + EXT4_INLINE_DOTDOT_SIZE); de->inode = 0; de->rec_len = ext4_rec_len_to_disk( inline_size - EXT4_INLINE_DOTDOT_SIZE, inline_size); set_nlink(inode, 2); inode->i_size = EXT4_I(inode)->i_disksize = inline_size; out: brelse(iloc.bh); return ret; } struct buffer_head *ext4_find_inline_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir, int *has_inline_data) { struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, }; int ret; void *inline_start; int inline_size; ret = ext4_get_inode_loc(dir, &is.iloc); if (ret) return ERR_PTR(ret); down_read(&EXT4_I(dir)->xattr_sem); ret = ext4_xattr_ibody_find(dir, &i, &is); if (ret) goto out; if (!ext4_has_inline_data(dir)) { *has_inline_data = 0; goto out; } inline_start = (void *)ext4_raw_inode(&is.iloc)->i_block + EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; ret = ext4_search_dir(is.iloc.bh, inline_start, inline_size, dir, fname, 0, res_dir); if (ret == 1) goto out_find; if (ret < 0) goto out; if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE) goto out; inline_start = ext4_get_inline_xattr_pos(dir, &is.iloc); inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; ret = ext4_search_dir(is.iloc.bh, inline_start, inline_size, dir, fname, 0, res_dir); if (ret == 1) goto out_find; out: brelse(is.iloc.bh); if (ret < 0) is.iloc.bh = ERR_PTR(ret); else is.iloc.bh = NULL; out_find: up_read(&EXT4_I(dir)->xattr_sem); return is.iloc.bh; } int ext4_delete_inline_entry(handle_t *handle, struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, int *has_inline_data) { int err, inline_size, no_expand; struct ext4_iloc iloc; void *inline_start; err = ext4_get_inode_loc(dir, &iloc); if (err) return err; ext4_write_lock_xattr(dir, &no_expand); if (!ext4_has_inline_data(dir)) { *has_inline_data = 0; goto out; } if ((void *)de_del - ((void *)ext4_raw_inode(&iloc)->i_block) < EXT4_MIN_INLINE_DATA_SIZE) { inline_start = (void *)ext4_raw_inode(&iloc)->i_block + EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; } else { inline_start = ext4_get_inline_xattr_pos(dir, &iloc); inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; } BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, bh, EXT4_JTR_NONE); if (err) goto out; err = ext4_generic_delete_entry(dir, de_del, bh, inline_start, inline_size, 0); if (err) goto out; ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size); out: ext4_write_unlock_xattr(dir, &no_expand); if (likely(err == 0)) err = ext4_mark_inode_dirty(handle, dir); brelse(iloc.bh); if (err != -ENOENT) ext4_std_error(dir->i_sb, err); return err; } /* * Get the inline dentry at offset. 
*/ static inline struct ext4_dir_entry_2 * ext4_get_inline_entry(struct inode *inode, struct ext4_iloc *iloc, unsigned int offset, void **inline_start, int *inline_size) { void *inline_pos; BUG_ON(offset > ext4_get_inline_size(inode)); if (offset < EXT4_MIN_INLINE_DATA_SIZE) { inline_pos = (void *)ext4_raw_inode(iloc)->i_block; *inline_size = EXT4_MIN_INLINE_DATA_SIZE; } else { inline_pos = ext4_get_inline_xattr_pos(inode, iloc); offset -= EXT4_MIN_INLINE_DATA_SIZE; *inline_size = ext4_get_inline_size(inode) - EXT4_MIN_INLINE_DATA_SIZE; } if (inline_start) *inline_start = inline_pos; return (struct ext4_dir_entry_2 *)(inline_pos + offset); } bool empty_inline_dir(struct inode *dir, int *has_inline_data) { int err, inline_size; struct ext4_iloc iloc; size_t inline_len; void *inline_pos; unsigned int offset; struct ext4_dir_entry_2 *de; bool ret = false; err = ext4_get_inode_loc(dir, &iloc); if (err) { EXT4_ERROR_INODE_ERR(dir, -err, "error %d getting inode %llu block", err, dir->i_ino); return false; } down_read(&EXT4_I(dir)->xattr_sem); if (!ext4_has_inline_data(dir)) { *has_inline_data = 0; ret = true; goto out; } de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; if (!le32_to_cpu(de->inode)) { ext4_warning(dir->i_sb, "bad inline directory (dir #%llu) - no `..'", dir->i_ino); goto out; } inline_len = ext4_get_inline_size(dir); offset = EXT4_INLINE_DOTDOT_SIZE; while (offset < inline_len) { de = ext4_get_inline_entry(dir, &iloc, offset, &inline_pos, &inline_size); if (ext4_check_dir_entry(dir, NULL, de, iloc.bh, inline_pos, inline_size, offset)) { ext4_warning(dir->i_sb, "bad inline directory (dir #%llu) - " "inode %u, rec_len %u, name_len %d" "inline size %d", dir->i_ino, le32_to_cpu(de->inode), le16_to_cpu(de->rec_len), de->name_len, inline_size); goto out; } if (le32_to_cpu(de->inode)) { goto out; } offset += ext4_rec_len_from_disk(de->rec_len, inline_size); } ret = true; out: up_read(&EXT4_I(dir)->xattr_sem); brelse(iloc.bh); return ret; } int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) { int ret, no_expand; ext4_write_lock_xattr(inode, &no_expand); ret = ext4_destroy_inline_data_nolock(handle, inode); ext4_write_unlock_xattr(inode, &no_expand); return ret; } int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap) { __u64 addr; int error = -EAGAIN; struct ext4_iloc iloc; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) goto out; error = ext4_get_inode_loc(inode, &iloc); if (error) goto out; addr = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; addr += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; addr += offsetof(struct ext4_inode, i_block); brelse(iloc.bh); iomap->addr = addr; iomap->offset = 0; iomap->length = min_t(loff_t, ext4_get_inline_size(inode), i_size_read(inode)); iomap->type = IOMAP_INLINE; iomap->flags = 0; out: up_read(&EXT4_I(inode)->xattr_sem); return error; } int ext4_inline_data_truncate(struct inode *inode, int *has_inline) { handle_t *handle; int inline_size, value_len, needed_blocks, no_expand, err = 0; size_t i_size; void *value = NULL; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, }; needed_blocks = ext4_chunk_trans_extent(inode, 1); handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_write_lock_xattr(inode, &no_expand); if (!ext4_has_inline_data(inode)) { ext4_write_unlock_xattr(inode, 
&no_expand); *has_inline = 0; ext4_journal_stop(handle); return 0; } if ((err = ext4_orphan_add(handle, inode)) != 0) goto out; if ((err = ext4_get_inode_loc(inode, &is.iloc)) != 0) goto out; down_write(&EXT4_I(inode)->i_data_sem); i_size = inode->i_size; inline_size = ext4_get_inline_size(inode); EXT4_I(inode)->i_disksize = i_size; if (i_size < inline_size) { /* * if there's inline data to truncate and this file was * converted to extents after that inline data was written, * the extent status cache must be cleared to avoid leaving * behind stale delayed allocated extent entries */ if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); /* Clear the content in the xattr space. */ if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) { if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0) goto out_error; if (is.s.not_found) { EXT4_ERROR_INODE(inode, "missing inline data xattr"); err = -EFSCORRUPTED; goto out_error; } value_len = le32_to_cpu(is.s.here->e_value_size); value = kmalloc(value_len, GFP_NOFS); if (!value) { err = -ENOMEM; goto out_error; } err = ext4_xattr_ibody_get(inode, i.name_index, i.name, value, value_len); if (err <= 0) goto out_error; i.value = value; i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ? i_size - EXT4_MIN_INLINE_DATA_SIZE : 0; err = ext4_xattr_ibody_set(handle, inode, &i, &is); if (err) goto out_error; } /* Clear the content within i_blocks. */ if (i_size < EXT4_MIN_INLINE_DATA_SIZE) { void *p = (void *) ext4_raw_inode(&is.iloc)->i_block; memset(p + i_size, 0, EXT4_MIN_INLINE_DATA_SIZE - i_size); } EXT4_I(inode)->i_inline_size = i_size < EXT4_MIN_INLINE_DATA_SIZE ? EXT4_MIN_INLINE_DATA_SIZE : i_size; } out_error: up_write(&EXT4_I(inode)->i_data_sem); out: brelse(is.iloc.bh); ext4_write_unlock_xattr(inode, &no_expand); kfree(value); if (inode->i_nlink) ext4_orphan_del(handle, inode); if (err == 0) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err = ext4_mark_inode_dirty(handle, inode); if (IS_SYNC(inode)) ext4_handle_sync(handle); } ext4_journal_stop(handle); return err; } int ext4_convert_inline_data(struct inode *inode) { int error, needed_blocks, no_expand; handle_t *handle; struct ext4_iloc iloc; if (!ext4_has_inline_data(inode)) { ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); return 0; } else if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { /* * Inode has inline data but EXT4_STATE_MAY_INLINE_DATA is * cleared. This means we are in the middle of moving of * inline data to delay allocated block. Just force writeout * here to finish conversion. */ error = filemap_flush(inode->i_mapping); if (error) return error; if (!ext4_has_inline_data(inode)) return 0; } needed_blocks = ext4_chunk_trans_extent(inode, 1); iloc.bh = NULL; error = ext4_get_inode_loc(inode, &iloc); if (error) return error; handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto out_free; } ext4_write_lock_xattr(inode, &no_expand); if (ext4_has_inline_data(inode)) error = ext4_convert_inline_data_nolock(handle, inode, &iloc); ext4_write_unlock_xattr(inode, &no_expand); ext4_journal_stop(handle); out_free: brelse(iloc.bh); return error; } |
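The offset remapping done by ext4_read_inline_dir() above is easier to follow with concrete numbers. The following userspace sketch (not kernel code) redoes the arithmetic under the assumption that a block-based directory record is 8 bytes of header plus the name, rounded up to a multiple of 4, and that the inline area reserves only 4 bytes for the parent inode; REC_LEN and INLINE_DOTDOT_SIZE are simplified stand-ins for ext4_dir_rec_len() and EXT4_INLINE_DOTDOT_SIZE.

#include <stdio.h>

/* Simplified stand-ins (assumptions) for the on-disk constants used above:
 * a directory record is 8 bytes of header plus the name, rounded up to a
 * multiple of 4, and the inline area reserves only 4 bytes for "..". */
#define REC_LEN(name_len)	(((name_len) + 8 + 3) & ~3U)
#define INLINE_DOTDOT_SIZE	4

int main(void)
{
	unsigned int dotdot_offset = REC_LEN(1);		  /* 12: where ".." sits in a block dir */
	unsigned int dotdot_size   = dotdot_offset + REC_LEN(2); /* 24: first regular entry in a block dir */
	unsigned int extra_offset  = dotdot_size - INLINE_DOTDOT_SIZE;
	unsigned int pos;

	/* Mirror "dir_buf + ctx->pos - extra_offset" from ext4_read_inline_dir():
	 * a block-style f_pos of 24 lands on inline offset 4, i.e. right after
	 * the 4-byte parent-inode slot. */
	for (pos = dotdot_size; pos < dotdot_size + 16; pos += 4)
		printf("f_pos %2u -> inline offset %2u\n", pos, pos - extra_offset);

	return 0;
}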
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RATELIMIT_H
#define _LINUX_RATELIMIT_H

#include <linux/ratelimit_types.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

static inline void ratelimit_state_init(struct ratelimit_state *rs,
					int interval, int burst)
{
	memset(rs, 0, sizeof(*rs));

	raw_spin_lock_init(&rs->lock);
	rs->interval	= interval;
	rs->burst	= burst;
}

static inline void ratelimit_default_init(struct ratelimit_state *rs)
{
	return ratelimit_state_init(rs, DEFAULT_RATELIMIT_INTERVAL,
					DEFAULT_RATELIMIT_BURST);
}

static inline void ratelimit_state_inc_miss(struct ratelimit_state *rs)
{
	atomic_inc(&rs->missed);
}

static inline int ratelimit_state_get_miss(struct ratelimit_state *rs)
{
	return atomic_read(&rs->missed);
}

static inline int ratelimit_state_reset_miss(struct ratelimit_state *rs)
{
	return atomic_xchg_relaxed(&rs->missed, 0);
}

static inline void ratelimit_state_reset_interval(struct ratelimit_state *rs, int interval_init)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&rs->lock, flags);
	rs->interval = interval_init;
	rs->flags &= ~RATELIMIT_INITIALIZED;
	atomic_set(&rs->rs_n_left, rs->burst);
	ratelimit_state_reset_miss(rs);
	raw_spin_unlock_irqrestore(&rs->lock, flags);
}

static inline void ratelimit_state_exit(struct ratelimit_state *rs)
{
	int m;

	if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE))
		return;

	m = ratelimit_state_reset_miss(rs);
	if (m)
		pr_warn("%s: %d output lines suppressed due to ratelimiting\n",
			current->comm, m);
}

static inline void
ratelimit_set_flags(struct ratelimit_state *rs, unsigned long flags)
{
	rs->flags = flags;
}

extern struct ratelimit_state printk_ratelimit_state;

#ifdef CONFIG_PRINTK

#define WARN_ON_RATELIMIT(condition, state)	({		\
	bool __rtn_cond = !!(condition);			\
	WARN_ON(__rtn_cond && __ratelimit(state));		\
	__rtn_cond;						\
})

#define WARN_RATELIMIT(condition, format, ...)			\
({								\
	static DEFINE_RATELIMIT_STATE(_rs,			\
				      DEFAULT_RATELIMIT_INTERVAL, \
				      DEFAULT_RATELIMIT_BURST);	\
	int rtn = !!(condition);				\
								\
	if (unlikely(rtn && __ratelimit(&_rs)))			\
		WARN(rtn, format, ##__VA_ARGS__);		\
								\
	rtn;							\
})

#else

#define WARN_ON_RATELIMIT(condition, state)			\
	WARN_ON(condition)

#define WARN_RATELIMIT(condition, format, ...)			\
({								\
	int rtn = WARN(condition, format, ##__VA_ARGS__);	\
	rtn;							\
})

#endif

#endif /* _LINUX_RATELIMIT_H */
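Typical usage of this header, as a hedged sketch: DEFINE_RATELIMIT_STATE() and __ratelimit() come from linux/ratelimit_types.h, which is included above; report_overrun() is a made-up function standing in for any noisy code path.

static void report_overrun(void)
{
	/* Allow DEFAULT_RATELIMIT_BURST messages, then at most one burst per
	 * DEFAULT_RATELIMIT_INTERVAL; the rest are counted as missed. */
	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	if (__ratelimit(&rs))
		pr_warn("FIFO overrun, data dropped\n");
}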
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_STRING_CHOICES_H_
#define _LINUX_STRING_CHOICES_H_

/*
 * Here we provide a series of helpers in the str_$TRUE_$FALSE format (you can
 * also expand some helpers as needed), where $TRUE and $FALSE are their
 * corresponding literal strings. These helpers can be used in the printing
 * and also in other places where constant strings are required. Using these
 * helpers offers the following benefits:
 *  1) Reducing the hardcoding of strings, which makes the code more elegant
 *     through these simple literal-meaning helpers.
 *  2) Unifying the output, which prevents the same string from being printed
 *     in various forms, such as enable/disable, enabled/disabled, en/dis.
 *  3) Deduping by the linker, which results in a smaller binary file.
 */

#include <linux/types.h>

static inline const char *str_assert_deassert(bool v)
{
	return v ? "assert" : "deassert";
}
#define str_deassert_assert(v)		str_assert_deassert(!(v))

static inline const char *str_enable_disable(bool v)
{
	return v ? "enable" : "disable";
}
#define str_disable_enable(v)		str_enable_disable(!(v))

static inline const char *str_enabled_disabled(bool v)
{
	return v ? "enabled" : "disabled";
}
#define str_disabled_enabled(v)		str_enabled_disabled(!(v))

static inline const char *str_hi_lo(bool v)
{
	return v ? "hi" : "lo";
}
#define str_lo_hi(v)		str_hi_lo(!(v))

static inline const char *str_high_low(bool v)
{
	return v ? "high" : "low";
}
#define str_low_high(v)		str_high_low(!(v))

static inline const char *str_input_output(bool v)
{
	return v ? "input" : "output";
}
#define str_output_input(v)		str_input_output(!(v))

static inline const char *str_on_off(bool v)
{
	return v ? "on" : "off";
}
#define str_off_on(v)		str_on_off(!(v))

static inline const char *str_read_write(bool v)
{
	return v ? "read" : "write";
}
#define str_write_read(v)		str_read_write(!(v))

static inline const char *str_true_false(bool v)
{
	return v ? "true" : "false";
}
#define str_false_true(v)		str_true_false(!(v))

static inline const char *str_up_down(bool v)
{
	return v ? "up" : "down";
}
#define str_down_up(v)		str_up_down(!(v))

static inline const char *str_yes_no(bool v)
{
	return v ? "yes" : "no";
}
#define str_no_yes(v)		str_yes_no(!(v))

/**
 * str_plural - Return the simple pluralization based on English counts
 * @num: Number used for deciding pluralization
 *
 * If @num is 1, returns empty string, otherwise returns "s".
 */
static inline const char *str_plural(size_t num)
{
	return num == 1 ? "" : "s";
}

#endif
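A short illustrative use of these helpers; report_link() and its arguments are hypothetical, the str_*() calls are the ones defined above.

static void report_link(bool link_up, size_t error_count)
{
	pr_info("link is %s, %zu error%s since last reset\n",
		str_up_down(link_up), error_count, str_plural(error_count));
}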
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/extable.h>
#include <linux/uaccess.h>
#include <linux/sched/debug.h>
#include <linux/bitfield.h>
#include <xen/xen.h>

#include <asm/fpu/api.h>
#include <asm/fred.h>
#include <asm/sev.h>
#include <asm/traps.h>
#include <asm/kdebug.h>
#include <asm/insn-eval.h>
#include <asm/sgx.h>

static inline unsigned long *pt_regs_nr(struct pt_regs *regs, int nr)
{
	int reg_offset = pt_regs_offset(regs, nr);
	static unsigned long __dummy;

	if (WARN_ON_ONCE(reg_offset < 0))
		return &__dummy;

	return (unsigned long *)((unsigned long)regs + reg_offset);
}

static inline unsigned long
ex_fixup_addr(const struct exception_table_entry *x)
{
	return (unsigned long)&x->fixup + x->fixup;
}

static bool ex_handler_default(const struct exception_table_entry *e,
			       struct pt_regs *regs)
{
	if (e->data & EX_FLAG_CLEAR_AX)
		regs->ax = 0;
	if (e->data & EX_FLAG_CLEAR_DX)
		regs->dx = 0;

	regs->ip = ex_fixup_addr(e);
	return true;
}

/*
 * This is the *very* rare case where we do a "load_unaligned_zeropad()"
 * and it's a page crosser into a non-existent page.
 *
 * This happens when we optimistically load a pathname a word-at-a-time
 * and the name is less than the full word and the next page is not
 * mapped. Typically that only happens for CONFIG_DEBUG_PAGEALLOC.
 *
 * NOTE! The faulting address is always a 'mov mem,reg' type instruction
 * of size 'long', and the exception fixup must always point to right
 * after the instruction.
*/ static bool ex_handler_zeropad(const struct exception_table_entry *e, struct pt_regs *regs, unsigned long fault_addr) { struct insn insn; const unsigned long mask = sizeof(long) - 1; unsigned long offset, addr, next_ip, len; unsigned long *reg; next_ip = ex_fixup_addr(e); len = next_ip - regs->ip; if (len > MAX_INSN_SIZE) return false; if (insn_decode(&insn, (void *) regs->ip, len, INSN_MODE_KERN)) return false; if (insn.length != len) return false; if (insn.opcode.bytes[0] != 0x8b) return false; if (insn.opnd_bytes != sizeof(long)) return false; addr = (unsigned long) insn_get_addr_ref(&insn, regs); if (addr == ~0ul) return false; offset = addr & mask; addr = addr & ~mask; if (fault_addr != addr + sizeof(long)) return false; reg = insn_get_modrm_reg_ptr(&insn, regs); if (!reg) return false; *reg = *(unsigned long *)addr >> (offset * 8); return ex_handler_default(e, regs); } static bool ex_handler_fault(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { regs->ax = trapnr; return ex_handler_default(fixup, regs); } static bool ex_handler_sgx(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { regs->ax = trapnr | SGX_ENCLS_FAULT_FLAG; return ex_handler_default(fixup, regs); } /* * Handler for when we fail to restore a task's FPU state. We should never get * here because the FPU state of a task using the FPU (struct fpu::fpstate) * should always be valid. However, past bugs have allowed userspace to set * reserved bits in the XSAVE area using PTRACE_SETREGSET or sys_rt_sigreturn(). * These caused XRSTOR to fail when switching to the task, leaking the FPU * registers of the task previously executing on the CPU. Mitigate this class * of vulnerability by restoring from the initial state (essentially, zeroing * out all the FPU registers) if we can't restore from the task's FPU state. */ static bool ex_handler_fprestore(const struct exception_table_entry *fixup, struct pt_regs *regs) { WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.", (void *)instruction_pointer(regs)); fpu_reset_from_exception_fixup(); return ex_handler_default(fixup, regs); } /* * On x86-64, we end up being imprecise with 'access_ok()', and allow * non-canonical user addresses to make the range comparisons simpler, * and to not have to worry about LAM being enabled. * * In fact, we allow up to one page of "slop" at the sign boundary, * which means that we can do access_ok() by just checking the sign * of the pointer for the common case of having a small access size. */ static bool gp_fault_address_ok(unsigned long fault_address) { #ifdef CONFIG_X86_64 /* Is it in the "user space" part of the non-canonical space? */ if (valid_user_address(fault_address)) return true; /* .. or just above it? */ fault_address -= PAGE_SIZE; if (valid_user_address(fault_address)) return true; #endif return false; } static bool ex_handler_uaccess(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr, unsigned long fault_address) { WARN_ONCE(trapnr == X86_TRAP_GP && !gp_fault_address_ok(fault_address), "General protection fault in user access. 
Non-canonical address?"); return ex_handler_default(fixup, regs); } static bool ex_handler_msr(const struct exception_table_entry *fixup, struct pt_regs *regs, bool wrmsr, bool safe, int reg) { if (__ONCE_LITE_IF(!safe && wrmsr)) { pr_warn("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax, regs->ip, (void *)regs->ip); show_stack_regs(regs); } if (__ONCE_LITE_IF(!safe && !wrmsr)) { pr_warn("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", (unsigned int)regs->cx, regs->ip, (void *)regs->ip); show_stack_regs(regs); } if (!wrmsr) { /* Pretend that the read succeeded and returned 0. */ regs->ax = 0; regs->dx = 0; } if (safe) *pt_regs_nr(regs, reg) = -EIO; return ex_handler_default(fixup, regs); } static bool ex_handler_clear_fs(const struct exception_table_entry *fixup, struct pt_regs *regs) { if (static_cpu_has(X86_BUG_NULL_SEG)) asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS)); asm volatile ("mov %0, %%fs" : : "rm" (0)); return ex_handler_default(fixup, regs); } static bool ex_handler_imm_reg(const struct exception_table_entry *fixup, struct pt_regs *regs, int reg, int imm) { *pt_regs_nr(regs, reg) = (long)imm; return ex_handler_default(fixup, regs); } static bool ex_handler_ucopy_len(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr, unsigned long fault_address, int reg, int imm) { regs->cx = imm * regs->cx + *pt_regs_nr(regs, reg); return ex_handler_uaccess(fixup, regs, trapnr, fault_address); } #ifdef CONFIG_X86_FRED static bool ex_handler_eretu(const struct exception_table_entry *fixup, struct pt_regs *regs, unsigned long error_code) { struct pt_regs *uregs = (struct pt_regs *)(regs->sp - offsetof(struct pt_regs, orig_ax)); unsigned short ss = uregs->ss; unsigned short cs = uregs->cs; /* * Move the NMI bit from the invalid stack frame, which caused ERETU * to fault, to the fault handler's stack frame, thus to unblock NMI * with the fault handler's ERETS instruction ASAP if NMI is blocked. */ regs->fred_ss.nmi = uregs->fred_ss.nmi; /* * Sync event information to uregs, i.e., the ERETU return frame, but * is it safe to write to the ERETU return frame which is just above * current event stack frame? * * The RSP used by FRED to push a stack frame is not the value in %rsp, * it is calculated from %rsp with the following 2 steps: * 1) RSP = %rsp - (IA32_FRED_CONFIG & 0x1c0) // Reserve N*64 bytes * 2) RSP = RSP & ~0x3f // Align to a 64-byte cache line * when an event delivery doesn't trigger a stack level change. * * Here is an example with N*64 (N=1) bytes reserved: * * 64-byte cache line ==> ______________ * |___Reserved___| * |__Event_data__| * |_____SS_______| * |_____RSP______| * |_____FLAGS____| * |_____CS_______| * |_____IP_______| * 64-byte cache line ==> |__Error_code__| <== ERETU return frame * |______________| * |______________| * |______________| * |______________| * |______________| * |______________| * |______________| * 64-byte cache line ==> |______________| <== RSP after step 1) and 2) * |___Reserved___| * |__Event_data__| * |_____SS_______| * |_____RSP______| * |_____FLAGS____| * |_____CS_______| * |_____IP_______| * 64-byte cache line ==> |__Error_code__| <== ERETS return frame * * Thus a new FRED stack frame will always be pushed below a previous * FRED stack frame ((N*64) bytes may be reserved between), and it is * safe to write to a previous FRED stack frame as they never overlap. 
*/ fred_info(uregs)->edata = fred_event_data(regs); uregs->ssx = regs->ssx; uregs->fred_ss.ss = ss; /* The NMI bit was moved away above */ uregs->fred_ss.nmi = 0; uregs->csx = regs->csx; uregs->fred_cs.sl = 0; uregs->fred_cs.wfe = 0; uregs->cs = cs; uregs->orig_ax = error_code; return ex_handler_default(fixup, regs); } #endif int ex_get_fixup_type(unsigned long ip) { const struct exception_table_entry *e = search_exception_tables(ip); return e ? FIELD_GET(EX_DATA_TYPE_MASK, e->data) : EX_TYPE_NONE; } int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr) { const struct exception_table_entry *e; int type, reg, imm; #ifdef CONFIG_PNPBIOS if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) { extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp; extern u32 pnp_bios_is_utter_crap; pnp_bios_is_utter_crap = 1; printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n"); __asm__ volatile( "movl %0, %%esp\n\t" "jmp *%1\n\t" : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip)); panic("do_trap: can't hit this"); } #endif e = search_exception_tables(regs->ip); if (!e) return 0; type = FIELD_GET(EX_DATA_TYPE_MASK, e->data); reg = FIELD_GET(EX_DATA_REG_MASK, e->data); imm = FIELD_GET(EX_DATA_IMM_MASK, e->data); switch (type) { case EX_TYPE_DEFAULT: case EX_TYPE_DEFAULT_MCE_SAFE: return ex_handler_default(e, regs); case EX_TYPE_FAULT: case EX_TYPE_FAULT_MCE_SAFE: return ex_handler_fault(e, regs, trapnr); case EX_TYPE_UACCESS: return ex_handler_uaccess(e, regs, trapnr, fault_addr); case EX_TYPE_CLEAR_FS: return ex_handler_clear_fs(e, regs); case EX_TYPE_FPU_RESTORE: return ex_handler_fprestore(e, regs); case EX_TYPE_BPF: return ex_handler_bpf(e, regs); case EX_TYPE_WRMSR: return ex_handler_msr(e, regs, true, false, reg); case EX_TYPE_RDMSR: return ex_handler_msr(e, regs, false, false, reg); case EX_TYPE_WRMSR_SAFE: return ex_handler_msr(e, regs, true, true, reg); case EX_TYPE_RDMSR_SAFE: return ex_handler_msr(e, regs, false, true, reg); case EX_TYPE_WRMSR_IN_MCE: ex_handler_msr_mce(regs, true); break; case EX_TYPE_RDMSR_IN_MCE: ex_handler_msr_mce(regs, false); break; case EX_TYPE_POP_REG: regs->sp += sizeof(long); fallthrough; case EX_TYPE_IMM_REG: return ex_handler_imm_reg(e, regs, reg, imm); case EX_TYPE_FAULT_SGX: return ex_handler_sgx(e, regs, trapnr); case EX_TYPE_UCOPY_LEN: return ex_handler_ucopy_len(e, regs, trapnr, fault_addr, reg, imm); case EX_TYPE_ZEROPAD: return ex_handler_zeropad(e, regs, fault_addr); #ifdef CONFIG_X86_FRED case EX_TYPE_ERETU: return ex_handler_eretu(e, regs, error_code); #endif } BUG(); } extern unsigned int early_recursion_flag; /* Restricted version used during very early boot */ void __init early_fixup_exception(struct pt_regs *regs, int trapnr) { /* Ignore early NMIs. */ if (trapnr == X86_TRAP_NMI) return; if (early_recursion_flag > 2) goto halt_loop; /* * Old CPUs leave the high bits of CS on the stack * undefined. I'm not sure which CPUs do this, but at least * the 486 DX works this way. * Xen pv domains are not using the default __KERNEL_CS. */ if (!xen_pv_domain() && regs->cs != __KERNEL_CS) goto fail; /* * The full exception fixup machinery is available as soon as * the early IDT is loaded. This means that it is the * responsibility of extable users to either function correctly * when handlers are invoked early or to simply avoid causing * exceptions before they're ready to handle them. 
* * This is better than filtering which handlers can be used, * because refusing to call a handler here is guaranteed to * result in a hard-to-debug panic. * * Keep in mind that not all vectors actually get here. Early * page faults, for example, are special. */ if (fixup_exception(regs, trapnr, regs->orig_ax, 0)) return; if (trapnr == X86_TRAP_UD) { if (handle_bug(regs)) return; /* * If this was a BUG and handle_bug returns or if this * was just a normal #UD, we want to continue onward and * crash. */ } fail: early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n", (unsigned)trapnr, (unsigned long)regs->cs, regs->ip, regs->orig_ax, read_cr2()); show_regs(regs); halt_loop: while (true) halt(); } |
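The recovery performed by ex_handler_zeropad() boils down to one shift: reload the aligned word that sits entirely inside the mapped page, then shift the bytes that were actually readable down to the low end so the bytes that would have come from the unmapped page read as zero. A standalone userspace sketch of that arithmetic, assuming the little-endian, 64-bit layout used on x86-64:

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Pretend these are the last 8 bytes of a mapped page; the next page
	 * is unmapped, so a word-sized load starting at offset 5 would fault. */
	unsigned char page_tail[8] = { 'f', 'o', 'o', 0, 'b', 'a', 'r', 0 };
	unsigned long aligned_word, fixed_up;
	unsigned long offset = 5;	/* fault address & (sizeof(long) - 1) */

	memcpy(&aligned_word, page_tail, sizeof(aligned_word));

	/* Same operation as "*reg = *(unsigned long *)addr >> (offset * 8)":
	 * the mapped bytes ('a', 'r', '\0') end up in the low bytes and the
	 * unmapped tail is replaced with zeroes. */
	fixed_up = aligned_word >> (offset * 8);

	printf("zero-padded word: 0x%016lx\n", fixed_up);
	return 0;
}

On x86-64 this prints 0x0000000000007261, i.e. "ar" followed by zero padding, which is exactly what the fixed-up destination register receives.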
// SPDX-License-Identifier: GPL-2.0
#define CREATE_TRACE_POINTS
#include <trace/events/mmap_lock.h>

#include <linux/mm.h>
#include <linux/cgroup.h>
#include <linux/memcontrol.h>
#include <linux/mmap_lock.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/trace_events.h>
#include <linux/local_lock.h>

EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);

#ifdef CONFIG_TRACING
/*
 * Trace calls must be in a separate file, as otherwise there's a circular
 * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
*/ void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write) { trace_mmap_lock_start_locking(mm, write); } EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking); void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write, bool success) { trace_mmap_lock_acquire_returned(mm, write, success); } EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned); void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write) { trace_mmap_lock_released(mm, write); } EXPORT_SYMBOL(__mmap_lock_do_trace_released); #endif /* CONFIG_TRACING */ #ifdef CONFIG_MMU #ifdef CONFIG_PER_VMA_LOCK /* State shared across __vma_[start, end]_exclude_readers. */ struct vma_exclude_readers_state { /* Input parameters. */ struct vm_area_struct *vma; int state; /* TASK_KILLABLE or TASK_UNINTERRUPTIBLE. */ bool detaching; /* Output parameters. */ bool detached; bool exclusive; /* Are we exclusively locked? */ }; /* * Now that all readers have been evicted, mark the VMA as being out of the * 'exclude readers' state. */ static void __vma_end_exclude_readers(struct vma_exclude_readers_state *ves) { struct vm_area_struct *vma = ves->vma; VM_WARN_ON_ONCE(ves->detached); ves->detached = refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt); __vma_lockdep_release_exclusive(vma); } static unsigned int get_target_refcnt(struct vma_exclude_readers_state *ves) { const unsigned int tgt = ves->detaching ? 0 : 1; return tgt | VM_REFCNT_EXCLUDE_READERS_FLAG; } /* * Mark the VMA as being in a state of excluding readers, check to see if any * VMA read locks are indeed held, and if so wait for them to be released. * * Note that this function pairs with vma_refcount_put() which will wake up this * thread when it detects that the last reader has released its lock. * * The ves->state parameter ought to be set to TASK_UNINTERRUPTIBLE in cases * where we wish the thread to sleep uninterruptibly or TASK_KILLABLE if a fatal * signal is permitted to kill it. * * The function sets the ves->exclusive parameter to true if readers were * excluded, or false if the VMA was detached or an error arose on wait. * * If the function indicates an exclusive lock was acquired via ves->exclusive * the caller is required to invoke __vma_end_exclude_readers() once the * exclusive state is no longer required. * * If ves->state is set to something other than TASK_UNINTERRUPTIBLE, the * function may also return -EINTR to indicate a fatal signal was received while * waiting. Otherwise, the function returns 0. */ static int __vma_start_exclude_readers(struct vma_exclude_readers_state *ves) { struct vm_area_struct *vma = ves->vma; unsigned int tgt_refcnt = get_target_refcnt(ves); int err = 0; mmap_assert_write_locked(vma->vm_mm); /* * If vma is detached then only vma_mark_attached() can raise the * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached(). * * See the comment describing the vm_area_struct->vm_refcnt field for * details of possible refcnt values. 
*/ if (!refcount_add_not_zero(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt)) { ves->detached = true; return 0; } __vma_lockdep_acquire_exclusive(vma); err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, refcount_read(&vma->vm_refcnt) == tgt_refcnt, ves->state); if (err) { __vma_end_exclude_readers(ves); return err; } __vma_lockdep_stat_mark_acquired(vma); ves->exclusive = true; return 0; } int __vma_start_write(struct vm_area_struct *vma, int state) { const unsigned int mm_lock_seq = __vma_raw_mm_seqnum(vma); struct vma_exclude_readers_state ves = { .vma = vma, .state = state, }; int err; err = __vma_start_exclude_readers(&ves); if (err) { WARN_ON_ONCE(ves.detached); return err; } /* * We should use WRITE_ONCE() here because we can have concurrent reads * from the early lockless pessimistic check in vma_start_read(). * We don't really care about the correctness of that early check, but * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. */ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); if (ves.exclusive) { __vma_end_exclude_readers(&ves); /* VMA should remain attached. */ WARN_ON_ONCE(ves.detached); } return 0; } EXPORT_SYMBOL_GPL(__vma_start_write); void __vma_exclude_readers_for_detach(struct vm_area_struct *vma) { struct vma_exclude_readers_state ves = { .vma = vma, .state = TASK_UNINTERRUPTIBLE, .detaching = true, }; int err; /* * Wait until the VMA is detached with no readers. Since we hold the VMA * write lock, the only read locks that might be present are those from * threads trying to acquire the read lock and incrementing the * reference count before realising the write lock is held and * decrementing it. */ err = __vma_start_exclude_readers(&ves); if (!err && ves.exclusive) { /* * Once this is complete, no readers can increment the * reference count, and the VMA is marked detached. */ __vma_end_exclude_readers(&ves); } /* If an error arose but we were detached anyway, we don't care. */ WARN_ON_ONCE(!ves.detached); } /* * Try to read-lock a vma. The function is allowed to occasionally yield false * locked result to avoid performance overhead, in which case we fall back to * using mmap_lock. The function should never yield false unlocked result. * False locked result is possible if mm_lock_seq overflows or if vma gets * reused and attached to a different mm before we lock it. * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got * detached. * * IMPORTANT: RCU lock must be held upon entering the function, but upon error * IT IS RELEASED. The caller must handle this correctly. */ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, struct vm_area_struct *vma) { struct mm_struct *other_mm; int oldcnt; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held"); /* * Check before locking. A race might cause false locked result. * We can use READ_ONCE() for the mm_lock_seq here, and don't need * ACQUIRE semantics, because this is just a lockless check whose result * we don't rely on for anything - the mm_lock_seq read against which we * need ordering is below. */ if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) { vma = NULL; goto err; } /* * If VM_REFCNT_EXCLUDE_READERS_FLAG is set, * __refcount_inc_not_zero_limited_acquire() will fail because * VM_REFCNT_LIMIT is less than VM_REFCNT_EXCLUDE_READERS_FLAG. * * Acquire fence is required here to avoid reordering against later * vm_lock_seq check and checks inside lock_vma_under_rcu(). 
*/ if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, VM_REFCNT_LIMIT))) { /* return EAGAIN if vma got detached from under us */ vma = oldcnt ? NULL : ERR_PTR(-EAGAIN); goto err; } __vma_lockdep_acquire_read(vma); if (unlikely(vma->vm_mm != mm)) goto err_unstable; /* * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. * False unlocked result is impossible because we modify and check * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq * modification invalidates all existing locks. * * We must use ACQUIRE semantics for the mm_lock_seq so that if we are * racing with vma_end_write_all(), we only start reading from the VMA * after it has been unlocked. * This pairs with RELEASE semantics in vma_end_write_all(). */ if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { vma_refcount_put(vma); vma = NULL; goto err; } return vma; err: rcu_read_unlock(); return vma; err_unstable: /* * If vma got attached to another mm from under us, that mm is not * stable and can be freed in the narrow window after vma->vm_refcnt * is dropped and before rcuwait_wake_up(mm) is called. Grab it before * releasing vma->vm_refcnt. */ other_mm = vma->vm_mm; /* use a copy as vma can be freed after we drop vm_refcnt */ /* __mmdrop() is a heavy operation, do it after dropping RCU lock. */ rcu_read_unlock(); mmgrab(other_mm); vma_refcount_put(vma); mmdrop(other_mm); return NULL; } /* * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be * stable and not isolated. If the VMA is not found or is being modified the * function returns NULL. */ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address) { MA_STATE(mas, &mm->mm_mt, address, address); struct vm_area_struct *vma; retry: rcu_read_lock(); vma = mas_walk(&mas); if (!vma) { rcu_read_unlock(); goto inval; } vma = vma_start_read(mm, vma); if (IS_ERR_OR_NULL(vma)) { /* Check if the VMA got isolated after we found it */ if (PTR_ERR(vma) == -EAGAIN) { count_vm_vma_lock_event(VMA_LOCK_MISS); /* The area was replaced with another one */ mas_set(&mas, address); goto retry; } /* Failed to lock the VMA */ goto inval; } /* * At this point, we have a stable reference to a VMA: The VMA is * locked and we know it hasn't already been isolated. * From here on, we can access the VMA without worrying about which * fields are accessible for RCU readers. */ rcu_read_unlock(); /* Check if the vma we locked is the right one. 
*/ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { vma_end_read(vma); goto inval; } return vma; inval: count_vm_vma_lock_event(VMA_LOCK_ABORT); return NULL; } static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm, struct vma_iterator *vmi, unsigned long from_addr) { struct vm_area_struct *vma; int ret; ret = mmap_read_lock_killable(mm); if (ret) return ERR_PTR(ret); /* Lookup the vma at the last position again under mmap_read_lock */ vma_iter_set(vmi, from_addr); vma = vma_next(vmi); if (vma) { /* Very unlikely vma->vm_refcnt overflow case */ if (unlikely(!vma_start_read_locked(vma))) vma = ERR_PTR(-EAGAIN); } mmap_read_unlock(mm); return vma; } struct vm_area_struct *lock_next_vma(struct mm_struct *mm, struct vma_iterator *vmi, unsigned long from_addr) { struct vm_area_struct *vma; unsigned int mm_wr_seq; bool mmap_unlocked; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held"); retry: /* Start mmap_lock speculation in case we need to verify the vma later */ mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq); vma = vma_next(vmi); if (!vma) return NULL; vma = vma_start_read(mm, vma); if (IS_ERR_OR_NULL(vma)) { /* * Retry immediately if the vma gets detached from under us. * Infinite loop should not happen because the vma we find will * have to be constantly knocked out from under us. */ if (PTR_ERR(vma) == -EAGAIN) { /* reset to search from the last address */ rcu_read_lock(); vma_iter_set(vmi, from_addr); goto retry; } goto fallback; } /* Verify the vma is not behind the last search position. */ if (unlikely(from_addr >= vma->vm_end)) goto fallback_unlock; /* * vma can be ahead of the last search position but we need to verify * it was not shrunk after we found it and another vma has not been * installed ahead of it. Otherwise we might observe a gap that should * not be there. */ if (from_addr < vma->vm_start) { /* Verify only if the address space might have changed since vma lookup. */ if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) { vma_iter_set(vmi, from_addr); if (vma != vma_next(vmi)) goto fallback_unlock; } } return vma; fallback_unlock: rcu_read_unlock(); vma_end_read(vma); fallback: vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr); rcu_read_lock(); /* Reinitialize the iterator after re-entering rcu read section */ vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end); return vma; } #endif /* CONFIG_PER_VMA_LOCK */ #ifdef CONFIG_LOCK_MM_AND_FIND_VMA #include <linux/extable.h> static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) { if (likely(mmap_read_trylock(mm))) return true; if (regs && !user_mode(regs)) { unsigned long ip = exception_ip(regs); if (!search_exception_tables(ip)) return false; } return !mmap_read_lock_killable(mm); } static inline bool mmap_upgrade_trylock(struct mm_struct *mm) { /* * We don't have this operation yet. * * It should be easy enough to do: it's basically a * atomic_long_try_cmpxchg_acquire() * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but * it also needs the proper lockdep magic etc. */ return false; } static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) { mmap_read_unlock(mm); if (regs && !user_mode(regs)) { unsigned long ip = exception_ip(regs); if (!search_exception_tables(ip)) return false; } return !mmap_write_lock_killable(mm); } /* * Helper for page fault handling. 
* * This is kind of equivalent to "mmap_read_lock()" followed * by "find_extend_vma()", except it's a lot more careful about * the locking (and will drop the lock on failure). * * For example, if we have a kernel bug that causes a page * fault, we don't want to just use mmap_read_lock() to get * the mm lock, because that would deadlock if the bug were * to happen while we're holding the mm lock for writing. * * So this checks the exception tables on kernel faults in * order to only do this all for instructions that are actually * expected to fault. * * We can also actually take the mm lock for writing if we * need to extend the vma, which helps the VM layer a lot. */ struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, unsigned long addr, struct pt_regs *regs) { struct vm_area_struct *vma; if (!get_mmap_lock_carefully(mm, regs)) return NULL; vma = find_vma(mm, addr); if (likely(vma && (vma->vm_start <= addr))) return vma; /* * Well, dang. We might still be successful, but only * if we can extend a vma to do so. */ if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { mmap_read_unlock(mm); return NULL; } /* * We can try to upgrade the mmap lock atomically, * in which case we can continue to use the vma * we already looked up. * * Otherwise we'll have to drop the mmap lock and * re-take it, and also look up the vma again, * re-checking it. */ if (!mmap_upgrade_trylock(mm)) { if (!upgrade_mmap_lock_carefully(mm, regs)) return NULL; vma = find_vma(mm, addr); if (!vma) goto fail; if (vma->vm_start <= addr) goto success; if (!(vma->vm_flags & VM_GROWSDOWN)) goto fail; } if (expand_stack_locked(vma, addr)) goto fail; success: mmap_write_downgrade(mm); return vma; fail: mmap_write_unlock(mm); return NULL; } #endif /* CONFIG_LOCK_MM_AND_FIND_VMA */ #else /* CONFIG_MMU */ /* * At least xtensa ends up having protection faults even with no * MMU.. No stack expansion, at least. */ struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, unsigned long addr, struct pt_regs *regs) { struct vm_area_struct *vma; mmap_read_lock(mm); vma = vma_lookup(mm, addr); if (!vma) mmap_read_unlock(mm); return vma; } #endif /* CONFIG_MMU */ |
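To show how these pieces fit together, here is a rough sketch of the way an architecture page-fault handler is typically structured around them: try the per-VMA read lock first and fall back to mmap_lock only when that fails. This is illustrative only; fault_path_sketch() is hypothetical, the fault flags and the calls into handle_mm_fault() are elided as comments, and real handlers have additional retry/completion handling.

static void fault_path_sketch(struct mm_struct *mm, unsigned long address,
			      struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	/* Fast path: per-VMA read lock, no mmap_lock taken at all. */
	vma = lock_vma_under_rcu(mm, address);
	if (vma) {
		/* ... handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs) ... */
		vma_end_read(vma);
		return;
	}

	/* Slow path: take mmap_lock carefully and look the VMA up again. */
	vma = lock_mm_and_find_vma(mm, address, regs);
	if (!vma)
		return;		/* lock already dropped on failure */

	/* ... handle_mm_fault(vma, address, flags, regs) ... */
	mmap_read_unlock(mm);
}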
// SPDX-License-Identifier: GPL-2.0
/*
 * fs/sysfs/group.c - Operations for adding/removing multiple files at once.
* * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab * Copyright (c) 2013 Greg Kroah-Hartman * Copyright (c) 2013 The Linux Foundation */ #include <linux/kobject.h> #include <linux/module.h> #include <linux/dcache.h> #include <linux/namei.h> #include <linux/err.h> #include <linux/fs.h> #include "sysfs.h" static void remove_files(struct kernfs_node *parent, const struct attribute_group *grp) { struct attribute *const *attr; const struct bin_attribute *const *bin_attr; if (grp->attrs) for (attr = grp->attrs; *attr; attr++) kernfs_remove_by_name(parent, (*attr)->name); if (grp->bin_attrs) for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) kernfs_remove_by_name(parent, (*bin_attr)->attr.name); } static umode_t __first_visible(const struct attribute_group *grp, struct kobject *kobj) { if (grp->attrs && grp->attrs[0] && grp->is_visible) return grp->is_visible(kobj, grp->attrs[0], 0); if (grp->attrs && grp->attrs[0] && grp->is_visible_const) return grp->is_visible_const(kobj, grp->attrs[0], 0); if (grp->bin_attrs && grp->bin_attrs[0] && grp->is_bin_visible) return grp->is_bin_visible(kobj, grp->bin_attrs[0], 0); return 0; } static int create_files(struct kernfs_node *parent, struct kobject *kobj, kuid_t uid, kgid_t gid, const struct attribute_group *grp, int update) { struct attribute *const *attr; const struct bin_attribute *const *bin_attr; int error = 0, i; if (grp->attrs) { for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) { umode_t mode = (*attr)->mode; /* * In update mode, we're changing the permissions or * visibility. Do this by first removing then * re-adding (if required) the file. */ if (update) kernfs_remove_by_name(parent, (*attr)->name); if (grp->is_visible || grp->is_visible_const) { if (grp->is_visible) mode = grp->is_visible(kobj, *attr, i); else mode = grp->is_visible_const(kobj, *attr, i); mode &= ~SYSFS_GROUP_INVISIBLE; if (!mode) continue; } WARN(mode & ~(SYSFS_PREALLOC | 0664), "Attribute %s: Invalid permissions 0%o\n", (*attr)->name, mode); mode &= SYSFS_PREALLOC | 0664; error = sysfs_add_file_mode_ns(parent, *attr, mode, uid, gid, NULL); if (unlikely(error)) break; } if (error) { remove_files(parent, grp); goto exit; } } if (grp->bin_attrs) { for (i = 0, bin_attr = grp->bin_attrs; *bin_attr; i++, bin_attr++) { umode_t mode = (*bin_attr)->attr.mode; size_t size = (*bin_attr)->size; if (update) kernfs_remove_by_name(parent, (*bin_attr)->attr.name); if (grp->is_bin_visible) { mode = grp->is_bin_visible(kobj, *bin_attr, i); mode &= ~SYSFS_GROUP_INVISIBLE; if (!mode) continue; } if (grp->bin_size) size = grp->bin_size(kobj, *bin_attr, i); WARN(mode & ~(SYSFS_PREALLOC | 0664), "Attribute %s: Invalid permissions 0%o\n", (*bin_attr)->attr.name, mode); mode &= SYSFS_PREALLOC | 0664; error = sysfs_add_bin_file_mode_ns(parent, *bin_attr, mode, size, uid, gid, NULL); if (error) break; } if (error) remove_files(parent, grp); } exit: return error; } static int internal_create_group(struct kobject *kobj, int update, const struct attribute_group *grp) { struct kernfs_node *kn; kuid_t uid; kgid_t gid; int error; if (WARN_ON(!kobj || (!update && !kobj->sd))) return -EINVAL; /* Updates may happen before the object has been instantiated */ if (unlikely(update && !kobj->sd)) return -EINVAL; if (!grp->attrs && !grp->bin_attrs) { pr_debug("sysfs: (bin_)attrs not set by subsystem for group: %s/%s, skipping\n", kobj->name, grp->name ?: ""); return 0; } kobject_get_ownership(kobj, &uid, &gid); if (grp->name) { umode_t mode = __first_visible(grp, 
kobj); if (mode & SYSFS_GROUP_INVISIBLE) mode = 0; else mode = S_IRWXU | S_IRUGO | S_IXUGO; if (update) { kn = kernfs_find_and_get(kobj->sd, grp->name); if (!kn) { pr_debug("attr grp %s/%s not created yet\n", kobj->name, grp->name); /* may have been invisible prior to this update */ update = 0; } else if (!mode) { sysfs_remove_group(kobj, grp); kernfs_put(kn); return 0; } } if (!update) { if (!mode) return 0; kn = kernfs_create_dir_ns(kobj->sd, grp->name, mode, uid, gid, kobj, NULL); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(kobj->sd, grp->name); return PTR_ERR(kn); } } } else { kn = kobj->sd; } kernfs_get(kn); error = create_files(kn, kobj, uid, gid, grp, update); if (error) { if (grp->name) kernfs_remove(kn); } kernfs_put(kn); if (grp->name && update) kernfs_put(kn); return error; } /** * sysfs_create_group - given a directory kobject, create an attribute group * @kobj: The kobject to create the group on * @grp: The attribute group to create * * This function creates a group for the first time. It will explicitly * warn and error if any of the attribute files being created already exist. * * Returns 0 on success or error code on failure. */ int sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp) { return internal_create_group(kobj, 0, grp); } EXPORT_SYMBOL_GPL(sysfs_create_group); static int internal_create_groups(struct kobject *kobj, int update, const struct attribute_group *const *groups) { int error = 0; int i; if (!groups) return 0; for (i = 0; groups[i]; i++) { error = internal_create_group(kobj, update, groups[i]); if (error) { while (--i >= 0) sysfs_remove_group(kobj, groups[i]); break; } } return error; } /** * sysfs_create_groups - given a directory kobject, create a bunch of attribute groups * @kobj: The kobject to create the group on * @groups: The attribute groups to create, NULL terminated * * This function creates a bunch of attribute groups. If an error occurs when * creating a group, all previously created groups will be removed, unwinding * everything back to the original state when this function was called. * It will explicitly warn and error if any of the attribute files being * created already exist. * * Returns 0 on success or error code from sysfs_create_group on failure. */ int sysfs_create_groups(struct kobject *kobj, const struct attribute_group *const *groups) { return internal_create_groups(kobj, 0, groups); } EXPORT_SYMBOL_GPL(sysfs_create_groups); /** * sysfs_update_groups - given a directory kobject, create a bunch of attribute groups * @kobj: The kobject to update the group on * @groups: The attribute groups to update, NULL terminated * * This function update a bunch of attribute groups. If an error occurs when * updating a group, all previously updated groups will be removed together * with already existing (not updated) attributes. * * Returns 0 on success or error code from sysfs_update_group on failure. */ int sysfs_update_groups(struct kobject *kobj, const struct attribute_group *const *groups) { return internal_create_groups(kobj, 1, groups); } EXPORT_SYMBOL_GPL(sysfs_update_groups); /** * sysfs_update_group - given a directory kobject, update an attribute group * @kobj: The kobject to update the group on * @grp: The attribute group to update * * This function updates an attribute group. Unlike * sysfs_create_group(), it will explicitly not warn or error if any * of the attribute files being created already exist. 
Furthermore, * if the visibility of the files has changed through the is_visible() * callback, it will update the permissions and add or remove the * relevant files. Changing a group's name (subdirectory name under * kobj's directory in sysfs) is not allowed. * * The primary use for this function is to call it after making a change * that affects group visibility. * * Returns 0 on success or error code on failure. */ int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp) { return internal_create_group(kobj, 1, grp); } EXPORT_SYMBOL_GPL(sysfs_update_group); /** * sysfs_remove_group: remove a group from a kobject * @kobj: kobject to remove the group from * @grp: group to remove * * This function removes a group of attributes from a kobject. The attributes * previously have to have been created for this group, otherwise it will fail. */ void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp) { struct kernfs_node *parent = kobj->sd; struct kernfs_node *kn; if (grp->name) { kn = kernfs_find_and_get(parent, grp->name); if (!kn) { pr_debug("sysfs group '%s' not found for kobject '%s'\n", grp->name, kobject_name(kobj)); return; } } else { kn = parent; kernfs_get(kn); } remove_files(kn, grp); if (grp->name) kernfs_remove(kn); kernfs_put(kn); } EXPORT_SYMBOL_GPL(sysfs_remove_group); /** * sysfs_remove_groups - remove a list of groups * * @kobj: The kobject for the groups to be removed from * @groups: NULL terminated list of groups to be removed * * If groups is not NULL, remove the specified groups from the kobject. */ void sysfs_remove_groups(struct kobject *kobj, const struct attribute_group *const *groups) { int i; if (!groups) return; for (i = 0; groups[i]; i++) sysfs_remove_group(kobj, groups[i]); } EXPORT_SYMBOL_GPL(sysfs_remove_groups); /** * sysfs_merge_group - merge files into a pre-existing named attribute group. * @kobj: The kobject containing the group. * @grp: The files to create and the attribute group they belong to. * * This function returns an error if the group doesn't exist, the .name field is * NULL or any of the files already exist in that group, in which case none of * the new files are created. */ int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp) { struct kernfs_node *parent; kuid_t uid; kgid_t gid; int error = 0; struct attribute *const *attr; int i; parent = kernfs_find_and_get(kobj->sd, grp->name); if (!parent) return -ENOENT; kobject_get_ownership(kobj, &uid, &gid); for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr)) error = sysfs_add_file_mode_ns(parent, *attr, (*attr)->mode, uid, gid, NULL); if (error) { while (--i >= 0) kernfs_remove_by_name(parent, (*--attr)->name); } kernfs_put(parent); return error; } EXPORT_SYMBOL_GPL(sysfs_merge_group); /** * sysfs_unmerge_group - remove files from a pre-existing named attribute group. * @kobj: The kobject containing the group. * @grp: The files to remove and the attribute group they belong to. */ void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp) { struct kernfs_node *parent; struct attribute *const *attr; parent = kernfs_find_and_get(kobj->sd, grp->name); if (parent) { for (attr = grp->attrs; *attr; ++attr) kernfs_remove_by_name(parent, (*attr)->name); kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_unmerge_group); /** * sysfs_add_link_to_group - add a symlink to an attribute group. * @kobj: The kobject containing the group. * @group_name: The name of the group. 
* @target: The target kobject of the symlink to create. * @link_name: The name of the symlink to create. */ int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name) { struct kernfs_node *parent; int error = 0; parent = kernfs_find_and_get(kobj->sd, group_name); if (!parent) return -ENOENT; error = sysfs_create_link_sd(parent, target, link_name); kernfs_put(parent); return error; } EXPORT_SYMBOL_GPL(sysfs_add_link_to_group); /** * sysfs_remove_link_from_group - remove a symlink from an attribute group. * @kobj: The kobject containing the group. * @group_name: The name of the group. * @link_name: The name of the symlink to remove. */ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name) { struct kernfs_node *parent; parent = kernfs_find_and_get(kobj->sd, group_name); if (parent) { kernfs_remove_by_name(parent, link_name); kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); /** * compat_only_sysfs_link_entry_to_kobj - add a symlink to a kobject pointing * to a group or an attribute * @kobj: The kobject containing the group. * @target_kobj: The target kobject. * @target_name: The name of the target group or attribute. * @symlink_name: The name of the symlink file (target_name will be * considered if symlink_name is NULL). */ int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, struct kobject *target_kobj, const char *target_name, const char *symlink_name) { struct kernfs_node *target; struct kernfs_node *entry; struct kernfs_node *link; /* * We don't own @target_kobj and it may be removed at any time. * Synchronize using sysfs_symlink_target_lock. See sysfs_remove_dir() * for details. */ spin_lock(&sysfs_symlink_target_lock); target = target_kobj->sd; if (target) kernfs_get(target); spin_unlock(&sysfs_symlink_target_lock); if (!target) return -ENOENT; entry = kernfs_find_and_get(target, target_name); if (!entry) { kernfs_put(target); return -ENOENT; } if (!symlink_name) symlink_name = target_name; link = kernfs_create_link(kobj->sd, symlink_name, entry); if (PTR_ERR(link) == -EEXIST) sysfs_warn_dup(kobj->sd, symlink_name); kernfs_put(entry); kernfs_put(target); return PTR_ERR_OR_ZERO(link); } EXPORT_SYMBOL_GPL(compat_only_sysfs_link_entry_to_kobj); static int sysfs_group_attrs_change_owner(struct kobject *kobj, struct kernfs_node *grp_kn, const struct attribute_group *grp, struct iattr *newattrs) { struct kernfs_node *kn; int error, i; umode_t mode; if (grp->attrs) { struct attribute *const *attr; for (i = 0, attr = grp->attrs; *attr; i++, attr++) { if (grp->is_visible || grp->is_visible_const) { if (grp->is_visible) mode = grp->is_visible(kobj, *attr, i); else mode = grp->is_visible_const(kobj, *attr, i); if (mode & SYSFS_GROUP_INVISIBLE) break; if (!mode) continue; } kn = kernfs_find_and_get(grp_kn, (*attr)->name); if (!kn) return -ENOENT; error = kernfs_setattr(kn, newattrs); kernfs_put(kn); if (error) return error; } } if (grp->bin_attrs) { const struct bin_attribute *const *bin_attr; for (i = 0, bin_attr = grp->bin_attrs; *bin_attr; i++, bin_attr++) { if (grp->is_bin_visible) { mode = grp->is_bin_visible(kobj, *bin_attr, i); if (mode & SYSFS_GROUP_INVISIBLE) break; if (!mode) continue; } kn = kernfs_find_and_get(grp_kn, (*bin_attr)->attr.name); if (!kn) return -ENOENT; error = kernfs_setattr(kn, newattrs); kernfs_put(kn); if (error) return error; } } return 0; } /** * sysfs_group_change_owner - change owner of an attribute group. 
 * @kobj: The kobject containing the group.
 * @grp: The attribute group.
 * @kuid: new owner's kuid
 * @kgid: new owner's kgid
 *
 * Returns 0 on success or error code on failure.
 */
int sysfs_group_change_owner(struct kobject *kobj,
			     const struct attribute_group *grp,
			     kuid_t kuid, kgid_t kgid)
{
	struct kernfs_node *grp_kn;
	int error;
	struct iattr newattrs = {
		.ia_valid = ATTR_UID | ATTR_GID,
		.ia_uid = kuid,
		.ia_gid = kgid,
	};

	if (!kobj->state_in_sysfs)
		return -EINVAL;

	if (grp->name) {
		grp_kn = kernfs_find_and_get(kobj->sd, grp->name);
	} else {
		kernfs_get(kobj->sd);
		grp_kn = kobj->sd;
	}
	if (!grp_kn)
		return -ENOENT;

	error = kernfs_setattr(grp_kn, &newattrs);
	if (!error)
		error = sysfs_group_attrs_change_owner(kobj, grp_kn, grp,
						       &newattrs);

	kernfs_put(grp_kn);

	return error;
}
EXPORT_SYMBOL_GPL(sysfs_group_change_owner);

/**
 * sysfs_groups_change_owner - change owner of a set of attribute groups.
 * @kobj: The kobject containing the groups.
 * @groups: The attribute groups.
 * @kuid: new owner's kuid
 * @kgid: new owner's kgid
 *
 * Returns 0 on success or error code on failure.
 */
int sysfs_groups_change_owner(struct kobject *kobj,
			      const struct attribute_group *const *groups,
			      kuid_t kuid, kgid_t kgid)
{
	int error = 0, i;

	if (!kobj->state_in_sysfs)
		return -EINVAL;

	if (!groups)
		return 0;

	for (i = 0; groups[i]; i++) {
		error = sysfs_group_change_owner(kobj, groups[i], kuid, kgid);
		if (error)
			break;
	}

	return error;
}
EXPORT_SYMBOL_GPL(sysfs_groups_change_owner);
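/*
 * Illustrative sketch (not part of fs/sysfs/group.c): how a driver might
 * typically pair sysfs_create_group() with sysfs_update_group() when an
 * is_visible() callback controls which files appear. The device, attribute
 * and state names below are hypothetical placeholders used only for this
 * example; only the sysfs/attribute-group APIs themselves are real.
 */
#include <linux/device.h>
#include <linux/sysfs.h>

static bool example_feature_enabled;	/* hypothetical device state */

static ssize_t example_show(struct device *dev, struct device_attribute *attr,
			    char *buf)
{
	return sysfs_emit(buf, "%d\n", example_feature_enabled);
}
static DEVICE_ATTR_RO(example);

static struct attribute *example_attrs[] = {
	&dev_attr_example.attr,
	NULL
};

static umode_t example_attr_visible(struct kobject *kobj,
				    struct attribute *attr, int n)
{
	/* Hide the file entirely while the feature is disabled. */
	return example_feature_enabled ? attr->mode : 0;
}

static const struct attribute_group example_group = {
	.name = "example",		/* creates an "example" subdirectory */
	.attrs = example_attrs,
	.is_visible = example_attr_visible,
};

/* At probe time: create the group; is_visible() decides what shows up. */
static int example_probe(struct device *dev)
{
	return sysfs_create_group(&dev->kobj, &example_group);
}

/* After example_feature_enabled changes: re-evaluate visibility in place. */
static void example_feature_changed(struct device *dev)
{
	sysfs_update_group(&dev->kobj, &example_group);
}
/*
 * Note: returning SYSFS_GROUP_INVISIBLE from the is_visible() callback, as
 * handled by internal_create_group() above, appears to hide the named
 * subdirectory as a whole rather than individual files.
 */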
// SPDX-License-Identifier: GPL-2.0+
/*
 * NILFS segment constructor.
 *
 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
 *
 * Written by Ryusuke Konishi.
* */ #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/bitops.h> #include <linux/bio.h> #include <linux/completion.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/freezer.h> #include <linux/kthread.h> #include <linux/crc32.h> #include <linux/folio_batch.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include "nilfs.h" #include "btnode.h" #include "page.h" #include "segment.h" #include "sufile.h" #include "cpfile.h" #include "ifile.h" #include "segbuf.h" /* * Segment constructor */ #define SC_N_INODEVEC 16 /* Size of locally allocated inode vector */ #define SC_MAX_SEGDELTA 64 /* * Upper limit of the number of segments * appended in collection retry loop */ /* Construction mode */ enum { SC_LSEG_SR = 1, /* Make a logical segment having a super root */ SC_LSEG_DSYNC, /* * Flush data blocks of a given file and make * a logical segment without a super root. */ SC_FLUSH_FILE, /* * Flush data files, leads to segment writes without * creating a checkpoint. */ SC_FLUSH_DAT, /* * Flush DAT file. This also creates segments * without a checkpoint. */ }; /* Stage numbers of dirty block collection */ enum { NILFS_ST_INIT = 0, NILFS_ST_GC, /* Collecting dirty blocks for GC */ NILFS_ST_FILE, NILFS_ST_IFILE, NILFS_ST_CPFILE, NILFS_ST_SUFILE, NILFS_ST_DAT, NILFS_ST_SR, /* Super root */ NILFS_ST_DSYNC, /* Data sync blocks */ NILFS_ST_DONE, }; #define CREATE_TRACE_POINTS #include <trace/events/nilfs2.h> /* * nilfs_sc_cstage_inc(), nilfs_sc_cstage_set(), nilfs_sc_cstage_get() are * wrapper functions of stage count (nilfs_sc_info->sc_stage.scnt). Users of * the variable must use them because transition of stage count must involve * trace events (trace_nilfs2_collection_stage_transition). * * nilfs_sc_cstage_get() isn't required for the above purpose because it doesn't * produce tracepoint events. It is provided just for making the intention * clear. 
*/ static inline void nilfs_sc_cstage_inc(struct nilfs_sc_info *sci) { sci->sc_stage.scnt++; trace_nilfs2_collection_stage_transition(sci); } static inline void nilfs_sc_cstage_set(struct nilfs_sc_info *sci, int next_scnt) { sci->sc_stage.scnt = next_scnt; trace_nilfs2_collection_stage_transition(sci); } static inline int nilfs_sc_cstage_get(struct nilfs_sc_info *sci) { return sci->sc_stage.scnt; } /* State flags of collection */ #define NILFS_CF_NODE 0x0001 /* Collecting node blocks */ #define NILFS_CF_IFILE_STARTED 0x0002 /* IFILE stage has started */ #define NILFS_CF_SUFREED 0x0004 /* segment usages has been freed */ #define NILFS_CF_HISTORY_MASK (NILFS_CF_IFILE_STARTED | NILFS_CF_SUFREED) /* Operations depending on the construction mode and file type */ struct nilfs_sc_operations { int (*collect_data)(struct nilfs_sc_info *, struct buffer_head *, struct inode *); int (*collect_node)(struct nilfs_sc_info *, struct buffer_head *, struct inode *); int (*collect_bmap)(struct nilfs_sc_info *, struct buffer_head *, struct inode *); void (*write_data_binfo)(struct nilfs_sc_info *, struct nilfs_segsum_pointer *, union nilfs_binfo *); void (*write_node_binfo)(struct nilfs_sc_info *, struct nilfs_segsum_pointer *, union nilfs_binfo *); }; /* * Other definitions */ static void nilfs_segctor_start_timer(struct nilfs_sc_info *); static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int); static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *); static void nilfs_dispose_list(struct the_nilfs *, struct list_head *, int); #define nilfs_cnt32_ge(a, b) \ (typecheck(__u32, a) && typecheck(__u32, b) && \ ((__s32)((a) - (b)) >= 0)) static int nilfs_prepare_segment_lock(struct super_block *sb, struct nilfs_transaction_info *ti) { struct nilfs_transaction_info *cur_ti = current->journal_info; void *save = NULL; if (cur_ti) { if (cur_ti->ti_magic == NILFS_TI_MAGIC) return ++cur_ti->ti_count; /* * If journal_info field is occupied by other FS, * it is saved and will be restored on * nilfs_transaction_commit(). */ nilfs_warn(sb, "journal info from a different FS"); save = current->journal_info; } if (!ti) { ti = kmem_cache_alloc(nilfs_transaction_cachep, GFP_NOFS); if (!ti) return -ENOMEM; ti->ti_flags = NILFS_TI_DYNAMIC_ALLOC; } else { ti->ti_flags = 0; } ti->ti_count = 0; ti->ti_save = save; ti->ti_magic = NILFS_TI_MAGIC; current->journal_info = ti; return 0; } /** * nilfs_transaction_begin - start indivisible file operations. * @sb: super block * @ti: nilfs_transaction_info * @vacancy_check: flags for vacancy rate checks * * nilfs_transaction_begin() acquires a reader/writer semaphore, called * the segment semaphore, to make a segment construction and write tasks * exclusive. The function is used with nilfs_transaction_commit() in pairs. * The region enclosed by these two functions can be nested. To avoid a * deadlock, the semaphore is only acquired or released in the outermost call. * * This function allocates a nilfs_transaction_info struct to keep context * information on it. It is initialized and hooked onto the current task in * the outermost call. If a pre-allocated struct is given to @ti, it is used * instead; otherwise a new struct is assigned from a slab. * * When @vacancy_check flag is set, this function will check the amount of * free space, and will wait for the GC to reclaim disk space if low capacity. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-ENOMEM - Insufficient memory available. 
* * %-ENOSPC - No space left on device (if checking free space). */ int nilfs_transaction_begin(struct super_block *sb, struct nilfs_transaction_info *ti, int vacancy_check) { struct the_nilfs *nilfs; int ret = nilfs_prepare_segment_lock(sb, ti); struct nilfs_transaction_info *trace_ti; if (unlikely(ret < 0)) return ret; if (ret > 0) { trace_ti = current->journal_info; trace_nilfs2_transaction_transition(sb, trace_ti, trace_ti->ti_count, trace_ti->ti_flags, TRACE_NILFS2_TRANSACTION_BEGIN); return 0; } sb_start_intwrite(sb); nilfs = sb->s_fs_info; down_read(&nilfs->ns_segctor_sem); if (vacancy_check && nilfs_near_disk_full(nilfs)) { up_read(&nilfs->ns_segctor_sem); ret = -ENOSPC; goto failed; } trace_ti = current->journal_info; trace_nilfs2_transaction_transition(sb, trace_ti, trace_ti->ti_count, trace_ti->ti_flags, TRACE_NILFS2_TRANSACTION_BEGIN); return 0; failed: ti = current->journal_info; current->journal_info = ti->ti_save; if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) kmem_cache_free(nilfs_transaction_cachep, ti); sb_end_intwrite(sb); return ret; } /** * nilfs_transaction_commit - commit indivisible file operations. * @sb: super block * * nilfs_transaction_commit() releases the read semaphore which is * acquired by nilfs_transaction_begin(). This is only performed * in outermost call of this function. If a commit flag is set, * nilfs_transaction_commit() sets a timer to start the segment * constructor. If a sync flag is set, it starts construction * directly. * * Return: 0 on success, or a negative error code on failure. */ int nilfs_transaction_commit(struct super_block *sb) { struct nilfs_transaction_info *ti = current->journal_info; struct the_nilfs *nilfs = sb->s_fs_info; int err = 0; BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); ti->ti_flags |= NILFS_TI_COMMIT; if (ti->ti_count > 0) { ti->ti_count--; trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, ti->ti_flags, TRACE_NILFS2_TRANSACTION_COMMIT); return 0; } if (nilfs->ns_writer) { struct nilfs_sc_info *sci = nilfs->ns_writer; if (ti->ti_flags & NILFS_TI_COMMIT) nilfs_segctor_start_timer(sci); if (atomic_read(&nilfs->ns_ndirtyblks) > sci->sc_watermark) nilfs_segctor_do_flush(sci, 0); } up_read(&nilfs->ns_segctor_sem); trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, ti->ti_flags, TRACE_NILFS2_TRANSACTION_COMMIT); current->journal_info = ti->ti_save; if (ti->ti_flags & NILFS_TI_SYNC) err = nilfs_construct_segment(sb); if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) kmem_cache_free(nilfs_transaction_cachep, ti); sb_end_intwrite(sb); return err; } void nilfs_transaction_abort(struct super_block *sb) { struct nilfs_transaction_info *ti = current->journal_info; struct the_nilfs *nilfs = sb->s_fs_info; BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); if (ti->ti_count > 0) { ti->ti_count--; trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, ti->ti_flags, TRACE_NILFS2_TRANSACTION_ABORT); return; } up_read(&nilfs->ns_segctor_sem); trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, ti->ti_flags, TRACE_NILFS2_TRANSACTION_ABORT); current->journal_info = ti->ti_save; if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) kmem_cache_free(nilfs_transaction_cachep, ti); sb_end_intwrite(sb); } void nilfs_relax_pressure_in_lock(struct super_block *sb) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_sc_info *sci = nilfs->ns_writer; if (sb_rdonly(sb) || unlikely(!sci) || !sci->sc_flush_request) return; set_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags); up_read(&nilfs->ns_segctor_sem); down_write(&nilfs->ns_segctor_sem); 
if (sci->sc_flush_request && test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags)) { struct nilfs_transaction_info *ti = current->journal_info; ti->ti_flags |= NILFS_TI_WRITER; nilfs_segctor_do_immediate_flush(sci); ti->ti_flags &= ~NILFS_TI_WRITER; } downgrade_write(&nilfs->ns_segctor_sem); } static void nilfs_transaction_lock(struct super_block *sb, struct nilfs_transaction_info *ti, int gcflag) { struct nilfs_transaction_info *cur_ti = current->journal_info; struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_sc_info *sci = nilfs->ns_writer; WARN_ON(cur_ti); ti->ti_flags = NILFS_TI_WRITER; ti->ti_count = 0; ti->ti_save = cur_ti; ti->ti_magic = NILFS_TI_MAGIC; current->journal_info = ti; for (;;) { trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, ti->ti_flags, TRACE_NILFS2_TRANSACTION_TRYLOCK); down_write(&nilfs->ns_segctor_sem); if (!test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags)) break; nilfs_segctor_do_immediate_flush(sci); up_write(&nilfs->ns_segctor_sem); cond_resched(); } if (gcflag) ti->ti_flags |= NILFS_TI_GC; trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, ti->ti_flags, TRACE_NILFS2_TRANSACTION_LOCK); } static void nilfs_transaction_unlock(struct super_block *sb) { struct nilfs_transaction_info *ti = current->journal_info; struct the_nilfs *nilfs = sb->s_fs_info; BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); BUG_ON(ti->ti_count > 0); up_write(&nilfs->ns_segctor_sem); current->journal_info = ti->ti_save; trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, ti->ti_flags, TRACE_NILFS2_TRANSACTION_UNLOCK); } static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci, struct nilfs_segsum_pointer *ssp, unsigned int bytes) { struct nilfs_segment_buffer *segbuf = sci->sc_curseg; unsigned int blocksize = sci->sc_super->s_blocksize; void *p; if (unlikely(ssp->offset + bytes > blocksize)) { ssp->offset = 0; BUG_ON(NILFS_SEGBUF_BH_IS_LAST(ssp->bh, &segbuf->sb_segsum_buffers)); ssp->bh = NILFS_SEGBUF_NEXT_BH(ssp->bh); } p = ssp->bh->b_data + ssp->offset; ssp->offset += bytes; return p; } /** * nilfs_segctor_reset_segment_buffer - reset the current segment buffer * @sci: nilfs_sc_info * * Return: 0 on success, or a negative error code on failure. */ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci) { struct nilfs_segment_buffer *segbuf = sci->sc_curseg; struct buffer_head *sumbh; unsigned int sumbytes; unsigned int flags = 0; int err; if (nilfs_doing_gc()) flags = NILFS_SS_GC; err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime, sci->sc_cno); if (unlikely(err)) return err; sumbh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers); sumbytes = segbuf->sb_sum.sumbytes; sci->sc_finfo_ptr.bh = sumbh; sci->sc_finfo_ptr.offset = sumbytes; sci->sc_binfo_ptr.bh = sumbh; sci->sc_binfo_ptr.offset = sumbytes; sci->sc_blk_cnt = sci->sc_datablk_cnt = 0; return 0; } /** * nilfs_segctor_zeropad_segsum - zero pad the rest of the segment summary area * @sci: segment constructor object * * nilfs_segctor_zeropad_segsum() zero-fills unallocated space at the end of * the current segment summary block. */ static void nilfs_segctor_zeropad_segsum(struct nilfs_sc_info *sci) { struct nilfs_segsum_pointer *ssp; ssp = sci->sc_blk_cnt > 0 ? 
&sci->sc_binfo_ptr : &sci->sc_finfo_ptr; if (ssp->offset < ssp->bh->b_size) memset(ssp->bh->b_data + ssp->offset, 0, ssp->bh->b_size - ssp->offset); } static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci) { sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks; if (NILFS_SEGBUF_IS_LAST(sci->sc_curseg, &sci->sc_segbufs)) return -E2BIG; /* * The current segment is filled up * (internal code) */ nilfs_segctor_zeropad_segsum(sci); sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg); return nilfs_segctor_reset_segment_buffer(sci); } static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci) { struct nilfs_segment_buffer *segbuf = sci->sc_curseg; int err; if (segbuf->sb_sum.nblocks >= segbuf->sb_rest_blocks) { err = nilfs_segctor_feed_segment(sci); if (err) return err; segbuf = sci->sc_curseg; } err = nilfs_segbuf_extend_payload(segbuf, &segbuf->sb_super_root); if (likely(!err)) segbuf->sb_sum.flags |= NILFS_SS_SR; return err; } /* * Functions for making segment summary and payloads */ static int nilfs_segctor_segsum_block_required( struct nilfs_sc_info *sci, const struct nilfs_segsum_pointer *ssp, unsigned int binfo_size) { unsigned int blocksize = sci->sc_super->s_blocksize; /* Size of finfo and binfo is enough small against blocksize */ return ssp->offset + binfo_size + (!sci->sc_blk_cnt ? sizeof(struct nilfs_finfo) : 0) > blocksize; } static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci, struct inode *inode) { sci->sc_curseg->sb_sum.nfinfo++; sci->sc_binfo_ptr = sci->sc_finfo_ptr; nilfs_segctor_map_segsum_entry( sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo)); if (NILFS_I(inode)->i_root && !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); /* skip finfo */ } static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci, struct inode *inode) { struct nilfs_finfo *finfo; struct nilfs_inode_info *ii; struct nilfs_segment_buffer *segbuf; __u64 cno; if (sci->sc_blk_cnt == 0) return; ii = NILFS_I(inode); if (ii->i_type & NILFS_I_TYPE_GC) cno = ii->i_cno; else if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) cno = 0; else cno = sci->sc_cno; finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr, sizeof(*finfo)); finfo->fi_ino = cpu_to_le64(inode->i_ino); finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt); finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt); finfo->fi_cno = cpu_to_le64(cno); segbuf = sci->sc_curseg; segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset + sci->sc_super->s_blocksize * (segbuf->sb_sum.nsumblk - 1); sci->sc_finfo_ptr = sci->sc_binfo_ptr; sci->sc_blk_cnt = sci->sc_datablk_cnt = 0; } static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci, struct buffer_head *bh, struct inode *inode, unsigned int binfo_size) { struct nilfs_segment_buffer *segbuf; int required, err = 0; retry: segbuf = sci->sc_curseg; required = nilfs_segctor_segsum_block_required( sci, &sci->sc_binfo_ptr, binfo_size); if (segbuf->sb_sum.nblocks + required + 1 > segbuf->sb_rest_blocks) { nilfs_segctor_end_finfo(sci, inode); err = nilfs_segctor_feed_segment(sci); if (err) return err; goto retry; } if (unlikely(required)) { nilfs_segctor_zeropad_segsum(sci); err = nilfs_segbuf_extend_segsum(segbuf); if (unlikely(err)) goto failed; } if (sci->sc_blk_cnt == 0) nilfs_segctor_begin_finfo(sci, inode); nilfs_segctor_map_segsum_entry(sci, &sci->sc_binfo_ptr, binfo_size); /* Substitution to vblocknr is delayed until update_blocknr() */ nilfs_segbuf_add_file_buffer(segbuf, bh); sci->sc_blk_cnt++; failed: return err; } /* * 
Callback functions that enumerate, mark, and collect dirty blocks */ static int nilfs_collect_file_data(struct nilfs_sc_info *sci, struct buffer_head *bh, struct inode *inode) { int err; err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); if (err < 0) return err; err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(struct nilfs_binfo_v)); if (!err) sci->sc_datablk_cnt++; return err; } static int nilfs_collect_file_node(struct nilfs_sc_info *sci, struct buffer_head *bh, struct inode *inode) { return nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); } static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci, struct buffer_head *bh, struct inode *inode) { WARN_ON(!buffer_dirty(bh)); return nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64)); } static void nilfs_write_file_data_binfo(struct nilfs_sc_info *sci, struct nilfs_segsum_pointer *ssp, union nilfs_binfo *binfo) { struct nilfs_binfo_v *binfo_v = nilfs_segctor_map_segsum_entry( sci, ssp, sizeof(*binfo_v)); *binfo_v = binfo->bi_v; } static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci, struct nilfs_segsum_pointer *ssp, union nilfs_binfo *binfo) { __le64 *vblocknr = nilfs_segctor_map_segsum_entry( sci, ssp, sizeof(*vblocknr)); *vblocknr = binfo->bi_v.bi_vblocknr; } static const struct nilfs_sc_operations nilfs_sc_file_ops = { .collect_data = nilfs_collect_file_data, .collect_node = nilfs_collect_file_node, .collect_bmap = nilfs_collect_file_bmap, .write_data_binfo = nilfs_write_file_data_binfo, .write_node_binfo = nilfs_write_file_node_binfo, }; static int nilfs_collect_dat_data(struct nilfs_sc_info *sci, struct buffer_head *bh, struct inode *inode) { int err; err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); if (err < 0) return err; err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64)); if (!err) sci->sc_datablk_cnt++; return err; } static int nilfs_collect_dat_bmap(struct nilfs_sc_info *sci, struct buffer_head *bh, struct inode *inode) { WARN_ON(!buffer_dirty(bh)); return nilfs_segctor_add_file_block(sci, bh, inode, sizeof(struct nilfs_binfo_dat)); } static void nilfs_write_dat_data_binfo(struct nilfs_sc_info *sci, struct nilfs_segsum_pointer *ssp, union nilfs_binfo *binfo) { __le64 *blkoff = nilfs_segctor_map_segsum_entry(sci, ssp, sizeof(*blkoff)); *blkoff = binfo->bi_dat.bi_blkoff; } static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci, struct nilfs_segsum_pointer *ssp, union nilfs_binfo *binfo) { struct nilfs_binfo_dat *binfo_dat = nilfs_segctor_map_segsum_entry(sci, ssp, sizeof(*binfo_dat)); *binfo_dat = binfo->bi_dat; } static const struct nilfs_sc_operations nilfs_sc_dat_ops = { .collect_data = nilfs_collect_dat_data, .collect_node = nilfs_collect_file_node, .collect_bmap = nilfs_collect_dat_bmap, .write_data_binfo = nilfs_write_dat_data_binfo, .write_node_binfo = nilfs_write_dat_node_binfo, }; static const struct nilfs_sc_operations nilfs_sc_dsync_ops = { .collect_data = nilfs_collect_file_data, .collect_node = NULL, .collect_bmap = NULL, .write_data_binfo = nilfs_write_file_data_binfo, .write_node_binfo = NULL, }; static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, struct list_head *listp, size_t nlimit, loff_t start, loff_t end) { struct address_space *mapping = inode->i_mapping; struct folio_batch fbatch; pgoff_t index = 0, last = ULONG_MAX; size_t ndirties = 0; int i; if (unlikely(start != 0 || end != LLONG_MAX)) { /* * A valid range is given for sync-ing data pages. 
The * range is rounded to per-page; extra dirty buffers * may be included if blocksize < pagesize. */ index = start >> PAGE_SHIFT; last = end >> PAGE_SHIFT; } folio_batch_init(&fbatch); repeat: if (unlikely(index > last) || !filemap_get_folios_tag(mapping, &index, last, PAGECACHE_TAG_DIRTY, &fbatch)) return ndirties; for (i = 0; i < folio_batch_count(&fbatch); i++) { struct buffer_head *bh, *head; struct folio *folio = fbatch.folios[i]; folio_lock(folio); if (unlikely(folio->mapping != mapping)) { /* Exclude folios removed from the address space */ folio_unlock(folio); continue; } head = folio_buffers(folio); if (!head) head = create_empty_buffers(folio, i_blocksize(inode), 0); bh = head; do { if (!buffer_dirty(bh) || buffer_async_write(bh)) continue; get_bh(bh); list_add_tail(&bh->b_assoc_buffers, listp); ndirties++; if (unlikely(ndirties >= nlimit)) { folio_unlock(folio); folio_batch_release(&fbatch); cond_resched(); return ndirties; } } while (bh = bh->b_this_page, bh != head); folio_unlock(folio); } folio_batch_release(&fbatch); cond_resched(); goto repeat; } static void nilfs_lookup_dirty_node_buffers(struct inode *inode, struct list_head *listp) { struct nilfs_inode_info *ii = NILFS_I(inode); struct inode *btnc_inode = ii->i_assoc_inode; struct folio_batch fbatch; struct buffer_head *bh, *head; unsigned int i; pgoff_t index = 0; if (!btnc_inode) return; folio_batch_init(&fbatch); while (filemap_get_folios_tag(btnc_inode->i_mapping, &index, (pgoff_t)-1, PAGECACHE_TAG_DIRTY, &fbatch)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { bh = head = folio_buffers(fbatch.folios[i]); do { if (buffer_dirty(bh) && !buffer_async_write(bh)) { get_bh(bh); list_add_tail(&bh->b_assoc_buffers, listp); } bh = bh->b_this_page; } while (bh != head); } folio_batch_release(&fbatch); cond_resched(); } } static void nilfs_dispose_list(struct the_nilfs *nilfs, struct list_head *head, int force) { struct nilfs_inode_info *ii, *n; struct nilfs_inode_info *ivec[SC_N_INODEVEC], **pii; unsigned int nv = 0; while (!list_empty(head)) { spin_lock(&nilfs->ns_inode_lock); list_for_each_entry_safe(ii, n, head, i_dirty) { list_del_init(&ii->i_dirty); if (force) { if (unlikely(ii->i_bh)) { brelse(ii->i_bh); ii->i_bh = NULL; } } else if (test_bit(NILFS_I_DIRTY, &ii->i_state)) { set_bit(NILFS_I_QUEUED, &ii->i_state); list_add_tail(&ii->i_dirty, &nilfs->ns_dirty_files); continue; } ivec[nv++] = ii; if (nv == SC_N_INODEVEC) break; } spin_unlock(&nilfs->ns_inode_lock); for (pii = ivec; nv > 0; pii++, nv--) iput(&(*pii)->vfs_inode); } } static void nilfs_iput_work_func(struct work_struct *work) { struct nilfs_sc_info *sci = container_of(work, struct nilfs_sc_info, sc_iput_work); struct the_nilfs *nilfs = sci->sc_super->s_fs_info; nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 0); } static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs, struct nilfs_root *root) { int ret = 0; if (nilfs_mdt_fetch_dirty(root->ifile)) ret++; if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile)) ret++; if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile)) ret++; if ((ret || nilfs_doing_gc()) && nilfs_mdt_fetch_dirty(nilfs->ns_dat)) ret++; return ret; } static int nilfs_segctor_clean(struct nilfs_sc_info *sci) { return list_empty(&sci->sc_dirty_files) && !test_bit(NILFS_SC_DIRTY, &sci->sc_flags) && sci->sc_nfreesegs == 0 && (!nilfs_doing_gc() || list_empty(&sci->sc_gc_inodes)); } static int nilfs_segctor_confirm(struct nilfs_sc_info *sci) { struct the_nilfs *nilfs = sci->sc_super->s_fs_info; int ret = 0; if (nilfs_test_metadata_dirty(nilfs, sci->sc_root)) 
set_bit(NILFS_SC_DIRTY, &sci->sc_flags); spin_lock(&nilfs->ns_inode_lock); if (list_empty(&nilfs->ns_dirty_files) && nilfs_segctor_clean(sci)) ret++; spin_unlock(&nilfs->ns_inode_lock); return ret; } static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci) { struct the_nilfs *nilfs = sci->sc_super->s_fs_info; nilfs_mdt_clear_dirty(sci->sc_root->ifile); nilfs_mdt_clear_dirty(nilfs->ns_cpfile); nilfs_mdt_clear_dirty(nilfs->ns_sufile); nilfs_mdt_clear_dirty(nilfs->ns_dat); } static void nilfs_fill_in_file_bmap(struct inode *ifile, struct nilfs_inode_info *ii) { struct buffer_head *ibh; struct nilfs_inode *raw_inode; if (test_bit(NILFS_I_BMAP, &ii->i_state)) { ibh = ii->i_bh; BUG_ON(!ibh); raw_inode = nilfs_ifile_map_inode(ifile, ii->vfs_inode.i_ino, ibh); nilfs_bmap_write(ii->i_bmap, raw_inode); nilfs_ifile_unmap_inode(raw_inode); } } static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci) { struct nilfs_inode_info *ii; list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) { nilfs_fill_in_file_bmap(sci->sc_root->ifile, ii); set_bit(NILFS_I_COLLECTED, &ii->i_state); } } /** * nilfs_write_root_mdt_inode - export root metadata inode information to * the on-disk inode * @inode: inode object of the root metadata file * @raw_inode: on-disk inode * * nilfs_write_root_mdt_inode() writes inode information and bmap data of * @inode to the inode area of the metadata file allocated on the super root * block created to finalize the log. Since super root blocks are configured * each time, this function zero-fills the unused area of @raw_inode. */ static void nilfs_write_root_mdt_inode(struct inode *inode, struct nilfs_inode *raw_inode) { struct the_nilfs *nilfs = inode->i_sb->s_fs_info; nilfs_write_inode_common(inode, raw_inode); /* zero-fill unused portion of raw_inode */ raw_inode->i_xattr = 0; raw_inode->i_pad = 0; memset((void *)raw_inode + sizeof(*raw_inode), 0, nilfs->ns_inode_size - sizeof(*raw_inode)); nilfs_bmap_write(NILFS_I(inode)->i_bmap, raw_inode); } static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { struct buffer_head *bh_sr; struct nilfs_super_root *raw_sr; unsigned int isz, srsz; bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root; lock_buffer(bh_sr); raw_sr = (struct nilfs_super_root *)bh_sr->b_data; isz = nilfs->ns_inode_size; srsz = NILFS_SR_BYTES(isz); raw_sr->sr_sum = 0; /* Ensure initialization within this update */ raw_sr->sr_bytes = cpu_to_le16(srsz); raw_sr->sr_nongc_ctime = cpu_to_le64(nilfs_doing_gc() ? 
nilfs->ns_nongc_ctime : sci->sc_seg_ctime); raw_sr->sr_flags = 0; nilfs_write_root_mdt_inode(nilfs->ns_dat, (void *)raw_sr + NILFS_SR_DAT_OFFSET(isz)); nilfs_write_root_mdt_inode(nilfs->ns_cpfile, (void *)raw_sr + NILFS_SR_CPFILE_OFFSET(isz)); nilfs_write_root_mdt_inode(nilfs->ns_sufile, (void *)raw_sr + NILFS_SR_SUFILE_OFFSET(isz)); memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz); set_buffer_uptodate(bh_sr); unlock_buffer(bh_sr); } static void nilfs_redirty_inodes(struct list_head *head) { struct nilfs_inode_info *ii; list_for_each_entry(ii, head, i_dirty) { if (test_bit(NILFS_I_COLLECTED, &ii->i_state)) clear_bit(NILFS_I_COLLECTED, &ii->i_state); } } static void nilfs_drop_collected_inodes(struct list_head *head) { struct nilfs_inode_info *ii; list_for_each_entry(ii, head, i_dirty) { if (!test_and_clear_bit(NILFS_I_COLLECTED, &ii->i_state)) continue; clear_bit(NILFS_I_INODE_SYNC, &ii->i_state); set_bit(NILFS_I_UPDATED, &ii->i_state); } } static int nilfs_segctor_apply_buffers(struct nilfs_sc_info *sci, struct inode *inode, struct list_head *listp, int (*collect)(struct nilfs_sc_info *, struct buffer_head *, struct inode *)) { struct buffer_head *bh, *n; int err = 0; if (collect) { list_for_each_entry_safe(bh, n, listp, b_assoc_buffers) { list_del_init(&bh->b_assoc_buffers); err = collect(sci, bh, inode); brelse(bh); if (unlikely(err)) goto dispose_buffers; } return 0; } dispose_buffers: while (!list_empty(listp)) { bh = list_first_entry(listp, struct buffer_head, b_assoc_buffers); list_del_init(&bh->b_assoc_buffers); brelse(bh); } return err; } static size_t nilfs_segctor_buffer_rest(struct nilfs_sc_info *sci) { /* Remaining number of blocks within segment buffer */ return sci->sc_segbuf_nblocks - (sci->sc_nblk_this_inc + sci->sc_curseg->sb_sum.nblocks); } static int nilfs_segctor_scan_file(struct nilfs_sc_info *sci, struct inode *inode, const struct nilfs_sc_operations *sc_ops) { LIST_HEAD(data_buffers); LIST_HEAD(node_buffers); int err; if (!(sci->sc_stage.flags & NILFS_CF_NODE)) { size_t n, rest = nilfs_segctor_buffer_rest(sci); n = nilfs_lookup_dirty_data_buffers( inode, &data_buffers, rest + 1, 0, LLONG_MAX); if (n > rest) { err = nilfs_segctor_apply_buffers( sci, inode, &data_buffers, sc_ops->collect_data); BUG_ON(!err); /* always receive -E2BIG or true error */ goto break_or_fail; } } nilfs_lookup_dirty_node_buffers(inode, &node_buffers); if (!(sci->sc_stage.flags & NILFS_CF_NODE)) { err = nilfs_segctor_apply_buffers( sci, inode, &data_buffers, sc_ops->collect_data); if (unlikely(err)) { /* dispose node list */ nilfs_segctor_apply_buffers( sci, inode, &node_buffers, NULL); goto break_or_fail; } sci->sc_stage.flags |= NILFS_CF_NODE; } /* Collect node */ err = nilfs_segctor_apply_buffers( sci, inode, &node_buffers, sc_ops->collect_node); if (unlikely(err)) goto break_or_fail; nilfs_bmap_lookup_dirty_buffers(NILFS_I(inode)->i_bmap, &node_buffers); err = nilfs_segctor_apply_buffers( sci, inode, &node_buffers, sc_ops->collect_bmap); if (unlikely(err)) goto break_or_fail; nilfs_segctor_end_finfo(sci, inode); sci->sc_stage.flags &= ~NILFS_CF_NODE; break_or_fail: return err; } static int nilfs_segctor_scan_file_dsync(struct nilfs_sc_info *sci, struct inode *inode) { LIST_HEAD(data_buffers); size_t n, rest = nilfs_segctor_buffer_rest(sci); int err; n = nilfs_lookup_dirty_data_buffers(inode, &data_buffers, rest + 1, sci->sc_dsync_start, sci->sc_dsync_end); err = nilfs_segctor_apply_buffers(sci, inode, &data_buffers, nilfs_collect_file_data); if (!err) { 
nilfs_segctor_end_finfo(sci, inode); BUG_ON(n > rest); /* always receive -E2BIG or true error if n > rest */ } return err; } /** * nilfs_free_segments - free the segments given by an array of segment numbers * @nilfs: nilfs object * @segnumv: array of segment numbers to be freed * @nsegs: number of segments to be freed in @segnumv * * nilfs_free_segments() wraps nilfs_sufile_freev() and * nilfs_sufile_cancel_freev(), and edits the segment usage metadata file * (sufile) to free all segments given by @segnumv and @nsegs at once. If * it fails midway, it cancels the changes so that none of the segments are * freed. If @nsegs is 0, this function does nothing. * * The freeing of segments is not finalized until the writing of a log with * a super root block containing this sufile change is complete, and it can * be canceled with nilfs_sufile_cancel_freev() until then. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - Invalid segment number. * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. */ static int nilfs_free_segments(struct the_nilfs *nilfs, __u64 *segnumv, size_t nsegs) { size_t ndone; int ret; if (!nsegs) return 0; ret = nilfs_sufile_freev(nilfs->ns_sufile, segnumv, nsegs, &ndone); if (unlikely(ret)) { nilfs_sufile_cancel_freev(nilfs->ns_sufile, segnumv, ndone, NULL); /* * If a segment usage of the segments to be freed is in a * hole block, nilfs_sufile_freev() will return -ENOENT. * In this case, -EINVAL should be returned to the caller * since there is something wrong with the given segment * number array. This error can only occur during GC, so * there is no need to worry about it propagating to other * callers (such as fsync). */ if (ret == -ENOENT) { nilfs_err(nilfs->ns_sb, "The segment usage entry %llu to be freed is invalid (in a hole)", (unsigned long long)segnumv[ndone]); ret = -EINVAL; } } return ret; } static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) { struct the_nilfs *nilfs = sci->sc_super->s_fs_info; struct list_head *head; struct nilfs_inode_info *ii; int err = 0; switch (nilfs_sc_cstage_get(sci)) { case NILFS_ST_INIT: /* Pre-processes */ sci->sc_stage.flags = 0; if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) { sci->sc_nblk_inc = 0; sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN; if (mode == SC_LSEG_DSYNC) { nilfs_sc_cstage_set(sci, NILFS_ST_DSYNC); goto dsync_mode; } } sci->sc_stage.dirty_file_ptr = NULL; sci->sc_stage.gc_inode_ptr = NULL; if (mode == SC_FLUSH_DAT) { nilfs_sc_cstage_set(sci, NILFS_ST_DAT); goto dat_stage; } nilfs_sc_cstage_inc(sci); fallthrough; case NILFS_ST_GC: if (nilfs_doing_gc()) { head = &sci->sc_gc_inodes; ii = list_prepare_entry(sci->sc_stage.gc_inode_ptr, head, i_dirty); list_for_each_entry_continue(ii, head, i_dirty) { err = nilfs_segctor_scan_file( sci, &ii->vfs_inode, &nilfs_sc_file_ops); if (unlikely(err)) { sci->sc_stage.gc_inode_ptr = list_entry( ii->i_dirty.prev, struct nilfs_inode_info, i_dirty); goto break_or_fail; } set_bit(NILFS_I_COLLECTED, &ii->i_state); } sci->sc_stage.gc_inode_ptr = NULL; } nilfs_sc_cstage_inc(sci); fallthrough; case NILFS_ST_FILE: head = &sci->sc_dirty_files; ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head, i_dirty); list_for_each_entry_continue(ii, head, i_dirty) { clear_bit(NILFS_I_DIRTY, &ii->i_state); err = nilfs_segctor_scan_file(sci, &ii->vfs_inode, &nilfs_sc_file_ops); if (unlikely(err)) { sci->sc_stage.dirty_file_ptr = list_entry(ii->i_dirty.prev, struct 
nilfs_inode_info, i_dirty); goto break_or_fail; } /* sci->sc_stage.dirty_file_ptr = NILFS_I(inode); */ /* XXX: required ? */ } sci->sc_stage.dirty_file_ptr = NULL; if (mode == SC_FLUSH_FILE) { nilfs_sc_cstage_set(sci, NILFS_ST_DONE); return 0; } nilfs_sc_cstage_inc(sci); sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED; fallthrough; case NILFS_ST_IFILE: err = nilfs_segctor_scan_file(sci, sci->sc_root->ifile, &nilfs_sc_file_ops); if (unlikely(err)) break; nilfs_sc_cstage_inc(sci); /* Creating a checkpoint */ err = nilfs_cpfile_create_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno); if (unlikely(err)) break; fallthrough; case NILFS_ST_CPFILE: err = nilfs_segctor_scan_file(sci, nilfs->ns_cpfile, &nilfs_sc_file_ops); if (unlikely(err)) break; nilfs_sc_cstage_inc(sci); fallthrough; case NILFS_ST_SUFILE: err = nilfs_free_segments(nilfs, sci->sc_freesegs, sci->sc_nfreesegs); if (unlikely(err)) break; sci->sc_stage.flags |= NILFS_CF_SUFREED; err = nilfs_segctor_scan_file(sci, nilfs->ns_sufile, &nilfs_sc_file_ops); if (unlikely(err)) break; nilfs_sc_cstage_inc(sci); fallthrough; case NILFS_ST_DAT: dat_stage: err = nilfs_segctor_scan_file(sci, nilfs->ns_dat, &nilfs_sc_dat_ops); if (unlikely(err)) break; if (mode == SC_FLUSH_DAT) { nilfs_sc_cstage_set(sci, NILFS_ST_DONE); return 0; } nilfs_sc_cstage_inc(sci); fallthrough; case NILFS_ST_SR: if (mode == SC_LSEG_SR) { /* Appending a super root */ err = nilfs_segctor_add_super_root(sci); if (unlikely(err)) break; } /* End of a logical segment */ sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND; nilfs_sc_cstage_set(sci, NILFS_ST_DONE); return 0; case NILFS_ST_DSYNC: dsync_mode: sci->sc_curseg->sb_sum.flags |= NILFS_SS_SYNDT; ii = sci->sc_dsync_inode; if (!test_bit(NILFS_I_BUSY, &ii->i_state)) break; err = nilfs_segctor_scan_file_dsync(sci, &ii->vfs_inode); if (unlikely(err)) break; sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND; nilfs_sc_cstage_set(sci, NILFS_ST_DONE); return 0; case NILFS_ST_DONE: return 0; default: BUG(); } break_or_fail: return err; } /** * nilfs_segctor_begin_construction - setup segment buffer to make a new log * @sci: nilfs_sc_info * @nilfs: nilfs object * * Return: 0 on success, or a negative error code on failure. 
*/ static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { struct nilfs_segment_buffer *segbuf, *prev; __u64 nextnum; int err, alloc = 0; segbuf = nilfs_segbuf_new(sci->sc_super); if (unlikely(!segbuf)) return -ENOMEM; if (list_empty(&sci->sc_write_logs)) { nilfs_segbuf_map(segbuf, nilfs->ns_segnum, nilfs->ns_pseg_offset, nilfs); if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) { nilfs_shift_to_next_segment(nilfs); nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs); } segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq; nextnum = nilfs->ns_nextnum; if (nilfs->ns_segnum == nilfs->ns_nextnum) /* Start from the head of a new full segment */ alloc++; } else { /* Continue logs */ prev = NILFS_LAST_SEGBUF(&sci->sc_write_logs); nilfs_segbuf_map_cont(segbuf, prev); segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq; nextnum = prev->sb_nextnum; if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) { nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs); segbuf->sb_sum.seg_seq++; alloc++; } } err = nilfs_sufile_mark_dirty(nilfs->ns_sufile, segbuf->sb_segnum); if (err) goto failed; if (alloc) { err = nilfs_sufile_alloc(nilfs->ns_sufile, &nextnum); if (err) goto failed; } nilfs_segbuf_set_next_segnum(segbuf, nextnum, nilfs); BUG_ON(!list_empty(&sci->sc_segbufs)); list_add_tail(&segbuf->sb_list, &sci->sc_segbufs); sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks; return 0; failed: nilfs_segbuf_free(segbuf); return err; } static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci, struct the_nilfs *nilfs, int nadd) { struct nilfs_segment_buffer *segbuf, *prev; struct inode *sufile = nilfs->ns_sufile; __u64 nextnextnum; LIST_HEAD(list); int err, ret, i; prev = NILFS_LAST_SEGBUF(&sci->sc_segbufs); /* * Since the segment specified with nextnum might be allocated during * the previous construction, the buffer including its segusage may * not be dirty. The following call ensures that the buffer is dirty * and will pin the buffer on memory until the sufile is written. 
*/ err = nilfs_sufile_mark_dirty(sufile, prev->sb_nextnum); if (unlikely(err)) return err; for (i = 0; i < nadd; i++) { /* extend segment info */ err = -ENOMEM; segbuf = nilfs_segbuf_new(sci->sc_super); if (unlikely(!segbuf)) goto failed; /* map this buffer to region of segment on-disk */ nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs); sci->sc_segbuf_nblocks += segbuf->sb_rest_blocks; /* allocate the next next full segment */ err = nilfs_sufile_alloc(sufile, &nextnextnum); if (unlikely(err)) goto failed_segbuf; segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq + 1; nilfs_segbuf_set_next_segnum(segbuf, nextnextnum, nilfs); list_add_tail(&segbuf->sb_list, &list); prev = segbuf; } list_splice_tail(&list, &sci->sc_segbufs); return 0; failed_segbuf: nilfs_segbuf_free(segbuf); failed: list_for_each_entry(segbuf, &list, sb_list) { ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); WARN_ON(ret); /* never fails */ } nilfs_destroy_logs(&list); return err; } static void nilfs_free_incomplete_logs(struct list_head *logs, struct the_nilfs *nilfs) { struct nilfs_segment_buffer *segbuf, *prev; struct inode *sufile = nilfs->ns_sufile; int ret; segbuf = NILFS_FIRST_SEGBUF(logs); if (nilfs->ns_nextnum != segbuf->sb_nextnum) { ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); WARN_ON(ret); /* never fails */ } if (atomic_read(&segbuf->sb_err)) { /* Case 1: The first segment failed */ if (segbuf->sb_pseg_start != segbuf->sb_fseg_start) /* * Case 1a: Partial segment appended into an existing * segment */ nilfs_terminate_segment(nilfs, segbuf->sb_fseg_start, segbuf->sb_fseg_end); else /* Case 1b: New full segment */ set_nilfs_discontinued(nilfs); } prev = segbuf; list_for_each_entry_continue(segbuf, logs, sb_list) { if (prev->sb_nextnum != segbuf->sb_nextnum) { ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); WARN_ON(ret); /* never fails */ } if (atomic_read(&segbuf->sb_err) && segbuf->sb_segnum != nilfs->ns_nextnum) /* Case 2: extended segment (!= next) failed */ nilfs_sufile_set_error(sufile, segbuf->sb_segnum); prev = segbuf; } } static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci, struct inode *sufile) { struct nilfs_segment_buffer *segbuf; unsigned long live_blocks; int ret; list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { live_blocks = segbuf->sb_sum.nblocks + (segbuf->sb_pseg_start - segbuf->sb_fseg_start); ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum, live_blocks, sci->sc_seg_ctime); WARN_ON(ret); /* always succeed because the segusage is dirty */ } } static void nilfs_cancel_segusage(struct list_head *logs, struct inode *sufile) { struct nilfs_segment_buffer *segbuf; int ret; segbuf = NILFS_FIRST_SEGBUF(logs); ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum, segbuf->sb_pseg_start - segbuf->sb_fseg_start, 0); WARN_ON(ret); /* always succeed because the segusage is dirty */ list_for_each_entry_continue(segbuf, logs, sb_list) { ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum, 0, 0); WARN_ON(ret); /* always succeed */ } } static void nilfs_segctor_truncate_segments(struct nilfs_sc_info *sci, struct nilfs_segment_buffer *last, struct inode *sufile) { struct nilfs_segment_buffer *segbuf = last; int ret; list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) { sci->sc_segbuf_nblocks -= segbuf->sb_rest_blocks; ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); WARN_ON(ret); } nilfs_truncate_logs(&sci->sc_segbufs, last); } static int nilfs_segctor_collect(struct nilfs_sc_info *sci, struct the_nilfs *nilfs, int mode) { 
struct nilfs_cstage prev_stage = sci->sc_stage; int err, nadd = 1; /* Collection retry loop */ for (;;) { sci->sc_nblk_this_inc = 0; sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); err = nilfs_segctor_reset_segment_buffer(sci); if (unlikely(err)) goto failed; err = nilfs_segctor_collect_blocks(sci, mode); sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks; if (!err) break; if (unlikely(err != -E2BIG)) goto failed; /* The current segment is filled up */ if (mode != SC_LSEG_SR || nilfs_sc_cstage_get(sci) < NILFS_ST_CPFILE) break; nilfs_clear_logs(&sci->sc_segbufs); if (sci->sc_stage.flags & NILFS_CF_SUFREED) { err = nilfs_sufile_cancel_freev(nilfs->ns_sufile, sci->sc_freesegs, sci->sc_nfreesegs, NULL); WARN_ON(err); /* do not happen */ sci->sc_stage.flags &= ~NILFS_CF_SUFREED; } err = nilfs_segctor_extend_segments(sci, nilfs, nadd); if (unlikely(err)) return err; nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA); sci->sc_stage = prev_stage; } nilfs_segctor_zeropad_segsum(sci); nilfs_segctor_truncate_segments(sci, sci->sc_curseg, nilfs->ns_sufile); return 0; failed: return err; } static void nilfs_list_replace_buffer(struct buffer_head *old_bh, struct buffer_head *new_bh) { BUG_ON(!list_empty(&new_bh->b_assoc_buffers)); list_replace_init(&old_bh->b_assoc_buffers, &new_bh->b_assoc_buffers); /* The caller must release old_bh */ } static int nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci, struct nilfs_segment_buffer *segbuf, int mode) { struct inode *inode = NULL; sector_t blocknr; unsigned long nfinfo = segbuf->sb_sum.nfinfo; unsigned long nblocks = 0, ndatablk = 0; const struct nilfs_sc_operations *sc_op = NULL; struct nilfs_segsum_pointer ssp; struct nilfs_finfo *finfo = NULL; union nilfs_binfo binfo; struct buffer_head *bh, *bh_org; ino_t ino = 0; int err = 0; if (!nfinfo) goto out; blocknr = segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk; ssp.bh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers); ssp.offset = sizeof(struct nilfs_segment_summary); list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { if (bh == segbuf->sb_super_root) break; if (!finfo) { finfo = nilfs_segctor_map_segsum_entry( sci, &ssp, sizeof(*finfo)); ino = le64_to_cpu(finfo->fi_ino); nblocks = le32_to_cpu(finfo->fi_nblocks); ndatablk = le32_to_cpu(finfo->fi_ndatablk); inode = bh->b_folio->mapping->host; if (mode == SC_LSEG_DSYNC) sc_op = &nilfs_sc_dsync_ops; else if (ino == NILFS_DAT_INO) sc_op = &nilfs_sc_dat_ops; else /* file blocks */ sc_op = &nilfs_sc_file_ops; } bh_org = bh; get_bh(bh_org); err = nilfs_bmap_assign(NILFS_I(inode)->i_bmap, &bh, blocknr, &binfo); if (bh != bh_org) nilfs_list_replace_buffer(bh_org, bh); brelse(bh_org); if (unlikely(err)) goto failed_bmap; if (ndatablk > 0) sc_op->write_data_binfo(sci, &ssp, &binfo); else sc_op->write_node_binfo(sci, &ssp, &binfo); blocknr++; if (--nblocks == 0) { finfo = NULL; if (--nfinfo == 0) break; } else if (ndatablk > 0) ndatablk--; } out: return 0; failed_bmap: return err; } static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode) { struct nilfs_segment_buffer *segbuf; int err; list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { err = nilfs_segctor_update_payload_blocknr(sci, segbuf, mode); if (unlikely(err)) return err; nilfs_segbuf_fill_in_segsum(segbuf); } return 0; } static void nilfs_begin_folio_io(struct folio *folio) { if (!folio || folio_test_writeback(folio)) /* * For split b-tree node pages, this function may be called * twice. We ignore the 2nd or later calls by this check. 
*/ return; folio_lock(folio); folio_clear_dirty_for_io(folio); folio_start_writeback(folio); folio_unlock(folio); } /** * nilfs_prepare_write_logs - prepare to write logs * @logs: logs to prepare for writing * @seed: checksum seed value * * nilfs_prepare_write_logs() adds checksums and prepares the block * buffers/folios for writing logs. In order to stabilize folios of * memory-mapped file blocks by putting them in writeback state before * calculating the checksums, first prepare to write payload blocks other * than segment summary and super root blocks in which the checksums will * be embedded. */ static void nilfs_prepare_write_logs(struct list_head *logs, u32 seed) { struct nilfs_segment_buffer *segbuf; struct folio *bd_folio = NULL, *fs_folio = NULL; struct buffer_head *bh; /* Prepare to write payload blocks */ list_for_each_entry(segbuf, logs, sb_list) { list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { if (bh == segbuf->sb_super_root) break; set_buffer_async_write(bh); if (bh->b_folio != fs_folio) { nilfs_begin_folio_io(fs_folio); fs_folio = bh->b_folio; } } } nilfs_begin_folio_io(fs_folio); nilfs_add_checksums_on_logs(logs, seed); /* Prepare to write segment summary blocks */ list_for_each_entry(segbuf, logs, sb_list) { list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { mark_buffer_dirty(bh); if (bh->b_folio == bd_folio) continue; if (bd_folio) { folio_lock(bd_folio); folio_wait_writeback(bd_folio); folio_clear_dirty_for_io(bd_folio); folio_start_writeback(bd_folio); folio_unlock(bd_folio); } bd_folio = bh->b_folio; } } /* Prepare to write super root block */ bh = NILFS_LAST_SEGBUF(logs)->sb_super_root; if (bh) { mark_buffer_dirty(bh); if (bh->b_folio != bd_folio) { folio_lock(bd_folio); folio_wait_writeback(bd_folio); folio_clear_dirty_for_io(bd_folio); folio_start_writeback(bd_folio); folio_unlock(bd_folio); bd_folio = bh->b_folio; } } if (bd_folio) { folio_lock(bd_folio); folio_wait_writeback(bd_folio); folio_clear_dirty_for_io(bd_folio); folio_start_writeback(bd_folio); folio_unlock(bd_folio); } } static int nilfs_segctor_write(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { int ret; ret = nilfs_write_logs(&sci->sc_segbufs, nilfs); list_splice_tail_init(&sci->sc_segbufs, &sci->sc_write_logs); return ret; } static void nilfs_end_folio_io(struct folio *folio, int err) { if (!folio) return; if (buffer_nilfs_node(folio_buffers(folio)) && !folio_test_writeback(folio)) { /* * For b-tree node pages, this function may be called twice * or more because they might be split in a segment. */ if (folio_test_dirty(folio)) { /* * For pages holding split b-tree node buffers, dirty * flag on the buffers may be cleared discretely. * In that case, the page is once redirtied for * remaining buffers, and it must be cancelled if * all the buffers get cleaned later. 
*/ folio_lock(folio); if (nilfs_folio_buffers_clean(folio)) __nilfs_clear_folio_dirty(folio); folio_unlock(folio); } return; } if (err || !nilfs_folio_buffers_clean(folio)) filemap_dirty_folio(folio->mapping, folio); folio_end_writeback(folio); } static void nilfs_abort_logs(struct list_head *logs, int err) { struct nilfs_segment_buffer *segbuf; struct folio *bd_folio = NULL, *fs_folio = NULL; struct buffer_head *bh; if (list_empty(logs)) return; list_for_each_entry(segbuf, logs, sb_list) { list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { clear_buffer_uptodate(bh); if (bh->b_folio != bd_folio) { if (bd_folio) folio_end_writeback(bd_folio); bd_folio = bh->b_folio; } } list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { if (bh == segbuf->sb_super_root) { clear_buffer_uptodate(bh); if (bh->b_folio != bd_folio) { folio_end_writeback(bd_folio); bd_folio = bh->b_folio; } break; } clear_buffer_async_write(bh); if (bh->b_folio != fs_folio) { nilfs_end_folio_io(fs_folio, err); fs_folio = bh->b_folio; } } } if (bd_folio) folio_end_writeback(bd_folio); nilfs_end_folio_io(fs_folio, err); } static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci, struct the_nilfs *nilfs, int err) { LIST_HEAD(logs); int ret; list_splice_tail_init(&sci->sc_write_logs, &logs); ret = nilfs_wait_on_logs(&logs); nilfs_abort_logs(&logs, ret ? : err); list_splice_tail_init(&sci->sc_segbufs, &logs); if (list_empty(&logs)) return; /* if the first segment buffer preparation failed */ nilfs_cancel_segusage(&logs, nilfs->ns_sufile); nilfs_free_incomplete_logs(&logs, nilfs); if (sci->sc_stage.flags & NILFS_CF_SUFREED) { ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile, sci->sc_freesegs, sci->sc_nfreesegs, NULL); WARN_ON(ret); /* do not happen */ } nilfs_destroy_logs(&logs); } static void nilfs_set_next_segment(struct the_nilfs *nilfs, struct nilfs_segment_buffer *segbuf) { nilfs->ns_segnum = segbuf->sb_segnum; nilfs->ns_nextnum = segbuf->sb_nextnum; nilfs->ns_pseg_offset = segbuf->sb_pseg_start - segbuf->sb_fseg_start + segbuf->sb_sum.nblocks; nilfs->ns_seg_seq = segbuf->sb_sum.seg_seq; nilfs->ns_ctime = segbuf->sb_sum.ctime; } static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) { struct nilfs_segment_buffer *segbuf; struct folio *bd_folio = NULL, *fs_folio = NULL; struct the_nilfs *nilfs = sci->sc_super->s_fs_info; int update_sr = false; list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) { struct buffer_head *bh; list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { set_buffer_uptodate(bh); clear_buffer_dirty(bh); if (bh->b_folio != bd_folio) { if (bd_folio) folio_end_writeback(bd_folio); bd_folio = bh->b_folio; } } /* * We assume that the buffers which belong to the same folio * continue over the buffer list. * Under this assumption, the last BHs of folios is * identifiable by the discontinuity of bh->b_folio * (folio != fs_folio). * * For B-tree node blocks, however, this assumption is not * guaranteed. The cleanup code of B-tree node folios needs * special care. 
*/ list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { const unsigned long set_bits = BIT(BH_Uptodate); const unsigned long clear_bits = (BIT(BH_Dirty) | BIT(BH_Async_Write) | BIT(BH_Delay) | BIT(BH_NILFS_Volatile) | BIT(BH_NILFS_Redirected)); if (bh == segbuf->sb_super_root) { set_buffer_uptodate(bh); clear_buffer_dirty(bh); if (bh->b_folio != bd_folio) { folio_end_writeback(bd_folio); bd_folio = bh->b_folio; } update_sr = true; break; } set_mask_bits(&bh->b_state, clear_bits, set_bits); if (bh->b_folio != fs_folio) { nilfs_end_folio_io(fs_folio, 0); fs_folio = bh->b_folio; } } if (!nilfs_segbuf_simplex(segbuf)) { if (segbuf->sb_sum.flags & NILFS_SS_LOGBGN) { set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags); sci->sc_lseg_stime = jiffies; } if (segbuf->sb_sum.flags & NILFS_SS_LOGEND) clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags); } } /* * Since folios may continue over multiple segment buffers, * end of the last folio must be checked outside of the loop. */ if (bd_folio) folio_end_writeback(bd_folio); nilfs_end_folio_io(fs_folio, 0); nilfs_drop_collected_inodes(&sci->sc_dirty_files); if (nilfs_doing_gc()) nilfs_drop_collected_inodes(&sci->sc_gc_inodes); else nilfs->ns_nongc_ctime = sci->sc_seg_ctime; sci->sc_nblk_inc += sci->sc_nblk_this_inc; segbuf = NILFS_LAST_SEGBUF(&sci->sc_write_logs); nilfs_set_next_segment(nilfs, segbuf); if (update_sr) { nilfs->ns_flushed_device = 0; nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start, segbuf->sb_sum.seg_seq, nilfs->ns_cno++); clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); clear_bit(NILFS_SC_DIRTY, &sci->sc_flags); set_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags); nilfs_segctor_clear_metadata_dirty(sci); } else clear_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags); } static int nilfs_segctor_wait(struct nilfs_sc_info *sci) { int ret; ret = nilfs_wait_on_logs(&sci->sc_write_logs); if (!ret) { nilfs_segctor_complete_write(sci); nilfs_destroy_logs(&sci->sc_write_logs); } return ret; } static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { struct nilfs_inode_info *ii, *n; struct inode *ifile = sci->sc_root->ifile; spin_lock(&nilfs->ns_inode_lock); retry: list_for_each_entry_safe(ii, n, &nilfs->ns_dirty_files, i_dirty) { if (!ii->i_bh) { struct buffer_head *ibh; int err; spin_unlock(&nilfs->ns_inode_lock); err = nilfs_ifile_get_inode_block( ifile, ii->vfs_inode.i_ino, &ibh); if (unlikely(err)) { nilfs_warn(sci->sc_super, "log writer: error %d getting inode block (ino=%llu)", err, ii->vfs_inode.i_ino); return err; } spin_lock(&nilfs->ns_inode_lock); if (likely(!ii->i_bh)) ii->i_bh = ibh; else brelse(ibh); goto retry; } // Always redirty the buffer to avoid race condition mark_buffer_dirty(ii->i_bh); nilfs_mdt_mark_dirty(ifile); clear_bit(NILFS_I_QUEUED, &ii->i_state); set_bit(NILFS_I_BUSY, &ii->i_state); list_move_tail(&ii->i_dirty, &sci->sc_dirty_files); } spin_unlock(&nilfs->ns_inode_lock); return 0; } static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { struct nilfs_inode_info *ii, *n; int during_mount = !(sci->sc_super->s_flags & SB_ACTIVE); int defer_iput = false; spin_lock(&nilfs->ns_inode_lock); list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) { if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) || test_bit(NILFS_I_DIRTY, &ii->i_state)) continue; clear_bit(NILFS_I_BUSY, &ii->i_state); brelse(ii->i_bh); ii->i_bh = NULL; list_del_init(&ii->i_dirty); if (!ii->vfs_inode.i_nlink || during_mount) { /* * Defer calling iput() to avoid deadlocks 
if * i_nlink == 0 or mount is not yet finished. */ list_add_tail(&ii->i_dirty, &sci->sc_iput_queue); defer_iput = true; } else { spin_unlock(&nilfs->ns_inode_lock); iput(&ii->vfs_inode); spin_lock(&nilfs->ns_inode_lock); } } spin_unlock(&nilfs->ns_inode_lock); if (defer_iput) schedule_work(&sci->sc_iput_work); } /* * Main procedure of segment constructor */ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) { struct the_nilfs *nilfs = sci->sc_super->s_fs_info; int err; if (sb_rdonly(sci->sc_super)) return -EROFS; nilfs_sc_cstage_set(sci, NILFS_ST_INIT); sci->sc_cno = nilfs->ns_cno; err = nilfs_segctor_collect_dirty_files(sci, nilfs); if (unlikely(err)) goto out; if (nilfs_test_metadata_dirty(nilfs, sci->sc_root)) set_bit(NILFS_SC_DIRTY, &sci->sc_flags); if (nilfs_segctor_clean(sci)) goto out; do { sci->sc_stage.flags &= ~NILFS_CF_HISTORY_MASK; err = nilfs_segctor_begin_construction(sci, nilfs); if (unlikely(err)) goto failed; /* Update time stamp */ sci->sc_seg_ctime = ktime_get_real_seconds(); err = nilfs_segctor_collect(sci, nilfs, mode); if (unlikely(err)) goto failed; /* Avoid empty segment */ if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE && nilfs_segbuf_empty(sci->sc_curseg)) { nilfs_segctor_abort_construction(sci, nilfs, 1); goto out; } err = nilfs_segctor_assign(sci, mode); if (unlikely(err)) goto failed; if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) nilfs_segctor_fill_in_file_bmap(sci); if (mode == SC_LSEG_SR && nilfs_sc_cstage_get(sci) >= NILFS_ST_CPFILE) { err = nilfs_cpfile_finalize_checkpoint( nilfs->ns_cpfile, nilfs->ns_cno, sci->sc_root, sci->sc_nblk_inc + sci->sc_nblk_this_inc, sci->sc_seg_ctime, !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)); if (unlikely(err)) goto failed_to_write; nilfs_segctor_fill_in_super_root(sci, nilfs); } nilfs_segctor_update_segusage(sci, nilfs->ns_sufile); /* Write partial segments */ nilfs_prepare_write_logs(&sci->sc_segbufs, nilfs->ns_crc_seed); err = nilfs_segctor_write(sci, nilfs); if (unlikely(err)) goto failed_to_write; if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE || nilfs->ns_blocksize_bits != PAGE_SHIFT) { /* * At this point, we avoid double buffering * for blocksize < pagesize because page dirty * flag is turned off during write and dirty * buffers are not properly collected for * pages crossing over segments. */ err = nilfs_segctor_wait(sci); if (err) goto failed_to_write; } } while (nilfs_sc_cstage_get(sci) != NILFS_ST_DONE); out: nilfs_segctor_drop_written_files(sci, nilfs); return err; failed_to_write: failed: if (mode == SC_LSEG_SR && nilfs_sc_cstage_get(sci) >= NILFS_ST_IFILE) nilfs_redirty_inodes(&sci->sc_dirty_files); if (nilfs_doing_gc()) nilfs_redirty_inodes(&sci->sc_gc_inodes); nilfs_segctor_abort_construction(sci, nilfs, err); goto out; } /** * nilfs_segctor_start_timer - set timer of background write * @sci: nilfs_sc_info * * If the timer has already been set, it ignores the new request. * This function MUST be called within a section locking the segment * semaphore. 
*/ static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci) { spin_lock(&sci->sc_state_lock); if (!(sci->sc_state & NILFS_SEGCTOR_COMMIT)) { if (sci->sc_task) { sci->sc_timer.expires = jiffies + sci->sc_interval; add_timer(&sci->sc_timer); } sci->sc_state |= NILFS_SEGCTOR_COMMIT; } spin_unlock(&sci->sc_state_lock); } static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn) { spin_lock(&sci->sc_state_lock); if (!(sci->sc_flush_request & BIT(bn))) { unsigned long prev_req = sci->sc_flush_request; sci->sc_flush_request |= BIT(bn); if (!prev_req) wake_up(&sci->sc_wait_daemon); } spin_unlock(&sci->sc_state_lock); } struct nilfs_segctor_wait_request { wait_queue_entry_t wq; __u32 seq; int err; atomic_t done; }; static int nilfs_segctor_sync(struct nilfs_sc_info *sci) { struct nilfs_segctor_wait_request wait_req; int err = 0; init_wait(&wait_req.wq); wait_req.err = 0; atomic_set(&wait_req.done, 0); init_waitqueue_entry(&wait_req.wq, current); /* * To prevent a race issue where completion notifications from the * log writer thread are missed, increment the request sequence count * "sc_seq_request" and insert a wait queue entry using the current * sequence number into the "sc_wait_request" queue at the same time * within the lock section of "sc_state_lock". */ spin_lock(&sci->sc_state_lock); wait_req.seq = ++sci->sc_seq_request; add_wait_queue(&sci->sc_wait_request, &wait_req.wq); spin_unlock(&sci->sc_state_lock); wake_up(&sci->sc_wait_daemon); for (;;) { set_current_state(TASK_INTERRUPTIBLE); /* * Synchronize only while the log writer thread is alive. * Leave flushing out after the log writer thread exits to * the cleanup work in nilfs_segctor_destroy(). */ if (!sci->sc_task) break; if (atomic_read(&wait_req.done)) { err = wait_req.err; break; } if (!signal_pending(current)) { schedule(); continue; } err = -ERESTARTSYS; break; } finish_wait(&sci->sc_wait_request, &wait_req.wq); return err; } static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err, bool force) { struct nilfs_segctor_wait_request *wrq, *n; unsigned long flags; spin_lock_irqsave(&sci->sc_wait_request.lock, flags); list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.head, wq.entry) { if (!atomic_read(&wrq->done) && (force || nilfs_cnt32_ge(sci->sc_seq_done, wrq->seq))) { wrq->err = err; atomic_set(&wrq->done, 1); } if (atomic_read(&wrq->done)) { wrq->wq.func(&wrq->wq, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0, NULL); } } spin_unlock_irqrestore(&sci->sc_wait_request.lock, flags); } /** * nilfs_construct_segment - construct a logical segment * @sb: super block * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. * * %-ENOSPC - No space left on device (only in a panic state). * * %-ERESTARTSYS - Interrupted. * * %-EROFS - Read only filesystem. */ int nilfs_construct_segment(struct super_block *sb) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_sc_info *sci = nilfs->ns_writer; struct nilfs_transaction_info *ti; if (sb_rdonly(sb) || unlikely(!sci)) return -EROFS; /* A call inside transactions causes a deadlock. 
*/ BUG_ON((ti = current->journal_info) && ti->ti_magic == NILFS_TI_MAGIC); return nilfs_segctor_sync(sci); } /** * nilfs_construct_dsync_segment - construct a data-only logical segment * @sb: super block * @inode: inode whose data blocks should be written out * @start: start byte offset * @end: end byte offset (inclusive) * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. * * %-ENOSPC - No space left on device (only in a panic state). * * %-ERESTARTSYS - Interrupted. * * %-EROFS - Read only filesystem. */ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode, loff_t start, loff_t end) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_sc_info *sci = nilfs->ns_writer; struct nilfs_inode_info *ii; struct nilfs_transaction_info ti; int err = 0; if (sb_rdonly(sb) || unlikely(!sci)) return -EROFS; nilfs_transaction_lock(sb, &ti, 0); ii = NILFS_I(inode); if (test_bit(NILFS_I_INODE_SYNC, &ii->i_state) || nilfs_test_opt(nilfs, STRICT_ORDER) || test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) || nilfs_discontinued(nilfs)) { nilfs_transaction_unlock(sb); err = nilfs_segctor_sync(sci); return err; } spin_lock(&nilfs->ns_inode_lock); if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && !test_bit(NILFS_I_BUSY, &ii->i_state)) { spin_unlock(&nilfs->ns_inode_lock); nilfs_transaction_unlock(sb); return 0; } spin_unlock(&nilfs->ns_inode_lock); sci->sc_dsync_inode = ii; sci->sc_dsync_start = start; sci->sc_dsync_end = end; err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC); if (!err) nilfs->ns_flushed_device = 0; nilfs_transaction_unlock(sb); return err; } #define FLUSH_FILE_BIT (0x1) /* data file only */ #define FLUSH_DAT_BIT BIT(NILFS_DAT_INO) /* DAT only */ /** * nilfs_segctor_accept - record accepted sequence count of log-write requests * @sci: segment constructor object */ static void nilfs_segctor_accept(struct nilfs_sc_info *sci) { bool thread_is_alive; spin_lock(&sci->sc_state_lock); sci->sc_seq_accepted = sci->sc_seq_request; thread_is_alive = (bool)sci->sc_task; spin_unlock(&sci->sc_state_lock); /* * This function does not race with the log writer thread's * termination. Therefore, deleting sc_timer, which should not be * done after the log writer thread exits, can be done safely outside * the area protected by sc_state_lock. 
*/ if (thread_is_alive) timer_delete_sync(&sci->sc_timer); } /** * nilfs_segctor_notify - notify the result of request to caller threads * @sci: segment constructor object * @mode: mode of log forming * @err: error code to be notified */ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err) { /* Clear requests (even when the construction failed) */ spin_lock(&sci->sc_state_lock); if (mode == SC_LSEG_SR) { sci->sc_state &= ~NILFS_SEGCTOR_COMMIT; sci->sc_seq_done = sci->sc_seq_accepted; nilfs_segctor_wakeup(sci, err, false); sci->sc_flush_request = 0; } else { if (mode == SC_FLUSH_FILE) sci->sc_flush_request &= ~FLUSH_FILE_BIT; else if (mode == SC_FLUSH_DAT) sci->sc_flush_request &= ~FLUSH_DAT_BIT; /* re-enable timer if checkpoint creation was not done */ if ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && sci->sc_task && time_before(jiffies, sci->sc_timer.expires)) add_timer(&sci->sc_timer); } spin_unlock(&sci->sc_state_lock); } /** * nilfs_segctor_construct - form logs and write them to disk * @sci: segment constructor object * @mode: mode of log forming * * Return: 0 on success, or a negative error code on failure. */ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode) { struct the_nilfs *nilfs = sci->sc_super->s_fs_info; struct nilfs_super_block **sbp; int err = 0; nilfs_segctor_accept(sci); if (nilfs_discontinued(nilfs)) mode = SC_LSEG_SR; if (!nilfs_segctor_confirm(sci)) err = nilfs_segctor_do_construct(sci, mode); if (likely(!err)) { if (mode != SC_FLUSH_DAT) atomic_set(&nilfs->ns_ndirtyblks, 0); if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && nilfs_discontinued(nilfs)) { down_write(&nilfs->ns_sem); err = -EIO; sbp = nilfs_prepare_super(sci->sc_super, nilfs_sb_will_flip(nilfs)); if (likely(sbp)) { nilfs_set_log_cursor(sbp[0], nilfs); err = nilfs_commit_super(sci->sc_super, NILFS_SB_COMMIT); } up_write(&nilfs->ns_sem); } } nilfs_segctor_notify(sci, mode, err); return err; } static void nilfs_construction_timeout(struct timer_list *t) { struct nilfs_sc_info *sci = timer_container_of(sci, t, sc_timer); wake_up_process(sci->sc_task); } static void nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head) { struct nilfs_inode_info *ii, *n; list_for_each_entry_safe(ii, n, head, i_dirty) { if (!test_bit(NILFS_I_UPDATED, &ii->i_state)) continue; list_del_init(&ii->i_dirty); truncate_inode_pages(&ii->vfs_inode.i_data, 0); nilfs_btnode_cache_clear(ii->i_assoc_inode->i_mapping); iput(&ii->vfs_inode); } } int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv, void **kbufs) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_sc_info *sci = nilfs->ns_writer; struct nilfs_transaction_info ti; int err; if (unlikely(!sci)) return -EROFS; nilfs_transaction_lock(sb, &ti, 1); err = nilfs_mdt_save_to_shadow_map(nilfs->ns_dat); if (unlikely(err)) goto out_unlock; err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs); if (unlikely(err)) { nilfs_mdt_restore_from_shadow_map(nilfs->ns_dat); goto out_unlock; } sci->sc_freesegs = kbufs[4]; sci->sc_nfreesegs = argv[4].v_nmembs; list_splice_tail_init(&nilfs->ns_gc_inodes, &sci->sc_gc_inodes); for (;;) { err = nilfs_segctor_construct(sci, SC_LSEG_SR); nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes); if (likely(!err)) break; nilfs_warn(sb, "error %d cleaning segments", err); set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(sci->sc_interval); } if (nilfs_test_opt(nilfs, DISCARD)) { int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs, sci->sc_nfreesegs); 
if (ret) { nilfs_warn(sb, "error %d on discard request, turning discards off for the device", ret); nilfs_clear_opt(nilfs, DISCARD); } } out_unlock: sci->sc_freesegs = NULL; sci->sc_nfreesegs = 0; nilfs_mdt_clear_shadow_map(nilfs->ns_dat); nilfs_transaction_unlock(sb); return err; } static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode) { struct nilfs_transaction_info ti; nilfs_transaction_lock(sci->sc_super, &ti, 0); nilfs_segctor_construct(sci, mode); /* * Unclosed segment should be retried. We do this using sc_timer. * Timeout of sc_timer will invoke complete construction which leads * to close the current logical segment. */ if (test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) nilfs_segctor_start_timer(sci); nilfs_transaction_unlock(sci->sc_super); } static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci) { int mode = 0; spin_lock(&sci->sc_state_lock); mode = (sci->sc_flush_request & FLUSH_DAT_BIT) ? SC_FLUSH_DAT : SC_FLUSH_FILE; spin_unlock(&sci->sc_state_lock); if (mode) { nilfs_segctor_do_construct(sci, mode); spin_lock(&sci->sc_state_lock); sci->sc_flush_request &= (mode == SC_FLUSH_FILE) ? ~FLUSH_FILE_BIT : ~FLUSH_DAT_BIT; spin_unlock(&sci->sc_state_lock); } clear_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags); } static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci) { if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) || time_before(jiffies, sci->sc_lseg_stime + sci->sc_mjcp_freq)) { if (!(sci->sc_flush_request & ~FLUSH_FILE_BIT)) return SC_FLUSH_FILE; else if (!(sci->sc_flush_request & ~FLUSH_DAT_BIT)) return SC_FLUSH_DAT; } return SC_LSEG_SR; } /** * nilfs_log_write_required - determine whether log writing is required * @sci: nilfs_sc_info struct * @modep: location for storing log writing mode * * Return: true if log writing is required, false otherwise. If log writing * is required, the mode is stored in the location pointed to by @modep. */ static bool nilfs_log_write_required(struct nilfs_sc_info *sci, int *modep) { bool timedout, ret = true; spin_lock(&sci->sc_state_lock); timedout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && time_after_eq(jiffies, sci->sc_timer.expires)); if (timedout || sci->sc_seq_request != sci->sc_seq_done) *modep = SC_LSEG_SR; else if (sci->sc_flush_request) *modep = nilfs_segctor_flush_mode(sci); else ret = false; spin_unlock(&sci->sc_state_lock); return ret; } /** * nilfs_segctor_thread - main loop of the log writer thread * @arg: pointer to a struct nilfs_sc_info. * * nilfs_segctor_thread() is the main loop function of the log writer kernel * thread, which determines whether log writing is necessary, and if so, * performs the log write in the background, or waits if not. It is also * used to decide the background writeback of the superblock. * * Return: Always 0. */ static int nilfs_segctor_thread(void *arg) { struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; struct the_nilfs *nilfs = sci->sc_super->s_fs_info; nilfs_info(sci->sc_super, "segctord starting. 
Construction interval = %lu seconds, CP frequency < %lu seconds", sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ); set_freezable(); while (!kthread_should_stop()) { DEFINE_WAIT(wait); bool should_write; int mode; if (freezing(current)) { try_to_freeze(); continue; } prepare_to_wait(&sci->sc_wait_daemon, &wait, TASK_INTERRUPTIBLE); should_write = nilfs_log_write_required(sci, &mode); if (!should_write) schedule(); finish_wait(&sci->sc_wait_daemon, &wait); if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs)) set_nilfs_discontinued(nilfs); if (should_write) nilfs_segctor_thread_construct(sci, mode); } /* end sync. */ spin_lock(&sci->sc_state_lock); sci->sc_task = NULL; timer_shutdown_sync(&sci->sc_timer); spin_unlock(&sci->sc_state_lock); return 0; } /* * Setup & clean-up functions */ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb, struct nilfs_root *root) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_sc_info *sci; sci = kzalloc_obj(*sci); if (!sci) return NULL; sci->sc_super = sb; nilfs_get_root(root); sci->sc_root = root; init_waitqueue_head(&sci->sc_wait_request); init_waitqueue_head(&sci->sc_wait_daemon); spin_lock_init(&sci->sc_state_lock); INIT_LIST_HEAD(&sci->sc_dirty_files); INIT_LIST_HEAD(&sci->sc_segbufs); INIT_LIST_HEAD(&sci->sc_write_logs); INIT_LIST_HEAD(&sci->sc_gc_inodes); INIT_LIST_HEAD(&sci->sc_iput_queue); INIT_WORK(&sci->sc_iput_work, nilfs_iput_work_func); sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ; sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK; if (nilfs->ns_interval) sci->sc_interval = HZ * nilfs->ns_interval; if (nilfs->ns_watermark) sci->sc_watermark = nilfs->ns_watermark; return sci; } static void nilfs_segctor_write_out(struct nilfs_sc_info *sci) { int ret, retrycount = NILFS_SC_CLEANUP_RETRY; /* * The segctord thread was stopped and its timer was removed. * But some tasks remain. */ do { struct nilfs_transaction_info ti; nilfs_transaction_lock(sci->sc_super, &ti, 0); ret = nilfs_segctor_construct(sci, SC_LSEG_SR); nilfs_transaction_unlock(sci->sc_super); flush_work(&sci->sc_iput_work); } while (ret && ret != -EROFS && retrycount-- > 0); } /** * nilfs_segctor_destroy - destroy the segment constructor. * @sci: nilfs_sc_info * * nilfs_segctor_destroy() kills the segctord thread and frees * the nilfs_sc_info struct. * Caller must hold the segment semaphore. */ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) { struct the_nilfs *nilfs = sci->sc_super->s_fs_info; int flag; up_write(&nilfs->ns_segctor_sem); if (sci->sc_task) { wake_up(&sci->sc_wait_daemon); if (kthread_stop(sci->sc_task)) { spin_lock(&sci->sc_state_lock); sci->sc_task = NULL; timer_shutdown_sync(&sci->sc_timer); spin_unlock(&sci->sc_state_lock); } } spin_lock(&sci->sc_state_lock); flag = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) || sci->sc_flush_request || sci->sc_seq_request != sci->sc_seq_done); spin_unlock(&sci->sc_state_lock); /* * Forcibly wake up tasks waiting in nilfs_segctor_sync(), which can * be called from delayed iput() via nilfs_evict_inode() and can race * with the above log writer thread termination. 
	 */
	nilfs_segctor_wakeup(sci, 0, true);

	if (flush_work(&sci->sc_iput_work))
		flag = true;

	if (flag || !nilfs_segctor_confirm(sci))
		nilfs_segctor_write_out(sci);

	if (!list_empty(&sci->sc_dirty_files)) {
		nilfs_warn(sci->sc_super,
			   "disposed unprocessed dirty file(s) when stopping log writer");
		nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1);
	}

	if (!list_empty(&sci->sc_iput_queue)) {
		nilfs_warn(sci->sc_super,
			   "disposed unprocessed inode(s) in iput queue when stopping log writer");
		nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1);
	}

	WARN_ON(!list_empty(&sci->sc_segbufs));
	WARN_ON(!list_empty(&sci->sc_write_logs));

	nilfs_put_root(sci->sc_root);

	down_write(&nilfs->ns_segctor_sem);

	kfree(sci);
}

/**
 * nilfs_attach_log_writer - attach log writer
 * @sb: super block instance
 * @root: root object of the current filesystem tree
 *
 * This allocates a log writer object, initializes it, and starts the
 * log writer.
 *
 * Return: 0 on success, or one of the following negative error codes on
 * failure:
 * * %-EINTR	- Log writer thread creation failed due to interruption.
 * * %-ENOMEM	- Insufficient memory available.
 */
int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root)
{
	struct the_nilfs *nilfs = sb->s_fs_info;
	struct nilfs_sc_info *sci;
	struct task_struct *t;
	int err;

	if (nilfs->ns_writer) {
		/*
		 * This happens if the filesystem is made read-only by
		 * __nilfs_error or nilfs_remount and then remounted
		 * read/write.  In these cases, reuse the existing writer.
		 */
		return 0;
	}

	sci = nilfs_segctor_new(sb, root);
	if (unlikely(!sci))
		return -ENOMEM;

	nilfs->ns_writer = sci;
	t = kthread_create(nilfs_segctor_thread, sci, "segctord");
	if (IS_ERR(t)) {
		err = PTR_ERR(t);
		nilfs_err(sb, "error %d creating segctord thread", err);
		nilfs_detach_log_writer(sb);
		return err;
	}
	sci->sc_task = t;
	timer_setup(&sci->sc_timer, nilfs_construction_timeout, 0);

	wake_up_process(sci->sc_task);
	return 0;
}

/**
 * nilfs_detach_log_writer - destroy log writer
 * @sb: super block instance
 *
 * This kills log writer daemon, frees the log writer object, and
 * destroys list of dirty files.
 */
void nilfs_detach_log_writer(struct super_block *sb)
{
	struct the_nilfs *nilfs = sb->s_fs_info;
	LIST_HEAD(garbage_list);

	down_write(&nilfs->ns_segctor_sem);
	if (nilfs->ns_writer) {
		nilfs_segctor_destroy(nilfs->ns_writer);
		nilfs->ns_writer = NULL;
	}
	set_nilfs_purging(nilfs);

	/* Force to free the list of dirty files */
	spin_lock(&nilfs->ns_inode_lock);
	if (!list_empty(&nilfs->ns_dirty_files)) {
		list_splice_init(&nilfs->ns_dirty_files, &garbage_list);
		nilfs_warn(sb,
			   "disposed unprocessed dirty file(s) when detaching log writer");
	}
	spin_unlock(&nilfs->ns_inode_lock);
	up_write(&nilfs->ns_segctor_sem);

	nilfs_dispose_list(nilfs, &garbage_list, 1);
	clear_nilfs_purging(nilfs);
}
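/*
 * Usage sketch (illustrative only, not code from the original file): the
 * entry points above -- nilfs_construct_segment(), nilfs_construct_dsync_segment(),
 * nilfs_clean_segments(), nilfs_attach_log_writer() and nilfs_detach_log_writer()
 * -- are driven from the mount, sync/fsync and garbage-collection paths
 * elsewhere in nilfs2.  Roughly, an fsync-style caller chooses between a
 * data-only (dsync) log and a full logical segment as below.  The name
 * example_fsync() is hypothetical, and the nilfs_inode_dirty() check is
 * assumed to match the real caller in fs/nilfs2/file.c, which may differ
 * in detail:
 *
 *	static int example_fsync(struct file *file, loff_t start, loff_t end,
 *				 int datasync)
 *	{
 *		struct inode *inode = file->f_mapping->host;
 *		int err = 0;
 *
 *		if (nilfs_inode_dirty(inode)) {
 *			if (datasync)
 *				// data blocks only: SC_LSEG_DSYNC construction
 *				err = nilfs_construct_dsync_segment(inode->i_sb,
 *								    inode,
 *								    start, end);
 *			else
 *				// full segment, including checkpoint/super root
 *				err = nilfs_construct_segment(inode->i_sb);
 *		}
 *		return err;
 *	}
 */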
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
 * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * super.c
 */

/*
 * This file implements code to read the superblock, read and initialise
 * in-memory structures at mount time, and all the VFS glue code to register
 * the filesystem.
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/blkdev.h> #include <linux/fs.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/vfs.h> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/seq_file.h> #include <linux/pagemap.h> #include <linux/init.h> #include <linux/module.h> #include <linux/magic.h> #include <linux/xattr.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" #include "decompressor.h" #include "xattr.h" static struct file_system_type squashfs_fs_type; static const struct super_operations squashfs_super_ops; enum Opt_errors { Opt_errors_continue, Opt_errors_panic, }; enum squashfs_param { Opt_errors, Opt_threads, }; struct squashfs_mount_opts { enum Opt_errors errors; const struct squashfs_decompressor_thread_ops *thread_ops; int thread_num; }; static const struct constant_table squashfs_param_errors[] = { {"continue", Opt_errors_continue }, {"panic", Opt_errors_panic }, {} }; static const struct fs_parameter_spec squashfs_fs_parameters[] = { fsparam_enum("errors", Opt_errors, squashfs_param_errors), fsparam_string("threads", Opt_threads), {} }; static int squashfs_parse_param_threads_str(const char *str, struct squashfs_mount_opts *opts) { #ifdef CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT if (strcmp(str, "single") == 0) { opts->thread_ops = &squashfs_decompressor_single; return 0; } if (strcmp(str, "multi") == 0) { opts->thread_ops = &squashfs_decompressor_multi; return 0; } if (strcmp(str, "percpu") == 0) { opts->thread_ops = &squashfs_decompressor_percpu; return 0; } #endif return -EINVAL; } static int squashfs_parse_param_threads_num(const char *str, struct squashfs_mount_opts *opts) { #ifdef CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS int ret; unsigned long num; ret = kstrtoul(str, 0, &num); if (ret != 0) return -EINVAL; if (num > 1) { opts->thread_ops = &squashfs_decompressor_multi; if (num > opts->thread_ops->max_decompressors()) return -EINVAL; opts->thread_num = (int)num; return 0; } #ifdef CONFIG_SQUASHFS_DECOMP_SINGLE if (num == 1) { opts->thread_ops = &squashfs_decompressor_single; opts->thread_num = 1; return 0; } #endif #endif /* !CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS */ return -EINVAL; } static int squashfs_parse_param_threads(const char *str, struct squashfs_mount_opts *opts) { int ret = squashfs_parse_param_threads_str(str, opts); if (ret == 0) return ret; return squashfs_parse_param_threads_num(str, opts); } static int squashfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct squashfs_mount_opts *opts = fc->fs_private; struct fs_parse_result result; int opt; opt = fs_parse(fc, squashfs_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_errors: opts->errors = result.uint_32; break; case Opt_threads: if (squashfs_parse_param_threads(param->string, opts) != 0) return -EINVAL; break; default: return -EINVAL; } return 0; } static const struct squashfs_decompressor *supported_squashfs_filesystem( struct fs_context *fc, short major, short minor, short id) { const struct squashfs_decompressor *decompressor; if (major < SQUASHFS_MAJOR) { errorf(fc, "Major/Minor mismatch, older Squashfs %d.%d " "filesystems are unsupported", major, minor); return NULL; } else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) { errorf(fc, "Major/Minor mismatch, trying to mount newer " "%d.%d filesystem", major, minor); errorf(fc, "Please update your kernel"); return NULL; } decompressor = squashfs_lookup_decompressor(id); if 
(!decompressor->supported) { errorf(fc, "Filesystem uses \"%s\" compression. This is not supported", decompressor->name); return NULL; } return decompressor; } static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct squashfs_mount_opts *opts = fc->fs_private; struct squashfs_sb_info *msblk; struct squashfs_super_block *sblk = NULL; struct inode *root; long long root_inode; unsigned short flags; unsigned int fragments; u64 lookup_table_start, xattr_id_table_start, next_table; int err, devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE); TRACE("Entered squashfs_fill_superblock\n"); if (!devblksize) { errorf(fc, "squashfs: unable to set blocksize\n"); return -EINVAL; } sb->s_fs_info = kzalloc_obj(*msblk); if (sb->s_fs_info == NULL) { ERROR("Failed to allocate squashfs_sb_info\n"); return -ENOMEM; } msblk = sb->s_fs_info; msblk->thread_ops = opts->thread_ops; msblk->panic_on_errors = (opts->errors == Opt_errors_panic); msblk->devblksize = devblksize; msblk->devblksize_log2 = ffz(~msblk->devblksize); mutex_init(&msblk->meta_index_mutex); /* * msblk->bytes_used is checked in squashfs_read_table to ensure reads * are not beyond filesystem end. But as we're using * squashfs_read_table here to read the superblock (including the value * of bytes_used) we need to set it to an initial sensible dummy value */ msblk->bytes_used = sizeof(*sblk); sblk = squashfs_read_table(sb, SQUASHFS_START, sizeof(*sblk)); if (IS_ERR(sblk)) { errorf(fc, "unable to read squashfs_super_block"); err = PTR_ERR(sblk); sblk = NULL; goto failed_mount; } err = -EINVAL; /* Check it is a SQUASHFS superblock */ sb->s_magic = le32_to_cpu(sblk->s_magic); if (sb->s_magic != SQUASHFS_MAGIC) { if (!(fc->sb_flags & SB_SILENT)) errorf(fc, "Can't find a SQUASHFS superblock on %pg", sb->s_bdev); goto failed_mount; } if (opts->thread_num == 0) { msblk->max_thread_num = msblk->thread_ops->max_decompressors(); } else { msblk->max_thread_num = opts->thread_num; } /* Check the MAJOR & MINOR versions and lookup compression type */ msblk->decompressor = supported_squashfs_filesystem( fc, le16_to_cpu(sblk->s_major), le16_to_cpu(sblk->s_minor), le16_to_cpu(sblk->compression)); if (msblk->decompressor == NULL) goto failed_mount; /* Check the filesystem does not extend beyond the end of the block device */ msblk->bytes_used = le64_to_cpu(sblk->bytes_used); if (msblk->bytes_used < 0 || msblk->bytes_used > bdev_nr_bytes(sb->s_bdev)) goto failed_mount; /* Check block size for sanity */ msblk->block_size = le32_to_cpu(sblk->block_size); if (msblk->block_size > SQUASHFS_FILE_MAX_SIZE) goto insanity; /* * Check the system page size is not larger than the filesystem * block size (by default 128K). This is currently not supported. */ if (PAGE_SIZE > msblk->block_size) { errorf(fc, "Page size > filesystem block size (%d). 
This is " "currently not supported!", msblk->block_size); goto failed_mount; } /* Check block log for sanity */ msblk->block_log = le16_to_cpu(sblk->block_log); if (msblk->block_log > SQUASHFS_FILE_MAX_LOG) goto failed_mount; /* Check that block_size and block_log match */ if (msblk->block_size != (1 << msblk->block_log)) goto insanity; /* Check the root inode for sanity */ root_inode = le64_to_cpu(sblk->root_inode); if (SQUASHFS_INODE_OFFSET(root_inode) > SQUASHFS_METADATA_SIZE) goto insanity; msblk->inode_table = le64_to_cpu(sblk->inode_table_start); msblk->directory_table = le64_to_cpu(sblk->directory_table_start); msblk->inodes = le32_to_cpu(sblk->inodes); msblk->fragments = le32_to_cpu(sblk->fragments); msblk->ids = le16_to_cpu(sblk->no_ids); flags = le16_to_cpu(sblk->flags); TRACE("Found valid superblock on %pg\n", sb->s_bdev); TRACE("Inodes are %scompressed\n", SQUASHFS_UNCOMPRESSED_INODES(flags) ? "un" : ""); TRACE("Data is %scompressed\n", SQUASHFS_UNCOMPRESSED_DATA(flags) ? "un" : ""); TRACE("Filesystem size %lld bytes\n", msblk->bytes_used); TRACE("Block size %d\n", msblk->block_size); TRACE("Number of inodes %d\n", msblk->inodes); TRACE("Number of fragments %d\n", msblk->fragments); TRACE("Number of ids %d\n", msblk->ids); TRACE("sblk->inode_table_start %llx\n", msblk->inode_table); TRACE("sblk->directory_table_start %llx\n", msblk->directory_table); TRACE("sblk->fragment_table_start %llx\n", (u64) le64_to_cpu(sblk->fragment_table_start)); TRACE("sblk->id_table_start %llx\n", (u64) le64_to_cpu(sblk->id_table_start)); sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_time_min = 0; sb->s_time_max = U32_MAX; sb->s_flags |= SB_RDONLY; sb->s_op = &squashfs_super_ops; msblk->block_cache = squashfs_cache_init("metadata", SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE); if (IS_ERR(msblk->block_cache)) { err = PTR_ERR(msblk->block_cache); goto failed_mount; } /* Allocate read_page block */ msblk->read_page = squashfs_cache_init("data", SQUASHFS_READ_PAGES, msblk->block_size); if (IS_ERR(msblk->read_page)) { errorf(fc, "Failed to allocate read_page block"); err = PTR_ERR(msblk->read_page); goto failed_mount; } if (msblk->devblksize == PAGE_SIZE) { struct inode *cache = new_inode(sb); if (cache == NULL) { err = -ENOMEM; goto failed_mount; } set_nlink(cache, 1); cache->i_size = OFFSET_MAX; mapping_set_gfp_mask(cache->i_mapping, GFP_NOFS); msblk->cache_mapping = cache->i_mapping; } msblk->stream = squashfs_decompressor_setup(sb, flags); if (IS_ERR(msblk->stream)) { err = PTR_ERR(msblk->stream); msblk->stream = NULL; goto insanity; } /* Handle xattrs */ sb->s_xattr = squashfs_xattr_handlers; xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start); if (xattr_id_table_start == SQUASHFS_INVALID_BLK) { next_table = msblk->bytes_used; goto allocate_id_index_table; } /* Allocate and read xattr id lookup table */ msblk->xattr_id_table = squashfs_read_xattr_id_table(sb, xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids); if (IS_ERR(msblk->xattr_id_table)) { errorf(fc, "unable to read xattr id index table"); err = PTR_ERR(msblk->xattr_id_table); msblk->xattr_id_table = NULL; if (err != -ENOTSUPP) goto failed_mount; } next_table = msblk->xattr_table; allocate_id_index_table: /* Allocate and read id index table */ msblk->id_table = squashfs_read_id_index_table(sb, le64_to_cpu(sblk->id_table_start), next_table, msblk->ids); if (IS_ERR(msblk->id_table)) { errorf(fc, "unable to read id index table"); err = PTR_ERR(msblk->id_table); msblk->id_table = NULL; goto failed_mount; } next_table = 
le64_to_cpu(msblk->id_table[0]); /* Handle inode lookup table */ lookup_table_start = le64_to_cpu(sblk->lookup_table_start); if (lookup_table_start == SQUASHFS_INVALID_BLK) goto handle_fragments; /* Allocate and read inode lookup table */ msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb, lookup_table_start, next_table, msblk->inodes); if (IS_ERR(msblk->inode_lookup_table)) { errorf(fc, "unable to read inode lookup table"); err = PTR_ERR(msblk->inode_lookup_table); msblk->inode_lookup_table = NULL; goto failed_mount; } next_table = le64_to_cpu(msblk->inode_lookup_table[0]); sb->s_export_op = &squashfs_export_ops; handle_fragments: fragments = msblk->fragments; if (fragments == 0) goto check_directory_table; msblk->fragment_cache = squashfs_cache_init("fragment", min(SQUASHFS_CACHED_FRAGMENTS, fragments), msblk->block_size); if (IS_ERR(msblk->fragment_cache)) { err = PTR_ERR(msblk->fragment_cache); goto failed_mount; } /* Allocate and read fragment index table */ msblk->fragment_index = squashfs_read_fragment_index_table(sb, le64_to_cpu(sblk->fragment_table_start), next_table, fragments); if (IS_ERR(msblk->fragment_index)) { errorf(fc, "unable to read fragment index table"); err = PTR_ERR(msblk->fragment_index); msblk->fragment_index = NULL; goto failed_mount; } next_table = le64_to_cpu(msblk->fragment_index[0]); check_directory_table: /* Sanity check directory_table */ if (msblk->directory_table > next_table) { err = -EINVAL; goto insanity; } /* Sanity check inode_table */ if (msblk->inode_table >= msblk->directory_table) { err = -EINVAL; goto insanity; } /* allocate root */ root = new_inode(sb); if (!root) { err = -ENOMEM; goto failed_mount; } err = squashfs_read_inode(root, root_inode); if (err) { make_bad_inode(root); iput(root); goto failed_mount; } insert_inode_hash(root); sb->s_root = d_make_root(root); if (sb->s_root == NULL) { ERROR("Root inode create failed\n"); err = -ENOMEM; goto failed_mount; } TRACE("Leaving squashfs_fill_super\n"); kfree(sblk); return 0; insanity: errorf(fc, "squashfs image failed sanity check"); failed_mount: squashfs_cache_delete(msblk->block_cache); squashfs_cache_delete(msblk->fragment_cache); squashfs_cache_delete(msblk->read_page); if (msblk->cache_mapping) iput(msblk->cache_mapping->host); msblk->thread_ops->destroy(msblk); kfree(msblk->inode_lookup_table); kfree(msblk->fragment_index); kfree(msblk->id_table); kfree(msblk->xattr_id_table); kfree(sb->s_fs_info); sb->s_fs_info = NULL; kfree(sblk); return err; } static int squashfs_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, squashfs_fill_super); } static int squashfs_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct squashfs_sb_info *msblk = sb->s_fs_info; struct squashfs_mount_opts *opts = fc->fs_private; sync_filesystem(fc->root->d_sb); fc->sb_flags |= SB_RDONLY; msblk->panic_on_errors = (opts->errors == Opt_errors_panic); return 0; } static void squashfs_free_fs_context(struct fs_context *fc) { kfree(fc->fs_private); } static const struct fs_context_operations squashfs_context_ops = { .get_tree = squashfs_get_tree, .free = squashfs_free_fs_context, .parse_param = squashfs_parse_param, .reconfigure = squashfs_reconfigure, }; static int squashfs_show_options(struct seq_file *s, struct dentry *root) { struct super_block *sb = root->d_sb; struct squashfs_sb_info *msblk = sb->s_fs_info; if (msblk->panic_on_errors) seq_puts(s, ",errors=panic"); else seq_puts(s, ",errors=continue"); #ifdef CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT if 
(msblk->thread_ops == &squashfs_decompressor_single) { seq_puts(s, ",threads=single"); return 0; } if (msblk->thread_ops == &squashfs_decompressor_percpu) { seq_puts(s, ",threads=percpu"); return 0; } #endif #ifdef CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS seq_printf(s, ",threads=%d", msblk->max_thread_num); #endif return 0; } static int squashfs_init_fs_context(struct fs_context *fc) { struct squashfs_mount_opts *opts; opts = kzalloc_obj(*opts); if (!opts) return -ENOMEM; #ifdef CONFIG_SQUASHFS_DECOMP_SINGLE opts->thread_ops = &squashfs_decompressor_single; #elif defined(CONFIG_SQUASHFS_DECOMP_MULTI) opts->thread_ops = &squashfs_decompressor_multi; #elif defined(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) opts->thread_ops = &squashfs_decompressor_percpu; #else #error "fail: unknown squashfs decompression thread mode?" #endif opts->thread_num = 0; fc->fs_private = opts; fc->ops = &squashfs_context_ops; return 0; } static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info; u64 id = huge_encode_dev(dentry->d_sb->s_bdev->bd_dev); TRACE("Entered squashfs_statfs\n"); buf->f_type = SQUASHFS_MAGIC; buf->f_bsize = msblk->block_size; buf->f_blocks = ((msblk->bytes_used - 1) >> msblk->block_log) + 1; buf->f_bfree = buf->f_bavail = 0; buf->f_files = msblk->inodes; buf->f_ffree = 0; buf->f_namelen = SQUASHFS_NAME_LEN; buf->f_fsid = u64_to_fsid(id); return 0; } static void squashfs_put_super(struct super_block *sb) { if (sb->s_fs_info) { struct squashfs_sb_info *sbi = sb->s_fs_info; squashfs_cache_delete(sbi->block_cache); squashfs_cache_delete(sbi->fragment_cache); squashfs_cache_delete(sbi->read_page); if (sbi->cache_mapping) iput(sbi->cache_mapping->host); sbi->thread_ops->destroy(sbi); kfree(sbi->id_table); kfree(sbi->fragment_index); kfree(sbi->meta_index); kfree(sbi->inode_lookup_table); kfree(sbi->xattr_id_table); kfree(sb->s_fs_info); sb->s_fs_info = NULL; } } static struct kmem_cache *squashfs_inode_cachep; static void init_once(void *foo) { struct squashfs_inode_info *ei = foo; inode_init_once(&ei->vfs_inode); } static int __init init_inodecache(void) { squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache", sizeof(struct squashfs_inode_info), 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, init_once); return squashfs_inode_cachep ? 0 : -ENOMEM; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(squashfs_inode_cachep); } static int __init init_squashfs_fs(void) { int err = init_inodecache(); if (err) return err; err = register_filesystem(&squashfs_fs_type); if (err) { destroy_inodecache(); return err; } pr_info("version 4.0 (2009/01/31) Phillip Lougher\n"); return 0; } static void __exit exit_squashfs_fs(void) { unregister_filesystem(&squashfs_fs_type); destroy_inodecache(); } static struct inode *squashfs_alloc_inode(struct super_block *sb) { struct squashfs_inode_info *ei = alloc_inode_sb(sb, squashfs_inode_cachep, GFP_KERNEL); return ei ? 
		&ei->vfs_inode : NULL;
}

static void squashfs_free_inode(struct inode *inode)
{
	kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode));
}

static struct file_system_type squashfs_fs_type = {
	.owner = THIS_MODULE,
	.name = "squashfs",
	.init_fs_context = squashfs_init_fs_context,
	.parameters = squashfs_fs_parameters,
	.kill_sb = kill_block_super,
	.fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("squashfs");

static const struct super_operations squashfs_super_ops = {
	.alloc_inode = squashfs_alloc_inode,
	.free_inode = squashfs_free_inode,
	.statfs = squashfs_statfs,
	.put_super = squashfs_put_super,
	.show_options = squashfs_show_options,
};

module_init(init_squashfs_fs);
module_exit(exit_squashfs_fs);
MODULE_DESCRIPTION("squashfs 4.0, a compressed read-only filesystem");
MODULE_AUTHOR("Phillip Lougher <phillip@squashfs.org.uk>");
MODULE_LICENSE("GPL");
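/*
 * Mount-option summary (illustrative, derived only from squashfs_parse_param()
 * and the fs_parameter_spec table earlier in this file):
 *
 *   errors=continue | errors=panic
 *	Sets msblk->panic_on_errors, i.e. whether detected corruption should
 *	(as the field name suggests) panic the kernel or just be reported.
 *
 *   threads=single | threads=multi | threads=percpu
 *	Selects the decompressor thread model; accepted only when
 *	CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT is enabled.
 *
 *   threads=<n>
 *	Uses up to <n> decompressor threads (capped by
 *	thread_ops->max_decompressors()); accepted only when
 *	CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS is enabled, with <n> == 1
 *	additionally requiring CONFIG_SQUASHFS_DECOMP_SINGLE.
 *
 * A typical invocation might look like
 *	mount -t squashfs -o errors=panic,threads=percpu /dev/sdb1 /mnt
 * though the exact set of accepted values depends on the kernel config.
 */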
// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/fs/hfsplus/super.c
 *
 * Copyright (C) 2001
 * Brad Boyer (flar@allandria.com)
 * (C) 2003 Ardis Technologies <roman@ardistech.com>
 *
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/fs_context.h>
#include <linux/slab.h>
#include <linux/vfs.h> #include <linux/nls.h> static struct inode *hfsplus_alloc_inode(struct super_block *sb); static void hfsplus_free_inode(struct inode *inode); #include "hfsplus_fs.h" #include "xattr.h" static int hfsplus_system_read_inode(struct inode *inode) { struct hfsplus_vh *vhdr = HFSPLUS_SB(inode->i_sb)->s_vhdr; switch (inode->i_ino) { case HFSPLUS_EXT_CNID: hfsplus_inode_read_fork(inode, &vhdr->ext_file); inode->i_mapping->a_ops = &hfsplus_btree_aops; break; case HFSPLUS_CAT_CNID: hfsplus_inode_read_fork(inode, &vhdr->cat_file); inode->i_mapping->a_ops = &hfsplus_btree_aops; break; case HFSPLUS_ALLOC_CNID: hfsplus_inode_read_fork(inode, &vhdr->alloc_file); inode->i_mapping->a_ops = &hfsplus_aops; break; case HFSPLUS_START_CNID: hfsplus_inode_read_fork(inode, &vhdr->start_file); break; case HFSPLUS_ATTR_CNID: hfsplus_inode_read_fork(inode, &vhdr->attr_file); inode->i_mapping->a_ops = &hfsplus_btree_aops; break; default: return -EIO; } /* * Assign a dummy file type, for may_open() requires that * an inode has a valid file type. */ inode->i_mode = S_IFREG; return 0; } struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) { struct hfs_find_data fd; struct inode *inode; int err; inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode_state_read_once(inode) & I_NEW)) return inode; atomic_set(&HFSPLUS_I(inode)->opencnt, 0); HFSPLUS_I(inode)->first_blocks = 0; HFSPLUS_I(inode)->clump_blocks = 0; HFSPLUS_I(inode)->alloc_blocks = 0; HFSPLUS_I(inode)->cached_start = U32_MAX; HFSPLUS_I(inode)->cached_blocks = 0; memset(HFSPLUS_I(inode)->first_extents, 0, sizeof(hfsplus_extent_rec)); memset(HFSPLUS_I(inode)->cached_extents, 0, sizeof(hfsplus_extent_rec)); HFSPLUS_I(inode)->extent_state = 0; mutex_init(&HFSPLUS_I(inode)->extents_lock); HFSPLUS_I(inode)->rsrc_inode = NULL; HFSPLUS_I(inode)->create_date = 0; HFSPLUS_I(inode)->linkid = 0; HFSPLUS_I(inode)->flags = 0; HFSPLUS_I(inode)->fs_blocks = 0; HFSPLUS_I(inode)->userflags = 0; HFSPLUS_I(inode)->subfolders = 0; INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list); spin_lock_init(&HFSPLUS_I(inode)->open_dir_lock); HFSPLUS_I(inode)->phys_size = 0; if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID || inode->i_ino == HFSPLUS_ROOT_CNID) { err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); if (!err) { err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); if (!err) err = hfsplus_cat_read_inode(inode, &fd); hfs_find_exit(&fd); } } else { err = hfsplus_system_read_inode(inode); } if (err) { iget_failed(inode); return ERR_PTR(err); } unlock_new_inode(inode); return inode; } static int hfsplus_system_write_inode(struct inode *inode) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); struct hfsplus_vh *vhdr = sbi->s_vhdr; struct hfsplus_fork_raw *fork; struct hfs_btree *tree = NULL; switch (inode->i_ino) { case HFSPLUS_EXT_CNID: fork = &vhdr->ext_file; tree = sbi->ext_tree; break; case HFSPLUS_CAT_CNID: fork = &vhdr->cat_file; tree = sbi->cat_tree; break; case HFSPLUS_ALLOC_CNID: fork = &vhdr->alloc_file; break; case HFSPLUS_START_CNID: fork = &vhdr->start_file; break; case HFSPLUS_ATTR_CNID: fork = &vhdr->attr_file; tree = sbi->attr_tree; break; default: return -EIO; } if (fork->total_size != cpu_to_be64(inode->i_size)) { set_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags); hfsplus_mark_mdb_dirty(inode->i_sb); } hfsplus_inode_write_fork(inode, fork); if (tree) { mutex_lock_nested(&tree->tree_lock, hfsplus_btree_lock_class(tree)); int err = hfs_btree_write(tree); mutex_unlock(&tree->tree_lock); if (err) { 
pr_err("b-tree write err: %d, ino %llu\n", err, inode->i_ino); return err; } } return 0; } static int hfsplus_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; hfs_dbg("ino %llu\n", inode->i_ino); err = hfsplus_ext_write_extent(inode); if (err) return err; if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID || inode->i_ino == HFSPLUS_ROOT_CNID) return hfsplus_cat_write_inode(inode); else return hfsplus_system_write_inode(inode); } static void hfsplus_evict_inode(struct inode *inode) { hfs_dbg("ino %llu\n", inode->i_ino); truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (HFSPLUS_IS_RSRC(inode)) { HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL; iput(HFSPLUS_I(inode)->rsrc_inode); } } int hfsplus_commit_superblock(struct super_block *sb) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct hfsplus_vh *vhdr = sbi->s_vhdr; int write_backup = 0; int error = 0, error2; hfs_dbg("starting...\n"); mutex_lock(&sbi->vh_mutex); mutex_lock(&sbi->alloc_mutex); vhdr->free_blocks = cpu_to_be32(sbi->free_blocks); vhdr->next_cnid = cpu_to_be32(sbi->next_cnid); vhdr->folder_count = cpu_to_be32(sbi->folder_count); vhdr->file_count = cpu_to_be32(sbi->file_count); hfs_dbg("free_blocks %u, next_cnid %u, folder_count %u, file_count %u\n", sbi->free_blocks, sbi->next_cnid, sbi->folder_count, sbi->file_count); if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) { memcpy(sbi->s_backup_vhdr, sbi->s_vhdr, sizeof(*sbi->s_vhdr)); write_backup = 1; } error2 = hfsplus_submit_bio(sb, sbi->part_start + HFSPLUS_VOLHEAD_SECTOR, sbi->s_vhdr_buf, NULL, REQ_OP_WRITE); if (!error) error = error2; if (!write_backup) goto out; error2 = hfsplus_submit_bio(sb, sbi->part_start + sbi->sect_count - 2, sbi->s_backup_vhdr_buf, NULL, REQ_OP_WRITE); if (!error) error = error2; out: mutex_unlock(&sbi->alloc_mutex); mutex_unlock(&sbi->vh_mutex); hfs_dbg("finished: err %d\n", error); return error; } static int hfsplus_sync_fs(struct super_block *sb, int wait) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); int error, error2; if (!wait) return 0; hfs_dbg("starting...\n"); /* * Explicitly write out the special metadata inodes. * * While these special inodes are marked as hashed and written * out peridocically by the flusher threads we redirty them * during writeout of normal inodes, and thus the life lock * prevents us from getting the latest state to disk. 
*/ error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping); error2 = filemap_write_and_wait(sbi->ext_tree->inode->i_mapping); if (!error) error = error2; if (sbi->attr_tree) { error2 = filemap_write_and_wait(sbi->attr_tree->inode->i_mapping); if (!error) error = error2; } error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping); if (!error) error = error2; error2 = hfsplus_commit_superblock(sb); if (!error) error = error2; if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) blkdev_issue_flush(sb->s_bdev); hfs_dbg("finished: err %d\n", error); return error; } static void delayed_sync_fs(struct work_struct *work) { int err; struct hfsplus_sb_info *sbi; sbi = container_of(work, struct hfsplus_sb_info, sync_work.work); spin_lock(&sbi->work_lock); sbi->work_queued = 0; spin_unlock(&sbi->work_lock); err = hfsplus_sync_fs(sbi->alloc_file->i_sb, 1); if (err) pr_err("delayed sync fs err %d\n", err); } void hfsplus_mark_mdb_dirty(struct super_block *sb) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); unsigned long delay; if (sb_rdonly(sb)) return; spin_lock(&sbi->work_lock); if (!sbi->work_queued) { delay = msecs_to_jiffies(dirty_writeback_interval * 10); queue_delayed_work(system_long_wq, &sbi->sync_work, delay); sbi->work_queued = 1; } spin_unlock(&sbi->work_lock); } static void delayed_free(struct rcu_head *p) { struct hfsplus_sb_info *sbi = container_of(p, struct hfsplus_sb_info, rcu); unload_nls(sbi->nls); kfree(sbi); } static void hfsplus_put_super(struct super_block *sb) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); hfs_dbg("starting...\n"); cancel_delayed_work_sync(&sbi->sync_work); if (!sb_rdonly(sb) && sbi->s_vhdr) { struct hfsplus_vh *vhdr = sbi->s_vhdr; vhdr->modify_date = hfsp_now2mt(); vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT); vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT); hfsplus_sync_fs(sb, 1); } iput(sbi->alloc_file); iput(sbi->hidden_dir); hfs_btree_close(sbi->attr_tree); hfs_btree_close(sbi->cat_tree); hfs_btree_close(sbi->ext_tree); kfree(sbi->s_vhdr_buf); kfree(sbi->s_backup_vhdr_buf); hfs_dbg("finished\n"); } static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = HFSPLUS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = sbi->total_blocks << sbi->fs_shift; buf->f_bfree = sbi->free_blocks << sbi->fs_shift; buf->f_bavail = buf->f_bfree; buf->f_files = 0xFFFFFFFF; buf->f_ffree = 0xFFFFFFFF - sbi->next_cnid; buf->f_fsid = u64_to_fsid(id); buf->f_namelen = HFSPLUS_MAX_STRLEN; return 0; } static int hfsplus_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; sync_filesystem(sb); if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb)) return 0; if (!(fc->sb_flags & SB_RDONLY)) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct hfsplus_vh *vhdr = sbi->s_vhdr; if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { pr_warn("filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. 
leaving read-only.\n"); sb->s_flags |= SB_RDONLY; fc->sb_flags |= SB_RDONLY; } else if (test_bit(HFSPLUS_SB_FORCE, &sbi->flags)) { /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { pr_warn("filesystem is marked locked, leaving read-only.\n"); sb->s_flags |= SB_RDONLY; fc->sb_flags |= SB_RDONLY; } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { pr_warn("filesystem is marked journaled, leaving read-only.\n"); sb->s_flags |= SB_RDONLY; fc->sb_flags |= SB_RDONLY; } } return 0; } static const struct super_operations hfsplus_sops = { .alloc_inode = hfsplus_alloc_inode, .free_inode = hfsplus_free_inode, .write_inode = hfsplus_write_inode, .evict_inode = hfsplus_evict_inode, .put_super = hfsplus_put_super, .sync_fs = hfsplus_sync_fs, .statfs = hfsplus_statfs, .show_options = hfsplus_show_options, }; void hfsplus_prepare_volume_header_for_commit(struct hfsplus_vh *vhdr) { vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION); vhdr->modify_date = hfsp_now2mt(); be32_add_cpu(&vhdr->write_count, 1); vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); } static inline int hfsplus_get_hidden_dir_entry(struct super_block *sb, const struct qstr *str, hfsplus_cat_entry *entry) { struct hfs_find_data fd; int err; err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); if (unlikely(err)) return err; err = hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, str); if (unlikely(err)) goto free_fd; err = hfsplus_brec_read_cat(&fd, entry); if (err) err = -ENOENT; free_fd: hfs_find_exit(&fd); return err; } static int hfsplus_fill_super(struct super_block *sb, struct fs_context *fc) { struct hfsplus_vh *vhdr; struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); hfsplus_cat_entry entry; struct inode *root, *inode; struct qstr str; struct nls_table *nls; u64 last_fs_block, last_fs_page; int silent = fc->sb_flags & SB_SILENT; int err; mutex_init(&sbi->alloc_mutex); mutex_init(&sbi->vh_mutex); spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs); err = -EINVAL; if (!sbi->nls) { /* try utf8 first, as this is the old default behaviour */ sbi->nls = load_nls("utf8"); if (!sbi->nls) sbi->nls = load_nls_default(); } /* temporarily use utf8 to correctly find the hidden dir below */ nls = sbi->nls; sbi->nls = load_nls("utf8"); if (!sbi->nls) { pr_err("unable to load nls for utf8\n"); goto out_unload_nls; } /* Grab the volume header */ if (hfsplus_read_wrapper(sb)) { if (!silent) pr_warn("unable to find HFS+ superblock\n"); goto out_unload_nls; } vhdr = sbi->s_vhdr; /* Copy parts of the volume header into the superblock */ sb->s_magic = HFSPLUS_VOLHEAD_SIG; if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION || be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) { pr_err("wrong filesystem version\n"); goto out_free_vhdr; } sbi->total_blocks = be32_to_cpu(vhdr->total_blocks); sbi->free_blocks = be32_to_cpu(vhdr->free_blocks); sbi->next_cnid = be32_to_cpu(vhdr->next_cnid); sbi->file_count = be32_to_cpu(vhdr->file_count); sbi->folder_count = be32_to_cpu(vhdr->folder_count); sbi->data_clump_blocks = be32_to_cpu(vhdr->data_clump_sz) >> sbi->alloc_blksz_shift; if (!sbi->data_clump_blocks) sbi->data_clump_blocks = 1; sbi->rsrc_clump_blocks = be32_to_cpu(vhdr->rsrc_clump_sz) >> sbi->alloc_blksz_shift; if (!sbi->rsrc_clump_blocks) sbi->rsrc_clump_blocks = 1; err = -EFBIG; last_fs_block = sbi->total_blocks - 1; last_fs_page = (last_fs_block << sbi->alloc_blksz_shift) >> PAGE_SHIFT; if ((last_fs_block 
> (sector_t)(~0ULL) >> (sbi->alloc_blksz_shift - 9)) || (last_fs_page > (pgoff_t)(~0ULL))) { pr_err("filesystem size too large\n"); goto out_free_vhdr; } /* Set up operations so we can load metadata */ sb->s_op = &hfsplus_sops; sb->s_maxbytes = MAX_LFS_FILESIZE; if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { pr_warn("Filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. mounting read-only.\n"); sb->s_flags |= SB_RDONLY; } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) { /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { pr_warn("Filesystem is marked locked, mounting read-only.\n"); sb->s_flags |= SB_RDONLY; } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) && !sb_rdonly(sb)) { pr_warn("write access to a journaled filesystem is not supported, use the force option at your own risk, mounting read-only.\n"); sb->s_flags |= SB_RDONLY; } err = -EINVAL; /* Load metadata objects (B*Trees) */ sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); if (!sbi->ext_tree) { pr_err("failed to load extents file\n"); goto out_free_vhdr; } sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); if (!sbi->cat_tree) { pr_err("failed to load catalog file\n"); goto out_close_ext_tree; } atomic_set(&sbi->attr_tree_state, HFSPLUS_EMPTY_ATTR_TREE); if (vhdr->attr_file.total_blocks != 0) { sbi->attr_tree = hfs_btree_open(sb, HFSPLUS_ATTR_CNID); if (!sbi->attr_tree) { pr_err("failed to load attributes file\n"); goto out_close_cat_tree; } atomic_set(&sbi->attr_tree_state, HFSPLUS_VALID_ATTR_TREE); } sb->s_xattr = hfsplus_xattr_handlers; inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID); if (IS_ERR(inode)) { pr_err("failed to load allocation file\n"); err = PTR_ERR(inode); goto out_close_attr_tree; } sbi->alloc_file = inode; /* Load the root directory */ root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID); if (IS_ERR(root)) { pr_err("failed to load root directory\n"); err = PTR_ERR(root); goto out_put_alloc_file; } set_default_d_op(sb, &hfsplus_dentry_operations); sb->s_root = d_make_root(root); if (!sb->s_root) { err = -ENOMEM; goto out_put_alloc_file; } str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; str.name = HFSP_HIDDENDIR_NAME; err = hfsplus_get_hidden_dir_entry(sb, &str, &entry); if (err == -ENOENT) { /* * Hidden directory is absent or it cannot be read. */ } else if (unlikely(err)) { goto out_put_root; } else { if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) { err = -EIO; goto out_put_root; } inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id)); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_put_root; } sbi->hidden_dir = inode; } if (!sb_rdonly(sb)) { /* * H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused * all three are registered with Apple for our use */ hfsplus_prepare_volume_header_for_commit(vhdr); hfsplus_sync_fs(sb, 1); if (!sbi->hidden_dir) { mutex_lock(&sbi->vh_mutex); sbi->hidden_dir = hfsplus_new_inode(sb, root, S_IFDIR); if (!sbi->hidden_dir) { mutex_unlock(&sbi->vh_mutex); err = -ENOMEM; goto out_put_root; } err = hfsplus_create_cat(sbi->hidden_dir->i_ino, root, &str, sbi->hidden_dir); if (err) { mutex_unlock(&sbi->vh_mutex); goto out_put_hidden_dir; } err = hfsplus_init_security(sbi->hidden_dir, root, &str); if (err == -EOPNOTSUPP) err = 0; /* Operation is not supported. */ else if (err) { /* * Try to delete anyway without * error analysis. 
*/ hfsplus_delete_cat(sbi->hidden_dir->i_ino, root, &str); mutex_unlock(&sbi->vh_mutex); goto out_put_hidden_dir; } mutex_unlock(&sbi->vh_mutex); hfsplus_mark_inode_dirty(HFSPLUS_CAT_TREE_I(sb), HFSPLUS_I_CAT_DIRTY); hfsplus_mark_inode_dirty(sbi->hidden_dir, HFSPLUS_I_CAT_DIRTY); } } unload_nls(sbi->nls); sbi->nls = nls; return 0; out_put_hidden_dir: cancel_delayed_work_sync(&sbi->sync_work); iput(sbi->hidden_dir); out_put_root: dput(sb->s_root); sb->s_root = NULL; out_put_alloc_file: iput(sbi->alloc_file); out_close_attr_tree: hfs_btree_close(sbi->attr_tree); out_close_cat_tree: hfs_btree_close(sbi->cat_tree); out_close_ext_tree: hfs_btree_close(sbi->ext_tree); out_free_vhdr: kfree(sbi->s_vhdr_buf); kfree(sbi->s_backup_vhdr_buf); out_unload_nls: unload_nls(nls); return err; } MODULE_AUTHOR("Brad Boyer"); MODULE_DESCRIPTION("Extended Macintosh Filesystem"); MODULE_LICENSE("GPL"); static struct kmem_cache *hfsplus_inode_cachep; static struct inode *hfsplus_alloc_inode(struct super_block *sb) { struct hfsplus_inode_info *i; i = alloc_inode_sb(sb, hfsplus_inode_cachep, GFP_KERNEL); return i ? &i->vfs_inode : NULL; } static void hfsplus_free_inode(struct inode *inode) { kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode)); } #define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) static int hfsplus_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, hfsplus_fill_super); } static void hfsplus_free_fc(struct fs_context *fc) { kfree(fc->s_fs_info); } static const struct fs_context_operations hfsplus_context_ops = { .parse_param = hfsplus_parse_param, .get_tree = hfsplus_get_tree, .reconfigure = hfsplus_reconfigure, .free = hfsplus_free_fc, }; static int hfsplus_init_fs_context(struct fs_context *fc) { struct hfsplus_sb_info *sbi; sbi = kzalloc_obj(struct hfsplus_sb_info); if (!sbi) return -ENOMEM; if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) hfsplus_fill_defaults(sbi); fc->s_fs_info = sbi; fc->ops = &hfsplus_context_ops; return 0; } static void hfsplus_kill_super(struct super_block *sb) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); kill_block_super(sb); call_rcu(&sbi->rcu, delayed_free); } static struct file_system_type hfsplus_fs_type = { .owner = THIS_MODULE, .name = "hfsplus", .kill_sb = hfsplus_kill_super, .fs_flags = FS_REQUIRES_DEV, .init_fs_context = hfsplus_init_fs_context, }; MODULE_ALIAS_FS("hfsplus"); static void hfsplus_init_once(void *p) { struct hfsplus_inode_info *i = p; inode_init_once(&i->vfs_inode); } static int __init init_hfsplus_fs(void) { int err; hfsplus_inode_cachep = kmem_cache_create("hfsplus_icache", HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, hfsplus_init_once); if (!hfsplus_inode_cachep) return -ENOMEM; err = hfsplus_create_attr_tree_cache(); if (err) goto destroy_inode_cache; err = register_filesystem(&hfsplus_fs_type); if (err) goto destroy_attr_tree_cache; return 0; destroy_attr_tree_cache: hfsplus_destroy_attr_tree_cache(); destroy_inode_cache: kmem_cache_destroy(hfsplus_inode_cachep); return err; } static void __exit exit_hfsplus_fs(void) { unregister_filesystem(&hfsplus_fs_type); /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); hfsplus_destroy_attr_tree_cache(); kmem_cache_destroy(hfsplus_inode_cachep); } module_init(init_hfsplus_fs) module_exit(exit_hfsplus_fs) |
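/*
 * Illustrative sketch, not part of the original file: roughly how the VFS
 * exercises the fs_context hooks registered above when an hfsplus volume is
 * mounted. Only symbols defined in this file plus the generic
 * fs_context_for_mount()/vfs_get_tree() helpers are referenced; the flow is a
 * reading aid, not upstream code, and the option string is hypothetical.
 *
 *	fc = fs_context_for_mount(&hfsplus_fs_type, 0);
 *		-> hfsplus_init_fs_context()   allocates the hfsplus_sb_info
 *	vfs_parse_fs_string(fc, "nls", ...);
 *		-> hfsplus_parse_param()       fills in mount options
 *	vfs_get_tree(fc);
 *		-> hfsplus_get_tree()
 *		-> get_tree_bdev(fc, hfsplus_fill_super)
 *	On success sb->s_op points at hfsplus_sops and sb->s_root is the root
 *	directory; unmount runs hfsplus_put_super() and then
 *	hfsplus_kill_super(), which frees the sb_info via
 *	call_rcu(&sbi->rcu, delayed_free).
 */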
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque	<roque@di.fc.ul.pt>
 */

#ifndef _IP6_FIB_H
#define _IP6_FIB_H

#include <linux/ipv6_route.h>
#include <linux/rtnetlink.h>
#include <linux/spinlock.h>
#include <linux/notifier.h>
#include <net/dst.h>
#include <net/flow.h>
#include <net/ip_fib.h>
#include <net/netlink.h>
#include <net/inetpeer.h>
#include <net/fib_notifier.h>
#include <linux/indirect_call_wrapper.h>
#include <uapi/linux/bpf.h>

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
#define FIB6_TABLE_HASHSZ 256
#else
#define FIB6_TABLE_HASHSZ 1
#endif

#define RT6_DEBUG 2

struct rt6_info;
struct fib6_info;

struct fib6_config
{ u32 fc_table; u32 fc_metric; int fc_dst_len; int fc_src_len; int fc_ifindex; u32 fc_flags; u32 fc_protocol; u16 fc_type; /* only 8 bits are used */ u16 fc_delete_all_nh : 1, fc_ignore_dev_down:1, __unused : 14; u32 fc_nh_id; struct in6_addr fc_dst; struct in6_addr fc_src; struct in6_addr fc_prefsrc; struct in6_addr fc_gateway; unsigned long fc_expires; struct nlattr *fc_mx; int fc_mx_len; int fc_mp_len; struct nlattr *fc_mp; struct nl_info fc_nlinfo; struct nlattr *fc_encap; u16 fc_encap_type; bool fc_is_fdb; }; struct fib6_node { struct fib6_node __rcu *parent; struct fib6_node __rcu *left; struct fib6_node __rcu *right; #ifdef CONFIG_IPV6_SUBTREES struct fib6_node __rcu *subtree; #endif struct fib6_info __rcu *leaf; __u16 fn_bit; /* bit key */ __u16 fn_flags; int fn_sernum; struct fib6_info __rcu *rr_ptr; struct rcu_head rcu; }; struct fib6_gc_args { int timeout; int more; }; #ifndef CONFIG_IPV6_SUBTREES #define FIB6_SUBTREE(fn) NULL static inline bool fib6_routes_require_src(const struct net *net) { return false; } static inline void fib6_routes_require_src_inc(struct net *net) {} static inline void fib6_routes_require_src_dec(struct net *net) {} #else static inline bool fib6_routes_require_src(const struct net *net) { return net->ipv6.fib6_routes_require_src > 0; } static inline void fib6_routes_require_src_inc(struct net *net) { net->ipv6.fib6_routes_require_src++; } static inline void fib6_routes_require_src_dec(struct net *net) { net->ipv6.fib6_routes_require_src--; } #define FIB6_SUBTREE(fn) (rcu_dereference_protected((fn)->subtree, 1)) #endif /* * routing information * */ struct rt6key { struct in6_addr addr; int plen; }; struct fib6_table; struct rt6_exception_bucket { struct hlist_head chain; int depth; }; struct rt6_exception { struct hlist_node hlist; struct rt6_info *rt6i; unsigned long stamp; struct rcu_head rcu; }; #define FIB6_EXCEPTION_BUCKET_SIZE_SHIFT 10 #define FIB6_EXCEPTION_BUCKET_SIZE (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT) #define FIB6_MAX_DEPTH 5 struct fib6_nh { struct fib_nh_common nh_common; #ifdef CONFIG_IPV6_ROUTER_PREF unsigned long last_probe; #endif struct rt6_info * __percpu *rt6i_pcpu; struct rt6_exception_bucket __rcu *rt6i_exception_bucket; }; struct fib6_info { struct fib6_table *fib6_table; struct fib6_info __rcu *fib6_next; struct fib6_node __rcu *fib6_node; /* Multipath routes: * siblings is a list of fib6_info that have the same metric/weight, * destination, but not the same gateway. nsiblings is just a cache * to speed up lookup. 
*/ union { struct list_head fib6_siblings; struct list_head nh_list; }; unsigned int fib6_nsiblings; refcount_t fib6_ref; unsigned long expires; struct hlist_node gc_link; struct dst_metrics *fib6_metrics; #define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] struct rt6key fib6_dst; u32 fib6_flags; struct rt6key fib6_src; struct rt6key fib6_prefsrc; u32 fib6_metric; u8 fib6_protocol; u8 fib6_type; u8 offload; u8 trap; u8 offload_failed; u8 should_flush:1, dst_nocount:1, dst_nopolicy:1, fib6_destroying:1, unused:4; struct list_head purge_link; struct rcu_head rcu; struct nexthop *nh; struct fib6_nh fib6_nh[]; }; struct rt6_info { struct dst_entry dst; struct fib6_info __rcu *from; int sernum; struct rt6key rt6i_dst; struct rt6key rt6i_src; struct in6_addr rt6i_gateway; struct inet6_dev *rt6i_idev; u32 rt6i_flags; /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; }; struct fib6_result { struct fib6_nh *nh; struct fib6_info *f6i; u32 fib6_flags; u8 fib6_type; struct rt6_info *rt6; }; #define for_each_fib6_node_rt_rcu(fn) \ for (rt = rcu_dereference((fn)->leaf); rt; \ rt = rcu_dereference(rt->fib6_next)) #define for_each_fib6_walker_rt(w) \ for (rt = (w)->leaf; rt; \ rt = rcu_dereference_protected(rt->fib6_next, 1)) #define dst_rt6_info(_ptr) container_of_const(_ptr, struct rt6_info, dst) static inline struct inet6_dev *ip6_dst_idev(const struct dst_entry *dst) { return dst_rt6_info(dst)->rt6i_idev; } static inline bool fib6_requires_src(const struct fib6_info *rt) { return rt->fib6_src.plen > 0; } /* The callers should hold f6i->fib6_table->tb6_lock if a route has ever * been added to a table before. */ static inline void fib6_clean_expires(struct fib6_info *f6i) { f6i->fib6_flags &= ~RTF_EXPIRES; f6i->expires = 0; } /* The callers should hold f6i->fib6_table->tb6_lock if a route has ever * been added to a table before. */ static inline void fib6_set_expires(struct fib6_info *f6i, unsigned long expires) { f6i->expires = expires; f6i->fib6_flags |= RTF_EXPIRES; } static inline bool fib6_check_expired(const struct fib6_info *f6i) { if (f6i->fib6_flags & RTF_EXPIRES) return time_after(jiffies, f6i->expires); return false; } /* Function to safely get fn->fn_sernum for passed in rt * and store result in passed in cookie. * Return true if we can get cookie safely * Return false if not */ static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, u32 *cookie) { struct fib6_node *fn; bool status = false; fn = rcu_dereference(f6i->fib6_node); if (fn) { *cookie = READ_ONCE(fn->fn_sernum); /* pairs with smp_wmb() in __fib6_update_sernum_upto_root() */ smp_rmb(); status = true; } return status; } static inline u32 rt6_get_cookie(const struct rt6_info *rt) { struct fib6_info *from; u32 cookie = 0; if (rt->sernum) return rt->sernum; rcu_read_lock(); from = rcu_dereference(rt->from); if (from) fib6_get_cookie_safe(from, &cookie); rcu_read_unlock(); return cookie; } static inline void ip6_rt_put(struct rt6_info *rt) { /* dst_release() accepts a NULL parameter. 
* We rely on dst being first structure in struct rt6_info */ BUILD_BUG_ON(offsetof(struct rt6_info, dst) != 0); dst_release(&rt->dst); } struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh); void fib6_info_destroy_rcu(struct rcu_head *head); static inline void fib6_info_hold(struct fib6_info *f6i) { refcount_inc(&f6i->fib6_ref); } static inline bool fib6_info_hold_safe(struct fib6_info *f6i) { return refcount_inc_not_zero(&f6i->fib6_ref); } static inline void fib6_info_release(struct fib6_info *f6i) { if (f6i && refcount_dec_and_test(&f6i->fib6_ref)) { DEBUG_NET_WARN_ON_ONCE(!hlist_unhashed(&f6i->gc_link)); call_rcu_hurry(&f6i->rcu, fib6_info_destroy_rcu); } } enum fib6_walk_state { #ifdef CONFIG_IPV6_SUBTREES FWS_S, #endif FWS_L, FWS_R, FWS_C, FWS_U }; struct fib6_walker { struct list_head lh; struct fib6_node *root, *node; struct fib6_info *leaf; enum fib6_walk_state state; unsigned int skip; unsigned int count; unsigned int skip_in_node; int (*func)(struct fib6_walker *); void *args; }; struct rt6_statistics { __u32 fib_nodes; /* all fib6 nodes */ __u32 fib_route_nodes; /* intermediate nodes */ __u32 fib_rt_entries; /* rt entries in fib table */ __u32 fib_rt_cache; /* cached rt entries in exception table */ __u32 fib_discarded_routes; /* total number of routes delete */ /* The following stat is not protected by any lock */ atomic_t fib_rt_alloc; /* total number of routes alloced */ }; #define RTN_TL_ROOT 0x0001 #define RTN_ROOT 0x0002 /* tree root node */ #define RTN_RTINFO 0x0004 /* node with valid routing info */ /* * priority levels (or metrics) * */ struct fib6_table { struct hlist_node tb6_hlist; u32 tb6_id; spinlock_t tb6_lock; struct fib6_node tb6_root; struct inet_peer_base tb6_peers; unsigned int flags; unsigned int fib_seq; /* writes protected by rtnl_mutex */ struct hlist_head tb6_gc_hlist; /* GC candidates */ #define RT6_TABLE_HAS_DFLT_ROUTER BIT(0) }; #define RT6_TABLE_UNSPEC RT_TABLE_UNSPEC #define RT6_TABLE_MAIN RT_TABLE_MAIN #define RT6_TABLE_DFLT RT6_TABLE_MAIN #define RT6_TABLE_INFO RT6_TABLE_MAIN #define RT6_TABLE_PREFIX RT6_TABLE_MAIN #ifdef CONFIG_IPV6_MULTIPLE_TABLES #define FIB6_TABLE_MIN 1 #define FIB6_TABLE_MAX RT_TABLE_MAX #define RT6_TABLE_LOCAL RT_TABLE_LOCAL #else #define FIB6_TABLE_MIN RT_TABLE_MAIN #define FIB6_TABLE_MAX FIB6_TABLE_MIN #define RT6_TABLE_LOCAL RT6_TABLE_MAIN #endif typedef struct rt6_info *(*pol_lookup_t)(struct net *, struct fib6_table *, struct flowi6 *, const struct sk_buff *, int); struct fib6_entry_notifier_info { struct fib_notifier_info info; /* must be first */ struct fib6_info *rt; unsigned int nsiblings; }; /* * exported functions */ struct fib6_table *fib6_get_table(struct net *net, u32 id); struct fib6_table *fib6_new_table(struct net *net, u32 id); struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup); /* called with rcu lock held; can return error pointer * caller needs to select path */ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, struct fib6_result *res, int flags); /* called with rcu lock held; caller needs to select path */ int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, struct fib6_result *res, int strict); void fib6_select_path(const struct net *net, struct fib6_result *res, struct flowi6 *fl6, int oif, bool have_oif_match, const struct sk_buff *skb, int strict); struct fib6_node *fib6_node_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct 
in6_addr *saddr); struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, const struct in6_addr *saddr, int src_len, bool exact_match); void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *arg), void *arg); void fib6_clean_all_skip_notify(struct net *net, int (*func)(struct fib6_info *, void *arg), void *arg); int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack); int fib6_del(struct fib6_info *rt, struct nl_info *info); static inline void rt6_get_prefsrc(const struct rt6_info *rt, struct in6_addr *addr) { const struct fib6_info *from; rcu_read_lock(); from = rcu_dereference(rt->from); if (from) *addr = from->fib6_prefsrc.addr; else *addr = in6addr_any; rcu_read_unlock(); } #if IS_ENABLED(CONFIG_IPV6) int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); void fib6_nh_release(struct fib6_nh *fib6_nh); void fib6_nh_release_dsts(struct fib6_nh *fib6_nh); #else static inline int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel"); return -EAFNOSUPPORT; } static inline void fib6_nh_release(struct fib6_nh *fib6_nh) { } static inline void fib6_nh_release_dsts(struct fib6_nh *fib6_nh) { } #endif int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack); int call_fib6_multipath_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack); int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt); #if IS_ENABLED(CONFIG_IPV6) void fib6_rt_update(struct net *net, struct fib6_info *rt, struct nl_info *info); #else static inline void fib6_rt_update(struct net *net, struct fib6_info *rt, struct nl_info *info) { } #endif void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int flags); void fib6_age_exceptions(struct fib6_info *rt, struct fib6_gc_args *gc_args, unsigned long now); void fib6_run_gc(unsigned long expires, struct net *net, bool force); void fib6_gc_cleanup(void); int fib6_init(void); #if IS_ENABLED(CONFIG_IPV6) /* Add the route to the gc list if it is not already there * * The callers should hold f6i->fib6_table->tb6_lock. */ static inline void fib6_add_gc_list(struct fib6_info *f6i) { /* If fib6_node is null, the f6i is not in (or removed from) the * table. * * There is a gap between finding the f6i from the table and * calling this function without the protection of the tb6_lock. * This check makes sure the f6i is not added to the gc list when * it is not on the table. */ if (!rcu_dereference_protected(f6i->fib6_node, lockdep_is_held(&f6i->fib6_table->tb6_lock))) return; if (hlist_unhashed(&f6i->gc_link)) hlist_add_head(&f6i->gc_link, &f6i->fib6_table->tb6_gc_hlist); } /* Remove the route from the gc list if it is on the list. * * The callers should hold f6i->fib6_table->tb6_lock. 
*/ static inline void fib6_remove_gc_list(struct fib6_info *f6i) { if (!hlist_unhashed(&f6i->gc_link)) hlist_del_init(&f6i->gc_link); } static inline void fib6_may_remove_gc_list(struct net *net, struct fib6_info *f6i) { struct fib6_gc_args gc_args; if (hlist_unhashed(&f6i->gc_link)) return; gc_args.timeout = READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_interval); gc_args.more = 0; rcu_read_lock(); fib6_age_exceptions(f6i, &gc_args, jiffies); rcu_read_unlock(); } #endif struct ipv6_route_iter { struct seq_net_private p; struct fib6_walker w; loff_t skip; struct fib6_table *tbl; int sernum; }; extern const struct seq_operations ipv6_route_seq_ops; int call_fib6_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info); int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info); int __net_init fib6_notifier_init(struct net *net); void __net_exit fib6_notifier_exit(struct net *net); unsigned int fib6_tables_seq_read(const struct net *net); int fib6_tables_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); void fib6_update_sernum(struct net *net, struct fib6_info *rt); #if IS_ENABLED(CONFIG_IPV6) void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt); #else static inline void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt) { } #endif void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val); static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric) { return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric)); } void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, bool offload, bool trap, bool offload_failed); #if IS_ENABLED(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) struct bpf_iter__ipv6_route { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct fib6_info *, rt); }; #endif INDIRECT_CALLABLE_DECLARE(struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags)); INDIRECT_CALLABLE_DECLARE(struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags)); INDIRECT_CALLABLE_DECLARE(struct rt6_info *__ip6_route_redirect(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags)); INDIRECT_CALLABLE_DECLARE(struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags)); static inline struct rt6_info *pol_lookup_func(pol_lookup_t lookup, struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { return INDIRECT_CALL_4(lookup, ip6_pol_route_output, ip6_pol_route_input, ip6_pol_route_lookup, __ip6_route_redirect, net, table, fl6, skb, flags); } #ifdef CONFIG_IPV6_MULTIPLE_TABLES static inline bool fib6_has_custom_rules(const struct net *net) { return net->ipv6.fib6_has_custom_rules; } int fib6_rules_init(void); void fib6_rules_cleanup(void); bool fib6_rule_default(const struct fib_rule *rule); int fib6_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); unsigned int fib6_rules_seq_read(const struct net *net); static inline bool fib6_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, struct flowi6 *fl6, struct flow_keys *flkeys) { unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; if (!net->ipv6.fib6_rules_require_fldissect) return false; 
memset(flkeys, 0, sizeof(*flkeys)); __skb_flow_dissect(net, skb, &flow_keys_dissector, flkeys, NULL, 0, 0, 0, flag); fl6->fl6_sport = flkeys->ports.src; fl6->fl6_dport = flkeys->ports.dst; fl6->flowi6_proto = flkeys->basic.ip_proto; return true; } #else static inline bool fib6_has_custom_rules(const struct net *net) { return false; } static inline int fib6_rules_init(void) { return 0; } static inline void fib6_rules_cleanup(void) { return ; } static inline bool fib6_rule_default(const struct fib_rule *rule) { return true; } static inline int fib6_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return 0; } static inline unsigned int fib6_rules_seq_read(const struct net *net) { return 0; } static inline bool fib6_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, struct flowi6 *fl6, struct flow_keys *flkeys) { return false; } #endif #endif |
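/*
 * Illustrative sketch, not part of the original header: one plausible way a
 * caller could use the fib6_lookup() API declared above. Per the comment on
 * its declaration it must run under rcu_read_lock(), and the fib6_result
 * pointers are only stable inside that RCU section. The function name
 * example_fib6_resolve() is hypothetical; everything else comes from this
 * header or common kernel headers.
 *
 *	static int example_fib6_resolve(struct net *net, int oif,
 *					struct flowi6 *fl6)
 *	{
 *		struct fib6_result res = {};
 *		int err;
 *
 *		rcu_read_lock();
 *		err = fib6_lookup(net, oif, fl6, &res, 0);
 *		if (!err && res.f6i)
 *			pr_debug("fib6 match, table %u\n",
 *				 res.f6i->fib6_table->tb6_id);
 *		rcu_read_unlock();
 *		return err;
 *	}
 */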
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/open.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/string.h>
#include <linux/mm.h>
#include
<linux/file.h> #include <linux/fdtable.h> #include <linux/fsnotify.h> #include <linux/module.h> #include <linux/tty.h> #include <linux/namei.h> #include <linux/backing-dev.h> #include <linux/capability.h> #include <linux/securebits.h> #include <linux/security.h> #include <linux/mount.h> #include <linux/fcntl.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fs.h> #include <linux/personality.h> #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/rcupdate.h> #include <linux/audit.h> #include <linux/falloc.h> #include <linux/fs_struct.h> #include <linux/dnotify.h> #include <linux/compat.h> #include <linux/mnt_idmapping.h> #include <linux/filelock.h> #include "internal.h" int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry, loff_t length, unsigned int time_attrs, struct file *filp) { int ret; struct iattr newattrs; /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ if (length < 0) return -EINVAL; newattrs.ia_size = length; newattrs.ia_valid = ATTR_SIZE | time_attrs; if (filp) { newattrs.ia_file = filp; newattrs.ia_valid |= ATTR_FILE; } /* Remove suid, sgid, and file capabilities on truncate too */ ret = dentry_needs_remove_privs(idmap, dentry); if (ret < 0) return ret; if (ret) newattrs.ia_valid |= ret | ATTR_FORCE; ret = inode_lock_killable(dentry->d_inode); if (ret) return ret; /* Note any delegations or leases have already been broken: */ ret = notify_change(idmap, dentry, &newattrs, NULL); inode_unlock(dentry->d_inode); return ret; } int vfs_truncate(const struct path *path, loff_t length) { struct mnt_idmap *idmap; struct inode *inode; int error; inode = path->dentry->d_inode; /* For directories it's -EISDIR, for other non-regulars - -EINVAL */ if (S_ISDIR(inode->i_mode)) return -EISDIR; if (!S_ISREG(inode->i_mode)) return -EINVAL; idmap = mnt_idmap(path->mnt); error = inode_permission(idmap, inode, MAY_WRITE); if (error) return error; error = fsnotify_truncate_perm(path, length); if (error) return error; error = mnt_want_write(path->mnt); if (error) return error; error = -EPERM; if (IS_APPEND(inode)) goto mnt_drop_write_and_out; error = get_write_access(inode); if (error) goto mnt_drop_write_and_out; /* * Make sure that there are no leases. get_write_access() protects * against the truncate racing with a lease-granting setlease(). */ error = break_lease(inode, O_WRONLY); if (error) goto put_write_and_out; error = security_path_truncate(path); if (!error) error = do_truncate(idmap, path->dentry, length, 0, NULL); put_write_and_out: put_write_access(inode); mnt_drop_write_and_out: mnt_drop_write(path->mnt); return error; } EXPORT_SYMBOL_GPL(vfs_truncate); int ksys_truncate(const char __user *pathname, loff_t length) { unsigned int lookup_flags = LOOKUP_FOLLOW; struct path path; int error; if (length < 0) /* sorry, but loff_t says... 
*/ return -EINVAL; CLASS(filename, name)(pathname); retry: error = filename_lookup(AT_FDCWD, name, lookup_flags, &path, NULL); if (!error) { error = vfs_truncate(&path, length); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } } return error; } SYSCALL_DEFINE2(truncate, const char __user *, path, long, length) { return ksys_truncate(path, length); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length) { return ksys_truncate(path, length); } #endif int do_ftruncate(struct file *file, loff_t length, unsigned int flags) { struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; int error; if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE)) return -EINVAL; /* * Cannot ftruncate over 2^31 bytes without large file support, either * through opening with O_LARGEFILE or by using ftruncate64(). */ if (length > MAX_NON_LFS && !(file->f_flags & O_LARGEFILE) && !(flags & FTRUNCATE_LFS)) return -EINVAL; /* Check IS_APPEND on real upper inode */ if (IS_APPEND(file_inode(file))) return -EPERM; error = security_file_truncate(file); if (error) return error; error = fsnotify_truncate_perm(&file->f_path, length); if (error) return error; scoped_guard(super_write, inode->i_sb) return do_truncate(file_mnt_idmap(file), dentry, length, ATTR_MTIME | ATTR_CTIME, file); } int ksys_ftruncate(unsigned int fd, loff_t length, unsigned int flags) { if (length < 0) return -EINVAL; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; return do_ftruncate(fd_file(f), length, flags); } SYSCALL_DEFINE2(ftruncate, unsigned int, fd, off_t, length) { return ksys_ftruncate(fd, length, 0); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_off_t, length) { return ksys_ftruncate(fd, length, 0); } #endif /* LFS versions of truncate are only needed on 32 bit machines */ #if BITS_PER_LONG == 32 SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length) { return ksys_truncate(path, length); } SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length) { return ksys_ftruncate(fd, length, FTRUNCATE_LFS); } #endif /* BITS_PER_LONG == 32 */ #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_TRUNCATE64) COMPAT_SYSCALL_DEFINE3(truncate64, const char __user *, pathname, compat_arg_u64_dual(length)) { return ksys_truncate(pathname, compat_arg_u64_glue(length)); } #endif #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FTRUNCATE64) COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd, compat_arg_u64_dual(length)) { return ksys_ftruncate(fd, compat_arg_u64_glue(length), FTRUNCATE_LFS); } #endif int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); int ret; loff_t sum; if (offset < 0 || len <= 0) return -EINVAL; if (mode & ~(FALLOC_FL_MODE_MASK | FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; /* * Modes are exclusive, even if that is not obvious from the encoding * as bit masks and the mix with the flag in the same namespace. * * To make things even more complicated, FALLOC_FL_ALLOCATE_RANGE is * encoded as no bit set. 
*/ switch (mode & FALLOC_FL_MODE_MASK) { case FALLOC_FL_ALLOCATE_RANGE: case FALLOC_FL_UNSHARE_RANGE: case FALLOC_FL_ZERO_RANGE: break; case FALLOC_FL_PUNCH_HOLE: if (!(mode & FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; break; case FALLOC_FL_COLLAPSE_RANGE: case FALLOC_FL_INSERT_RANGE: case FALLOC_FL_WRITE_ZEROES: if (mode & FALLOC_FL_KEEP_SIZE) return -EOPNOTSUPP; break; default: return -EOPNOTSUPP; } if (!(file->f_mode & FMODE_WRITE)) return -EBADF; /* * On append-only files only space preallocation is supported. */ if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode)) return -EPERM; if (IS_IMMUTABLE(inode)) return -EPERM; /* * We cannot allow any fallocate operation on an active swapfile */ if (IS_SWAPFILE(inode)) return -ETXTBSY; /* * Revalidate the write permissions, in case security policy has * changed since the files were opened. */ ret = security_file_permission(file, MAY_WRITE); if (ret) return ret; ret = fsnotify_file_area_perm(file, MAY_WRITE, &offset, len); if (ret) return ret; if (S_ISFIFO(inode->i_mode)) return -ESPIPE; if (S_ISDIR(inode->i_mode)) return -EISDIR; if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) return -ENODEV; /* Check for wraparound */ if (check_add_overflow(offset, len, &sum)) return -EFBIG; if (sum > inode->i_sb->s_maxbytes) return -EFBIG; if (!file->f_op->fallocate) return -EOPNOTSUPP; file_start_write(file); ret = file->f_op->fallocate(file, mode, offset, len); /* * Create inotify and fanotify events. * * To keep the logic simple always create events if fallocate succeeds. * This implies that events are even created if the file size remains * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE. */ if (ret == 0) fsnotify_modify(file); file_end_write(file); return ret; } EXPORT_SYMBOL_GPL(vfs_fallocate); int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len) { CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; return vfs_fallocate(fd_file(f), mode, offset, len); } SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) { return ksys_fallocate(fd, mode, offset, len); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FALLOCATE) COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode, compat_arg_u64_dual(offset), compat_arg_u64_dual(len)) { return ksys_fallocate(fd, mode, compat_arg_u64_glue(offset), compat_arg_u64_glue(len)); } #endif /* * access() needs to use the real uid/gid, not the effective uid/gid. * We do this by temporarily clearing all FS-related capabilities and * switching the fsuid/fsgid around to the real ones. * * Creating new credentials is expensive, so we try to skip doing it, * which we can if the result would match what we already got. */ static bool access_need_override_creds(int flags) { const struct cred *cred; if (flags & AT_EACCESS) return false; cred = current_cred(); if (!uid_eq(cred->fsuid, cred->uid) || !gid_eq(cred->fsgid, cred->gid)) return true; if (!issecure(SECURE_NO_SETUID_FIXUP)) { kuid_t root_uid = make_kuid(cred->user_ns, 0); if (!uid_eq(cred->uid, root_uid)) { if (!cap_isclear(cred->cap_effective)) return true; } else { if (!cap_isidentical(cred->cap_effective, cred->cap_permitted)) return true; } } return false; } static const struct cred *access_override_creds(void) { struct cred *override_cred; override_cred = prepare_creds(); if (!override_cred) return NULL; /* * XXX access_need_override_creds performs checks in hopes of skipping * this work. Make sure it stays in sync if making any changes in this * routine. 
*/ override_cred->fsuid = override_cred->uid; override_cred->fsgid = override_cred->gid; if (!issecure(SECURE_NO_SETUID_FIXUP)) { /* Clear the capabilities if we switch to a non-root user */ kuid_t root_uid = make_kuid(override_cred->user_ns, 0); if (!uid_eq(override_cred->uid, root_uid)) cap_clear(override_cred->cap_effective); else override_cred->cap_effective = override_cred->cap_permitted; } /* * The new set of credentials can *only* be used in * task-synchronous circumstances, and does not need * RCU freeing, unless somebody then takes a separate * reference to it. * * NOTE! This is _only_ true because this credential * is used purely for override_creds() that installs * it as the subjective cred. Other threads will be * accessing ->real_cred, not the subjective cred. * * If somebody _does_ make a copy of this (using the * 'get_current_cred()' function), that will clear the * non_rcu field, because now that other user may be * expecting RCU freeing. But normal thread-synchronous * cred accesses will keep things non-racy to avoid RCU * freeing. */ override_cred->non_rcu = 1; return override_creds(override_cred); } static int do_faccessat(int dfd, const char __user *filename, int mode, int flags) { struct path path; struct inode *inode; int res; unsigned int lookup_flags = LOOKUP_FOLLOW; const struct cred *old_cred = NULL; if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ return -EINVAL; if (flags & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) return -EINVAL; if (flags & AT_SYMLINK_NOFOLLOW) lookup_flags &= ~LOOKUP_FOLLOW; if (access_need_override_creds(flags)) { old_cred = access_override_creds(); if (!old_cred) return -ENOMEM; } CLASS(filename_uflags, name)(filename, flags); retry: res = filename_lookup(dfd, name, lookup_flags, &path, NULL); if (res) goto out; inode = d_backing_inode(path.dentry); if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) { /* * MAY_EXEC on regular files is denied if the fs is mounted * with the "noexec" flag. */ res = -EACCES; if (path_noexec(&path)) goto out_path_release; } res = inode_permission(mnt_idmap(path.mnt), inode, mode | MAY_ACCESS); /* SuS v2 requires we report a read only fs too */ if (res || !(mode & S_IWOTH) || special_file(inode->i_mode)) goto out_path_release; /* * This is a rare case where using __mnt_is_readonly() * is OK without a mnt_want/drop_write() pair. Since * no actual write to the fs is performed here, we do * not need to telegraph to that to anyone. * * By doing this, we accept that this access is * inherently racy and know that the fs may change * state before we even see this result. 
*/ if (__mnt_is_readonly(path.mnt)) res = -EROFS; out_path_release: path_put(&path); if (retry_estale(res, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: if (old_cred) put_cred(revert_creds(old_cred)); return res; } SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) { return do_faccessat(dfd, filename, mode, 0); } SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user *, filename, int, mode, int, flags) { return do_faccessat(dfd, filename, mode, flags); } SYSCALL_DEFINE2(access, const char __user *, filename, int, mode) { return do_faccessat(AT_FDCWD, filename, mode, 0); } SYSCALL_DEFINE1(chdir, const char __user *, filename) { struct path path; int error; unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; CLASS(filename, name)(filename); retry: error = filename_lookup(AT_FDCWD, name, lookup_flags, &path, NULL); if (!error) { error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (!error) set_fs_pwd(current->fs, &path); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } } return error; } SYSCALL_DEFINE1(fchdir, unsigned int, fd) { CLASS(fd_raw, f)(fd); int error; if (fd_empty(f)) return -EBADF; if (!d_can_lookup(fd_file(f)->f_path.dentry)) return -ENOTDIR; error = file_permission(fd_file(f), MAY_EXEC | MAY_CHDIR); if (!error) set_fs_pwd(current->fs, &fd_file(f)->f_path); return error; } SYSCALL_DEFINE1(chroot, const char __user *, filename) { struct path path; int error; unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; CLASS(filename, name)(filename); retry: error = filename_lookup(AT_FDCWD, name, lookup_flags, &path, NULL); if (error) return error; error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; error = -EPERM; if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT)) goto dput_and_out; error = security_path_chroot(&path); if (!error) set_fs_root(current->fs, &path); dput_and_out: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } int chmod_common(const struct path *path, umode_t mode) { struct inode *inode = path->dentry->d_inode; struct delegated_inode delegated_inode = { }; struct iattr newattrs; int error; error = mnt_want_write(path->mnt); if (error) return error; retry_deleg: error = inode_lock_killable(inode); if (error) goto out_mnt_unlock; error = security_path_chmod(path, mode); if (error) goto out_unlock; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; error = notify_change(mnt_idmap(path->mnt), path->dentry, &newattrs, &delegated_inode); out_unlock: inode_unlock(inode); if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } out_mnt_unlock: mnt_drop_write(path->mnt); return error; } int vfs_fchmod(struct file *file, umode_t mode) { audit_file(file); return chmod_common(&file->f_path, mode); } SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) { CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; return vfs_fchmod(fd_file(f), mode); } static int do_fchmodat(int dfd, const char __user *filename, umode_t mode, unsigned int flags) { struct path path; int error; unsigned int lookup_flags; if (unlikely(flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))) return -EINVAL; lookup_flags = (flags & AT_SYMLINK_NOFOLLOW) ? 
0 : LOOKUP_FOLLOW; CLASS(filename_uflags, name)(filename, flags); retry: error = filename_lookup(dfd, name, lookup_flags, &path, NULL); if (!error) { error = chmod_common(&path, mode); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } } return error; } SYSCALL_DEFINE4(fchmodat2, int, dfd, const char __user *, filename, umode_t, mode, unsigned int, flags) { return do_fchmodat(dfd, filename, mode, flags); } SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode) { return do_fchmodat(dfd, filename, mode, 0); } SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) { return do_fchmodat(AT_FDCWD, filename, mode, 0); } /* * Check whether @kuid is valid and if so generate and set vfsuid_t in * ia_vfsuid. * * Return: true if @kuid is valid, false if not. */ static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid) { if (!uid_valid(kuid)) return false; attr->ia_valid |= ATTR_UID; attr->ia_vfsuid = VFSUIDT_INIT(kuid); return true; } /* * Check whether @kgid is valid and if so generate and set vfsgid_t in * ia_vfsgid. * * Return: true if @kgid is valid, false if not. */ static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid) { if (!gid_valid(kgid)) return false; attr->ia_valid |= ATTR_GID; attr->ia_vfsgid = VFSGIDT_INIT(kgid); return true; } int chown_common(const struct path *path, uid_t user, gid_t group) { struct mnt_idmap *idmap; struct user_namespace *fs_userns; struct inode *inode = path->dentry->d_inode; struct delegated_inode delegated_inode = { }; int error; struct iattr newattrs; kuid_t uid; kgid_t gid; uid = make_kuid(current_user_ns(), user); gid = make_kgid(current_user_ns(), group); idmap = mnt_idmap(path->mnt); fs_userns = i_user_ns(inode); retry_deleg: newattrs.ia_vfsuid = INVALID_VFSUID; newattrs.ia_vfsgid = INVALID_VFSGID; newattrs.ia_valid = ATTR_CTIME; if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid)) return -EINVAL; if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid)) return -EINVAL; error = inode_lock_killable(inode); if (error) return error; if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV | setattr_should_drop_sgid(idmap, inode); /* Continue to send actual fs values, not the mount values. */ error = security_path_chown( path, from_vfsuid(idmap, fs_userns, newattrs.ia_vfsuid), from_vfsgid(idmap, fs_userns, newattrs.ia_vfsgid)); if (!error) error = notify_change(idmap, path->dentry, &newattrs, &delegated_inode); inode_unlock(inode); if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } return error; } int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag) { struct path path; int error; int lookup_flags; if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 
0 : LOOKUP_FOLLOW; CLASS(filename_uflags, name)(filename, flag); retry: error = filename_lookup(dfd, name, lookup_flags, &path, NULL); if (!error) { error = mnt_want_write(path.mnt); if (!error) { error = chown_common(&path, user, group); mnt_drop_write(path.mnt); } path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } } return error; } SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, gid_t, group, int, flag) { return do_fchownat(dfd, filename, user, group, flag); } SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group) { return do_fchownat(AT_FDCWD, filename, user, group, 0); } SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group) { return do_fchownat(AT_FDCWD, filename, user, group, AT_SYMLINK_NOFOLLOW); } int vfs_fchown(struct file *file, uid_t user, gid_t group) { int error; error = mnt_want_write_file(file); if (error) return error; audit_file(file); error = chown_common(&file->f_path, user, group); mnt_drop_write_file(file); return error; } int ksys_fchown(unsigned int fd, uid_t user, gid_t group) { CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; return vfs_fchown(fd_file(f), user, group); } SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) { return ksys_fchown(fd, user, group); } static inline int file_get_write_access(struct file *f) { int error; error = get_write_access(f->f_inode); if (unlikely(error)) return error; error = mnt_get_write_access(f->f_path.mnt); if (unlikely(error)) goto cleanup_inode; if (unlikely(f->f_mode & FMODE_BACKING)) { error = mnt_get_write_access(backing_file_user_path(f)->mnt); if (unlikely(error)) goto cleanup_mnt; } return 0; cleanup_mnt: mnt_put_write_access(f->f_path.mnt); cleanup_inode: put_write_access(f->f_inode); return error; } static int do_dentry_open(struct file *f, int (*open)(struct inode *, struct file *)) { static const struct file_operations empty_fops = {}; struct inode *inode = f->f_path.dentry->d_inode; int error; path_get(&f->f_path); f->f_inode = inode; f->f_mapping = inode->i_mapping; f->f_wb_err = filemap_sample_wb_err(f->f_mapping); f->f_sb_err = file_sample_sb_err(f); if (unlikely(f->f_flags & O_PATH)) { f->f_mode = FMODE_PATH | FMODE_OPENED; file_set_fsnotify_mode(f, FMODE_NONOTIFY); f->f_op = &empty_fops; return 0; } if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { i_readcount_inc(inode); } else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { error = file_get_write_access(f); if (unlikely(error)) goto cleanup_file; f->f_mode |= FMODE_WRITER; } /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) f->f_mode |= FMODE_ATOMIC_POS; f->f_op = fops_get(inode->i_fop); if (WARN_ON(!f->f_op)) { error = -ENODEV; goto cleanup_all; } error = security_file_open(f); if (unlikely(error)) goto cleanup_all; /* * Call fsnotify open permission hook and set FMODE_NONOTIFY_* bits * according to existing permission watches. * If FMODE_NONOTIFY mode was already set for an fanotify fd or for a * pseudo file, this call will not change the mode. 
*/ error = fsnotify_open_perm_and_set_mode(f); if (unlikely(error)) goto cleanup_all; error = break_lease(file_inode(f), f->f_flags); if (unlikely(error)) goto cleanup_all; /* normally all 3 are set; ->open() can clear them if needed */ f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; if (!open) open = f->f_op->open; if (open) { error = open(inode, f); if (error) goto cleanup_all; } f->f_mode |= FMODE_OPENED; if ((f->f_mode & FMODE_READ) && likely(f->f_op->read || f->f_op->read_iter)) f->f_mode |= FMODE_CAN_READ; if ((f->f_mode & FMODE_WRITE) && likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; if ((f->f_mode & FMODE_LSEEK) && !f->f_op->llseek) f->f_mode &= ~FMODE_LSEEK; if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO) f->f_mode |= FMODE_CAN_ODIRECT; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); f->f_iocb_flags = iocb_flags(f); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT)) return -EINVAL; /* * XXX: Huge page cache doesn't support writing yet. Drop all page * cache for this file before processing writes. */ if (f->f_mode & FMODE_WRITE) { /* * Depends on full fence from get_write_access() to synchronize * against collapse_file() regarding i_writecount and nr_thps * updates. Ensures subsequent insertion of THPs into the page * cache will fail. */ if (filemap_nr_thps(inode->i_mapping)) { struct address_space *mapping = inode->i_mapping; filemap_invalidate_lock(inode->i_mapping); /* * unmap_mapping_range just need to be called once * here, because the private pages is not need to be * unmapped mapping (e.g. data segment of dynamic * shared libraries here). */ unmap_mapping_range(mapping, 0, 0, 0); truncate_inode_pages(mapping, 0); filemap_invalidate_unlock(inode->i_mapping); } } return 0; cleanup_all: if (WARN_ON_ONCE(error > 0)) error = -EINVAL; fops_put(f->f_op); put_file_access(f); cleanup_file: path_put(&f->f_path); f->__f_path.mnt = NULL; f->__f_path.dentry = NULL; f->f_inode = NULL; return error; } /** * finish_open - finish opening a file * @file: file pointer * @dentry: pointer to dentry * @open: open callback * * This can be used to finish opening a file passed to i_op->atomic_open(). * * If the open callback is set to NULL, then the standard f_op->open() * filesystem callback is substituted. * * NB: the dentry reference is _not_ consumed. If, for example, the dentry is * the return value of d_splice_alias(), then the caller needs to perform dput() * on it after finish_open(). * * Returns zero on success or -errno if the open failed. */ int finish_open(struct file *file, struct dentry *dentry, int (*open)(struct inode *, struct file *)) { BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */ file->__f_path.dentry = dentry; return do_dentry_open(file, open); } EXPORT_SYMBOL(finish_open); /** * finish_no_open - finish ->atomic_open() without opening the file * * @file: file pointer * @dentry: dentry, ERR_PTR(-E...) or NULL (as returned from ->lookup()) * * This can be used to set the result of a lookup in ->atomic_open(). * * NB: unlike finish_open() this function does consume the dentry reference and * the caller need not dput() it. * * Returns 0 or -E..., which must be the return value of ->atomic_open() after * having called this function. 
*/ int finish_no_open(struct file *file, struct dentry *dentry) { if (IS_ERR(dentry)) return PTR_ERR(dentry); file->__f_path.dentry = dentry; return 0; } EXPORT_SYMBOL(finish_no_open); char *file_path(struct file *filp, char *buf, int buflen) { return d_path(&filp->f_path, buf, buflen); } EXPORT_SYMBOL(file_path); /** * vfs_open - open the file at the given path * @path: path to open * @file: newly allocated file with f_flag initialized */ int vfs_open(const struct path *path, struct file *file) { int ret; file->__f_path = *path; ret = do_dentry_open(file, NULL); if (!ret) { /* * Once we return a file with FMODE_OPENED, __fput() will call * fsnotify_close(), so we need fsnotify_open() here for * symmetry. */ fsnotify_open(file); } return ret; } struct file *dentry_open(const struct path *path, int flags, const struct cred *cred) { int error; struct file *f; /* We must always pass in a valid mount pointer. */ BUG_ON(!path->mnt); f = alloc_empty_file(flags, cred); if (!IS_ERR(f)) { error = vfs_open(path, f); if (error) { fput(f); f = ERR_PTR(error); } } return f; } EXPORT_SYMBOL(dentry_open); struct file *dentry_open_nonotify(const struct path *path, int flags, const struct cred *cred) { struct file *f = alloc_empty_file(flags, cred); if (!IS_ERR(f)) { int error; file_set_fsnotify_mode(f, FMODE_NONOTIFY); error = vfs_open(path, f); if (error) { fput(f); f = ERR_PTR(error); } } return f; } /** * kernel_file_open - open a file for kernel internal use * @path: path of the file to open * @flags: open flags * @cred: credentials for open * * Open a file for use by in-kernel consumers. The file is not accounted * against nr_files and must not be installed into the file descriptor * table. * * Return: Opened file on success, an error pointer on failure. */ struct file *kernel_file_open(const struct path *path, int flags, const struct cred *cred) { struct file *f; int error; f = alloc_empty_file_noaccount(flags, cred); if (IS_ERR(f)) return f; error = vfs_open(path, f); if (error) { fput(f); return ERR_PTR(error); } return f; } EXPORT_SYMBOL_GPL(kernel_file_open); #define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) #define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) inline struct open_how build_open_how(int flags, umode_t mode) { struct open_how how = { .flags = flags & VALID_OPEN_FLAGS, .mode = mode & S_IALLUGO, }; /* O_PATH beats everything else. */ if (how.flags & O_PATH) how.flags &= O_PATH_FLAGS; /* Modes should only be set for create-like flags. */ if (!WILL_CREATE(how.flags)) how.mode = 0; return how; } inline int build_open_flags(const struct open_how *how, struct open_flags *op) { u64 flags = how->flags; u64 strip = O_CLOEXEC; int lookup_flags = 0; int acc_mode = ACC_MODE(flags); BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS), "struct open_flags doesn't yet handle flags > 32 bits"); /* * Strip flags that aren't relevant in determining struct open_flags. */ flags &= ~strip; /* * Older syscalls implicitly clear all of the invalid flags or argument * values before calling build_open_flags(), but openat2(2) checks all * of its arguments. */ if (flags & ~VALID_OPEN_FLAGS) return -EINVAL; if (how->resolve & ~VALID_RESOLVE_FLAGS) return -EINVAL; /* Scoping flags are mutually exclusive. */ if ((how->resolve & RESOLVE_BENEATH) && (how->resolve & RESOLVE_IN_ROOT)) return -EINVAL; /* Deal with the mode. 
*/ if (WILL_CREATE(flags)) { if (how->mode & ~S_IALLUGO) return -EINVAL; op->mode = how->mode | S_IFREG; } else { if (how->mode != 0) return -EINVAL; op->mode = 0; } /* * Block bugs where O_DIRECTORY | O_CREAT created regular files. * Note, that blocking O_DIRECTORY | O_CREAT here also protects * O_TMPFILE below which requires O_DIRECTORY being raised. */ if ((flags & (O_DIRECTORY | O_CREAT)) == (O_DIRECTORY | O_CREAT)) return -EINVAL; /* Now handle the creative implementation of O_TMPFILE. */ if (flags & __O_TMPFILE) { /* * In order to ensure programs get explicit errors when trying * to use O_TMPFILE on old kernels we enforce that O_DIRECTORY * is raised alongside __O_TMPFILE. */ if (!(flags & O_DIRECTORY)) return -EINVAL; if (!(acc_mode & MAY_WRITE)) return -EINVAL; } if (flags & O_PATH) { /* O_PATH only permits certain other flags to be set. */ if (flags & ~O_PATH_FLAGS) return -EINVAL; acc_mode = 0; } /* * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only * check for O_DSYNC if the need any syncing at all we enforce it's * always set instead of having to deal with possibly weird behaviour * for malicious applications setting only __O_SYNC. */ if (flags & __O_SYNC) flags |= O_DSYNC; op->open_flag = flags; /* O_TRUNC implies we need access checks for write permissions */ if (flags & O_TRUNC) acc_mode |= MAY_WRITE; /* Allow the LSM permission hook to distinguish append access from general write access. */ if (flags & O_APPEND) acc_mode |= MAY_APPEND; op->acc_mode = acc_mode; op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN; if (flags & O_CREAT) { op->intent |= LOOKUP_CREATE; if (flags & O_EXCL) { op->intent |= LOOKUP_EXCL; flags |= O_NOFOLLOW; } } if (flags & O_DIRECTORY) lookup_flags |= LOOKUP_DIRECTORY; if (!(flags & O_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; if (how->resolve & RESOLVE_NO_XDEV) lookup_flags |= LOOKUP_NO_XDEV; if (how->resolve & RESOLVE_NO_MAGICLINKS) lookup_flags |= LOOKUP_NO_MAGICLINKS; if (how->resolve & RESOLVE_NO_SYMLINKS) lookup_flags |= LOOKUP_NO_SYMLINKS; if (how->resolve & RESOLVE_BENEATH) lookup_flags |= LOOKUP_BENEATH; if (how->resolve & RESOLVE_IN_ROOT) lookup_flags |= LOOKUP_IN_ROOT; if (how->resolve & RESOLVE_CACHED) { /* Don't bother even trying for create/truncate/tmpfile open */ if (flags & (O_TRUNC | O_CREAT | __O_TMPFILE)) return -EAGAIN; lookup_flags |= LOOKUP_CACHED; } op->lookup_flags = lookup_flags; return 0; } /** * file_open_name - open file and return file pointer * * @name: struct filename containing path to open * @flags: open flags as per the open(2) second argument * @mode: mode for the new file if O_CREAT is set, else ignored * * This is the helper to open a file from kernelspace if you really * have to. But in generally you should not do this, so please move * along, nothing to see here.. */ struct file *file_open_name(struct filename *name, int flags, umode_t mode) { struct open_flags op; struct open_how how = build_open_how(flags, mode); int err = build_open_flags(&how, &op); if (err) return ERR_PTR(err); return do_file_open(AT_FDCWD, name, &op); } /** * filp_open - open file and return file pointer * * @filename: path to open * @flags: open flags as per the open(2) second argument * @mode: mode for the new file if O_CREAT is set, else ignored * * This is the helper to open a file from kernelspace if you really * have to. But in generally you should not do this, so please move * along, nothing to see here.. 
*/ struct file *filp_open(const char *filename, int flags, umode_t mode) { CLASS(filename_kernel, name)(filename); return file_open_name(name, flags, mode); } EXPORT_SYMBOL(filp_open); struct file *file_open_root(const struct path *root, const char *filename, int flags, umode_t mode) { struct open_flags op; struct open_how how = build_open_how(flags, mode); int err = build_open_flags(&how, &op); if (err) return ERR_PTR(err); return do_file_open_root(root, filename, &op); } EXPORT_SYMBOL(file_open_root); static int do_sys_openat2(int dfd, const char __user *filename, struct open_how *how) { struct open_flags op; int err = build_open_flags(how, &op); if (unlikely(err)) return err; CLASS(filename, name)(filename); return FD_ADD(how->flags, do_file_open(dfd, name, &op)); } int do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) { struct open_how how = build_open_how(flags, mode); return do_sys_openat2(dfd, filename, &how); } SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { if (force_o_largefile()) flags |= O_LARGEFILE; return do_sys_open(AT_FDCWD, filename, flags, mode); } SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) { if (force_o_largefile()) flags |= O_LARGEFILE; return do_sys_open(dfd, filename, flags, mode); } SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename, struct open_how __user *, how, size_t, usize) { int err; struct open_how tmp; BUILD_BUG_ON(sizeof(struct open_how) < OPEN_HOW_SIZE_VER0); BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_LATEST); if (unlikely(usize < OPEN_HOW_SIZE_VER0)) return -EINVAL; if (unlikely(usize > PAGE_SIZE)) return -E2BIG; err = copy_struct_from_user(&tmp, sizeof(tmp), how, usize); if (err) return err; audit_openat2_how(&tmp); /* O_LARGEFILE is only allowed for non-O_PATH. */ if (!(tmp.flags & O_PATH) && force_o_largefile()) tmp.flags |= O_LARGEFILE; return do_sys_openat2(dfd, filename, &tmp); } #ifdef CONFIG_COMPAT /* * Exactly like sys_open(), except that it doesn't set the * O_LARGEFILE flag. */ COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { return do_sys_open(AT_FDCWD, filename, flags, mode); } /* * Exactly like sys_openat(), except that it doesn't set the * O_LARGEFILE flag. */ COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) { return do_sys_open(dfd, filename, flags, mode); } #endif #ifndef __alpha__ /* * For backward compatibility? Maybe this should be moved * into arch/i386 instead? */ SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode) { int flags = O_CREAT | O_WRONLY | O_TRUNC; if (force_o_largefile()) flags |= O_LARGEFILE; return do_sys_open(AT_FDCWD, pathname, flags, mode); } #endif /* * "id" is the POSIX thread ID. We use the * files pointer for this.. */ static int filp_flush(struct file *filp, fl_owner_t id) { int retval = 0; if (CHECK_DATA_CORRUPTION(file_count(filp) == 0, filp, "VFS: Close: file count is 0 (f_op=%ps)", filp->f_op)) { return 0; } if (filp->f_op->flush) retval = filp->f_op->flush(filp, id); if (likely(!(filp->f_mode & FMODE_PATH))) { dnotify_flush(filp, id); locks_remove_posix(filp, id); } return retval; } int filp_close(struct file *filp, fl_owner_t id) { int retval; retval = filp_flush(filp, id); fput_close(filp); return retval; } EXPORT_SYMBOL(filp_close); /* * Careful here! We test whether the file pointer is NULL before * releasing the fd. 
This ensures that one clone task can't release * an fd while another clone is opening it. */ SYSCALL_DEFINE1(close, unsigned int, fd) { int retval; struct file *file; file = file_close_fd(fd); if (!file) return -EBADF; retval = filp_flush(file, current->files); /* * We're returning to user space. Don't bother * with any delayed fput() cases. */ fput_close_sync(file); if (likely(retval == 0)) return 0; /* can't restart close syscall because file table entry was cleared */ if (retval == -ERESTARTSYS || retval == -ERESTARTNOINTR || retval == -ERESTARTNOHAND || retval == -ERESTART_RESTARTBLOCK) retval = -EINTR; return retval; } /* * This routine simulates a hangup on the tty, to arrange that users * are given clean terminals at login time. */ SYSCALL_DEFINE0(vhangup) { if (capable(CAP_SYS_TTY_CONFIG)) { tty_vhangup_self(); return 0; } return -EPERM; } /* * Called when an inode is about to be open. * We use this to disallow opening large files on 32bit systems if * the caller didn't specify O_LARGEFILE. On 64bit systems we force * on this flag in sys_open. */ int generic_file_open(struct inode * inode, struct file * filp) { if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) return -EOVERFLOW; return 0; } EXPORT_SYMBOL(generic_file_open); /* * This is used by subsystems that don't want seekable * file descriptors. The function is not supposed to ever fail, the only * reason it returns an 'int' and not 'void' is so that it can be plugged * directly into file_operations structure. */ int nonseekable_open(struct inode *inode, struct file *filp) { filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); return 0; } EXPORT_SYMBOL(nonseekable_open); /* * stream_open is used by subsystems that want stream-like file descriptors. * Such file descriptors are not seekable and don't have notion of position * (file.f_pos is always 0 and ppos passed to .read()/.write() is always NULL). * Contrary to file descriptors of other regular files, .read() and .write() * can run simultaneously. * * stream_open never fails and is marked to return int so that it could be * directly used as file_operations.open . */ int stream_open(struct inode *inode, struct file *filp) { filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS); filp->f_mode |= FMODE_STREAM; return 0; } EXPORT_SYMBOL(stream_open); |
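/*
 * Illustrative userspace sketch, not part of fs/open.c above: one way the
 * openat2(2) path implemented by do_sys_openat2() and build_open_flags()
 * might be exercised. RESOLVE_BENEATH maps to LOOKUP_BENEATH during lookup.
 * Assumes <linux/openat2.h> and a SYS_openat2 definition are available; the
 * dirfd/name arguments and the helper name are hypothetical.
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/openat2.h>

int open_beneath(int dirfd, const char *name)
{
	struct open_how how;

	memset(&how, 0, sizeof(how));	/* unused fields (e.g. mode) must be zero */
	how.flags = O_RDONLY | O_CLOEXEC;
	how.resolve = RESOLVE_BENEATH;	/* refuse absolute paths/symlinks and ".." escapes from dirfd */

	/* openat2() may lack a libc wrapper, so invoke the syscall directly. */
	return syscall(SYS_openat2, dirfd, name, &how, sizeof(how));
}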
/* SPDX-License-Identifier: GPL-2.0-only */ /* * Landlock - Credential hooks * * Copyright © 2019-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2019-2020 ANSSI * Copyright © 2021-2025 Microsoft Corporation */ #ifndef _SECURITY_LANDLOCK_CRED_H #define _SECURITY_LANDLOCK_CRED_H #include <linux/container_of.h> #include <linux/cred.h> #include <linux/init.h> #include <linux/rcupdate.h> #include "access.h" #include "limits.h" #include "ruleset.h" #include "setup.h" /** * struct landlock_cred_security - Credential security blob * * This structure is packed to minimize the size of struct * landlock_file_security. However, it is always aligned in the LSM cred blob, * see lsm_set_blob_size(). * * When updating this, also update landlock_cred_copy() if needed. */ struct landlock_cred_security { /** * @domain: Immutable ruleset enforced on a task. */ struct landlock_ruleset *domain; #ifdef CONFIG_AUDIT /** * @domain_exec: Bitmask identifying the domain layers that were enforced by * the current task's executed file (i.e. no new execve(2) since * landlock_restrict_self(2)). */ u16 domain_exec; /** * @log_subdomains_off: Set if the domain descendants's log_status should be * set to %LANDLOCK_LOG_DISABLED. This is not a landlock_hierarchy * configuration because it applies to future descendant domains and it does * not require a current domain. */ u8 log_subdomains_off : 1; #endif /* CONFIG_AUDIT */ } __packed; #ifdef CONFIG_AUDIT /* Makes sure all layer executions can be stored. */ static_assert(BITS_PER_TYPE(typeof_member(struct landlock_cred_security, domain_exec)) >= LANDLOCK_MAX_NUM_LAYERS); #endif /* CONFIG_AUDIT */ static inline struct landlock_cred_security * landlock_cred(const struct cred *cred) { return cred->security + landlock_blob_sizes.lbs_cred; } static inline void landlock_cred_copy(struct landlock_cred_security *dst, const struct landlock_cred_security *src) { landlock_put_ruleset(dst->domain); *dst = *src; landlock_get_ruleset(src->domain); } static inline struct landlock_ruleset *landlock_get_current_domain(void) { return landlock_cred(current_cred())->domain; } /* * The call needs to come from an RCU read-side critical section. */ static inline const struct landlock_ruleset * landlock_get_task_domain(const struct task_struct *const task) { return landlock_cred(__task_cred(task))->domain; } static inline bool landlocked(const struct task_struct *const task) { bool has_dom; if (task == current) return !!landlock_get_current_domain(); rcu_read_lock(); has_dom = !!landlock_get_task_domain(task); rcu_read_unlock(); return has_dom; } /** * landlock_get_applicable_subject - Return the subject's Landlock credential * if its enforced domain applies to (i.e. * handles) at least one of the access rights * specified in @masks * * @cred: credential * @masks: access masks * @handle_layer: returned youngest layer handling a subset of @masks. Not set * if the function returns NULL.
* * Return: landlock_cred(@cred) if any access rights specified in @masks is * handled, or NULL otherwise. */ static inline const struct landlock_cred_security * landlock_get_applicable_subject(const struct cred *const cred, const struct access_masks masks, size_t *const handle_layer) { const union access_masks_all masks_all = { .masks = masks, }; const struct landlock_ruleset *domain; ssize_t layer_level; if (!cred) return NULL; domain = landlock_cred(cred)->domain; if (!domain) return NULL; for (layer_level = domain->num_layers - 1; layer_level >= 0; layer_level--) { union access_masks_all layer = { .masks = domain->access_masks[layer_level], }; if (layer.all & masks_all.all) { if (handle_layer) *handle_layer = layer_level; return landlock_cred(cred); } } return NULL; } __init void landlock_add_cred_hooks(void); #endif /* _SECURITY_LANDLOCK_CRED_H */ |
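/*
 * Illustrative userspace sketch, not part of cred.h above: the @domain
 * pointer in struct landlock_cred_security only becomes non-NULL once a task
 * self-restricts via landlock_restrict_self(2), roughly as below. Error
 * handling is omitted; the "/usr" path and the helper name are hypothetical.
 */
#include <fcntl.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/landlock.h>

static int restrict_to_read_only_usr(void)
{
	struct landlock_ruleset_attr ruleset_attr = {
		.handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE |
				     LANDLOCK_ACCESS_FS_READ_DIR |
				     LANDLOCK_ACCESS_FS_WRITE_FILE,
	};
	struct landlock_path_beneath_attr path_beneath = {
		.allowed_access = LANDLOCK_ACCESS_FS_READ_FILE |
				  LANDLOCK_ACCESS_FS_READ_DIR,
	};
	int ruleset_fd;

	ruleset_fd = syscall(__NR_landlock_create_ruleset, &ruleset_attr,
			     sizeof(ruleset_attr), 0);

	path_beneath.parent_fd = open("/usr", O_PATH | O_CLOEXEC);
	syscall(__NR_landlock_add_rule, ruleset_fd, LANDLOCK_RULE_PATH_BENEATH,
		&path_beneath, 0);
	close(path_beneath.parent_fd);

	/* Required before restricting, unless the task has CAP_SYS_ADMIN. */
	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	return syscall(__NR_landlock_restrict_self, ruleset_fd, 0);
}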
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HIGHMEM_INTERNAL_H #define _LINUX_HIGHMEM_INTERNAL_H /* * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft. */ #ifdef CONFIG_KMAP_LOCAL void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); void *__kmap_local_page_prot(const struct page *page, pgprot_t prot); void kunmap_local_indexed(const void *vaddr); void kmap_local_fork(struct task_struct *tsk); void __kmap_local_sched_out(void); void __kmap_local_sched_in(void); static inline void kmap_assert_nomap(void) { DEBUG_LOCKS_WARN_ON(current->kmap_ctrl.idx); } #else static inline void kmap_local_fork(struct task_struct *tsk) { } static inline void kmap_assert_nomap(void) { } #endif #ifdef CONFIG_HIGHMEM #include <asm/highmem.h> #ifndef ARCH_HAS_KMAP_FLUSH_TLB static inline void kmap_flush_tlb(unsigned long addr) { } #endif #ifndef kmap_prot #define kmap_prot PAGE_KERNEL #endif void *kmap_high(struct page *page); void kunmap_high(const struct page *page); void __kmap_flush_unused(void); struct page *__kmap_to_page(void *addr); static inline void *kmap(struct page *page) { void *addr; might_sleep(); if (!PageHighMem(page)) addr = page_address(page); else addr = kmap_high(page); kmap_flush_tlb((unsigned long)addr); return addr; } static inline void kunmap(const struct page *page) { might_sleep(); if (!PageHighMem(page)) return; kunmap_high(page); } static inline struct page *kmap_to_page(void *addr) { return __kmap_to_page(addr); } static inline void kmap_flush_unused(void) { __kmap_flush_unused(); } static inline void *kmap_local_page(const struct page *page) { return __kmap_local_page_prot(page, kmap_prot); } static inline void *kmap_local_page_try_from_panic(const struct page *page) { if (!PageHighMem(page)) return page_address(page); /* If the page is in HighMem, it's not safe to kmap it.*/ return NULL; } static inline void *kmap_local_folio(const struct folio *folio, size_t offset) { const struct page *page = folio_page(folio, offset / PAGE_SIZE); return __kmap_local_page_prot(page, kmap_prot) + offset % PAGE_SIZE; } static inline void *kmap_local_page_prot(const struct page *page, pgprot_t prot) { return __kmap_local_page_prot(page, prot); } static inline void *kmap_local_pfn(unsigned long pfn) { return __kmap_local_pfn_prot(pfn, kmap_prot); } static inline void __kunmap_local(const void *vaddr) { kunmap_local_indexed(vaddr); } static inline void *kmap_atomic_prot(const struct page
*page, pgprot_t prot) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); else preempt_disable(); pagefault_disable(); return __kmap_local_page_prot(page, prot); } static inline void *kmap_atomic(const struct page *page) { return kmap_atomic_prot(page, kmap_prot); } static inline void *kmap_atomic_pfn(unsigned long pfn) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); else preempt_disable(); pagefault_disable(); return __kmap_local_pfn_prot(pfn, kmap_prot); } static inline void __kunmap_atomic(const void *addr) { kunmap_local_indexed(addr); pagefault_enable(); if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_enable(); else preempt_enable(); } unsigned long __nr_free_highpages(void); unsigned long __totalhigh_pages(void); static inline unsigned long nr_free_highpages(void) { return __nr_free_highpages(); } static inline unsigned long totalhigh_pages(void) { return __totalhigh_pages(); } static inline bool is_kmap_addr(const void *x) { unsigned long addr = (unsigned long)x; return (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) || (addr >= __fix_to_virt(FIX_KMAP_END) && addr < __fix_to_virt(FIX_KMAP_BEGIN)); } #else /* CONFIG_HIGHMEM */ static inline struct page *kmap_to_page(void *addr) { return virt_to_page(addr); } static inline void *kmap(struct page *page) { might_sleep(); return page_address(page); } static inline void kunmap_high(const struct page *page) { } static inline void kmap_flush_unused(void) { } static inline void kunmap(const struct page *page) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(page_address(page)); #endif } static inline void *kmap_local_page(const struct page *page) { return page_address(page); } static inline void *kmap_local_page_try_from_panic(const struct page *page) { return page_address(page); } static inline void *kmap_local_folio(const struct folio *folio, size_t offset) { return folio_address(folio) + offset; } static inline void *kmap_local_page_prot(const struct page *page, pgprot_t prot) { return kmap_local_page(page); } static inline void *kmap_local_pfn(unsigned long pfn) { return kmap_local_page(pfn_to_page(pfn)); } static inline void __kunmap_local(const void *addr) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE)); #endif } static inline void *kmap_atomic(const struct page *page) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); else preempt_disable(); pagefault_disable(); return page_address(page); } static inline void *kmap_atomic_prot(const struct page *page, pgprot_t prot) { return kmap_atomic(page); } static inline void *kmap_atomic_pfn(unsigned long pfn) { return kmap_atomic(pfn_to_page(pfn)); } static inline void __kunmap_atomic(const void *addr) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE)); #endif pagefault_enable(); if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_enable(); else preempt_enable(); } static inline unsigned long nr_free_highpages(void) { return 0; } static inline unsigned long totalhigh_pages(void) { return 0; } static inline bool is_kmap_addr(const void *x) { return false; } #endif /* CONFIG_HIGHMEM */ /** * kunmap_atomic - Unmap the virtual address mapped by kmap_atomic() - deprecated! * @__addr: Virtual address to be unmapped * * Unmaps an address previously mapped by kmap_atomic() and re-enables * pagefaults. Depending on PREEMP_RT configuration, re-enables also * migration and preemption. Users should not count on these side effects. * * Mappings should be unmapped in the reverse order that they were mapped. 
* See kmap_local_page() for details on nesting. * * @__addr can be any address within the mapped page, so there is no need * to subtract any offset that has been added. In contrast to kunmap(), * this function takes the address returned from kmap_atomic(), not the * page passed to it. The compiler will warn you if you pass the page. */ #define kunmap_atomic(__addr) \ do { \ BUILD_BUG_ON(__same_type((__addr), struct page *)); \ __kunmap_atomic(__addr); \ } while (0) /** * kunmap_local - Unmap a page mapped via kmap_local_page(). * @__addr: An address within the page mapped * * @__addr can be any address within the mapped page. Commonly it is the * address returned from kmap_local_page(), but it can also include offsets. * * Unmapping should be done in the reverse order of the mapping. See * kmap_local_page() for details. */ #define kunmap_local(__addr) \ do { \ BUILD_BUG_ON(__same_type((__addr), struct page *)); \ __kunmap_local(__addr); \ } while (0) #endif
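/*
 * Illustrative sketch, not part of the header above: a typical caller of the
 * kmap_local_page()/kunmap_local() pair documented there. The helper below is
 * hypothetical; <linux/highmem.h> already provides similar wrappers such as
 * memcpy_from_page().
 */
#include <linux/highmem.h>
#include <linux/string.h>

static void copy_bytes_from_page(void *dst, struct page *page,
				 size_t offset, size_t len)
{
	/* The mapping is CPU-local and only valid until kunmap_local(). */
	char *vaddr = kmap_local_page(page);

	memcpy(dst, vaddr + offset, len);

	/* Unmap in the reverse order of mapping, as required above. */
	kunmap_local(vaddr);
}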
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_GENERIC_HUGETLB_H #define _ASM_GENERIC_HUGETLB_H #include <linux/swap.h> #include <linux/swapops.h> static inline unsigned long huge_pte_write(pte_t pte) { return pte_write(pte); } static inline unsigned long huge_pte_dirty(pte_t pte) { return pte_dirty(pte); } static inline pte_t huge_pte_mkwrite(pte_t pte) { return pte_mkwrite_novma(pte); } #ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT static inline pte_t huge_pte_wrprotect(pte_t pte) { return pte_wrprotect(pte); } #endif static inline pte_t huge_pte_mkdirty(pte_t pte) { return pte_mkdirty(pte); } static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) { return pte_modify(pte, newprot); } #ifndef __HAVE_ARCH_HUGE_PTE_MKUFFD_WP static inline pte_t huge_pte_mkuffd_wp(pte_t pte) { return huge_pte_wrprotect(pte_mkuffd_wp(pte)); } #endif #ifndef __HAVE_ARCH_HUGE_PTE_CLEAR_UFFD_WP static inline pte_t huge_pte_clear_uffd_wp(pte_t pte) { return pte_clear_uffd_wp(pte); } #endif #ifndef __HAVE_ARCH_HUGE_PTE_UFFD_WP static inline int huge_pte_uffd_wp(pte_t pte) { return pte_uffd_wp(pte); } #endif #ifndef __HAVE_ARCH_HUGE_PTE_CLEAR static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) { pte_clear(mm, addr, ptep); } #endif #ifndef __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned long sz) { set_pte_at(mm, addr, ptep, pte); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) { return ptep_get_and_clear(mm, addr, ptep); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return ptep_clear_flush(vma, addr, ptep); } #endif #ifndef __HAVE_ARCH_HUGE_PTE_NONE static inline int huge_pte_none(pte_t pte) { return pte_none(pte); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { ptep_set_wrprotect(mm, addr, ptep); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) { return ptep_set_access_flags(vma, addr, ptep, pte, dirty); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_GET static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { return ptep_get(ptep); } #endif #ifndef __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED static inline bool gigantic_page_runtime_supported(void) { return IS_ENABLED(CONFIG_ARCH_HAS_GIGANTIC_PAGE); } #endif /* __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED */ #endif /* _ASM_GENERIC_HUGETLB_H */
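/*
 * Illustrative sketch, not part of <asm-generic/hugetlb.h> above: the
 * override pattern that its #ifndef __HAVE_ARCH_* guards enable. An
 * architecture header defines the matching macro and supplies its own helper,
 * then pulls in the generic header for everything it did not override. The
 * arch name and the trivial body below are hypothetical.
 */

/* In a hypothetical arch/xyz/include/asm/hugetlb.h: */
#define __HAVE_ARCH_HUGE_PTEP_GET
static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr,
				  pte_t *ptep)
{
	/* A real arch might reassemble a contiguous-PTE entry here. */
	return ptep_get(ptep);
}

#include <asm-generic/hugetlb.h>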
/* SPDX-License-Identifier: GPL-2.0+ */ /* * Driver for 8250/16550-type serial ports * * Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o. * * Copyright (C) 2001 Russell King.
*/ #include <linux/bits.h> #include <linux/serial_8250.h> #include <linux/serial_core.h> #include <linux/dmaengine.h> #include "../serial_mctrl_gpio.h" struct uart_8250_dma { int (*tx_dma)(struct uart_8250_port *p); int (*rx_dma)(struct uart_8250_port *p); void (*prepare_tx_dma)(struct uart_8250_port *p); void (*prepare_rx_dma)(struct uart_8250_port *p); /* Filter function */ dma_filter_fn fn; /* Parameter to the filter function */ void *rx_param; void *tx_param; struct dma_slave_config rxconf; struct dma_slave_config txconf; struct dma_chan *rxchan; struct dma_chan *txchan; /* Device address base for DMA operations */ phys_addr_t rx_dma_addr; phys_addr_t tx_dma_addr; /* DMA address of the buffer in memory */ dma_addr_t rx_addr; dma_addr_t tx_addr; dma_cookie_t rx_cookie; dma_cookie_t tx_cookie; void *rx_buf; size_t rx_size; size_t tx_size; unsigned char tx_running; unsigned char tx_err; unsigned char rx_running; }; struct old_serial_port { unsigned int uart; unsigned int baud_base; unsigned int port; unsigned int irq; upf_t flags; unsigned char io_type; unsigned char __iomem *iomem_base; unsigned short iomem_reg_shift; }; struct serial8250_config { const char *name; unsigned short fifo_size; unsigned short tx_loadsz; unsigned char fcr; unsigned char rxtrig_bytes[UART_FCR_R_TRIG_MAX_STATE]; unsigned int flags; }; #define UART_CAP_FIFO BIT(8) /* UART has FIFO */ #define UART_CAP_EFR BIT(9) /* UART has EFR */ #define UART_CAP_SLEEP BIT(10) /* UART has IER sleep */ #define UART_CAP_AFE BIT(11) /* MCR-based hw flow control */ #define UART_CAP_UUE BIT(12) /* UART needs IER bit 6 set (Xscale) */ #define UART_CAP_RTOIE BIT(13) /* UART needs IER bit 4 set (Xscale, Tegra) */ #define UART_CAP_HFIFO BIT(14) /* UART has a "hidden" FIFO */ #define UART_CAP_RPM BIT(15) /* Runtime PM is active while idle */ #define UART_CAP_IRDA BIT(16) /* UART supports IrDA line discipline */ #define UART_CAP_MINI BIT(17) /* Mini UART on BCM283X family lacks: * STOP PARITY EPAR SPAR WLEN5 WLEN6 */ #define UART_CAP_NOTEMT BIT(18) /* UART without interrupt on TEMT available */ #define UART_BUG_QUOT BIT(0) /* UART has buggy quot LSB */ #define UART_BUG_TXEN BIT(1) /* UART has buggy TX IIR status */ #define UART_BUG_NOMSR BIT(2) /* UART has buggy MSR status bits (Au1x00) */ #define UART_BUG_THRE BIT(3) /* UART has buggy THRE reassertion */ #define UART_BUG_TXRACE BIT(5) /* UART Tx fails to set remote DR */ /* Module parameters */ #define UART_NR CONFIG_SERIAL_8250_NR_UARTS extern unsigned int nr_uarts; #define SERIAL8250_PORT_FLAGS(_base, _irq, _flags) \ { \ .iobase = _base, \ .irq = _irq, \ .uartclk = 1843200, \ .iotype = UPIO_PORT, \ .flags = UPF_BOOT_AUTOCONF | (_flags), \ } #define SERIAL8250_PORT(_base, _irq) SERIAL8250_PORT_FLAGS(_base, _irq, 0) extern struct uart_driver serial8250_reg; void serial8250_register_ports(struct uart_driver *drv, struct device *dev); /* Legacy ISA bus related APIs */ typedef void (*serial8250_isa_config_fn)(int, struct uart_port *, u32 *); extern serial8250_isa_config_fn serial8250_isa_config; void serial8250_isa_init_ports(void); extern struct platform_device *serial8250_isa_devs; extern const struct uart_ops *univ8250_port_base_ops; extern struct uart_ops univ8250_port_ops; static inline int serial_in(struct uart_8250_port *up, int offset) { return up->port.serial_in(&up->port, offset); } static inline void serial_out(struct uart_8250_port *up, int offset, int value) { up->port.serial_out(&up->port, offset, value); } /** * serial_lsr_in - Read LSR register and preserve flags across reads 
* @up: uart 8250 port * * Read LSR register and handle saving non-preserved flags across reads. * The flags that are not preserved across reads are stored into * up->lsr_saved_flags. * * Returns LSR value or'ed with the preserved flags (if any). */ static inline u16 serial_lsr_in(struct uart_8250_port *up) { u16 lsr = up->lsr_saved_flags; lsr |= serial_in(up, UART_LSR); up->lsr_saved_flags = lsr & up->lsr_save_mask; return lsr; } /* * For the 16C950 */ static void serial_icr_write(struct uart_8250_port *up, int offset, int value) { serial_out(up, UART_SCR, offset); serial_out(up, UART_ICR, value); } static unsigned int __maybe_unused serial_icr_read(struct uart_8250_port *up, int offset) { unsigned int value; serial_icr_write(up, UART_ACR, up->acr | UART_ACR_ICRRD); serial_out(up, UART_SCR, offset); value = serial_in(up, UART_ICR); serial_icr_write(up, UART_ACR, up->acr); return value; } void serial8250_clear_fifos(struct uart_8250_port *p); void serial8250_clear_and_reinit_fifos(struct uart_8250_port *p); void serial8250_fifo_wait_for_lsr_thre(struct uart_8250_port *up, unsigned int count); void serial8250_rpm_get(struct uart_8250_port *p); void serial8250_rpm_put(struct uart_8250_port *p); DEFINE_GUARD(serial8250_rpm, struct uart_8250_port *, serial8250_rpm_get(_T), serial8250_rpm_put(_T)); static inline u32 serial_dl_read(struct uart_8250_port *up) { return up->dl_read(up); } static inline void serial_dl_write(struct uart_8250_port *up, u32 value) { up->dl_write(up, value); } static inline bool serial8250_set_THRI(struct uart_8250_port *up) { /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&up->port.lock); if (up->ier & UART_IER_THRI) return false; up->ier |= UART_IER_THRI; serial_out(up, UART_IER, up->ier); return true; } static inline bool serial8250_clear_THRI(struct uart_8250_port *up) { /* Port locked to synchronize UART_IER access against the console. 
*/ lockdep_assert_held_once(&up->port.lock); if (!(up->ier & UART_IER_THRI)) return false; up->ier &= ~UART_IER_THRI; serial_out(up, UART_IER, up->ier); return true; } struct uart_8250_port *serial8250_setup_port(int index); struct uart_8250_port *serial8250_get_port(int line); int serial8250_em485_config(struct uart_port *port, struct ktermios *termios, struct serial_rs485 *rs485); void serial8250_em485_start_tx(struct uart_8250_port *p, bool toggle_ier); void serial8250_em485_stop_tx(struct uart_8250_port *p, bool toggle_ier); void serial8250_em485_destroy(struct uart_8250_port *p); extern struct serial_rs485 serial8250_em485_supported; /* MCR <-> TIOCM conversion */ static inline int serial8250_TIOCM_to_MCR(int tiocm) { int mcr = 0; if (tiocm & TIOCM_RTS) mcr |= UART_MCR_RTS; if (tiocm & TIOCM_DTR) mcr |= UART_MCR_DTR; if (tiocm & TIOCM_OUT1) mcr |= UART_MCR_OUT1; if (tiocm & TIOCM_OUT2) mcr |= UART_MCR_OUT2; if (tiocm & TIOCM_LOOP) mcr |= UART_MCR_LOOP; return mcr; } static inline int serial8250_MCR_to_TIOCM(int mcr) { int tiocm = 0; if (mcr & UART_MCR_RTS) tiocm |= TIOCM_RTS; if (mcr & UART_MCR_DTR) tiocm |= TIOCM_DTR; if (mcr & UART_MCR_OUT1) tiocm |= TIOCM_OUT1; if (mcr & UART_MCR_OUT2) tiocm |= TIOCM_OUT2; if (mcr & UART_MCR_LOOP) tiocm |= TIOCM_LOOP; return tiocm; } /* MSR <-> TIOCM conversion */ static inline int serial8250_MSR_to_TIOCM(int msr) { int tiocm = 0; if (msr & UART_MSR_DCD) tiocm |= TIOCM_CAR; if (msr & UART_MSR_RI) tiocm |= TIOCM_RNG; if (msr & UART_MSR_DSR) tiocm |= TIOCM_DSR; if (msr & UART_MSR_CTS) tiocm |= TIOCM_CTS; return tiocm; } static inline void serial8250_out_MCR(struct uart_8250_port *up, int value) { serial_out(up, UART_MCR, value); if (up->gpios) mctrl_gpio_set(up->gpios, serial8250_MCR_to_TIOCM(value)); } static inline int serial8250_in_MCR(struct uart_8250_port *up) { int mctrl; mctrl = serial_in(up, UART_MCR); if (up->gpios) { unsigned int mctrl_gpio = 0; mctrl_gpio = mctrl_gpio_get_outputs(up->gpios, &mctrl_gpio); mctrl |= serial8250_TIOCM_to_MCR(mctrl_gpio); } return mctrl; } #ifdef CONFIG_SERIAL_8250_PNP int serial8250_pnp_init(void); void serial8250_pnp_exit(void); #else static inline int serial8250_pnp_init(void) { return 0; } static inline void serial8250_pnp_exit(void) { } #endif #ifdef CONFIG_SERIAL_8250_RSA void univ8250_rsa_support(struct uart_ops *ops, const struct uart_ops *core_ops); void rsa_enable(struct uart_8250_port *up); void rsa_disable(struct uart_8250_port *up); void rsa_autoconfig(struct uart_8250_port *up); void rsa_reset(struct uart_8250_port *up); #else static inline void univ8250_rsa_support(struct uart_ops *ops, const struct uart_ops *core_ops) { } static inline void rsa_enable(struct uart_8250_port *up) {} static inline void rsa_disable(struct uart_8250_port *up) {} static inline void rsa_autoconfig(struct uart_8250_port *up) {} static inline void rsa_reset(struct uart_8250_port *up) {} #endif #ifdef CONFIG_SERIAL_8250_FINTEK int fintek_8250_probe(struct uart_8250_port *uart); #else static inline int fintek_8250_probe(struct uart_8250_port *uart) { return 0; } #endif #ifdef CONFIG_ARCH_OMAP1 #include <linux/soc/ti/omap1-soc.h> static inline int is_omap1_8250(struct uart_8250_port *pt) { int res; switch (pt->port.mapbase) { case OMAP1_UART1_BASE: case OMAP1_UART2_BASE: case OMAP1_UART3_BASE: res = 1; break; default: res = 0; break; } return res; } static inline int is_omap1510_8250(struct uart_8250_port *pt) { if (!cpu_is_omap1510()) return 0; return is_omap1_8250(pt); } #else static inline int is_omap1_8250(struct 
uart_8250_port *pt) { return 0; } static inline int is_omap1510_8250(struct uart_8250_port *pt) { return 0; } #endif #ifdef CONFIG_SERIAL_8250_DMA extern int serial8250_tx_dma(struct uart_8250_port *); extern void serial8250_tx_dma_flush(struct uart_8250_port *); extern int serial8250_rx_dma(struct uart_8250_port *); extern void serial8250_rx_dma_flush(struct uart_8250_port *); extern int serial8250_request_dma(struct uart_8250_port *); extern void serial8250_release_dma(struct uart_8250_port *); static inline void serial8250_do_prepare_tx_dma(struct uart_8250_port *p) { struct uart_8250_dma *dma = p->dma; if (dma->prepare_tx_dma) dma->prepare_tx_dma(p); } static inline void serial8250_do_prepare_rx_dma(struct uart_8250_port *p) { struct uart_8250_dma *dma = p->dma; if (dma->prepare_rx_dma) dma->prepare_rx_dma(p); } static inline bool serial8250_tx_dma_running(struct uart_8250_port *p) { struct uart_8250_dma *dma = p->dma; return dma && dma->tx_running; } static inline void serial8250_tx_dma_pause(struct uart_8250_port *p) { struct uart_8250_dma *dma = p->dma; if (!dma->tx_running) return; dmaengine_pause(dma->txchan); } static inline void serial8250_tx_dma_resume(struct uart_8250_port *p) { struct uart_8250_dma *dma = p->dma; if (!dma->tx_running) return; dmaengine_resume(dma->txchan); } #else static inline int serial8250_tx_dma(struct uart_8250_port *p) { return -1; } static inline void serial8250_tx_dma_flush(struct uart_8250_port *p) { } static inline int serial8250_rx_dma(struct uart_8250_port *p) { return -1; } static inline void serial8250_rx_dma_flush(struct uart_8250_port *p) { } static inline int serial8250_request_dma(struct uart_8250_port *p) { return -1; } static inline void serial8250_release_dma(struct uart_8250_port *p) { } static inline bool serial8250_tx_dma_running(struct uart_8250_port *p) { return false; } static inline void serial8250_tx_dma_pause(struct uart_8250_port *p) { } static inline void serial8250_tx_dma_resume(struct uart_8250_port *p) { } #endif static inline int ns16550a_goto_highspeed(struct uart_8250_port *up) { unsigned char status; status = serial_in(up, 0x04); /* EXCR2 */ #define PRESL(x) ((x) & 0x30) if (PRESL(status) == 0x10) { /* already in high speed mode */ return 0; } else { status &= ~0xB0; /* Disable LOCK, mask out PRESL[01] */ status |= 0x10; /* 1.625 divisor for baud_base --> 921600 */ serial_out(up, 0x04, status); } return 1; } static inline int serial_index(struct uart_port *port) { return port->minor - 64; } |
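/*
 * Illustrative sketch, not part of 8250.h above: how the MSR/MCR <-> TIOCM
 * conversion helpers declared there are typically combined in a port driver's
 * modem-control callbacks. The function names are hypothetical; the in-tree
 * equivalents live in 8250_port.c.
 */
static unsigned int example_get_mctrl(struct uart_8250_port *up)
{
	unsigned int msr = serial_in(up, UART_MSR);

	/* Translate modem-status bits into the TIOCM_* values the core expects. */
	return serial8250_MSR_to_TIOCM(msr);
}

static void example_set_mctrl(struct uart_8250_port *up, unsigned int mctrl)
{
	/* serial8250_out_MCR() also mirrors the state to any mctrl GPIOs. */
	serial8250_out_MCR(up, serial8250_TIOCM_to_MCR(mctrl));
}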
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		RAW - implementation of IP "raw" sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *
 * Fixes:
 *	Alan Cox	:	verify_area() fixed up
 *	Alan Cox	:	ICMP error handling
 *	Alan Cox	:	EMSGSIZE if you send too big a packet
 *	Alan Cox	:	Now uses generic datagrams and shared
 *				skbuff library. No more peek crashes,
 *				no more backlogs
 *	Alan Cox	:	Checks sk->broadcast.
 *	Alan Cox	:	Uses skb_free_datagram/skb_copy_datagram
 *	Alan Cox	:	Raw passes ip options too
 *	Alan Cox	:	Setsocketopt added
 *	Alan Cox	:	Fixed error return for broadcasts
 *	Alan Cox	:	Removed wake_up calls
 *	Alan Cox	:	Use ttl/tos
 *	Alan Cox	:	Cleaned up old debugging
 *	Alan Cox	:	Use new kernel side addresses
 *	Arnt Gulbrandsen	:	Fixed MSG_DONTROUTE in raw sockets.
 *	Alan Cox	:	BSD style RAW socket demultiplexing.
 *	Alan Cox	:	Beginnings of mrouted support.
 *	Alan Cox	:	Added IP_HDRINCL option.
 *	Alan Cox	:	Skip broadcast check if BSDism set.
 *	David S. Miller	:	New socket lookup architecture.
*/ #include <linux/types.h> #include <linux/atomic.h> #include <asm/byteorder.h> #include <asm/current.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <linux/stddef.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/spinlock.h> #include <linux/sockios.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/mroute.h> #include <linux/netdevice.h> #include <linux/in_route.h> #include <linux/route.h> #include <linux/skbuff.h> #include <linux/igmp.h> #include <net/net_namespace.h> #include <net/dst.h> #include <net/sock.h> #include <linux/ip.h> #include <linux/net.h> #include <net/ip.h> #include <net/icmp.h> #include <net/udp.h> #include <net/raw.h> #include <net/snmp.h> #include <net/tcp_states.h> #include <net/inet_common.h> #include <net/checksum.h> #include <net/xfrm.h> #include <linux/rtnetlink.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/compat.h> #include <linux/uio.h> struct raw_frag_vec { struct msghdr *msg; union { struct icmphdr icmph; char c[1]; } hdr; int hlen; }; struct raw_hashinfo raw_v4_hashinfo; EXPORT_SYMBOL_GPL(raw_v4_hashinfo); int raw_hash_sk(struct sock *sk) { struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; struct hlist_head *hlist; hlist = &h->ht[raw_hashfunc(sock_net(sk), inet_sk(sk)->inet_num)]; spin_lock(&h->lock); sk_add_node_rcu(sk, hlist); sock_set_flag(sk, SOCK_RCU_FREE); spin_unlock(&h->lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); return 0; } EXPORT_SYMBOL_GPL(raw_hash_sk); void raw_unhash_sk(struct sock *sk) { struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; spin_lock(&h->lock); if (sk_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock(&h->lock); } EXPORT_SYMBOL_GPL(raw_unhash_sk); bool raw_v4_match(struct net *net, const struct sock *sk, unsigned short num, __be32 raddr, __be32 laddr, int dif, int sdif) { const struct inet_sock *inet = inet_sk(sk); if (net_eq(sock_net(sk), net) && inet->inet_num == num && !(inet->inet_daddr && inet->inet_daddr != raddr) && !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return true; return false; } EXPORT_SYMBOL_GPL(raw_v4_match); /* * 0 - deliver * 1 - block */ static int icmp_filter(const struct sock *sk, const struct sk_buff *skb) { struct icmphdr _hdr; const struct icmphdr *hdr; hdr = skb_header_pointer(skb, skb_transport_offset(skb), sizeof(_hdr), &_hdr); if (!hdr) return 1; if (hdr->type < 32) { __u32 data = raw_sk(sk)->filter.data; return ((1U << hdr->type) & data) != 0; } /* Do not block unknown ICMP types */ return 0; } /* IP input processing comes here for RAW socket delivery. * Caller owns SKB, so we must make clones. * * RFC 1122: SHOULD pass TOS value up to the transport layer. * -> It does. And not only TOS, but all IP header. 
*/ static int raw_v4_input(struct net *net, struct sk_buff *skb, const struct iphdr *iph, int hash) { int sdif = inet_sdif(skb); struct hlist_head *hlist; int dif = inet_iif(skb); int delivered = 0; struct sock *sk; hlist = &raw_v4_hashinfo.ht[hash]; rcu_read_lock(); sk_for_each_rcu(sk, hlist) { if (!raw_v4_match(net, sk, iph->protocol, iph->saddr, iph->daddr, dif, sdif)) continue; if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { sk_drops_inc(sk); continue; } delivered = 1; if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) && ip_mc_sf_allow(sk, iph->daddr, iph->saddr, skb->dev->ifindex, sdif)) { struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); /* Not releasing hash table! */ if (clone) raw_rcv(sk, clone); } } rcu_read_unlock(); return delivered; } int raw_local_deliver(struct sk_buff *skb, int protocol) { struct net *net = dev_net(skb->dev); return raw_v4_input(net, skb, ip_hdr(skb), raw_hashfunc(net, protocol)); } static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) { struct inet_sock *inet = inet_sk(sk); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; int harderr = 0; bool recverr; int err = 0; if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) ipv4_sk_update_pmtu(skb, sk, info); else if (type == ICMP_REDIRECT) { ipv4_sk_redirect(skb, sk); return; } /* Report error on raw socket, if: 1. User requested ip_recverr. 2. Socket is connected (otherwise the error indication is useless without ip_recverr and error is hard. */ recverr = inet_test_bit(RECVERR, sk); if (!recverr && sk->sk_state != TCP_ESTABLISHED) return; switch (type) { default: case ICMP_TIME_EXCEEDED: err = EHOSTUNREACH; break; case ICMP_SOURCE_QUENCH: return; case ICMP_PARAMETERPROB: err = EPROTO; harderr = 1; break; case ICMP_DEST_UNREACH: err = EHOSTUNREACH; if (code > NR_ICMP_UNREACH) break; if (code == ICMP_FRAG_NEEDED) { harderr = READ_ONCE(inet->pmtudisc) != IP_PMTUDISC_DONT; err = EMSGSIZE; } else { err = icmp_err_convert[code].errno; harderr = icmp_err_convert[code].fatal; } } if (recverr) { const struct iphdr *iph = (const struct iphdr *)skb->data; u8 *payload = skb->data + (iph->ihl << 2); if (inet_test_bit(HDRINCL, sk)) payload = skb->data; ip_icmp_error(sk, skb, err, 0, info, payload); } if (recverr || harderr) { sk->sk_err = err; sk_error_report(sk); } } void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) { struct net *net = dev_net(skb->dev); int dif = skb->dev->ifindex; int sdif = inet_sdif(skb); struct hlist_head *hlist; const struct iphdr *iph; struct sock *sk; int hash; hash = raw_hashfunc(net, protocol); hlist = &raw_v4_hashinfo.ht[hash]; rcu_read_lock(); sk_for_each_rcu(sk, hlist) { iph = (const struct iphdr *)skb->data; if (!raw_v4_match(net, sk, iph->protocol, iph->daddr, iph->saddr, dif, sdif)) continue; raw_err(sk, skb, info); } rcu_read_unlock(); } static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason reason; /* Charge it to the socket. 
*/ ipv4_pktinfo_prepare(sk, skb, true); reason = sock_queue_rcv_skb_reason(sk, skb); if (reason) { sk_skb_reason_drop(sk, skb, reason); return NET_RX_DROP; } return NET_RX_SUCCESS; } int raw_rcv(struct sock *sk, struct sk_buff *skb) { if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } nf_reset_ct(skb); skb_push(skb, -skb_network_offset(skb)); raw_rcv_skb(sk, skb); return 0; } static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, struct msghdr *msg, size_t length, struct rtable **rtp, unsigned int flags, const struct sockcm_cookie *sockc) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct iphdr *iph; struct sk_buff *skb; unsigned int iphlen; int err; struct rtable *rt = *rtp; int hlen, tlen; if (length > rt->dst.dev->mtu) { ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, rt->dst.dev->mtu); return -EMSGSIZE; } if (length < sizeof(struct iphdr)) return -EINVAL; if (flags&MSG_PROBE) goto out; hlen = LL_RESERVED_SPACE(rt->dst.dev); tlen = rt->dst.dev->needed_tailroom; skb = sock_alloc_send_skb(sk, length + hlen + tlen + 15, flags & MSG_DONTWAIT, &err); if (!skb) goto error; skb_reserve(skb, hlen); skb->protocol = htons(ETH_P_IP); skb->priority = sockc->priority; skb->mark = sockc->mark; skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid); skb_dst_set(skb, &rt->dst); *rtp = NULL; skb_reset_network_header(skb); iph = ip_hdr(skb); skb_put(skb, length); skb->ip_summed = CHECKSUM_NONE; skb_setup_tx_timestamp(skb, sockc); if (flags & MSG_CONFIRM) skb_set_dst_pending_confirm(skb, 1); skb->transport_header = skb->network_header; err = -EFAULT; if (memcpy_from_msg(iph, msg, length)) goto error_free; iphlen = iph->ihl * 4; /* * We don't want to modify the ip header, but we do need to * be sure that it won't cause problems later along the network * stack. Specifically we want to make sure that iph->ihl is a * sane value. If ihl points beyond the length of the buffer passed * in, reject the frame as invalid */ err = -EINVAL; if (iphlen > length) goto error_free; if (iphlen >= sizeof(*iph)) { if (!iph->saddr) iph->saddr = fl4->saddr; iph->check = 0; iph->tot_len = htons(length); if (!iph->id) ip_select_ident(net, skb, NULL); iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); skb->transport_header += iphlen; if (iph->protocol == IPPROTO_ICMP && length >= iphlen + sizeof(struct icmphdr)) icmp_out_count(net, ((struct icmphdr *) skb_transport_header(skb))->type); } err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, rt->dst.dev, dst_output); if (err > 0) err = net_xmit_errno(err); if (err) goto error; out: return 0; error_free: kfree_skb(skb); error: IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk)) err = 0; return err; } static int raw_probe_proto_opt(struct raw_frag_vec *rfv, struct flowi4 *fl4) { int err; if (fl4->flowi4_proto != IPPROTO_ICMP) return 0; /* We only need the first two bytes. 
*/ rfv->hlen = 2; err = memcpy_from_msg(rfv->hdr.c, rfv->msg, rfv->hlen); if (err) return err; fl4->fl4_icmp_type = rfv->hdr.icmph.type; fl4->fl4_icmp_code = rfv->hdr.icmph.code; return 0; } static int raw_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { struct raw_frag_vec *rfv = from; if (offset < rfv->hlen) { int copy = min(rfv->hlen - offset, len); if (skb->ip_summed == CHECKSUM_PARTIAL) memcpy(to, rfv->hdr.c + offset, copy); else skb->csum = csum_block_add( skb->csum, csum_partial_copy_nocheck(rfv->hdr.c + offset, to, copy), odd); odd = 0; offset += copy; to += copy; len -= copy; if (!len) return 0; } offset -= rfv->hlen; return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb); } static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { DEFINE_RAW_FLEX(struct ip_options_rcu, opt_copy, opt.__data, IP_OPTIONS_DATA_FIXED_SIZE); struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct ipcm_cookie ipc; struct rtable *rt = NULL; struct flowi4 fl4; u8 scope; int free = 0; __be32 daddr; __be32 saddr; int uc_index, err; struct raw_frag_vec rfv; int hdrincl; err = -EMSGSIZE; if (len > 0xFFFF) goto out; hdrincl = inet_test_bit(HDRINCL, sk); /* * Check the flags. */ err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message */ goto out; /* compatibility */ /* * Get and verify the address. */ if (msg->msg_namelen) { DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); err = -EINVAL; if (msg->msg_namelen < sizeof(*usin)) goto out; if (usin->sin_family != AF_INET) { pr_info_once("%s: %s forgot to set AF_INET. Fix it!\n", __func__, current->comm); err = -EAFNOSUPPORT; if (usin->sin_family) goto out; } daddr = usin->sin_addr.s_addr; /* ANK: I did not forget to get protocol from port field. * I just do not know, who uses this weirdness. * IP_HDRINCL is much more convenient. */ } else { err = -EDESTADDRREQ; if (sk->sk_state != TCP_ESTABLISHED) goto out; daddr = inet->inet_daddr; } ipcm_init_sk(&ipc, inet); /* Keep backward compat */ if (hdrincl) ipc.protocol = IPPROTO_RAW; if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); if (unlikely(err)) { kfree(ipc.opt); goto out; } if (ipc.opt) free = 1; } saddr = ipc.addr; ipc.addr = daddr; if (!ipc.opt) { struct ip_options_rcu *inet_opt; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { memcpy(opt_copy, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); ipc.opt = opt_copy; } rcu_read_unlock(); } if (ipc.opt) { err = -EINVAL; /* Linux does not mangle headers on raw sockets, * so that IP options + IP_HDRINCL is non-sense. */ if (hdrincl) goto done; if (ipc.opt->opt.srr) { if (!daddr) goto done; daddr = ipc.opt->opt.faddr; } } scope = ip_sendmsg_scope(inet, &ipc, msg); uc_index = READ_ONCE(inet->uc_index); if (ipv4_is_multicast(daddr)) { if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = READ_ONCE(inet->mc_index); if (!saddr) saddr = READ_ONCE(inet->mc_addr); } else if (!ipc.oif) { ipc.oif = uc_index; } else if (ipv4_is_lbcast(daddr) && uc_index) { /* oif is set, packet is to local broadcast * and uc_index is set. oif is most likely set * by sk_bound_dev_if. If uc_index != oif check if the * oif is an L3 master and uc_index is an L3 slave. * If so, we want to allow the send using the uc_index. 
*/ if (ipc.oif != uc_index && ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk), uc_index)) { ipc.oif = uc_index; } } flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, ipc.tos & INET_DSCP_MASK, scope, hdrincl ? ipc.protocol : sk->sk_protocol, inet_sk_flowi_flags(sk) | (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0, sk_uid(sk)); fl4.fl4_icmp_type = 0; fl4.fl4_icmp_code = 0; if (!hdrincl) { rfv.msg = msg; rfv.hlen = 0; err = raw_probe_proto_opt(&rfv, &fl4); if (err) goto done; } security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; goto done; } err = -EACCES; if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST)) goto done; if (msg->msg_flags & MSG_CONFIRM) goto do_confirm; back_from_confirm: if (hdrincl) err = raw_send_hdrinc(sk, &fl4, msg, len, &rt, msg->msg_flags, &ipc.sockc); else { if (!ipc.addr) ipc.addr = fl4.daddr; lock_sock(sk); err = ip_append_data(sk, &fl4, raw_getfrag, &rfv, len, 0, &ipc, &rt, msg->msg_flags); if (err) ip_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) { err = ip_push_pending_frames(sk, &fl4); if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk)) err = 0; } release_sock(sk); } done: if (free) kfree(ipc.opt); ip_rt_put(rt); out: if (err < 0) return err; return len; do_confirm: if (msg->msg_flags & MSG_PROBE) dst_confirm_neigh(&rt->dst, &fl4.daddr); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; goto done; } static void raw_close(struct sock *sk, long timeout) { /* * Raw sockets may have direct kernel references. Kill them. */ ip_ra_control(sk, 0, NULL); sk_common_release(sk); } static void raw_destroy(struct sock *sk) { lock_sock(sk); ip_flush_pending_frames(sk); release_sock(sk); } /* This gets rid of all the nasties in af_inet. -DaveM */ static int raw_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; struct net *net = sock_net(sk); u32 tb_id = RT_TABLE_LOCAL; int ret = -EINVAL; int chk_addr_ret; lock_sock(sk); if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) goto out; if (sk->sk_bound_dev_if) tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); ret = -EADDRNOTAVAIL; if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr, chk_addr_ret)) goto out; inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) inet->inet_saddr = 0; /* Use device */ sk_dst_reset(sk); ret = 0; out: release_sock(sk); return ret; } /* * This should be easy, if there is something there * we return it, otherwise we block. */ static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { struct inet_sock *inet = inet_sk(sk); size_t copied = 0; int err = -EOPNOTSUPP; DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct sk_buff *skb; if (flags & MSG_OOB) goto out; if (flags & MSG_ERRQUEUE) { err = ip_recv_error(sk, msg, len); goto out; } skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; sock_recv_cmsgs(msg, sk, skb); /* Copy the address. 
*/ if (sin) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; sin->sin_port = 0; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); msg->msg_namelen = sizeof(*sin); } if (inet_cmsg_flags(inet)) ip_cmsg_recv(msg, skb); if (flags & MSG_TRUNC) copied = skb->len; done: skb_free_datagram(sk, skb); out: if (err) return err; return copied; } static int raw_sk_init(struct sock *sk) { struct raw_sock *rp = raw_sk(sk); sk->sk_drop_counters = &rp->drop_counters; if (inet_sk(sk)->inet_num == IPPROTO_ICMP) memset(&rp->filter, 0, sizeof(rp->filter)); return 0; } static int raw_seticmpfilter(struct sock *sk, sockptr_t optval, int optlen) { if (optlen > sizeof(struct icmp_filter)) optlen = sizeof(struct icmp_filter); if (copy_from_sockptr(&raw_sk(sk)->filter, optval, optlen)) return -EFAULT; return 0; } static int raw_geticmpfilter(struct sock *sk, char __user *optval, int __user *optlen) { int len, ret = -EFAULT; if (get_user(len, optlen)) goto out; ret = -EINVAL; if (len < 0) goto out; if (len > sizeof(struct icmp_filter)) len = sizeof(struct icmp_filter); ret = -EFAULT; if (put_user(len, optlen) || copy_to_user(optval, &raw_sk(sk)->filter, len)) goto out; ret = 0; out: return ret; } static int do_raw_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { if (optname == ICMP_FILTER) { if (inet_sk(sk)->inet_num != IPPROTO_ICMP) return -EOPNOTSUPP; else return raw_seticmpfilter(sk, optval, optlen); } return -ENOPROTOOPT; } static int raw_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { if (level != SOL_RAW) return ip_setsockopt(sk, level, optname, optval, optlen); return do_raw_setsockopt(sk, optname, optval, optlen); } static int do_raw_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen) { if (optname == ICMP_FILTER) { if (inet_sk(sk)->inet_num != IPPROTO_ICMP) return -EOPNOTSUPP; else return raw_geticmpfilter(sk, optval, optlen); } return -ENOPROTOOPT; } static int raw_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { if (level != SOL_RAW) return ip_getsockopt(sk, level, optname, optval, optlen); return do_raw_getsockopt(sk, optname, optval, optlen); } static int raw_ioctl(struct sock *sk, int cmd, int *karg) { switch (cmd) { case SIOCOUTQ: { *karg = sk_wmem_alloc_get(sk); return 0; } case SIOCINQ: { struct sk_buff *skb; spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) *karg = skb->len; else *karg = 0; spin_unlock_bh(&sk->sk_receive_queue.lock); return 0; } default: #ifdef CONFIG_IP_MROUTE return ipmr_ioctl(sk, cmd, karg); #else return -ENOIOCTLCMD; #endif } } #ifdef CONFIG_COMPAT static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg) { switch (cmd) { case SIOCOUTQ: case SIOCINQ: return -ENOIOCTLCMD; default: #ifdef CONFIG_IP_MROUTE return ipmr_compat_ioctl(sk, cmd, compat_ptr(arg)); #else return -ENOIOCTLCMD; #endif } } #endif int raw_abort(struct sock *sk, int err) { lock_sock(sk); sk->sk_err = err; sk_error_report(sk); __udp_disconnect(sk, 0); release_sock(sk); return 0; } EXPORT_SYMBOL_GPL(raw_abort); struct proto raw_prot = { .name = "RAW", .owner = THIS_MODULE, .close = raw_close, .destroy = raw_destroy, .connect = ip4_datagram_connect, .disconnect = __udp_disconnect, .ioctl = raw_ioctl, .init = raw_sk_init, .setsockopt = raw_setsockopt, .getsockopt = raw_getsockopt, .sendmsg = raw_sendmsg, .recvmsg = raw_recvmsg, .bind = raw_bind, .backlog_rcv = raw_rcv_skb, 
.release_cb = ip4_datagram_release_cb, .hash = raw_hash_sk, .unhash = raw_unhash_sk, .obj_size = sizeof(struct raw_sock), .useroffset = offsetof(struct raw_sock, filter), .usersize = sizeof_field(struct raw_sock, filter), .h.raw_hash = &raw_v4_hashinfo, #ifdef CONFIG_COMPAT .compat_ioctl = compat_raw_ioctl, #endif .diag_destroy = raw_abort, }; #ifdef CONFIG_PROC_FS static struct sock *raw_get_first(struct seq_file *seq, int bucket) { struct raw_hashinfo *h = pde_data(file_inode(seq->file)); struct raw_iter_state *state = raw_seq_private(seq); struct hlist_head *hlist; struct sock *sk; for (state->bucket = bucket; state->bucket < RAW_HTABLE_SIZE; ++state->bucket) { hlist = &h->ht[state->bucket]; sk_for_each(sk, hlist) { if (sock_net(sk) == seq_file_net(seq)) return sk; } } return NULL; } static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk) { struct raw_iter_state *state = raw_seq_private(seq); do { sk = sk_next(sk); } while (sk && sock_net(sk) != seq_file_net(seq)); if (!sk) return raw_get_first(seq, state->bucket + 1); return sk; } static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos) { struct sock *sk = raw_get_first(seq, 0); if (sk) while (pos && (sk = raw_get_next(seq, sk)) != NULL) --pos; return pos ? NULL : sk; } void *raw_seq_start(struct seq_file *seq, loff_t *pos) __acquires(&h->lock) { struct raw_hashinfo *h = pde_data(file_inode(seq->file)); spin_lock(&h->lock); return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } EXPORT_SYMBOL_GPL(raw_seq_start); void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock *sk; if (v == SEQ_START_TOKEN) sk = raw_get_first(seq, 0); else sk = raw_get_next(seq, v); ++*pos; return sk; } EXPORT_SYMBOL_GPL(raw_seq_next); void raw_seq_stop(struct seq_file *seq, void *v) __releases(&h->lock) { struct raw_hashinfo *h = pde_data(file_inode(seq->file)); spin_unlock(&h->lock); } EXPORT_SYMBOL_GPL(raw_seq_stop); static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) { struct inet_sock *inet = inet_sk(sp); __be32 dest = inet->inet_daddr, src = inet->inet_rcv_saddr; __u16 destp = 0, srcp = inet->inet_num; seq_printf(seq, "%4d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5u %8d %llu %d %pK %u\n", i, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), 0, 0L, 0, from_kuid_munged(seq_user_ns(seq), sk_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, sk_drops_read(sp)); } static int raw_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_printf(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode ref pointer drops\n"); else raw_sock_seq_show(seq, v, raw_seq_private(seq)->bucket); return 0; } static const struct seq_operations raw_seq_ops = { .start = raw_seq_start, .next = raw_seq_next, .stop = raw_seq_stop, .show = raw_seq_show, }; static __net_init int raw_init_net(struct net *net) { if (!proc_create_net_data("raw", 0444, net->proc_net, &raw_seq_ops, sizeof(struct raw_iter_state), &raw_v4_hashinfo)) return -ENOMEM; return 0; } static __net_exit void raw_exit_net(struct net *net) { remove_proc_entry("raw", net->proc_net); } static __net_initdata struct pernet_operations raw_net_ops = { .init = raw_init_net, .exit = raw_exit_net, }; int __init raw_proc_init(void) { return register_pernet_subsys(&raw_net_ops); } void __init raw_proc_exit(void) { unregister_pernet_subsys(&raw_net_ops); } #endif /* CONFIG_PROC_FS */ static void raw_sysctl_init_net(struct net *net) { 
#ifdef CONFIG_NET_L3_MASTER_DEV
	net->ipv4.sysctl_raw_l3mdev_accept = 1;
#endif
}

static int __net_init raw_sysctl_init(struct net *net)
{
	raw_sysctl_init_net(net);
	return 0;
}

static struct pernet_operations __net_initdata raw_sysctl_ops = {
	.init	= raw_sysctl_init,
};

void __init raw_init(void)
{
	raw_sysctl_init_net(&init_net);

	if (register_pernet_subsys(&raw_sysctl_ops))
		panic("RAW: failed to init sysctl parameters.\n");
}
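/*
 * Illustrative user-space sketch (not part of this file): exercising the
 * ICMP_FILTER path handled above by do_raw_setsockopt()/raw_seticmpfilter().
 * Needs CAP_NET_RAW. Filter bit semantics follow icmp_filter() above: a set
 * bit for an ICMP type (< 32) means "block that type".
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/icmp.h>

int main(void)
{
	struct icmp_filter filt;
	int fd;

	fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
	if (fd < 0) {
		perror("socket(AF_INET, SOCK_RAW, IPPROTO_ICMP)");
		return 1;
	}

	/* Block every type except echo replies (type 0). */
	filt.data = ~(1U << ICMP_ECHOREPLY);
	if (setsockopt(fd, SOL_RAW, ICMP_FILTER, &filt, sizeof(filt)) < 0) {
		perror("setsockopt(SOL_RAW, ICMP_FILTER)");
		close(fd);
		return 1;
	}

	printf("ICMP raw socket ready, filter installed\n");
	close(fd);
	return 0;
}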
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  SR-IPv6 implementation
 *
 *  Author:
 *  David Lebrun <david.lebrun@uclouvain.be>
 */

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/in6.h>
#include <linux/slab.h>
#include <linux/rhashtable.h>

#include <net/ipv6.h>
#include <net/protocol.h>

#include <net/seg6.h>
#include <net/genetlink.h>
#include <linux/seg6.h>
#include <linux/seg6_genl.h>
#include <net/seg6_hmac.h>

bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len, bool reduced)
{
	unsigned int tlv_offset;
	int max_last_entry;
	int trailing;

	if (srh->type != IPV6_SRCRT_TYPE_4)
		return false;

	if (((srh->hdrlen + 1) << 3) != len)
		return false;

	if (!reduced && srh->segments_left > srh->first_segment) {
		return false;
	} else {
		max_last_entry = (srh->hdrlen / 2) - 1;

		if (srh->first_segment > max_last_entry)
			return false;

		if (srh->segments_left > srh->first_segment + 1)
			return false;
	}

	tlv_offset = sizeof(*srh) + ((srh->first_segment + 1) << 4);

	trailing = len - tlv_offset;
	if (trailing < 0)
		return false;

	while (trailing) {
		struct sr6_tlv *tlv;
		unsigned int tlv_len;

		if (trailing < sizeof(*tlv))
			return false;

		tlv = (struct sr6_tlv *)((unsigned char *)srh + tlv_offset);
		tlv_len = sizeof(*tlv) + tlv->len;

		trailing -= tlv_len;
		if (trailing < 0)
			return false;

		tlv_offset += tlv_len;
	}

	return true;
}

struct ipv6_sr_hdr *seg6_get_srh(struct sk_buff *skb, int flags)
{
	struct ipv6_sr_hdr *srh;
	int
len, srhoff = 0; if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, &flags) < 0) return NULL; if (!pskb_may_pull(skb, srhoff + sizeof(*srh))) return NULL; srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); len = (srh->hdrlen + 1) << 3; if (!pskb_may_pull(skb, srhoff + len)) return NULL; /* note that pskb_may_pull may change pointers in header; * for this reason it is necessary to reload them when needed. */ srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); if (!seg6_validate_srh(srh, len, true)) return NULL; return srh; } /* Determine if an ICMP invoking packet contains a segment routing * header. If it does, extract the offset to the true destination * address, which is in the first segment address. */ void seg6_icmp_srh(struct sk_buff *skb, struct inet6_skb_parm *opt) { __u16 network_header = skb->network_header; struct ipv6_sr_hdr *srh; /* Update network header to point to the invoking packet * inside the ICMP packet, so we can use the seg6_get_srh() * helper. */ skb_reset_network_header(skb); srh = seg6_get_srh(skb, 0); if (!srh) goto out; if (srh->type != IPV6_SRCRT_TYPE_4) goto out; opt->flags |= IP6SKB_SEG6; opt->srhoff = (unsigned char *)srh - skb->data; out: /* Restore the network header back to the ICMP packet */ skb->network_header = network_header; } static struct genl_family seg6_genl_family; static const struct nla_policy seg6_genl_policy[SEG6_ATTR_MAX + 1] = { [SEG6_ATTR_DST] = { .type = NLA_BINARY, .len = sizeof(struct in6_addr) }, [SEG6_ATTR_DSTLEN] = { .type = NLA_S32, }, [SEG6_ATTR_HMACKEYID] = { .type = NLA_U32, }, [SEG6_ATTR_SECRET] = { .type = NLA_BINARY, }, [SEG6_ATTR_SECRETLEN] = { .type = NLA_U8, }, [SEG6_ATTR_ALGID] = { .type = NLA_U8, }, [SEG6_ATTR_HMACINFO] = { .type = NLA_NESTED, }, }; #ifdef CONFIG_IPV6_SEG6_HMAC static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct seg6_pernet_data *sdata; struct seg6_hmac_info *hinfo; u32 hmackeyid; char *secret; int err = 0; u8 algid; u8 slen; sdata = seg6_pernet(net); if (!info->attrs[SEG6_ATTR_HMACKEYID] || !info->attrs[SEG6_ATTR_SECRETLEN] || !info->attrs[SEG6_ATTR_ALGID]) return -EINVAL; hmackeyid = nla_get_u32(info->attrs[SEG6_ATTR_HMACKEYID]); slen = nla_get_u8(info->attrs[SEG6_ATTR_SECRETLEN]); algid = nla_get_u8(info->attrs[SEG6_ATTR_ALGID]); if (hmackeyid == 0) return -EINVAL; if (slen > SEG6_HMAC_SECRET_LEN) return -EINVAL; mutex_lock(&sdata->lock); hinfo = seg6_hmac_info_lookup(net, hmackeyid); if (!slen) { err = seg6_hmac_info_del(net, hmackeyid); goto out_unlock; } if (!info->attrs[SEG6_ATTR_SECRET]) { err = -EINVAL; goto out_unlock; } if (slen > nla_len(info->attrs[SEG6_ATTR_SECRET])) { err = -EINVAL; goto out_unlock; } if (hinfo) { err = seg6_hmac_info_del(net, hmackeyid); if (err) goto out_unlock; } secret = (char *)nla_data(info->attrs[SEG6_ATTR_SECRET]); hinfo = kzalloc_obj(*hinfo); if (!hinfo) { err = -ENOMEM; goto out_unlock; } memcpy(hinfo->secret, secret, slen); hinfo->slen = slen; hinfo->alg_id = algid; hinfo->hmackeyid = hmackeyid; err = seg6_hmac_info_add(net, hmackeyid, hinfo); if (err) kfree(hinfo); out_unlock: mutex_unlock(&sdata->lock); return err; } #else static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info) { return -ENOTSUPP; } #endif static int seg6_genl_set_tunsrc(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct in6_addr *val, *t_old, *t_new; struct seg6_pernet_data *sdata; sdata = seg6_pernet(net); if (!info->attrs[SEG6_ATTR_DST]) return -EINVAL; val = 
nla_data(info->attrs[SEG6_ATTR_DST]); t_new = kmemdup(val, sizeof(*val), GFP_KERNEL); if (!t_new) return -ENOMEM; mutex_lock(&sdata->lock); t_old = sdata->tun_src; rcu_assign_pointer(sdata->tun_src, t_new); mutex_unlock(&sdata->lock); synchronize_net(); kfree(t_old); return 0; } static int seg6_genl_get_tunsrc(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct in6_addr *tun_src; struct sk_buff *msg; void *hdr; msg = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &seg6_genl_family, 0, SEG6_CMD_GET_TUNSRC); if (!hdr) goto free_msg; rcu_read_lock(); tun_src = rcu_dereference(seg6_pernet(net)->tun_src); if (nla_put(msg, SEG6_ATTR_DST, sizeof(struct in6_addr), tun_src)) goto nla_put_failure; rcu_read_unlock(); genlmsg_end(msg, hdr); return genlmsg_reply(msg, info); nla_put_failure: rcu_read_unlock(); free_msg: nlmsg_free(msg); return -ENOMEM; } #ifdef CONFIG_IPV6_SEG6_HMAC static int __seg6_hmac_fill_info(struct seg6_hmac_info *hinfo, struct sk_buff *msg) { if (nla_put_u32(msg, SEG6_ATTR_HMACKEYID, hinfo->hmackeyid) || nla_put_u8(msg, SEG6_ATTR_SECRETLEN, hinfo->slen) || nla_put(msg, SEG6_ATTR_SECRET, hinfo->slen, hinfo->secret) || nla_put_u8(msg, SEG6_ATTR_ALGID, hinfo->alg_id)) return -1; return 0; } static int __seg6_genl_dumphmac_element(struct seg6_hmac_info *hinfo, u32 portid, u32 seq, u32 flags, struct sk_buff *skb, u8 cmd) { void *hdr; hdr = genlmsg_put(skb, portid, seq, &seg6_genl_family, flags, cmd); if (!hdr) return -ENOMEM; if (__seg6_hmac_fill_info(hinfo, skb) < 0) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int seg6_genl_dumphmac_start(struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct seg6_pernet_data *sdata; struct rhashtable_iter *iter; sdata = seg6_pernet(net); iter = (struct rhashtable_iter *)cb->args[0]; if (!iter) { iter = kmalloc_obj(*iter); if (!iter) return -ENOMEM; cb->args[0] = (long)iter; } rhashtable_walk_enter(&sdata->hmac_infos, iter); return 0; } static int seg6_genl_dumphmac_done(struct netlink_callback *cb) { struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; rhashtable_walk_exit(iter); kfree(iter); return 0; } static int seg6_genl_dumphmac(struct sk_buff *skb, struct netlink_callback *cb) { struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; struct seg6_hmac_info *hinfo; int ret; rhashtable_walk_start(iter); for (;;) { hinfo = rhashtable_walk_next(iter); if (IS_ERR(hinfo)) { if (PTR_ERR(hinfo) == -EAGAIN) continue; ret = PTR_ERR(hinfo); goto done; } else if (!hinfo) { break; } ret = __seg6_genl_dumphmac_element(hinfo, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, skb, SEG6_CMD_DUMPHMAC); if (ret) goto done; } ret = skb->len; done: rhashtable_walk_stop(iter); return ret; } #else static int seg6_genl_dumphmac_start(struct netlink_callback *cb) { return 0; } static int seg6_genl_dumphmac_done(struct netlink_callback *cb) { return 0; } static int seg6_genl_dumphmac(struct sk_buff *skb, struct netlink_callback *cb) { return -ENOTSUPP; } #endif static int __net_init seg6_net_init(struct net *net) { struct seg6_pernet_data *sdata; sdata = kzalloc_obj(*sdata); if (!sdata) return -ENOMEM; mutex_init(&sdata->lock); sdata->tun_src = kzalloc_obj(*sdata->tun_src); if (!sdata->tun_src) { kfree(sdata); return -ENOMEM; } net->ipv6.seg6_data = sdata; if (seg6_hmac_net_init(net)) { 
kfree(rcu_dereference_raw(sdata->tun_src)); kfree(sdata); return -ENOMEM; } return 0; } static void __net_exit seg6_net_exit(struct net *net) { struct seg6_pernet_data *sdata = seg6_pernet(net); seg6_hmac_net_exit(net); kfree(rcu_dereference_raw(sdata->tun_src)); kfree(sdata); } static struct pernet_operations ip6_segments_ops = { .init = seg6_net_init, .exit = seg6_net_exit, }; static const struct genl_ops seg6_genl_ops[] = { { .cmd = SEG6_CMD_SETHMAC, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = seg6_genl_sethmac, .flags = GENL_ADMIN_PERM, }, { .cmd = SEG6_CMD_DUMPHMAC, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .start = seg6_genl_dumphmac_start, .dumpit = seg6_genl_dumphmac, .done = seg6_genl_dumphmac_done, .flags = GENL_ADMIN_PERM, }, { .cmd = SEG6_CMD_SET_TUNSRC, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = seg6_genl_set_tunsrc, .flags = GENL_ADMIN_PERM, }, { .cmd = SEG6_CMD_GET_TUNSRC, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = seg6_genl_get_tunsrc, .flags = GENL_ADMIN_PERM, }, }; static struct genl_family seg6_genl_family __ro_after_init = { .hdrsize = 0, .name = SEG6_GENL_NAME, .version = SEG6_GENL_VERSION, .maxattr = SEG6_ATTR_MAX, .policy = seg6_genl_policy, .netnsok = true, .parallel_ops = true, .ops = seg6_genl_ops, .n_ops = ARRAY_SIZE(seg6_genl_ops), .resv_start_op = SEG6_CMD_GET_TUNSRC + 1, .module = THIS_MODULE, }; int __init seg6_init(void) { int err; err = register_pernet_subsys(&ip6_segments_ops); if (err) goto out; err = genl_register_family(&seg6_genl_family); if (err) goto out_unregister_pernet; err = seg6_iptunnel_init(); if (err) goto out_unregister_genl; err = seg6_local_init(); if (err) goto out_unregister_iptun; pr_info("Segment Routing with IPv6\n"); out: return err; out_unregister_iptun: seg6_iptunnel_exit(); out_unregister_genl: genl_unregister_family(&seg6_genl_family); out_unregister_pernet: unregister_pernet_subsys(&ip6_segments_ops); goto out; } void seg6_exit(void) { seg6_local_exit(); seg6_iptunnel_exit(); genl_unregister_family(&seg6_genl_family); unregister_pernet_subsys(&ip6_segments_ops); } |
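/*
 * Illustrative user-space sketch (not part of this file): the SRH size
 * arithmetic that seg6_validate_srh() above relies on. hdrlen counts 8-byte
 * units beyond the first 8 bytes, so an SRH carrying n 16-byte segments and
 * no TLVs has hdrlen = 2 * n and a total length of (hdrlen + 1) << 3 bytes.
 * The segment count below is an arbitrary example value.
 */
#include <stdio.h>
#include <linux/seg6.h>

int main(void)
{
	unsigned int nsegs = 3;			/* example segment count */
	struct ipv6_sr_hdr srh = {
		.hdrlen		= 2 * nsegs,	/* 16 bytes per segment */
		.first_segment	= nsegs - 1,
		.segments_left	= nsegs - 1,
	};
	unsigned int len = (srh.hdrlen + 1) << 3;
	unsigned int tlv_offset = sizeof(srh) + ((srh.first_segment + 1) << 4);

	printf("SRH total length: %u bytes\n", len);
	printf("TLVs (if any) start at offset %u, trailing = %u\n",
	       tlv_offset, len - tlv_offset);
	return 0;
}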
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/char_dev.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/kdev_t.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/cleanup.h>

#include <linux/major.h>
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/seq_file.h>

#include <linux/kobject.h>
#include <linux/kobj_map.h>
#include <linux/cdev.h>
#include <linux/mutex.h>
#include <linux/backing-dev.h>
#include <linux/tty.h>

#include "internal.h"

static struct kobj_map *cdev_map __ro_after_init;

static DEFINE_MUTEX(chrdevs_lock);

#define CHRDEV_MAJOR_HASH_SIZE 255

static struct char_device_struct {
	struct char_device_struct *next;
	unsigned int major;
	unsigned int baseminor;
	int minorct;
	char name[64];
	struct cdev *cdev;		/* will die */
}
*chrdevs[CHRDEV_MAJOR_HASH_SIZE]; /* index in the above */ static inline int major_to_index(unsigned major) { return major % CHRDEV_MAJOR_HASH_SIZE; } #ifdef CONFIG_PROC_FS void chrdev_show(struct seq_file *f, off_t offset) { struct char_device_struct *cd; mutex_lock(&chrdevs_lock); for (cd = chrdevs[major_to_index(offset)]; cd; cd = cd->next) { if (cd->major == offset) seq_printf(f, "%3d %s\n", cd->major, cd->name); } mutex_unlock(&chrdevs_lock); } #endif /* CONFIG_PROC_FS */ static int find_dynamic_major(void) { int i; struct char_device_struct *cd; for (i = ARRAY_SIZE(chrdevs)-1; i >= CHRDEV_MAJOR_DYN_END; i--) { if (chrdevs[i] == NULL) return i; } for (i = CHRDEV_MAJOR_DYN_EXT_START; i >= CHRDEV_MAJOR_DYN_EXT_END; i--) { for (cd = chrdevs[major_to_index(i)]; cd; cd = cd->next) if (cd->major == i) break; if (cd == NULL) return i; } return -EBUSY; } /* * Register a single major with a specified minor range. * * If major == 0 this function will dynamically allocate an unused major. * If major > 0 this function will attempt to reserve the range of minors * with given major. * */ static struct char_device_struct * __register_chrdev_region(unsigned int major, unsigned int baseminor, int minorct, const char *name) { struct char_device_struct *cd __free(kfree) = NULL; struct char_device_struct *curr, *prev = NULL; int ret; int i; if (major >= CHRDEV_MAJOR_MAX) { pr_err("CHRDEV \"%s\" major requested (%u) is greater than the maximum (%u)\n", name, major, CHRDEV_MAJOR_MAX-1); return ERR_PTR(-EINVAL); } if (minorct > MINORMASK + 1 - baseminor) { pr_err("CHRDEV \"%s\" minor range requested (%u-%u) is out of range of maximum range (%u-%u) for a single major\n", name, baseminor, baseminor + minorct - 1, 0, MINORMASK); return ERR_PTR(-EINVAL); } cd = kzalloc_obj(struct char_device_struct); if (cd == NULL) return ERR_PTR(-ENOMEM); guard(mutex)(&chrdevs_lock); if (major == 0) { ret = find_dynamic_major(); if (ret < 0) { pr_err("CHRDEV \"%s\" dynamic allocation region is full\n", name); return ERR_PTR(ret); } major = ret; } ret = -EBUSY; i = major_to_index(major); for (curr = chrdevs[i]; curr; prev = curr, curr = curr->next) { if (curr->major < major) continue; if (curr->major > major) break; if (curr->baseminor + curr->minorct <= baseminor) continue; if (curr->baseminor >= baseminor + minorct) break; return ERR_PTR(ret); } cd->major = major; cd->baseminor = baseminor; cd->minorct = minorct; strscpy(cd->name, name, sizeof(cd->name)); if (!prev) { cd->next = curr; chrdevs[i] = cd; } else { cd->next = prev->next; prev->next = cd; } return_ptr(cd); } static struct char_device_struct * __unregister_chrdev_region(unsigned major, unsigned baseminor, int minorct) { struct char_device_struct *cd = NULL, **cp; int i = major_to_index(major); mutex_lock(&chrdevs_lock); for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next) if ((*cp)->major == major && (*cp)->baseminor == baseminor && (*cp)->minorct == minorct) break; if (*cp) { cd = *cp; *cp = cd->next; } mutex_unlock(&chrdevs_lock); return cd; } /** * register_chrdev_region() - register a range of device numbers * @from: the first in the desired range of device numbers; must include * the major number. * @count: the number of consecutive device numbers required * @name: the name of the device or driver. * * Return value is zero on success, a negative error code on failure. 
*/ int register_chrdev_region(dev_t from, unsigned count, const char *name) { struct char_device_struct *cd; dev_t to = from + count; dev_t n, next; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); if (next > to) next = to; cd = __register_chrdev_region(MAJOR(n), MINOR(n), next - n, name); if (IS_ERR(cd)) goto fail; } return 0; fail: to = n; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); } return PTR_ERR(cd); } /** * alloc_chrdev_region() - register a range of char device numbers * @dev: output parameter for first assigned number * @baseminor: first of the requested range of minor numbers * @count: the number of minor numbers required * @name: the name of the associated device or driver * * Allocates a range of char device numbers. The major number will be * chosen dynamically, and returned (along with the first minor number) * in @dev. Returns zero or a negative error code. */ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, const char *name) { struct char_device_struct *cd; cd = __register_chrdev_region(0, baseminor, count, name); if (IS_ERR(cd)) return PTR_ERR(cd); *dev = MKDEV(cd->major, cd->baseminor); return 0; } /** * __register_chrdev() - create and register a cdev occupying a range of minors * @major: major device number or 0 for dynamic allocation * @baseminor: first of the requested range of minor numbers * @count: the number of minor numbers required * @name: name of this range of devices * @fops: file operations associated with this devices * * If @major == 0 this functions will dynamically allocate a major and return * its number. * * If @major > 0 this function will attempt to reserve a device with the given * major number and will return zero on success. * * Returns a -ve errno on failure. * * The name of this device has nothing to do with the name of the device in * /dev. It only helps to keep track of the different owners of devices. If * your module name has only one type of devices it's ok to use e.g. the name * of the module here. */ int __register_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name, const struct file_operations *fops) { struct char_device_struct *cd; struct cdev *cdev; int err = -ENOMEM; cd = __register_chrdev_region(major, baseminor, count, name); if (IS_ERR(cd)) return PTR_ERR(cd); cdev = cdev_alloc(); if (!cdev) goto out2; cdev->owner = fops->owner; cdev->ops = fops; kobject_set_name(&cdev->kobj, "%s", name); err = cdev_add(cdev, MKDEV(cd->major, baseminor), count); if (err) goto out; cd->cdev = cdev; return major ? 0 : cd->major; out: kobject_put(&cdev->kobj); out2: kfree(__unregister_chrdev_region(cd->major, baseminor, count)); return err; } /** * unregister_chrdev_region() - unregister a range of device numbers * @from: the first in the range of numbers to unregister * @count: the number of device numbers to unregister * * This function will unregister a range of @count device numbers, * starting with @from. The caller should normally be the one who * allocated those numbers in the first place... 
*/ void unregister_chrdev_region(dev_t from, unsigned count) { dev_t to = from + count; dev_t n, next; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); if (next > to) next = to; kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); } } /** * __unregister_chrdev - unregister and destroy a cdev * @major: major device number * @baseminor: first of the range of minor numbers * @count: the number of minor numbers this cdev is occupying * @name: name of this range of devices * * Unregister and destroy the cdev occupying the region described by * @major, @baseminor and @count. This function undoes what * __register_chrdev() did. */ void __unregister_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name) { struct char_device_struct *cd; cd = __unregister_chrdev_region(major, baseminor, count); if (cd && cd->cdev) cdev_del(cd->cdev); kfree(cd); } static __cacheline_aligned_in_smp DEFINE_SPINLOCK(cdev_lock); static struct kobject *cdev_get(struct cdev *p) { struct module *owner = p->owner; struct kobject *kobj; if (!try_module_get(owner)) return NULL; kobj = kobject_get_unless_zero(&p->kobj); if (!kobj) module_put(owner); return kobj; } void cdev_put(struct cdev *p) { if (p) { struct module *owner = p->owner; kobject_put(&p->kobj); module_put(owner); } } /* * Called every time a character special file is opened */ static int chrdev_open(struct inode *inode, struct file *filp) { const struct file_operations *fops; struct cdev *p; struct cdev *new = NULL; int ret = 0; spin_lock(&cdev_lock); p = inode->i_cdev; if (!p) { struct kobject *kobj; int idx; spin_unlock(&cdev_lock); kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx); if (!kobj) return -ENXIO; new = container_of(kobj, struct cdev, kobj); spin_lock(&cdev_lock); /* Check i_cdev again in case somebody beat us to it while we dropped the lock. */ p = inode->i_cdev; if (!p) { inode->i_cdev = p = new; list_add(&inode->i_devices, &p->list); new = NULL; } else if (!cdev_get(p)) ret = -ENXIO; } else if (!cdev_get(p)) ret = -ENXIO; spin_unlock(&cdev_lock); cdev_put(new); if (ret) return ret; ret = -ENXIO; fops = fops_get(p->ops); if (!fops) goto out_cdev_put; replace_fops(filp, fops); if (filp->f_op->open) { ret = filp->f_op->open(inode, filp); if (ret) goto out_cdev_put; } return 0; out_cdev_put: cdev_put(p); return ret; } void cd_forget(struct inode *inode) { spin_lock(&cdev_lock); list_del_init(&inode->i_devices); inode->i_cdev = NULL; inode->i_mapping = &inode->i_data; spin_unlock(&cdev_lock); } static void cdev_purge(struct cdev *cdev) { spin_lock(&cdev_lock); while (!list_empty(&cdev->list)) { struct inode *inode; inode = container_of(cdev->list.next, struct inode, i_devices); list_del_init(&inode->i_devices); inode->i_cdev = NULL; } spin_unlock(&cdev_lock); } /* * Dummy default file-operations: the only thing this does * is contain the open that then fills in the correct operations * depending on the special file... */ const struct file_operations def_chr_fops = { .open = chrdev_open, .llseek = noop_llseek, }; static struct kobject *exact_match(dev_t dev, int *part, void *data) { struct cdev *p = data; return &p->kobj; } static int exact_lock(dev_t dev, void *data) { struct cdev *p = data; return cdev_get(p) ? 
0 : -1; } /** * cdev_add() - add a char device to the system * @p: the cdev structure for the device * @dev: the first device number for which this device is responsible * @count: the number of consecutive minor numbers corresponding to this * device * * cdev_add() adds the device represented by @p to the system, making it * live immediately. A negative error code is returned on failure. */ int cdev_add(struct cdev *p, dev_t dev, unsigned count) { int error; p->dev = dev; p->count = count; if (WARN_ON(dev == WHITEOUT_DEV)) { error = -EBUSY; goto err; } error = kobj_map(cdev_map, dev, count, NULL, exact_match, exact_lock, p); if (error) goto err; kobject_get(p->kobj.parent); return 0; err: kfree_const(p->kobj.name); p->kobj.name = NULL; return error; } /** * cdev_set_parent() - set the parent kobject for a char device * @p: the cdev structure * @kobj: the kobject to take a reference to * * cdev_set_parent() sets a parent kobject which will be referenced * appropriately so the parent is not freed before the cdev. This * should be called before cdev_add. */ void cdev_set_parent(struct cdev *p, struct kobject *kobj) { WARN_ON(!kobj->state_initialized); p->kobj.parent = kobj; } /** * cdev_device_add() - add a char device and it's corresponding * struct device, linkink * @dev: the device structure * @cdev: the cdev structure * * cdev_device_add() adds the char device represented by @cdev to the system, * just as cdev_add does. It then adds @dev to the system using device_add * The dev_t for the char device will be taken from the struct device which * needs to be initialized first. This helper function correctly takes a * reference to the parent device so the parent will not get released until * all references to the cdev are released. * * This helper uses dev->devt for the device number. If it is not set * it will not add the cdev and it will be equivalent to device_add. * * This function should be used whenever the struct cdev and the * struct device are members of the same structure whose lifetime is * managed by the struct device. * * NOTE: Callers must assume that userspace was able to open the cdev and * can call cdev fops callbacks at any time, even if this function fails. */ int cdev_device_add(struct cdev *cdev, struct device *dev) { int rc = 0; if (dev->devt) { cdev_set_parent(cdev, &dev->kobj); rc = cdev_add(cdev, dev->devt, 1); if (rc) return rc; } rc = device_add(dev); if (rc && dev->devt) cdev_del(cdev); return rc; } /** * cdev_device_del() - inverse of cdev_device_add * @cdev: the cdev structure * @dev: the device structure * * cdev_device_del() is a helper function to call cdev_del and device_del. * It should be used whenever cdev_device_add is used. * * If dev->devt is not set it will not remove the cdev and will be equivalent * to device_del. * * NOTE: This guarantees that associated sysfs callbacks are not running * or runnable, however any cdevs already open will remain and their fops * will still be callable even after this function returns. */ void cdev_device_del(struct cdev *cdev, struct device *dev) { device_del(dev); if (dev->devt) cdev_del(cdev); } static void cdev_unmap(dev_t dev, unsigned count) { kobj_unmap(cdev_map, dev, count); } /** * cdev_del() - remove a cdev from the system * @p: the cdev structure to be removed * * cdev_del() removes @p from the system, possibly freeing the structure * itself. 
* * NOTE: This guarantees that cdev device will no longer be able to be * opened, however any cdevs already open will remain and their fops will * still be callable even after cdev_del returns. */ void cdev_del(struct cdev *p) { cdev_unmap(p->dev, p->count); kobject_put(&p->kobj); } static void cdev_default_release(struct kobject *kobj) { struct cdev *p = container_of(kobj, struct cdev, kobj); struct kobject *parent = kobj->parent; cdev_purge(p); kobject_put(parent); } static void cdev_dynamic_release(struct kobject *kobj) { struct cdev *p = container_of(kobj, struct cdev, kobj); struct kobject *parent = kobj->parent; cdev_purge(p); kfree(p); kobject_put(parent); } static struct kobj_type ktype_cdev_default = { .release = cdev_default_release, }; static struct kobj_type ktype_cdev_dynamic = { .release = cdev_dynamic_release, }; /** * cdev_alloc() - allocate a cdev structure * * Allocates and returns a cdev structure, or NULL on failure. */ struct cdev *cdev_alloc(void) { struct cdev *p = kzalloc_obj(struct cdev); if (p) { INIT_LIST_HEAD(&p->list); kobject_init(&p->kobj, &ktype_cdev_dynamic); } return p; } /** * cdev_init() - initialize a cdev structure * @cdev: the structure to initialize * @fops: the file_operations for this device * * Initializes @cdev, remembering @fops, making it ready to add to the * system with cdev_add(). */ void cdev_init(struct cdev *cdev, const struct file_operations *fops) { memset(cdev, 0, sizeof *cdev); INIT_LIST_HEAD(&cdev->list); kobject_init(&cdev->kobj, &ktype_cdev_default); cdev->ops = fops; } static struct kobject *base_probe(dev_t dev, int *part, void *data) { if (request_module("char-major-%d-%d", MAJOR(dev), MINOR(dev)) > 0) /* Make old-style 2.4 aliases work */ request_module("char-major-%d", MAJOR(dev)); return NULL; } void __init chrdev_init(void) { cdev_map = kobj_map_init(base_probe, &chrdevs_lock); } /* Let modules do char dev stuff */ EXPORT_SYMBOL(register_chrdev_region); EXPORT_SYMBOL(unregister_chrdev_region); EXPORT_SYMBOL(alloc_chrdev_region); EXPORT_SYMBOL(cdev_init); EXPORT_SYMBOL(cdev_alloc); EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); EXPORT_SYMBOL(cdev_set_parent); EXPORT_SYMBOL(cdev_device_add); EXPORT_SYMBOL(cdev_device_del); EXPORT_SYMBOL(__register_chrdev); EXPORT_SYMBOL(__unregister_chrdev); |
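/*
 * Illustrative sketch (not part of this file): the usual way a driver
 * consumes the API above -- alloc_chrdev_region() for a dynamic major,
 * then cdev_init()/cdev_add() to go live, and the inverse on unload.
 * The "example" name, the device count and the empty file_operations
 * below are made up for the sketch.
 */
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/cdev.h>

#define EXAMPLE_NR_DEVS 1

static dev_t example_devt;
static struct cdev example_cdev;

static const struct file_operations example_fops = {
	.owner = THIS_MODULE,
	/* .open/.read/.write handlers would go here */
};

static int __init example_init(void)
{
	int err;

	err = alloc_chrdev_region(&example_devt, 0, EXAMPLE_NR_DEVS, "example");
	if (err)
		return err;

	cdev_init(&example_cdev, &example_fops);
	example_cdev.owner = THIS_MODULE;

	err = cdev_add(&example_cdev, example_devt, EXAMPLE_NR_DEVS);
	if (err) {
		unregister_chrdev_region(example_devt, EXAMPLE_NR_DEVS);
		return err;
	}
	return 0;
}

static void __exit example_exit(void)
{
	cdev_del(&example_cdev);
	unregister_chrdev_region(example_devt, EXAMPLE_NR_DEVS);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");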
| 3 2 1 2 2 3 3 3 1 1 6 4 1 1 2 3 3 1 1 1 3 2 2 1 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 | // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook */ #include <linux/bpf.h> #include "disasm.h" #define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) static const char * const func_id_str[] = { __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) }; #undef __BPF_FUNC_STR_FN static const char *__func_get_name(const struct bpf_insn_cbs *cbs, const struct bpf_insn *insn, char *buff, size_t len) { BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); if (!insn->src_reg && insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID && func_id_str[insn->imm]) return func_id_str[insn->imm]; if (cbs && cbs->cb_call) { const char *res; res = cbs->cb_call(cbs->private_data, insn); if (res) return res; } if (insn->src_reg == BPF_PSEUDO_CALL) snprintf(buff, len, "%+d", insn->imm); else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) snprintf(buff, len, "kernel-function"); return buff; } static const char *__func_imm_name(const struct bpf_insn_cbs *cbs, const struct bpf_insn *insn, u64 full_imm, char *buff, size_t len) { if (cbs && cbs->cb_imm) return cbs->cb_imm(cbs->private_data, insn, full_imm); snprintf(buff, len, "0x%llx", (unsigned long long)full_imm); return buff; } const char *func_id_name(int id) { if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) return func_id_str[id]; else return "unknown"; } const char *const bpf_class_string[8] = { [BPF_LD] = "ld", [BPF_LDX] = "ldx", [BPF_ST] = "st", [BPF_STX] = "stx", [BPF_ALU] = "alu", [BPF_JMP] = "jmp", [BPF_JMP32] = "jmp32", [BPF_ALU64] = "alu64", }; const char *const bpf_alu_string[16] = { [BPF_ADD >> 4] = "+=", [BPF_SUB >> 4] = "-=", [BPF_MUL >> 4] = "*=", [BPF_DIV >> 4] = "/=", [BPF_OR >> 4] = "|=", [BPF_AND >> 4] = "&=", [BPF_LSH >> 4] = "<<=", [BPF_RSH >> 4] = ">>=", [BPF_NEG >> 4] = "neg", [BPF_MOD >> 4] = "%=", [BPF_XOR >> 4] = "^=", [BPF_MOV >> 4] = "=", [BPF_ARSH >> 4] = "s>>=", [BPF_END >> 4] = "endian", }; static const char *const bpf_alu_sign_string[16] = 
{ [BPF_DIV >> 4] = "s/=", [BPF_MOD >> 4] = "s%=", }; static const char *const bpf_movsx_string[4] = { [0] = "(s8)", [1] = "(s16)", [3] = "(s32)", }; static const char *const bpf_atomic_alu_string[16] = { [BPF_ADD >> 4] = "add", [BPF_AND >> 4] = "and", [BPF_OR >> 4] = "or", [BPF_XOR >> 4] = "xor", }; static const char *const bpf_ldst_string[] = { [BPF_W >> 3] = "u32", [BPF_H >> 3] = "u16", [BPF_B >> 3] = "u8", [BPF_DW >> 3] = "u64", }; static const char *const bpf_ldsx_string[] = { [BPF_W >> 3] = "s32", [BPF_H >> 3] = "s16", [BPF_B >> 3] = "s8", }; static const char *const bpf_jmp_string[16] = { [BPF_JA >> 4] = "jmp", [BPF_JEQ >> 4] = "==", [BPF_JGT >> 4] = ">", [BPF_JLT >> 4] = "<", [BPF_JGE >> 4] = ">=", [BPF_JLE >> 4] = "<=", [BPF_JSET >> 4] = "&", [BPF_JNE >> 4] = "!=", [BPF_JSGT >> 4] = "s>", [BPF_JSLT >> 4] = "s<", [BPF_JSGE >> 4] = "s>=", [BPF_JSLE >> 4] = "s<=", [BPF_CALL >> 4] = "call", [BPF_EXIT >> 4] = "exit", }; static void print_bpf_end_insn(bpf_insn_print_t verbose, void *private_data, const struct bpf_insn *insn) { verbose(private_data, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le", insn->imm, insn->dst_reg); } static void print_bpf_bswap_insn(bpf_insn_print_t verbose, void *private_data, const struct bpf_insn *insn) { verbose(private_data, "(%02x) r%d = bswap%d r%d\n", insn->code, insn->dst_reg, insn->imm, insn->dst_reg); } static bool is_sdiv_smod(const struct bpf_insn *insn) { return (BPF_OP(insn->code) == BPF_DIV || BPF_OP(insn->code) == BPF_MOD) && insn->off == 1; } static bool is_movsx(const struct bpf_insn *insn) { return BPF_OP(insn->code) == BPF_MOV && (insn->off == 8 || insn->off == 16 || insn->off == 32); } static bool is_addr_space_cast(const struct bpf_insn *insn) { return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_SPACE_CAST; } /* Special (internal-only) form of mov, used to resolve per-CPU addrs: * dst_reg = src_reg + <percpu_base_off> * BPF_ADDR_PERCPU is used as a special insn->off value. */ #define BPF_ADDR_PERCPU (-1) static inline bool is_mov_percpu_addr(const struct bpf_insn *insn) { return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_PERCPU; } void print_bpf_insn(const struct bpf_insn_cbs *cbs, const struct bpf_insn *insn, bool allow_ptr_leaks) { const bpf_insn_print_t verbose = cbs->cb_print; u8 class = BPF_CLASS(insn->code); if (class == BPF_ALU || class == BPF_ALU64) { if (BPF_OP(insn->code) == BPF_END) { if (class == BPF_ALU64) print_bpf_bswap_insn(verbose, cbs->private_data, insn); else print_bpf_end_insn(verbose, cbs->private_data, insn); } else if (BPF_OP(insn->code) == BPF_NEG) { verbose(cbs->private_data, "(%02x) %c%d = -%c%d\n", insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, class == BPF_ALU ? 'w' : 'r', insn->dst_reg); } else if (is_addr_space_cast(insn)) { verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %u, %u)\n", insn->code, insn->dst_reg, insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm); } else if (is_mov_percpu_addr(insn)) { verbose(cbs->private_data, "(%02x) r%d = &(void __percpu *)(r%d)\n", insn->code, insn->dst_reg, insn->src_reg); } else if (BPF_SRC(insn->code) == BPF_X) { verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n", insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, is_sdiv_smod(insn) ? bpf_alu_sign_string[BPF_OP(insn->code) >> 4] : bpf_alu_string[BPF_OP(insn->code) >> 4], is_movsx(insn) ? bpf_movsx_string[(insn->off >> 3) - 1] : "", class == BPF_ALU ? 
'w' : 'r', insn->src_reg); } else { verbose(cbs->private_data, "(%02x) %c%d %s %d\n", insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, is_sdiv_smod(insn) ? bpf_alu_sign_string[BPF_OP(insn->code) >> 4] : bpf_alu_string[BPF_OP(insn->code) >> 4], insn->imm); } } else if (class == BPF_STX) { if (BPF_MODE(insn->code) == BPF_MEM) verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); else if (BPF_MODE(insn->code) == BPF_ATOMIC && (insn->imm == BPF_ADD || insn->imm == BPF_AND || insn->imm == BPF_OR || insn->imm == BPF_XOR)) { verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) %s r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, bpf_alu_string[BPF_OP(insn->imm) >> 4], insn->src_reg); } else if (BPF_MODE(insn->code) == BPF_ATOMIC && (insn->imm == (BPF_ADD | BPF_FETCH) || insn->imm == (BPF_AND | BPF_FETCH) || insn->imm == (BPF_OR | BPF_FETCH) || insn->imm == (BPF_XOR | BPF_FETCH))) { verbose(cbs->private_data, "(%02x) r%d = atomic%s_fetch_%s((%s *)(r%d %+d), r%d)\n", insn->code, insn->src_reg, BPF_SIZE(insn->code) == BPF_DW ? "64" : "", bpf_atomic_alu_string[BPF_OP(insn->imm) >> 4], bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); } else if (BPF_MODE(insn->code) == BPF_ATOMIC && insn->imm == BPF_CMPXCHG) { verbose(cbs->private_data, "(%02x) r0 = atomic%s_cmpxchg((%s *)(r%d %+d), r0, r%d)\n", insn->code, BPF_SIZE(insn->code) == BPF_DW ? "64" : "", bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); } else if (BPF_MODE(insn->code) == BPF_ATOMIC && insn->imm == BPF_XCHG) { verbose(cbs->private_data, "(%02x) r%d = atomic%s_xchg((%s *)(r%d %+d), r%d)\n", insn->code, insn->src_reg, BPF_SIZE(insn->code) == BPF_DW ? "64" : "", bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); } else if (BPF_MODE(insn->code) == BPF_ATOMIC && insn->imm == BPF_LOAD_ACQ) { verbose(cbs->private_data, "(%02x) r%d = load_acquire((%s *)(r%d %+d))\n", insn->code, insn->dst_reg, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->off); } else if (BPF_MODE(insn->code) == BPF_ATOMIC && insn->imm == BPF_STORE_REL) { verbose(cbs->private_data, "(%02x) store_release((%s *)(r%d %+d), r%d)\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); } else { verbose(cbs->private_data, "BUG_%02x\n", insn->code); } } else if (class == BPF_ST) { if (BPF_MODE(insn->code) == BPF_MEM) { verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->imm); } else if (BPF_MODE(insn->code) == 0xc0 /* BPF_NOSPEC, no UAPI */) { verbose(cbs->private_data, "(%02x) nospec\n", insn->code); } else { verbose(cbs->private_data, "BUG_st_%02x\n", insn->code); } } else if (class == BPF_LDX) { if (BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) { verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code); return; } verbose(cbs->private_data, "(%02x) r%d = *(%s *)(r%d %+d)\n", insn->code, insn->dst_reg, BPF_MODE(insn->code) == BPF_MEM ? 
bpf_ldst_string[BPF_SIZE(insn->code) >> 3] : bpf_ldsx_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->off); } else if (class == BPF_LD) { if (BPF_MODE(insn->code) == BPF_ABS) { verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[%d]\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->imm); } else if (BPF_MODE(insn->code) == BPF_IND) { verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->imm); } else if (BPF_MODE(insn->code) == BPF_IMM && BPF_SIZE(insn->code) == BPF_DW) { /* At this point, we already made sure that the second * part of the ldimm64 insn is accessible. */ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; bool is_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD || insn->src_reg == BPF_PSEUDO_MAP_VALUE; char tmp[64]; if (is_ptr && !allow_ptr_leaks) imm = 0; verbose(cbs->private_data, "(%02x) r%d = %s\n", insn->code, insn->dst_reg, __func_imm_name(cbs, insn, imm, tmp, sizeof(tmp))); } else { verbose(cbs->private_data, "BUG_ld_%02x\n", insn->code); return; } } else if (class == BPF_JMP32 || class == BPF_JMP) { u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { char tmp[64]; if (insn->src_reg == BPF_PSEUDO_CALL) { verbose(cbs->private_data, "(%02x) call pc%s\n", insn->code, __func_get_name(cbs, insn, tmp, sizeof(tmp))); } else { strcpy(tmp, "unknown"); verbose(cbs->private_data, "(%02x) call %s#%d\n", insn->code, __func_get_name(cbs, insn, tmp, sizeof(tmp)), insn->imm); } } else if (insn->code == (BPF_JMP | BPF_JA)) { verbose(cbs->private_data, "(%02x) goto pc%+d\n", insn->code, insn->off); } else if (insn->code == (BPF_JMP | BPF_JA | BPF_X)) { verbose(cbs->private_data, "(%02x) gotox r%d\n", insn->code, insn->dst_reg); } else if (insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO) { verbose(cbs->private_data, "(%02x) may_goto pc%+d\n", insn->code, insn->off); } else if (insn->code == (BPF_JMP32 | BPF_JA)) { verbose(cbs->private_data, "(%02x) gotol pc%+d\n", insn->code, insn->imm); } else if (insn->code == (BPF_JMP | BPF_EXIT)) { verbose(cbs->private_data, "(%02x) exit\n", insn->code); } else if (BPF_SRC(insn->code) == BPF_X) { verbose(cbs->private_data, "(%02x) if %c%d %s %c%d goto pc%+d\n", insn->code, class == BPF_JMP32 ? 'w' : 'r', insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], class == BPF_JMP32 ? 'w' : 'r', insn->src_reg, insn->off); } else { verbose(cbs->private_data, "(%02x) if %c%d %s 0x%x goto pc%+d\n", insn->code, class == BPF_JMP32 ? 'w' : 'r', insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], (u32)insn->imm, insn->off); } } else { verbose(cbs->private_data, "(%02x) %s\n", insn->code, bpf_class_string[class]); } } |
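/*
 * Illustrative sketch (an assumption, not code from disasm.c): dumping one
 * instruction through print_bpf_insn() with a minimal callback set. Per the
 * helpers above, cb_call and cb_imm may be left NULL, in which case the
 * built-in helper-name and immediate formatting is used; only cb_print is
 * needed. demo_print and demo_dump_insn are hypothetical names.
 */
#include <linux/bpf.h>
#include <linux/printk.h>
#include <linux/stdarg.h>
#include "disasm.h"

static __printf(2, 3) void demo_print(void *private_data, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vprintk(fmt, args);	/* forward the formatted line to the kernel log */
	va_end(args);
}

static void demo_dump_insn(const struct bpf_insn *insn)
{
	const struct bpf_insn_cbs cbs = {
		.cb_print	= demo_print,
		.private_data	= NULL,
	};

	/* allow_ptr_leaks == false masks map pointers in ldimm64 output */
	print_bpf_insn(&cbs, insn, false);
}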
| 40 11 41 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 | /* SPDX-License-Identifier: GPL-2.0 */ /* * This file provides wrappers with sanitizer instrumentation for non-atomic * bit operations. * * To use this functionality, an arch's bitops.h file needs to define each of * the below bit operations with an arch_ prefix (e.g. arch_set_bit(), * arch___set_bit(), etc.). */ #ifndef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H #define _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H #include <linux/instrumented.h> /** * ___set_bit - Set a bit in memory * @nr: the bit to set * @addr: the address to start counting from * * Unlike set_bit(), this function is non-atomic. If it is called on the same * region of memory concurrently, the effect may be that only one operation * succeeds. */ static __always_inline void ___set_bit(unsigned long nr, volatile unsigned long *addr) { instrument_write(addr + BIT_WORD(nr), sizeof(long)); arch___set_bit(nr, addr); } /** * ___clear_bit - Clears a bit in memory * @nr: the bit to clear * @addr: the address to start counting from * * Unlike clear_bit(), this function is non-atomic. If it is called on the same * region of memory concurrently, the effect may be that only one operation * succeeds. */ static __always_inline void ___clear_bit(unsigned long nr, volatile unsigned long *addr) { instrument_write(addr + BIT_WORD(nr), sizeof(long)); arch___clear_bit(nr, addr); } /** * ___change_bit - Toggle a bit in memory * @nr: the bit to change * @addr: the address to start counting from * * Unlike change_bit(), this function is non-atomic. If it is called on the same * region of memory concurrently, the effect may be that only one operation * succeeds. */ static __always_inline void ___change_bit(unsigned long nr, volatile unsigned long *addr) { instrument_write(addr + BIT_WORD(nr), sizeof(long)); arch___change_bit(nr, addr); } static __always_inline void __instrument_read_write_bitop(long nr, volatile unsigned long *addr) { if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC)) { /* * We treat non-atomic read-write bitops a little more special. * Given the operations here only modify a single bit, assuming * non-atomicity of the writer is sufficient may be reasonable * for certain usage (and follows the permissible nature of the * assume-plain-writes-atomic rule): * 1. report read-modify-write races -> check read; * 2. do not report races with marked readers, but do report * races with unmarked readers -> check "atomic" write. */ kcsan_check_read(addr + BIT_WORD(nr), sizeof(long)); /* * Use generic write instrumentation, in case other sanitizers * or tools are enabled alongside KCSAN. */ instrument_write(addr + BIT_WORD(nr), sizeof(long)); } else { instrument_read_write(addr + BIT_WORD(nr), sizeof(long)); } } /** * ___test_and_set_bit - Set a bit and return its old value * @nr: Bit to set * @addr: Address to count from * * This operation is non-atomic. If two instances of this operation race, one * can appear to succeed but actually fail. 
*/ static __always_inline bool ___test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { __instrument_read_write_bitop(nr, addr); return arch___test_and_set_bit(nr, addr); } /** * ___test_and_clear_bit - Clear a bit and return its old value * @nr: Bit to clear * @addr: Address to count from * * This operation is non-atomic. If two instances of this operation race, one * can appear to succeed but actually fail. */ static __always_inline bool ___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { __instrument_read_write_bitop(nr, addr); return arch___test_and_clear_bit(nr, addr); } /** * ___test_and_change_bit - Change a bit and return its old value * @nr: Bit to change * @addr: Address to count from * * This operation is non-atomic. If two instances of this operation race, one * can appear to succeed but actually fail. */ static __always_inline bool ___test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { __instrument_read_write_bitop(nr, addr); return arch___test_and_change_bit(nr, addr); } /** * _test_bit - Determine whether a bit is set * @nr: bit number to test * @addr: Address to start counting from */ static __always_inline bool _test_bit(unsigned long nr, const volatile unsigned long *addr) { instrument_atomic_read(addr + BIT_WORD(nr), sizeof(long)); return arch_test_bit(nr, addr); } /** * _test_bit_acquire - Determine, with acquire semantics, whether a bit is set * @nr: bit number to test * @addr: Address to start counting from */ static __always_inline bool _test_bit_acquire(unsigned long nr, const volatile unsigned long *addr) { instrument_atomic_read(addr + BIT_WORD(nr), sizeof(long)); return arch_test_bit_acquire(nr, addr); } #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */ |
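/*
 * Illustrative sketch (an assumption, not part of this header): the
 * non-atomic variants above are meant for bitmaps that the caller already
 * serializes, e.g. a bitmap only touched under a spinlock, where the cheaper
 * plain read-modify-write is sufficient and KCSAN sees the writes as
 * instrumented above. demo_map/demo_lock and the helpers are hypothetical.
 */
#include <linux/bitops.h>
#include <linux/spinlock.h>

#define DEMO_NR_IDS	64

static DEFINE_SPINLOCK(demo_lock);
static unsigned long demo_map[BITS_TO_LONGS(DEMO_NR_IDS)];

static int demo_alloc_id(void)
{
	int id;

	spin_lock(&demo_lock);
	id = find_first_zero_bit(demo_map, DEMO_NR_IDS);
	if (id < DEMO_NR_IDS)
		__set_bit(id, demo_map);	/* non-atomic: demo_lock serializes all writers */
	spin_unlock(&demo_lock);

	return id < DEMO_NR_IDS ? id : -1;
}

static void demo_free_id(int id)
{
	spin_lock(&demo_lock);
	__clear_bit(id, demo_map);
	spin_unlock(&demo_lock);
}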
| 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/linux/eventpoll.h ( Efficient event polling implementation ) * Copyright (C) 2001,...,2006 Davide Libenzi * * Davide Libenzi <davidel@xmailserver.org> */ #ifndef _LINUX_EVENTPOLL_H #define _LINUX_EVENTPOLL_H #include <uapi/linux/eventpoll.h> #include <uapi/linux/kcmp.h> /* Forward declarations to avoid compiler errors */ struct file; #ifdef CONFIG_EPOLL #ifdef CONFIG_KCMP struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff); #endif /* Used to release the epoll bits inside the "struct file" */ void eventpoll_release_file(struct file *file); /* Copy ready events to userspace */ int epoll_sendevents(struct file *file, struct epoll_event __user *events, int maxevents); /* * This is called from inside fs/file_table.c:__fput() to unlink files * from the eventpoll interface. We need to have this facility to cleanup * correctly files that are closed without being removed from the eventpoll * interface. */ static inline void eventpoll_release(struct file *file) { /* * Fast check to avoid the get/release of the semaphore. Since * we're doing this outside the semaphore lock, it might return * false negatives, but we don't care. It'll help in 99.99% of cases * to avoid the semaphore lock. False positives simply cannot happen * because the file in on the way to be removed and nobody ( but * eventpoll ) has still a reference to this file. */ if (likely(!READ_ONCE(file->f_ep))) return; /* * The file is being closed while it is still linked to an epoll * descriptor. We need to handle this by correctly unlinking it * from its containers. */ eventpoll_release_file(file); } int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, bool nonblock); /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ static inline int ep_op_has_event(int op) { return op != EPOLL_CTL_DEL; } #else static inline void eventpoll_release(struct file *file) {} #endif #if defined(CONFIG_ARM) && defined(CONFIG_OABI_COMPAT) /* ARM OABI has an incompatible struct layout and needs a special handler */ extern struct epoll_event __user * epoll_put_uevent(__poll_t revents, __u64 data, struct epoll_event __user *uevent); #else static inline struct epoll_event __user * epoll_put_uevent(__poll_t revents, __u64 data, struct epoll_event __user *uevent) { scoped_user_write_access_size(uevent, sizeof(*uevent), efault) { unsafe_put_user(revents, &uevent->events, efault); unsafe_put_user(data, &uevent->data, efault); } return uevent+1; efault: return NULL; } #endif #endif /* #ifndef _LINUX_EVENTPOLL_H */ |
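/*
 * Illustrative sketch (an assumption, not part of eventpoll.h): how a caller
 * can use epoll_put_uevent() when copying ready events to userspace. On
 * success it returns the next slot to fill; NULL signals a fault, and this
 * sketch then reports how many events were written before the failure.
 * demo_send_events and its parameters are hypothetical.
 */
#include <linux/errno.h>
#include <linux/eventpoll.h>
#include <linux/types.h>

static int demo_send_events(struct epoll_event __user *uevents,
			    const __poll_t *revents, const __u64 *data, int nr)
{
	struct epoll_event __user *uevent = uevents;
	int i;

	for (i = 0; i < nr; i++) {
		uevent = epoll_put_uevent(revents[i], data[i], uevent);
		if (!uevent)
			return i ? i : -EFAULT;	/* partial copy, or fault before any event */
	}
	return i;
}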
| 2 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux NET3: IP/IP protocol decoder. * * Authors: * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 * * Fixes: * Alan Cox : Merged and made usable non modular (its so tiny its silly as * a module taking up 2 pages). * Alan Cox : Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph) * to keep ip_forward happy. * Alan Cox : More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8). * Kai Schulte : Fixed #defines for IP_FIREWALL->FIREWALL * David Woodhouse : Perform some basic ICMP handling. * IPIP Routing without decapsulation. * Carlos Picoto : GRE over IP support * Alexey Kuznetsov: Reworked. Really, now it is truncated version of ipv4/ip_gre.c. * I do not want to merge them together. 
*/ /* tunnel.c: an IP tunnel driver The purpose of this driver is to provide an IP tunnel through which you can tunnel network traffic transparently across subnets. This was written by looking at Nick Holloway's dummy driver Thanks for the great code! -Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 Minor tweaks: Cleaned up the code a little and added some pre-1.3.0 tweaks. dev->hard_header/hard_header_len changed to use no headers. Comments/bracketing tweaked. Made the tunnels use dev->name not tunnel: when error reporting. Added tx_dropped stat -Alan Cox (alan@lxorguk.ukuu.org.uk) 21 March 95 Reworked: Changed to tunnel to destination gateway in addition to the tunnel's pointopoint address Almost completely rewritten Note: There is currently no firewall or ICMP handling done. -Sam Lantinga (slouken@cs.ucdavis.edu) 02/13/96 */ /* Things I wish I had known when writing the tunnel driver: When the tunnel_xmit() function is called, the skb contains the packet to be sent (plus a great deal of extra info), and dev contains the tunnel device that _we_ are. When we are passed a packet, we are expected to fill in the source address with our source IP address. What is the proper way to allocate, copy and free a buffer? After you allocate it, it is a "0 length" chunk of memory starting at zero. If you want to add headers to the buffer later, you'll have to call "skb_reserve(skb, amount)" with the amount of memory you want reserved. Then, you call "skb_put(skb, amount)" with the amount of space you want in the buffer. skb_put() returns a pointer to the top (#0) of that buffer. skb->len is set to the amount of space you have "allocated" with skb_put(). You can then write up to skb->len bytes to that buffer. If you need more, you can call skb_put() again with the additional amount of space you need. You can find out how much more space you can allocate by calling "skb_tailroom(skb)". Now, to add header space, call "skb_push(skb, header_len)". This creates space at the beginning of the buffer and returns a pointer to this new space. If later you need to strip a header from a buffer, call "skb_pull(skb, header_len)". skb_headroom() will return how much space is left at the top of the buffer (before the main data). Remember, this headroom space must be reserved before the skb_put() function is called. */ /* This version of net/ipv4/ipip.c is cloned of net/ipv4/ip_gre.c For comments look at net/ipv4/ip_gre.c --ANK */ #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/netfilter_ipv4.h> #include <linux/if_ether.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/ip_tunnels.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/dst_metadata.h> static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static unsigned int ipip_net_id __read_mostly; static int ipip_tunnel_init(struct net_device *dev); static struct rtnl_link_ops ipip_link_ops __read_mostly; static int ipip_err(struct sk_buff *skb, u32 info) { /* All the routers (except for Linux) return only * 8 bytes of packet payload. 
It means, that precise relaying of * ICMP in the real Internet is absolutely infeasible. */ struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); const struct iphdr *iph = (const struct iphdr *)skb->data; IP_TUNNEL_DECLARE_FLAGS(flags) = { }; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; int err = 0; __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); t = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->daddr, iph->saddr, 0); if (!t) { err = -ENOENT; goto out; } switch (type) { case ICMP_DEST_UNREACH: switch (code) { case ICMP_SR_FAILED: /* Impossible event. */ goto out; default: /* All others are translated to HOST_UNREACH. * rfc2003 contains "deep thoughts" about NET_UNREACH, * I believe they are just ether pollution. --ANK */ break; } break; case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) goto out; break; case ICMP_REDIRECT: break; default: goto out; } if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, net, info, t->parms.link, iph->protocol); goto out; } if (type == ICMP_REDIRECT) { ipv4_redirect(skb, net, t->parms.link, iph->protocol); goto out; } if (t->parms.iph.daddr == 0) { err = -ENOENT; goto out; } if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) goto out; if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; t->err_time = jiffies; out: return err; } static const struct tnl_ptk_info ipip_tpi = { /* no tunnel info required for ipip. */ .proto = htons(ETH_P_IP), }; #if IS_ENABLED(CONFIG_MPLS) static const struct tnl_ptk_info mplsip_tpi = { /* no tunnel info required for mplsip. */ .proto = htons(ETH_P_MPLS_UC), }; #endif static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto) { struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct metadata_dst *tun_dst = NULL; struct ip_tunnel *tunnel; const struct iphdr *iph; __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); iph = ip_hdr(skb); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr, iph->daddr, 0); if (tunnel) { const struct tnl_ptk_info *tpi; if (tunnel->parms.iph.protocol != ipproto && tunnel->parms.iph.protocol != 0) goto drop; if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; #if IS_ENABLED(CONFIG_MPLS) if (ipproto == IPPROTO_MPLS) tpi = &mplsip_tpi; else #endif tpi = &ipip_tpi; if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; if (tunnel->collect_md) { ip_tunnel_flags_zero(flags); tun_dst = ip_tun_rx_dst(skb, flags, 0, 0); if (!tun_dst) return 0; ip_tunnel_md_udp_encap(skb, &tun_dst->u.tun_info); } skb_reset_mac_header(skb); return ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); } return -1; drop: kfree_skb(skb); return 0; } static int ipip_rcv(struct sk_buff *skb) { return ipip_tunnel_rcv(skb, IPPROTO_IPIP); } #if IS_ENABLED(CONFIG_MPLS) static int mplsip_rcv(struct sk_buff *skb) { return ipip_tunnel_rcv(skb, IPPROTO_MPLS); } #endif /* * This function assumes it is being called from dev_queue_xmit() * and that skb is filled properly by that function. 
*/ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tiph = &tunnel->parms.iph; u8 ipproto; if (!pskb_inet_may_pull(skb)) goto tx_error; switch (skb->protocol) { case htons(ETH_P_IP): ipproto = IPPROTO_IPIP; break; #if IS_ENABLED(CONFIG_MPLS) case htons(ETH_P_MPLS_UC): ipproto = IPPROTO_MPLS; break; #endif default: goto tx_error; } if (tiph->protocol != ipproto && tiph->protocol != 0) goto tx_error; if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) goto tx_error; skb_set_inner_ipproto(skb, ipproto); if (tunnel->collect_md) ip_md_tunnel_xmit(skb, dev, ipproto, 0); else ip_tunnel_xmit(skb, dev, tiph, ipproto); return NETDEV_TX_OK; tx_error: kfree_skb(skb); DEV_STATS_INC(dev, tx_errors); return NETDEV_TX_OK; } static bool ipip_tunnel_ioctl_verify_protocol(u8 ipproto) { switch (ipproto) { case 0: case IPPROTO_IPIP: #if IS_ENABLED(CONFIG_MPLS) case IPPROTO_MPLS: #endif return true; } return false; } static int ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) { if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { if (p->iph.version != 4 || !ipip_tunnel_ioctl_verify_protocol(p->iph.protocol) || p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF))) return -EINVAL; } p->i_key = p->o_key = 0; ip_tunnel_flags_zero(p->i_flags); ip_tunnel_flags_zero(p->o_flags); return ip_tunnel_ctl(dev, p, cmd); } static int ipip_fill_forward_path(struct net_device_path_ctx *ctx, struct net_device_path *path) { struct ip_tunnel *tunnel = netdev_priv(ctx->dev); const struct iphdr *tiph = &tunnel->parms.iph; struct rtable *rt; rt = ip_route_output(dev_net(ctx->dev), tiph->daddr, 0, 0, 0, RT_SCOPE_UNIVERSE); if (IS_ERR(rt)) return PTR_ERR(rt); path->type = DEV_PATH_TUN; path->tun.src_v4.s_addr = tiph->saddr; path->tun.dst_v4.s_addr = tiph->daddr; path->tun.l3_proto = IPPROTO_IPIP; path->dev = ctx->dev; ctx->dev = rt->dst.dev; ip_rt_put(rt); return 0; } static const struct net_device_ops ipip_netdev_ops = { .ndo_init = ipip_tunnel_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = ipip_tunnel_xmit, .ndo_siocdevprivate = ip_tunnel_siocdevprivate, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipip_tunnel_ctl, .ndo_fill_forward_path = ipip_fill_forward_path, }; #define IPIP_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) static void ipip_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipip_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->type = ARPHRD_TUNNEL; dev->flags = IFF_NOARP; dev->addr_len = 4; dev->lltx = true; netif_keep_dst(dev); dev->features |= IPIP_FEATURES; dev->hw_features |= IPIP_FEATURES; ip_tunnel_setup(dev, ipip_net_id); } static int ipip_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); __dev_addr_set(dev, &tunnel->parms.iph.saddr, 4); memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); tunnel->tun_hlen = 0; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; return ip_tunnel_init(dev); } static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { u8 proto; if (!data || !data[IFLA_IPTUN_PROTO]) return 0; proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (proto != IPPROTO_IPIP && proto != IPPROTO_MPLS && proto != 0) return -EINVAL; return 0; } static void ipip_netlink_parms(struct nlattr *data[], struct ip_tunnel_parm_kern 
*parms, bool *collect_md, __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); parms->iph.version = 4; parms->iph.protocol = IPPROTO_IPIP; parms->iph.ihl = 5; *collect_md = false; if (!data) return; ip_tunnel_netlink_parms(data, parms); if (data[IFLA_IPTUN_COLLECT_METADATA]) *collect_md = true; if (data[IFLA_IPTUN_FWMARK]) *fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } static int ipip_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct ip_tunnel_encap ipencap; struct ip_tunnel_parm_kern p; __u32 fwmark = 0; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { int err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } ipip_netlink_parms(data, &p, &t->collect_md, &fwmark); return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, &p, fwmark); } static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_encap ipencap; struct ip_tunnel_parm_kern p; bool collect_md; __u32 fwmark = t->fwmark; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { int err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } ipip_netlink_parms(data, &p, &collect_md, &fwmark); if (collect_md) return -EINVAL; if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) || (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) return -EINVAL; return ip_tunnel_changelink(dev, tb, &p, fwmark); } static size_t ipip_get_size(const struct net_device *dev) { return /* IFLA_IPTUN_LINK */ nla_total_size(4) + /* IFLA_IPTUN_LOCAL */ nla_total_size(4) + /* IFLA_IPTUN_REMOTE */ nla_total_size(4) + /* IFLA_IPTUN_TTL */ nla_total_size(1) + /* IFLA_IPTUN_TOS */ nla_total_size(1) + /* IFLA_IPTUN_PROTO */ nla_total_size(1) + /* IFLA_IPTUN_PMTUDISC */ nla_total_size(1) + /* IFLA_IPTUN_ENCAP_TYPE */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_FLAGS */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_SPORT */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_DPORT */ nla_total_size(2) + /* IFLA_IPTUN_COLLECT_METADATA */ nla_total_size(0) + /* IFLA_IPTUN_FWMARK */ nla_total_size(4) + 0; } static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_parm_kern *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || nla_put_in_addr(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) || nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) || nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) || nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) || nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, !!(parm->iph.frag_off & htons(IP_DF))) || nla_put_u32(skb, IFLA_IPTUN_FWMARK, tunnel->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags)) goto nla_put_failure; if (tunnel->collect_md) if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_LINK] = { .type = NLA_U32 }, [IFLA_IPTUN_LOCAL] = { .type = NLA_U32 }, [IFLA_IPTUN_REMOTE] = { .type = NLA_U32 }, 
[IFLA_IPTUN_TTL] = { .type = NLA_U8 }, [IFLA_IPTUN_TOS] = { .type = NLA_U8 }, [IFLA_IPTUN_PROTO] = { .type = NLA_U8 }, [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 }, [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops ipip_link_ops __read_mostly = { .kind = "ipip", .maxtype = IFLA_IPTUN_MAX, .policy = ipip_policy, .priv_size = sizeof(struct ip_tunnel), .setup = ipip_tunnel_setup, .validate = ipip_tunnel_validate, .newlink = ipip_newlink, .changelink = ipip_changelink, .dellink = ip_tunnel_dellink, .get_size = ipip_get_size, .fill_info = ipip_fill_info, .get_link_net = ip_tunnel_get_link_net, }; static struct xfrm_tunnel ipip_handler __read_mostly = { .handler = ipip_rcv, .err_handler = ipip_err, .priority = 1, }; #if IS_ENABLED(CONFIG_MPLS) static struct xfrm_tunnel mplsip_handler __read_mostly = { .handler = mplsip_rcv, .err_handler = ipip_err, .priority = 1, }; #endif static int __net_init ipip_init_net(struct net *net) { return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0"); } static void __net_exit ipip_exit_rtnl(struct net *net, struct list_head *dev_to_kill) { ip_tunnel_delete_net(net, ipip_net_id, &ipip_link_ops, dev_to_kill); } static struct pernet_operations ipip_net_ops = { .init = ipip_init_net, .exit_rtnl = ipip_exit_rtnl, .id = &ipip_net_id, .size = sizeof(struct ip_tunnel_net), }; static int __init ipip_init(void) { int err; pr_info("ipip: IPv4 and MPLS over IPv4 tunneling driver\n"); err = register_pernet_device(&ipip_net_ops); if (err < 0) return err; err = xfrm4_tunnel_register(&ipip_handler, AF_INET); if (err < 0) { pr_info("%s: can't register tunnel\n", __func__); goto xfrm_tunnel_ipip_failed; } #if IS_ENABLED(CONFIG_MPLS) err = xfrm4_tunnel_register(&mplsip_handler, AF_MPLS); if (err < 0) { pr_info("%s: can't register tunnel\n", __func__); goto xfrm_tunnel_mplsip_failed; } #endif err = rtnl_link_register(&ipip_link_ops); if (err < 0) goto rtnl_link_failed; out: return err; rtnl_link_failed: #if IS_ENABLED(CONFIG_MPLS) xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS); xfrm_tunnel_mplsip_failed: #endif xfrm4_tunnel_deregister(&ipip_handler, AF_INET); xfrm_tunnel_ipip_failed: unregister_pernet_device(&ipip_net_ops); goto out; } static void __exit ipip_fini(void) { rtnl_link_unregister(&ipip_link_ops); if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) pr_info("%s: can't deregister tunnel\n", __func__); #if IS_ENABLED(CONFIG_MPLS) if (xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS)) pr_info("%s: can't deregister tunnel\n", __func__); #endif unregister_pernet_device(&ipip_net_ops); } module_init(ipip_init); module_exit(ipip_fini); MODULE_DESCRIPTION("IP/IP protocol decoder library"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("ipip"); MODULE_ALIAS_NETDEV("tunl0"); |
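/*
 * Illustrative sketch (an assumption, not part of ipip.c): the skb allocation
 * discipline described in the historical comment at the top of this file.
 * Headroom is reserved first, payload is appended with skb_put_data(), and a
 * header is later prepended with skb_push(). demo_build_skb is hypothetical
 * and the header fields are left mostly unset.
 */
#include <linux/ip.h>
#include <linux/skbuff.h>
#include <linux/string.h>

static struct sk_buff *demo_build_skb(const void *payload, unsigned int len)
{
	struct sk_buff *skb;
	struct iphdr *iph;

	skb = alloc_skb(sizeof(struct iphdr) + len, GFP_ATOMIC);
	if (!skb)
		return NULL;

	skb_reserve(skb, sizeof(struct iphdr));	/* headroom for the outer header */
	skb_put_data(skb, payload, len);	/* append payload; grows skb->len */

	iph = skb_push(skb, sizeof(struct iphdr));	/* prepend the header */
	memset(iph, 0, sizeof(*iph));
	iph->version = 4;
	iph->ihl = 5;

	return skb;
}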
909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 | /* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ #ifndef _LINUX_BPF_VERIFIER_H #define _LINUX_BPF_VERIFIER_H 1 #include <linux/bpf.h> /* for enum bpf_reg_type */ #include <linux/btf.h> /* for struct btf and btf_id() */ #include <linux/filter.h> /* for MAX_BPF_STACK */ #include <linux/tnum.h> /* Maximum variable offset umax_value permitted when resolving memory accesses. * In practice this is far bigger than any realistic pointer offset; this limit * ensures that umax_value + (int)off + (int)size cannot overflow a u64. */ #define BPF_MAX_VAR_OFF (1 << 29) /* Maximum variable size permitted for ARG_CONST_SIZE[_OR_ZERO]. 
This ensures * that converting umax_value to int cannot overflow. */ #define BPF_MAX_VAR_SIZ (1 << 29) /* size of tmp_str_buf in bpf_verifier. * we need at least 306 bytes to fit full stack mask representation * (in the "-8,-16,...,-512" form) */ #define TMP_STR_BUF_LEN 320 /* Patch buffer size */ #define INSN_BUF_SIZE 32 #define ITER_PREFIX "bpf_iter_" enum bpf_iter_state { BPF_ITER_STATE_INVALID, /* for non-first slot */ BPF_ITER_STATE_ACTIVE, BPF_ITER_STATE_DRAINED, }; struct bpf_reg_state { /* Ordering of fields matters. See states_equal() */ enum bpf_reg_type type; /* * Constant delta between "linked" scalars with the same ID. */ s32 delta; union { /* valid when type == PTR_TO_PACKET */ int range; /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | * PTR_TO_MAP_VALUE_OR_NULL */ struct { struct bpf_map *map_ptr; /* To distinguish map lookups from outer map * the map_uid is non-zero for registers * pointing to inner maps. */ u32 map_uid; }; /* for PTR_TO_BTF_ID */ struct { struct btf *btf; u32 btf_id; }; struct { /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */ u32 mem_size; u32 dynptr_id; /* for dynptr slices */ }; /* For dynptr stack slots */ struct { enum bpf_dynptr_type type; /* A dynptr is 16 bytes so it takes up 2 stack slots. * We need to track which slot is the first slot * to protect against cases where the user may try to * pass in an address starting at the second slot of the * dynptr. */ bool first_slot; } dynptr; /* For bpf_iter stack slots */ struct { /* BTF container and BTF type ID describing * struct bpf_iter_<type> of an iterator state */ struct btf *btf; u32 btf_id; /* packing following two fields to fit iter state into 16 bytes */ enum bpf_iter_state state:2; int depth:30; } iter; /* For irq stack slots */ struct { enum { IRQ_NATIVE_KFUNC, IRQ_LOCK_KFUNC, } kfunc_class; } irq; /* Max size from any of the above. */ struct { unsigned long raw1; unsigned long raw2; } raw; u32 subprogno; /* for PTR_TO_FUNC */ }; /* For scalar types (SCALAR_VALUE), this represents our knowledge of * the actual value. * For pointer types, this represents the variable part of the offset * from the pointed-to object, and is shared with all bpf_reg_states * with the same id as us. */ struct tnum var_off; /* Used to determine if any memory access using this register will * result in a bad access. * These refer to the same value as var_off, not necessarily the actual * contents of the register. */ s64 smin_value; /* minimum possible (s64)value */ s64 smax_value; /* maximum possible (s64)value */ u64 umin_value; /* minimum possible (u64)value */ u64 umax_value; /* maximum possible (u64)value */ s32 s32_min_value; /* minimum possible (s32)value */ s32 s32_max_value; /* maximum possible (s32)value */ u32 u32_min_value; /* minimum possible (u32)value */ u32 u32_max_value; /* maximum possible (u32)value */ /* For PTR_TO_PACKET, used to find other pointers with the same variable * offset, so they can share range knowledge. * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we * came from, when one is tested for != NULL. * For PTR_TO_MEM_OR_NULL this is used to identify memory allocation * for the purpose of tracking that it's freed. * For PTR_TO_SOCKET this is used to share which pointers retain the * same reference to the socket, to determine proper reference freeing. * For stack slots that are dynptrs, this is used to track references to * the dynptr to determine proper reference freeing. 
* Similarly to dynptrs, we use ID to track "belonging" of a reference * to a specific instance of bpf_iter. */ /* * Upper bit of ID is used to remember relationship between "linked" * registers. Example: * r1 = r2; both will have r1->id == r2->id == N * r1 += 10; r1->id == N | BPF_ADD_CONST and r1->delta == 10 * r3 = r2; both will have r3->id == r2->id == N * w3 += 10; r3->id == N | BPF_ADD_CONST32 and r3->delta == 10 */ #define BPF_ADD_CONST64 (1U << 31) #define BPF_ADD_CONST32 (1U << 30) #define BPF_ADD_CONST (BPF_ADD_CONST64 | BPF_ADD_CONST32) u32 id; /* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned * from a pointer-cast helper, bpf_sk_fullsock() and * bpf_tcp_sock(). * * Consider the following where "sk" is a reference counted * pointer returned from "sk = bpf_sk_lookup_tcp();": * * 1: sk = bpf_sk_lookup_tcp(); * 2: if (!sk) { return 0; } * 3: fullsock = bpf_sk_fullsock(sk); * 4: if (!fullsock) { bpf_sk_release(sk); return 0; } * 5: tp = bpf_tcp_sock(fullsock); * 6: if (!tp) { bpf_sk_release(sk); return 0; } * 7: bpf_sk_release(sk); * 8: snd_cwnd = tp->snd_cwnd; // verifier will complain * * After bpf_sk_release(sk) at line 7, both "fullsock" ptr and * "tp" ptr should be invalidated also. In order to do that, * the reg holding "fullsock" and "sk" need to remember * the original refcounted ptr id (i.e. sk_reg->id) in ref_obj_id * such that the verifier can reset all regs which have * ref_obj_id matching the sk_reg->id. * * sk_reg->ref_obj_id is set to sk_reg->id at line 1. * sk_reg->id will stay as NULL-marking purpose only. * After NULL-marking is done, sk_reg->id can be reset to 0. * * After "fullsock = bpf_sk_fullsock(sk);" at line 3, * fullsock_reg->ref_obj_id is set to sk_reg->ref_obj_id. * * After "tp = bpf_tcp_sock(fullsock);" at line 5, * tp_reg->ref_obj_id is set to fullsock_reg->ref_obj_id * which is the same as sk_reg->ref_obj_id. * * From the verifier perspective, if sk, fullsock and tp * are not NULL, they are the same ptr with different * reg->type. In particular, bpf_sk_release(tp) is also * allowed and has the same effect as bpf_sk_release(sk). */ u32 ref_obj_id; /* Inside the callee two registers can be both PTR_TO_STACK like * R1=fp-8 and R2=fp-8, but one of them points to this function stack * while another to the caller's stack. To differentiate them 'frameno' * is used which is an index in bpf_verifier_state->frame[] array * pointing to bpf_func_state. */ u32 frameno; /* Tracks subreg definition. The stored value is the insn_idx of the * writing insn. This is safe because subreg_def is used before any insn * patching which only happens after main verification finished. */ s32 subreg_def; /* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */ bool precise; }; enum bpf_stack_slot_type { STACK_INVALID, /* nothing was stored in this stack slot */ STACK_SPILL, /* register spilled into stack */ STACK_MISC, /* BPF program wrote some data into this slot */ STACK_ZERO, /* BPF program wrote constant zero */ /* A dynptr is stored in this stack slot. 
The type of dynptr * is stored in bpf_stack_state->spilled_ptr.dynptr.type */ STACK_DYNPTR, STACK_ITER, STACK_IRQ_FLAG, STACK_POISON, }; #define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ /* 4-byte stack slot granularity for liveness analysis */ #define BPF_HALF_REG_SIZE 4 #define STACK_SLOT_SZ 4 #define STACK_SLOTS (MAX_BPF_STACK / BPF_HALF_REG_SIZE) /* 128 */ typedef struct { u64 v[2]; } spis_t; #define SPIS_ZERO ((spis_t){}) #define SPIS_ALL ((spis_t){{ U64_MAX, U64_MAX }}) static inline bool spis_is_zero(spis_t s) { return s.v[0] == 0 && s.v[1] == 0; } static inline bool spis_equal(spis_t a, spis_t b) { return a.v[0] == b.v[0] && a.v[1] == b.v[1]; } static inline spis_t spis_or(spis_t a, spis_t b) { return (spis_t){{ a.v[0] | b.v[0], a.v[1] | b.v[1] }}; } static inline spis_t spis_and(spis_t a, spis_t b) { return (spis_t){{ a.v[0] & b.v[0], a.v[1] & b.v[1] }}; } static inline spis_t spis_not(spis_t s) { return (spis_t){{ ~s.v[0], ~s.v[1] }}; } static inline bool spis_test_bit(spis_t s, u32 slot) { return s.v[slot / 64] & BIT_ULL(slot % 64); } static inline void spis_or_range(spis_t *mask, u32 lo, u32 hi) { u32 w; for (w = lo; w <= hi && w < STACK_SLOTS; w++) mask->v[w / 64] |= BIT_ULL(w % 64); } #define BPF_REGMASK_ARGS ((1 << BPF_REG_1) | (1 << BPF_REG_2) | \ (1 << BPF_REG_3) | (1 << BPF_REG_4) | \ (1 << BPF_REG_5)) #define BPF_MAIN_FUNC (-1) #define BPF_DYNPTR_SIZE sizeof(struct bpf_dynptr_kern) #define BPF_DYNPTR_NR_SLOTS (BPF_DYNPTR_SIZE / BPF_REG_SIZE) struct bpf_stack_state { struct bpf_reg_state spilled_ptr; u8 slot_type[BPF_REG_SIZE]; }; struct bpf_reference_state { /* Each reference object has a type. Ensure REF_TYPE_PTR is zero to * default to pointer reference on zero initialization of a state. */ enum ref_state_type { REF_TYPE_PTR = (1 << 1), REF_TYPE_IRQ = (1 << 2), REF_TYPE_LOCK = (1 << 3), REF_TYPE_RES_LOCK = (1 << 4), REF_TYPE_RES_LOCK_IRQ = (1 << 5), REF_TYPE_LOCK_MASK = REF_TYPE_LOCK | REF_TYPE_RES_LOCK | REF_TYPE_RES_LOCK_IRQ, } type; /* Track each reference created with a unique id, even if the same * instruction creates the reference multiple times (eg, via CALL). */ int id; /* Instruction where the allocation of this reference occurred. This * is used purely to inform the user of a reference leak. */ int insn_idx; /* Use to keep track of the source object of a lock, to ensure * it matches on unlock. */ void *ptr; }; struct bpf_retval_range { s32 minval; s32 maxval; bool return_32bit; }; /* state of the program: * type of all registers and stack info */ struct bpf_func_state { struct bpf_reg_state regs[MAX_BPF_REG]; /* index of call instruction that called into this func */ int callsite; /* stack frame number of this function state from pov of * enclosing bpf_verifier_state. * 0 = main function, 1 = first callee. */ u32 frameno; /* subprog number == index within subprog_info * zero == main subprog */ u32 subprogno; /* Every bpf_timer_start will increment async_entry_cnt. * It's used to distinguish: * void foo(void) { for(;;); } * void foo(void) { bpf_timer_set_callback(,foo); } */ u32 async_entry_cnt; struct bpf_retval_range callback_ret_range; bool in_callback_fn; bool in_async_callback_fn; bool in_exception_callback_fn; /* For callback calling functions that limit number of possible * callback executions (e.g. bpf_loop) keeps track of current * simulated iteration number. * Value in frame N refers to number of times callback with frame * N+1 was simulated, e.g. 
for the following call: * * bpf_loop(..., fn, ...); | suppose current frame is N * | fn would be simulated in frame N+1 * | number of simulations is tracked in frame N */ u32 callback_depth; /* The following fields should be last. See copy_func_state() */ /* The state of the stack. Each element of the array describes BPF_REG_SIZE * (i.e. 8) bytes worth of stack memory. * stack[0] represents bytes [*(r10-8)..*(r10-1)] * stack[1] represents bytes [*(r10-16)..*(r10-9)] * ... * stack[allocated_stack/8 - 1] represents [*(r10-allocated_stack)..*(r10-allocated_stack+7)] */ struct bpf_stack_state *stack; /* Size of the current stack, in bytes. The stack state is tracked below, in * `stack`. allocated_stack is always a multiple of BPF_REG_SIZE. */ int allocated_stack; }; #define MAX_CALL_FRAMES 8 /* instruction history flags, used in bpf_jmp_history_entry.flags field */ enum { /* instruction references stack slot through PTR_TO_STACK register; * we also store stack's frame number in lower 3 bits (MAX_CALL_FRAMES is 8) * and accessed stack slot's index in next 6 bits (MAX_BPF_STACK is 512, * 8 bytes per slot, so slot index (spi) is [0, 63]) */ INSN_F_FRAMENO_MASK = 0x7, /* 3 bits */ INSN_F_SPI_MASK = 0x3f, /* 6 bits */ INSN_F_SPI_SHIFT = 3, /* shifted 3 bits to the left */ INSN_F_STACK_ACCESS = BIT(9), INSN_F_DST_REG_STACK = BIT(10), /* dst_reg is PTR_TO_STACK */ INSN_F_SRC_REG_STACK = BIT(11), /* src_reg is PTR_TO_STACK */ /* total 12 bits are used now. */ }; static_assert(INSN_F_FRAMENO_MASK + 1 >= MAX_CALL_FRAMES); static_assert(INSN_F_SPI_MASK + 1 >= MAX_BPF_STACK / 8); struct bpf_jmp_history_entry { u32 idx; /* insn idx can't be bigger than 1 million */ u32 prev_idx : 20; /* special INSN_F_xxx flags */ u32 flags : 12; /* additional registers that need precision tracking when this * jump is backtracked, vector of six 10-bit records */ u64 linked_regs; }; /* Maximum number of register states that can exist at once */ #define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) * MAX_CALL_FRAMES) struct bpf_verifier_state { /* call stack tracking */ struct bpf_func_state *frame[MAX_CALL_FRAMES]; struct bpf_verifier_state *parent; /* Acquired reference states */ struct bpf_reference_state *refs; /* * 'branches' field is the number of branches left to explore: * 0 - all possible paths from this state reached bpf_exit or * were safely pruned * 1 - at least one path is being explored. * This state hasn't reached bpf_exit * 2 - at least two paths are being explored. * This state is an immediate parent of two children. * One is fallthrough branch with branches==1 and another * state is pushed into stack (to be explored later) also with * branches==1. The parent of this state has branches==1. * The verifier state tree connected via 'parent' pointer looks like: * 1 * 1 * 2 -> 1 (first 'if' pushed into stack) * 1 * 2 -> 1 (second 'if' pushed into stack) * 1 * 1 * 1 bpf_exit. * * Once do_check() reaches bpf_exit, it calls update_branch_counts() * and the verifier state tree will look: * 1 * 1 * 2 -> 1 (first 'if' pushed into stack) * 1 * 1 -> 1 (second 'if' pushed into stack) * 0 * 0 * 0 bpf_exit. * After pop_stack() the do_check() will resume at second 'if'. * * If is_state_visited() sees a state with branches > 0 it means * there is a loop. If such state is exactly equal to the current state * it's an infinite loop. Note states_equal() checks for states * equivalency, so two states being 'states_equal' does not mean * infinite loop. 
The exact comparison is provided by * states_maybe_looping() function. It's a stronger pre-check and * much faster than states_equal(). * * This algorithm may not find all possible infinite loops or * loop iteration count may be too high. * In such cases BPF_COMPLEXITY_LIMIT_INSNS limit kicks in. */ u32 branches; u32 insn_idx; u32 curframe; u32 acquired_refs; u32 active_locks; u32 active_preempt_locks; u32 active_irq_id; u32 active_lock_id; void *active_lock_ptr; u32 active_rcu_locks; bool speculative; bool in_sleepable; /* first and last insn idx of this verifier state */ u32 first_insn_idx; u32 last_insn_idx; /* if this state is a backedge state then equal_state * records cached state to which this state is equal. */ struct bpf_verifier_state *equal_state; /* jmp history recorded from first to last. * backtracking is using it to go from last to first. * For most states jmp_history_cnt is [0-3]. * For loops can go up to ~40. */ struct bpf_jmp_history_entry *jmp_history; u32 jmp_history_cnt; u32 dfs_depth; u32 callback_unroll_depth; u32 may_goto_depth; }; #define bpf_get_spilled_reg(slot, frame, mask) \ (((slot < frame->allocated_stack / BPF_REG_SIZE) && \ ((1 << frame->stack[slot].slot_type[BPF_REG_SIZE - 1]) & (mask))) \ ? &frame->stack[slot].spilled_ptr : NULL) /* Iterate over 'frame', setting 'reg' to either NULL or a spilled register. */ #define bpf_for_each_spilled_reg(iter, frame, reg, mask) \ for (iter = 0, reg = bpf_get_spilled_reg(iter, frame, mask); \ iter < frame->allocated_stack / BPF_REG_SIZE; \ iter++, reg = bpf_get_spilled_reg(iter, frame, mask)) #define bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, __mask, __expr) \ ({ \ struct bpf_verifier_state *___vstate = __vst; \ int ___i, ___j; \ for (___i = 0; ___i <= ___vstate->curframe; ___i++) { \ struct bpf_reg_state *___regs; \ __state = ___vstate->frame[___i]; \ ___regs = __state->regs; \ for (___j = 0; ___j < MAX_BPF_REG; ___j++) { \ __reg = &___regs[___j]; \ (void)(__expr); \ } \ bpf_for_each_spilled_reg(___j, __state, __reg, __mask) { \ if (!__reg) \ continue; \ (void)(__expr); \ } \ } \ }) /* Invoke __expr over regsiters in __vst, setting __state and __reg */ #define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr) \ bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, 1 << STACK_SPILL, __expr) /* linked list of verifier states used to prune search */ struct bpf_verifier_state_list { struct bpf_verifier_state state; struct list_head node; u32 miss_cnt; u32 hit_cnt:31; u32 in_free_list:1; }; struct bpf_loop_inline_state { unsigned int initialized:1; /* set to true upon first entry */ unsigned int fit_for_inline:1; /* true if callback function is the same * at each call and flags are always zero */ u32 callback_subprogno; /* valid when fit_for_inline is true */ }; /* pointer and state for maps */ struct bpf_map_ptr_state { struct bpf_map *map_ptr; bool poison; bool unpriv; }; /* Possible states for alu_state member. */ #define BPF_ALU_SANITIZE_SRC (1U << 0) #define BPF_ALU_SANITIZE_DST (1U << 1) #define BPF_ALU_NEG_VALUE (1U << 2) #define BPF_ALU_NON_POINTER (1U << 3) #define BPF_ALU_IMMEDIATE (1U << 4) #define BPF_ALU_SANITIZE (BPF_ALU_SANITIZE_SRC | \ BPF_ALU_SANITIZE_DST) /* * An array of BPF instructions. * Primary usage: return value of bpf_insn_successors. 
*/ struct bpf_iarray { int cnt; u32 items[]; }; struct bpf_insn_aux_data { union { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ struct bpf_map_ptr_state map_ptr_state; s32 call_imm; /* saved imm field of call insn */ u32 alu_limit; /* limit for add/sub register with pointer */ struct { u32 map_index; /* index into used_maps[] */ u32 map_off; /* offset from value base address */ }; struct { enum bpf_reg_type reg_type; /* type of pseudo_btf_id */ union { struct { struct btf *btf; u32 btf_id; /* btf_id for struct typed var */ }; u32 mem_size; /* mem_size for non-struct typed var */ }; } btf_var; /* if instruction is a call to bpf_loop this field tracks * the state of the relevant registers to make decision about inlining */ struct bpf_loop_inline_state loop_inline_state; }; union { /* remember the size of type passed to bpf_obj_new to rewrite R1 */ u64 obj_new_size; /* remember the offset of node field within type to rewrite */ u64 insert_off; }; struct bpf_iarray *jt; /* jump table for gotox or bpf_tailcall call instruction */ struct btf_struct_meta *kptr_struct_meta; u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ u32 seen; /* this insn was processed by the verifier at env->pass_cnt */ bool nospec; /* do not execute this instruction speculatively */ bool nospec_result; /* result is unsafe under speculation, nospec must follow */ bool zext_dst; /* this insn zero extends dst reg */ bool needs_zext; /* alu op needs to clear upper bits */ bool non_sleepable; /* helper/kfunc may be called from non-sleepable context */ bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */ bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */ u8 alu_state; /* used in combination with alu_limit */ /* true if STX or LDX instruction is a part of a spill/fill * pattern for a bpf_fastcall call. */ u8 fastcall_pattern:1; /* for CALL instructions, a number of spill/fill pairs in the * bpf_fastcall pattern. */ u8 fastcall_spills_num:3; u8 arg_prog:4; /* below fields are initialized once */ unsigned int orig_idx; /* original instruction index */ u32 jmp_point:1; u32 prune_point:1; /* ensure we check state equivalence and save state checkpoint and * this instruction, regardless of any heuristics */ u32 force_checkpoint:1; /* true if instruction is a call to a helper function that * accepts callback function as a parameter. */ u32 calls_callback:1; u32 indirect_target:1; /* if it is an indirect jump target */ /* * CFG strongly connected component this instruction belongs to, * zero if it is a singleton SCC. */ u32 scc; /* registers alive before this instruction. */ u16 live_regs_before; /* * Bitmask of R0-R9 that hold known values at this instruction. * const_reg_mask: scalar constants that fit in 32 bits. * const_reg_map_mask: map pointers, val is map_index into used_maps[]. * const_reg_subprog_mask: subprog pointers, val is subprog number. * const_reg_vals[i] holds the 32-bit value for register i. * Populated by compute_const_regs() pre-pass. */ u16 const_reg_mask; u16 const_reg_map_mask; u16 const_reg_subprog_mask; u32 const_reg_vals[10]; }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ #define MAX_USED_BTFS 64 /* max number of BTFs accessed by one BPF program */ #define BPF_VERIFIER_TMP_LOG_SIZE 1024 struct bpf_verifier_log { /* Logical start and end positions of a "log window" of the verifier log. 
* start_pos == 0 means we haven't truncated anything. * Once truncation starts to happen, start_pos + len_total == end_pos, * except during log reset situations, in which (end_pos - start_pos) * might get smaller than len_total (see bpf_vlog_reset()). * Generally, (end_pos - start_pos) gives number of useful data in * user log buffer. */ u64 start_pos; u64 end_pos; char __user *ubuf; u32 level; u32 len_total; u32 len_max; char kbuf[BPF_VERIFIER_TMP_LOG_SIZE]; }; #define BPF_LOG_LEVEL1 1 #define BPF_LOG_LEVEL2 2 #define BPF_LOG_STATS 4 #define BPF_LOG_FIXED 8 #define BPF_LOG_LEVEL (BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2) #define BPF_LOG_MASK (BPF_LOG_LEVEL | BPF_LOG_STATS | BPF_LOG_FIXED) #define BPF_LOG_KERNEL (BPF_LOG_MASK + 1) /* kernel internal flag */ #define BPF_LOG_MIN_ALIGNMENT 8U #define BPF_LOG_ALIGNMENT 40U static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) { return log && log->level; } #define BPF_MAX_SUBPROGS 256 struct bpf_subprog_arg_info { enum bpf_arg_type arg_type; union { u32 mem_size; u32 btf_id; }; }; enum priv_stack_mode { PRIV_STACK_UNKNOWN, NO_PRIV_STACK, PRIV_STACK_ADAPTIVE, }; struct bpf_subprog_info { const char *name; /* name extracted from BTF */ u32 start; /* insn idx of function entry point */ u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u32 postorder_start; /* The idx to the env->cfg.insn_postorder */ u32 exit_idx; /* Index of one of the BPF_EXIT instructions in this subprogram */ u16 stack_depth; /* max. stack depth used by this function */ u16 stack_extra; /* offsets in range [stack_depth .. fastcall_stack_off) * are used for bpf_fastcall spills and fills. */ s16 fastcall_stack_off; bool has_tail_call: 1; bool tail_call_reachable: 1; bool has_ld_abs: 1; bool is_cb: 1; bool is_async_cb: 1; bool is_exception_cb: 1; bool args_cached: 1; /* true if bpf_fastcall stack region is used by functions that can't be inlined */ bool keep_fastcall_stack: 1; bool changes_pkt_data: 1; bool might_sleep: 1; u8 arg_cnt:3; enum priv_stack_mode priv_stack_mode; struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS]; }; struct bpf_verifier_env; struct backtrack_state { struct bpf_verifier_env *env; u32 frame; u32 reg_masks[MAX_CALL_FRAMES]; u64 stack_masks[MAX_CALL_FRAMES]; }; struct bpf_id_pair { u32 old; u32 cur; }; struct bpf_idmap { u32 tmp_id_gen; u32 cnt; struct bpf_id_pair map[BPF_ID_MAP_SIZE]; }; struct bpf_idset { u32 num_ids; struct { u32 id; u32 cnt; } entries[BPF_ID_MAP_SIZE]; }; /* see verifier.c:compute_scc_callchain() */ struct bpf_scc_callchain { /* call sites from bpf_verifier_state->frame[*]->callsite leading to this SCC */ u32 callsites[MAX_CALL_FRAMES - 1]; /* last frame in a chain is identified by SCC id */ u32 scc; }; /* verifier state waiting for propagate_backedges() */ struct bpf_scc_backedge { struct bpf_scc_backedge *next; struct bpf_verifier_state state; }; struct bpf_scc_visit { struct bpf_scc_callchain callchain; /* first state in current verification path that entered SCC * identified by the callchain */ struct bpf_verifier_state *entry_state; struct bpf_scc_backedge *backedges; /* list of backedges */ u32 num_backedges; }; /* An array of bpf_scc_visit structs sharing tht same bpf_scc_callchain->scc * but having different bpf_scc_callchain->callsites. 
*/ struct bpf_scc_info { u32 num_visits; struct bpf_scc_visit visits[]; }; struct bpf_liveness; /* single container for all structs * one verifier_env per bpf_check() call */ struct bpf_verifier_env { u32 insn_idx; u32 prev_insn_idx; struct bpf_prog *prog; /* eBPF program being verified */ const struct bpf_verifier_ops *ops; struct module *attach_btf_mod; /* The owner module of prog->aux->attach_btf */ struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ int stack_size; /* number of states to be processed */ bool strict_alignment; /* perform strict pointer alignment checks */ bool test_state_freq; /* test verifier with different pruning frequency */ bool test_reg_invariants; /* fail verification on register invariants violations */ struct bpf_verifier_state *cur_state; /* current verifier state */ /* Search pruning optimization, array of list_heads for * lists of struct bpf_verifier_state_list. */ struct list_head *explored_states; struct list_head free_list; /* list of struct bpf_verifier_state_list */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ struct btf_mod_pair used_btfs[MAX_USED_BTFS]; /* array of BTF's used by BPF program */ struct bpf_map *insn_array_maps[MAX_USED_MAPS]; /* array of INSN_ARRAY map's to be relocated */ u32 used_map_cnt; /* number of used maps */ u32 used_btf_cnt; /* number of used BTF objects */ u32 insn_array_map_cnt; /* number of used maps of type BPF_MAP_TYPE_INSN_ARRAY */ u32 id_gen; /* used to generate unique reg IDs */ u32 hidden_subprog_cnt; /* number of hidden subprogs */ int exception_callback_subprog; bool explore_alu_limits; bool allow_ptr_leaks; /* Allow access to uninitialized stack memory. Writes with fixed offset are * always allowed, so this refers to reads (with fixed or variable offset), * to writes with variable offset and to indirect (helper) accesses. */ bool allow_uninit_stack; bool bpf_capable; bool bypass_spec_v1; bool bypass_spec_v4; bool seen_direct_write; bool seen_exception; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ const struct bpf_line_info *prev_linfo; struct bpf_verifier_log log; struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 2]; /* max + 2 for the fake and exception subprogs */ /* subprog indices sorted in topological order: leaves first, callers last */ int subprog_topo_order[BPF_MAX_SUBPROGS + 2]; union { struct bpf_idmap idmap_scratch; struct bpf_idset idset_scratch; }; struct { int *insn_state; int *insn_stack; /* * vector of instruction indexes sorted in post-order, grouped by subprogram, * see bpf_subprog_info->postorder_start. */ int *insn_postorder; int cur_stack; /* current position in the insn_postorder vector */ int cur_postorder; } cfg; struct backtrack_state bt; struct bpf_jmp_history_entry *cur_hist_ent; /* Per-callsite copy of parent's converged at_stack_in for cross-frame fills. */ struct arg_track **callsite_at_stack; u32 pass_cnt; /* number of times do_check() was called */ u32 subprog_cnt; /* number of instructions analyzed by the verifier */ u32 prev_insn_processed, insn_processed; /* number of jmps, calls, exits analyzed so far */ u32 prev_jmps_processed, jmps_processed; /* total verification time */ u64 verification_time; /* maximum number of verifier states kept in 'branching' instructions */ u32 max_states_per_insn; /* total number of allocated verifier states */ u32 total_states; /* some states are freed during program analysis. * this is peak number of states. 
this number dominates kernel * memory consumption during verification */ u32 peak_states; /* longest register parentage chain walked for liveness marking */ u32 longest_mark_read_walk; u32 free_list_size; u32 explored_states_size; u32 num_backedges; bpfptr_t fd_array; /* bit mask to keep track of whether a register has been accessed * since the last time the function state was printed */ u32 scratched_regs; /* Same as scratched_regs but for stack slots */ u64 scratched_stack_slots; u64 prev_log_pos, prev_insn_print_pos; /* buffer used to temporary hold constants as scalar registers */ struct bpf_reg_state fake_reg[1]; /* buffers used to save updated reg states while simulating branches */ struct bpf_reg_state true_reg1, true_reg2, false_reg1, false_reg2; /* buffer used to generate temporary string representations, * e.g., in reg_type_str() to generate reg_type string */ char tmp_str_buf[TMP_STR_BUF_LEN]; struct bpf_insn insn_buf[INSN_BUF_SIZE]; struct bpf_insn epilogue_buf[INSN_BUF_SIZE]; struct bpf_scc_callchain callchain_buf; struct bpf_liveness *liveness; /* array of pointers to bpf_scc_info indexed by SCC id */ struct bpf_scc_info **scc_info; u32 scc_cnt; struct bpf_iarray *succ; struct bpf_iarray *gotox_tmp_buf; }; static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog) { return &env->prog->aux->func_info_aux[subprog]; } static inline struct bpf_subprog_info *subprog_info(struct bpf_verifier_env *env, int subprog) { return &env->subprog_info[subprog]; } struct bpf_call_summary { u8 num_params; bool is_void; bool fastcall; }; static inline bool bpf_helper_call(const struct bpf_insn *insn) { return insn->code == (BPF_JMP | BPF_CALL) && insn->src_reg == 0; } static inline bool bpf_pseudo_call(const struct bpf_insn *insn) { return insn->code == (BPF_JMP | BPF_CALL) && insn->src_reg == BPF_PSEUDO_CALL; } static inline bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn) { return insn->code == (BPF_JMP | BPF_CALL) && insn->src_reg == BPF_PSEUDO_KFUNC_CALL; } __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, va_list args); __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, const char *fmt, ...); __printf(2, 3) void bpf_log(struct bpf_verifier_log *log, const char *fmt, ...); int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level, char __user *log_buf, u32 log_size); void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos); int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual); __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env, u32 insn_off, const char *prefix_fmt, ...); #define verifier_bug_if(cond, env, fmt, args...) \ ({ \ bool __cond = (cond); \ if (unlikely(__cond)) \ verifier_bug(env, fmt " (" #cond ")", ##args); \ (__cond); \ }) #define verifier_bug(env, fmt, args...) 
\ ({ \ BPF_WARN_ONCE(1, "verifier bug: " fmt "\n", ##args); \ bpf_log(&env->log, "verifier bug: " fmt "\n", ##args); \ }) static inline void mark_prune_point(struct bpf_verifier_env *env, int idx) { env->insn_aux_data[idx].prune_point = true; } static inline bool bpf_is_prune_point(struct bpf_verifier_env *env, int insn_idx) { return env->insn_aux_data[insn_idx].prune_point; } static inline void mark_force_checkpoint(struct bpf_verifier_env *env, int idx) { env->insn_aux_data[idx].force_checkpoint = true; } static inline bool bpf_is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx) { return env->insn_aux_data[insn_idx].force_checkpoint; } static inline void mark_calls_callback(struct bpf_verifier_env *env, int idx) { env->insn_aux_data[idx].calls_callback = true; } static inline bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx) { return env->insn_aux_data[insn_idx].calls_callback; } static inline void mark_jmp_point(struct bpf_verifier_env *env, int idx) { env->insn_aux_data[idx].jmp_point = true; } static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { struct bpf_verifier_state *cur = env->cur_state; return cur->frame[cur->curframe]; } static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) { return cur_func(env)->regs; } int bpf_prog_offload_verifier_prep(struct bpf_prog *prog); int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx); int bpf_prog_offload_finalize(struct bpf_verifier_env *env); void bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off, struct bpf_insn *insn); void bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, struct btf *btf, u32 btf_id) { if (tgt_prog) return ((u64)tgt_prog->aux->id << 32) | btf_id; else return ((u64)btf_obj_id(btf) << 32) | 0x80000000 | btf_id; } /* unpack the IDs from the key as constructed above */ static inline void bpf_trampoline_unpack_key(u64 key, u32 *obj_id, u32 *btf_id) { if (obj_id) *obj_id = key >> 32; if (btf_id) *btf_id = key & 0x7FFFFFFF; } int bpf_check_btf_info_early(struct bpf_verifier_env *env, const union bpf_attr *attr, bpfptr_t uattr); int bpf_check_btf_info(struct bpf_verifier_env *env, const union bpf_attr *attr, bpfptr_t uattr); int bpf_check_attach_target(struct bpf_verifier_log *log, const struct bpf_prog *prog, const struct bpf_prog *tgt_prog, u32 btf_id, struct bpf_attach_target_info *tgt_info); void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab); int mark_chain_precision(struct bpf_verifier_env *env, int regno); int bpf_is_state_visited(struct bpf_verifier_env *env, int insn_idx); int bpf_update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st); void bpf_clear_jmp_history(struct bpf_verifier_state *state); int bpf_copy_verifier_state(struct bpf_verifier_state *dst_state, const struct bpf_verifier_state *src); struct list_head *bpf_explored_state(struct bpf_verifier_env *env, int idx); void bpf_free_verifier_state(struct bpf_verifier_state *state, bool free_self); void bpf_free_backedges(struct bpf_scc_visit *visit); int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, int insn_flags, u64 linked_regs); void bpf_bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist); void bpf_mark_reg_not_init(const struct 
bpf_verifier_env *env, struct bpf_reg_state *reg); void bpf_mark_reg_unknown_imprecise(struct bpf_reg_state *reg); void bpf_mark_all_scalars_precise(struct bpf_verifier_env *env, struct bpf_verifier_state *st); void bpf_clear_singular_ids(struct bpf_verifier_env *env, struct bpf_verifier_state *st); int bpf_mark_chain_precision(struct bpf_verifier_env *env, struct bpf_verifier_state *starting_state, int regno, bool *changed); static inline int bpf_get_spi(s32 off) { return (-off - 1) / BPF_REG_SIZE; } static inline struct bpf_func_state *bpf_func(struct bpf_verifier_env *env, const struct bpf_reg_state *reg) { struct bpf_verifier_state *cur = env->cur_state; return cur->frame[reg->frameno]; } /* Return IP for a given frame in a call stack */ static inline u32 bpf_frame_insn_idx(struct bpf_verifier_state *st, u32 frame) { return frame == st->curframe ? st->insn_idx : st->frame[frame + 1]->callsite; } static inline bool bpf_is_jmp_point(struct bpf_verifier_env *env, int insn_idx) { return env->insn_aux_data[insn_idx].jmp_point; } static inline bool bpf_is_spilled_reg(const struct bpf_stack_state *stack) { return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL; } static inline bool bpf_is_spilled_scalar_reg(const struct bpf_stack_state *stack) { return bpf_is_spilled_reg(stack) && stack->spilled_ptr.type == SCALAR_VALUE; } static inline bool bpf_register_is_null(struct bpf_reg_state *reg) { return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); } static inline void bpf_bt_set_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg) { bt->reg_masks[frame] |= 1 << reg; } static inline void bpf_bt_set_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot) { bt->stack_masks[frame] |= 1ull << slot; } static inline bool bt_is_frame_reg_set(struct backtrack_state *bt, u32 frame, u32 reg) { return bt->reg_masks[frame] & (1 << reg); } static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u32 slot) { return bt->stack_masks[frame] & (1ull << slot); } bool bpf_map_is_rdonly(const struct bpf_map *map); int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val, bool is_ldsx); #define BPF_BASE_TYPE_MASK GENMASK(BPF_BASE_TYPE_BITS - 1, 0) /* extract base type from bpf_{arg, return, reg}_type. */ static inline u32 base_type(u32 type) { return type & BPF_BASE_TYPE_MASK; } /* extract flags from an extended type. See bpf_type_flag in bpf.h. */ static inline u32 type_flag(u32 type) { return type & ~BPF_BASE_TYPE_MASK; } /* only use after check_attach_btf_id() */ static inline enum bpf_prog_type resolve_prog_type(const struct bpf_prog *prog) { return (prog->type == BPF_PROG_TYPE_EXT && prog->aux->saved_dst_prog_type) ? 
prog->aux->saved_dst_prog_type : prog->type; } static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) { switch (resolve_prog_type(prog)) { case BPF_PROG_TYPE_TRACING: return prog->expected_attach_type != BPF_TRACE_ITER; case BPF_PROG_TYPE_STRUCT_OPS: return prog->aux->jits_use_priv_stack; case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_SYSCALL: return false; default: return true; } } #define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | PTR_TRUSTED | NON_OWN_REF) static inline bool bpf_type_has_unsafe_modifiers(u32 type) { return type_flag(type) & ~BPF_REG_TRUSTED_MODIFIERS; } static inline bool type_is_ptr_alloc_obj(u32 type) { return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC; } static inline bool type_is_non_owning_ref(u32 type) { return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF; } static inline bool type_is_pkt_pointer(enum bpf_reg_type type) { type = base_type(type); return type == PTR_TO_PACKET || type == PTR_TO_PACKET_META; } static inline bool type_is_sk_pointer(enum bpf_reg_type type) { return type == PTR_TO_SOCKET || type == PTR_TO_SOCK_COMMON || type == PTR_TO_TCP_SOCK || type == PTR_TO_XDP_SOCK; } static inline bool type_may_be_null(u32 type) { return type & PTR_MAYBE_NULL; } static inline void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno) { env->scratched_regs |= 1U << regno; } static inline void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi) { env->scratched_stack_slots |= 1ULL << spi; } static inline bool reg_scratched(const struct bpf_verifier_env *env, u32 regno) { return (env->scratched_regs >> regno) & 1; } static inline bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno) { return (env->scratched_stack_slots >> regno) & 1; } static inline bool verifier_state_scratched(const struct bpf_verifier_env *env) { return env->scratched_regs || env->scratched_stack_slots; } static inline void mark_verifier_state_clean(struct bpf_verifier_env *env) { env->scratched_regs = 0U; env->scratched_stack_slots = 0ULL; } /* Used for printing the entire verifier state. 
*/ static inline void mark_verifier_state_scratched(struct bpf_verifier_env *env) { env->scratched_regs = ~0U; env->scratched_stack_slots = ~0ULL; } static inline bool bpf_stack_narrow_access_ok(int off, int fill_size, int spill_size) { #ifdef __BIG_ENDIAN off -= spill_size - fill_size; #endif return !(off % BPF_REG_SIZE); } static inline bool insn_is_gotox(struct bpf_insn *insn) { return BPF_CLASS(insn->code) == BPF_JMP && BPF_OP(insn->code) == BPF_JA && BPF_SRC(insn->code) == BPF_X; } const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type); const char *dynptr_type_str(enum bpf_dynptr_type type); const char *iter_type_str(const struct btf *btf, u32 btf_id); const char *iter_state_str(enum bpf_iter_state state); void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, u32 frameno, bool print_all); void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, u32 frameno); u32 bpf_vlog_alignment(u32 pos); struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off); int bpf_jmp_offset(struct bpf_insn *insn); struct bpf_iarray *bpf_insn_successors(struct bpf_verifier_env *env, u32 idx); void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask); bool bpf_subprog_is_global(const struct bpf_verifier_env *env, int subprog); int bpf_find_subprog(struct bpf_verifier_env *env, int off); int bpf_compute_const_regs(struct bpf_verifier_env *env); int bpf_prune_dead_branches(struct bpf_verifier_env *env); int bpf_check_cfg(struct bpf_verifier_env *env); int bpf_compute_postorder(struct bpf_verifier_env *env); int bpf_compute_scc(struct bpf_verifier_env *env); struct bpf_map_desc { struct bpf_map *ptr; int uid; }; struct bpf_kfunc_call_arg_meta { /* In parameters */ struct btf *btf; u32 func_id; u32 kfunc_flags; const struct btf_type *func_proto; const char *func_name; /* Out parameters */ u32 ref_obj_id; u8 release_regno; bool r0_rdonly; u32 ret_btf_id; u64 r0_size; u32 subprogno; struct { u64 value; bool found; } arg_constant; /* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling, * generally to pass info about user-defined local kptr types to later * verification logic * bpf_obj_drop/bpf_percpu_obj_drop * Record the local kptr type to be drop'd * bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type) * Record the local kptr type to be refcount_incr'd and use * arg_owning_ref to determine whether refcount_acquire should be * fallible */ struct btf *arg_btf; u32 arg_btf_id; bool arg_owning_ref; bool arg_prog; struct { struct btf_field *field; } arg_list_head; struct { struct btf_field *field; } arg_rbtree_root; struct { enum bpf_dynptr_type type; u32 id; u32 ref_obj_id; } initialized_dynptr; struct { u8 spi; u8 frameno; } iter; struct bpf_map_desc map; u64 mem_size; }; int bpf_get_helper_proto(struct bpf_verifier_env *env, int func_id, const struct bpf_func_proto **ptr); int bpf_fetch_kfunc_arg_meta(struct bpf_verifier_env *env, s32 func_id, s16 offset, struct bpf_kfunc_call_arg_meta *meta); bool bpf_is_async_callback_calling_insn(struct bpf_insn *insn); bool bpf_is_sync_callback_calling_insn(struct bpf_insn *insn); static inline bool bpf_is_iter_next_kfunc(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_ITER_NEXT; } static inline bool bpf_is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_SLEEPABLE; } bool bpf_is_kfunc_pkt_changing(struct bpf_kfunc_call_arg_meta *meta); struct bpf_iarray 
*bpf_iarray_realloc(struct bpf_iarray *old, size_t n_elem); int bpf_copy_insn_array_uniq(struct bpf_map *map, u32 start, u32 end, u32 *off); bool bpf_insn_is_cond_jump(u8 code); bool bpf_is_may_goto_insn(struct bpf_insn *insn); void bpf_verbose_insn(struct bpf_verifier_env *env, struct bpf_insn *insn); bool bpf_get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call, struct bpf_call_summary *cs); s64 bpf_helper_stack_access_bytes(struct bpf_verifier_env *env, struct bpf_insn *insn, int arg, int insn_idx); s64 bpf_kfunc_stack_access_bytes(struct bpf_verifier_env *env, struct bpf_insn *insn, int arg, int insn_idx); int bpf_compute_subprog_arg_access(struct bpf_verifier_env *env); int bpf_stack_liveness_init(struct bpf_verifier_env *env); void bpf_stack_liveness_free(struct bpf_verifier_env *env); int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_state *st); bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 spi); int bpf_compute_live_registers(struct bpf_verifier_env *env); #define BPF_MAP_KEY_POISON (1ULL << 63) #define BPF_MAP_KEY_SEEN (1ULL << 62) static inline bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) { return aux->map_ptr_state.poison; } static inline bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux) { return aux->map_ptr_state.unpriv; } static inline bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux) { return aux->map_key_state & BPF_MAP_KEY_POISON; } static inline bool bpf_map_key_unseen(const struct bpf_insn_aux_data *aux) { return !(aux->map_key_state & BPF_MAP_KEY_SEEN); } static inline u64 bpf_map_key_immediate(const struct bpf_insn_aux_data *aux) { return aux->map_key_state & ~(BPF_MAP_KEY_SEEN | BPF_MAP_KEY_POISON); } #define MAX_PACKET_OFF 0xffff #define CALLER_SAVED_REGS 6 enum bpf_reg_arg_type { SRC_OP, /* register is used as source operand */ DST_OP, /* register is used as destination operand */ DST_OP_NO_MARK /* same as above, check only, don't mark */ }; #define MAX_KFUNC_DESCS 256 struct bpf_kfunc_desc { struct btf_func_model func_model; u32 func_id; s32 imm; u16 offset; unsigned long addr; }; struct bpf_kfunc_desc_tab { /* Sorted by func_id (BTF ID) and offset (fd_array offset) during * verification. JITs do lookups by bpf_insn, where func_id may not be * available, therefore at the end of verification do_misc_fixups() * sorts this by imm and offset. 
 */
	struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS];
	u32 nr_descs;
};

/* Functions exported from verifier.c, used by fixups.c */
bool bpf_is_reg64(struct bpf_insn *insn, u32 regno, struct bpf_reg_state *reg,
		  enum bpf_reg_arg_type t);
void bpf_clear_insn_aux_data(struct bpf_verifier_env *env, int start, int len);
void bpf_mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog);
bool bpf_allow_tail_call_in_subprogs(struct bpf_verifier_env *env);
bool bpf_verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm);
int bpf_add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, u16 offset);
int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
			 struct bpf_insn *insn_buf, int insn_idx, int *cnt);

/* Functions in fixups.c, called from bpf_check() */
int bpf_remove_fastcall_spills_fills(struct bpf_verifier_env *env);
int bpf_optimize_bpf_loop(struct bpf_verifier_env *env);
void bpf_opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env);
int bpf_opt_remove_dead_code(struct bpf_verifier_env *env);
int bpf_opt_remove_nops(struct bpf_verifier_env *env);
int bpf_opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
				      const union bpf_attr *attr);
int bpf_convert_ctx_accesses(struct bpf_verifier_env *env);
int bpf_jit_subprogs(struct bpf_verifier_env *env);
int bpf_fixup_call_args(struct bpf_verifier_env *env);
int bpf_do_misc_fixups(struct bpf_verifier_env *env);

#endif /* _LINUX_BPF_VERIFIER_H */
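/*
 * Editorial example, not part of the original header: a minimal sketch of
 * how the spis_t helpers defined earlier (SPIS_ZERO, spis_or_range(),
 * spis_test_bit(), spis_and(), spis_not(), spis_is_zero()) compose to track
 * 4-byte stack slots.  The slot numbers are chosen arbitrarily for
 * illustration.
 *
 *	spis_t written = SPIS_ZERO;
 *	spis_t untouched;
 *
 *	// suppose an instruction writes 4-byte slots 2 and 3
 *	spis_or_range(&written, 2, 3);
 *
 *	spis_test_bit(written, 2);	// true
 *	spis_test_bit(written, 5);	// false
 *
 *	// slots never written on this path
 *	untouched = spis_and(spis_not(written), SPIS_ALL);
 *	spis_is_zero(untouched);	// false: most slots are still untouched
 */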
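/*
 * Editorial example, not part of the original header: the usual shape of a
 * walk over all registers of the current verifier state with
 * bpf_for_each_reg_in_vstate(), e.g. to invalidate every copy of a released
 * pointer as described in the ref_obj_id comment earlier in this header.
 * 'env' and 'ref_obj_id' are assumed to be in scope; the callback body is
 * only a sketch, not the verifier's exact invalidation logic.
 *
 *	struct bpf_func_state *state;
 *	struct bpf_reg_state *reg;
 *
 *	bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
 *		if (reg->ref_obj_id == ref_obj_id)
 *			bpf_mark_reg_unknown_imprecise(reg);
 *	}));
 */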
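/*
 * Editorial example, not part of the original header: round-tripping a
 * trampoline key through bpf_trampoline_compute_key() and
 * bpf_trampoline_unpack_key() defined above.  'btf' stands for any loaded
 * BTF object and 42 is an arbitrary BTF type id.
 *
 *	u32 obj_id, btf_id;
 *	u64 key = bpf_trampoline_compute_key(NULL, btf, 42);
 *
 *	// key == ((u64)btf_obj_id(btf) << 32) | 0x80000000 | 42
 *	bpf_trampoline_unpack_key(key, &obj_id, &btf_id);
 *	// obj_id == btf_obj_id(btf), btf_id == 42 (the 0x80000000 marker is masked off)
 */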
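/*
 * Editorial example, not part of the original header: how base_type() and
 * type_flag() decompose an extended register type, and how the type_is_*()
 * predicates above build on that.  The flag values come from enum
 * bpf_type_flag in bpf.h.
 *
 *	u32 t = PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL;
 *
 *	base_type(t);			// PTR_TO_BTF_ID
 *	type_flag(t);			// MEM_ALLOC | PTR_MAYBE_NULL
 *	type_may_be_null(t);		// true
 *	type_is_ptr_alloc_obj(t);	// true: PTR_TO_BTF_ID with MEM_ALLOC set
 */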
// SPDX-License-Identifier: GPL-2.0
/* * linux/fs/ext4/ialloc.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * BSD ufs-inspired inode and directory allocation by * Stephen Tweedie (sct@redhat.com), 1993 * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 */ #include <linux/time.h> #include <linux/fs.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/random.h> #include <linux/bitops.h> #include <linux/blkdev.h> #include <linux/cred.h> #include <asm/byteorder.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include <trace/events/ext4.h> /* * ialloc.c contains the inodes allocation and deallocation routines */ /* * The free inodes are managed by bitmaps. A file system contains several * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap * block for inodes, N blocks for the inode table and data blocks. * * The file system contains group descriptors which are located after the * super block. Each descriptor contains the number of the bitmap block and * the free blocks count in the block. */ /* * To avoid calling the atomic setbit hundreds or thousands of times, we only * need to use it within a single byte (to ensure we get endianness right). * We can use memset for the rest of the bitmap as there are no other users. */ void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap) { int i; if (start_bit >= end_bit) return; ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit); for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++) ext4_set_bit(i, bitmap); if (i < end_bit) memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); } void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate) { if (uptodate) { set_buffer_uptodate(bh); set_bitmap_uptodate(bh); } unlock_buffer(bh); put_bh(bh); } static int ext4_validate_inode_bitmap(struct super_block *sb, struct ext4_group_desc *desc, ext4_group_t block_group, struct buffer_head *bh) { ext4_fsblk_t blk; struct ext4_group_info *grp; if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) return 0; if (buffer_verified(bh)) return 0; grp = ext4_get_group_info(sb, block_group); if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) return -EFSCORRUPTED; ext4_lock_group(sb, block_group); if (buffer_verified(bh)) goto verified; blk = ext4_inode_bitmap(sb, desc); if (!ext4_inode_bitmap_csum_verify(sb, desc, bh) || ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) { ext4_unlock_group(sb, block_group); ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " "inode_bitmap = %llu", block_group, blk); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); return -EFSBADCRC; } set_buffer_verified(bh); verified: ext4_unlock_group(sb, block_group); return 0; } /* * Read the inode allocation bitmap for a given block_group, reading * into the specified slot in the superblock's bitmap cache. * * Return buffer_head of bitmap on success, or an ERR_PTR on error. 
*/ static struct buffer_head * ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) { struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh = NULL; ext4_fsblk_t bitmap_blk; int err; desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) return ERR_PTR(-EFSCORRUPTED); bitmap_blk = ext4_inode_bitmap(sb, desc); if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || (bitmap_blk >= ext4_blocks_count(sbi->s_es))) { ext4_error(sb, "Invalid inode bitmap blk %llu in " "block_group %u", bitmap_blk, block_group); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); return ERR_PTR(-EFSCORRUPTED); } bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { ext4_warning(sb, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); return ERR_PTR(-ENOMEM); } if (bitmap_uptodate(bh)) goto verify; lock_buffer(bh); if (bitmap_uptodate(bh)) { unlock_buffer(bh); goto verify; } ext4_lock_group(sb, block_group); if (ext4_has_group_desc_csum(sb) && (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) { if (block_group == 0) { ext4_unlock_group(sb, block_group); unlock_buffer(bh); ext4_error(sb, "Inode bitmap for bg 0 marked " "uninitialized"); err = -EFSCORRUPTED; goto out; } memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, bh->b_data); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); set_buffer_verified(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); return bh; } ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { /* * if not uninit if bh is uptodate, * bitmap is also uptodate */ set_bitmap_uptodate(bh); unlock_buffer(bh); goto verify; } /* * submit the buffer_head for reading */ trace_ext4_load_inode_bitmap(sb, block_group); ext4_read_bh(bh, REQ_META | REQ_PRIO, ext4_end_bitmap_read, ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_EIO)); if (!buffer_uptodate(bh)) { put_bh(bh); ext4_error_err(sb, EIO, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); return ERR_PTR(-EIO); } verify: err = ext4_validate_inode_bitmap(sb, desc, block_group, bh); if (err) goto out; return bh; out: put_bh(bh); return ERR_PTR(err); } /* * NOTE! When we get the inode, we're the only people * that have access to it, and as such there are no * race conditions we have to worry about. The inode * is not on the hash-lists, and it cannot be reached * through the filesystem because the directory entry * has been deleted earlier. * * HOWEVER: we must make sure that we get no aliases, * which means that we have to call "clear_inode()" * _before_ we mark the inode not in use in the inode * bitmaps. Otherwise a newly created file might use * the same inode number (not actually the same pointer * though), and then we'd have two inodes sharing the * same inode number and space on the harddisk. 
*/ void ext4_free_inode(handle_t *handle, struct inode *inode) { struct super_block *sb = inode->i_sb; int is_directory; unsigned long ino; struct buffer_head *bitmap_bh = NULL; struct buffer_head *bh2; ext4_group_t block_group; unsigned long bit; struct ext4_group_desc *gdp; struct ext4_super_block *es; struct ext4_sb_info *sbi; int fatal = 0, err, count, cleared; struct ext4_group_info *grp; if (!sb) { printk(KERN_ERR "EXT4-fs: %s:%d: inode on " "nonexistent device\n", __func__, __LINE__); return; } if (icount_read(inode) > 1) { ext4_msg(sb, KERN_ERR, "%s:%d: inode #%llu: count=%d", __func__, __LINE__, inode->i_ino, icount_read(inode)); return; } if (inode->i_nlink) { ext4_msg(sb, KERN_ERR, "%s:%d: inode #%llu: nlink=%d\n", __func__, __LINE__, inode->i_ino, inode->i_nlink); return; } sbi = EXT4_SB(sb); ino = inode->i_ino; ext4_debug("freeing inode %lu\n", ino); trace_ext4_free_inode(inode); dquot_initialize(inode); dquot_free_inode(inode); is_directory = S_ISDIR(inode->i_mode); /* Do this BEFORE marking the inode not in use or returning an error */ ext4_clear_inode(inode); es = sbi->s_es; if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { ext4_error(sb, "reserved or nonexistent inode %lu", ino); goto error_return; } block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); /* Don't bother if the inode bitmap is corrupt. */ if (IS_ERR(bitmap_bh)) { fatal = PTR_ERR(bitmap_bh); bitmap_bh = NULL; goto error_return; } if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { grp = ext4_get_group_info(sb, block_group); if (!grp || unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) { fatal = -EFSCORRUPTED; goto error_return; } } BUFFER_TRACE(bitmap_bh, "get_write_access"); fatal = ext4_journal_get_write_access(handle, sb, bitmap_bh, EXT4_JTR_NONE); if (fatal) goto error_return; fatal = -ESRCH; gdp = ext4_get_group_desc(sb, block_group, &bh2); if (gdp) { BUFFER_TRACE(bh2, "get_write_access"); fatal = ext4_journal_get_write_access(handle, sb, bh2, EXT4_JTR_NONE); } ext4_lock_group(sb, block_group); cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data); if (fatal || !cleared) { ext4_unlock_group(sb, block_group); goto out; } count = ext4_free_inodes_count(sb, gdp) + 1; ext4_free_inodes_set(sb, gdp, count); if (is_directory) { count = ext4_used_dirs_count(sb, gdp) - 1; ext4_used_dirs_set(sb, gdp, count); if (percpu_counter_initialized(&sbi->s_dirs_counter)) percpu_counter_dec(&sbi->s_dirs_counter); } ext4_inode_bitmap_csum_set(sb, gdp, bitmap_bh); ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); if (percpu_counter_initialized(&sbi->s_freeinodes_counter)) percpu_counter_inc(&sbi->s_freeinodes_counter); if (sbi->s_log_groups_per_flex) { struct flex_groups *fg; fg = sbi_array_rcu_deref(sbi, s_flex_groups, ext4_flex_group(sbi, block_group)); atomic_inc(&fg->free_inodes); if (is_directory) atomic_dec(&fg->used_dirs); } BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); fatal = ext4_handle_dirty_metadata(handle, NULL, bh2); out: if (cleared) { BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!fatal) fatal = err; } else { ext4_error(sb, "bit already cleared for inode %lu", ino); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); } error_return: brelse(bitmap_bh); ext4_std_error(sb, fatal); } struct orlov_stats { __u64 free_clusters; __u32 free_inodes; __u32 
used_dirs; }; /* * Helper function for Orlov's allocator; returns critical information * for a particular block group or flex_bg. If flex_size is 1, then g * is a block group number; otherwise it is flex_bg number. */ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, int flex_size, struct orlov_stats *stats) { struct ext4_group_desc *desc; if (flex_size > 1) { struct flex_groups *fg = sbi_array_rcu_deref(EXT4_SB(sb), s_flex_groups, g); stats->free_inodes = atomic_read(&fg->free_inodes); stats->free_clusters = atomic64_read(&fg->free_clusters); stats->used_dirs = atomic_read(&fg->used_dirs); return; } desc = ext4_get_group_desc(sb, g, NULL); if (desc) { stats->free_inodes = ext4_free_inodes_count(sb, desc); stats->free_clusters = ext4_free_group_clusters(sb, desc); stats->used_dirs = ext4_used_dirs_count(sb, desc); } else { stats->free_inodes = 0; stats->free_clusters = 0; stats->used_dirs = 0; } } /* * Orlov's allocator for directories. * * We always try to spread first-level directories. * * If there are blockgroups with both free inodes and free clusters counts * not worse than average we return one with smallest directory count. * Otherwise we simply return a random group. * * For the rest rules look so: * * It's OK to put directory into a group unless * it has too many directories already (max_dirs) or * it has too few free inodes left (min_inodes) or * it has too few free clusters left (min_clusters) or * Parent's group is preferred, if it doesn't satisfy these * conditions we search cyclically through the rest. If none * of the groups look good we just look for a group with more * free inodes than average (starting at parent's group). */ static int find_group_orlov(struct super_block *sb, struct inode *parent, ext4_group_t *group, umode_t mode, const struct qstr *qstr) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t real_ngroups = ext4_get_groups_count(sb); int inodes_per_group = EXT4_INODES_PER_GROUP(sb); unsigned int freei, avefreei, grp_free; ext4_fsblk_t freec, avefreec; unsigned int ndirs; int max_dirs, min_inodes; ext4_grpblk_t min_clusters; ext4_group_t i, grp, g, ngroups; struct ext4_group_desc *desc; struct orlov_stats stats; int flex_size = ext4_flex_bg_size(sbi); struct dx_hash_info hinfo; ngroups = real_ngroups; if (flex_size > 1) { ngroups = (real_ngroups + flex_size - 1) >> sbi->s_log_groups_per_flex; parent_group >>= sbi->s_log_groups_per_flex; } freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); avefreei = freei / ngroups; freec = percpu_counter_read_positive(&sbi->s_freeclusters_counter); avefreec = freec; do_div(avefreec, ngroups); ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); if (S_ISDIR(mode) && ((parent == d_inode(sb->s_root)) || (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) { int best_ndir = inodes_per_group; int ret = -1; if (qstr) { hinfo.hash_version = DX_HASH_HALF_MD4; hinfo.seed = sbi->s_hash_seed; ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo); parent_group = hinfo.hash % ngroups; } else parent_group = get_random_u32_below(ngroups); for (i = 0; i < ngroups; i++) { g = (parent_group + i) % ngroups; get_orlov_stats(sb, g, flex_size, &stats); if (!stats.free_inodes) continue; if (stats.used_dirs >= best_ndir) continue; if (stats.free_inodes < avefreei) continue; if (stats.free_clusters < avefreec) continue; grp = g; ret = 0; best_ndir = stats.used_dirs; } if (ret) goto fallback; found_flex_bg: if (flex_size == 1) { *group = grp; return 
0; } /* * We pack inodes at the beginning of the flexgroup's * inode tables. Block allocation decisions will do * something similar, although regular files will * start at 2nd block group of the flexgroup. See * ext4_ext_find_goal() and ext4_find_near(). */ grp *= flex_size; for (i = 0; i < flex_size; i++) { if (grp+i >= real_ngroups) break; desc = ext4_get_group_desc(sb, grp+i, NULL); if (desc && ext4_free_inodes_count(sb, desc)) { *group = grp+i; return 0; } } goto fallback; } max_dirs = ndirs / ngroups + inodes_per_group*flex_size / 16; min_inodes = avefreei - inodes_per_group*flex_size / 4; if (min_inodes < 1) min_inodes = 1; min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4; if (min_clusters < 0) min_clusters = 0; /* * Start looking in the flex group where we last allocated an * inode for this parent directory */ if (EXT4_I(parent)->i_last_alloc_group != ~0) { parent_group = EXT4_I(parent)->i_last_alloc_group; if (flex_size > 1) parent_group >>= sbi->s_log_groups_per_flex; } for (i = 0; i < ngroups; i++) { grp = (parent_group + i) % ngroups; get_orlov_stats(sb, grp, flex_size, &stats); if (stats.used_dirs >= max_dirs) continue; if (stats.free_inodes < min_inodes) continue; if (stats.free_clusters < min_clusters) continue; goto found_flex_bg; } fallback: ngroups = real_ngroups; avefreei = freei / ngroups; fallback_retry: parent_group = EXT4_I(parent)->i_block_group; for (i = 0; i < ngroups; i++) { grp = (parent_group + i) % ngroups; desc = ext4_get_group_desc(sb, grp, NULL); if (desc) { grp_free = ext4_free_inodes_count(sb, desc); if (grp_free && grp_free >= avefreei) { *group = grp; return 0; } } } if (avefreei) { /* * The free-inodes counter is approximate, and for really small * filesystems the above test can fail to find any blockgroups */ avefreei = 0; goto fallback_retry; } return -1; } static int find_group_other(struct super_block *sb, struct inode *parent, ext4_group_t *group, umode_t mode) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; ext4_group_t i, last, ngroups = ext4_get_groups_count(sb); struct ext4_group_desc *desc; int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); /* * Try to place the inode is the same flex group as its * parent. If we can't find space, use the Orlov algorithm to * find another flex group, and store that information in the * parent directory's inode information so that use that flex * group for future allocations. */ if (flex_size > 1) { int retry = 0; try_again: parent_group &= ~(flex_size-1); last = parent_group + flex_size; if (last > ngroups) last = ngroups; for (i = parent_group; i < last; i++) { desc = ext4_get_group_desc(sb, i, NULL); if (desc && ext4_free_inodes_count(sb, desc)) { *group = i; return 0; } } if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) { retry = 1; parent_group = EXT4_I(parent)->i_last_alloc_group; goto try_again; } /* * If this didn't work, use the Orlov search algorithm * to find a new flex group; we pass in the mode to * avoid the topdir algorithms. */ *group = parent_group + flex_size; if (*group > ngroups) *group = 0; return find_group_orlov(sb, parent, group, mode, NULL); } /* * Try to place the inode in its parent directory */ *group = parent_group; desc = ext4_get_group_desc(sb, *group, NULL); if (desc && ext4_free_inodes_count(sb, desc) && ext4_free_group_clusters(sb, desc)) return 0; /* * We're going to place this inode in a different blockgroup from its * parent. We want to cause files in a common directory to all land in * the same blockgroup. 
But we want files which are in a different * directory which shares a blockgroup with our parent to land in a * different blockgroup. * * So add our directory's i_ino into the starting point for the hash. */ *group = (*group + (unsigned int)parent->i_ino) % ngroups; /* * Use a quadratic hash to find a group with a free inode and some free * blocks. */ for (i = 1; i < ngroups; i <<= 1) { *group += i; if (*group >= ngroups) *group -= ngroups; desc = ext4_get_group_desc(sb, *group, NULL); if (desc && ext4_free_inodes_count(sb, desc) && ext4_free_group_clusters(sb, desc)) return 0; } /* * That failed: try linear search for a free inode, even if that group * has no free blocks. */ *group = parent_group; for (i = 0; i < ngroups; i++) { if (++*group >= ngroups) *group = 0; desc = ext4_get_group_desc(sb, *group, NULL); if (desc && ext4_free_inodes_count(sb, desc)) return 0; } return -1; } /* * In no journal mode, if an inode has recently been deleted, we want * to avoid reusing it until we're reasonably sure the inode table * block has been written back to disk. (Yes, these values are * somewhat arbitrary...) */ #define RECENTCY_MIN 60 #define RECENTCY_DIRTY 300 static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) { struct ext4_group_desc *gdp; struct ext4_inode *raw_inode; struct buffer_head *bh; int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; int offset, ret = 0; int recentcy = RECENTCY_MIN; u32 dtime, now; gdp = ext4_get_group_desc(sb, group, NULL); if (unlikely(!gdp)) return 0; /* Inode was never used in this filesystem? */ if (ext4_has_group_desc_csum(sb) && (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT) || ino >= EXT4_INODES_PER_GROUP(sb) - ext4_itable_unused_count(sb, gdp))) return 0; bh = sb_find_get_block(sb, ext4_inode_table(sb, gdp) + (ino / inodes_per_block)); if (!bh || !buffer_uptodate(bh)) /* * If the block is not in the buffer cache, then it * must have been written out, or, most unlikely, is * being migrated - false failure should be OK here. */ goto out; offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb); raw_inode = (struct ext4_inode *) (bh->b_data + offset); /* i_dtime is only 32 bits on disk, but we only care about relative * times in the range of a few minutes (i.e. long enough to sync a * recently-deleted inode to disk), so using the low 32 bits of the * clock (a 68 year range) is enough, see time_before32() */ dtime = le32_to_cpu(raw_inode->i_dtime); now = ktime_get_real_seconds(); if (buffer_dirty(bh)) recentcy += RECENTCY_DIRTY; if (dtime && time_before32(dtime, now) && time_before32(now, dtime + recentcy)) ret = 1; out: brelse(bh); return ret; } static int find_inode_bit(struct super_block *sb, ext4_group_t group, struct buffer_head *bitmap, unsigned long *ino) { bool check_recently_deleted = EXT4_SB(sb)->s_journal == NULL; unsigned long recently_deleted_ino = EXT4_INODES_PER_GROUP(sb); next: *ino = ext4_find_next_zero_bit((unsigned long *) bitmap->b_data, EXT4_INODES_PER_GROUP(sb), *ino); if (*ino >= EXT4_INODES_PER_GROUP(sb)) goto not_found; if (check_recently_deleted && recently_deleted(sb, group, *ino)) { recently_deleted_ino = *ino; *ino = *ino + 1; if (*ino < EXT4_INODES_PER_GROUP(sb)) goto next; goto not_found; } return 1; not_found: if (recently_deleted_ino >= EXT4_INODES_PER_GROUP(sb)) return 0; /* * Not reusing recently deleted inodes is mostly a preference. We don't * want to report ENOSPC or skew allocation patterns because of that. 
* So return even recently deleted inode if we could find better in the * given range. */ *ino = recently_deleted_ino; return 1; } int ext4_mark_inode_used(struct super_block *sb, int ino) { unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); struct buffer_head *inode_bitmap_bh = NULL, *group_desc_bh = NULL; struct ext4_group_desc *gdp; ext4_group_t group; int bit; int err; if (ino < EXT4_FIRST_INO(sb) || ino > max_ino) return -EFSCORRUPTED; group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); if (IS_ERR(inode_bitmap_bh)) return PTR_ERR(inode_bitmap_bh); if (ext4_test_bit(bit, inode_bitmap_bh->b_data)) { err = 0; goto out; } gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp) { err = -EINVAL; goto out; } ext4_set_bit(bit, inode_bitmap_bh->b_data); BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(NULL, NULL, inode_bitmap_bh); if (err) { ext4_std_error(sb, err); goto out; } err = sync_dirty_buffer(inode_bitmap_bh); if (err) { ext4_std_error(sb, err); goto out; } /* We may have to initialize the block bitmap if it isn't already */ if (ext4_has_group_desc_csum(sb) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { struct buffer_head *block_bitmap_bh; block_bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(block_bitmap_bh)) { err = PTR_ERR(block_bitmap_bh); goto out; } BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); err = ext4_handle_dirty_metadata(NULL, NULL, block_bitmap_bh); sync_dirty_buffer(block_bitmap_bh); /* recheck and clear flag under lock if we still need to */ ext4_lock_group(sb, group); if (ext4_has_group_desc_csum(sb) && (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, group, gdp)); ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); brelse(block_bitmap_bh); if (err) { ext4_std_error(sb, err); goto out; } } /* Update the relevant bg descriptor fields */ if (ext4_has_group_desc_csum(sb)) { int free; ext4_lock_group(sb, group); /* while we modify the bg desc */ free = EXT4_INODES_PER_GROUP(sb) - ext4_itable_unused_count(sb, gdp); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); free = 0; } /* * Check the relative inode number against the last used * relative inode number in this group. if it is greater * we need to update the bg_itable_unused count */ if (bit >= free) ext4_itable_unused_set(sb, gdp, (EXT4_INODES_PER_GROUP(sb) - bit - 1)); } else { ext4_lock_group(sb, group); } ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); if (ext4_has_group_desc_csum(sb)) { ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); err = ext4_handle_dirty_metadata(NULL, NULL, group_desc_bh); sync_dirty_buffer(group_desc_bh); out: brelse(inode_bitmap_bh); return err; } static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode, bool encrypt) { struct super_block *sb = dir->i_sb; int nblocks = 0; #ifdef CONFIG_EXT4_FS_POSIX_ACL struct posix_acl *p = get_inode_acl(dir, ACL_TYPE_DEFAULT); if (IS_ERR(p)) return PTR_ERR(p); if (p) { int acl_size = p->a_count * sizeof(ext4_acl_entry); nblocks += (S_ISDIR(mode) ? 
2 : 1) * __ext4_xattr_set_credits(sb, NULL /* inode */, NULL /* block_bh */, acl_size, true /* is_create */); posix_acl_release(p); } #endif #ifdef CONFIG_SECURITY { int num_security_xattrs = 1; #ifdef CONFIG_INTEGRITY num_security_xattrs++; #endif /* * We assume that security xattrs are never more than 1k. * In practice they are under 128 bytes. */ nblocks += num_security_xattrs * __ext4_xattr_set_credits(sb, NULL /* inode */, NULL /* block_bh */, 1024, true /* is_create */); } #endif if (encrypt) nblocks += __ext4_xattr_set_credits(sb, NULL /* inode */, NULL /* block_bh */, FSCRYPT_SET_CONTEXT_MAX_SIZE, true /* is_create */); return nblocks; } /* * There are two policies for allocating an inode. If the new inode is * a directory, then a forward search is made for a block group with both * free space and a low directory-to-inode ratio; if that fails, then of * the groups with above-average free space, that group with the fewest * directories already is chosen. * * For other inodes, search forward from the parent directory's block * group to find a free inode. */ struct inode *__ext4_new_inode(struct mnt_idmap *idmap, handle_t *handle, struct inode *dir, umode_t mode, const struct qstr *qstr, __u32 goal, uid_t *owner, __u32 i_flags, int handle_type, unsigned int line_no, int nblocks) { struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; struct buffer_head *group_desc_bh; ext4_group_t ngroups, group = 0; unsigned long ino = 0; struct inode *inode; struct ext4_group_desc *gdp = NULL; struct ext4_inode_info *ei; struct ext4_sb_info *sbi; int ret2, err; struct inode *ret; ext4_group_t i; ext4_group_t flex_group; struct ext4_group_info *grp = NULL; bool encrypt = false; /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) return ERR_PTR(-EPERM); sb = dir->i_sb; sbi = EXT4_SB(sb); ret2 = ext4_emergency_state(sb); if (unlikely(ret2)) return ERR_PTR(ret2); ngroups = ext4_get_groups_count(sb); trace_ext4_request_inode(dir, mode); inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); ei = EXT4_I(inode); /* * Initialize owners and quota early so that we don't have to account * for quota initialization worst case in standard inode creating * transaction */ if (owner) { inode->i_mode = mode; i_uid_write(inode, owner[0]); i_gid_write(inode, owner[1]); } else if (test_opt(sb, GRPID)) { inode->i_mode = mode; inode_fsuid_set(inode, idmap); inode->i_gid = dir->i_gid; } else inode_init_owner(idmap, inode, dir, mode); if (ext4_has_feature_project(sb) && ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) ei->i_projid = EXT4_I(dir)->i_projid; else ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID); if (!(i_flags & EXT4_EA_INODE_FL)) { err = fscrypt_prepare_new_inode(dir, inode, &encrypt); if (err) goto out; } err = dquot_initialize(inode); if (err) goto out; if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) { ret2 = ext4_xattr_credits_for_new_inode(dir, mode, encrypt); if (ret2 < 0) { err = ret2; goto out; } nblocks += ret2; } if (!goal) goal = sbi->s_inode_goal; if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) { group = (goal - 1) / EXT4_INODES_PER_GROUP(sb); ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb); ret2 = 0; goto got_group; } if (S_ISDIR(mode)) ret2 = find_group_orlov(sb, dir, &group, mode, qstr); else ret2 = find_group_other(sb, dir, &group, mode); got_group: EXT4_I(dir)->i_last_alloc_group = group; err = -ENOSPC; if (ret2 == -1) goto out; /* * Normally we will only go through one pass of this loop, * unless we get unlucky 
and it turns out the group we selected * had its last inode grabbed by someone else. */ for (i = 0; i < ngroups; i++, ino = 0) { err = -EIO; gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp) goto out; /* * Check free inodes count before loading bitmap. */ if (ext4_free_inodes_count(sb, gdp) == 0) goto next_group; if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { grp = ext4_get_group_info(sb, group); /* * Skip groups with already-known suspicious inode * tables */ if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) goto next_group; } brelse(inode_bitmap_bh); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); /* Skip groups with suspicious inode tables */ if (IS_ERR(inode_bitmap_bh)) { inode_bitmap_bh = NULL; goto next_group; } if (!(sbi->s_mount_state & EXT4_FC_REPLAY) && EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) goto next_group; ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); if (!ret2) goto next_group; if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) { ext4_error(sb, "reserved inode found cleared - " "inode=%lu", ino + 1); ext4_mark_group_bitmap_corrupted(sb, group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); goto next_group; } if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) { BUG_ON(nblocks <= 0); handle = __ext4_journal_start_sb(NULL, dir->i_sb, line_no, handle_type, nblocks, 0, ext4_trans_default_revoke_credits(sb)); if (IS_ERR(handle)) { err = PTR_ERR(handle); ext4_std_error(sb, err); goto out; } } BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, inode_bitmap_bh, EXT4_JTR_NONE); if (err) { ext4_std_error(sb, err); goto out; } ext4_lock_group(sb, group); ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); if (ret2) { /* Someone already took the bit. Repeat the search * with lock held. */ ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); if (ret2) { ext4_set_bit(ino, inode_bitmap_bh->b_data); ret2 = 0; } else { ret2 = 1; /* we didn't grab the inode */ } } ext4_unlock_group(sb, group); ino++; /* the inode bitmap is zero-based */ if (!ret2) goto got; /* we grabbed the inode! 
*/ next_group: if (++group == ngroups) group = 0; } err = -ENOSPC; goto out; got: BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); if (err) { ext4_std_error(sb, err); goto out; } BUFFER_TRACE(group_desc_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, group_desc_bh, EXT4_JTR_NONE); if (err) { ext4_std_error(sb, err); goto out; } /* We may have to initialize the block bitmap if it isn't already */ if (ext4_has_group_desc_csum(sb) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { struct buffer_head *block_bitmap_bh; block_bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(block_bitmap_bh)) { err = PTR_ERR(block_bitmap_bh); goto out; } BUFFER_TRACE(block_bitmap_bh, "get block bitmap access"); err = ext4_journal_get_write_access(handle, sb, block_bitmap_bh, EXT4_JTR_NONE); if (err) { brelse(block_bitmap_bh); ext4_std_error(sb, err); goto out; } BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh); /* recheck and clear flag under lock if we still need to */ ext4_lock_group(sb, group); if (ext4_has_group_desc_csum(sb) && (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, group, gdp)); ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); brelse(block_bitmap_bh); if (err) { ext4_std_error(sb, err); goto out; } } /* Update the relevant bg descriptor fields */ if (ext4_has_group_desc_csum(sb)) { int free; struct ext4_group_info *grp = NULL; if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { grp = ext4_get_group_info(sb, group); if (!grp) { err = -EFSCORRUPTED; goto out; } down_read(&grp->alloc_sem); /* * protect vs itable * lazyinit */ } ext4_lock_group(sb, group); /* while we modify the bg desc */ free = EXT4_INODES_PER_GROUP(sb) - ext4_itable_unused_count(sb, gdp); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); free = 0; } /* * Check the relative inode number against the last used * relative inode number in this group. 
if it is greater * we need to update the bg_itable_unused count */ if (ino > free) ext4_itable_unused_set(sb, gdp, (EXT4_INODES_PER_GROUP(sb) - ino)); if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) up_read(&grp->alloc_sem); } else { ext4_lock_group(sb, group); } ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); if (S_ISDIR(mode)) { ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1); if (sbi->s_log_groups_per_flex) { ext4_group_t f = ext4_flex_group(sbi, group); atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups, f)->used_dirs); } } if (ext4_has_group_desc_csum(sb)) { ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); if (err) { ext4_std_error(sb, err); goto out; } percpu_counter_dec(&sbi->s_freeinodes_counter); if (S_ISDIR(mode)) percpu_counter_inc(&sbi->s_dirs_counter); if (sbi->s_log_groups_per_flex) { flex_group = ext4_flex_group(sbi, group); atomic_dec(&sbi_array_rcu_deref(sbi, s_flex_groups, flex_group)->free_inodes); } inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = 0; simple_inode_init_ts(inode); ei->i_crtime = inode_get_mtime(inode); memset(ei->i_data, 0, sizeof(ei->i_data)); ei->i_dir_start_lookup = 0; ei->i_disksize = 0; /* Don't inherit extent flag from directory, amongst others. */ ei->i_flags = ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); ei->i_flags |= i_flags; ei->i_file_acl = 0; ei->i_dtime = 0; ei->i_block_group = group; ei->i_last_alloc_group = ~0; ext4_set_inode_flags(inode, true); if (IS_DIRSYNC(inode)) ext4_handle_sync(handle); if (insert_inode_locked(inode) < 0) { /* * Likely a bitmap corruption causing inode to be allocated * twice. */ err = -EIO; ext4_error(sb, "failed to insert inode %llu: doubly allocated?", inode->i_ino); ext4_mark_group_bitmap_corrupted(sb, group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); goto out; } inode->i_generation = get_random_u32(); /* Precompute checksum seed for inode metadata */ if (ext4_has_feature_metadata_csum(sb)) { __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = cpu_to_le32(inode->i_generation); csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } ext4_set_inode_state(inode, EXT4_STATE_NEW); ei->i_extra_isize = sbi->s_want_extra_isize; ei->i_inline_off = 0; if (ext4_has_feature_inline_data(sb) && (!(ei->i_flags & (EXT4_DAX_FL|EXT4_EA_INODE_FL)) || S_ISDIR(mode))) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ret = inode; err = dquot_alloc_inode(inode); if (err) goto fail_drop; /* * Since the encryption xattr will always be unique, create it first so * that it's less likely to end up in an external xattr block and * prevent its deduplication. 
*/ if (encrypt) { err = fscrypt_set_context(inode, handle); if (err) goto fail_free_drop; } if (!(ei->i_flags & EXT4_EA_INODE_FL)) { err = ext4_init_acl(handle, inode, dir); if (err) goto fail_free_drop; err = ext4_init_security(handle, inode, dir, qstr); if (err) goto fail_free_drop; } if (ext4_has_feature_extents(sb)) { /* set extent flag only for directory, file and normal symlink*/ if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_ext_tree_init(handle, inode); } } ext4_set_inode_mapping_order(inode); ext4_update_inode_fsync_trans(handle, inode, 1); err = ext4_mark_inode_dirty(handle, inode); if (err) { ext4_std_error(sb, err); goto fail_free_drop; } ext4_debug("allocating inode %llu\n", inode->i_ino); trace_ext4_allocate_inode(inode, dir, mode); brelse(inode_bitmap_bh); return ret; fail_free_drop: dquot_free_inode(inode); fail_drop: clear_nlink(inode); unlock_new_inode(inode); out: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; iput(inode); brelse(inode_bitmap_bh); return ERR_PTR(err); } /* Verify that we are loading a valid orphan from disk */ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) { unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); ext4_group_t block_group; int bit; struct buffer_head *bitmap_bh = NULL; struct inode *inode = NULL; int err = -EFSCORRUPTED; if (ino < EXT4_FIRST_INO(sb) || ino > max_ino) goto bad_orphan; block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); if (IS_ERR(bitmap_bh)) return ERR_CAST(bitmap_bh); /* Having the inode bit set should be a 100% indicator that this * is a valid orphan (no e2fsck run on fs). Orphans also include * inodes that were being truncated, so we can't check i_nlink==0. */ if (!ext4_test_bit(bit, bitmap_bh->b_data)) goto bad_orphan; inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { err = PTR_ERR(inode); ext4_error_err(sb, -err, "couldn't read orphan inode %lu (err %d)", ino, err); brelse(bitmap_bh); return inode; } /* * If the orphans has i_nlinks > 0 then it should be able to * be truncated, otherwise it won't be removed from the orphan * list during processing and an infinite loop will result. * Similarly, it must not be a bad inode. 
*/ if ((inode->i_nlink && !ext4_can_truncate(inode)) || is_bad_inode(inode)) goto bad_orphan; if (NEXT_ORPHAN(inode) > max_ino) goto bad_orphan; brelse(bitmap_bh); return inode; bad_orphan: ext4_error(sb, "bad orphan inode %lu", ino); if (bitmap_bh) printk(KERN_ERR "ext4_test_bit(bit=%d, block=%llu) = %d\n", bit, (unsigned long long)bitmap_bh->b_blocknr, ext4_test_bit(bit, bitmap_bh->b_data)); if (inode) { printk(KERN_ERR "is_bad_inode(inode)=%d\n", is_bad_inode(inode)); printk(KERN_ERR "NEXT_ORPHAN(inode)=%u\n", NEXT_ORPHAN(inode)); printk(KERN_ERR "max_ino=%lu\n", max_ino); printk(KERN_ERR "i_nlink=%u\n", inode->i_nlink); /* Avoid freeing blocks if we got a bad deleted inode */ if (inode->i_nlink == 0) inode->i_blocks = 0; iput(inode); } brelse(bitmap_bh); return ERR_PTR(err); } unsigned long ext4_count_free_inodes(struct super_block *sb) { unsigned long desc_count; struct ext4_group_desc *gdp; ext4_group_t i, ngroups = ext4_get_groups_count(sb); #ifdef EXT4FS_DEBUG struct ext4_super_block *es; unsigned long bitmap_count, x; struct buffer_head *bitmap_bh = NULL; es = EXT4_SB(sb)->s_es; desc_count = 0; bitmap_count = 0; gdp = NULL; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; desc_count += ext4_free_inodes_count(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_inode_bitmap(sb, i); if (IS_ERR(bitmap_bh)) { bitmap_bh = NULL; continue; } x = ext4_count_free(bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", (unsigned long) i, ext4_free_inodes_count(sb, gdp), x); bitmap_count += x; } brelse(bitmap_bh); printk(KERN_DEBUG "ext4_count_free_inodes: " "stored = %u, computed = %lu, %lu\n", le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); return desc_count; #else desc_count = 0; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; desc_count += ext4_free_inodes_count(sb, gdp); cond_resched(); } return desc_count; #endif } /* Called at mount-time, super-block is locked */ unsigned long ext4_count_dirs(struct super_block * sb) { unsigned long count = 0; ext4_group_t i, ngroups = ext4_get_groups_count(sb); for (i = 0; i < ngroups; i++) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; count += ext4_used_dirs_count(sb, gdp); } return count; } /* * Zeroes not yet zeroed inode table - just write zeroes through the whole * inode table. Must be called without any spinlock held. The only place * where it is called from on active part of filesystem is ext4lazyinit * thread, so we do not need any special locks, however we have to prevent * inode allocation from the current group, so we take alloc_sem lock, to * block ext4_new_inode() until we are finished. */ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp = NULL; struct buffer_head *group_desc_bh; handle_t *handle; ext4_fsblk_t blk; int num, ret = 0, used_blks = 0; unsigned long used_inos = 0; gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp || !grp) goto out; /* * We do not need to lock this, because we are the only one * handling this flag. 
*/ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)) goto out; handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } down_write(&grp->alloc_sem); /* * If inode bitmap was already initialized there may be some * used inodes so we need to skip blocks with used inodes in * inode table. */ if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) { used_inos = EXT4_INODES_PER_GROUP(sb) - ext4_itable_unused_count(sb, gdp); used_blks = DIV_ROUND_UP(used_inos, sbi->s_inodes_per_block); /* Bogus inode unused count? */ if (used_blks < 0 || used_blks > sbi->s_itb_per_group) { ext4_error(sb, "Something is wrong with group %u: " "used itable blocks: %d; " "itable unused count: %u", group, used_blks, ext4_itable_unused_count(sb, gdp)); ret = 1; goto err_out; } used_inos += group * EXT4_INODES_PER_GROUP(sb); /* * Are there some uninitialized inodes in the inode table * before the first normal inode? */ if ((used_blks != sbi->s_itb_per_group) && (used_inos < EXT4_FIRST_INO(sb))) { ext4_error(sb, "Something is wrong with group %u: " "itable unused count: %u; " "itables initialized count: %ld", group, ext4_itable_unused_count(sb, gdp), used_inos); ret = 1; goto err_out; } } blk = ext4_inode_table(sb, gdp) + used_blks; num = sbi->s_itb_per_group - used_blks; BUFFER_TRACE(group_desc_bh, "get_write_access"); ret = ext4_journal_get_write_access(handle, sb, group_desc_bh, EXT4_JTR_NONE); if (ret) goto err_out; /* * Skip zeroout if the inode table is full. But we set the ZEROED * flag anyway, because obviously, when it is full it does not need * further zeroing. */ if (unlikely(num == 0)) goto skip_zeroout; ext4_debug("going to zero out inode table in group %d\n", group); ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS); if (ret < 0) goto err_out; if (barrier) blkdev_issue_flush(sb->s_bdev); skip_zeroout: ext4_lock_group(sb, group); gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); ext4_group_desc_csum_set(sb, group, gdp); ext4_unlock_group(sb, group); BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); ret = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); err_out: up_write(&grp->alloc_sem); ext4_journal_stop(handle); out: return ret; } |
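/*
 * Editorial aside, not part of fs/ext4/ialloc.c above: a minimal userspace
 * sketch of the inode-number <-> (block group, bitmap bit) mapping that
 * ext4_mark_inode_used(), ext4_orphan_get() and __ext4_new_inode() all rely
 * on. Inode numbers are 1-based (inode 0 never exists), so inode number ino
 * lives at bit (ino - 1) % inodes_per_group of group (ino - 1) /
 * inodes_per_group, and a bit grabbed in group g maps back to inode
 * g * inodes_per_group + bit + 1. The struct and helper names below are
 * hypothetical and exist only for this illustration; on a real filesystem
 * inodes_per_group comes from the superblock.
 */
#include <assert.h>
#include <stdio.h>

struct ino_loc {
	unsigned long group;	/* block group that holds the inode */
	unsigned long bit;	/* 0-based bit in that group's inode bitmap */
};

/* Map a 1-based inode number to its bitmap location (cf. ext4_orphan_get()). */
static struct ino_loc ino_to_loc(unsigned long ino, unsigned long inodes_per_group)
{
	struct ino_loc loc = {
		.group = (ino - 1) / inodes_per_group,
		.bit   = (ino - 1) % inodes_per_group,
	};
	return loc;
}

/* Inverse mapping, as done once a free bit has been grabbed during allocation. */
static unsigned long loc_to_ino(struct ino_loc loc, unsigned long inodes_per_group)
{
	return loc.group * inodes_per_group + loc.bit + 1;
}

int main(void)
{
	/* 8192 inodes per group is only an example value, not a requirement. */
	struct ino_loc loc = ino_to_loc(16385, 8192);

	assert(loc.group == 2 && loc.bit == 0);
	assert(loc_to_ino(loc, 8192) == 16385);
	printf("inode 16385 -> group %lu, bit %lu\n", loc.group, loc.bit);
	return 0;
}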
// SPDX-License-Identifier: GPL-2.0 /* * Kernel internal timers * * Copyright (C) 1991, 1992 Linus Torvalds * * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. * * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 * "A Kernel Model for Precision Timekeeping" by Dave Mills * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to * serialize accesses to xtime/lost_ticks). * Copyright (C) 1998 Andrea Arcangeli * 1999-03-10 Improved NTP compatibility by Ulrich Windl * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love * 2000-10-05 Implemented scalable SMP per-CPU timer handling. * Copyright (C) 2000, 2001, 2002 Ingo Molnar * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar */ #include <linux/kernel_stat.h> #include <linux/export.h> #include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/pid_namespace.h> #include <linux/notifier.h> #include <linux/thread_info.h> #include <linux/time.h> #include <linux/jiffies.h> #include <linux/posix-timers.h> #include <linux/cpu.h> #include <linux/syscalls.h> #include <linux/delay.h> #include <linux/tick.h> #include <linux/kallsyms.h> #include <linux/irq_work.h> #include <linux/sched/sysctl.h> #include <linux/sched/nohz.h> #include <linux/sched/debug.h> #include <linux/slab.h> #include <linux/compat.h> #include <linux/random.h> #include <linux/sysctl.h> #include <linux/uaccess.h> #include <asm/unistd.h> #include <asm/div64.h> #include <asm/timex.h> #include <asm/io.h> #include "tick-internal.h" #include "timer_migration.h" #define CREATE_TRACE_POINTS #include <trace/events/timer.h> __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); /* * The timer wheel has LVL_DEPTH array levels. Each level provides an array of * LVL_SIZE buckets. Each level is driven by its own clock and therefore each * level has a different granularity. * * The level granularity is: LVL_CLK_DIV ^ level * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level) * * The array level of a newly armed timer depends on the relative expiry * time.
The farther the expiry time is away the higher the array level and * therefore the granularity becomes. * * Contrary to the original timer wheel implementation, which aims for 'exact' * expiry of the timers, this implementation removes the need for recascading * the timers into the lower array levels. The previous 'classic' timer wheel * implementation of the kernel already violated the 'exact' expiry by adding * slack to the expiry time to provide batched expiration. The granularity * levels provide implicit batching. * * This is an optimization of the original timer wheel implementation for the * majority of the timer wheel use cases: timeouts. The vast majority of * timeout timers (networking, disk I/O ...) are canceled before expiry. If * the timeout expires it indicates that normal operation is disturbed, so it * does not matter much whether the timeout comes with a slight delay. * * The only exception to this are networking timers with a small expiry * time. They rely on the granularity. Those fit into the first wheel level, * which has HZ granularity. * * We don't have cascading anymore. timers with a expiry time above the * capacity of the last wheel level are force expired at the maximum timeout * value of the last wheel level. From data sampling we know that the maximum * value observed is 5 days (network connection tracking), so this should not * be an issue. * * The currently chosen array constants values are a good compromise between * array size and granularity. * * This results in the following granularity and range levels: * * HZ 1000 steps * Level Offset Granularity Range * 0 0 1 ms 0 ms - 63 ms * 1 64 8 ms 64 ms - 511 ms * 2 128 64 ms 512 ms - 4095 ms (512ms - ~4s) * 3 192 512 ms 4096 ms - 32767 ms (~4s - ~32s) * 4 256 4096 ms (~4s) 32768 ms - 262143 ms (~32s - ~4m) * 5 320 32768 ms (~32s) 262144 ms - 2097151 ms (~4m - ~34m) * 6 384 262144 ms (~4m) 2097152 ms - 16777215 ms (~34m - ~4h) * 7 448 2097152 ms (~34m) 16777216 ms - 134217727 ms (~4h - ~1d) * 8 512 16777216 ms (~4h) 134217728 ms - 1073741822 ms (~1d - ~12d) * * HZ 300 * Level Offset Granularity Range * 0 0 3 ms 0 ms - 210 ms * 1 64 26 ms 213 ms - 1703 ms (213ms - ~1s) * 2 128 213 ms 1706 ms - 13650 ms (~1s - ~13s) * 3 192 1706 ms (~1s) 13653 ms - 109223 ms (~13s - ~1m) * 4 256 13653 ms (~13s) 109226 ms - 873810 ms (~1m - ~14m) * 5 320 109226 ms (~1m) 873813 ms - 6990503 ms (~14m - ~1h) * 6 384 873813 ms (~14m) 6990506 ms - 55924050 ms (~1h - ~15h) * 7 448 6990506 ms (~1h) 55924053 ms - 447392423 ms (~15h - ~5d) * 8 512 55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d) * * HZ 250 * Level Offset Granularity Range * 0 0 4 ms 0 ms - 255 ms * 1 64 32 ms 256 ms - 2047 ms (256ms - ~2s) * 2 128 256 ms 2048 ms - 16383 ms (~2s - ~16s) * 3 192 2048 ms (~2s) 16384 ms - 131071 ms (~16s - ~2m) * 4 256 16384 ms (~16s) 131072 ms - 1048575 ms (~2m - ~17m) * 5 320 131072 ms (~2m) 1048576 ms - 8388607 ms (~17m - ~2h) * 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h) * 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d) * 8 512 67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d) * * HZ 100 * Level Offset Granularity Range * 0 0 10 ms 0 ms - 630 ms * 1 64 80 ms 640 ms - 5110 ms (640ms - ~5s) * 2 128 640 ms 5120 ms - 40950 ms (~5s - ~40s) * 3 192 5120 ms (~5s) 40960 ms - 327670 ms (~40s - ~5m) * 4 256 40960 ms (~40s) 327680 ms - 2621430 ms (~5m - ~43m) * 5 320 327680 ms (~5m) 2621440 ms - 20971510 ms (~43m - ~5h) * 6 384 2621440 ms (~43m) 20971520 ms - 167772150 ms (~5h - ~1d) * 7 448 
20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d) */ /* Clock divisor for the next level */ #define LVL_CLK_SHIFT 3 #define LVL_CLK_DIV (1UL << LVL_CLK_SHIFT) #define LVL_CLK_MASK (LVL_CLK_DIV - 1) #define LVL_SHIFT(n) ((n) * LVL_CLK_SHIFT) #define LVL_GRAN(n) (1UL << LVL_SHIFT(n)) /* * The time start value for each level to select the bucket at enqueue * time. We start from the last possible delta of the previous level * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()). */ #define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT)) /* Size of each clock level */ #define LVL_BITS 6 #define LVL_SIZE (1UL << LVL_BITS) #define LVL_MASK (LVL_SIZE - 1) #define LVL_OFFS(n) ((n) * LVL_SIZE) /* Level depth */ #if HZ > 100 # define LVL_DEPTH 9 # else # define LVL_DEPTH 8 #endif /* The cutoff (max. capacity of the wheel) */ #define WHEEL_TIMEOUT_CUTOFF (LVL_START(LVL_DEPTH)) #define WHEEL_TIMEOUT_MAX (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1)) /* * The resulting wheel size. If NOHZ is configured we allocate two * wheels so we have a separate storage for the deferrable timers. */ #define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH) #ifdef CONFIG_NO_HZ_COMMON /* * If multiple bases need to be locked, use the base ordering for lock * nesting, i.e. lowest number first. */ # define NR_BASES 3 # define BASE_LOCAL 0 # define BASE_GLOBAL 1 # define BASE_DEF 2 #else # define NR_BASES 1 # define BASE_LOCAL 0 # define BASE_GLOBAL 0 # define BASE_DEF 0 #endif /** * struct timer_base - Per CPU timer base (number of base depends on config) * @lock: Lock protecting the timer_base * @running_timer: When expiring timers, the lock is dropped. To make * sure not to race against deleting/modifying a * currently running timer, the pointer is set to the * timer, which expires at the moment. If no timer is * running, the pointer is NULL. * @expiry_lock: PREEMPT_RT only: Lock is taken in softirq around * timer expiry callback execution and when trying to * delete a running timer and it wasn't successful in * the first glance. It prevents priority inversion * when callback was preempted on a remote CPU and a * caller tries to delete the running timer. It also * prevents a life lock, when the task which tries to * delete a timer preempted the softirq thread which * is running the timer callback function. * @timer_waiters: PREEMPT_RT only: Tells, if there is a waiter * waiting for the end of the timer callback function * execution. * @clk: clock of the timer base; is updated before enqueue * of a timer; during expiry, it is 1 offset ahead of * jiffies to avoid endless requeuing to current * jiffies * @next_expiry: expiry value of the first timer; it is updated when * finding the next timer and during enqueue; the * value is not valid, when next_expiry_recalc is set * @cpu: Number of CPU the timer base belongs to * @next_expiry_recalc: States, whether a recalculation of next_expiry is * required. Value is set true, when a timer was * deleted. * @is_idle: Is set, when timer_base is idle. It is triggered by NOHZ * code. This state is only used in standard * base. Deferrable timers, which are enqueued remotely * never wake up an idle CPU. So no matter of supporting it * for this base. * @timers_pending: Is set, when a timer is pending in the base. It is only * reliable when next_expiry_recalc is not set. * @pending_map: bitmap of the timer wheel; each bit reflects a * bucket of the wheel. When a bit is set, at least a * single timer is enqueued in the related bucket. 
* @vectors: Array of lists; Each array member reflects a bucket * of the timer wheel. The list contains all timers * which are enqueued into a specific bucket. */ struct timer_base { raw_spinlock_t lock; struct timer_list *running_timer; #ifdef CONFIG_PREEMPT_RT spinlock_t expiry_lock; atomic_t timer_waiters; #endif unsigned long clk; unsigned long next_expiry; unsigned int cpu; bool next_expiry_recalc; bool is_idle; bool timers_pending; DECLARE_BITMAP(pending_map, WHEEL_SIZE); struct hlist_head vectors[WHEEL_SIZE]; } ____cacheline_aligned; static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]); #ifdef CONFIG_NO_HZ_COMMON static DEFINE_STATIC_KEY_FALSE(timers_nohz_active); static DEFINE_MUTEX(timer_keys_mutex); static void timer_update_keys(struct work_struct *work); static DECLARE_WORK(timer_update_work, timer_update_keys); #ifdef CONFIG_SMP static unsigned int sysctl_timer_migration = 1; DEFINE_STATIC_KEY_FALSE(timers_migration_enabled); static void timers_update_migration(void) { if (sysctl_timer_migration && tick_nohz_is_active()) static_branch_enable(&timers_migration_enabled); else static_branch_disable(&timers_migration_enabled); } #ifdef CONFIG_SYSCTL static int timer_migration_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; mutex_lock(&timer_keys_mutex); ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) timers_update_migration(); mutex_unlock(&timer_keys_mutex); return ret; } static const struct ctl_table timer_sysctl[] = { { .procname = "timer_migration", .data = &sysctl_timer_migration, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = timer_migration_handler, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; static int __init timer_sysctl_init(void) { register_sysctl("kernel", timer_sysctl); return 0; } device_initcall(timer_sysctl_init); #endif /* CONFIG_SYSCTL */ #else /* CONFIG_SMP */ static inline void timers_update_migration(void) { } #endif /* !CONFIG_SMP */ static void timer_update_keys(struct work_struct *work) { mutex_lock(&timer_keys_mutex); timers_update_migration(); static_branch_enable(&timers_nohz_active); mutex_unlock(&timer_keys_mutex); } void timers_update_nohz(void) { schedule_work(&timer_update_work); } static inline bool is_timers_nohz_active(void) { return static_branch_unlikely(&timers_nohz_active); } #else static inline bool is_timers_nohz_active(void) { return false; } #endif /* NO_HZ_COMMON */ static unsigned long round_jiffies_common(unsigned long j, int cpu, bool force_up) { int rem; unsigned long original = j; /* * We don't want all cpus firing their timers at once hitting the * same lock or cachelines, so we skew each extra cpu with an extra * 3 jiffies. This 3 jiffies came originally from the mm/ code which * already did this. * The skew is done by adding 3*cpunr, then round, then subtract this * extra offset again. */ j += cpu * 3; rem = j % HZ; /* * If the target jiffy is just after a whole second (which can happen * due to delays of the timer irq, long irq off times etc etc) then * we should round down to the whole second, not up. Use 1/4th second * as cutoff for this rounding as an extreme upper bound for this. * But never round down if @force_up is set. */ if (rem < HZ/4 && !force_up) /* round down */ j = j - rem; else /* round up */ j = j - rem + HZ; /* now that we have rounded, subtract the extra skew again */ j -= cpu * 3; /* * Make sure j is still in the future. Otherwise return the * unmodified value. 
*/ return time_is_after_jiffies(j) ? j : original; } /** * __round_jiffies_relative - function to round jiffies to a full second * @j: the time in (relative) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen * * __round_jiffies_relative() rounds a time delta in the future (in jiffies) * up or down to (approximately) full seconds. This is useful for timers * for which the exact time they fire does not matter too much, as long as * they fire approximately every X seconds. * * By rounding these timers to whole seconds, all such timers will fire * at the same time, rather than at various times spread out. The goal * of this is to have the CPU wake up less, which saves power. * * The exact rounding is skewed for each processor to avoid all * processors firing at the exact same time, which could lead * to lock contention or spurious cache line bouncing. * * The return value is the rounded version of the @j parameter. */ unsigned long __round_jiffies_relative(unsigned long j, int cpu) { unsigned long j0 = jiffies; /* Use j0 because jiffies might change while we run */ return round_jiffies_common(j + j0, cpu, false) - j0; } EXPORT_SYMBOL_GPL(__round_jiffies_relative); /** * round_jiffies - function to round jiffies to a full second * @j: the time in (absolute) jiffies that should be rounded * * round_jiffies() rounds an absolute time in the future (in jiffies) * up or down to (approximately) full seconds. This is useful for timers * for which the exact time they fire does not matter too much, as long as * they fire approximately every X seconds. * * By rounding these timers to whole seconds, all such timers will fire * at the same time, rather than at various times spread out. The goal * of this is to have the CPU wake up less, which saves power. * * The return value is the rounded version of the @j parameter. */ unsigned long round_jiffies(unsigned long j) { return round_jiffies_common(j, raw_smp_processor_id(), false); } EXPORT_SYMBOL_GPL(round_jiffies); /** * round_jiffies_relative - function to round jiffies to a full second * @j: the time in (relative) jiffies that should be rounded * * round_jiffies_relative() rounds a time delta in the future (in jiffies) * up or down to (approximately) full seconds. This is useful for timers * for which the exact time they fire does not matter too much, as long as * they fire approximately every X seconds. * * By rounding these timers to whole seconds, all such timers will fire * at the same time, rather than at various times spread out. The goal * of this is to have the CPU wake up less, which saves power. * * The return value is the rounded version of the @j parameter. */ unsigned long round_jiffies_relative(unsigned long j) { return __round_jiffies_relative(j, raw_smp_processor_id()); } EXPORT_SYMBOL_GPL(round_jiffies_relative); /** * __round_jiffies_up_relative - function to round jiffies up to a full second * @j: the time in (relative) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen * * This is the same as __round_jiffies_relative() except that it will never * round down. This is useful for timeouts for which the exact time * of firing does not matter too much, as long as they don't fire too * early. 
*/ unsigned long __round_jiffies_up_relative(unsigned long j, int cpu) { unsigned long j0 = jiffies; /* Use j0 because jiffies might change while we run */ return round_jiffies_common(j + j0, cpu, true) - j0; } EXPORT_SYMBOL_GPL(__round_jiffies_up_relative); /** * round_jiffies_up - function to round jiffies up to a full second * @j: the time in (absolute) jiffies that should be rounded * * This is the same as round_jiffies() except that it will never * round down. This is useful for timeouts for which the exact time * of firing does not matter too much, as long as they don't fire too * early. */ unsigned long round_jiffies_up(unsigned long j) { return round_jiffies_common(j, raw_smp_processor_id(), true); } EXPORT_SYMBOL_GPL(round_jiffies_up); /** * round_jiffies_up_relative - function to round jiffies up to a full second * @j: the time in (relative) jiffies that should be rounded * * This is the same as round_jiffies_relative() except that it will never * round down. This is useful for timeouts for which the exact time * of firing does not matter too much, as long as they don't fire too * early. */ unsigned long round_jiffies_up_relative(unsigned long j) { return __round_jiffies_up_relative(j, raw_smp_processor_id()); } EXPORT_SYMBOL_GPL(round_jiffies_up_relative); static inline unsigned int timer_get_idx(struct timer_list *timer) { return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT; } static inline void timer_set_idx(struct timer_list *timer, unsigned int idx) { timer->flags = (timer->flags & ~TIMER_ARRAYMASK) | idx << TIMER_ARRAYSHIFT; } /* * Helper function to calculate the array index for a given expiry * time. */ static inline unsigned calc_index(unsigned long expires, unsigned lvl, unsigned long *bucket_expiry) { /* * The timer wheel has to guarantee that a timer does not fire * early. Early expiry can happen due to: * - Timer is armed at the edge of a tick * - Truncation of the expiry time in the outer wheel levels * * Round up with level granularity to prevent this. */ expires = (expires >> LVL_SHIFT(lvl)) + 1; *bucket_expiry = expires << LVL_SHIFT(lvl); return LVL_OFFS(lvl) + (expires & LVL_MASK); } static int calc_wheel_index(unsigned long expires, unsigned long clk, unsigned long *bucket_expiry) { unsigned long delta = expires - clk; unsigned int idx; if (delta < LVL_START(1)) { idx = calc_index(expires, 0, bucket_expiry); } else if (delta < LVL_START(2)) { idx = calc_index(expires, 1, bucket_expiry); } else if (delta < LVL_START(3)) { idx = calc_index(expires, 2, bucket_expiry); } else if (delta < LVL_START(4)) { idx = calc_index(expires, 3, bucket_expiry); } else if (delta < LVL_START(5)) { idx = calc_index(expires, 4, bucket_expiry); } else if (delta < LVL_START(6)) { idx = calc_index(expires, 5, bucket_expiry); } else if (delta < LVL_START(7)) { idx = calc_index(expires, 6, bucket_expiry); } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) { idx = calc_index(expires, 7, bucket_expiry); } else if ((long) delta < 0) { idx = clk & LVL_MASK; *bucket_expiry = clk; } else { /* * Force expire obscene large timeouts to expire at the * capacity limit of the wheel. */ if (delta >= WHEEL_TIMEOUT_CUTOFF) expires = clk + WHEEL_TIMEOUT_MAX; idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry); } return idx; } static void trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) { /* * Deferrable timers do not prevent the CPU from entering dynticks and * are not taken into account on the idle/nohz_full path. 
An IPI when a * new deferrable timer is enqueued will wake up the remote CPU but * nothing will be done with the deferrable timer base. Therefore skip * the remote IPI for deferrable timers completely. */ if (!is_timers_nohz_active() || timer->flags & TIMER_DEFERRABLE) return; /* * We might have to IPI the remote CPU if the base is idle and the * timer is pinned. If it is a non pinned timer, it is only queued * on the remote CPU, when timer was running during queueing. Then * everything is handled by remote CPU anyway. If the other CPU is * on the way to idle then it can't set base->is_idle as we hold * the base lock: */ if (base->is_idle) { WARN_ON_ONCE(!(timer->flags & TIMER_PINNED || tick_nohz_full_cpu(base->cpu))); wake_up_nohz_cpu(base->cpu); } } /* * Enqueue the timer into the hash bucket, mark it pending in * the bitmap, store the index in the timer flags then wake up * the target CPU if needed. */ static void enqueue_timer(struct timer_base *base, struct timer_list *timer, unsigned int idx, unsigned long bucket_expiry) { hlist_add_head(&timer->entry, base->vectors + idx); __set_bit(idx, base->pending_map); timer_set_idx(timer, idx); trace_timer_start(timer, bucket_expiry); /* * Check whether this is the new first expiring timer. The * effective expiry time of the timer is required here * (bucket_expiry) instead of timer->expires. */ if (time_before(bucket_expiry, base->next_expiry)) { /* * Set the next expiry time and kick the CPU so it * can reevaluate the wheel: */ WRITE_ONCE(base->next_expiry, bucket_expiry); base->timers_pending = true; base->next_expiry_recalc = false; trigger_dyntick_cpu(base, timer); } } static void internal_add_timer(struct timer_base *base, struct timer_list *timer) { unsigned long bucket_expiry; unsigned int idx; idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry); enqueue_timer(base, timer, idx, bucket_expiry); } #ifdef CONFIG_DEBUG_OBJECTS_TIMERS static const struct debug_obj_descr timer_debug_descr; struct timer_hint { void (*function)(struct timer_list *t); long offset; }; #define TIMER_HINT(fn, container, timr, hintfn) \ { \ .function = fn, \ .offset = offsetof(container, hintfn) - \ offsetof(container, timr) \ } static const struct timer_hint timer_hints[] = { TIMER_HINT(delayed_work_timer_fn, struct delayed_work, timer, work.func), TIMER_HINT(kthread_delayed_work_timer_fn, struct kthread_delayed_work, timer, work.func), }; static void *timer_debug_hint(void *addr) { struct timer_list *timer = addr; int i; for (i = 0; i < ARRAY_SIZE(timer_hints); i++) { if (timer_hints[i].function == timer->function) { void (**fn)(void) = addr + timer_hints[i].offset; return *fn; } } return timer->function; } static bool timer_is_static_object(void *addr) { struct timer_list *timer = addr; return (timer->entry.pprev == NULL && timer->entry.next == TIMER_ENTRY_STATIC); } /* * timer_fixup_init is called when: * - an active object is initialized */ static bool timer_fixup_init(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: timer_delete_sync(timer); debug_object_init(timer, &timer_debug_descr); return true; default: return false; } } /* Stub timer callback for improperly used timers. 
*/ static void stub_timer(struct timer_list *unused) { WARN_ON(1); } /* * timer_fixup_activate is called when: * - an active object is activated * - an unknown non-static object is activated */ static bool timer_fixup_activate(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { case ODEBUG_STATE_NOTAVAILABLE: timer_setup(timer, stub_timer, 0); return true; case ODEBUG_STATE_ACTIVE: WARN_ON(1); fallthrough; default: return false; } } /* * timer_fixup_free is called when: * - an active object is freed */ static bool timer_fixup_free(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: timer_delete_sync(timer); debug_object_free(timer, &timer_debug_descr); return true; default: return false; } } /* * timer_fixup_assert_init is called when: * - an untracked/uninit-ed object is found */ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { case ODEBUG_STATE_NOTAVAILABLE: timer_setup(timer, stub_timer, 0); return true; default: return false; } } static const struct debug_obj_descr timer_debug_descr = { .name = "timer_list", .debug_hint = timer_debug_hint, .is_static_object = timer_is_static_object, .fixup_init = timer_fixup_init, .fixup_activate = timer_fixup_activate, .fixup_free = timer_fixup_free, .fixup_assert_init = timer_fixup_assert_init, }; static inline void debug_timer_init(struct timer_list *timer) { debug_object_init(timer, &timer_debug_descr); } static inline void debug_timer_activate(struct timer_list *timer) { debug_object_activate(timer, &timer_debug_descr); } static inline void debug_timer_deactivate(struct timer_list *timer) { debug_object_deactivate(timer, &timer_debug_descr); } static inline void debug_timer_assert_init(struct timer_list *timer) { debug_object_assert_init(timer, &timer_debug_descr); } static void do_init_timer(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key); void timer_init_key_on_stack(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) { debug_object_init_on_stack(timer, &timer_debug_descr); do_init_timer(timer, func, flags, name, key); } EXPORT_SYMBOL_GPL(timer_init_key_on_stack); void timer_destroy_on_stack(struct timer_list *timer) { debug_object_free(timer, &timer_debug_descr); } EXPORT_SYMBOL_GPL(timer_destroy_on_stack); #else static inline void debug_timer_init(struct timer_list *timer) { } static inline void debug_timer_activate(struct timer_list *timer) { } static inline void debug_timer_deactivate(struct timer_list *timer) { } static inline void debug_timer_assert_init(struct timer_list *timer) { } #endif static inline void debug_init(struct timer_list *timer) { debug_timer_init(timer); trace_timer_init(timer); } static inline void debug_deactivate(struct timer_list *timer) { debug_timer_deactivate(timer); trace_timer_cancel(timer); } static inline void debug_assert_init(struct timer_list *timer) { debug_timer_assert_init(timer); } static void do_init_timer(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) { timer->entry.pprev = NULL; timer->function = func; if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS)) flags &= TIMER_INIT_FLAGS; timer->flags = flags | raw_smp_processor_id(); lockdep_init_map(&timer->lockdep_map, name, key, 0); } /** * 
timer_init_key - initialize a timer * @timer: the timer to be initialized * @func: timer callback function * @flags: timer flags * @name: name of the timer * @key: lockdep class key of the fake lock used for tracking timer * sync lock dependencies * * timer_init_key() must be done to a timer prior to calling *any* of the * other timer functions. */ void timer_init_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) { debug_init(timer); do_init_timer(timer, func, flags, name, key); } EXPORT_SYMBOL(timer_init_key); static inline void detach_timer(struct timer_list *timer, bool clear_pending) { struct hlist_node *entry = &timer->entry; debug_deactivate(timer); __hlist_del(entry); if (clear_pending) entry->pprev = NULL; entry->next = LIST_POISON2; } static int detach_if_pending(struct timer_list *timer, struct timer_base *base, bool clear_pending) { unsigned idx = timer_get_idx(timer); if (!timer_pending(timer)) return 0; if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) { __clear_bit(idx, base->pending_map); base->next_expiry_recalc = true; } detach_timer(timer, clear_pending); return 1; } static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) { int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL; /* * If the timer is deferrable and NO_HZ_COMMON is set then we need * to use the deferrable base. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) index = BASE_DEF; return per_cpu_ptr(&timer_bases[index], cpu); } static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) { int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL; /* * If the timer is deferrable and NO_HZ_COMMON is set then we need * to use the deferrable base. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) index = BASE_DEF; return this_cpu_ptr(&timer_bases[index]); } static inline struct timer_base *get_timer_base(u32 tflags) { return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK); } static inline void __forward_timer_base(struct timer_base *base, unsigned long basej) { /* * Check whether we can forward the base. We can only do that when * @basej is past base->clk otherwise we might rewind base->clk. */ if (time_before_eq(basej, base->clk)) return; /* * If the next expiry value is > jiffies, then we fast forward to * jiffies otherwise we forward to the next expiry value. */ if (time_after(base->next_expiry, basej)) { base->clk = basej; } else { if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk))) return; base->clk = base->next_expiry; } } static inline void forward_timer_base(struct timer_base *base) { __forward_timer_base(base, READ_ONCE(jiffies)); } /* * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means * that all timers which are tied to this base are locked, and the base itself * is locked too. * * So __run_timers/migrate_timers can safely modify all timers which could * be found in the base->vectors array. * * When a timer is migrating then the TIMER_MIGRATING flag is set and we need * to wait until the migration is done. */ static struct timer_base *lock_timer_base(struct timer_list *timer, unsigned long *flags) __acquires(timer->base->lock) { for (;;) { struct timer_base *base; u32 tf; /* * We need to use READ_ONCE() here, otherwise the compiler * might re-read @tf between the check for TIMER_MIGRATING * and spin_lock(). 
*/ tf = READ_ONCE(timer->flags); if (!(tf & TIMER_MIGRATING)) { base = get_timer_base(tf); raw_spin_lock_irqsave(&base->lock, *flags); if (timer->flags == tf) return base; raw_spin_unlock_irqrestore(&base->lock, *flags); } cpu_relax(); } } #define MOD_TIMER_PENDING_ONLY 0x01 #define MOD_TIMER_REDUCE 0x02 #define MOD_TIMER_NOTPENDING 0x04 static inline int __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options) { unsigned long clk = 0, flags, bucket_expiry; struct timer_base *base, *new_base; unsigned int idx = UINT_MAX; int ret = 0; debug_assert_init(timer); /* * This is a common optimization triggered by the networking code - if * the timer is re-modified to have the same timeout or ends up in the * same array bucket then just return: */ if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) { /* * The downside of this optimization is that it can result in * larger granularity than you would get from adding a new * timer with this expiry. */ long diff = timer->expires - expires; if (!diff) return 1; if (options & MOD_TIMER_REDUCE && diff <= 0) return 1; /* * We lock timer base and calculate the bucket index right * here. If the timer ends up in the same bucket, then we * just update the expiry time and avoid the whole * dequeue/enqueue dance. */ base = lock_timer_base(timer, &flags); /* * Has @timer been shutdown? This needs to be evaluated * while holding base lock to prevent a race against the * shutdown code. */ if (!timer->function) goto out_unlock; forward_timer_base(base); if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) && time_before_eq(timer->expires, expires)) { ret = 1; goto out_unlock; } clk = base->clk; idx = calc_wheel_index(expires, clk, &bucket_expiry); /* * Retrieve and compare the array index of the pending * timer. If it matches set the expiry to the new value so a * subsequent call will exit in the expires check above. */ if (idx == timer_get_idx(timer)) { if (!(options & MOD_TIMER_REDUCE)) timer->expires = expires; else if (time_after(timer->expires, expires)) timer->expires = expires; ret = 1; goto out_unlock; } } else { base = lock_timer_base(timer, &flags); /* * Has @timer been shutdown? This needs to be evaluated * while holding base lock to prevent a race against the * shutdown code. */ if (!timer->function) goto out_unlock; forward_timer_base(base); } ret = detach_if_pending(timer, base, false); if (!ret && (options & MOD_TIMER_PENDING_ONLY)) goto out_unlock; new_base = get_timer_this_cpu_base(timer->flags); if (base != new_base) { /* * We are trying to schedule the timer on the new base. * However we can't change timer's base while it is running, * otherwise timer_delete_sync() can't detect that the timer's * handler yet has not finished. This also guarantees that the * timer is serialized wrt itself. */ if (likely(base->running_timer != timer)) { /* See the comment in lock_timer_base() */ timer->flags |= TIMER_MIGRATING; raw_spin_unlock(&base->lock); base = new_base; raw_spin_lock(&base->lock); WRITE_ONCE(timer->flags, (timer->flags & ~TIMER_BASEMASK) | base->cpu); forward_timer_base(base); } } debug_timer_activate(timer); timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance * between calculating 'idx' and possibly switching the base, only * enqueue_timer() is required. Otherwise we need to (re)calculate * the wheel index via internal_add_timer(). 
*/ if (idx != UINT_MAX && clk == base->clk) enqueue_timer(base, timer, idx, bucket_expiry); else internal_add_timer(base, timer); out_unlock: raw_spin_unlock_irqrestore(&base->lock, flags); return ret; } /** * mod_timer_pending - Modify a pending timer's timeout * @timer: The pending timer to be modified * @expires: New absolute timeout in jiffies * * mod_timer_pending() is the same for pending timers as mod_timer(), but * will not activate inactive timers. * * If @timer->function == NULL then the start operation is silently * discarded. * * Return: * * %0 - The timer was inactive and not modified or was in * shutdown state and the operation was discarded * * %1 - The timer was active and requeued to expire at @expires */ int mod_timer_pending(struct timer_list *timer, unsigned long expires) { return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY); } EXPORT_SYMBOL(mod_timer_pending); /** * mod_timer - Modify a timer's timeout * @timer: The timer to be modified * @expires: New absolute timeout in jiffies * * mod_timer(timer, expires) is equivalent to: * * timer_delete(timer); timer->expires = expires; add_timer(timer); * * mod_timer() is more efficient than the above open coded sequence. In * case that the timer is inactive, the timer_delete() part is a NOP. The * timer is in any case activated with the new expiry time @expires. * * Note that if there are multiple unserialized concurrent users of the * same timer, then mod_timer() is the only safe way to modify the timeout, * since add_timer() cannot modify an already running timer. * * If @timer->function == NULL then the start operation is silently * discarded. In this case the return value is 0 and meaningless. * * Return: * * %0 - The timer was inactive and started or was in shutdown * state and the operation was discarded * * %1 - The timer was active and requeued to expire at @expires or * the timer was active and not modified because @expires did * not change the effective expiry time */ int mod_timer(struct timer_list *timer, unsigned long expires) { return __mod_timer(timer, expires, 0); } EXPORT_SYMBOL(mod_timer); /** * timer_reduce - Modify a timer's timeout if it would reduce the timeout * @timer: The timer to be modified * @expires: New absolute timeout in jiffies * * timer_reduce() is very similar to mod_timer(), except that it will only * modify an enqueued timer if that would reduce the expiration time. If * @timer is not enqueued it starts the timer. * * If @timer->function == NULL then the start operation is silently * discarded. * * Return: * * %0 - The timer was inactive and started or was in shutdown * state and the operation was discarded * * %1 - The timer was active and requeued to expire at @expires or * the timer was active and not modified because @expires * did not change the effective expiry time such that the * timer would expire earlier than already scheduled */ int timer_reduce(struct timer_list *timer, unsigned long expires) { return __mod_timer(timer, expires, MOD_TIMER_REDUCE); } EXPORT_SYMBOL(timer_reduce); /** * add_timer - Start a timer * @timer: The timer to be started * * Start @timer to expire at @timer->expires in the future. @timer->expires * is the absolute expiry time measured in 'jiffies'. When the timer expires * timer->function(timer) will be invoked from soft interrupt context. * * The @timer->expires and @timer->function fields must be set prior * to calling this function. * * If @timer->function == NULL then the start operation is silently * discarded. 
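 *
 * A minimal usage sketch (hypothetical driver code; @my_timer and
 * my_timer_fn() are illustrative names only):
 *
 *	timer_setup(&my_timer, my_timer_fn, 0);
 *	my_timer.expires = jiffies + msecs_to_jiffies(500);
 *	add_timer(&my_timer);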
* * If @timer->expires is already in the past @timer will be queued to * expire at the next timer tick. * * This can only operate on an inactive timer. Attempts to invoke this on * an active timer are rejected with a warning. */ void add_timer(struct timer_list *timer) { if (WARN_ON_ONCE(timer_pending(timer))) return; __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING); } EXPORT_SYMBOL(add_timer); /** * add_timer_local() - Start a timer on the local CPU * @timer: The timer to be started * * Same as add_timer() except that the timer flag TIMER_PINNED is set. * * See add_timer() for further details. */ void add_timer_local(struct timer_list *timer) { if (WARN_ON_ONCE(timer_pending(timer))) return; timer->flags |= TIMER_PINNED; __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING); } EXPORT_SYMBOL(add_timer_local); /** * add_timer_global() - Start a timer without TIMER_PINNED flag set * @timer: The timer to be started * * Same as add_timer() except that the timer flag TIMER_PINNED is unset. * * See add_timer() for further details. */ void add_timer_global(struct timer_list *timer) { if (WARN_ON_ONCE(timer_pending(timer))) return; timer->flags &= ~TIMER_PINNED; __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING); } EXPORT_SYMBOL(add_timer_global); /** * add_timer_on - Start a timer on a particular CPU * @timer: The timer to be started * @cpu: The CPU to start it on * * Same as add_timer() except that it starts the timer on the given CPU and * the TIMER_PINNED flag is set. When timer shouldn't be a pinned timer in * the next round, add_timer_global() should be used instead as it unsets * the TIMER_PINNED flag. * * See add_timer() for further details. */ void add_timer_on(struct timer_list *timer, int cpu) { struct timer_base *new_base, *base; unsigned long flags; debug_assert_init(timer); if (WARN_ON_ONCE(timer_pending(timer))) return; /* Make sure timer flags have TIMER_PINNED flag set */ timer->flags |= TIMER_PINNED; new_base = get_timer_cpu_base(timer->flags, cpu); /* * If @timer was on a different CPU, it should be migrated with the * old base locked to prevent other operations proceeding with the * wrong base locked. See lock_timer_base(). */ base = lock_timer_base(timer, &flags); /* * Has @timer been shutdown? This needs to be evaluated while * holding base lock to prevent a race against the shutdown code. */ if (!timer->function) goto out_unlock; if (base != new_base) { timer->flags |= TIMER_MIGRATING; raw_spin_unlock(&base->lock); base = new_base; raw_spin_lock(&base->lock); WRITE_ONCE(timer->flags, (timer->flags & ~TIMER_BASEMASK) | cpu); } forward_timer_base(base); debug_timer_activate(timer); internal_add_timer(base, timer); out_unlock: raw_spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(add_timer_on); /** * __timer_delete - Internal function: Deactivate a timer * @timer: The timer to be deactivated * @shutdown: If true, this indicates that the timer is about to be * shutdown permanently. * * If @shutdown is true then @timer->function is set to NULL under the * timer base lock which prevents further rearming of the time. In that * case any attempt to rearm @timer after this function returns will be * silently ignored. 
* * Return: * * %0 - The timer was not pending * * %1 - The timer was pending and deactivated */ static int __timer_delete(struct timer_list *timer, bool shutdown) { struct timer_base *base; unsigned long flags; int ret = 0; debug_assert_init(timer); /* * If @shutdown is set then the lock has to be taken whether the * timer is pending or not to protect against a concurrent rearm * which might hit between the lockless pending check and the lock * acquisition. By taking the lock it is ensured that such a newly * enqueued timer is dequeued and cannot end up with * timer->function == NULL in the expiry code. * * If timer->function is currently executed, then this makes sure * that the callback cannot requeue the timer. */ if (timer_pending(timer) || shutdown) { base = lock_timer_base(timer, &flags); ret = detach_if_pending(timer, base, true); if (shutdown) timer->function = NULL; raw_spin_unlock_irqrestore(&base->lock, flags); } return ret; } /** * timer_delete - Deactivate a timer * @timer: The timer to be deactivated * * The function only deactivates a pending timer, but contrary to * timer_delete_sync() it does not take into account whether the timer's * callback function is concurrently executed on a different CPU or not. * It neither prevents rearming of the timer. If @timer can be rearmed * concurrently then the return value of this function is meaningless. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending and deactivated */ int timer_delete(struct timer_list *timer) { return __timer_delete(timer, false); } EXPORT_SYMBOL(timer_delete); /** * timer_shutdown - Deactivate a timer and prevent rearming * @timer: The timer to be deactivated * * The function does not wait for an eventually running timer callback on a * different CPU but it prevents rearming of the timer. Any attempt to arm * @timer after this function returns will be silently ignored. * * This function is useful for teardown code and should only be used when * timer_shutdown_sync() cannot be invoked due to locking or context constraints. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending */ int timer_shutdown(struct timer_list *timer) { return __timer_delete(timer, true); } EXPORT_SYMBOL_GPL(timer_shutdown); /** * __try_to_del_timer_sync - Internal function: Try to deactivate a timer * @timer: Timer to deactivate * @shutdown: If true, this indicates that the timer is about to be * shutdown permanently. * * If @shutdown is true then @timer->function is set to NULL under the * timer base lock which prevents further rearming of the timer. Any * attempt to rearm @timer after this function returns will be silently * ignored. * * This function cannot guarantee that the timer cannot be rearmed * right after dropping the base lock if @shutdown is false. That * needs to be prevented by the calling code if necessary. 
* * Return: * * %0 - The timer was not pending * * %1 - The timer was pending and deactivated * * %-1 - The timer callback function is running on a different CPU */ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) { struct timer_base *base; unsigned long flags; int ret = -1; debug_assert_init(timer); base = lock_timer_base(timer, &flags); if (base->running_timer != timer) { ret = detach_if_pending(timer, base, true); if (shutdown) timer->function = NULL; } raw_spin_unlock_irqrestore(&base->lock, flags); return ret; } /** * timer_delete_sync_try - Try to deactivate a timer * @timer: Timer to deactivate * * This function tries to deactivate a timer. On success the timer is not * queued and the timer callback function is not running on any CPU. * * This function does not guarantee that the timer cannot be rearmed right * after dropping the base lock. That needs to be prevented by the calling * code if necessary. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending and deactivated * * %-1 - The timer callback function is running on a different CPU */ int timer_delete_sync_try(struct timer_list *timer) { return __try_to_del_timer_sync(timer, false); } EXPORT_SYMBOL(timer_delete_sync_try); #ifdef CONFIG_PREEMPT_RT static __init void timer_base_init_expiry_lock(struct timer_base *base) { spin_lock_init(&base->expiry_lock); } static inline void timer_base_lock_expiry(struct timer_base *base) { spin_lock(&base->expiry_lock); } static inline void timer_base_unlock_expiry(struct timer_base *base) { spin_unlock(&base->expiry_lock); } /* * The counterpart to del_timer_wait_running(). * * If there is a waiter for base->expiry_lock, then it was waiting for the * timer callback to finish. Drop expiry_lock and reacquire it. That allows * the waiter to acquire the lock and make progress. */ static void timer_sync_wait_running(struct timer_base *base) __releases(&base->lock) __releases(&base->expiry_lock) __acquires(&base->expiry_lock) __acquires(&base->lock) { if (atomic_read(&base->timer_waiters)) { raw_spin_unlock_irq(&base->lock); spin_unlock(&base->expiry_lock); spin_lock(&base->expiry_lock); raw_spin_lock_irq(&base->lock); } } /* * This function is called on PREEMPT_RT kernels when the fast path * deletion of a timer failed because the timer callback function was * running. * * This prevents priority inversion, if the softirq thread on a remote CPU * got preempted, and it prevents a life lock when the task which tries to * delete a timer preempted the softirq thread running the timer callback * function. */ static void del_timer_wait_running(struct timer_list *timer) { u32 tf; tf = READ_ONCE(timer->flags); if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) { struct timer_base *base = get_timer_base(tf); /* * Mark the base as contended and grab the expiry lock, * which is held by the softirq across the timer * callback. Drop the lock immediately so the softirq can * expire the next timer. In theory the timer could already * be running again, but that's more than unlikely and just * causes another wait loop. 
*/ atomic_inc(&base->timer_waiters); spin_lock_bh(&base->expiry_lock); atomic_dec(&base->timer_waiters); spin_unlock_bh(&base->expiry_lock); } } #else static inline void timer_base_init_expiry_lock(struct timer_base *base) { } static inline void timer_base_lock_expiry(struct timer_base *base) { } static inline void timer_base_unlock_expiry(struct timer_base *base) { } static inline void timer_sync_wait_running(struct timer_base *base) { } static inline void del_timer_wait_running(struct timer_list *timer) { } #endif /** * __timer_delete_sync - Internal function: Deactivate a timer and wait * for the handler to finish. * @timer: The timer to be deactivated * @shutdown: If true, @timer->function will be set to NULL under the * timer base lock which prevents rearming of @timer * * If @shutdown is not set the timer can be rearmed later. If the timer can * be rearmed concurrently, i.e. after dropping the base lock then the * return value is meaningless. * * If @shutdown is set then @timer->function is set to NULL under timer * base lock which prevents rearming of the timer. Any attempt to rearm * a shutdown timer is silently ignored. * * If the timer should be reused after shutdown it has to be initialized * again. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending and deactivated */ static int __timer_delete_sync(struct timer_list *timer, bool shutdown) { int ret; #ifdef CONFIG_LOCKDEP unsigned long flags; /* * If lockdep gives a backtrace here, please reference * the synchronization rules above. */ local_irq_save(flags); lock_map_acquire(&timer->lockdep_map); lock_map_release(&timer->lockdep_map); local_irq_restore(flags); #endif /* * don't use it in hardirq context, because it * could lead to deadlock. */ WARN_ON(in_hardirq() && !(timer->flags & TIMER_IRQSAFE)); /* * Must be able to sleep on PREEMPT_RT because of the slowpath in * del_timer_wait_running(). */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE)) lockdep_assert_preemption_enabled(); do { ret = __try_to_del_timer_sync(timer, shutdown); if (unlikely(ret < 0)) { del_timer_wait_running(timer); cpu_relax(); } } while (ret < 0); return ret; } /** * timer_delete_sync - Deactivate a timer and wait for the handler to finish. * @timer: The timer to be deactivated * * Synchronization rules: Callers must prevent restarting of the timer, * otherwise this function is meaningless. It must not be called from * interrupt contexts unless the timer is an irqsafe one. The caller must * not hold locks which would prevent completion of the timer's callback * function. The timer's handler must not call add_timer_on(). Upon exit * the timer is not queued and the handler is not running on any CPU. * * For !irqsafe timers, the caller must not hold locks that are held in * interrupt context. Even if the lock has nothing to do with the timer in * question. Here's why:: * * CPU0 CPU1 * ---- ---- * <SOFTIRQ> * call_timer_fn(); * base->running_timer = mytimer; * spin_lock_irq(somelock); * <IRQ> * spin_lock(somelock); * timer_delete_sync(mytimer); * while (base->running_timer == mytimer); * * Now timer_delete_sync() will never return and never release somelock. * The interrupt on the other CPU is waiting to grab somelock but it has * interrupted the softirq that CPU0 is waiting to finish. * * This function cannot guarantee that the timer is not rearmed again by * some concurrent or preempting code, right after it dropped the base * lock. 
If there is the possibility of a concurrent rearm then the return * value of the function is meaningless. * * If such a guarantee is needed, e.g. for teardown situations then use * timer_shutdown_sync() instead. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending and deactivated */ int timer_delete_sync(struct timer_list *timer) { return __timer_delete_sync(timer, false); } EXPORT_SYMBOL(timer_delete_sync); /** * timer_shutdown_sync - Shutdown a timer and prevent rearming * @timer: The timer to be shutdown * * When the function returns it is guaranteed that: * - @timer is not queued * - The callback function of @timer is not running * - @timer cannot be enqueued again. Any attempt to rearm * @timer is silently ignored. * * See timer_delete_sync() for synchronization rules. * * This function is useful for final teardown of an infrastructure where * the timer is subject to a circular dependency problem. * * A common pattern for this is a timer and a workqueue where the timer can * schedule work and work can arm the timer. On shutdown the workqueue must * be destroyed and the timer must be prevented from rearming. Unless the * code has conditionals like 'if (mything->in_shutdown)' to prevent that * there is no way to get this correct with timer_delete_sync(). * * timer_shutdown_sync() is solving the problem. The correct ordering of * calls in this case is: * * timer_shutdown_sync(&mything->timer); * workqueue_destroy(&mything->workqueue); * * After this 'mything' can be safely freed. * * This obviously implies that the timer is not required to be functional * for the rest of the shutdown operation. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending */ int timer_shutdown_sync(struct timer_list *timer) { return __timer_delete_sync(timer, true); } EXPORT_SYMBOL_GPL(timer_shutdown_sync); static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *), unsigned long baseclk) { int count = preempt_count(); #ifdef CONFIG_LOCKDEP /* * It is permissible to free the timer from inside the * function that is called from it, this we need to take into * account for lockdep too. To avoid bogus "held lock freed" * warnings as well as problems when looking into * timer->lockdep_map, make a copy and use that here. */ struct lockdep_map lockdep_map; lockdep_copy_map(&lockdep_map, &timer->lockdep_map); #endif /* * Couple the lock chain with the lock chain at * timer_delete_sync() by acquiring the lock_map around the fn() * call here and in timer_delete_sync(). */ lock_map_acquire(&lockdep_map); trace_timer_expire_entry(timer, baseclk); fn(timer); trace_timer_expire_exit(timer); lock_map_release(&lockdep_map); if (count != preempt_count()) { WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n", fn, count, preempt_count()); /* * Restore the preempt count. That gives us a decent * chance to survive and extract information. If the * callback kept a lock held, bad luck, but not worse * than the BUG() we had. */ preempt_count_set(count); } } static void expire_timers(struct timer_base *base, struct hlist_head *head) { /* * This value is required only for tracing. base->clk was * incremented directly before expire_timers was called. But expiry * is related to the old base->clk value. 
*/ unsigned long baseclk = base->clk - 1; while (!hlist_empty(head)) { struct timer_list *timer; void (*fn)(struct timer_list *); timer = hlist_entry(head->first, struct timer_list, entry); base->running_timer = timer; detach_timer(timer, true); fn = timer->function; if (WARN_ON_ONCE(!fn)) { /* Should never happen. Emphasis on should! */ base->running_timer = NULL; continue; } if (timer->flags & TIMER_IRQSAFE) { raw_spin_unlock(&base->lock); call_timer_fn(timer, fn, baseclk); raw_spin_lock(&base->lock); base->running_timer = NULL; } else { raw_spin_unlock_irq(&base->lock); call_timer_fn(timer, fn, baseclk); raw_spin_lock_irq(&base->lock); base->running_timer = NULL; timer_sync_wait_running(base); } } } static int collect_expired_timers(struct timer_base *base, struct hlist_head *heads) { unsigned long clk = base->clk = base->next_expiry; struct hlist_head *vec; int i, levels = 0; unsigned int idx; for (i = 0; i < LVL_DEPTH; i++) { idx = (clk & LVL_MASK) + i * LVL_SIZE; if (__test_and_clear_bit(idx, base->pending_map)) { vec = base->vectors + idx; hlist_move_list(vec, heads++); levels++; } /* Is it time to look at the next level? */ if (clk & LVL_CLK_MASK) break; /* Shift clock for the next level granularity */ clk >>= LVL_CLK_SHIFT; } return levels; } /* * Find the next pending bucket of a level. Search from level start (@offset) * + @clk upwards and if nothing there, search from start of the level * (@offset) up to @offset + clk. */ static int next_pending_bucket(struct timer_base *base, unsigned offset, unsigned clk) { unsigned pos, start = offset + clk; unsigned end = offset + LVL_SIZE; pos = find_next_bit(base->pending_map, end, start); if (pos < end) return pos - start; pos = find_next_bit(base->pending_map, start, offset); return pos < start ? pos + LVL_SIZE - start : -1; } /* * Search the first expiring timer in the various clock levels. Caller must * hold base->lock. * * Store next expiry time in base->next_expiry. */ static void timer_recalc_next_expiry(struct timer_base *base) { unsigned long clk, next, adj; unsigned lvl, offset = 0; next = base->clk + TIMER_NEXT_MAX_DELTA; clk = base->clk; for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { int pos = next_pending_bucket(base, offset, clk & LVL_MASK); unsigned long lvl_clk = clk & LVL_CLK_MASK; if (pos >= 0) { unsigned long tmp = clk + (unsigned long) pos; tmp <<= LVL_SHIFT(lvl); if (time_before(tmp, next)) next = tmp; /* * If the next expiration happens before we reach * the next level, no need to check further. */ if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK)) break; } /* * Clock for the next level. If the current level clock lower * bits are zero, we look at the next level as is. If not we * need to advance it by one because that's going to be the * next expiring bucket in that level. base->clk is the next * expiring jiffy. So in case of: * * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 * 0 0 0 0 0 0 * * we have to look at all levels @index 0. With * * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 * 0 0 0 0 0 2 * * LVL0 has the next expiring bucket @index 2. The upper * levels have the next expiring bucket @index 1. * * In case that the propagation wraps the next level the same * rules apply: * * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 * 0 0 0 0 F 2 * * So after looking at LVL0 we get: * * LVL5 LVL4 LVL3 LVL2 LVL1 * 0 0 0 1 0 * * So no propagation from LVL1 to LVL2 because that happened * with the add already, but then we need to propagate further * from LVL2 to LVL3. 
* * So the simple check whether the lower bits of the current * level are 0 or not is sufficient for all cases. */ adj = lvl_clk ? 1 : 0; clk >>= LVL_CLK_SHIFT; clk += adj; } WRITE_ONCE(base->next_expiry, next); base->next_expiry_recalc = false; base->timers_pending = !(next == base->clk + TIMER_NEXT_MAX_DELTA); } #ifdef CONFIG_NO_HZ_COMMON /* * Check, if the next hrtimer event is before the next timer wheel * event: */ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) { u64 nextevt = hrtimer_get_next_event(); /* * If high resolution timers are enabled * hrtimer_get_next_event() returns KTIME_MAX. */ if (expires <= nextevt) return expires; /* * If the next timer is already expired, return the tick base * time so the tick is fired immediately. */ if (nextevt <= basem) return basem; /* * Round up to the next jiffy. High resolution timers are * off, so the hrtimers are expired in the tick and we need to * make sure that this tick really expires the timer to avoid * a ping pong of the nohz stop code. * * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3 */ return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC; } static unsigned long next_timer_interrupt(struct timer_base *base, unsigned long basej) { if (base->next_expiry_recalc) timer_recalc_next_expiry(base); /* * Move next_expiry for the empty base into the future to prevent an * unnecessary raise of the timer softirq when the next_expiry value * will be reached even if there is no timer pending. * * This update is also required to make timer_base::next_expiry values * easy comparable to find out which base holds the first pending timer. */ if (!base->timers_pending) WRITE_ONCE(base->next_expiry, basej + TIMER_NEXT_MAX_DELTA); return base->next_expiry; } static unsigned long fetch_next_timer_interrupt(unsigned long basej, u64 basem, struct timer_base *base_local, struct timer_base *base_global, struct timer_events *tevt) { unsigned long nextevt, nextevt_local, nextevt_global; bool local_first; nextevt_local = next_timer_interrupt(base_local, basej); nextevt_global = next_timer_interrupt(base_global, basej); local_first = time_before_eq(nextevt_local, nextevt_global); nextevt = local_first ? nextevt_local : nextevt_global; /* * If the @nextevt is at max. one tick away, use @nextevt and store * it in the local expiry value. The next global event is irrelevant in * this case and can be left as KTIME_MAX. */ if (time_before_eq(nextevt, basej + 1)) { /* If we missed a tick already, force 0 delta */ if (time_before(nextevt, basej)) nextevt = basej; tevt->local = basem + (u64)(nextevt - basej) * TICK_NSEC; /* * This is required for the remote check only but it doesn't * hurt, when it is done for both call sites: * * * The remote callers will only take care of the global timers * as local timers will be handled by CPU itself. When not * updating tevt->global with the already missed first global * timer, it is possible that it will be missed completely. * * * The local callers will ignore the tevt->global anyway, when * nextevt is max. one tick away. */ if (!local_first) tevt->global = tevt->local; return nextevt; } /* * Update tevt.* values: * * If the local queue expires first, then the global event can be * ignored. If the global queue is empty, nothing to do either. 
*/ if (!local_first && base_global->timers_pending) tevt->global = basem + (u64)(nextevt_global - basej) * TICK_NSEC; if (base_local->timers_pending) tevt->local = basem + (u64)(nextevt_local - basej) * TICK_NSEC; return nextevt; } # ifdef CONFIG_SMP /** * fetch_next_timer_interrupt_remote() - Store next timers into @tevt * @basej: base time jiffies * @basem: base time clock monotonic * @tevt: Pointer to the storage for the expiry values * @cpu: Remote CPU * * Stores the next pending local and global timer expiry values in the * struct pointed to by @tevt. If a queue is empty the corresponding * field is set to KTIME_MAX. If local event expires before global * event, global event is set to KTIME_MAX as well. * * Caller needs to make sure timer base locks are held (use * timer_lock_remote_bases() for this purpose). */ void fetch_next_timer_interrupt_remote(unsigned long basej, u64 basem, struct timer_events *tevt, unsigned int cpu) { struct timer_base *base_local, *base_global; /* Preset local / global events */ tevt->local = tevt->global = KTIME_MAX; base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu); base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); lockdep_assert_held(&base_local->lock); lockdep_assert_held(&base_global->lock); fetch_next_timer_interrupt(basej, basem, base_local, base_global, tevt); } /** * timer_unlock_remote_bases - unlock timer bases of cpu * @cpu: Remote CPU * * Unlocks the remote timer bases. */ void timer_unlock_remote_bases(unsigned int cpu) __releases(timer_bases[BASE_LOCAL]->lock) __releases(timer_bases[BASE_GLOBAL]->lock) { struct timer_base *base_local, *base_global; base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu); base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); raw_spin_unlock(&base_global->lock); raw_spin_unlock(&base_local->lock); } /** * timer_lock_remote_bases - lock timer bases of cpu * @cpu: Remote CPU * * Locks the remote timer bases. */ void timer_lock_remote_bases(unsigned int cpu) __acquires(timer_bases[BASE_LOCAL]->lock) __acquires(timer_bases[BASE_GLOBAL]->lock) { struct timer_base *base_local, *base_global; base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu); base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); lockdep_assert_irqs_disabled(); raw_spin_lock(&base_local->lock); raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING); } /** * timer_base_is_idle() - Return whether timer base is set idle * * Returns value of local timer base is_idle value. */ bool timer_base_is_idle(void) { return __this_cpu_read(timer_bases[BASE_LOCAL].is_idle); } static void __run_timer_base(struct timer_base *base); /** * timer_expire_remote() - expire global timers of cpu * @cpu: Remote CPU * * Expire timers of global base of remote CPU. */ void timer_expire_remote(unsigned int cpu) { struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); __run_timer_base(base); } static void timer_use_tmigr(unsigned long basej, u64 basem, unsigned long *nextevt, bool *tick_stop_path, bool timer_base_idle, struct timer_events *tevt) { u64 next_tmigr; if (timer_base_idle) next_tmigr = tmigr_cpu_new_timer(tevt->global); else if (tick_stop_path) next_tmigr = tmigr_cpu_deactivate(tevt->global); else next_tmigr = tmigr_quick_check(tevt->global); /* * If the CPU is the last going idle in timer migration hierarchy, make * sure the CPU will wake up in time to handle remote timers. * next_tmigr == KTIME_MAX if other CPUs are still active. 
*/ if (next_tmigr < tevt->local) { u64 tmp; /* If we missed a tick already, force 0 delta */ if (next_tmigr < basem) next_tmigr = basem; tmp = div_u64(next_tmigr - basem, TICK_NSEC); *nextevt = basej + (unsigned long)tmp; tevt->local = next_tmigr; } } # else static void timer_use_tmigr(unsigned long basej, u64 basem, unsigned long *nextevt, bool *tick_stop_path, bool timer_base_idle, struct timer_events *tevt) { /* * Make sure first event is written into tevt->local to not miss a * timer on !SMP systems. */ tevt->local = min_t(u64, tevt->local, tevt->global); } # endif /* CONFIG_SMP */ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem, bool *idle) { struct timer_events tevt = { .local = KTIME_MAX, .global = KTIME_MAX }; struct timer_base *base_local, *base_global; unsigned long nextevt; bool idle_is_possible; /* * When the CPU is offline, the tick is cancelled and nothing is supposed * to try to stop it. */ if (WARN_ON_ONCE(cpu_is_offline(smp_processor_id()))) { if (idle) *idle = true; return tevt.local; } base_local = this_cpu_ptr(&timer_bases[BASE_LOCAL]); base_global = this_cpu_ptr(&timer_bases[BASE_GLOBAL]); raw_spin_lock(&base_local->lock); raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING); nextevt = fetch_next_timer_interrupt(basej, basem, base_local, base_global, &tevt); /* * If the next event is only one jiffy ahead there is no need to call * timer migration hierarchy related functions. The value for the next * global timer in @tevt struct equals then KTIME_MAX. This is also * true, when the timer base is idle. * * The proper timer migration hierarchy function depends on the callsite * and whether timer base is idle or not. @nextevt will be updated when * this CPU needs to handle the first timer migration hierarchy * event. See timer_use_tmigr() for detailed information. */ idle_is_possible = time_after(nextevt, basej + 1); if (idle_is_possible) timer_use_tmigr(basej, basem, &nextevt, idle, base_local->is_idle, &tevt); /* * We have a fresh next event. Check whether we can forward the * base. */ __forward_timer_base(base_local, basej); __forward_timer_base(base_global, basej); /* * Set base->is_idle only when caller is timer_base_try_to_set_idle() */ if (idle) { /* * Bases are idle if the next event is more than a tick * away. Caution: @nextevt could have changed by enqueueing a * global timer into timer migration hierarchy. Therefore a new * check is required here. * * If the base is marked idle then any timer add operation must * forward the base clk itself to keep granularity small. This * idle logic is only maintained for the BASE_LOCAL and * BASE_GLOBAL base, deferrable timers may still see large * granularity skew (by design). */ if (!base_local->is_idle && time_after(nextevt, basej + 1)) { base_local->is_idle = true; /* * Global timers queued locally while running in a task * in nohz_full mode need a self-IPI to kick reprogramming * in IRQ tail. */ if (tick_nohz_full_cpu(base_local->cpu)) base_global->is_idle = true; trace_timer_base_idle(true, base_local->cpu); } *idle = base_local->is_idle; /* * When timer base is not set idle, undo the effect of * tmigr_cpu_deactivate() to prevent inconsistent states - active * timer base but inactive timer migration hierarchy. * * When timer base was already marked idle, nothing will be * changed here. 
*/ if (!base_local->is_idle && idle_is_possible) tmigr_cpu_activate(); } raw_spin_unlock(&base_global->lock); raw_spin_unlock(&base_local->lock); return cmp_next_hrtimer_event(basem, tevt.local); } /** * get_next_timer_interrupt() - return the time (clock mono) of the next timer * @basej: base time jiffies * @basem: base time clock monotonic * * Returns the tick aligned clock monotonic time of the next pending timer or * KTIME_MAX if no timer is pending. If timer of global base was queued into * timer migration hierarchy, first global timer is not taken into account. If * it was the last CPU of timer migration hierarchy going idle, first global * event is taken into account. */ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) { return __get_next_timer_interrupt(basej, basem, NULL); } /** * timer_base_try_to_set_idle() - Try to set the idle state of the timer bases * @basej: base time jiffies * @basem: base time clock monotonic * @idle: pointer to store the value of timer_base->is_idle on return; * *idle contains the information whether tick was already stopped * * Returns the tick aligned clock monotonic time of the next pending timer or * KTIME_MAX if no timer is pending. When tick was already stopped KTIME_MAX is * returned as well. */ u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle) { if (*idle) return KTIME_MAX; return __get_next_timer_interrupt(basej, basem, idle); } /** * timer_clear_idle - Clear the idle state of the timer base * * Called with interrupts disabled */ void timer_clear_idle(void) { int this_cpu = smp_processor_id(); /* * We do this unlocked. The worst outcome is a remote pinned timer * enqueue sending a pointless IPI, but taking the lock would just * make the window for sending the IPI a few instructions smaller * for the cost of taking the lock in the exit from idle * path. Required for BASE_LOCAL only. */ __this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false); if (tick_nohz_full_cpu(this_cpu)) __this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false); trace_timer_base_idle(false, this_cpu); /* Activate without holding the timer_base->lock */ tmigr_cpu_activate(); } #endif /** * __run_timers - run all expired timers (if any) on this CPU. * @base: the timer vector to be processed. */ static inline void __run_timers(struct timer_base *base) { struct hlist_head heads[LVL_DEPTH]; int levels; lockdep_assert_held(&base->lock); if (base->running_timer) return; while (time_after_eq(jiffies, base->clk) && time_after_eq(jiffies, base->next_expiry)) { levels = collect_expired_timers(base, heads); /* * The two possible reasons for not finding any expired * timer at this clk are that all matching timers have been * dequeued or no timer has been queued since * base::next_expiry was set to base::clk + * TIMER_NEXT_MAX_DELTA. */ WARN_ON_ONCE(!levels && !base->next_expiry_recalc && base->timers_pending); /* * While executing timers, base->clk is set 1 offset ahead of * jiffies to avoid endless requeuing to current jiffies. 
*/ base->clk++; timer_recalc_next_expiry(base); while (levels--) expire_timers(base, heads + levels); } } static void __run_timer_base(struct timer_base *base) { /* Can race against a remote CPU updating next_expiry under the lock */ if (time_before(jiffies, READ_ONCE(base->next_expiry))) return; timer_base_lock_expiry(base); raw_spin_lock_irq(&base->lock); __run_timers(base); raw_spin_unlock_irq(&base->lock); timer_base_unlock_expiry(base); } static void run_timer_base(int index) { struct timer_base *base = this_cpu_ptr(&timer_bases[index]); __run_timer_base(base); } /* * This function runs timers and the timer-tq in bottom half context. */ static __latent_entropy void run_timer_softirq(void) { run_timer_base(BASE_LOCAL); if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) { run_timer_base(BASE_GLOBAL); run_timer_base(BASE_DEF); if (is_timers_nohz_active()) tmigr_handle_remote(); } } /* * Called by the local, per-CPU timer interrupt on SMP. */ static void run_local_timers(void) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_LOCAL]); hrtimer_run_queues(); for (int i = 0; i < NR_BASES; i++, base++) { /* * Raise the softirq only if required. * * timer_base::next_expiry can be written by a remote CPU while * holding the lock. If this write happens at the same time than * the lockless local read, sanity checker could complain about * data corruption. * * There are two possible situations where * timer_base::next_expiry is written by a remote CPU: * * 1. Remote CPU expires global timers of this CPU and updates * timer_base::next_expiry of BASE_GLOBAL afterwards in * next_timer_interrupt() or timer_recalc_next_expiry(). The * worst outcome is a superfluous raise of the timer softirq * when the not yet updated value is read. * * 2. A new first pinned timer is enqueued by a remote CPU * and therefore timer_base::next_expiry of BASE_LOCAL is * updated. When this update is missed, this isn't a * problem, as an IPI is executed nevertheless when the CPU * was idle before. When the CPU wasn't idle but the update * is missed, then the timer would expire one jiffy late - * bad luck. * * Those unlikely corner cases where the worst outcome is only a * one jiffy delay or a superfluous raise of the softirq are * not that expensive as doing the check always while holding * the lock. * * Possible remote writers are using WRITE_ONCE(). Local reader * uses therefore READ_ONCE(). */ if (time_after_eq(jiffies, READ_ONCE(base->next_expiry)) || (i == BASE_DEF && tmigr_requires_handle_remote())) { raise_timer_softirq(TIMER_SOFTIRQ); return; } } } /* * Called from the timer interrupt handler to charge one tick to the current * process. user_tick is 1 if the tick is user time, 0 for system. */ void update_process_times(int user_tick) { struct task_struct *p = current; /* Note: this timer irq context must be accounted for as well. 
 */
	account_process_tick(p, user_tick);
	run_local_timers();
	rcu_sched_clock_irq(user_tick);
#ifdef CONFIG_IRQ_WORK
	if (in_hardirq())
		irq_work_tick();
#endif
	sched_tick();
	if (IS_ENABLED(CONFIG_POSIX_TIMERS))
		run_posix_cpu_timers();
}

#ifdef CONFIG_HOTPLUG_CPU
static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head)
{
	struct timer_list *timer;
	int cpu = new_base->cpu;

	while (!hlist_empty(head)) {
		timer = hlist_entry(head->first, struct timer_list, entry);
		detach_timer(timer, false);
		timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
		internal_add_timer(new_base, timer);
	}
}

int timers_prepare_cpu(unsigned int cpu)
{
	struct timer_base *base;
	int b;

	for (b = 0; b < NR_BASES; b++) {
		base = per_cpu_ptr(&timer_bases[b], cpu);
		base->clk = jiffies;
		base->next_expiry = base->clk + TIMER_NEXT_MAX_DELTA;
		base->next_expiry_recalc = false;
		base->timers_pending = false;
		base->is_idle = false;
	}
	return 0;
}

int timers_dead_cpu(unsigned int cpu)
{
	struct timer_base *old_base;
	struct timer_base *new_base;
	int b, i;

	for (b = 0; b < NR_BASES; b++) {
		old_base = per_cpu_ptr(&timer_bases[b], cpu);
		new_base = get_cpu_ptr(&timer_bases[b]);
		/*
		 * The caller is globally serialized and nobody else
		 * takes two locks at once, deadlock is not possible.
		 */
		raw_spin_lock_irq(&new_base->lock);
		raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);

		/*
		 * The current CPUs base clock might be stale. Update it
		 * before moving the timers over.
		 */
		forward_timer_base(new_base);

		WARN_ON_ONCE(old_base->running_timer);
		old_base->running_timer = NULL;

		for (i = 0; i < WHEEL_SIZE; i++)
			migrate_timer_list(new_base, old_base->vectors + i);

		raw_spin_unlock(&old_base->lock);
		raw_spin_unlock_irq(&new_base->lock);
		put_cpu_ptr(&timer_bases);
	}
	return 0;
}
#endif /* CONFIG_HOTPLUG_CPU */

static void __init init_timer_cpu(int cpu)
{
	struct timer_base *base;
	int i;

	for (i = 0; i < NR_BASES; i++) {
		base = per_cpu_ptr(&timer_bases[i], cpu);
		base->cpu = cpu;
		raw_spin_lock_init(&base->lock);
		base->clk = jiffies;
		base->next_expiry = base->clk + TIMER_NEXT_MAX_DELTA;
		timer_base_init_expiry_lock(base);
	}
}

static void __init init_timer_cpus(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		init_timer_cpu(cpu);
}

void __init timers_init(void)
{
	init_timer_cpus();
	posix_cputimers_init_work();
	open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}
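/*
 * Illustrative end-to-end usage of the API implemented in this file, as a
 * hypothetical driver might use it (all identifiers below are made up):
 *
 *	struct my_dev {
 *		struct timer_list poll_timer;
 *	};
 *
 *	static void my_poll(struct timer_list *t)
 *	{
 *		struct my_dev *dev = container_of(t, struct my_dev, poll_timer);
 *
 *		poll_hardware(dev);
 *		mod_timer(&dev->poll_timer,
 *			  jiffies + round_jiffies_relative(HZ));
 *	}
 *
 * Setup:
 *
 *	timer_setup(&dev->poll_timer, my_poll, 0);
 *	mod_timer(&dev->poll_timer, jiffies + HZ);
 *
 * Teardown (prevents any further rearming and waits for a running callback):
 *
 *	timer_shutdown_sync(&dev->poll_timer);
 */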
2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 
3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support six policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * weighted interleave
 *                Allocate memory interleaved over a set of nodes based on
 *                a set of weights (per-node), with normal fallback if it
 *                fails. Otherwise operates the same as interleave.
 *                Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
 *                on node 0 for every 1 page allocated on node 1.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case NUMA_NO_NODE here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * preferred many Try a set of nodes first before normal fallback. This is
 *                similar to preferred without the special case.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
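 *
 * As an illustration only (not part of this file): a userspace task would
 * normally select one of these policies through the set_mempolicy(2) or
 * mbind(2) syscalls. The sketch below is hedged: it assumes libnuma's
 * <numaif.h> prototypes and a machine with at least nodes 0 and 1, and is
 * a minimal usage example rather than kernel code.
 *
 *	#include <numaif.h>		// set_mempolicy(), MPOL_INTERLEAVE
 *	#include <stdlib.h>
 *
 *	int main(void)
 *	{
 *		// Bit mask of allowed nodes: here nodes 0 and 1.
 *		unsigned long nodemask = (1UL << 0) | (1UL << 1);
 *
 *		// Interleave this task's future allocations across
 *		// nodes 0 and 1; returns 0 on success, -1 on error.
 *		if (set_mempolicy(MPOL_INTERLEAVE, &nodemask,
 *				  sizeof(nodemask) * 8) != 0)
 *			return EXIT_FAILURE;
 *
 *		// Pages faulted in from now on are interleaved.
 *		void *buf = malloc(1 << 20);
 *		return buf ? EXIT_SUCCESS : EXIT_FAILURE;
 *	}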
 *
 * For shmem/tmpfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/memory-tiers.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/leafops.h>
#include <linux/gcd.h>

#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <linux/uaccess.h>
#include <linux/memory.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
#define MPOL_MF_WRLOCK       (MPOL_MF_INTERNAL << 2)	/* Write-lock walked vmas */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

/*
 * weightiness balances the tradeoff between small weights (cycles through nodes
 * faster, more fair/even distribution) and large weights (smaller errors
 * between actual bandwidth ratios and weight ratios). 32 is a number that has
 * been found to perform at a reasonable compromise between the two goals.
 */
static const int weightiness = 32;

/*
 * A null weighted_interleave_state is interpreted as having .mode="auto",
 * and .iw_table is interpreted as an array of 1s with length nr_node_ids.
 */
struct weighted_interleave_state {
	bool mode_auto;
	u8 iw_table[];
};
static struct weighted_interleave_state __rcu *wi_state;
static unsigned int *node_bw_table;

/*
 * wi_state_lock protects both wi_state and node_bw_table.
 * node_bw_table is only used by writers to update wi_state.
 */
static DEFINE_MUTEX(wi_state_lock);

static u8 get_il_weight(int node)
{
	struct weighted_interleave_state *state;
	u8 weight = 1;

	rcu_read_lock();
	state = rcu_dereference(wi_state);
	if (state)
		weight = state->iw_table[node];
	rcu_read_unlock();
	return weight;
}

/*
 * Convert bandwidth values into weighted interleave weights.
 * Call with wi_state_lock held.
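 *
 * Worked example (illustrative numbers only): with two N_MEMORY nodes
 * reporting bandwidths bw[0] = 100 and bw[1] = 50 (arbitrary units),
 * sum_bw = 150. For node 0 the scaling factor is 32 * 100 = 3200, so
 * new_iw[0] = 3200 / 150 = 21; for node 1 it is 32 * 50 = 1600, so
 * new_iw[1] = 1600 / 150 = 10. gcd(21, 10) = 1, leaving the final
 * weights at 21:10, which approximates the 2:1 bandwidth ratio within
 * the resolution allowed by weightiness.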
 */
static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw)
{
	u64 sum_bw = 0;
	unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0;
	int nid;

	for_each_node_state(nid, N_MEMORY)
		sum_bw += bw[nid];

	/* Scale bandwidths to whole numbers in the range [1, weightiness] */
	for_each_node_state(nid, N_MEMORY) {
		/*
		 * Try not to perform 64-bit division.
		 * If sum_bw < scaling_factor, then sum_bw < U32_MAX.
		 * If sum_bw > scaling_factor, then round the weight up to 1.
		 */
		scaling_factor = weightiness * bw[nid];
		if (bw[nid] && sum_bw < scaling_factor) {
			cast_sum_bw = (unsigned int)sum_bw;
			new_iw[nid] = scaling_factor / cast_sum_bw;
		} else {
			new_iw[nid] = 1;
		}
		if (!iw_gcd)
			iw_gcd = new_iw[nid];
		iw_gcd = gcd(iw_gcd, new_iw[nid]);
	}

	/* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */
	for_each_node_state(nid, N_MEMORY)
		new_iw[nid] /= iw_gcd;
}

int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
{
	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
	unsigned int *old_bw, *new_bw;
	unsigned int bw_val;
	int i;

	bw_val = min(coords->read_bandwidth, coords->write_bandwidth);

	new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
	if (!new_bw)
		return -ENOMEM;

	new_wi_state = kmalloc_flex(*new_wi_state, iw_table, nr_node_ids);
	if (!new_wi_state) {
		kfree(new_bw);
		return -ENOMEM;
	}
	new_wi_state->mode_auto = true;
	for (i = 0; i < nr_node_ids; i++)
		new_wi_state->iw_table[i] = 1;

	/*
	 * Update bandwidth info, even in manual mode. That way, when switching
	 * to auto mode in the future, iw_table can be overwritten using
	 * accurate bw data.
	 */
	mutex_lock(&wi_state_lock);
	old_bw = node_bw_table;
	if (old_bw)
		memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw));
	new_bw[node] = bw_val;
	node_bw_table = new_bw;

	old_wi_state = rcu_dereference_protected(wi_state,
		lockdep_is_held(&wi_state_lock));
	if (old_wi_state && !old_wi_state->mode_auto) {
		/* Manual mode; skip reducing weights and updating wi_state */
		mutex_unlock(&wi_state_lock);
		kfree(new_wi_state);
		goto out;
	}

	/* NULL wi_state assumes auto=true; reduce weights and update wi_state */
	reduce_interleave_weights(new_bw, new_wi_state->iw_table);
	rcu_assign_pointer(wi_state, new_wi_state);
	mutex_unlock(&wi_state_lock);
	if (old_wi_state) {
		synchronize_rcu();
		kfree(old_wi_state);
	}
out:
	kfree(old_bw);
	return 0;
}

/**
 * numa_nearest_node - Find nearest node by state
 * @node: Node id to start the search
 * @state: State to filter the search
 *
 * Lookup the closest node by distance if @node is not in state.
 *
 * Return: this @node if it is in state, otherwise the closest node by distance
 */
int numa_nearest_node(int node, unsigned int state)
{
	int min_dist = INT_MAX, dist, n, min_node;

	if (state >= NR_NODE_STATES)
		return -EINVAL;

	if (node == NUMA_NO_NODE || node_state(node, state))
		return node;

	min_node = node;
	for_each_node_state(n, state) {
		dist = node_distance(node, n);
		if (dist < min_dist) {
			min_dist = dist;
			min_node = n;
		}
	}

	return min_node;
}
EXPORT_SYMBOL_GPL(numa_nearest_node);

/**
 * nearest_node_nodemask - Find the node in @mask at the nearest distance
 * from @node.
 *
 * @node: a valid node ID to start the search from.
 * @mask: a pointer to a nodemask representing the allowed nodes.
 *
 * This function iterates over all nodes in @mask and calculates the
 * distance from the starting @node, then it returns the node ID that is
 * the closest to @node, or MAX_NUMNODES if no node is found.
* * Note that @node must be a valid node ID usable with node_distance(), * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes * or unexpected behavior. */ int nearest_node_nodemask(int node, nodemask_t *mask) { int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES; for_each_node_mask(n, *mask) { dist = node_distance(node, n); if (dist < min_dist) { min_dist = dist; min_node = n; } } return min_node; } EXPORT_SYMBOL_GPL(nearest_node_nodemask); struct mempolicy *get_task_policy(struct task_struct *p) { struct mempolicy *pol = p->mempolicy; int node; if (pol) return pol; node = numa_node_id(); if (node != NUMA_NO_NODE) { pol = &preferred_node_policy[node]; /* preferred_node_policy is not initialised early in boot */ if (pol->mode) return pol; } return &default_policy; } EXPORT_SYMBOL_FOR_MODULES(get_task_policy, "kvm"); static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); } mpol_ops[MPOL_MAX]; static inline int mpol_store_user_nodemask(const struct mempolicy *pol) { return pol->flags & MPOL_USER_NODEMASK_FLAGS; } static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, const nodemask_t *rel) { nodemask_t tmp; nodes_fold(tmp, *orig, nodes_weight(*rel)); nodes_onto(*ret, tmp, *rel); } static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) return -EINVAL; pol->nodes = *nodes; return 0; } static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) return -EINVAL; nodes_clear(pol->nodes); node_set(first_node(*nodes), pol->nodes); return 0; } /* * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if * any, for the new policy. mpol_new() has already validated the nodes * parameter with respect to the policy mode and flags. * * Must be called holding task's alloc_lock to protect task's mems_allowed * and mempolicy. May also be called holding the mmap_lock for write. */ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes, struct nodemask_scratch *nsc) { int ret; /* * Default (pol==NULL) resp. local memory policies are not a * subject of any remapping. They also do not need any special * constructor. */ if (!pol || pol->mode == MPOL_LOCAL) return 0; /* Check N_MEMORY */ nodes_and(nsc->mask1, cpuset_current_mems_allowed, node_states[N_MEMORY]); VM_BUG_ON(!nodes); if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); else nodes_and(nsc->mask2, *nodes, nsc->mask1); if (mpol_store_user_nodemask(pol)) pol->w.user_nodemask = *nodes; else pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed; ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); return ret; } /* * This function just creates a new policy, does some check and simple * initialization. You must invoke mpol_set_nodemask() to set nodes. */ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *policy; if (mode == MPOL_DEFAULT) { if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); return NULL; } VM_BUG_ON(!nodes); /* * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). * All other modes require a valid pointer to a non-empty nodemask. 
*/ if (mode == MPOL_PREFERRED) { if (nodes_empty(*nodes)) { if (((flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES))) return ERR_PTR(-EINVAL); mode = MPOL_LOCAL; } } else if (mode == MPOL_LOCAL) { if (!nodes_empty(*nodes) || (flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES)) return ERR_PTR(-EINVAL); } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!policy) return ERR_PTR(-ENOMEM); atomic_set(&policy->refcnt, 1); policy->mode = mode; policy->flags = flags; policy->home_node = NUMA_NO_NODE; return policy; } /* Slow path of a mpol destructor. */ void __mpol_put(struct mempolicy *pol) { if (!atomic_dec_and_test(&pol->refcnt)) return; /* * Required to allow mmap_lock_speculative*() access, see for example * futex_key_to_node_opt(). All accesses are serialized by mmap_lock, * however the speculative lock section unbound by the normal lock * boundaries, requiring RCU freeing. */ kfree_rcu(pol, rcu); } EXPORT_SYMBOL_FOR_MODULES(__mpol_put, "kvm"); static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) { } static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { nodemask_t tmp; if (pol->flags & MPOL_F_STATIC_NODES) nodes_and(tmp, pol->w.user_nodemask, *nodes); else if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed, *nodes); pol->w.cpuset_mems_allowed = *nodes; } if (nodes_empty(tmp)) tmp = *nodes; pol->nodes = tmp; } static void mpol_rebind_preferred(struct mempolicy *pol, const nodemask_t *nodes) { pol->w.cpuset_mems_allowed = *nodes; } /* * mpol_rebind_policy - Migrate a policy to a different set of nodes * * Per-vma policies are protected by mmap_lock. Allocations using per-task * policies are protected by task->mems_allowed_seq to prevent a premature * OOM/allocation failure due to parallel nodemask modification. */ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { if (!pol || pol->mode == MPOL_LOCAL) return; if (!mpol_store_user_nodemask(pol) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; mpol_ops[pol->mode].rebind(pol, newmask); } /* * Wrapper for mpol_rebind_policy() that just requires task * pointer, and updates task mempolicy. * * Called with task's alloc_lock held. */ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { mpol_rebind_policy(tsk->mempolicy, new); } /* * Rebind each vma in mm to new nodemask. * * Call holding a reference to mm. Takes mm->mmap_lock during call. 
*/ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); mmap_write_lock(mm); for_each_vma(vmi, vma) { vma_start_write(vma); mpol_rebind_policy(vma->vm_policy, new); } mmap_write_unlock(mm); } static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { [MPOL_DEFAULT] = { .rebind = mpol_rebind_default, }, [MPOL_INTERLEAVE] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, [MPOL_PREFERRED] = { .create = mpol_new_preferred, .rebind = mpol_rebind_preferred, }, [MPOL_BIND] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, [MPOL_LOCAL] = { .rebind = mpol_rebind_default, }, [MPOL_PREFERRED_MANY] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_preferred, }, [MPOL_WEIGHTED_INTERLEAVE] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, }; static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags); static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, pgoff_t ilx, int *nid); static bool strictly_unmovable(unsigned long flags) { /* * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO * if any misplaced page is found. */ return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) == MPOL_MF_STRICT; } struct migration_mpol { /* for alloc_migration_target_by_mpol() */ struct mempolicy *pol; pgoff_t ilx; }; struct queue_pages { struct list_head *pagelist; unsigned long flags; nodemask_t *nmask; unsigned long start; unsigned long end; struct vm_area_struct *first; struct folio *large; /* note last large folio encountered */ long nr_failed; /* could not be isolated at this time */ }; /* * Check if the folio's nid is in qp->nmask. * * If MPOL_MF_INVERT is set in qp->flags, check if the nid is * in the invert of qp->nmask. */ static inline bool queue_folio_required(struct folio *folio, struct queue_pages *qp) { int nid = folio_nid(folio); unsigned long flags = qp->flags; return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); } static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) { struct folio *folio; struct queue_pages *qp = walk->private; if (unlikely(pmd_is_migration_entry(*pmd))) { qp->nr_failed++; return; } folio = pmd_folio(*pmd); if (is_huge_zero_folio(folio)) { walk->action = ACTION_CONTINUE; return; } if (!queue_folio_required(folio, qp)) return; if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(walk->vma) || !migrate_folio_add(folio, qp->pagelist, qp->flags)) qp->nr_failed++; } /* * Scan through folios, checking if they satisfy the required conditions, * moving them from LRU to local pagelist for migration if they do (or not). * * queue_folios_pte_range() has two possible return values: * 0 - continue walking to scan for more, even if an existing folio on the * wrong node could not be isolated and queued for migration. * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL, * and an existing folio was on a node that does not follow the policy. 
*/ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; struct folio *folio; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; pte_t *pte, *mapped_pte; pte_t ptent; spinlock_t *ptl; int max_nr, nr; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { queue_folios_pmd(pmd, walk); spin_unlock(ptl); goto out; } mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (!pte) { walk->action = ACTION_AGAIN; return 0; } for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) { max_nr = (end - addr) >> PAGE_SHIFT; nr = 1; ptent = ptep_get(pte); if (pte_none(ptent)) continue; if (!pte_present(ptent)) { const softleaf_t entry = softleaf_from_pte(ptent); if (softleaf_is_migration(entry)) qp->nr_failed++; continue; } folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; if (folio_test_large(folio) && max_nr != 1) nr = folio_pte_batch(folio, pte, ptent, max_nr); /* * vm_normal_folio() filters out zero pages, but there might * still be reserved folios to skip, perhaps in a VDSO. */ if (folio_test_reserved(folio)) continue; if (!queue_folio_required(folio, qp)) continue; if (folio_test_large(folio)) { /* * A large folio can only be isolated from LRU once, * but may be mapped by many PTEs (and Copy-On-Write may * intersperse PTEs of other, order 0, folios). This is * a common case, so don't mistake it for failure (but * there can be other cases of multi-mapped pages which * this quick check does not help to filter out - and a * search of the pagelist might grow to be prohibitive). * * migrate_pages(&pagelist) returns nr_failed folios, so * check "large" now so that queue_pages_range() returns * a comparable nr_failed folios. This does imply that * if folio could not be isolated for some racy reason * at its first PTE, later PTEs will not give it another * chance of isolation; but keeps the accounting simple. */ if (folio == qp->large) continue; qp->large = folio; } if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(vma) || !migrate_folio_add(folio, qp->pagelist, flags)) { qp->nr_failed += nr; if (strictly_unmovable(flags)) break; } } pte_unmap_unlock(mapped_pte, ptl); cond_resched(); out: if (qp->nr_failed && strictly_unmovable(flags)) return -EIO; return 0; } static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { #ifdef CONFIG_HUGETLB_PAGE struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; struct folio *folio; spinlock_t *ptl; pte_t ptep; ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); ptep = huge_ptep_get(walk->mm, addr, pte); if (!pte_present(ptep)) { if (!huge_pte_none(ptep)) { const softleaf_t entry = softleaf_from_pte(ptep); if (unlikely(softleaf_is_migration(entry))) qp->nr_failed++; } goto unlock; } folio = pfn_folio(pte_pfn(ptep)); if (!queue_folio_required(folio, qp)) goto unlock; if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(walk->vma)) { qp->nr_failed++; goto unlock; } /* * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * * See folio_maybe_mapped_shared() on possible imprecision when we * cannot easily detect if a folio is shared. 
*/ if ((flags & MPOL_MF_MOVE_ALL) || (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte))) if (!folio_isolate_hugetlb(folio, qp->pagelist)) qp->nr_failed++; unlock: spin_unlock(ptl); if (qp->nr_failed && strictly_unmovable(flags)) return -EIO; #endif return 0; } #ifdef CONFIG_NUMA_BALANCING /** * folio_can_map_prot_numa() - check whether the folio can map prot numa * @folio: The folio whose mapping considered for being made NUMA hintable * @vma: The VMA that the folio belongs to. * @is_private_single_threaded: Is this a single-threaded private VMA or not * * This function checks to see if the folio actually indicates that * we need to make the mapping one which causes a NUMA hinting fault, * as there are cases where it's simply unnecessary, and the folio's * access time is adjusted for memory tiering if prot numa needed. * * Return: True if the mapping of the folio needs to be changed, false otherwise. */ bool folio_can_map_prot_numa(struct folio *folio, struct vm_area_struct *vma, bool is_private_single_threaded) { int nid; if (!folio || folio_is_zone_device(folio) || folio_test_ksm(folio)) return false; /* Also skip shared copy-on-write folios */ if (is_cow_mapping(vma->vm_flags) && folio_maybe_mapped_shared(folio)) return false; /* Folios are pinned and can't be migrated */ if (folio_maybe_dma_pinned(folio)) return false; /* * While migration can move some dirty folios, * it cannot move them all from MIGRATE_ASYNC * context. */ if (folio_is_file_lru(folio) && folio_test_dirty(folio)) return false; /* * Don't mess with PTEs if folio is already on the node * a single-threaded process is running on. */ nid = folio_nid(folio); if (is_private_single_threaded && (nid == numa_node_id())) return false; /* * Skip scanning top tier node if normal numa * balancing is disabled */ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && node_is_toptier(nid)) return false; if (folio_use_access_time(folio)) folio_xchg_access_time(folio, jiffies_to_msecs(jiffies)); return true; } /* * This is used to mark a range of virtual addresses to be inaccessible. * These are later cleared by a NUMA hinting fault. Depending on these * faults, pages may be migrated for better NUMA placement. * * This is assuming that NUMA faults are handled using PROT_NONE. If * an architecture makes a different choice, it will need further * changes to the core. 
*/ unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { struct mmu_gather tlb; long nr_updated; tlb_gather_mmu(&tlb, vma->vm_mm); nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA); if (nr_updated > 0) { count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated); } tlb_finish_mmu(&tlb); return nr_updated; } #endif /* CONFIG_NUMA_BALANCING */ static int queue_pages_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *next, *vma = walk->vma; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; /* range check first */ VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma); if (!qp->first) { qp->first = vma; if (!(flags & MPOL_MF_DISCONTIG_OK) && (qp->start < vma->vm_start)) /* hole at head side of range */ return -EFAULT; } next = find_vma(vma->vm_mm, vma->vm_end); if (!(flags & MPOL_MF_DISCONTIG_OK) && ((vma->vm_end < qp->end) && (!next || vma->vm_end < next->vm_start))) /* hole at middle or tail of range */ return -EFAULT; /* * Need check MPOL_MF_STRICT to return -EIO if possible * regardless of vma_migratable */ if (!vma_migratable(vma) && !(flags & MPOL_MF_STRICT)) return 1; /* * Check page nodes, and queue pages to move, in the current vma. * But if no moving, and no strict checking, the scan can be skipped. */ if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) return 0; return 1; } static const struct mm_walk_ops queue_pages_walk_ops = { .hugetlb_entry = queue_folios_hugetlb, .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, .walk_lock = PGWALK_RDLOCK, }; static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = { .hugetlb_entry = queue_folios_hugetlb, .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, .walk_lock = PGWALK_WRLOCK, }; /* * Walk through page tables and collect pages to be migrated. * * If pages found in a given range are not on the required set of @nodes, * and migration is allowed, they are isolated and queued to @pagelist. * * queue_pages_range() may return: * 0 - all pages already on the right node, or successfully queued for moving * (or neither strict checking nor moving requested: only range checking). * >0 - this number of misplaced folios could not be queued for moving * (a hugetlbfs page or a transparent huge page being counted as 1). * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs. * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified. */ static long queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, nodemask_t *nodes, unsigned long flags, struct list_head *pagelist) { int err; struct queue_pages qp = { .pagelist = pagelist, .flags = flags, .nmask = nodes, .start = start, .end = end, .first = NULL, }; const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ? &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops; err = walk_page_range(mm, start, end, ops, &qp); if (!qp.first) /* whole range in hole */ err = -EFAULT; return err ? : qp.nr_failed; } /* * Apply policy to a single VMA * This must be called with the mmap_lock held for writing. 
*/ static int vma_replace_policy(struct vm_area_struct *vma, struct mempolicy *pol) { int err; struct mempolicy *old; struct mempolicy *new; vma_assert_write_locked(vma); new = mpol_dup(pol); if (IS_ERR(new)) return PTR_ERR(new); if (vma->vm_ops && vma->vm_ops->set_policy) { err = vma->vm_ops->set_policy(vma, new); if (err) goto err_out; } old = vma->vm_policy; WRITE_ONCE(vma->vm_policy, new); /* protected by mmap_lock */ mpol_put(old); return 0; err_out: mpol_put(new); return err; } /* Split or merge the VMA (if required) and apply the new policy */ static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, struct mempolicy *new_pol) { unsigned long vmstart, vmend; vmend = min(end, vma->vm_end); if (start > vma->vm_start) { *prev = vma; vmstart = start; } else { vmstart = vma->vm_start; } if (mpol_equal(vma->vm_policy, new_pol)) { *prev = vma; return 0; } vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol); if (IS_ERR(vma)) return PTR_ERR(vma); *prev = vma; return vma_replace_policy(vma, new_pol); } /* Set the process memory policy */ static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *new, *old; NODEMASK_SCRATCH(scratch); int ret; if (!scratch) return -ENOMEM; new = mpol_new(mode, flags, nodes); if (IS_ERR(new)) { ret = PTR_ERR(new); goto out; } task_lock(current); ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { task_unlock(current); mpol_put(new); goto out; } old = current->mempolicy; current->mempolicy = new; if (new && (new->mode == MPOL_INTERLEAVE || new->mode == MPOL_WEIGHTED_INTERLEAVE)) { current->il_prev = MAX_NUMNODES-1; current->il_weight = 0; } task_unlock(current); mpol_put(old); ret = 0; out: NODEMASK_SCRATCH_FREE(scratch); return ret; } /* * Return nodemask for policy for get_mempolicy() query * * Called with task's alloc_lock held */ static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes) { nodes_clear(*nodes); if (pol == &default_policy) return; switch (pol->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_WEIGHTED_INTERLEAVE: *nodes = pol->nodes; break; case MPOL_LOCAL: /* return empty node mask for local allocation */ break; default: BUG(); } } static int lookup_node(struct mm_struct *mm, unsigned long addr) { struct page *p = NULL; int ret; ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p); if (ret > 0) { ret = page_to_nid(p); put_page(p); } return ret; } /* Retrieve NUMA policy */ static long do_get_mempolicy(int *policy, nodemask_t *nmask, unsigned long addr, unsigned long flags) { int err; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL; if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) return -EINVAL; if (flags & MPOL_F_MEMS_ALLOWED) { if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; *policy = 0; /* just so it's initialized */ task_lock(current); *nmask = cpuset_current_mems_allowed; task_unlock(current); return 0; } if (flags & MPOL_F_ADDR) { pgoff_t ilx; /* ignored here */ /* * Do NOT fall back to task policy if the * vma/shared policy at addr is NULL. We * want to return MPOL_DEFAULT in this case. 
*/ mmap_read_lock(mm); vma = vma_lookup(mm, addr); if (!vma) { mmap_read_unlock(mm); return -EFAULT; } pol = __get_vma_policy(vma, addr, &ilx); } else if (addr) return -EINVAL; if (!pol) pol = &default_policy; /* indicates default behavior */ if (flags & MPOL_F_NODE) { if (flags & MPOL_F_ADDR) { /* * Take a refcount on the mpol, because we are about to * drop the mmap_lock, after which only "pol" remains * valid, "vma" is stale. */ pol_refcount = pol; vma = NULL; mpol_get(pol); mmap_read_unlock(mm); err = lookup_node(mm, addr); if (err < 0) goto out; *policy = err; } else if (pol == current->mempolicy && pol->mode == MPOL_INTERLEAVE) { *policy = next_node_in(current->il_prev, pol->nodes); } else if (pol == current->mempolicy && pol->mode == MPOL_WEIGHTED_INTERLEAVE) { if (current->il_weight) *policy = current->il_prev; else *policy = next_node_in(current->il_prev, pol->nodes); } else { err = -EINVAL; goto out; } } else { *policy = pol == &default_policy ? MPOL_DEFAULT : pol->mode; /* * Internal mempolicy flags must be masked off before exposing * the policy to userspace. */ *policy |= (pol->flags & MPOL_MODE_FLAGS); } err = 0; if (nmask) { if (mpol_store_user_nodemask(pol)) { *nmask = pol->w.user_nodemask; } else { task_lock(current); get_policy_nodemask(pol, nmask); task_unlock(current); } } out: mpol_cond_put(pol); if (vma) mmap_read_unlock(mm); if (pol_refcount) mpol_put(pol_refcount); return err; } #ifdef CONFIG_NUMA_MIGRATION static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { /* * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * * See folio_maybe_mapped_shared() on possible imprecision when we * cannot easily detect if a folio is shared. */ if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) { if (folio_isolate_lru(folio)) { list_add_tail(&folio->lru, foliolist); node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); } else { /* * Non-movable folio may reach here. And, there may be * temporary off LRU folios or non-LRU movable folios. * Treat them as unmovable folios since they can't be * isolated, so they can't be moved at the moment. */ return false; } } return true; } /* * Migrate pages from one node to a target node. * Returns error or the number of pages not migrated. */ static long migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) { nodemask_t nmask; struct vm_area_struct *vma; LIST_HEAD(pagelist); long nr_failed; long err = 0; struct migration_target_control mtc = { .nid = dest, .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, .reason = MR_SYSCALL, }; nodes_clear(nmask); node_set(source, nmask); VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); mmap_read_lock(mm); vma = find_vma(mm, 0); if (unlikely(!vma)) { mmap_read_unlock(mm); return 0; } /* * This does not migrate the range, but isolates all pages that * need migration. Between passing in the full user address * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail, * but passes back the count of pages which could not be isolated. 
*/ nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); mmap_read_unlock(mm); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); if (err) putback_movable_pages(&pagelist); } if (err >= 0) err += nr_failed; return err; } /* * Move pages between the two nodesets so as to preserve the physical * layout as much as possible. * * Returns the number of page that could not be moved. */ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { long nr_failed = 0; long err = 0; nodemask_t tmp; lru_cache_disable(); /* * Find a 'source' bit set in 'tmp' whose corresponding 'dest' * bit in 'to' is not also set in 'tmp'. Clear the found 'source' * bit in 'tmp', and return that <source, dest> pair for migration. * The pair of nodemasks 'to' and 'from' define the map. * * If no pair of bits is found that way, fallback to picking some * pair of 'source' and 'dest' bits that are not the same. If the * 'source' and 'dest' bits are the same, this represents a node * that will be migrating to itself, so no pages need move. * * If no bits are left in 'tmp', or if all remaining bits left * in 'tmp' correspond to the same bit in 'to', return false * (nothing left to migrate). * * This lets us pick a pair of nodes to migrate between, such that * if possible the dest node is not already occupied by some other * source node, minimizing the risk of overloading the memory on a * node that would happen if we migrated incoming memory to a node * before migrating outgoing memory source that same node. * * A single scan of tmp is sufficient. As we go, we remember the * most recent <s, d> pair that moved (s != d). If we find a pair * that not only moved, but what's better, moved to an empty slot * (d is not set in tmp), then we break out then, with that pair. * Otherwise when we finish scanning from_tmp, we at least have the * most recent <s, d> pair that moved. If we get all the way through * the scan of tmp without finding any node that moved, much less * moved to an empty node, then there is nothing left worth migrating. */ tmp = *from; while (!nodes_empty(tmp)) { int s, d; int source = NUMA_NO_NODE; int dest = 0; for_each_node_mask(s, tmp) { /* * do_migrate_pages() tries to maintain the relative * node relationship of the pages established between * threads and memory areas. * * However if the number of source nodes is not equal to * the number of destination nodes we can not preserve * this node relative relationship. In that case, skip * copying memory from a node that is in the destination * mask. * * Example: [2,3,4] -> [3,4,5] moves everything. * [0-7] - > [3,4,5] moves only 0,1,2,6,7. */ if ((nodes_weight(*from) != nodes_weight(*to)) && (node_isset(s, *to))) continue; d = node_remap(s, *from, *to); if (s == d) continue; source = s; /* Node moved. Memorize */ dest = d; /* dest not in remaining from nodes? */ if (!node_isset(dest, tmp)) break; } if (source == NUMA_NO_NODE) break; node_clear(source, tmp); err = migrate_to_node(mm, source, dest, flags); if (err > 0) nr_failed += err; if (err < 0) break; } lru_cache_enable(); if (err < 0) return err; return (nr_failed < INT_MAX) ? nr_failed : INT_MAX; } /* * Allocate a new folio for page migration, according to NUMA mempolicy. 
*/ static struct folio *alloc_migration_target_by_mpol(struct folio *src, unsigned long private) { struct migration_mpol *mmpol = (struct migration_mpol *)private; struct mempolicy *pol = mmpol->pol; pgoff_t ilx = mmpol->ilx; unsigned int order; int nid = numa_node_id(); gfp_t gfp; order = folio_order(src); ilx += src->index >> order; if (folio_test_hugetlb(src)) { nodemask_t *nodemask; struct hstate *h; h = folio_hstate(src); gfp = htlb_alloc_mask(h); nodemask = policy_nodemask(gfp, pol, ilx, &nid); return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp, htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND)); } if (folio_test_large(src)) gfp = GFP_TRANSHUGE; else gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP; return folio_alloc_mpol(gfp, order, pol, ilx, nid); } #else static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { return false; } int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { return -ENOSYS; } static struct folio *alloc_migration_target_by_mpol(struct folio *src, unsigned long private) { return NULL; } #endif static long do_mbind(unsigned long start, unsigned long len, unsigned short mode, unsigned short mode_flags, nodemask_t *nmask, unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct vma_iterator vmi; struct migration_mpol mmpol; struct mempolicy *new; unsigned long end; long err; long nr_failed; LIST_HEAD(pagelist); if (flags & ~(unsigned long)MPOL_MF_VALID) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; if (start & ~PAGE_MASK) return -EINVAL; if (mode == MPOL_DEFAULT) flags &= ~MPOL_MF_STRICT; len = PAGE_ALIGN(len); end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; new = mpol_new(mode, mode_flags, nmask); if (IS_ERR(new)) return PTR_ERR(new); /* * If we are using the default policy then operation * on discontinuous address spaces is okay after all */ if (!new) flags |= MPOL_MF_DISCONTIG_OK; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) lru_cache_disable(); { NODEMASK_SCRATCH(scratch); if (scratch) { mmap_write_lock(mm); err = mpol_set_nodemask(new, nmask, scratch); if (err) mmap_write_unlock(mm); } else err = -ENOMEM; NODEMASK_SCRATCH_FREE(scratch); } if (err) goto mpol_out; /* * Lock the VMAs before scanning for pages to migrate, * to ensure we don't miss a concurrently inserted page. */ nr_failed = queue_pages_range(mm, start, end, nmask, flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist); if (nr_failed < 0) { err = nr_failed; nr_failed = 0; } else { vma_iter_init(&vmi, mm, start); prev = vma_prev(&vmi); for_each_vma_range(vmi, vma, end) { err = mbind_range(&vmi, vma, &prev, start, end, new); if (err) break; } } if (!err && !list_empty(&pagelist)) { /* Convert MPOL_DEFAULT's NULL to task or default policy */ if (!new) { new = get_task_policy(current); mpol_get(new); } mmpol.pol = new; mmpol.ilx = 0; /* * In the interleaved case, attempt to allocate on exactly the * targeted nodes, for the first VMA to be migrated; for later * VMAs, the nodes will still be interleaved from the targeted * nodemask, but one by one may be selected differently. 
*/ if (new->mode == MPOL_INTERLEAVE || new->mode == MPOL_WEIGHTED_INTERLEAVE) { struct folio *folio; unsigned int order; unsigned long addr = -EFAULT; list_for_each_entry(folio, &pagelist, lru) { if (!folio_test_ksm(folio)) break; } if (!list_entry_is_head(folio, &pagelist, lru)) { vma_iter_init(&vmi, mm, start); for_each_vma_range(vmi, vma, end) { addr = page_address_in_vma(folio, folio_page(folio, 0), vma); if (addr != -EFAULT) break; } } if (addr != -EFAULT) { order = folio_order(folio); /* We already know the pol, but not the ilx */ mpol_cond_put(get_vma_policy(vma, addr, order, &mmpol.ilx)); /* Set base from which to increment by index */ mmpol.ilx -= folio->index >> order; } } } mmap_write_unlock(mm); if (!err && !list_empty(&pagelist)) { nr_failed |= migrate_pages(&pagelist, alloc_migration_target_by_mpol, NULL, (unsigned long)&mmpol, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); } if (nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); mpol_out: mpol_put(new); if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) lru_cache_enable(); return err; } /* * User space interface with variable sized bitmaps for nodelists. */ static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask, unsigned long maxnode) { unsigned long nlongs = BITS_TO_LONGS(maxnode); int ret; if (in_compat_syscall()) ret = compat_get_bitmap(mask, (const compat_ulong_t __user *)nmask, maxnode); else ret = copy_from_user(mask, nmask, nlongs * sizeof(unsigned long)); if (ret) return -EFAULT; if (maxnode % BITS_PER_LONG) mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1; return 0; } /* Copy a node mask from user space. */ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long maxnode) { --maxnode; nodes_clear(*nodes); if (maxnode == 0 || !nmask) return 0; if (maxnode > PAGE_SIZE*BITS_PER_BYTE) return -EINVAL; /* * When the user specified more nodes than supported just check * if the non supported part is all zero, one word at a time, * starting at the end. */ while (maxnode > MAX_NUMNODES) { unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG); unsigned long t; if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits)) return -EFAULT; if (maxnode - bits >= MAX_NUMNODES) { maxnode -= bits; } else { maxnode = MAX_NUMNODES; t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); } if (t) return -EINVAL; } return get_bitmap(nodes_addr(*nodes), nmask, maxnode); } /* Copy a kernel node mask to user space */ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, nodemask_t *nodes) { unsigned long copy = ALIGN(maxnode-1, 64) / 8; unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long); bool compat = in_compat_syscall(); if (compat) nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t); if (copy > nbytes) { if (copy > PAGE_SIZE) return -EINVAL; if (clear_user((char __user *)mask + nbytes, copy - nbytes)) return -EFAULT; copy = nbytes; maxnode = nr_node_ids; } if (compat) return compat_put_bitmap((compat_ulong_t __user *)mask, nodes_addr(*nodes), maxnode); return copy_to_user(mask, nodes_addr(*nodes), copy) ? 
-EFAULT : 0; } /* Basic parameter sanity check used by both mbind() and set_mempolicy() */ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) { *flags = *mode & MPOL_MODE_FLAGS; *mode &= ~MPOL_MODE_FLAGS; if ((unsigned int)(*mode) >= MPOL_MAX) return -EINVAL; if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; if (*flags & MPOL_F_NUMA_BALANCING) { if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY) *flags |= (MPOL_F_MOF | MPOL_F_MORON); else return -EINVAL; } return 0; } static long kernel_mbind(unsigned long start, unsigned long len, unsigned long mode, const unsigned long __user *nmask, unsigned long maxnode, unsigned int flags) { unsigned short mode_flags; nodemask_t nodes; int lmode = mode; int err; start = untagged_addr(start); err = sanitize_mpol_flags(&lmode, &mode_flags); if (err) return err; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; return do_mbind(start, len, lmode, mode_flags, &nodes, flags); } SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len, unsigned long, home_node, unsigned long, flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct mempolicy *new, *old; unsigned long end; int err = -ENOENT; VMA_ITERATOR(vmi, mm, start); start = untagged_addr(start); if (start & ~PAGE_MASK) return -EINVAL; /* * flags is used for future extension if any. */ if (flags != 0) return -EINVAL; /* * Check home_node is online to avoid accessing uninitialized * NODE_DATA. */ if (home_node >= MAX_NUMNODES || !node_online(home_node)) return -EINVAL; len = PAGE_ALIGN(len); end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; mmap_write_lock(mm); prev = vma_prev(&vmi); for_each_vma_range(vmi, vma, end) { /* * If any vma in the range got policy other than MPOL_BIND * or MPOL_PREFERRED_MANY we return error. We don't reset * the home node for vmas we already updated before. 
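 *
 * Hypothetical user-space sequence (a sketch, not quoted from anywhere):
 * after mbind(addr, len, MPOL_BIND, &mask, maxnode, 0) installs a bind
 * policy on the range, set_mempolicy_home_node(addr, len, 2, 0) marks
 * node 2 as the node to try first within that bind nodemask.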
*/ old = vma_policy(vma); if (!old) { prev = vma; continue; } if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) { err = -EOPNOTSUPP; break; } new = mpol_dup(old); if (IS_ERR(new)) { err = PTR_ERR(new); break; } vma_start_write(vma); new->home_node = home_node; err = mbind_range(&vmi, vma, &prev, start, end, new); mpol_put(new); if (err) break; } mmap_write_unlock(mm); return err; } SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, unsigned long, mode, const unsigned long __user *, nmask, unsigned long, maxnode, unsigned int, flags) { return kernel_mbind(start, len, mode, nmask, maxnode, flags); } /* Set the process memory policy */ static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask, unsigned long maxnode) { unsigned short mode_flags; nodemask_t nodes; int lmode = mode; int err; err = sanitize_mpol_flags(&lmode, &mode_flags); if (err) return err; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; return do_set_mempolicy(lmode, mode_flags, &nodes); } SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, unsigned long, maxnode) { return kernel_set_mempolicy(mode, nmask, maxnode); } static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *old_nodes, const unsigned long __user *new_nodes) { struct mm_struct *mm = NULL; struct task_struct *task; nodemask_t task_nodes; int err; nodemask_t *old; nodemask_t *new; NODEMASK_SCRATCH(scratch); if (!scratch) return -ENOMEM; old = &scratch->mask1; new = &scratch->mask2; err = get_nodes(old, old_nodes, maxnode); if (err) goto out; err = get_nodes(new, new_nodes, maxnode); if (err) goto out; /* Find the mm_struct */ rcu_read_lock(); task = pid ? find_task_by_vpid(pid) : current; if (!task) { rcu_read_unlock(); err = -ESRCH; goto out; } get_task_struct(task); err = -EINVAL; /* * Check if this process has the right to modify the specified process. * Use the regular "ptrace_may_access()" checks. */ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { rcu_read_unlock(); err = -EPERM; goto out_put; } rcu_read_unlock(); task_nodes = cpuset_mems_allowed(task); /* Is the user allowed to access the target nodes? */ if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out_put; } task_nodes = cpuset_mems_allowed(current); if (!nodes_and(*new, *new, task_nodes)) goto out_put; err = security_task_movememory(task); if (err) goto out_put; mm = get_task_mm(task); put_task_struct(task); if (!mm) { err = -EINVAL; goto out; } err = do_migrate_pages(mm, old, new, capable(CAP_SYS_NICE) ? 
MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); mmput(mm); out: NODEMASK_SCRATCH_FREE(scratch); return err; out_put: put_task_struct(task); goto out; } SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, const unsigned long __user *, old_nodes, const unsigned long __user *, new_nodes) { return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes); } /* Retrieve NUMA policy */ static int kernel_get_mempolicy(int __user *policy, unsigned long __user *nmask, unsigned long maxnode, unsigned long addr, unsigned long flags) { int err; int pval; nodemask_t nodes; if (nmask != NULL && maxnode < nr_node_ids) return -EINVAL; addr = untagged_addr(addr); err = do_get_mempolicy(&pval, &nodes, addr, flags); if (err) return err; if (policy && put_user(pval, policy)) return -EFAULT; if (nmask) err = copy_nodes_to_user(nmask, maxnode, &nodes); return err; } SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, unsigned long __user *, nmask, unsigned long, maxnode, unsigned long, addr, unsigned long, flags) { return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags); } bool vma_migratable(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_IO | VM_PFNMAP)) return false; /* * DAX device mappings require predictable access latency, so avoid * incurring periodic faults. */ if (vma_is_dax(vma)) return false; if (is_vm_hugetlb_page(vma) && !hugepage_migration_supported(hstate_vma(vma))) return false; /* * Migration allocates pages in the highest zone. If we cannot * do so then migration (at least from node to node) is not * possible. */ if (vma->vm_file && gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping)) < policy_zone) return false; return true; } struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx) { *ilx = 0; return (vma->vm_ops && vma->vm_ops->get_policy) ? vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy; } /* * get_vma_policy(@vma, @addr, @order, @ilx) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup * @order: 0, or appropriate huge_page_order for interleaving * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or * MPOL_WEIGHTED_INTERLEAVE * * Returns effective policy for a VMA at specified address. * Falls back to current->mempolicy or system default policy, as necessary. * Shared policies [those marked as MPOL_F_SHARED] require an extra reference * count--added by the get_policy() vm_op, as appropriate--to protect against * freeing by another task. It is the caller's responsibility to free the * extra reference for shared policies. 
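 *
 * Interleave index example (illustrative numbers): for a VMA with
 * vm_pgoff == 0 and @order == 0, an address two pages past vm_start
 * yields *ilx == 2 under an interleave policy, i.e. the page's index
 * within the mapping, which interleave_nid() then maps to a node.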
*/ struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr, int order, pgoff_t *ilx) { struct mempolicy *pol; pol = __get_vma_policy(vma, addr, ilx); if (!pol) pol = get_task_policy(current); if (pol->mode == MPOL_INTERLEAVE || pol->mode == MPOL_WEIGHTED_INTERLEAVE) { *ilx += vma->vm_pgoff >> order; *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order); } return pol; } bool vma_policy_mof(struct vm_area_struct *vma) { struct mempolicy *pol; if (vma->vm_ops && vma->vm_ops->get_policy) { bool ret = false; pgoff_t ilx; /* ignored here */ pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx); if (pol && (pol->flags & MPOL_F_MOF)) ret = true; mpol_cond_put(pol); return ret; } pol = vma->vm_policy; if (!pol) pol = get_task_policy(current); return pol->flags & MPOL_F_MOF; } bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) { enum zone_type dynamic_policy_zone = policy_zone; BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); /* * if policy->nodes has movable memory only, * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. * * policy->nodes is intersect with node_states[N_MEMORY]. * so if the following test fails, it implies * policy->nodes has movable memory only. */ if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY])) dynamic_policy_zone = ZONE_MOVABLE; return zone >= dynamic_policy_zone; } static unsigned int weighted_interleave_nodes(struct mempolicy *policy) { unsigned int node; unsigned int cpuset_mems_cookie; retry: /* to prevent miscount use tsk->mems_allowed_seq to detect rebind */ cpuset_mems_cookie = read_mems_allowed_begin(); node = current->il_prev; if (!current->il_weight || !node_isset(node, policy->nodes)) { node = next_node_in(node, policy->nodes); if (read_mems_allowed_retry(cpuset_mems_cookie)) goto retry; if (node == MAX_NUMNODES) return node; current->il_prev = node; current->il_weight = get_il_weight(node); } current->il_weight--; return node; } /* Do dynamic interleaving for a process */ static unsigned int interleave_nodes(struct mempolicy *policy) { unsigned int nid; unsigned int cpuset_mems_cookie; /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */ do { cpuset_mems_cookie = read_mems_allowed_begin(); nid = next_node_in(current->il_prev, policy->nodes); } while (read_mems_allowed_retry(cpuset_mems_cookie)); if (nid < MAX_NUMNODES) current->il_prev = nid; return nid; } /* * Depending on the memory policy provide a node from which to allocate the * next slab entry. */ unsigned int mempolicy_slab_node(void) { struct mempolicy *policy; int node = numa_mem_id(); if (!in_task()) return node; policy = current->mempolicy; if (!policy) return node; switch (policy->mode) { case MPOL_PREFERRED: return first_node(policy->nodes); case MPOL_INTERLEAVE: return interleave_nodes(policy); case MPOL_WEIGHTED_INTERLEAVE: return weighted_interleave_nodes(policy); case MPOL_BIND: case MPOL_PREFERRED_MANY: { struct zoneref *z; /* * Follow bind policy behavior and start allocation at the * first node. */ struct zonelist *zonelist; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, highest_zoneidx, &policy->nodes); return zonelist_zone(z) ? zonelist_node_idx(z) : node; } case MPOL_LOCAL: return node; default: BUG(); } } static unsigned int read_once_policy_nodemask(struct mempolicy *pol, nodemask_t *mask) { /* * barrier stabilizes the nodemask locally so that it can be iterated * over safely without concern for changes. 
Allocators validate node * selection does not violate mems_allowed, so this is safe. */ barrier(); memcpy(mask, &pol->nodes, sizeof(nodemask_t)); barrier(); return nodes_weight(*mask); } static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx) { struct weighted_interleave_state *state; nodemask_t nodemask; unsigned int target, nr_nodes; u8 *table = NULL; unsigned int weight_total = 0; u8 weight; int nid = 0; nr_nodes = read_once_policy_nodemask(pol, &nodemask); if (!nr_nodes) return numa_node_id(); rcu_read_lock(); state = rcu_dereference(wi_state); /* Uninitialized wi_state means we should assume all weights are 1 */ if (state) table = state->iw_table; /* calculate the total weight */ for_each_node_mask(nid, nodemask) weight_total += table ? table[nid] : 1; /* Calculate the node offset based on totals */ target = ilx % weight_total; nid = first_node(nodemask); while (target) { /* detect system default usage */ weight = table ? table[nid] : 1; if (target < weight) break; target -= weight; nid = next_node_in(nid, nodemask); } rcu_read_unlock(); return nid; } /* * Do static interleaving for interleave index @ilx. Returns the ilx'th * node in pol->nodes (starting from ilx=0), wrapping around if ilx * exceeds the number of present nodes. */ static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx) { nodemask_t nodemask; unsigned int target, nnodes; int i; int nid; nnodes = read_once_policy_nodemask(pol, &nodemask); if (!nnodes) return numa_node_id(); target = ilx % nnodes; nid = first_node(nodemask); for (i = 0; i < target; i++) nid = next_node(nid, nodemask); return nid; } /* * Return a nodemask representing a mempolicy for filtering nodes for * page allocation, together with preferred node id (or the input node id). */ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, pgoff_t ilx, int *nid) { nodemask_t *nodemask = NULL; switch (pol->mode) { case MPOL_PREFERRED: /* Override input node id */ *nid = first_node(pol->nodes); break; case MPOL_PREFERRED_MANY: nodemask = &pol->nodes; if (pol->home_node != NUMA_NO_NODE) *nid = pol->home_node; break; case MPOL_BIND: /* Restrict to nodemask (but not on lower zones) */ if (apply_policy_zone(pol, gfp_zone(gfp)) && cpuset_nodemask_valid_mems_allowed(&pol->nodes)) nodemask = &pol->nodes; if (pol->home_node != NUMA_NO_NODE) *nid = pol->home_node; /* * __GFP_THISNODE shouldn't even be used with the bind policy * because we might easily break the expectation to stay on the * requested node and not break the policy. */ WARN_ON_ONCE(gfp & __GFP_THISNODE); break; case MPOL_INTERLEAVE: /* Override input node id */ *nid = (ilx == NO_INTERLEAVE_INDEX) ? interleave_nodes(pol) : interleave_nid(pol, ilx); break; case MPOL_WEIGHTED_INTERLEAVE: *nid = (ilx == NO_INTERLEAVE_INDEX) ? weighted_interleave_nodes(pol) : weighted_interleave_nid(pol, ilx); break; } return nodemask; } #ifdef CONFIG_HUGETLBFS /* * huge_node(@vma, @addr, @gfp_flags, @mpol) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup and interleave policy * @gfp_flags: for requested zone * @mpol: pointer to mempolicy pointer for reference counted mempolicy * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy * * Returns a nid suitable for a huge page allocation and a pointer * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'bind' or 'prefer-many', returns a pointer * to the mempolicy's @nodemask for filtering the zonelist. 
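 *
 * Typical calling pattern (a sketch for illustration only):
 *
 *	nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
 *	... allocate a huge folio preferring @nid, filtered by @nodemask ...
 *	mpol_cond_put(mpol);
 *
 * i.e. the conditional reference returned in @mpol must be dropped once
 * the allocation attempt is finished.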
*/ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { pgoff_t ilx; int nid; nid = numa_node_id(); *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx); *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid); return nid; } /* * init_nodemask_of_mempolicy * * If the current task's mempolicy is "default" [NULL], return 'false' * to indicate default policy. Otherwise, extract the policy nodemask * for 'bind' or 'interleave' policy into the argument nodemask, or * initialize the argument nodemask to contain the single node for * 'preferred' or 'local' policy and return 'true' to indicate presence * of non-default mempolicy. * * We don't bother with reference counting the mempolicy [mpol_get/put] * because the current task is examining it's own mempolicy and a task's * mempolicy is only ever changed by the task itself. * * N.B., it is the caller's responsibility to free a returned nodemask. */ bool init_nodemask_of_mempolicy(nodemask_t *mask) { struct mempolicy *mempolicy; if (!(mask && current->mempolicy)) return false; task_lock(current); mempolicy = current->mempolicy; switch (mempolicy->mode) { case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_WEIGHTED_INTERLEAVE: *mask = mempolicy->nodes; break; case MPOL_LOCAL: init_nodemask_of_node(mask, numa_node_id()); break; default: BUG(); } task_unlock(current); return true; } #endif /* * mempolicy_in_oom_domain * * If tsk's mempolicy is "bind", check for intersection between mask and * the policy nodemask. Otherwise, return true for all other policies * including "interleave", as a tsk with "interleave" policy may have * memory allocated from all nodes in system. * * Takes task_lock(tsk) to prevent freeing of its mempolicy. */ bool mempolicy_in_oom_domain(struct task_struct *tsk, const nodemask_t *mask) { struct mempolicy *mempolicy; bool ret = true; if (!mask) return ret; task_lock(tsk); mempolicy = tsk->mempolicy; if (mempolicy && mempolicy->mode == MPOL_BIND) ret = nodes_intersects(mempolicy->nodes, *mask); task_unlock(tsk); return ret; } static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, int nid, nodemask_t *nodemask) { struct page *page; gfp_t preferred_gfp; /* * This is a two pass approach. The first pass will only try the * preferred nodes but skip the direct reclaim and allow the * allocation to fail, while the second pass will try all the * nodes in system. */ preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask); if (!page) page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL); return page; } /** * alloc_pages_mpol - Allocate pages according to NUMA mempolicy. * @gfp: GFP flags. * @order: Order of the page allocation. * @pol: Pointer to the NUMA mempolicy. * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()). * @nid: Preferred node (usually numa_node_id() but @mpol may override it). * * Return: The page on success or NULL if allocation fails. 
*/ static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, struct mempolicy *pol, pgoff_t ilx, int nid) { nodemask_t *nodemask; struct page *page; nodemask = policy_nodemask(gfp, pol, ilx, &nid); if (pol->mode == MPOL_PREFERRED_MANY) return alloc_pages_preferred_many(gfp, order, nid, nodemask); if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && /* filter "hugepage" allocation, unless from alloc_pages() */ is_pmd_order(order) && ilx != NO_INTERLEAVE_INDEX) { /* * For hugepage allocation and non-interleave policy which * allows the current node (or other explicitly preferred * node) we only try to allocate from the current/preferred * node and don't fall back to other nodes, as the cost of * remote accesses would likely offset THP benefits. * * If the policy is interleave or does not allow the current * node in its nodemask, we allocate the standard way. */ if (pol->mode != MPOL_INTERLEAVE && pol->mode != MPOL_WEIGHTED_INTERLEAVE && (!nodemask || node_isset(nid, *nodemask))) { /* * First, try to allocate THP only on local node, but * don't reclaim unnecessarily, just compact. */ page = __alloc_frozen_pages_noprof( gfp | __GFP_THISNODE | __GFP_NORETRY, order, nid, NULL); if (page || !(gfp & __GFP_DIRECT_RECLAIM)) return page; /* * If hugepage allocations are configured to always * synchronous compact or the vma has been madvised * to prefer hugepage backing, retry allowing remote * memory with both reclaim and compact as well. */ } } page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask); if (unlikely(pol->mode == MPOL_INTERLEAVE || pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) { /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */ if (static_branch_likely(&vm_numa_stat_key) && page_to_nid(page) == nid) { preempt_disable(); __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); preempt_enable(); } } return page; } struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *pol, pgoff_t ilx, int nid) { struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol, ilx, nid); if (!page) return NULL; set_page_refcounted(page); return page_rmappable_folio(page); } /** * vma_alloc_folio - Allocate a folio for a VMA. * @gfp: GFP flags. * @order: Order of the folio. * @vma: Pointer to VMA. * @addr: Virtual address of the allocation. Must be inside @vma. * * Allocate a folio for a specific address in @vma, using the appropriate * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the * VMA to prevent it from going away. Should be used for all allocations * for folios that will be mapped into user space, excepting hugetlbfs, and * excepting where direct use of folio_alloc_mpol() is more appropriate. * * Return: The folio on success or NULL if allocation fails. 
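 *
 * Hypothetical usage in a fault path (a sketch, not a quote of any
 * real caller):
 *
 *	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address);
 *	if (!folio)
 *		return VM_FAULT_OOM;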
*/ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol; pgoff_t ilx; struct folio *folio; if (vma->vm_flags & VM_DROPPABLE) gfp |= __GFP_NOWARN; pol = get_vma_policy(vma, addr, order, &ilx); folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id()); mpol_cond_put(pol); return folio; } EXPORT_SYMBOL(vma_alloc_folio_noprof); struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order) { struct mempolicy *pol = &default_policy; /* * No reference counting needed for current->mempolicy * nor system default_policy */ if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX, numa_node_id()); } /** * alloc_pages - Allocate pages. * @gfp: GFP flags. * @order: Power of two of number of pages to allocate. * * Allocate 1 << @order contiguous pages. The physical address of the * first page is naturally aligned (eg an order-3 allocation will be aligned * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current * process is honoured when in process context. * * Context: Can be called from any context, providing the appropriate GFP * flags are used. * Return: The page on success or NULL if allocation fails. */ struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order) { struct page *page = alloc_frozen_pages_noprof(gfp, order); if (page) set_page_refcounted(page); return page; } EXPORT_SYMBOL(alloc_pages_noprof); struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order)); } EXPORT_SYMBOL(folio_alloc_noprof); static unsigned long alloc_pages_bulk_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { int nodes; unsigned long nr_pages_per_node; int delta; int i; unsigned long nr_allocated; unsigned long total_allocated = 0; nodes = nodes_weight(pol->nodes); nr_pages_per_node = nr_pages / nodes; delta = nr_pages - nodes * nr_pages_per_node; for (i = 0; i < nodes; i++) { if (delta) { nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, nr_pages_per_node + 1, page_array); delta--; } else { nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, nr_pages_per_node, page_array); } page_array += nr_allocated; total_allocated += nr_allocated; } return total_allocated; } static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { struct weighted_interleave_state *state; struct task_struct *me = current; unsigned int cpuset_mems_cookie; unsigned long total_allocated = 0; unsigned long nr_allocated = 0; unsigned long rounds; unsigned long node_pages, delta; u8 *weights, weight; unsigned int weight_total = 0; unsigned long rem_pages = nr_pages; nodemask_t nodes; int nnodes, node; int resume_node = MAX_NUMNODES - 1; u8 resume_weight = 0; int prev_node; int i; if (!nr_pages) return 0; /* read the nodes onto the stack, retry if done during rebind */ do { cpuset_mems_cookie = read_mems_allowed_begin(); nnodes = read_once_policy_nodemask(pol, &nodes); } while (read_mems_allowed_retry(cpuset_mems_cookie)); /* if the nodemask has become invalid, we cannot do anything */ if (!nnodes) return 0; /* Continue allocating from most recent node and adjust the nr_pages */ node = me->il_prev; weight = me->il_weight; if (weight && node_isset(node, nodes)) { node_pages = min(rem_pages, weight); 
nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, page_array); page_array += nr_allocated; total_allocated += nr_allocated; /* if that's all the pages, no need to interleave */ if (rem_pages <= weight) { me->il_weight -= rem_pages; return total_allocated; } /* Otherwise we adjust remaining pages, continue from there */ rem_pages -= weight; } /* clear active weight in case of an allocation failure */ me->il_weight = 0; prev_node = node; /* create a local copy of node weights to operate on outside rcu */ weights = kzalloc(nr_node_ids, GFP_KERNEL); if (!weights) return total_allocated; rcu_read_lock(); state = rcu_dereference(wi_state); if (state) { memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8)); rcu_read_unlock(); } else { rcu_read_unlock(); for (i = 0; i < nr_node_ids; i++) weights[i] = 1; } /* calculate total, detect system default usage */ for_each_node_mask(node, nodes) weight_total += weights[node]; /* * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls. * Track which node weighted interleave should resume from. * * if (rounds > 0) and (delta == 0), resume_node will always be * the node following prev_node and its weight. */ rounds = rem_pages / weight_total; delta = rem_pages % weight_total; resume_node = next_node_in(prev_node, nodes); resume_weight = weights[resume_node]; for (i = 0; i < nnodes; i++) { node = next_node_in(prev_node, nodes); weight = weights[node]; node_pages = weight * rounds; /* If a delta exists, add this node's portion of the delta */ if (delta > weight) { node_pages += weight; delta -= weight; } else if (delta) { /* when delta is depleted, resume from that node */ node_pages += delta; resume_node = node; resume_weight = weight - delta; delta = 0; } /* node_pages can be 0 if an allocation fails and rounds == 0 */ if (!node_pages) break; nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, page_array); page_array += nr_allocated; total_allocated += nr_allocated; if (total_allocated == nr_pages) break; prev_node = node; } me->il_prev = resume_node; me->il_weight = resume_weight; kfree(weights); return total_allocated; } static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { gfp_t preferred_gfp; unsigned long nr_allocated = 0; preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes, nr_pages, page_array); if (nr_allocated < nr_pages) nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL, nr_pages - nr_allocated, page_array + nr_allocated); return nr_allocated; } /* alloc pages bulk and mempolicy should be considered at the * same time in some situation such as vmalloc. * * It can accelerate memory allocation especially interleaving * allocate memory. 
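 *
 * Illustrative use (a sketch, assuming the usual non-_noprof wrapper
 * name for this function):
 *
 *	nr = alloc_pages_bulk_mempolicy(GFP_KERNEL, nr_pages, pages);
 *
 * where any remaining nr_pages - nr pages are then allocated one by one
 * as a fallback, since bulk allocation may return fewer pages than asked.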
*/ unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array) { struct mempolicy *pol = &default_policy; nodemask_t *nodemask; int nid; if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); if (pol->mode == MPOL_INTERLEAVE) return alloc_pages_bulk_interleave(gfp, pol, nr_pages, page_array); if (pol->mode == MPOL_WEIGHTED_INTERLEAVE) return alloc_pages_bulk_weighted_interleave( gfp, pol, nr_pages, page_array); if (pol->mode == MPOL_PREFERRED_MANY) return alloc_pages_bulk_preferred_many(gfp, numa_node_id(), pol, nr_pages, page_array); nid = numa_node_id(); nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); return alloc_pages_bulk_noprof(gfp, nid, nodemask, nr_pages, page_array); } int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { struct mempolicy *pol = mpol_dup(src->vm_policy); if (IS_ERR(pol)) return PTR_ERR(pol); dst->vm_policy = pol; return 0; } /* * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it * rebinds the mempolicy its copying by calling mpol_rebind_policy() * with the mems_allowed returned by cpuset_mems_allowed(). This * keeps mempolicies cpuset relative after its cpuset moves. See * further kernel/cpuset.c update_nodemask(). * * current's mempolicy may be rebinded by the other task(the task that changes * cpuset's mems), so we needn't do rebind work for current task. */ /* Slow path of a mempolicy duplicate */ struct mempolicy *__mpol_dup(struct mempolicy *old) { struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!new) return ERR_PTR(-ENOMEM); /* task's mempolicy is protected by alloc_lock */ if (old == current->mempolicy) { task_lock(current); *new = *old; task_unlock(current); } else *new = *old; if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); mpol_rebind_policy(new, &mems); } atomic_set(&new->refcnt, 1); return new; } /* Slow path of a mempolicy comparison */ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (!a || !b) return false; if (a->mode != b->mode) return false; if (a->flags != b->flags) return false; if (a->home_node != b->home_node) return false; if (mpol_store_user_nodemask(a)) if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) return false; switch (a->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_WEIGHTED_INTERLEAVE: return !!nodes_equal(a->nodes, b->nodes); case MPOL_LOCAL: return true; default: BUG(); return false; } } /* * Shared memory backing store policy support. * * Remember policies even when nobody has shared memory mapped. * The policies are kept in Red-Black tree linked from the inode. * They are protected by the sp->lock rwlock, which should be held * for any accesses to the tree. */ /* * lookup first element intersecting start-end. Caller holds sp->lock for * reading or for writing */ static struct sp_node *sp_lookup(struct shared_policy *sp, pgoff_t start, pgoff_t end) { struct rb_node *n = sp->root.rb_node; while (n) { struct sp_node *p = rb_entry(n, struct sp_node, nd); if (start >= p->end) n = n->rb_right; else if (end <= p->start) n = n->rb_left; else break; } if (!n) return NULL; for (;;) { struct sp_node *w = NULL; struct rb_node *prev = rb_prev(n); if (!prev) break; w = rb_entry(prev, struct sp_node, nd); if (w->end <= start) break; n = prev; } return rb_entry(n, struct sp_node, nd); } /* * Insert a new shared policy into the list. Caller holds sp->lock for * writing. 
*/ static void sp_insert(struct shared_policy *sp, struct sp_node *new) { struct rb_node **p = &sp->root.rb_node; struct rb_node *parent = NULL; struct sp_node *nd; while (*p) { parent = *p; nd = rb_entry(parent, struct sp_node, nd); if (new->start < nd->start) p = &(*p)->rb_left; else if (new->end > nd->end) p = &(*p)->rb_right; else BUG(); } rb_link_node(&new->nd, parent, p); rb_insert_color(&new->nd, &sp->root); } /* Find shared policy intersecting idx */ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx) { struct mempolicy *pol = NULL; struct sp_node *sn; if (!sp->root.rb_node) return NULL; read_lock(&sp->lock); sn = sp_lookup(sp, idx, idx+1); if (sn) { mpol_get(sn->policy); pol = sn->policy; } read_unlock(&sp->lock); return pol; } EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_lookup, "kvm"); static void sp_free(struct sp_node *n) { mpol_put(n->policy); kmem_cache_free(sn_cache, n); } /** * mpol_misplaced - check whether current folio node is valid in policy * * @folio: folio to be checked * @vmf: structure describing the fault * @addr: virtual address in @vma for shared policy lookup and interleave policy * * Lookup current policy node id for vma,addr and "compare to" folio's * node id. Policy determination "mimics" alloc_page_vma(). * Called from fault path where we know the vma and faulting address. * * Return: NUMA_NO_NODE if the page is in a node that is valid for this * policy, or a suitable node ID to allocate a replacement folio from. */ int mpol_misplaced(struct folio *folio, struct vm_fault *vmf, unsigned long addr) { struct mempolicy *pol; pgoff_t ilx; struct zoneref *z; int curnid = folio_nid(folio); struct vm_area_struct *vma = vmf->vma; int thiscpu = raw_smp_processor_id(); int thisnid = numa_node_id(); int polnid = NUMA_NO_NODE; int ret = NUMA_NO_NODE; /* * Make sure ptl is held so that we don't preempt and we * have a stable smp processor id */ lockdep_assert_held(vmf->ptl); pol = get_vma_policy(vma, addr, folio_order(folio), &ilx); if (!(pol->flags & MPOL_F_MOF)) goto out; switch (pol->mode) { case MPOL_INTERLEAVE: polnid = interleave_nid(pol, ilx); break; case MPOL_WEIGHTED_INTERLEAVE: polnid = weighted_interleave_nid(pol, ilx); break; case MPOL_PREFERRED: if (node_isset(curnid, pol->nodes)) goto out; polnid = first_node(pol->nodes); break; case MPOL_LOCAL: polnid = numa_node_id(); break; case MPOL_BIND: case MPOL_PREFERRED_MANY: /* * Even though MPOL_PREFERRED_MANY can allocate pages outside * policy nodemask we don't allow numa migration to nodes * outside policy nodemask for now. This is done so that if we * want demotion to slow memory to happen, before allocating * from some DRAM node say 'x', we will end up using a * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario * we should not promote to node 'x' from slow memory node. */ if (pol->flags & MPOL_F_MORON) { /* * Optimize placement among multiple nodes * via NUMA balancing */ if (node_isset(thisnid, pol->nodes)) break; goto out; } /* * use current page if in policy nodemask, * else select nearest allowed node, if any. * If no allowed nodes, use current [!misplaced]. 
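 *
 * Worked example (illustrative): with pol->nodes == {0,1} and the folio
 * currently on node 2, the folio is reported misplaced and the nearest
 * allowed node in the zonelist of the faulting CPU's node is returned as
 * the target; a folio already on node 0 or 1 is left where it is.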
*/ if (node_isset(curnid, pol->nodes)) goto out; z = first_zones_zonelist( node_zonelist(thisnid, GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), &pol->nodes); polnid = zonelist_node_idx(z); break; default: BUG(); } /* Migrate the folio towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { polnid = thisnid; if (!should_numa_migrate_memory(current, folio, curnid, thiscpu)) goto out; } if (curnid != polnid) ret = polnid; out: mpol_cond_put(pol); return ret; } /* * Drop the (possibly final) reference to task->mempolicy. It needs to be * dropped after task->mempolicy is set to NULL so that any allocation done as * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed * policy. */ void mpol_put_task_policy(struct task_struct *task) { struct mempolicy *pol; task_lock(task); pol = task->mempolicy; task->mempolicy = NULL; task_unlock(task); mpol_put(pol); } static void sp_delete(struct shared_policy *sp, struct sp_node *n) { rb_erase(&n->nd, &sp->root); sp_free(n); } static void sp_node_init(struct sp_node *node, unsigned long start, unsigned long end, struct mempolicy *pol) { node->start = start; node->end = end; node->policy = pol; } static struct sp_node *sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) { struct sp_node *n; struct mempolicy *newpol; n = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n) return NULL; newpol = mpol_dup(pol); if (IS_ERR(newpol)) { kmem_cache_free(sn_cache, n); return NULL; } newpol->flags |= MPOL_F_SHARED; sp_node_init(n, start, end, newpol); return n; } /* Replace a policy range. */ static int shared_policy_replace(struct shared_policy *sp, pgoff_t start, pgoff_t end, struct sp_node *new) { struct sp_node *n; struct sp_node *n_new = NULL; struct mempolicy *mpol_new = NULL; int ret = 0; restart: write_lock(&sp->lock); n = sp_lookup(sp, start, end); /* Take care of old policies in the same range. */ while (n && n->start < end) { struct rb_node *next = rb_next(&n->nd); if (n->start >= start) { if (n->end <= end) sp_delete(sp, n); else n->start = end; } else { /* Old policy spanning whole new range. */ if (n->end > end) { if (!n_new) goto alloc_new; *mpol_new = *n->policy; atomic_set(&mpol_new->refcnt, 1); sp_node_init(n_new, end, n->end, mpol_new); n->end = start; sp_insert(sp, n_new); n_new = NULL; mpol_new = NULL; break; } else n->end = start; } if (!next) break; n = rb_entry(next, struct sp_node, nd); } if (new) sp_insert(sp, new); write_unlock(&sp->lock); ret = 0; err_out: if (mpol_new) mpol_put(mpol_new); if (n_new) kmem_cache_free(sn_cache, n_new); return ret; alloc_new: write_unlock(&sp->lock); ret = -ENOMEM; n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n_new) goto err_out; mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!mpol_new) goto err_out; atomic_set(&mpol_new->refcnt, 1); goto restart; } /** * mpol_shared_policy_init - initialize shared policy for inode * @sp: pointer to inode shared policy * @mpol: struct mempolicy to install * * Install non-NULL @mpol in inode's shared policy rb-tree. * On entry, the current task has a reference on a non-NULL @mpol. * This must be released on exit. * This is called at get_inode() calls and we can use GFP_KERNEL. 
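 *
 * Example (illustrative, not quoted from tmpfs): a tmpfs mount with
 * "mpol=interleave:0-3" passes that mount policy in here for each new
 * inode, seeding the file's shared policy tree with an interleave
 * policy covering the whole file.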
*/ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { int ret; sp->root = RB_ROOT; /* empty tree == default mempolicy */ rwlock_init(&sp->lock); if (mpol) { struct sp_node *sn; struct mempolicy *npol; NODEMASK_SCRATCH(scratch); if (!scratch) goto put_mpol; /* contextualize the tmpfs mount point mempolicy to this file */ npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); if (IS_ERR(npol)) goto free_scratch; /* no valid nodemask intersection */ task_lock(current); ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch); task_unlock(current); if (ret) goto put_npol; /* alloc node covering entire file; adds ref to file's npol */ sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol); if (sn) sp_insert(sp, sn); put_npol: mpol_put(npol); /* drop initial ref on file's npol */ free_scratch: NODEMASK_SCRATCH_FREE(scratch); put_mpol: mpol_put(mpol); /* drop our incoming ref on sb mpol */ } } EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_init, "kvm"); int mpol_set_shared_policy(struct shared_policy *sp, struct vm_area_struct *vma, struct mempolicy *pol) { int err; struct sp_node *new = NULL; unsigned long sz = vma_pages(vma); if (pol) { new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol); if (!new) return -ENOMEM; } err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new); if (err && new) sp_free(new); return err; } EXPORT_SYMBOL_FOR_MODULES(mpol_set_shared_policy, "kvm"); /* Free a backing policy store on inode delete. */ void mpol_free_shared_policy(struct shared_policy *sp) { struct sp_node *n; struct rb_node *next; if (!sp->root.rb_node) return; write_lock(&sp->lock); next = rb_first(&sp->root); while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); sp_delete(sp, n); } write_unlock(&sp->lock); } EXPORT_SYMBOL_FOR_MODULES(mpol_free_shared_policy, "kvm"); #ifdef CONFIG_NUMA_BALANCING static int __initdata numabalancing_override; static void __init check_numabalancing_enable(void) { bool numabalancing_default = false; if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) numabalancing_default = true; /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */ if (numabalancing_override) set_numabalancing_state(numabalancing_override == 1); if (num_online_nodes() > 1 && !numabalancing_override) { pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n", numabalancing_default ? "Enabling" : "Disabling"); set_numabalancing_state(numabalancing_default); } } static int __init setup_numabalancing(char *str) { int ret = 0; if (!str) goto out; if (!strcmp(str, "enable")) { numabalancing_override = 1; ret = 1; } else if (!strcmp(str, "disable")) { numabalancing_override = -1; ret = 1; } out: if (!ret) pr_warn("Unable to parse numa_balancing=\n"); return ret; } __setup("numa_balancing=", setup_numabalancing); #else static inline void __init check_numabalancing_enable(void) { } #endif /* CONFIG_NUMA_BALANCING */ void __init numa_policy_init(void) { nodemask_t interleave_nodes; unsigned long largest = 0; int nid, prefer = 0; policy_cache = kmem_cache_create("numa_policy", sizeof(struct mempolicy), 0, SLAB_PANIC, NULL); sn_cache = kmem_cache_create("shared_policy_node", sizeof(struct sp_node), 0, SLAB_PANIC, NULL); for_each_node(nid) { preferred_node_policy[nid] = (struct mempolicy) { .refcnt = ATOMIC_INIT(1), .mode = MPOL_PREFERRED, .flags = MPOL_F_MOF | MPOL_F_MORON, .nodes = nodemask_of_node(nid), }; } /* * Set interleaving policy for system init. 
Interleaving is only * enabled across suitably sized nodes (default is >= 16MB), or * fall back to the largest node if they're all smaller. */ nodes_clear(interleave_nodes); for_each_node_state(nid, N_MEMORY) { unsigned long total_pages = node_present_pages(nid); /* Preserve the largest node */ if (largest < total_pages) { largest = total_pages; prefer = nid; } /* Interleave this node? */ if ((total_pages << PAGE_SHIFT) >= (16 << 20)) node_set(nid, interleave_nodes); } /* All too small, use the largest */ if (unlikely(nodes_empty(interleave_nodes))) node_set(prefer, interleave_nodes); if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) pr_err("%s: interleaving failed\n", __func__); check_numabalancing_enable(); } /* Reset policy of current process to default */ void numa_default_policy(void) { do_set_mempolicy(MPOL_DEFAULT, 0, NULL); } /* * Parse and format mempolicy from/to strings */ static const char * const policy_modes[] = { [MPOL_DEFAULT] = "default", [MPOL_PREFERRED] = "prefer", [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave", [MPOL_LOCAL] = "local", [MPOL_PREFERRED_MANY] = "prefer (many)", }; #ifdef CONFIG_TMPFS /** * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. * @str: string containing mempolicy to parse * @mpol: pointer to struct mempolicy pointer, returned on success. * * Format of input: * <mode>[=<flags>][:<nodelist>] * * Return: %0 on success, else %1 */ int mpol_parse_str(char *str, struct mempolicy **mpol) { struct mempolicy *new = NULL; unsigned short mode_flags; nodemask_t nodes; char *nodelist = strchr(str, ':'); char *flags = strchr(str, '='); int err = 1, mode; if (flags) *flags++ = '\0'; /* terminate mode string */ if (nodelist) { /* NUL-terminate mode or flags string */ *nodelist++ = '\0'; if (nodelist_parse(nodelist, nodes)) goto out; if (!nodes_subset(nodes, node_states[N_MEMORY])) goto out; } else nodes_clear(nodes); mode = match_string(policy_modes, MPOL_MAX, str); if (mode < 0) goto out; switch (mode) { case MPOL_PREFERRED: /* * Insist on a nodelist of one node only, although later * we use first_node(nodes) to grab a single node, so here * nodelist (or nodes) cannot be empty. */ if (nodelist) { char *rest = nodelist; while (isdigit(*rest)) rest++; if (*rest) goto out; if (nodes_empty(nodes)) goto out; } break; case MPOL_INTERLEAVE: case MPOL_WEIGHTED_INTERLEAVE: /* * Default to online nodes with memory if no nodelist */ if (!nodelist) nodes = node_states[N_MEMORY]; break; case MPOL_LOCAL: /* * Don't allow a nodelist; mpol_new() checks flags */ if (nodelist) goto out; break; case MPOL_DEFAULT: /* * Insist on a empty nodelist */ if (!nodelist) err = 0; goto out; case MPOL_PREFERRED_MANY: case MPOL_BIND: /* * Insist on a nodelist */ if (!nodelist) goto out; } mode_flags = 0; if (flags) { /* * Currently, we only support two mutually exclusive * mode flags. */ if (!strcmp(flags, "static")) mode_flags |= MPOL_F_STATIC_NODES; else if (!strcmp(flags, "relative")) mode_flags |= MPOL_F_RELATIVE_NODES; else goto out; } new = mpol_new(mode, mode_flags, &nodes); if (IS_ERR(new)) goto out; /* * Save nodes for mpol_to_str() to show the tmpfs mount options * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. 
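 *
 * Example strings in this syntax (for illustration): "bind:0-3",
 * "interleave=static:0,2", "prefer:1" and "local" all parse; a weighted
 * interleave policy over nodes 0-1 with the static flag is shown by
 * mpol_to_str() as "weighted interleave=static:0-1".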
*/ if (mode != MPOL_PREFERRED) { new->nodes = nodes; } else if (nodelist) { nodes_clear(new->nodes); node_set(first_node(nodes), new->nodes); } else { new->mode = MPOL_LOCAL; } /* * Save nodes for contextualization: this will be used to "clone" * the mempolicy in a specific context [cpuset] at a later time. */ new->w.user_nodemask = nodes; err = 0; out: /* Restore string for error message */ if (nodelist) *--nodelist = ':'; if (flags) *--flags = '='; if (!err) *mpol = new; return err; } #endif /* CONFIG_TMPFS */ /** * mpol_to_str - format a mempolicy structure for printing * @buffer: to contain formatted mempolicy string * @maxlen: length of @buffer * @pol: pointer to mempolicy to be formatted * * Convert @pol into a string. If @buffer is too short, truncate the string. * Recommend a @maxlen of at least 51 for the longest mode, "weighted * interleave", plus the longest flag flags, "relative|balancing", and to * display at least a few node ids. */ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { char *p = buffer; nodemask_t nodes = NODE_MASK_NONE; unsigned short mode = MPOL_DEFAULT; unsigned short flags = 0; if (pol && pol != &default_policy && !(pol >= &preferred_node_policy[0] && pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) { mode = pol->mode; flags = pol->flags; } switch (mode) { case MPOL_DEFAULT: case MPOL_LOCAL: break; case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_WEIGHTED_INTERLEAVE: nodes = pol->nodes; break; default: WARN_ON_ONCE(1); snprintf(p, maxlen, "unknown"); return; } p += snprintf(p, maxlen, "%s", policy_modes[mode]); if (flags & MPOL_MODE_FLAGS) { p += snprintf(p, buffer + maxlen - p, "="); /* * Static and relative are mutually exclusive. */ if (flags & MPOL_F_STATIC_NODES) p += snprintf(p, buffer + maxlen - p, "static"); else if (flags & MPOL_F_RELATIVE_NODES) p += snprintf(p, buffer + maxlen - p, "relative"); if (flags & MPOL_F_NUMA_BALANCING) { if (!is_power_of_2(flags & MPOL_MODE_FLAGS)) p += snprintf(p, buffer + maxlen - p, "|"); p += snprintf(p, buffer + maxlen - p, "balancing"); } } if (!nodes_empty(nodes)) p += scnprintf(p, buffer + maxlen - p, ":%*pbl", nodemask_pr_args(&nodes)); } #ifdef CONFIG_SYSFS struct iw_node_attr { struct kobj_attribute kobj_attr; int nid; }; struct sysfs_wi_group { struct kobject wi_kobj; struct mutex kobj_lock; struct iw_node_attr *nattrs[]; }; static struct sysfs_wi_group *wi_group; static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct iw_node_attr *node_attr; u8 weight; node_attr = container_of(attr, struct iw_node_attr, kobj_attr); weight = get_il_weight(node_attr->nid); return sysfs_emit(buf, "%d\n", weight); } static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; struct iw_node_attr *node_attr; u8 weight = 0; int i; node_attr = container_of(attr, struct iw_node_attr, kobj_attr); if (count == 0 || sysfs_streq(buf, "") || kstrtou8(buf, 0, &weight) || weight == 0) return -EINVAL; new_wi_state = kzalloc_flex(*new_wi_state, iw_table, nr_node_ids); if (!new_wi_state) return -ENOMEM; mutex_lock(&wi_state_lock); old_wi_state = rcu_dereference_protected(wi_state, lockdep_is_held(&wi_state_lock)); if (old_wi_state) { memcpy(new_wi_state->iw_table, old_wi_state->iw_table, nr_node_ids * sizeof(u8)); } else { for (i = 0; i < nr_node_ids; i++) new_wi_state->iw_table[i] = 1; } 
new_wi_state->iw_table[node_attr->nid] = weight; new_wi_state->mode_auto = false; rcu_assign_pointer(wi_state, new_wi_state); mutex_unlock(&wi_state_lock); if (old_wi_state) { synchronize_rcu(); kfree(old_wi_state); } return count; } static ssize_t weighted_interleave_auto_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct weighted_interleave_state *state; bool wi_auto = true; rcu_read_lock(); state = rcu_dereference(wi_state); if (state) wi_auto = state->mode_auto; rcu_read_unlock(); return sysfs_emit(buf, "%s\n", str_true_false(wi_auto)); } static ssize_t weighted_interleave_auto_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; unsigned int *bw; bool input; int i; if (kstrtobool(buf, &input)) return -EINVAL; new_wi_state = kzalloc_flex(*new_wi_state, iw_table, nr_node_ids); if (!new_wi_state) return -ENOMEM; for (i = 0; i < nr_node_ids; i++) new_wi_state->iw_table[i] = 1; mutex_lock(&wi_state_lock); old_wi_state = rcu_dereference_protected(wi_state, lockdep_is_held(&wi_state_lock)); if (old_wi_state && input == old_wi_state->mode_auto) { mutex_unlock(&wi_state_lock); kfree(new_wi_state); return count; } if (!input) { if (old_wi_state) memcpy(new_wi_state->iw_table, old_wi_state->iw_table, nr_node_ids * sizeof(u8)); goto update_wi_state; } bw = node_bw_table; if (!bw) { mutex_unlock(&wi_state_lock); kfree(new_wi_state); return -ENODEV; } new_wi_state->mode_auto = true; reduce_interleave_weights(bw, new_wi_state->iw_table); update_wi_state: rcu_assign_pointer(wi_state, new_wi_state); mutex_unlock(&wi_state_lock); if (old_wi_state) { synchronize_rcu(); kfree(old_wi_state); } return count; } static void sysfs_wi_node_delete(int nid) { struct iw_node_attr *attr; if (nid < 0 || nid >= nr_node_ids) return; mutex_lock(&wi_group->kobj_lock); attr = wi_group->nattrs[nid]; if (!attr) { mutex_unlock(&wi_group->kobj_lock); return; } wi_group->nattrs[nid] = NULL; mutex_unlock(&wi_group->kobj_lock); sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr); kfree(attr->kobj_attr.attr.name); kfree(attr); } static void sysfs_wi_node_delete_all(void) { int nid; for (nid = 0; nid < nr_node_ids; nid++) sysfs_wi_node_delete(nid); } static void wi_state_free(void) { struct weighted_interleave_state *old_wi_state; mutex_lock(&wi_state_lock); old_wi_state = rcu_dereference_protected(wi_state, lockdep_is_held(&wi_state_lock)); rcu_assign_pointer(wi_state, NULL); mutex_unlock(&wi_state_lock); if (old_wi_state) { synchronize_rcu(); kfree(old_wi_state); } } static struct kobj_attribute wi_auto_attr = { .attr = { .name = "auto", .mode = 0664 }, .show = weighted_interleave_auto_show, .store = weighted_interleave_auto_store, }; static void wi_cleanup(void) { sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr); sysfs_wi_node_delete_all(); wi_state_free(); } static void wi_kobj_release(struct kobject *wi_kobj) { kfree(wi_group); } static const struct kobj_type wi_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = wi_kobj_release, }; static int sysfs_wi_node_add(int nid) { int ret; char *name; struct iw_node_attr *new_attr; if (nid < 0 || nid >= nr_node_ids) { pr_err("invalid node id: %d\n", nid); return -EINVAL; } new_attr = kzalloc_obj(*new_attr); if (!new_attr) return -ENOMEM; name = kasprintf(GFP_KERNEL, "node%d", nid); if (!name) { kfree(new_attr); return -ENOMEM; } sysfs_attr_init(&new_attr->kobj_attr.attr); new_attr->kobj_attr.attr.name = name; 
new_attr->kobj_attr.attr.mode = 0644; new_attr->kobj_attr.show = node_show; new_attr->kobj_attr.store = node_store; new_attr->nid = nid; mutex_lock(&wi_group->kobj_lock); if (wi_group->nattrs[nid]) { mutex_unlock(&wi_group->kobj_lock); ret = -EEXIST; goto out; } ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr); if (ret) { mutex_unlock(&wi_group->kobj_lock); goto out; } wi_group->nattrs[nid] = new_attr; mutex_unlock(&wi_group->kobj_lock); return 0; out: kfree(new_attr->kobj_attr.attr.name); kfree(new_attr); return ret; } static int wi_node_notifier(struct notifier_block *nb, unsigned long action, void *data) { int err; struct node_notify *nn = data; int nid = nn->nid; switch (action) { case NODE_ADDED_FIRST_MEMORY: err = sysfs_wi_node_add(nid); if (err) pr_err("failed to add sysfs for node%d during hotplug: %d\n", nid, err); break; case NODE_REMOVED_LAST_MEMORY: sysfs_wi_node_delete(nid); break; } return NOTIFY_OK; } static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj) { int nid, err; wi_group = kzalloc_flex(*wi_group, nattrs, nr_node_ids); if (!wi_group) return -ENOMEM; mutex_init(&wi_group->kobj_lock); err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj, "weighted_interleave"); if (err) goto err_put_kobj; err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr); if (err) goto err_put_kobj; for_each_online_node(nid) { if (!node_state(nid, N_MEMORY)) continue; err = sysfs_wi_node_add(nid); if (err) { pr_err("failed to add sysfs for node%d during init: %d\n", nid, err); goto err_cleanup_kobj; } } hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI); return 0; err_cleanup_kobj: wi_cleanup(); kobject_del(&wi_group->wi_kobj); err_put_kobj: kobject_put(&wi_group->wi_kobj); return err; } static int __init mempolicy_sysfs_init(void) { int err; static struct kobject *mempolicy_kobj; mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj); if (!mempolicy_kobj) return -ENOMEM; err = add_weighted_interleave_group(mempolicy_kobj); if (err) goto err_kobj; return 0; err_kobj: kobject_del(mempolicy_kobj); kobject_put(mempolicy_kobj); return err; } late_initcall(mempolicy_sysfs_init); #endif /* CONFIG_SYSFS */ |
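/*
 * Illustrative end-to-end sketch (hypothetical, not part of the original
 * source): tuning the weighted interleave knobs exposed above and then
 * selecting the policy from user space.
 *
 *	# echo 4 > /sys/kernel/mm/mempolicy/weighted_interleave/node0
 *	# echo 1 > /sys/kernel/mm/mempolicy/weighted_interleave/node1
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *	if (set_mempolicy(MPOL_WEIGHTED_INTERLEAVE, &mask,
 *			  sizeof(mask) * 8 + 1))
 *		perror("set_mempolicy");
 *
 * With those weights the task's allocations are spread roughly 4 pages
 * from node 0 for every 1 page from node 1.
 */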
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/ipc/sem.c
 * Copyright (C) 1992 Krishna Balasubramanian
 * Copyright (C) 1995 Eric Schenk, Bruno Haible
 *
 * /proc/sysvipc/sem support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
 *
 * SMP-threaded, sysctl's added
 * (c) 1999 Manfred Spraul <manfred@colorfullife.com>
 * Enforced range limit on SEM_UNDO
 * (c) 2001 Red Hat Inc
 * Lockless wakeup
 * (c) 2003 Manfred Spraul <manfred@colorfullife.com>
 * (c) 2016 Davidlohr Bueso <dave@stgolabs.net>
 * Further wakeup optimizations, documentation
 * (c) 2010 Manfred Spraul <manfred@colorfullife.com>
 *
 * support for audit of ipc object properties and permission changes
 * Dustin Kirkland <dustin.kirkland@us.ibm.com>
 *
 * namespaces support
 * OpenVZ, SWsoft Inc.
 * Pavel Emelianov <xemul@openvz.org>
 *
 * Implementation notes: (May 2010)
 * This file implements System V semaphores.
 *
 * User space visible behavior:
 * - FIFO ordering for semop() operations (just FIFO, not starvation
 *   protection)
 * - multiple semaphore operations that alter the same semaphore in
 *   one semop() are handled.
 * - sem_ctime (time of last semctl()) is updated in the IPC_SET, SETVAL and
 *   SETALL calls.
 * - two Linux specific semctl() commands: SEM_STAT, SEM_INFO.
 * - undo adjustments at process exit are limited to 0..SEMVMX.
 * - namespaces are supported.
 * - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtime by writing
 *   to /proc/sys/kernel/sem.
 * - statistics about the usage are reported in /proc/sysvipc/sem.
 *
 * Internals:
 * - scalability:
 *   - all global variables are read-mostly.
 *   - semop() calls and semctl(RMID) are synchronized by RCU.
 *   - most operations do write operations (actually: spin_lock calls) to
 *     the per-semaphore array structure.
 *     Thus: Perfect SMP scaling between independent semaphore arrays.
 *     If multiple semaphores in one array are used, then cache line
 *     thrashing on the semaphore array spinlock will limit the scaling.
 * - semncnt and semzcnt are calculated on demand in count_semcnt()
 * - the task that performs a successful semop() scans the list of all
 *   sleeping tasks and completes any pending operations that can be fulfilled.
 *   Semaphores are actively given to waiting tasks (necessary for FIFO).
 *   (see update_queue())
 * - To improve the scalability, the actual wake-up calls are performed after
 *   dropping all locks. (see wake_up_sem_queue_prepare())
 * - All work is done by the waker, the woken up task does not have to do
 *   anything - not even acquiring a lock or dropping a refcount.
 * - A woken up task may not even touch the semaphore array anymore, it may
 *   have been destroyed already by a semctl(RMID).
 * - UNDO values are stored in an array (one per process and per
 *   semaphore array, lazily allocated). For backwards compatibility, multiple
 *   modes for the UNDO variables are supported (per process, per thread)
 *   (see copy_semundo, CLONE_SYSVSEM)
 * - There are two lists of the pending operations: a per-array list
 *   and per-semaphore list (stored in the array). This allows to achieve FIFO
 *   ordering without always scanning all pending operations.
 *   The worst-case behavior is nevertheless O(N^2) for N wakeups.
 */

#include <linux/compat.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/time.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/seq_file.h>
#include <linux/rwsem.h>
#include <linux/nsproxy.h>
#include <linux/ipc_namespace.h>
#include <linux/sched/wake_q.h>
#include <linux/nospec.h>
#include <linux/rhashtable.h>

#include <linux/uaccess.h>
#include "util.h"

/* One semaphore structure for each semaphore in the system. */
struct sem {
        int semval;                     /* current value */
        /*
         * PID of the process that last modified the semaphore. For
         * Linux, specifically these are:
         *  - semop
         *  - semctl, via SETVAL and SETALL.
         *  - at task exit when performing undo adjustments (see exit_sem).
         */
        struct pid *sempid;
        spinlock_t lock;                /* spinlock for fine-grained semtimedop */
        struct list_head pending_alter; /* pending single-sop operations */
                                        /* that alter the semaphore */
        struct list_head pending_const; /* pending single-sop operations */
                                        /* that do not alter the semaphore*/
        time64_t sem_otime;             /* candidate for sem_otime */
} ____cacheline_aligned_in_smp;

/* One sem_array data structure for each set of semaphores in the system. */
struct sem_array {
        struct kern_ipc_perm sem_perm;  /* permissions .. see ipc.h */
        time64_t sem_ctime;             /* create/last semctl() time */
        struct list_head pending_alter; /* pending operations */
                                        /* that alter the array */
        struct list_head pending_const; /* pending complex operations */
                                        /* that do not alter semvals */
        struct list_head list_id;       /* undo requests on this array */
        int sem_nsems;                  /* no. of semaphores in array */
        int complex_count;              /* pending complex operations */
        unsigned int use_global_lock;   /* >0: global lock required */

        struct sem sems[];
} __randomize_layout;

/* One queue for each sleeping process in the system. */
struct sem_queue {
        struct list_head list;          /* queue of pending operations */
        struct task_struct *sleeper;    /* this process */
        struct sem_undo *undo;          /* undo structure */
        struct pid *pid;                /* process id of requesting process */
        int status;                     /* completion status of operation */
        struct sembuf *sops;            /* array of pending operations */
        struct sembuf *blocking;        /* the operation that blocked */
        int nsops;                      /* number of operations */
        bool alter;                     /* does *sops alter the array? */
        bool dupsop;                    /* sops on more than one sem_num */
};

/* Each task has a list of undo requests. They are executed automatically
 * when the process exits.
 */
struct sem_undo {
        struct list_head list_proc;     /* per-process list:
                                         * all undos from one process
                                         * rcu protected */
        struct rcu_head rcu;            /* rcu struct for sem_undo */
        struct sem_undo_list *ulp;      /* back ptr to sem_undo_list */
        struct list_head list_id;       /* per semaphore array list:
                                         * all undos for one array */
        int semid;                      /* semaphore set identifier */
        short semadj[];                 /* array of adjustments */
                                        /* one per semaphore */
};

/* sem_undo_list controls shared access to the list of sem_undo structures
 * that may be shared among all tasks in a CLONE_SYSVSEM task group.
*/ struct sem_undo_list { refcount_t refcnt; spinlock_t lock; struct list_head list_proc; }; #define sem_ids(ns) ((ns)->ids[IPC_SEM_IDS]) static int newary(struct ipc_namespace *, struct ipc_params *); static void freeary(struct ipc_namespace *, struct kern_ipc_perm *); #ifdef CONFIG_PROC_FS static int sysvipc_sem_proc_show(struct seq_file *s, void *it); #endif #define SEMMSL_FAST 256 /* 512 bytes on stack */ #define SEMOPM_FAST 64 /* ~ 372 bytes on stack */ /* * Switching from the mode suitable for simple ops * to the mode for complex ops is costly. Therefore: * use some hysteresis */ #define USE_GLOBAL_LOCK_HYSTERESIS 10 /* * Locking: * a) global sem_lock() for read/write * sem_undo.id_next, * sem_array.complex_count, * sem_array.pending{_alter,_const}, * sem_array.sem_undo * * b) global or semaphore sem_lock() for read/write: * sem_array.sems[i].pending_{const,alter}: * * c) special: * sem_undo_list.list_proc: * * undo_list->lock for write * * rcu for read * use_global_lock: * * global sem_lock() for write * * either local or global sem_lock() for read. * * Memory ordering: * Most ordering is enforced by using spin_lock() and spin_unlock(). * * Exceptions: * 1) use_global_lock: (SEM_BARRIER_1) * Setting it from non-zero to 0 is a RELEASE, this is ensured by * using smp_store_release(): Immediately after setting it to 0, * a simple op can start. * Testing if it is non-zero is an ACQUIRE, this is ensured by using * smp_load_acquire(). * Setting it from 0 to non-zero must be ordered with regards to * this smp_load_acquire(), this is guaranteed because the smp_load_acquire() * is inside a spin_lock() and after a write from 0 to non-zero a * spin_lock()+spin_unlock() is done. * To prevent the compiler/cpu temporarily writing 0 to use_global_lock, * READ_ONCE()/WRITE_ONCE() is used. * * 2) queue.status: (SEM_BARRIER_2) * Initialization is done while holding sem_lock(), so no further barrier is * required. * Setting it to a result code is a RELEASE, this is ensured by both a * smp_store_release() (for case a) and while holding sem_lock() * (for case b). * The ACQUIRE when reading the result code without holding sem_lock() is * achieved by using READ_ONCE() + smp_acquire__after_ctrl_dep(). * (case a above). * Reading the result code while holding sem_lock() needs no further barriers, * the locks inside sem_lock() enforce ordering (case b above) * * 3) current->state: * current->state is set to TASK_INTERRUPTIBLE while holding sem_lock(). * The wakeup is handled using the wake_q infrastructure. wake_q wakeups may * happen immediately after calling wake_q_add. As wake_q_add_safe() is called * when holding sem_lock(), no further barriers are required. * * See also ipc/mqueue.c for more details on the covered races. */ #define sc_semmsl sem_ctls[0] #define sc_semmns sem_ctls[1] #define sc_semopm sem_ctls[2] #define sc_semmni sem_ctls[3] void sem_init_ns(struct ipc_namespace *ns) { ns->sc_semmsl = SEMMSL; ns->sc_semmns = SEMMNS; ns->sc_semopm = SEMOPM; ns->sc_semmni = SEMMNI; ns->used_sems = 0; ipc_init_ids(&ns->ids[IPC_SEM_IDS]); } #ifdef CONFIG_IPC_NS void sem_exit_ns(struct ipc_namespace *ns) { free_ipcs(ns, &sem_ids(ns), freeary); idr_destroy(&ns->ids[IPC_SEM_IDS].ipcs_idr); rhashtable_destroy(&ns->ids[IPC_SEM_IDS].key_ht); } #endif void __init sem_init(void) { sem_init_ns(&init_ipc_ns); ipc_init_proc_interface("sysvipc/sem", " key semid perms nsems uid gid cuid cgid otime ctime\n", IPC_SEM_IDS, sysvipc_sem_proc_show); } /** * unmerge_queues - unmerge queues, if possible. 
* @sma: semaphore array * * The function unmerges the wait queues if complex_count is 0. * It must be called prior to dropping the global semaphore array lock. */ static void unmerge_queues(struct sem_array *sma) { struct sem_queue *q, *tq; /* complex operations still around? */ if (sma->complex_count) return; /* * We will switch back to simple mode. * Move all pending operation back into the per-semaphore * queues. */ list_for_each_entry_safe(q, tq, &sma->pending_alter, list) { struct sem *curr; curr = &sma->sems[q->sops[0].sem_num]; list_add_tail(&q->list, &curr->pending_alter); } INIT_LIST_HEAD(&sma->pending_alter); } /** * merge_queues - merge single semop queues into global queue * @sma: semaphore array * * This function merges all per-semaphore queues into the global queue. * It is necessary to achieve FIFO ordering for the pending single-sop * operations when a multi-semop operation must sleep. * Only the alter operations must be moved, the const operations can stay. */ static void merge_queues(struct sem_array *sma) { int i; for (i = 0; i < sma->sem_nsems; i++) { struct sem *sem = &sma->sems[i]; list_splice_init(&sem->pending_alter, &sma->pending_alter); } } static void sem_rcu_free(struct rcu_head *head) { struct kern_ipc_perm *p = container_of(head, struct kern_ipc_perm, rcu); struct sem_array *sma = container_of(p, struct sem_array, sem_perm); security_sem_free(&sma->sem_perm); kvfree(sma); } /* * Enter the mode suitable for non-simple operations: * Caller must own sem_perm.lock. */ static void complexmode_enter(struct sem_array *sma) { int i; struct sem *sem; if (sma->use_global_lock > 0) { /* * We are already in global lock mode. * Nothing to do, just reset the * counter until we return to simple mode. */ WRITE_ONCE(sma->use_global_lock, USE_GLOBAL_LOCK_HYSTERESIS); return; } WRITE_ONCE(sma->use_global_lock, USE_GLOBAL_LOCK_HYSTERESIS); for (i = 0; i < sma->sem_nsems; i++) { sem = &sma->sems[i]; spin_lock(&sem->lock); spin_unlock(&sem->lock); } } /* * Try to leave the mode that disallows simple operations: * Caller must own sem_perm.lock. */ static void complexmode_tryleave(struct sem_array *sma) { if (sma->complex_count) { /* Complex ops are sleeping. * We must stay in complex mode */ return; } if (sma->use_global_lock == 1) { /* See SEM_BARRIER_1 for purpose/pairing */ smp_store_release(&sma->use_global_lock, 0); } else { WRITE_ONCE(sma->use_global_lock, sma->use_global_lock-1); } } #define SEM_GLOBAL_LOCK (-1) /* * If the request contains only one semaphore operation, and there are * no complex transactions pending, lock only the semaphore involved. * Otherwise, lock the entire semaphore array, since we either have * multiple semaphores in our own semops, or we need to look at * semaphores from other pending complex operations. */ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, int nsops) { struct sem *sem; int idx; if (nsops != 1) { /* Complex operation - acquire a full lock */ ipc_lock_object(&sma->sem_perm); /* Prevent parallel simple ops */ complexmode_enter(sma); return SEM_GLOBAL_LOCK; } /* * Only one semaphore affected - try to optimize locking. * Optimized locking is possible if no complex operation * is either enqueued or processed right now. * * Both facts are tracked by use_global_mode. */ idx = array_index_nospec(sops->sem_num, sma->sem_nsems); sem = &sma->sems[idx]; /* * Initial check for use_global_lock. Just an optimization, * no locking, no memory barrier. 
*/ if (!READ_ONCE(sma->use_global_lock)) { /* * It appears that no complex operation is around. * Acquire the per-semaphore lock. */ spin_lock(&sem->lock); /* see SEM_BARRIER_1 for purpose/pairing */ if (!smp_load_acquire(&sma->use_global_lock)) { /* fast path successful! */ return sops->sem_num; } spin_unlock(&sem->lock); } /* slow path: acquire the full lock */ ipc_lock_object(&sma->sem_perm); if (sma->use_global_lock == 0) { /* * The use_global_lock mode ended while we waited for * sma->sem_perm.lock. Thus we must switch to locking * with sem->lock. * Unlike in the fast path, there is no need to recheck * sma->use_global_lock after we have acquired sem->lock: * We own sma->sem_perm.lock, thus use_global_lock cannot * change. */ spin_lock(&sem->lock); ipc_unlock_object(&sma->sem_perm); return sops->sem_num; } else { /* * Not a false alarm, thus continue to use the global lock * mode. No need for complexmode_enter(), this was done by * the caller that has set use_global_mode to non-zero. */ return SEM_GLOBAL_LOCK; } } static inline void sem_unlock(struct sem_array *sma, int locknum) { if (locknum == SEM_GLOBAL_LOCK) { unmerge_queues(sma); complexmode_tryleave(sma); ipc_unlock_object(&sma->sem_perm); } else { struct sem *sem = &sma->sems[locknum]; spin_unlock(&sem->lock); } } /* * sem_lock_(check_) routines are called in the paths where the rwsem * is not held. * * The caller holds the RCU read lock. */ static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id) { struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&sem_ids(ns), id); if (IS_ERR(ipcp)) return ERR_CAST(ipcp); return container_of(ipcp, struct sem_array, sem_perm); } static inline struct sem_array *sem_obtain_object_check(struct ipc_namespace *ns, int id) { struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&sem_ids(ns), id); if (IS_ERR(ipcp)) return ERR_CAST(ipcp); return container_of(ipcp, struct sem_array, sem_perm); } static inline void sem_lock_and_putref(struct sem_array *sma) { sem_lock(sma, NULL, -1); ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); } static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) { ipc_rmid(&sem_ids(ns), &s->sem_perm); } static struct sem_array *sem_alloc(size_t nsems) { struct sem_array *sma; if (nsems > (INT_MAX - sizeof(*sma)) / sizeof(sma->sems[0])) return NULL; sma = kvzalloc_flex(*sma, sems, nsems, GFP_KERNEL_ACCOUNT); if (unlikely(!sma)) return NULL; return sma; } /** * newary - Create a new semaphore set * @ns: namespace * @params: ptr to the structure that contains key, semflg and nsems * * Called with sem_ids.rwsem held (as a writer) */ static int newary(struct ipc_namespace *ns, struct ipc_params *params) { int retval; struct sem_array *sma; key_t key = params->key; int nsems = params->u.nsems; int semflg = params->flg; int i; if (!nsems) return -EINVAL; if (ns->used_sems + nsems > ns->sc_semmns) return -ENOSPC; sma = sem_alloc(nsems); if (!sma) return -ENOMEM; sma->sem_perm.mode = (semflg & S_IRWXUGO); sma->sem_perm.key = key; sma->sem_perm.security = NULL; retval = security_sem_alloc(&sma->sem_perm); if (retval) { kvfree(sma); return retval; } for (i = 0; i < nsems; i++) { INIT_LIST_HEAD(&sma->sems[i].pending_alter); INIT_LIST_HEAD(&sma->sems[i].pending_const); spin_lock_init(&sma->sems[i].lock); } sma->complex_count = 0; sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS; INIT_LIST_HEAD(&sma->pending_alter); INIT_LIST_HEAD(&sma->pending_const); INIT_LIST_HEAD(&sma->list_id); sma->sem_nsems = nsems; sma->sem_ctime = 
ktime_get_real_seconds(); /* ipc_addid() locks sma upon success. */ retval = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni); if (retval < 0) { ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); return retval; } ns->used_sems += nsems; sem_unlock(sma, -1); rcu_read_unlock(); return sma->sem_perm.id; } /* * Called with sem_ids.rwsem and ipcp locked. */ static int sem_more_checks(struct kern_ipc_perm *ipcp, struct ipc_params *params) { struct sem_array *sma; sma = container_of(ipcp, struct sem_array, sem_perm); if (params->u.nsems > sma->sem_nsems) return -EINVAL; return 0; } long ksys_semget(key_t key, int nsems, int semflg) { struct ipc_namespace *ns; static const struct ipc_ops sem_ops = { .getnew = newary, .associate = security_sem_associate, .more_checks = sem_more_checks, }; struct ipc_params sem_params; ns = current->nsproxy->ipc_ns; if (nsems < 0 || nsems > ns->sc_semmsl) return -EINVAL; sem_params.key = key; sem_params.flg = semflg; sem_params.u.nsems = nsems; return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params); } SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg) { return ksys_semget(key, nsems, semflg); } /** * perform_atomic_semop[_slow] - Attempt to perform semaphore * operations on a given array. * @sma: semaphore array * @q: struct sem_queue that describes the operation * * Caller blocking are as follows, based the value * indicated by the semaphore operation (sem_op): * * (1) >0 never blocks. * (2) 0 (wait-for-zero operation): semval is non-zero. * (3) <0 attempting to decrement semval to a value smaller than zero. * * Returns 0 if the operation was possible. * Returns 1 if the operation is impossible, the caller must sleep. * Returns <0 for error codes. */ static int perform_atomic_semop_slow(struct sem_array *sma, struct sem_queue *q) { int result, sem_op, nsops; struct pid *pid; struct sembuf *sop; struct sem *curr; struct sembuf *sops; struct sem_undo *un; sops = q->sops; nsops = q->nsops; un = q->undo; for (sop = sops; sop < sops + nsops; sop++) { int idx = array_index_nospec(sop->sem_num, sma->sem_nsems); curr = &sma->sems[idx]; sem_op = sop->sem_op; result = curr->semval; if (!sem_op && result) goto would_block; result += sem_op; if (result < 0) goto would_block; if (result > SEMVMX) goto out_of_range; if (sop->sem_flg & SEM_UNDO) { int undo = un->semadj[sop->sem_num] - sem_op; /* Exceeding the undo range is an error. */ if (undo < (-SEMAEM - 1) || undo > SEMAEM) goto out_of_range; un->semadj[sop->sem_num] = undo; } curr->semval = result; } sop--; pid = q->pid; while (sop >= sops) { ipc_update_pid(&sma->sems[sop->sem_num].sempid, pid); sop--; } return 0; out_of_range: result = -ERANGE; goto undo; would_block: q->blocking = sop; if (sop->sem_flg & IPC_NOWAIT) result = -EAGAIN; else result = 1; undo: sop--; while (sop >= sops) { sem_op = sop->sem_op; sma->sems[sop->sem_num].semval -= sem_op; if (sop->sem_flg & SEM_UNDO) un->semadj[sop->sem_num] += sem_op; sop--; } return result; } static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q) { int result, sem_op, nsops; struct sembuf *sop; struct sem *curr; struct sembuf *sops; struct sem_undo *un; sops = q->sops; nsops = q->nsops; un = q->undo; if (unlikely(q->dupsop)) return perform_atomic_semop_slow(sma, q); /* * We scan the semaphore set twice, first to ensure that the entire * operation can succeed, therefore avoiding any pointless writes * to shared memory and having to undo such changes in order to block * until the operations can go through. 
*/ for (sop = sops; sop < sops + nsops; sop++) { int idx = array_index_nospec(sop->sem_num, sma->sem_nsems); curr = &sma->sems[idx]; sem_op = sop->sem_op; result = curr->semval; if (!sem_op && result) goto would_block; /* wait-for-zero */ result += sem_op; if (result < 0) goto would_block; if (result > SEMVMX) return -ERANGE; if (sop->sem_flg & SEM_UNDO) { int undo = un->semadj[sop->sem_num] - sem_op; /* Exceeding the undo range is an error. */ if (undo < (-SEMAEM - 1) || undo > SEMAEM) return -ERANGE; } } for (sop = sops; sop < sops + nsops; sop++) { curr = &sma->sems[sop->sem_num]; sem_op = sop->sem_op; if (sop->sem_flg & SEM_UNDO) { int undo = un->semadj[sop->sem_num] - sem_op; un->semadj[sop->sem_num] = undo; } curr->semval += sem_op; ipc_update_pid(&curr->sempid, q->pid); } return 0; would_block: q->blocking = sop; return sop->sem_flg & IPC_NOWAIT ? -EAGAIN : 1; } static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error, struct wake_q_head *wake_q) { struct task_struct *sleeper; sleeper = get_task_struct(q->sleeper); /* see SEM_BARRIER_2 for purpose/pairing */ smp_store_release(&q->status, error); wake_q_add_safe(wake_q, sleeper); } static void unlink_queue(struct sem_array *sma, struct sem_queue *q) { list_del(&q->list); if (q->nsops > 1) sma->complex_count--; } /** check_restart(sma, q) * @sma: semaphore array * @q: the operation that just completed * * update_queue is O(N^2) when it restarts scanning the whole queue of * waiting operations. Therefore this function checks if the restart is * really necessary. It is called after a previously waiting operation * modified the array. * Note that wait-for-zero operations are handled without restart. */ static inline int check_restart(struct sem_array *sma, struct sem_queue *q) { /* pending complex alter operations are too difficult to analyse */ if (!list_empty(&sma->pending_alter)) return 1; /* we were a sleeping complex operation. Too difficult */ if (q->nsops > 1) return 1; /* It is impossible that someone waits for the new value: * - complex operations always restart. * - wait-for-zero are handled separately. * - q is a previously sleeping simple operation that * altered the array. It must be a decrement, because * simple increments never sleep. * - If there are older (higher priority) decrements * in the queue, then they have observed the original * semval value and couldn't proceed. The operation * decremented to value - thus they won't proceed either. */ return 0; } /** * wake_const_ops - wake up non-alter tasks * @sma: semaphore array. * @semnum: semaphore that was modified. * @wake_q: lockless wake-queue head. * * wake_const_ops must be called after a semaphore in a semaphore array * was set to 0. If complex const operations are pending, wake_const_ops must * be called with semnum = -1, as well as with the number of each modified * semaphore. * The tasks that must be woken up are added to @wake_q. The return code * is stored in q->pid. * The function returns 1 if at least one operation was completed successfully. 
*/ static int wake_const_ops(struct sem_array *sma, int semnum, struct wake_q_head *wake_q) { struct sem_queue *q, *tmp; struct list_head *pending_list; int semop_completed = 0; if (semnum == -1) pending_list = &sma->pending_const; else pending_list = &sma->sems[semnum].pending_const; list_for_each_entry_safe(q, tmp, pending_list, list) { int error = perform_atomic_semop(sma, q); if (error > 0) continue; /* operation completed, remove from queue & wakeup */ unlink_queue(sma, q); wake_up_sem_queue_prepare(q, error, wake_q); if (error == 0) semop_completed = 1; } return semop_completed; } /** * do_smart_wakeup_zero - wakeup all wait for zero tasks * @sma: semaphore array * @sops: operations that were performed * @nsops: number of operations * @wake_q: lockless wake-queue head * * Checks all required queue for wait-for-zero operations, based * on the actual changes that were performed on the semaphore array. * The function returns 1 if at least one operation was completed successfully. */ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, int nsops, struct wake_q_head *wake_q) { int i; int semop_completed = 0; int got_zero = 0; /* first: the per-semaphore queues, if known */ if (sops) { for (i = 0; i < nsops; i++) { int num = sops[i].sem_num; if (sma->sems[num].semval == 0) { got_zero = 1; semop_completed |= wake_const_ops(sma, num, wake_q); } } } else { /* * No sops means modified semaphores not known. * Assume all were changed. */ for (i = 0; i < sma->sem_nsems; i++) { if (sma->sems[i].semval == 0) { got_zero = 1; semop_completed |= wake_const_ops(sma, i, wake_q); } } } /* * If one of the modified semaphores got 0, * then check the global queue, too. */ if (got_zero) semop_completed |= wake_const_ops(sma, -1, wake_q); return semop_completed; } /** * update_queue - look for tasks that can be completed. * @sma: semaphore array. * @semnum: semaphore that was modified. * @wake_q: lockless wake-queue head. * * update_queue must be called after a semaphore in a semaphore array * was modified. If multiple semaphores were modified, update_queue must * be called with semnum = -1, as well as with the number of each modified * semaphore. * The tasks that must be woken up are added to @wake_q. The return code * is stored in q->pid. * The function internally checks if const operations can now succeed. * * The function return 1 if at least one semop was completed successfully. */ static int update_queue(struct sem_array *sma, int semnum, struct wake_q_head *wake_q) { struct sem_queue *q, *tmp; struct list_head *pending_list; int semop_completed = 0; if (semnum == -1) pending_list = &sma->pending_alter; else pending_list = &sma->sems[semnum].pending_alter; again: list_for_each_entry_safe(q, tmp, pending_list, list) { int error, restart; /* If we are scanning the single sop, per-semaphore list of * one semaphore and that semaphore is 0, then it is not * necessary to scan further: simple increments * that affect only one entry succeed immediately and cannot * be in the per semaphore pending queue, and decrements * cannot be successful if the value is already 0. */ if (semnum != -1 && sma->sems[semnum].semval == 0) break; error = perform_atomic_semop(sma, q); /* Does q->sleeper still need to sleep? 
*/ if (error > 0) continue; unlink_queue(sma, q); if (error) { restart = 0; } else { semop_completed = 1; do_smart_wakeup_zero(sma, q->sops, q->nsops, wake_q); restart = check_restart(sma, q); } wake_up_sem_queue_prepare(q, error, wake_q); if (restart) goto again; } return semop_completed; } /** * set_semotime - set sem_otime * @sma: semaphore array * @sops: operations that modified the array, may be NULL * * sem_otime is replicated to avoid cache line trashing. * This function sets one instance to the current time. */ static void set_semotime(struct sem_array *sma, struct sembuf *sops) { if (sops == NULL) { sma->sems[0].sem_otime = ktime_get_real_seconds(); } else { sma->sems[sops[0].sem_num].sem_otime = ktime_get_real_seconds(); } } /** * do_smart_update - optimized update_queue * @sma: semaphore array * @sops: operations that were performed * @nsops: number of operations * @otime: force setting otime * @wake_q: lockless wake-queue head * * do_smart_update() does the required calls to update_queue and wakeup_zero, * based on the actual changes that were performed on the semaphore array. * Note that the function does not do the actual wake-up: the caller is * responsible for calling wake_up_q(). * It is safe to perform this call after dropping all locks. */ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops, int otime, struct wake_q_head *wake_q) { int i; otime |= do_smart_wakeup_zero(sma, sops, nsops, wake_q); if (!list_empty(&sma->pending_alter)) { /* semaphore array uses the global queue - just process it. */ otime |= update_queue(sma, -1, wake_q); } else { if (!sops) { /* * No sops, thus the modified semaphores are not * known. Check all. */ for (i = 0; i < sma->sem_nsems; i++) otime |= update_queue(sma, i, wake_q); } else { /* * Check the semaphores that were increased: * - No complex ops, thus all sleeping ops are * decrease. * - if we decreased the value, then any sleeping * semaphore ops won't be able to run: If the * previous value was too small, then the new * value will be too small, too. */ for (i = 0; i < nsops; i++) { if (sops[i].sem_op > 0) { otime |= update_queue(sma, sops[i].sem_num, wake_q); } } } } if (otime) set_semotime(sma, sops); } /* * check_qop: Test if a queued operation sleeps on the semaphore semnum */ static int check_qop(struct sem_array *sma, int semnum, struct sem_queue *q, bool count_zero) { struct sembuf *sop = q->blocking; /* * Linux always (since 0.99.10) reported a task as sleeping on all * semaphores. This violates SUS, therefore it was changed to the * standard compliant behavior. * Give the administrators a chance to notice that an application * might misbehave because it relies on the Linux behavior. */ pr_info_once("semctl(GETNCNT/GETZCNT) is since 3.16 Single Unix Specification compliant.\n" "The task %s (%d) triggered the difference, watch for misbehavior.\n", current->comm, task_pid_nr(current)); if (sop->sem_num != semnum) return 0; if (count_zero && sop->sem_op == 0) return 1; if (!count_zero && sop->sem_op < 0) return 1; return 0; } /* The following counts are associated to each semaphore: * semncnt number of tasks waiting on semval being nonzero * semzcnt number of tasks waiting on semval being zero * * Per definition, a task waits only on the semaphore of the first semop * that cannot proceed, even if additional operation would block, too. 
*/ static int count_semcnt(struct sem_array *sma, ushort semnum, bool count_zero) { struct list_head *l; struct sem_queue *q; int semcnt; semcnt = 0; /* First: check the simple operations. They are easy to evaluate */ if (count_zero) l = &sma->sems[semnum].pending_const; else l = &sma->sems[semnum].pending_alter; list_for_each_entry(q, l, list) { /* all task on a per-semaphore list sleep on exactly * that semaphore */ semcnt++; } /* Then: check the complex operations. */ list_for_each_entry(q, &sma->pending_alter, list) { semcnt += check_qop(sma, semnum, q, count_zero); } if (count_zero) { list_for_each_entry(q, &sma->pending_const, list) { semcnt += check_qop(sma, semnum, q, count_zero); } } return semcnt; } /* Free a semaphore set. freeary() is called with sem_ids.rwsem locked * as a writer and the spinlock for this semaphore set hold. sem_ids.rwsem * remains locked on exit. */ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) { struct sem_undo *un, *tu; struct sem_queue *q, *tq; struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm); int i; DEFINE_WAKE_Q(wake_q); /* Free the existing undo structures for this semaphore set. */ ipc_assert_locked_object(&sma->sem_perm); list_for_each_entry_safe(un, tu, &sma->list_id, list_id) { list_del(&un->list_id); spin_lock(&un->ulp->lock); un->semid = -1; list_del_rcu(&un->list_proc); spin_unlock(&un->ulp->lock); kvfree_rcu(un, rcu); } /* Wake up all pending processes and let them fail with EIDRM. */ list_for_each_entry_safe(q, tq, &sma->pending_const, list) { unlink_queue(sma, q); wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); } list_for_each_entry_safe(q, tq, &sma->pending_alter, list) { unlink_queue(sma, q); wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); } for (i = 0; i < sma->sem_nsems; i++) { struct sem *sem = &sma->sems[i]; list_for_each_entry_safe(q, tq, &sem->pending_const, list) { unlink_queue(sma, q); wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); } list_for_each_entry_safe(q, tq, &sem->pending_alter, list) { unlink_queue(sma, q); wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); } ipc_update_pid(&sem->sempid, NULL); } /* Remove the semaphore set from the IDR */ sem_rmid(ns, sma); sem_unlock(sma, -1); rcu_read_unlock(); wake_up_q(&wake_q); ns->used_sems -= sma->sem_nsems; ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); } static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in, int version) { switch (version) { case IPC_64: return copy_to_user(buf, in, sizeof(*in)); case IPC_OLD: { struct semid_ds out; memset(&out, 0, sizeof(out)); ipc64_perm_to_ipc_perm(&in->sem_perm, &out.sem_perm); out.sem_otime = in->sem_otime; out.sem_ctime = in->sem_ctime; out.sem_nsems = in->sem_nsems; return copy_to_user(buf, &out, sizeof(out)); } default: return -EINVAL; } } static time64_t get_semotime(struct sem_array *sma) { int i; time64_t res; res = sma->sems[0].sem_otime; for (i = 1; i < sma->sem_nsems; i++) { time64_t to = sma->sems[i].sem_otime; if (to > res) res = to; } return res; } static int semctl_stat(struct ipc_namespace *ns, int semid, int cmd, struct semid64_ds *semid64) { struct sem_array *sma; time64_t semotime; int err; memset(semid64, 0, sizeof(*semid64)); rcu_read_lock(); if (cmd == SEM_STAT || cmd == SEM_STAT_ANY) { sma = sem_obtain_object(ns, semid); if (IS_ERR(sma)) { err = PTR_ERR(sma); goto out_unlock; } } else { /* IPC_STAT */ sma = sem_obtain_object_check(ns, semid); if (IS_ERR(sma)) { err = PTR_ERR(sma); goto out_unlock; } } /* see comment for SHM_STAT_ANY */ if (cmd == 
SEM_STAT_ANY) audit_ipc_obj(&sma->sem_perm); else { err = -EACCES; if (ipcperms(ns, &sma->sem_perm, S_IRUGO)) goto out_unlock; } err = security_sem_semctl(&sma->sem_perm, cmd); if (err) goto out_unlock; ipc_lock_object(&sma->sem_perm); if (!ipc_valid_object(&sma->sem_perm)) { ipc_unlock_object(&sma->sem_perm); err = -EIDRM; goto out_unlock; } kernel_to_ipc64_perm(&sma->sem_perm, &semid64->sem_perm); semotime = get_semotime(sma); semid64->sem_otime = semotime; semid64->sem_ctime = sma->sem_ctime; #ifndef CONFIG_64BIT semid64->sem_otime_high = semotime >> 32; semid64->sem_ctime_high = sma->sem_ctime >> 32; #endif semid64->sem_nsems = sma->sem_nsems; if (cmd == IPC_STAT) { /* * As defined in SUS: * Return 0 on success */ err = 0; } else { /* * SEM_STAT and SEM_STAT_ANY (both Linux specific) * Return the full id, including the sequence number */ err = sma->sem_perm.id; } ipc_unlock_object(&sma->sem_perm); out_unlock: rcu_read_unlock(); return err; } static int semctl_info(struct ipc_namespace *ns, int semid, int cmd, void __user *p) { struct seminfo seminfo; int max_idx; int err; err = security_sem_semctl(NULL, cmd); if (err) return err; memset(&seminfo, 0, sizeof(seminfo)); seminfo.semmni = ns->sc_semmni; seminfo.semmns = ns->sc_semmns; seminfo.semmsl = ns->sc_semmsl; seminfo.semopm = ns->sc_semopm; seminfo.semvmx = SEMVMX; seminfo.semmnu = SEMMNU; seminfo.semmap = SEMMAP; seminfo.semume = SEMUME; down_read(&sem_ids(ns).rwsem); if (cmd == SEM_INFO) { seminfo.semusz = sem_ids(ns).in_use; seminfo.semaem = ns->used_sems; } else { seminfo.semusz = SEMUSZ; seminfo.semaem = SEMAEM; } max_idx = ipc_get_maxidx(&sem_ids(ns)); up_read(&sem_ids(ns).rwsem); if (copy_to_user(p, &seminfo, sizeof(struct seminfo))) return -EFAULT; return (max_idx < 0) ? 0 : max_idx; } static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, int val) { struct sem_undo *un; struct sem_array *sma; struct sem *curr; int err; DEFINE_WAKE_Q(wake_q); if (val > SEMVMX || val < 0) return -ERANGE; rcu_read_lock(); sma = sem_obtain_object_check(ns, semid); if (IS_ERR(sma)) { rcu_read_unlock(); return PTR_ERR(sma); } if (semnum < 0 || semnum >= sma->sem_nsems) { rcu_read_unlock(); return -EINVAL; } if (ipcperms(ns, &sma->sem_perm, S_IWUGO)) { rcu_read_unlock(); return -EACCES; } err = security_sem_semctl(&sma->sem_perm, SETVAL); if (err) { rcu_read_unlock(); return -EACCES; } sem_lock(sma, NULL, -1); if (!ipc_valid_object(&sma->sem_perm)) { sem_unlock(sma, -1); rcu_read_unlock(); return -EIDRM; } semnum = array_index_nospec(semnum, sma->sem_nsems); curr = &sma->sems[semnum]; ipc_assert_locked_object(&sma->sem_perm); list_for_each_entry(un, &sma->list_id, list_id) un->semadj[semnum] = 0; curr->semval = val; ipc_update_pid(&curr->sempid, task_tgid(current)); sma->sem_ctime = ktime_get_real_seconds(); /* maybe some queued-up processes were waiting for this */ do_smart_update(sma, NULL, 0, 0, &wake_q); sem_unlock(sma, -1); rcu_read_unlock(); wake_up_q(&wake_q); return 0; } static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, int cmd, void __user *p) { struct sem_array *sma; struct sem *curr; int err, nsems; ushort fast_sem_io[SEMMSL_FAST]; ushort *sem_io = fast_sem_io; DEFINE_WAKE_Q(wake_q); rcu_read_lock(); sma = sem_obtain_object_check(ns, semid); if (IS_ERR(sma)) { rcu_read_unlock(); return PTR_ERR(sma); } nsems = sma->sem_nsems; err = -EACCES; if (ipcperms(ns, &sma->sem_perm, cmd == SETALL ? 
S_IWUGO : S_IRUGO)) goto out_rcu_wakeup; err = security_sem_semctl(&sma->sem_perm, cmd); if (err) goto out_rcu_wakeup; switch (cmd) { case GETALL: { ushort __user *array = p; int i; sem_lock(sma, NULL, -1); if (!ipc_valid_object(&sma->sem_perm)) { err = -EIDRM; goto out_unlock; } if (nsems > SEMMSL_FAST) { if (!ipc_rcu_getref(&sma->sem_perm)) { err = -EIDRM; goto out_unlock; } sem_unlock(sma, -1); rcu_read_unlock(); sem_io = kvmalloc_array(nsems, sizeof(ushort), GFP_KERNEL); if (sem_io == NULL) { ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); return -ENOMEM; } rcu_read_lock(); sem_lock_and_putref(sma); if (!ipc_valid_object(&sma->sem_perm)) { err = -EIDRM; goto out_unlock; } } for (i = 0; i < sma->sem_nsems; i++) sem_io[i] = sma->sems[i].semval; sem_unlock(sma, -1); rcu_read_unlock(); err = 0; if (copy_to_user(array, sem_io, nsems*sizeof(ushort))) err = -EFAULT; goto out_free; } case SETALL: { int i; struct sem_undo *un; if (!ipc_rcu_getref(&sma->sem_perm)) { err = -EIDRM; goto out_rcu_wakeup; } rcu_read_unlock(); if (nsems > SEMMSL_FAST) { sem_io = kvmalloc_array(nsems, sizeof(ushort), GFP_KERNEL); if (sem_io == NULL) { ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); return -ENOMEM; } } if (copy_from_user(sem_io, p, nsems*sizeof(ushort))) { ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); err = -EFAULT; goto out_free; } for (i = 0; i < nsems; i++) { if (sem_io[i] > SEMVMX) { ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); err = -ERANGE; goto out_free; } } rcu_read_lock(); sem_lock_and_putref(sma); if (!ipc_valid_object(&sma->sem_perm)) { err = -EIDRM; goto out_unlock; } for (i = 0; i < nsems; i++) { sma->sems[i].semval = sem_io[i]; ipc_update_pid(&sma->sems[i].sempid, task_tgid(current)); } ipc_assert_locked_object(&sma->sem_perm); list_for_each_entry(un, &sma->list_id, list_id) { for (i = 0; i < nsems; i++) un->semadj[i] = 0; } sma->sem_ctime = ktime_get_real_seconds(); /* maybe some queued-up processes were waiting for this */ do_smart_update(sma, NULL, 0, 0, &wake_q); err = 0; goto out_unlock; } /* GETVAL, GETPID, GETNCTN, GETZCNT: fall-through */ } err = -EINVAL; if (semnum < 0 || semnum >= nsems) goto out_rcu_wakeup; sem_lock(sma, NULL, -1); if (!ipc_valid_object(&sma->sem_perm)) { err = -EIDRM; goto out_unlock; } semnum = array_index_nospec(semnum, nsems); curr = &sma->sems[semnum]; switch (cmd) { case GETVAL: err = curr->semval; goto out_unlock; case GETPID: err = pid_vnr(curr->sempid); goto out_unlock; case GETNCNT: err = count_semcnt(sma, semnum, 0); goto out_unlock; case GETZCNT: err = count_semcnt(sma, semnum, 1); goto out_unlock; } out_unlock: sem_unlock(sma, -1); out_rcu_wakeup: rcu_read_unlock(); wake_up_q(&wake_q); out_free: if (sem_io != fast_sem_io) kvfree(sem_io); return err; } static inline unsigned long copy_semid_from_user(struct semid64_ds *out, void __user *buf, int version) { switch (version) { case IPC_64: if (copy_from_user(out, buf, sizeof(*out))) return -EFAULT; return 0; case IPC_OLD: { struct semid_ds tbuf_old; if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old))) return -EFAULT; out->sem_perm.uid = tbuf_old.sem_perm.uid; out->sem_perm.gid = tbuf_old.sem_perm.gid; out->sem_perm.mode = tbuf_old.sem_perm.mode; return 0; } default: return -EINVAL; } } /* * This function handles some semctl commands which require the rwsem * to be held in write mode. * NOTE: no locks must be held, the rwsem is taken inside this function. 
*/ static int semctl_down(struct ipc_namespace *ns, int semid, int cmd, struct semid64_ds *semid64) { struct sem_array *sma; int err; struct kern_ipc_perm *ipcp; down_write(&sem_ids(ns).rwsem); rcu_read_lock(); ipcp = ipcctl_obtain_check(ns, &sem_ids(ns), semid, cmd, &semid64->sem_perm, 0); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); goto out_unlock1; } sma = container_of(ipcp, struct sem_array, sem_perm); err = security_sem_semctl(&sma->sem_perm, cmd); if (err) goto out_unlock1; switch (cmd) { case IPC_RMID: sem_lock(sma, NULL, -1); /* freeary unlocks the ipc object and rcu */ freeary(ns, ipcp); goto out_up; case IPC_SET: sem_lock(sma, NULL, -1); err = ipc_update_perm(&semid64->sem_perm, ipcp); if (err) goto out_unlock0; sma->sem_ctime = ktime_get_real_seconds(); break; default: err = -EINVAL; goto out_unlock1; } out_unlock0: sem_unlock(sma, -1); out_unlock1: rcu_read_unlock(); out_up: up_write(&sem_ids(ns).rwsem); return err; } static long ksys_semctl(int semid, int semnum, int cmd, unsigned long arg, int version) { struct ipc_namespace *ns; void __user *p = (void __user *)arg; struct semid64_ds semid64; int err; if (semid < 0) return -EINVAL; ns = current->nsproxy->ipc_ns; switch (cmd) { case IPC_INFO: case SEM_INFO: return semctl_info(ns, semid, cmd, p); case IPC_STAT: case SEM_STAT: case SEM_STAT_ANY: err = semctl_stat(ns, semid, cmd, &semid64); if (err < 0) return err; if (copy_semid_to_user(p, &semid64, version)) err = -EFAULT; return err; case GETALL: case GETVAL: case GETPID: case GETNCNT: case GETZCNT: case SETALL: return semctl_main(ns, semid, semnum, cmd, p); case SETVAL: { int val; #if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN) /* big-endian 64bit */ val = arg >> 32; #else /* 32bit or little-endian 64bit */ val = arg; #endif return semctl_setval(ns, semid, semnum, val); } case IPC_SET: if (copy_semid_from_user(&semid64, p, version)) return -EFAULT; fallthrough; case IPC_RMID: return semctl_down(ns, semid, cmd, &semid64); default: return -EINVAL; } } SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, unsigned long, arg) { return ksys_semctl(semid, semnum, cmd, arg, IPC_64); } #ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION long ksys_old_semctl(int semid, int semnum, int cmd, unsigned long arg) { int version = ipc_parse_version(&cmd); return ksys_semctl(semid, semnum, cmd, arg, version); } SYSCALL_DEFINE4(old_semctl, int, semid, int, semnum, int, cmd, unsigned long, arg) { return ksys_old_semctl(semid, semnum, cmd, arg); } #endif #ifdef CONFIG_COMPAT struct compat_semid_ds { struct compat_ipc_perm sem_perm; old_time32_t sem_otime; old_time32_t sem_ctime; compat_uptr_t sem_base; compat_uptr_t sem_pending; compat_uptr_t sem_pending_last; compat_uptr_t undo; unsigned short sem_nsems; }; static int copy_compat_semid_from_user(struct semid64_ds *out, void __user *buf, int version) { memset(out, 0, sizeof(*out)); if (version == IPC_64) { struct compat_semid64_ds __user *p = buf; return get_compat_ipc64_perm(&out->sem_perm, &p->sem_perm); } else { struct compat_semid_ds __user *p = buf; return get_compat_ipc_perm(&out->sem_perm, &p->sem_perm); } } static int copy_compat_semid_to_user(void __user *buf, struct semid64_ds *in, int version) { if (version == IPC_64) { struct compat_semid64_ds v; memset(&v, 0, sizeof(v)); to_compat_ipc64_perm(&v.sem_perm, &in->sem_perm); v.sem_otime = lower_32_bits(in->sem_otime); v.sem_otime_high = upper_32_bits(in->sem_otime); v.sem_ctime = lower_32_bits(in->sem_ctime); v.sem_ctime_high = upper_32_bits(in->sem_ctime); v.sem_nsems = in->sem_nsems; return 
copy_to_user(buf, &v, sizeof(v)); } else { struct compat_semid_ds v; memset(&v, 0, sizeof(v)); to_compat_ipc_perm(&v.sem_perm, &in->sem_perm); v.sem_otime = in->sem_otime; v.sem_ctime = in->sem_ctime; v.sem_nsems = in->sem_nsems; return copy_to_user(buf, &v, sizeof(v)); } } static long compat_ksys_semctl(int semid, int semnum, int cmd, int arg, int version) { void __user *p = compat_ptr(arg); struct ipc_namespace *ns; struct semid64_ds semid64; int err; ns = current->nsproxy->ipc_ns; if (semid < 0) return -EINVAL; switch (cmd & (~IPC_64)) { case IPC_INFO: case SEM_INFO: return semctl_info(ns, semid, cmd, p); case IPC_STAT: case SEM_STAT: case SEM_STAT_ANY: err = semctl_stat(ns, semid, cmd, &semid64); if (err < 0) return err; if (copy_compat_semid_to_user(p, &semid64, version)) err = -EFAULT; return err; case GETVAL: case GETPID: case GETNCNT: case GETZCNT: case GETALL: case SETALL: return semctl_main(ns, semid, semnum, cmd, p); case SETVAL: return semctl_setval(ns, semid, semnum, arg); case IPC_SET: if (copy_compat_semid_from_user(&semid64, p, version)) return -EFAULT; fallthrough; case IPC_RMID: return semctl_down(ns, semid, cmd, &semid64); default: return -EINVAL; } } COMPAT_SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, int, arg) { return compat_ksys_semctl(semid, semnum, cmd, arg, IPC_64); } #ifdef CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION long compat_ksys_old_semctl(int semid, int semnum, int cmd, int arg) { int version = compat_ipc_parse_version(&cmd); return compat_ksys_semctl(semid, semnum, cmd, arg, version); } COMPAT_SYSCALL_DEFINE4(old_semctl, int, semid, int, semnum, int, cmd, int, arg) { return compat_ksys_old_semctl(semid, semnum, cmd, arg); } #endif #endif /* If the task doesn't already have a undo_list, then allocate one * here. We guarantee there is only one thread using this undo list, * and current is THE ONE * * If this allocation and assignment succeeds, but later * portions of this code fail, there is no need to free the sem_undo_list. * Just let it stay associated with the task, and it'll be freed later * at exit time. * * This can block, so callers must hold no locks. */ static inline int get_undo_list(struct sem_undo_list **undo_listp) { struct sem_undo_list *undo_list; undo_list = current->sysvsem.undo_list; if (!undo_list) { undo_list = kzalloc_obj(*undo_list, GFP_KERNEL_ACCOUNT); if (undo_list == NULL) return -ENOMEM; spin_lock_init(&undo_list->lock); refcount_set(&undo_list->refcnt, 1); INIT_LIST_HEAD(&undo_list->list_proc); current->sysvsem.undo_list = undo_list; } *undo_listp = undo_list; return 0; } static struct sem_undo *__lookup_undo(struct sem_undo_list *ulp, int semid) { struct sem_undo *un; list_for_each_entry_rcu(un, &ulp->list_proc, list_proc, spin_is_locked(&ulp->lock)) { if (un->semid == semid) return un; } return NULL; } static struct sem_undo *lookup_undo(struct sem_undo_list *ulp, int semid) { struct sem_undo *un; assert_spin_locked(&ulp->lock); un = __lookup_undo(ulp, semid); if (un) { list_del_rcu(&un->list_proc); list_add_rcu(&un->list_proc, &ulp->list_proc); } return un; } /** * find_alloc_undo - lookup (and if not present create) undo array * @ns: namespace * @semid: semaphore array id * * The function looks up (and if not present creates) the undo structure. * The size of the undo structure depends on the size of the semaphore * array, thus the alloc path is not that straightforward. * Lifetime-rules: sem_undo is rcu-protected, on success, the function * performs a rcu_read_lock(). 
*/ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) { struct sem_array *sma; struct sem_undo_list *ulp; struct sem_undo *un, *new; int nsems, error; error = get_undo_list(&ulp); if (error) return ERR_PTR(error); rcu_read_lock(); spin_lock(&ulp->lock); un = lookup_undo(ulp, semid); spin_unlock(&ulp->lock); if (likely(un != NULL)) goto out; /* no undo structure around - allocate one. */ /* step 1: figure out the size of the semaphore array */ sma = sem_obtain_object_check(ns, semid); if (IS_ERR(sma)) { rcu_read_unlock(); return ERR_CAST(sma); } nsems = sma->sem_nsems; if (!ipc_rcu_getref(&sma->sem_perm)) { rcu_read_unlock(); un = ERR_PTR(-EIDRM); goto out; } rcu_read_unlock(); /* step 2: allocate new undo structure */ new = kvzalloc_flex(*new, semadj, nsems, GFP_KERNEL_ACCOUNT); if (!new) { ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); return ERR_PTR(-ENOMEM); } /* step 3: Acquire the lock on semaphore array */ rcu_read_lock(); sem_lock_and_putref(sma); if (!ipc_valid_object(&sma->sem_perm)) { sem_unlock(sma, -1); rcu_read_unlock(); kvfree(new); un = ERR_PTR(-EIDRM); goto out; } spin_lock(&ulp->lock); /* * step 4: check for races: did someone else allocate the undo struct? */ un = lookup_undo(ulp, semid); if (un) { spin_unlock(&ulp->lock); kvfree(new); goto success; } /* step 5: initialize & link new undo structure */ new->ulp = ulp; new->semid = semid; assert_spin_locked(&ulp->lock); list_add_rcu(&new->list_proc, &ulp->list_proc); ipc_assert_locked_object(&sma->sem_perm); list_add(&new->list_id, &sma->list_id); un = new; spin_unlock(&ulp->lock); success: sem_unlock(sma, -1); out: return un; } long __do_semtimedop(int semid, struct sembuf *sops, unsigned nsops, const struct timespec64 *timeout, struct ipc_namespace *ns) { int error = -EINVAL; struct sem_array *sma; struct sembuf *sop; struct sem_undo *un; int max, locknum; bool undos = false, alter = false, dupsop = false; struct sem_queue queue; unsigned long dup = 0; ktime_t expires, *exp = NULL; bool timed_out = false; if (nsops < 1 || semid < 0) return -EINVAL; if (nsops > ns->sc_semopm) return -E2BIG; if (timeout) { if (!timespec64_valid(timeout)) return -EINVAL; expires = ktime_add_safe(ktime_get(), timespec64_to_ktime(*timeout)); exp = &expires; } max = 0; for (sop = sops; sop < sops + nsops; sop++) { unsigned long mask = 1ULL << ((sop->sem_num) % BITS_PER_LONG); if (sop->sem_num >= max) max = sop->sem_num; if (sop->sem_flg & SEM_UNDO) undos = true; if (dup & mask) { /* * There was a previous alter access that appears * to have accessed the same semaphore, thus use * the dupsop logic. "appears", because the detection * can only check % BITS_PER_LONG. */ dupsop = true; } if (sop->sem_op != 0) { alter = true; dup |= mask; } } if (undos) { /* On success, find_alloc_undo takes the rcu_read_lock */ un = find_alloc_undo(ns, semid); if (IS_ERR(un)) { error = PTR_ERR(un); goto out; } } else { un = NULL; rcu_read_lock(); } sma = sem_obtain_object_check(ns, semid); if (IS_ERR(sma)) { rcu_read_unlock(); error = PTR_ERR(sma); goto out; } error = -EFBIG; if (max >= sma->sem_nsems) { rcu_read_unlock(); goto out; } error = -EACCES; if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) { rcu_read_unlock(); goto out; } error = security_sem_semop(&sma->sem_perm, sops, nsops, alter); if (error) { rcu_read_unlock(); goto out; } error = -EIDRM; locknum = sem_lock(sma, sops, nsops); /* * We eventually might perform the following check in a lockless * fashion, considering ipc_valid_object() locking constraints. 
* If nsops == 1 and there is no contention for sem_perm.lock, then * only a per-semaphore lock is held and it's OK to proceed with the * check below. More details on the fine grained locking scheme * entangled here and why it's RMID race safe on comments at sem_lock() */ if (!ipc_valid_object(&sma->sem_perm)) goto out_unlock; /* * semid identifiers are not unique - find_alloc_undo may have * allocated an undo structure, it was invalidated by an RMID * and now a new array with received the same id. Check and fail. * This case can be detected checking un->semid. The existence of * "un" itself is guaranteed by rcu. */ if (un && un->semid == -1) goto out_unlock; queue.sops = sops; queue.nsops = nsops; queue.undo = un; queue.pid = task_tgid(current); queue.alter = alter; queue.dupsop = dupsop; error = perform_atomic_semop(sma, &queue); if (error == 0) { /* non-blocking successful path */ DEFINE_WAKE_Q(wake_q); /* * If the operation was successful, then do * the required updates. */ if (alter) do_smart_update(sma, sops, nsops, 1, &wake_q); else set_semotime(sma, sops); sem_unlock(sma, locknum); rcu_read_unlock(); wake_up_q(&wake_q); goto out; } if (error < 0) /* non-blocking error path */ goto out_unlock; /* * We need to sleep on this operation, so we put the current * task into the pending queue and go to sleep. */ if (nsops == 1) { struct sem *curr; int idx = array_index_nospec(sops->sem_num, sma->sem_nsems); curr = &sma->sems[idx]; if (alter) { if (sma->complex_count) { list_add_tail(&queue.list, &sma->pending_alter); } else { list_add_tail(&queue.list, &curr->pending_alter); } } else { list_add_tail(&queue.list, &curr->pending_const); } } else { if (!sma->complex_count) merge_queues(sma); if (alter) list_add_tail(&queue.list, &sma->pending_alter); else list_add_tail(&queue.list, &sma->pending_const); sma->complex_count++; } do { /* memory ordering ensured by the lock in sem_lock() */ WRITE_ONCE(queue.status, -EINTR); queue.sleeper = current; /* memory ordering is ensured by the lock in sem_lock() */ __set_current_state(TASK_INTERRUPTIBLE); sem_unlock(sma, locknum); rcu_read_unlock(); timed_out = !schedule_hrtimeout_range(exp, current->timer_slack_ns, HRTIMER_MODE_ABS); /* * fastpath: the semop has completed, either successfully or * not, from the syscall pov, is quite irrelevant to us at this * point; we're done. * * We _do_ care, nonetheless, about being awoken by a signal or * spuriously. The queue.status is checked again in the * slowpath (aka after taking sem_lock), such that we can detect * scenarios where we were awakened externally, during the * window between wake_q_add() and wake_up_q(). */ rcu_read_lock(); error = READ_ONCE(queue.status); if (error != -EINTR) { /* see SEM_BARRIER_2 for purpose/pairing */ smp_acquire__after_ctrl_dep(); rcu_read_unlock(); goto out; } locknum = sem_lock(sma, sops, nsops); if (!ipc_valid_object(&sma->sem_perm)) goto out_unlock; /* * No necessity for any barrier: We are protect by sem_lock() */ error = READ_ONCE(queue.status); /* * If queue.status != -EINTR we are woken up by another process. * Leave without unlink_queue(), but with sem_unlock(). */ if (error != -EINTR) goto out_unlock; /* * If an interrupt occurred we have to clean up the queue. 
*/ if (timed_out) error = -EAGAIN; } while (error == -EINTR && !signal_pending(current)); /* spurious */ unlink_queue(sma, &queue); out_unlock: sem_unlock(sma, locknum); rcu_read_unlock(); out: return error; } static long do_semtimedop(int semid, struct sembuf __user *tsops, unsigned nsops, const struct timespec64 *timeout) { struct sembuf fast_sops[SEMOPM_FAST]; struct sembuf *sops = fast_sops; struct ipc_namespace *ns; int ret; ns = current->nsproxy->ipc_ns; if (nsops > ns->sc_semopm) return -E2BIG; if (nsops < 1) return -EINVAL; if (nsops > SEMOPM_FAST) { sops = kvmalloc_objs(*sops, nsops); if (sops == NULL) return -ENOMEM; } if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) { ret = -EFAULT; goto out_free; } ret = __do_semtimedop(semid, sops, nsops, timeout, ns); out_free: if (sops != fast_sops) kvfree(sops); return ret; } long ksys_semtimedop(int semid, struct sembuf __user *tsops, unsigned int nsops, const struct __kernel_timespec __user *timeout) { if (timeout) { struct timespec64 ts; if (get_timespec64(&ts, timeout)) return -EFAULT; return do_semtimedop(semid, tsops, nsops, &ts); } return do_semtimedop(semid, tsops, nsops, NULL); } SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, unsigned int, nsops, const struct __kernel_timespec __user *, timeout) { return ksys_semtimedop(semid, tsops, nsops, timeout); } #ifdef CONFIG_COMPAT_32BIT_TIME long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems, unsigned int nsops, const struct old_timespec32 __user *timeout) { if (timeout) { struct timespec64 ts; if (get_old_timespec32(&ts, timeout)) return -EFAULT; return do_semtimedop(semid, tsems, nsops, &ts); } return do_semtimedop(semid, tsems, nsops, NULL); } SYSCALL_DEFINE4(semtimedop_time32, int, semid, struct sembuf __user *, tsems, unsigned int, nsops, const struct old_timespec32 __user *, timeout) { return compat_ksys_semtimedop(semid, tsems, nsops, timeout); } #endif SYSCALL_DEFINE3(semop, int, semid, struct sembuf __user *, tsops, unsigned, nsops) { return do_semtimedop(semid, tsops, nsops, NULL); } /* If CLONE_SYSVSEM is set, establish sharing of SEM_UNDO state between * parent and child tasks. */ int copy_semundo(u64 clone_flags, struct task_struct *tsk) { struct sem_undo_list *undo_list; int error; if (clone_flags & CLONE_SYSVSEM) { error = get_undo_list(&undo_list); if (error) return error; refcount_inc(&undo_list->refcnt); tsk->sysvsem.undo_list = undo_list; } else tsk->sysvsem.undo_list = NULL; return 0; } /* * add semadj values to semaphores, free undo structures. * undo structures are not freed when semaphore arrays are destroyed * so some of them may be out of date. * IMPLEMENTATION NOTE: There is some confusion over whether the * set of adjustments that needs to be done should be done in an atomic * manner or not. That is, if we are attempting to decrement the semval * should we queue up and wait until we can do so legally? * The original implementation attempted to do this (queue and wait). * The current implementation does not do so. The POSIX standard * and SVID should be consulted to determine what behavior is mandated. 
*/ void exit_sem(struct task_struct *tsk) { struct sem_undo_list *ulp; ulp = tsk->sysvsem.undo_list; if (!ulp) return; tsk->sysvsem.undo_list = NULL; if (!refcount_dec_and_test(&ulp->refcnt)) return; for (;;) { struct sem_array *sma; struct sem_undo *un; int semid, i; DEFINE_WAKE_Q(wake_q); cond_resched(); rcu_read_lock(); un = list_entry_rcu(ulp->list_proc.next, struct sem_undo, list_proc); if (&un->list_proc == &ulp->list_proc) { /* * We must wait for freeary() before freeing this ulp, * in case we raced with last sem_undo. There is a small * possibility where we exit while freeary() didn't * finish unlocking sem_undo_list. */ spin_lock(&ulp->lock); spin_unlock(&ulp->lock); rcu_read_unlock(); break; } spin_lock(&ulp->lock); semid = un->semid; spin_unlock(&ulp->lock); /* exit_sem raced with IPC_RMID, nothing to do */ if (semid == -1) { rcu_read_unlock(); continue; } sma = sem_obtain_object_check(tsk->nsproxy->ipc_ns, semid); /* exit_sem raced with IPC_RMID, nothing to do */ if (IS_ERR(sma)) { rcu_read_unlock(); continue; } sem_lock(sma, NULL, -1); /* exit_sem raced with IPC_RMID, nothing to do */ if (!ipc_valid_object(&sma->sem_perm)) { sem_unlock(sma, -1); rcu_read_unlock(); continue; } un = __lookup_undo(ulp, semid); if (un == NULL) { /* exit_sem raced with IPC_RMID+semget() that created * exactly the same semid. Nothing to do. */ sem_unlock(sma, -1); rcu_read_unlock(); continue; } /* remove un from the linked lists */ ipc_assert_locked_object(&sma->sem_perm); list_del(&un->list_id); spin_lock(&ulp->lock); list_del_rcu(&un->list_proc); spin_unlock(&ulp->lock); /* perform adjustments registered in un */ for (i = 0; i < sma->sem_nsems; i++) { struct sem *semaphore = &sma->sems[i]; if (un->semadj[i]) { semaphore->semval += un->semadj[i]; /* * Range checks of the new semaphore value, * not defined by sus: * - Some unices ignore the undo entirely * (e.g. HP UX 11i 11.22, Tru64 V5.1) * - some cap the value (e.g. FreeBSD caps * at 0, but doesn't enforce SEMVMX) * * Linux caps the semaphore value, both at 0 * and at SEMVMX. * * Manfred <manfred@colorfullife.com> */ if (semaphore->semval < 0) semaphore->semval = 0; if (semaphore->semval > SEMVMX) semaphore->semval = SEMVMX; ipc_update_pid(&semaphore->sempid, task_tgid(current)); } } /* maybe some queued-up processes were waiting for this */ do_smart_update(sma, NULL, 0, 1, &wake_q); sem_unlock(sma, -1); rcu_read_unlock(); wake_up_q(&wake_q); kvfree_rcu(un, rcu); } kfree(ulp); } #ifdef CONFIG_PROC_FS static int sysvipc_sem_proc_show(struct seq_file *s, void *it) { struct user_namespace *user_ns = seq_user_ns(s); struct kern_ipc_perm *ipcp = it; struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm); time64_t sem_otime; /* * The proc interface isn't aware of sem_lock(), it calls * ipc_lock_object(), i.e. spin_lock(&sma->sem_perm.lock). * (in sysvipc_find_ipc) * In order to stay compatible with sem_lock(), we must * enter / leave complex_mode. */ complexmode_enter(sma); sem_otime = get_semotime(sma); seq_printf(s, "%10d %10d %4o %10u %5u %5u %5u %5u %10llu %10llu\n", sma->sem_perm.key, sma->sem_perm.id, sma->sem_perm.mode, sma->sem_nsems, from_kuid_munged(user_ns, sma->sem_perm.uid), from_kgid_munged(user_ns, sma->sem_perm.gid), from_kuid_munged(user_ns, sma->sem_perm.cuid), from_kgid_munged(user_ns, sma->sem_perm.cgid), sem_otime, sma->sem_ctime); complexmode_tryleave(sma); return 0; } #endif |
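/*
 * Illustrative userspace sketch, not part of the kernel file above: it
 * exercises the semtimedop()/SEM_UNDO path serviced by __do_semtimedop(),
 * find_alloc_undo() and exit_sem(). Error handling is kept minimal.
 */
#define _GNU_SOURCE		/* for semtimedop() on glibc */
#include <stdio.h>
#include <time.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>

/* glibc leaves the definition of union semun to the caller */
union semun { int val; struct semid_ds *buf; unsigned short *array; };

int main(void)
{
	union semun arg = { .val = 1 };
	struct sembuf op = { .sem_num = 0, .sem_op = -1, .sem_flg = SEM_UNDO };
	struct timespec timeout = { .tv_sec = 5, .tv_nsec = 0 };
	int semid;

	semid = semget(IPC_PRIVATE, 1, IPC_CREAT | 0600);
	if (semid < 0) { perror("semget"); return 1; }
	if (semctl(semid, 0, SETVAL, arg) < 0) { perror("semctl"); return 1; }

	/* P operation with SEM_UNDO: the kernel records a +1 semadj that
	 * exit_sem() would apply if this process exited here. */
	if (semtimedop(semid, &op, 1, &timeout) < 0)
		perror("semtimedop");

	/* V operation with SEM_UNDO cancels the recorded adjustment. */
	op.sem_op = 1;
	if (semop(semid, &op, 1) < 0)
		perror("semop");

	semctl(semid, 0, IPC_RMID);
	return 0;
}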
// SPDX-License-Identifier: GPL-2.0-or-later /* * CDC Ethernet based networking peripherals * Copyright (C) 2003-2005 by David Brownell * Copyright (C) 2006 by Ole Andre Vadla Ravnas (ActiveSync) */ // #define DEBUG // error path messages, extra info // #define VERBOSE // more; success messages #include <linux/module.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/workqueue.h> #include <linux/mii.h> #include <linux/usb.h> #include <linux/usb/cdc.h> #include <linux/usb/usbnet.h> #if IS_ENABLED(CONFIG_USB_NET_RNDIS_HOST) static int is_rndis(struct usb_interface_descriptor *desc) { return (desc->bInterfaceClass == USB_CLASS_COMM && desc->bInterfaceSubClass == 2 && desc->bInterfaceProtocol == 0xff); } static int is_activesync(struct usb_interface_descriptor *desc) { return (desc->bInterfaceClass == USB_CLASS_MISC && desc->bInterfaceSubClass == 1 && desc->bInterfaceProtocol == 1); } static int is_wireless_rndis(struct usb_interface_descriptor *desc) { return (desc->bInterfaceClass == USB_CLASS_WIRELESS_CONTROLLER && desc->bInterfaceSubClass == 1 && desc->bInterfaceProtocol == 3); } static int is_novatel_rndis(struct usb_interface_descriptor *desc) { return (desc->bInterfaceClass == USB_CLASS_MISC && desc->bInterfaceSubClass == 4 && desc->bInterfaceProtocol == 1); } #else #define is_rndis(desc) 0 #define is_activesync(desc) 0 #define is_wireless_rndis(desc) 0 #define is_novatel_rndis(desc) 0 #endif static const u8 mbm_guid[16] = { 0xa3, 0x17, 0xa8, 0x8b, 0x04, 0x5e, 0x4f, 0x01, 0xa6, 0x07, 0xc0, 0xff, 0xcb, 0x7e, 0x39, 0x2a, }; void usbnet_cdc_update_filter(struct usbnet *dev) { struct net_device *net = dev->net; u16 cdc_filter = USB_CDC_PACKET_TYPE_DIRECTED | USB_CDC_PACKET_TYPE_BROADCAST; /* filtering on the device is an optional feature and not worth * the hassle so we just roughly care about snooping and if any * multicast is requested, we take every multicast */ if (net->flags & IFF_PROMISC) cdc_filter |= USB_CDC_PACKET_TYPE_PROMISCUOUS; if (!netdev_mc_empty(net) || (net->flags & IFF_ALLMULTI)) cdc_filter |= USB_CDC_PACKET_TYPE_ALL_MULTICAST; usb_control_msg(dev->udev, usb_sndctrlpipe(dev->udev, 0), USB_CDC_SET_ETHERNET_PACKET_FILTER, USB_TYPE_CLASS | USB_RECIP_INTERFACE, cdc_filter, dev->intf->cur_altsetting->desc.bInterfaceNumber, NULL, 0, USB_CTRL_SET_TIMEOUT ); } EXPORT_SYMBOL_GPL(usbnet_cdc_update_filter); /* We need to override usbnet_*_link_ksettings in bind() */ static const struct ethtool_ops cdc_ether_ethtool_ops = { .get_link = usbnet_get_link, .nway_reset = usbnet_nway_reset, .get_drvinfo = usbnet_get_drvinfo, .get_msglevel = usbnet_get_msglevel, .set_msglevel = usbnet_set_msglevel, .get_ts_info = ethtool_op_get_ts_info, .get_link_ksettings = usbnet_get_link_ksettings_internal, .set_link_ksettings = NULL, }; /* probes control interface, claims data interface, collects the bulk * endpoints, activates data interface (if needed), maybe sets MTU. * all pure cdc, except for certain firmware workarounds, and knowing * that rndis uses one different rule.
*/ int usbnet_generic_cdc_bind(struct usbnet *dev, struct usb_interface *intf) { u8 *buf = intf->cur_altsetting->extra; int len = intf->cur_altsetting->extralen; struct usb_interface_descriptor *d; struct cdc_state *info = (void *) &dev->data; int status = -ENODEV; int rndis; bool android_rndis_quirk = false; struct usb_driver *driver = driver_of(intf); struct usb_cdc_parsed_header header; if (sizeof(dev->data) < sizeof(*info)) return -EDOM; /* expect strict spec conformance for the descriptors, but * cope with firmware which stores them in the wrong place */ if (len == 0 && dev->udev->actconfig->extralen) { /* Motorola SB4100 (and others: Brad Hards says it's * from a Broadcom design) put CDC descriptors here */ buf = dev->udev->actconfig->extra; len = dev->udev->actconfig->extralen; dev_dbg(&intf->dev, "CDC descriptors on config\n"); } /* Maybe CDC descriptors are after the endpoint? This bug has * been seen on some 2Wire Inc RNDIS-ish products. */ if (len == 0) { struct usb_host_endpoint *hep; hep = intf->cur_altsetting->endpoint; if (hep) { buf = hep->extra; len = hep->extralen; } if (len) dev_dbg(&intf->dev, "CDC descriptors on endpoint\n"); } /* this assumes that if there's a non-RNDIS vendor variant * of cdc-acm, it'll fail RNDIS requests cleanly. */ rndis = (is_rndis(&intf->cur_altsetting->desc) || is_activesync(&intf->cur_altsetting->desc) || is_wireless_rndis(&intf->cur_altsetting->desc) || is_novatel_rndis(&intf->cur_altsetting->desc)); memset(info, 0, sizeof(*info)); info->control = intf; cdc_parse_cdc_header(&header, intf, buf, len); info->u = header.usb_cdc_union_desc; info->header = header.usb_cdc_header_desc; info->ether = header.usb_cdc_ether_desc; if (!info->u) { if (rndis) { goto skip; } else { /* in that case a quirk is mandatory */ dev_err(&dev->udev->dev, "No union descriptors\n"); goto bad_desc; } } /* we need a master/control interface (what we're * probed with) and a slave/data interface; union * descriptors sort this all out. */ info->control = usb_ifnum_to_if(dev->udev, info->u->bMasterInterface0); info->data = usb_ifnum_to_if(dev->udev, info->u->bSlaveInterface0); if (!info->control || !info->data) { dev_dbg(&intf->dev, "master #%u/%p slave #%u/%p\n", info->u->bMasterInterface0, info->control, info->u->bSlaveInterface0, info->data); /* fall back to hard-wiring for RNDIS */ if (rndis) { android_rndis_quirk = true; goto skip; } dev_err(&intf->dev, "bad CDC descriptors\n"); goto bad_desc; } if (info->control != intf) { /* Ambit USB Cable Modem (and maybe others) * interchanges master and slave interface. */ if (info->data == intf) { info->data = info->control; info->control = intf; } else { dev_err(&intf->dev, "bogus CDC Union\n"); goto bad_desc; } } /* some devices merge these - skip class check */ if (info->control == info->data) goto skip; /* a data interface altsetting does the real i/o */ d = &info->data->cur_altsetting->desc; if (d->bInterfaceClass != USB_CLASS_CDC_DATA) { dev_err(&intf->dev, "slave class %u\n", d->bInterfaceClass); goto bad_desc; } skip: /* Communication class functions with bmCapabilities are not * RNDIS. But some Wireless class RNDIS functions use * bmCapabilities for their own purpose. The failsafe is * therefore applied only to Communication class RNDIS * functions. The rndis test is redundant, but a cheap * optimization. 
*/ if (rndis && is_rndis(&intf->cur_altsetting->desc) && header.usb_cdc_acm_descriptor && header.usb_cdc_acm_descriptor->bmCapabilities) { dev_err(&intf->dev, "ACM capabilities %02x, not really RNDIS?\n", header.usb_cdc_acm_descriptor->bmCapabilities); goto bad_desc; } if (header.usb_cdc_ether_desc && info->ether->wMaxSegmentSize) { dev->hard_mtu = le16_to_cpu(info->ether->wMaxSegmentSize); /* because of Zaurus, we may be ignoring the host * side link address we were given. */ } if (header.usb_cdc_mdlm_desc && memcmp(header.usb_cdc_mdlm_desc->bGUID, mbm_guid, 16)) { dev_err(&intf->dev, "GUID doesn't match\n"); goto bad_desc; } if (header.usb_cdc_mdlm_detail_desc && header.usb_cdc_mdlm_detail_desc->bLength < (sizeof(struct usb_cdc_mdlm_detail_desc) + 1)) { dev_err(&intf->dev, "Descriptor too short\n"); goto bad_desc; } /* Microsoft ActiveSync based and some regular RNDIS devices lack the * CDC descriptors, so we'll hard-wire the interfaces and not check * for descriptors. * * Some Android RNDIS devices have a CDC Union descriptor pointing * to non-existing interfaces. Ignore that and attempt the same * hard-wired 0 and 1 interfaces. */ if (rndis && (!info->u || android_rndis_quirk)) { info->control = usb_ifnum_to_if(dev->udev, 0); info->data = usb_ifnum_to_if(dev->udev, 1); if (!info->control || !info->data || info->control != intf) { dev_err(&intf->dev, "rndis: master #0/%p slave #1/%p\n", info->control, info->data); goto bad_desc; } } else if (!info->header || (!rndis && !info->ether)) { dev_err(&intf->dev, "missing cdc %s%s%sdescriptor\n", info->header ? "" : "header ", info->u ? "" : "union ", info->ether ? "" : "ether "); goto bad_desc; } /* claim data interface and set it up ... with side effects. * network traffic can't flow until an altsetting is enabled. */ if (info->data != info->control) { status = usb_driver_claim_interface(driver, info->data, dev); if (status < 0) { dev_err(&intf->dev, "Second interface unclaimable\n"); goto bad_desc; } } status = usbnet_get_endpoints(dev, info->data); if (status < 0) { dev_dbg(&intf->dev, "Mandatory endpoints missing\n"); goto bail_out_and_release; } /* status endpoint: optional for CDC Ethernet, not RNDIS (or ACM) */ if (info->data != info->control) dev->status = NULL; if (info->control->cur_altsetting->desc.bNumEndpoints == 1) { struct usb_endpoint_descriptor *desc; dev->status = &info->control->cur_altsetting->endpoint[0]; desc = &dev->status->desc; if (!usb_endpoint_is_int_in(desc) || (le16_to_cpu(desc->wMaxPacketSize) < sizeof(struct usb_cdc_notification)) || !desc->bInterval) { dev_dbg(&intf->dev, "bad notification endpoint\n"); dev->status = NULL; } } if (rndis && !dev->status) { dev_err(&intf->dev, "missing RNDIS status endpoint\n"); status = -ENODEV; goto bail_out_and_release; } /* override ethtool_ops */ dev->net->ethtool_ops = &cdc_ether_ethtool_ops; return 0; bail_out_and_release: usb_set_intfdata(info->data, NULL); if (info->data != info->control) usb_driver_release_interface(driver, info->data); bad_desc: return status; } EXPORT_SYMBOL_GPL(usbnet_generic_cdc_bind); /* like usbnet_generic_cdc_bind() but handles filter initialization * correctly */ int usbnet_ether_cdc_bind(struct usbnet *dev, struct usb_interface *intf) { int rv; rv = usbnet_generic_cdc_bind(dev, intf); if (rv < 0) goto bail_out; /* Some devices don't initialise properly. In particular * the packet filter is not reset. There are devices that * don't do reset all the way. So the packet filter should * be set to a sane initial value. 
*/ usbnet_cdc_update_filter(dev); bail_out: return rv; } EXPORT_SYMBOL_GPL(usbnet_ether_cdc_bind); void usbnet_cdc_unbind(struct usbnet *dev, struct usb_interface *intf) { struct cdc_state *info = (void *) &dev->data; struct usb_driver *driver = driver_of(intf); /* combined interface - nothing to do */ if (info->data == info->control) return; /* disconnect master --> disconnect slave */ if (intf == info->control && info->data) { /* ensure immediate exit from usbnet_disconnect */ usb_set_intfdata(info->data, NULL); usb_driver_release_interface(driver, info->data); info->data = NULL; } /* and vice versa (just in case) */ else if (intf == info->data && info->control) { /* ensure immediate exit from usbnet_disconnect */ usb_set_intfdata(info->control, NULL); usb_driver_release_interface(driver, info->control); info->control = NULL; } } EXPORT_SYMBOL_GPL(usbnet_cdc_unbind); /* Communications Device Class, Ethernet Control model * * Takes two interfaces. The DATA interface is inactive till an altsetting * is selected. Configuration data includes class descriptors. There's * an optional status endpoint on the control interface. * * This should interop with whatever the 2.4 "CDCEther.c" driver * (by Brad Hards) talked with, with more functionality. */ static void speed_change(struct usbnet *dev, __le32 *speeds) { dev->tx_speed = __le32_to_cpu(speeds[0]); dev->rx_speed = __le32_to_cpu(speeds[1]); } void usbnet_cdc_status(struct usbnet *dev, struct urb *urb) { struct usb_cdc_notification *event; if (urb->actual_length < sizeof(*event)) return; /* SPEED_CHANGE can get split into two 8-byte packets */ if (test_and_clear_bit(EVENT_STS_SPLIT, &dev->flags)) { speed_change(dev, (__le32 *) urb->transfer_buffer); return; } event = urb->transfer_buffer; switch (event->bNotificationType) { case USB_CDC_NOTIFY_NETWORK_CONNECTION: netif_dbg(dev, timer, dev->net, "CDC: carrier %s\n", event->wValue ? "on" : "off"); if (netif_carrier_ok(dev->net) != !!event->wValue) usbnet_link_change(dev, !!event->wValue, 0); break; case USB_CDC_NOTIFY_SPEED_CHANGE: /* tx/rx rates */ netif_dbg(dev, timer, dev->net, "CDC: speed change (len %d)\n", urb->actual_length); if (urb->actual_length != (sizeof(*event) + 8)) set_bit(EVENT_STS_SPLIT, &dev->flags); else speed_change(dev, (__le32 *) &event[1]); break; /* USB_CDC_NOTIFY_RESPONSE_AVAILABLE can happen too (e.g. RNDIS), * but there are no standard formats for the response data. 
*/ default: netdev_err(dev->net, "CDC: unexpected notification %02x!\n", event->bNotificationType); break; } } EXPORT_SYMBOL_GPL(usbnet_cdc_status); int usbnet_cdc_bind(struct usbnet *dev, struct usb_interface *intf) { int status; struct cdc_state *info = (void *) &dev->data; BUILD_BUG_ON((sizeof(((struct usbnet *)0)->data) < sizeof(struct cdc_state))); status = usbnet_ether_cdc_bind(dev, intf); if (status < 0) return status; status = usbnet_get_ethernet_addr(dev, info->ether->iMACAddress); if (status < 0) { usb_set_intfdata(info->data, NULL); usb_driver_release_interface(driver_of(intf), info->data); return status; } return 0; } EXPORT_SYMBOL_GPL(usbnet_cdc_bind); static int usbnet_cdc_zte_bind(struct usbnet *dev, struct usb_interface *intf) { int status = usbnet_cdc_bind(dev, intf); if (!status && (dev->net->dev_addr[0] & 0x02)) eth_hw_addr_random(dev->net); return status; } /* Make sure packets have correct destination MAC address * * A firmware bug observed on some devices (ZTE MF823/831/910) is that the * device sends packets with a static, bogus, random MAC address (event if * device MAC address has been updated). Always set MAC address to that of the * device. */ int usbnet_cdc_zte_rx_fixup(struct usbnet *dev, struct sk_buff *skb) { if (skb->len < ETH_HLEN || !(skb->data[0] & 0x02)) return 1; skb_reset_mac_header(skb); ether_addr_copy(eth_hdr(skb)->h_dest, dev->net->dev_addr); return 1; } EXPORT_SYMBOL_GPL(usbnet_cdc_zte_rx_fixup); /* Ensure correct link state * * Some devices (ZTE MF823/831/910) export two carrier on notifications when * connected. This causes the link state to be incorrect. Work around this by * always setting the state to off, then on. */ static void usbnet_cdc_zte_status(struct usbnet *dev, struct urb *urb) { struct usb_cdc_notification *event; if (urb->actual_length < sizeof(*event)) return; event = urb->transfer_buffer; if (event->bNotificationType != USB_CDC_NOTIFY_NETWORK_CONNECTION) { usbnet_cdc_status(dev, urb); return; } netif_dbg(dev, timer, dev->net, "CDC: carrier %s\n", event->wValue ? 
"on" : "off"); if (event->wValue && netif_carrier_ok(dev->net)) netif_carrier_off(dev->net); usbnet_link_change(dev, !!event->wValue, 0); } static const struct driver_info cdc_info = { .description = "CDC Ethernet Device", .flags = FLAG_ETHER | FLAG_POINTTOPOINT, .bind = usbnet_cdc_bind, .unbind = usbnet_cdc_unbind, .status = usbnet_cdc_status, .set_rx_mode = usbnet_cdc_update_filter, .manage_power = usbnet_manage_power, }; static const struct driver_info zte_cdc_info = { .description = "ZTE CDC Ethernet Device", .flags = FLAG_ETHER | FLAG_POINTTOPOINT, .bind = usbnet_cdc_zte_bind, .unbind = usbnet_cdc_unbind, .status = usbnet_cdc_zte_status, .set_rx_mode = usbnet_cdc_update_filter, .manage_power = usbnet_manage_power, .rx_fixup = usbnet_cdc_zte_rx_fixup, }; static const struct driver_info wwan_info = { .description = "Mobile Broadband Network Device", .flags = FLAG_WWAN, .bind = usbnet_cdc_bind, .unbind = usbnet_cdc_unbind, .status = usbnet_cdc_status, .set_rx_mode = usbnet_cdc_update_filter, .manage_power = usbnet_manage_power, }; /*-------------------------------------------------------------------------*/ #define HUAWEI_VENDOR_ID 0x12D1 #define NOVATEL_VENDOR_ID 0x1410 #define ZTE_VENDOR_ID 0x19D2 #define DELL_VENDOR_ID 0x413C #define REALTEK_VENDOR_ID 0x0bda #define SAMSUNG_VENDOR_ID 0x04e8 #define LENOVO_VENDOR_ID 0x17ef #define LINKSYS_VENDOR_ID 0x13b1 #define NVIDIA_VENDOR_ID 0x0955 #define HP_VENDOR_ID 0x03f0 #define MICROSOFT_VENDOR_ID 0x045e #define UBLOX_VENDOR_ID 0x1546 #define TPLINK_VENDOR_ID 0x2357 #define AQUANTIA_VENDOR_ID 0x2eca #define ASIX_VENDOR_ID 0x0b95 static const struct usb_device_id products[] = { /* BLACKLIST !! * * First blacklist any products that are egregiously nonconformant * with the CDC Ethernet specs. Minor braindamage we cope with; when * they're not even trying, needing a separate driver is only the first * of the differences to show up. */ #define ZAURUS_MASTER_INTERFACE \ .bInterfaceClass = USB_CLASS_COMM, \ .bInterfaceSubClass = USB_CDC_SUBCLASS_ETHERNET, \ .bInterfaceProtocol = USB_CDC_PROTO_NONE #define ZAURUS_FAKE_INTERFACE \ .bInterfaceClass = USB_CLASS_COMM, \ .bInterfaceSubClass = USB_CDC_SUBCLASS_MDLM, \ .bInterfaceProtocol = USB_CDC_PROTO_NONE /* SA-1100 based Sharp Zaurus ("collie"), or compatible; * wire-incompatible with true CDC Ethernet implementations. * (And, it seems, needlessly so...) */ { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x8004, ZAURUS_MASTER_INTERFACE, .driver_info = 0, }, /* PXA-25x based Sharp Zaurii. Note that it seems some of these * (later models especially) may have shipped only with firmware * advertising false "CDC MDLM" compatibility ... but we're not * clear which models did that, so for now let's assume the worst. 
*/ { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x8005, /* A-300 */ ZAURUS_MASTER_INTERFACE, .driver_info = 0, }, { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x8005, /* A-300 */ ZAURUS_FAKE_INTERFACE, .driver_info = 0, }, { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x8006, /* B-500/SL-5600 */ ZAURUS_MASTER_INTERFACE, .driver_info = 0, }, { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x8006, /* B-500/SL-5600 */ ZAURUS_FAKE_INTERFACE, .driver_info = 0, }, { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x8007, /* C-700 */ ZAURUS_MASTER_INTERFACE, .driver_info = 0, }, { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x8007, /* C-700 */ ZAURUS_FAKE_INTERFACE, .driver_info = 0, }, { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x9031, /* C-750 C-760 */ ZAURUS_MASTER_INTERFACE, .driver_info = 0, }, { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x9032, /* SL-6000 */ ZAURUS_MASTER_INTERFACE, .driver_info = 0, }, { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x9032, /* SL-6000 */ ZAURUS_FAKE_INTERFACE, .driver_info = 0, }, { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, /* reported with some C860 units */ .idProduct = 0x9050, /* C-860 */ ZAURUS_MASTER_INTERFACE, .driver_info = 0, }, /* Olympus has some models with a Zaurus-compatible option. 
* R-1000 uses a FreeScale i.MXL cpu (ARMv4T) */ { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x07B4, .idProduct = 0x0F02, /* R-1000 */ ZAURUS_MASTER_INTERFACE, .driver_info = 0, }, /* LG Electronics VL600 wants additional headers on every frame */ { USB_DEVICE_AND_INTERFACE_INFO(0x1004, 0x61aa, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* Logitech Harmony 900 - uses the pseudo-MDLM (BLAN) driver */ { USB_DEVICE_AND_INTERFACE_INFO(0x046d, 0xc11f, USB_CLASS_COMM, USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* Novatel USB551L and MC551 - handled by qmi_wwan */ { USB_DEVICE_AND_INTERFACE_INFO(NOVATEL_VENDOR_ID, 0xB001, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* Novatel E362 - handled by qmi_wwan */ { USB_DEVICE_AND_INTERFACE_INFO(NOVATEL_VENDOR_ID, 0x9010, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* Dell Wireless 5800 (Novatel E362) - handled by qmi_wwan */ { USB_DEVICE_AND_INTERFACE_INFO(DELL_VENDOR_ID, 0x8195, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* Dell Wireless 5800 (Novatel E362) - handled by qmi_wwan */ { USB_DEVICE_AND_INTERFACE_INFO(DELL_VENDOR_ID, 0x8196, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* Dell Wireless 5804 (Novatel E371) - handled by qmi_wwan */ { USB_DEVICE_AND_INTERFACE_INFO(DELL_VENDOR_ID, 0x819b, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* Novatel Expedite E371 - handled by qmi_wwan */ { USB_DEVICE_AND_INTERFACE_INFO(NOVATEL_VENDOR_ID, 0x9011, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* HP lt2523 (Novatel E371) - handled by qmi_wwan */ { USB_DEVICE_AND_INTERFACE_INFO(HP_VENDOR_ID, 0x421d, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* AnyDATA ADU960S - handled by qmi_wwan */ { USB_DEVICE_AND_INTERFACE_INFO(0x16d5, 0x650a, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* Huawei E1820 - handled by qmi_wwan */ { USB_DEVICE_INTERFACE_NUMBER(HUAWEI_VENDOR_ID, 0x14ac, 1), .driver_info = 0, }, /* Realtek RTL8153 Based USB 3.0 Ethernet Adapters */ { USB_DEVICE_AND_INTERFACE_INFO(REALTEK_VENDOR_ID, 0x8153, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* Lenovo Powered USB-C Travel Hub (4X90S92381, based on Realtek RTL8153) */ { USB_DEVICE_AND_INTERFACE_INFO(LENOVO_VENDOR_ID, 0x721e, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* Lenovo ThinkPad Hybrid USB-C with USB-A Dock (40af0135eu, based on Realtek RTL8153) */ { USB_DEVICE_AND_INTERFACE_INFO(LENOVO_VENDOR_ID, 0xa359, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* Aquantia AQtion USB to 5GbE Controller (based on AQC111U) */ { USB_DEVICE_AND_INTERFACE_INFO(AQUANTIA_VENDOR_ID, 0xc101, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* ASIX USB 3.1 Gen1 to 5G Multi-Gigabit Ethernet Adapter(based on AQC111U) */ { USB_DEVICE_AND_INTERFACE_INFO(ASIX_VENDOR_ID, 0x2790, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* ASIX USB 3.1 Gen1 to 2.5G Multi-Gigabit Ethernet Adapter(based on AQC112U) */ { USB_DEVICE_AND_INTERFACE_INFO(ASIX_VENDOR_ID, 0x2791, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, 
USB_CDC_PROTO_NONE), .driver_info = 0, }, /* USB-C 3.1 to 5GBASE-T Ethernet Adapter (based on AQC111U) */ { USB_DEVICE_AND_INTERFACE_INFO(0x20f4, 0xe05a, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* QNAP QNA-UC5G1T USB to 5GbE Adapter (based on AQC111U) */ { USB_DEVICE_AND_INTERFACE_INFO(0x1c04, 0x0015, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = 0, }, /* WHITELIST!!! * * CDC Ether uses two interfaces, not necessarily consecutive. * We match the main interface, ignoring the optional device * class so we could handle devices that aren't exclusively * CDC ether. * * NOTE: this match must come AFTER entries blacklisting devices * because of bugs/quirks in a given product (like Zaurus, above). */ { /* ZTE (Vodafone) K3805-Z */ USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1003, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* ZTE (Vodafone) K3806-Z */ USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1015, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* ZTE (Vodafone) K4510-Z */ USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1173, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* ZTE (Vodafone) K3770-Z */ USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1177, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* ZTE (Vodafone) K3772-Z */ USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1181, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* Telit modules */ USB_VENDOR_AND_INTERFACE_INFO(0x1bc7, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (kernel_ulong_t) &wwan_info, }, { /* Dell DW5580 modules */ USB_DEVICE_AND_INTERFACE_INFO(DELL_VENDOR_ID, 0x81ba, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (kernel_ulong_t)&wwan_info, }, { /* Huawei ME906 and ME909 */ USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0x15c1, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* ZTE modules */ USB_VENDOR_AND_INTERFACE_INFO(ZTE_VENDOR_ID, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&zte_cdc_info, }, { /* U-blox TOBY-L2 */ USB_DEVICE_AND_INTERFACE_INFO(UBLOX_VENDOR_ID, 0x1143, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* U-blox SARA-U2 */ USB_DEVICE_AND_INTERFACE_INFO(UBLOX_VENDOR_ID, 0x1104, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* U-blox LARA-R6 01B */ USB_DEVICE_AND_INTERFACE_INFO(UBLOX_VENDOR_ID, 0x1313, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* U-blox LARA-L6 */ USB_DEVICE_AND_INTERFACE_INFO(UBLOX_VENDOR_ID, 0x1343, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* Cinterion PLS8 modem by GEMALTO */ USB_DEVICE_AND_INTERFACE_INFO(0x1e2d, 0x0061, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* Cinterion AHS3 modem by GEMALTO */ USB_DEVICE_AND_INTERFACE_INFO(0x1e2d, 0x0055, USB_CLASS_COMM, 
USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* Cinterion PLS62-W modem by GEMALTO/THALES */ USB_DEVICE_AND_INTERFACE_INFO(0x1e2d, 0x005b, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* Cinterion PLS83/PLS63 modem by GEMALTO/THALES */ USB_DEVICE_AND_INTERFACE_INFO(0x1e2d, 0x0069, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long) &cdc_info, }, { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { /* Various Huawei modems with a network port like the UMG1831 */ USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, 255), .driver_info = (unsigned long)&wwan_info, }, { }, /* END */ }; MODULE_DEVICE_TABLE(usb, products); static struct usb_driver cdc_driver = { .name = "cdc_ether", .id_table = products, .probe = usbnet_probe, .disconnect = usbnet_disconnect, .suspend = usbnet_suspend, .resume = usbnet_resume, .reset_resume = usbnet_resume, .supports_autosuspend = 1, .disable_hub_initiated_lpm = 1, }; module_usb_driver(cdc_driver); MODULE_AUTHOR("David Brownell"); MODULE_DESCRIPTION("USB CDC Ethernet devices"); MODULE_LICENSE("GPL"); |
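/*
 * Illustrative only, not part of the driver above: a hypothetical
 * whitelist entry for a spec-conforming CDC Ethernet function would
 * follow the same pattern as the generic entries in products[].  The
 * 0x1234/0x5678 vendor/product IDs below are placeholders, not a real
 * device; the snippet is kept preprocessed-out on purpose.
 */
#if 0
	{	/* ExampleCorp USB Ethernet (placeholder IDs) */
		USB_DEVICE_AND_INTERFACE_INFO(0x1234, 0x5678,
				USB_CLASS_COMM,
				USB_CDC_SUBCLASS_ETHERNET,
				USB_CDC_PROTO_NONE),
		.driver_info = (unsigned long)&cdc_info,
	},
#endif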
// SPDX-License-Identifier: GPL-2.0+ /* * Meta data file for NILFS * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * * Written by Ryusuke Konishi.
*/ #include <linux/buffer_head.h> #include <linux/mpage.h> #include <linux/mm.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/swap.h> #include <linux/slab.h> #include "nilfs.h" #include "btnode.h" #include "segment.h" #include "page.h" #include "mdt.h" #include "alloc.h" /* nilfs_palloc_destroy_cache() */ #include <trace/events/nilfs2.h> #define NILFS_MDT_MAX_RA_BLOCKS (16 - 1) static int nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block, struct buffer_head *bh, void (*init_block)(struct inode *, struct buffer_head *, void *)) { struct nilfs_inode_info *ii = NILFS_I(inode); struct folio *folio = bh->b_folio; void *from; int ret; /* Caller exclude read accesses using page lock */ /* set_buffer_new(bh); */ bh->b_blocknr = 0; ret = nilfs_bmap_insert(ii->i_bmap, block, (unsigned long)bh); if (unlikely(ret)) return ret; set_buffer_mapped(bh); /* Initialize block (block size > PAGE_SIZE not yet supported) */ from = kmap_local_folio(folio, offset_in_folio(folio, bh->b_data)); memset(from, 0, bh->b_size); if (init_block) init_block(inode, bh, from); kunmap_local(from); flush_dcache_folio(folio); set_buffer_uptodate(bh); mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(inode); trace_nilfs2_mdt_insert_new_block(inode, inode->i_ino, block); return 0; } static int nilfs_mdt_create_block(struct inode *inode, unsigned long block, struct buffer_head **out_bh, void (*init_block)(struct inode *, struct buffer_head *, void *)) { struct super_block *sb = inode->i_sb; struct nilfs_transaction_info ti; struct buffer_head *bh; int err; nilfs_transaction_begin(sb, &ti, 0); err = -ENOMEM; bh = nilfs_grab_buffer(inode, inode->i_mapping, block, 0); if (unlikely(!bh)) goto failed_unlock; err = -EEXIST; if (buffer_uptodate(bh)) goto failed_bh; wait_on_buffer(bh); if (buffer_uptodate(bh)) goto failed_bh; err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); if (likely(!err)) { get_bh(bh); *out_bh = bh; } failed_bh: folio_unlock(bh->b_folio); folio_put(bh->b_folio); brelse(bh); failed_unlock: if (likely(!err)) err = nilfs_transaction_commit(sb); else nilfs_transaction_abort(sb); return err; } static int nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, blk_opf_t opf, struct buffer_head **out_bh) { struct buffer_head *bh; __u64 blknum = 0; int ret = -ENOMEM; bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); if (unlikely(!bh)) goto failed; ret = -EEXIST; /* internal code */ if (buffer_uptodate(bh)) goto out; if (opf & REQ_RAHEAD) { if (!trylock_buffer(bh)) { ret = -EBUSY; goto failed_bh; } } else /* opf == REQ_OP_READ */ lock_buffer(bh); if (buffer_uptodate(bh)) { unlock_buffer(bh); goto out; } ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, &blknum); if (unlikely(ret)) { unlock_buffer(bh); goto failed_bh; } map_bh(bh, inode->i_sb, (sector_t)blknum); bh->b_end_io = end_buffer_read_sync; get_bh(bh); submit_bh(opf, bh); ret = 0; trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff, opf & REQ_OP_MASK); out: get_bh(bh); *out_bh = bh; failed_bh: folio_unlock(bh->b_folio); folio_put(bh->b_folio); brelse(bh); failed: return ret; } static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, int readahead, struct buffer_head **out_bh) { struct buffer_head *first_bh, *bh; unsigned long blkoff; int i, nr_ra_blocks = NILFS_MDT_MAX_RA_BLOCKS; int err; err = nilfs_mdt_submit_block(inode, block, REQ_OP_READ, &first_bh); if (err == -EEXIST) /* internal code */ goto out; if (unlikely(err)) goto failed; if (readahead) { blkoff = 
block + 1; for (i = 0; i < nr_ra_blocks; i++, blkoff++) { err = nilfs_mdt_submit_block(inode, blkoff, REQ_OP_READ | REQ_RAHEAD, &bh); if (likely(!err || err == -EEXIST)) brelse(bh); else if (err != -EBUSY) break; /* abort readahead if bmap lookup failed */ if (!buffer_locked(first_bh)) goto out_no_wait; } } wait_on_buffer(first_bh); out_no_wait: err = -EIO; if (!buffer_uptodate(first_bh)) { nilfs_err(inode->i_sb, "I/O error reading meta-data file (ino=%llu, block-offset=%lu)", inode->i_ino, block); goto failed_bh; } out: *out_bh = first_bh; return 0; failed_bh: brelse(first_bh); failed: return err; } /** * nilfs_mdt_get_block - read or create a buffer on meta data file. * @inode: inode of the meta data file * @blkoff: block offset * @create: create flag * @init_block: initializer used for newly allocated block * @out_bh: output of a pointer to the buffer_head * * nilfs_mdt_get_block() looks up the specified buffer and tries to create * a new buffer if @create is not zero. If (and only if) this function * succeeds, it stores a pointer to the retrieved buffer head in the location * pointed to by @out_bh. * * The retrieved buffer may be either an existing one or a newly allocated one. * For a newly created buffer, if the callback function argument @init_block * is non-NULL, the callback will be called with the buffer locked to format * the block. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOENT - The specified block does not exist (hole block). * * %-ENOMEM - Insufficient memory available. * * %-EROFS - Read only filesystem (for create mode). */ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create, void (*init_block)(struct inode *, struct buffer_head *, void *), struct buffer_head **out_bh) { int ret; /* Should be rewritten with merging nilfs_mdt_read_block() */ retry: ret = nilfs_mdt_read_block(inode, blkoff, !create, out_bh); if (!create || ret != -ENOENT) return ret; ret = nilfs_mdt_create_block(inode, blkoff, out_bh, init_block); if (unlikely(ret == -EEXIST)) { /* create = 0; */ /* limit read-create loop retries */ goto retry; } return ret; } /** * nilfs_mdt_find_block - find and get a buffer on meta data file. * @inode: inode of the meta data file * @start: start block offset (inclusive) * @end: end block offset (inclusive) * @blkoff: block offset * @out_bh: place to store a pointer to buffer_head struct * * nilfs_mdt_find_block() looks up an existing block in range of * [@start, @end] and stores pointer to a buffer head of the block to * @out_bh, and block offset to @blkoff, respectively. @out_bh and * @blkoff are substituted only when zero is returned. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOENT - No block was found in the range. * * %-ENOMEM - Insufficient memory available. 
*/ int nilfs_mdt_find_block(struct inode *inode, unsigned long start, unsigned long end, unsigned long *blkoff, struct buffer_head **out_bh) { __u64 next; int ret; if (unlikely(start > end)) return -ENOENT; ret = nilfs_mdt_read_block(inode, start, true, out_bh); if (!ret) { *blkoff = start; goto out; } if (unlikely(ret != -ENOENT || start == ULONG_MAX)) goto out; ret = nilfs_bmap_seek_key(NILFS_I(inode)->i_bmap, start + 1, &next); if (!ret) { if (next <= end) { ret = nilfs_mdt_read_block(inode, next, true, out_bh); if (!ret) *blkoff = next; } else { ret = -ENOENT; } } out: return ret; } /** * nilfs_mdt_delete_block - make a hole on the meta data file. * @inode: inode of the meta data file * @block: block offset * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOENT - Non-existent block. * * %-ENOMEM - Insufficient memory available. */ int nilfs_mdt_delete_block(struct inode *inode, unsigned long block) { struct nilfs_inode_info *ii = NILFS_I(inode); int err; err = nilfs_bmap_delete(ii->i_bmap, block); if (!err || err == -ENOENT) { nilfs_mdt_mark_dirty(inode); nilfs_mdt_forget_block(inode, block); } return err; } /** * nilfs_mdt_forget_block - discard dirty state and try to remove the page * @inode: inode of the meta data file * @block: block offset * * nilfs_mdt_forget_block() clears a dirty flag of the specified buffer, and * tries to release the page including the buffer from a page cache. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EBUSY - Page has an active buffer. * * %-ENOENT - Page cache has no page addressed by the offset. */ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block) { pgoff_t index = block >> (PAGE_SHIFT - inode->i_blkbits); struct folio *folio; struct buffer_head *bh; int ret = 0; int still_dirty; folio = filemap_lock_folio(inode->i_mapping, index); if (IS_ERR(folio)) return -ENOENT; folio_wait_writeback(folio); bh = folio_buffers(folio); if (bh) { unsigned long first_block = index << (PAGE_SHIFT - inode->i_blkbits); bh = get_nth_bh(bh, block - first_block); nilfs_forget_buffer(bh); } still_dirty = folio_test_dirty(folio); folio_unlock(folio); folio_put(folio); if (still_dirty || invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0) ret = -EBUSY; return ret; } int nilfs_mdt_fetch_dirty(struct inode *inode) { struct nilfs_inode_info *ii = NILFS_I(inode); if (nilfs_bmap_test_and_clear_dirty(ii->i_bmap)) { set_bit(NILFS_I_DIRTY, &ii->i_state); return 1; } return test_bit(NILFS_I_DIRTY, &ii->i_state); } static int nilfs_mdt_write_folio(struct folio *folio, struct writeback_control *wbc) { struct inode *inode = folio->mapping->host; struct super_block *sb; int err = 0; if (inode && sb_rdonly(inode->i_sb)) { /* * It means that filesystem was remounted in read-only * mode because of error or metadata corruption. But we * have dirty folios that try to be flushed in background. * So, here we simply discard this dirty folio. 
*/ nilfs_clear_folio_dirty(folio); folio_unlock(folio); return -EROFS; } folio_redirty_for_writepage(wbc, folio); folio_unlock(folio); if (!inode) return 0; sb = inode->i_sb; if (wbc->sync_mode == WB_SYNC_ALL) err = nilfs_construct_segment(sb); return err; } static int nilfs_mdt_writeback(struct address_space *mapping, struct writeback_control *wbc) { struct folio *folio = NULL; int error; while ((folio = writeback_iter(mapping, wbc, folio, &error))) error = nilfs_mdt_write_folio(folio, wbc); return error; } static const struct address_space_operations def_mdt_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .writepages = nilfs_mdt_writeback, .migrate_folio = buffer_migrate_folio_norefs, }; static const struct inode_operations def_mdt_iops; static const struct file_operations def_mdt_fops; int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz) { struct nilfs_mdt_info *mi; mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS); if (!mi) return -ENOMEM; init_rwsem(&mi->mi_sem); inode->i_private = mi; inode->i_mode = S_IFREG; mapping_set_gfp_mask(inode->i_mapping, gfp_mask); inode->i_op = &def_mdt_iops; inode->i_fop = &def_mdt_fops; inode->i_mapping->a_ops = &def_mdt_aops; return 0; } /** * nilfs_mdt_clear - do cleanup for the metadata file * @inode: inode of the metadata file */ void nilfs_mdt_clear(struct inode *inode) { struct nilfs_mdt_info *mdi = NILFS_MDT(inode); struct nilfs_shadow_map *shadow = mdi->mi_shadow; if (mdi->mi_palloc_cache) nilfs_palloc_destroy_cache(inode); if (shadow) { struct inode *s_inode = shadow->inode; shadow->inode = NULL; iput(s_inode); mdi->mi_shadow = NULL; } } /** * nilfs_mdt_destroy - release resources used by the metadata file * @inode: inode of the metadata file */ void nilfs_mdt_destroy(struct inode *inode) { struct nilfs_mdt_info *mdi = NILFS_MDT(inode); kfree(mdi->mi_bgl); /* kfree(NULL) is safe */ kfree(mdi); } void nilfs_mdt_set_entry_size(struct inode *inode, unsigned int entry_size, unsigned int header_size) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); mi->mi_entry_size = entry_size; mi->mi_entries_per_block = i_blocksize(inode) / entry_size; mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size); } /** * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file * @inode: inode of the metadata file * @shadow: shadow mapping * * Return: 0 on success, or a negative error code on failure. */ int nilfs_mdt_setup_shadow_map(struct inode *inode, struct nilfs_shadow_map *shadow) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); struct inode *s_inode; INIT_LIST_HEAD(&shadow->frozen_buffers); s_inode = nilfs_iget_for_shadow(inode); if (IS_ERR(s_inode)) return PTR_ERR(s_inode); shadow->inode = s_inode; mi->mi_shadow = shadow; return 0; } /** * nilfs_mdt_save_to_shadow_map - copy bmap and dirty pages to shadow map * @inode: inode of the metadata file * * Return: 0 on success, or a negative error code on failure. 
*/ int nilfs_mdt_save_to_shadow_map(struct inode *inode) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); struct nilfs_inode_info *ii = NILFS_I(inode); struct nilfs_shadow_map *shadow = mi->mi_shadow; struct inode *s_inode = shadow->inode; int ret; ret = nilfs_copy_dirty_pages(s_inode->i_mapping, inode->i_mapping); if (ret) goto out; ret = nilfs_copy_dirty_pages(NILFS_I(s_inode)->i_assoc_inode->i_mapping, ii->i_assoc_inode->i_mapping); if (ret) goto out; nilfs_bmap_save(ii->i_bmap, &shadow->bmap_store); out: return ret; } int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh) { struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow; struct buffer_head *bh_frozen; struct folio *folio; int blkbits = inode->i_blkbits; folio = filemap_grab_folio(shadow->inode->i_mapping, bh->b_folio->index); if (IS_ERR(folio)) return PTR_ERR(folio); bh_frozen = folio_buffers(folio); if (!bh_frozen) bh_frozen = create_empty_buffers(folio, 1 << blkbits, 0); bh_frozen = get_nth_bh(bh_frozen, offset_in_folio(folio, bh->b_data) >> blkbits); if (!buffer_uptodate(bh_frozen)) nilfs_copy_buffer(bh_frozen, bh); if (list_empty(&bh_frozen->b_assoc_buffers)) { list_add_tail(&bh_frozen->b_assoc_buffers, &shadow->frozen_buffers); set_buffer_nilfs_redirected(bh); } else { brelse(bh_frozen); /* already frozen */ } folio_unlock(folio); folio_put(folio); return 0; } struct buffer_head * nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh) { struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow; struct buffer_head *bh_frozen = NULL; struct folio *folio; int n; folio = filemap_lock_folio(shadow->inode->i_mapping, bh->b_folio->index); if (!IS_ERR(folio)) { bh_frozen = folio_buffers(folio); if (bh_frozen) { n = offset_in_folio(folio, bh->b_data) >> inode->i_blkbits; bh_frozen = get_nth_bh(bh_frozen, n); } folio_unlock(folio); folio_put(folio); } return bh_frozen; } static void nilfs_release_frozen_buffers(struct nilfs_shadow_map *shadow) { struct list_head *head = &shadow->frozen_buffers; struct buffer_head *bh; while (!list_empty(head)) { bh = list_first_entry(head, struct buffer_head, b_assoc_buffers); list_del_init(&bh->b_assoc_buffers); brelse(bh); /* drop ref-count to make it releasable */ } } /** * nilfs_mdt_restore_from_shadow_map - restore dirty pages and bmap state * @inode: inode of the metadata file */ void nilfs_mdt_restore_from_shadow_map(struct inode *inode) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); struct nilfs_inode_info *ii = NILFS_I(inode); struct nilfs_shadow_map *shadow = mi->mi_shadow; down_write(&mi->mi_sem); if (mi->mi_palloc_cache) nilfs_palloc_clear_cache(inode); nilfs_clear_dirty_pages(inode->i_mapping); nilfs_copy_back_pages(inode->i_mapping, shadow->inode->i_mapping); nilfs_clear_dirty_pages(ii->i_assoc_inode->i_mapping); nilfs_copy_back_pages(ii->i_assoc_inode->i_mapping, NILFS_I(shadow->inode)->i_assoc_inode->i_mapping); nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store); up_write(&mi->mi_sem); } /** * nilfs_mdt_clear_shadow_map - truncate pages in shadow map caches * @inode: inode of the metadata file */ void nilfs_mdt_clear_shadow_map(struct inode *inode) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); struct nilfs_shadow_map *shadow = mi->mi_shadow; struct inode *shadow_btnc_inode = NILFS_I(shadow->inode)->i_assoc_inode; down_write(&mi->mi_sem); nilfs_release_frozen_buffers(shadow); truncate_inode_pages(shadow->inode->i_mapping, 0); truncate_inode_pages(shadow_btnc_inode->i_mapping, 0); up_write(&mi->mi_sem); } |
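/*
 * Illustrative only, not part of NILFS: a sketch of how a metadata-file
 * client could use nilfs_mdt_get_block() with an init_block callback,
 * based on the interfaces defined above.  nilfs_example_init_block() and
 * nilfs_example_read_or_create() are hypothetical names; the snippet is
 * kept preprocessed-out on purpose.
 */
#if 0
/* Called with the new block mapped at @kaddr and already zero-filled by
 * nilfs_mdt_insert_new_block(); only stamp a (hypothetical) header. */
static void nilfs_example_init_block(struct inode *inode,
				     struct buffer_head *bh, void *kaddr)
{
	*(__le64 *)kaddr = cpu_to_le64(0x6578616d706c65ULL);
}

static int nilfs_example_read_or_create(struct inode *mdt_inode,
					unsigned long blkoff)
{
	struct buffer_head *bh;
	int err;

	/* create != 0: allocate and format the block if it is a hole */
	err = nilfs_mdt_get_block(mdt_inode, blkoff, 1,
				  nilfs_example_init_block, &bh);
	if (err)
		return err;

	/* ... read or update bh->b_data here ... */
	mark_buffer_dirty(bh);
	nilfs_mdt_mark_dirty(mdt_inode);
	brelse(bh);
	return 0;
}
#endif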
// SPDX-License-Identifier: GPL-2.0 /* * kobject.h - generic kernel object infrastructure. * * Copyright (c) 2002-2003 Patrick Mochel * Copyright (c) 2002-2003 Open Source Development Labs * Copyright (c) 2006-2008 Greg Kroah-Hartman <greg@kroah.com> * Copyright (c) 2006-2008 Novell Inc. * * Please read Documentation/core-api/kobject.rst before using the kobject * interface, ESPECIALLY the parts about reference counts and object * destructors. */ #ifndef _KOBJECT_H_ #define _KOBJECT_H_ #include <linux/types.h> #include <linux/list.h> #include <linux/sysfs.h> #include <linux/compiler.h> #include <linux/container_of.h> #include <linux/spinlock.h> #include <linux/kref.h> #include <linux/kobject_ns.h> #include <linux/wait.h> #include <linux/atomic.h> #include <linux/workqueue.h> #include <linux/uidgid.h> #define UEVENT_HELPER_PATH_LEN 256 #define UEVENT_NUM_ENVP 64 /* number of env pointers */ #define UEVENT_BUFFER_SIZE 2048 /* buffer for the variables */ #ifdef CONFIG_UEVENT_HELPER /* path to the userspace helper executed on an event */ extern char uevent_helper[]; #endif /* counter to tag the uevent, read only except for the kobject core */ extern atomic64_t uevent_seqnum; /* * The actions here must match the index to the string array * in lib/kobject_uevent.c * * Do not add new actions here without checking with the driver-core * maintainers. Action strings are not meant to express subsystem * or device specific properties. In most cases you want to send a * kobject_uevent_env(kobj, KOBJ_CHANGE, env) with additional event * specific variables added to the event environment.
*/ enum kobject_action { KOBJ_ADD, KOBJ_REMOVE, KOBJ_CHANGE, KOBJ_MOVE, KOBJ_ONLINE, KOBJ_OFFLINE, KOBJ_BIND, KOBJ_UNBIND, }; struct kobject { const char *name; struct list_head entry; struct kobject *parent; struct kset *kset; const struct kobj_type *ktype; struct kernfs_node *sd; /* sysfs directory entry */ struct kref kref; unsigned int state_initialized:1; unsigned int state_in_sysfs:1; unsigned int state_add_uevent_sent:1; unsigned int state_remove_uevent_sent:1; unsigned int uevent_suppress:1; #ifdef CONFIG_DEBUG_KOBJECT_RELEASE struct delayed_work release; #endif }; __printf(2, 3) int kobject_set_name(struct kobject *kobj, const char *name, ...); __printf(2, 0) int kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list vargs); static inline const char *kobject_name(const struct kobject *kobj) { return kobj->name; } void kobject_init(struct kobject *kobj, const struct kobj_type *ktype); __printf(3, 4) __must_check int kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...); __printf(4, 5) __must_check int kobject_init_and_add(struct kobject *kobj, const struct kobj_type *ktype, struct kobject *parent, const char *fmt, ...); void kobject_del(struct kobject *kobj); struct kobject * __must_check kobject_create_and_add(const char *name, struct kobject *parent); int __must_check kobject_rename(struct kobject *, const char *new_name); int __must_check kobject_move(struct kobject *, struct kobject *); struct kobject *kobject_get(struct kobject *kobj); struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj); void kobject_put(struct kobject *kobj); const struct ns_common *kobject_namespace(const struct kobject *kobj); void kobject_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid); char *kobject_get_path(const struct kobject *kobj, gfp_t flag); struct kobj_type { void (*release)(struct kobject *kobj); const struct sysfs_ops *sysfs_ops; const struct attribute_group **default_groups; const struct kobj_ns_type_operations *(*child_ns_type)(const struct kobject *kobj); const struct ns_common *(*namespace)(const struct kobject *kobj); void (*get_ownership)(const struct kobject *kobj, kuid_t *uid, kgid_t *gid); }; struct kobj_uevent_env { char *argv[3]; char *envp[UEVENT_NUM_ENVP]; int envp_idx; char buf[UEVENT_BUFFER_SIZE]; int buflen; }; struct kset_uevent_ops { int (* const filter)(const struct kobject *kobj); const char *(* const name)(const struct kobject *kobj); int (* const uevent)(const struct kobject *kobj, struct kobj_uevent_env *env); }; struct kobj_attribute { struct attribute attr; ssize_t (*show)(struct kobject *kobj, struct kobj_attribute *attr, char *buf); ssize_t (*store)(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count); }; extern const struct sysfs_ops kobj_sysfs_ops; struct sock; /** * struct kset - a set of kobjects of a specific type, belonging to a specific subsystem. * * A kset defines a group of kobjects. They can be individually * different "types" but overall these kobjects all want to be grouped * together and operated on in the same manner. ksets are used to * define the attribute callbacks and other common events that happen to * a kobject. * * @list: the list of all kobjects for this kset * @list_lock: a lock for iterating over the kobjects * @kobj: the embedded kobject for this kset (recursion, isn't it fun...) * @uevent_ops: the set of uevent operations for this kset. 
These are * called whenever a kobject has something happen to it so that the kset * can add new environment variables, or filter out the uevents if so * desired. */ struct kset { struct list_head list; spinlock_t list_lock; struct kobject kobj; const struct kset_uevent_ops *uevent_ops; } __randomize_layout; void kset_init(struct kset *kset); int __must_check kset_register(struct kset *kset); void kset_unregister(struct kset *kset); struct kset * __must_check kset_create_and_add(const char *name, const struct kset_uevent_ops *u, struct kobject *parent_kobj); static inline struct kset *to_kset(struct kobject *kobj) { return kobj ? container_of(kobj, struct kset, kobj) : NULL; } static inline struct kset *kset_get(struct kset *k) { return k ? to_kset(kobject_get(&k->kobj)) : NULL; } static inline void kset_put(struct kset *k) { kobject_put(&k->kobj); } static inline const struct kobj_type *get_ktype(const struct kobject *kobj) { return kobj->ktype; } struct kobject *kset_find_obj(struct kset *, const char *); /* The global /sys/kernel/ kobject for people to chain off of */ extern struct kobject *kernel_kobj; /* The global /sys/kernel/mm/ kobject for people to chain off of */ extern struct kobject *mm_kobj; /* The global /sys/hypervisor/ kobject for people to chain off of */ extern struct kobject *hypervisor_kobj; /* The global /sys/power/ kobject for people to chain off of */ extern struct kobject *power_kobj; /* The global /sys/firmware/ kobject for people to chain off of */ extern struct kobject *firmware_kobj; int kobject_uevent(struct kobject *kobj, enum kobject_action action); int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, char *envp[]); int kobject_synth_uevent(struct kobject *kobj, const char *buf, size_t count); __printf(2, 3) int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...); #endif /* _KOBJECT_H_ */ |
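Editor's sketch (not part of kobject.h): a minimal user of the interface declared above, following the pattern described in Documentation/core-api/kobject.rst. All names here (demo_obj, demo_ktype, "demo") are invented.

#include <linux/kobject.h>
#include <linux/slab.h>

struct demo_obj {
        struct kobject kobj;
        int value;
};

static void demo_release(struct kobject *kobj)
{
        /* Runs when the last reference is dropped via kobject_put(). */
        kfree(container_of(kobj, struct demo_obj, kobj));
}

static const struct kobj_type demo_ktype = {
        .release   = demo_release,
        .sysfs_ops = &kobj_sysfs_ops,
};

static struct demo_obj *demo_create(void)
{
        struct demo_obj *d = kzalloc(sizeof(*d), GFP_KERNEL);

        if (!d)
                return NULL;
        /* Initializes the refcount to 1 and creates /sys/kernel/demo. */
        if (kobject_init_and_add(&d->kobj, &demo_ktype, kernel_kobj, "demo")) {
                /* The kobject is already initialized; put it to free d. */
                kobject_put(&d->kobj);
                return NULL;
        }
        return d;
}

static void demo_destroy(struct demo_obj *d)
{
        kobject_del(&d->kobj);  /* remove from sysfs */
        kobject_put(&d->kobj);  /* drop the initial reference */
}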
// SPDX-License-Identifier: GPL-2.0-or-later /* xfrm4_protocol.c - Generic xfrm protocol multiplexer. * * Copyright (C) 2013 secunet Security Networks AG * * Author: * Steffen Klassert <steffen.klassert@secunet.com> * * Based on: * net/ipv4/tunnel4.c */ #include <linux/init.h> #include <linux/mutex.h> #include <linux/skbuff.h> #include <net/icmp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/xfrm.h> static struct xfrm4_protocol __rcu *esp4_handlers __read_mostly; static struct xfrm4_protocol __rcu *ah4_handlers __read_mostly; static struct xfrm4_protocol __rcu *ipcomp4_handlers __read_mostly; static DEFINE_MUTEX(xfrm4_protocol_mutex); static inline struct xfrm4_protocol __rcu **proto_handlers(u8 protocol) { switch (protocol) { case IPPROTO_ESP: return &esp4_handlers; case IPPROTO_AH: return &ah4_handlers; case IPPROTO_COMP: return &ipcomp4_handlers; } return NULL; } #define for_each_protocol_rcu(head, handler) \ for (handler = rcu_dereference(head); \ handler != NULL; \ handler = rcu_dereference(handler->next)) \ static int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err) { int ret; struct xfrm4_protocol *handler; struct xfrm4_protocol __rcu **head = proto_handlers(protocol); if (!head) return 0; for_each_protocol_rcu(*head, handler) if ((ret = handler->cb_handler(skb, err)) <= 0) return ret; return 0; } int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { int ret; struct xfrm4_protocol *handler; struct xfrm4_protocol __rcu **head = proto_handlers(nexthdr); XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; XFRM_SPI_SKB_CB(skb)->family = AF_INET; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); if (!head) goto out; if (!skb_dst(skb)) { const struct iphdr *iph = ip_hdr(skb); if (ip_route_input_noref(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), skb->dev)) goto drop; } for_each_protocol_rcu(*head, handler) if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL) return ret; out: icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } EXPORT_SYMBOL(xfrm4_rcv_encap); static int xfrm4_esp_rcv(struct sk_buff *skb) { int ret; struct xfrm4_protocol *handler; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; for_each_protocol_rcu(esp4_handlers, handler) if ((ret = handler->handler(skb)) != -EINVAL) return ret; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); kfree_skb(skb);
return 0; } static int xfrm4_esp_err(struct sk_buff *skb, u32 info) { struct xfrm4_protocol *handler; for_each_protocol_rcu(esp4_handlers, handler) if (!handler->err_handler(skb, info)) return 0; return -ENOENT; } static int xfrm4_ah_rcv(struct sk_buff *skb) { int ret; struct xfrm4_protocol *handler; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; for_each_protocol_rcu(ah4_handlers, handler) if ((ret = handler->handler(skb)) != -EINVAL) return ret; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); kfree_skb(skb); return 0; } static int xfrm4_ah_err(struct sk_buff *skb, u32 info) { struct xfrm4_protocol *handler; for_each_protocol_rcu(ah4_handlers, handler) if (!handler->err_handler(skb, info)) return 0; return -ENOENT; } static int xfrm4_ipcomp_rcv(struct sk_buff *skb) { int ret; struct xfrm4_protocol *handler; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; for_each_protocol_rcu(ipcomp4_handlers, handler) if ((ret = handler->handler(skb)) != -EINVAL) return ret; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); kfree_skb(skb); return 0; } static int xfrm4_ipcomp_err(struct sk_buff *skb, u32 info) { struct xfrm4_protocol *handler; for_each_protocol_rcu(ipcomp4_handlers, handler) if (!handler->err_handler(skb, info)) return 0; return -ENOENT; } static const struct net_protocol esp4_protocol = { .handler = xfrm4_esp_rcv, .err_handler = xfrm4_esp_err, .no_policy = 1, }; static const struct net_protocol ah4_protocol = { .handler = xfrm4_ah_rcv, .err_handler = xfrm4_ah_err, .no_policy = 1, }; static const struct net_protocol ipcomp4_protocol = { .handler = xfrm4_ipcomp_rcv, .err_handler = xfrm4_ipcomp_err, .no_policy = 1, }; static const struct xfrm_input_afinfo xfrm4_input_afinfo = { .family = AF_INET, .callback = xfrm4_rcv_cb, }; static inline const struct net_protocol *netproto(unsigned char protocol) { switch (protocol) { case IPPROTO_ESP: return &esp4_protocol; case IPPROTO_AH: return &ah4_protocol; case IPPROTO_COMP: return &ipcomp4_protocol; } return NULL; } int xfrm4_protocol_register(struct xfrm4_protocol *handler, unsigned char protocol) { struct xfrm4_protocol __rcu **pprev; struct xfrm4_protocol *t; bool add_netproto = false; int ret = -EEXIST; int priority = handler->priority; if (!proto_handlers(protocol) || !netproto(protocol)) return -EINVAL; mutex_lock(&xfrm4_protocol_mutex); if (!rcu_dereference_protected(*proto_handlers(protocol), lockdep_is_held(&xfrm4_protocol_mutex))) add_netproto = true; for (pprev = proto_handlers(protocol); (t = rcu_dereference_protected(*pprev, lockdep_is_held(&xfrm4_protocol_mutex))) != NULL; pprev = &t->next) { if (t->priority < priority) break; if (t->priority == priority) goto err; } handler->next = *pprev; rcu_assign_pointer(*pprev, handler); ret = 0; err: mutex_unlock(&xfrm4_protocol_mutex); if (add_netproto) { if (inet_add_protocol(netproto(protocol), protocol)) { pr_err("%s: can't add protocol\n", __func__); ret = -EAGAIN; } } return ret; } EXPORT_SYMBOL(xfrm4_protocol_register); int xfrm4_protocol_deregister(struct xfrm4_protocol *handler, unsigned char protocol) { struct xfrm4_protocol __rcu **pprev; struct xfrm4_protocol *t; int ret = -ENOENT; if (!proto_handlers(protocol) || !netproto(protocol)) return -EINVAL; mutex_lock(&xfrm4_protocol_mutex); for (pprev = proto_handlers(protocol); (t = rcu_dereference_protected(*pprev, lockdep_is_held(&xfrm4_protocol_mutex))) != NULL; pprev = &t->next) { if (t == handler) { *pprev = handler->next; ret = 0; break; } } if (!rcu_dereference_protected(*proto_handlers(protocol), 
lockdep_is_held(&xfrm4_protocol_mutex))) { if (inet_del_protocol(netproto(protocol), protocol) < 0) { pr_err("%s: can't remove protocol\n", __func__); ret = -EAGAIN; } } mutex_unlock(&xfrm4_protocol_mutex); synchronize_net(); return ret; } EXPORT_SYMBOL(xfrm4_protocol_deregister); void __init xfrm4_protocol_init(void) { xfrm_input_register_afinfo(&xfrm4_input_afinfo); }
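Editor's sketch (not part of this file): how a protocol implementation might register with the multiplexer above. Every my_esp_* name is invented and the handler bodies are stubs; in-tree users such as net/ipv4/esp4.c install real ESP processing instead. Returning -EINVAL from handler/input_handler tells the dispatch loops above to try the next registered handler.

#include <linux/init.h>
#include <net/xfrm.h>

static int my_esp_rcv(struct sk_buff *skb)
{
        return -EINVAL;                 /* decline; try the next handler */
}

static int my_esp_input(struct sk_buff *skb, int nexthdr, __be32 spi,
                        int encap_type)
{
        return -EINVAL;                 /* decline; try the next handler */
}

static int my_esp_rcv_cb(struct sk_buff *skb, int err)
{
        return 0;                       /* <= 0 stops the callback walk */
}

static int my_esp_err(struct sk_buff *skb, u32 info)
{
        return -ENOENT;                 /* non-zero: not handled here */
}

static struct xfrm4_protocol my_esp_protocol = {
        .handler       = my_esp_rcv,
        .input_handler = my_esp_input,
        .cb_handler    = my_esp_rcv_cb,
        .err_handler   = my_esp_err,
        .priority      = 0,     /* higher priority runs first; equal values are rejected */
};

static int __init my_esp_init(void)
{
        /* First registration for IPPROTO_ESP also adds the inet protocol hook. */
        return xfrm4_protocol_register(&my_esp_protocol, IPPROTO_ESP);
}

static void __exit my_esp_exit(void)
{
        if (xfrm4_protocol_deregister(&my_esp_protocol, IPPROTO_ESP))
                pr_warn("my_esp: deregistration failed\n");
}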
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * * Swap reorganised 29.12.95, Stephen Tweedie. * kswapd added: 7.1.96 sct * Removed kswapd_ctl limits, and swap out as many pages as needed * to bring the system back to freepages.high: 2.4.97, Rik van Riel. * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). * Multiqueue VM started 5.8.00, Rik van Riel. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/module.h> #include <linux/gfp.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/pagemap.h> #include <linux/init.h> #include <linux/highmem.h> #include <linux/vmpressure.h> #include <linux/vmstat.h> #include <linux/file.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> /* for buffer_heads_over_limit */ #include <linux/mm_inline.h> #include <linux/backing-dev.h> #include <linux/rmap.h> #include <linux/topology.h> #include <linux/cpu.h> #include <linux/cpuset.h> #include <linux/compaction.h> #include <linux/notifier.h> #include <linux/delay.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/memcontrol.h> #include <linux/migrate.h> #include <linux/delayacct.h> #include <linux/sysctl.h> #include <linux/memory-tiers.h> #include <linux/oom.h> #include <linux/folio_batch.h> #include <linux/prefetch.h> #include <linux/printk.h> #include <linux/dax.h> #include <linux/psi.h> #include <linux/pagewalk.h> #include <linux/shmem_fs.h> #include <linux/ctype.h> #include <linux/debugfs.h> #include <linux/khugepaged.h> #include <linux/rculist_nulls.h> #include <linux/random.h> #include <linux/mmu_notifier.h> #include <linux/parser.h> #include <asm/tlbflush.h> #include <asm/div64.h> #include <linux/swapops.h> #include <linux/sched/sysctl.h> #include "internal.h" #include "swap.h" #define CREATE_TRACE_POINTS #include <trace/events/vmscan.h> struct scan_control { /* How many pages shrink_list() should reclaim */ unsigned long nr_to_reclaim; /* * Nodemask of nodes allowed by the caller. If NULL, all nodes * are scanned. */ nodemask_t *nodemask; /* * The memory cgroup that hit its limit and as a result is the * primary target of this reclaim invocation. */ struct mem_cgroup *target_mem_cgroup; /* * Scan pressure balancing between anon and file LRUs */ unsigned long anon_cost; unsigned long file_cost; /* Swappiness value for proactive reclaim. Always use sc_swappiness()! */ int *proactive_swappiness; /* Can active folios be deactivated as part of reclaim? */ #define DEACTIVATE_ANON 1 #define DEACTIVATE_FILE 2 unsigned int may_deactivate:2; unsigned int force_deactivate:1; unsigned int skipped_deactivate:1; /* zone_reclaim_mode, boost reclaim */ unsigned int may_writepage:1; /* zone_reclaim_mode */ unsigned int may_unmap:1; /* zone_reclaim_mode, boost reclaim, cgroup restrictions */ unsigned int may_swap:1; /* Not allow cache_trim_mode to be turned on as part of reclaim? */ unsigned int no_cache_trim_mode:1; /* Has cache_trim_mode failed at least once? */ unsigned int cache_trim_mode_failed:1; /* Proactive reclaim invoked by userspace */ unsigned int proactive:1; /* * Cgroup memory below memory.low is protected as long as we * don't threaten to OOM.
If any cgroup is reclaimed at * reduced force or passed over entirely due to its memory.low * setting (memcg_low_skipped), and nothing is reclaimed as a * result, then go back for one more cycle that reclaims the protected * memory (memcg_low_reclaim) to avert OOM. */ unsigned int memcg_low_reclaim:1; unsigned int memcg_low_skipped:1; /* Shared cgroup tree walk failed, rescan the whole tree */ unsigned int memcg_full_walk:1; unsigned int hibernation_mode:1; /* One of the zones is ready for compaction */ unsigned int compaction_ready:1; /* There is easily reclaimable cold cache in the current node */ unsigned int cache_trim_mode:1; /* The file folios on the current node are dangerously low */ unsigned int file_is_tiny:1; /* Always discard instead of demoting to lower tier memory */ unsigned int no_demotion:1; /* Allocation order */ s8 order; /* Scan (total_size >> priority) pages at once */ s8 priority; /* The highest zone to isolate folios for reclaim from */ s8 reclaim_idx; /* This context's GFP mask */ gfp_t gfp_mask; /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; /* Number of pages freed so far during a call to shrink_zones() */ unsigned long nr_reclaimed; struct { unsigned int dirty; unsigned int unqueued_dirty; unsigned int congested; unsigned int writeback; unsigned int immediate; unsigned int file_taken; unsigned int taken; } nr; /* for recording the reclaimed slab by now */ struct reclaim_state reclaim_state; }; #ifdef ARCH_HAS_PREFETCHW #define prefetchw_prev_lru_folio(_folio, _base, _field) \ do { \ if ((_folio)->lru.prev != _base) { \ struct folio *prev; \ \ prev = lru_to_folio(&(_folio->lru)); \ prefetchw(&prev->_field); \ } \ } while (0) #else #define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0) #endif /* * From 0 .. MAX_SWAPPINESS. Higher means more swappy. */ int vm_swappiness = 60; #ifdef CONFIG_MEMCG /* Returns true for reclaim through cgroup limits or cgroup interfaces. */ static bool cgroup_reclaim(struct scan_control *sc) { return sc->target_mem_cgroup; } /* * Returns true for reclaim on the root cgroup. This is true for direct * allocator reclaim and reclaim through cgroup interfaces on the root cgroup. */ static bool root_reclaim(struct scan_control *sc) { return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup); } /** * writeback_throttling_sane - is the usual dirty throttling mechanism available? * @sc: scan_control in question * * The normal page dirty throttling mechanism in balance_dirty_pages() is * completely broken with the legacy memcg and direct stalling in * shrink_folio_list() is used for throttling instead, which lacks all the * niceties such as fairness, adaptive pausing, bandwidth proportional * allocation and configurability. * * This function tests whether the vmscan currently in progress can assume * that the normal dirty throttling mechanism is operational. 
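 * A rough reading of the function below, as an illustrative summary: global (non-cgroup) reclaim returns true, cgroup v2 reclaim returns true when CONFIG_CGROUP_WRITEBACK is enabled (cgroup_subsys_on_dfl()), and legacy cgroup v1 limit reclaim returns false and relies on the direct stall in shrink_folio_list() instead.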
*/ static bool writeback_throttling_sane(struct scan_control *sc) { if (!cgroup_reclaim(sc)) return true; #ifdef CONFIG_CGROUP_WRITEBACK if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return true; #endif return false; } static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg) { if (sc->proactive && sc->proactive_swappiness) return *sc->proactive_swappiness; return mem_cgroup_swappiness(memcg); } #else static bool cgroup_reclaim(struct scan_control *sc) { return false; } static bool root_reclaim(struct scan_control *sc) { return true; } static bool writeback_throttling_sane(struct scan_control *sc) { return true; } static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg) { return READ_ONCE(vm_swappiness); } #endif static void set_task_reclaim_state(struct task_struct *task, struct reclaim_state *rs) { /* Check for an overwrite */ WARN_ON_ONCE(rs && task->reclaim_state); /* Check for the nulling of an already-nulled member */ WARN_ON_ONCE(!rs && !task->reclaim_state); task->reclaim_state = rs; } /* * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to * scan_control->nr_reclaimed. */ static void flush_reclaim_state(struct scan_control *sc) { /* * Currently, reclaim_state->reclaimed includes three types of pages * freed outside of vmscan: * (1) Slab pages. * (2) Clean file pages from pruned inodes (on highmem systems). * (3) XFS freed buffer pages. * * For all of these cases, we cannot universally link the pages to a * single memcg. For example, a memcg-aware shrinker can free one object * charged to the target memcg, causing an entire page to be freed. * If we count the entire page as reclaimed from the memcg, we end up * overestimating the reclaimed amount (potentially under-reclaiming). * * Only count such pages for global reclaim to prevent under-reclaiming * from the target memcg; preventing unnecessary retries during memcg * charging and false positives from proactive reclaim. * * For uncommon cases where the freed pages were actually mostly * charged to the target memcg, we end up underestimating the reclaimed * amount. This should be fine. The freed pages will be uncharged * anyway, even if they are not counted here properly, and we will be * able to make forward progress in charging (which is usually in a * retry loop). * * We can go one step further, and report the uncharged objcg pages in * memcg reclaim, to make reporting more accurate and reduce * underestimation, but it's probably not worth the complexity for now. */ if (current->reclaim_state && root_reclaim(sc)) { sc->nr_reclaimed += current->reclaim_state->reclaimed; current->reclaim_state->reclaimed = 0; } } static bool can_demote(int nid, struct scan_control *sc, struct mem_cgroup *memcg) { struct pglist_data *pgdat = NODE_DATA(nid); nodemask_t allowed_mask; if (!pgdat || !numa_demotion_enabled) return false; if (sc && sc->no_demotion) return false; node_get_allowed_targets(pgdat, &allowed_mask); if (nodes_empty(allowed_mask)) return false; /* Filter out nodes that are not in cgroup's mems_allowed. */ mem_cgroup_node_filter_allowed(memcg, &allowed_mask); return !nodes_empty(allowed_mask); } static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, int nid, struct scan_control *sc) { if (memcg == NULL) { /* * For non-memcg reclaim, is there * space in any swap device? */ if (get_nr_swap_pages() > 0) return true; } else { /* Is the memcg below its swap limit? */ if (mem_cgroup_get_nr_swap_pages(memcg) > 0) return true; } /* * The page can not be swapped. 
* * Can it be reclaimed from this node via demotion? */ return can_demote(nid, sc, memcg); } /* * This misses isolated folios which are not accounted for to save counters. * As the data only determines if reclaim or compaction continues, it is * not expected that isolated folios will be a dominating factor. */ unsigned long zone_reclaimable_pages(struct zone *zone) { unsigned long nr; nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE); if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL)) nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON); return nr; } /** * lruvec_lru_size - Returns the number of pages on the given LRU list. * @lruvec: lru vector * @lru: lru to use * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list) */ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { unsigned long size = 0; int zid; struct zone *zone; for_each_managed_zone_pgdat(zone, lruvec_pgdat(lruvec), zid, zone_idx) { if (!mem_cgroup_disabled()) size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid); else size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru); } return size; } static unsigned long drop_slab_node(int nid) { unsigned long freed = 0; struct mem_cgroup *memcg = NULL; memcg = mem_cgroup_iter(NULL, NULL, NULL); do { freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); return freed; } void drop_slab(void) { int nid; int shift = 0; unsigned long freed; do { freed = 0; for_each_online_node(nid) { if (fatal_signal_pending(current)) return; freed += drop_slab_node(nid); } } while ((freed >> shift++) > 1); } #define CHECK_RECLAIMER_OFFSET(type) \ do { \ BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD != \ PGDEMOTE_##type - PGDEMOTE_KSWAPD); \ BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD != \ PGSCAN_##type - PGSCAN_KSWAPD); \ } while (0) static int reclaimer_offset(struct scan_control *sc) { CHECK_RECLAIMER_OFFSET(DIRECT); CHECK_RECLAIMER_OFFSET(KHUGEPAGED); CHECK_RECLAIMER_OFFSET(PROACTIVE); if (current_is_kswapd()) return 0; if (current_is_khugepaged()) return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD; if (sc->proactive) return PGSTEAL_PROACTIVE - PGSTEAL_KSWAPD; return PGSTEAL_DIRECT - PGSTEAL_KSWAPD; } /* * We detected a synchronous write error writing a folio out. Probably * -ENOSPC. We need to propagate that into the address_space for a subsequent * fsync(), msync() or close(). * * The tricky part is that after writepage we cannot touch the mapping: nothing * prevents it from being freed up. But we have a ref on the folio and once * that folio is locked, the mapping is pinned. * * We're allowed to run sleeping folio_lock() here because we know the caller has * __GFP_FS. */ static void handle_write_error(struct address_space *mapping, struct folio *folio, int error) { folio_lock(folio); if (folio_mapping(folio) == mapping) mapping_set_error(mapping, error); folio_unlock(folio); } static bool skip_throttle_noprogress(pg_data_t *pgdat) { int reclaimable = 0, write_pending = 0; int i; struct zone *zone; /* * If kswapd is disabled, reschedule if necessary but do not * throttle as the system is likely near OOM. */ if (kswapd_test_hopeless(pgdat)) return true; /* * If there are a lot of dirty/writeback folios then do not * throttle as throttling will occur when the folios cycle * towards the end of the LRU if still under writeback. 
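 * Illustrative arithmetic for the check below (the numbers are made up): with reclaimable = 1000 pages on the node, 2 * write_pending <= reclaimable holds for write_pending up to 500, i.e. the condition stops holding only once more than half of the node's reclaimable pages are waiting on writeback.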
*/ for_each_managed_zone_pgdat(zone, pgdat, i, MAX_NR_ZONES - 1) { reclaimable += zone_reclaimable_pages(zone); write_pending += zone_page_state_snapshot(zone, NR_ZONE_WRITE_PENDING); } if (2 * write_pending <= reclaimable) return true; return false; } void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) { wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason]; long timeout, ret; DEFINE_WAIT(wait); /* * Do not throttle user workers, kthreads other than kswapd or * workqueues. They may be required for reclaim to make * forward progress (e.g. journalling workqueues or kthreads). */ if (!current_is_kswapd() && current->flags & (PF_USER_WORKER|PF_KTHREAD)) { cond_resched(); return; } /* * These figures are pulled out of thin air. * VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many * parallel reclaimers which is a short-lived event so the timeout is * short. Failing to make progress or waiting on writeback are * potentially long-lived events so use a longer timeout. This is shaky * logic as a failure to make progress could be due to anything from * writeback to a slow device to excessive referenced folios at the tail * of the inactive LRU. */ switch(reason) { case VMSCAN_THROTTLE_WRITEBACK: timeout = HZ/10; if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) { WRITE_ONCE(pgdat->nr_reclaim_start, node_page_state(pgdat, NR_THROTTLED_WRITTEN)); } break; case VMSCAN_THROTTLE_CONGESTED: fallthrough; case VMSCAN_THROTTLE_NOPROGRESS: if (skip_throttle_noprogress(pgdat)) { cond_resched(); return; } timeout = 1; break; case VMSCAN_THROTTLE_ISOLATED: timeout = HZ/50; break; default: WARN_ON_ONCE(1); timeout = HZ; break; } prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); ret = schedule_timeout(timeout); finish_wait(wqh, &wait); if (reason == VMSCAN_THROTTLE_WRITEBACK) atomic_dec(&pgdat->nr_writeback_throttled); trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout), jiffies_to_usecs(timeout - ret), reason); } /* * Account for folios written if tasks are throttled waiting on dirty * folios to clean. If enough folios have been cleaned since throttling * started then wakeup the throttled tasks. */ void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, int nr_throttled) { unsigned long nr_written; node_stat_add_folio(folio, NR_THROTTLED_WRITTEN); /* * This is an inaccurate read as the per-cpu deltas may not * be synchronised. However, given that the system is * writeback throttled, it is not worth taking the penalty * of getting an accurate count. At worst, the throttle * timeout guarantees forward progress. */ nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) - READ_ONCE(pgdat->nr_reclaim_start); if (nr_written > SWAP_CLUSTER_MAX * nr_throttled) wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]); } /* possible outcome of pageout() */ typedef enum { /* failed to write folio out, folio is locked */ PAGE_KEEP, /* move folio to the active list, folio is locked */ PAGE_ACTIVATE, /* folio has been sent to the disk successfully, folio is unlocked */ PAGE_SUCCESS, /* folio is clean and locked */ PAGE_CLEAN, } pageout_t; static pageout_t writeout(struct folio *folio, struct address_space *mapping, struct swap_iocb **plug, struct list_head *folio_list) { int res; folio_set_reclaim(folio); /* * The large shmem folio can be split if CONFIG_THP_SWAP is not enabled * or we failed to allocate contiguous swap entries, in which case * the split out folios get added back to folio_list. 
*/ if (shmem_mapping(mapping)) res = shmem_writeout(folio, plug, folio_list); else res = swap_writeout(folio, plug); if (res < 0) handle_write_error(mapping, folio, res); if (res == AOP_WRITEPAGE_ACTIVATE) { folio_clear_reclaim(folio); return PAGE_ACTIVATE; } /* synchronous write? */ if (!folio_test_writeback(folio)) folio_clear_reclaim(folio); trace_mm_vmscan_write_folio(folio); node_stat_add_folio(folio, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } /* * pageout is called by shrink_folio_list() for each dirty folio. */ static pageout_t pageout(struct folio *folio, struct address_space *mapping, struct swap_iocb **plug, struct list_head *folio_list) { /* * We no longer attempt to writeback filesystem folios here, other * than tmpfs/shmem. That's taken care of in page-writeback. * If we find a dirty filesystem folio at the end of the LRU list, * typically that means the filesystem is saturating the storage * with contiguous writes and telling it to write a folio here * would only make the situation worse by injecting an element * of random access. * * If the folio is swapcache, write it back even if that would * block, for some throttling. This happens by accident, because * swap_backing_dev_info is bust: it doesn't reflect the * congestion state of the swapdevs. Easy to fix, if needed. * * A freeable shmem or swapcache folio is referenced only by the * caller that isolated the folio and the page cache. */ if (folio_ref_count(folio) != 1 + folio_nr_pages(folio) || !mapping) return PAGE_KEEP; if (!shmem_mapping(mapping) && !folio_test_anon(folio)) return PAGE_ACTIVATE; if (!folio_clear_dirty_for_io(folio)) return PAGE_CLEAN; return writeout(folio, mapping, plug, folio_list); } /* * Same as remove_mapping, but if the folio is removed from the mapping, it * gets returned with a refcount of 0. */ static int __remove_mapping(struct address_space *mapping, struct folio *folio, bool reclaimed, struct mem_cgroup *target_memcg) { int refcount; void *shadow = NULL; struct swap_cluster_info *ci; BUG_ON(!folio_test_locked(folio)); BUG_ON(mapping != folio_mapping(folio)); if (folio_test_swapcache(folio)) { ci = swap_cluster_get_and_lock_irq(folio); } else { spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); } /* * The non racy check for a busy folio. * * Must be careful with the order of the tests. When someone has * a ref to the folio, it may be possible that they dirty it then * drop the reference. So if the dirty flag is tested before the * refcount here, then the following race may occur: * * get_user_pages(&page); * [user mapping goes away] * write_to(page); * !folio_test_dirty(folio) [good] * folio_set_dirty(folio); * folio_put(folio); * !refcount(folio) [good, discard it] * * [oops, our write_to data is lost] * * Reversing the order of the tests ensures such a situation cannot * escape unnoticed. The smp_rmb is needed to ensure the folio->flags * load is not satisfied before that of folio->_refcount. * * Note that if the dirty flag is always set via folio_mark_dirty, * and thus under the i_pages lock, then this ordering is not required. 
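 * As a rough reading of the freeze below: the expected count is 1 reference held by the isolating caller plus folio_nr_pages() references held by the page cache or swap cache, and folio_ref_freeze() succeeds, atomically dropping the count to zero, only if no other reference exists at that instant.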
*/ refcount = 1 + folio_nr_pages(folio); if (!folio_ref_freeze(folio, refcount)) goto cannot_free; /* note: atomic_cmpxchg in folio_ref_freeze provides the smp_rmb */ if (unlikely(folio_test_dirty(folio))) { folio_ref_unfreeze(folio, refcount); goto cannot_free; } if (folio_test_swapcache(folio)) { swp_entry_t swap = folio->swap; if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(folio, target_memcg); memcg1_swapout(folio, swap); __swap_cache_del_folio(ci, folio, swap, shadow); swap_cluster_unlock_irq(ci); } else { void (*free_folio)(struct folio *); free_folio = mapping->a_ops->free_folio; /* * Remember a shadow entry for reclaimed file cache in * order to detect refaults, thus thrashing, later on. * * But don't store shadows in an address space that is * already exiting. This is not just an optimization, * inode reclaim needs to empty out the radix tree or * the nodes are lost. Don't plant shadows behind its * back. * * We also don't store shadows for DAX mappings because the * only page cache folios found in these are zero pages * covering holes, and because we don't want to mix DAX * exceptional entries and shadow exceptional entries in the * same address_space. */ if (reclaimed && folio_is_file_lru(folio) && !mapping_exiting(mapping) && !dax_mapping(mapping)) shadow = workingset_eviction(folio, target_memcg); __filemap_remove_folio(folio, shadow); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); if (free_folio) free_folio(folio); } return 1; cannot_free: if (folio_test_swapcache(folio)) { swap_cluster_unlock_irq(ci); } else { xa_unlock_irq(&mapping->i_pages); spin_unlock(&mapping->host->i_lock); } return 0; } /** * remove_mapping() - Attempt to remove a folio from its mapping. * @mapping: The address space. * @folio: The folio to remove. * * If the folio is dirty, under writeback or if someone else has a ref * on it, removal will fail. * Return: The number of pages removed from the mapping. 0 if the folio * could not be removed. * Context: The caller should have a single refcount on the folio and * hold its lock. */ long remove_mapping(struct address_space *mapping, struct folio *folio) { if (__remove_mapping(mapping, folio, false, NULL)) { /* * Unfreezing the refcount with 1 effectively * drops the pagecache ref for us without requiring another * atomic operation. */ folio_ref_unfreeze(folio, 1); return folio_nr_pages(folio); } return 0; } /** * folio_putback_lru - Put previously isolated folio onto appropriate LRU list. * @folio: Folio to be returned to an LRU list. * * Add previously isolated @folio to appropriate LRU list. * The folio may still be unevictable for other reasons. * * Context: lru_lock must not be held, interrupts must be enabled. */ void folio_putback_lru(struct folio *folio) { folio_add_lru(folio); folio_put(folio); /* drop ref from isolate */ } enum folio_references { FOLIOREF_RECLAIM, FOLIOREF_RECLAIM_CLEAN, FOLIOREF_KEEP, FOLIOREF_ACTIVATE, }; #ifdef CONFIG_LRU_GEN /* * Only used on a mapped folio in the eviction (rmap walk) path, where promotion * needs to be done by taking the folio off the LRU list and then adding it back * with PG_active set. In contrast, the aging (page table walk) path uses * folio_update_gen(). 
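 * A rough reading of lru_gen_set_refs() below: a folio with neither PG_referenced nor PG_workingset only gains PG_referenced and the function returns false (the folio is kept for now); a folio that already carries one of those flags is upgraded to PG_workingset and true is returned so the caller activates it.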
*/ static bool lru_gen_set_refs(struct folio *folio) { /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced)); return false; } set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_workingset)); return true; } #else static bool lru_gen_set_refs(struct folio *folio) { return false; } #endif /* CONFIG_LRU_GEN */ static enum folio_references folio_check_references(struct folio *folio, struct scan_control *sc) { int referenced_ptes, referenced_folio; vm_flags_t vm_flags; referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup, &vm_flags); /* * The supposedly reclaimable folio was found to be in a VM_LOCKED vma. * Let the folio, now marked Mlocked, be moved to the unevictable list. */ if (vm_flags & VM_LOCKED) return FOLIOREF_ACTIVATE; /* * There are two cases to consider. * 1) Rmap lock contention: rotate. * 2) Skip the non-shared swapbacked folio mapped solely by * the exiting or OOM-reaped process. */ if (referenced_ptes == -1) return FOLIOREF_KEEP; if (lru_gen_enabled() && !lru_gen_switching()) { if (!referenced_ptes) return FOLIOREF_RECLAIM; return lru_gen_set_refs(folio) ? FOLIOREF_ACTIVATE : FOLIOREF_KEEP; } referenced_folio = folio_test_clear_referenced(folio); if (referenced_ptes) { /* * All mapped folios start out with page table * references from the instantiating fault, so we need * to look twice if a mapped file/anon folio is used more * than once. * * Mark it and spare it for another trip around the * inactive list. Another page table reference will * lead to its activation. * * Note: the mark is set for activated folios as well * so that recently deactivated but used folios are * quickly recovered. */ folio_set_referenced(folio); if (referenced_folio || referenced_ptes > 1) return FOLIOREF_ACTIVATE; /* * Activate file-backed executable folios after first usage. */ if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) return FOLIOREF_ACTIVATE; return FOLIOREF_KEEP; } /* Reclaim if clean, defer dirty folios to writeback */ if (referenced_folio && folio_is_file_lru(folio)) return FOLIOREF_RECLAIM_CLEAN; return FOLIOREF_RECLAIM; } /* Check if a folio is dirty or under writeback */ static void folio_check_dirty_writeback(struct folio *folio, bool *dirty, bool *writeback) { struct address_space *mapping; /* * Anonymous folios are not handled by flushers and must be written * from reclaim context. Do not stall reclaim based on them. * MADV_FREE anonymous folios are put into inactive file list too. * They could be mistakenly treated as file lru. So further anon * test is needed. */ if (!folio_is_file_lru(folio) || folio_test_lazyfree(folio)) { *dirty = false; *writeback = false; return; } /* By default assume that the folio flags are accurate */ *dirty = folio_test_dirty(folio); *writeback = folio_test_writeback(folio); /* Verify dirty/writeback state if the filesystem supports it */ if (!folio_test_private(folio)) return; mapping = folio_mapping(folio); if (mapping && mapping->a_ops->is_dirty_writeback) mapping->a_ops->is_dirty_writeback(folio, dirty, writeback); } static struct folio *alloc_demote_folio(struct folio *src, unsigned long private) { struct migration_target_control *mtc, target_nid_mtc; struct folio *dst; mtc = (struct migration_target_control *)private; /* * make sure we allocate from the target node first also trying to * demote or reclaim pages from the target node via kswapd if we are * low on free memory on target node. 
If we don't do this and if * we have free memory on the slower(lower) memtier, we would start * allocating pages from slower(lower) memory tiers without even forcing * a demotion of cold pages from the target memtier. This can result * in the kernel placing hot pages in slower(lower) memory tiers. */ target_nid_mtc = *mtc; target_nid_mtc.nmask = NULL; target_nid_mtc.gfp_mask |= __GFP_THISNODE; dst = alloc_migration_target(src, (unsigned long)&target_nid_mtc); if (dst) return dst; return alloc_migration_target(src, (unsigned long)mtc); } /* * Take folios on @demote_folios and attempt to demote them to another node. * Folios which are not demoted are left on @demote_folios. */ static unsigned int demote_folio_list(struct list_head *demote_folios, struct pglist_data *pgdat, struct mem_cgroup *memcg) { int target_nid; unsigned int nr_succeeded; nodemask_t allowed_mask; struct migration_target_control mtc = { /* * Allocate from 'node', or fail quickly and quietly. * When this happens, 'page' will likely just be discarded * instead of migrated. */ .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOMEMALLOC | GFP_NOWAIT, .nmask = &allowed_mask, .reason = MR_DEMOTION, }; if (list_empty(demote_folios)) return 0; node_get_allowed_targets(pgdat, &allowed_mask); mem_cgroup_node_filter_allowed(memcg, &allowed_mask); if (nodes_empty(allowed_mask)) return 0; target_nid = next_demotion_node(pgdat->node_id, &allowed_mask); if (target_nid == NUMA_NO_NODE) /* No lower-tier nodes or nodes were hot-unplugged. */ return 0; mtc.nid = target_nid; /* Demotion ignores all cpuset and mempolicy settings */ migrate_pages(demote_folios, alloc_demote_folio, NULL, (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); return nr_succeeded; } static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) { if (gfp_mask & __GFP_FS) return true; if (!folio_test_swapcache(folio) || !(gfp_mask & __GFP_IO)) return false; /* * We can "enter_fs" for swap-cache with only __GFP_IO * providing this isn't SWP_FS_OPS. * ->flags can be updated non-atomically, * but that will never affect SWP_FS_OPS, so the data_race * is safe. */ return !data_race(folio_swap_flags(folio) & SWP_FS_OPS); } /* * shrink_folio_list() returns the number of reclaimed pages */ static unsigned int shrink_folio_list(struct list_head *folio_list, struct pglist_data *pgdat, struct scan_control *sc, struct reclaim_stat *stat, bool ignore_references, struct mem_cgroup *memcg) { struct folio_batch free_folios; LIST_HEAD(ret_folios); LIST_HEAD(demote_folios); unsigned int nr_reclaimed = 0, nr_demoted = 0; unsigned int pgactivate = 0; bool do_demote_pass; struct swap_iocb *plug = NULL; folio_batch_init(&free_folios); memset(stat, 0, sizeof(*stat)); cond_resched(); do_demote_pass = can_demote(pgdat->node_id, sc, memcg); retry: while (!list_empty(folio_list)) { struct address_space *mapping; struct folio *folio; enum folio_references references = FOLIOREF_RECLAIM; bool dirty, writeback; unsigned int nr_pages; cond_resched(); folio = lru_to_folio(folio_list); list_del(&folio->lru); if (!folio_trylock(folio)) goto keep; if (folio_contain_hwpoisoned_page(folio)) { /* * unmap_poisoned_folio() can't handle large * folio, just skip it. memory_failure() will * handle it if the UCE is triggered again. 
*/ if (folio_test_large(folio)) goto keep_locked; unmap_poisoned_folio(folio, folio_pfn(folio), false); folio_unlock(folio); folio_put(folio); continue; } VM_BUG_ON_FOLIO(folio_test_active(folio), folio); nr_pages = folio_nr_pages(folio); /* Account the number of base pages */ sc->nr_scanned += nr_pages; if (unlikely(!folio_evictable(folio))) goto activate_locked; if (!sc->may_unmap && folio_mapped(folio)) goto keep_locked; /* * The number of dirty pages determines if a node is marked * reclaim_congested. kswapd will stall and start writing * folios if the tail of the LRU is all dirty unqueued folios. */ folio_check_dirty_writeback(folio, &dirty, &writeback); if (dirty || writeback) stat->nr_dirty += nr_pages; if (dirty && !writeback) stat->nr_unqueued_dirty += nr_pages; /* * Treat this folio as congested if folios are cycling * through the LRU so quickly that the folios marked * for immediate reclaim are making it to the end of * the LRU a second time. */ if (writeback && folio_test_reclaim(folio)) stat->nr_congested += nr_pages; /* * If a folio at the tail of the LRU is under writeback, there * are three cases to consider. * * 1) If reclaim is encountering an excessive number * of folios under writeback and this folio has both * the writeback and reclaim flags set, then it * indicates that folios are being queued for I/O but * are being recycled through the LRU before the I/O * can complete. Waiting on the folio itself risks an * indefinite stall if it is impossible to writeback * the folio due to I/O error or disconnected storage * so instead note that the LRU is being scanned too * quickly and the caller can stall after the folio * list has been processed. * * 2) Global or new memcg reclaim encounters a folio that is * not marked for immediate reclaim, or the caller does not * have __GFP_FS (or __GFP_IO if it's simply going to swap, * not to fs), or the folio belongs to a mapping where * waiting on writeback during reclaim may lead to a deadlock. * In this case mark the folio for immediate reclaim and * continue scanning. * * Require may_enter_fs() because we would wait on fs, which * may not have submitted I/O yet. And the loop driver might * enter reclaim, and deadlock if it waits on a folio for * which it is needed to do the write (loop masks off * __GFP_IO|__GFP_FS for this reason); but more thought * would probably show more reasons. * * 3) Legacy memcg encounters a folio that already has the * reclaim flag set. memcg does not have any dirty folio * throttling so we could easily OOM just because too many * folios are in writeback and there is nothing else to * reclaim. Wait for the writeback to complete. * * In cases 1) and 2) we activate the folios to get them out of * the way while we continue scanning for clean folios on the * inactive list and refilling from the active list. The * observation here is that waiting for disk writes is more * expensive than potentially causing reloads down the line. * Since they're marked for immediate reclaim, they won't put * memory pressure on the cache working set any longer than it * takes to write them to disk. 
*/ if (folio_test_writeback(folio)) { mapping = folio_mapping(folio); /* Case 1 above */ if (current_is_kswapd() && folio_test_reclaim(folio) && test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { stat->nr_immediate += nr_pages; goto activate_locked; /* Case 2 above */ } else if (writeback_throttling_sane(sc) || !folio_test_reclaim(folio) || !may_enter_fs(folio, sc->gfp_mask) || (mapping && mapping_writeback_may_deadlock_on_reclaim(mapping))) { /* * This is slightly racy - * folio_end_writeback() might have * just cleared the reclaim flag, then * setting the reclaim flag here ends up * interpreted as the readahead flag - but * that does not matter enough to care. * What we do want is for this folio to * have the reclaim flag set next time * memcg reclaim reaches the tests above, * so it will then wait for writeback to * avoid OOM; and it's also appropriate * in global reclaim. */ folio_set_reclaim(folio); stat->nr_writeback += nr_pages; goto activate_locked; /* Case 3 above */ } else { folio_unlock(folio); folio_wait_writeback(folio); /* then go back and try same folio again */ list_add_tail(&folio->lru, folio_list); continue; } } if (!ignore_references) references = folio_check_references(folio, sc); switch (references) { case FOLIOREF_ACTIVATE: goto activate_locked; case FOLIOREF_KEEP: stat->nr_ref_keep += nr_pages; goto keep_locked; case FOLIOREF_RECLAIM: case FOLIOREF_RECLAIM_CLEAN: ; /* try to reclaim the folio below */ } /* * Before reclaiming the folio, try to relocate * its contents to another node. */ if (do_demote_pass && (thp_migration_supported() || !folio_test_large(folio))) { list_add(&folio->lru, &demote_folios); folio_unlock(folio); continue; } /* * Anonymous process memory has backing store? * Try to allocate it some swap space here. * Lazyfree folio could be freed directly */ if (folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; if (folio_maybe_dma_pinned(folio)) goto keep_locked; if (folio_test_large(folio)) { /* cannot split folio, skip it */ if (folio_expected_ref_count(folio) != folio_ref_count(folio) - 1) goto activate_locked; /* * Split partially mapped folios right away. * We can free the unmapped pages without IO. */ if (data_race(!list_empty(&folio->_deferred_list) && folio_test_partially_mapped(folio)) && split_folio_to_list(folio, folio_list)) goto activate_locked; } if (folio_alloc_swap(folio)) { int __maybe_unused order = folio_order(folio); if (!folio_test_large(folio)) goto activate_locked_split; /* Fallback to swap normal pages */ if (split_folio_to_list(folio, folio_list)) goto activate_locked; #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (nr_pages >= HPAGE_PMD_NR) { count_memcg_folio_events(folio, THP_SWPOUT_FALLBACK, 1); count_vm_event(THP_SWPOUT_FALLBACK); } #endif count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); if (folio_alloc_swap(folio)) goto activate_locked_split; } /* * Normally the folio will be dirtied in unmap because * its pte should be dirty. A special case is MADV_FREE * page. The page's pte could have dirty bit cleared but * the folio's SwapBacked flag is still set because * clearing the dirty bit and SwapBacked flag has no * lock protected. For such folio, unmap will not set * dirty bit for it, so folio reclaim will not write the * folio out. This can cause data corruption when the * folio is swapped in later. Always setting the dirty * flag for the folio solves the problem. 
*/ folio_mark_dirty(folio); } /* * If the folio was split above, the tail pages will make * their own pass through this function and be accounted * then. */ if ((nr_pages > 1) && !folio_test_large(folio)) { sc->nr_scanned -= (nr_pages - 1); nr_pages = 1; } /* * The folio is mapped into the page tables of one or more * processes. Try to unmap it here. */ if (folio_mapped(folio)) { enum ttu_flags flags = TTU_BATCH_FLUSH; bool was_swapbacked = folio_test_swapbacked(folio); if (folio_test_pmd_mappable(folio)) flags |= TTU_SPLIT_HUGE_PMD; /* * Without TTU_SYNC, try_to_unmap will only begin to * hold PTL from the first present PTE within a large * folio. Some initial PTEs might be skipped due to * races with parallel PTE writes in which PTEs can be * cleared temporarily before being written new present * values. This will lead to a large folio is still * mapped while some subpages have been partially * unmapped after try_to_unmap; TTU_SYNC helps * try_to_unmap acquire PTL from the first PTE, * eliminating the influence of temporary PTE values. */ if (folio_test_large(folio)) flags |= TTU_SYNC; try_to_unmap(folio, flags); if (folio_mapped(folio)) { stat->nr_unmap_fail += nr_pages; if (!was_swapbacked && folio_test_swapbacked(folio)) stat->nr_lazyfree_fail += nr_pages; goto activate_locked; } } /* * Folio is unmapped now so it cannot be newly pinned anymore. * No point in trying to reclaim folio if it is pinned. * Furthermore we don't want to reclaim underlying fs metadata * if the folio is pinned and thus potentially modified by the * pinning process as that may upset the filesystem. */ if (folio_maybe_dma_pinned(folio)) goto activate_locked; mapping = folio_mapping(folio); if (folio_test_dirty(folio)) { if (folio_is_file_lru(folio)) { /* * Immediately reclaim when written back. * Similar in principle to folio_deactivate() * except we already have the folio isolated * and know it's dirty */ node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE, nr_pages); if (!folio_test_reclaim(folio)) folio_set_reclaim(folio); goto activate_locked; } if (references == FOLIOREF_RECLAIM_CLEAN) goto keep_locked; if (!may_enter_fs(folio, sc->gfp_mask)) goto keep_locked; if (!sc->may_writepage) goto keep_locked; /* * Folio is dirty. Flush the TLB if a writable entry * potentially exists to avoid CPU writes after I/O * starts and then write it out here. */ try_to_unmap_flush_dirty(); switch (pageout(folio, mapping, &plug, folio_list)) { case PAGE_KEEP: goto keep_locked; case PAGE_ACTIVATE: /* * If shmem folio is split when writeback to swap, * the tail pages will make their own pass through * this function and be accounted then. */ if (nr_pages > 1 && !folio_test_large(folio)) { sc->nr_scanned -= (nr_pages - 1); nr_pages = 1; } goto activate_locked; case PAGE_SUCCESS: if (nr_pages > 1 && !folio_test_large(folio)) { sc->nr_scanned -= (nr_pages - 1); nr_pages = 1; } stat->nr_pageout += nr_pages; if (folio_test_writeback(folio)) goto keep; if (folio_test_dirty(folio)) goto keep; /* * A synchronous write - probably a ramdisk. Go * ahead and try to reclaim the folio. */ if (!folio_trylock(folio)) goto keep; if (folio_test_dirty(folio) || folio_test_writeback(folio)) goto keep_locked; mapping = folio_mapping(folio); fallthrough; case PAGE_CLEAN: ; /* try to free the folio below */ } } /* * If the folio has buffers, try to free the buffer * mappings associated with this folio. If we succeed * we try to free the folio as well. * * We do this even if the folio is dirty. 
* filemap_release_folio() does not perform I/O, but it * is possible for a folio to have the dirty flag set, * but it is actually clean (all its buffers are clean). * This happens if the buffers were written out directly, * with submit_bh(). ext3 will do this, as well as * the blockdev mapping. filemap_release_folio() will * discover that cleanness and will drop the buffers * and mark the folio clean - it can be freed. * * Rarely, folios can have buffers and no ->mapping. * These are the folios which were not successfully * invalidated in truncate_cleanup_folio(). We try to * drop those buffers here and if that worked, and the * folio is no longer mapped into process address space * (refcount == 1) it can be freed. Otherwise, leave * the folio on the LRU so it is swappable. */ if (folio_needs_release(folio)) { if (!filemap_release_folio(folio, sc->gfp_mask)) goto activate_locked; if (!mapping && folio_ref_count(folio) == 1) { folio_unlock(folio); if (folio_put_testzero(folio)) goto free_it; else { /* * rare race with speculative reference. * the speculative reference will free * this folio shortly, so we may * increment nr_reclaimed here (and * leave it off the LRU). */ nr_reclaimed += nr_pages; continue; } } } if (folio_test_lazyfree(folio)) { /* follow __remove_mapping for reference */ if (!folio_ref_freeze(folio, 1)) goto keep_locked; /* * The folio has only one reference left, which is * from the isolation. After the caller puts the * folio back on the lru and drops the reference, the * folio will be freed anyway. It doesn't matter * which lru it goes on. So we don't bother checking * the dirty flag here. */ count_vm_events(PGLAZYFREED, nr_pages); count_memcg_folio_events(folio, PGLAZYFREED, nr_pages); } else if (!mapping || !__remove_mapping(mapping, folio, true, sc->target_mem_cgroup)) goto keep_locked; folio_unlock(folio); free_it: /* * Folio may get swapped out as a whole, need to account * all pages in it. */ nr_reclaimed += nr_pages; folio_unqueue_deferred_split(folio); if (folio_batch_add(&free_folios, folio) == 0) { mem_cgroup_uncharge_folios(&free_folios); try_to_unmap_flush(); free_unref_folios(&free_folios); } continue; activate_locked_split: /* * The tail pages that are failed to add into swap cache * reach here. Fixup nr_scanned and nr_pages. */ if (nr_pages > 1) { sc->nr_scanned -= (nr_pages - 1); nr_pages = 1; } activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ if (folio_test_swapcache(folio) && (mem_cgroup_swap_full(folio) || folio_test_mlocked(folio))) folio_free_swap(folio); VM_BUG_ON_FOLIO(folio_test_active(folio), folio); if (!folio_test_mlocked(folio)) { int type = folio_is_file_lru(folio); folio_set_active(folio); stat->nr_activate[type] += nr_pages; count_memcg_folio_events(folio, PGACTIVATE, nr_pages); } keep_locked: folio_unlock(folio); keep: list_add(&folio->lru, &ret_folios); VM_BUG_ON_FOLIO(folio_test_lru(folio) || folio_test_unevictable(folio), folio); } /* 'folio_list' is always empty here */ /* Migrate folios selected for demotion */ nr_demoted = demote_folio_list(&demote_folios, pgdat, memcg); nr_reclaimed += nr_demoted; stat->nr_demoted += nr_demoted; /* Folios that could not be demoted are still in @demote_folios */ if (!list_empty(&demote_folios)) { /* Folios which weren't demoted go back on @folio_list */ list_splice_init(&demote_folios, folio_list); /* * goto retry to reclaim the undemoted folios in folio_list if * desired. 
* * Reclaiming directly from top tier nodes is not often desired * due to it breaking the LRU ordering: in general memory * should be reclaimed from lower tier nodes and demoted from * top tier nodes. * * However, disabling reclaim from top tier nodes entirely * would cause ooms in edge scenarios where lower tier memory * is unreclaimable for whatever reason, eg memory being * mlocked or too hot to reclaim. We can disable reclaim * from top tier nodes in proactive reclaim though as that is * not real memory pressure. */ if (!sc->proactive) { do_demote_pass = false; goto retry; } } pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; mem_cgroup_uncharge_folios(&free_folios); try_to_unmap_flush(); free_unref_folios(&free_folios); list_splice(&ret_folios, folio_list); count_vm_events(PGACTIVATE, pgactivate); if (plug) swap_write_unplug(plug); return nr_reclaimed; } unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *folio_list) { struct scan_control sc = { .gfp_mask = GFP_KERNEL, .may_unmap = 1, }; struct reclaim_stat stat; unsigned int nr_reclaimed; struct folio *folio, *next; LIST_HEAD(clean_folios); unsigned int noreclaim_flag; list_for_each_entry_safe(folio, next, folio_list, lru) { /* TODO: these pages should not even appear in this list. */ if (page_has_movable_ops(&folio->page)) continue; if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) && !folio_test_dirty(folio) && !folio_test_unevictable(folio)) { folio_clear_active(folio); list_move(&folio->lru, &clean_folios); } } /* * We should be safe here since we are only dealing with file pages and * we are not kswapd and therefore cannot write dirty file pages. But * call memalloc_noreclaim_save() anyway, just in case these conditions * change in the future. */ noreclaim_flag = memalloc_noreclaim_save(); nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc, &stat, true, NULL); memalloc_noreclaim_restore(noreclaim_flag); list_splice(&clean_folios, folio_list); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -(long)nr_reclaimed); /* * Since lazyfree pages are isolated from file LRU from the beginning, * they will rotate back to anonymous LRU in the end if it failed to * discard so isolated count will be mismatched. * Compensate the isolated count for both LRU lists. */ mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, stat.nr_lazyfree_fail); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -(long)stat.nr_lazyfree_fail); return nr_reclaimed; } /* * Update LRU sizes after isolating pages. The LRU size updates must * be complete before mem_cgroup_update_lru_size due to a sanity check. */ static __always_inline void update_lru_sizes(struct lruvec *lruvec, enum lru_list lru, unsigned long *nr_zone_taken) { int zid; for (zid = 0; zid < MAX_NR_ZONES; zid++) { if (!nr_zone_taken[zid]) continue; update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); } } /* * Isolating page from the lruvec to fill in @dst list by nr_to_scan times. * * lruvec->lru_lock is heavily contended. Some of the functions that * shrink the lists perform better by taking out a batch of pages * and working on them outside the LRU lock. * * For pagecache intensive workloads, this function is the hottest * spot in the kernel (apart from copy_*_user functions). * * Lru_lock must be held before calling this function. * * @nr_to_scan: The number of eligible pages to look through on the list. * @lruvec: The LRU vector to pull pages from. * @dst: The temp list to put pages on to. 
* @nr_scanned: The number of pages that were scanned. * @sc: The scan_control struct for this reclaim session * @lru: LRU list id for isolating * * returns how many pages were moved onto *@dst. */ static unsigned long isolate_lru_folios(unsigned long nr_to_scan, struct lruvec *lruvec, struct list_head *dst, unsigned long *nr_scanned, struct scan_control *sc, enum lru_list lru) { struct list_head *src = &lruvec->lists[lru]; unsigned long nr_taken = 0; unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; unsigned long skipped = 0, total_scan = 0, scan = 0; unsigned long nr_pages; unsigned long max_nr_skipped = 0; LIST_HEAD(folios_skipped); while (scan < nr_to_scan && !list_empty(src)) { struct list_head *move_to = src; struct folio *folio; folio = lru_to_folio(src); prefetchw_prev_lru_folio(folio, src, flags); nr_pages = folio_nr_pages(folio); total_scan += nr_pages; /* Using max_nr_skipped to prevent hard LOCKUP*/ if (max_nr_skipped < SWAP_CLUSTER_MAX_SKIPPED && (folio_zonenum(folio) > sc->reclaim_idx)) { nr_skipped[folio_zonenum(folio)] += nr_pages; move_to = &folios_skipped; max_nr_skipped++; goto move; } /* * Do not count skipped folios because that makes the function * return with no isolated folios if the LRU mostly contains * ineligible folios. This causes the VM to not reclaim any * folios, triggering a premature OOM. * Account all pages in a folio. */ scan += nr_pages; if (!folio_test_lru(folio)) goto move; if (!sc->may_unmap && folio_mapped(folio)) goto move; /* * Be careful not to clear the lru flag until after we're * sure the folio is not being freed elsewhere -- the * folio release code relies on it. */ if (unlikely(!folio_try_get(folio))) goto move; if (!folio_test_clear_lru(folio)) { /* Another thread is already isolating this folio */ folio_put(folio); goto move; } nr_taken += nr_pages; nr_zone_taken[folio_zonenum(folio)] += nr_pages; move_to = dst; move: list_move(&folio->lru, move_to); } /* * Splice any skipped folios to the start of the LRU list. Note that * this disrupts the LRU order when reclaiming for lower zones but * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX * scanning would soon rescan the same folios to skip and waste lots * of cpu cycles. */ if (!list_empty(&folios_skipped)) { int zid; list_splice(&folios_skipped, src); for (zid = 0; zid < MAX_NR_ZONES; zid++) { if (!nr_skipped[zid]) continue; __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); skipped += nr_skipped[zid]; } } *nr_scanned = total_scan; trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, total_scan, skipped, nr_taken, lru); update_lru_sizes(lruvec, lru, nr_zone_taken); return nr_taken; } /** * folio_isolate_lru() - Try to isolate a folio from its LRU list. * @folio: Folio to isolate from its LRU list. * * Isolate a @folio from an LRU list and adjust the vmstat statistic * corresponding to whatever LRU list the folio was on. * * The folio will have its LRU flag cleared. If it was found on the * active list, it will have the Active flag set. If it was found on the * unevictable list, it will have the Unevictable flag set. These flags * may need to be cleared by the caller before letting the page go. * * Context: * * (1) Must be called with an elevated refcount on the folio. This is a * fundamental difference from isolate_lru_folios() (which is called * without a stable reference). * (2) The lru_lock must not be held. * (3) Interrupts must be enabled. * * Return: true if the folio was removed from an LRU list. 
* false if the folio was not on an LRU list. */ bool folio_isolate_lru(struct folio *folio) { bool ret = false; VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio); if (folio_test_clear_lru(folio)) { struct lruvec *lruvec; folio_get(folio); lruvec = folio_lruvec_lock_irq(folio); lruvec_del_folio(lruvec, folio); lruvec_unlock_irq(lruvec); ret = true; } return ret; } /* * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and * then get rescheduled. When there are massive number of tasks doing page * allocation, such sleeping direct reclaimers may keep piling up on each CPU, * the LRU list will go small and be scanned faster than necessary, leading to * unnecessary swapping, thrashing and OOM. */ static bool too_many_isolated(struct pglist_data *pgdat, int file, struct scan_control *sc) { unsigned long inactive, isolated; bool too_many; if (current_is_kswapd()) return false; if (!writeback_throttling_sane(sc)) return false; if (file) { inactive = node_page_state(pgdat, NR_INACTIVE_FILE); isolated = node_page_state(pgdat, NR_ISOLATED_FILE); } else { inactive = node_page_state(pgdat, NR_INACTIVE_ANON); isolated = node_page_state(pgdat, NR_ISOLATED_ANON); } /* * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they * won't get blocked by normal direct-reclaimers, forming a circular * deadlock. */ if (gfp_has_io_fs(sc->gfp_mask)) inactive >>= 3; too_many = isolated > inactive; /* Wake up tasks throttled due to too_many_isolated. */ if (!too_many) wake_throttle_isolated(pgdat); return too_many; } /* * move_folios_to_lru() moves folios from private @list to appropriate LRU list. * * Returns the number of pages moved to the appropriate lruvec. * * Note: The caller must not hold any lruvec lock. */ static unsigned int move_folios_to_lru(struct list_head *list) { int nr_pages, nr_moved = 0; struct lruvec *lruvec = NULL; struct folio_batch free_folios; folio_batch_init(&free_folios); while (!list_empty(list)) { struct folio *folio = lru_to_folio(list); lruvec = folio_lruvec_relock_irq(folio, lruvec); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); list_del(&folio->lru); if (unlikely(!folio_evictable(folio))) { lruvec_unlock_irq(lruvec); folio_putback_lru(folio); lruvec = NULL; continue; } /* * The folio_set_lru needs to be kept here for list integrity. * Otherwise: * #0 move_folios_to_lru #1 release_pages * if (!folio_put_testzero()) * if (folio_put_testzero()) * !lru //skip lru_lock * folio_set_lru() * list_add(&folio->lru,) * list_add(&folio->lru,) */ folio_set_lru(folio); if (unlikely(folio_put_testzero(folio))) { __folio_clear_lru_flags(folio); folio_unqueue_deferred_split(folio); if (folio_batch_add(&free_folios, folio) == 0) { lruvec_unlock_irq(lruvec); mem_cgroup_uncharge_folios(&free_folios); free_unref_folios(&free_folios); lruvec = NULL; } continue; } lruvec_add_folio(lruvec, folio); nr_pages = folio_nr_pages(folio); nr_moved += nr_pages; if (folio_test_active(folio)) workingset_age_nonresident(lruvec, nr_pages); } if (lruvec) lruvec_unlock_irq(lruvec); if (free_folios.nr) { mem_cgroup_uncharge_folios(&free_folios); free_unref_folios(&free_folios); } return nr_moved; } /* * If a kernel thread (such as nfsd for loop-back mounts) services a backing * device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this case * we should not throttle. Otherwise it is safe to do so. */ static int current_may_throttle(void) { return !(current->flags & PF_LOCAL_THROTTLE); } /* * shrink_inactive_list() is a helper for shrink_node(). 
It returns the number * of reclaimed pages */ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru) { LIST_HEAD(folio_list); unsigned long nr_scanned; unsigned int nr_reclaimed = 0; unsigned long nr_taken; struct reclaim_stat stat; bool file = is_file_lru(lru); enum node_stat_item item; struct pglist_data *pgdat = lruvec_pgdat(lruvec); bool stalled = false; while (unlikely(too_many_isolated(pgdat, file, sc))) { if (stalled) return 0; /* wait a bit for the reclaimer. */ stalled = true; reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED); /* We are about to die and free our memory. Return now. */ if (fatal_signal_pending(current)) return SWAP_CLUSTER_MAX; } lru_add_drain(); lruvec_lock_irq(lruvec); nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); item = PGSCAN_KSWAPD + reclaimer_offset(sc); mod_lruvec_state(lruvec, item, nr_scanned); mod_lruvec_state(lruvec, PGSCAN_ANON + file, nr_scanned); lruvec_unlock_irq(lruvec); if (nr_taken == 0) return 0; nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false, lruvec_memcg(lruvec)); move_folios_to_lru(&folio_list); mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), stat.nr_demoted); mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); item = PGSTEAL_KSWAPD + reclaimer_offset(sc); mod_lruvec_state(lruvec, item, nr_reclaimed); mod_lruvec_state(lruvec, PGSTEAL_ANON + file, nr_reclaimed); lruvec_lock_irq(lruvec); lru_note_cost_unlock_irq(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); /* * If dirty folios are scanned that are not queued for IO, it * implies that flushers are not doing their job. This can * happen when memory pressure pushes dirty folios to the end of * the LRU before the dirty limits are breached and the dirty * data has expired. It can also happen when the proportion of * dirty folios grows not through writes but through memory * pressure reclaiming all the clean cache. And in some cases, * the flushers simply cannot keep up with the allocation * rate. Nudge the flusher threads in case they are asleep. */ if (stat.nr_unqueued_dirty == nr_taken) { wakeup_flusher_threads(WB_REASON_VMSCAN); /* * For cgroupv1 dirty throttling is achieved by waking up * the kernel flusher here and later waiting on folios * which are in writeback to finish (see shrink_folio_list()). * * Flusher may not be able to issue writeback quickly * enough for cgroupv1 writeback throttling to work * on a large system. */ if (!writeback_throttling_sane(sc)) reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); } sc->nr.dirty += stat.nr_dirty; sc->nr.congested += stat.nr_congested; sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; sc->nr.writeback += stat.nr_writeback; sc->nr.immediate += stat.nr_immediate; sc->nr.taken += nr_taken; if (file) sc->nr.file_taken += nr_taken; trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, nr_scanned, nr_reclaimed, &stat, sc->priority, file); return nr_reclaimed; } /* * shrink_active_list() moves folios from the active LRU to the inactive LRU. * * We move them the other way if the folio is referenced by one or more * processes. * * If the folios are mostly unmapped, the processing is fast and it is * appropriate to hold lru_lock across the whole operation. But if * the folios are mapped, the processing is slow (folio_referenced()), so * we should drop lru_lock around each folio. 
It's impossible to balance * this, so instead we remove the folios from the LRU while processing them. * It is safe to rely on the active flag against the non-LRU folios in here * because nobody will play with that bit on a non-LRU folio. * * The downside is that we have to touch folio->_refcount against each folio. * But we had to alter folio->flags anyway. */ static void shrink_active_list(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru) { unsigned long nr_taken; unsigned long nr_scanned; vm_flags_t vm_flags; LIST_HEAD(l_hold); /* The folios which were snipped off */ LIST_HEAD(l_active); LIST_HEAD(l_inactive); unsigned nr_deactivate, nr_activate; unsigned nr_rotated = 0; bool file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec); lru_add_drain(); lruvec_lock_irq(lruvec); nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); mod_lruvec_state(lruvec, PGREFILL, nr_scanned); lruvec_unlock_irq(lruvec); while (!list_empty(&l_hold)) { struct folio *folio; cond_resched(); folio = lru_to_folio(&l_hold); list_del(&folio->lru); if (unlikely(!folio_evictable(folio))) { folio_putback_lru(folio); continue; } if (unlikely(buffer_heads_over_limit)) { if (folio_needs_release(folio) && folio_trylock(folio)) { filemap_release_folio(folio, 0); folio_unlock(folio); } } /* Referenced or rmap lock contention: rotate */ if (folio_referenced(folio, 0, sc->target_mem_cgroup, &vm_flags) != 0) { /* * Identify referenced, file-backed active folios and * give them one more trip around the active list. So * that executable code get better chances to stay in * memory under moderate memory pressure. Anon folios * are not likely to be evicted by use-once streaming * IO, plus JVM can create lots of anon VM_EXEC folios, * so we ignore them here. */ if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) { nr_rotated += folio_nr_pages(folio); list_add(&folio->lru, &l_active); continue; } } folio_clear_active(folio); /* we are de-activating */ folio_set_workingset(folio); list_add(&folio->lru, &l_inactive); } /* * Move folios back to the lru list. 
*/ nr_activate = move_folios_to_lru(&l_active); nr_deactivate = move_folios_to_lru(&l_inactive); count_vm_events(PGDEACTIVATE, nr_deactivate); count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); lruvec_lock_irq(lruvec); lru_note_cost_unlock_irq(lruvec, file, 0, nr_rotated); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, nr_deactivate, nr_rotated, sc->priority, file); } static unsigned int reclaim_folio_list(struct list_head *folio_list, struct pglist_data *pgdat) { struct reclaim_stat stat; unsigned int nr_reclaimed; struct folio *folio; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .may_writepage = 1, .may_unmap = 1, .may_swap = 1, .no_demotion = 1, }; nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true, NULL); while (!list_empty(folio_list)) { folio = lru_to_folio(folio_list); list_del(&folio->lru); folio_putback_lru(folio); } trace_mm_vmscan_reclaim_pages(pgdat->node_id, sc.nr_scanned, nr_reclaimed, &stat); return nr_reclaimed; } unsigned long reclaim_pages(struct list_head *folio_list) { int nid; unsigned int nr_reclaimed = 0; LIST_HEAD(node_folio_list); unsigned int noreclaim_flag; if (list_empty(folio_list)) return nr_reclaimed; noreclaim_flag = memalloc_noreclaim_save(); nid = folio_nid(lru_to_folio(folio_list)); do { struct folio *folio = lru_to_folio(folio_list); if (nid == folio_nid(folio)) { folio_clear_active(folio); list_move(&folio->lru, &node_folio_list); continue; } nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); nid = folio_nid(lru_to_folio(folio_list)); } while (!list_empty(folio_list)); nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); memalloc_noreclaim_restore(noreclaim_flag); return nr_reclaimed; } static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc) { if (is_active_lru(lru)) { if (sc->may_deactivate & (1 << is_file_lru(lru))) shrink_active_list(nr_to_scan, lruvec, sc, lru); else sc->skipped_deactivate = 1; return 0; } return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); } /* * The inactive anon list should be small enough that the VM never has * to do too much work. * * The inactive file list should be small enough to leave most memory * to the established workingset on the scan-resistant active list, * but large enough to avoid thrashing the aggregate readahead window. * * Both inactive lists should also be large enough that each inactive * folio has a chance to be referenced again before it is reclaimed. * * If that fails and refaulting is observed, the inactive list grows. * * The inactive_ratio is the target ratio of ACTIVE to INACTIVE folios * on this LRU, maintained by the pageout code. An inactive_ratio * of 3 means 3:1 or 25% of the folios are kept on the inactive list. 
 *
 * total     target    max
 * memory    ratio     inactive
 * -------------------------------------
 *   10MB       1         5MB
 *  100MB       1        50MB
 *    1GB       3       250MB
 *   10GB      10       0.9GB
 *  100GB      31         3GB
 *    1TB     101        10GB
 *   10TB     320        32GB
 */
static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru)
{
	enum lru_list active_lru = inactive_lru + LRU_ACTIVE;
	unsigned long inactive, active;
	unsigned long inactive_ratio;
	unsigned long gb;

	inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru);
	active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru);

	gb = (inactive + active) >> (30 - PAGE_SHIFT);
	if (gb)
		inactive_ratio = int_sqrt(10 * gb);
	else
		inactive_ratio = 1;

	return inactive * inactive_ratio < active;
}

enum scan_balance {
	SCAN_EQUAL,
	SCAN_FRACT,
	SCAN_ANON,
	SCAN_FILE,
};

static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
{
	unsigned long file;
	struct lruvec *target_lruvec;

	if (lru_gen_enabled() && !lru_gen_switching())
		return;

	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);

	/*
	 * Flush the memory cgroup stats in a rate-limited way as we don't
	 * need the most accurate stats here. We may switch to regular stats
	 * flushing in the future once it is cheap enough.
	 */
	mem_cgroup_flush_stats_ratelimited(sc->target_mem_cgroup);

	/*
	 * Determine the scan balance between anon and file LRUs.
	 */
	spin_lock_irq(&target_lruvec->lru_lock);
	sc->anon_cost = target_lruvec->anon_cost;
	sc->file_cost = target_lruvec->file_cost;
	spin_unlock_irq(&target_lruvec->lru_lock);

	/*
	 * Target desirable inactive:active list ratios for the anon
	 * and file LRU lists.
	 */
	if (!sc->force_deactivate) {
		unsigned long refaults;

		/*
		 * When refaults are being observed, it means a new
		 * workingset is being established. Deactivate to get
		 * rid of any stale active pages quickly.
		 */
		refaults = lruvec_page_state(target_lruvec,
				WORKINGSET_ACTIVATE_ANON);
		if (refaults != target_lruvec->refaults[WORKINGSET_ANON] ||
			inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
			sc->may_deactivate |= DEACTIVATE_ANON;
		else
			sc->may_deactivate &= ~DEACTIVATE_ANON;

		refaults = lruvec_page_state(target_lruvec,
				WORKINGSET_ACTIVATE_FILE);
		if (refaults != target_lruvec->refaults[WORKINGSET_FILE] ||
			inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
			sc->may_deactivate |= DEACTIVATE_FILE;
		else
			sc->may_deactivate &= ~DEACTIVATE_FILE;
	} else
		sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;

	/*
	 * If we have plenty of inactive file pages that aren't
	 * thrashing, try to reclaim those first before touching
	 * anonymous pages.
	 */
	file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
	if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE) &&
	    !sc->no_cache_trim_mode)
		sc->cache_trim_mode = 1;
	else
		sc->cache_trim_mode = 0;

	/*
	 * Prevent the reclaimer from falling into the cache trap: as
	 * cache pages start out inactive, every cache fault will tip
	 * the scan balance towards the file LRU. And as the file LRU
	 * shrinks, so does the window for rotation from references.
	 * This means we have a runaway feedback loop where a tiny
	 * thrashing file LRU becomes infinitely more attractive than
	 * anon pages. Try to detect this based on file LRU size.
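	 * (Concretely, the check below treats the file LRU as tiny when free
	 * plus file pages no longer cover the zones' high watermarks while
	 * inactive anon is still sizable; get_scan_count() then force-scans
	 * anon via SCAN_ANON.)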
*/ if (!cgroup_reclaim(sc)) { unsigned long total_high_wmark = 0; unsigned long free, anon; int z; struct zone *zone; free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); file = node_page_state(pgdat, NR_ACTIVE_FILE) + node_page_state(pgdat, NR_INACTIVE_FILE); for_each_managed_zone_pgdat(zone, pgdat, z, MAX_NR_ZONES - 1) { total_high_wmark += high_wmark_pages(zone); } /* * Consider anon: if that's low too, this isn't a * runaway file reclaim problem, but rather just * extreme pressure. Reclaim as per usual then. */ anon = node_page_state(pgdat, NR_INACTIVE_ANON); sc->file_is_tiny = file + free <= total_high_wmark && !(sc->may_deactivate & DEACTIVATE_ANON) && anon >> sc->priority; } } static inline void calculate_pressure_balance(struct scan_control *sc, int swappiness, u64 *fraction, u64 *denominator) { unsigned long anon_cost, file_cost, total_cost; unsigned long ap, fp; /* * Calculate the pressure balance between anon and file pages. * * The amount of pressure we put on each LRU is inversely * proportional to the cost of reclaiming each list, as * determined by the share of pages that are refaulting, times * the relative IO cost of bringing back a swapped out * anonymous page vs reloading a filesystem page (swappiness). * * Although we limit that influence to ensure no list gets * left behind completely: at least a third of the pressure is * applied, before swappiness. * * With swappiness at 100, anon and file have equal IO cost. */ total_cost = sc->anon_cost + sc->file_cost; anon_cost = total_cost + sc->anon_cost; file_cost = total_cost + sc->file_cost; total_cost = anon_cost + file_cost; ap = swappiness * (total_cost + 1); ap /= anon_cost + 1; fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1); fp /= file_cost + 1; fraction[WORKINGSET_ANON] = ap; fraction[WORKINGSET_FILE] = fp; *denominator = ap + fp; } static unsigned long apply_proportional_protection(struct mem_cgroup *memcg, struct scan_control *sc, unsigned long scan) { unsigned long min, low, usage; mem_cgroup_protection(sc->target_mem_cgroup, memcg, &min, &low, &usage); if (min || low) { /* * Scale a cgroup's reclaim pressure by proportioning * its current usage to its memory.low or memory.min * setting. * * This is important, as otherwise scanning aggression * becomes extremely binary -- from nothing as we * approach the memory protection threshold, to totally * nominal as we exceed it. This results in requiring * setting extremely liberal protection thresholds. It * also means we simply get no protection at all if we * set it too low, which is not ideal. * * If there is any protection in place, we reduce scan * pressure by how much of the total memory used is * within protection thresholds. * * There is one special case: in the first reclaim pass, * we skip over all groups that are within their low * protection. If that fails to reclaim enough pages to * satisfy the reclaim goal, we come back and override * the best-effort low protection. However, we still * ideally want to honor how well-behaved groups are in * that case instead of simply punishing them all * equally. As such, we reclaim them based on how much * memory they are using, reducing the scan pressure * again by how much of the total memory used is under * hard protection. 
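		 *
		 * As a worked example of the scaling below: with usage at
		 * 200MB and an effective protection of 150MB, about three
		 * quarters of the scan target is subtracted, leaving roughly
		 * a quarter of the normal pressure.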
*/ unsigned long protection; /* memory.low scaling, make sure we retry before OOM */ if (!sc->memcg_low_reclaim && low > min) { protection = low; sc->memcg_low_skipped = 1; } else { protection = min; } /* Avoid TOCTOU with earlier protection check */ usage = max(usage, protection); scan -= scan * protection / (usage + 1); /* * Minimally target SWAP_CLUSTER_MAX pages to keep * reclaim moving forwards, avoiding decrementing * sc->priority further than desirable. */ scan = max(scan, SWAP_CLUSTER_MAX); } return scan; } /* * Determine how aggressively the anon and file LRU lists should be * scanned. * * nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan * nr[2] = file inactive folios to scan; nr[3] = file active folios to scan */ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long *nr) { struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct mem_cgroup *memcg = lruvec_memcg(lruvec); int swappiness = sc_swappiness(sc, memcg); u64 fraction[ANON_AND_FILE]; u64 denominator = 0; /* gcc */ enum scan_balance scan_balance; enum lru_list lru; /* If we have no swap space, do not bother scanning anon folios. */ if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) { scan_balance = SCAN_FILE; goto out; } /* * Global reclaim will swap to prevent OOM even with no * swappiness, but memcg users want to use this knob to * disable swapping for individual groups completely when * using the memory controller's swap limit feature would be * too expensive. */ if (cgroup_reclaim(sc) && !swappiness) { scan_balance = SCAN_FILE; goto out; } /* Proactive reclaim initiated by userspace for anonymous memory only */ if (swappiness == SWAPPINESS_ANON_ONLY) { WARN_ON_ONCE(!sc->proactive); scan_balance = SCAN_ANON; goto out; } /* * Do not apply any pressure balancing cleverness when the * system is close to OOM, scan both anon and file equally * (unless the swappiness setting disagrees with swapping). */ if (!sc->priority && swappiness) { scan_balance = SCAN_EQUAL; goto out; } /* * If the system is almost out of file pages, force-scan anon. */ if (sc->file_is_tiny) { scan_balance = SCAN_ANON; goto out; } /* * If there is enough inactive page cache, we do not reclaim * anything from the anonymous working right now to make sure * a streaming file access pattern doesn't cause swapping. */ if (sc->cache_trim_mode) { scan_balance = SCAN_FILE; goto out; } scan_balance = SCAN_FRACT; calculate_pressure_balance(sc, swappiness, fraction, &denominator); out: for_each_evictable_lru(lru) { bool file = is_file_lru(lru); unsigned long lruvec_size; unsigned long scan; lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); scan = apply_proportional_protection(memcg, sc, lruvec_size); scan >>= sc->priority; /* * If the cgroup's already been deleted, make sure to * scrape out the remaining cache. */ if (!scan && !mem_cgroup_online(memcg)) scan = min(lruvec_size, SWAP_CLUSTER_MAX); switch (scan_balance) { case SCAN_EQUAL: /* Scan lists relative to size */ break; case SCAN_FRACT: /* * Scan types proportional to swappiness and * their relative recent reclaim efficiency. * Make sure we don't miss the last page on * the offlined memory cgroups because of a * round-off error. */ scan = mem_cgroup_online(memcg) ? 
div64_u64(scan * fraction[file], denominator) : DIV64_U64_ROUND_UP(scan * fraction[file], denominator); break; case SCAN_FILE: case SCAN_ANON: /* Scan one type exclusively */ if ((scan_balance == SCAN_FILE) != file) scan = 0; break; default: /* Look ma, no brain */ BUG(); } nr[lru] = scan; } } /* * Anonymous LRU management is a waste if there is * ultimately no way to reclaim the memory. */ static bool can_age_anon_pages(struct lruvec *lruvec, struct scan_control *sc) { /* Aging the anon LRU is valuable if swap is present: */ if (total_swap_pages > 0) return true; /* Also valuable if anon pages can be demoted: */ return can_demote(lruvec_pgdat(lruvec)->node_id, sc, lruvec_memcg(lruvec)); } #ifdef CONFIG_LRU_GEN DEFINE_STATIC_KEY_FALSE(lru_switch); #ifdef CONFIG_LRU_GEN_ENABLED DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS); #define get_cap(cap) static_branch_likely(&lru_gen_caps[cap]) #else DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); #define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap]) #endif static bool should_walk_mmu(void) { return arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK); } static bool should_clear_pmd_young(void) { return arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG); } /****************************************************************************** * shorthand helpers ******************************************************************************/ #define DEFINE_MAX_SEQ(lruvec) \ unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq) #define DEFINE_MIN_SEQ(lruvec) \ unsigned long min_seq[ANON_AND_FILE] = { \ READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \ READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \ } /* Get the min/max evictable type based on swappiness */ #define min_type(swappiness) (!(swappiness)) #define max_type(swappiness) ((swappiness) < SWAPPINESS_ANON_ONLY) #define evictable_min_seq(min_seq, swappiness) \ min((min_seq)[min_type(swappiness)], (min_seq)[max_type(swappiness)]) #define for_each_gen_type_zone(gen, type, zone) \ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) #define for_each_evictable_type(type, swappiness) \ for ((type) = min_type(swappiness); (type) <= max_type(swappiness); (type)++) #define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS) #define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS) static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) { struct pglist_data *pgdat = NODE_DATA(nid); #ifdef CONFIG_MEMCG if (memcg) { struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; /* see the comment in mem_cgroup_lruvec() */ if (!lruvec->pgdat) lruvec->pgdat = pgdat; return lruvec; } #endif VM_WARN_ON_ONCE(!mem_cgroup_disabled()); return &pgdat->__lruvec; } static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) { struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); if (!sc->may_swap) return 0; if (!can_demote(pgdat->node_id, sc, memcg) && mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) return 0; return sc_swappiness(sc, memcg); } static int get_nr_gens(struct lruvec *lruvec, int type) { return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1; } static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) { int type; for (type = 0; type < ANON_AND_FILE; type++) { int n = get_nr_gens(lruvec, type); if (n < MIN_NR_GENS || n > MAX_NR_GENS) return false; } return true; } 
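
/*
 * Illustrative readings of the helpers above (derived directly from their
 * definitions): swappiness == 0 makes only the file type evictable,
 * swappiness == SWAPPINESS_ANON_ONLY makes only the anon type evictable, and
 * any value in between covers both types. Likewise, get_nr_gens() is simply
 * max_seq - min_seq + 1, e.g. max_seq == 10 and min_seq == 7 means four
 * generations for that type, which seq_is_valid() accepts as long as the
 * count lies within [MIN_NR_GENS, MAX_NR_GENS].
 */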
/******************************************************************************
 *                          Bloom filters
 ******************************************************************************/

/*
 * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
 * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
 * bits in a bitmap, k is the number of hash functions and n is the number of
 * inserted items.
 *
 * Page table walkers use one of the two filters to reduce their search space.
 * To get rid of non-leaf entries that no longer have enough leaf entries, the
 * aging uses the double-buffering technique to flip to the other filter each
 * time it produces a new generation. For non-leaf entries that have enough
 * leaf entries, the aging carries them over to the next generation in
 * walk_pmd_range(); the eviction also reports them when walking the rmap
 * in lru_gen_look_around().
 *
 * For future optimizations:
 * 1. It's not necessary to keep both filters all the time. The spare one can
 *    be freed after the RCU grace period and reallocated if needed again.
 * 2. And when reallocating, it's worth scaling its size according to the
 *    number of inserted entries in the other filter, to reduce the memory
 *    overhead on small systems and false positives on large systems.
 * 3. Jenkins' hash function is an alternative to Knuth's.
 */
#define BLOOM_FILTER_SHIFT	15

static inline int filter_gen_from_seq(unsigned long seq)
{
	return seq % NR_BLOOM_FILTERS;
}

static void get_item_key(void *item, int *key)
{
	u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);

	BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));

	key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
	key[1] = hash >> BLOOM_FILTER_SHIFT;
}

static bool test_bloom_filter(struct lru_gen_mm_state *mm_state,
			      unsigned long seq, void *item)
{
	int key[2];
	unsigned long *filter;
	int gen = filter_gen_from_seq(seq);

	filter = READ_ONCE(mm_state->filters[gen]);
	if (!filter)
		return true;

	get_item_key(item, key);

	return test_bit(key[0], filter) && test_bit(key[1], filter);
}

static void update_bloom_filter(struct lru_gen_mm_state *mm_state,
				unsigned long seq, void *item)
{
	int key[2];
	unsigned long *filter;
	int gen = filter_gen_from_seq(seq);

	filter = READ_ONCE(mm_state->filters[gen]);
	if (!filter)
		return;

	get_item_key(item, key);

	if (!test_bit(key[0], filter))
		set_bit(key[0], filter);
	if (!test_bit(key[1], filter))
		set_bit(key[1], filter);
}

static void reset_bloom_filter(struct lru_gen_mm_state *mm_state,
			       unsigned long seq)
{
	unsigned long *filter;
	int gen = filter_gen_from_seq(seq);

	filter = mm_state->filters[gen];
	if (filter) {
		bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
		return;
	}

	filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
			       __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
	WRITE_ONCE(mm_state->filters[gen], filter);
}

/******************************************************************************
 *                          mm_struct list
 ******************************************************************************/

#ifdef CONFIG_LRU_GEN_WALKS_MMU

static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
{
	static struct lru_gen_mm_list mm_list = {
		.fifo = LIST_HEAD_INIT(mm_list.fifo),
		.lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
	};

#ifdef CONFIG_MEMCG
	if (memcg)
		return &memcg->mm_list;
#endif
	VM_WARN_ON_ONCE(!mem_cgroup_disabled());

	return &mm_list;
}

static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec)
{
	return &lruvec->mm_state;
}

static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk)
{
	int key;
	struct mm_struct *mm;
	struct
pglist_data *pgdat = lruvec_pgdat(walk->lruvec); struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec); mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap); if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap)) return NULL; clear_bit(key, &mm->lru_gen.bitmap); mmgrab(mm); return mm; } void lru_gen_add_mm(struct mm_struct *mm) { int nid; struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); struct lru_gen_mm_list *mm_list = get_mm_list(memcg); VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list)); #ifdef CONFIG_MEMCG VM_WARN_ON_ONCE(mm->lru_gen.memcg); mm->lru_gen.memcg = memcg; #endif spin_lock(&mm_list->lock); for_each_node_state(nid, N_MEMORY) { struct lruvec *lruvec = get_lruvec(memcg, nid); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); /* the first addition since the last iteration */ if (mm_state->tail == &mm_list->fifo) mm_state->tail = &mm->lru_gen.list; } list_add_tail(&mm->lru_gen.list, &mm_list->fifo); spin_unlock(&mm_list->lock); } void lru_gen_del_mm(struct mm_struct *mm) { int nid; struct lru_gen_mm_list *mm_list; struct mem_cgroup *memcg = NULL; if (list_empty(&mm->lru_gen.list)) return; #ifdef CONFIG_MEMCG memcg = mm->lru_gen.memcg; #endif mm_list = get_mm_list(memcg); spin_lock(&mm_list->lock); for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); /* where the current iteration continues after */ if (mm_state->head == &mm->lru_gen.list) mm_state->head = mm_state->head->prev; /* where the last iteration ended before */ if (mm_state->tail == &mm->lru_gen.list) mm_state->tail = mm_state->tail->next; } list_del_init(&mm->lru_gen.list); spin_unlock(&mm_list->lock); #ifdef CONFIG_MEMCG mem_cgroup_put(mm->lru_gen.memcg); mm->lru_gen.memcg = NULL; #endif } #ifdef CONFIG_MEMCG void lru_gen_migrate_mm(struct mm_struct *mm) { struct mem_cgroup *memcg; struct task_struct *task = rcu_dereference_protected(mm->owner, true); VM_WARN_ON_ONCE(task->mm != mm); lockdep_assert_held(&task->alloc_lock); /* for mm_update_next_owner() */ if (mem_cgroup_disabled()) return; /* migration can happen before addition */ if (!mm->lru_gen.memcg) return; rcu_read_lock(); memcg = mem_cgroup_from_task(task); rcu_read_unlock(); if (memcg == mm->lru_gen.memcg) return; VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list)); lru_gen_del_mm(mm); lru_gen_add_mm(mm); } #endif #else /* !CONFIG_LRU_GEN_WALKS_MMU */ static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) { return NULL; } static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec) { return NULL; } static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk) { return NULL; } #endif static void reset_mm_stats(struct lru_gen_mm_walk *walk, bool last) { int i; int hist; struct lruvec *lruvec = walk->lruvec; struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); hist = lru_hist_from_seq(walk->seq); for (i = 0; i < NR_MM_STATS; i++) { WRITE_ONCE(mm_state->stats[hist][i], mm_state->stats[hist][i] + walk->mm_stats[i]); walk->mm_stats[i] = 0; } if (NR_HIST_GENS > 1 && last) { hist = lru_hist_from_seq(walk->seq + 1); for (i = 0; i < NR_MM_STATS; i++) WRITE_ONCE(mm_state->stats[hist][i], 0); } } static bool iterate_mm_list(struct lru_gen_mm_walk *walk, struct mm_struct **iter) { bool first = false; bool last = false; struct mm_struct *mm = NULL; struct lruvec *lruvec = walk->lruvec; struct mem_cgroup *memcg = 
lruvec_memcg(lruvec); struct lru_gen_mm_list *mm_list = get_mm_list(memcg); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); /* * mm_state->seq is incremented after each iteration of mm_list. There * are three interesting cases for this page table walker: * 1. It tries to start a new iteration with a stale max_seq: there is * nothing left to do. * 2. It started the next iteration: it needs to reset the Bloom filter * so that a fresh set of PTE tables can be recorded. * 3. It ended the current iteration: it needs to reset the mm stats * counters and tell its caller to increment max_seq. */ spin_lock(&mm_list->lock); VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->seq); if (walk->seq <= mm_state->seq) goto done; if (!mm_state->head) mm_state->head = &mm_list->fifo; if (mm_state->head == &mm_list->fifo) first = true; do { mm_state->head = mm_state->head->next; if (mm_state->head == &mm_list->fifo) { WRITE_ONCE(mm_state->seq, mm_state->seq + 1); last = true; break; } /* force scan for those added after the last iteration */ if (!mm_state->tail || mm_state->tail == mm_state->head) { mm_state->tail = mm_state->head->next; walk->force_scan = true; } } while (!(mm = get_next_mm(walk))); done: if (*iter || last) reset_mm_stats(walk, last); spin_unlock(&mm_list->lock); if (mm && first) reset_bloom_filter(mm_state, walk->seq + 1); if (*iter) mmdrop(*iter); *iter = mm; return last; } static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long seq) { bool success = false; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct lru_gen_mm_list *mm_list = get_mm_list(memcg); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); spin_lock(&mm_list->lock); VM_WARN_ON_ONCE(mm_state->seq + 1 < seq); if (seq > mm_state->seq) { mm_state->head = NULL; mm_state->tail = NULL; WRITE_ONCE(mm_state->seq, mm_state->seq + 1); success = true; } spin_unlock(&mm_list->lock); return success; } /****************************************************************************** * PID controller ******************************************************************************/ /* * A feedback loop based on Proportional-Integral-Derivative (PID) controller. * * The P term is refaulted/(evicted+protected) from a tier in the generation * currently being evicted; the I term is the exponential moving average of the * P term over the generations previously evicted, using the smoothing factor * 1/2; the D term isn't supported. * * The setpoint (SP) is always the first tier of one type; the process variable * (PV) is either any tier of the other type or any other tier of the same * type. * * The error is the difference between the SP and the PV; the correction is to * turn off protection when SP>PV or turn on protection when SP<PV. * * For future optimizations: * 1. The D term may discount the other two terms over time so that long-lived * generations can resist stale information. 
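 *
 * As a rough illustration of the error term above: if the SP tier refaulted
 * 100 of 1,000 evicted folios (10%) while the PV tier refaulted 300 of 1,000
 * (30%), then SP < PV and the correction is to turn on protection.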
*/ struct ctrl_pos { unsigned long refaulted; unsigned long total; int gain; }; static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, struct ctrl_pos *pos) { int i; struct lru_gen_folio *lrugen = &lruvec->lrugen; int hist = lru_hist_from_seq(lrugen->min_seq[type]); pos->gain = gain; pos->refaulted = pos->total = 0; for (i = tier % MAX_NR_TIERS; i <= min(tier, MAX_NR_TIERS - 1); i++) { pos->refaulted += lrugen->avg_refaulted[type][i] + atomic_long_read(&lrugen->refaulted[hist][type][i]); pos->total += lrugen->avg_total[type][i] + lrugen->protected[hist][type][i] + atomic_long_read(&lrugen->evicted[hist][type][i]); } } static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) { int hist, tier; struct lru_gen_folio *lrugen = &lruvec->lrugen; bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1; lockdep_assert_held(&lruvec->lru_lock); if (!carryover && !clear) return; hist = lru_hist_from_seq(seq); for (tier = 0; tier < MAX_NR_TIERS; tier++) { if (carryover) { unsigned long sum; sum = lrugen->avg_refaulted[type][tier] + atomic_long_read(&lrugen->refaulted[hist][type][tier]); WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); sum = lrugen->avg_total[type][tier] + lrugen->protected[hist][type][tier] + atomic_long_read(&lrugen->evicted[hist][type][tier]); WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); } if (clear) { atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); atomic_long_set(&lrugen->evicted[hist][type][tier], 0); WRITE_ONCE(lrugen->protected[hist][type][tier], 0); } } } static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) { /* * Return true if the PV has a limited number of refaults or a lower * refaulted/total than the SP. */ return pv->refaulted < MIN_LRU_BATCH || pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <= (sp->refaulted + 1) * pv->total * pv->gain; } /****************************************************************************** * the aging ******************************************************************************/ /* promote pages accessed through page tables */ static int folio_update_gen(struct folio *folio, int gen) { unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f); VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced)); return -1; } do { /* lru_gen_del_folio() has isolated this page? */ if (!(old_flags & LRU_GEN_MASK)) return -1; new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS); new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset); } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags)); return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } /* protect pages accessed multiple times through file descriptors */ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { int type = folio_is_file_lru(folio); struct lru_gen_folio *lrugen = &lruvec->lrugen; int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f); VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio); do { new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; /* folio_update_gen() has promoted this page? 
*/ if (new_gen >= 0 && new_gen != old_gen) return new_gen; new_gen = (old_gen + 1) % MAX_NR_GENS; new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS); new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; /* for folio_end_writeback() */ if (reclaiming) new_flags |= BIT(PG_reclaim); } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags)); lru_gen_update_size(lruvec, folio, old_gen, new_gen); return new_gen; } static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio, int old_gen, int new_gen) { int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS); walk->batched++; walk->nr_pages[old_gen][type][zone] -= delta; walk->nr_pages[new_gen][type][zone] += delta; } static void reset_batch_size(struct lru_gen_mm_walk *walk) { int gen, type, zone; struct lruvec *lruvec = walk->lruvec; struct lru_gen_folio *lrugen = &lruvec->lrugen; walk->batched = 0; for_each_gen_type_zone(gen, type, zone) { enum lru_list lru = type * LRU_INACTIVE_FILE; int delta = walk->nr_pages[gen][type][zone]; if (!delta) continue; walk->nr_pages[gen][type][zone] = 0; WRITE_ONCE(lrugen->nr_pages[gen][type][zone], lrugen->nr_pages[gen][type][zone] + delta); if (lru_gen_is_active(lruvec, gen)) lru += LRU_ACTIVE; __update_lru_size(lruvec, lru, zone, delta); } } static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args) { struct address_space *mapping; struct vm_area_struct *vma = args->vma; struct lru_gen_mm_walk *walk = args->private; if (!vma_is_accessible(vma)) return true; if (is_vm_hugetlb_page(vma)) return true; if (!vma_has_recency(vma)) return true; if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) return true; if (vma == get_gate_vma(vma->vm_mm)) return true; if (vma_is_anonymous(vma)) return !walk->swappiness; if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) return true; mapping = vma->vm_file->f_mapping; if (mapping_unevictable(mapping)) return true; if (shmem_mapping(mapping)) return !walk->swappiness; if (walk->swappiness > MAX_SWAPPINESS) return true; /* to exclude special mappings like dax, etc. */ return !mapping->a_ops->read_folio; } /* * Some userspace memory allocators map many single-page VMAs. Instead of * returning back to the PGD table for each of such VMAs, finish an entire PMD * table to reduce zigzags and improve cache performance. 
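 *
 * For example, when called with PMD_MASK and PAGE_SIZE (as walk_pte_range()
 * does), get_next_vma() stays within the current PMD-aligned window (one PTE
 * table, i.e. 512 entries with 4KiB pages) and only advances to the next
 * suitable VMA inside that window.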
*/ static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args, unsigned long *vm_start, unsigned long *vm_end) { unsigned long start = round_up(*vm_end, size); unsigned long end = (start | ~mask) + 1; VMA_ITERATOR(vmi, args->mm, start); VM_WARN_ON_ONCE(mask & size); VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask)); for_each_vma(vmi, args->vma) { if (end && end <= args->vma->vm_start) return false; if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) continue; *vm_start = max(start, args->vma->vm_start); *vm_end = min(end - 1, args->vma->vm_end - 1) + 1; return true; } return false; } static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr, struct pglist_data *pgdat) { unsigned long pfn = pte_pfn(pte); VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); if (!pte_present(pte) || is_zero_pfn(pfn)) return -1; if (WARN_ON_ONCE(pte_special(pte))) return -1; if (!pte_young(pte) && !mm_has_notifiers(vma->vm_mm)) return -1; if (WARN_ON_ONCE(!pfn_valid(pfn))) return -1; if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) return -1; return pfn; } static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr, struct pglist_data *pgdat) { unsigned long pfn = pmd_pfn(pmd); VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); if (!pmd_present(pmd) || is_huge_zero_pmd(pmd)) return -1; if (!pmd_young(pmd) && !mm_has_notifiers(vma->vm_mm)) return -1; if (WARN_ON_ONCE(!pfn_valid(pfn))) return -1; if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) return -1; return pfn; } static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, struct pglist_data *pgdat) { struct folio *folio = pfn_folio(pfn); if (folio_lru_gen(folio) < 0) return NULL; if (folio_nid(folio) != pgdat->node_id) return NULL; rcu_read_lock(); if (folio_memcg(folio) != memcg) folio = NULL; rcu_read_unlock(); return folio; } static bool suitable_to_scan(int total, int young) { int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8); /* suitable if the average number of young PTEs per cacheline is >=1 */ return young * n >= total; } static void walk_update_folio(struct lru_gen_mm_walk *walk, struct folio *folio, int new_gen, bool dirty) { int old_gen; if (!folio) return; if (dirty && !folio_test_dirty(folio) && !(folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio))) folio_mark_dirty(folio); if (walk) { old_gen = folio_update_gen(folio, new_gen); if (old_gen >= 0 && old_gen != new_gen) update_batch_size(walk, folio, old_gen, new_gen); } else if (lru_gen_set_refs(folio)) { old_gen = folio_lru_gen(folio); if (old_gen >= 0 && old_gen != new_gen) folio_activate(folio); } } static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *args) { int i; bool dirty; pte_t *pte; spinlock_t *ptl; unsigned long addr; int total = 0; int young = 0; struct folio *last = NULL; struct lru_gen_mm_walk *walk = args->private; struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); DEFINE_MAX_SEQ(walk->lruvec); int gen = lru_gen_from_seq(max_seq); unsigned int nr; pmd_t pmdval; pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, &ptl); if (!pte) return false; if (!spin_trylock(ptl)) { pte_unmap(pte); return true; } if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) { pte_unmap_unlock(pte, ptl); return false; } lazy_mmu_mode_enable(); restart: for 
(i = pte_index(start), addr = start; addr != end; i += nr, addr += nr * PAGE_SIZE) { unsigned long pfn; struct folio *folio; pte_t *cur_pte = pte + i; pte_t ptent = ptep_get(cur_pte); nr = 1; total++; walk->mm_stats[MM_LEAF_TOTAL]++; pfn = get_pte_pfn(ptent, args->vma, addr, pgdat); if (pfn == -1) continue; folio = get_pfn_folio(pfn, memcg, pgdat); if (!folio) continue; if (folio_test_large(folio)) { const unsigned int max_nr = (end - addr) >> PAGE_SHIFT; nr = folio_pte_batch_flags(folio, NULL, cur_pte, &ptent, max_nr, FPB_MERGE_YOUNG_DIRTY); total += nr - 1; walk->mm_stats[MM_LEAF_TOTAL] += nr - 1; } if (!test_and_clear_young_ptes_notify(args->vma, addr, cur_pte, nr)) continue; if (last != folio) { walk_update_folio(walk, last, gen, dirty); last = folio; dirty = false; } if (pte_dirty(ptent)) dirty = true; young += nr; walk->mm_stats[MM_LEAF_YOUNG] += nr; } walk_update_folio(walk, last, gen, dirty); last = NULL; if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) goto restart; lazy_mmu_mode_disable(); pte_unmap_unlock(pte, ptl); return suitable_to_scan(total, young); } static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, struct mm_walk *args, unsigned long *bitmap, unsigned long *first) { int i; bool dirty; pmd_t *pmd; spinlock_t *ptl; struct folio *last = NULL; struct lru_gen_mm_walk *walk = args->private; struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); DEFINE_MAX_SEQ(walk->lruvec); int gen = lru_gen_from_seq(max_seq); VM_WARN_ON_ONCE(pud_leaf(*pud)); /* try to batch at most 1+MIN_LRU_BATCH+1 entries */ if (*first == -1) { *first = addr; bitmap_zero(bitmap, MIN_LRU_BATCH); return; } i = addr == -1 ? 0 : pmd_index(addr) - pmd_index(*first); if (i && i <= MIN_LRU_BATCH) { __set_bit(i - 1, bitmap); return; } pmd = pmd_offset(pud, *first); ptl = pmd_lockptr(args->mm, pmd); if (!spin_trylock(ptl)) goto done; lazy_mmu_mode_enable(); do { unsigned long pfn; struct folio *folio; /* don't round down the first address */ addr = i ? (*first & PMD_MASK) + i * PMD_SIZE : *first; if (!pmd_present(pmd[i])) goto next; if (!pmd_trans_huge(pmd[i])) { if (!walk->force_scan && should_clear_pmd_young() && !mm_has_notifiers(args->mm)) pmdp_test_and_clear_young(vma, addr, pmd + i); goto next; } pfn = get_pmd_pfn(pmd[i], vma, addr, pgdat); if (pfn == -1) goto next; folio = get_pfn_folio(pfn, memcg, pgdat); if (!folio) goto next; if (!pmdp_test_and_clear_young_notify(vma, addr, pmd + i)) goto next; if (last != folio) { walk_update_folio(walk, last, gen, dirty); last = folio; dirty = false; } if (pmd_dirty(pmd[i])) dirty = true; walk->mm_stats[MM_LEAF_YOUNG]++; next: i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1; } while (i <= MIN_LRU_BATCH); walk_update_folio(walk, last, gen, dirty); lazy_mmu_mode_disable(); spin_unlock(ptl); done: *first = -1; } static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, struct mm_walk *args) { int i; pmd_t *pmd; unsigned long next; unsigned long addr; struct vm_area_struct *vma; DECLARE_BITMAP(bitmap, MIN_LRU_BATCH); unsigned long first = -1; struct lru_gen_mm_walk *walk = args->private; struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec); VM_WARN_ON_ONCE(pud_leaf(*pud)); /* * Finish an entire PMD in two passes: the first only reaches to PTE * tables to avoid taking the PMD lock; the second, if necessary, takes * the PMD lock to clear the accessed bit in PMD entries. 
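	 * The second pass is batched: walk_pmd_range_locked() collects up to
	 * MIN_LRU_BATCH PMD entries in a bitmap and processes them in one go
	 * under the PMD lock.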
*/ pmd = pmd_offset(pud, start & PUD_MASK); restart: /* walk_pte_range() may call get_next_vma() */ vma = args->vma; for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) { pmd_t val = pmdp_get_lockless(pmd + i); next = pmd_addr_end(addr, end); if (!pmd_present(val) || is_huge_zero_pmd(val)) { walk->mm_stats[MM_LEAF_TOTAL]++; continue; } if (pmd_trans_huge(val)) { struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); unsigned long pfn = get_pmd_pfn(val, vma, addr, pgdat); walk->mm_stats[MM_LEAF_TOTAL]++; if (pfn != -1) walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); continue; } if (!walk->force_scan && should_clear_pmd_young() && !mm_has_notifiers(args->mm)) { if (!pmd_young(val)) continue; walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); } if (!walk->force_scan && !test_bloom_filter(mm_state, walk->seq, pmd + i)) continue; walk->mm_stats[MM_NONLEAF_FOUND]++; if (!walk_pte_range(&val, addr, next, args)) continue; walk->mm_stats[MM_NONLEAF_ADDED]++; /* carry over to the next generation */ update_bloom_filter(mm_state, walk->seq + 1, pmd + i); } walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first); if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end)) goto restart; } static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, struct mm_walk *args) { int i; pud_t *pud; unsigned long addr; unsigned long next; struct lru_gen_mm_walk *walk = args->private; VM_WARN_ON_ONCE(p4d_leaf(*p4d)); pud = pud_offset(p4d, start & P4D_MASK); restart: for (i = pud_index(start), addr = start; addr != end; i++, addr = next) { pud_t val = pudp_get(pud + i); next = pud_addr_end(addr, end); if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val))) continue; walk_pmd_range(&val, addr, next, args); if (need_resched() || walk->batched >= MAX_LRU_BATCH) { end = (addr | ~PUD_MASK) + 1; goto done; } } if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end)) goto restart; end = round_up(end, P4D_SIZE); done: if (!end || !args->vma) return 1; walk->next_addr = max(end, args->vma->vm_start); return -EAGAIN; } static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) { static const struct mm_walk_ops mm_walk_ops = { .test_walk = should_skip_vma, .p4d_entry = walk_pud_range, .walk_lock = PGWALK_RDLOCK, }; int err; struct lruvec *lruvec = walk->lruvec; walk->next_addr = FIRST_USER_ADDRESS; do { DEFINE_MAX_SEQ(lruvec); err = -EBUSY; /* another thread might have called inc_max_seq() */ if (walk->seq != max_seq) break; /* the caller might be holding the lock for write */ if (mmap_read_trylock(mm)) { err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk); mmap_read_unlock(mm); } if (walk->batched) { lruvec_lock_irq(lruvec); reset_batch_size(walk); lruvec_unlock_irq(lruvec); } cond_resched(); } while (err == -EAGAIN); } static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc) { struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; if (pgdat && current_is_kswapd()) { VM_WARN_ON_ONCE(walk); walk = &pgdat->mm_walk; } else if (!walk && force_alloc) { VM_WARN_ON_ONCE(current_is_kswapd()); walk = kzalloc_obj(*walk, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); } current->reclaim_state->mm_walk = walk; return walk; } static void clear_mm_walk(void) { struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages))); VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, 
sizeof(walk->mm_stats))); current->reclaim_state->mm_walk = NULL; if (!current_is_kswapd()) kfree(walk); } static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness) { int zone; int remaining = MAX_LRU_BATCH; struct lru_gen_folio *lrugen = &lruvec->lrugen; int hist = lru_hist_from_seq(lrugen->min_seq[type]); int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); /* For file type, skip the check if swappiness is anon only */ if (type && (swappiness == SWAPPINESS_ANON_ONLY)) goto done; /* For anon type, skip the check if swappiness is zero (file only) */ if (!type && !swappiness) goto done; /* prevent cold/hot inversion if the type is evictable */ for (zone = 0; zone < MAX_NR_ZONES; zone++) { struct list_head *head = &lrugen->folios[old_gen][type][zone]; while (!list_empty(head)) { struct folio *folio = lru_to_folio(head); int refs = folio_lru_refs(folio); bool workingset = folio_test_workingset(folio); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); new_gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]); /* don't count the workingset being lazily promoted */ if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) { int tier = lru_tier_from_refs(refs, workingset); int delta = folio_nr_pages(folio); WRITE_ONCE(lrugen->protected[hist][type][tier], lrugen->protected[hist][type][tier] + delta); } if (!--remaining) return false; } } done: reset_ctrl_pos(lruvec, type, true); WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); return true; } static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) { int gen, type, zone; bool success = false; bool seq_inc_flag = false; struct lru_gen_folio *lrugen = &lruvec->lrugen; DEFINE_MIN_SEQ(lruvec); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); /* find the oldest populated generation */ for_each_evictable_type(type, swappiness) { while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) { gen = lru_gen_from_seq(min_seq[type]); for (zone = 0; zone < MAX_NR_ZONES; zone++) { if (!list_empty(&lrugen->folios[gen][type][zone])) goto next; } min_seq[type]++; seq_inc_flag = true; } next: ; } /* * If min_seq[type] of both anonymous and file is not increased, * we can directly return false to avoid unnecessary checking * overhead later. 
*/ if (!seq_inc_flag) return success; /* see the comment on lru_gen_folio */ if (swappiness && swappiness <= MAX_SWAPPINESS) { unsigned long seq = lrugen->max_seq - MIN_NR_GENS; if (min_seq[LRU_GEN_ANON] > seq && min_seq[LRU_GEN_FILE] < seq) min_seq[LRU_GEN_ANON] = seq; else if (min_seq[LRU_GEN_FILE] > seq && min_seq[LRU_GEN_ANON] < seq) min_seq[LRU_GEN_FILE] = seq; } for_each_evictable_type(type, swappiness) { if (min_seq[type] <= lrugen->min_seq[type]) continue; reset_ctrl_pos(lruvec, type, true); WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); success = true; } return success; } static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness) { bool success; int prev, next; int type, zone; struct lru_gen_folio *lrugen = &lruvec->lrugen; restart: if (seq < READ_ONCE(lrugen->max_seq)) return false; lruvec_lock_irq(lruvec); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); success = seq == lrugen->max_seq; if (!success) goto unlock; for (type = 0; type < ANON_AND_FILE; type++) { if (get_nr_gens(lruvec, type) != MAX_NR_GENS) continue; if (inc_min_seq(lruvec, type, swappiness)) continue; lruvec_unlock_irq(lruvec); cond_resched(); goto restart; } /* * Update the active/inactive LRU sizes for compatibility. Both sides of * the current max_seq need to be covered, since max_seq+1 can overlap * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do * overlap, cold/hot inversion happens. */ prev = lru_gen_from_seq(lrugen->max_seq - 1); next = lru_gen_from_seq(lrugen->max_seq + 1); for (type = 0; type < ANON_AND_FILE; type++) { for (zone = 0; zone < MAX_NR_ZONES; zone++) { enum lru_list lru = type * LRU_INACTIVE_FILE; long delta = lrugen->nr_pages[prev][type][zone] - lrugen->nr_pages[next][type][zone]; if (!delta) continue; __update_lru_size(lruvec, lru, zone, delta); __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); } } for (type = 0; type < ANON_AND_FILE; type++) reset_ctrl_pos(lruvec, type, false); WRITE_ONCE(lrugen->timestamps[next], jiffies); /* make sure preceding modifications appear */ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); unlock: lruvec_unlock_irq(lruvec); return success; } static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness, bool force_scan) { bool success; struct lru_gen_mm_walk *walk; struct mm_struct *mm = NULL; struct lru_gen_folio *lrugen = &lruvec->lrugen; struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq)); if (!mm_state) return inc_max_seq(lruvec, seq, swappiness); /* see the comment in iterate_mm_list() */ if (seq <= READ_ONCE(mm_state->seq)) return false; /* * If the hardware doesn't automatically set the accessed bit, fallback * to lru_gen_look_around(), which only clears the accessed bit in a * handful of PTEs. Spreading the work out over a period of time usually * is less efficient, but it avoids bursty page faults. 
*/ if (!should_walk_mmu()) { success = iterate_mm_list_nowalk(lruvec, seq); goto done; } walk = set_mm_walk(NULL, true); if (!walk) { success = iterate_mm_list_nowalk(lruvec, seq); goto done; } walk->lruvec = lruvec; walk->seq = seq; walk->swappiness = swappiness; walk->force_scan = force_scan; do { success = iterate_mm_list(walk, &mm); if (mm) walk_mm(mm, walk); } while (mm); done: if (success) { success = inc_max_seq(lruvec, seq, swappiness); WARN_ON_ONCE(!success); } return success; } /****************************************************************************** * working set protection ******************************************************************************/ static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc) { int priority; unsigned long reclaimable; if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) return; /* * Determine the initial priority based on * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, * where reclaimed_to_scanned_ratio = inactive / total. */ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); /* round down reclaimable and round up sc->nr_to_reclaim */ priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1); /* * The estimation is based on LRU pages only, so cap it to prevent * overshoots of shrinker objects by large margins. */ sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY); } static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) { int gen, type, zone; unsigned long total = 0; int swappiness = get_swappiness(lruvec, sc); struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); DEFINE_MIN_SEQ(lruvec); for_each_evictable_type(type, swappiness) { unsigned long seq; for (seq = min_seq[type]; seq <= max_seq; seq++) { gen = lru_gen_from_seq(seq); for (zone = 0; zone < MAX_NR_ZONES; zone++) total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); } } /* whether the size is big enough to be helpful */ return mem_cgroup_online(memcg) ? (total >> sc->priority) : total; } static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl) { int gen; unsigned long birth; int swappiness = get_swappiness(lruvec, sc); struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MIN_SEQ(lruvec); if (mem_cgroup_below_min(NULL, memcg)) return false; if (!lruvec_is_sizable(lruvec, sc)) return false; gen = lru_gen_from_seq(evictable_min_seq(min_seq, swappiness)); birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); return time_is_before_jiffies(birth + min_ttl); } /* to protect the working set of the last N jiffies */ static unsigned long lru_gen_min_ttl __read_mostly; static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { struct mem_cgroup *memcg; unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); bool reclaimable = !min_ttl; VM_WARN_ON_ONCE(!current_is_kswapd()); set_initial_priority(pgdat, sc); memcg = mem_cgroup_iter(NULL, NULL, NULL); do { struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); mem_cgroup_calculate_protection(NULL, memcg); if (!reclaimable) reclaimable = lruvec_is_reclaimable(lruvec, sc, min_ttl); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); /* * The main goal is to OOM kill if every generation from all memcgs is * younger than min_ttl. 
However, another possibility is all memcgs are * either too small or below min. */ if (!reclaimable && mutex_trylock(&oom_lock)) { struct oom_control oc = { .gfp_mask = sc->gfp_mask, }; out_of_memory(&oc); mutex_unlock(&oom_lock); } } /****************************************************************************** * rmap/PT walk feedback ******************************************************************************/ /* * This function exploits spatial locality when shrink_folio_list() walks the * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If * the scan was done cacheline efficiently, it adds the PMD entry pointing to * the PTE table to the Bloom filter. This forms a feedback loop between the * eviction and the aging. */ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr) { int i; bool dirty; unsigned long start; unsigned long end; struct lru_gen_mm_walk *walk; struct folio *last = NULL; int young = 1; pte_t *pte = pvmw->pte; unsigned long addr = pvmw->address; struct vm_area_struct *vma = pvmw->vma; struct folio *folio = pfn_folio(pvmw->pfn); struct mem_cgroup *memcg; struct pglist_data *pgdat = folio_pgdat(folio); struct lruvec *lruvec; struct lru_gen_mm_state *mm_state; unsigned long max_seq; int gen; lockdep_assert_held(pvmw->ptl); VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); if (!test_and_clear_young_ptes_notify(vma, addr, pte, nr)) return false; if (spin_is_contended(pvmw->ptl)) return true; /* exclude special VMAs containing anon pages from COW */ if (vma->vm_flags & VM_SPECIAL) return true; /* avoid taking the LRU lock under the PTL when possible */ walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; start = max(addr & PMD_MASK, vma->vm_start); end = min(addr | ~PMD_MASK, vma->vm_end - 1) + 1; if (end - start == PAGE_SIZE) return true; if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2) end = start + MIN_LRU_BATCH * PAGE_SIZE; else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2) start = end - MIN_LRU_BATCH * PAGE_SIZE; else { start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2; end = addr + MIN_LRU_BATCH * PAGE_SIZE / 2; } } memcg = get_mem_cgroup_from_folio(folio); lruvec = mem_cgroup_lruvec(memcg, pgdat); max_seq = READ_ONCE((lruvec)->lrugen.max_seq); gen = lru_gen_from_seq(max_seq); mm_state = get_mm_state(lruvec); lazy_mmu_mode_enable(); pte -= (addr - start) / PAGE_SIZE; for (i = 0, addr = start; addr != end; i += nr, pte += nr, addr += nr * PAGE_SIZE) { unsigned long pfn; pte_t ptent = ptep_get(pte); nr = 1; pfn = get_pte_pfn(ptent, vma, addr, pgdat); if (pfn == -1) continue; folio = get_pfn_folio(pfn, memcg, pgdat); if (!folio) continue; if (folio_test_large(folio)) { const unsigned int max_nr = (end - addr) >> PAGE_SHIFT; nr = folio_pte_batch_flags(folio, NULL, pte, &ptent, max_nr, FPB_MERGE_YOUNG_DIRTY); } if (!test_and_clear_young_ptes_notify(vma, addr, pte, nr)) continue; if (last != folio) { walk_update_folio(walk, last, gen, dirty); last = folio; dirty = false; } if (pte_dirty(ptent)) dirty = true; young += nr; } walk_update_folio(walk, last, gen, dirty); lazy_mmu_mode_disable(); /* feedback from rmap walkers to page table walkers */ if (mm_state && suitable_to_scan(i, young)) update_bloom_filter(mm_state, max_seq, pvmw->pmd); mem_cgroup_put(memcg); return true; } /****************************************************************************** * memcg LRU ******************************************************************************/ /* see the comment 
on MEMCG_NR_GENS */ enum { MEMCG_LRU_NOP, MEMCG_LRU_HEAD, MEMCG_LRU_TAIL, MEMCG_LRU_OLD, MEMCG_LRU_YOUNG, }; static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) { int seg; int old, new; unsigned long flags; int bin = get_random_u32_below(MEMCG_NR_BINS); struct pglist_data *pgdat = lruvec_pgdat(lruvec); spin_lock_irqsave(&pgdat->memcg_lru.lock, flags); VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); seg = 0; new = old = lruvec->lrugen.gen; /* see the comment on MEMCG_NR_GENS */ if (op == MEMCG_LRU_HEAD) seg = MEMCG_LRU_HEAD; else if (op == MEMCG_LRU_TAIL) seg = MEMCG_LRU_TAIL; else if (op == MEMCG_LRU_OLD) new = get_memcg_gen(pgdat->memcg_lru.seq); else if (op == MEMCG_LRU_YOUNG) new = get_memcg_gen(pgdat->memcg_lru.seq + 1); else VM_WARN_ON_ONCE(true); WRITE_ONCE(lruvec->lrugen.seg, seg); WRITE_ONCE(lruvec->lrugen.gen, new); hlist_nulls_del_rcu(&lruvec->lrugen.list); if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD) hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); else hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); pgdat->memcg_lru.nr_memcgs[old]--; pgdat->memcg_lru.nr_memcgs[new]++; if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); spin_unlock_irqrestore(&pgdat->memcg_lru.lock, flags); } #ifdef CONFIG_MEMCG void lru_gen_online_memcg(struct mem_cgroup *memcg) { int gen; int nid; int bin = get_random_u32_below(MEMCG_NR_BINS); for_each_node(nid) { struct pglist_data *pgdat = NODE_DATA(nid); struct lruvec *lruvec = get_lruvec(memcg, nid); spin_lock_irq(&pgdat->memcg_lru.lock); VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list)); gen = get_memcg_gen(pgdat->memcg_lru.seq); lruvec->lrugen.gen = gen; hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]); pgdat->memcg_lru.nr_memcgs[gen]++; spin_unlock_irq(&pgdat->memcg_lru.lock); } } void lru_gen_offline_memcg(struct mem_cgroup *memcg) { int nid; for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD); } } void lru_gen_release_memcg(struct mem_cgroup *memcg) { int gen; int nid; for_each_node(nid) { struct pglist_data *pgdat = NODE_DATA(nid); struct lruvec *lruvec = get_lruvec(memcg, nid); spin_lock_irq(&pgdat->memcg_lru.lock); if (hlist_nulls_unhashed(&lruvec->lrugen.list)) goto unlock; gen = lruvec->lrugen.gen; hlist_nulls_del_init_rcu(&lruvec->lrugen.list); pgdat->memcg_lru.nr_memcgs[gen]--; if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); unlock: spin_unlock_irq(&pgdat->memcg_lru.lock); } } void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); /* see the comment on MEMCG_NR_GENS */ if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_HEAD) lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); } bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg, int nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); int type; for (type = 0; type < ANON_AND_FILE; type++) { if (get_nr_gens(lruvec, type) != MAX_NR_GENS) return false; } return true; } static void try_to_inc_max_seq_nowalk(struct mem_cgroup *memcg, struct lruvec *lruvec) { struct lru_gen_mm_list *mm_list = get_mm_list(memcg); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); int swappiness = mem_cgroup_swappiness(memcg); DEFINE_MAX_SEQ(lruvec); bool success = false; /* * We 
are not iterating the mm_list here; updating mm_state->seq is just
 * to make mm walkers work properly.
 */
	if (mm_state) {
		spin_lock(&mm_list->lock);

		VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);

		if (max_seq > mm_state->seq) {
			WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
			success = true;
		}

		spin_unlock(&mm_list->lock);
	} else {
		success = true;
	}

	if (success)
		inc_max_seq(lruvec, max_seq, swappiness);
}

/*
 * We need to ensure that the folios of a child memcg can be reparented to the
 * same gen of the parent memcg, so the gens of the parent memcg need to be
 * incremented to MAX_NR_GENS before reparenting.
 */
void max_lru_gen_memcg(struct mem_cgroup *memcg, int nid)
{
	struct lruvec *lruvec = get_lruvec(memcg, nid);
	int type;

	for (type = 0; type < ANON_AND_FILE; type++) {
		while (get_nr_gens(lruvec, type) < MAX_NR_GENS) {
			try_to_inc_max_seq_nowalk(memcg, lruvec);
			cond_resched();
		}
	}
}

/*
 * Compared to the traditional LRU, MGLRU faces the following challenges:
 *
 * 1. Each lruvec has between MIN_NR_GENS and MAX_NR_GENS generations, and the
 *    number of generations of the parent and child memcg may differ, so we
 *    cannot simply transfer MGLRU folios in the child memcg to the parent
 *    memcg as we did for traditional LRU folios.
 * 2. The generation information is stored in folio->flags, but we cannot
 *    traverse these folios while holding the lru lock, otherwise it may cause
 *    a softlockup.
 * 3. In walk_update_folio(), the gen of a folio and the corresponding lru
 *    size may be updated, but the folio is not immediately moved to the
 *    corresponding lru list. Therefore, there may be folios of different
 *    generations on an LRU list.
 * 4. In lru_gen_del_folio(), the generation to which a folio belongs is found
 *    based on the generation information in folio->flags, and the
 *    corresponding LRU size will be updated. Therefore, we need to update the
 *    lru size correctly during reparenting, otherwise the lru size may be
 *    updated incorrectly in lru_gen_del_folio().
 *
 * Finally, we choose a compromise: splice the lru lists in the child memcg
 * onto the lru lists of the same generation in the parent memcg during
 * reparenting.
 *
 * The same generation has different meanings in the parent and child memcg,
 * so this compromise causes an LRU inversion problem. But as the system runs,
 * the problem fixes itself automatically.
 */
static void __lru_gen_reparent_memcg(struct lruvec *child_lruvec,
				     struct lruvec *parent_lruvec,
				     int zone, int type)
{
	struct lru_gen_folio *child_lrugen, *parent_lrugen;
	enum lru_list lru = type * LRU_INACTIVE_FILE;
	int i;

	child_lrugen = &child_lruvec->lrugen;
	parent_lrugen = &parent_lruvec->lrugen;

	for (i = 0; i < get_nr_gens(child_lruvec, type); i++) {
		int gen = lru_gen_from_seq(child_lrugen->max_seq - i);
		long nr_pages = child_lrugen->nr_pages[gen][type][zone];
		int child_lru_active = lru_gen_is_active(child_lruvec, gen) ?
				       LRU_ACTIVE : 0;
		int parent_lru_active = lru_gen_is_active(parent_lruvec, gen) ?
LRU_ACTIVE : 0; /* Assuming that child pages are colder than parent pages */ list_splice_tail_init(&child_lrugen->folios[gen][type][zone], &parent_lrugen->folios[gen][type][zone]); WRITE_ONCE(child_lrugen->nr_pages[gen][type][zone], 0); WRITE_ONCE(parent_lrugen->nr_pages[gen][type][zone], parent_lrugen->nr_pages[gen][type][zone] + nr_pages); if (lru_gen_is_active(child_lruvec, gen) != lru_gen_is_active(parent_lruvec, gen)) { __update_lru_size(child_lruvec, lru + child_lru_active, zone, -nr_pages); __update_lru_size(parent_lruvec, lru + parent_lru_active, zone, nr_pages); } } } void lru_gen_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid) { struct lruvec *child_lruvec, *parent_lruvec; int type, zid; struct zone *zone; enum lru_list lru; child_lruvec = get_lruvec(memcg, nid); parent_lruvec = get_lruvec(parent, nid); for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1) for (type = 0; type < ANON_AND_FILE; type++) __lru_gen_reparent_memcg(child_lruvec, parent_lruvec, zid, type); for_each_lru(lru) { for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1) { unsigned long size = mem_cgroup_get_zone_lru_size(child_lruvec, lru, zid); mem_cgroup_update_lru_size(parent_lruvec, lru, zid, size); } } } #endif /* CONFIG_MEMCG */ /****************************************************************************** * the eviction ******************************************************************************/ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc, int tier_idx) { bool success; bool dirty, writeback; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); bool workingset = folio_test_workingset(folio); int tier = lru_tier_from_refs(refs, workingset); struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); /* unevictable */ if (!folio_evictable(folio)) { success = lru_gen_del_folio(lruvec, folio, true); VM_WARN_ON_ONCE_FOLIO(!success, folio); folio_set_unevictable(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(UNEVICTABLE_PGCULLED, delta); return true; } /* promoted */ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { list_move(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } /* protected */ if (tier > tier_idx || refs + workingset == BIT(LRU_REFS_WIDTH) + 1) { gen = folio_inc_gen(lruvec, folio, false); list_move(&folio->lru, &lrugen->folios[gen][type][zone]); /* don't count the workingset being lazily promoted */ if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) { int hist = lru_hist_from_seq(lrugen->min_seq[type]); WRITE_ONCE(lrugen->protected[hist][type][tier], lrugen->protected[hist][type][tier] + delta); } return true; } /* ineligible */ if (zone > sc->reclaim_idx) { gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } dirty = folio_test_dirty(folio); writeback = folio_test_writeback(folio); if (type == LRU_GEN_FILE && dirty) { sc->nr.file_taken += delta; if (!writeback) sc->nr.unqueued_dirty += delta; } /* waiting for writeback */ if (writeback || (type == LRU_GEN_FILE && dirty)) { gen = folio_inc_gen(lruvec, folio, true); list_move(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } return false; } static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc) { bool success; /* swap constrained */ 
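/*
 * Illustrative sketch, added for exposition only: sort_folio() above applies
 * its checks in a fixed precedence, and this toy classifier mirrors that
 * ordering with plain flags so the control flow is easier to follow. All
 * names here are invented; folio state is reduced to a few booleans, the
 * workingset lazy-promotion term is dropped, and the dirty check is applied
 * unconditionally although the real code only defers dirty *file* folios.
 * Guarded with #if 0; extract to run standalone.
 */
#if 0
#include <stdio.h>

enum toy_sort_result {
	TOY_UNEVICTABLE,	/* moved to the unevictable list */
	TOY_PROMOTED,		/* already in a newer generation */
	TOY_PROTECTED,		/* hot tier, aged in place */
	TOY_INELIGIBLE,		/* zone above reclaim_idx */
	TOY_WRITEBACK,		/* dirty or under writeback, deferred */
	TOY_EVICT,		/* falls through to isolation */
};

struct toy_folio {
	int evictable;
	int in_oldest_gen;
	int tier, zone;
	int dirty, writeback;
};

static enum toy_sort_result toy_sort(const struct toy_folio *f,
				     int tier_idx, int reclaim_idx)
{
	if (!f->evictable)
		return TOY_UNEVICTABLE;
	if (!f->in_oldest_gen)
		return TOY_PROMOTED;
	if (f->tier > tier_idx)
		return TOY_PROTECTED;
	if (f->zone > reclaim_idx)
		return TOY_INELIGIBLE;
	if (f->writeback || f->dirty)
		return TOY_WRITEBACK;
	return TOY_EVICT;
}

int main(void)
{
	struct toy_folio cold = { .evictable = 1, .in_oldest_gen = 1 };
	struct toy_folio hot = { .evictable = 1, .in_oldest_gen = 1, .tier = 2 };

	printf("cold -> %d, hot(tier 2 vs tier_idx 1) -> %d\n",
	       toy_sort(&cold, 1, 3), toy_sort(&hot, 1, 3));
	return 0;
}
#endif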
if (!(sc->gfp_mask & __GFP_IO) && (folio_test_dirty(folio) || (folio_test_anon(folio) && !folio_test_swapcache(folio)))) return false; /* raced with release_pages() */ if (!folio_try_get(folio)) return false; /* raced with another isolation */ if (!folio_test_clear_lru(folio)) { folio_put(folio); return false; } /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) set_mask_bits(&folio->flags.f, LRU_REFS_MASK, 0); /* for shrink_folio_list() */ folio_clear_reclaim(folio); success = lru_gen_del_folio(lruvec, folio, true); VM_WARN_ON_ONCE_FOLIO(!success, folio); return true; } static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, int type, int tier, struct list_head *list) { int i; int gen; enum node_stat_item item; int sorted = 0; int scanned = 0; int isolated = 0; int skipped = 0; int scan_batch = min(nr_to_scan, MAX_LRU_BATCH); int remaining = scan_batch; struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE(!list_empty(list)); if (get_nr_gens(lruvec, type) == MIN_NR_GENS) return 0; gen = lru_gen_from_seq(lrugen->min_seq[type]); for (i = MAX_NR_ZONES; i > 0; i--) { LIST_HEAD(moved); int skipped_zone = 0; int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES; struct list_head *head = &lrugen->folios[gen][type][zone]; while (!list_empty(head)) { struct folio *folio = lru_to_folio(head); int delta = folio_nr_pages(folio); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); scanned += delta; if (sort_folio(lruvec, folio, sc, tier)) sorted += delta; else if (isolate_folio(lruvec, folio, sc)) { list_add(&folio->lru, list); isolated += delta; } else { list_move(&folio->lru, &moved); skipped_zone += delta; } if (!--remaining || max(isolated, skipped_zone) >= MIN_LRU_BATCH) break; } if (skipped_zone) { list_splice(&moved, head); __count_zid_vm_events(PGSCAN_SKIP, zone, skipped_zone); skipped += skipped_zone; } if (!remaining || isolated >= MIN_LRU_BATCH) break; } item = PGSCAN_KSWAPD + reclaimer_offset(sc); mod_lruvec_state(lruvec, item, isolated); mod_lruvec_state(lruvec, PGREFILL, sorted); mod_lruvec_state(lruvec, PGSCAN_ANON + type, isolated); trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, scan_batch, scanned, skipped, isolated, type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); if (type == LRU_GEN_FILE) sc->nr.file_taken += isolated; /* * There might not be eligible folios due to reclaim_idx. Check the * remaining to prevent livelock if it's not making progress. */ return isolated || !remaining ? scanned : 0; } static int get_tier_idx(struct lruvec *lruvec, int type) { int tier; struct ctrl_pos sp, pv = {}; /* * To leave a margin for fluctuations, use a larger gain factor (2:3). * This value is chosen because any other tier would have at least twice * as many refaults as the first tier. */ read_ctrl_pos(lruvec, type, 0, 2, &sp); for (tier = 1; tier < MAX_NR_TIERS; tier++) { read_ctrl_pos(lruvec, type, tier, 3, &pv); if (!positive_ctrl_err(&sp, &pv)) break; } return tier - 1; } static int get_type_to_scan(struct lruvec *lruvec, int swappiness) { struct ctrl_pos sp, pv = {}; if (swappiness <= MIN_SWAPPINESS + 1) return LRU_GEN_FILE; if (swappiness >= MAX_SWAPPINESS) return LRU_GEN_ANON; /* * Compare the sum of all tiers of anon with that of file to determine * which type to scan. 
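/*
 * Illustrative sketch, added for exposition only: get_tier_idx() above keeps
 * tiers eligible for eviction until it hits one whose refault rate, weighted
 * by the 2:3 gain, exceeds that of tier 0. read_ctrl_pos() and
 * positive_ctrl_err() are defined elsewhere in this file; the comparator
 * below is a simplified stand-in that drops their MIN_LRU_BATCH fudge terms,
 * and every name is invented for the example. Guarded with #if 0; extract to
 * run standalone.
 */
#if 0
#include <stdio.h>

struct toy_ctrl_pos {
	unsigned long refaulted;
	unsigned long total;
	unsigned long gain;
};

/*
 * True when pv's gain-weighted refault rate does not exceed sp's, i.e.
 * pv->refaulted / (pv->total * pv->gain) <= sp->refaulted / (sp->total * sp->gain),
 * written cross-multiplied to avoid division.
 */
static int toy_within_margin(const struct toy_ctrl_pos *sp,
			     const struct toy_ctrl_pos *pv)
{
	return pv->refaulted * sp->total * sp->gain <=
	       sp->refaulted * pv->total * pv->gain;
}

/* analogue of get_tier_idx(): protect tiers above the returned index */
static int toy_tier_idx(const struct toy_ctrl_pos *tiers, int nr_tiers)
{
	struct toy_ctrl_pos sp = tiers[0];
	int tier;

	sp.gain = 2;			/* the 2:3 margin from the comment above */
	for (tier = 1; tier < nr_tiers; tier++) {
		struct toy_ctrl_pos pv = tiers[tier];

		pv.gain = 3;
		if (!toy_within_margin(&sp, &pv))
			break;
	}
	return tier - 1;
}

int main(void)
{
	/* refaulted/evicted per tier: 1%, 0.1% and 50% respectively */
	struct toy_ctrl_pos tiers[] = {
		{ 10, 1000, 0 }, { 1, 1000, 0 }, { 500, 1000, 0 },
	};

	printf("evict up to tier %d, protect the rest\n", toy_tier_idx(tiers, 3));
	return 0;
}
#endif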
*/ read_ctrl_pos(lruvec, LRU_GEN_ANON, MAX_NR_TIERS, swappiness, &sp); read_ctrl_pos(lruvec, LRU_GEN_FILE, MAX_NR_TIERS, MAX_SWAPPINESS - swappiness, &pv); return positive_ctrl_err(&sp, &pv); } static int isolate_folios(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, int swappiness, int *type_scanned, struct list_head *list) { int i; int type = get_type_to_scan(lruvec, swappiness); for_each_evictable_type(i, swappiness) { int scanned; int tier = get_tier_idx(lruvec, type); *type_scanned = type; scanned = scan_folios(nr_to_scan, lruvec, sc, type, tier, list); if (scanned) return scanned; type = !type; } return 0; } static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, int swappiness) { int type; int scanned; int reclaimed; LIST_HEAD(list); LIST_HEAD(clean); struct folio *folio; struct folio *next; enum node_stat_item item; struct reclaim_stat stat; struct lru_gen_mm_walk *walk; bool skip_retry = false; struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); lruvec_lock_irq(lruvec); scanned = isolate_folios(nr_to_scan, lruvec, sc, swappiness, &type, &list); scanned += try_to_inc_min_seq(lruvec, swappiness); if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq) scanned = 0; lruvec_unlock_irq(lruvec); if (list_empty(&list)) return scanned; retry: reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, memcg); sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; sc->nr_reclaimed += reclaimed; trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, scanned, reclaimed, &stat, sc->priority, type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); list_for_each_entry_safe_reverse(folio, next, &list, lru) { DEFINE_MIN_SEQ(lruvec); if (!folio_evictable(folio)) { list_del(&folio->lru); folio_putback_lru(folio); continue; } /* retry folios that may have missed folio_rotate_reclaimable() */ if (!skip_retry && !folio_test_active(folio) && !folio_mapped(folio) && !folio_test_dirty(folio) && !folio_test_writeback(folio)) { list_move(&folio->lru, &clean); continue; } /* don't add rejected folios to the oldest generation */ if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type]) set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_active)); } move_folios_to_lru(&list); walk = current->reclaim_state->mm_walk; if (walk && walk->batched) { walk->lruvec = lruvec; lruvec_lock_irq(lruvec); reset_batch_size(walk); lruvec_unlock_irq(lruvec); } mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), stat.nr_demoted); item = PGSTEAL_KSWAPD + reclaimer_offset(sc); mod_lruvec_state(lruvec, item, reclaimed); mod_lruvec_state(lruvec, PGSTEAL_ANON + type, reclaimed); list_splice_init(&clean, &list); if (!list_empty(&list)) { skip_retry = true; goto retry; } return scanned; } static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, int swappiness, unsigned long *nr_to_scan) { int gen, type, zone; unsigned long size = 0; struct lru_gen_folio *lrugen = &lruvec->lrugen; DEFINE_MIN_SEQ(lruvec); *nr_to_scan = 0; /* have to run aging, since eviction is not possible anymore */ if (evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS > max_seq) return true; for_each_evictable_type(type, swappiness) { unsigned long seq; for (seq = min_seq[type]; seq <= max_seq; seq++) { gen = lru_gen_from_seq(seq); for (zone = 0; zone < MAX_NR_ZONES; zone++) size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); } } *nr_to_scan = 
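/*
 * Illustrative sketch, added for exposition only: the core inequality in
 * should_run_aging() above. MODEL_MIN_NR_GENS is assumed to be 2; with fewer
 * than MIN_NR_GENS generations between min_seq and max_seq, eviction cannot
 * proceed at all, and with exactly MIN_NR_GENS aging is still worthwhile.
 * Guarded with #if 0; extract to run standalone.
 */
#if 0
#include <stdio.h>

#define MODEL_MIN_NR_GENS 2		/* assumed for the example */

static const char *toy_aging_decision(unsigned long min_seq, unsigned long max_seq)
{
	if (min_seq + MODEL_MIN_NR_GENS > max_seq)
		return "must age: eviction impossible";
	if (min_seq + MODEL_MIN_NR_GENS == max_seq)
		return "better to age: eviction still possible";
	return "no aging needed";
}

int main(void)
{
	printf("min=5 max=6: %s\n", toy_aging_decision(5, 6));
	printf("min=5 max=7: %s\n", toy_aging_decision(5, 7));
	printf("min=5 max=9: %s\n", toy_aging_decision(5, 9));
	return 0;
}
#endif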
size; /* better to run aging even though eviction is still possible */ return evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS == max_seq; } /* * For future optimizations: * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg * reclaim. */ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness) { bool success; unsigned long nr_to_scan; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) return -1; success = should_run_aging(lruvec, max_seq, swappiness, &nr_to_scan); /* try to scrape all its memory if this memcg was deleted */ if (nr_to_scan && !mem_cgroup_online(memcg)) return nr_to_scan; nr_to_scan = apply_proportional_protection(memcg, sc, nr_to_scan); /* try to get away with not aging at the default priority */ if (!success || sc->priority == DEF_PRIORITY) return nr_to_scan >> sc->priority; /* stop scanning this lruvec as it's low on cold folios */ return try_to_inc_max_seq(lruvec, max_seq, swappiness, false) ? -1 : 0; } static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc) { int i; enum zone_watermarks mark; if (sc->nr_reclaimed >= max(sc->nr_to_reclaim, compact_gap(sc->order))) return true; /* check the order to exclude compaction-induced reclaim */ if (!current_is_kswapd() || sc->order) return false; mark = sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ? WMARK_PROMO : WMARK_HIGH; for (i = 0; i <= sc->reclaim_idx; i++) { struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i; unsigned long size = wmark_pages(zone, mark) + MIN_LRU_BATCH; if (managed_zone(zone) && !zone_watermark_ok(zone, 0, size, sc->reclaim_idx, 0)) return false; } /* kswapd should abort if all eligible zones are safe */ return true; } static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { long nr_to_scan; unsigned long scanned = 0; int swappiness = get_swappiness(lruvec, sc); while (true) { int delta; nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); if (nr_to_scan <= 0) break; delta = evict_folios(nr_to_scan, lruvec, sc, swappiness); if (!delta) break; scanned += delta; if (scanned >= nr_to_scan) break; if (should_abort_scan(lruvec, sc)) break; cond_resched(); } /* * If too many file cache in the coldest generation can't be evicted * due to being dirty, wake up the flusher. */ if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken) { struct pglist_data *pgdat = lruvec_pgdat(lruvec); wakeup_flusher_threads(WB_REASON_VMSCAN); /* * For cgroupv1 dirty throttling is achieved by waking up * the kernel flusher here and later waiting on folios * which are in writeback to finish (see shrink_folio_list()). * * Flusher may not be able to issue writeback quickly * enough for cgroupv1 writeback throttling to work * on a large system. 
*/ if (!writeback_throttling_sane(sc)) reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); } /* whether this lruvec should be rotated */ return nr_to_scan < 0; } static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) { bool success; unsigned long scanned = sc->nr_scanned; unsigned long reclaimed = sc->nr_reclaimed; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); /* lru_gen_age_node() called mem_cgroup_calculate_protection() */ if (mem_cgroup_below_min(NULL, memcg)) return MEMCG_LRU_YOUNG; if (mem_cgroup_below_low(NULL, memcg)) { /* see the comment on MEMCG_NR_GENS */ if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL) return MEMCG_LRU_TAIL; memcg_memory_event(memcg, MEMCG_LOW); } success = try_to_shrink_lruvec(lruvec, sc); shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); if (!sc->proactive) vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned, sc->nr_reclaimed - reclaimed); flush_reclaim_state(sc); if (success && mem_cgroup_online(memcg)) return MEMCG_LRU_YOUNG; if (!success && lruvec_is_sizable(lruvec, sc)) return 0; /* one retry if offlined or too small */ return READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG; } static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) { int op; int gen; int bin; int first_bin; struct lruvec *lruvec; struct lru_gen_folio *lrugen; struct mem_cgroup *memcg; struct hlist_nulls_node *pos; gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq)); bin = first_bin = get_random_u32_below(MEMCG_NR_BINS); restart: op = 0; memcg = NULL; rcu_read_lock(); hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) { if (op) { lru_gen_rotate_memcg(lruvec, op); op = 0; } mem_cgroup_put(memcg); memcg = NULL; if (gen != READ_ONCE(lrugen->gen)) continue; lruvec = container_of(lrugen, struct lruvec, lrugen); memcg = lruvec_memcg(lruvec); if (!mem_cgroup_tryget(memcg)) { lru_gen_release_memcg(memcg); memcg = NULL; continue; } rcu_read_unlock(); op = shrink_one(lruvec, sc); rcu_read_lock(); if (should_abort_scan(lruvec, sc)) break; } rcu_read_unlock(); if (op) lru_gen_rotate_memcg(lruvec, op); mem_cgroup_put(memcg); if (!is_a_nulls(pos)) return; /* restart if raced with lru_gen_rotate_memcg() */ if (gen != get_nulls_value(pos)) goto restart; /* try the rest of the bins of the current generation */ bin = get_memcg_bin(bin + 1); if (bin != first_bin) goto restart; } static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { struct blk_plug plug; VM_WARN_ON_ONCE(root_reclaim(sc)); VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap); lru_add_drain(); blk_start_plug(&plug); set_mm_walk(NULL, sc->proactive); if (try_to_shrink_lruvec(lruvec, sc)) lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG); clear_mm_walk(); blk_finish_plug(&plug); } static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) { struct blk_plug plug; unsigned long reclaimed = sc->nr_reclaimed; VM_WARN_ON_ONCE(!root_reclaim(sc)); /* * Unmapped clean folios are already prioritized. Scanning for more of * them is likely futile and can cause high reclaim latency when there * is a large number of memcgs. 
*/ if (!sc->may_writepage || !sc->may_unmap) goto done; lru_add_drain(); blk_start_plug(&plug); set_mm_walk(pgdat, sc->proactive); set_initial_priority(pgdat, sc); if (current_is_kswapd()) sc->nr_reclaimed = 0; if (mem_cgroup_disabled()) shrink_one(&pgdat->__lruvec, sc); else shrink_many(pgdat, sc); if (current_is_kswapd()) sc->nr_reclaimed += reclaimed; clear_mm_walk(); blk_finish_plug(&plug); done: if (sc->nr_reclaimed > reclaimed) kswapd_try_clear_hopeless(pgdat, sc->order, sc->reclaim_idx); } /****************************************************************************** * state change ******************************************************************************/ static bool __maybe_unused state_is_valid(struct lruvec *lruvec) { struct lru_gen_folio *lrugen = &lruvec->lrugen; if (lrugen->enabled) { enum lru_list lru; for_each_evictable_lru(lru) { if (!list_empty(&lruvec->lists[lru])) return false; } } else { int gen, type, zone; for_each_gen_type_zone(gen, type, zone) { if (!list_empty(&lrugen->folios[gen][type][zone])) return false; } } return true; } static bool fill_evictable(struct lruvec *lruvec) { enum lru_list lru; int remaining = MAX_LRU_BATCH; for_each_evictable_lru(lru) { int type = is_file_lru(lru); bool active = is_active_lru(lru); struct list_head *head = &lruvec->lists[lru]; while (!list_empty(head)) { bool success; struct folio *folio = lru_to_folio(head); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio); VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio); lruvec_del_folio(lruvec, folio); success = lru_gen_add_folio(lruvec, folio, false); VM_WARN_ON_ONCE(!success); if (!--remaining) return false; } } return true; } static bool drain_evictable(struct lruvec *lruvec) { int gen, type, zone; int remaining = MAX_LRU_BATCH; for_each_gen_type_zone(gen, type, zone) { struct list_head *head = &lruvec->lrugen.folios[gen][type][zone]; while (!list_empty(head)) { bool success; struct folio *folio = lru_to_folio(head); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); success = lru_gen_del_folio(lruvec, folio, false); VM_WARN_ON_ONCE(!success); lruvec_add_folio(lruvec, folio); if (!--remaining) return false; } } return true; } static void lru_gen_change_state(bool enabled) { static DEFINE_MUTEX(state_mutex); struct mem_cgroup *memcg; cgroup_lock(); cpus_read_lock(); get_online_mems(); mutex_lock(&state_mutex); if (enabled == lru_gen_enabled()) goto unlock; static_branch_enable_cpuslocked(&lru_switch); if (enabled) static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); else static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); memcg = mem_cgroup_iter(NULL, NULL, NULL); do { int nid; for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); lruvec_lock_irq(lruvec); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); VM_WARN_ON_ONCE(!state_is_valid(lruvec)); lruvec->lrugen.enabled = enabled; while (!(enabled ? 
fill_evictable(lruvec) : drain_evictable(lruvec))) { lruvec_unlock_irq(lruvec); cond_resched(); lruvec_lock_irq(lruvec); } lruvec_unlock_irq(lruvec); } cond_resched(); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); static_branch_disable_cpuslocked(&lru_switch); unlock: mutex_unlock(&state_mutex); put_online_mems(); cpus_read_unlock(); cgroup_unlock(); } /****************************************************************************** * sysfs interface ******************************************************************************/ static ssize_t min_ttl_ms_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); } /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ static ssize_t min_ttl_ms_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { unsigned int msecs; if (kstrtouint(buf, 0, &msecs)) return -EINVAL; WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs)); return len; } static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR_RW(min_ttl_ms); static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { unsigned int caps = 0; if (get_cap(LRU_GEN_CORE)) caps |= BIT(LRU_GEN_CORE); if (should_walk_mmu()) caps |= BIT(LRU_GEN_MM_WALK); if (should_clear_pmd_young()) caps |= BIT(LRU_GEN_NONLEAF_YOUNG); return sysfs_emit(buf, "0x%04x\n", caps); } /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { int i; unsigned int caps; if (tolower(*buf) == 'n') caps = 0; else if (tolower(*buf) == 'y') caps = -1; else if (kstrtouint(buf, 0, &caps)) return -EINVAL; for (i = 0; i < NR_LRU_GEN_CAPS; i++) { bool enabled = caps & BIT(i); if (i == LRU_GEN_CORE) lru_gen_change_state(enabled); else if (enabled) static_branch_enable(&lru_gen_caps[i]); else static_branch_disable(&lru_gen_caps[i]); } return len; } static struct kobj_attribute lru_gen_enabled_attr = __ATTR_RW(enabled); static struct attribute *lru_gen_attrs[] = { &lru_gen_min_ttl_attr.attr, &lru_gen_enabled_attr.attr, NULL }; static const struct attribute_group lru_gen_attr_group = { .name = "lru_gen", .attrs = lru_gen_attrs, }; /****************************************************************************** * debugfs interface ******************************************************************************/ static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos) { struct mem_cgroup *memcg; loff_t nr_to_skip = *pos; m->private = kvmalloc(PATH_MAX, GFP_KERNEL); if (!m->private) return ERR_PTR(-ENOMEM); memcg = mem_cgroup_iter(NULL, NULL, NULL); do { int nid; for_each_node_state(nid, N_MEMORY) { if (!nr_to_skip--) return get_lruvec(memcg, nid); } } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); return NULL; } static void lru_gen_seq_stop(struct seq_file *m, void *v) { if (!IS_ERR_OR_NULL(v)) mem_cgroup_iter_break(NULL, lruvec_memcg(v)); kvfree(m->private); m->private = NULL; } static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos) { int nid = lruvec_pgdat(v)->node_id; struct mem_cgroup *memcg = lruvec_memcg(v); ++*pos; nid = next_memory_node(nid); if (nid == MAX_NUMNODES) { memcg = mem_cgroup_iter(NULL, memcg, NULL); if (!memcg) return NULL; nid = first_memory_node; } return get_lruvec(memcg, nid); } static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq, 
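/*
 * Illustrative usage sketch, added for exposition only: writing the two
 * sysfs attributes defined above from userspace. The path assumes the
 * "lru_gen" group ends up under /sys/kernel/mm/ (it is created on mm_kobj),
 * and 0x0007 assumes the three capability bits shown in enabled_show();
 * adjust both for the kernel at hand. Guarded with #if 0; extract to run
 * standalone as root.
 */
#if 0
#include <stdio.h>

static int write_attr(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	if (fputs(val, f) == EOF)
		perror(path);
	return fclose(f);
}

int main(void)
{
	/* core + mm walk + non-leaf PMD young, per enabled_show() above */
	write_attr("/sys/kernel/mm/lru_gen/enabled", "0x0007");
	/* ask the aging to keep the last second's working set around */
	write_attr("/sys/kernel/mm/lru_gen/min_ttl_ms", "1000");
	return 0;
}
#endif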
unsigned long seq) { int i; int type, tier; int hist = lru_hist_from_seq(seq); struct lru_gen_folio *lrugen = &lruvec->lrugen; struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); for (tier = 0; tier < MAX_NR_TIERS; tier++) { seq_printf(m, " %10d", tier); for (type = 0; type < ANON_AND_FILE; type++) { const char *s = "xxx"; unsigned long n[3] = {}; if (seq == max_seq) { s = "RTx"; n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); n[1] = READ_ONCE(lrugen->avg_total[type][tier]); } else if (seq == min_seq[type] || NR_HIST_GENS > 1) { s = "rep"; n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); n[2] = READ_ONCE(lrugen->protected[hist][type][tier]); } for (i = 0; i < 3; i++) seq_printf(m, " %10lu%c", n[i], s[i]); } seq_putc(m, '\n'); } if (!mm_state) return; seq_puts(m, " "); for (i = 0; i < NR_MM_STATS; i++) { const char *s = "xxxx"; unsigned long n = 0; if (seq == max_seq && NR_HIST_GENS == 1) { s = "TYFA"; n = READ_ONCE(mm_state->stats[hist][i]); } else if (seq != max_seq && NR_HIST_GENS > 1) { s = "tyfa"; n = READ_ONCE(mm_state->stats[hist][i]); } seq_printf(m, " %10lu%c", n, s[i]); } seq_putc(m, '\n'); } /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ static int lru_gen_seq_show(struct seq_file *m, void *v) { unsigned long seq; bool full = debugfs_get_aux_num(m->file); struct lruvec *lruvec = v; struct lru_gen_folio *lrugen = &lruvec->lrugen; int nid = lruvec_pgdat(lruvec)->node_id; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); DEFINE_MIN_SEQ(lruvec); if (nid == first_memory_node) { const char *path = memcg ? m->private : ""; #ifdef CONFIG_MEMCG if (memcg) cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); #endif seq_printf(m, "memcg %llu %s\n", mem_cgroup_id(memcg), path); } seq_printf(m, " node %5d\n", nid); if (!full) seq = evictable_min_seq(min_seq, MAX_SWAPPINESS / 2); else if (max_seq >= MAX_NR_GENS) seq = max_seq - MAX_NR_GENS + 1; else seq = 0; for (; seq <= max_seq; seq++) { int type, zone; int gen = lru_gen_from_seq(seq); unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth)); for (type = 0; type < ANON_AND_FILE; type++) { unsigned long size = 0; char mark = full && seq < min_seq[type] ? 'x' : ' '; for (zone = 0; zone < MAX_NR_ZONES; zone++) size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); seq_printf(m, " %10lu%c", size, mark); } seq_putc(m, '\n'); if (full) lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq); } return 0; } static const struct seq_operations lru_gen_seq_ops = { .start = lru_gen_seq_start, .stop = lru_gen_seq_stop, .next = lru_gen_seq_next, .show = lru_gen_seq_show, }; static int run_aging(struct lruvec *lruvec, unsigned long seq, int swappiness, bool force_scan) { DEFINE_MAX_SEQ(lruvec); if (seq > max_seq) return -EINVAL; return try_to_inc_max_seq(lruvec, max_seq, swappiness, force_scan) ? 
0 : -EEXIST; } static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, int swappiness, unsigned long nr_to_reclaim) { DEFINE_MAX_SEQ(lruvec); if (seq + MIN_NR_GENS > max_seq) return -EINVAL; sc->nr_reclaimed = 0; while (!signal_pending(current)) { DEFINE_MIN_SEQ(lruvec); if (seq < evictable_min_seq(min_seq, swappiness)) return 0; if (sc->nr_reclaimed >= nr_to_reclaim) return 0; if (!evict_folios(nr_to_reclaim - sc->nr_reclaimed, lruvec, sc, swappiness)) return 0; cond_resched(); } return -EINTR; } static int run_cmd(char cmd, u64 memcg_id, int nid, unsigned long seq, struct scan_control *sc, int swappiness, unsigned long opt) { struct lruvec *lruvec; int err = -EINVAL; struct mem_cgroup *memcg = NULL; if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY)) return -EINVAL; if (!mem_cgroup_disabled()) { memcg = mem_cgroup_get_from_id(memcg_id); if (!memcg) return -EINVAL; } if (memcg_id != mem_cgroup_id(memcg)) goto done; sc->target_mem_cgroup = memcg; lruvec = get_lruvec(memcg, nid); if (swappiness < MIN_SWAPPINESS) swappiness = get_swappiness(lruvec, sc); else if (swappiness > SWAPPINESS_ANON_ONLY) goto done; switch (cmd) { case '+': err = run_aging(lruvec, seq, swappiness, opt); break; case '-': err = run_eviction(lruvec, seq, sc, swappiness, opt); break; } done: mem_cgroup_put(memcg); return err; } /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, size_t len, loff_t *pos) { void *buf; char *cur, *next; unsigned int flags; struct blk_plug plug; int err = -EINVAL; struct scan_control sc = { .may_writepage = true, .may_unmap = true, .may_swap = true, .reclaim_idx = MAX_NR_ZONES - 1, .gfp_mask = GFP_KERNEL, .proactive = true, }; buf = kvmalloc(len + 1, GFP_KERNEL); if (!buf) return -ENOMEM; if (copy_from_user(buf, src, len)) { kvfree(buf); return -EFAULT; } set_task_reclaim_state(current, &sc.reclaim_state); flags = memalloc_noreclaim_save(); blk_start_plug(&plug); if (!set_mm_walk(NULL, true)) { err = -ENOMEM; goto done; } next = buf; next[len] = '\0'; while ((cur = strsep(&next, ",;\n"))) { int n; int end; char cmd, swap_string[5]; u64 memcg_id; unsigned int nid; unsigned long seq; unsigned int swappiness; unsigned long opt = -1; cur = skip_spaces(cur); if (!*cur) continue; n = sscanf(cur, "%c %llu %u %lu %n %4s %n %lu %n", &cmd, &memcg_id, &nid, &seq, &end, swap_string, &end, &opt, &end); if (n < 4 || cur[end]) { err = -EINVAL; break; } if (n == 4) { swappiness = -1; } else if (!strcmp("max", swap_string)) { /* set by userspace for anonymous memory only */ swappiness = SWAPPINESS_ANON_ONLY; } else { err = kstrtouint(swap_string, 0, &swappiness); if (err) break; } err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt); if (err) break; } done: clear_mm_walk(); blk_finish_plug(&plug); memalloc_noreclaim_restore(flags); set_task_reclaim_state(current, NULL); kvfree(buf); return err ? 
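/*
 * Illustrative usage sketch, added for exposition only: driving the debugfs
 * interface parsed by lru_gen_seq_write() above. The command layout follows
 * the sscanf() format there:
 *   "+ memcg_id node_id max_gen_nr [swappiness [force_scan]]"     aging
 *   "- memcg_id node_id min_gen_nr [swappiness [nr_to_reclaim]]"  eviction
 * The path assumes debugfs is mounted at /sys/kernel/debug, and the sample
 * ids and sequence numbers are made up. Guarded with #if 0; extract to run
 * standalone as root.
 */
#if 0
#include <stdio.h>

static int lru_gen_cmd(const char *cmd)
{
	FILE *f = fopen("/sys/kernel/debug/lru_gen", "w");

	if (!f) {
		perror("/sys/kernel/debug/lru_gen");
		return -1;
	}
	fprintf(f, "%s\n", cmd);
	return fclose(f);
}

int main(void)
{
	lru_gen_cmd("+ 1 0 4");		/* age memcg 1 on node 0 up to gen 4 */
	lru_gen_cmd("- 1 0 2 200 64");	/* evict gen <= 2, swappiness 200, 64 pages */
	return 0;
}
#endif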
: len; } static int lru_gen_seq_open(struct inode *inode, struct file *file) { return seq_open(file, &lru_gen_seq_ops); } static const struct file_operations lru_gen_rw_fops = { .open = lru_gen_seq_open, .read = seq_read, .write = lru_gen_seq_write, .llseek = seq_lseek, .release = seq_release, }; static const struct file_operations lru_gen_ro_fops = { .open = lru_gen_seq_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; /****************************************************************************** * initialization ******************************************************************************/ void lru_gen_init_pgdat(struct pglist_data *pgdat) { int i, j; spin_lock_init(&pgdat->memcg_lru.lock); for (i = 0; i < MEMCG_NR_GENS; i++) { for (j = 0; j < MEMCG_NR_BINS; j++) INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i); } } void lru_gen_init_lruvec(struct lruvec *lruvec) { int i; int gen, type, zone; struct lru_gen_folio *lrugen = &lruvec->lrugen; struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); lrugen->max_seq = MIN_NR_GENS + 1; lrugen->enabled = lru_gen_enabled(); for (i = 0; i <= MIN_NR_GENS + 1; i++) lrugen->timestamps[i] = jiffies; for_each_gen_type_zone(gen, type, zone) INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]); if (mm_state) mm_state->seq = MIN_NR_GENS; } #ifdef CONFIG_MEMCG void lru_gen_init_memcg(struct mem_cgroup *memcg) { struct lru_gen_mm_list *mm_list = get_mm_list(memcg); if (!mm_list) return; INIT_LIST_HEAD(&mm_list->fifo); spin_lock_init(&mm_list->lock); } void lru_gen_exit_memcg(struct mem_cgroup *memcg) { int i; int nid; struct lru_gen_mm_list *mm_list = get_mm_list(memcg); VM_WARN_ON_ONCE(mm_list && !list_empty(&mm_list->fifo)); for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, sizeof(lruvec->lrugen.nr_pages))); lruvec->lrugen.list.next = LIST_POISON1; if (!mm_state) continue; for (i = 0; i < NR_BLOOM_FILTERS; i++) { bitmap_free(mm_state->filters[i]); mm_state->filters[i] = NULL; } } } #endif /* CONFIG_MEMCG */ static int __init init_lru_gen(void) { BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) pr_err("lru_gen: failed to create sysfs group\n"); debugfs_create_file_aux_num("lru_gen", 0644, NULL, NULL, false, &lru_gen_rw_fops); debugfs_create_file_aux_num("lru_gen_full", 0444, NULL, NULL, true, &lru_gen_ro_fops); return 0; }; late_initcall(init_lru_gen); #else /* !CONFIG_LRU_GEN */ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { BUILD_BUG(); } static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { BUILD_BUG(); } static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) { BUILD_BUG(); } #endif /* CONFIG_LRU_GEN */ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; unsigned long nr_to_scan; enum lru_list lru; unsigned long nr_reclaimed = 0; unsigned long nr_to_reclaim = sc->nr_to_reclaim; bool proportional_reclaim; struct blk_plug plug; if ((lru_gen_enabled() || lru_gen_switching()) && !root_reclaim(sc)) { lru_gen_shrink_lruvec(lruvec, sc); if (!lru_gen_switching()) return; } get_scan_count(lruvec, sc, nr); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); /* * Global 
reclaiming within direct reclaim at DEF_PRIORITY is a normal * event that can occur when there is little memory pressure e.g. * multiple streaming readers/writers. Hence, we do not abort scanning * when the requested number of pages are reclaimed when scanning at * DEF_PRIORITY on the assumption that the fact we are direct * reclaiming implies that kswapd is not keeping up and it is best to * do a batch of work at once. For memcg reclaim one check is made to * abort proportional reclaim if either the file or anon lru has already * dropped to zero at the first pass. */ proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() && sc->priority == DEF_PRIORITY); blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { unsigned long nr_anon, nr_file, percentage; unsigned long nr_scanned; for_each_evictable_lru(lru) { if (nr[lru]) { nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); nr[lru] -= nr_to_scan; nr_reclaimed += shrink_list(lru, nr_to_scan, lruvec, sc); } } cond_resched(); if (nr_reclaimed < nr_to_reclaim || proportional_reclaim) continue; /* * For kswapd and memcg, reclaim at least the number of pages * requested. Ensure that the anon and file LRUs are scanned * proportionally what was requested by get_scan_count(). We * stop reclaiming one LRU and reduce the amount scanning * proportional to the original scan target. */ nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; /* * It's just vindictive to attack the larger once the smaller * has gone to zero. And given the way we stop scanning the * smaller below, this makes sure that we only make one nudge * towards proportionality once we've got nr_to_reclaim. */ if (!nr_file || !nr_anon) break; if (nr_file > nr_anon) { unsigned long scan_target = targets[LRU_INACTIVE_ANON] + targets[LRU_ACTIVE_ANON] + 1; lru = LRU_BASE; percentage = nr_anon * 100 / scan_target; } else { unsigned long scan_target = targets[LRU_INACTIVE_FILE] + targets[LRU_ACTIVE_FILE] + 1; lru = LRU_FILE; percentage = nr_file * 100 / scan_target; } /* Stop scanning the smaller of the LRU */ nr[lru] = 0; nr[lru + LRU_ACTIVE] = 0; /* * Recalculate the other LRU scan count based on its original * scan target and the percentage scanning already complete */ lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; nr_scanned = targets[lru] - nr[lru]; nr[lru] = targets[lru] * (100 - percentage) / 100; nr[lru] -= min(nr[lru], nr_scanned); lru += LRU_ACTIVE; nr_scanned = targets[lru] - nr[lru]; nr[lru] = targets[lru] * (100 - percentage) / 100; nr[lru] -= min(nr[lru], nr_scanned); } blk_finish_plug(&plug); sc->nr_reclaimed += nr_reclaimed; /* * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ if (can_age_anon_pages(lruvec, sc) && inactive_is_low(lruvec, LRU_INACTIVE_ANON)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); } /* Use reclaim/compaction for costly allocs or under memory pressure */ static bool in_reclaim_compaction(struct scan_control *sc) { if (gfp_compaction_allowed(sc->gfp_mask) && sc->order && (sc->order > PAGE_ALLOC_COSTLY_ORDER || sc->priority < DEF_PRIORITY - 2)) return true; return false; } /* * Reclaim/compaction is used for high-order allocation requests. It reclaims * order-0 pages before compacting the zone. 
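/*
 * Illustrative sketch, added for exposition only: the proportional cut-off
 * arithmetic in shrink_lruvec() above, reduced to one anon and one file
 * counter (the real code splits each into active/inactive). Once enough
 * pages have been reclaimed, the smaller side stops and the larger side is
 * rescaled so both finish in roughly their original proportion. Guarded
 * with #if 0; extract to run standalone.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned long target_file = 1000, target_anon = 200;	/* original scan targets */
	unsigned long nr_file = 700, nr_anon = 140;		/* still pending */
	unsigned long percentage, nr_scanned, new_nr;

	/* anon is the smaller side, so anon scanning stops entirely */
	percentage = nr_anon * 100 / (target_anon + 1);		/* anon share still pending: 69 */

	/* file keeps only the matching pending share of its own target */
	nr_scanned = target_file - nr_file;			/* already scanned: 300 */
	new_nr = target_file * (100 - percentage) / 100;	/* 310 */
	new_nr -= (new_nr < nr_scanned) ? new_nr : nr_scanned;	/* minus what's done: 10 */

	printf("file LRU: %lu more pages to scan (was %lu pending)\n",
	       new_nr, nr_file);
	return 0;
}
#endif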
should_continue_reclaim() returns * true if more pages should be reclaimed such that when the page allocator * calls try_to_compact_pages() that it will have enough free pages to succeed. * It will give up earlier than that if there is difficulty reclaiming pages. */ static inline bool should_continue_reclaim(struct pglist_data *pgdat, unsigned long nr_reclaimed, struct scan_control *sc) { unsigned long pages_for_compaction; unsigned long inactive_lru_pages; int z; struct zone *zone; /* If not in reclaim/compaction mode, stop */ if (!in_reclaim_compaction(sc)) return false; /* * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX * number of pages that were scanned. This will return to the caller * with the risk reclaim/compaction and the resulting allocation attempt * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL * allocations through requiring that the full LRU list has been scanned * first, by assuming that zero delta of sc->nr_scanned means full LRU * scan, but that approximation was wrong, and there were corner cases * where always a non-zero amount of pages were scanned. */ if (!nr_reclaimed) return false; /* If compaction would go ahead or the allocation would succeed, stop */ for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) { unsigned long watermark = min_wmark_pages(zone); /* Allocation can already succeed, nothing to do */ if (zone_watermark_ok(zone, sc->order, watermark, sc->reclaim_idx, 0)) return false; if (compaction_suitable(zone, sc->order, watermark, sc->reclaim_idx)) return false; } /* * If we have not reclaimed enough pages for compaction and the * inactive lists are large enough, continue reclaiming */ pages_for_compaction = compact_gap(sc->order); inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); return inactive_lru_pages > pages_for_compaction; } static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) { struct mem_cgroup *target_memcg = sc->target_mem_cgroup; struct mem_cgroup_reclaim_cookie reclaim = { .pgdat = pgdat, }; struct mem_cgroup_reclaim_cookie *partial = &reclaim; struct mem_cgroup *memcg; /* * In most cases, direct reclaimers can do partial walks * through the cgroup tree, using an iterator state that * persists across invocations. This strikes a balance between * fairness and allocation latency. * * For kswapd, reliable forward progress is more important * than a quick return to idle. Always do full walks. */ if (current_is_kswapd() || sc->memcg_full_walk) partial = NULL; memcg = mem_cgroup_iter(target_memcg, NULL, partial); do { struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); unsigned long reclaimed; unsigned long scanned; /* * This loop can become CPU-bound when target memcgs * aren't eligible for reclaim - either because they * don't have any reclaimable pages, or because their * memory is explicitly protected. Avoid soft lockups. */ cond_resched(); mem_cgroup_calculate_protection(target_memcg, memcg); if (mem_cgroup_below_min(target_memcg, memcg)) { /* * Hard protection. * If there is no reclaimable memory, OOM. */ continue; } else if (mem_cgroup_below_low(target_memcg, memcg)) { /* * Soft protection. * Respect the protection only as long as * there is an unprotected supply * of reclaimable memory from other cgroups. 
*/ if (!sc->memcg_low_reclaim) { sc->memcg_low_skipped = 1; continue; } memcg_memory_event(memcg, MEMCG_LOW); } reclaimed = sc->nr_reclaimed; scanned = sc->nr_scanned; shrink_lruvec(lruvec, sc); shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); /* Record the group's reclaim efficiency */ if (!sc->proactive) vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned, sc->nr_reclaimed - reclaimed); /* If partial walks are allowed, bail once goal is reached */ if (partial && sc->nr_reclaimed >= sc->nr_to_reclaim) { mem_cgroup_iter_break(target_memcg, memcg); break; } } while ((memcg = mem_cgroup_iter(target_memcg, memcg, partial))); } static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) { unsigned long nr_reclaimed, nr_scanned, nr_node_reclaimed; struct lruvec *target_lruvec; bool reclaimable = false; if ((lru_gen_enabled() || lru_gen_switching()) && root_reclaim(sc)) { memset(&sc->nr, 0, sizeof(sc->nr)); lru_gen_shrink_node(pgdat, sc); if (!lru_gen_switching()) return; } target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); again: memset(&sc->nr, 0, sizeof(sc->nr)); nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; prepare_scan_control(pgdat, sc); shrink_node_memcgs(pgdat, sc); flush_reclaim_state(sc); nr_node_reclaimed = sc->nr_reclaimed - nr_reclaimed; /* Record the subtree's reclaim efficiency */ if (!sc->proactive) vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, sc->nr_scanned - nr_scanned, nr_node_reclaimed); if (nr_node_reclaimed) reclaimable = true; if (current_is_kswapd()) { /* * If reclaim is isolating dirty pages under writeback, * it implies that the long-lived page allocation rate * is exceeding the page laundering rate. Either the * global limits are not being effective at throttling * processes due to the page distribution throughout * zones or there is heavy usage of a slow backing * device. The only option is to throttle from reclaim * context which is not ideal as there is no guarantee * the dirtying process is throttled in the same way * balance_dirty_pages() manages. * * Once a node is flagged PGDAT_WRITEBACK, kswapd will * count the number of pages under pages flagged for * immediate reclaim and stall if any are encountered * in the nr_immediate check below. */ if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) set_bit(PGDAT_WRITEBACK, &pgdat->flags); /* * If kswapd scans pages marked for immediate * reclaim and under writeback (nr_immediate), it * implies that pages are cycling through the LRU * faster than they are written so forcibly stall * until some pages complete writeback. */ if (sc->nr.immediate) reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); } /* * Tag a node/memcg as congested if all the dirty pages were marked * for writeback and immediate reclaim (counted in nr.congested). * * Legacy memcg will stall in page writeback so avoid forcibly * stalling in reclaim_throttle(). */ if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) { if (cgroup_reclaim(sc) && writeback_throttling_sane(sc)) set_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags); if (current_is_kswapd()) set_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags); } /* * Stall direct reclaim for IO completions if the lruvec is * node is congested. Allow kswapd to continue until it * starts encountering unqueued dirty pages or cycling through * the LRU too quickly. 
*/ if (!current_is_kswapd() && current_may_throttle() && !sc->hibernation_mode && (test_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags) || test_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags))) reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED); if (should_continue_reclaim(pgdat, nr_node_reclaimed, sc)) goto again; /* * Kswapd gives up on balancing particular nodes after too * many failures to reclaim anything from them and goes to * sleep. On reclaim progress, reset the failure counter. A * successful direct reclaim run will revive a dormant kswapd. */ if (reclaimable) kswapd_try_clear_hopeless(pgdat, sc->order, sc->reclaim_idx); else if (sc->cache_trim_mode) sc->cache_trim_mode_failed = 1; } /* * Returns true if compaction should go ahead for a costly-order request, or * the allocation would already succeed without compaction. Return false if we * should reclaim first. */ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { unsigned long watermark; if (!gfp_compaction_allowed(sc->gfp_mask)) return false; /* Allocation can already succeed, nothing to do */ if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), sc->reclaim_idx, 0)) return true; /* * Direct reclaim usually targets the min watermark, but compaction * takes time to run and there are potentially other callers using the * pages just freed. So target a higher buffer to give compaction a * reasonable chance of completing and allocating the pages. * * Note that we won't actually reclaim the whole buffer in one attempt * as the target watermark in should_continue_reclaim() is lower. But if * we are already above the high+gap watermark, don't reclaim at all. */ watermark = high_wmark_pages(zone); if (compaction_suitable(zone, sc->order, watermark, sc->reclaim_idx)) return true; return false; } static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc) { /* * If reclaim is making progress greater than 12% efficiency then * wake all the NOPROGRESS throttled tasks. */ if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) { wait_queue_head_t *wqh; wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS]; if (waitqueue_active(wqh)) wake_up(wqh); return; } /* * Do not throttle kswapd or cgroup reclaim on NOPROGRESS as it will * throttle on VMSCAN_THROTTLE_WRITEBACK if there are too many pages * under writeback and marked for immediate reclaim at the tail of the * LRU. */ if (current_is_kswapd() || cgroup_reclaim(sc)) return; /* Throttle if making no progress at high prioities. */ if (sc->priority == 1 && !sc->nr_reclaimed) reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS); } /* * This is the direct reclaim path, for page-allocating processes. We only * try to reclaim pages from zones which will satisfy the caller's allocation * request. * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. 
*/ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; gfp_t orig_mask; pg_data_t *last_pgdat = NULL; pg_data_t *first_pgdat = NULL; /* * If the number of buffer_heads in the machine exceeds the maximum * allowed level, force direct reclaim to scan the highmem zone as * highmem pages could be pinning lowmem pages storing buffer_heads */ orig_mask = sc->gfp_mask; if (buffer_heads_over_limit) { sc->gfp_mask |= __GFP_HIGHMEM; sc->reclaim_idx = gfp_zone(sc->gfp_mask); } for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx, sc->nodemask) { /* * Take care memory controller reclaiming has small influence * to global LRU. */ if (!cgroup_reclaim(sc)) { if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) continue; /* * If we already have plenty of memory free for * compaction in this zone, don't free any more. * Even though compaction is invoked for any * non-zero order, only frequent costly order * reclamation is disruptive enough to become a * noticeable problem, like transparent huge * page allocations. */ if (IS_ENABLED(CONFIG_COMPACTION) && sc->order > PAGE_ALLOC_COSTLY_ORDER && compaction_ready(zone, sc)) { sc->compaction_ready = true; continue; } /* * Shrink each node in the zonelist once. If the * zonelist is ordered by zone (not the default) then a * node may be shrunk multiple times but in that case * the user prefers lower zones being preserved. */ if (zone->zone_pgdat == last_pgdat) continue; /* * This steals pages from memory cgroups over softlimit * and returns the number of reclaimed pages and * scanned pages. This works for global memory pressure * and balancing, not for a memcg's limit. */ nr_soft_scanned = 0; nr_soft_reclaimed = memcg1_soft_limit_reclaim(zone->zone_pgdat, sc->order, sc->gfp_mask, &nr_soft_scanned); sc->nr_reclaimed += nr_soft_reclaimed; sc->nr_scanned += nr_soft_scanned; /* need some check for avoid more shrink_zone() */ } if (!first_pgdat) first_pgdat = zone->zone_pgdat; /* See comment about same check for global reclaim above */ if (zone->zone_pgdat == last_pgdat) continue; last_pgdat = zone->zone_pgdat; shrink_node(zone->zone_pgdat, sc); } if (first_pgdat) consider_reclaim_throttle(first_pgdat, sc); /* * Restore to original mask to avoid the impact on the caller if we * promoted it to __GFP_HIGHMEM. */ sc->gfp_mask = orig_mask; } static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) { struct lruvec *target_lruvec; unsigned long refaults; if (lru_gen_enabled() && !lru_gen_switching()) return; target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); target_lruvec->refaults[WORKINGSET_ANON] = refaults; refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE); target_lruvec->refaults[WORKINGSET_FILE] = refaults; } /* * This is the main entry point to direct page reclaim. * * If a full scan of the inactive list fails to free enough memory then we * are "out of memory" and something needs to be killed. * * If the caller is !__GFP_FS then the probability of a failure is reasonably * high - the zone may be full of dirty or under-writeback pages, which this * caller can't do much about. We kick the writeback threads and take explicit * naps in the hope that some of these pages can be written. 
But if the * allocating task holds filesystem locks which prevent writeout this might not * work, and the allocation attempt will fail. * * returns: 0, if no pages reclaimed * else, the number of pages reclaimed */ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct scan_control *sc) { int initial_priority = sc->priority; pg_data_t *last_pgdat; struct zoneref *z; struct zone *zone; retry: delayacct_freepages_start(); if (!cgroup_reclaim(sc)) __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1); do { if (!sc->proactive) vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, sc->priority); sc->nr_scanned = 0; shrink_zones(zonelist, sc); if (sc->nr_reclaimed >= sc->nr_to_reclaim) break; if (sc->compaction_ready) break; } while (--sc->priority >= 0); last_pgdat = NULL; for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx, sc->nodemask) { if (zone->zone_pgdat == last_pgdat) continue; last_pgdat = zone->zone_pgdat; snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); if (cgroup_reclaim(sc)) { struct lruvec *lruvec; lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, zone->zone_pgdat); clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags); } } delayacct_freepages_end(); if (sc->nr_reclaimed) return sc->nr_reclaimed; /* Aborted reclaim to try compaction? don't OOM, then */ if (sc->compaction_ready) return 1; /* * In most cases, direct reclaimers can do partial walks * through the cgroup tree to meet the reclaim goal while * keeping latency low. Since the iterator state is shared * among all direct reclaim invocations (to retain fairness * among cgroups), though, high concurrency can result in * individual threads not seeing enough cgroups to make * meaningful forward progress. Avoid false OOMs in this case. */ if (!sc->memcg_full_walk) { sc->priority = initial_priority; sc->memcg_full_walk = 1; goto retry; } /* * We make inactive:active ratio decisions based on the node's * composition of memory, but a restrictive reclaim_idx or a * memory.low cgroup setting can exempt large amounts of * memory from reclaim. Neither of which are very common, so * instead of doing costly eligibility calculations of the * entire cgroup subtree up front, we assume the estimates are * good, and retry with forcible deactivation if that fails. */ if (sc->skipped_deactivate) { sc->priority = initial_priority; sc->force_deactivate = 1; sc->skipped_deactivate = 0; goto retry; } /* Untapped cgroup reserves? Don't OOM, retry. 
*/ if (sc->memcg_low_skipped) { sc->priority = initial_priority; sc->force_deactivate = 0; sc->memcg_low_reclaim = 1; sc->memcg_low_skipped = 0; goto retry; } return 0; } static bool allow_direct_reclaim(pg_data_t *pgdat) { struct zone *zone; unsigned long pfmemalloc_reserve = 0; unsigned long free_pages = 0; int i; bool wmark_ok; if (kswapd_test_hopeless(pgdat)) return true; for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) { if (!zone_reclaimable_pages(zone) && zone_page_state_snapshot(zone, NR_FREE_PAGES)) continue; pfmemalloc_reserve += min_wmark_pages(zone); free_pages += zone_page_state_snapshot(zone, NR_FREE_PAGES); } /* If there are no reserves (unexpected config) then do not throttle */ if (!pfmemalloc_reserve) return true; wmark_ok = free_pages > pfmemalloc_reserve / 2; /* kswapd must be awake if processes are being throttled */ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL) WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL); wake_up_interruptible(&pgdat->kswapd_wait); } return wmark_ok; } /* * Throttle direct reclaimers if backing storage is backed by the network * and the PFMEMALLOC reserve for the preferred node is getting dangerously * depleted. kswapd will continue to make progress and wake the processes * when the low watermark is reached. * * Returns true if a fatal signal was delivered during throttling. If this * happens, the page allocator should not consider triggering the OOM killer. */ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, nodemask_t *nodemask) { struct zoneref *z; struct zone *zone; pg_data_t *pgdat = NULL; /* * Kernel threads should not be throttled as they may be indirectly * responsible for cleaning pages necessary for reclaim to make forward * progress. kjournald for example may enter direct reclaim while * committing a transaction where throttling it could forcing other * processes to block on log_wait_commit(). */ if (current->flags & PF_KTHREAD) goto out; /* * If a fatal signal is pending, this process should not throttle. * It should return quickly so it can exit and free its memory */ if (fatal_signal_pending(current)) goto out; /* * Check if the pfmemalloc reserves are ok by finding the first node * with a usable ZONE_NORMAL or lower zone. The expectation is that * GFP_KERNEL will be required for allocating network buffers when * swapping over the network so ZONE_HIGHMEM is unusable. * * Throttling is based on the first usable node and throttled processes * wait on a queue until kswapd makes progress and wakes them. There * is an affinity then between processes waking up and where reclaim * progress has been made assuming the process wakes on the same node. * More importantly, processes running on remote nodes will not compete * for remote pfmemalloc reserves and processes on different nodes * should make reasonable progress. */ for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) { if (zone_idx(zone) > ZONE_NORMAL) continue; /* Throttle based on the first usable node */ pgdat = zone->zone_pgdat; if (allow_direct_reclaim(pgdat)) goto out; break; } /* If no zone was usable by the allocation flags then do not throttle */ if (!pgdat) goto out; /* Account for the throttling */ count_vm_event(PGSCAN_DIRECT_THROTTLE); /* * If the caller cannot enter the filesystem, it's possible that it * is due to the caller holding an FS lock or performing a journal * transaction in the case of a filesystem like ext[3|4]. 
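/*
 * Illustrative sketch, added for exposition only: the reserve check in
 * allow_direct_reclaim() above. Direct reclaimers are throttled once free
 * pages across the node's ZONE_NORMAL-and-below zones drop under half of
 * the summed min watermarks. Zone data here is just two made-up arrays.
 * Guarded with #if 0; extract to run standalone.
 */
#if 0
#include <stdio.h>

static int toy_allow_direct_reclaim(const unsigned long *min_wmark,
				    const unsigned long *free_pages, int nr_zones)
{
	unsigned long reserve = 0, free_total = 0;

	for (int i = 0; i < nr_zones; i++) {
		reserve += min_wmark[i];
		free_total += free_pages[i];
	}
	if (!reserve)			/* no reserves configured: never throttle */
		return 1;
	return free_total > reserve / 2;
}

int main(void)
{
	unsigned long wmark[] = { 1024, 4096 };
	unsigned long healthy[] = { 3000, 5000 };
	unsigned long depleted[] = { 100, 900 };

	printf("healthy: %d, depleted: %d\n",
	       toy_allow_direct_reclaim(wmark, healthy, 2),
	       toy_allow_direct_reclaim(wmark, depleted, 2));
	return 0;
}
#endif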
In this case, * it is not safe to block on pfmemalloc_wait as kswapd could be * blocked waiting on the same lock. Instead, throttle for up to a * second before continuing. */ if (!(gfp_mask & __GFP_FS)) wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, allow_direct_reclaim(pgdat), HZ); else /* Throttle until kswapd wakes the process */ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, allow_direct_reclaim(pgdat)); if (fatal_signal_pending(current)) return true; out: return false; } unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask) { unsigned long nr_reclaimed; struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, .gfp_mask = current_gfp_context(gfp_mask), .reclaim_idx = gfp_zone(gfp_mask), .order = order, .nodemask = nodemask, .priority = DEF_PRIORITY, .may_writepage = 1, .may_unmap = 1, .may_swap = 1, }; /* * scan_control uses s8 fields for order, priority, and reclaim_idx. * Confirm they are large enough for max values. */ BUILD_BUG_ON(MAX_PAGE_ORDER >= S8_MAX); BUILD_BUG_ON(DEF_PRIORITY > S8_MAX); BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX); /* * Do not enter reclaim if fatal signal was delivered while throttled. * 1 is returned so that the page allocator does not OOM kill at this * point. */ if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) return 1; set_task_reclaim_state(current, &sc.reclaim_state); trace_mm_vmscan_direct_reclaim_begin(sc.gfp_mask, order, 0); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); trace_mm_vmscan_direct_reclaim_end(nr_reclaimed, 0); set_task_reclaim_state(current, NULL); return nr_reclaimed; } #ifdef CONFIG_MEMCG /* Only used by soft limit reclaim. Do not reuse for anything else. */ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, unsigned long *nr_scanned) { struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, .target_mem_cgroup = memcg, .may_writepage = 1, .may_unmap = 1, .reclaim_idx = MAX_NR_ZONES - 1, .may_swap = !noswap, }; WARN_ON_ONCE(!current->reclaim_state); sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.gfp_mask, sc.order, memcg); /* * NOTE: Although we can get the priority field, using it * here is not a good idea, since it limits the pages we can scan. * if we don't reclaim here, the shrink_node from balance_pgdat * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ shrink_lruvec(lruvec, &sc); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed, memcg); *nr_scanned = sc.nr_scanned; return sc.nr_reclaimed; } unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, unsigned int reclaim_options, int *swappiness) { unsigned long nr_reclaimed; unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .proactive_swappiness = swappiness, .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), .reclaim_idx = MAX_NR_ZONES - 1, .target_mem_cgroup = memcg, .priority = DEF_PRIORITY, .may_writepage = 1, .may_unmap = 1, .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), }; /* * Traverse the ZONELIST_FALLBACK zonelist of the current node to put * equal pressure on all the nodes. 
This is based on the assumption that * the reclaim does not bail out early. */ struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); set_task_reclaim_state(current, &sc.reclaim_state); trace_mm_vmscan_memcg_reclaim_begin(sc.gfp_mask, 0, memcg); noreclaim_flag = memalloc_noreclaim_save(); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); memalloc_noreclaim_restore(noreclaim_flag); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed, memcg); set_task_reclaim_state(current, NULL); return nr_reclaimed; } #else unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, unsigned int reclaim_options, int *swappiness) { return 0; } #endif static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) { struct mem_cgroup *memcg; struct lruvec *lruvec; if (lru_gen_enabled() || lru_gen_switching()) { lru_gen_age_node(pgdat, sc); if (!lru_gen_switching()) return; } lruvec = mem_cgroup_lruvec(NULL, pgdat); if (!can_age_anon_pages(lruvec, sc)) return; if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON)) return; memcg = mem_cgroup_iter(NULL, NULL, NULL); do { lruvec = mem_cgroup_lruvec(memcg, pgdat); shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); memcg = mem_cgroup_iter(NULL, memcg, NULL); } while (memcg); } static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx) { int i; struct zone *zone; /* * Check for watermark boosts top-down as the higher zones * are more likely to be boosted. Both watermarks and boosts * should not be checked at the same time as reclaim would * start prematurely when there is no boosting and a lower * zone is balanced. */ for (i = highest_zoneidx; i >= 0; i--) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; if (zone->watermark_boost) return true; } return false; } /* * Returns true if there is an eligible zone balanced for the request order * and highest_zoneidx */ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) { int i; unsigned long mark = -1; struct zone *zone; /* * Check watermarks bottom-up as lower zones are more likely to * meet watermarks. */ for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) { enum zone_stat_item item; unsigned long free_pages; if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) mark = promo_wmark_pages(zone); else mark = high_wmark_pages(zone); /* * In defrag_mode, watermarks must be met in whole * blocks to avoid polluting allocator fallbacks. * * However, kswapd usually cannot accomplish this on * its own and needs kcompactd support. Once it's * reclaimed a compaction gap, and kswapd_shrink_node * has dropped order, simply ensure there are enough * base pages for compaction, wake kcompactd & sleep. */ if (defrag_mode && order) item = NR_FREE_PAGES_BLOCKS; else item = NR_FREE_PAGES; /* * When there is a high number of CPUs in the system, * the cumulative error from the vmstat per-cpu cache * can blur the line between the watermarks. In that * case, be safe and get an accurate snapshot. * * TODO: NR_FREE_PAGES_BLOCKS moves in steps of * pageblock_nr_pages, while the vmstat pcp threshold * is limited to 125. On many configurations that * counter won't actually be per-cpu cached. But keep * things simple for now; revisit when somebody cares. 
*/ free_pages = zone_page_state(zone, item); if (zone->percpu_drift_mark && free_pages < zone->percpu_drift_mark) free_pages = zone_page_state_snapshot(zone, item); if (__zone_watermark_ok(zone, order, mark, highest_zoneidx, 0, free_pages)) return true; } /* * If a node has no managed zone within highest_zoneidx, it does not * need balancing by definition. This can happen if a zone-restricted * allocation tries to wake a remote kswapd. */ if (mark == -1) return true; return false; } /* Clear pgdat state for congested, dirty or under writeback. */ static void clear_pgdat_congested(pg_data_t *pgdat) { struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); clear_bit(LRUVEC_NODE_CONGESTED, &lruvec->flags); clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags); clear_bit(PGDAT_WRITEBACK, &pgdat->flags); } /* * Prepare kswapd for sleeping. This verifies that there are no processes * waiting in throttle_direct_reclaim() and that watermarks have been met. * * Returns true if kswapd is ready to sleep */ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int highest_zoneidx) { /* * The throttled processes are normally woken up in balance_pgdat() as * soon as allow_direct_reclaim() is true. But there is a potential * race between when kswapd checks the watermarks and a process gets * throttled. There is also a potential race if processes get * throttled, kswapd wakes, a large process exits thereby balancing the * zones, which causes kswapd to exit balance_pgdat() before reaching * the wake up checks. If kswapd is going to sleep, no process should * be sleeping on pfmemalloc_wait, so wake them now if necessary. If * the wake up is premature, processes will wake kswapd and get * throttled again. The difference from wake ups in balance_pgdat() is * that here we are under prepare_to_wait(). */ if (waitqueue_active(&pgdat->pfmemalloc_wait)) wake_up_all(&pgdat->pfmemalloc_wait); /* Hopeless node, leave it to direct reclaim */ if (kswapd_test_hopeless(pgdat)) return true; if (pgdat_balanced(pgdat, order, highest_zoneidx)) { clear_pgdat_congested(pgdat); return true; } return false; } /* * kswapd shrinks a node of pages that are at or below the highest usable * zone that is currently unbalanced. * * Returns true if kswapd scanned at least the requested number of pages to * reclaim or if the lack of progress was due to pages under writeback. * This is used to determine if the scanning priority needs to be raised. */ static bool kswapd_shrink_node(pg_data_t *pgdat, struct scan_control *sc) { struct zone *zone; int z; unsigned long nr_reclaimed = sc->nr_reclaimed; /* Reclaim a number of pages proportional to the number of zones */ sc->nr_to_reclaim = 0; for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) { sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX); } /* * Historically care was taken to put equal pressure on all zones but * now pressure is applied based on node LRU order. */ shrink_node(pgdat, sc); /* * Fragmentation may mean that the system cannot be rebalanced for * high-order allocations. If twice the allocation size has been * reclaimed then recheck watermarks only at order-0 to prevent * excessive reclaim. Assume that a process requested a high-order * can direct reclaim/compact. 
*/ if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order)) sc->order = 0; /* account for progress from mm_account_reclaimed_pages() */ return max(sc->nr_scanned, sc->nr_reclaimed - nr_reclaimed) >= sc->nr_to_reclaim; } /* Page allocator PCP high watermark is lowered if reclaim is active. */ static inline void update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active) { int i; struct zone *zone; for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) { if (active) set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); else clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); } } static inline void set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) { update_reclaim_active(pgdat, highest_zoneidx, true); } static inline void clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) { update_reclaim_active(pgdat, highest_zoneidx, false); } /* * For kswapd, balance_pgdat() will reclaim pages across a node from zones * that are eligible for use by the caller until at least one zone is * balanced. * * Returns the order kswapd finished reclaiming at. * * kswapd scans the zones in the highmem->normal->dma direction. It skips * zones which have free_pages > high_wmark_pages(zone), but once a zone is * found to have free_pages <= high_wmark_pages(zone), any page in that zone * or lower is eligible for reclaim until at least one usable zone is * balanced. */ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) { int i; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; unsigned long pflags; unsigned long nr_boost_reclaim; unsigned long zone_boosts[MAX_NR_ZONES] = { 0, }; bool boosted; struct zone *zone; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .order = order, .may_unmap = 1, }; set_task_reclaim_state(current, &sc.reclaim_state); psi_memstall_enter(&pflags); __fs_reclaim_acquire(_THIS_IP_); count_vm_event(PAGEOUTRUN); /* * Account for the reclaim boost. Note that the zone boost is left in * place so that parallel allocations that are near the watermark will * stall or direct reclaim until kswapd is finished. */ nr_boost_reclaim = 0; for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) { nr_boost_reclaim += zone->watermark_boost; zone_boosts[i] = zone->watermark_boost; } boosted = nr_boost_reclaim; restart: set_reclaim_active(pgdat, highest_zoneidx); sc.priority = DEF_PRIORITY; do { unsigned long nr_reclaimed = sc.nr_reclaimed; bool raise_priority = true; bool balanced; bool ret; bool was_frozen; sc.reclaim_idx = highest_zoneidx; /* * If the number of buffer_heads exceeds the maximum allowed * then consider reclaiming from all zones. This has a dual * purpose -- on 64-bit systems it is expected that * buffer_heads are stripped during active rotation. On 32-bit * systems, highmem pages can pin lowmem memory and shrinking * buffers can relieve lowmem pressure. Reclaim may still not * go ahead if all eligible zones for the original allocation * request are balanced to avoid excessive reclaim from kswapd. */ if (buffer_heads_over_limit) { for (i = MAX_NR_ZONES - 1; i >= 0; i--) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; sc.reclaim_idx = i; break; } } /* * If the pgdat is imbalanced then ignore boosting and preserve * the watermarks for a later time and restart. Note that the * zone watermarks will be still reset at the end of balancing * on the grounds that the normal reclaim should be enough to * re-evaluate if boosting is required when kswapd next wakes. 
*/ balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx); if (!balanced && nr_boost_reclaim) { nr_boost_reclaim = 0; goto restart; } /* * If boosting is not active then only reclaim if there are no * eligible zones. Note that sc.reclaim_idx is not used as * buffer_heads_over_limit may have adjusted it. */ if (!nr_boost_reclaim && balanced) goto out; /* Limit the priority of boosting to avoid reclaim writeback */ if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2) raise_priority = false; /* * Do not writeback or swap pages for boosted reclaim. The * intent is to relieve pressure not issue sub-optimal IO * from reclaim context. If no pages are reclaimed, the * reclaim will be aborted. */ sc.may_writepage = !nr_boost_reclaim; sc.may_swap = !nr_boost_reclaim; /* * Do some background aging, to give pages a chance to be * referenced before reclaiming. All pages are rotated * regardless of classzone as this is about consistent aging. */ kswapd_age_node(pgdat, &sc); /* Call soft limit reclaim before calling shrink_node. */ sc.nr_scanned = 0; nr_soft_scanned = 0; nr_soft_reclaimed = memcg1_soft_limit_reclaim(pgdat, sc.order, sc.gfp_mask, &nr_soft_scanned); sc.nr_reclaimed += nr_soft_reclaimed; /* * There should be no need to raise the scanning priority if * enough pages are already being scanned that that high * watermark would be met at 100% efficiency. */ if (kswapd_shrink_node(pgdat, &sc)) raise_priority = false; /* * If the low watermark is met there is no need for processes * to be throttled on pfmemalloc_wait as they should not be * able to safely make forward progress. Wake them */ if (waitqueue_active(&pgdat->pfmemalloc_wait) && allow_direct_reclaim(pgdat)) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ __fs_reclaim_release(_THIS_IP_); ret = kthread_freezable_should_stop(&was_frozen); __fs_reclaim_acquire(_THIS_IP_); if (was_frozen || ret) break; /* * Raise priority if scanning rate is too low or there was no * progress in reclaiming pages */ nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed); /* * If reclaim made no progress for a boost, stop reclaim as * IO cannot be queued and it could be an infinite loop in * extreme circumstances. */ if (nr_boost_reclaim && !nr_reclaimed) break; if (raise_priority || !nr_reclaimed) sc.priority--; } while (sc.priority >= 1); /* * Restart only if it went through the priority loop all the way, * but cache_trim_mode didn't work. */ if (!sc.nr_reclaimed && sc.priority < 1 && !sc.no_cache_trim_mode && sc.cache_trim_mode_failed) { sc.no_cache_trim_mode = 1; goto restart; } /* * If the reclaim was boosted, we might still be far from the * watermark_high at this point. We need to avoid increasing the * failure count to prevent the kswapd thread from stopping. 
*/ if (!sc.nr_reclaimed && !boosted) { int fail_cnt = atomic_inc_return(&pgdat->kswapd_failures); /* kswapd context, low overhead to trace every failure */ trace_mm_vmscan_kswapd_reclaim_fail(pgdat->node_id, fail_cnt); } out: clear_reclaim_active(pgdat, highest_zoneidx); /* If reclaim was boosted, account for the reclaim done in this pass */ if (boosted) { unsigned long flags; for (i = 0; i <= highest_zoneidx; i++) { if (!zone_boosts[i]) continue; /* Increments are under the zone lock */ zone = pgdat->node_zones + i; spin_lock_irqsave(&zone->lock, flags); zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]); spin_unlock_irqrestore(&zone->lock, flags); } /* * As there is now likely space, wakeup kcompact to defragment * pageblocks. */ wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx); } snapshot_refaults(NULL, pgdat); __fs_reclaim_release(_THIS_IP_); psi_memstall_leave(&pflags); set_task_reclaim_state(current, NULL); /* * Return the order kswapd stopped reclaiming at as * prepare_kswapd_sleep() takes it into account. If another caller * entered the allocator slow path while kswapd was awake, order will * remain at the higher level. */ return sc.order; } /* * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is * not a valid index then either kswapd runs for first time or kswapd couldn't * sleep after previous reclaim attempt (node is still unbalanced). In that * case return the zone index of the previous kswapd reclaim cycle. */ static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat, enum zone_type prev_highest_zoneidx) { enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx; } static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, unsigned int highest_zoneidx) { long remaining = 0; DEFINE_WAIT(wait); if (freezing(current) || kthread_should_stop()) return; prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); /* * Try to sleep for a short interval. Note that kcompactd will only be * woken if it is possible to sleep for a short interval. This is * deliberate on the assumption that if reclaim cannot keep an * eligible zone balanced that it's also unlikely that compaction will * succeed. */ if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { /* * Compaction records what page blocks it recently failed to * isolate pages from and skips them in the future scanning. * When kswapd is going to sleep, it is reasonable to assume * that pages and compaction may succeed so reset the cache. */ reset_isolation_suitable(pgdat); /* * We have freed the memory, now we should compact it to make * allocation of the requested order possible. */ wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx); remaining = schedule_timeout(HZ/10); /* * If woken prematurely then reset kswapd_highest_zoneidx and * order. The values will either be from a wakeup request or * the previous request that slept prematurely. */ if (remaining) { WRITE_ONCE(pgdat->kswapd_highest_zoneidx, kswapd_highest_zoneidx(pgdat, highest_zoneidx)); if (READ_ONCE(pgdat->kswapd_order) < reclaim_order) WRITE_ONCE(pgdat->kswapd_order, reclaim_order); } finish_wait(&pgdat->kswapd_wait, &wait); prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); } /* * After a short sleep, check if it was a premature sleep. If not, then * go fully to sleep until explicitly woken up. 
*/ if (!remaining && prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* * vmstat counters are not perfectly accurate and the estimated * value for counters such as NR_FREE_PAGES can deviate from the * true value by nr_online_cpus * threshold. To avoid the zone * watermarks being breached while under pressure, we reduce the * per-cpu vmstat threshold while kswapd is awake and restore * them before going back to sleep. */ set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); if (!kthread_should_stop()) schedule(); set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); } else { if (remaining) count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); else count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); } finish_wait(&pgdat->kswapd_wait, &wait); } /* * The background pageout daemon, started as a kernel thread * from the init process. * * This basically trickles out pages so that we have _some_ * free memory available even if there is no other activity * that frees anything up. This is needed for things like routing * etc, where we otherwise might have all activity going on in * asynchronous contexts that cannot page things out. * * If there are applications that are active memory-allocators * (most normal use), this basically shouldn't matter. */ static int kswapd(void *p) { unsigned int alloc_order, reclaim_order; unsigned int highest_zoneidx = MAX_NR_ZONES - 1; pg_data_t *pgdat = (pg_data_t *)p; struct task_struct *tsk = current; /* * Tell the memory management that we're a "memory allocator", * and that if we need more memory we should get access to it * regardless (see "__alloc_pages()"). "kswapd" should * never get caught in the normal page freeing logic. * * (Kswapd normally doesn't need memory anyway, but sometimes * you need a small amount of memory in order to be able to * page out something else, and this flag essentially protects * us from recursively trying to free more memory as we're * trying to free the first piece of memory in the first place). */ tsk->flags |= PF_MEMALLOC | PF_KSWAPD; set_freezable(); WRITE_ONCE(pgdat->kswapd_order, 0); WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); atomic_set(&pgdat->nr_writeback_throttled, 0); for ( ; ; ) { bool was_frozen; alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); highest_zoneidx = kswapd_highest_zoneidx(pgdat, highest_zoneidx); kswapd_try_sleep: kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, highest_zoneidx); /* Read the new order and highest_zoneidx */ alloc_order = READ_ONCE(pgdat->kswapd_order); highest_zoneidx = kswapd_highest_zoneidx(pgdat, highest_zoneidx); WRITE_ONCE(pgdat->kswapd_order, 0); WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); if (kthread_freezable_should_stop(&was_frozen)) break; /* * We can speed up thawing tasks if we don't call balance_pgdat * after returning from the refrigerator */ if (was_frozen) continue; /* * Reclaim begins at the requested order but if a high-order * reclaim fails then kswapd falls back to reclaiming for * order-0. If that happens, kswapd will consider sleeping * for the order it finished reclaiming at (reclaim_order) * but kcompactd is woken to compact for the original * request (alloc_order). 
*/ trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx, alloc_order); reclaim_order = balance_pgdat(pgdat, alloc_order, highest_zoneidx); if (reclaim_order < alloc_order) goto kswapd_try_sleep; } tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD); return 0; } /* * A zone is low on free memory or too fragmented for high-order memory. If * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's * pgdat. It will wake up kcompactd after reclaiming memory. If kswapd reclaim * has failed or is not needed, still wake up kcompactd if only compaction is * needed. */ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, enum zone_type highest_zoneidx) { pg_data_t *pgdat; enum zone_type curr_idx; if (!managed_zone(zone)) return; if (!cpuset_zone_allowed(zone, gfp_flags)) return; pgdat = zone->zone_pgdat; curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx) WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx); if (READ_ONCE(pgdat->kswapd_order) < order) WRITE_ONCE(pgdat->kswapd_order, order); if (!waitqueue_active(&pgdat->kswapd_wait)) return; /* Hopeless node, leave it to direct reclaim if possible */ if (kswapd_test_hopeless(pgdat) || (pgdat_balanced(pgdat, order, highest_zoneidx) && !pgdat_watermark_boosted(pgdat, highest_zoneidx))) { /* * There may be plenty of free memory available, but it's too * fragmented for high-order allocations. Wake up kcompactd * and rely on compaction_suitable() to determine if it's * needed. If it fails, it will defer subsequent attempts to * ratelimit its work. */ if (!(gfp_flags & __GFP_DIRECT_RECLAIM)) wakeup_kcompactd(pgdat, order, highest_zoneidx); return; } trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order, gfp_flags); wake_up_interruptible(&pgdat->kswapd_wait); } void kswapd_clear_hopeless(pg_data_t *pgdat, enum kswapd_clear_hopeless_reason reason) { /* Only trace actual resets, not redundant zero-to-zero */ if (atomic_xchg(&pgdat->kswapd_failures, 0)) trace_mm_vmscan_kswapd_clear_hopeless(pgdat->node_id, reason); } /* * Reset kswapd_failures only when the node is balanced. Without this * check, successful direct reclaim (e.g., from cgroup memory.high * throttling) can keep resetting kswapd_failures even when the node * cannot be balanced, causing kswapd to run endlessly. */ void kswapd_try_clear_hopeless(struct pglist_data *pgdat, unsigned int order, int highest_zoneidx) { if (pgdat_balanced(pgdat, order, highest_zoneidx)) kswapd_clear_hopeless(pgdat, current_is_kswapd() ? KSWAPD_CLEAR_HOPELESS_KSWAPD : KSWAPD_CLEAR_HOPELESS_DIRECT); } bool kswapd_test_hopeless(pg_data_t *pgdat) { return atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES; } #ifdef CONFIG_HIBERNATION /* * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of * freed pages. 
* * Rather than trying to age LRUs the aim is to preserve the overall * LRU order by reclaiming preferentially * inactive > active > active referenced > active mapped */ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) { struct scan_control sc = { .nr_to_reclaim = nr_to_reclaim, .gfp_mask = GFP_HIGHUSER_MOVABLE, .reclaim_idx = MAX_NR_ZONES - 1, .priority = DEF_PRIORITY, .may_writepage = 1, .may_unmap = 1, .may_swap = 1, .hibernation_mode = 1, }; struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); unsigned long nr_reclaimed; unsigned int noreclaim_flag; fs_reclaim_acquire(sc.gfp_mask); noreclaim_flag = memalloc_noreclaim_save(); set_task_reclaim_state(current, &sc.reclaim_state); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); set_task_reclaim_state(current, NULL); memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(sc.gfp_mask); return nr_reclaimed; } #endif /* CONFIG_HIBERNATION */ /* * This kswapd start function will be called by init and node-hot-add. */ void __meminit kswapd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); pgdat_kswapd_lock(pgdat); if (!pgdat->kswapd) { pgdat->kswapd = kthread_create_on_node(kswapd, pgdat, nid, "kswapd%d", nid); if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ pr_err("Failed to start kswapd on node %d, ret=%pe\n", nid, pgdat->kswapd); BUG_ON(system_state < SYSTEM_RUNNING); pgdat->kswapd = NULL; } else { wake_up_process(pgdat->kswapd); } } pgdat_kswapd_unlock(pgdat); } /* * Called by memory hotplug when all memory in a node is offlined. Caller must * be holding mem_hotplug_begin/done(). */ void __meminit kswapd_stop(int nid) { pg_data_t *pgdat = NODE_DATA(nid); struct task_struct *kswapd; pgdat_kswapd_lock(pgdat); kswapd = pgdat->kswapd; if (kswapd) { kthread_stop(kswapd); pgdat->kswapd = NULL; } pgdat_kswapd_unlock(pgdat); } static const struct ctl_table vmscan_sysctl_table[] = { { .procname = "swappiness", .data = &vm_swappiness, .maxlen = sizeof(vm_swappiness), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO_HUNDRED, }, #ifdef CONFIG_NUMA { .procname = "zone_reclaim_mode", .data = &node_reclaim_mode, .maxlen = sizeof(node_reclaim_mode), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, } #endif }; static int __init kswapd_init(void) { int nid; swap_setup(); for_each_node_state(nid, N_MEMORY) kswapd_run(nid); register_sysctl_init("vm", vmscan_sysctl_table); return 0; } module_init(kswapd_init) #ifdef CONFIG_NUMA /* * Node reclaim mode * * If non-zero call node_reclaim when the number of free pages falls below * the watermarks. */ int node_reclaim_mode __read_mostly; /* * Priority for NODE_RECLAIM. This determines the fraction of pages * of a node considered for each zone_reclaim. 4 scans 1/16th of * a zone. */ #define NODE_RECLAIM_PRIORITY 4 /* * Percentage of pages in a zone that must be unmapped for node_reclaim to * occur. */ int sysctl_min_unmapped_ratio = 1; /* * If the number of slab pages in a zone grows beyond this percentage then * slab reclaim needs to occur. 
*/ int sysctl_min_slab_ratio = 5; static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat) { unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED); unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) + node_page_state(pgdat, NR_ACTIVE_FILE); /* * It's possible for there to be more file mapped pages than * accounted for by the pages on the file LRU lists because * tmpfs pages accounted for as ANON can also be FILE_MAPPED */ return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; } /* Work out how many page cache pages we can reclaim in this reclaim_mode */ static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat) { unsigned long nr_pagecache_reclaimable; unsigned long delta = 0; /* * If RECLAIM_UNMAP is set, then all file pages are considered * potentially reclaimable. Otherwise, we have to worry about * pages like swapcache and node_unmapped_file_pages() provides * a better estimate */ if (node_reclaim_mode & RECLAIM_UNMAP) nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES); else nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat); /* * Since we can't clean folios through reclaim, remove dirty file * folios from consideration. */ delta += node_page_state(pgdat, NR_FILE_DIRTY); /* Watch for any possible underflows due to delta */ if (unlikely(delta > nr_pagecache_reclaimable)) delta = nr_pagecache_reclaimable; return nr_pagecache_reclaimable - delta; } /* * Try to free up some pages from this node through reclaim. */ static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned long nr_pages, struct scan_control *sc) { struct task_struct *p = current; unsigned int noreclaim_flag; unsigned long pflags; trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, sc->order, sc->gfp_mask); cond_resched(); psi_memstall_enter(&pflags); delayacct_freepages_start(); fs_reclaim_acquire(sc->gfp_mask); /* * We need to be able to allocate from the reserves for RECLAIM_UNMAP */ noreclaim_flag = memalloc_noreclaim_save(); set_task_reclaim_state(p, &sc->reclaim_state); if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages || node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) { /* * Free memory by calling shrink node with increasing * priorities until we have enough memory freed. */ do { shrink_node(pgdat, sc); } while (sc->nr_reclaimed < nr_pages && --sc->priority >= 0); } set_task_reclaim_state(p, NULL); memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(sc->gfp_mask); delayacct_freepages_end(); psi_memstall_leave(&pflags); trace_mm_vmscan_node_reclaim_end(sc->nr_reclaimed, 0); return sc->nr_reclaimed; } int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) { int ret; /* Minimum pages needed in order to stay on node */ const unsigned long nr_pages = 1 << order; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = current_gfp_context(gfp_mask), .order = order, .priority = NODE_RECLAIM_PRIORITY, .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), .may_swap = 1, .reclaim_idx = gfp_zone(gfp_mask), }; /* * Node reclaim reclaims unmapped file backed pages and * slab pages if we are over the defined limits. * * A small portion of unmapped file backed pages is needed for * file I/O otherwise pages read by file I/O will be immediately * thrown out if the node is overallocated. 
So we do not reclaim * if less than a specified percentage of the node is used by * unmapped file backed pages. */ if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages && node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <= pgdat->min_slab_pages) return NODE_RECLAIM_FULL; /* * Do not scan if the allocation should not be delayed. */ if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC)) return NODE_RECLAIM_NOSCAN; /* * Only run node reclaim on the local node or on nodes that do not * have associated processors. This will favor the local processor * over remote processors and spread off node memory allocations * as wide as possible. */ if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id()) return NODE_RECLAIM_NOSCAN; if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) return NODE_RECLAIM_NOSCAN; ret = __node_reclaim(pgdat, gfp_mask, nr_pages, &sc) >= nr_pages; clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags); if (ret) count_vm_event(PGSCAN_ZONE_RECLAIM_SUCCESS); else count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); return ret; } #else static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned long nr_pages, struct scan_control *sc) { return 0; } #endif enum { MEMORY_RECLAIM_SWAPPINESS = 0, MEMORY_RECLAIM_SWAPPINESS_MAX, MEMORY_RECLAIM_NULL, }; static const match_table_t tokens = { { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"}, { MEMORY_RECLAIM_SWAPPINESS_MAX, "swappiness=max"}, { MEMORY_RECLAIM_NULL, NULL }, }; int user_proactive_reclaim(char *buf, struct mem_cgroup *memcg, pg_data_t *pgdat) { unsigned int nr_retries = MAX_RECLAIM_RETRIES; unsigned long nr_to_reclaim, nr_reclaimed = 0; int swappiness = -1; char *old_buf, *start; substring_t args[MAX_OPT_ARGS]; gfp_t gfp_mask = GFP_KERNEL; if (!buf || (!memcg && !pgdat) || (memcg && pgdat)) return -EINVAL; buf = strstrip(buf); old_buf = buf; nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE; if (buf == old_buf) return -EINVAL; buf = strstrip(buf); while ((start = strsep(&buf, " ")) != NULL) { if (!strlen(start)) continue; switch (match_token(start, tokens, args)) { case MEMORY_RECLAIM_SWAPPINESS: if (match_int(&args[0], &swappiness)) return -EINVAL; if (swappiness < MIN_SWAPPINESS || swappiness > MAX_SWAPPINESS) return -EINVAL; break; case MEMORY_RECLAIM_SWAPPINESS_MAX: swappiness = SWAPPINESS_ANON_ONLY; break; default: return -EINVAL; } } while (nr_reclaimed < nr_to_reclaim) { /* Will converge on zero, but reclaim enforces a minimum */ unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4; unsigned long reclaimed; if (signal_pending(current)) return -EINTR; /* * This is the final attempt, drain percpu lru caches in the * hope of introducing more evictable pages. */ if (!nr_retries) lru_add_drain_all(); if (memcg) { unsigned int reclaim_options; reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; reclaimed = try_to_free_mem_cgroup_pages(memcg, batch_size, gfp_mask, reclaim_options, swappiness == -1 ? NULL : &swappiness); } else { struct scan_control sc = { .gfp_mask = current_gfp_context(gfp_mask), .reclaim_idx = gfp_zone(gfp_mask), .proactive_swappiness = swappiness == -1 ? 
NULL : &swappiness, .priority = DEF_PRIORITY, .may_writepage = 1, .nr_to_reclaim = max(batch_size, SWAP_CLUSTER_MAX), .may_unmap = 1, .may_swap = 1, .proactive = 1, }; if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) return -EBUSY; reclaimed = __node_reclaim(pgdat, gfp_mask, batch_size, &sc); clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags); } if (!reclaimed && !nr_retries--) return -EAGAIN; nr_reclaimed += reclaimed; } return 0; } /** * check_move_unevictable_folios - Move evictable folios to appropriate zone * lru list * @fbatch: Batch of lru folios to check. * * Checks folios for evictability, if an evictable folio is in the unevictable * lru list, moves it to the appropriate evictable lru list. This function * should be only used for lru folios. */ void check_move_unevictable_folios(struct folio_batch *fbatch) { struct lruvec *lruvec = NULL; int pgscanned = 0; int pgrescued = 0; int i; for (i = 0; i < fbatch->nr; i++) { struct folio *folio = fbatch->folios[i]; int nr_pages = folio_nr_pages(folio); pgscanned += nr_pages; /* block memcg migration while the folio moves between lrus */ if (!folio_test_clear_lru(folio)) continue; lruvec = folio_lruvec_relock_irq(folio, lruvec); if (folio_evictable(folio) && folio_test_unevictable(folio)) { lruvec_del_folio(lruvec, folio); folio_clear_unevictable(folio); lruvec_add_folio(lruvec, folio); pgrescued += nr_pages; } folio_set_lru(folio); } if (lruvec) { __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); lruvec_unlock_irq(lruvec); } else if (pgscanned) { count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); } } EXPORT_SYMBOL_GPL(check_move_unevictable_folios); #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) static ssize_t reclaim_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int ret, nid = dev->id; ret = user_proactive_reclaim((char *)buf, NULL, NODE_DATA(nid)); return ret ? -EAGAIN : count; } static DEVICE_ATTR_WO(reclaim); int reclaim_register_node(struct node *node) { return device_create_file(&node->dev, &dev_attr_reclaim); } void reclaim_unregister_node(struct node *node) { return device_remove_file(&node->dev, &dev_attr_reclaim); } #endif |
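/*
 * Illustrative user-space sketch, not part of the kernel source: the per-node
 * "reclaim" attribute registered by reclaim_register_node() above hands its
 * input to user_proactive_reclaim(), which parses a byte count optionally
 * followed by "swappiness=<0..200>" or "swappiness=max". The sysfs path used
 * below and the presence of the attribute (CONFIG_SYSFS && CONFIG_NUMA) are
 * assumptions for the example, not guarantees.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/devices/system/node/node0/reclaim";	/* assumed node-device location */
	const char *req = "256M swappiness=10";	/* format accepted by user_proactive_reclaim() */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* reclaim_store() folds any failure (no progress, bad input) into -EAGAIN. */
	if (write(fd, req, strlen(req)) < 0)
		perror("write");
	close(fd);
	return 0;
}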
// SPDX-License-Identifier: GPL-2.0+ /* * NILFS inode file * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * * Written by Amagai Yoshiji. * Revised by Ryusuke Konishi. * */ #include <linux/types.h> #include <linux/buffer_head.h> #include "nilfs.h" #include "mdt.h" #include "alloc.h" #include "ifile.h" #include "cpfile.h" /** * struct nilfs_ifile_info - on-memory private data of ifile * @mi: on-memory private data of metadata file * @palloc_cache: persistent object allocator cache of ifile */ struct nilfs_ifile_info { struct nilfs_mdt_info mi; struct nilfs_palloc_cache palloc_cache; }; static inline struct nilfs_ifile_info *NILFS_IFILE_I(struct inode *ifile) { return (struct nilfs_ifile_info *)NILFS_MDT(ifile); } /** * nilfs_ifile_create_inode - create a new disk inode * @ifile: ifile inode * @out_ino: pointer to a variable to store inode number * @out_bh: buffer_head contains newly allocated disk inode * * nilfs_ifile_create_inode() allocates a new inode in the ifile metadata * file and stores the inode number in the variable pointed to by @out_ino, * as well as storing the ifile's buffer with the disk inode in the location * pointed to by @out_bh. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. * * %-ENOSPC - No inode left. */ int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino, struct buffer_head **out_bh) { struct nilfs_palloc_req req; int ret; req.pr_entry_nr = NILFS_FIRST_INO(ifile->i_sb); req.pr_entry_bh = NULL; ret = nilfs_palloc_prepare_alloc_entry(ifile, &req, false); if (!ret) { ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 1, &req.pr_entry_bh); if (ret < 0) nilfs_palloc_abort_alloc_entry(ifile, &req); } if (ret < 0) { brelse(req.pr_entry_bh); return ret; } nilfs_palloc_commit_alloc_entry(ifile, &req); mark_buffer_dirty(req.pr_entry_bh); nilfs_mdt_mark_dirty(ifile); *out_ino = (ino_t)req.pr_entry_nr; *out_bh = req.pr_entry_bh; return 0; } /** * nilfs_ifile_delete_inode - delete a disk inode * @ifile: ifile inode * @ino: inode number * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOENT - Inode number unallocated. * * %-ENOMEM - Insufficient memory available.
*/ int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino) { struct nilfs_palloc_req req = { .pr_entry_nr = ino, .pr_entry_bh = NULL }; struct nilfs_inode *raw_inode; size_t offset; int ret; ret = nilfs_palloc_prepare_free_entry(ifile, &req); if (!ret) { ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 0, &req.pr_entry_bh); if (ret < 0) nilfs_palloc_abort_free_entry(ifile, &req); } if (ret < 0) { brelse(req.pr_entry_bh); return ret; } offset = nilfs_palloc_entry_offset(ifile, req.pr_entry_nr, req.pr_entry_bh); raw_inode = kmap_local_folio(req.pr_entry_bh->b_folio, offset); raw_inode->i_flags = 0; kunmap_local(raw_inode); mark_buffer_dirty(req.pr_entry_bh); brelse(req.pr_entry_bh); nilfs_palloc_commit_free_entry(ifile, &req); return 0; } int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino, struct buffer_head **out_bh) { struct super_block *sb = ifile->i_sb; int err; if (unlikely(!NILFS_VALID_INODE(sb, ino))) { nilfs_error(sb, "bad inode number: %lu", (unsigned long)ino); return -EINVAL; } err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh); if (unlikely(err)) nilfs_warn(sb, "error %d reading inode: ino=%lu", err, (unsigned long)ino); return err; } /** * nilfs_ifile_count_free_inodes - calculate free inodes count * @ifile: ifile inode * @nmaxinodes: current maximum of available inodes count [out] * @nfreeinodes: free inodes count [out] * * Return: 0 on success, or a negative error code on failure. */ int nilfs_ifile_count_free_inodes(struct inode *ifile, u64 *nmaxinodes, u64 *nfreeinodes) { u64 nused; int err; *nmaxinodes = 0; *nfreeinodes = 0; nused = atomic64_read(&NILFS_I(ifile)->i_root->inodes_count); err = nilfs_palloc_count_max_entries(ifile, nused, nmaxinodes); if (likely(!err)) *nfreeinodes = *nmaxinodes - nused; return err; } /** * nilfs_ifile_read - read or get ifile inode * @sb: super block instance * @root: root object * @cno: number of checkpoint entry to read * @inode_size: size of an inode * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - Invalid checkpoint. * * %-ENOMEM - Insufficient memory available. * * %-EIO - I/O error (including metadata corruption). */ int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root, __u64 cno, size_t inode_size) { struct the_nilfs *nilfs; struct inode *ifile; int err; ifile = nilfs_iget_locked(sb, root, NILFS_IFILE_INO); if (unlikely(!ifile)) return -ENOMEM; if (!(inode_state_read_once(ifile) & I_NEW)) goto out; err = nilfs_mdt_init(ifile, NILFS_MDT_GFP, sizeof(struct nilfs_ifile_info)); if (err) goto failed; err = nilfs_palloc_init_blockgroup(ifile, inode_size); if (err) goto failed; nilfs_palloc_setup_cache(ifile, &NILFS_IFILE_I(ifile)->palloc_cache); nilfs = sb->s_fs_info; err = nilfs_cpfile_read_checkpoint(nilfs->ns_cpfile, cno, root, ifile); if (err) goto failed; unlock_new_inode(ifile); out: return 0; failed: iget_failed(ifile); return err; } |
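/*
 * The create and delete paths above share the persistent-allocator idiom:
 * prepare the entry, fetch its block, then commit on success or abort on
 * failure. The stand-alone sketch below mirrors only that control flow;
 * prepare_entry(), get_block(), commit_entry() and abort_entry() are
 * hypothetical stand-ins for illustration, not NILFS interfaces.
 */
#include <errno.h>
#include <stdio.h>

static int prepare_entry(void) { return 0; }			/* reserve the entry */
static int get_block(int fail) { return fail ? -EIO : 0; }	/* read the entry block */
static void commit_entry(void) { puts("committed"); }		/* make the change permanent */
static void abort_entry(void)  { puts("aborted"); }		/* undo the reservation */

static int create_entry(int simulate_io_error)
{
	int ret = prepare_entry();

	if (!ret) {
		ret = get_block(simulate_io_error);
		if (ret < 0)
			abort_entry();
	}
	if (ret < 0)
		return ret;

	commit_entry();
	return 0;
}

int main(void)
{
	printf("ok path: %d\n", create_entry(0));
	printf("error path: %d\n", create_entry(1));
	return 0;
}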
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/fcntl.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/syscalls.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/sched/task.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/file.h> #include <linux/capability.h> #include <linux/dnotify.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/pipe_fs_i.h> #include <linux/security.h> #include <linux/ptrace.h> #include <linux/signal.h> #include <linux/rcupdate.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <linux/memfd.h> #include <linux/compat.h> #include <linux/mount.h> #include <linux/rw_hint.h> #include <linux/poll.h> #include <asm/siginfo.h> #include <linux/uaccess.h> #include "internal.h" #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) static int setfl(int fd, struct file * filp, unsigned int arg) { struct inode * inode = file_inode(filp); int error = 0; /* * O_APPEND cannot be cleared if the file is marked as append-only * and the file is open for write. */ if (((arg ^ filp->f_flags) & O_APPEND) && IS_APPEND(inode)) return -EPERM; /* O_NOATIME can only be set by the owner or superuser */ if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME)) if (!inode_owner_or_capable(file_mnt_idmap(filp), inode)) return -EPERM; /* required for strict SunOS emulation */ if (O_NONBLOCK != O_NDELAY) if (arg & O_NDELAY) arg |= O_NONBLOCK; /* Pipe packetized mode is controlled by O_DIRECT flag */ if (!S_ISFIFO(inode->i_mode) && (arg & O_DIRECT) && !(filp->f_mode & FMODE_CAN_ODIRECT)) return -EINVAL; if (filp->f_op->check_flags) error = filp->f_op->check_flags(arg); if (error) return error; /* * ->fasync() is responsible for setting the FASYNC bit. */ if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op->fasync) { error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0); if (error < 0) goto out; if (error > 0) error = 0; } spin_lock(&filp->f_lock); filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK); filp->f_iocb_flags = iocb_flags(filp); spin_unlock(&filp->f_lock); out: return error; } /* * Allocate an file->f_owner struct if it doesn't exist, handling racing * allocations correctly.
*/ int file_f_owner_allocate(struct file *file) { struct fown_struct *f_owner; f_owner = file_f_owner(file); if (f_owner) return 0; f_owner = kzalloc_obj(struct fown_struct); if (!f_owner) return -ENOMEM; rwlock_init(&f_owner->lock); f_owner->file = file; /* If someone else raced us, drop our allocation. */ if (unlikely(cmpxchg(&file->f_owner, NULL, f_owner))) kfree(f_owner); return 0; } EXPORT_SYMBOL(file_f_owner_allocate); void file_f_owner_release(struct file *file) { struct fown_struct *f_owner; f_owner = file_f_owner(file); if (f_owner) { put_pid(f_owner->pid); kfree(f_owner); } } void __f_setown(struct file *filp, struct pid *pid, enum pid_type type, int force) { struct fown_struct *f_owner; f_owner = file_f_owner(filp); if (WARN_ON_ONCE(!f_owner)) return; write_lock_irq(&f_owner->lock); if (force || !f_owner->pid) { put_pid(f_owner->pid); f_owner->pid = get_pid(pid); f_owner->pid_type = type; if (pid) { const struct cred *cred = current_cred(); security_file_set_fowner(filp); f_owner->uid = cred->uid; f_owner->euid = cred->euid; } } write_unlock_irq(&f_owner->lock); } EXPORT_SYMBOL(__f_setown); int f_setown(struct file *filp, int who, int force) { enum pid_type type; struct pid *pid = NULL; int ret = 0; might_sleep(); type = PIDTYPE_TGID; if (who < 0) { /* avoid overflow below */ if (who == INT_MIN) return -EINVAL; type = PIDTYPE_PGID; who = -who; } ret = file_f_owner_allocate(filp); if (ret) return ret; rcu_read_lock(); if (who) { pid = find_vpid(who); if (!pid) ret = -ESRCH; } if (!ret) __f_setown(filp, pid, type, force); rcu_read_unlock(); return ret; } EXPORT_SYMBOL(f_setown); void f_delown(struct file *filp) { __f_setown(filp, NULL, PIDTYPE_TGID, 1); } pid_t f_getown(struct file *filp) { pid_t pid = 0; struct fown_struct *f_owner; f_owner = file_f_owner(filp); if (!f_owner) return pid; read_lock_irq(&f_owner->lock); rcu_read_lock(); if (pid_task(f_owner->pid, f_owner->pid_type)) { pid = pid_vnr(f_owner->pid); if (f_owner->pid_type == PIDTYPE_PGID) pid = -pid; } rcu_read_unlock(); read_unlock_irq(&f_owner->lock); return pid; } static int f_setown_ex(struct file *filp, unsigned long arg) { struct f_owner_ex __user *owner_p = (void __user *)arg; struct f_owner_ex owner; struct pid *pid; int type; int ret; ret = copy_from_user(&owner, owner_p, sizeof(owner)); if (ret) return -EFAULT; switch (owner.type) { case F_OWNER_TID: type = PIDTYPE_PID; break; case F_OWNER_PID: type = PIDTYPE_TGID; break; case F_OWNER_PGRP: type = PIDTYPE_PGID; break; default: return -EINVAL; } ret = file_f_owner_allocate(filp); if (ret) return ret; rcu_read_lock(); pid = find_vpid(owner.pid); if (owner.pid && !pid) ret = -ESRCH; else __f_setown(filp, pid, type, 1); rcu_read_unlock(); return ret; } static int f_getown_ex(struct file *filp, unsigned long arg) { struct f_owner_ex __user *owner_p = (void __user *)arg; struct f_owner_ex owner = {}; int ret = 0; struct fown_struct *f_owner; enum pid_type pid_type = PIDTYPE_PID; f_owner = file_f_owner(filp); if (f_owner) { read_lock_irq(&f_owner->lock); rcu_read_lock(); if (pid_task(f_owner->pid, f_owner->pid_type)) owner.pid = pid_vnr(f_owner->pid); rcu_read_unlock(); pid_type = f_owner->pid_type; } switch (pid_type) { case PIDTYPE_PID: owner.type = F_OWNER_TID; break; case PIDTYPE_TGID: owner.type = F_OWNER_PID; break; case PIDTYPE_PGID: owner.type = F_OWNER_PGRP; break; default: WARN_ON(1); ret = -EINVAL; break; } if (f_owner) read_unlock_irq(&f_owner->lock); if (!ret) { ret = copy_to_user(owner_p, &owner, sizeof(owner)); if (ret) ret = -EFAULT; } return ret; } 
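/*
 * Illustrative user-space sketch, not part of this file: the owner recorded by
 * the F_SETOWN/F_SETOWN_EX paths above is the target that send_sigio() signals
 * later. A minimal way to arm signal-driven I/O on a descriptor looks like:
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <unistd.h>

static void on_sigio(int sig) { (void)sig; /* async-signal-safe work only */ }

static int enable_sigio(int fd)
{
	struct sigaction sa = { .sa_handler = on_sigio };

	sigemptyset(&sa.sa_mask);
	if (sigaction(SIGIO, &sa, NULL) < 0)
		return -1;
	/* Deliver to this whole process; F_SETOWN_EX can target a single thread. */
	if (fcntl(fd, F_SETOWN, getpid()) < 0)
		return -1;
	/* Setting O_ASYNC goes through setfl() -> ->fasync() to arm SIGIO generation. */
	return fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC);
}

int main(void)
{
	return enable_sigio(STDIN_FILENO) ? 1 : 0;
}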
#ifdef CONFIG_CHECKPOINT_RESTORE static int f_getowner_uids(struct file *filp, unsigned long arg) { struct user_namespace *user_ns = current_user_ns(); struct fown_struct *f_owner; uid_t __user *dst = (void __user *)arg; uid_t src[2] = {0, 0}; int err; f_owner = file_f_owner(filp); if (f_owner) { read_lock_irq(&f_owner->lock); src[0] = from_kuid(user_ns, f_owner->uid); src[1] = from_kuid(user_ns, f_owner->euid); read_unlock_irq(&f_owner->lock); } err = put_user(src[0], &dst[0]); err |= put_user(src[1], &dst[1]); return err; } #else static int f_getowner_uids(struct file *filp, unsigned long arg) { return -EINVAL; } #endif static bool rw_hint_valid(u64 hint) { BUILD_BUG_ON(WRITE_LIFE_NOT_SET != RWH_WRITE_LIFE_NOT_SET); BUILD_BUG_ON(WRITE_LIFE_NONE != RWH_WRITE_LIFE_NONE); BUILD_BUG_ON(WRITE_LIFE_SHORT != RWH_WRITE_LIFE_SHORT); BUILD_BUG_ON(WRITE_LIFE_MEDIUM != RWH_WRITE_LIFE_MEDIUM); BUILD_BUG_ON(WRITE_LIFE_LONG != RWH_WRITE_LIFE_LONG); BUILD_BUG_ON(WRITE_LIFE_EXTREME != RWH_WRITE_LIFE_EXTREME); switch (hint) { case RWH_WRITE_LIFE_NOT_SET: case RWH_WRITE_LIFE_NONE: case RWH_WRITE_LIFE_SHORT: case RWH_WRITE_LIFE_MEDIUM: case RWH_WRITE_LIFE_LONG: case RWH_WRITE_LIFE_EXTREME: return true; default: return false; } } static long fcntl_get_rw_hint(struct file *file, unsigned long arg) { struct inode *inode = file_inode(file); u64 __user *argp = (u64 __user *)arg; u64 hint = READ_ONCE(inode->i_write_hint); if (copy_to_user(argp, &hint, sizeof(*argp))) return -EFAULT; return 0; } static long fcntl_set_rw_hint(struct file *file, unsigned long arg) { struct inode *inode = file_inode(file); u64 __user *argp = (u64 __user *)arg; u64 hint; if (!inode_owner_or_capable(file_mnt_idmap(file), inode)) return -EPERM; if (copy_from_user(&hint, argp, sizeof(hint))) return -EFAULT; if (!rw_hint_valid(hint)) return -EINVAL; WRITE_ONCE(inode->i_write_hint, hint); /* * file->f_mapping->host may differ from inode. As an example, * blkdev_open() modifies file->f_mapping. */ if (file->f_mapping->host != inode) WRITE_ONCE(file->f_mapping->host->i_write_hint, hint); return 0; } /* Is the file descriptor a dup of the file? */ static long f_dupfd_query(int fd, struct file *filp) { CLASS(fd_raw, f)(fd); if (fd_empty(f)) return -EBADF; /* * We can do the 'fdput()' immediately, as the only thing that * matters is the pointer value which isn't changed by the fdput. * * Technically we didn't need a ref at all, and 'fdget()' was * overkill, but given our lockless file pointer lookup, the * alternatives are complicated. */ return fd_file(f) == filp; } /* Let the caller figure out whether a given file was just created. 
*/ static long f_created_query(const struct file *filp) { return !!(filp->f_mode & FMODE_CREATED); } static int f_owner_sig(struct file *filp, int signum, bool setsig) { int ret = 0; struct fown_struct *f_owner; might_sleep(); if (setsig) { if (!valid_signal(signum)) return -EINVAL; ret = file_f_owner_allocate(filp); if (ret) return ret; } f_owner = file_f_owner(filp); if (setsig) f_owner->signum = signum; else if (f_owner) ret = f_owner->signum; return ret; } static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { void __user *argp = (void __user *)arg; struct delegation deleg; int argi = (int)arg; struct flock flock; long err = -EINVAL; switch (cmd) { case F_CREATED_QUERY: err = f_created_query(filp); break; case F_DUPFD: err = f_dupfd(argi, filp, 0); break; case F_DUPFD_CLOEXEC: err = f_dupfd(argi, filp, O_CLOEXEC); break; case F_DUPFD_QUERY: err = f_dupfd_query(argi, filp); break; case F_GETFD: err = get_close_on_exec(fd) ? FD_CLOEXEC : 0; break; case F_SETFD: err = 0; set_close_on_exec(fd, argi & FD_CLOEXEC); break; case F_GETFL: err = filp->f_flags; break; case F_SETFL: err = setfl(fd, filp, argi); break; #if BITS_PER_LONG != 32 /* 32-bit arches must use fcntl64() */ case F_OFD_GETLK: #endif case F_GETLK: if (copy_from_user(&flock, argp, sizeof(flock))) return -EFAULT; err = fcntl_getlk(filp, cmd, &flock); if (!err && copy_to_user(argp, &flock, sizeof(flock))) return -EFAULT; break; #if BITS_PER_LONG != 32 /* 32-bit arches must use fcntl64() */ case F_OFD_SETLK: case F_OFD_SETLKW: fallthrough; #endif case F_SETLK: case F_SETLKW: if (copy_from_user(&flock, argp, sizeof(flock))) return -EFAULT; err = fcntl_setlk(fd, filp, cmd, &flock); break; case F_GETOWN: /* * XXX If f_owner is a process group, the * negative return value will get converted * into an error. Oops. If we keep the * current syscall conventions, the only way * to fix this will be in libc. 
*/ err = f_getown(filp); force_successful_syscall_return(); break; case F_SETOWN: err = f_setown(filp, argi, 1); break; case F_GETOWN_EX: err = f_getown_ex(filp, arg); break; case F_SETOWN_EX: err = f_setown_ex(filp, arg); break; case F_GETOWNER_UIDS: err = f_getowner_uids(filp, arg); break; case F_GETSIG: err = f_owner_sig(filp, 0, false); break; case F_SETSIG: err = f_owner_sig(filp, argi, true); break; case F_GETLEASE: err = fcntl_getlease(filp); break; case F_SETLEASE: err = fcntl_setlease(fd, filp, argi); break; case F_NOTIFY: err = fcntl_dirnotify(fd, filp, argi); break; case F_SETPIPE_SZ: case F_GETPIPE_SZ: err = pipe_fcntl(filp, cmd, argi); break; case F_ADD_SEALS: case F_GET_SEALS: err = memfd_fcntl(filp, cmd, argi); break; case F_GET_RW_HINT: err = fcntl_get_rw_hint(filp, arg); break; case F_SET_RW_HINT: err = fcntl_set_rw_hint(filp, arg); break; case F_GETDELEG: if (copy_from_user(&deleg, argp, sizeof(deleg))) return -EFAULT; err = fcntl_getdeleg(filp, &deleg); if (!err && copy_to_user(argp, &deleg, sizeof(deleg))) return -EFAULT; break; case F_SETDELEG: if (copy_from_user(&deleg, argp, sizeof(deleg))) return -EFAULT; err = fcntl_setdeleg(fd, filp, &deleg); break; default: break; } return err; } static int check_fcntl_cmd(unsigned cmd) { switch (cmd) { case F_CREATED_QUERY: case F_DUPFD: case F_DUPFD_CLOEXEC: case F_DUPFD_QUERY: case F_GETFD: case F_SETFD: case F_GETFL: return 1; } return 0; } SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { CLASS(fd_raw, f)(fd); long err; if (fd_empty(f)) return -EBADF; if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) { if (!check_fcntl_cmd(cmd)) return -EBADF; } err = security_file_fcntl(fd_file(f), cmd, arg); if (!err) err = do_fcntl(fd, cmd, arg, fd_file(f)); return err; } #if BITS_PER_LONG == 32 SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { void __user *argp = (void __user *)arg; CLASS(fd_raw, f)(fd); struct flock64 flock; long err; if (fd_empty(f)) return -EBADF; if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) { if (!check_fcntl_cmd(cmd)) return -EBADF; } err = security_file_fcntl(fd_file(f), cmd, arg); if (err) return err; switch (cmd) { case F_GETLK64: case F_OFD_GETLK: err = -EFAULT; if (copy_from_user(&flock, argp, sizeof(flock))) break; err = fcntl_getlk64(fd_file(f), cmd, &flock); if (!err && copy_to_user(argp, &flock, sizeof(flock))) err = -EFAULT; break; case F_SETLK64: case F_SETLKW64: case F_OFD_SETLK: case F_OFD_SETLKW: err = -EFAULT; if (copy_from_user(&flock, argp, sizeof(flock))) break; err = fcntl_setlk64(fd, fd_file(f), cmd, &flock); break; default: err = do_fcntl(fd, cmd, arg, fd_file(f)); break; } return err; } #endif #ifdef CONFIG_COMPAT /* careful - don't use anywhere else */ #define copy_flock_fields(dst, src) \ (dst)->l_type = (src)->l_type; \ (dst)->l_whence = (src)->l_whence; \ (dst)->l_start = (src)->l_start; \ (dst)->l_len = (src)->l_len; \ (dst)->l_pid = (src)->l_pid; static int get_compat_flock(struct flock *kfl, const struct compat_flock __user *ufl) { struct compat_flock fl; if (copy_from_user(&fl, ufl, sizeof(struct compat_flock))) return -EFAULT; copy_flock_fields(kfl, &fl); return 0; } static int get_compat_flock64(struct flock *kfl, const struct compat_flock64 __user *ufl) { struct compat_flock64 fl; if (copy_from_user(&fl, ufl, sizeof(struct compat_flock64))) return -EFAULT; copy_flock_fields(kfl, &fl); return 0; } static int put_compat_flock(const struct flock *kfl, struct compat_flock __user *ufl) { struct compat_flock fl; memset(&fl, 
0, sizeof(struct compat_flock)); copy_flock_fields(&fl, kfl); if (copy_to_user(ufl, &fl, sizeof(struct compat_flock))) return -EFAULT; return 0; } static int put_compat_flock64(const struct flock *kfl, struct compat_flock64 __user *ufl) { struct compat_flock64 fl; BUILD_BUG_ON(sizeof(kfl->l_start) > sizeof(ufl->l_start)); BUILD_BUG_ON(sizeof(kfl->l_len) > sizeof(ufl->l_len)); memset(&fl, 0, sizeof(struct compat_flock64)); copy_flock_fields(&fl, kfl); if (copy_to_user(ufl, &fl, sizeof(struct compat_flock64))) return -EFAULT; return 0; } #undef copy_flock_fields static unsigned int convert_fcntl_cmd(unsigned int cmd) { switch (cmd) { case F_GETLK64: return F_GETLK; case F_SETLK64: return F_SETLK; case F_SETLKW64: return F_SETLKW; } return cmd; } /* * GETLK was successful and we need to return the data, but it needs to fit in * the compat structure. * l_start shouldn't be too big, unless the original start + end is greater than * COMPAT_OFF_T_MAX, in which case the app was asking for trouble, so we return * -EOVERFLOW in that case. l_len could be too big, in which case we just * truncate it, and only allow the app to see that part of the conflicting lock * that might make sense to it anyway */ static int fixup_compat_flock(struct flock *flock) { if (flock->l_start > COMPAT_OFF_T_MAX) return -EOVERFLOW; if (flock->l_len > COMPAT_OFF_T_MAX) flock->l_len = COMPAT_OFF_T_MAX; return 0; } static long do_compat_fcntl64(unsigned int fd, unsigned int cmd, compat_ulong_t arg) { CLASS(fd_raw, f)(fd); struct flock flock; long err; if (fd_empty(f)) return -EBADF; if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) { if (!check_fcntl_cmd(cmd)) return -EBADF; } err = security_file_fcntl(fd_file(f), cmd, arg); if (err) return err; switch (cmd) { case F_GETLK: err = get_compat_flock(&flock, compat_ptr(arg)); if (err) break; err = fcntl_getlk(fd_file(f), convert_fcntl_cmd(cmd), &flock); if (err) break; err = fixup_compat_flock(&flock); if (!err) err = put_compat_flock(&flock, compat_ptr(arg)); break; case F_GETLK64: case F_OFD_GETLK: err = get_compat_flock64(&flock, compat_ptr(arg)); if (err) break; err = fcntl_getlk(fd_file(f), convert_fcntl_cmd(cmd), &flock); if (!err) err = put_compat_flock64(&flock, compat_ptr(arg)); break; case F_SETLK: case F_SETLKW: err = get_compat_flock(&flock, compat_ptr(arg)); if (err) break; err = fcntl_setlk(fd, fd_file(f), convert_fcntl_cmd(cmd), &flock); break; case F_SETLK64: case F_SETLKW64: case F_OFD_SETLK: case F_OFD_SETLKW: err = get_compat_flock64(&flock, compat_ptr(arg)); if (err) break; err = fcntl_setlk(fd, fd_file(f), convert_fcntl_cmd(cmd), &flock); break; default: err = do_fcntl(fd, cmd, arg, fd_file(f)); break; } return err; } COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, compat_ulong_t, arg) { return do_compat_fcntl64(fd, cmd, arg); } COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, compat_ulong_t, arg) { switch (cmd) { case F_GETLK64: case F_SETLK64: case F_SETLKW64: case F_OFD_GETLK: case F_OFD_SETLK: case F_OFD_SETLKW: return -EINVAL; } return do_compat_fcntl64(fd, cmd, arg); } #endif /* Table to convert sigio signal codes into poll band bitmaps */ static const __poll_t band_table[NSIGPOLL] = { EPOLLIN | EPOLLRDNORM, /* POLL_IN */ EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND, /* POLL_OUT */ EPOLLIN | EPOLLRDNORM | EPOLLMSG, /* POLL_MSG */ EPOLLERR, /* POLL_ERR */ EPOLLPRI | EPOLLRDBAND, /* POLL_PRI */ EPOLLHUP | EPOLLERR /* POLL_HUP */ }; static inline int sigio_perm(struct task_struct *p, struct fown_struct *fown, int sig) { const 
struct cred *cred; int ret; rcu_read_lock(); cred = __task_cred(p); ret = ((uid_eq(fown->euid, GLOBAL_ROOT_UID) || uid_eq(fown->euid, cred->suid) || uid_eq(fown->euid, cred->uid) || uid_eq(fown->uid, cred->suid) || uid_eq(fown->uid, cred->uid)) && !security_file_send_sigiotask(p, fown, sig)); rcu_read_unlock(); return ret; } static void send_sigio_to_task(struct task_struct *p, struct fown_struct *fown, int fd, int reason, enum pid_type type) { /* * F_SETSIG can change ->signum lockless in parallel, make * sure we read it once and use the same value throughout. */ int signum = READ_ONCE(fown->signum); if (!sigio_perm(p, fown, signum)) return; switch (signum) { default: { kernel_siginfo_t si; /* Queue a rt signal with the appropriate fd as its value. We use SI_SIGIO as the source, not SI_KERNEL, since kernel signals always get delivered even if we can't queue. Failure to queue in this case _should_ be reported; we fall back to SIGIO in that case. --sct */ clear_siginfo(&si); si.si_signo = signum; si.si_errno = 0; si.si_code = reason; /* * Posix definies POLL_IN and friends to be signal * specific si_codes for SIG_POLL. Linux extended * these si_codes to other signals in a way that is * ambiguous if other signals also have signal * specific si_codes. In that case use SI_SIGIO instead * to remove the ambiguity. */ if ((signum != SIGPOLL) && sig_specific_sicodes(signum)) si.si_code = SI_SIGIO; /* Make sure we are called with one of the POLL_* reasons, otherwise we could leak kernel stack into userspace. */ BUG_ON((reason < POLL_IN) || ((reason - POLL_IN) >= NSIGPOLL)); if (reason - POLL_IN >= NSIGPOLL) si.si_band = ~0L; else si.si_band = mangle_poll(band_table[reason - POLL_IN]); si.si_fd = fd; if (!do_send_sig_info(signum, &si, p, type)) break; } fallthrough; /* fall back on the old plain SIGIO signal */ case 0: do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, type); } } void send_sigio(struct fown_struct *fown, int fd, int band) { struct task_struct *p; enum pid_type type; unsigned long flags; struct pid *pid; read_lock_irqsave(&fown->lock, flags); type = fown->pid_type; pid = fown->pid; if (!pid) goto out_unlock_fown; if (type <= PIDTYPE_TGID) { rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (p) send_sigio_to_task(p, fown, fd, band, type); rcu_read_unlock(); } else { read_lock(&tasklist_lock); do_each_pid_task(pid, type, p) { send_sigio_to_task(p, fown, fd, band, type); } while_each_pid_task(pid, type, p); read_unlock(&tasklist_lock); } out_unlock_fown: read_unlock_irqrestore(&fown->lock, flags); } static void send_sigurg_to_task(struct task_struct *p, struct fown_struct *fown, enum pid_type type) { if (sigio_perm(p, fown, SIGURG)) do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type); } int send_sigurg(struct file *file) { struct fown_struct *fown; struct task_struct *p; enum pid_type type; struct pid *pid; unsigned long flags; int ret = 0; fown = file_f_owner(file); if (!fown) return 0; read_lock_irqsave(&fown->lock, flags); type = fown->pid_type; pid = fown->pid; if (!pid) goto out_unlock_fown; ret = 1; if (type <= PIDTYPE_TGID) { rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (p) send_sigurg_to_task(p, fown, type); rcu_read_unlock(); } else { read_lock(&tasklist_lock); do_each_pid_task(pid, type, p) { send_sigurg_to_task(p, fown, type); } while_each_pid_task(pid, type, p); read_unlock(&tasklist_lock); } out_unlock_fown: read_unlock_irqrestore(&fown->lock, flags); return ret; } static DEFINE_SPINLOCK(fasync_lock); static struct kmem_cache *fasync_cache __ro_after_init; /* * Remove a fasync 
entry. If successfully removed, return * positive and clear the FASYNC flag. If no entry exists, * do nothing and return 0. * * NOTE! It is very important that the FASYNC flag always * match the state "is the filp on a fasync list". * */ int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) { struct fasync_struct *fa, **fp; int result = 0; spin_lock(&filp->f_lock); spin_lock(&fasync_lock); for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { if (fa->fa_file != filp) continue; write_lock_irq(&fa->fa_lock); fa->fa_file = NULL; write_unlock_irq(&fa->fa_lock); *fp = fa->fa_next; kfree_rcu(fa, fa_rcu); filp->f_flags &= ~FASYNC; result = 1; break; } spin_unlock(&fasync_lock); spin_unlock(&filp->f_lock); return result; } struct fasync_struct *fasync_alloc(void) { return kmem_cache_alloc(fasync_cache, GFP_KERNEL); } /* * NOTE! This can be used only for unused fasync entries: * entries that actually got inserted on the fasync list * need to be released by rcu - see fasync_remove_entry. */ void fasync_free(struct fasync_struct *new) { kmem_cache_free(fasync_cache, new); } /* * Insert a new entry into the fasync list. Return the pointer to the * old one if we didn't use the new one. * * NOTE! It is very important that the FASYNC flag always * match the state "is the filp on a fasync list". */ struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasync_struct **fapp, struct fasync_struct *new) { struct fasync_struct *fa, **fp; spin_lock(&filp->f_lock); spin_lock(&fasync_lock); for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { if (fa->fa_file != filp) continue; write_lock_irq(&fa->fa_lock); fa->fa_fd = fd; write_unlock_irq(&fa->fa_lock); goto out; } rwlock_init(&new->fa_lock); new->magic = FASYNC_MAGIC; new->fa_file = filp; new->fa_fd = fd; new->fa_next = *fapp; rcu_assign_pointer(*fapp, new); filp->f_flags |= FASYNC; out: spin_unlock(&fasync_lock); spin_unlock(&filp->f_lock); return fa; } /* * Add a fasync entry. Return negative on error, positive if * added, and zero if did nothing but change an existing one. */ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp) { struct fasync_struct *new; new = fasync_alloc(); if (!new) return -ENOMEM; /* * fasync_insert_entry() returns the old (update) entry if * it existed. * * So free the (unused) new entry and return 0 to let the * caller know that we didn't add any new fasync entries. */ if (fasync_insert_entry(fd, filp, fapp, new)) { fasync_free(new); return 0; } return 1; } /* * fasync_helper() is used by almost all character device drivers * to set up the fasync queue, and for regular files by the file * lease code. It returns negative on error, 0 if it did no changes * and positive if it added/deleted the entry. */ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp) { if (!on) return fasync_remove_entry(filp, fapp); return fasync_add_entry(fd, filp, fapp); } EXPORT_SYMBOL(fasync_helper); /* * rcu_read_lock() is held */ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) { while (fa) { struct fown_struct *fown; unsigned long flags; if (fa->magic != FASYNC_MAGIC) { printk(KERN_ERR "kill_fasync: bad magic number in " "fasync_struct!\n"); return; } read_lock_irqsave(&fa->fa_lock, flags); if (fa->fa_file) { fown = file_f_owner(fa->fa_file); if (!fown) goto next; /* Don't send SIGURG to processes which have not set a queued signum: SIGURG has its own default signalling mechanism. 
*/ if (!(sig == SIGURG && fown->signum == 0)) send_sigio(fown, fa->fa_fd, band); } next: read_unlock_irqrestore(&fa->fa_lock, flags); fa = rcu_dereference(fa->fa_next); } } void kill_fasync(struct fasync_struct **fp, int sig, int band) { /* First a quick test without locking: usually * the list is empty. */ if (*fp) { rcu_read_lock(); kill_fasync_rcu(rcu_dereference(*fp), sig, band); rcu_read_unlock(); } } EXPORT_SYMBOL(kill_fasync); static int __init fcntl_init(void) { /* * Please add new bits here to ensure allocation uniqueness. * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY * is defined as O_NONBLOCK on some platforms and not on others. */ BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) | __FMODE_EXEC)); fasync_cache = kmem_cache_create("fasync_cache", sizeof(struct fasync_struct), 0, SLAB_PANIC | SLAB_ACCOUNT, NULL); return 0; } module_init(fcntl_init) |
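/*
 * Illustrative userspace sketch, not part of this file: acquiring an
 * open-file-description (OFD) lock, which is dispatched through the
 * F_OFD_SETLK case of do_fcntl() above (and through fcntl64() on 32-bit
 * architectures). The helper name is hypothetical.
 */
#define _GNU_SOURCE		/* for F_OFD_SETLK in glibc */
#include <fcntl.h>
#include <string.h>

static int example_lock_whole_file(int fd)
{
	struct flock fl;

	memset(&fl, 0, sizeof(fl));
	fl.l_type = F_WRLCK;		/* exclusive lock */
	fl.l_whence = SEEK_SET;
	fl.l_start = 0;
	fl.l_len = 0;			/* 0 extends to end of file */
	fl.l_pid = 0;			/* must be 0 for OFD locks */

	/* non-blocking; F_OFD_SETLKW would wait for a conflicting lock */
	return fcntl(fd, F_OFD_SETLK, &fl);
}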
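/*
 * Illustrative userspace sketch, not part of this file: exercising the
 * F_SETOWN and F_SETSIG paths of do_fcntl() above together with O_ASYNC,
 * so that send_sigio_to_task() queues a realtime signal carrying si_fd
 * and si_band. The signal number, handler and helper names are arbitrary
 * examples.
 */
#define _GNU_SOURCE		/* for F_SETSIG and O_ASYNC in glibc */
#include <fcntl.h>
#include <signal.h>
#include <unistd.h>

static void example_on_io(int sig, siginfo_t *si, void *ctx)
{
	/* si->si_fd names the descriptor, si->si_band the poll events */
	(void)sig; (void)si; (void)ctx;
}

static int example_arm_async_io(int fd)
{
	struct sigaction sa = {
		.sa_sigaction	= example_on_io,
		.sa_flags	= SA_SIGINFO,
	};

	if (sigaction(SIGRTMIN, &sa, NULL))
		return -1;
	if (fcntl(fd, F_SETOWN, getpid()))	/* deliver to this process */
		return -1;
	if (fcntl(fd, F_SETSIG, SIGRTMIN))	/* as a queued rt signal */
		return -1;
	/* enable asynchronous notification on the descriptor */
	return fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC);
}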
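/*
 * Illustrative kernel-side sketch, not part of this file: the usual way a
 * character driver consumes fasync_helper() and kill_fasync() above. All
 * "example_" identifiers are hypothetical; <linux/fs.h> is assumed to be
 * included.
 */
static struct fasync_struct *example_async_queue;

/* wired up as the driver's file_operations::fasync method */
static int example_fasync(int fd, struct file *filp, int on)
{
	return fasync_helper(fd, filp, on, &example_async_queue);
}

/* called when new data arrives, e.g. from an interrupt handler */
static void example_data_ready(void)
{
	kill_fasync(&example_async_queue, SIGIO, POLL_IN);
}

static const struct file_operations example_fops = {
	.fasync	= example_fasync,
};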
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	IPv6 input
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *	Ian P. Morris		<I.P.Morris@soton.ac.uk>
 *
 *	Based in linux/net/ipv4/ip_input.c
 */
/* Changes
 *
 *	Mitsuru KANDA @USAGI and
 *	YOSHIFUJI Hideaki @USAGI: Remove ipv6_parse_exthdrs().
*/ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/icmpv6.h> #include <linux/mroute6.h> #include <linux/slab.h> #include <linux/indirect_call_wrapper.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <net/sock.h> #include <net/snmp.h> #include <net/udp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/transp_v6.h> #include <net/rawv6.h> #include <net/ndisc.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/xfrm.h> #include <net/inet_ecn.h> #include <net/dst_metadata.h> #include <net/inet6_hashtables.h> static void tcp_v6_early_demux(struct sk_buff *skb) { struct net *net = dev_net_rcu(skb->dev); const struct ipv6hdr *hdr; const struct tcphdr *th; struct sock *sk; if (skb->pkt_type != PACKET_HOST) return; if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) return; hdr = ipv6_hdr(skb); th = tcp_hdr(skb); if (th->doff < sizeof(struct tcphdr) / 4) return; /* Note : We use inet6_iif() here, not tcp_v6_iif() */ sk = __inet6_lookup_established(net, &hdr->saddr, th->source, &hdr->daddr, ntohs(th->dest), inet6_iif(skb), inet6_sdif(skb)); if (sk) { skb->sk = sk; skb->destructor = sock_edemux; if (sk_fullsock(sk)) { struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); if (dst) dst = dst_check(dst, sk->sk_rx_dst_cookie); if (dst && sk->sk_rx_dst_ifindex == skb->skb_iif) skb_dst_set_noref(skb, dst); } } } static void ip6_rcv_finish_core(struct net *net, struct sock *sk, struct sk_buff *skb) { if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) && !skb_dst(skb) && !skb->sk) { switch (ipv6_hdr(skb)->nexthdr) { case IPPROTO_TCP: if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) tcp_v6_early_demux(skb); break; case IPPROTO_UDP: if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) udp_v6_early_demux(skb); break; } } if (!skb_valid_dst(skb)) ip6_route_input(skb); } int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip6_rcv(skb); if (!skb) return NET_RX_SUCCESS; ip6_rcv_finish_core(net, sk, skb); return dst_input(skb); } static void ip6_sublist_rcv_finish(struct list_head *head) { struct sk_buff *skb, *next; list_for_each_entry_safe(skb, next, head, list) { skb_list_del_init(skb); dst_input(skb); } } static bool ip6_can_use_hint(const struct sk_buff *skb, const struct sk_buff *hint) { return hint && !skb_dst(skb) && ipv6_addr_equal(&ipv6_hdr(hint)->daddr, &ipv6_hdr(skb)->daddr); } static struct sk_buff *ip6_extract_route_hint(const struct net *net, struct sk_buff *skb) { if (fib6_routes_require_src(net) || fib6_has_custom_rules(net) || IP6CB(skb)->flags & IP6SKB_MULTIPATH) return NULL; return skb; } static void ip6_list_rcv_finish(struct net *net, struct sock *sk, struct list_head *head) { struct sk_buff *skb, *next, *hint = NULL; struct dst_entry *curr_dst = NULL; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { struct dst_entry *dst; skb_list_del_init(skb); /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip6_rcv(skb); if (!skb) continue; if (ip6_can_use_hint(skb, hint)) skb_dst_copy(skb, hint); else ip6_rcv_finish_core(net, sk, skb); dst = skb_dst(skb); if (curr_dst != dst) { hint = ip6_extract_route_hint(net, skb); /* dispatch old sublist */ if 
(!list_empty(&sublist)) ip6_sublist_rcv_finish(&sublist); /* start new sublist */ INIT_LIST_HEAD(&sublist); curr_dst = dst; } list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ ip6_sublist_rcv_finish(&sublist); } static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, struct net *net) { enum skb_drop_reason reason; const struct ipv6hdr *hdr; u32 pkt_len; struct inet6_dev *idev; if (skb->pkt_type == PACKET_OTHERHOST) { dev_core_stats_rx_otherhost_dropped_inc(skb->dev); kfree_skb_reason(skb, SKB_DROP_REASON_OTHERHOST); return NULL; } rcu_read_lock(); idev = __in6_dev_get(skb->dev); __IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_IN, skb->len); SKB_DR_SET(reason, NOT_SPECIFIED); if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL || !idev || unlikely(READ_ONCE(idev->cnf.disable_ipv6))) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); if (idev && unlikely(READ_ONCE(idev->cnf.disable_ipv6))) SKB_DR_SET(reason, IPV6DISABLED); goto drop; } memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); /* * Store incoming device index. When the packet will * be queued, we cannot refer to skb->dev anymore. * * BTW, when we send a packet for our own local address on a * non-loopback interface (e.g. ethX), it is being delivered * via the loopback interface (lo) here; skb->dev = loopback_dev. * It, however, should be considered as if it is being * arrived via the sending interface (ethX), because of the * nature of scoping architecture. --yoshfuji */ IP6CB(skb)->iif = skb_valid_dst(skb) ? ip6_dst_idev(skb_dst(skb))->dev->ifindex : dev->ifindex; if (unlikely(!pskb_may_pull(skb, sizeof(*hdr)))) goto err; hdr = ipv6_hdr(skb); if (hdr->version != 6) { SKB_DR_SET(reason, UNHANDLED_PROTO); goto err; } __IP6_ADD_STATS(net, idev, IPSTATS_MIB_NOECTPKTS + (ipv6_get_dsfield(hdr) & INET_ECN_MASK), max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); /* * RFC4291 2.5.3 * The loopback address must not be used as the source address in IPv6 * packets that are sent outside of a single node. [..] * A packet received on an interface with a destination address * of loopback must be dropped. */ if ((ipv6_addr_loopback(&hdr->saddr) || ipv6_addr_loopback(&hdr->daddr)) && !(dev->flags & IFF_LOOPBACK) && !netif_is_l3_master(dev)) goto err; /* RFC4291 Errata ID: 3480 * Interface-Local scope spans only a single interface on a * node and is useful only for loopback transmission of * multicast. Packets with interface-local scope received * from another node must be discarded. */ if (!(skb->pkt_type == PACKET_LOOPBACK || dev->flags & IFF_LOOPBACK) && ipv6_addr_is_multicast(&hdr->daddr) && IPV6_ADDR_MC_SCOPE(&hdr->daddr) == 1) goto err; /* If enabled, drop unicast packets that were encapsulated in link-layer * multicast or broadcast to protected against the so-called "hole-196" * attack in 802.11 wireless. */ if (!ipv6_addr_is_multicast(&hdr->daddr) && (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) && READ_ONCE(idev->cnf.drop_unicast_in_l2_multicast)) { SKB_DR_SET(reason, UNICAST_IN_L2_MULTICAST); goto err; } /* RFC4291 2.7 * Nodes must not originate a packet to a multicast address whose scope * field contains the reserved value 0; if such a packet is received, it * must be silently dropped. */ if (ipv6_addr_is_multicast(&hdr->daddr) && IPV6_ADDR_MC_SCOPE(&hdr->daddr) == 0) goto err; /* * RFC4291 2.7 * Multicast addresses must not be used as source addresses in IPv6 * packets or appear in any Routing header. 
*/ if (ipv6_addr_is_multicast(&hdr->saddr)) goto err; skb->transport_header = skb->network_header + sizeof(*hdr); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); pkt_len = ipv6_payload_len(skb, hdr); /* pkt_len may be zero if Jumbo payload option is present */ if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { if (pkt_len + sizeof(struct ipv6hdr) > skb->len) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTRUNCATEDPKTS); SKB_DR_SET(reason, PKT_TOO_SMALL); goto drop; } if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) goto err; hdr = ipv6_hdr(skb); } if (hdr->nexthdr == NEXTHDR_HOP) { if (ipv6_parse_hopopts(skb) < 0) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); rcu_read_unlock(); return NULL; } } rcu_read_unlock(); /* Must drop socket now because of tproxy. */ if (!skb_sk_is_prefetched(skb)) skb_orphan(skb); return skb; err: __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); SKB_DR_OR(reason, IP_INHDR); drop: rcu_read_unlock(); kfree_skb_reason(skb, reason); return NULL; } int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct net *net = dev_net(skb->dev); skb = ip6_rcv_core(skb, dev, net); if (skb == NULL) return NET_RX_DROP; return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, net, NULL, skb, dev, NULL, ip6_rcv_finish); } static void ip6_sublist_rcv(struct list_head *head, struct net_device *dev, struct net *net) { NF_HOOK_LIST(NFPROTO_IPV6, NF_INET_PRE_ROUTING, net, NULL, head, dev, NULL, ip6_rcv_finish); ip6_list_rcv_finish(net, NULL, head); } /* Receive a list of IPv6 packets */ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *orig_dev) { struct net_device *curr_dev = NULL; struct net *curr_net = NULL; struct sk_buff *skb, *next; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *dev = skb->dev; struct net *net = dev_net(dev); skb_list_del_init(skb); skb = ip6_rcv_core(skb, dev, net); if (skb == NULL) continue; if (curr_dev != dev || curr_net != net) { /* dispatch old sublist */ if (!list_empty(&sublist)) ip6_sublist_rcv(&sublist, curr_dev, curr_net); /* start new sublist */ INIT_LIST_HEAD(&sublist); curr_dev = dev; curr_net = net; } list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ if (!list_empty(&sublist)) ip6_sublist_rcv(&sublist, curr_dev, curr_net); } INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *)); /* * Deliver the packet to the host */ void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr, bool have_final) { const struct inet6_protocol *ipprot; struct inet6_dev *idev; unsigned int nhoff; SKB_DR(reason); bool raw; /* * Parse extension headers */ resubmit: idev = ip6_dst_idev(skb_dst(skb)); nhoff = IP6CB(skb)->nhoff; if (!have_final) { if (!pskb_pull(skb, skb_transport_offset(skb))) goto discard; nexthdr = skb_network_header(skb)[nhoff]; } resubmit_final: raw = raw6_local_deliver(skb, nexthdr); ipprot = rcu_dereference(inet6_protos[nexthdr]); if (ipprot) { int ret; if (have_final) { if (!(ipprot->flags & INET6_PROTO_FINAL)) { /* Once we've seen a final protocol don't * allow encapsulation on any non-final * ones. This allows foo in UDP encapsulation * to work. 
*/ goto discard; } } else if (ipprot->flags & INET6_PROTO_FINAL) { const struct ipv6hdr *hdr; int sdif = inet6_sdif(skb); struct net_device *dev; /* Only do this once for first final protocol */ have_final = true; skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); hdr = ipv6_hdr(skb); /* skb->dev passed may be master dev for vrfs. */ if (sdif) { dev = dev_get_by_index_rcu(net, sdif); if (!dev) goto discard; } else { dev = skb->dev; } if (ipv6_addr_is_multicast(&hdr->daddr) && !ipv6_chk_mcast_addr(dev, &hdr->daddr, &hdr->saddr) && !ipv6_is_mld(skb, nexthdr, skb_network_header_len(skb))) { SKB_DR_SET(reason, IP_INADDRERRORS); goto discard; } } if (!(ipprot->flags & INET6_PROTO_NOPOLICY)) { if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { SKB_DR_SET(reason, XFRM_POLICY); goto discard; } nf_reset_ct(skb); } ret = INDIRECT_CALL_2(ipprot->handler, tcp_v6_rcv, udpv6_rcv, skb); if (ret > 0) { if (ipprot->flags & INET6_PROTO_FINAL) { /* Not an extension header, most likely UDP * encapsulation. Use return value as nexthdr * protocol not nhoff (which presumably is * not set by handler). */ nexthdr = ret; goto resubmit_final; } else { goto resubmit; } } else if (ret == 0) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDELIVERS); } } else { if (!raw) { if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INUNKNOWNPROTOS); icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_UNK_NEXTHDR, nhoff); SKB_DR_SET(reason, IP_NOPROTO); } else { SKB_DR_SET(reason, XFRM_POLICY); } kfree_skb_reason(skb, reason); } else { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDELIVERS); consume_skb(skb); } } return; discard: __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); kfree_skb_reason(skb, reason); } static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) { __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INDISCARDS); kfree_skb_reason(skb, SKB_DROP_REASON_NOMEM); return 0; } skb_clear_delivery_time(skb); ip6_protocol_deliver_rcu(net, skb, 0, false); return 0; } int ip6_input(struct sk_buff *skb) { int res; rcu_read_lock(); res = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, dev_net_rcu(skb->dev), NULL, skb, skb->dev, NULL, ip6_input_finish); rcu_read_unlock(); return res; } EXPORT_SYMBOL_GPL(ip6_input); int ip6_mc_input(struct sk_buff *skb) { struct net_device *dev = skb->dev; int sdif = inet6_sdif(skb); const struct ipv6hdr *hdr; bool deliver; __IP6_UPD_PO_STATS(skb_dst_dev_net_rcu(skb), __in6_dev_get_safely(dev), IPSTATS_MIB_INMCAST, skb->len); /* skb->dev passed may be master dev for vrfs. */ if (sdif) { dev = dev_get_by_index_rcu(dev_net_rcu(dev), sdif); if (!dev) { kfree_skb(skb); return -ENODEV; } } hdr = ipv6_hdr(skb); deliver = ipv6_chk_mcast_addr(dev, &hdr->daddr, NULL); #ifdef CONFIG_IPV6_MROUTE /* * IPv6 multicast router mode is now supported ;) */ if (atomic_read(&dev_net_rcu(skb->dev)->ipv6.devconf_all->mc_forwarding) && !(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL)) && likely(!(IP6CB(skb)->flags & IP6SKB_FORWARDED))) { /* * Okay, we try to forward - split and duplicate * packets. */ struct sk_buff *skb2; struct inet6_skb_parm *opt = IP6CB(skb); /* Check for MLD */ if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) { /* Check if this is a mld message */ u8 nexthdr = hdr->nexthdr; __be16 frag_off; int offset; /* Check if the value of Router Alert * is for MLD (0x0000). 
*/ if (opt->ra == htons(IPV6_OPT_ROUTERALERT_MLD)) { deliver = false; if (!ipv6_ext_hdr(nexthdr)) { /* BUG */ goto out; } offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off); if (offset < 0) goto out; if (ipv6_is_mld(skb, nexthdr, offset)) deliver = true; goto out; } /* unknown RA - process it normally */ } if (deliver) { skb2 = skb_clone(skb, GFP_ATOMIC); } else { skb2 = skb; skb = NULL; } if (skb2) ip6_mr_input(skb2); } out: #endif if (likely(deliver)) { ip6_input(skb); } else { /* discard */ kfree_skb(skb); } return 0; } |
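/*
 * Illustrative sketch, not part of this file: how an upper-layer protocol
 * hooks into the dispatch performed by ip6_protocol_deliver_rcu() above.
 * The protocol number (253 is reserved for experimentation by RFC 3692)
 * and all "example_" identifiers are hypothetical; <net/protocol.h> is
 * assumed to be included.
 */
#define EXAMPLE_IPPROTO	253

static int example_l4_rcv(struct sk_buff *skb)
{
	/*
	 * Returning 0 tells ip6_protocol_deliver_rcu() the packet was
	 * delivered; a positive return value would be treated as a new
	 * nexthdr to resubmit (used by UDP encapsulation).
	 */
	consume_skb(skb);
	return 0;
}

static const struct inet6_protocol example_l4_protocol = {
	.handler	= example_l4_rcv,
	/* FINAL: not an extension header; NOPOLICY: skip the xfrm check */
	.flags		= INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
};

static int __init example_l4_init(void)
{
	return inet6_add_protocol(&example_l4_protocol, EXAMPLE_IPPROTO);
}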
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/kernfs/dir.c - kernfs directory implementation
 *
 * Copyright (c) 2001-3 Patrick Mochel
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
 */

#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/idr.h>
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/hash.h>
#include <linux/ns_common.h>

#include "kernfs-internal.h"

/*
 * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to
 * call pr_cont() while holding rename_lock. Because sometimes pr_cont()
 * will perform wakeups when releasing console_sem. Holding rename_lock
 * will introduce deadlock if the scheduler reads the kernfs_name in the
 * wakeup path.
*/ static DEFINE_SPINLOCK(kernfs_pr_cont_lock); static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by pr_cont_lock */ #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) static bool __kernfs_active(struct kernfs_node *kn) { return atomic_read(&kn->active) >= 0; } static bool kernfs_active(struct kernfs_node *kn) { lockdep_assert_held(&kernfs_root(kn)->kernfs_rwsem); return __kernfs_active(kn); } static bool kernfs_lockdep(struct kernfs_node *kn) { #ifdef CONFIG_DEBUG_LOCK_ALLOC return kn->flags & KERNFS_LOCKDEP; #else return false; #endif } /* kernfs_node_depth - compute depth from @from to @to */ static size_t kernfs_depth(struct kernfs_node *from, struct kernfs_node *to) { size_t depth = 0; while (rcu_dereference(to->__parent) && to != from) { depth++; to = rcu_dereference(to->__parent); } return depth; } static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a, struct kernfs_node *b) { size_t da, db; struct kernfs_root *ra = kernfs_root(a), *rb = kernfs_root(b); if (ra != rb) return NULL; da = kernfs_depth(ra->kn, a); db = kernfs_depth(rb->kn, b); while (da > db) { a = rcu_dereference(a->__parent); da--; } while (db > da) { b = rcu_dereference(b->__parent); db--; } /* worst case b and a will be the same at root */ while (b != a) { b = rcu_dereference(b->__parent); a = rcu_dereference(a->__parent); } return a; } /** * kernfs_path_from_node_locked - find a pseudo-absolute path to @kn_to, * where kn_from is treated as root of the path. * @kn_from: kernfs node which should be treated as root for the path * @kn_to: kernfs node to which path is needed * @buf: buffer to copy the path into * @buflen: size of @buf * * We need to handle couple of scenarios here: * [1] when @kn_from is an ancestor of @kn_to at some level * kn_from: /n1/n2/n3 * kn_to: /n1/n2/n3/n4/n5 * result: /n4/n5 * * [2] when @kn_from is on a different hierarchy and we need to find common * ancestor between @kn_from and @kn_to. * kn_from: /n1/n2/n3/n4 * kn_to: /n1/n2/n5 * result: /../../n5 * OR * kn_from: /n1/n2/n3/n4/n5 [depth=5] * kn_to: /n1/n2/n3 [depth=3] * result: /../.. * * [3] when @kn_to is %NULL result will be "(null)" * * Return: the length of the constructed path. If the path would have been * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. 
*/ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, struct kernfs_node *kn_from, char *buf, size_t buflen) { struct kernfs_node *kn, *common; const char parent_str[] = "/.."; size_t depth_from, depth_to, len = 0; ssize_t copied; int i, j; if (!kn_to) return strscpy(buf, "(null)", buflen); if (!kn_from) kn_from = kernfs_root(kn_to)->kn; if (kn_from == kn_to) return strscpy(buf, "/", buflen); common = kernfs_common_ancestor(kn_from, kn_to); if (WARN_ON(!common)) return -EINVAL; depth_to = kernfs_depth(common, kn_to); depth_from = kernfs_depth(common, kn_from); buf[0] = '\0'; for (i = 0; i < depth_from; i++) { copied = strscpy(buf + len, parent_str, buflen - len); if (copied < 0) return copied; len += copied; } /* Calculate how many bytes we need for the rest */ for (i = depth_to - 1; i >= 0; i--) { const char *name; for (kn = kn_to, j = 0; j < i; j++) kn = rcu_dereference(kn->__parent); name = rcu_dereference(kn->name); len += scnprintf(buf + len, buflen - len, "/%s", name); } return len; } /** * kernfs_name - obtain the name of a given node * @kn: kernfs_node of interest * @buf: buffer to copy @kn's name into * @buflen: size of @buf * * Copies the name of @kn into @buf of @buflen bytes. The behavior is * similar to strscpy(). * * Fills buffer with "(null)" if @kn is %NULL. * * Return: the resulting length of @buf. If @buf isn't long enough, * it's filled up to @buflen-1 and nul terminated, and returns -E2BIG. * * This function can be called from any context. */ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) { struct kernfs_node *kn_parent; if (!kn) return strscpy(buf, "(null)", buflen); guard(rcu)(); /* * KERNFS_ROOT_INVARIANT_PARENT is ignored here. The name is RCU freed and * the parent is either existing or not. */ kn_parent = rcu_dereference(kn->__parent); return strscpy(buf, kn_parent ? rcu_dereference(kn->name) : "/", buflen); } /** * kernfs_path_from_node - build path of node @to relative to @from. * @from: parent kernfs_node relative to which we need to build the path * @to: kernfs_node of interest * @buf: buffer to copy @to's path into * @buflen: size of @buf * * Builds @to's path relative to @from in @buf. @from and @to must * be on the same kernfs-root. If @from is not parent of @to, then a relative * path (which includes '..'s) as needed to reach from @from to @to is * returned. * * Return: the length of the constructed path. If the path would have been * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. */ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from, char *buf, size_t buflen) { struct kernfs_root *root; guard(rcu)(); if (to) { root = kernfs_root(to); if (!(root->flags & KERNFS_ROOT_INVARIANT_PARENT)) { guard(read_lock_irqsave)(&root->kernfs_rename_lock); return kernfs_path_from_node_locked(to, from, buf, buflen); } } return kernfs_path_from_node_locked(to, from, buf, buflen); } EXPORT_SYMBOL_GPL(kernfs_path_from_node); /** * pr_cont_kernfs_name - pr_cont name of a kernfs_node * @kn: kernfs_node of interest * * This function can be called from any context. 
*/ void pr_cont_kernfs_name(struct kernfs_node *kn) { unsigned long flags; spin_lock_irqsave(&kernfs_pr_cont_lock, flags); kernfs_name(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); pr_cont("%s", kernfs_pr_cont_buf); spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags); } /** * pr_cont_kernfs_path - pr_cont path of a kernfs_node * @kn: kernfs_node of interest * * This function can be called from any context. */ void pr_cont_kernfs_path(struct kernfs_node *kn) { unsigned long flags; int sz; spin_lock_irqsave(&kernfs_pr_cont_lock, flags); sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); if (sz < 0) { if (sz == -E2BIG) pr_cont("(name too long)"); else pr_cont("(error)"); goto out; } pr_cont("%s", kernfs_pr_cont_buf); out: spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags); } /** * kernfs_get_parent - determine the parent node and pin it * @kn: kernfs_node of interest * * Determines @kn's parent, pins and returns it. This function can be * called from any context. * * Return: parent node of @kn */ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) { struct kernfs_node *parent; struct kernfs_root *root; unsigned long flags; root = kernfs_root(kn); read_lock_irqsave(&root->kernfs_rename_lock, flags); parent = kernfs_parent(kn); kernfs_get(parent); read_unlock_irqrestore(&root->kernfs_rename_lock, flags); return parent; } /* * kernfs_ns_id - return the namespace id for a given namespace * @ns: namespace tag (may be NULL) * * Use the 64-bit namespace id instead of raw pointers for hashing * and comparison to avoid leaking kernel addresses to userspace. */ static u64 kernfs_ns_id(const struct ns_common *ns) { return ns ? ns->ns_id : 0; } /** * kernfs_name_hash - calculate hash of @ns + @name * @name: Null terminated string to hash * @ns: Namespace tag to hash * * Return: 31-bit hash of ns + name (so it fits in an off_t) */ static unsigned int kernfs_name_hash(const char *name, const struct ns_common *ns) { unsigned long hash = init_name_hash(kernfs_ns_id(ns)); unsigned int len = strlen(name); while (len--) hash = partial_name_hash(*name++, hash); hash = end_name_hash(hash); hash &= 0x7fffffffU; /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ if (hash < 2) hash += 2; if (hash >= INT_MAX) hash = INT_MAX - 1; return hash; } static int kernfs_name_compare(unsigned int hash, const char *name, const struct ns_common *ns, const struct kernfs_node *kn) { u64 ns_id = kernfs_ns_id(ns); u64 kn_ns_id = kernfs_ns_id(kn->ns); if (hash < kn->hash) return -1; if (hash > kn->hash) return 1; if (ns_id < kn_ns_id) return -1; if (ns_id > kn_ns_id) return 1; return strcmp(name, kernfs_rcu_name(kn)); } static int kernfs_sd_compare(const struct kernfs_node *left, const struct kernfs_node *right) { return kernfs_name_compare(left->hash, kernfs_rcu_name(left), left->ns, right); } /** * kernfs_link_sibling - link kernfs_node into sibling rbtree * @kn: kernfs_node of interest * * Link @kn into its sibling rbtree which starts from * @kn->parent->dir.children. * * Locking: * kernfs_rwsem held exclusive * * Return: * %0 on success, -EEXIST on failure. 
*/ static int kernfs_link_sibling(struct kernfs_node *kn) { struct rb_node *parent = NULL; struct kernfs_node *kn_parent; struct rb_node **node; kn_parent = kernfs_parent(kn); node = &kn_parent->dir.children.rb_node; while (*node) { struct kernfs_node *pos; int result; pos = rb_to_kn(*node); parent = *node; result = kernfs_sd_compare(kn, pos); if (result < 0) node = &pos->rb.rb_left; else if (result > 0) node = &pos->rb.rb_right; else return -EEXIST; } /* add new node and rebalance the tree */ rb_link_node(&kn->rb, parent, node); rb_insert_color(&kn->rb, &kn_parent->dir.children); /* successfully added, account subdir number */ down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (kernfs_type(kn) == KERNFS_DIR) kn_parent->dir.subdirs++; kernfs_inc_rev(kn_parent); up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); return 0; } /** * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree * @kn: kernfs_node of interest * * Try to unlink @kn from its sibling rbtree which starts from * kn->parent->dir.children. * * Return: %true if @kn was actually removed, * %false if @kn wasn't on the rbtree. * * Locking: * kernfs_rwsem held exclusive */ static bool kernfs_unlink_sibling(struct kernfs_node *kn) { struct kernfs_node *kn_parent; if (RB_EMPTY_NODE(&kn->rb)) return false; kn_parent = kernfs_parent(kn); down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (kernfs_type(kn) == KERNFS_DIR) kn_parent->dir.subdirs--; kernfs_inc_rev(kn_parent); up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); rb_erase(&kn->rb, &kn_parent->dir.children); RB_CLEAR_NODE(&kn->rb); return true; } /** * kernfs_get_active - get an active reference to kernfs_node * @kn: kernfs_node to get an active reference to * * Get an active reference of @kn. This function is noop if @kn * is %NULL. * * Return: * Pointer to @kn on success, %NULL on failure. */ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) { if (unlikely(!kn)) return NULL; if (!atomic_inc_unless_negative(&kn->active)) return NULL; if (kernfs_lockdep(kn)) rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); return kn; } /** * kernfs_put_active - put an active reference to kernfs_node * @kn: kernfs_node to put an active reference to * * Put an active reference to @kn. This function is noop if @kn * is %NULL. */ void kernfs_put_active(struct kernfs_node *kn) { int v; if (unlikely(!kn)) return; if (kernfs_lockdep(kn)) rwsem_release(&kn->dep_map, _RET_IP_); v = atomic_dec_return(&kn->active); if (likely(v != KN_DEACTIVATED_BIAS)) return; wake_up_all(&kernfs_root(kn)->deactivate_waitq); } /** * kernfs_drain - drain kernfs_node * @kn: kernfs_node to drain * @drop_supers: Set to true if this function is called with the * kernfs_supers_rwsem locked. * * Drain existing usages and nuke all existing mmaps of @kn. Multiple * removers may invoke this function concurrently on @kn and all will * return after draining is complete. */ static void kernfs_drain(struct kernfs_node *kn, bool drop_supers) __releases(&kernfs_root(kn)->kernfs_rwsem) __acquires(&kernfs_root(kn)->kernfs_rwsem) { struct kernfs_root *root = kernfs_root(kn); lockdep_assert_held_write(&root->kernfs_rwsem); WARN_ON_ONCE(kernfs_active(kn)); /* * Skip draining if already fully drained. This avoids draining and its * lockdep annotations for nodes which have never been activated * allowing embedding kernfs_remove() in create error paths without * worrying about draining. 
*/ if (atomic_read(&kn->active) == KN_DEACTIVATED_BIAS && !kernfs_should_drain_open_files(kn)) return; up_write(&root->kernfs_rwsem); if (drop_supers) up_read(&root->kernfs_supers_rwsem); if (kernfs_lockdep(kn)) { rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS) lock_contended(&kn->dep_map, _RET_IP_); } wait_event(root->deactivate_waitq, atomic_read(&kn->active) == KN_DEACTIVATED_BIAS); if (kernfs_lockdep(kn)) { lock_acquired(&kn->dep_map, _RET_IP_); rwsem_release(&kn->dep_map, _RET_IP_); } if (kernfs_should_drain_open_files(kn)) kernfs_drain_open_files(kn); if (drop_supers) down_read(&root->kernfs_supers_rwsem); down_write(&root->kernfs_rwsem); } /** * kernfs_get - get a reference count on a kernfs_node * @kn: the target kernfs_node */ void kernfs_get(struct kernfs_node *kn) { if (kn) { WARN_ON(!atomic_read(&kn->count)); atomic_inc(&kn->count); } } EXPORT_SYMBOL_GPL(kernfs_get); static void kernfs_free_rcu(struct rcu_head *rcu) { struct kernfs_node *kn = container_of(rcu, struct kernfs_node, rcu); /* If the whole node goes away, then name can't be used outside */ kfree_const(rcu_access_pointer(kn->name)); if (kn->iattr) kmem_cache_free(kernfs_iattrs_cache, kn->iattr); kmem_cache_free(kernfs_node_cache, kn); } /** * kernfs_put - put a reference count on a kernfs_node * @kn: the target kernfs_node * * Put a reference count of @kn and destroy it if it reached zero. */ void kernfs_put(struct kernfs_node *kn) { struct kernfs_node *parent; struct kernfs_root *root; if (!kn || !atomic_dec_and_test(&kn->count)) return; root = kernfs_root(kn); repeat: /* * Moving/renaming is always done while holding reference. * kn->parent won't change beneath us. */ parent = kernfs_parent(kn); WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS, "kernfs_put: %s/%s: released with incorrect active_ref %d\n", parent ? rcu_dereference(parent->name) : "", rcu_dereference(kn->name), atomic_read(&kn->active)); if (kernfs_type(kn) == KERNFS_LINK) kernfs_put(kn->symlink.target_kn); if (kn->iattr && kn->iattr->xattrs) { simple_xattrs_free(kn->iattr->xattrs, NULL); kfree(kn->iattr->xattrs); kn->iattr->xattrs = NULL; } spin_lock(&root->kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); spin_unlock(&root->kernfs_idr_lock); call_rcu(&kn->rcu, kernfs_free_rcu); kn = parent; if (kn) { if (atomic_dec_and_test(&kn->count)) goto repeat; } else { /* just released the root kn, free @root too */ idr_destroy(&root->ino_idr); kfree_rcu(root, rcu); } } EXPORT_SYMBOL_GPL(kernfs_put); /** * kernfs_node_from_dentry - determine kernfs_node associated with a dentry * @dentry: the dentry in question * * Return: the kernfs_node associated with @dentry. If @dentry is not a * kernfs one, %NULL is returned. * * While the returned kernfs_node will stay accessible as long as @dentry * is accessible, the returned node can be in any state and the caller is * fully responsible for determining what's accessible. 
*/ struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) { if (dentry->d_sb->s_op == &kernfs_sops) return kernfs_dentry_node(dentry); return NULL; } static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, unsigned flags) { struct kernfs_node *kn; u32 id_highbits; int ret; name = kstrdup_const(name, GFP_KERNEL); if (!name) return NULL; kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL); if (!kn) goto err_out1; idr_preload(GFP_KERNEL); spin_lock(&root->kernfs_idr_lock); ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); if (ret >= 0 && ret < root->last_id_lowbits) root->id_highbits++; id_highbits = root->id_highbits; root->last_id_lowbits = ret; spin_unlock(&root->kernfs_idr_lock); idr_preload_end(); if (ret < 0) goto err_out2; kn->id = (u64)id_highbits << 32 | ret; atomic_set(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); RB_CLEAR_NODE(&kn->rb); rcu_assign_pointer(kn->name, name); kn->mode = mode; kn->flags = flags; if (!uid_eq(uid, GLOBAL_ROOT_UID) || !gid_eq(gid, GLOBAL_ROOT_GID)) { struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, .ia_uid = uid, .ia_gid = gid, }; ret = __kernfs_setattr(kn, &iattr); if (ret < 0) goto err_out3; } if (parent) { ret = security_kernfs_init_security(parent, kn); if (ret) goto err_out4; } return kn; err_out4: if (kn->iattr) { if (kn->iattr->xattrs) { simple_xattrs_free(kn->iattr->xattrs, NULL); kfree(kn->iattr->xattrs); } kmem_cache_free(kernfs_iattrs_cache, kn->iattr); } err_out3: spin_lock(&root->kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); spin_unlock(&root->kernfs_idr_lock); err_out2: kmem_cache_free(kernfs_node_cache, kn); err_out1: kfree_const(name); return NULL; } struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, unsigned flags) { struct kernfs_node *kn; if (parent->mode & S_ISGID) { /* this code block imitates inode_init_owner() for * kernfs */ if (parent->iattr) gid = parent->iattr->ia_gid; if (flags & KERNFS_DIR) mode |= S_ISGID; } kn = __kernfs_new_node(kernfs_root(parent), parent, name, mode, uid, gid, flags); if (kn) { kernfs_get(parent); rcu_assign_pointer(kn->__parent, parent); } return kn; } /* * kernfs_find_and_get_node_by_id - get kernfs_node from node id * @root: the kernfs root * @id: the target node id * * @id's lower 32bits encode ino and upper gen. If the gen portion is * zero, all generations are matched. * * Return: %NULL on failure, * otherwise a kernfs node with reference counter incremented. */ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, u64 id) { struct kernfs_node *kn; ino_t ino = kernfs_id_ino(id); u32 gen = kernfs_id_gen(id); rcu_read_lock(); kn = idr_find(&root->ino_idr, (u32)ino); if (!kn) goto err_unlock; if (sizeof(ino_t) >= sizeof(u64)) { /* we looked up with the low 32bits, compare the whole */ if (kernfs_ino(kn) != ino) goto err_unlock; } else { /* 0 matches all generations */ if (unlikely(gen && kernfs_gen(kn) != gen)) goto err_unlock; } /* * We should fail if @kn has never been activated and guarantee success * if the caller knows that @kn is active. Both can be achieved by * __kernfs_active() which tests @kn->active without kernfs_rwsem. 
*/ if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count))) goto err_unlock; rcu_read_unlock(); return kn; err_unlock: rcu_read_unlock(); return NULL; } /** * kernfs_add_one - add kernfs_node to parent without warning * @kn: kernfs_node to be added * * The caller must already have initialized @kn->parent. This * function increments nlink of the parent's inode if @kn is a * directory and link into the children list of the parent. * * Return: * %0 on success, -EEXIST if entry with the given name already * exists. */ int kernfs_add_one(struct kernfs_node *kn) { struct kernfs_root *root = kernfs_root(kn); struct kernfs_iattrs *ps_iattr; struct kernfs_node *parent; bool has_ns; int ret; down_write(&root->kernfs_rwsem); parent = kernfs_parent(kn); ret = -EINVAL; has_ns = kernfs_ns_enabled(parent); if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", has_ns ? "required" : "invalid", kernfs_rcu_name(parent), kernfs_rcu_name(kn))) goto out_unlock; if (kernfs_type(parent) != KERNFS_DIR) goto out_unlock; ret = -ENOENT; if (parent->flags & (KERNFS_REMOVING | KERNFS_EMPTY_DIR)) goto out_unlock; kn->hash = kernfs_name_hash(kernfs_rcu_name(kn), kn->ns); ret = kernfs_link_sibling(kn); if (ret) goto out_unlock; /* Update timestamps on the parent */ down_write(&root->kernfs_iattr_rwsem); ps_iattr = parent->iattr; if (ps_iattr) { ktime_get_real_ts64(&ps_iattr->ia_ctime); ps_iattr->ia_mtime = ps_iattr->ia_ctime; } up_write(&root->kernfs_iattr_rwsem); up_write(&root->kernfs_rwsem); /* * Activate the new node unless CREATE_DEACTIVATED is requested. * If not activated here, the kernfs user is responsible for * activating the node with kernfs_activate(). A node which hasn't * been activated is not visible to userland and its removal won't * trigger deactivation. */ if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED)) kernfs_activate(kn); return 0; out_unlock: up_write(&root->kernfs_rwsem); return ret; } /** * kernfs_find_ns - find kernfs_node with the given name * @parent: kernfs_node to search under * @name: name to look for * @ns: the namespace tag to use * * Look for kernfs_node with name @name under @parent. * * Return: pointer to the found kernfs_node on success, %NULL on failure. */ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, const unsigned char *name, const struct ns_common *ns) { struct rb_node *node = parent->dir.children.rb_node; bool has_ns = kernfs_ns_enabled(parent); unsigned int hash; lockdep_assert_held(&kernfs_root(parent)->kernfs_rwsem); if (has_ns != (bool)ns) { WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", has_ns ? 
"required" : "invalid", kernfs_rcu_name(parent), name); return NULL; } hash = kernfs_name_hash(name, ns); while (node) { struct kernfs_node *kn; int result; kn = rb_to_kn(node); result = kernfs_name_compare(hash, name, ns, kn); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return kn; } return NULL; } static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, const unsigned char *path, const struct ns_common *ns) { ssize_t len; char *p, *name; lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem); spin_lock_irq(&kernfs_pr_cont_lock); len = strscpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf)); if (len < 0) { spin_unlock_irq(&kernfs_pr_cont_lock); return NULL; } p = kernfs_pr_cont_buf; while ((name = strsep(&p, "/")) && parent) { if (*name == '\0') continue; parent = kernfs_find_ns(parent, name, ns); } spin_unlock_irq(&kernfs_pr_cont_lock); return parent; } /** * kernfs_find_and_get_ns - find and get kernfs_node with the given name * @parent: kernfs_node to search under * @name: name to look for * @ns: the namespace tag to use * * Look for kernfs_node with name @name under @parent and get a reference * if found. This function may sleep. * * Return: pointer to the found kernfs_node on success, %NULL on failure. */ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const struct ns_common *ns) { struct kernfs_node *kn; struct kernfs_root *root = kernfs_root(parent); down_read(&root->kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); kernfs_get(kn); up_read(&root->kernfs_rwsem); return kn; } EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); /** * kernfs_walk_and_get_ns - find and get kernfs_node with the given path * @parent: kernfs_node to search under * @path: path to look for * @ns: the namespace tag to use * * Look for kernfs_node with path @path under @parent and get a reference * if found. This function may sleep. * * Return: pointer to the found kernfs_node on success, %NULL on failure. */ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, const struct ns_common *ns) { struct kernfs_node *kn; struct kernfs_root *root = kernfs_root(parent); down_read(&root->kernfs_rwsem); kn = kernfs_walk_ns(parent, path, ns); kernfs_get(kn); up_read(&root->kernfs_rwsem); return kn; } unsigned int kernfs_root_flags(struct kernfs_node *kn) { return kernfs_root(kn)->flags; } /** * kernfs_create_root - create a new kernfs hierarchy * @scops: optional syscall operations for the hierarchy * @flags: KERNFS_ROOT_* flags * @priv: opaque data associated with the new directory * * Return: the root of the new hierarchy on success, ERR_PTR() value on * failure. */ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv) { struct kernfs_root *root; struct kernfs_node *kn; root = kzalloc_obj(*root); if (!root) return ERR_PTR(-ENOMEM); idr_init(&root->ino_idr); spin_lock_init(&root->kernfs_idr_lock); init_rwsem(&root->kernfs_rwsem); init_rwsem(&root->kernfs_iattr_rwsem); init_rwsem(&root->kernfs_supers_rwsem); INIT_LIST_HEAD(&root->supers); rwlock_init(&root->kernfs_rename_lock); /* * On 64bit ino setups, id is ino. On 32bit, low 32bits are ino. * High bits generation. The starting value for both ino and * genenration is 1. Initialize upper 32bit allocation * accordingly. 
*/ if (sizeof(ino_t) >= sizeof(u64)) root->id_highbits = 0; else root->id_highbits = 1; kn = __kernfs_new_node(root, NULL, "", S_IFDIR | S_IRUGO | S_IXUGO, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR); if (!kn) { idr_destroy(&root->ino_idr); kfree(root); return ERR_PTR(-ENOMEM); } kn->priv = priv; kn->dir.root = root; root->syscall_ops = scops; root->flags = flags; root->kn = kn; init_waitqueue_head(&root->deactivate_waitq); if (!(root->flags & KERNFS_ROOT_CREATE_DEACTIVATED)) kernfs_activate(kn); return root; } /** * kernfs_destroy_root - destroy a kernfs hierarchy * @root: root of the hierarchy to destroy * * Destroy the hierarchy anchored at @root by removing all existing * directories and destroying @root. */ void kernfs_destroy_root(struct kernfs_root *root) { /* * kernfs_remove holds kernfs_rwsem from the root so the root * shouldn't be freed during the operation. */ kernfs_get(root->kn); kernfs_remove(root->kn); kernfs_put(root->kn); /* will also free @root */ } /** * kernfs_root_to_node - return the kernfs_node associated with a kernfs_root * @root: root to use to lookup * * Return: @root's kernfs_node */ struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root) { return root->kn; } /** * kernfs_create_dir_ns - create a directory * @parent: parent in which to create a new directory * @name: name of the new directory * @mode: mode of the new directory * @uid: uid of the new directory * @gid: gid of the new directory * @priv: opaque data associated with the new directory * @ns: optional namespace tag of the directory * * Return: the created node on success, ERR_PTR() value on failure. */ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, void *priv, const struct ns_common *ns) { struct kernfs_node *kn; int rc; /* allocate */ kn = kernfs_new_node(parent, name, mode | S_IFDIR, uid, gid, KERNFS_DIR); if (!kn) return ERR_PTR(-ENOMEM); kn->dir.root = parent->dir.root; kn->ns = ns; kn->priv = priv; /* link in */ rc = kernfs_add_one(kn); if (!rc) return kn; kernfs_put(kn); return ERR_PTR(rc); } /** * kernfs_create_empty_dir - create an always empty directory * @parent: parent in which to create a new directory * @name: name of the new directory * * Return: the created node on success, ERR_PTR() value on failure. */ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, const char *name) { struct kernfs_node *kn; int rc; /* allocate */ kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR); if (!kn) return ERR_PTR(-ENOMEM); kn->flags |= KERNFS_EMPTY_DIR; kn->dir.root = parent->dir.root; kn->ns = NULL; kn->priv = NULL; /* link in */ rc = kernfs_add_one(kn); if (!rc) return kn; kernfs_put(kn); return ERR_PTR(rc); } static int kernfs_dop_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { struct kernfs_node *kn, *parent; struct kernfs_root *root; if (flags & LOOKUP_RCU) return -ECHILD; /* Negative hashed dentry? */ if (d_really_is_negative(dentry)) { /* If the kernfs parent node has changed discard and * proceed to ->lookup. * * There's nothing special needed here when getting the * dentry parent, even if a concurrent rename is in * progress. That's because the dentry is negative so * it can only be the target of the rename and it will * be doing a d_move() not a replace. Consequently the * dentry d_parent won't change over the d_move(). 
* * Also kernfs negative dentries transitioning from * negative to positive during revalidate won't happen * because they are invalidated on containing directory * changes and the lookup re-done so that a new positive * dentry can be properly created. */ root = kernfs_root_from_sb(dentry->d_sb); down_read(&root->kernfs_rwsem); parent = kernfs_dentry_node(dentry->d_parent); if (parent) { if (kernfs_dir_changed(parent, dentry)) { up_read(&root->kernfs_rwsem); return 0; } } up_read(&root->kernfs_rwsem); /* The kernfs parent node hasn't changed, leave the * dentry negative and return success. */ return 1; } kn = kernfs_dentry_node(dentry); root = kernfs_root(kn); down_read(&root->kernfs_rwsem); /* The kernfs node has been deactivated */ if (!kernfs_active(kn)) goto out_bad; parent = kernfs_parent(kn); /* The kernfs node has been moved? */ if (kernfs_dentry_node(dentry->d_parent) != parent) goto out_bad; /* The kernfs node has been renamed */ if (strcmp(dentry->d_name.name, kernfs_rcu_name(kn)) != 0) goto out_bad; /* The kernfs node has been moved to a different namespace */ if (parent && kernfs_ns_enabled(parent) && kernfs_ns_id(kernfs_info(dentry->d_sb)->ns) != kernfs_ns_id(kn->ns)) goto out_bad; up_read(&root->kernfs_rwsem); return 1; out_bad: up_read(&root->kernfs_rwsem); return 0; } const struct dentry_operations kernfs_dops = { .d_revalidate = kernfs_dop_revalidate, }; static struct dentry *kernfs_iop_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct kernfs_node *parent = dir->i_private; struct kernfs_node *kn; struct kernfs_root *root; struct inode *inode = NULL; const struct ns_common *ns = NULL; root = kernfs_root(parent); down_read(&root->kernfs_rwsem); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dir->i_sb)->ns; kn = kernfs_find_ns(parent, dentry->d_name.name, ns); /* attach dentry and inode */ if (kn) { /* Inactive nodes are invisible to the VFS so don't * create a negative. */ if (!kernfs_active(kn)) { up_read(&root->kernfs_rwsem); return NULL; } inode = kernfs_get_inode(dir->i_sb, kn); if (!inode) inode = ERR_PTR(-ENOMEM); } /* * Needed for negative dentry validation. * The negative dentry can be created in kernfs_iop_lookup() * or transforms from positive dentry in dentry_unlink_inode() * called from vfs_rmdir(). 
*/ if (!IS_ERR(inode)) kernfs_set_rev(parent, dentry); up_read(&root->kernfs_rwsem); /* instantiate and hash (possibly negative) dentry */ return d_splice_alias(inode, dentry); } static struct dentry *kernfs_iop_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct kernfs_node *parent = dir->i_private; struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops; int ret; if (!scops || !scops->mkdir) return ERR_PTR(-EPERM); if (!kernfs_get_active(parent)) return ERR_PTR(-ENODEV); ret = scops->mkdir(parent, dentry->d_name.name, mode); kernfs_put_active(parent); return ERR_PTR(ret); } static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) { struct kernfs_node *kn = kernfs_dentry_node(dentry); struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; int ret; if (!scops || !scops->rmdir) return -EPERM; if (!kernfs_get_active(kn)) return -ENODEV; ret = scops->rmdir(kn); kernfs_put_active(kn); return ret; } static int kernfs_iop_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct kernfs_node *kn = kernfs_dentry_node(old_dentry); struct kernfs_node *new_parent = new_dir->i_private; struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; int ret; if (flags) return -EINVAL; if (!scops || !scops->rename) return -EPERM; if (!kernfs_get_active(kn)) return -ENODEV; if (!kernfs_get_active(new_parent)) { kernfs_put_active(kn); return -ENODEV; } ret = scops->rename(kn, new_parent, new_dentry->d_name.name); kernfs_put_active(new_parent); kernfs_put_active(kn); return ret; } const struct inode_operations kernfs_dir_iops = { .lookup = kernfs_iop_lookup, .permission = kernfs_iop_permission, .setattr = kernfs_iop_setattr, .getattr = kernfs_iop_getattr, .listxattr = kernfs_iop_listxattr, .mkdir = kernfs_iop_mkdir, .rmdir = kernfs_iop_rmdir, .rename = kernfs_iop_rename, }; static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos) { struct kernfs_node *last; while (true) { struct rb_node *rbn; last = pos; if (kernfs_type(pos) != KERNFS_DIR) break; rbn = rb_first(&pos->dir.children); if (!rbn) break; pos = rb_to_kn(rbn); } return last; } /** * kernfs_next_descendant_post - find the next descendant for post-order walk * @pos: the current position (%NULL to initiate traversal) * @root: kernfs_node whose descendants to walk * * Find the next descendant to visit for post-order traversal of @root's * descendants. @root is included in the iteration and the last node to be * visited. * * Return: the next descendant to visit or %NULL when done. 
*/ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, struct kernfs_node *root) { struct rb_node *rbn; lockdep_assert_held_write(&kernfs_root(root)->kernfs_rwsem); /* if first iteration, visit leftmost descendant which may be root */ if (!pos) return kernfs_leftmost_descendant(root); /* if we visited @root, we're done */ if (pos == root) return NULL; /* if there's an unvisited sibling, visit its leftmost descendant */ rbn = rb_next(&pos->rb); if (rbn) return kernfs_leftmost_descendant(rb_to_kn(rbn)); /* no sibling left, visit parent */ return kernfs_parent(pos); } static void kernfs_activate_one(struct kernfs_node *kn) { lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem); kn->flags |= KERNFS_ACTIVATED; if (kernfs_active(kn) || (kn->flags & (KERNFS_HIDDEN | KERNFS_REMOVING))) return; WARN_ON_ONCE(rcu_access_pointer(kn->__parent) && RB_EMPTY_NODE(&kn->rb)); WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); atomic_sub(KN_DEACTIVATED_BIAS, &kn->active); } /** * kernfs_activate - activate a node which started deactivated * @kn: kernfs_node whose subtree is to be activated * * If the root has KERNFS_ROOT_CREATE_DEACTIVATED set, a newly created node * needs to be explicitly activated. A node which hasn't been activated * isn't visible to userland and deactivation is skipped during its * removal. This is useful to construct atomic init sequences where * creation of multiple nodes should either succeed or fail atomically. * * The caller is responsible for ensuring that this function is not called * after kernfs_remove*() is invoked on @kn. */ void kernfs_activate(struct kernfs_node *kn) { struct kernfs_node *pos; struct kernfs_root *root = kernfs_root(kn); down_write(&root->kernfs_rwsem); pos = NULL; while ((pos = kernfs_next_descendant_post(pos, kn))) kernfs_activate_one(pos); up_write(&root->kernfs_rwsem); } /** * kernfs_show - show or hide a node * @kn: kernfs_node to show or hide * @show: whether to show or hide * * If @show is %false, @kn is marked hidden and deactivated. A hidden node is * ignored in future activaitons. If %true, the mark is removed and activation * state is restored. This function won't implicitly activate a new node in a * %KERNFS_ROOT_CREATE_DEACTIVATED root which hasn't been activated yet. * * To avoid recursion complexities, directories aren't supported for now. */ void kernfs_show(struct kernfs_node *kn, bool show) { struct kernfs_root *root = kernfs_root(kn); if (WARN_ON_ONCE(kernfs_type(kn) == KERNFS_DIR)) return; down_write(&root->kernfs_rwsem); if (show) { kn->flags &= ~KERNFS_HIDDEN; if (kn->flags & KERNFS_ACTIVATED) kernfs_activate_one(kn); } else { kn->flags |= KERNFS_HIDDEN; if (kernfs_active(kn)) atomic_add(KN_DEACTIVATED_BIAS, &kn->active); kernfs_drain(kn, false); } up_write(&root->kernfs_rwsem); } /* * This function enables VFS to send fsnotify events for deletions. * There is gap in this implementation for certain file removals due their * unique nature in kernfs. Directory removals that trigger file removals occur * through vfs_rmdir, which shrinks the dcache and emits fsnotify events after * the rmdir operation; there is no issue here. However kernfs writes to * particular files (e.g. cgroup.subtree_control) can also cause file removal, * but vfs_write does not attempt to emit fsnotify events after the write * operation, even if i_nlink counts are 0. As a usecase for monitoring this * category of file removals is not known, they are left without having * IN_DELETE or IN_DELETE_SELF events generated. 
* Fanotify recursive monitoring also does not work for kernfs nodes that do not * have inodes attached, as they are created on-demand in kernfs. */ static void kernfs_clear_inode_nlink(struct kernfs_node *kn) { struct kernfs_root *root = kernfs_root(kn); struct kernfs_super_info *info; lockdep_assert_held_read(&root->kernfs_supers_rwsem); list_for_each_entry(info, &root->supers, node) { struct inode *inode = ilookup(info->sb, kernfs_ino(kn)); if (inode) { clear_nlink(inode); iput(inode); } } } static void __kernfs_remove(struct kernfs_node *kn) { struct kernfs_node *pos, *parent; /* Short-circuit if non-root @kn has already finished removal. */ if (!kn) return; lockdep_assert_held_read(&kernfs_root(kn)->kernfs_supers_rwsem); lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem); /* * This is for kernfs_remove_self() which plays with active ref * after removal. */ if (kernfs_parent(kn) && RB_EMPTY_NODE(&kn->rb)) return; pr_debug("kernfs %s: removing\n", kernfs_rcu_name(kn)); /* prevent new usage by marking all nodes removing and deactivating */ down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); pos = NULL; while ((pos = kernfs_next_descendant_post(pos, kn))) { pos->flags |= KERNFS_REMOVING; if (kernfs_active(pos)) atomic_add(KN_DEACTIVATED_BIAS, &pos->active); } up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); /* deactivate and unlink the subtree node-by-node */ do { pos = kernfs_leftmost_descendant(kn); /* * kernfs_drain() may drop kernfs_rwsem temporarily and @pos's * base ref could have been put by someone else by the time * the function returns. Make sure it doesn't go away * underneath us. */ kernfs_get(pos); kernfs_drain(pos, true); parent = kernfs_parent(pos); /* * kernfs_unlink_sibling() succeeds once per node. Use it * to decide who's responsible for cleanups. */ if (!parent || kernfs_unlink_sibling(pos)) { struct kernfs_iattrs *ps_iattr = parent ? parent->iattr : NULL; down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); kernfs_clear_inode_nlink(pos); /* update timestamps on the parent */ if (ps_iattr) { ktime_get_real_ts64(&ps_iattr->ia_ctime); ps_iattr->ia_mtime = ps_iattr->ia_ctime; } up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); kernfs_put(pos); } kernfs_put(pos); } while (pos != kn); } /** * kernfs_remove - remove a kernfs_node recursively * @kn: the kernfs_node to remove * * Remove @kn along with all its subdirectories and files. */ void kernfs_remove(struct kernfs_node *kn) { struct kernfs_root *root; if (!kn) return; root = kernfs_root(kn); down_read(&root->kernfs_supers_rwsem); down_write(&root->kernfs_rwsem); __kernfs_remove(kn); up_write(&root->kernfs_rwsem); up_read(&root->kernfs_supers_rwsem); } /** * kernfs_break_active_protection - break out of active protection * @kn: the self kernfs_node * * The caller must be running off of a kernfs operation which is invoked * with an active reference - e.g. one of kernfs_ops. Each invocation of * this function must also be matched with an invocation of * kernfs_unbreak_active_protection(). * * This function releases the active reference of @kn the caller is * holding. Once this function is called, @kn may be removed at any point * and the caller is solely responsible for ensuring that the objects it * dereferences are accessible. */ void kernfs_break_active_protection(struct kernfs_node *kn) { /* * Take out ourself out of the active ref dependency chain. If * we're called without an active ref, lockdep will complain. 
*/ kernfs_put_active(kn); } /** * kernfs_unbreak_active_protection - undo kernfs_break_active_protection() * @kn: the self kernfs_node * * If kernfs_break_active_protection() was called, this function must be * invoked before finishing the kernfs operation. Note that while this * function restores the active reference, it doesn't and can't actually * restore the active protection - @kn may already or be in the process of * being drained and removed. Once kernfs_break_active_protection() is * invoked, that protection is irreversibly gone for the kernfs operation * instance. * * While this function may be called at any point after * kernfs_break_active_protection() is invoked, its most useful location * would be right before the enclosing kernfs operation returns. */ void kernfs_unbreak_active_protection(struct kernfs_node *kn) { /* * @kn->active could be in any state; however, the increment we do * here will be undone as soon as the enclosing kernfs operation * finishes and this temporary bump can't break anything. If @kn * is alive, nothing changes. If @kn is being deactivated, the * soon-to-follow put will either finish deactivation or restore * deactivated state. If @kn is already removed, the temporary * bump is guaranteed to be gone before @kn is released. */ atomic_inc(&kn->active); if (kernfs_lockdep(kn)) rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_); } /** * kernfs_remove_self - remove a kernfs_node from its own method * @kn: the self kernfs_node to remove * * The caller must be running off of a kernfs operation which is invoked * with an active reference - e.g. one of kernfs_ops. This can be used to * implement a file operation which deletes itself. * * For example, the "delete" file for a sysfs device directory can be * implemented by invoking kernfs_remove_self() on the "delete" file * itself. This function breaks the circular dependency of trying to * deactivate self while holding an active ref itself. It isn't necessary * to modify the usual removal path to use kernfs_remove_self(). The * "delete" implementation can simply invoke kernfs_remove_self() on self * before proceeding with the usual removal path. kernfs will ignore later * kernfs_remove() on self. * * kernfs_remove_self() can be called multiple times concurrently on the * same kernfs_node. Only the first one actually performs removal and * returns %true. All others will wait until the kernfs operation which * won self-removal finishes and return %false. Note that the losers wait * for the completion of not only the winning kernfs_remove_self() but also * the whole kernfs_ops which won the arbitration. This can be used to * guarantee, for example, all concurrent writes to a "delete" file to * finish only after the whole operation is complete. * * Return: %true if @kn is removed by this call, otherwise %false. */ bool kernfs_remove_self(struct kernfs_node *kn) { bool ret; struct kernfs_root *root = kernfs_root(kn); down_read(&root->kernfs_supers_rwsem); down_write(&root->kernfs_rwsem); kernfs_break_active_protection(kn); /* * SUICIDAL is used to arbitrate among competing invocations. Only * the first one will actually perform removal. When the removal * is complete, SUICIDED is set and the active ref is restored * while kernfs_rwsem for held exclusive. The ones which lost * arbitration waits for SUICIDED && drained which can happen only * after the enclosing kernfs operation which executed the winning * instance of kernfs_remove_self() finished. 
*/ if (!(kn->flags & KERNFS_SUICIDAL)) { kn->flags |= KERNFS_SUICIDAL; __kernfs_remove(kn); kn->flags |= KERNFS_SUICIDED; ret = true; } else { wait_queue_head_t *waitq = &kernfs_root(kn)->deactivate_waitq; DEFINE_WAIT(wait); while (true) { prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE); if ((kn->flags & KERNFS_SUICIDED) && atomic_read(&kn->active) == KN_DEACTIVATED_BIAS) break; up_write(&root->kernfs_rwsem); up_read(&root->kernfs_supers_rwsem); schedule(); down_read(&root->kernfs_supers_rwsem); down_write(&root->kernfs_rwsem); } finish_wait(waitq, &wait); WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb)); ret = false; } /* * This must be done while kernfs_rwsem held exclusive; otherwise, * waiting for SUICIDED && deactivated could finish prematurely. */ kernfs_unbreak_active_protection(kn); up_write(&root->kernfs_rwsem); up_read(&root->kernfs_supers_rwsem); return ret; } /** * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it * @parent: parent of the target * @name: name of the kernfs_node to remove * @ns: namespace tag of the kernfs_node to remove * * Look for the kernfs_node with @name and @ns under @parent and remove it. * * Return: %0 on success, -ENOENT if such entry doesn't exist. */ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const struct ns_common *ns) { struct kernfs_node *kn; struct kernfs_root *root; if (!parent) { WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n", name); return -ENOENT; } root = kernfs_root(parent); down_read(&root->kernfs_supers_rwsem); down_write(&root->kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); if (kn) { kernfs_get(kn); __kernfs_remove(kn); kernfs_put(kn); } up_write(&root->kernfs_rwsem); up_read(&root->kernfs_supers_rwsem); if (kn) return 0; else return -ENOENT; } /** * kernfs_rename_ns - move and rename a kernfs_node * @kn: target node * @new_parent: new parent to put @sd under * @new_name: new name * @new_ns: new namespace tag * * Return: %0 on success, -errno on failure. */ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const struct ns_common *new_ns) { struct kernfs_node *old_parent; struct kernfs_root *root; const char *old_name; int error; /* can't move or rename root */ if (!rcu_access_pointer(kn->__parent)) return -EINVAL; root = kernfs_root(kn); down_write(&root->kernfs_rwsem); error = -ENOENT; if (!kernfs_active(kn) || !kernfs_active(new_parent) || (new_parent->flags & KERNFS_EMPTY_DIR)) goto out; old_parent = kernfs_parent(kn); if (root->flags & KERNFS_ROOT_INVARIANT_PARENT) { error = -EINVAL; if (WARN_ON_ONCE(old_parent != new_parent)) goto out; } error = 0; old_name = kernfs_rcu_name(kn); if (!new_name) new_name = old_name; if ((old_parent == new_parent) && (kernfs_ns_id(kn->ns) == kernfs_ns_id(new_ns)) && (strcmp(old_name, new_name) == 0)) goto out; /* nothing to rename */ error = -EEXIST; if (kernfs_find_ns(new_parent, new_name, new_ns)) goto out; /* rename kernfs_node */ if (strcmp(old_name, new_name) != 0) { error = -ENOMEM; new_name = kstrdup_const(new_name, GFP_KERNEL); if (!new_name) goto out; } else { new_name = NULL; } /* * Move to the appropriate place in the appropriate directories rbtree. 
*/ kernfs_unlink_sibling(kn); /* rename_lock protects ->parent accessors */ if (old_parent != new_parent) { kernfs_get(new_parent); write_lock_irq(&root->kernfs_rename_lock); rcu_assign_pointer(kn->__parent, new_parent); kn->ns = new_ns; if (new_name) rcu_assign_pointer(kn->name, new_name); write_unlock_irq(&root->kernfs_rename_lock); kernfs_put(old_parent); } else { /* name assignment is RCU protected, parent is the same */ kn->ns = new_ns; if (new_name) rcu_assign_pointer(kn->name, new_name); } kn->hash = kernfs_name_hash(new_name ?: old_name, kn->ns); kernfs_link_sibling(kn); if (new_name && !is_kernel_rodata((unsigned long)old_name)) kfree_rcu_mightsleep(old_name); error = 0; out: up_write(&root->kernfs_rwsem); return error; } static int kernfs_dir_fop_release(struct inode *inode, struct file *filp) { kernfs_put(filp->private_data); return 0; } static struct kernfs_node *kernfs_dir_pos(const struct ns_common *ns, struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos) { if (pos) { int valid = kernfs_active(pos) && rcu_access_pointer(pos->__parent) == parent && hash == pos->hash; kernfs_put(pos); if (!valid) pos = NULL; } if (!pos && (hash > 1) && (hash < INT_MAX)) { struct rb_node *node = parent->dir.children.rb_node; u64 ns_id = kernfs_ns_id(ns); while (node) { pos = rb_to_kn(node); if (hash < pos->hash) node = node->rb_left; else if (hash > pos->hash) node = node->rb_right; else if (ns_id < kernfs_ns_id(pos->ns)) node = node->rb_left; else if (ns_id > kernfs_ns_id(pos->ns)) node = node->rb_right; else break; } } /* Skip over entries which are dying/dead or in the wrong namespace */ while (pos && (!kernfs_active(pos) || kernfs_ns_id(pos->ns) != kernfs_ns_id(ns))) { struct rb_node *node = rb_next(&pos->rb); if (!node) pos = NULL; else pos = rb_to_kn(node); } return pos; } static struct kernfs_node *kernfs_dir_next_pos(const struct ns_common *ns, struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos) { pos = kernfs_dir_pos(ns, parent, ino, pos); if (pos) { do { struct rb_node *node = rb_next(&pos->rb); if (!node) pos = NULL; else pos = rb_to_kn(node); } while (pos && (!kernfs_active(pos) || kernfs_ns_id(pos->ns) != kernfs_ns_id(ns))); } return pos; } static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; struct kernfs_node *parent = kernfs_dentry_node(dentry); struct kernfs_node *pos = file->private_data; struct kernfs_root *root; const struct ns_common *ns = NULL; if (!dir_emit_dots(file, ctx)) return 0; root = kernfs_root(parent); down_read(&root->kernfs_rwsem); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dentry->d_sb)->ns; for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos); pos; pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) { const char *name = kernfs_rcu_name(pos); unsigned int type = fs_umode_to_dtype(pos->mode); int len = strlen(name); ino_t ino = kernfs_ino(pos); ctx->pos = pos->hash; file->private_data = pos; kernfs_get(pos); if (!dir_emit(ctx, name, len, ino, type)) { up_read(&root->kernfs_rwsem); return 0; } } up_read(&root->kernfs_rwsem); file->private_data = NULL; ctx->pos = INT_MAX; return 0; } const struct file_operations kernfs_dir_fops = { .read = generic_read_dir, .iterate_shared = kernfs_fop_readdir, .release = kernfs_dir_fop_release, .llseek = generic_file_llseek, }; |
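/*
 * Editor's illustration (not part of the original file): a minimal sketch of
 * how a kernfs user might build and tear down a small hierarchy with the
 * directory API above. The names example_init/example_exit, example_root and
 * "widgets" are hypothetical, mounting/exposing the hierarchy is out of
 * scope, and error handling is reduced to the bare minimum.
 */
static struct kernfs_root *example_root;

static int __init example_init(void)
{
	struct kernfs_node *dir;

	/* no syscall ops, no special flags, no private data */
	example_root = kernfs_create_root(NULL, 0, NULL);
	if (IS_ERR(example_root))
		return PTR_ERR(example_root);

	/* a plain 0755 directory directly under the root node */
	dir = kernfs_create_dir_ns(kernfs_root_to_node(example_root),
				   "widgets", 0755, GLOBAL_ROOT_UID,
				   GLOBAL_ROOT_GID, NULL, NULL);
	if (IS_ERR(dir)) {
		kernfs_destroy_root(example_root);
		return PTR_ERR(dir);
	}
	return 0;
}

static void __exit example_exit(void)
{
	/* removes "widgets" and every descendant, then frees the root */
	kernfs_destroy_root(example_root);
}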
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/readahead.c - address_space-level file readahead.
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 09Apr2002	Andrew Morton
 *		Initial version.
*/ /** * DOC: Readahead Overview * * Readahead is used to read content into the page cache before it is * explicitly requested by the application. Readahead only ever * attempts to read folios that are not yet in the page cache. If a * folio is present but not up-to-date, readahead will not try to read * it. In that case a simple ->read_folio() will be requested. * * Readahead is triggered when an application read request (whether a * system call or a page fault) finds that the requested folio is not in * the page cache, or that it is in the page cache and has the * readahead flag set. This flag indicates that the folio was read * as part of a previous readahead request and now that it has been * accessed, it is time for the next readahead. * * Each readahead request is partly synchronous read, and partly async * readahead. This is reflected in the struct file_ra_state which * contains ->size being the total number of pages, and ->async_size * which is the number of pages in the async section. The readahead * flag will be set on the first folio in this async section to trigger * a subsequent readahead. Once a series of sequential reads has been * established, there should be no need for a synchronous component and * all readahead request will be fully asynchronous. * * When either of the triggers causes a readahead, three numbers need * to be determined: the start of the region to read, the size of the * region, and the size of the async tail. * * The start of the region is simply the first page address at or after * the accessed address, which is not currently populated in the page * cache. This is found with a simple search in the page cache. * * The size of the async tail is determined by subtracting the size that * was explicitly requested from the determined request size, unless * this would be less than zero - then zero is used. NOTE THIS * CALCULATION IS WRONG WHEN THE START OF THE REGION IS NOT THE ACCESSED * PAGE. ALSO THIS CALCULATION IS NOT USED CONSISTENTLY. * * The size of the region is normally determined from the size of the * previous readahead which loaded the preceding pages. This may be * discovered from the struct file_ra_state for simple sequential reads, * or from examining the state of the page cache when multiple * sequential reads are interleaved. Specifically: where the readahead * was triggered by the readahead flag, the size of the previous * readahead is assumed to be the number of pages from the triggering * page to the start of the new readahead. In these cases, the size of * the previous readahead is scaled, often doubled, for the new * readahead, though see get_next_ra_size() for details. * * If the size of the previous read cannot be determined, the number of * preceding pages in the page cache is used to estimate the size of * a previous read. This estimate could easily be misled by random * reads being coincidentally adjacent, so it is ignored unless it is * larger than the current request, and it is not scaled up, unless it * is at the start of file. * * In general readahead is accelerated at the start of the file, as * reads from there are often sequential. There are other minor * adjustments to the readahead size in various special cases and these * are best discovered by reading the code. * * The above calculation, based on the previous readahead size, * determines the size of the readahead, to which any requested read * size may be added. 
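 *
 * As a rough worked example (an editor's addition, not from the original
 * text): with the common 128KiB (32 page) per-file readahead limit, an
 * initial sequential read of 4 pages is given an 8 page window by
 * get_init_ra_size(); on the readaheads that follow, get_next_ra_size()
 * doubles that to 16 pages and then caps it at the 32 page maximum.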
* * Readahead requests are sent to the filesystem using the ->readahead() * address space operation, for which mpage_readahead() is a canonical * implementation. ->readahead() should normally initiate reads on all * folios, but may fail to read any or all folios without causing an I/O * error. The page cache reading code will issue a ->read_folio() request * for any folio which ->readahead() did not read, and only an error * from this will be final. * * ->readahead() will generally call readahead_folio() repeatedly to get * each folio from those prepared for readahead. It may fail to read a * folio by: * * * not calling readahead_folio() sufficiently many times, effectively * ignoring some folios, as might be appropriate if the path to * storage is congested. * * * failing to actually submit a read request for a given folio, * possibly due to insufficient resources, or * * * getting an error during subsequent processing of a request. * * In the last two cases, the folio should be unlocked by the filesystem * to indicate that the read attempt has failed. In the first case the * folio will be unlocked by the VFS. * * Those folios not in the final ``async_size`` of the request should be * considered to be important and ->readahead() should not fail them due * to congestion or temporary resource unavailability, but should wait * for necessary resources (e.g. memory or indexing information) to * become available. Folios in the final ``async_size`` may be * considered less urgent and failure to read them is more acceptable. * In this case it is best to use filemap_remove_folio() to remove the * folios from the page cache as is automatically done for folios that * were not fetched with readahead_folio(). This will allow a * subsequent synchronous readahead request to try them again. If they * are left in the page cache, then they will be read individually using * ->read_folio() which may be less efficient. */ #include <linux/blkdev.h> #include <linux/kernel.h> #include <linux/dax.h> #include <linux/gfp.h> #include <linux/export.h> #include <linux/backing-dev.h> #include <linux/task_io_accounting_ops.h> #include <linux/pagemap.h> #include <linux/psi.h> #include <linux/syscalls.h> #include <linux/file.h> #include <linux/mm_inline.h> #include <linux/blk-cgroup.h> #include <linux/fadvise.h> #include <linux/sched/mm.h> #define CREATE_TRACE_POINTS #include <trace/events/readahead.h> #include "internal.h" /* * Initialise a struct file's readahead state. Assumes that the caller has * memset *ra to zero. */ void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) { ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages; ra->prev_pos = -1; } EXPORT_SYMBOL_GPL(file_ra_state_init); static void read_pages(struct readahead_control *rac) { const struct address_space_operations *aops = rac->mapping->a_ops; struct folio *folio; struct blk_plug plug; if (!readahead_count(rac)) return; if (unlikely(rac->_workingset)) psi_memstall_enter(&rac->_pflags); blk_start_plug(&plug); if (aops->readahead) { aops->readahead(rac); /* Clean up the remaining folios. 
*/ while ((folio = readahead_folio(rac)) != NULL) { folio_get(folio); filemap_remove_folio(folio); folio_unlock(folio); folio_put(folio); } } else { while ((folio = readahead_folio(rac)) != NULL) aops->read_folio(rac->file, folio); } blk_finish_plug(&plug); if (unlikely(rac->_workingset)) psi_memstall_leave(&rac->_pflags); rac->_workingset = false; BUG_ON(readahead_count(rac)); } static struct folio *ractl_alloc_folio(struct readahead_control *ractl, gfp_t gfp_mask, unsigned int order) { struct folio *folio; folio = filemap_alloc_folio(gfp_mask, order, NULL); if (folio && ractl->dropbehind) __folio_set_dropbehind(folio); return folio; } /** * page_cache_ra_unbounded - Start unchecked readahead. * @ractl: Readahead control. * @nr_to_read: The number of pages to read. * @lookahead_size: Where to start the next readahead. * * This function is for filesystems to call when they want to start * readahead beyond a file's stated i_size. This is almost certainly * not the function you want to call. Use page_cache_async_readahead() * or page_cache_sync_readahead() instead. * * Context: File is referenced by caller, and ractl->mapping->invalidate_lock * must be held by the caller at least in shared mode. Mutexes may be held by * caller. May sleep, but will not reenter filesystem to reclaim memory. */ void page_cache_ra_unbounded(struct readahead_control *ractl, unsigned long nr_to_read, unsigned long lookahead_size) { struct address_space *mapping = ractl->mapping; unsigned long index = readahead_index(ractl); gfp_t gfp_mask = readahead_gfp_mask(mapping); unsigned long mark = ULONG_MAX, i = 0; unsigned int min_nrpages = mapping_min_folio_nrpages(mapping); /* * Partway through the readahead operation, we will have added * locked pages to the page cache, but will not yet have submitted * them for I/O. Adding another page may need to allocate memory, * which can trigger memory reclaim. Telling the VM we're in * the middle of a filesystem operation will cause it to not * touch file-backed pages, preventing a deadlock. Most (all?) * filesystems already specify __GFP_NOFS in their mapping's * gfp_mask, but let's be explicit here. */ unsigned int nofs = memalloc_nofs_save(); lockdep_assert_held(&mapping->invalidate_lock); trace_page_cache_ra_unbounded(mapping->host, index, nr_to_read, lookahead_size); index = mapping_align_index(mapping, index); /* * As iterator `i` is aligned to min_nrpages, round_up the * difference between nr_to_read and lookahead_size to mark the * index that only has lookahead or "async_region" to set the * readahead flag. */ if (lookahead_size <= nr_to_read) { unsigned long ra_folio_index; ra_folio_index = round_up(readahead_index(ractl) + nr_to_read - lookahead_size, min_nrpages); mark = ra_folio_index - index; } nr_to_read += readahead_index(ractl) - index; ractl->_index = index; /* * Preallocate as many pages as we will need. */ while (i < nr_to_read) { struct folio *folio = xa_load(&mapping->i_pages, index + i); int ret; if (folio && !xa_is_value(folio)) { /* * Page already present? Kick off the current batch * of contiguous pages before continuing with the * next batch. This page may be the one we would * have intended to mark as Readahead, but we don't * have a stable reference to this page, and it's * not worth getting one just for that. 
*/ read_pages(ractl); ractl->_index += min_nrpages; i = ractl->_index + ractl->_nr_pages - index; continue; } folio = ractl_alloc_folio(ractl, gfp_mask, mapping_min_folio_order(mapping)); if (!folio) break; ret = filemap_add_folio(mapping, folio, index + i, gfp_mask); if (ret < 0) { folio_put(folio); if (ret == -ENOMEM) break; read_pages(ractl); ractl->_index += min_nrpages; i = ractl->_index + ractl->_nr_pages - index; continue; } if (i == mark) folio_set_readahead(folio); ractl->_workingset |= folio_test_workingset(folio); ractl->_nr_pages += min_nrpages; i += min_nrpages; } /* * Now start the IO. We ignore I/O errors - if the folio is not * uptodate then the caller will launch read_folio again, and * will then handle the error. */ read_pages(ractl); memalloc_nofs_restore(nofs); } EXPORT_SYMBOL_GPL(page_cache_ra_unbounded); /* * do_page_cache_ra() actually reads a chunk of disk. It allocates * the pages first, then submits them for I/O. This avoids the very bad * behaviour which would occur if page allocations are causing VM writeback. * We really don't want to intermingle reads and writes like that. */ static void do_page_cache_ra(struct readahead_control *ractl, unsigned long nr_to_read, unsigned long lookahead_size) { struct address_space *mapping = ractl->mapping; unsigned long index = readahead_index(ractl); loff_t isize = i_size_read(mapping->host); pgoff_t end_index; /* The last page we want to read */ if (isize == 0) return; end_index = (isize - 1) >> PAGE_SHIFT; if (index > end_index) return; /* Don't read past the page containing the last byte of the file */ if (nr_to_read > end_index - index) nr_to_read = end_index - index + 1; filemap_invalidate_lock_shared(mapping); page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size); filemap_invalidate_unlock_shared(mapping); } /* * Chunk the readahead into 2 megabyte units, so that we don't pin too much * memory at once. */ void force_page_cache_ra(struct readahead_control *ractl, unsigned long nr_to_read) { struct address_space *mapping = ractl->mapping; struct file_ra_state *ra = ractl->ra; struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages; if (unlikely(!mapping->a_ops->read_folio && !mapping->a_ops->readahead)) return; /* * If the request exceeds the readahead window, allow the read to * be up to the optimal hardware IO size */ max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages); nr_to_read = min_t(unsigned long, nr_to_read, max_pages); while (nr_to_read) { unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE; if (this_chunk > nr_to_read) this_chunk = nr_to_read; do_page_cache_ra(ractl, this_chunk, 0); nr_to_read -= this_chunk; } } /* * Set the initial window size, round to next power of 2 and square * for small size, x 4 for medium, and x 2 for large * for 128k (32 page) max ra * 1-2 page = 16k, 3-4 page 32k, 5-8 page = 64k, > 8 page = 128k initial */ static unsigned long get_init_ra_size(unsigned long size, unsigned long max) { unsigned long newsize = roundup_pow_of_two(size); if (newsize <= max / 32) newsize = newsize * 4; else if (newsize <= max / 4) newsize = newsize * 2; else newsize = max; return newsize; } /* * Get the previous window size, ramp it up, and * return it as the new window size. */ static unsigned long get_next_ra_size(struct file_ra_state *ra, unsigned long max) { unsigned long cur = ra->size; if (cur < max / 16) return 4 * cur; if (cur <= max / 2) return 2 * cur; return max; } /* * On-demand readahead design. 
* * The fields in struct file_ra_state represent the most-recently-executed * readahead attempt: * * |<----- async_size ---------| * |------------------- size -------------------->| * |==================#===========================| * ^start ^page marked with PG_readahead * * To overlap application thinking time and disk I/O time, we do * `readahead pipelining': Do not wait until the application consumed all * readahead pages and stalled on the missing page at readahead_index; * Instead, submit an asynchronous readahead I/O as soon as there are * only async_size pages left in the readahead window. Normally async_size * will be equal to size, for maximum pipelining. * * In interleaved sequential reads, concurrent streams on the same fd can * be invalidating each other's readahead state. So we flag the new readahead * page at (start+size-async_size) with PG_readahead, and use it as readahead * indicator. The flag won't be set on already cached pages, to avoid the * readahead-for-nothing fuss, saving pointless page cache lookups. * * prev_pos tracks the last visited byte in the _previous_ read request. * It should be maintained by the caller, and will be used for detecting * small random reads. Note that the readahead algorithm checks loosely * for sequential patterns. Hence interleaved reads might be served as * sequential ones. * * There is a special-case: if the first page which the application tries to * read happens to be the first page of the file, it is assumed that a linear * read is about to happen and the window is immediately set to the initial size * based on I/O request size and the max_readahead. * * The code ramps up the readahead size aggressively at first, but slow down as * it approaches max_readahead. */ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, pgoff_t mark, unsigned int order, gfp_t gfp) { int err; struct folio *folio = ractl_alloc_folio(ractl, gfp, order); if (!folio) return -ENOMEM; mark = round_down(mark, 1UL << order); if (index == mark) folio_set_readahead(folio); err = filemap_add_folio(ractl->mapping, folio, index, gfp); if (err) { folio_put(folio); return err; } ractl->_nr_pages += 1UL << order; ractl->_workingset |= folio_test_workingset(folio); return 0; } void page_cache_ra_order(struct readahead_control *ractl, struct file_ra_state *ra) { struct address_space *mapping = ractl->mapping; pgoff_t start = readahead_index(ractl); pgoff_t index = start; unsigned int min_order = mapping_min_folio_order(mapping); pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT; pgoff_t mark = index + ra->size - ra->async_size; unsigned int nofs; int err = 0; gfp_t gfp = readahead_gfp_mask(mapping); unsigned int new_order = ra->order; trace_page_cache_ra_order(mapping->host, start, ra); if (!mapping_large_folio_support(mapping)) { ra->order = 0; goto fallback; } limit = min(limit, index + ra->size - 1); new_order = min(mapping_max_folio_order(mapping), new_order); new_order = min_t(unsigned int, new_order, ilog2(ra->size)); new_order = max(new_order, min_order); ra->order = new_order; /* See comment in page_cache_ra_unbounded() */ nofs = memalloc_nofs_save(); filemap_invalidate_lock_shared(mapping); /* * If the new_order is greater than min_order and index is * already aligned to new_order, then this will be noop as index * aligned to new_order should also be aligned to min_order. 
*/ ractl->_index = mapping_align_index(mapping, index); index = readahead_index(ractl); while (index <= limit) { unsigned int order = new_order; /* Align with smaller pages if needed */ if (index & ((1UL << order) - 1)) order = __ffs(index); /* Don't allocate pages past EOF */ while (order > min_order && index + (1UL << order) - 1 > limit) order--; err = ra_alloc_folio(ractl, index, mark, order, gfp); if (err) break; index += 1UL << order; } read_pages(ractl); filemap_invalidate_unlock_shared(mapping); memalloc_nofs_restore(nofs); /* * If there were already pages in the page cache, then we may have * left some gaps. Let the regular readahead code take care of this * situation below. */ if (!err) return; fallback: /* * ->readahead() may have updated readahead window size so we have to * check there's still something to read. */ if (ra->size > index - start) do_page_cache_ra(ractl, ra->size - (index - start), ra->async_size); } static unsigned long ractl_max_pages(struct readahead_control *ractl, unsigned long req_size) { struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host); unsigned long max_pages = ractl->ra->ra_pages; /* * If the request exceeds the readahead window, allow the read to * be up to the optimal hardware IO size */ if (req_size > max_pages && bdi->io_pages > max_pages) max_pages = min(req_size, bdi->io_pages); return max_pages; } void page_cache_sync_ra(struct readahead_control *ractl, unsigned long req_count) { pgoff_t index = readahead_index(ractl); bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM); struct file_ra_state *ra = ractl->ra; unsigned long max_pages, contig_count; pgoff_t prev_index, miss; trace_page_cache_sync_ra(ractl->mapping->host, index, ra, req_count); /* * Even if readahead is disabled, issue this request as readahead * as we'll need it to satisfy the requested range. The forced * readahead will do the right thing and limit the read to just the * requested range, which we'll set to 1 page for this case. */ if (!ra->ra_pages || blk_cgroup_congested()) { if (!ractl->file) return; req_count = 1; do_forced_ra = true; } /* be dumb */ if (do_forced_ra) { force_page_cache_ra(ractl, req_count); return; } max_pages = ractl_max_pages(ractl, req_count); prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT; /* * A start of file, oversized read, or sequential cache miss: * trivial case: (index - prev_index) == 1 * unaligned reads: (index - prev_index) == 0 */ if (!index || req_count > max_pages || index - prev_index <= 1UL) { ra->start = index; ra->size = get_init_ra_size(req_count, max_pages); ra->async_size = ra->size > req_count ? ra->size - req_count : ra->size >> 1; goto readit; } /* * Query the page cache and look for the traces(cached history pages) * that a sequential stream would leave behind. */ rcu_read_lock(); miss = page_cache_prev_miss(ractl->mapping, index - 1, max_pages); rcu_read_unlock(); contig_count = index - miss - 1; /* * Standalone, small random read. Read as is, and do not pollute the * readahead state. 
*/ if (contig_count <= req_count) { do_page_cache_ra(ractl, req_count, 0); return; } /* * File cached from the beginning: * it is a strong indication of long-run stream (or whole-file-read) */ if (miss == ULONG_MAX) contig_count *= 2; ra->start = index; ra->size = min(contig_count + req_count, max_pages); ra->async_size = 1; readit: ra->order = 0; ractl->_index = ra->start; page_cache_ra_order(ractl, ra); } EXPORT_SYMBOL_GPL(page_cache_sync_ra); void page_cache_async_ra(struct readahead_control *ractl, struct folio *folio, unsigned long req_count) { unsigned long max_pages; struct file_ra_state *ra = ractl->ra; pgoff_t index = readahead_index(ractl); pgoff_t expected, start, end, aligned_end, align; /* no readahead */ if (!ra->ra_pages) return; /* * Same bit is used for PG_readahead and PG_reclaim. */ if (folio_test_writeback(folio)) return; trace_page_cache_async_ra(ractl->mapping->host, index, ra, req_count); folio_clear_readahead(folio); if (blk_cgroup_congested()) return; max_pages = ractl_max_pages(ractl, req_count); /* * It's the expected callback index, assume sequential access. * Ramp up sizes, and push forward the readahead window. */ expected = round_down(ra->start + ra->size - ra->async_size, folio_nr_pages(folio)); if (index == expected) { ra->start += ra->size; /* * In the case of MADV_HUGEPAGE, the actual size might exceed * the readahead window. */ ra->size = max(ra->size, get_next_ra_size(ra, max_pages)); goto readit; } /* * Hit a marked folio without valid readahead state. * E.g. interleaved reads. * Query the pagecache for async_size, which normally equals to * readahead size. Ramp it up and use it as the new readahead size. */ rcu_read_lock(); start = page_cache_next_miss(ractl->mapping, index + 1, max_pages); rcu_read_unlock(); if (!start || start - index > max_pages) return; ra->start = start; ra->size = start - index; /* old async_size */ ra->size += req_count; ra->size = get_next_ra_size(ra, max_pages); readit: ra->order += 2; align = 1UL << min(ra->order, ffs(max_pages) - 1); end = ra->start + ra->size; aligned_end = round_down(end, align); if (aligned_end > ra->start) ra->size -= end - aligned_end; ra->async_size = ra->size; ractl->_index = ra->start; page_cache_ra_order(ractl, ra); } EXPORT_SYMBOL_GPL(page_cache_async_ra); ssize_t ksys_readahead(int fd, loff_t offset, size_t count) { struct file *file; const struct inode *inode; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; file = fd_file(f); if (!(file->f_mode & FMODE_READ)) return -EBADF; /* * The readahead() syscall is intended to run only on files * that can execute readahead. If readahead is not possible * on this file, then we must return -EINVAL. 
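 *
 * Editor's illustration (not part of the original comment): from userspace
 * the syscall is typically used to prefetch a file that is about to be read
 * sequentially, e.g.
 *
 *	int fd = open("data.bin", O_RDONLY);
 *	if (fd >= 0)
 *		readahead(fd, 0, 1 << 20);
 *
 * so that later read()s of the first 1MiB are served from the page cache.
 * The file name and prefetch length are, of course, only placeholders.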
*/ if (!file->f_mapping) return -EINVAL; if (!file->f_mapping->a_ops) return -EINVAL; inode = file_inode(file); if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) return -EINVAL; if (IS_ANON_FILE(inode)) return -EINVAL; return vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED); } SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) { return ksys_readahead(fd, offset, count); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_READAHEAD) COMPAT_SYSCALL_DEFINE4(readahead, int, fd, compat_arg_u64_dual(offset), size_t, count) { return ksys_readahead(fd, compat_arg_u64_glue(offset), count); } #endif /** * readahead_expand - Expand a readahead request * @ractl: The request to be expanded * @new_start: The revised start * @new_len: The revised size of the request * * Attempt to expand a readahead request outwards from the current size to the * specified size by inserting locked pages before and after the current window * to increase the size to the new window. This may involve the insertion of * THPs, in which case the window may get expanded even beyond what was * requested. * * The algorithm will stop if it encounters a conflicting page already in the * pagecache and leave a smaller expansion than requested. * * The caller must check for this by examining the revised @ractl object for a * different expansion than was requested. */ void readahead_expand(struct readahead_control *ractl, loff_t new_start, size_t new_len) { struct address_space *mapping = ractl->mapping; struct file_ra_state *ra = ractl->ra; pgoff_t new_index, new_nr_pages; gfp_t gfp_mask = readahead_gfp_mask(mapping); unsigned long min_nrpages = mapping_min_folio_nrpages(mapping); unsigned int min_order = mapping_min_folio_order(mapping); new_index = new_start / PAGE_SIZE; /* * Readahead code should have aligned the ractl->_index to * min_nrpages before calling readahead aops. */ VM_BUG_ON(!IS_ALIGNED(ractl->_index, min_nrpages)); /* Expand the leading edge downwards */ while (ractl->_index > new_index) { unsigned long index = ractl->_index - 1; struct folio *folio = xa_load(&mapping->i_pages, index); if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ folio = ractl_alloc_folio(ractl, gfp_mask, min_order); if (!folio) return; index = mapping_align_index(mapping, index); if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) { folio_put(folio); return; } if (unlikely(folio_test_workingset(folio)) && !ractl->_workingset) { ractl->_workingset = true; psi_memstall_enter(&ractl->_pflags); } ractl->_nr_pages += min_nrpages; ractl->_index = folio->index; } new_len += new_start - readahead_pos(ractl); new_nr_pages = DIV_ROUND_UP(new_len, PAGE_SIZE); /* Expand the trailing edge upwards */ while (ractl->_nr_pages < new_nr_pages) { unsigned long index = ractl->_index + ractl->_nr_pages; struct folio *folio = xa_load(&mapping->i_pages, index); if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ folio = ractl_alloc_folio(ractl, gfp_mask, min_order); if (!folio) return; index = mapping_align_index(mapping, index); if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) { folio_put(folio); return; } if (unlikely(folio_test_workingset(folio)) && !ractl->_workingset) { ractl->_workingset = true; psi_memstall_enter(&ractl->_pflags); } ractl->_nr_pages += min_nrpages; if (ra) { ra->size += min_nrpages; ra->async_size += min_nrpages; } } } EXPORT_SYMBOL(readahead_expand); |
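/*
 * A minimal userspace sketch (not part of mm/readahead.c) of the
 * readahead(2) behaviour implemented by ksys_readahead() above: the
 * request is forwarded as POSIX_FADV_WILLNEED, and it only succeeds on
 * regular files or block devices whose mapping supports readahead;
 * anything else fails with EINVAL. The file path is hypothetical.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/some-large-file", O_RDONLY);	/* hypothetical path */

	if (fd < 0)
		return 1;

	/* Ask the kernel to start populating the page cache for the first 1 MiB. */
	if (readahead(fd, 0, 1 << 20) < 0)
		perror("readahead");	/* e.g. EINVAL for pipes, sockets or anonymous files */

	close(fd);
	return 0;
}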
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_FIND_H_ #define __LINUX_FIND_H_ #ifndef __LINUX_BITMAP_H #error only <linux/bitmap.h> can be included directly #endif #include <linux/bitops.h> unsigned long _find_next_bit(const unsigned long *addr1, unsigned long nbits, unsigned long start); unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits, unsigned long start); extern unsigned long _find_first_bit(const
unsigned long *addr, unsigned long size); unsigned long __find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n); unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n); unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size, unsigned long n); extern unsigned long _find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size); unsigned long _find_first_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size); unsigned long _find_first_and_and_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size); extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); #ifdef __BIG_ENDIAN unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size); unsigned long _find_next_zero_bit_le(const unsigned long *addr, unsigned long size, unsigned long offset); unsigned long _find_next_bit_le(const unsigned long *addr, unsigned long size, unsigned long offset); #endif unsigned long find_random_bit(const unsigned long *addr, unsigned long size); #ifndef find_next_bit /** * find_next_bit - find the next set bit in a memory region * @addr: The address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_bit(addr, size, offset); } #endif #ifndef find_next_and_bit /** * find_next_and_bit - find the next set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr1 & *addr2 & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_and_bit(addr1, addr2, size, offset); } #endif #ifndef find_next_andnot_bit /** * find_next_andnot_bit - find the next set bit in *addr1 excluding all the bits * in *addr2 * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr1 & ~*addr2 & GENMASK(size - 1, offset); return val ? 
__ffs(val) : size; } return _find_next_andnot_bit(addr1, addr2, size, offset); } #endif #ifndef find_next_or_bit /** * find_next_or_bit - find the next set bit in either memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = (*addr1 | *addr2) & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_or_bit(addr1, addr2, size, offset); } #endif #ifndef find_next_zero_bit /** * find_next_zero_bit - find the next cleared bit in a memory region * @addr: The address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number of the next zero bit * If no bits are zero, returns @size. */ static __always_inline unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr | ~GENMASK(size - 1, offset); return val == ~0UL ? size : ffz(val); } return _find_next_zero_bit(addr, size, offset); } #endif #ifndef find_first_bit /** * find_first_bit - find the first set bit in a memory region * @addr: The address to start the search at * @size: The maximum number of bits to search * * Returns the bit number of the first set bit. * If no bits are set, returns @size. */ static __always_inline unsigned long find_first_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_bit(addr, size); } #endif /** * find_nth_bit - find N'th set bit in a memory region * @addr: The address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * The following is semantically equivalent: * idx = find_nth_bit(addr, size, 0); * idx = find_first_bit(addr, size); * * Returns the bit number of the N'th set bit. * If no such, returns >= @size. */ static __always_inline unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_bit(addr, size, n); } /** * find_nth_and_bit - find N'th set bit in 2 memory regions * @addr1: The 1st address to start the search at * @addr2: The 2nd address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * Returns the bit number of the N'th set bit. * If no such, returns @size. */ static __always_inline unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0); return val ? 
fns(val, n) : size; } return __find_nth_and_bit(addr1, addr2, size, n); } /** * find_nth_and_andnot_bit - find N'th set bit in 2 memory regions, * excluding those set in 3rd region * @addr1: The 1st address to start the search at * @addr2: The 2nd address to start the search at * @addr3: The 3rd address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * Returns the bit number of the N'th set bit. * If no such, returns @size. */ static __always_inline unsigned long find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & (~*addr3) & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_and_andnot_bit(addr1, addr2, addr3, size, n); } #ifndef find_first_and_bit /** * find_first_and_bit - find the first set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_and_bit(addr1, addr2, size); } #endif /** * find_first_andnot_bit - find the first bit set in 1st memory region and unset in 2nd * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * * Returns the bit number for the first set bit * If no bits are set, returns >= @size. */ static __always_inline unsigned long find_first_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr1 & (~*addr2) & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_andnot_bit(addr1, addr2, size); } /** * find_first_and_and_bit - find the first set bit in 3 memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @addr3: The third address to base the search on * @size: The bitmap size in bits * * Returns the bit number for the first set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_first_and_and_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & *addr3 & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_and_and_bit(addr1, addr2, addr3, size); } #ifndef find_first_zero_bit /** * find_first_zero_bit - find the first cleared bit in a memory region * @addr: The address to start the search at * @size: The maximum number of bits to search * * Returns the bit number of the first cleared bit. * If no bits are zero, returns @size. */ static __always_inline unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr | ~GENMASK(size - 1, 0); return val == ~0UL ? 
size : ffz(val); } return _find_first_zero_bit(addr, size); } #endif #ifndef find_last_bit /** * find_last_bit - find the last set bit in a memory region * @addr: The address to start the search at * @size: The number of bits to search * * Returns the bit number of the last set bit, or size. */ static __always_inline unsigned long find_last_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr & GENMASK(size - 1, 0); return val ? __fls(val) : size; } return _find_last_bit(addr, size); } #endif /** * find_next_and_bit_wrap - find the next set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit, or first set bit up to @offset * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_and_bit_wrap(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { unsigned long bit = find_next_and_bit(addr1, addr2, size, offset); if (bit < size || offset == 0) return bit; bit = find_first_and_bit(addr1, addr2, offset); return bit < offset ? bit : size; } /** * find_next_bit_wrap - find the next set bit in a memory region * @addr: The address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit, or first set bit up to @offset * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_bit_wrap(const unsigned long *addr, unsigned long size, unsigned long offset) { unsigned long bit = find_next_bit(addr, size, offset); if (bit < size || offset == 0) return bit; bit = find_first_bit(addr, offset); return bit < offset ? bit : size; } /* * Helper for for_each_set_bit_wrap(). Make sure you're doing right thing * before using it alone. */ static __always_inline unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size, unsigned long start, unsigned long n) { unsigned long bit; /* If not wrapped around */ if (n > start) { /* and have a bit, just return it. */ bit = find_next_bit(bitmap, size, n); if (bit < size) return bit; /* Otherwise, wrap around and ... */ n = 0; } /* Search the other part. */ bit = find_next_bit(bitmap, start, n); return bit < start ? bit : size; } /** * find_next_clump8 - find next 8-bit clump with set bits in a memory region * @clump: location to store copy of found clump * @addr: address to base the search on * @size: bitmap size in number of bits * @offset: bit offset at which to start searching * * Returns the bit offset for the next set clump; the found clump value is * copied to the location pointed by @clump. If no bits are set, returns @size. 
*/ extern unsigned long find_next_clump8(unsigned long *clump, const unsigned long *addr, unsigned long size, unsigned long offset); #define find_first_clump8(clump, bits, size) \ find_next_clump8((clump), (bits), (size), 0) #if defined(__LITTLE_ENDIAN) static __always_inline unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { return find_next_zero_bit(addr, size, offset); } static __always_inline unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { return find_next_bit(addr, size, offset); } static __always_inline unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) { return find_first_zero_bit(addr, size); } #elif defined(__BIG_ENDIAN) #ifndef find_next_zero_bit_le static __always_inline unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val = *(const unsigned long *)addr; if (unlikely(offset >= size)) return size; val = swab(val) | ~GENMASK(size - 1, offset); return val == ~0UL ? size : ffz(val); } return _find_next_zero_bit_le(addr, size, offset); } #endif #ifndef find_first_zero_bit_le static __always_inline unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = swab(*(const unsigned long *)addr) | ~GENMASK(size - 1, 0); return val == ~0UL ? size : ffz(val); } return _find_first_zero_bit_le(addr, size); } #endif #ifndef find_next_bit_le static __always_inline unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val = *(const unsigned long *)addr; if (unlikely(offset >= size)) return size; val = swab(val) & GENMASK(size - 1, offset); return val ? 
__ffs(val) : size; } return _find_next_bit_le(addr, size, offset); } #endif #else #error "Please fix <asm/byteorder.h>" #endif #define for_each_set_bit(bit, addr, size) \ for ((bit) = 0; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) #define for_each_and_bit(bit, addr1, addr2, size) \ for ((bit) = 0; \ (bit) = find_next_and_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) #define for_each_andnot_bit(bit, addr1, addr2, size) \ for ((bit) = 0; \ (bit) = find_next_andnot_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) #define for_each_or_bit(bit, addr1, addr2, size) \ for ((bit) = 0; \ (bit) = find_next_or_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) /* same as for_each_set_bit() but use bit as value to start with */ #define for_each_set_bit_from(bit, addr, size) \ for (; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) #define for_each_clear_bit(bit, addr, size) \ for ((bit) = 0; \ (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); \ (bit)++) /* same as for_each_clear_bit() but use bit as value to start with */ #define for_each_clear_bit_from(bit, addr, size) \ for (; (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); (bit)++) /** * for_each_set_bitrange - iterate over all set bit ranges [b; e) * @b: bit offset of start of current bitrange (first set bit) * @e: bit offset of end of current bitrange (first unset bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_set_bitrange(b, e, addr, size) \ for ((b) = 0; \ (b) = find_next_bit((addr), (size), b), \ (e) = find_next_zero_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_set_bitrange_from - iterate over all set bit ranges [b; e) * @b: bit offset of start of current bitrange (first set bit); must be initialized * @e: bit offset of end of current bitrange (first unset bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_set_bitrange_from(b, e, addr, size) \ for (; \ (b) = find_next_bit((addr), (size), (b)), \ (e) = find_next_zero_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_clear_bitrange - iterate over all unset bit ranges [b; e) * @b: bit offset of start of current bitrange (first unset bit) * @e: bit offset of end of current bitrange (first set bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_clear_bitrange(b, e, addr, size) \ for ((b) = 0; \ (b) = find_next_zero_bit((addr), (size), (b)), \ (e) = find_next_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_clear_bitrange_from - iterate over all unset bit ranges [b; e) * @b: bit offset of start of current bitrange (first set bit); must be initialized * @e: bit offset of end of current bitrange (first unset bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_clear_bitrange_from(b, e, addr, size) \ for (; \ (b) = find_next_zero_bit((addr), (size), (b)), \ (e) = find_next_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_set_bit_wrap - iterate over all set bits starting from @start, and * wrapping around the end of bitmap. 
* @bit: offset for current iteration * @addr: bitmap address to base the search on * @size: bitmap size in number of bits * @start: Starting bit for bitmap traversing, wrapping around the bitmap end */ #define for_each_set_bit_wrap(bit, addr, size, start) \ for ((bit) = find_next_bit_wrap((addr), (size), (start)); \ (bit) < (size); \ (bit) = __for_each_wrap((addr), (size), (start), (bit) + 1)) /** * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits * @start: bit offset to start search and to store the current iteration offset * @clump: location to store copy of current 8-bit clump * @bits: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_set_clump8(start, clump, bits, size) \ for ((start) = find_first_clump8(&(clump), (bits), (size)); \ (start) < (size); \ (start) = find_next_clump8(&(clump), (bits), (size), (start) + 8)) #endif /*__LINUX_FIND_H_ */ |
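/*
 * A short usage sketch (not part of this header): how callers typically
 * drive the find_*_bit() helpers and the for_each_set_bit() iterator
 * declared above. The bitmap name, its width and the pr_info() output
 * are illustrative only; real callers reach these helpers through
 * <linux/bitmap.h>, which is why that header is included here instead.
 */
#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/printk.h>

#define DEMO_NBITS	64	/* hypothetical bitmap width */

static void demo_walk_bits(void)
{
	DECLARE_BITMAP(mask, DEMO_NBITS) = { 0 };
	unsigned long bit;

	__set_bit(3, mask);
	__set_bit(17, mask);

	bit = find_first_bit(mask, DEMO_NBITS);		/* returns 3 */
	bit = find_next_bit(mask, DEMO_NBITS, bit + 1);	/* returns 17 */
	bit = find_next_bit(mask, DEMO_NBITS, bit + 1);	/* returns DEMO_NBITS: none left */

	/* The for_each_set_bit() macro wraps the same helpers. */
	for_each_set_bit(bit, mask, DEMO_NBITS)
		pr_info("bit %lu is set\n", bit);
}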
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NETFILTER_NETDEV_H_ #define _NETFILTER_NETDEV_H_ #include <linux/netfilter.h> #include <linux/netdevice.h> #ifdef CONFIG_NETFILTER_INGRESS static inline bool nf_hook_ingress_active(const struct sk_buff *skb) { #ifdef CONFIG_JUMP_LABEL if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS])) return false; #endif return rcu_access_pointer(skb->dev->nf_hooks_ingress); } /* caller must hold rcu_read_lock */ static inline int nf_hook_ingress(struct sk_buff *skb) { struct nf_hook_entries *e = rcu_dereference(skb->dev->nf_hooks_ingress); struct nf_hook_state state; int ret; /* Must recheck the ingress hook head, in the event it became NULL * after the check in nf_hook_ingress_active evaluated to true. */ if (unlikely(!e)) return 0; nf_hook_state_init(&state, NF_NETDEV_INGRESS, NFPROTO_NETDEV, skb->dev, NULL, NULL, dev_net(skb->dev), NULL); ret = nf_hook_slow(skb, &state, e, 0); if (ret == 0) return -1; return ret; } #else /* CONFIG_NETFILTER_INGRESS */ static inline int nf_hook_ingress_active(struct sk_buff *skb) { return 0; } static inline int nf_hook_ingress(struct sk_buff *skb) { return 0; } #endif /* CONFIG_NETFILTER_INGRESS */ #ifdef CONFIG_NETFILTER_EGRESS static inline bool nf_hook_egress_active(void) { #ifdef CONFIG_JUMP_LABEL if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_EGRESS])) return false; #endif return true; } /** * nf_hook_egress - classify packets before transmission * @skb: packet to be classified * @rc: result code which shall be returned by __dev_queue_xmit() on failure * @dev: netdev whose egress hooks shall be applied to @skb * * Caller must hold rcu_read_lock. * * On ingress, packets are classified first by tc, then by netfilter. * On egress, the order is reversed for symmetry. Conceptually, tc and * netfilter can be thought of as layers, with netfilter layered above tc: * When tc redirects a packet to another interface, netfilter is not applied * because the packet is on the tc layer. * * The nf_skip_egress flag controls whether netfilter is applied on egress. * It is updated by __netif_receive_skb_core() and __dev_queue_xmit() when the * packet passes through tc and netfilter. Because __dev_queue_xmit() may be * called recursively by tunnel drivers such as vxlan, the flag is reverted to * false after sch_handle_egress(). This ensures that netfilter is applied * both on the overlay and underlying network. * * Returns: @skb on success or %NULL if the packet was consumed or filtered.
*/ static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc, struct net_device *dev) { struct nf_hook_entries *e; struct nf_hook_state state; int ret; #ifdef CONFIG_NETFILTER_SKIP_EGRESS if (skb->nf_skip_egress) return skb; #endif e = rcu_dereference_check(dev->nf_hooks_egress, rcu_read_lock_bh_held()); if (!e) return skb; nf_hook_state_init(&state, NF_NETDEV_EGRESS, NFPROTO_NETDEV, NULL, dev, NULL, dev_net(dev), NULL); /* nf assumes rcu_read_lock, not just read_lock_bh */ rcu_read_lock(); ret = nf_hook_slow(skb, &state, e, 0); rcu_read_unlock(); if (ret == 1) { return skb; } else if (ret < 0) { *rc = NET_XMIT_DROP; return NULL; } else { /* ret == 0 */ *rc = NET_XMIT_SUCCESS; return NULL; } } #else /* CONFIG_NETFILTER_EGRESS */ static inline bool nf_hook_egress_active(void) { return false; } static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc, struct net_device *dev) { return skb; } #endif /* CONFIG_NETFILTER_EGRESS */ static inline void nf_skip_egress(struct sk_buff *skb, bool skip) { #ifdef CONFIG_NETFILTER_SKIP_EGRESS skb->nf_skip_egress = skip; #endif } static inline void nf_hook_netdev_init(struct net_device *dev) { #ifdef CONFIG_NETFILTER_INGRESS RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL); #endif #ifdef CONFIG_NETFILTER_EGRESS RCU_INIT_POINTER(dev->nf_hooks_egress, NULL); #endif } #endif /* _NETFILTER_NETDEV_H_ */ |
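/*
 * A hedged sketch (not part of this header) of the calling convention the
 * nf_hook_egress() kerneldoc above describes. The real call site lives in
 * net/core/dev.c; the function name here is made up for illustration, and
 * the caller is assumed to hold rcu_read_lock() as required. The hook
 * either hands the skb back for transmission or consumes it and reports
 * the verdict through @rc.
 */
#include <linux/netdevice.h>
#include <linux/netfilter_netdev.h>

static int demo_xmit_with_egress_hook(struct sk_buff *skb, struct net_device *dev)
{
	int rc = NET_XMIT_SUCCESS;

	if (nf_hook_egress_active()) {
		skb = nf_hook_egress(skb, &rc, dev);
		if (!skb)
			return rc;	/* dropped or consumed by netfilter */
	}

	/* ... hand the skb on to the qdisc / driver here ... */
	return rc;
}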
// SPDX-License-Identifier: GPL-2.0 /* * fs/ext4/extents_status.h * * Written by Yongqiang Yang <xiaoqiangnk@gmail.com> * Modified by * Allison Henderson <achender@linux.vnet.ibm.com> * Zheng Liu <wenqing.lz@taobao.com> * */ #ifndef _EXT4_EXTENTS_STATUS_H #define _EXT4_EXTENTS_STATUS_H /* * Turn on ES_DEBUG__ to get lots of info about extent status operations. */ #ifdef ES_DEBUG__ #define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) #else #define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif /* * With ES_AGGRESSIVE_TEST defined, the result of es caching will be * checked with old map_block's result. */ #define ES_AGGRESSIVE_TEST__ /* * These flags live in the high bits of extent_status.es_pblk */ enum { ES_WRITTEN_B, ES_UNWRITTEN_B, ES_DELAYED_B, ES_HOLE_B, ES_REFERENCED_B, ES_FLAGS }; #define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS) #define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT) /* * Besides EXTENT_STATUS_REFERENCED, all these extent type masks * are exclusive, only one type can be set at a time. */ #define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B) #define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B) #define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B) #define EXTENT_STATUS_HOLE (1 << ES_HOLE_B) #define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B) #define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \ EXTENT_STATUS_UNWRITTEN | \ EXTENT_STATUS_DELAYED | \ EXTENT_STATUS_HOLE)) #define ES_TYPE_VALID(type) ((type) && !((type) & ((type) - 1))) struct ext4_sb_info; struct ext4_extent; struct extent_status { struct rb_node rb_node; ext4_lblk_t es_lblk; /* first logical block extent covers */ ext4_lblk_t es_len; /* length of extent in block */ ext4_fsblk_t es_pblk; /* first physical block */ }; struct ext4_es_tree { struct rb_root root; struct extent_status *cache_es; /* recently accessed extent */ }; struct ext4_es_stats { unsigned long es_stats_shrunk; struct percpu_counter es_stats_cache_hits; struct percpu_counter es_stats_cache_misses; u64 es_stats_scan_time; u64 es_stats_max_scan_time; struct percpu_counter es_stats_all_cnt; struct percpu_counter es_stats_shk_cnt; }; /* * Pending cluster reservations for bigalloc file systems * * A cluster with a pending reservation is a logical cluster shared by at * least one extent in the extents status tree with delayed and unwritten * status and at least one other written or unwritten extent. The * reservation is said to be pending because a cluster reservation would * have to be taken in the event all blocks in the cluster shared with * written or unwritten extents were deleted while the delayed and * unwritten blocks remained.
* * The set of pending cluster reservations is an auxiliary data structure * used with the extents status tree to implement reserved cluster/block * accounting for bigalloc file systems. The set is kept in memory and * records all pending cluster reservations. * * Its primary function is to avoid the need to read extents from the * disk when invalidating pages as a result of a truncate, punch hole, or * collapse range operation. Page invalidation requires a decrease in the * reserved cluster count if it results in the removal of all delayed * and unwritten extents (blocks) from a cluster that is not shared with a * written or unwritten extent, and no decrease otherwise. Determining * whether the cluster is shared can be done by searching for a pending * reservation on it. * * Secondarily, it provides a potentially faster method for determining * whether the reserved cluster count should be increased when a physical * cluster is deallocated as a result of a truncate, punch hole, or * collapse range operation. The necessary information is also present * in the extents status tree, but might be more rapidly accessed in * the pending reservation set in many cases due to smaller size. * * The pending cluster reservation set is implemented as a red-black tree * with the goal of minimizing per page search time overhead. */ struct pending_reservation { struct rb_node rb_node; ext4_lblk_t lclu; }; struct ext4_pending_tree { struct rb_root root; }; extern int __init ext4_init_es(void); extern void ext4_exit_es(void); extern void ext4_es_init_tree(struct ext4_es_tree *tree); extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status, bool delalloc_reserve_used); extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status); extern void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); extern void ext4_es_find_extent_range(struct inode *inode, int (*match_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t *next_lblk, struct extent_status *es, u64 *pseq); extern bool ext4_es_scan_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end); extern bool ext4_es_scan_clu(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk); static inline unsigned int ext4_es_status(struct extent_status *es) { return es->es_pblk >> ES_SHIFT; } static inline unsigned int ext4_es_type(struct extent_status *es) { return (es->es_pblk >> ES_SHIFT) & ES_TYPE_MASK; } static inline int ext4_es_is_written(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0; } static inline int ext4_es_is_unwritten(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0; } static inline int ext4_es_is_delayed(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0; } static inline int ext4_es_is_hole(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; } static inline int ext4_es_is_mapped(struct extent_status *es) { return (ext4_es_is_written(es) || ext4_es_is_unwritten(es)); } static inline void ext4_es_set_referenced(struct extent_status *es) { es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; } static inline void ext4_es_clear_referenced(struct 
extent_status *es) { es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT); } static inline int ext4_es_is_referenced(struct extent_status *es) { return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0; } static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) { return es->es_pblk & ~ES_MASK; } static inline ext4_fsblk_t ext4_es_show_pblock(struct extent_status *es) { ext4_fsblk_t pblock = ext4_es_pblock(es); return pblock == ~ES_MASK ? 0 : pblock; } static inline void ext4_es_store_pblock(struct extent_status *es, ext4_fsblk_t pb) { ext4_fsblk_t block; block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK); es->es_pblk = block; } static inline void ext4_es_store_pblock_status(struct extent_status *es, ext4_fsblk_t pb, unsigned int status) { WARN_ON_ONCE(!ES_TYPE_VALID(status & ES_TYPE_MASK)); es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | (pb & ~ES_MASK); } extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v); extern int __init ext4_init_pending(void); extern void ext4_exit_pending(void); extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); extern void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, bool lclu_allocated, bool end_allocated); extern void ext4_clear_inode_es(struct inode *inode); #endif /* _EXT4_EXTENTS_STATUS_H */ |
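/*
 * A worked example (not part of this header) of the es_pblk packing used
 * above: with a 64-bit ext4_fsblk_t and ES_FLAGS == 5, ES_SHIFT is 59, so
 * the five status bits occupy bits 63..59 while the physical block number
 * keeps bits 58..0. The function name and block number are hypothetical.
 */
static void demo_es_packing(struct extent_status *es)
{
	ext4_es_store_pblock_status(es, 12345, EXTENT_STATUS_WRITTEN);

	/* Both halves can be read back independently of each other. */
	WARN_ON(ext4_es_pblock(es) != 12345);	/* low 59 bits: the physical block */
	WARN_ON(!ext4_es_is_written(es));	/* ES_WRITTEN_B, shifted up to bit 59 */
	WARN_ON(ext4_es_is_delayed(es));	/* the other type bits stay clear */
}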
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/proc/base.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 *
 * proc base directory handling functions
 *
 * 1999, Al Viro. Rewritten. Now it covers the whole per-process part.
 * Instead of using magical inumbers to determine the kind of object
 * we allocate and fill in-core inodes upon lookup. They don't even
 * go into icache. We cache the reference to task_struct upon lookup too.
 * Eventually it should become a filesystem in its own. We don't use the
 * rest of procfs anymore.
 *
 *
 * Changelog:
 * 17-Jan-2005
 * Allan Bezerra
 * Bruna Moreira <bruna.moreira@indt.org.br>
 * Edjard Mota <edjard.mota@indt.org.br>
 * Ilias Biris <ilias.biris@indt.org.br>
 * Mauricio Lin <mauricio.lin@indt.org.br>
 *
 * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
 *
 * A new process specific entry (smaps) included in /proc. It shows the
 * size of rss for each memory area. The maps entry lacks information
 * about physical memory size (rss) for each mapped file, i.e.,
 * rss information for executables and library files.
 * This additional information is useful for any tools that need to know
 * about physical memory consumption for a process specific library.
 *
 * Changelog:
 * 21-Feb-2005
 * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
 * Pud inclusion in the page table walking.
 *
 * ChangeLog:
 * 10-Mar-2005
 * 10LE Instituto Nokia de Tecnologia - INdT:
 * A better way to walk through the page table as suggested by Hugh Dickins.
 *
 * Simo Piiroinen <simo.piiroinen@nokia.com>:
 * Smaps information related to shared, private, clean and dirty pages.
 *
 * Paul Mundt <paul.mundt@nokia.com>:
 * Overall revision about smaps.
 */

#include <linux/uaccess.h>
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/init.h>
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/generic-radix-tree.h>
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/namei.h>
#include <linux/mnt_namespace.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/stacktrace.h>
#include <linux/resource.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/ptrace.h>
#include <linux/printk.h>
#include <linux/cache.h>
#include <linux/cgroup.h>
#include <linux/cpuset.h>
#include <linux/audit.h>
#include <linux/poll.h>
#include <linux/nsproxy.h>
#include <linux/oom.h>
#include <linux/elf.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/fs_parser.h>
#include <linux/fs_struct.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/debug.h>
#include <linux/sched/stat.h>
#include <linux/posix-timers.h>
#include <linux/time_namespace.h>
#include <linux/resctrl.h>
#include <linux/cn_proc.h>
#include <linux/ksm.h>
#include <uapi/linux/lsm.h>
#include <trace/events/oom.h>
#include "internal.h"
#include "fd.h"

#include "../../lib/kstrtox.h"

/* NOTE:
 * Implementing inode permission operations in /proc is almost
 * certainly an error. Permission checks need to happen during
 * each system call not at open time. The reason is that most of
 * what we wish to check for permissions in /proc varies at runtime.
 *
 * The classic example of a problem is opening file descriptors
 * in /proc for a task before it execs a suid executable.
 */

static u8 nlink_tid __ro_after_init;
static u8 nlink_tgid __ro_after_init;

enum proc_mem_force {
	PROC_MEM_FORCE_ALWAYS,
	PROC_MEM_FORCE_PTRACE,
	PROC_MEM_FORCE_NEVER
};

static enum proc_mem_force proc_mem_force_override __ro_after_init =
	IS_ENABLED(CONFIG_PROC_MEM_NO_FORCE) ? PROC_MEM_FORCE_NEVER :
	IS_ENABLED(CONFIG_PROC_MEM_FORCE_PTRACE) ? PROC_MEM_FORCE_PTRACE :
	PROC_MEM_FORCE_ALWAYS;

static const struct constant_table proc_mem_force_table[] __initconst = {
	{ "always", PROC_MEM_FORCE_ALWAYS },
	{ "ptrace", PROC_MEM_FORCE_PTRACE },
	{ "never", PROC_MEM_FORCE_NEVER },
	{ }
};

static int __init early_proc_mem_force_override(char *buf)
{
	if (!buf)
		return -EINVAL;

	/*
	 * lookup_constant() defaults to proc_mem_force_override to preserve
	 * the initial Kconfig choice in case an invalid param gets passed.
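	 *
	 * For example, booting with "proc_mem.force_override=ptrace" selects
	 * PROC_MEM_FORCE_PTRACE regardless of the Kconfig default, while an
	 * unrecognized value leaves the compiled-in choice in place.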
*/ proc_mem_force_override = lookup_constant(proc_mem_force_table, buf, proc_mem_force_override); return 0; } early_param("proc_mem.force_override", early_proc_mem_force_override); struct pid_entry { const char *name; unsigned int len; umode_t mode; const struct inode_operations *iop; const struct file_operations *fop; union proc_op op; }; #define NOD(NAME, MODE, IOP, FOP, OP) { \ .name = (NAME), \ .len = sizeof(NAME) - 1, \ .mode = MODE, \ .iop = IOP, \ .fop = FOP, \ .op = OP, \ } #define DIR(NAME, MODE, iops, fops) \ NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} ) #define LNK(NAME, get_link) \ NOD(NAME, (S_IFLNK|S_IRWXUGO), \ &proc_pid_link_inode_operations, NULL, \ { .proc_get_link = get_link } ) #define REG(NAME, MODE, fops) \ NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {}) #define ONE(NAME, MODE, show) \ NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_single_file_operations, \ { .proc_show = show } ) #define ATTR(LSMID, NAME, MODE) \ NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_pid_attr_operations, \ { .lsmid = LSMID }) /* * Count the number of hardlinks for the pid_entry table, excluding the . * and .. links. */ static unsigned int __init pid_entry_nlink(const struct pid_entry *entries, unsigned int n) { unsigned int i; unsigned int count; count = 2; for (i = 0; i < n; ++i) { if (S_ISDIR(entries[i].mode)) ++count; } return count; } static int get_task_root(struct task_struct *task, struct path *root) { int result = -ENOENT; task_lock(task); if (task->fs) { get_fs_root(task->fs, root); result = 0; } task_unlock(task); return result; } static int proc_cwd_link(struct dentry *dentry, struct path *path) { struct task_struct *task = get_proc_task(d_inode(dentry)); int result = -ENOENT; if (task) { task_lock(task); if (task->fs) { get_fs_pwd(task->fs, path); result = 0; } task_unlock(task); put_task_struct(task); } return result; } static int proc_root_link(struct dentry *dentry, struct path *path) { struct task_struct *task = get_proc_task(d_inode(dentry)); int result = -ENOENT; if (task) { result = get_task_root(task, path); put_task_struct(task); } return result; } /* * If the user used setproctitle(), we just get the string from * user space at arg_start, and limit it to a maximum of one page. */ static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf, size_t count, unsigned long pos, unsigned long arg_start) { char *page; int ret, got; if (pos >= PAGE_SIZE) return 0; page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; ret = 0; got = access_remote_vm(mm, arg_start, page, PAGE_SIZE, FOLL_ANON); if (got > 0) { int len = strnlen(page, got); /* Include the NUL character if it was found */ if (len < got) len++; if (len > pos) { len -= pos; if (len > count) len = count; len -= copy_to_user(buf, page+pos, len); if (!len) len = -EFAULT; ret = len; } } free_page((unsigned long)page); return ret; } static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf, size_t count, loff_t *ppos) { unsigned long arg_start, arg_end, env_start, env_end; unsigned long pos, len; char *page, c; /* Check if process spawned far enough to have cmdline. */ if (!mm->env_end) return 0; spin_lock(&mm->arg_lock); arg_start = mm->arg_start; arg_end = mm->arg_end; env_start = mm->env_start; env_end = mm->env_end; spin_unlock(&mm->arg_lock); if (arg_start >= arg_end) return 0; /* * We allow setproctitle() to overwrite the argument * strings, and overflow past the original end. But * only when it overflows into the environment area. 
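 *
 * In other words: the readable window extends to env_end only when the
 * environment block immediately follows the argument strings; otherwise it
 * is clamped back to [arg_start, arg_end[ just below.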
*/ if (env_start != arg_end || env_end < env_start) env_start = env_end = arg_end; len = env_end - arg_start; /* We're not going to care if "*ppos" has high bits set */ pos = *ppos; if (pos >= len) return 0; if (count > len - pos) count = len - pos; if (!count) return 0; /* * Magical special case: if the argv[] end byte is not * zero, the user has overwritten it with setproctitle(3). * * Possible future enhancement: do this only once when * pos is 0, and set a flag in the 'struct file'. */ if (access_remote_vm(mm, arg_end-1, &c, 1, FOLL_ANON) == 1 && c) return get_mm_proctitle(mm, buf, count, pos, arg_start); /* * For the non-setproctitle() case we limit things strictly * to the [arg_start, arg_end[ range. */ pos += arg_start; if (pos < arg_start || pos >= arg_end) return 0; if (count > arg_end - pos) count = arg_end - pos; page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; len = 0; while (count) { int got; size_t size = min_t(size_t, PAGE_SIZE, count); got = access_remote_vm(mm, pos, page, size, FOLL_ANON); if (got <= 0) break; got -= copy_to_user(buf, page, got); if (unlikely(!got)) { if (!len) len = -EFAULT; break; } pos += got; buf += got; len += got; count -= got; } free_page((unsigned long)page); return len; } static ssize_t get_task_cmdline(struct task_struct *tsk, char __user *buf, size_t count, loff_t *pos) { struct mm_struct *mm; ssize_t ret; mm = get_task_mm(tsk); if (!mm) return 0; ret = get_mm_cmdline(mm, buf, count, pos); mmput(mm); return ret; } static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { struct task_struct *tsk; ssize_t ret; BUG_ON(*pos < 0); tsk = get_proc_task(file_inode(file)); if (!tsk) return -ESRCH; ret = get_task_cmdline(tsk, buf, count, pos); put_task_struct(tsk); if (ret > 0) *pos += ret; return ret; } static const struct file_operations proc_pid_cmdline_ops = { .read = proc_pid_cmdline_read, .llseek = generic_file_llseek, }; #ifdef CONFIG_KALLSYMS /* * Provides a wchan file via kallsyms in a proper one-value-per-file format. * Returns the resolved symbol to user space. */ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long wchan; char symname[KSYM_NAME_LEN]; if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto print0; wchan = get_wchan(task); if (wchan && !lookup_symbol_name(wchan, symname)) { seq_puts(m, symname); return 0; } print0: seq_putc(m, '0'); return 0; } #endif /* CONFIG_KALLSYMS */ static int lock_trace(struct task_struct *task) { int err = down_read_killable(&task->signal->exec_update_lock); if (err) return err; if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) { up_read(&task->signal->exec_update_lock); return -EPERM; } return 0; } static void unlock_trace(struct task_struct *task) { up_read(&task->signal->exec_update_lock); } #ifdef CONFIG_STACKTRACE #define MAX_STACK_TRACE_DEPTH 64 static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long *entries; int err; /* * The ability to racily run the kernel stack unwinder on a running task * and then observe the unwinder output is scary; while it is useful for * debugging kernel issues, it can also allow an attacker to leak kernel * stack contents. * Doing this in a manner that is at least safe from races would require * some work to ensure that the remote task can not be scheduled; and * even then, this would still expose the unwinder as local attack * surface. 
* Therefore, this interface is restricted to root. */ if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) return -EACCES; entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries), GFP_KERNEL); if (!entries) return -ENOMEM; err = lock_trace(task); if (!err) { unsigned int i, nr_entries; nr_entries = stack_trace_save_tsk(task, entries, MAX_STACK_TRACE_DEPTH, 0); for (i = 0; i < nr_entries; i++) { seq_printf(m, "[<0>] %pB\n", (void *)entries[i]); } unlock_trace(task); } kfree(entries); return err; } #endif #ifdef CONFIG_SCHED_INFO /* * Provides /proc/PID/schedstat */ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { if (unlikely(!sched_info_on())) seq_puts(m, "0 0 0\n"); else seq_printf(m, "%llu %llu %lu\n", (unsigned long long)task->se.sum_exec_runtime, (unsigned long long)task->sched_info.run_delay, task->sched_info.pcount); return 0; } #endif #ifdef CONFIG_LATENCYTOP static int lstats_show_proc(struct seq_file *m, void *v) { int i; struct inode *inode = m->private; struct task_struct *task = get_proc_task(inode); if (!task) return -ESRCH; seq_puts(m, "Latency Top version : v0.1\n"); for (i = 0; i < LT_SAVECOUNT; i++) { struct latency_record *lr = &task->latency_record[i]; if (lr->backtrace[0]) { int q; seq_printf(m, "%i %li %li", lr->count, lr->time, lr->max); for (q = 0; q < LT_BACKTRACEDEPTH; q++) { unsigned long bt = lr->backtrace[q]; if (!bt) break; seq_printf(m, " %ps", (void *)bt); } seq_putc(m, '\n'); } } put_task_struct(task); return 0; } static int lstats_open(struct inode *inode, struct file *file) { return single_open(file, lstats_show_proc, inode); } static ssize_t lstats_write(struct file *file, const char __user *buf, size_t count, loff_t *offs) { struct task_struct *task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; clear_tsk_latency_tracing(task); put_task_struct(task); return count; } static const struct file_operations proc_lstats_operations = { .open = lstats_open, .read = seq_read, .write = lstats_write, .llseek = seq_lseek, .release = single_release, }; #endif static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long totalpages = totalram_pages() + total_swap_pages; unsigned long points = 0; long badness; badness = oom_badness(task, totalpages); /* * Special case OOM_SCORE_ADJ_MIN for all others scale the * badness value into [0, 2000] range which we have been * exporting for a long time so userspace might depend on it. 
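 *
 * Worked example: a badness equal to totalpages maps to
 * (1000 + 1000) * 2 / 3 = 1333, and OOM-disabled tasks
 * (badness == LONG_MIN) are reported as 0.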
*/ if (badness != LONG_MIN) points = (1000 + badness * 1000 / (long)totalpages) * 2 / 3; seq_printf(m, "%lu\n", points); return 0; } struct limit_names { const char *name; const char *unit; }; static const struct limit_names lnames[RLIM_NLIMITS] = { [RLIMIT_CPU] = {"Max cpu time", "seconds"}, [RLIMIT_FSIZE] = {"Max file size", "bytes"}, [RLIMIT_DATA] = {"Max data size", "bytes"}, [RLIMIT_STACK] = {"Max stack size", "bytes"}, [RLIMIT_CORE] = {"Max core file size", "bytes"}, [RLIMIT_RSS] = {"Max resident set", "bytes"}, [RLIMIT_NPROC] = {"Max processes", "processes"}, [RLIMIT_NOFILE] = {"Max open files", "files"}, [RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"}, [RLIMIT_AS] = {"Max address space", "bytes"}, [RLIMIT_LOCKS] = {"Max file locks", "locks"}, [RLIMIT_SIGPENDING] = {"Max pending signals", "signals"}, [RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"}, [RLIMIT_NICE] = {"Max nice priority", NULL}, [RLIMIT_RTPRIO] = {"Max realtime priority", NULL}, [RLIMIT_RTTIME] = {"Max realtime timeout", "us"}, }; /* Display limits for a process */ static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned int i; unsigned long flags; struct rlimit rlim[RLIM_NLIMITS]; if (!lock_task_sighand(task, &flags)) return 0; memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS); unlock_task_sighand(task, &flags); /* * print the file header */ seq_puts(m, "Limit " "Soft Limit " "Hard Limit " "Units \n"); for (i = 0; i < RLIM_NLIMITS; i++) { if (rlim[i].rlim_cur == RLIM_INFINITY) seq_printf(m, "%-25s %-20s ", lnames[i].name, "unlimited"); else seq_printf(m, "%-25s %-20lu ", lnames[i].name, rlim[i].rlim_cur); if (rlim[i].rlim_max == RLIM_INFINITY) seq_printf(m, "%-20s ", "unlimited"); else seq_printf(m, "%-20lu ", rlim[i].rlim_max); if (lnames[i].unit) seq_printf(m, "%-10s\n", lnames[i].unit); else seq_putc(m, '\n'); } return 0; } #ifdef CONFIG_HAVE_ARCH_TRACEHOOK static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { struct syscall_info info; u64 *args = &info.data.args[0]; int res; res = lock_trace(task); if (res) return res; if (task_current_syscall(task, &info)) seq_puts(m, "running\n"); else if (info.data.nr < 0) seq_printf(m, "%d 0x%llx 0x%llx\n", info.data.nr, info.sp, info.data.instruction_pointer); else seq_printf(m, "%d 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx\n", info.data.nr, args[0], args[1], args[2], args[3], args[4], args[5], info.sp, info.data.instruction_pointer); unlock_trace(task); return 0; } #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ /************************************************************************/ /* Here the fs part begins */ /************************************************************************/ /* permission checks */ static bool proc_fd_access_allowed(struct inode *inode) { struct task_struct *task; bool allowed = false; /* Allow access to a task's file descriptors if it is us or we * may use ptrace attach to the process and find out that * information. 
*/ task = get_proc_task(inode); if (task) { allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); put_task_struct(task); } return allowed; } int proc_nochmod_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { int error; struct inode *inode = d_inode(dentry); if (attr->ia_valid & ATTR_MODE) return -EPERM; error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; setattr_copy(&nop_mnt_idmap, inode, attr); return 0; } /* * May current process learn task's sched/cmdline info (for hide_pid_min=1) * or euid/egid (for hide_pid_min=2)? */ static bool has_pid_permissions(struct proc_fs_info *fs_info, struct task_struct *task, enum proc_hidepid hide_pid_min) { /* * If 'hidpid' mount option is set force a ptrace check, * we indicate that we are using a filesystem syscall * by passing PTRACE_MODE_READ_FSCREDS */ if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE) return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); if (fs_info->hide_pid < hide_pid_min) return true; if (in_group_p(fs_info->pid_gid)) return true; return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); } static int proc_pid_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct task_struct *task; bool has_perms; task = get_proc_task(inode); if (!task) return -ESRCH; has_perms = has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS); put_task_struct(task); if (!has_perms) { if (fs_info->hide_pid == HIDEPID_INVISIBLE) { /* * Let's make getdents(), stat(), and open() * consistent with each other. If a process * may not stat() a file, it shouldn't be seen * in procfs at all. */ return -ENOENT; } return -EPERM; } return generic_permission(&nop_mnt_idmap, inode, mask); } static const struct inode_operations proc_def_inode_operations = { .setattr = proc_nochmod_setattr, }; static int proc_single_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct pid_namespace *ns = proc_pid_ns(inode->i_sb); struct pid *pid = proc_pid(inode); struct task_struct *task; int ret; task = get_pid_task(pid, PIDTYPE_PID); if (!task) return -ESRCH; ret = PROC_I(inode)->op.proc_show(m, ns, pid, task); put_task_struct(task); return ret; } static int proc_single_open(struct inode *inode, struct file *filp) { return single_open(filp, proc_single_show, inode); } static const struct file_operations proc_single_file_operations = { .open = proc_single_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; /* * proc_mem_open() can return errno, NULL or mm_struct*. * * - Returns NULL if the task has no mm (PF_KTHREAD or PF_EXITING) * - Returns mm_struct* on success * - Returns error code on failure */ struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) { struct task_struct *task = get_proc_task(inode); struct mm_struct *mm; if (!task) return ERR_PTR(-ESRCH); mm = mm_access(task, mode | PTRACE_MODE_FSCREDS); put_task_struct(task); if (IS_ERR(mm)) return mm == ERR_PTR(-ESRCH) ? NULL : mm; /* ensure this mm_struct can't be freed */ mmgrab(mm); /* but do not pin its memory */ mmput(mm); return mm; } static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) { struct mm_struct *mm = proc_mem_open(inode, mode); if (IS_ERR_OR_NULL(mm)) return mm ? 
PTR_ERR(mm) : -ESRCH; file->private_data = mm; return 0; } static int mem_open(struct inode *inode, struct file *file) { if (WARN_ON_ONCE(!(file->f_op->fop_flags & FOP_UNSIGNED_OFFSET))) return -EINVAL; return __mem_open(inode, file, PTRACE_MODE_ATTACH); } static bool proc_mem_foll_force(struct file *file, struct mm_struct *mm) { struct task_struct *task; bool ptrace_active = false; switch (proc_mem_force_override) { case PROC_MEM_FORCE_NEVER: return false; case PROC_MEM_FORCE_PTRACE: task = get_proc_task(file_inode(file)); if (task) { ptrace_active = READ_ONCE(task->ptrace) && READ_ONCE(task->mm) == mm && READ_ONCE(task->parent) == current; put_task_struct(task); } return ptrace_active; default: return true; } } static ssize_t mem_rw(struct file *file, char __user *buf, size_t count, loff_t *ppos, int write) { struct mm_struct *mm = file->private_data; unsigned long addr = *ppos; ssize_t copied; char *page; unsigned int flags; if (!mm) return 0; page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; copied = 0; if (!mmget_not_zero(mm)) goto free; flags = write ? FOLL_WRITE : 0; if (proc_mem_foll_force(file, mm)) flags |= FOLL_FORCE; while (count > 0) { size_t this_len = min_t(size_t, count, PAGE_SIZE); if (write && copy_from_user(page, buf, this_len)) { copied = -EFAULT; break; } this_len = access_remote_vm(mm, addr, page, this_len, flags); if (!this_len) { if (!copied) copied = -EIO; break; } if (!write && copy_to_user(buf, page, this_len)) { copied = -EFAULT; break; } buf += this_len; addr += this_len; copied += this_len; count -= this_len; } *ppos = addr; mmput(mm); free: free_page((unsigned long) page); return copied; } static ssize_t mem_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { return mem_rw(file, buf, count, ppos, 0); } static ssize_t mem_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { return mem_rw(file, (char __user*)buf, count, ppos, 1); } loff_t mem_lseek(struct file *file, loff_t offset, int orig) { switch (orig) { case 0: file->f_pos = offset; break; case 1: file->f_pos += offset; break; default: return -EINVAL; } force_successful_syscall_return(); return file->f_pos; } static int mem_release(struct inode *inode, struct file *file) { struct mm_struct *mm = file->private_data; if (mm) mmdrop(mm); return 0; } static const struct file_operations proc_mem_operations = { .llseek = mem_lseek, .read = mem_read, .write = mem_write, .open = mem_open, .release = mem_release, .fop_flags = FOP_UNSIGNED_OFFSET, }; static int environ_open(struct inode *inode, struct file *file) { return __mem_open(inode, file, PTRACE_MODE_READ); } static ssize_t environ_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { char *page; unsigned long src = *ppos; int ret = 0; struct mm_struct *mm = file->private_data; unsigned long env_start, env_end; /* Ensure the process spawned far enough to have an environment. 
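 * (env_end should still be zero for a task that has not finished setting up
 * its stack during execve, so such a task reads back as empty here.)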
*/ if (!mm || !mm->env_end) return 0; page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; ret = 0; if (!mmget_not_zero(mm)) goto free; spin_lock(&mm->arg_lock); env_start = mm->env_start; env_end = mm->env_end; spin_unlock(&mm->arg_lock); while (count > 0) { size_t this_len, max_len; int retval; if (src >= (env_end - env_start)) break; this_len = env_end - (env_start + src); max_len = min_t(size_t, PAGE_SIZE, count); this_len = min(max_len, this_len); retval = access_remote_vm(mm, (env_start + src), page, this_len, FOLL_ANON); if (retval <= 0) { ret = retval; break; } if (copy_to_user(buf, page, retval)) { ret = -EFAULT; break; } ret += retval; src += retval; buf += retval; count -= retval; } *ppos = src; mmput(mm); free: free_page((unsigned long) page); return ret; } static const struct file_operations proc_environ_operations = { .open = environ_open, .read = environ_read, .llseek = generic_file_llseek, .release = mem_release, }; static int auxv_open(struct inode *inode, struct file *file) { return __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS); } static ssize_t auxv_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct mm_struct *mm = file->private_data; unsigned int nwords = 0; if (!mm) return 0; do { nwords += 2; } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ return simple_read_from_buffer(buf, count, ppos, mm->saved_auxv, nwords * sizeof(mm->saved_auxv[0])); } static const struct file_operations proc_auxv_operations = { .open = auxv_open, .read = auxv_read, .llseek = generic_file_llseek, .release = mem_release, }; static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task = get_proc_task(file_inode(file)); char buffer[PROC_NUMBUF]; int oom_adj = OOM_ADJUST_MIN; size_t len; if (!task) return -ESRCH; if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX) oom_adj = OOM_ADJUST_MAX; else oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) / OOM_SCORE_ADJ_MAX; put_task_struct(task); if (oom_adj > OOM_ADJUST_MAX) oom_adj = OOM_ADJUST_MAX; len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj); return simple_read_from_buffer(buf, count, ppos, buffer, len); } static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) { struct mm_struct *mm = NULL; struct task_struct *task; int err = 0; task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; mutex_lock(&oom_adj_mutex); if (legacy) { if (oom_adj < task->signal->oom_score_adj && !capable(CAP_SYS_RESOURCE)) { err = -EACCES; goto err_unlock; } /* * /proc/pid/oom_adj is provided for legacy purposes, ask users to use * /proc/pid/oom_score_adj instead. */ pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", current->comm, task_pid_nr(current), task_pid_nr(task), task_pid_nr(task)); } else { if ((short)oom_adj < task->signal->oom_score_adj_min && !capable(CAP_SYS_RESOURCE)) { err = -EACCES; goto err_unlock; } } /* * Make sure we will check other processes sharing the mm if this is * not vfrok which wants its own oom_score_adj. 
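 * (When the target is still in the middle of vfork(), vfork_done is set and
 * the shared mm is not pinned below, so the adjustment is not propagated to
 * the mm's other users.)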
* pin the mm so it doesn't go away and get reused after task_unlock */ if (!task->vfork_done) { struct task_struct *p = find_lock_task_mm(task); if (p) { if (mm_flags_test(MMF_MULTIPROCESS, p->mm)) { mm = p->mm; mmgrab(mm); } task_unlock(p); } } task->signal->oom_score_adj = oom_adj; if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE)) task->signal->oom_score_adj_min = (short)oom_adj; trace_oom_score_adj_update(task); if (mm) { struct task_struct *p; rcu_read_lock(); for_each_process(p) { if (same_thread_group(task, p)) continue; /* do not touch kernel threads or the global init */ if (p->flags & PF_KTHREAD || is_global_init(p)) continue; task_lock(p); if (!p->vfork_done && process_shares_mm(p, mm)) { p->signal->oom_score_adj = oom_adj; if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE)) p->signal->oom_score_adj_min = (short)oom_adj; } task_unlock(p); } rcu_read_unlock(); mmdrop(mm); } err_unlock: mutex_unlock(&oom_adj_mutex); put_task_struct(task); return err; } /* * /proc/pid/oom_adj exists solely for backwards compatibility with previous * kernels. The effective policy is defined by oom_score_adj, which has a * different scale: oom_adj grew exponentially and oom_score_adj grows linearly. * Values written to oom_adj are simply mapped linearly to oom_score_adj. * Processes that become oom disabled via oom_adj will still be oom disabled * with this implementation. * * oom_adj cannot be removed since existing userspace binaries use it. */ static ssize_t oom_adj_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char buffer[PROC_NUMBUF] = {}; int oom_adj; int err; if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) { err = -EFAULT; goto out; } err = kstrtoint(strstrip(buffer), 0, &oom_adj); if (err) goto out; if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) && oom_adj != OOM_DISABLE) { err = -EINVAL; goto out; } /* * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum * value is always attainable. */ if (oom_adj == OOM_ADJUST_MAX) oom_adj = OOM_SCORE_ADJ_MAX; else oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; err = __set_oom_adj(file, oom_adj, true); out: return err < 0 ? err : count; } static const struct file_operations proc_oom_adj_operations = { .read = oom_adj_read, .write = oom_adj_write, .llseek = generic_file_llseek, }; static ssize_t oom_score_adj_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task = get_proc_task(file_inode(file)); char buffer[PROC_NUMBUF]; short oom_score_adj = OOM_SCORE_ADJ_MIN; size_t len; if (!task) return -ESRCH; oom_score_adj = task->signal->oom_score_adj; put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj); return simple_read_from_buffer(buf, count, ppos, buffer, len); } static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char buffer[PROC_NUMBUF] = {}; int oom_score_adj; int err; if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) { err = -EFAULT; goto out; } err = kstrtoint(strstrip(buffer), 0, &oom_score_adj); if (err) goto out; if (oom_score_adj < OOM_SCORE_ADJ_MIN || oom_score_adj > OOM_SCORE_ADJ_MAX) { err = -EINVAL; goto out; } err = __set_oom_adj(file, oom_score_adj, false); out: return err < 0 ? 
err : count; } static const struct file_operations proc_oom_score_adj_operations = { .read = oom_score_adj_read, .write = oom_score_adj_write, .llseek = default_llseek, }; #ifdef CONFIG_AUDIT #define TMPBUFLEN 11 static ssize_t proc_loginuid_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); struct task_struct *task = get_proc_task(inode); ssize_t length; char tmpbuf[TMPBUFLEN]; if (!task) return -ESRCH; length = scnprintf(tmpbuf, TMPBUFLEN, "%u", from_kuid(file->f_cred->user_ns, audit_get_loginuid(task))); put_task_struct(task); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); uid_t loginuid; kuid_t kloginuid; int rv; /* Don't let kthreads write their own loginuid */ if (current->flags & PF_KTHREAD) return -EPERM; rcu_read_lock(); if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { rcu_read_unlock(); return -EPERM; } rcu_read_unlock(); if (*ppos != 0) { /* No partial writes. */ return -EINVAL; } rv = kstrtou32_from_user(buf, count, 10, &loginuid); if (rv < 0) return rv; /* is userspace tring to explicitly UNSET the loginuid? */ if (loginuid == AUDIT_UID_UNSET) { kloginuid = INVALID_UID; } else { kloginuid = make_kuid(file->f_cred->user_ns, loginuid); if (!uid_valid(kloginuid)) return -EINVAL; } rv = audit_set_loginuid(kloginuid); if (rv < 0) return rv; return count; } static const struct file_operations proc_loginuid_operations = { .read = proc_loginuid_read, .write = proc_loginuid_write, .llseek = generic_file_llseek, }; static ssize_t proc_sessionid_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); struct task_struct *task = get_proc_task(inode); ssize_t length; char tmpbuf[TMPBUFLEN]; if (!task) return -ESRCH; length = scnprintf(tmpbuf, TMPBUFLEN, "%u", audit_get_sessionid(task)); put_task_struct(task); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } static const struct file_operations proc_sessionid_operations = { .read = proc_sessionid_read, .llseek = generic_file_llseek, }; #endif #ifdef CONFIG_FAULT_INJECTION static ssize_t proc_fault_inject_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { struct task_struct *task = get_proc_task(file_inode(file)); char buffer[PROC_NUMBUF]; size_t len; int make_it_fail; if (!task) return -ESRCH; make_it_fail = task->make_it_fail; put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail); return simple_read_from_buffer(buf, count, ppos, buffer, len); } static ssize_t proc_fault_inject_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct task_struct *task; char buffer[PROC_NUMBUF] = {}; int make_it_fail; int rv; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; rv = kstrtoint(strstrip(buffer), 0, &make_it_fail); if (rv < 0) return rv; if (make_it_fail < 0 || make_it_fail > 1) return -EINVAL; task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; task->make_it_fail = make_it_fail; put_task_struct(task); return count; } static const struct file_operations proc_fault_inject_operations = { .read = proc_fault_inject_read, .write = proc_fault_inject_write, .llseek = generic_file_llseek, }; static ssize_t proc_fail_nth_write(struct 
file *file, const char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; int err; unsigned int n; err = kstrtouint_from_user(buf, count, 0, &n); if (err) return err; task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; task->fail_nth = n; put_task_struct(task); return count; } static ssize_t proc_fail_nth_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; char numbuf[PROC_NUMBUF]; ssize_t len; task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth); put_task_struct(task); return simple_read_from_buffer(buf, count, ppos, numbuf, len); } static const struct file_operations proc_fail_nth_operations = { .read = proc_fail_nth_read, .write = proc_fail_nth_write, }; #endif /* * Print out various scheduling related per-task fields: */ static int sched_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct pid_namespace *ns = proc_pid_ns(inode->i_sb); struct task_struct *p; p = get_proc_task(inode); if (!p) return -ESRCH; proc_sched_show_task(p, ns, m); put_task_struct(p); return 0; } static ssize_t sched_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { struct inode *inode = file_inode(file); struct task_struct *p; p = get_proc_task(inode); if (!p) return -ESRCH; proc_sched_set_task(p); put_task_struct(p); return count; } static int sched_open(struct inode *inode, struct file *filp) { return single_open(filp, sched_show, inode); } static const struct file_operations proc_pid_sched_operations = { .open = sched_open, .read = seq_read, .write = sched_write, .llseek = seq_lseek, .release = single_release, }; #ifdef CONFIG_SCHED_AUTOGROUP /* * Print out autogroup related information: */ static int sched_autogroup_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct task_struct *p; p = get_proc_task(inode); if (!p) return -ESRCH; proc_sched_autogroup_show_task(p, m); put_task_struct(p); return 0; } static ssize_t sched_autogroup_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { struct inode *inode = file_inode(file); struct task_struct *p; char buffer[PROC_NUMBUF] = {}; int nice; int err; if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; err = kstrtoint(strstrip(buffer), 0, &nice); if (err < 0) return err; p = get_proc_task(inode); if (!p) return -ESRCH; err = proc_sched_autogroup_set_nice(p, nice); if (err) count = err; put_task_struct(p); return count; } static int sched_autogroup_open(struct inode *inode, struct file *filp) { int ret; ret = single_open(filp, sched_autogroup_show, NULL); if (!ret) { struct seq_file *m = filp->private_data; m->private = inode; } return ret; } static const struct file_operations proc_pid_sched_autogroup_operations = { .open = sched_autogroup_open, .read = seq_read, .write = sched_autogroup_write, .llseek = seq_lseek, .release = single_release, }; #endif /* CONFIG_SCHED_AUTOGROUP */ #ifdef CONFIG_TIME_NS static int timens_offsets_show(struct seq_file *m, void *v) { struct task_struct *p; p = get_proc_task(file_inode(m->file)); if (!p) return -ESRCH; proc_timens_show_offsets(p, m); put_task_struct(p); return 0; } static ssize_t timens_offsets_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct inode *inode = file_inode(file); struct proc_timens_offset offsets[2]; char *kbuf = NULL, *pos, *next_line; struct task_struct 
*p; int ret, noffsets; /* Only allow < page size writes at the beginning of the file */ if ((*ppos != 0) || (count >= PAGE_SIZE)) return -EINVAL; /* Slurp in the user data */ kbuf = memdup_user_nul(buf, count); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); /* Parse the user data */ ret = -EINVAL; noffsets = 0; for (pos = kbuf; pos; pos = next_line) { struct proc_timens_offset *off = &offsets[noffsets]; char clock[10]; int err; /* Find the end of line and ensure we don't look past it */ next_line = strchr(pos, '\n'); if (next_line) { *next_line = '\0'; next_line++; if (*next_line == '\0') next_line = NULL; } err = sscanf(pos, "%9s %lld %lu", clock, &off->val.tv_sec, &off->val.tv_nsec); if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC) goto out; clock[sizeof(clock) - 1] = 0; if (strcmp(clock, "monotonic") == 0 || strcmp(clock, __stringify(CLOCK_MONOTONIC)) == 0) off->clockid = CLOCK_MONOTONIC; else if (strcmp(clock, "boottime") == 0 || strcmp(clock, __stringify(CLOCK_BOOTTIME)) == 0) off->clockid = CLOCK_BOOTTIME; else goto out; noffsets++; if (noffsets == ARRAY_SIZE(offsets)) { if (next_line) count = next_line - kbuf; break; } } ret = -ESRCH; p = get_proc_task(inode); if (!p) goto out; ret = proc_timens_set_offset(file, p, offsets, noffsets); put_task_struct(p); if (ret) goto out; ret = count; out: kfree(kbuf); return ret; } static int timens_offsets_open(struct inode *inode, struct file *filp) { return single_open(filp, timens_offsets_show, inode); } static const struct file_operations proc_timens_offsets_operations = { .open = timens_offsets_open, .read = seq_read, .write = timens_offsets_write, .llseek = seq_lseek, .release = single_release, }; #endif /* CONFIG_TIME_NS */ static ssize_t comm_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { struct inode *inode = file_inode(file); struct task_struct *p; char buffer[TASK_COMM_LEN] = {}; const size_t maxlen = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count)) return -EFAULT; p = get_proc_task(inode); if (!p) return -ESRCH; if (same_thread_group(current, p)) { set_task_comm(p, buffer); proc_comm_connector(p); } else count = -EINVAL; put_task_struct(p); return count; } static int comm_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct task_struct *p; p = get_proc_task(inode); if (!p) return -ESRCH; proc_task_name(m, p, false); seq_putc(m, '\n'); put_task_struct(p); return 0; } static int comm_open(struct inode *inode, struct file *filp) { return single_open(filp, comm_show, inode); } static const struct file_operations proc_pid_set_comm_operations = { .open = comm_open, .read = seq_read, .write = comm_write, .llseek = seq_lseek, .release = single_release, }; static int proc_exe_link(struct dentry *dentry, struct path *exe_path) { struct task_struct *task; struct file *exe_file; task = get_proc_task(d_inode(dentry)); if (!task) return -ENOENT; exe_file = get_task_exe_file(task); put_task_struct(task); if (exe_file) { *exe_path = exe_file->f_path; path_get(&exe_file->f_path); fput(exe_file); return 0; } else return -ENOENT; } static const char *proc_pid_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct path path; int error = -EACCES; if (!dentry) return ERR_PTR(-ECHILD); /* Are we allowed to snoop on the tasks file descriptors? 
*/ if (!proc_fd_access_allowed(inode)) goto out; error = PROC_I(inode)->op.proc_get_link(dentry, &path); if (error) goto out; error = nd_jump_link(&path); out: return ERR_PTR(error); } static int do_proc_readlink(const struct path *path, char __user *buffer, int buflen) { char *tmp = kmalloc(PATH_MAX, GFP_KERNEL); char *pathname; int len; if (!tmp) return -ENOMEM; pathname = d_path(path, tmp, PATH_MAX); len = PTR_ERR(pathname); if (IS_ERR(pathname)) goto out; len = tmp + PATH_MAX - 1 - pathname; if (len > buflen) len = buflen; if (copy_to_user(buffer, pathname, len)) len = -EFAULT; out: kfree(tmp); return len; } static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen) { int error = -EACCES; struct inode *inode = d_inode(dentry); struct path path; /* Are we allowed to snoop on the tasks file descriptors? */ if (!proc_fd_access_allowed(inode)) goto out; error = PROC_I(inode)->op.proc_get_link(dentry, &path); if (error) goto out; error = do_proc_readlink(&path, buffer, buflen); path_put(&path); out: return error; } const struct inode_operations proc_pid_link_inode_operations = { .readlink = proc_pid_readlink, .get_link = proc_pid_get_link, .setattr = proc_nochmod_setattr, }; /* building an inode */ void task_dump_owner(struct task_struct *task, umode_t mode, kuid_t *ruid, kgid_t *rgid) { /* Depending on the state of dumpable compute who should own a * proc file for a task. */ const struct cred *cred; kuid_t uid; kgid_t gid; if (unlikely(task->flags & PF_KTHREAD)) { *ruid = GLOBAL_ROOT_UID; *rgid = GLOBAL_ROOT_GID; return; } /* Default to the tasks effective ownership */ rcu_read_lock(); cred = __task_cred(task); uid = cred->euid; gid = cred->egid; rcu_read_unlock(); /* * Before the /proc/pid/status file was created the only way to read * the effective uid of a /process was to stat /proc/pid. Reading * /proc/pid/status is slow enough that procps and other packages * kept stating /proc/pid. To keep the rules in /proc simple I have * made this apply to all per process world readable and executable * directories. */ if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) { struct mm_struct *mm; task_lock(task); mm = task->mm; /* Make non-dumpable tasks owned by some root */ if (mm) { if (get_dumpable(mm) != SUID_DUMP_USER) { struct user_namespace *user_ns = mm->user_ns; uid = make_kuid(user_ns, 0); if (!uid_valid(uid)) uid = GLOBAL_ROOT_UID; gid = make_kgid(user_ns, 0); if (!gid_valid(gid)) gid = GLOBAL_ROOT_GID; } } else { uid = GLOBAL_ROOT_UID; gid = GLOBAL_ROOT_GID; } task_unlock(task); } *ruid = uid; *rgid = gid; } void proc_pid_evict_inode(struct proc_inode *ei) { struct pid *pid = ei->pid; if (S_ISDIR(ei->vfs_inode.i_mode)) { spin_lock(&pid->lock); hlist_del_init_rcu(&ei->sibling_inodes); spin_unlock(&pid->lock); } } struct inode *proc_pid_make_inode(struct super_block *sb, struct task_struct *task, umode_t mode) { struct inode * inode; struct proc_inode *ei; struct pid *pid; /* We need a new inode */ inode = new_inode(sb); if (!inode) goto out; /* Common stuff */ ei = PROC_I(inode); inode->i_mode = mode; inode->i_ino = get_next_ino(); simple_inode_init_ts(inode); inode->i_op = &proc_def_inode_operations; /* * grab the reference to task. 
*/ pid = get_task_pid(task, PIDTYPE_PID); if (!pid) goto out_unlock; /* Let the pid remember us for quick removal */ ei->pid = pid; task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); security_task_to_inode(task, inode); out: return inode; out_unlock: iput(inode); return NULL; } /* * Generating an inode and adding it into @pid->inodes, so that task will * invalidate inode's dentry before being released. * * This helper is used for creating dir-type entries under '/proc' and * '/proc/<tgid>/task'. Other entries(eg. fd, stat) under '/proc/<tgid>' * can be released by invalidating '/proc/<tgid>' dentry. * In theory, dentries under '/proc/<tgid>/task' can also be released by * invalidating '/proc/<tgid>' dentry, we reserve it to handle single * thread exiting situation: Any one of threads should invalidate its * '/proc/<tgid>/task/<pid>' dentry before released. */ static struct inode *proc_pid_make_base_inode(struct super_block *sb, struct task_struct *task, umode_t mode) { struct inode *inode; struct proc_inode *ei; struct pid *pid; inode = proc_pid_make_inode(sb, task, mode); if (!inode) return NULL; /* Let proc_flush_pid find this directory inode */ ei = PROC_I(inode); pid = ei->pid; spin_lock(&pid->lock); hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes); spin_unlock(&pid->lock); return inode; } int pid_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct task_struct *task; generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->uid = GLOBAL_ROOT_UID; stat->gid = GLOBAL_ROOT_GID; rcu_read_lock(); task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { if (!has_pid_permissions(fs_info, task, HIDEPID_INVISIBLE)) { rcu_read_unlock(); /* * This doesn't prevent learning whether PID exists, * it only makes getattr() consistent with readdir(). */ return -ENOENT; } task_dump_owner(task, inode->i_mode, &stat->uid, &stat->gid); } rcu_read_unlock(); return 0; } /* dentry stuff */ /* * Set <pid>/... inode ownership (can change due to setuid(), etc.) */ void pid_update_inode(struct task_struct *task, struct inode *inode) { task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid); inode->i_mode &= ~(S_ISUID | S_ISGID); security_task_to_inode(task, inode); } /* * Rewrite the inode's ownerships here because the owning task may have * performed a setuid(), etc. * */ static int pid_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { struct inode *inode; struct task_struct *task; int ret = 0; rcu_read_lock(); inode = d_inode_rcu(dentry); if (!inode) goto out; task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { pid_update_inode(task, inode); ret = 1; } out: rcu_read_unlock(); return ret; } static inline bool proc_inode_is_dead(struct inode *inode) { return !proc_pid(inode)->tasks[PIDTYPE_PID].first; } int pid_delete_dentry(const struct dentry *dentry) { /* Is the task we represent dead? * If so, then don't put the dentry on the lru list, * kill it immediately. */ return proc_inode_is_dead(d_inode(dentry)); } const struct dentry_operations pid_dentry_operations = { .d_revalidate = pid_revalidate, .d_delete = pid_delete_dentry, }; /* Lookups */ /* * Fill a directory entry. * * If possible create the dcache entry and derive our inode number and * file type from dcache entry. 
* * Since all of the proc inode numbers are dynamically generated, the inode * numbers do not exist until the inode is cache. This means creating * the dcache entry in readdir is necessary to keep the inode numbers * reported by readdir in sync with the inode numbers reported * by stat. */ bool proc_fill_cache(struct file *file, struct dir_context *ctx, const char *name, unsigned int len, instantiate_t instantiate, struct task_struct *task, const void *ptr) { struct dentry *child, *dir = file->f_path.dentry; struct qstr qname = QSTR_INIT(name, len); struct inode *inode; unsigned type = DT_UNKNOWN; ino_t ino = 1; child = try_lookup_noperm(&qname, dir); if (IS_ERR(child)) goto end_instantiate; if (!child) { DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); child = d_alloc_parallel(dir, &qname, &wq); if (IS_ERR(child)) goto end_instantiate; if (d_in_lookup(child)) { struct dentry *res; res = instantiate(child, task, ptr); d_lookup_done(child); if (unlikely(res)) { dput(child); child = res; if (IS_ERR(child)) goto end_instantiate; } } } inode = d_inode(child); ino = inode->i_ino; type = inode->i_mode >> 12; dput(child); end_instantiate: return dir_emit(ctx, name, len, ino, type); } /* * dname_to_vma_addr - maps a dentry name into two unsigned longs * which represent vma start and end addresses. */ static int dname_to_vma_addr(struct dentry *dentry, unsigned long *start, unsigned long *end) { const char *str = dentry->d_name.name; unsigned long long sval, eval; unsigned int len; if (str[0] == '0' && str[1] != '-') return -EINVAL; len = _parse_integer(str, 16, &sval); if (len & KSTRTOX_OVERFLOW) return -EINVAL; if (sval != (unsigned long)sval) return -EINVAL; str += len; if (*str != '-') return -EINVAL; str++; if (str[0] == '0' && str[1]) return -EINVAL; len = _parse_integer(str, 16, &eval); if (len & KSTRTOX_OVERFLOW) return -EINVAL; if (eval != (unsigned long)eval) return -EINVAL; str += len; if (*str != '\0') return -EINVAL; *start = sval; *end = eval; return 0; } static int map_files_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { unsigned long vm_start, vm_end; bool exact_vma_exists = false; struct mm_struct *mm = NULL; struct task_struct *task; struct inode *inode; int status = 0; if (flags & LOOKUP_RCU) return -ECHILD; inode = d_inode(dentry); task = get_proc_task(inode); if (!task) goto out_notask; mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (IS_ERR(mm)) goto out; if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) { status = mmap_read_lock_killable(mm); if (!status) { exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end); mmap_read_unlock(mm); } } mmput(mm); if (exact_vma_exists) { task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); security_task_to_inode(task, inode); status = 1; } out: put_task_struct(task); out_notask: return status; } static const struct dentry_operations tid_map_files_dentry_operations = { .d_revalidate = map_files_d_revalidate, .d_delete = pid_delete_dentry, }; static int map_files_get_link(struct dentry *dentry, struct path *path) { unsigned long vm_start, vm_end; struct vm_area_struct *vma; struct task_struct *task; struct mm_struct *mm; int rc; rc = -ENOENT; task = get_proc_task(d_inode(dentry)); if (!task) goto out; mm = get_task_mm(task); put_task_struct(task); if (!mm) goto out; rc = dname_to_vma_addr(dentry, &vm_start, &vm_end); if (rc) goto out_mmput; rc = mmap_read_lock_killable(mm); if (rc) goto out_mmput; rc = -ENOENT; vma = find_exact_vma(mm, vm_start, vm_end); if (vma && vma->vm_file) { *path = 
*file_user_path(vma->vm_file); path_get(path); rc = 0; } mmap_read_unlock(mm); out_mmput: mmput(mm); out: return rc; } struct map_files_info { unsigned long start; unsigned long end; fmode_t mode; }; /* * Only allow CAP_SYS_ADMIN and CAP_CHECKPOINT_RESTORE to follow the links, due * to concerns about how the symlinks may be used to bypass permissions on * ancestor directories in the path to the file in question. */ static const char * proc_map_files_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { if (!checkpoint_restore_ns_capable(&init_user_ns)) return ERR_PTR(-EPERM); return proc_pid_get_link(dentry, inode, done); } /* * Identical to proc_pid_link_inode_operations except for get_link() */ static const struct inode_operations proc_map_files_link_inode_operations = { .readlink = proc_pid_readlink, .get_link = proc_map_files_get_link, .setattr = proc_nochmod_setattr, }; static struct dentry * proc_map_files_instantiate(struct dentry *dentry, struct task_struct *task, const void *ptr) { fmode_t mode = (fmode_t)(unsigned long)ptr; struct proc_inode *ei; struct inode *inode; inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | ((mode & FMODE_READ ) ? S_IRUSR : 0) | ((mode & FMODE_WRITE) ? S_IWUSR : 0)); if (!inode) return ERR_PTR(-ENOENT); ei = PROC_I(inode); ei->op.proc_get_link = map_files_get_link; inode->i_op = &proc_map_files_link_inode_operations; inode->i_size = 64; return proc_splice_unmountable(inode, dentry, &tid_map_files_dentry_operations); } static struct dentry *proc_map_files_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { unsigned long vm_start, vm_end; struct vm_area_struct *vma; struct task_struct *task; struct dentry *result; struct mm_struct *mm; result = ERR_PTR(-ENOENT); task = get_proc_task(dir); if (!task) goto out; result = ERR_PTR(-EACCES); if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out_put_task; result = ERR_PTR(-ENOENT); if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) goto out_put_task; mm = get_task_mm(task); if (!mm) goto out_put_task; result = ERR_PTR(-EINTR); if (mmap_read_lock_killable(mm)) goto out_put_mm; result = ERR_PTR(-ENOENT); vma = find_exact_vma(mm, vm_start, vm_end); if (!vma) goto out_no_vma; if (vma->vm_file) result = proc_map_files_instantiate(dentry, task, (void *)(unsigned long)vma->vm_file->f_mode); out_no_vma: mmap_read_unlock(mm); out_put_mm: mmput(mm); out_put_task: put_task_struct(task); out: return result; } static const struct inode_operations proc_map_files_inode_operations = { .lookup = proc_map_files_lookup, .permission = proc_fd_permission, .setattr = proc_nochmod_setattr, }; static int proc_map_files_readdir(struct file *file, struct dir_context *ctx) { struct vm_area_struct *vma; struct task_struct *task; struct mm_struct *mm; unsigned long nr_files, pos, i; GENRADIX(struct map_files_info) fa; struct map_files_info *p; int ret; struct vma_iterator vmi; genradix_init(&fa); ret = -ENOENT; task = get_proc_task(file_inode(file)); if (!task) goto out; ret = -EACCES; if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out_put_task; ret = 0; if (!dir_emit_dots(file, ctx)) goto out_put_task; mm = get_task_mm(task); if (!mm) goto out_put_task; ret = mmap_read_lock_killable(mm); if (ret) { mmput(mm); goto out_put_task; } nr_files = 0; /* * We need two passes here: * * 1) Collect vmas of mapped files with mmap_lock taken * 2) Release mmap_lock and instantiate entries * * otherwise we get lockdep complained, since filldir() * routine might require 
mmap_lock taken in might_fault(). */ pos = 2; vma_iter_init(&vmi, mm, 0); for_each_vma(vmi, vma) { if (!vma->vm_file) continue; if (++pos <= ctx->pos) continue; p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL); if (!p) { ret = -ENOMEM; mmap_read_unlock(mm); mmput(mm); goto out_put_task; } p->start = vma->vm_start; p->end = vma->vm_end; p->mode = vma->vm_file->f_mode; } mmap_read_unlock(mm); mmput(mm); for (i = 0; i < nr_files; i++) { char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */ unsigned int len; p = genradix_ptr(&fa, i); len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end); if (!proc_fill_cache(file, ctx, buf, len, proc_map_files_instantiate, task, (void *)(unsigned long)p->mode)) break; ctx->pos++; } out_put_task: put_task_struct(task); out: genradix_free(&fa); return ret; } static const struct file_operations proc_map_files_operations = { .read = generic_read_dir, .iterate_shared = proc_map_files_readdir, .llseek = generic_file_llseek, }; #if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS) struct timers_private { struct pid *pid; struct task_struct *task; struct pid_namespace *ns; }; static void *timers_start(struct seq_file *m, loff_t *pos) { struct timers_private *tp = m->private; tp->task = get_pid_task(tp->pid, PIDTYPE_PID); if (!tp->task) return ERR_PTR(-ESRCH); rcu_read_lock(); return seq_hlist_start_rcu(&tp->task->signal->posix_timers, *pos); } static void *timers_next(struct seq_file *m, void *v, loff_t *pos) { struct timers_private *tp = m->private; return seq_hlist_next_rcu(v, &tp->task->signal->posix_timers, pos); } static void timers_stop(struct seq_file *m, void *v) { struct timers_private *tp = m->private; if (tp->task) { put_task_struct(tp->task); tp->task = NULL; rcu_read_unlock(); } } static int show_timer(struct seq_file *m, void *v) { static const char * const nstr[] = { [SIGEV_SIGNAL] = "signal", [SIGEV_NONE] = "none", [SIGEV_THREAD] = "thread", }; struct k_itimer *timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list); struct timers_private *tp = m->private; int notify = timer->it_sigev_notify; guard(spinlock_irq)(&timer->it_lock); if (!posixtimer_valid(timer)) return 0; seq_printf(m, "ID: %d\n", timer->it_id); seq_printf(m, "signal: %d/%px\n", timer->sigq.info.si_signo, timer->sigq.info.si_value.sival_ptr); seq_printf(m, "notify: %s/%s.%d\n", nstr[notify & ~SIGEV_THREAD_ID], (notify & SIGEV_THREAD_ID) ? 
"tid" : "pid", pid_nr_ns(timer->it_pid, tp->ns)); seq_printf(m, "ClockID: %d\n", timer->it_clock); return 0; } static const struct seq_operations proc_timers_seq_ops = { .start = timers_start, .next = timers_next, .stop = timers_stop, .show = show_timer, }; static int proc_timers_open(struct inode *inode, struct file *file) { struct timers_private *tp; tp = __seq_open_private(file, &proc_timers_seq_ops, sizeof(struct timers_private)); if (!tp) return -ENOMEM; tp->pid = proc_pid(inode); tp->ns = proc_pid_ns(inode->i_sb); return 0; } static const struct file_operations proc_timers_operations = { .open = proc_timers_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_private, }; #endif static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { struct inode *inode = file_inode(file); struct task_struct *p; u64 slack_ns; int err; err = kstrtoull_from_user(buf, count, 10, &slack_ns); if (err < 0) return err; p = get_proc_task(inode); if (!p) return -ESRCH; if (p != current) { rcu_read_lock(); if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { rcu_read_unlock(); count = -EPERM; goto out; } rcu_read_unlock(); err = security_task_setscheduler(p); if (err) { count = err; goto out; } } task_lock(p); if (rt_or_dl_task_policy(p)) slack_ns = 0; else if (slack_ns == 0) slack_ns = p->default_timer_slack_ns; p->timer_slack_ns = slack_ns; task_unlock(p); out: put_task_struct(p); return count; } static int timerslack_ns_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct task_struct *p; int err = 0; p = get_proc_task(inode); if (!p) return -ESRCH; if (p != current) { rcu_read_lock(); if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { rcu_read_unlock(); err = -EPERM; goto out; } rcu_read_unlock(); err = security_task_getscheduler(p); if (err) goto out; } task_lock(p); seq_printf(m, "%llu\n", p->timer_slack_ns); task_unlock(p); out: put_task_struct(p); return err; } static int timerslack_ns_open(struct inode *inode, struct file *filp) { return single_open(filp, timerslack_ns_show, inode); } static const struct file_operations proc_pid_set_timerslack_ns_operations = { .open = timerslack_ns_open, .read = seq_read, .write = timerslack_ns_write, .llseek = seq_lseek, .release = single_release, }; static struct dentry *proc_pident_instantiate(struct dentry *dentry, struct task_struct *task, const void *ptr) { const struct pid_entry *p = ptr; struct inode *inode; struct proc_inode *ei; inode = proc_pid_make_inode(dentry->d_sb, task, p->mode); if (!inode) return ERR_PTR(-ENOENT); ei = PROC_I(inode); if (S_ISDIR(inode->i_mode)) set_nlink(inode, 2); /* Use getattr to fix if necessary */ if (p->iop) inode->i_op = p->iop; if (p->fop) inode->i_fop = p->fop; ei->op = p->op; pid_update_inode(task, inode); return d_splice_alias_ops(inode, dentry, &pid_dentry_operations); } static struct dentry *proc_pident_lookup(struct inode *dir, struct dentry *dentry, const struct pid_entry *p, const struct pid_entry *end) { struct task_struct *task = get_proc_task(dir); struct dentry *res = ERR_PTR(-ENOENT); if (!task) goto out_no_task; /* * Yes, it does not scale. And it should not. Don't add * new entries into /proc/<tgid>/ without very good reasons. 
*/ for (; p < end; p++) { if (p->len != dentry->d_name.len) continue; if (!memcmp(dentry->d_name.name, p->name, p->len)) { res = proc_pident_instantiate(dentry, task, p); break; } } put_task_struct(task); out_no_task: return res; } static int proc_pident_readdir(struct file *file, struct dir_context *ctx, const struct pid_entry *ents, unsigned int nents) { struct task_struct *task = get_proc_task(file_inode(file)); const struct pid_entry *p; if (!task) return -ENOENT; if (!dir_emit_dots(file, ctx)) goto out; if (ctx->pos >= nents + 2) goto out; for (p = ents + (ctx->pos - 2); p < ents + nents; p++) { if (!proc_fill_cache(file, ctx, p->name, p->len, proc_pident_instantiate, task, p)) break; ctx->pos++; } out: put_task_struct(task); return 0; } #ifdef CONFIG_SECURITY static int proc_pid_attr_open(struct inode *inode, struct file *file) { file->private_data = NULL; __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS); return 0; } static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); char *p = NULL; ssize_t length; struct task_struct *task = get_proc_task(inode); if (!task) return -ESRCH; length = security_getprocattr(task, PROC_I(inode)->op.lsmid, file->f_path.dentry->d_name.name, &p); put_task_struct(task); if (length > 0) length = simple_read_from_buffer(buf, count, ppos, p, length); kfree(p); return length; } static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); struct task_struct *task; void *page; int rv; /* A task may only write when it was the opener. */ if (file->private_data != current->mm) return -EPERM; rcu_read_lock(); task = pid_task(proc_pid(inode), PIDTYPE_PID); if (!task) { rcu_read_unlock(); return -ESRCH; } /* A task may only write its own attributes. */ if (current != task) { rcu_read_unlock(); return -EACCES; } /* Prevent changes to overridden credentials. */ if (current_cred() != current_real_cred()) { rcu_read_unlock(); return -EBUSY; } rcu_read_unlock(); if (count > PAGE_SIZE) count = PAGE_SIZE; /* No partial writes. 
*/ if (*ppos != 0) return -EINVAL; page = memdup_user(buf, count); if (IS_ERR(page)) { rv = PTR_ERR(page); goto out; } /* Guard against adverse ptrace interaction */ rv = mutex_lock_interruptible(&current->signal->cred_guard_mutex); if (rv < 0) goto out_free; rv = security_setprocattr(PROC_I(inode)->op.lsmid, file->f_path.dentry->d_name.name, page, count); mutex_unlock(&current->signal->cred_guard_mutex); out_free: kfree(page); out: return rv; } static const struct file_operations proc_pid_attr_operations = { .open = proc_pid_attr_open, .read = proc_pid_attr_read, .write = proc_pid_attr_write, .llseek = generic_file_llseek, .release = mem_release, }; #define LSM_DIR_OPS(LSM) \ static int proc_##LSM##_attr_dir_iterate(struct file *filp, \ struct dir_context *ctx) \ { \ return proc_pident_readdir(filp, ctx, \ LSM##_attr_dir_stuff, \ ARRAY_SIZE(LSM##_attr_dir_stuff)); \ } \ \ static const struct file_operations proc_##LSM##_attr_dir_ops = { \ .read = generic_read_dir, \ .iterate_shared = proc_##LSM##_attr_dir_iterate, \ .llseek = default_llseek, \ }; \ \ static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \ struct dentry *dentry, unsigned int flags) \ { \ return proc_pident_lookup(dir, dentry, \ LSM##_attr_dir_stuff, \ LSM##_attr_dir_stuff + ARRAY_SIZE(LSM##_attr_dir_stuff)); \ } \ \ static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \ .lookup = proc_##LSM##_attr_dir_lookup, \ .getattr = pid_getattr, \ .setattr = proc_nochmod_setattr, \ } #ifdef CONFIG_SECURITY_SMACK static const struct pid_entry smack_attr_dir_stuff[] = { ATTR(LSM_ID_SMACK, "current", 0666), }; LSM_DIR_OPS(smack); #endif #ifdef CONFIG_SECURITY_APPARMOR static const struct pid_entry apparmor_attr_dir_stuff[] = { ATTR(LSM_ID_APPARMOR, "current", 0666), ATTR(LSM_ID_APPARMOR, "prev", 0444), ATTR(LSM_ID_APPARMOR, "exec", 0666), }; LSM_DIR_OPS(apparmor); #endif static const struct pid_entry attr_dir_stuff[] = { ATTR(LSM_ID_UNDEF, "current", 0666), ATTR(LSM_ID_UNDEF, "prev", 0444), ATTR(LSM_ID_UNDEF, "exec", 0666), ATTR(LSM_ID_UNDEF, "fscreate", 0666), ATTR(LSM_ID_UNDEF, "keycreate", 0666), ATTR(LSM_ID_UNDEF, "sockcreate", 0666), #ifdef CONFIG_SECURITY_SMACK DIR("smack", 0555, proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops), #endif #ifdef CONFIG_SECURITY_APPARMOR DIR("apparmor", 0555, proc_apparmor_attr_dir_inode_ops, proc_apparmor_attr_dir_ops), #endif }; static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx) { return proc_pident_readdir(file, ctx, attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); } static const struct file_operations proc_attr_dir_operations = { .read = generic_read_dir, .iterate_shared = proc_attr_dir_readdir, .llseek = generic_file_llseek, }; static struct dentry *proc_attr_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, attr_dir_stuff, attr_dir_stuff + ARRAY_SIZE(attr_dir_stuff)); } static const struct inode_operations proc_attr_dir_inode_operations = { .lookup = proc_attr_dir_lookup, .getattr = pid_getattr, .setattr = proc_nochmod_setattr, }; #endif #ifdef CONFIG_ELF_CORE static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task = get_proc_task(file_inode(file)); struct mm_struct *mm; char buffer[PROC_NUMBUF]; size_t len; int ret; if (!task) return -ESRCH; ret = 0; mm = get_task_mm(task); if (mm) { unsigned long flags = __mm_flags_get_dumpable(mm); len = snprintf(buffer, sizeof(buffer), "%08lx\n", ((flags &
MMF_DUMP_FILTER_MASK) >> MMF_DUMP_FILTER_SHIFT)); mmput(mm); ret = simple_read_from_buffer(buf, count, ppos, buffer, len); } put_task_struct(task); return ret; } static ssize_t proc_coredump_filter_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; struct mm_struct *mm; unsigned int val; int ret; int i; unsigned long mask; ret = kstrtouint_from_user(buf, count, 0, &val); if (ret < 0) return ret; ret = -ESRCH; task = get_proc_task(file_inode(file)); if (!task) goto out_no_task; mm = get_task_mm(task); if (!mm) goto out_no_mm; ret = 0; for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) { if (val & mask) mm_flags_set(i + MMF_DUMP_FILTER_SHIFT, mm); else mm_flags_clear(i + MMF_DUMP_FILTER_SHIFT, mm); } mmput(mm); out_no_mm: put_task_struct(task); out_no_task: if (ret < 0) return ret; return count; } static const struct file_operations proc_coredump_filter_operations = { .read = proc_coredump_filter_read, .write = proc_coredump_filter_write, .llseek = generic_file_llseek, }; #endif #ifdef CONFIG_TASK_IO_ACCOUNTING static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole) { struct task_io_accounting acct; int result; result = down_read_killable(&task->signal->exec_update_lock); if (result) return result; if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { result = -EACCES; goto out_unlock; } if (whole) { struct signal_struct *sig = task->signal; struct task_struct *t; guard(rcu)(); scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) { acct = sig->ioac; __for_each_thread(sig, t) task_io_accounting_add(&acct, &t->ioac); } } else { acct = task->ioac; } seq_printf(m, "rchar: %llu\n" "wchar: %llu\n" "syscr: %llu\n" "syscw: %llu\n" "read_bytes: %llu\n" "write_bytes: %llu\n" "cancelled_write_bytes: %llu\n", (unsigned long long)acct.rchar, (unsigned long long)acct.wchar, (unsigned long long)acct.syscr, (unsigned long long)acct.syscw, (unsigned long long)acct.read_bytes, (unsigned long long)acct.write_bytes, (unsigned long long)acct.cancelled_write_bytes); result = 0; out_unlock: up_read(&task->signal->exec_update_lock); return result; } static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { return do_io_accounting(task, m, 0); } static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { return do_io_accounting(task, m, 1); } #endif /* CONFIG_TASK_IO_ACCOUNTING */ #ifdef CONFIG_USER_NS static int proc_id_map_open(struct inode *inode, struct file *file, const struct seq_operations *seq_ops) { struct user_namespace *ns = NULL; struct task_struct *task; struct seq_file *seq; int ret = -EINVAL; task = get_proc_task(inode); if (task) { rcu_read_lock(); ns = get_user_ns(task_cred_xxx(task, user_ns)); rcu_read_unlock(); put_task_struct(task); } if (!ns) goto err; ret = seq_open(file, seq_ops); if (ret) goto err_put_ns; seq = file->private_data; seq->private = ns; return 0; err_put_ns: put_user_ns(ns); err: return ret; } static int proc_id_map_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; put_user_ns(ns); return seq_release(inode, file); } static int proc_uid_map_open(struct inode *inode, struct file *file) { return proc_id_map_open(inode, file, &proc_uid_seq_operations); } static int proc_gid_map_open(struct inode *inode, struct file *file) { return proc_id_map_open(inode, file, 
&proc_gid_seq_operations); } static int proc_projid_map_open(struct inode *inode, struct file *file) { return proc_id_map_open(inode, file, &proc_projid_seq_operations); } static const struct file_operations proc_uid_map_operations = { .open = proc_uid_map_open, .write = proc_uid_map_write, .read = seq_read, .llseek = seq_lseek, .release = proc_id_map_release, }; static const struct file_operations proc_gid_map_operations = { .open = proc_gid_map_open, .write = proc_gid_map_write, .read = seq_read, .llseek = seq_lseek, .release = proc_id_map_release, }; static const struct file_operations proc_projid_map_operations = { .open = proc_projid_map_open, .write = proc_projid_map_write, .read = seq_read, .llseek = seq_lseek, .release = proc_id_map_release, }; static int proc_setgroups_open(struct inode *inode, struct file *file) { struct user_namespace *ns = NULL; struct task_struct *task; int ret; ret = -ESRCH; task = get_proc_task(inode); if (task) { rcu_read_lock(); ns = get_user_ns(task_cred_xxx(task, user_ns)); rcu_read_unlock(); put_task_struct(task); } if (!ns) goto err; if (file->f_mode & FMODE_WRITE) { ret = -EACCES; if (!ns_capable(ns, CAP_SYS_ADMIN)) goto err_put_ns; } ret = single_open(file, &proc_setgroups_show, ns); if (ret) goto err_put_ns; return 0; err_put_ns: put_user_ns(ns); err: return ret; } static int proc_setgroups_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; int ret = single_release(inode, file); put_user_ns(ns); return ret; } static const struct file_operations proc_setgroups_operations = { .open = proc_setgroups_open, .write = proc_setgroups_write, .read = seq_read, .llseek = seq_lseek, .release = proc_setgroups_release, }; #endif /* CONFIG_USER_NS */ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { int err = lock_trace(task); if (!err) { seq_printf(m, "%08x\n", task->personality); unlock_trace(task); } return err; } #ifdef CONFIG_LIVEPATCH static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { seq_printf(m, "%d\n", task->patch_state); return 0; } #endif /* CONFIG_LIVEPATCH */ #ifdef CONFIG_KSM static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { struct mm_struct *mm; mm = get_task_mm(task); if (mm) { seq_printf(m, "%lu\n", mm->ksm_merging_pages); mmput(mm); } return 0; } static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { struct mm_struct *mm; int ret = 0; mm = get_task_mm(task); if (mm) { seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); seq_printf(m, "ksm_zero_pages %ld\n", mm_ksm_zero_pages(mm)); seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages); seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); seq_printf(m, "ksm_merge_any: %s\n", mm_flags_test(MMF_VM_MERGE_ANY, mm) ? "yes" : "no"); ret = mmap_read_lock_killable(mm); if (ret) { mmput(mm); return ret; } seq_printf(m, "ksm_mergeable: %s\n", ksm_process_mergeable(mm) ? 
"yes" : "no"); mmap_read_unlock(mm); mmput(mm); } return 0; } #endif /* CONFIG_KSM */ #ifdef CONFIG_KSTACK_ERASE_METRICS static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long prev_depth = THREAD_SIZE - (task->prev_lowest_stack & (THREAD_SIZE - 1)); unsigned long depth = THREAD_SIZE - (task->lowest_stack & (THREAD_SIZE - 1)); seq_printf(m, "previous stack depth: %lu\nstack depth: %lu\n", prev_depth, depth); return 0; } #endif /* CONFIG_KSTACK_ERASE_METRICS */ /* * Thread groups */ static const struct file_operations proc_task_operations; static const struct inode_operations proc_task_inode_operations; static const struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), #endif REG("environ", S_IRUSR, proc_environ_operations), REG("auxv", S_IRUSR, proc_auxv_operations), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #ifdef CONFIG_SCHED_AUTOGROUP REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), #endif #ifdef CONFIG_TIME_NS REG("timens_offsets", S_IRUGO|S_IWUSR, proc_timens_offsets_operations), #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK ONE("syscall", S_IRUSR, proc_pid_syscall), #endif REG("cmdline", S_IRUGO, proc_pid_cmdline_ops), ONE("stat", S_IRUGO, proc_tgid_stat), ONE("statm", S_IRUGO, proc_pid_statm), REG("maps", S_IRUGO, proc_pid_maps_operations), #ifdef CONFIG_NUMA REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations), #endif REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), LNK("cwd", proc_cwd_link), LNK("root", proc_root_link), LNK("exe", proc_exe_link), REG("mounts", S_IRUGO, proc_mounts_operations), REG("mountinfo", S_IRUGO, proc_mountinfo_operations), REG("mountstats", S_IRUSR, proc_mountstats_operations), #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), #endif #ifdef CONFIG_KALLSYMS ONE("wchan", S_IRUGO, proc_pid_wchan), #endif #ifdef CONFIG_STACKTRACE ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHED_INFO ONE("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP REG("latency", S_IRUGO, proc_lstats_operations), #endif #ifdef CONFIG_PROC_PID_CPUSET ONE("cpuset", S_IRUGO, proc_cpuset_show), #endif #ifdef CONFIG_CGROUPS ONE("cgroup", S_IRUGO, proc_cgroup_show), #endif #ifdef CONFIG_PROC_CPU_RESCTRL ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show), #endif ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDIT 
REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUGO, proc_sessionid_operations), #endif #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), REG("fail-nth", 0644, proc_fail_nth_operations), #endif #ifdef CONFIG_ELF_CORE REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING ONE("io", S_IRUSR, proc_tgid_io_accounting), #endif #ifdef CONFIG_USER_NS REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), #endif #if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS) REG("timers", S_IRUGO, proc_timers_operations), #endif REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations), #ifdef CONFIG_LIVEPATCH ONE("patch_state", S_IRUSR, proc_pid_patch_state), #endif #ifdef CONFIG_KSTACK_ERASE_METRICS ONE("stack_depth", S_IRUGO, proc_stack_depth), #endif #ifdef CONFIG_PROC_PID_ARCH_STATUS ONE("arch_status", S_IRUGO, proc_pid_arch_status), #endif #ifdef CONFIG_SECCOMP_CACHE_DEBUG ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), #endif #ifdef CONFIG_KSM ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), #endif }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) { return proc_pident_readdir(file, ctx, tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); } static const struct file_operations proc_tgid_base_operations = { .read = generic_read_dir, .iterate_shared = proc_tgid_base_readdir, .llseek = generic_file_llseek, }; struct pid *tgid_pidfd_to_pid(const struct file *file) { if (file->f_op != &proc_tgid_base_operations) return ERR_PTR(-EBADF); return proc_pid(file_inode(file)); } static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, tgid_base_stuff, tgid_base_stuff + ARRAY_SIZE(tgid_base_stuff)); } static const struct inode_operations proc_tgid_base_inode_operations = { .lookup = proc_tgid_base_lookup, .getattr = pid_getattr, .setattr = proc_nochmod_setattr, .permission = proc_pid_permission, }; /** * proc_flush_pid - Remove dcache entries for @pid from the /proc dcache. * @pid: pid that should be flushed. * * This function walks a list of inodes (that belong to any proc * filesystem) that are attached to the pid and flushes them from * the dentry cache. * * It is safe and reasonable to cache /proc entries for a task until * that task exits. After that they just clog up the dcache with * useless entries, possibly causing useful dcache entries to be * flushed instead. This routine is provided to flush those useless * dcache entries when a process is reaped. * * NOTE: This routine is just an optimization so it does not guarantee * that no dcache entries will exist after a process is reaped * it just makes it very unlikely that any will persist. 
*/ void proc_flush_pid(struct pid *pid) { proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock); } static struct dentry *proc_pid_instantiate(struct dentry * dentry, struct task_struct *task, const void *ptr) { struct inode *inode; inode = proc_pid_make_base_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) return ERR_PTR(-ENOENT); inode->i_op = &proc_tgid_base_inode_operations; inode->i_fop = &proc_tgid_base_operations; inode->i_flags|=S_IMMUTABLE; set_nlink(inode, nlink_tgid); pid_update_inode(task, inode); return d_splice_alias_ops(inode, dentry, &pid_dentry_operations); } struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags) { struct task_struct *task; unsigned tgid; struct proc_fs_info *fs_info; struct pid_namespace *ns; struct dentry *result = ERR_PTR(-ENOENT); tgid = name_to_int(&dentry->d_name); if (tgid == ~0U) goto out; fs_info = proc_sb_info(dentry->d_sb); ns = fs_info->pid_ns; rcu_read_lock(); task = find_task_by_pid_ns(tgid, ns); if (task) get_task_struct(task); rcu_read_unlock(); if (!task) goto out; /* Limit procfs to only ptraceable tasks */ if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE) { if (!has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS)) goto out_put_task; } result = proc_pid_instantiate(dentry, task, NULL); out_put_task: put_task_struct(task); out: return result; } /* * Find the first task with tgid >= tgid * */ struct tgid_iter { unsigned int tgid; struct task_struct *task; }; static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter) { struct pid *pid; if (iter.task) put_task_struct(iter.task); rcu_read_lock(); retry: iter.task = NULL; pid = find_ge_pid(iter.tgid, ns); if (pid) { iter.tgid = pid_nr_ns(pid, ns); iter.task = pid_task(pid, PIDTYPE_TGID); if (!iter.task) { iter.tgid += 1; goto retry; } get_task_struct(iter.task); } rcu_read_unlock(); return iter; } #define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2) /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file *file, struct dir_context *ctx) { struct tgid_iter iter; struct proc_fs_info *fs_info = proc_sb_info(file_inode(file)->i_sb); struct pid_namespace *ns = proc_pid_ns(file_inode(file)->i_sb); loff_t pos = ctx->pos; if (pos >= PID_MAX_LIMIT + TGID_OFFSET) return 0; if (pos == TGID_OFFSET - 2) { if (!dir_emit(ctx, "self", 4, self_inum, DT_LNK)) return 0; ctx->pos = pos = pos + 1; } if (pos == TGID_OFFSET - 1) { if (!dir_emit(ctx, "thread-self", 11, thread_self_inum, DT_LNK)) return 0; ctx->pos = pos = pos + 1; } iter.tgid = pos - TGID_OFFSET; iter.task = NULL; for (iter = next_tgid(ns, iter); iter.task; iter.tgid += 1, iter = next_tgid(ns, iter)) { char name[10 + 1]; unsigned int len; cond_resched(); if (!has_pid_permissions(fs_info, iter.task, HIDEPID_INVISIBLE)) continue; len = snprintf(name, sizeof(name), "%u", iter.tgid); ctx->pos = iter.tgid + TGID_OFFSET; if (!proc_fill_cache(file, ctx, name, len, proc_pid_instantiate, iter.task, NULL)) { put_task_struct(iter.task); return 0; } } ctx->pos = PID_MAX_LIMIT + TGID_OFFSET; return 0; } /* * proc_tid_comm_permission is a special permission function exclusively * used for the node /proc/<pid>/task/<tid>/comm. * It bypasses generic permission checks in the case where a task of the same * task group attempts to access the node. * The rationale behind this is that glibc and bionic access this node for * cross thread naming (pthread_set/getname_np(!self)). 
However, if * PR_SET_DUMPABLE gets set to 0 this node among others becomes uid=0 gid=0, * which locks out the cross thread naming implementation. * This function makes sure that the node is always accessible for members of * same thread group. */ static int proc_tid_comm_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { bool is_same_tgroup; struct task_struct *task; task = get_proc_task(inode); if (!task) return -ESRCH; is_same_tgroup = same_thread_group(current, task); put_task_struct(task); if (likely(is_same_tgroup && !(mask & MAY_EXEC))) { /* This file (/proc/<pid>/task/<tid>/comm) can always be * read or written by the members of the corresponding * thread group. */ return 0; } return generic_permission(&nop_mnt_idmap, inode, mask); } static const struct inode_operations proc_tid_comm_inode_operations = { .setattr = proc_nochmod_setattr, .permission = proc_tid_comm_permission, }; /* * Tasks */ static const struct pid_entry tid_base_stuff[] = { DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), #endif REG("environ", S_IRUSR, proc_environ_operations), REG("auxv", S_IRUSR, proc_auxv_operations), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), NOD("comm", S_IFREG|S_IRUGO|S_IWUSR, &proc_tid_comm_inode_operations, &proc_pid_set_comm_operations, {}), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK ONE("syscall", S_IRUSR, proc_pid_syscall), #endif REG("cmdline", S_IRUGO, proc_pid_cmdline_ops), ONE("stat", S_IRUGO, proc_tid_stat), ONE("statm", S_IRUGO, proc_pid_statm), REG("maps", S_IRUGO, proc_pid_maps_operations), #ifdef CONFIG_PROC_CHILDREN REG("children", S_IRUGO, proc_tid_children_operations), #endif #ifdef CONFIG_NUMA REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations), #endif REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), LNK("cwd", proc_cwd_link), LNK("root", proc_root_link), LNK("exe", proc_exe_link), REG("mounts", S_IRUGO, proc_mounts_operations), REG("mountinfo", S_IRUGO, proc_mountinfo_operations), #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), #endif #ifdef CONFIG_KALLSYMS ONE("wchan", S_IRUGO, proc_pid_wchan), #endif #ifdef CONFIG_STACKTRACE ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHED_INFO ONE("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP REG("latency", S_IRUGO, proc_lstats_operations), #endif #ifdef CONFIG_PROC_PID_CPUSET ONE("cpuset", S_IRUGO, proc_cpuset_show), #endif #ifdef CONFIG_CGROUPS ONE("cgroup", S_IRUGO, proc_cgroup_show), #endif #ifdef CONFIG_PROC_CPU_RESCTRL ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show), #endif ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDIT REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), 
REG("sessionid", S_IRUGO, proc_sessionid_operations), #endif #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), REG("fail-nth", 0644, proc_fail_nth_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING ONE("io", S_IRUSR, proc_tid_io_accounting), #endif #ifdef CONFIG_USER_NS REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), #endif #ifdef CONFIG_LIVEPATCH ONE("patch_state", S_IRUSR, proc_pid_patch_state), #endif #ifdef CONFIG_PROC_PID_ARCH_STATUS ONE("arch_status", S_IRUGO, proc_pid_arch_status), #endif #ifdef CONFIG_SECCOMP_CACHE_DEBUG ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), #endif #ifdef CONFIG_KSM ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), #endif }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) { return proc_pident_readdir(file, ctx, tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); } static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, tid_base_stuff, tid_base_stuff + ARRAY_SIZE(tid_base_stuff)); } static const struct file_operations proc_tid_base_operations = { .read = generic_read_dir, .iterate_shared = proc_tid_base_readdir, .llseek = generic_file_llseek, }; static const struct inode_operations proc_tid_base_inode_operations = { .lookup = proc_tid_base_lookup, .getattr = pid_getattr, .setattr = proc_nochmod_setattr, }; static struct dentry *proc_task_instantiate(struct dentry *dentry, struct task_struct *task, const void *ptr) { struct inode *inode; inode = proc_pid_make_base_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) return ERR_PTR(-ENOENT); inode->i_op = &proc_tid_base_inode_operations; inode->i_fop = &proc_tid_base_operations; inode->i_flags |= S_IMMUTABLE; set_nlink(inode, nlink_tid); pid_update_inode(task, inode); return d_splice_alias_ops(inode, dentry, &pid_dentry_operations); } static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { struct task_struct *task; struct task_struct *leader = get_proc_task(dir); unsigned tid; struct proc_fs_info *fs_info; struct pid_namespace *ns; struct dentry *result = ERR_PTR(-ENOENT); if (!leader) goto out_no_task; tid = name_to_int(&dentry->d_name); if (tid == ~0U) goto out; fs_info = proc_sb_info(dentry->d_sb); ns = fs_info->pid_ns; rcu_read_lock(); task = find_task_by_pid_ns(tid, ns); if (task) get_task_struct(task); rcu_read_unlock(); if (!task) goto out; if (!same_thread_group(leader, task)) goto out_drop_task; result = proc_task_instantiate(dentry, task, NULL); out_drop_task: put_task_struct(task); out: put_task_struct(leader); out_no_task: return result; } /* * Find the first tid of a thread group to return to user space. * * Usually this is just the thread group leader, but if the users * buffer was too small or there was a seek into the middle of the * directory we have more work todo. * * In the case of a short read we start with find_task_by_pid. * * In the case of a seek we start with the leader and walk nr * threads past it. */ static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos, struct pid_namespace *ns) { struct task_struct *pos, *task; unsigned long nr = f_pos; if (nr != f_pos) /* 32bit overflow? 
*/ return NULL; rcu_read_lock(); task = pid_task(pid, PIDTYPE_PID); if (!task) goto fail; /* Attempt to start with the tid of a thread */ if (tid && nr) { pos = find_task_by_pid_ns(tid, ns); if (pos && same_thread_group(pos, task)) goto found; } /* If nr exceeds the number of threads there is nothing todo */ if (nr >= get_nr_threads(task)) goto fail; /* If we haven't found our starting place yet start * with the leader and walk nr threads forward. */ for_each_thread(task, pos) { if (!nr--) goto found; } fail: pos = NULL; goto out; found: get_task_struct(pos); out: rcu_read_unlock(); return pos; } /* * Find the next thread in the thread list. * Return NULL if there is an error or no next thread. * * The reference to the input task_struct is released. */ static struct task_struct *next_tid(struct task_struct *start) { struct task_struct *pos = NULL; rcu_read_lock(); if (pid_alive(start)) { pos = __next_thread(start); if (pos) get_task_struct(pos); } rcu_read_unlock(); put_task_struct(start); return pos; } /* for the /proc/TGID/task/ directories */ static int proc_task_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct task_struct *task; struct pid_namespace *ns; int tid; if (proc_inode_is_dead(inode)) return -ENOENT; if (!dir_emit_dots(file, ctx)) return 0; /* We cache the tgid value that the last readdir call couldn't * return and lseek resets it to 0. */ ns = proc_pid_ns(inode->i_sb); tid = (int)(intptr_t)file->private_data; file->private_data = NULL; for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); task; task = next_tid(task), ctx->pos++) { char name[10 + 1]; unsigned int len; tid = task_pid_nr_ns(task, ns); if (!tid) continue; /* The task has just exited. */ len = snprintf(name, sizeof(name), "%d", tid); if (!proc_fill_cache(file, ctx, name, len, proc_task_instantiate, task, NULL)) { /* returning this tgid failed, save it as the first * pid for the next readir call */ file->private_data = (void *)(intptr_t)tid; put_task_struct(task); break; } } return 0; } static int proc_task_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct task_struct *p = get_proc_task(inode); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (p) { stat->nlink += get_nr_threads(p); put_task_struct(p); } return 0; } /* * proc_task_readdir() set @file->private_data to a positive integer * value, so casting that to u64 is safe. generic_llseek_cookie() will * set @cookie to 0, so casting to an int is safe. The WARN_ON_ONCE() is * here to catch any unexpected change in behavior either in * proc_task_readdir() or generic_llseek_cookie(). 
*/ static loff_t proc_dir_llseek(struct file *file, loff_t offset, int whence) { u64 cookie = (u64)(intptr_t)file->private_data; loff_t off; off = generic_llseek_cookie(file, offset, whence, &cookie); WARN_ON_ONCE(cookie > INT_MAX); file->private_data = (void *)(intptr_t)cookie; /* serialized by f_pos_lock */ return off; } static const struct inode_operations proc_task_inode_operations = { .lookup = proc_task_lookup, .getattr = proc_task_getattr, .setattr = proc_nochmod_setattr, .permission = proc_pid_permission, }; static const struct file_operations proc_task_operations = { .read = generic_read_dir, .iterate_shared = proc_task_readdir, .llseek = proc_dir_llseek, }; void __init set_proc_pid_nlink(void) { nlink_tid = pid_entry_nlink(tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); nlink_tgid = pid_entry_nlink(tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); }
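The pid-entry tables above wire these handlers into /proc/<pid>/, and the easiest way to see the resulting behaviour is from user space. The sketch below is illustrative only: it is not part of this kernel source, its error handling is deliberately thin, and it only assumes the interfaces defined above exist on the running kernel. It reads and rewrites /proc/self/timerslack_ns (handled by timerslack_ns_show()/timerslack_ns_write() above, which print and parse a decimal nanosecond value, with 0 restoring the default), then resolves one /proc/self/map_files/<start>-<end> symlink with readlinkat(); as the comment on proc_map_files_get_link() explains, actually following such a link is further restricted to CAP_SYS_ADMIN / CAP_CHECKPOINT_RESTORE.

/* Illustrative user-space sketch (not part of this file); build with any C compiler. */
#include <dirent.h>
#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[64];
	ssize_t n;
	int fd = open("/proc/self/timerslack_ns", O_RDWR);

	if (fd >= 0) {
		n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			/* timerslack_ns_show() prints "%llu\n" */
			printf("current timer slack: %s", buf);
		}
		/* timerslack_ns_write() parses a decimal u64 in ns; 0 restores the default */
		write(fd, "50000", 5);
		close(fd);
	}

	/* Entries in /proc/self/map_files/ are "%lx-%lx" symlinks to the mapped file. */
	DIR *dir = opendir("/proc/self/map_files");
	struct dirent *de;

	while (dir && (de = readdir(dir))) {
		char target[PATH_MAX];

		if (de->d_name[0] == '.')
			continue;
		n = readlinkat(dirfd(dir), de->d_name, target, sizeof(target) - 1);
		if (n > 0) {
			target[n] = '\0';
			printf("%s -> %s\n", de->d_name, target);
			break;	/* one entry is enough for the illustration */
		}
	}
	if (dir)
		closedir(dir);
	return 0;
}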
// SPDX-License-Identifier: GPL-2.0-or-later /* * NET3 IP device support routines. * * Derived from the IP parts of dev.c 1.0.19 * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * * Additional Authors: * Alan Cox, <gw4pts@gw4pts.ampr.org> * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Changes: * Alexey Kuznetsov: pa_* fields are replaced with ifaddr * lists.
* Cyrus Durgin: updated for kmod * Matthias Andree: in devinet_ioctl, compare label and * address (4.4BSD alias style support), * fall back to comparing just the label * if no match found. */ #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/errno.h> #include <linux/interrupt.h> #include <linux/if_addr.h> #include <linux/if_ether.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/notifier.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include "igmp_internal.h" #include <linux/slab.h> #include <linux/hash.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <linux/kmod.h> #include <linux/netconf.h> #include <net/arp.h> #include <net/ip.h> #include <net/route.h> #include <net/ip_fib.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/addrconf.h> #define IPV6ONLY_FLAGS \ (IFA_F_NODAD | IFA_F_OPTIMISTIC | IFA_F_DADFAILED | \ IFA_F_HOMEADDRESS | IFA_F_TENTATIVE | \ IFA_F_MANAGETEMPADDR | IFA_F_STABLE_PRIVACY) static struct ipv4_devconf ipv4_devconf = { .data = { [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/, [IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1] = 1, }, }; static struct ipv4_devconf ipv4_devconf_dflt = { .data = { [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, [IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1, [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/, [IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1] = 1, }, }; #define IPV4_DEVCONF_DFLT(net, attr) \ IPV4_DEVCONF((*net->ipv4.devconf_dflt), attr) static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_LOCAL] = { .type = NLA_U32 }, [IFA_ADDRESS] = { .type = NLA_U32 }, [IFA_BROADCAST] = { .type = NLA_U32 }, [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) }, [IFA_FLAGS] = { .type = NLA_U32 }, [IFA_RT_PRIORITY] = { .type = NLA_U32 }, [IFA_TARGET_NETNSID] = { .type = NLA_S32 }, [IFA_PROTO] = { .type = NLA_U8 }, }; #define IN4_ADDR_HSIZE_SHIFT 8 #define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT) static u32 inet_addr_hash(const struct net *net, __be32 addr) { u32 val = __ipv4_addr_hash(addr, net_hash_mix(net)); return hash_32(val, IN4_ADDR_HSIZE_SHIFT); } static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa) { u32 hash = inet_addr_hash(net, ifa->ifa_local); ASSERT_RTNL(); hlist_add_head_rcu(&ifa->addr_lst, &net->ipv4.inet_addr_lst[hash]); } static void inet_hash_remove(struct in_ifaddr *ifa) { ASSERT_RTNL(); hlist_del_init_rcu(&ifa->addr_lst); } /** * __ip_dev_find - find the first device with a given source address. 
* @net: the net namespace * @addr: the source address * @devref: if true, take a reference on the found device * * If a caller uses devref=false, it should be protected by RCU, or RTNL */ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) { struct net_device *result = NULL; struct in_ifaddr *ifa; rcu_read_lock(); ifa = inet_lookup_ifaddr_rcu(net, addr); if (!ifa) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res = { 0 }; struct fib_table *local; /* Fallback to FIB local table so that communication * over loopback subnets work. */ local = fib_get_table(net, RT_TABLE_LOCAL); if (local && !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) && res.type == RTN_LOCAL) result = FIB_RES_DEV(res); } else { result = ifa->ifa_dev->dev; } if (result && devref) dev_hold(result); rcu_read_unlock(); return result; } EXPORT_SYMBOL(__ip_dev_find); /* called under RCU lock */ struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr) { u32 hash = inet_addr_hash(net, addr); struct in_ifaddr *ifa; hlist_for_each_entry_rcu(ifa, &net->ipv4.inet_addr_lst[hash], addr_lst) if (ifa->ifa_local == addr) return ifa; return NULL; } static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); static BLOCKING_NOTIFIER_HEAD(inetaddr_validator_chain); static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr __rcu **ifap, int destroy); #ifdef CONFIG_SYSCTL static int devinet_sysctl_register(struct in_device *idev); static void devinet_sysctl_unregister(struct in_device *idev); #else static int devinet_sysctl_register(struct in_device *idev) { return 0; } static void devinet_sysctl_unregister(struct in_device *idev) { } #endif /* Locks all the inet devices. */ static struct in_ifaddr *inet_alloc_ifa(struct in_device *in_dev) { struct in_ifaddr *ifa; ifa = kzalloc_obj(*ifa, GFP_KERNEL_ACCOUNT); if (!ifa) return NULL; in_dev_hold(in_dev); ifa->ifa_dev = in_dev; INIT_HLIST_NODE(&ifa->addr_lst); return ifa; } static void inet_rcu_free_ifa(struct rcu_head *head) { struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head); in_dev_put(ifa->ifa_dev); kfree(ifa); } static void inet_free_ifa(struct in_ifaddr *ifa) { /* Our reference to ifa->ifa_dev must be freed ASAP * to release the reference to the netdev the same way. * in_dev_put() -> in_dev_finish_destroy() -> netdev_put() */ call_rcu_hurry(&ifa->rcu_head, inet_rcu_free_ifa); } static void in_dev_free_rcu(struct rcu_head *head) { struct in_device *idev = container_of(head, struct in_device, rcu_head); kfree(rcu_dereference_protected(idev->mc_hash, 1)); kfree(idev); } void in_dev_finish_destroy(struct in_device *idev) { struct net_device *dev = idev->dev; WARN_ON(idev->ifa_list); WARN_ON(idev->mc_list); #ifdef NET_REFCNT_DEBUG pr_debug("%s: %p=%s\n", __func__, idev, dev ? 
dev->name : "NIL"); #endif netdev_put(dev, &idev->dev_tracker); if (!idev->dead) pr_err("Freeing alive in_device %p\n", idev); else call_rcu(&idev->rcu_head, in_dev_free_rcu); } EXPORT_SYMBOL(in_dev_finish_destroy); static struct in_device *inetdev_init(struct net_device *dev) { struct in_device *in_dev; int err = -ENOMEM; ASSERT_RTNL(); in_dev = kzalloc_obj(*in_dev); if (!in_dev) goto out; memcpy(&in_dev->cnf, dev_net(dev)->ipv4.devconf_dflt, sizeof(in_dev->cnf)); in_dev->cnf.sysctl = NULL; in_dev->dev = dev; in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl); if (!in_dev->arp_parms) goto out_kfree; if (IPV4_DEVCONF(in_dev->cnf, FORWARDING)) netif_disable_lro(dev); /* Reference in_dev->dev */ netdev_hold(dev, &in_dev->dev_tracker, GFP_KERNEL); /* Account for reference dev->ip_ptr (below) */ refcount_set(&in_dev->refcnt, 1); if (dev != blackhole_netdev) { err = devinet_sysctl_register(in_dev); if (err) { in_dev->dead = 1; neigh_parms_release(&arp_tbl, in_dev->arp_parms); in_dev_put(in_dev); in_dev = NULL; goto out; } ip_mc_init_dev(in_dev); if (dev->flags & IFF_UP) ip_mc_up(in_dev); } /* we can receive as soon as ip_ptr is set -- do this last */ rcu_assign_pointer(dev->ip_ptr, in_dev); out: return in_dev ?: ERR_PTR(err); out_kfree: kfree(in_dev); in_dev = NULL; goto out; } static void inetdev_destroy(struct in_device *in_dev) { struct net_device *dev; struct in_ifaddr *ifa; ASSERT_RTNL(); dev = in_dev->dev; in_dev->dead = 1; ip_mc_destroy_dev(in_dev); while ((ifa = rtnl_dereference(in_dev->ifa_list)) != NULL) { inet_del_ifa(in_dev, &in_dev->ifa_list, 0); inet_free_ifa(ifa); } RCU_INIT_POINTER(dev->ip_ptr, NULL); devinet_sysctl_unregister(in_dev); neigh_parms_release(&arp_tbl, in_dev->arp_parms); arp_ifdown(dev); in_dev_put(in_dev); } static int __init inet_blackhole_dev_init(void) { struct in_device *in_dev; rtnl_lock(); in_dev = inetdev_init(blackhole_netdev); rtnl_unlock(); return PTR_ERR_OR_ZERO(in_dev); } late_initcall(inet_blackhole_dev_init); int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b) { const struct in_ifaddr *ifa; rcu_read_lock(); in_dev_for_each_ifa_rcu(ifa, in_dev) { if (inet_ifa_match(a, ifa)) { if (!b || inet_ifa_match(b, ifa)) { rcu_read_unlock(); return 1; } } } rcu_read_unlock(); return 0; } static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr __rcu **ifap, int destroy, struct nlmsghdr *nlh, u32 portid) { struct in_ifaddr *promote = NULL; struct in_ifaddr *ifa, *ifa1; struct in_ifaddr __rcu **last_prim; struct in_ifaddr *prev_prom = NULL; int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev); ASSERT_RTNL(); ifa1 = rtnl_dereference(*ifap); last_prim = ifap; if (in_dev->dead) goto no_promotions; /* 1. 
Deleting primary ifaddr forces deletion all secondaries * unless alias promotion is set **/ if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) { struct in_ifaddr __rcu **ifap1 = &ifa1->ifa_next; while ((ifa = rtnl_dereference(*ifap1)) != NULL) { if (!(ifa->ifa_flags & IFA_F_SECONDARY) && ifa1->ifa_scope <= ifa->ifa_scope) last_prim = &ifa->ifa_next; if (!(ifa->ifa_flags & IFA_F_SECONDARY) || ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) { ifap1 = &ifa->ifa_next; prev_prom = ifa; continue; } if (!do_promote) { inet_hash_remove(ifa); *ifap1 = ifa->ifa_next; rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa); inet_free_ifa(ifa); } else { promote = ifa; break; } } } /* On promotion all secondaries from subnet are changing * the primary IP, we must remove all their routes silently * and later to add them back with new prefsrc. Do this * while all addresses are on the device list. */ for (ifa = promote; ifa; ifa = rtnl_dereference(ifa->ifa_next)) { if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa)) fib_del_ifaddr(ifa, ifa1); } no_promotions: /* 2. Unlink it */ *ifap = ifa1->ifa_next; inet_hash_remove(ifa1); /* 3. Announce address deletion */ /* Send message first, then call notifier. At first sight, FIB update triggered by notifier will refer to already deleted ifaddr, that could confuse netlink listeners. It is not true: look, gated sees that route deleted and if it still thinks that ifaddr is valid, it will try to restore deleted routes... Grr. So that, this order is correct. */ rtmsg_ifa(RTM_DELADDR, ifa1, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); if (promote) { struct in_ifaddr *next_sec; next_sec = rtnl_dereference(promote->ifa_next); if (prev_prom) { struct in_ifaddr *last_sec; rcu_assign_pointer(prev_prom->ifa_next, next_sec); last_sec = rtnl_dereference(*last_prim); rcu_assign_pointer(promote->ifa_next, last_sec); rcu_assign_pointer(*last_prim, promote); } promote->ifa_flags &= ~IFA_F_SECONDARY; rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, promote); for (ifa = next_sec; ifa; ifa = rtnl_dereference(ifa->ifa_next)) { if (ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) continue; fib_add_ifaddr(ifa); } } if (destroy) inet_free_ifa(ifa1); } static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr __rcu **ifap, int destroy) { __inet_del_ifa(in_dev, ifap, destroy, NULL, 0); } static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, u32 portid, struct netlink_ext_ack *extack) { struct in_ifaddr __rcu **last_primary, **ifap; struct in_device *in_dev = ifa->ifa_dev; struct net *net = dev_net(in_dev->dev); struct in_validator_info ivi; struct in_ifaddr *ifa1; int ret; ASSERT_RTNL(); ifa->ifa_flags &= ~IFA_F_SECONDARY; last_primary = &in_dev->ifa_list; /* Don't set IPv6 only flags to IPv4 addresses */ ifa->ifa_flags &= ~IPV6ONLY_FLAGS; ifap = &in_dev->ifa_list; ifa1 = rtnl_dereference(*ifap); while (ifa1) { if (!(ifa1->ifa_flags & IFA_F_SECONDARY) && ifa->ifa_scope <= ifa1->ifa_scope) last_primary = &ifa1->ifa_next; if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa)) { if (ifa1->ifa_local == ifa->ifa_local) { inet_free_ifa(ifa); return -EEXIST; } if (ifa1->ifa_scope != ifa->ifa_scope) { NL_SET_ERR_MSG(extack, "ipv4: Invalid scope value"); inet_free_ifa(ifa); return -EINVAL; } ifa->ifa_flags |= IFA_F_SECONDARY; } ifap = 
&ifa1->ifa_next; ifa1 = rtnl_dereference(*ifap); } /* Allow any devices that wish to register ifaddr validators to weigh * in now, before changes are committed. The rtnl lock is serializing * access here, so the state should not change between a validator call * and a final notify on commit. This isn't invoked on promotion under * the assumption that validators are checking the address itself, and * not the flags. */ ivi.ivi_addr = ifa->ifa_address; ivi.ivi_dev = ifa->ifa_dev; ivi.extack = extack; ret = blocking_notifier_call_chain(&inetaddr_validator_chain, NETDEV_UP, &ivi); ret = notifier_to_errno(ret); if (ret) { inet_free_ifa(ifa); return ret; } if (!(ifa->ifa_flags & IFA_F_SECONDARY)) ifap = last_primary; rcu_assign_pointer(ifa->ifa_next, *ifap); rcu_assign_pointer(*ifap, ifa); inet_hash_insert(dev_net(in_dev->dev), ifa); cancel_delayed_work(&net->ipv4.addr_chk_work); queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, 0); /* Send message first, then call notifier. Notifier will trigger FIB update, so that listeners of netlink will know about new ifaddr */ rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); return 0; } static int inet_insert_ifa(struct in_ifaddr *ifa) { if (!ifa->ifa_local) { inet_free_ifa(ifa); return 0; } return __inet_insert_ifa(ifa, NULL, 0, NULL); } static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) { struct in_device *in_dev = __in_dev_get_rtnl_net(dev); ipv4_devconf_setall(in_dev); neigh_parms_data_state_setall(in_dev->arp_parms); if (ipv4_is_loopback(ifa->ifa_local)) ifa->ifa_scope = RT_SCOPE_HOST; return inet_insert_ifa(ifa); } /* Caller must hold RCU or RTNL : * We don't take a reference on found in_device */ struct in_device *inetdev_by_index(struct net *net, int ifindex) { struct net_device *dev; struct in_device *in_dev = NULL; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (dev) in_dev = rcu_dereference_rtnl(dev->ip_ptr); rcu_read_unlock(); return in_dev; } EXPORT_SYMBOL(inetdev_by_index); /* Called only from RTNL semaphored context. No locks. 
*/ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, __be32 mask) { struct in_ifaddr *ifa; ASSERT_RTNL(); in_dev_for_each_ifa_rtnl(ifa, in_dev) { if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa)) return ifa; } return NULL; } static int ip_mc_autojoin_config(struct net *net, bool join, const struct in_ifaddr *ifa) { #if defined(CONFIG_IP_MULTICAST) struct ip_mreqn mreq = { .imr_multiaddr.s_addr = ifa->ifa_address, .imr_ifindex = ifa->ifa_dev->dev->ifindex, }; struct sock *sk = net->ipv4.mc_autojoin_sk; int ret; ASSERT_RTNL_NET(net); lock_sock(sk); if (join) ret = ip_mc_join_group(sk, &mreq); else ret = ip_mc_leave_group(sk, &mreq); release_sock(sk); return ret; #else return -EOPNOTSUPP; #endif } static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct in_ifaddr __rcu **ifap; struct nlattr *tb[IFA_MAX+1]; struct in_device *in_dev; struct ifaddrmsg *ifm; struct in_ifaddr *ifa; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, extack); if (err < 0) goto out; ifm = nlmsg_data(nlh); rtnl_net_lock(net); in_dev = inetdev_by_index(net, ifm->ifa_index); if (!in_dev) { NL_SET_ERR_MSG(extack, "ipv4: Device not found"); err = -ENODEV; goto unlock; } for (ifap = &in_dev->ifa_list; (ifa = rtnl_net_dereference(net, *ifap)) != NULL; ifap = &ifa->ifa_next) { if (tb[IFA_LOCAL] && ifa->ifa_local != nla_get_in_addr(tb[IFA_LOCAL])) continue; if (tb[IFA_LABEL] && nla_strcmp(tb[IFA_LABEL], ifa->ifa_label)) continue; if (tb[IFA_ADDRESS] && (ifm->ifa_prefixlen != ifa->ifa_prefixlen || !inet_ifa_match(nla_get_in_addr(tb[IFA_ADDRESS]), ifa))) continue; if (ipv4_is_multicast(ifa->ifa_address)) ip_mc_autojoin_config(net, false, ifa); __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid); goto unlock; } NL_SET_ERR_MSG(extack, "ipv4: Address not found"); err = -EADDRNOTAVAIL; unlock: rtnl_net_unlock(net); out: return err; } static void check_lifetime(struct work_struct *work) { unsigned long now, next, next_sec, next_sched; struct in_ifaddr *ifa; struct hlist_node *n; struct net *net; int i; net = container_of(to_delayed_work(work), struct net, ipv4.addr_chk_work); now = jiffies; next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY); for (i = 0; i < IN4_ADDR_HSIZE; i++) { struct hlist_head *head = &net->ipv4.inet_addr_lst[i]; bool change_needed = false; rcu_read_lock(); hlist_for_each_entry_rcu(ifa, head, addr_lst) { unsigned long age, tstamp; u32 preferred_lft; u32 valid_lft; u32 flags; flags = READ_ONCE(ifa->ifa_flags); if (flags & IFA_F_PERMANENT) continue; preferred_lft = READ_ONCE(ifa->ifa_preferred_lft); valid_lft = READ_ONCE(ifa->ifa_valid_lft); tstamp = READ_ONCE(ifa->ifa_tstamp); /* We try to batch several events at once. */ age = (now - tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; if (valid_lft != INFINITY_LIFE_TIME && age >= valid_lft) { change_needed = true; } else if (preferred_lft == INFINITY_LIFE_TIME) { continue; } else if (age >= preferred_lft) { if (time_before(tstamp + valid_lft * HZ, next)) next = tstamp + valid_lft * HZ; if (!(flags & IFA_F_DEPRECATED)) change_needed = true; } else if (time_before(tstamp + preferred_lft * HZ, next)) { next = tstamp + preferred_lft * HZ; } } rcu_read_unlock(); if (!change_needed) continue; rtnl_net_lock(net); hlist_for_each_entry_safe(ifa, n, head, addr_lst) { unsigned long age; if (ifa->ifa_flags & IFA_F_PERMANENT) continue; /* We try to batch several events at once. 
*/ age = (now - ifa->ifa_tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME && age >= ifa->ifa_valid_lft) { struct in_ifaddr __rcu **ifap; struct in_ifaddr *tmp; ifap = &ifa->ifa_dev->ifa_list; tmp = rtnl_net_dereference(net, *ifap); while (tmp) { if (tmp == ifa) { inet_del_ifa(ifa->ifa_dev, ifap, 1); break; } ifap = &tmp->ifa_next; tmp = rtnl_net_dereference(net, *ifap); } } else if (ifa->ifa_preferred_lft != INFINITY_LIFE_TIME && age >= ifa->ifa_preferred_lft && !(ifa->ifa_flags & IFA_F_DEPRECATED)) { ifa->ifa_flags |= IFA_F_DEPRECATED; rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); } } rtnl_net_unlock(net); } next_sec = round_jiffies_up(next); next_sched = next; /* If rounded timeout is accurate enough, accept it. */ if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ)) next_sched = next_sec; now = jiffies; /* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */ if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX)) next_sched = now + ADDRCONF_TIMER_FUZZ_MAX; queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, next_sched - now); } static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft, __u32 prefered_lft) { unsigned long timeout; u32 flags; flags = ifa->ifa_flags & ~(IFA_F_PERMANENT | IFA_F_DEPRECATED); timeout = addrconf_timeout_fixup(valid_lft, HZ); if (addrconf_finite_timeout(timeout)) WRITE_ONCE(ifa->ifa_valid_lft, timeout); else flags |= IFA_F_PERMANENT; timeout = addrconf_timeout_fixup(prefered_lft, HZ); if (addrconf_finite_timeout(timeout)) { if (timeout == 0) flags |= IFA_F_DEPRECATED; WRITE_ONCE(ifa->ifa_preferred_lft, timeout); } WRITE_ONCE(ifa->ifa_flags, flags); WRITE_ONCE(ifa->ifa_tstamp, jiffies); if (!ifa->ifa_cstamp) WRITE_ONCE(ifa->ifa_cstamp, ifa->ifa_tstamp); } static int inet_validate_rtm(struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack, __u32 *valid_lft, __u32 *prefered_lft) { struct ifaddrmsg *ifm = nlmsg_data(nlh); int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, extack); if (err < 0) return err; if (ifm->ifa_prefixlen > 32) { NL_SET_ERR_MSG(extack, "ipv4: Invalid prefix length"); return -EINVAL; } if (!tb[IFA_LOCAL]) { NL_SET_ERR_MSG(extack, "ipv4: Local address is not supplied"); return -EINVAL; } if (tb[IFA_CACHEINFO]) { struct ifa_cacheinfo *ci; ci = nla_data(tb[IFA_CACHEINFO]); if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) { NL_SET_ERR_MSG(extack, "ipv4: address lifetime invalid"); return -EINVAL; } *valid_lft = ci->ifa_valid; *prefered_lft = ci->ifa_prefered; } return 0; } static struct in_ifaddr *inet_rtm_to_ifa(struct net *net, struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct ifaddrmsg *ifm = nlmsg_data(nlh); struct in_device *in_dev; struct net_device *dev; struct in_ifaddr *ifa; int err; dev = __dev_get_by_index(net, ifm->ifa_index); err = -ENODEV; if (!dev) { NL_SET_ERR_MSG(extack, "ipv4: Device not found"); goto errout; } in_dev = __in_dev_get_rtnl_net(dev); err = -ENOBUFS; if (!in_dev) goto errout; ifa = inet_alloc_ifa(in_dev); if (!ifa) /* * A potential indev allocation can be left alive, it stays * assigned to its device and is destroy with it. 
*/ goto errout; ipv4_devconf_setall(in_dev); neigh_parms_data_state_setall(in_dev->arp_parms); if (!tb[IFA_ADDRESS]) tb[IFA_ADDRESS] = tb[IFA_LOCAL]; ifa->ifa_prefixlen = ifm->ifa_prefixlen; ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); ifa->ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags); ifa->ifa_scope = ifm->ifa_scope; ifa->ifa_local = nla_get_in_addr(tb[IFA_LOCAL]); ifa->ifa_address = nla_get_in_addr(tb[IFA_ADDRESS]); if (tb[IFA_BROADCAST]) ifa->ifa_broadcast = nla_get_in_addr(tb[IFA_BROADCAST]); if (tb[IFA_LABEL]) nla_strscpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); if (tb[IFA_RT_PRIORITY]) ifa->ifa_rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]); if (tb[IFA_PROTO]) ifa->ifa_proto = nla_get_u8(tb[IFA_PROTO]); return ifa; errout: return ERR_PTR(err); } static struct in_ifaddr *find_matching_ifa(struct net *net, struct in_ifaddr *ifa) { struct in_device *in_dev = ifa->ifa_dev; struct in_ifaddr *ifa1; in_dev_for_each_ifa_rtnl_net(net, ifa1, in_dev) { if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa) && ifa1->ifa_local == ifa->ifa_local) return ifa1; } return NULL; } static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { __u32 prefered_lft = INFINITY_LIFE_TIME; __u32 valid_lft = INFINITY_LIFE_TIME; struct net *net = sock_net(skb->sk); struct in_ifaddr *ifa_existing; struct nlattr *tb[IFA_MAX + 1]; struct in_ifaddr *ifa; int ret; ret = inet_validate_rtm(nlh, tb, extack, &valid_lft, &prefered_lft); if (ret < 0) return ret; if (!nla_get_in_addr(tb[IFA_LOCAL])) return 0; rtnl_net_lock(net); ifa = inet_rtm_to_ifa(net, nlh, tb, extack); if (IS_ERR(ifa)) { ret = PTR_ERR(ifa); goto unlock; } ifa_existing = find_matching_ifa(net, ifa); if (!ifa_existing) { /* It would be best to check for !NLM_F_CREATE here but * userspace already relies on not having to provide this. */ set_ifa_lifetime(ifa, valid_lft, prefered_lft); if (ifa->ifa_flags & IFA_F_MCAUTOJOIN) { ret = ip_mc_autojoin_config(net, true, ifa); if (ret < 0) { NL_SET_ERR_MSG(extack, "ipv4: Multicast auto join failed"); inet_free_ifa(ifa); goto unlock; } } ret = __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid, extack); } else { u32 new_metric = ifa->ifa_rt_priority; u8 new_proto = ifa->ifa_proto; inet_free_ifa(ifa); if (nlh->nlmsg_flags & NLM_F_EXCL || !(nlh->nlmsg_flags & NLM_F_REPLACE)) { NL_SET_ERR_MSG(extack, "ipv4: Address already assigned"); ret = -EEXIST; goto unlock; } ifa = ifa_existing; if (ifa->ifa_rt_priority != new_metric) { fib_modify_prefix_metric(ifa, new_metric); ifa->ifa_rt_priority = new_metric; } ifa->ifa_proto = new_proto; set_ifa_lifetime(ifa, valid_lft, prefered_lft); cancel_delayed_work(&net->ipv4.addr_chk_work); queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, 0); rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid); } unlock: rtnl_net_unlock(net); return ret; } /* * Determine a default network mask, based on the IP address. */ static int inet_abc_len(__be32 addr) { int rc = -1; /* Something else, probably a multicast. 
*/ if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) rc = 0; else { __u32 haddr = ntohl(addr); if (IN_CLASSA(haddr)) rc = 8; else if (IN_CLASSB(haddr)) rc = 16; else if (IN_CLASSC(haddr)) rc = 24; else if (IN_CLASSE(haddr)) rc = 32; } return rc; } int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) { struct sockaddr_in sin_orig; struct sockaddr_in *sin = (struct sockaddr_in *)&ifr->ifr_addr; struct in_ifaddr __rcu **ifap = NULL; struct in_device *in_dev; struct in_ifaddr *ifa = NULL; struct net_device *dev; char *colon; int ret = -EFAULT; int tryaddrmatch = 0; ifr->ifr_name[IFNAMSIZ - 1] = 0; /* save original address for comparison */ memcpy(&sin_orig, sin, sizeof(*sin)); colon = strchr(ifr->ifr_name, ':'); if (colon) *colon = 0; dev_load(net, ifr->ifr_name); switch (cmd) { case SIOCGIFADDR: /* Get interface address */ case SIOCGIFBRDADDR: /* Get the broadcast address */ case SIOCGIFDSTADDR: /* Get the destination address */ case SIOCGIFNETMASK: /* Get the netmask for the interface */ /* Note that these ioctls will not sleep, so that we do not impose a lock. One day we will be forced to put shlock here (I mean SMP) */ tryaddrmatch = (sin_orig.sin_family == AF_INET); memset(sin, 0, sizeof(*sin)); sin->sin_family = AF_INET; break; case SIOCSIFFLAGS: ret = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto out; break; case SIOCSIFADDR: /* Set interface address (and family) */ case SIOCSIFBRDADDR: /* Set the broadcast address */ case SIOCSIFDSTADDR: /* Set the destination address */ case SIOCSIFNETMASK: /* Set the netmask for the interface */ ret = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto out; ret = -EINVAL; if (sin->sin_family != AF_INET) goto out; break; default: ret = -EINVAL; goto out; } rtnl_net_lock(net); ret = -ENODEV; dev = __dev_get_by_name(net, ifr->ifr_name); if (!dev) goto done; if (colon) *colon = ':'; in_dev = __in_dev_get_rtnl_net(dev); if (in_dev) { if (tryaddrmatch) { /* Matthias Andree */ /* compare label and address (4.4BSD style) */ /* note: we only do this for a limited set of ioctls and only if the original address family was AF_INET. This is checked above. 
*/ for (ifap = &in_dev->ifa_list; (ifa = rtnl_net_dereference(net, *ifap)) != NULL; ifap = &ifa->ifa_next) { if (!strcmp(ifr->ifr_name, ifa->ifa_label) && sin_orig.sin_addr.s_addr == ifa->ifa_local) { break; /* found */ } } } /* we didn't get a match, maybe the application is 4.3BSD-style and passed in junk so we fall back to comparing just the label */ if (!ifa) { for (ifap = &in_dev->ifa_list; (ifa = rtnl_net_dereference(net, *ifap)) != NULL; ifap = &ifa->ifa_next) if (!strcmp(ifr->ifr_name, ifa->ifa_label)) break; } } ret = -EADDRNOTAVAIL; if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS) goto done; switch (cmd) { case SIOCGIFADDR: /* Get interface address */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_local; break; case SIOCGIFBRDADDR: /* Get the broadcast address */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_broadcast; break; case SIOCGIFDSTADDR: /* Get the destination address */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_address; break; case SIOCGIFNETMASK: /* Get the netmask for the interface */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_mask; break; case SIOCSIFFLAGS: if (colon) { ret = -EADDRNOTAVAIL; if (!ifa) break; ret = 0; if (!(ifr->ifr_flags & IFF_UP)) inet_del_ifa(in_dev, ifap, 1); break; } /* NETDEV_UP/DOWN/CHANGE could touch a peer dev */ ASSERT_RTNL(); ret = dev_change_flags(dev, ifr->ifr_flags, NULL); break; case SIOCSIFADDR: /* Set interface address (and family) */ ret = -EINVAL; if (inet_abc_len(sin->sin_addr.s_addr) < 0) break; if (!ifa) { ret = -ENOBUFS; if (!in_dev) break; ifa = inet_alloc_ifa(in_dev); if (!ifa) break; if (colon) memcpy(ifa->ifa_label, ifr->ifr_name, IFNAMSIZ); else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); } else { ret = 0; if (ifa->ifa_local == sin->sin_addr.s_addr) break; inet_del_ifa(in_dev, ifap, 0); ifa->ifa_broadcast = 0; ifa->ifa_scope = 0; } ifa->ifa_address = ifa->ifa_local = sin->sin_addr.s_addr; if (!(dev->flags & IFF_POINTOPOINT)) { ifa->ifa_prefixlen = inet_abc_len(ifa->ifa_address); ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen); if ((dev->flags & IFF_BROADCAST) && ifa->ifa_prefixlen < 31) ifa->ifa_broadcast = ifa->ifa_address | ~ifa->ifa_mask; } else { ifa->ifa_prefixlen = 32; ifa->ifa_mask = inet_make_mask(32); } set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); ret = inet_set_ifa(dev, ifa); break; case SIOCSIFBRDADDR: /* Set the broadcast address */ ret = 0; if (ifa->ifa_broadcast != sin->sin_addr.s_addr) { inet_del_ifa(in_dev, ifap, 0); ifa->ifa_broadcast = sin->sin_addr.s_addr; inet_insert_ifa(ifa); } break; case SIOCSIFDSTADDR: /* Set the destination address */ ret = 0; if (ifa->ifa_address == sin->sin_addr.s_addr) break; ret = -EINVAL; if (inet_abc_len(sin->sin_addr.s_addr) < 0) break; ret = 0; inet_del_ifa(in_dev, ifap, 0); ifa->ifa_address = sin->sin_addr.s_addr; inet_insert_ifa(ifa); break; case SIOCSIFNETMASK: /* Set the netmask for the interface */ /* * The mask we set must be legal. */ ret = -EINVAL; if (bad_mask(sin->sin_addr.s_addr, 0)) break; ret = 0; if (ifa->ifa_mask != sin->sin_addr.s_addr) { __be32 old_mask = ifa->ifa_mask; inet_del_ifa(in_dev, ifap, 0); ifa->ifa_mask = sin->sin_addr.s_addr; ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask); /* See if current broadcast address matches * with current netmask, then recalculate * the broadcast address. Otherwise it's a * funny address, so don't touch it since * the user seems to know what (s)he's doing... 
*/ if ((dev->flags & IFF_BROADCAST) && (ifa->ifa_prefixlen < 31) && (ifa->ifa_broadcast == (ifa->ifa_local|~old_mask))) { ifa->ifa_broadcast = (ifa->ifa_local | ~sin->sin_addr.s_addr); } inet_insert_ifa(ifa); } break; } done: rtnl_net_unlock(net); out: return ret; } int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size) { struct in_device *in_dev = __in_dev_get_rtnl_net(dev); const struct in_ifaddr *ifa; struct ifreq ifr; int done = 0; if (WARN_ON(size > sizeof(struct ifreq))) goto out; if (!in_dev) goto out; in_dev_for_each_ifa_rtnl_net(dev_net(dev), ifa, in_dev) { if (!buf) { done += size; continue; } if (len < size) break; memset(&ifr, 0, sizeof(struct ifreq)); strcpy(ifr.ifr_name, ifa->ifa_label); (*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET; (*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr = ifa->ifa_local; if (copy_to_user(buf + done, &ifr, size)) { done = -EFAULT; break; } len -= size; done += size; } out: return done; } static __be32 in_dev_select_addr(const struct in_device *in_dev, int scope) { const struct in_ifaddr *ifa; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY) continue; if (ifa->ifa_scope != RT_SCOPE_LINK && ifa->ifa_scope <= scope) return ifa->ifa_local; } return 0; } __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) { const struct in_ifaddr *ifa; __be32 addr = 0; unsigned char localnet_scope = RT_SCOPE_HOST; struct in_device *in_dev; struct net *net; int master_idx; rcu_read_lock(); net = dev_net_rcu(dev); in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto no_in_dev; if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev))) localnet_scope = RT_SCOPE_LINK; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY) continue; if (min(ifa->ifa_scope, localnet_scope) > scope) continue; if (!dst || inet_ifa_match(dst, ifa)) { addr = ifa->ifa_local; break; } if (!addr) addr = ifa->ifa_local; } if (addr) goto out_unlock; no_in_dev: master_idx = l3mdev_master_ifindex_rcu(dev); /* For VRFs, the VRF device takes the place of the loopback device, * with addresses on it being preferred. Note in such cases the * loopback device will be among the devices that fail the master_idx * equality check in the loop below. */ if (master_idx && (dev = dev_get_by_index_rcu(net, master_idx)) && (in_dev = __in_dev_get_rcu(dev))) { addr = in_dev_select_addr(in_dev, scope); if (addr) goto out_unlock; } /* Not loopback addresses on loopback should be preferred in this case. It is important that lo is the first interface in dev_base list. 
*/ for_each_netdev_rcu(net, dev) { if (l3mdev_master_ifindex_rcu(dev) != master_idx) continue; in_dev = __in_dev_get_rcu(dev); if (!in_dev) continue; addr = in_dev_select_addr(in_dev, scope); if (addr) goto out_unlock; } out_unlock: rcu_read_unlock(); return addr; } EXPORT_SYMBOL(inet_select_addr); static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, __be32 local, int scope) { unsigned char localnet_scope = RT_SCOPE_HOST; const struct in_ifaddr *ifa; __be32 addr = 0; int same = 0; if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev))) localnet_scope = RT_SCOPE_LINK; in_dev_for_each_ifa_rcu(ifa, in_dev) { unsigned char min_scope = min(ifa->ifa_scope, localnet_scope); if (!addr && (local == ifa->ifa_local || !local) && min_scope <= scope) { addr = ifa->ifa_local; if (same) break; } if (!same) { same = (!local || inet_ifa_match(local, ifa)) && (!dst || inet_ifa_match(dst, ifa)); if (same && addr) { if (local || !dst) break; /* Is the selected addr into dst subnet? */ if (inet_ifa_match(addr, ifa)) break; /* No, then can we use new local src? */ if (min_scope <= scope) { addr = ifa->ifa_local; break; } /* search for large dst subnet for addr */ same = 0; } } } return same ? addr : 0; } /* * Confirm that local IP address exists using wildcards: * - net: netns to check, cannot be NULL * - in_dev: only on this interface, NULL=any interface * - dst: only in the same subnet as dst, 0=any dst * - local: address, 0=autoselect the local address * - scope: maximum allowed scope value for the local address */ __be32 inet_confirm_addr(struct net *net, struct in_device *in_dev, __be32 dst, __be32 local, int scope) { __be32 addr = 0; struct net_device *dev; if (in_dev) return confirm_addr_indev(in_dev, dst, local, scope); rcu_read_lock(); for_each_netdev_rcu(net, dev) { in_dev = __in_dev_get_rcu(dev); if (in_dev) { addr = confirm_addr_indev(in_dev, dst, local, scope); if (addr) break; } } rcu_read_unlock(); return addr; } EXPORT_SYMBOL(inet_confirm_addr); /* * Device notifier */ int register_inetaddr_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&inetaddr_chain, nb); } EXPORT_SYMBOL(register_inetaddr_notifier); int unregister_inetaddr_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&inetaddr_chain, nb); } EXPORT_SYMBOL(unregister_inetaddr_notifier); int register_inetaddr_validator_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&inetaddr_validator_chain, nb); } EXPORT_SYMBOL(register_inetaddr_validator_notifier); int unregister_inetaddr_validator_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&inetaddr_validator_chain, nb); } EXPORT_SYMBOL(unregister_inetaddr_validator_notifier); /* Rename ifa_labels for a device name change. Make some effort to preserve * existing alias numbering and to create unique labels if possible. 
*/ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) { struct in_ifaddr *ifa; int named = 0; in_dev_for_each_ifa_rtnl(ifa, in_dev) { char old[IFNAMSIZ], *dot; memcpy(old, ifa->ifa_label, IFNAMSIZ); memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); if (named++ == 0) goto skip; dot = strchr(old, ':'); if (!dot) { sprintf(old, ":%d", named); dot = old; } if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) strcat(ifa->ifa_label, dot); else strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot); skip: rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); } } static void inetdev_send_gratuitous_arp(struct net_device *dev, struct in_device *in_dev) { const struct in_ifaddr *ifa; in_dev_for_each_ifa_rtnl(ifa, in_dev) { arp_send(ARPOP_REQUEST, ETH_P_ARP, ifa->ifa_local, dev, ifa->ifa_local, NULL, dev->dev_addr, NULL); } } /* Called only under RTNL semaphore */ static int inetdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct in_device *in_dev = __in_dev_get_rtnl(dev); ASSERT_RTNL(); if (!in_dev) { if (event == NETDEV_REGISTER) { in_dev = inetdev_init(dev); if (IS_ERR(in_dev)) return notifier_from_errno(PTR_ERR(in_dev)); if (dev->flags & IFF_LOOPBACK) { IN_DEV_CONF_SET(in_dev, NOXFRM, 1); IN_DEV_CONF_SET(in_dev, NOPOLICY, 1); } } else if (event == NETDEV_CHANGEMTU) { /* Re-enabling IP */ if (inetdev_valid_mtu(dev->mtu)) in_dev = inetdev_init(dev); } goto out; } switch (event) { case NETDEV_REGISTER: pr_debug("%s: bug\n", __func__); RCU_INIT_POINTER(dev->ip_ptr, NULL); break; case NETDEV_UP: if (!inetdev_valid_mtu(dev->mtu)) break; if (dev->flags & IFF_LOOPBACK) { struct in_ifaddr *ifa = inet_alloc_ifa(in_dev); if (ifa) { ifa->ifa_local = ifa->ifa_address = htonl(INADDR_LOOPBACK); ifa->ifa_prefixlen = 8; ifa->ifa_mask = inet_make_mask(8); ifa->ifa_scope = RT_SCOPE_HOST; memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); ipv4_devconf_setall(in_dev); neigh_parms_data_state_setall(in_dev->arp_parms); inet_insert_ifa(ifa); } } ip_mc_up(in_dev); fallthrough; case NETDEV_CHANGEADDR: if (!IN_DEV_ARP_NOTIFY(in_dev)) break; fallthrough; case NETDEV_NOTIFY_PEERS: /* Send gratuitous ARP to notify of link change */ inetdev_send_gratuitous_arp(dev, in_dev); break; case NETDEV_DOWN: ip_mc_down(in_dev); break; case NETDEV_PRE_TYPE_CHANGE: ip_mc_unmap(in_dev); break; case NETDEV_POST_TYPE_CHANGE: ip_mc_remap(in_dev); break; case NETDEV_CHANGEMTU: if (inetdev_valid_mtu(dev->mtu)) break; /* disable IP when MTU is not enough */ fallthrough; case NETDEV_UNREGISTER: inetdev_destroy(in_dev); break; case NETDEV_CHANGENAME: /* Do not notify about label change, this event is * not interesting to applications using netlink. 
*/ inetdev_changename(dev, in_dev); devinet_sysctl_unregister(in_dev); devinet_sysctl_register(in_dev); break; } out: return NOTIFY_DONE; } static struct notifier_block ip_netdev_notifier = { .notifier_call = inetdev_event, }; static size_t inet_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + nla_total_size(4) /* IFA_ADDRESS */ + nla_total_size(4) /* IFA_LOCAL */ + nla_total_size(4) /* IFA_BROADCAST */ + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ + nla_total_size(4) /* IFA_FLAGS */ + nla_total_size(1) /* IFA_PROTO */ + nla_total_size(4) /* IFA_RT_PRIORITY */ + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */ } static inline u32 cstamp_delta(unsigned long cstamp) { return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; } static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp, unsigned long tstamp, u32 preferred, u32 valid) { struct ifa_cacheinfo ci; ci.cstamp = cstamp_delta(cstamp); ci.tstamp = cstamp_delta(tstamp); ci.ifa_prefered = preferred; ci.ifa_valid = valid; return nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci); } static int inet_fill_ifaddr(struct sk_buff *skb, const struct in_ifaddr *ifa, struct inet_fill_args *args) { struct ifaddrmsg *ifm; struct nlmsghdr *nlh; unsigned long tstamp; u32 preferred, valid; u32 flags; nlh = nlmsg_put(skb, args->portid, args->seq, args->event, sizeof(*ifm), args->flags); if (!nlh) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifa_family = AF_INET; ifm->ifa_prefixlen = ifa->ifa_prefixlen; flags = READ_ONCE(ifa->ifa_flags); /* Warning : ifm->ifa_flags is an __u8, it holds only 8 bits. * The 32bit value is given in IFA_FLAGS attribute. */ ifm->ifa_flags = (__u8)flags; ifm->ifa_scope = ifa->ifa_scope; ifm->ifa_index = ifa->ifa_dev->dev->ifindex; if (args->netnsid >= 0 && nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) goto nla_put_failure; tstamp = READ_ONCE(ifa->ifa_tstamp); if (!(flags & IFA_F_PERMANENT)) { preferred = READ_ONCE(ifa->ifa_preferred_lft); valid = READ_ONCE(ifa->ifa_valid_lft); if (preferred != INFINITY_LIFE_TIME) { long tval = (jiffies - tstamp) / HZ; if (preferred > tval) preferred -= tval; else preferred = 0; if (valid != INFINITY_LIFE_TIME) { if (valid > tval) valid -= tval; else valid = 0; } } } else { preferred = INFINITY_LIFE_TIME; valid = INFINITY_LIFE_TIME; } if ((ifa->ifa_address && nla_put_in_addr(skb, IFA_ADDRESS, ifa->ifa_address)) || (ifa->ifa_local && nla_put_in_addr(skb, IFA_LOCAL, ifa->ifa_local)) || (ifa->ifa_broadcast && nla_put_in_addr(skb, IFA_BROADCAST, ifa->ifa_broadcast)) || (ifa->ifa_label[0] && nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) || (ifa->ifa_proto && nla_put_u8(skb, IFA_PROTO, ifa->ifa_proto)) || nla_put_u32(skb, IFA_FLAGS, flags) || (ifa->ifa_rt_priority && nla_put_u32(skb, IFA_RT_PRIORITY, ifa->ifa_rt_priority)) || put_cacheinfo(skb, READ_ONCE(ifa->ifa_cstamp), tstamp, preferred, valid)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, struct inet_fill_args *fillargs, struct net **tgt_net, struct sock *sk, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[IFA_MAX+1]; struct ifaddrmsg *ifm; int err, i; ifm = nlmsg_payload(nlh, sizeof(*ifm)); if (!ifm) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for address dump request"); return -EINVAL; } if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for address dump request"); 
return -EINVAL; } fillargs->ifindex = ifm->ifa_index; if (fillargs->ifindex) { cb->answer_flags |= NLM_F_DUMP_FILTERED; fillargs->flags |= NLM_F_DUMP_FILTERED; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, extack); if (err < 0) return err; for (i = 0; i <= IFA_MAX; ++i) { if (!tb[i]) continue; if (i == IFA_TARGET_NETNSID) { struct net *net; fillargs->netnsid = nla_get_s32(tb[i]); net = rtnl_get_net_ns_capable(sk, fillargs->netnsid); if (IS_ERR(net)) { fillargs->netnsid = -1; NL_SET_ERR_MSG(extack, "ipv4: Invalid target network namespace id"); return PTR_ERR(net); } *tgt_net = net; } else { NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in dump request"); return -EINVAL; } } return 0; } static int in_dev_dump_ifmcaddr(struct in_device *in_dev, struct sk_buff *skb, struct netlink_callback *cb, int *s_ip_idx, struct inet_fill_args *fillargs) { struct ip_mc_list *im; int ip_idx = 0; int err; for (im = rcu_dereference(in_dev->mc_list); im; im = rcu_dereference(im->next_rcu)) { if (ip_idx < *s_ip_idx) { ip_idx++; continue; } err = inet_fill_ifmcaddr(skb, in_dev->dev, im, fillargs); if (err < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); ip_idx++; } err = 0; ip_idx = 0; done: *s_ip_idx = ip_idx; return err; } static int in_dev_dump_ifaddr(struct in_device *in_dev, struct sk_buff *skb, struct netlink_callback *cb, int *s_ip_idx, struct inet_fill_args *fillargs) { struct in_ifaddr *ifa; int ip_idx = 0; int err; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (ip_idx < *s_ip_idx) { ip_idx++; continue; } err = inet_fill_ifaddr(skb, ifa, fillargs); if (err < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); ip_idx++; } err = 0; ip_idx = 0; done: *s_ip_idx = ip_idx; return err; } static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb, struct netlink_callback *cb, int *s_ip_idx, struct inet_fill_args *fillargs) { switch (fillargs->event) { case RTM_NEWADDR: return in_dev_dump_ifaddr(in_dev, skb, cb, s_ip_idx, fillargs); case RTM_GETMULTICAST: return in_dev_dump_ifmcaddr(in_dev, skb, cb, s_ip_idx, fillargs); default: return -EINVAL; } } /* Combine dev_addr_genid and dev_base_seq to detect changes. */ static u32 inet_base_seq(const struct net *net) { u32 res = atomic_read(&net->ipv4.dev_addr_genid) + READ_ONCE(net->dev_base_seq); /* Must not return 0 (see nl_dump_check_consistent()). * Chose a value far away from 0. 
*/ if (!res) res = 0x80000000; return res; } static int inet_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, int event) { const struct nlmsghdr *nlh = cb->nlh; struct inet_fill_args fillargs = { .portid = NETLINK_CB(cb->skb).portid, .seq = nlh->nlmsg_seq, .event = event, .flags = NLM_F_MULTI, .netnsid = -1, }; struct net *net = sock_net(skb->sk); struct net *tgt_net = net; struct { unsigned long ifindex; int ip_idx; } *ctx = (void *)cb->ctx; struct in_device *in_dev; struct net_device *dev; int err = 0; rcu_read_lock(); if (cb->strict_check) { err = inet_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net, skb->sk, cb); if (err < 0) goto done; if (fillargs.ifindex) { dev = dev_get_by_index_rcu(tgt_net, fillargs.ifindex); if (!dev) { err = -ENODEV; goto done; } in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto done; err = in_dev_dump_addr(in_dev, skb, cb, &ctx->ip_idx, &fillargs); goto done; } } cb->seq = inet_base_seq(tgt_net); for_each_netdev_dump(tgt_net, dev, ctx->ifindex) { in_dev = __in_dev_get_rcu(dev); if (!in_dev) continue; err = in_dev_dump_addr(in_dev, skb, cb, &ctx->ip_idx, &fillargs); if (err < 0) goto done; } done: if (fillargs.netnsid >= 0) put_net(tgt_net); rcu_read_unlock(); return err; } static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) { return inet_dump_addr(skb, cb, RTM_NEWADDR); } static int inet_dump_ifmcaddr(struct sk_buff *skb, struct netlink_callback *cb) { return inet_dump_addr(skb, cb, RTM_GETMULTICAST); } static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, u32 portid) { struct inet_fill_args fillargs = { .portid = portid, .seq = nlh ? nlh->nlmsg_seq : 0, .event = event, .flags = 0, .netnsid = -1, }; struct sk_buff *skb; int err = -ENOBUFS; struct net *net; net = dev_net(ifa->ifa_dev->dev); skb = nlmsg_new(inet_nlmsg_size(), GFP_KERNEL); if (!skb) goto errout; err = inet_fill_ifaddr(skb, ifa, &fillargs); if (err < 0) { /* -EMSGSIZE implies BUG in inet_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); } static size_t inet_get_link_af_size(const struct net_device *dev, u32 ext_filter_mask) { struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); if (!in_dev) return 0; return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */ } static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev, u32 ext_filter_mask) { struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); struct nlattr *nla; int i; if (!in_dev) return -ENODATA; nla = nla_reserve(skb, IFLA_INET_CONF, IPV4_DEVCONF_MAX * 4); if (!nla) return -EMSGSIZE; for (i = 0; i < IPV4_DEVCONF_MAX; i++) ((u32 *) nla_data(nla))[i] = READ_ONCE(in_dev->cnf.data[i]); return 0; } static const struct nla_policy inet_af_policy[IFLA_INET_MAX+1] = { [IFLA_INET_CONF] = { .type = NLA_NESTED }, }; static const struct nla_policy inet_devconf_policy[IPV4_DEVCONF_MAX + 1] = { [IPV4_DEVCONF_FORWARDING] = NLA_POLICY_RANGE(NLA_U32, 0, 1), [IPV4_DEVCONF_MC_FORWARDING] = { .type = NLA_REJECT }, [IPV4_DEVCONF_PROXY_ARP] = NLA_POLICY_RANGE(NLA_U32, 0, 1), [IPV4_DEVCONF_ACCEPT_REDIRECTS] = NLA_POLICY_RANGE(NLA_U32, 0, 1), [IPV4_DEVCONF_SECURE_REDIRECTS] = NLA_POLICY_RANGE(NLA_U32, 0, 1), [IPV4_DEVCONF_SEND_REDIRECTS] = NLA_POLICY_RANGE(NLA_U32, 0, 1), [IPV4_DEVCONF_SHARED_MEDIA] = NLA_POLICY_RANGE(NLA_U32, 0, 1), [IPV4_DEVCONF_RP_FILTER] = NLA_POLICY_RANGE(NLA_U32, 0, 2), 
[IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE] = NLA_POLICY_RANGE(NLA_U32, 0, 1), [IPV4_DEVCONF_BOOTP_RELAY] = NLA_POLICY_RANGE(NLA_U32, 0, 1), [IPV4_DEVCONF_LOG_MARTIANS] = NLA_POLICY_RANGE(NLA_U32, 0, 1), [IPV4_DEVCONF_TAG] = { .type = NLA_U32 }, [IPV4_DEVCONF_ARPFILTER] = NLA_POLICY_RANGE(NLA_U32, 0, 1), [IPV4_DEVCONF_MEDIUM_ID] = NLA_POLICY_MIN(NLA_S32, -1), [IPV4_DEVCONF_NOXFRM] = NLA_POLICY_RANGE(NLA_U32, 0, 1), [IPV4_DEVCONF_NOPOLICY] = NLA_POLICY_RANGE(NLA_U32, 0, 1), [IPV4_DEVCONF_FORCE_IGMP_VERSION] = NLA_POLICY_RANGE(NLA_U32, 0, 3), [IPV4_DEVCONF_ARP_ANNOUNCE] = NLA_POLICY_RANGE(NLA_U32, 0, 2), [IPV4_DEVCONF_ARP_IGNORE] = NLA_POLICY_RANGE(NLA_U32, 0, 8), [IPV4_DEVCONF_PROMOTE_SECONDARIES] = NLA_POLICY_RANGE(NLA_U32, 0, 1), [IPV4_DEVCONF_ARP_ACCEPT] = NLA_POLICY_RANGE(NLA_U32, 0, 2), [IPV4_DEVCONF_ARP_NOTIFY] = NLA_POLICY_RANGE(NLA_U32, 0, 1), [IPV4_DEVCONF_ACCEPT_LOCAL] = NLA_POLICY_RANGE(NLA_U32, 0, 1), [IPV4_DEVCONF_SRC_VMARK] = NLA_POLICY_RANGE(NLA_U32, 0, 1), [IPV4_DEVCONF_PROXY_ARP_PVLAN] = NLA_POLICY_RANGE(NLA_U32, 0, 1), [IPV4_DEVCONF_ROUTE_LOCALNET] = NLA_POLICY_RANGE(NLA_U32, 0, 1), [IPV4_DEVCONF_BC_FORWARDING] = NLA_POLICY_RANGE(NLA_U32, 0, 1), [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL] = { .type = NLA_U32 }, [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL] = { .type = NLA_U32 }, [IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] = NLA_POLICY_RANGE(NLA_U32, 0, 1), [IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = NLA_POLICY_RANGE(NLA_U32, 0, 1), [IPV4_DEVCONF_DROP_GRATUITOUS_ARP] = NLA_POLICY_RANGE(NLA_U32, 0, 1), [IPV4_DEVCONF_ARP_EVICT_NOCARRIER] = NLA_POLICY_RANGE(NLA_U32, 0, 1), }; static int inet_validate_link_af(const struct net_device *dev, const struct nlattr *nla, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_INET_MAX + 1], *nested_tb[IPV4_DEVCONF_MAX + 1]; int err; if (dev && !__in_dev_get_rtnl(dev)) return -EAFNOSUPPORT; err = nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, inet_af_policy, extack); if (err < 0) return err; if (tb[IFLA_INET_CONF]) { err = nla_parse_nested(nested_tb, IPV4_DEVCONF_MAX, tb[IFLA_INET_CONF], inet_devconf_policy, extack); if (err < 0) return err; } return 0; } static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla, struct netlink_ext_ack *extack) { struct in_device *in_dev = __in_dev_get_rtnl(dev); struct nlattr *a, *tb[IFLA_INET_MAX+1]; int rem; if (!in_dev) return -EAFNOSUPPORT; if (nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, NULL, NULL) < 0) return -EINVAL; if (tb[IFLA_INET_CONF]) { nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) ipv4_devconf_set(in_dev, nla_type(a), nla_get_u32(a)); } return 0; } static int inet_netconf_msgsize_devconf(int type) { int size = NLMSG_ALIGN(sizeof(struct netconfmsg)) + nla_total_size(4); /* NETCONFA_IFINDEX */ bool all = false; if (type == NETCONFA_ALL) all = true; if (all || type == NETCONFA_FORWARDING) size += nla_total_size(4); if (all || type == NETCONFA_RP_FILTER) size += nla_total_size(4); if (all || type == NETCONFA_MC_FORWARDING) size += nla_total_size(4); if (all || type == NETCONFA_BC_FORWARDING) size += nla_total_size(4); if (all || type == NETCONFA_PROXY_NEIGH) size += nla_total_size(4); if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) size += nla_total_size(4); return size; } static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, const struct ipv4_devconf *devconf, u32 portid, u32 seq, int event, unsigned int flags, int type) { struct nlmsghdr *nlh; struct netconfmsg *ncm; bool all = false; nlh = nlmsg_put(skb, portid, seq, event, 
sizeof(struct netconfmsg), flags); if (!nlh) return -EMSGSIZE; if (type == NETCONFA_ALL) all = true; ncm = nlmsg_data(nlh); ncm->ncm_family = AF_INET; if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0) goto nla_put_failure; if (!devconf) goto out; if ((all || type == NETCONFA_FORWARDING) && nla_put_s32(skb, NETCONFA_FORWARDING, IPV4_DEVCONF_RO(*devconf, FORWARDING)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_RP_FILTER) && nla_put_s32(skb, NETCONFA_RP_FILTER, IPV4_DEVCONF_RO(*devconf, RP_FILTER)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_MC_FORWARDING) && nla_put_s32(skb, NETCONFA_MC_FORWARDING, IPV4_DEVCONF_RO(*devconf, MC_FORWARDING)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_BC_FORWARDING) && nla_put_s32(skb, NETCONFA_BC_FORWARDING, IPV4_DEVCONF_RO(*devconf, BC_FORWARDING)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_PROXY_NEIGH) && nla_put_s32(skb, NETCONFA_PROXY_NEIGH, IPV4_DEVCONF_RO(*devconf, PROXY_ARP)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, IPV4_DEVCONF_RO(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0) goto nla_put_failure; out: nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } void inet_netconf_notify_devconf(struct net *net, int event, int type, int ifindex, struct ipv4_devconf *devconf) { struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_KERNEL); if (!skb) goto errout; err = inet_netconf_fill_devconf(skb, ifindex, devconf, 0, 0, event, 0, type); if (err < 0) { /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err); } static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = { [NETCONFA_IFINDEX] = { .len = sizeof(int) }, [NETCONFA_FORWARDING] = { .len = sizeof(int) }, [NETCONFA_RP_FILTER] = { .len = sizeof(int) }, [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) }, [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] = { .len = sizeof(int) }, }; static int inet_netconf_valid_get_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { int i, err; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct netconfmsg))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf get request"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(struct netconfmsg), tb, NETCONFA_MAX, devconf_ipv4_policy, extack); err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct netconfmsg), tb, NETCONFA_MAX, devconf_ipv4_policy, extack); if (err) return err; for (i = 0; i <= NETCONFA_MAX; i++) { if (!tb[i]) continue; switch (i) { case NETCONFA_IFINDEX: break; default: NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in netconf get request"); return -EINVAL; } } return 0; } static int inet_netconf_get_devconf(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NETCONFA_MAX + 1]; const struct ipv4_devconf *devconf; struct in_device *in_dev = NULL; struct net_device *dev = NULL; struct sk_buff *skb; int ifindex; int err; err = inet_netconf_valid_get_req(in_skb, nlh, tb, extack); if (err) return err; if (!tb[NETCONFA_IFINDEX]) return -EINVAL; ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]); switch 
(ifindex) { case NETCONFA_IFINDEX_ALL: devconf = net->ipv4.devconf_all; break; case NETCONFA_IFINDEX_DEFAULT: devconf = net->ipv4.devconf_dflt; break; default: err = -ENODEV; dev = dev_get_by_index(net, ifindex); if (dev) in_dev = in_dev_get(dev); if (!in_dev) goto errout; devconf = &in_dev->cnf; break; } err = -ENOBUFS; skb = nlmsg_new(inet_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL); if (!skb) goto errout; err = inet_netconf_fill_devconf(skb, ifindex, devconf, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, 0, NETCONFA_ALL); if (err < 0) { /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: if (in_dev) in_dev_put(in_dev); dev_put(dev); return err; } static int inet_netconf_dump_devconf(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct { unsigned long ifindex; unsigned int all_default; } *ctx = (void *)cb->ctx; const struct in_device *in_dev; struct net_device *dev; int err = 0; if (cb->strict_check) { struct netlink_ext_ack *extack = cb->extack; struct netconfmsg *ncm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*ncm))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid data after header in netconf dump request"); return -EINVAL; } } rcu_read_lock(); for_each_netdev_dump(net, dev, ctx->ifindex) { in_dev = __in_dev_get_rcu(dev); if (!in_dev) continue; err = inet_netconf_fill_devconf(skb, dev->ifindex, &in_dev->cnf, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL); if (err < 0) goto done; } if (ctx->all_default == 0) { err = inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL); if (err < 0) goto done; ctx->all_default++; } if (ctx->all_default == 1) { err = inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL); if (err < 0) goto done; ctx->all_default++; } done: rcu_read_unlock(); return err; } #ifdef CONFIG_SYSCTL static void devinet_copy_dflt_conf(struct net *net, int i) { struct net_device *dev; rcu_read_lock(); for_each_netdev_rcu(net, dev) { struct in_device *in_dev; in_dev = __in_dev_get_rcu(dev); if (in_dev && !test_bit(i, in_dev->cnf.state)) in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i]; } rcu_read_unlock(); } /* called with RTNL locked */ static void inet_forward_change(struct net *net) { struct net_device *dev; int on = IPV4_DEVCONF_ALL(net, FORWARDING); IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on; IPV4_DEVCONF_DFLT(net, FORWARDING) = on; inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt); for_each_netdev(net, dev) { struct in_device *in_dev; if (on) dev_disable_lro(dev); in_dev = __in_dev_get_rtnl_net(dev); if (in_dev) { IN_DEV_CONF_SET(in_dev, FORWARDING, on); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, dev->ifindex, &in_dev->cnf); } } } static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf) { if (cnf == net->ipv4.devconf_dflt) return 
NETCONFA_IFINDEX_DEFAULT; else if (cnf == net->ipv4.devconf_all) return NETCONFA_IFINDEX_ALL; else { struct in_device *idev = container_of(cnf, struct in_device, cnf); return idev->dev->ifindex; } } static int devinet_conf_proc(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int old_value = *(int *)ctl->data; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); int new_value = *(int *)ctl->data; if (write) { struct ipv4_devconf *cnf = ctl->extra1; struct net *net = ctl->extra2; int i = (int *)ctl->data - cnf->data; int ifindex; set_bit(i, cnf->state); if (cnf == net->ipv4.devconf_dflt) devinet_copy_dflt_conf(net, i); if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 || i == IPV4_DEVCONF_ROUTE_LOCALNET - 1) if ((new_value == 0) && (old_value != 0)) rt_cache_flush(net); if (i == IPV4_DEVCONF_BC_FORWARDING - 1 && new_value != old_value) rt_cache_flush(net); if (i == IPV4_DEVCONF_RP_FILTER - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_RP_FILTER, ifindex, cnf); } if (i == IPV4_DEVCONF_PROXY_ARP - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_PROXY_NEIGH, ifindex, cnf); } if (i == IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, ifindex, cnf); } } return ret; } static int devinet_sysctl_forward(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; loff_t pos = *ppos; struct net *net = ctl->extra2; int ret; if (write && !ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (write && *valp != val) { if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) { if (!rtnl_net_trylock(net)) { /* Restore the original values before restarting */ *valp = val; *ppos = pos; return restart_syscall(); } if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) { inet_forward_change(net); } else { struct ipv4_devconf *cnf = ctl->extra1; struct in_device *idev = container_of(cnf, struct in_device, cnf); if (*valp) dev_disable_lro(idev->dev); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, idev->dev->ifindex, cnf); } rtnl_net_unlock(net); rt_cache_flush(net); } else inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt); } return ret; } static int ipv4_doint_and_flush(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); struct net *net = ctl->extra2; if (write && *valp != val) rt_cache_flush(net); return ret; } #define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc) \ { \ .procname = name, \ .data = ipv4_devconf.data + \ IPV4_DEVCONF_ ## attr - 1, \ .maxlen = sizeof(int), \ .mode = mval, \ .proc_handler = proc, \ .extra1 = &ipv4_devconf, \ } #define DEVINET_SYSCTL_RW_ENTRY(attr, name) \ DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc) #define DEVINET_SYSCTL_RO_ENTRY(attr, name) \ DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc) #define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc) \ DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc) #define DEVINET_SYSCTL_FLUSHING_ENTRY(attr, name) \ DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, 
ipv4_doint_and_flush) static struct devinet_sysctl_table { struct ctl_table_header *sysctl_header; struct ctl_table devinet_vars[IPV4_DEVCONF_MAX]; } devinet_sysctl = { .devinet_vars = { DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", devinet_sysctl_forward), DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"), DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"), DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"), DEVINET_SYSCTL_RW_ENTRY(SHARED_MEDIA, "shared_media"), DEVINET_SYSCTL_RW_ENTRY(RP_FILTER, "rp_filter"), DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE, "accept_source_route"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"), DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"), DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"), DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"), DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"), DEVINET_SYSCTL_RW_ENTRY(LOG_MARTIANS, "log_martians"), DEVINET_SYSCTL_RW_ENTRY(TAG, "tag"), DEVINET_SYSCTL_RW_ENTRY(ARPFILTER, "arp_filter"), DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"), DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"), DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"), DEVINET_SYSCTL_RW_ENTRY(ARP_EVICT_NOCARRIER, "arp_evict_nocarrier"), DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"), DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION, "force_igmp_version"), DEVINET_SYSCTL_RW_ENTRY(IGMPV2_UNSOLICITED_REPORT_INTERVAL, "igmpv2_unsolicited_report_interval"), DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL, "igmpv3_unsolicited_report_interval"), DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN, "ignore_routes_with_linkdown"), DEVINET_SYSCTL_RW_ENTRY(DROP_GRATUITOUS_ARP, "drop_gratuitous_arp"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, "promote_secondaries"), DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET, "route_localnet"), DEVINET_SYSCTL_FLUSHING_ENTRY(DROP_UNICAST_IN_L2_MULTICAST, "drop_unicast_in_l2_multicast"), }, }; static int __devinet_sysctl_register(struct net *net, char *dev_name, int ifindex, struct ipv4_devconf *p) { int i; struct devinet_sysctl_table *t; char path[sizeof("net/ipv4/conf/") + IFNAMSIZ]; t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL_ACCOUNT); if (!t) goto out; for (i = 0; i < ARRAY_SIZE(t->devinet_vars); i++) { t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf; t->devinet_vars[i].extra1 = p; t->devinet_vars[i].extra2 = net; } snprintf(path, sizeof(path), "net/ipv4/conf/%s", dev_name); t->sysctl_header = register_net_sysctl(net, path, t->devinet_vars); if (!t->sysctl_header) goto free; p->sysctl = t; inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_ALL, ifindex, p); return 0; free: kfree(t); out: return -ENOMEM; } static void __devinet_sysctl_unregister(struct net *net, struct ipv4_devconf *cnf, int ifindex) { struct devinet_sysctl_table *t = cnf->sysctl; if (t) { cnf->sysctl = NULL; unregister_net_sysctl_table(t->sysctl_header); kfree(t); } inet_netconf_notify_devconf(net, RTM_DELNETCONF, 0, ifindex, NULL); } static int devinet_sysctl_register(struct in_device *idev) { int err; if (!sysctl_dev_name_is_allowed(idev->dev->name)) return -EINVAL; err = neigh_sysctl_register(idev->dev, idev->arp_parms, NULL); if (err) return err; err = 
__devinet_sysctl_register(dev_net(idev->dev), idev->dev->name, idev->dev->ifindex, &idev->cnf); if (err) neigh_sysctl_unregister(idev->arp_parms); return err; } static void devinet_sysctl_unregister(struct in_device *idev) { struct net *net = dev_net(idev->dev); __devinet_sysctl_unregister(net, &idev->cnf, idev->dev->ifindex); neigh_sysctl_unregister(idev->arp_parms); } static struct ctl_table ctl_forward_entry[] = { { .procname = "ip_forward", .data = &ipv4_devconf.data[ IPV4_DEVCONF_FORWARDING - 1], .maxlen = sizeof(int), .mode = 0644, .proc_handler = devinet_sysctl_forward, .extra1 = &ipv4_devconf, .extra2 = &init_net, }, }; #endif static __net_init int devinet_init_net(struct net *net) { #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; struct ctl_table *tbl; #endif struct ipv4_devconf *all, *dflt; int err; int i; err = -ENOMEM; net->ipv4.inet_addr_lst = kmalloc_objs(struct hlist_head, IN4_ADDR_HSIZE); if (!net->ipv4.inet_addr_lst) goto err_alloc_hash; all = kmemdup(&ipv4_devconf, sizeof(ipv4_devconf), GFP_KERNEL); if (!all) goto err_alloc_all; dflt = kmemdup(&ipv4_devconf_dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL); if (!dflt) goto err_alloc_dflt; #ifdef CONFIG_SYSCTL tbl = kmemdup(ctl_forward_entry, sizeof(ctl_forward_entry), GFP_KERNEL); if (!tbl) goto err_alloc_ctl; tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1]; tbl[0].extra1 = all; tbl[0].extra2 = net; #endif if (!net_eq(net, &init_net)) { switch (net_inherit_devconf()) { case 3: /* copy from the current netns */ memcpy(all, current->nsproxy->net_ns->ipv4.devconf_all, sizeof(ipv4_devconf)); memcpy(dflt, current->nsproxy->net_ns->ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); break; case 0: case 1: /* copy from init_net */ memcpy(all, init_net.ipv4.devconf_all, sizeof(ipv4_devconf)); memcpy(dflt, init_net.ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); break; case 2: /* use compiled values */ break; } } #ifdef CONFIG_SYSCTL err = __devinet_sysctl_register(net, "all", NETCONFA_IFINDEX_ALL, all); if (err < 0) goto err_reg_all; err = __devinet_sysctl_register(net, "default", NETCONFA_IFINDEX_DEFAULT, dflt); if (err < 0) goto err_reg_dflt; err = -ENOMEM; forw_hdr = register_net_sysctl_sz(net, "net/ipv4", tbl, ARRAY_SIZE(ctl_forward_entry)); if (!forw_hdr) goto err_reg_ctl; net->ipv4.forw_hdr = forw_hdr; #endif for (i = 0; i < IN4_ADDR_HSIZE; i++) INIT_HLIST_HEAD(&net->ipv4.inet_addr_lst[i]); INIT_DEFERRABLE_WORK(&net->ipv4.addr_chk_work, check_lifetime); net->ipv4.devconf_all = all; net->ipv4.devconf_dflt = dflt; return 0; #ifdef CONFIG_SYSCTL err_reg_ctl: __devinet_sysctl_unregister(net, dflt, NETCONFA_IFINDEX_DEFAULT); err_reg_dflt: __devinet_sysctl_unregister(net, all, NETCONFA_IFINDEX_ALL); err_reg_all: kfree(tbl); err_alloc_ctl: #endif kfree(dflt); err_alloc_dflt: kfree(all); err_alloc_all: kfree(net->ipv4.inet_addr_lst); err_alloc_hash: return err; } static __net_exit void devinet_exit_net(struct net *net) { #ifdef CONFIG_SYSCTL const struct ctl_table *tbl; #endif cancel_delayed_work_sync(&net->ipv4.addr_chk_work); #ifdef CONFIG_SYSCTL tbl = net->ipv4.forw_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.forw_hdr); __devinet_sysctl_unregister(net, net->ipv4.devconf_dflt, NETCONFA_IFINDEX_DEFAULT); __devinet_sysctl_unregister(net, net->ipv4.devconf_all, NETCONFA_IFINDEX_ALL); kfree(tbl); #endif kfree(net->ipv4.devconf_dflt); kfree(net->ipv4.devconf_all); kfree(net->ipv4.inet_addr_lst); } static __net_initdata struct pernet_operations devinet_ops = { .init = devinet_init_net, .exit = devinet_exit_net, }; 
static struct rtnl_af_ops inet_af_ops __read_mostly = {
	.family		  = AF_INET,
	.fill_link_af	  = inet_fill_link_af,
	.get_link_af_size = inet_get_link_af_size,
	.validate_link_af = inet_validate_link_af,
	.set_link_af	  = inet_set_link_af,
};

static const struct rtnl_msg_handler devinet_rtnl_msg_handlers[] __initconst = {
	{.protocol = PF_INET, .msgtype = RTM_NEWADDR,
	 .doit = inet_rtm_newaddr, .flags = RTNL_FLAG_DOIT_PERNET},
	{.protocol = PF_INET, .msgtype = RTM_DELADDR,
	 .doit = inet_rtm_deladdr, .flags = RTNL_FLAG_DOIT_PERNET},
	{.protocol = PF_INET, .msgtype = RTM_GETADDR, .dumpit = inet_dump_ifaddr,
	 .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE},
	{.protocol = PF_INET, .msgtype = RTM_GETNETCONF,
	 .doit = inet_netconf_get_devconf, .dumpit = inet_netconf_dump_devconf,
	 .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED},
	{.owner = THIS_MODULE, .protocol = PF_INET, .msgtype = RTM_GETMULTICAST,
	 .dumpit = inet_dump_ifmcaddr, .flags = RTNL_FLAG_DUMP_UNLOCKED},
};

void __init devinet_init(void)
{
	register_pernet_subsys(&devinet_ops);
	register_netdevice_notifier(&ip_netdev_notifier);

	if (rtnl_af_register(&inet_af_ops))
		panic("Unable to register inet_af_ops\n");

	rtnl_register_many(devinet_rtnl_msg_handlers);
}
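/*
 * Illustrative sketch (not part of the kernel source above): devinet_init_net()
 * unwinds partial initialisation with a ladder of goto labels, each label
 * undoing exactly the steps that succeeded before the failure. A minimal
 * stand-alone version of that shape, with hypothetical resources a/b/c:
 */
#include <stdlib.h>

struct ctx { void *a, *b, *c; };

static int ctx_init(struct ctx *ctx)
{
	int err = -1;	/* stands in for -ENOMEM */

	ctx->a = malloc(16);
	if (!ctx->a)
		goto err_a;
	ctx->b = malloc(16);
	if (!ctx->b)
		goto err_b;
	ctx->c = malloc(16);
	if (!ctx->c)
		goto err_c;
	return 0;

err_c:
	free(ctx->b);	/* undo the allocations that did succeed */
err_b:
	free(ctx->a);
err_a:
	return err;
}

int main(void)
{
	struct ctx ctx;

	if (ctx_init(&ctx))
		return 1;
	free(ctx.c);
	free(ctx.b);
	free(ctx.a);
	return 0;
}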
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/dir.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/dir.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  ext4 directory handling functions
 *
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S.
Miller (davem@caip.rutgers.edu), 1995 * * Hash Tree Directory indexing (c) 2001 Daniel Phillips * */ #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/filelock.h> #include <linux/slab.h> #include <linux/iversion.h> #include <linux/unicode.h> #include "ext4.h" #include "xattr.h" static int ext4_dx_readdir(struct file *, struct dir_context *); /** * is_dx_dir() - check if a directory is using htree indexing * @inode: directory inode * * Check if the given dir-inode refers to an htree-indexed directory * (or a directory which could potentially get converted to use htree * indexing). * * Return 1 if it is a dx dir, 0 if not */ static int is_dx_dir(struct inode *inode) { struct super_block *sb = inode->i_sb; if (ext4_has_feature_dir_index(inode->i_sb) && ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || ((inode->i_size >> sb->s_blocksize_bits) == 1) || ext4_has_inline_data(inode))) return 1; return 0; } static bool is_fake_dir_entry(struct ext4_dir_entry_2 *de) { /* Check if . or .. , or skip if namelen is 0 */ if ((de->name_len > 0) && (de->name_len <= 2) && (de->name[0] == '.') && (de->name[1] == '.' || de->name[1] == '\0')) return true; /* Check if this is a csum entry */ if (de->file_type == EXT4_FT_DIR_CSUM) return true; return false; } /* * Return 0 if the directory entry is OK, and 1 if there is a problem * * Note: this is the opposite of what ext2 and ext3 historically returned... * * bh passed here can be an inode block or a dir data block, depending * on the inode inline data flag. */ int __ext4_check_dir_entry(const char *function, unsigned int line, struct inode *dir, struct file *filp, struct ext4_dir_entry_2 *de, struct buffer_head *bh, char *buf, int size, unsigned int offset) { const char *error_msg = NULL; const int rlen = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); const int next_offset = ((char *) de - buf) + rlen; bool fake = is_fake_dir_entry(de); bool has_csum = ext4_has_feature_metadata_csum(dir->i_sb); if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir))) error_msg = "rec_len is smaller than minimal"; else if (unlikely(rlen % 4 != 0)) error_msg = "rec_len % 4 != 0"; else if (unlikely(rlen < ext4_dir_rec_len(de->name_len, fake ? NULL : dir))) error_msg = "rec_len is too small for name_len"; else if (unlikely(next_offset > size)) error_msg = "directory entry overrun"; else if (unlikely(next_offset > size - ext4_dir_rec_len(1, has_csum ? NULL : dir) && next_offset != size)) error_msg = "directory entry too close to block end"; else if (unlikely(le32_to_cpu(de->inode) > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; else if (unlikely(next_offset == size && de->name_len == 1 && de->name[0] == '.')) error_msg = "'.' 
directory cannot be the last in data block"; else return 0; if (filp) ext4_error_file(filp, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u, " "inode=%u, rec_len=%d, size=%d fake=%d", error_msg, offset, le32_to_cpu(de->inode), rlen, size, fake); else ext4_error_inode(dir, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u, " "inode=%u, rec_len=%d, size=%d fake=%d", error_msg, offset, le32_to_cpu(de->inode), rlen, size, fake); return 1; } static int ext4_readdir(struct file *file, struct dir_context *ctx) { unsigned int offset; int i; struct ext4_dir_entry_2 *de; int err; struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct buffer_head *bh = NULL; struct fscrypt_str fstr = FSTR_INIT(NULL, 0); struct dir_private_info *info = file->private_data; err = fscrypt_prepare_readdir(inode); if (err) return err; if (is_dx_dir(inode)) { err = ext4_dx_readdir(file, ctx); if (err != ERR_BAD_DX_DIR) return err; /* Can we just clear INDEX flag to ignore htree information? */ if (!ext4_has_feature_metadata_csum(sb)) { /* * We don't set the inode dirty flag since it's not * critical that it gets flushed back to the disk. */ ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); } } if (ext4_has_inline_data(inode)) { int has_inline_data = 1; err = ext4_read_inline_dir(file, ctx, &has_inline_data); if (has_inline_data) return err; } if (IS_ENCRYPTED(inode)) { err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fstr); if (err < 0) return err; } while (ctx->pos < inode->i_size) { struct ext4_map_blocks map; if (fatal_signal_pending(current)) { err = -ERESTARTSYS; goto errout; } cond_resched(); offset = ctx->pos & (sb->s_blocksize - 1); map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb); map.m_len = 1; err = ext4_map_blocks(NULL, inode, &map, 0); if (err == 0) { /* m_len should never be zero but let's avoid * an infinite loop if it somehow is */ if (map.m_len == 0) map.m_len = 1; ctx->pos += map.m_len * sb->s_blocksize; continue; } if (err > 0) { pgoff_t index = map.m_pblk << inode->i_blkbits >> PAGE_SHIFT; if (!ra_has_index(&file->f_ra, index)) page_cache_sync_readahead( sb->s_bdev->bd_mapping, &file->f_ra, file, index, 1 << EXT4_SB(sb)->s_min_folio_order); file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT; bh = ext4_bread(NULL, inode, map.m_lblk, 0); if (IS_ERR(bh)) { err = PTR_ERR(bh); bh = NULL; goto errout; } } if (!bh) { /* corrupt size? Maybe no more blocks to read */ if (ctx->pos > inode->i_blocks << 9) break; ctx->pos += sb->s_blocksize - offset; continue; } /* Check the checksum */ if (!buffer_verified(bh) && !ext4_dirblock_csum_verify(inode, bh)) { EXT4_ERROR_FILE(file, 0, "directory fails checksum " "at offset %llu", (unsigned long long)ctx->pos); ctx->pos += sb->s_blocksize - offset; brelse(bh); bh = NULL; continue; } set_buffer_verified(bh); /* If the dir block has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. */ if (!inode_eq_iversion(inode, info->cookie)) { for (i = 0; i < sb->s_blocksize && i < offset; ) { de = (struct ext4_dir_entry_2 *) (bh->b_data + i); /* It's too expensive to do a full * dirent test each time round this * loop, but we do have to test at * least that it is non-zero. A * failure will be detected in the * dirent test below. 
*/ if (ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) < ext4_dir_rec_len(1, inode)) break; i += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); } offset = i; ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) | offset; info->cookie = inode_query_iversion(inode); } while (ctx->pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); if (ext4_check_dir_entry(inode, file, de, bh, bh->b_data, bh->b_size, offset)) { /* * On error, skip to the next block */ ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; break; } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); if (le32_to_cpu(de->inode)) { if (!IS_ENCRYPTED(inode)) { if (!dir_emit(ctx, de->name, de->name_len, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto done; } else { int save_len = fstr.len; struct fscrypt_str de_name = FSTR_INIT(de->name, de->name_len); u32 hash; u32 minor_hash; if (IS_CASEFOLDED(inode)) { hash = EXT4_DIRENT_HASH(de); minor_hash = EXT4_DIRENT_MINOR_HASH(de); } else { hash = 0; minor_hash = 0; } /* Directory is encrypted */ err = fscrypt_fname_disk_to_usr(inode, hash, minor_hash, &de_name, &fstr); de_name = fstr; fstr.len = save_len; if (err) goto errout; if (!dir_emit(ctx, de_name.name, de_name.len, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto done; } } ctx->pos += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); } if ((ctx->pos < inode->i_size) && !dir_relax_shared(inode)) goto done; brelse(bh); bh = NULL; } done: err = 0; errout: fscrypt_fname_free_buffer(&fstr); brelse(bh); return err; } static inline int is_32bit_api(void) { #ifdef CONFIG_COMPAT return in_compat_syscall(); #else return (BITS_PER_LONG == 32); #endif } /* * These functions convert from the major/minor hash to an f_pos * value for dx directories * * Upper layer (for example NFS) should specify FMODE_32BITHASH or * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted * directly on both 32-bit and 64-bit nodes, under such case, neither * FMODE_32BITHASH nor FMODE_64BITHASH is specified. */ static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) { if ((filp->f_mode & FMODE_32BITHASH) || (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return major >> 1; else return ((__u64)(major >> 1) << 32) | (__u64)minor; } static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) { if ((filp->f_mode & FMODE_32BITHASH) || (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return (pos << 1) & 0xffffffff; else return ((pos >> 32) << 1) & 0xffffffff; } static inline __u32 pos2min_hash(struct file *filp, loff_t pos) { if ((filp->f_mode & FMODE_32BITHASH) || (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return 0; else return pos & 0xffffffff; } /* * Return 32- or 64-bit end-of-file for dx directories */ static inline loff_t ext4_get_htree_eof(struct file *filp) { if ((filp->f_mode & FMODE_32BITHASH) || (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return EXT4_HTREE_EOF_32BIT; else return EXT4_HTREE_EOF_64BIT; } /* * ext4_dir_llseek() calls generic_file_llseek_size to handle htree * directories, where the "offset" is in terms of the filename hash * value instead of the byte offset. * * Because we may return a 64-bit hash that is well beyond offset limits, * we need to pass the max hash as the maximum allowable offset in * the htree directory case. * * For non-htree, ext4_llseek already chooses the proper max offset. 
*/ static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; struct dir_private_info *info = file->private_data; int dx_dir = is_dx_dir(inode); loff_t ret, htree_max = ext4_get_htree_eof(file); if (likely(dx_dir)) ret = generic_file_llseek_size(file, offset, whence, htree_max, htree_max); else ret = ext4_llseek(file, offset, whence); info->cookie = inode_peek_iversion(inode) - 1; return ret; } /* * This structure holds the nodes of the red-black tree used to store * the directory entry in hash order. */ struct fname { __u32 hash; __u32 minor_hash; struct rb_node rb_hash; struct fname *next; __u32 inode; __u8 name_len; __u8 file_type; char name[] __counted_by(name_len); }; /* * This function implements a non-recursive way of freeing all of the * nodes in the red-black tree. */ static void free_rb_tree_fname(struct rb_root *root) { struct fname *fname, *next; rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash) while (fname) { struct fname *old = fname; fname = fname->next; kfree(old); } *root = RB_ROOT; } static void ext4_htree_init_dir_info(struct file *filp, loff_t pos) { struct dir_private_info *p = filp->private_data; if (is_dx_dir(file_inode(filp)) && !p->initialized) { p->curr_hash = pos2maj_hash(filp, pos); p->curr_minor_hash = pos2min_hash(filp, pos); p->initialized = true; } } void ext4_htree_free_dir_info(struct dir_private_info *p) { free_rb_tree_fname(&p->root); kfree(p); } /* * Given a directory entry, enter it into the fname rb tree. * * When filename encryption is enabled, the dirent will hold the * encrypted filename, while the htree will hold decrypted filename. * The decrypted filename is passed in via ent_name. parameter. */ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent, struct fscrypt_str *ent_name) { struct rb_node **p, *parent = NULL; struct fname *fname, *new_fn; struct dir_private_info *info; info = dir_file->private_data; p = &info->root.rb_node; /* Create and allocate the fname structure */ new_fn = kzalloc_flex(*new_fn, name, ent_name->len + 1); if (!new_fn) return -ENOMEM; new_fn->hash = hash; new_fn->minor_hash = minor_hash; new_fn->inode = le32_to_cpu(dirent->inode); new_fn->name_len = ent_name->len; new_fn->file_type = dirent->file_type; memcpy(new_fn->name, ent_name->name, ent_name->len); while (*p) { parent = *p; fname = rb_entry(parent, struct fname, rb_hash); /* * If the hash and minor hash match up, then we put * them on a linked list. This rarely happens... */ if ((new_fn->hash == fname->hash) && (new_fn->minor_hash == fname->minor_hash)) { new_fn->next = fname->next; fname->next = new_fn; return 0; } if (new_fn->hash < fname->hash) p = &(*p)->rb_left; else if (new_fn->hash > fname->hash) p = &(*p)->rb_right; else if (new_fn->minor_hash < fname->minor_hash) p = &(*p)->rb_left; else /* if (new_fn->minor_hash > fname->minor_hash) */ p = &(*p)->rb_right; } rb_link_node(&new_fn->rb_hash, parent, p); rb_insert_color(&new_fn->rb_hash, &info->root); return 0; } /* * This is a helper function for ext4_dx_readdir. It calls filldir * for all entries on the fname linked list. (Normally there is only * one entry on the linked list, unless there are 62 bit hash collisions.) 
*/ static int call_filldir(struct file *file, struct dir_context *ctx, struct fname *fname) { struct dir_private_info *info = file->private_data; struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; if (!fname) { ext4_msg(sb, KERN_ERR, "%s:%d: inode #%llu: comm %s: " "called with null fname?!?", __func__, __LINE__, inode->i_ino, current->comm); return 0; } ctx->pos = hash2pos(file, fname->hash, fname->minor_hash); while (fname) { if (!dir_emit(ctx, fname->name, fname->name_len, fname->inode, get_dtype(sb, fname->file_type))) { info->extra_fname = fname; return 1; } fname = fname->next; } return 0; } static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) { struct dir_private_info *info = file->private_data; struct inode *inode = file_inode(file); struct fname *fname; int ret = 0; ext4_htree_init_dir_info(file, ctx->pos); if (ctx->pos == ext4_get_htree_eof(file)) return 0; /* EOF */ /* Some one has messed with f_pos; reset the world */ if (info->last_pos != ctx->pos) { free_rb_tree_fname(&info->root); info->curr_node = NULL; info->extra_fname = NULL; info->curr_hash = pos2maj_hash(file, ctx->pos); info->curr_minor_hash = pos2min_hash(file, ctx->pos); } /* * If there are any leftover names on the hash collision * chain, return them first. */ if (info->extra_fname) { if (call_filldir(file, ctx, info->extra_fname)) goto finished; info->extra_fname = NULL; goto next_node; } else if (!info->curr_node) info->curr_node = rb_first(&info->root); while (1) { /* * Fill the rbtree if we have no more entries, * or the inode has changed since we last read in the * cached entries. */ if ((!info->curr_node) || !inode_eq_iversion(inode, info->cookie)) { info->curr_node = NULL; free_rb_tree_fname(&info->root); info->cookie = inode_query_iversion(inode); ret = ext4_htree_fill_tree(file, info->curr_hash, info->curr_minor_hash, &info->next_hash); if (ret < 0) goto finished; if (ret == 0) { ctx->pos = ext4_get_htree_eof(file); break; } info->curr_node = rb_first(&info->root); } fname = rb_entry(info->curr_node, struct fname, rb_hash); info->curr_hash = fname->hash; info->curr_minor_hash = fname->minor_hash; if (call_filldir(file, ctx, fname)) break; next_node: info->curr_node = rb_next(info->curr_node); if (info->curr_node) { fname = rb_entry(info->curr_node, struct fname, rb_hash); info->curr_hash = fname->hash; info->curr_minor_hash = fname->minor_hash; } else { if (info->next_hash == ~0) { ctx->pos = ext4_get_htree_eof(file); break; } info->curr_hash = info->next_hash; info->curr_minor_hash = 0; } } finished: info->last_pos = ctx->pos; return ret < 0 ? 
ret : 0; } static int ext4_release_dir(struct inode *inode, struct file *filp) { if (filp->private_data) ext4_htree_free_dir_info(filp->private_data); return 0; } int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size) { struct ext4_dir_entry_2 *de; int rlen; unsigned int offset = 0; char *top; de = buf; top = buf + buf_size; while ((char *) de < top) { if (ext4_check_dir_entry(dir, NULL, de, bh, buf, buf_size, offset)) return -EFSCORRUPTED; rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); de = (struct ext4_dir_entry_2 *)((char *)de + rlen); offset += rlen; } if ((char *) de > top) return -EFSCORRUPTED; return 0; } static int ext4_dir_open(struct inode *inode, struct file *file) { struct dir_private_info *info; info = kzalloc_obj(*info); if (!info) return -ENOMEM; file->private_data = info; return 0; } const struct file_operations ext4_dir_operations = { .open = ext4_dir_open, .llseek = ext4_dir_llseek, .read = generic_read_dir, .iterate_shared = ext4_readdir, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif .fsync = ext4_sync_file, .release = ext4_release_dir, .setlease = generic_setlease, }; |
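/*
 * Illustrative sketch (not part of the kernel source above): the hash2pos(),
 * pos2maj_hash() and pos2min_hash() helpers above pack the htree major/minor
 * hashes into the f_pos cookie handed back to readdir callers. The stand-alone
 * check below mirrors only the 64-bit branch and shows that the round trip
 * recovers the major hash with its low bit cleared (the bit hash2pos()
 * discards) and the minor hash exactly; the *64 helper names are made up for
 * this demonstration.
 */
#include <assert.h>
#include <stdint.h>

static uint64_t hash2pos64(uint32_t major, uint32_t minor)
{
	return ((uint64_t)(major >> 1) << 32) | minor;
}

static uint32_t pos2maj64(uint64_t pos)
{
	return ((pos >> 32) << 1) & 0xffffffff;
}

static uint32_t pos2min64(uint64_t pos)
{
	return pos & 0xffffffff;
}

int main(void)
{
	uint32_t major = 0xdeadbeef, minor = 0x12345678;
	uint64_t pos = hash2pos64(major, minor);

	assert(pos2maj64(pos) == (major & ~1u));
	assert(pos2min64(pos) == minor);
	return 0;
}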
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_BIT_SPINLOCK_H
#define __LINUX_BIT_SPINLOCK_H

#include <linux/kernel.h>
#include <linux/preempt.h>
#include <linux/atomic.h>
#include <linux/bug.h>
#include <asm/processor.h>	/* for cpu_relax() */

/*
 * For static context analysis, we need a unique token for each possible bit
 * that can be used as a bit_spinlock. The easiest way to do that is to create a
 * fake context that we can cast to with the __bitlock(bitnum, addr) macro
 * below, which will give us unique instances for each (bit, addr) pair that the
 * static analysis can use.
 */
context_lock_struct(__context_bitlock) { };
#define __bitlock(bitnum, addr) (struct __context_bitlock *)(bitnum + (addr))

/*
 *  bit-based spin_lock()
 *
 * Don't use this unless you really need to: spin_lock() and spin_unlock()
 * are significantly faster.
 */
static __always_inline void bit_spin_lock(int bitnum, unsigned long *addr)
	__acquires(__bitlock(bitnum, addr))
{
	/*
	 * Assuming the lock is uncontended, this never enters
	 * the body of the outer loop. If it is contended, then
	 * within the inner loop a non-atomic test is used to
	 * busywait with less bus contention for a good time to
	 * attempt to acquire the lock bit.
	 */
	preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
	while (unlikely(test_and_set_bit_lock(bitnum, addr))) {
		preempt_enable();
		do {
			cpu_relax();
		} while (test_bit(bitnum, addr));
		preempt_disable();
	}
#endif
	__acquire(__bitlock(bitnum, addr));
}

/*
 * Return true if it was acquired
 */
static __always_inline int bit_spin_trylock(int bitnum, unsigned long *addr)
	__cond_acquires(true, __bitlock(bitnum, addr))
{
	preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
	if (unlikely(test_and_set_bit_lock(bitnum, addr))) {
		preempt_enable();
		return 0;
	}
#endif
	__acquire(__bitlock(bitnum, addr));
	return 1;
}

/*
 *  bit-based spin_unlock()
 */
static __always_inline void bit_spin_unlock(int bitnum, unsigned long *addr)
	__releases(__bitlock(bitnum, addr))
{
#ifdef CONFIG_DEBUG_SPINLOCK
	BUG_ON(!test_bit(bitnum, addr));
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
	clear_bit_unlock(bitnum, addr);
#endif
	preempt_enable();
	__release(__bitlock(bitnum, addr));
}

/*
 *  bit-based spin_unlock()
 *  non-atomic version, which can be used eg. if the bit lock itself is
 *  protecting the rest of the flags in the word.
 */
static __always_inline void __bit_spin_unlock(int bitnum, unsigned long *addr)
	__releases(__bitlock(bitnum, addr))
{
#ifdef CONFIG_DEBUG_SPINLOCK
	BUG_ON(!test_bit(bitnum, addr));
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
	__clear_bit_unlock(bitnum, addr);
#endif
	preempt_enable();
	__release(__bitlock(bitnum, addr));
}

/*
 * Return true if the lock is held.
 */
static inline int bit_spin_is_locked(int bitnum, unsigned long *addr)
{
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
	return test_bit(bitnum, addr);
#elif defined CONFIG_PREEMPT_COUNT
	return preempt_count();
#else
	return 1;
#endif
}

#endif /* __LINUX_BIT_SPINLOCK_H */
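/*
 * Illustrative usage sketch (not part of the header above): a bit_spinlock
 * borrows one bit of an existing word as the lock, which is why there is no
 * separate lock structure to initialise. The fragment below is hypothetical:
 * bit 0 of obj->flags serves as the lock protecting the remaining flag bits,
 * so the non-atomic __bit_spin_unlock() variant suffices, per the comment on
 * that function, because every writer of obj->flags takes the bit lock first.
 */
#include <linux/bit_spinlock.h>
#include <linux/bitops.h>

#define MY_OBJ_LOCK	0	/* hypothetical lock bit */
#define MY_OBJ_DIRTY	1	/* hypothetical data bit */

struct my_obj {
	unsigned long flags;	/* bit 0 is the lock, the rest is data */
};

static void my_obj_mark_dirty(struct my_obj *obj)
{
	bit_spin_lock(MY_OBJ_LOCK, &obj->flags);
	__set_bit(MY_OBJ_DIRTY, &obj->flags);	/* protected by the bit lock */
	__bit_spin_unlock(MY_OBJ_LOCK, &obj->flags);
}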
// SPDX-License-Identifier: GPL-2.0
/*
 * usb port device code
 *
 * Copyright (C) 2012 Intel Corp
 *
 * Author: Lan Tianyu <tianyu.lan@intel.com>
 */

#include <linux/kstrtox.h>
#include <linux/slab.h>
#include <linux/string_choices.h> #include <linux/sysfs.h> #include <linux/pm_qos.h> #include <linux/component.h> #include <linux/usb/of.h> #include "hub.h" static int usb_port_block_power_off; static const struct attribute_group *port_dev_group[]; static bool usb_port_allow_power_off(struct usb_device *hdev, struct usb_hub *hub, struct usb_port *port_dev) { if (hub_is_port_power_switchable(hub)) return true; if (!IS_ENABLED(CONFIG_ACPI)) return false; return port_dev->connect_type == USB_PORT_CONNECT_TYPE_HARD_WIRED && usb_acpi_power_manageable(hdev, port_dev->portnum - 1); } static ssize_t early_stop_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_port *port_dev = to_usb_port(dev); return sysfs_emit(buf, "%s\n", str_yes_no(port_dev->early_stop)); } static ssize_t early_stop_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_port *port_dev = to_usb_port(dev); bool value; if (kstrtobool(buf, &value)) return -EINVAL; if (value) port_dev->early_stop = 1; else port_dev->early_stop = 0; return count; } static DEVICE_ATTR_RW(early_stop); static ssize_t disable_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_port *port_dev = to_usb_port(dev); struct usb_device *hdev = to_usb_device(dev->parent->parent); struct usb_hub *hub = usb_hub_to_struct_hub(hdev); struct usb_interface *intf = to_usb_interface(dev->parent); int port1 = port_dev->portnum; u16 portstatus, unused; bool disabled; int rc; struct kernfs_node *kn; if (!hub) return -ENODEV; hub_get(hub); rc = usb_autopm_get_interface(intf); if (rc < 0) goto out_hub_get; /* * Prevent deadlock if another process is concurrently * trying to unregister hdev. */ kn = sysfs_break_active_protection(&dev->kobj, &attr->attr); if (!kn) { rc = -ENODEV; goto out_autopm; } usb_lock_device(hdev); if (hub->disconnected) { rc = -ENODEV; goto out_hdev_lock; } usb_hub_port_status(hub, port1, &portstatus, &unused); disabled = !usb_port_is_power_on(hub, portstatus); out_hdev_lock: usb_unlock_device(hdev); sysfs_unbreak_active_protection(kn); out_autopm: usb_autopm_put_interface(intf); out_hub_get: hub_put(hub); if (rc) return rc; return sysfs_emit(buf, "%s\n", disabled ? "1" : "0"); } static ssize_t disable_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_port *port_dev = to_usb_port(dev); struct usb_device *hdev = to_usb_device(dev->parent->parent); struct usb_hub *hub = usb_hub_to_struct_hub(hdev); struct usb_interface *intf = to_usb_interface(dev->parent); int port1 = port_dev->portnum; bool disabled; int rc; struct kernfs_node *kn; if (!hub) return -ENODEV; rc = kstrtobool(buf, &disabled); if (rc) return rc; hub_get(hub); rc = usb_autopm_get_interface(intf); if (rc < 0) goto out_hub_get; /* * Prevent deadlock if another process is concurrently * trying to unregister hdev. 
*/ kn = sysfs_break_active_protection(&dev->kobj, &attr->attr); if (!kn) { rc = -ENODEV; goto out_autopm; } usb_lock_device(hdev); if (hub->disconnected) { rc = -ENODEV; goto out_hdev_lock; } if (disabled && port_dev->child) usb_disconnect(&port_dev->child); rc = usb_hub_set_port_power(hdev, hub, port1, !disabled); msleep(2 * hub_power_on_good_delay(hub)); if (disabled) { usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_CONNECTION); if (!port_dev->is_superspeed) usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_ENABLE); } if (!rc) rc = count; out_hdev_lock: usb_unlock_device(hdev); sysfs_unbreak_active_protection(kn); out_autopm: usb_autopm_put_interface(intf); out_hub_get: hub_put(hub); return rc; } static DEVICE_ATTR_RW(disable); static ssize_t location_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_port *port_dev = to_usb_port(dev); return sysfs_emit(buf, "0x%08x\n", port_dev->location); } static DEVICE_ATTR_RO(location); static ssize_t connect_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_port *port_dev = to_usb_port(dev); char *result; switch (port_dev->connect_type) { case USB_PORT_CONNECT_TYPE_HOT_PLUG: result = "hotplug"; break; case USB_PORT_CONNECT_TYPE_HARD_WIRED: result = "hardwired"; break; case USB_PORT_NOT_USED: result = "not used"; break; default: result = "unknown"; break; } return sysfs_emit(buf, "%s\n", result); } static DEVICE_ATTR_RO(connect_type); static ssize_t state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_port *port_dev = to_usb_port(dev); enum usb_device_state state = READ_ONCE(port_dev->state); return sysfs_emit(buf, "%s\n", usb_state_string(state)); } static DEVICE_ATTR_RO(state); static ssize_t over_current_count_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_port *port_dev = to_usb_port(dev); return sysfs_emit(buf, "%u\n", port_dev->over_current_count); } static DEVICE_ATTR_RO(over_current_count); static ssize_t quirks_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_port *port_dev = to_usb_port(dev); return sysfs_emit(buf, "%08x\n", port_dev->quirks); } static ssize_t quirks_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_port *port_dev = to_usb_port(dev); u32 value; if (kstrtou32(buf, 16, &value)) return -EINVAL; port_dev->quirks = value; return count; } static DEVICE_ATTR_RW(quirks); static ssize_t usb3_lpm_permit_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_port *port_dev = to_usb_port(dev); const char *p; if (port_dev->usb3_lpm_u1_permit) { if (port_dev->usb3_lpm_u2_permit) p = "u1_u2"; else p = "u1"; } else { if (port_dev->usb3_lpm_u2_permit) p = "u2"; else p = "0"; } return sysfs_emit(buf, "%s\n", p); } static ssize_t usb3_lpm_permit_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_port *port_dev = to_usb_port(dev); struct usb_device *udev = port_dev->child; struct usb_hcd *hcd; if (!strncmp(buf, "u1_u2", 5)) { port_dev->usb3_lpm_u1_permit = 1; port_dev->usb3_lpm_u2_permit = 1; } else if (!strncmp(buf, "u1", 2)) { port_dev->usb3_lpm_u1_permit = 1; port_dev->usb3_lpm_u2_permit = 0; } else if (!strncmp(buf, "u2", 2)) { port_dev->usb3_lpm_u1_permit = 0; port_dev->usb3_lpm_u2_permit = 1; } else if (!strncmp(buf, "0", 1)) { port_dev->usb3_lpm_u1_permit = 0; port_dev->usb3_lpm_u2_permit = 0; } else return -EINVAL; /* If device is connected to 
the port, disable or enable lpm * to make new u1 u2 setting take effect immediately. */ if (udev) { hcd = bus_to_hcd(udev->bus); if (!hcd) return -EINVAL; usb_lock_device(udev); mutex_lock(hcd->bandwidth_mutex); if (!usb_disable_lpm(udev)) usb_enable_lpm(udev); mutex_unlock(hcd->bandwidth_mutex); usb_unlock_device(udev); } return count; } static DEVICE_ATTR_RW(usb3_lpm_permit); static struct attribute *port_dev_attrs[] = { &dev_attr_connect_type.attr, &dev_attr_state.attr, &dev_attr_location.attr, &dev_attr_quirks.attr, &dev_attr_over_current_count.attr, &dev_attr_disable.attr, &dev_attr_early_stop.attr, NULL, }; static const struct attribute_group port_dev_attr_grp = { .attrs = port_dev_attrs, }; static const struct attribute_group *port_dev_group[] = { &port_dev_attr_grp, NULL, }; static struct attribute *port_dev_usb3_attrs[] = { &dev_attr_usb3_lpm_permit.attr, NULL, }; static const struct attribute_group port_dev_usb3_attr_grp = { .attrs = port_dev_usb3_attrs, }; static const struct attribute_group *port_dev_usb3_group[] = { &port_dev_attr_grp, &port_dev_usb3_attr_grp, NULL, }; static void usb_port_device_release(struct device *dev) { struct usb_port *port_dev = to_usb_port(dev); kfree(port_dev->req); kfree(port_dev); } #ifdef CONFIG_PM static int usb_port_runtime_resume(struct device *dev) { struct usb_port *port_dev = to_usb_port(dev); struct usb_device *hdev = to_usb_device(dev->parent->parent); struct usb_interface *intf = to_usb_interface(dev->parent); struct usb_hub *hub = usb_hub_to_struct_hub(hdev); struct usb_device *udev = port_dev->child; struct usb_port *peer = port_dev->peer; int port1 = port_dev->portnum; int retval; if (!hub) return -EINVAL; if (hub->in_reset) { set_bit(port1, hub->power_bits); return 0; } /* * Power on our usb3 peer before this usb2 port to prevent a usb3 * device from degrading to its usb2 connection */ if (!port_dev->is_superspeed && peer) pm_runtime_get_sync(&peer->dev); retval = usb_autopm_get_interface(intf); if (retval < 0) return retval; retval = usb_hub_set_port_power(hdev, hub, port1, true); msleep(hub_power_on_good_delay(hub)); if (udev && !retval) { /* * Our preference is to simply wait for the port to reconnect, * as that is the lowest latency method to restart the port. * However, there are cases where toggling port power results in * the host port and the device port getting out of sync causing * a link training live lock. Upon timeout, flag the port as * needing warm reset recovery (to be performed later by * usb_port_resume() as requested via usb_wakeup_notification()) */ if (hub_port_debounce_be_connected(hub, port1) < 0) { dev_dbg(&port_dev->dev, "reconnect timeout\n"); if (hub_is_superspeed(hdev)) set_bit(port1, hub->warm_reset_bits); } /* Force the child awake to revalidate after the power loss. 
*/ if (!test_and_set_bit(port1, hub->child_usage_bits)) { pm_runtime_get_noresume(&port_dev->dev); pm_request_resume(&udev->dev); } } usb_autopm_put_interface(intf); return retval; } static int usb_port_runtime_suspend(struct device *dev) { struct usb_port *port_dev = to_usb_port(dev); struct usb_device *hdev = to_usb_device(dev->parent->parent); struct usb_interface *intf = to_usb_interface(dev->parent); struct usb_hub *hub = usb_hub_to_struct_hub(hdev); struct usb_port *peer = port_dev->peer; int port1 = port_dev->portnum; int retval; if (!hub) return -EINVAL; if (hub->in_reset) return -EBUSY; if (dev_pm_qos_flags(&port_dev->dev, PM_QOS_FLAG_NO_POWER_OFF) == PM_QOS_FLAGS_ALL) return -EAGAIN; if (usb_port_block_power_off) return -EBUSY; retval = usb_autopm_get_interface(intf); if (retval < 0) return retval; retval = usb_hub_set_port_power(hdev, hub, port1, false); usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_CONNECTION); if (!port_dev->is_superspeed) usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_ENABLE); usb_autopm_put_interface(intf); /* * Our peer usb3 port may now be able to suspend, so * asynchronously queue a suspend request to observe that this * usb2 port is now off. */ if (!port_dev->is_superspeed && peer) pm_runtime_put(&peer->dev); return retval; } #endif static void usb_port_shutdown(struct device *dev) { struct usb_port *port_dev = to_usb_port(dev); struct usb_device *udev = port_dev->child; if (udev && !udev->port_is_suspended) { usb_disable_usb2_hardware_lpm(udev); usb_unlocked_disable_lpm(udev); } } static const struct dev_pm_ops usb_port_pm_ops = { #ifdef CONFIG_PM .runtime_suspend = usb_port_runtime_suspend, .runtime_resume = usb_port_runtime_resume, #endif }; const struct device_type usb_port_device_type = { .name = "usb_port", .release = usb_port_device_release, .pm = &usb_port_pm_ops, }; static struct device_driver usb_port_driver = { .name = "usb", .owner = THIS_MODULE, .shutdown = usb_port_shutdown, }; static int link_peers(struct usb_port *left, struct usb_port *right) { struct usb_port *ss_port, *hs_port; int rc; if (left->peer == right && right->peer == left) return 0; if (left->peer || right->peer) { struct usb_port *lpeer = left->peer; struct usb_port *rpeer = right->peer; char *method; if (left->location && left->location == right->location) method = "location"; else method = "default"; pr_debug("usb: failed to peer %s and %s by %s (%s:%s) (%s:%s)\n", dev_name(&left->dev), dev_name(&right->dev), method, dev_name(&left->dev), lpeer ? dev_name(&lpeer->dev) : "none", dev_name(&right->dev), rpeer ? dev_name(&rpeer->dev) : "none"); return -EBUSY; } rc = sysfs_create_link(&left->dev.kobj, &right->dev.kobj, "peer"); if (rc) return rc; rc = sysfs_create_link(&right->dev.kobj, &left->dev.kobj, "peer"); if (rc) { sysfs_remove_link(&left->dev.kobj, "peer"); return rc; } /* * We need to wake the HiSpeed port to make sure we don't race * setting ->peer with usb_port_runtime_suspend(). Otherwise we * may miss a suspend event for the SuperSpeed port. */ if (left->is_superspeed) { ss_port = left; WARN_ON(right->is_superspeed); hs_port = right; } else { ss_port = right; WARN_ON(!right->is_superspeed); hs_port = left; } pm_runtime_get_sync(&hs_port->dev); left->peer = right; right->peer = left; /* * The SuperSpeed reference is dropped when the HiSpeed port in * this relationship suspends, i.e. when it is safe to allow a * SuperSpeed connection to drop since there is no risk of a * device degrading to its powered-off HiSpeed connection. 
* * Also, drop the HiSpeed ref taken above. */ pm_runtime_get_sync(&ss_port->dev); pm_runtime_put(&hs_port->dev); return 0; } static void link_peers_report(struct usb_port *left, struct usb_port *right) { int rc; rc = link_peers(left, right); if (rc == 0) { dev_dbg(&left->dev, "peered to %s\n", dev_name(&right->dev)); } else { dev_dbg(&left->dev, "failed to peer to %s (%d)\n", dev_name(&right->dev), rc); pr_warn_once("usb: port power management may be unreliable\n"); usb_port_block_power_off = 1; } } static void unlink_peers(struct usb_port *left, struct usb_port *right) { struct usb_port *ss_port, *hs_port; WARN(right->peer != left || left->peer != right, "%s and %s are not peers?\n", dev_name(&left->dev), dev_name(&right->dev)); /* * We wake the HiSpeed port to make sure we don't race its * usb_port_runtime_resume() event which takes a SuperSpeed ref * when ->peer is !NULL. */ if (left->is_superspeed) { ss_port = left; hs_port = right; } else { ss_port = right; hs_port = left; } pm_runtime_get_sync(&hs_port->dev); sysfs_remove_link(&left->dev.kobj, "peer"); right->peer = NULL; sysfs_remove_link(&right->dev.kobj, "peer"); left->peer = NULL; /* Drop the SuperSpeed ref held on behalf of the active HiSpeed port */ pm_runtime_put(&ss_port->dev); /* Drop the ref taken above */ pm_runtime_put(&hs_port->dev); } /* * For each usb hub device in the system check to see if it is in the * peer domain of the given port_dev, and if it is check to see if it * has a port that matches the given port by location */ static int match_location(struct usb_device *peer_hdev, void *p) { int port1; struct usb_hcd *hcd, *peer_hcd; struct usb_port *port_dev = p, *peer; struct usb_hub *peer_hub = usb_hub_to_struct_hub(peer_hdev); struct usb_device *hdev = to_usb_device(port_dev->dev.parent->parent); if (!peer_hub || port_dev->connect_type == USB_PORT_NOT_USED) return 0; hcd = bus_to_hcd(hdev->bus); peer_hcd = bus_to_hcd(peer_hdev->bus); /* peer_hcd is provisional until we verify it against the known peer */ if (peer_hcd != hcd->shared_hcd) return 0; for (port1 = 1; port1 <= peer_hdev->maxchild; port1++) { peer = peer_hub->ports[port1 - 1]; if (peer && peer->connect_type != USB_PORT_NOT_USED && peer->location == port_dev->location) { link_peers_report(port_dev, peer); return 1; /* done */ } } return 0; } /* * Find the peer port either via explicit platform firmware "location" * data, the peer hcd for root hubs, or the upstream peer relationship * for all other hubs. 
*/ static void find_and_link_peer(struct usb_hub *hub, int port1) { struct usb_port *port_dev = hub->ports[port1 - 1], *peer; struct usb_device *hdev = hub->hdev; struct usb_device *peer_hdev; struct usb_hub *peer_hub; /* * If location data is available then we can only peer this port * by a location match, not the default peer (lest we create a * situation where we need to go back and undo a default peering * when the port is later peered by location data) */ if (port_dev->location) { /* we link the peer in match_location() if found */ usb_for_each_dev(port_dev, match_location); return; } else if (!hdev->parent) { struct usb_hcd *hcd = bus_to_hcd(hdev->bus); struct usb_hcd *peer_hcd = hcd->shared_hcd; if (!peer_hcd) return; peer_hdev = peer_hcd->self.root_hub; } else { struct usb_port *upstream; struct usb_device *parent = hdev->parent; struct usb_hub *parent_hub = usb_hub_to_struct_hub(parent); if (!parent_hub) return; upstream = parent_hub->ports[hdev->portnum - 1]; if (!upstream || !upstream->peer) return; peer_hdev = upstream->peer->child; } peer_hub = usb_hub_to_struct_hub(peer_hdev); if (!peer_hub || port1 > peer_hdev->maxchild) return; /* * we found a valid default peer, last check is to make sure it * does not have location data */ peer = peer_hub->ports[port1 - 1]; if (peer && peer->location == 0) link_peers_report(port_dev, peer); } static int connector_bind(struct device *dev, struct device *connector, void *data) { struct usb_port *port_dev = to_usb_port(dev); int ret; ret = sysfs_create_link(&dev->kobj, &connector->kobj, "connector"); if (ret) return ret; ret = sysfs_create_link(&connector->kobj, &dev->kobj, dev_name(dev)); if (ret) { sysfs_remove_link(&dev->kobj, "connector"); return ret; } port_dev->connector = data; /* * If there is already USB device connected to the port, letting the * Type-C connector know about it immediately. 
*/ if (port_dev->child) typec_attach(port_dev->connector, &port_dev->child->dev); return 0; } static void connector_unbind(struct device *dev, struct device *connector, void *data) { struct usb_port *port_dev = to_usb_port(dev); sysfs_remove_link(&connector->kobj, dev_name(dev)); sysfs_remove_link(&dev->kobj, "connector"); port_dev->connector = NULL; } static const struct component_ops connector_ops = { .bind = connector_bind, .unbind = connector_unbind, }; int usb_hub_create_port_device(struct usb_hub *hub, int port1) { struct usb_port *port_dev; struct usb_device *hdev = hub->hdev; int retval; port_dev = kzalloc_obj(*port_dev); if (!port_dev) return -ENOMEM; port_dev->req = kzalloc_obj(*(port_dev->req)); if (!port_dev->req) { kfree(port_dev); return -ENOMEM; } port_dev->connect_type = usb_of_get_connect_type(hdev, port1); hub->ports[port1 - 1] = port_dev; port_dev->portnum = port1; set_bit(port1, hub->power_bits); port_dev->dev.parent = hub->intfdev; if (hub_is_superspeed(hdev)) { port_dev->is_superspeed = 1; port_dev->usb3_lpm_u1_permit = 1; port_dev->usb3_lpm_u2_permit = 1; port_dev->dev.groups = port_dev_usb3_group; } else port_dev->dev.groups = port_dev_group; port_dev->dev.type = &usb_port_device_type; port_dev->dev.driver = &usb_port_driver; dev_set_name(&port_dev->dev, "%s-port%d", dev_name(&hub->hdev->dev), port1); mutex_init(&port_dev->status_lock); retval = device_register(&port_dev->dev); if (retval) { put_device(&port_dev->dev); return retval; } port_dev->state_kn = sysfs_get_dirent(port_dev->dev.kobj.sd, "state"); if (!port_dev->state_kn) { dev_err(&port_dev->dev, "failed to sysfs_get_dirent 'state'\n"); retval = -ENODEV; goto err_unregister; } /* Set default policy of port-poweroff disabled. */ retval = dev_pm_qos_add_request(&port_dev->dev, port_dev->req, DEV_PM_QOS_FLAGS, PM_QOS_FLAG_NO_POWER_OFF); if (retval < 0) { goto err_put_kn; } retval = component_add(&port_dev->dev, &connector_ops); if (retval) { dev_warn(&port_dev->dev, "failed to add component\n"); goto err_put_kn; } find_and_link_peer(hub, port1); /* * Enable runtime pm and hold a refernce that hub_configure() * will drop once the PM_QOS_NO_POWER_OFF flag state has been set * and the hub has been fully registered (hdev->maxchild set). */ pm_runtime_set_active(&port_dev->dev); pm_runtime_get_noresume(&port_dev->dev); pm_runtime_enable(&port_dev->dev); device_enable_async_suspend(&port_dev->dev); /* * Keep hidden the ability to enable port-poweroff if neither the * USB hub nor platform firmware can manage downstream port power. */ if (!usb_port_allow_power_off(hdev, hub, port_dev)) return 0; /* Attempt to let userspace take over the policy. */ retval = dev_pm_qos_expose_flags(&port_dev->dev, PM_QOS_FLAG_NO_POWER_OFF); if (retval < 0) { dev_warn(&port_dev->dev, "failed to expose pm_qos_no_poweroff\n"); return 0; } /* Userspace owns the policy, drop the kernel 'no_poweroff' request. */ retval = dev_pm_qos_remove_request(port_dev->req); if (retval >= 0) { kfree(port_dev->req); port_dev->req = NULL; } return 0; err_put_kn: sysfs_put(port_dev->state_kn); err_unregister: device_unregister(&port_dev->dev); return retval; } void usb_hub_remove_port_device(struct usb_hub *hub, int port1) { struct usb_port *port_dev = hub->ports[port1 - 1]; struct usb_port *peer; peer = port_dev->peer; if (peer) unlink_peers(port_dev, peer); component_del(&port_dev->dev, &connector_ops); sysfs_put(port_dev->state_kn); device_unregister(&port_dev->dev); } |
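/*
 * Illustrative sketch (not part of the driver above): usb3_lpm_permit_store()
 * matches the written value with strncmp() against "u1_u2", "u1", "u2" and
 * "0", longest token first. Because strncmp(buf, "u1", 2) would also match
 * "u1_u2", the ordering is what keeps the settings distinct. The stand-alone
 * check below reproduces just that parsing decision; the lpm_policy type and
 * parse_lpm_permit() helper are made up for the demonstration.
 */
#include <assert.h>
#include <string.h>

enum lpm_policy { LPM_NONE, LPM_U1, LPM_U2, LPM_U1_U2, LPM_INVALID };

static enum lpm_policy parse_lpm_permit(const char *buf)
{
	if (!strncmp(buf, "u1_u2", 5))	/* must be tested before "u1"/"u2" */
		return LPM_U1_U2;
	if (!strncmp(buf, "u1", 2))
		return LPM_U1;
	if (!strncmp(buf, "u2", 2))
		return LPM_U2;
	if (!strncmp(buf, "0", 1))
		return LPM_NONE;
	return LPM_INVALID;
}

int main(void)
{
	assert(parse_lpm_permit("u1_u2\n") == LPM_U1_U2);
	assert(parse_lpm_permit("u1\n") == LPM_U1);
	assert(parse_lpm_permit("u2\n") == LPM_U2);
	assert(parse_lpm_permit("0\n") == LPM_NONE);
	assert(parse_lpm_permit("bogus") == LPM_INVALID);
	return 0;
}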
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/hfsplus/bnode.c
 *
 * Copyright (C) 2001
 * Brad Boyer (flar@allandria.com)
 * (C) 2003 Ardis Technologies <roman@ardistech.com>
 *
 * Handle basic btree node operations
 */

#include <linux/string.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/fs.h>
#include <linux/swap.h>

#include "hfsplus_fs.h"
#include "hfsplus_raw.h"

/* Copy a specified range of bytes from the raw data of a node */
void hfs_bnode_read(struct hfs_bnode *node, void *buf, u32 off, u32 len)
{
	struct page **pagep;
	u32 l;

	if (!is_bnode_offset_valid(node, off))
		return;

	if (len == 0) {
		pr_err("requested zero length: "
		       "NODE: id %u, type %#x, height %u, "
		       "node_size %u, offset %u, len %u\n",
		       node->this, node->type,
node->height, node->tree->node_size, off, len); return; } len = check_and_correct_requested_length(node, off, len); off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); off &= ~PAGE_MASK; l = min_t(u32, len, PAGE_SIZE - off); memcpy_from_page(buf, *pagep, off, l); while ((len -= l) != 0) { buf += l; l = min_t(u32, len, PAGE_SIZE); memcpy_from_page(buf, *++pagep, 0, l); } } u16 hfs_bnode_read_u16(struct hfs_bnode *node, u32 off) { __be16 data; /* TODO: optimize later... */ hfs_bnode_read(node, &data, off, 2); return be16_to_cpu(data); } u8 hfs_bnode_read_u8(struct hfs_bnode *node, u32 off) { u8 data; /* TODO: optimize later... */ hfs_bnode_read(node, &data, off, 1); return data; } void hfs_bnode_read_key(struct hfs_bnode *node, void *key, u32 off) { struct hfs_btree *tree; u32 key_len; tree = node->tree; if (node->type == HFS_NODE_LEAF || tree->attributes & HFS_TREE_VARIDXKEYS || node->tree->cnid == HFSPLUS_ATTR_CNID) key_len = hfs_bnode_read_u16(node, off) + 2; else key_len = tree->max_key_len + 2; if (key_len > sizeof(hfsplus_btree_key) || key_len < 1) { memset(key, 0, sizeof(hfsplus_btree_key)); pr_err("hfsplus: Invalid key length: %u\n", key_len); return; } hfs_bnode_read(node, key, off, key_len); } void hfs_bnode_write(struct hfs_bnode *node, void *buf, u32 off, u32 len) { struct page **pagep; u32 l; if (!is_bnode_offset_valid(node, off)) return; if (len == 0) { pr_err("requested zero length: " "NODE: id %u, type %#x, height %u, " "node_size %u, offset %u, len %u\n", node->this, node->type, node->height, node->tree->node_size, off, len); return; } len = check_and_correct_requested_length(node, off, len); off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); off &= ~PAGE_MASK; l = min_t(u32, len, PAGE_SIZE - off); memcpy_to_page(*pagep, off, buf, l); set_page_dirty(*pagep); while ((len -= l) != 0) { buf += l; l = min_t(u32, len, PAGE_SIZE); memcpy_to_page(*++pagep, 0, buf, l); set_page_dirty(*pagep); } } void hfs_bnode_write_u16(struct hfs_bnode *node, u32 off, u16 data) { __be16 v = cpu_to_be16(data); /* TODO: optimize later... 
*/ hfs_bnode_write(node, &v, off, 2); } void hfs_bnode_clear(struct hfs_bnode *node, u32 off, u32 len) { struct page **pagep; u32 l; if (!is_bnode_offset_valid(node, off)) return; if (len == 0) { pr_err("requested zero length: " "NODE: id %u, type %#x, height %u, " "node_size %u, offset %u, len %u\n", node->this, node->type, node->height, node->tree->node_size, off, len); return; } len = check_and_correct_requested_length(node, off, len); off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); off &= ~PAGE_MASK; l = min_t(u32, len, PAGE_SIZE - off); memzero_page(*pagep, off, l); set_page_dirty(*pagep); while ((len -= l) != 0) { l = min_t(u32, len, PAGE_SIZE); memzero_page(*++pagep, 0, l); set_page_dirty(*pagep); } } void hfs_bnode_copy(struct hfs_bnode *dst_node, u32 dst, struct hfs_bnode *src_node, u32 src, u32 len) { struct page **src_page, **dst_page; u32 l; hfs_dbg("dst %u, src %u, len %u\n", dst, src, len); if (!len) return; len = check_and_correct_requested_length(src_node, src, len); len = check_and_correct_requested_length(dst_node, dst, len); src += src_node->page_offset; dst += dst_node->page_offset; src_page = src_node->page + (src >> PAGE_SHIFT); src &= ~PAGE_MASK; dst_page = dst_node->page + (dst >> PAGE_SHIFT); dst &= ~PAGE_MASK; if (src == dst) { l = min_t(u32, len, PAGE_SIZE - src); memcpy_page(*dst_page, src, *src_page, src, l); set_page_dirty(*dst_page); while ((len -= l) != 0) { l = min_t(u32, len, PAGE_SIZE); memcpy_page(*++dst_page, 0, *++src_page, 0, l); set_page_dirty(*dst_page); } } else { void *src_ptr, *dst_ptr; do { dst_ptr = kmap_local_page(*dst_page) + dst; src_ptr = kmap_local_page(*src_page) + src; if (PAGE_SIZE - src < PAGE_SIZE - dst) { l = PAGE_SIZE - src; src = 0; dst += l; } else { l = PAGE_SIZE - dst; src += l; dst = 0; } l = min(len, l); memcpy(dst_ptr, src_ptr, l); kunmap_local(src_ptr); set_page_dirty(*dst_page); kunmap_local(dst_ptr); if (!dst) dst_page++; else src_page++; } while ((len -= l)); } } void hfs_bnode_move(struct hfs_bnode *node, u32 dst, u32 src, u32 len) { struct page **src_page, **dst_page; void *src_ptr, *dst_ptr; u32 l; hfs_dbg("dst %u, src %u, len %u\n", dst, src, len); if (!len) return; len = check_and_correct_requested_length(node, src, len); len = check_and_correct_requested_length(node, dst, len); src += node->page_offset; dst += node->page_offset; if (dst > src) { src += len - 1; src_page = node->page + (src >> PAGE_SHIFT); src = (src & ~PAGE_MASK) + 1; dst += len - 1; dst_page = node->page + (dst >> PAGE_SHIFT); dst = (dst & ~PAGE_MASK) + 1; if (src == dst) { while (src < len) { dst_ptr = kmap_local_page(*dst_page); src_ptr = kmap_local_page(*src_page); memmove(dst_ptr, src_ptr, src); kunmap_local(src_ptr); set_page_dirty(*dst_page); kunmap_local(dst_ptr); len -= src; src = PAGE_SIZE; src_page--; dst_page--; } src -= len; dst_ptr = kmap_local_page(*dst_page); src_ptr = kmap_local_page(*src_page); memmove(dst_ptr + src, src_ptr + src, len); kunmap_local(src_ptr); set_page_dirty(*dst_page); kunmap_local(dst_ptr); } else { do { dst_ptr = kmap_local_page(*dst_page) + dst; src_ptr = kmap_local_page(*src_page) + src; if (src < dst) { l = src; src = PAGE_SIZE; dst -= l; } else { l = dst; src -= l; dst = PAGE_SIZE; } l = min(len, l); memmove(dst_ptr - l, src_ptr - l, l); kunmap_local(src_ptr); set_page_dirty(*dst_page); kunmap_local(dst_ptr); if (dst == PAGE_SIZE) dst_page--; else src_page--; } while ((len -= l)); } } else { src_page = node->page + (src >> PAGE_SHIFT); src &= ~PAGE_MASK; dst_page = node->page + (dst >> 
PAGE_SHIFT); dst &= ~PAGE_MASK; if (src == dst) { l = min_t(u32, len, PAGE_SIZE - src); dst_ptr = kmap_local_page(*dst_page) + src; src_ptr = kmap_local_page(*src_page) + src; memmove(dst_ptr, src_ptr, l); kunmap_local(src_ptr); set_page_dirty(*dst_page); kunmap_local(dst_ptr); while ((len -= l) != 0) { l = min_t(u32, len, PAGE_SIZE); dst_ptr = kmap_local_page(*++dst_page); src_ptr = kmap_local_page(*++src_page); memmove(dst_ptr, src_ptr, l); kunmap_local(src_ptr); set_page_dirty(*dst_page); kunmap_local(dst_ptr); } } else { do { dst_ptr = kmap_local_page(*dst_page) + dst; src_ptr = kmap_local_page(*src_page) + src; if (PAGE_SIZE - src < PAGE_SIZE - dst) { l = PAGE_SIZE - src; src = 0; dst += l; } else { l = PAGE_SIZE - dst; src += l; dst = 0; } l = min(len, l); memmove(dst_ptr, src_ptr, l); kunmap_local(src_ptr); set_page_dirty(*dst_page); kunmap_local(dst_ptr); if (!dst) dst_page++; else src_page++; } while ((len -= l)); } } } void hfs_bnode_dump(struct hfs_bnode *node) { struct hfs_bnode_desc desc; __be32 cnid; int i, off, key_off; hfs_dbg("node %d\n", node->this); hfs_bnode_read(node, &desc, 0, sizeof(desc)); hfs_dbg("next %d, prev %d, type %d, height %d, num_recs %d\n", be32_to_cpu(desc.next), be32_to_cpu(desc.prev), desc.type, desc.height, be16_to_cpu(desc.num_recs)); off = node->tree->node_size - 2; for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) { key_off = hfs_bnode_read_u16(node, off); hfs_dbg(" key_off %d", key_off); if (i && node->type == HFS_NODE_INDEX) { int tmp; if (node->tree->attributes & HFS_TREE_VARIDXKEYS || node->tree->cnid == HFSPLUS_ATTR_CNID) tmp = hfs_bnode_read_u16(node, key_off) + 2; else tmp = node->tree->max_key_len + 2; hfs_dbg(" (%d", tmp); hfs_bnode_read(node, &cnid, key_off + tmp, 4); hfs_dbg(", cnid %d)", be32_to_cpu(cnid)); } else if (i && node->type == HFS_NODE_LEAF) { int tmp; tmp = hfs_bnode_read_u16(node, key_off); hfs_dbg(" (%d)", tmp); } } hfs_dbg("\n"); } void hfs_bnode_unlink(struct hfs_bnode *node) { struct hfs_btree *tree; struct hfs_bnode *tmp; __be32 cnid; tree = node->tree; if (node->prev) { tmp = hfs_bnode_find(tree, node->prev); if (IS_ERR(tmp)) return; tmp->next = node->next; cnid = cpu_to_be32(tmp->next); hfs_bnode_write(tmp, &cnid, offsetof(struct hfs_bnode_desc, next), 4); hfs_bnode_put(tmp); } else if (node->type == HFS_NODE_LEAF) tree->leaf_head = node->next; if (node->next) { tmp = hfs_bnode_find(tree, node->next); if (IS_ERR(tmp)) return; tmp->prev = node->prev; cnid = cpu_to_be32(tmp->prev); hfs_bnode_write(tmp, &cnid, offsetof(struct hfs_bnode_desc, prev), 4); hfs_bnode_put(tmp); } else if (node->type == HFS_NODE_LEAF) tree->leaf_tail = node->prev; /* move down? 
*/ if (!node->prev && !node->next) hfs_dbg("btree delete level\n"); if (!node->parent) { tree->root = 0; tree->depth = 0; } spin_lock(&tree->hash_lock); set_bit(HFS_BNODE_DELETED, &node->flags); spin_unlock(&tree->hash_lock); } static inline int hfs_bnode_hash(u32 num) { num = (num >> 16) + num; num += num >> 8; return num & (NODE_HASH_SIZE - 1); } struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) { struct hfs_bnode *node; if (cnid >= tree->node_count) { pr_err("request for non-existent node %d in B*Tree\n", cnid); return NULL; } for (node = tree->node_hash[hfs_bnode_hash(cnid)]; node; node = node->next_hash) if (node->this == cnid) return node; return NULL; } static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) { struct hfs_bnode *node, *node2; struct address_space *mapping; struct page *page; int size, block, i, hash; loff_t off; if (cnid >= tree->node_count) { pr_err("request for non-existent node %d in B*Tree\n", cnid); return NULL; } size = sizeof(struct hfs_bnode) + tree->pages_per_bnode * sizeof(struct page *); node = kzalloc(size, GFP_KERNEL); if (!node) return NULL; node->tree = tree; node->this = cnid; set_bit(HFS_BNODE_NEW, &node->flags); atomic_set(&node->refcnt, 1); hfs_dbg("cnid %d, node %d, refcnt 1\n", node->tree->cnid, node->this); init_waitqueue_head(&node->lock_wq); spin_lock(&tree->hash_lock); node2 = hfs_bnode_findhash(tree, cnid); if (!node2) { hash = hfs_bnode_hash(cnid); node->next_hash = tree->node_hash[hash]; tree->node_hash[hash] = node; tree->node_hash_cnt++; } else { hfs_bnode_get(node2); spin_unlock(&tree->hash_lock); kfree(node); wait_event(node2->lock_wq, !test_bit(HFS_BNODE_NEW, &node2->flags)); return node2; } spin_unlock(&tree->hash_lock); mapping = tree->inode->i_mapping; off = (loff_t)cnid << tree->node_size_shift; block = off >> PAGE_SHIFT; node->page_offset = off & ~PAGE_MASK; for (i = 0; i < tree->pages_per_bnode; block++, i++) { page = read_mapping_page(mapping, block, NULL); if (IS_ERR(page)) goto fail; node->page[i] = page; } return node; fail: set_bit(HFS_BNODE_ERROR, &node->flags); return node; } void hfs_bnode_unhash(struct hfs_bnode *node) { struct hfs_bnode **p; hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; *p && *p != node; p = &(*p)->next_hash) ; BUG_ON(!*p); *p = node->next_hash; node->tree->node_hash_cnt--; } /* Load a particular node out of a tree */ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num) { struct hfs_bnode *node; struct hfs_bnode_desc *desc; int i, rec_off, off, next_off; int entry_size, key_size; spin_lock(&tree->hash_lock); node = hfs_bnode_findhash(tree, num); if (node) { hfs_bnode_get(node); spin_unlock(&tree->hash_lock); wait_event(node->lock_wq, !test_bit(HFS_BNODE_NEW, &node->flags)); if (test_bit(HFS_BNODE_ERROR, &node->flags)) goto node_error; return node; } spin_unlock(&tree->hash_lock); node = __hfs_bnode_create(tree, num); if (!node) return ERR_PTR(-ENOMEM); if (test_bit(HFS_BNODE_ERROR, &node->flags)) goto node_error; if (!test_bit(HFS_BNODE_NEW, &node->flags)) return node; desc = (struct hfs_bnode_desc *)(kmap_local_page(node->page[0]) + node->page_offset); node->prev = be32_to_cpu(desc->prev); node->next = be32_to_cpu(desc->next); node->num_recs = be16_to_cpu(desc->num_recs); node->type = desc->type; node->height = desc->height; kunmap_local(desc); switch (node->type) { case HFS_NODE_HEADER: case HFS_NODE_MAP: if (node->height != 0) goto 
node_error; break; case HFS_NODE_LEAF: if (node->height != 1) goto node_error; break; case HFS_NODE_INDEX: if (node->height <= 1 || node->height > tree->depth) goto node_error; break; default: goto node_error; } rec_off = tree->node_size - 2; off = hfs_bnode_read_u16(node, rec_off); if (off != sizeof(struct hfs_bnode_desc)) goto node_error; for (i = 1; i <= node->num_recs; off = next_off, i++) { rec_off -= 2; next_off = hfs_bnode_read_u16(node, rec_off); if (next_off <= off || next_off > tree->node_size || next_off & 1) goto node_error; entry_size = next_off - off; if (node->type != HFS_NODE_INDEX && node->type != HFS_NODE_LEAF) continue; key_size = hfs_bnode_read_u16(node, off) + 2; if (key_size >= entry_size || key_size & 1) goto node_error; } clear_bit(HFS_BNODE_NEW, &node->flags); wake_up(&node->lock_wq); return node; node_error: set_bit(HFS_BNODE_ERROR, &node->flags); clear_bit(HFS_BNODE_NEW, &node->flags); wake_up(&node->lock_wq); hfs_bnode_put(node); return ERR_PTR(-EIO); } void hfs_bnode_free(struct hfs_bnode *node) { int i; for (i = 0; i < node->tree->pages_per_bnode; i++) if (node->page[i]) put_page(node->page[i]); kfree(node); } struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) { struct hfs_bnode *node; struct page **pagep; int i; spin_lock(&tree->hash_lock); node = hfs_bnode_findhash(tree, num); spin_unlock(&tree->hash_lock); if (node) { pr_crit("new node %u already hashed?\n", num); WARN_ON(1); return ERR_PTR(-EEXIST); } node = __hfs_bnode_create(tree, num); if (!node) return ERR_PTR(-ENOMEM); if (test_bit(HFS_BNODE_ERROR, &node->flags)) { hfs_bnode_put(node); return ERR_PTR(-EIO); } pagep = node->page; memzero_page(*pagep, node->page_offset, min_t(int, PAGE_SIZE, tree->node_size)); set_page_dirty(*pagep); for (i = 1; i < tree->pages_per_bnode; i++) { memzero_page(*++pagep, 0, PAGE_SIZE); set_page_dirty(*pagep); } clear_bit(HFS_BNODE_NEW, &node->flags); wake_up(&node->lock_wq); return node; } void hfs_bnode_get(struct hfs_bnode *node) { if (node) { atomic_inc(&node->refcnt); hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); } } /* Dispose of resources used by a node */ void hfs_bnode_put(struct hfs_bnode *node) { if (node) { struct hfs_btree *tree = node->tree; int i; hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); BUG_ON(!atomic_read(&node->refcnt)); if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock)) return; for (i = 0; i < tree->pages_per_bnode; i++) { if (!node->page[i]) continue; mark_page_accessed(node->page[i]); } if (test_bit(HFS_BNODE_DELETED, &node->flags)) { hfs_bnode_unhash(node); spin_unlock(&tree->hash_lock); if (hfs_bnode_need_zeroout(tree)) hfs_bnode_clear(node, 0, tree->node_size); hfs_bmap_free(node); hfs_bnode_free(node); return; } spin_unlock(&tree->hash_lock); } } /* * Unused nodes have to be zeroed if this is the catalog tree and * a corresponding flag in the volume header is set. */ bool hfs_bnode_need_zeroout(struct hfs_btree *tree) { struct super_block *sb = tree->inode->i_sb; struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); const u32 volume_attr = be32_to_cpu(sbi->s_vhdr->attributes); return volume_attr & HFSPLUS_VOL_UNUSED_NODE_FIX; } |
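/*
 * Editorial sketch (not part of the hfsplus sources above): the read,
 * write, and clear helpers all share one page-splitting pattern - the
 * byte offset inside the node is turned into a page index plus an
 * in-page offset, the first copy is bounded by the end of that page,
 * and every later copy starts at offset 0 of the next page.  The
 * standalone function below reproduces only that arithmetic in plain
 * userspace C; the 4 KiB page size, the uint8_t page array and the
 * name sketch_node_read() are assumptions for illustration, not
 * kernel API.
 */
#include <stdint.h>
#include <string.h>

#define SKETCH_PAGE_SIZE        4096u
#define SKETCH_PAGE_MASK        (SKETCH_PAGE_SIZE - 1)

/* Copy 'len' (> 0) bytes from byte offset 'off' of a node whose data
 * starts 'page_offset' bytes into the first of several page buffers. */
static void sketch_node_read(uint8_t **pages, uint32_t page_offset,
                             void *buf, uint32_t off, uint32_t len)
{
        uint8_t *dst = buf;
        uint32_t l;

        off += page_offset;
        pages += off / SKETCH_PAGE_SIZE;        /* page holding 'off' */
        off &= SKETCH_PAGE_MASK;                /* offset within that page */

        l = len < SKETCH_PAGE_SIZE - off ? len : SKETCH_PAGE_SIZE - off;
        memcpy(dst, *pages + off, l);
        while ((len -= l) != 0) {               /* remaining pages, from offset 0 */
                dst += l;
                l = len < SKETCH_PAGE_SIZE ? len : SKETCH_PAGE_SIZE;
                memcpy(dst, *++pages, l);
        }
}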
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  fs/anon_inodes.c
 *
 *  Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
 *
 *  Thanks to Arnd Bergmann for code review and suggestions.
 *  More changes for Thomas Gleixner suggestions.
 *
 */

#include <linux/cred.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/magic.h>
#include <linux/anon_inodes.h>
#include <linux/pseudo_fs.h>
#include <linux/uaccess.h>

#include "internal.h"

static struct vfsmount *anon_inode_mnt __ro_after_init;
static struct inode *anon_inode_inode __ro_after_init;

/*
 * User space expects anonymous inodes to have no file type in st_mode.
 *
 * In particular, 'lsof' has this legacy logic:
 *
 *      type = s->st_mode & S_IFMT;
 *      switch (type) {
 *        ...
 *      case 0:
 *              if (!strcmp(p, "anon_inode"))
 *                      Lf->ntype = Ntype = N_ANON_INODE;
 *
 * to detect our old anon_inode logic.
 *
 * Rather than mess with our internal sane inode data, just fix it
 * up here in getattr() by masking off the format bits.
 */
int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
                       struct kstat *stat, u32 request_mask,
                       unsigned int query_flags)
{
        struct inode *inode = d_inode(path->dentry);

        generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
        stat->mode &= ~S_IFMT;
        return 0;
}

int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                       struct iattr *attr)
{
        return -EOPNOTSUPP;
}

static const struct inode_operations anon_inode_operations = {
        .getattr = anon_inode_getattr,
        .setattr = anon_inode_setattr,
};

/*
 * anon_inodefs_dname() is called from d_path().
*/ static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen) { return dynamic_dname(buffer, buflen, "anon_inode:%s", dentry->d_name.name); } static const struct dentry_operations anon_inodefs_dentry_operations = { .d_dname = anon_inodefs_dname, }; static int anon_inodefs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, ANON_INODE_FS_MAGIC); if (!ctx) return -ENOMEM; fc->s_iflags |= SB_I_NOEXEC; fc->s_iflags |= SB_I_NODEV; ctx->dops = &anon_inodefs_dentry_operations; return 0; } static struct file_system_type anon_inode_fs_type = { .name = "anon_inodefs", .init_fs_context = anon_inodefs_init_fs_context, .kill_sb = kill_anon_super, }; /** * anon_inode_make_secure_inode - allocate an anonymous inode with security context * @sb: [in] Superblock to allocate from * @name: [in] Name of the class of the newfile (e.g., "secretmem") * @context_inode: * [in] Optional parent inode for security inheritance * * The function ensures proper security initialization through the LSM hook * security_inode_init_security_anon(). * * Return: Pointer to new inode on success, ERR_PTR on failure. */ struct inode *anon_inode_make_secure_inode(struct super_block *sb, const char *name, const struct inode *context_inode) { struct inode *inode; int error; inode = alloc_anon_inode(sb); if (IS_ERR(inode)) return inode; inode->i_flags &= ~S_PRIVATE; inode->i_op = &anon_inode_operations; error = security_inode_init_security_anon(inode, &QSTR(name), context_inode); if (error) { iput(inode); return ERR_PTR(error); } return inode; } EXPORT_SYMBOL_FOR_MODULES(anon_inode_make_secure_inode, "kvm"); static struct file *__anon_inode_getfile(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode, bool make_inode) { struct inode *inode; struct file *file; if (fops->owner && !try_module_get(fops->owner)) return ERR_PTR(-ENOENT); if (make_inode) { inode = anon_inode_make_secure_inode(anon_inode_mnt->mnt_sb, name, context_inode); if (IS_ERR(inode)) { file = ERR_CAST(inode); goto err; } } else { inode = anon_inode_inode; if (IS_ERR(inode)) { file = ERR_PTR(-ENODEV); goto err; } /* * We know the anon_inode inode count is always * greater than zero, so ihold() is safe. */ ihold(inode); } file = alloc_file_pseudo(inode, anon_inode_mnt, name, flags & (O_ACCMODE | O_NONBLOCK), fops); if (IS_ERR(file)) goto err_iput; file->f_mapping = inode->i_mapping; file->private_data = priv; return file; err_iput: iput(inode); err: module_put(fops->owner); return file; } /** * anon_inode_getfile - creates a new file instance by hooking it up to an * anonymous inode, and a dentry that describe the "class" * of the file * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file * @priv: [in] private data for the new file (will be file's private_data) * @flags: [in] flags * * Creates a new file by hooking it on a single inode. This is useful for files * that do not need to have a full-fledged inode in order to operate correctly. * All the files created with anon_inode_getfile() will share a single inode, * hence saving memory and avoiding code duplication for the file/inode/dentry * setup. Returns the newly created file* or an error pointer. 
*/ struct file *anon_inode_getfile(const char *name, const struct file_operations *fops, void *priv, int flags) { return __anon_inode_getfile(name, fops, priv, flags, NULL, false); } EXPORT_SYMBOL_GPL(anon_inode_getfile); /** * anon_inode_getfile_fmode - creates a new file instance by hooking it up to an * anonymous inode, and a dentry that describe the "class" * of the file * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file * @priv: [in] private data for the new file (will be file's private_data) * @flags: [in] flags * @f_mode: [in] fmode * * Creates a new file by hooking it on a single inode. This is useful for files * that do not need to have a full-fledged inode in order to operate correctly. * All the files created with anon_inode_getfile() will share a single inode, * hence saving memory and avoiding code duplication for the file/inode/dentry * setup. Allows setting the fmode. Returns the newly created file* or an error * pointer. */ struct file *anon_inode_getfile_fmode(const char *name, const struct file_operations *fops, void *priv, int flags, fmode_t f_mode) { struct file *file; file = __anon_inode_getfile(name, fops, priv, flags, NULL, false); if (!IS_ERR(file)) file->f_mode |= f_mode; return file; } EXPORT_SYMBOL_GPL(anon_inode_getfile_fmode); /** * anon_inode_create_getfile - Like anon_inode_getfile(), but creates a new * !S_PRIVATE anon inode rather than reuse the * singleton anon inode and calls the * inode_init_security_anon() LSM hook. * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file * @priv: [in] private data for the new file (will be file's private_data) * @flags: [in] flags * @context_inode: * [in] the logical relationship with the new inode (optional) * * Create a new anonymous inode and file pair. This can be done for two * reasons: * * - for the inode to have its own security context, so that LSMs can enforce * policy on the inode's creation; * * - if the caller needs a unique inode, for example in order to customize * the size returned by fstat() * * The LSM may use @context_inode in inode_init_security_anon(), but a * reference to it is not held. * * Returns the newly created file* or an error pointer. */ struct file *anon_inode_create_getfile(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode) { return __anon_inode_getfile(name, fops, priv, flags, context_inode, true); } EXPORT_SYMBOL_GPL(anon_inode_create_getfile); static int __anon_inode_getfd(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode, bool make_inode) { return FD_ADD(flags, __anon_inode_getfile(name, fops, priv, flags, context_inode, make_inode)); } /** * anon_inode_getfd - creates a new file instance by hooking it up to * an anonymous inode and a dentry that describe * the "class" of the file * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file * @priv: [in] private data for the new file (will be file's private_data) * @flags: [in] flags * * Creates a new file by hooking it on a single inode. This is * useful for files that do not need to have a full-fledged inode in * order to operate correctly. All the files created with * anon_inode_getfd() will use the same singleton inode, reducing * memory use and avoiding code duplication for the file/inode/dentry * setup. Returns a newly created file descriptor or an error code. 
*/ int anon_inode_getfd(const char *name, const struct file_operations *fops, void *priv, int flags) { return __anon_inode_getfd(name, fops, priv, flags, NULL, false); } EXPORT_SYMBOL_GPL(anon_inode_getfd); /** * anon_inode_create_getfd - Like anon_inode_getfd(), but creates a new * !S_PRIVATE anon inode rather than reuse the singleton anon inode, and calls * the inode_init_security_anon() LSM hook. * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file * @priv: [in] private data for the new file (will be file's private_data) * @flags: [in] flags * @context_inode: * [in] the logical relationship with the new inode (optional) * * Create a new anonymous inode and file pair. This can be done for two * reasons: * * - for the inode to have its own security context, so that LSMs can enforce * policy on the inode's creation; * * - if the caller needs a unique inode, for example in order to customize * the size returned by fstat() * * The LSM may use @context_inode in inode_init_security_anon(), but a * reference to it is not held. * * Returns a newly created file descriptor or an error code. */ int anon_inode_create_getfd(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode) { return __anon_inode_getfd(name, fops, priv, flags, context_inode, true); } static int __init anon_inode_init(void) { anon_inode_mnt = kern_mount(&anon_inode_fs_type); if (IS_ERR(anon_inode_mnt)) panic("anon_inode_init() kernel mount failed (%ld)\n", PTR_ERR(anon_inode_mnt)); anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb); if (IS_ERR(anon_inode_inode)) panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode)); anon_inode_inode->i_op = &anon_inode_operations; return 0; } fs_initcall(anon_inode_init); |
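/*
 * Illustrative use of the API above (hedged sketch, not taken from any
 * real driver): a hypothetical "foo" driver hands a private object back
 * to userspace as a file descriptor.  Only anon_inode_getfd() and its
 * signature come from the code above; foo_ctx, foo_fops and
 * foo_export_fd() are invented names for the example.
 */
#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/slab.h>

struct foo_ctx {
        int state;      /* driver-private state reachable via file->private_data */
};

static int foo_release(struct inode *inode, struct file *file)
{
        kfree(file->private_data);
        return 0;
}

static const struct file_operations foo_fops = {
        .owner   = THIS_MODULE,
        .release = foo_release,
};

static int foo_export_fd(void)
{
        struct foo_ctx *ctx;
        int fd;

        ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;

        /* All such files share the singleton anon inode; a negative return
         * is an errno and means no file was installed, so free ctx here. */
        fd = anon_inode_getfd("[foo]", &foo_fops, ctx, O_RDWR | O_CLOEXEC);
        if (fd < 0)
                kfree(ctx);
        return fd;
}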
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/drivers/char/misc.c
 *
 * Generic misc open routine by Johan Myreen
 *
 * Based on code from Linus
 *
 * Teemu Rantanen's Microsoft Busmouse support and Derrick Cole's
 * changes incorporated into 0.97pl4
 * by Peter Cervasio (pete%q106fm.uucp@wupost.wustl.edu) (08SEP92)
 * See busmouse.c for particulars.
 *
 * Made things a lot more modular - easy to compile in just one or two
 * of the misc drivers, as they are now completely independent. Linus.
 *
 * Support for loadable modules. 8-Sep-95 Philip Blundell <pjb27@cam.ac.uk>
 *
 * Fixed a failing symbol register to free the device registration
 * Alan Cox <alan@lxorguk.ukuu.org.uk> 21-Jan-96
 *
 * Dynamic minors and /proc/mice by Alessandro Rubini. 26-Mar-96
 *
 * Renamed to misc and miscdevice to be more accurate. Alan Cox 26-Mar-96
 *
 * Handling of mouse minor numbers for kerneld:
 *  Idea by Jacques Gelinas <jack@solucorp.qc.ca>,
 *  adapted by Bjorn Ekwall <bj0rn@blox.se>
 *  corrected by Alan Cox <alan@lxorguk.ukuu.org.uk>
 *
 * Changes for kmod (from kerneld):
 *  Cyrus Durgin <cider@speakeasy.org>
 *
 * Added devfs support. Richard Gooch <rgooch@atnf.csiro.au> 10-Jan-1998
 */

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/errno.h>
#include <linux/miscdevice.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/mutex.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
#include <linux/init.h>
#include <linux/device.h>
#include <linux/tty.h>
#include <linux/kmod.h>
#include <linux/gfp.h>

/*
 * Head entry for the doubly linked miscdevice list
 */
static LIST_HEAD(misc_list);
static DEFINE_MUTEX(misc_mtx);

/*
 * Assigned numbers.
*/ static DEFINE_IDA(misc_minors_ida); static int misc_minor_alloc(int minor) { int ret = 0; if (minor == MISC_DYNAMIC_MINOR) { /* allocate free id */ ret = ida_alloc_range(&misc_minors_ida, MISC_DYNAMIC_MINOR + 1, MINORMASK, GFP_KERNEL); } else { ret = ida_alloc_range(&misc_minors_ida, minor, minor, GFP_KERNEL); } return ret; } static void misc_minor_free(int minor) { ida_free(&misc_minors_ida, minor); } #ifdef CONFIG_PROC_FS static void *misc_seq_start(struct seq_file *seq, loff_t *pos) { mutex_lock(&misc_mtx); return seq_list_start(&misc_list, *pos); } static void *misc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_list_next(v, &misc_list, pos); } static void misc_seq_stop(struct seq_file *seq, void *v) { mutex_unlock(&misc_mtx); } static int misc_seq_show(struct seq_file *seq, void *v) { const struct miscdevice *p = list_entry(v, struct miscdevice, list); seq_printf(seq, "%3i %s\n", p->minor, p->name ? p->name : ""); return 0; } static const struct seq_operations misc_seq_ops = { .start = misc_seq_start, .next = misc_seq_next, .stop = misc_seq_stop, .show = misc_seq_show, }; #endif static int misc_open(struct inode *inode, struct file *file) { int minor = iminor(inode); struct miscdevice *c = NULL, *iter; int err = -ENODEV; const struct file_operations *new_fops = NULL; mutex_lock(&misc_mtx); list_for_each_entry(iter, &misc_list, list) { if (iter->minor != minor) continue; c = iter; new_fops = fops_get(iter->fops); break; } /* Only request module for fixed minor code */ if (!new_fops && minor < MISC_DYNAMIC_MINOR) { mutex_unlock(&misc_mtx); request_module("char-major-%d-%d", MISC_MAJOR, minor); mutex_lock(&misc_mtx); list_for_each_entry(iter, &misc_list, list) { if (iter->minor != minor) continue; c = iter; new_fops = fops_get(iter->fops); break; } } if (!new_fops) goto fail; /* * Place the miscdevice in the file's * private_data so it can be used by the * file operations, including f_op->open below */ file->private_data = c; err = 0; replace_fops(file, new_fops); if (file->f_op->open) err = file->f_op->open(inode, file); fail: mutex_unlock(&misc_mtx); return err; } static char *misc_devnode(const struct device *dev, umode_t *mode) { const struct miscdevice *c = dev_get_drvdata(dev); if (mode && c->mode) *mode = c->mode; if (c->nodename) return kstrdup(c->nodename, GFP_KERNEL); return NULL; } static const struct class misc_class = { .name = "misc", .devnode = misc_devnode, }; static const struct file_operations misc_fops = { .owner = THIS_MODULE, .open = misc_open, .llseek = noop_llseek, }; /** * misc_register - register a miscellaneous device * @misc: device structure * * Register a miscellaneous device with the kernel. If the minor * number is set to %MISC_DYNAMIC_MINOR a minor number is assigned * and placed in the minor field of the structure. For other cases * the minor number requested is used. * * The structure passed is linked into the kernel and may not be * destroyed until it has been unregistered. By default, an open() * syscall to the device sets file->private_data to point to the * structure. Drivers don't need open in fops for this. * * A zero is returned on success and a negative errno code for * failure. 
*/ int misc_register(struct miscdevice *misc) { dev_t dev; int err = 0; bool is_dynamic = (misc->minor == MISC_DYNAMIC_MINOR); if (misc->minor > MISC_DYNAMIC_MINOR) { pr_err("Invalid fixed minor %d for miscdevice '%s'\n", misc->minor, misc->name); return -EINVAL; } INIT_LIST_HEAD(&misc->list); mutex_lock(&misc_mtx); if (is_dynamic) { int i = misc_minor_alloc(misc->minor); if (i < 0) { err = -EBUSY; goto out; } misc->minor = i; } else { struct miscdevice *c; int i; list_for_each_entry(c, &misc_list, list) { if (c->minor == misc->minor) { err = -EBUSY; goto out; } } i = misc_minor_alloc(misc->minor); if (i < 0) { err = -EBUSY; goto out; } } dev = MKDEV(MISC_MAJOR, misc->minor); misc->this_device = device_create_with_groups(&misc_class, misc->parent, dev, misc, misc->groups, "%s", misc->name); if (IS_ERR(misc->this_device)) { misc_minor_free(misc->minor); if (is_dynamic) { misc->minor = MISC_DYNAMIC_MINOR; } err = PTR_ERR(misc->this_device); goto out; } /* * Add it to the front, so that later devices can "override" * earlier defaults */ list_add(&misc->list, &misc_list); out: mutex_unlock(&misc_mtx); return err; } EXPORT_SYMBOL(misc_register); /** * misc_deregister - unregister a miscellaneous device * @misc: device to unregister * * Unregister a miscellaneous device that was previously * successfully registered with misc_register(). */ void misc_deregister(struct miscdevice *misc) { mutex_lock(&misc_mtx); list_del_init(&misc->list); device_destroy(&misc_class, MKDEV(MISC_MAJOR, misc->minor)); misc_minor_free(misc->minor); if (misc->minor > MISC_DYNAMIC_MINOR) misc->minor = MISC_DYNAMIC_MINOR; mutex_unlock(&misc_mtx); } EXPORT_SYMBOL(misc_deregister); static int __init misc_init(void) { int err; struct proc_dir_entry *misc_proc_file; misc_proc_file = proc_create_seq("misc", 0, NULL, &misc_seq_ops); err = class_register(&misc_class); if (err) goto fail_remove; err = __register_chrdev(MISC_MAJOR, 0, MINORMASK + 1, "misc", &misc_fops); if (err < 0) goto fail_printk; return 0; fail_printk: pr_err("unable to get major %d for misc devices\n", MISC_MAJOR); class_unregister(&misc_class); fail_remove: if (misc_proc_file) remove_proc_entry("misc", NULL); return err; } subsys_initcall(misc_init); |
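/*
 * Illustrative consumer of misc_register()/misc_deregister() (hedged
 * sketch): a hypothetical "foo" module exposing /dev/foo with a
 * dynamically assigned minor.  struct miscdevice, MISC_DYNAMIC_MINOR
 * and the two registration calls are the real interface documented
 * above; everything named foo_* is invented for the example.
 */
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/module.h>

static ssize_t foo_read(struct file *file, char __user *buf,
                        size_t count, loff_t *ppos)
{
        return 0;       /* hypothetical device: nothing to read (EOF) */
}

static const struct file_operations foo_misc_fops = {
        .owner = THIS_MODULE,
        .read  = foo_read,
};

static struct miscdevice foo_misc = {
        .minor = MISC_DYNAMIC_MINOR,    /* let misc_register() pick a free minor */
        .name  = "foo",                 /* becomes /dev/foo and the /proc/misc entry */
        .fops  = &foo_misc_fops,
};

static int __init foo_init(void)
{
        /* on success foo_misc.minor holds the assigned dynamic minor */
        return misc_register(&foo_misc);
}

static void __exit foo_exit(void)
{
        misc_deregister(&foo_misc);
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("example misc device registration");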
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/percpu.c - percpu memory allocator
 *
 * Copyright (C) 2009 SUSE Linux Products GmbH
 * Copyright (C) 2009 Tejun Heo <tj@kernel.org>
 *
 * Copyright (C) 2017 Facebook Inc.
 * Copyright (C) 2017 Dennis Zhou <dennis@kernel.org>
 *
 * The percpu allocator handles both static and dynamic areas.  Percpu
 * areas are allocated in chunks which are divided into units.  There is
 * a 1-to-1 mapping for units to possible cpus.  These units are grouped
 * based on NUMA properties of the machine.
 *
 *  c0                           c1                         c2
 *  -------------------          -------------------        ------------
 * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
 *  -------------------  ......  -------------------  ....  ------------
 *
 * Allocation is done by offsets into a unit's address space.  Ie., an
 * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
 * c1:u1, c1:u2, etc.  On NUMA machines, the mapping may be non-linear
 * and even sparse.  Access is handled by configuring percpu base
 * registers according to the cpu to unit mappings and offsetting the
 * base address using pcpu_unit_size.
 *
 * There is special consideration for the first chunk which must handle
 * the static percpu variables in the kernel image as allocation services
 * are not online yet.  In short, the first chunk is structured like so:
 *
 *                  <Static | [Reserved] | Dynamic>
 *
 * The static data is copied from the original section managed by the
 * linker.  The reserved section, if non-zero, primarily manages static
 * percpu variables from kernel modules.  Finally, the dynamic section
 * takes care of normal allocations.
* * The allocator organizes chunks into lists according to free size and * memcg-awareness. To make a percpu allocation memcg-aware the __GFP_ACCOUNT * flag should be passed. All memcg-aware allocations are sharing one set * of chunks and all unaccounted allocations and allocations performed * by processes belonging to the root memory cgroup are using the second set. * * The allocator tries to allocate from the fullest chunk first. Each chunk * is managed by a bitmap with metadata blocks. The allocation map is updated * on every allocation and free to reflect the current state while the boundary * map is only updated on allocation. Each metadata block contains * information to help mitigate the need to iterate over large portions * of the bitmap. The reverse mapping from page to chunk is stored in * the page's index. Lastly, units are lazily backed and grow in unison. * * There is a unique conversion that goes on here between bytes and bits. * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE. The chunk * tracks the number of pages it is responsible for in nr_pages. Helper * functions are used to convert from between the bytes, bits, and blocks. * All hints are managed in bits unless explicitly stated. * * To use this allocator, arch code should do the following: * * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate * regular address to percpu pointer and back if they need to be * different from the default * * - use pcpu_setup_first_chunk() during percpu area initialization to * setup the first chunk containing the kernel static percpu area */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/bitmap.h> #include <linux/cpumask.h> #include <linux/memblock.h> #include <linux/err.h> #include <linux/list.h> #include <linux/log2.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/pfn.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/vmalloc.h> #include <linux/workqueue.h> #include <linux/kmemleak.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/memcontrol.h> #include <asm/cacheflush.h> #include <asm/sections.h> #include <asm/tlbflush.h> #include <asm/io.h> #define CREATE_TRACE_POINTS #include <trace/events/percpu.h> #include "percpu-internal.h" /* * The slots are sorted by the size of the biggest continuous free area. * 1-31 bytes share the same slot. 
*/ #define PCPU_SLOT_BASE_SHIFT 5 /* chunks in slots below this are subject to being sidelined on failed alloc */ #define PCPU_SLOT_FAIL_THRESHOLD 3 #define PCPU_EMPTY_POP_PAGES_LOW 2 #define PCPU_EMPTY_POP_PAGES_HIGH 4 #ifdef CONFIG_SMP /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ #ifndef __addr_to_pcpu_ptr #define __addr_to_pcpu_ptr(addr) \ (void __percpu *)((unsigned long)(addr) - \ (unsigned long)pcpu_base_addr + \ (unsigned long)__per_cpu_start) #endif #ifndef __pcpu_ptr_to_addr #define __pcpu_ptr_to_addr(ptr) \ (void __force *)((unsigned long)(ptr) + \ (unsigned long)pcpu_base_addr - \ (unsigned long)__per_cpu_start) #endif #else /* CONFIG_SMP */ /* on UP, it's always identity mapped */ #define __addr_to_pcpu_ptr(addr) (void __percpu *)(addr) #define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr) #endif /* CONFIG_SMP */ static int pcpu_unit_pages __ro_after_init; static int pcpu_unit_size __ro_after_init; static int pcpu_nr_units __ro_after_init; static int pcpu_atom_size __ro_after_init; int pcpu_nr_slots __ro_after_init; static int pcpu_free_slot __ro_after_init; int pcpu_sidelined_slot __ro_after_init; int pcpu_to_depopulate_slot __ro_after_init; static size_t pcpu_chunk_struct_size __ro_after_init; /* cpus with the lowest and highest unit addresses */ static unsigned int pcpu_low_unit_cpu __ro_after_init; static unsigned int pcpu_high_unit_cpu __ro_after_init; /* the address of the first chunk which starts with the kernel static area */ void *pcpu_base_addr __ro_after_init; static const int *pcpu_unit_map __ro_after_init; /* cpu -> unit */ const unsigned long *pcpu_unit_offsets __ro_after_init; /* cpu -> unit offset */ /* group information, used for vm allocation */ static int pcpu_nr_groups __ro_after_init; static const unsigned long *pcpu_group_offsets __ro_after_init; static const size_t *pcpu_group_sizes __ro_after_init; /* * The first chunk which always exists. Note that unlike other * chunks, this one can be allocated and mapped in several different * ways and thus often doesn't live in the vmalloc area. */ struct pcpu_chunk *pcpu_first_chunk __ro_after_init; /* * Optional reserved chunk. This chunk reserves part of the first * chunk and serves it for reserved allocations. When the reserved * region doesn't exist, the following variable is NULL. */ struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init; DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */ struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */ /* * The number of empty populated pages, protected by pcpu_lock. * The reserved chunk doesn't contribute to the count. */ int pcpu_nr_empty_pop_pages; /* * The number of populated pages in use by the allocator, protected by * pcpu_lock. This number is kept per a unit per chunk (i.e. when a page gets * allocated/deallocated, it is allocated/deallocated in all units of a chunk * and increments/decrements this count by 1). */ static unsigned long pcpu_nr_populated; /* * Balance work is used to populate or destroy chunks asynchronously. We * try to keep the number of populated free pages between * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one * empty chunk. 
*/ static void pcpu_balance_workfn(struct work_struct *work); static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn); static bool pcpu_async_enabled __read_mostly; static bool pcpu_atomic_alloc_failed; static void pcpu_schedule_balance_work(void) { if (pcpu_async_enabled) schedule_work(&pcpu_balance_work); } /** * pcpu_addr_in_chunk - check if the address is served from this chunk * @chunk: chunk of interest * @addr: percpu address * * RETURNS: * True if the address is served from this chunk. */ static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr) { void *start_addr, *end_addr; if (!chunk) return false; start_addr = chunk->base_addr + chunk->start_offset; end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE - chunk->end_offset; return addr >= start_addr && addr < end_addr; } static int __pcpu_size_to_slot(int size) { int highbit = fls(size); /* size is in bytes */ return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1); } static int pcpu_size_to_slot(int size) { if (size == pcpu_unit_size) return pcpu_free_slot; return __pcpu_size_to_slot(size); } static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) { const struct pcpu_block_md *chunk_md = &chunk->chunk_md; if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk_md->contig_hint == 0) return 0; return pcpu_size_to_slot(chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE); } /* set the pointer to a chunk in a page struct */ static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) { page->private = (unsigned long)pcpu; } /* obtain pointer to a chunk from a page struct */ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) { return (struct pcpu_chunk *)page->private; } static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx) { return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx; } static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx) { return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT); } static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) { return (unsigned long)chunk->base_addr + pcpu_unit_page_offset(cpu, page_idx); } /* * The following are helper functions to help access bitmaps and convert * between bitmap offsets to address offsets. */ static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index) { return chunk->alloc_map + (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG); } static unsigned long pcpu_off_to_block_index(int off) { return off / PCPU_BITMAP_BLOCK_BITS; } static unsigned long pcpu_off_to_block_off(int off) { return off & (PCPU_BITMAP_BLOCK_BITS - 1); } static unsigned long pcpu_block_off_to_off(int index, int off) { return index * PCPU_BITMAP_BLOCK_BITS + off; } /** * pcpu_check_block_hint - check against the contig hint * @block: block of interest * @bits: size of allocation * @align: alignment of area (max PAGE_SIZE) * * Check to see if the allocation can fit in the block's contig hint. * Note, a chunk uses the same hints as a block so this can also check against * the chunk's contig hint. */ static bool pcpu_check_block_hint(struct pcpu_block_md *block, int bits, size_t align) { int bit_off = ALIGN(block->contig_hint_start, align) - block->contig_hint_start; return bit_off + bits <= block->contig_hint; } /* * pcpu_next_hint - determine which hint to use * @block: block of interest * @alloc_bits: size of allocation * * This determines if we should scan based on the scan_hint or first_free. * In general, we want to scan from first_free to fulfill allocations by * first fit. 
However, if we know a scan_hint at position scan_hint_start * cannot fulfill an allocation, we can begin scanning from there knowing * the contig_hint will be our fallback. */ static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits) { /* * The three conditions below determine if we can skip past the * scan_hint. First, does the scan hint exist. Second, is the * contig_hint after the scan_hint (possibly not true iff * contig_hint == scan_hint). Third, is the allocation request * larger than the scan_hint. */ if (block->scan_hint && block->contig_hint_start > block->scan_hint_start && alloc_bits > block->scan_hint) return block->scan_hint_start + block->scan_hint; return block->first_free; } /** * pcpu_next_md_free_region - finds the next hint free area * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of free area * * Helper function for pcpu_for_each_md_free_region. It checks * block->contig_hint and performs aggregation across blocks to find the * next hint. It modifies bit_off and bits in-place to be consumed in the * loop. */ static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off, int *bits) { int i = pcpu_off_to_block_index(*bit_off); int block_off = pcpu_off_to_block_off(*bit_off); struct pcpu_block_md *block; *bits = 0; for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk); block++, i++) { /* handles contig area across blocks */ if (*bits) { *bits += block->left_free; if (block->left_free == PCPU_BITMAP_BLOCK_BITS) continue; return; } /* * This checks three things. First is there a contig_hint to * check. Second, have we checked this hint before by * comparing the block_off. Third, is this the same as the * right contig hint. In the last case, it spills over into * the next block and should be handled by the contig area * across blocks code. */ *bits = block->contig_hint; if (*bits && block->contig_hint_start >= block_off && *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) { *bit_off = pcpu_block_off_to_off(i, block->contig_hint_start); return; } /* reset to satisfy the second predicate above */ block_off = 0; *bits = block->right_free; *bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free; } } /** * pcpu_next_fit_region - finds fit areas for a given allocation request * @chunk: chunk of interest * @alloc_bits: size of allocation * @align: alignment of area (max PAGE_SIZE) * @bit_off: chunk offset * @bits: size of free area * * Finds the next free region that is viable for use with a given size and * alignment. This only returns if there is a valid area to be used for this * allocation. block->first_free is returned if the allocation request fits * within the block to see if the request can be fulfilled prior to the contig * hint. */ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits, int align, int *bit_off, int *bits) { int i = pcpu_off_to_block_index(*bit_off); int block_off = pcpu_off_to_block_off(*bit_off); struct pcpu_block_md *block; *bits = 0; for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk); block++, i++) { /* handles contig area across blocks */ if (*bits) { *bits += block->left_free; if (*bits >= alloc_bits) return; if (block->left_free == PCPU_BITMAP_BLOCK_BITS) continue; } /* check block->contig_hint */ *bits = ALIGN(block->contig_hint_start, align) - block->contig_hint_start; /* * This uses the block offset to determine if this has been * checked in the prior iteration. 
*/ if (block->contig_hint && block->contig_hint_start >= block_off && block->contig_hint >= *bits + alloc_bits) { int start = pcpu_next_hint(block, alloc_bits); *bits += alloc_bits + block->contig_hint_start - start; *bit_off = pcpu_block_off_to_off(i, start); return; } /* reset to satisfy the second predicate above */ block_off = 0; *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free, align); *bits = PCPU_BITMAP_BLOCK_BITS - *bit_off; *bit_off = pcpu_block_off_to_off(i, *bit_off); if (*bits >= alloc_bits) return; } /* no valid offsets were found - fail condition */ *bit_off = pcpu_chunk_map_bits(chunk); } /* * Metadata free area iterators. These perform aggregation of free areas * based on the metadata blocks and return the offset @bit_off and size in * bits of the free area @bits. pcpu_for_each_fit_region only returns when * a fit is found for the allocation request. */ #define pcpu_for_each_md_free_region(chunk, bit_off, bits) \ for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits)); \ (bit_off) < pcpu_chunk_map_bits((chunk)); \ (bit_off) += (bits) + 1, \ pcpu_next_md_free_region((chunk), &(bit_off), &(bits))) #define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) \ for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \ &(bits)); \ (bit_off) < pcpu_chunk_map_bits((chunk)); \ (bit_off) += (bits), \ pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \ &(bits))) /** * pcpu_mem_zalloc - allocate memory * @size: bytes to allocate * @gfp: allocation flags * * Allocate @size bytes. If @size is smaller than PAGE_SIZE, * kzalloc() is used; otherwise, the equivalent of vzalloc() is used. * This is to facilitate passing through whitelisted flags. The * returned memory is always zeroed. * * RETURNS: * Pointer to the allocated area on success, NULL on failure. */ static void *pcpu_mem_zalloc(size_t size, gfp_t gfp) { if (WARN_ON_ONCE(!slab_is_available())) return NULL; if (size <= PAGE_SIZE) return kzalloc(size, gfp); else return __vmalloc(size, gfp | __GFP_ZERO); } /** * pcpu_mem_free - free memory * @ptr: memory to free * * Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc(). */ static void pcpu_mem_free(void *ptr) { kvfree(ptr); } static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot, bool move_front) { if (chunk != pcpu_reserved_chunk) { if (move_front) list_move(&chunk->list, &pcpu_chunk_lists[slot]); else list_move_tail(&chunk->list, &pcpu_chunk_lists[slot]); } } static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot) { __pcpu_chunk_move(chunk, slot, true); } /** * pcpu_chunk_relocate - put chunk in the appropriate chunk slot * @chunk: chunk of interest * @oslot: the previous slot it was on * * This function is called after an allocation or free changed @chunk. * New slot according to the changed state is determined and @chunk is * moved to the slot. Note that the reserved chunk is never put on * chunk slots. * * CONTEXT: * pcpu_lock. 
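 *
 * Rough illustration of the slot math (PCPU_SLOT_BASE_SHIFT assumed to be
 * 5, its usual value; the numbers below are examples, not taken from this
 * file). __pcpu_size_to_slot() buckets a chunk by the size of its largest
 * free area:
 *
 *	size 1024 bytes -> fls() == 11 -> slot max(11 - 5 + 2, 1) == 8
 *	size 4096 bytes -> fls() == 13 -> slot 10
 *	free area == pcpu_unit_size   -> pcpu_free_slot (fully free chunk)
 *
 * A free can therefore only move a chunk to an equal or higher slot and
 * an allocation to an equal or lower one.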
*/ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) { int nslot = pcpu_chunk_slot(chunk); /* leave isolated chunks in-place */ if (chunk->isolated) return; if (oslot != nslot) __pcpu_chunk_move(chunk, nslot, oslot < nslot); } static void pcpu_isolate_chunk(struct pcpu_chunk *chunk) { lockdep_assert_held(&pcpu_lock); if (!chunk->isolated) { chunk->isolated = true; pcpu_nr_empty_pop_pages -= chunk->nr_empty_pop_pages; } list_move(&chunk->list, &pcpu_chunk_lists[pcpu_to_depopulate_slot]); } static void pcpu_reintegrate_chunk(struct pcpu_chunk *chunk) { lockdep_assert_held(&pcpu_lock); if (chunk->isolated) { chunk->isolated = false; pcpu_nr_empty_pop_pages += chunk->nr_empty_pop_pages; pcpu_chunk_relocate(chunk, -1); } } /* * pcpu_update_empty_pages - update empty page counters * @chunk: chunk of interest * @nr: nr of empty pages * * This is used to keep track of the empty pages now based on the premise * a md_block covers a page. The hint update functions recognize if a block * is made full or broken to calculate deltas for keeping track of free pages. */ static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr) { chunk->nr_empty_pop_pages += nr; if (chunk != pcpu_reserved_chunk && !chunk->isolated) pcpu_nr_empty_pop_pages += nr; } /* * pcpu_region_overlap - determines if two regions overlap * @a: start of first region, inclusive * @b: end of first region, exclusive * @x: start of second region, inclusive * @y: end of second region, exclusive * * This is used to determine if the hint region [a, b) overlaps with the * allocated region [x, y). */ static inline bool pcpu_region_overlap(int a, int b, int x, int y) { return (a < y) && (x < b); } /** * pcpu_block_update - updates a block given a free area * @block: block of interest * @start: start offset in block * @end: end offset in block * * Updates a block given a known free area. The region [start, end) is * expected to be the entirety of the free area within a block. Chooses * the best starting offset if the contig hints are equal. */ static void pcpu_block_update(struct pcpu_block_md *block, int start, int end) { int contig = end - start; block->first_free = min(block->first_free, start); if (start == 0) block->left_free = contig; if (end == block->nr_bits) block->right_free = contig; if (contig > block->contig_hint) { /* promote the old contig_hint to be the new scan_hint */ if (start > block->contig_hint_start) { if (block->contig_hint > block->scan_hint) { block->scan_hint_start = block->contig_hint_start; block->scan_hint = block->contig_hint; } else if (start < block->scan_hint_start) { /* * The old contig_hint == scan_hint. But, the * new contig is larger so hold the invariant * scan_hint_start < contig_hint_start. */ block->scan_hint = 0; } } else { block->scan_hint = 0; } block->contig_hint_start = start; block->contig_hint = contig; } else if (contig == block->contig_hint) { if (block->contig_hint_start && (!start || __ffs(start) > __ffs(block->contig_hint_start))) { /* start has a better alignment so use it */ block->contig_hint_start = start; if (start < block->scan_hint_start && block->contig_hint > block->scan_hint) block->scan_hint = 0; } else if (start > block->scan_hint_start || block->contig_hint > block->scan_hint) { /* * Knowing contig == contig_hint, update the scan_hint * if it is farther than or larger than the current * scan_hint. */ block->scan_hint_start = start; block->scan_hint = contig; } } else { /* * The region is smaller than the contig_hint. 
So only update * the scan_hint if it is larger than or equal and farther than * the current scan_hint. */ if ((start < block->contig_hint_start && (contig > block->scan_hint || (contig == block->scan_hint && start > block->scan_hint_start)))) { block->scan_hint_start = start; block->scan_hint = contig; } } } /* * pcpu_block_update_scan - update a block given a free area from a scan * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of free area * * Finding the final allocation spot first goes through pcpu_find_block_fit() * to find a block that can hold the allocation and then pcpu_alloc_area() * where a scan is used. When allocations require specific alignments, * we can inadvertently create holes which will not be seen in the alloc * or free paths. * * This takes a given free area hole and updates a block as it may change the * scan_hint. We need to scan backwards to ensure we don't miss free bits * from alignment. */ static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off, int bits) { int s_off = pcpu_off_to_block_off(bit_off); int e_off = s_off + bits; int s_index, l_bit; struct pcpu_block_md *block; if (e_off > PCPU_BITMAP_BLOCK_BITS) return; s_index = pcpu_off_to_block_index(bit_off); block = chunk->md_blocks + s_index; /* scan backwards in case of alignment skipping free bits */ l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), s_off); s_off = (s_off == l_bit) ? 0 : l_bit + 1; pcpu_block_update(block, s_off, e_off); } /** * pcpu_chunk_refresh_hint - updates metadata about a chunk * @chunk: chunk of interest * @full_scan: if we should scan from the beginning * * Iterates over the metadata blocks to find the largest contig area. * A full scan can be avoided on the allocation path as this is triggered * if we broke the contig_hint. In doing so, the scan_hint will be before * the contig_hint or after if the scan_hint == contig_hint. This cannot * be prevented on freeing as we want to find the largest area possibly * spanning blocks. */ static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan) { struct pcpu_block_md *chunk_md = &chunk->chunk_md; int bit_off, bits; /* promote scan_hint to contig_hint */ if (!full_scan && chunk_md->scan_hint) { bit_off = chunk_md->scan_hint_start + chunk_md->scan_hint; chunk_md->contig_hint_start = chunk_md->scan_hint_start; chunk_md->contig_hint = chunk_md->scan_hint; chunk_md->scan_hint = 0; } else { bit_off = chunk_md->first_free; chunk_md->contig_hint = 0; } bits = 0; pcpu_for_each_md_free_region(chunk, bit_off, bits) pcpu_block_update(chunk_md, bit_off, bit_off + bits); } /** * pcpu_block_refresh_hint * @chunk: chunk of interest * @index: index of the metadata block * * Scans over the block beginning at first_free and updates the block * metadata accordingly. 
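 *
 * Worked example on a hypothetical 16-bit block (real blocks are
 * PCPU_BITMAP_BLOCK_BITS wide; the pattern is invented for illustration).
 * With an alloc_map of 1100 0000 0011 1000 (bit 0 leftmost, 1 == allocated)
 * the refresh ends with:
 *
 *	first_free  = 2
 *	contig_hint = 8, contig_hint_start = 2
 *	right_free  = 3	(bits 13-15)
 *	left_free   stays 0 (bit 0 is allocated; it is not reset here)
 *
 * The trailing 3-bit run does not become the scan_hint because it starts
 * after contig_hint_start.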
*/ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) { struct pcpu_block_md *block = chunk->md_blocks + index; unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index); unsigned int start, end; /* region start, region end */ /* promote scan_hint to contig_hint */ if (block->scan_hint) { start = block->scan_hint_start + block->scan_hint; block->contig_hint_start = block->scan_hint_start; block->contig_hint = block->scan_hint; block->scan_hint = 0; } else { start = block->first_free; block->contig_hint = 0; } block->right_free = 0; /* iterate over free areas and update the contig hints */ for_each_clear_bitrange_from(start, end, alloc_map, PCPU_BITMAP_BLOCK_BITS) pcpu_block_update(block, start, end); } /** * pcpu_block_update_hint_alloc - update hint on allocation path * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of request * * Updates metadata for the allocation path. The metadata only has to be * refreshed by a full scan iff the chunk's contig hint is broken. Block level * scans are required if the block's contig hint is broken. */ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, int bits) { struct pcpu_block_md *chunk_md = &chunk->chunk_md; int nr_empty_pages = 0; struct pcpu_block_md *s_block, *e_block, *block; int s_index, e_index; /* block indexes of the freed allocation */ int s_off, e_off; /* block offsets of the freed allocation */ /* * Calculate per block offsets. * The calculation uses an inclusive range, but the resulting offsets * are [start, end). e_index always points to the last block in the * range. */ s_index = pcpu_off_to_block_index(bit_off); e_index = pcpu_off_to_block_index(bit_off + bits - 1); s_off = pcpu_off_to_block_off(bit_off); e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1; s_block = chunk->md_blocks + s_index; e_block = chunk->md_blocks + e_index; /* * Update s_block. */ if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS) nr_empty_pages++; /* * block->first_free must be updated if the allocation takes its place. * If the allocation breaks the contig_hint, a scan is required to * restore this hint. */ if (s_off == s_block->first_free) s_block->first_free = find_next_zero_bit( pcpu_index_alloc_map(chunk, s_index), PCPU_BITMAP_BLOCK_BITS, s_off + bits); if (pcpu_region_overlap(s_block->scan_hint_start, s_block->scan_hint_start + s_block->scan_hint, s_off, s_off + bits)) s_block->scan_hint = 0; if (pcpu_region_overlap(s_block->contig_hint_start, s_block->contig_hint_start + s_block->contig_hint, s_off, s_off + bits)) { /* block contig hint is broken - scan to fix it */ if (!s_off) s_block->left_free = 0; pcpu_block_refresh_hint(chunk, s_index); } else { /* update left and right contig manually */ s_block->left_free = min(s_block->left_free, s_off); if (s_index == e_index) s_block->right_free = min_t(int, s_block->right_free, PCPU_BITMAP_BLOCK_BITS - e_off); else s_block->right_free = 0; } /* * Update e_block. */ if (s_index != e_index) { if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS) nr_empty_pages++; /* * When the allocation is across blocks, the end is along * the left part of the e_block. 
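	 * That is, the allocation covers bits [0, e_off) of e_block, so its
	 * left_free drops to zero and its remaining hints (first_free, the
	 * scan/contig hints and right_free) are fixed up below.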
*/ e_block->first_free = find_next_zero_bit( pcpu_index_alloc_map(chunk, e_index), PCPU_BITMAP_BLOCK_BITS, e_off); if (e_off == PCPU_BITMAP_BLOCK_BITS) { /* reset the block */ e_block++; } else { if (e_off > e_block->scan_hint_start) e_block->scan_hint = 0; e_block->left_free = 0; if (e_off > e_block->contig_hint_start) { /* contig hint is broken - scan to fix it */ pcpu_block_refresh_hint(chunk, e_index); } else { e_block->right_free = min_t(int, e_block->right_free, PCPU_BITMAP_BLOCK_BITS - e_off); } } /* update in-between md_blocks */ nr_empty_pages += (e_index - s_index - 1); for (block = s_block + 1; block < e_block; block++) { block->scan_hint = 0; block->contig_hint = 0; block->left_free = 0; block->right_free = 0; } } /* * If the allocation is not atomic, some blocks may not be * populated with pages, while we account it here. The number * of pages will be added back with pcpu_chunk_populated() * when populating pages. */ if (nr_empty_pages) pcpu_update_empty_pages(chunk, -nr_empty_pages); if (pcpu_region_overlap(chunk_md->scan_hint_start, chunk_md->scan_hint_start + chunk_md->scan_hint, bit_off, bit_off + bits)) chunk_md->scan_hint = 0; /* * The only time a full chunk scan is required is if the chunk * contig hint is broken. Otherwise, it means a smaller space * was used and therefore the chunk contig hint is still correct. */ if (pcpu_region_overlap(chunk_md->contig_hint_start, chunk_md->contig_hint_start + chunk_md->contig_hint, bit_off, bit_off + bits)) pcpu_chunk_refresh_hint(chunk, false); } /** * pcpu_block_update_hint_free - updates the block hints on the free path * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of request * * Updates metadata for the allocation path. This avoids a blind block * refresh by making use of the block contig hints. If this fails, it scans * forward and backward to determine the extent of the free area. This is * capped at the boundary of blocks. * * A chunk update is triggered if a page becomes free, a block becomes free, * or the free spans across blocks. This tradeoff is to minimize iterating * over the block metadata to update chunk_md->contig_hint. * chunk_md->contig_hint may be off by up to a page, but it will never be more * than the available space. If the contig hint is contained in one block, it * will be accurate. */ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, int bits) { int nr_empty_pages = 0; struct pcpu_block_md *s_block, *e_block, *block; int s_index, e_index; /* block indexes of the freed allocation */ int s_off, e_off; /* block offsets of the freed allocation */ int start, end; /* start and end of the whole free area */ /* * Calculate per block offsets. * The calculation uses an inclusive range, but the resulting offsets * are [start, end). e_index always points to the last block in the * range. */ s_index = pcpu_off_to_block_index(bit_off); e_index = pcpu_off_to_block_index(bit_off + bits - 1); s_off = pcpu_off_to_block_off(bit_off); e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1; s_block = chunk->md_blocks + s_index; e_block = chunk->md_blocks + e_index; /* * Check if the freed area aligns with the block->contig_hint. * If it does, then the scan to find the beginning/end of the * larger free area can be avoided. * * start and end refer to beginning and end of the free area * within each their respective blocks. This is not necessarily * the entire free area as it may span blocks past the beginning * or end of the block. 
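	 * For example, if the freed bits begin exactly where s_block's free
	 * contig region ends (s_off == contig_hint_start + contig_hint), the
	 * two areas merge and start snaps back to contig_hint_start without
	 * rescanning the bitmap.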
*/ start = s_off; if (s_off == s_block->contig_hint + s_block->contig_hint_start) { start = s_block->contig_hint_start; } else { /* * Scan backwards to find the extent of the free area. * find_last_bit returns the starting bit, so if the start bit * is returned, that means there was no last bit and the * remainder of the chunk is free. */ int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), start); start = (start == l_bit) ? 0 : l_bit + 1; } end = e_off; if (e_off == e_block->contig_hint_start) end = e_block->contig_hint_start + e_block->contig_hint; else end = find_next_bit(pcpu_index_alloc_map(chunk, e_index), PCPU_BITMAP_BLOCK_BITS, end); /* update s_block */ e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS; if (!start && e_off == PCPU_BITMAP_BLOCK_BITS) nr_empty_pages++; pcpu_block_update(s_block, start, e_off); /* freeing in the same block */ if (s_index != e_index) { /* update e_block */ if (end == PCPU_BITMAP_BLOCK_BITS) nr_empty_pages++; pcpu_block_update(e_block, 0, end); /* reset md_blocks in the middle */ nr_empty_pages += (e_index - s_index - 1); for (block = s_block + 1; block < e_block; block++) { block->first_free = 0; block->scan_hint = 0; block->contig_hint_start = 0; block->contig_hint = PCPU_BITMAP_BLOCK_BITS; block->left_free = PCPU_BITMAP_BLOCK_BITS; block->right_free = PCPU_BITMAP_BLOCK_BITS; } } if (nr_empty_pages) pcpu_update_empty_pages(chunk, nr_empty_pages); /* * Refresh chunk metadata when the free makes a block free or spans * across blocks. The contig_hint may be off by up to a page, but if * the contig_hint is contained in a block, it will be accurate with * the else condition below. */ if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index) pcpu_chunk_refresh_hint(chunk, true); else pcpu_block_update(&chunk->chunk_md, pcpu_block_off_to_off(s_index, start), end); } /** * pcpu_is_populated - determines if the region is populated * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of area * @next_off: return value for the next offset to start searching * * For atomic allocations, check if the backing pages are populated. * * RETURNS: * Bool if the backing pages are populated. * next_index is to skip over unpopulated blocks in pcpu_find_block_fit. */ static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits, int *next_off) { unsigned int start, end; start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE); end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); start = find_next_zero_bit(chunk->populated, end, start); if (start >= end) return true; end = find_next_bit(chunk->populated, end, start + 1); *next_off = end * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; return false; } /** * pcpu_find_block_fit - finds the block index to start searching * @chunk: chunk of interest * @alloc_bits: size of request in allocation units * @align: alignment of area (max PAGE_SIZE bytes) * @pop_only: use populated regions only * * Given a chunk and an allocation spec, find the offset to begin searching * for a free region. This iterates over the bitmap metadata blocks to * find an offset that will be guaranteed to fit the requirements. It is * not quite first fit as if the allocation does not fit in the contig hint * of a block or chunk, it is skipped. This errs on the side of caution * to prevent excess iteration. Poor alignment can cause the allocator to * skip over blocks and chunks that have valid free areas. * * RETURNS: * The offset in the bitmap to begin searching. * -1 if no offset is found. 
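 *
 * Sketch of how the allocation path consumes the result (this mirrors the
 * chunk loop in pcpu_alloc_noprof(); pcpu_lock must be held):
 *
 *	off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
 *	if (off < 0)
 *		continue;	(move on to the next chunk)
 *	off = pcpu_alloc_area(chunk, bits, bit_align, off);
 *	if (off >= 0)
 *		... off is now a byte offset into the chunk ...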
*/ static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits, size_t align, bool pop_only) { struct pcpu_block_md *chunk_md = &chunk->chunk_md; int bit_off, bits, next_off; /* * This is an optimization to prevent scanning by assuming if the * allocation cannot fit in the global hint, there is memory pressure * and creating a new chunk would happen soon. */ if (!pcpu_check_block_hint(chunk_md, alloc_bits, align)) return -1; bit_off = pcpu_next_hint(chunk_md, alloc_bits); bits = 0; pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) { if (!pop_only || pcpu_is_populated(chunk, bit_off, bits, &next_off)) break; bit_off = next_off; bits = 0; } if (bit_off == pcpu_chunk_map_bits(chunk)) return -1; return bit_off; } /* * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off() * @map: the address to base the search on * @size: the bitmap size in bits * @start: the bitnumber to start searching at * @nr: the number of zeroed bits we're looking for * @align_mask: alignment mask for zero area * @largest_off: offset of the largest area skipped * @largest_bits: size of the largest area skipped * * The @align_mask should be one less than a power of 2. * * This is a modified version of bitmap_find_next_zero_area_off() to remember * the largest area that was skipped. This is imperfect, but in general is * good enough. The largest remembered region is the largest failed region * seen. This does not include anything we possibly skipped due to alignment. * pcpu_block_update_scan() does scan backwards to try and recover what was * lost to alignment. While this can cause scanning to miss earlier possible * free areas, smaller allocations will eventually fill those holes. */ static unsigned long pcpu_find_zero_area(unsigned long *map, unsigned long size, unsigned long start, unsigned long nr, unsigned long align_mask, unsigned long *largest_off, unsigned long *largest_bits) { unsigned long index, end, i, area_off, area_bits; again: index = find_next_zero_bit(map, size, start); /* Align allocation */ index = __ALIGN_MASK(index, align_mask); area_off = index; end = index + nr; if (end > size) return end; i = find_next_bit(map, end, index); if (i < end) { area_bits = i - area_off; /* remember largest unused area with best alignment */ if (area_bits > *largest_bits || (area_bits == *largest_bits && *largest_off && (!area_off || __ffs(area_off) > __ffs(*largest_off)))) { *largest_off = area_off; *largest_bits = area_bits; } start = i + 1; goto again; } return index; } /** * pcpu_alloc_area - allocates an area from a pcpu_chunk * @chunk: chunk of interest * @alloc_bits: size of request in allocation units * @align: alignment of area (max PAGE_SIZE) * @start: bit_off to start searching * * This function takes in a @start offset to begin searching to fit an * allocation of @alloc_bits with alignment @align. It needs to scan * the allocation map because if it fits within the block's contig hint, * @start will be block->first_free. This is an attempt to fill the * allocation prior to breaking the contig hint. The allocation and * boundary maps are updated accordingly if it confirms a valid * free area. * * RETURNS: * Allocated addr offset in @chunk on success. * -1 if no matching area is found. */ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits, size_t align, int start) { struct pcpu_block_md *chunk_md = &chunk->chunk_md; size_t align_mask = (align) ? 
(align - 1) : 0; unsigned long area_off = 0, area_bits = 0; int bit_off, end, oslot; lockdep_assert_held(&pcpu_lock); oslot = pcpu_chunk_slot(chunk); /* * Search to find a fit. */ end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS, pcpu_chunk_map_bits(chunk)); bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits, align_mask, &area_off, &area_bits); if (bit_off >= end) return -1; if (area_bits) pcpu_block_update_scan(chunk, area_off, area_bits); /* update alloc map */ bitmap_set(chunk->alloc_map, bit_off, alloc_bits); /* update boundary map */ set_bit(bit_off, chunk->bound_map); bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1); set_bit(bit_off + alloc_bits, chunk->bound_map); chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE; /* update first free bit */ if (bit_off == chunk_md->first_free) chunk_md->first_free = find_next_zero_bit( chunk->alloc_map, pcpu_chunk_map_bits(chunk), bit_off + alloc_bits); pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits); pcpu_chunk_relocate(chunk, oslot); return bit_off * PCPU_MIN_ALLOC_SIZE; } /** * pcpu_free_area - frees the corresponding offset * @chunk: chunk of interest * @off: addr offset into chunk * * This function determines the size of an allocation to free using * the boundary bitmap and clears the allocation map. * * RETURNS: * Number of freed bytes. */ static int pcpu_free_area(struct pcpu_chunk *chunk, int off) { struct pcpu_block_md *chunk_md = &chunk->chunk_md; int bit_off, bits, end, oslot, freed; lockdep_assert_held(&pcpu_lock); oslot = pcpu_chunk_slot(chunk); bit_off = off / PCPU_MIN_ALLOC_SIZE; /* check invalid free */ if (!test_bit(bit_off, chunk->alloc_map) || !test_bit(bit_off, chunk->bound_map)) return 0; /* find end index */ end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk), bit_off + 1); bits = end - bit_off; bitmap_clear(chunk->alloc_map, bit_off, bits); freed = bits * PCPU_MIN_ALLOC_SIZE; /* update metadata */ chunk->free_bytes += freed; /* update first free bit */ chunk_md->first_free = min(chunk_md->first_free, bit_off); pcpu_block_update_hint_free(chunk, bit_off, bits); pcpu_chunk_relocate(chunk, oslot); pcpu_stats_area_dealloc(chunk); return freed; } static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits) { block->scan_hint = 0; block->contig_hint = nr_bits; block->left_free = nr_bits; block->right_free = nr_bits; block->first_free = 0; block->nr_bits = nr_bits; } static void pcpu_init_md_blocks(struct pcpu_chunk *chunk) { struct pcpu_block_md *md_block; /* init the chunk's block */ pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk)); for (md_block = chunk->md_blocks; md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk); md_block++) pcpu_init_md_block(md_block, PCPU_BITMAP_BLOCK_BITS); } /** * pcpu_alloc_first_chunk - creates chunks that serve the first chunk * @tmp_addr: the start of the region served * @map_size: size of the region served * * This is responsible for creating the chunks that serve the first chunk. The * base_addr is page aligned down of @tmp_addr while the region end is page * aligned up. Offsets are kept track of to determine the region served. All * this is done to appease the bitmap allocator in avoiding partial blocks. * * RETURNS: * Chunk serving the region at @tmp_addr of @map_size. 
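 *
 * Worked example with invented numbers (PAGE_SIZE assumed to be 4096):
 * a @tmp_addr ending in 0x100 with a @map_size of 0x2000 gives
 *
 *	aligned_addr = tmp_addr & PAGE_MASK	(the 0x100 is dropped)
 *	start_offset = 0x100
 *	region_size  = ALIGN(0x100 + 0x2000, PAGE_SIZE) = 0x3000
 *	end_offset   = 0x3000 - 0x100 - 0x2000 = 0xf00
 *
 * so the chunk manages three whole pages while the 0x100 head and 0xf00
 * tail are hidden from the bitmap allocator through the offset fields.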
*/ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, int map_size) { struct pcpu_chunk *chunk; unsigned long aligned_addr; int start_offset, offset_bits, region_size, region_bits; size_t alloc_size; /* region calculations */ aligned_addr = tmp_addr & PAGE_MASK; start_offset = tmp_addr - aligned_addr; region_size = ALIGN(start_offset + map_size, PAGE_SIZE); /* allocate chunk */ alloc_size = struct_size(chunk, populated, BITS_TO_LONGS(region_size >> PAGE_SHIFT)); chunk = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); INIT_LIST_HEAD(&chunk->list); chunk->base_addr = (void *)aligned_addr; chunk->start_offset = start_offset; chunk->end_offset = region_size - chunk->start_offset - map_size; chunk->nr_pages = region_size >> PAGE_SHIFT; region_bits = pcpu_chunk_map_bits(chunk); alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]); chunk->alloc_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]); chunk->bound_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]); chunk->md_blocks = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); #ifdef NEED_PCPUOBJ_EXT /* first chunk is free to use */ chunk->obj_exts = NULL; #endif pcpu_init_md_blocks(chunk); /* manage populated page bitmap */ chunk->immutable = true; bitmap_fill(chunk->populated, chunk->nr_pages); chunk->nr_populated = chunk->nr_pages; chunk->nr_empty_pop_pages = chunk->nr_pages; chunk->free_bytes = map_size; if (chunk->start_offset) { /* hide the beginning of the bitmap */ offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE; bitmap_set(chunk->alloc_map, 0, offset_bits); set_bit(0, chunk->bound_map); set_bit(offset_bits, chunk->bound_map); chunk->chunk_md.first_free = offset_bits; pcpu_block_update_hint_alloc(chunk, 0, offset_bits); } if (chunk->end_offset) { /* hide the end of the bitmap */ offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE; bitmap_set(chunk->alloc_map, pcpu_chunk_map_bits(chunk) - offset_bits, offset_bits); set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE, chunk->bound_map); set_bit(region_bits, chunk->bound_map); pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk) - offset_bits, offset_bits); } return chunk; } static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp) { struct pcpu_chunk *chunk; int region_bits; chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp); if (!chunk) return NULL; INIT_LIST_HEAD(&chunk->list); chunk->nr_pages = pcpu_unit_pages; region_bits = pcpu_chunk_map_bits(chunk); chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]), gfp); if (!chunk->alloc_map) goto alloc_map_fail; chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]), gfp); if (!chunk->bound_map) goto bound_map_fail; chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]), gfp); if (!chunk->md_blocks) goto md_blocks_fail; #ifdef NEED_PCPUOBJ_EXT if (need_pcpuobj_ext()) { chunk->obj_exts = pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) * sizeof(struct pcpuobj_ext), gfp); if (!chunk->obj_exts) goto objcg_fail; } #endif pcpu_init_md_blocks(chunk); /* init metadata */ chunk->free_bytes = chunk->nr_pages * PAGE_SIZE; return chunk; #ifdef NEED_PCPUOBJ_EXT objcg_fail: pcpu_mem_free(chunk->md_blocks); #endif md_blocks_fail: pcpu_mem_free(chunk->bound_map); bound_map_fail: pcpu_mem_free(chunk->alloc_map); 
alloc_map_fail: pcpu_mem_free(chunk); return NULL; } static void pcpu_free_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; #ifdef NEED_PCPUOBJ_EXT pcpu_mem_free(chunk->obj_exts); #endif pcpu_mem_free(chunk->md_blocks); pcpu_mem_free(chunk->bound_map); pcpu_mem_free(chunk->alloc_map); pcpu_mem_free(chunk); } /** * pcpu_chunk_populated - post-population bookkeeping * @chunk: pcpu_chunk which got populated * @page_start: the start page * @page_end: the end page * * Pages in [@page_start,@page_end) have been populated to @chunk. Update * the bookkeeping information accordingly. Must be called after each * successful population. */ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start, int page_end) { int nr = page_end - page_start; lockdep_assert_held(&pcpu_lock); bitmap_set(chunk->populated, page_start, nr); chunk->nr_populated += nr; pcpu_nr_populated += nr; pcpu_update_empty_pages(chunk, nr); } /** * pcpu_chunk_depopulated - post-depopulation bookkeeping * @chunk: pcpu_chunk which got depopulated * @page_start: the start page * @page_end: the end page * * Pages in [@page_start,@page_end) have been depopulated from @chunk. * Update the bookkeeping information accordingly. Must be called after * each successful depopulation. */ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, int page_start, int page_end) { int nr = page_end - page_start; lockdep_assert_held(&pcpu_lock); bitmap_clear(chunk->populated, page_start, nr); chunk->nr_populated -= nr; pcpu_nr_populated -= nr; pcpu_update_empty_pages(chunk, -nr); } /* * Chunk management implementation. * * To allow different implementations, chunk alloc/free and * [de]population are implemented in a separate file which is pulled * into this file and compiled together. The following functions * should be implemented. * * pcpu_populate_chunk - populate the specified range of a chunk * pcpu_depopulate_chunk - depopulate the specified range of a chunk * pcpu_post_unmap_tlb_flush - flush tlb for the specified range of a chunk * pcpu_create_chunk - create a new chunk * pcpu_destroy_chunk - destroy a chunk, always preceded by full depop * pcpu_addr_to_page - translate address to physical address * pcpu_verify_alloc_info - check alloc_info is acceptable during init */ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end, gfp_t gfp); static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end); static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, int page_start, int page_end); static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp); static void pcpu_destroy_chunk(struct pcpu_chunk *chunk); static struct page *pcpu_addr_to_page(void *addr); static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai); #ifdef CONFIG_NEED_PER_CPU_KM #include "percpu-km.c" #else #include "percpu-vm.c" #endif /** * pcpu_chunk_addr_search - determine chunk containing specified address * @addr: address for which the chunk needs to be determined. * * This is an internal function that handles all but static allocations. * Static percpu address values should never be passed into the allocator. * * RETURNS: * The address of the found chunk. */ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) { /* is it in the dynamic region (first chunk)? */ if (pcpu_addr_in_chunk(pcpu_first_chunk, addr)) return pcpu_first_chunk; /* is it in the reserved region? 
*/ if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr)) return pcpu_reserved_chunk; /* * The address is relative to unit0 which might be unused and * thus unmapped. Offset the address to the unit space of the * current processor before looking it up in the vmalloc * space. Note that any possible cpu id can be used here, so * there's no need to worry about preemption or cpu hotplug. */ addr += pcpu_unit_offsets[raw_smp_processor_id()]; return pcpu_get_page_chunk(pcpu_addr_to_page(addr)); } #ifdef CONFIG_MEMCG static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp) { struct obj_cgroup *objcg; if (!memcg_kmem_online() || !(gfp & __GFP_ACCOUNT)) return true; objcg = current_obj_cgroup(); if (!objcg || obj_cgroup_is_root(objcg)) return true; if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) return false; *objcgp = objcg; return true; } static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, struct pcpu_chunk *chunk, int off, size_t size) { if (!objcg) return; if (likely(chunk && chunk->obj_exts)) { obj_cgroup_get(objcg); chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = objcg; rcu_read_lock(); mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, pcpu_obj_full_size(size)); rcu_read_unlock(); } else { obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); } } static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) { struct obj_cgroup *objcg; if (unlikely(!chunk->obj_exts)) return; objcg = chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup; if (!objcg) return; chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = NULL; obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); rcu_read_lock(); mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, -pcpu_obj_full_size(size)); rcu_read_unlock(); obj_cgroup_put(objcg); } #else /* CONFIG_MEMCG */ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp) { return true; } static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, struct pcpu_chunk *chunk, int off, size_t size) { } static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) { } #endif /* CONFIG_MEMCG */ #ifdef CONFIG_MEM_ALLOC_PROFILING static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off, size_t size) { if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts)) { alloc_tag_add(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, current->alloc_tag, size); } } static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size) { if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts)) alloc_tag_sub(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, size); } #else static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off, size_t size) { } static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size) { } #endif /** * pcpu_alloc - the percpu allocator * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * @reserved: allocate from the reserved chunk if available * @gfp: allocation flags * * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't * contain %GFP_KERNEL, the allocation is atomic. If @gfp has __GFP_NOWARN * then no warning will be triggered on invalid or failed allocation * requests. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. 
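 *
 * Callers normally reach this through the wrappers in
 * include/linux/percpu.h. A minimal usage sketch (the type and variable
 * names are made up for illustration):
 *
 *	struct my_stats { u64 packets; };
 *	struct my_stats __percpu *stats;
 *	unsigned int cpu;
 *	u64 total = 0;
 *
 *	stats = alloc_percpu(struct my_stats);
 *	if (!stats)
 *		return -ENOMEM;
 *
 *	get_cpu_ptr(stats)->packets++;	(update this CPU's copy)
 *	put_cpu_ptr(stats);
 *
 *	for_each_possible_cpu(cpu)	(fold all per-CPU copies)
 *		total += per_cpu_ptr(stats, cpu)->packets;
 *
 *	free_percpu(stats);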
*/ void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, gfp_t gfp) { gfp_t pcpu_gfp; bool is_atomic; bool do_warn; struct obj_cgroup *objcg = NULL; static atomic_t warn_limit = ATOMIC_INIT(10); struct pcpu_chunk *chunk, *next; const char *err; int slot, off, cpu, ret; unsigned long flags; void __percpu *ptr; size_t bits, bit_align; gfp = current_gfp_context(gfp); /* whitelisted flags that can be passed to the backing allocators */ pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); is_atomic = !gfpflags_allow_blocking(gfp); do_warn = !(gfp & __GFP_NOWARN); /* * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE, * therefore alignment must be a minimum of that many bytes. * An allocation may have internal fragmentation from rounding up * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes. */ if (unlikely(align < PCPU_MIN_ALLOC_SIZE)) align = PCPU_MIN_ALLOC_SIZE; size = ALIGN(size, PCPU_MIN_ALLOC_SIZE); bits = size >> PCPU_MIN_ALLOC_SHIFT; bit_align = align >> PCPU_MIN_ALLOC_SHIFT; if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE || !is_power_of_2(align))) { WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n", size, align); return NULL; } if (unlikely(!pcpu_memcg_pre_alloc_hook(size, gfp, &objcg))) return NULL; if (!is_atomic) { /* * pcpu_balance_workfn() allocates memory under this mutex, * and it may wait for memory reclaim. Allow current task * to become OOM victim, in case of memory pressure. */ if (gfp & __GFP_NOFAIL) { mutex_lock(&pcpu_alloc_mutex); } else if (mutex_lock_killable(&pcpu_alloc_mutex)) { pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size); return NULL; } } spin_lock_irqsave(&pcpu_lock, flags); /* serve reserved allocations from the reserved chunk if available */ if (reserved && pcpu_reserved_chunk) { chunk = pcpu_reserved_chunk; off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic); if (off < 0) { err = "alloc from reserved chunk failed"; goto fail_unlock; } off = pcpu_alloc_area(chunk, bits, bit_align, off); if (off >= 0) goto area_found; err = "alloc from reserved chunk failed"; goto fail_unlock; } restart: /* search through normal chunks */ for (slot = pcpu_size_to_slot(size); slot <= pcpu_free_slot; slot++) { list_for_each_entry_safe(chunk, next, &pcpu_chunk_lists[slot], list) { off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic); if (off < 0) { if (slot < PCPU_SLOT_FAIL_THRESHOLD) pcpu_chunk_move(chunk, 0); continue; } off = pcpu_alloc_area(chunk, bits, bit_align, off); if (off >= 0) { pcpu_reintegrate_chunk(chunk); goto area_found; } } } spin_unlock_irqrestore(&pcpu_lock, flags); if (is_atomic) { err = "atomic alloc failed, no space left"; goto fail; } /* No space left. Create a new chunk. 
*/ if (list_empty(&pcpu_chunk_lists[pcpu_free_slot])) { chunk = pcpu_create_chunk(pcpu_gfp); if (!chunk) { err = "failed to allocate new chunk"; goto fail; } spin_lock_irqsave(&pcpu_lock, flags); pcpu_chunk_relocate(chunk, -1); } else { spin_lock_irqsave(&pcpu_lock, flags); } goto restart; area_found: pcpu_stats_area_alloc(chunk, size); if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) pcpu_schedule_balance_work(); spin_unlock_irqrestore(&pcpu_lock, flags); /* populate if not all pages are already there */ if (!is_atomic) { unsigned int page_end, rs, re; rs = PFN_DOWN(off); page_end = PFN_UP(off + size); for_each_clear_bitrange_from(rs, re, chunk->populated, page_end) { WARN_ON(chunk->immutable); ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp); spin_lock_irqsave(&pcpu_lock, flags); if (ret) { pcpu_free_area(chunk, off); err = "failed to populate"; goto fail_unlock; } pcpu_chunk_populated(chunk, rs, re); spin_unlock_irqrestore(&pcpu_lock, flags); } mutex_unlock(&pcpu_alloc_mutex); } /* clear the areas and return address relative to base address */ for_each_possible_cpu(cpu) memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); kmemleak_alloc_percpu(ptr, size, gfp); trace_percpu_alloc_percpu(_RET_IP_, reserved, is_atomic, size, align, chunk->base_addr, off, ptr, pcpu_obj_full_size(size), gfp); pcpu_memcg_post_alloc_hook(objcg, chunk, off, size); pcpu_alloc_tag_alloc_hook(chunk, off, size); return ptr; fail_unlock: spin_unlock_irqrestore(&pcpu_lock, flags); fail: trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align); if (do_warn) { int remaining = atomic_dec_if_positive(&warn_limit); if (remaining >= 0) { pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n", size, align, is_atomic, err); if (!is_atomic) dump_stack(); if (remaining == 0) pr_info("limit reached, disable warning\n"); } } if (is_atomic) { /* see the flag handling in pcpu_balance_workfn() */ pcpu_atomic_alloc_failed = true; pcpu_schedule_balance_work(); } else { mutex_unlock(&pcpu_alloc_mutex); } pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size); return NULL; } EXPORT_SYMBOL_GPL(pcpu_alloc_noprof); /** * pcpu_balance_free - manage the amount of free chunks * @empty_only: free chunks only if there are no populated pages * * If empty_only is %false, reclaim all fully free chunks regardless of the * number of populated pages. Otherwise, only reclaim chunks that have no * populated pages. * * CONTEXT: * pcpu_lock (can be dropped temporarily) */ static void pcpu_balance_free(bool empty_only) { LIST_HEAD(to_free); struct list_head *free_head = &pcpu_chunk_lists[pcpu_free_slot]; struct pcpu_chunk *chunk, *next; lockdep_assert_held(&pcpu_lock); /* * There's no reason to keep around multiple unused chunks and VM * areas can be scarce. Destroy all free chunks except for one. 
*/ list_for_each_entry_safe(chunk, next, free_head, list) { WARN_ON(chunk->immutable); /* spare the first one */ if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) continue; if (!empty_only || chunk->nr_empty_pop_pages == 0) list_move(&chunk->list, &to_free); } if (list_empty(&to_free)) return; spin_unlock_irq(&pcpu_lock); list_for_each_entry_safe(chunk, next, &to_free, list) { unsigned int rs, re; for_each_set_bitrange(rs, re, chunk->populated, chunk->nr_pages) { pcpu_depopulate_chunk(chunk, rs, re); spin_lock_irq(&pcpu_lock); pcpu_chunk_depopulated(chunk, rs, re); spin_unlock_irq(&pcpu_lock); } pcpu_destroy_chunk(chunk); cond_resched(); } spin_lock_irq(&pcpu_lock); } /** * pcpu_balance_populated - manage the amount of populated pages * * Maintain a certain amount of populated pages to satisfy atomic allocations. * It is possible that this is called when physical memory is scarce causing * OOM killer to be triggered. We should avoid doing so until an actual * allocation causes the failure as it is possible that requests can be * serviced from already backed regions. * * CONTEXT: * pcpu_lock (can be dropped temporarily) */ static void pcpu_balance_populated(void) { /* gfp flags passed to underlying allocators */ const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; struct pcpu_chunk *chunk; int slot, nr_to_pop, ret; lockdep_assert_held(&pcpu_lock); /* * Ensure there are certain number of free populated pages for * atomic allocs. Fill up from the most packed so that atomic * allocs don't increase fragmentation. If atomic allocation * failed previously, always populate the maximum amount. This * should prevent atomic allocs larger than PAGE_SIZE from keeping * failing indefinitely; however, large atomic allocs are not * something we support properly and can be highly unreliable and * inefficient. */ retry_pop: if (pcpu_atomic_alloc_failed) { nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH; /* best effort anyway, don't worry about synchronization */ pcpu_atomic_alloc_failed = false; } else { nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH - pcpu_nr_empty_pop_pages, 0, PCPU_EMPTY_POP_PAGES_HIGH); } for (slot = pcpu_size_to_slot(PAGE_SIZE); slot <= pcpu_free_slot; slot++) { unsigned int nr_unpop = 0, rs, re; if (!nr_to_pop) break; list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list) { nr_unpop = chunk->nr_pages - chunk->nr_populated; if (nr_unpop) break; } if (!nr_unpop) continue; /* @chunk can't go away while pcpu_alloc_mutex is held */ for_each_clear_bitrange(rs, re, chunk->populated, chunk->nr_pages) { int nr = min_t(int, re - rs, nr_to_pop); spin_unlock_irq(&pcpu_lock); ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp); cond_resched(); spin_lock_irq(&pcpu_lock); if (!ret) { nr_to_pop -= nr; pcpu_chunk_populated(chunk, rs, rs + nr); } else { nr_to_pop = 0; } if (!nr_to_pop) break; } } if (nr_to_pop) { /* ran out of chunks to populate, create a new one and retry */ spin_unlock_irq(&pcpu_lock); chunk = pcpu_create_chunk(gfp); cond_resched(); spin_lock_irq(&pcpu_lock); if (chunk) { pcpu_chunk_relocate(chunk, -1); goto retry_pop; } } } /** * pcpu_reclaim_populated - scan over to_depopulate chunks and free empty pages * * Scan over chunks in the depopulate list and try to release unused populated * pages back to the system. Depopulated chunks are sidelined to prevent * repopulating these pages unless required. Fully free chunks are reintegrated * and freed accordingly (1 is kept around). 
If we drop below the empty * populated pages threshold, reintegrate the chunk if it has empty free pages. * Each chunk is scanned in the reverse order to keep populated pages close to * the beginning of the chunk. * * CONTEXT: * pcpu_lock (can be dropped temporarily) * */ static void pcpu_reclaim_populated(void) { struct pcpu_chunk *chunk; struct pcpu_block_md *block; int freed_page_start, freed_page_end; int i, end; bool reintegrate; lockdep_assert_held(&pcpu_lock); /* * Once a chunk is isolated to the to_depopulate list, the chunk is no * longer discoverable to allocations whom may populate pages. The only * other accessor is the free path which only returns area back to the * allocator not touching the populated bitmap. */ while ((chunk = list_first_entry_or_null( &pcpu_chunk_lists[pcpu_to_depopulate_slot], struct pcpu_chunk, list))) { WARN_ON(chunk->immutable); /* * Scan chunk's pages in the reverse order to keep populated * pages close to the beginning of the chunk. */ freed_page_start = chunk->nr_pages; freed_page_end = 0; reintegrate = false; for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) { /* no more work to do */ if (chunk->nr_empty_pop_pages == 0) break; /* reintegrate chunk to prevent atomic alloc failures */ if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) { reintegrate = true; break; } /* * If the page is empty and populated, start or * extend the (i, end) range. If i == 0, decrease * i and perform the depopulation to cover the last * (first) page in the chunk. */ block = chunk->md_blocks + i; if (block->contig_hint == PCPU_BITMAP_BLOCK_BITS && test_bit(i, chunk->populated)) { if (end == -1) end = i; if (i > 0) continue; i--; } /* depopulate if there is an active range */ if (end == -1) continue; spin_unlock_irq(&pcpu_lock); pcpu_depopulate_chunk(chunk, i + 1, end + 1); cond_resched(); spin_lock_irq(&pcpu_lock); pcpu_chunk_depopulated(chunk, i + 1, end + 1); freed_page_start = min(freed_page_start, i + 1); freed_page_end = max(freed_page_end, end + 1); /* reset the range and continue */ end = -1; } /* batch tlb flush per chunk to amortize cost */ if (freed_page_start < freed_page_end) { spin_unlock_irq(&pcpu_lock); pcpu_post_unmap_tlb_flush(chunk, freed_page_start, freed_page_end); cond_resched(); spin_lock_irq(&pcpu_lock); } if (reintegrate || chunk->free_bytes == pcpu_unit_size) pcpu_reintegrate_chunk(chunk); else list_move_tail(&chunk->list, &pcpu_chunk_lists[pcpu_sidelined_slot]); } } /** * pcpu_balance_workfn - manage the amount of free chunks and populated pages * @work: unused * * For each chunk type, manage the number of fully free chunks and the number of * populated pages. An important thing to consider is when pages are freed and * how they contribute to the global counts. */ static void pcpu_balance_workfn(struct work_struct *work) { /* * pcpu_balance_free() is called twice because the first time we may * trim pages in the active pcpu_nr_empty_pop_pages which may cause us * to grow other chunks. This then gives pcpu_reclaim_populated() time * to move fully free chunks to the active list to be freed if * appropriate. 
* * Enforce GFP_NOIO allocations because we have pcpu_alloc users * constrained to GFP_NOIO/NOFS contexts and they could form lock * dependency through pcpu_alloc_mutex */ unsigned int flags = memalloc_noio_save(); mutex_lock(&pcpu_alloc_mutex); spin_lock_irq(&pcpu_lock); pcpu_balance_free(false); pcpu_reclaim_populated(); pcpu_balance_populated(); pcpu_balance_free(true); spin_unlock_irq(&pcpu_lock); mutex_unlock(&pcpu_alloc_mutex); memalloc_noio_restore(flags); } /** * free_percpu - free percpu area * @ptr: pointer to area to free * * Free percpu area @ptr. * * CONTEXT: * Can be called from atomic context. */ void free_percpu(void __percpu *ptr) { void *addr; struct pcpu_chunk *chunk; unsigned long flags; int size, off; bool need_balance = false; if (!ptr) return; kmemleak_free_percpu(ptr); addr = __pcpu_ptr_to_addr(ptr); chunk = pcpu_chunk_addr_search(addr); off = addr - chunk->base_addr; spin_lock_irqsave(&pcpu_lock, flags); size = pcpu_free_area(chunk, off); if (size == 0) { spin_unlock_irqrestore(&pcpu_lock, flags); /* invalid percpu free */ WARN_ON_ONCE(1); return; } pcpu_alloc_tag_free_hook(chunk, off, size); pcpu_memcg_free_hook(chunk, off, size); /* * If there are more than one fully free chunks, wake up grim reaper. * If the chunk is isolated, it may be in the process of being * reclaimed. Let reclaim manage cleaning up of that chunk. */ if (!chunk->isolated && chunk->free_bytes == pcpu_unit_size) { struct pcpu_chunk *pos; list_for_each_entry(pos, &pcpu_chunk_lists[pcpu_free_slot], list) if (pos != chunk) { need_balance = true; break; } } else if (pcpu_should_reclaim_chunk(chunk)) { pcpu_isolate_chunk(chunk); need_balance = true; } trace_percpu_free_percpu(chunk->base_addr, off, ptr); spin_unlock_irqrestore(&pcpu_lock, flags); if (need_balance) pcpu_schedule_balance_work(); } EXPORT_SYMBOL_GPL(free_percpu); bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr) { #ifdef CONFIG_SMP const size_t static_size = __per_cpu_end - __per_cpu_start; void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); unsigned int cpu; for_each_possible_cpu(cpu) { void *start = per_cpu_ptr(base, cpu); void *va = (void *)addr; if (va >= start && va < start + static_size) { if (can_addr) { *can_addr = (unsigned long) (va - start); *can_addr += (unsigned long) per_cpu_ptr(base, get_boot_cpu_id()); } return true; } } #endif /* on UP, can't distinguish from other static vars, always false */ return false; } /** * is_kernel_percpu_address - test whether address is from static percpu area * @addr: address to test * * Test whether @addr belongs to in-kernel static percpu area. Module * static percpu areas are not considered. For those, use * is_module_percpu_address(). * * RETURNS: * %true if @addr is from in-kernel static percpu area, %false otherwise. */ bool is_kernel_percpu_address(unsigned long addr) { return __is_kernel_percpu_address(addr, NULL); } /** * per_cpu_ptr_to_phys - convert translated percpu address to physical address * @addr: the address to be converted to physical address * * Given @addr which is dereferenceable address obtained via one of * percpu access macros, this function translates it into its physical * address. The caller is responsible for ensuring @addr stays valid * until this function finishes. * * percpu allocator has special setup for the first chunk, which currently * supports either embedding in linear address space or vmalloc mapping, * and, from the second one, the backing allocator (currently either vm or * km) provides translation. 
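 *
 * For instance (illustrative only), translating one CPU's copy of a
 * dynamically allocated percpu object:
 *
 *	unsigned long __percpu *p = alloc_percpu(unsigned long);
 *	phys_addr_t pa = per_cpu_ptr_to_phys(per_cpu_ptr(p, cpu));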
* * The addr can be translated simply without checking if it falls into the * first chunk. But the current code reflects better how percpu allocator * actually works, and the verification can discover both bugs in percpu * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current * code. * * RETURNS: * The physical address for @addr. */ phys_addr_t per_cpu_ptr_to_phys(void *addr) { void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); bool in_first_chunk = false; unsigned long first_low, first_high; unsigned int cpu; /* * The following test on unit_low/high isn't strictly * necessary but will speed up lookups of addresses which * aren't in the first chunk. * * The address check is against full chunk sizes. pcpu_base_addr * points to the beginning of the first chunk including the * static region. Assumes good intent as the first chunk may * not be full (ie. < pcpu_unit_pages in size). */ first_low = (unsigned long)pcpu_base_addr + pcpu_unit_page_offset(pcpu_low_unit_cpu, 0); first_high = (unsigned long)pcpu_base_addr + pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages); if ((unsigned long)addr >= first_low && (unsigned long)addr < first_high) { for_each_possible_cpu(cpu) { void *start = per_cpu_ptr(base, cpu); if (addr >= start && addr < start + pcpu_unit_size) { in_first_chunk = true; break; } } } if (in_first_chunk) { if (!is_vmalloc_addr(addr)) return __pa(addr); else return page_to_phys(vmalloc_to_page(addr)) + offset_in_page(addr); } else return page_to_phys(pcpu_addr_to_page(addr)) + offset_in_page(addr); } /** * pcpu_alloc_alloc_info - allocate percpu allocation info * @nr_groups: the number of groups * @nr_units: the number of units * * Allocate ai which is large enough for @nr_groups groups containing * @nr_units units. The returned ai's groups[0].cpu_map points to the * cpu_map array which is long enough for @nr_units and filled with * NR_CPUS. It's the caller's responsibility to initialize cpu_map * pointer of other groups. * * RETURNS: * Pointer to the allocated pcpu_alloc_info on success, NULL on * failure. */ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, int nr_units) { struct pcpu_alloc_info *ai; size_t base_size, ai_size; void *ptr; int unit; base_size = ALIGN(struct_size(ai, groups, nr_groups), __alignof__(ai->groups[0].cpu_map[0])); ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); ptr = memblock_alloc(PFN_ALIGN(ai_size), PAGE_SIZE); if (!ptr) return NULL; ai = ptr; ptr += base_size; ai->groups[0].cpu_map = ptr; for (unit = 0; unit < nr_units; unit++) ai->groups[0].cpu_map[unit] = NR_CPUS; ai->nr_groups = nr_groups; ai->__ai_size = PFN_ALIGN(ai_size); return ai; } /** * pcpu_free_alloc_info - free percpu allocation info * @ai: pcpu_alloc_info to free * * Free @ai which was allocated by pcpu_alloc_alloc_info(). */ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) { memblock_free(ai, ai->__ai_size); } /** * pcpu_dump_alloc_info - print out information about pcpu_alloc_info * @lvl: loglevel * @ai: allocation info to dump * * Print out information about @ai using loglevel @lvl. 
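 *
 * The layout follows the printk()s below; a boot log produced by this
 * function looks roughly like (all numbers invented for illustration,
 * 8 possible CPUs in a single group):
 *
 *	pcpu-alloc: s196608 r8192 d32768 u262144 alloc=1*2097152
 *	pcpu-alloc: [0] 0 1 2 3 4 5 6 7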
*/ static void pcpu_dump_alloc_info(const char *lvl, const struct pcpu_alloc_info *ai) { int group_width = 1, cpu_width = 1, width; char empty_str[] = "--------"; int alloc = 0, alloc_end = 0; int group, v; int upa, apl; /* units per alloc, allocs per line */ v = ai->nr_groups; while (v /= 10) group_width++; v = num_possible_cpus(); while (v /= 10) cpu_width++; empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0'; upa = ai->alloc_size / ai->unit_size; width = upa * (cpu_width + 1) + group_width + 3; apl = rounddown_pow_of_two(max(60 / width, 1)); printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu", lvl, ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size); for (group = 0; group < ai->nr_groups; group++) { const struct pcpu_group_info *gi = &ai->groups[group]; int unit = 0, unit_end = 0; BUG_ON(gi->nr_units % upa); for (alloc_end += gi->nr_units / upa; alloc < alloc_end; alloc++) { if (!(alloc % apl)) { pr_cont("\n"); printk("%spcpu-alloc: ", lvl); } pr_cont("[%0*d] ", group_width, group); for (unit_end += upa; unit < unit_end; unit++) if (gi->cpu_map[unit] != NR_CPUS) pr_cont("%0*d ", cpu_width, gi->cpu_map[unit]); else pr_cont("%s ", empty_str); } } pr_cont("\n"); } /** * pcpu_setup_first_chunk - initialize the first percpu chunk * @ai: pcpu_alloc_info describing how to percpu area is shaped * @base_addr: mapped address * * Initialize the first percpu chunk which contains the kernel static * percpu area. This function is to be called from arch percpu area * setup path. * * @ai contains all information necessary to initialize the first * chunk and prime the dynamic percpu allocator. * * @ai->static_size is the size of static percpu area. * * @ai->reserved_size, if non-zero, specifies the amount of bytes to * reserve after the static area in the first chunk. This reserves * the first chunk such that it's available only through reserved * percpu allocation. This is primarily used to serve module percpu * static areas on architectures where the addressing model has * limited offset range for symbol relocations to guarantee module * percpu symbols fall inside the relocatable range. * * @ai->dyn_size determines the number of bytes available for dynamic * allocation in the first chunk. The area between @ai->static_size + * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused. * * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE * and equal to or larger than @ai->static_size + @ai->reserved_size + * @ai->dyn_size. * * @ai->atom_size is the allocation atom size and used as alignment * for vm areas. * * @ai->alloc_size is the allocation size and always multiple of * @ai->atom_size. This is larger than @ai->atom_size if * @ai->unit_size is larger than @ai->atom_size. * * @ai->nr_groups and @ai->groups describe virtual memory layout of * percpu areas. Units which should be colocated are put into the * same group. Dynamic VM areas will be allocated according to these * groupings. If @ai->nr_groups is zero, a single group containing * all units is assumed. * * The caller should have mapped the first chunk at @base_addr and * copied static data to each unit. * * The first chunk will always contain a static and a dynamic region. * However, the static region is not managed by any chunk. If the first * chunk also contains a reserved region, it is served by two chunks - * one for the reserved region and one for the dynamic region. 
They * share the same vm, but use offset regions in the area allocation map. * The chunk serving the dynamic region is circulated in the chunk slots * and available for dynamic allocation like any other chunk. */ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, void *base_addr) { size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; size_t static_size, dyn_size; unsigned long *group_offsets; size_t *group_sizes; unsigned long *unit_off; unsigned int cpu; int *unit_map; int group, unit, i; unsigned long tmp_addr; size_t alloc_size; #define PCPU_SETUP_BUG_ON(cond) do { \ if (unlikely(cond)) { \ pr_emerg("failed to initialize, %s\n", #cond); \ pr_emerg("cpu_possible_mask=%*pb\n", \ cpumask_pr_args(cpu_possible_mask)); \ pcpu_dump_alloc_info(KERN_EMERG, ai); \ BUG(); \ } \ } while (0) /* sanity checks */ PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); #ifdef CONFIG_SMP PCPU_SETUP_BUG_ON(!ai->static_size); PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start)); #endif PCPU_SETUP_BUG_ON(!base_addr); PCPU_SETUP_BUG_ON(offset_in_page(base_addr)); PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size)); PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE)); PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE); PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE)); PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) || IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE))); PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); /* process group information and build config tables accordingly */ alloc_size = ai->nr_groups * sizeof(group_offsets[0]); group_offsets = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = ai->nr_groups * sizeof(group_sizes[0]); group_sizes = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = nr_cpu_ids * sizeof(unit_map[0]); unit_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = nr_cpu_ids * sizeof(unit_off[0]); unit_off = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = UINT_MAX; pcpu_low_unit_cpu = NR_CPUS; pcpu_high_unit_cpu = NR_CPUS; for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { const struct pcpu_group_info *gi = &ai->groups[group]; group_offsets[group] = gi->base_offset; group_sizes[group] = gi->nr_units * ai->unit_size; for (i = 0; i < gi->nr_units; i++) { cpu = gi->cpu_map[i]; if (cpu == NR_CPUS) continue; PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids); PCPU_SETUP_BUG_ON(!cpu_possible(cpu)); PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX); unit_map[cpu] = unit + i; unit_off[cpu] = gi->base_offset + i * ai->unit_size; /* determine low/high unit_cpu */ if (pcpu_low_unit_cpu == NR_CPUS || unit_off[cpu] < unit_off[pcpu_low_unit_cpu]) pcpu_low_unit_cpu = cpu; if (pcpu_high_unit_cpu == NR_CPUS || unit_off[cpu] > unit_off[pcpu_high_unit_cpu]) pcpu_high_unit_cpu = cpu; } } pcpu_nr_units = unit; for_each_possible_cpu(cpu) PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX); /* we're done parsing the input, undefine BUG macro and dump config */ #undef PCPU_SETUP_BUG_ON pcpu_dump_alloc_info(KERN_DEBUG, ai); pcpu_nr_groups = ai->nr_groups; pcpu_group_offsets = group_offsets; pcpu_group_sizes = group_sizes; pcpu_unit_map = unit_map; pcpu_unit_offsets = unit_off; /* determine basic parameters */ pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_atom_size = 
ai->atom_size; pcpu_chunk_struct_size = struct_size((struct pcpu_chunk *)0, populated, BITS_TO_LONGS(pcpu_unit_pages)); pcpu_stats_save_ai(ai); /* * Allocate chunk slots. The slots after the active slots are: * sidelined_slot - isolated, depopulated chunks * free_slot - fully free chunks * to_depopulate_slot - isolated, chunks to depopulate */ pcpu_sidelined_slot = __pcpu_size_to_slot(pcpu_unit_size) + 1; pcpu_free_slot = pcpu_sidelined_slot + 1; pcpu_to_depopulate_slot = pcpu_free_slot + 1; pcpu_nr_slots = pcpu_to_depopulate_slot + 1; pcpu_chunk_lists = memblock_alloc_or_panic(pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]), SMP_CACHE_BYTES); for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_chunk_lists[i]); /* * The end of the static region needs to be aligned with the * minimum allocation size as this offsets the reserved and * dynamic region. The first chunk ends page aligned by * expanding the dynamic region, therefore the dynamic region * can be shrunk to compensate while still staying above the * configured sizes. */ static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE); dyn_size = ai->dyn_size - (static_size - ai->static_size); /* * Initialize first chunk: * This chunk is broken up into 3 parts: * < static | [reserved] | dynamic > * - static - there is no backing chunk because these allocations can * never be freed. * - reserved (pcpu_reserved_chunk) - exists primarily to serve * allocations from module load. * - dynamic (pcpu_first_chunk) - serves the dynamic part of the first * chunk. */ tmp_addr = (unsigned long)base_addr + static_size; if (ai->reserved_size) pcpu_reserved_chunk = pcpu_alloc_first_chunk(tmp_addr, ai->reserved_size); tmp_addr = (unsigned long)base_addr + static_size + ai->reserved_size; pcpu_first_chunk = pcpu_alloc_first_chunk(tmp_addr, dyn_size); pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages; pcpu_chunk_relocate(pcpu_first_chunk, -1); /* include all regions of the first chunk */ pcpu_nr_populated += PFN_DOWN(size_sum); pcpu_stats_chunk_alloc(); trace_percpu_create_chunk(base_addr); /* we're done */ pcpu_base_addr = base_addr; } #ifdef CONFIG_SMP const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = { [PCPU_FC_AUTO] = "auto", [PCPU_FC_EMBED] = "embed", [PCPU_FC_PAGE] = "page", }; enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; static int __init percpu_alloc_setup(char *str) { if (!str) return -EINVAL; if (0) /* nada */; #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK else if (!strcmp(str, "embed")) pcpu_chosen_fc = PCPU_FC_EMBED; #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK else if (!strcmp(str, "page")) pcpu_chosen_fc = PCPU_FC_PAGE; #endif else pr_warn("unknown allocator %s specified\n", str); return 0; } early_param("percpu_alloc", percpu_alloc_setup); /* * pcpu_embed_first_chunk() is used by the generic percpu setup. * Build it if needed by the arch config or the generic setup is going * to be used. 
*/ #if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) #define BUILD_EMBED_FIRST_CHUNK #endif /* build pcpu_page_first_chunk() iff needed by the arch config */ #if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) #define BUILD_PAGE_FIRST_CHUNK #endif /* pcpu_build_alloc_info() is used by both embed and page first chunk */ #if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK) /** * pcpu_build_alloc_info - build alloc_info considering distances between CPUs * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: minimum free size for dynamic allocation in bytes * @atom_size: allocation atom size * @cpu_distance_fn: callback to determine distance between cpus, optional * * This function determines grouping of units, their mappings to cpus * and other parameters considering needed percpu size, allocation * atom size and distances between CPUs. * * Groups are always multiples of atom size and CPUs which are of * LOCAL_DISTANCE both ways are grouped together and share space for * units in the same group. The returned configuration is guaranteed * to have CPUs on different nodes on different groups and >=75% usage * of allocated virtual address space. * * RETURNS: * On success, pointer to the new allocation_info is returned. On * failure, ERR_PTR value is returned. */ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info( size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn) { static int group_map[NR_CPUS] __initdata; static int group_cnt[NR_CPUS] __initdata; static struct cpumask mask __initdata; const size_t static_size = __per_cpu_end - __per_cpu_start; int nr_groups = 1, nr_units = 0; size_t size_sum, min_unit_size, alloc_size; int upa, max_upa, best_upa; /* units_per_alloc */ int last_allocs, group, unit; unsigned int cpu, tcpu; struct pcpu_alloc_info *ai; unsigned int *cpu_map; /* this function may be called multiple times */ memset(group_map, 0, sizeof(group_map)); memset(group_cnt, 0, sizeof(group_cnt)); cpumask_clear(&mask); /* calculate size_sum and ensure dyn_size is enough for early alloc */ size_sum = PFN_ALIGN(static_size + reserved_size + max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE)); dyn_size = size_sum - static_size - reserved_size; /* * Determine min_unit_size, alloc_size and max_upa such that * alloc_size is multiple of atom_size and is the smallest * which can accommodate 4k aligned segments which are equal to * or larger than min_unit_size. */ min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); /* determine the maximum # of units that can fit in an allocation */ alloc_size = roundup(min_unit_size, atom_size); upa = alloc_size / min_unit_size; while (alloc_size % upa || (offset_in_page(alloc_size / upa))) upa--; max_upa = upa; cpumask_copy(&mask, cpu_possible_mask); /* group cpus according to their proximity */ for (group = 0; !cpumask_empty(&mask); group++) { /* pop the group's first cpu */ cpu = cpumask_first(&mask); group_map[cpu] = group; group_cnt[group]++; cpumask_clear_cpu(cpu, &mask); for_each_cpu(tcpu, &mask) { if (!cpu_distance_fn || (cpu_distance_fn(cpu, tcpu) == LOCAL_DISTANCE && cpu_distance_fn(tcpu, cpu) == LOCAL_DISTANCE)) { group_map[tcpu] = group; group_cnt[group]++; cpumask_clear_cpu(tcpu, &mask); } } } nr_groups = group; /* * Wasted space is caused by a ratio imbalance of upa to group_cnt. * Expand the unit_size until we use >= 75% of the units allocated. 
* Related to atom_size, which could be much larger than the unit_size. */ last_allocs = INT_MAX; best_upa = 0; for (upa = max_upa; upa; upa--) { int allocs = 0, wasted = 0; if (alloc_size % upa || (offset_in_page(alloc_size / upa))) continue; for (group = 0; group < nr_groups; group++) { int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); allocs += this_allocs; wasted += this_allocs * upa - group_cnt[group]; } /* * Don't accept if wastage is over 1/3. The * greater-than comparison ensures upa==1 always * passes the following check. */ if (wasted > num_possible_cpus() / 3) continue; /* and then don't consume more memory */ if (allocs > last_allocs) break; last_allocs = allocs; best_upa = upa; } BUG_ON(!best_upa); upa = best_upa; /* allocate and fill alloc_info */ for (group = 0; group < nr_groups; group++) nr_units += roundup(group_cnt[group], upa); ai = pcpu_alloc_alloc_info(nr_groups, nr_units); if (!ai) return ERR_PTR(-ENOMEM); cpu_map = ai->groups[0].cpu_map; for (group = 0; group < nr_groups; group++) { ai->groups[group].cpu_map = cpu_map; cpu_map += roundup(group_cnt[group], upa); } ai->static_size = static_size; ai->reserved_size = reserved_size; ai->dyn_size = dyn_size; ai->unit_size = alloc_size / upa; ai->atom_size = atom_size; ai->alloc_size = alloc_size; for (group = 0, unit = 0; group < nr_groups; group++) { struct pcpu_group_info *gi = &ai->groups[group]; /* * Initialize base_offset as if all groups are located * back-to-back. The caller should update this to * reflect actual allocation. */ gi->base_offset = unit * ai->unit_size; for_each_possible_cpu(cpu) if (group_map[cpu] == group) gi->cpu_map[gi->nr_units++] = cpu; gi->nr_units = roundup(gi->nr_units, upa); unit += gi->nr_units; } BUG_ON(unit != nr_units); return ai; } static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { const unsigned long goal = __pa(MAX_DMA_ADDRESS); #ifdef CONFIG_NUMA int node = NUMA_NO_NODE; void *ptr; if (cpu_to_nd_fn) node = cpu_to_nd_fn(cpu); if (node == NUMA_NO_NODE || !node_online(node) || !NODE_DATA(node)) { ptr = memblock_alloc_from(size, align, goal); pr_info("cpu %d has no node %d or node-local memory\n", cpu, node); pr_debug("per cpu data for cpu%d %zu bytes at 0x%llx\n", cpu, size, (u64)__pa(ptr)); } else { ptr = memblock_alloc_try_nid(size, align, goal, MEMBLOCK_ALLOC_ACCESSIBLE, node); pr_debug("per cpu data for cpu%d %zu bytes on node%d at 0x%llx\n", cpu, size, node, (u64)__pa(ptr)); } return ptr; #else return memblock_alloc_from(size, align, goal); #endif } static void __init pcpu_fc_free(void *ptr, size_t size) { memblock_free(ptr, size); } #endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */ #if defined(BUILD_EMBED_FIRST_CHUNK) /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: minimum free size for dynamic allocation in bytes * @atom_size: allocation atom size * @cpu_distance_fn: callback to determine distance between cpus, optional * @cpu_to_nd_fn: callback to convert cpu to it's node, optional * * This is a helper to ease setting up embedded first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. * * If this function is used to setup the first chunk, it is allocated * by calling pcpu_fc_alloc and used as-is without being mapped into * vmalloc area. Allocations are always whole multiples of @atom_size * aligned to @atom_size. 
* * This enables the first chunk to piggy back on the linear physical * mapping which often uses larger page size. Please note that this * can result in very sparse cpu->unit mapping on NUMA machines thus * requiring large vmalloc address space. Don't use this allocator if * vmalloc space is not orders of magnitude larger than distances * between node memory addresses (ie. 32bit NUMA machines). * * @dyn_size specifies the minimum dynamic area size. * * If the needed size is smaller than the minimum or specified unit * size, the leftover is returned using pcpu_fc_free. * * RETURNS: * 0 on success, -errno on failure. */ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { void *base = (void *)ULONG_MAX; void **areas = NULL; struct pcpu_alloc_info *ai; size_t size_sum, areas_size; unsigned long max_distance; int group, i, highest_group, rc = 0; ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, cpu_distance_fn); if (IS_ERR(ai)) return PTR_ERR(ai); size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); areas = memblock_alloc(areas_size, SMP_CACHE_BYTES); if (!areas) { rc = -ENOMEM; goto out_free; } /* allocate, copy and determine base address & max_distance */ highest_group = 0; for (group = 0; group < ai->nr_groups; group++) { struct pcpu_group_info *gi = &ai->groups[group]; unsigned int cpu = NR_CPUS; void *ptr; for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++) cpu = gi->cpu_map[i]; BUG_ON(cpu == NR_CPUS); /* allocate space for the whole group */ ptr = pcpu_fc_alloc(cpu, gi->nr_units * ai->unit_size, atom_size, cpu_to_nd_fn); if (!ptr) { rc = -ENOMEM; goto out_free_areas; } /* kmemleak tracks the percpu allocations separately */ kmemleak_ignore_phys(__pa(ptr)); areas[group] = ptr; base = min(ptr, base); if (ptr > areas[highest_group]) highest_group = group; } max_distance = areas[highest_group] - base; max_distance += ai->unit_size * ai->groups[highest_group].nr_units; /* warn if maximum distance is further than 75% of vmalloc space */ if (max_distance > VMALLOC_TOTAL * 3 / 4) { pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n", max_distance, VMALLOC_TOTAL); #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /* and fail if we have fallback */ rc = -EINVAL; goto out_free_areas; #endif } /* * Copy data and free unused parts. This should happen after all * allocations are complete; otherwise, we may end up with * overlapping groups. 
*/ for (group = 0; group < ai->nr_groups; group++) { struct pcpu_group_info *gi = &ai->groups[group]; void *ptr = areas[group]; for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { if (gi->cpu_map[i] == NR_CPUS) { /* unused unit, free whole */ pcpu_fc_free(ptr, ai->unit_size); continue; } /* copy and return the unused part */ memcpy(ptr, __per_cpu_start, ai->static_size); pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum); } } /* base address is now known, determine group base offsets */ for (group = 0; group < ai->nr_groups; group++) { ai->groups[group].base_offset = areas[group] - base; } pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n", PFN_DOWN(size_sum), ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); pcpu_setup_first_chunk(ai, base); goto out_free; out_free_areas: for (group = 0; group < ai->nr_groups; group++) if (areas[group]) pcpu_fc_free(areas[group], ai->groups[group].nr_units * ai->unit_size); out_free: pcpu_free_alloc_info(ai); if (areas) memblock_free(areas, areas_size); return rc; } #endif /* BUILD_EMBED_FIRST_CHUNK */ #ifdef BUILD_PAGE_FIRST_CHUNK #include <linux/pgalloc.h> #ifndef P4D_TABLE_SIZE #define P4D_TABLE_SIZE PAGE_SIZE #endif #ifndef PUD_TABLE_SIZE #define PUD_TABLE_SIZE PAGE_SIZE #endif #ifndef PMD_TABLE_SIZE #define PMD_TABLE_SIZE PAGE_SIZE #endif #ifndef PTE_TABLE_SIZE #define PTE_TABLE_SIZE PAGE_SIZE #endif void __init __weak pcpu_populate_pte(unsigned long addr) { pgd_t *pgd = pgd_offset_k(addr); p4d_t *p4d; pud_t *pud; pmd_t *pmd; if (pgd_none(*pgd)) { p4d = memblock_alloc_or_panic(P4D_TABLE_SIZE, P4D_TABLE_SIZE); pgd_populate_kernel(addr, pgd, p4d); } p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) { pud = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE); p4d_populate_kernel(addr, p4d, pud); } pud = pud_offset(p4d, addr); if (pud_none(*pud)) { pmd = memblock_alloc_or_panic(PMD_TABLE_SIZE, PMD_TABLE_SIZE); pud_populate(&init_mm, pud, pmd); } pmd = pmd_offset(pud, addr); if (!pmd_present(*pmd)) { pte_t *new; new = memblock_alloc_or_panic(PTE_TABLE_SIZE, PTE_TABLE_SIZE); pmd_populate_kernel(&init_mm, pmd, new); } return; } /** * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @reserved_size: the size of reserved percpu area in bytes * @cpu_to_nd_fn: callback to convert cpu to it's node, optional * * This is a helper to ease setting up page-remapped first percpu * chunk and can be called where pcpu_setup_first_chunk() is expected. * * This is the basic allocator. Static percpu area is allocated * page-by-page into vmalloc area. * * RETURNS: * 0 on success, -errno on failure. 
*/ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { static struct vm_struct vm; struct pcpu_alloc_info *ai; char psize_str[16]; int unit_pages; size_t pages_size; struct page **pages; int unit, i, j, rc = 0; int upa; int nr_g0_units; snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL); if (IS_ERR(ai)) return PTR_ERR(ai); BUG_ON(ai->nr_groups != 1); upa = ai->alloc_size/ai->unit_size; nr_g0_units = roundup(num_possible_cpus(), upa); if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) { pcpu_free_alloc_info(ai); return -EINVAL; } unit_pages = ai->unit_size >> PAGE_SHIFT; /* unaligned allocations can't be freed, round up to page size */ pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * sizeof(pages[0])); pages = memblock_alloc_or_panic(pages_size, SMP_CACHE_BYTES); /* allocate pages */ j = 0; for (unit = 0; unit < num_possible_cpus(); unit++) { unsigned int cpu = ai->groups[0].cpu_map[unit]; for (i = 0; i < unit_pages; i++) { void *ptr; ptr = pcpu_fc_alloc(cpu, PAGE_SIZE, PAGE_SIZE, cpu_to_nd_fn); if (!ptr) { pr_warn("failed to allocate %s page for cpu%u\n", psize_str, cpu); goto enomem; } /* kmemleak tracks the percpu allocations separately */ kmemleak_ignore_phys(__pa(ptr)); pages[j++] = virt_to_page(ptr); } } /* allocate vm area, map the pages and copy static data */ vm.flags = VM_ALLOC; vm.size = num_possible_cpus() * ai->unit_size; vm_area_register_early(&vm, PAGE_SIZE); for (unit = 0; unit < num_possible_cpus(); unit++) { unsigned long unit_addr = (unsigned long)vm.addr + unit * ai->unit_size; for (i = 0; i < unit_pages; i++) pcpu_populate_pte(unit_addr + (i << PAGE_SHIFT)); /* pte already populated, the following shouldn't fail */ rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], unit_pages); if (rc < 0) panic("failed to map percpu area, err=%d\n", rc); flush_cache_vmap_early(unit_addr, unit_addr + ai->unit_size); /* copy static data */ memcpy((void *)unit_addr, __per_cpu_start, ai->static_size); } /* we're ready, commit */ pr_info("%d %s pages/cpu s%zu r%zu d%zu\n", unit_pages, psize_str, ai->static_size, ai->reserved_size, ai->dyn_size); pcpu_setup_first_chunk(ai, vm.addr); goto out_free_ar; enomem: while (--j >= 0) pcpu_fc_free(page_address(pages[j]), PAGE_SIZE); rc = -ENOMEM; out_free_ar: memblock_free(pages, pages_size); pcpu_free_alloc_info(ai); return rc; } #endif /* BUILD_PAGE_FIRST_CHUNK */ #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA /* * Generic SMP percpu area setup. * * The embedding helper is used because its behavior closely resembles * the original non-dynamic generic percpu area setup. This is * important because many archs have addressing restrictions and might * fail if the percpu area is located far away from the previous * location. As an added bonus, in non-NUMA cases, embedding is * generally a good idea TLB-wise because percpu area can piggy back * on the physical linear memory mapping which uses large page * mappings on applicable archs. */ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); void __init setup_per_cpu_areas(void) { unsigned long delta; unsigned int cpu; int rc; /* * Always reserve area for module percpu variables. That's * what the legacy allocator did. 
*/ rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, NULL); if (rc < 0) panic("Failed to initialize percpu areas."); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; } #endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ #else /* CONFIG_SMP */ /* * UP percpu area setup. * * UP always uses km-based percpu allocator with identity mapping. * Static percpu variables are indistinguishable from the usual static * variables and don't require any special preparation. */ void __init setup_per_cpu_areas(void) { const size_t unit_size = roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE, PERCPU_DYNAMIC_RESERVE)); struct pcpu_alloc_info *ai; void *fc; ai = pcpu_alloc_alloc_info(1, 1); fc = memblock_alloc_from(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); if (!ai || !fc) panic("Failed to allocate memory for percpu areas."); /* kmemleak tracks the percpu allocations separately */ kmemleak_ignore_phys(__pa(fc)); ai->dyn_size = unit_size; ai->unit_size = unit_size; ai->atom_size = unit_size; ai->alloc_size = unit_size; ai->groups[0].nr_units = 1; ai->groups[0].cpu_map[0] = 0; pcpu_setup_first_chunk(ai, fc); pcpu_free_alloc_info(ai); } #endif /* CONFIG_SMP */ /* * pcpu_nr_pages - calculate total number of populated backing pages * * This reflects the number of pages populated to back chunks. Metadata is * excluded in the number exposed in meminfo as the number of backing pages * scales with the number of cpus and can quickly outweigh the memory used for * metadata. It also keeps this calculation nice and simple. * * RETURNS: * Total number of populated backing pages in use by the allocator. */ unsigned long pcpu_nr_pages(void) { return data_race(READ_ONCE(pcpu_nr_populated)) * pcpu_nr_units; } /* * Percpu allocator is initialized early during boot when neither slab or * workqueue is available. Plug async management until everything is up * and running. */ static int __init percpu_enable_async(void) { pcpu_async_enabled = true; return 0; } subsys_initcall(percpu_enable_async); |
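/*
 * Minimal userspace sketch (not part of the kernel tree) of the
 * units-per-allocation search performed by pcpu_build_alloc_info() above:
 * a candidate upa must divide alloc_size into page-aligned unit sizes,
 * the wastage from rounding each group up to a multiple of upa must stay
 * at or below a third of the CPUs, and among the survivors the candidate
 * needing the fewest allocations wins.  choose_upa(), the PAGE_SIZE value
 * and the group counts below are made-up illustration values.
 */
#include <stdio.h>
#include <limits.h>

#define PAGE_SIZE 4096UL

static int div_round_up(int a, int b) { return (a + b - 1) / b; }

static int choose_upa(unsigned long alloc_size, const int *group_cnt,
		      int nr_groups, int nr_cpus, int max_upa)
{
	int last_allocs = INT_MAX, best_upa = 0;
	int upa, group;

	for (upa = max_upa; upa; upa--) {
		int allocs = 0, wasted = 0;

		/* unit size must be a whole, page-aligned share of alloc_size */
		if (alloc_size % upa || (alloc_size / upa) % PAGE_SIZE)
			continue;

		for (group = 0; group < nr_groups; group++) {
			int this_allocs = div_round_up(group_cnt[group], upa);

			allocs += this_allocs;
			wasted += this_allocs * upa - group_cnt[group];
		}

		/* reject candidates wasting more than 1/3 of the units */
		if (wasted > nr_cpus / 3)
			continue;

		/* stop once more allocations (more memory) would be needed */
		if (allocs > last_allocs)
			break;

		last_allocs = allocs;
		best_upa = upa;
	}
	return best_upa;
}

int main(void)
{
	int group_cnt[] = { 6, 2 };		/* e.g. 6 CPUs on node0, 2 on node1 */
	unsigned long alloc_size = 2UL << 20;	/* hypothetical 2 MiB atom */
	int upa = choose_upa(alloc_size, group_cnt, 2, 8, 16);

	if (upa)
		printf("best upa %d -> unit_size %lu\n", upa, alloc_size / upa);
	return 0;
}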
| 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 | // SPDX-License-Identifier: GPL-2.0-or-later /* Key garbage collector * * Copyright (C) 2009-2011 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/slab.h> #include <linux/security.h> #include <keys/keyring-type.h> #include "internal.h" /* * Delay between key revocation/expiry in seconds */ unsigned key_gc_delay = 5 * 60; /* * Reaper for unused keys. */ static void key_garbage_collector(struct work_struct *work); DECLARE_WORK(key_gc_work, key_garbage_collector); /* * Reaper for links from keyrings to dead keys. */ static void key_gc_timer_func(struct timer_list *); static DEFINE_TIMER(key_gc_timer, key_gc_timer_func); static time64_t key_gc_next_run = TIME64_MAX; static struct key_type *key_gc_dead_keytype; static unsigned long key_gc_flags; #define KEY_GC_KEY_EXPIRED 0 /* A key expired and needs unlinking */ #define KEY_GC_REAP_KEYTYPE 1 /* A keytype is being unregistered */ #define KEY_GC_REAPING_KEYTYPE 2 /* Cleared when keytype reaped */ /* * Any key whose type gets unregistered will be re-typed to this if it can't be * immediately unlinked. */ struct key_type key_type_dead = { .name = ".dead", }; /* * Schedule a garbage collection run. * - time precision isn't particularly important */ void key_schedule_gc(time64_t gc_at) { unsigned long expires; time64_t now = ktime_get_real_seconds(); kenter("%lld", gc_at - now); if (gc_at <= now || test_bit(KEY_GC_REAP_KEYTYPE, &key_gc_flags)) { kdebug("IMMEDIATE"); schedule_work(&key_gc_work); } else if (gc_at < key_gc_next_run) { kdebug("DEFERRED"); key_gc_next_run = gc_at; expires = jiffies + (gc_at - now) * HZ; mod_timer(&key_gc_timer, expires); } } /* * Set the expiration time on a key. */ void key_set_expiry(struct key *key, time64_t expiry) { key->expiry = expiry; if (expiry != TIME64_MAX) { if (!(key->type->flags & KEY_TYPE_INSTANT_REAP)) expiry += key_gc_delay; key_schedule_gc(expiry); } } /* * Schedule a dead links collection run. 
*/ void key_schedule_gc_links(void) { set_bit(KEY_GC_KEY_EXPIRED, &key_gc_flags); schedule_work(&key_gc_work); } /* * Some key's cleanup time was met after it expired, so we need to get the * reaper to go through a cycle finding expired keys. */ static void key_gc_timer_func(struct timer_list *unused) { kenter(""); key_gc_next_run = TIME64_MAX; key_schedule_gc_links(); } /* * Reap keys of dead type. * * We use three flags to make sure we see three complete cycles of the garbage * collector: the first to mark keys of that type as being dead, the second to * collect dead links and the third to clean up the dead keys. We have to be * careful as there may already be a cycle in progress. * * The caller must be holding key_types_sem. */ void key_gc_keytype(struct key_type *ktype) { kenter("%s", ktype->name); key_gc_dead_keytype = ktype; set_bit(KEY_GC_REAPING_KEYTYPE, &key_gc_flags); smp_mb(); set_bit(KEY_GC_REAP_KEYTYPE, &key_gc_flags); kdebug("schedule"); schedule_work(&key_gc_work); kdebug("sleep"); wait_on_bit(&key_gc_flags, KEY_GC_REAPING_KEYTYPE, TASK_UNINTERRUPTIBLE); key_gc_dead_keytype = NULL; kleave(""); } /* * Garbage collect a list of unreferenced, detached keys */ static noinline void key_gc_unused_keys(struct list_head *keys) { while (!list_empty(keys)) { struct key *key = list_entry(keys->next, struct key, graveyard_link); short state = key->state; list_del(&key->graveyard_link); kdebug("- %u", key->serial); key_check(key); #ifdef CONFIG_KEY_NOTIFICATIONS remove_watch_list(key->watchers, key->serial); key->watchers = NULL; #endif /* Throw away the key data if the key is instantiated */ if (state == KEY_IS_POSITIVE && key->type->destroy) key->type->destroy(key); security_key_free(key); atomic_dec(&key->user->nkeys); if (state != KEY_IS_UNINSTANTIATED) atomic_dec(&key->user->nikeys); key_user_put(key->user); key_put_tag(key->domain_tag); kfree(key->description); memzero_explicit(key, sizeof(*key)); kmem_cache_free(key_jar, key); } } /* * Garbage collector for unused keys. * * This is done in process context so that we don't have to disable interrupts * all over the place. key_put() schedules this rather than trying to do the * cleanup itself, which means key_put() doesn't have to sleep. */ static void key_garbage_collector(struct work_struct *work) { static LIST_HEAD(graveyard); static u8 gc_state; /* Internal persistent state */ #define KEY_GC_REAP_AGAIN 0x01 /* - Need another cycle */ #define KEY_GC_REAPING_LINKS 0x02 /* - We need to reap links */ #define KEY_GC_REAPING_DEAD_1 0x10 /* - We need to mark dead keys */ #define KEY_GC_REAPING_DEAD_2 0x20 /* - We need to reap dead key links */ #define KEY_GC_REAPING_DEAD_3 0x40 /* - We need to reap dead keys */ #define KEY_GC_FOUND_DEAD_KEY 0x80 /* - We found at least one dead key */ struct rb_node *cursor; struct key *key; time64_t new_timer, limit, expiry; kenter("[%lx,%x]", key_gc_flags, gc_state); limit = ktime_get_real_seconds(); /* Work out what we're going to be doing in this pass */ gc_state &= KEY_GC_REAPING_DEAD_1 | KEY_GC_REAPING_DEAD_2; gc_state <<= 1; if (test_and_clear_bit(KEY_GC_KEY_EXPIRED, &key_gc_flags)) gc_state |= KEY_GC_REAPING_LINKS; if (test_and_clear_bit(KEY_GC_REAP_KEYTYPE, &key_gc_flags)) gc_state |= KEY_GC_REAPING_DEAD_1; kdebug("new pass %x", gc_state); new_timer = TIME64_MAX; /* As only this function is permitted to remove things from the key * serial tree, if cursor is non-NULL then it will always point to a * valid node in the tree - even if lock got dropped. 
*/ spin_lock(&key_serial_lock); cursor = rb_first(&key_serial_tree); continue_scanning: while (cursor) { key = rb_entry(cursor, struct key, serial_node); cursor = rb_next(cursor); if (!test_bit_acquire(KEY_FLAG_USER_ALIVE, &key->flags)) { /* Clobber key->user after final put seen. */ goto found_unreferenced_key; } if (unlikely(gc_state & KEY_GC_REAPING_DEAD_1)) { if (key->type == key_gc_dead_keytype) { gc_state |= KEY_GC_FOUND_DEAD_KEY; set_bit(KEY_FLAG_DEAD, &key->flags); key->perm = 0; goto skip_dead_key; } else if (key->type == &key_type_keyring && key->restrict_link) { goto found_restricted_keyring; } } expiry = key->expiry; if (expiry != TIME64_MAX) { if (!(key->type->flags & KEY_TYPE_INSTANT_REAP)) expiry += key_gc_delay; if (expiry > limit && expiry < new_timer) { kdebug("will expire %x in %lld", key_serial(key), key->expiry - limit); new_timer = key->expiry; } } if (unlikely(gc_state & KEY_GC_REAPING_DEAD_2)) if (key->type == key_gc_dead_keytype) gc_state |= KEY_GC_FOUND_DEAD_KEY; if ((gc_state & KEY_GC_REAPING_LINKS) || unlikely(gc_state & KEY_GC_REAPING_DEAD_2)) { if (key->type == &key_type_keyring) goto found_keyring; } if (unlikely(gc_state & KEY_GC_REAPING_DEAD_3)) if (key->type == key_gc_dead_keytype) goto destroy_dead_key; skip_dead_key: if (spin_is_contended(&key_serial_lock) || need_resched()) goto contended; } contended: spin_unlock(&key_serial_lock); maybe_resched: if (cursor) { cond_resched(); spin_lock(&key_serial_lock); goto continue_scanning; } /* We've completed the pass. Set the timer if we need to and queue a * new cycle if necessary. We keep executing cycles until we find one * where we didn't reap any keys. */ kdebug("pass complete"); if (new_timer != TIME64_MAX) { new_timer += key_gc_delay; key_schedule_gc(new_timer); } if (unlikely(gc_state & KEY_GC_REAPING_DEAD_2) || !list_empty(&graveyard)) { /* Make sure that all pending keyring payload destructions are * fulfilled and that people aren't now looking at dead or * dying keys that they don't have a reference upon or a link * to. */ kdebug("gc sync"); synchronize_rcu(); } if (!list_empty(&graveyard)) { kdebug("gc keys"); key_gc_unused_keys(&graveyard); } if (unlikely(gc_state & (KEY_GC_REAPING_DEAD_1 | KEY_GC_REAPING_DEAD_2))) { if (!(gc_state & KEY_GC_FOUND_DEAD_KEY)) { /* No remaining dead keys: short circuit the remaining * keytype reap cycles. */ kdebug("dead short"); gc_state &= ~(KEY_GC_REAPING_DEAD_1 | KEY_GC_REAPING_DEAD_2); gc_state |= KEY_GC_REAPING_DEAD_3; } else { gc_state |= KEY_GC_REAP_AGAIN; } } if (unlikely(gc_state & KEY_GC_REAPING_DEAD_3)) { kdebug("dead wake"); smp_mb(); clear_bit(KEY_GC_REAPING_KEYTYPE, &key_gc_flags); wake_up_bit(&key_gc_flags, KEY_GC_REAPING_KEYTYPE); } if (gc_state & KEY_GC_REAP_AGAIN) schedule_work(&key_gc_work); kleave(" [end %x]", gc_state); return; /* We found an unreferenced key - once we've removed it from the tree, * we can safely drop the lock. */ found_unreferenced_key: kdebug("unrefd key %d", key->serial); rb_erase(&key->serial_node, &key_serial_tree); spin_unlock(&key_serial_lock); list_add_tail(&key->graveyard_link, &graveyard); gc_state |= KEY_GC_REAP_AGAIN; goto maybe_resched; /* We found a restricted keyring and need to update the restriction if * it is associated with the dead key type. */ found_restricted_keyring: spin_unlock(&key_serial_lock); keyring_restriction_gc(key, key_gc_dead_keytype); goto maybe_resched; /* We found a keyring and we need to check the payload for links to * dead or expired keys. 
We don't flag another reap immediately as we * have to wait for the old payload to be destroyed by RCU before we * can reap the keys to which it refers. */ found_keyring: spin_unlock(&key_serial_lock); keyring_gc(key, limit); goto maybe_resched; /* We found a dead key that is still referenced. Reset its type and * destroy its payload with its semaphore held. */ destroy_dead_key: spin_unlock(&key_serial_lock); kdebug("destroy key %d", key->serial); down_write(&key->sem); key->type = &key_type_dead; if (key_gc_dead_keytype->destroy) key_gc_dead_keytype->destroy(key); memset(&key->payload, KEY_DESTROY, sizeof(key->payload)); up_write(&key->sem); goto maybe_resched; } |
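/*
 * Minimal userspace sketch (not from the kernel) of how the persistent
 * gc_state in key_garbage_collector() above walks a keytype unregistration
 * through three passes: the saved DEAD_1/DEAD_2 bits are shifted left at the
 * start of each pass, so "mark keys dead" becomes "reap dead links" becomes
 * "destroy dead keys" on consecutive cycles.  The plain three-iteration loop
 * here stands in for the re-queued work item, and the short-circuit taken
 * when no dead key is found is not modelled.
 */
#include <stdio.h>

#define KEY_GC_REAPING_DEAD_1	0x10
#define KEY_GC_REAPING_DEAD_2	0x20
#define KEY_GC_REAPING_DEAD_3	0x40

int main(void)
{
	unsigned int gc_state = 0;
	int reap_keytype = 1;	/* pretend key_gc_keytype() ran once */
	int pass;

	for (pass = 1; pass <= 3; pass++) {
		/* carry only the dead-keytype bits forward, one phase later */
		gc_state &= KEY_GC_REAPING_DEAD_1 | KEY_GC_REAPING_DEAD_2;
		gc_state <<= 1;

		if (reap_keytype) {
			gc_state |= KEY_GC_REAPING_DEAD_1;
			reap_keytype = 0;
		}

		printf("pass %d:%s%s%s\n", pass,
		       gc_state & KEY_GC_REAPING_DEAD_1 ? " mark-dead" : "",
		       gc_state & KEY_GC_REAPING_DEAD_2 ? " reap-dead-links" : "",
		       gc_state & KEY_GC_REAPING_DEAD_3 ? " destroy-dead" : "");
	}
	return 0;
}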
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Private definitions for the generic associative array implementation. * * See Documentation/core-api/assoc_array.rst for information. * * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _LINUX_ASSOC_ARRAY_PRIV_H #define _LINUX_ASSOC_ARRAY_PRIV_H #ifdef CONFIG_ASSOCIATIVE_ARRAY #include <linux/assoc_array.h> #define ASSOC_ARRAY_FAN_OUT 16 /* Number of slots per node */ #define ASSOC_ARRAY_FAN_MASK (ASSOC_ARRAY_FAN_OUT - 1) #define ASSOC_ARRAY_LEVEL_STEP (ilog2(ASSOC_ARRAY_FAN_OUT)) #define ASSOC_ARRAY_LEVEL_STEP_MASK (ASSOC_ARRAY_LEVEL_STEP - 1) #define ASSOC_ARRAY_KEY_CHUNK_MASK (ASSOC_ARRAY_KEY_CHUNK_SIZE - 1) #define ASSOC_ARRAY_KEY_CHUNK_SHIFT (ilog2(BITS_PER_LONG)) /* * Undefined type representing a pointer with type information in the bottom * two bits. */ struct assoc_array_ptr; /* * An N-way node in the tree. * * Each slot contains one of four things: * * (1) Nothing (NULL). * * (2) A leaf object (pointer types 0). * * (3) A next-level node (pointer type 1, subtype 0). * * (4) A shortcut (pointer type 1, subtype 1). * * The tree is optimised for search-by-ID, but permits reasonable iteration * also. * * The tree is navigated by constructing an index key consisting of an array of * segments, where each segment is ilog2(ASSOC_ARRAY_FAN_OUT) bits in size. * * The segments correspond to levels of the tree (the first segment is used at * level 0, the second at level 1, etc.). */ struct assoc_array_node { struct assoc_array_ptr *back_pointer; u8 parent_slot; struct assoc_array_ptr *slots[ASSOC_ARRAY_FAN_OUT]; unsigned long nr_leaves_on_branch; }; /* * A shortcut through the index space out to where a collection of nodes/leaves * with the same IDs live. */ struct assoc_array_shortcut { struct assoc_array_ptr *back_pointer; int parent_slot; int skip_to_level; struct assoc_array_ptr *next_node; unsigned long index_key[]; }; /* * Preallocation cache. */ struct assoc_array_edit { struct rcu_head rcu; struct assoc_array *array; const struct assoc_array_ops *ops; const struct assoc_array_ops *ops_for_excised_subtree; struct assoc_array_ptr *leaf; struct assoc_array_ptr **leaf_p; struct assoc_array_ptr *dead_leaf; struct assoc_array_ptr *new_meta[3]; struct assoc_array_ptr *excised_meta[1]; struct assoc_array_ptr *excised_subtree; struct assoc_array_ptr **set_backpointers[ASSOC_ARRAY_FAN_OUT]; struct assoc_array_ptr *set_backpointers_to; struct assoc_array_node *adjust_count_on; long adjust_count_by; struct { struct assoc_array_ptr **ptr; struct assoc_array_ptr *to; } set[2]; struct { u8 *p; u8 to; } set_parent_slot[1]; u8 segment_cache[ASSOC_ARRAY_FAN_OUT + 1]; }; /* * Internal tree member pointers are marked in the bottom one or two bits to * indicate what type they are so that we don't have to look behind every * pointer to see what it points to. 
* * We provide functions to test type annotations and to create and translate * the annotated pointers. */ #define ASSOC_ARRAY_PTR_TYPE_MASK 0x1UL #define ASSOC_ARRAY_PTR_LEAF_TYPE 0x0UL /* Points to leaf (or nowhere) */ #define ASSOC_ARRAY_PTR_META_TYPE 0x1UL /* Points to node or shortcut */ #define ASSOC_ARRAY_PTR_SUBTYPE_MASK 0x2UL #define ASSOC_ARRAY_PTR_NODE_SUBTYPE 0x0UL #define ASSOC_ARRAY_PTR_SHORTCUT_SUBTYPE 0x2UL static inline bool assoc_array_ptr_is_meta(const struct assoc_array_ptr *x) { return (unsigned long)x & ASSOC_ARRAY_PTR_TYPE_MASK; } static inline bool assoc_array_ptr_is_leaf(const struct assoc_array_ptr *x) { return !assoc_array_ptr_is_meta(x); } static inline bool assoc_array_ptr_is_shortcut(const struct assoc_array_ptr *x) { return (unsigned long)x & ASSOC_ARRAY_PTR_SUBTYPE_MASK; } static inline bool assoc_array_ptr_is_node(const struct assoc_array_ptr *x) { return !assoc_array_ptr_is_shortcut(x); } static inline void *assoc_array_ptr_to_leaf(const struct assoc_array_ptr *x) { return (void *)((unsigned long)x & ~ASSOC_ARRAY_PTR_TYPE_MASK); } static inline unsigned long __assoc_array_ptr_to_meta(const struct assoc_array_ptr *x) { return (unsigned long)x & ~(ASSOC_ARRAY_PTR_SUBTYPE_MASK | ASSOC_ARRAY_PTR_TYPE_MASK); } static inline struct assoc_array_node *assoc_array_ptr_to_node(const struct assoc_array_ptr *x) { return (struct assoc_array_node *)__assoc_array_ptr_to_meta(x); } static inline struct assoc_array_shortcut *assoc_array_ptr_to_shortcut(const struct assoc_array_ptr *x) { return (struct assoc_array_shortcut *)__assoc_array_ptr_to_meta(x); } static inline struct assoc_array_ptr *__assoc_array_x_to_ptr(const void *p, unsigned long t) { return (struct assoc_array_ptr *)((unsigned long)p | t); } static inline struct assoc_array_ptr *assoc_array_leaf_to_ptr(const void *p) { return __assoc_array_x_to_ptr(p, ASSOC_ARRAY_PTR_LEAF_TYPE); } static inline struct assoc_array_ptr *assoc_array_node_to_ptr(const struct assoc_array_node *p) { return __assoc_array_x_to_ptr( p, ASSOC_ARRAY_PTR_META_TYPE | ASSOC_ARRAY_PTR_NODE_SUBTYPE); } static inline struct assoc_array_ptr *assoc_array_shortcut_to_ptr(const struct assoc_array_shortcut *p) { return __assoc_array_x_to_ptr( p, ASSOC_ARRAY_PTR_META_TYPE | ASSOC_ARRAY_PTR_SHORTCUT_SUBTYPE); } #endif /* CONFIG_ASSOCIATIVE_ARRAY */ #endif /* _LINUX_ASSOC_ARRAY_PRIV_H */ |
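/*
 * Minimal standalone sketch (not from the kernel) of the pointer tagging
 * scheme used by the helpers above: bit 0 distinguishes leaf pointers from
 * metadata pointers, and for metadata bit 1 distinguishes nodes from
 * shortcuts, so an object's type can be read without dereferencing it.
 * Objects only need to be at least 4-byte aligned for the two low bits to
 * be free.  The struct and macro names here are simplified stand-ins for
 * the kernel's assoc_array types.
 */
#include <stdio.h>
#include <stdint.h>

#define PTR_TYPE_MASK		0x1UL
#define PTR_META_TYPE		0x1UL	/* node or shortcut */
#define PTR_SUBTYPE_MASK	0x2UL
#define PTR_SHORTCUT_SUBTYPE	0x2UL

struct node	{ int nr_leaves; };
struct shortcut	{ int skip_to_level; };

static void *tag(const void *p, unsigned long t)
{
	return (void *)((uintptr_t)p | t);
}

static void *untag(const void *x)
{
	return (void *)((uintptr_t)x & ~(PTR_TYPE_MASK | PTR_SUBTYPE_MASK));
}

static void classify(const void *x)
{
	uintptr_t v = (uintptr_t)x;

	if (!(v & PTR_TYPE_MASK))
		printf("leaf     -> object %p\n", untag(x));
	else if (v & PTR_SUBTYPE_MASK)
		printf("shortcut -> object %p\n", untag(x));
	else
		printf("node     -> object %p\n", untag(x));
}

int main(void)
{
	static struct node n;		/* suitably aligned objects */
	static struct shortcut s;
	static int leaf_obj;

	classify(tag(&leaf_obj, 0));				/* leaf */
	classify(tag(&n, PTR_META_TYPE));			/* node */
	classify(tag(&s, PTR_META_TYPE | PTR_SHORTCUT_SUBTYPE));/* shortcut */
	return 0;
}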
| 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ADDRCONF_H #define _ADDRCONF_H #define MAX_RTR_SOLICITATIONS -1 /* unlimited */ #define RTR_SOLICITATION_INTERVAL (4*HZ) #define RTR_SOLICITATION_MAX_INTERVAL (3600*HZ) /* 1 hour */ #define MIN_VALID_LIFETIME (2*3600) /* 2 hours */ /* TEMP_VALID_LIFETIME default value as specified in RFC 8981 3.8 */ #define TEMP_VALID_LIFETIME (2*86400) /* 2 days */ #define TEMP_PREFERRED_LIFETIME (86400) /* 24 hours */ #define REGEN_MIN_ADVANCE (2) /* 2 seconds */ #define REGEN_MAX_RETRY (3) #define MAX_DESYNC_FACTOR (600) #define ADDR_CHECK_FREQUENCY (120*HZ) #define IPV6_MAX_ADDRESSES 16 #define ADDRCONF_TIMER_FUZZ_MINUS (HZ > 50 ? 
HZ / 50 : 1) #define ADDRCONF_TIMER_FUZZ (HZ / 4) #define ADDRCONF_TIMER_FUZZ_MAX (HZ) #define ADDRCONF_NOTIFY_PRIORITY 0 #include <linux/in.h> #include <linux/in6.h> struct prefix_info { __u8 type; __u8 length; __u8 prefix_len; union __packed { __u8 flags; struct __packed { #if defined(__BIG_ENDIAN_BITFIELD) __u8 onlink : 1, autoconf : 1, routeraddr : 1, preferpd : 1, reserved : 4; #elif defined(__LITTLE_ENDIAN_BITFIELD) __u8 reserved : 4, preferpd : 1, routeraddr : 1, autoconf : 1, onlink : 1; #else #error "Please fix <asm/byteorder.h>" #endif }; }; __be32 valid; __be32 prefered; __be32 reserved2; struct in6_addr prefix; }; /* rfc4861 4.6.2: IPv6 PIO is 32 bytes in size */ static_assert(sizeof(struct prefix_info) == 32); #include <linux/ipv6.h> #include <linux/netdevice.h> #include <net/if_inet6.h> #include <net/ipv6.h> struct in6_validator_info { struct in6_addr i6vi_addr; struct inet6_dev *i6vi_dev; struct netlink_ext_ack *extack; }; struct ifa6_config { const struct in6_addr *pfx; unsigned int plen; u8 ifa_proto; const struct in6_addr *peer_pfx; u32 rt_priority; u32 ifa_flags; u32 preferred_lft; u32 valid_lft; u16 scope; }; enum addr_type_t { UNICAST_ADDR, MULTICAST_ADDR, ANYCAST_ADDR, }; struct inet6_fill_args { u32 portid; u32 seq; int event; unsigned int flags; int netnsid; int ifindex; enum addr_type_t type; bool force_rt_scope_universe; }; int addrconf_init(void); void addrconf_cleanup(void); int addrconf_add_ifaddr(struct net *net, void __user *arg); int addrconf_del_ifaddr(struct net *net, void __user *arg); int addrconf_set_dstaddr(struct net *net, void __user *arg); int ipv6_chk_addr(struct net *net, const struct in6_addr *addr, const struct net_device *dev, int strict); int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr, const struct net_device *dev, bool skip_dev_check, int strict, u32 banned_flags); #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr); #endif int ipv6_chk_rpl_srh_loop(struct net *net, const struct in6_addr *segs, unsigned char nsegs); bool ipv6_chk_custom_prefix(const struct in6_addr *addr, const unsigned int prefix_len, struct net_device *dev); int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev); struct net_device *ipv6_dev_find(struct net *net, const struct in6_addr *addr, struct net_device *dev); struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *addr, struct net_device *dev, int strict); int ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, const struct in6_addr *daddr, unsigned int srcprefs, struct in6_addr *saddr); int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, bool match_wildcard); bool inet_rcv_saddr_any(const struct sock *sk); void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr *addr, u32 flags); int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, const struct prefix_info *pinfo, struct inet6_dev *in6_dev, const struct in6_addr *addr, int addr_type, u32 addr_flags, bool sllao, bool tokenized, __u32 valid_lft, u32 prefered_lft); static inline void addrconf_addr_eui48_base(u8 *eui, const char *const addr) { memcpy(eui, addr, 3); eui[3] = 0xFF; eui[4] = 0xFE; memcpy(eui + 5, 
addr + 3, 3); } static inline void addrconf_addr_eui48(u8 *eui, const char *const addr) { addrconf_addr_eui48_base(eui, addr); eui[0] ^= 2; } static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev) { if (dev->addr_len != ETH_ALEN) return -1; /* * The zSeries OSA network cards can be shared among various * OS instances, but the OSA cards have only one MAC address. * This leads to duplicate address conflicts in conjunction * with IPv6 if more than one instance uses the same card. * * The driver for these cards can deliver a unique 16-bit * identifier for each instance sharing the same card. It is * placed instead of 0xFFFE in the interface identifier. The * "u" bit of the interface identifier is not inverted in this * case. Hence the resulting interface identifier has local * scope according to RFC2373. */ addrconf_addr_eui48_base(eui, dev->dev_addr); if (dev->dev_id) { eui[3] = (dev->dev_id >> 8) & 0xFF; eui[4] = dev->dev_id & 0xFF; } else { eui[0] ^= 2; } return 0; } #define INFINITY_LIFE_TIME 0xFFFFFFFF static inline unsigned long addrconf_timeout_fixup(u32 timeout, unsigned int unit) { if (timeout == INFINITY_LIFE_TIME) return ~0UL; /* * Avoid arithmetic overflow. * Assuming unit is constant and non-zero, this "if" statement * will go away on 64bit archs. */ if (0xfffffffe > LONG_MAX / unit && timeout > LONG_MAX / unit) return LONG_MAX / unit; return timeout; } static inline int addrconf_finite_timeout(unsigned long timeout) { return ~timeout; } /* * IPv6 Address Label subsystem (addrlabel.c) */ int ipv6_addr_label_init(void); void ipv6_addr_label_cleanup(void); int ipv6_addr_label_rtnl_register(void); u32 ipv6_addr_label(struct net *net, const struct in6_addr *addr, int type, int ifindex); /* * multicast prototypes (mcast.c) */ static inline bool ipv6_mc_may_pull(struct sk_buff *skb, unsigned int len) { if (skb_transport_offset(skb) + ipv6_transport_len(skb) < len) return false; return pskb_may_pull(skb, len); } int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr); int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); void __ipv6_sock_mc_close(struct sock *sk); void ipv6_sock_mc_close(struct sock *sk); bool inet6_mc_check(const struct sock *sk, const struct in6_addr *mc_addr, const struct in6_addr *src_addr); int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr); int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr); int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr); void ipv6_mc_up(struct inet6_dev *idev); void ipv6_mc_down(struct inet6_dev *idev); void ipv6_mc_unmap(struct inet6_dev *idev); void ipv6_mc_remap(struct inet6_dev *idev); void ipv6_mc_init_dev(struct inet6_dev *idev); void ipv6_mc_destroy_dev(struct inet6_dev *idev); int ipv6_mc_check_mld(struct sk_buff *skb); void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp); bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, const struct in6_addr *src_addr); void ipv6_mc_dad_complete(struct inet6_dev *idev); /* * identify MLD packets for MLD filter exceptions */ static inline bool ipv6_is_mld(struct sk_buff *skb, int nexthdr, int offset) { struct icmp6hdr *hdr; if (nexthdr != IPPROTO_ICMPV6 || !pskb_network_may_pull(skb, offset + sizeof(struct icmp6hdr))) return false; hdr = (struct icmp6hdr *)(skb_network_header(skb) + offset); switch (hdr->icmp6_type) { case ICMPV6_MGM_QUERY: case ICMPV6_MGM_REPORT: case ICMPV6_MGM_REDUCTION: case ICMPV6_MLD2_REPORT: 
return true; default: break; } return false; } void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao); /* * anycast prototypes (anycast.c) */ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr); int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); void __ipv6_sock_ac_close(struct sock *sk); void ipv6_sock_ac_close(struct sock *sk); int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr); int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr); void ipv6_ac_destroy_dev(struct inet6_dev *idev); bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, const struct in6_addr *addr); bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev, const struct in6_addr *addr); int ipv6_anycast_init(void); void ipv6_anycast_cleanup(void); /* Device notifier */ int register_inet6addr_notifier(struct notifier_block *nb); int unregister_inet6addr_notifier(struct notifier_block *nb); int inet6addr_notifier_call_chain(unsigned long val, void *v); int register_inet6addr_validator_notifier(struct notifier_block *nb); int unregister_inet6addr_validator_notifier(struct notifier_block *nb); int inet6addr_validator_notifier_call_chain(unsigned long val, void *v); void inet6_netconf_notify_devconf(struct net *net, int event, int type, int ifindex, struct ipv6_devconf *devconf); /** * __in6_dev_get - get inet6_dev pointer from netdevice * @dev: network device * * Caller must hold rcu_read_lock or RTNL, because this function * does not take a reference on the inet6_dev. */ static inline struct inet6_dev *__in6_dev_get(const struct net_device *dev) { return rcu_dereference_rtnl(dev->ip6_ptr); } static inline struct inet6_dev *in6_dev_rcu(const struct net_device *dev) { return rcu_dereference(dev->ip6_ptr); } static inline struct inet6_dev *__in6_dev_get_rtnl_net(const struct net_device *dev) { return rtnl_net_dereference(dev_net(dev), dev->ip6_ptr); } /** * __in6_dev_stats_get - get inet6_dev pointer for stats * @dev: network device * @skb: skb for original incoming interface if needed * * Caller must hold rcu_read_lock or RTNL, because this function * does not take a reference on the inet6_dev. */ static inline struct inet6_dev *__in6_dev_stats_get(const struct net_device *dev, const struct sk_buff *skb) { if (netif_is_l3_master(dev)) dev = dev_get_by_index_rcu(dev_net(dev), inet6_iif(skb)); return __in6_dev_get(dev); } /** * __in6_dev_get_safely - get inet6_dev pointer from netdevice * @dev: network device * * This is a safer version of __in6_dev_get */ static inline struct inet6_dev *__in6_dev_get_safely(const struct net_device *dev) { if (likely(dev)) return rcu_dereference_rtnl(dev->ip6_ptr); else return NULL; } /** * in6_dev_get - get inet6_dev pointer from netdevice * @dev: network device * * This version can be used in any context, and takes a reference * on the inet6_dev. Callers must use in6_dev_put() later to * release this reference. */ static inline struct inet6_dev *in6_dev_get(const struct net_device *dev) { struct inet6_dev *idev; rcu_read_lock(); idev = rcu_dereference(dev->ip6_ptr); if (idev) refcount_inc(&idev->refcnt); rcu_read_unlock(); return idev; } static inline struct neigh_parms *__in6_dev_nd_parms_get_rcu(const struct net_device *dev) { struct inet6_dev *idev = __in6_dev_get(dev); return idev ? 
idev->nd_parms : NULL; } void in6_dev_finish_destroy(struct inet6_dev *idev); static inline void in6_dev_put(struct inet6_dev *idev) { if (refcount_dec_and_test(&idev->refcnt)) in6_dev_finish_destroy(idev); } static inline void in6_dev_put_clear(struct inet6_dev **pidev) { struct inet6_dev *idev = *pidev; if (idev) { in6_dev_put(idev); *pidev = NULL; } } static inline void __in6_dev_put(struct inet6_dev *idev) { refcount_dec(&idev->refcnt); } static inline void in6_dev_hold(struct inet6_dev *idev) { refcount_inc(&idev->refcnt); } /* called with rcu_read_lock held */ static inline bool ip6_ignore_linkdown(const struct net_device *dev) { const struct inet6_dev *idev = __in6_dev_get(dev); if (unlikely(!idev)) return true; return !!READ_ONCE(idev->cnf.ignore_routes_with_linkdown); } void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp); static inline void in6_ifa_put(struct inet6_ifaddr *ifp) { if (refcount_dec_and_test(&ifp->refcnt)) inet6_ifa_finish_destroy(ifp); } static inline void __in6_ifa_put(struct inet6_ifaddr *ifp) { refcount_dec(&ifp->refcnt); } static inline void in6_ifa_hold(struct inet6_ifaddr *ifp) { refcount_inc(&ifp->refcnt); } static inline bool in6_ifa_hold_safe(struct inet6_ifaddr *ifp) { return refcount_inc_not_zero(&ifp->refcnt); } /* * compute link-local solicited-node multicast address */ static inline void addrconf_addr_solict_mult(const struct in6_addr *addr, struct in6_addr *solicited) { ipv6_addr_set(solicited, htonl(0xFF020000), 0, htonl(0x1), htonl(0xFF000000) | addr->s6_addr32[3]); } static inline bool ipv6_addr_is_ll_all_nodes(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 __be64 *p = (__force __be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | (p[1] ^ cpu_to_be64(1))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | addr->s6_addr32[1] | addr->s6_addr32[2] | (addr->s6_addr32[3] ^ htonl(0x00000001))) == 0; #endif } static inline bool ipv6_addr_is_ll_all_routers(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 __be64 *p = (__force __be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | (p[1] ^ cpu_to_be64(2))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | addr->s6_addr32[1] | addr->s6_addr32[2] | (addr->s6_addr32[3] ^ htonl(0x00000002))) == 0; #endif } static inline bool ipv6_addr_is_isatap(const struct in6_addr *addr) { return (addr->s6_addr32[2] | htonl(0x02000000)) == htonl(0x02005EFE); } static inline bool ipv6_addr_is_solict_mult(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 __be64 *p = (__force __be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | ((p[1] ^ cpu_to_be64(0x00000001ff000000UL)) & cpu_to_be64(0xffffffffff000000UL))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | addr->s6_addr32[1] | (addr->s6_addr32[2] ^ htonl(0x00000001)) | (addr->s6_addr[12] ^ 0xff)) == 0; #endif } static inline bool ipv6_addr_is_all_snoopers(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 __be64 *p = (__force __be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | (p[1] ^ cpu_to_be64(0x6a))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | addr->s6_addr32[1] | addr->s6_addr32[2] | (addr->s6_addr32[3] ^ htonl(0x0000006a))) == 0; #endif } #ifdef CONFIG_PROC_FS int if6_proc_init(void); void if6_proc_exit(void); #endif int 
inet6_fill_ifmcaddr(struct sk_buff *skb, const struct ifmcaddr6 *ifmca, struct inet6_fill_args *args); int inet6_fill_ifacaddr(struct sk_buff *skb, const struct ifacaddr6 *ifaca, struct inet6_fill_args *args); #endif
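/*
 * Standalone userspace sketch (illustration only, not kernel code): mirrors
 * the mapping performed by addrconf_addr_solict_mult() above, deriving the
 * solicited-node multicast group ff02::1:ffXX:XXXX from the low 24 bits of a
 * unicast address. Uses only standard <netinet/in.h>/<arpa/inet.h>; the
 * helper name solicited_node_mcast() is illustrative, not a kernel API.
 */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

static void solicited_node_mcast(const struct in6_addr *addr,
				 struct in6_addr *solicited)
{
	/* ff02::1:ff00:0 prefix, then copy the last 24 bits of 'addr' */
	memset(solicited, 0, sizeof(*solicited));
	solicited->s6_addr[0]  = 0xff;
	solicited->s6_addr[1]  = 0x02;
	solicited->s6_addr[11] = 0x01;
	solicited->s6_addr[12] = 0xff;
	solicited->s6_addr[13] = addr->s6_addr[13];
	solicited->s6_addr[14] = addr->s6_addr[14];
	solicited->s6_addr[15] = addr->s6_addr[15];
}

int main(void)
{
	struct in6_addr uc, mc;
	char buf[INET6_ADDRSTRLEN];

	inet_pton(AF_INET6, "fe80::211:22ff:fe33:4455", &uc);
	solicited_node_mcast(&uc, &mc);
	/* Prints ff02::1:ff33:4455 */
	printf("%s\n", inet_ntop(AF_INET6, &mc, buf, sizeof(buf)));
	return 0;
}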
903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 | // SPDX-License-Identifier: GPL-2.0-only /* * Input driver to ExplorerPS/2 device driver module. * * Copyright (c) 1999-2002 Vojtech Pavlik * Copyright (c) 2004 Dmitry Torokhov */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define MOUSEDEV_MINOR_BASE 32 #define MOUSEDEV_MINORS 31 #define MOUSEDEV_MIX 63 #include <linux/bitops.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/module.h> #include <linux/init.h> #include <linux/input.h> #include <linux/random.h> #include <linux/major.h> #include <linux/device.h> #include <linux/cdev.h> #include <linux/kernel.h> MODULE_AUTHOR("Vojtech Pavlik <vojtech@ucw.cz>"); MODULE_DESCRIPTION("Mouse (ExplorerPS/2) device interfaces"); MODULE_LICENSE("GPL"); #ifndef CONFIG_INPUT_MOUSEDEV_SCREEN_X #define CONFIG_INPUT_MOUSEDEV_SCREEN_X 1024 #endif #ifndef CONFIG_INPUT_MOUSEDEV_SCREEN_Y #define CONFIG_INPUT_MOUSEDEV_SCREEN_Y 768 #endif static int xres = CONFIG_INPUT_MOUSEDEV_SCREEN_X; module_param(xres, uint, 0644); MODULE_PARM_DESC(xres, "Horizontal screen resolution"); static int yres = CONFIG_INPUT_MOUSEDEV_SCREEN_Y; module_param(yres, uint, 0644); MODULE_PARM_DESC(yres, "Vertical screen resolution"); static unsigned tap_time = 200; module_param(tap_time, uint, 0644); MODULE_PARM_DESC(tap_time, "Tap time for touchpads in absolute mode (msecs)"); struct mousedev_hw_data { int dx, dy, dz; int x, y; int abs_event; unsigned long buttons; }; struct mousedev { int open; struct input_handle handle; wait_queue_head_t wait; struct list_head client_list; spinlock_t client_lock; /* protects client_list */ struct mutex mutex; struct device dev; struct cdev cdev; bool exist; struct list_head mixdev_node; bool opened_by_mixdev; struct mousedev_hw_data packet; unsigned int pkt_count; int old_x[4], old_y[4]; int frac_dx, frac_dy; unsigned long touch; int (*open_device)(struct mousedev *mousedev); void (*close_device)(struct mousedev *mousedev); }; enum mousedev_emul { MOUSEDEV_EMUL_PS2, MOUSEDEV_EMUL_IMPS, MOUSEDEV_EMUL_EXPS }; struct mousedev_motion { int dx, dy, dz; unsigned long buttons; }; #define PACKET_QUEUE_LEN 16 struct mousedev_client { struct fasync_struct *fasync; struct mousedev *mousedev; struct list_head node; struct mousedev_motion packets[PACKET_QUEUE_LEN]; unsigned int head, tail; spinlock_t packet_lock; int pos_x, pos_y; u8 ps2[6]; unsigned char ready, buffer, bufsiz; unsigned char imexseq, impsseq; enum mousedev_emul mode; unsigned long last_buttons; }; #define MOUSEDEV_SEQ_LEN 6 static unsigned char 
mousedev_imps_seq[] = { 0xf3, 200, 0xf3, 100, 0xf3, 80 }; static unsigned char mousedev_imex_seq[] = { 0xf3, 200, 0xf3, 200, 0xf3, 80 }; static struct mousedev *mousedev_mix; static LIST_HEAD(mousedev_mix_list); #define fx(i) (mousedev->old_x[(mousedev->pkt_count - (i)) & 03]) #define fy(i) (mousedev->old_y[(mousedev->pkt_count - (i)) & 03]) static void mousedev_touchpad_event(struct input_dev *dev, struct mousedev *mousedev, unsigned int code, int value) { int size, tmp; enum { FRACTION_DENOM = 128 }; switch (code) { case ABS_X: fx(0) = value; if (mousedev->touch && mousedev->pkt_count >= 2) { size = input_abs_get_max(dev, ABS_X) - input_abs_get_min(dev, ABS_X); if (size == 0) size = 256 * 2; tmp = ((value - fx(2)) * 256 * FRACTION_DENOM) / size; tmp += mousedev->frac_dx; mousedev->packet.dx = tmp / FRACTION_DENOM; mousedev->frac_dx = tmp - mousedev->packet.dx * FRACTION_DENOM; } break; case ABS_Y: fy(0) = value; if (mousedev->touch && mousedev->pkt_count >= 2) { /* use X size for ABS_Y to keep the same scale */ size = input_abs_get_max(dev, ABS_X) - input_abs_get_min(dev, ABS_X); if (size == 0) size = 256 * 2; tmp = -((value - fy(2)) * 256 * FRACTION_DENOM) / size; tmp += mousedev->frac_dy; mousedev->packet.dy = tmp / FRACTION_DENOM; mousedev->frac_dy = tmp - mousedev->packet.dy * FRACTION_DENOM; } break; } } static void mousedev_abs_event(struct input_dev *dev, struct mousedev *mousedev, unsigned int code, int value) { int min, max, size; switch (code) { case ABS_X: min = input_abs_get_min(dev, ABS_X); max = input_abs_get_max(dev, ABS_X); size = max - min; if (size == 0) size = xres ? : 1; value = clamp(value, min, max); mousedev->packet.x = ((value - min) * xres) / size; mousedev->packet.abs_event = 1; break; case ABS_Y: min = input_abs_get_min(dev, ABS_Y); max = input_abs_get_max(dev, ABS_Y); size = max - min; if (size == 0) size = yres ? 
: 1; value = clamp(value, min, max); mousedev->packet.y = yres - ((value - min) * yres) / size; mousedev->packet.abs_event = 1; break; } } static void mousedev_rel_event(struct mousedev *mousedev, unsigned int code, int value) { switch (code) { case REL_X: mousedev->packet.dx += value; break; case REL_Y: mousedev->packet.dy -= value; break; case REL_WHEEL: mousedev->packet.dz -= value; break; } } static void mousedev_key_event(struct mousedev *mousedev, unsigned int code, int value) { int index; switch (code) { case BTN_TOUCH: case BTN_0: case BTN_LEFT: index = 0; break; case BTN_STYLUS: case BTN_1: case BTN_RIGHT: index = 1; break; case BTN_2: case BTN_FORWARD: case BTN_STYLUS2: case BTN_MIDDLE: index = 2; break; case BTN_3: case BTN_BACK: case BTN_SIDE: index = 3; break; case BTN_4: case BTN_EXTRA: index = 4; break; default: return; } if (value) { set_bit(index, &mousedev->packet.buttons); set_bit(index, &mousedev_mix->packet.buttons); } else { clear_bit(index, &mousedev->packet.buttons); clear_bit(index, &mousedev_mix->packet.buttons); } } static void mousedev_notify_readers(struct mousedev *mousedev, struct mousedev_hw_data *packet) { struct mousedev_client *client; struct mousedev_motion *p; unsigned int new_head; int wake_readers = 0; rcu_read_lock(); list_for_each_entry_rcu(client, &mousedev->client_list, node) { /* Just acquire the lock, interrupts already disabled */ spin_lock(&client->packet_lock); p = &client->packets[client->head]; if (client->ready && p->buttons != mousedev->packet.buttons) { new_head = (client->head + 1) % PACKET_QUEUE_LEN; if (new_head != client->tail) { p = &client->packets[client->head = new_head]; memset(p, 0, sizeof(struct mousedev_motion)); } } if (packet->abs_event) { p->dx += packet->x - client->pos_x; p->dy += packet->y - client->pos_y; client->pos_x = packet->x; client->pos_y = packet->y; } client->pos_x += packet->dx; client->pos_x = clamp_val(client->pos_x, 0, xres); client->pos_y += packet->dy; client->pos_y = clamp_val(client->pos_y, 0, yres); p->dx += packet->dx; p->dy += packet->dy; p->dz += packet->dz; p->buttons = mousedev->packet.buttons; if (p->dx || p->dy || p->dz || p->buttons != client->last_buttons) client->ready = 1; spin_unlock(&client->packet_lock); if (client->ready) { kill_fasync(&client->fasync, SIGIO, POLL_IN); wake_readers = 1; } } rcu_read_unlock(); if (wake_readers) wake_up_interruptible(&mousedev->wait); } static void mousedev_touchpad_touch(struct mousedev *mousedev, int value) { if (!value) { if (mousedev->touch && time_before(jiffies, mousedev->touch + msecs_to_jiffies(tap_time))) { /* * Toggle left button to emulate tap. * We rely on the fact that mousedev_mix always has 0 * motion packet so we won't mess current position. 
*/ set_bit(0, &mousedev->packet.buttons); set_bit(0, &mousedev_mix->packet.buttons); mousedev_notify_readers(mousedev, &mousedev_mix->packet); mousedev_notify_readers(mousedev_mix, &mousedev_mix->packet); clear_bit(0, &mousedev->packet.buttons); clear_bit(0, &mousedev_mix->packet.buttons); } mousedev->touch = mousedev->pkt_count = 0; mousedev->frac_dx = 0; mousedev->frac_dy = 0; } else if (!mousedev->touch) mousedev->touch = jiffies; } static void mousedev_event(struct input_handle *handle, unsigned int type, unsigned int code, int value) { struct mousedev *mousedev = handle->private; switch (type) { case EV_ABS: /* Ignore joysticks */ if (test_bit(BTN_TRIGGER, handle->dev->keybit)) return; if (test_bit(BTN_TOOL_FINGER, handle->dev->keybit)) mousedev_touchpad_event(handle->dev, mousedev, code, value); else mousedev_abs_event(handle->dev, mousedev, code, value); break; case EV_REL: mousedev_rel_event(mousedev, code, value); break; case EV_KEY: if (value != 2) { if (code == BTN_TOUCH && test_bit(BTN_TOOL_FINGER, handle->dev->keybit)) mousedev_touchpad_touch(mousedev, value); else mousedev_key_event(mousedev, code, value); } break; case EV_SYN: if (code == SYN_REPORT) { if (mousedev->touch) { mousedev->pkt_count++; /* * Input system eats duplicate events, * but we need all of them to do correct * averaging so apply present one forward */ fx(0) = fx(1); fy(0) = fy(1); } mousedev_notify_readers(mousedev, &mousedev->packet); mousedev_notify_readers(mousedev_mix, &mousedev->packet); mousedev->packet.dx = mousedev->packet.dy = mousedev->packet.dz = 0; mousedev->packet.abs_event = 0; } break; } } static int mousedev_fasync(int fd, struct file *file, int on) { struct mousedev_client *client = file->private_data; return fasync_helper(fd, file, on, &client->fasync); } static void mousedev_free(struct device *dev) { struct mousedev *mousedev = container_of(dev, struct mousedev, dev); input_put_device(mousedev->handle.dev); kfree(mousedev); } static int mousedev_open_device(struct mousedev *mousedev) { int retval; retval = mutex_lock_interruptible(&mousedev->mutex); if (retval) return retval; if (!mousedev->exist) retval = -ENODEV; else if (!mousedev->open++) { retval = input_open_device(&mousedev->handle); if (retval) mousedev->open--; } mutex_unlock(&mousedev->mutex); return retval; } static void mousedev_close_device(struct mousedev *mousedev) { mutex_lock(&mousedev->mutex); if (mousedev->exist && !--mousedev->open) input_close_device(&mousedev->handle); mutex_unlock(&mousedev->mutex); } /* * Open all available devices so they can all be multiplexed in one. * stream. Note that this function is called with mousedev_mix->mutex * held. */ static int mixdev_open_devices(struct mousedev *mixdev) { int error; error = mutex_lock_interruptible(&mixdev->mutex); if (error) return error; if (!mixdev->open++) { struct mousedev *mousedev; list_for_each_entry(mousedev, &mousedev_mix_list, mixdev_node) { if (!mousedev->opened_by_mixdev) { if (mousedev_open_device(mousedev)) continue; mousedev->opened_by_mixdev = true; } } } mutex_unlock(&mixdev->mutex); return 0; } /* * Close all devices that were opened as part of multiplexed * device. Note that this function is called with mousedev_mix->mutex * held. 
*/ static void mixdev_close_devices(struct mousedev *mixdev) { mutex_lock(&mixdev->mutex); if (!--mixdev->open) { struct mousedev *mousedev; list_for_each_entry(mousedev, &mousedev_mix_list, mixdev_node) { if (mousedev->opened_by_mixdev) { mousedev->opened_by_mixdev = false; mousedev_close_device(mousedev); } } } mutex_unlock(&mixdev->mutex); } static void mousedev_attach_client(struct mousedev *mousedev, struct mousedev_client *client) { spin_lock(&mousedev->client_lock); list_add_tail_rcu(&client->node, &mousedev->client_list); spin_unlock(&mousedev->client_lock); } static void mousedev_detach_client(struct mousedev *mousedev, struct mousedev_client *client) { spin_lock(&mousedev->client_lock); list_del_rcu(&client->node); spin_unlock(&mousedev->client_lock); synchronize_rcu(); } static int mousedev_release(struct inode *inode, struct file *file) { struct mousedev_client *client = file->private_data; struct mousedev *mousedev = client->mousedev; mousedev_detach_client(mousedev, client); kfree(client); mousedev->close_device(mousedev); return 0; } static int mousedev_open(struct inode *inode, struct file *file) { struct mousedev_client *client; struct mousedev *mousedev; int error; #ifdef CONFIG_INPUT_MOUSEDEV_PSAUX if (imajor(inode) == MISC_MAJOR) mousedev = mousedev_mix; else #endif mousedev = container_of(inode->i_cdev, struct mousedev, cdev); client = kzalloc_obj(struct mousedev_client); if (!client) return -ENOMEM; spin_lock_init(&client->packet_lock); client->pos_x = xres / 2; client->pos_y = yres / 2; client->mousedev = mousedev; mousedev_attach_client(mousedev, client); error = mousedev->open_device(mousedev); if (error) goto err_free_client; file->private_data = client; stream_open(inode, file); return 0; err_free_client: mousedev_detach_client(mousedev, client); kfree(client); return error; } static void mousedev_packet(struct mousedev_client *client, u8 *ps2_data) { struct mousedev_motion *p = &client->packets[client->tail]; s8 dx, dy, dz; dx = clamp_val(p->dx, -127, 127); p->dx -= dx; dy = clamp_val(p->dy, -127, 127); p->dy -= dy; ps2_data[0] = BIT(3); ps2_data[0] |= ((dx & BIT(7)) >> 3) | ((dy & BIT(7)) >> 2); ps2_data[0] |= p->buttons & 0x07; ps2_data[1] = dx; ps2_data[2] = dy; switch (client->mode) { case MOUSEDEV_EMUL_EXPS: dz = clamp_val(p->dz, -7, 7); p->dz -= dz; ps2_data[3] = (dz & 0x0f) | ((p->buttons & 0x18) << 1); client->bufsiz = 4; break; case MOUSEDEV_EMUL_IMPS: dz = clamp_val(p->dz, -127, 127); p->dz -= dz; ps2_data[0] |= ((p->buttons & 0x10) >> 3) | ((p->buttons & 0x08) >> 1); ps2_data[3] = dz; client->bufsiz = 4; break; case MOUSEDEV_EMUL_PS2: default: p->dz = 0; ps2_data[0] |= ((p->buttons & 0x10) >> 3) | ((p->buttons & 0x08) >> 1); client->bufsiz = 3; break; } if (!p->dx && !p->dy && !p->dz) { if (client->tail == client->head) { client->ready = 0; client->last_buttons = p->buttons; } else client->tail = (client->tail + 1) % PACKET_QUEUE_LEN; } } static void mousedev_generate_response(struct mousedev_client *client, int command) { client->ps2[0] = 0xfa; /* ACK */ switch (command) { case 0xeb: /* Poll */ mousedev_packet(client, &client->ps2[1]); client->bufsiz++; /* account for leading ACK */ break; case 0xf2: /* Get ID */ switch (client->mode) { case MOUSEDEV_EMUL_PS2: client->ps2[1] = 0; break; case MOUSEDEV_EMUL_IMPS: client->ps2[1] = 3; break; case MOUSEDEV_EMUL_EXPS: client->ps2[1] = 4; break; } client->bufsiz = 2; break; case 0xe9: /* Get info */ client->ps2[1] = 0x60; client->ps2[2] = 3; client->ps2[3] = 200; client->bufsiz = 4; break; case 0xff: /* 
Reset */ client->impsseq = client->imexseq = 0; client->mode = MOUSEDEV_EMUL_PS2; client->ps2[1] = 0xaa; client->ps2[2] = 0x00; client->bufsiz = 3; break; default: client->bufsiz = 1; break; } client->buffer = client->bufsiz; } static ssize_t mousedev_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct mousedev_client *client = file->private_data; unsigned char c; unsigned int i; for (i = 0; i < count; i++) { if (get_user(c, buffer + i)) return -EFAULT; spin_lock_irq(&client->packet_lock); if (c == mousedev_imex_seq[client->imexseq]) { if (++client->imexseq == MOUSEDEV_SEQ_LEN) { client->imexseq = 0; client->mode = MOUSEDEV_EMUL_EXPS; } } else client->imexseq = 0; if (c == mousedev_imps_seq[client->impsseq]) { if (++client->impsseq == MOUSEDEV_SEQ_LEN) { client->impsseq = 0; client->mode = MOUSEDEV_EMUL_IMPS; } } else client->impsseq = 0; mousedev_generate_response(client, c); spin_unlock_irq(&client->packet_lock); cond_resched(); } kill_fasync(&client->fasync, SIGIO, POLL_IN); wake_up_interruptible(&client->mousedev->wait); return count; } static ssize_t mousedev_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { struct mousedev_client *client = file->private_data; struct mousedev *mousedev = client->mousedev; u8 data[sizeof(client->ps2)]; int retval = 0; if (!client->ready && !client->buffer && mousedev->exist && (file->f_flags & O_NONBLOCK)) return -EAGAIN; retval = wait_event_interruptible(mousedev->wait, !mousedev->exist || client->ready || client->buffer); if (retval) return retval; if (!mousedev->exist) return -ENODEV; spin_lock_irq(&client->packet_lock); if (!client->buffer && client->ready) { mousedev_packet(client, client->ps2); client->buffer = client->bufsiz; } if (count > client->buffer) count = client->buffer; memcpy(data, client->ps2 + client->bufsiz - client->buffer, count); client->buffer -= count; spin_unlock_irq(&client->packet_lock); if (copy_to_user(buffer, data, count)) return -EFAULT; return count; } /* No kernel lock - fine */ static __poll_t mousedev_poll(struct file *file, poll_table *wait) { struct mousedev_client *client = file->private_data; struct mousedev *mousedev = client->mousedev; __poll_t mask; poll_wait(file, &mousedev->wait, wait); mask = mousedev->exist ? EPOLLOUT | EPOLLWRNORM : EPOLLHUP | EPOLLERR; if (client->ready || client->buffer) mask |= EPOLLIN | EPOLLRDNORM; return mask; } static const struct file_operations mousedev_fops = { .owner = THIS_MODULE, .read = mousedev_read, .write = mousedev_write, .poll = mousedev_poll, .open = mousedev_open, .release = mousedev_release, .fasync = mousedev_fasync, .llseek = noop_llseek, }; /* * Mark device non-existent. This disables writes, ioctls and * prevents new users from opening the device. Already posted * blocking reads will stay, however new ones will fail. */ static void mousedev_mark_dead(struct mousedev *mousedev) { mutex_lock(&mousedev->mutex); mousedev->exist = false; mutex_unlock(&mousedev->mutex); } /* * Wake up users waiting for IO so they can disconnect from * dead device. 
*/ static void mousedev_hangup(struct mousedev *mousedev) { struct mousedev_client *client; spin_lock(&mousedev->client_lock); list_for_each_entry(client, &mousedev->client_list, node) kill_fasync(&client->fasync, SIGIO, POLL_HUP); spin_unlock(&mousedev->client_lock); wake_up_interruptible(&mousedev->wait); } static void mousedev_cleanup(struct mousedev *mousedev) { struct input_handle *handle = &mousedev->handle; mousedev_mark_dead(mousedev); mousedev_hangup(mousedev); /* mousedev is marked dead so no one else accesses mousedev->open */ if (mousedev->open) input_close_device(handle); } static int mousedev_reserve_minor(bool mixdev) { int minor; if (mixdev) { minor = input_get_new_minor(MOUSEDEV_MIX, 1, false); if (minor < 0) pr_err("failed to reserve mixdev minor: %d\n", minor); } else { minor = input_get_new_minor(MOUSEDEV_MINOR_BASE, MOUSEDEV_MINORS, true); if (minor < 0) pr_err("failed to reserve new minor: %d\n", minor); } return minor; } static struct mousedev *mousedev_create(struct input_dev *dev, struct input_handler *handler, bool mixdev) { struct mousedev *mousedev; int minor; int error; minor = mousedev_reserve_minor(mixdev); if (minor < 0) { error = minor; goto err_out; } mousedev = kzalloc_obj(struct mousedev); if (!mousedev) { error = -ENOMEM; goto err_free_minor; } INIT_LIST_HEAD(&mousedev->client_list); INIT_LIST_HEAD(&mousedev->mixdev_node); spin_lock_init(&mousedev->client_lock); mutex_init(&mousedev->mutex); lockdep_set_subclass(&mousedev->mutex, mixdev ? SINGLE_DEPTH_NESTING : 0); init_waitqueue_head(&mousedev->wait); if (mixdev) { dev_set_name(&mousedev->dev, "mice"); mousedev->open_device = mixdev_open_devices; mousedev->close_device = mixdev_close_devices; } else { int dev_no = minor; /* Normalize device number if it falls into legacy range */ if (dev_no < MOUSEDEV_MINOR_BASE + MOUSEDEV_MINORS) dev_no -= MOUSEDEV_MINOR_BASE; dev_set_name(&mousedev->dev, "mouse%d", dev_no); mousedev->open_device = mousedev_open_device; mousedev->close_device = mousedev_close_device; } mousedev->exist = true; mousedev->handle.dev = input_get_device(dev); mousedev->handle.name = dev_name(&mousedev->dev); mousedev->handle.handler = handler; mousedev->handle.private = mousedev; mousedev->dev.class = &input_class; if (dev) mousedev->dev.parent = &dev->dev; mousedev->dev.devt = MKDEV(INPUT_MAJOR, minor); mousedev->dev.release = mousedev_free; device_initialize(&mousedev->dev); if (!mixdev) { error = input_register_handle(&mousedev->handle); if (error) goto err_free_mousedev; } cdev_init(&mousedev->cdev, &mousedev_fops); error = cdev_device_add(&mousedev->cdev, &mousedev->dev); if (error) goto err_cleanup_mousedev; return mousedev; err_cleanup_mousedev: mousedev_cleanup(mousedev); if (!mixdev) input_unregister_handle(&mousedev->handle); err_free_mousedev: put_device(&mousedev->dev); err_free_minor: input_free_minor(minor); err_out: return ERR_PTR(error); } static void mousedev_destroy(struct mousedev *mousedev) { cdev_device_del(&mousedev->cdev, &mousedev->dev); mousedev_cleanup(mousedev); input_free_minor(MINOR(mousedev->dev.devt)); if (mousedev != mousedev_mix) input_unregister_handle(&mousedev->handle); put_device(&mousedev->dev); } static int mixdev_add_device(struct mousedev *mousedev) { int retval; retval = mutex_lock_interruptible(&mousedev_mix->mutex); if (retval) return retval; if (mousedev_mix->open) { retval = mousedev_open_device(mousedev); if (retval) goto out; mousedev->opened_by_mixdev = true; } get_device(&mousedev->dev); list_add_tail(&mousedev->mixdev_node, 
&mousedev_mix_list); out: mutex_unlock(&mousedev_mix->mutex); return retval; } static void mixdev_remove_device(struct mousedev *mousedev) { mutex_lock(&mousedev_mix->mutex); if (mousedev->opened_by_mixdev) { mousedev->opened_by_mixdev = false; mousedev_close_device(mousedev); } list_del_init(&mousedev->mixdev_node); mutex_unlock(&mousedev_mix->mutex); put_device(&mousedev->dev); } static int mousedev_connect(struct input_handler *handler, struct input_dev *dev, const struct input_device_id *id) { struct mousedev *mousedev; int error; mousedev = mousedev_create(dev, handler, false); if (IS_ERR(mousedev)) return PTR_ERR(mousedev); error = mixdev_add_device(mousedev); if (error) { mousedev_destroy(mousedev); return error; } return 0; } static void mousedev_disconnect(struct input_handle *handle) { struct mousedev *mousedev = handle->private; mixdev_remove_device(mousedev); mousedev_destroy(mousedev); } static const struct input_device_id mousedev_ids[] = { { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT | INPUT_DEVICE_ID_MATCH_RELBIT, .evbit = { BIT_MASK(EV_KEY) | BIT_MASK(EV_REL) }, .keybit = { [BIT_WORD(BTN_LEFT)] = BIT_MASK(BTN_LEFT) }, .relbit = { BIT_MASK(REL_X) | BIT_MASK(REL_Y) }, }, /* A mouse like device, at least one button, two relative axes */ { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_RELBIT, .evbit = { BIT_MASK(EV_KEY) | BIT_MASK(EV_REL) }, .relbit = { BIT_MASK(REL_WHEEL) }, }, /* A separate scrollwheel */ { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, .evbit = { BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS) }, .keybit = { [BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH) }, .absbit = { BIT_MASK(ABS_X) | BIT_MASK(ABS_Y) }, }, /* A tablet like device, at least touch detection, two absolute axes */ { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, .evbit = { BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS) }, .keybit = { [BIT_WORD(BTN_TOOL_FINGER)] = BIT_MASK(BTN_TOOL_FINGER) }, .absbit = { BIT_MASK(ABS_X) | BIT_MASK(ABS_Y) | BIT_MASK(ABS_PRESSURE) | BIT_MASK(ABS_TOOL_WIDTH) }, }, /* A touchpad */ { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, .evbit = { BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS) }, .keybit = { [BIT_WORD(BTN_LEFT)] = BIT_MASK(BTN_LEFT) }, .absbit = { BIT_MASK(ABS_X) | BIT_MASK(ABS_Y) }, }, /* Mouse-like device with absolute X and Y but ordinary clicks, like hp ILO2 High Performance mouse */ { }, /* Terminating entry */ }; MODULE_DEVICE_TABLE(input, mousedev_ids); static struct input_handler mousedev_handler = { .event = mousedev_event, .connect = mousedev_connect, .disconnect = mousedev_disconnect, .legacy_minors = true, .minor = MOUSEDEV_MINOR_BASE, .name = "mousedev", .id_table = mousedev_ids, }; #ifdef CONFIG_INPUT_MOUSEDEV_PSAUX #include <linux/miscdevice.h> static struct miscdevice psaux_mouse = { .minor = PSMOUSE_MINOR, .name = "psaux", .fops = &mousedev_fops, }; static bool psaux_registered; static void __init mousedev_psaux_register(void) { int error; error = misc_register(&psaux_mouse); if (error) pr_warn("could not register psaux device, error: %d\n", error); else psaux_registered = true; } static void __exit mousedev_psaux_unregister(void) { if (psaux_registered) misc_deregister(&psaux_mouse); } #else static inline void mousedev_psaux_register(void) { } static inline void mousedev_psaux_unregister(void) { } #endif static int __init mousedev_init(void) { int error; mousedev_mix = 
mousedev_create(NULL, &mousedev_handler, true); if (IS_ERR(mousedev_mix)) return PTR_ERR(mousedev_mix); error = input_register_handler(&mousedev_handler); if (error) { mousedev_destroy(mousedev_mix); return error; } mousedev_psaux_register(); pr_info("PS/2 mouse device common for all mice\n"); return 0; } static void __exit mousedev_exit(void) { mousedev_psaux_unregister(); input_unregister_handler(&mousedev_handler); mousedev_destroy(mousedev_mix); } module_init(mousedev_init); module_exit(mousedev_exit);
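/*
 * Minimal userspace sketch (an illustration, not part of the driver): reads
 * the bare 3-byte packets that mousedev_packet() emits in MOUSEDEV_EMUL_PS2
 * mode from the mixdev node /dev/input/mice and prints button state and
 * relative motion. Needs read access to the device node; exact byte layout
 * follows the ps2_data[] construction in mousedev_packet() above.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	unsigned char pkt[3];
	int fd = open("/dev/input/mice", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/input/mice");
		return 1;
	}

	while (read(fd, pkt, sizeof(pkt)) == sizeof(pkt)) {
		/* byte 0: bit 3 always set, bits 0-2 = left/right/middle */
		int left   = pkt[0] & 0x01;
		int right  = pkt[0] & 0x02;
		int middle = pkt[0] & 0x04;
		/* bytes 1-2: dx/dy, clamped to [-127, 127] by the driver */
		signed char dx = (signed char)pkt[1];
		signed char dy = (signed char)pkt[2];

		printf("dx=%4d dy=%4d%s%s%s\n", dx, dy,
		       left ? " L" : "", middle ? " M" : "", right ? " R" : "");
	}
	close(fd);
	return 0;
}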
1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner <tglx@kernel.org> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * * NOHZ implementation for low and high resolution timers * * Started by: Thomas Gleixner and Ingo Molnar */ #include <linux/compiler.h> #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/percpu.h> #include <linux/nmi.h> #include <linux/profile.h> #include <linux/sched/signal.h> #include <linux/sched/clock.h> #include <linux/sched/stat.h> #include <linux/sched/nohz.h> #include <linux/sched/loadavg.h> #include <linux/module.h> #include <linux/irq_work.h> #include <linux/posix-timers.h> #include <linux/context_tracking.h> #include <linux/mm.h> #include <asm/irq_regs.h> #include "tick-internal.h" #include <trace/events/timer.h> /* * Per-CPU nohz control structure */ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); struct tick_sched *tick_get_tick_sched(int cpu) { return &per_cpu(tick_cpu_sched, cpu); } /* * The time when the last jiffy update happened. Write access must hold * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a * consistent view of jiffies and last_jiffies_update. */ static ktime_t last_jiffies_update; /* * Must be called with interrupts disabled ! */ static void tick_do_update_jiffies64(ktime_t now) { unsigned long ticks = 1; ktime_t delta, nextp; /* * 64-bit can do a quick check without holding the jiffies lock and * without looking at the sequence count. The smp_load_acquire() * pairs with the update done later in this function. * * 32-bit cannot do that because the store of 'tick_next_period' * consists of two 32-bit stores, and the first store could be * moved by the CPU to a random point in the future. */ if (IS_ENABLED(CONFIG_64BIT)) { if (ktime_before(now, smp_load_acquire(&tick_next_period))) return; } else { unsigned int seq; /* * Avoid contention on 'jiffies_lock' and protect the quick * check with the sequence count. */ do { seq = read_seqcount_begin(&jiffies_seq); nextp = tick_next_period; } while (read_seqcount_retry(&jiffies_seq, seq)); if (ktime_before(now, nextp)) return; } /* Quick check failed, i.e. update is required. */ raw_spin_lock(&jiffies_lock); /* * Re-evaluate with the lock held. Another CPU might have done the * update already. 
*/ if (ktime_before(now, tick_next_period)) { raw_spin_unlock(&jiffies_lock); return; } write_seqcount_begin(&jiffies_seq); delta = ktime_sub(now, tick_next_period); if (unlikely(delta >= TICK_NSEC)) { /* Slow path for long idle sleep times */ s64 incr = TICK_NSEC; ticks += ktime_divns(delta, incr); last_jiffies_update = ktime_add_ns(last_jiffies_update, incr * ticks); } else { last_jiffies_update = ktime_add_ns(last_jiffies_update, TICK_NSEC); } /* Advance jiffies to complete the 'jiffies_seq' protected job */ jiffies_64 += ticks; /* Keep the tick_next_period variable up to date */ nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC); if (IS_ENABLED(CONFIG_64BIT)) { /* * Pairs with smp_load_acquire() in the lockless quick * check above, and ensures that the update to 'jiffies_64' is * not reordered vs. the store to 'tick_next_period', neither * by the compiler nor by the CPU. */ smp_store_release(&tick_next_period, nextp); } else { /* * A plain store is good enough on 32-bit, as the quick check * above is protected by the sequence count. */ tick_next_period = nextp; } /* * Release the sequence count. calc_global_load() below is not * protected by it, but 'jiffies_lock' needs to be held to prevent * concurrent invocations. */ write_seqcount_end(&jiffies_seq); calc_global_load(); raw_spin_unlock(&jiffies_lock); update_wall_time(); } /* * Initialize and return retrieve the jiffies update. */ static ktime_t tick_init_jiffy_update(void) { ktime_t period; raw_spin_lock(&jiffies_lock); write_seqcount_begin(&jiffies_seq); /* Have we started the jiffies update yet ? */ if (last_jiffies_update == 0) { u32 rem; /* * Ensure that the tick is aligned to a multiple of * TICK_NSEC. */ div_u64_rem(tick_next_period, TICK_NSEC, &rem); if (rem) tick_next_period += TICK_NSEC - rem; last_jiffies_update = tick_next_period; } period = last_jiffies_update; write_seqcount_end(&jiffies_seq); raw_spin_unlock(&jiffies_lock); return period; } static inline int tick_sched_flag_test(struct tick_sched *ts, unsigned long flag) { return !!(ts->flags & flag); } static inline void tick_sched_flag_set(struct tick_sched *ts, unsigned long flag) { lockdep_assert_irqs_disabled(); ts->flags |= flag; } static inline void tick_sched_flag_clear(struct tick_sched *ts, unsigned long flag) { lockdep_assert_irqs_disabled(); ts->flags &= ~flag; } /* * Allow only one non-timekeeper CPU at a time update jiffies from * the timer tick. * * Returns true if update was run. */ static bool tick_limited_update_jiffies64(struct tick_sched *ts, ktime_t now) { static atomic_t in_progress; int inp; inp = atomic_read(&in_progress); if (inp || !atomic_try_cmpxchg(&in_progress, &inp, 1)) return false; if (ts->last_tick_jiffies == jiffies) tick_do_update_jiffies64(now); atomic_set(&in_progress, 0); return true; } #define MAX_STALLED_JIFFIES 5 static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) { int tick_cpu, cpu = smp_processor_id(); /* * Check if the do_timer duty was dropped. We don't care about * concurrency: This happens only when the CPU in charge went * into a long sleep. If two CPUs happen to assign themselves to * this duty, then the jiffies update is still serialized by * 'jiffies_lock'. * * If nohz_full is enabled, this should not happen because the * 'tick_do_timer_cpu' CPU never relinquishes. 
*/ tick_cpu = READ_ONCE(tick_do_timer_cpu); if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && unlikely(tick_cpu == TICK_DO_TIMER_NONE)) { #ifdef CONFIG_NO_HZ_FULL WARN_ON_ONCE(tick_nohz_full_running); #endif WRITE_ONCE(tick_do_timer_cpu, cpu); tick_cpu = cpu; } /* Check if jiffies need an update */ if (tick_cpu == cpu) tick_do_update_jiffies64(now); /* * If the jiffies update stalled for too long (timekeeper in stop_machine() * or VMEXIT'ed for several msecs), force an update. */ if (ts->last_tick_jiffies != jiffies) { ts->stalled_jiffies = 0; ts->last_tick_jiffies = READ_ONCE(jiffies); } else { if (++ts->stalled_jiffies >= MAX_STALLED_JIFFIES) { if (tick_limited_update_jiffies64(ts, now)) { ts->stalled_jiffies = 0; ts->last_tick_jiffies = READ_ONCE(jiffies); } } } if (tick_sched_flag_test(ts, TS_FLAG_INIDLE)) ts->got_idle_tick = 1; } static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) { /* * When we are idle and the tick is stopped, we have to touch * the watchdog as we might not schedule for a really long * time. This happens on completely idle SMP systems while * waiting on the login prompt. We also increment the "start of * idle" jiffy stamp so the idle accounting adjustment we do * when we go busy again does not account too many ticks. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { touch_softlockup_watchdog_sched(); if (is_idle_task(current)) ts->idle_jiffies++; /* * In case the current tick fired too early past its expected * expiration, make sure we don't bypass the next clock reprogramming * to the same deadline. */ ts->next_tick = 0; } update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); } /* * We rearm the timer until we get disabled by the idle code. * Called with interrupts disabled. */ static enum hrtimer_restart tick_nohz_handler(struct hrtimer *timer) { struct tick_sched *ts = container_of(timer, struct tick_sched, sched_timer); struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); tick_sched_do_timer(ts, now); /* * Do not call when we are not in IRQ context and have * no valid 'regs' pointer */ if (regs) tick_sched_handle(ts, regs); else ts->next_tick = 0; /* * In dynticks mode, tick reprogram is deferred: * - to the idle task if in dynticks-idle * - to IRQ exit if in full-dynticks. 
*/ if (unlikely(tick_sched_flag_test(ts, TS_FLAG_STOPPED))) return HRTIMER_NORESTART; hrtimer_forward(timer, now, TICK_NSEC); return HRTIMER_RESTART; } #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; EXPORT_SYMBOL_GPL(tick_nohz_full_mask); bool tick_nohz_full_running; EXPORT_SYMBOL_GPL(tick_nohz_full_running); static atomic_t tick_dep_mask; static bool check_tick_dependency(atomic_t *dep) { int val = atomic_read(dep); if (likely(!tracepoint_enabled(tick_stop))) return !!val; if (val & TICK_DEP_MASK_POSIX_TIMER) { trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); return true; } if (val & TICK_DEP_MASK_PERF_EVENTS) { trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS); return true; } if (val & TICK_DEP_MASK_SCHED) { trace_tick_stop(0, TICK_DEP_MASK_SCHED); return true; } if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) { trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE); return true; } if (val & TICK_DEP_MASK_RCU) { trace_tick_stop(0, TICK_DEP_MASK_RCU); return true; } if (val & TICK_DEP_MASK_RCU_EXP) { trace_tick_stop(0, TICK_DEP_MASK_RCU_EXP); return true; } return false; } static bool can_stop_full_tick(int cpu, struct tick_sched *ts) { lockdep_assert_irqs_disabled(); if (unlikely(!cpu_online(cpu))) return false; if (check_tick_dependency(&tick_dep_mask)) return false; if (check_tick_dependency(&ts->tick_dep_mask)) return false; if (check_tick_dependency(¤t->tick_dep_mask)) return false; if (check_tick_dependency(¤t->signal->tick_dep_mask)) return false; return true; } static void nohz_full_kick_func(struct irq_work *work) { /* Empty, the tick restart happens on tick_nohz_irq_exit() */ } static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = IRQ_WORK_INIT_HARD(nohz_full_kick_func); /* * Kick this CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), * is NMI safe. */ static void tick_nohz_full_kick(void) { if (!tick_nohz_full_cpu(smp_processor_id())) return; irq_work_queue(this_cpu_ptr(&nohz_full_kick_work)); } /* * Kick the CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. */ void tick_nohz_full_kick_cpu(int cpu) { if (!tick_nohz_full_cpu(cpu)) return; irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); } static void tick_nohz_kick_task(struct task_struct *tsk) { int cpu; /* * If the task is not running, run_posix_cpu_timers() * has nothing to elapse, and an IPI can then be optimized out. * * activate_task() STORE p->tick_dep_mask * STORE p->on_rq * __schedule() (switch to task 'p') smp_mb() (atomic_fetch_or()) * LOCK rq->lock LOAD p->on_rq * smp_mb__after_spin_lock() * tick_nohz_task_switch() * LOAD p->tick_dep_mask * * XXX given a task picks up the dependency on schedule(), should we * only care about tasks that are currently on the CPU instead of all * that are on the runqueue? * * That is, does this want to be: task_on_cpu() / task_curr()? */ if (!sched_task_on_rq(tsk)) return; /* * If the task concurrently migrates to another CPU, * we guarantee it sees the new tick dependency upon * schedule. 
* * set_task_cpu(p, cpu); * STORE p->cpu = @cpu * __schedule() (switch to task 'p') * LOCK rq->lock * smp_mb__after_spin_lock() STORE p->tick_dep_mask * tick_nohz_task_switch() smp_mb() (atomic_fetch_or()) * LOAD p->tick_dep_mask LOAD p->cpu */ cpu = task_cpu(tsk); preempt_disable(); if (cpu_online(cpu)) tick_nohz_full_kick_cpu(cpu); preempt_enable(); } /* * Kick all full dynticks CPUs in order to force these to re-evaluate * their dependency on the tick and restart it if necessary. */ static void tick_nohz_full_kick_all(void) { int cpu; if (!tick_nohz_full_running) return; preempt_disable(); for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask) tick_nohz_full_kick_cpu(cpu); preempt_enable(); } static void tick_nohz_dep_set_all(atomic_t *dep, enum tick_dep_bits bit) { int prev; prev = atomic_fetch_or(BIT(bit), dep); if (!prev) tick_nohz_full_kick_all(); } /* * Set a global tick dependency. Used by perf events that rely on freq and * unstable clocks. */ void tick_nohz_dep_set(enum tick_dep_bits bit) { tick_nohz_dep_set_all(&tick_dep_mask, bit); } void tick_nohz_dep_clear(enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &tick_dep_mask); } /* * Set per-CPU tick dependency. Used by scheduler and perf events in order to * manage event-throttling. */ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) { int prev; struct tick_sched *ts; ts = per_cpu_ptr(&tick_cpu_sched, cpu); prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask); if (!prev) { preempt_disable(); /* Perf needs local kick that is NMI safe */ if (cpu == smp_processor_id()) { tick_nohz_full_kick(); } else { /* Remote IRQ work not NMI-safe */ if (!WARN_ON_ONCE(in_nmi())) tick_nohz_full_kick_cpu(cpu); } preempt_enable(); } } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu); void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); atomic_andnot(BIT(bit), &ts->tick_dep_mask); } EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu); /* * Set a per-task tick dependency. RCU needs this. Also posix CPU timers * in order to elapse per task timers. */ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) { if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) tick_nohz_kick_task(tsk); } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task); void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &tsk->tick_dep_mask); } EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task); /* * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse * per process timers. */ void tick_nohz_dep_set_signal(struct task_struct *tsk, enum tick_dep_bits bit) { int prev; struct signal_struct *sig = tsk->signal; prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask); if (!prev) { struct task_struct *t; lockdep_assert_held(&tsk->sighand->siglock); __for_each_thread(sig, t) tick_nohz_kick_task(t); } } void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &sig->tick_dep_mask); } /* * Re-evaluate the need for the tick as we switch the current task. * It might need the tick due to per task/process properties: * perf events, posix CPU timers, ... 
*/ void __tick_nohz_task_switch(void) { struct tick_sched *ts; if (!tick_nohz_full_cpu(smp_processor_id())) return; ts = this_cpu_ptr(&tick_cpu_sched); if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { if (atomic_read(¤t->tick_dep_mask) || atomic_read(¤t->signal->tick_dep_mask)) tick_nohz_full_kick(); } } /* Get the boot-time nohz CPU list from the kernel parameters. */ void __init tick_nohz_full_setup(cpumask_var_t cpumask) { alloc_bootmem_cpumask_var(&tick_nohz_full_mask); cpumask_copy(tick_nohz_full_mask, cpumask); tick_nohz_full_running = true; } bool tick_nohz_cpu_hotpluggable(unsigned int cpu) { /* * The 'tick_do_timer_cpu' CPU handles housekeeping duty (unbound * timers, workqueues, timekeeping, ...) on behalf of full dynticks * CPUs. It must remain online when nohz full is enabled. */ if (tick_nohz_full_running && READ_ONCE(tick_do_timer_cpu) == cpu) return false; return true; } static int tick_nohz_cpu_down(unsigned int cpu) { return tick_nohz_cpu_hotpluggable(cpu) ? 0 : -EBUSY; } void __init tick_nohz_init(void) { int cpu, ret; if (!tick_nohz_full_running) return; /* * Full dynticks uses IRQ work to drive the tick rescheduling on safe * locking contexts. But then we need IRQ work to raise its own * interrupts to avoid circular dependency on the tick. */ if (!arch_irq_work_has_interrupt()) { pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support IRQ work self-IPIs\n"); cpumask_clear(tick_nohz_full_mask); tick_nohz_full_running = false; return; } if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) && !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) { cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { pr_warn("NO_HZ: Clearing %d from nohz_full range " "for timekeeping\n", cpu); cpumask_clear_cpu(cpu, tick_nohz_full_mask); } } for_each_cpu(cpu, tick_nohz_full_mask) ct_cpu_track_user(cpu); ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "kernel/nohz:predown", NULL, tick_nohz_cpu_down); WARN_ON(ret < 0); pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", cpumask_pr_args(tick_nohz_full_mask)); } #endif /* #ifdef CONFIG_NO_HZ_FULL */ /* * NOHZ - aka dynamic tick functionality */ #ifdef CONFIG_NO_HZ_COMMON /* * NO HZ enabled ? */ bool tick_nohz_enabled __read_mostly = true; static unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ static int __init setup_tick_nohz(char *str) { return (kstrtobool(str, &tick_nohz_enabled) == 0); } __setup("nohz=", setup_tick_nohz); bool tick_nohz_is_active(void) { return tick_nohz_active; } EXPORT_SYMBOL_GPL(tick_nohz_is_active); bool tick_nohz_tick_stopped(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); return tick_sched_flag_test(ts, TS_FLAG_STOPPED); } bool tick_nohz_tick_stopped_cpu(int cpu) { struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); return tick_sched_flag_test(ts, TS_FLAG_STOPPED); } /** * tick_nohz_update_jiffies - update jiffies when idle was interrupted * @now: current ktime_t * * Called from interrupt entry when the CPU was idle * * In case the sched_tick was stopped on this CPU, we have to check if jiffies * must be updated. Otherwise an interrupt handler could use a stale jiffy * value. We do this unconditionally on any CPU, as we don't know whether the * CPU, which has the update task assigned, is in a long sleep. 
*/ static void tick_nohz_update_jiffies(ktime_t now) { unsigned long flags; __this_cpu_write(tick_cpu_sched.idle_waketime, now); local_irq_save(flags); tick_do_update_jiffies64(now); local_irq_restore(flags); touch_softlockup_watchdog_sched(); } static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) { ktime_t delta; if (WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE))) return; delta = ktime_sub(now, ts->idle_entrytime); write_seqcount_begin(&ts->idle_sleeptime_seq); if (nr_iowait_cpu(smp_processor_id()) > 0) ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); else ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); ts->idle_entrytime = now; tick_sched_flag_clear(ts, TS_FLAG_IDLE_ACTIVE); write_seqcount_end(&ts->idle_sleeptime_seq); sched_clock_idle_wakeup_event(); } static void tick_nohz_start_idle(struct tick_sched *ts) { write_seqcount_begin(&ts->idle_sleeptime_seq); ts->idle_entrytime = ktime_get(); tick_sched_flag_set(ts, TS_FLAG_IDLE_ACTIVE); write_seqcount_end(&ts->idle_sleeptime_seq); sched_clock_idle_sleep_event(); } static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime, bool compute_delta, u64 *last_update_time) { ktime_t now, idle; unsigned int seq; if (!tick_nohz_active) return -1; now = ktime_get(); if (last_update_time) *last_update_time = ktime_to_us(now); do { seq = read_seqcount_begin(&ts->idle_sleeptime_seq); if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE) && compute_delta) { ktime_t delta = ktime_sub(now, ts->idle_entrytime); idle = ktime_add(*sleeptime, delta); } else { idle = *sleeptime; } } while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq)); return ktime_to_us(idle); } /** * get_cpu_idle_time_us - get the total idle time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. * * Return the cumulative idle time (since boot) for a given * CPU, in microseconds. Note that this is partially broken due to * the counter of iowait tasks that can be remotely updated without * any synchronization. Therefore it is possible to observe backward * values within two consecutive reads. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. * * Return: -1 if NOHZ is not enabled, else total idle time of the @cpu */ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime, !nr_iowait_cpu(cpu), last_update_time); } EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); /** * get_cpu_iowait_time_us - get the total iowait time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. * * Return the cumulative iowait time (since boot) for a given * CPU, in microseconds. Note this is partially broken due to * the counter of iowait tasks that can be remotely updated without * any synchronization. Therefore it is possible to observe backward * values within two consecutive reads. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. 
* * Return: -1 if NOHZ is not enabled, else total iowait time of @cpu */ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime, nr_iowait_cpu(cpu), last_update_time); } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); /* Simplified variant of hrtimer_forward_now() */ static ktime_t tick_forward_now(ktime_t expires, ktime_t now) { ktime_t delta = now - expires; if (likely(delta < TICK_NSEC)) return expires + TICK_NSEC; expires += TICK_NSEC * ktime_divns(delta, TICK_NSEC); if (expires > now) return expires; return expires + TICK_NSEC; } static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) { ktime_t expires = ts->last_tick; if (now >= expires) expires = tick_forward_now(expires, now); if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) { hrtimer_start(&ts->sched_timer, expires, HRTIMER_MODE_ABS_PINNED_HARD); } else { hrtimer_set_expires(&ts->sched_timer, expires); tick_program_event(expires, 1); } /* * Reset to make sure the next tick stop doesn't get fooled by past * cached clock deadline. */ ts->next_tick = 0; } static inline bool local_timer_softirq_pending(void) { return local_timers_pending() & BIT(TIMER_SOFTIRQ); } /* * Read jiffies and the time when jiffies were updated last */ u64 get_jiffies_update(unsigned long *basej) { unsigned long basejiff; unsigned int seq; u64 basemono; do { seq = read_seqcount_begin(&jiffies_seq); basemono = last_jiffies_update; basejiff = jiffies; } while (read_seqcount_retry(&jiffies_seq, seq)); *basej = basejiff; return basemono; } /** * tick_nohz_next_event() - return the clock monotonic based next event * @ts: pointer to tick_sched struct * @cpu: CPU number * * Return: * *%0 - When the next event is a maximum of TICK_NSEC in the future * and the tick is not stopped yet * *%next_event - Next event based on clock monotonic */ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) { u64 basemono, next_tick, delta, expires; unsigned long basejiff; int tick_cpu; basemono = get_jiffies_update(&basejiff); ts->last_jiffies = basejiff; ts->timer_expires_base = basemono; /* * Keep the periodic tick, when RCU, architecture or irq_work * requests it. * Aside of that, check whether the local timer softirq is * pending. If so, its a bad idea to call get_next_timer_interrupt(), * because there is an already expired timer, so it will request * immediate expiry, which rearms the hardware timer with a * minimal delta, which brings us back to this place * immediately. Lather, rinse and repeat... */ if (rcu_needs_cpu() || arch_needs_cpu() || irq_work_needs_cpu() || local_timer_softirq_pending()) { next_tick = basemono + TICK_NSEC; } else { /* * Get the next pending timer. If high resolution * timers are enabled this only takes the timer wheel * timers into account. If high resolution timers are * disabled this also looks at the next expiring * hrtimer. */ next_tick = get_next_timer_interrupt(basejiff, basemono); ts->next_timer = next_tick; } /* Make sure next_tick is never before basemono! */ if (WARN_ON_ONCE(basemono > next_tick)) next_tick = basemono; /* * If the tick is due in the next period, keep it ticking or * force prod the timer. */ delta = next_tick - basemono; if (delta <= (u64)TICK_NSEC) { /* * We've not stopped the tick yet, and there's a timer in the * next period, so no point in stopping it either, bail. 
*/ if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { ts->timer_expires = 0; goto out; } } /* * If this CPU is the one which had the do_timer() duty last, we limit * the sleep time to the timekeeping 'max_deferment' value. * Otherwise we can sleep as long as we want. */ delta = timekeeping_max_deferment(); tick_cpu = READ_ONCE(tick_do_timer_cpu); if (tick_cpu != cpu && (tick_cpu != TICK_DO_TIMER_NONE || !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST))) delta = KTIME_MAX; /* Calculate the next expiry time */ if (delta < (KTIME_MAX - basemono)) expires = basemono + delta; else expires = KTIME_MAX; ts->timer_expires = min_t(u64, expires, next_tick); out: return ts->timer_expires; } static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); unsigned long basejiff = ts->last_jiffies; u64 basemono = ts->timer_expires_base; bool timer_idle = tick_sched_flag_test(ts, TS_FLAG_STOPPED); int tick_cpu; u64 expires; /* Make sure we won't be trying to stop it twice in a row. */ ts->timer_expires_base = 0; /* * Now the tick should be stopped definitely - so the timer base needs * to be marked idle as well to not miss a newly queued timer. */ expires = timer_base_try_to_set_idle(basejiff, basemono, &timer_idle); if (expires > ts->timer_expires) { /* * This path could only happen when the first timer was removed * between calculating the possible sleep length and now (when * high resolution mode is not active, timer could also be a * hrtimer). * * We have to stick to the original calculated expiry value to * not stop the tick for too long with a shallow C-state (which * was programmed by cpuidle because of an early next expiration * value). */ expires = ts->timer_expires; } /* If the timer base is not idle, retain the not yet stopped tick. */ if (!timer_idle) return; /* * If this CPU is the one which updates jiffies, then give up * the assignment and let it be taken by the CPU which runs * the tick timer next, which might be this CPU as well. If we * don't drop this here, the jiffies might be stale and * do_timer() never gets invoked. Keep track of the fact that it * was the one which had the do_timer() duty last. */ tick_cpu = READ_ONCE(tick_do_timer_cpu); if (tick_cpu == cpu) { WRITE_ONCE(tick_do_timer_cpu, TICK_DO_TIMER_NONE); tick_sched_flag_set(ts, TS_FLAG_DO_TIMER_LAST); } else if (tick_cpu != TICK_DO_TIMER_NONE) { tick_sched_flag_clear(ts, TS_FLAG_DO_TIMER_LAST); } /* Skip reprogram of event if it's not changed */ if (tick_sched_flag_test(ts, TS_FLAG_STOPPED) && (expires == ts->next_tick)) { /* Sanity check: make sure clockevent is actually programmed */ if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) return; WARN_ONCE(1, "basemono: %llu ts->next_tick: %llu dev->next_event: %llu " "timer->active: %d timer->expires: %llu\n", basemono, ts->next_tick, dev->next_event, hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer)); } /* * tick_nohz_stop_tick() can be called several times before * tick_nohz_restart_sched_tick() is called. This happens when * interrupts arrive which do not cause a reschedule. In the first * call we save the current tick time, so we can restart the * scheduler tick in tick_nohz_restart_sched_tick(). 
*/ if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { calc_load_nohz_start(); quiet_vmstat(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); tick_sched_flag_set(ts, TS_FLAG_STOPPED); trace_tick_stop(1, TICK_DEP_MASK_NONE); } ts->next_tick = expires; /* * If the expiration time == KTIME_MAX, then we simply stop * the tick timer. */ if (unlikely(expires == KTIME_MAX)) { if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) hrtimer_cancel(&ts->sched_timer); else tick_program_event(KTIME_MAX, 1); return; } if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) { hrtimer_start(&ts->sched_timer, expires, HRTIMER_MODE_ABS_PINNED_HARD); } else { hrtimer_set_expires(&ts->sched_timer, expires); tick_program_event(expires, 1); } } static void tick_nohz_retain_tick(struct tick_sched *ts) { ts->timer_expires_base = 0; } #ifdef CONFIG_NO_HZ_FULL static void tick_nohz_full_stop_tick(struct tick_sched *ts, int cpu) { if (tick_nohz_next_event(ts, cpu)) tick_nohz_stop_tick(ts, cpu); else tick_nohz_retain_tick(ts); } #endif /* CONFIG_NO_HZ_FULL */ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ tick_do_update_jiffies64(now); /* * Clear the timer idle flag, so we avoid IPIs on remote queueing and * the clock forward checks in the enqueue path: */ timer_clear_idle(); calc_load_nohz_stop(); touch_softlockup_watchdog_sched(); /* Cancel the scheduled timer and restore the tick: */ tick_sched_flag_clear(ts, TS_FLAG_STOPPED); tick_nohz_restart(ts, now); } static void __tick_nohz_full_update_tick(struct tick_sched *ts, ktime_t now) { #ifdef CONFIG_NO_HZ_FULL int cpu = smp_processor_id(); if (can_stop_full_tick(cpu, ts)) tick_nohz_full_stop_tick(ts, cpu); else if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) tick_nohz_restart_sched_tick(ts, now); #endif } static void tick_nohz_full_update_tick(struct tick_sched *ts) { if (!tick_nohz_full_cpu(smp_processor_id())) return; if (!tick_sched_flag_test(ts, TS_FLAG_NOHZ)) return; __tick_nohz_full_update_tick(ts, ktime_get()); } /* * A pending softirq outside an IRQ (or softirq disabled section) context * should be waiting for ksoftirqd to handle it. Therefore we shouldn't * reach this code due to the need_resched() early check in can_stop_idle_tick(). * * However if we are between CPUHP_AP_SMPBOOT_THREADS and CPU_TEARDOWN_CPU on the * cpu_down() process, softirqs can still be raised while ksoftirqd is parked, * triggering the code below, since wakeup_softirqd() is ignored. 
* */ static bool report_idle_softirq(void) { static int ratelimit; unsigned int pending = local_softirq_pending(); if (likely(!pending)) return false; /* Some softirqs claim to be safe against hotplug and ksoftirqd parking */ if (!cpu_active(smp_processor_id())) { pending &= ~SOFTIRQ_HOTPLUG_SAFE_MASK; if (!pending) return false; } /* On RT, softirq handling may be waiting on some lock */ if (local_bh_blocked()) return false; if (ratelimit < 10) { pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", pending); ratelimit++; } return true; } static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) { WARN_ON_ONCE(cpu_is_offline(cpu)); if (unlikely(!tick_sched_flag_test(ts, TS_FLAG_NOHZ))) return false; if (need_resched()) return false; if (unlikely(report_idle_softirq())) return false; if (tick_nohz_full_enabled()) { int tick_cpu = READ_ONCE(tick_do_timer_cpu); /* * Keep the tick alive to guarantee timekeeping progression * if there are full dynticks CPUs around */ if (tick_cpu == cpu) return false; /* Should not happen for nohz-full */ if (WARN_ON_ONCE(tick_cpu == TICK_DO_TIMER_NONE)) return false; } return true; } /** * tick_nohz_idle_stop_tick - stop the idle tick from the idle task * * When the next event is more than a tick into the future, stop the idle tick */ void tick_nohz_idle_stop_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); int cpu = smp_processor_id(); ktime_t expires; /* * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the * tick timer expiration time is known already. */ if (ts->timer_expires_base) expires = ts->timer_expires; else if (can_stop_idle_tick(cpu, ts)) expires = tick_nohz_next_event(ts, cpu); else return; ts->idle_calls++; if (expires > 0LL) { int was_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED); tick_nohz_stop_tick(ts, cpu); ts->idle_sleeps++; ts->idle_expires = expires; if (!was_stopped && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { ts->idle_jiffies = ts->last_jiffies; nohz_balance_enter_idle(cpu); } } else { tick_nohz_retain_tick(ts); } } void tick_nohz_idle_retain_tick(void) { tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched)); } /** * tick_nohz_idle_enter - prepare for entering idle on the current CPU * * Called when we start the idle loop. */ void tick_nohz_idle_enter(void) { struct tick_sched *ts; lockdep_assert_irqs_enabled(); local_irq_disable(); ts = this_cpu_ptr(&tick_cpu_sched); WARN_ON_ONCE(ts->timer_expires_base); tick_sched_flag_set(ts, TS_FLAG_INIDLE); tick_nohz_start_idle(ts); local_irq_enable(); } /** * tick_nohz_irq_exit - Notify the tick about IRQ exit * * A timer may have been added/modified/deleted either by the current IRQ, * or by another place using this IRQ as a notification. This IRQ may have * also updated the RCU callback list. These events may require a * re-evaluation of the next tick. Depending on the context: * * 1) If the CPU is idle and no resched is pending, just proceed with idle * time accounting. The next tick will be re-evaluated on the next idle * loop iteration. * * 2) If the CPU is nohz_full: * * 2.1) If there is any tick dependency, restart the tick if stopped. * * 2.2) If there is no tick dependency, (re-)evaluate the next tick and * stop/update it accordingly. 
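 *
 * For instance (an illustrative addition, not part of the original comment):
 * on a nohz_full CPU with the tick stopped, an IRQ that merely queues a timer
 * falls under 2.2 and only has the next expiry re-evaluated, while an IRQ
 * whose handler sets a tick dependency (e.g. tick_dep_set_task()) falls under
 * 2.1 and causes the tick to be restarted.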
*/ void tick_nohz_irq_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (tick_sched_flag_test(ts, TS_FLAG_INIDLE)) tick_nohz_start_idle(ts); else tick_nohz_full_update_tick(ts); } /** * tick_nohz_idle_got_tick - Check whether or not the tick handler has run * * Return: %true if the tick handler has run, otherwise %false */ bool tick_nohz_idle_got_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->got_idle_tick) { ts->got_idle_tick = 0; return true; } return false; } /** * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer * or the tick, whichever expires first. Note that, if the tick has been * stopped, it returns the next hrtimer. * * Called from power state control code with interrupts disabled * * Return: the next expiration time */ ktime_t tick_nohz_get_next_hrtimer(void) { return __this_cpu_read(tick_cpu_device.evtdev)->next_event; } /** * tick_nohz_get_sleep_length - return the expected length of the current sleep * @delta_next: duration until the next event if the tick cannot be stopped * * Called from power state control code with interrupts disabled. * * The return value of this function and/or the value returned by it through the * @delta_next pointer can be negative which must be taken into account by its * callers. * * Return: the expected length of the current sleep */ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); int cpu = smp_processor_id(); /* * The idle entry time is expected to be a sufficient approximation of * the current time at this point. */ ktime_t now = ts->idle_entrytime; ktime_t next_event; WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE)); *delta_next = ktime_sub(dev->next_event, now); if (!can_stop_idle_tick(cpu, ts)) return *delta_next; next_event = tick_nohz_next_event(ts, cpu); if (!next_event) return *delta_next; /* * If the next highres timer to expire is earlier than 'next_event', the * idle governor needs to know that. */ next_event = min_t(u64, next_event, hrtimer_next_event_without(&ts->sched_timer)); return ktime_sub(next_event, now); } /** * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value * for a particular CPU. * @cpu: target CPU number * * Called from the schedutil frequency scaling governor in scheduler context. * * Return: the current idle calls counter value for @cpu */ unsigned long tick_nohz_get_idle_calls_cpu(int cpu) { struct tick_sched *ts = tick_get_tick_sched(cpu); return ts->idle_calls; } static void tick_nohz_account_idle_time(struct tick_sched *ts, ktime_t now) { unsigned long ticks; ts->idle_exittime = now; if (vtime_accounting_enabled_this_cpu()) return; /* * We stopped the tick in idle. update_process_times() would miss the * time we slept, as it does only a 1 tick accounting. * Enforce that this is accounted to idle ! */ ticks = jiffies - ts->idle_jiffies; /* * We might be one off. Do not randomly account a huge number of ticks! 
*/ if (ticks && ticks < LONG_MAX) account_idle_ticks(ticks); } void tick_nohz_idle_restart_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { ktime_t now = ktime_get(); tick_nohz_restart_sched_tick(ts, now); tick_nohz_account_idle_time(ts, now); } } static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now) { if (tick_nohz_full_cpu(smp_processor_id())) __tick_nohz_full_update_tick(ts, now); else tick_nohz_restart_sched_tick(ts, now); tick_nohz_account_idle_time(ts, now); } /** * tick_nohz_idle_exit - Update the tick upon idle task exit * * When the idle task exits, update the tick depending on the * following situations: * * 1) If the CPU is not in nohz_full mode (most cases), then * restart the tick. * * 2) If the CPU is in nohz_full mode (corner case): * 2.1) If the tick can be kept stopped (no tick dependencies) * then re-evaluate the next tick and try to keep it stopped * as long as possible. * 2.2) If the tick has dependencies, restart the tick. * */ void tick_nohz_idle_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); bool idle_active, tick_stopped; ktime_t now; local_irq_disable(); WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE)); WARN_ON_ONCE(ts->timer_expires_base); tick_sched_flag_clear(ts, TS_FLAG_INIDLE); idle_active = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE); tick_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED); if (idle_active || tick_stopped) now = ktime_get(); if (idle_active) tick_nohz_stop_idle(ts, now); if (tick_stopped) tick_nohz_idle_update_tick(ts, now); local_irq_enable(); } /* * In low-resolution mode, the tick handler must be implemented directly * at the clockevent level. hrtimer can't be used instead, because its * infrastructure actually relies on the tick itself as a backend in * low-resolution mode (see hrtimer_run_queues()). */ static void tick_nohz_lowres_handler(struct clock_event_device *dev) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); dev->next_event = KTIME_MAX; if (likely(tick_nohz_handler(&ts->sched_timer) == HRTIMER_RESTART)) tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } static inline void tick_nohz_activate(struct tick_sched *ts) { if (!tick_nohz_enabled) return; tick_sched_flag_set(ts, TS_FLAG_NOHZ); /* One update is enough */ if (!test_and_set_bit(0, &tick_nohz_active)) timers_update_nohz(); } /** * tick_nohz_switch_to_nohz - switch to NOHZ mode */ static void tick_nohz_switch_to_nohz(void) { if (!tick_nohz_enabled) return; if (tick_switch_to_oneshot(tick_nohz_lowres_handler)) return; /* * Recycle the hrtimer in 'ts', so we can share the * highres code. */ tick_setup_sched_timer(false); } static inline void tick_nohz_irq_enter(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now; if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED | TS_FLAG_IDLE_ACTIVE)) return; now = ktime_get(); if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)) tick_nohz_stop_idle(ts, now); /* * If all CPUs are idle we may need to update a stale jiffies value. * Note nohz_full is a special case: a timekeeper is guaranteed to stay * alive but it might be busy looping with interrupts disabled in some * rare case (typically stop machine). So we must make sure we have a * last resort. 
*/ if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) tick_nohz_update_jiffies(now); } #else static inline void tick_nohz_switch_to_nohz(void) { } static inline void tick_nohz_irq_enter(void) { } static inline void tick_nohz_activate(struct tick_sched *ts) { } #endif /* CONFIG_NO_HZ_COMMON */ /* * Called from irq_enter() to notify about the possible interruption of idle() */ void tick_irq_enter(void) { tick_check_oneshot_broadcast_this_cpu(); tick_nohz_irq_enter(); } static int sched_skew_tick; static int __init skew_tick(char *str) { get_option(&str, &sched_skew_tick); return 0; } early_param("skew_tick", skew_tick); /** * tick_setup_sched_timer - setup the tick emulation timer * @hrtimer: whether to use the hrtimer or not */ void tick_setup_sched_timer(bool hrtimer) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); /* Emulate tick processing via per-CPU hrtimers: */ hrtimer_setup(&ts->sched_timer, tick_nohz_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) tick_sched_flag_set(ts, TS_FLAG_HIGHRES); /* Get the next period (per-CPU) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); /* Offset the tick to avert 'jiffies_lock' contention. */ if (sched_skew_tick) { u64 offset = TICK_NSEC >> 1; do_div(offset, num_possible_cpus()); offset *= smp_processor_id(); hrtimer_add_expires_ns(&ts->sched_timer, offset); } hrtimer_forward_now(&ts->sched_timer, TICK_NSEC); if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); else tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); tick_nohz_activate(ts); } /* * Shut down the tick and make sure the CPU won't try to retake the timekeeping * duty before disabling IRQs in idle for the last time. */ void tick_sched_timer_dying(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t idle_sleeptime, iowait_sleeptime; unsigned long idle_calls, idle_sleeps; /* This must happen before hrtimers are migrated! */ if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) hrtimer_cancel(&ts->sched_timer); idle_sleeptime = ts->idle_sleeptime; iowait_sleeptime = ts->iowait_sleeptime; idle_calls = ts->idle_calls; idle_sleeps = ts->idle_sleeps; memset(ts, 0, sizeof(*ts)); ts->idle_sleeptime = idle_sleeptime; ts->iowait_sleeptime = iowait_sleeptime; ts->idle_calls = idle_calls; ts->idle_sleeps = idle_sleeps; } /* * Async notification about clocksource changes */ void tick_clock_notify(void) { int cpu; for_each_possible_cpu(cpu) set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks); } /* * Async notification about clock event changes */ void tick_oneshot_notify(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); set_bit(0, &ts->check_clocks); } /* * Check if a change happened, which makes oneshot possible. * * Called cyclically from the hrtimer softirq (driven by the timer * softirq). 'allow_nohz' signals that we can switch into low-res NOHZ * mode, because high resolution timers are disabled (either compile * or runtime). Called with interrupts disabled. */ int tick_check_oneshot_change(int allow_nohz) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (!test_and_clear_bit(0, &ts->check_clocks)) return 0; if (tick_sched_flag_test(ts, TS_FLAG_NOHZ)) return 0; if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available()) return 0; if (!allow_nohz) return 1; tick_nohz_switch_to_nohz(); return 0; }
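/*
 * Illustrative sketch (not part of the original file): a simplified view of
 * how the idle loop in kernel/sched/idle.c is expected to drive the NOHZ
 * entry points above, with error handling and the polling cases omitted:
 *
 *	tick_nohz_idle_enter();
 *	while (!need_resched()) {
 *		// the cpuidle governor may consult tick_nohz_get_sleep_length()
 *		tick_nohz_idle_stop_tick();	// or tick_nohz_idle_retain_tick()
 *		// ... enter a low-power state, wake up on an interrupt ...
 *		tick_nohz_idle_restart_tick();	// only if the tick must run again
 *	}
 *	tick_nohz_idle_exit();
 */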
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the Interfaces handler. * * Version: @(#)dev.h 1.0.10 08/12/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Donald J. Becker, <becker@cesdis.gsfc.nasa.gov> * Alan Cox, <alan@lxorguk.ukuu.org.uk> * Bjorn Ekwall. 
<bj0rn@blox.se> * Pekka Riikonen <priikone@poseidon.pspt.fi> * * Moved to /usr/include/linux for NET3 */ #ifndef _LINUX_NETDEVICE_H #define _LINUX_NETDEVICE_H #include <linux/timer.h> #include <linux/bug.h> #include <linux/delay.h> #include <linux/atomic.h> #include <linux/prefetch.h> #include <asm/cache.h> #include <asm/byteorder.h> #include <asm/local.h> #include <linux/percpu.h> #include <linux/rculist.h> #include <linux/workqueue.h> #include <linux/dynamic_queue_limits.h> #include <net/net_namespace.h> #ifdef CONFIG_DCB #include <net/dcbnl.h> #endif #include <net/netprio_cgroup.h> #include <linux/netdev_features.h> #include <linux/neighbour.h> #include <linux/netdevice_xmit.h> #include <uapi/linux/netdevice.h> #include <uapi/linux/if_bonding.h> #include <uapi/linux/pkt_cls.h> #include <uapi/linux/netdev.h> #include <linux/hashtable.h> #include <linux/rbtree.h> #include <net/net_trackers.h> #include <net/net_debug.h> #include <net/dropreason-core.h> #include <net/neighbour_tables.h> struct netpoll_info; struct device; struct ethtool_ops; struct kernel_hwtstamp_config; struct phy_device; struct dsa_port; struct ip_tunnel_parm_kern; struct macsec_context; struct macsec_ops; struct netdev_config; struct netdev_name_node; struct sd_flow_limit; struct sfp_bus; /* 802.11 specific */ struct wireless_dev; /* 802.15.4 specific */ struct wpan_dev; struct mpls_dev; /* UDP Tunnel offloads */ struct udp_tunnel_info; struct udp_tunnel_nic_info; struct udp_tunnel_nic; struct bpf_prog; struct xdp_buff; struct xdp_frame; struct xdp_metadata_ops; struct xdp_md; struct ethtool_netdev_state; struct phy_link_topology; struct hwtstamp_provider; typedef u32 xdp_features_t; void synchronize_net(void); void netdev_set_default_ethtool_ops(struct net_device *dev, const struct ethtool_ops *ops); void netdev_sw_irq_coalesce_default_on(struct net_device *dev); /* Backlog congestion levels */ #define NET_RX_SUCCESS 0 /* keep 'em coming, baby */ #define NET_RX_DROP 1 /* packet dropped */ #define MAX_NEST_DEV 8 /* * Transmit return codes: transmit return codes originate from three different * namespaces: * * - qdisc return codes * - driver transmit return codes * - errno values * * Drivers are allowed to return any one of those in their hard_start_xmit() * function. Real network devices commonly used with qdiscs should only return * the driver transmit return codes though - when qdiscs are used, the actual * transmission happens asynchronously, so the value is not propagated to * higher layers. Virtual network devices transmit synchronously; in this case * the driver transmit return codes are consumed by dev_queue_xmit(), and all * others are propagated to higher layers. */ /* qdisc ->enqueue() return codes. */ #define NET_XMIT_SUCCESS 0x00 #define NET_XMIT_DROP 0x01 /* skb dropped */ #define NET_XMIT_CN 0x02 /* congestion notification */ #define NET_XMIT_MASK 0x0f /* qdisc flags in net/sch_generic.h */ /* NET_XMIT_CN is special. It does not guarantee that this packet is lost. It * indicates that the device will soon be dropping packets, or already drops * some packets of the same priority; prompting us to send less aggressively. */ #define net_xmit_eval(e) ((e) == NET_XMIT_CN ? 0 : (e)) #define net_xmit_errno(e) ((e) != NET_XMIT_CN ? 
-ENOBUFS : 0) /* Driver transmit return codes */ #define NETDEV_TX_MASK 0xf0 enum netdev_tx { __NETDEV_TX_MIN = INT_MIN, /* make sure enum is signed */ NETDEV_TX_OK = 0x00, /* driver took care of packet */ NETDEV_TX_BUSY = 0x10, /* driver tx path was busy*/ }; typedef enum netdev_tx netdev_tx_t; /* * Current order: NETDEV_TX_MASK > NET_XMIT_MASK >= 0 is significant; * hard_start_xmit() return < NET_XMIT_MASK means skb was consumed. */ static inline bool dev_xmit_complete(int rc) { /* * Positive cases with an skb consumed by a driver: * - successful transmission (rc == NETDEV_TX_OK) * - error while transmitting (rc < 0) * - error while queueing to a different device (rc & NET_XMIT_MASK) */ if (likely(rc < NET_XMIT_MASK)) return true; return false; } /* * Compute the worst-case header length according to the protocols * used. */ #if defined(CONFIG_HYPERV_NET) # define LL_MAX_HEADER 128 #elif defined(CONFIG_WLAN) || IS_ENABLED(CONFIG_AX25) # if defined(CONFIG_MAC80211_MESH) # define LL_MAX_HEADER 128 # else # define LL_MAX_HEADER 96 # endif #else # define LL_MAX_HEADER 32 #endif #if !IS_ENABLED(CONFIG_NET_IPIP) && !IS_ENABLED(CONFIG_NET_IPGRE) && \ !IS_ENABLED(CONFIG_IPV6_SIT) && !IS_ENABLED(CONFIG_IPV6_TUNNEL) #define MAX_HEADER LL_MAX_HEADER #else #define MAX_HEADER (LL_MAX_HEADER + 48) #endif /* * Old network device statistics. Fields are native words * (unsigned long) so they can be read and written atomically. */ #define NET_DEV_STAT(FIELD) \ union { \ unsigned long FIELD; \ atomic_long_t __##FIELD; \ } struct net_device_stats { NET_DEV_STAT(rx_packets); NET_DEV_STAT(tx_packets); NET_DEV_STAT(rx_bytes); NET_DEV_STAT(tx_bytes); NET_DEV_STAT(rx_errors); NET_DEV_STAT(tx_errors); NET_DEV_STAT(rx_dropped); NET_DEV_STAT(tx_dropped); NET_DEV_STAT(multicast); NET_DEV_STAT(collisions); NET_DEV_STAT(rx_length_errors); NET_DEV_STAT(rx_over_errors); NET_DEV_STAT(rx_crc_errors); NET_DEV_STAT(rx_frame_errors); NET_DEV_STAT(rx_fifo_errors); NET_DEV_STAT(rx_missed_errors); NET_DEV_STAT(tx_aborted_errors); NET_DEV_STAT(tx_carrier_errors); NET_DEV_STAT(tx_fifo_errors); NET_DEV_STAT(tx_heartbeat_errors); NET_DEV_STAT(tx_window_errors); NET_DEV_STAT(rx_compressed); NET_DEV_STAT(tx_compressed); }; #undef NET_DEV_STAT /* per-cpu stats, allocated on demand. * Try to fit them in a single cache line, for dev_get_stats() sake. 
*/ struct net_device_core_stats { unsigned long rx_dropped; unsigned long tx_dropped; unsigned long rx_nohandler; unsigned long rx_otherhost_dropped; } __aligned(4 * sizeof(unsigned long)); #include <linux/cache.h> #include <linux/skbuff.h> struct neighbour; struct neigh_parms; struct sk_buff; struct netdev_hw_addr { struct list_head list; struct rb_node node; unsigned char addr[MAX_ADDR_LEN]; unsigned char type; #define NETDEV_HW_ADDR_T_LAN 1 #define NETDEV_HW_ADDR_T_SAN 2 #define NETDEV_HW_ADDR_T_UNICAST 3 #define NETDEV_HW_ADDR_T_MULTICAST 4 bool global_use; int sync_cnt; int refcount; int synced; struct rcu_head rcu_head; }; struct netdev_hw_addr_list { struct list_head list; int count; /* Auxiliary tree for faster lookup on addition and deletion */ struct rb_root tree; }; #define netdev_hw_addr_list_count(l) ((l)->count) #define netdev_hw_addr_list_empty(l) (netdev_hw_addr_list_count(l) == 0) #define netdev_hw_addr_list_for_each(ha, l) \ list_for_each_entry(ha, &(l)->list, list) #define netdev_uc_count(dev) netdev_hw_addr_list_count(&(dev)->uc) #define netdev_uc_empty(dev) netdev_hw_addr_list_empty(&(dev)->uc) #define netdev_for_each_uc_addr(ha, dev) \ netdev_hw_addr_list_for_each(ha, &(dev)->uc) #define netdev_for_each_synced_uc_addr(_ha, _dev) \ netdev_for_each_uc_addr((_ha), (_dev)) \ if ((_ha)->sync_cnt) #define netdev_mc_count(dev) netdev_hw_addr_list_count(&(dev)->mc) #define netdev_mc_empty(dev) netdev_hw_addr_list_empty(&(dev)->mc) #define netdev_for_each_mc_addr(ha, dev) \ netdev_hw_addr_list_for_each(ha, &(dev)->mc) #define netdev_for_each_synced_mc_addr(_ha, _dev) \ netdev_for_each_mc_addr((_ha), (_dev)) \ if ((_ha)->sync_cnt) struct hh_cache { unsigned int hh_len; seqlock_t hh_lock; /* cached hardware header; allow for machine alignment needs. */ #define HH_DATA_MOD 16 #define HH_DATA_OFF(__len) \ (HH_DATA_MOD - (((__len - 1) & (HH_DATA_MOD - 1)) + 1)) #define HH_DATA_ALIGN(__len) \ (((__len)+(HH_DATA_MOD-1))&~(HH_DATA_MOD - 1)) unsigned long hh_data[HH_DATA_ALIGN(LL_MAX_HEADER) / sizeof(long)]; }; /* Reserve HH_DATA_MOD byte-aligned hard_header_len, but at least that much. * Alternative is: * dev->hard_header_len ? (dev->hard_header_len + * (HH_DATA_MOD - 1)) & ~(HH_DATA_MOD - 1) : 0 * * We could use other alignment values, but we must maintain the * relationship HH alignment <= LL alignment. */ #define LL_RESERVED_SPACE(dev) \ ((((dev)->hard_header_len + READ_ONCE((dev)->needed_headroom)) \ & ~(HH_DATA_MOD - 1)) + HH_DATA_MOD) #define LL_RESERVED_SPACE_EXTRA(dev,extra) \ ((((dev)->hard_header_len + READ_ONCE((dev)->needed_headroom) + (extra)) \ & ~(HH_DATA_MOD - 1)) + HH_DATA_MOD) struct header_ops { int (*create) (struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len); int (*parse)(const struct sk_buff *skb, const struct net_device *dev, unsigned char *haddr); int (*cache)(const struct neighbour *neigh, struct hh_cache *hh, __be16 type); void (*cache_update)(struct hh_cache *hh, const struct net_device *dev, const unsigned char *haddr); bool (*validate)(const char *ll_header, unsigned int len); __be16 (*parse_protocol)(const struct sk_buff *skb); }; /* These flag bits are private to the generic network queueing * layer; they may not be explicitly referenced by any other * code. 
*/ enum netdev_state_t { __LINK_STATE_START, __LINK_STATE_PRESENT, __LINK_STATE_NOCARRIER, __LINK_STATE_LINKWATCH_PENDING, __LINK_STATE_DORMANT, __LINK_STATE_TESTING, }; struct gro_list { struct list_head list; int count; }; /* * size of gro hash buckets, must be <= the number of bits in * gro_node::bitmask */ #define GRO_HASH_BUCKETS 8 /** * struct gro_node - structure to support Generic Receive Offload * @bitmask: bitmask to indicate used buckets in @hash * @hash: hashtable of pending aggregated skbs, separated by flows * @rx_list: list of pending ``GRO_NORMAL`` skbs * @rx_count: cached current length of @rx_list * @cached_napi_id: napi_struct::napi_id cached for hotpath, 0 for standalone */ struct gro_node { unsigned long bitmask; struct gro_list hash[GRO_HASH_BUCKETS]; struct list_head rx_list; u32 rx_count; u32 cached_napi_id; }; /* * Structure for per-NAPI config */ struct napi_config { u64 gro_flush_timeout; u64 irq_suspend_timeout; u32 defer_hard_irqs; cpumask_t affinity_mask; u8 threaded; unsigned int napi_id; }; /* * Structure for NAPI scheduling similar to tasklet but with weighting */ struct napi_struct { /* This field should be first or softnet_data.backlog needs tweaks. */ unsigned long state; /* The poll_list must only be managed by the entity which * changes the state of the NAPI_STATE_SCHED bit. This means * whoever atomically sets that bit can add this napi_struct * to the per-CPU poll_list, and whoever clears that bit * can remove from the list right before clearing the bit. */ struct list_head poll_list; int weight; u32 defer_hard_irqs_count; int (*poll)(struct napi_struct *, int); #ifdef CONFIG_NETPOLL /* CPU actively polling if netpoll is configured */ int poll_owner; #endif /* CPU on which NAPI has been scheduled for processing */ int list_owner; struct net_device *dev; struct sk_buff *skb; struct gro_node gro; struct hrtimer timer; /* all fields past this point are write-protected by netdev_lock */ struct task_struct *thread; unsigned long gro_flush_timeout; unsigned long irq_suspend_timeout; u32 defer_hard_irqs; /* control-path-only fields follow */ u32 napi_id; struct list_head dev_list; struct hlist_node napi_hash_node; int irq; struct irq_affinity_notify notify; int napi_rmap_idx; int index; struct napi_config *config; }; enum { NAPI_STATE_SCHED, /* Poll is scheduled */ NAPI_STATE_MISSED, /* reschedule a napi */ NAPI_STATE_DISABLE, /* Disable pending */ NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ NAPI_STATE_LISTED, /* NAPI added to system lists */ NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no busy polling */ NAPI_STATE_IN_BUSY_POLL, /* Do not rearm NAPI interrupt */ NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/ NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/ NAPI_STATE_SCHED_THREADED, /* Napi is currently scheduled in threaded mode */ NAPI_STATE_HAS_NOTIFIER, /* Napi has an IRQ notifier */ NAPI_STATE_THREADED_BUSY_POLL, /* The threaded NAPI poller will busy poll */ }; enum { NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED), NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED), NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE), NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC), NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED), NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), NAPIF_STATE_PREFER_BUSY_POLL = BIT(NAPI_STATE_PREFER_BUSY_POLL), NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED), NAPIF_STATE_SCHED_THREADED = BIT(NAPI_STATE_SCHED_THREADED), 
NAPIF_STATE_HAS_NOTIFIER = BIT(NAPI_STATE_HAS_NOTIFIER), NAPIF_STATE_THREADED_BUSY_POLL = BIT(NAPI_STATE_THREADED_BUSY_POLL), }; enum gro_result { GRO_MERGED, GRO_MERGED_FREE, GRO_HELD, GRO_NORMAL, GRO_CONSUMED, }; typedef enum gro_result gro_result_t; /* * enum rx_handler_result - Possible return values for rx_handlers. * @RX_HANDLER_CONSUMED: skb was consumed by rx_handler, do not process it * further. * @RX_HANDLER_ANOTHER: Do another round in receive path. This is indicated in * case skb->dev was changed by rx_handler. * @RX_HANDLER_EXACT: Force exact delivery, no wildcard. * @RX_HANDLER_PASS: Do nothing, pass the skb as if no rx_handler was called. * * rx_handlers are functions called from inside __netif_receive_skb(), to do * special processing of the skb, prior to delivery to protocol handlers. * * Currently, a net_device can only have a single rx_handler registered. Trying * to register a second rx_handler will return -EBUSY. * * To register a rx_handler on a net_device, use netdev_rx_handler_register(). * To unregister a rx_handler on a net_device, use * netdev_rx_handler_unregister(). * * Upon return, rx_handler is expected to tell __netif_receive_skb() what to * do with the skb. * * If the rx_handler consumed the skb in some way, it should return * RX_HANDLER_CONSUMED. This is appropriate when the rx_handler arranged for * the skb to be delivered in some other way. * * If the rx_handler changed skb->dev, to divert the skb to another * net_device, it should return RX_HANDLER_ANOTHER. The rx_handler for the * new device will be called if it exists. * * If the rx_handler decides the skb should be ignored, it should return * RX_HANDLER_EXACT. The skb will only be delivered to protocol handlers that * are registered on exact device (ptype->dev == skb->dev). * * If the rx_handler didn't change skb->dev, but wants the skb to be normally * delivered, it should return RX_HANDLER_PASS. * * A device without a registered rx_handler will behave as if rx_handler * returned RX_HANDLER_PASS. */ enum rx_handler_result { RX_HANDLER_CONSUMED, RX_HANDLER_ANOTHER, RX_HANDLER_EXACT, RX_HANDLER_PASS, }; typedef enum rx_handler_result rx_handler_result_t; typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb); void __napi_schedule(struct napi_struct *n); void __napi_schedule_irqoff(struct napi_struct *n); static inline bool napi_disable_pending(struct napi_struct *n) { return test_bit(NAPI_STATE_DISABLE, &n->state); } static inline bool napi_prefer_busy_poll(struct napi_struct *n) { return test_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state); } /** * napi_is_scheduled - test if NAPI is scheduled * @n: NAPI context * * This check is "best-effort". With no locking implemented, * a NAPI can be scheduled or terminate right after this check * and produce not precise results. * * NAPI_STATE_SCHED is an internal state, napi_is_scheduled * should not be used normally and napi_schedule should be * used instead. * * Use only if the driver really needs to check if a NAPI * is scheduled for example in the context of delayed timer * that can be skipped if a NAPI is already scheduled. * * Return: True if NAPI is scheduled, False otherwise. */ static inline bool napi_is_scheduled(struct napi_struct *n) { return test_bit(NAPI_STATE_SCHED, &n->state); } bool napi_schedule_prep(struct napi_struct *n); /** * napi_schedule - schedule NAPI poll * @n: NAPI context * * Schedule NAPI poll routine to be called if it is not already * running. * Return: true if we schedule a NAPI or false if not. 
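 *
 * A typical caller is the device interrupt handler, which masks the queue
 * interrupt and hands processing over to the poll routine.  Sketch only;
 * struct foo_rxq and foo_disable_irq() stand in for driver-specific pieces:
 *
 *	static irqreturn_t foo_intr(int irq, void *data)
 *	{
 *		struct foo_rxq *rxq = data;
 *
 *		foo_disable_irq(rxq);
 *		napi_schedule(&rxq->napi);
 *		return IRQ_HANDLED;
 *	}
 *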
* Refer to napi_schedule_prep() for additional reason on why * a NAPI might not be scheduled. */ static inline bool napi_schedule(struct napi_struct *n) { if (napi_schedule_prep(n)) { __napi_schedule(n); return true; } return false; } /** * napi_schedule_irqoff - schedule NAPI poll * @n: NAPI context * * Variant of napi_schedule(), assuming hard irqs are masked. */ static inline void napi_schedule_irqoff(struct napi_struct *n) { if (napi_schedule_prep(n)) __napi_schedule_irqoff(n); } /** * napi_complete_done - NAPI processing complete * @n: NAPI context * @work_done: number of packets processed * * Mark NAPI processing as complete. Should only be called if poll budget * has not been completely consumed. * Prefer over napi_complete(). * Return: false if device should avoid rearming interrupts. */ bool napi_complete_done(struct napi_struct *n, int work_done); static inline bool napi_complete(struct napi_struct *n) { return napi_complete_done(n, 0); } void netif_threaded_enable(struct net_device *dev); int dev_set_threaded(struct net_device *dev, enum netdev_napi_threaded threaded); void napi_disable(struct napi_struct *n); void napi_disable_locked(struct napi_struct *n); void napi_enable(struct napi_struct *n); void napi_enable_locked(struct napi_struct *n); /** * napi_synchronize - wait until NAPI is not running * @n: NAPI context * * Wait until NAPI is done being scheduled on this context. * Waits till any outstanding processing completes but * does not disable future activations. */ static inline void napi_synchronize(const struct napi_struct *n) { if (IS_ENABLED(CONFIG_SMP)) while (test_bit(NAPI_STATE_SCHED, &n->state)) msleep(1); else barrier(); } /** * napi_if_scheduled_mark_missed - if napi is running, set the * NAPIF_STATE_MISSED * @n: NAPI context * * If napi is running, set the NAPIF_STATE_MISSED, and return true if * NAPI is scheduled. **/ static inline bool napi_if_scheduled_mark_missed(struct napi_struct *n) { unsigned long val, new; val = READ_ONCE(n->state); do { if (val & NAPIF_STATE_DISABLE) return true; if (!(val & NAPIF_STATE_SCHED)) return false; new = val | NAPIF_STATE_MISSED; } while (!try_cmpxchg(&n->state, &val, new)); return true; } enum netdev_queue_state_t { __QUEUE_STATE_DRV_XOFF, __QUEUE_STATE_STACK_XOFF, __QUEUE_STATE_FROZEN, }; #define QUEUE_STATE_DRV_XOFF (1 << __QUEUE_STATE_DRV_XOFF) #define QUEUE_STATE_STACK_XOFF (1 << __QUEUE_STATE_STACK_XOFF) #define QUEUE_STATE_FROZEN (1 << __QUEUE_STATE_FROZEN) #define QUEUE_STATE_ANY_XOFF (QUEUE_STATE_DRV_XOFF | QUEUE_STATE_STACK_XOFF) #define QUEUE_STATE_ANY_XOFF_OR_FROZEN (QUEUE_STATE_ANY_XOFF | \ QUEUE_STATE_FROZEN) #define QUEUE_STATE_DRV_XOFF_OR_FROZEN (QUEUE_STATE_DRV_XOFF | \ QUEUE_STATE_FROZEN) /* * __QUEUE_STATE_DRV_XOFF is used by drivers to stop the transmit queue. The * netif_tx_* functions below are used to manipulate this flag. The * __QUEUE_STATE_STACK_XOFF flag is used by the stack to stop the transmit * queue independently. The netif_xmit_*stopped functions below are called * to check if the queue has been stopped by the driver or stack (either * of the XOFF bits are set in the state). Drivers should not need to call * netif_xmit*stopped functions, they should only be using netif_tx_*. 
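 *
 * The canonical driver pattern (sketch; the ring-accounting helpers are
 * hypothetical) stops the queue from ndo_start_xmit() when the ring is about
 * to fill up, and wakes it again from the TX completion path:
 *
 *	if (foo_tx_ring_free(ring) < MAX_SKB_FRAGS + 1)
 *		netif_tx_stop_queue(txq);
 *	...
 *	if (netif_tx_queue_stopped(txq) &&
 *	    foo_tx_ring_free(ring) > FOO_TX_WAKE_THRESH)
 *		netif_tx_wake_queue(txq);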
*/ struct netdev_queue { /* * read-mostly part */ struct net_device *dev; netdevice_tracker dev_tracker; struct Qdisc __rcu *qdisc; struct Qdisc __rcu *qdisc_sleeping; #ifdef CONFIG_SYSFS struct kobject kobj; const struct attribute_group **groups; #endif unsigned long tx_maxrate; /* * Number of TX timeouts for this queue * (/sys/class/net/DEV/Q/trans_timeout) */ atomic_long_t trans_timeout; /* Subordinate device that the queue has been assigned to */ struct net_device *sb_dev; #ifdef CONFIG_XDP_SOCKETS /* "ops protected", see comment about net_device::lock */ struct xsk_buff_pool *pool; #endif /* * write-mostly part */ #ifdef CONFIG_BQL struct dql dql; #endif spinlock_t _xmit_lock ____cacheline_aligned_in_smp; int xmit_lock_owner; /* * Time (in jiffies) of last Tx */ unsigned long trans_start; unsigned long state; /* * slow- / control-path part */ /* NAPI instance for the queue * "ops protected", see comment about net_device::lock */ struct napi_struct *napi; #if defined(CONFIG_XPS) && defined(CONFIG_NUMA) int numa_node; #endif } ____cacheline_aligned_in_smp; extern int sysctl_fb_tunnels_only_for_init_net; extern int sysctl_devconf_inherit_init_net; /* * sysctl_fb_tunnels_only_for_init_net == 0 : For all netns * == 1 : For initns only * == 2 : For none. */ static inline bool net_has_fallback_tunnels(const struct net *net) { #if IS_ENABLED(CONFIG_SYSCTL) int fb_tunnels_only_for_init_net = READ_ONCE(sysctl_fb_tunnels_only_for_init_net); return !fb_tunnels_only_for_init_net || (net_eq(net, &init_net) && fb_tunnels_only_for_init_net == 1); #else return true; #endif } static inline int net_inherit_devconf(void) { #if IS_ENABLED(CONFIG_SYSCTL) return READ_ONCE(sysctl_devconf_inherit_init_net); #else return 0; #endif } static inline int netdev_queue_numa_node_read(const struct netdev_queue *q) { #if defined(CONFIG_XPS) && defined(CONFIG_NUMA) return q->numa_node; #else return NUMA_NO_NODE; #endif } static inline void netdev_queue_numa_node_write(struct netdev_queue *q, int node) { #if defined(CONFIG_XPS) && defined(CONFIG_NUMA) q->numa_node = node; #endif } #ifdef CONFIG_RFS_ACCEL bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id); #endif /* XPS map type and offset of the xps map within net_device->xps_maps[]. */ enum xps_map_type { XPS_CPUS = 0, XPS_RXQS, XPS_MAPS_MAX, }; #ifdef CONFIG_XPS /* * This structure holds an XPS map which can be of variable length. The * map is an array of queues. */ struct xps_map { unsigned int len; unsigned int alloc_len; struct rcu_head rcu; u16 queues[]; }; #define XPS_MAP_SIZE(_num) (sizeof(struct xps_map) + ((_num) * sizeof(u16))) #define XPS_MIN_MAP_ALLOC ((L1_CACHE_ALIGN(offsetof(struct xps_map, queues[1])) \ - sizeof(struct xps_map)) / sizeof(u16)) /* * This structure holds all XPS maps for device. Maps are indexed by CPU. * * We keep track of the number of cpus/rxqs used when the struct is allocated, * in nr_ids. This will help not accessing out-of-bound memory. * * We keep track of the number of traffic classes used when the struct is * allocated, in num_tc. This will be used to navigate the maps, to ensure we're * not crossing its upper bound, as the original dev->num_tc can be updated in * the meantime. 
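 *
 * For a CPU (or RXQ) index j and traffic class tc, both already checked
 * against nr_ids and num_tc, the per-queue map is then found roughly as
 * follows (sketch of the lookup, not the exact code):
 *
 *	map = rcu_dereference(dev_maps->attr_map[j * dev_maps->num_tc + tc]);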
*/ struct xps_dev_maps { struct rcu_head rcu; unsigned int nr_ids; s16 num_tc; struct xps_map __rcu *attr_map[]; /* Either CPUs map or RXQs map */ }; #define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \ (nr_cpu_ids * (_tcs) * sizeof(struct xps_map *))) #define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\ (_rxqs * (_tcs) * sizeof(struct xps_map *))) #endif /* CONFIG_XPS */ #define TC_MAX_QUEUE 16 #define TC_BITMASK 15 /* HW offloaded queuing disciplines txq count and offset maps */ struct netdev_tc_txq { u16 count; u16 offset; }; #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE) /* * This structure is to hold information about the device * configured to run FCoE protocol stack. */ struct netdev_fcoe_hbainfo { char manufacturer[64]; char serial_number[64]; char hardware_version[64]; char driver_version[64]; char optionrom_version[64]; char firmware_version[64]; char model[256]; char model_description[256]; }; #endif #define MAX_PHYS_ITEM_ID_LEN 32 /* This structure holds a unique identifier to identify some * physical item (port for example) used by a netdevice. */ struct netdev_phys_item_id { unsigned char id[MAX_PHYS_ITEM_ID_LEN]; unsigned char id_len; }; static inline bool netdev_phys_item_id_same(struct netdev_phys_item_id *a, struct netdev_phys_item_id *b) { return a->id_len == b->id_len && memcmp(a->id, b->id, a->id_len) == 0; } typedef u16 (*select_queue_fallback_t)(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); enum net_device_path_type { DEV_PATH_ETHERNET = 0, DEV_PATH_VLAN, DEV_PATH_BRIDGE, DEV_PATH_PPPOE, DEV_PATH_DSA, DEV_PATH_MTK_WDMA, DEV_PATH_TUN, }; struct net_device_path { enum net_device_path_type type; const struct net_device *dev; union { struct { u16 id; __be16 proto; u8 h_dest[ETH_ALEN]; } encap; struct { union { struct in_addr src_v4; struct in6_addr src_v6; }; union { struct in_addr dst_v4; struct in6_addr dst_v6; }; u8 l3_proto; } tun; struct { enum { DEV_PATH_BR_VLAN_KEEP, DEV_PATH_BR_VLAN_TAG, DEV_PATH_BR_VLAN_UNTAG, DEV_PATH_BR_VLAN_UNTAG_HW, } vlan_mode; u16 vlan_id; __be16 vlan_proto; } bridge; struct { int port; u16 proto; } dsa; struct { u8 wdma_idx; u8 queue; u16 wcid; u8 bss; u8 amsdu; } mtk_wdma; }; }; #define NET_DEVICE_PATH_STACK_MAX 5 #define NET_DEVICE_PATH_VLAN_MAX 2 struct net_device_path_stack { int num_paths; struct net_device_path path[NET_DEVICE_PATH_STACK_MAX]; }; struct net_device_path_ctx { const struct net_device *dev; u8 daddr[ETH_ALEN]; int num_vlans; struct { u16 id; __be16 proto; } vlan[NET_DEVICE_PATH_VLAN_MAX]; }; enum tc_setup_type { TC_QUERY_CAPS, TC_SETUP_QDISC_MQPRIO, TC_SETUP_CLSU32, TC_SETUP_CLSFLOWER, TC_SETUP_CLSMATCHALL, TC_SETUP_CLSBPF, TC_SETUP_BLOCK, TC_SETUP_QDISC_CBS, TC_SETUP_QDISC_RED, TC_SETUP_QDISC_PRIO, TC_SETUP_QDISC_MQ, TC_SETUP_QDISC_ETF, TC_SETUP_ROOT_QDISC, TC_SETUP_QDISC_GRED, TC_SETUP_QDISC_TAPRIO, TC_SETUP_FT, TC_SETUP_QDISC_ETS, TC_SETUP_QDISC_TBF, TC_SETUP_QDISC_FIFO, TC_SETUP_QDISC_HTB, TC_SETUP_ACT, }; /* These structures hold the attributes of bpf state that are being passed * to the netdevice through the bpf op. */ enum bpf_netdev_command { /* Set or clear a bpf program used in the earliest stages of packet * rx. The prog will have been loaded as BPF_PROG_TYPE_XDP. The callee * is responsible for calling bpf_prog_put on any old progs that are * stored. In case of error, the callee need not release the new prog * reference, but on success it takes ownership and must bpf_prog_put * when it is no longer used. 
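 *
 * A driver's ndo_bpf() typically just switches on the command.  Sketch only;
 * foo_xdp_setup() is a hypothetical helper that installs the new program and
 * releases the old one:
 *
 *	static int foo_bpf(struct net_device *dev, struct netdev_bpf *bpf)
 *	{
 *		switch (bpf->command) {
 *		case XDP_SETUP_PROG:
 *			return foo_xdp_setup(dev, bpf->prog, bpf->extack);
 *		default:
 *			return -EINVAL;
 *		}
 *	}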
*/ XDP_SETUP_PROG, XDP_SETUP_PROG_HW, /* BPF program for offload callbacks, invoked at program load time. */ BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE, XDP_SETUP_XSK_POOL, }; struct bpf_prog_offload_ops; struct netlink_ext_ack; struct xdp_umem; struct xdp_dev_bulk_queue; struct bpf_xdp_link; enum bpf_xdp_mode { XDP_MODE_SKB = 0, XDP_MODE_DRV = 1, XDP_MODE_HW = 2, __MAX_XDP_MODE }; struct bpf_xdp_entity { struct bpf_prog *prog; struct bpf_xdp_link *link; }; struct netdev_bpf { enum bpf_netdev_command command; union { /* XDP_SETUP_PROG */ struct { u32 flags; struct bpf_prog *prog; struct netlink_ext_ack *extack; }; /* BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE */ struct { struct bpf_offloaded_map *offmap; }; /* XDP_SETUP_XSK_POOL */ struct { struct xsk_buff_pool *pool; u16 queue_id; } xsk; }; }; /* Flags for ndo_xsk_wakeup. */ #define XDP_WAKEUP_RX (1 << 0) #define XDP_WAKEUP_TX (1 << 1) #ifdef CONFIG_XFRM_OFFLOAD struct xfrmdev_ops { int (*xdo_dev_state_add)(struct net_device *dev, struct xfrm_state *x, struct netlink_ext_ack *extack); void (*xdo_dev_state_delete)(struct net_device *dev, struct xfrm_state *x); void (*xdo_dev_state_free)(struct net_device *dev, struct xfrm_state *x); bool (*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); void (*xdo_dev_state_advance_esn) (struct xfrm_state *x); void (*xdo_dev_state_update_stats) (struct xfrm_state *x); int (*xdo_dev_policy_add) (struct xfrm_policy *x, struct netlink_ext_ack *extack); void (*xdo_dev_policy_delete) (struct xfrm_policy *x); void (*xdo_dev_policy_free) (struct xfrm_policy *x); }; #endif struct dev_ifalias { struct rcu_head rcuhead; char ifalias[]; }; struct devlink; struct tlsdev_ops; struct netdev_net_notifier { struct list_head list; struct notifier_block *nb; }; /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are * optional and can be filled with a null pointer. * * int (*ndo_init)(struct net_device *dev); * This function is called once when a network device is registered. * The network device can use this for any late stage initialization * or semantic validation. It can fail with an error code which will * be propagated back to register_netdev. * * void (*ndo_uninit)(struct net_device *dev); * This function is called when device is unregistered or when registration * fails. It is not called if init fails. * * int (*ndo_open)(struct net_device *dev); * This function is called when a network device transitions to the up * state. * * int (*ndo_stop)(struct net_device *dev); * This function is called when a network device transitions to the down * state. * * netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb, * struct net_device *dev); * Called when a packet needs to be transmitted. * Returns NETDEV_TX_OK. Can return NETDEV_TX_BUSY, but you should stop * the queue before that can happen; it's for obsolete devices and weird * corner cases, but the stack really does a non-trivial amount * of useless work if you return NETDEV_TX_BUSY. * Required; cannot be NULL. * * netdev_features_t (*ndo_features_check)(struct sk_buff *skb, * struct net_device *dev * netdev_features_t features); * Called by core transmit path to determine if device is capable of * performing offload operations on a given packet. This is to give * the device an opportunity to implement any restrictions that cannot * be otherwise expressed by feature flags. 
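 *	For instance, a device that cannot offload checksums for encapsulated
 *	traffic might do (sketch):
 *
 *		if (skb->encapsulation)
 *			features &= ~NETIF_F_CSUM_MASK;
 *		return features;
 *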
The check is called with * the set of features that the stack has calculated and it returns * those the driver believes to be appropriate. * * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, * struct net_device *sb_dev); * Called to decide which queue to use when device supports multiple * transmit queues. * * void (*ndo_change_rx_flags)(struct net_device *dev, int flags); * This function is called to allow device receiver to make * changes to configuration when multicast or promiscuous is enabled. * * void (*ndo_set_rx_mode)(struct net_device *dev); * This function is called device changes address list filtering. * If driver handles unicast address filtering, it should set * IFF_UNICAST_FLT in its priv_flags. * * int (*ndo_set_mac_address)(struct net_device *dev, void *addr); * This function is called when the Media Access Control address * needs to be changed. If this interface is not defined, the * MAC address can not be changed. * * int (*ndo_validate_addr)(struct net_device *dev); * Test if Media Access Control address is valid for the device. * * int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); * Old-style ioctl entry point. This is used internally by the * ieee802154 subsystem but is no longer called by the device * ioctl handler. * * int (*ndo_siocbond)(struct net_device *dev, struct ifreq *ifr, int cmd); * Used by the bonding driver for its device specific ioctls: * SIOCBONDENSLAVE, SIOCBONDRELEASE, SIOCBONDSETHWADDR, SIOCBONDCHANGEACTIVE, * SIOCBONDSLAVEINFOQUERY, and SIOCBONDINFOQUERY * * * int (*ndo_eth_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); * Called for ethernet specific ioctls: SIOCGMIIPHY, SIOCGMIIREG, * SIOCSMIIREG, SIOCSHWTSTAMP and SIOCGHWTSTAMP. * * int (*ndo_set_config)(struct net_device *dev, struct ifmap *map); * Used to set network devices bus interface parameters. This interface * is retained for legacy reasons; new devices should use the bus * interface (PCI) for low level management. * * int (*ndo_change_mtu)(struct net_device *dev, int new_mtu); * Called when a user wants to change the Maximum Transfer Unit * of a device. * * void (*ndo_tx_timeout)(struct net_device *dev, unsigned int txqueue); * Callback used when the transmitter has not made any progress * for dev->watchdog ticks. * * void (*ndo_get_stats64)(struct net_device *dev, * struct rtnl_link_stats64 *storage); * struct net_device_stats* (*ndo_get_stats)(struct net_device *dev); * Called when a user wants to get the network device usage * statistics. Drivers must do one of the following: * 1. Define @ndo_get_stats64 to fill in a zero-initialised * rtnl_link_stats64 structure passed by the caller. * 2. Define @ndo_get_stats to update a net_device_stats structure * (which should normally be dev->stats) and return a pointer to * it. The structure may be changed asynchronously only if each * field is written atomically. * 3. Update dev->stats asynchronously and atomically, and define * neither operation. * * bool (*ndo_has_offload_stats)(const struct net_device *dev, int attr_id) * Return true if this device supports offload stats of this attr_id. * * int (*ndo_get_offload_stats)(int attr_id, const struct net_device *dev, * void *attr_data) * Get statistics for offload operations by attr_id. Write it into the * attr_data pointer. * * int (*ndo_vlan_rx_add_vid)(struct net_device *dev, __be16 proto, u16 vid); * If device supports VLAN filtering this function is called when a * VLAN id is registered. 
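 *	A minimal implementation simply mirrors the filter into hardware
 *	(sketch; foo_hw_vlan_filter() is a hypothetical driver helper):
 *
 *		static int foo_vlan_rx_add_vid(struct net_device *dev,
 *					       __be16 proto, u16 vid)
 *		{
 *			return foo_hw_vlan_filter(netdev_priv(dev), proto, vid, true);
 *		}
 *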
* * int (*ndo_vlan_rx_kill_vid)(struct net_device *dev, __be16 proto, u16 vid); * If device supports VLAN filtering this function is called when a * VLAN id is unregistered. * * void (*ndo_poll_controller)(struct net_device *dev); * * SR-IOV management functions. * int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac); * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan, * u8 qos, __be16 proto); * int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate, * int max_tx_rate); * int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting); * int (*ndo_set_vf_trust)(struct net_device *dev, int vf, bool setting); * int (*ndo_get_vf_config)(struct net_device *dev, * int vf, struct ifla_vf_info *ivf); * int (*ndo_set_vf_link_state)(struct net_device *dev, int vf, int link_state); * int (*ndo_set_vf_port)(struct net_device *dev, int vf, * struct nlattr *port[]); * * Enable or disable the VF ability to query its RSS Redirection Table and * Hash Key. This is needed since on some devices VF share this information * with PF and querying it may introduce a theoretical security risk. * int (*ndo_set_vf_rss_query_en)(struct net_device *dev, int vf, bool setting); * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb); * int (*ndo_setup_tc)(struct net_device *dev, enum tc_setup_type type, * void *type_data); * Called to setup any 'tc' scheduler, classifier or action on @dev. * This is always called from the stack with the rtnl lock held and netif * tx queues stopped. This allows the netdevice to perform queue * management safely. * * Fiber Channel over Ethernet (FCoE) offload functions. * int (*ndo_fcoe_enable)(struct net_device *dev); * Called when the FCoE protocol stack wants to start using LLD for FCoE * so the underlying device can perform whatever needed configuration or * initialization to support acceleration of FCoE traffic. * * int (*ndo_fcoe_disable)(struct net_device *dev); * Called when the FCoE protocol stack wants to stop using LLD for FCoE * so the underlying device can perform whatever needed clean-ups to * stop supporting acceleration of FCoE traffic. * * int (*ndo_fcoe_ddp_setup)(struct net_device *dev, u16 xid, * struct scatterlist *sgl, unsigned int sgc); * Called when the FCoE Initiator wants to initialize an I/O that * is a possible candidate for Direct Data Placement (DDP). The LLD can * perform necessary setup and returns 1 to indicate the device is set up * successfully to perform DDP on this I/O, otherwise this returns 0. * * int (*ndo_fcoe_ddp_done)(struct net_device *dev, u16 xid); * Called when the FCoE Initiator/Target is done with the DDPed I/O as * indicated by the FC exchange id 'xid', so the underlying device can * clean up and reuse resources for later DDP requests. * * int (*ndo_fcoe_ddp_target)(struct net_device *dev, u16 xid, * struct scatterlist *sgl, unsigned int sgc); * Called when the FCoE Target wants to initialize an I/O that * is a possible candidate for Direct Data Placement (DDP). The LLD can * perform necessary setup and returns 1 to indicate the device is set up * successfully to perform DDP on this I/O, otherwise this returns 0. * * int (*ndo_fcoe_get_hbainfo)(struct net_device *dev, * struct netdev_fcoe_hbainfo *hbainfo); * Called when the FCoE Protocol stack wants information on the underlying * device. 
This information is utilized by the FCoE protocol stack to * register attributes with Fiber Channel management service as per the * FC-GS Fabric Device Management Information(FDMI) specification. * * int (*ndo_fcoe_get_wwn)(struct net_device *dev, u64 *wwn, int type); * Called when the underlying device wants to override default World Wide * Name (WWN) generation mechanism in FCoE protocol stack to pass its own * World Wide Port Name (WWPN) or World Wide Node Name (WWNN) to the FCoE * protocol stack to use. * * RFS acceleration. * int (*ndo_rx_flow_steer)(struct net_device *dev, const struct sk_buff *skb, * u16 rxq_index, u32 flow_id); * Set hardware filter for RFS. rxq_index is the target queue index; * flow_id is a flow ID to be passed to rps_may_expire_flow() later. * Return the filter ID on success, or a negative error code. * * Slave management functions (for bridge, bonding, etc). * int (*ndo_add_slave)(struct net_device *dev, struct net_device *slave_dev); * Called to make another netdev an underling. * * int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); * Called to release previously enslaved netdev. * * struct net_device *(*ndo_get_xmit_slave)(struct net_device *dev, * struct sk_buff *skb, * bool all_slaves); * Get the xmit slave of master device. If all_slaves is true, function * assume all the slaves can transmit. * * Feature/offload setting functions. * netdev_features_t (*ndo_fix_features)(struct net_device *dev, * netdev_features_t features); * Adjusts the requested feature flags according to device-specific * constraints, and returns the resulting flags. Must not modify * the device state. * * int (*ndo_set_features)(struct net_device *dev, netdev_features_t features); * Called to update device configuration to new features. Passed * feature set might be less than what was returned by ndo_fix_features()). * Must return >0 or -errno if it changed dev->features itself. * * int (*ndo_fdb_add)(struct ndmsg *ndm, struct nlattr *tb[], * struct net_device *dev, * const unsigned char *addr, u16 vid, u16 flags, * bool *notified, struct netlink_ext_ack *extack); * Adds an FDB entry to dev for addr. * Callee shall set *notified to true if it sent any appropriate * notification(s). Otherwise core will send a generic one. * int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[], * struct net_device *dev, * const unsigned char *addr, u16 vid * bool *notified, struct netlink_ext_ack *extack); * Deletes the FDB entry from dev corresponding to addr. * Callee shall set *notified to true if it sent any appropriate * notification(s). Otherwise core will send a generic one. * int (*ndo_fdb_del_bulk)(struct nlmsghdr *nlh, struct net_device *dev, * struct netlink_ext_ack *extack); * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb, * struct net_device *dev, struct net_device *filter_dev, * int *idx) * Used to add FDB entries to dump requests. Implementers should add * entries to skb and update idx with the number of entries. * * int (*ndo_mdb_add)(struct net_device *dev, struct nlattr *tb[], * u16 nlmsg_flags, struct netlink_ext_ack *extack); * Adds an MDB entry to dev. * int (*ndo_mdb_del)(struct net_device *dev, struct nlattr *tb[], * struct netlink_ext_ack *extack); * Deletes the MDB entry from dev. * int (*ndo_mdb_del_bulk)(struct net_device *dev, struct nlattr *tb[], * struct netlink_ext_ack *extack); * Bulk deletes MDB entries from dev. 
* int (*ndo_mdb_dump)(struct net_device *dev, struct sk_buff *skb, * struct netlink_callback *cb); * Dumps MDB entries from dev. The first argument (marker) in the netlink * callback is used by core rtnetlink code. * * int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh, * u16 flags, struct netlink_ext_ack *extack) * int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq, * struct net_device *dev, u32 filter_mask, * int nlflags) * int (*ndo_bridge_dellink)(struct net_device *dev, struct nlmsghdr *nlh, * u16 flags); * * int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier); * Called to change device carrier. Soft-devices (like dummy, team, etc) * which do not represent real hardware may define this to allow their * userspace components to manage their virtual carrier state. Devices * that determine carrier state from physical hardware properties (eg * network cables) or protocol-dependent mechanisms (eg * USB_CDC_NOTIFY_NETWORK_CONNECTION) should NOT implement this function. * * int (*ndo_get_phys_port_id)(struct net_device *dev, * struct netdev_phys_item_id *ppid); * Called to get ID of physical port of this device. If driver does * not implement this, it is assumed that the hw is not able to have * multiple net devices on single physical port. * * int (*ndo_get_port_parent_id)(struct net_device *dev, * struct netdev_phys_item_id *ppid) * Called to get the parent ID of the physical port of this device. * * void* (*ndo_dfwd_add_station)(struct net_device *pdev, * struct net_device *dev) * Called by upper layer devices to accelerate switching or other * station functionality into hardware. 'pdev is the lowerdev * to use for the offload and 'dev' is the net device that will * back the offload. Returns a pointer to the private structure * the upper layer will maintain. * void (*ndo_dfwd_del_station)(struct net_device *pdev, void *priv) * Called by upper layer device to delete the station created * by 'ndo_dfwd_add_station'. 'pdev' is the net device backing * the station and priv is the structure returned by the add * operation. * int (*ndo_set_tx_maxrate)(struct net_device *dev, * int queue_index, u32 maxrate); * Called when a user wants to set a max-rate limitation of specific * TX queue. * int (*ndo_get_iflink)(const struct net_device *dev); * Called to get the iflink value of this device. * int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb); * This function is used to get egress tunnel information for given skb. * This is useful for retrieving outer tunnel header parameters while * sampling packet. * void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom); * This function is used to specify the headroom that the skb must * consider when allocation skb during packet reception. Setting * appropriate rx headroom value allows avoiding skb head copy on * forward. Setting a negative value resets the rx headroom to the * default value. * int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); * This function is used to set or query state related to XDP on the * netdevice and manage BPF offload. See definition of * enum bpf_netdev_command for details. * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp, * u32 flags); * This function is used to submit @n XDP packets for transmit on a * netdevice. Returns number of frames successfully transmitted, frames * that got dropped are freed/returned via xdp_return_frame(). 
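 *	On the driver side this usually enqueues as many frames as fit and
 *	reports how far it got (sketch; foo_xmit_frame() and foo_kick() are
 *	hypothetical helpers):
 *
 *		for (i = 0; i < n; i++)
 *			if (foo_xmit_frame(txq, xdp[i]))
 *				break;
 *		if (flags & XDP_XMIT_FLUSH)
 *			foo_kick(txq);
 *		return i;
 *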
* Returns negative number, means general error invoking ndo, meaning * no frames were xmit'ed and core-caller will free all frames. * struct net_device *(*ndo_xdp_get_xmit_slave)(struct net_device *dev, * struct xdp_buff *xdp); * Get the xmit slave of master device based on the xdp_buff. * int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags); * This function is used to wake up the softirq, ksoftirqd or kthread * responsible for sending and/or receiving packets on a specific * queue id bound to an AF_XDP socket. The flags field specifies if * only RX, only Tx, or both should be woken up using the flags * XDP_WAKEUP_RX and XDP_WAKEUP_TX. * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm_kern *p, * int cmd); * Add, change, delete or get information on an IPv4 tunnel. * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev); * If a device is paired with a peer device, return the peer instance. * The caller must be under RCU read context. * int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path); * Get the forwarding path to reach the real device from the HW destination address * ktime_t (*ndo_get_tstamp)(struct net_device *dev, * const struct skb_shared_hwtstamps *hwtstamps, * bool cycles); * Get hardware timestamp based on normal/adjustable time or free running * cycle counter. This function is required if physical clock supports a * free running cycle counter. * * int (*ndo_hwtstamp_get)(struct net_device *dev, * struct kernel_hwtstamp_config *kernel_config); * Get the currently configured hardware timestamping parameters for the * NIC device. * * int (*ndo_hwtstamp_set)(struct net_device *dev, * struct kernel_hwtstamp_config *kernel_config, * struct netlink_ext_ack *extack); * Change the hardware timestamping parameters for NIC device. 
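 *
 * Tying a few of the mandatory and common hooks together, a simple Ethernet
 * driver might declare something like the following (sketch; the foo_*
 * handlers are the driver's own functions, eth_mac_addr() and
 * eth_validate_addr() are the stock helpers):
 *
 *	static const struct net_device_ops foo_netdev_ops = {
 *		.ndo_open		= foo_open,
 *		.ndo_stop		= foo_stop,
 *		.ndo_start_xmit		= foo_start_xmit,
 *		.ndo_get_stats64	= foo_get_stats64,
 *		.ndo_set_mac_address	= eth_mac_addr,
 *		.ndo_validate_addr	= eth_validate_addr,
 *		.ndo_change_mtu		= foo_change_mtu,
 *	};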
*/ struct net_device_ops { int (*ndo_init)(struct net_device *dev); void (*ndo_uninit)(struct net_device *dev); int (*ndo_open)(struct net_device *dev); int (*ndo_stop)(struct net_device *dev); netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb, struct net_device *dev); netdev_features_t (*ndo_features_check)(struct sk_buff *skb, struct net_device *dev, netdev_features_t features); u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); void (*ndo_change_rx_flags)(struct net_device *dev, int flags); void (*ndo_set_rx_mode)(struct net_device *dev); int (*ndo_set_mac_address)(struct net_device *dev, void *addr); int (*ndo_validate_addr)(struct net_device *dev); int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); int (*ndo_eth_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); int (*ndo_siocbond)(struct net_device *dev, struct ifreq *ifr, int cmd); int (*ndo_siocwandev)(struct net_device *dev, struct if_settings *ifs); int (*ndo_siocdevprivate)(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd); int (*ndo_set_config)(struct net_device *dev, struct ifmap *map); int (*ndo_change_mtu)(struct net_device *dev, int new_mtu); int (*ndo_neigh_setup)(struct net_device *dev, struct neigh_parms *); void (*ndo_tx_timeout) (struct net_device *dev, unsigned int txqueue); void (*ndo_get_stats64)(struct net_device *dev, struct rtnl_link_stats64 *storage); bool (*ndo_has_offload_stats)(const struct net_device *dev, int attr_id); int (*ndo_get_offload_stats)(int attr_id, const struct net_device *dev, void *attr_data); struct net_device_stats* (*ndo_get_stats)(struct net_device *dev); int (*ndo_vlan_rx_add_vid)(struct net_device *dev, __be16 proto, u16 vid); int (*ndo_vlan_rx_kill_vid)(struct net_device *dev, __be16 proto, u16 vid); #ifdef CONFIG_NET_POLL_CONTROLLER void (*ndo_poll_controller)(struct net_device *dev); int (*ndo_netpoll_setup)(struct net_device *dev); void (*ndo_netpoll_cleanup)(struct net_device *dev); #endif int (*ndo_set_vf_mac)(struct net_device *dev, int queue, u8 *mac); int (*ndo_set_vf_vlan)(struct net_device *dev, int queue, u16 vlan, u8 qos, __be16 proto); int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate, int max_tx_rate); int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting); int (*ndo_set_vf_trust)(struct net_device *dev, int vf, bool setting); int (*ndo_get_vf_config)(struct net_device *dev, int vf, struct ifla_vf_info *ivf); int (*ndo_set_vf_link_state)(struct net_device *dev, int vf, int link_state); int (*ndo_get_vf_stats)(struct net_device *dev, int vf, struct ifla_vf_stats *vf_stats); int (*ndo_set_vf_port)(struct net_device *dev, int vf, struct nlattr *port[]); int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb); int (*ndo_get_vf_guid)(struct net_device *dev, int vf, struct ifla_vf_guid *node_guid, struct ifla_vf_guid *port_guid); int (*ndo_set_vf_guid)(struct net_device *dev, int vf, u64 guid, int guid_type); int (*ndo_set_vf_rss_query_en)( struct net_device *dev, int vf, bool setting); int (*ndo_setup_tc)(struct net_device *dev, enum tc_setup_type type, void *type_data); #if IS_ENABLED(CONFIG_FCOE) int (*ndo_fcoe_enable)(struct net_device *dev); int (*ndo_fcoe_disable)(struct net_device *dev); int (*ndo_fcoe_ddp_setup)(struct net_device *dev, u16 xid, struct scatterlist *sgl, unsigned int sgc); int (*ndo_fcoe_ddp_done)(struct net_device *dev, u16 xid); int (*ndo_fcoe_ddp_target)(struct net_device *dev, u16 xid, struct 
scatterlist *sgl, unsigned int sgc); int (*ndo_fcoe_get_hbainfo)(struct net_device *dev, struct netdev_fcoe_hbainfo *hbainfo); #endif #if IS_ENABLED(CONFIG_LIBFCOE) #define NETDEV_FCOE_WWNN 0 #define NETDEV_FCOE_WWPN 1 int (*ndo_fcoe_get_wwn)(struct net_device *dev, u64 *wwn, int type); #endif #ifdef CONFIG_RFS_ACCEL int (*ndo_rx_flow_steer)(struct net_device *dev, const struct sk_buff *skb, u16 rxq_index, u32 flow_id); #endif int (*ndo_add_slave)(struct net_device *dev, struct net_device *slave_dev, struct netlink_ext_ack *extack); int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); struct net_device* (*ndo_get_xmit_slave)(struct net_device *dev, struct sk_buff *skb, bool all_slaves); struct net_device* (*ndo_sk_get_lower_dev)(struct net_device *dev, struct sock *sk); netdev_features_t (*ndo_fix_features)(struct net_device *dev, netdev_features_t features); int (*ndo_set_features)(struct net_device *dev, netdev_features_t features); int (*ndo_neigh_construct)(struct net_device *dev, struct neighbour *n); void (*ndo_neigh_destroy)(struct net_device *dev, struct neighbour *n); int (*ndo_fdb_add)(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags, bool *notified, struct netlink_ext_ack *extack); int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, bool *notified, struct netlink_ext_ack *extack); int (*ndo_fdb_del_bulk)(struct nlmsghdr *nlh, struct net_device *dev, struct netlink_ext_ack *extack); int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx); int (*ndo_fdb_get)(struct sk_buff *skb, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u32 portid, u32 seq, struct netlink_ext_ack *extack); int (*ndo_mdb_add)(struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags, struct netlink_ext_ack *extack); int (*ndo_mdb_del)(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack); int (*ndo_mdb_del_bulk)(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack); int (*ndo_mdb_dump)(struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb); int (*ndo_mdb_get)(struct net_device *dev, struct nlattr *tb[], u32 portid, u32 seq, struct netlink_ext_ack *extack); int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh, u16 flags, struct netlink_ext_ack *extack); int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u32 filter_mask, int nlflags); int (*ndo_bridge_dellink)(struct net_device *dev, struct nlmsghdr *nlh, u16 flags); int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier); int (*ndo_get_phys_port_id)(struct net_device *dev, struct netdev_phys_item_id *ppid); int (*ndo_get_port_parent_id)(struct net_device *dev, struct netdev_phys_item_id *ppid); int (*ndo_get_phys_port_name)(struct net_device *dev, char *name, size_t len); void* (*ndo_dfwd_add_station)(struct net_device *pdev, struct net_device *dev); void (*ndo_dfwd_del_station)(struct net_device *pdev, void *priv); int (*ndo_set_tx_maxrate)(struct net_device *dev, int queue_index, u32 maxrate); int (*ndo_get_iflink)(const struct net_device *dev); int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb); void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom); int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); int 
(*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp, u32 flags); struct net_device * (*ndo_xdp_get_xmit_slave)(struct net_device *dev, struct xdp_buff *xdp); int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags); int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd); struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path); ktime_t (*ndo_get_tstamp)(struct net_device *dev, const struct skb_shared_hwtstamps *hwtstamps, bool cycles); int (*ndo_hwtstamp_get)(struct net_device *dev, struct kernel_hwtstamp_config *kernel_config); int (*ndo_hwtstamp_set)(struct net_device *dev, struct kernel_hwtstamp_config *kernel_config, struct netlink_ext_ack *extack); #if IS_ENABLED(CONFIG_NET_SHAPER) /** * @net_shaper_ops: Device shaping offload operations * see include/net/net_shapers.h */ const struct net_shaper_ops *net_shaper_ops; #endif }; /** * enum netdev_priv_flags - &struct net_device priv_flags * * These are the &struct net_device, they are only set internally * by drivers and used in the kernel. These flags are invisible to * userspace; this means that the order of these flags can change * during any kernel release. * * You should add bitfield booleans after either net_device::priv_flags * (hotpath) or ::threaded (slowpath) instead of extending these flags. * * @IFF_802_1Q_VLAN: 802.1Q VLAN device * @IFF_EBRIDGE: Ethernet bridging device * @IFF_BONDING: bonding master or slave * @IFF_ISATAP: ISATAP interface (RFC4214) * @IFF_WAN_HDLC: WAN HDLC device * @IFF_XMIT_DST_RELEASE: dev_hard_start_xmit() is allowed to * release skb->dst * @IFF_DONT_BRIDGE: disallow bridging this ether dev * @IFF_DISABLE_NETPOLL: disable netpoll at run-time * @IFF_MACVLAN_PORT: device used as macvlan port * @IFF_BRIDGE_PORT: device used as bridge port * @IFF_OVS_DATAPATH: device used as Open vSwitch datapath port * @IFF_TX_SKB_SHARING: The interface supports sharing skbs on transmit * @IFF_UNICAST_FLT: Supports unicast filtering * @IFF_TEAM_PORT: device used as team port * @IFF_SUPP_NOFCS: device supports sending custom FCS * @IFF_LIVE_ADDR_CHANGE: device supports hardware address * change when it's running * @IFF_MACVLAN: Macvlan device * @IFF_XMIT_DST_RELEASE_PERM: IFF_XMIT_DST_RELEASE not taking into account * underlying stacked devices * @IFF_L3MDEV_MASTER: device is an L3 master device * @IFF_NO_QUEUE: device can run without qdisc attached * @IFF_OPENVSWITCH: device is a Open vSwitch master * @IFF_L3MDEV_SLAVE: device is enslaved to an L3 master device * @IFF_TEAM: device is a team device * @IFF_PHONY_HEADROOM: the headroom value is controlled by an external * entity (i.e. 
the master device for bridged veth) * @IFF_MACSEC: device is a MACsec device * @IFF_NO_RX_HANDLER: device doesn't support the rx_handler hook * @IFF_FAILOVER: device is a failover master device * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device * @IFF_NO_ADDRCONF: prevent ipv6 addrconf * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with * skb_headlen(skb) == 0 (data starts from frag0) */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, IFF_EBRIDGE = 1<<1, IFF_BONDING = 1<<2, IFF_ISATAP = 1<<3, IFF_WAN_HDLC = 1<<4, IFF_XMIT_DST_RELEASE = 1<<5, IFF_DONT_BRIDGE = 1<<6, IFF_DISABLE_NETPOLL = 1<<7, IFF_MACVLAN_PORT = 1<<8, IFF_BRIDGE_PORT = 1<<9, IFF_OVS_DATAPATH = 1<<10, IFF_TX_SKB_SHARING = 1<<11, IFF_UNICAST_FLT = 1<<12, IFF_TEAM_PORT = 1<<13, IFF_SUPP_NOFCS = 1<<14, IFF_LIVE_ADDR_CHANGE = 1<<15, IFF_MACVLAN = 1<<16, IFF_XMIT_DST_RELEASE_PERM = 1<<17, IFF_L3MDEV_MASTER = 1<<18, IFF_NO_QUEUE = 1<<19, IFF_OPENVSWITCH = 1<<20, IFF_L3MDEV_SLAVE = 1<<21, IFF_TEAM = 1<<22, IFF_PHONY_HEADROOM = 1<<24, IFF_MACSEC = 1<<25, IFF_NO_RX_HANDLER = 1<<26, IFF_FAILOVER = 1<<27, IFF_FAILOVER_SLAVE = 1<<28, IFF_L3MDEV_RX_HANDLER = 1<<29, IFF_NO_ADDRCONF = BIT_ULL(30), IFF_TX_SKB_NO_LINEAR = BIT_ULL(31), }; /* Specifies the type of the struct net_device::ml_priv pointer */ enum netdev_ml_priv_type { ML_PRIV_NONE, ML_PRIV_CAN, }; enum netdev_stat_type { NETDEV_PCPU_STAT_NONE, NETDEV_PCPU_STAT_LSTATS, /* struct pcpu_lstats */ NETDEV_PCPU_STAT_TSTATS, /* struct pcpu_sw_netstats */ NETDEV_PCPU_STAT_DSTATS, /* struct pcpu_dstats */ }; enum netdev_reg_state { NETREG_UNINITIALIZED = 0, NETREG_REGISTERED, /* completed register_netdevice */ NETREG_UNREGISTERING, /* called unregister_netdevice */ NETREG_UNREGISTERED, /* completed unregister todo */ NETREG_RELEASED, /* called free_netdev */ NETREG_DUMMY, /* dummy device for NAPI poll */ }; /** * struct net_device - The DEVICE structure. * * Actually, this whole structure is a big mistake. It mixes I/O * data with strictly "high-level" data, and it has to know about * almost every data structure used in the INET module. * * @priv_flags: flags invisible to userspace defined as bits, see * enum netdev_priv_flags for the definitions * @lltx: device supports lockless Tx. Deprecated for real HW * drivers. Mainly used by logical interfaces, such as * bonding and tunnels * @netmem_tx: device support netmem_tx. * * @name: This is the first field of the "visible" part of this structure * (i.e. as seen by users in the "Space.c" file). It is the name * of the interface. 
* * @name_node: Name hashlist node * @ifalias: SNMP alias * @mem_end: Shared memory end * @mem_start: Shared memory start * @base_addr: Device I/O address * @irq: Device IRQ number * * @state: Generic network queuing layer state, see netdev_state_t * @dev_list: The global list of network devices * @napi_list: List entry used for polling NAPI devices * @unreg_list: List entry when we are unregistering the * device; see the function unregister_netdev * @close_list: List entry used when we are closing the device * @ptype_all: Device-specific packet handlers for all protocols * @ptype_specific: Device-specific, protocol-specific packet handlers * * @adj_list: Directly linked devices, like slaves for bonding * @features: Currently active device features * @hw_features: User-changeable features * * @wanted_features: User-requested features * @vlan_features: Mask of features inheritable by VLAN devices * * @hw_enc_features: Mask of features inherited by encapsulating devices * This field indicates what encapsulation * offloads the hardware is capable of doing, * and drivers will need to set them appropriately. * * @mpls_features: Mask of features inheritable by MPLS * @gso_partial_features: value(s) from NETIF_F_GSO\* * @mangleid_features: Mask of features requiring MANGLEID, will be * disabled together with the latter. * * @ifindex: interface index * @group: The group the device belongs to * * @stats: Statistics struct, which was left as a legacy, use * rtnl_link_stats64 instead * * @core_stats: core networking counters, * do not use this in drivers * @carrier_up_count: Number of times the carrier has been up * @carrier_down_count: Number of times the carrier has been down * * @wireless_handlers: List of functions to handle Wireless Extensions, * instead of ioctl, * see <net/iw_handler.h> for details. * * @netdev_ops: Includes several pointers to callbacks, * if one wants to override the ndo_*() functions * @xdp_metadata_ops: Includes pointers to XDP metadata callbacks. * @xsk_tx_metadata_ops: Includes pointers to AF_XDP TX metadata callbacks. * @ethtool_ops: Management operations * @l3mdev_ops: Layer 3 master device operations * @ndisc_ops: Includes callbacks for different IPv6 neighbour * discovery handling. Necessary for e.g. 6LoWPAN. * @xfrmdev_ops: Transformation offload operations * @tlsdev_ops: Transport Layer Security offload operations * @header_ops: Includes callbacks for creating,parsing,caching,etc * of Layer 2 headers. * * @flags: Interface flags (a la BSD) * @xdp_features: XDP capability supported by the device * @gflags: Global flags ( kept as legacy ) * @priv_len: Size of the ->priv flexible array * @priv: Flexible array containing private data * @operstate: RFC2863 operstate * @link_mode: Mapping policy to operstate * @if_port: Selectable AUI, TP, ... * @dma: DMA channel * @mtu: Interface MTU value * @min_mtu: Interface Minimum MTU value * @max_mtu: Interface Maximum MTU value * @type: Interface hardware type * @hard_header_len: Maximum hardware header length. * @min_header_len: Minimum hardware header length * * @needed_headroom: Extra headroom the hardware may need, but not in all * cases can this be guaranteed * @needed_tailroom: Extra tailroom the hardware may need, but not in all * cases can this be guaranteed. Some cases also use * LL_MAX_HEADER instead to allocate the skb * * interface address info: * * @perm_addr: Permanent hw address * @addr_assign_type: Hw address assignment type * @addr_len: Hardware address length * @upper_level: Maximum depth level of upper devices. 
* @lower_level: Maximum depth level of lower devices. * @threaded: napi threaded state. * @neigh_priv_len: Used in neigh_alloc() * @dev_id: Used to differentiate devices that share * the same link layer address * @dev_port: Used to differentiate devices that share * the same function * @addr_list_lock: XXX: need comments on this one * @name_assign_type: network interface name assignment type * @uc_promisc: Counter that indicates promiscuous mode * has been enabled due to the need to listen to * additional unicast addresses in a device that * does not implement ndo_set_rx_mode() * @uc: unicast mac addresses * @mc: multicast mac addresses * @dev_addrs: list of device hw addresses * @queues_kset: Group of all Kobjects in the Tx and RX queues * @promiscuity: Number of times the NIC is told to work in * promiscuous mode; if it becomes 0 the NIC will * exit promiscuous mode * @allmulti: Counter, enables or disables allmulticast mode * * @vlan_info: VLAN info * @dsa_ptr: dsa specific data * @tipc_ptr: TIPC specific data * @atalk_ptr: AppleTalk link * @ip_ptr: IPv4 specific data * @ip6_ptr: IPv6 specific data * @ax25_ptr: AX.25 specific data * @ieee80211_ptr: IEEE 802.11 specific data, assign before registering * @ieee802154_ptr: IEEE 802.15.4 low-rate Wireless Personal Area Network * device struct * @mpls_ptr: mpls_dev struct pointer * @mctp_ptr: MCTP specific data * @psp_dev: PSP crypto device registered for this netdev * * @dev_addr: Hw address (before bcast, * because most packets are unicast) * * @_rx: Array of RX queues * @num_rx_queues: Number of RX queues * allocated at register_netdev() time * @real_num_rx_queues: Number of RX queues currently active in device * @xdp_prog: XDP sockets filter program pointer * * @rx_handler: handler for received packets * @rx_handler_data: XXX: need comments on this one * @tcx_ingress: BPF & clsact qdisc specific data for ingress processing * @ingress_queue: XXX: need comments on this one * @nf_hooks_ingress: netfilter hooks executed for ingress packets * @broadcast: hw bcast address * * @rx_cpu_rmap: CPU reverse-mapping for RX completion interrupts, * indexed by RX queue number. Assigned by driver. * This must only be set if the ndo_rx_flow_steer * operation is defined * @index_hlist: Device index hash chain * * @_tx: Array of TX queues * @num_tx_queues: Number of TX queues allocated at alloc_netdev_mq() time * @real_num_tx_queues: Number of TX queues currently active in device * @qdisc: Root qdisc from userspace point of view * @tx_queue_len: Max frames per queue allowed * @tx_global_lock: XXX: need comments on this one * @xdp_bulkq: XDP device bulk queue * @xps_maps: all CPUs/RXQs maps for XPS device * * @xps_maps: XXX: need comments on this one * @tcx_egress: BPF & clsact qdisc specific data for egress processing * @nf_hooks_egress: netfilter hooks executed for egress packets * @qdisc_hash: qdisc hash table * @watchdog_timeo: Represents the timeout that is used by * the watchdog (see dev_watchdog()) * @watchdog_timer: List of timers * * @proto_down_reason: reason a netdev interface is held down * @pcpu_refcnt: Number of references to this device * @dev_refcnt: Number of references to this device * @refcnt_tracker: Tracker directory for tracked references to this device * @todo_list: Delayed register/unregister * @link_watch_list: XXX: need comments on this one * * @reg_state: Register/unregister state machine * @dismantle: Device is going to be freed * @needs_free_netdev: Should unregister perform free_netdev? 
* @priv_destructor: Called from unregister * @npinfo: XXX: need comments on this one * @nd_net: Network namespace this network device is inside * protected by @lock * * @ml_priv: Mid-layer private * @ml_priv_type: Mid-layer private type * * @pcpu_stat_type: Type of device statistics which the core should * allocate/free: none, lstats, tstats, dstats. none * means the driver is handling statistics allocation/ * freeing internally. * @lstats: Loopback statistics: packets, bytes * @tstats: Tunnel statistics: RX/TX packets, RX/TX bytes * @dstats: Dummy statistics: RX/TX/drop packets, RX/TX bytes * * @garp_port: GARP * @mrp_port: MRP * * @dm_private: Drop monitor private * * @dev: Class/net/name entry * @sysfs_groups: Space for optional device, statistics and wireless * sysfs groups * * @sysfs_rx_queue_group: Space for optional per-rx queue attributes * @rtnl_link_ops: Rtnl_link_ops * @stat_ops: Optional ops for queue-aware statistics * @queue_mgmt_ops: Optional ops for queue management * * @gso_max_size: Maximum size of generic segmentation offload * @tso_max_size: Device (as in HW) limit on the max TSO request size * @gso_max_segs: Maximum number of segments that can be passed to the * NIC for GSO * @tso_max_segs: Device (as in HW) limit on the max TSO segment count * @gso_ipv4_max_size: Maximum size of generic segmentation offload, * for IPv4. * * @dcbnl_ops: Data Center Bridging netlink ops * @num_tc: Number of traffic classes in the net device * @tc_to_txq: XXX: need comments on this one * @prio_tc_map: XXX: need comments on this one * * @fcoe_ddp_xid: Max exchange id for FCoE LRO by ddp * * @priomap: XXX: need comments on this one * @link_topo: Physical link topology tracking attached PHYs * @phydev: Physical device may attach itself * for hardware timestamping * @sfp_bus: attached &struct sfp_bus structure. * * @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock * * @proto_down: protocol port state information can be sent to the * switch driver and used to set the phys state of the * switch port. * * @irq_affinity_auto: driver wants the core to store and re-assign the IRQ * affinity. Set by netif_enable_irq_affinity(), then * the driver must create a persistent napi by * netif_napi_add_config() and finally bind the napi to * IRQ (via netif_napi_set_irq()). * * @rx_cpu_rmap_auto: driver wants the core to manage the ARFS rmap. * Set by calling netif_enable_cpu_rmap(). * * @see_all_hwtstamp_requests: device wants to see calls to * ndo_hwtstamp_set() for all timestamp requests * regardless of source, even if those aren't * HWTSTAMP_SOURCE_NETDEV * @change_proto_down: device supports setting carrier via IFLA_PROTO_DOWN * @netns_immutable: interface can't change network namespaces * @fcoe_mtu: device supports maximum FCoE MTU, 2158 bytes * * @net_notifier_list: List of per-net netdev notifier block * that follow this device when it is moved * to another network namespace. * * @macsec_ops: MACsec offloading ops * * @udp_tunnel_nic_info: static structure describing the UDP tunnel * offload capabilities of the device * @udp_tunnel_nic: UDP tunnel offload state * @ethtool: ethtool related state * @xdp_state: stores info on attached XDP BPF programs * * @nested_level: Used as a parameter of spin_lock_nested() of * dev->addr_list_lock. * @unlink_list: As netif_addr_lock() can be called recursively, * keep a list of interfaces to be deleted. 
* @gro_max_size: Maximum size of aggregated packet in generic * receive offload (GRO) * @gro_ipv4_max_size: Maximum size of aggregated packet in generic * receive offload (GRO), for IPv4. * @xdp_zc_max_segs: Maximum number of segments supported by AF_XDP * zero copy driver * * @dev_addr_shadow: Copy of @dev_addr to catch direct writes. * @linkwatch_dev_tracker: refcount tracker used by linkwatch. * @watchdog_dev_tracker: refcount tracker used by watchdog. * @dev_registered_tracker: tracker for reference held while * registered * @offload_xstats_l3: L3 HW stats for this netdevice. * * @devlink_port: Pointer to related devlink port structure. * Assigned by a driver before netdev registration using * SET_NETDEV_DEVLINK_PORT macro. This pointer is static * during the time netdevice is registered. * * @dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem, * where the clock is recovered. * * @max_pacing_offload_horizon: max EDT offload horizon in nsec. * @napi_config: An array of napi_config structures containing per-NAPI * settings. * @num_napi_configs: number of allocated NAPI config structs, * always >= max(num_rx_queues, num_tx_queues). * @gro_flush_timeout: timeout for GRO layer in NAPI * @napi_defer_hard_irqs: If not zero, provides a counter that would * allow to avoid NIC hard IRQ, on busy queues. * * @neighbours: List heads pointing to this device's neighbours' * dev_list, one per address-family. * @hwprov: Tracks which PTP performs hardware packet time stamping. * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ struct net_device { /* Cacheline organization can be found documented in * Documentation/networking/net_cachelines/net_device.rst. * Please update the document when adding new fields. */ /* TX read-mostly hotpath */ __cacheline_group_begin(net_device_read_tx); struct_group(priv_flags_fast, unsigned long priv_flags:32; unsigned long lltx:1; unsigned long netmem_tx:1; ); const struct net_device_ops *netdev_ops; const struct header_ops *header_ops; struct netdev_queue *_tx; netdev_features_t gso_partial_features; unsigned int real_num_tx_queues; unsigned int gso_max_size; unsigned int gso_ipv4_max_size; u16 gso_max_segs; s16 num_tc; /* Note : dev->mtu is often read without holding a lock. * Writers usually hold RTNL. * It is recommended to use READ_ONCE() to annotate the reads, * and to use WRITE_ONCE() to annotate the writes. 
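 *
 * A minimal sketch of the intended annotation (illustrative, not lifted
 * from a specific caller; new_mtu is a placeholder): a writer holding
 * RTNL does
 *
 *	WRITE_ONCE(dev->mtu, new_mtu);
 *
 * while a lockless reader on the data path uses
 *
 *	unsigned int mtu = READ_ONCE(dev->mtu);
 *
 * so the intentionally racy access is annotated for the compiler and KCSAN.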
*/ unsigned int mtu; unsigned short needed_headroom; struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE]; #ifdef CONFIG_XPS struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX]; #endif #ifdef CONFIG_NETFILTER_EGRESS struct nf_hook_entries __rcu *nf_hooks_egress; #endif #ifdef CONFIG_NET_XGRESS struct bpf_mprog_entry __rcu *tcx_egress; #endif __cacheline_group_end(net_device_read_tx); /* TXRX read-mostly hotpath */ __cacheline_group_begin(net_device_read_txrx); union { struct pcpu_lstats __percpu *lstats; struct pcpu_sw_netstats __percpu *tstats; struct pcpu_dstats __percpu *dstats; }; unsigned long state; unsigned int flags; unsigned short hard_header_len; enum netdev_stat_type pcpu_stat_type:8; netdev_features_t features; struct inet6_dev __rcu *ip6_ptr; __cacheline_group_end(net_device_read_txrx); /* RX read-mostly hotpath */ __cacheline_group_begin(net_device_read_rx); struct bpf_prog __rcu *xdp_prog; struct list_head ptype_specific; int ifindex; unsigned int real_num_rx_queues; struct netdev_rx_queue *_rx; unsigned int gro_max_size; unsigned int gro_ipv4_max_size; rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; possible_net_t nd_net; #ifdef CONFIG_NETPOLL struct netpoll_info __rcu *npinfo; #endif #ifdef CONFIG_NET_XGRESS struct bpf_mprog_entry __rcu *tcx_ingress; #endif __cacheline_group_end(net_device_read_rx); char name[IFNAMSIZ]; struct netdev_name_node *name_node; struct dev_ifalias __rcu *ifalias; /* * I/O specific fields * FIXME: Merge these and struct ifmap into one */ unsigned long mem_end; unsigned long mem_start; unsigned long base_addr; /* * Some hardware also needs these fields (state,dev_list, * napi_list,unreg_list,close_list) but they are not * part of the usual set specified in Space.c. */ struct list_head dev_list; struct list_head napi_list; struct list_head unreg_list; struct list_head close_list; struct list_head ptype_all; struct { struct list_head upper; struct list_head lower; } adj_list; /* Read-mostly cache-line for fast-path access */ xdp_features_t xdp_features; const struct xdp_metadata_ops *xdp_metadata_ops; const struct xsk_tx_metadata_ops *xsk_tx_metadata_ops; unsigned short gflags; unsigned short needed_tailroom; netdev_features_t hw_features; netdev_features_t wanted_features; netdev_features_t vlan_features; netdev_features_t hw_enc_features; netdev_features_t mpls_features; netdev_features_t mangleid_features; unsigned int min_mtu; unsigned int max_mtu; unsigned short type; unsigned char min_header_len; unsigned char name_assign_type; int group; struct net_device_stats stats; /* not used by modern drivers */ struct net_device_core_stats __percpu *core_stats; /* Stats to monitor link on/off, flapping */ atomic_t carrier_up_count; atomic_t carrier_down_count; #ifdef CONFIG_WIRELESS_EXT const struct iw_handler_def *wireless_handlers; #endif const struct ethtool_ops *ethtool_ops; #ifdef CONFIG_NET_L3_MASTER_DEV const struct l3mdev_ops *l3mdev_ops; #endif #if IS_ENABLED(CONFIG_IPV6) const struct ndisc_ops *ndisc_ops; #endif #ifdef CONFIG_XFRM_OFFLOAD const struct xfrmdev_ops *xfrmdev_ops; #endif #if IS_ENABLED(CONFIG_TLS_DEVICE) const struct tlsdev_ops *tlsdev_ops; #endif unsigned int operstate; unsigned char link_mode; unsigned char if_port; unsigned char dma; /* Interface address info. 
*/ unsigned char perm_addr[MAX_ADDR_LEN]; unsigned char addr_assign_type; unsigned char addr_len; unsigned char upper_level; unsigned char lower_level; u8 threaded; unsigned short neigh_priv_len; unsigned short dev_id; unsigned short dev_port; int irq; u32 priv_len; spinlock_t addr_list_lock; struct netdev_hw_addr_list uc; struct netdev_hw_addr_list mc; struct netdev_hw_addr_list dev_addrs; #ifdef CONFIG_SYSFS struct kset *queues_kset; #endif #ifdef CONFIG_LOCKDEP struct list_head unlink_list; #endif unsigned int promiscuity; unsigned int allmulti; bool uc_promisc; #ifdef CONFIG_LOCKDEP unsigned char nested_level; #endif /* Protocol-specific pointers */ struct in_device __rcu *ip_ptr; /** @fib_nh_head: nexthops associated with this netdev */ struct hlist_head fib_nh_head; #if IS_ENABLED(CONFIG_VLAN_8021Q) struct vlan_info __rcu *vlan_info; #endif #if IS_ENABLED(CONFIG_NET_DSA) struct dsa_port *dsa_ptr; #endif #if IS_ENABLED(CONFIG_TIPC) struct tipc_bearer __rcu *tipc_ptr; #endif #if IS_ENABLED(CONFIG_ATALK) void *atalk_ptr; #endif #if IS_ENABLED(CONFIG_AX25) struct ax25_dev __rcu *ax25_ptr; #endif #if IS_ENABLED(CONFIG_CFG80211) struct wireless_dev *ieee80211_ptr; #endif #if IS_ENABLED(CONFIG_IEEE802154) || IS_ENABLED(CONFIG_6LOWPAN) struct wpan_dev *ieee802154_ptr; #endif #if IS_ENABLED(CONFIG_MPLS_ROUTING) struct mpls_dev __rcu *mpls_ptr; #endif #if IS_ENABLED(CONFIG_MCTP) struct mctp_dev __rcu *mctp_ptr; #endif #if IS_ENABLED(CONFIG_INET_PSP) struct psp_dev __rcu *psp_dev; #endif /* * Cache lines mostly used on receive path (including eth_type_trans()) */ /* Interface address info used in eth_type_trans() */ const unsigned char *dev_addr; unsigned int num_rx_queues; #define GRO_LEGACY_MAX_SIZE 65536u /* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE), * and shinfo->gso_segs is a 16bit field. */ #define GRO_MAX_SIZE (8 * 65535u) unsigned int xdp_zc_max_segs; struct netdev_queue __rcu *ingress_queue; #ifdef CONFIG_NETFILTER_INGRESS struct nf_hook_entries __rcu *nf_hooks_ingress; #endif unsigned char broadcast[MAX_ADDR_LEN]; #ifdef CONFIG_RFS_ACCEL struct cpu_rmap *rx_cpu_rmap; #endif struct hlist_node index_hlist; /* * Cache lines mostly used on transmit path */ unsigned int num_tx_queues; struct Qdisc __rcu *qdisc; unsigned int tx_queue_len; spinlock_t tx_global_lock; struct xdp_dev_bulk_queue __percpu *xdp_bulkq; #ifdef CONFIG_NET_SCHED DECLARE_HASHTABLE (qdisc_hash, 4); #endif /* These may be needed for future network-power-down code. 
*/ struct timer_list watchdog_timer; int watchdog_timeo; u32 proto_down_reason; struct list_head todo_list; #ifdef CONFIG_PCPU_DEV_REFCNT int __percpu *pcpu_refcnt; #else refcount_t dev_refcnt; #endif struct ref_tracker_dir refcnt_tracker; struct list_head link_watch_list; u8 reg_state; bool dismantle; /** @moving_ns: device is changing netns, protected by @lock */ bool moving_ns; /** @rtnl_link_initializing: Device being created, suppress events */ bool rtnl_link_initializing; bool needs_free_netdev; void (*priv_destructor)(struct net_device *dev); /* mid-layer private */ void *ml_priv; enum netdev_ml_priv_type ml_priv_type; #if IS_ENABLED(CONFIG_GARP) struct garp_port __rcu *garp_port; #endif #if IS_ENABLED(CONFIG_MRP) struct mrp_port __rcu *mrp_port; #endif #if IS_ENABLED(CONFIG_NET_DROP_MONITOR) struct dm_hw_stat_delta __rcu *dm_private; #endif struct device dev; const struct attribute_group *sysfs_groups[5]; const struct attribute_group *sysfs_rx_queue_group; const struct rtnl_link_ops *rtnl_link_ops; const struct netdev_stat_ops *stat_ops; const struct netdev_queue_mgmt_ops *queue_mgmt_ops; /* for setting kernel sock attribute on TCP connection setup */ #define GSO_MAX_SEGS 65535u #define GSO_LEGACY_MAX_SIZE 65536u /* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE), * and shinfo->gso_segs is a 16bit field. */ #define GSO_MAX_SIZE (8 * GSO_MAX_SEGS) #define TSO_LEGACY_MAX_SIZE 65536 #define TSO_MAX_SIZE UINT_MAX unsigned int tso_max_size; #define TSO_MAX_SEGS U16_MAX u16 tso_max_segs; #ifdef CONFIG_DCB const struct dcbnl_rtnl_ops *dcbnl_ops; #endif u8 prio_tc_map[TC_BITMASK + 1]; #if IS_ENABLED(CONFIG_FCOE) unsigned int fcoe_ddp_xid; #endif #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) struct netprio_map __rcu *priomap; #endif struct phy_link_topology *link_topo; struct phy_device *phydev; struct sfp_bus *sfp_bus; struct lock_class_key *qdisc_tx_busylock; bool proto_down; bool irq_affinity_auto; bool rx_cpu_rmap_auto; /* priv_flags_slow, ungrouped to save space */ unsigned long see_all_hwtstamp_requests:1; unsigned long change_proto_down:1; unsigned long netns_immutable:1; unsigned long fcoe_mtu:1; struct list_head net_notifier_list; #if IS_ENABLED(CONFIG_MACSEC) /* MACsec management functions */ const struct macsec_ops *macsec_ops; #endif const struct udp_tunnel_nic_info *udp_tunnel_nic_info; struct udp_tunnel_nic *udp_tunnel_nic; /** @cfg: net_device queue-related configuration */ struct netdev_config *cfg; /** * @cfg_pending: same as @cfg but when device is being actively * reconfigured includes any changes to the configuration * requested by the user, but which may or may not be rejected. */ struct netdev_config *cfg_pending; struct ethtool_netdev_state *ethtool; /* protected by rtnl_lock */ struct bpf_xdp_entity xdp_state[__MAX_XDP_MODE]; u8 dev_addr_shadow[MAX_ADDR_LEN]; netdevice_tracker linkwatch_dev_tracker; netdevice_tracker watchdog_dev_tracker; netdevice_tracker dev_registered_tracker; struct rtnl_hw_stats64 *offload_xstats_l3; struct devlink_port *devlink_port; #if IS_ENABLED(CONFIG_DPLL) struct dpll_pin __rcu *dpll_pin; #endif #if IS_ENABLED(CONFIG_PAGE_POOL) /** @page_pools: page pools created for this netdevice */ struct hlist_head page_pools; #endif /** @irq_moder: dim parameters used if IS_ENABLED(CONFIG_DIMLIB). */ struct dim_irq_moder *irq_moder; u64 max_pacing_offload_horizon; struct napi_config *napi_config; u32 num_napi_configs; u32 napi_defer_hard_irqs; unsigned long gro_flush_timeout; /** * @up: copy of @state's IFF_UP, but safe to read with just @lock. 
* May report false negatives while the device is being opened * or closed (@lock does not protect .ndo_open, or .ndo_close). */ bool up; /** * @request_ops_lock: request the core to run all @netdev_ops and * @ethtool_ops under the @lock. */ bool request_ops_lock; /** * @lock: netdev-scope lock, protects a small selection of fields. * Should always be taken using netdev_lock() / netdev_unlock() helpers. * Drivers are free to use it for other protection. * * For the drivers that implement shaper or queue API, the scope * of this lock is expanded to cover most ndo/queue/ethtool/sysfs * operations. Drivers may opt-in to this behavior by setting * @request_ops_lock. * * @lock protection mixes with rtnl_lock in multiple ways, fields are * either: * * - simply protected by the instance @lock; * * - double protected - writers hold both locks, readers hold either; * * - ops protected - protected by the lock held around the NDOs * and other callbacks, that is the instance lock on devices for * which netdev_need_ops_lock() returns true, otherwise by rtnl_lock; * * - double ops protected - always protected by rtnl_lock but for * devices for which netdev_need_ops_lock() returns true - also * the instance lock. * * Simply protects: * @gro_flush_timeout, @napi_defer_hard_irqs, @napi_list, * @net_shaper_hierarchy, @reg_state, @threaded * * Double protects: * @up, @moving_ns, @nd_net, @xdp_features * * Double ops protects: * @real_num_rx_queues, @real_num_tx_queues * * Also protects some fields in: * struct napi_struct, struct netdev_queue, struct netdev_rx_queue * * Ordering: * * - take after rtnl_lock * * - for the case of netdev queue leasing, the netdev-scope lock is * taken for both the virtual and the physical device; to prevent * deadlocks, the virtual device's lock must always be acquired * before the physical device's (see netdev_nl_queue_create_doit) */ struct mutex lock; #if IS_ENABLED(CONFIG_NET_SHAPER) /** * @net_shaper_hierarchy: data tracking the current shaper status * see include/net/net_shapers.h */ struct net_shaper_hierarchy *net_shaper_hierarchy; #endif struct hlist_head neighbours[NEIGH_NR_TABLES]; struct hwtstamp_provider __rcu *hwprov; u8 priv[] ____cacheline_aligned __counted_by(priv_len); } ____cacheline_aligned; #define to_net_dev(d) container_of(d, struct net_device, dev) /* * Driver should use this to assign devlink port instance to a netdevice * before it registers the netdevice. Therefore devlink_port is static * during the netdev lifetime after it is registered. 
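 *
 * A hedged sketch of the usual probe-time ordering, assuming the driver
 * keeps a struct devlink_port in its private data (priv->dl_port and
 * port_index are illustrative names, not a specific in-tree driver):
 *
 *	devlink_port_register(devlink, &priv->dl_port, port_index);
 *	SET_NETDEV_DEVLINK_PORT(netdev, &priv->dl_port);
 *	register_netdev(netdev);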
*/ #define SET_NETDEV_DEVLINK_PORT(dev, port) \ ({ \ WARN_ON((dev)->reg_state != NETREG_UNINITIALIZED); \ ((dev)->devlink_port = (port)); \ }) static inline bool netif_elide_gro(const struct net_device *dev) { if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog) return true; return false; } #define NETDEV_ALIGN 32 static inline int netdev_get_prio_tc_map(const struct net_device *dev, u32 prio) { return dev->prio_tc_map[prio & TC_BITMASK]; } static inline int netdev_set_prio_tc_map(struct net_device *dev, u8 prio, u8 tc) { if (tc >= dev->num_tc) return -EINVAL; dev->prio_tc_map[prio & TC_BITMASK] = tc & TC_BITMASK; return 0; } int netdev_txq_to_tc(struct net_device *dev, unsigned int txq); void netdev_reset_tc(struct net_device *dev); int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset); int netdev_set_num_tc(struct net_device *dev, u8 num_tc); static inline int netdev_get_num_tc(struct net_device *dev) { return dev->num_tc; } static inline void net_prefetch(void *p) { prefetch(p); #if L1_CACHE_BYTES < 128 prefetch((u8 *)p + L1_CACHE_BYTES); #endif } static inline void net_prefetchw(void *p) { prefetchw(p); #if L1_CACHE_BYTES < 128 prefetchw((u8 *)p + L1_CACHE_BYTES); #endif } void netdev_unbind_sb_channel(struct net_device *dev, struct net_device *sb_dev); int netdev_bind_sb_channel_queue(struct net_device *dev, struct net_device *sb_dev, u8 tc, u16 count, u16 offset); int netdev_set_sb_channel(struct net_device *dev, u16 channel); static inline int netdev_get_sb_channel(struct net_device *dev) { return max_t(int, -dev->num_tc, 0); } static inline struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev, unsigned int index) { DEBUG_NET_WARN_ON_ONCE(index >= dev->num_tx_queues); return &dev->_tx[index]; } static inline struct netdev_queue *skb_get_tx_queue(const struct net_device *dev, const struct sk_buff *skb) { return netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); } static inline void netdev_for_each_tx_queue(struct net_device *dev, void (*f)(struct net_device *, struct netdev_queue *, void *), void *arg) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) f(dev, &dev->_tx[i], arg); } u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); /* returns the headroom that the master device needs to take in account * when forwarding to this dev */ static inline unsigned netdev_get_fwd_headroom(struct net_device *dev) { return dev->priv_flags & IFF_PHONY_HEADROOM ? 
0 : dev->needed_headroom; } static inline void netdev_set_rx_headroom(struct net_device *dev, int new_hr) { if (dev->netdev_ops->ndo_set_rx_headroom) dev->netdev_ops->ndo_set_rx_headroom(dev, new_hr); } /* set the device rx headroom to the dev's default */ static inline void netdev_reset_rx_headroom(struct net_device *dev) { netdev_set_rx_headroom(dev, -1); } static inline void *netdev_get_ml_priv(struct net_device *dev, enum netdev_ml_priv_type type) { if (dev->ml_priv_type != type) return NULL; return dev->ml_priv; } static inline void netdev_set_ml_priv(struct net_device *dev, void *ml_priv, enum netdev_ml_priv_type type) { WARN(dev->ml_priv_type && dev->ml_priv_type != type, "Overwriting already set ml_priv_type (%u) with different ml_priv_type (%u)!\n", dev->ml_priv_type, type); WARN(!dev->ml_priv_type && dev->ml_priv, "Overwriting already set ml_priv and ml_priv_type is ML_PRIV_NONE!\n"); dev->ml_priv = ml_priv; dev->ml_priv_type = type; } /* * Net namespace inlines */ static inline struct net *dev_net(const struct net_device *dev) { return read_pnet(&dev->nd_net); } static inline struct net *dev_net_rcu(const struct net_device *dev) { return read_pnet_rcu(&dev->nd_net); } static inline void dev_net_set(struct net_device *dev, struct net *net) { write_pnet(&dev->nd_net, net); } /** * netdev_priv - access network device private data * @dev: network device * * Get network device private data */ static inline void *netdev_priv(const struct net_device *dev) { return (void *)dev->priv; } /** * netdev_from_priv() - get network device from priv * @priv: network device private data * * Returns: net_device to which @priv belongs */ static inline struct net_device *netdev_from_priv(const void *priv) { return container_of(priv, struct net_device, priv); } /* Set the sysfs physical device reference for the network logical device * if set prior to registration will cause a symlink during initialization. */ #define SET_NETDEV_DEV(net, pdev) ((net)->dev.parent = (pdev)) /* Set the sysfs device type for the network logical device to allow * fine-grained identification of different network device types. For * example Ethernet, Wireless LAN, Bluetooth, WiMAX etc. 
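 *
 * Illustrative use only (the device_type instance is made up for the
 * example); typically set before register_netdev():
 *
 *	static const struct device_type wlan_type = {
 *		.name = "wlan",
 *	};
 *
 *	SET_NETDEV_DEV(netdev, &pdev->dev);
 *	SET_NETDEV_DEVTYPE(netdev, &wlan_type);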
*/ #define SET_NETDEV_DEVTYPE(net, devtype) ((net)->dev.type = (devtype)) void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, enum netdev_queue_type type, struct napi_struct *napi); static inline void netdev_lock(struct net_device *dev) { mutex_lock(&dev->lock); } static inline void netdev_unlock(struct net_device *dev) { mutex_unlock(&dev->lock); } /* Additional netdev_lock()-related helpers are in net/netdev_lock.h */ void netif_napi_set_irq_locked(struct napi_struct *napi, int irq); static inline void netif_napi_set_irq(struct napi_struct *napi, int irq) { netdev_lock(napi->dev); netif_napi_set_irq_locked(napi, irq); netdev_unlock(napi->dev); } /* Default NAPI poll() weight * Device drivers are strongly advised to not use bigger value */ #define NAPI_POLL_WEIGHT 64 void netif_napi_add_weight_locked(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight); static inline void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { netdev_lock(dev); netif_napi_add_weight_locked(dev, napi, poll, weight); netdev_unlock(dev); } /** * netif_napi_add() - initialize a NAPI context * @dev: network device * @napi: NAPI context * @poll: polling function * * netif_napi_add() must be used to initialize a NAPI context prior to calling * *any* of the other NAPI-related functions. */ static inline void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int)) { netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT); } static inline void netif_napi_add_locked(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int)) { netif_napi_add_weight_locked(dev, napi, poll, NAPI_POLL_WEIGHT); } static inline void netif_napi_add_tx_weight(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { set_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state); netif_napi_add_weight(dev, napi, poll, weight); } static inline void netif_napi_add_config_locked(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int index) { napi->index = index; napi->config = &dev->napi_config[index]; netif_napi_add_weight_locked(dev, napi, poll, NAPI_POLL_WEIGHT); } /** * netif_napi_add_config - initialize a NAPI context with persistent config * @dev: network device * @napi: NAPI context * @poll: polling function * @index: the NAPI index */ static inline void netif_napi_add_config(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int index) { netdev_lock(dev); netif_napi_add_config_locked(dev, napi, poll, index); netdev_unlock(dev); } /** * netif_napi_add_tx() - initialize a NAPI context to be used for Tx only * @dev: network device * @napi: NAPI context * @poll: polling function * * This variant of netif_napi_add() should be used from drivers using NAPI * to exclusively poll a TX queue. * This will avoid we add it into napi_hash[], thus polluting this hash table. */ static inline void netif_napi_add_tx(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int)) { netif_napi_add_tx_weight(dev, napi, poll, NAPI_POLL_WEIGHT); } void __netif_napi_del_locked(struct napi_struct *napi); /** * __netif_napi_del - remove a NAPI context * @napi: NAPI context * * Warning: caller must observe RCU grace period before freeing memory * containing @napi. 
Drivers might want to call this helper to combine * all the needed RCU grace periods into a single one. */ static inline void __netif_napi_del(struct napi_struct *napi) { netdev_lock(napi->dev); __netif_napi_del_locked(napi); netdev_unlock(napi->dev); } static inline void netif_napi_del_locked(struct napi_struct *napi) { __netif_napi_del_locked(napi); synchronize_net(); } /** * netif_napi_del - remove a NAPI context * @napi: NAPI context * * netif_napi_del() removes a NAPI context from the network device NAPI list */ static inline void netif_napi_del(struct napi_struct *napi) { __netif_napi_del(napi); synchronize_net(); } int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs); void netif_set_affinity_auto(struct net_device *dev); struct packet_type { __be16 type; /* This is really htons(ether_type). */ bool ignore_outgoing; struct net_device *dev; /* NULL is wildcarded here */ netdevice_tracker dev_tracker; int (*func) (struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); void (*list_func) (struct list_head *, struct packet_type *, struct net_device *); bool (*id_match)(struct packet_type *ptype, struct sock *sk); struct net *af_packet_net; void *af_packet_priv; struct list_head list; }; struct offload_callbacks { struct sk_buff *(*gso_segment)(struct sk_buff *skb, netdev_features_t features); struct sk_buff *(*gro_receive)(struct list_head *head, struct sk_buff *skb); int (*gro_complete)(struct sk_buff *skb, int nhoff); }; struct packet_offload { __be16 type; /* This is really htons(ether_type). */ u16 priority; struct offload_callbacks callbacks; struct list_head list; }; /* often modified stats are per-CPU, other are shared (netdev->stats) */ struct pcpu_sw_netstats { u64_stats_t rx_packets; u64_stats_t rx_bytes; u64_stats_t tx_packets; u64_stats_t tx_bytes; struct u64_stats_sync syncp; } __aligned(4 * sizeof(u64)); struct pcpu_dstats { u64_stats_t rx_packets; u64_stats_t rx_bytes; u64_stats_t tx_packets; u64_stats_t tx_bytes; u64_stats_t rx_drops; u64_stats_t tx_drops; struct u64_stats_sync syncp; } __aligned(8 * sizeof(u64)); struct pcpu_lstats { u64_stats_t packets; u64_stats_t bytes; struct u64_stats_sync syncp; } __aligned(2 * sizeof(u64)); void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes); static inline void dev_sw_netstats_rx_add(struct net_device *dev, unsigned int len) { struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); u64_stats_update_begin(&tstats->syncp); u64_stats_add(&tstats->rx_bytes, len); u64_stats_inc(&tstats->rx_packets); u64_stats_update_end(&tstats->syncp); } static inline void dev_sw_netstats_tx_add(struct net_device *dev, unsigned int packets, unsigned int len) { struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); u64_stats_update_begin(&tstats->syncp); u64_stats_add(&tstats->tx_bytes, len); u64_stats_add(&tstats->tx_packets, packets); u64_stats_update_end(&tstats->syncp); } static inline void dev_lstats_add(struct net_device *dev, unsigned int len) { struct pcpu_lstats *lstats = this_cpu_ptr(dev->lstats); u64_stats_update_begin(&lstats->syncp); u64_stats_add(&lstats->bytes, len); u64_stats_inc(&lstats->packets); u64_stats_update_end(&lstats->syncp); } static inline void dev_dstats_rx_add(struct net_device *dev, unsigned int len) { struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); u64_stats_update_begin(&dstats->syncp); u64_stats_inc(&dstats->rx_packets); u64_stats_add(&dstats->rx_bytes, len); u64_stats_update_end(&dstats->syncp); } static inline void 
dev_dstats_rx_dropped(struct net_device *dev) { struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); u64_stats_update_begin(&dstats->syncp); u64_stats_inc(&dstats->rx_drops); u64_stats_update_end(&dstats->syncp); } static inline void dev_dstats_rx_dropped_add(struct net_device *dev, unsigned int packets) { struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); u64_stats_update_begin(&dstats->syncp); u64_stats_add(&dstats->rx_drops, packets); u64_stats_update_end(&dstats->syncp); } static inline void dev_dstats_tx_add(struct net_device *dev, unsigned int len) { struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); u64_stats_update_begin(&dstats->syncp); u64_stats_inc(&dstats->tx_packets); u64_stats_add(&dstats->tx_bytes, len); u64_stats_update_end(&dstats->syncp); } static inline void dev_dstats_tx_dropped(struct net_device *dev) { struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); u64_stats_update_begin(&dstats->syncp); u64_stats_inc(&dstats->tx_drops); u64_stats_update_end(&dstats->syncp); } #define __netdev_alloc_pcpu_stats(type, gfp) \ ({ \ typeof(type) __percpu *pcpu_stats = alloc_percpu_gfp(type, gfp);\ if (pcpu_stats) { \ int __cpu; \ for_each_possible_cpu(__cpu) { \ typeof(type) *stat; \ stat = per_cpu_ptr(pcpu_stats, __cpu); \ u64_stats_init(&stat->syncp); \ } \ } \ pcpu_stats; \ }) #define netdev_alloc_pcpu_stats(type) \ __netdev_alloc_pcpu_stats(type, GFP_KERNEL) #define devm_netdev_alloc_pcpu_stats(dev, type) \ ({ \ typeof(type) __percpu *pcpu_stats = devm_alloc_percpu(dev, type);\ if (pcpu_stats) { \ int __cpu; \ for_each_possible_cpu(__cpu) { \ typeof(type) *stat; \ stat = per_cpu_ptr(pcpu_stats, __cpu); \ u64_stats_init(&stat->syncp); \ } \ } \ pcpu_stats; \ }) enum netdev_lag_tx_type { NETDEV_LAG_TX_TYPE_UNKNOWN, NETDEV_LAG_TX_TYPE_RANDOM, NETDEV_LAG_TX_TYPE_BROADCAST, NETDEV_LAG_TX_TYPE_ROUNDROBIN, NETDEV_LAG_TX_TYPE_ACTIVEBACKUP, NETDEV_LAG_TX_TYPE_HASH, }; enum netdev_lag_hash { NETDEV_LAG_HASH_NONE, NETDEV_LAG_HASH_L2, NETDEV_LAG_HASH_L34, NETDEV_LAG_HASH_L23, NETDEV_LAG_HASH_E23, NETDEV_LAG_HASH_E34, NETDEV_LAG_HASH_VLAN_SRCMAC, NETDEV_LAG_HASH_UNKNOWN, }; struct netdev_lag_upper_info { enum netdev_lag_tx_type tx_type; enum netdev_lag_hash hash_type; }; struct netdev_lag_lower_state_info { u8 link_up : 1, tx_enabled : 1; }; #include <linux/notifier.h> /* netdevice notifier chain. Please remember to update netdev_cmd_to_name() * and the rtnetlink notification exclusion list in rtnetlink_event() when * adding new types. 
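 *
 * A minimal subscriber sketch (illustrative only, not a specific
 * in-tree user) showing how the device is recovered in a callback:
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP)
 *			pr_info("%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 * paired with register_netdevice_notifier(&my_nb) and
 * unregister_netdevice_notifier(&my_nb).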
*/ enum netdev_cmd { NETDEV_UP = 1, /* For now you can't veto a device up/down */ NETDEV_DOWN, NETDEV_REBOOT, /* Tell a protocol stack a network interface detected a hardware crash and restarted - we can use this eg to kick tcp sessions once done */ NETDEV_CHANGE, /* Notify device state change */ NETDEV_REGISTER, NETDEV_UNREGISTER, NETDEV_CHANGEMTU, /* notify after mtu change happened */ NETDEV_CHANGEADDR, /* notify after the address change */ NETDEV_PRE_CHANGEADDR, /* notify before the address change */ NETDEV_GOING_DOWN, NETDEV_CHANGENAME, NETDEV_FEAT_CHANGE, NETDEV_BONDING_FAILOVER, NETDEV_PRE_UP, NETDEV_PRE_TYPE_CHANGE, NETDEV_POST_TYPE_CHANGE, NETDEV_POST_INIT, NETDEV_PRE_UNINIT, NETDEV_RELEASE, NETDEV_NOTIFY_PEERS, NETDEV_JOIN, NETDEV_CHANGEUPPER, NETDEV_RESEND_IGMP, NETDEV_PRECHANGEMTU, /* notify before mtu change happened */ NETDEV_CHANGEINFODATA, NETDEV_BONDING_INFO, NETDEV_PRECHANGEUPPER, NETDEV_CHANGELOWERSTATE, NETDEV_UDP_TUNNEL_PUSH_INFO, NETDEV_UDP_TUNNEL_DROP_INFO, NETDEV_CHANGE_TX_QUEUE_LEN, NETDEV_CVLAN_FILTER_PUSH_INFO, NETDEV_CVLAN_FILTER_DROP_INFO, NETDEV_SVLAN_FILTER_PUSH_INFO, NETDEV_SVLAN_FILTER_DROP_INFO, NETDEV_OFFLOAD_XSTATS_ENABLE, NETDEV_OFFLOAD_XSTATS_DISABLE, NETDEV_OFFLOAD_XSTATS_REPORT_USED, NETDEV_OFFLOAD_XSTATS_REPORT_DELTA, NETDEV_XDP_FEAT_CHANGE, }; const char *netdev_cmd_to_name(enum netdev_cmd cmd); int register_netdevice_notifier(struct notifier_block *nb); int unregister_netdevice_notifier(struct notifier_block *nb); int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb); int unregister_netdevice_notifier_net(struct net *net, struct notifier_block *nb); int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn); int unregister_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn); struct netdev_notifier_info { struct net_device *dev; struct netlink_ext_ack *extack; }; struct netdev_notifier_info_ext { struct netdev_notifier_info info; /* must be first */ union { u32 mtu; } ext; }; struct netdev_notifier_change_info { struct netdev_notifier_info info; /* must be first */ unsigned int flags_changed; }; struct netdev_notifier_changeupper_info { struct netdev_notifier_info info; /* must be first */ struct net_device *upper_dev; /* new upper dev */ bool master; /* is upper dev master */ bool linking; /* is the notification for link or unlink */ void *upper_info; /* upper dev info */ }; struct netdev_notifier_changelowerstate_info { struct netdev_notifier_info info; /* must be first */ void *lower_state_info; /* is lower dev state */ }; struct netdev_notifier_pre_changeaddr_info { struct netdev_notifier_info info; /* must be first */ const unsigned char *dev_addr; }; enum netdev_offload_xstats_type { NETDEV_OFFLOAD_XSTATS_TYPE_L3 = 1, }; struct netdev_notifier_offload_xstats_info { struct netdev_notifier_info info; /* must be first */ enum netdev_offload_xstats_type type; union { /* NETDEV_OFFLOAD_XSTATS_REPORT_DELTA */ struct netdev_notifier_offload_xstats_rd *report_delta; /* NETDEV_OFFLOAD_XSTATS_REPORT_USED */ struct netdev_notifier_offload_xstats_ru *report_used; }; }; int netdev_offload_xstats_enable(struct net_device *dev, enum netdev_offload_xstats_type type, struct netlink_ext_ack *extack); int netdev_offload_xstats_disable(struct net_device *dev, enum netdev_offload_xstats_type type); bool netdev_offload_xstats_enabled(const struct net_device *dev, enum netdev_offload_xstats_type type); int 
netdev_offload_xstats_get(struct net_device *dev, enum netdev_offload_xstats_type type, struct rtnl_hw_stats64 *stats, bool *used, struct netlink_ext_ack *extack); void netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *rd, const struct rtnl_hw_stats64 *stats); void netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *ru); void netdev_offload_xstats_push_delta(struct net_device *dev, enum netdev_offload_xstats_type type, const struct rtnl_hw_stats64 *stats); static inline void netdev_notifier_info_init(struct netdev_notifier_info *info, struct net_device *dev) { info->dev = dev; info->extack = NULL; } static inline struct net_device * netdev_notifier_info_to_dev(const struct netdev_notifier_info *info) { return info->dev; } static inline struct netlink_ext_ack * netdev_notifier_info_to_extack(const struct netdev_notifier_info *info) { return info->extack; } int call_netdevice_notifiers(unsigned long val, struct net_device *dev); int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info); #define for_each_netdev(net, d) \ list_for_each_entry(d, &(net)->dev_base_head, dev_list) #define for_each_netdev_reverse(net, d) \ list_for_each_entry_reverse(d, &(net)->dev_base_head, dev_list) #define for_each_netdev_rcu(net, d) \ list_for_each_entry_rcu(d, &(net)->dev_base_head, dev_list) #define for_each_netdev_safe(net, d, n) \ list_for_each_entry_safe(d, n, &(net)->dev_base_head, dev_list) #define for_each_netdev_continue(net, d) \ list_for_each_entry_continue(d, &(net)->dev_base_head, dev_list) #define for_each_netdev_continue_reverse(net, d) \ list_for_each_entry_continue_reverse(d, &(net)->dev_base_head, \ dev_list) #define for_each_netdev_continue_rcu(net, d) \ list_for_each_entry_continue_rcu(d, &(net)->dev_base_head, dev_list) #define for_each_netdev_in_bond_rcu(bond, slave) \ for_each_netdev_rcu(dev_net_rcu(bond), slave) \ if (netdev_master_upper_dev_get_rcu(slave) == (bond)) #define net_device_entry(lh) list_entry(lh, struct net_device, dev_list) #define for_each_netdev_dump(net, d, ifindex) \ for (; (d = xa_find(&(net)->dev_by_index, &ifindex, \ ULONG_MAX, XA_PRESENT)); ifindex++) static inline struct net_device *next_net_device(struct net_device *dev) { struct list_head *lh; struct net *net; net = dev_net(dev); lh = dev->dev_list.next; return lh == &net->dev_base_head ? NULL : net_device_entry(lh); } static inline struct net_device *next_net_device_rcu(struct net_device *dev) { struct list_head *lh; struct net *net; net = dev_net(dev); lh = rcu_dereference(list_next_rcu(&dev->dev_list)); return lh == &net->dev_base_head ? NULL : net_device_entry(lh); } static inline struct net_device *first_net_device(struct net *net) { return list_empty(&net->dev_base_head) ? 
NULL : net_device_entry(net->dev_base_head.next); } int netdev_boot_setup_check(struct net_device *dev); struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, const char *hwaddr); struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, const char *hwaddr); struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type); void dev_add_pack(struct packet_type *pt); void dev_remove_pack(struct packet_type *pt); void __dev_remove_pack(struct packet_type *pt); void dev_add_offload(struct packet_offload *po); void dev_remove_offload(struct packet_offload *po); int dev_get_iflink(const struct net_device *dev); int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb); int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, struct net_device_path_stack *stack); struct net_device *dev_get_by_name(struct net *net, const char *name); struct net_device *dev_get_by_name_rcu(struct net *net, const char *name); struct net_device *__dev_get_by_name(struct net *net, const char *name); bool netdev_name_in_use(struct net *net, const char *name); int dev_alloc_name(struct net_device *dev, const char *name); int netif_open(struct net_device *dev, struct netlink_ext_ack *extack); int dev_open(struct net_device *dev, struct netlink_ext_ack *extack); void netif_close(struct net_device *dev); void dev_close(struct net_device *dev); void netif_close_many(struct list_head *head, bool unlink); void netif_disable_lro(struct net_device *dev); void dev_disable_lro(struct net_device *dev); int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb); u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev); int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id); static inline int dev_queue_xmit(struct sk_buff *skb) { return __dev_queue_xmit(skb, NULL); } static inline int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev) { return __dev_queue_xmit(skb, sb_dev); } static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) { int ret; ret = __dev_direct_xmit(skb, queue_id); if (!dev_xmit_complete(ret)) kfree_skb(skb); return ret; } int register_netdevice(struct net_device *dev); void unregister_netdevice_queue(struct net_device *dev, struct list_head *head); void unregister_netdevice_many(struct list_head *head); bool unregister_netdevice_queued(const struct net_device *dev); static inline void unregister_netdevice(struct net_device *dev) { unregister_netdevice_queue(dev, NULL); } int netdev_refcnt_read(const struct net_device *dev); void free_netdev(struct net_device *dev); struct net_device *netdev_get_xmit_slave(struct net_device *dev, struct sk_buff *skb, bool all_slaves); struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev, struct sock *sk); struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *netdev_get_by_index(struct net *net, int ifindex, netdevice_tracker *tracker, gfp_t gfp); struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex); struct net_device *netdev_get_by_name(struct net *net, const char *name, netdevice_tracker *tracker, gfp_t gfp); struct net_device *netdev_get_by_flags_rcu(struct net *net, netdevice_tracker *tracker, unsigned short flags, unsigned short mask); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); void 
netdev_copy_name(struct net_device *dev, char *name); static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { if (!dev->header_ops || !dev->header_ops->create) return 0; return dev->header_ops->create(skb, dev, type, daddr, saddr, len); } static inline int dev_parse_header(const struct sk_buff *skb, unsigned char *haddr) { const struct net_device *dev = skb->dev; if (!dev->header_ops || !dev->header_ops->parse) return 0; return dev->header_ops->parse(skb, dev, haddr); } static inline __be16 dev_parse_header_protocol(const struct sk_buff *skb) { const struct net_device *dev = skb->dev; if (!dev->header_ops || !dev->header_ops->parse_protocol) return 0; return dev->header_ops->parse_protocol(skb); } /* ll_header must have at least hard_header_len allocated */ static inline bool dev_validate_header(const struct net_device *dev, char *ll_header, int len) { if (likely(len >= dev->hard_header_len)) return true; if (len < dev->min_header_len) return false; if (capable(CAP_SYS_RAWIO)) { memset(ll_header + len, 0, dev->hard_header_len - len); return true; } if (dev->header_ops && dev->header_ops->validate) return dev->header_ops->validate(ll_header, len); return false; } static inline bool dev_has_header(const struct net_device *dev) { return dev->header_ops && dev->header_ops->create; } struct numa_drop_counters { atomic_t drops0 ____cacheline_aligned_in_smp; atomic_t drops1 ____cacheline_aligned_in_smp; }; static inline int numa_drop_read(const struct numa_drop_counters *ndc) { return atomic_read(&ndc->drops0) + atomic_read(&ndc->drops1); } static inline void numa_drop_add(struct numa_drop_counters *ndc, int val) { int n = numa_node_id() % 2; if (n) atomic_add(val, &ndc->drops1); else atomic_add(val, &ndc->drops0); } static inline void numa_drop_reset(struct numa_drop_counters *ndc) { atomic_set(&ndc->drops0, 0); atomic_set(&ndc->drops1, 0); } /* * Incoming packets are placed on per-CPU queues */ struct softnet_data { struct list_head poll_list; struct sk_buff_head process_queue; local_lock_t process_queue_bh_lock; /* stats */ unsigned int processed; unsigned int time_squeeze; #ifdef CONFIG_RPS struct softnet_data *rps_ipi_list; #endif unsigned int received_rps; bool in_net_rx_action; bool in_napi_threaded_poll; #ifdef CONFIG_NET_FLOW_LIMIT struct sd_flow_limit __rcu *flow_limit; #endif struct Qdisc *output_queue; struct Qdisc **output_queue_tailp; struct sk_buff *completion_queue; #ifdef CONFIG_XFRM_OFFLOAD struct sk_buff_head xfrm_backlog; #endif /* written and read only by owning cpu: */ struct netdev_xmit xmit; #ifdef CONFIG_RPS /* input_queue_head should be written by cpu owning this struct, * and only read by other cpus. Worth using a cache line. */ unsigned int input_queue_head ____cacheline_aligned_in_smp; /* Elements below can be accessed between CPUs for RPS/RFS */ call_single_data_t csd ____cacheline_aligned_in_smp; struct softnet_data *rps_ipi_next; unsigned int cpu; /* We force a cacheline alignment from here, to hold together * input_queue_tail, input_pkt_queue and backlog.state. * We add holes so that backlog.state is the last field * of this cache line. 
*/ long pad[3] ____cacheline_aligned_in_smp; unsigned int input_queue_tail; #endif struct sk_buff_head input_pkt_queue; struct napi_struct backlog; struct numa_drop_counters drop_counters; int defer_ipi_scheduled ____cacheline_aligned_in_smp; call_single_data_t defer_csd; }; DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); struct page_pool_bh { struct page_pool *pool; local_lock_t bh_lock; }; DECLARE_PER_CPU(struct page_pool_bh, system_page_pool); #define XMIT_RECURSION_LIMIT 8 #ifndef CONFIG_PREEMPT_RT static inline int dev_recursion_level(void) { return this_cpu_read(softnet_data.xmit.recursion); } static inline bool dev_xmit_recursion(void) { return unlikely(__this_cpu_read(softnet_data.xmit.recursion) > XMIT_RECURSION_LIMIT); } static inline void dev_xmit_recursion_inc(void) { __this_cpu_inc(softnet_data.xmit.recursion); } static inline void dev_xmit_recursion_dec(void) { __this_cpu_dec(softnet_data.xmit.recursion); } #else static inline int dev_recursion_level(void) { return current->net_xmit.recursion; } static inline bool dev_xmit_recursion(void) { return unlikely(current->net_xmit.recursion > XMIT_RECURSION_LIMIT); } static inline void dev_xmit_recursion_inc(void) { current->net_xmit.recursion++; } static inline void dev_xmit_recursion_dec(void) { current->net_xmit.recursion--; } #endif void __netif_schedule(struct Qdisc *q); void netif_schedule_queue(struct netdev_queue *txq); static inline void netif_tx_schedule_all(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) netif_schedule_queue(netdev_get_tx_queue(dev, i)); } static __always_inline void netif_tx_start_queue(struct netdev_queue *dev_queue) { clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state); } /** * netif_start_queue - allow transmit * @dev: network device * * Allow upper layers to call the device hard_start_xmit routine. */ static inline void netif_start_queue(struct net_device *dev) { netif_tx_start_queue(netdev_get_tx_queue(dev, 0)); } static inline void netif_tx_start_all_queues(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); netif_tx_start_queue(txq); } } void netif_tx_wake_queue(struct netdev_queue *dev_queue); /** * netif_wake_queue - restart transmit * @dev: network device * * Allow upper layers to call the device hard_start_xmit routine. * Used for flow control when transmit resources are available. */ static inline void netif_wake_queue(struct net_device *dev) { netif_tx_wake_queue(netdev_get_tx_queue(dev, 0)); } static inline void netif_tx_wake_all_queues(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); netif_tx_wake_queue(txq); } } static __always_inline void netif_tx_stop_queue(struct netdev_queue *dev_queue) { /* Paired with READ_ONCE() from dev_watchdog() */ WRITE_ONCE(dev_queue->trans_start, jiffies); /* This barrier is paired with smp_mb() from dev_watchdog() */ smp_mb__before_atomic(); /* Must be an atomic op see netif_txq_try_stop() */ set_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state); } /** * netif_stop_queue - stop transmitted packets * @dev: network device * * Stop upper layers calling the device hard_start_xmit routine. * Used for flow control when transmit resources are unavailable. 
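 *
 * Hedged driver-side sketch: ndo_start_xmit() usually stops the queue
 * when the TX ring can no longer hold a worst-case frame (the ring
 * helper below is hypothetical), and the TX completion path wakes it
 * again with netif_wake_queue() once descriptors are reclaimed:
 *
 *	if (unlikely(my_ring_free_descs(ring) < MAX_SKB_FRAGS + 1))
 *		netif_stop_queue(dev);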
*/ static inline void netif_stop_queue(struct net_device *dev) { netif_tx_stop_queue(netdev_get_tx_queue(dev, 0)); } void netif_tx_stop_all_queues(struct net_device *dev); static inline bool netif_tx_queue_stopped(const struct netdev_queue *dev_queue) { return test_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state); } /** * netif_queue_stopped - test if transmit queue is flowblocked * @dev: network device * * Test if transmit queue on device is currently unable to send. */ static inline bool netif_queue_stopped(const struct net_device *dev) { return netif_tx_queue_stopped(netdev_get_tx_queue(dev, 0)); } static inline bool netif_xmit_stopped(const struct netdev_queue *dev_queue) { return dev_queue->state & QUEUE_STATE_ANY_XOFF; } static inline bool netif_xmit_frozen_or_stopped(const struct netdev_queue *dev_queue) { return dev_queue->state & QUEUE_STATE_ANY_XOFF_OR_FROZEN; } static inline bool netif_xmit_frozen_or_drv_stopped(const struct netdev_queue *dev_queue) { return dev_queue->state & QUEUE_STATE_DRV_XOFF_OR_FROZEN; } /** * netdev_queue_set_dql_min_limit - set dql minimum limit * @dev_queue: pointer to transmit queue * @min_limit: dql minimum limit * * Forces xmit_more() to return true until the minimum threshold * defined by @min_limit is reached (or until the tx queue is * empty). Warning: to be use with care, misuse will impact the * latency. */ static inline void netdev_queue_set_dql_min_limit(struct netdev_queue *dev_queue, unsigned int min_limit) { #ifdef CONFIG_BQL dev_queue->dql.min_limit = min_limit; #endif } static inline int netdev_queue_dql_avail(const struct netdev_queue *txq) { #ifdef CONFIG_BQL /* Non-BQL migrated drivers will return 0, too. */ return dql_avail(&txq->dql); #else return 0; #endif } /** * netdev_txq_bql_enqueue_prefetchw - prefetch bql data for write * @dev_queue: pointer to transmit queue * * BQL enabled drivers might use this helper in their ndo_start_xmit(), * to give appropriate hint to the CPU. */ static inline void netdev_txq_bql_enqueue_prefetchw(struct netdev_queue *dev_queue) { #ifdef CONFIG_BQL prefetchw(&dev_queue->dql.num_queued); #endif } /** * netdev_txq_bql_complete_prefetchw - prefetch bql data for write * @dev_queue: pointer to transmit queue * * BQL enabled drivers might use this helper in their TX completion path, * to give appropriate hint to the CPU. */ static inline void netdev_txq_bql_complete_prefetchw(struct netdev_queue *dev_queue) { #ifdef CONFIG_BQL prefetchw(&dev_queue->dql.limit); #endif } /** * netdev_tx_sent_queue - report the number of bytes queued to a given tx queue * @dev_queue: network device queue * @bytes: number of bytes queued to the device queue * * Report the number of bytes queued for sending/completion to the network * device hardware queue. @bytes should be a good approximation and should * exactly match netdev_completed_queue() @bytes. * This is typically called once per packet, from ndo_start_xmit(). */ static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue, unsigned int bytes) { #ifdef CONFIG_BQL dql_queued(&dev_queue->dql, bytes); if (likely(dql_avail(&dev_queue->dql) >= 0)) return; /* Paired with READ_ONCE() from dev_watchdog() */ WRITE_ONCE(dev_queue->trans_start, jiffies); /* This barrier is paired with smp_mb() from dev_watchdog() */ smp_mb__before_atomic(); set_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state); /* * The XOFF flag must be set before checking the dql_avail below, * because in netdev_tx_completed_queue we update the dql_completed * before checking the XOFF flag. 
*/ smp_mb__after_atomic(); /* check again in case another CPU has just made room avail */ if (unlikely(dql_avail(&dev_queue->dql) >= 0)) clear_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state); #endif } /* Variant of netdev_tx_sent_queue() for drivers that are aware * that they should not test BQL status themselves. * We do want to change __QUEUE_STATE_STACK_XOFF only for the last * skb of a batch. * Returns true if the doorbell must be used to kick the NIC. */ static inline bool __netdev_tx_sent_queue(struct netdev_queue *dev_queue, unsigned int bytes, bool xmit_more) { if (xmit_more) { #ifdef CONFIG_BQL dql_queued(&dev_queue->dql, bytes); #endif return netif_tx_queue_stopped(dev_queue); } netdev_tx_sent_queue(dev_queue, bytes); return true; } /** * netdev_sent_queue - report the number of bytes queued to hardware * @dev: network device * @bytes: number of bytes queued to the hardware device queue * * Report the number of bytes queued for sending/completion to the network * device hardware queue#0. @bytes should be a good approximation and should * exactly match netdev_completed_queue() @bytes. * This is typically called once per packet, from ndo_start_xmit(). */ static inline void netdev_sent_queue(struct net_device *dev, unsigned int bytes) { netdev_tx_sent_queue(netdev_get_tx_queue(dev, 0), bytes); } static inline bool __netdev_sent_queue(struct net_device *dev, unsigned int bytes, bool xmit_more) { return __netdev_tx_sent_queue(netdev_get_tx_queue(dev, 0), bytes, xmit_more); } /** * netdev_tx_completed_queue - report number of packets/bytes at TX completion. * @dev_queue: network device queue * @pkts: number of packets (currently ignored) * @bytes: number of bytes dequeued from the device queue * * Must be called at most once per TX completion round (and not per * individual packet), so that BQL can adjust its limits appropriately. 
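 *
 * Hedged pairing sketch (variable names are illustrative): the xmit
 * path accounts bytes with netdev_tx_sent_queue(), and the completion
 * handler reports the whole cleanup round back in one call:
 *
 *	netdev_tx_sent_queue(txq, skb->len);		// in ndo_start_xmit()
 *
 *	netdev_tx_completed_queue(txq, pkts_cleaned,	// once per TX
 *				  bytes_cleaned);	// cleanup round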
*/ static inline void netdev_tx_completed_queue(struct netdev_queue *dev_queue, unsigned int pkts, unsigned int bytes) { #ifdef CONFIG_BQL if (unlikely(!bytes)) return; dql_completed(&dev_queue->dql, bytes); /* * Without the memory barrier there is a small possibility that * netdev_tx_sent_queue will miss the update and cause the queue to * be stopped forever */ smp_mb(); /* NOTE: netdev_txq_completed_mb() assumes this exists */ if (unlikely(dql_avail(&dev_queue->dql) < 0)) return; if (test_and_clear_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state)) netif_schedule_queue(dev_queue); #endif } /** * netdev_completed_queue - report bytes and packets completed by device * @dev: network device * @pkts: actual number of packets sent over the medium * @bytes: actual number of bytes sent over the medium * * Report the number of bytes and packets transmitted by the network device * hardware queue over the physical medium, @bytes must exactly match the * @bytes amount passed to netdev_sent_queue() */ static inline void netdev_completed_queue(struct net_device *dev, unsigned int pkts, unsigned int bytes) { netdev_tx_completed_queue(netdev_get_tx_queue(dev, 0), pkts, bytes); } static inline void netdev_tx_reset_queue(struct netdev_queue *q) { #ifdef CONFIG_BQL clear_bit(__QUEUE_STATE_STACK_XOFF, &q->state); dql_reset(&q->dql); #endif } /** * netdev_tx_reset_subqueue - reset the BQL stats and state of a netdev queue * @dev: network device * @qid: stack index of the queue to reset */ static inline void netdev_tx_reset_subqueue(const struct net_device *dev, u32 qid) { netdev_tx_reset_queue(netdev_get_tx_queue(dev, qid)); } /** * netdev_reset_queue - reset the packets and bytes count of a network device * @dev_queue: network device * * Reset the bytes and packet count of a network device and clear the * software flow control OFF bit for this network device */ static inline void netdev_reset_queue(struct net_device *dev_queue) { netdev_tx_reset_subqueue(dev_queue, 0); } /** * netdev_cap_txqueue - check if selected tx queue exceeds device queues * @dev: network device * @queue_index: given tx queue index * * Returns 0 if given tx queue index >= number of device tx queues, * otherwise returns the originally passed tx queue index. */ static inline u16 netdev_cap_txqueue(struct net_device *dev, u16 queue_index) { if (unlikely(queue_index >= dev->real_num_tx_queues)) { net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n", dev->name, queue_index, dev->real_num_tx_queues); return 0; } return queue_index; } /** * netif_running - test if up * @dev: network device * * Test if the device has been brought up. */ static inline bool netif_running(const struct net_device *dev) { return test_bit(__LINK_STATE_START, &dev->state); } /* * Routines to manage the subqueues on a device. We only need start, * stop, and a check if it's stopped. All other device management is * done at the overall netdevice level. * Also test the device if we're multiqueue. */ /** * netif_start_subqueue - allow sending packets on subqueue * @dev: network device * @queue_index: sub queue index * * Start individual transmit queue of a device with multiple transmit queues. 
*/ static inline void netif_start_subqueue(struct net_device *dev, u16 queue_index) { struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); netif_tx_start_queue(txq); } /** * netif_stop_subqueue - stop sending packets on subqueue * @dev: network device * @queue_index: sub queue index * * Stop individual transmit queue of a device with multiple transmit queues. */ static inline void netif_stop_subqueue(struct net_device *dev, u16 queue_index) { struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); netif_tx_stop_queue(txq); } /** * __netif_subqueue_stopped - test status of subqueue * @dev: network device * @queue_index: sub queue index * * Check individual transmit queue of a device with multiple transmit queues. */ static inline bool __netif_subqueue_stopped(const struct net_device *dev, u16 queue_index) { struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); return netif_tx_queue_stopped(txq); } /** * netif_subqueue_stopped - test status of subqueue * @dev: network device * @skb: sub queue buffer pointer * * Check individual transmit queue of a device with multiple transmit queues. */ static inline bool netif_subqueue_stopped(const struct net_device *dev, struct sk_buff *skb) { return __netif_subqueue_stopped(dev, skb_get_queue_mapping(skb)); } /** * netif_wake_subqueue - allow sending packets on subqueue * @dev: network device * @queue_index: sub queue index * * Resume individual transmit queue of a device with multiple transmit queues. */ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index) { struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); netif_tx_wake_queue(txq); } #ifdef CONFIG_XPS int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, u16 index); int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, u16 index, enum xps_map_type type); /** * netif_attr_test_mask - Test a CPU or Rx queue set in a mask * @j: CPU/Rx queue index * @mask: bitmask of all cpus/rx queues * @nr_bits: number of bits in the bitmask * * Test if a CPU or Rx queue index is set in a mask of all CPU/Rx queues. */ static inline bool netif_attr_test_mask(unsigned long j, const unsigned long *mask, unsigned int nr_bits) { cpu_max_bits_warn(j, nr_bits); return test_bit(j, mask); } /** * netif_attr_test_online - Test for online CPU/Rx queue * @j: CPU/Rx queue index * @online_mask: bitmask for CPUs/Rx queues that are online * @nr_bits: number of bits in the bitmask * * Returns: true if a CPU/Rx queue is online. */ static inline bool netif_attr_test_online(unsigned long j, const unsigned long *online_mask, unsigned int nr_bits) { cpu_max_bits_warn(j, nr_bits); if (online_mask) return test_bit(j, online_mask); return (j < nr_bits); } /** * netif_attrmask_next - get the next CPU/Rx queue in a cpu/Rx queues mask * @n: CPU/Rx queue index * @srcp: the cpumask/Rx queue mask pointer * @nr_bits: number of bits in the bitmask * * Returns: next (after n) CPU/Rx queue index in the mask; * >= nr_bits if no further CPUs/Rx queues set. */ static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp, unsigned int nr_bits) { /* -1 is a legal arg here. 
*/ if (n != -1) cpu_max_bits_warn(n, nr_bits); if (srcp) return find_next_bit(srcp, nr_bits, n + 1); return n + 1; } /** * netif_attrmask_next_and - get the next CPU/Rx queue in \*src1p & \*src2p * @n: CPU/Rx queue index * @src1p: the first CPUs/Rx queues mask pointer * @src2p: the second CPUs/Rx queues mask pointer * @nr_bits: number of bits in the bitmask * * Returns: next (after n) CPU/Rx queue index set in both masks; * >= nr_bits if no further CPUs/Rx queues set in both. */ static inline int netif_attrmask_next_and(int n, const unsigned long *src1p, const unsigned long *src2p, unsigned int nr_bits) { /* -1 is a legal arg here. */ if (n != -1) cpu_max_bits_warn(n, nr_bits); if (src1p && src2p) return find_next_and_bit(src1p, src2p, nr_bits, n + 1); else if (src1p) return find_next_bit(src1p, nr_bits, n + 1); else if (src2p) return find_next_bit(src2p, nr_bits, n + 1); return n + 1; } #else static inline int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, u16 index) { return 0; } static inline int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, u16 index, enum xps_map_type type) { return 0; } #endif /** * netif_is_multiqueue - test if device has multiple transmit queues * @dev: network device * * Check if device has multiple transmit queues */ static inline bool netif_is_multiqueue(const struct net_device *dev) { return dev->num_tx_queues > 1; } int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq); int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq); int netif_set_real_num_queues(struct net_device *dev, unsigned int txq, unsigned int rxq); int netif_get_num_default_rss_queues(void); void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason); void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason); /* * It is not allowed to call kfree_skb() or consume_skb() from hardware * interrupt context or with hardware interrupts being disabled. * (in_hardirq() || irqs_disabled()) * * We provide four helpers that can be used in following contexts : * * dev_kfree_skb_irq(skb) when caller drops a packet from irq context, * replacing kfree_skb(skb) * * dev_consume_skb_irq(skb) when caller consumes a packet from irq context. * Typically used in place of consume_skb(skb) in TX completion path * * dev_kfree_skb_any(skb) when caller doesn't know its current irq context, * replacing kfree_skb(skb) * * dev_consume_skb_any(skb) when caller doesn't know its current irq context, * and consumed a packet. 
Used in place of consume_skb(skb) */ static inline void dev_kfree_skb_irq(struct sk_buff *skb) { dev_kfree_skb_irq_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED); } static inline void dev_consume_skb_irq(struct sk_buff *skb) { dev_kfree_skb_irq_reason(skb, SKB_CONSUMED); } static inline void dev_kfree_skb_any(struct sk_buff *skb) { dev_kfree_skb_any_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED); } static inline void dev_consume_skb_any(struct sk_buff *skb) { dev_kfree_skb_any_reason(skb, SKB_CONSUMED); } u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog); void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog); int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb); int netif_rx(struct sk_buff *skb); int __netif_rx(struct sk_buff *skb); int netif_receive_skb(struct sk_buff *skb); int netif_receive_skb_core(struct sk_buff *skb); void netif_receive_skb_list_internal(struct list_head *head); void netif_receive_skb_list(struct list_head *head); gro_result_t gro_receive_skb(struct gro_node *gro, struct sk_buff *skb); static inline gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { return gro_receive_skb(&napi->gro, skb); } struct sk_buff *napi_get_frags(struct napi_struct *napi); gro_result_t napi_gro_frags(struct napi_struct *napi); static inline void napi_free_frags(struct napi_struct *napi) { kfree_skb(napi->skb); napi->skb = NULL; } bool netdev_is_rx_handler_busy(struct net_device *dev); int netdev_rx_handler_register(struct net_device *dev, rx_handler_func_t *rx_handler, void *rx_handler_data); void netdev_rx_handler_unregister(struct net_device *dev); bool dev_valid_name(const char *name); static inline bool is_socket_ioctl_cmd(unsigned int cmd) { return _IOC_TYPE(cmd) == SOCK_IOC_TYPE; } int get_user_ifreq(struct ifreq *ifr, void __user **ifrdata, void __user *arg); int put_user_ifreq(struct ifreq *ifr, void __user *arg); int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, void __user *data, bool *need_copyout); int dev_ifconf(struct net *net, struct ifconf __user *ifc); int dev_eth_ioctl(struct net_device *dev, struct ifreq *ifr, unsigned int cmd); int generic_hwtstamp_get_lower(struct net_device *dev, struct kernel_hwtstamp_config *kernel_cfg); int generic_hwtstamp_set_lower(struct net_device *dev, struct kernel_hwtstamp_config *kernel_cfg, struct netlink_ext_ack *extack); int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *userdata); unsigned int netif_get_flags(const struct net_device *dev); int __dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack); int netif_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack); int dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack); int netif_set_alias(struct net_device *dev, const char *alias, size_t len); int dev_set_alias(struct net_device *, const char *, size_t); int dev_get_alias(const struct net_device *, char *, size_t); int __dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat, int new_ifindex, struct netlink_ext_ack *extack); int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat); int __netif_set_mtu(struct net_device *dev, int new_mtu); int netif_set_mtu(struct net_device *dev, int new_mtu); int dev_set_mtu(struct net_device *, int); int netif_pre_changeaddr_notify(struct net_device *dev, const char *addr, struct 
netlink_ext_ack *extack); int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack); int dev_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack); int dev_set_mac_address_user(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack); int netif_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name); int netif_get_port_parent_id(struct net_device *dev, struct netdev_phys_item_id *ppid, bool recurse); bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); u8 dev_xdp_prog_count(struct net_device *dev); int netif_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf); int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf); u8 dev_xdp_sb_prog_count(struct net_device *dev); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode); u32 dev_get_min_mp_channel_count(const struct net_device *dev); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb); bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb); static __always_inline bool __is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb, const bool check_mtu) { const u32 vlan_hdr_len = 4; /* VLAN_HLEN */ unsigned int len; if (!(dev->flags & IFF_UP)) return false; if (!check_mtu) return true; len = dev->mtu + dev->hard_header_len + vlan_hdr_len; if (skb->len <= len) return true; /* if TSO is enabled, we don't care about the length as the packet * could be forwarded without being segmented before */ if (skb_is_gso(skb)) return true; return false; } void netdev_core_stats_inc(struct net_device *dev, u32 offset); #define DEV_CORE_STATS_INC(FIELD) \ static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev) \ { \ netdev_core_stats_inc(dev, \ offsetof(struct net_device_core_stats, FIELD)); \ } DEV_CORE_STATS_INC(rx_dropped) DEV_CORE_STATS_INC(tx_dropped) DEV_CORE_STATS_INC(rx_nohandler) DEV_CORE_STATS_INC(rx_otherhost_dropped) #undef DEV_CORE_STATS_INC static __always_inline int ____dev_forward_skb(struct net_device *dev, struct sk_buff *skb, const bool check_mtu) { if (skb_orphan_frags(skb, GFP_ATOMIC) || unlikely(!__is_skb_forwardable(dev, skb, check_mtu))) { dev_core_stats_rx_dropped_inc(dev); kfree_skb(skb); return NET_RX_DROP; } skb_scrub_packet(skb, !net_eq(dev_net(dev), dev_net(skb->dev))); skb->priority = 0; return 0; } bool dev_nit_active_rcu(const struct net_device *dev); static inline bool dev_nit_active(const struct net_device *dev) { bool ret; rcu_read_lock(); ret = dev_nit_active_rcu(dev); rcu_read_unlock(); return ret; } void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); static inline void __dev_put(struct net_device *dev) { if (dev) { #ifdef CONFIG_PCPU_DEV_REFCNT this_cpu_dec(*dev->pcpu_refcnt); #else refcount_dec(&dev->dev_refcnt); #endif } } static inline void __dev_hold(struct net_device *dev) { if (dev) { #ifdef CONFIG_PCPU_DEV_REFCNT this_cpu_inc(*dev->pcpu_refcnt); #else refcount_inc(&dev->dev_refcnt); #endif } } static 
inline void __netdev_tracker_alloc(struct net_device *dev, netdevice_tracker *tracker, gfp_t gfp) { #ifdef CONFIG_NET_DEV_REFCNT_TRACKER ref_tracker_alloc(&dev->refcnt_tracker, tracker, gfp); #endif } /* netdev_tracker_alloc() can upgrade a prior untracked reference * taken by dev_get_by_name()/dev_get_by_index() to a tracked one. */ static inline void netdev_tracker_alloc(struct net_device *dev, netdevice_tracker *tracker, gfp_t gfp) { #ifdef CONFIG_NET_DEV_REFCNT_TRACKER refcount_dec(&dev->refcnt_tracker.no_tracker); __netdev_tracker_alloc(dev, tracker, gfp); #endif } static inline void netdev_tracker_free(struct net_device *dev, netdevice_tracker *tracker) { #ifdef CONFIG_NET_DEV_REFCNT_TRACKER ref_tracker_free(&dev->refcnt_tracker, tracker); #endif } static inline void netdev_hold(struct net_device *dev, netdevice_tracker *tracker, gfp_t gfp) { if (dev) { __dev_hold(dev); __netdev_tracker_alloc(dev, tracker, gfp); } } static inline void netdev_put(struct net_device *dev, netdevice_tracker *tracker) { if (dev) { netdev_tracker_free(dev, tracker); __dev_put(dev); } } /** * dev_hold - get reference to device * @dev: network device * * Hold reference to device to keep it from being freed. * Try using netdev_hold() instead. */ static inline void dev_hold(struct net_device *dev) { netdev_hold(dev, NULL, GFP_ATOMIC); } /** * dev_put - release reference to device * @dev: network device * * Release reference to device to allow it to be freed. * Try using netdev_put() instead. */ static inline void dev_put(struct net_device *dev) { netdev_put(dev, NULL); } DEFINE_FREE(dev_put, struct net_device *, if (_T) dev_put(_T)) static inline void netdev_ref_replace(struct net_device *odev, struct net_device *ndev, netdevice_tracker *tracker, gfp_t gfp) { if (odev) netdev_tracker_free(odev, tracker); __dev_hold(ndev); __dev_put(odev); if (ndev) __netdev_tracker_alloc(ndev, tracker, gfp); } /* Carrier loss detection, dial on demand. The functions netif_carrier_on * and _off may be called from IRQ context, but it is caller * who is responsible for serialization of these calls. * * The name carrier is inappropriate, these functions should really be * called netif_lowerlayer_*() because they represent the state of any * kind of lower layer not just hardware media. */ void linkwatch_fire_event(struct net_device *dev); /** * linkwatch_sync_dev - sync linkwatch for the given device * @dev: network device to sync linkwatch for * * Sync linkwatch for the given device, removing it from the * pending work list (if queued). */ void linkwatch_sync_dev(struct net_device *dev); void __linkwatch_sync_dev(struct net_device *dev); /** * netif_carrier_ok - test if carrier present * @dev: network device * * Check if carrier is present on device */ static inline bool netif_carrier_ok(const struct net_device *dev) { return !test_bit(__LINK_STATE_NOCARRIER, &dev->state); } unsigned long dev_trans_start(struct net_device *dev); void netdev_watchdog_up(struct net_device *dev); void netif_carrier_on(struct net_device *dev); void netif_carrier_off(struct net_device *dev); void netif_carrier_event(struct net_device *dev); /** * netif_dormant_on - mark device as dormant. * @dev: network device * * Mark device as dormant (as per RFC2863). * * The dormant state indicates that the relevant interface is not * actually in a condition to pass packets (i.e., it is not 'up') but is * in a "pending" state, waiting for some external event. 
For "on- * demand" interfaces, this new state identifies the situation where the * interface is waiting for events to place it in the up state. */ static inline void netif_dormant_on(struct net_device *dev) { if (!test_and_set_bit(__LINK_STATE_DORMANT, &dev->state)) linkwatch_fire_event(dev); } /** * netif_dormant_off - set device as not dormant. * @dev: network device * * Device is not in dormant state. */ static inline void netif_dormant_off(struct net_device *dev) { if (test_and_clear_bit(__LINK_STATE_DORMANT, &dev->state)) linkwatch_fire_event(dev); } /** * netif_dormant - test if device is dormant * @dev: network device * * Check if device is dormant. */ static inline bool netif_dormant(const struct net_device *dev) { return test_bit(__LINK_STATE_DORMANT, &dev->state); } /** * netif_testing_on - mark device as under test. * @dev: network device * * Mark device as under test (as per RFC2863). * * The testing state indicates that some test(s) must be performed on * the interface. After completion, of the test, the interface state * will change to up, dormant, or down, as appropriate. */ static inline void netif_testing_on(struct net_device *dev) { if (!test_and_set_bit(__LINK_STATE_TESTING, &dev->state)) linkwatch_fire_event(dev); } /** * netif_testing_off - set device as not under test. * @dev: network device * * Device is not in testing state. */ static inline void netif_testing_off(struct net_device *dev) { if (test_and_clear_bit(__LINK_STATE_TESTING, &dev->state)) linkwatch_fire_event(dev); } /** * netif_testing - test if device is under test * @dev: network device * * Check if device is under test */ static inline bool netif_testing(const struct net_device *dev) { return test_bit(__LINK_STATE_TESTING, &dev->state); } /** * netif_oper_up - test if device is operational * @dev: network device * * Check if carrier is operational */ static inline bool netif_oper_up(const struct net_device *dev) { unsigned int operstate = READ_ONCE(dev->operstate); return operstate == IF_OPER_UP || operstate == IF_OPER_UNKNOWN /* backward compat */; } /** * netif_device_present - is device available or removed * @dev: network device * * Check if device has not been removed from system. 
*/ static inline bool netif_device_present(const struct net_device *dev) { return test_bit(__LINK_STATE_PRESENT, &dev->state); } void netif_device_detach(struct net_device *dev); void netif_device_attach(struct net_device *dev); /* * Network interface message level settings */ enum { NETIF_MSG_DRV_BIT, NETIF_MSG_PROBE_BIT, NETIF_MSG_LINK_BIT, NETIF_MSG_TIMER_BIT, NETIF_MSG_IFDOWN_BIT, NETIF_MSG_IFUP_BIT, NETIF_MSG_RX_ERR_BIT, NETIF_MSG_TX_ERR_BIT, NETIF_MSG_TX_QUEUED_BIT, NETIF_MSG_INTR_BIT, NETIF_MSG_TX_DONE_BIT, NETIF_MSG_RX_STATUS_BIT, NETIF_MSG_PKTDATA_BIT, NETIF_MSG_HW_BIT, NETIF_MSG_WOL_BIT, /* When you add a new bit above, update netif_msg_class_names array * in net/ethtool/common.c */ NETIF_MSG_CLASS_COUNT, }; /* Both ethtool_ops interface and internal driver implementation use u32 */ static_assert(NETIF_MSG_CLASS_COUNT <= 32); #define __NETIF_MSG_BIT(bit) ((u32)1 << (bit)) #define __NETIF_MSG(name) __NETIF_MSG_BIT(NETIF_MSG_ ## name ## _BIT) #define NETIF_MSG_DRV __NETIF_MSG(DRV) #define NETIF_MSG_PROBE __NETIF_MSG(PROBE) #define NETIF_MSG_LINK __NETIF_MSG(LINK) #define NETIF_MSG_TIMER __NETIF_MSG(TIMER) #define NETIF_MSG_IFDOWN __NETIF_MSG(IFDOWN) #define NETIF_MSG_IFUP __NETIF_MSG(IFUP) #define NETIF_MSG_RX_ERR __NETIF_MSG(RX_ERR) #define NETIF_MSG_TX_ERR __NETIF_MSG(TX_ERR) #define NETIF_MSG_TX_QUEUED __NETIF_MSG(TX_QUEUED) #define NETIF_MSG_INTR __NETIF_MSG(INTR) #define NETIF_MSG_TX_DONE __NETIF_MSG(TX_DONE) #define NETIF_MSG_RX_STATUS __NETIF_MSG(RX_STATUS) #define NETIF_MSG_PKTDATA __NETIF_MSG(PKTDATA) #define NETIF_MSG_HW __NETIF_MSG(HW) #define NETIF_MSG_WOL __NETIF_MSG(WOL) #define netif_msg_drv(p) ((p)->msg_enable & NETIF_MSG_DRV) #define netif_msg_probe(p) ((p)->msg_enable & NETIF_MSG_PROBE) #define netif_msg_link(p) ((p)->msg_enable & NETIF_MSG_LINK) #define netif_msg_timer(p) ((p)->msg_enable & NETIF_MSG_TIMER) #define netif_msg_ifdown(p) ((p)->msg_enable & NETIF_MSG_IFDOWN) #define netif_msg_ifup(p) ((p)->msg_enable & NETIF_MSG_IFUP) #define netif_msg_rx_err(p) ((p)->msg_enable & NETIF_MSG_RX_ERR) #define netif_msg_tx_err(p) ((p)->msg_enable & NETIF_MSG_TX_ERR) #define netif_msg_tx_queued(p) ((p)->msg_enable & NETIF_MSG_TX_QUEUED) #define netif_msg_intr(p) ((p)->msg_enable & NETIF_MSG_INTR) #define netif_msg_tx_done(p) ((p)->msg_enable & NETIF_MSG_TX_DONE) #define netif_msg_rx_status(p) ((p)->msg_enable & NETIF_MSG_RX_STATUS) #define netif_msg_pktdata(p) ((p)->msg_enable & NETIF_MSG_PKTDATA) #define netif_msg_hw(p) ((p)->msg_enable & NETIF_MSG_HW) #define netif_msg_wol(p) ((p)->msg_enable & NETIF_MSG_WOL) static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits) { /* use default */ if (debug_value < 0 || debug_value >= (sizeof(u32) * 8)) return default_msg_enable_bits; if (debug_value == 0) /* no output */ return 0; /* set low N bits */ return (1U << debug_value) - 1; } static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu) { spin_lock(&txq->_xmit_lock); /* Pairs with READ_ONCE() in netif_tx_owned() */ WRITE_ONCE(txq->xmit_lock_owner, cpu); } static inline bool __netif_tx_acquire(struct netdev_queue *txq) { __acquire(&txq->_xmit_lock); return true; } static inline void __netif_tx_release(struct netdev_queue *txq) { __release(&txq->_xmit_lock); } static inline void __netif_tx_lock_bh(struct netdev_queue *txq) { spin_lock_bh(&txq->_xmit_lock); /* Pairs with READ_ONCE() in netif_tx_owned() */ WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); } static inline bool __netif_tx_trylock(struct netdev_queue *txq) { bool ok = 
spin_trylock(&txq->_xmit_lock); if (likely(ok)) { /* Pairs with READ_ONCE() in netif_tx_owned() */ WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); } return ok; } static inline void __netif_tx_unlock(struct netdev_queue *txq) { /* Pairs with READ_ONCE() in netif_tx_owned() */ WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock(&txq->_xmit_lock); } static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) { /* Pairs with READ_ONCE() in netif_tx_owned() */ WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock_bh(&txq->_xmit_lock); } /* * txq->trans_start can be read locklessly from dev_watchdog() */ static inline void txq_trans_update(const struct net_device *dev, struct netdev_queue *txq) { if (!dev->lltx) WRITE_ONCE(txq->trans_start, jiffies); } static inline void txq_trans_cond_update(struct netdev_queue *txq) { unsigned long now = jiffies; if (READ_ONCE(txq->trans_start) != now) WRITE_ONCE(txq->trans_start, now); } /* legacy drivers only, netdev_start_xmit() sets txq->trans_start */ static inline void netif_trans_update(struct net_device *dev) { struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); txq_trans_cond_update(txq); } /** * netif_tx_lock - grab network device transmit lock * @dev: network device * * Get network device transmit lock */ void netif_tx_lock(struct net_device *dev); static inline void netif_tx_lock_bh(struct net_device *dev) { local_bh_disable(); netif_tx_lock(dev); } void netif_tx_unlock(struct net_device *dev); static inline void netif_tx_unlock_bh(struct net_device *dev) { netif_tx_unlock(dev); local_bh_enable(); } #define HARD_TX_LOCK(dev, txq, cpu) { \ if (!(dev)->lltx) { \ __netif_tx_lock(txq, cpu); \ } else { \ __netif_tx_acquire(txq); \ } \ } #define HARD_TX_TRYLOCK(dev, txq) \ (!(dev)->lltx ? \ __netif_tx_trylock(txq) : \ __netif_tx_acquire(txq)) #define HARD_TX_UNLOCK(dev, txq) { \ if (!(dev)->lltx) { \ __netif_tx_unlock(txq); \ } else { \ __netif_tx_release(txq); \ } \ } static inline void netif_tx_disable(struct net_device *dev) { unsigned int i; int cpu; local_bh_disable(); cpu = smp_processor_id(); spin_lock(&dev->tx_global_lock); for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); __netif_tx_lock(txq, cpu); netif_tx_stop_queue(txq); __netif_tx_unlock(txq); } spin_unlock(&dev->tx_global_lock); local_bh_enable(); } #ifndef CONFIG_PREEMPT_RT static inline bool netif_tx_owned(struct netdev_queue *txq, unsigned int cpu) { /* Other cpus might concurrently change txq->xmit_lock_owner * to -1 or to their cpu id, but not to our id. */ return READ_ONCE(txq->xmit_lock_owner) == cpu; } #else static inline bool netif_tx_owned(struct netdev_queue *txq, unsigned int cpu) { return rt_mutex_owner(&txq->_xmit_lock.lock) == current; } #endif static inline void netif_addr_lock(struct net_device *dev) { unsigned char nest_level = 0; #ifdef CONFIG_LOCKDEP nest_level = dev->nested_level; #endif spin_lock_nested(&dev->addr_list_lock, nest_level); } static inline void netif_addr_lock_bh(struct net_device *dev) { unsigned char nest_level = 0; #ifdef CONFIG_LOCKDEP nest_level = dev->nested_level; #endif local_bh_disable(); spin_lock_nested(&dev->addr_list_lock, nest_level); } static inline void netif_addr_unlock(struct net_device *dev) { spin_unlock(&dev->addr_list_lock); } static inline void netif_addr_unlock_bh(struct net_device *dev) { spin_unlock_bh(&dev->addr_list_lock); } /* * dev_addrs walker. Should be used only for read access. Call with * rcu_read_lock held. 
*/ #define for_each_dev_addr(dev, ha) \ list_for_each_entry_rcu(ha, &dev->dev_addrs.list, list) /* These functions live elsewhere (drivers/net/net_init.c, but related) */ void ether_setup(struct net_device *dev); /* Allocate dummy net_device */ struct net_device *alloc_netdev_dummy(int sizeof_priv); /* Support for loadable net-drivers */ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *), unsigned int txqs, unsigned int rxqs); #define alloc_netdev(sizeof_priv, name, name_assign_type, setup) \ alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, 1, 1) #define alloc_netdev_mq(sizeof_priv, name, name_assign_type, setup, count) \ alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, count, \ count) int register_netdev(struct net_device *dev); void unregister_netdev(struct net_device *dev); int devm_register_netdev(struct device *dev, struct net_device *ndev); /* General hardware address lists handling functions */ int __hw_addr_sync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len); int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len); void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len); int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*sync)(struct net_device *, const unsigned char *), int (*unsync)(struct net_device *, const unsigned char *)); int __hw_addr_ref_sync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*sync)(struct net_device *, const unsigned char *, int), int (*unsync)(struct net_device *, const unsigned char *, int)); void __hw_addr_ref_unsync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*unsync)(struct net_device *, const unsigned char *, int)); void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*unsync)(struct net_device *, const unsigned char *)); void __hw_addr_init(struct netdev_hw_addr_list *list); /* Functions used for device addresses handling */ void dev_addr_mod(struct net_device *dev, unsigned int offset, const void *addr, size_t len); static inline void __dev_addr_set(struct net_device *dev, const void *addr, size_t len) { dev_addr_mod(dev, 0, addr, len); } static inline void dev_addr_set(struct net_device *dev, const u8 *addr) { __dev_addr_set(dev, addr, dev->addr_len); } int dev_addr_add(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); int dev_addr_del(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); /* Functions used for unicast addresses handling */ int dev_uc_add(struct net_device *dev, const unsigned char *addr); int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr); int dev_uc_del(struct net_device *dev, const unsigned char *addr); int dev_uc_sync(struct net_device *to, struct net_device *from); int dev_uc_sync_multiple(struct net_device *to, struct net_device *from); void dev_uc_unsync(struct net_device *to, struct net_device *from); void dev_uc_flush(struct net_device *dev); void dev_uc_init(struct net_device *dev); /** * __dev_uc_sync - Synchronize device's unicast list * @dev: device to sync * @sync: function to call if address should be added * @unsync: function to call if address should be removed * * Add newly added addresses to the interface, and release * addresses that have been deleted. 
*/ static inline int __dev_uc_sync(struct net_device *dev, int (*sync)(struct net_device *, const unsigned char *), int (*unsync)(struct net_device *, const unsigned char *)) { return __hw_addr_sync_dev(&dev->uc, dev, sync, unsync); } /** * __dev_uc_unsync - Remove synchronized addresses from device * @dev: device to sync * @unsync: function to call if address should be removed * * Remove all addresses that were added to the device by dev_uc_sync(). */ static inline void __dev_uc_unsync(struct net_device *dev, int (*unsync)(struct net_device *, const unsigned char *)) { __hw_addr_unsync_dev(&dev->uc, dev, unsync); } /* Functions used for multicast addresses handling */ int dev_mc_add(struct net_device *dev, const unsigned char *addr); int dev_mc_add_global(struct net_device *dev, const unsigned char *addr); int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr); int dev_mc_del(struct net_device *dev, const unsigned char *addr); int dev_mc_del_global(struct net_device *dev, const unsigned char *addr); int dev_mc_sync(struct net_device *to, struct net_device *from); int dev_mc_sync_multiple(struct net_device *to, struct net_device *from); void dev_mc_unsync(struct net_device *to, struct net_device *from); void dev_mc_flush(struct net_device *dev); void dev_mc_init(struct net_device *dev); /** * __dev_mc_sync - Synchronize device's multicast list * @dev: device to sync * @sync: function to call if address should be added * @unsync: function to call if address should be removed * * Add newly added addresses to the interface, and release * addresses that have been deleted. */ static inline int __dev_mc_sync(struct net_device *dev, int (*sync)(struct net_device *, const unsigned char *), int (*unsync)(struct net_device *, const unsigned char *)) { return __hw_addr_sync_dev(&dev->mc, dev, sync, unsync); } /** * __dev_mc_unsync - Remove synchronized addresses from device * @dev: device to sync * @unsync: function to call if address should be removed * * Remove all addresses that were added to the device by dev_mc_sync(). 
*/ static inline void __dev_mc_unsync(struct net_device *dev, int (*unsync)(struct net_device *, const unsigned char *)) { __hw_addr_unsync_dev(&dev->mc, dev, unsync); } /* Functions used for secondary unicast and multicast support */ void dev_set_rx_mode(struct net_device *dev); int netif_set_promiscuity(struct net_device *dev, int inc); int dev_set_promiscuity(struct net_device *dev, int inc); int netif_set_allmulti(struct net_device *dev, int inc, bool notify); int dev_set_allmulti(struct net_device *dev, int inc); void netif_state_change(struct net_device *dev); void netdev_state_change(struct net_device *dev); void __netdev_notify_peers(struct net_device *dev); void netdev_notify_peers(struct net_device *dev); void netdev_features_change(struct net_device *dev); /* Load a device via the kmod */ void dev_load(struct net *net, const char *name); struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, struct rtnl_link_stats64 *storage); void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, const struct net_device_stats *netdev_stats); void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, const struct pcpu_sw_netstats __percpu *netstats); void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s); enum { NESTED_SYNC_IMM_BIT, NESTED_SYNC_TODO_BIT, }; #define __NESTED_SYNC_BIT(bit) ((u32)1 << (bit)) #define __NESTED_SYNC(name) __NESTED_SYNC_BIT(NESTED_SYNC_ ## name ## _BIT) #define NESTED_SYNC_IMM __NESTED_SYNC(IMM) #define NESTED_SYNC_TODO __NESTED_SYNC(TODO) struct netdev_nested_priv { unsigned char flags; void *data; }; bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev); struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, struct list_head **iter); /* iterate through upper list, must be called under RCU read lock */ #define netdev_for_each_upper_dev_rcu(dev, updev, iter) \ for (iter = &(dev)->adj_list.upper, \ updev = netdev_upper_get_next_dev_rcu(dev, &(iter)); \ updev; \ updev = netdev_upper_get_next_dev_rcu(dev, &(iter))) int netdev_walk_all_upper_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *upper_dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv); bool netdev_has_upper_dev_all_rcu(struct net_device *dev, struct net_device *upper_dev); bool netdev_has_any_upper_dev(struct net_device *dev); void *netdev_lower_get_next_private(struct net_device *dev, struct list_head **iter); void *netdev_lower_get_next_private_rcu(struct net_device *dev, struct list_head **iter); #define netdev_for_each_lower_private(dev, priv, iter) \ for (iter = (dev)->adj_list.lower.next, \ priv = netdev_lower_get_next_private(dev, &(iter)); \ priv; \ priv = netdev_lower_get_next_private(dev, &(iter))) #define netdev_for_each_lower_private_rcu(dev, priv, iter) \ for (iter = &(dev)->adj_list.lower, \ priv = netdev_lower_get_next_private_rcu(dev, &(iter)); \ priv; \ priv = netdev_lower_get_next_private_rcu(dev, &(iter))) void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter); #define netdev_for_each_lower_dev(dev, ldev, iter) \ for (iter = (dev)->adj_list.lower.next, \ ldev = netdev_lower_get_next(dev, &(iter)); \ ldev; \ ldev = netdev_lower_get_next(dev, &(iter))) struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, struct list_head **iter); int netdev_walk_all_lower_dev(struct net_device *dev, int (*fn)(struct net_device *lower_dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv); int netdev_walk_all_lower_dev_rcu(struct net_device *dev, 
int (*fn)(struct net_device *lower_dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv); void *netdev_adjacent_get_private(struct list_head *adj_list); void *netdev_lower_get_first_private_rcu(struct net_device *dev); struct net_device *netdev_master_upper_dev_get(struct net_device *dev); struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev); int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, struct netlink_ext_ack *extack); int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, void *upper_priv, void *upper_info, struct netlink_ext_ack *extack); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev); int netdev_adjacent_change_prepare(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev, struct netlink_ext_ack *extack); void netdev_adjacent_change_commit(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev); void netdev_adjacent_change_abort(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev); void netdev_adjacent_rename_links(struct net_device *dev, char *oldname); void *netdev_lower_dev_get_private(struct net_device *dev, struct net_device *lower_dev); void netdev_lower_state_changed(struct net_device *lower_dev, void *lower_state_info); #define NETDEV_RSS_KEY_LEN 256 extern u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly; void netdev_rss_key_fill(void *buffer, size_t len); int skb_checksum_help(struct sk_buff *skb); int skb_crc32c_csum_help(struct sk_buff *skb); int skb_csum_hwoffload_help(struct sk_buff *skb, const netdev_features_t features); struct netdev_bonding_info { ifslave slave; ifbond master; }; struct netdev_notifier_bonding_info { struct netdev_notifier_info info; /* must be first */ struct netdev_bonding_info bonding_info; }; void netdev_bonding_info_change(struct net_device *dev, struct netdev_bonding_info *bonding_info); #if IS_ENABLED(CONFIG_ETHTOOL_NETLINK) void ethtool_notify(struct net_device *dev, unsigned int cmd); #else static inline void ethtool_notify(struct net_device *dev, unsigned int cmd) { } #endif __be16 skb_network_protocol(struct sk_buff *skb, int *depth); static inline bool can_checksum_protocol(netdev_features_t features, __be16 protocol) { if (protocol == htons(ETH_P_FCOE)) return !!(features & NETIF_F_FCOE_CRC); /* Assume this is an IP checksum (not SCTP CRC) */ if (features & NETIF_F_HW_CSUM) { /* Can checksum everything */ return true; } switch (protocol) { case htons(ETH_P_IP): return !!(features & NETIF_F_IP_CSUM); case htons(ETH_P_IPV6): return !!(features & NETIF_F_IPV6_CSUM); default: return false; } } #ifdef CONFIG_BUG void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb); #else static inline void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { } #endif /* rx skb timestamps */ void net_enable_timestamp(void); void net_disable_timestamp(void); static inline ktime_t netdev_get_tstamp(struct net_device *dev, const struct skb_shared_hwtstamps *hwtstamps, bool cycles) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_get_tstamp) return ops->ndo_get_tstamp(dev, hwtstamps, cycles); return hwtstamps->hwtstamp; } #ifndef CONFIG_PREEMPT_RT static inline void netdev_xmit_set_more(bool more) { __this_cpu_write(softnet_data.xmit.more, more); } static inline bool netdev_xmit_more(void) { return __this_cpu_read(softnet_data.xmit.more); } #else static inline void netdev_xmit_set_more(bool more) { 
current->net_xmit.more = more; } static inline bool netdev_xmit_more(void) { return current->net_xmit.more; } #endif static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, struct sk_buff *skb, struct net_device *dev, bool more) { netdev_xmit_set_more(more); return ops->ndo_start_xmit(skb, dev); } static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, bool more) { const struct net_device_ops *ops = dev->netdev_ops; netdev_tx_t rc; rc = __netdev_start_xmit(ops, skb, dev, more); if (rc == NETDEV_TX_OK) txq_trans_update(dev, txq); return rc; } int netdev_class_create_file_ns(const struct class_attribute *class_attr, const struct ns_common *ns); void netdev_class_remove_file_ns(const struct class_attribute *class_attr, const struct ns_common *ns); extern const struct kobj_ns_type_operations net_ns_type_operations; const char *netdev_drivername(const struct net_device *dev); static inline netdev_features_t netdev_intersect_features(netdev_features_t f1, netdev_features_t f2) { if ((f1 ^ f2) & NETIF_F_HW_CSUM) { if (f1 & NETIF_F_HW_CSUM) f1 |= (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); else f2 |= (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); } return f1 & f2; } static inline netdev_features_t netdev_get_wanted_features( struct net_device *dev) { return (dev->features & ~dev->hw_features) | dev->wanted_features; } netdev_features_t netdev_increment_features(netdev_features_t all, netdev_features_t one, netdev_features_t mask); /* Allow TSO being used on stacked device : * Performing the GSO segmentation before last device * is a performance improvement. */ static inline netdev_features_t netdev_add_tso_features(netdev_features_t features, netdev_features_t mask) { return netdev_increment_features(features, NETIF_F_ALL_TSO | NETIF_F_ALL_FOR_ALL, mask); } int __netdev_update_features(struct net_device *dev); void netdev_update_features(struct net_device *dev); void netdev_change_features(struct net_device *dev); void netdev_compute_master_upper_features(struct net_device *dev, bool update_header); void netif_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev); netdev_features_t passthru_features_check(struct sk_buff *skb, struct net_device *dev, netdev_features_t features); netdev_features_t netif_skb_features(struct sk_buff *skb); void skb_warn_bad_offload(const struct sk_buff *skb); static inline bool net_gso_ok(netdev_features_t features, int gso_type) { netdev_features_t feature; if (gso_type & (SKB_GSO_TCP_FIXEDID | SKB_GSO_TCP_FIXEDID_INNER)) gso_type |= __SKB_GSO_TCP_FIXEDID; feature = ((netdev_features_t)gso_type << NETIF_F_GSO_SHIFT) & NETIF_F_GSO_MASK; /* check flags correspondence */ BUILD_BUG_ON(SKB_GSO_TCPV4 != (NETIF_F_TSO >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_DODGY != (NETIF_F_GSO_ROBUST >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TCP_ECN != (NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(__SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_MANGLEID >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TCPV6 != (NETIF_F_TSO6 >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_FCOE != (NETIF_F_FSO >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_GRE != (NETIF_F_GSO_GRE >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_GRE_CSUM != (NETIF_F_GSO_GRE_CSUM >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_IPXIP4 != (NETIF_F_GSO_IPXIP4 >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_IPXIP6 != (NETIF_F_GSO_IPXIP6 >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT)); 
BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_SCTP != (NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_ESP != (NETIF_F_GSO_ESP >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_GSO_UDP >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_UDP_L4 != (NETIF_F_GSO_UDP_L4 >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_FRAGLIST != (NETIF_F_GSO_FRAGLIST >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TCP_ACCECN != (NETIF_F_GSO_ACCECN >> NETIF_F_GSO_SHIFT)); return (features & feature) == feature; } static inline bool skb_gso_ok(struct sk_buff *skb, netdev_features_t features) { return net_gso_ok(features, skb_shinfo(skb)->gso_type) && (!skb_has_frag_list(skb) || (features & NETIF_F_FRAGLIST)); } static inline bool netif_needs_gso(struct sk_buff *skb, netdev_features_t features) { return skb_is_gso(skb) && (!skb_gso_ok(skb, features) || unlikely((skb->ip_summed != CHECKSUM_PARTIAL) && (skb->ip_summed != CHECKSUM_UNNECESSARY))); } void netif_set_tso_max_size(struct net_device *dev, unsigned int size); void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs); void netif_inherit_tso_max(struct net_device *to, const struct net_device *from); static inline unsigned int netif_get_gro_max_size(const struct net_device *dev, const struct sk_buff *skb) { /* pairs with WRITE_ONCE() in netif_set_gro(_ipv4)_max_size() */ return skb->protocol == htons(ETH_P_IPV6) ? READ_ONCE(dev->gro_max_size) : READ_ONCE(dev->gro_ipv4_max_size); } static inline unsigned int netif_get_gso_max_size(const struct net_device *dev, const struct sk_buff *skb) { /* pairs with WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ return skb->protocol == htons(ETH_P_IPV6) ? 
READ_ONCE(dev->gso_max_size) : READ_ONCE(dev->gso_ipv4_max_size); } static inline bool netif_is_macsec(const struct net_device *dev) { return dev->priv_flags & IFF_MACSEC; } static inline bool netif_is_macvlan(const struct net_device *dev) { return dev->priv_flags & IFF_MACVLAN; } static inline bool netif_is_macvlan_port(const struct net_device *dev) { return dev->priv_flags & IFF_MACVLAN_PORT; } static inline bool netif_is_bond_master(const struct net_device *dev) { return dev->flags & IFF_MASTER && dev->priv_flags & IFF_BONDING; } static inline bool netif_is_bond_slave(const struct net_device *dev) { return dev->flags & IFF_SLAVE && dev->priv_flags & IFF_BONDING; } static inline bool netif_supports_nofcs(struct net_device *dev) { return dev->priv_flags & IFF_SUPP_NOFCS; } static inline bool netif_has_l3_rx_handler(const struct net_device *dev) { return dev->priv_flags & IFF_L3MDEV_RX_HANDLER; } static inline bool netif_is_l3_master(const struct net_device *dev) { return dev->priv_flags & IFF_L3MDEV_MASTER; } static inline bool netif_is_l3_slave(const struct net_device *dev) { return dev->priv_flags & IFF_L3MDEV_SLAVE; } static inline int dev_sdif(const struct net_device *dev) { #ifdef CONFIG_NET_L3_MASTER_DEV if (netif_is_l3_slave(dev)) return dev->ifindex; #endif return 0; } static inline bool netif_is_bridge_master(const struct net_device *dev) { return dev->priv_flags & IFF_EBRIDGE; } static inline bool netif_is_bridge_port(const struct net_device *dev) { return dev->priv_flags & IFF_BRIDGE_PORT; } static inline bool netif_is_ovs_master(const struct net_device *dev) { return dev->priv_flags & IFF_OPENVSWITCH; } static inline bool netif_is_ovs_port(const struct net_device *dev) { return dev->priv_flags & IFF_OVS_DATAPATH; } static inline bool netif_is_any_bridge_master(const struct net_device *dev) { return netif_is_bridge_master(dev) || netif_is_ovs_master(dev); } static inline bool netif_is_any_bridge_port(const struct net_device *dev) { return netif_is_bridge_port(dev) || netif_is_ovs_port(dev); } static inline bool netif_is_team_master(const struct net_device *dev) { return dev->priv_flags & IFF_TEAM; } static inline bool netif_is_team_port(const struct net_device *dev) { return dev->priv_flags & IFF_TEAM_PORT; } static inline bool netif_is_lag_master(const struct net_device *dev) { return netif_is_bond_master(dev) || netif_is_team_master(dev); } static inline bool netif_is_lag_port(const struct net_device *dev) { return netif_is_bond_slave(dev) || netif_is_team_port(dev); } bool netif_is_rxfh_configured(const struct net_device *dev); static inline bool netif_is_failover(const struct net_device *dev) { return dev->priv_flags & IFF_FAILOVER; } static inline bool netif_is_failover_slave(const struct net_device *dev) { return dev->priv_flags & IFF_FAILOVER_SLAVE; } /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ static inline void netif_keep_dst(struct net_device *dev) { dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM); } /* return true if dev can't cope with mtu frames that need vlan tag insertion */ static inline bool netif_reduces_vlan_mtu(struct net_device *dev) { /* TODO: reserve and use an additional IFF bit, if we get more users */ return netif_is_macsec(dev); } extern struct pernet_operations __net_initdata loopback_net_ops; /* Logging, debugging and troubleshooting/diagnostic helpers. 
*/ /* netdev_printk helpers, similar to dev_printk */ static inline const char *netdev_name(const struct net_device *dev) { if (!dev->name[0] || strchr(dev->name, '%')) return "(unnamed net_device)"; return dev->name; } static inline const char *netdev_reg_state(const struct net_device *dev) { u8 reg_state = READ_ONCE(dev->reg_state); switch (reg_state) { case NETREG_UNINITIALIZED: return " (uninitialized)"; case NETREG_REGISTERED: return ""; case NETREG_UNREGISTERING: return " (unregistering)"; case NETREG_UNREGISTERED: return " (unregistered)"; case NETREG_RELEASED: return " (released)"; case NETREG_DUMMY: return " (dummy)"; } WARN_ONCE(1, "%s: unknown reg_state %d\n", dev->name, reg_state); return " (unknown)"; } #define MODULE_ALIAS_NETDEV(device) \ MODULE_ALIAS("netdev-" device) /* * netdev_WARN() acts like dev_printk(), but with the key difference * of using a WARN/WARN_ON to get the message out, including the * file/line information and a backtrace. */ #define netdev_WARN(dev, format, args...) \ WARN(1, "netdevice: %s%s: " format, netdev_name(dev), \ netdev_reg_state(dev), ##args) #define netdev_WARN_ONCE(dev, format, args...) \ WARN_ONCE(1, "netdevice: %s%s: " format, netdev_name(dev), \ netdev_reg_state(dev), ##args) /* * The list of packet types we will receive (as opposed to discard) * and the routines to invoke. * * Why 16. Because with 16 the only overlap we get on a hash of the * low nibble of the protocol value is RARP/SNAP/X.25. * * 0800 IP * 0001 802.3 * 0002 AX.25 * 0004 802.2 * 8035 RARP * 0005 SNAP * 0805 X.25 * 0806 ARP * 8137 IPX * 0009 Localtalk * 86DD IPv6 */ #define PTYPE_HASH_SIZE (16) #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1) extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; extern struct net_device *blackhole_netdev; /* Note: Avoid these macros in fast path, prefer per-cpu or per-queue counters. */ #define DEV_STATS_INC(DEV, FIELD) atomic_long_inc(&(DEV)->stats.__##FIELD) #define DEV_STATS_ADD(DEV, FIELD, VAL) \ atomic_long_add((VAL), &(DEV)->stats.__##FIELD) #define DEV_STATS_READ(DEV, FIELD) atomic_long_read(&(DEV)->stats.__##FIELD) #endif /* _LINUX_NETDEVICE_H */ |
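/*
 * Illustrative usage sketch (not part of the header above; my_ring, my_priv,
 * my_ring_free_slots and my_ring_post are hypothetical driver helpers):
 * a multi-queue driver typically pairs netif_stop_subqueue() in its
 * ndo_start_xmit path with netif_wake_subqueue() in TX completion, and uses
 * dev_consume_skb_irq() because completion usually runs in IRQ context.
 */
static netdev_tx_t my_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
        u16 q = skb_get_queue_mapping(skb);
        struct my_ring *ring = &my_priv(dev)->tx_rings[q];

        if (my_ring_free_slots(ring) < MAX_SKB_FRAGS + 1) {
                /* Ring is full: stop this subqueue until completions free slots. */
                netif_stop_subqueue(dev, q);
                return NETDEV_TX_BUSY;
        }

        my_ring_post(ring, skb);
        return NETDEV_TX_OK;
}

static void my_tx_complete(struct net_device *dev, u16 q, struct sk_buff *skb)
{
        /* consume_skb() is not allowed from hard IRQ context; use the _irq helper. */
        dev_consume_skb_irq(skb);

        if (__netif_subqueue_stopped(dev, q))
                netif_wake_subqueue(dev, q);
}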
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM timestamp

#if !defined(_TRACE_TIMESTAMP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_TIMESTAMP_H

#include <linux/tracepoint.h>
#include <linux/fs.h>

#define CTIME_QUERIED_FLAGS \
        { I_CTIME_QUERIED, "Q" }

DECLARE_EVENT_CLASS(ctime,
        TP_PROTO(struct inode *inode, struct timespec64 *ctime),
        TP_ARGS(inode, ctime),
        TP_STRUCT__entry(
                __field(u64, ino)
                __field(time64_t, ctime_s)
                __field(dev_t, dev)
                __field(u32, ctime_ns)
                __field(u32, gen)
        ),
        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->gen = inode->i_generation;
                __entry->ctime_s = ctime->tv_sec;
                __entry->ctime_ns = ctime->tv_nsec;
        ),
        TP_printk("ino=%d:%d:%llu:%u ctime=%lld.%u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino, __entry->gen,
                  __entry->ctime_s, __entry->ctime_ns
        )
);

DEFINE_EVENT(ctime, inode_set_ctime_to_ts,
        TP_PROTO(struct inode *inode, struct timespec64 *ctime),
        TP_ARGS(inode, ctime));

DEFINE_EVENT(ctime, ctime_xchg_skip,
        TP_PROTO(struct inode *inode, struct timespec64 *ctime),
        TP_ARGS(inode, ctime));

TRACE_EVENT(ctime_ns_xchg,
        TP_PROTO(struct inode *inode, u32 old, u32 new, u32 cur),
        TP_ARGS(inode, old, new, cur),
        TP_STRUCT__entry(
                __field(u64, ino)
                __field(dev_t, dev)
                __field(u32, gen)
                __field(u32, old)
                __field(u32, new)
                __field(u32, cur)
        ),
        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->gen = inode->i_generation;
                __entry->old = old;
                __entry->new = new;
                __entry->cur = cur;
        ),
        TP_printk("ino=%d:%d:%llu:%u old=%u:%s new=%u cur=%u:%s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino, __entry->gen,
                  __entry->old & ~I_CTIME_QUERIED,
                  __print_flags(__entry->old & I_CTIME_QUERIED, "|", CTIME_QUERIED_FLAGS),
                  __entry->new,
                  __entry->cur & ~I_CTIME_QUERIED,
                  __print_flags(__entry->cur & I_CTIME_QUERIED, "|", CTIME_QUERIED_FLAGS)
        )
);

TRACE_EVENT(fill_mg_cmtime,
        TP_PROTO(struct inode *inode, struct timespec64 *ctime, struct timespec64 *mtime),
        TP_ARGS(inode, ctime, mtime),
        TP_STRUCT__entry(
                __field(u64, ino)
                __field(time64_t, ctime_s)
                __field(time64_t, mtime_s)
                __field(dev_t, dev)
                __field(u32, ctime_ns)
                __field(u32, mtime_ns)
                __field(u32, gen)
        ),
        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->gen = inode->i_generation;
                __entry->ctime_s = ctime->tv_sec;
                __entry->mtime_s = mtime->tv_sec;
                __entry->ctime_ns = ctime->tv_nsec;
                __entry->mtime_ns = mtime->tv_nsec;
        ),
        TP_printk("ino=%d:%d:%llu:%u ctime=%lld.%u mtime=%lld.%u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino, __entry->gen,
                  __entry->ctime_s, __entry->ctime_ns,
                  __entry->mtime_s, __entry->mtime_ns
        )
);

#endif /* _TRACE_TIMESTAMP_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
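/*
 * Illustrative sketch (hypothetical helper, not from the kernel tree): each
 * TRACE_EVENT()/DEFINE_EVENT() above generates a trace_<name>() call that the
 * timestamp code invokes at the matching point; the call is a cheap no-op
 * unless the tracepoint is enabled.
 */
static inline void example_touch_ctime(struct inode *inode)
{
        struct timespec64 now = current_time(inode);

        /* Fires the inode_set_ctime_to_ts event defined above. */
        trace_inode_set_ctime_to_ts(inode, &now);
}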
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM ipi

#if !defined(_TRACE_IPI_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_IPI_H

#include <linux/tracepoint.h>

TRACE_EVENT(ipi_send_cpu,
        TP_PROTO(const unsigned int cpu, unsigned long callsite, void *callback),
        TP_ARGS(cpu, callsite, callback),
        TP_STRUCT__entry(
                __field(unsigned int, cpu)
                __field(void *, callsite)
                __field(void *, callback)
        ),
        TP_fast_assign(
                __entry->cpu = cpu;
                __entry->callsite = (void *)callsite;
                __entry->callback = callback;
        ),
        TP_printk("cpu=%u callsite=%pS callback=%pS",
                  __entry->cpu, __entry->callsite, __entry->callback)
);

TRACE_EVENT(ipi_send_cpumask,
        TP_PROTO(const struct cpumask *cpumask, unsigned long callsite, void *callback),
        TP_ARGS(cpumask, callsite, callback),
        TP_STRUCT__entry(
                __cpumask(cpumask)
                __field(void *, callsite)
                __field(void *, callback)
        ),
        TP_fast_assign(
                __assign_cpumask(cpumask, cpumask_bits(cpumask));
                __entry->callsite = (void *)callsite;
                __entry->callback = callback;
        ),
        TP_printk("cpumask=%s callsite=%pS callback=%pS",
                  __get_cpumask(cpumask), __entry->callsite, __entry->callback)
);

#ifdef CONFIG_HAVE_EXTRA_IPI_TRACEPOINTS
/**
 * ipi_raise - called when a smp cross call is made
 *
 * @mask: mask of recipient CPUs for the IPI
 * @reason: string identifying the IPI purpose
 *
 * It is necessary for @reason to be a static string declared with
 * __tracepoint_string.
 */
TRACE_EVENT(ipi_raise,
        TP_PROTO(const struct cpumask *mask, const char *reason),
        TP_ARGS(mask, reason),
        TP_STRUCT__entry(
                __cpumask(target_cpus)
                __field(const char *, reason)
        ),
        TP_fast_assign(
                __assign_cpumask(target_cpus, cpumask_bits(mask));
                __entry->reason = reason;
        ),
        TP_printk("target_mask=%s (%s)", __get_cpumask(target_cpus), __entry->reason)
);

DECLARE_EVENT_CLASS(ipi_handler,
        TP_PROTO(const char *reason),
        TP_ARGS(reason),
        TP_STRUCT__entry(
                __field(const char *, reason)
        ),
        TP_fast_assign(
                __entry->reason = reason;
        ),
        TP_printk("(%s)", __entry->reason)
);

/**
 * ipi_entry - called immediately before the IPI handler
 *
 * @reason: string identifying the IPI purpose
 *
 * It is necessary for @reason to be a static string declared with
 * __tracepoint_string, ideally the same as used with trace_ipi_raise
 * for that IPI.
 */
DEFINE_EVENT(ipi_handler, ipi_entry,
        TP_PROTO(const char *reason),
        TP_ARGS(reason)
);

/**
 * ipi_exit - called immediately after the IPI handler returns
 *
 * @reason: string identifying the IPI purpose
 *
 * It is necessary for @reason to be a static string declared with
 * __tracepoint_string, ideally the same as used with trace_ipi_raise for
 * that IPI.
 */
DEFINE_EVENT(ipi_handler, ipi_exit,
        TP_PROTO(const char *reason),
        TP_ARGS(reason)
);
#endif /* CONFIG_HAVE_EXTRA_IPI_TRACEPOINTS */

#endif /* _TRACE_IPI_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
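/*
 * Illustrative sketch (hypothetical function name, not from the kernel tree):
 * generic SMP code emits ipi_send_cpu roughly like this, recording the
 * requesting call site and the callback the target CPU will run.
 */
static void example_send_single_ipi(int cpu)
{
        trace_ipi_send_cpu(cpu, _RET_IP_,
                           generic_smp_call_function_single_interrupt);
        arch_send_call_function_single_ipi(cpu);
}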
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  fs/userfaultfd.c
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *  Copyright (C) 2008-2009 Red Hat, Inc.
 *  Copyright (C) 2015  Red Hat, Inc.
 *
 *  Some part derived from fs/eventfd.c (anon inode setup) and
 *  mm/ksm.c (mm hashing).
*/ #include <linux/list.h> #include <linux/hashtable.h> #include <linux/sched/signal.h> #include <linux/sched/mm.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/poll.h> #include <linux/slab.h> #include <linux/seq_file.h> #include <linux/file.h> #include <linux/bug.h> #include <linux/anon_inodes.h> #include <linux/syscalls.h> #include <linux/userfaultfd_k.h> #include <linux/mempolicy.h> #include <linux/ioctl.h> #include <linux/security.h> #include <linux/hugetlb.h> #include <linux/leafops.h> #include <linux/miscdevice.h> #include <linux/uio.h> static int sysctl_unprivileged_userfaultfd __read_mostly; #ifdef CONFIG_SYSCTL static const struct ctl_table vm_userfaultfd_table[] = { { .procname = "unprivileged_userfaultfd", .data = &sysctl_unprivileged_userfaultfd, .maxlen = sizeof(sysctl_unprivileged_userfaultfd), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; #endif static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init; struct userfaultfd_fork_ctx { struct userfaultfd_ctx *orig; struct userfaultfd_ctx *new; struct list_head list; }; struct userfaultfd_unmap_ctx { struct userfaultfd_ctx *ctx; unsigned long start; unsigned long end; struct list_head list; }; struct userfaultfd_wait_queue { struct uffd_msg msg; wait_queue_entry_t wq; struct userfaultfd_ctx *ctx; bool waken; }; struct userfaultfd_wake_range { unsigned long start; unsigned long len; }; /* internal indication that UFFD_API ioctl was successfully executed */ #define UFFD_FEATURE_INITIALIZED (1u << 31) static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) { return ctx->features & UFFD_FEATURE_INITIALIZED; } static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx) { return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC); } /* * Whether WP_UNPOPULATED is enabled on the uffd context. It is only * meaningful when userfaultfd_wp()==true on the vma and when it's * anonymous. */ bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) { struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; if (!ctx) return false; return ctx->features & UFFD_FEATURE_WP_UNPOPULATED; } static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, int wake_flags, void *key) { struct userfaultfd_wake_range *range = key; int ret; struct userfaultfd_wait_queue *uwq; unsigned long start, len; uwq = container_of(wq, struct userfaultfd_wait_queue, wq); ret = 0; /* len == 0 means wake all */ start = range->start; len = range->len; if (len && (start > uwq->msg.arg.pagefault.address || start + len <= uwq->msg.arg.pagefault.address)) goto out; WRITE_ONCE(uwq->waken, true); /* * The Program-Order guarantees provided by the scheduler * ensure uwq->waken is visible before the task is woken. */ ret = wake_up_state(wq->private, mode); if (ret) { /* * Wake only once, autoremove behavior. * * After the effect of list_del_init is visible to the other * CPUs, the waitqueue may disappear from under us, see the * !list_empty_careful() in handle_userfault(). * * try_to_wake_up() has an implicit smp_mb(), and the * wq->private is read before calling the extern function * "wake_up_state" (which in turns calls try_to_wake_up). */ list_del_init(&wq->entry); } out: return ret; } /** * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd * context. * @ctx: [in] Pointer to the userfaultfd context. 
*/ static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx) { refcount_inc(&ctx->refcount); } /** * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd * context. * @ctx: [in] Pointer to userfaultfd context. * * The userfaultfd context reference must have been previously acquired either * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget(). */ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) { if (refcount_dec_and_test(&ctx->refcount)) { VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_pending_wqh.lock)); VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_pending_wqh)); VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_wqh.lock)); VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_wqh)); VM_WARN_ON_ONCE(spin_is_locked(&ctx->event_wqh.lock)); VM_WARN_ON_ONCE(waitqueue_active(&ctx->event_wqh)); VM_WARN_ON_ONCE(spin_is_locked(&ctx->fd_wqh.lock)); VM_WARN_ON_ONCE(waitqueue_active(&ctx->fd_wqh)); mmdrop(ctx->mm); kmem_cache_free(userfaultfd_ctx_cachep, ctx); } } static inline void msg_init(struct uffd_msg *msg) { BUILD_BUG_ON(sizeof(struct uffd_msg) != 32); /* * Must use memset to zero out the paddings or kernel data is * leaked to userland. */ memset(msg, 0, sizeof(struct uffd_msg)); } static inline struct uffd_msg userfault_msg(unsigned long address, unsigned long real_address, unsigned int flags, unsigned long reason, unsigned int features) { struct uffd_msg msg; msg_init(&msg); msg.event = UFFD_EVENT_PAGEFAULT; msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ? real_address : address; /* * These flags indicate why the userfault occurred: * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault. * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault. * - Neither of these flags being set indicates a MISSING fault. * * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write * fault. Otherwise, it was a read fault. */ if (flags & FAULT_FLAG_WRITE) msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE; if (reason & VM_UFFD_WP) msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP; if (reason & VM_UFFD_MINOR) msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR; if (features & UFFD_FEATURE_THREAD_ID) msg.arg.pagefault.feat.ptid = task_pid_vnr(current); return msg; } #ifdef CONFIG_HUGETLB_PAGE /* * Same functionality as userfaultfd_must_wait below with modifications for * hugepmd ranges. */ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, struct vm_fault *vmf, unsigned long reason) { struct vm_area_struct *vma = vmf->vma; pte_t *ptep, pte; assert_fault_locked(vmf); ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma)); if (!ptep) return true; pte = huge_ptep_get(vma->vm_mm, vmf->address, ptep); /* * Lockless access: we're in a wait_event so it's ok if it * changes under us. */ /* Entry is still missing, wait for userspace to resolve the fault. */ if (huge_pte_none(pte)) return true; /* UFFD PTE markers require userspace to resolve the fault. */ if (pte_is_uffd_marker(pte)) return true; /* * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to * resolve the fault. */ if (!huge_pte_write(pte) && (reason & VM_UFFD_WP)) return true; return false; } #else static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, struct vm_fault *vmf, unsigned long reason) { /* Should never get here. 
*/ VM_WARN_ON_ONCE(1); return false; } #endif /* CONFIG_HUGETLB_PAGE */ /* * Verify the pagetables are still not ok after having registered into * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any * userfault that has already been resolved, if userfaultfd_read_iter and * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different * threads. */ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, struct vm_fault *vmf, unsigned long reason) { struct mm_struct *mm = ctx->mm; unsigned long address = vmf->address; pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd, _pmd; pte_t *pte; pte_t ptent; bool ret; assert_fault_locked(vmf); pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) return true; p4d = p4d_offset(pgd, address); if (!p4d_present(*p4d)) return true; pud = pud_offset(p4d, address); if (!pud_present(*pud)) return true; pmd = pmd_offset(pud, address); again: _pmd = pmdp_get_lockless(pmd); if (pmd_none(_pmd)) return true; /* * A race could arise which would result in a softleaf entry such as * migration entry unexpectedly being present in the PMD, so explicitly * check for this and bail out if so. */ if (!pmd_present(_pmd)) return false; if (pmd_trans_huge(_pmd)) return !pmd_write(_pmd) && (reason & VM_UFFD_WP); pte = pte_offset_map(pmd, address); if (!pte) goto again; /* * Lockless access: we're in a wait_event so it's ok if it * changes under us. */ ptent = ptep_get(pte); ret = true; /* Entry is still missing, wait for userspace to resolve the fault. */ if (pte_none(ptent)) goto out; /* UFFD PTE markers require userspace to resolve the fault. */ if (pte_is_uffd_marker(ptent)) goto out; /* * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to * resolve the fault. */ if (!pte_write(ptent) && (reason & VM_UFFD_WP)) goto out; ret = false; out: pte_unmap(pte); return ret; } static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags) { if (flags & FAULT_FLAG_INTERRUPTIBLE) return TASK_INTERRUPTIBLE; if (flags & FAULT_FLAG_KILLABLE) return TASK_KILLABLE; return TASK_UNINTERRUPTIBLE; } /* * The locking rules involved in returning VM_FAULT_RETRY depending on * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and * FAULT_FLAG_KILLABLE are not straightforward. The "Caution" * recommendation in __lock_page_or_retry is not an understatement. * * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is * not set. * * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not * set, VM_FAULT_RETRY can still be returned if and only if there are * fatal_signal_pending()s, and the mmap_lock must be released before * returning it. */ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) { struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue uwq; vm_fault_t ret = VM_FAULT_SIGBUS; bool must_wait; unsigned int blocking_state; /* * We don't do userfault handling for the final child pid update * and when coredumping (faults triggered by get_dump_page()). */ if (current->flags & (PF_EXITING|PF_DUMPCORE)) goto out; assert_fault_locked(vmf); ctx = vma->vm_userfaultfd_ctx.ctx; if (!ctx) goto out; VM_WARN_ON_ONCE(ctx->mm != mm); /* Any unrecognized flag is a bug. */ VM_WARN_ON_ONCE(reason & ~__VM_UFFD_FLAGS); /* 0 or > 1 flags set is a bug; we expect exactly 1. 
*/ VM_WARN_ON_ONCE(!reason || (reason & (reason - 1))); if (ctx->features & UFFD_FEATURE_SIGBUS) goto out; if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY)) goto out; /* * Check that we can return VM_FAULT_RETRY. * * NOTE: it should become possible to return VM_FAULT_RETRY * even if FAULT_FLAG_TRIED is set without leading to gup() * -EBUSY failures, if the userfaultfd is to be extended for * VM_UFFD_WP tracking and we intend to arm the userfault * without first stopping userland access to the memory. For * VM_UFFD_MISSING userfaults this is enough for now. */ if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) { /* * Validate the invariant that nowait must allow retry * to be sure not to return SIGBUS erroneously on * nowait invocations. */ VM_WARN_ON_ONCE(vmf->flags & FAULT_FLAG_RETRY_NOWAIT); #ifdef CONFIG_DEBUG_VM if (printk_ratelimit()) { pr_warn("FAULT_FLAG_ALLOW_RETRY missing %x\n", vmf->flags); dump_stack(); } #endif goto out; } /* * Handle nowait, not much to do other than tell it to retry * and wait. */ ret = VM_FAULT_RETRY; if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) goto out; if (unlikely(READ_ONCE(ctx->released))) { /* * If a concurrent release is detected, do not return * VM_FAULT_SIGBUS or VM_FAULT_NOPAGE, but instead always * return VM_FAULT_RETRY with lock released proactively. * * If we were to return VM_FAULT_SIGBUS here, the non * cooperative manager would be instead forced to * always call UFFDIO_UNREGISTER before it can safely * close the uffd, to avoid involuntary SIGBUS triggered. * * If we were to return VM_FAULT_NOPAGE, it would work for * the fault path, in which the lock will be released * later. However for GUP, faultin_page() does nothing * special on NOPAGE, so GUP would spin retrying without * releasing the mmap read lock, causing possible livelock. * * Here only VM_FAULT_RETRY would make sure the mmap lock * be released immediately, so that the thread concurrently * releasing the userfault would always make progress. */ release_fault_lock(vmf); goto out; } /* take the reference before dropping the mmap_lock */ userfaultfd_ctx_get(ctx); init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags, reason, ctx->features); uwq.ctx = ctx; uwq.waken = false; blocking_state = userfaultfd_get_blocking_state(vmf->flags); /* * Take the vma lock now, in order to safely call * userfaultfd_huge_must_wait() later. Since acquiring the * (sleepable) vma lock can modify the current task state, that * must be before explicitly calling set_current_state(). */ if (is_vm_hugetlb_page(vma)) hugetlb_vma_lock_read(vma); spin_lock_irq(&ctx->fault_pending_wqh.lock); /* * After the __add_wait_queue the uwq is visible to userland * through poll/read(). */ __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq); /* * The smp_mb() after __set_current_state prevents the reads * following the spin_unlock to happen before the list_add in * __add_wait_queue. 
*/ set_current_state(blocking_state); spin_unlock_irq(&ctx->fault_pending_wqh.lock); if (is_vm_hugetlb_page(vma)) { must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason); hugetlb_vma_unlock_read(vma); } else { must_wait = userfaultfd_must_wait(ctx, vmf, reason); } release_fault_lock(vmf); if (likely(must_wait && !READ_ONCE(ctx->released))) { wake_up_poll(&ctx->fd_wqh, EPOLLIN); schedule(); } __set_current_state(TASK_RUNNING); /* * Here we race with the list_del; list_add in * userfaultfd_ctx_read(), however because we don't ever run * list_del_init() to refile across the two lists, the prev * and next pointers will never point to self. list_add also * would never let any of the two pointers to point to * self. So list_empty_careful won't risk to see both pointers * pointing to self at any time during the list refile. The * only case where list_del_init() is called is the full * removal in the wake function and there we don't re-list_add * and it's fine not to block on the spinlock. The uwq on this * kernel stack can be released after the list_del_init. */ if (!list_empty_careful(&uwq.wq.entry)) { spin_lock_irq(&ctx->fault_pending_wqh.lock); /* * No need of list_del_init(), the uwq on the stack * will be freed shortly anyway. */ list_del(&uwq.wq.entry); spin_unlock_irq(&ctx->fault_pending_wqh.lock); } /* * ctx may go away after this if the userfault pseudo fd is * already released. */ userfaultfd_ctx_put(ctx); out: return ret; } static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, struct userfaultfd_wait_queue *ewq) { struct userfaultfd_ctx *release_new_ctx; if (WARN_ON_ONCE(current->flags & PF_EXITING)) goto out; ewq->ctx = ctx; init_waitqueue_entry(&ewq->wq, current); release_new_ctx = NULL; spin_lock_irq(&ctx->event_wqh.lock); /* * After the __add_wait_queue the uwq is visible to userland * through poll/read(). */ __add_wait_queue(&ctx->event_wqh, &ewq->wq); for (;;) { set_current_state(TASK_KILLABLE); if (ewq->msg.event == 0) break; if (READ_ONCE(ctx->released) || fatal_signal_pending(current)) { /* * &ewq->wq may be queued in fork_event, but * __remove_wait_queue ignores the head * parameter. It would be a problem if it * didn't. */ __remove_wait_queue(&ctx->event_wqh, &ewq->wq); if (ewq->msg.event == UFFD_EVENT_FORK) { struct userfaultfd_ctx *new; new = (struct userfaultfd_ctx *) (unsigned long) ewq->msg.arg.reserved.reserved1; release_new_ctx = new; } break; } spin_unlock_irq(&ctx->event_wqh.lock); wake_up_poll(&ctx->fd_wqh, EPOLLIN); schedule(); spin_lock_irq(&ctx->event_wqh.lock); } __set_current_state(TASK_RUNNING); spin_unlock_irq(&ctx->event_wqh.lock); if (release_new_ctx) { userfaultfd_release_new(release_new_ctx); userfaultfd_ctx_put(release_new_ctx); } /* * ctx may go away after this if the userfault pseudo fd is * already released. 
*/ out: atomic_dec(&ctx->mmap_changing); VM_WARN_ON_ONCE(atomic_read(&ctx->mmap_changing) < 0); userfaultfd_ctx_put(ctx); } static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx, struct userfaultfd_wait_queue *ewq) { ewq->msg.event = 0; wake_up_locked(&ctx->event_wqh); __remove_wait_queue(&ctx->event_wqh, &ewq->wq); } int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) { struct userfaultfd_ctx *ctx = NULL, *octx; struct userfaultfd_fork_ctx *fctx; octx = vma->vm_userfaultfd_ctx.ctx; if (!octx) return 0; if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) { userfaultfd_reset_ctx(vma); return 0; } list_for_each_entry(fctx, fcs, list) if (fctx->orig == octx) { ctx = fctx->new; break; } if (!ctx) { fctx = kmalloc_obj(*fctx); if (!fctx) return -ENOMEM; ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); if (!ctx) { kfree(fctx); return -ENOMEM; } refcount_set(&ctx->refcount, 1); ctx->flags = octx->flags; ctx->features = octx->features; ctx->released = false; init_rwsem(&ctx->map_changing_lock); atomic_set(&ctx->mmap_changing, 0); ctx->mm = vma->vm_mm; mmgrab(ctx->mm); userfaultfd_ctx_get(octx); down_write(&octx->map_changing_lock); atomic_inc(&octx->mmap_changing); up_write(&octx->map_changing_lock); fctx->orig = octx; fctx->new = ctx; list_add_tail(&fctx->list, fcs); } vma->vm_userfaultfd_ctx.ctx = ctx; return 0; } static void dup_fctx(struct userfaultfd_fork_ctx *fctx) { struct userfaultfd_ctx *ctx = fctx->orig; struct userfaultfd_wait_queue ewq; msg_init(&ewq.msg); ewq.msg.event = UFFD_EVENT_FORK; ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new; userfaultfd_event_wait_completion(ctx, &ewq); } void dup_userfaultfd_complete(struct list_head *fcs) { struct userfaultfd_fork_ctx *fctx, *n; list_for_each_entry_safe(fctx, n, fcs, list) { dup_fctx(fctx); list_del(&fctx->list); kfree(fctx); } } void dup_userfaultfd_fail(struct list_head *fcs) { struct userfaultfd_fork_ctx *fctx, *n; /* * An error has occurred on fork, we will tear memory down, but have * allocated memory for fctx's and raised reference counts for both the * original and child contexts (and on the mm for each as a result). * * These would ordinarily be taken care of by a user handling the event, * but we are no longer doing so, so manually clean up here. * * mm tear down will take care of cleaning up VMA contexts. 
*/ list_for_each_entry_safe(fctx, n, fcs, list) { struct userfaultfd_ctx *octx = fctx->orig; struct userfaultfd_ctx *ctx = fctx->new; atomic_dec(&octx->mmap_changing); VM_WARN_ON_ONCE(atomic_read(&octx->mmap_changing) < 0); userfaultfd_ctx_put(octx); userfaultfd_ctx_put(ctx); list_del(&fctx->list); kfree(fctx); } } void mremap_userfaultfd_prep(struct vm_area_struct *vma, struct vm_userfaultfd_ctx *vm_ctx) { struct userfaultfd_ctx *ctx; ctx = vma->vm_userfaultfd_ctx.ctx; if (!ctx) return; if (ctx->features & UFFD_FEATURE_EVENT_REMAP) { vm_ctx->ctx = ctx; userfaultfd_ctx_get(ctx); down_write(&ctx->map_changing_lock); atomic_inc(&ctx->mmap_changing); up_write(&ctx->map_changing_lock); } else { /* Drop uffd context if remap feature not enabled */ userfaultfd_reset_ctx(vma); } } void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx, unsigned long from, unsigned long to, unsigned long len) { struct userfaultfd_ctx *ctx = vm_ctx->ctx; struct userfaultfd_wait_queue ewq; if (!ctx) return; msg_init(&ewq.msg); ewq.msg.event = UFFD_EVENT_REMAP; ewq.msg.arg.remap.from = from; ewq.msg.arg.remap.to = to; ewq.msg.arg.remap.len = len; userfaultfd_event_wait_completion(ctx, &ewq); } void mremap_userfaultfd_fail(struct vm_userfaultfd_ctx *vm_ctx) { struct userfaultfd_ctx *ctx = vm_ctx->ctx; if (!ctx) return; userfaultfd_ctx_put(ctx); } bool userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct mm_struct *mm = vma->vm_mm; struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue ewq; ctx = vma->vm_userfaultfd_ctx.ctx; if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE)) return true; userfaultfd_ctx_get(ctx); down_write(&ctx->map_changing_lock); atomic_inc(&ctx->mmap_changing); up_write(&ctx->map_changing_lock); mmap_read_unlock(mm); msg_init(&ewq.msg); ewq.msg.event = UFFD_EVENT_REMOVE; ewq.msg.arg.remove.start = start; ewq.msg.arg.remove.end = end; userfaultfd_event_wait_completion(ctx, &ewq); return false; } static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps, unsigned long start, unsigned long end) { struct userfaultfd_unmap_ctx *unmap_ctx; list_for_each_entry(unmap_ctx, unmaps, list) if (unmap_ctx->ctx == ctx && unmap_ctx->start == start && unmap_ctx->end == end) return true; return false; } int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *unmaps) { struct userfaultfd_unmap_ctx *unmap_ctx; struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) || has_unmap_ctx(ctx, unmaps, start, end)) return 0; unmap_ctx = kzalloc_obj(*unmap_ctx); if (!unmap_ctx) return -ENOMEM; userfaultfd_ctx_get(ctx); down_write(&ctx->map_changing_lock); atomic_inc(&ctx->mmap_changing); up_write(&ctx->map_changing_lock); unmap_ctx->ctx = ctx; unmap_ctx->start = start; unmap_ctx->end = end; list_add_tail(&unmap_ctx->list, unmaps); return 0; } void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf) { struct userfaultfd_unmap_ctx *ctx, *n; struct userfaultfd_wait_queue ewq; list_for_each_entry_safe(ctx, n, uf, list) { msg_init(&ewq.msg); ewq.msg.event = UFFD_EVENT_UNMAP; ewq.msg.arg.remove.start = ctx->start; ewq.msg.arg.remove.end = ctx->end; userfaultfd_event_wait_completion(ctx->ctx, &ewq); list_del(&ctx->list); kfree(ctx); } } static int userfaultfd_release(struct inode *inode, struct file *file) { struct userfaultfd_ctx *ctx = file->private_data; struct mm_struct *mm = ctx->mm; /* len == 0 means wake all 
*/ struct userfaultfd_wake_range range = { .len = 0, }; WRITE_ONCE(ctx->released, true); userfaultfd_release_all(mm, ctx); /* * After no new page faults can wait on this fault_*wqh, flush * the last page faults that may have been already waiting on * the fault_*wqh. */ spin_lock_irq(&ctx->fault_pending_wqh.lock); __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range); __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range); spin_unlock_irq(&ctx->fault_pending_wqh.lock); /* Flush pending events that may still wait on event_wqh */ wake_up_all(&ctx->event_wqh); wake_up_poll(&ctx->fd_wqh, EPOLLHUP); userfaultfd_ctx_put(ctx); return 0; } /* fault_pending_wqh.lock must be hold by the caller */ static inline struct userfaultfd_wait_queue *find_userfault_in( wait_queue_head_t *wqh) { wait_queue_entry_t *wq; struct userfaultfd_wait_queue *uwq; lockdep_assert_held(&wqh->lock); uwq = NULL; if (!waitqueue_active(wqh)) goto out; /* walk in reverse to provide FIFO behavior to read userfaults */ wq = list_last_entry(&wqh->head, typeof(*wq), entry); uwq = container_of(wq, struct userfaultfd_wait_queue, wq); out: return uwq; } static inline struct userfaultfd_wait_queue *find_userfault( struct userfaultfd_ctx *ctx) { return find_userfault_in(&ctx->fault_pending_wqh); } static inline struct userfaultfd_wait_queue *find_userfault_evt( struct userfaultfd_ctx *ctx) { return find_userfault_in(&ctx->event_wqh); } static __poll_t userfaultfd_poll(struct file *file, poll_table *wait) { struct userfaultfd_ctx *ctx = file->private_data; __poll_t ret; poll_wait(file, &ctx->fd_wqh, wait); if (!userfaultfd_is_initialized(ctx)) return EPOLLERR; /* * poll() never guarantees that read won't block. * userfaults can be waken before they're read(). */ if (unlikely(!(file->f_flags & O_NONBLOCK))) return EPOLLERR; /* * lockless access to see if there are pending faults * __pollwait last action is the add_wait_queue but * the spin_unlock would allow the waitqueue_active to * pass above the actual list_add inside * add_wait_queue critical section. So use a full * memory barrier to serialize the list_add write of * add_wait_queue() with the waitqueue_active read * below. */ ret = 0; smp_mb(); if (waitqueue_active(&ctx->fault_pending_wqh)) ret = EPOLLIN; else if (waitqueue_active(&ctx->event_wqh)) ret = EPOLLIN; return ret; } static const struct file_operations userfaultfd_fops; static int resolve_userfault_fork(struct userfaultfd_ctx *new, struct inode *inode, struct uffd_msg *msg) { int fd; fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, new, O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode); if (fd < 0) return fd; msg->arg.reserved.reserved1 = 0; msg->arg.fork.ufd = fd; return 0; } static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, struct uffd_msg *msg, struct inode *inode) { ssize_t ret; DECLARE_WAITQUEUE(wait, current); struct userfaultfd_wait_queue *uwq; /* * Handling fork event requires sleeping operations, so * we drop the event_wqh lock, then do these ops, then * lock it back and wake up the waiter. While the lock is * dropped the ewq may go away so we keep track of it * carefully. 
*/ LIST_HEAD(fork_event); struct userfaultfd_ctx *fork_nctx = NULL; /* always take the fd_wqh lock before the fault_pending_wqh lock */ spin_lock_irq(&ctx->fd_wqh.lock); __add_wait_queue(&ctx->fd_wqh, &wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); spin_lock(&ctx->fault_pending_wqh.lock); uwq = find_userfault(ctx); if (uwq) { /* * Use a seqcount to repeat the lockless check * in wake_userfault() to avoid missing * wakeups because during the refile both * waitqueue could become empty if this is the * only userfault. */ write_seqcount_begin(&ctx->refile_seq); /* * The fault_pending_wqh.lock prevents the uwq * to disappear from under us. * * Refile this userfault from * fault_pending_wqh to fault_wqh, it's not * pending anymore after we read it. * * Use list_del() by hand (as * userfaultfd_wake_function also uses * list_del_init() by hand) to be sure nobody * changes __remove_wait_queue() to use * list_del_init() in turn breaking the * !list_empty_careful() check in * handle_userfault(). The uwq->wq.head list * must never be empty at any time during the * refile, or the waitqueue could disappear * from under us. The "wait_queue_head_t" * parameter of __remove_wait_queue() is unused * anyway. */ list_del(&uwq->wq.entry); add_wait_queue(&ctx->fault_wqh, &uwq->wq); write_seqcount_end(&ctx->refile_seq); /* careful to always initialize msg if ret == 0 */ *msg = uwq->msg; spin_unlock(&ctx->fault_pending_wqh.lock); ret = 0; break; } spin_unlock(&ctx->fault_pending_wqh.lock); spin_lock(&ctx->event_wqh.lock); uwq = find_userfault_evt(ctx); if (uwq) { *msg = uwq->msg; if (uwq->msg.event == UFFD_EVENT_FORK) { fork_nctx = (struct userfaultfd_ctx *) (unsigned long) uwq->msg.arg.reserved.reserved1; list_move(&uwq->wq.entry, &fork_event); /* * fork_nctx can be freed as soon as * we drop the lock, unless we take a * reference on it. */ userfaultfd_ctx_get(fork_nctx); spin_unlock(&ctx->event_wqh.lock); ret = 0; break; } userfaultfd_event_complete(ctx, uwq); spin_unlock(&ctx->event_wqh.lock); ret = 0; break; } spin_unlock(&ctx->event_wqh.lock); if (signal_pending(current)) { ret = -ERESTARTSYS; break; } if (no_wait) { ret = -EAGAIN; break; } spin_unlock_irq(&ctx->fd_wqh.lock); schedule(); spin_lock_irq(&ctx->fd_wqh.lock); } __remove_wait_queue(&ctx->fd_wqh, &wait); __set_current_state(TASK_RUNNING); spin_unlock_irq(&ctx->fd_wqh.lock); if (!ret && msg->event == UFFD_EVENT_FORK) { ret = resolve_userfault_fork(fork_nctx, inode, msg); spin_lock_irq(&ctx->event_wqh.lock); if (!list_empty(&fork_event)) { /* * The fork thread didn't abort, so we can * drop the temporary refcount. */ userfaultfd_ctx_put(fork_nctx); uwq = list_first_entry(&fork_event, typeof(*uwq), wq.entry); /* * If fork_event list wasn't empty and in turn * the event wasn't already released by fork * (the event is allocated on fork kernel * stack), put the event back to its place in * the event_wq. fork_event head will be freed * as soon as we return so the event cannot * stay queued there no matter the current * "ret" value. */ list_del(&uwq->wq.entry); __add_wait_queue(&ctx->event_wqh, &uwq->wq); /* * Leave the event in the waitqueue and report * error to userland if we failed to resolve * the userfault fork. */ if (likely(!ret)) userfaultfd_event_complete(ctx, uwq); } else { /* * Here the fork thread aborted and the * refcount from the fork thread on fork_nctx * has already been released. We still hold * the reference we took before releasing the * lock above. 
If resolve_userfault_fork * failed we've to drop it because the * fork_nctx has to be freed in such case. If * it succeeded we'll hold it because the new * uffd references it. */ if (ret) userfaultfd_ctx_put(fork_nctx); } spin_unlock_irq(&ctx->event_wqh.lock); } return ret; } static ssize_t userfaultfd_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct userfaultfd_ctx *ctx = file->private_data; ssize_t _ret, ret = 0; struct uffd_msg msg; struct inode *inode = file_inode(file); bool no_wait; if (!userfaultfd_is_initialized(ctx)) return -EINVAL; no_wait = file->f_flags & O_NONBLOCK || iocb->ki_flags & IOCB_NOWAIT; for (;;) { if (iov_iter_count(to) < sizeof(msg)) return ret ? ret : -EINVAL; _ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode); if (_ret < 0) return ret ? ret : _ret; _ret = !copy_to_iter_full(&msg, sizeof(msg), to); if (_ret) return ret ? ret : -EFAULT; ret += sizeof(msg); /* * Allow to read more than one fault at time but only * block if waiting for the very first one. */ no_wait = true; } } static void __wake_userfault(struct userfaultfd_ctx *ctx, struct userfaultfd_wake_range *range) { spin_lock_irq(&ctx->fault_pending_wqh.lock); /* wake all in the range and autoremove */ if (waitqueue_active(&ctx->fault_pending_wqh)) __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, range); if (waitqueue_active(&ctx->fault_wqh)) __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range); spin_unlock_irq(&ctx->fault_pending_wqh.lock); } static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, struct userfaultfd_wake_range *range) { unsigned seq; bool need_wakeup; /* * To be sure waitqueue_active() is not reordered by the CPU * before the pagetable update, use an explicit SMP memory * barrier here. PT lock release or mmap_read_unlock(mm) still * have release semantics that can allow the * waitqueue_active() to be reordered before the pte update. */ smp_mb(); /* * Use waitqueue_active because it's very frequent to * change the address space atomically even if there are no * userfaults yet. So we take the spinlock only when we're * sure we've userfaults to wake. 
*/ do { seq = read_seqcount_begin(&ctx->refile_seq); need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) || waitqueue_active(&ctx->fault_wqh); cond_resched(); } while (read_seqcount_retry(&ctx->refile_seq, seq)); if (need_wakeup) __wake_userfault(ctx, range); } static __always_inline int validate_unaligned_range( struct mm_struct *mm, __u64 start, __u64 len) { __u64 task_size = mm->task_size; if (len & ~PAGE_MASK) return -EINVAL; if (!len) return -EINVAL; if (start >= task_size) return -EINVAL; if (len > task_size - start) return -EINVAL; if (start + len <= start) return -EINVAL; return 0; } static __always_inline int validate_range(struct mm_struct *mm, __u64 start, __u64 len) { if (start & ~PAGE_MASK) return -EINVAL; return validate_unaligned_range(mm, start, len); } static int userfaultfd_register(struct userfaultfd_ctx *ctx, unsigned long arg) { struct mm_struct *mm = ctx->mm; struct vm_area_struct *vma, *cur; int ret; struct uffdio_register uffdio_register; struct uffdio_register __user *user_uffdio_register; vm_flags_t vm_flags; bool found; bool basic_ioctls; unsigned long start, end; struct vma_iterator vmi; bool wp_async = userfaultfd_wp_async_ctx(ctx); user_uffdio_register = (struct uffdio_register __user *) arg; ret = -EFAULT; if (copy_from_user(&uffdio_register, user_uffdio_register, sizeof(uffdio_register)-sizeof(__u64))) goto out; ret = -EINVAL; if (!uffdio_register.mode) goto out; if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES) goto out; vm_flags = 0; if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) vm_flags |= VM_UFFD_MISSING; if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) { if (!pgtable_supports_uffd_wp()) goto out; vm_flags |= VM_UFFD_WP; } if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) { #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR goto out; #endif vm_flags |= VM_UFFD_MINOR; } ret = validate_range(mm, uffdio_register.range.start, uffdio_register.range.len); if (ret) goto out; start = uffdio_register.range.start; end = start + uffdio_register.range.len; ret = -ENOMEM; if (!mmget_not_zero(mm)) goto out; ret = -EINVAL; mmap_write_lock(mm); vma_iter_init(&vmi, mm, start); vma = vma_find(&vmi, end); if (!vma) goto out_unlock; /* * If the first vma contains huge pages, make sure start address * is aligned to huge page size. */ if (is_vm_hugetlb_page(vma)) { unsigned long vma_hpagesize = vma_kernel_pagesize(vma); if (start & (vma_hpagesize - 1)) goto out_unlock; } /* * Search for not compatible vmas. */ found = false; basic_ioctls = false; cur = vma; do { cond_resched(); VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^ !!(cur->vm_flags & __VM_UFFD_FLAGS)); /* check not compatible vmas */ ret = -EINVAL; if (!vma_can_userfault(cur, vm_flags, wp_async)) goto out_unlock; /* * UFFDIO_COPY will fill file holes even without * PROT_WRITE. This check enforces that if this is a * MAP_SHARED, the process has write permission to the backing * file. If VM_MAYWRITE is set it also enforces that on a * MAP_SHARED vma: there is no F_WRITE_SEAL and no further * F_WRITE_SEAL can be taken until the vma is destroyed. */ ret = -EPERM; if (unlikely(!(cur->vm_flags & VM_MAYWRITE))) goto out_unlock; /* * If this vma contains ending address, and huge pages * check alignment. 
*/ if (is_vm_hugetlb_page(cur) && end <= cur->vm_end && end > cur->vm_start) { unsigned long vma_hpagesize = vma_kernel_pagesize(cur); ret = -EINVAL; if (end & (vma_hpagesize - 1)) goto out_unlock; } if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE)) goto out_unlock; /* * Check that this vma isn't already owned by a * different userfaultfd. We can't allow more than one * userfaultfd to own a single vma simultaneously or we * wouldn't know which one to deliver the userfaults to. */ ret = -EBUSY; if (cur->vm_userfaultfd_ctx.ctx && cur->vm_userfaultfd_ctx.ctx != ctx) goto out_unlock; /* * Note vmas containing huge pages */ if (is_vm_hugetlb_page(cur)) basic_ioctls = true; found = true; } for_each_vma_range(vmi, cur, end); VM_WARN_ON_ONCE(!found); ret = userfaultfd_register_range(ctx, vma, vm_flags, start, end, wp_async); out_unlock: mmap_write_unlock(mm); mmput(mm); if (!ret) { __u64 ioctls_out; ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC : UFFD_API_RANGE_IOCTLS; /* * Declare the WP ioctl only if the WP mode is * specified and all checks passed with the range */ if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)) ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT); /* CONTINUE ioctl is only supported for MINOR ranges. */ if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR)) ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE); /* * Now that we scanned all vmas we can already tell * userland which ioctls methods are guaranteed to * succeed on this range. */ if (put_user(ioctls_out, &user_uffdio_register->ioctls)) ret = -EFAULT; } out: return ret; } static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, unsigned long arg) { struct mm_struct *mm = ctx->mm; struct vm_area_struct *vma, *prev, *cur; int ret; struct uffdio_range uffdio_unregister; bool found; unsigned long start, end, vma_end; const void __user *buf = (void __user *)arg; struct vma_iterator vmi; bool wp_async = userfaultfd_wp_async_ctx(ctx); ret = -EFAULT; if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) goto out; ret = validate_range(mm, uffdio_unregister.start, uffdio_unregister.len); if (ret) goto out; start = uffdio_unregister.start; end = start + uffdio_unregister.len; ret = -ENOMEM; if (!mmget_not_zero(mm)) goto out; mmap_write_lock(mm); ret = -EINVAL; vma_iter_init(&vmi, mm, start); vma = vma_find(&vmi, end); if (!vma) goto out_unlock; /* * If the first vma contains huge pages, make sure start address * is aligned to huge page size. */ if (is_vm_hugetlb_page(vma)) { unsigned long vma_hpagesize = vma_kernel_pagesize(vma); if (start & (vma_hpagesize - 1)) goto out_unlock; } /* * Search for not compatible vmas. */ found = false; cur = vma; do { cond_resched(); VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^ !!(cur->vm_flags & __VM_UFFD_FLAGS)); /* * Prevent unregistering through a different userfaultfd than * the one used for registration. */ if (cur->vm_userfaultfd_ctx.ctx && cur->vm_userfaultfd_ctx.ctx != ctx) goto out_unlock; /* * Check not compatible vmas, not strictly required * here as not compatible vmas cannot have an * userfaultfd_ctx registered on them, but this * provides for more strict behavior to notice * unregistration errors. */ if (!vma_can_userfault(cur, cur->vm_flags, wp_async)) goto out_unlock; found = true; } for_each_vma_range(vmi, cur, end); VM_WARN_ON_ONCE(!found); vma_iter_set(&vmi, start); prev = vma_prev(&vmi); if (vma->vm_start < start) prev = vma; ret = 0; for_each_vma_range(vmi, vma, end) { cond_resched(); /* VMA not registered with userfaultfd. 
*/ if (!vma->vm_userfaultfd_ctx.ctx) goto skip; VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx != ctx); VM_WARN_ON_ONCE(!vma_can_userfault(vma, vma->vm_flags, wp_async)); VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)); if (vma->vm_start > start) start = vma->vm_start; vma_end = min(end, vma->vm_end); if (userfaultfd_missing(vma)) { /* * Wake any concurrent pending userfault while * we unregister, so they will not hang * permanently and it avoids userland to call * UFFDIO_WAKE explicitly. */ struct userfaultfd_wake_range range; range.start = start; range.len = vma_end - start; wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range); } vma = userfaultfd_clear_vma(&vmi, prev, vma, start, vma_end); if (IS_ERR(vma)) { ret = PTR_ERR(vma); break; } skip: prev = vma; start = vma->vm_end; } out_unlock: mmap_write_unlock(mm); mmput(mm); out: return ret; } /* * userfaultfd_wake may be used in combination with the * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches. */ static int userfaultfd_wake(struct userfaultfd_ctx *ctx, unsigned long arg) { int ret; struct uffdio_range uffdio_wake; struct userfaultfd_wake_range range; const void __user *buf = (void __user *)arg; ret = -EFAULT; if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake))) goto out; ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len); if (ret) goto out; range.start = uffdio_wake.start; range.len = uffdio_wake.len; /* * len == 0 means wake all and we don't want to wake all here, * so check it again to be sure. */ VM_WARN_ON_ONCE(!range.len); wake_userfault(ctx, &range); ret = 0; out: return ret; } static int userfaultfd_copy(struct userfaultfd_ctx *ctx, unsigned long arg) { __s64 ret; struct uffdio_copy uffdio_copy; struct uffdio_copy __user *user_uffdio_copy; struct userfaultfd_wake_range range; uffd_flags_t flags = 0; user_uffdio_copy = (struct uffdio_copy __user *) arg; ret = -EAGAIN; if (unlikely(atomic_read(&ctx->mmap_changing))) { if (unlikely(put_user(ret, &user_uffdio_copy->copy))) return -EFAULT; goto out; } ret = -EFAULT; if (copy_from_user(&uffdio_copy, user_uffdio_copy, /* don't copy "copy" last field */ sizeof(uffdio_copy)-sizeof(__s64))) goto out; ret = validate_unaligned_range(ctx->mm, uffdio_copy.src, uffdio_copy.len); if (ret) goto out; ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len); if (ret) goto out; ret = -EINVAL; if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) goto out; if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP) flags |= MFILL_ATOMIC_WP; if (mmget_not_zero(ctx->mm)) { ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src, uffdio_copy.len, flags); mmput(ctx->mm); } else { return -ESRCH; } if (unlikely(put_user(ret, &user_uffdio_copy->copy))) return -EFAULT; if (ret < 0) goto out; VM_WARN_ON_ONCE(!ret); /* len == 0 would wake all */ range.len = ret; if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) { range.start = uffdio_copy.dst; wake_userfault(ctx, &range); } ret = range.len == uffdio_copy.len ? 
0 : -EAGAIN; out: return ret; } static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, unsigned long arg) { __s64 ret; struct uffdio_zeropage uffdio_zeropage; struct uffdio_zeropage __user *user_uffdio_zeropage; struct userfaultfd_wake_range range; user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; ret = -EAGAIN; if (unlikely(atomic_read(&ctx->mmap_changing))) { if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) return -EFAULT; goto out; } ret = -EFAULT; if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage, /* don't copy "zeropage" last field */ sizeof(uffdio_zeropage)-sizeof(__s64))) goto out; ret = validate_range(ctx->mm, uffdio_zeropage.range.start, uffdio_zeropage.range.len); if (ret) goto out; ret = -EINVAL; if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE) goto out; if (mmget_not_zero(ctx->mm)) { ret = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start, uffdio_zeropage.range.len); mmput(ctx->mm); } else { return -ESRCH; } if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) return -EFAULT; if (ret < 0) goto out; /* len == 0 would wake all */ VM_WARN_ON_ONCE(!ret); range.len = ret; if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) { range.start = uffdio_zeropage.range.start; wake_userfault(ctx, &range); } ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN; out: return ret; } static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, unsigned long arg) { int ret; struct uffdio_writeprotect uffdio_wp; struct uffdio_writeprotect __user *user_uffdio_wp; struct userfaultfd_wake_range range; bool mode_wp, mode_dontwake; if (atomic_read(&ctx->mmap_changing)) return -EAGAIN; user_uffdio_wp = (struct uffdio_writeprotect __user *) arg; if (copy_from_user(&uffdio_wp, user_uffdio_wp, sizeof(struct uffdio_writeprotect))) return -EFAULT; ret = validate_range(ctx->mm, uffdio_wp.range.start, uffdio_wp.range.len); if (ret) return ret; if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE | UFFDIO_WRITEPROTECT_MODE_WP)) return -EINVAL; mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP; mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE; if (mode_wp && mode_dontwake) return -EINVAL; if (mmget_not_zero(ctx->mm)) { ret = mwriteprotect_range(ctx, uffdio_wp.range.start, uffdio_wp.range.len, mode_wp); mmput(ctx->mm); } else { return -ESRCH; } if (ret) return ret; if (!mode_wp && !mode_dontwake) { range.start = uffdio_wp.range.start; range.len = uffdio_wp.range.len; wake_userfault(ctx, &range); } return ret; } static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) { __s64 ret; struct uffdio_continue uffdio_continue; struct uffdio_continue __user *user_uffdio_continue; struct userfaultfd_wake_range range; uffd_flags_t flags = 0; user_uffdio_continue = (struct uffdio_continue __user *)arg; ret = -EAGAIN; if (unlikely(atomic_read(&ctx->mmap_changing))) { if (unlikely(put_user(ret, &user_uffdio_continue->mapped))) return -EFAULT; goto out; } ret = -EFAULT; if (copy_from_user(&uffdio_continue, user_uffdio_continue, /* don't copy the output fields */ sizeof(uffdio_continue) - (sizeof(__s64)))) goto out; ret = validate_range(ctx->mm, uffdio_continue.range.start, uffdio_continue.range.len); if (ret) goto out; ret = -EINVAL; if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE | UFFDIO_CONTINUE_MODE_WP)) goto out; if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP) flags |= MFILL_ATOMIC_WP; if (mmget_not_zero(ctx->mm)) { ret = mfill_atomic_continue(ctx, uffdio_continue.range.start, 
uffdio_continue.range.len, flags); mmput(ctx->mm); } else { return -ESRCH; } if (unlikely(put_user(ret, &user_uffdio_continue->mapped))) return -EFAULT; if (ret < 0) goto out; /* len == 0 would wake all */ VM_WARN_ON_ONCE(!ret); range.len = ret; if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) { range.start = uffdio_continue.range.start; wake_userfault(ctx, &range); } ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN; out: return ret; } static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg) { __s64 ret; struct uffdio_poison uffdio_poison; struct uffdio_poison __user *user_uffdio_poison; struct userfaultfd_wake_range range; user_uffdio_poison = (struct uffdio_poison __user *)arg; ret = -EAGAIN; if (unlikely(atomic_read(&ctx->mmap_changing))) { if (unlikely(put_user(ret, &user_uffdio_poison->updated))) return -EFAULT; goto out; } ret = -EFAULT; if (copy_from_user(&uffdio_poison, user_uffdio_poison, /* don't copy the output fields */ sizeof(uffdio_poison) - (sizeof(__s64)))) goto out; ret = validate_range(ctx->mm, uffdio_poison.range.start, uffdio_poison.range.len); if (ret) goto out; ret = -EINVAL; if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE) goto out; if (mmget_not_zero(ctx->mm)) { ret = mfill_atomic_poison(ctx, uffdio_poison.range.start, uffdio_poison.range.len, 0); mmput(ctx->mm); } else { return -ESRCH; } if (unlikely(put_user(ret, &user_uffdio_poison->updated))) return -EFAULT; if (ret < 0) goto out; /* len == 0 would wake all */ VM_WARN_ON_ONCE(!ret); range.len = ret; if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) { range.start = uffdio_poison.range.start; wake_userfault(ctx, &range); } ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN; out: return ret; } bool userfaultfd_wp_async(struct vm_area_struct *vma) { return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx); } static inline unsigned int uffd_ctx_features(__u64 user_features) { /* * For the current set of features the bits just coincide. Set * UFFD_FEATURE_INITIALIZED to mark the features as enabled. */ return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED; } static int userfaultfd_move(struct userfaultfd_ctx *ctx, unsigned long arg) { __s64 ret; struct uffdio_move uffdio_move; struct uffdio_move __user *user_uffdio_move; struct userfaultfd_wake_range range; struct mm_struct *mm = ctx->mm; user_uffdio_move = (struct uffdio_move __user *) arg; ret = -EAGAIN; if (unlikely(atomic_read(&ctx->mmap_changing))) { if (unlikely(put_user(ret, &user_uffdio_move->move))) return -EFAULT; goto out; } if (copy_from_user(&uffdio_move, user_uffdio_move, /* don't copy "move" last field */ sizeof(uffdio_move)-sizeof(__s64))) return -EFAULT; /* Do not allow cross-mm moves. */ if (mm != current->mm) return -EINVAL; ret = validate_range(mm, uffdio_move.dst, uffdio_move.len); if (ret) return ret; ret = validate_range(mm, uffdio_move.src, uffdio_move.len); if (ret) return ret; if (uffdio_move.mode & ~(UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES| UFFDIO_MOVE_MODE_DONTWAKE)) return -EINVAL; if (mmget_not_zero(mm)) { ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src, uffdio_move.len, uffdio_move.mode); mmput(mm); } else { return -ESRCH; } if (unlikely(put_user(ret, &user_uffdio_move->move))) return -EFAULT; if (ret < 0) goto out; /* len == 0 would wake all */ VM_WARN_ON(!ret); range.len = ret; if (!(uffdio_move.mode & UFFDIO_MOVE_MODE_DONTWAKE)) { range.start = uffdio_move.dst; wake_userfault(ctx, &range); } ret = range.len == uffdio_move.len ? 
0 : -EAGAIN; out: return ret; } /* * userland asks for a certain API version and we return which bits * and ioctl commands are implemented in this kernel for such API * version or -EINVAL if unknown. */ static int userfaultfd_api(struct userfaultfd_ctx *ctx, unsigned long arg) { struct uffdio_api uffdio_api; void __user *buf = (void __user *)arg; unsigned int ctx_features; int ret; __u64 features; ret = -EFAULT; if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) goto out; features = uffdio_api.features; ret = -EINVAL; if (uffdio_api.api != UFFD_API) goto err_out; ret = -EPERM; if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE)) goto err_out; /* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */ if (features & UFFD_FEATURE_WP_ASYNC) features |= UFFD_FEATURE_WP_UNPOPULATED; /* report all available features and ioctls to userland */ uffdio_api.features = UFFD_API_FEATURES; #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR uffdio_api.features &= ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM); #endif if (!pgtable_supports_uffd_wp()) uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP; if (!uffd_supports_wp_marker()) { uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC; } ret = -EINVAL; if (features & ~uffdio_api.features) goto err_out; uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) goto out; /* only enable the requested features for this uffd context */ ctx_features = uffd_ctx_features(features); ret = -EINVAL; if (cmpxchg(&ctx->features, 0, ctx_features) != 0) goto err_out; ret = 0; out: return ret; err_out: memset(&uffdio_api, 0, sizeof(uffdio_api)); if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) ret = -EFAULT; goto out; } static long userfaultfd_ioctl(struct file *file, unsigned cmd, unsigned long arg) { int ret = -EINVAL; struct userfaultfd_ctx *ctx = file->private_data; if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx)) return -EINVAL; switch(cmd) { case UFFDIO_API: ret = userfaultfd_api(ctx, arg); break; case UFFDIO_REGISTER: ret = userfaultfd_register(ctx, arg); break; case UFFDIO_UNREGISTER: ret = userfaultfd_unregister(ctx, arg); break; case UFFDIO_WAKE: ret = userfaultfd_wake(ctx, arg); break; case UFFDIO_COPY: ret = userfaultfd_copy(ctx, arg); break; case UFFDIO_ZEROPAGE: ret = userfaultfd_zeropage(ctx, arg); break; case UFFDIO_MOVE: ret = userfaultfd_move(ctx, arg); break; case UFFDIO_WRITEPROTECT: ret = userfaultfd_writeprotect(ctx, arg); break; case UFFDIO_CONTINUE: ret = userfaultfd_continue(ctx, arg); break; case UFFDIO_POISON: ret = userfaultfd_poison(ctx, arg); break; } return ret; } #ifdef CONFIG_PROC_FS static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) { struct userfaultfd_ctx *ctx = f->private_data; wait_queue_entry_t *wq; unsigned long pending = 0, total = 0; spin_lock_irq(&ctx->fault_pending_wqh.lock); list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) { pending++; total++; } list_for_each_entry(wq, &ctx->fault_wqh.head, entry) { total++; } spin_unlock_irq(&ctx->fault_pending_wqh.lock); /* * If more protocols will be added, there will be all shown * separated by a space. Like this: * protocols: aa:... bb:... 
*/ seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n", pending, total, UFFD_API, ctx->features, UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS); } #endif static const struct file_operations userfaultfd_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = userfaultfd_show_fdinfo, #endif .release = userfaultfd_release, .poll = userfaultfd_poll, .read_iter = userfaultfd_read_iter, .unlocked_ioctl = userfaultfd_ioctl, .compat_ioctl = compat_ptr_ioctl, .llseek = noop_llseek, }; static void init_once_userfaultfd_ctx(void *mem) { struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem; init_waitqueue_head(&ctx->fault_pending_wqh); init_waitqueue_head(&ctx->fault_wqh); init_waitqueue_head(&ctx->event_wqh); init_waitqueue_head(&ctx->fd_wqh); seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock); } static int new_userfaultfd(int flags) { struct userfaultfd_ctx *ctx __free(kfree) = NULL; VM_WARN_ON_ONCE(!current->mm); /* Check the UFFD_* constants for consistency. */ BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS); if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY)) return -EINVAL; ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); if (!ctx) return -ENOMEM; refcount_set(&ctx->refcount, 1); ctx->flags = flags; ctx->features = 0; ctx->released = false; init_rwsem(&ctx->map_changing_lock); atomic_set(&ctx->mmap_changing, 0); ctx->mm = current->mm; FD_PREPARE(fdf, flags & UFFD_SHARED_FCNTL_FLAGS, anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx, O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL)); if (fdf.err) return fdf.err; /* prevent the mm struct to be freed */ mmgrab(ctx->mm); fd_prepare_file(fdf)->f_mode |= FMODE_NOWAIT; retain_and_null_ptr(ctx); return fd_publish(fdf); } static inline bool userfaultfd_syscall_allowed(int flags) { /* Userspace-only page faults are always allowed */ if (flags & UFFD_USER_MODE_ONLY) return true; /* * The user is requesting a userfaultfd which can handle kernel faults. * Privileged users are always allowed to do this. */ if (capable(CAP_SYS_PTRACE)) return true; /* Otherwise, access to kernel fault handling is sysctl controlled. */ return sysctl_unprivileged_userfaultfd; } SYSCALL_DEFINE1(userfaultfd, int, flags) { if (!userfaultfd_syscall_allowed(flags)) return -EPERM; return new_userfaultfd(flags); } static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags) { if (cmd != USERFAULTFD_IOC_NEW) return -EINVAL; return new_userfaultfd(flags); } static const struct file_operations userfaultfd_dev_fops = { .unlocked_ioctl = userfaultfd_dev_ioctl, .compat_ioctl = userfaultfd_dev_ioctl, .owner = THIS_MODULE, .llseek = noop_llseek, }; static struct miscdevice userfaultfd_misc = { .minor = MISC_DYNAMIC_MINOR, .name = "userfaultfd", .fops = &userfaultfd_dev_fops }; static int __init userfaultfd_init(void) { int ret; ret = misc_register(&userfaultfd_misc); if (ret) return ret; userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache", sizeof(struct userfaultfd_ctx), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, init_once_userfaultfd_ctx); #ifdef CONFIG_SYSCTL register_sysctl_init("vm", vm_userfaultfd_table); #endif return 0; } __initcall(userfaultfd_init); |
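/*
 * Illustrative userspace sketch (not part of fs/userfaultfd.c): the minimal
 * UFFD_API -> UFFDIO_REGISTER -> read() -> UFFDIO_COPY cycle that drives the
 * ioctls implemented above. It assumes a MISSING-mode registration on
 * anonymous memory; uffd_demo() is an invented helper name, error handling is
 * abbreviated, and a real program would run the read()/UFFDIO_COPY loop in a
 * dedicated fault-handling thread.
 */
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static int uffd_demo(size_t page_size)
{
        int uffd;
        char *area, *page;
        struct uffdio_api api = { .api = UFFD_API, .features = 0 };
        struct uffdio_register reg;
        struct uffd_msg msg;
        struct uffdio_copy copy;

        /*
         * Same entry point as SYSCALL_DEFINE1(userfaultfd, ...) above; an
         * unprivileged caller may additionally need UFFD_USER_MODE_ONLY,
         * depending on vm.unprivileged_userfaultfd.
         */
        uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
        if (uffd < 0)
                return -1;

        /* Handshake handled by userfaultfd_api(); must precede anything else. */
        if (ioctl(uffd, UFFDIO_API, &api))
                return -1;

        area = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        page = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (area == MAP_FAILED || page == MAP_FAILED)
                return -1;
        memset(page, 0xaa, page_size);

        /* Serviced by userfaultfd_register() above. */
        reg.range.start = (unsigned long)area;
        reg.range.len = page_size;
        reg.mode = UFFDIO_REGISTER_MODE_MISSING;
        if (ioctl(uffd, UFFDIO_REGISTER, &reg))
                return -1;

        /*
         * Once some other thread touches 'area', handle_userfault() queues a
         * message on fault_pending_wqh and this read() returns it.
         */
        if (read(uffd, &msg, sizeof(msg)) != sizeof(msg) ||
            msg.event != UFFD_EVENT_PAGEFAULT)
                return -1;

        /* Resolve the MISSING fault; the faulting thread is woken implicitly. */
        copy.dst = msg.arg.pagefault.address & ~(page_size - 1);
        copy.src = (unsigned long)page;
        copy.len = page_size;
        copy.mode = 0;
        copy.copy = 0;
        return ioctl(uffd, UFFDIO_COPY, &copy);
}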
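/*
 * A companion sketch to the one above (again illustrative only): once a
 * range is registered, faults arrive as struct uffd_msg records through
 * read(2) on the uffd (userfaultfd_read_iter() above) and are resolved
 * with the ioctls dispatched by userfaultfd_ioctl(), here UFFDIO_COPY.
 * 'uffd', 'page_buf' and 'page_size' are assumed to have been set up by
 * the caller.
 */
#include <errno.h>
#include <linux/userfaultfd.h>
#include <poll.h>
#include <sys/ioctl.h>
#include <unistd.h>

static void service_faults(int uffd, void *page_buf, unsigned long page_size)
{
	for (;;) {
		struct pollfd pfd = { .fd = uffd, .events = POLLIN };
		struct uffd_msg msg;

		if (poll(&pfd, 1, -1) < 0)
			break;
		if (read(uffd, &msg, sizeof(msg)) != sizeof(msg))
			continue;
		if (msg.event != UFFD_EVENT_PAGEFAULT)
			continue;

		/* Fill the faulting page from prepared contents; UFFDIO_COPY
		 * atomically maps the page and wakes the faulting thread. */
		struct uffdio_copy copy = {
			.dst = msg.arg.pagefault.address & ~(page_size - 1),
			.src = (unsigned long)page_buf,
			.len = page_size,
			.mode = 0,
		};
		if (ioctl(uffd, UFFDIO_COPY, &copy) && copy.copy != -EEXIST)
			break;
	}
}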
894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 1994 Linus Torvalds * * Pentium III FXSR, SSE support * General FPU state handling cleanups * Gareth Hughes <gareth@valinux.com>, May 2000 */ #include <asm/fpu/api.h> #include <asm/fpu/regset.h> #include <asm/fpu/sched.h> #include <asm/fpu/signal.h> #include <asm/fpu/types.h> #include <asm/msr.h> #include <asm/traps.h> #include <asm/irq_regs.h> #include <uapi/asm/kvm.h> #include <linux/hardirq.h> #include <linux/kvm_types.h> #include <linux/pkeys.h> #include <linux/vmalloc.h> #include "context.h" #include "internal.h" #include "legacy.h" #include "xstate.h" #define CREATE_TRACE_POINTS #include <asm/trace/fpu.h> #ifdef CONFIG_X86_64 DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); DEFINE_PER_CPU(u64, xfd_state); #endif /* The FPU state configuration data for kernel and user space */ struct fpu_state_config fpu_kernel_cfg __ro_after_init; struct fpu_state_config fpu_user_cfg __ro_after_init; struct vcpu_fpu_config guest_default_cfg __ro_after_init; /* * Represents the initial FPU state. It's mostly (but not completely) zeroes, * depending on the FPU hardware format: */ struct fpstate init_fpstate __ro_after_init; /* * Track FPU initialization and kernel-mode usage. 'true' means the FPU is * initialized and is not currently being used by the kernel: */ DEFINE_PER_CPU(bool, kernel_fpu_allowed); /* * Track which context is using the FPU on the CPU: */ DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); #ifdef CONFIG_X86_DEBUG_FPU struct fpu *x86_task_fpu(struct task_struct *task) { if (WARN_ON_ONCE(task->flags & PF_KTHREAD)) return NULL; return (void *)task + sizeof(*task); } #endif /* * Can we use the FPU in kernel mode with the * whole "kernel_fpu_begin/end()" sequence? */ bool irq_fpu_usable(void) { if (WARN_ON_ONCE(in_nmi())) return false; /* * Return false in the following cases: * * - FPU is not yet initialized. This can happen only when the call is * coming from CPU onlining, for example for microcode checksumming. * - The kernel is already using the FPU, either because of explicit * nesting (which should never be done), or because of implicit * nesting when a hardirq interrupted a kernel-mode FPU section. * * The single boolean check below handles both cases: */ if (!this_cpu_read(kernel_fpu_allowed)) return false; /* * When not in NMI or hard interrupt context, FPU can be used in: * * - Task context except from within fpregs_lock()'ed critical * regions. * * - Soft interrupt processing context which cannot happen * while in a fpregs_lock()'ed critical region. */ if (!in_hardirq()) return true; /* * In hard interrupt context it's safe when soft interrupts * are enabled, which means the interrupt did not hit in * a fpregs_lock()'ed critical region. */ return !softirq_count(); } EXPORT_SYMBOL(irq_fpu_usable); /* * Track AVX512 state use because it is known to slow the max clock * speed of the core. 
*/ static void update_avx_timestamp(struct fpu *fpu) { #define AVX512_TRACKING_MASK (XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM) if (fpu->fpstate->regs.xsave.header.xfeatures & AVX512_TRACKING_MASK) fpu->avx512_timestamp = jiffies; } /* * Save the FPU register state in fpu->fpstate->regs. The register state is * preserved. * * Must be called with fpregs_lock() held. * * The legacy FNSAVE instruction clears all FPU state unconditionally, so * register state has to be reloaded. That might be a pointless exercise * when the FPU is going to be used by another task right after that. But * this only affects 20+ years old 32bit systems and avoids conditionals all * over the place. * * FXSAVE and all XSAVE variants preserve the FPU register state. */ void save_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { os_xsave(fpu->fpstate); update_avx_timestamp(fpu); return; } if (likely(use_fxsr())) { fxsave(&fpu->fpstate->regs.fxsave); return; } /* * Legacy FPU register saving, FNSAVE always clears FPU registers, * so we have to reload them from the memory state. */ asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->fpstate->regs.fsave)); frstor(&fpu->fpstate->regs.fsave); } void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) { /* * AMD K7/K8 and later CPUs up to Zen don't save/restore * FDP/FIP/FOP unless an exception is pending. Clear the x87 state * here by setting it to fixed values. "m" is a random variable * that should be in L1. */ if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) { asm volatile( "fnclex\n\t" "emms\n\t" "fildl %[addr]" /* set F?P to defined value */ : : [addr] "m" (*fpstate)); } if (use_xsave()) { /* * Dynamically enabled features are enabled in XCR0, but * usage requires also that the corresponding bits in XFD * are cleared. If the bits are set then using a related * instruction will raise #NM. This allows to do the * allocation of the larger FPU buffer lazy from #NM or if * the task has no permission to kill it which would happen * via #UD if the feature is disabled in XCR0. * * XFD state is following the same life time rules as * XSTATE and to restore state correctly XFD has to be * updated before XRSTORS otherwise the component would * stay in or go into init state even if the bits are set * in fpstate::regs::xsave::xfeatures. */ xfd_update_state(fpstate); /* * Restoring state always needs to modify all features * which are in @mask even if the current task cannot use * extended features. * * So fpstate->xfeatures cannot be used here, because then * a feature for which the task has no permission but was * used by the previous task would not go into init state. */ mask = fpu_kernel_cfg.max_features & mask; os_xrstor(fpstate, mask); } else { if (use_fxsr()) fxrstor(&fpstate->regs.fxsave); else frstor(&fpstate->regs.fsave); } } void fpu_reset_from_exception_fixup(void) { restore_fpregs_from_fpstate(&init_fpstate, XFEATURE_MASK_FPSTATE); } #if IS_ENABLED(CONFIG_KVM) static void __fpstate_reset(struct fpstate *fpstate); static void fpu_lock_guest_permissions(void) { struct fpu_state_perm *fpuperm; u64 perm; if (!IS_ENABLED(CONFIG_X86_64)) return; spin_lock_irq(¤t->sighand->siglock); fpuperm = &x86_task_fpu(current->group_leader)->guest_perm; perm = fpuperm->__state_perm; /* First fpstate allocation locks down permissions. 
*/ WRITE_ONCE(fpuperm->__state_perm, perm | FPU_GUEST_PERM_LOCKED); spin_unlock_irq(¤t->sighand->siglock); } bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) { struct fpstate *fpstate; unsigned int size; size = guest_default_cfg.size + ALIGN(offsetof(struct fpstate, regs), 64); fpstate = vzalloc(size); if (!fpstate) return false; /* Initialize indicators to reflect properties of the fpstate */ fpstate->is_valloc = true; fpstate->is_guest = true; __fpstate_reset(fpstate); fpstate_init_user(fpstate); gfpu->fpstate = fpstate; gfpu->xfeatures = guest_default_cfg.features; /* * KVM sets the FP+SSE bits in the XSAVE header when copying FPU state * to userspace, even when XSAVE is unsupported, so that restoring FPU * state on a different CPU that does support XSAVE can cleanly load * the incoming state using its natural XSAVE. In other words, KVM's * uABI size may be larger than this host's default size. Conversely, * the default size should never be larger than KVM's base uABI size; * all features that can expand the uABI size must be opt-in. */ gfpu->uabi_size = sizeof(struct kvm_xsave); if (WARN_ON_ONCE(fpu_user_cfg.default_size > gfpu->uabi_size)) gfpu->uabi_size = fpu_user_cfg.default_size; fpu_lock_guest_permissions(); return true; } EXPORT_SYMBOL_FOR_KVM(fpu_alloc_guest_fpstate); void fpu_free_guest_fpstate(struct fpu_guest *gfpu) { struct fpstate *fpstate = gfpu->fpstate; if (!fpstate) return; if (WARN_ON_ONCE(!fpstate->is_valloc || !fpstate->is_guest || fpstate->in_use)) return; gfpu->fpstate = NULL; vfree(fpstate); } EXPORT_SYMBOL_FOR_KVM(fpu_free_guest_fpstate); /* * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable * @guest_fpu: Pointer to the guest FPU container * @xfeatures: Features requested by guest CPUID * * Enable all dynamic xfeatures according to guest perm and requested CPUID. * * Return: 0 on success, error code otherwise */ int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures) { lockdep_assert_preemption_enabled(); /* Nothing to do if all requested features are already enabled. */ xfeatures &= ~guest_fpu->xfeatures; if (!xfeatures) return 0; return __xfd_enable_feature(xfeatures, guest_fpu); } EXPORT_SYMBOL_FOR_KVM(fpu_enable_guest_xfd_features); #ifdef CONFIG_X86_64 void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { struct fpstate *fpstate = guest_fpu->fpstate; fpregs_lock(); /* * KVM's guest ABI is that setting XFD[i]=1 *can* immediately revert the * save state to its initial configuration. Likewise, KVM_GET_XSAVE does * the same as XSAVE and returns XSTATE_BV[i]=0 whenever XFD[i]=1. * * If the guest's FPU state is in hardware, just update XFD: the XSAVE * in fpu_swap_kvm_fpstate will clear XSTATE_BV[i] whenever XFD[i]=1. * * If however the guest's FPU state is NOT resident in hardware, clear * disabled components in XSTATE_BV now, or a subsequent XRSTOR will * attempt to load disabled components and generate #NM _in the host_. */ if (xfd && test_thread_flag(TIF_NEED_FPU_LOAD)) fpstate->regs.xsave.header.xfeatures &= ~xfd; fpstate->xfd = xfd; if (fpstate->in_use) xfd_update_state(fpstate); fpregs_unlock(); } EXPORT_SYMBOL_FOR_KVM(fpu_update_guest_xfd); /** * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state * * Must be invoked from KVM after a VMEXIT before enabling interrupts when * XFD write emulation is disabled. This is required because the guest can * freely modify XFD and the state at VMEXIT is not guaranteed to be the * same as the state on VMENTER. 
So software state has to be updated before * any operation which depends on it can take place. * * Note: It can be invoked unconditionally even when write emulation is * enabled for the price of a then pointless MSR read. */ void fpu_sync_guest_vmexit_xfd_state(void) { struct fpstate *fpstate = x86_task_fpu(current)->fpstate; lockdep_assert_irqs_disabled(); if (fpu_state_size_dynamic()) { rdmsrq(MSR_IA32_XFD, fpstate->xfd); __this_cpu_write(xfd_state, fpstate->xfd); } } EXPORT_SYMBOL_FOR_KVM(fpu_sync_guest_vmexit_xfd_state); #endif /* CONFIG_X86_64 */ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) { struct fpstate *guest_fps = guest_fpu->fpstate; struct fpu *fpu = x86_task_fpu(current); struct fpstate *cur_fps = fpu->fpstate; fpregs_lock(); if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD)) save_fpregs_to_fpstate(fpu); /* Swap fpstate */ if (enter_guest) { fpu->__task_fpstate = cur_fps; fpu->fpstate = guest_fps; guest_fps->in_use = true; } else { guest_fps->in_use = false; fpu->fpstate = fpu->__task_fpstate; fpu->__task_fpstate = NULL; } cur_fps = fpu->fpstate; if (!cur_fps->is_confidential) { /* Includes XFD update */ restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE); } else { /* * XSTATE is restored by firmware from encrypted * memory. Make sure XFD state is correct while * running with guest fpstate */ xfd_update_state(cur_fps); } fpregs_mark_activate(); fpregs_unlock(); return 0; } EXPORT_SYMBOL_FOR_KVM(fpu_swap_kvm_fpstate); void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u64 xfeatures, u32 pkru) { struct fpstate *kstate = gfpu->fpstate; union fpregs_state *ustate = buf; struct membuf mb = { .p = buf, .left = size }; if (cpu_feature_enabled(X86_FEATURE_XSAVE)) { __copy_xstate_to_uabi_buf(mb, kstate, xfeatures, pkru, XSTATE_COPY_XSAVE); } else { memcpy(&ustate->fxsave, &kstate->regs.fxsave, sizeof(ustate->fxsave)); /* Make it restorable on a XSAVE enabled host */ ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE; } } EXPORT_SYMBOL_FOR_KVM(fpu_copy_guest_fpstate_to_uabi); int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru) { struct fpstate *kstate = gfpu->fpstate; const union fpregs_state *ustate = buf; if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) { if (ustate->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE) return -EINVAL; if (ustate->fxsave.mxcsr & ~mxcsr_feature_mask) return -EINVAL; memcpy(&kstate->regs.fxsave, &ustate->fxsave, sizeof(ustate->fxsave)); return 0; } if (ustate->xsave.header.xfeatures & ~xcr0) return -EINVAL; /* * Disabled features must be in their initial state, otherwise XRSTOR * causes an exception. */ if (WARN_ON_ONCE(ustate->xsave.header.xfeatures & kstate->xfd)) return -EINVAL; /* * Nullify @vpkru to preserve its current value if PKRU's bit isn't set * in the header. KVM's odd ABI is to leave PKRU untouched in this * case (all other components are eventually re-initialized). 
*/ if (!(ustate->xsave.header.xfeatures & XFEATURE_MASK_PKRU)) vpkru = NULL; return copy_uabi_from_kernel_to_xstate(kstate, ustate, vpkru); } EXPORT_SYMBOL_FOR_KVM(fpu_copy_uabi_to_guest_fpstate); #endif /* CONFIG_KVM */ void kernel_fpu_begin_mask(unsigned int kfpu_mask) { if (!irqs_disabled()) fpregs_lock(); WARN_ON_FPU(!irq_fpu_usable()); /* Toggle kernel_fpu_allowed to false: */ WARN_ON_FPU(!this_cpu_read(kernel_fpu_allowed)); this_cpu_write(kernel_fpu_allowed, false); if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) && !test_thread_flag(TIF_NEED_FPU_LOAD)) { set_thread_flag(TIF_NEED_FPU_LOAD); save_fpregs_to_fpstate(x86_task_fpu(current)); } __cpu_invalidate_fpregs_state(); /* Put sane initial values into the control registers. */ if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM)) ldmxcsr(MXCSR_DEFAULT); if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU)) asm volatile ("fninit"); } EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask); void kernel_fpu_end(void) { /* Toggle kernel_fpu_allowed back to true: */ WARN_ON_FPU(this_cpu_read(kernel_fpu_allowed)); this_cpu_write(kernel_fpu_allowed, true); if (!irqs_disabled()) fpregs_unlock(); } EXPORT_SYMBOL_GPL(kernel_fpu_end); /* * Sync the FPU register state to current's memory register state when the * current task owns the FPU. The hardware register state is preserved. */ void fpu_sync_fpstate(struct fpu *fpu) { WARN_ON_FPU(fpu != x86_task_fpu(current)); fpregs_lock(); trace_x86_fpu_before_save(fpu); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) save_fpregs_to_fpstate(fpu); trace_x86_fpu_after_save(fpu); fpregs_unlock(); } static inline unsigned int init_fpstate_copy_size(void) { if (!use_xsave()) return fpu_kernel_cfg.default_size; /* XSAVE(S) just needs the legacy and the xstate header part */ return sizeof(init_fpstate.regs.xsave); } static inline void fpstate_init_fxstate(struct fpstate *fpstate) { fpstate->regs.fxsave.cwd = 0x37f; fpstate->regs.fxsave.mxcsr = MXCSR_DEFAULT; } /* * Legacy x87 fpstate state init: */ static inline void fpstate_init_fstate(struct fpstate *fpstate) { fpstate->regs.fsave.cwd = 0xffff037fu; fpstate->regs.fsave.swd = 0xffff0000u; fpstate->regs.fsave.twd = 0xffffffffu; fpstate->regs.fsave.fos = 0xffff0000u; } /* * Used in two places: * 1) Early boot to setup init_fpstate for non XSAVE systems * 2) fpu_alloc_guest_fpstate() which is invoked from KVM */ void fpstate_init_user(struct fpstate *fpstate) { if (!cpu_feature_enabled(X86_FEATURE_FPU)) { fpstate_init_soft(&fpstate->regs.soft); return; } xstate_init_xcomp_bv(&fpstate->regs.xsave, fpstate->xfeatures); if (cpu_feature_enabled(X86_FEATURE_FXSR)) fpstate_init_fxstate(fpstate); else fpstate_init_fstate(fpstate); } static void __fpstate_reset(struct fpstate *fpstate) { /* * Supervisor features (and thus sizes) may diverge between guest * FPUs and host FPUs, as some supervisor features are supported * for guests despite not being utilized by the host. User * features and sizes are always identical, which allows for * common guest and userspace ABI. * * For the host, set XFD to the kernel's desired initialization * value. For guests, set XFD to its architectural RESET value. 
*/ if (fpstate->is_guest) { fpstate->size = guest_default_cfg.size; fpstate->xfeatures = guest_default_cfg.features; fpstate->xfd = 0; } else { fpstate->size = fpu_kernel_cfg.default_size; fpstate->xfeatures = fpu_kernel_cfg.default_features; fpstate->xfd = init_fpstate.xfd; } fpstate->user_size = fpu_user_cfg.default_size; fpstate->user_xfeatures = fpu_user_cfg.default_features; } void fpstate_reset(struct fpu *fpu) { /* Set the fpstate pointer to the default fpstate */ fpu->fpstate = &fpu->__fpstate; __fpstate_reset(fpu->fpstate); /* Initialize the permission related info in fpu */ fpu->perm.__state_perm = fpu_kernel_cfg.default_features; fpu->perm.__state_size = fpu_kernel_cfg.default_size; fpu->perm.__user_state_size = fpu_user_cfg.default_size; fpu->guest_perm.__state_perm = guest_default_cfg.features; fpu->guest_perm.__state_size = guest_default_cfg.size; /* * User features and sizes are always identical between host and * guest FPUs, which allows for common guest and userspace ABI. */ fpu->guest_perm.__user_state_size = fpu_user_cfg.default_size; } static inline void fpu_inherit_perms(struct fpu *dst_fpu) { if (fpu_state_size_dynamic()) { struct fpu *src_fpu = x86_task_fpu(current->group_leader); spin_lock_irq(¤t->sighand->siglock); /* Fork also inherits the permissions of the parent */ dst_fpu->perm = src_fpu->perm; dst_fpu->guest_perm = src_fpu->guest_perm; spin_unlock_irq(¤t->sighand->siglock); } } /* A passed ssp of zero will not cause any update */ static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp) { #ifdef CONFIG_X86_USER_SHADOW_STACK struct cet_user_state *xstate; /* If ssp update is not needed. */ if (!ssp) return 0; xstate = get_xsave_addr(&x86_task_fpu(dst)->fpstate->regs.xsave, XFEATURE_CET_USER); /* * If there is a non-zero ssp, then 'dst' must be configured with a shadow * stack and the fpu state should be up to date since it was just copied * from the parent in fpu_clone(). So there must be a valid non-init CET * state location in the buffer. */ if (WARN_ON_ONCE(!xstate)) return 1; xstate->user_ssp = (u64)ssp; #endif return 0; } /* Clone current's FPU state on fork */ int fpu_clone(struct task_struct *dst, u64 clone_flags, bool minimal, unsigned long ssp) { /* * We allocate the new FPU structure right after the end of the task struct. * task allocation size already took this into account. * * This is safe because task_struct size is a multiple of cacheline size, * thus x86_task_fpu() will always be cacheline aligned as well. */ struct fpu *dst_fpu = (void *)dst + sizeof(*dst); BUILD_BUG_ON(sizeof(*dst) % SMP_CACHE_BYTES != 0); /* The new task's FPU state cannot be valid in the hardware. */ dst_fpu->last_cpu = -1; fpstate_reset(dst_fpu); if (!cpu_feature_enabled(X86_FEATURE_FPU)) return 0; /* * Enforce reload for user space tasks and prevent kernel threads * from trying to save the FPU registers on context switch. */ set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); /* * No FPU state inheritance for kernel threads and IO * worker threads. */ if (minimal) { /* Clear out the minimal state */ memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); return 0; } /* * If a new feature is added, ensure all dynamic features are * caller-saved from here! */ BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA); /* * Save the default portion of the current FPU state into the * clone. 
Assume all dynamic features to be defined as caller- * saved, which enables skipping both the expansion of fpstate * and the copying of any dynamic state. * * Do not use memcpy() when TIF_NEED_FPU_LOAD is set because * copying is not valid when current uses non-default states. */ fpregs_lock(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) fpregs_restore_userregs(); save_fpregs_to_fpstate(dst_fpu); fpregs_unlock(); if (!(clone_flags & CLONE_THREAD)) fpu_inherit_perms(dst_fpu); /* * Children never inherit PASID state. * Force it to have its init value: */ if (use_xsave()) dst_fpu->fpstate->regs.xsave.header.xfeatures &= ~XFEATURE_MASK_PASID; /* * Update shadow stack pointer, in case it changed during clone. */ if (update_fpu_shstk(dst, ssp)) return 1; trace_x86_fpu_copy_dst(dst_fpu); return 0; } /* * While struct fpu is no longer part of struct thread_struct, it is still * allocated after struct task_struct in the "task_struct" kmem cache. But * since FPU is expected to be part of struct thread_struct, we have to * adjust for it here. */ void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { /* The allocation follows struct task_struct. */ *offset = sizeof(struct task_struct) - offsetof(struct task_struct, thread); *offset += offsetof(struct fpu, __fpstate.regs); *size = fpu_kernel_cfg.default_size; } /* * Drops current FPU state: deactivates the fpregs and * the fpstate. NOTE: it still leaves previous contents * in the fpregs in the eager-FPU case. * * This function can be used in cases where we know that * a state-restore is coming: either an explicit one, * or a reschedule. */ void fpu__drop(struct task_struct *tsk) { struct fpu *fpu; if (test_tsk_thread_flag(tsk, TIF_NEED_FPU_LOAD)) return; fpu = x86_task_fpu(tsk); preempt_disable(); if (fpu == x86_task_fpu(current)) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" "2:\n" _ASM_EXTABLE(1b, 2b)); fpregs_deactivate(fpu); } trace_x86_fpu_dropped(fpu); preempt_enable(); } /* * Clear FPU registers by setting them up from the init fpstate. * Caller must do fpregs_[un]lock() around it. */ static inline void restore_fpregs_from_init_fpstate(u64 features_mask) { if (use_xsave()) os_xrstor(&init_fpstate, features_mask); else if (use_fxsr()) fxrstor(&init_fpstate.regs.fxsave); else frstor(&init_fpstate.regs.fsave); pkru_write_default(); } /* * Reset current->fpu memory state to the init values. */ static void fpu_reset_fpstate_regs(void) { struct fpu *fpu = x86_task_fpu(current); fpregs_lock(); __fpu_invalidate_fpregs_state(fpu); /* * This does not change the actual hardware registers. It just * resets the memory image and sets TIF_NEED_FPU_LOAD so a * subsequent return to usermode will reload the registers from the * task's memory image. * * Do not use fpstate_init() here. Just copy init_fpstate which has * the correct content already except for PKRU. * * PKRU handling does not rely on the xstate when restoring for * user space as PKRU is eagerly written in switch_to() and * flush_thread(). */ memcpy(&fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); set_thread_flag(TIF_NEED_FPU_LOAD); fpregs_unlock(); } /* * Reset current's user FPU states to the init states. current's * supervisor states, if any, are not modified by this function. The * caller guarantees that the XSTATE header in memory is intact. 
*/ void fpu__clear_user_states(struct fpu *fpu) { WARN_ON_FPU(fpu != x86_task_fpu(current)); fpregs_lock(); if (!cpu_feature_enabled(X86_FEATURE_FPU)) { fpu_reset_fpstate_regs(); fpregs_unlock(); return; } /* * Ensure that current's supervisor states are loaded into their * corresponding registers. */ if (xfeatures_mask_supervisor() && !fpregs_state_valid(fpu, smp_processor_id())) os_xrstor_supervisor(fpu->fpstate); /* Ensure XFD state is in sync before reloading XSTATE */ xfd_update_state(fpu->fpstate); /* Reset user states in registers. */ restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE); /* * Now all FPU registers have their desired values. Inform the FPU * state machine that current's FPU registers are in the hardware * registers. The memory image does not need to be updated because * any operation relying on it has to save the registers first when * current's FPU is marked active. */ fpregs_mark_activate(); fpregs_unlock(); } void fpu_flush_thread(void) { fpstate_reset(x86_task_fpu(current)); fpu_reset_fpstate_regs(); } /* * Load FPU context before returning to userspace. */ void switch_fpu_return(void) { if (!static_cpu_has(X86_FEATURE_FPU)) return; fpregs_restore_userregs(); } EXPORT_SYMBOL_FOR_KVM(switch_fpu_return); void fpregs_lock_and_load(void) { /* * fpregs_lock() only disables preemption (mostly). So modifying state * in an interrupt could screw up some in progress fpregs operation. * Warn about it. */ WARN_ON_ONCE(!irq_fpu_usable()); WARN_ON_ONCE(current->flags & PF_KTHREAD); fpregs_lock(); fpregs_assert_state_consistent(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) fpregs_restore_userregs(); } #ifdef CONFIG_X86_DEBUG_FPU /* * If current FPU state according to its tracking (loaded FPU context on this * CPU) is not valid then we must have TIF_NEED_FPU_LOAD set so the context is * loaded on return to userland. */ void fpregs_assert_state_consistent(void) { struct fpu *fpu = x86_task_fpu(current); if (test_thread_flag(TIF_NEED_FPU_LOAD)) return; WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id())); } EXPORT_SYMBOL_FOR_KVM(fpregs_assert_state_consistent); #endif void fpregs_mark_activate(void) { struct fpu *fpu = x86_task_fpu(current); fpregs_activate(fpu); fpu->last_cpu = smp_processor_id(); clear_thread_flag(TIF_NEED_FPU_LOAD); } /* * x87 math exception handling: */ int fpu__exception_code(struct fpu *fpu, int trap_nr) { int err; if (trap_nr == X86_TRAP_MF) { unsigned short cwd, swd; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked * status. 0x3f is the exception bits in these regs, 0x200 is the * C1 reg you need in case of a stack fault, 0x040 is the stack * fault bit. We should only be taking one exception at a time, * so if this combination doesn't produce any single exception, * then we have a bad program that isn't synchronizing its FPU usage * and it will suffer the consequences since we won't be able to * fully reproduce the context of the exception. */ if (boot_cpu_has(X86_FEATURE_FXSR)) { cwd = fpu->fpstate->regs.fxsave.cwd; swd = fpu->fpstate->regs.fxsave.swd; } else { cwd = (unsigned short)fpu->fpstate->regs.fsave.cwd; swd = (unsigned short)fpu->fpstate->regs.fsave.swd; } err = swd & ~cwd; } else { /* * The SIMD FPU exceptions are handled a little differently, as there * is only a single status/control register. Thus, to determine which * unmasked exception was caught we must mask the exception mask bits * at 0x1f80, and then use these to mask the exception bits at 0x3f. 
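 *
 * For instance, with the default MXCSR value 0x1f80 every exception is
 * masked and all status flags are clear, so "~(mxcsr >> 7) & mxcsr" below
 * leaves none of the low six bits set and no signal code is produced. If
 * the invalid-operation mask (bit 7) has been cleared and the IE status
 * flag (bit 0) is set, mxcsr reads 0x1f01, bit 0 survives the masking and
 * the function reports FPE_FLTINV.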
*/ unsigned short mxcsr = MXCSR_DEFAULT; if (boot_cpu_has(X86_FEATURE_XMM)) mxcsr = fpu->fpstate->regs.fxsave.mxcsr; err = ~(mxcsr >> 7) & mxcsr; } if (err & 0x001) { /* Invalid op */ /* * swd & 0x240 == 0x040: Stack Underflow * swd & 0x240 == 0x240: Stack Overflow * User must clear the SF bit (0x40) if set */ return FPE_FLTINV; } else if (err & 0x004) { /* Divide by Zero */ return FPE_FLTDIV; } else if (err & 0x008) { /* Overflow */ return FPE_FLTOVF; } else if (err & 0x012) { /* Denormal, Underflow */ return FPE_FLTUND; } else if (err & 0x020) { /* Precision */ return FPE_FLTRES; } /* * If we're using IRQ 13, or supposedly even some trap * X86_TRAP_MF implementations, it's possible * we get a spurious trap, which is not an error. */ return 0; } /* * Initialize register state that may prevent from entering low-power idle. * This function will be invoked from the cpuidle driver only when needed. */ noinstr void fpu_idle_fpregs(void) { /* Note: AMX_TILE being enabled implies XGETBV1 support */ if (cpu_feature_enabled(X86_FEATURE_AMX_TILE) && (xfeatures_in_use() & XFEATURE_MASK_XTILE)) { tile_release(); __this_cpu_write(fpu_fpregs_owner_ctx, NULL); } } |
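/*
 * Illustrative in-kernel user of the kernel-mode FPU API defined above
 * (not part of this file): guard SIMD work with irq_fpu_usable() and
 * bracket it with kernel_fpu_begin()/kernel_fpu_end(). checksum() and
 * checksum_scalar() are made-up placeholders.
 */
#include <asm/fpu/api.h>
#include <linux/types.h>

static u32 checksum_scalar(const u8 *buf, size_t len)
{
	u32 sum = 0;

	while (len--)
		sum += *buf++;
	return sum;
}

static u32 checksum(const u8 *buf, size_t len)
{
	u32 sum;

	/* FPU may be unusable here (e.g. a hardirq that interrupted another
	 * kernel-mode FPU section); fall back to integer code in that case. */
	if (!irq_fpu_usable())
		return checksum_scalar(buf, len);

	kernel_fpu_begin();
	/*
	 * XMM/YMM registers may be used between begin and end, typically via
	 * inline assembly or intrinsics in a file built with SIMD flags; this
	 * sketch simply reuses the scalar routine. No sleeping in between:
	 * kernel_fpu_begin() disables preemption.
	 */
	sum = checksum_scalar(buf, len);
	kernel_fpu_end();

	return sum;
}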
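/*
 * Userspace-visible counterpart of the dynamic-feature permission logic
 * above (illustrative only): before a process may touch AMX tile data it
 * must request permission, after which the first use traps #NM and the
 * kernel expands the task's fpstate. The constant values below mirror the
 * uapi (asm/prctl.h and the x86 xstate documentation); verify them against
 * the headers in use.
 */
#include <asm/prctl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef ARCH_GET_XCOMP_PERM
#define ARCH_GET_XCOMP_PERM	0x1022
#endif
#ifndef ARCH_REQ_XCOMP_PERM
#define ARCH_REQ_XCOMP_PERM	0x1023
#endif
#define XFEATURE_XTILEDATA	18	/* XSTATE component number for AMX tile data */

int main(void)
{
	unsigned long permitted = 0;

	if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
		perror("ARCH_REQ_XCOMP_PERM");
		return 1;
	}
	if (syscall(SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &permitted))
		return 1;

	printf("dynamically permitted xfeatures: %#lx\n", permitted);
	return 0;
}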
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2002,2003 by Andreas Gruenbacher <a.gruenbacher@computer.org>
 *
 * Fixes from William Schumacher incorporated on 15 March 2001.
 * (Reported by Charles Bertsch, <CBertsch@microtest.com>).
 */

/*
 * This file contains generic functions for manipulating
 * POSIX 1003.1e draft standard 17 ACLs.
*/ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/atomic.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> #include <linux/xattr.h> #include <linux/export.h> #include <linux/user_namespace.h> #include <linux/namei.h> #include <linux/mnt_idmapping.h> #include <linux/iversion.h> #include <linux/security.h> #include <linux/fsnotify.h> #include <linux/filelock.h> #include "internal.h" static struct posix_acl **acl_by_type(struct inode *inode, int type) { switch (type) { case ACL_TYPE_ACCESS: return &inode->i_acl; case ACL_TYPE_DEFAULT: return &inode->i_default_acl; default: BUG(); } } struct posix_acl *get_cached_acl(struct inode *inode, int type) { struct posix_acl **p = acl_by_type(inode, type); struct posix_acl *acl; for (;;) { rcu_read_lock(); acl = rcu_dereference(*p); if (!acl || is_uncached_acl(acl) || refcount_inc_not_zero(&acl->a_refcount)) break; rcu_read_unlock(); cpu_relax(); } rcu_read_unlock(); return acl; } EXPORT_SYMBOL(get_cached_acl); struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type) { struct posix_acl *acl = rcu_dereference(*acl_by_type(inode, type)); if (acl == ACL_DONT_CACHE) { struct posix_acl *ret; ret = inode->i_op->get_inode_acl(inode, type, LOOKUP_RCU); if (!IS_ERR(ret)) acl = ret; } return acl; } EXPORT_SYMBOL(get_cached_acl_rcu); void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl) { struct posix_acl **p = acl_by_type(inode, type); struct posix_acl *old; old = xchg(p, posix_acl_dup(acl)); if (!is_uncached_acl(old)) posix_acl_release(old); } EXPORT_SYMBOL(set_cached_acl); static void __forget_cached_acl(struct posix_acl **p) { struct posix_acl *old; old = xchg(p, ACL_NOT_CACHED); if (!is_uncached_acl(old)) posix_acl_release(old); } void forget_cached_acl(struct inode *inode, int type) { __forget_cached_acl(acl_by_type(inode, type)); } EXPORT_SYMBOL(forget_cached_acl); void forget_all_cached_acls(struct inode *inode) { __forget_cached_acl(&inode->i_acl); __forget_cached_acl(&inode->i_default_acl); } EXPORT_SYMBOL(forget_all_cached_acls); static struct posix_acl *__get_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, int type) { struct posix_acl *sentinel; struct posix_acl **p; struct posix_acl *acl; /* * The sentinel is used to detect when another operation like * set_cached_acl() or forget_cached_acl() races with get_inode_acl(). * It is guaranteed that is_uncached_acl(sentinel) is true. */ acl = get_cached_acl(inode, type); if (!is_uncached_acl(acl)) return acl; if (!IS_POSIXACL(inode)) return NULL; sentinel = uncached_acl_sentinel(current); p = acl_by_type(inode, type); /* * If the ACL isn't being read yet, set our sentinel. Otherwise, the * current value of the ACL will not be ACL_NOT_CACHED and so our own * sentinel will not be set; another task will update the cache. We * could wait for that other task to complete its job, but it's easier * to just call ->get_inode_acl to fetch the ACL ourself. (This is * going to be an unlikely race.) */ cmpxchg(p, ACL_NOT_CACHED, sentinel); /* * Normally, the ACL returned by ->get{_inode}_acl will be cached. * A filesystem can prevent that by calling * forget_cached_acl(inode, type) in ->get{_inode}_acl. * * If the filesystem doesn't have a get{_inode}_ acl() function at all, * we'll just create the negative cache entry. 
*/ if (dentry && inode->i_op->get_acl) { acl = inode->i_op->get_acl(idmap, dentry, type); } else if (inode->i_op->get_inode_acl) { acl = inode->i_op->get_inode_acl(inode, type, false); } else { set_cached_acl(inode, type, NULL); return NULL; } if (IS_ERR(acl)) { /* * Remove our sentinel so that we don't block future attempts * to cache the ACL. */ cmpxchg(p, sentinel, ACL_NOT_CACHED); return acl; } /* * Cache the result, but only if our sentinel is still in place. */ posix_acl_dup(acl); if (unlikely(!try_cmpxchg(p, &sentinel, acl))) posix_acl_release(acl); return acl; } struct posix_acl *get_inode_acl(struct inode *inode, int type) { return __get_acl(&nop_mnt_idmap, NULL, inode, type); } EXPORT_SYMBOL(get_inode_acl); /* * Init a fresh posix_acl */ void posix_acl_init(struct posix_acl *acl, int count) { refcount_set(&acl->a_refcount, 1); acl->a_count = count; } EXPORT_SYMBOL(posix_acl_init); /* * Allocate a new ACL with the specified number of entries. */ struct posix_acl * posix_acl_alloc(unsigned int count, gfp_t flags) { struct posix_acl *acl; acl = kmalloc_flex(*acl, a_entries, count, flags); if (acl) posix_acl_init(acl, count); return acl; } EXPORT_SYMBOL(posix_acl_alloc); /* * Clone an ACL. */ struct posix_acl * posix_acl_clone(const struct posix_acl *acl, gfp_t flags) { struct posix_acl *clone = NULL; if (acl) { clone = kmemdup(acl, struct_size(acl, a_entries, acl->a_count), flags); if (clone) refcount_set(&clone->a_refcount, 1); } return clone; } EXPORT_SYMBOL_GPL(posix_acl_clone); /* * Check if an acl is valid. Returns 0 if it is, or -E... otherwise. */ int posix_acl_valid(struct user_namespace *user_ns, const struct posix_acl *acl) { const struct posix_acl_entry *pa, *pe; int state = ACL_USER_OBJ; int needs_mask = 0; FOREACH_ACL_ENTRY(pa, acl, pe) { if (pa->e_perm & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE)) return -EINVAL; switch (pa->e_tag) { case ACL_USER_OBJ: if (state == ACL_USER_OBJ) { state = ACL_USER; break; } return -EINVAL; case ACL_USER: if (state != ACL_USER) return -EINVAL; if (!kuid_has_mapping(user_ns, pa->e_uid)) return -EINVAL; needs_mask = 1; break; case ACL_GROUP_OBJ: if (state == ACL_USER) { state = ACL_GROUP; break; } return -EINVAL; case ACL_GROUP: if (state != ACL_GROUP) return -EINVAL; if (!kgid_has_mapping(user_ns, pa->e_gid)) return -EINVAL; needs_mask = 1; break; case ACL_MASK: if (state != ACL_GROUP) return -EINVAL; state = ACL_OTHER; break; case ACL_OTHER: if (state == ACL_OTHER || (state == ACL_GROUP && !needs_mask)) { state = 0; break; } return -EINVAL; default: return -EINVAL; } } if (state == 0) return 0; return -EINVAL; } EXPORT_SYMBOL(posix_acl_valid); /* * Returns 0 if the acl can be exactly represented in the traditional * file mode permission bits, or else 1. Returns -E... on error. */ int posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p) { const struct posix_acl_entry *pa, *pe; umode_t mode = 0; int not_equiv = 0; /* * A null ACL can always be presented as mode bits. 
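 * Only the three ACL_USER_OBJ/ACL_GROUP_OBJ/ACL_OTHER entries have a direct
 * mode-bit equivalent; named users, named groups and a mask entry flip
 * not_equiv below. For example "user::rwx, group::r-x, other::r--" maps
 * exactly to mode 0754 and the function reports 0, while adding a
 * "user:joe:rw-" entry makes it report 1.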
*/ if (!acl) return 0; FOREACH_ACL_ENTRY(pa, acl, pe) { switch (pa->e_tag) { case ACL_USER_OBJ: mode |= (pa->e_perm & S_IRWXO) << 6; break; case ACL_GROUP_OBJ: mode |= (pa->e_perm & S_IRWXO) << 3; break; case ACL_OTHER: mode |= pa->e_perm & S_IRWXO; break; case ACL_MASK: mode = (mode & ~S_IRWXG) | ((pa->e_perm & S_IRWXO) << 3); not_equiv = 1; break; case ACL_USER: case ACL_GROUP: not_equiv = 1; break; default: return -EINVAL; } } if (mode_p) *mode_p = (*mode_p & ~S_IRWXUGO) | mode; return not_equiv; } EXPORT_SYMBOL(posix_acl_equiv_mode); /* * Create an ACL representing the file mode permission bits of an inode. */ struct posix_acl * posix_acl_from_mode(umode_t mode, gfp_t flags) { struct posix_acl *acl = posix_acl_alloc(3, flags); if (!acl) return ERR_PTR(-ENOMEM); acl->a_entries[0].e_tag = ACL_USER_OBJ; acl->a_entries[0].e_perm = (mode & S_IRWXU) >> 6; acl->a_entries[1].e_tag = ACL_GROUP_OBJ; acl->a_entries[1].e_perm = (mode & S_IRWXG) >> 3; acl->a_entries[2].e_tag = ACL_OTHER; acl->a_entries[2].e_perm = (mode & S_IRWXO); return acl; } EXPORT_SYMBOL(posix_acl_from_mode); /* * Return 0 if current is granted want access to the inode * by the acl. Returns -E... otherwise. */ int posix_acl_permission(struct mnt_idmap *idmap, struct inode *inode, const struct posix_acl *acl, int want) { const struct posix_acl_entry *pa, *pe, *mask_obj; struct user_namespace *fs_userns = i_user_ns(inode); int found = 0; vfsuid_t vfsuid; vfsgid_t vfsgid; want &= MAY_READ | MAY_WRITE | MAY_EXEC; FOREACH_ACL_ENTRY(pa, acl, pe) { switch(pa->e_tag) { case ACL_USER_OBJ: /* (May have been checked already) */ vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) goto check_perm; break; case ACL_USER: vfsuid = make_vfsuid(idmap, fs_userns, pa->e_uid); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) goto mask; break; case ACL_GROUP_OBJ: vfsgid = i_gid_into_vfsgid(idmap, inode); if (vfsgid_in_group_p(vfsgid)) { found = 1; if ((pa->e_perm & want) == want) goto mask; } break; case ACL_GROUP: vfsgid = make_vfsgid(idmap, fs_userns, pa->e_gid); if (vfsgid_in_group_p(vfsgid)) { found = 1; if ((pa->e_perm & want) == want) goto mask; } break; case ACL_MASK: break; case ACL_OTHER: if (found) return -EACCES; else goto check_perm; default: return -EIO; } } return -EIO; mask: for (mask_obj = pa+1; mask_obj != pe; mask_obj++) { if (mask_obj->e_tag == ACL_MASK) { if ((pa->e_perm & mask_obj->e_perm & want) == want) return 0; return -EACCES; } } check_perm: if ((pa->e_perm & want) == want) return 0; return -EACCES; } /* * Modify acl when creating a new inode. The caller must ensure the acl is * only referenced once. * * mode_p initially must contain the mode parameter to the open() / creat() * system calls. All permissions that are not granted by the acl are removed. * The permissions in the acl are changed to reflect the mode_p parameter. 
*/ static int posix_acl_create_masq(struct posix_acl *acl, umode_t *mode_p) { struct posix_acl_entry *pa, *pe; struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; umode_t mode = *mode_p; int not_equiv = 0; /* assert(atomic_read(acl->a_refcount) == 1); */ FOREACH_ACL_ENTRY(pa, acl, pe) { switch(pa->e_tag) { case ACL_USER_OBJ: pa->e_perm &= (mode >> 6) | ~S_IRWXO; mode &= (pa->e_perm << 6) | ~S_IRWXU; break; case ACL_USER: case ACL_GROUP: not_equiv = 1; break; case ACL_GROUP_OBJ: group_obj = pa; break; case ACL_OTHER: pa->e_perm &= mode | ~S_IRWXO; mode &= pa->e_perm | ~S_IRWXO; break; case ACL_MASK: mask_obj = pa; not_equiv = 1; break; default: return -EIO; } } if (mask_obj) { mask_obj->e_perm &= (mode >> 3) | ~S_IRWXO; mode &= (mask_obj->e_perm << 3) | ~S_IRWXG; } else { if (!group_obj) return -EIO; group_obj->e_perm &= (mode >> 3) | ~S_IRWXO; mode &= (group_obj->e_perm << 3) | ~S_IRWXG; } *mode_p = (*mode_p & ~S_IRWXUGO) | mode; return not_equiv; } /* * Modify the ACL for the chmod syscall. */ static int __posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode) { struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; struct posix_acl_entry *pa, *pe; /* assert(atomic_read(acl->a_refcount) == 1); */ FOREACH_ACL_ENTRY(pa, acl, pe) { switch(pa->e_tag) { case ACL_USER_OBJ: pa->e_perm = (mode & S_IRWXU) >> 6; break; case ACL_USER: case ACL_GROUP: break; case ACL_GROUP_OBJ: group_obj = pa; break; case ACL_MASK: mask_obj = pa; break; case ACL_OTHER: pa->e_perm = (mode & S_IRWXO); break; default: return -EIO; } } if (mask_obj) { mask_obj->e_perm = (mode & S_IRWXG) >> 3; } else { if (!group_obj) return -EIO; group_obj->e_perm = (mode & S_IRWXG) >> 3; } return 0; } int __posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p) { struct posix_acl *clone = posix_acl_clone(*acl, gfp); int err = -ENOMEM; if (clone) { err = posix_acl_create_masq(clone, mode_p); if (err < 0) { posix_acl_release(clone); clone = NULL; } } posix_acl_release(*acl); *acl = clone; return err; } EXPORT_SYMBOL(__posix_acl_create); int __posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) { struct posix_acl *clone = posix_acl_clone(*acl, gfp); int err = -ENOMEM; if (clone) { err = __posix_acl_chmod_masq(clone, mode); if (err) { posix_acl_release(clone); clone = NULL; } } posix_acl_release(*acl); *acl = clone; return err; } EXPORT_SYMBOL(__posix_acl_chmod); /** * posix_acl_chmod - chmod a posix acl * * @idmap: idmap of the mount @inode was found from * @dentry: dentry to check permissions on * @mode: the new mode of @inode * * If the dentry has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. 
*/ int posix_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode) { struct inode *inode = d_inode(dentry); struct posix_acl *acl; int ret = 0; if (!IS_POSIXACL(inode)) return 0; if (!inode->i_op->set_acl) return -EOPNOTSUPP; acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR_OR_NULL(acl)) { if (acl == ERR_PTR(-EOPNOTSUPP)) return 0; return PTR_ERR(acl); } ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); if (ret) return ret; ret = inode->i_op->set_acl(idmap, dentry, acl, ACL_TYPE_ACCESS); posix_acl_release(acl); return ret; } EXPORT_SYMBOL(posix_acl_chmod); int posix_acl_create(struct inode *dir, umode_t *mode, struct posix_acl **default_acl, struct posix_acl **acl) { struct posix_acl *p; struct posix_acl *clone; int ret; *acl = NULL; *default_acl = NULL; if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) return 0; p = get_inode_acl(dir, ACL_TYPE_DEFAULT); if (!p || p == ERR_PTR(-EOPNOTSUPP)) { *mode &= ~current_umask(); return 0; } if (IS_ERR(p)) return PTR_ERR(p); ret = -ENOMEM; clone = posix_acl_clone(p, GFP_NOFS); if (!clone) goto err_release; ret = posix_acl_create_masq(clone, mode); if (ret < 0) goto err_release_clone; if (ret == 0) posix_acl_release(clone); else *acl = clone; if (!S_ISDIR(*mode)) posix_acl_release(p); else *default_acl = p; return 0; err_release_clone: posix_acl_release(clone); err_release: posix_acl_release(p); return ret; } EXPORT_SYMBOL_GPL(posix_acl_create); /** * posix_acl_update_mode - update mode in set_acl * @idmap: idmap of the mount @inode was found from * @inode: target inode * @mode_p: mode (pointer) for update * @acl: acl pointer * * Update the file mode when setting an ACL: compute the new file permission * bits based on the ACL. In addition, if the ACL is equivalent to the new * file mode, set *@acl to NULL to indicate that no ACL should be set. * * As with chmod, clear the setgid bit if the caller is not in the owning group * or capable of CAP_FSETID (see inode_change_ok). * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. * * Called from set_acl inode operations. */ int posix_acl_update_mode(struct mnt_idmap *idmap, struct inode *inode, umode_t *mode_p, struct posix_acl **acl) { umode_t mode = inode->i_mode; int error; error = posix_acl_equiv_mode(*acl, &mode); if (error < 0) return error; if (error == 0) *acl = NULL; if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode))) mode &= ~S_ISGID; *mode_p = mode; return 0; } EXPORT_SYMBOL(posix_acl_update_mode); /* * Fix up the uids and gids in posix acl extended attributes in place. 
*/ static int posix_acl_fix_xattr_common(const void *value, size_t size) { const struct posix_acl_xattr_header *header = value; int count; if (!header) return -EINVAL; if (size < sizeof(struct posix_acl_xattr_header)) return -EINVAL; if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) return -EOPNOTSUPP; count = posix_acl_xattr_count(size); if (count < 0) return -EINVAL; if (count == 0) return 0; return count; } /** * posix_acl_from_xattr - convert POSIX ACLs from backing store to VFS format * @userns: the filesystem's idmapping * @value: the uapi representation of POSIX ACLs * @size: the size of @void * * Filesystems that store POSIX ACLs in the unaltered uapi format should use * posix_acl_from_xattr() when reading them from the backing store and * converting them into the struct posix_acl VFS format. The helper is * specifically intended to be called from the acl inode operation. * * The posix_acl_from_xattr() function will map the raw {g,u}id values stored * in ACL_{GROUP,USER} entries into idmapping in @userns. * * Note that posix_acl_from_xattr() does not take idmapped mounts into account. * If it did it calling it from the get acl inode operation would return POSIX * ACLs mapped according to an idmapped mount which would mean that the value * couldn't be cached for the filesystem. Idmapped mounts are taken into * account on the fly during permission checking or right at the VFS - * userspace boundary before reporting them to the user. * * Return: Allocated struct posix_acl on success, NULL for a valid header but * without actual POSIX ACL entries, or ERR_PTR() encoded error code. */ struct posix_acl *posix_acl_from_xattr(struct user_namespace *userns, const void *value, size_t size) { const struct posix_acl_xattr_header *header = value; const struct posix_acl_xattr_entry *entry = (const void *)(header + 1), *end; int count; struct posix_acl *acl; struct posix_acl_entry *acl_e; count = posix_acl_fix_xattr_common(value, size); if (count < 0) return ERR_PTR(count); if (count == 0) return NULL; acl = posix_acl_alloc(count, GFP_NOFS); if (!acl) return ERR_PTR(-ENOMEM); acl_e = acl->a_entries; for (end = entry + count; entry != end; acl_e++, entry++) { acl_e->e_tag = le16_to_cpu(entry->e_tag); acl_e->e_perm = le16_to_cpu(entry->e_perm); switch(acl_e->e_tag) { case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: break; case ACL_USER: acl_e->e_uid = make_kuid(userns, le32_to_cpu(entry->e_id)); if (!uid_valid(acl_e->e_uid)) goto fail; break; case ACL_GROUP: acl_e->e_gid = make_kgid(userns, le32_to_cpu(entry->e_id)); if (!gid_valid(acl_e->e_gid)) goto fail; break; default: goto fail; } } return acl; fail: posix_acl_release(acl); return ERR_PTR(-EINVAL); } EXPORT_SYMBOL (posix_acl_from_xattr); /* * Convert from in-memory to extended attribute representation. 
*/ void * posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, size_t *sizep, gfp_t gfp) { struct posix_acl_xattr_header *ext_acl; struct posix_acl_xattr_entry *ext_entry; size_t size; int n; size = posix_acl_xattr_size(acl->a_count); ext_acl = kmalloc(size, gfp); if (!ext_acl) return NULL; ext_entry = (void *)(ext_acl + 1); ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); for (n=0; n < acl->a_count; n++, ext_entry++) { const struct posix_acl_entry *acl_e = &acl->a_entries[n]; ext_entry->e_tag = cpu_to_le16(acl_e->e_tag); ext_entry->e_perm = cpu_to_le16(acl_e->e_perm); switch(acl_e->e_tag) { case ACL_USER: ext_entry->e_id = cpu_to_le32(from_kuid(user_ns, acl_e->e_uid)); break; case ACL_GROUP: ext_entry->e_id = cpu_to_le32(from_kgid(user_ns, acl_e->e_gid)); break; default: ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); break; } } *sizep = size; return ext_acl; } EXPORT_SYMBOL (posix_acl_to_xattr); /** * vfs_posix_acl_to_xattr - convert from kernel to userspace representation * @idmap: idmap of the mount * @inode: inode the posix acls are set on * @acl: the posix acls as represented by the vfs * @buffer: the buffer into which to convert @acl * @size: size of @buffer * * This converts @acl from the VFS representation in the filesystem idmapping * to the uapi form reportable to userspace. And mount and caller idmappings * are handled appropriately. * * Return: On success, the size of the stored uapi posix acls, on error a * negative errno. */ static ssize_t vfs_posix_acl_to_xattr(struct mnt_idmap *idmap, struct inode *inode, const struct posix_acl *acl, void *buffer, size_t size) { struct posix_acl_xattr_header *ext_acl = buffer; struct posix_acl_xattr_entry *ext_entry; struct user_namespace *fs_userns, *caller_userns; ssize_t real_size, n; vfsuid_t vfsuid; vfsgid_t vfsgid; real_size = posix_acl_xattr_size(acl->a_count); if (!buffer) return real_size; if (real_size > size) return -ERANGE; ext_entry = (void *)(ext_acl + 1); ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); fs_userns = i_user_ns(inode); caller_userns = current_user_ns(); for (n=0; n < acl->a_count; n++, ext_entry++) { const struct posix_acl_entry *acl_e = &acl->a_entries[n]; ext_entry->e_tag = cpu_to_le16(acl_e->e_tag); ext_entry->e_perm = cpu_to_le16(acl_e->e_perm); switch(acl_e->e_tag) { case ACL_USER: vfsuid = make_vfsuid(idmap, fs_userns, acl_e->e_uid); ext_entry->e_id = cpu_to_le32(from_kuid( caller_userns, vfsuid_into_kuid(vfsuid))); break; case ACL_GROUP: vfsgid = make_vfsgid(idmap, fs_userns, acl_e->e_gid); ext_entry->e_id = cpu_to_le32(from_kgid( caller_userns, vfsgid_into_kgid(vfsgid))); break; default: ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); break; } } return real_size; } int set_posix_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type, struct posix_acl *acl) { struct inode *inode = d_inode(dentry); if (!IS_POSIXACL(inode)) return -EOPNOTSUPP; if (!inode->i_op->set_acl) return -EOPNOTSUPP; if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) return acl ? 
-EACCES : 0; if (!inode_owner_or_capable(idmap, inode)) return -EPERM; if (acl) { int ret = posix_acl_valid(inode->i_sb->s_user_ns, acl); if (ret) return ret; } return inode->i_op->set_acl(idmap, dentry, acl, type); } EXPORT_SYMBOL(set_posix_acl); int posix_acl_listxattr(struct inode *inode, char **buffer, ssize_t *remaining_size) { int err; if (!IS_POSIXACL(inode)) return 0; if (inode->i_acl) { err = xattr_list_one(buffer, remaining_size, XATTR_NAME_POSIX_ACL_ACCESS); if (err) return err; } if (inode->i_default_acl) { err = xattr_list_one(buffer, remaining_size, XATTR_NAME_POSIX_ACL_DEFAULT); if (err) return err; } return 0; } static bool posix_acl_xattr_list(struct dentry *dentry) { return IS_POSIXACL(d_backing_inode(dentry)); } /* * nop_posix_acl_access - legacy xattr handler for access POSIX ACLs * * This is the legacy POSIX ACL access xattr handler. It is used by some * filesystems to implement their ->listxattr() inode operation. New code * should never use them. */ const struct xattr_handler nop_posix_acl_access = { .name = XATTR_NAME_POSIX_ACL_ACCESS, .list = posix_acl_xattr_list, }; EXPORT_SYMBOL_GPL(nop_posix_acl_access); /* * nop_posix_acl_default - legacy xattr handler for default POSIX ACLs * * This is the legacy POSIX ACL default xattr handler. It is used by some * filesystems to implement their ->listxattr() inode operation. New code * should never use them. */ const struct xattr_handler nop_posix_acl_default = { .name = XATTR_NAME_POSIX_ACL_DEFAULT, .list = posix_acl_xattr_list, }; EXPORT_SYMBOL_GPL(nop_posix_acl_default); int simple_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int error; struct inode *inode = d_inode(dentry); if (type == ACL_TYPE_ACCESS) { error = posix_acl_update_mode(idmap, inode, &inode->i_mode, &acl); if (error) return error; } inode_set_ctime_current(inode); if (IS_I_VERSION(inode)) inode_inc_iversion(inode); set_cached_acl(inode, type, acl); return 0; } int simple_acl_create(struct inode *dir, struct inode *inode) { struct posix_acl *default_acl, *acl; int error; error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); if (error) return error; set_cached_acl(inode, ACL_TYPE_DEFAULT, default_acl); set_cached_acl(inode, ACL_TYPE_ACCESS, acl); if (default_acl) posix_acl_release(default_acl); if (acl) posix_acl_release(acl); return 0; } static int vfs_set_acl_idmapped_mnt(struct mnt_idmap *idmap, struct user_namespace *fs_userns, struct posix_acl *acl) { for (int n = 0; n < acl->a_count; n++) { struct posix_acl_entry *acl_e = &acl->a_entries[n]; switch (acl_e->e_tag) { case ACL_USER: acl_e->e_uid = from_vfsuid(idmap, fs_userns, VFSUIDT_INIT(acl_e->e_uid)); break; case ACL_GROUP: acl_e->e_gid = from_vfsgid(idmap, fs_userns, VFSGIDT_INIT(acl_e->e_gid)); break; } } return 0; } /** * vfs_set_acl - set posix acls * @idmap: idmap of the mount * @dentry: the dentry based on which to set the posix acls * @acl_name: the name of the posix acl * @kacl: the posix acls in the appropriate VFS format * * This function sets @kacl. The caller must all posix_acl_release() on @kacl * afterwards. * * Return: On success 0, on error negative errno. 
*/ int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { int acl_type; int error; struct inode *inode = d_inode(dentry); struct delegated_inode delegated_inode = { }; acl_type = posix_acl_type(acl_name); if (acl_type < 0) return -EINVAL; if (kacl) { /* * If we're on an idmapped mount translate from mount specific * vfs{g,u}id_t into global filesystem k{g,u}id_t. * Afterwards we can cache the POSIX ACLs filesystem wide and - * if this is a filesystem with a backing store - ultimately * translate them to backing store values. */ error = vfs_set_acl_idmapped_mnt(idmap, i_user_ns(inode), kacl); if (error) return error; } retry_deleg: inode_lock(inode); /* * We only care about restrictions the inode struct itself places upon * us otherwise POSIX ACLs aren't subject to any VFS restrictions. */ error = may_write_xattr(idmap, inode); if (error) goto out_inode_unlock; error = security_inode_set_acl(idmap, dentry, acl_name, kacl); if (error) goto out_inode_unlock; error = try_break_deleg(inode, &delegated_inode); if (error) goto out_inode_unlock; if (likely(!is_bad_inode(inode))) error = set_posix_acl(idmap, dentry, acl_type, kacl); else error = -EIO; if (!error) { fsnotify_xattr(dentry); security_inode_post_set_acl(dentry, acl_name, kacl); } out_inode_unlock: inode_unlock(inode); if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } return error; } EXPORT_SYMBOL_GPL(vfs_set_acl); /** * vfs_get_acl - get posix acls * @idmap: idmap of the mount * @dentry: the dentry based on which to retrieve the posix acls * @acl_name: the name of the posix acl * * This function retrieves @kacl from the filesystem. The caller must all * posix_acl_release() on @kacl. * * Return: On success POSIX ACLs in VFS format, on error negative errno. */ struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { struct inode *inode = d_inode(dentry); struct posix_acl *acl; int acl_type, error; acl_type = posix_acl_type(acl_name); if (acl_type < 0) return ERR_PTR(-EINVAL); /* * The VFS has no restrictions on reading POSIX ACLs so calling * something like xattr_permission() isn't needed. Only LSMs get a say. */ error = security_inode_get_acl(idmap, dentry, acl_name); if (error) return ERR_PTR(error); if (!IS_POSIXACL(inode)) return ERR_PTR(-EOPNOTSUPP); if (S_ISLNK(inode->i_mode)) return ERR_PTR(-EOPNOTSUPP); acl = __get_acl(idmap, dentry, inode, acl_type); if (IS_ERR(acl)) return acl; if (!acl) return ERR_PTR(-ENODATA); return acl; } EXPORT_SYMBOL_GPL(vfs_get_acl); /** * vfs_remove_acl - remove posix acls * @idmap: idmap of the mount * @dentry: the dentry based on which to retrieve the posix acls * @acl_name: the name of the posix acl * * This function removes posix acls. * * Return: On success 0, on error negative errno. */ int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { int acl_type; int error; struct inode *inode = d_inode(dentry); struct delegated_inode delegated_inode = { }; acl_type = posix_acl_type(acl_name); if (acl_type < 0) return -EINVAL; retry_deleg: inode_lock(inode); /* * We only care about restrictions the inode struct itself places upon * us otherwise POSIX ACLs aren't subject to any VFS restrictions. 
*/ error = may_write_xattr(idmap, inode); if (error) goto out_inode_unlock; error = security_inode_remove_acl(idmap, dentry, acl_name); if (error) goto out_inode_unlock; error = try_break_deleg(inode, &delegated_inode); if (error) goto out_inode_unlock; if (likely(!is_bad_inode(inode))) error = set_posix_acl(idmap, dentry, acl_type, NULL); else error = -EIO; if (!error) { fsnotify_xattr(dentry); security_inode_post_remove_acl(idmap, dentry, acl_name); } out_inode_unlock: inode_unlock(inode); if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } return error; } EXPORT_SYMBOL_GPL(vfs_remove_acl); int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, const void *kvalue, size_t size) { int error; struct posix_acl *acl = NULL; if (size) { /* * Note that posix_acl_from_xattr() uses GFP_NOFS when it * probably doesn't need to here. */ acl = posix_acl_from_xattr(current_user_ns(), kvalue, size); if (IS_ERR(acl)) return PTR_ERR(acl); } error = vfs_set_acl(idmap, dentry, acl_name, acl); posix_acl_release(acl); return error; } ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, void *kvalue, size_t size) { ssize_t error; struct posix_acl *acl; acl = vfs_get_acl(idmap, dentry, acl_name); if (IS_ERR(acl)) return PTR_ERR(acl); error = vfs_posix_acl_to_xattr(idmap, d_inode(dentry), acl, kvalue, size); posix_acl_release(acl); return error; } |
//
SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ #include <linux/bpf.h> #include <linux/bpf_verifier.h> #include <linux/filter.h> #include <linux/sort.h> #define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args) /* non-recursive DFS pseudo code * 1 procedure DFS-iterative(G,v): * 2 label v as discovered * 3 let S be a stack * 4 S.push(v) * 5 while S is not empty * 6 t <- S.peek() * 7 if t is what we're looking for: * 8 return t * 9 for all edges e in G.adjacentEdges(t) do * 10 if edge e is already labelled * 11 continue with the next edge * 12 w <- G.adjacentVertex(t,e) * 13 if vertex w is not discovered and not explored * 14 label e as tree-edge * 15 label w as discovered * 16 S.push(w) * 17 continue at 5 * 18 else if vertex w is discovered * 19 label e as back-edge * 20 else * 21 // vertex w is explored * 22 label e as forward- or cross-edge * 23 label t as explored * 24 S.pop() * * convention: * 0x10 - discovered * 0x11 - discovered and fall-through edge labelled * 0x12 - discovered and fall-through and branch edges labelled * 0x20 - explored */ enum { DISCOVERED = 0x10, EXPLORED = 0x20, FALLTHROUGH = 1, BRANCH = 2, }; static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *subprog; subprog = bpf_find_containing_subprog(env, off); subprog->changes_pkt_data = true; } static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *subprog; subprog = bpf_find_containing_subprog(env, off); subprog->might_sleep = true; } /* 't' is an index of a call-site. * 'w' is a callee entry point. * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED. * Rely on DFS traversal order and absence of recursive calls to guarantee that * callee's change_pkt_data marks would be correct at that moment. 
*/ static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w) { struct bpf_subprog_info *caller, *callee; caller = bpf_find_containing_subprog(env, t); callee = bpf_find_containing_subprog(env, w); caller->changes_pkt_data |= callee->changes_pkt_data; caller->might_sleep |= callee->might_sleep; } enum { DONE_EXPLORING = 0, KEEP_EXPLORING = 1, }; /* t, w, e - match pseudo-code above: * t - index of current instruction * w - next instruction * e - edge */ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) { int *insn_stack = env->cfg.insn_stack; int *insn_state = env->cfg.insn_state; if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) return DONE_EXPLORING; if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH)) return DONE_EXPLORING; if (w < 0 || w >= env->prog->len) { verbose_linfo(env, t, "%d: ", t); verbose(env, "jump out of range from insn %d to %d\n", t, w); return -EINVAL; } if (e == BRANCH) { /* mark branch target for state pruning */ mark_prune_point(env, w); mark_jmp_point(env, w); } if (insn_state[w] == 0) { /* tree-edge */ insn_state[t] = DISCOVERED | e; insn_state[w] = DISCOVERED; if (env->cfg.cur_stack >= env->prog->len) return -E2BIG; insn_stack[env->cfg.cur_stack++] = w; return KEEP_EXPLORING; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { if (env->bpf_capable) return DONE_EXPLORING; verbose_linfo(env, t, "%d: ", t); verbose_linfo(env, w, "%d: ", w); verbose(env, "back-edge from insn %d to %d\n", t, w); return -EINVAL; } else if (insn_state[w] == EXPLORED) { /* forward- or cross-edge */ insn_state[t] = DISCOVERED | e; } else { verifier_bug(env, "insn state internal bug"); return -EFAULT; } return DONE_EXPLORING; } static int visit_func_call_insn(int t, struct bpf_insn *insns, struct bpf_verifier_env *env, bool visit_callee) { int ret, insn_sz; int w; insn_sz = bpf_is_ldimm64(&insns[t]) ? 
2 : 1; ret = push_insn(t, t + insn_sz, FALLTHROUGH, env); if (ret) return ret; mark_prune_point(env, t + insn_sz); /* when we exit from subprog, we need to record non-linear history */ mark_jmp_point(env, t + insn_sz); if (visit_callee) { w = t + insns[t].imm + 1; mark_prune_point(env, t); merge_callee_effects(env, t, w); ret = push_insn(t, w, BRANCH, env); } return ret; } struct bpf_iarray *bpf_iarray_realloc(struct bpf_iarray *old, size_t n_elem) { size_t new_size = sizeof(struct bpf_iarray) + n_elem * sizeof(old->items[0]); struct bpf_iarray *new; new = kvrealloc(old, new_size, GFP_KERNEL_ACCOUNT); if (!new) { /* this is what callers always want, so simplify the call site */ kvfree(old); return NULL; } new->cnt = n_elem; return new; } static int copy_insn_array(struct bpf_map *map, u32 start, u32 end, u32 *items) { struct bpf_insn_array_value *value; u32 i; for (i = start; i <= end; i++) { value = map->ops->map_lookup_elem(map, &i); /* * map_lookup_elem of an array map will never return an error, * but not checking it makes some static analysers to worry */ if (IS_ERR(value)) return PTR_ERR(value); else if (!value) return -EINVAL; items[i - start] = value->xlated_off; } return 0; } static int cmp_ptr_to_u32(const void *a, const void *b) { return *(u32 *)a - *(u32 *)b; } static int sort_insn_array_uniq(u32 *items, int cnt) { int unique = 1; int i; sort(items, cnt, sizeof(items[0]), cmp_ptr_to_u32, NULL); for (i = 1; i < cnt; i++) if (items[i] != items[unique - 1]) items[unique++] = items[i]; return unique; } /* * sort_unique({map[start], ..., map[end]}) into off */ int bpf_copy_insn_array_uniq(struct bpf_map *map, u32 start, u32 end, u32 *off) { u32 n = end - start + 1; int err; err = copy_insn_array(map, start, end, off); if (err) return err; return sort_insn_array_uniq(off, n); } /* * Copy all unique offsets from the map */ static struct bpf_iarray *jt_from_map(struct bpf_map *map) { struct bpf_iarray *jt; int err; int n; jt = bpf_iarray_realloc(NULL, map->max_entries); if (!jt) return ERR_PTR(-ENOMEM); n = bpf_copy_insn_array_uniq(map, 0, map->max_entries - 1, jt->items); if (n < 0) { err = n; goto err_free; } if (n == 0) { err = -EINVAL; goto err_free; } jt->cnt = n; return jt; err_free: kvfree(jt); return ERR_PTR(err); } /* * Find and collect all maps which fit in the subprog. Return the result as one * combined jump table in jt->items (allocated with kvcalloc) */ static struct bpf_iarray *jt_from_subprog(struct bpf_verifier_env *env, int subprog_start, int subprog_end) { struct bpf_iarray *jt = NULL; struct bpf_map *map; struct bpf_iarray *jt_cur; int i; for (i = 0; i < env->insn_array_map_cnt; i++) { /* * TODO (when needed): collect only jump tables, not static keys * or maps for indirect calls */ map = env->insn_array_maps[i]; jt_cur = jt_from_map(map); if (IS_ERR(jt_cur)) { kvfree(jt); return jt_cur; } /* * This is enough to check one element. The full table is * checked to fit inside the subprog later in create_jt() */ if (jt_cur->items[0] >= subprog_start && jt_cur->items[0] < subprog_end) { u32 old_cnt = jt ? 
jt->cnt : 0; jt = bpf_iarray_realloc(jt, old_cnt + jt_cur->cnt); if (!jt) { kvfree(jt_cur); return ERR_PTR(-ENOMEM); } memcpy(jt->items + old_cnt, jt_cur->items, jt_cur->cnt << 2); } kvfree(jt_cur); } if (!jt) { verbose(env, "no jump tables found for subprog starting at %u\n", subprog_start); return ERR_PTR(-EINVAL); } jt->cnt = sort_insn_array_uniq(jt->items, jt->cnt); return jt; } static struct bpf_iarray * create_jt(int t, struct bpf_verifier_env *env) { struct bpf_subprog_info *subprog; int subprog_start, subprog_end; struct bpf_iarray *jt; int i; subprog = bpf_find_containing_subprog(env, t); subprog_start = subprog->start; subprog_end = (subprog + 1)->start; jt = jt_from_subprog(env, subprog_start, subprog_end); if (IS_ERR(jt)) return jt; /* Check that the every element of the jump table fits within the given subprogram */ for (i = 0; i < jt->cnt; i++) { if (jt->items[i] < subprog_start || jt->items[i] >= subprog_end) { verbose(env, "jump table for insn %d points outside of the subprog [%u,%u]\n", t, subprog_start, subprog_end); kvfree(jt); return ERR_PTR(-EINVAL); } } return jt; } /* "conditional jump with N edges" */ static int visit_gotox_insn(int t, struct bpf_verifier_env *env) { int *insn_stack = env->cfg.insn_stack; int *insn_state = env->cfg.insn_state; bool keep_exploring = false; struct bpf_iarray *jt; int i, w; jt = env->insn_aux_data[t].jt; if (!jt) { jt = create_jt(t, env); if (IS_ERR(jt)) return PTR_ERR(jt); env->insn_aux_data[t].jt = jt; } mark_prune_point(env, t); for (i = 0; i < jt->cnt; i++) { w = jt->items[i]; if (w < 0 || w >= env->prog->len) { verbose(env, "indirect jump out of range from insn %d to %d\n", t, w); return -EINVAL; } mark_jmp_point(env, w); /* EXPLORED || DISCOVERED */ if (insn_state[w]) continue; if (env->cfg.cur_stack >= env->prog->len) return -E2BIG; insn_stack[env->cfg.cur_stack++] = w; insn_state[w] |= DISCOVERED; keep_exploring = true; } return keep_exploring ? KEEP_EXPLORING : DONE_EXPLORING; } /* * Instructions that can abnormally return from a subprog (tail_call * upon success, ld_{abs,ind} upon load failure) have a hidden exit * that the verifier must account for. */ static int visit_abnormal_return_insn(struct bpf_verifier_env *env, int t) { struct bpf_subprog_info *subprog; struct bpf_iarray *jt; if (env->insn_aux_data[t].jt) return 0; jt = bpf_iarray_realloc(NULL, 2); if (!jt) return -ENOMEM; subprog = bpf_find_containing_subprog(env, t); jt->items[0] = t + 1; jt->items[1] = subprog->exit_idx; env->insn_aux_data[t].jt = jt; return 0; } /* Visits the instruction at index t and returns one of the following: * < 0 - an error occurred * DONE_EXPLORING - the instruction was fully explored * KEEP_EXPLORING - there is still work to be done before it is fully explored */ static int visit_insn(int t, struct bpf_verifier_env *env) { struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t]; int ret, off, insn_sz; if (bpf_pseudo_func(insn)) return visit_func_call_insn(t, insns, env, true); /* All non-branch instructions have a single fall-through edge. */ if (BPF_CLASS(insn->code) != BPF_JMP && BPF_CLASS(insn->code) != BPF_JMP32) { if (BPF_CLASS(insn->code) == BPF_LD && (BPF_MODE(insn->code) == BPF_ABS || BPF_MODE(insn->code) == BPF_IND)) { ret = visit_abnormal_return_insn(env, t); if (ret) return ret; } insn_sz = bpf_is_ldimm64(insn) ? 
2 : 1; return push_insn(t, t + insn_sz, FALLTHROUGH, env); } switch (BPF_OP(insn->code)) { case BPF_EXIT: return DONE_EXPLORING; case BPF_CALL: if (bpf_is_async_callback_calling_insn(insn)) /* Mark this call insn as a prune point to trigger * is_state_visited() check before call itself is * processed by __check_func_call(). Otherwise new * async state will be pushed for further exploration. */ mark_prune_point(env, t); /* For functions that invoke callbacks it is not known how many times * callback would be called. Verifier models callback calling functions * by repeatedly visiting callback bodies and returning to origin call * instruction. * In order to stop such iteration verifier needs to identify when a * state identical some state from a previous iteration is reached. * Check below forces creation of checkpoint before callback calling * instruction to allow search for such identical states. */ if (bpf_is_sync_callback_calling_insn(insn)) { mark_calls_callback(env, t); mark_force_checkpoint(env, t); mark_prune_point(env, t); mark_jmp_point(env, t); } if (bpf_helper_call(insn)) { const struct bpf_func_proto *fp; ret = bpf_get_helper_proto(env, insn->imm, &fp); /* If called in a non-sleepable context program will be * rejected anyway, so we should end up with precise * sleepable marks on subprogs, except for dead code * elimination. */ if (ret == 0 && fp->might_sleep) mark_subprog_might_sleep(env, t); if (bpf_helper_changes_pkt_data(insn->imm)) mark_subprog_changes_pkt_data(env, t); if (insn->imm == BPF_FUNC_tail_call) { ret = visit_abnormal_return_insn(env, t); if (ret) return ret; } } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { struct bpf_kfunc_call_arg_meta meta; ret = bpf_fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta); if (ret == 0 && bpf_is_iter_next_kfunc(&meta)) { mark_prune_point(env, t); /* Checking and saving state checkpoints at iter_next() call * is crucial for fast convergence of open-coded iterator loop * logic, so we need to force it. If we don't do that, * is_state_visited() might skip saving a checkpoint, causing * unnecessarily long sequence of not checkpointed * instructions and jumps, leading to exhaustion of jump * history buffer, and potentially other undesired outcomes. * It is expected that with correct open-coded iterators * convergence will happen quickly, so we don't run a risk of * exhausting memory. */ mark_force_checkpoint(env, t); } /* Same as helpers, if called in a non-sleepable context * program will be rejected anyway, so we should end up * with precise sleepable marks on subprogs, except for * dead code elimination. 
*/ if (ret == 0 && bpf_is_kfunc_sleepable(&meta)) mark_subprog_might_sleep(env, t); if (ret == 0 && bpf_is_kfunc_pkt_changing(&meta)) mark_subprog_changes_pkt_data(env, t); } return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL); case BPF_JA: if (BPF_SRC(insn->code) == BPF_X) return visit_gotox_insn(t, env); if (BPF_CLASS(insn->code) == BPF_JMP) off = insn->off; else off = insn->imm; /* unconditional jump with single edge */ ret = push_insn(t, t + off + 1, FALLTHROUGH, env); if (ret) return ret; mark_prune_point(env, t + off + 1); mark_jmp_point(env, t + off + 1); return ret; default: /* conditional jump with two edges */ mark_prune_point(env, t); if (bpf_is_may_goto_insn(insn)) mark_force_checkpoint(env, t); ret = push_insn(t, t + 1, FALLTHROUGH, env); if (ret) return ret; return push_insn(t, t + insn->off + 1, BRANCH, env); } } /* non-recursive depth-first-search to detect loops in BPF program * loop == back-edge in directed graph */ int bpf_check_cfg(struct bpf_verifier_env *env) { int insn_cnt = env->prog->len; int *insn_stack, *insn_state; int ex_insn_beg, i, ret = 0; insn_state = env->cfg.insn_state = kvzalloc_objs(int, insn_cnt, GFP_KERNEL_ACCOUNT); if (!insn_state) return -ENOMEM; insn_stack = env->cfg.insn_stack = kvzalloc_objs(int, insn_cnt, GFP_KERNEL_ACCOUNT); if (!insn_stack) { kvfree(insn_state); return -ENOMEM; } ex_insn_beg = env->exception_callback_subprog ? env->subprog_info[env->exception_callback_subprog].start : 0; insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */ insn_stack[0] = 0; /* 0 is the first instruction */ env->cfg.cur_stack = 1; walk_cfg: while (env->cfg.cur_stack > 0) { int t = insn_stack[env->cfg.cur_stack - 1]; ret = visit_insn(t, env); switch (ret) { case DONE_EXPLORING: insn_state[t] = EXPLORED; env->cfg.cur_stack--; break; case KEEP_EXPLORING: break; default: if (ret > 0) { verifier_bug(env, "visit_insn internal bug"); ret = -EFAULT; } goto err_free; } } if (env->cfg.cur_stack < 0) { verifier_bug(env, "pop stack internal bug"); ret = -EFAULT; goto err_free; } if (ex_insn_beg && insn_state[ex_insn_beg] != EXPLORED) { insn_state[ex_insn_beg] = DISCOVERED; insn_stack[0] = ex_insn_beg; env->cfg.cur_stack = 1; goto walk_cfg; } for (i = 0; i < insn_cnt; i++) { struct bpf_insn *insn = &env->prog->insnsi[i]; if (insn_state[i] != EXPLORED) { verbose(env, "unreachable insn %d\n", i); ret = -EINVAL; goto err_free; } if (bpf_is_ldimm64(insn)) { if (insn_state[i + 1] != 0) { verbose(env, "jump into the middle of ldimm64 insn %d\n", i); ret = -EINVAL; goto err_free; } i++; /* skip second half of ldimm64 */ } } ret = 0; /* cfg looks good */ env->prog->aux->changes_pkt_data = env->subprog_info[0].changes_pkt_data; env->prog->aux->might_sleep = env->subprog_info[0].might_sleep; err_free: kvfree(insn_state); kvfree(insn_stack); env->cfg.insn_state = env->cfg.insn_stack = NULL; return ret; } /* * For each subprogram 'i' fill array env->cfg.insn_subprogram sub-range * [env->subprog_info[i].postorder_start, env->subprog_info[i+1].postorder_start) * with indices of 'i' instructions in postorder. 
*/ int bpf_compute_postorder(struct bpf_verifier_env *env) { u32 cur_postorder, i, top, stack_sz, s; int *stack = NULL, *postorder = NULL, *state = NULL; struct bpf_iarray *succ; postorder = kvzalloc_objs(int, env->prog->len, GFP_KERNEL_ACCOUNT); state = kvzalloc_objs(int, env->prog->len, GFP_KERNEL_ACCOUNT); stack = kvzalloc_objs(int, env->prog->len, GFP_KERNEL_ACCOUNT); if (!postorder || !state || !stack) { kvfree(postorder); kvfree(state); kvfree(stack); return -ENOMEM; } cur_postorder = 0; for (i = 0; i < env->subprog_cnt; i++) { env->subprog_info[i].postorder_start = cur_postorder; stack[0] = env->subprog_info[i].start; stack_sz = 1; do { top = stack[stack_sz - 1]; state[top] |= DISCOVERED; if (state[top] & EXPLORED) { postorder[cur_postorder++] = top; stack_sz--; continue; } succ = bpf_insn_successors(env, top); for (s = 0; s < succ->cnt; ++s) { if (!state[succ->items[s]]) { stack[stack_sz++] = succ->items[s]; state[succ->items[s]] |= DISCOVERED; } } state[top] |= EXPLORED; } while (stack_sz); } env->subprog_info[i].postorder_start = cur_postorder; env->cfg.insn_postorder = postorder; env->cfg.cur_postorder = cur_postorder; kvfree(stack); kvfree(state); return 0; } /* * Compute strongly connected components (SCCs) on the CFG. * Assign an SCC number to each instruction, recorded in env->insn_aux[*].scc. * If instruction is a sole member of its SCC and there are no self edges, * assign it SCC number of zero. * Uses a non-recursive adaptation of Tarjan's algorithm for SCC computation. */ int bpf_compute_scc(struct bpf_verifier_env *env) { const u32 NOT_ON_STACK = U32_MAX; struct bpf_insn_aux_data *aux = env->insn_aux_data; const u32 insn_cnt = env->prog->len; int stack_sz, dfs_sz, err = 0; u32 *stack, *pre, *low, *dfs; u32 i, j, t, w; u32 next_preorder_num; u32 next_scc_id; bool assign_scc; struct bpf_iarray *succ; next_preorder_num = 1; next_scc_id = 1; /* * - 'stack' accumulates vertices in DFS order, see invariant comment below; * - 'pre[t] == p' => preorder number of vertex 't' is 'p'; * - 'low[t] == n' => smallest preorder number of the vertex reachable from 't' is 'n'; * - 'dfs' DFS traversal stack, used to emulate explicit recursion. */ stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); pre = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); low = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); dfs = kvcalloc(insn_cnt, sizeof(*dfs), GFP_KERNEL_ACCOUNT); if (!stack || !pre || !low || !dfs) { err = -ENOMEM; goto exit; } /* * References: * [1] R. Tarjan "Depth-First Search and Linear Graph Algorithms" * [2] D. J. Pearce "A Space-Efficient Algorithm for Finding Strongly Connected Components" * * The algorithm maintains the following invariant: * - suppose there is a path 'u' ~> 'v', such that 'pre[v] < pre[u]'; * - then, vertex 'u' remains on stack while vertex 'v' is on stack. * * Consequently: * - If 'low[v] < pre[v]', there is a path from 'v' to some vertex 'u', * such that 'pre[u] == low[v]'; vertex 'u' is currently on the stack, * and thus there is an SCC (loop) containing both 'u' and 'v'. * - If 'low[v] == pre[v]', loops containing 'v' have been explored, * and 'v' can be considered the root of some SCC. 
* * Here is a pseudo-code for an explicitly recursive version of the algorithm: * * NOT_ON_STACK = insn_cnt + 1 * pre = [0] * insn_cnt * low = [0] * insn_cnt * scc = [0] * insn_cnt * stack = [] * * next_preorder_num = 1 * next_scc_id = 1 * * def recur(w): * nonlocal next_preorder_num * nonlocal next_scc_id * * pre[w] = next_preorder_num * low[w] = next_preorder_num * next_preorder_num += 1 * stack.append(w) * for s in successors(w): * # Note: for classic algorithm the block below should look as: * # * # if pre[s] == 0: * # recur(s) * # low[w] = min(low[w], low[s]) * # elif low[s] != NOT_ON_STACK: * # low[w] = min(low[w], pre[s]) * # * # But replacing both 'min' instructions with 'low[w] = min(low[w], low[s])' * # does not break the invariant and makes iterative version of the algorithm * # simpler. See 'Algorithm #3' from [2]. * * # 's' not yet visited * if pre[s] == 0: * recur(s) * # if 's' is on stack, pick lowest reachable preorder number from it; * # if 's' is not on stack 'low[s] == NOT_ON_STACK > low[w]', * # so 'min' would be a noop. * low[w] = min(low[w], low[s]) * * if low[w] == pre[w]: * # 'w' is the root of an SCC, pop all vertices * # below 'w' on stack and assign same SCC to them. * while True: * t = stack.pop() * low[t] = NOT_ON_STACK * scc[t] = next_scc_id * if t == w: * break * next_scc_id += 1 * * for i in range(0, insn_cnt): * if pre[i] == 0: * recur(i) * * Below implementation replaces explicit recursion with array 'dfs'. */ for (i = 0; i < insn_cnt; i++) { if (pre[i]) continue; stack_sz = 0; dfs_sz = 1; dfs[0] = i; dfs_continue: while (dfs_sz) { w = dfs[dfs_sz - 1]; if (pre[w] == 0) { low[w] = next_preorder_num; pre[w] = next_preorder_num; next_preorder_num++; stack[stack_sz++] = w; } /* Visit 'w' successors */ succ = bpf_insn_successors(env, w); for (j = 0; j < succ->cnt; ++j) { if (pre[succ->items[j]]) { low[w] = min(low[w], low[succ->items[j]]); } else { dfs[dfs_sz++] = succ->items[j]; goto dfs_continue; } } /* * Preserve the invariant: if some vertex above in the stack * is reachable from 'w', keep 'w' on the stack. */ if (low[w] < pre[w]) { dfs_sz--; goto dfs_continue; } /* * Assign SCC number only if component has two or more elements, * or if component has a self reference, or if instruction is a * callback calling function (implicit loop). */ assign_scc = stack[stack_sz - 1] != w; /* two or more elements? */ for (j = 0; j < succ->cnt; ++j) { /* self reference? */ if (succ->items[j] == w) { assign_scc = true; break; } } if (bpf_calls_callback(env, w)) /* implicit loop? */ assign_scc = true; /* Pop component elements from stack */ do { t = stack[--stack_sz]; low[t] = NOT_ON_STACK; if (assign_scc) aux[t].scc = next_scc_id; } while (t != w); if (assign_scc) next_scc_id++; dfs_sz--; } } env->scc_info = kvzalloc_objs(*env->scc_info, next_scc_id, GFP_KERNEL_ACCOUNT); if (!env->scc_info) { err = -ENOMEM; goto exit; } env->scc_cnt = next_scc_id; exit: kvfree(stack); kvfree(pre); kvfree(low); kvfree(dfs); return err; } |
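/*
 * A standalone illustration, not kernel code: the DISCOVERED/EXPLORED
 * colouring that bpf_check_cfg()/push_insn() above use to detect back-edges,
 * reduced to a tiny userspace DFS over a made-up adjacency list. The example
 * graph contains the loop 1 -> 2 -> 1, so the program reports a back-edge.
 */
#include <stdio.h>

#define N 4
enum { WHITE = 0, DISCOVERED = 0x10, EXPLORED = 0x20 };

/* succ[v][*] lists the successors of v, terminated by -1 */
static const int succ[N][3] = {
	{ 1, -1, -1 },	/* 0 -> 1 */
	{ 2,  3, -1 },	/* 1 -> 2, 1 -> 3 */
	{ 1, -1, -1 },	/* 2 -> 1: back-edge, 1 is still on the DFS stack */
	{ -1, -1, -1 },	/* 3: no successors */
};

int main(void)
{
	int state[N] = { 0 }, edge[N] = { 0 }, stack[N], sp = 0;

	state[0] = DISCOVERED;
	stack[sp++] = 0;
	while (sp) {
		int t = stack[sp - 1];
		int done = 1;

		for (; succ[t][edge[t]] >= 0; edge[t]++) {
			int w = succ[t][edge[t]];

			if (state[w] == WHITE) {	/* tree-edge: descend */
				state[w] = DISCOVERED;
				stack[sp++] = w;
				edge[t]++;
				done = 0;
				break;
			}
			if ((state[w] & DISCOVERED) && !(state[w] & EXPLORED)) {
				printf("back-edge %d -> %d (loop)\n", t, w);
				return 1;
			}
			/* otherwise: forward- or cross-edge, nothing to do */
		}
		if (done) {			/* all edges labelled */
			state[t] |= EXPLORED;
			sp--;
		}
	}
	printf("no back-edges\n");
	return 0;
}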
// SPDX-License-Identifier: GPL-2.0 OR MIT /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. * * This is an implementation of the BLAKE2s hash and PRF functions. * * Information: https://blake2.net/ * */ #include <crypto/blake2s.h> #include <linux/bug.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/string.h> #include <linux/unroll.h> #include <linux/types.h> static const u8 blake2s_sigma[10][16] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, }; static inline void blake2s_increment_counter(struct blake2s_ctx *ctx, u32 inc) { ctx->t[0] += inc; ctx->t[1] += (ctx->t[0] < inc); } static void __maybe_unused blake2s_compress_generic(struct blake2s_ctx *ctx, const u8 *data, size_t nblocks, u32 inc) { u32 m[16]; u32 v[16]; int i; WARN_ON(IS_ENABLED(DEBUG) && (nblocks > 1 && inc != BLAKE2S_BLOCK_SIZE)); while (nblocks > 0) { blake2s_increment_counter(ctx, inc); memcpy(m, data, BLAKE2S_BLOCK_SIZE); le32_to_cpu_array(m, ARRAY_SIZE(m)); memcpy(v, ctx->h, 32); v[ 8] = BLAKE2S_IV0; v[ 9] = BLAKE2S_IV1; v[10] = BLAKE2S_IV2; v[11] = BLAKE2S_IV3; v[12] = BLAKE2S_IV4 ^ ctx->t[0]; v[13] = BLAKE2S_IV5 ^ ctx->t[1]; v[14] = BLAKE2S_IV6 ^ ctx->f[0]; v[15] = BLAKE2S_IV7 ^ ctx->f[1]; #define G(r, i, a, b, c, d) do { \ a += b + m[blake2s_sigma[r][2 * i + 0]]; \ d = ror32(d ^ a, 16); \ c += d; \ b = ror32(b ^ c, 12); \ a += b + m[blake2s_sigma[r][2 * i + 1]]; \ d = ror32(d ^ a, 8); \ c += d; \ b = ror32(b ^ c, 7); \ } while (0) /* * Unroll the rounds loop to enable constant-folding of the * blake2s_sigma values.
*/ unrolled_full for (int r = 0; r < 10; r++) { G(r, 0, v[0], v[4], v[8], v[12]); G(r, 1, v[1], v[5], v[9], v[13]); G(r, 2, v[2], v[6], v[10], v[14]); G(r, 3, v[3], v[7], v[11], v[15]); G(r, 4, v[0], v[5], v[10], v[15]); G(r, 5, v[1], v[6], v[11], v[12]); G(r, 6, v[2], v[7], v[8], v[13]); G(r, 7, v[3], v[4], v[9], v[14]); } #undef G for (i = 0; i < 8; ++i) ctx->h[i] ^= v[i] ^ v[i + 8]; data += BLAKE2S_BLOCK_SIZE; --nblocks; } } #ifdef CONFIG_CRYPTO_LIB_BLAKE2S_ARCH #include "blake2s.h" /* $(SRCARCH)/blake2s.h */ #else #define blake2s_compress blake2s_compress_generic #endif static inline void blake2s_set_lastblock(struct blake2s_ctx *ctx) { ctx->f[0] = -1; } void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen) { const size_t fill = BLAKE2S_BLOCK_SIZE - ctx->buflen; if (unlikely(!inlen)) return; if (inlen > fill) { memcpy(ctx->buf + ctx->buflen, in, fill); blake2s_compress(ctx, ctx->buf, 1, BLAKE2S_BLOCK_SIZE); ctx->buflen = 0; in += fill; inlen -= fill; } if (inlen > BLAKE2S_BLOCK_SIZE) { const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE); blake2s_compress(ctx, in, nblocks - 1, BLAKE2S_BLOCK_SIZE); in += BLAKE2S_BLOCK_SIZE * (nblocks - 1); inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1); } memcpy(ctx->buf + ctx->buflen, in, inlen); ctx->buflen += inlen; } EXPORT_SYMBOL(blake2s_update); void blake2s_final(struct blake2s_ctx *ctx, u8 *out) { WARN_ON(IS_ENABLED(DEBUG) && !out); blake2s_set_lastblock(ctx); memset(ctx->buf + ctx->buflen, 0, BLAKE2S_BLOCK_SIZE - ctx->buflen); /* Padding */ blake2s_compress(ctx, ctx->buf, 1, ctx->buflen); cpu_to_le32_array(ctx->h, ARRAY_SIZE(ctx->h)); memcpy(out, ctx->h, ctx->outlen); memzero_explicit(ctx, sizeof(*ctx)); } EXPORT_SYMBOL(blake2s_final); #ifdef blake2s_mod_init_arch static int __init blake2s_mod_init(void) { blake2s_mod_init_arch(); return 0; } subsys_initcall(blake2s_mod_init); #endif MODULE_DESCRIPTION("BLAKE2s hash function"); MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>"); |
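/*
 * A minimal usage sketch. Only blake2s_update()/blake2s_final() and struct
 * blake2s_ctx appear in the code above; the blake2s_init() helper and the
 * BLAKE2S_HASH_SIZE constant are assumed to come from <crypto/blake2s.h>.
 */
#include <crypto/blake2s.h>

static void example_blake2s_digest(const u8 *data, size_t len,
				   u8 out[BLAKE2S_HASH_SIZE])
{
	struct blake2s_ctx ctx;

	blake2s_init(&ctx, BLAKE2S_HASH_SIZE);	/* unkeyed, full-length output */
	blake2s_update(&ctx, data, len);	/* may be called repeatedly */
	blake2s_final(&ctx, out);		/* writes ctx.outlen bytes, wipes ctx */
}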
// SPDX-License-Identifier: GPL-2.0 /* * Disk events - monitor disk events like media change and eject request. */ #include <linux/export.h> #include <linux/moduleparam.h> #include <linux/blkdev.h> #include "blk.h" struct disk_events { struct list_head node; /* all disk_event's */ struct gendisk *disk; /* the associated disk */ spinlock_t lock; struct mutex block_mutex; /* protects blocking */ int block; /* event blocking depth */ unsigned int pending; /* events already sent out */ unsigned int clearing; /* events being cleared */ long poll_msecs; /* interval, -1 for default */ struct delayed_work dwork; }; static const char *disk_events_strs[] = { [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change", [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request", }; static char *disk_uevents[] = { [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1", [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1", }; /* list of all disk_events */ static DEFINE_MUTEX(disk_events_mutex); static LIST_HEAD(disk_events); /* disable in-kernel polling by default */ static unsigned long disk_events_dfl_poll_msecs; static unsigned long disk_events_poll_jiffies(struct gendisk *disk) { struct disk_events *ev = disk->ev; long intv_msecs = 0; /* * If device-specific poll interval is set, always use it. If * the default is being used, poll if the POLL flag is set.
*/ if (ev->poll_msecs >= 0) intv_msecs = ev->poll_msecs; else if (disk->event_flags & DISK_EVENT_FLAG_POLL) intv_msecs = disk_events_dfl_poll_msecs; return msecs_to_jiffies(intv_msecs); } /** * disk_block_events - block and flush disk event checking * @disk: disk to block events for * * On return from this function, it is guaranteed that event checking * isn't in progress and won't happen until unblocked by * disk_unblock_events(). Events blocking is counted and the actual * unblocking happens after the matching number of unblocks are done. * * Note that this intentionally does not block event checking from * disk_clear_events(). * * CONTEXT: * Might sleep. */ void disk_block_events(struct gendisk *disk) { struct disk_events *ev = disk->ev; unsigned long flags; bool cancel; if (!ev) return; /* * Outer mutex ensures that the first blocker completes canceling * the event work before further blockers are allowed to finish. */ mutex_lock(&ev->block_mutex); spin_lock_irqsave(&ev->lock, flags); cancel = !ev->block++; spin_unlock_irqrestore(&ev->lock, flags); if (cancel) cancel_delayed_work_sync(&disk->ev->dwork); mutex_unlock(&ev->block_mutex); } static void __disk_unblock_events(struct gendisk *disk, bool check_now) { struct disk_events *ev = disk->ev; unsigned long intv; unsigned long flags; spin_lock_irqsave(&ev->lock, flags); if (WARN_ON_ONCE(ev->block <= 0)) goto out_unlock; if (--ev->block) goto out_unlock; intv = disk_events_poll_jiffies(disk); if (check_now) queue_delayed_work(system_freezable_power_efficient_wq, &ev->dwork, 0); else if (intv) queue_delayed_work(system_freezable_power_efficient_wq, &ev->dwork, intv); out_unlock: spin_unlock_irqrestore(&ev->lock, flags); } /** * disk_unblock_events - unblock disk event checking * @disk: disk to unblock events for * * Undo disk_block_events(). When the block count reaches zero, it * starts events polling if configured. * * CONTEXT: * Don't care. Safe to call from irq context. */ void disk_unblock_events(struct gendisk *disk) { if (disk->ev) __disk_unblock_events(disk, false); } /** * disk_flush_events - schedule immediate event checking and flushing * @disk: disk to check and flush events for * @mask: events to flush * * Schedule immediate event checking on @disk if not blocked. Events in * @mask are scheduled to be cleared from the driver. Note that this * doesn't clear the events from @disk->ev. * * CONTEXT: * If @mask is non-zero must be called with disk->open_mutex held. */ void disk_flush_events(struct gendisk *disk, unsigned int mask) { struct disk_events *ev = disk->ev; if (!ev) return; spin_lock_irq(&ev->lock); ev->clearing |= mask; if (!ev->block) mod_delayed_work(system_freezable_power_efficient_wq, &ev->dwork, 0); spin_unlock_irq(&ev->lock); } /* * Tell userland about new events. Only the events listed in @disk->events are * reported, and only if DISK_EVENT_FLAG_UEVENT is set. Otherwise, events are * processed internally but never get reported to userland. 
*/ static void disk_event_uevent(struct gendisk *disk, unsigned int events) { char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; int nr_events = 0, i; for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) if (events & disk->events & (1 << i)) envp[nr_events++] = disk_uevents[i]; if (nr_events) kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); } static void disk_check_events(struct disk_events *ev, unsigned int *clearing_ptr) { struct gendisk *disk = ev->disk; unsigned int clearing = *clearing_ptr; unsigned int events; unsigned long intv; /* check events */ events = disk->fops->check_events(disk, clearing); /* accumulate pending events and schedule next poll if necessary */ spin_lock_irq(&ev->lock); events &= ~ev->pending; ev->pending |= events; *clearing_ptr &= ~clearing; intv = disk_events_poll_jiffies(disk); if (!ev->block && intv) queue_delayed_work(system_freezable_power_efficient_wq, &ev->dwork, intv); spin_unlock_irq(&ev->lock); if (events & DISK_EVENT_MEDIA_CHANGE) inc_diskseq(disk); if (disk->event_flags & DISK_EVENT_FLAG_UEVENT) disk_event_uevent(disk, events); } /** * disk_clear_events - synchronously check, clear and return pending events * @disk: disk to fetch and clear events from * @mask: mask of events to be fetched and cleared * * Disk events are synchronously checked and pending events in @mask * are cleared and returned. This ignores the block count. * * CONTEXT: * Might sleep. */ static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) { struct disk_events *ev = disk->ev; unsigned int pending; unsigned int clearing = mask; if (!ev) return 0; disk_block_events(disk); /* * store the union of mask and ev->clearing on the stack so that the * race with disk_flush_events does not cause ambiguity (ev->clearing * can still be modified even if events are blocked). */ spin_lock_irq(&ev->lock); clearing |= ev->clearing; ev->clearing = 0; spin_unlock_irq(&ev->lock); disk_check_events(ev, &clearing); /* * if ev->clearing is not 0, the disk_flush_events got called in the * middle of this function, so we want to run the workfn without delay. */ __disk_unblock_events(disk, ev->clearing ? true : false); /* then, fetch and clear pending events */ spin_lock_irq(&ev->lock); pending = ev->pending & mask; ev->pending &= ~mask; spin_unlock_irq(&ev->lock); WARN_ON_ONCE(clearing & mask); return pending; } /** * disk_check_media_change - check if a removable media has been changed * @disk: gendisk to check * * Returns %true and marks the disk for a partition rescan whether a removable * media has been changed, and %false if the media did not change. */ bool disk_check_media_change(struct gendisk *disk) { unsigned int events; events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST); if (events & DISK_EVENT_MEDIA_CHANGE) { set_bit(GD_NEED_PART_SCAN, &disk->state); return true; } return false; } EXPORT_SYMBOL(disk_check_media_change); /** * disk_force_media_change - force a media change event * @disk: the disk which will raise the event * * Should be called when the media changes for @disk. Generates a uevent * and attempts to free all dentries and inodes and invalidates all block * device page cache entries in that case. * * Callers that need a partition re-scan should arrange for one explicitly. 
*/ void disk_force_media_change(struct gendisk *disk) { disk_event_uevent(disk, DISK_EVENT_MEDIA_CHANGE); inc_diskseq(disk); bdev_mark_dead(disk->part0, true); } EXPORT_SYMBOL_GPL(disk_force_media_change); /* * Separate this part out so that a different pointer for clearing_ptr can be * passed in for disk_clear_events. */ static void disk_events_workfn(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct disk_events *ev = container_of(dwork, struct disk_events, dwork); disk_check_events(ev, &ev->clearing); } /* * A disk events enabled device has the following sysfs nodes under * its /sys/block/X/ directory. * * events : list of all supported events * events_async : list of events which can be detected w/o polling * (always empty, only for backwards compatibility) * events_poll_msecs : polling interval, 0: disable, -1: system default */ static ssize_t __disk_events_show(unsigned int events, char *buf) { const char *delim = ""; ssize_t pos = 0; int i; for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++) if (events & (1 << i)) { pos += sprintf(buf + pos, "%s%s", delim, disk_events_strs[i]); delim = " "; } if (pos) pos += sprintf(buf + pos, "\n"); return pos; } static ssize_t disk_events_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); if (!(disk->event_flags & DISK_EVENT_FLAG_UEVENT)) return 0; return __disk_events_show(disk->events, buf); } static ssize_t disk_events_async_show(struct device *dev, struct device_attribute *attr, char *buf) { return 0; } static ssize_t disk_events_poll_msecs_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); if (!disk->ev) return sprintf(buf, "-1\n"); return sprintf(buf, "%ld\n", disk->ev->poll_msecs); } static ssize_t disk_events_poll_msecs_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct gendisk *disk = dev_to_disk(dev); long intv; if (!count || !sscanf(buf, "%ld", &intv)) return -EINVAL; if (intv < 0 && intv != -1) return -EINVAL; if (!disk->ev) return -ENODEV; disk_block_events(disk); disk->ev->poll_msecs = intv; __disk_unblock_events(disk, true); return count; } DEVICE_ATTR(events, 0444, disk_events_show, NULL); DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL); DEVICE_ATTR(events_poll_msecs, 0644, disk_events_poll_msecs_show, disk_events_poll_msecs_store); /* * The default polling interval can be specified by the kernel * parameter block.events_dfl_poll_msecs which defaults to 0 * (disable). This can also be modified runtime by writing to * /sys/module/block/parameters/events_dfl_poll_msecs. */ static int disk_events_set_dfl_poll_msecs(const char *val, const struct kernel_param *kp) { struct disk_events *ev; int ret; ret = param_set_ulong(val, kp); if (ret < 0) return ret; mutex_lock(&disk_events_mutex); list_for_each_entry(ev, &disk_events, node) disk_flush_events(ev->disk, 0); mutex_unlock(&disk_events_mutex); return 0; } static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = { .set = disk_events_set_dfl_poll_msecs, .get = param_get_ulong, }; #undef MODULE_PARAM_PREFIX #define MODULE_PARAM_PREFIX "block." module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, &disk_events_dfl_poll_msecs, 0644); /* * disk_{alloc|add|del|release}_events - initialize and destroy disk_events. 
*/ int disk_alloc_events(struct gendisk *disk) { struct disk_events *ev; if (!disk->fops->check_events || !disk->events) return 0; ev = kzalloc_obj(*ev); if (!ev) { pr_warn("%s: failed to initialize events\n", disk->disk_name); return -ENOMEM; } INIT_LIST_HEAD(&ev->node); ev->disk = disk; spin_lock_init(&ev->lock); mutex_init(&ev->block_mutex); ev->block = 1; ev->poll_msecs = -1; INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); disk->ev = ev; return 0; } void disk_add_events(struct gendisk *disk) { if (!disk->ev) return; mutex_lock(&disk_events_mutex); list_add_tail(&disk->ev->node, &disk_events); mutex_unlock(&disk_events_mutex); /* * Block count is initialized to 1 and the following initial * unblock kicks it into action. */ __disk_unblock_events(disk, true); } void disk_del_events(struct gendisk *disk) { if (disk->ev) { disk_block_events(disk); mutex_lock(&disk_events_mutex); list_del_init(&disk->ev->node); mutex_unlock(&disk_events_mutex); } } void disk_release_events(struct gendisk *disk) { /* the block count should be 1 from disk_del_events() */ WARN_ON_ONCE(disk->ev && disk->ev->block != 1); kfree(disk->ev); } |
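/*
 * A minimal wiring sketch, not taken from the file above: how a block driver
 * might hook into the disk-events machinery. The example_ names and the
 * example_media_changed() helper are hypothetical; the check_events
 * prototype, DISK_EVENT_* bits and event_flags usage match the code above.
 */
#include <linux/blkdev.h>

static bool example_media_changed(struct gendisk *disk);	/* hypothetical */

static unsigned int example_check_events(struct gendisk *disk,
					 unsigned int clearing)
{
	/* Only events listed in disk->events are ever forwarded to userland. */
	return example_media_changed(disk) ? DISK_EVENT_MEDIA_CHANGE : 0;
}

static const struct block_device_operations example_fops = {
	.owner		= THIS_MODULE,
	.check_events	= example_check_events,
};

/* Called while setting up the gendisk, before add_disk(). */
static void example_setup_disk_events(struct gendisk *disk)
{
	disk->fops = &example_fops;
	/* Advertise which events the driver can report at all. */
	disk->events = DISK_EVENT_MEDIA_CHANGE;
	/* Let the core poll (events_poll_msecs) and emit uevents. */
	disk->event_flags = DISK_EVENT_FLAG_POLL | DISK_EVENT_FLAG_UEVENT;
}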
// SPDX-License-Identifier: GPL-2.0 /* netfilter.c: look after the filters for various protocols. * Heavily influenced by the old firewall.c by David Bonn and Alan Cox. * * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any * way.
*/ #include <linux/kernel.h> #include <linux/netfilter.h> #include <net/protocol.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/wait.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/if.h> #include <linux/netdevice.h> #include <linux/netfilter_ipv6.h> #include <linux/inetdevice.h> #include <linux/proc_fs.h> #include <linux/mutex.h> #include <linux/mm.h> #include <linux/rcupdate.h> #include <net/net_namespace.h> #include <net/netfilter/nf_queue.h> #include <net/sock.h> #include "nf_internals.h" #ifdef CONFIG_JUMP_LABEL struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); #endif static DEFINE_MUTEX(nf_hook_mutex); /* max hooks per family/hooknum */ #define MAX_HOOK_COUNT 1024 #define nf_entry_dereference(e) \ rcu_dereference_protected(e, lockdep_is_held(&nf_hook_mutex)) static struct nf_hook_entries *allocate_hook_entries_size(u16 num) { struct nf_hook_entries *e; size_t alloc = sizeof(*e) + sizeof(struct nf_hook_entry) * num + sizeof(struct nf_hook_ops *) * num + sizeof(struct nf_hook_entries_rcu_head); if (num == 0) return NULL; e = kvzalloc(alloc, GFP_KERNEL_ACCOUNT); if (e) e->num_hook_entries = num; return e; } static void __nf_hook_entries_free(struct rcu_head *h) { struct nf_hook_entries_rcu_head *head; head = container_of(h, struct nf_hook_entries_rcu_head, head); kvfree(head->allocation); } static void nf_hook_entries_free(struct nf_hook_entries *e) { struct nf_hook_entries_rcu_head *head; struct nf_hook_ops **ops; unsigned int num; if (!e) return; num = e->num_hook_entries; ops = nf_hook_entries_get_hook_ops(e); head = (void *)&ops[num]; head->allocation = e; call_rcu(&head->head, __nf_hook_entries_free); } static unsigned int accept_all(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { return NF_ACCEPT; /* ACCEPT makes nf_hook_slow call next hook */ } static const struct nf_hook_ops dummy_ops = { .hook = accept_all, .priority = INT_MIN, }; static struct nf_hook_entries * nf_hook_entries_grow(const struct nf_hook_entries *old, const struct nf_hook_ops *reg) { unsigned int i, alloc_entries, nhooks, old_entries; struct nf_hook_ops **orig_ops = NULL; struct nf_hook_ops **new_ops; struct nf_hook_entries *new; bool inserted = false; alloc_entries = 1; old_entries = old ? old->num_hook_entries : 0; if (old) { orig_ops = nf_hook_entries_get_hook_ops(old); for (i = 0; i < old_entries; i++) { if (orig_ops[i] != &dummy_ops) alloc_entries++; /* Restrict BPF hook type to force a unique priority, not * shared at attach time. * * This is mainly to avoid ordering issues between two * different bpf programs, this doesn't prevent a normal * hook at same priority as a bpf one (we don't want to * prevent defrag, conntrack, iptables etc from attaching). 
*/ if (reg->priority == orig_ops[i]->priority && reg->hook_ops_type == NF_HOOK_OP_BPF) return ERR_PTR(-EBUSY); } } if (alloc_entries > MAX_HOOK_COUNT) return ERR_PTR(-E2BIG); new = allocate_hook_entries_size(alloc_entries); if (!new) return ERR_PTR(-ENOMEM); new_ops = nf_hook_entries_get_hook_ops(new); i = 0; nhooks = 0; while (i < old_entries) { if (orig_ops[i] == &dummy_ops) { ++i; continue; } if (inserted || reg->priority > orig_ops[i]->priority) { new_ops[nhooks] = (void *)orig_ops[i]; new->hooks[nhooks] = old->hooks[i]; i++; } else { new_ops[nhooks] = (void *)reg; new->hooks[nhooks].hook = reg->hook; new->hooks[nhooks].priv = reg->priv; inserted = true; } nhooks++; } if (!inserted) { new_ops[nhooks] = (void *)reg; new->hooks[nhooks].hook = reg->hook; new->hooks[nhooks].priv = reg->priv; } return new; } static void hooks_validate(const struct nf_hook_entries *hooks) { #ifdef CONFIG_DEBUG_MISC struct nf_hook_ops **orig_ops; int prio = INT_MIN; size_t i = 0; orig_ops = nf_hook_entries_get_hook_ops(hooks); for (i = 0; i < hooks->num_hook_entries; i++) { if (orig_ops[i] == &dummy_ops) continue; WARN_ON(orig_ops[i]->priority < prio); if (orig_ops[i]->priority > prio) prio = orig_ops[i]->priority; } #endif } int nf_hook_entries_insert_raw(struct nf_hook_entries __rcu **pp, const struct nf_hook_ops *reg) { struct nf_hook_entries *new_hooks; struct nf_hook_entries *p; p = rcu_dereference_raw(*pp); new_hooks = nf_hook_entries_grow(p, reg); if (IS_ERR(new_hooks)) return PTR_ERR(new_hooks); hooks_validate(new_hooks); rcu_assign_pointer(*pp, new_hooks); BUG_ON(p == new_hooks); nf_hook_entries_free(p); return 0; } EXPORT_SYMBOL_GPL(nf_hook_entries_insert_raw); /* * __nf_hook_entries_try_shrink - try to shrink hook array * * @old -- current hook blob at @pp * @pp -- location of hook blob * * Hook unregistration must always succeed, so to-be-removed hooks * are replaced by a dummy one that will just move to next hook. * * This counts the current dummy hooks, attempts to allocate new blob, * copies the live hooks, then replaces and discards old one. * * return values: * * Returns address to free, or NULL. 
*/ static void *__nf_hook_entries_try_shrink(struct nf_hook_entries *old, struct nf_hook_entries __rcu **pp) { unsigned int i, j, skip = 0, hook_entries; struct nf_hook_entries *new = NULL; struct nf_hook_ops **orig_ops; struct nf_hook_ops **new_ops; if (WARN_ON_ONCE(!old)) return NULL; orig_ops = nf_hook_entries_get_hook_ops(old); for (i = 0; i < old->num_hook_entries; i++) { if (orig_ops[i] == &dummy_ops) skip++; } /* if skip == hook_entries all hooks have been removed */ hook_entries = old->num_hook_entries; if (skip == hook_entries) goto out_assign; if (skip == 0) return NULL; hook_entries -= skip; new = allocate_hook_entries_size(hook_entries); if (!new) return NULL; new_ops = nf_hook_entries_get_hook_ops(new); for (i = 0, j = 0; i < old->num_hook_entries; i++) { if (orig_ops[i] == &dummy_ops) continue; new->hooks[j] = old->hooks[i]; new_ops[j] = (void *)orig_ops[i]; j++; } hooks_validate(new); out_assign: rcu_assign_pointer(*pp, new); return old; } static struct nf_hook_entries __rcu ** nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum, struct net_device *dev) { switch (pf) { case NFPROTO_NETDEV: break; #ifdef CONFIG_NETFILTER_FAMILY_ARP case NFPROTO_ARP: if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_arp) <= hooknum)) return NULL; return net->nf.hooks_arp + hooknum; #endif #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE case NFPROTO_BRIDGE: if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_bridge) <= hooknum)) return NULL; return net->nf.hooks_bridge + hooknum; #endif #ifdef CONFIG_NETFILTER_INGRESS case NFPROTO_INET: if (WARN_ON_ONCE(hooknum != NF_INET_INGRESS)) return NULL; if (!dev || dev_net(dev) != net) { WARN_ON_ONCE(1); return NULL; } return &dev->nf_hooks_ingress; #endif case NFPROTO_IPV4: if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv4) <= hooknum)) return NULL; return net->nf.hooks_ipv4 + hooknum; case NFPROTO_IPV6: if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv6) <= hooknum)) return NULL; return net->nf.hooks_ipv6 + hooknum; default: WARN_ON_ONCE(1); return NULL; } #ifdef CONFIG_NETFILTER_INGRESS if (hooknum == NF_NETDEV_INGRESS) { if (dev && dev_net(dev) == net) return &dev->nf_hooks_ingress; } #endif #ifdef CONFIG_NETFILTER_EGRESS if (hooknum == NF_NETDEV_EGRESS) { if (dev && dev_net(dev) == net) return &dev->nf_hooks_egress; } #endif WARN_ON_ONCE(1); return NULL; } static int nf_ingress_check(struct net *net, const struct nf_hook_ops *reg, int hooknum) { #ifndef CONFIG_NETFILTER_INGRESS if (reg->hooknum == hooknum) return -EOPNOTSUPP; #endif if (reg->hooknum != hooknum || !reg->dev || dev_net(reg->dev) != net) return -EINVAL; return 0; } static inline bool __maybe_unused nf_ingress_hook(const struct nf_hook_ops *reg, int pf) { if ((pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) || (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS)) return true; return false; } static inline bool __maybe_unused nf_egress_hook(const struct nf_hook_ops *reg, int pf) { return pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_EGRESS; } static void nf_static_key_inc(const struct nf_hook_ops *reg, int pf) { #ifdef CONFIG_JUMP_LABEL int hooknum; if (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS) { pf = NFPROTO_NETDEV; hooknum = NF_NETDEV_INGRESS; } else { hooknum = reg->hooknum; } static_key_slow_inc(&nf_hooks_needed[pf][hooknum]); #endif } static void nf_static_key_dec(const struct nf_hook_ops *reg, int pf) { #ifdef CONFIG_JUMP_LABEL int hooknum; if (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS) { pf = NFPROTO_NETDEV; hooknum = NF_NETDEV_INGRESS; } else { hooknum = 
reg->hooknum; } static_key_slow_dec(&nf_hooks_needed[pf][hooknum]); #endif } static int __nf_register_net_hook(struct net *net, int pf, const struct nf_hook_ops *reg) { struct nf_hook_entries *p, *new_hooks; struct nf_hook_entries __rcu **pp; int err; switch (pf) { case NFPROTO_NETDEV: #ifndef CONFIG_NETFILTER_INGRESS if (reg->hooknum == NF_NETDEV_INGRESS) return -EOPNOTSUPP; #endif #ifndef CONFIG_NETFILTER_EGRESS if (reg->hooknum == NF_NETDEV_EGRESS) return -EOPNOTSUPP; #endif if ((reg->hooknum != NF_NETDEV_INGRESS && reg->hooknum != NF_NETDEV_EGRESS) || !reg->dev || dev_net(reg->dev) != net) return -EINVAL; break; case NFPROTO_INET: if (reg->hooknum != NF_INET_INGRESS) break; err = nf_ingress_check(net, reg, NF_INET_INGRESS); if (err < 0) return err; break; } pp = nf_hook_entry_head(net, pf, reg->hooknum, reg->dev); if (!pp) return -EINVAL; mutex_lock(&nf_hook_mutex); p = nf_entry_dereference(*pp); new_hooks = nf_hook_entries_grow(p, reg); if (!IS_ERR(new_hooks)) { hooks_validate(new_hooks); rcu_assign_pointer(*pp, new_hooks); } mutex_unlock(&nf_hook_mutex); if (IS_ERR(new_hooks)) return PTR_ERR(new_hooks); #ifdef CONFIG_NETFILTER_INGRESS if (nf_ingress_hook(reg, pf)) net_inc_ingress_queue(); #endif #ifdef CONFIG_NETFILTER_EGRESS if (nf_egress_hook(reg, pf)) net_inc_egress_queue(); #endif nf_static_key_inc(reg, pf); BUG_ON(p == new_hooks); nf_hook_entries_free(p); return 0; } /* * nf_remove_net_hook - remove a hook from blob * * @oldp: current address of hook blob * @unreg: hook to unregister * * This cannot fail, hook unregistration must always succeed. * Therefore replace the to-be-removed hook with a dummy hook. */ static bool nf_remove_net_hook(struct nf_hook_entries *old, const struct nf_hook_ops *unreg) { struct nf_hook_ops **orig_ops; unsigned int i; orig_ops = nf_hook_entries_get_hook_ops(old); for (i = 0; i < old->num_hook_entries; i++) { if (orig_ops[i] != unreg) continue; WRITE_ONCE(old->hooks[i].hook, accept_all); WRITE_ONCE(orig_ops[i], (void *)&dummy_ops); return true; } return false; } static void __nf_unregister_net_hook(struct net *net, int pf, const struct nf_hook_ops *reg) { struct nf_hook_entries __rcu **pp; struct nf_hook_entries *p; pp = nf_hook_entry_head(net, pf, reg->hooknum, reg->dev); if (!pp) return; mutex_lock(&nf_hook_mutex); p = nf_entry_dereference(*pp); if (WARN_ON_ONCE(!p)) { mutex_unlock(&nf_hook_mutex); return; } if (nf_remove_net_hook(p, reg)) { #ifdef CONFIG_NETFILTER_INGRESS if (nf_ingress_hook(reg, pf)) net_dec_ingress_queue(); #endif #ifdef CONFIG_NETFILTER_EGRESS if (nf_egress_hook(reg, pf)) net_dec_egress_queue(); #endif nf_static_key_dec(reg, pf); } else { WARN_ONCE(1, "hook not found, pf %d num %d", pf, reg->hooknum); } p = __nf_hook_entries_try_shrink(p, pp); mutex_unlock(&nf_hook_mutex); if (!p) return; nf_queue_nf_hook_drop(net); nf_hook_entries_free(p); } void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) { if (reg->pf == NFPROTO_INET) { if (reg->hooknum == NF_INET_INGRESS) { __nf_unregister_net_hook(net, NFPROTO_INET, reg); } else { __nf_unregister_net_hook(net, NFPROTO_IPV4, reg); __nf_unregister_net_hook(net, NFPROTO_IPV6, reg); } } else { __nf_unregister_net_hook(net, reg->pf, reg); } } EXPORT_SYMBOL(nf_unregister_net_hook); void nf_hook_entries_delete_raw(struct nf_hook_entries __rcu **pp, const struct nf_hook_ops *reg) { struct nf_hook_entries *p; p = rcu_dereference_raw(*pp); if (nf_remove_net_hook(p, reg)) { p = __nf_hook_entries_try_shrink(p, pp); nf_hook_entries_free(p); } } 
EXPORT_SYMBOL_GPL(nf_hook_entries_delete_raw); int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) { int err; if (reg->pf == NFPROTO_INET) { if (reg->hooknum == NF_INET_INGRESS) { err = __nf_register_net_hook(net, NFPROTO_INET, reg); if (err < 0) return err; } else { err = __nf_register_net_hook(net, NFPROTO_IPV4, reg); if (err < 0) return err; err = __nf_register_net_hook(net, NFPROTO_IPV6, reg); if (err < 0) { __nf_unregister_net_hook(net, NFPROTO_IPV4, reg); return err; } } } else { err = __nf_register_net_hook(net, reg->pf, reg); if (err < 0) return err; } return 0; } EXPORT_SYMBOL(nf_register_net_hook); int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg, unsigned int n) { unsigned int i; int err = 0; for (i = 0; i < n; i++) { err = nf_register_net_hook(net, ®[i]); if (err) goto err; } return err; err: if (i > 0) nf_unregister_net_hooks(net, reg, i); return err; } EXPORT_SYMBOL(nf_register_net_hooks); void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, unsigned int hookcount) { unsigned int i; for (i = 0; i < hookcount; i++) nf_unregister_net_hook(net, ®[i]); } EXPORT_SYMBOL(nf_unregister_net_hooks); /* Returns 1 if okfn() needs to be executed by the caller, * -EPERM for NF_DROP, 0 otherwise. Caller must hold rcu_read_lock. */ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, const struct nf_hook_entries *e, unsigned int s) { unsigned int verdict; int ret; for (; s < e->num_hook_entries; s++) { verdict = nf_hook_entry_hookfn(&e->hooks[s], skb, state); switch (verdict & NF_VERDICT_MASK) { case NF_ACCEPT: break; case NF_DROP: kfree_skb_reason(skb, SKB_DROP_REASON_NETFILTER_DROP); ret = NF_DROP_GETERR(verdict); if (ret == 0) ret = -EPERM; return ret; case NF_QUEUE: ret = nf_queue(skb, state, s, verdict); if (ret == 1) continue; return ret; case NF_STOLEN: return NF_DROP_GETERR(verdict); default: WARN_ON_ONCE(1); return 0; } } return 1; } EXPORT_SYMBOL(nf_hook_slow); void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state, const struct nf_hook_entries *e) { struct sk_buff *skb, *next; LIST_HEAD(sublist); int ret; list_for_each_entry_safe(skb, next, head, list) { skb_list_del_init(skb); ret = nf_hook_slow(skb, state, e, 0); if (ret == 1) list_add_tail(&skb->list, &sublist); } /* Put passed packets back on main list */ list_splice(&sublist, head); } EXPORT_SYMBOL(nf_hook_slow_list); /* This needs to be compiled in any case to avoid dependencies between the * nfnetlink_queue code and nf_conntrack. */ const struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly; EXPORT_SYMBOL_GPL(nfnl_ct_hook); const struct nf_ct_hook __rcu *nf_ct_hook __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_hook); const struct nf_defrag_hook __rcu *nf_defrag_v4_hook __read_mostly; EXPORT_SYMBOL_GPL(nf_defrag_v4_hook); const struct nf_defrag_hook __rcu *nf_defrag_v6_hook __read_mostly; EXPORT_SYMBOL_GPL(nf_defrag_v6_hook); #if IS_ENABLED(CONFIG_NF_CONNTRACK) u8 nf_ctnetlink_has_listener; EXPORT_SYMBOL_GPL(nf_ctnetlink_has_listener); const struct nf_nat_hook __rcu *nf_nat_hook __read_mostly; EXPORT_SYMBOL_GPL(nf_nat_hook); /* This does not belong here, but locally generated errors need it if connection * tracking in use: without this, connection may not be in hash table, and hence * manufactured ICMP or RST packets will not be associated with it. 
*/ void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb) { const struct nf_ct_hook *ct_hook; if (skb->_nfct) { rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); if (ct_hook) ct_hook->attach(new, skb); rcu_read_unlock(); } } EXPORT_SYMBOL(nf_ct_attach); void nf_conntrack_destroy(struct nf_conntrack *nfct) { const struct nf_ct_hook *ct_hook; rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); if (ct_hook) ct_hook->destroy(nfct); rcu_read_unlock(); WARN_ON(!ct_hook); } EXPORT_SYMBOL(nf_conntrack_destroy); void nf_ct_set_closing(struct nf_conntrack *nfct) { const struct nf_ct_hook *ct_hook; if (!nfct) return; rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); if (ct_hook) ct_hook->set_closing(nfct); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(nf_ct_set_closing); bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb) { const struct nf_ct_hook *ct_hook; bool ret = false; rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); if (ct_hook) ret = ct_hook->get_tuple_skb(dst_tuple, skb); rcu_read_unlock(); return ret; } EXPORT_SYMBOL(nf_ct_get_tuple_skb); /* Built-in default zone used e.g. by modules. */ const struct nf_conntrack_zone nf_ct_zone_dflt = { .id = NF_CT_DEFAULT_ZONE_ID, .dir = NF_CT_DEFAULT_ZONE_DIR, }; EXPORT_SYMBOL_GPL(nf_ct_zone_dflt); #endif /* CONFIG_NF_CONNTRACK */ static void __net_init __netfilter_net_init(struct nf_hook_entries __rcu **e, int max) { int h; for (h = 0; h < max; h++) RCU_INIT_POINTER(e[h], NULL); } static int __net_init netfilter_net_init(struct net *net) { __netfilter_net_init(net->nf.hooks_ipv4, ARRAY_SIZE(net->nf.hooks_ipv4)); __netfilter_net_init(net->nf.hooks_ipv6, ARRAY_SIZE(net->nf.hooks_ipv6)); #ifdef CONFIG_NETFILTER_FAMILY_ARP __netfilter_net_init(net->nf.hooks_arp, ARRAY_SIZE(net->nf.hooks_arp)); #endif #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE __netfilter_net_init(net->nf.hooks_bridge, ARRAY_SIZE(net->nf.hooks_bridge)); #endif #ifdef CONFIG_PROC_FS net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter", net->proc_net); if (!net->nf.proc_netfilter) { if (!net_eq(net, &init_net)) pr_err("cannot create netfilter proc entry"); return -ENOMEM; } #endif return 0; } static void __net_exit netfilter_net_exit(struct net *net) { remove_proc_entry("netfilter", net->proc_net); } static struct pernet_operations netfilter_net_ops = { .init = netfilter_net_init, .exit = netfilter_net_exit, }; int __init netfilter_init(void) { int ret; ret = register_pernet_subsys(&netfilter_net_ops); if (ret < 0) goto err; #ifdef CONFIG_LWTUNNEL ret = netfilter_lwtunnel_init(); if (ret < 0) goto err_lwtunnel_pernet; #endif ret = netfilter_log_init(); if (ret < 0) goto err_log_pernet; return 0; err_log_pernet: #ifdef CONFIG_LWTUNNEL netfilter_lwtunnel_fini(); err_lwtunnel_pernet: #endif unregister_pernet_subsys(&netfilter_net_ops); err: return ret; } |
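/*
 * Illustrative usage sketch (not part of netfilter.c above): a minimal
 * module that registers one IPv4 hook through the registration API
 * implemented above (nf_register_net_hook/nf_unregister_net_hook).
 * The example_* names are hypothetical.
 */
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <net/net_namespace.h>

static unsigned int example_hook(void *priv, struct sk_buff *skb,
				 const struct nf_hook_state *state)
{
	/* Returning NF_ACCEPT makes nf_hook_slow() move on to the next hook. */
	return NF_ACCEPT;
}

static struct nf_hook_ops example_ops = {
	.hook		= example_hook,
	.pf		= NFPROTO_IPV4,
	.hooknum	= NF_INET_LOCAL_IN,
	.priority	= NF_IP_PRI_FIRST,
};

static int __init example_init(void)
{
	/* Inserts the hook into init_net's hooks_ipv4[NF_INET_LOCAL_IN] blob. */
	return nf_register_net_hook(&init_net, &example_ops);
}

static void __exit example_exit(void)
{
	nf_unregister_net_hook(&init_net, &example_ops);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");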
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 | /* SPDX-License-Identifier: GPL-2.0 */ /* File: linux/posix_acl.h (C) 2002 Andreas Gruenbacher, <a.gruenbacher@computer.org> */ #ifndef __LINUX_POSIX_ACL_H #define __LINUX_POSIX_ACL_H #include <linux/bug.h> #include <linux/slab.h> #include <linux/rcupdate.h> #include <linux/refcount.h> #include <uapi/linux/posix_acl.h> struct user_namespace; struct posix_acl_entry { short e_tag; unsigned short e_perm; union { kuid_t e_uid; kgid_t e_gid; }; }; struct posix_acl { /* New members MUST be added within the struct_group() macro below. */ struct_group_tagged(posix_acl_hdr, hdr, refcount_t a_refcount; unsigned int a_count; struct rcu_head a_rcu; ); struct posix_acl_entry a_entries[] __counted_by(a_count); }; static_assert(offsetof(struct posix_acl, a_entries) == sizeof(struct posix_acl_hdr), "struct member likely outside of struct_group_tagged()"); #define FOREACH_ACL_ENTRY(pa, acl, pe) \ for(pa=(acl)->a_entries, pe=pa+(acl)->a_count; pa<pe; pa++) /* * Duplicate an ACL handle. */ static inline struct posix_acl * posix_acl_dup(struct posix_acl *acl) { if (acl) refcount_inc(&acl->a_refcount); return acl; } /* * Free an ACL handle. */ static inline void posix_acl_release(struct posix_acl *acl) { if (acl && refcount_dec_and_test(&acl->a_refcount)) kfree_rcu(acl, a_rcu); } /* posix_acl.c */ extern void posix_acl_init(struct posix_acl *, int); extern struct posix_acl *posix_acl_alloc(unsigned int count, gfp_t flags); extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t); extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *); extern int __posix_acl_create(struct posix_acl **, gfp_t, umode_t *); extern int __posix_acl_chmod(struct posix_acl **, gfp_t, umode_t); extern struct posix_acl *get_posix_acl(struct inode *, int); int set_posix_acl(struct mnt_idmap *, struct dentry *, int, struct posix_acl *); struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type); struct posix_acl *posix_acl_clone(const struct posix_acl *acl, gfp_t flags); #ifdef CONFIG_FS_POSIX_ACL int posix_acl_chmod(struct mnt_idmap *, struct dentry *, umode_t); extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **, struct posix_acl **); int posix_acl_update_mode(struct mnt_idmap *, struct inode *, umode_t *, struct posix_acl **); int simple_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); extern int simple_acl_create(struct inode *, struct inode *); struct posix_acl *get_cached_acl(struct inode *inode, int type); void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl); void forget_cached_acl(struct inode *inode, int type); void forget_all_cached_acls(struct inode *inode); int posix_acl_valid(struct user_namespace *, const struct posix_acl *); int posix_acl_permission(struct mnt_idmap *, struct inode *, const struct posix_acl *, int); static inline void cache_no_acl(struct inode *inode) { inode->i_acl = NULL; inode->i_default_acl = NULL; } int vfs_set_acl(struct mnt_idmap 
*idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl); struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name); int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name); int posix_acl_listxattr(struct inode *inode, char **buffer, ssize_t *remaining_size); #else static inline int posix_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode) { return 0; } #define simple_set_acl NULL static inline int simple_acl_create(struct inode *dir, struct inode *inode) { return 0; } static inline void cache_no_acl(struct inode *inode) { } static inline int posix_acl_create(struct inode *inode, umode_t *mode, struct posix_acl **default_acl, struct posix_acl **acl) { *default_acl = *acl = NULL; return 0; } static inline void forget_all_cached_acls(struct inode *inode) { } static inline int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct posix_acl *acl) { return -EOPNOTSUPP; } static inline struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return ERR_PTR(-EOPNOTSUPP); } static inline int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return -EOPNOTSUPP; } static inline int posix_acl_listxattr(struct inode *inode, char **buffer, ssize_t *remaining_size) { return 0; } #endif /* CONFIG_FS_POSIX_ACL */ struct posix_acl *get_inode_acl(struct inode *inode, int type); #endif /* __LINUX_POSIX_ACL_H */ |
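/*
 * Illustrative usage sketch (not part of posix_acl.h above): allocate a
 * three-entry ACL with posix_acl_alloc(), walk it with FOREACH_ACL_ENTRY(),
 * and hand the single reference back to the caller, which is expected to
 * drop it with posix_acl_release(). The helper name example_build_acl is
 * hypothetical.
 */
#include <linux/posix_acl.h>

static struct posix_acl *example_build_acl(void)
{
	struct posix_acl *acl;
	struct posix_acl_entry *pa, *pe;

	/* posix_acl_alloc() returns an ACL with a_refcount == 1, or NULL. */
	acl = posix_acl_alloc(3, GFP_KERNEL);
	if (!acl)
		return NULL;

	acl->a_entries[0].e_tag  = ACL_USER_OBJ;
	acl->a_entries[0].e_perm = ACL_READ | ACL_WRITE;
	acl->a_entries[1].e_tag  = ACL_GROUP_OBJ;
	acl->a_entries[1].e_perm = ACL_READ;
	acl->a_entries[2].e_tag  = ACL_OTHER;
	acl->a_entries[2].e_perm = ACL_READ;

	/* pa/pe bound the a_entries[] array for the iteration. */
	FOREACH_ACL_ENTRY(pa, acl, pe) {
		if (pa->e_tag == ACL_OTHER)
			pa->e_perm &= ~ACL_WRITE;
	}

	return acl;	/* caller releases with posix_acl_release() */
}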
| 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 | // SPDX-License-Identifier: GPL-2.0 /* * USB Serial Converter Bus specific functions * * Copyright (C) 2002 Greg Kroah-Hartman (greg@kroah.com) */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/tty.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/usb.h> #include <linux/usb/serial.h> static int usb_serial_device_match(struct device *dev, const struct device_driver *drv) { const struct usb_serial_port *port = to_usb_serial_port(dev); const struct usb_serial_driver *driver = to_usb_serial_driver(drv); /* * drivers are already assigned to ports in serial_probe so it's * a simple check here. */ if (driver == port->serial->type) return 1; return 0; } static int usb_serial_device_probe(struct device *dev) { struct usb_serial_port *port = to_usb_serial_port(dev); struct usb_serial_driver *driver; struct device *tty_dev; int retval = 0; int minor; /* make sure suspend/resume doesn't race against port_probe */ retval = usb_autopm_get_interface(port->serial->interface); if (retval) return retval; driver = port->serial->type; if (driver->port_probe) { retval = driver->port_probe(port); if (retval) goto err_autopm_put; } minor = port->minor; tty_dev = tty_port_register_device(&port->port, usb_serial_tty_driver, minor, dev); if (IS_ERR(tty_dev)) { retval = PTR_ERR(tty_dev); goto err_port_remove; } usb_autopm_put_interface(port->serial->interface); dev_info(&port->serial->dev->dev, "%s converter now attached to ttyUSB%d\n", driver->description, minor); return 0; err_port_remove: if (driver->port_remove) driver->port_remove(port); err_autopm_put: usb_autopm_put_interface(port->serial->interface); return retval; } static void usb_serial_device_remove(struct device *dev) { struct usb_serial_port *port = to_usb_serial_port(dev); struct usb_serial_driver *driver; int minor; int autopm_err; /* * Make sure suspend/resume doesn't race against port_remove. * * Note that no further runtime PM callbacks will be made if * autopm_get fails. 
*/ autopm_err = usb_autopm_get_interface(port->serial->interface); minor = port->minor; tty_unregister_device(usb_serial_tty_driver, minor); driver = port->serial->type; if (driver->port_remove) driver->port_remove(port); dev_info(dev, "%s converter now disconnected from ttyUSB%d\n", driver->description, minor); if (!autopm_err) usb_autopm_put_interface(port->serial->interface); } static ssize_t new_id_store(struct device_driver *driver, const char *buf, size_t count) { struct usb_serial_driver *usb_drv = to_usb_serial_driver(driver); ssize_t retval = usb_store_new_id(&usb_drv->dynids, usb_drv->id_table, driver, buf, count); if (retval >= 0 && usb_drv->usb_driver != NULL) retval = usb_store_new_id(&usb_drv->usb_driver->dynids, usb_drv->usb_driver->id_table, &usb_drv->usb_driver->driver, buf, count); return retval; } static ssize_t new_id_show(struct device_driver *driver, char *buf) { struct usb_serial_driver *usb_drv = to_usb_serial_driver(driver); return usb_show_dynids(&usb_drv->dynids, buf); } static DRIVER_ATTR_RW(new_id); static struct attribute *usb_serial_drv_attrs[] = { &driver_attr_new_id.attr, NULL, }; ATTRIBUTE_GROUPS(usb_serial_drv); static void free_dynids(struct usb_serial_driver *drv) { struct usb_dynid *dynid, *n; guard(mutex)(&usb_dynids_lock); list_for_each_entry_safe(dynid, n, &drv->dynids.list, node) { list_del(&dynid->node); kfree(dynid); } } const struct bus_type usb_serial_bus_type = { .name = "usb-serial", .match = usb_serial_device_match, .probe = usb_serial_device_probe, .remove = usb_serial_device_remove, .drv_groups = usb_serial_drv_groups, }; int usb_serial_bus_register(struct usb_serial_driver *driver) { int retval; driver->driver.bus = &usb_serial_bus_type; INIT_LIST_HEAD(&driver->dynids.list); retval = driver_register(&driver->driver); return retval; } void usb_serial_bus_deregister(struct usb_serial_driver *driver) { free_dynids(driver); driver_unregister(&driver->driver); } |
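/*
 * Illustrative usage sketch (not part of bus.c above): a minimal usb-serial
 * driver whose ports are matched, probed and removed by the
 * usb_serial_bus_type callbacks above. The vendor/product IDs and the
 * example_* names are hypothetical placeholders.
 */
#include <linux/module.h>
#include <linux/usb.h>
#include <linux/usb/serial.h>

static const struct usb_device_id example_id_table[] = {
	{ USB_DEVICE(0x1234, 0x5678) },	/* hypothetical IDs */
	{ }
};
MODULE_DEVICE_TABLE(usb, example_id_table);

static int example_port_probe(struct usb_serial_port *port)
{
	/* Called from usb_serial_device_probe() before the tty device is registered. */
	return 0;
}

static void example_port_remove(struct usb_serial_port *port)
{
	/* Called from usb_serial_device_remove() after the tty device is unregistered. */
}

static struct usb_serial_driver example_device = {
	.driver = {
		.name	= "example",
	},
	.id_table	= example_id_table,
	.num_ports	= 1,
	.port_probe	= example_port_probe,
	.port_remove	= example_port_remove,
};

static struct usb_serial_driver * const serial_drivers[] = {
	&example_device, NULL
};

module_usb_serial_driver(serial_drivers, example_id_table);
MODULE_LICENSE("GPL");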
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RCULIST_H
#define _LINUX_RCULIST_H

#ifdef __KERNEL__

/*
 * RCU-protected list version
 */
#include <linux/list.h>
#include
<linux/rcupdate.h> /* * INIT_LIST_HEAD_RCU - Initialize a list_head visible to RCU readers * @list: list to be initialized * * You should instead use INIT_LIST_HEAD() for normal initialization and * cleanup tasks, when readers have no access to the list being initialized. * However, if the list being initialized is visible to readers, you * need to keep the compiler from being too mischievous. */ static inline void INIT_LIST_HEAD_RCU(struct list_head *list) { WRITE_ONCE(list->next, list); WRITE_ONCE(list->prev, list); } /* * return the ->next pointer of a list_head in an rcu safe * way, we must not access it directly */ #define list_next_rcu(list) (*((struct list_head __rcu **)(&(list)->next))) /* * Return the ->prev pointer of a list_head in an rcu safe way. Don't * access it directly. * * Any list traversed with list_bidir_prev_rcu() must never use * list_del_rcu(). Doing so will poison the ->prev pointer that * list_bidir_prev_rcu() relies on, which will result in segfaults. * To prevent these segfaults, use list_bidir_del_rcu() instead * of list_del_rcu(). */ #define list_bidir_prev_rcu(list) (*((struct list_head __rcu **)(&(list)->prev))) /** * list_for_each_rcu - Iterate over a list in an RCU-safe fashion * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each_rcu(pos, head) \ for (pos = rcu_dereference((head)->next); \ !list_is_head(pos, (head)); \ pos = rcu_dereference(pos->next)) /** * list_tail_rcu - returns the prev pointer of the head of the list * @head: the head of the list * * Note: This should only be used with the list header, and even then * only if list_del() and similar primitives are not also used on the * list header. */ #define list_tail_rcu(head) (*((struct list_head __rcu **)(&(head)->prev))) /* * Check during list traversal that we are within an RCU reader */ #define check_arg_count_one(dummy) #ifdef CONFIG_PROVE_RCU_LIST #define __list_check_rcu(dummy, cond, extra...) \ ({ \ check_arg_count_one(extra); \ RCU_LOCKDEP_WARN(!(cond) && !rcu_read_lock_any_held(), \ "RCU-list traversed in non-reader section!"); \ }) #define __list_check_srcu(cond) \ ({ \ RCU_LOCKDEP_WARN(!(cond), \ "RCU-list traversed without holding the required lock!");\ }) #else #define __list_check_rcu(dummy, cond, extra...) \ ({ check_arg_count_one(extra); }) #define __list_check_srcu(cond) ({ }) #endif /* * Insert a new entry between two known consecutive entries. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_add_rcu(struct list_head *new, struct list_head *prev, struct list_head *next) { if (!__list_add_valid(new, prev, next)) return; new->next = next; new->prev = prev; rcu_assign_pointer(list_next_rcu(prev), new); next->prev = new; } /** * list_add_rcu - add a new entry to rcu-protected list * @new: new entry to be added * @head: list head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as list_add_rcu() * or list_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * list_for_each_entry_rcu(). 
*/ static inline void list_add_rcu(struct list_head *new, struct list_head *head) { __list_add_rcu(new, head, head->next); } /** * list_add_tail_rcu - add a new entry to rcu-protected list * @new: new entry to be added * @head: list head to add it before * * Insert a new entry before the specified head. * This is useful for implementing queues. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as list_add_tail_rcu() * or list_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * list_for_each_entry_rcu(). */ static inline void list_add_tail_rcu(struct list_head *new, struct list_head *head) { __list_add_rcu(new, head->prev, head); } /** * list_del_rcu - deletes entry from list without re-initialization * @entry: the element to delete from the list. * * Note: list_empty() on entry does not return true after this, * the entry is in an undefined state. It is useful for RCU based * lockfree traversal. * * In particular, it means that we can not poison the forward * pointers that may still be used for walking the list. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as list_del_rcu() * or list_add_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * list_for_each_entry_rcu(). * * Note that the caller is not permitted to immediately free * the newly deleted entry. Instead, either synchronize_rcu() * or call_rcu() must be used to defer freeing until an RCU * grace period has elapsed. */ static inline void list_del_rcu(struct list_head *entry) { __list_del_entry(entry); entry->prev = LIST_POISON2; } /** * list_bidir_del_rcu - deletes entry from list without re-initialization * @entry: the element to delete from the list. * * In contrast to list_del_rcu() doesn't poison the prev pointer thus * allowing backwards traversal via list_bidir_prev_rcu(). * * Note: list_empty() on entry does not return true after this because * the entry is in a special undefined state that permits RCU-based * lockfree reverse traversal. In particular this means that we can not * poison the forward and backwards pointers that may still be used for * walking the list. * * The caller must take whatever precautions are necessary (such as * holding appropriate locks) to avoid racing with another list-mutation * primitive, such as list_bidir_del_rcu() or list_add_rcu(), running on * this same list. However, it is perfectly legal to run concurrently * with the _rcu list-traversal primitives, such as * list_for_each_entry_rcu(). * * Note that list_del_rcu() and list_bidir_del_rcu() must not be used on * the same list. * * Note that the caller is not permitted to immediately free * the newly deleted entry. Instead, either synchronize_rcu() * or call_rcu() must be used to defer freeing until an RCU * grace period has elapsed. */ static inline void list_bidir_del_rcu(struct list_head *entry) { __list_del_entry(entry); } /** * hlist_del_init_rcu - deletes entry from hash list with re-initialization * @n: the element to delete from the hash list. * * Note: list_unhashed() on the node return true after this. It is * useful for RCU based read lockfree traversal if the writer side * must know if the list entry is still hashed or already unhashed. 
* * In particular, it means that we can not poison the forward pointers * that may still be used for walking the hash list and we can only * zero the pprev pointer so list_unhashed() will return true after * this. * * The caller must take whatever precautions are necessary (such as * holding appropriate locks) to avoid racing with another * list-mutation primitive, such as hlist_add_head_rcu() or * hlist_del_rcu(), running on this same list. However, it is * perfectly legal to run concurrently with the _rcu list-traversal * primitives, such as hlist_for_each_entry_rcu(). */ static inline void hlist_del_init_rcu(struct hlist_node *n) { if (!hlist_unhashed(n)) { __hlist_del(n); WRITE_ONCE(n->pprev, NULL); } } /** * list_replace_rcu - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * The @old entry will be replaced with the @new entry atomically from * the perspective of concurrent readers. It is the caller's responsibility * to synchronize with concurrent updaters, if any. * * Note: @old should not be empty. */ static inline void list_replace_rcu(struct list_head *old, struct list_head *new) { new->next = old->next; new->prev = old->prev; rcu_assign_pointer(list_next_rcu(new->prev), new); new->next->prev = new; old->prev = LIST_POISON2; } /** * __list_splice_init_rcu - join an RCU-protected list into an existing list. * @list: the RCU-protected list to splice * @prev: points to the last element of the existing list * @next: points to the first element of the existing list * @sync: synchronize_rcu, synchronize_rcu_expedited, ... * * The list pointed to by @prev and @next can be RCU-read traversed * concurrently with this function. * * Note that this function blocks. * * Important note: the caller must take whatever action is necessary to prevent * any other updates to the existing list. In principle, it is possible to * modify the list as soon as sync() begins execution. If this sort of thing * becomes necessary, an alternative version based on call_rcu() could be * created. But only if -really- needed -- there is no shortage of RCU API * members. */ static inline void __list_splice_init_rcu(struct list_head *list, struct list_head *prev, struct list_head *next, void (*sync)(void)) { struct list_head *first = list->next; struct list_head *last = list->prev; /* * "first" and "last" tracking list, so initialize it. RCU readers * have access to this list, so we must use INIT_LIST_HEAD_RCU() * instead of INIT_LIST_HEAD(). */ INIT_LIST_HEAD_RCU(list); /* * At this point, the list body still points to the source list. * Wait for any readers to finish using the list before splicing * the list body into the new list. Any new readers will see * an empty list. */ sync(); ASSERT_EXCLUSIVE_ACCESS(*first); ASSERT_EXCLUSIVE_ACCESS(*last); /* * Readers are finished with the source list, so perform splice. * The order is important if the new list is global and accessible * to concurrent RCU readers. Note that RCU readers are not * permitted to traverse the prev pointers without excluding * this function. */ last->next = next; rcu_assign_pointer(list_next_rcu(prev), first); first->prev = prev; next->prev = last; } /** * list_splice_init_rcu - splice an RCU-protected list into an existing list, * designed for stacks. * @list: the RCU-protected list to splice * @head: the place in the existing list to splice the first list into * @sync: synchronize_rcu, synchronize_rcu_expedited, ... 
*/ static inline void list_splice_init_rcu(struct list_head *list, struct list_head *head, void (*sync)(void)) { if (!list_empty(list)) __list_splice_init_rcu(list, head, head->next, sync); } /** * list_splice_tail_init_rcu - splice an RCU-protected list into an existing * list, designed for queues. * @list: the RCU-protected list to splice * @head: the place in the existing list to splice the first list into * @sync: synchronize_rcu, synchronize_rcu_expedited, ... */ static inline void list_splice_tail_init_rcu(struct list_head *list, struct list_head *head, void (*sync)(void)) { if (!list_empty(list)) __list_splice_init_rcu(list, head->prev, head, sync); } /** * list_entry_rcu - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * This primitive may safely run concurrently with the _rcu list-mutation * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_entry_rcu(ptr, type, member) \ container_of(READ_ONCE(ptr), type, member) /* * Where are list_empty_rcu() and list_first_entry_rcu()? * * They do not exist because they would lead to subtle race conditions: * * if (!list_empty_rcu(mylist)) { * struct foo *bar = list_first_entry_rcu(mylist, struct foo, list_member); * do_something(bar); * } * * The list might be non-empty when list_empty_rcu() checks it, but it * might have become empty by the time that list_first_entry_rcu() rereads * the ->next pointer, which would result in a SEGV. * * When not using RCU, it is OK for list_first_entry() to re-read that * pointer because both functions should be protected by some lock that * blocks writers. * * When using RCU, list_empty() uses READ_ONCE() to fetch the * RCU-protected ->next pointer and then compares it to the address of the * list head. However, it neither dereferences this pointer nor provides * this pointer to its caller. Thus, READ_ONCE() suffices (that is, * rcu_dereference() is not needed), which means that list_empty() can be * used anywhere you would want to use list_empty_rcu(). Just don't * expect anything useful to happen if you do a subsequent lockless * call to list_first_entry_rcu()!!! * * See list_first_or_null_rcu for an alternative. */ /** * list_first_or_null_rcu - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note that if the list is empty, it returns NULL. * * This primitive may safely run concurrently with the _rcu list-mutation * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_first_or_null_rcu(ptr, type, member) \ ({ \ struct list_head *__ptr = (ptr); \ struct list_head *__next = READ_ONCE(__ptr->next); \ likely(__ptr != __next) ? list_entry_rcu(__next, type, member) : NULL; \ }) /** * list_next_or_null_rcu - get the next element from a list * @head: the head for the list. * @ptr: the list head to take the next element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note that if the ptr is at the end of the list, NULL is returned. * * This primitive may safely run concurrently with the _rcu list-mutation * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). 
*/ #define list_next_or_null_rcu(head, ptr, type, member) \ ({ \ struct list_head *__head = (head); \ struct list_head *__ptr = (ptr); \ struct list_head *__next = READ_ONCE(__ptr->next); \ likely(__next != __head) ? list_entry_rcu(__next, type, \ member) : NULL; \ }) /** * list_for_each_entry_rcu - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * @cond: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as list_add_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ #define list_for_each_entry_rcu(pos, head, member, cond...) \ for (__list_check_rcu(dummy, ## cond, 0), \ pos = list_entry_rcu((head)->next, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) /** * list_for_each_entry_srcu - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * @cond: lockdep expression for the lock required to traverse the list. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as list_add_rcu() * as long as the traversal is guarded by srcu_read_lock(). * The lockdep expression srcu_read_lock_held() can be passed as the * cond argument from read side. */ #define list_for_each_entry_srcu(pos, head, member, cond) \ for (__list_check_srcu(cond), \ pos = list_entry_rcu((head)->next, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) /** * list_entry_lockless - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * This primitive may safely run concurrently with the _rcu * list-mutation primitives such as list_add_rcu(), but requires some * implicit RCU read-side guarding. One example is running within a special * exception-time environment where preemption is disabled and where lockdep * cannot be invoked. Another example is when items are added to the list, * but never deleted. */ #define list_entry_lockless(ptr, type, member) \ container_of((typeof(ptr))READ_ONCE(ptr), type, member) /** * list_for_each_entry_lockless - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_struct within the struct. * * This primitive may safely run concurrently with the _rcu * list-mutation primitives such as list_add_rcu(), but requires some * implicit RCU read-side guarding. One example is running within a special * exception-time environment where preemption is disabled and where lockdep * cannot be invoked. Another example is when items are added to the list, * but never deleted. */ #define list_for_each_entry_lockless(pos, head, member) \ for (pos = list_entry_lockless((head)->next, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry_lockless(pos->member.next, typeof(*pos), member)) /** * list_for_each_entry_continue_rcu - continue iteration over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. 
* * Continue to iterate over list of given type, continuing after * the current position which must have been in the list when the RCU read * lock was taken. * This would typically require either that you obtained the node from a * previous walk of the list in the same RCU read-side critical section, or * that you held some sort of non-RCU reference (such as a reference count) * to keep the node alive *and* in the list. * * This iterator is similar to list_for_each_entry_from_rcu() except * this starts after the given position and that one starts at the given * position. */ #define list_for_each_entry_continue_rcu(pos, head, member) \ for (pos = list_entry_rcu(pos->member.next, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) /** * list_for_each_entry_from_rcu - iterate over a list from current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_node within the struct. * * Iterate over the tail of a list starting from a given position, * which must have been in the list when the RCU read lock was taken. * This would typically require either that you obtained the node from a * previous walk of the list in the same RCU read-side critical section, or * that you held some sort of non-RCU reference (such as a reference count) * to keep the node alive *and* in the list. * * This iterator is similar to list_for_each_entry_continue_rcu() except * this starts from the given position and that one starts from the position * after the given position. */ #define list_for_each_entry_from_rcu(pos, head, member) \ for (; &(pos)->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*(pos)), member)) /** * hlist_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. * * Note: list_unhashed() on entry does not return true after this, * the entry is in an undefined state. It is useful for RCU based * lockfree traversal. * * In particular, it means that we can not poison the forward * pointers that may still be used for walking the hash list. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry(). */ static inline void hlist_del_rcu(struct hlist_node *n) { __hlist_del(n); WRITE_ONCE(n->pprev, LIST_POISON2); } /** * hlist_replace_rcu - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * The @old entry will be replaced with the @new entry atomically from * the perspective of concurrent readers. It is the caller's responsibility * to synchronize with concurrent updaters, if any. */ static inline void hlist_replace_rcu(struct hlist_node *old, struct hlist_node *new) { struct hlist_node *next = old->next; new->next = next; WRITE_ONCE(new->pprev, old->pprev); rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new); if (next) WRITE_ONCE(new->next->pprev, &new->next); WRITE_ONCE(old->pprev, LIST_POISON2); } /** * hlists_swap_heads_rcu - swap the lists the hlist heads point to * @left: The hlist head on the left * @right: The hlist head on the right * * The lists start out as [@left ][node1 ... ] and * [@right ][node2 ... ] * The lists end up as [@left ][node2 ... 
] * [@right ][node1 ... ] */ static inline void hlists_swap_heads_rcu(struct hlist_head *left, struct hlist_head *right) { struct hlist_node *node1 = left->first; struct hlist_node *node2 = right->first; rcu_assign_pointer(left->first, node2); rcu_assign_pointer(right->first, node1); WRITE_ONCE(node2->pprev, &left->first); WRITE_ONCE(node1->pprev, &right->first); } /* * return the first or the next element in an RCU protected hlist */ #define hlist_first_rcu(head) (*((struct hlist_node __rcu **)(&(head)->first))) #define hlist_next_rcu(node) (*((struct hlist_node __rcu **)(&(node)->next))) #define hlist_pprev_rcu(node) (*((struct hlist_node __rcu **)((node)->pprev))) /** * hlist_add_head_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_add_head_rcu(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; n->next = first; WRITE_ONCE(n->pprev, &h->first); rcu_assign_pointer(hlist_first_rcu(h), n); if (first) WRITE_ONCE(first->pprev, &n->next); } /** * hlist_add_tail_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_add_tail_rcu(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *i, *last = NULL; /* Note: write side code, so rcu accessors are not needed. */ for (i = h->first; i; i = i->next) last = i; if (last) { n->next = last->next; WRITE_ONCE(n->pprev, &last->next); rcu_assign_pointer(hlist_next_rcu(last), n); } else { hlist_add_head_rcu(n, h); } } /** * hlist_add_before_rcu * @n: the new element to add to the hash list. * @next: the existing element to add the new element before. * * Description: * Adds the specified element to the specified hlist * before the specified node while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. 
*/ static inline void hlist_add_before_rcu(struct hlist_node *n, struct hlist_node *next) { WRITE_ONCE(n->pprev, next->pprev); n->next = next; rcu_assign_pointer(hlist_pprev_rcu(n), n); WRITE_ONCE(next->pprev, &n->next); } /** * hlist_add_behind_rcu * @n: the new element to add to the hash list. * @prev: the existing element to add the new element after. * * Description: * Adds the specified element to the specified hlist * after the specified node while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. */ static inline void hlist_add_behind_rcu(struct hlist_node *n, struct hlist_node *prev) { n->next = prev->next; WRITE_ONCE(n->pprev, &prev->next); rcu_assign_pointer(hlist_next_rcu(prev), n); if (n->next) WRITE_ONCE(n->next->pprev, &n->next); } #define __hlist_for_each_rcu(pos, head) \ for (pos = rcu_dereference(hlist_first_rcu(head)); \ pos; \ pos = rcu_dereference(hlist_next_rcu(pos))) /** * hlist_for_each_entry_rcu - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * @cond: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ #define hlist_for_each_entry_rcu(pos, head, member, cond...) \ for (__list_check_rcu(dummy, ## cond, 0), \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_srcu - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * @cond: lockdep expression for the lock required to traverse the list. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by srcu_read_lock(). * The lockdep expression srcu_read_lock_held() can be passed as the * cond argument from read side. */ #define hlist_for_each_entry_srcu(pos, head, member, cond) \ for (__list_check_srcu(cond), \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_rcu_notrace - iterate over rcu list of given type (for tracing) * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). * * This is the same as hlist_for_each_entry_rcu() except that it does * not do any RCU debugging or tracing. 
*/ #define hlist_for_each_entry_rcu_notrace(pos, head, member) \ for (pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_next_rcu(\ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ #define hlist_for_each_entry_rcu_bh(pos, head, member) \ for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(\ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_continue_rcu - iterate over a hlist continuing after current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_continue_rcu(pos, member) \ for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \ &(pos)->member)), typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_continue_rcu_bh - iterate over a hlist continuing after current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_continue_rcu_bh(pos, member) \ for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu( \ &(pos)->member)), typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu( \ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_from_rcu - iterate over a hlist continuing from current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_from_rcu(pos, member) \ for (; pos; \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \ &(pos)->member)), typeof(*(pos)), member)) #endif /* __KERNEL__ */ #endif |
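/*
 * Illustrative usage sketch (not part of rculist.h above): a writer adds and
 * removes entries under a spinlock with list_add_rcu()/list_del_rcu(),
 * readers walk the list under rcu_read_lock() with list_for_each_entry_rcu(),
 * and freeing is deferred with kfree_rcu(). The example_* names are
 * hypothetical.
 */
#include <linux/errno.h>
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct example_item {
	int key;
	struct list_head node;
	struct rcu_head rcu;
};

static LIST_HEAD(example_list);
static DEFINE_SPINLOCK(example_lock);

static int example_add(int key)
{
	struct example_item *item = kzalloc(sizeof(*item), GFP_KERNEL);

	if (!item)
		return -ENOMEM;
	item->key = key;

	spin_lock(&example_lock);		/* writers exclude each other */
	list_add_rcu(&item->node, &example_list);
	spin_unlock(&example_lock);
	return 0;
}

static bool example_find(int key)
{
	struct example_item *item;
	bool found = false;

	rcu_read_lock();			/* readers only need RCU protection */
	list_for_each_entry_rcu(item, &example_list, node) {
		if (item->key == key) {
			found = true;
			break;
		}
	}
	rcu_read_unlock();
	return found;
}

static void example_del(int key)
{
	struct example_item *item;

	spin_lock(&example_lock);
	list_for_each_entry(item, &example_list, node) {
		if (item->key == key) {
			list_del_rcu(&item->node);
			spin_unlock(&example_lock);
			kfree_rcu(item, rcu);	/* free only after a grace period */
			return;
		}
	}
	spin_unlock(&example_lock);
}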
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* Red Black Trees (C) 1999 Andrea Arcangeli <andrea@suse.de> (C) 2002 David Woodhouse <dwmw2@infradead.org> (C) 2012 Michel Lespinasse <walken@google.com> linux/include/linux/rbtree_augmented.h */ #ifndef _LINUX_RBTREE_AUGMENTED_H #define _LINUX_RBTREE_AUGMENTED_H #include <linux/compiler.h> #include <linux/rbtree.h> #include <linux/rcupdate.h> /* * Please note - only struct rb_augment_callbacks and the prototypes for * rb_insert_augmented() and rb_erase_augmented() are intended to be public. * The rest are implementation details you are not expected to depend on. * * See Documentation/core-api/rbtree.rst for documentation and samples. */ struct rb_augment_callbacks { void (*propagate)(struct rb_node *node, struct rb_node *stop); void (*copy)(struct rb_node *old, struct rb_node *new); void (*rotate)(struct rb_node *old, struct rb_node *new); }; extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); /* * Fixup the rbtree and update the augmented information when rebalancing. * * On insertion, the user must update the augmented information on the path * leading to the inserted node, then call rb_link_node() as usual and * rb_insert_augmented() instead of the usual rb_insert_color() call. * If rb_insert_augmented() rebalances the rbtree, it will callback into * a user provided function to update the augmented information on the * affected subtrees.
*/ static inline void rb_insert_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { __rb_insert_augmented(node, root, augment->rotate); } static inline void rb_insert_augmented_cached(struct rb_node *node, struct rb_root_cached *root, bool newleft, const struct rb_augment_callbacks *augment) { if (newleft) root->rb_leftmost = node; rb_insert_augmented(node, &root->rb_root, augment); } static __always_inline struct rb_node * rb_add_augmented_cached(struct rb_node *node, struct rb_root_cached *tree, bool (*less)(struct rb_node *, const struct rb_node *), const struct rb_augment_callbacks *augment) { struct rb_node **link = &tree->rb_root.rb_node; struct rb_node *parent = NULL; bool leftmost = true; while (*link) { parent = *link; if (less(node, parent)) { link = &parent->rb_left; } else { link = &parent->rb_right; leftmost = false; } } rb_link_node(node, parent, link); augment->propagate(parent, NULL); /* suboptimal */ rb_insert_augmented_cached(node, tree, leftmost, augment); return leftmost ? node : NULL; } /* * Template for declaring augmented rbtree callbacks (generic case) * * RBSTATIC: 'static' or empty * RBNAME: name of the rb_augment_callbacks structure * RBSTRUCT: struct type of the tree nodes * RBFIELD: name of struct rb_node field within RBSTRUCT * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree * RBCOMPUTE: name of function that recomputes the RBAUGMENTED data */ #define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE) \ static inline void \ RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \ { \ while (rb != stop) { \ RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \ if (RBCOMPUTE(node, true)) \ break; \ rb = rb_parent(&node->RBFIELD); \ } \ } \ static inline void \ RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ new->RBAUGMENTED = old->RBAUGMENTED; \ } \ static void \ RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ new->RBAUGMENTED = old->RBAUGMENTED; \ RBCOMPUTE(old, false); \ } \ RBSTATIC const struct rb_augment_callbacks RBNAME = { \ .propagate = RBNAME ## _propagate, \ .copy = RBNAME ## _copy, \ .rotate = RBNAME ## _rotate \ }; /* * Template for declaring augmented rbtree callbacks, * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes. 
* * RBSTATIC: 'static' or empty * RBNAME: name of the rb_augment_callbacks structure * RBSTRUCT: struct type of the tree nodes * RBFIELD: name of struct rb_node field within RBSTRUCT * RBTYPE: type of the RBAUGMENTED field * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree * RBCOMPUTE: name of function that returns the per-node RBTYPE scalar */ #define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ RBTYPE, RBAUGMENTED, RBCOMPUTE) \ static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit) \ { \ RBSTRUCT *child; \ RBTYPE max = RBCOMPUTE(node); \ if (node->RBFIELD.rb_left) { \ child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD); \ if (child->RBAUGMENTED > max) \ max = child->RBAUGMENTED; \ } \ if (node->RBFIELD.rb_right) { \ child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD); \ if (child->RBAUGMENTED > max) \ max = child->RBAUGMENTED; \ } \ if (exit && node->RBAUGMENTED == max) \ return true; \ node->RBAUGMENTED = max; \ return false; \ } \ RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max) #define RB_RED 0 #define RB_BLACK 1 #define __rb_parent(pc) ((struct rb_node *)(pc & ~3)) #define __rb_color(pc) ((pc) & 1) #define __rb_is_black(pc) __rb_color(pc) #define __rb_is_red(pc) (!__rb_color(pc)) #define rb_color(rb) __rb_color((rb)->__rb_parent_color) #define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color) #define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color) static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) { rb->__rb_parent_color = rb_color(rb) + (unsigned long)p; } static inline void rb_set_parent_color(struct rb_node *rb, struct rb_node *p, int color) { rb->__rb_parent_color = (unsigned long)p + color; } static inline void __rb_change_child(struct rb_node *old, struct rb_node *new, struct rb_node *parent, struct rb_root *root) { if (parent) { if (parent->rb_left == old) WRITE_ONCE(parent->rb_left, new); else WRITE_ONCE(parent->rb_right, new); } else WRITE_ONCE(root->rb_node, new); } static inline void __rb_change_child_rcu(struct rb_node *old, struct rb_node *new, struct rb_node *parent, struct rb_root *root) { if (parent) { if (parent->rb_left == old) rcu_assign_pointer(parent->rb_left, new); else rcu_assign_pointer(parent->rb_right, new); } else rcu_assign_pointer(root->rb_node, new); } extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); static __always_inline struct rb_node * __rb_erase_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { struct rb_node *child = node->rb_right; struct rb_node *tmp = node->rb_left; struct rb_node *parent, *rebalance; unsigned long pc; if (!tmp) { /* * Case 1: node to erase has no more than 1 child (easy!) * * Note that if there is one child it must be red due to 5) * and node must be black due to 4). We adjust colors locally * so as to bypass __rb_erase_color() later on. */ pc = node->__rb_parent_color; parent = __rb_parent(pc); __rb_change_child(node, child, parent, root); if (child) { child->__rb_parent_color = pc; rebalance = NULL; } else rebalance = __rb_is_black(pc) ? 
parent : NULL; tmp = parent; } else if (!child) { /* Still case 1, but this time the child is node->rb_left */ tmp->__rb_parent_color = pc = node->__rb_parent_color; parent = __rb_parent(pc); __rb_change_child(node, tmp, parent, root); rebalance = NULL; tmp = parent; } else { struct rb_node *successor = child, *child2; tmp = child->rb_left; if (!tmp) { /* * Case 2: node's successor is its right child * * (n) (s) * / \ / \ * (x) (s) -> (x) (c) * \ * (c) */ parent = successor; child2 = successor->rb_right; augment->copy(node, successor); } else { /* * Case 3: node's successor is leftmost under * node's right child subtree * * (n) (s) * / \ / \ * (x) (y) -> (x) (y) * / / * (p) (p) * / / * (s) (c) * \ * (c) */ do { parent = successor; successor = tmp; tmp = tmp->rb_left; } while (tmp); child2 = successor->rb_right; WRITE_ONCE(parent->rb_left, child2); WRITE_ONCE(successor->rb_right, child); rb_set_parent(child, successor); augment->copy(node, successor); augment->propagate(parent, successor); } tmp = node->rb_left; WRITE_ONCE(successor->rb_left, tmp); rb_set_parent(tmp, successor); pc = node->__rb_parent_color; tmp = __rb_parent(pc); __rb_change_child(node, successor, tmp, root); if (child2) { rb_set_parent_color(child2, parent, RB_BLACK); rebalance = NULL; } else { rebalance = rb_is_black(successor) ? parent : NULL; } successor->__rb_parent_color = pc; tmp = successor; } augment->propagate(tmp, NULL); return rebalance; } static __always_inline void rb_erase_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { struct rb_node *rebalance = __rb_erase_augmented(node, root, augment); if (rebalance) __rb_erase_color(rebalance, root, augment->rotate); } static __always_inline void rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root, const struct rb_augment_callbacks *augment) { if (root->rb_leftmost == node) root->rb_leftmost = rb_next(node); rb_erase_augmented(node, &root->rb_root, augment); } #endif /* _LINUX_RBTREE_AUGMENTED_H */ |
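/*
 * Illustrative usage sketch -- not part of the original rbtree_augmented.h.
 * An augmented tree that keeps, in each node, the maximum "last" value found
 * anywhere in that node's subtree (the classic interval-tree augmentation),
 * declared with RB_DECLARE_CALLBACKS_MAX() from above. On insertion the
 * caller updates the augmented data along the search path and then calls
 * rb_link_node() + rb_insert_augmented(), exactly as the comment at the top
 * of this header prescribes. All demo_* names are hypothetical.
 */
#if 0	/* example only, kept out of the build */
struct demo_node {
	struct rb_node	rb;
	unsigned long	start;		/* sort key */
	unsigned long	last;		/* per-node scalar */
	unsigned long	subtree_last;	/* max of ->last over this subtree */
};

/* RBCOMPUTE callback: returns the per-node scalar that gets max-aggregated. */
static inline unsigned long demo_node_last(struct demo_node *node)
{
	return node->last;
}

RB_DECLARE_CALLBACKS_MAX(static, demo_augment,
			 struct demo_node, rb,
			 unsigned long, subtree_last, demo_node_last)

static void demo_insert(struct demo_node *node, struct rb_root *root)
{
	struct rb_node **link = &root->rb_node, *rb_parent = NULL;

	node->subtree_last = node->last;
	while (*link) {
		struct demo_node *parent = rb_entry(*link, struct demo_node, rb);

		rb_parent = *link;
		/* Update the augmented data on the path to the new node. */
		if (parent->subtree_last < node->last)
			parent->subtree_last = node->last;
		if (node->start < parent->start)
			link = &parent->rb.rb_left;
		else
			link = &parent->rb.rb_right;
	}

	rb_link_node(&node->rb, rb_parent, link);
	rb_insert_augmented(&node->rb, root, &demo_augment);
}

static void demo_erase(struct demo_node *node, struct rb_root *root)
{
	rb_erase_augmented(&node->rb, root, &demo_augment);
}
#endif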
/* SPDX-License-Identifier: GPL-2.0 */ /* * x86 TSC related functions */ #ifndef _ASM_X86_TSC_H #define _ASM_X86_TSC_H #include <asm/asm.h> #include <asm/cpufeature.h> #include <asm/processor.h> #include <asm/msr.h> /** * rdtsc() - returns the current TSC without ordering constraints * * rdtsc() returns the result of RDTSC as a 64-bit integer. The * only ordering constraint it supplies is the ordering implied by * "asm volatile": it will put the RDTSC in the place you expect. The * CPU can and will speculatively execute that RDTSC, though, so the * results can be non-monotonic if compared on different CPUs. */ static __always_inline u64 rdtsc(void) { EAX_EDX_DECLARE_ARGS(val, low, high); asm volatile("rdtsc" : EAX_EDX_RET(val, low, high)); return EAX_EDX_VAL(val, low, high); } /** * rdtsc_ordered() - read the current TSC in program order * * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer. * It is ordered like a load to a global in-memory counter. It should * be impossible to observe non-monotonic rdtsc_ordered() behavior * across multiple CPUs as long as the TSC is synced. */ static __always_inline u64 rdtsc_ordered(void) { EAX_EDX_DECLARE_ARGS(val, low, high); /* * The RDTSC instruction is not ordered relative to memory * access. The Intel SDM and the AMD APM are both vague on this * point, but empirically an RDTSC instruction can be * speculatively executed before prior loads. An RDTSC * immediately after an appropriate barrier appears to be * ordered as a normal load, that is, it provides the same * ordering guarantees as reading from a global memory location * that some other imaginary CPU is updating continuously with a * time stamp. * * Thus, use the preferred barrier on the respective CPU, aiming for * RDTSCP as the default. */ asm volatile(ALTERNATIVE_2("rdtsc", "lfence; rdtsc", X86_FEATURE_LFENCE_RDTSC, "rdtscp", X86_FEATURE_RDTSCP) : EAX_EDX_RET(val, low, high) /* RDTSCP clobbers ECX with MSR_TSC_AUX. */ :: "ecx"); return EAX_EDX_VAL(val, low, high); } /* * Standard way to access the cycle counter.
*/ typedef unsigned long long cycles_t; extern unsigned int cpu_khz; extern unsigned int tsc_khz; extern void disable_TSC(void); static inline cycles_t get_cycles(void) { if (!IS_ENABLED(CONFIG_X86_TSC) && !cpu_feature_enabled(X86_FEATURE_TSC)) return 0; return rdtsc(); } #define get_cycles get_cycles extern void tsc_early_init(void); extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); extern void mark_tsc_async_resets(char *reason); extern unsigned long native_calibrate_cpu_early(void); extern unsigned long native_calibrate_tsc(void); extern unsigned long long native_sched_clock_from_tsc(u64 tsc); extern int tsc_clocksource_reliable; #ifdef CONFIG_X86_TSC extern bool tsc_async_resets; #else # define tsc_async_resets false #endif /* * Boot-time check whether the TSCs are synchronized across * all CPUs/cores: */ #ifdef CONFIG_X86_TSC extern bool tsc_store_and_check_tsc_adjust(bool bootcpu); extern void tsc_verify_tsc_adjust(bool resume); extern void check_tsc_sync_target(void); #else static inline bool tsc_store_and_check_tsc_adjust(bool bootcpu) { return false; } static inline void tsc_verify_tsc_adjust(bool resume) { } static inline void check_tsc_sync_target(void) { } #endif extern int notsc_setup(char *); extern void tsc_save_sched_clock_state(void); extern void tsc_restore_sched_clock_state(void); unsigned long cpu_khz_from_msr(void); #endif /* _ASM_X86_TSC_H */ |
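/*
 * Illustrative usage sketch -- not part of the original tsc.h. Timing a short
 * code region with rdtsc_ordered(), the variant that is ordered like a load
 * and therefore suitable for before/after measurements, and converting the
 * cycle delta to microseconds via tsc_khz (TSC cycles per millisecond). The
 * demo_* names are hypothetical, and the conversion is only meaningful once
 * the TSC has been calibrated (i.e. after tsc_init()).
 */
#if 0	/* example only, kept out of the build */
#include <linux/math64.h>
#include <linux/printk.h>

static u64 demo_time_payload(void (*demo_payload)(void))
{
	u64 start, end, cycles, usecs;

	start = rdtsc_ordered();	/* ordered like a load: not speculated ahead */
	demo_payload();
	end = rdtsc_ordered();		/* ordered like a load: not hoisted above the payload */

	cycles = end - start;
	/* tsc_khz is cycles per millisecond; scale to microseconds first. */
	usecs = div_u64(cycles * 1000, tsc_khz);
	pr_info("payload took %llu cycles (~%llu us)\n", cycles, usecs);
	return cycles;
}
#endif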
// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IPv4 Forwarding Information Base: FIB frontend. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/module.h> #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/capability.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_addr.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/cache.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> #include <net/flow.h> #include <net/inet_dscp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/tcp.h> #include <net/sock.h> #include <net/arp.h> #include <net/ip_fib.h> #include <net/nexthop.h> #include <net/rtnetlink.h> #include <net/xfrm.h> #include <net/l3mdev.h> #include <net/lwtunnel.h> #include <trace/events/fib.h> #ifndef CONFIG_IP_MULTIPLE_TABLES static int __net_init fib4_rules_init(struct net *net) { struct fib_table *local_table, *main_table; main_table = fib_trie_table(RT_TABLE_MAIN, NULL); if (!main_table) return -ENOMEM; local_table = fib_trie_table(RT_TABLE_LOCAL, main_table); if (!local_table) goto fail; hlist_add_head_rcu(&local_table->tb_hlist, &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]); hlist_add_head_rcu(&main_table->tb_hlist, &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]); return 0; fail: fib_free_table(main_table); return -ENOMEM; } #else struct fib_table *fib_new_table(struct net *net, u32 id) { struct fib_table *tb, *alias = NULL; unsigned int h; if (id == 0) id = RT_TABLE_MAIN; tb = fib_get_table(net, id); if (tb) return tb; if (id == RT_TABLE_LOCAL && !net->ipv4.fib_has_custom_rules) alias = fib_new_table(net, RT_TABLE_MAIN); tb = fib_trie_table(id, alias); if (!tb) return NULL; switch (id) { case RT_TABLE_MAIN: rcu_assign_pointer(net->ipv4.fib_main, tb); break; case RT_TABLE_DEFAULT: rcu_assign_pointer(net->ipv4.fib_default, tb); break; default: break; } h = id & (FIB_TABLE_HASHSZ - 1); hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]); return tb; } EXPORT_SYMBOL_GPL(fib_new_table); /* caller must hold either rtnl or rcu read lock */ struct fib_table *fib_get_table(struct net *net, u32 id) { struct fib_table *tb; struct hlist_head *head; unsigned int h; if (id == 0) id = RT_TABLE_MAIN; h = id & (FIB_TABLE_HASHSZ - 1); head = &net->ipv4.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb_hlist, lockdep_rtnl_is_held()) { if (tb->tb_id == id) return tb; } return NULL; } #endif /* CONFIG_IP_MULTIPLE_TABLES */ static void fib_replace_table(struct net *net, struct fib_table *old, struct fib_table *new) { #ifdef CONFIG_IP_MULTIPLE_TABLES switch (new->tb_id) { case RT_TABLE_MAIN: rcu_assign_pointer(net->ipv4.fib_main, new); break; case
RT_TABLE_DEFAULT: rcu_assign_pointer(net->ipv4.fib_default, new); break; default: break; } #endif /* replace the old table in the hlist */ hlist_replace_rcu(&old->tb_hlist, &new->tb_hlist); } int fib_unmerge(struct net *net) { struct fib_table *old, *new, *main_table; /* attempt to fetch local table if it has been allocated */ old = fib_get_table(net, RT_TABLE_LOCAL); if (!old) return 0; new = fib_trie_unmerge(old); if (!new) return -ENOMEM; /* table is already unmerged */ if (new == old) return 0; /* replace merged table with clean table */ fib_replace_table(net, old, new); fib_free_table(old); /* attempt to fetch main table if it has been allocated */ main_table = fib_get_table(net, RT_TABLE_MAIN); if (!main_table) return 0; /* flush local entries from main table */ fib_table_flush_external(main_table); return 0; } void fib_flush(struct net *net) { int flushed = 0; unsigned int h; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct hlist_node *tmp; struct fib_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) flushed += fib_table_flush(net, tb, false); } if (flushed) rt_cache_flush(net); } /* * Find address type as if only "dev" was present in the system. If * on_dev is NULL then all interfaces are taken into consideration. */ static inline unsigned int __inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr, u32 tb_id) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res; unsigned int ret = RTN_BROADCAST; struct fib_table *table; if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) return RTN_BROADCAST; if (ipv4_is_multicast(addr)) return RTN_MULTICAST; rcu_read_lock(); table = fib_get_table(net, tb_id); if (table) { ret = RTN_UNICAST; if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) { struct fib_nh_common *nhc = fib_info_nhc(res.fi, 0); if (!dev || dev == nhc->nhc_dev) ret = res.type; } } rcu_read_unlock(); return ret; } unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id) { return __inet_dev_addr_type(net, NULL, addr, tb_id); } EXPORT_SYMBOL(inet_addr_type_table); unsigned int inet_addr_type(struct net *net, __be32 addr) { return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL); } EXPORT_SYMBOL(inet_addr_type); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr) { u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL; return __inet_dev_addr_type(net, dev, addr, rt_table); } EXPORT_SYMBOL(inet_dev_addr_type); /* inet_addr_type with dev == NULL but using the table from a dev * if one is associated */ unsigned int inet_addr_type_dev_table(struct net *net, const struct net_device *dev, __be32 addr) { u32 rt_table = l3mdev_fib_table(dev) ? 
: RT_TABLE_LOCAL; return __inet_dev_addr_type(net, NULL, addr, rt_table); } EXPORT_SYMBOL(inet_addr_type_dev_table); __be32 fib_compute_spec_dst(struct sk_buff *skb) { struct net_device *dev = skb->dev; struct in_device *in_dev; struct fib_result res; struct rtable *rt; struct net *net; int scope; rt = skb_rtable(skb); if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) == RTCF_LOCAL) return ip_hdr(skb)->daddr; in_dev = __in_dev_get_rcu(dev); net = dev_net(dev); scope = RT_SCOPE_UNIVERSE; if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) { bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev); struct flowi4 fl4 = { .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, .flowi4_dscp = ip4h_dscp(ip_hdr(skb)), .flowi4_scope = scope, .flowi4_mark = vmark ? skb->mark : 0, }; if (!fib_lookup(net, &fl4, &res, 0)) return fib_result_prefsrc(net, &res); } else { scope = RT_SCOPE_LINK; } return inet_select_addr(dev, ip_hdr(skb)->saddr, scope); } bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev) { bool dev_match = false; #ifdef CONFIG_IP_ROUTE_MULTIPATH if (unlikely(fi->nh)) { dev_match = nexthop_uses_dev(fi->nh, dev); } else { int ret; for (ret = 0; ret < fib_info_num_path(fi); ret++) { const struct fib_nh_common *nhc = fib_info_nhc(fi, ret); if (nhc_l3mdev_matches_dev(nhc, dev)) { dev_match = true; break; } } } #else if (fib_info_nhc(fi, 0)->nhc_dev == dev) dev_match = true; #endif return dev_match; } EXPORT_SYMBOL_GPL(fib_info_nh_uses_dev); /* Given (packet source, input interface) and optional (dst, oif, tos): * - (main) check, that source is valid i.e. not broadcast or our local * address. * - figure out what "logical" interface this packet arrived * and calculate "specific destination" address. * - check, that packet arrived from expected physical interface. * called with rcu_read_lock() */ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, dscp_t dscp, int oif, struct net_device *dev, int rpf, struct in_device *idev, u32 *itag) { struct net *net = dev_net(dev); enum skb_drop_reason reason; struct flow_keys flkeys; int ret, no_addr; struct fib_result res; struct flowi4 fl4; bool dev_match; fl4.flowi4_oif = 0; fl4.flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev); fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; fl4.daddr = src; fl4.saddr = dst; fl4.flowi4_dscp = dscp; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_flags = 0; fl4.flowi4_uid = sock_net_uid(net, NULL); fl4.flowi4_multipath_hash = 0; no_addr = idev->ifa_list == NULL; fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; if (!fib4_rules_early_flow_dissect(net, skb, &fl4, &flkeys)) { fl4.flowi4_proto = 0; fl4.fl4_sport = 0; fl4.fl4_dport = 0; } else { swap(fl4.fl4_sport, fl4.fl4_dport); } if (fib_lookup(net, &fl4, &res, 0)) goto last_resort; if (res.type != RTN_UNICAST) { if (res.type != RTN_LOCAL) { reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto e_inval; } else if (!IN_DEV_ACCEPT_LOCAL(idev)) { reason = SKB_DROP_REASON_IP_LOCAL_SOURCE; goto e_inval; } } fib_combine_itag(itag, &res); dev_match = fib_info_nh_uses_dev(res.fi, dev); /* This is not common, loopback packets retain skb_dst so normally they * would not even hit this slow path. 
*/ dev_match = dev_match || (res.type == RTN_LOCAL && dev == net->loopback_dev); if (dev_match) { ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; return ret; } if (no_addr) goto last_resort; if (rpf == 1) goto e_rpf; fl4.flowi4_oif = dev->ifindex; ret = 0; if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) { if (res.type == RTN_UNICAST) ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; } return ret; last_resort: if (rpf) goto e_rpf; *itag = 0; return 0; e_inval: return -reason; e_rpf: return -SKB_DROP_REASON_IP_RPFILTER; } /* Ignore rp_filter for packets protected by IPsec. */ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, dscp_t dscp, int oif, struct net_device *dev, struct in_device *idev, u32 *itag) { int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); struct net *net = dev_net(dev); if (!r && !fib_num_tclassid_users(net) && (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { if (IN_DEV_ACCEPT_LOCAL(idev)) goto ok; /* with custom local routes in place, checking local addresses * only will be too optimistic, with custom rules, checking * local addresses only can be too strict, e.g. due to vrf */ if (net->ipv4.fib_has_custom_local_routes || fib4_has_custom_rules(net)) goto full_check; /* Within the same container, it is regarded as a martian source, * and the same host but different containers are not. */ if (inet_lookup_ifaddr_rcu(net, src)) return -SKB_DROP_REASON_IP_LOCAL_SOURCE; ok: *itag = 0; return 0; } full_check: return __fib_validate_source(skb, src, dst, dscp, oif, dev, r, idev, itag); } static inline __be32 sk_extract_addr(struct sockaddr *addr) { return ((struct sockaddr_in *) addr)->sin_addr.s_addr; } static int put_rtax(struct nlattr *mx, int len, int type, u32 value) { struct nlattr *nla; nla = (struct nlattr *) ((char *) mx + len); nla->nla_type = type; nla->nla_len = nla_attr_size(4); *(u32 *) nla_data(nla) = value; return len + nla_total_size(4); } static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, struct fib_config *cfg) { __be32 addr; int plen; memset(cfg, 0, sizeof(*cfg)); cfg->fc_nlinfo.nl_net = net; if (rt->rt_dst.sa_family != AF_INET) return -EAFNOSUPPORT; /* * Check mask for validity: * a) it must be contiguous. * b) destination must have all host bits clear. * c) if application forgot to set correct family (AF_INET), * reject request unless it is absolutely clear i.e. * both family and mask are zero. 
*/ plen = 32; addr = sk_extract_addr(&rt->rt_dst); if (!(rt->rt_flags & RTF_HOST)) { __be32 mask = sk_extract_addr(&rt->rt_genmask); if (rt->rt_genmask.sa_family != AF_INET) { if (mask || rt->rt_genmask.sa_family) return -EAFNOSUPPORT; } if (bad_mask(mask, addr)) return -EINVAL; plen = inet_mask_len(mask); } cfg->fc_dst_len = plen; cfg->fc_dst = addr; if (cmd != SIOCDELRT) { cfg->fc_nlflags = NLM_F_CREATE; cfg->fc_protocol = RTPROT_BOOT; } if (rt->rt_metric) cfg->fc_priority = rt->rt_metric - 1; if (rt->rt_flags & RTF_REJECT) { cfg->fc_scope = RT_SCOPE_HOST; cfg->fc_type = RTN_UNREACHABLE; return 0; } cfg->fc_scope = RT_SCOPE_NOWHERE; cfg->fc_type = RTN_UNICAST; if (rt->rt_dev) { char *colon; struct net_device *dev; char devname[IFNAMSIZ]; if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1)) return -EFAULT; devname[IFNAMSIZ-1] = 0; colon = strchr(devname, ':'); if (colon) *colon = 0; dev = __dev_get_by_name(net, devname); if (!dev) return -ENODEV; cfg->fc_oif = dev->ifindex; cfg->fc_table = l3mdev_fib_table(dev); if (colon) { const struct in_ifaddr *ifa; struct in_device *in_dev; in_dev = __in_dev_get_rtnl_net(dev); if (!in_dev) return -ENODEV; *colon = ':'; in_dev_for_each_ifa_rtnl_net(net, ifa, in_dev) { if (strcmp(ifa->ifa_label, devname) == 0) break; } if (!ifa) return -ENODEV; cfg->fc_prefsrc = ifa->ifa_local; } } addr = sk_extract_addr(&rt->rt_gateway); if (rt->rt_gateway.sa_family == AF_INET && addr) { unsigned int addr_type; cfg->fc_gw4 = addr; cfg->fc_gw_family = AF_INET; addr_type = inet_addr_type_table(net, addr, cfg->fc_table); if (rt->rt_flags & RTF_GATEWAY && addr_type == RTN_UNICAST) cfg->fc_scope = RT_SCOPE_UNIVERSE; } if (!cfg->fc_table) cfg->fc_table = RT_TABLE_MAIN; if (cmd == SIOCDELRT) return 0; if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw_family) return -EINVAL; if (cfg->fc_scope == RT_SCOPE_NOWHERE) cfg->fc_scope = RT_SCOPE_LINK; if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) { struct nlattr *mx; int len = 0; mx = kcalloc(3, nla_total_size(4), GFP_KERNEL); if (!mx) return -ENOMEM; if (rt->rt_flags & RTF_MTU) len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40); if (rt->rt_flags & RTF_WINDOW) len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window); if (rt->rt_flags & RTF_IRTT) len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3); cfg->fc_mx = mx; cfg->fc_mx_len = len; } return 0; } /* * Handle IP routing ioctl calls. 
* These are used to manipulate the routing tables */ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt) { struct fib_config cfg; int err; switch (cmd) { case SIOCADDRT: /* Add a route */ case SIOCDELRT: /* Delete a route */ if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; rtnl_net_lock(net); err = rtentry_to_fib_config(net, cmd, rt, &cfg); if (err == 0) { struct fib_table *tb; if (cmd == SIOCDELRT) { tb = fib_get_table(net, cfg.fc_table); if (tb) err = fib_table_delete(net, tb, &cfg, NULL); else err = -ESRCH; } else { tb = fib_new_table(net, cfg.fc_table); if (tb) err = fib_table_insert(net, tb, &cfg, NULL); else err = -ENOBUFS; } /* allocated by rtentry_to_fib_config() */ kfree(cfg.fc_mx); } rtnl_net_unlock(net); return err; } return -EINVAL; } const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, [RTA_DST] = { .type = NLA_U32 }, [RTA_SRC] = { .type = NLA_U32 }, [RTA_IIF] = { .type = NLA_U32 }, [RTA_OIF] = { .type = NLA_U32 }, [RTA_GATEWAY] = { .type = NLA_U32 }, [RTA_PRIORITY] = { .type = NLA_U32 }, [RTA_PREFSRC] = { .type = NLA_U32 }, [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, [RTA_FLOW] = { .type = NLA_U32 }, [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, [RTA_UID] = { .type = NLA_U32 }, [RTA_MARK] = { .type = NLA_U32 }, [RTA_TABLE] = { .type = NLA_U32 }, [RTA_IP_PROTO] = { .type = NLA_U8 }, [RTA_SPORT] = { .type = NLA_U16 }, [RTA_DPORT] = { .type = NLA_U16 }, [RTA_NH_ID] = { .type = NLA_U32 }, }; int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, struct netlink_ext_ack *extack) { struct rtvia *via; int alen; if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) { NL_SET_ERR_MSG(extack, "Invalid attribute length for RTA_VIA"); return -EINVAL; } via = nla_data(nla); alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr); switch (via->rtvia_family) { case AF_INET: if (alen != sizeof(__be32)) { NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_VIA"); return -EINVAL; } cfg->fc_gw_family = AF_INET; cfg->fc_gw4 = *((__be32 *)via->rtvia_addr); break; case AF_INET6: #if IS_ENABLED(CONFIG_IPV6) if (alen != sizeof(struct in6_addr)) { NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA"); return -EINVAL; } cfg->fc_gw_family = AF_INET6; cfg->fc_gw6 = *((struct in6_addr *)via->rtvia_addr); #else NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel"); return -EINVAL; #endif break; default: NL_SET_ERR_MSG(extack, "Unsupported address family in RTA_VIA"); return -EINVAL; } return 0; } static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct fib_config *cfg, struct netlink_ext_ack *extack) { bool has_gw = false, has_via = false; struct nlattr *attr; int err, remaining; struct rtmsg *rtm; err = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy, extack); if (err < 0) goto errout; memset(cfg, 0, sizeof(*cfg)); rtm = nlmsg_data(nlh); if (!inet_validate_dscp(rtm->rtm_tos)) { NL_SET_ERR_MSG(extack, "Invalid dsfield (tos): ECN bits must be 0"); err = -EINVAL; goto errout; } cfg->fc_dscp = inet_dsfield_to_dscp(rtm->rtm_tos); cfg->fc_dst_len = rtm->rtm_dst_len; cfg->fc_table = rtm->rtm_table; cfg->fc_protocol = rtm->rtm_protocol; cfg->fc_scope = rtm->rtm_scope; cfg->fc_type = rtm->rtm_type; cfg->fc_flags = rtm->rtm_flags; cfg->fc_nlflags = nlh->nlmsg_flags; cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->fc_nlinfo.nlh = nlh; cfg->fc_nlinfo.nl_net = 
net; if (cfg->fc_type > RTN_MAX) { NL_SET_ERR_MSG(extack, "Invalid route type"); err = -EINVAL; goto errout; } nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) { switch (nla_type(attr)) { case RTA_DST: cfg->fc_dst = nla_get_be32(attr); break; case RTA_OIF: cfg->fc_oif = nla_get_u32(attr); break; case RTA_GATEWAY: has_gw = true; cfg->fc_gw4 = nla_get_be32(attr); if (cfg->fc_gw4) cfg->fc_gw_family = AF_INET; break; case RTA_VIA: has_via = true; err = fib_gw_from_via(cfg, attr, extack); if (err) goto errout; break; case RTA_PRIORITY: cfg->fc_priority = nla_get_u32(attr); break; case RTA_PREFSRC: cfg->fc_prefsrc = nla_get_be32(attr); break; case RTA_METRICS: cfg->fc_mx = nla_data(attr); cfg->fc_mx_len = nla_len(attr); break; case RTA_MULTIPATH: err = lwtunnel_valid_encap_type_attr(nla_data(attr), nla_len(attr), extack); if (err < 0) goto errout; cfg->fc_mp = nla_data(attr); cfg->fc_mp_len = nla_len(attr); break; case RTA_FLOW: cfg->fc_flow = nla_get_u32(attr); break; case RTA_TABLE: cfg->fc_table = nla_get_u32(attr); break; case RTA_ENCAP: cfg->fc_encap = attr; break; case RTA_ENCAP_TYPE: cfg->fc_encap_type = nla_get_u16(attr); err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); if (err < 0) goto errout; break; case RTA_NH_ID: cfg->fc_nh_id = nla_get_u32(attr); break; } } if (cfg->fc_dst_len > 32) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); err = -EINVAL; goto errout; } if (cfg->fc_dst_len < 32 && (ntohl(cfg->fc_dst) << cfg->fc_dst_len)) { NL_SET_ERR_MSG(extack, "Invalid prefix for given prefix length"); err = -EINVAL; goto errout; } if (cfg->fc_nh_id) { if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_encap || cfg->fc_mp) { NL_SET_ERR_MSG(extack, "Nexthop specification and nexthop id are mutually exclusive"); err = -EINVAL; goto errout; } } if (has_gw && has_via) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); err = -EINVAL; goto errout; } if (!cfg->fc_table) cfg->fc_table = RT_TABLE_MAIN; return 0; errout: return err; } static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_config cfg; struct fib_table *tb; int err; err = rtm_to_fib_config(net, skb, nlh, &cfg, extack); if (err < 0) goto errout; rtnl_net_lock(net); if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) { NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); err = -EINVAL; goto unlock; } tb = fib_get_table(net, cfg.fc_table); if (!tb) { NL_SET_ERR_MSG(extack, "FIB table does not exist"); err = -ESRCH; goto unlock; } err = fib_table_delete(net, tb, &cfg, extack); unlock: rtnl_net_unlock(net); errout: return err; } static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_config cfg; struct fib_table *tb; int err; err = rtm_to_fib_config(net, skb, nlh, &cfg, extack); if (err < 0) goto errout; rtnl_net_lock(net); tb = fib_new_table(net, cfg.fc_table); if (!tb) { err = -ENOBUFS; goto unlock; } err = fib_table_insert(net, tb, &cfg, extack); if (!err && cfg.fc_type == RTN_LOCAL) net->ipv4.fib_has_custom_local_routes = true; unlock: rtnl_net_unlock(net); errout: return err; } int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct fib_dump_filter *filter, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[RTA_MAX + 1]; struct rtmsg *rtm; int err, i; if (filter->rtnl_held) ASSERT_RTNL(); rtm = 
nlmsg_payload(nlh, sizeof(*rtm)); if (!rtm) { NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request"); return -EINVAL; } if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_scope) { NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request"); return -EINVAL; } if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) { NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request"); return -EINVAL; } if (rtm->rtm_flags & RTM_F_CLONED) filter->dump_routes = false; else filter->dump_exceptions = false; filter->flags = rtm->rtm_flags; filter->protocol = rtm->rtm_protocol; filter->rt_type = rtm->rtm_type; filter->table_id = rtm->rtm_table; err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); if (err < 0) return err; for (i = 0; i <= RTA_MAX; ++i) { int ifindex; if (!tb[i]) continue; switch (i) { case RTA_TABLE: filter->table_id = nla_get_u32(tb[i]); break; case RTA_OIF: ifindex = nla_get_u32(tb[i]); if (filter->rtnl_held) filter->dev = __dev_get_by_index(net, ifindex); else filter->dev = dev_get_by_index_rcu(net, ifindex); if (!filter->dev) return -ENODEV; break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request"); return -EINVAL; } } if (filter->flags || filter->protocol || filter->rt_type || filter->table_id || filter->dev) { filter->filter_set = 1; cb->answer_flags = NLM_F_DUMP_FILTERED; } return 0; } EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req); static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { struct fib_dump_filter filter = { .dump_routes = true, .dump_exceptions = true, .rtnl_held = false, }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int h, s_h; unsigned int e = 0, s_e; struct fib_table *tb; struct hlist_head *head; int dumped = 0, err = 0; rcu_read_lock(); if (cb->strict_check) { err = ip_valid_fib_dump_req(net, nlh, &filter, cb); if (err < 0) goto unlock; } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED); } /* ipv4 does not use prefix flag */ if (filter.flags & RTM_F_PREFIX) goto unlock; if (filter.table_id) { tb = fib_get_table(net, filter.table_id); if (!tb) { if (rtnl_msg_family(cb->nlh) != PF_INET) goto unlock; NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist"); err = -ENOENT; goto unlock; } err = fib_table_dump(tb, skb, cb, &filter); goto unlock; } s_h = cb->args[0]; s_e = cb->args[1]; err = 0; for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv4.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb_hlist) { if (e < s_e) goto next; if (dumped) memset(&cb->args[2], 0, sizeof(cb->args) - 2 * sizeof(cb->args[0])); err = fib_table_dump(tb, skb, cb, &filter); if (err < 0) goto out; dumped = 1; next: e++; } } out: cb->args[1] = e; cb->args[0] = h; unlock: rcu_read_unlock(); return err; } /* Prepare and feed intra-kernel routing request. * Really, it should be netlink message, but :-( netlink * can be not configured, so that we feed it directly * to fib engine. It is legal, because all events occur * only when netlink is already locked. 
*/ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa, u32 rt_priority) { struct net *net = dev_net(ifa->ifa_dev->dev); u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev); struct fib_table *tb; struct fib_config cfg = { .fc_protocol = RTPROT_KERNEL, .fc_type = type, .fc_dst = dst, .fc_dst_len = dst_len, .fc_priority = rt_priority, .fc_prefsrc = ifa->ifa_local, .fc_oif = ifa->ifa_dev->dev->ifindex, .fc_nlflags = NLM_F_CREATE | NLM_F_APPEND, .fc_nlinfo = { .nl_net = net, }, }; if (!tb_id) tb_id = (type == RTN_UNICAST) ? RT_TABLE_MAIN : RT_TABLE_LOCAL; tb = fib_new_table(net, tb_id); if (!tb) return; cfg.fc_table = tb->tb_id; if (type != RTN_LOCAL) cfg.fc_scope = RT_SCOPE_LINK; else cfg.fc_scope = RT_SCOPE_HOST; if (cmd == RTM_NEWROUTE) fib_table_insert(net, tb, &cfg, NULL); else fib_table_delete(net, tb, &cfg, NULL); } void fib_add_ifaddr(struct in_ifaddr *ifa) { struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; struct in_ifaddr *prim = ifa; __be32 mask = ifa->ifa_mask; __be32 addr = ifa->ifa_local; __be32 prefix = ifa->ifa_address & mask; if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, prefix, mask); if (!prim) { pr_warn("%s: bug: prim == NULL\n", __func__); return; } } fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim, 0); if (!(dev->flags & IFF_UP)) return; /* Add broadcast address, if it is explicitly assigned. */ if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) { fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim, 0); arp_invalidate(dev, ifa->ifa_broadcast, false); } if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) && (prefix != addr || ifa->ifa_prefixlen < 32)) { if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE)) fib_magic(RTM_NEWROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim, ifa->ifa_rt_priority); /* Add the network broadcast address, when it makes sense */ if (ifa->ifa_prefixlen < 31) { fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask, 32, prim, 0); arp_invalidate(dev, prefix | ~mask, false); } } } void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric) { __be32 prefix = ifa->ifa_address & ifa->ifa_mask; struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; if (!(dev->flags & IFF_UP) || ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) || ipv4_is_zeronet(prefix) || (prefix == ifa->ifa_local && ifa->ifa_prefixlen == 32)) return; /* add the new */ fib_magic(RTM_NEWROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, ifa, new_metric); /* delete the old */ fib_magic(RTM_DELROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, ifa, ifa->ifa_rt_priority); } /* Delete primary or secondary address. * Optionally, on secondary address promotion consider the addresses * from subnet iprim as deleted, even if they are in device list. * In this case the secondary ifa can be in device list. 
*/ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) { struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; struct in_ifaddr *ifa1; struct in_ifaddr *prim = ifa, *prim1 = NULL; __be32 brd = ifa->ifa_address | ~ifa->ifa_mask; __be32 any = ifa->ifa_address & ifa->ifa_mask; #define LOCAL_OK 1 #define BRD_OK 2 #define BRD0_OK 4 #define BRD1_OK 8 unsigned int ok = 0; int subnet = 0; /* Primary network */ int gone = 1; /* Address is missing */ int same_prefsrc = 0; /* Another primary with same IP */ if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); if (!prim) { /* if the device has been deleted, we don't perform * address promotion */ if (!in_dev->dead) pr_warn("%s: bug: prim == NULL\n", __func__); return; } if (iprim && iprim != prim) { pr_warn("%s: bug: iprim != prim\n", __func__); return; } } else if (!ipv4_is_zeronet(any) && (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) { if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE)) fib_magic(RTM_DELROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, any, ifa->ifa_prefixlen, prim, 0); subnet = 1; } if (in_dev->dead) goto no_promotions; /* Deletion is more complicated than add. * We should take care of not to delete too much :-) * * Scan address list to be sure that addresses are really gone. */ rcu_read_lock(); in_dev_for_each_ifa_rcu(ifa1, in_dev) { if (ifa1 == ifa) { /* promotion, keep the IP */ gone = 0; continue; } /* Ignore IFAs from our subnet */ if (iprim && ifa1->ifa_mask == iprim->ifa_mask && inet_ifa_match(ifa1->ifa_address, iprim)) continue; /* Ignore ifa1 if it uses different primary IP (prefsrc) */ if (ifa1->ifa_flags & IFA_F_SECONDARY) { /* Another address from our subnet? */ if (ifa1->ifa_mask == prim->ifa_mask && inet_ifa_match(ifa1->ifa_address, prim)) prim1 = prim; else { /* We reached the secondaries, so * same_prefsrc should be determined. */ if (!same_prefsrc) continue; /* Search new prim1 if ifa1 is not * using the current prim1 */ if (!prim1 || ifa1->ifa_mask != prim1->ifa_mask || !inet_ifa_match(ifa1->ifa_address, prim1)) prim1 = inet_ifa_byprefix(in_dev, ifa1->ifa_address, ifa1->ifa_mask); if (!prim1) continue; if (prim1->ifa_local != prim->ifa_local) continue; } } else { if (prim->ifa_local != ifa1->ifa_local) continue; prim1 = ifa1; if (prim != prim1) same_prefsrc = 1; } if (ifa->ifa_local == ifa1->ifa_local) ok |= LOCAL_OK; if (ifa->ifa_broadcast == ifa1->ifa_broadcast) ok |= BRD_OK; if (brd == ifa1->ifa_broadcast) ok |= BRD1_OK; if (any == ifa1->ifa_broadcast) ok |= BRD0_OK; /* primary has network specific broadcasts */ if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) { __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask; __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask; if (!ipv4_is_zeronet(any1)) { if (ifa->ifa_broadcast == brd1 || ifa->ifa_broadcast == any1) ok |= BRD_OK; if (brd == brd1 || brd == any1) ok |= BRD1_OK; if (any == brd1 || any == any1) ok |= BRD0_OK; } } } rcu_read_unlock(); no_promotions: if (!(ok & BRD_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim, 0); if (subnet && ifa->ifa_prefixlen < 31) { if (!(ok & BRD1_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim, 0); if (!(ok & BRD0_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim, 0); } if (!(ok & LOCAL_OK)) { unsigned int addr_type; fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim, 0); /* Check, that this local address finally disappeared. 
*/ addr_type = inet_addr_type_dev_table(dev_net(dev), dev, ifa->ifa_local); if (gone && addr_type != RTN_LOCAL) { /* And the last, but not the least thing. * We must flush stray FIB entries. * * First of all, we scan fib_info list searching * for stray nexthop entries, then ignite fib_flush. */ if (fib_sync_down_addr(dev, ifa->ifa_local)) fib_flush(dev_net(dev)); } } #undef LOCAL_OK #undef BRD_OK #undef BRD0_OK #undef BRD1_OK } static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn) { struct fib_result res; struct flowi4 fl4 = { .flowi4_mark = frn->fl_mark, .daddr = frn->fl_addr, .flowi4_dscp = inet_dsfield_to_dscp(frn->fl_tos), .flowi4_scope = frn->fl_scope, }; struct fib_table *tb; rcu_read_lock(); tb = fib_get_table(net, frn->tb_id_in); frn->err = -ENOENT; if (tb) { local_bh_disable(); frn->tb_id = tb->tb_id; frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); if (!frn->err) { frn->prefixlen = res.prefixlen; frn->nh_sel = res.nh_sel; frn->type = res.type; frn->scope = res.scope; } local_bh_enable(); } rcu_read_unlock(); } static void nl_fib_input(struct sk_buff *skb) { struct net *net; struct fib_result_nl *frn; struct nlmsghdr *nlh; u32 portid; net = sock_net(skb->sk); nlh = nlmsg_hdr(skb); if (skb->len < nlmsg_total_size(sizeof(*frn)) || skb->len < nlh->nlmsg_len || nlmsg_len(nlh) < sizeof(*frn)) return; skb = netlink_skb_clone(skb, GFP_KERNEL); if (!skb) return; nlh = nlmsg_hdr(skb); frn = nlmsg_data(nlh); nl_fib_lookup(net, frn); portid = NETLINK_CB(skb).portid; /* netlink portid */ NETLINK_CB(skb).portid = 0; /* from kernel */ NETLINK_CB(skb).dst_group = 0; /* unicast */ nlmsg_unicast(net->ipv4.fibnl, skb, portid); } static int __net_init nl_fib_lookup_init(struct net *net) { struct sock *sk; struct netlink_kernel_cfg cfg = { .input = nl_fib_input, }; sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg); if (!sk) return -EAFNOSUPPORT; net->ipv4.fibnl = sk; return 0; } static void nl_fib_lookup_exit(struct net *net) { netlink_kernel_release(net->ipv4.fibnl); net->ipv4.fibnl = NULL; } static void fib_disable_ip(struct net_device *dev, unsigned long event, bool force) { if (fib_sync_down_dev(dev, event, force)) fib_flush(dev_net(dev)); else rt_cache_flush(dev_net(dev)); arp_ifdown(dev); } static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct in_ifaddr *ifa = ptr; struct net_device *dev = ifa->ifa_dev->dev; struct net *net = dev_net(dev); switch (event) { case NETDEV_UP: fib_add_ifaddr(ifa); #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(net); break; case NETDEV_DOWN: fib_del_ifaddr(ifa, NULL); atomic_inc(&net->ipv4.dev_addr_genid); if (!ifa->ifa_dev->ifa_list) { /* Last address was deleted from this interface. * Disable IP. 
*/ fib_disable_ip(dev, event, true); } else { rt_cache_flush(net); } break; } return NOTIFY_DONE; } static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_changeupper_info *upper_info = ptr; struct netdev_notifier_info_ext *info_ext = ptr; struct in_device *in_dev; struct net *net = dev_net(dev); struct in_ifaddr *ifa; unsigned int flags; if (event == NETDEV_UNREGISTER) { fib_disable_ip(dev, event, true); rt_flush_dev(dev); return NOTIFY_DONE; } in_dev = __in_dev_get_rtnl(dev); if (!in_dev) return NOTIFY_DONE; switch (event) { case NETDEV_UP: in_dev_for_each_ifa_rtnl(ifa, in_dev) { fib_add_ifaddr(ifa); } #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(net); break; case NETDEV_DOWN: fib_disable_ip(dev, event, false); break; case NETDEV_CHANGE: flags = netif_get_flags(dev); if (flags & (IFF_RUNNING | IFF_LOWER_UP)) fib_sync_up(dev, RTNH_F_LINKDOWN); else fib_sync_down_dev(dev, event, false); rt_cache_flush(net); break; case NETDEV_CHANGEMTU: fib_sync_mtu(dev, info_ext->ext.mtu); rt_cache_flush(net); break; case NETDEV_CHANGEUPPER: upper_info = ptr; /* flush all routes if dev is linked to or unlinked from * an L3 master device (e.g., VRF) */ if (upper_info->upper_dev && netif_is_l3_master(upper_info->upper_dev)) fib_disable_ip(dev, NETDEV_DOWN, true); break; } return NOTIFY_DONE; } static struct notifier_block fib_inetaddr_notifier = { .notifier_call = fib_inetaddr_event, }; static struct notifier_block fib_netdev_notifier = { .notifier_call = fib_netdev_event, }; static int __net_init ip_fib_net_init(struct net *net) { int err; size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ; err = fib4_notifier_init(net); if (err) return err; #ifdef CONFIG_IP_ROUTE_MULTIPATH /* Default to 3-tuple */ net->ipv4.sysctl_fib_multipath_hash_fields = FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; #endif /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL); if (!net->ipv4.fib_table_hash) { err = -ENOMEM; goto err_table_hash_alloc; } err = fib4_rules_init(net); if (err < 0) goto err_rules_init; return 0; err_rules_init: kfree(net->ipv4.fib_table_hash); err_table_hash_alloc: fib4_notifier_exit(net); return err; } static void ip_fib_net_exit(struct net *net) { int i; ASSERT_RTNL_NET(net); #ifdef CONFIG_IP_MULTIPLE_TABLES RCU_INIT_POINTER(net->ipv4.fib_main, NULL); RCU_INIT_POINTER(net->ipv4.fib_default, NULL); #endif /* Destroy the tables in reverse order to guarantee that the * local table, ID 255, is destroyed before the main table, ID * 254. This is necessary as the local table may contain * references to data contained in the main table. 
*/ for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) { struct hlist_head *head = &net->ipv4.fib_table_hash[i]; struct hlist_node *tmp; struct fib_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) { hlist_del(&tb->tb_hlist); fib_table_flush(net, tb, true); fib_free_table(tb); } } #ifdef CONFIG_IP_MULTIPLE_TABLES fib4_rules_exit(net); #endif kfree(net->ipv4.fib_table_hash); fib4_notifier_exit(net); } static int __net_init fib_net_init(struct net *net) { int error; #ifdef CONFIG_IP_ROUTE_CLASSID atomic_set(&net->ipv4.fib_num_tclassid_users, 0); #endif error = ip_fib_net_init(net); if (error < 0) goto out; error = fib4_semantics_init(net); if (error) goto out_semantics; error = nl_fib_lookup_init(net); if (error < 0) goto out_nlfl; error = fib_proc_init(net); if (error < 0) goto out_proc; out: return error; out_proc: nl_fib_lookup_exit(net); out_nlfl: fib4_semantics_exit(net); out_semantics: rtnl_net_lock(net); ip_fib_net_exit(net); rtnl_net_unlock(net); goto out; } static void __net_exit fib_net_exit(struct net *net) { fib_proc_exit(net); nl_fib_lookup_exit(net); } static void __net_exit fib_net_exit_batch(struct list_head *net_list) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { __rtnl_net_lock(net); ip_fib_net_exit(net); __rtnl_net_unlock(net); } rtnl_unlock(); list_for_each_entry(net, net_list, exit_list) fib4_semantics_exit(net); } static struct pernet_operations fib_net_ops = { .init = fib_net_init, .exit = fib_net_exit, .exit_batch = fib_net_exit_batch, }; static const struct rtnl_msg_handler fib_rtnl_msg_handlers[] __initconst = { {.protocol = PF_INET, .msgtype = RTM_NEWROUTE, .doit = inet_rtm_newroute, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_DELROUTE, .doit = inet_rtm_delroute, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_GETROUTE, .dumpit = inet_dump_fib, .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, }; void __init ip_fib_init(void) { fib_trie_init(); register_pernet_subsys(&fib_net_ops); register_netdevice_notifier(&fib_netdev_notifier); register_inetaddr_notifier(&fib_inetaddr_notifier); rtnl_register_many(fib_rtnl_msg_handlers); } |
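/*
 * Illustrative userspace sketch (not part of the file above): one way the
 * NETLINK_FIB_LOOKUP service implemented by nl_fib_input()/nl_fib_lookup()
 * above might be exercised.  struct fib_result_nl is not exported through a
 * UAPI header, so the layout below is mirrored by hand and is an assumption
 * that has to match the kernel's definition exactly; fib_lookup_query() is a
 * made-up helper name.  The request payload doubles as the reply buffer: the
 * kernel fills in the result fields and unicasts the same message back.
 * Treat this as an untested sketch, not a reference client.
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

struct fib_result_nl {			/* hand-mirrored kernel layout (assumed) */
	uint32_t fl_addr;		/* address to look up, network byte order */
	uint32_t fl_mark;
	unsigned char fl_tos;
	unsigned char fl_scope;
	unsigned char tb_id_in;		/* table to search, e.g. RT_TABLE_MAIN */
	unsigned char tb_id;		/* result: table the route was found in */
	unsigned char prefixlen;	/* result: prefix length of the match */
	unsigned char nh_sel;
	unsigned char type;
	unsigned char scope;
	int err;			/* result: 0 or a negative errno */
};

static int fib_lookup_query(const char *addr_str)
{
	struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
	struct {
		struct nlmsghdr nlh;
		struct fib_result_nl frn;
	} req;
	int fd, ret = -1;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_FIB_LOOKUP);
	if (fd < 0)
		return -errno;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = sizeof(req);	/* header plus a full fib_result_nl payload */
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.frn.tb_id_in = RT_TABLE_MAIN;
	inet_pton(AF_INET, addr_str, &req.frn.fl_addr);

	/* Send to the kernel (portid 0); nl_fib_input() unicasts the result back. */
	if (sendto(fd, &req, sizeof(req), 0,
		   (struct sockaddr *)&kernel, sizeof(kernel)) < 0)
		goto out;
	if (recv(fd, &req, sizeof(req), 0) < 0)
		goto out;

	printf("err=%d table=%u prefixlen=%u type=%u scope=%u\n",
	       req.frn.err, req.frn.tb_id, req.frn.prefixlen,
	       req.frn.type, req.frn.scope);
	ret = 0;
out:
	close(fd);
	return ret;
}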
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RMAP_H #define _LINUX_RMAP_H /* * Declarations for Reverse Mapping functions in mm/rmap.c */ #include <linux/list.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/rwsem.h> #include <linux/memcontrol.h> #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/memremap.h> #include <linux/bit_spinlock.h> /* * The anon_vma heads a list of private "related" vmas, to scan if * an anonymous page pointing to this anon_vma needs to be unmapped: * the vmas on the list will be related by forking, or by splitting. * * Since vmas come and go as they are split and merged (particularly * in mprotect), the mapping field of an anonymous page cannot point * directly to a vma: instead it points to an anon_vma, on whose list * the related vmas can be easily linked or unlinked. * * After unlinking the last vma on the list, we must garbage collect * the anon_vma object itself: we're guaranteed no page can be * pointing to this anon_vma once its vma list is empty. */ struct anon_vma { struct anon_vma *root; /* Root of this anon_vma tree */ struct rw_semaphore rwsem; /* W: modification, R: walking the list */ /* * The refcount is taken on an anon_vma when there is no * guarantee that the vma of page tables will exist for * the duration of the operation. A caller that takes * the reference is responsible for clearing up the * anon_vma if they are the last user on release */ atomic_t refcount; /* * Count of child anon_vmas. Equal to the count of all anon_vmas that * have ->parent pointing to this one, including itself. * * This counter is used for making a decision about reusing an anon_vma * instead of forking a new one. See comments in function anon_vma_clone. */ unsigned long num_children; /* Count of VMAs whose ->anon_vma pointer points to this object. */ unsigned long num_active_vmas; struct anon_vma *parent; /* Parent of this anon_vma */ /* * NOTE: the LSB of the rb_root.rb_node is set by * mm_take_all_locks() _after_ taking the above lock. So the * rb_root must only be read/written after taking the above lock * to be sure to see a valid next pointer. The LSB bit itself * is serialized by a system wide lock only visible to * mm_take_all_locks() (mm_all_locks_mutex). */ /* Interval tree of private "related" vmas */ struct rb_root_cached rb_root; }; /* * The copy-on-write semantics of fork mean that an anon_vma * can become associated with multiple processes. Furthermore, * each child process will have its own anon_vma, where new * pages for that process are instantiated. * * This structure allows us to find the anon_vmas associated * with a VMA, or the VMAs associated with an anon_vma. * The "same_vma" list contains the anon_vma_chains linking * all the anon_vmas associated with this VMA. * The "rb" field indexes on an interval tree the anon_vma_chains * which link all the VMAs associated with this anon_vma.
*/ struct anon_vma_chain { struct vm_area_struct *vma; struct anon_vma *anon_vma; struct list_head same_vma; /* locked by mmap_lock & page_table_lock */ struct rb_node rb; /* locked by anon_vma->rwsem */ unsigned long rb_subtree_last; #ifdef CONFIG_DEBUG_VM_RB unsigned long cached_vma_start, cached_vma_last; #endif }; enum ttu_flags { TTU_USE_SHARED_ZEROPAGE = 0x2, /* for unused pages of large folios */ TTU_SPLIT_HUGE_PMD = 0x4, /* split huge PMD if any */ TTU_IGNORE_MLOCK = 0x8, /* ignore mlock */ TTU_SYNC = 0x10, /* avoid racy checks with PVMW_SYNC */ TTU_HWPOISON = 0x20, /* do convert pte to hwpoison entry */ TTU_BATCH_FLUSH = 0x40, /* Batch TLB flushes where possible * and caller guarantees they will * do a final flush if necessary */ TTU_RMAP_LOCKED = 0x80, /* do not grab rmap lock: * caller holds it */ }; #ifdef CONFIG_MMU void anon_vma_init(void); /* create anon_vma_cachep */ #ifdef CONFIG_MM_ID static __always_inline void folio_lock_large_mapcount(struct folio *folio) { bit_spin_lock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids); } static __always_inline void folio_unlock_large_mapcount(struct folio *folio) { __bit_spin_unlock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids); } static inline unsigned int folio_mm_id(const struct folio *folio, int idx) { VM_WARN_ON_ONCE(idx != 0 && idx != 1); return folio->_mm_id[idx] & MM_ID_MASK; } static inline void folio_set_mm_id(struct folio *folio, int idx, mm_id_t id) { VM_WARN_ON_ONCE(idx != 0 && idx != 1); folio->_mm_id[idx] &= ~MM_ID_MASK; folio->_mm_id[idx] |= id; } static inline void __folio_large_mapcount_sanity_checks(const struct folio *folio, int diff, mm_id_t mm_id) { VM_WARN_ON_ONCE(!folio_test_large(folio) || folio_test_hugetlb(folio)); VM_WARN_ON_ONCE(diff <= 0); VM_WARN_ON_ONCE(mm_id < MM_ID_MIN || mm_id > MM_ID_MAX); /* * Make sure we can detect at least one complete PTE mapping of the * folio in a single MM as "exclusively mapped". This is primarily * a check on 32bit, where we currently reduce the size of the per-MM * mapcount to a short. */ VM_WARN_ON_ONCE(diff > folio_large_nr_pages(folio)); VM_WARN_ON_ONCE(folio_large_nr_pages(folio) - 1 > MM_ID_MAPCOUNT_MAX); VM_WARN_ON_ONCE(folio_mm_id(folio, 0) == MM_ID_DUMMY && folio->_mm_id_mapcount[0] != -1); VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY && folio->_mm_id_mapcount[0] < 0); VM_WARN_ON_ONCE(folio_mm_id(folio, 1) == MM_ID_DUMMY && folio->_mm_id_mapcount[1] != -1); VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY && folio->_mm_id_mapcount[1] < 0); VM_WARN_ON_ONCE(!folio_mapped(folio) && test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids)); } static __always_inline void folio_set_large_mapcount(struct folio *folio, int mapcount, struct vm_area_struct *vma) { __folio_large_mapcount_sanity_checks(folio, mapcount, vma->vm_mm->mm_id); VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY); VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY); /* Note: mapcounts start at -1. 
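 * A large folio mapped exactly once thus ends up with _large_mapcount == 0,
 * and an unmapped folio reads back as -1.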
*/ atomic_set(&folio->_large_mapcount, mapcount - 1); folio->_mm_id_mapcount[0] = mapcount - 1; folio_set_mm_id(folio, 0, vma->vm_mm->mm_id); } static __always_inline int folio_add_return_large_mapcount(struct folio *folio, int diff, struct vm_area_struct *vma) { const mm_id_t mm_id = vma->vm_mm->mm_id; int new_mapcount_val; folio_lock_large_mapcount(folio); __folio_large_mapcount_sanity_checks(folio, diff, mm_id); new_mapcount_val = atomic_read(&folio->_large_mapcount) + diff; atomic_set(&folio->_large_mapcount, new_mapcount_val); /* * If a folio is mapped more than once into an MM on 32bit, we * can in theory overflow the per-MM mapcount (although only for * fairly large folios), turning it negative. In that case, just * free up the slot and mark the folio "mapped shared", otherwise * we might be in trouble when unmapping pages later. */ if (folio_mm_id(folio, 0) == mm_id) { folio->_mm_id_mapcount[0] += diff; if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[0] < 0)) { folio->_mm_id_mapcount[0] = -1; folio_set_mm_id(folio, 0, MM_ID_DUMMY); folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT; } } else if (folio_mm_id(folio, 1) == mm_id) { folio->_mm_id_mapcount[1] += diff; if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[1] < 0)) { folio->_mm_id_mapcount[1] = -1; folio_set_mm_id(folio, 1, MM_ID_DUMMY); folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT; } } else if (folio_mm_id(folio, 0) == MM_ID_DUMMY) { folio_set_mm_id(folio, 0, mm_id); folio->_mm_id_mapcount[0] = diff - 1; /* We might have other mappings already. */ if (new_mapcount_val != diff - 1) folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT; } else if (folio_mm_id(folio, 1) == MM_ID_DUMMY) { folio_set_mm_id(folio, 1, mm_id); folio->_mm_id_mapcount[1] = diff - 1; /* Slot 0 certainly has mappings as well. */ folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT; } folio_unlock_large_mapcount(folio); return new_mapcount_val + 1; } #define folio_add_large_mapcount folio_add_return_large_mapcount static __always_inline int folio_sub_return_large_mapcount(struct folio *folio, int diff, struct vm_area_struct *vma) { const mm_id_t mm_id = vma->vm_mm->mm_id; int new_mapcount_val; folio_lock_large_mapcount(folio); __folio_large_mapcount_sanity_checks(folio, diff, mm_id); new_mapcount_val = atomic_read(&folio->_large_mapcount) - diff; atomic_set(&folio->_large_mapcount, new_mapcount_val); /* * There are valid corner cases where we might underflow a per-MM * mapcount (some mappings added when no slot was free, some mappings * added once a slot was free), so we always set it to -1 once we go * negative. */ if (folio_mm_id(folio, 0) == mm_id) { folio->_mm_id_mapcount[0] -= diff; if (folio->_mm_id_mapcount[0] >= 0) goto out; folio->_mm_id_mapcount[0] = -1; folio_set_mm_id(folio, 0, MM_ID_DUMMY); } else if (folio_mm_id(folio, 1) == mm_id) { folio->_mm_id_mapcount[1] -= diff; if (folio->_mm_id_mapcount[1] >= 0) goto out; folio->_mm_id_mapcount[1] = -1; folio_set_mm_id(folio, 1, MM_ID_DUMMY); } /* * If one MM slot owns all mappings, the folio is mapped exclusively. * Note that if the folio is now unmapped (new_mapcount_val == -1), both * slots must be free (mapcount == -1), and we'll also mark it as * exclusive. 
*/ if (folio->_mm_id_mapcount[0] == new_mapcount_val || folio->_mm_id_mapcount[1] == new_mapcount_val) folio->_mm_ids &= ~FOLIO_MM_IDS_SHARED_BIT; out: folio_unlock_large_mapcount(folio); return new_mapcount_val + 1; } #define folio_sub_large_mapcount folio_sub_return_large_mapcount #else /* !CONFIG_MM_ID */ /* * See __folio_rmap_sanity_checks(), we might map large folios even without * CONFIG_TRANSPARENT_HUGEPAGE. We'll keep that working for now. */ static inline void folio_set_large_mapcount(struct folio *folio, int mapcount, struct vm_area_struct *vma) { /* Note: mapcounts start at -1. */ atomic_set(&folio->_large_mapcount, mapcount - 1); } static inline void folio_add_large_mapcount(struct folio *folio, int diff, struct vm_area_struct *vma) { atomic_add(diff, &folio->_large_mapcount); } static inline int folio_add_return_large_mapcount(struct folio *folio, int diff, struct vm_area_struct *vma) { BUILD_BUG(); } static inline void folio_sub_large_mapcount(struct folio *folio, int diff, struct vm_area_struct *vma) { atomic_sub(diff, &folio->_large_mapcount); } static inline int folio_sub_return_large_mapcount(struct folio *folio, int diff, struct vm_area_struct *vma) { BUILD_BUG(); } #endif /* CONFIG_MM_ID */ #define folio_inc_large_mapcount(folio, vma) \ folio_add_large_mapcount(folio, 1, vma) #define folio_inc_return_large_mapcount(folio, vma) \ folio_add_return_large_mapcount(folio, 1, vma) #define folio_dec_large_mapcount(folio, vma) \ folio_sub_large_mapcount(folio, 1, vma) #define folio_dec_return_large_mapcount(folio, vma) \ folio_sub_return_large_mapcount(folio, 1, vma) /* RMAP flags, currently only relevant for some anon rmap operations. */ typedef int __bitwise rmap_t; /* * No special request: A mapped anonymous (sub)page is possibly shared between * processes. */ #define RMAP_NONE ((__force rmap_t)0) /* The anonymous (sub)page is exclusive to a single process. */ #define RMAP_EXCLUSIVE ((__force rmap_t)BIT(0)) static __always_inline void __folio_rmap_sanity_checks(const struct folio *folio, const struct page *page, int nr_pages, enum pgtable_level level) { /* hugetlb folios are handled separately. */ VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); /* When (un)mapping zeropages, we should never touch ref+mapcount. */ VM_WARN_ON_FOLIO(is_zero_folio(folio), folio); /* * TODO: we get driver-allocated folios that have nothing to do with * the rmap using vm_insert_page(); therefore, we cannot assume that * folio_test_large_rmappable() holds for large folios. We should * handle any desired mapcount+stats accounting for these folios in * VM_MIXEDMAP VMAs separately, and then sanity-check here that * we really only get rmappable folios. */ VM_WARN_ON_ONCE(nr_pages <= 0); VM_WARN_ON_FOLIO(page_folio(page) != folio, folio); VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio); switch (level) { case PGTABLE_LEVEL_PTE: break; case PGTABLE_LEVEL_PMD: /* * We don't support folios larger than a single PMD yet. So * when PGTABLE_LEVEL_PMD is set, we assume that we are creating * a single "entire" mapping of the folio. */ VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio); VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio); break; case PGTABLE_LEVEL_PUD: /* * Assume that we are creating a single "entire" mapping of the * folio. 
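 * That is, nr_pages must cover the whole PUD-sized folio, which the two
 * warnings below verify.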
*/ VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PUD_NR, folio); VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio); break; default: BUILD_BUG(); } /* * Anon folios must have an associated live anon_vma as long as they're * mapped into userspace. * Note that the atomic_read() mainly does two things: * * 1. In KASAN builds with CONFIG_SLUB_RCU_DEBUG, it causes KASAN to * check that the associated anon_vma has not yet been freed (subject * to KASAN's usual limitations). This check will pass if the * anon_vma's refcount has already dropped to 0 but an RCU grace * period hasn't passed since then. * 2. If the anon_vma has not yet been freed, it checks that the * anon_vma still has a nonzero refcount (as opposed to being in the * middle of an RCU delay for getting freed). */ if (folio_test_anon(folio) && !folio_test_ksm(folio)) { unsigned long mapping = (unsigned long)folio->mapping; struct anon_vma *anon_vma; anon_vma = (void *)(mapping - FOLIO_MAPPING_ANON); VM_WARN_ON_FOLIO(atomic_read(&anon_vma->refcount) == 0, folio); } } /* * rmap interfaces called when adding or removing pte of page */ void folio_move_anon_rmap(struct folio *, struct vm_area_struct *); void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages, struct vm_area_struct *, unsigned long address, rmap_t flags); #define folio_add_anon_rmap_pte(folio, page, vma, address, flags) \ folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags) void folio_add_anon_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address, rmap_t flags); void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages, struct vm_area_struct *); #define folio_add_file_rmap_pte(folio, page, vma) \ folio_add_file_rmap_ptes(folio, page, 1, vma) void folio_add_file_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *); void folio_add_file_rmap_pud(struct folio *, struct page *, struct vm_area_struct *); void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages, struct vm_area_struct *); #define folio_remove_rmap_pte(folio, page, vma) \ folio_remove_rmap_ptes(folio, page, 1, vma) void folio_remove_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *); void folio_remove_rmap_pud(struct folio *, struct page *, struct vm_area_struct *); void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address, rmap_t flags); void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); /* See folio_try_dup_anon_rmap_*() */ static inline int hugetlb_try_dup_anon_rmap(struct folio *folio, struct vm_area_struct *vma) { VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); if (PageAnonExclusive(&folio->page)) { if (unlikely(folio_needs_cow_for_dma(vma, folio))) return -EBUSY; ClearPageAnonExclusive(&folio->page); } atomic_inc(&folio->_entire_mapcount); atomic_inc(&folio->_large_mapcount); return 0; } /* See folio_try_share_anon_rmap_*() */ static inline int hugetlb_try_share_anon_rmap(struct folio *folio) { VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); VM_WARN_ON_FOLIO(!PageAnonExclusive(&folio->page), folio); /* Paired with the memory barrier in try_grab_folio(). 
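 * See the (A1)-(A4) / (B1)-(B5) ordering discussion in
 * __folio_try_share_anon_rmap() for the full reasoning behind this barrier
 * and the folio_maybe_dma_pinned() check that follows.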
*/ if (IS_ENABLED(CONFIG_HAVE_GUP_FAST)) smp_mb(); if (unlikely(folio_maybe_dma_pinned(folio))) return -EBUSY; ClearPageAnonExclusive(&folio->page); /* * This is conceptually a smp_wmb() paired with the smp_rmb() in * gup_must_unshare(). */ if (IS_ENABLED(CONFIG_HAVE_GUP_FAST)) smp_mb__after_atomic(); return 0; } static inline void hugetlb_add_file_rmap(struct folio *folio) { VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); VM_WARN_ON_FOLIO(folio_test_anon(folio), folio); atomic_inc(&folio->_entire_mapcount); atomic_inc(&folio->_large_mapcount); } static inline void hugetlb_remove_rmap(struct folio *folio) { VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); atomic_dec(&folio->_entire_mapcount); atomic_dec(&folio->_large_mapcount); } static __always_inline void __folio_dup_file_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *dst_vma, enum pgtable_level level) { const int orig_nr_pages = nr_pages; __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { case PGTABLE_LEVEL_PTE: if (!folio_test_large(folio)) { atomic_inc(&folio->_mapcount); break; } if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) { do { atomic_inc(&page->_mapcount); } while (page++, --nr_pages > 0); } folio_add_large_mapcount(folio, orig_nr_pages, dst_vma); break; case PGTABLE_LEVEL_PMD: case PGTABLE_LEVEL_PUD: atomic_inc(&folio->_entire_mapcount); folio_inc_large_mapcount(folio, dst_vma); break; default: BUILD_BUG(); } } /** * folio_dup_file_rmap_ptes - duplicate PTE mappings of a page range of a folio * @folio: The folio to duplicate the mappings of * @page: The first page to duplicate the mappings of * @nr_pages: The number of pages of which the mapping will be duplicated * @dst_vma: The destination vm area * * The page range of the folio is defined by [page, page + nr_pages) * * The caller needs to hold the page table lock. */ static inline void folio_dup_file_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *dst_vma) { __folio_dup_file_rmap(folio, page, nr_pages, dst_vma, PGTABLE_LEVEL_PTE); } static __always_inline void folio_dup_file_rmap_pte(struct folio *folio, struct page *page, struct vm_area_struct *dst_vma) { __folio_dup_file_rmap(folio, page, 1, dst_vma, PGTABLE_LEVEL_PTE); } /** * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio * @folio: The folio to duplicate the mapping of * @page: The first page to duplicate the mapping of * @dst_vma: The destination vm area * * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) * * The caller needs to hold the page table lock. */ static inline void folio_dup_file_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *dst_vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, PGTABLE_LEVEL_PTE); #else WARN_ON_ONCE(true); #endif } static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, enum pgtable_level level) { const int orig_nr_pages = nr_pages; bool maybe_pinned; int i; VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); __folio_rmap_sanity_checks(folio, page, nr_pages, level); /* * If this folio may have been pinned by the parent process, * don't allow to duplicate the mappings but instead require to e.g., * copy the subpage immediately for the child so that we'll always * guarantee the pinned folio won't be randomly replaced in the * future on write faults. 
*/ maybe_pinned = likely(!folio_is_device_private(folio)) && unlikely(folio_needs_cow_for_dma(src_vma, folio)); /* * No need to check+clear for already shared PTEs/PMDs of the * folio. But if any page is PageAnonExclusive, we must fallback to * copying if the folio maybe pinned. */ switch (level) { case PGTABLE_LEVEL_PTE: if (unlikely(maybe_pinned)) { for (i = 0; i < nr_pages; i++) if (PageAnonExclusive(page + i)) return -EBUSY; } if (!folio_test_large(folio)) { if (PageAnonExclusive(page)) ClearPageAnonExclusive(page); atomic_inc(&folio->_mapcount); break; } do { if (PageAnonExclusive(page)) ClearPageAnonExclusive(page); if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) atomic_inc(&page->_mapcount); } while (page++, --nr_pages > 0); folio_add_large_mapcount(folio, orig_nr_pages, dst_vma); break; case PGTABLE_LEVEL_PMD: case PGTABLE_LEVEL_PUD: if (PageAnonExclusive(page)) { if (unlikely(maybe_pinned)) return -EBUSY; ClearPageAnonExclusive(page); } atomic_inc(&folio->_entire_mapcount); folio_inc_large_mapcount(folio, dst_vma); break; default: BUILD_BUG(); } return 0; } /** * folio_try_dup_anon_rmap_ptes - try duplicating PTE mappings of a page range * of a folio * @folio: The folio to duplicate the mappings of * @page: The first page to duplicate the mappings of * @nr_pages: The number of pages of which the mapping will be duplicated * @dst_vma: The destination vm area * @src_vma: The vm area from which the mappings are duplicated * * The page range of the folio is defined by [page, page + nr_pages) * * The caller needs to hold the page table lock and the * vma->vma_mm->write_protect_seq. * * Duplicating the mappings can only fail if the folio may be pinned; device * private folios cannot get pinned and consequently this function cannot fail * for them. * * If duplicating the mappings succeeded, the duplicated PTEs have to be R/O in * the parent and the child. They must *not* be writable after this call * succeeded. * * Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise. */ static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { return __folio_try_dup_anon_rmap(folio, page, nr_pages, dst_vma, src_vma, PGTABLE_LEVEL_PTE); } static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio, struct page *page, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { return __folio_try_dup_anon_rmap(folio, page, 1, dst_vma, src_vma, PGTABLE_LEVEL_PTE); } /** * folio_try_dup_anon_rmap_pmd - try duplicating a PMD mapping of a page range * of a folio * @folio: The folio to duplicate the mapping of * @page: The first page to duplicate the mapping of * @dst_vma: The destination vm area * @src_vma: The vm area from which the mapping is duplicated * * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) * * The caller needs to hold the page table lock and the * vma->vma_mm->write_protect_seq. * * Duplicating the mapping can only fail if the folio may be pinned; device * private folios cannot get pinned and consequently this function cannot fail * for them. * * If duplicating the mapping succeeds, the duplicated PMD has to be R/O in * the parent and the child. They must *not* be writable after this call * succeeded. * * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise. 
*/ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, dst_vma, src_vma, PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); return -EBUSY; #endif } static __always_inline int __folio_try_share_anon_rmap(struct folio *folio, struct page *page, int nr_pages, enum pgtable_level level) { VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio); __folio_rmap_sanity_checks(folio, page, nr_pages, level); /* device private folios cannot get pinned via GUP. */ if (unlikely(folio_is_device_private(folio))) { ClearPageAnonExclusive(page); return 0; } /* * We have to make sure that when we clear PageAnonExclusive, that * the page is not pinned and that concurrent GUP-fast won't succeed in * concurrently pinning the page. * * Conceptually, PageAnonExclusive clearing consists of: * (A1) Clear PTE * (A2) Check if the page is pinned; back off if so. * (A3) Clear PageAnonExclusive * (A4) Restore PTE (optional, but certainly not writable) * * When clearing PageAnonExclusive, we cannot possibly map the page * writable again, because anon pages that may be shared must never * be writable. So in any case, if the PTE was writable it cannot * be writable anymore afterwards and there would be a PTE change. Only * if the PTE wasn't writable, there might not be a PTE change. * * Conceptually, GUP-fast pinning of an anon page consists of: * (B1) Read the PTE * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so. * (B3) Pin the mapped page * (B4) Check if the PTE changed by re-reading it; back off if so. * (B5) If the original PTE is not writable, check if * PageAnonExclusive is not set; back off if so. * * If the PTE was writable, we only have to make sure that GUP-fast * observes a PTE change and properly backs off. * * If the PTE was not writable, we have to make sure that GUP-fast either * detects a (temporary) PTE change or that PageAnonExclusive is cleared * and properly backs off. * * Consequently, when clearing PageAnonExclusive(), we have to make * sure that (A1), (A2)/(A3) and (A4) happen in the right memory * order. In GUP-fast pinning code, we have to make sure that (B3),(B4) * and (B5) happen in the right memory order. * * We assume that there might not be a memory barrier after * clearing/invalidating the PTE (A1) and before restoring the PTE (A4), * so we use explicit ones here. */ /* Paired with the memory barrier in try_grab_folio(). */ if (IS_ENABLED(CONFIG_HAVE_GUP_FAST)) smp_mb(); if (unlikely(folio_maybe_dma_pinned(folio))) return -EBUSY; ClearPageAnonExclusive(page); /* * This is conceptually a smp_wmb() paired with the smp_rmb() in * gup_must_unshare(). */ if (IS_ENABLED(CONFIG_HAVE_GUP_FAST)) smp_mb__after_atomic(); return 0; } /** * folio_try_share_anon_rmap_pte - try marking an exclusive anonymous page * mapped by a PTE possibly shared to prepare * for KSM or temporary unmapping * @folio: The folio to share a mapping of * @page: The mapped exclusive page * * The caller needs to hold the page table lock and has to have the page table * entries cleared/invalidated. * * This is similar to folio_try_dup_anon_rmap_pte(), however, not used during * fork() to duplicate mappings, but instead to prepare for KSM or temporarily * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pte(). 
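 *
 * A typical caller therefore clears/invalidates the PTE, calls this
 * function, and on -EBUSY restores the PTE and leaves the page mapped
 * (and exclusive) instead of unmapping it.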
* * Marking the mapped page shared can only fail if the folio maybe pinned; * device private folios cannot get pinned and consequently this function cannot * fail. * * Returns 0 if marking the mapped page possibly shared succeeded. Returns * -EBUSY otherwise. */ static inline int folio_try_share_anon_rmap_pte(struct folio *folio, struct page *page) { return __folio_try_share_anon_rmap(folio, page, 1, PGTABLE_LEVEL_PTE); } /** * folio_try_share_anon_rmap_pmd - try marking an exclusive anonymous page * range mapped by a PMD possibly shared to * prepare for temporary unmapping * @folio: The folio to share the mapping of * @page: The first page to share the mapping of * * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) * * The caller needs to hold the page table lock and has to have the page table * entries cleared/invalidated. * * This is similar to folio_try_dup_anon_rmap_pmd(), however, not used during * fork() to duplicate a mapping, but instead to prepare for temporarily * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pmd(). * * Marking the mapped pages shared can only fail if the folio maybe pinned; * device private folios cannot get pinned and consequently this function cannot * fail. * * Returns 0 if marking the mapped pages possibly shared succeeded. Returns * -EBUSY otherwise. */ static inline int folio_try_share_anon_rmap_pmd(struct folio *folio, struct page *page) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR, PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); return -EBUSY; #endif } /* * Called from mm/vmscan.c to handle paging out */ int folio_referenced(struct folio *, int is_locked, struct mem_cgroup *memcg, vm_flags_t *vm_flags); void try_to_migrate(struct folio *folio, enum ttu_flags flags); void try_to_unmap(struct folio *, enum ttu_flags flags); struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr, void *owner, struct folio **foliop); /* Avoid racy checks */ #define PVMW_SYNC (1 << 0) /* Look for migration entries rather than present PTEs */ #define PVMW_MIGRATION (1 << 1) /* Result flags */ /* The page is mapped across page table boundary */ #define PVMW_PGTABLE_CROSSED (1 << 16) struct page_vma_mapped_walk { unsigned long pfn; unsigned long nr_pages; pgoff_t pgoff; struct vm_area_struct *vma; unsigned long address; pmd_t *pmd; pte_t *pte; spinlock_t *ptl; unsigned int flags; }; #define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags) \ struct page_vma_mapped_walk name = { \ .pfn = folio_pfn(_folio), \ .nr_pages = folio_nr_pages(_folio), \ .pgoff = folio_pgoff(_folio), \ .vma = _vma, \ .address = _address, \ .flags = _flags, \ } static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw) { /* HugeTLB pte is set to the relevant page table entry without pte_mapped. */ if (pvmw->pte && !is_vm_hugetlb_page(pvmw->vma)) pte_unmap(pvmw->pte); if (pvmw->ptl) spin_unlock(pvmw->ptl); } /** * page_vma_mapped_walk_restart - Restart the page table walk. * @pvmw: Pointer to struct page_vma_mapped_walk. * * It restarts the page table walk when changes occur in the page * table, such as splitting a PMD. Ensures that the PTL held during * the previous walk is released and resets the state to allow for * a new walk starting at the current address stored in pvmw->address. 
*/ static inline void page_vma_mapped_walk_restart(struct page_vma_mapped_walk *pvmw) { WARN_ON_ONCE(!pvmw->pmd && !pvmw->pte); if (likely(pvmw->ptl)) spin_unlock(pvmw->ptl); else WARN_ON_ONCE(1); pvmw->ptl = NULL; pvmw->pmd = NULL; pvmw->pte = NULL; } bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw); unsigned long page_address_in_vma(const struct folio *folio, const struct page *, const struct vm_area_struct *); /* * Cleans the PTEs of shared mappings. * (and since clean PTEs should also be readonly, write protects them too) * * returns the number of cleaned PTEs. */ int folio_mkclean(struct folio *); int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff, unsigned long pfn, unsigned long nr_pages); int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, struct vm_area_struct *vma); void remove_migration_ptes(struct folio *src, struct folio *dst, enum ttu_flags flags); /* * rmap_walk_control: To control rmap traversing for specific needs * * arg: passed to rmap_one() and invalid_vma() * try_lock: bail out if the rmap lock is contended * contended: indicate the rmap traversal bailed out due to lock contention * rmap_one: executed on each vma where page is mapped * done: for checking traversing termination condition * anon_lock: for getting anon_lock by optimized way rather than default * invalid_vma: for skipping uninterested vma */ struct rmap_walk_control { void *arg; bool try_lock; bool contended; /* * Return false if page table scanning in rmap_walk should be stopped. * Otherwise, return true. */ bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, void *arg); int (*done)(struct folio *folio); struct anon_vma *(*anon_lock)(const struct folio *folio, struct rmap_walk_control *rwc); bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc); void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc); struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio, struct rmap_walk_control *rwc); #else /* !CONFIG_MMU */ #define anon_vma_init() do {} while (0) #define anon_vma_prepare(vma) (0) static inline int folio_referenced(struct folio *folio, int is_locked, struct mem_cgroup *memcg, vm_flags_t *vm_flags) { *vm_flags = 0; return 0; } static inline void try_to_unmap(struct folio *folio, enum ttu_flags flags) { } static inline int folio_mkclean(struct folio *folio) { return 0; } #endif /* CONFIG_MMU */ #endif /* _LINUX_RMAP_H */ |
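/*
 * Illustrative sketch (not part of the header above): a minimal user of the
 * rmap_walk_control API declared here.  count_one_mapping() and
 * count_folio_mappings() are made-up names, and the sketch assumes the
 * caller already satisfies rmap_walk()'s locking requirements for the folio;
 * rmap_walk_locked() would be the variant for callers that already hold the
 * rmap lock.  The point is only to show how .rmap_one and .arg fit together.
 */
#include <linux/mm.h>
#include <linux/rmap.h>

/* Called once for each VMA in which the folio is mapped. */
static bool count_one_mapping(struct folio *folio, struct vm_area_struct *vma,
			      unsigned long addr, void *arg)
{
	unsigned int *nr = arg;

	(*nr)++;
	return true;	/* returning false would stop the walk early */
}

static unsigned int count_folio_mappings(struct folio *folio)
{
	unsigned int nr = 0;
	struct rmap_walk_control rwc = {
		.rmap_one	= count_one_mapping,
		.arg		= &nr,
	};

	rmap_walk(folio, &rwc);
	return nr;
}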
5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 
5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */
#include <crypto/sha2.h>
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
#include <linux/bpf_trace.h>
#include <linux/bpf_lirc.h>
#include <linux/bpf_verifier.h>
#include <linux/bsearch.h>
#include <linux/btf.h>
#include <linux/hex.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/vmalloc.h>
#include <linux/mmzone.h>
#include <linux/anon_inodes.h>
#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/license.h>
#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/idr.h>
#include <linux/cred.h>
#include <linux/timekeeping.h>
#include <linux/ctype.h>
#include <linux/nospec.h>
#include <linux/audit.h>
#include <uapi/linux/btf.h>
#include <linux/pgtable.h>
#include <linux/bpf_lsm.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/bpf-netns.h>
#include <linux/rcupdate_trace.h>
#include <linux/memcontrol.h>
#include <linux/trace_events.h>
#include <linux/tracepoint.h>
#include <linux/overflow.h>
#include <linux/cookie.h>
#include <linux/verification.h>
#include <net/netfilter/nf_bpf_link.h>
#include <net/netkit.h>
#include <net/tcx.h>

#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
			  (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY)
#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \
			IS_FD_HASH(map))

#define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)

DEFINE_PER_CPU(int, bpf_prog_active);
DEFINE_COOKIE(bpf_map_cookie);
static DEFINE_IDR(prog_idr);
static DEFINE_SPINLOCK(prog_idr_lock);
static DEFINE_IDR(map_idr);
static DEFINE_SPINLOCK(map_idr_lock);
static DEFINE_IDR(link_idr);
static DEFINE_SPINLOCK(link_idr_lock);

int sysctl_unprivileged_bpf_disabled __read_mostly =
	IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0;

static const struct bpf_map_ops * const bpf_map_types[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
#define BPF_MAP_TYPE(_id, _ops) \
	[_id] = &_ops,
#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
};

/*
 * If we're handed a bigger struct than we know of, ensure all the unknown bits
 * are 0 - i.e. new user-space does not rely on any kernel feature extensions
 * we don't know about yet.
 *
 * There is a ToCToU between this function call and the following
 * copy_from_user() call. However, this is not a concern since this function is
 * meant to be a future-proofing of bits.
 */
int bpf_check_uarg_tail_zero(bpfptr_t uaddr,
			     size_t expected_size,
			     size_t actual_size)
{
	int res;

	if (unlikely(actual_size > PAGE_SIZE))	/* silly large */
		return -E2BIG;

	if (actual_size <= expected_size)
		return 0;

	if (uaddr.is_kernel)
		res = memchr_inv(uaddr.kernel + expected_size, 0,
				 actual_size - expected_size) == NULL;
	else
		res = check_zeroed_user(uaddr.user + expected_size,
					actual_size - expected_size);
	if (res < 0)
		return res;
	return res ?
0 : -E2BIG; } const struct bpf_map_ops bpf_map_offload_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = bpf_map_offload_map_alloc, .map_free = bpf_map_offload_map_free, .map_check_btf = map_check_no_btf, .map_mem_usage = bpf_map_offload_map_mem_usage, }; static void bpf_map_write_active_inc(struct bpf_map *map) { atomic64_inc(&map->writecnt); } static void bpf_map_write_active_dec(struct bpf_map *map) { atomic64_dec(&map->writecnt); } bool bpf_map_write_active(const struct bpf_map *map) { return atomic64_read(&map->writecnt) != 0; } static u32 bpf_map_value_size(const struct bpf_map *map, u64 flags) { if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) return map->value_size; else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) return round_up(map->value_size, 8) * num_possible_cpus(); else if (IS_FD_MAP(map)) return sizeof(u32); else return map->value_size; } static void maybe_wait_bpf_programs(struct bpf_map *map) { /* Wait for any running non-sleepable BPF programs to complete so that * userspace, when we return to it, knows that all non-sleepable * programs that could be running use the new map value. For sleepable * BPF programs, synchronize_rcu_tasks_trace() should be used to wait * for the completions of these programs, but considering the waiting * time can be very long and userspace may think it will hang forever, * so don't handle sleepable BPF programs now. */ if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) synchronize_rcu_expedited(); } static void unpin_uptr_kaddr(void *kaddr) { if (kaddr) unpin_user_page(virt_to_page(kaddr)); } static void __bpf_obj_unpin_uptrs(struct btf_record *rec, u32 cnt, void *obj) { const struct btf_field *field; void **uptr_addr; int i; for (i = 0, field = rec->fields; i < cnt; i++, field++) { if (field->type != BPF_UPTR) continue; uptr_addr = obj + field->offset; unpin_uptr_kaddr(*uptr_addr); } } static void bpf_obj_unpin_uptrs(struct btf_record *rec, void *obj) { if (!btf_record_has_field(rec, BPF_UPTR)) return; __bpf_obj_unpin_uptrs(rec, rec->cnt, obj); } static int bpf_obj_pin_uptrs(struct btf_record *rec, void *obj) { const struct btf_field *field; const struct btf_type *t; unsigned long start, end; struct page *page; void **uptr_addr; int i, err; if (!btf_record_has_field(rec, BPF_UPTR)) return 0; for (i = 0, field = rec->fields; i < rec->cnt; i++, field++) { if (field->type != BPF_UPTR) continue; uptr_addr = obj + field->offset; start = *(unsigned long *)uptr_addr; if (!start) continue; t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id); /* t->size was checked for zero before */ if (check_add_overflow(start, t->size - 1, &end)) { err = -EFAULT; goto unpin_all; } /* The uptr's struct cannot span across two pages */ if ((start & PAGE_MASK) != (end & PAGE_MASK)) { err = -EOPNOTSUPP; goto unpin_all; } err = pin_user_pages_fast(start, 1, FOLL_LONGTERM | FOLL_WRITE, &page); if (err != 1) goto unpin_all; if (PageHighMem(page)) { err = -EOPNOTSUPP; unpin_user_page(page); goto unpin_all; } *uptr_addr = page_address(page) + offset_in_page(start); } return 0; unpin_all: __bpf_obj_unpin_uptrs(rec, i, obj); return err; } static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, void *key, void *value, __u64 flags) { int err; /* Need to create a kthread, thus must support schedule */ if (bpf_map_is_offloaded(map)) { return 
bpf_map_offload_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || map->map_type == BPF_MAP_TYPE_ARENA || map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { return map->ops->map_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH || map->map_type == BPF_MAP_TYPE_SOCKMAP) { return sock_map_update_elem_sys(map, key, value, flags); } else if (IS_FD_PROG_ARRAY(map)) { return bpf_fd_array_map_update_elem(map, map_file, key, value, flags); } bpf_disable_instrumentation(); if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_update(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { err = bpf_percpu_cgroup_storage_update(map, key, value, flags); } else if (IS_FD_ARRAY(map)) { err = bpf_fd_array_map_update_elem(map, map_file, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { err = bpf_fd_htab_map_update_elem(map, map_file, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { /* rcu_read_lock() is not needed */ err = bpf_fd_reuseport_array_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_QUEUE || map->map_type == BPF_MAP_TYPE_STACK || map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { err = map->ops->map_push_elem(map, value, flags); } else { err = bpf_obj_pin_uptrs(map->record, value); if (!err) { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, flags); rcu_read_unlock(); if (err) bpf_obj_unpin_uptrs(map->record, value); } } bpf_enable_instrumentation(); return err; } static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, __u64 flags) { void *ptr; int err; if (bpf_map_is_offloaded(map)) return bpf_map_offload_lookup_elem(map, key, value); bpf_disable_instrumentation(); if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { err = bpf_percpu_cgroup_storage_copy(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_extract(map, key, value, false); } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { err = bpf_fd_array_map_lookup_elem(map, key, value); } else if (IS_FD_HASH(map)) { err = bpf_fd_htab_map_lookup_elem(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { err = bpf_fd_reuseport_array_lookup_elem(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_QUEUE || map->map_type == BPF_MAP_TYPE_STACK || map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { err = map->ops->map_peek_elem(map, value); } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { /* struct_ops map requires directly updating "value" */ err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); } else { rcu_read_lock(); if (map->ops->map_lookup_elem_sys_only) ptr = map->ops->map_lookup_elem_sys_only(map, key); else ptr = map->ops->map_lookup_elem(map, key); if (IS_ERR(ptr)) { err = PTR_ERR(ptr); } else if (!ptr) { err = -ENOENT; } else { err = 0; if (flags & BPF_F_LOCK) /* lock 'ptr' and copy everything but lock */ copy_map_value_locked(map, value, ptr, true); else copy_map_value(map, value, 
ptr); /* mask lock and timer, since value wasn't zero inited */ check_and_init_map_value(map, value); } rcu_read_unlock(); } bpf_enable_instrumentation(); return err; } /* Please, do not use this function outside from the map creation path * (e.g. in map update path) without taking care of setting the active * memory cgroup (see at bpf_map_kmalloc_node() for example). */ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) { /* We really just want to fail instead of triggering OOM killer * under memory pressure, therefore we set __GFP_NORETRY to kmalloc, * which is used for lower order allocation requests. * * It has been observed that higher order allocation requests done by * vmalloc with __GFP_NORETRY being set might fail due to not trying * to reclaim memory from the page cache, thus we set * __GFP_RETRY_MAYFAIL to avoid such situations. */ gfp_t gfp = bpf_memcg_flags(__GFP_NOWARN | __GFP_ZERO); unsigned int flags = 0; unsigned long align = 1; void *area; if (size >= SIZE_MAX) return NULL; /* kmalloc()'ed memory can't be mmap()'ed */ if (mmapable) { BUG_ON(!PAGE_ALIGNED(size)); align = SHMLBA; flags = VM_USERMAP; } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY, numa_node); if (area != NULL) return area; } return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL, flags, numa_node, __builtin_return_address(0)); } void *bpf_map_area_alloc(u64 size, int numa_node) { return __bpf_map_area_alloc(size, numa_node, false); } void *bpf_map_area_mmapable_alloc(u64 size, int numa_node) { return __bpf_map_area_alloc(size, numa_node, true); } void bpf_map_area_free(void *area) { kvfree(area); } static u32 bpf_map_flags_retain_permanent(u32 flags) { /* Some map creation flags are not tied to the map object but * rather to the map fd instead, so they have no meaning upon * map object inspection since multiple file descriptors with * different (access) properties can exist here. Thus, given * this has zero meaning for the map itself, lets clear these * from here. */ return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY); } void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) { map->map_type = attr->map_type; map->key_size = attr->key_size; map->value_size = attr->value_size; map->max_entries = attr->max_entries; map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags); map->numa_node = bpf_map_attr_numa_node(attr); map->map_extra = attr->map_extra; } static int bpf_map_alloc_id(struct bpf_map *map) { int id; idr_preload(GFP_KERNEL); spin_lock_bh(&map_idr_lock); id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC); if (id > 0) map->id = id; spin_unlock_bh(&map_idr_lock); idr_preload_end(); if (WARN_ON_ONCE(!id)) return -ENOSPC; return id > 0 ? 0 : id; } void bpf_map_free_id(struct bpf_map *map) { unsigned long flags; /* Offloaded maps are removed from the IDR store when their device * disappears - even if someone holds an fd to them they are unusable, * the memory is gone, all ops will fail; they are simply waiting for * refcnt to drop to be freed. */ if (!map->id) return; spin_lock_irqsave(&map_idr_lock, flags); idr_remove(&map_idr, map->id); map->id = 0; spin_unlock_irqrestore(&map_idr_lock, flags); } #ifdef CONFIG_MEMCG static void bpf_map_save_memcg(struct bpf_map *map) { /* Currently if a map is created by a process belonging to the root * memory cgroup, get_obj_cgroup_from_current() will return NULL. 
* So we have to check map->objcg for being NULL each time it's * being used. */ if (memcg_bpf_enabled()) map->objcg = get_obj_cgroup_from_current(); } static void bpf_map_release_memcg(struct bpf_map *map) { if (map->objcg) obj_cgroup_put(map->objcg); } static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map) { if (map->objcg) return get_mem_cgroup_from_objcg(map->objcg); return root_mem_cgroup; } void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg, struct mem_cgroup **new_memcg) { *new_memcg = bpf_map_get_memcg(map); *old_memcg = set_active_memcg(*new_memcg); } void bpf_map_memcg_exit(struct mem_cgroup *old_memcg, struct mem_cgroup *new_memcg) { set_active_memcg(old_memcg); mem_cgroup_put(new_memcg); } void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node) { struct mem_cgroup *memcg, *old_memcg; void *ptr; bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node); bpf_map_memcg_exit(old_memcg, memcg); return ptr; } void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags, int node) { struct mem_cgroup *memcg, *old_memcg; void *ptr; bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = kmalloc_nolock(size, flags | __GFP_ACCOUNT, node); bpf_map_memcg_exit(old_memcg, memcg); return ptr; } void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) { struct mem_cgroup *memcg, *old_memcg; void *ptr; bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = kzalloc(size, flags | __GFP_ACCOUNT); bpf_map_memcg_exit(old_memcg, memcg); return ptr; } void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, gfp_t flags) { struct mem_cgroup *memcg, *old_memcg; void *ptr; bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT); bpf_map_memcg_exit(old_memcg, memcg); return ptr; } void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, gfp_t flags) { struct mem_cgroup *memcg, *old_memcg; void __percpu *ptr; bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT); bpf_map_memcg_exit(old_memcg, memcg); return ptr; } #else static void bpf_map_save_memcg(struct bpf_map *map) { } static void bpf_map_release_memcg(struct bpf_map *map) { } #endif static bool can_alloc_pages(void) { return preempt_count() == 0 && !irqs_disabled() && !IS_ENABLED(CONFIG_PREEMPT_RT); } static struct page *__bpf_alloc_page(int nid) { if (!can_alloc_pages()) return alloc_pages_nolock(__GFP_ACCOUNT, nid, 0); return alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT | __GFP_NOWARN, 0); } int bpf_map_alloc_pages(const struct bpf_map *map, int nid, unsigned long nr_pages, struct page **pages) { unsigned long i, j; struct page *pg; int ret = 0; for (i = 0; i < nr_pages; i++) { pg = __bpf_alloc_page(nid); if (pg) { pages[i] = pg; continue; } for (j = 0; j < i; j++) free_pages_nolock(pages[j], 0); ret = -ENOMEM; break; } return ret; } static int btf_field_cmp(const void *a, const void *b) { const struct btf_field *f1 = a, *f2 = b; if (f1->offset < f2->offset) return -1; else if (f1->offset > f2->offset) return 1; return 0; } struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset, u32 field_mask) { struct btf_field *field; if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & field_mask)) return NULL; field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp); if (!field || !(field->type & field_mask)) return 
NULL; return field; } void btf_record_free(struct btf_record *rec) { int i; if (IS_ERR_OR_NULL(rec)) return; for (i = 0; i < rec->cnt; i++) { switch (rec->fields[i].type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: if (rec->fields[i].kptr.module) module_put(rec->fields[i].kptr.module); if (btf_is_kernel(rec->fields[i].kptr.btf)) btf_put(rec->fields[i].kptr.btf); break; case BPF_LIST_HEAD: case BPF_LIST_NODE: case BPF_RB_ROOT: case BPF_RB_NODE: case BPF_SPIN_LOCK: case BPF_RES_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: case BPF_TASK_WORK: /* Nothing to release */ break; default: WARN_ON_ONCE(1); continue; } } kfree(rec); } void bpf_map_free_record(struct bpf_map *map) { btf_record_free(map->record); map->record = NULL; } struct btf_record *btf_record_dup(const struct btf_record *rec) { const struct btf_field *fields; struct btf_record *new_rec; int ret, size, i; if (IS_ERR_OR_NULL(rec)) return NULL; size = struct_size(rec, fields, rec->cnt); new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN); if (!new_rec) return ERR_PTR(-ENOMEM); /* Do a deep copy of the btf_record */ fields = rec->fields; new_rec->cnt = 0; for (i = 0; i < rec->cnt; i++) { switch (fields[i].type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: if (btf_is_kernel(fields[i].kptr.btf)) btf_get(fields[i].kptr.btf); if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) { ret = -ENXIO; goto free; } break; case BPF_LIST_HEAD: case BPF_LIST_NODE: case BPF_RB_ROOT: case BPF_RB_NODE: case BPF_SPIN_LOCK: case BPF_RES_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: case BPF_TASK_WORK: /* Nothing to acquire */ break; default: ret = -EFAULT; WARN_ON_ONCE(1); goto free; } new_rec->cnt++; } return new_rec; free: btf_record_free(new_rec); return ERR_PTR(ret); } bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b) { bool a_has_fields = !IS_ERR_OR_NULL(rec_a), b_has_fields = !IS_ERR_OR_NULL(rec_b); int size; if (!a_has_fields && !b_has_fields) return true; if (a_has_fields != b_has_fields) return false; if (rec_a->cnt != rec_b->cnt) return false; size = struct_size(rec_a, fields, rec_a->cnt); /* btf_parse_fields uses kzalloc to allocate a btf_record, so unused * members are zeroed out. So memcmp is safe to do without worrying * about padding/unused fields. * * While spin_lock, timer, and kptr have no relation to map BTF, * list_head metadata is specific to map BTF, the btf and value_rec * members in particular. btf is the map BTF, while value_rec points to * btf_record in that map BTF. * * So while by default, we don't rely on the map BTF (which the records * were parsed from) matching for both records, which is not backwards * compatible, in case list_head is part of it, we implicitly rely on * that by way of depending on memcmp succeeding for it. 
*/ return !memcmp(rec_a, rec_b, size); } void bpf_obj_free_timer(const struct btf_record *rec, void *obj) { if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER))) return; bpf_timer_cancel_and_free(obj + rec->timer_off); } void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj) { if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_WORKQUEUE))) return; bpf_wq_cancel_and_free(obj + rec->wq_off); } void bpf_obj_free_task_work(const struct btf_record *rec, void *obj) { if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TASK_WORK))) return; bpf_task_work_cancel_and_free(obj + rec->task_work_off); } void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; int i; if (IS_ERR_OR_NULL(rec)) return; fields = rec->fields; for (i = 0; i < rec->cnt; i++) { struct btf_struct_meta *pointee_struct_meta; const struct btf_field *field = &fields[i]; void *field_ptr = obj + field->offset; void *xchgd_field; switch (fields[i].type) { case BPF_SPIN_LOCK: case BPF_RES_SPIN_LOCK: break; case BPF_TIMER: bpf_timer_cancel_and_free(field_ptr); break; case BPF_WORKQUEUE: bpf_wq_cancel_and_free(field_ptr); break; case BPF_TASK_WORK: bpf_task_work_cancel_and_free(field_ptr); break; case BPF_KPTR_UNREF: WRITE_ONCE(*(u64 *)field_ptr, 0); break; case BPF_KPTR_REF: case BPF_KPTR_PERCPU: xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0); if (!xchgd_field) break; if (!btf_is_kernel(field->kptr.btf)) { pointee_struct_meta = btf_find_struct_meta(field->kptr.btf, field->kptr.btf_id); __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ? pointee_struct_meta->record : NULL, fields[i].type == BPF_KPTR_PERCPU); } else { field->kptr.dtor(xchgd_field); } break; case BPF_UPTR: /* The caller ensured that no one is using the uptr */ unpin_uptr_kaddr(*(void **)field_ptr); break; case BPF_LIST_HEAD: if (WARN_ON_ONCE(rec->spin_lock_off < 0)) continue; bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off); break; case BPF_RB_ROOT: if (WARN_ON_ONCE(rec->spin_lock_off < 0)) continue; bpf_rb_root_free(field, field_ptr, obj + rec->spin_lock_off); break; case BPF_LIST_NODE: case BPF_RB_NODE: case BPF_REFCOUNT: break; default: WARN_ON_ONCE(1); continue; } } } static void bpf_map_free(struct bpf_map *map) { struct btf_record *rec = map->record; struct btf *btf = map->btf; /* implementation dependent freeing. Disabling migration to simplify * the free of values or special fields allocated from bpf memory * allocator. */ kfree(map->excl_prog_sha); migrate_disable(); map->ops->map_free(map); migrate_enable(); /* Delay freeing of btf_record for maps, as map_free * callback usually needs access to them. It is better to do it here * than require each callback to do the free itself manually. * * Note that the btf_record stashed in map->inner_map_meta->record was * already freed using the map_free callback for map in map case which * eventually calls bpf_map_free_meta, since inner_map_meta is only a * template bpf_map struct used during verification. */ btf_record_free(rec); /* Delay freeing of btf for maps, as map_free callback may need * struct_meta info which will be freed with btf_put(). 
*/ btf_put(btf); } /* called from workqueue */ static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); security_bpf_map_free(map); bpf_map_release_memcg(map); bpf_map_owner_free(map); bpf_map_free(map); } static void bpf_map_put_uref(struct bpf_map *map) { if (atomic64_dec_and_test(&map->usercnt)) { if (map->ops->map_release_uref) map->ops->map_release_uref(map); } } static void bpf_map_free_in_work(struct bpf_map *map) { INIT_WORK(&map->work, bpf_map_free_deferred); /* Avoid spawning kworkers, since they all might contend * for the same mutex like slab_mutex. */ queue_work(system_dfl_wq, &map->work); } static void bpf_map_free_rcu_gp(struct rcu_head *rcu) { bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu)); } /* decrement map refcnt and schedule it for freeing via workqueue * (underlying map implementation ops->map_free() might sleep) */ void bpf_map_put(struct bpf_map *map) { if (atomic64_dec_and_test(&map->refcnt)) { /* bpf_map_free_id() must be called first */ bpf_map_free_id(map); WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt)); /* RCU tasks trace grace period implies RCU grace period. */ if (READ_ONCE(map->free_after_mult_rcu_gp)) call_rcu_tasks_trace(&map->rcu, bpf_map_free_rcu_gp); else if (READ_ONCE(map->free_after_rcu_gp)) call_rcu(&map->rcu, bpf_map_free_rcu_gp); else bpf_map_free_in_work(map); } } EXPORT_SYMBOL_GPL(bpf_map_put); void bpf_map_put_with_uref(struct bpf_map *map) { bpf_map_put_uref(map); bpf_map_put(map); } static int bpf_map_release(struct inode *inode, struct file *filp) { struct bpf_map *map = filp->private_data; if (map->ops->map_release) map->ops->map_release(map, filp); bpf_map_put_with_uref(map); return 0; } static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) { fmode_t mode = fd_file(f)->f_mode; /* Our file permissions may have been overridden by global * map permissions facing syscall side. */ if (READ_ONCE(map->frozen)) mode &= ~FMODE_CAN_WRITE; return mode; } #ifdef CONFIG_PROC_FS /* Show the memory usage of a bpf map */ static u64 bpf_map_memory_usage(const struct bpf_map *map) { return map->ops->map_mem_usage(map); } static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { struct bpf_map *map = filp->private_data; u32 type = 0, jited = 0; spin_lock(&map->owner_lock); if (map->owner) { type = map->owner->type; jited = map->owner->jited; } spin_unlock(&map->owner_lock); seq_printf(m, "map_type:\t%u\n" "key_size:\t%u\n" "value_size:\t%u\n" "max_entries:\t%u\n" "map_flags:\t%#x\n" "map_extra:\t%#llx\n" "memlock:\t%llu\n" "map_id:\t%u\n" "frozen:\t%u\n", map->map_type, map->key_size, map->value_size, map->max_entries, map->map_flags, (unsigned long long)map->map_extra, bpf_map_memory_usage(map), map->id, READ_ONCE(map->frozen)); if (type) { seq_printf(m, "owner_prog_type:\t%u\n", type); seq_printf(m, "owner_jited:\t%u\n", jited); } } #endif static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz, loff_t *ppos) { /* We need this handler such that alloc_file() enables * f_mode with FMODE_CAN_READ. */ return -EINVAL; } static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf, size_t siz, loff_t *ppos) { /* We need this handler such that alloc_file() enables * f_mode with FMODE_CAN_WRITE. 
*/ return -EINVAL; } /* called for any extra memory-mapped regions (except initial) */ static void bpf_map_mmap_open(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; if (vma->vm_flags & VM_MAYWRITE) bpf_map_write_active_inc(map); } /* called for all unmapped memory region (including initial) */ static void bpf_map_mmap_close(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; if (vma->vm_flags & VM_MAYWRITE) bpf_map_write_active_dec(map); } static const struct vm_operations_struct bpf_map_default_vmops = { .open = bpf_map_mmap_open, .close = bpf_map_mmap_close, }; static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) { struct bpf_map *map = filp->private_data; int err = 0; if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record)) return -ENOTSUPP; if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; mutex_lock(&map->freeze_mutex); if (vma->vm_flags & VM_WRITE) { if (map->frozen) { err = -EPERM; goto out; } /* map is meant to be read-only, so do not allow mapping as * writable, because it's possible to leak a writable page * reference and allows user-space to still modify it after * freezing, while verifier will assume contents do not change */ if (map->map_flags & BPF_F_RDONLY_PROG) { err = -EACCES; goto out; } bpf_map_write_active_inc(map); } out: mutex_unlock(&map->freeze_mutex); if (err) return err; /* set default open/close callbacks */ vma->vm_ops = &bpf_map_default_vmops; vma->vm_private_data = map; vm_flags_clear(vma, VM_MAYEXEC); /* If mapping is read-only, then disallow potentially re-mapping with * PROT_WRITE by dropping VM_MAYWRITE flag. This VM_MAYWRITE clearing * means that as far as BPF map's memory-mapped VMAs are concerned, * VM_WRITE and VM_MAYWRITE and equivalent, if one of them is set, * both should be set, so we can forget about VM_MAYWRITE and always * check just VM_WRITE */ if (!(vma->vm_flags & VM_WRITE)) vm_flags_clear(vma, VM_MAYWRITE); err = map->ops->map_mmap(map, vma); if (err) { if (vma->vm_flags & VM_WRITE) bpf_map_write_active_dec(map); } return err; } static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts) { struct bpf_map *map = filp->private_data; if (map->ops->map_poll) return map->ops->map_poll(map, filp, pts); return EPOLLERR; } static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct bpf_map *map = filp->private_data; if (map->ops->map_get_unmapped_area) return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags); #ifdef CONFIG_MMU return mm_get_unmapped_area(filp, addr, len, pgoff, flags); #else return addr; #endif } const struct file_operations bpf_map_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_map_show_fdinfo, #endif .release = bpf_map_release, .read = bpf_dummy_read, .write = bpf_dummy_write, .mmap = bpf_map_mmap, .poll = bpf_map_poll, .get_unmapped_area = bpf_get_unmapped_area, }; int bpf_map_new_fd(struct bpf_map *map, int flags) { int ret; ret = security_bpf_map(map, OPEN_FMODE(flags)); if (ret < 0) return ret; return anon_inode_getfd("bpf-map", &bpf_map_fops, map, flags | O_CLOEXEC); } int bpf_get_file_flag(int flags) { if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY)) return -EINVAL; if (flags & BPF_F_RDONLY) return O_RDONLY; if (flags & BPF_F_WRONLY) return O_WRONLY; return O_RDWR; } /* helper macro to check that unused fields 'union bpf_attr' are zero */ #define CHECK_ATTR(CMD) \ memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ 
sizeof(attr->CMD##_LAST_FIELD), 0, \ sizeof(*attr) - \ offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL /* dst and src must have at least "size" number of bytes. * Return strlen on success and < 0 on error. */ int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size) { const char *end = src + size; const char *orig_src = src; memset(dst, 0, size); /* Copy all isalnum(), '_' and '.' chars. */ while (src < end && *src) { if (!isalnum(*src) && *src != '_' && *src != '.') return -EINVAL; *dst++ = *src++; } /* No '\0' found in "size" number of bytes */ if (src == end) return -EINVAL; return src - orig_src; } EXPORT_SYMBOL_GPL(bpf_obj_name_cpy); int map_check_no_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { return -ENOTSUPP; } static int map_check_btf(struct bpf_map *map, struct bpf_token *token, const struct btf *btf, u32 btf_key_id, u32 btf_value_id) { const struct btf_type *key_type, *value_type; u32 key_size, value_size; int ret = 0; /* Some maps allow key to be unspecified. */ if (btf_key_id) { key_type = btf_type_id_size(btf, &btf_key_id, &key_size); if (!key_type || key_size != map->key_size) return -EINVAL; } else { key_type = btf_type_by_id(btf, 0); if (!map->ops->map_check_btf) return -EINVAL; } value_type = btf_type_id_size(btf, &btf_value_id, &value_size); if (!value_type || value_size != map->value_size) return -EINVAL; map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR | BPF_TASK_WORK, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; if (!bpf_token_capable(token, CAP_BPF)) { ret = -EPERM; goto free_map_tab; } if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) { ret = -EACCES; goto free_map_tab; } for (i = 0; i < sizeof(map->record->field_mask) * 8; i++) { switch (map->record->field_mask & (1 << i)) { case 0: continue; case BPF_SPIN_LOCK: case BPF_RES_SPIN_LOCK: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && map->map_type != BPF_MAP_TYPE_SK_STORAGE && map->map_type != BPF_MAP_TYPE_INODE_STORAGE && map->map_type != BPF_MAP_TYPE_TASK_STORAGE && map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) { ret = -EOPNOTSUPP; goto free_map_tab; } break; case BPF_TIMER: case BPF_WORKQUEUE: case BPF_TASK_WORK: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) { ret = -EOPNOTSUPP; goto free_map_tab; } break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_REFCOUNT: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_PERCPU_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY && map->map_type != BPF_MAP_TYPE_SK_STORAGE && map->map_type != BPF_MAP_TYPE_INODE_STORAGE && map->map_type != BPF_MAP_TYPE_TASK_STORAGE && map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) { ret = -EOPNOTSUPP; goto free_map_tab; } break; case BPF_UPTR: if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) { ret = -EOPNOTSUPP; goto free_map_tab; } break; case BPF_LIST_HEAD: case BPF_RB_ROOT: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) { ret = -EOPNOTSUPP; goto free_map_tab; } break; 
default: /* Fail if map_type checks are missing for a field type */ ret = -EOPNOTSUPP; goto free_map_tab; } } } ret = btf_check_and_fixup_fields(btf, map->record); if (ret < 0) goto free_map_tab; if (map->ops->map_check_btf) { ret = map->ops->map_check_btf(map, btf, key_type, value_type); if (ret < 0) goto free_map_tab; } return ret; free_map_tab: bpf_map_free_record(map); return ret; } #define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size /* called via syscall */ static int map_create(union bpf_attr *attr, bpfptr_t uattr) { const struct bpf_map_ops *ops; struct bpf_token *token = NULL; int numa_node = bpf_map_attr_numa_node(attr); u32 map_type = attr->map_type; struct bpf_map *map; bool token_flag; int f_flags; int err; err = CHECK_ATTR(BPF_MAP_CREATE); if (err) return -EINVAL; /* check BPF_F_TOKEN_FD flag, remember if it's set, and then clear it * to avoid per-map type checks tripping on unknown flag */ token_flag = attr->map_flags & BPF_F_TOKEN_FD; attr->map_flags &= ~BPF_F_TOKEN_FD; if (attr->btf_vmlinux_value_type_id) { if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS || attr->btf_key_type_id || attr->btf_value_type_id) return -EINVAL; } else if (attr->btf_key_type_id && !attr->btf_value_type_id) { return -EINVAL; } if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER && attr->map_type != BPF_MAP_TYPE_ARENA && attr->map_extra != 0) return -EINVAL; f_flags = bpf_get_file_flag(attr->map_flags); if (f_flags < 0) return f_flags; if (numa_node != NUMA_NO_NODE && ((unsigned int)numa_node >= nr_node_ids || !node_online(numa_node))) return -EINVAL; /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ map_type = attr->map_type; if (map_type >= ARRAY_SIZE(bpf_map_types)) return -EINVAL; map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types)); ops = bpf_map_types[map_type]; if (!ops) return -EINVAL; if (ops->map_alloc_check) { err = ops->map_alloc_check(attr); if (err) return err; } if (attr->map_ifindex) ops = &bpf_map_offload_ops; if (!ops->map_mem_usage) return -EINVAL; if (token_flag) { token = bpf_token_get_from_fd(attr->map_token_fd); if (IS_ERR(token)) return PTR_ERR(token); /* if current token doesn't grant map creation permissions, * then we can't use this token, so ignore it and rely on * system-wide capabilities checks */ if (!bpf_token_allow_cmd(token, BPF_MAP_CREATE) || !bpf_token_allow_map_type(token, attr->map_type)) { bpf_token_put(token); token = NULL; } } err = -EPERM; /* Intent here is for unprivileged_bpf_disabled to block BPF map * creation for unprivileged users; other actions depend * on fd availability and access to bpffs, so are dependent on * object creation success. Even with unprivileged BPF disabled, * capability checks are still carried out. 
*/ if (sysctl_unprivileged_bpf_disabled && !bpf_token_capable(token, CAP_BPF)) goto put_token; /* check privileged map type permissions */ switch (map_type) { case BPF_MAP_TYPE_ARRAY: case BPF_MAP_TYPE_PERCPU_ARRAY: case BPF_MAP_TYPE_PROG_ARRAY: case BPF_MAP_TYPE_PERF_EVENT_ARRAY: case BPF_MAP_TYPE_CGROUP_ARRAY: case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH: case BPF_MAP_TYPE_PERCPU_HASH: case BPF_MAP_TYPE_HASH_OF_MAPS: case BPF_MAP_TYPE_RINGBUF: case BPF_MAP_TYPE_USER_RINGBUF: case BPF_MAP_TYPE_CGROUP_STORAGE: case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: /* unprivileged */ break; case BPF_MAP_TYPE_SK_STORAGE: case BPF_MAP_TYPE_INODE_STORAGE: case BPF_MAP_TYPE_TASK_STORAGE: case BPF_MAP_TYPE_CGRP_STORAGE: case BPF_MAP_TYPE_BLOOM_FILTER: case BPF_MAP_TYPE_LPM_TRIE: case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: case BPF_MAP_TYPE_STACK_TRACE: case BPF_MAP_TYPE_QUEUE: case BPF_MAP_TYPE_STACK: case BPF_MAP_TYPE_LRU_HASH: case BPF_MAP_TYPE_LRU_PERCPU_HASH: case BPF_MAP_TYPE_STRUCT_OPS: case BPF_MAP_TYPE_CPUMAP: case BPF_MAP_TYPE_ARENA: case BPF_MAP_TYPE_INSN_ARRAY: if (!bpf_token_capable(token, CAP_BPF)) goto put_token; break; case BPF_MAP_TYPE_SOCKMAP: case BPF_MAP_TYPE_SOCKHASH: case BPF_MAP_TYPE_DEVMAP: case BPF_MAP_TYPE_DEVMAP_HASH: case BPF_MAP_TYPE_XSKMAP: if (!bpf_token_capable(token, CAP_NET_ADMIN)) goto put_token; break; default: WARN(1, "unsupported map type %d", map_type); goto put_token; } map = ops->map_alloc(attr); if (IS_ERR(map)) { err = PTR_ERR(map); goto put_token; } map->ops = ops; map->map_type = map_type; err = bpf_obj_name_cpy(map->name, attr->map_name, sizeof(attr->map_name)); if (err < 0) goto free_map; preempt_disable(); map->cookie = gen_cookie_next(&bpf_map_cookie); preempt_enable(); atomic64_set(&map->refcnt, 1); atomic64_set(&map->usercnt, 1); mutex_init(&map->freeze_mutex); spin_lock_init(&map->owner_lock); if (attr->btf_key_type_id || attr->btf_value_type_id || /* Even the map's value is a kernel's struct, * the bpf_prog.o must have BTF to begin with * to figure out the corresponding kernel's * counter part. Thus, attr->btf_fd has * to be valid also. */ attr->btf_vmlinux_value_type_id) { struct btf *btf; btf = btf_get_by_fd(attr->btf_fd); if (IS_ERR(btf)) { err = PTR_ERR(btf); goto free_map; } if (btf_is_kernel(btf)) { btf_put(btf); err = -EACCES; goto free_map; } map->btf = btf; if (attr->btf_value_type_id) { err = map_check_btf(map, token, btf, attr->btf_key_type_id, attr->btf_value_type_id); if (err) goto free_map; } map->btf_key_type_id = attr->btf_key_type_id; map->btf_value_type_id = attr->btf_value_type_id; map->btf_vmlinux_value_type_id = attr->btf_vmlinux_value_type_id; } if (attr->excl_prog_hash) { bpfptr_t uprog_hash = make_bpfptr(attr->excl_prog_hash, uattr.is_kernel); if (attr->excl_prog_hash_size != SHA256_DIGEST_SIZE) { err = -EINVAL; goto free_map; } map->excl_prog_sha = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); if (!map->excl_prog_sha) { err = -ENOMEM; goto free_map; } if (copy_from_bpfptr(map->excl_prog_sha, uprog_hash, SHA256_DIGEST_SIZE)) { err = -EFAULT; goto free_map; } } else if (attr->excl_prog_hash_size) { err = -EINVAL; goto free_map; } err = security_bpf_map_create(map, attr, token, uattr.is_kernel); if (err) goto free_map_sec; err = bpf_map_alloc_id(map); if (err) goto free_map_sec; bpf_map_save_memcg(map); bpf_token_put(token); err = bpf_map_new_fd(map, f_flags); if (err < 0) { /* failed to allocate fd. 
* bpf_map_put_with_uref() is needed because the above * bpf_map_alloc_id() has published the map * to the userspace and the userspace may * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. */ bpf_map_put_with_uref(map); return err; } return err; free_map_sec: security_bpf_map_free(map); free_map: bpf_map_free(map); put_token: bpf_token_put(token); return err; } void bpf_map_inc(struct bpf_map *map) { atomic64_inc(&map->refcnt); } EXPORT_SYMBOL_GPL(bpf_map_inc); void bpf_map_inc_with_uref(struct bpf_map *map) { atomic64_inc(&map->refcnt); atomic64_inc(&map->usercnt); } EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref); struct bpf_map *bpf_map_get(u32 ufd) { CLASS(fd, f)(ufd); struct bpf_map *map = __bpf_map_get(f); if (!IS_ERR(map)) bpf_map_inc(map); return map; } EXPORT_SYMBOL_NS(bpf_map_get, "BPF_INTERNAL"); struct bpf_map *bpf_map_get_with_uref(u32 ufd) { CLASS(fd, f)(ufd); struct bpf_map *map = __bpf_map_get(f); if (!IS_ERR(map)) bpf_map_inc_with_uref(map); return map; } /* map_idr_lock should have been held or the map should have been * protected by rcu read lock. */ struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) { int refold; refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0); if (!refold) return ERR_PTR(-ENOENT); if (uref) atomic64_inc(&map->usercnt); return map; } struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map) { lockdep_assert(rcu_read_lock_held()); return __bpf_map_inc_not_zero(map, false); } EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero); int __weak bpf_stackmap_extract(struct bpf_map *map, void *key, void *value, bool delete) { return -ENOTSUPP; } static void *__bpf_copy_key(void __user *ukey, u64 key_size) { if (key_size) return vmemdup_user(ukey, key_size); if (ukey) return ERR_PTR(-EINVAL); return NULL; } static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size) { if (key_size) return kvmemdup_bpfptr(ukey, key_size); if (!bpfptr_is_null(ukey)) return ERR_PTR(-EINVAL); return NULL; } /* last field in 'union bpf_attr' used by this command */ #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags static int map_lookup_elem(union bpf_attr *attr) { void __user *ukey = u64_to_user_ptr(attr->key); void __user *uvalue = u64_to_user_ptr(attr->value); struct bpf_map *map; void *key, *value; u32 value_size; int err; if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) return -EINVAL; CLASS(fd, f)(attr->map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) return -EPERM; err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK | BPF_F_CPU); if (err) return err; key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) return PTR_ERR(key); value_size = bpf_map_value_size(map, attr->flags); err = -ENOMEM; value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { if (copy_from_user(value, uvalue, value_size)) err = -EFAULT; else err = bpf_map_copy_value(map, key, value, attr->flags); goto free_value; } err = bpf_map_copy_value(map, key, value, attr->flags); if (err) goto free_value; err = -EFAULT; if (copy_to_user(uvalue, value, value_size) != 0) goto free_value; err = 0; free_value: kvfree(value); free_key: kvfree(key); return err; } #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) { bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel); bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel); struct bpf_map *map; void *key, *value; u32 value_size; int err; if 
(CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) return -EINVAL; CLASS(fd, f)(attr->map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); bpf_map_write_active_inc(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } err = bpf_map_check_op_flags(map, attr->flags, ~0); if (err) goto err_put; key = ___bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; } value_size = bpf_map_value_size(map, attr->flags); value = kvmemdup_bpfptr(uvalue, value_size); if (IS_ERR(value)) { err = PTR_ERR(value); goto free_key; } err = bpf_map_update_value(map, fd_file(f), key, value, attr->flags); if (!err) maybe_wait_bpf_programs(map); kvfree(value); free_key: kvfree(key); err_put: bpf_map_write_active_dec(map); return err; } #define BPF_MAP_DELETE_ELEM_LAST_FIELD key static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr) { bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel); struct bpf_map *map; void *key; int err; if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) return -EINVAL; CLASS(fd, f)(attr->map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); bpf_map_write_active_inc(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } key = ___bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; } if (bpf_map_is_offloaded(map)) { err = bpf_map_offload_delete_elem(map, key); goto out; } else if (IS_FD_PROG_ARRAY(map) || map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { /* These maps require sleepable context */ err = map->ops->map_delete_elem(map, key); goto out; } bpf_disable_instrumentation(); rcu_read_lock(); err = map->ops->map_delete_elem(map, key); rcu_read_unlock(); bpf_enable_instrumentation(); if (!err) maybe_wait_bpf_programs(map); out: kvfree(key); err_put: bpf_map_write_active_dec(map); return err; } /* last field in 'union bpf_attr' used by this command */ #define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key static int map_get_next_key(union bpf_attr *attr) { void __user *ukey = u64_to_user_ptr(attr->key); void __user *unext_key = u64_to_user_ptr(attr->next_key); struct bpf_map *map; void *key, *next_key; int err; if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) return -EINVAL; CLASS(fd, f)(attr->map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) return -EPERM; if (ukey) { key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) return PTR_ERR(key); } else { key = NULL; } err = -ENOMEM; next_key = kvmalloc(map->key_size, GFP_USER); if (!next_key) goto free_key; if (bpf_map_is_offloaded(map)) { err = bpf_map_offload_get_next_key(map, key, next_key); goto out; } rcu_read_lock(); err = map->ops->map_get_next_key(map, key, next_key); rcu_read_unlock(); out: if (err) goto free_next_key; err = -EFAULT; if (copy_to_user(unext_key, next_key, map->key_size) != 0) goto free_next_key; err = 0; free_next_key: kvfree(next_key); free_key: kvfree(key); return err; } int generic_map_delete_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { void __user *keys = u64_to_user_ptr(attr->batch.keys); u32 cp, max_count; int err = 0; void *key; if (attr->batch.elem_flags & ~BPF_F_LOCK) return -EINVAL; if ((attr->batch.elem_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { return -EINVAL; } max_count = attr->batch.count; if (!max_count) return 0; if (put_user(0, &uattr->batch.count)) return -EFAULT; key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); if 
(!key) return -ENOMEM; for (cp = 0; cp < max_count; cp++) { err = -EFAULT; if (copy_from_user(key, keys + cp * map->key_size, map->key_size)) break; if (bpf_map_is_offloaded(map)) { err = bpf_map_offload_delete_elem(map, key); break; } bpf_disable_instrumentation(); rcu_read_lock(); err = map->ops->map_delete_elem(map, key); rcu_read_unlock(); bpf_enable_instrumentation(); if (err) break; cond_resched(); } if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) err = -EFAULT; kvfree(key); return err; } int generic_map_update_batch(struct bpf_map *map, struct file *map_file, const union bpf_attr *attr, union bpf_attr __user *uattr) { void __user *values = u64_to_user_ptr(attr->batch.values); void __user *keys = u64_to_user_ptr(attr->batch.keys); u32 value_size, cp, max_count; void *key, *value; int err = 0; err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK | BPF_F_CPU | BPF_F_ALL_CPUS); if (err) return err; value_size = bpf_map_value_size(map, attr->batch.elem_flags); max_count = attr->batch.count; if (!max_count) return 0; if (put_user(0, &uattr->batch.count)) return -EFAULT; key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); if (!key) return -ENOMEM; value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) { kvfree(key); return -ENOMEM; } for (cp = 0; cp < max_count; cp++) { err = -EFAULT; if (copy_from_user(key, keys + cp * map->key_size, map->key_size) || copy_from_user(value, values + cp * value_size, value_size)) break; err = bpf_map_update_value(map, map_file, key, value, attr->batch.elem_flags); if (err) break; cond_resched(); } if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) err = -EFAULT; kvfree(value); kvfree(key); return err; } int generic_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch); void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); void __user *values = u64_to_user_ptr(attr->batch.values); void __user *keys = u64_to_user_ptr(attr->batch.keys); void *buf, *buf_prevkey, *prev_key, *key, *value; u32 value_size, cp, max_count; int err; err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK | BPF_F_CPU); if (err) return err; value_size = bpf_map_value_size(map, attr->batch.elem_flags); max_count = attr->batch.count; if (!max_count) return 0; if (put_user(0, &uattr->batch.count)) return -EFAULT; buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); if (!buf_prevkey) return -ENOMEM; buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN); if (!buf) { kvfree(buf_prevkey); return -ENOMEM; } err = -EFAULT; prev_key = NULL; if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size)) goto free_buf; key = buf; value = key + map->key_size; if (ubatch) prev_key = buf_prevkey; for (cp = 0; cp < max_count;) { rcu_read_lock(); err = map->ops->map_get_next_key(map, prev_key, key); rcu_read_unlock(); if (err) break; err = bpf_map_copy_value(map, key, value, attr->batch.elem_flags); if (err == -ENOENT) goto next_key; if (err) goto free_buf; if (copy_to_user(keys + cp * map->key_size, key, map->key_size)) { err = -EFAULT; goto free_buf; } if (copy_to_user(values + cp * value_size, value, value_size)) { err = -EFAULT; goto free_buf; } cp++; next_key: if (!prev_key) prev_key = buf_prevkey; swap(prev_key, key); cond_resched(); } if (err == -EFAULT) goto free_buf; if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) || (cp && copy_to_user(uobatch, prev_key, map->key_size)))) err = -EFAULT; 
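	/*
	 * Illustrative userspace sketch (not part of this file): the cursor
	 * written to uattr->batch.out_batch above is what a caller passes back
	 * as in_batch on its next BPF_MAP_LOOKUP_BATCH invocation. Using the
	 * raw bpf(2) syscall, and assuming 'cursor' is a key_size-sized buffer
	 * and 'keys'/'vals' are arrays large enough for 'count' entries:
	 *
	 *	union bpf_attr a = {};
	 *	a.batch.map_fd    = map_fd;
	 *	a.batch.keys      = (__u64)(unsigned long)keys;
	 *	a.batch.values    = (__u64)(unsigned long)vals;
	 *	a.batch.count     = 128;
	 *	a.batch.in_batch  = 0;	// 0/NULL starts from the first key
	 *	a.batch.out_batch = (__u64)(unsigned long)cursor;
	 *	err = syscall(__NR_bpf, BPF_MAP_LOOKUP_BATCH, &a, sizeof(a));
	 *	// on return, a.batch.count holds the number of copied elements;
	 *	// feed 'cursor' back via in_batch to continue the iteration
	 */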
free_buf:
	kvfree(buf_prevkey);
	kvfree(buf);
	return err;
}

#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags

static int map_lookup_and_delete_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_user_ptr(attr->key);
	void __user *uvalue = u64_to_user_ptr(attr->value);
	struct bpf_map *map;
	void *key, *value;
	u32 value_size;
	int err;

	if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
		return -EINVAL;

	if (attr->flags & ~BPF_F_LOCK)
		return -EINVAL;

	CLASS(fd, f)(attr->map_fd);
	map = __bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);
	bpf_map_write_active_inc(map);
	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
	    !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
		err = -EPERM;
		goto err_put;
	}

	if (attr->flags &&
	    (map->map_type == BPF_MAP_TYPE_QUEUE ||
	     map->map_type == BPF_MAP_TYPE_STACK)) {
		err = -EINVAL;
		goto err_put;
	}

	if ((attr->flags & BPF_F_LOCK) &&
	    !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
		err = -EINVAL;
		goto err_put;
	}

	key = __bpf_copy_key(ukey, map->key_size);
	if (IS_ERR(key)) {
		err = PTR_ERR(key);
		goto err_put;
	}

	value_size = bpf_map_value_size(map, 0);

	err = -ENOMEM;
	value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
	if (!value)
		goto free_key;

	err = -ENOTSUPP;
	if (map->map_type == BPF_MAP_TYPE_QUEUE ||
	    map->map_type == BPF_MAP_TYPE_STACK) {
		err = map->ops->map_pop_elem(map, value);
	} else if (map->map_type == BPF_MAP_TYPE_HASH ||
		   map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		   map->map_type == BPF_MAP_TYPE_LRU_HASH ||
		   map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
		if (!bpf_map_is_offloaded(map)) {
			bpf_disable_instrumentation();
			rcu_read_lock();
			err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags);
			rcu_read_unlock();
			bpf_enable_instrumentation();
		}
	}

	if (err)
		goto free_value;

	if (copy_to_user(uvalue, value, value_size) != 0) {
		err = -EFAULT;
		goto free_value;
	}

	err = 0;

free_value:
	kvfree(value);
free_key:
	kvfree(key);
err_put:
	bpf_map_write_active_dec(map);
	return err;
}

#define BPF_MAP_FREEZE_LAST_FIELD map_fd

static int map_freeze(const union bpf_attr *attr)
{
	int err = 0;
	struct bpf_map *map;

	if (CHECK_ATTR(BPF_MAP_FREEZE))
		return -EINVAL;

	CLASS(fd, f)(attr->map_fd);
	map = __bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record))
		return -ENOTSUPP;

	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE))
		return -EPERM;

	mutex_lock(&map->freeze_mutex);
	if (bpf_map_write_active(map)) {
		err = -EBUSY;
		goto err_put;
	}
	if (READ_ONCE(map->frozen)) {
		err = -EBUSY;
		goto err_put;
	}

	WRITE_ONCE(map->frozen, true);
err_put:
	mutex_unlock(&map->freeze_mutex);
	return err;
}

static const struct bpf_prog_ops * const bpf_prog_types[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
	[_id] = & _name ## _prog_ops,
#define BPF_MAP_TYPE(_id, _ops)
#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
};

static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
{
	const struct bpf_prog_ops *ops;

	if (type >= ARRAY_SIZE(bpf_prog_types))
		return -EINVAL;
	type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types));
	ops = bpf_prog_types[type];
	if (!ops)
		return -EINVAL;

	if (!bpf_prog_is_offloaded(prog->aux))
		prog->aux->ops = ops;
	else
		prog->aux->ops = &bpf_offload_prog_ops;
	prog->type = type;
	return 0;
}

enum bpf_audit {
	BPF_AUDIT_LOAD,
	BPF_AUDIT_UNLOAD,
	BPF_AUDIT_MAX,
};

static const char * const bpf_audit_str[BPF_AUDIT_MAX] = {
	[BPF_AUDIT_LOAD] = "LOAD",
[BPF_AUDIT_UNLOAD] = "UNLOAD", }; static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op) { struct audit_context *ctx = NULL; struct audit_buffer *ab; if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX)) return; if (audit_enabled == AUDIT_OFF) return; if (!in_hardirq() && !irqs_disabled()) ctx = audit_context(); ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF); if (unlikely(!ab)) return; audit_log_format(ab, "prog-id=%u op=%s", prog->aux->id, bpf_audit_str[op]); audit_log_end(ab); } static int bpf_prog_alloc_id(struct bpf_prog *prog) { int id; idr_preload(GFP_KERNEL); spin_lock_bh(&prog_idr_lock); id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC); if (id > 0) prog->aux->id = id; spin_unlock_bh(&prog_idr_lock); idr_preload_end(); /* id is in [1, INT_MAX) */ if (WARN_ON_ONCE(!id)) return -ENOSPC; return id > 0 ? 0 : id; } void bpf_prog_free_id(struct bpf_prog *prog) { unsigned long flags; /* cBPF to eBPF migrations are currently not in the idr store. * Offloaded programs are removed from the store when their device * disappears - even if someone grabs an fd to them they are unusable, * simply waiting for refcnt to drop to be freed. */ if (!prog->aux->id) return; spin_lock_irqsave(&prog_idr_lock, flags); idr_remove(&prog_idr, prog->aux->id); prog->aux->id = 0; spin_unlock_irqrestore(&prog_idr_lock, flags); } static void __bpf_prog_put_rcu(struct rcu_head *rcu) { struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); kvfree(aux->func_info); kfree(aux->func_info_aux); free_uid(aux->user); security_bpf_prog_free(aux->prog); bpf_prog_free(aux->prog); } static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) { bpf_prog_kallsyms_del_all(prog); btf_put(prog->aux->btf); module_put(prog->aux->mod); kvfree(prog->aux->jited_linfo); kvfree(prog->aux->linfo); kfree(prog->aux->kfunc_tab); kfree(prog->aux->ctx_arg_info); if (prog->aux->attach_btf) btf_put(prog->aux->attach_btf); if (deferred) { if (prog->sleepable) call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu); else call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } else { __bpf_prog_put_rcu(&prog->aux->rcu); } } static void bpf_prog_put_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; struct bpf_prog *prog; aux = container_of(work, struct bpf_prog_aux, work); prog = aux->prog; perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); bpf_audit_prog(prog, BPF_AUDIT_UNLOAD); bpf_prog_free_id(prog); __bpf_prog_put_noref(prog, true); } static void __bpf_prog_put(struct bpf_prog *prog) { struct bpf_prog_aux *aux = prog->aux; if (atomic64_dec_and_test(&aux->refcnt)) { if (in_hardirq() || irqs_disabled()) { INIT_WORK(&aux->work, bpf_prog_put_deferred); schedule_work(&aux->work); } else { bpf_prog_put_deferred(&aux->work); } } } void bpf_prog_put(struct bpf_prog *prog) { __bpf_prog_put(prog); } EXPORT_SYMBOL_GPL(bpf_prog_put); static int bpf_prog_release(struct inode *inode, struct file *filp) { struct bpf_prog *prog = filp->private_data; bpf_prog_put(prog); return 0; } struct bpf_prog_kstats { u64 nsecs; u64 cnt; u64 misses; }; void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog) { struct bpf_prog_stats *stats; unsigned int flags; if (unlikely(!prog->stats)) return; stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->misses); u64_stats_update_end_irqrestore(&stats->syncp, flags); } static void bpf_prog_get_stats(const struct bpf_prog *prog, struct bpf_prog_kstats *stats) { u64 nsecs = 0, cnt = 0, misses = 0; int cpu; 
for_each_possible_cpu(cpu) { const struct bpf_prog_stats *st; unsigned int start; u64 tnsecs, tcnt, tmisses; st = per_cpu_ptr(prog->stats, cpu); do { start = u64_stats_fetch_begin(&st->syncp); tnsecs = u64_stats_read(&st->nsecs); tcnt = u64_stats_read(&st->cnt); tmisses = u64_stats_read(&st->misses); } while (u64_stats_fetch_retry(&st->syncp, start)); nsecs += tnsecs; cnt += tcnt; misses += tmisses; } stats->nsecs = nsecs; stats->cnt = cnt; stats->misses = misses; } #ifdef CONFIG_PROC_FS static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_prog *prog = filp->private_data; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; struct bpf_prog_kstats stats; bpf_prog_get_stats(prog, &stats); bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, "prog_type:\t%u\n" "prog_jited:\t%u\n" "prog_tag:\t%s\n" "memlock:\t%llu\n" "prog_id:\t%u\n" "run_time_ns:\t%llu\n" "run_cnt:\t%llu\n" "recursion_misses:\t%llu\n" "verified_insns:\t%u\n", prog->type, prog->jited, prog_tag, prog->pages * 1ULL << PAGE_SHIFT, prog->aux->id, stats.nsecs, stats.cnt, stats.misses, prog->aux->verified_insns); } #endif const struct file_operations bpf_prog_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_prog_show_fdinfo, #endif .release = bpf_prog_release, .read = bpf_dummy_read, .write = bpf_dummy_write, }; int bpf_prog_new_fd(struct bpf_prog *prog) { int ret; ret = security_bpf_prog(prog); if (ret < 0) return ret; return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); } void bpf_prog_add(struct bpf_prog *prog, int i) { atomic64_add(i, &prog->aux->refcnt); } EXPORT_SYMBOL_GPL(bpf_prog_add); void bpf_prog_sub(struct bpf_prog *prog, int i) { /* Only to be used for undoing previous bpf_prog_add() in some * error path. We still know that another entity in our call * path holds a reference to the program, thus atomic_sub() can * be safely used in such cases! */ WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0); } EXPORT_SYMBOL_GPL(bpf_prog_sub); void bpf_prog_inc(struct bpf_prog *prog) { atomic64_inc(&prog->aux->refcnt); } EXPORT_SYMBOL_GPL(bpf_prog_inc); /* prog_idr_lock should have been held */ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) { int refold; refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0); if (!refold) return ERR_PTR(-ENOENT); return prog; } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); bool bpf_prog_get_ok(struct bpf_prog *prog, enum bpf_prog_type *attach_type, bool attach_drv) { /* not an attachment, just a refcount inc, always allow */ if (!attach_type) return true; if (prog->type != *attach_type) return false; if (bpf_prog_is_offloaded(prog->aux) && !attach_drv) return false; return true; } static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, bool attach_drv) { CLASS(fd, f)(ufd); struct bpf_prog *prog; if (fd_empty(f)) return ERR_PTR(-EBADF); if (fd_file(f)->f_op != &bpf_prog_fops) return ERR_PTR(-EINVAL); prog = fd_file(f)->private_data; if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) return ERR_PTR(-EINVAL); bpf_prog_inc(prog); return prog; } struct bpf_prog *bpf_prog_get(u32 ufd) { return __bpf_prog_get(ufd, NULL, false); } struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv) { return __bpf_prog_get(ufd, &type, attach_drv); } EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); /* Initially all BPF programs could be loaded w/o specifying * expected_attach_type. 
Later for some of them specifying expected_attach_type * at load time became required so that program could be validated properly. * Programs of types that are allowed to be loaded both w/ and w/o (for * backward compatibility) expected_attach_type, should have the default attach * type assigned to expected_attach_type for the latter case, so that it can be * validated later at attach time. * * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if * prog type requires it but has some attach types that have to be backward * compatible. */ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) { switch (attr->prog_type) { case BPF_PROG_TYPE_CGROUP_SOCK: /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't * exist so checking for non-zero is the way to go here. */ if (!attr->expected_attach_type) attr->expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE; break; case BPF_PROG_TYPE_SK_REUSEPORT: if (!attr->expected_attach_type) attr->expected_attach_type = BPF_SK_REUSEPORT_SELECT; break; } } static int bpf_prog_load_check_attach(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type, struct btf *attach_btf, u32 btf_id, struct bpf_prog *dst_prog) { if (btf_id) { if (btf_id > BTF_MAX_TYPE) return -EINVAL; if (!attach_btf && !dst_prog) return -EINVAL; switch (prog_type) { case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_STRUCT_OPS: case BPF_PROG_TYPE_EXT: break; default: return -EINVAL; } } if (attach_btf && (!btf_id || dst_prog)) return -EINVAL; if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING && prog_type != BPF_PROG_TYPE_EXT) return -EINVAL; switch (prog_type) { case BPF_PROG_TYPE_CGROUP_SOCK: switch (expected_attach_type) { case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: return 0; default: return -EINVAL; } case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: switch (expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UNIX_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: case BPF_CGROUP_UNIX_RECVMSG: return 0; default: return -EINVAL; } case BPF_PROG_TYPE_CGROUP_SKB: switch (expected_attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: return 0; default: return -EINVAL; } case BPF_PROG_TYPE_CGROUP_SOCKOPT: switch (expected_attach_type) { case BPF_CGROUP_SETSOCKOPT: case BPF_CGROUP_GETSOCKOPT: return 0; default: return -EINVAL; } case BPF_PROG_TYPE_SK_LOOKUP: if (expected_attach_type == BPF_SK_LOOKUP) return 0; return -EINVAL; case BPF_PROG_TYPE_SK_REUSEPORT: switch (expected_attach_type) { case BPF_SK_REUSEPORT_SELECT: case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: return 0; default: return -EINVAL; } case BPF_PROG_TYPE_NETFILTER: if (expected_attach_type == BPF_NETFILTER) return 0; return -EINVAL; case BPF_PROG_TYPE_SYSCALL: case BPF_PROG_TYPE_EXT: if (expected_attach_type) return -EINVAL; fallthrough; default: return 0; } } static bool is_net_admin_prog_type(enum bpf_prog_type prog_type) { switch (prog_type) { case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: case 
BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_LWT_SEG6LOCAL: case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_EXT: /* extends any prog */ case BPF_PROG_TYPE_NETFILTER: return true; case BPF_PROG_TYPE_CGROUP_SKB: /* always unpriv */ case BPF_PROG_TYPE_SK_REUSEPORT: /* equivalent to SOCKET_FILTER. need CAP_BPF only */ default: return false; } } static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) { switch (prog_type) { case BPF_PROG_TYPE_KPROBE: case BPF_PROG_TYPE_TRACEPOINT: case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */ case BPF_PROG_TYPE_EXT: /* extends any prog */ return true; default: return false; } } static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr, bool is_kernel) { bpfptr_t usig = make_bpfptr(attr->signature, is_kernel); struct bpf_dynptr_kern sig_ptr, insns_ptr; struct bpf_key *key = NULL; void *sig; int err = 0; /* * Don't attempt to use kmalloc_large or vmalloc for signatures. * Practical signature for BPF program should be below this limit. */ if (attr->signature_size > KMALLOC_MAX_CACHE_SIZE) return -EINVAL; if (system_keyring_id_check(attr->keyring_id) == 0) key = bpf_lookup_system_key(attr->keyring_id); else key = bpf_lookup_user_key(attr->keyring_id, 0); if (!key) return -EINVAL; sig = kvmemdup_bpfptr(usig, attr->signature_size); if (IS_ERR(sig)) { bpf_key_put(key); return PTR_ERR(sig); } bpf_dynptr_init(&sig_ptr, sig, BPF_DYNPTR_TYPE_LOCAL, 0, attr->signature_size); bpf_dynptr_init(&insns_ptr, prog->insnsi, BPF_DYNPTR_TYPE_LOCAL, 0, prog->len * sizeof(struct bpf_insn)); err = bpf_verify_pkcs7_signature((struct bpf_dynptr *)&insns_ptr, (struct bpf_dynptr *)&sig_ptr, key); bpf_key_put(key); kvfree(sig); return err; } static int bpf_prog_mark_insn_arrays_ready(struct bpf_prog *prog) { int err; int i; for (i = 0; i < prog->aux->used_map_cnt; i++) { if (prog->aux->used_maps[i]->map_type != BPF_MAP_TYPE_INSN_ARRAY) continue; err = bpf_insn_array_ready(prog->aux->used_maps[i]); if (err) return err; } return 0; } /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD keyring_id static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { enum bpf_prog_type type = attr->prog_type; struct bpf_prog *prog, *dst_prog = NULL; struct btf *attach_btf = NULL; struct bpf_token *token = NULL; bool bpf_cap; int err; char license[128]; if (CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT | BPF_F_TEST_STATE_FREQ | BPF_F_SLEEPABLE | BPF_F_TEST_RND_HI32 | BPF_F_XDP_HAS_FRAGS | BPF_F_XDP_DEV_BOUND_ONLY | BPF_F_TEST_REG_INVARIANTS | BPF_F_TOKEN_FD)) return -EINVAL; bpf_prog_load_fixup_attach_type(attr); if (attr->prog_flags & BPF_F_TOKEN_FD) { token = bpf_token_get_from_fd(attr->prog_token_fd); if (IS_ERR(token)) return PTR_ERR(token); /* if current token doesn't grant prog loading permissions, * then we can't use this token, so ignore it and rely on * system-wide capabilities checks */ if (!bpf_token_allow_cmd(token, BPF_PROG_LOAD) || 
		    !bpf_token_allow_prog_type(token, attr->prog_type,
					       attr->expected_attach_type)) {
			bpf_token_put(token);
			token = NULL;
		}
	}

	bpf_cap = bpf_token_capable(token, CAP_BPF);
	err = -EPERM;

	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
	    (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
	    !bpf_cap)
		goto put_token;

	/* Intent here is for unprivileged_bpf_disabled to block BPF program
	 * creation for unprivileged users; other actions depend
	 * on fd availability and access to bpffs, so are dependent on
	 * object creation success. Even with unprivileged BPF disabled,
	 * capability checks are still carried out for these
	 * and other operations.
	 */
	if (sysctl_unprivileged_bpf_disabled && !bpf_cap)
		goto put_token;

	if (attr->insn_cnt == 0 ||
	    attr->insn_cnt > (bpf_cap ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) {
		err = -E2BIG;
		goto put_token;
	}
	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
	    type != BPF_PROG_TYPE_CGROUP_SKB &&
	    !bpf_cap)
		goto put_token;

	if (is_net_admin_prog_type(type) && !bpf_token_capable(token, CAP_NET_ADMIN))
		goto put_token;
	if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON))
		goto put_token;

	/* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
	 * or btf, we need to check which one it is
	 */
	if (attr->attach_prog_fd) {
		dst_prog = bpf_prog_get(attr->attach_prog_fd);
		if (IS_ERR(dst_prog)) {
			dst_prog = NULL;
			attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd);
			if (IS_ERR(attach_btf)) {
				err = -EINVAL;
				goto put_token;
			}
			if (!btf_is_kernel(attach_btf)) {
				/* attaching through specifying bpf_prog's BTF
				 * objects directly might be supported eventually
				 */
				btf_put(attach_btf);
				err = -ENOTSUPP;
				goto put_token;
			}
		}
	} else if (attr->attach_btf_id) {
		/* fall back to vmlinux BTF, if BTF type ID is specified */
		attach_btf = bpf_get_btf_vmlinux();
		if (IS_ERR(attach_btf)) {
			err = PTR_ERR(attach_btf);
			goto put_token;
		}
		if (!attach_btf) {
			err = -EINVAL;
			goto put_token;
		}
		btf_get(attach_btf);
	}

	if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
				       attach_btf, attr->attach_btf_id,
				       dst_prog)) {
		if (dst_prog)
			bpf_prog_put(dst_prog);
		if (attach_btf)
			btf_put(attach_btf);
		err = -EINVAL;
		goto put_token;
	}

	/* plain bpf_prog allocation */
	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
	if (!prog) {
		if (dst_prog)
			bpf_prog_put(dst_prog);
		if (attach_btf)
			btf_put(attach_btf);
		err = -ENOMEM;
		goto put_token;
	}

	prog->expected_attach_type = attr->expected_attach_type;
	prog->sleepable = !!(attr->prog_flags & BPF_F_SLEEPABLE);
	prog->aux->attach_btf = attach_btf;
	prog->aux->attach_btf_id = attr->attach_btf_id;
	prog->aux->dst_prog = dst_prog;
	prog->aux->dev_bound = !!attr->prog_ifindex;
	prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;

	/* move token into prog->aux, reuse taken refcnt */
	prog->aux->token = token;
	token = NULL;

	prog->aux->user = get_current_user();
	prog->len = attr->insn_cnt;

	err = -EFAULT;
	if (copy_from_bpfptr(prog->insns,
			     make_bpfptr(attr->insns, uattr.is_kernel),
			     bpf_prog_insn_size(prog)) != 0)
		goto free_prog;
	/* copy eBPF program license from user space */
	if (strncpy_from_bpfptr(license,
				make_bpfptr(attr->license, uattr.is_kernel),
				sizeof(license) - 1) < 0)
		goto free_prog;
	license[sizeof(license) - 1] = 0;

	/* eBPF programs must be GPL compatible to use GPL-ed functions */
	prog->gpl_compatible = license_is_gpl_compatible(license) ?
1 : 0; if (attr->signature) { err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel); if (err) goto free_prog; } prog->orig_prog = NULL; prog->jited = 0; atomic64_set(&prog->aux->refcnt, 1); if (bpf_prog_is_dev_bound(prog->aux)) { err = bpf_prog_dev_bound_init(prog, attr); if (err) goto free_prog; } if (type == BPF_PROG_TYPE_EXT && dst_prog && bpf_prog_is_dev_bound(dst_prog->aux)) { err = bpf_prog_dev_bound_inherit(prog, dst_prog); if (err) goto free_prog; } /* * Bookkeeping for managing the program attachment chain. * * It might be tempting to set attach_tracing_prog flag at the attachment * time, but this will not prevent from loading bunch of tracing prog * first, then attach them one to another. * * The flag attach_tracing_prog is set for the whole program lifecycle, and * doesn't have to be cleared in bpf_tracing_link_release, since tracing * programs cannot change attachment target. */ if (type == BPF_PROG_TYPE_TRACING && dst_prog && dst_prog->type == BPF_PROG_TYPE_TRACING) { prog->aux->attach_tracing_prog = true; } /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); if (err < 0) goto free_prog; prog->aux->load_time = ktime_get_boottime_ns(); err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, sizeof(attr->prog_name)); if (err < 0) goto free_prog; err = security_bpf_prog_load(prog, attr, token, uattr.is_kernel); if (err) goto free_prog_sec; /* run eBPF verifier */ err = bpf_check(&prog, attr, uattr, uattr_size); if (err < 0) goto free_used_maps; err = bpf_prog_mark_insn_arrays_ready(prog); if (err < 0) goto free_used_maps; err = bpf_prog_alloc_id(prog); if (err) goto free_used_maps; /* Upon success of bpf_prog_alloc_id(), the BPF prog is * effectively publicly exposed. However, retrieving via * bpf_prog_get_fd_by_id() will take another reference, * therefore it cannot be gone underneath us. * * Only for the time /after/ successful bpf_prog_new_fd() * and before returning to userspace, we might just hold * one reference and any parallel close on that fd could * rip everything out. Hence, below notifications must * happen before bpf_prog_new_fd(). * * Also, any failure handling from this point onwards must * be using bpf_prog_put() given the program is exposed. */ bpf_prog_kallsyms_add(prog); perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); bpf_audit_prog(prog, BPF_AUDIT_LOAD); err = bpf_prog_new_fd(prog); if (err < 0) bpf_prog_put(prog); return err; free_used_maps: /* In case we have subprogs, we need to wait for a grace * period before we can tear down JIT memory since symbols * are already exposed under kallsyms. */ __bpf_prog_put_noref(prog, prog->aux->real_func_cnt); return err; free_prog_sec: security_bpf_prog_free(prog); free_prog: free_uid(prog->aux->user); if (prog->aux->attach_btf) btf_put(prog->aux->attach_btf); bpf_prog_free(prog); put_token: bpf_token_put(token); return err; } #define BPF_OBJ_LAST_FIELD path_fd static int bpf_obj_pin(const union bpf_attr *attr) { int path_fd; if (CHECK_ATTR(BPF_OBJ) || attr->file_flags & ~BPF_F_PATH_FD) return -EINVAL; /* path_fd has to be accompanied by BPF_F_PATH_FD flag */ if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd) return -EINVAL; path_fd = attr->file_flags & BPF_F_PATH_FD ? 
attr->path_fd : AT_FDCWD; return bpf_obj_pin_user(attr->bpf_fd, path_fd, u64_to_user_ptr(attr->pathname)); } static int bpf_obj_get(const union bpf_attr *attr) { int path_fd; if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 || attr->file_flags & ~(BPF_OBJ_FLAG_MASK | BPF_F_PATH_FD)) return -EINVAL; /* path_fd has to be accompanied by BPF_F_PATH_FD flag */ if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd) return -EINVAL; path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD; return bpf_obj_get_user(path_fd, u64_to_user_ptr(attr->pathname), attr->file_flags); } /* bpf_link_init_sleepable() allows to specify whether BPF link itself has * "sleepable" semantics, which normally would mean that BPF link's attach * hook can dereference link or link's underlying program for some time after * detachment due to RCU Tasks Trace-based lifetime protection scheme. * BPF program itself can be non-sleepable, yet, because it's transitively * reachable through BPF link, its freeing has to be delayed until after RCU * Tasks Trace GP. */ void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog, enum bpf_attach_type attach_type, bool sleepable) { WARN_ON(ops->dealloc && ops->dealloc_deferred); atomic64_set(&link->refcnt, 1); link->type = type; link->sleepable = sleepable; link->id = 0; link->ops = ops; link->prog = prog; link->attach_type = attach_type; } void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog, enum bpf_attach_type attach_type) { bpf_link_init_sleepable(link, type, ops, prog, attach_type, false); } static void bpf_link_free_id(int id) { if (!id) return; spin_lock_bh(&link_idr_lock); idr_remove(&link_idr, id); spin_unlock_bh(&link_idr_lock); } /* Clean up bpf_link and corresponding anon_inode file and FD. After * anon_inode is created, bpf_link can't be just kfree()'d due to deferred * anon_inode's release() call. This helper marks bpf_link as * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt * is not decremented, it's the responsibility of a calling code that failed * to complete bpf_link initialization. * This helper eventually calls link's dealloc callback, but does not call * link's release callback. */ void bpf_link_cleanup(struct bpf_link_primer *primer) { primer->link->prog = NULL; bpf_link_free_id(primer->id); fput(primer->file); put_unused_fd(primer->fd); } void bpf_link_inc(struct bpf_link *link) { atomic64_inc(&link->refcnt); } static void bpf_link_dealloc(struct bpf_link *link) { /* now that we know that bpf_link itself can't be reached, put underlying BPF program */ if (link->prog) bpf_prog_put(link->prog); /* free bpf_link and its containing memory */ if (link->ops->dealloc_deferred) link->ops->dealloc_deferred(link); else link->ops->dealloc(link); } static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu) { struct bpf_link *link = container_of(rcu, struct bpf_link, rcu); bpf_link_dealloc(link); } static bool bpf_link_is_tracepoint(struct bpf_link *link) { /* * Only these combinations support a tracepoint bpf_link. * BPF_LINK_TYPE_TRACING raw_tp progs are hardcoded to use * bpf_raw_tp_link_lops and thus dealloc_deferred(), see * bpf_raw_tp_link_attach(). 
*/ return link->type == BPF_LINK_TYPE_RAW_TRACEPOINT || (link->type == BPF_LINK_TYPE_TRACING && link->attach_type == BPF_TRACE_RAW_TP); } /* bpf_link_free is guaranteed to be called from process context */ static void bpf_link_free(struct bpf_link *link) { const struct bpf_link_ops *ops = link->ops; bpf_link_free_id(link->id); /* detach BPF program, clean up used resources */ if (link->prog) ops->release(link); if (ops->dealloc_deferred) { /* * Schedule BPF link deallocation, which will only then * trigger putting BPF program refcount. * If underlying BPF program is sleepable or BPF link's target * attach hookpoint is sleepable or otherwise requires RCU GPs * to ensure link and its underlying BPF program is not * reachable anymore, we need to first wait for RCU tasks * trace sync, and then go through "classic" RCU grace period. * * For tracepoint BPF links, we need to go through SRCU grace * period wait instead when non-faultable tracepoint is used. We * don't need to chain SRCU grace period waits, however, for the * faultable case, since it exclusively uses RCU Tasks Trace. */ if (link->sleepable || (link->prog && link->prog->sleepable)) /* RCU Tasks Trace grace period implies RCU grace period. */ call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_rcu_gp); /* We need to do a SRCU grace period wait for non-faultable tracepoint BPF links. */ else if (bpf_link_is_tracepoint(link)) call_tracepoint_unregister_atomic(&link->rcu, bpf_link_defer_dealloc_rcu_gp); else call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp); } else if (ops->dealloc) { bpf_link_dealloc(link); } } static void bpf_link_put_deferred(struct work_struct *work) { struct bpf_link *link = container_of(work, struct bpf_link, work); bpf_link_free(link); } /* bpf_link_put might be called from atomic context. It needs to be called * from sleepable context in order to acquire sleeping locks during the process. */ void bpf_link_put(struct bpf_link *link) { if (!atomic64_dec_and_test(&link->refcnt)) return; INIT_WORK(&link->work, bpf_link_put_deferred); schedule_work(&link->work); } EXPORT_SYMBOL(bpf_link_put); static void bpf_link_put_direct(struct bpf_link *link) { if (!atomic64_dec_and_test(&link->refcnt)) return; bpf_link_free(link); } static int bpf_link_release(struct inode *inode, struct file *filp) { struct bpf_link *link = filp->private_data; bpf_link_put_direct(link); return 0; } #ifdef CONFIG_PROC_FS #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) #define BPF_MAP_TYPE(_id, _ops) #define BPF_LINK_TYPE(_id, _name) [_id] = #_name, static const char *bpf_link_type_strs[] = { [BPF_LINK_TYPE_UNSPEC] = "<invalid>", #include <linux/bpf_types.h> }; #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE #undef BPF_LINK_TYPE static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_link *link = filp->private_data; const struct bpf_prog *prog = link->prog; enum bpf_link_type type = link->type; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; if (type < ARRAY_SIZE(bpf_link_type_strs) && bpf_link_type_strs[type]) { if (link->type == BPF_LINK_TYPE_KPROBE_MULTI) seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_KPROBE_MULTI_RETURN ? "kretprobe_multi" : "kprobe_multi"); else if (link->type == BPF_LINK_TYPE_UPROBE_MULTI) seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_UPROBE_MULTI_RETURN ? "uretprobe_multi" : "uprobe_multi"); else seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]); } else { WARN_ONCE(1, "missing BPF_LINK_TYPE(...) 
for link type %u\n", type); seq_printf(m, "link_type:\t<%u>\n", type); } seq_printf(m, "link_id:\t%u\n", link->id); if (prog) { bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, "prog_tag:\t%s\n" "prog_id:\t%u\n", prog_tag, prog->aux->id); } if (link->ops->show_fdinfo) link->ops->show_fdinfo(link, m); } #endif static __poll_t bpf_link_poll(struct file *file, struct poll_table_struct *pts) { struct bpf_link *link = file->private_data; return link->ops->poll(file, pts); } static const struct file_operations bpf_link_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_link_show_fdinfo, #endif .release = bpf_link_release, .read = bpf_dummy_read, .write = bpf_dummy_write, }; static const struct file_operations bpf_link_fops_poll = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_link_show_fdinfo, #endif .release = bpf_link_release, .read = bpf_dummy_read, .write = bpf_dummy_write, .poll = bpf_link_poll, }; static int bpf_link_alloc_id(struct bpf_link *link) { int id; idr_preload(GFP_KERNEL); spin_lock_bh(&link_idr_lock); id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC); spin_unlock_bh(&link_idr_lock); idr_preload_end(); return id; } /* Prepare bpf_link to be exposed to user-space by allocating anon_inode file, * reserving unused FD and allocating ID from link_idr. This is to be paired * with bpf_link_settle() to install FD and ID and expose bpf_link to * user-space, if bpf_link is successfully attached. If not, bpf_link and * pre-allocated resources are to be freed with bpf_cleanup() call. All the * transient state is passed around in struct bpf_link_primer. * This is preferred way to create and initialize bpf_link, especially when * there are complicated and expensive operations in between creating bpf_link * itself and attaching it to BPF hook. By using bpf_link_prime() and * bpf_link_settle() kernel code using bpf_link doesn't have to perform * expensive (and potentially failing) roll back operations in a rare case * that file, FD, or ID can't be allocated. */ int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) { struct file *file; int fd, id; fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) return fd; id = bpf_link_alloc_id(link); if (id < 0) { put_unused_fd(fd); return id; } file = anon_inode_getfile("bpf_link", link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops, link, O_CLOEXEC); if (IS_ERR(file)) { bpf_link_free_id(id); put_unused_fd(fd); return PTR_ERR(file); } primer->link = link; primer->file = file; primer->fd = fd; primer->id = id; return 0; } int bpf_link_settle(struct bpf_link_primer *primer) { /* make bpf_link fetchable by ID */ spin_lock_bh(&link_idr_lock); primer->link->id = primer->id; spin_unlock_bh(&link_idr_lock); /* make bpf_link fetchable by FD */ fd_install(primer->fd, primer->file); /* pass through installed FD */ return primer->fd; } int bpf_link_new_fd(struct bpf_link *link) { return anon_inode_getfd("bpf-link", link->ops->poll ? 
&bpf_link_fops_poll : &bpf_link_fops, link, O_CLOEXEC); } struct bpf_link *bpf_link_get_from_fd(u32 ufd) { CLASS(fd, f)(ufd); struct bpf_link *link; if (fd_empty(f)) return ERR_PTR(-EBADF); if (fd_file(f)->f_op != &bpf_link_fops && fd_file(f)->f_op != &bpf_link_fops_poll) return ERR_PTR(-EINVAL); link = fd_file(f)->private_data; bpf_link_inc(link); return link; } EXPORT_SYMBOL_NS(bpf_link_get_from_fd, "BPF_INTERNAL"); static void bpf_tracing_link_release(struct bpf_link *link) { struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link, tr_link->trampoline, tr_link->tgt_prog)); bpf_trampoline_put(tr_link->trampoline); /* tgt_prog is NULL if target is a kernel function */ if (tr_link->tgt_prog) bpf_prog_put(tr_link->tgt_prog); } static void bpf_tracing_link_dealloc(struct bpf_link *link) { struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); kfree(tr_link); } static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); u32 target_btf_id, target_obj_id; bpf_trampoline_unpack_key(tr_link->trampoline->key, &target_obj_id, &target_btf_id); seq_printf(seq, "attach_type:\t%d\n" "target_obj_id:\t%u\n" "target_btf_id:\t%u\n" "cookie:\t%llu\n", link->attach_type, target_obj_id, target_btf_id, tr_link->link.cookie); } static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); info->tracing.attach_type = link->attach_type; info->tracing.cookie = tr_link->link.cookie; bpf_trampoline_unpack_key(tr_link->trampoline->key, &info->tracing.target_obj_id, &info->tracing.target_btf_id); return 0; } static const struct bpf_link_ops bpf_tracing_link_lops = { .release = bpf_tracing_link_release, .dealloc = bpf_tracing_link_dealloc, .show_fdinfo = bpf_tracing_link_show_fdinfo, .fill_link_info = bpf_tracing_link_fill_link_info, }; static int bpf_tracing_prog_attach(struct bpf_prog *prog, int tgt_prog_fd, u32 btf_id, u64 bpf_cookie, enum bpf_attach_type attach_type) { struct bpf_link_primer link_primer; struct bpf_prog *tgt_prog = NULL; struct bpf_trampoline *tr = NULL; struct bpf_tracing_link *link; u64 key = 0; int err; switch (prog->type) { case BPF_PROG_TYPE_TRACING: if (prog->expected_attach_type != BPF_TRACE_FENTRY && prog->expected_attach_type != BPF_TRACE_FEXIT && prog->expected_attach_type != BPF_TRACE_FSESSION && prog->expected_attach_type != BPF_MODIFY_RETURN) { err = -EINVAL; goto out_put_prog; } break; case BPF_PROG_TYPE_EXT: if (prog->expected_attach_type != 0) { err = -EINVAL; goto out_put_prog; } break; case BPF_PROG_TYPE_LSM: if (prog->expected_attach_type != BPF_LSM_MAC) { err = -EINVAL; goto out_put_prog; } break; default: err = -EINVAL; goto out_put_prog; } if (!!tgt_prog_fd != !!btf_id) { err = -EINVAL; goto out_put_prog; } if (tgt_prog_fd) { /* * For now we only allow new targets for BPF_PROG_TYPE_EXT. If this * part would be changed to implement the same for * BPF_PROG_TYPE_TRACING, do not forget to update the way how * attach_tracing_prog flag is set. 
*/ if (prog->type != BPF_PROG_TYPE_EXT) { err = -EINVAL; goto out_put_prog; } tgt_prog = bpf_prog_get(tgt_prog_fd); if (IS_ERR(tgt_prog)) { err = PTR_ERR(tgt_prog); tgt_prog = NULL; goto out_put_prog; } key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); } if (prog->expected_attach_type == BPF_TRACE_FSESSION) { struct bpf_fsession_link *fslink; fslink = kzalloc_obj(*fslink, GFP_USER); if (fslink) { bpf_link_init(&fslink->fexit.link, BPF_LINK_TYPE_TRACING, &bpf_tracing_link_lops, prog, attach_type); fslink->fexit.cookie = bpf_cookie; link = &fslink->link; } else { link = NULL; } } else { link = kzalloc_obj(*link, GFP_USER); } if (!link) { err = -ENOMEM; goto out_put_prog; } bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING, &bpf_tracing_link_lops, prog, attach_type); link->link.cookie = bpf_cookie; mutex_lock(&prog->aux->dst_mutex); /* There are a few possible cases here: * * - if prog->aux->dst_trampoline is set, the program was just loaded * and not yet attached to anything, so we can use the values stored * in prog->aux * * - if prog->aux->dst_trampoline is NULL, the program has already been * attached to a target and its initial target was cleared (below) * * - if tgt_prog != NULL, the caller specified tgt_prog_fd + * target_btf_id using the link_create API. * * - if tgt_prog == NULL when this function was called using the old * raw_tracepoint_open API, and we need a target from prog->aux * * - if prog->aux->dst_trampoline and tgt_prog is NULL, the program * was detached and is going for re-attachment. * * - if prog->aux->dst_trampoline is NULL and tgt_prog and prog->aux->attach_btf * are NULL, then program was already attached and user did not provide * tgt_prog_fd so we have no way to find out or create trampoline */ if (!prog->aux->dst_trampoline && !tgt_prog) { /* * Allow re-attach for TRACING and LSM programs. If it's * currently linked, bpf_trampoline_link_prog will fail. * EXT programs need to specify tgt_prog_fd, so they * re-attach in separate code path. */ if (prog->type != BPF_PROG_TYPE_TRACING && prog->type != BPF_PROG_TYPE_LSM) { err = -EINVAL; goto out_unlock; } /* We can allow re-attach only if we have valid attach_btf. */ if (!prog->aux->attach_btf) { err = -EINVAL; goto out_unlock; } btf_id = prog->aux->attach_btf_id; key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id); } if (!prog->aux->dst_trampoline || (key && key != prog->aux->dst_trampoline->key)) { /* If there is no saved target, or the specified target is * different from the destination specified at load time, we * need a new trampoline and a check for compatibility */ struct bpf_attach_target_info tgt_info = {}; err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id, &tgt_info); if (err) goto out_unlock; if (tgt_info.tgt_mod) { module_put(prog->aux->mod); prog->aux->mod = tgt_info.tgt_mod; } tr = bpf_trampoline_get(key, &tgt_info); if (!tr) { err = -ENOMEM; goto out_unlock; } } else { /* The caller didn't specify a target, or the target was the * same as the destination supplied during program load. This * means we can reuse the trampoline and reference from program * load time, and there is no need to allocate a new one. This * can only happen once for any program, as the saved values in * prog->aux are cleared below. */ tr = prog->aux->dst_trampoline; tgt_prog = prog->aux->dst_prog; } /* * It is to prevent modifying struct pt_regs via kprobe_write_ctx=true * freplace prog. 
Without this check, kprobe_write_ctx=true freplace * prog is allowed to attach to kprobe_write_ctx=false kprobe prog, and * then modify the registers of the kprobe prog's target kernel * function. * * This also blocks the combination of uprobe+freplace, because it is * unable to recognize the use of the tgt_prog as an uprobe or a kprobe * by tgt_prog itself. At attach time, uprobe/kprobe is recognized by * the target perf event flags in __perf_event_set_bpf_prog(). */ if (prog->type == BPF_PROG_TYPE_EXT && prog->aux->kprobe_write_ctx != tgt_prog->aux->kprobe_write_ctx) { err = -EINVAL; goto out_unlock; } err = bpf_link_prime(&link->link.link, &link_primer); if (err) goto out_unlock; err = bpf_trampoline_link_prog(&link->link, tr, tgt_prog); if (err) { bpf_link_cleanup(&link_primer); link = NULL; goto out_unlock; } link->tgt_prog = tgt_prog; link->trampoline = tr; /* Always clear the trampoline and target prog from prog->aux to make * sure the original attach destination is not kept alive after a * program is (re-)attached to another target. */ if (prog->aux->dst_prog && (tgt_prog_fd || tr != prog->aux->dst_trampoline)) /* got extra prog ref from syscall, or attaching to different prog */ bpf_prog_put(prog->aux->dst_prog); if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline) /* we allocated a new trampoline, so free the old one */ bpf_trampoline_put(prog->aux->dst_trampoline); prog->aux->dst_prog = NULL; prog->aux->dst_trampoline = NULL; mutex_unlock(&prog->aux->dst_mutex); return bpf_link_settle(&link_primer); out_unlock: if (tr && tr != prog->aux->dst_trampoline) bpf_trampoline_put(tr); mutex_unlock(&prog->aux->dst_mutex); kfree(link); out_put_prog: if (tgt_prog_fd && tgt_prog) bpf_prog_put(tgt_prog); return err; } static void bpf_raw_tp_link_release(struct bpf_link *link) { struct bpf_raw_tp_link *raw_tp = container_of(link, struct bpf_raw_tp_link, link); bpf_probe_unregister(raw_tp->btp, raw_tp); bpf_put_raw_tracepoint(raw_tp->btp); } static void bpf_raw_tp_link_dealloc(struct bpf_link *link) { struct bpf_raw_tp_link *raw_tp = container_of(link, struct bpf_raw_tp_link, link); kfree(raw_tp); } static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_raw_tp_link *raw_tp_link = container_of(link, struct bpf_raw_tp_link, link); seq_printf(seq, "tp_name:\t%s\n" "cookie:\t%llu\n", raw_tp_link->btp->tp->name, raw_tp_link->cookie); } static int bpf_copy_to_user(char __user *ubuf, const char *buf, u32 ulen, u32 len) { if (ulen >= len + 1) { if (copy_to_user(ubuf, buf, len + 1)) return -EFAULT; } else { char zero = '\0'; if (copy_to_user(ubuf, buf, ulen - 1)) return -EFAULT; if (put_user(zero, ubuf + ulen - 1)) return -EFAULT; return -ENOSPC; } return 0; } static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_raw_tp_link *raw_tp_link = container_of(link, struct bpf_raw_tp_link, link); char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name); const char *tp_name = raw_tp_link->btp->tp->name; u32 ulen = info->raw_tracepoint.tp_name_len; size_t tp_len = strlen(tp_name); if (!ulen ^ !ubuf) return -EINVAL; info->raw_tracepoint.tp_name_len = tp_len + 1; info->raw_tracepoint.cookie = raw_tp_link->cookie; if (!ubuf) return 0; return bpf_copy_to_user(ubuf, tp_name, ulen, tp_len); } static const struct bpf_link_ops bpf_raw_tp_link_lops = { .release = bpf_raw_tp_link_release, .dealloc_deferred = bpf_raw_tp_link_dealloc, .show_fdinfo = bpf_raw_tp_link_show_fdinfo, .fill_link_info = 
bpf_raw_tp_link_fill_link_info, }; #ifdef CONFIG_PERF_EVENTS struct bpf_perf_link { struct bpf_link link; struct file *perf_file; }; static void bpf_perf_link_release(struct bpf_link *link) { struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); struct perf_event *event = perf_link->perf_file->private_data; perf_event_free_bpf_prog(event); fput(perf_link->perf_file); } static void bpf_perf_link_dealloc(struct bpf_link *link) { struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); kfree(perf_link); } static int bpf_perf_link_fill_common(const struct perf_event *event, char __user *uname, u32 *ulenp, u64 *probe_offset, u64 *probe_addr, u32 *fd_type, unsigned long *missed) { const char *buf; u32 prog_id, ulen; size_t len; int err; ulen = *ulenp; if (!ulen ^ !uname) return -EINVAL; err = bpf_get_perf_event_info(event, &prog_id, fd_type, &buf, probe_offset, probe_addr, missed); if (err) return err; if (buf) { len = strlen(buf); *ulenp = len + 1; } else { *ulenp = 1; } if (!uname) return 0; if (buf) { err = bpf_copy_to_user(uname, buf, ulen, len); if (err) return err; } else { char zero = '\0'; if (put_user(zero, uname)) return -EFAULT; } return 0; } #ifdef CONFIG_KPROBE_EVENTS static int bpf_perf_link_fill_kprobe(const struct perf_event *event, struct bpf_link_info *info) { unsigned long missed; char __user *uname; u64 addr, offset; u32 ulen, type; int err; uname = u64_to_user_ptr(info->perf_event.kprobe.func_name); ulen = info->perf_event.kprobe.name_len; err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr, &type, &missed); if (err) return err; if (type == BPF_FD_TYPE_KRETPROBE) info->perf_event.type = BPF_PERF_EVENT_KRETPROBE; else info->perf_event.type = BPF_PERF_EVENT_KPROBE; info->perf_event.kprobe.name_len = ulen; info->perf_event.kprobe.offset = offset; info->perf_event.kprobe.missed = missed; if (!kallsyms_show_value(current_cred())) addr = 0; info->perf_event.kprobe.addr = addr; info->perf_event.kprobe.cookie = event->bpf_cookie; return 0; } static void bpf_perf_link_fdinfo_kprobe(const struct perf_event *event, struct seq_file *seq) { const char *name; int err; u32 prog_id, type; u64 offset, addr; unsigned long missed; err = bpf_get_perf_event_info(event, &prog_id, &type, &name, &offset, &addr, &missed); if (err) return; seq_printf(seq, "name:\t%s\n" "offset:\t%#llx\n" "missed:\t%lu\n" "addr:\t%#llx\n" "event_type:\t%s\n" "cookie:\t%llu\n", name, offset, missed, addr, type == BPF_FD_TYPE_KRETPROBE ? 
"kretprobe" : "kprobe", event->bpf_cookie); } #endif #ifdef CONFIG_UPROBE_EVENTS static int bpf_perf_link_fill_uprobe(const struct perf_event *event, struct bpf_link_info *info) { u64 ref_ctr_offset, offset; char __user *uname; u32 ulen, type; int err; uname = u64_to_user_ptr(info->perf_event.uprobe.file_name); ulen = info->perf_event.uprobe.name_len; err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &ref_ctr_offset, &type, NULL); if (err) return err; if (type == BPF_FD_TYPE_URETPROBE) info->perf_event.type = BPF_PERF_EVENT_URETPROBE; else info->perf_event.type = BPF_PERF_EVENT_UPROBE; info->perf_event.uprobe.name_len = ulen; info->perf_event.uprobe.offset = offset; info->perf_event.uprobe.cookie = event->bpf_cookie; info->perf_event.uprobe.ref_ctr_offset = ref_ctr_offset; return 0; } static void bpf_perf_link_fdinfo_uprobe(const struct perf_event *event, struct seq_file *seq) { const char *name; int err; u32 prog_id, type; u64 offset, ref_ctr_offset; unsigned long missed; err = bpf_get_perf_event_info(event, &prog_id, &type, &name, &offset, &ref_ctr_offset, &missed); if (err) return; seq_printf(seq, "name:\t%s\n" "offset:\t%#llx\n" "ref_ctr_offset:\t%#llx\n" "event_type:\t%s\n" "cookie:\t%llu\n", name, offset, ref_ctr_offset, type == BPF_FD_TYPE_URETPROBE ? "uretprobe" : "uprobe", event->bpf_cookie); } #endif static int bpf_perf_link_fill_probe(const struct perf_event *event, struct bpf_link_info *info) { #ifdef CONFIG_KPROBE_EVENTS if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE) return bpf_perf_link_fill_kprobe(event, info); #endif #ifdef CONFIG_UPROBE_EVENTS if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE) return bpf_perf_link_fill_uprobe(event, info); #endif return -EOPNOTSUPP; } static int bpf_perf_link_fill_tracepoint(const struct perf_event *event, struct bpf_link_info *info) { char __user *uname; u32 ulen; int err; uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name); ulen = info->perf_event.tracepoint.name_len; err = bpf_perf_link_fill_common(event, uname, &ulen, NULL, NULL, NULL, NULL); if (err) return err; info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT; info->perf_event.tracepoint.name_len = ulen; info->perf_event.tracepoint.cookie = event->bpf_cookie; return 0; } static int bpf_perf_link_fill_perf_event(const struct perf_event *event, struct bpf_link_info *info) { info->perf_event.event.type = event->attr.type; info->perf_event.event.config = event->attr.config; info->perf_event.event.cookie = event->bpf_cookie; info->perf_event.type = BPF_PERF_EVENT_EVENT; return 0; } static int bpf_perf_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_perf_link *perf_link; const struct perf_event *event; perf_link = container_of(link, struct bpf_perf_link, link); event = perf_get_event(perf_link->perf_file); if (IS_ERR(event)) return PTR_ERR(event); switch (event->prog->type) { case BPF_PROG_TYPE_PERF_EVENT: return bpf_perf_link_fill_perf_event(event, info); case BPF_PROG_TYPE_TRACEPOINT: return bpf_perf_link_fill_tracepoint(event, info); case BPF_PROG_TYPE_KPROBE: return bpf_perf_link_fill_probe(event, info); default: return -EOPNOTSUPP; } } static void bpf_perf_event_link_show_fdinfo(const struct perf_event *event, struct seq_file *seq) { seq_printf(seq, "type:\t%u\n" "config:\t%llu\n" "event_type:\t%s\n" "cookie:\t%llu\n", event->attr.type, event->attr.config, "event", event->bpf_cookie); } static void bpf_tracepoint_link_show_fdinfo(const struct perf_event *event, struct seq_file *seq) { int err; const char *name; u32 
prog_id; err = bpf_get_perf_event_info(event, &prog_id, NULL, &name, NULL, NULL, NULL); if (err) return; seq_printf(seq, "tp_name:\t%s\n" "event_type:\t%s\n" "cookie:\t%llu\n", name, "tracepoint", event->bpf_cookie); } static void bpf_probe_link_show_fdinfo(const struct perf_event *event, struct seq_file *seq) { #ifdef CONFIG_KPROBE_EVENTS if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE) return bpf_perf_link_fdinfo_kprobe(event, seq); #endif #ifdef CONFIG_UPROBE_EVENTS if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE) return bpf_perf_link_fdinfo_uprobe(event, seq); #endif } static void bpf_perf_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_perf_link *perf_link; const struct perf_event *event; perf_link = container_of(link, struct bpf_perf_link, link); event = perf_get_event(perf_link->perf_file); if (IS_ERR(event)) return; switch (event->prog->type) { case BPF_PROG_TYPE_PERF_EVENT: return bpf_perf_event_link_show_fdinfo(event, seq); case BPF_PROG_TYPE_TRACEPOINT: return bpf_tracepoint_link_show_fdinfo(event, seq); case BPF_PROG_TYPE_KPROBE: return bpf_probe_link_show_fdinfo(event, seq); default: return; } } static const struct bpf_link_ops bpf_perf_link_lops = { .release = bpf_perf_link_release, .dealloc = bpf_perf_link_dealloc, .fill_link_info = bpf_perf_link_fill_link_info, .show_fdinfo = bpf_perf_link_show_fdinfo, }; static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_link_primer link_primer; struct bpf_perf_link *link; struct perf_event *event; struct file *perf_file; int err; if (attr->link_create.flags) return -EINVAL; perf_file = perf_event_get(attr->link_create.target_fd); if (IS_ERR(perf_file)) return PTR_ERR(perf_file); link = kzalloc_obj(*link, GFP_USER); if (!link) { err = -ENOMEM; goto out_put_file; } bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog, attr->link_create.attach_type); link->perf_file = perf_file; err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); goto out_put_file; } event = perf_file->private_data; err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie); if (err) { bpf_link_cleanup(&link_primer); goto out_put_file; } /* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */ bpf_prog_inc(prog); return bpf_link_settle(&link_primer); out_put_file: fput(perf_file); return err; } #else static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EOPNOTSUPP; } #endif /* CONFIG_PERF_EVENTS */ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, const char __user *user_tp_name, u64 cookie, enum bpf_attach_type attach_type) { struct bpf_link_primer link_primer; struct bpf_raw_tp_link *link; struct bpf_raw_event_map *btp; const char *tp_name; char buf[128]; int err; switch (prog->type) { case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_EXT: case BPF_PROG_TYPE_LSM: if (user_tp_name) /* The attach point for this category of programs * should be specified via btf_id during program load. 
*/ return -EINVAL; if (prog->type == BPF_PROG_TYPE_TRACING && prog->expected_attach_type == BPF_TRACE_RAW_TP) { tp_name = prog->aux->attach_func_name; break; } return bpf_tracing_prog_attach(prog, 0, 0, 0, attach_type); case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0) return -EFAULT; buf[sizeof(buf) - 1] = 0; tp_name = buf; break; default: return -EINVAL; } btp = bpf_get_raw_tracepoint(tp_name); if (!btp) return -ENOENT; link = kzalloc_obj(*link, GFP_USER); if (!link) { err = -ENOMEM; goto out_put_btp; } bpf_link_init_sleepable(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, &bpf_raw_tp_link_lops, prog, attach_type, tracepoint_is_faultable(btp->tp)); link->btp = btp; link->cookie = cookie; err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); goto out_put_btp; } err = bpf_probe_register(link->btp, link); if (err) { bpf_link_cleanup(&link_primer); goto out_put_btp; } return bpf_link_settle(&link_primer); out_put_btp: bpf_put_raw_tracepoint(btp); return err; } #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.cookie static int bpf_raw_tracepoint_open(const union bpf_attr *attr) { struct bpf_prog *prog; void __user *tp_name; __u64 cookie; int fd; if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) return -EINVAL; prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); tp_name = u64_to_user_ptr(attr->raw_tracepoint.name); cookie = attr->raw_tracepoint.cookie; fd = bpf_raw_tp_link_attach(prog, tp_name, cookie, prog->expected_attach_type); if (fd < 0) bpf_prog_put(prog); return fd; } static enum bpf_prog_type attach_type_to_prog_type(enum bpf_attach_type attach_type) { switch (attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: return BPF_PROG_TYPE_CGROUP_SKB; case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: return BPF_PROG_TYPE_CGROUP_SOCK; case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UNIX_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: case BPF_CGROUP_UNIX_RECVMSG: return BPF_PROG_TYPE_CGROUP_SOCK_ADDR; case BPF_CGROUP_SOCK_OPS: return BPF_PROG_TYPE_SOCK_OPS; case BPF_CGROUP_DEVICE: return BPF_PROG_TYPE_CGROUP_DEVICE; case BPF_SK_MSG_VERDICT: return BPF_PROG_TYPE_SK_MSG; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: case BPF_SK_SKB_VERDICT: return BPF_PROG_TYPE_SK_SKB; case BPF_LIRC_MODE2: return BPF_PROG_TYPE_LIRC_MODE2; case BPF_FLOW_DISSECTOR: return BPF_PROG_TYPE_FLOW_DISSECTOR; case BPF_CGROUP_SYSCTL: return BPF_PROG_TYPE_CGROUP_SYSCTL; case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: return BPF_PROG_TYPE_CGROUP_SOCKOPT; case BPF_TRACE_ITER: case BPF_TRACE_RAW_TP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_TRACE_FSESSION: case BPF_MODIFY_RETURN: return BPF_PROG_TYPE_TRACING; case BPF_LSM_MAC: return BPF_PROG_TYPE_LSM; case BPF_SK_LOOKUP: return BPF_PROG_TYPE_SK_LOOKUP; case BPF_XDP: return BPF_PROG_TYPE_XDP; case BPF_LSM_CGROUP: return BPF_PROG_TYPE_LSM; case BPF_TCX_INGRESS: case BPF_TCX_EGRESS: case 
BPF_NETKIT_PRIMARY: case BPF_NETKIT_PEER: return BPF_PROG_TYPE_SCHED_CLS; default: return BPF_PROG_TYPE_UNSPEC; } } static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, enum bpf_attach_type attach_type) { enum bpf_prog_type ptype; switch (prog->type) { case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_SK_LOOKUP: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: if (!bpf_token_capable(prog->aux->token, CAP_NET_ADMIN)) /* cg-skb progs can be loaded by unpriv user. * check permissions at attach time. */ return -EPERM; ptype = attach_type_to_prog_type(attach_type); if (prog->type != ptype) return -EINVAL; return prog->enforce_expected_attach_type && prog->expected_attach_type != attach_type ? -EINVAL : 0; case BPF_PROG_TYPE_EXT: return 0; case BPF_PROG_TYPE_NETFILTER: if (attach_type != BPF_NETFILTER) return -EINVAL; return 0; case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_TRACEPOINT: if (attach_type != BPF_PERF_EVENT) return -EINVAL; return 0; case BPF_PROG_TYPE_KPROBE: if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI && attach_type != BPF_TRACE_KPROBE_MULTI) return -EINVAL; if (prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION && attach_type != BPF_TRACE_KPROBE_SESSION) return -EINVAL; if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI && attach_type != BPF_TRACE_UPROBE_MULTI) return -EINVAL; if (prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION && attach_type != BPF_TRACE_UPROBE_SESSION) return -EINVAL; if (attach_type != BPF_PERF_EVENT && attach_type != BPF_TRACE_KPROBE_MULTI && attach_type != BPF_TRACE_KPROBE_SESSION && attach_type != BPF_TRACE_UPROBE_MULTI && attach_type != BPF_TRACE_UPROBE_SESSION) return -EINVAL; return 0; case BPF_PROG_TYPE_SCHED_CLS: if (attach_type != BPF_TCX_INGRESS && attach_type != BPF_TCX_EGRESS && attach_type != BPF_NETKIT_PRIMARY && attach_type != BPF_NETKIT_PEER) return -EINVAL; return 0; default: ptype = attach_type_to_prog_type(attach_type); if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) return -EINVAL; return 0; } } static bool is_cgroup_prog_type(enum bpf_prog_type ptype, enum bpf_attach_type atype, bool check_atype) { switch (ptype) { case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: return true; case BPF_PROG_TYPE_LSM: return check_atype ? 
atype == BPF_LSM_CGROUP : true; default: return false; } } #define BPF_PROG_ATTACH_LAST_FIELD expected_revision #define BPF_F_ATTACH_MASK_BASE \ (BPF_F_ALLOW_OVERRIDE | \ BPF_F_ALLOW_MULTI | \ BPF_F_REPLACE | \ BPF_F_PREORDER) #define BPF_F_ATTACH_MASK_MPROG \ (BPF_F_REPLACE | \ BPF_F_BEFORE | \ BPF_F_AFTER | \ BPF_F_ID | \ BPF_F_LINK) static int bpf_prog_attach(const union bpf_attr *attr) { enum bpf_prog_type ptype; struct bpf_prog *prog; int ret; if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; ptype = attach_type_to_prog_type(attr->attach_type); if (ptype == BPF_PROG_TYPE_UNSPEC) return -EINVAL; if (bpf_mprog_supported(ptype)) { if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG) return -EINVAL; } else if (is_cgroup_prog_type(ptype, 0, false)) { if (attr->attach_flags & ~(BPF_F_ATTACH_MASK_BASE | BPF_F_ATTACH_MASK_MPROG)) return -EINVAL; } else { if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE) return -EINVAL; if (attr->relative_fd || attr->expected_revision) return -EINVAL; } prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) return PTR_ERR(prog); if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) { bpf_prog_put(prog); return -EINVAL; } if (is_cgroup_prog_type(ptype, prog->expected_attach_type, true)) { ret = cgroup_bpf_prog_attach(attr, ptype, prog); goto out; } switch (ptype) { case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: ret = sock_map_get_from_fd(attr, prog); break; case BPF_PROG_TYPE_LIRC_MODE2: ret = lirc_prog_attach(attr, prog); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: ret = netns_bpf_prog_attach(attr, prog); break; case BPF_PROG_TYPE_SCHED_CLS: if (attr->attach_type == BPF_TCX_INGRESS || attr->attach_type == BPF_TCX_EGRESS) ret = tcx_prog_attach(attr, prog); else ret = netkit_prog_attach(attr, prog); break; default: ret = -EINVAL; } out: if (ret) bpf_prog_put(prog); return ret; } #define BPF_PROG_DETACH_LAST_FIELD expected_revision static int bpf_prog_detach(const union bpf_attr *attr) { struct bpf_prog *prog = NULL; enum bpf_prog_type ptype; int ret; if (CHECK_ATTR(BPF_PROG_DETACH)) return -EINVAL; ptype = attach_type_to_prog_type(attr->attach_type); if (bpf_mprog_supported(ptype)) { if (ptype == BPF_PROG_TYPE_UNSPEC) return -EINVAL; if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG) return -EINVAL; if (attr->attach_bpf_fd) { prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) return PTR_ERR(prog); } else if (!bpf_mprog_detach_empty(ptype)) { return -EPERM; } } else if (is_cgroup_prog_type(ptype, 0, false)) { if (attr->attach_flags || attr->relative_fd) return -EINVAL; } else if (attr->attach_flags || attr->relative_fd || attr->expected_revision) { return -EINVAL; } switch (ptype) { case BPF_PROG_TYPE_SK_MSG: case BPF_PROG_TYPE_SK_SKB: ret = sock_map_prog_detach(attr, ptype); break; case BPF_PROG_TYPE_LIRC_MODE2: ret = lirc_prog_detach(attr); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: ret = netns_bpf_prog_detach(attr, ptype); break; case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_LSM: ret = cgroup_bpf_prog_detach(attr, ptype); break; case BPF_PROG_TYPE_SCHED_CLS: if (attr->attach_type == BPF_TCX_INGRESS || attr->attach_type == BPF_TCX_EGRESS) ret = tcx_prog_detach(attr, prog); else ret = netkit_prog_detach(attr, prog); break; default: ret = -EINVAL; } if (prog) bpf_prog_put(prog); return ret; } #define 
BPF_PROG_QUERY_LAST_FIELD query.revision static int bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { if (!bpf_net_capable()) return -EPERM; if (CHECK_ATTR(BPF_PROG_QUERY)) return -EINVAL; if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE) return -EINVAL; switch (attr->query.attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UNIX_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: case BPF_CGROUP_SYSCTL: case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: case BPF_LSM_CGROUP: return cgroup_bpf_prog_query(attr, uattr); case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); case BPF_FLOW_DISSECTOR: case BPF_SK_LOOKUP: return netns_bpf_prog_query(attr, uattr); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: case BPF_SK_MSG_VERDICT: case BPF_SK_SKB_VERDICT: return sock_map_bpf_prog_query(attr, uattr); case BPF_TCX_INGRESS: case BPF_TCX_EGRESS: return tcx_prog_query(attr, uattr); case BPF_NETKIT_PRIMARY: case BPF_NETKIT_PEER: return netkit_prog_query(attr, uattr); default: return -EINVAL; } } #define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size static int bpf_prog_test_run(const union bpf_attr *attr, union bpf_attr __user *uattr) { struct bpf_prog *prog; int ret = -ENOTSUPP; if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; if ((attr->test.ctx_size_in && !attr->test.ctx_in) || (!attr->test.ctx_size_in && attr->test.ctx_in)) return -EINVAL; if ((attr->test.ctx_size_out && !attr->test.ctx_out) || (!attr->test.ctx_size_out && attr->test.ctx_out)) return -EINVAL; prog = bpf_prog_get(attr->test.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); if (prog->aux->ops->test_run) ret = prog->aux->ops->test_run(prog, attr, uattr); bpf_prog_put(prog); return ret; } #define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id static int bpf_obj_get_next_id(const union bpf_attr *attr, union bpf_attr __user *uattr, struct idr *idr, spinlock_t *lock) { u32 next_id = attr->start_id; int err = 0; if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; next_id++; spin_lock_bh(lock); if (!idr_get_next(idr, &next_id)) err = -ENOENT; spin_unlock_bh(lock); if (!err) err = put_user(next_id, &uattr->next_id); return err; } struct bpf_map *bpf_map_get_curr_or_next(u32 *id) { struct bpf_map *map; spin_lock_bh(&map_idr_lock); again: map = idr_get_next(&map_idr, id); if (map) { map = __bpf_map_inc_not_zero(map, false); if (IS_ERR(map)) { (*id)++; goto again; } } spin_unlock_bh(&map_idr_lock); return map; } struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id) { struct bpf_prog *prog; spin_lock_bh(&prog_idr_lock); again: prog = idr_get_next(&prog_idr, id); if (prog) { prog = bpf_prog_inc_not_zero(prog); if (IS_ERR(prog)) { (*id)++; goto again; } } spin_unlock_bh(&prog_idr_lock); return prog; } #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id struct 
bpf_prog *bpf_prog_by_id(u32 id) { struct bpf_prog *prog; if (!id) return ERR_PTR(-ENOENT); spin_lock_bh(&prog_idr_lock); prog = idr_find(&prog_idr, id); if (prog) prog = bpf_prog_inc_not_zero(prog); else prog = ERR_PTR(-ENOENT); spin_unlock_bh(&prog_idr_lock); return prog; } static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) { struct bpf_prog *prog; u32 id = attr->prog_id; int fd; if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; prog = bpf_prog_by_id(id); if (IS_ERR(prog)) return PTR_ERR(prog); fd = bpf_prog_new_fd(prog); if (fd < 0) bpf_prog_put(prog); return fd; } #define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags static int bpf_map_get_fd_by_id(const union bpf_attr *attr) { struct bpf_map *map; u32 id = attr->map_id; int f_flags; int fd; if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) || attr->open_flags & ~BPF_OBJ_FLAG_MASK) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; f_flags = bpf_get_file_flag(attr->open_flags); if (f_flags < 0) return f_flags; spin_lock_bh(&map_idr_lock); map = idr_find(&map_idr, id); if (map) map = __bpf_map_inc_not_zero(map, true); else map = ERR_PTR(-ENOENT); spin_unlock_bh(&map_idr_lock); if (IS_ERR(map)) return PTR_ERR(map); fd = bpf_map_new_fd(map, f_flags); if (fd < 0) bpf_map_put_with_uref(map); return fd; } static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, unsigned long addr, u32 *off, u32 *type) { const struct bpf_map *map; int i; mutex_lock(&prog->aux->used_maps_mutex); for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) { map = prog->aux->used_maps[i]; if (map == (void *)addr) { *type = BPF_PSEUDO_MAP_FD; goto out; } if (!map->ops->map_direct_value_meta) continue; if (!map->ops->map_direct_value_meta(map, addr, off)) { *type = BPF_PSEUDO_MAP_VALUE; goto out; } } map = NULL; out: mutex_unlock(&prog->aux->used_maps_mutex); return map; } static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, const struct cred *f_cred) { const struct bpf_map *map; struct bpf_insn *insns; u32 off, type; u64 imm; u8 code; int i; insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog), GFP_USER); if (!insns) return insns; for (i = 0; i < prog->len; i++) { code = insns[i].code; if (code == (BPF_JMP | BPF_TAIL_CALL)) { insns[i].code = BPF_JMP | BPF_CALL; insns[i].imm = BPF_FUNC_tail_call; /* fall-through */ } if (code == (BPF_JMP | BPF_CALL) || code == (BPF_JMP | BPF_CALL_ARGS)) { if (code == (BPF_JMP | BPF_CALL_ARGS)) insns[i].code = BPF_JMP | BPF_CALL; if (!bpf_dump_raw_ok(f_cred)) insns[i].imm = 0; continue; } if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) { insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM; continue; } if ((BPF_CLASS(code) == BPF_LDX || BPF_CLASS(code) == BPF_STX || BPF_CLASS(code) == BPF_ST) && BPF_MODE(code) == BPF_PROBE_MEM32) { insns[i].code = BPF_CLASS(code) | BPF_SIZE(code) | BPF_MEM; continue; } if (code != (BPF_LD | BPF_IMM | BPF_DW)) continue; imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; map = bpf_map_from_imm(prog, imm, &off, &type); if (map) { insns[i].src_reg = type; insns[i].imm = map->id; insns[i + 1].imm = off; continue; } } return insns; } static int set_info_rec_size(struct bpf_prog_info *info) { /* * Ensure info.*_rec_size is the same as kernel expected size * * or * * Only allow zero *_rec_size if both _rec_size and _cnt are * zero. In this case, the kernel will set the expected * _rec_size back to the info. 
*/ if ((info->nr_func_info || info->func_info_rec_size) && info->func_info_rec_size != sizeof(struct bpf_func_info)) return -EINVAL; if ((info->nr_line_info || info->line_info_rec_size) && info->line_info_rec_size != sizeof(struct bpf_line_info)) return -EINVAL; if ((info->nr_jited_line_info || info->jited_line_info_rec_size) && info->jited_line_info_rec_size != sizeof(__u64)) return -EINVAL; info->func_info_rec_size = sizeof(struct bpf_func_info); info->line_info_rec_size = sizeof(struct bpf_line_info); info->jited_line_info_rec_size = sizeof(__u64); return 0; } static int bpf_prog_get_info_by_fd(struct file *file, struct bpf_prog *prog, const union bpf_attr *attr, union bpf_attr __user *uattr) { struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); struct btf *attach_btf = bpf_prog_get_target_btf(prog); struct bpf_prog_info info; u32 info_len = attr->info.info_len; struct bpf_prog_kstats stats; char __user *uinsns; u32 ulen; int err; err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); memset(&info, 0, sizeof(info)); if (copy_from_user(&info, uinfo, info_len)) return -EFAULT; info.type = prog->type; info.id = prog->aux->id; info.load_time = prog->aux->load_time; info.created_by_uid = from_kuid_munged(current_user_ns(), prog->aux->user->uid); info.gpl_compatible = prog->gpl_compatible; memcpy(info.tag, prog->tag, sizeof(prog->tag)); memcpy(info.name, prog->aux->name, sizeof(prog->aux->name)); mutex_lock(&prog->aux->used_maps_mutex); ulen = info.nr_map_ids; info.nr_map_ids = prog->aux->used_map_cnt; ulen = min_t(u32, info.nr_map_ids, ulen); if (ulen) { u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids); u32 i; for (i = 0; i < ulen; i++) if (put_user(prog->aux->used_maps[i]->id, &user_map_ids[i])) { mutex_unlock(&prog->aux->used_maps_mutex); return -EFAULT; } } mutex_unlock(&prog->aux->used_maps_mutex); err = set_info_rec_size(&info); if (err) return err; bpf_prog_get_stats(prog, &stats); info.run_time_ns = stats.nsecs; info.run_cnt = stats.cnt; info.recursion_misses = stats.misses; info.verified_insns = prog->aux->verified_insns; if (prog->aux->btf) info.btf_id = btf_obj_id(prog->aux->btf); if (!bpf_capable()) { info.jited_prog_len = 0; info.xlated_prog_len = 0; info.nr_jited_ksyms = 0; info.nr_jited_func_lens = 0; info.nr_func_info = 0; info.nr_line_info = 0; info.nr_jited_line_info = 0; goto done; } ulen = info.xlated_prog_len; info.xlated_prog_len = bpf_prog_insn_size(prog); if (info.xlated_prog_len && ulen) { struct bpf_insn *insns_sanitized; bool fault; if (!prog->blinded || bpf_dump_raw_ok(file->f_cred)) { insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred); if (!insns_sanitized) return -ENOMEM; uinsns = u64_to_user_ptr(info.xlated_prog_insns); ulen = min_t(u32, info.xlated_prog_len, ulen); fault = copy_to_user(uinsns, insns_sanitized, ulen); kfree(insns_sanitized); if (fault) return -EFAULT; } else { info.xlated_prog_insns = 0; } } if (bpf_prog_is_offloaded(prog->aux)) { err = bpf_prog_offload_info_fill(&info, prog); if (err) return err; goto done; } /* NOTE: the following code is supposed to be skipped for offload. * bpf_prog_offload_info_fill() is the place to fill similar fields * for offload. 
*/ ulen = info.jited_prog_len; if (prog->aux->func_cnt) { u32 i; info.jited_prog_len = 0; for (i = 0; i < prog->aux->func_cnt; i++) info.jited_prog_len += prog->aux->func[i]->jited_len; } else { info.jited_prog_len = prog->jited_len; } if (info.jited_prog_len && ulen) { if (bpf_dump_raw_ok(file->f_cred)) { uinsns = u64_to_user_ptr(info.jited_prog_insns); ulen = min_t(u32, info.jited_prog_len, ulen); /* for multi-function programs, copy the JITed * instructions for all the functions */ if (prog->aux->func_cnt) { u32 len, free, i; u8 *img; free = ulen; for (i = 0; i < prog->aux->func_cnt; i++) { len = prog->aux->func[i]->jited_len; len = min_t(u32, len, free); img = (u8 *) prog->aux->func[i]->bpf_func; if (copy_to_user(uinsns, img, len)) return -EFAULT; uinsns += len; free -= len; if (!free) break; } } else { if (copy_to_user(uinsns, prog->bpf_func, ulen)) return -EFAULT; } } else { info.jited_prog_insns = 0; } } ulen = info.nr_jited_ksyms; info.nr_jited_ksyms = prog->aux->func_cnt ? : 1; if (ulen) { if (bpf_dump_raw_ok(file->f_cred)) { unsigned long ksym_addr; u64 __user *user_ksyms; u32 i; /* copy the address of the kernel symbol * corresponding to each function */ ulen = min_t(u32, info.nr_jited_ksyms, ulen); user_ksyms = u64_to_user_ptr(info.jited_ksyms); if (prog->aux->func_cnt) { for (i = 0; i < ulen; i++) { ksym_addr = (unsigned long) prog->aux->func[i]->bpf_func; if (put_user((u64) ksym_addr, &user_ksyms[i])) return -EFAULT; } } else { ksym_addr = (unsigned long) prog->bpf_func; if (put_user((u64) ksym_addr, &user_ksyms[0])) return -EFAULT; } } else { info.jited_ksyms = 0; } } ulen = info.nr_jited_func_lens; info.nr_jited_func_lens = prog->aux->func_cnt ? : 1; if (ulen) { if (bpf_dump_raw_ok(file->f_cred)) { u32 __user *user_lens; u32 func_len, i; /* copy the JITed image lengths for each function */ ulen = min_t(u32, info.nr_jited_func_lens, ulen); user_lens = u64_to_user_ptr(info.jited_func_lens); if (prog->aux->func_cnt) { for (i = 0; i < ulen; i++) { func_len = prog->aux->func[i]->jited_len; if (put_user(func_len, &user_lens[i])) return -EFAULT; } } else { func_len = prog->jited_len; if (put_user(func_len, &user_lens[0])) return -EFAULT; } } else { info.jited_func_lens = 0; } } info.attach_btf_id = prog->aux->attach_btf_id; if (attach_btf) info.attach_btf_obj_id = btf_obj_id(attach_btf); ulen = info.nr_func_info; info.nr_func_info = prog->aux->func_info_cnt; if (info.nr_func_info && ulen) { char __user *user_finfo; user_finfo = u64_to_user_ptr(info.func_info); ulen = min_t(u32, info.nr_func_info, ulen); if (copy_to_user(user_finfo, prog->aux->func_info, info.func_info_rec_size * ulen)) return -EFAULT; } ulen = info.nr_line_info; info.nr_line_info = prog->aux->nr_linfo; if (info.nr_line_info && ulen) { __u8 __user *user_linfo; user_linfo = u64_to_user_ptr(info.line_info); ulen = min_t(u32, info.nr_line_info, ulen); if (copy_to_user(user_linfo, prog->aux->linfo, info.line_info_rec_size * ulen)) return -EFAULT; } ulen = info.nr_jited_line_info; if (prog->aux->jited_linfo) info.nr_jited_line_info = prog->aux->nr_linfo; else info.nr_jited_line_info = 0; if (info.nr_jited_line_info && ulen) { if (bpf_dump_raw_ok(file->f_cred)) { unsigned long line_addr; __u64 __user *user_linfo; u32 i; user_linfo = u64_to_user_ptr(info.jited_line_info); ulen = min_t(u32, info.nr_jited_line_info, ulen); for (i = 0; i < ulen; i++) { line_addr = (unsigned long)prog->aux->jited_linfo[i]; if (put_user((__u64)line_addr, &user_linfo[i])) return -EFAULT; } } else { info.jited_line_info = 0; } } ulen = 
info.nr_prog_tags; info.nr_prog_tags = prog->aux->func_cnt ? : 1; if (ulen) { __u8 __user (*user_prog_tags)[BPF_TAG_SIZE]; u32 i; user_prog_tags = u64_to_user_ptr(info.prog_tags); ulen = min_t(u32, info.nr_prog_tags, ulen); if (prog->aux->func_cnt) { for (i = 0; i < ulen; i++) { if (copy_to_user(user_prog_tags[i], prog->aux->func[i]->tag, BPF_TAG_SIZE)) return -EFAULT; } } else { if (copy_to_user(user_prog_tags[0], prog->tag, BPF_TAG_SIZE)) return -EFAULT; } } done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) return -EFAULT; return 0; } static int bpf_map_get_info_by_fd(struct file *file, struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); struct bpf_map_info info; u32 info_len = attr->info.info_len; int err; err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); memset(&info, 0, sizeof(info)); if (copy_from_user(&info, uinfo, info_len)) return -EFAULT; info.type = map->map_type; info.id = map->id; info.key_size = map->key_size; info.value_size = map->value_size; info.max_entries = map->max_entries; info.map_flags = map->map_flags; info.map_extra = map->map_extra; memcpy(info.name, map->name, sizeof(map->name)); if (map->btf) { info.btf_id = btf_obj_id(map->btf); info.btf_key_type_id = map->btf_key_type_id; info.btf_value_type_id = map->btf_value_type_id; } info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) bpf_map_struct_ops_info_fill(&info, map); if (bpf_map_is_offloaded(map)) { err = bpf_map_offload_info_fill(&info, map); if (err) return err; } if (info.hash) { char __user *uhash = u64_to_user_ptr(info.hash); if (!map->ops->map_get_hash) return -EINVAL; if (info.hash_size != SHA256_DIGEST_SIZE) return -EINVAL; if (!READ_ONCE(map->frozen)) return -EPERM; err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha); if (err != 0) return err; if (copy_to_user(uhash, map->sha, SHA256_DIGEST_SIZE) != 0) return -EFAULT; } else if (info.hash_size) { return -EINVAL; } if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) return -EFAULT; return 0; } static int bpf_btf_get_info_by_fd(struct file *file, struct btf *btf, const union bpf_attr *attr, union bpf_attr __user *uattr) { struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info); u32 info_len = attr->info.info_len; int err; err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len); if (err) return err; return btf_get_info_by_fd(btf, attr, uattr); } static int bpf_link_get_info_by_fd(struct file *file, struct bpf_link *link, const union bpf_attr *attr, union bpf_attr __user *uattr) { struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info); struct bpf_link_info info; u32 info_len = attr->info.info_len; int err; err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); memset(&info, 0, sizeof(info)); if (copy_from_user(&info, uinfo, info_len)) return -EFAULT; info.type = link->type; info.id = link->id; if (link->prog) info.prog_id = link->prog->aux->id; if (link->ops->fill_link_info) { err = link->ops->fill_link_info(link, &info); if (err) return err; } if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) return -EFAULT; return 0; } static int 
token_get_info_by_fd(struct file *file, struct bpf_token *token, const union bpf_attr *attr, union bpf_attr __user *uattr) { struct bpf_token_info __user *uinfo = u64_to_user_ptr(attr->info.info); u32 info_len = attr->info.info_len; int err; err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len); if (err) return err; return bpf_token_get_info_by_fd(token, attr, uattr); } #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, union bpf_attr __user *uattr) { if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD)) return -EINVAL; CLASS(fd, f)(attr->info.bpf_fd); if (fd_empty(f)) return -EBADFD; if (fd_file(f)->f_op == &bpf_prog_fops) return bpf_prog_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); else if (fd_file(f)->f_op == &bpf_map_fops) return bpf_map_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); else if (fd_file(f)->f_op == &btf_fops) return bpf_btf_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); else if (fd_file(f)->f_op == &bpf_link_fops || fd_file(f)->f_op == &bpf_link_fops_poll) return bpf_link_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); else if (fd_file(f)->f_op == &bpf_token_fops) return token_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); return -EINVAL; } #define BPF_BTF_LOAD_LAST_FIELD btf_token_fd static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) { struct bpf_token *token = NULL; if (CHECK_ATTR(BPF_BTF_LOAD)) return -EINVAL; if (attr->btf_flags & ~BPF_F_TOKEN_FD) return -EINVAL; if (attr->btf_flags & BPF_F_TOKEN_FD) { token = bpf_token_get_from_fd(attr->btf_token_fd); if (IS_ERR(token)) return PTR_ERR(token); if (!bpf_token_allow_cmd(token, BPF_BTF_LOAD)) { bpf_token_put(token); token = NULL; } } if (!bpf_token_capable(token, CAP_BPF)) { bpf_token_put(token); return -EPERM; } bpf_token_put(token); return btf_new_fd(attr, uattr, uattr_size); } #define BPF_BTF_GET_FD_BY_ID_LAST_FIELD fd_by_id_token_fd static int bpf_btf_get_fd_by_id(const union bpf_attr *attr) { struct bpf_token *token = NULL; if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID)) return -EINVAL; if (attr->open_flags & ~BPF_F_TOKEN_FD) return -EINVAL; if (attr->open_flags & BPF_F_TOKEN_FD) { token = bpf_token_get_from_fd(attr->fd_by_id_token_fd); if (IS_ERR(token)) return PTR_ERR(token); if (!bpf_token_allow_cmd(token, BPF_BTF_GET_FD_BY_ID)) { bpf_token_put(token); token = NULL; } } if (!bpf_token_capable(token, CAP_SYS_ADMIN)) { bpf_token_put(token); return -EPERM; } bpf_token_put(token); return btf_get_fd_by_id(attr->btf_id); } static int bpf_task_fd_query_copy(const union bpf_attr *attr, union bpf_attr __user *uattr, u32 prog_id, u32 fd_type, const char *buf, u64 probe_offset, u64 probe_addr) { char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf); u32 len = buf ? 
strlen(buf) : 0, input_len; int err = 0; if (put_user(len, &uattr->task_fd_query.buf_len)) return -EFAULT; input_len = attr->task_fd_query.buf_len; if (input_len && ubuf) { if (!len) { /* nothing to copy, just make ubuf NULL terminated */ char zero = '\0'; if (put_user(zero, ubuf)) return -EFAULT; } else { err = bpf_copy_to_user(ubuf, buf, input_len, len); if (err == -EFAULT) return err; } } if (put_user(prog_id, &uattr->task_fd_query.prog_id) || put_user(fd_type, &uattr->task_fd_query.fd_type) || put_user(probe_offset, &uattr->task_fd_query.probe_offset) || put_user(probe_addr, &uattr->task_fd_query.probe_addr)) return -EFAULT; return err; } #define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr static int bpf_task_fd_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { pid_t pid = attr->task_fd_query.pid; u32 fd = attr->task_fd_query.fd; const struct perf_event *event; struct task_struct *task; struct file *file; int err; if (CHECK_ATTR(BPF_TASK_FD_QUERY)) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (attr->task_fd_query.flags != 0) return -EINVAL; rcu_read_lock(); task = get_pid_task(find_vpid(pid), PIDTYPE_PID); rcu_read_unlock(); if (!task) return -ENOENT; err = 0; file = fget_task(task, fd); put_task_struct(task); if (!file) return -EBADF; if (file->f_op == &bpf_link_fops || file->f_op == &bpf_link_fops_poll) { struct bpf_link *link = file->private_data; if (link->ops == &bpf_raw_tp_link_lops) { struct bpf_raw_tp_link *raw_tp = container_of(link, struct bpf_raw_tp_link, link); struct bpf_raw_event_map *btp = raw_tp->btp; err = bpf_task_fd_query_copy(attr, uattr, raw_tp->link.prog->aux->id, BPF_FD_TYPE_RAW_TRACEPOINT, btp->tp->name, 0, 0); goto put_file; } goto out_not_supp; } event = perf_get_event(file); if (!IS_ERR(event)) { u64 probe_offset, probe_addr; u32 prog_id, fd_type; const char *buf; err = bpf_get_perf_event_info(event, &prog_id, &fd_type, &buf, &probe_offset, &probe_addr, NULL); if (!err) err = bpf_task_fd_query_copy(attr, uattr, prog_id, fd_type, buf, probe_offset, probe_addr); goto put_file; } out_not_supp: err = -ENOTSUPP; put_file: fput(file); return err; } #define BPF_MAP_BATCH_LAST_FIELD batch.flags #define BPF_DO_BATCH(fn, ...) 
\ do { \ if (!fn) { \ err = -ENOTSUPP; \ goto err_put; \ } \ err = fn(__VA_ARGS__); \ } while (0) static int bpf_map_do_batch(const union bpf_attr *attr, union bpf_attr __user *uattr, int cmd) { bool has_read = cmd == BPF_MAP_LOOKUP_BATCH || cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH; bool has_write = cmd != BPF_MAP_LOOKUP_BATCH; struct bpf_map *map; int err; if (CHECK_ATTR(BPF_MAP_BATCH)) return -EINVAL; CLASS(fd, f)(attr->batch.map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); if (has_write) bpf_map_write_active_inc(map); if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { err = -EPERM; goto err_put; } if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } if (cmd == BPF_MAP_LOOKUP_BATCH) BPF_DO_BATCH(map->ops->map_lookup_batch, map, attr, uattr); else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr); else if (cmd == BPF_MAP_UPDATE_BATCH) BPF_DO_BATCH(map->ops->map_update_batch, map, fd_file(f), attr, uattr); else BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr); err_put: if (has_write) { maybe_wait_bpf_programs(map); bpf_map_write_active_dec(map); } return err; } #define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.pid static int link_create(union bpf_attr *attr, bpfptr_t uattr) { struct bpf_prog *prog; int ret; if (CHECK_ATTR(BPF_LINK_CREATE)) return -EINVAL; if (attr->link_create.attach_type == BPF_STRUCT_OPS) return bpf_struct_ops_link_create(attr); prog = bpf_prog_get(attr->link_create.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); ret = bpf_prog_attach_check_attach_type(prog, attr->link_create.attach_type); if (ret) goto out; switch (prog->type) { case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_CGROUP_SOCKOPT: ret = cgroup_bpf_link_attach(attr, prog); break; case BPF_PROG_TYPE_EXT: ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, attr->link_create.target_btf_id, attr->link_create.tracing.cookie, attr->link_create.attach_type); break; case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_TRACING: if (attr->link_create.attach_type != prog->expected_attach_type) { ret = -EINVAL; goto out; } if (prog->expected_attach_type == BPF_TRACE_RAW_TP) ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie, attr->link_create.attach_type); else if (prog->expected_attach_type == BPF_TRACE_ITER) ret = bpf_iter_link_attach(attr, uattr, prog); else if (prog->expected_attach_type == BPF_LSM_CGROUP) ret = cgroup_bpf_link_attach(attr, prog); else ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, attr->link_create.target_btf_id, attr->link_create.tracing.cookie, attr->link_create.attach_type); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_SK_LOOKUP: ret = netns_bpf_link_create(attr, prog); break; case BPF_PROG_TYPE_SK_MSG: case BPF_PROG_TYPE_SK_SKB: ret = sock_map_link_create(attr, prog); break; #ifdef CONFIG_NET case BPF_PROG_TYPE_XDP: ret = bpf_xdp_link_attach(attr, prog); break; case BPF_PROG_TYPE_SCHED_CLS: if (attr->link_create.attach_type == BPF_TCX_INGRESS || attr->link_create.attach_type == BPF_TCX_EGRESS) ret = tcx_link_attach(attr, prog); else ret = netkit_link_attach(attr, prog); break; case BPF_PROG_TYPE_NETFILTER: ret = bpf_nf_link_attach(attr, prog); break; #endif case BPF_PROG_TYPE_PERF_EVENT: case 
BPF_PROG_TYPE_TRACEPOINT: ret = bpf_perf_link_attach(attr, prog); break; case BPF_PROG_TYPE_KPROBE: if (attr->link_create.attach_type == BPF_PERF_EVENT) ret = bpf_perf_link_attach(attr, prog); else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI || attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION) ret = bpf_kprobe_multi_link_attach(attr, prog); else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI || attr->link_create.attach_type == BPF_TRACE_UPROBE_SESSION) ret = bpf_uprobe_multi_link_attach(attr, prog); break; default: ret = -EINVAL; } out: if (ret < 0) bpf_prog_put(prog); return ret; } static int link_update_map(struct bpf_link *link, union bpf_attr *attr) { struct bpf_map *new_map, *old_map = NULL; int ret; new_map = bpf_map_get(attr->link_update.new_map_fd); if (IS_ERR(new_map)) return PTR_ERR(new_map); if (attr->link_update.flags & BPF_F_REPLACE) { old_map = bpf_map_get(attr->link_update.old_map_fd); if (IS_ERR(old_map)) { ret = PTR_ERR(old_map); goto out_put; } } else if (attr->link_update.old_map_fd) { ret = -EINVAL; goto out_put; } ret = link->ops->update_map(link, new_map, old_map); if (old_map) bpf_map_put(old_map); out_put: bpf_map_put(new_map); return ret; } #define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd static int link_update(union bpf_attr *attr) { struct bpf_prog *old_prog = NULL, *new_prog; struct bpf_link *link; u32 flags; int ret; if (CHECK_ATTR(BPF_LINK_UPDATE)) return -EINVAL; flags = attr->link_update.flags; if (flags & ~BPF_F_REPLACE) return -EINVAL; link = bpf_link_get_from_fd(attr->link_update.link_fd); if (IS_ERR(link)) return PTR_ERR(link); if (link->ops->update_map) { ret = link_update_map(link, attr); goto out_put_link; } new_prog = bpf_prog_get(attr->link_update.new_prog_fd); if (IS_ERR(new_prog)) { ret = PTR_ERR(new_prog); goto out_put_link; } if (flags & BPF_F_REPLACE) { old_prog = bpf_prog_get(attr->link_update.old_prog_fd); if (IS_ERR(old_prog)) { ret = PTR_ERR(old_prog); old_prog = NULL; goto out_put_progs; } } else if (attr->link_update.old_prog_fd) { ret = -EINVAL; goto out_put_progs; } if (link->ops->update_prog) ret = link->ops->update_prog(link, new_prog, old_prog); else ret = -EINVAL; out_put_progs: if (old_prog) bpf_prog_put(old_prog); if (ret) bpf_prog_put(new_prog); out_put_link: bpf_link_put_direct(link); return ret; } #define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd static int link_detach(union bpf_attr *attr) { struct bpf_link *link; int ret; if (CHECK_ATTR(BPF_LINK_DETACH)) return -EINVAL; link = bpf_link_get_from_fd(attr->link_detach.link_fd); if (IS_ERR(link)) return PTR_ERR(link); if (link->ops->detach) ret = link->ops->detach(link); else ret = -EOPNOTSUPP; bpf_link_put_direct(link); return ret; } struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link) { return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 
link : ERR_PTR(-ENOENT); } EXPORT_SYMBOL(bpf_link_inc_not_zero); struct bpf_link *bpf_link_by_id(u32 id) { struct bpf_link *link; if (!id) return ERR_PTR(-ENOENT); spin_lock_bh(&link_idr_lock); /* before link is "settled", ID is 0, pretend it doesn't exist yet */ link = idr_find(&link_idr, id); if (link) { if (link->id) link = bpf_link_inc_not_zero(link); else link = ERR_PTR(-EAGAIN); } else { link = ERR_PTR(-ENOENT); } spin_unlock_bh(&link_idr_lock); return link; } struct bpf_link *bpf_link_get_curr_or_next(u32 *id) { struct bpf_link *link; spin_lock_bh(&link_idr_lock); again: link = idr_get_next(&link_idr, id); if (link) { link = bpf_link_inc_not_zero(link); if (IS_ERR(link)) { (*id)++; goto again; } } spin_unlock_bh(&link_idr_lock); return link; } #define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id static int bpf_link_get_fd_by_id(const union bpf_attr *attr) { struct bpf_link *link; u32 id = attr->link_id; int fd; if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID)) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; link = bpf_link_by_id(id); if (IS_ERR(link)) return PTR_ERR(link); fd = bpf_link_new_fd(link); if (fd < 0) bpf_link_put_direct(link); return fd; } DEFINE_MUTEX(bpf_stats_enabled_mutex); static int bpf_stats_release(struct inode *inode, struct file *file) { mutex_lock(&bpf_stats_enabled_mutex); static_key_slow_dec(&bpf_stats_enabled_key.key); mutex_unlock(&bpf_stats_enabled_mutex); return 0; } static const struct file_operations bpf_stats_fops = { .release = bpf_stats_release, }; static int bpf_enable_runtime_stats(void) { int fd; mutex_lock(&bpf_stats_enabled_mutex); /* Set a very high limit to avoid overflow */ if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) { mutex_unlock(&bpf_stats_enabled_mutex); return -EBUSY; } fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC); if (fd >= 0) static_key_slow_inc(&bpf_stats_enabled_key.key); mutex_unlock(&bpf_stats_enabled_mutex); return fd; } #define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type static int bpf_enable_stats(union bpf_attr *attr) { if (CHECK_ATTR(BPF_ENABLE_STATS)) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; switch (attr->enable_stats.type) { case BPF_STATS_RUN_TIME: return bpf_enable_runtime_stats(); default: break; } return -EINVAL; } #define BPF_ITER_CREATE_LAST_FIELD iter_create.flags static int bpf_iter_create(union bpf_attr *attr) { struct bpf_link *link; int err; if (CHECK_ATTR(BPF_ITER_CREATE)) return -EINVAL; if (attr->iter_create.flags) return -EINVAL; link = bpf_link_get_from_fd(attr->iter_create.link_fd); if (IS_ERR(link)) return PTR_ERR(link); err = bpf_iter_new_fd(link); bpf_link_put_direct(link); return err; } #define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags static int bpf_prog_bind_map(union bpf_attr *attr) { struct bpf_prog *prog; struct bpf_map *map; struct bpf_map **used_maps_old, **used_maps_new; int i, ret = 0; if (CHECK_ATTR(BPF_PROG_BIND_MAP)) return -EINVAL; if (attr->prog_bind_map.flags) return -EINVAL; prog = bpf_prog_get(attr->prog_bind_map.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); map = bpf_map_get(attr->prog_bind_map.map_fd); if (IS_ERR(map)) { ret = PTR_ERR(map); goto out_prog_put; } mutex_lock(&prog->aux->used_maps_mutex); used_maps_old = prog->aux->used_maps; for (i = 0; i < prog->aux->used_map_cnt; i++) if (used_maps_old[i] == map) { bpf_map_put(map); goto out_unlock; } used_maps_new = kmalloc_objs(used_maps_new[0], prog->aux->used_map_cnt + 1); if (!used_maps_new) { ret = -ENOMEM; goto out_unlock; } /* The bpf program will not 
access the bpf map, but for the sake of * simplicity, increase sleepable_refcnt for sleepable program as well. */ if (prog->sleepable) atomic64_inc(&map->sleepable_refcnt); memcpy(used_maps_new, used_maps_old, sizeof(used_maps_old[0]) * prog->aux->used_map_cnt); used_maps_new[prog->aux->used_map_cnt] = map; prog->aux->used_map_cnt++; prog->aux->used_maps = used_maps_new; kfree(used_maps_old); out_unlock: mutex_unlock(&prog->aux->used_maps_mutex); if (ret) bpf_map_put(map); out_prog_put: bpf_prog_put(prog); return ret; } #define BPF_TOKEN_CREATE_LAST_FIELD token_create.bpffs_fd static int token_create(union bpf_attr *attr) { if (CHECK_ATTR(BPF_TOKEN_CREATE)) return -EINVAL; /* no flags are supported yet */ if (attr->token_create.flags) return -EINVAL; return bpf_token_create(attr); } #define BPF_PROG_STREAM_READ_BY_FD_LAST_FIELD prog_stream_read.prog_fd static int prog_stream_read(union bpf_attr *attr) { char __user *buf = u64_to_user_ptr(attr->prog_stream_read.stream_buf); u32 len = attr->prog_stream_read.stream_buf_len; struct bpf_prog *prog; int ret; if (CHECK_ATTR(BPF_PROG_STREAM_READ_BY_FD)) return -EINVAL; prog = bpf_prog_get(attr->prog_stream_read.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); ret = bpf_prog_stream_read(prog, attr->prog_stream_read.stream_id, buf, len); bpf_prog_put(prog); return ret; } #define BPF_PROG_ASSOC_STRUCT_OPS_LAST_FIELD prog_assoc_struct_ops.prog_fd static int prog_assoc_struct_ops(union bpf_attr *attr) { struct bpf_prog *prog; struct bpf_map *map; int ret; if (CHECK_ATTR(BPF_PROG_ASSOC_STRUCT_OPS)) return -EINVAL; if (attr->prog_assoc_struct_ops.flags) return -EINVAL; prog = bpf_prog_get(attr->prog_assoc_struct_ops.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) { ret = -EINVAL; goto put_prog; } map = bpf_map_get(attr->prog_assoc_struct_ops.map_fd); if (IS_ERR(map)) { ret = PTR_ERR(map); goto put_prog; } if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) { ret = -EINVAL; goto put_map; } ret = bpf_prog_assoc_struct_ops(prog, map); put_map: bpf_map_put(map); put_prog: bpf_prog_put(prog); return ret; } static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; int err; err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); if (err) return err; size = min_t(u32, size, sizeof(attr)); /* copy attributes from user space, may be less than sizeof(bpf_attr) */ memset(&attr, 0, sizeof(attr)); if (copy_from_bpfptr(&attr, uattr, size) != 0) return -EFAULT; err = security_bpf(cmd, &attr, size, uattr.is_kernel); if (err < 0) return err; switch (cmd) { case BPF_MAP_CREATE: err = map_create(&attr, uattr); break; case BPF_MAP_LOOKUP_ELEM: err = map_lookup_elem(&attr); break; case BPF_MAP_UPDATE_ELEM: err = map_update_elem(&attr, uattr); break; case BPF_MAP_DELETE_ELEM: err = map_delete_elem(&attr, uattr); break; case BPF_MAP_GET_NEXT_KEY: err = map_get_next_key(&attr); break; case BPF_MAP_FREEZE: err = map_freeze(&attr); break; case BPF_PROG_LOAD: err = bpf_prog_load(&attr, uattr, size); break; case BPF_OBJ_PIN: err = bpf_obj_pin(&attr); break; case BPF_OBJ_GET: err = bpf_obj_get(&attr); break; case BPF_PROG_ATTACH: err = bpf_prog_attach(&attr); break; case BPF_PROG_DETACH: err = bpf_prog_detach(&attr); break; case BPF_PROG_QUERY: err = bpf_prog_query(&attr, uattr.user); break; case BPF_PROG_TEST_RUN: err = bpf_prog_test_run(&attr, uattr.user); break; case BPF_PROG_GET_NEXT_ID: err = bpf_obj_get_next_id(&attr, uattr.user, &prog_idr, &prog_idr_lock); break; case BPF_MAP_GET_NEXT_ID: err = 
bpf_obj_get_next_id(&attr, uattr.user, &map_idr, &map_idr_lock); break; case BPF_BTF_GET_NEXT_ID: err = bpf_obj_get_next_id(&attr, uattr.user, &btf_idr, &btf_idr_lock); break; case BPF_PROG_GET_FD_BY_ID: err = bpf_prog_get_fd_by_id(&attr); break; case BPF_MAP_GET_FD_BY_ID: err = bpf_map_get_fd_by_id(&attr); break; case BPF_OBJ_GET_INFO_BY_FD: err = bpf_obj_get_info_by_fd(&attr, uattr.user); break; case BPF_RAW_TRACEPOINT_OPEN: err = bpf_raw_tracepoint_open(&attr); break; case BPF_BTF_LOAD: err = bpf_btf_load(&attr, uattr, size); break; case BPF_BTF_GET_FD_BY_ID: err = bpf_btf_get_fd_by_id(&attr); break; case BPF_TASK_FD_QUERY: err = bpf_task_fd_query(&attr, uattr.user); break; case BPF_MAP_LOOKUP_AND_DELETE_ELEM: err = map_lookup_and_delete_elem(&attr); break; case BPF_MAP_LOOKUP_BATCH: err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_BATCH); break; case BPF_MAP_LOOKUP_AND_DELETE_BATCH: err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_AND_DELETE_BATCH); break; case BPF_MAP_UPDATE_BATCH: err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_UPDATE_BATCH); break; case BPF_MAP_DELETE_BATCH: err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_DELETE_BATCH); break; case BPF_LINK_CREATE: err = link_create(&attr, uattr); break; case BPF_LINK_UPDATE: err = link_update(&attr); break; case BPF_LINK_GET_FD_BY_ID: err = bpf_link_get_fd_by_id(&attr); break; case BPF_LINK_GET_NEXT_ID: err = bpf_obj_get_next_id(&attr, uattr.user, &link_idr, &link_idr_lock); break; case BPF_ENABLE_STATS: err = bpf_enable_stats(&attr); break; case BPF_ITER_CREATE: err = bpf_iter_create(&attr); break; case BPF_LINK_DETACH: err = link_detach(&attr); break; case BPF_PROG_BIND_MAP: err = bpf_prog_bind_map(&attr); break; case BPF_TOKEN_CREATE: err = token_create(&attr); break; case BPF_PROG_STREAM_READ_BY_FD: err = prog_stream_read(&attr); break; case BPF_PROG_ASSOC_STRUCT_OPS: err = prog_assoc_struct_ops(&attr); break; default: err = -EINVAL; break; } return err; } SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { return __sys_bpf(cmd, USER_BPFPTR(uattr), size); } static bool syscall_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < 0 || off >= U16_MAX) return false; /* No alignment requirements for syscall ctx accesses. */ return true; } BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) { switch (cmd) { case BPF_MAP_CREATE: case BPF_MAP_DELETE_ELEM: case BPF_MAP_UPDATE_ELEM: case BPF_MAP_FREEZE: case BPF_MAP_GET_FD_BY_ID: case BPF_PROG_LOAD: case BPF_BTF_LOAD: case BPF_LINK_CREATE: case BPF_RAW_TRACEPOINT_OPEN: break; default: return -EINVAL; } return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size); } /* To shut up -Wmissing-prototypes. * This function is used by the kernel light skeleton * to load bpf programs when modules are loaded or during kernel boot. 
* See tools/lib/bpf/skel_internal.h */ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) { struct bpf_prog * __maybe_unused prog; struct bpf_tramp_run_ctx __maybe_unused run_ctx; switch (cmd) { #ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */ case BPF_PROG_TEST_RUN: if (attr->test.data_in || attr->test.data_out || attr->test.ctx_out || attr->test.duration || attr->test.repeat || attr->test.flags) return -EINVAL; prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL); if (IS_ERR(prog)) return PTR_ERR(prog); if (attr->test.ctx_size_in < prog->aux->max_ctx_offset || attr->test.ctx_size_in > U16_MAX) { bpf_prog_put(prog); return -EINVAL; } run_ctx.bpf_cookie = 0; if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) { /* recursion detected */ __bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx); bpf_prog_put(prog); return -EBUSY; } attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in); __bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */, &run_ctx); bpf_prog_put(prog); return 0; #endif default: return ____bpf_sys_bpf(cmd, attr, size); } } EXPORT_SYMBOL_NS(kern_sys_bpf, "BPF_INTERNAL"); static const struct bpf_func_proto bpf_sys_bpf_proto = { .func = bpf_sys_bpf, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; const struct bpf_func_proto * __weak tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { return bpf_base_func_proto(func_id, prog); } BPF_CALL_1(bpf_sys_close, u32, fd) { /* When bpf program calls this helper there should not be * an fdget() without matching completed fdput(). * This helper is allowed in the following callchain only: * sys_bpf->prog_test_run->bpf_prog->bpf_sys_close */ return close_fd(fd); } static const struct bpf_func_proto bpf_sys_close_proto = { .func = bpf_sys_close, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res) { *res = 0; if (flags) return -EINVAL; if (name_sz <= 1 || name[name_sz - 1]) return -EINVAL; if (!bpf_dump_raw_ok(current_cred())) return -EPERM; *res = kallsyms_lookup_name(name); return *res ? 0 : -ENOENT; } static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { .func = bpf_kallsyms_lookup_name, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = sizeof(u64), }; static const struct bpf_func_proto * syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_sys_bpf: return !bpf_token_capable(prog->aux->token, CAP_PERFMON) ? 
NULL : &bpf_sys_bpf_proto; case BPF_FUNC_btf_find_by_name_kind: return &bpf_btf_find_by_name_kind_proto; case BPF_FUNC_sys_close: return &bpf_sys_close_proto; case BPF_FUNC_kallsyms_lookup_name: return &bpf_kallsyms_lookup_name_proto; default: return tracing_prog_func_proto(func_id, prog); } } const struct bpf_verifier_ops bpf_syscall_verifier_ops = { .get_func_proto = syscall_prog_func_proto, .is_valid_access = syscall_prog_is_valid_access, }; const struct bpf_prog_ops bpf_syscall_prog_ops = { .test_run = bpf_prog_test_run_syscall, }; #ifdef CONFIG_SYSCTL static int bpf_stats_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct static_key *key = (struct static_key *)table->data; static int saved_val; int val, ret; struct ctl_table tmp = { .data = &val, .maxlen = sizeof(val), .mode = table->mode, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; mutex_lock(&bpf_stats_enabled_mutex); val = saved_val; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && !ret && val != saved_val) { if (val) static_key_slow_inc(key); else static_key_slow_dec(key); saved_val = val; } mutex_unlock(&bpf_stats_enabled_mutex); return ret; } void __weak unpriv_ebpf_notify(int new_state) { } static int bpf_unpriv_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret, unpriv_enable = *(int *)table->data; bool locked_state = unpriv_enable == 1; struct ctl_table tmp = *table; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; tmp.data = &unpriv_enable; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && !ret) { if (locked_state && unpriv_enable != 1) return -EPERM; *(int *)table->data = unpriv_enable; } if (write) unpriv_ebpf_notify(unpriv_enable); return ret; } static const struct ctl_table bpf_syscall_table[] = { { .procname = "unprivileged_bpf_disabled", .data = &sysctl_unprivileged_bpf_disabled, .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), .mode = 0644, .proc_handler = bpf_unpriv_handler, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "bpf_stats_enabled", .data = &bpf_stats_enabled_key.key, .mode = 0644, .proc_handler = bpf_stats_handler, }, }; static int __init bpf_syscall_sysctl_init(void) { register_sysctl_init("kernel", bpf_syscall_table); return 0; } late_initcall(bpf_syscall_sysctl_init); #endif /* CONFIG_SYSCTL */ |
| 52 46 52 48 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 
// SPDX-License-Identifier: GPL-2.0-or-later /* Common capabilities, needed by capability.o.
*/ #include <linux/capability.h> #include <linux/audit.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/lsm_hooks.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/ptrace.h> #include <linux/xattr.h> #include <linux/hugetlb.h> #include <linux/mount.h> #include <linux/sched.h> #include <linux/prctl.h> #include <linux/securebits.h> #include <linux/user_namespace.h> #include <linux/binfmts.h> #include <linux/personality.h> #include <linux/mnt_idmapping.h> #include <uapi/linux/lsm.h> #define CREATE_TRACE_POINTS #include <trace/events/capability.h> /* * If a non-root user executes a setuid-root binary in * !secure(SECURE_NOROOT) mode, then we raise capabilities. * However if fE is also set, then the intent is for only * the file capabilities to be applied, and the setuid-root * bit is left on either to change the uid (plausible) or * to get full privilege on a kernel without file capabilities * support. So in that case we do not raise capabilities. * * Warn if that happens, once per boot. */ static void warn_setuid_and_fcaps_mixed(const char *fname) { static int warned; if (!warned) { printk(KERN_INFO "warning: `%s' has both setuid-root and" " effective capabilities. Therefore not raising all" " capabilities.\n", fname); warned = 1; } } /** * cap_capable_helper - Determine whether a task has a particular effective * capability. * @cred: The credentials to use * @target_ns: The user namespace of the resource being accessed * @cred_ns: The user namespace of the credentials * @cap: The capability to check for * * Determine whether the nominated task has the specified capability amongst * its effective set, returning 0 if it does, -ve if it does not. * * See cap_capable for more details. */ static inline int cap_capable_helper(const struct cred *cred, struct user_namespace *target_ns, const struct user_namespace *cred_ns, int cap) { struct user_namespace *ns = target_ns; /* See if cred has the capability in the target user namespace * by examining the target user namespace and all of the target * user namespace's parents. */ for (;;) { /* Do we have the necessary capabilities? */ if (likely(ns == cred_ns)) return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM; /* * If we're already at a lower level than we're looking for, * we're done searching. */ if (ns->level <= cred_ns->level) return -EPERM; /* * The owner of the user namespace in the parent of the * user namespace has all caps. */ if ((ns->parent == cred_ns) && uid_eq(ns->owner, cred->euid)) return 0; /* * If you have a capability in a parent user ns, then you have * it over all children user namespaces as well. */ ns = ns->parent; } /* We never get here */ } /** * cap_capable - Determine whether a task has a particular effective capability * @cred: The credentials to use * @target_ns: The user namespace of the resource being accessed * @cap: The capability to check for * @opts: Bitmask of options defined in include/linux/security.h (unused) * * Determine whether the nominated task has the specified capability amongst * its effective set, returning 0 if it does, -ve if it does not. * * NOTE WELL: cap_capable() has reverse semantics to the capable() call * and friends. 
That is cap_capable() returns an int 0 when a task has * a capability, while the kernel's capable(), has_ns_capability(), * has_ns_capability_noaudit(), and has_capability_noaudit() return a * bool true (1) for this case. */ int cap_capable(const struct cred *cred, struct user_namespace *target_ns, int cap, unsigned int opts) { const struct user_namespace *cred_ns = cred->user_ns; int ret = cap_capable_helper(cred, target_ns, cred_ns, cap); trace_cap_capable(cred, target_ns, cred_ns, cap, ret); return ret; } /** * cap_settime - Determine whether the current process may set the system clock * @ts: The time to set * @tz: The timezone to set * * Determine whether the current process may set the system clock and timezone * information, returning 0 if permission granted, -ve if denied. */ int cap_settime(const struct timespec64 *ts, const struct timezone *tz) { if (!capable(CAP_SYS_TIME)) return -EPERM; return 0; } /** * cap_ptrace_access_check - Determine whether the current process may access * another * @child: The process to be accessed * @mode: The mode of attachment. * * If we are in the same or an ancestor user_ns and have all the target * task's capabilities, then ptrace access is allowed. * If we have the ptrace capability to the target user_ns, then ptrace * access is allowed. * Else denied. * * Determine whether a process may access another, returning 0 if permission * granted, -ve if denied. */ int cap_ptrace_access_check(struct task_struct *child, unsigned int mode) { int ret = 0; const struct cred *cred, *child_cred; const kernel_cap_t *caller_caps; rcu_read_lock(); cred = current_cred(); child_cred = __task_cred(child); if (mode & PTRACE_MODE_FSCREDS) caller_caps = &cred->cap_effective; else caller_caps = &cred->cap_permitted; if (cred->user_ns == child_cred->user_ns && cap_issubset(child_cred->cap_permitted, *caller_caps)) goto out; if (ns_capable(child_cred->user_ns, CAP_SYS_PTRACE)) goto out; ret = -EPERM; out: rcu_read_unlock(); return ret; } /** * cap_ptrace_traceme - Determine whether another process may trace the current * @parent: The task proposed to be the tracer * * If parent is in the same or an ancestor user_ns and has all current's * capabilities, then ptrace access is allowed. * If parent has the ptrace capability to current's user_ns, then ptrace * access is allowed. * Else denied. * * Determine whether the nominated task is permitted to trace the current * process, returning 0 if permission is granted, -ve if denied. */ int cap_ptrace_traceme(struct task_struct *parent) { int ret = 0; const struct cred *cred, *child_cred; rcu_read_lock(); cred = __task_cred(parent); child_cred = current_cred(); if (cred->user_ns == child_cred->user_ns && cap_issubset(child_cred->cap_permitted, cred->cap_permitted)) goto out; if (has_ns_capability(parent, child_cred->user_ns, CAP_SYS_PTRACE)) goto out; ret = -EPERM; out: rcu_read_unlock(); return ret; } /** * cap_capget - Retrieve a task's capability sets * @target: The task from which to retrieve the capability sets * @effective: The place to record the effective set * @inheritable: The place to record the inheritable set * @permitted: The place to record the permitted set * * This function retrieves the capabilities of the nominated task and returns * them to the caller. */ int cap_capget(const struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { const struct cred *cred; /* Derived from kernel/capability.c:sys_capget. 
*/ rcu_read_lock(); cred = __task_cred(target); *effective = cred->cap_effective; *inheritable = cred->cap_inheritable; *permitted = cred->cap_permitted; rcu_read_unlock(); return 0; } /* * Determine whether the inheritable capabilities are limited to the old * permitted set. Returns 1 if they are limited, 0 if they are not. */ static inline int cap_inh_is_capped(void) { /* they are so limited unless the current task has the CAP_SETPCAP * capability */ if (cap_capable(current_cred(), current_cred()->user_ns, CAP_SETPCAP, CAP_OPT_NONE) == 0) return 0; return 1; } /** * cap_capset - Validate and apply proposed changes to current's capabilities * @new: The proposed new credentials; alterations should be made here * @old: The current task's current credentials * @effective: A pointer to the proposed new effective capabilities set * @inheritable: A pointer to the proposed new inheritable capabilities set * @permitted: A pointer to the proposed new permitted capabilities set * * This function validates and applies a proposed mass change to the current * process's capability sets. The changes are made to the proposed new * credentials, and assuming no error, will be committed by the caller of LSM. */ int cap_capset(struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted) { if (cap_inh_is_capped() && !cap_issubset(*inheritable, cap_combine(old->cap_inheritable, old->cap_permitted))) /* incapable of using this inheritable set */ return -EPERM; if (!cap_issubset(*inheritable, cap_combine(old->cap_inheritable, old->cap_bset))) /* no new pI capabilities outside bounding set */ return -EPERM; /* verify restrictions on target's new Permitted set */ if (!cap_issubset(*permitted, old->cap_permitted)) return -EPERM; /* verify the _new_Effective_ is a subset of the _new_Permitted_ */ if (!cap_issubset(*effective, *permitted)) return -EPERM; new->cap_effective = *effective; new->cap_inheritable = *inheritable; new->cap_permitted = *permitted; /* * Mask off ambient bits that are no longer both permitted and * inheritable. */ new->cap_ambient = cap_intersect(new->cap_ambient, cap_intersect(*permitted, *inheritable)); if (WARN_ON(!cap_ambient_invariant_ok(new))) return -EINVAL; return 0; } /** * cap_inode_need_killpriv - Determine if inode change affects privileges * @dentry: The inode/dentry in being changed with change marked ATTR_KILL_PRIV * * Determine if an inode having a change applied that's marked ATTR_KILL_PRIV * affects the security markings on that inode, and if it is, should * inode_killpriv() be invoked or the change rejected. * * Return: 1 if security.capability has a value, meaning inode_killpriv() * is required, 0 otherwise, meaning inode_killpriv() is not required. */ int cap_inode_need_killpriv(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); int error; error = __vfs_getxattr(dentry, inode, XATTR_NAME_CAPS, NULL, 0); return error > 0; } /** * cap_inode_killpriv - Erase the security markings on an inode * * @idmap: idmap of the mount the inode was found from * @dentry: The inode/dentry to alter * * Erase the privilege-enhancing security markings on an inode. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. 
* * Return: 0 if successful, -ve on error. */ int cap_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry) { int error; error = __vfs_removexattr(idmap, dentry, XATTR_NAME_CAPS); if (error == -EOPNOTSUPP) error = 0; return error; } /** * kuid_root_in_ns - check whether the given kuid is root in the given ns * @kuid: the kuid to be tested * @ns: the user namespace to test against * * Returns true if @kuid represents the root user in @ns, false otherwise. */ static bool kuid_root_in_ns(kuid_t kuid, struct user_namespace *ns) { for (;; ns = ns->parent) { if (from_kuid(ns, kuid) == 0) return true; if (ns == &init_user_ns) break; } return false; } static bool vfsuid_root_in_currentns(vfsuid_t vfsuid) { kuid_t kuid; if (!vfsuid_valid(vfsuid)) return false; kuid = vfsuid_into_kuid(vfsuid); return kuid_root_in_ns(kuid, current_user_ns()); } static __u32 sansflags(__u32 m) { return m & ~VFS_CAP_FLAGS_EFFECTIVE; } static bool is_v2header(int size, const struct vfs_cap_data *cap) { if (size != XATTR_CAPS_SZ_2) return false; return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_2; } static bool is_v3header(int size, const struct vfs_cap_data *cap) { if (size != XATTR_CAPS_SZ_3) return false; return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3; } /* * getsecurity: We are called for security.* before any attempt to read the * xattr from the inode itself. * * This gives us a chance to read the on-disk value and convert it. If we * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler. * * Note we are not called by vfs_getxattr_alloc(), but that is only called * by the integrity subsystem, which really wants the unconverted values - * so that's good. */ int cap_inode_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc) { int size; kuid_t kroot; vfsuid_t vfsroot; u32 nsmagic, magic; uid_t root, mappedroot; char *tmpbuf = NULL; struct vfs_cap_data *cap; struct vfs_ns_cap_data *nscap = NULL; struct dentry *dentry; struct user_namespace *fs_ns; if (strcmp(name, "capability") != 0) return -EOPNOTSUPP; dentry = d_find_any_alias(inode); if (!dentry) return -EINVAL; size = vfs_getxattr_alloc(idmap, dentry, XATTR_NAME_CAPS, &tmpbuf, sizeof(struct vfs_ns_cap_data), GFP_NOFS); dput(dentry); /* gcc11 complains if we don't check for !tmpbuf */ if (size < 0 || !tmpbuf) goto out_free; fs_ns = inode->i_sb->s_user_ns; cap = (struct vfs_cap_data *) tmpbuf; if (is_v2header(size, cap)) { root = 0; } else if (is_v3header(size, cap)) { nscap = (struct vfs_ns_cap_data *) tmpbuf; root = le32_to_cpu(nscap->rootid); } else { size = -EINVAL; goto out_free; } kroot = make_kuid(fs_ns, root); /* If this is an idmapped mount shift the kuid. */ vfsroot = make_vfsuid(idmap, fs_ns, kroot); /* If the root kuid maps to a valid uid in current ns, then return * this as a nscap. 
*/ mappedroot = from_kuid(current_user_ns(), vfsuid_into_kuid(vfsroot)); if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) { size = sizeof(struct vfs_ns_cap_data); if (alloc) { if (!nscap) { /* v2 -> v3 conversion */ nscap = kzalloc(size, GFP_ATOMIC); if (!nscap) { size = -ENOMEM; goto out_free; } nsmagic = VFS_CAP_REVISION_3; magic = le32_to_cpu(cap->magic_etc); if (magic & VFS_CAP_FLAGS_EFFECTIVE) nsmagic |= VFS_CAP_FLAGS_EFFECTIVE; memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32); nscap->magic_etc = cpu_to_le32(nsmagic); } else { /* use allocated v3 buffer */ tmpbuf = NULL; } nscap->rootid = cpu_to_le32(mappedroot); *buffer = nscap; } goto out_free; } if (!vfsuid_root_in_currentns(vfsroot)) { size = -EOVERFLOW; goto out_free; } /* This comes from a parent namespace. Return as a v2 capability */ size = sizeof(struct vfs_cap_data); if (alloc) { if (nscap) { /* v3 -> v2 conversion */ cap = kzalloc(size, GFP_ATOMIC); if (!cap) { size = -ENOMEM; goto out_free; } magic = VFS_CAP_REVISION_2; nsmagic = le32_to_cpu(nscap->magic_etc); if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE) magic |= VFS_CAP_FLAGS_EFFECTIVE; memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32); cap->magic_etc = cpu_to_le32(magic); } else { /* use unconverted v2 */ tmpbuf = NULL; } *buffer = cap; } out_free: kfree(tmpbuf); return size; } /** * rootid_from_xattr - translate root uid of vfs caps * * @value: vfs caps value which may be modified by this function * @size: size of @ivalue * @task_ns: user namespace of the caller */ static vfsuid_t rootid_from_xattr(const void *value, size_t size, struct user_namespace *task_ns) { const struct vfs_ns_cap_data *nscap = value; uid_t rootid = 0; if (size == XATTR_CAPS_SZ_3) rootid = le32_to_cpu(nscap->rootid); return VFSUIDT_INIT(make_kuid(task_ns, rootid)); } static bool validheader(size_t size, const struct vfs_cap_data *cap) { return is_v2header(size, cap) || is_v3header(size, cap); } /** * cap_convert_nscap - check vfs caps * * @idmap: idmap of the mount the inode was found from * @dentry: used to retrieve inode to check permissions on * @ivalue: vfs caps value which may be modified by this function * @size: size of @ivalue * * User requested a write of security.capability. If needed, update the * xattr to change from v2 to v3, or to fixup the v3 rootid. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. * * Return: On success, return the new size; on error, return < 0. 
*/ int cap_convert_nscap(struct mnt_idmap *idmap, struct dentry *dentry, const void **ivalue, size_t size) { struct vfs_ns_cap_data *nscap; uid_t nsrootid; const struct vfs_cap_data *cap = *ivalue; __u32 magic, nsmagic; struct inode *inode = d_backing_inode(dentry); struct user_namespace *task_ns = current_user_ns(), *fs_ns = inode->i_sb->s_user_ns; kuid_t rootid; vfsuid_t vfsrootid; size_t newsize; if (!*ivalue) return -EINVAL; if (!validheader(size, cap)) return -EINVAL; if (!capable_wrt_inode_uidgid(idmap, inode, CAP_SETFCAP)) return -EPERM; if (size == XATTR_CAPS_SZ_2 && (idmap == &nop_mnt_idmap)) if (ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP)) /* user is privileged, just write the v2 */ return size; vfsrootid = rootid_from_xattr(*ivalue, size, task_ns); if (!vfsuid_valid(vfsrootid)) return -EINVAL; rootid = from_vfsuid(idmap, fs_ns, vfsrootid); if (!uid_valid(rootid)) return -EINVAL; nsrootid = from_kuid(fs_ns, rootid); if (nsrootid == -1) return -EINVAL; newsize = sizeof(struct vfs_ns_cap_data); nscap = kmalloc(newsize, GFP_ATOMIC); if (!nscap) return -ENOMEM; nscap->rootid = cpu_to_le32(nsrootid); nsmagic = VFS_CAP_REVISION_3; magic = le32_to_cpu(cap->magic_etc); if (magic & VFS_CAP_FLAGS_EFFECTIVE) nsmagic |= VFS_CAP_FLAGS_EFFECTIVE; nscap->magic_etc = cpu_to_le32(nsmagic); memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32); *ivalue = nscap; return newsize; } /* * Calculate the new process capability sets from the capability sets attached * to a file. */ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps, struct linux_binprm *bprm, bool *effective, bool *has_fcap) { struct cred *new = bprm->cred; int ret = 0; if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE) *effective = true; if (caps->magic_etc & VFS_CAP_REVISION_MASK) *has_fcap = true; /* * pP' = (X & fP) | (pI & fI) * The addition of pA' is handled later. */ new->cap_permitted.val = (new->cap_bset.val & caps->permitted.val) | (new->cap_inheritable.val & caps->inheritable.val); if (caps->permitted.val & ~new->cap_permitted.val) /* insufficient to execute correctly */ ret = -EPERM; /* * For legacy apps, with no internal support for recognizing they * do not have enough capabilities, we return an error if they are * missing some "forced" (aka file-permitted) capabilities. */ return *effective ? ret : 0; } /** * get_vfs_caps_from_disk - retrieve vfs caps from disk * * @idmap: idmap of the mount the inode was found from * @dentry: dentry from which @inode is retrieved * @cpu_caps: vfs capabilities * * Extract the on-exec-apply capability sets for an executable file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. 
*/ int get_vfs_caps_from_disk(struct mnt_idmap *idmap, const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps) { struct inode *inode = d_backing_inode(dentry); __u32 magic_etc; int size; struct vfs_ns_cap_data data, *nscaps = &data; struct vfs_cap_data *caps = (struct vfs_cap_data *) &data; kuid_t rootkuid; vfsuid_t rootvfsuid; struct user_namespace *fs_ns; memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data)); if (!inode) return -ENODATA; fs_ns = inode->i_sb->s_user_ns; size = __vfs_getxattr((struct dentry *)dentry, inode, XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ); if (size == -ENODATA || size == -EOPNOTSUPP) /* no data, that's ok */ return -ENODATA; if (size < 0) return size; if (size < sizeof(magic_etc)) return -EINVAL; cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc); rootkuid = make_kuid(fs_ns, 0); switch (magic_etc & VFS_CAP_REVISION_MASK) { case VFS_CAP_REVISION_1: if (size != XATTR_CAPS_SZ_1) return -EINVAL; break; case VFS_CAP_REVISION_2: if (size != XATTR_CAPS_SZ_2) return -EINVAL; break; case VFS_CAP_REVISION_3: if (size != XATTR_CAPS_SZ_3) return -EINVAL; rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid)); break; default: return -EINVAL; } rootvfsuid = make_vfsuid(idmap, fs_ns, rootkuid); if (!vfsuid_valid(rootvfsuid)) return -ENODATA; /* Limit the caps to the mounter of the filesystem * or the more limited uid specified in the xattr. */ if (!vfsuid_root_in_currentns(rootvfsuid)) return -ENODATA; cpu_caps->permitted.val = le32_to_cpu(caps->data[0].permitted); cpu_caps->inheritable.val = le32_to_cpu(caps->data[0].inheritable); /* * Rev1 had just a single 32-bit word, later expanded * to a second one for the high bits */ if ((magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_1) { cpu_caps->permitted.val += (u64)le32_to_cpu(caps->data[1].permitted) << 32; cpu_caps->inheritable.val += (u64)le32_to_cpu(caps->data[1].inheritable) << 32; } cpu_caps->permitted.val &= CAP_VALID_MASK; cpu_caps->inheritable.val &= CAP_VALID_MASK; cpu_caps->rootid = vfsuid_into_kuid(rootvfsuid); return 0; } /* * Attempt to get the on-exec apply capability sets for an executable file from * its xattrs and, if present, apply them to the proposed credentials being * constructed by execve(). */ static int get_file_caps(struct linux_binprm *bprm, const struct file *file, bool *effective, bool *has_fcap) { int rc = 0; struct cpu_vfs_cap_data vcaps; cap_clear(bprm->cred->cap_permitted); if (!file_caps_enabled) return 0; if (!mnt_may_suid(file->f_path.mnt)) return 0; /* * This check is redundant with mnt_may_suid() but is kept to make * explicit that capability bits are limited to s_user_ns and its * descendants. 
*/ if (!current_in_userns(file->f_path.mnt->mnt_sb->s_user_ns)) return 0; rc = get_vfs_caps_from_disk(file_mnt_idmap(file), file->f_path.dentry, &vcaps); if (rc < 0) { if (rc == -EINVAL) printk(KERN_NOTICE "Invalid argument reading file caps for %s\n", bprm->filename); else if (rc == -ENODATA) rc = 0; goto out; } rc = bprm_caps_from_vfs_caps(&vcaps, bprm, effective, has_fcap); out: if (rc) cap_clear(bprm->cred->cap_permitted); return rc; } static inline bool root_privileged(void) { return !issecure(SECURE_NOROOT); } static inline bool __is_real(kuid_t uid, struct cred *cred) { return uid_eq(cred->uid, uid); } static inline bool __is_eff(kuid_t uid, struct cred *cred) { return uid_eq(cred->euid, uid); } static inline bool __is_suid(kuid_t uid, struct cred *cred) { return !__is_real(uid, cred) && __is_eff(uid, cred); } /* * handle_privileged_root - Handle case of privileged root * @bprm: The execution parameters, including the proposed creds * @has_fcap: Are any file capabilities set? * @effective: Do we have effective root privilege? * @root_uid: This namespace' root UID WRT initial USER namespace * * Handle the case where root is privileged and hasn't been neutered by * SECURE_NOROOT. If file capabilities are set, they won't be combined with * set UID root and nothing is changed. If we are root, cap_permitted is * updated. If we have become set UID root, the effective bit is set. */ static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap, bool *effective, kuid_t root_uid) { const struct cred *old = current_cred(); struct cred *new = bprm->cred; if (!root_privileged()) return; /* * If the legacy file capability is set, then don't set privs * for a setuid root binary run by a non-root user. Do set it * for a root user just to cause least surprise to an admin. */ if (has_fcap && __is_suid(root_uid, new)) { warn_setuid_and_fcaps_mixed(bprm->filename); return; } /* * To support inheritance of root-permissions and suid-root * executables under compatibility mode, we override the * capability sets for the file. */ if (__is_eff(root_uid, new) || __is_real(root_uid, new)) { /* pP' = (cap_bset & ~0) | (pI & ~0) */ new->cap_permitted = cap_combine(old->cap_bset, old->cap_inheritable); } /* * If only the real uid is 0, we do not set the effective bit. */ if (__is_eff(root_uid, new)) *effective = true; } #define __cap_gained(field, target, source) \ !cap_issubset(target->cap_##field, source->cap_##field) #define __cap_grew(target, source, cred) \ !cap_issubset(cred->cap_##target, cred->cap_##source) #define __cap_full(field, cred) \ cap_issubset(CAP_FULL_SET, cred->cap_##field) /* * 1) Audit candidate if current->cap_effective is set * * We do not bother to audit if 3 things are true: * 1) cap_effective has all caps * 2) we became root *OR* are were already root * 3) root is supposed to have all caps (SECURE_NOROOT) * Since this is just a normal root execing a process. * * Number 1 above might fail if you don't have a full bset, but I think * that is interesting information to audit. 
* * A number of other conditions require logging: * 2) something prevented setuid root getting all caps * 3) non-setuid root gets fcaps * 4) non-setuid root gets ambient */ static inline bool nonroot_raised_pE(struct cred *new, const struct cred *old, kuid_t root, bool has_fcap) { bool ret = false; if ((__cap_grew(effective, ambient, new) && !(__cap_full(effective, new) && (__is_eff(root, new) || __is_real(root, new)) && root_privileged())) || (root_privileged() && __is_suid(root, new) && !__cap_full(effective, new)) || (uid_eq(new->euid, old->euid) && ((has_fcap && __cap_gained(permitted, new, old)) || __cap_gained(ambient, new, old)))) ret = true; return ret; } /** * cap_bprm_creds_from_file - Set up the proposed credentials for execve(). * @bprm: The execution parameters, including the proposed creds * @file: The file to pull the credentials from * * Set up the proposed credentials for a new execution context being * constructed by execve(). The proposed creds in @bprm->cred is altered, * which won't take effect immediately. * * Return: 0 if successful, -ve on error. */ int cap_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file) { /* Process setpcap binaries and capabilities for uid 0 */ const struct cred *old = current_cred(); struct cred *new = bprm->cred; bool effective = false, has_fcap = false, id_changed; int ret; kuid_t root_uid; if (WARN_ON(!cap_ambient_invariant_ok(old))) return -EPERM; ret = get_file_caps(bprm, file, &effective, &has_fcap); if (ret < 0) return ret; root_uid = make_kuid(new->user_ns, 0); handle_privileged_root(bprm, has_fcap, &effective, root_uid); /* if we have fs caps, clear dangerous personality flags */ if (__cap_gained(permitted, new, old)) bprm->per_clear |= PER_CLEAR_ON_SETID; /* Don't let someone trace a set[ug]id/setpcap binary with the revised * credentials unless they have the appropriate permit. * * In addition, if NO_NEW_PRIVS, then ensure we get no new privs. */ id_changed = !uid_eq(new->euid, old->euid) || !in_group_p(new->egid); if ((id_changed || __cap_gained(permitted, new, old)) && ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) || !ptracer_capable(current, new->user_ns))) { /* downgrade; they get no more than they had, and maybe less */ if (!ns_capable(new->user_ns, CAP_SETUID) || (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) { new->euid = new->uid; new->egid = new->gid; } new->cap_permitted = cap_intersect(new->cap_permitted, old->cap_permitted); } new->suid = new->fsuid = new->euid; new->sgid = new->fsgid = new->egid; /* File caps or setid cancels ambient. */ if (has_fcap || id_changed) cap_clear(new->cap_ambient); /* * Now that we've computed pA', update pP' to give: * pP' = (X & fP) | (pI & fI) | pA' */ new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient); /* * Set pE' = (fE ? pP' : pA'). Because pA' is zero if fE is set, * this is the same as pE' = (fE ? pP' : 0) | pA'. */ if (effective) new->cap_effective = new->cap_permitted; else new->cap_effective = new->cap_ambient; if (WARN_ON(!cap_ambient_invariant_ok(new))) return -EPERM; if (nonroot_raised_pE(new, old, root_uid, has_fcap)) { ret = audit_log_bprm_fcaps(bprm, new, old); if (ret < 0) return ret; } new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); if (WARN_ON(!cap_ambient_invariant_ok(new))) return -EPERM; /* Check for privilege-elevated exec. 
*/ if (id_changed || !uid_eq(new->euid, old->uid) || !gid_eq(new->egid, old->gid) || (!__is_real(root_uid, new) && (effective || __cap_grew(permitted, ambient, new)))) bprm->secureexec = 1; return 0; } /** * cap_inode_setxattr - Determine whether an xattr may be altered * @dentry: The inode/dentry being altered * @name: The name of the xattr to be changed * @value: The value that the xattr will be changed to * @size: The size of value * @flags: The replacement flag * * Determine whether an xattr may be altered or set on an inode, returning 0 if * permission is granted, -ve if denied. * * This is used to make sure security xattrs don't get updated or set by those * who aren't privileged to do so. */ int cap_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct user_namespace *user_ns = dentry->d_sb->s_user_ns; /* Ignore non-security xattrs */ if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) != 0) return 0; /* * For XATTR_NAME_CAPS the check will be done in * cap_convert_nscap(), called by setxattr() */ if (strcmp(name, XATTR_NAME_CAPS) == 0) return 0; if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; return 0; } /** * cap_inode_removexattr - Determine whether an xattr may be removed * * @idmap: idmap of the mount the inode was found from * @dentry: The inode/dentry being altered * @name: The name of the xattr to be changed * * Determine whether an xattr may be removed from an inode, returning 0 if * permission is granted, -ve if denied. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. * * This is used to make sure security xattrs don't get removed by those who * aren't privileged to remove them. */ int cap_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct user_namespace *user_ns = dentry->d_sb->s_user_ns; /* Ignore non-security xattrs */ if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) != 0) return 0; if (strcmp(name, XATTR_NAME_CAPS) == 0) { /* security.capability gets namespaced */ struct inode *inode = d_backing_inode(dentry); if (!inode) return -EINVAL; if (!capable_wrt_inode_uidgid(idmap, inode, CAP_SETFCAP)) return -EPERM; return 0; } if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; return 0; } /* * cap_emulate_setxuid() fixes the effective / permitted capabilities of * a process after a call to setuid, setreuid, or setresuid. * * 1) When set*uiding _from_ one of {r,e,s}uid == 0 _to_ all of * {r,e,s}uid != 0, the permitted and effective capabilities are * cleared. * * 2) When set*uiding _from_ euid == 0 _to_ euid != 0, the effective * capabilities of the process are cleared. * * 3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective * capabilities are set to the permitted capabilities. * * fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should * never happen. * * -astor * * cevans - New behaviour, Oct '99 * A process may, via prctl(), elect to keep its capabilities when it * calls setuid() and switches away from uid==0. Both permitted and * effective sets will be retained. * Without this change, it was impossible for a daemon to drop only some * of its privilege. The call to setuid(!=0) would drop all privileges! 
* Keeping uid 0 is not an option because uid 0 owns too many vital * files.. * Thanks to Olaf Kirch and Peter Benie for spotting this. */ static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old) { kuid_t root_uid = make_kuid(old->user_ns, 0); if ((uid_eq(old->uid, root_uid) || uid_eq(old->euid, root_uid) || uid_eq(old->suid, root_uid)) && (!uid_eq(new->uid, root_uid) && !uid_eq(new->euid, root_uid) && !uid_eq(new->suid, root_uid))) { if (!issecure(SECURE_KEEP_CAPS)) { cap_clear(new->cap_permitted); cap_clear(new->cap_effective); } /* * Pre-ambient programs expect setresuid to nonroot followed * by exec to drop capabilities. We should make sure that * this remains the case. */ cap_clear(new->cap_ambient); } if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid)) cap_clear(new->cap_effective); if (!uid_eq(old->euid, root_uid) && uid_eq(new->euid, root_uid)) new->cap_effective = new->cap_permitted; } /** * cap_task_fix_setuid - Fix up the results of setuid() call * @new: The proposed credentials * @old: The current task's current credentials * @flags: Indications of what has changed * * Fix up the results of setuid() call before the credential changes are * actually applied. * * Return: 0 to grant the changes, -ve to deny them. */ int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { switch (flags) { case LSM_SETID_RE: case LSM_SETID_ID: case LSM_SETID_RES: /* juggle the capabilities to follow [RES]UID changes unless * otherwise suppressed */ if (!issecure(SECURE_NO_SETUID_FIXUP)) cap_emulate_setxuid(new, old); break; case LSM_SETID_FS: /* juggle the capabilities to follow FSUID changes, unless * otherwise suppressed * * FIXME - is fsuser used for all CAP_FS_MASK capabilities? * if not, we might be a bit too harsh here. */ if (!issecure(SECURE_NO_SETUID_FIXUP)) { kuid_t root_uid = make_kuid(old->user_ns, 0); if (uid_eq(old->fsuid, root_uid) && !uid_eq(new->fsuid, root_uid)) new->cap_effective = cap_drop_fs_set(new->cap_effective); if (!uid_eq(old->fsuid, root_uid) && uid_eq(new->fsuid, root_uid)) new->cap_effective = cap_raise_fs_set(new->cap_effective, new->cap_permitted); } break; default: return -EINVAL; } return 0; } /* * Rationale: code calling task_setscheduler, task_setioprio, and * task_setnice, assumes that * . if capable(cap_sys_nice), then those actions should be allowed * . if not capable(cap_sys_nice), but acting on your own processes, * then those actions should be allowed * This is insufficient now since you can call code without suid, but * yet with increased caps. * So we check for increased caps on the target process. */ static int cap_safe_nice(struct task_struct *p) { int is_subset, ret = 0; rcu_read_lock(); is_subset = cap_issubset(__task_cred(p)->cap_permitted, current_cred()->cap_permitted); if (!is_subset && !ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) ret = -EPERM; rcu_read_unlock(); return ret; } /** * cap_task_setscheduler - Determine if scheduler policy change is permitted * @p: The task to affect * * Determine if the requested scheduler policy change is permitted for the * specified task. * * Return: 0 if permission is granted, -ve if denied. */ int cap_task_setscheduler(struct task_struct *p) { return cap_safe_nice(p); } /** * cap_task_setioprio - Determine if I/O priority change is permitted * @p: The task to affect * @ioprio: The I/O priority to set * * Determine if the requested I/O priority change is permitted for the specified * task. * * Return: 0 if permission is granted, -ve if denied. 
*/ int cap_task_setioprio(struct task_struct *p, int ioprio) { return cap_safe_nice(p); } /** * cap_task_setnice - Determine if task priority change is permitted * @p: The task to affect * @nice: The nice value to set * * Determine if the requested task priority change is permitted for the * specified task. * * Return: 0 if permission is granted, -ve if denied. */ int cap_task_setnice(struct task_struct *p, int nice) { return cap_safe_nice(p); } /* * Implement PR_CAPBSET_DROP. Attempt to remove the specified capability from * the current task's bounding set. Returns 0 on success, -ve on error. */ static int cap_prctl_drop(unsigned long cap) { struct cred *new; if (!ns_capable(current_user_ns(), CAP_SETPCAP)) return -EPERM; if (!cap_valid(cap)) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; cap_lower(new->cap_bset, cap); return commit_creds(new); } /** * cap_task_prctl - Implement process control functions for this security module * @option: The process control function requested * @arg2: The argument data for this function * @arg3: The argument data for this function * @arg4: The argument data for this function * @arg5: The argument data for this function * * Allow process control functions (sys_prctl()) to alter capabilities; may * also deny access to other functions not otherwise implemented here. * * Return: 0 or +ve on success, -ENOSYS if this function is not implemented * here, other -ve on error. If -ENOSYS is returned, sys_prctl() and other LSM * modules will consider performing the function. */ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { const struct cred *old = current_cred(); struct cred *new; switch (option) { case PR_CAPBSET_READ: if (!cap_valid(arg2)) return -EINVAL; return !!cap_raised(old->cap_bset, arg2); case PR_CAPBSET_DROP: return cap_prctl_drop(arg2); /* * The next four prctl's remain to assist with transitioning a * system from legacy UID=0 based privilege (when filesystem * capabilities are not in use) to a system using filesystem * capabilities only - as the POSIX.1e draft intended. * * Note: * * PR_SET_SECUREBITS = * issecure_mask(SECURE_KEEP_CAPS_LOCKED) * | issecure_mask(SECURE_NOROOT) * | issecure_mask(SECURE_NOROOT_LOCKED) * | issecure_mask(SECURE_NO_SETUID_FIXUP) * | issecure_mask(SECURE_NO_SETUID_FIXUP_LOCKED) * * will ensure that the current process and all of its * children will be locked into a pure * capability-based-privilege environment. */ case PR_SET_SECUREBITS: if ((((old->securebits & SECURE_ALL_LOCKS) >> 1) & (old->securebits ^ arg2)) /*[1]*/ || ((old->securebits & SECURE_ALL_LOCKS & ~arg2)) /*[2]*/ || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS)) /*[3]*/ /* * [1] no changing of bits that are locked * [2] no unlocking of locks * [3] no setting of unsupported bits */ ) /* cannot change a locked bit */ return -EPERM; /* * Doing anything requires privilege (go read about the * "sendmail capabilities bug"), except for unprivileged bits. * Indeed, the SECURE_ALL_UNPRIVILEGED bits are not * restrictions enforced by the kernel but by user space on * itself. */ if (cap_capable(current_cred(), current_cred()->user_ns, CAP_SETPCAP, CAP_OPT_NONE) != 0) { const unsigned long unpriv_and_locks = SECURE_ALL_UNPRIVILEGED | SECURE_ALL_UNPRIVILEGED << 1; const unsigned long changed = old->securebits ^ arg2; /* For legacy reason, denies non-change. */ if (!changed) return -EPERM; /* Denies privileged changes. 
*/ if (changed & ~unpriv_and_locks) return -EPERM; } new = prepare_creds(); if (!new) return -ENOMEM; new->securebits = arg2; return commit_creds(new); case PR_GET_SECUREBITS: return old->securebits; case PR_GET_KEEPCAPS: return !!issecure(SECURE_KEEP_CAPS); case PR_SET_KEEPCAPS: if (arg2 > 1) /* Note, we rely on arg2 being unsigned here */ return -EINVAL; if (issecure(SECURE_KEEP_CAPS_LOCKED)) return -EPERM; new = prepare_creds(); if (!new) return -ENOMEM; if (arg2) new->securebits |= issecure_mask(SECURE_KEEP_CAPS); else new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); return commit_creds(new); case PR_CAP_AMBIENT: if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) { if (arg3 | arg4 | arg5) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; cap_clear(new->cap_ambient); return commit_creds(new); } if (((!cap_valid(arg3)) | arg4 | arg5)) return -EINVAL; if (arg2 == PR_CAP_AMBIENT_IS_SET) { return !!cap_raised(current_cred()->cap_ambient, arg3); } else if (arg2 != PR_CAP_AMBIENT_RAISE && arg2 != PR_CAP_AMBIENT_LOWER) { return -EINVAL; } else { if (arg2 == PR_CAP_AMBIENT_RAISE && (!cap_raised(current_cred()->cap_permitted, arg3) || !cap_raised(current_cred()->cap_inheritable, arg3) || issecure(SECURE_NO_CAP_AMBIENT_RAISE))) return -EPERM; new = prepare_creds(); if (!new) return -ENOMEM; if (arg2 == PR_CAP_AMBIENT_RAISE) cap_raise(new->cap_ambient, arg3); else cap_lower(new->cap_ambient, arg3); return commit_creds(new); } default: /* No functionality available - continue with default */ return -ENOSYS; } } /** * cap_vm_enough_memory - Determine whether a new virtual mapping is permitted * @mm: The VM space in which the new mapping is to be made * @pages: The size of the mapping * * Determine whether the allocation of a new virtual mapping by the current * task is permitted. * * Return: 0 if permission granted, negative error code if not. */ int cap_vm_enough_memory(struct mm_struct *mm, long pages) { return cap_capable(current_cred(), &init_user_ns, CAP_SYS_ADMIN, CAP_OPT_NOAUDIT); } /** * cap_mmap_addr - check if able to map given addr * @addr: address attempting to be mapped * * If the process is attempting to map memory below dac_mmap_min_addr they need * CAP_SYS_RAWIO. The other parameters to this function are unused by the * capability security module. * * Return: 0 if this mapping should be allowed or -EPERM if not. 
*/ int cap_mmap_addr(unsigned long addr) { int ret = 0; if (addr < dac_mmap_min_addr) { ret = cap_capable(current_cred(), &init_user_ns, CAP_SYS_RAWIO, CAP_OPT_NONE); /* set PF_SUPERPRIV if it turns out we allow the low mmap */ if (ret == 0) current->flags |= PF_SUPERPRIV; } return ret; } #ifdef CONFIG_SECURITY static const struct lsm_id capability_lsmid = { .name = "capability", .id = LSM_ID_CAPABILITY, }; static struct security_hook_list capability_hooks[] __ro_after_init = { LSM_HOOK_INIT(capable, cap_capable), LSM_HOOK_INIT(settime, cap_settime), LSM_HOOK_INIT(ptrace_access_check, cap_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, cap_ptrace_traceme), LSM_HOOK_INIT(capget, cap_capget), LSM_HOOK_INIT(capset, cap_capset), LSM_HOOK_INIT(bprm_creds_from_file, cap_bprm_creds_from_file), LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv), LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv), LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity), LSM_HOOK_INIT(mmap_addr, cap_mmap_addr), LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid), LSM_HOOK_INIT(task_prctl, cap_task_prctl), LSM_HOOK_INIT(task_setscheduler, cap_task_setscheduler), LSM_HOOK_INIT(task_setioprio, cap_task_setioprio), LSM_HOOK_INIT(task_setnice, cap_task_setnice), LSM_HOOK_INIT(vm_enough_memory, cap_vm_enough_memory), }; static int __init capability_init(void) { security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks), &capability_lsmid); return 0; } DEFINE_LSM(capability) = { .id = &capability_lsmid, .order = LSM_ORDER_FIRST, .init = capability_init, }; #endif /* CONFIG_SECURITY */ #ifdef CONFIG_SECURITY_COMMONCAP_KUNIT_TEST #include "commoncap_test.c" #endif |
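/*
 * Illustrative userspace sketch (not kernel code): a tiny model of the
 * exec-time capability computation that cap_bprm_creds_from_file() above
 * performs, using plain 64-bit masks in place of kernel_cap_t.  All names
 * below are hypothetical and exist only for this example; the "has_fcap"
 * test is simplified (the kernel keys it off the xattr revision mask).
 *
 *   pP' = (X & fP) | (pI & fI) | pA'
 *   pE' = fE ? pP' : pA'
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct caps {
	uint64_t permitted;   /* pP */
	uint64_t inheritable; /* pI */
	uint64_t effective;   /* pE */
	uint64_t ambient;     /* pA */
	uint64_t bset;        /* X  */
};

struct file_caps {
	uint64_t permitted;   /* fP */
	uint64_t inheritable; /* fI */
	bool effective;       /* fE */
};

static void model_exec(struct caps *p, const struct file_caps *f, bool id_changed)
{
	bool has_fcap = f->permitted || f->inheritable || f->effective;

	/* File caps or a set[ug]id transition cancel the ambient set. */
	if (has_fcap || id_changed)
		p->ambient = 0;

	/* pP' = (X & fP) | (pI & fI) | pA' */
	p->permitted = (p->bset & f->permitted) |
		       (p->inheritable & f->inheritable) |
		       p->ambient;

	/* pE' = fE ? pP' : pA' */
	p->effective = f->effective ? p->permitted : p->ambient;
}

int main(void)
{
	/* Bounding set allows bits 0-3; the file grants fP = bits 1|2 with fE set. */
	struct caps p = { .inheritable = 0x1, .ambient = 0x1, .bset = 0xf };
	struct file_caps f = { .permitted = 0x6, .inheritable = 0x1, .effective = true };

	model_exec(&p, &f, false);
	printf("pP'=%#llx pE'=%#llx pA'=%#llx\n",
	       (unsigned long long)p.permitted,
	       (unsigned long long)p.effective,
	       (unsigned long long)p.ambient);
	return 0;	/* prints pP'=0x7 pE'=0x7 pA'=0 */
}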
// SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/audit.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/slab.h> /** * tomoyo_print_bprm - Print "struct linux_binprm" for auditing. * * @bprm: Pointer to "struct linux_binprm". * @dump: Pointer to "struct tomoyo_page_dump". * * Returns the contents of @bprm on success, NULL otherwise. * * This function uses kzalloc(), so caller must kfree() if this function * didn't return NULL. */ static char *tomoyo_print_bprm(struct linux_binprm *bprm, struct tomoyo_page_dump *dump) { static const int tomoyo_buffer_len = 4096 * 2; char *buffer = kzalloc(tomoyo_buffer_len, GFP_NOFS); char *cp; char *last_start; int len; unsigned long pos = bprm->p; int offset = pos % PAGE_SIZE; int argv_count = bprm->argc; int envp_count = bprm->envc; bool truncated = false; if (!buffer) return NULL; len = snprintf(buffer, tomoyo_buffer_len - 1, "argv[]={ "); cp = buffer + len; if (!argv_count) { memmove(cp, "} envp[]={ ", 11); cp += 11; } last_start = cp; while (argv_count || envp_count) { if (!tomoyo_dump_page(bprm, pos, dump)) goto out; pos += PAGE_SIZE - offset; /* Read. */ while (offset < PAGE_SIZE) { const char *kaddr = dump->data; const unsigned char c = kaddr[offset++]; if (cp == last_start) *cp++ = '"'; if (cp >= buffer + tomoyo_buffer_len - 32) { /* Reserve some room for "..." string.
*/ truncated = true; } else if (c == '\\') { *cp++ = '\\'; *cp++ = '\\'; } else if (c > ' ' && c < 127) { *cp++ = c; } else if (!c) { *cp++ = '"'; *cp++ = ' '; last_start = cp; } else { *cp++ = '\\'; *cp++ = (c >> 6) + '0'; *cp++ = ((c >> 3) & 7) + '0'; *cp++ = (c & 7) + '0'; } if (c) continue; if (argv_count) { if (--argv_count == 0) { if (truncated) { cp = last_start; memmove(cp, "... ", 4); cp += 4; } memmove(cp, "} envp[]={ ", 11); cp += 11; last_start = cp; truncated = false; } } else if (envp_count) { if (--envp_count == 0) { if (truncated) { cp = last_start; memmove(cp, "... ", 4); cp += 4; } } } if (!argv_count && !envp_count) break; } offset = 0; } *cp++ = '}'; *cp = '\0'; return buffer; out: snprintf(buffer, tomoyo_buffer_len - 1, "argv[]={ ... } envp[]= { ... }"); return buffer; } /** * tomoyo_filetype - Get string representation of file type. * * @mode: Mode value for stat(). * * Returns file type string. */ static inline const char *tomoyo_filetype(const umode_t mode) { switch (mode & S_IFMT) { case S_IFREG: case 0: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_FILE]; case S_IFDIR: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_DIRECTORY]; case S_IFLNK: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_SYMLINK]; case S_IFIFO: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_FIFO]; case S_IFSOCK: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_SOCKET]; case S_IFBLK: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_BLOCK_DEV]; case S_IFCHR: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_CHAR_DEV]; } return "unknown"; /* This should not happen. */ } /** * tomoyo_print_header - Get header line of audit log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns string representation. * * This function uses kmalloc(), so caller must kfree() if this function * didn't return NULL. 
*/ static char *tomoyo_print_header(struct tomoyo_request_info *r) { struct tomoyo_time stamp; const pid_t gpid = task_pid_nr(current); struct tomoyo_obj_info *obj = r->obj; static const int tomoyo_buffer_len = 4096; char *buffer = kmalloc(tomoyo_buffer_len, GFP_NOFS); int pos; u8 i; if (!buffer) return NULL; tomoyo_convert_time(ktime_get_real_seconds(), &stamp); pos = snprintf(buffer, tomoyo_buffer_len - 1, "#%04u/%02u/%02u %02u:%02u:%02u# profile=%u mode=%s granted=%s (global-pid=%u) task={ pid=%u ppid=%u uid=%u gid=%u euid=%u egid=%u suid=%u sgid=%u fsuid=%u fsgid=%u }", stamp.year, stamp.month, stamp.day, stamp.hour, stamp.min, stamp.sec, r->profile, tomoyo_mode[r->mode], str_yes_no(r->granted), gpid, tomoyo_sys_getpid(), tomoyo_sys_getppid(), from_kuid(&init_user_ns, current_uid()), from_kgid(&init_user_ns, current_gid()), from_kuid(&init_user_ns, current_euid()), from_kgid(&init_user_ns, current_egid()), from_kuid(&init_user_ns, current_suid()), from_kgid(&init_user_ns, current_sgid()), from_kuid(&init_user_ns, current_fsuid()), from_kgid(&init_user_ns, current_fsgid())); if (!obj) goto no_obj_info; if (!obj->validate_done) { tomoyo_get_attributes(obj); obj->validate_done = true; } for (i = 0; i < TOMOYO_MAX_PATH_STAT; i++) { struct tomoyo_mini_stat *stat; unsigned int dev; umode_t mode; if (!obj->stat_valid[i]) continue; stat = &obj->stat[i]; dev = stat->dev; mode = stat->mode; if (i & 1) { pos += snprintf(buffer + pos, tomoyo_buffer_len - 1 - pos, " path%u.parent={ uid=%u gid=%u ino=%lu perm=0%o }", (i >> 1) + 1, from_kuid(&init_user_ns, stat->uid), from_kgid(&init_user_ns, stat->gid), (unsigned long)stat->ino, stat->mode & S_IALLUGO); continue; } pos += snprintf(buffer + pos, tomoyo_buffer_len - 1 - pos, " path%u={ uid=%u gid=%u ino=%lu major=%u minor=%u perm=0%o type=%s", (i >> 1) + 1, from_kuid(&init_user_ns, stat->uid), from_kgid(&init_user_ns, stat->gid), (unsigned long)stat->ino, MAJOR(dev), MINOR(dev), mode & S_IALLUGO, tomoyo_filetype(mode)); if (S_ISCHR(mode) || S_ISBLK(mode)) { dev = stat->rdev; pos += snprintf(buffer + pos, tomoyo_buffer_len - 1 - pos, " dev_major=%u dev_minor=%u", MAJOR(dev), MINOR(dev)); } pos += snprintf(buffer + pos, tomoyo_buffer_len - 1 - pos, " }"); } no_obj_info: if (pos < tomoyo_buffer_len - 1) return buffer; kfree(buffer); return NULL; } /** * tomoyo_init_log - Allocate buffer for audit logs. * * @r: Pointer to "struct tomoyo_request_info". * @len: Buffer size needed for @fmt and @args. * @fmt: The printf()'s format string. * @args: va_list structure for @fmt. * * Returns pointer to allocated memory. * * This function uses kzalloc(), so caller must kfree() if this function * didn't return NULL. */ char *tomoyo_init_log(struct tomoyo_request_info *r, int len, const char *fmt, va_list args) { char *buf = NULL; char *bprm_info = NULL; const char *header = NULL; char *realpath = NULL; const char *symlink = NULL; int pos; const char *domainname = r->domain->domainname->name; header = tomoyo_print_header(r); if (!header) return NULL; /* +10 is for '\n' etc. and '\0'. 
*/ len += strlen(domainname) + strlen(header) + 10; if (r->ee) { struct file *file = r->ee->bprm->file; realpath = tomoyo_realpath_from_path(&file->f_path); bprm_info = tomoyo_print_bprm(r->ee->bprm, &r->ee->dump); if (!realpath || !bprm_info) goto out; /* +80 is for " exec={ realpath=\"%s\" argc=%d envc=%d %s }" */ len += strlen(realpath) + 80 + strlen(bprm_info); } else if (r->obj && r->obj->symlink_target) { symlink = r->obj->symlink_target->name; /* +18 is for " symlink.target=\"%s\"" */ len += 18 + strlen(symlink); } len = kmalloc_size_roundup(len); buf = kzalloc(len, GFP_NOFS); if (!buf) goto out; len--; pos = snprintf(buf, len, "%s", header); if (realpath) { struct linux_binprm *bprm = r->ee->bprm; pos += snprintf(buf + pos, len - pos, " exec={ realpath=\"%s\" argc=%d envc=%d %s }", realpath, bprm->argc, bprm->envc, bprm_info); } else if (symlink) pos += snprintf(buf + pos, len - pos, " symlink.target=\"%s\"", symlink); pos += snprintf(buf + pos, len - pos, "\n%s\n", domainname); vsnprintf(buf + pos, len - pos, fmt, args); out: kfree(realpath); kfree(bprm_info); kfree(header); return buf; } /* Wait queue for /sys/kernel/security/tomoyo/audit. */ static DECLARE_WAIT_QUEUE_HEAD(tomoyo_log_wait); /* Structure for audit log. */ struct tomoyo_log { struct list_head list; char *log; int size; }; /* The list for "struct tomoyo_log". */ static LIST_HEAD(tomoyo_log); /* Lock for "struct list_head tomoyo_log". */ static DEFINE_SPINLOCK(tomoyo_log_lock); /* Length of "struct list_head tomoyo_log". */ static unsigned int tomoyo_log_count; /** * tomoyo_get_audit - Get audit mode. * * @ns: Pointer to "struct tomoyo_policy_namespace". * @profile: Profile number. * @index: Index number of functionality. * @matched_acl: Pointer to "struct tomoyo_acl_info". * @is_granted: True if granted log, false otherwise. * * Returns true if this request should be audited, false otherwise. */ static bool tomoyo_get_audit(const struct tomoyo_policy_namespace *ns, const u8 profile, const u8 index, const struct tomoyo_acl_info *matched_acl, const bool is_granted) { u8 mode; const u8 category = tomoyo_index2category[index] + TOMOYO_MAX_MAC_INDEX; struct tomoyo_profile *p; if (!tomoyo_policy_loaded) return false; p = tomoyo_profile(ns, profile); if (tomoyo_log_count >= p->pref[TOMOYO_PREF_MAX_AUDIT_LOG]) return false; if (is_granted && matched_acl && matched_acl->cond && matched_acl->cond->grant_log != TOMOYO_GRANTLOG_AUTO) return matched_acl->cond->grant_log == TOMOYO_GRANTLOG_YES; mode = p->config[index]; if (mode == TOMOYO_CONFIG_USE_DEFAULT) mode = p->config[category]; if (mode == TOMOYO_CONFIG_USE_DEFAULT) mode = p->default_config; if (is_granted) return mode & TOMOYO_CONFIG_WANT_GRANT_LOG; return mode & TOMOYO_CONFIG_WANT_REJECT_LOG; } /** * tomoyo_write_log2 - Write an audit log. * * @r: Pointer to "struct tomoyo_request_info". * @len: Buffer size needed for @fmt and @args. * @fmt: The printf()'s format string. * @args: va_list structure for @fmt. * * Returns nothing. */ void tomoyo_write_log2(struct tomoyo_request_info *r, int len, const char *fmt, va_list args) { char *buf; struct tomoyo_log *entry; bool quota_exceeded = false; if (!tomoyo_get_audit(r->domain->ns, r->profile, r->type, r->matched_acl, r->granted)) goto out; buf = tomoyo_init_log(r, len, fmt, args); if (!buf) goto out; entry = kzalloc_obj(*entry, GFP_NOFS); if (!entry) { kfree(buf); goto out; } entry->log = buf; len = kmalloc_size_roundup(strlen(buf) + 1); /* * The entry->size is used for memory quota checks. 
* Don't go beyond strlen(entry->log). */ entry->size = len + kmalloc_size_roundup(sizeof(*entry)); spin_lock(&tomoyo_log_lock); if (tomoyo_memory_quota[TOMOYO_MEMORY_AUDIT] && tomoyo_memory_used[TOMOYO_MEMORY_AUDIT] + entry->size >= tomoyo_memory_quota[TOMOYO_MEMORY_AUDIT]) { quota_exceeded = true; } else { tomoyo_memory_used[TOMOYO_MEMORY_AUDIT] += entry->size; list_add_tail(&entry->list, &tomoyo_log); tomoyo_log_count++; } spin_unlock(&tomoyo_log_lock); if (quota_exceeded) { kfree(buf); kfree(entry); goto out; } wake_up(&tomoyo_log_wait); out: return; } /** * tomoyo_write_log - Write an audit log. * * @r: Pointer to "struct tomoyo_request_info". * @fmt: The printf()'s format string, followed by parameters. * * Returns nothing. */ void tomoyo_write_log(struct tomoyo_request_info *r, const char *fmt, ...) { va_list args; int len; va_start(args, fmt); len = vsnprintf(NULL, 0, fmt, args) + 1; va_end(args); va_start(args, fmt); tomoyo_write_log2(r, len, fmt, args); va_end(args); } /** * tomoyo_read_log - Read an audit log. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns nothing. */ void tomoyo_read_log(struct tomoyo_io_buffer *head) { struct tomoyo_log *ptr = NULL; if (head->r.w_pos) return; kfree(head->read_buf); head->read_buf = NULL; spin_lock(&tomoyo_log_lock); if (!list_empty(&tomoyo_log)) { ptr = list_entry(tomoyo_log.next, typeof(*ptr), list); list_del(&ptr->list); tomoyo_log_count--; tomoyo_memory_used[TOMOYO_MEMORY_AUDIT] -= ptr->size; } spin_unlock(&tomoyo_log_lock); if (ptr) { head->read_buf = ptr->log; head->r.w[head->r.w_pos++] = head->read_buf; kfree(ptr); } } /** * tomoyo_poll_log - Wait for an audit log. * * @file: Pointer to "struct file". * @wait: Pointer to "poll_table". Maybe NULL. * * Returns EPOLLIN | EPOLLRDNORM when ready to read an audit log. */ __poll_t tomoyo_poll_log(struct file *file, poll_table *wait) { if (tomoyo_log_count) return EPOLLIN | EPOLLRDNORM; poll_wait(file, &tomoyo_log_wait, wait); if (tomoyo_log_count) return EPOLLIN | EPOLLRDNORM; return 0; } |
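/*
 * Illustrative userspace sketch, not part of the kernel file above: the audit
 * queue wakes tomoyo_log_wait, tomoyo_poll_log() reports EPOLLIN | EPOLLRDNORM
 * once a record is queued, and reads of the securityfs file return queued
 * records via tomoyo_read_log().  A minimal consumer can therefore poll() and
 * read() in a loop.  Assumes securityfs is mounted at /sys/kernel/security.
 */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[8192];
	int fd = open("/sys/kernel/security/tomoyo/audit", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	for (;;) {
		struct pollfd pfd = { .fd = fd, .events = POLLIN };
		ssize_t len;

		if (poll(&pfd, 1, -1) < 0)	/* wait for a queued audit log */
			break;
		len = read(fd, buf, sizeof(buf) - 1);	/* fetch queued record(s) */
		if (len <= 0)
			continue;
		buf[len] = '\0';
		fputs(buf, stdout);
	}
	close(fd);
	return 0;
}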
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _IPV6_H #define _IPV6_H #include <uapi/linux/ipv6.h> #include <linux/cache.h> #define ipv6_optlen(p) (((p)->hdrlen+1) << 3) #define ipv6_authlen(p) (((p)->hdrlen+2) << 2) /* * This structure contains configuration options per IPv6 link. */ struct ipv6_devconf { /* RX & TX fastpath fields.
*/ __cacheline_group_begin(ipv6_devconf_read_txrx); __s32 disable_ipv6; __s32 hop_limit; __s32 mtu6; __s32 forwarding; __s32 force_forwarding; __s32 disable_policy; __s32 proxy_ndp; __cacheline_group_end(ipv6_devconf_read_txrx); __s32 accept_ra; __s32 accept_redirects; __s32 autoconf; __s32 dad_transmits; __s32 rtr_solicits; __s32 rtr_solicit_interval; __s32 rtr_solicit_max_interval; __s32 rtr_solicit_delay; __s32 force_mld_version; __s32 mldv1_unsolicited_report_interval; __s32 mldv2_unsolicited_report_interval; __s32 use_tempaddr; __s32 temp_valid_lft; __s32 temp_prefered_lft; __s32 regen_min_advance; __s32 regen_max_retry; __s32 max_desync_factor; __s32 max_addresses; __s32 accept_ra_defrtr; __u32 ra_defrtr_metric; __s32 accept_ra_min_hop_limit; __s32 accept_ra_min_lft; __s32 accept_ra_pinfo; __s32 ignore_routes_with_linkdown; #ifdef CONFIG_IPV6_ROUTER_PREF __s32 accept_ra_rtr_pref; __s32 rtr_probe_interval; #ifdef CONFIG_IPV6_ROUTE_INFO __s32 accept_ra_rt_info_min_plen; __s32 accept_ra_rt_info_max_plen; #endif #endif __s32 accept_source_route; __s32 accept_ra_from_local; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD __s32 optimistic_dad; __s32 use_optimistic; #endif #ifdef CONFIG_IPV6_MROUTE atomic_t mc_forwarding; #endif __s32 drop_unicast_in_l2_multicast; __s32 accept_dad; __s32 force_tllao; __s32 ndisc_notify; __s32 suppress_frag_ndisc; __s32 accept_ra_mtu; __s32 drop_unsolicited_na; __s32 accept_untracked_na; struct ipv6_stable_secret { bool initialized; struct in6_addr secret; } stable_secret; __s32 use_oif_addrs_only; __s32 keep_addr_on_down; __s32 seg6_enabled; #ifdef CONFIG_IPV6_SEG6_HMAC __s32 seg6_require_hmac; #endif __u32 enhanced_dad; __u32 addr_gen_mode; __s32 ndisc_tclass; __s32 rpl_seg_enabled; __u32 ioam6_id; __u32 ioam6_id_wide; __u8 ioam6_enabled; __u8 ndisc_evict_nocarrier; __u8 ra_honor_pio_life; __u8 ra_honor_pio_pflag; struct ctl_table_header *sysctl_header; }; struct ipv6_params { __s32 disable_ipv6; __s32 autoconf; }; extern struct ipv6_params ipv6_defaults; #include <linux/tcp.h> #include <linux/udp.h> #include <net/inet_sock.h> static inline struct ipv6hdr *ipv6_hdr(const struct sk_buff *skb) { return (struct ipv6hdr *)skb_network_header(skb); } static inline struct ipv6hdr *inner_ipv6_hdr(const struct sk_buff *skb) { return (struct ipv6hdr *)skb_inner_network_header(skb); } static inline struct ipv6hdr *ipipv6_hdr(const struct sk_buff *skb) { return (struct ipv6hdr *)skb_transport_header(skb); } static inline unsigned int ipv6_transport_len(const struct sk_buff *skb) { return ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr) - skb_network_header_len(skb); } static inline unsigned int ipv6_payload_len(const struct sk_buff *skb, const struct ipv6hdr *ip6) { u32 len = ntohs(ip6->payload_len); return (len || !skb_is_gso(skb) || !skb_is_gso_tcp(skb)) ? len : skb->len - skb_network_offset(skb) - sizeof(struct ipv6hdr); } static inline unsigned int skb_ipv6_payload_len(const struct sk_buff *skb) { return ipv6_payload_len(skb, ipv6_hdr(skb)); } #define IPV6_MAXPLEN 65535 static inline void ipv6_set_payload_len(struct ipv6hdr *ip6, unsigned int len) { ip6->payload_len = len <= IPV6_MAXPLEN ? htons(len) : 0; } /* This structure contains results of exthdrs parsing as offsets from skb->nh. 
*/ struct inet6_skb_parm { int iif; __be16 ra; __u16 dst0; __u16 srcrt; __u16 dst1; __u16 lastopt; __u16 nhoff; __u16 flags; #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) __u16 dsthao; #endif __u16 frag_max_size; __u16 srhoff; #define IP6SKB_XFRM_TRANSFORMED 1 #define IP6SKB_FORWARDED 2 #define IP6SKB_REROUTED 4 #define IP6SKB_ROUTERALERT 8 #define IP6SKB_FRAGMENTED 16 #define IP6SKB_HOPBYHOP 32 #define IP6SKB_L3SLAVE 64 #define IP6SKB_JUMBOGRAM 128 #define IP6SKB_SEG6 256 #define IP6SKB_MULTIPATH 1024 #define IP6SKB_MCROUTE 2048 }; #if defined(CONFIG_NET_L3_MASTER_DEV) static inline bool ipv6_l3mdev_skb(__u16 flags) { return flags & IP6SKB_L3SLAVE; } #else static inline bool ipv6_l3mdev_skb(__u16 flags) { return false; } #endif #define IP6CB(skb) ((struct inet6_skb_parm*)((skb)->cb)) #define IP6CBMTU(skb) ((struct ip6_mtuinfo *)((skb)->cb)) static inline int inet6_iif(const struct sk_buff *skb) { bool l3_slave = ipv6_l3mdev_skb(IP6CB(skb)->flags); return l3_slave ? skb->skb_iif : IP6CB(skb)->iif; } static inline bool inet6_is_jumbogram(const struct sk_buff *skb) { return !!(IP6CB(skb)->flags & IP6SKB_JUMBOGRAM); } /* can not be used in TCP layer after tcp_v6_fill_cb */ static inline int inet6_sdif(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (skb && ipv6_l3mdev_skb(IP6CB(skb)->flags)) return IP6CB(skb)->iif; #endif return 0; } struct tcp6_request_sock { struct tcp_request_sock tcp6rsk_tcp; }; struct ipv6_mc_socklist; struct ipv6_ac_socklist; struct ipv6_fl_socklist; /* struct ipv6_pinfo - ipv6 private area */ struct ipv6_pinfo { /* Used in tx path (inet6_csk_route_socket(), ip6_xmit()) */ struct in6_addr saddr; union { struct in6_addr daddr; struct in6_addr final; }; __be32 flow_label; u32 dst_cookie; struct ipv6_txoptions __rcu *opt; s16 hop_limit; u8 pmtudisc; u8 tclass; #ifdef CONFIG_IPV6_SUBTREES bool saddr_cache; #endif bool daddr_cache; u8 mcast_hops; u32 frag_size; int ucast_oif; int mcast_oif; /* pktoption flags */ union { struct { u16 srcrt:1, osrcrt:1, rxinfo:1, rxoinfo:1, rxhlim:1, rxohlim:1, hopopts:1, ohopopts:1, dstopts:1, odstopts:1, rxflow:1, rxtclass:1, rxpmtu:1, rxorigdstaddr:1, recvfragsize:1; /* 1 bits hole */ } bits; u16 all; } rxopt; /* sockopt flags */ u8 srcprefs; /* 001: prefer temporary address * 010: prefer public address * 100: prefer care-of address */ u8 min_hopcount; __be32 rcv_flowinfo; struct in6_pktinfo sticky_pktinfo; struct sk_buff *pktoptions; struct sk_buff *rxpmtu; struct ipv6_mc_socklist __rcu *ipv6_mc_list; struct ipv6_ac_socklist *ipv6_ac_list; }; /* We currently use available bits from inet_sk(sk)->inet_flags, * this could change in the future. */ #define inet6_test_bit(nr, sk) \ test_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) #define inet6_set_bit(nr, sk) \ set_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) #define inet6_clear_bit(nr, sk) \ clear_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) #define inet6_assign_bit(nr, sk, val) \ assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val) /* WARNING: don't change the layout of the members in {raw,udp,tcp}6_sock! 
*/ struct raw6_sock { /* inet_sock has to be the first member of raw6_sock */ struct inet_sock inet; __u32 checksum; /* perform checksum */ __u32 offset; /* checksum offset */ struct icmp6_filter filter; __u32 ip6mr_table; struct numa_drop_counters drop_counters; struct ipv6_pinfo inet6; }; struct udp6_sock { struct udp_sock udp; struct ipv6_pinfo inet6; }; struct tcp6_sock { struct tcp_sock tcp; struct ipv6_pinfo inet6; }; extern int inet6_sk_rebuild_header(struct sock *sk); struct tcp6_timewait_sock { struct tcp_timewait_sock tcp6tw_tcp; }; #if IS_ENABLED(CONFIG_IPV6) extern int disable_ipv6_mod; static inline bool ipv6_mod_enabled(void) { return disable_ipv6_mod == 0; } static inline struct ipv6_pinfo *inet6_sk(const struct sock *__sk) { return sk_fullsock(__sk) ? inet_sk(__sk)->pinet6 : NULL; } #define raw6_sk(ptr) container_of_const(ptr, struct raw6_sock, inet.sk) #define ipv6_only_sock(sk) (sk->sk_ipv6only) #define ipv6_sk_rxinfo(sk) ((sk)->sk_family == PF_INET6 && \ inet6_sk(sk)->rxopt.bits.rxinfo) static inline const struct in6_addr *inet6_rcv_saddr(const struct sock *sk) { if (sk->sk_family == AF_INET6) return &sk->sk_v6_rcv_saddr; return NULL; } static inline int inet_v6_ipv6only(const struct sock *sk) { /* ipv6only field is at same position for timewait and other sockets */ return ipv6_only_sock(sk); } #else #define ipv6_only_sock(sk) 0 #define ipv6_sk_rxinfo(sk) 0 static inline bool ipv6_mod_enabled(void) { return false; } static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk) { return NULL; } static inline struct raw6_sock *raw6_sk(const struct sock *sk) { return NULL; } #define inet6_rcv_saddr(__sk) NULL #define inet_v6_ipv6only(__sk) 0 #endif /* IS_ENABLED(CONFIG_IPV6) */ #endif /* _IPV6_H */ |
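/*
 * Illustrative sketch, not part of ipv6.h above: ipv6_optlen() and
 * ipv6_authlen() encode the RFC 8200 / RFC 4302 length rules.  Generic
 * extension headers store (length / 8) - 1 in hdrlen, so the byte length is
 * (hdrlen + 1) * 8; the Authentication Header stores (length / 4) - 2, so the
 * byte length is (hdrlen + 2) * 4.  Plain arithmetic, no kernel types needed.
 */
#include <stdio.h>

static unsigned int opt_len(unsigned int hdrlen)  { return (hdrlen + 1) << 3; }
static unsigned int auth_len(unsigned int hdrlen) { return (hdrlen + 2) << 2; }

int main(void)
{
	/* hdrlen 0 -> minimal 8-byte option header; AH hdrlen 4 -> 24 bytes (e.g. HMAC-SHA1-96) */
	printf("optlen(0)=%u optlen(1)=%u authlen(4)=%u\n",
	       opt_len(0), opt_len(1), auth_len(4));	/* prints 8, 16 and 24 */
	return 0;
}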
/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM pagemap #if !defined(_TRACE_PAGEMAP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_PAGEMAP_H #include <linux/tracepoint.h> #include <linux/mm.h> #define PAGEMAP_MAPPED 0x0001u #define PAGEMAP_ANONYMOUS 0x0002u #define PAGEMAP_FILE 0x0004u #define PAGEMAP_SWAPCACHE 0x0008u #define PAGEMAP_SWAPBACKED 0x0010u #define PAGEMAP_MAPPEDDISK 0x0020u #define PAGEMAP_BUFFERS 0x0040u #define trace_pagemap_flags(folio) ( \ (folio_test_anon(folio) ? PAGEMAP_ANONYMOUS : PAGEMAP_FILE) | \ (folio_mapped(folio) ? PAGEMAP_MAPPED : 0) | \ (folio_test_swapcache(folio) ? PAGEMAP_SWAPCACHE : 0) | \ (folio_test_swapbacked(folio) ? PAGEMAP_SWAPBACKED : 0) | \ (folio_test_mappedtodisk(folio) ? PAGEMAP_MAPPEDDISK : 0) | \ (folio_test_private(folio) ? PAGEMAP_BUFFERS : 0) \ ) TRACE_EVENT(mm_lru_insertion, TP_PROTO(struct folio *folio), TP_ARGS(folio), TP_STRUCT__entry( __field(struct folio *, folio ) __field(unsigned long, pfn ) __field(enum lru_list, lru ) __field(unsigned long, flags ) ), TP_fast_assign( __entry->folio = folio; __entry->pfn = folio_pfn(folio); __entry->lru = folio_lru_list(folio); __entry->flags = trace_pagemap_flags(folio); ), /* Flag format is based on page-types.c formatting for pagemap */ TP_printk("folio=%p pfn=0x%lx lru=%d flags=%s%s%s%s%s%s", __entry->folio, __entry->pfn, __entry->lru, __entry->flags & PAGEMAP_MAPPED ? "M" : " ", __entry->flags & PAGEMAP_ANONYMOUS ? "a" : "f", __entry->flags & PAGEMAP_SWAPCACHE ? "s" : " ", __entry->flags & PAGEMAP_SWAPBACKED ? "b" : " ", __entry->flags & PAGEMAP_MAPPEDDISK ? "d" : " ", __entry->flags & PAGEMAP_BUFFERS ? "B" : " ") ); TRACE_EVENT(mm_lru_activate, TP_PROTO(struct folio *folio), TP_ARGS(folio), TP_STRUCT__entry( __field(struct folio *, folio ) __field(unsigned long, pfn ) ), TP_fast_assign( __entry->folio = folio; __entry->pfn = folio_pfn(folio); ), TP_printk("folio=%p pfn=0x%lx", __entry->folio, __entry->pfn) ); #endif /* _TRACE_PAGEMAP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
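/*
 * Illustrative sketch, not part of the trace header above: the
 * mm_lru_insertion TP_printk() renders the flags with the single-letter
 * scheme of the page-types.c tool.  This stand-alone helper mirrors that
 * format string so a flags value captured from the trace can be decoded.
 */
#include <stdio.h>

#define PAGEMAP_MAPPED		0x0001u
#define PAGEMAP_ANONYMOUS	0x0002u
#define PAGEMAP_SWAPCACHE	0x0008u
#define PAGEMAP_SWAPBACKED	0x0010u
#define PAGEMAP_MAPPEDDISK	0x0020u
#define PAGEMAP_BUFFERS		0x0040u

static void decode(unsigned int flags)
{
	printf("%s%s%s%s%s%s\n",
	       flags & PAGEMAP_MAPPED     ? "M" : " ",
	       flags & PAGEMAP_ANONYMOUS  ? "a" : "f",
	       flags & PAGEMAP_SWAPCACHE  ? "s" : " ",
	       flags & PAGEMAP_SWAPBACKED ? "b" : " ",
	       flags & PAGEMAP_MAPPEDDISK ? "d" : " ",
	       flags & PAGEMAP_BUFFERS    ? "B" : " ");
}

int main(void)
{
	/* an anonymous, mapped, swap-backed folio prints "Ma b  " */
	decode(PAGEMAP_MAPPED | PAGEMAP_ANONYMOUS | PAGEMAP_SWAPBACKED);
	return 0;
}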
/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2018 Red Hat, Inc. * All rights reserved. */ #ifndef __LIBXFS_AG_H #define __LIBXFS_AG_H 1 #include "xfs_group.h" struct xfs_mount; struct xfs_trans; struct xfs_perag; /* * Per-ag infrastructure */ /* per-AG block reservation data structures*/ struct xfs_ag_resv { /* number of blocks originally reserved here */ xfs_extlen_t ar_orig_reserved; /* number of blocks reserved here */ xfs_extlen_t ar_reserved; /* number of blocks originally asked for */ xfs_extlen_t ar_asked; }; /* * Per-ag incore structure, copies of information in agf and agi, to improve the * performance of allocation group selection. */ struct xfs_perag { struct xfs_group pag_group; unsigned long pag_opstate; uint8_t pagf_bno_level; /* # of levels in bno btree */ uint8_t pagf_cnt_level; /* # of levels in cnt btree */ uint8_t pagf_rmap_level;/* # of levels in rmap btree */ uint32_t pagf_flcount; /* count of blocks in freelist */ xfs_extlen_t pagf_freeblks; /* total free blocks */ xfs_extlen_t pagf_longest; /* longest free space */ uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ xfs_agino_t pagi_freecount; /* number of free inodes */ xfs_agino_t pagi_count; /* number of allocated inodes */ /* * Inode allocation search lookup optimisation. * If the pagino matches, the search for new inodes * doesn't need to search the near ones again straight away */ xfs_agino_t pagl_pagino; xfs_agino_t pagl_leftrec; xfs_agino_t pagl_rightrec; uint8_t pagf_refcount_level; /* refcount btree height */ /* Blocks reserved for all kinds of metadata. */ struct xfs_ag_resv pag_meta_resv; /* Blocks reserved for the reverse mapping btree. */ struct xfs_ag_resv pag_rmapbt_resv; /* Precalculated geometry info */ xfs_agino_t agino_min; xfs_agino_t agino_max; #ifdef __KERNEL__ /* -- kernel only structures below this line -- */ #ifdef CONFIG_XFS_ONLINE_REPAIR /* * Alternate btree heights so that online repair won't trip the write * verifiers while rebuilding the AG btrees.
*/ uint8_t pagf_repair_bno_level; uint8_t pagf_repair_cnt_level; uint8_t pagf_repair_refcount_level; uint8_t pagf_repair_rmap_level; #endif atomic_t pagf_fstrms; /* # of filestreams active in this AG */ spinlock_t pag_ici_lock; /* incore inode cache lock */ struct radix_tree_root pag_ici_root; /* incore inode cache root */ int pag_ici_reclaimable; /* reclaimable inodes */ unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ /* background prealloc block trimming */ struct delayed_work pag_blockgc_work; #endif /* __KERNEL__ */ }; static inline struct xfs_perag *to_perag(struct xfs_group *xg) { return container_of(xg, struct xfs_perag, pag_group); } static inline struct xfs_group *pag_group(struct xfs_perag *pag) { return &pag->pag_group; } static inline struct xfs_mount *pag_mount(const struct xfs_perag *pag) { return pag->pag_group.xg_mount; } static inline xfs_agnumber_t pag_agno(const struct xfs_perag *pag) { return pag->pag_group.xg_gno; } /* * Per-AG operational state. These are atomic flag bits. */ #define XFS_AGSTATE_AGF_INIT 0 #define XFS_AGSTATE_AGI_INIT 1 #define XFS_AGSTATE_PREFERS_METADATA 2 #define XFS_AGSTATE_ALLOWS_INODES 3 #define XFS_AGSTATE_AGFL_NEEDS_RESET 4 #define __XFS_AG_OPSTATE(name, NAME) \ static inline bool xfs_perag_ ## name (struct xfs_perag *pag) \ { \ return test_bit(XFS_AGSTATE_ ## NAME, &pag->pag_opstate); \ } __XFS_AG_OPSTATE(initialised_agf, AGF_INIT) __XFS_AG_OPSTATE(initialised_agi, AGI_INIT) __XFS_AG_OPSTATE(prefers_metadata, PREFERS_METADATA) __XFS_AG_OPSTATE(allows_inodes, ALLOWS_INODES) __XFS_AG_OPSTATE(agfl_needs_reset, AGFL_NEEDS_RESET) int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t orig_agcount, xfs_agnumber_t new_agcount, xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi); void xfs_free_perag_range(struct xfs_mount *mp, xfs_agnumber_t first_agno, xfs_agnumber_t end_agno); int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno); int xfs_update_last_ag_size(struct xfs_mount *mp, xfs_agnumber_t prev_agcount); /* Passive AG references */ static inline struct xfs_perag * xfs_perag_get( struct xfs_mount *mp, xfs_agnumber_t agno) { return to_perag(xfs_group_get(mp, agno, XG_TYPE_AG)); } static inline struct xfs_perag * xfs_perag_hold( struct xfs_perag *pag) { return to_perag(xfs_group_hold(pag_group(pag))); } static inline void xfs_perag_put( struct xfs_perag *pag) { xfs_group_put(pag_group(pag)); } /* Active AG references */ static inline struct xfs_perag * xfs_perag_grab( struct xfs_mount *mp, xfs_agnumber_t agno) { return to_perag(xfs_group_grab(mp, agno, XG_TYPE_AG)); } static inline void xfs_perag_rele( struct xfs_perag *pag) { xfs_group_rele(pag_group(pag)); } static inline struct xfs_perag * xfs_perag_next_range( struct xfs_mount *mp, struct xfs_perag *pag, xfs_agnumber_t start_agno, xfs_agnumber_t end_agno) { return to_perag(xfs_group_next_range(mp, pag ? 
pag_group(pag) : NULL, start_agno, end_agno, XG_TYPE_AG)); } static inline struct xfs_perag * xfs_perag_next_from( struct xfs_mount *mp, struct xfs_perag *pag, xfs_agnumber_t start_agno) { return xfs_perag_next_range(mp, pag, start_agno, mp->m_sb.sb_agcount - 1); } static inline struct xfs_perag * xfs_perag_next( struct xfs_mount *mp, struct xfs_perag *pag) { return xfs_perag_next_from(mp, pag, 0); } /* * Per-ag geometry infomation and validation */ xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno); void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t *first, xfs_agino_t *last); static inline bool xfs_verify_agbno(struct xfs_perag *pag, xfs_agblock_t agbno) { return xfs_verify_gbno(pag_group(pag), agbno); } static inline bool xfs_verify_agbext( struct xfs_perag *pag, xfs_agblock_t agbno, xfs_agblock_t len) { return xfs_verify_gbext(pag_group(pag), agbno, len); } /* * Verify that an AG inode number pointer neither points outside the AG * nor points at static metadata. */ static inline bool xfs_verify_agino(struct xfs_perag *pag, xfs_agino_t agino) { if (agino < pag->agino_min) return false; if (agino > pag->agino_max) return false; return true; } /* * Verify that an AG inode number pointer neither points outside the AG * nor points at static metadata, or is NULLAGINO. */ static inline bool xfs_verify_agino_or_null(struct xfs_perag *pag, xfs_agino_t agino) { if (agino == NULLAGINO) return true; return xfs_verify_agino(pag, agino); } static inline bool xfs_ag_contains_log(struct xfs_mount *mp, xfs_agnumber_t agno) { return mp->m_sb.sb_logstart > 0 && agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart); } static inline struct xfs_perag * xfs_perag_next_wrap( struct xfs_perag *pag, xfs_agnumber_t *agno, xfs_agnumber_t stop_agno, xfs_agnumber_t restart_agno, xfs_agnumber_t wrap_agno) { struct xfs_mount *mp = pag_mount(pag); *agno = pag_agno(pag) + 1; xfs_perag_rele(pag); while (*agno != stop_agno) { if (*agno >= wrap_agno) { if (restart_agno >= stop_agno) break; *agno = restart_agno; } pag = xfs_perag_grab(mp, *agno); if (pag) return pag; (*agno)++; } return NULL; } /* * Iterate all AGs from start_agno through wrap_agno, then restart_agno through * (start_agno - 1). */ #define for_each_perag_wrap_range(mp, start_agno, restart_agno, wrap_agno, agno, pag) \ for ((agno) = (start_agno), (pag) = xfs_perag_grab((mp), (agno)); \ (pag) != NULL; \ (pag) = xfs_perag_next_wrap((pag), &(agno), (start_agno), \ (restart_agno), (wrap_agno))) /* * Iterate all AGs from start_agno through wrap_agno, then 0 through * (start_agno - 1). */ #define for_each_perag_wrap_at(mp, start_agno, wrap_agno, agno, pag) \ for_each_perag_wrap_range((mp), (start_agno), 0, (wrap_agno), (agno), (pag)) /* * Iterate all AGs from start_agno through to the end of the filesystem, then 0 * through (start_agno - 1). 
*/ #define for_each_perag_wrap(mp, start_agno, agno, pag) \ for_each_perag_wrap_at((mp), (start_agno), (mp)->m_sb.sb_agcount, \ (agno), (pag)) struct aghdr_init_data { /* per ag data */ xfs_agblock_t agno; /* ag to init */ xfs_extlen_t agsize; /* new AG size */ struct list_head buffer_list; /* buffer writeback list */ xfs_rfsblock_t nfree; /* cumulative new free space */ /* per header data */ xfs_daddr_t daddr; /* header location */ size_t numblks; /* size of header */ const struct xfs_btree_ops *bc_ops; /* btree ops */ }; int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id); int xfs_ag_shrink_space(struct xfs_perag *pag, struct xfs_trans **tpp, xfs_extlen_t delta); void xfs_growfs_compute_deltas(struct xfs_mount *mp, xfs_rfsblock_t nb, int64_t *deltap, xfs_agnumber_t *nagcountp); int xfs_ag_extend_space(struct xfs_perag *pag, struct xfs_trans *tp, xfs_extlen_t len); int xfs_ag_get_geometry(struct xfs_perag *pag, struct xfs_ag_geometry *ageo); static inline xfs_fsblock_t xfs_agbno_to_fsb( struct xfs_perag *pag, xfs_agblock_t agbno) { return XFS_AGB_TO_FSB(pag_mount(pag), pag_agno(pag), agbno); } static inline xfs_daddr_t xfs_agbno_to_daddr( struct xfs_perag *pag, xfs_agblock_t agbno) { return XFS_AGB_TO_DADDR(pag_mount(pag), pag_agno(pag), agbno); } static inline xfs_ino_t xfs_agino_to_ino( struct xfs_perag *pag, xfs_agino_t agino) { return XFS_AGINO_TO_INO(pag_mount(pag), pag_agno(pag), agino); } #endif /* __LIBXFS_AG_H */ |
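/*
 * Illustrative sketch, not part of the header above: the wrapping iterator is
 * meant for allocation-style searches that start at a preferred AG, run to the
 * last AG, wrap to AG 0 and stop just before the starting AG.  Each iteration
 * holds an active reference, so leaving the loop with the current pag keeps
 * that reference and the caller must later drop it with xfs_perag_rele().
 * ag_is_suitable() and find_candidate_ag() are hypothetical, kernel-context
 * only, and not real XFS helpers.
 */
static bool ag_is_suitable(const struct xfs_perag *pag)
{
	/* hypothetical policy: any AG with free space in the AGF counters */
	return pag->pagf_freeblks > 0;
}

static struct xfs_perag *
find_candidate_ag(struct xfs_mount *mp, xfs_agnumber_t start_agno)
{
	struct xfs_perag *pag;
	xfs_agnumber_t agno;

	for_each_perag_wrap(mp, start_agno, agno, pag) {
		if (ag_is_suitable(pag))
			return pag;	/* caller releases with xfs_perag_rele() */
	}
	return NULL;	/* loop exhausted: no reference held */
}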
// SPDX-License-Identifier: GPL-2.0-only /* * xfrm_state.c * * Changes: * Mitsuru KANDA @USAGI * Kazunori MIYAZAWA @USAGI * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * IPv6 support * YOSHIFUJI Hideaki @USAGI * Split up af-specific functions * Derek Atkins <derek@ihtfp.com> * Add UDP Encapsulation * */ #include <linux/compat.h> #include <linux/workqueue.h> #include <net/xfrm.h> #include <linux/pfkeyv2.h> #include <linux/ipsec.h> #include <linux/module.h> #include <linux/cache.h> #include <linux/audit.h> #include <linux/uaccess.h> #include <linux/ktime.h> #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/kernel.h> #include <crypto/aead.h> #include "xfrm_hash.h" #define xfrm_state_deref_prot(table, net) \ rcu_dereference_protected((table), lockdep_is_held(&(net)->xfrm.xfrm_state_lock)) #define xfrm_state_deref_check(table, net) \ rcu_dereference_check((table), lockdep_is_held(&(net)->xfrm.xfrm_state_lock)) static void xfrm_state_gc_task(struct work_struct *work); /* Each xfrm_state may be linked to two tables: 1. Hash table by (spi,daddr,ah/esp) to find SA by SPI. (input,ctl) 2. Hash table by (daddr,family,reqid) to find what SAs exist for given destination/tunnel endpoint.
(output) */ static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024; static struct kmem_cache *xfrm_state_cache __ro_after_init; static DECLARE_WORK(xfrm_state_gc_work, xfrm_state_gc_task); static HLIST_HEAD(xfrm_state_gc_list); static HLIST_HEAD(xfrm_state_dev_gc_list); static inline bool xfrm_state_hold_rcu(struct xfrm_state *x) { return refcount_inc_not_zero(&x->refcnt); } static inline unsigned int xfrm_dst_hash(struct net *net, const xfrm_address_t *daddr, const xfrm_address_t *saddr, u32 reqid, unsigned short family) { lockdep_assert_held(&net->xfrm.xfrm_state_lock); return __xfrm_dst_hash(daddr, saddr, reqid, family, net->xfrm.state_hmask); } static inline unsigned int xfrm_src_hash(struct net *net, const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family) { lockdep_assert_held(&net->xfrm.xfrm_state_lock); return __xfrm_src_hash(daddr, saddr, family, net->xfrm.state_hmask); } static inline unsigned int xfrm_spi_hash(struct net *net, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family) { lockdep_assert_held(&net->xfrm.xfrm_state_lock); return __xfrm_spi_hash(daddr, spi, proto, family, net->xfrm.state_hmask); } static unsigned int xfrm_seq_hash(struct net *net, u32 seq) { lockdep_assert_held(&net->xfrm.xfrm_state_lock); return __xfrm_seq_hash(seq, net->xfrm.state_hmask); } #define XFRM_STATE_INSERT(by, _n, _h, _type) \ { \ struct xfrm_state *_x = NULL; \ \ if (_type != XFRM_DEV_OFFLOAD_PACKET) { \ hlist_for_each_entry_rcu(_x, _h, by) { \ if (_x->xso.type == XFRM_DEV_OFFLOAD_PACKET) \ continue; \ break; \ } \ } \ \ if (!_x || _x->xso.type == XFRM_DEV_OFFLOAD_PACKET) \ /* SAD is empty or consist from HW SAs only */ \ hlist_add_head_rcu(_n, _h); \ else \ hlist_add_before_rcu(_n, &_x->by); \ } static void xfrm_hash_transfer(struct hlist_head *list, struct hlist_head *ndsttable, struct hlist_head *nsrctable, struct hlist_head *nspitable, struct hlist_head *nseqtable, unsigned int nhashmask) { struct hlist_node *tmp; struct xfrm_state *x; hlist_for_each_entry_safe(x, tmp, list, bydst) { unsigned int h; h = __xfrm_dst_hash(&x->id.daddr, &x->props.saddr, x->props.reqid, x->props.family, nhashmask); XFRM_STATE_INSERT(bydst, &x->bydst, ndsttable + h, x->xso.type); h = __xfrm_src_hash(&x->id.daddr, &x->props.saddr, x->props.family, nhashmask); XFRM_STATE_INSERT(bysrc, &x->bysrc, nsrctable + h, x->xso.type); if (x->id.spi) { h = __xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto, x->props.family, nhashmask); XFRM_STATE_INSERT(byspi, &x->byspi, nspitable + h, x->xso.type); } if (x->km.seq) { h = __xfrm_seq_hash(x->km.seq, nhashmask); XFRM_STATE_INSERT(byseq, &x->byseq, nseqtable + h, x->xso.type); } } } static unsigned long xfrm_hash_new_size(unsigned int state_hmask) { return ((state_hmask + 1) << 1) * sizeof(struct hlist_head); } static void xfrm_hash_resize(struct work_struct *work) { struct net *net = container_of(work, struct net, xfrm.state_hash_work); struct hlist_head *ndst, *nsrc, *nspi, *nseq, *odst, *osrc, *ospi, *oseq; unsigned long nsize, osize; unsigned int nhashmask, ohashmask; int i; nsize = xfrm_hash_new_size(net->xfrm.state_hmask); ndst = xfrm_hash_alloc(nsize); if (!ndst) return; nsrc = xfrm_hash_alloc(nsize); if (!nsrc) { xfrm_hash_free(ndst, nsize); return; } nspi = xfrm_hash_alloc(nsize); if (!nspi) { xfrm_hash_free(ndst, nsize); xfrm_hash_free(nsrc, nsize); return; } nseq = xfrm_hash_alloc(nsize); if (!nseq) { xfrm_hash_free(ndst, nsize); xfrm_hash_free(nsrc, nsize); xfrm_hash_free(nspi, nsize); return; } 
spin_lock_bh(&net->xfrm.xfrm_state_lock); write_seqcount_begin(&net->xfrm.xfrm_state_hash_generation); nhashmask = (nsize / sizeof(struct hlist_head)) - 1U; odst = xfrm_state_deref_prot(net->xfrm.state_bydst, net); for (i = net->xfrm.state_hmask; i >= 0; i--) xfrm_hash_transfer(odst + i, ndst, nsrc, nspi, nseq, nhashmask); osrc = xfrm_state_deref_prot(net->xfrm.state_bysrc, net); ospi = xfrm_state_deref_prot(net->xfrm.state_byspi, net); oseq = xfrm_state_deref_prot(net->xfrm.state_byseq, net); ohashmask = net->xfrm.state_hmask; rcu_assign_pointer(net->xfrm.state_bydst, ndst); rcu_assign_pointer(net->xfrm.state_bysrc, nsrc); rcu_assign_pointer(net->xfrm.state_byspi, nspi); rcu_assign_pointer(net->xfrm.state_byseq, nseq); net->xfrm.state_hmask = nhashmask; write_seqcount_end(&net->xfrm.xfrm_state_hash_generation); spin_unlock_bh(&net->xfrm.xfrm_state_lock); osize = (ohashmask + 1) * sizeof(struct hlist_head); synchronize_rcu(); xfrm_hash_free(odst, osize); xfrm_hash_free(osrc, osize); xfrm_hash_free(ospi, osize); xfrm_hash_free(oseq, osize); } static DEFINE_SPINLOCK(xfrm_state_afinfo_lock); static struct xfrm_state_afinfo __rcu *xfrm_state_afinfo[NPROTO]; static DEFINE_SPINLOCK(xfrm_state_gc_lock); static DEFINE_SPINLOCK(xfrm_state_dev_gc_lock); int __xfrm_state_delete(struct xfrm_state *x); int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol); static bool km_is_alive(const struct km_event *c); void km_state_expired(struct xfrm_state *x, int hard, u32 portid); int xfrm_register_type(const struct xfrm_type *type, unsigned short family) { struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); int err = 0; if (!afinfo) return -EAFNOSUPPORT; #define X(afi, T, name) do { \ WARN_ON((afi)->type_ ## name); \ (afi)->type_ ## name = (T); \ } while (0) switch (type->proto) { case IPPROTO_COMP: X(afinfo, type, comp); break; case IPPROTO_AH: X(afinfo, type, ah); break; case IPPROTO_ESP: X(afinfo, type, esp); break; case IPPROTO_IPIP: X(afinfo, type, ipip); break; case IPPROTO_DSTOPTS: X(afinfo, type, dstopts); break; case IPPROTO_ROUTING: X(afinfo, type, routing); break; case IPPROTO_IPV6: X(afinfo, type, ipip6); break; default: WARN_ON(1); err = -EPROTONOSUPPORT; break; } #undef X rcu_read_unlock(); return err; } EXPORT_SYMBOL(xfrm_register_type); void xfrm_unregister_type(const struct xfrm_type *type, unsigned short family) { struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); if (unlikely(afinfo == NULL)) return; #define X(afi, T, name) do { \ WARN_ON((afi)->type_ ## name != (T)); \ (afi)->type_ ## name = NULL; \ } while (0) switch (type->proto) { case IPPROTO_COMP: X(afinfo, type, comp); break; case IPPROTO_AH: X(afinfo, type, ah); break; case IPPROTO_ESP: X(afinfo, type, esp); break; case IPPROTO_IPIP: X(afinfo, type, ipip); break; case IPPROTO_DSTOPTS: X(afinfo, type, dstopts); break; case IPPROTO_ROUTING: X(afinfo, type, routing); break; case IPPROTO_IPV6: X(afinfo, type, ipip6); break; default: WARN_ON(1); break; } #undef X rcu_read_unlock(); } EXPORT_SYMBOL(xfrm_unregister_type); static const struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family) { const struct xfrm_type *type = NULL; struct xfrm_state_afinfo *afinfo; int modload_attempted = 0; retry: afinfo = xfrm_state_get_afinfo(family); if (unlikely(afinfo == NULL)) return NULL; switch (proto) { case IPPROTO_COMP: type = afinfo->type_comp; break; case IPPROTO_AH: type = afinfo->type_ah; break; case IPPROTO_ESP: type = afinfo->type_esp; break; case IPPROTO_IPIP: type = 
afinfo->type_ipip; break; case IPPROTO_DSTOPTS: type = afinfo->type_dstopts; break; case IPPROTO_ROUTING: type = afinfo->type_routing; break; case IPPROTO_IPV6: type = afinfo->type_ipip6; break; default: break; } if (unlikely(type && !try_module_get(type->owner))) type = NULL; rcu_read_unlock(); if (!type && !modload_attempted) { request_module("xfrm-type-%d-%d", family, proto); modload_attempted = 1; goto retry; } return type; } static void xfrm_put_type(const struct xfrm_type *type) { module_put(type->owner); } int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family) { struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); int err = 0; if (unlikely(afinfo == NULL)) return -EAFNOSUPPORT; switch (type->proto) { case IPPROTO_ESP: WARN_ON(afinfo->type_offload_esp); afinfo->type_offload_esp = type; break; default: WARN_ON(1); err = -EPROTONOSUPPORT; break; } rcu_read_unlock(); return err; } EXPORT_SYMBOL(xfrm_register_type_offload); void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family) { struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); if (unlikely(afinfo == NULL)) return; switch (type->proto) { case IPPROTO_ESP: WARN_ON(afinfo->type_offload_esp != type); afinfo->type_offload_esp = NULL; break; default: WARN_ON(1); break; } rcu_read_unlock(); } EXPORT_SYMBOL(xfrm_unregister_type_offload); void xfrm_set_type_offload(struct xfrm_state *x, bool try_load) { const struct xfrm_type_offload *type = NULL; struct xfrm_state_afinfo *afinfo; retry: afinfo = xfrm_state_get_afinfo(x->props.family); if (unlikely(afinfo == NULL)) goto out; switch (x->id.proto) { case IPPROTO_ESP: type = afinfo->type_offload_esp; break; default: break; } if ((type && !try_module_get(type->owner))) type = NULL; rcu_read_unlock(); if (!type && try_load) { request_module("xfrm-offload-%d-%d", x->props.family, x->id.proto); try_load = false; goto retry; } out: x->type_offload = type; } EXPORT_SYMBOL(xfrm_set_type_offload); static const struct xfrm_mode xfrm4_mode_map[XFRM_MODE_MAX] = { [XFRM_MODE_BEET] = { .encap = XFRM_MODE_BEET, .flags = XFRM_MODE_FLAG_TUNNEL, .family = AF_INET, }, [XFRM_MODE_TRANSPORT] = { .encap = XFRM_MODE_TRANSPORT, .family = AF_INET, }, [XFRM_MODE_TUNNEL] = { .encap = XFRM_MODE_TUNNEL, .flags = XFRM_MODE_FLAG_TUNNEL, .family = AF_INET, }, [XFRM_MODE_IPTFS] = { .encap = XFRM_MODE_IPTFS, .flags = XFRM_MODE_FLAG_TUNNEL, .family = AF_INET, }, }; static const struct xfrm_mode xfrm6_mode_map[XFRM_MODE_MAX] = { [XFRM_MODE_BEET] = { .encap = XFRM_MODE_BEET, .flags = XFRM_MODE_FLAG_TUNNEL, .family = AF_INET6, }, [XFRM_MODE_ROUTEOPTIMIZATION] = { .encap = XFRM_MODE_ROUTEOPTIMIZATION, .family = AF_INET6, }, [XFRM_MODE_TRANSPORT] = { .encap = XFRM_MODE_TRANSPORT, .family = AF_INET6, }, [XFRM_MODE_TUNNEL] = { .encap = XFRM_MODE_TUNNEL, .flags = XFRM_MODE_FLAG_TUNNEL, .family = AF_INET6, }, [XFRM_MODE_IPTFS] = { .encap = XFRM_MODE_IPTFS, .flags = XFRM_MODE_FLAG_TUNNEL, .family = AF_INET6, }, }; static const struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family) { const struct xfrm_mode *mode; if (unlikely(encap >= XFRM_MODE_MAX)) return NULL; switch (family) { case AF_INET: mode = &xfrm4_mode_map[encap]; if (mode->family == family) return mode; break; case AF_INET6: mode = &xfrm6_mode_map[encap]; if (mode->family == family) return mode; break; default: break; } return NULL; } static const struct xfrm_mode_cbs __rcu *xfrm_mode_cbs_map[XFRM_MODE_MAX]; static DEFINE_SPINLOCK(xfrm_mode_cbs_map_lock); int 
xfrm_register_mode_cbs(u8 mode, const struct xfrm_mode_cbs *mode_cbs) { if (mode >= XFRM_MODE_MAX) return -EINVAL; spin_lock_bh(&xfrm_mode_cbs_map_lock); rcu_assign_pointer(xfrm_mode_cbs_map[mode], mode_cbs); spin_unlock_bh(&xfrm_mode_cbs_map_lock); return 0; } EXPORT_SYMBOL(xfrm_register_mode_cbs); void xfrm_unregister_mode_cbs(u8 mode) { if (mode >= XFRM_MODE_MAX) return; spin_lock_bh(&xfrm_mode_cbs_map_lock); RCU_INIT_POINTER(xfrm_mode_cbs_map[mode], NULL); spin_unlock_bh(&xfrm_mode_cbs_map_lock); synchronize_rcu(); } EXPORT_SYMBOL(xfrm_unregister_mode_cbs); static const struct xfrm_mode_cbs *xfrm_get_mode_cbs(u8 mode) { const struct xfrm_mode_cbs *cbs; bool try_load = true; if (mode >= XFRM_MODE_MAX) return NULL; retry: rcu_read_lock(); cbs = rcu_dereference(xfrm_mode_cbs_map[mode]); if (cbs && !try_module_get(cbs->owner)) cbs = NULL; rcu_read_unlock(); if (mode == XFRM_MODE_IPTFS && !cbs && try_load) { request_module("xfrm-iptfs"); try_load = false; goto retry; } return cbs; } void xfrm_state_free(struct xfrm_state *x) { kmem_cache_free(xfrm_state_cache, x); } EXPORT_SYMBOL(xfrm_state_free); static void xfrm_state_delete_tunnel(struct xfrm_state *x); static void xfrm_state_gc_destroy(struct xfrm_state *x) { if (x->mode_cbs && x->mode_cbs->destroy_state) x->mode_cbs->destroy_state(x); hrtimer_cancel(&x->mtimer); timer_delete_sync(&x->rtimer); kfree_sensitive(x->aead); kfree_sensitive(x->aalg); kfree_sensitive(x->ealg); kfree(x->calg); kfree(x->encap); kfree(x->coaddr); kfree(x->replay_esn); kfree(x->preplay_esn); xfrm_unset_type_offload(x); xfrm_state_delete_tunnel(x); if (x->type) { x->type->destructor(x); xfrm_put_type(x->type); } if (x->xfrag.page) put_page(x->xfrag.page); xfrm_dev_state_free(x); security_xfrm_state_free(x); xfrm_state_free(x); } static void xfrm_state_gc_task(struct work_struct *work) { struct xfrm_state *x; struct hlist_node *tmp; struct hlist_head gc_list; spin_lock_bh(&xfrm_state_gc_lock); hlist_move_list(&xfrm_state_gc_list, &gc_list); spin_unlock_bh(&xfrm_state_gc_lock); synchronize_rcu(); hlist_for_each_entry_safe(x, tmp, &gc_list, gclist) xfrm_state_gc_destroy(x); } static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) { struct xfrm_state *x = container_of(me, struct xfrm_state, mtimer); enum hrtimer_restart ret = HRTIMER_NORESTART; time64_t now = ktime_get_real_seconds(); time64_t next = TIME64_MAX; int warn = 0; int err = 0; spin_lock(&x->lock); xfrm_dev_state_update_stats(x); if (x->km.state == XFRM_STATE_DEAD) goto out; if (x->km.state == XFRM_STATE_EXPIRED) goto expired; if (x->lft.hard_add_expires_seconds) { time64_t tmo = x->lft.hard_add_expires_seconds + x->curlft.add_time - now; if (tmo <= 0) { if (x->xflags & XFRM_SOFT_EXPIRE) { /* enter hard expire without soft expire first?! * setting a new date could trigger this. * workaround: fix x->curflt.add_time by below: */ x->curlft.add_time = now - x->saved_tmo - 1; tmo = x->lft.hard_add_expires_seconds - x->saved_tmo; } else goto expired; } if (tmo < next) next = tmo; } if (x->lft.hard_use_expires_seconds) { time64_t tmo = x->lft.hard_use_expires_seconds + (READ_ONCE(x->curlft.use_time) ? 
: now) - now; if (tmo <= 0) goto expired; if (tmo < next) next = tmo; } if (x->km.dying) goto resched; if (x->lft.soft_add_expires_seconds) { time64_t tmo = x->lft.soft_add_expires_seconds + x->curlft.add_time - now; if (tmo <= 0) { warn = 1; x->xflags &= ~XFRM_SOFT_EXPIRE; } else if (tmo < next) { next = tmo; x->xflags |= XFRM_SOFT_EXPIRE; x->saved_tmo = tmo; } } if (x->lft.soft_use_expires_seconds) { time64_t tmo = x->lft.soft_use_expires_seconds + (READ_ONCE(x->curlft.use_time) ? : now) - now; if (tmo <= 0) warn = 1; else if (tmo < next) next = tmo; } x->km.dying = warn; if (warn) km_state_expired(x, 0, 0); resched: if (next != TIME64_MAX) { hrtimer_forward_now(&x->mtimer, ktime_set(next, 0)); ret = HRTIMER_RESTART; } goto out; expired: if (x->km.state == XFRM_STATE_ACQ && x->id.spi == 0) x->km.state = XFRM_STATE_EXPIRED; err = __xfrm_state_delete(x); if (!err) km_state_expired(x, 1, 0); xfrm_audit_state_delete(x, err ? 0 : 1, true); out: spin_unlock(&x->lock); return ret; } static void xfrm_replay_timer_handler(struct timer_list *t); struct xfrm_state *xfrm_state_alloc(struct net *net) { struct xfrm_state *x; x = kmem_cache_zalloc(xfrm_state_cache, GFP_ATOMIC); if (x) { write_pnet(&x->xs_net, net); refcount_set(&x->refcnt, 1); atomic_set(&x->tunnel_users, 0); INIT_LIST_HEAD(&x->km.all); INIT_HLIST_NODE(&x->state_cache); INIT_HLIST_NODE(&x->bydst); INIT_HLIST_NODE(&x->bysrc); INIT_HLIST_NODE(&x->byspi); INIT_HLIST_NODE(&x->byseq); hrtimer_setup(&x->mtimer, xfrm_timer_handler, CLOCK_BOOTTIME, HRTIMER_MODE_ABS_SOFT); timer_setup(&x->rtimer, xfrm_replay_timer_handler, 0); x->curlft.add_time = ktime_get_real_seconds(); x->lft.soft_byte_limit = XFRM_INF; x->lft.soft_packet_limit = XFRM_INF; x->lft.hard_byte_limit = XFRM_INF; x->lft.hard_packet_limit = XFRM_INF; x->replay_maxage = 0; x->replay_maxdiff = 0; x->pcpu_num = UINT_MAX; spin_lock_init(&x->lock); x->mode_data = NULL; } return x; } EXPORT_SYMBOL(xfrm_state_alloc); #ifdef CONFIG_XFRM_OFFLOAD void xfrm_dev_state_delete(struct xfrm_state *x) { struct xfrm_dev_offload *xso = &x->xso; struct net_device *dev = READ_ONCE(xso->dev); if (dev) { dev->xfrmdev_ops->xdo_dev_state_delete(dev, x); spin_lock_bh(&xfrm_state_dev_gc_lock); hlist_add_head(&x->dev_gclist, &xfrm_state_dev_gc_list); spin_unlock_bh(&xfrm_state_dev_gc_lock); } } EXPORT_SYMBOL_GPL(xfrm_dev_state_delete); void xfrm_dev_state_free(struct xfrm_state *x) { struct xfrm_dev_offload *xso = &x->xso; struct net_device *dev = READ_ONCE(xso->dev); if (dev && dev->xfrmdev_ops) { spin_lock_bh(&xfrm_state_dev_gc_lock); if (!hlist_unhashed(&x->dev_gclist)) hlist_del(&x->dev_gclist); spin_unlock_bh(&xfrm_state_dev_gc_lock); if (dev->xfrmdev_ops->xdo_dev_state_free) dev->xfrmdev_ops->xdo_dev_state_free(dev, x); WRITE_ONCE(xso->dev, NULL); xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; netdev_put(dev, &xso->dev_tracker); } } #endif void __xfrm_state_destroy(struct xfrm_state *x) { WARN_ON(x->km.state != XFRM_STATE_DEAD); spin_lock_bh(&xfrm_state_gc_lock); hlist_add_head(&x->gclist, &xfrm_state_gc_list); spin_unlock_bh(&xfrm_state_gc_lock); schedule_work(&xfrm_state_gc_work); } EXPORT_SYMBOL(__xfrm_state_destroy); int __xfrm_state_delete(struct xfrm_state *x) { struct net *net = xs_net(x); int err = -ESRCH; if (x->km.state != XFRM_STATE_DEAD) { x->km.state = XFRM_STATE_DEAD; spin_lock(&net->xfrm.xfrm_state_lock); list_del(&x->km.all); hlist_del_rcu(&x->bydst); hlist_del_rcu(&x->bysrc); if (x->km.seq) hlist_del_rcu(&x->byseq); if (!hlist_unhashed(&x->state_cache)) hlist_del_rcu(&x->state_cache); if 
(!hlist_unhashed(&x->state_cache_input)) hlist_del_rcu(&x->state_cache_input); if (x->id.spi) hlist_del_rcu(&x->byspi); net->xfrm.state_num--; xfrm_nat_keepalive_state_updated(x); spin_unlock(&net->xfrm.xfrm_state_lock); xfrm_dev_state_delete(x); xfrm_state_delete_tunnel(x); /* All xfrm_state objects are created by xfrm_state_alloc. * The xfrm_state_alloc call gives a reference, and that * is what we are dropping here. */ xfrm_state_put(x); err = 0; } return err; } EXPORT_SYMBOL(__xfrm_state_delete); int xfrm_state_delete(struct xfrm_state *x) { int err; spin_lock_bh(&x->lock); err = __xfrm_state_delete(x); spin_unlock_bh(&x->lock); return err; } EXPORT_SYMBOL(xfrm_state_delete); #ifdef CONFIG_SECURITY_NETWORK_XFRM static inline int xfrm_state_flush_secctx_check(struct net *net, u8 proto, bool task_valid) { int i, err = 0; for (i = 0; i <= net->xfrm.state_hmask; i++) { struct xfrm_state *x; hlist_for_each_entry(x, xfrm_state_deref_prot(net->xfrm.state_bydst, net) + i, bydst) { if (xfrm_id_proto_match(x->id.proto, proto) && (err = security_xfrm_state_delete(x)) != 0) { xfrm_audit_state_delete(x, 0, task_valid); return err; } } } return err; } static inline int xfrm_dev_state_flush_secctx_check(struct net *net, struct net_device *dev, bool task_valid) { int i, err = 0; for (i = 0; i <= net->xfrm.state_hmask; i++) { struct xfrm_state *x; struct xfrm_dev_offload *xso; hlist_for_each_entry(x, xfrm_state_deref_prot(net->xfrm.state_bydst, net) + i, bydst) { xso = &x->xso; if (xso->dev == dev && (err = security_xfrm_state_delete(x)) != 0) { xfrm_audit_state_delete(x, 0, task_valid); return err; } } } return err; } #else static inline int xfrm_state_flush_secctx_check(struct net *net, u8 proto, bool task_valid) { return 0; } static inline int xfrm_dev_state_flush_secctx_check(struct net *net, struct net_device *dev, bool task_valid) { return 0; } #endif int xfrm_state_flush(struct net *net, u8 proto, bool task_valid) { int i, err = 0, cnt = 0; spin_lock_bh(&net->xfrm.xfrm_state_lock); err = xfrm_state_flush_secctx_check(net, proto, task_valid); if (err) goto out; err = -ESRCH; for (i = 0; i <= net->xfrm.state_hmask; i++) { struct xfrm_state *x; restart: hlist_for_each_entry(x, xfrm_state_deref_prot(net->xfrm.state_bydst, net) + i, bydst) { if (!xfrm_state_kern(x) && xfrm_id_proto_match(x->id.proto, proto)) { xfrm_state_hold(x); spin_unlock_bh(&net->xfrm.xfrm_state_lock); err = xfrm_state_delete(x); xfrm_audit_state_delete(x, err ? 0 : 1, task_valid); xfrm_state_put(x); if (!err) cnt++; spin_lock_bh(&net->xfrm.xfrm_state_lock); goto restart; } } } out: spin_unlock_bh(&net->xfrm.xfrm_state_lock); if (cnt) err = 0; return err; } EXPORT_SYMBOL(xfrm_state_flush); int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid) { struct xfrm_state *x; struct hlist_node *tmp; struct xfrm_dev_offload *xso; int i, err = 0, cnt = 0; spin_lock_bh(&net->xfrm.xfrm_state_lock); err = xfrm_dev_state_flush_secctx_check(net, dev, task_valid); if (err) goto out; err = -ESRCH; for (i = 0; i <= net->xfrm.state_hmask; i++) { restart: hlist_for_each_entry(x, xfrm_state_deref_prot(net->xfrm.state_bydst, net) + i, bydst) { xso = &x->xso; if (!xfrm_state_kern(x) && xso->dev == dev) { xfrm_state_hold(x); spin_unlock_bh(&net->xfrm.xfrm_state_lock); err = xfrm_state_delete(x); xfrm_dev_state_free(x); xfrm_audit_state_delete(x, err ? 
0 : 1, task_valid); xfrm_state_put(x); if (!err) cnt++; spin_lock_bh(&net->xfrm.xfrm_state_lock); goto restart; } } } if (cnt) err = 0; out: spin_unlock_bh(&net->xfrm.xfrm_state_lock); spin_lock_bh(&xfrm_state_dev_gc_lock); restart_gc: hlist_for_each_entry_safe(x, tmp, &xfrm_state_dev_gc_list, dev_gclist) { xso = &x->xso; if (xso->dev == dev) { spin_unlock_bh(&xfrm_state_dev_gc_lock); xfrm_dev_state_free(x); spin_lock_bh(&xfrm_state_dev_gc_lock); goto restart_gc; } } spin_unlock_bh(&xfrm_state_dev_gc_lock); xfrm_flush_gc(); return err; } EXPORT_SYMBOL(xfrm_dev_state_flush); void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si) { spin_lock_bh(&net->xfrm.xfrm_state_lock); si->sadcnt = net->xfrm.state_num; si->sadhcnt = net->xfrm.state_hmask + 1; si->sadhmcnt = xfrm_state_hashmax; spin_unlock_bh(&net->xfrm.xfrm_state_lock); } EXPORT_SYMBOL(xfrm_sad_getinfo); static void __xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl) { const struct flowi4 *fl4 = &fl->u.ip4; sel->daddr.a4 = fl4->daddr; sel->saddr.a4 = fl4->saddr; sel->dport = xfrm_flowi_dport(fl, &fl4->uli); sel->dport_mask = htons(0xffff); sel->sport = xfrm_flowi_sport(fl, &fl4->uli); sel->sport_mask = htons(0xffff); sel->family = AF_INET; sel->prefixlen_d = 32; sel->prefixlen_s = 32; sel->proto = fl4->flowi4_proto; sel->ifindex = fl4->flowi4_oif; } static void __xfrm6_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl) { const struct flowi6 *fl6 = &fl->u.ip6; /* Initialize temporary selector matching only to current session. */ *(struct in6_addr *)&sel->daddr = fl6->daddr; *(struct in6_addr *)&sel->saddr = fl6->saddr; sel->dport = xfrm_flowi_dport(fl, &fl6->uli); sel->dport_mask = htons(0xffff); sel->sport = xfrm_flowi_sport(fl, &fl6->uli); sel->sport_mask = htons(0xffff); sel->family = AF_INET6; sel->prefixlen_d = 128; sel->prefixlen_s = 128; sel->proto = fl6->flowi6_proto; sel->ifindex = fl6->flowi6_oif; } static void xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl, const struct xfrm_tmpl *tmpl, const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family) { switch (family) { case AF_INET: __xfrm4_init_tempsel(&x->sel, fl); break; case AF_INET6: __xfrm6_init_tempsel(&x->sel, fl); break; } x->id = tmpl->id; switch (tmpl->encap_family) { case AF_INET: if (x->id.daddr.a4 == 0) x->id.daddr.a4 = daddr->a4; x->props.saddr = tmpl->saddr; if (x->props.saddr.a4 == 0) x->props.saddr.a4 = saddr->a4; break; case AF_INET6: if (ipv6_addr_any((struct in6_addr *)&x->id.daddr)) memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr)); memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr)); if (ipv6_addr_any((struct in6_addr *)&x->props.saddr)) memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr)); break; } x->props.mode = tmpl->mode; x->props.reqid = tmpl->reqid; x->props.family = tmpl->encap_family; } struct xfrm_hash_state_ptrs { const struct hlist_head *bydst; const struct hlist_head *bysrc; const struct hlist_head *byspi; unsigned int hmask; }; static void xfrm_hash_ptrs_get(const struct net *net, struct xfrm_hash_state_ptrs *ptrs) { unsigned int sequence; do { sequence = read_seqcount_begin(&net->xfrm.xfrm_state_hash_generation); ptrs->bydst = xfrm_state_deref_check(net->xfrm.state_bydst, net); ptrs->bysrc = xfrm_state_deref_check(net->xfrm.state_bysrc, net); ptrs->byspi = xfrm_state_deref_check(net->xfrm.state_byspi, net); ptrs->hmask = net->xfrm.state_hmask; } while (read_seqcount_retry(&net->xfrm.xfrm_state_hash_generation, sequence)); } static struct xfrm_state 
*__xfrm_state_lookup_all(const struct xfrm_hash_state_ptrs *state_ptrs, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family, struct xfrm_dev_offload *xdo) { unsigned int h = __xfrm_spi_hash(daddr, spi, proto, family, state_ptrs->hmask); struct xfrm_state *x; hlist_for_each_entry_rcu(x, state_ptrs->byspi + h, byspi) { #ifdef CONFIG_XFRM_OFFLOAD if (xdo->type == XFRM_DEV_OFFLOAD_PACKET) { if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) /* HW states are in the head of list, there is * no need to iterate further. */ break; /* Packet offload: both policy and SA should * have same device. */ if (xdo->dev != x->xso.dev) continue; } else if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) /* Skip HW policy for SW lookups */ continue; #endif if (x->props.family != family || x->id.spi != spi || x->id.proto != proto || !xfrm_addr_equal(&x->id.daddr, daddr, family)) continue; if ((mark & x->mark.m) != x->mark.v) continue; if (!xfrm_state_hold_rcu(x)) continue; return x; } return NULL; } static struct xfrm_state *__xfrm_state_lookup(const struct xfrm_hash_state_ptrs *state_ptrs, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family) { unsigned int h = __xfrm_spi_hash(daddr, spi, proto, family, state_ptrs->hmask); struct xfrm_state *x; hlist_for_each_entry_rcu(x, state_ptrs->byspi + h, byspi) { if (x->props.family != family || x->id.spi != spi || x->id.proto != proto || !xfrm_addr_equal(&x->id.daddr, daddr, family)) continue; if ((mark & x->mark.m) != x->mark.v) continue; if (!xfrm_state_hold_rcu(x)) continue; return x; } return NULL; } struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family) { struct xfrm_hash_state_ptrs state_ptrs; struct hlist_head *state_cache_input; struct xfrm_state *x = NULL; state_cache_input = raw_cpu_ptr(net->xfrm.state_cache_input); rcu_read_lock(); hlist_for_each_entry_rcu(x, state_cache_input, state_cache_input) { if (x->props.family != family || x->id.spi != spi || x->id.proto != proto || !xfrm_addr_equal(&x->id.daddr, daddr, family)) continue; if ((mark & x->mark.m) != x->mark.v) continue; if (!xfrm_state_hold_rcu(x)) continue; goto out; } xfrm_hash_ptrs_get(net, &state_ptrs); x = __xfrm_state_lookup(&state_ptrs, mark, daddr, spi, proto, family); if (x && x->km.state == XFRM_STATE_VALID) { spin_lock_bh(&net->xfrm.xfrm_state_lock); if (hlist_unhashed(&x->state_cache_input)) { hlist_add_head_rcu(&x->state_cache_input, state_cache_input); } else { hlist_del_rcu(&x->state_cache_input); hlist_add_head_rcu(&x->state_cache_input, state_cache_input); } spin_unlock_bh(&net->xfrm.xfrm_state_lock); } out: rcu_read_unlock(); return x; } EXPORT_SYMBOL(xfrm_input_state_lookup); static struct xfrm_state *__xfrm_state_lookup_byaddr(const struct xfrm_hash_state_ptrs *state_ptrs, u32 mark, const xfrm_address_t *daddr, const xfrm_address_t *saddr, u8 proto, unsigned short family) { unsigned int h = __xfrm_src_hash(daddr, saddr, family, state_ptrs->hmask); struct xfrm_state *x; hlist_for_each_entry_rcu(x, state_ptrs->bysrc + h, bysrc) { if (x->props.family != family || x->id.proto != proto || !xfrm_addr_equal(&x->id.daddr, daddr, family) || !xfrm_addr_equal(&x->props.saddr, saddr, family)) continue; if ((mark & x->mark.m) != x->mark.v) continue; if (!xfrm_state_hold_rcu(x)) continue; return x; } return NULL; } static inline struct xfrm_state * __xfrm_state_locate(struct xfrm_state *x, int use_spi, int family) { struct xfrm_hash_state_ptrs state_ptrs; struct 
net *net = xs_net(x); u32 mark = x->mark.v & x->mark.m; xfrm_hash_ptrs_get(net, &state_ptrs); if (use_spi) return __xfrm_state_lookup(&state_ptrs, mark, &x->id.daddr, x->id.spi, x->id.proto, family); else return __xfrm_state_lookup_byaddr(&state_ptrs, mark, &x->id.daddr, &x->props.saddr, x->id.proto, family); } static void xfrm_hash_grow_check(struct net *net, int have_hash_collision) { if (have_hash_collision && (net->xfrm.state_hmask + 1) < xfrm_state_hashmax && net->xfrm.state_num > net->xfrm.state_hmask) schedule_work(&net->xfrm.state_hash_work); } static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x, const struct flowi *fl, unsigned short family, struct xfrm_state **best, int *acq_in_progress, int *error, unsigned int pcpu_id) { /* Resolution logic: * 1. There is a valid state with matching selector. Done. * 2. Valid state with inappropriate selector. Skip. * * Entering area of "sysdeps". * * 3. If state is not valid, selector is temporary, it selects * only session which triggered previous resolution. Key * manager will do something to install a state with proper * selector. */ if (x->km.state == XFRM_STATE_VALID) { if ((x->sel.family && (x->sel.family != family || !xfrm_selector_match(&x->sel, fl, family))) || !security_xfrm_state_pol_flow_match(x, pol, &fl->u.__fl_common)) return; if (x->pcpu_num != UINT_MAX && x->pcpu_num != pcpu_id) return; if (!*best || ((*best)->pcpu_num == UINT_MAX && x->pcpu_num == pcpu_id) || (*best)->km.dying > x->km.dying || ((*best)->km.dying == x->km.dying && (*best)->curlft.add_time < x->curlft.add_time)) *best = x; } else if (x->km.state == XFRM_STATE_ACQ) { if (!*best || x->pcpu_num == pcpu_id) *acq_in_progress = 1; } else if (x->km.state == XFRM_STATE_ERROR || x->km.state == XFRM_STATE_EXPIRED) { if ((!x->sel.family || (x->sel.family == family && xfrm_selector_match(&x->sel, fl, family))) && security_xfrm_state_pol_flow_match(x, pol, &fl->u.__fl_common)) *error = -ESRCH; } } struct xfrm_state * xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, const struct flowi *fl, struct xfrm_tmpl *tmpl, struct xfrm_policy *pol, int *err, unsigned short family, u32 if_id) { static xfrm_address_t saddr_wildcard = { }; struct xfrm_hash_state_ptrs state_ptrs; struct net *net = xp_net(pol); unsigned int h, h_wildcard; struct xfrm_state *x, *x0, *to_put; int acquire_in_progress = 0; int error = 0; struct xfrm_state *best = NULL; u32 mark = pol->mark.v & pol->mark.m; unsigned short encap_family = tmpl->encap_family; unsigned int sequence; struct km_event c; unsigned int pcpu_id; bool cached = false; /* We need the cpu id just as a lookup key, * we don't require it to be stable. 
*/ pcpu_id = raw_smp_processor_id(); to_put = NULL; sequence = read_seqcount_begin(&net->xfrm.xfrm_state_hash_generation); rcu_read_lock(); xfrm_hash_ptrs_get(net, &state_ptrs); hlist_for_each_entry_rcu(x, &pol->state_cache_list, state_cache) { if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && x->if_id == if_id && !(x->props.flags & XFRM_STATE_WILDRECV) && xfrm_state_addr_check(x, daddr, saddr, encap_family) && tmpl->mode == x->props.mode && tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) xfrm_state_look_at(pol, x, fl, encap_family, &best, &acquire_in_progress, &error, pcpu_id); } if (best) goto cached; hlist_for_each_entry_rcu(x, &pol->state_cache_list, state_cache) { if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && x->if_id == if_id && !(x->props.flags & XFRM_STATE_WILDRECV) && xfrm_addr_equal(&x->id.daddr, daddr, encap_family) && tmpl->mode == x->props.mode && tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) xfrm_state_look_at(pol, x, fl, family, &best, &acquire_in_progress, &error, pcpu_id); } cached: cached = true; if (best) goto found; else if (error) best = NULL; else if (acquire_in_progress) /* XXX: acquire_in_progress should not happen */ WARN_ON(1); h = __xfrm_dst_hash(daddr, saddr, tmpl->reqid, encap_family, state_ptrs.hmask); hlist_for_each_entry_rcu(x, state_ptrs.bydst + h, bydst) { #ifdef CONFIG_XFRM_OFFLOAD if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) /* HW states are in the head of list, there is * no need to iterate further. */ break; /* Packet offload: both policy and SA should * have same device. */ if (pol->xdo.dev != x->xso.dev) continue; } else if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) /* Skip HW policy for SW lookups */ continue; #endif if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && x->if_id == if_id && !(x->props.flags & XFRM_STATE_WILDRECV) && xfrm_state_addr_check(x, daddr, saddr, encap_family) && tmpl->mode == x->props.mode && tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) xfrm_state_look_at(pol, x, fl, family, &best, &acquire_in_progress, &error, pcpu_id); } if (best || acquire_in_progress) goto found; h_wildcard = __xfrm_dst_hash(daddr, &saddr_wildcard, tmpl->reqid, encap_family, state_ptrs.hmask); hlist_for_each_entry_rcu(x, state_ptrs.bydst + h_wildcard, bydst) { #ifdef CONFIG_XFRM_OFFLOAD if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) /* HW states are in the head of list, there is * no need to iterate further. */ break; /* Packet offload: both policy and SA should * have same device. 
*/ if (pol->xdo.dev != x->xso.dev) continue; } else if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) /* Skip HW policy for SW lookups */ continue; #endif if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && x->if_id == if_id && !(x->props.flags & XFRM_STATE_WILDRECV) && xfrm_addr_equal(&x->id.daddr, daddr, encap_family) && tmpl->mode == x->props.mode && tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) xfrm_state_look_at(pol, x, fl, family, &best, &acquire_in_progress, &error, pcpu_id); } found: if (!(pol->flags & XFRM_POLICY_CPU_ACQUIRE) || (best && (best->pcpu_num == pcpu_id))) x = best; if (!x && !error && !acquire_in_progress) { if (tmpl->id.spi && (x0 = __xfrm_state_lookup_all(&state_ptrs, mark, daddr, tmpl->id.spi, tmpl->id.proto, encap_family, &pol->xdo)) != NULL) { to_put = x0; error = -EEXIST; goto out; } c.net = net; /* If the KMs have no listeners (yet...), avoid allocating an SA * for each and every packet - garbage collection might not * handle the flood. */ if (!km_is_alive(&c)) { error = -ESRCH; goto out; } x = xfrm_state_alloc(net); if (x == NULL) { error = -ENOMEM; goto out; } /* Initialize temporary state matching only * to current session. */ xfrm_init_tempstate(x, fl, tmpl, daddr, saddr, family); memcpy(&x->mark, &pol->mark, sizeof(x->mark)); x->if_id = if_id; if ((pol->flags & XFRM_POLICY_CPU_ACQUIRE) && best) x->pcpu_num = pcpu_id; error = security_xfrm_state_alloc_acquire(x, pol->security, fl->flowi_secid); if (error) { x->km.state = XFRM_STATE_DEAD; to_put = x; x = NULL; goto out; } #ifdef CONFIG_XFRM_OFFLOAD if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { struct xfrm_dev_offload *xdo = &pol->xdo; struct xfrm_dev_offload *xso = &x->xso; struct net_device *dev = xdo->dev; xso->type = XFRM_DEV_OFFLOAD_PACKET; xso->dir = xdo->dir; xso->dev = dev; xso->flags = XFRM_DEV_OFFLOAD_FLAG_ACQ; netdev_hold(dev, &xso->dev_tracker, GFP_ATOMIC); error = dev->xfrmdev_ops->xdo_dev_state_add(dev, x, NULL); if (error) { xso->dir = 0; netdev_put(dev, &xso->dev_tracker); xso->dev = NULL; xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; x->km.state = XFRM_STATE_DEAD; to_put = x; x = NULL; goto out; } } #endif if (km_query(x, tmpl, pol) == 0) { spin_lock_bh(&net->xfrm.xfrm_state_lock); x->km.state = XFRM_STATE_ACQ; x->dir = XFRM_SA_DIR_OUT; list_add(&x->km.all, &net->xfrm.state_all); h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); XFRM_STATE_INSERT(bydst, &x->bydst, xfrm_state_deref_prot(net->xfrm.state_bydst, net) + h, x->xso.type); h = xfrm_src_hash(net, daddr, saddr, encap_family); XFRM_STATE_INSERT(bysrc, &x->bysrc, xfrm_state_deref_prot(net->xfrm.state_bysrc, net) + h, x->xso.type); INIT_HLIST_NODE(&x->state_cache); if (x->id.spi) { h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family); XFRM_STATE_INSERT(byspi, &x->byspi, xfrm_state_deref_prot(net->xfrm.state_byspi, net) + h, x->xso.type); } if (x->km.seq) { h = xfrm_seq_hash(net, x->km.seq); XFRM_STATE_INSERT(byseq, &x->byseq, xfrm_state_deref_prot(net->xfrm.state_byseq, net) + h, x->xso.type); } x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires; hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL_SOFT); net->xfrm.state_num++; xfrm_hash_grow_check(net, x->bydst.next != NULL); spin_unlock_bh(&net->xfrm.xfrm_state_lock); } else { #ifdef CONFIG_XFRM_OFFLOAD struct xfrm_dev_offload *xso = &x->xso; if (xso->type == XFRM_DEV_OFFLOAD_PACKET) { xfrm_dev_state_delete(x); 
xfrm_dev_state_free(x); } #endif x->km.state = XFRM_STATE_DEAD; to_put = x; x = NULL; error = -ESRCH; } /* Use the already installed 'fallback' while the CPU-specific * SA acquire is handled*/ if (best) x = best; } out: if (x) { if (!xfrm_state_hold_rcu(x)) { *err = -EAGAIN; x = NULL; } } else { *err = acquire_in_progress ? -EAGAIN : error; } if (x && x->km.state == XFRM_STATE_VALID && !cached && (!(pol->flags & XFRM_POLICY_CPU_ACQUIRE) || x->pcpu_num == pcpu_id)) { spin_lock_bh(&net->xfrm.xfrm_state_lock); if (hlist_unhashed(&x->state_cache)) hlist_add_head_rcu(&x->state_cache, &pol->state_cache_list); spin_unlock_bh(&net->xfrm.xfrm_state_lock); } rcu_read_unlock(); if (to_put) xfrm_state_put(to_put); if (read_seqcount_retry(&net->xfrm.xfrm_state_hash_generation, sequence)) { *err = -EAGAIN; if (x) { xfrm_state_put(x); x = NULL; } } return x; } struct xfrm_state * xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id, xfrm_address_t *daddr, xfrm_address_t *saddr, unsigned short family, u8 mode, u8 proto, u32 reqid) { unsigned int h; struct xfrm_state *rx = NULL, *x = NULL; spin_lock_bh(&net->xfrm.xfrm_state_lock); h = xfrm_dst_hash(net, daddr, saddr, reqid, family); hlist_for_each_entry(x, xfrm_state_deref_prot(net->xfrm.state_bydst, net) + h, bydst) { if (x->props.family == family && x->props.reqid == reqid && (mark & x->mark.m) == x->mark.v && x->if_id == if_id && !(x->props.flags & XFRM_STATE_WILDRECV) && xfrm_state_addr_check(x, daddr, saddr, family) && mode == x->props.mode && proto == x->id.proto && x->km.state == XFRM_STATE_VALID) { rx = x; break; } } if (rx) xfrm_state_hold(rx); spin_unlock_bh(&net->xfrm.xfrm_state_lock); return rx; } EXPORT_SYMBOL(xfrm_stateonly_find); struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi, unsigned short family) { struct xfrm_state *x; struct xfrm_state_walk *w; spin_lock_bh(&net->xfrm.xfrm_state_lock); list_for_each_entry(w, &net->xfrm.state_all, all) { x = container_of(w, struct xfrm_state, km); if (x->props.family != family || x->id.spi != spi) continue; xfrm_state_hold(x); spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; } spin_unlock_bh(&net->xfrm.xfrm_state_lock); return NULL; } EXPORT_SYMBOL(xfrm_state_lookup_byspi); static struct xfrm_state *xfrm_state_lookup_spi_proto(struct net *net, __be32 spi, u8 proto) { struct xfrm_state *x; unsigned int i; for (i = 0; i <= net->xfrm.state_hmask; i++) { hlist_for_each_entry(x, xfrm_state_deref_prot(net->xfrm.state_byspi, net) + i, byspi) { if (x->id.spi == spi && x->id.proto == proto) return x; } } return NULL; } static void __xfrm_state_insert(struct xfrm_state *x) { struct net *net = xs_net(x); unsigned int h; list_add(&x->km.all, &net->xfrm.state_all); /* Sanitize mark before store */ x->mark.v &= x->mark.m; h = xfrm_dst_hash(net, &x->id.daddr, &x->props.saddr, x->props.reqid, x->props.family); XFRM_STATE_INSERT(bydst, &x->bydst, xfrm_state_deref_prot(net->xfrm.state_bydst, net) + h, x->xso.type); h = xfrm_src_hash(net, &x->id.daddr, &x->props.saddr, x->props.family); XFRM_STATE_INSERT(bysrc, &x->bysrc, xfrm_state_deref_prot(net->xfrm.state_bysrc, net) + h, x->xso.type); if (x->id.spi) { h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family); XFRM_STATE_INSERT(byspi, &x->byspi, xfrm_state_deref_prot(net->xfrm.state_byspi, net) + h, x->xso.type); } if (x->km.seq) { h = xfrm_seq_hash(net, x->km.seq); XFRM_STATE_INSERT(byseq, &x->byseq, xfrm_state_deref_prot(net->xfrm.state_byseq, net) + h, x->xso.type); } hrtimer_start(&x->mtimer, ktime_set(1, 0), 
HRTIMER_MODE_REL_SOFT); if (x->replay_maxage) mod_timer(&x->rtimer, jiffies + x->replay_maxage); net->xfrm.state_num++; xfrm_hash_grow_check(net, x->bydst.next != NULL); xfrm_nat_keepalive_state_updated(x); } /* net->xfrm.xfrm_state_lock is held */ static void __xfrm_state_bump_genids(struct xfrm_state *xnew) { struct net *net = xs_net(xnew); unsigned short family = xnew->props.family; u32 reqid = xnew->props.reqid; struct xfrm_state *x; unsigned int h; u32 mark = xnew->mark.v & xnew->mark.m; u32 if_id = xnew->if_id; u32 cpu_id = xnew->pcpu_num; h = xfrm_dst_hash(net, &xnew->id.daddr, &xnew->props.saddr, reqid, family); hlist_for_each_entry(x, xfrm_state_deref_prot(net->xfrm.state_bydst, net) + h, bydst) { if (x->props.family == family && x->props.reqid == reqid && x->if_id == if_id && x->pcpu_num == cpu_id && (mark & x->mark.m) == x->mark.v && xfrm_addr_equal(&x->id.daddr, &xnew->id.daddr, family) && xfrm_addr_equal(&x->props.saddr, &xnew->props.saddr, family)) x->genid++; } } void xfrm_state_insert(struct xfrm_state *x) { struct net *net = xs_net(x); spin_lock_bh(&net->xfrm.xfrm_state_lock); __xfrm_state_bump_genids(x); __xfrm_state_insert(x); spin_unlock_bh(&net->xfrm.xfrm_state_lock); } EXPORT_SYMBOL(xfrm_state_insert); /* net->xfrm.xfrm_state_lock is held */ static struct xfrm_state *__find_acq_core(struct net *net, const struct xfrm_mark *m, unsigned short family, u8 mode, u32 reqid, u32 if_id, u32 pcpu_num, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create) { unsigned int h = xfrm_dst_hash(net, daddr, saddr, reqid, family); struct xfrm_state *x; u32 mark = m->v & m->m; hlist_for_each_entry(x, xfrm_state_deref_prot(net->xfrm.state_bydst, net) + h, bydst) { if (x->props.reqid != reqid || x->props.mode != mode || x->props.family != family || x->km.state != XFRM_STATE_ACQ || x->id.spi != 0 || x->id.proto != proto || (mark & x->mark.m) != x->mark.v || x->pcpu_num != pcpu_num || !xfrm_addr_equal(&x->id.daddr, daddr, family) || !xfrm_addr_equal(&x->props.saddr, saddr, family)) continue; xfrm_state_hold(x); return x; } if (!create) return NULL; x = xfrm_state_alloc(net); if (likely(x)) { switch (family) { case AF_INET: x->sel.daddr.a4 = daddr->a4; x->sel.saddr.a4 = saddr->a4; x->sel.prefixlen_d = 32; x->sel.prefixlen_s = 32; x->props.saddr.a4 = saddr->a4; x->id.daddr.a4 = daddr->a4; break; case AF_INET6: x->sel.daddr.in6 = daddr->in6; x->sel.saddr.in6 = saddr->in6; x->sel.prefixlen_d = 128; x->sel.prefixlen_s = 128; x->props.saddr.in6 = saddr->in6; x->id.daddr.in6 = daddr->in6; break; } x->pcpu_num = pcpu_num; x->km.state = XFRM_STATE_ACQ; x->id.proto = proto; x->props.family = family; x->props.mode = mode; x->props.reqid = reqid; x->if_id = if_id; x->mark.v = m->v; x->mark.m = m->m; x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires; xfrm_state_hold(x); hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL_SOFT); list_add(&x->km.all, &net->xfrm.state_all); XFRM_STATE_INSERT(bydst, &x->bydst, xfrm_state_deref_prot(net->xfrm.state_bydst, net) + h, x->xso.type); h = xfrm_src_hash(net, daddr, saddr, family); XFRM_STATE_INSERT(bysrc, &x->bysrc, xfrm_state_deref_prot(net->xfrm.state_bysrc, net) + h, x->xso.type); net->xfrm.state_num++; xfrm_hash_grow_check(net, x->bydst.next != NULL); } return x; } static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num); int xfrm_state_add(struct xfrm_state *x) { struct net *net = xs_net(x); struct xfrm_state *x1, *to_put; int family; int err; u32 
mark = x->mark.v & x->mark.m; int use_spi = xfrm_id_proto_match(x->id.proto, IPSEC_PROTO_ANY); family = x->props.family; to_put = NULL; spin_lock_bh(&net->xfrm.xfrm_state_lock); x1 = __xfrm_state_locate(x, use_spi, family); if (x1) { to_put = x1; x1 = NULL; err = -EEXIST; goto out; } if (use_spi && x->km.seq) { x1 = __xfrm_find_acq_byseq(net, mark, x->km.seq, x->pcpu_num); if (x1 && ((x1->id.proto != x->id.proto) || !xfrm_addr_equal(&x1->id.daddr, &x->id.daddr, family))) { to_put = x1; x1 = NULL; } } if (use_spi && !x1) x1 = __find_acq_core(net, &x->mark, family, x->props.mode, x->props.reqid, x->if_id, x->pcpu_num, x->id.proto, &x->id.daddr, &x->props.saddr, 0); __xfrm_state_bump_genids(x); __xfrm_state_insert(x); err = 0; out: spin_unlock_bh(&net->xfrm.xfrm_state_lock); if (x1) { xfrm_state_delete(x1); xfrm_state_put(x1); } if (to_put) xfrm_state_put(to_put); return err; } EXPORT_SYMBOL(xfrm_state_add); #ifdef CONFIG_XFRM_MIGRATE static inline int clone_security(struct xfrm_state *x, struct xfrm_sec_ctx *security) { struct xfrm_user_sec_ctx *uctx; int size = sizeof(*uctx) + security->ctx_len; int err; uctx = kmalloc(size, GFP_KERNEL); if (!uctx) return -ENOMEM; uctx->exttype = XFRMA_SEC_CTX; uctx->len = size; uctx->ctx_doi = security->ctx_doi; uctx->ctx_alg = security->ctx_alg; uctx->ctx_len = security->ctx_len; memcpy(uctx + 1, security->ctx_str, security->ctx_len); err = security_xfrm_state_alloc(x, uctx); kfree(uctx); if (err) return err; return 0; } static struct xfrm_state *xfrm_state_clone_and_setup(struct xfrm_state *orig, struct xfrm_encap_tmpl *encap, struct xfrm_migrate *m) { struct net *net = xs_net(orig); struct xfrm_state *x = xfrm_state_alloc(net); if (!x) goto out; memcpy(&x->id, &orig->id, sizeof(x->id)); memcpy(&x->sel, &orig->sel, sizeof(x->sel)); memcpy(&x->lft, &orig->lft, sizeof(x->lft)); x->props.mode = orig->props.mode; x->props.replay_window = orig->props.replay_window; x->props.reqid = orig->props.reqid; x->props.family = orig->props.family; x->props.saddr = orig->props.saddr; if (orig->aalg) { x->aalg = xfrm_algo_auth_clone(orig->aalg); if (!x->aalg) goto error; } x->props.aalgo = orig->props.aalgo; if (orig->aead) { x->aead = xfrm_algo_aead_clone(orig->aead); x->geniv = orig->geniv; if (!x->aead) goto error; } if (orig->ealg) { x->ealg = xfrm_algo_clone(orig->ealg); if (!x->ealg) goto error; } x->props.ealgo = orig->props.ealgo; if (orig->calg) { x->calg = xfrm_algo_clone(orig->calg); if (!x->calg) goto error; } x->props.calgo = orig->props.calgo; if (encap || orig->encap) { if (encap) x->encap = kmemdup(encap, sizeof(*x->encap), GFP_KERNEL); else x->encap = kmemdup(orig->encap, sizeof(*x->encap), GFP_KERNEL); if (!x->encap) goto error; } if (orig->security) if (clone_security(x, orig->security)) goto error; if (orig->coaddr) { x->coaddr = kmemdup(orig->coaddr, sizeof(*x->coaddr), GFP_KERNEL); if (!x->coaddr) goto error; } if (orig->replay_esn) { if (xfrm_replay_clone(x, orig)) goto error; } memcpy(&x->mark, &orig->mark, sizeof(x->mark)); memcpy(&x->props.smark, &orig->props.smark, sizeof(x->props.smark)); x->props.flags = orig->props.flags; x->props.extra_flags = orig->props.extra_flags; x->pcpu_num = orig->pcpu_num; x->if_id = orig->if_id; x->tfcpad = orig->tfcpad; x->replay_maxdiff = orig->replay_maxdiff; x->replay_maxage = orig->replay_maxage; memcpy(&x->curlft, &orig->curlft, sizeof(x->curlft)); x->km.state = orig->km.state; x->km.seq = orig->km.seq; x->replay = orig->replay; x->preplay = orig->preplay; x->mapping_maxage = orig->mapping_maxage; x->lastused 
= orig->lastused; x->new_mapping = 0; x->new_mapping_sport = 0; x->dir = orig->dir; x->mode_cbs = orig->mode_cbs; if (x->mode_cbs && x->mode_cbs->clone_state) { if (x->mode_cbs->clone_state(x, orig)) goto error; } x->props.family = m->new_family; memcpy(&x->id.daddr, &m->new_daddr, sizeof(x->id.daddr)); memcpy(&x->props.saddr, &m->new_saddr, sizeof(x->props.saddr)); return x; error: x->km.state = XFRM_STATE_DEAD; xfrm_state_put(x); out: return NULL; } struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net, u32 if_id) { unsigned int h; struct xfrm_state *x = NULL; spin_lock_bh(&net->xfrm.xfrm_state_lock); if (m->reqid) { h = xfrm_dst_hash(net, &m->old_daddr, &m->old_saddr, m->reqid, m->old_family); hlist_for_each_entry(x, xfrm_state_deref_prot(net->xfrm.state_bydst, net) + h, bydst) { if (x->props.mode != m->mode || x->id.proto != m->proto) continue; if (m->reqid && x->props.reqid != m->reqid) continue; if (if_id != 0 && x->if_id != if_id) continue; if (!xfrm_addr_equal(&x->id.daddr, &m->old_daddr, m->old_family) || !xfrm_addr_equal(&x->props.saddr, &m->old_saddr, m->old_family)) continue; xfrm_state_hold(x); break; } } else { h = xfrm_src_hash(net, &m->old_daddr, &m->old_saddr, m->old_family); hlist_for_each_entry(x, xfrm_state_deref_prot(net->xfrm.state_bysrc, net) + h, bysrc) { if (x->props.mode != m->mode || x->id.proto != m->proto) continue; if (if_id != 0 && x->if_id != if_id) continue; if (!xfrm_addr_equal(&x->id.daddr, &m->old_daddr, m->old_family) || !xfrm_addr_equal(&x->props.saddr, &m->old_saddr, m->old_family)) continue; xfrm_state_hold(x); break; } } spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; } EXPORT_SYMBOL(xfrm_migrate_state_find); struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, struct xfrm_migrate *m, struct xfrm_encap_tmpl *encap, struct net *net, struct xfrm_user_offload *xuo, struct netlink_ext_ack *extack) { struct xfrm_state *xc; xc = xfrm_state_clone_and_setup(x, encap, m); if (!xc) return NULL; if (xfrm_init_state(xc) < 0) goto error; /* configure the hardware if offload is requested */ if (xuo && xfrm_dev_state_add(net, xc, xuo, extack)) goto error; /* add state */ if (xfrm_addr_equal(&x->id.daddr, &m->new_daddr, m->new_family)) { /* a care is needed when the destination address of the state is to be updated as it is a part of triplet */ xfrm_state_insert(xc); } else { if (xfrm_state_add(xc) < 0) goto error_add; } return xc; error_add: if (xuo) xfrm_dev_state_delete(xc); error: xc->km.state = XFRM_STATE_DEAD; xfrm_state_put(xc); return NULL; } EXPORT_SYMBOL(xfrm_state_migrate); #endif int xfrm_state_update(struct xfrm_state *x) { struct xfrm_state *x1, *to_put; int err; int use_spi = xfrm_id_proto_match(x->id.proto, IPSEC_PROTO_ANY); struct net *net = xs_net(x); to_put = NULL; spin_lock_bh(&net->xfrm.xfrm_state_lock); x1 = __xfrm_state_locate(x, use_spi, x->props.family); err = -ESRCH; if (!x1) goto out; if (xfrm_state_kern(x1)) { to_put = x1; err = -EEXIST; goto out; } if (x1->km.state == XFRM_STATE_ACQ) { if (x->dir && x1->dir != x->dir) { to_put = x1; goto out; } __xfrm_state_insert(x); x = NULL; } else { if (x1->dir != x->dir) { to_put = x1; goto out; } } err = 0; out: spin_unlock_bh(&net->xfrm.xfrm_state_lock); if (to_put) xfrm_state_put(to_put); if (err) return err; if (!x) { xfrm_state_delete(x1); xfrm_state_put(x1); return 0; } err = -EINVAL; spin_lock_bh(&x1->lock); if (likely(x1->km.state == XFRM_STATE_VALID)) { if (x->encap && x1->encap && x->encap->encap_type == x1->encap->encap_type) memcpy(x1->encap, 
x->encap, sizeof(*x1->encap)); else if (x->encap || x1->encap) goto fail; if (x->coaddr && x1->coaddr) { memcpy(x1->coaddr, x->coaddr, sizeof(*x1->coaddr)); } if (!use_spi && memcmp(&x1->sel, &x->sel, sizeof(x1->sel))) memcpy(&x1->sel, &x->sel, sizeof(x1->sel)); memcpy(&x1->lft, &x->lft, sizeof(x1->lft)); x1->km.dying = 0; hrtimer_start(&x1->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); if (READ_ONCE(x1->curlft.use_time)) xfrm_state_check_expire(x1); if (x->props.smark.m || x->props.smark.v || x->if_id) { spin_lock_bh(&net->xfrm.xfrm_state_lock); if (x->props.smark.m || x->props.smark.v) x1->props.smark = x->props.smark; if (x->if_id) x1->if_id = x->if_id; __xfrm_state_bump_genids(x1); spin_unlock_bh(&net->xfrm.xfrm_state_lock); } err = 0; x->km.state = XFRM_STATE_DEAD; xfrm_dev_state_delete(x); __xfrm_state_put(x); } fail: spin_unlock_bh(&x1->lock); xfrm_state_put(x1); return err; } EXPORT_SYMBOL(xfrm_state_update); int xfrm_state_check_expire(struct xfrm_state *x) { /* All counters which are needed to decide if state is expired * are handled by SW for non-packet offload modes. Simply skip * the following update and save extra boilerplate in drivers. */ if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) xfrm_dev_state_update_stats(x); if (!READ_ONCE(x->curlft.use_time)) WRITE_ONCE(x->curlft.use_time, ktime_get_real_seconds()); if (x->curlft.bytes >= x->lft.hard_byte_limit || x->curlft.packets >= x->lft.hard_packet_limit) { x->km.state = XFRM_STATE_EXPIRED; hrtimer_start(&x->mtimer, 0, HRTIMER_MODE_REL_SOFT); return -EINVAL; } if (!x->km.dying && (x->curlft.bytes >= x->lft.soft_byte_limit || x->curlft.packets >= x->lft.soft_packet_limit)) { x->km.dying = 1; km_state_expired(x, 0, 0); } return 0; } EXPORT_SYMBOL(xfrm_state_check_expire); void xfrm_state_update_stats(struct net *net) { struct xfrm_state *x; int i; spin_lock_bh(&net->xfrm.xfrm_state_lock); for (i = 0; i <= net->xfrm.state_hmask; i++) { hlist_for_each_entry(x, xfrm_state_deref_prot(net->xfrm.state_bydst, net) + i, bydst) xfrm_dev_state_update_stats(x); } spin_unlock_bh(&net->xfrm.xfrm_state_lock); } struct xfrm_state * xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family) { struct xfrm_hash_state_ptrs state_ptrs; struct xfrm_state *x; rcu_read_lock(); xfrm_hash_ptrs_get(net, &state_ptrs); x = __xfrm_state_lookup(&state_ptrs, mark, daddr, spi, proto, family); rcu_read_unlock(); return x; } EXPORT_SYMBOL(xfrm_state_lookup); struct xfrm_state * xfrm_state_lookup_byaddr(struct net *net, u32 mark, const xfrm_address_t *daddr, const xfrm_address_t *saddr, u8 proto, unsigned short family) { struct xfrm_hash_state_ptrs state_ptrs; struct xfrm_state *x; rcu_read_lock(); xfrm_hash_ptrs_get(net, &state_ptrs); x = __xfrm_state_lookup_byaddr(&state_ptrs, mark, daddr, saddr, proto, family); rcu_read_unlock(); return x; } EXPORT_SYMBOL(xfrm_state_lookup_byaddr); struct xfrm_state * xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid, u32 if_id, u32 pcpu_num, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create, unsigned short family) { struct xfrm_state *x; spin_lock_bh(&net->xfrm.xfrm_state_lock); x = __find_acq_core(net, mark, family, mode, reqid, if_id, pcpu_num, proto, daddr, saddr, create); spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; } EXPORT_SYMBOL(xfrm_find_acq); #ifdef CONFIG_XFRM_SUB_POLICY #if IS_ENABLED(CONFIG_IPV6) /* distribution counting sort function for xfrm_state and xfrm_tmpl */ static void 
__xfrm6_sort(void **dst, void **src, int n, int (*cmp)(const void *p), int maxclass) { int count[XFRM_MAX_DEPTH] = { }; int class[XFRM_MAX_DEPTH]; int i; for (i = 0; i < n; i++) { int c = cmp(src[i]); class[i] = c; count[c]++; } for (i = 2; i < maxclass; i++) count[i] += count[i - 1]; for (i = 0; i < n; i++) { dst[count[class[i] - 1]++] = src[i]; src[i] = NULL; } } /* Rule for xfrm_state: * * rule 1: select IPsec transport except AH * rule 2: select MIPv6 RO or inbound trigger * rule 3: select IPsec transport AH * rule 4: select IPsec tunnel * rule 5: others */ static int __xfrm6_state_sort_cmp(const void *p) { const struct xfrm_state *v = p; switch (v->props.mode) { case XFRM_MODE_TRANSPORT: if (v->id.proto != IPPROTO_AH) return 1; else return 3; #if IS_ENABLED(CONFIG_IPV6_MIP6) case XFRM_MODE_ROUTEOPTIMIZATION: case XFRM_MODE_IN_TRIGGER: return 2; #endif case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: case XFRM_MODE_IPTFS: return 4; } return 5; } /* Rule for xfrm_tmpl: * * rule 1: select IPsec transport * rule 2: select MIPv6 RO or inbound trigger * rule 3: select IPsec tunnel * rule 4: others */ static int __xfrm6_tmpl_sort_cmp(const void *p) { const struct xfrm_tmpl *v = p; switch (v->mode) { case XFRM_MODE_TRANSPORT: return 1; #if IS_ENABLED(CONFIG_IPV6_MIP6) case XFRM_MODE_ROUTEOPTIMIZATION: case XFRM_MODE_IN_TRIGGER: return 2; #endif case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: case XFRM_MODE_IPTFS: return 3; } return 4; } #else static inline int __xfrm6_state_sort_cmp(const void *p) { return 5; } static inline int __xfrm6_tmpl_sort_cmp(const void *p) { return 4; } static inline void __xfrm6_sort(void **dst, void **src, int n, int (*cmp)(const void *p), int maxclass) { int i; for (i = 0; i < n; i++) dst[i] = src[i]; } #endif /* CONFIG_IPV6 */ void xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n, unsigned short family) { int i; if (family == AF_INET6) __xfrm6_sort((void **)dst, (void **)src, n, __xfrm6_tmpl_sort_cmp, 5); else for (i = 0; i < n; i++) dst[i] = src[i]; } void xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n, unsigned short family) { int i; if (family == AF_INET6) __xfrm6_sort((void **)dst, (void **)src, n, __xfrm6_state_sort_cmp, 6); else for (i = 0; i < n; i++) dst[i] = src[i]; } #endif /* Silly enough, but I'm lazy to build resolution list */ static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num) { unsigned int h = xfrm_seq_hash(net, seq); struct xfrm_state *x; hlist_for_each_entry(x, xfrm_state_deref_prot(net->xfrm.state_byseq, net) + h, byseq) { if (x->km.seq == seq && (mark & x->mark.m) == x->mark.v && x->pcpu_num == pcpu_num && x->km.state == XFRM_STATE_ACQ) { xfrm_state_hold(x); return x; } } return NULL; } struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num) { struct xfrm_state *x; spin_lock_bh(&net->xfrm.xfrm_state_lock); x = __xfrm_find_acq_byseq(net, mark, seq, pcpu_num); spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; } EXPORT_SYMBOL(xfrm_find_acq_byseq); u32 xfrm_get_acqseq(void) { u32 res; static atomic_t acqseq; do { res = atomic_inc_return(&acqseq); } while (!res); return res; } EXPORT_SYMBOL(xfrm_get_acqseq); int verify_spi_info(u8 proto, u32 min, u32 max, struct netlink_ext_ack *extack) { switch (proto) { case IPPROTO_AH: case IPPROTO_ESP: break; case IPPROTO_COMP: /* IPCOMP spi is 16-bits. 
*/ if (max >= 0x10000) { NL_SET_ERR_MSG(extack, "IPCOMP SPI must be <= 65535"); return -EINVAL; } break; default: NL_SET_ERR_MSG(extack, "Invalid protocol, must be one of AH, ESP, IPCOMP"); return -EINVAL; } if (min > max) { NL_SET_ERR_MSG(extack, "Invalid SPI range: min > max"); return -EINVAL; } return 0; } EXPORT_SYMBOL(verify_spi_info); int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high, struct netlink_ext_ack *extack) { struct net *net = xs_net(x); unsigned int h; struct xfrm_state *x0; int err = -ENOENT; u32 range = high - low + 1; __be32 newspi = 0; spin_lock_bh(&x->lock); if (x->km.state == XFRM_STATE_DEAD) { NL_SET_ERR_MSG(extack, "Target ACQUIRE is in DEAD state"); goto unlock; } err = 0; if (x->id.spi) goto unlock; err = -ENOENT; for (h = 0; h < range; h++) { u32 spi = (low == high) ? low : get_random_u32_inclusive(low, high); if (spi == 0) goto next; newspi = htonl(spi); spin_lock_bh(&net->xfrm.xfrm_state_lock); x0 = xfrm_state_lookup_spi_proto(net, newspi, x->id.proto); if (!x0) { x->id.spi = newspi; h = xfrm_spi_hash(net, &x->id.daddr, newspi, x->id.proto, x->props.family); XFRM_STATE_INSERT(byspi, &x->byspi, xfrm_state_deref_prot(net->xfrm.state_byspi, net) + h, x->xso.type); spin_unlock_bh(&net->xfrm.xfrm_state_lock); err = 0; goto unlock; } spin_unlock_bh(&net->xfrm.xfrm_state_lock); next: if (signal_pending(current)) { err = -ERESTARTSYS; goto unlock; } if (low == high) break; } if (err) NL_SET_ERR_MSG(extack, "No SPI available in the requested range"); unlock: spin_unlock_bh(&x->lock); return err; } EXPORT_SYMBOL(xfrm_alloc_spi); static bool __xfrm_state_filter_match(struct xfrm_state *x, struct xfrm_address_filter *filter) { if (filter) { if ((filter->family == AF_INET || filter->family == AF_INET6) && x->props.family != filter->family) return false; return addr_match(&x->props.saddr, &filter->saddr, filter->splen) && addr_match(&x->id.daddr, &filter->daddr, filter->dplen); } return true; } int xfrm_state_walk(struct net *net, struct xfrm_state_walk *walk, int (*func)(struct xfrm_state *, int, void*), void *data) { struct xfrm_state *state; struct xfrm_state_walk *x; int err = 0; if (walk->seq != 0 && list_empty(&walk->all)) return 0; spin_lock_bh(&net->xfrm.xfrm_state_lock); if (list_empty(&walk->all)) x = list_first_entry(&net->xfrm.state_all, struct xfrm_state_walk, all); else x = list_first_entry(&walk->all, struct xfrm_state_walk, all); list_for_each_entry_from(x, &net->xfrm.state_all, all) { if (x->state == XFRM_STATE_DEAD) continue; state = container_of(x, struct xfrm_state, km); if (!xfrm_id_proto_match(state->id.proto, walk->proto)) continue; if (!__xfrm_state_filter_match(state, walk->filter)) continue; err = func(state, walk->seq, data); if (err) { list_move_tail(&walk->all, &x->all); goto out; } walk->seq++; } if (walk->seq == 0) { err = -ENOENT; goto out; } list_del_init(&walk->all); out: spin_unlock_bh(&net->xfrm.xfrm_state_lock); return err; } EXPORT_SYMBOL(xfrm_state_walk); void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto, struct xfrm_address_filter *filter) { INIT_LIST_HEAD(&walk->all); walk->proto = proto; walk->state = XFRM_STATE_DEAD; walk->seq = 0; walk->filter = filter; } EXPORT_SYMBOL(xfrm_state_walk_init); void xfrm_state_walk_done(struct xfrm_state_walk *walk, struct net *net) { kfree(walk->filter); if (list_empty(&walk->all)) return; spin_lock_bh(&net->xfrm.xfrm_state_lock); list_del(&walk->all); spin_unlock_bh(&net->xfrm.xfrm_state_lock); } EXPORT_SYMBOL(xfrm_state_walk_done); static void 
xfrm_replay_timer_handler(struct timer_list *t) { struct xfrm_state *x = timer_container_of(x, t, rtimer); spin_lock(&x->lock); if (x->km.state == XFRM_STATE_VALID) { if (xfrm_aevent_is_on(xs_net(x))) xfrm_replay_notify(x, XFRM_REPLAY_TIMEOUT); else x->xflags |= XFRM_TIME_DEFER; } spin_unlock(&x->lock); } static LIST_HEAD(xfrm_km_list); void km_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c) { struct xfrm_mgr *km; rcu_read_lock(); list_for_each_entry_rcu(km, &xfrm_km_list, list) if (km->notify_policy) km->notify_policy(xp, dir, c); rcu_read_unlock(); } void km_state_notify(struct xfrm_state *x, const struct km_event *c) { struct xfrm_mgr *km; rcu_read_lock(); list_for_each_entry_rcu(km, &xfrm_km_list, list) if (km->notify) km->notify(x, c); rcu_read_unlock(); } EXPORT_SYMBOL(km_policy_notify); EXPORT_SYMBOL(km_state_notify); void km_state_expired(struct xfrm_state *x, int hard, u32 portid) { struct km_event c; c.data.hard = hard; c.portid = portid; c.event = XFRM_MSG_EXPIRE; km_state_notify(x, &c); } EXPORT_SYMBOL(km_state_expired); /* * We send to all registered managers regardless of failure * We are happy with one success */ int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol) { int err = -EINVAL, acqret; struct xfrm_mgr *km; rcu_read_lock(); list_for_each_entry_rcu(km, &xfrm_km_list, list) { acqret = km->acquire(x, t, pol); if (!acqret) err = acqret; } rcu_read_unlock(); return err; } EXPORT_SYMBOL(km_query); static int __km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport) { int err = -EINVAL; struct xfrm_mgr *km; rcu_read_lock(); list_for_each_entry_rcu(km, &xfrm_km_list, list) { if (km->new_mapping) err = km->new_mapping(x, ipaddr, sport); if (!err) break; } rcu_read_unlock(); return err; } int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport) { int ret = 0; if (x->mapping_maxage) { if ((jiffies / HZ - x->new_mapping) > x->mapping_maxage || x->new_mapping_sport != sport) { x->new_mapping_sport = sport; x->new_mapping = jiffies / HZ; ret = __km_new_mapping(x, ipaddr, sport); } } else { ret = __km_new_mapping(x, ipaddr, sport); } return ret; } EXPORT_SYMBOL(km_new_mapping); void km_policy_expired(struct xfrm_policy *pol, int dir, int hard, u32 portid) { struct km_event c; c.data.hard = hard; c.portid = portid; c.event = XFRM_MSG_POLEXPIRE; km_policy_notify(pol, dir, &c); } EXPORT_SYMBOL(km_policy_expired); #ifdef CONFIG_XFRM_MIGRATE int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_migrate, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap) { int err = -EINVAL; int ret; struct xfrm_mgr *km; rcu_read_lock(); list_for_each_entry_rcu(km, &xfrm_km_list, list) { if (km->migrate) { ret = km->migrate(sel, dir, type, m, num_migrate, k, encap); if (!ret) err = ret; } } rcu_read_unlock(); return err; } EXPORT_SYMBOL(km_migrate); #endif int km_report(struct net *net, u8 proto, struct xfrm_selector *sel, xfrm_address_t *addr) { int err = -EINVAL; int ret; struct xfrm_mgr *km; rcu_read_lock(); list_for_each_entry_rcu(km, &xfrm_km_list, list) { if (km->report) { ret = km->report(net, proto, sel, addr); if (!ret) err = ret; } } rcu_read_unlock(); return err; } EXPORT_SYMBOL(km_report); static bool km_is_alive(const struct km_event *c) { struct xfrm_mgr *km; bool is_alive = false; rcu_read_lock(); list_for_each_entry_rcu(km, &xfrm_km_list, list) { if (km->is_alive && km->is_alive(c)) { is_alive = true; break; } } rcu_read_unlock(); 
	return is_alive;
}

#if IS_ENABLED(CONFIG_XFRM_USER_COMPAT)
static DEFINE_SPINLOCK(xfrm_translator_lock);
static struct xfrm_translator __rcu *xfrm_translator;

struct xfrm_translator *xfrm_get_translator(void)
{
	struct xfrm_translator *xtr;

	rcu_read_lock();
	xtr = rcu_dereference(xfrm_translator);
	if (unlikely(!xtr))
		goto out;

	if (!try_module_get(xtr->owner))
		xtr = NULL;
out:
	rcu_read_unlock();
	return xtr;
}
EXPORT_SYMBOL_GPL(xfrm_get_translator);

void xfrm_put_translator(struct xfrm_translator *xtr)
{
	module_put(xtr->owner);
}
EXPORT_SYMBOL_GPL(xfrm_put_translator);

int xfrm_register_translator(struct xfrm_translator *xtr)
{
	int err = 0;

	spin_lock_bh(&xfrm_translator_lock);
	if (unlikely(xfrm_translator != NULL))
		err = -EEXIST;
	else
		rcu_assign_pointer(xfrm_translator, xtr);
	spin_unlock_bh(&xfrm_translator_lock);

	return err;
}
EXPORT_SYMBOL_GPL(xfrm_register_translator);

int xfrm_unregister_translator(struct xfrm_translator *xtr)
{
	int err = 0;

	spin_lock_bh(&xfrm_translator_lock);
	if (likely(xfrm_translator != NULL)) {
		if (rcu_access_pointer(xfrm_translator) != xtr)
			err = -EINVAL;
		else
			RCU_INIT_POINTER(xfrm_translator, NULL);
	}
	spin_unlock_bh(&xfrm_translator_lock);
	synchronize_rcu();

	return err;
}
EXPORT_SYMBOL_GPL(xfrm_unregister_translator);
#endif

int xfrm_user_policy(struct sock *sk, int optname, sockptr_t optval, int optlen)
{
	int err;
	u8 *data;
	struct xfrm_mgr *km;
	struct xfrm_policy *pol = NULL;

	if (sockptr_is_null(optval) && !optlen) {
		xfrm_sk_policy_insert(sk, XFRM_POLICY_IN, NULL);
		xfrm_sk_policy_insert(sk, XFRM_POLICY_OUT, NULL);
		__sk_dst_reset(sk);
		return 0;
	}

	if (optlen <= 0 || optlen > PAGE_SIZE)
		return -EMSGSIZE;

	data = memdup_sockptr(optval, optlen);
	if (IS_ERR(data))
		return PTR_ERR(data);

	if (in_compat_syscall()) {
		struct xfrm_translator *xtr = xfrm_get_translator();

		if (!xtr) {
			kfree(data);
			return -EOPNOTSUPP;
		}

		err = xtr->xlate_user_policy_sockptr(&data, optlen);
		xfrm_put_translator(xtr);
		if (err) {
			kfree(data);
			return err;
		}
	}

	err = -EINVAL;
	rcu_read_lock();
	list_for_each_entry_rcu(km, &xfrm_km_list, list) {
		pol = km->compile_policy(sk, optname, data, optlen, &err);
		if (err >= 0)
			break;
	}
	rcu_read_unlock();

	if (err >= 0) {
		xfrm_sk_policy_insert(sk, err, pol);
		xfrm_pol_put(pol);
		__sk_dst_reset(sk);
		err = 0;
	}

	kfree(data);
	return err;
}
EXPORT_SYMBOL(xfrm_user_policy);

static DEFINE_SPINLOCK(xfrm_km_lock);

void xfrm_register_km(struct xfrm_mgr *km)
{
	spin_lock_bh(&xfrm_km_lock);
	list_add_tail_rcu(&km->list, &xfrm_km_list);
	spin_unlock_bh(&xfrm_km_lock);
}
EXPORT_SYMBOL(xfrm_register_km);

void xfrm_unregister_km(struct xfrm_mgr *km)
{
	spin_lock_bh(&xfrm_km_lock);
	list_del_rcu(&km->list);
	spin_unlock_bh(&xfrm_km_lock);
	synchronize_rcu();
}
EXPORT_SYMBOL(xfrm_unregister_km);

int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo)
{
	int err = 0;

	if (WARN_ON(afinfo->family >= NPROTO))
		return -EAFNOSUPPORT;

	spin_lock_bh(&xfrm_state_afinfo_lock);
	if (unlikely(xfrm_state_afinfo[afinfo->family] != NULL))
		err = -EEXIST;
	else
		rcu_assign_pointer(xfrm_state_afinfo[afinfo->family], afinfo);
	spin_unlock_bh(&xfrm_state_afinfo_lock);

	return err;
}
EXPORT_SYMBOL(xfrm_state_register_afinfo);

int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo)
{
	int err = 0, family = afinfo->family;

	if (WARN_ON(family >= NPROTO))
		return -EAFNOSUPPORT;

	spin_lock_bh(&xfrm_state_afinfo_lock);
	if (likely(xfrm_state_afinfo[afinfo->family] != NULL)) {
		if (rcu_access_pointer(xfrm_state_afinfo[family]) != afinfo)
			err = -EINVAL;
		else
			RCU_INIT_POINTER(xfrm_state_afinfo[afinfo->family], NULL);
	}
spin_unlock_bh(&xfrm_state_afinfo_lock); synchronize_rcu(); return err; } EXPORT_SYMBOL(xfrm_state_unregister_afinfo); struct xfrm_state_afinfo *xfrm_state_afinfo_get_rcu(unsigned int family) { if (unlikely(family >= NPROTO)) return NULL; return rcu_dereference(xfrm_state_afinfo[family]); } EXPORT_SYMBOL_GPL(xfrm_state_afinfo_get_rcu); struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family) { struct xfrm_state_afinfo *afinfo; if (unlikely(family >= NPROTO)) return NULL; rcu_read_lock(); afinfo = rcu_dereference(xfrm_state_afinfo[family]); if (unlikely(!afinfo)) rcu_read_unlock(); return afinfo; } void xfrm_flush_gc(void) { flush_work(&xfrm_state_gc_work); } EXPORT_SYMBOL(xfrm_flush_gc); static void xfrm_state_delete_tunnel(struct xfrm_state *x) { if (x->tunnel) { struct xfrm_state *t = x->tunnel; if (atomic_dec_return(&t->tunnel_users) == 1) xfrm_state_delete(t); xfrm_state_put(t); x->tunnel = NULL; } } u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) { const struct xfrm_type *type = READ_ONCE(x->type); struct crypto_aead *aead; u32 blksize, net_adj = 0; if (x->km.state != XFRM_STATE_VALID || !type || type->proto != IPPROTO_ESP) return mtu - x->props.header_len; aead = x->data; blksize = ALIGN(crypto_aead_blocksize(aead), 4); switch (x->props.mode) { case XFRM_MODE_TRANSPORT: case XFRM_MODE_BEET: if (x->props.family == AF_INET) net_adj = sizeof(struct iphdr); else if (x->props.family == AF_INET6) net_adj = sizeof(struct ipv6hdr); break; case XFRM_MODE_TUNNEL: break; default: if (x->mode_cbs && x->mode_cbs->get_inner_mtu) return x->mode_cbs->get_inner_mtu(x, mtu); WARN_ON_ONCE(1); break; } return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - net_adj) & ~(blksize - 1)) + net_adj - 2; } EXPORT_SYMBOL_GPL(xfrm_state_mtu); int __xfrm_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { const struct xfrm_mode *inner_mode; const struct xfrm_mode *outer_mode; int family = x->props.family; int err; if (family == AF_INET && (!x->dir || x->dir == XFRM_SA_DIR_OUT) && READ_ONCE(xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc)) x->props.flags |= XFRM_STATE_NOPMTUDISC; err = -EPROTONOSUPPORT; if (x->sel.family != AF_UNSPEC) { inner_mode = xfrm_get_mode(x->props.mode, x->sel.family); if (inner_mode == NULL) { NL_SET_ERR_MSG(extack, "Requested mode not found"); goto error; } if (!(inner_mode->flags & XFRM_MODE_FLAG_TUNNEL) && family != x->sel.family) { NL_SET_ERR_MSG(extack, "Only tunnel modes can accommodate a change of family"); goto error; } x->inner_mode = *inner_mode; } else { const struct xfrm_mode *inner_mode_iaf; int iafamily = AF_INET; inner_mode = xfrm_get_mode(x->props.mode, x->props.family); if (inner_mode == NULL) { NL_SET_ERR_MSG(extack, "Requested mode not found"); goto error; } x->inner_mode = *inner_mode; if (x->props.family == AF_INET) iafamily = AF_INET6; inner_mode_iaf = xfrm_get_mode(x->props.mode, iafamily); if (inner_mode_iaf) { if (inner_mode_iaf->flags & XFRM_MODE_FLAG_TUNNEL) x->inner_mode_iaf = *inner_mode_iaf; } } x->type = xfrm_get_type(x->id.proto, family); if (x->type == NULL) { NL_SET_ERR_MSG(extack, "Requested type not found"); goto error; } err = x->type->init_state(x, extack); if (err) goto error; outer_mode = xfrm_get_mode(x->props.mode, family); if (!outer_mode) { NL_SET_ERR_MSG(extack, "Requested mode not found"); err = -EPROTONOSUPPORT; goto error; } x->outer_mode = *outer_mode; if (x->nat_keepalive_interval) { if (x->dir != XFRM_SA_DIR_OUT) { NL_SET_ERR_MSG(extack, "NAT keepalive is only supported for outbound SAs"); err = -EINVAL; 
goto error; } if (!x->encap || x->encap->encap_type != UDP_ENCAP_ESPINUDP) { NL_SET_ERR_MSG(extack, "NAT keepalive is only supported for UDP encapsulation"); err = -EINVAL; goto error; } } x->mode_cbs = xfrm_get_mode_cbs(x->props.mode); if (x->mode_cbs) { if (x->mode_cbs->init_state) err = x->mode_cbs->init_state(x); module_put(x->mode_cbs->owner); } error: return err; } EXPORT_SYMBOL(__xfrm_init_state); int xfrm_init_state(struct xfrm_state *x) { int err; err = __xfrm_init_state(x, NULL); if (err) return err; err = xfrm_init_replay(x, NULL); if (err) return err; x->km.state = XFRM_STATE_VALID; return 0; } EXPORT_SYMBOL(xfrm_init_state); int __net_init xfrm_state_init(struct net *net) { struct hlist_head *ndst, *nsrc, *nspi, *nseq; unsigned int sz; if (net_eq(net, &init_net)) xfrm_state_cache = KMEM_CACHE(xfrm_state, SLAB_HWCACHE_ALIGN | SLAB_PANIC); INIT_LIST_HEAD(&net->xfrm.state_all); sz = sizeof(struct hlist_head) * 8; ndst = xfrm_hash_alloc(sz); if (!ndst) goto out_bydst; rcu_assign_pointer(net->xfrm.state_bydst, ndst); nsrc = xfrm_hash_alloc(sz); if (!nsrc) goto out_bysrc; rcu_assign_pointer(net->xfrm.state_bysrc, nsrc); nspi = xfrm_hash_alloc(sz); if (!nspi) goto out_byspi; rcu_assign_pointer(net->xfrm.state_byspi, nspi); nseq = xfrm_hash_alloc(sz); if (!nseq) goto out_byseq; rcu_assign_pointer(net->xfrm.state_byseq, nseq); net->xfrm.state_cache_input = alloc_percpu(struct hlist_head); if (!net->xfrm.state_cache_input) goto out_state_cache_input; net->xfrm.state_hmask = ((sz / sizeof(struct hlist_head)) - 1); net->xfrm.state_num = 0; INIT_WORK(&net->xfrm.state_hash_work, xfrm_hash_resize); spin_lock_init(&net->xfrm.xfrm_state_lock); seqcount_spinlock_init(&net->xfrm.xfrm_state_hash_generation, &net->xfrm.xfrm_state_lock); return 0; out_state_cache_input: xfrm_hash_free(nseq, sz); out_byseq: xfrm_hash_free(nspi, sz); out_byspi: xfrm_hash_free(nsrc, sz); out_bysrc: xfrm_hash_free(ndst, sz); out_bydst: return -ENOMEM; } #define xfrm_state_deref_netexit(table) \ rcu_dereference_protected((table), true /* netns is going away */) void xfrm_state_fini(struct net *net) { unsigned int sz; int i; flush_work(&net->xfrm.state_hash_work); xfrm_state_flush(net, 0, false); flush_work(&xfrm_state_gc_work); WARN_ON(!list_empty(&net->xfrm.state_all)); for (i = 0; i <= net->xfrm.state_hmask; i++) { WARN_ON(!hlist_empty(xfrm_state_deref_netexit(net->xfrm.state_byseq) + i)); WARN_ON(!hlist_empty(xfrm_state_deref_netexit(net->xfrm.state_byspi) + i)); WARN_ON(!hlist_empty(xfrm_state_deref_netexit(net->xfrm.state_bysrc) + i)); WARN_ON(!hlist_empty(xfrm_state_deref_netexit(net->xfrm.state_bydst) + i)); } sz = (net->xfrm.state_hmask + 1) * sizeof(struct hlist_head); xfrm_hash_free(xfrm_state_deref_netexit(net->xfrm.state_byseq), sz); xfrm_hash_free(xfrm_state_deref_netexit(net->xfrm.state_byspi), sz); xfrm_hash_free(xfrm_state_deref_netexit(net->xfrm.state_bysrc), sz); xfrm_hash_free(xfrm_state_deref_netexit(net->xfrm.state_bydst), sz); free_percpu(net->xfrm.state_cache_input); } #ifdef CONFIG_AUDITSYSCALL static void xfrm_audit_helper_sainfo(struct xfrm_state *x, struct audit_buffer *audit_buf) { struct xfrm_sec_ctx *ctx = x->security; u32 spi = ntohl(x->id.spi); if (ctx) audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s", ctx->ctx_alg, ctx->ctx_doi, ctx->ctx_str); switch (x->props.family) { case AF_INET: audit_log_format(audit_buf, " src=%pI4 dst=%pI4", &x->props.saddr.a4, &x->id.daddr.a4); break; case AF_INET6: audit_log_format(audit_buf, " src=%pI6 dst=%pI6", x->props.saddr.a6, 
x->id.daddr.a6); break; } audit_log_format(audit_buf, " spi=%u(0x%x)", spi, spi); } static void xfrm_audit_helper_pktinfo(struct sk_buff *skb, u16 family, struct audit_buffer *audit_buf) { const struct iphdr *iph4; const struct ipv6hdr *iph6; switch (family) { case AF_INET: iph4 = ip_hdr(skb); audit_log_format(audit_buf, " src=%pI4 dst=%pI4", &iph4->saddr, &iph4->daddr); break; case AF_INET6: iph6 = ipv6_hdr(skb); audit_log_format(audit_buf, " src=%pI6 dst=%pI6 flowlbl=0x%x%02x%02x", &iph6->saddr, &iph6->daddr, iph6->flow_lbl[0] & 0x0f, iph6->flow_lbl[1], iph6->flow_lbl[2]); break; } } void xfrm_audit_state_add(struct xfrm_state *x, int result, bool task_valid) { struct audit_buffer *audit_buf; audit_buf = xfrm_audit_start("SAD-add"); if (audit_buf == NULL) return; xfrm_audit_helper_usrinfo(task_valid, audit_buf); xfrm_audit_helper_sainfo(x, audit_buf); audit_log_format(audit_buf, " res=%u", result); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_state_add); void xfrm_audit_state_delete(struct xfrm_state *x, int result, bool task_valid) { struct audit_buffer *audit_buf; audit_buf = xfrm_audit_start("SAD-delete"); if (audit_buf == NULL) return; xfrm_audit_helper_usrinfo(task_valid, audit_buf); xfrm_audit_helper_sainfo(x, audit_buf); audit_log_format(audit_buf, " res=%u", result); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_state_delete); void xfrm_audit_state_replay_overflow(struct xfrm_state *x, struct sk_buff *skb) { struct audit_buffer *audit_buf; u32 spi; audit_buf = xfrm_audit_start("SA-replay-overflow"); if (audit_buf == NULL) return; xfrm_audit_helper_pktinfo(skb, x->props.family, audit_buf); /* don't record the sequence number because it's inherent in this kind * of audit message */ spi = ntohl(x->id.spi); audit_log_format(audit_buf, " spi=%u(0x%x)", spi, spi); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_state_replay_overflow); void xfrm_audit_state_replay(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq) { struct audit_buffer *audit_buf; u32 spi; audit_buf = xfrm_audit_start("SA-replayed-pkt"); if (audit_buf == NULL) return; xfrm_audit_helper_pktinfo(skb, x->props.family, audit_buf); spi = ntohl(x->id.spi); audit_log_format(audit_buf, " spi=%u(0x%x) seqno=%u", spi, spi, ntohl(net_seq)); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_state_replay); void xfrm_audit_state_notfound_simple(struct sk_buff *skb, u16 family) { struct audit_buffer *audit_buf; audit_buf = xfrm_audit_start("SA-notfound"); if (audit_buf == NULL) return; xfrm_audit_helper_pktinfo(skb, family, audit_buf); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_state_notfound_simple); void xfrm_audit_state_notfound(struct sk_buff *skb, u16 family, __be32 net_spi, __be32 net_seq) { struct audit_buffer *audit_buf; u32 spi; audit_buf = xfrm_audit_start("SA-notfound"); if (audit_buf == NULL) return; xfrm_audit_helper_pktinfo(skb, family, audit_buf); spi = ntohl(net_spi); audit_log_format(audit_buf, " spi=%u(0x%x) seqno=%u", spi, spi, ntohl(net_seq)); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_state_notfound); void xfrm_audit_state_icvfail(struct xfrm_state *x, struct sk_buff *skb, u8 proto) { struct audit_buffer *audit_buf; __be32 net_spi; __be32 net_seq; audit_buf = xfrm_audit_start("SA-icv-failure"); if (audit_buf == NULL) return; xfrm_audit_helper_pktinfo(skb, x->props.family, audit_buf); if (xfrm_parse_spi(skb, proto, &net_spi, &net_seq) == 0) { u32 spi = ntohl(net_spi); audit_log_format(audit_buf, " spi=%u(0x%x) seqno=%u", spi, spi, 
ntohl(net_seq)); } audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_state_icvfail); #endif /* CONFIG_AUDITSYSCALL */ |
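The expression at the end of xfrm_state_mtu() above does three things at once: it subtracts the ESP overhead (header length plus ICV) and the inner IP header adjustment, rounds the result down to the cipher block size, then adds the adjustment back and reserves the two fixed ESP trailer bytes (pad length and next header). The following standalone sketch replays that arithmetic with made-up numbers that are not taken from this file, only to show how the rounding behaves:

/*
 * Standalone replay of the xfrm_state_mtu() arithmetic with illustrative
 * values; none of these numbers come from the kernel sources above.
 */
#include <stdio.h>

int main(void)
{
	unsigned int mtu = 1500;	/* link MTU (example) */
	unsigned int header_len = 20;	/* x->props.header_len (example) */
	unsigned int icv_len = 16;	/* crypto_aead_authsize(aead) (example) */
	unsigned int blksize = 16;	/* ALIGN(crypto_aead_blocksize(aead), 4) (example) */
	unsigned int net_adj = 20;	/* sizeof(struct iphdr): IPv4 transport mode */
	unsigned int inner_mtu;

	/* Same expression as the return statement in xfrm_state_mtu(). */
	inner_mtu = ((mtu - header_len - icv_len - net_adj) & ~(blksize - 1)) +
		    net_adj - 2;

	/* 1500 - 20 - 16 - 20 = 1444, rounded down to 1440, + 20 - 2 = 1458 */
	printf("inner MTU: %u\n", inner_mtu);
	return 0;
}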
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM capability

#if !defined(_TRACE_CAPABILITY_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_CAPABILITY_H

#include <linux/cred.h>
#include <linux/tracepoint.h>
#include <linux/user_namespace.h>

/**
 * cap_capable - called after it's determined if a task has a particular
 * effective capability
 *
 * @cred: The credentials used
 * @target_ns: The user namespace of the resource being accessed
 * @capable_ns: The user namespace in which the credential provides the
 * capability to access the targeted resource.
 * This will be NULL if ret is not 0.
 * @cap: The capability to check for
 * @ret: The return value of the check: 0 if it does, -ve if it does not
 *
 * Allows to trace calls to cap_capable in commoncap.c
 */
TRACE_EVENT(cap_capable,

	TP_PROTO(const struct cred *cred, struct user_namespace *target_ns,
		 const struct user_namespace *capable_ns, int cap, int ret),

	TP_ARGS(cred, target_ns, capable_ns, cap, ret),

	TP_STRUCT__entry(
		__field(const struct cred *, cred)
		__field(struct user_namespace *, target_ns)
		__field(const struct user_namespace *, capable_ns)
		__field(int, cap)
		__field(int, ret)
	),

	TP_fast_assign(
		__entry->cred = cred;
		__entry->target_ns = target_ns;
		__entry->capable_ns = ret == 0 ? capable_ns : NULL;
		__entry->cap = cap;
		__entry->ret = ret;
	),

	TP_printk("cred %p, target_ns %p, capable_ns %p, cap %d, ret %d",
		  __entry->cred, __entry->target_ns, __entry->capable_ns,
		  __entry->cap, __entry->ret)
);

#endif /* _TRACE_CAPABILITY_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
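For context, TRACE_EVENT(cap_capable, ...) above expands into a trace_cap_capable() helper that a caller invokes at the point being traced; the call compiles down to a static-branch no-op while the tracepoint is disabled. The sketch below is purely illustrative: example_capable_check() is a made-up function, the -EPERM result is a placeholder, and the header path assumes the usual include/trace/events/<system>.h layout; none of it is code from this document.

/*
 * Hypothetical caller, for illustration only. Exactly one compilation unit
 * defines CREATE_TRACE_POINTS before including the tracepoint header so
 * that the tracepoint itself gets instantiated; other users include the
 * header without it.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/capability.h>

static int example_capable_check(const struct cred *cred,
				 struct user_namespace *target_ns, int cap)
{
	/* Result of the real capability check (placeholder value). */
	int ret = -EPERM;

	/*
	 * Mirrors TP_fast_assign() above: capable_ns is only meaningful
	 * when the check succeeded.
	 */
	trace_cap_capable(cred, target_ns, ret == 0 ? target_ns : NULL,
			  cap, ret);
	return ret;
}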
// SPDX-License-Identifier: GPL-2.0-only /* * Landlock - Filesystem management and hooks * * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2018-2020 ANSSI * Copyright © 2021-2025 Microsoft Corporation * Copyright © 2022 Günther Noack <gnoack3000@gmail.com> * Copyright © 2023-2024 Google LLC */ #include <asm/ioctls.h> #include <kunit/test.h> #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/bits.h> #include <linux/compiler_types.h> #include <linux/dcache.h> #include <linux/err.h> #include <linux/falloc.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/limits.h> #include <linux/list.h> #include <linux/lsm_audit.h> #include <linux/lsm_hooks.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/net.h> #include <linux/path.h> #include <linux/pid.h> #include <linux/rcupdate.h> #include <linux/sched/signal.h> #include <linux/spinlock.h> #include <linux/stat.h> #include <linux/types.h> #include <linux/wait_bit.h> #include <linux/workqueue.h> #include <net/af_unix.h> #include <uapi/linux/fiemap.h> #include <uapi/linux/landlock.h> #include "access.h" #include "audit.h" #include "common.h" #include "cred.h" #include "domain.h" #include "fs.h" #include "limits.h" #include "object.h" #include "ruleset.h" #include "setup.h" /* Underlying object management */ static void release_inode(struct landlock_object *const object) __releases(object->lock) { struct inode *const inode = object->underobj; struct super_block *sb; if (!inode) { spin_unlock(&object->lock); return; } /* * Protects against concurrent use by hook_sb_delete() of the reference * to the underlying inode.
*/ object->underobj = NULL; /* * Makes sure that if the filesystem is concurrently unmounted, * hook_sb_delete() will wait for us to finish iput(). */ sb = inode->i_sb; atomic_long_inc(&landlock_superblock(sb)->inode_refs); spin_unlock(&object->lock); /* * Because object->underobj was not NULL, hook_sb_delete() and * get_inode_object() guarantee that it is safe to reset * landlock_inode(inode)->object while it is not NULL. It is therefore * not necessary to lock inode->i_lock. */ rcu_assign_pointer(landlock_inode(inode)->object, NULL); /* * Now, new rules can safely be tied to @inode with get_inode_object(). */ iput(inode); if (atomic_long_dec_and_test(&landlock_superblock(sb)->inode_refs)) wake_up_var(&landlock_superblock(sb)->inode_refs); } static const struct landlock_object_underops landlock_fs_underops = { .release = release_inode }; /* IOCTL helpers */ /** * is_masked_device_ioctl - Determine whether an IOCTL command is always * permitted with Landlock for device files. These commands can not be * restricted on device files by enforcing a Landlock policy. * * @cmd: The IOCTL command that is supposed to be run. * * By default, any IOCTL on a device file requires the * LANDLOCK_ACCESS_FS_IOCTL_DEV right. However, we blanket-permit some * commands, if: * * 1. The command is implemented in fs/ioctl.c's do_vfs_ioctl(), * not in f_ops->unlocked_ioctl() or f_ops->compat_ioctl(). * * 2. The command is harmless when invoked on devices. * * We also permit commands that do not make sense for devices, but where the * do_vfs_ioctl() implementation returns a more conventional error code. * * Any new IOCTL commands that are implemented in fs/ioctl.c's do_vfs_ioctl() * should be considered for inclusion here. * * Return: True if the IOCTL @cmd can not be restricted with Landlock for * device files, false otherwise. */ static __attribute_const__ bool is_masked_device_ioctl(const unsigned int cmd) { switch (cmd) { /* * FIOCLEX, FIONCLEX, FIONBIO and FIOASYNC manipulate the FD's * close-on-exec and the file's buffered-IO and async flags. These * operations are also available through fcntl(2), and are * unconditionally permitted in Landlock. */ case FIOCLEX: case FIONCLEX: case FIONBIO: case FIOASYNC: /* * FIOQSIZE queries the size of a regular file, directory, or link. * * We still permit it, because it always returns -ENOTTY for * other file types. */ case FIOQSIZE: /* * FIFREEZE and FITHAW freeze and thaw the file system which the * given file belongs to. Requires CAP_SYS_ADMIN. * * These commands operate on the file system's superblock rather * than on the file itself. The same operations can also be * done through any other file or directory on the same file * system, so it is safe to permit these. */ case FIFREEZE: case FITHAW: /* * FS_IOC_FIEMAP queries information about the allocation of * blocks within a file. * * This IOCTL command only makes sense for regular files and is * not implemented by devices. It is harmless to permit. */ case FS_IOC_FIEMAP: /* * FIGETBSZ queries the file system's block size for a file or * directory. * * This command operates on the file system's superblock rather * than on the file itself. The same operation can also be done * through any other file or directory on the same file system, * so it is safe to permit it. */ case FIGETBSZ: /* * FICLONE, FICLONERANGE and FIDEDUPERANGE make files share * their underlying storage ("reflink") between source and * destination FDs, on file systems which support that. 
* * These IOCTL commands only apply to regular files * and are harmless to permit for device files. */ case FICLONE: case FICLONERANGE: case FIDEDUPERANGE: /* * FS_IOC_GETFSUUID and FS_IOC_GETFSSYSFSPATH both operate on * the file system superblock, not on the specific file, so * these operations are available through any other file on the * same file system as well. */ case FS_IOC_GETFSUUID: case FS_IOC_GETFSSYSFSPATH: return true; /* * FIONREAD, FS_IOC_GETFLAGS, FS_IOC_SETFLAGS, FS_IOC_FSGETXATTR and * FS_IOC_FSSETXATTR are forwarded to device implementations. */ /* * file_ioctl() commands (FIBMAP, FS_IOC_RESVSP, FS_IOC_RESVSP64, * FS_IOC_UNRESVSP, FS_IOC_UNRESVSP64 and FS_IOC_ZERO_RANGE) are * forwarded to device implementations, so not permitted. */ /* Other commands are guarded by the access right. */ default: return false; } } /* * is_masked_device_ioctl_compat - same as the helper above, but checking the * "compat" IOCTL commands. * * The IOCTL commands with special handling in compat-mode should behave the * same as their non-compat counterparts. */ static __attribute_const__ bool is_masked_device_ioctl_compat(const unsigned int cmd) { switch (cmd) { /* FICLONE is permitted, same as in the non-compat variant. */ case FICLONE: return true; #if defined(CONFIG_X86_64) /* * FS_IOC_RESVSP_32, FS_IOC_RESVSP64_32, FS_IOC_UNRESVSP_32, * FS_IOC_UNRESVSP64_32, FS_IOC_ZERO_RANGE_32: not blanket-permitted, * for consistency with their non-compat variants. */ case FS_IOC_RESVSP_32: case FS_IOC_RESVSP64_32: case FS_IOC_UNRESVSP_32: case FS_IOC_UNRESVSP64_32: case FS_IOC_ZERO_RANGE_32: #endif /* * FS_IOC32_GETFLAGS, FS_IOC32_SETFLAGS are forwarded to their device * implementations. */ case FS_IOC32_GETFLAGS: case FS_IOC32_SETFLAGS: return false; default: return is_masked_device_ioctl(cmd); } } /* Ruleset management */ static struct landlock_object *get_inode_object(struct inode *const inode) { struct landlock_object *object, *new_object; struct landlock_inode_security *inode_sec = landlock_inode(inode); rcu_read_lock(); retry: object = rcu_dereference(inode_sec->object); if (object) { if (likely(refcount_inc_not_zero(&object->usage))) { rcu_read_unlock(); return object; } /* * We are racing with release_inode(), the object is going * away. Wait for release_inode(), then retry. */ spin_lock(&object->lock); spin_unlock(&object->lock); goto retry; } rcu_read_unlock(); /* * If there is no object tied to @inode, then create a new one (without * holding any locks). */ new_object = landlock_create_object(&landlock_fs_underops, inode); if (IS_ERR(new_object)) return new_object; /* * Protects against concurrent calls to get_inode_object() or * hook_sb_delete(). */ spin_lock(&inode->i_lock); if (unlikely(rcu_access_pointer(inode_sec->object))) { /* Someone else just created the object, bail out and retry. */ spin_unlock(&inode->i_lock); kfree(new_object); rcu_read_lock(); goto retry; } /* * @inode will be released by hook_sb_delete() on its superblock * shutdown, or by release_inode() when no more ruleset references the * related object. */ ihold(inode); rcu_assign_pointer(inode_sec->object, new_object); spin_unlock(&inode->i_lock); return new_object; } /* All access rights that can be tied to files. 
*/ /* clang-format off */ #define ACCESS_FILE ( \ LANDLOCK_ACCESS_FS_EXECUTE | \ LANDLOCK_ACCESS_FS_WRITE_FILE | \ LANDLOCK_ACCESS_FS_READ_FILE | \ LANDLOCK_ACCESS_FS_TRUNCATE | \ LANDLOCK_ACCESS_FS_IOCTL_DEV | \ LANDLOCK_ACCESS_FS_RESOLVE_UNIX) /* clang-format on */ /* * @path: Should have been checked by get_path_from_fd(). */ int landlock_append_fs_rule(struct landlock_ruleset *const ruleset, const struct path *const path, access_mask_t access_rights) { int err; struct landlock_id id = { .type = LANDLOCK_KEY_INODE, }; /* Files only get access rights that make sense. */ if (!d_is_dir(path->dentry) && !access_mask_subset(access_rights, ACCESS_FILE)) return -EINVAL; if (WARN_ON_ONCE(ruleset->num_layers != 1)) return -EINVAL; /* Transforms relative access rights to absolute ones. */ access_rights |= LANDLOCK_MASK_ACCESS_FS & ~landlock_get_fs_access_mask(ruleset, 0); id.key.object = get_inode_object(d_backing_inode(path->dentry)); if (IS_ERR(id.key.object)) return PTR_ERR(id.key.object); mutex_lock(&ruleset->lock); err = landlock_insert_rule(ruleset, id, access_rights); mutex_unlock(&ruleset->lock); /* * No need to check for an error because landlock_insert_rule() * increments the refcount for the new object if needed. */ landlock_put_object(id.key.object); return err; } /* Access-control management */ /* * The lifetime of the returned rule is tied to @domain. * * Returns NULL if no rule is found or if @dentry is negative. */ static const struct landlock_rule * find_rule(const struct landlock_ruleset *const domain, const struct dentry *const dentry) { const struct landlock_rule *rule; const struct inode *inode; struct landlock_id id = { .type = LANDLOCK_KEY_INODE, }; /* Ignores nonexistent leafs. */ if (d_is_negative(dentry)) return NULL; inode = d_backing_inode(dentry); rcu_read_lock(); id.key.object = rcu_dereference(landlock_inode(inode)->object); rule = landlock_find_rule(domain, id); rcu_read_unlock(); return rule; } /* * Allows access to pseudo filesystems that will never be mountable (e.g. * sockfs, pipefs), but can still be reachable through * /proc/<pid>/fd/<file-descriptor> */ static bool is_nouser_or_private(const struct dentry *dentry) { return (dentry->d_sb->s_flags & SB_NOUSER) || (d_is_positive(dentry) && unlikely(IS_PRIVATE(d_backing_inode(dentry)))); } static const struct access_masks any_fs = { .fs = ~0, }; /* * Returns true iff the child file with the given src_child access rights under * src_parent would result in having the same or fewer access rights if it were * moved under new_parent. */ static bool may_refer(const struct layer_access_masks *const src_parent, const struct layer_access_masks *const src_child, const struct layer_access_masks *const new_parent, const bool child_is_dir) { for (size_t i = 0; i < ARRAY_SIZE(new_parent->access); i++) { access_mask_t child_access = src_parent->access[i] & src_child->access[i]; access_mask_t parent_access = new_parent->access[i]; if (!child_is_dir) { child_access &= ACCESS_FILE; parent_access &= ACCESS_FILE; } if (!access_mask_subset(child_access, parent_access)) return false; } return true; } /* * Check that a destination file hierarchy has more restrictions than a source * file hierarchy. This is only used for link and rename actions. * * Return: True if child1 may be moved from parent1 to parent2 without * increasing its access rights (if child2 is set, an additional condition is * that child2 may be used from parent2 to parent1 without increasing its access * rights), false otherwise. 
*/ static bool no_more_access(const struct layer_access_masks *const parent1, const struct layer_access_masks *const child1, const bool child1_is_dir, const struct layer_access_masks *const parent2, const struct layer_access_masks *const child2, const bool child2_is_dir) { if (!may_refer(parent1, child1, parent2, child1_is_dir)) return false; if (!child2) return true; return may_refer(parent2, child2, parent1, child2_is_dir); } #define NMA_TRUE(...) KUNIT_EXPECT_TRUE(test, no_more_access(__VA_ARGS__)) #define NMA_FALSE(...) KUNIT_EXPECT_FALSE(test, no_more_access(__VA_ARGS__)) #ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST static void test_no_more_access(struct kunit *const test) { const struct layer_access_masks rx0 = { .access[0] = LANDLOCK_ACCESS_FS_EXECUTE | LANDLOCK_ACCESS_FS_READ_FILE, }; const struct layer_access_masks mx0 = { .access[0] = LANDLOCK_ACCESS_FS_EXECUTE | LANDLOCK_ACCESS_FS_MAKE_REG, }; const struct layer_access_masks x0 = { .access[0] = LANDLOCK_ACCESS_FS_EXECUTE, }; const struct layer_access_masks x1 = { .access[1] = LANDLOCK_ACCESS_FS_EXECUTE, }; const struct layer_access_masks x01 = { .access[0] = LANDLOCK_ACCESS_FS_EXECUTE, .access[1] = LANDLOCK_ACCESS_FS_EXECUTE, }; const struct layer_access_masks allows_all = {}; /* Checks without restriction. */ NMA_TRUE(&x0, &allows_all, false, &allows_all, NULL, false); NMA_TRUE(&allows_all, &x0, false, &allows_all, NULL, false); NMA_FALSE(&x0, &x0, false, &allows_all, NULL, false); /* * Checks that we can only refer a file if no more access could be * inherited. */ NMA_TRUE(&x0, &x0, false, &rx0, NULL, false); NMA_TRUE(&rx0, &rx0, false, &rx0, NULL, false); NMA_FALSE(&rx0, &rx0, false, &x0, NULL, false); NMA_FALSE(&rx0, &rx0, false, &x1, NULL, false); /* Checks allowed referring with different nested domains. */ NMA_TRUE(&x0, &x1, false, &x0, NULL, false); NMA_TRUE(&x1, &x0, false, &x0, NULL, false); NMA_TRUE(&x0, &x01, false, &x0, NULL, false); NMA_TRUE(&x0, &x01, false, &rx0, NULL, false); NMA_TRUE(&x01, &x0, false, &x0, NULL, false); NMA_TRUE(&x01, &x0, false, &rx0, NULL, false); NMA_FALSE(&x01, &x01, false, &x0, NULL, false); /* Checks that file access rights are also enforced for a directory. */ NMA_FALSE(&rx0, &rx0, true, &x0, NULL, false); /* Checks that directory access rights don't impact file referring... */ NMA_TRUE(&mx0, &mx0, false, &x0, NULL, false); /* ...but only directory referring. */ NMA_FALSE(&mx0, &mx0, true, &x0, NULL, false); /* Checks directory exchange. */ NMA_TRUE(&mx0, &mx0, true, &mx0, &mx0, true); NMA_TRUE(&mx0, &mx0, true, &mx0, &x0, true); NMA_FALSE(&mx0, &mx0, true, &x0, &mx0, true); NMA_FALSE(&mx0, &mx0, true, &x0, &x0, true); NMA_FALSE(&mx0, &mx0, true, &x1, &x1, true); /* Checks file exchange with directory access rights... */ NMA_TRUE(&mx0, &mx0, false, &mx0, &mx0, false); NMA_TRUE(&mx0, &mx0, false, &mx0, &x0, false); NMA_TRUE(&mx0, &mx0, false, &x0, &mx0, false); NMA_TRUE(&mx0, &mx0, false, &x0, &x0, false); /* ...and with file access rights. */ NMA_TRUE(&rx0, &rx0, false, &rx0, &rx0, false); NMA_TRUE(&rx0, &rx0, false, &rx0, &x0, false); NMA_FALSE(&rx0, &rx0, false, &x0, &rx0, false); NMA_FALSE(&rx0, &rx0, false, &x0, &x0, false); NMA_FALSE(&rx0, &rx0, false, &x1, &x1, false); /* * Allowing the following requests should not be a security risk * because domain 0 denies execute access, and domain 1 is always * nested with domain 0. However, adding an exception for this case * would mean to check all nested domains to make sure none can get * more privileges (e.g. 
processes only sandboxed by domain 0). * Moreover, this behavior (i.e. composition of N domains) could then * be inconsistent compared to domain 1's ruleset alone (e.g. it might * be denied to link/rename with domain 1's ruleset, whereas it would * be allowed if nested on top of domain 0). Another drawback would be * to create a cover channel that could enable sandboxed processes to * infer most of the filesystem restrictions from their domain. To * make it simple, efficient, safe, and more consistent, this case is * always denied. */ NMA_FALSE(&x1, &x1, false, &x0, NULL, false); NMA_FALSE(&x1, &x1, false, &rx0, NULL, false); NMA_FALSE(&x1, &x1, true, &x0, NULL, false); NMA_FALSE(&x1, &x1, true, &rx0, NULL, false); /* Checks the same case of exclusive domains with a file... */ NMA_TRUE(&x1, &x1, false, &x01, NULL, false); NMA_FALSE(&x1, &x1, false, &x01, &x0, false); NMA_FALSE(&x1, &x1, false, &x01, &x01, false); NMA_FALSE(&x1, &x1, false, &x0, &x0, false); /* ...and with a directory. */ NMA_FALSE(&x1, &x1, false, &x0, &x0, true); NMA_FALSE(&x1, &x1, true, &x0, &x0, false); NMA_FALSE(&x1, &x1, true, &x0, &x0, true); } #endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */ #undef NMA_TRUE #undef NMA_FALSE static bool is_layer_masks_allowed(const struct layer_access_masks *masks) { return mem_is_zero(&masks->access, sizeof(masks->access)); } /* * Removes @masks accesses that are not requested. * * Returns true if the request is allowed, false otherwise. */ static bool scope_to_request(const access_mask_t access_request, struct layer_access_masks *masks) { bool saw_unfulfilled_access = false; if (WARN_ON_ONCE(!masks)) return true; for (size_t i = 0; i < ARRAY_SIZE(masks->access); i++) { masks->access[i] &= access_request; if (masks->access[i]) saw_unfulfilled_access = true; } return !saw_unfulfilled_access; } #ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST static void test_scope_to_request_with_exec_none(struct kunit *const test) { /* Allows everything. */ struct layer_access_masks masks = {}; /* Checks and scopes with execute. */ KUNIT_EXPECT_TRUE(test, scope_to_request(LANDLOCK_ACCESS_FS_EXECUTE, &masks)); KUNIT_EXPECT_EQ(test, 0, masks.access[0]); } static void test_scope_to_request_with_exec_some(struct kunit *const test) { /* Denies execute and write. */ struct layer_access_masks masks = { .access[0] = LANDLOCK_ACCESS_FS_EXECUTE, .access[1] = LANDLOCK_ACCESS_FS_WRITE_FILE, }; /* Checks and scopes with execute. */ KUNIT_EXPECT_FALSE(test, scope_to_request(LANDLOCK_ACCESS_FS_EXECUTE, &masks)); KUNIT_EXPECT_EQ(test, LANDLOCK_ACCESS_FS_EXECUTE, masks.access[0]); KUNIT_EXPECT_EQ(test, 0, masks.access[1]); } static void test_scope_to_request_without_access(struct kunit *const test) { /* Denies execute and write. */ struct layer_access_masks masks = { .access[0] = LANDLOCK_ACCESS_FS_EXECUTE, .access[1] = LANDLOCK_ACCESS_FS_WRITE_FILE, }; /* Checks and scopes without access request. */ KUNIT_EXPECT_TRUE(test, scope_to_request(0, &masks)); KUNIT_EXPECT_EQ(test, 0, masks.access[0]); KUNIT_EXPECT_EQ(test, 0, masks.access[1]); } #endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */ /* * Returns true if there is at least one access right different than * LANDLOCK_ACCESS_FS_REFER. */ static bool is_eacces(const struct layer_access_masks *masks, const access_mask_t access_request) { if (!masks) return false; for (size_t i = 0; i < ARRAY_SIZE(masks->access); i++) { /* LANDLOCK_ACCESS_FS_REFER alone must return -EXDEV. 
*/ if (masks->access[i] & access_request & ~LANDLOCK_ACCESS_FS_REFER) return true; } return false; } #define IE_TRUE(...) KUNIT_EXPECT_TRUE(test, is_eacces(__VA_ARGS__)) #define IE_FALSE(...) KUNIT_EXPECT_FALSE(test, is_eacces(__VA_ARGS__)) #ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST static void test_is_eacces_with_none(struct kunit *const test) { const struct layer_access_masks masks = {}; IE_FALSE(&masks, 0); IE_FALSE(&masks, LANDLOCK_ACCESS_FS_REFER); IE_FALSE(&masks, LANDLOCK_ACCESS_FS_EXECUTE); IE_FALSE(&masks, LANDLOCK_ACCESS_FS_WRITE_FILE); } static void test_is_eacces_with_refer(struct kunit *const test) { const struct layer_access_masks masks = { .access[0] = LANDLOCK_ACCESS_FS_REFER, }; IE_FALSE(&masks, 0); IE_FALSE(&masks, LANDLOCK_ACCESS_FS_REFER); IE_FALSE(&masks, LANDLOCK_ACCESS_FS_EXECUTE); IE_FALSE(&masks, LANDLOCK_ACCESS_FS_WRITE_FILE); } static void test_is_eacces_with_write(struct kunit *const test) { const struct layer_access_masks masks = { .access[0] = LANDLOCK_ACCESS_FS_WRITE_FILE, }; IE_FALSE(&masks, 0); IE_FALSE(&masks, LANDLOCK_ACCESS_FS_REFER); IE_FALSE(&masks, LANDLOCK_ACCESS_FS_EXECUTE); IE_TRUE(&masks, LANDLOCK_ACCESS_FS_WRITE_FILE); } #endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */ #undef IE_TRUE #undef IE_FALSE /** * is_access_to_paths_allowed - Check accesses for requests with a common path * * @domain: Domain to check against. * @path: File hierarchy to walk through. For refer checks, this would be * the common mountpoint. * @access_request_parent1: Accesses to check, once @layer_masks_parent1 is * equal to @layer_masks_parent2 (if any). This is tied to the unique * requested path for most actions, or the source in case of a refer action * (i.e. rename or link), or the source and destination in case of * RENAME_EXCHANGE. * @layer_masks_parent1: Pointer to a matrix of layer masks per access * masks, identifying the layers that forbid a specific access. Bits from * this matrix can be unset according to the @path walk. An empty matrix * means that @domain allows all possible Landlock accesses (i.e. not only * those identified by @access_request_parent1). This matrix can * initially refer to domain layer masks and, when the accesses for the * destination and source are the same, to requested layer masks. * @log_request_parent1: Audit request to fill if the related access is denied. * @dentry_child1: Dentry to the initial child of the parent1 path. This * pointer must be NULL for non-refer actions (i.e. not link nor rename). * @access_request_parent2: Similar to @access_request_parent1 but for a * request involving a source and a destination. This refers to the * destination, except in case of RENAME_EXCHANGE where it also refers to * the source. Must be set to 0 when using a simple path request. * @layer_masks_parent2: Similar to @layer_masks_parent1 but for a refer * action. This must be NULL otherwise. * @log_request_parent2: Audit request to fill if the related access is denied. * @dentry_child2: Dentry to the initial child of the parent2 path. This * pointer is only set for RENAME_EXCHANGE actions and must be NULL * otherwise. * * This helper first checks that the destination has a superset of restrictions * compared to the source (if any) for a common path. Because of * RENAME_EXCHANGE actions, source and destinations may be swapped. It then * checks that the collected accesses and the remaining ones are enough to * allow the request. * * Return: True if the access request is granted, false otherwise. 
*/ static bool is_access_to_paths_allowed(const struct landlock_ruleset *const domain, const struct path *const path, const access_mask_t access_request_parent1, struct layer_access_masks *layer_masks_parent1, struct landlock_request *const log_request_parent1, struct dentry *const dentry_child1, const access_mask_t access_request_parent2, struct layer_access_masks *layer_masks_parent2, struct landlock_request *const log_request_parent2, struct dentry *const dentry_child2) { bool allowed_parent1 = false, allowed_parent2 = false, is_dom_check, child1_is_directory = true, child2_is_directory = true; struct path walker_path; access_mask_t access_masked_parent1, access_masked_parent2; struct layer_access_masks _layer_masks_child1, _layer_masks_child2; struct layer_access_masks *layer_masks_child1 = NULL, *layer_masks_child2 = NULL; if (!access_request_parent1 && !access_request_parent2) return true; if (WARN_ON_ONCE(!path)) return true; if (is_nouser_or_private(path->dentry)) return true; if (WARN_ON_ONCE(!layer_masks_parent1)) return false; allowed_parent1 = is_layer_masks_allowed(layer_masks_parent1); if (unlikely(layer_masks_parent2)) { if (WARN_ON_ONCE(!dentry_child1)) return false; allowed_parent2 = is_layer_masks_allowed(layer_masks_parent2); /* * For a double request, first check for potential privilege * escalation by looking at domain handled accesses (which are * a superset of the meaningful requested accesses). */ access_masked_parent1 = access_masked_parent2 = landlock_union_access_masks(domain).fs; is_dom_check = true; } else { if (WARN_ON_ONCE(dentry_child1 || dentry_child2)) return false; /* For a simple request, only check for requested accesses. */ access_masked_parent1 = access_request_parent1; access_masked_parent2 = access_request_parent2; is_dom_check = false; } if (unlikely(dentry_child1)) { if (landlock_init_layer_masks(domain, LANDLOCK_MASK_ACCESS_FS, &_layer_masks_child1, LANDLOCK_KEY_INODE)) landlock_unmask_layers(find_rule(domain, dentry_child1), &_layer_masks_child1); layer_masks_child1 = &_layer_masks_child1; child1_is_directory = d_is_dir(dentry_child1); } if (unlikely(dentry_child2)) { if (landlock_init_layer_masks(domain, LANDLOCK_MASK_ACCESS_FS, &_layer_masks_child2, LANDLOCK_KEY_INODE)) landlock_unmask_layers(find_rule(domain, dentry_child2), &_layer_masks_child2); layer_masks_child2 = &_layer_masks_child2; child2_is_directory = d_is_dir(dentry_child2); } walker_path = *path; path_get(&walker_path); /* * We need to walk through all the hierarchy to not miss any relevant * restriction. */ while (true) { const struct landlock_rule *rule; /* * If at least all accesses allowed on the destination are * already allowed on the source, respectively if there is at * least as much as restrictions on the destination than on the * source, then we can safely refer files from the source to * the destination without risking a privilege escalation. * This also applies in the case of RENAME_EXCHANGE, which * implies checks on both direction. This is crucial for * standalone multilayered security policies. Furthermore, * this helps avoid policy writers to shoot themselves in the * foot. */ if (unlikely(is_dom_check && no_more_access( layer_masks_parent1, layer_masks_child1, child1_is_directory, layer_masks_parent2, layer_masks_child2, child2_is_directory))) { /* * Now, downgrades the remaining checks from domain * handled accesses to requested accesses. 
*/ is_dom_check = false; access_masked_parent1 = access_request_parent1; access_masked_parent2 = access_request_parent2; allowed_parent1 = allowed_parent1 || scope_to_request(access_masked_parent1, layer_masks_parent1); allowed_parent2 = allowed_parent2 || scope_to_request(access_masked_parent2, layer_masks_parent2); /* Stops when all accesses are granted. */ if (allowed_parent1 && allowed_parent2) break; } rule = find_rule(domain, walker_path.dentry); allowed_parent1 = allowed_parent1 || landlock_unmask_layers(rule, layer_masks_parent1); allowed_parent2 = allowed_parent2 || landlock_unmask_layers(rule, layer_masks_parent2); /* Stops when a rule from each layer grants access. */ if (allowed_parent1 && allowed_parent2) break; jump_up: if (walker_path.dentry == walker_path.mnt->mnt_root) { if (follow_up(&walker_path)) { /* Ignores hidden mount points. */ goto jump_up; } else { /* * Stops at the real root. Denies access * because not all layers have granted access. */ break; } } if (unlikely(IS_ROOT(walker_path.dentry))) { if (likely(walker_path.mnt->mnt_flags & MNT_INTERNAL)) { /* * Stops and allows access when reaching disconnected root * directories that are part of internal filesystems (e.g. nsfs, * which is reachable through /proc/<pid>/ns/<namespace>). */ allowed_parent1 = true; allowed_parent2 = true; break; } /* * We reached a disconnected root directory from a bind mount. * Let's continue the walk with the mount point we missed. */ dput(walker_path.dentry); walker_path.dentry = walker_path.mnt->mnt_root; dget(walker_path.dentry); } else { struct dentry *const parent_dentry = dget_parent(walker_path.dentry); dput(walker_path.dentry); walker_path.dentry = parent_dentry; } } path_put(&walker_path); /* * Check CONFIG_AUDIT to enable elision of log_request_parent* and * associated caller's stack variables thanks to dead code elimination. 
*/ #ifdef CONFIG_AUDIT if (!allowed_parent1 && log_request_parent1) { log_request_parent1->type = LANDLOCK_REQUEST_FS_ACCESS; log_request_parent1->audit.type = LSM_AUDIT_DATA_PATH; log_request_parent1->audit.u.path = *path; log_request_parent1->access = access_masked_parent1; log_request_parent1->layer_masks = layer_masks_parent1; } if (!allowed_parent2 && log_request_parent2) { log_request_parent2->type = LANDLOCK_REQUEST_FS_ACCESS; log_request_parent2->audit.type = LSM_AUDIT_DATA_PATH; log_request_parent2->audit.u.path = *path; log_request_parent2->access = access_masked_parent2; log_request_parent2->layer_masks = layer_masks_parent2; } #endif /* CONFIG_AUDIT */ return allowed_parent1 && allowed_parent2; } static int current_check_access_path(const struct path *const path, access_mask_t access_request) { const struct access_masks masks = { .fs = access_request, }; const struct landlock_cred_security *const subject = landlock_get_applicable_subject(current_cred(), masks, NULL); struct layer_access_masks layer_masks; struct landlock_request request = {}; if (!subject) return 0; access_request = landlock_init_layer_masks(subject->domain, access_request, &layer_masks, LANDLOCK_KEY_INODE); if (is_access_to_paths_allowed(subject->domain, path, access_request, &layer_masks, &request, NULL, 0, NULL, NULL, NULL)) return 0; landlock_log_denial(subject, &request); return -EACCES; } static __attribute_const__ access_mask_t get_mode_access(const umode_t mode) { switch (mode & S_IFMT) { case S_IFLNK: return LANDLOCK_ACCESS_FS_MAKE_SYM; case S_IFDIR: return LANDLOCK_ACCESS_FS_MAKE_DIR; case S_IFCHR: return LANDLOCK_ACCESS_FS_MAKE_CHAR; case S_IFBLK: return LANDLOCK_ACCESS_FS_MAKE_BLOCK; case S_IFIFO: return LANDLOCK_ACCESS_FS_MAKE_FIFO; case S_IFSOCK: return LANDLOCK_ACCESS_FS_MAKE_SOCK; case S_IFREG: case 0: /* A zero mode translates to S_IFREG. */ default: /* Treats weird files as regular files. */ return LANDLOCK_ACCESS_FS_MAKE_REG; } } static access_mask_t maybe_remove(const struct dentry *const dentry) { if (d_is_negative(dentry)) return 0; return d_is_dir(dentry) ? LANDLOCK_ACCESS_FS_REMOVE_DIR : LANDLOCK_ACCESS_FS_REMOVE_FILE; } /** * collect_domain_accesses - Walk through a file path and collect accesses * * @domain: Domain to check against. * @mnt_root: Last directory to check. * @dir: Directory to start the walk from. * @layer_masks_dom: Where to store the collected accesses. * * This helper is useful to begin a path walk from the @dir directory to a * @mnt_root directory used as a mount point. This mount point is the common * ancestor between the source and the destination of a renamed and linked * file. While walking from @dir to @mnt_root, we record all the domain's * allowed accesses in @layer_masks_dom. * * Because of disconnected directories, this walk may not reach @mnt_dir. In * this case, the walk will continue to @mnt_dir after this call. * * This is similar to is_access_to_paths_allowed() but much simpler because it * only handles walking on the same mount point and only checks one set of * accesses. * * Return: True if all the domain access rights are allowed for @dir, false if * the walk reached @mnt_root. 
*/ static bool collect_domain_accesses(const struct landlock_ruleset *const domain, const struct dentry *const mnt_root, struct dentry *dir, struct layer_access_masks *layer_masks_dom) { bool ret = false; if (WARN_ON_ONCE(!domain || !mnt_root || !dir || !layer_masks_dom)) return true; if (is_nouser_or_private(dir)) return true; if (!landlock_init_layer_masks(domain, LANDLOCK_MASK_ACCESS_FS, layer_masks_dom, LANDLOCK_KEY_INODE)) return true; dget(dir); while (true) { struct dentry *parent_dentry; /* Gets all layers allowing all domain accesses. */ if (landlock_unmask_layers(find_rule(domain, dir), layer_masks_dom)) { /* * Stops when all handled accesses are allowed by at * least one rule in each layer. */ ret = true; break; } /* * Stops at the mount point or the filesystem root for a disconnected * directory. */ if (dir == mnt_root || unlikely(IS_ROOT(dir))) break; parent_dentry = dget_parent(dir); dput(dir); dir = parent_dentry; } dput(dir); return ret; } /** * current_check_refer_path - Check if a rename or link action is allowed * * @old_dentry: File or directory requested to be moved or linked. * @new_dir: Destination parent directory. * @new_dentry: Destination file or directory. * @removable: Sets to true if it is a rename operation. * @exchange: Sets to true if it is a rename operation with RENAME_EXCHANGE. * * Because of its unprivileged constraints, Landlock relies on file hierarchies * (and not only inodes) to tie access rights to files. Being able to link or * rename a file hierarchy brings some challenges. Indeed, moving or linking a * file (i.e. creating a new reference to an inode) can have an impact on the * actions allowed for a set of files if it would change its parent directory * (i.e. reparenting). * * To avoid trivial access right bypasses, Landlock first checks if the file or * directory requested to be moved would gain new access rights inherited from * its new hierarchy. Before returning any error, Landlock then checks that * the parent source hierarchy and the destination hierarchy would allow the * link or rename action. If it is not the case, an error with EACCES is * returned to inform user space that there is no way to remove or create the * requested source file type. If it should be allowed but the new inherited * access rights would be greater than the source access rights, then the * kernel returns an error with EXDEV. Prioritizing EACCES over EXDEV enables * user space to abort the whole operation if there is no way to do it, or to * manually copy the source to the destination if this remains allowed, e.g. * because file creation is allowed on the destination directory but not direct * linking. * * To achieve this goal, the kernel needs to compare two file hierarchies: the * one identifying the source file or directory (including itself), and the * destination one. This can be seen as a multilayer partial ordering problem. * The kernel walks through these paths and collects in a matrix the access * rights that are denied per layer. These matrices are then compared to see * if the destination one has more (or the same) restrictions as the source * one. If this is the case, the requested action will not return EXDEV, which * doesn't mean the action is allowed. The parent hierarchy of the source * (i.e. parent directory), and the destination hierarchy must also be checked * to verify that they explicitly allow such action (i.e. referencing, * creation and potentially removal rights). 
The kernel implementation is then * required to rely on potentially four matrices of access rights: one for the * source file or directory (i.e. the child), a potentially other one for the * other source/destination (in case of RENAME_EXCHANGE), one for the source * parent hierarchy and a last one for the destination hierarchy. These * ephemeral matrices take some space on the stack, which limits the number of * layers to a deemed reasonable number: 16. * * Return: 0 if access is allowed, -EXDEV if @old_dentry would inherit new * access rights from @new_dir, or -EACCES if file removal or creation is * denied. */ static int current_check_refer_path(struct dentry *const old_dentry, const struct path *const new_dir, struct dentry *const new_dentry, const bool removable, const bool exchange) { const struct landlock_cred_security *const subject = landlock_get_applicable_subject(current_cred(), any_fs, NULL); bool allow_parent1, allow_parent2; access_mask_t access_request_parent1, access_request_parent2; struct path mnt_dir; struct dentry *old_parent; struct layer_access_masks layer_masks_parent1 = {}, layer_masks_parent2 = {}; struct landlock_request request1 = {}, request2 = {}; if (!subject) return 0; if (unlikely(d_is_negative(old_dentry))) return -ENOENT; if (exchange) { if (unlikely(d_is_negative(new_dentry))) return -ENOENT; access_request_parent1 = get_mode_access(d_backing_inode(new_dentry)->i_mode); } else { access_request_parent1 = 0; } access_request_parent2 = get_mode_access(d_backing_inode(old_dentry)->i_mode); if (removable) { access_request_parent1 |= maybe_remove(old_dentry); access_request_parent2 |= maybe_remove(new_dentry); } /* The mount points are the same for old and new paths, cf. EXDEV. */ if (old_dentry->d_parent == new_dir->dentry) { /* * The LANDLOCK_ACCESS_FS_REFER access right is not required * for same-directory referer (i.e. no reparenting). */ access_request_parent1 = landlock_init_layer_masks( subject->domain, access_request_parent1 | access_request_parent2, &layer_masks_parent1, LANDLOCK_KEY_INODE); if (is_access_to_paths_allowed(subject->domain, new_dir, access_request_parent1, &layer_masks_parent1, &request1, NULL, 0, NULL, NULL, NULL)) return 0; landlock_log_denial(subject, &request1); return -EACCES; } access_request_parent1 |= LANDLOCK_ACCESS_FS_REFER; access_request_parent2 |= LANDLOCK_ACCESS_FS_REFER; /* Saves the common mount point. */ mnt_dir.mnt = new_dir->mnt; mnt_dir.dentry = new_dir->mnt->mnt_root; /* * old_dentry may be the root of the common mount point and * !IS_ROOT(old_dentry) at the same time (e.g. with open_tree() and * OPEN_TREE_CLONE). We do not need to call dget(old_parent) because * we keep a reference to old_dentry. */ old_parent = (old_dentry == mnt_dir.dentry) ? old_dentry : old_dentry->d_parent; /* new_dir->dentry is equal to new_dentry->d_parent */ allow_parent1 = collect_domain_accesses(subject->domain, mnt_dir.dentry, old_parent, &layer_masks_parent1); allow_parent2 = collect_domain_accesses(subject->domain, mnt_dir.dentry, new_dir->dentry, &layer_masks_parent2); if (allow_parent1 && allow_parent2) return 0; /* * To be able to compare source and destination domain access rights, * take into account the @old_dentry access rights aggregated with its * parent access rights. This will be useful to compare with the * destination parent access rights. 
*/ if (is_access_to_paths_allowed( subject->domain, &mnt_dir, access_request_parent1, &layer_masks_parent1, &request1, old_dentry, access_request_parent2, &layer_masks_parent2, &request2, exchange ? new_dentry : NULL)) return 0; if (request1.access) { request1.audit.u.path.dentry = old_parent; landlock_log_denial(subject, &request1); } if (request2.access) { request2.audit.u.path.dentry = new_dir->dentry; landlock_log_denial(subject, &request2); } /* * This prioritizes EACCES over EXDEV for all actions, including * renames with RENAME_EXCHANGE. */ if (likely(is_eacces(&layer_masks_parent1, access_request_parent1) || is_eacces(&layer_masks_parent2, access_request_parent2))) return -EACCES; /* * Gracefully forbids reparenting if the destination directory * hierarchy is not a superset of restrictions of the source directory * hierarchy, or if LANDLOCK_ACCESS_FS_REFER is not allowed by the * source or the destination. */ return -EXDEV; } /* Inode hooks */ static void hook_inode_free_security_rcu(void *inode_security) { struct landlock_inode_security *inode_sec; /* * All inodes must already have been untied from their object by * release_inode() or hook_sb_delete(). */ inode_sec = inode_security + landlock_blob_sizes.lbs_inode; WARN_ON_ONCE(inode_sec->object); } /* Super-block hooks */ /* * Release the inodes used in a security policy. * * Cf. fsnotify_unmount_inodes() and evict_inodes() */ static void hook_sb_delete(struct super_block *const sb) { struct inode *inode, *prev_inode = NULL; if (!landlock_initialized) return; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { struct landlock_object *object; /* Only handles referenced inodes. */ if (!icount_read(inode)) continue; /* * Protects against concurrent modification of inode (e.g. * from get_inode_object()). */ spin_lock(&inode->i_lock); /* * Checks I_FREEING and I_WILL_FREE to protect against a race * condition when release_inode() just called iput(), which * could lead to a NULL dereference of inode->security or a * second call to iput() for the same Landlock object. Also * checks I_NEW because such inode cannot be tied to an object. */ if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) { spin_unlock(&inode->i_lock); continue; } rcu_read_lock(); object = rcu_dereference(landlock_inode(inode)->object); if (!object) { rcu_read_unlock(); spin_unlock(&inode->i_lock); continue; } /* Keeps a reference to this inode until the next loop walk. */ __iget(inode); spin_unlock(&inode->i_lock); /* * If there is no concurrent release_inode() ongoing, then we * are in charge of calling iput() on this inode, otherwise we * will just wait for it to finish. */ spin_lock(&object->lock); if (object->underobj == inode) { object->underobj = NULL; spin_unlock(&object->lock); rcu_read_unlock(); /* * Because object->underobj was not NULL, * release_inode() and get_inode_object() guarantee * that it is safe to reset * landlock_inode(inode)->object while it is not NULL. * It is therefore not necessary to lock inode->i_lock. */ rcu_assign_pointer(landlock_inode(inode)->object, NULL); /* * At this point, we own the ihold() reference that was * originally set up by get_inode_object() and the * __iget() reference that we just set in this loop * walk. Therefore there are at least two references * on the inode. */ iput_not_last(inode); } else { spin_unlock(&object->lock); rcu_read_unlock(); } if (prev_inode) { /* * At this point, we still own the __iget() reference * that we just set in this loop walk. 
Therefore we * can drop the list lock and know that the inode won't * disappear from under us until the next loop walk. */ spin_unlock(&sb->s_inode_list_lock); /* * We can now actually put the inode reference from the * previous loop walk, which is not needed anymore. */ iput(prev_inode); cond_resched(); spin_lock(&sb->s_inode_list_lock); } prev_inode = inode; } spin_unlock(&sb->s_inode_list_lock); /* Puts the inode reference from the last loop walk, if any. */ if (prev_inode) iput(prev_inode); /* Waits for pending iput() in release_inode(). */ wait_var_event(&landlock_superblock(sb)->inode_refs, !atomic_long_read(&landlock_superblock(sb)->inode_refs)); } static void log_fs_change_topology_path(const struct landlock_cred_security *const subject, size_t handle_layer, const struct path *const path) { landlock_log_denial(subject, &(struct landlock_request) { .type = LANDLOCK_REQUEST_FS_CHANGE_TOPOLOGY, .audit = { .type = LSM_AUDIT_DATA_PATH, .u.path = *path, }, .layer_plus_one = handle_layer + 1, }); } static void log_fs_change_topology_dentry( const struct landlock_cred_security *const subject, size_t handle_layer, struct dentry *const dentry) { landlock_log_denial(subject, &(struct landlock_request) { .type = LANDLOCK_REQUEST_FS_CHANGE_TOPOLOGY, .audit = { .type = LSM_AUDIT_DATA_DENTRY, .u.dentry = dentry, }, .layer_plus_one = handle_layer + 1, }); } /* * Because a Landlock security policy is defined according to the filesystem * topology (i.e. the mount namespace), changing it may grant access to files * not previously allowed. * * To make it simple, deny any filesystem topology modification by landlocked * processes. Non-landlocked processes may still change the namespace of a * landlocked process, but this kind of threat must be handled by a system-wide * access-control security policy. * * This could be lifted in the future if Landlock can safely handle mount * namespace updates requested by a landlocked process. Indeed, we could * update the current domain (which is currently read-only) by taking into * account the accesses of the source and the destination of a new mount point. * However, it would also require to make all the child domains dynamically * inherit these new constraints. Anyway, for backward compatibility reasons, * a dedicated user space option would be required (e.g. as a ruleset flag). */ static int hook_sb_mount(const char *const dev_name, const struct path *const path, const char *const type, const unsigned long flags, void *const data) { size_t handle_layer; const struct landlock_cred_security *const subject = landlock_get_applicable_subject(current_cred(), any_fs, &handle_layer); if (!subject) return 0; log_fs_change_topology_path(subject, handle_layer, path); return -EPERM; } static int hook_move_mount(const struct path *const from_path, const struct path *const to_path) { size_t handle_layer; const struct landlock_cred_security *const subject = landlock_get_applicable_subject(current_cred(), any_fs, &handle_layer); if (!subject) return 0; log_fs_change_topology_path(subject, handle_layer, to_path); return -EPERM; } /* * Removing a mount point may reveal a previously hidden file hierarchy, which * may then grant access to files, which may have previously been forbidden. 
*/ static int hook_sb_umount(struct vfsmount *const mnt, const int flags) { size_t handle_layer; const struct landlock_cred_security *const subject = landlock_get_applicable_subject(current_cred(), any_fs, &handle_layer); if (!subject) return 0; log_fs_change_topology_dentry(subject, handle_layer, mnt->mnt_root); return -EPERM; } static int hook_sb_remount(struct super_block *const sb, void *const mnt_opts) { size_t handle_layer; const struct landlock_cred_security *const subject = landlock_get_applicable_subject(current_cred(), any_fs, &handle_layer); if (!subject) return 0; log_fs_change_topology_dentry(subject, handle_layer, sb->s_root); return -EPERM; } /* * pivot_root(2), like mount(2), changes the current mount namespace. It must * then be forbidden for a landlocked process. * * However, chroot(2) may be allowed because it only changes the relative root * directory of the current process. Moreover, it can be used to restrict the * view of the filesystem. */ static int hook_sb_pivotroot(const struct path *const old_path, const struct path *const new_path) { size_t handle_layer; const struct landlock_cred_security *const subject = landlock_get_applicable_subject(current_cred(), any_fs, &handle_layer); if (!subject) return 0; log_fs_change_topology_path(subject, handle_layer, new_path); return -EPERM; } /* Path hooks */ static int hook_path_link(struct dentry *const old_dentry, const struct path *const new_dir, struct dentry *const new_dentry) { return current_check_refer_path(old_dentry, new_dir, new_dentry, false, false); } static int hook_path_rename(const struct path *const old_dir, struct dentry *const old_dentry, const struct path *const new_dir, struct dentry *const new_dentry, const unsigned int flags) { /* old_dir refers to old_dentry->d_parent and new_dir->mnt */ return current_check_refer_path(old_dentry, new_dir, new_dentry, true, !!(flags & RENAME_EXCHANGE)); } static int hook_path_mkdir(const struct path *const dir, struct dentry *const dentry, const umode_t mode) { return current_check_access_path(dir, LANDLOCK_ACCESS_FS_MAKE_DIR); } static int hook_path_mknod(const struct path *const dir, struct dentry *const dentry, const umode_t mode, const unsigned int dev) { return current_check_access_path(dir, get_mode_access(mode)); } static int hook_path_symlink(const struct path *const dir, struct dentry *const dentry, const char *const old_name) { return current_check_access_path(dir, LANDLOCK_ACCESS_FS_MAKE_SYM); } static int hook_path_unlink(const struct path *const dir, struct dentry *const dentry) { return current_check_access_path(dir, LANDLOCK_ACCESS_FS_REMOVE_FILE); } static int hook_path_rmdir(const struct path *const dir, struct dentry *const dentry) { return current_check_access_path(dir, LANDLOCK_ACCESS_FS_REMOVE_DIR); } static int hook_path_truncate(const struct path *const path) { return current_check_access_path(path, LANDLOCK_ACCESS_FS_TRUNCATE); } /** * unmask_scoped_access - Remove access right bits in @masks in all layers * where @client and @server have the same domain * * This does the same as domain_is_scoped(), but unmasks bits in @masks. * It can not return early as domain_is_scoped() does. * * A scoped access for a given access right bit is allowed iff, for all layer * depths where the access bit is set, the client and server domain are the * same. This function clears the access rights @access in @masks at all layer * depths where the client and server domain are the same, so that, when they * are all cleared, the access is allowed. 
* * @client: Client domain * @server: Server domain * @masks: Layer access masks to unmask * @access: Access bits that control scoping */ static void unmask_scoped_access(const struct landlock_ruleset *const client, const struct landlock_ruleset *const server, struct layer_access_masks *const masks, const access_mask_t access) { int client_layer, server_layer; const struct landlock_hierarchy *client_walker, *server_walker; /* This should not happen. */ if (WARN_ON_ONCE(!client)) return; /* Server has no Landlock domain; nothing to clear. */ if (!server) return; /* * client_layer must be able to represent all numbers from * LANDLOCK_MAX_NUM_LAYERS - 1 to -1 for the loop below to terminate. * (It must be large enough, and it must be signed.) */ BUILD_BUG_ON(!is_signed_type(typeof(client_layer))); BUILD_BUG_ON(LANDLOCK_MAX_NUM_LAYERS - 1 > type_max(typeof(client_layer))); client_layer = client->num_layers - 1; client_walker = client->hierarchy; server_layer = server->num_layers - 1; server_walker = server->hierarchy; /* * Clears the access bits at all layers where the client domain is the * same as the server domain. We start the walk at min(client_layer, * server_layer). The layer bits until there can not be cleared because * either the client or the server domain is missing. */ for (; client_layer > server_layer; client_layer--) client_walker = client_walker->parent; for (; server_layer > client_layer; server_layer--) server_walker = server_walker->parent; for (; client_layer >= 0; client_layer--) { if (masks->access[client_layer] & access && client_walker == server_walker) masks->access[client_layer] &= ~access; client_walker = client_walker->parent; server_walker = server_walker->parent; } } static int hook_unix_find(const struct path *const path, struct sock *other, int flags) { const struct landlock_ruleset *dom_other; const struct landlock_cred_security *subject; struct layer_access_masks layer_masks; struct landlock_request request = {}; static const struct access_masks fs_resolve_unix = { .fs = LANDLOCK_ACCESS_FS_RESOLVE_UNIX, }; /* Lookup for the purpose of saving coredumps is OK. */ if (unlikely(flags & SOCK_COREDUMP)) return 0; subject = landlock_get_applicable_subject(current_cred(), fs_resolve_unix, NULL); if (!subject) return 0; /* * Ignoring return value: that the domains apply was already checked in * landlock_get_applicable_subject() above. */ landlock_init_layer_masks(subject->domain, fs_resolve_unix.fs, &layer_masks, LANDLOCK_KEY_INODE); /* Checks the layers in which we are connecting within the same domain. */ unix_state_lock(other); if (unlikely(sock_flag(other, SOCK_DEAD) || !other->sk_socket || !other->sk_socket->file)) { unix_state_unlock(other); /* * We rely on the caller to catch the (non-reversible) SOCK_DEAD * condition and retry the lookup. If we returned an error * here, the lookup would not get retried. */ return 0; } dom_other = landlock_cred(other->sk_socket->file->f_cred)->domain; /* Access to the same (or a lower) domain is always allowed. */ unmask_scoped_access(subject->domain, dom_other, &layer_masks, fs_resolve_unix.fs); unix_state_unlock(other); /* Checks the connections to allow-listed paths. */ if (is_access_to_paths_allowed(subject->domain, path, fs_resolve_unix.fs, &layer_masks, &request, NULL, 0, NULL, NULL, NULL)) return 0; landlock_log_denial(subject, &request); return -EACCES; } /* File hooks */ /** * get_required_file_open_access - Get access needed to open a file * * @file: File being opened. 
* * Return: The access rights that are required for opening the given file, * depending on the file type and open mode. */ static access_mask_t get_required_file_open_access(const struct file *const file) { access_mask_t access = 0; if (file->f_mode & FMODE_READ) { /* A directory can only be opened in read mode. */ if (S_ISDIR(file_inode(file)->i_mode)) return LANDLOCK_ACCESS_FS_READ_DIR; access = LANDLOCK_ACCESS_FS_READ_FILE; } if (file->f_mode & FMODE_WRITE) access |= LANDLOCK_ACCESS_FS_WRITE_FILE; /* __FMODE_EXEC is indeed part of f_flags, not f_mode. */ if (file->f_flags & __FMODE_EXEC) access |= LANDLOCK_ACCESS_FS_EXECUTE; return access; } static int hook_file_alloc_security(struct file *const file) { /* * Grants all access rights, even if most of them are not checked later * on. It is more consistent. * * Notably, file descriptors for regular files can also be acquired * without going through the file_open hook, for example when using * memfd_create(2). */ landlock_file(file)->allowed_access = LANDLOCK_MASK_ACCESS_FS; return 0; } static bool is_device(const struct file *const file) { const struct inode *inode = file_inode(file); return S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode); } static int hook_file_open(struct file *const file) { struct layer_access_masks layer_masks = {}; access_mask_t open_access_request, full_access_request, allowed_access, optional_access; const struct landlock_cred_security *const subject = landlock_get_applicable_subject(file->f_cred, any_fs, NULL); struct landlock_request request = {}; if (!subject) return 0; /* * Because a file may be opened with O_PATH, get_required_file_open_access() * may return 0. This case will be handled with a future Landlock * evolution. */ open_access_request = get_required_file_open_access(file); /* * We look up more access than what we immediately need for open(), so * that we can later authorize operations on opened files. */ optional_access = LANDLOCK_ACCESS_FS_TRUNCATE; if (is_device(file)) optional_access |= LANDLOCK_ACCESS_FS_IOCTL_DEV; full_access_request = open_access_request | optional_access; if (is_access_to_paths_allowed( subject->domain, &file->f_path, landlock_init_layer_masks(subject->domain, full_access_request, &layer_masks, LANDLOCK_KEY_INODE), &layer_masks, &request, NULL, 0, NULL, NULL, NULL)) { allowed_access = full_access_request; } else { /* * Calculate the actual allowed access rights from layer_masks. * Remove the access rights from the full access request which * are still unfulfilled in any of the layers. */ allowed_access = full_access_request; for (size_t i = 0; i < ARRAY_SIZE(layer_masks.access); i++) allowed_access &= ~layer_masks.access[i]; } /* * For operations on already opened files (i.e. ftruncate()), it is the * access rights at the time of open() which decide whether the * operation is permitted. Therefore, we record the relevant subset of * file access rights in the opened struct file. */ landlock_file(file)->allowed_access = allowed_access; #ifdef CONFIG_AUDIT landlock_file(file)->deny_masks = landlock_get_deny_masks( _LANDLOCK_ACCESS_FS_OPTIONAL, optional_access, &layer_masks); #endif /* CONFIG_AUDIT */ if (access_mask_subset(open_access_request, allowed_access)) return 0; /* Sets access to reflect the actual request. 
*/ request.access = open_access_request; landlock_log_denial(subject, &request); return -EACCES; } static int hook_file_truncate(struct file *const file) { /* * Allows truncation if the truncate right was available at the time of * opening the file, to get a consistent access check as for read, write * and execute operations. * * Note: For checks done based on the file's Landlock allowed access, we * enforce them independently of whether the current thread is in a * Landlock domain, so that open files passed between independent * processes retain their behaviour. */ if (landlock_file(file)->allowed_access & LANDLOCK_ACCESS_FS_TRUNCATE) return 0; landlock_log_denial(landlock_cred(file->f_cred), &(struct landlock_request) { .type = LANDLOCK_REQUEST_FS_ACCESS, .audit = { .type = LSM_AUDIT_DATA_FILE, .u.file = file, }, .all_existing_optional_access = _LANDLOCK_ACCESS_FS_OPTIONAL, .access = LANDLOCK_ACCESS_FS_TRUNCATE, #ifdef CONFIG_AUDIT .deny_masks = landlock_file(file)->deny_masks, #endif /* CONFIG_AUDIT */ }); return -EACCES; } static int hook_file_ioctl_common(const struct file *const file, const unsigned int cmd, const bool is_compat) { access_mask_t allowed_access = landlock_file(file)->allowed_access; /* * It is the access rights at the time of opening the file which * determine whether IOCTL can be used on the opened file later. * * The access right is attached to the opened file in hook_file_open(). */ if (allowed_access & LANDLOCK_ACCESS_FS_IOCTL_DEV) return 0; if (!is_device(file)) return 0; if (unlikely(is_compat) ? is_masked_device_ioctl_compat(cmd) : is_masked_device_ioctl(cmd)) return 0; landlock_log_denial(landlock_cred(file->f_cred), &(struct landlock_request) { .type = LANDLOCK_REQUEST_FS_ACCESS, .audit = { .type = LSM_AUDIT_DATA_IOCTL_OP, .u.op = &(struct lsm_ioctlop_audit) { .path = file->f_path, .cmd = cmd, }, }, .all_existing_optional_access = _LANDLOCK_ACCESS_FS_OPTIONAL, .access = LANDLOCK_ACCESS_FS_IOCTL_DEV, #ifdef CONFIG_AUDIT .deny_masks = landlock_file(file)->deny_masks, #endif /* CONFIG_AUDIT */ }); return -EACCES; } static int hook_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return hook_file_ioctl_common(file, cmd, false); } static int hook_file_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg) { return hook_file_ioctl_common(file, cmd, true); } /* * Always allow sending signals between threads of the same process. This * ensures consistency with hook_task_kill(). */ static bool control_current_fowner(struct fown_struct *const fown) { struct task_struct *p; /* * Lock already held by __f_setown(), see commit 26f204380a3c ("fs: Fix * file_set_fowner LSM hook inconsistencies"). */ lockdep_assert_held(&fown->lock); /* * Some callers (e.g. fcntl_dirnotify) may not be in an RCU read-side * critical section. 
*/ guard(rcu)(); p = pid_task(fown->pid, fown->pid_type); if (!p) return true; return !same_thread_group(p, current); } static void hook_file_set_fowner(struct file *file) { struct landlock_ruleset *prev_dom; struct landlock_cred_security fown_subject = {}; size_t fown_layer = 0; if (control_current_fowner(file_f_owner(file))) { static const struct access_masks signal_scope = { .scope = LANDLOCK_SCOPE_SIGNAL, }; const struct landlock_cred_security *new_subject = landlock_get_applicable_subject( current_cred(), signal_scope, &fown_layer); if (new_subject) { landlock_get_ruleset(new_subject->domain); fown_subject = *new_subject; } } prev_dom = landlock_file(file)->fown_subject.domain; landlock_file(file)->fown_subject = fown_subject; #ifdef CONFIG_AUDIT landlock_file(file)->fown_layer = fown_layer; #endif /* CONFIG_AUDIT*/ /* May be called in an RCU read-side critical section. */ landlock_put_ruleset_deferred(prev_dom); } static void hook_file_free_security(struct file *file) { landlock_put_ruleset_deferred(landlock_file(file)->fown_subject.domain); } static struct security_hook_list landlock_hooks[] __ro_after_init = { LSM_HOOK_INIT(inode_free_security_rcu, hook_inode_free_security_rcu), LSM_HOOK_INIT(sb_delete, hook_sb_delete), LSM_HOOK_INIT(sb_mount, hook_sb_mount), LSM_HOOK_INIT(move_mount, hook_move_mount), LSM_HOOK_INIT(sb_umount, hook_sb_umount), LSM_HOOK_INIT(sb_remount, hook_sb_remount), LSM_HOOK_INIT(sb_pivotroot, hook_sb_pivotroot), LSM_HOOK_INIT(path_link, hook_path_link), LSM_HOOK_INIT(path_rename, hook_path_rename), LSM_HOOK_INIT(path_mkdir, hook_path_mkdir), LSM_HOOK_INIT(path_mknod, hook_path_mknod), LSM_HOOK_INIT(path_symlink, hook_path_symlink), LSM_HOOK_INIT(path_unlink, hook_path_unlink), LSM_HOOK_INIT(path_rmdir, hook_path_rmdir), LSM_HOOK_INIT(path_truncate, hook_path_truncate), LSM_HOOK_INIT(unix_find, hook_unix_find), LSM_HOOK_INIT(file_alloc_security, hook_file_alloc_security), LSM_HOOK_INIT(file_open, hook_file_open), LSM_HOOK_INIT(file_truncate, hook_file_truncate), LSM_HOOK_INIT(file_ioctl, hook_file_ioctl), LSM_HOOK_INIT(file_ioctl_compat, hook_file_ioctl_compat), LSM_HOOK_INIT(file_set_fowner, hook_file_set_fowner), LSM_HOOK_INIT(file_free_security, hook_file_free_security), }; __init void landlock_add_fs_hooks(void) { security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks), &landlock_lsmid); } #ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST /* clang-format off */ static struct kunit_case test_cases[] = { KUNIT_CASE(test_no_more_access), KUNIT_CASE(test_scope_to_request_with_exec_none), KUNIT_CASE(test_scope_to_request_with_exec_some), KUNIT_CASE(test_scope_to_request_without_access), KUNIT_CASE(test_is_eacces_with_none), KUNIT_CASE(test_is_eacces_with_refer), KUNIT_CASE(test_is_eacces_with_write), {} }; /* clang-format on */ static struct kunit_suite test_suite = { .name = "landlock_fs", .test_cases = test_cases, }; kunit_test_suite(test_suite); #endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */ |
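/*
 * For context: the path and file hooks above enforce rulesets that are built
 * entirely from user space.  The following minimal user-space sketch (not
 * part of this file) shows roughly how such a ruleset may be created and
 * enforced through the documented landlock_create_ruleset(2),
 * landlock_add_rule(2) and landlock_restrict_self(2) system calls; the
 * "/usr" hierarchy and the selected access rights are arbitrary examples.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/landlock.h>
#include <stdio.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct landlock_ruleset_attr ruleset_attr = {
		/* Access rights that this ruleset restricts (denied by default). */
		.handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE |
				     LANDLOCK_ACCESS_FS_READ_DIR |
				     LANDLOCK_ACCESS_FS_WRITE_FILE |
				     LANDLOCK_ACCESS_FS_TRUNCATE,
	};
	struct landlock_path_beneath_attr path_beneath = {
		/* Grant read-only access below the hierarchy opened below. */
		.allowed_access = LANDLOCK_ACCESS_FS_READ_FILE |
				  LANDLOCK_ACCESS_FS_READ_DIR,
	};
	int ruleset_fd;

	ruleset_fd = syscall(__NR_landlock_create_ruleset, &ruleset_attr,
			     sizeof(ruleset_attr), 0);
	if (ruleset_fd < 0) {
		perror("landlock_create_ruleset");
		return 1;
	}

	path_beneath.parent_fd = open("/usr", O_PATH | O_CLOEXEC);
	if (path_beneath.parent_fd < 0 ||
	    syscall(__NR_landlock_add_rule, ruleset_fd,
		    LANDLOCK_RULE_PATH_BENEATH, &path_beneath, 0)) {
		perror("landlock_add_rule");
		return 1;
	}
	close(path_beneath.parent_fd);

	/* no_new_privs is required to enforce a ruleset without CAP_SYS_ADMIN. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
	    syscall(__NR_landlock_restrict_self, ruleset_fd, 0)) {
		perror("landlock_restrict_self");
		return 1;
	}
	close(ruleset_fd);

	/*
	 * From now on, hook_file_open() and the path hooks above deny, for
	 * example, opening any file for writing, and allow read-only access
	 * only beneath /usr.
	 */
	return 0;
}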
// SPDX-License-Identifier: GPL-2.0-only
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case if packet not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen:		Fix broken PMTU recovery and remove
 *					some redundant tests.
 *		Vitaly E. Lavrov :	Transparent proxy revived after year coma.
 *		Andi Kleen	:	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */

#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>
#include <net/flow.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/gso.h>
#include <net/inetpeer.h>
#include <net/lwtunnel.h>
#include <net/inet_dscp.h>
#include <linux/bpf-cgroup.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/netlink.h>
#include <linux/tcp.h>
#include <net/psp.h>

static int
ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
	    unsigned int mtu,
	    int (*output)(struct net *, struct sock *, struct sk_buff *));

/* Generate a checksum for an outgoing IP datagram.
*/ void ip_send_check(struct iphdr *iph) { iph->check = 0; iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } EXPORT_SYMBOL(ip_send_check); int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); IP_INC_STATS(net, IPSTATS_MIB_OUTREQUESTS); iph_set_totlen(iph, skb->len); ip_send_check(iph); /* if egress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip_out(sk, skb); if (unlikely(!skb)) return 0; skb->protocol = htons(ETH_P_IP); return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst_dev(skb), dst_output); } int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; err = __ip_local_out(net, sk, skb); if (likely(err == 1)) err = dst_output(net, sk, skb); return err; } EXPORT_SYMBOL_GPL(ip_local_out); static inline int ip_select_ttl(const struct inet_sock *inet, const struct dst_entry *dst) { int ttl = READ_ONCE(inet->uc_ttl); if (ttl < 0) ttl = ip4_dst_hoplimit(dst); return ttl; } /* * Add an ip header to a skbuff and send it out. * */ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, __be32 saddr, __be32 daddr, struct ip_options_rcu *opt, u8 tos) { const struct inet_sock *inet = inet_sk(sk); struct rtable *rt = skb_rtable(skb); struct net *net = sock_net(sk); struct iphdr *iph; /* Build the IP header. */ skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0)); skb_reset_network_header(skb); iph = ip_hdr(skb); iph->version = 4; iph->ihl = 5; iph->tos = tos; iph->ttl = ip_select_ttl(inet, &rt->dst); iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); iph->saddr = saddr; iph->protocol = sk->sk_protocol; /* Do not bother generating IPID for small packets (eg SYNACK) */ if (skb->len <= IPV4_MIN_MTU || ip_dont_fragment(sk, &rt->dst)) { iph->frag_off = htons(IP_DF); iph->id = 0; } else { iph->frag_off = 0; /* TCP packets here are SYNACK with fat IPv4/TCP options. * Avoid using the hashed IP ident generator. */ if (sk->sk_protocol == IPPROTO_TCP) iph->id = (__force __be16)get_random_u16(); else __ip_select_ident(net, iph, 1); } if (opt && opt->opt.optlen) { iph->ihl += opt->opt.optlen>>2; ip_options_build(skb, &opt->opt, daddr, rt); } skb->priority = READ_ONCE(sk->sk_priority); if (!skb->mark) skb->mark = READ_ONCE(sk->sk_mark); /* Send it out. 
*/ return ip_local_out(net, skb->sk, skb); } EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct rtable *rt = dst_rtable(dst); struct net_device *dev = dst_dev(dst); unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; bool is_v6gw = false; if (rt->rt_type == RTN_MULTICAST) { IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len); /* OUTOCTETS should be counted after fragment */ IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); if (!skb) return -ENOMEM; } if (lwtunnel_xmit_redirect(dst->lwtstate)) { int res = lwtunnel_xmit(skb); if (res != LWTUNNEL_XMIT_CONTINUE) return res; } rcu_read_lock(); neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); if (!IS_ERR(neigh)) { int res; sock_confirm_neigh(skb, neigh); /* if crossing protocols, can not use the cached header */ res = neigh_output(neigh, skb, is_v6gw); rcu_read_unlock(); return res; } rcu_read_unlock(); net_dbg_ratelimited("%s: No header cache and no neighbour!\n", __func__); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL); return PTR_ERR(neigh); } static int ip_finish_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu) { struct sk_buff *segs, *nskb; netdev_features_t features; int ret = 0; /* common case: seglen is <= mtu */ if (skb_gso_validate_network_len(skb, mtu)) return ip_finish_output2(net, sk, skb); /* Slowpath - GSO segment length exceeds the egress MTU. * * This can happen in several cases: * - Forwarding of a TCP GRO skb, when DF flag is not set. * - Forwarding of an skb that arrived on a virtualization interface * (virtio-net/vhost/tap) with TSO/GSO size set by other network * stack. * - Local GSO skb transmitted on an NETIF_F_TSO tunnel stacked over an * interface with a smaller MTU. * - Arriving GRO skb (or GSO skb in a virtualized environment) that is * bridged to a NETIF_F_TSO tunnel stacked over an interface with an * insufficient MTU. */ features = netif_skb_features(skb); BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_GSO_CB_OFFSET); segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) { kfree_skb(skb); return -ENOMEM; } consume_skb(skb); skb_list_walk_safe(segs, segs, nskb) { int err; skb_mark_not_on_list(segs); err = ip_fragment(net, sk, segs, mtu, ip_finish_output2); if (err && ret == 0) ret = err; } return ret; } static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned int mtu; #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { IPCB(skb)->flags |= IPSKB_REROUTED; return dst_output(net, sk, skb); } #endif mtu = ip_skb_dst_mtu(sk, skb); if (skb_is_gso(skb)) return ip_finish_output_gso(net, sk, skb, mtu); if (skb->len > mtu || IPCB(skb)->frag_max_size) return ip_fragment(net, sk, skb, mtu, ip_finish_output2); return ip_finish_output2(net, sk, skb); } static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { int ret; ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); switch (ret) { case NET_XMIT_SUCCESS: return __ip_finish_output(net, sk, skb); case NET_XMIT_CN: return __ip_finish_output(net, sk, skb) ? 
: ret; default: kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS); return ret; } } static int ip_mc_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct rtable *new_rt; bool do_cn = false; int ret, err; ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); switch (ret) { case NET_XMIT_CN: do_cn = true; fallthrough; case NET_XMIT_SUCCESS: break; default: kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS); return ret; } /* Reset rt_iif so that inet_iif() will return skb->skb_iif. Setting * this to non-zero causes ipi_ifindex in in_pktinfo to be overwritten, * see ipv4_pktinfo_prepare(). */ new_rt = rt_dst_clone(net->loopback_dev, skb_rtable(skb)); if (new_rt) { new_rt->rt_iif = 0; skb_dst_drop(skb); skb_dst_set(skb, &new_rt->dst); } err = dev_loopback_xmit(net, sk, skb); return (do_cn && err) ? ret : err; } int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct net_device *dev = rt->dst.dev; /* * If the indicated interface is up and running, send the packet. */ skb->dev = dev; skb->protocol = htons(ETH_P_IP); /* * Multicasts are looped back for other local users */ if (rt->rt_flags&RTCF_MULTICAST) { if (sk_mc_loop(sk) #ifdef CONFIG_IP_MROUTE /* Small optimization: do not loopback not local frames, which returned after forwarding; they will be dropped by ip_mr_input in any case. Note, that local frames are looped back to be delivered to local recipients. This check is duplicated in ip_mr_input at the moment. */ && ((rt->rt_flags & RTCF_LOCAL) || !(IPCB(skb)->flags & IPSKB_FORWARDED)) #endif ) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, newskb, NULL, newskb->dev, ip_mc_finish_output); } /* Multicasts with ttl 0 must not go beyond the host */ if (ip_hdr(skb)->ttl == 0) { kfree_skb(skb); return 0; } } if (rt->rt_flags&RTCF_BROADCAST) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, newskb, NULL, newskb->dev, ip_mc_finish_output); } return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb->dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev, *indev = skb->dev; int ret_val; rcu_read_lock(); dev = skb_dst_dev_rcu(skb); skb->dev = dev; skb->protocol = htons(ETH_P_IP); ret_val = NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, indev, dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); rcu_read_unlock(); return ret_val; } EXPORT_SYMBOL(ip_output); /* * copy saddr and daddr, possibly using 64bit load/stores * Equivalent to : * iph->saddr = fl4->saddr; * iph->daddr = fl4->daddr; */ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) { BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) != offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr)); iph->saddr = fl4->saddr; iph->daddr = fl4->daddr; } /* Note: skb->sk can be different from sk, in case of tunnels */ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, __u8 tos) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct ip_options_rcu *inet_opt; struct flowi4 *fl4; struct rtable *rt; struct iphdr *iph; int res; /* Skip all of this if the packet is already routed, * f.e. by something like SCTP. 
*/ rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); fl4 = &fl->u.ip4; rt = skb_rtable(skb); if (rt) goto packet_routed; /* Make sure we can route this packet. */ rt = dst_rtable(__sk_dst_check(sk, 0)); if (!rt) { inet_sk_init_flowi4(inet, fl4); /* sctp_v4_xmit() uses its own DSCP value */ fl4->flowi4_dscp = inet_dsfield_to_dscp(tos); /* If this fails, retransmit mechanism of transport layer will * keep trying until route appears or the connection times * itself out. */ rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; sk_setup_caps(sk, &rt->dst); } skb_dst_set_noref(skb, &rt->dst); packet_routed: if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway) goto no_route; /* OK, we know where to send it, allocate and build IP header. */ skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0)); skb_reset_network_header(skb); iph = ip_hdr(skb); *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff)); if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df) iph->frag_off = htons(IP_DF); else iph->frag_off = 0; iph->ttl = ip_select_ttl(inet, &rt->dst); iph->protocol = sk->sk_protocol; ip_copy_addrs(iph, fl4); /* Transport layer set skb->h.foo itself. */ if (inet_opt && inet_opt->opt.optlen) { iph->ihl += inet_opt->opt.optlen >> 2; ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt); } ip_select_ident_segs(net, skb, sk, skb_shinfo(skb)->gso_segs ?: 1); /* TODO : should we use skb->sk here instead of sk ? */ skb->priority = READ_ONCE(sk->sk_priority); skb->mark = READ_ONCE(sk->sk_mark); res = ip_local_out(net, sk, skb); rcu_read_unlock(); return res; no_route: rcu_read_unlock(); IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); kfree_skb_reason(skb, SKB_DROP_REASON_IP_OUTNOROUTES); return -EHOSTUNREACH; } EXPORT_SYMBOL(__ip_queue_xmit); int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) { return __ip_queue_xmit(sk, skb, fl, READ_ONCE(inet_sk(sk)->tos)); } EXPORT_SYMBOL(ip_queue_xmit); static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) { to->pkt_type = from->pkt_type; to->priority = from->priority; to->protocol = from->protocol; to->skb_iif = from->skb_iif; skb_dst_drop(to); skb_dst_copy(to, from); to->dev = from->dev; to->mark = from->mark; skb_copy_hash(to, from); #ifdef CONFIG_NET_SCHED to->tc_index = from->tc_index; #endif nf_copy(to, from); skb_ext_copy(to, from); #if IS_ENABLED(CONFIG_IP_VS) to->ipvs_property = from->ipvs_property; #endif skb_copy_secmark(to, from); } static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu, int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct iphdr *iph = ip_hdr(skb); if ((iph->frag_off & htons(IP_DF)) == 0) return ip_do_fragment(net, sk, skb, output); if (unlikely(!skb->ignore_df || (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size > mtu))) { IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); kfree_skb(skb); return -EMSGSIZE; } return ip_do_fragment(net, sk, skb, output); } void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph, unsigned int hlen, struct ip_fraglist_iter *iter) { unsigned int first_len = skb_pagelen(skb); iter->frag = skb_shinfo(skb)->frag_list; skb_frag_list_init(skb); iter->offset = 0; iter->iph = iph; iter->hlen = hlen; skb->data_len = first_len - skb_headlen(skb); skb->len = first_len; iph->tot_len = htons(first_len); iph->frag_off = htons(IP_MF); ip_send_check(iph); } EXPORT_SYMBOL(ip_fraglist_init); void 
ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter) { unsigned int hlen = iter->hlen; struct iphdr *iph = iter->iph; struct sk_buff *frag; frag = iter->frag; frag->ip_summed = CHECKSUM_NONE; skb_reset_transport_header(frag); __skb_push(frag, hlen); skb_reset_network_header(frag); memcpy(skb_network_header(frag), iph, hlen); iter->iph = ip_hdr(frag); iph = iter->iph; iph->tot_len = htons(frag->len); ip_copy_metadata(frag, skb); iter->offset += skb->len - hlen; iph->frag_off = htons(iter->offset >> 3); if (frag->next) iph->frag_off |= htons(IP_MF); /* Ready, complete checksum */ ip_send_check(iph); } EXPORT_SYMBOL(ip_fraglist_prepare); void ip_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int ll_rs, unsigned int mtu, bool DF, struct ip_frag_state *state) { struct iphdr *iph = ip_hdr(skb); state->DF = DF; state->hlen = hlen; state->ll_rs = ll_rs; state->mtu = mtu; state->left = skb->len - hlen; /* Space per frame */ state->ptr = hlen; /* Where to start from */ state->offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3; state->not_last_frag = iph->frag_off & htons(IP_MF); } EXPORT_SYMBOL(ip_frag_init); static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to, bool first_frag) { /* Copy the flags to each fragment. */ IPCB(to)->flags = IPCB(from)->flags; /* ANK: dirty, but effective trick. Upgrade options only if * the segment to be fragmented was THE FIRST (otherwise, * options are already fixed) and make it ONCE * on the initial skb, so that all the following fragments * will inherit fixed options. */ if (first_frag) ip_options_fragment(from); } struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state) { unsigned int len = state->left; struct sk_buff *skb2; struct iphdr *iph; /* IF: it doesn't fit, use 'mtu' - the data space left */ if (len > state->mtu) len = state->mtu; /* IF: we are not sending up to and including the packet end then align the next start on an eight byte boundary */ if (len < state->left) { len &= ~7; } /* Allocate buffer */ skb2 = alloc_skb(len + state->hlen + state->ll_rs, GFP_ATOMIC); if (!skb2) return ERR_PTR(-ENOMEM); /* * Set up data on packet */ ip_copy_metadata(skb2, skb); skb_reserve(skb2, state->ll_rs); skb_put(skb2, len + state->hlen); skb_reset_network_header(skb2); skb2->transport_header = skb2->network_header + state->hlen; /* * Charge the memory for the fragment to any owner * it might possess */ if (skb->sk) skb_set_owner_w(skb2, skb->sk); /* * Copy the packet header into the new buffer. */ skb_copy_from_linear_data(skb, skb_network_header(skb2), state->hlen); /* * Copy a block of the IP datagram. */ if (skb_copy_bits(skb, state->ptr, skb_transport_header(skb2), len)) BUG(); state->left -= len; /* * Fill in the new header fields. */ iph = ip_hdr(skb2); iph->frag_off = htons((state->offset >> 3)); if (state->DF) iph->frag_off |= htons(IP_DF); /* * Added AC : If we are fragmenting a fragment that's not the * last fragment then keep MF on each bit */ if (state->left > 0 || state->not_last_frag) iph->frag_off |= htons(IP_MF); state->ptr += len; state->offset += len; iph->tot_len = htons(len + state->hlen); ip_send_check(iph); return skb2; } EXPORT_SYMBOL(ip_frag_next); /* * This IP datagram is too large to be sent in one piece. Break it up into * smaller pieces (each of size equal to IP header plus * a block of the data of the original IP data part) that will yet fit in a * single device frame, and queue such a frame for sending. 
*/ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct iphdr *iph; struct sk_buff *skb2; u8 tstamp_type = skb->tstamp_type; struct rtable *rt = skb_rtable(skb); unsigned int mtu, hlen, ll_rs; struct ip_fraglist_iter iter; ktime_t tstamp = skb->tstamp; struct ip_frag_state state; int err = 0; /* for offloaded checksums cleanup checksum before fragmentation */ if (skb->ip_summed == CHECKSUM_PARTIAL && (err = skb_checksum_help(skb))) goto fail; /* * Point into the IP datagram header. */ iph = ip_hdr(skb); mtu = ip_skb_dst_mtu(sk, skb); if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu) mtu = IPCB(skb)->frag_max_size; /* * Setup starting values. */ hlen = iph->ihl * 4; mtu = mtu - hlen; /* Size of data space */ IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; ll_rs = LL_RESERVED_SPACE(rt->dst.dev); /* When frag_list is given, use it. First, check its validity: * some transformers could create wrong frag_list or break existing * one, it is not prohibited. In this case fall back to copying. * * LATER: this step can be merged to real generation of fragments, * we can switch to copy when see the first bad fragment. */ if (skb_has_frag_list(skb)) { struct sk_buff *frag, *frag2; unsigned int first_len = skb_pagelen(skb); if (first_len - hlen > mtu || ((first_len - hlen) & 7) || ip_is_fragment(iph) || skb_cloned(skb) || skb_headroom(skb) < ll_rs) goto slow_path; skb_walk_frags(skb, frag) { /* Correct geometry. */ if (frag->len > mtu || ((frag->len & 7) && frag->next) || skb_headroom(frag) < hlen + ll_rs) goto slow_path_clean; /* Partially cloned skb? */ if (skb_shared(frag)) goto slow_path_clean; BUG_ON(frag->sk); if (skb->sk) { frag->sk = skb->sk; frag->destructor = sock_wfree; } skb->truesize -= frag->truesize; } /* Everything is OK. Generate! */ ip_fraglist_init(skb, iph, hlen, &iter); for (;;) { /* Prepare header of the next frame, * before previous one went down. */ if (iter.frag) { bool first_frag = (iter.offset == 0); IPCB(iter.frag)->flags = IPCB(skb)->flags; ip_fraglist_prepare(skb, &iter); if (first_frag && IPCB(skb)->opt.optlen) { /* ipcb->opt is not populated for frags * coming from __ip_make_skb(), * ip_options_fragment() needs optlen */ IPCB(iter.frag)->opt.optlen = IPCB(skb)->opt.optlen; ip_options_fragment(iter.frag); ip_send_check(iter.iph); } } skb_set_delivery_time(skb, tstamp, tstamp_type); err = output(net, sk, skb); if (!err) IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); if (err || !iter.frag) break; skb = ip_fraglist_next(&iter); } if (err == 0) { IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS); return 0; } kfree_skb_list(iter.frag); IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); return err; slow_path_clean: skb_walk_frags(skb, frag2) { if (frag2 == frag) break; frag2->sk = NULL; frag2->destructor = NULL; skb->truesize += frag2->truesize; } } slow_path: /* * Fragment the datagram. */ ip_frag_init(skb, hlen, ll_rs, mtu, IPCB(skb)->flags & IPSKB_FRAG_PMTU, &state); /* * Keep copying data until we run out. */ while (state.left > 0) { bool first_frag = (state.offset == 0); skb2 = ip_frag_next(skb, &state); if (IS_ERR(skb2)) { err = PTR_ERR(skb2); goto fail; } ip_frag_ipcb(skb, skb2, first_frag); /* * Put this fragment into the sending queue. 
*/ skb_set_delivery_time(skb2, tstamp, tstamp_type); err = output(net, sk, skb2); if (err) goto fail; IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); } consume_skb(skb); IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS); return err; fail: kfree_skb(skb); IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); return err; } EXPORT_SYMBOL(ip_do_fragment); int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { struct msghdr *msg = from; if (skb->ip_summed == CHECKSUM_PARTIAL) { if (!copy_from_iter_full(to, len, &msg->msg_iter)) return -EFAULT; } else { __wsum csum = 0; if (!csum_and_copy_from_iter_full(to, len, &csum, &msg->msg_iter)) return -EFAULT; skb->csum = csum_block_add(skb->csum, csum, odd); } return 0; } EXPORT_SYMBOL(ip_generic_getfrag); static int __ip_append_data(struct sock *sk, struct flowi4 *fl4, struct sk_buff_head *queue, struct inet_cork *cork, struct page_frag *pfrag, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); struct ubuf_info *uarg = NULL; struct sk_buff *skb; struct ip_options *opt = cork->opt; int hh_len; int exthdrlen; int mtu; int copy; int err; int offset = 0; bool zc = false; unsigned int maxfraglen, fragheaderlen, maxnonfragsize; int csummode = CHECKSUM_NONE; struct rtable *rt = dst_rtable(cork->dst); bool paged, hold_tskey = false, extra_uref = false; unsigned int wmem_alloc_delta = 0; u32 tskey = 0; skb = skb_peek_tail(queue); exthdrlen = !skb ? rt->dst.header_len : 0; mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize; paged = !!cork->gso_size; hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; maxnonfragsize = ip_sk_ignore_df(sk) ? IP_MAX_MTU : mtu; if (cork->length + length > maxnonfragsize - fragheaderlen) { ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu - (opt ? opt->optlen : 0)); return -EMSGSIZE; } /* * transhdrlen > 0 means that this is the first fragment and we wish * it won't be fragmented in the future. */ if (transhdrlen && length + fragheaderlen <= mtu && rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) && (!(flags & MSG_MORE) || cork->gso_size) && (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM))) csummode = CHECKSUM_PARTIAL; if ((flags & MSG_ZEROCOPY) && length) { struct msghdr *msg = from; if (getfrag == ip_generic_getfrag && msg->msg_ubuf) { if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb)) return -EINVAL; /* Leave uarg NULL if can't zerocopy, callers should * be able to handle it. 
*/ if ((rt->dst.dev->features & NETIF_F_SG) && csummode == CHECKSUM_PARTIAL) { paged = true; zc = true; uarg = msg->msg_ubuf; } } else if (sock_flag(sk, SOCK_ZEROCOPY)) { uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb), false); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ if (rt->dst.dev->features & NETIF_F_SG && csummode == CHECKSUM_PARTIAL) { paged = true; zc = true; } else { uarg_to_msgzc(uarg)->zerocopy = 0; skb_zcopy_set(skb, uarg, &extra_uref); } } } else if ((flags & MSG_SPLICE_PAGES) && length) { if (inet_test_bit(HDRINCL, sk)) return -EPERM; if (rt->dst.dev->features & NETIF_F_SG && getfrag == ip_generic_getfrag) /* We need an empty buffer to attach stuff to */ paged = true; else flags &= ~MSG_SPLICE_PAGES; } cork->length += length; if (cork->tx_flags & SKBTX_ANY_TSTAMP && READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { if (cork->flags & IPCORK_TS_OPT_ID) { tskey = cork->ts_opt_id; } else { tskey = atomic_inc_return(&sk->sk_tskey) - 1; hold_tskey = true; } } /* So, what's going on in the loop below? * * We use calculated fragment length to generate chained skb, * each of segments is IP fragment ready for sending to network after * adding appropriate IP header. */ if (!skb) goto alloc_new_skb; while (length > 0) { /* Check if the remaining data fits into current packet. */ copy = mtu - skb->len; if (copy < length) copy = maxfraglen - skb->len; if (copy <= 0) { char *data; unsigned int datalen; unsigned int fraglen; unsigned int fraggap; unsigned int alloclen, alloc_extra; unsigned int pagedlen; struct sk_buff *skb_prev; alloc_new_skb: skb_prev = skb; if (skb_prev) fraggap = skb_prev->len - maxfraglen; else fraggap = 0; /* * If remaining data exceeds the mtu, * we know we need more fragment(s). */ datalen = length + fraggap; if (datalen > mtu - fragheaderlen) datalen = maxfraglen - fragheaderlen; fraglen = datalen + fragheaderlen; pagedlen = 0; alloc_extra = hh_len + 15; alloc_extra += exthdrlen; /* The last fragment gets additional space at tail. * Note, with MSG_MORE we overallocate on fragments, * because we have no idea what fragment will be * the last. */ if (datalen == length + fraggap) alloc_extra += rt->dst.trailer_len; if ((flags & MSG_MORE) && !(rt->dst.dev->features&NETIF_F_SG)) alloclen = mtu; else if (!paged && (fraglen + alloc_extra < SKB_MAX_ALLOC || !(rt->dst.dev->features & NETIF_F_SG))) alloclen = fraglen; else { alloclen = fragheaderlen + transhdrlen; pagedlen = datalen - transhdrlen; } alloclen += alloc_extra; if (transhdrlen) { skb = sock_alloc_send_skb(sk, alloclen, (flags & MSG_DONTWAIT), &err); } else { skb = NULL; if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <= 2 * sk->sk_sndbuf) skb = alloc_skb(alloclen, sk->sk_allocation); if (unlikely(!skb)) err = -ENOBUFS; } if (!skb) goto error; /* * Fill in the control structures */ skb->ip_summed = csummode; skb->csum = 0; skb_reserve(skb, hh_len); /* * Find where to start putting bytes. */ data = skb_put(skb, fraglen + exthdrlen - pagedlen); skb_set_network_header(skb, exthdrlen); skb->transport_header = (skb->network_header + fragheaderlen); data += fragheaderlen + exthdrlen; if (fraggap) { skb->csum = skb_copy_and_csum_bits( skb_prev, maxfraglen, data + transhdrlen, fraggap); skb_prev->csum = csum_sub(skb_prev->csum, skb->csum); data += fraggap; pskb_trim_unique(skb_prev, maxfraglen); } copy = datalen - transhdrlen - fraggap - pagedlen; /* [!] NOTE: copy will be negative if pagedlen>0 * because then the equation reduces to -fraggap. 
*/ if (copy > 0 && INDIRECT_CALL_1(getfrag, ip_generic_getfrag, from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { err = -EFAULT; kfree_skb(skb); goto error; } else if (flags & MSG_SPLICE_PAGES) { copy = 0; } offset += copy; length -= copy + transhdrlen; transhdrlen = 0; exthdrlen = 0; csummode = CHECKSUM_NONE; /* only the initial fragment is time stamped */ skb_shinfo(skb)->tx_flags = cork->tx_flags; cork->tx_flags = 0; skb_shinfo(skb)->tskey = tskey; tskey = 0; skb_zcopy_set(skb, uarg, &extra_uref); if ((flags & MSG_CONFIRM) && !skb_prev) skb_set_dst_pending_confirm(skb, 1); /* * Put the packet on the pending queue. */ if (!skb->destructor) { skb->destructor = sock_wfree; skb->sk = sk; wmem_alloc_delta += skb->truesize; } __skb_queue_tail(queue, skb); continue; } if (copy > length) copy = length; if (!(rt->dst.dev->features&NETIF_F_SG) && skb_tailroom(skb) >= copy) { unsigned int off; off = skb->len; if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, from, skb_put(skb, copy), offset, copy, off, skb) < 0) { __skb_trim(skb, off); err = -EFAULT; goto error; } } else if (flags & MSG_SPLICE_PAGES) { struct msghdr *msg = from; err = -EIO; if (WARN_ON_ONCE(copy > msg->msg_iter.count)) goto error; err = skb_splice_from_iter(skb, &msg->msg_iter, copy); if (err < 0) goto error; copy = err; wmem_alloc_delta += copy; } else if (!zc) { int i = skb_shinfo(skb)->nr_frags; err = -ENOMEM; if (!sk_page_frag_refill(sk, pfrag)) goto error; skb_zcopy_downgrade_managed(skb); if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { err = -EMSGSIZE; if (i == MAX_SKB_FRAGS) goto error; __skb_fill_page_desc(skb, i, pfrag->page, pfrag->offset, 0); skb_shinfo(skb)->nr_frags = ++i; get_page(pfrag->page); } copy = min_t(int, copy, pfrag->size - pfrag->offset); if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, from, page_address(pfrag->page) + pfrag->offset, offset, copy, skb->len, skb) < 0) goto error_efault; pfrag->offset += copy; skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); skb_len_add(skb, copy); wmem_alloc_delta += copy; } else { err = skb_zerocopy_iter_dgram(skb, from, copy); if (err < 0) goto error; } offset += copy; length -= copy; } if (wmem_alloc_delta) refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); return 0; error_efault: err = -EFAULT; error: net_zcopy_put_abort(uarg, extra_uref); cork->length -= length; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); if (hold_tskey) atomic_dec(&sk->sk_tskey); return err; } static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, struct ipcm_cookie *ipc, struct rtable **rtp) { struct ip_options_rcu *opt; struct rtable *rt; rt = *rtp; if (unlikely(!rt)) return -EFAULT; cork->fragsize = ip_sk_use_pmtu(sk) ? dst4_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu); if (!inetdev_valid_mtu(cork->fragsize)) return -ENETUNREACH; /* * setup for corking. */ opt = ipc->opt; if (opt) { if (!cork->opt) { cork->opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation); if (unlikely(!cork->opt)) return -ENOBUFS; } memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen); cork->flags |= IPCORK_OPT; cork->addr = ipc->addr; } cork->gso_size = ipc->gso_size; cork->dst = &rt->dst; /* We stole this route, caller should not release it. 
*/ *rtp = NULL; cork->length = 0; cork->ttl = ipc->ttl; cork->tos = ipc->tos; cork->mark = ipc->sockc.mark; cork->priority = ipc->sockc.priority; cork->transmit_time = ipc->sockc.transmit_time; cork->tx_flags = 0; sock_tx_timestamp(sk, &ipc->sockc, &cork->tx_flags); if (ipc->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) { cork->flags |= IPCORK_TS_OPT_ID; cork->ts_opt_id = ipc->sockc.ts_opt_id; } return 0; } /* * ip_append_data() can make one large IP datagram from many pieces of * data. Each piece will be held on the socket until * ip_push_pending_frames() is called. Each piece can be a page or * non-page data. * * Not only UDP, other transport protocols - e.g. raw sockets - can use * this interface potentially. * * LATER: length must be adjusted by pad at tail, when it is required. */ int ip_append_data(struct sock *sk, struct flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm_cookie *ipc, struct rtable **rtp, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); int err; if (flags&MSG_PROBE) return 0; if (skb_queue_empty(&sk->sk_write_queue)) { err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp); if (err) return err; } else { transhdrlen = 0; } return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, sk_page_frag(sk), getfrag, from, length, transhdrlen, flags); } static void ip_cork_release(struct inet_cork *cork) { cork->flags &= ~IPCORK_OPT; kfree(cork->opt); cork->opt = NULL; dst_release(cork->dst); cork->dst = NULL; } /* * Combined all pending IP fragments on the socket as one IP datagram * and push them out. */ struct sk_buff *__ip_make_skb(struct sock *sk, struct flowi4 *fl4, struct sk_buff_head *queue, struct inet_cork *cork) { struct sk_buff *skb, *tmp_skb; struct sk_buff **tail_skb; struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct ip_options *opt = NULL; struct rtable *rt = dst_rtable(cork->dst); struct iphdr *iph; u8 pmtudisc, ttl; __be16 df = 0; skb = __skb_dequeue(queue); if (!skb) goto out; tail_skb = &(skb_shinfo(skb)->frag_list); /* move skb->data to ip header from ext header */ if (skb->data < skb_network_header(skb)) __skb_pull(skb, skb_network_offset(skb)); while ((tmp_skb = __skb_dequeue(queue)) != NULL) { __skb_pull(tmp_skb, skb_network_header_len(skb)); *tail_skb = tmp_skb; tail_skb = &(tmp_skb->next); skb->len += tmp_skb->len; skb->data_len += tmp_skb->len; skb->truesize += tmp_skb->truesize; tmp_skb->destructor = NULL; tmp_skb->sk = NULL; } /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow * to fragment the frame generated here. No matter, what transforms * how transforms change size of the packet, it will come out. */ skb->ignore_df = ip_sk_ignore_df(sk); /* DF bit is set when we want to see DF on outgoing frames. * If ignore_df is set too, we still allow to fragment this frame * locally. */ pmtudisc = READ_ONCE(inet->pmtudisc); if (pmtudisc == IP_PMTUDISC_DO || pmtudisc == IP_PMTUDISC_PROBE || (skb->len <= dst4_mtu(&rt->dst) && ip_dont_fragment(sk, &rt->dst))) df = htons(IP_DF); if (cork->flags & IPCORK_OPT) opt = cork->opt; if (cork->ttl != 0) ttl = cork->ttl; else if (rt->rt_type == RTN_MULTICAST) ttl = READ_ONCE(inet->mc_ttl); else ttl = ip_select_ttl(inet, &rt->dst); iph = ip_hdr(skb); iph->version = 4; iph->ihl = 5; iph->tos = (cork->tos != -1) ? 
cork->tos : READ_ONCE(inet->tos); iph->frag_off = df; iph->ttl = ttl; iph->protocol = sk->sk_protocol; ip_copy_addrs(iph, fl4); ip_select_ident(net, skb, sk); if (opt) { iph->ihl += opt->optlen >> 2; ip_options_build(skb, opt, cork->addr, rt); } skb->priority = cork->priority; skb->mark = cork->mark; if (sk_is_tcp(sk)) skb_set_delivery_time(skb, cork->transmit_time, SKB_CLOCK_MONOTONIC); else skb_set_delivery_type_by_clockid(skb, cork->transmit_time, sk->sk_clockid); /* * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec * on dst refcount */ cork->dst = NULL; skb_dst_set(skb, &rt->dst); if (iph->protocol == IPPROTO_ICMP) { u8 icmp_type; /* For such sockets, transhdrlen is zero when do ip_append_data(), * so icmphdr does not in skb linear region and can not get icmp_type * by icmp_hdr(skb)->type. */ if (sk->sk_type == SOCK_RAW && !(fl4->flowi4_flags & FLOWI_FLAG_KNOWN_NH)) icmp_type = fl4->fl4_icmp_type; else icmp_type = icmp_hdr(skb)->type; icmp_out_count(net, icmp_type); } ip_cork_release(cork); out: return skb; } int ip_send_skb(struct net *net, struct sk_buff *skb) { int err; err = ip_local_out(net, skb->sk, skb); if (err) { if (err > 0) err = net_xmit_errno(err); if (err) IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); } return err; } int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4) { struct sk_buff *skb; skb = ip_finish_skb(sk, fl4); if (!skb) return 0; /* Netfilter gets whole the not fragmented skb. */ return ip_send_skb(sock_net(sk), skb); } /* * Throw away all pending data on the socket. */ static void __ip_flush_pending_frames(struct sock *sk, struct sk_buff_head *queue, struct inet_cork *cork) { struct sk_buff *skb; while ((skb = __skb_dequeue_tail(queue)) != NULL) kfree_skb(skb); ip_cork_release(cork); } void ip_flush_pending_frames(struct sock *sk) { __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base); } struct sk_buff *ip_make_skb(struct sock *sk, struct flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm_cookie *ipc, struct rtable **rtp, struct inet_cork *cork, unsigned int flags) { struct sk_buff_head queue; int err; if (flags & MSG_PROBE) return NULL; __skb_queue_head_init(&queue); cork->flags = 0; cork->addr = 0; cork->opt = NULL; err = ip_setup_cork(sk, cork, ipc, rtp); if (err) return ERR_PTR(err); err = __ip_append_data(sk, fl4, &queue, cork, ¤t->task_frag, getfrag, from, length, transhdrlen, flags); if (err) { __ip_flush_pending_frames(sk, &queue, cork); return ERR_PTR(err); } return __ip_make_skb(sk, fl4, &queue, cork); } /* * Fetch data from kernel space and fill in checksum if needed. */ static int ip_reply_glue_bits(void *dptr, char *to, int offset, int len, int odd, struct sk_buff *skb) { __wsum csum; csum = csum_partial_copy_nocheck(dptr+offset, to, len); skb->csum = csum_block_add(skb->csum, csum, odd); return 0; } /* * Generic function to send a packet as reply to another packet. * Used to send some TCP resets/acks so far. 
*/ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, unsigned int len, u64 transmit_time, u32 txhash) { DEFINE_RAW_FLEX(struct ip_options_rcu, replyopts, opt.__data, IP_OPTIONS_DATA_FIXED_SIZE); struct ipcm_cookie ipc; struct flowi4 fl4; struct rtable *rt = skb_rtable(skb); struct net *net = sock_net(sk); struct sk_buff *nskb; int err; int oif; if (__ip_options_echo(net, &replyopts->opt, skb, sopt)) return; ipcm_init(&ipc); ipc.addr = daddr; ipc.sockc.transmit_time = transmit_time; if (replyopts->opt.optlen) { ipc.opt = replyopts; if (replyopts->opt.srr) daddr = replyopts->opt.faddr; } oif = arg->bound_dev_if; if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) oif = skb->skb_iif; flowi4_init_output(&fl4, oif, IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark, arg->tos & INET_DSCP_MASK, RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, ip_reply_arg_flowi_flags(arg), daddr, saddr, tcp_hdr(skb)->source, tcp_hdr(skb)->dest, arg->uid); security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) return; inet_sk(sk)->tos = arg->tos; sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); ipc.sockc.mark = fl4.flowi4_mark; err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, &ipc, &rt, MSG_DONTWAIT); if (unlikely(err)) { ip_flush_pending_frames(sk); goto out; } nskb = skb_peek(&sk->sk_write_queue); if (nskb) { if (arg->csumoffset >= 0) *((__sum16 *)skb_transport_header(nskb) + arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; if (orig_sk) { skb_set_owner_edemux(nskb, (struct sock *)orig_sk); psp_reply_set_decrypted(orig_sk, nskb); } if (transmit_time) nskb->tstamp_type = SKB_CLOCK_MONOTONIC; if (txhash) skb_set_hash(nskb, txhash, PKT_HASH_TYPE_L4); ip_push_pending_frames(sk, &fl4); } out: ip_rt_put(rt); } void __init ip_init(void) { ip_rt_init(); inet_initpeers(); #if defined(CONFIG_IP_MULTICAST) igmp_mc_init(); #endif } |
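/*
 * Illustrative sketch (not part of the kernel sources above): how a
 * transport protocol might drive the corking interface documented above,
 * pairing ip_append_data() with ip_push_pending_frames().  my_getfrag(),
 * my_proto_send() and the buf/len parameters are hypothetical; a real
 * protocol that needs a checksum would also fold the copied data into
 * skb->csum the way ip_reply_glue_bits() does above.
 */
static int my_getfrag(void *from, char *to, int offset, int len,
		      int odd, struct sk_buff *skb)
{
	/* Copy kernel-space data; checksum handling omitted for brevity. */
	memcpy(to, (char *)from + offset, len);
	return 0;
}

static int my_proto_send(struct sock *sk, struct flowi4 *fl4,
			 void *buf, size_t len,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
	int err;

	lock_sock(sk);
	err = ip_append_data(sk, fl4, my_getfrag, buf, len, 0,
			     ipc, rtp, MSG_DONTWAIT);
	if (err)
		ip_flush_pending_frames(sk);
	else
		err = ip_push_pending_frames(sk, fl4);
	release_sock(sk);
	return err;
}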
914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM sched #if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SCHED_H #include <linux/kthread.h> #include <linux/sched/numa_balancing.h> #include <linux/tracepoint.h> #include <linux/binfmts.h> /* * Tracepoint for calling kthread_stop, performed to end a kthread: */ TRACE_EVENT(sched_kthread_stop, TP_PROTO(struct task_struct *t), TP_ARGS(t), TP_STRUCT__entry( __string( comm, t->comm ) __field( pid_t, pid ) ), TP_fast_assign( __assign_str(comm); __entry->pid = t->pid; ), TP_printk("comm=%s pid=%d", __get_str(comm), __entry->pid) ); /* * Tracepoint for the return value of the kthread stopping: */ TRACE_EVENT(sched_kthread_stop_ret, TP_PROTO(int ret), TP_ARGS(ret), TP_STRUCT__entry( __field( int, ret ) ), TP_fast_assign( __entry->ret = ret; ), TP_printk("ret=%d", __entry->ret) ); /** * sched_kthread_work_queue_work - called when a work gets queued * @worker: pointer to the kthread_worker * @work: pointer to struct kthread_work * * This event occurs when a work is queued immediately or once a * delayed work is actually queued (ie: once the delay has been * reached). */ TRACE_EVENT(sched_kthread_work_queue_work, TP_PROTO(struct kthread_worker *worker, struct kthread_work *work), TP_ARGS(worker, work), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) __field( void *, worker) ), TP_fast_assign( __entry->work = work; __entry->function = work->func; __entry->worker = worker; ), TP_printk("work struct=%p function=%ps worker=%p", __entry->work, __entry->function, __entry->worker) ); /** * sched_kthread_work_execute_start - called immediately before the work callback * @work: pointer to struct kthread_work * * Allows to track kthread work execution. */ TRACE_EVENT(sched_kthread_work_execute_start, TP_PROTO(struct kthread_work *work), TP_ARGS(work), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) ), TP_fast_assign( __entry->work = work; __entry->function = work->func; ), TP_printk("work struct %p: function %ps", __entry->work, __entry->function) ); /** * sched_kthread_work_execute_end - called immediately after the work callback * @work: pointer to struct work_struct * @function: pointer to worker function * * Allows to track workqueue execution. */ TRACE_EVENT(sched_kthread_work_execute_end, TP_PROTO(struct kthread_work *work, kthread_work_func_t function), TP_ARGS(work, function), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) ), TP_fast_assign( __entry->work = work; __entry->function = function; ), TP_printk("work struct %p: function %ps", __entry->work, __entry->function) ); /* * Tracepoint for waking up a task: */ DECLARE_EVENT_CLASS(sched_wakeup_template, TP_PROTO(struct task_struct *p), TP_ARGS(__perf_task(p)), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, prio ) __field( int, target_cpu ) ), TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ __entry->target_cpu = task_cpu(p); ), TP_printk("comm=%s pid=%d prio=%d target_cpu=%03d", __entry->comm, __entry->pid, __entry->prio, __entry->target_cpu) ); /* * Tracepoint called when waking a task; this tracepoint is guaranteed to be * called from the waking context. 
*/ DEFINE_EVENT(sched_wakeup_template, sched_waking, TP_PROTO(struct task_struct *p), TP_ARGS(p)); /* * Tracepoint called when the task is actually woken; p->state == TASK_RUNNING. * It is not always called from the waking context. */ DEFINE_EVENT(sched_wakeup_template, sched_wakeup, TP_PROTO(struct task_struct *p), TP_ARGS(p)); /* * Tracepoint for waking up a new task: */ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new, TP_PROTO(struct task_struct *p), TP_ARGS(p)); #ifdef CREATE_TRACE_POINTS static inline long __trace_sched_switch_state(bool preempt, unsigned int prev_state, struct task_struct *p) { unsigned int state; BUG_ON(p != current); /* * Preemption ignores task state, therefore preempted tasks are always * RUNNING (we will not have dequeued if state != RUNNING). */ if (preempt) return TASK_REPORT_MAX; /* * task_state_index() uses fls() and returns a value from 0-8 range. * Decrement it by 1 (except TASK_RUNNING state i.e 0) before using * it for left shift operation to get the correct task->state * mapping. */ state = __task_state_index(prev_state, p->exit_state); return state ? (1 << (state - 1)) : state; } #endif /* CREATE_TRACE_POINTS */ /* * Tracepoint for task switches, performed by the scheduler: */ TRACE_EVENT(sched_switch, TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next, unsigned int prev_state), TP_ARGS(preempt, prev, next, prev_state), TP_STRUCT__entry( __array( char, prev_comm, TASK_COMM_LEN ) __field( pid_t, prev_pid ) __field( int, prev_prio ) __field( long, prev_state ) __array( char, next_comm, TASK_COMM_LEN ) __field( pid_t, next_pid ) __field( int, next_prio ) ), TP_fast_assign( memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); __entry->prev_pid = prev->pid; __entry->prev_prio = prev->prio; __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev); memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); __entry->next_pid = next->pid; __entry->next_prio = next->prio; /* XXX SCHED_DEADLINE */ ), TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, (__entry->prev_state & (TASK_REPORT_MAX - 1)) ? __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|", { TASK_INTERRUPTIBLE, "S" }, { TASK_UNINTERRUPTIBLE, "D" }, { __TASK_STOPPED, "T" }, { __TASK_TRACED, "t" }, { EXIT_DEAD, "X" }, { EXIT_ZOMBIE, "Z" }, { TASK_PARKED, "P" }, { TASK_DEAD, "I" }) : "R", __entry->prev_state & TASK_REPORT_MAX ? 
"+" : "", __entry->next_comm, __entry->next_pid, __entry->next_prio) ); /* * Tracepoint for a task being migrated: */ TRACE_EVENT(sched_migrate_task, TP_PROTO(struct task_struct *p, int dest_cpu), TP_ARGS(p, dest_cpu), TP_STRUCT__entry( __string( comm, p->comm ) __field( pid_t, pid ) __field( int, prio ) __field( int, orig_cpu ) __field( int, dest_cpu ) ), TP_fast_assign( __assign_str(comm); __entry->pid = p->pid; __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ __entry->orig_cpu = task_cpu(p); __entry->dest_cpu = dest_cpu; ), TP_printk("comm=%s pid=%d prio=%d orig_cpu=%d dest_cpu=%d", __get_str(comm), __entry->pid, __entry->prio, __entry->orig_cpu, __entry->dest_cpu) ); DECLARE_EVENT_CLASS(sched_process_template, TP_PROTO(struct task_struct *p), TP_ARGS(p), TP_STRUCT__entry( __string( comm, p->comm ) __field( pid_t, pid ) __field( int, prio ) ), TP_fast_assign( __assign_str(comm); __entry->pid = p->pid; __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ ), TP_printk("comm=%s pid=%d prio=%d", __get_str(comm), __entry->pid, __entry->prio) ); /* * Tracepoint for freeing a task: */ DEFINE_EVENT(sched_process_template, sched_process_free, TP_PROTO(struct task_struct *p), TP_ARGS(p)); /* * Tracepoint for a task exiting. * Note, it's a superset of sched_process_template and should be kept * compatible as much as possible. sched_process_exits has an extra * `group_dead` argument, so sched_process_template can't be used, * unfortunately, just like sched_migrate_task above. */ TRACE_EVENT(sched_process_exit, TP_PROTO(struct task_struct *p, bool group_dead), TP_ARGS(p, group_dead), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, prio ) __field( bool, group_dead ) ), TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ __entry->group_dead = group_dead; ), TP_printk("comm=%s pid=%d prio=%d group_dead=%s", __entry->comm, __entry->pid, __entry->prio, __entry->group_dead ? 
"true" : "false" ) ); /* * Tracepoint for waiting on task to unschedule: */ DEFINE_EVENT(sched_process_template, sched_wait_task, TP_PROTO(struct task_struct *p), TP_ARGS(p)); /* * Tracepoint for a waiting task: */ TRACE_EVENT(sched_process_wait, TP_PROTO(struct pid *pid), TP_ARGS(pid), TP_STRUCT__entry( __string( comm, current->comm ) __field( pid_t, pid ) __field( int, prio ) ), TP_fast_assign( __assign_str(comm); __entry->pid = pid_nr(pid); __entry->prio = current->prio; /* XXX SCHED_DEADLINE */ ), TP_printk("comm=%s pid=%d prio=%d", __get_str(comm), __entry->pid, __entry->prio) ); /* * Tracepoint for kernel_clone: */ TRACE_EVENT(sched_process_fork, TP_PROTO(struct task_struct *parent, struct task_struct *child), TP_ARGS(parent, child), TP_STRUCT__entry( __string( parent_comm, parent->comm ) __field( pid_t, parent_pid ) __string( child_comm, child->comm ) __field( pid_t, child_pid ) ), TP_fast_assign( __assign_str(parent_comm); __entry->parent_pid = parent->pid; __assign_str(child_comm); __entry->child_pid = child->pid; ), TP_printk("comm=%s pid=%d child_comm=%s child_pid=%d", __get_str(parent_comm), __entry->parent_pid, __get_str(child_comm), __entry->child_pid) ); /* * Tracepoint for exec: */ TRACE_EVENT(sched_process_exec, TP_PROTO(struct task_struct *p, pid_t old_pid, struct linux_binprm *bprm), TP_ARGS(p, old_pid, bprm), TP_STRUCT__entry( __string( filename, bprm->filename ) __field( pid_t, pid ) __field( pid_t, old_pid ) ), TP_fast_assign( __assign_str(filename); __entry->pid = p->pid; __entry->old_pid = old_pid; ), TP_printk("filename=%s pid=%d old_pid=%d", __get_str(filename), __entry->pid, __entry->old_pid) ); /** * sched_prepare_exec - called before setting up new exec * @task: pointer to the current task * @bprm: pointer to linux_binprm used for new exec * * Called before flushing the old exec, where @task is still unchanged, but at * the point of no return during switching to the new exec. At the point it is * called the exec will either succeed, or on failure terminate the task. Also * see the "sched_process_exec" tracepoint, which is called right after @task * has successfully switched to the new exec. */ TRACE_EVENT(sched_prepare_exec, TP_PROTO(struct task_struct *task, struct linux_binprm *bprm), TP_ARGS(task, bprm), TP_STRUCT__entry( __string( interp, bprm->interp ) __string( filename, bprm->filename ) __field( pid_t, pid ) __string( comm, task->comm ) ), TP_fast_assign( __assign_str(interp); __assign_str(filename); __entry->pid = task->pid; __assign_str(comm); ), TP_printk("interp=%s filename=%s pid=%d comm=%s", __get_str(interp), __get_str(filename), __entry->pid, __get_str(comm)) ); #ifdef CONFIG_SCHEDSTATS #define DEFINE_EVENT_SCHEDSTAT DEFINE_EVENT #define DECLARE_EVENT_CLASS_SCHEDSTAT DECLARE_EVENT_CLASS #else #define DEFINE_EVENT_SCHEDSTAT DEFINE_EVENT_NOP #define DECLARE_EVENT_CLASS_SCHEDSTAT DECLARE_EVENT_CLASS_NOP #endif /* * XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE * adding sched_stat support to SCHED_FIFO/RR would be welcome. 
*/ DECLARE_EVENT_CLASS_SCHEDSTAT(sched_stat_template, TP_PROTO(struct task_struct *tsk, u64 delay), TP_ARGS(__perf_task(tsk), __perf_count(delay)), TP_STRUCT__entry( __string( comm, tsk->comm ) __field( pid_t, pid ) __field( u64, delay ) ), TP_fast_assign( __assign_str(comm); __entry->pid = tsk->pid; __entry->delay = delay; ), TP_printk("comm=%s pid=%d delay=%Lu [ns]", __get_str(comm), __entry->pid, (unsigned long long)__entry->delay) ); /* * Tracepoint for accounting wait time (time the task is runnable * but not actually running due to scheduler contention). */ DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_wait, TP_PROTO(struct task_struct *tsk, u64 delay), TP_ARGS(tsk, delay)); /* * Tracepoint for accounting sleep time (time the task is not runnable, * including iowait, see below). */ DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_sleep, TP_PROTO(struct task_struct *tsk, u64 delay), TP_ARGS(tsk, delay)); /* * Tracepoint for accounting iowait time (time the task is not runnable * due to waiting on IO to complete). */ DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_iowait, TP_PROTO(struct task_struct *tsk, u64 delay), TP_ARGS(tsk, delay)); /* * Tracepoint for accounting blocked time (time the task is in uninterruptible). */ DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_blocked, TP_PROTO(struct task_struct *tsk, u64 delay), TP_ARGS(tsk, delay)); /* * Tracepoint for accounting runtime (time the task is executing * on a CPU). */ DECLARE_EVENT_CLASS(sched_stat_runtime, TP_PROTO(struct task_struct *tsk, u64 runtime), TP_ARGS(tsk, __perf_count(runtime)), TP_STRUCT__entry( __string( comm, tsk->comm ) __field( pid_t, pid ) __field( u64, runtime ) ), TP_fast_assign( __assign_str(comm); __entry->pid = tsk->pid; __entry->runtime = runtime; ), TP_printk("comm=%s pid=%d runtime=%Lu [ns]", __get_str(comm), __entry->pid, (unsigned long long)__entry->runtime) ); DEFINE_EVENT(sched_stat_runtime, sched_stat_runtime, TP_PROTO(struct task_struct *tsk, u64 runtime), TP_ARGS(tsk, runtime)); /* * Tracepoint for showing priority inheritance modifying a tasks * priority. */ TRACE_EVENT(sched_pi_setprio, TP_PROTO(struct task_struct *tsk, struct task_struct *pi_task), TP_ARGS(tsk, pi_task), TP_STRUCT__entry( __string( comm, tsk->comm ) __field( pid_t, pid ) __field( int, oldprio ) __field( int, newprio ) ), TP_fast_assign( __assign_str(comm); __entry->pid = tsk->pid; __entry->oldprio = tsk->prio; __entry->newprio = pi_task ? min(tsk->normal_prio, pi_task->prio) : tsk->normal_prio; /* XXX SCHED_DEADLINE bits missing */ ), TP_printk("comm=%s pid=%d oldprio=%d newprio=%d", __get_str(comm), __entry->pid, __entry->oldprio, __entry->newprio) ); #ifdef CONFIG_DETECT_HUNG_TASK TRACE_EVENT(sched_process_hang, TP_PROTO(struct task_struct *tsk), TP_ARGS(tsk), TP_STRUCT__entry( __string( comm, tsk->comm ) __field( pid_t, pid ) ), TP_fast_assign( __assign_str(comm); __entry->pid = tsk->pid; ), TP_printk("comm=%s pid=%d", __get_str(comm), __entry->pid) ); #endif /* CONFIG_DETECT_HUNG_TASK */ #ifdef CONFIG_NUMA_BALANCING /* * Tracks migration of tasks from one runqueue to another. Can be used to * detect if automatic NUMA balancing is bouncing between nodes. 
*/ TRACE_EVENT(sched_move_numa, TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu), TP_ARGS(tsk, src_cpu, dst_cpu), TP_STRUCT__entry( __field( pid_t, pid ) __field( pid_t, tgid ) __field( pid_t, ngid ) __field( int, src_cpu ) __field( int, src_nid ) __field( int, dst_cpu ) __field( int, dst_nid ) ), TP_fast_assign( __entry->pid = task_pid_nr(tsk); __entry->tgid = task_tgid_nr(tsk); __entry->ngid = task_numa_group_id(tsk); __entry->src_cpu = src_cpu; __entry->src_nid = cpu_to_node(src_cpu); __entry->dst_cpu = dst_cpu; __entry->dst_nid = cpu_to_node(dst_cpu); ), TP_printk("pid=%d tgid=%d ngid=%d src_cpu=%d src_nid=%d dst_cpu=%d dst_nid=%d", __entry->pid, __entry->tgid, __entry->ngid, __entry->src_cpu, __entry->src_nid, __entry->dst_cpu, __entry->dst_nid) ); DECLARE_EVENT_CLASS(sched_numa_pair_template, TP_PROTO(struct task_struct *src_tsk, int src_cpu, struct task_struct *dst_tsk, int dst_cpu), TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu), TP_STRUCT__entry( __field( pid_t, src_pid ) __field( pid_t, src_tgid ) __field( pid_t, src_ngid ) __field( int, src_cpu ) __field( int, src_nid ) __field( pid_t, dst_pid ) __field( pid_t, dst_tgid ) __field( pid_t, dst_ngid ) __field( int, dst_cpu ) __field( int, dst_nid ) ), TP_fast_assign( __entry->src_pid = task_pid_nr(src_tsk); __entry->src_tgid = task_tgid_nr(src_tsk); __entry->src_ngid = task_numa_group_id(src_tsk); __entry->src_cpu = src_cpu; __entry->src_nid = cpu_to_node(src_cpu); __entry->dst_pid = dst_tsk ? task_pid_nr(dst_tsk) : 0; __entry->dst_tgid = dst_tsk ? task_tgid_nr(dst_tsk) : 0; __entry->dst_ngid = dst_tsk ? task_numa_group_id(dst_tsk) : 0; __entry->dst_cpu = dst_cpu; __entry->dst_nid = dst_cpu >= 0 ? cpu_to_node(dst_cpu) : -1; ), TP_printk("src_pid=%d src_tgid=%d src_ngid=%d src_cpu=%d src_nid=%d dst_pid=%d dst_tgid=%d dst_ngid=%d dst_cpu=%d dst_nid=%d", __entry->src_pid, __entry->src_tgid, __entry->src_ngid, __entry->src_cpu, __entry->src_nid, __entry->dst_pid, __entry->dst_tgid, __entry->dst_ngid, __entry->dst_cpu, __entry->dst_nid) ); DEFINE_EVENT(sched_numa_pair_template, sched_stick_numa, TP_PROTO(struct task_struct *src_tsk, int src_cpu, struct task_struct *dst_tsk, int dst_cpu), TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu) ); DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa, TP_PROTO(struct task_struct *src_tsk, int src_cpu, struct task_struct *dst_tsk, int dst_cpu), TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu) ); #define NUMAB_SKIP_REASON \ EM( NUMAB_SKIP_UNSUITABLE, "unsuitable" ) \ EM( NUMAB_SKIP_SHARED_RO, "shared_ro" ) \ EM( NUMAB_SKIP_INACCESSIBLE, "inaccessible" ) \ EM( NUMAB_SKIP_SCAN_DELAY, "scan_delay" ) \ EM( NUMAB_SKIP_PID_INACTIVE, "pid_inactive" ) \ EM( NUMAB_SKIP_IGNORE_PID, "ignore_pid_inactive" ) \ EMe(NUMAB_SKIP_SEQ_COMPLETED, "seq_completed" ) /* Redefine for export. */ #undef EM #undef EMe #define EM(a, b) TRACE_DEFINE_ENUM(a); #define EMe(a, b) TRACE_DEFINE_ENUM(a); NUMAB_SKIP_REASON /* Redefine for symbolic printing. 
*/ #undef EM #undef EMe #define EM(a, b) { a, b }, #define EMe(a, b) { a, b } TRACE_EVENT(sched_skip_vma_numa, TP_PROTO(struct mm_struct *mm, struct vm_area_struct *vma, enum numa_vmaskip_reason reason), TP_ARGS(mm, vma, reason), TP_STRUCT__entry( __field(unsigned long, numa_scan_offset) __field(unsigned long, vm_start) __field(unsigned long, vm_end) __field(enum numa_vmaskip_reason, reason) ), TP_fast_assign( __entry->numa_scan_offset = mm->numa_scan_offset; __entry->vm_start = vma->vm_start; __entry->vm_end = vma->vm_end; __entry->reason = reason; ), TP_printk("numa_scan_offset=%lX vm_start=%lX vm_end=%lX reason=%s", __entry->numa_scan_offset, __entry->vm_start, __entry->vm_end, __print_symbolic(__entry->reason, NUMAB_SKIP_REASON)) ); TRACE_EVENT(sched_skip_cpuset_numa, TP_PROTO(struct task_struct *tsk, nodemask_t *mem_allowed_ptr), TP_ARGS(tsk, mem_allowed_ptr), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( pid_t, tgid ) __field( pid_t, ngid ) __array( unsigned long, mem_allowed, BITS_TO_LONGS(MAX_NUMNODES)) ), TP_fast_assign( memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); __entry->pid = task_pid_nr(tsk); __entry->tgid = task_tgid_nr(tsk); __entry->ngid = task_numa_group_id(tsk); BUILD_BUG_ON(sizeof(nodemask_t) != \ BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long)); memcpy(__entry->mem_allowed, mem_allowed_ptr->bits, sizeof(__entry->mem_allowed)); ), TP_printk("comm=%s pid=%d tgid=%d ngid=%d mem_nodes_allowed=%*pbl", __entry->comm, __entry->pid, __entry->tgid, __entry->ngid, MAX_NUMNODES, __entry->mem_allowed) ); #endif /* CONFIG_NUMA_BALANCING */ /* * Tracepoint for waking a polling cpu without an IPI. */ TRACE_EVENT(sched_wake_idle_without_ipi, TP_PROTO(int cpu), TP_ARGS(cpu), TP_STRUCT__entry( __field( int, cpu ) ), TP_fast_assign( __entry->cpu = cpu; ), TP_printk("cpu=%d", __entry->cpu) ); /* * Following tracepoints are not exported in tracefs and provide hooking * mechanisms only for testing and debugging purposes. 
*/ DECLARE_TRACE(pelt_cfs, TP_PROTO(struct cfs_rq *cfs_rq), TP_ARGS(cfs_rq)); DECLARE_TRACE(pelt_rt, TP_PROTO(struct rq *rq), TP_ARGS(rq)); DECLARE_TRACE(pelt_dl, TP_PROTO(struct rq *rq), TP_ARGS(rq)); DECLARE_TRACE(pelt_hw, TP_PROTO(struct rq *rq), TP_ARGS(rq)); DECLARE_TRACE(pelt_irq, TP_PROTO(struct rq *rq), TP_ARGS(rq)); DECLARE_TRACE(pelt_se, TP_PROTO(struct sched_entity *se), TP_ARGS(se)); DECLARE_TRACE(sched_cpu_capacity, TP_PROTO(struct rq *rq), TP_ARGS(rq)); DECLARE_TRACE(sched_overutilized, TP_PROTO(struct root_domain *rd, bool overutilized), TP_ARGS(rd, overutilized)); DECLARE_TRACE(sched_util_est_cfs, TP_PROTO(struct cfs_rq *cfs_rq), TP_ARGS(cfs_rq)); DECLARE_TRACE(sched_util_est_se, TP_PROTO(struct sched_entity *se), TP_ARGS(se)); DECLARE_TRACE(sched_update_nr_running, TP_PROTO(struct rq *rq, int change), TP_ARGS(rq, change)); DECLARE_TRACE(sched_compute_energy, TP_PROTO(struct task_struct *p, int dst_cpu, unsigned long energy, unsigned long max_util, unsigned long busy_time), TP_ARGS(p, dst_cpu, energy, max_util, busy_time)); DECLARE_TRACE(sched_entry, TP_PROTO(bool preempt), TP_ARGS(preempt)); DECLARE_TRACE(sched_exit, TP_PROTO(bool is_switch), TP_ARGS(is_switch)); DECLARE_TRACE_CONDITION(sched_set_state, TP_PROTO(struct task_struct *tsk, int state), TP_ARGS(tsk, state), TP_CONDITION(!!(tsk->__state) != !!state)); DECLARE_TRACE(sched_set_need_resched, TP_PROTO(struct task_struct *tsk, int cpu, int tif), TP_ARGS(tsk, cpu, tif)); #define DL_OTHER 0 #define DL_TASK 1 #define DL_SERVER_FAIR 2 #define DL_SERVER_EXT 3 DECLARE_TRACE(sched_dl_throttle, TP_PROTO(struct sched_dl_entity *dl_se, int cpu, u8 type), TP_ARGS(dl_se, cpu, type)); DECLARE_TRACE(sched_dl_replenish, TP_PROTO(struct sched_dl_entity *dl_se, int cpu, u8 type), TP_ARGS(dl_se, cpu, type)); /* Call to update_curr_dl_se not involving throttle or replenish */ DECLARE_TRACE(sched_dl_update, TP_PROTO(struct sched_dl_entity *dl_se, int cpu, u8 type), TP_ARGS(dl_se, cpu, type)); DECLARE_TRACE(sched_dl_server_start, TP_PROTO(struct sched_dl_entity *dl_se, int cpu, u8 type), TP_ARGS(dl_se, cpu, type)); DECLARE_TRACE(sched_dl_server_stop, TP_PROTO(struct sched_dl_entity *dl_se, int cpu, u8 type), TP_ARGS(dl_se, cpu, type)); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
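/*
 * Illustrative sketch (not part of the header above): attaching a probe
 * to the sched_switch tracepoint from a module.  It assumes the usual
 * register_trace_<name>()/unregister_trace_<name>() helpers generated by
 * the tracepoint machinery for each TRACE_EVENT(); the probe receives the
 * TP_PROTO arguments preceded by the registration cookie.  The names
 * my_sched_switch_probe/my_probe_* are hypothetical.
 */
static void my_sched_switch_probe(void *data, bool preempt,
				  struct task_struct *prev,
				  struct task_struct *next,
				  unsigned int prev_state)
{
	/* Runs on every context switch; must be cheap and non-sleeping. */
	pr_debug("switch %d -> %d\n", prev->pid, next->pid);
}

static int __init my_probe_init(void)
{
	return register_trace_sched_switch(my_sched_switch_probe, NULL);
}

static void __exit my_probe_exit(void)
{
	unregister_trace_sched_switch(my_sched_switch_probe, NULL);
	/* Wait for in-flight probe calls before the module text goes away. */
	tracepoint_synchronize_unregister();
}

module_init(my_probe_init);
module_exit(my_probe_exit);
MODULE_LICENSE("GPL");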
// SPDX-License-Identifier: GPL-2.0
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>

/*
 * This is an implementation of the notion of "decrement a
 * reference count, and return locked if it decremented to zero".
 *
 * NOTE NOTE NOTE! This is _not_ equivalent to
 *
 *	if (atomic_dec_and_test(&atomic)) {
 *		spin_lock(&lock);
 *		return 1;
 *	}
 *	return 0;
 *
 * because the spin-lock and the decrement must be
 * "atomic".
 */
int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
{
	/* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
	if (atomic_add_unless(atomic, -1, 1))
		return 0;

	/* Otherwise do it the slow way */
	spin_lock(lock);
	if (atomic_dec_and_test(atomic))
		return 1;
	spin_unlock(lock);
	return 0;
}
EXPORT_SYMBOL(atomic_dec_and_lock);

int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock,
				 unsigned long *flags)
{
	/* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
	if (atomic_add_unless(atomic, -1, 1))
		return 0;

	/* Otherwise do it the slow way */
	spin_lock_irqsave(lock, *flags);
	if (atomic_dec_and_test(atomic))
		return 1;
	spin_unlock_irqrestore(lock, *flags);
	return 0;
}
EXPORT_SYMBOL(_atomic_dec_and_lock_irqsave);

int atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock)
{
	/* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
	if (atomic_add_unless(atomic, -1, 1))
		return 0;

	/* Otherwise do it the slow way */
	raw_spin_lock(lock);
	if (atomic_dec_and_test(atomic))
		return 1;
	raw_spin_unlock(lock);
	return 0;
}
EXPORT_SYMBOL(atomic_dec_and_raw_lock);

int _atomic_dec_and_raw_lock_irqsave(atomic_t *atomic, raw_spinlock_t *lock,
				     unsigned long *flags)
{
	/* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
	if (atomic_add_unless(atomic, -1, 1))
		return 0;

	/* Otherwise do it the slow way */
	raw_spin_lock_irqsave(lock, *flags);
	if (atomic_dec_and_test(atomic))
		return 1;
	raw_spin_unlock_irqrestore(lock, *flags);
	return 0;
}
EXPORT_SYMBOL(_atomic_dec_and_raw_lock_irqsave);
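/*
 * Illustrative sketch (not part of the file above): the classic caller
 * pattern for atomic_dec_and_lock().  'struct my_obj', 'my_obj_list' and
 * 'my_obj_lock' are hypothetical.  The list lock is only taken when the
 * reference count really drops to zero, and the object is unlinked before
 * the lock is released so no other CPU can look it up again.
 */
struct my_obj {
	atomic_t refcount;
	struct list_head node;
};

static LIST_HEAD(my_obj_list);
static DEFINE_SPINLOCK(my_obj_lock);

static void my_obj_put(struct my_obj *obj)
{
	if (!atomic_dec_and_lock(&obj->refcount, &my_obj_lock))
		return;			/* other references remain */

	list_del(&obj->node);		/* now invisible to new lookups */
	spin_unlock(&my_obj_lock);
	kfree(obj);
}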
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_GFP_H #define __LINUX_GFP_H #include <linux/gfp_types.h> #include <linux/mmzone.h> #include <linux/topology.h> #include <linux/alloc_tag.h> #include <linux/cleanup.h> #include <linux/sched.h> struct vm_area_struct; struct mempolicy; /* Helper macro to avoid gfp flags if they are the default one */ #define __default_gfp(a,b,...) b #define default_gfp(...) __default_gfp(,##__VA_ARGS__,GFP_KERNEL) /* Convert GFP flags to their corresponding migrate type */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) #define GFP_MOVABLE_SHIFT 3 static inline int gfp_migratetype(const gfp_t gfp_flags) { VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE); BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE); BUILD_BUG_ON((___GFP_RECLAIMABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_RECLAIMABLE); BUILD_BUG_ON(((___GFP_MOVABLE | ___GFP_RECLAIMABLE) >> GFP_MOVABLE_SHIFT) != MIGRATE_HIGHATOMIC); if (unlikely(page_group_by_mobility_disabled)) return MIGRATE_UNMOVABLE; /* Group based on mobility */ return (__force unsigned long)(gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT; } #undef GFP_MOVABLE_MASK #undef GFP_MOVABLE_SHIFT static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) { return !!(gfp_flags & __GFP_DIRECT_RECLAIM); } static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags) { /* * !__GFP_DIRECT_RECLAIM -> direct claim is not allowed. * !__GFP_KSWAPD_RECLAIM -> it's not safe to wake up kswapd. * All GFP_* flags including GFP_NOWAIT use one or both flags. * alloc_pages_nolock() is the only API that doesn't specify either flag. 
* * This is stronger than GFP_NOWAIT or GFP_ATOMIC because * those are guaranteed to never block on a sleeping lock. * Here we are enforcing that the allocation doesn't ever spin * on any locks (i.e. only trylocks). There is no high level * GFP_$FOO flag for this use in alloc_pages_nolock() as the * regular page allocator doesn't fully support this * allocation mode. */ return !!(gfp_flags & __GFP_RECLAIM); } #ifdef CONFIG_HIGHMEM #define OPT_ZONE_HIGHMEM ZONE_HIGHMEM #else #define OPT_ZONE_HIGHMEM ZONE_NORMAL #endif #ifdef CONFIG_ZONE_DMA #define OPT_ZONE_DMA ZONE_DMA #else #define OPT_ZONE_DMA ZONE_NORMAL #endif #ifdef CONFIG_ZONE_DMA32 #define OPT_ZONE_DMA32 ZONE_DMA32 #else #define OPT_ZONE_DMA32 ZONE_NORMAL #endif /* * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the * zone to use given the lowest 4 bits of gfp_t. Entries are GFP_ZONES_SHIFT * bits long and there are 16 of them to cover all possible combinations of * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM. * * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA. * But GFP_MOVABLE is not only a zone specifier but also an allocation * policy. Therefore __GFP_MOVABLE plus another zone selector is valid. * Only 1 bit of the lowest 3 bits (DMA,DMA32,HIGHMEM) can be set to "1". * * bit result * ================= * 0x0 => NORMAL * 0x1 => DMA or NORMAL * 0x2 => HIGHMEM or NORMAL * 0x3 => BAD (DMA+HIGHMEM) * 0x4 => DMA32 or NORMAL * 0x5 => BAD (DMA+DMA32) * 0x6 => BAD (HIGHMEM+DMA32) * 0x7 => BAD (HIGHMEM+DMA32+DMA) * 0x8 => NORMAL (MOVABLE+0) * 0x9 => DMA or NORMAL (MOVABLE+DMA) * 0xa => MOVABLE (Movable is valid only if HIGHMEM is set too) * 0xb => BAD (MOVABLE+HIGHMEM+DMA) * 0xc => DMA32 or NORMAL (MOVABLE+DMA32) * 0xd => BAD (MOVABLE+DMA32+DMA) * 0xe => BAD (MOVABLE+DMA32+HIGHMEM) * 0xf => BAD (MOVABLE+DMA32+HIGHMEM+DMA) * * GFP_ZONES_SHIFT must be <= 2 on 32 bit platforms. */ #if defined(CONFIG_ZONE_DEVICE) && (MAX_NR_ZONES-1) <= 4 /* ZONE_DEVICE is not a valid GFP zone specifier */ #define GFP_ZONES_SHIFT 2 #else #define GFP_ZONES_SHIFT ZONES_SHIFT #endif #if 16 * GFP_ZONES_SHIFT > BITS_PER_LONG #error GFP_ZONES_SHIFT too large to create GFP_ZONE_TABLE integer #endif #define GFP_ZONE_TABLE ( \ (ZONE_NORMAL << 0 * GFP_ZONES_SHIFT) \ | (OPT_ZONE_DMA << ___GFP_DMA * GFP_ZONES_SHIFT) \ | (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * GFP_ZONES_SHIFT) \ | (OPT_ZONE_DMA32 << ___GFP_DMA32 * GFP_ZONES_SHIFT) \ | (ZONE_NORMAL << ___GFP_MOVABLE * GFP_ZONES_SHIFT) \ | (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT) \ | (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT)\ | (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT)\ ) /* * GFP_ZONE_BAD is a bitmap for all combinations of __GFP_DMA, __GFP_DMA32 * __GFP_HIGHMEM and __GFP_MOVABLE that are not permitted. One flag per * entry starting with bit 0. Bit is set if the combination is not * allowed. 
*/ #define GFP_ZONE_BAD ( \ 1 << (___GFP_DMA | ___GFP_HIGHMEM) \ | 1 << (___GFP_DMA | ___GFP_DMA32) \ | 1 << (___GFP_DMA32 | ___GFP_HIGHMEM) \ | 1 << (___GFP_DMA | ___GFP_DMA32 | ___GFP_HIGHMEM) \ | 1 << (___GFP_MOVABLE | ___GFP_HIGHMEM | ___GFP_DMA) \ | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA) \ | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_HIGHMEM) \ | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA | ___GFP_HIGHMEM) \ ) static inline enum zone_type gfp_zone(gfp_t flags) { enum zone_type z; int bit = (__force int) (flags & GFP_ZONEMASK); z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) & ((1 << GFP_ZONES_SHIFT) - 1); VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1); return z; } /* * There is only one page-allocator function, and two main namespaces to * it. The alloc_page*() variants return 'struct page *' and as such * can allocate highmem pages, the *get*page*() variants return * virtual kernel addresses to the allocated page(s). */ static inline int gfp_zonelist(gfp_t flags) { #ifdef CONFIG_NUMA if (unlikely(flags & __GFP_THISNODE)) return ZONELIST_NOFALLBACK; #endif return ZONELIST_FALLBACK; } /* * gfp flag masking for nested internal allocations. * * For code that needs to do allocations inside the public allocation API (e.g. * memory allocation tracking code) the allocations need to obey the caller * allocation context constrains to prevent allocation context mismatches (e.g. * GFP_KERNEL allocations in GFP_NOFS contexts) from potential deadlock * situations. * * It is also assumed that these nested allocations are for internal kernel * object storage purposes only and are not going to be used for DMA, etc. Hence * we strip out all the zone information and leave just the context information * intact. * * Further, internal allocations must fail before the higher level allocation * can fail, so we must make them fail faster and fail silently. We also don't * want them to deplete emergency reserves. Hence nested allocations must be * prepared for these allocations to fail. */ static inline gfp_t gfp_nested_mask(gfp_t flags) { return ((flags & (GFP_KERNEL | GFP_ATOMIC | __GFP_NOLOCKDEP)) | (__GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN)); } /* * We get the zone list from the current node and the gfp_mask. * This zone list contains a maximum of MAX_NUMNODES*MAX_NR_ZONES zones. * There are two zonelists per node, one for all zones with memory and * one containing just zones from the node the zonelist belongs to. * * For the case of non-NUMA systems the NODE_DATA() gets optimized to * &contig_page_data at compile-time. */ static inline struct zonelist *node_zonelist(int nid, gfp_t flags) { return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags); } #ifndef HAVE_ARCH_FREE_PAGE static inline void arch_free_page(struct page *page, int order) { } #endif #ifndef HAVE_ARCH_ALLOC_PAGE static inline void arch_alloc_page(struct page *page, int order) { } #endif struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask); #define __alloc_pages(...) alloc_hooks(__alloc_pages_noprof(__VA_ARGS__)) struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask); #define __folio_alloc(...) alloc_hooks(__folio_alloc_noprof(__VA_ARGS__)) unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, struct page **page_array); #define __alloc_pages_bulk(...) 
alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__)) unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array); #define alloc_pages_bulk_mempolicy(...) \ alloc_hooks(alloc_pages_bulk_mempolicy_noprof(__VA_ARGS__)) /* Bulk allocate order-0 pages */ #define alloc_pages_bulk(_gfp, _nr_pages, _page_array) \ __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _page_array) static inline unsigned long alloc_pages_bulk_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array) { if (nid == NUMA_NO_NODE) nid = numa_mem_id(); return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, page_array); } #define alloc_pages_bulk_node(...) \ alloc_hooks(alloc_pages_bulk_node_noprof(__VA_ARGS__)) static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask) { gfp_t warn_gfp = gfp_mask & (__GFP_THISNODE|__GFP_NOWARN); if (warn_gfp != (__GFP_THISNODE|__GFP_NOWARN)) return; if (node_online(this_node)) return; pr_warn("%pGg allocation from offline node %d\n", &gfp_mask, this_node); dump_stack(); } /* * Allocate pages, preferring the node given as nid. The node must be valid and * online. For more general interface, see alloc_pages_node(). */ static inline struct page * __alloc_pages_node_noprof(int nid, gfp_t gfp_mask, unsigned int order) { VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); warn_if_node_offline(nid, gfp_mask); return __alloc_pages_noprof(gfp_mask, order, nid, NULL); } #define __alloc_pages_node(...) alloc_hooks(__alloc_pages_node_noprof(__VA_ARGS__)) static inline struct folio *__folio_alloc_node_noprof(gfp_t gfp, unsigned int order, int nid) { VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); warn_if_node_offline(nid, gfp); return __folio_alloc_noprof(gfp, order, nid, NULL); } #define __folio_alloc_node(...) alloc_hooks(__folio_alloc_node_noprof(__VA_ARGS__)) /* * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE, * prefer the current CPU's closest node. Otherwise node must be valid and * online. */ static inline struct page *alloc_pages_node_noprof(int nid, gfp_t gfp_mask, unsigned int order) { if (nid == NUMA_NO_NODE) nid = numa_mem_id(); return __alloc_pages_node_noprof(nid, gfp_mask, order); } #define alloc_pages_node(...) alloc_hooks(alloc_pages_node_noprof(__VA_ARGS__)) #ifdef CONFIG_NUMA struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order); struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order); struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid); struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr); #else static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order) { return alloc_pages_node_noprof(numa_node_id(), gfp_mask, order); } static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { return __folio_alloc_node_noprof(gfp, order, numa_node_id()); } static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid) { return folio_alloc_noprof(gfp, order); } static inline struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr) { return folio_alloc_noprof(gfp, order); } #endif #define alloc_pages(...) alloc_hooks(alloc_pages_noprof(__VA_ARGS__)) #define folio_alloc(...) alloc_hooks(folio_alloc_noprof(__VA_ARGS__)) #define folio_alloc_mpol(...) 
alloc_hooks(folio_alloc_mpol_noprof(__VA_ARGS__)) #define vma_alloc_folio(...) alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__)) #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) static inline struct page *alloc_page_vma_noprof(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { struct folio *folio = vma_alloc_folio_noprof(gfp, 0, vma, addr); return &folio->page; } #define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__)) struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order); #define alloc_pages_nolock(...) alloc_hooks(alloc_pages_nolock_noprof(__VA_ARGS__)) extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order); #define __get_free_pages(...) alloc_hooks(get_free_pages_noprof(__VA_ARGS__)) extern unsigned long get_zeroed_page_noprof(gfp_t gfp_mask); #define get_zeroed_page(...) alloc_hooks(get_zeroed_page_noprof(__VA_ARGS__)) void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask) __alloc_size(1); #define alloc_pages_exact(...) alloc_hooks(alloc_pages_exact_noprof(__VA_ARGS__)) void free_pages_exact(void *virt, size_t size); __meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask) __alloc_size(2); #define alloc_pages_exact_nid(...) \ alloc_hooks(alloc_pages_exact_nid_noprof(__VA_ARGS__)) #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask), 0) #define __get_dma_pages(gfp_mask, order) \ __get_free_pages((gfp_mask) | GFP_DMA, (order)) extern void __free_pages(struct page *page, unsigned int order); extern void free_pages_nolock(struct page *page, unsigned int order); extern void free_pages(unsigned long addr, unsigned int order); #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) void page_alloc_init_cpuhp(void); bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp); void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); void page_alloc_init_late(void); void setup_pcp_cacheinfo(unsigned int cpu); /* * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what * GFP flags are used before interrupts are enabled. Once interrupts are * enabled, it is set to __GFP_BITS_MASK while the system is running. During * hibernation, it is used by PM to avoid I/O during memory allocation while * devices are suspended. */ extern gfp_t gfp_allowed_mask; /* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask); /* A helper for checking if gfp includes all the specified flags */ static inline bool gfp_has_flags(gfp_t gfp, gfp_t flags) { return (gfp & flags) == flags; } static inline bool gfp_has_io_fs(gfp_t gfp) { return gfp_has_flags(gfp, __GFP_IO | __GFP_FS); } /* * Check if the gfp flags allow compaction - GFP_NOIO is a really * tricky context because the migration might require IO. */ static inline bool gfp_compaction_allowed(gfp_t gfp_mask) { return IS_ENABLED(CONFIG_COMPACTION) && (gfp_mask & __GFP_IO); } extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma); #ifdef CONFIG_CONTIG_ALLOC typedef unsigned int __bitwise acr_flags_t; #define ACR_FLAGS_NONE ((__force acr_flags_t)0) // ordinary allocation request #define ACR_FLAGS_CMA ((__force acr_flags_t)BIT(0)) // allocate for CMA /* The below functions must be run on a range from a single zone. 
*/ int alloc_contig_frozen_range_noprof(unsigned long start, unsigned long end, acr_flags_t alloc_flags, gfp_t gfp_mask); #define alloc_contig_frozen_range(...) \ alloc_hooks(alloc_contig_frozen_range_noprof(__VA_ARGS__)) int alloc_contig_range_noprof(unsigned long start, unsigned long end, acr_flags_t alloc_flags, gfp_t gfp_mask); #define alloc_contig_range(...) \ alloc_hooks(alloc_contig_range_noprof(__VA_ARGS__)) struct page *alloc_contig_frozen_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, int nid, nodemask_t *nodemask); #define alloc_contig_frozen_pages(...) \ alloc_hooks(alloc_contig_frozen_pages_noprof(__VA_ARGS__)) struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, int nid, nodemask_t *nodemask); #define alloc_contig_pages(...) \ alloc_hooks(alloc_contig_pages_noprof(__VA_ARGS__)) void free_contig_frozen_range(unsigned long pfn, unsigned long nr_pages); void free_contig_range(unsigned long pfn, unsigned long nr_pages); #endif DEFINE_FREE(free_page, void *, free_page((unsigned long)_T)) #endif /* __LINUX_GFP_H */ |
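/*
 * Illustrative sketch (not part of the header above): a hypothetical
 * helper that uses the page allocator interfaces declared above to get a
 * zeroed, physically contiguous buffer and release it again.  GFP_KERNEL
 * may sleep, so this is process-context only; get_order() and
 * page_address() come from other headers (<asm-generic/getorder.h>,
 * <linux/mm.h>).
 */
static void *my_buf_alloc(size_t size, unsigned int *order)
{
	struct page *page;

	*order = get_order(size);
	page = alloc_pages(GFP_KERNEL | __GFP_ZERO, *order);
	if (!page)
		return NULL;
	return page_address(page);
}

static void my_buf_free(void *buf, unsigned int order)
{
	free_pages((unsigned long)buf, order);
}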
| 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * include/linux/idr.h * * 2002-10-18 written by Jim Houston jim.houston@ccur.com * Copyright (C) 2002 by Concurrent Computer Corporation * * Small id to pointer translation service avoiding fixed sized * tables. */ #ifndef __IDR_H__ #define __IDR_H__ #include <linux/radix-tree.h> #include <linux/gfp.h> #include <linux/percpu.h> #include <linux/cleanup.h> struct idr { struct radix_tree_root idr_rt; unsigned int idr_base; unsigned int idr_next; }; /* * The IDR API does not expose the tagging functionality of the radix tree * to users. Use tag 0 to track whether a node has free space below it. */ #define IDR_FREE 0 /* Set the IDR flag and the IDR_FREE tag */ #define IDR_RT_MARKER (ROOT_IS_IDR | (__force gfp_t) \ (1 << (ROOT_TAG_SHIFT + IDR_FREE))) #define IDR_INIT_BASE(name, base) { \ .idr_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER), \ .idr_base = (base), \ .idr_next = 0, \ } /** * IDR_INIT() - Initialise an IDR. * @name: Name of IDR. * * A freshly-initialised IDR contains no IDs. */ #define IDR_INIT(name) IDR_INIT_BASE(name, 0) /** * DEFINE_IDR() - Define a statically-allocated IDR. * @name: Name of IDR. * * An IDR defined using this macro is ready for use with no additional * initialisation required. It contains no IDs. */ #define DEFINE_IDR(name) struct idr name = IDR_INIT(name) /** * idr_get_cursor - Return the current position of the cyclic allocator * @idr: idr handle * * The value returned is the value that will be next returned from * idr_alloc_cyclic() if it is free (otherwise the search will start from * this position). */ static inline unsigned int idr_get_cursor(const struct idr *idr) { return READ_ONCE(idr->idr_next); } /** * idr_set_cursor - Set the current position of the cyclic allocator * @idr: idr handle * @val: new position * * The next call to idr_alloc_cyclic() will return @val if it is free * (otherwise the search will start from this position). */ static inline void idr_set_cursor(struct idr *idr, unsigned int val) { WRITE_ONCE(idr->idr_next, val); } /** * DOC: idr sync * idr synchronization (stolen from radix-tree.h) * * idr_find() is able to be called locklessly, using RCU. 
The caller must * ensure calls to this function are made within rcu_read_lock() regions. * Other readers (lock-free or otherwise) and modifications may be running * concurrently. * * It is still required that the caller manage the synchronization and * lifetimes of the items. So if RCU lock-free lookups are used, typically * this would mean that the items have their own locks, or are amenable to * lock-free access; and that the items are freed by RCU (or only freed after * having been deleted from the idr tree *and* a synchronize_rcu() grace * period). */ #define idr_lock(idr) xa_lock(&(idr)->idr_rt) #define idr_unlock(idr) xa_unlock(&(idr)->idr_rt) #define idr_lock_bh(idr) xa_lock_bh(&(idr)->idr_rt) #define idr_unlock_bh(idr) xa_unlock_bh(&(idr)->idr_rt) #define idr_lock_irq(idr) xa_lock_irq(&(idr)->idr_rt) #define idr_unlock_irq(idr) xa_unlock_irq(&(idr)->idr_rt) #define idr_lock_irqsave(idr, flags) \ xa_lock_irqsave(&(idr)->idr_rt, flags) #define idr_unlock_irqrestore(idr, flags) \ xa_unlock_irqrestore(&(idr)->idr_rt, flags) void idr_preload(gfp_t gfp_mask); int idr_alloc(struct idr *, void *ptr, int start, int end, gfp_t); int __must_check idr_alloc_u32(struct idr *, void *ptr, u32 *id, unsigned long max, gfp_t); int idr_alloc_cyclic(struct idr *, void *ptr, int start, int end, gfp_t); void *idr_remove(struct idr *, unsigned long id); void *idr_find(const struct idr *, unsigned long id); int idr_for_each(const struct idr *, int (*fn)(int id, void *p, void *data), void *data); void *idr_get_next(struct idr *, int *nextid); void *idr_get_next_ul(struct idr *, unsigned long *nextid); void *idr_replace(struct idr *, void *, unsigned long id); void idr_destroy(struct idr *); struct __class_idr { struct idr *idr; int id; }; #define idr_null ((struct __class_idr){ NULL, -1 }) #define take_idr_id(id) __get_and_null(id, idr_null) DEFINE_CLASS(idr_alloc, struct __class_idr, if (_T.id >= 0) idr_remove(_T.idr, _T.id), ((struct __class_idr){ .idr = idr, .id = idr_alloc(idr, ptr, start, end, gfp), }), struct idr *idr, void *ptr, int start, int end, gfp_t gfp); /** * idr_init_base() - Initialise an IDR. * @idr: IDR handle. * @base: The base value for the IDR. * * This variation of idr_init() creates an IDR which will allocate IDs * starting at %base. */ static inline void idr_init_base(struct idr *idr, int base) { INIT_RADIX_TREE(&idr->idr_rt, IDR_RT_MARKER); idr->idr_base = base; idr->idr_next = 0; } /** * idr_init() - Initialise an IDR. * @idr: IDR handle. * * Initialise a dynamically allocated IDR. To initialise a * statically allocated IDR, use DEFINE_IDR(). */ static inline void idr_init(struct idr *idr) { idr_init_base(idr, 0); } /** * idr_is_empty() - Are there any IDs allocated? * @idr: IDR handle. * * Return: %true if any IDs have been allocated from this IDR. */ static inline bool idr_is_empty(const struct idr *idr) { return radix_tree_empty(&idr->idr_rt) && radix_tree_tagged(&idr->idr_rt, IDR_FREE); } /** * idr_preload_end - end preload section started with idr_preload() * * Each idr_preload() should be matched with an invocation of this * function. See idr_preload() for details. */ static inline void idr_preload_end(void) { local_unlock(&radix_tree_preloads.lock); } /** * idr_for_each_entry() - Iterate over an IDR's elements of a given type. * @idr: IDR handle. * @entry: The type * to use as cursor * @id: Entry ID. * * @entry and @id do not need to be initialized before the loop, and * after normal termination @entry is left with the value NULL. 
This * is convenient for a "not found" value. */ #define idr_for_each_entry(idr, entry, id) \ for (id = 0; ((entry) = idr_get_next(idr, &(id))) != NULL; id += 1U) /** * idr_for_each_entry_ul() - Iterate over an IDR's elements of a given type. * @idr: IDR handle. * @entry: The type * to use as cursor. * @tmp: A temporary placeholder for ID. * @id: Entry ID. * * @entry and @id do not need to be initialized before the loop, and * after normal termination @entry is left with the value NULL. This * is convenient for a "not found" value. */ #define idr_for_each_entry_ul(idr, entry, tmp, id) \ for (tmp = 0, id = 0; \ ((entry) = tmp <= id ? idr_get_next_ul(idr, &(id)) : NULL) != NULL; \ tmp = id, ++id) /** * idr_for_each_entry_continue() - Continue iteration over an IDR's elements of a given type * @idr: IDR handle. * @entry: The type * to use as a cursor. * @id: Entry ID. * * Continue to iterate over entries, continuing after the current position. */ #define idr_for_each_entry_continue(idr, entry, id) \ for ((entry) = idr_get_next((idr), &(id)); \ entry; \ ++id, (entry) = idr_get_next((idr), &(id))) /** * idr_for_each_entry_continue_ul() - Continue iteration over an IDR's elements of a given type * @idr: IDR handle. * @entry: The type * to use as a cursor. * @tmp: A temporary placeholder for ID. * @id: Entry ID. * * Continue to iterate over entries, continuing after the current position. * After normal termination @entry is left with the value NULL. This * is convenient for a "not found" value. */ #define idr_for_each_entry_continue_ul(idr, entry, tmp, id) \ for (tmp = id; \ ((entry) = tmp <= id ? idr_get_next_ul(idr, &(id)) : NULL) != NULL; \ tmp = id, ++id) /* * IDA - ID Allocator, use when translation from id to pointer isn't necessary. */ #define IDA_CHUNK_SIZE 128 /* 128 bytes per chunk */ #define IDA_BITMAP_LONGS (IDA_CHUNK_SIZE / sizeof(long)) #define IDA_BITMAP_BITS (IDA_BITMAP_LONGS * sizeof(long) * 8) struct ida_bitmap { unsigned long bitmap[IDA_BITMAP_LONGS]; }; struct ida { struct xarray xa; }; #define IDA_INIT_FLAGS (XA_FLAGS_LOCK_IRQ | XA_FLAGS_ALLOC) #define IDA_INIT(name) { \ .xa = XARRAY_INIT(name, IDA_INIT_FLAGS) \ } #define DEFINE_IDA(name) struct ida name = IDA_INIT(name) int ida_alloc_range(struct ida *, unsigned int min, unsigned int max, gfp_t); void ida_free(struct ida *, unsigned int id); void ida_destroy(struct ida *ida); int ida_find_first_range(struct ida *ida, unsigned int min, unsigned int max); /** * ida_alloc() - Allocate an unused ID. * @ida: IDA handle. * @gfp: Memory allocation flags. * * Allocate an ID between 0 and %INT_MAX, inclusive. * * Context: Any context. It is safe to call this function without * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ static inline int ida_alloc(struct ida *ida, gfp_t gfp) { return ida_alloc_range(ida, 0, ~0, gfp); } /** * ida_alloc_min() - Allocate an unused ID. * @ida: IDA handle. * @min: Lowest ID to allocate. * @gfp: Memory allocation flags. * * Allocate an ID between @min and %INT_MAX, inclusive. * * Context: Any context. It is safe to call this function without * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ static inline int ida_alloc_min(struct ida *ida, unsigned int min, gfp_t gfp) { return ida_alloc_range(ida, min, ~0, gfp); } /** * ida_alloc_max() - Allocate an unused ID. * @ida: IDA handle. * @max: Highest ID to allocate. 
* @gfp: Memory allocation flags. * * Allocate an ID between 0 and @max, inclusive. * * Context: Any context. It is safe to call this function without * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ static inline int ida_alloc_max(struct ida *ida, unsigned int max, gfp_t gfp) { return ida_alloc_range(ida, 0, max, gfp); } static inline void ida_init(struct ida *ida) { xa_init_flags(&ida->xa, IDA_INIT_FLAGS); } static inline bool ida_is_empty(const struct ida *ida) { return xa_empty(&ida->xa); } static inline bool ida_exists(struct ida *ida, unsigned int id) { return ida_find_first_range(ida, id, id) == id; } static inline int ida_find_first(struct ida *ida) { return ida_find_first_range(ida, 0, ~0); } #endif /* __IDR_H__ */ |
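The kernel-doc above documents the IDR/IDA calls individually but carries no example of the preload-under-lock idiom they are designed for. A minimal usage sketch follows; the "foo" object, its spinlock and the chosen ID range are hypothetical, not from the source.

/* Sketch of the usual IDR idiom: preload outside the lock so the allocation
 * under the lock can use GFP_NOWAIT. Needs <linux/idr.h> and <linux/spinlock.h>. */
struct foo;				/* hypothetical driver object */

static DEFINE_IDR(foo_idr);
static DEFINE_SPINLOCK(foo_lock);

static int foo_register(struct foo *f)
{
	int id;

	idr_preload(GFP_KERNEL);
	spin_lock(&foo_lock);
	id = idr_alloc(&foo_idr, f, 1, 0, GFP_NOWAIT);	/* IDs >= 1, no upper bound */
	spin_unlock(&foo_lock);
	idr_preload_end();

	return id;		/* >= 1 on success, -ENOMEM/-ENOSPC on failure */
}

static void foo_unregister(int id)
{
	spin_lock(&foo_lock);
	idr_remove(&foo_idr, id);
	spin_unlock(&foo_lock);
}

ida_alloc()/ida_free() follow the same shape when only an ID, not an ID-to-pointer mapping, is needed; they take no caller-side lock because the underlying xarray lock is used internally.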
| 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PGTABLE_64_H #define _ASM_X86_PGTABLE_64_H #include <linux/const.h> #include <asm/pgtable_64_types.h> #ifndef __ASSEMBLER__ /* * This file contains the functions and defines necessary to modify and use * the x86-64 page table tree. */ #include <asm/processor.h> #include <linux/bitops.h> #include <linux/threads.h> #include <asm/fixmap.h> extern p4d_t level4_kernel_pgt[512]; extern p4d_t level4_ident_pgt[512]; extern pud_t level3_kernel_pgt[512]; extern pmd_t level2_kernel_pgt[512]; extern pmd_t level2_fixmap_pgt[512]; extern pte_t level1_fixmap_pgt[512 * FIXMAP_PMD_NUM]; extern pgd_t init_top_pgt[]; #define swapper_pg_dir init_top_pgt extern void paging_init(void); static inline void sync_initial_page_table(void) { } #define pte_ERROR(e) \ pr_err("%s:%d: bad pte %p(%016lx)\n", \ __FILE__, __LINE__, &(e), pte_val(e)) #define pmd_ERROR(e) \ pr_err("%s:%d: bad pmd %p(%016lx)\n", \ __FILE__, __LINE__, &(e), pmd_val(e)) #define pud_ERROR(e) \ pr_err("%s:%d: bad pud %p(%016lx)\n", \ __FILE__, __LINE__, &(e), pud_val(e)) #define p4d_ERROR(e) \ pr_err("%s:%d: bad p4d %p(%016lx)\n", \ __FILE__, __LINE__, &(e), p4d_val(e)) #define pgd_ERROR(e) \ pr_err("%s:%d: bad pgd %p(%016lx)\n", \ __FILE__, __LINE__, &(e), pgd_val(e)) struct mm_struct; #define mm_p4d_folded mm_p4d_folded static inline bool mm_p4d_folded(struct mm_struct *mm) { return !pgtable_l5_enabled(); } void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte); void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte); static inline void native_set_pte(pte_t *ptep, pte_t pte) { WRITE_ONCE(*ptep, pte); } static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { native_set_pte(ptep, native_make_pte(0)); } static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) { native_set_pte(ptep, pte); } static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) { WRITE_ONCE(*pmdp, pmd); } static inline void native_pmd_clear(pmd_t *pmd) { native_set_pmd(pmd, native_make_pmd(0)); } static inline pte_t native_ptep_get_and_clear(pte_t *xp) { #ifdef CONFIG_SMP return native_make_pte(xchg(&xp->pte, 0)); #else /* native_local_ptep_get_and_clear, but duplicated because of cyclic dependency */ pte_t ret = *xp; native_pte_clear(NULL, 0, xp); return ret; #endif } static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) { #ifdef CONFIG_SMP return native_make_pmd(xchg(&xp->pmd, 0)); #else /* 
native_local_pmdp_get_and_clear, but duplicated because of cyclic dependency */ pmd_t ret = *xp; native_pmd_clear(xp); return ret; #endif } static inline void native_set_pud(pud_t *pudp, pud_t pud) { WRITE_ONCE(*pudp, pud); } static inline void native_pud_clear(pud_t *pud) { native_set_pud(pud, native_make_pud(0)); } static inline pud_t native_pudp_get_and_clear(pud_t *xp) { #ifdef CONFIG_SMP return native_make_pud(xchg(&xp->pud, 0)); #else /* native_local_pudp_get_and_clear, * but duplicated because of cyclic dependency */ pud_t ret = *xp; native_pud_clear(xp); return ret; #endif } static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) { pgd_t pgd; if (pgtable_l5_enabled() || !IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) { WRITE_ONCE(*p4dp, p4d); return; } pgd = native_make_pgd(native_p4d_val(p4d)); pgd = pti_set_user_pgtbl((pgd_t *)p4dp, pgd); WRITE_ONCE(*p4dp, native_make_p4d(native_pgd_val(pgd))); } static inline void native_p4d_clear(p4d_t *p4d) { native_set_p4d(p4d, native_make_p4d(0)); } static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { WRITE_ONCE(*pgdp, pti_set_user_pgtbl(pgdp, pgd)); } static inline void native_pgd_clear(pgd_t *pgd) { native_set_pgd(pgd, native_make_pgd(0)); } /* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. */ /* PGD - Level 4 access */ /* PUD - Level 3 access */ /* PMD - Level 2 access */ /* PTE - Level 1 access */ /* * Encode and de-code a swap entry * * | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| E|F|SD|0| <- swp entry * * G (8) is aliased and used as a PROT_NONE indicator for * !present ptes. We need to start storing swap entries above * there. We also need to avoid using A and D because of an * erratum where they can be incorrectly set by hardware on * non-present PTEs. * * SD Bits 1-4 are not used in non-present format and available for * special use described below: * * SD (1) in swp entry is used to store soft dirty bit, which helps us * remember soft dirty over page migration * * F (2) in swp entry is used to record when a pagetable is * writeprotected by userfaultfd WP support. * * E (3) in swp entry is used to remember PG_anon_exclusive. * * Bit 7 in swp entry should be 0 because pmd_present checks not only P, * but also L and G. * * The offset is inverted by a binary not operation to make the high * physical bits set. */ #define SWP_TYPE_BITS 5 #define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) /* We always extract/encode the offset by shifting it all the way up, and then down again */ #define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT+SWP_TYPE_BITS) #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) /* Extract the high bits for type */ #define __swp_type(x) ((x).val >> (64 - SWP_TYPE_BITS)) /* Shift up (to get rid of type), then down to get value */ #define __swp_offset(x) (~(x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT) /* * Shift the offset up "too far" by TYPE bits, then down again * The offset is inverted by a binary not operation to make the high * physical bits set. 
*/ #define __swp_entry(type, offset) ((swp_entry_t) { \ (~(unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \ | ((unsigned long)(type) << (64-SWP_TYPE_BITS)) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) #define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val((pmd)) }) #define __swp_entry_to_pte(x) (__pte((x).val)) #define __swp_entry_to_pmd(x) (__pmd((x).val)) extern void cleanup_highmap(void); #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN #define PAGE_AGP PAGE_KERNEL_NOCACHE #define HAVE_PAGE_AGP 1 /* fs/proc/kcore.c */ #define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK) #define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK) #define __HAVE_ARCH_PTE_SAME #define vmemmap ((struct page *)VMEMMAP_START) extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); #define gup_fast_permitted gup_fast_permitted static inline bool gup_fast_permitted(unsigned long start, unsigned long end) { if (end >> __VIRTUAL_MASK_SHIFT) return false; return true; } #include <asm/pgtable-invert.h> #else /* __ASSEMBLER__ */ #define l4_index(x) (((x) >> 39) & 511) #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) L4_PAGE_OFFSET = l4_index(__PAGE_OFFSET_BASE_L4) L4_START_KERNEL = l4_index(__START_KERNEL_map) L3_START_KERNEL = pud_index(__START_KERNEL_map) #define SYM_DATA_START_PAGE_ALIGNED(name) \ SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE) /* Automate the creation of 1 to 1 mapping pmd entries */ #define PMDS(START, PERM, COUNT) \ i = 0 ; \ .rept (COUNT) ; \ .quad (START) + (i << PMD_SHIFT) + (PERM) ; \ i = i + 1 ; \ .endr #endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_PGTABLE_64_H */ |
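The swap-entry comment above packs a lot into one bit diagram. As an illustration only (not from the source; the type/offset values are arbitrary), a sketch of the round trip the __swp_* macros implement:

/* Illustrative sketch: __swp_entry()/__swp_type()/__swp_offset() round-trip,
 * with the type held in bits 59-63 and the offset stored inverted in bits
 * 9-58, leaving the low bits free for the special uses listed above. */
static void example_swp_round_trip(void)
{
	unsigned long type = 3, offset = 0x1234;
	swp_entry_t entry = __swp_entry(type, offset);

	WARN_ON(__swp_type(entry) != type);
	WARN_ON(__swp_offset(entry) != offset);
}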
| 36 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_PREEMPT_H #define __LINUX_PREEMPT_H /* * include/linux/preempt.h - macros for accessing and manipulating * preempt_count (used for kernel preemption, interrupt count, etc.) */ #include <linux/linkage.h> #include <linux/cleanup.h> #include <linux/types.h> /* * We put the hardirq and softirq counter into the preemption * counter. The bitmask has the following meaning: * * - bits 0-7 are the preemption count (max preemption depth: 256) * - bits 8-15 are the softirq count (max # of softirqs: 256) * * The hardirq count could in theory be the same as the number of * interrupts in the system, but we run all interrupt handlers with * interrupts disabled, so we cannot have nesting interrupts. Though * there are a few palaeontologic drivers which reenable interrupts in * the handler, so we need more than one bit here. 
* * PREEMPT_MASK: 0x000000ff * SOFTIRQ_MASK: 0x0000ff00 * HARDIRQ_MASK: 0x000f0000 * NMI_MASK: 0x00f00000 * PREEMPT_NEED_RESCHED: 0x80000000 */ #define PREEMPT_BITS 8 #define SOFTIRQ_BITS 8 #define HARDIRQ_BITS 4 #define NMI_BITS 4 #define PREEMPT_SHIFT 0 #define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) #define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) #define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS) #define __IRQ_MASK(x) ((1UL << (x))-1) #define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT) #define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) #define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) #define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT) #define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT) #define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) #define NMI_OFFSET (1UL << NMI_SHIFT) #define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) #define PREEMPT_DISABLED (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED) /* * Disable preemption until the scheduler is running -- use an unconditional * value so that it also works on !PREEMPT_COUNT kernels. * * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count(). */ #define INIT_PREEMPT_COUNT PREEMPT_OFFSET /* * Initial preempt_count value; reflects the preempt_count schedule invariant * which states that during context switches: * * preempt_count() == 2*PREEMPT_DISABLE_OFFSET * * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels. * Note: See finish_task_switch(). */ #define FORK_PREEMPT_COUNT (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED) /* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */ #include <asm/preempt.h> /** * interrupt_context_level - return interrupt context level * * Returns the current interrupt context level. * 0 - normal context * 1 - softirq context * 2 - hardirq context * 3 - NMI context */ static __always_inline unsigned char interrupt_context_level(void) { unsigned long pc = preempt_count(); unsigned char level = 0; level += !!(pc & (NMI_MASK)); level += !!(pc & (NMI_MASK | HARDIRQ_MASK)); level += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); return level; } /* * These macro definitions avoid redundant invocations of preempt_count() * because such invocations would result in redundant loads given that * preempt_count() is commonly implemented with READ_ONCE(). 
*/ #define nmi_count() (preempt_count() & NMI_MASK) #define hardirq_count() (preempt_count() & HARDIRQ_MASK) #ifdef CONFIG_PREEMPT_RT # define softirq_count() (current->softirq_disable_cnt & SOFTIRQ_MASK) # define irq_count() ((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | softirq_count()) #else # define softirq_count() (preempt_count() & SOFTIRQ_MASK) # define irq_count() (preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK)) #endif /* * Macros to retrieve the current execution context: * * in_nmi() - We're in NMI context * in_hardirq() - We're in hard IRQ context * in_serving_softirq() - We're in softirq context * in_task() - We're in task context */ #define in_nmi() (nmi_count()) #define in_hardirq() (hardirq_count()) #define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) #ifdef CONFIG_PREEMPT_RT # define in_task() (!((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | in_serving_softirq())) #else # define in_task() (!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) #endif /* * The following macros are deprecated and should not be used in new code: * in_softirq() - We have BH disabled, or are processing softirqs * in_interrupt() - We're in NMI,IRQ,SoftIRQ context or have BH disabled */ #define in_softirq() (softirq_count()) #define in_interrupt() (irq_count()) /* * The preempt_count offset after preempt_disable(); */ #if defined(CONFIG_PREEMPT_COUNT) # define PREEMPT_DISABLE_OFFSET PREEMPT_OFFSET #else # define PREEMPT_DISABLE_OFFSET 0 #endif /* * The preempt_count offset after spin_lock() */ #if !defined(CONFIG_PREEMPT_RT) #define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET #else /* Locks on RT do not disable preemption */ #define PREEMPT_LOCK_OFFSET 0 #endif /* * The preempt_count offset needed for things like: * * spin_lock_bh() * * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and * softirqs, such that unlock sequences of: * * spin_unlock(); * local_bh_enable(); * * Work as expected. */ #define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_LOCK_OFFSET) /* * Are we running in atomic context? WARNING: this macro cannot * always detect atomic context; in particular, it cannot know about * held spinlocks in non-preemptible kernels. Thus it should not be * used in the general case to determine whether sleeping is possible. * Do not use in_atomic() in driver code. 
*/ #define in_atomic() (preempt_count() != 0) /* * Check whether we were atomic before we did preempt_disable(): * (used by the scheduler) */ #define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET) #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE) extern void preempt_count_add(int val); extern void preempt_count_sub(int val); #define preempt_count_dec_and_test() \ ({ preempt_count_sub(1); should_resched(0); }) #else #define preempt_count_add(val) __preempt_count_add(val) #define preempt_count_sub(val) __preempt_count_sub(val) #define preempt_count_dec_and_test() __preempt_count_dec_and_test() #endif #define __preempt_count_inc() __preempt_count_add(1) #define __preempt_count_dec() __preempt_count_sub(1) #define preempt_count_inc() preempt_count_add(1) #define preempt_count_dec() preempt_count_sub(1) #ifdef CONFIG_PREEMPT_COUNT #define preempt_disable() \ do { \ preempt_count_inc(); \ barrier(); \ } while (0) #define sched_preempt_enable_no_resched() \ do { \ barrier(); \ preempt_count_dec(); \ } while (0) #define preempt_enable_no_resched() sched_preempt_enable_no_resched() #define preemptible() (preempt_count() == 0 && !irqs_disabled()) #ifdef CONFIG_PREEMPTION #define preempt_enable() \ do { \ barrier(); \ if (unlikely(preempt_count_dec_and_test())) \ __preempt_schedule(); \ } while (0) #define preempt_enable_notrace() \ do { \ barrier(); \ if (unlikely(__preempt_count_dec_and_test())) \ __preempt_schedule_notrace(); \ } while (0) #define preempt_check_resched() \ do { \ if (should_resched(0)) \ __preempt_schedule(); \ } while (0) #else /* !CONFIG_PREEMPTION */ #define preempt_enable() \ do { \ barrier(); \ preempt_count_dec(); \ } while (0) #define preempt_enable_notrace() \ do { \ barrier(); \ __preempt_count_dec(); \ } while (0) #define preempt_check_resched() do { } while (0) #endif /* CONFIG_PREEMPTION */ #define preempt_disable_notrace() \ do { \ __preempt_count_inc(); \ barrier(); \ } while (0) #define preempt_enable_no_resched_notrace() \ do { \ barrier(); \ __preempt_count_dec(); \ } while (0) #else /* !CONFIG_PREEMPT_COUNT */ /* * Even if we don't have any preemption, we need preempt disable/enable * to be barriers, so that we don't have things like get_user/put_user * that can cause faults and scheduling migrate into our preempt-protected * region. */ #define preempt_disable() barrier() #define sched_preempt_enable_no_resched() barrier() #define preempt_enable_no_resched() barrier() #define preempt_enable() barrier() #define preempt_check_resched() do { } while (0) #define preempt_disable_notrace() barrier() #define preempt_enable_no_resched_notrace() barrier() #define preempt_enable_notrace() barrier() #define preemptible() 0 #endif /* CONFIG_PREEMPT_COUNT */ #ifdef MODULE /* * Modules have no business playing preemption tricks. 
*/ #undef sched_preempt_enable_no_resched #undef preempt_enable_no_resched #undef preempt_enable_no_resched_notrace #undef preempt_check_resched #endif #define preempt_set_need_resched() \ do { \ set_preempt_need_resched(); \ } while (0) #define preempt_fold_need_resched() \ do { \ if (tif_need_resched()) \ set_preempt_need_resched(); \ } while (0) #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier; struct task_struct; /** * preempt_ops - notifiers called when a task is preempted and rescheduled * @sched_in: we're about to be rescheduled: * notifier: struct preempt_notifier for the task being scheduled * cpu: cpu we're scheduled on * @sched_out: we've just been preempted * notifier: struct preempt_notifier for the task being preempted * next: the task that's kicking us out * * Please note that sched_in and out are called under different * contexts. sched_out is called with rq lock held and irq disabled * while sched_in is called without rq lock and irq enabled. This * difference is intentional and depended upon by its users. */ struct preempt_ops { void (*sched_in)(struct preempt_notifier *notifier, int cpu); void (*sched_out)(struct preempt_notifier *notifier, struct task_struct *next); }; /** * preempt_notifier - key for installing preemption notifiers * @link: internal use * @ops: defines the notifier functions to be called * * Usually used in conjunction with container_of(). */ struct preempt_notifier { struct hlist_node link; struct preempt_ops *ops; }; void preempt_notifier_inc(void); void preempt_notifier_dec(void); void preempt_notifier_register(struct preempt_notifier *notifier); void preempt_notifier_unregister(struct preempt_notifier *notifier); static inline void preempt_notifier_init(struct preempt_notifier *notifier, struct preempt_ops *ops) { /* INIT_HLIST_NODE() open coded, to avoid dependency on list.h */ notifier->link.next = NULL; notifier->link.pprev = NULL; notifier->ops = ops; } #endif /* * Migrate-Disable and why it is undesired. * * When a preempted task becomes eligible to run under the ideal model (IOW it * becomes one of the M highest priority tasks), it might still have to wait * for the preemptee's migrate_disable() section to complete. Thereby suffering * a reduction in bandwidth in the exact duration of the migrate_disable() * section. * * Per this argument, the change from preempt_disable() to migrate_disable() * gets us: * * - a higher priority tasks gains reduced wake-up latency; with preempt_disable() * it would have had to wait for the lower priority task. * * - a lower priority tasks; which under preempt_disable() could've instantly * migrated away when another CPU becomes available, is now constrained * by the ability to push the higher priority task away, which might itself be * in a migrate_disable() section, reducing its available bandwidth. * * IOW it trades latency / moves the interference term, but it stays in the * system, and as long as it remains unbounded, the system is not fully * deterministic. * * * The reason we have it anyway. * * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a * number of primitives into becoming preemptible, they would also allow * migration. This turns out to break a bunch of per-cpu usage. To this end, * all these primitives employ migrate_disable() to restore this implicit * assumption. * * This is a 'temporary' work-around at best. 
The correct solution is getting * rid of the above assumptions and reworking the code to employ explicit * per-cpu locking or short preempt-disable regions. * * The end goal must be to get rid of migrate_disable(), alternatively we need * a schedulability theory that does not depend on arbitrary migration. * * * Notes on the implementation. * * The implementation is particularly tricky since existing code patterns * dictate neither migrate_disable() nor migrate_enable() is allowed to block. * This means that it cannot use cpus_read_lock() to serialize against hotplug, * nor can it easily migrate itself into a pending affinity mask change on * migrate_enable(). * * * Note: even non-work-conserving schedulers like semi-partitioned depends on * migration, so migrate_disable() is not only a problem for * work-conserving schedulers. * */ /** * preempt_disable_nested - Disable preemption inside a normally preempt disabled section * * Use for code which requires preemption protection inside a critical * section which has preemption disabled implicitly on non-PREEMPT_RT * enabled kernels, by e.g.: * - holding a spinlock/rwlock * - soft interrupt context * - regular interrupt handlers * * On PREEMPT_RT enabled kernels spinlock/rwlock held sections, soft * interrupt context and regular interrupt handlers are preemptible and * only prevent migration. preempt_disable_nested() ensures that preemption * is disabled for cases which require CPU local serialization even on * PREEMPT_RT. For non-PREEMPT_RT kernels this is a NOP. * * The use cases are code sequences which are not serialized by a * particular lock instance, e.g.: * - seqcount write side critical sections where the seqcount is not * associated to a particular lock and therefore the automatic * protection mechanism does not work. This prevents a live lock * against a preempting high priority reader. * - RMW per CPU variable updates like vmstat. */ /* Macro to avoid header recursion hell vs. lockdep */ #define preempt_disable_nested() \ do { \ if (IS_ENABLED(CONFIG_PREEMPT_RT)) \ preempt_disable(); \ else \ lockdep_assert_preemption_disabled(); \ } while (0) /** * preempt_enable_nested - Undo the effect of preempt_disable_nested() */ static __always_inline void preempt_enable_nested(void) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable(); } DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable()) DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(), preempt_enable_notrace()) #ifdef CONFIG_PREEMPT_DYNAMIC extern bool preempt_model_none(void); extern bool preempt_model_voluntary(void); extern bool preempt_model_full(void); extern bool preempt_model_lazy(void); #else static inline bool preempt_model_none(void) { return IS_ENABLED(CONFIG_PREEMPT_NONE); } static inline bool preempt_model_voluntary(void) { return IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY); } static inline bool preempt_model_full(void) { return IS_ENABLED(CONFIG_PREEMPT); } static inline bool preempt_model_lazy(void) { return IS_ENABLED(CONFIG_PREEMPT_LAZY); } #endif static inline bool preempt_model_rt(void) { return IS_ENABLED(CONFIG_PREEMPT_RT); } extern const char *preempt_model_str(void); /* * Does the preemption model allow non-cooperative preemption? * * For !CONFIG_PREEMPT_DYNAMIC kernels this is an exact match with * CONFIG_PREEMPTION; for CONFIG_PREEMPT_DYNAMIC this doesn't work as the * kernel is *built* with CONFIG_PREEMPTION=y but may run with e.g. the * PREEMPT_NONE model. 
*/ static inline bool preempt_model_preemptible(void) { return preempt_model_full() || preempt_model_lazy() || preempt_model_rt(); } #endif /* __LINUX_PREEMPT_H */ |
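A short sketch of the pattern these macros exist for: keeping a task on one CPU while it touches per-CPU state. The per-CPU counter and function names are hypothetical; the second variant assumes the preempt guard defined above via DEFINE_LOCK_GUARD_0().

/* Hypothetical per-CPU counter; needs <linux/percpu.h> and <linux/preempt.h>. */
static DEFINE_PER_CPU(int, example_counter);

static void example_bump(void)
{
	preempt_disable();			/* no preemption, hence no migration */
	__this_cpu_inc(example_counter);	/* safe: we stay on this CPU */
	preempt_enable();			/* may reschedule if a resched became pending */
}

/* Same thing with the cleanup-based guard: preempt_enable() runs on scope exit. */
static void example_bump_guarded(void)
{
	guard(preempt)();
	__this_cpu_inc(example_counter);
}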
910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/module.h> #include <linux/errno.h> #include <linux/socket.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/icmp.h> #include <linux/udp.h> #include <linux/types.h> #include <linux/kernel.h> #include <net/genetlink.h> #include <net/gro.h> #include <net/gue.h> #include <net/fou.h> #include <net/ip.h> #include <net/protocol.h> #include <net/udp.h> #include <net/udp_tunnel.h> #include <uapi/linux/fou.h> #include <uapi/linux/genetlink.h> #include "fou_nl.h" struct fou { struct socket *sock; u8 protocol; u8 flags; __be16 port; u8 family; u16 type; struct list_head list; struct rcu_head rcu; }; #define FOU_F_REMCSUM_NOPARTIAL BIT(0) struct fou_cfg { u16 type; u8 protocol; u8 flags; struct udp_port_cfg udp_config; }; static unsigned int fou_net_id; struct fou_net { struct list_head fou_list; struct mutex fou_lock; }; static inline struct fou *fou_from_sock(struct sock *sk) { return rcu_dereference_sk_user_data(sk); } static int fou_recv_pull(struct sk_buff *skb, struct fou *fou, size_t len) { /* Remove 'len' bytes from the packet (UDP header and * FOU header if present). 
*/ if (fou->family == AF_INET) ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len); else ipv6_hdr(skb)->payload_len = htons(ntohs(ipv6_hdr(skb)->payload_len) - len); __skb_pull(skb, len); skb_postpull_rcsum(skb, udp_hdr(skb), len); skb_reset_transport_header(skb); return iptunnel_pull_offloads(skb); } static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) { struct fou *fou = fou_from_sock(sk); if (!fou) return 1; if (fou_recv_pull(skb, fou, sizeof(struct udphdr))) goto drop; return -fou->protocol; drop: kfree_skb(skb); return 0; } static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, void *data, size_t hdrlen, u8 ipproto, bool nopartial) { __be16 *pd = data; size_t start = ntohs(pd[0]); size_t offset = ntohs(pd[1]); size_t plen = sizeof(struct udphdr) + hdrlen + max_t(size_t, offset + sizeof(u16), start); if (skb->remcsum_offload) return guehdr; if (!pskb_may_pull(skb, plen)) return NULL; guehdr = (struct guehdr *)&udp_hdr(skb)[1]; skb_remcsum_process(skb, (void *)guehdr + hdrlen, start, offset, nopartial); return guehdr; } static int gue_control_message(struct sk_buff *skb, struct guehdr *guehdr) { /* No support yet */ kfree_skb(skb); return 0; } static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) { struct fou *fou = fou_from_sock(sk); size_t len, optlen, hdrlen; struct guehdr *guehdr; void *data; u16 doffset = 0; u8 proto_ctype; if (!fou) return 1; len = sizeof(struct udphdr) + sizeof(struct guehdr); if (!pskb_may_pull(skb, len)) goto drop; guehdr = (struct guehdr *)&udp_hdr(skb)[1]; switch (guehdr->version) { case 0: /* Full GUE header present */ break; case 1: { /* Direct encapsulation of IPv4 or IPv6 */ int prot; switch (((struct iphdr *)guehdr)->version) { case 4: prot = IPPROTO_IPIP; break; case 6: prot = IPPROTO_IPV6; break; default: goto drop; } if (fou_recv_pull(skb, fou, sizeof(struct udphdr))) goto drop; return -prot; } default: /* Undefined version */ goto drop; } optlen = guehdr->hlen << 2; len += optlen; if (!pskb_may_pull(skb, len)) goto drop; /* guehdr may change after pull */ guehdr = (struct guehdr *)&udp_hdr(skb)[1]; if (validate_gue_flags(guehdr, optlen)) goto drop; hdrlen = sizeof(struct guehdr) + optlen; if (fou->family == AF_INET) ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len); else ipv6_hdr(skb)->payload_len = htons(ntohs(ipv6_hdr(skb)->payload_len) - len); /* Pull csum through the guehdr now . This can be used if * there is a remote checksum offload. */ skb_postpull_rcsum(skb, udp_hdr(skb), len); data = &guehdr[1]; if (guehdr->flags & GUE_FLAG_PRIV) { __be32 flags = *(__be32 *)(data + doffset); doffset += GUE_LEN_PRIV; if (flags & GUE_PFLAG_REMCSUM) { guehdr = gue_remcsum(skb, guehdr, data + doffset, hdrlen, guehdr->proto_ctype, !!(fou->flags & FOU_F_REMCSUM_NOPARTIAL)); if (!guehdr) goto drop; data = &guehdr[1]; doffset += GUE_PLEN_REMCSUM; } } if (unlikely(guehdr->control)) return gue_control_message(skb, guehdr); proto_ctype = guehdr->proto_ctype; if (unlikely(!proto_ctype)) goto drop; __skb_pull(skb, sizeof(struct udphdr) + hdrlen); skb_reset_transport_header(skb); if (iptunnel_pull_offloads(skb)) goto drop; return -proto_ctype; drop: kfree_skb(skb); return 0; } static const struct net_offload *fou_gro_ops(const struct sock *sk, int proto) { const struct net_offload __rcu **offloads; /* FOU doesn't allow IPv4 on IPv6 sockets. */ offloads = sk->sk_family == AF_INET6 ? 
inet6_offloads : inet_offloads; return rcu_dereference(offloads[proto]); } static struct sk_buff *fou_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { struct fou *fou = fou_from_sock(sk); const struct net_offload *ops; struct sk_buff *pp = NULL; if (!fou) goto out; /* We can clear the encap_mark for FOU as we are essentially doing * one of two possible things. We are either adding an L4 tunnel * header to the outer L3 tunnel header, or we are simply * treating the GRE tunnel header as though it is a UDP protocol * specific header such as VXLAN or GENEVE. */ NAPI_GRO_CB(skb)->encap_mark = 0; /* Flag this frame as already having an outer encap header */ NAPI_GRO_CB(skb)->is_fou = 1; ops = fou_gro_ops(sk, fou->protocol); if (!ops || !ops->callbacks.gro_receive) goto out; pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); out: return pp; } static int fou_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { struct fou *fou = fou_from_sock(sk); const struct net_offload *ops; int err; if (!fou) { err = -ENOENT; goto out; } ops = fou_gro_ops(sk, fou->protocol); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) { err = -ENOSYS; goto out; } err = ops->callbacks.gro_complete(skb, nhoff); skb_set_inner_mac_header(skb, nhoff); out: return err; } static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, struct guehdr *guehdr, void *data, size_t hdrlen, struct gro_remcsum *grc, bool nopartial) { __be16 *pd = data; size_t start = ntohs(pd[0]); size_t offset = ntohs(pd[1]); if (skb->remcsum_offload) return guehdr; if (!NAPI_GRO_CB(skb)->csum_valid) return NULL; guehdr = skb_gro_remcsum_process(skb, (void *)guehdr, off, hdrlen, start, offset, grc, nopartial); skb->remcsum_offload = 1; return guehdr; } static struct sk_buff *gue_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { const struct net_offload *ops; struct sk_buff *pp = NULL; struct sk_buff *p; struct guehdr *guehdr; size_t len, optlen, hdrlen, off; void *data; u16 doffset = 0; int flush = 1; struct fou *fou = fou_from_sock(sk); struct gro_remcsum grc; u8 proto; skb_gro_remcsum_init(&grc); if (!fou) goto out; off = skb_gro_offset(skb); len = off + sizeof(*guehdr); guehdr = skb_gro_header(skb, len, off); if (unlikely(!guehdr)) goto out; switch (guehdr->version) { case 0: break; case 1: switch (((struct iphdr *)guehdr)->version) { case 4: proto = IPPROTO_IPIP; break; case 6: proto = IPPROTO_IPV6; break; default: goto out; } goto next_proto; default: goto out; } optlen = guehdr->hlen << 2; len += optlen; if (!skb_gro_may_pull(skb, len)) { guehdr = skb_gro_header_slow(skb, len, off); if (unlikely(!guehdr)) goto out; } if (unlikely(guehdr->control) || guehdr->version != 0 || validate_gue_flags(guehdr, optlen)) goto out; hdrlen = sizeof(*guehdr) + optlen; /* Adjust NAPI_GRO_CB(skb)->csum to account for guehdr, * this is needed if there is a remote checkcsum offload. 
*/ skb_gro_postpull_rcsum(skb, guehdr, hdrlen); data = &guehdr[1]; if (guehdr->flags & GUE_FLAG_PRIV) { __be32 flags = *(__be32 *)(data + doffset); doffset += GUE_LEN_PRIV; if (flags & GUE_PFLAG_REMCSUM) { guehdr = gue_gro_remcsum(skb, off, guehdr, data + doffset, hdrlen, &grc, !!(fou->flags & FOU_F_REMCSUM_NOPARTIAL)); if (!guehdr) goto out; data = &guehdr[1]; doffset += GUE_PLEN_REMCSUM; } } skb_gro_pull(skb, hdrlen); list_for_each_entry(p, head, list) { const struct guehdr *guehdr2; if (!NAPI_GRO_CB(p)->same_flow) continue; guehdr2 = (struct guehdr *)(p->data + off); /* Compare base GUE header to be equal (covers * hlen, version, proto_ctype, and flags. */ if (guehdr->word != guehdr2->word) { NAPI_GRO_CB(p)->same_flow = 0; continue; } /* Compare optional fields are the same. */ if (guehdr->hlen && memcmp(&guehdr[1], &guehdr2[1], guehdr->hlen << 2)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } } proto = guehdr->proto_ctype; next_proto: /* We can clear the encap_mark for GUE as we are essentially doing * one of two possible things. We are either adding an L4 tunnel * header to the outer L3 tunnel header, or we are simply * treating the GRE tunnel header as though it is a UDP protocol * specific header such as VXLAN or GENEVE. */ NAPI_GRO_CB(skb)->encap_mark = 0; /* Flag this frame as already having an outer encap header */ NAPI_GRO_CB(skb)->is_fou = 1; ops = fou_gro_ops(sk, proto); if (!ops || !ops->callbacks.gro_receive) goto out; pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); flush = 0; out: skb_gro_flush_final_remcsum(skb, pp, flush, &grc); return pp; } static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff); const struct net_offload *ops; unsigned int guehlen = 0; u8 proto; int err = -ENOENT; switch (guehdr->version) { case 0: proto = guehdr->proto_ctype; guehlen = sizeof(*guehdr) + (guehdr->hlen << 2); break; case 1: switch (((struct iphdr *)guehdr)->version) { case 4: proto = IPPROTO_IPIP; break; case 6: proto = IPPROTO_IPV6; break; default: return err; } break; default: return err; } ops = fou_gro_ops(sk, proto); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out; err = ops->callbacks.gro_complete(skb, nhoff + guehlen); skb_set_inner_mac_header(skb, nhoff + guehlen); out: return err; } static bool fou_cfg_cmp(struct fou *fou, struct fou_cfg *cfg) { struct sock *sk = fou->sock->sk; struct udp_port_cfg *udp_cfg = &cfg->udp_config; if (fou->family != udp_cfg->family || fou->port != udp_cfg->local_udp_port || sk->sk_dport != udp_cfg->peer_udp_port || sk->sk_bound_dev_if != udp_cfg->bind_ifindex) return false; if (fou->family == AF_INET) { if (sk->sk_rcv_saddr != udp_cfg->local_ip.s_addr || sk->sk_daddr != udp_cfg->peer_ip.s_addr) return false; else return true; #if IS_ENABLED(CONFIG_IPV6) } else { if (ipv6_addr_cmp(&sk->sk_v6_rcv_saddr, &udp_cfg->local_ip6) || ipv6_addr_cmp(&sk->sk_v6_daddr, &udp_cfg->peer_ip6)) return false; else return true; #endif } return false; } static int fou_add_to_port_list(struct net *net, struct fou *fou, struct fou_cfg *cfg) { struct fou_net *fn = net_generic(net, fou_net_id); struct fou *fout; mutex_lock(&fn->fou_lock); list_for_each_entry(fout, &fn->fou_list, list) { if (fou_cfg_cmp(fout, cfg)) { mutex_unlock(&fn->fou_lock); return -EALREADY; } } list_add(&fou->list, &fn->fou_list); mutex_unlock(&fn->fou_lock); return 0; } static void fou_release(struct fou *fou) { struct socket *sock = fou->sock; list_del(&fou->list); udp_tunnel_sock_release(sock); 
kfree_rcu(fou, rcu); } static int fou_create(struct net *net, struct fou_cfg *cfg, struct socket **sockp) { struct socket *sock = NULL; struct fou *fou = NULL; struct sock *sk; struct udp_tunnel_sock_cfg tunnel_cfg; int err; /* Open UDP socket */ err = udp_sock_create(net, &cfg->udp_config, &sock); if (err < 0) goto error; /* Allocate FOU port structure */ fou = kzalloc_obj(*fou); if (!fou) { err = -ENOMEM; goto error; } sk = sock->sk; fou->port = cfg->udp_config.local_udp_port; fou->family = cfg->udp_config.family; fou->flags = cfg->flags; fou->type = cfg->type; fou->sock = sock; memset(&tunnel_cfg, 0, sizeof(tunnel_cfg)); tunnel_cfg.encap_type = 1; tunnel_cfg.sk_user_data = fou; tunnel_cfg.encap_destroy = NULL; /* Initial for fou type */ switch (cfg->type) { case FOU_ENCAP_DIRECT: tunnel_cfg.encap_rcv = fou_udp_recv; tunnel_cfg.gro_receive = fou_gro_receive; tunnel_cfg.gro_complete = fou_gro_complete; fou->protocol = cfg->protocol; break; case FOU_ENCAP_GUE: tunnel_cfg.encap_rcv = gue_udp_recv; tunnel_cfg.gro_receive = gue_gro_receive; tunnel_cfg.gro_complete = gue_gro_complete; break; default: err = -EINVAL; goto error; } setup_udp_tunnel_sock(net, sock, &tunnel_cfg); sk->sk_allocation = GFP_ATOMIC; err = fou_add_to_port_list(net, fou, cfg); if (err) goto error; if (sockp) *sockp = sock; return 0; error: kfree(fou); if (sock) udp_tunnel_sock_release(sock); return err; } static int fou_destroy(struct net *net, struct fou_cfg *cfg) { struct fou_net *fn = net_generic(net, fou_net_id); int err = -EINVAL; struct fou *fou; mutex_lock(&fn->fou_lock); list_for_each_entry(fou, &fn->fou_list, list) { if (fou_cfg_cmp(fou, cfg)) { fou_release(fou); err = 0; break; } } mutex_unlock(&fn->fou_lock); return err; } static struct genl_family fou_nl_family; static int parse_nl_config(struct genl_info *info, struct fou_cfg *cfg) { bool has_local = false, has_peer = false; struct nlattr *attr; int ifindex; __be16 port; memset(cfg, 0, sizeof(*cfg)); cfg->udp_config.family = AF_INET; if (info->attrs[FOU_ATTR_AF]) { u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]); switch (family) { case AF_INET: break; case AF_INET6: cfg->udp_config.ipv6_v6only = 1; break; default: return -EAFNOSUPPORT; } cfg->udp_config.family = family; } if (info->attrs[FOU_ATTR_PORT]) { port = nla_get_be16(info->attrs[FOU_ATTR_PORT]); cfg->udp_config.local_udp_port = port; } if (info->attrs[FOU_ATTR_IPPROTO]) cfg->protocol = nla_get_u8(info->attrs[FOU_ATTR_IPPROTO]); if (info->attrs[FOU_ATTR_TYPE]) cfg->type = nla_get_u8(info->attrs[FOU_ATTR_TYPE]); if (info->attrs[FOU_ATTR_REMCSUM_NOPARTIAL]) cfg->flags |= FOU_F_REMCSUM_NOPARTIAL; if (cfg->udp_config.family == AF_INET) { if (info->attrs[FOU_ATTR_LOCAL_V4]) { attr = info->attrs[FOU_ATTR_LOCAL_V4]; cfg->udp_config.local_ip.s_addr = nla_get_in_addr(attr); has_local = true; } if (info->attrs[FOU_ATTR_PEER_V4]) { attr = info->attrs[FOU_ATTR_PEER_V4]; cfg->udp_config.peer_ip.s_addr = nla_get_in_addr(attr); has_peer = true; } #if IS_ENABLED(CONFIG_IPV6) } else { if (info->attrs[FOU_ATTR_LOCAL_V6]) { attr = info->attrs[FOU_ATTR_LOCAL_V6]; cfg->udp_config.local_ip6 = nla_get_in6_addr(attr); has_local = true; } if (info->attrs[FOU_ATTR_PEER_V6]) { attr = info->attrs[FOU_ATTR_PEER_V6]; cfg->udp_config.peer_ip6 = nla_get_in6_addr(attr); has_peer = true; } #endif } if (has_peer) { if (info->attrs[FOU_ATTR_PEER_PORT]) { port = nla_get_be16(info->attrs[FOU_ATTR_PEER_PORT]); cfg->udp_config.peer_udp_port = port; } else { return -EINVAL; } } if (info->attrs[FOU_ATTR_IFINDEX]) { if (!has_local) return 
-EINVAL; ifindex = nla_get_s32(info->attrs[FOU_ATTR_IFINDEX]); cfg->udp_config.bind_ifindex = ifindex; } return 0; } int fou_nl_add_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct fou_cfg cfg; int err; err = parse_nl_config(info, &cfg); if (err) return err; return fou_create(net, &cfg, NULL); } int fou_nl_del_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct fou_cfg cfg; int err; err = parse_nl_config(info, &cfg); if (err) return err; return fou_destroy(net, &cfg); } static int fou_fill_info(struct fou *fou, struct sk_buff *msg) { struct sock *sk = fou->sock->sk; if (nla_put_u8(msg, FOU_ATTR_AF, fou->sock->sk->sk_family) || nla_put_be16(msg, FOU_ATTR_PORT, fou->port) || nla_put_be16(msg, FOU_ATTR_PEER_PORT, sk->sk_dport) || nla_put_u8(msg, FOU_ATTR_IPPROTO, fou->protocol) || nla_put_u8(msg, FOU_ATTR_TYPE, fou->type) || nla_put_s32(msg, FOU_ATTR_IFINDEX, sk->sk_bound_dev_if)) return -1; if (fou->flags & FOU_F_REMCSUM_NOPARTIAL) if (nla_put_flag(msg, FOU_ATTR_REMCSUM_NOPARTIAL)) return -1; if (fou->sock->sk->sk_family == AF_INET) { if (nla_put_in_addr(msg, FOU_ATTR_LOCAL_V4, sk->sk_rcv_saddr)) return -1; if (nla_put_in_addr(msg, FOU_ATTR_PEER_V4, sk->sk_daddr)) return -1; #if IS_ENABLED(CONFIG_IPV6) } else { if (nla_put_in6_addr(msg, FOU_ATTR_LOCAL_V6, &sk->sk_v6_rcv_saddr)) return -1; if (nla_put_in6_addr(msg, FOU_ATTR_PEER_V6, &sk->sk_v6_daddr)) return -1; #endif } return 0; } static int fou_dump_info(struct fou *fou, u32 portid, u32 seq, u32 flags, struct sk_buff *skb, u8 cmd) { void *hdr; hdr = genlmsg_put(skb, portid, seq, &fou_nl_family, flags, cmd); if (!hdr) return -ENOMEM; if (fou_fill_info(fou, skb) < 0) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } int fou_nl_get_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct fou_net *fn = net_generic(net, fou_net_id); struct sk_buff *msg; struct fou_cfg cfg; struct fou *fout; __be16 port; u8 family; int ret; ret = parse_nl_config(info, &cfg); if (ret) return ret; port = cfg.udp_config.local_udp_port; if (port == 0) return -EINVAL; family = cfg.udp_config.family; if (family != AF_INET && family != AF_INET6) return -EINVAL; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = -ESRCH; mutex_lock(&fn->fou_lock); list_for_each_entry(fout, &fn->fou_list, list) { if (fou_cfg_cmp(fout, &cfg)) { ret = fou_dump_info(fout, info->snd_portid, info->snd_seq, 0, msg, info->genlhdr->cmd); break; } } mutex_unlock(&fn->fou_lock); if (ret < 0) goto out_free; return genlmsg_reply(msg, info); out_free: nlmsg_free(msg); return ret; } int fou_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct fou_net *fn = net_generic(net, fou_net_id); struct fou *fout; int idx = 0, ret; mutex_lock(&fn->fou_lock); list_for_each_entry(fout, &fn->fou_list, list) { if (idx++ < cb->args[0]) continue; ret = fou_dump_info(fout, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, skb, FOU_CMD_GET); if (ret) break; } mutex_unlock(&fn->fou_lock); cb->args[0] = idx; return skb->len; } static struct genl_family fou_nl_family __ro_after_init = { .hdrsize = 0, .name = FOU_GENL_NAME, .version = FOU_GENL_VERSION, .maxattr = FOU_ATTR_MAX, .policy = fou_nl_policy, .netnsok = true, .module = THIS_MODULE, .small_ops = fou_nl_ops, .n_small_ops = ARRAY_SIZE(fou_nl_ops), .resv_start_op = 
FOU_CMD_GET + 1, }; size_t fou_encap_hlen(struct ip_tunnel_encap *e) { return sizeof(struct udphdr); } EXPORT_SYMBOL(fou_encap_hlen); size_t gue_encap_hlen(struct ip_tunnel_encap *e) { size_t len; bool need_priv = false; len = sizeof(struct udphdr) + sizeof(struct guehdr); if (e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) { len += GUE_PLEN_REMCSUM; need_priv = true; } len += need_priv ? GUE_LEN_PRIV : 0; return len; } EXPORT_SYMBOL(gue_encap_hlen); int __fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, __be16 *sport, int type) { int err; err = iptunnel_handle_offloads(skb, type); if (err) return err; *sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), skb, 0, 0, false); return 0; } EXPORT_SYMBOL(__fou_build_header); int __gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, __be16 *sport, int type) { struct guehdr *guehdr; size_t hdrlen, optlen = 0; void *data; bool need_priv = false; int err; if ((e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) && skb->ip_summed == CHECKSUM_PARTIAL) { optlen += GUE_PLEN_REMCSUM; type |= SKB_GSO_TUNNEL_REMCSUM; need_priv = true; } optlen += need_priv ? GUE_LEN_PRIV : 0; err = iptunnel_handle_offloads(skb, type); if (err) return err; /* Get source port (based on flow hash) before skb_push */ *sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), skb, 0, 0, false); hdrlen = sizeof(struct guehdr) + optlen; skb_push(skb, hdrlen); guehdr = (struct guehdr *)skb->data; guehdr->control = 0; guehdr->version = 0; guehdr->hlen = optlen >> 2; guehdr->flags = 0; guehdr->proto_ctype = *protocol; data = &guehdr[1]; if (need_priv) { __be32 *flags = data; guehdr->flags |= GUE_FLAG_PRIV; *flags = 0; data += GUE_LEN_PRIV; if (type & SKB_GSO_TUNNEL_REMCSUM) { u16 csum_start = skb_checksum_start_offset(skb); __be16 *pd = data; if (csum_start < hdrlen) return -EINVAL; csum_start -= hdrlen; pd[0] = htons(csum_start); pd[1] = htons(csum_start + skb->csum_offset); if (!skb_is_gso(skb)) { skb->ip_summed = CHECKSUM_NONE; skb->encapsulation = 0; } *flags |= GUE_PFLAG_REMCSUM; data += GUE_PLEN_REMCSUM; } } return 0; } EXPORT_SYMBOL(__gue_build_header); #ifdef CONFIG_NET_FOU_IP_TUNNELS static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, struct flowi4 *fl4, u8 *protocol, __be16 sport) { struct udphdr *uh; skb_push(skb, sizeof(struct udphdr)); skb_reset_transport_header(skb); uh = udp_hdr(skb); uh->dest = e->dport; uh->source = sport; uh->len = htons(skb->len); udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb, fl4->saddr, fl4->daddr, skb->len); *protocol = IPPROTO_UDP; } static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4) { int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; __be16 sport; int err; err = __fou_build_header(skb, e, protocol, &sport, type); if (err) return err; fou_build_udp(skb, e, fl4, protocol, sport); return 0; } static int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4) { int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? 
SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; __be16 sport; int err; err = __gue_build_header(skb, e, protocol, &sport, type); if (err) return err; fou_build_udp(skb, e, fl4, protocol, sport); return 0; } static int gue_err_proto_handler(int proto, struct sk_buff *skb, u32 info) { const struct net_protocol *ipprot = rcu_dereference(inet_protos[proto]); if (ipprot && ipprot->err_handler) { if (!ipprot->err_handler(skb, info)) return 0; } return -ENOENT; } static int gue_err(struct sk_buff *skb, u32 info) { int transport_offset = skb_transport_offset(skb); struct guehdr *guehdr; size_t len, optlen; int ret; len = sizeof(struct udphdr) + sizeof(struct guehdr); if (!pskb_may_pull(skb, transport_offset + len)) return -EINVAL; guehdr = (struct guehdr *)&udp_hdr(skb)[1]; switch (guehdr->version) { case 0: /* Full GUE header present */ break; case 1: { /* Direct encapsulation of IPv4 or IPv6 */ skb_set_transport_header(skb, -(int)sizeof(struct icmphdr)); switch (((struct iphdr *)guehdr)->version) { case 4: ret = gue_err_proto_handler(IPPROTO_IPIP, skb, info); goto out; #if IS_ENABLED(CONFIG_IPV6) case 6: ret = gue_err_proto_handler(IPPROTO_IPV6, skb, info); goto out; #endif default: ret = -EOPNOTSUPP; goto out; } } default: /* Undefined version */ return -EOPNOTSUPP; } if (guehdr->control) return -ENOENT; optlen = guehdr->hlen << 2; if (!pskb_may_pull(skb, transport_offset + len + optlen)) return -EINVAL; guehdr = (struct guehdr *)&udp_hdr(skb)[1]; if (validate_gue_flags(guehdr, optlen)) return -EINVAL; /* Handling exceptions for direct UDP encapsulation in GUE would lead to * recursion. Besides, this kind of encapsulation can't even be * configured currently. Discard this. */ if (guehdr->proto_ctype == IPPROTO_UDP) return -EOPNOTSUPP; skb_set_transport_header(skb, -(int)sizeof(struct icmphdr)); ret = gue_err_proto_handler(guehdr->proto_ctype, skb, info); out: skb_set_transport_header(skb, transport_offset); return ret; } static const struct ip_tunnel_encap_ops fou_iptun_ops = { .encap_hlen = fou_encap_hlen, .build_header = fou_build_header, .err_handler = gue_err, }; static const struct ip_tunnel_encap_ops gue_iptun_ops = { .encap_hlen = gue_encap_hlen, .build_header = gue_build_header, .err_handler = gue_err, }; static int ip_tunnel_encap_add_fou_ops(void) { int ret; ret = ip_tunnel_encap_add_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU); if (ret < 0) { pr_err("can't add fou ops\n"); return ret; } ret = ip_tunnel_encap_add_ops(&gue_iptun_ops, TUNNEL_ENCAP_GUE); if (ret < 0) { pr_err("can't add gue ops\n"); ip_tunnel_encap_del_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU); return ret; } return 0; } static void ip_tunnel_encap_del_fou_ops(void) { ip_tunnel_encap_del_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU); ip_tunnel_encap_del_ops(&gue_iptun_ops, TUNNEL_ENCAP_GUE); } #else static int ip_tunnel_encap_add_fou_ops(void) { return 0; } static void ip_tunnel_encap_del_fou_ops(void) { } #endif static __net_init int fou_init_net(struct net *net) { struct fou_net *fn = net_generic(net, fou_net_id); INIT_LIST_HEAD(&fn->fou_list); mutex_init(&fn->fou_lock); return 0; } static __net_exit void fou_exit_net(struct net *net) { struct fou_net *fn = net_generic(net, fou_net_id); struct fou *fou, *next; /* Close all the FOU sockets */ mutex_lock(&fn->fou_lock); list_for_each_entry_safe(fou, next, &fn->fou_list, list) fou_release(fou); mutex_unlock(&fn->fou_lock); } static struct pernet_operations fou_net_ops = { .init = fou_init_net, .exit = fou_exit_net, .id = &fou_net_id, .size = sizeof(struct fou_net), }; static int __init 
fou_init(void)
{
        int ret;

        ret = register_pernet_device(&fou_net_ops);
        if (ret)
                goto exit;

        ret = genl_register_family(&fou_nl_family);
        if (ret < 0)
                goto unregister;

        ret = register_fou_bpf();
        if (ret < 0)
                goto kfunc_failed;

        ret = ip_tunnel_encap_add_fou_ops();
        if (ret == 0)
                return 0;

kfunc_failed:
        genl_unregister_family(&fou_nl_family);
unregister:
        unregister_pernet_device(&fou_net_ops);
exit:
        return ret;
}

static void __exit fou_fini(void)
{
        ip_tunnel_encap_del_fou_ops();
        genl_unregister_family(&fou_nl_family);
        unregister_pernet_device(&fou_net_ops);
}

module_init(fou_init);
module_exit(fou_fini);
MODULE_AUTHOR("Tom Herbert <therbert@google.com>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Foo over UDP");
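The encapsulation ops registered above are consumed by IP tunnel drivers through struct ip_tunnel_encap. The following is a minimal, illustrative sketch (not part of fou_core.c, assuming only the exported helpers shown above) of how a caller might describe GUE encapsulation and ask FOU for the extra header length it will add; the function name fou_example_hlen() and the UDP port are hypothetical.

#include <net/ip_tunnels.h>
#include <net/fou.h>

static size_t fou_example_hlen(void)
{
        struct ip_tunnel_encap e = {
                .type  = TUNNEL_ENCAP_GUE,              /* GUE variant of FOU */
                .dport = htons(5555),                   /* hypothetical UDP port */
                .flags = TUNNEL_ENCAP_FLAG_CSUM,        /* request outer UDP checksum */
        };

        /* sizeof(struct udphdr) + sizeof(struct guehdr), plus the remote
         * checksum offload option if TUNNEL_ENCAP_FLAG_REMCSUM were set.
         */
        return gue_encap_hlen(&e);
}

With .type set to TUNNEL_ENCAP_FOU instead, fou_encap_hlen() would report just sizeof(struct udphdr), matching the plain foo-over-UDP case.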
902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 | // SPDX-License-Identifier: GPL-2.0-or-later /* Manage a process's keyrings * * Copyright (C) 2004-2005, 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/init.h> #include <linux/sched.h> #include <linux/sched/user.h> #include <linux/keyctl.h> #include <linux/fs.h> #include <linux/err.h> #include <linux/mutex.h> #include <linux/security.h> #include <linux/user_namespace.h> #include <linux/uaccess.h> #include <linux/init_task.h> #include <keys/request_key_auth-type.h> #include "internal.h" /* Session keyring create vs join semaphore */ static DEFINE_MUTEX(key_session_mutex); /* The root user's tracking struct */ struct key_user root_key_user = { .usage = REFCOUNT_INIT(3), .cons_lock = __MUTEX_INITIALIZER(root_key_user.cons_lock), .lock = __SPIN_LOCK_UNLOCKED(root_key_user.lock), .nkeys = ATOMIC_INIT(2), .nikeys = ATOMIC_INIT(2), .uid = GLOBAL_ROOT_UID, }; /* * Get or create a user register keyring. */ static struct key *get_user_register(struct user_namespace *user_ns) { struct key *reg_keyring = READ_ONCE(user_ns->user_keyring_register); if (reg_keyring) return reg_keyring; down_write(&user_ns->keyring_sem); /* Make sure there's a register keyring. It gets owned by the * user_namespace's owner. */ reg_keyring = user_ns->user_keyring_register; if (!reg_keyring) { reg_keyring = keyring_alloc(".user_reg", user_ns->owner, INVALID_GID, kernel_cred(), KEY_POS_WRITE | KEY_POS_SEARCH | KEY_USR_VIEW | KEY_USR_READ, 0, NULL, NULL); if (!IS_ERR(reg_keyring)) smp_store_release(&user_ns->user_keyring_register, reg_keyring); } up_write(&user_ns->keyring_sem); /* We don't return a ref since the keyring is pinned by the user_ns */ return reg_keyring; } /* * Look up the user and user session keyrings for the current process's UID, * creating them if they don't exist. */ int look_up_user_keyrings(struct key **_user_keyring, struct key **_user_session_keyring) { const struct cred *cred = current_cred(); struct user_namespace *user_ns = current_user_ns(); struct key *reg_keyring, *uid_keyring, *session_keyring; key_perm_t user_keyring_perm; key_ref_t uid_keyring_r, session_keyring_r; uid_t uid = from_kuid(user_ns, cred->user->uid); char buf[20]; int ret; user_keyring_perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL; kenter("%u", uid); reg_keyring = get_user_register(user_ns); if (IS_ERR(reg_keyring)) return PTR_ERR(reg_keyring); down_write(&user_ns->keyring_sem); ret = 0; /* Get the user keyring. Note that there may be one in existence * already as it may have been pinned by a session, but the user_struct * pointing to it may have been destroyed by setuid. 
*/ snprintf(buf, sizeof(buf), "_uid.%u", uid); uid_keyring_r = keyring_search(make_key_ref(reg_keyring, true), &key_type_keyring, buf, false); kdebug("_uid %p", uid_keyring_r); if (uid_keyring_r == ERR_PTR(-EAGAIN)) { uid_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID, cred, user_keyring_perm, KEY_ALLOC_UID_KEYRING | KEY_ALLOC_IN_QUOTA, NULL, reg_keyring); if (IS_ERR(uid_keyring)) { ret = PTR_ERR(uid_keyring); goto error; } } else if (IS_ERR(uid_keyring_r)) { ret = PTR_ERR(uid_keyring_r); goto error; } else { uid_keyring = key_ref_to_ptr(uid_keyring_r); } /* Get a default session keyring (which might also exist already) */ snprintf(buf, sizeof(buf), "_uid_ses.%u", uid); session_keyring_r = keyring_search(make_key_ref(reg_keyring, true), &key_type_keyring, buf, false); kdebug("_uid_ses %p", session_keyring_r); if (session_keyring_r == ERR_PTR(-EAGAIN)) { session_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID, cred, user_keyring_perm, KEY_ALLOC_UID_KEYRING | KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(session_keyring)) { ret = PTR_ERR(session_keyring); goto error_release; } /* We install a link from the user session keyring to * the user keyring. */ ret = key_link(session_keyring, uid_keyring); if (ret < 0) goto error_release_session; /* And only then link the user-session keyring to the * register. */ ret = key_link(reg_keyring, session_keyring); if (ret < 0) goto error_release_session; } else if (IS_ERR(session_keyring_r)) { ret = PTR_ERR(session_keyring_r); goto error_release; } else { session_keyring = key_ref_to_ptr(session_keyring_r); } up_write(&user_ns->keyring_sem); if (_user_session_keyring) *_user_session_keyring = session_keyring; else key_put(session_keyring); if (_user_keyring) *_user_keyring = uid_keyring; else key_put(uid_keyring); kleave(" = 0"); return 0; error_release_session: key_put(session_keyring); error_release: key_put(uid_keyring); error: up_write(&user_ns->keyring_sem); kleave(" = %d", ret); return ret; } /* * Get the user session keyring if it exists, but don't create it if it * doesn't. */ struct key *get_user_session_keyring_rcu(const struct cred *cred) { struct key *reg_keyring = READ_ONCE(cred->user_ns->user_keyring_register); key_ref_t session_keyring_r; char buf[20]; struct keyring_search_context ctx = { .index_key.type = &key_type_keyring, .index_key.description = buf, .cred = cred, .match_data.cmp = key_default_cmp, .match_data.raw_data = buf, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = KEYRING_SEARCH_DO_STATE_CHECK, }; if (!reg_keyring) return NULL; ctx.index_key.desc_len = snprintf(buf, sizeof(buf), "_uid_ses.%u", from_kuid(cred->user_ns, cred->user->uid)); session_keyring_r = keyring_search_rcu(make_key_ref(reg_keyring, true), &ctx); if (IS_ERR(session_keyring_r)) return NULL; return key_ref_to_ptr(session_keyring_r); } /* * Install a thread keyring to the given credentials struct if it didn't have * one already. This is allowed to overrun the quota. * * Return: 0 if a thread keyring is now present; -errno on failure. */ int install_thread_keyring_to_cred(struct cred *new) { struct key *keyring; if (new->thread_keyring) return 0; keyring = keyring_alloc("_tid", new->uid, new->gid, new, KEY_POS_ALL | KEY_USR_VIEW, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); new->thread_keyring = keyring; return 0; } /* * Install a thread keyring to the current task if it didn't have one already. * * Return: 0 if a thread keyring is now present; -errno on failure. 
*/ static int install_thread_keyring(void) { struct cred *new; int ret; new = prepare_creds(); if (!new) return -ENOMEM; ret = install_thread_keyring_to_cred(new); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); } /* * Install a process keyring to the given credentials struct if it didn't have * one already. This is allowed to overrun the quota. * * Return: 0 if a process keyring is now present; -errno on failure. */ int install_process_keyring_to_cred(struct cred *new) { struct key *keyring; if (new->process_keyring) return 0; keyring = keyring_alloc("_pid", new->uid, new->gid, new, KEY_POS_ALL | KEY_USR_VIEW, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); new->process_keyring = keyring; return 0; } /* * Install a process keyring to the current task if it didn't have one already. * * Return: 0 if a process keyring is now present; -errno on failure. */ static int install_process_keyring(void) { struct cred *new; int ret; new = prepare_creds(); if (!new) return -ENOMEM; ret = install_process_keyring_to_cred(new); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); } /* * Install the given keyring as the session keyring of the given credentials * struct, replacing the existing one if any. If the given keyring is NULL, * then install a new anonymous session keyring. * @cred can not be in use by any task yet. * * Return: 0 on success; -errno on failure. */ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring) { unsigned long flags; struct key *old; might_sleep(); /* create an empty session keyring */ if (!keyring) { flags = KEY_ALLOC_QUOTA_OVERRUN; if (cred->session_keyring) flags = KEY_ALLOC_IN_QUOTA; keyring = keyring_alloc("_ses", cred->uid, cred->gid, cred, KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ, flags, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); } else { __key_get(keyring); } /* install the keyring */ old = cred->session_keyring; cred->session_keyring = keyring; if (old) key_put(old); return 0; } /* * Install the given keyring as the session keyring of the current task, * replacing the existing one if any. If the given keyring is NULL, then * install a new anonymous session keyring. * * Return: 0 on success; -errno on failure. */ static int install_session_keyring(struct key *keyring) { struct cred *new; int ret; new = prepare_creds(); if (!new) return -ENOMEM; ret = install_session_keyring_to_cred(new, keyring); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); } /* * Handle the fsuid changing. */ void key_fsuid_changed(struct cred *new_cred) { /* update the ownership of the thread keyring */ if (new_cred->thread_keyring) { down_write(&new_cred->thread_keyring->sem); new_cred->thread_keyring->uid = new_cred->fsuid; up_write(&new_cred->thread_keyring->sem); } } /* * Handle the fsgid changing. */ void key_fsgid_changed(struct cred *new_cred) { /* update the ownership of the thread keyring */ if (new_cred->thread_keyring) { down_write(&new_cred->thread_keyring->sem); new_cred->thread_keyring->gid = new_cred->fsgid; up_write(&new_cred->thread_keyring->sem); } } /* * Search the process keyrings attached to the supplied cred for the first * matching key under RCU conditions (the caller must be holding the RCU read * lock). * * The search criteria are the type and the match function. The description is * given to the match function as a parameter, but doesn't otherwise influence * the search. 
Typically the match function will compare the description * parameter to the key's description. * * This can only search keyrings that grant Search permission to the supplied * credentials. Keyrings linked to searched keyrings will also be searched if * they grant Search permission too. Keys can only be found if they grant * Search permission to the credentials. * * Returns a pointer to the key with the key usage count incremented if * successful, -EAGAIN if we didn't find any matching key or -ENOKEY if we only * matched negative keys. * * In the case of a successful return, the possession attribute is set on the * returned key reference. */ key_ref_t search_cred_keyrings_rcu(struct keyring_search_context *ctx) { struct key *user_session; key_ref_t key_ref, ret, err; const struct cred *cred = ctx->cred; /* we want to return -EAGAIN or -ENOKEY if any of the keyrings were * searchable, but we failed to find a key or we found a negative key; * otherwise we want to return a sample error (probably -EACCES) if * none of the keyrings were searchable * * in terms of priority: success > -ENOKEY > -EAGAIN > other error */ key_ref = NULL; ret = NULL; err = ERR_PTR(-EAGAIN); /* search the thread keyring first */ if (cred->thread_keyring) { key_ref = keyring_search_rcu( make_key_ref(cred->thread_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* search the process keyring second */ if (cred->process_keyring) { key_ref = keyring_search_rcu( make_key_ref(cred->process_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ if (ret) break; fallthrough; case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* search the session keyring */ if (cred->session_keyring) { key_ref = keyring_search_rcu( make_key_ref(cred->session_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ if (ret) break; fallthrough; case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* or search the user-session keyring */ else if ((user_session = get_user_session_keyring_rcu(cred))) { key_ref = keyring_search_rcu(make_key_ref(user_session, 1), ctx); key_put(user_session); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ if (ret) break; fallthrough; case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* no key - decide on the error we're going to go for */ key_ref = ret ? ret : err; found: return key_ref; } /* * Search the process keyrings attached to the supplied cred for the first * matching key in the manner of search_my_process_keyrings(), but also search * the keys attached to the assumed authorisation key using its credentials if * one is available. * * The caller must be holding the RCU read lock. * * Return same as search_cred_keyrings_rcu(). 
*/ key_ref_t search_process_keyrings_rcu(struct keyring_search_context *ctx) { struct request_key_auth *rka; key_ref_t key_ref, ret = ERR_PTR(-EACCES), err; key_ref = search_cred_keyrings_rcu(ctx); if (!IS_ERR(key_ref)) goto found; err = key_ref; /* if this process has an instantiation authorisation key, then we also * search the keyrings of the process mentioned there * - we don't permit access to request_key auth keys via this method */ if (ctx->cred->request_key_auth && ctx->cred == current_cred() && ctx->index_key.type != &key_type_request_key_auth ) { const struct cred *cred = ctx->cred; if (key_validate(cred->request_key_auth) == 0) { rka = ctx->cred->request_key_auth->payload.data[0]; //// was search_process_keyrings() [ie. recursive] ctx->cred = rka->cred; key_ref = search_cred_keyrings_rcu(ctx); ctx->cred = cred; if (!IS_ERR(key_ref)) goto found; ret = key_ref; } } /* no key - decide on the error we're going to go for */ if (err == ERR_PTR(-ENOKEY) || ret == ERR_PTR(-ENOKEY)) key_ref = ERR_PTR(-ENOKEY); else if (err == ERR_PTR(-EACCES)) key_ref = ret; else key_ref = err; found: return key_ref; } /* * See if the key we're looking at is the target key. */ bool lookup_user_key_possessed(const struct key *key, const struct key_match_data *match_data) { return key == match_data->raw_data; } /* * Look up a key ID given us by userspace with a given permissions mask to get * the key it refers to. * * Flags can be passed to request that special keyrings be created if referred * to directly, to permit partially constructed keys to be found and to skip * validity and permission checks on the found key. * * Returns a pointer to the key with an incremented usage count if successful; * -EINVAL if the key ID is invalid; -ENOKEY if the key ID does not correspond * to a key or the best found key was a negative key; -EKEYREVOKED or * -EKEYEXPIRED if the best found key was revoked or expired; -EACCES if the * found key doesn't grant the requested permit or the LSM denied access to it; * or -ENOMEM if a special keyring couldn't be created. * * In the case of a successful return, the possession attribute is set on the * returned key reference. 
*/ key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags, enum key_need_perm need_perm) { struct keyring_search_context ctx = { .match_data.cmp = lookup_user_key_possessed, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = (KEYRING_SEARCH_NO_STATE_CHECK | KEYRING_SEARCH_RECURSE), }; struct request_key_auth *rka; struct key *key, *user_session; key_ref_t key_ref, skey_ref; int ret; try_again: ctx.cred = get_current_cred(); key_ref = ERR_PTR(-ENOKEY); switch (id) { case KEY_SPEC_THREAD_KEYRING: if (!ctx.cred->thread_keyring) { if (!(lflags & KEY_LOOKUP_CREATE)) goto error; ret = install_thread_keyring(); if (ret < 0) { key_ref = ERR_PTR(ret); goto error; } goto reget_creds; } key = ctx.cred->thread_keyring; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_PROCESS_KEYRING: if (!ctx.cred->process_keyring) { if (!(lflags & KEY_LOOKUP_CREATE)) goto error; ret = install_process_keyring(); if (ret < 0) { key_ref = ERR_PTR(ret); goto error; } goto reget_creds; } key = ctx.cred->process_keyring; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_SESSION_KEYRING: if (!ctx.cred->session_keyring) { /* always install a session keyring upon access if one * doesn't exist yet */ ret = look_up_user_keyrings(NULL, &user_session); if (ret < 0) goto error; if (lflags & KEY_LOOKUP_CREATE) ret = join_session_keyring(NULL); else ret = install_session_keyring(user_session); key_put(user_session); if (ret < 0) goto error; goto reget_creds; } else if (test_bit(KEY_FLAG_UID_KEYRING, &ctx.cred->session_keyring->flags) && lflags & KEY_LOOKUP_CREATE) { ret = join_session_keyring(NULL); if (ret < 0) goto error; goto reget_creds; } key = ctx.cred->session_keyring; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_USER_KEYRING: ret = look_up_user_keyrings(&key, NULL); if (ret < 0) goto error; key_ref = make_key_ref(key, 1); break; case KEY_SPEC_USER_SESSION_KEYRING: ret = look_up_user_keyrings(NULL, &key); if (ret < 0) goto error; key_ref = make_key_ref(key, 1); break; case KEY_SPEC_GROUP_KEYRING: /* group keyrings are not yet supported */ key_ref = ERR_PTR(-EINVAL); goto error; case KEY_SPEC_REQKEY_AUTH_KEY: key = ctx.cred->request_key_auth; if (!key) goto error; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_REQUESTOR_KEYRING: if (!ctx.cred->request_key_auth) goto error; down_read(&ctx.cred->request_key_auth->sem); if (test_bit(KEY_FLAG_REVOKED, &ctx.cred->request_key_auth->flags)) { key_ref = ERR_PTR(-EKEYREVOKED); key = NULL; } else { rka = ctx.cred->request_key_auth->payload.data[0]; key = rka->dest_keyring; __key_get(key); } up_read(&ctx.cred->request_key_auth->sem); if (!key) goto error; key_ref = make_key_ref(key, 1); break; default: key_ref = ERR_PTR(-EINVAL); if (id < 1) goto error; key = key_lookup(id); if (IS_ERR(key)) { key_ref = ERR_CAST(key); goto error; } key_ref = make_key_ref(key, 0); /* check to see if we possess the key */ ctx.index_key = key->index_key; ctx.match_data.raw_data = key; kdebug("check possessed"); rcu_read_lock(); skey_ref = search_process_keyrings_rcu(&ctx); rcu_read_unlock(); kdebug("possessed=%p", skey_ref); if (!IS_ERR(skey_ref)) { key_put(key); key_ref = skey_ref; } break; } /* unlink does not use the nominated key in any way, so can skip all * the permission checks as it is only concerned with the keyring */ if (need_perm != KEY_NEED_UNLINK) { if (!(lflags & KEY_LOOKUP_PARTIAL)) { ret = wait_for_key_construction(key, true); switch (ret) { case -ERESTARTSYS: goto invalid_key; 
default: if (need_perm != KEY_AUTHTOKEN_OVERRIDE && need_perm != KEY_DEFER_PERM_CHECK) goto invalid_key; break; case 0: break; } } else if (need_perm != KEY_DEFER_PERM_CHECK) { ret = key_validate(key); if (ret < 0) goto invalid_key; } ret = -EIO; if (!(lflags & KEY_LOOKUP_PARTIAL) && key_read_state(key) == KEY_IS_UNINSTANTIATED) goto invalid_key; } /* check the permissions */ ret = key_task_permission(key_ref, ctx.cred, need_perm); if (ret < 0) goto invalid_key; key->last_used_at = ktime_get_real_seconds(); error: put_cred(ctx.cred); return key_ref; invalid_key: key_ref_put(key_ref); key_ref = ERR_PTR(ret); goto error; /* if we attempted to install a keyring, then it may have caused new * creds to be installed */ reget_creds: put_cred(ctx.cred); goto try_again; } EXPORT_SYMBOL(lookup_user_key); /* * Join the named keyring as the session keyring if possible else attempt to * create a new one of that name and join that. * * If the name is NULL, an empty anonymous keyring will be installed as the * session keyring. * * Named session keyrings are joined with a semaphore held to prevent the * keyrings from going away whilst the attempt is made to going them and also * to prevent a race in creating compatible session keyrings. */ long join_session_keyring(const char *name) { const struct cred *old; struct cred *new; struct key *keyring; long ret, serial; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); /* if no name is provided, install an anonymous keyring */ if (!name) { ret = install_session_keyring_to_cred(new, NULL); if (ret < 0) goto error; serial = new->session_keyring->serial; ret = commit_creds(new); if (ret == 0) ret = serial; goto okay; } /* allow the user to join or create a named keyring */ mutex_lock(&key_session_mutex); /* look for an existing keyring of this name */ keyring = find_keyring_by_name(name, false); if (PTR_ERR(keyring) == -ENOKEY) { /* not found - try and create a new one */ keyring = keyring_alloc( name, old->uid, old->gid, old, KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_LINK, KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; } } else if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; } else if (keyring == new->session_keyring) { ret = 0; goto error3; } /* we've got a keyring - now to install it */ ret = install_session_keyring_to_cred(new, keyring); if (ret < 0) goto error3; commit_creds(new); mutex_unlock(&key_session_mutex); ret = keyring->serial; key_put(keyring); okay: return ret; error3: key_put(keyring); error2: mutex_unlock(&key_session_mutex); error: abort_creds(new); return ret; } /* * Replace a process's session keyring on behalf of one of its children when * the target process is about to resume userspace execution. 
 */
void key_change_session_keyring(struct callback_head *twork)
{
        const struct cred *old = current_cred();
        struct cred *new = container_of(twork, struct cred, rcu);

        if (unlikely(current->flags & PF_EXITING)) {
                put_cred(new);
                return;
        }

        /* If get_ucounts fails more bits are needed in the refcount */
        if (unlikely(!get_ucounts(old->ucounts))) {
                WARN_ONCE(1, "In %s get_ucounts failed\n", __func__);
                put_cred(new);
                return;
        }

        new->uid = old->uid;
        new->euid = old->euid;
        new->suid = old->suid;
        new->fsuid = old->fsuid;
        new->gid = old->gid;
        new->egid = old->egid;
        new->sgid = old->sgid;
        new->fsgid = old->fsgid;
        new->user = get_uid(old->user);
        new->ucounts = old->ucounts;
        new->user_ns = get_user_ns(old->user_ns);
        new->group_info = get_group_info(old->group_info);

        new->securebits = old->securebits;
        new->cap_inheritable = old->cap_inheritable;
        new->cap_permitted = old->cap_permitted;
        new->cap_effective = old->cap_effective;
        new->cap_ambient = old->cap_ambient;
        new->cap_bset = old->cap_bset;

        new->jit_keyring = old->jit_keyring;
        new->thread_keyring = key_get(old->thread_keyring);
        new->process_keyring = key_get(old->process_keyring);

        security_transfer_creds(new, old);

        commit_creds(new);
}

/*
 * Make sure that root's user and user-session keyrings exist.
 */
static int __init init_root_keyring(void)
{
        return look_up_user_keyrings(NULL, NULL);
}

late_initcall(init_root_keyring);
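For context on how the session-keyring paths above are reached, here is a small, illustrative userspace sketch (not kernel code): joining a named session keyring via the keyctl(2) syscall ultimately lands in join_session_keyring() shown earlier. The keyring name is hypothetical and error handling is minimal.

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/keyctl.h>

int main(void)
{
        /* KEYCTL_JOIN_SESSION_KEYRING creates the named keyring if it does
         * not exist yet, installs it as the caller's session keyring and
         * returns its serial number.
         */
        long serial = syscall(SYS_keyctl, KEYCTL_JOIN_SESSION_KEYRING,
                              "example-session");

        if (serial < 0) {
                perror("keyctl(KEYCTL_JOIN_SESSION_KEYRING)");
                return 1;
        }

        printf("joined session keyring %ld\n", serial);
        return 0;
}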
1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 | // SPDX-License-Identifier: GPL-2.0 // Generated by scripts/atomic/gen-atomic-long.sh // DO NOT MODIFY THIS FILE DIRECTLY #ifndef _LINUX_ATOMIC_LONG_H #define _LINUX_ATOMIC_LONG_H #include <linux/compiler.h> #include <asm/types.h> #ifdef CONFIG_64BIT typedef atomic64_t atomic_long_t; #define ATOMIC_LONG_INIT(i) ATOMIC64_INIT(i) #define atomic_long_cond_read_acquire atomic64_cond_read_acquire #define atomic_long_cond_read_relaxed atomic64_cond_read_relaxed #else typedef atomic_t atomic_long_t; #define ATOMIC_LONG_INIT(i) ATOMIC_INIT(i) #define atomic_long_cond_read_acquire atomic_cond_read_acquire #define atomic_long_cond_read_relaxed atomic_cond_read_relaxed #endif /** * raw_atomic_long_read() - atomic load with relaxed ordering * @v: pointer to atomic_long_t * * Atomically loads the value of @v with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_read() elsewhere. * * Return: The value loaded from @v. */ static __always_inline long raw_atomic_long_read(const atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_read(v); #else return raw_atomic_read(v); #endif } /** * raw_atomic_long_read_acquire() - atomic load with acquire ordering * @v: pointer to atomic_long_t * * Atomically loads the value of @v with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_read_acquire() elsewhere. * * Return: The value loaded from @v. */ static __always_inline long raw_atomic_long_read_acquire(const atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_read_acquire(v); #else return raw_atomic_read_acquire(v); #endif } /** * raw_atomic_long_set() - atomic set with relaxed ordering * @v: pointer to atomic_long_t * @i: long value to assign * * Atomically sets @v to @i with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_set() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_set(atomic_long_t *v, long i) { #ifdef CONFIG_64BIT raw_atomic64_set(v, i); #else raw_atomic_set(v, i); #endif } /** * raw_atomic_long_set_release() - atomic set with release ordering * @v: pointer to atomic_long_t * @i: long value to assign * * Atomically sets @v to @i with release ordering. * * Safe to use in noinstr code; prefer atomic_long_set_release() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_set_release(atomic_long_t *v, long i) { #ifdef CONFIG_64BIT raw_atomic64_set_release(v, i); #else raw_atomic_set_release(v, i); #endif } /** * raw_atomic_long_add() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. 
* * Safe to use in noinstr code; prefer atomic_long_add() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_add(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_add(i, v); #else raw_atomic_add(i, v); #endif } /** * raw_atomic_long_add_return() - atomic add with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_add_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_add_return(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_return(i, v); #else return raw_atomic_add_return(i, v); #endif } /** * raw_atomic_long_add_return_acquire() - atomic add with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_add_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_add_return_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_return_acquire(i, v); #else return raw_atomic_add_return_acquire(i, v); #endif } /** * raw_atomic_long_add_return_release() - atomic add with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_add_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_add_return_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_return_release(i, v); #else return raw_atomic_add_return_release(i, v); #endif } /** * raw_atomic_long_add_return_relaxed() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_add_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_add_return_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_return_relaxed(i, v); #else return raw_atomic_add_return_relaxed(i, v); #endif } /** * raw_atomic_long_fetch_add() - atomic add with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_add() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add(i, v); #else return raw_atomic_fetch_add(i, v); #endif } /** * raw_atomic_long_fetch_add_acquire() - atomic add with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_add_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add_acquire(i, v); #else return raw_atomic_fetch_add_acquire(i, v); #endif } /** * raw_atomic_long_fetch_add_release() - atomic add with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. 
* * Safe to use in noinstr code; prefer atomic_long_fetch_add_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add_release(i, v); #else return raw_atomic_fetch_add_release(i, v); #endif } /** * raw_atomic_long_fetch_add_relaxed() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_add_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add_relaxed(i, v); #else return raw_atomic_fetch_add_relaxed(i, v); #endif } /** * raw_atomic_long_sub() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_sub() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_sub(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_sub(i, v); #else raw_atomic_sub(i, v); #endif } /** * raw_atomic_long_sub_return() - atomic subtract with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_sub_return(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_return(i, v); #else return raw_atomic_sub_return(i, v); #endif } /** * raw_atomic_long_sub_return_acquire() - atomic subtract with acquire ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_sub_return_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_return_acquire(i, v); #else return raw_atomic_sub_return_acquire(i, v); #endif } /** * raw_atomic_long_sub_return_release() - atomic subtract with release ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_sub_return_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_return_release(i, v); #else return raw_atomic_sub_return_release(i, v); #endif } /** * raw_atomic_long_sub_return_relaxed() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_return_relaxed() elsewhere. * * Return: The updated value of @v. 
*/ static __always_inline long raw_atomic_long_sub_return_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_return_relaxed(i, v); #else return raw_atomic_sub_return_relaxed(i, v); #endif } /** * raw_atomic_long_fetch_sub() - atomic subtract with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_sub() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_sub(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_sub(i, v); #else return raw_atomic_fetch_sub(i, v); #endif } /** * raw_atomic_long_fetch_sub_acquire() - atomic subtract with acquire ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_sub_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_sub_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_sub_acquire(i, v); #else return raw_atomic_fetch_sub_acquire(i, v); #endif } /** * raw_atomic_long_fetch_sub_release() - atomic subtract with release ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_sub_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_sub_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_sub_release(i, v); #else return raw_atomic_fetch_sub_release(i, v); #endif } /** * raw_atomic_long_fetch_sub_relaxed() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_sub_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_sub_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_sub_relaxed(i, v); #else return raw_atomic_fetch_sub_relaxed(i, v); #endif } /** * raw_atomic_long_inc() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_inc() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_inc(atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_inc(v); #else raw_atomic_inc(v); #endif } /** * raw_atomic_long_inc_return() - atomic increment with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_inc_return(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_return(v); #else return raw_atomic_inc_return(v); #endif } /** * raw_atomic_long_inc_return_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_return_acquire() elsewhere. * * Return: The updated value of @v. 
*/ static __always_inline long raw_atomic_long_inc_return_acquire(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_return_acquire(v); #else return raw_atomic_inc_return_acquire(v); #endif } /** * raw_atomic_long_inc_return_release() - atomic increment with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_inc_return_release(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_return_release(v); #else return raw_atomic_inc_return_release(v); #endif } /** * raw_atomic_long_inc_return_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_inc_return_relaxed(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_return_relaxed(v); #else return raw_atomic_inc_return_relaxed(v); #endif } /** * raw_atomic_long_fetch_inc() - atomic increment with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_inc() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_inc(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_inc(v); #else return raw_atomic_fetch_inc(v); #endif } /** * raw_atomic_long_fetch_inc_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_inc_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_inc_acquire(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_inc_acquire(v); #else return raw_atomic_fetch_inc_acquire(v); #endif } /** * raw_atomic_long_fetch_inc_release() - atomic increment with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_inc_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_inc_release(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_inc_release(v); #else return raw_atomic_fetch_inc_release(v); #endif } /** * raw_atomic_long_fetch_inc_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_inc_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_inc_relaxed(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_inc_relaxed(v); #else return raw_atomic_fetch_inc_relaxed(v); #endif } /** * raw_atomic_long_dec() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_dec() elsewhere. * * Return: Nothing. 
*/ static __always_inline void raw_atomic_long_dec(atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_dec(v); #else raw_atomic_dec(v); #endif } /** * raw_atomic_long_dec_return() - atomic decrement with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_dec_return(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_return(v); #else return raw_atomic_dec_return(v); #endif } /** * raw_atomic_long_dec_return_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_dec_return_acquire(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_return_acquire(v); #else return raw_atomic_dec_return_acquire(v); #endif } /** * raw_atomic_long_dec_return_release() - atomic decrement with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_dec_return_release(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_return_release(v); #else return raw_atomic_dec_return_release(v); #endif } /** * raw_atomic_long_dec_return_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_dec_return_relaxed(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_return_relaxed(v); #else return raw_atomic_dec_return_relaxed(v); #endif } /** * raw_atomic_long_fetch_dec() - atomic decrement with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_dec() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_dec(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_dec(v); #else return raw_atomic_fetch_dec(v); #endif } /** * raw_atomic_long_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_dec_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_dec_acquire(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_dec_acquire(v); #else return raw_atomic_fetch_dec_acquire(v); #endif } /** * raw_atomic_long_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_dec_release() elsewhere. * * Return: The original value of @v. 
*/ static __always_inline long raw_atomic_long_fetch_dec_release(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_dec_release(v); #else return raw_atomic_fetch_dec_release(v); #endif } /** * raw_atomic_long_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_dec_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_dec_relaxed(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_dec_relaxed(v); #else return raw_atomic_fetch_dec_relaxed(v); #endif } /** * raw_atomic_long_and() - atomic bitwise AND with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_and() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_and(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_and(i, v); #else raw_atomic_and(i, v); #endif } /** * raw_atomic_long_fetch_and() - atomic bitwise AND with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_and() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_and(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_and(i, v); #else return raw_atomic_fetch_and(i, v); #endif } /** * raw_atomic_long_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_and_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_and_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_and_acquire(i, v); #else return raw_atomic_fetch_and_acquire(i, v); #endif } /** * raw_atomic_long_fetch_and_release() - atomic bitwise AND with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_and_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_and_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_and_release(i, v); #else return raw_atomic_fetch_and_release(i, v); #endif } /** * raw_atomic_long_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_and_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_and_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_and_relaxed(i, v); #else return raw_atomic_fetch_and_relaxed(i, v); #endif } /** * raw_atomic_long_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_andnot() elsewhere. * * Return: Nothing. 
*/ static __always_inline void raw_atomic_long_andnot(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_andnot(i, v); #else raw_atomic_andnot(i, v); #endif } /** * raw_atomic_long_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_andnot() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_andnot(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_andnot(i, v); #else return raw_atomic_fetch_andnot(i, v); #endif } /** * raw_atomic_long_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_andnot_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_andnot_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_andnot_acquire(i, v); #else return raw_atomic_fetch_andnot_acquire(i, v); #endif } /** * raw_atomic_long_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_andnot_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_andnot_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_andnot_release(i, v); #else return raw_atomic_fetch_andnot_release(i, v); #endif } /** * raw_atomic_long_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_andnot_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_andnot_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_andnot_relaxed(i, v); #else return raw_atomic_fetch_andnot_relaxed(i, v); #endif } /** * raw_atomic_long_or() - atomic bitwise OR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_or() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_or(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_or(i, v); #else raw_atomic_or(i, v); #endif } /** * raw_atomic_long_fetch_or() - atomic bitwise OR with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_or() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_or(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_or(i, v); #else return raw_atomic_fetch_or(i, v); #endif } /** * raw_atomic_long_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_or_acquire() elsewhere. * * Return: The original value of @v. 
*/ static __always_inline long raw_atomic_long_fetch_or_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_or_acquire(i, v); #else return raw_atomic_fetch_or_acquire(i, v); #endif } /** * raw_atomic_long_fetch_or_release() - atomic bitwise OR with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_or_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_or_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_or_release(i, v); #else return raw_atomic_fetch_or_release(i, v); #endif } /** * raw_atomic_long_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_or_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_or_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_or_relaxed(i, v); #else return raw_atomic_fetch_or_relaxed(i, v); #endif } /** * raw_atomic_long_xor() - atomic bitwise XOR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_xor() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_xor(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_xor(i, v); #else raw_atomic_xor(i, v); #endif } /** * raw_atomic_long_fetch_xor() - atomic bitwise XOR with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_xor() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_xor(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_xor(i, v); #else return raw_atomic_fetch_xor(i, v); #endif } /** * raw_atomic_long_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_xor_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_xor_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_xor_acquire(i, v); #else return raw_atomic_fetch_xor_acquire(i, v); #endif } /** * raw_atomic_long_fetch_xor_release() - atomic bitwise XOR with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_xor_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_xor_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_xor_release(i, v); #else return raw_atomic_fetch_xor_release(i, v); #endif } /** * raw_atomic_long_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_xor_relaxed() elsewhere. 
* * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_xor_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_xor_relaxed(i, v); #else return raw_atomic_fetch_xor_relaxed(i, v); #endif } /** * raw_atomic_long_xchg() - atomic exchange with full ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with full ordering. * * Safe to use in noinstr code; prefer atomic_long_xchg() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_xchg(atomic_long_t *v, long new) { #ifdef CONFIG_64BIT return raw_atomic64_xchg(v, new); #else return raw_atomic_xchg(v, new); #endif } /** * raw_atomic_long_xchg_acquire() - atomic exchange with acquire ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_xchg_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_xchg_acquire(atomic_long_t *v, long new) { #ifdef CONFIG_64BIT return raw_atomic64_xchg_acquire(v, new); #else return raw_atomic_xchg_acquire(v, new); #endif } /** * raw_atomic_long_xchg_release() - atomic exchange with release ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with release ordering. * * Safe to use in noinstr code; prefer atomic_long_xchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_xchg_release(atomic_long_t *v, long new) { #ifdef CONFIG_64BIT return raw_atomic64_xchg_release(v, new); #else return raw_atomic_xchg_release(v, new); #endif } /** * raw_atomic_long_xchg_relaxed() - atomic exchange with relaxed ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_xchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_xchg_relaxed(atomic_long_t *v, long new) { #ifdef CONFIG_64BIT return raw_atomic64_xchg_relaxed(v, new); #else return raw_atomic_xchg_relaxed(v, new); #endif } /** * raw_atomic_long_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_cmpxchg(atomic_long_t *v, long old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_cmpxchg(v, old, new); #else return raw_atomic_cmpxchg(v, old, new); #endif } /** * raw_atomic_long_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg_acquire() elsewhere. * * Return: The original value of @v. 
*/ static __always_inline long raw_atomic_long_cmpxchg_acquire(atomic_long_t *v, long old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_cmpxchg_acquire(v, old, new); #else return raw_atomic_cmpxchg_acquire(v, old, new); #endif } /** * raw_atomic_long_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_cmpxchg_release(atomic_long_t *v, long old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_cmpxchg_release(v, old, new); #else return raw_atomic_cmpxchg_release(v, old, new); #endif } /** * raw_atomic_long_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_cmpxchg_relaxed(atomic_long_t *v, long old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_cmpxchg_relaxed(v, old, new); #else return raw_atomic_cmpxchg_relaxed(v, old, new); #endif } /** * raw_atomic_long_try_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg() elsewhere. * * Return: @true if the exchange occurred, @false otherwise. */ static __always_inline bool raw_atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_try_cmpxchg(v, (s64 *)old, new); #else return raw_atomic_try_cmpxchg(v, (int *)old, new); #endif } /** * raw_atomic_long_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_acquire() elsewhere. * * Return: @true if the exchange occurred, @false otherwise. */ static __always_inline bool raw_atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_try_cmpxchg_acquire(v, (s64 *)old, new); #else return raw_atomic_try_cmpxchg_acquire(v, (int *)old, new); #endif } /** * raw_atomic_long_try_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. 
* * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_release() elsewhere. * * Return: @true if the exchange occurred, @false otherwise. */ static __always_inline bool raw_atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_try_cmpxchg_release(v, (s64 *)old, new); #else return raw_atomic_try_cmpxchg_release(v, (int *)old, new); #endif } /** * raw_atomic_long_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_relaxed() elsewhere. * * Return: @true if the exchange occurred, @false otherwise. */ static __always_inline bool raw_atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_try_cmpxchg_relaxed(v, (s64 *)old, new); #else return raw_atomic_try_cmpxchg_relaxed(v, (int *)old, new); #endif } /** * raw_atomic_long_sub_and_test() - atomic subtract and test if zero with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_long_sub_and_test(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_and_test(i, v); #else return raw_atomic_sub_and_test(i, v); #endif } /** * raw_atomic_long_dec_and_test() - atomic decrement and test if zero with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_long_dec_and_test(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_and_test(v); #else return raw_atomic_dec_and_test(v); #endif } /** * raw_atomic_long_inc_and_test() - atomic increment and test if zero with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_long_inc_and_test(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_and_test(v); #else return raw_atomic_inc_and_test(v); #endif } /** * raw_atomic_long_add_negative() - atomic add and test if negative with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_add_negative() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. 
*/ static __always_inline bool raw_atomic_long_add_negative(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_negative(i, v); #else return raw_atomic_add_negative(i, v); #endif } /** * raw_atomic_long_add_negative_acquire() - atomic add and test if negative with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_add_negative_acquire() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_long_add_negative_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_negative_acquire(i, v); #else return raw_atomic_add_negative_acquire(i, v); #endif } /** * raw_atomic_long_add_negative_release() - atomic add and test if negative with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_add_negative_release() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_long_add_negative_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_negative_release(i, v); #else return raw_atomic_add_negative_release(i, v); #endif } /** * raw_atomic_long_add_negative_relaxed() - atomic add and test if negative with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_add_negative_relaxed() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_long_add_negative_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_negative_relaxed(i, v); #else return raw_atomic_add_negative_relaxed(i, v); #endif } /** * raw_atomic_long_fetch_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_long_t * @a: long value to add * @u: long value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_fetch_add_unless() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add_unless(v, a, u); #else return raw_atomic_fetch_add_unless(v, a, u); #endif } /** * raw_atomic_long_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_long_t * @a: long value to add * @u: long value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_add_unless() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_long_add_unless(atomic_long_t *v, long a, long u) { #ifdef CONFIG_64BIT return raw_atomic64_add_unless(v, a, u); #else return raw_atomic_add_unless(v, a, u); #endif } /** * raw_atomic_long_inc_not_zero() - atomic increment unless zero with full ordering * @v: pointer to atomic_long_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. 
* Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_inc_not_zero() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_long_inc_not_zero(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_not_zero(v); #else return raw_atomic_inc_not_zero(v); #endif } /** * raw_atomic_long_inc_unless_negative() - atomic increment unless negative with full ordering * @v: pointer to atomic_long_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_inc_unless_negative() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_long_inc_unless_negative(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_unless_negative(v); #else return raw_atomic_inc_unless_negative(v); #endif } /** * raw_atomic_long_dec_unless_positive() - atomic decrement unless positive with full ordering * @v: pointer to atomic_long_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_dec_unless_positive() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_long_dec_unless_positive(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_unless_positive(v); #else return raw_atomic_dec_unless_positive(v); #endif } /** * raw_atomic_long_dec_if_positive() - atomic decrement if positive with full ordering * @v: pointer to atomic_long_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_dec_if_positive() elsewhere. * * Return: The old value of (@v - 1), regardless of whether @v was updated. */ static __always_inline long raw_atomic_long_dec_if_positive(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_if_positive(v); #else return raw_atomic_dec_if_positive(v); #endif } #endif /* _LINUX_ATOMIC_LONG_H */ // 4b882bf19018602c10816c52f8b4ae280adc887b |
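The wrappers above differ only in their memory-ordering suffix and in which value they hand back: the *_return variants return the updated value, the fetch_* variants return the value observed before the update, and try_cmpxchg() additionally rewrites *@old with the current value when the comparison fails. The following is a minimal usage sketch, not part of the header; the function and variable names are hypothetical, and per the kerneldoc above the non-raw atomic_long_*() operations should be preferred outside noinstr code.

#include <linux/atomic.h>

/* Hypothetical helper contrasting the return conventions documented above. */
static long demo_atomic_long_return_conventions(atomic_long_t *v)
{
	long updated, before, old;

	updated = raw_atomic_long_inc_return(v);	/* value of @v after the increment */
	before  = raw_atomic_long_fetch_inc(v);		/* value of @v before the increment */

	/* try_cmpxchg(): on failure, @old is refreshed with the current value of @v. */
	old = raw_atomic_long_read(v);
	while (!raw_atomic_long_try_cmpxchg(v, &old, old + 1))
		;					/* retry with the refreshed @old */

	return updated + before;
}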
// SPDX-License-Identifier: GPL-2.0 /* * Key setup facility for FS encryption support. * * Copyright (C) 2015, Google, Inc. * * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar. * Heavily modified since then. 
*/ #include <crypto/skcipher.h> #include <linux/export.h> #include <linux/random.h> #include "fscrypt_private.h" struct fscrypt_mode fscrypt_modes[] = { [FSCRYPT_MODE_AES_256_XTS] = { .friendly_name = "AES-256-XTS", .cipher_str = "xts(aes)", .keysize = 64, .security_strength = 32, .ivsize = 16, .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_256_XTS, }, [FSCRYPT_MODE_AES_256_CTS] = { .friendly_name = "AES-256-CBC-CTS", .cipher_str = "cts(cbc(aes))", .keysize = 32, .security_strength = 32, .ivsize = 16, }, [FSCRYPT_MODE_AES_128_CBC] = { .friendly_name = "AES-128-CBC-ESSIV", .cipher_str = "essiv(cbc(aes),sha256)", .keysize = 16, .security_strength = 16, .ivsize = 16, .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV, }, [FSCRYPT_MODE_AES_128_CTS] = { .friendly_name = "AES-128-CBC-CTS", .cipher_str = "cts(cbc(aes))", .keysize = 16, .security_strength = 16, .ivsize = 16, }, [FSCRYPT_MODE_SM4_XTS] = { .friendly_name = "SM4-XTS", .cipher_str = "xts(sm4)", .keysize = 32, .security_strength = 16, .ivsize = 16, .blk_crypto_mode = BLK_ENCRYPTION_MODE_SM4_XTS, }, [FSCRYPT_MODE_SM4_CTS] = { .friendly_name = "SM4-CBC-CTS", .cipher_str = "cts(cbc(sm4))", .keysize = 16, .security_strength = 16, .ivsize = 16, }, [FSCRYPT_MODE_ADIANTUM] = { .friendly_name = "Adiantum", .cipher_str = "adiantum(xchacha12,aes)", .keysize = 32, .security_strength = 32, .ivsize = 32, .blk_crypto_mode = BLK_ENCRYPTION_MODE_ADIANTUM, }, [FSCRYPT_MODE_AES_256_HCTR2] = { .friendly_name = "AES-256-HCTR2", .cipher_str = "hctr2(aes)", .keysize = 32, .security_strength = 32, .ivsize = 32, }, }; static DEFINE_MUTEX(fscrypt_mode_key_setup_mutex); static struct fscrypt_mode * select_encryption_mode(const union fscrypt_policy *policy, const struct inode *inode) { BUILD_BUG_ON(ARRAY_SIZE(fscrypt_modes) != FSCRYPT_MODE_MAX + 1); if (S_ISREG(inode->i_mode)) return &fscrypt_modes[fscrypt_policy_contents_mode(policy)]; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) return &fscrypt_modes[fscrypt_policy_fnames_mode(policy)]; WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %llu, which is not encryptable (file type %d)\n", inode->i_ino, (inode->i_mode & S_IFMT)); return ERR_PTR(-EINVAL); } /* Create a symmetric cipher object for the given encryption mode and key */ static struct crypto_sync_skcipher * fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key, const struct inode *inode) { struct crypto_sync_skcipher *tfm; int err; tfm = crypto_alloc_sync_skcipher(mode->cipher_str, 0, FSCRYPT_CRYPTOAPI_MASK); if (IS_ERR(tfm)) { if (PTR_ERR(tfm) == -ENOENT) { fscrypt_warn(inode, "Missing crypto API support for %s (API name: \"%s\")", mode->friendly_name, mode->cipher_str); return ERR_PTR(-ENOPKG); } fscrypt_err(inode, "Error allocating '%s' transform: %ld", mode->cipher_str, PTR_ERR(tfm)); return tfm; } if (!xchg(&mode->logged_cryptoapi_impl, 1)) { /* * fscrypt performance can vary greatly depending on which * crypto algorithm implementation is used. Help people debug * performance problems by logging the ->cra_driver_name the * first time a mode is used. 
*/ pr_info("fscrypt: %s using implementation \"%s\"\n", mode->friendly_name, crypto_skcipher_driver_name(&tfm->base)); } if (WARN_ON_ONCE(crypto_sync_skcipher_ivsize(tfm) != mode->ivsize)) { err = -EINVAL; goto err_free_tfm; } crypto_sync_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); err = crypto_sync_skcipher_setkey(tfm, raw_key, mode->keysize); if (err) goto err_free_tfm; return tfm; err_free_tfm: crypto_free_sync_skcipher(tfm); return ERR_PTR(err); } /* * Prepare the crypto transform object or blk-crypto key in @prep_key, given the * raw key, encryption mode (@ci->ci_mode), flag indicating which encryption * implementation (fs-layer or blk-crypto) will be used (@ci->ci_inlinecrypt), * and IV generation method (@ci->ci_policy.flags). */ int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_inode_info *ci) { struct crypto_sync_skcipher *tfm; if (fscrypt_using_inline_encryption(ci)) return fscrypt_prepare_inline_crypt_key(prep_key, raw_key, ci->ci_mode->keysize, false, ci); tfm = fscrypt_allocate_skcipher(ci->ci_mode, raw_key, ci->ci_inode); if (IS_ERR(tfm)) return PTR_ERR(tfm); /* * Pairs with the smp_load_acquire() in fscrypt_is_key_prepared(). * I.e., here we publish ->tfm with a RELEASE barrier so that * concurrent tasks can ACQUIRE it. Note that this concurrency is only * possible for per-mode keys, not for per-file keys. */ smp_store_release(&prep_key->tfm, tfm); return 0; } /* Destroy a crypto transform object and/or blk-crypto key. */ void fscrypt_destroy_prepared_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key) { crypto_free_sync_skcipher(prep_key->tfm); fscrypt_destroy_inline_crypt_key(sb, prep_key); memzero_explicit(prep_key, sizeof(*prep_key)); } /* Given a per-file encryption key, set up the file's crypto transform object */ int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci, const u8 *raw_key) { ci->ci_owns_key = true; return fscrypt_prepare_key(&ci->ci_enc_key, raw_key, ci); } static int setup_per_mode_enc_key(struct fscrypt_inode_info *ci, struct fscrypt_master_key *mk, struct fscrypt_prepared_key *keys, u8 hkdf_context, bool include_fs_uuid) { const struct inode *inode = ci->ci_inode; const struct super_block *sb = inode->i_sb; struct fscrypt_mode *mode = ci->ci_mode; const u8 mode_num = mode - fscrypt_modes; struct fscrypt_prepared_key *prep_key; u8 mode_key[FSCRYPT_MAX_RAW_KEY_SIZE]; u8 hkdf_info[sizeof(mode_num) + sizeof(sb->s_uuid)]; unsigned int hkdf_infolen = 0; bool use_hw_wrapped_key = false; int err; if (WARN_ON_ONCE(mode_num > FSCRYPT_MODE_MAX)) return -EINVAL; if (mk->mk_secret.is_hw_wrapped && S_ISREG(inode->i_mode)) { /* Using a hardware-wrapped key for file contents encryption */ if (!fscrypt_using_inline_encryption(ci)) { if (sb->s_flags & SB_INLINECRYPT) fscrypt_warn(ci->ci_inode, "Hardware-wrapped key required, but no suitable inline encryption capabilities are available"); else fscrypt_warn(ci->ci_inode, "Hardware-wrapped keys require inline encryption (-o inlinecrypt)"); return -EINVAL; } use_hw_wrapped_key = true; } prep_key = &keys[mode_num]; if (fscrypt_is_key_prepared(prep_key, ci)) { ci->ci_enc_key = *prep_key; return 0; } mutex_lock(&fscrypt_mode_key_setup_mutex); if (fscrypt_is_key_prepared(prep_key, ci)) goto done_unlock; if (use_hw_wrapped_key) { err = fscrypt_prepare_inline_crypt_key(prep_key, mk->mk_secret.bytes, mk->mk_secret.size, true, ci); if (err) goto out_unlock; goto done_unlock; } BUILD_BUG_ON(sizeof(mode_num) != 1); 
BUILD_BUG_ON(sizeof(sb->s_uuid) != 16); BUILD_BUG_ON(sizeof(hkdf_info) != 17); hkdf_info[hkdf_infolen++] = mode_num; if (include_fs_uuid) { memcpy(&hkdf_info[hkdf_infolen], &sb->s_uuid, sizeof(sb->s_uuid)); hkdf_infolen += sizeof(sb->s_uuid); } fscrypt_hkdf_expand(&mk->mk_secret.hkdf, hkdf_context, hkdf_info, hkdf_infolen, mode_key, mode->keysize); err = fscrypt_prepare_key(prep_key, mode_key, ci); memzero_explicit(mode_key, mode->keysize); if (err) goto out_unlock; done_unlock: ci->ci_enc_key = *prep_key; err = 0; out_unlock: mutex_unlock(&fscrypt_mode_key_setup_mutex); return err; } /* * Derive a SipHash key from the given fscrypt master key and the given * application-specific information string. * * Note that the KDF produces a byte array, but the SipHash APIs expect the key * as a pair of 64-bit words. Therefore, on big endian CPUs we have to do an * endianness swap in order to get the same results as on little endian CPUs. */ static void fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk, u8 context, const u8 *info, unsigned int infolen, siphash_key_t *key) { fscrypt_hkdf_expand(&mk->mk_secret.hkdf, context, info, infolen, (u8 *)key, sizeof(*key)); BUILD_BUG_ON(sizeof(*key) != 16); BUILD_BUG_ON(ARRAY_SIZE(key->key) != 2); le64_to_cpus(&key->key[0]); le64_to_cpus(&key->key[1]); } void fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk) { fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_DIRHASH_KEY, ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, &ci->ci_dirhash_key); ci->ci_dirhash_key_initialized = true; } void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk) { WARN_ON_ONCE(ci->ci_inode->i_ino == 0); WARN_ON_ONCE(!mk->mk_ino_hash_key_initialized); ci->ci_hashed_ino = (u32)siphash_1u64(ci->ci_inode->i_ino, &mk->mk_ino_hash_key); } static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_inode_info *ci, struct fscrypt_master_key *mk) { int err; err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_32_keys, HKDF_CONTEXT_IV_INO_LBLK_32_KEY, true); if (err) return err; /* pairs with smp_store_release() below */ if (!smp_load_acquire(&mk->mk_ino_hash_key_initialized)) { mutex_lock(&fscrypt_mode_key_setup_mutex); if (mk->mk_ino_hash_key_initialized) goto unlock; fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_INODE_HASH_KEY, NULL, 0, &mk->mk_ino_hash_key); /* pairs with smp_load_acquire() above */ smp_store_release(&mk->mk_ino_hash_key_initialized, true); unlock: mutex_unlock(&fscrypt_mode_key_setup_mutex); } /* * New inodes may not have an inode number assigned yet. * Hashing their inode number is delayed until later. */ if (ci->ci_inode->i_ino) fscrypt_hash_inode_number(ci, mk); return 0; } static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci, struct fscrypt_master_key *mk, bool need_dirhash_key) { int err; if (mk->mk_secret.is_hw_wrapped && !(ci->ci_policy.v2.flags & (FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 | FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32))) { fscrypt_warn(ci->ci_inode, "Hardware-wrapped keys are only supported with IV_INO_LBLK policies"); return -EINVAL; } if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) { /* * DIRECT_KEY: instead of deriving per-file encryption keys, the * per-file nonce will be included in all the IVs. But unlike * v1 policies, for v2 policies in this case we don't encrypt * with the master key directly but rather derive a per-mode * encryption key. This ensures that the master key is * consistently used only for HKDF, avoiding key reuse issues. 
*/ err = setup_per_mode_enc_key(ci, mk, mk->mk_direct_keys, HKDF_CONTEXT_DIRECT_KEY, false); } else if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) { /* * IV_INO_LBLK_64: encryption keys are derived from (master_key, * mode_num, filesystem_uuid), and inode number is included in * the IVs. This format is optimized for use with inline * encryption hardware compliant with the UFS standard. */ err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_64_keys, HKDF_CONTEXT_IV_INO_LBLK_64_KEY, true); } else if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) { err = fscrypt_setup_iv_ino_lblk_32_key(ci, mk); } else { u8 derived_key[FSCRYPT_MAX_RAW_KEY_SIZE]; fscrypt_hkdf_expand(&mk->mk_secret.hkdf, HKDF_CONTEXT_PER_FILE_ENC_KEY, ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, derived_key, ci->ci_mode->keysize); err = fscrypt_set_per_file_enc_key(ci, derived_key); memzero_explicit(derived_key, ci->ci_mode->keysize); } if (err) return err; /* Derive a secret dirhash key for directories that need it. */ if (need_dirhash_key) fscrypt_derive_dirhash_key(ci, mk); return 0; } /* * Check whether the size of the given master key (@mk) is appropriate for the * encryption settings which a particular file will use (@ci). * * If the file uses a v1 encryption policy, then the master key must be at least * as long as the derived key, as this is a requirement of the v1 KDF. * * Otherwise, the KDF can accept any size key, so we enforce a slightly looser * requirement: we require that the size of the master key be at least the * maximum security strength of any algorithm whose key will be derived from it * (but in practice we only need to consider @ci->ci_mode, since any other * possible subkeys such as DIRHASH and INODE_HASH will never increase the * required key size over @ci->ci_mode). This allows AES-256-XTS keys to be * derived from a 256-bit master key, which is cryptographically sufficient, * rather than requiring a 512-bit master key which is unnecessarily long. (We * still allow 512-bit master keys if the user chooses to use them, though.) */ static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk, const struct fscrypt_inode_info *ci) { unsigned int min_keysize; if (ci->ci_policy.version == FSCRYPT_POLICY_V1) min_keysize = ci->ci_mode->keysize; else min_keysize = ci->ci_mode->security_strength; if (mk->mk_secret.size < min_keysize) { fscrypt_warn(NULL, "key with %s %*phN is too short (got %u bytes, need %u+ bytes)", master_key_spec_type(&mk->mk_spec), master_key_spec_len(&mk->mk_spec), (u8 *)&mk->mk_spec.u, mk->mk_secret.size, min_keysize); return false; } return true; } /* * Find the master key, then set up the inode's actual encryption key. * * If the master key is found in the filesystem-level keyring, then it is * returned in *mk_ret with its semaphore read-locked. This is needed to ensure * that only one task links the fscrypt_inode_info into ->mk_decrypted_inodes * (as multiple tasks may race to create an fscrypt_inode_info for the same * inode), and to synchronize the master key being removed with a new inode * starting to use it. 
*/ static int setup_file_encryption_key(struct fscrypt_inode_info *ci, bool need_dirhash_key, struct fscrypt_master_key **mk_ret) { struct super_block *sb = ci->ci_inode->i_sb; struct fscrypt_key_specifier mk_spec; struct fscrypt_master_key *mk; int err; err = fscrypt_policy_to_key_spec(&ci->ci_policy, &mk_spec); if (err) return err; mk = fscrypt_find_master_key(sb, &mk_spec); if (unlikely(!mk)) { const union fscrypt_policy *dummy_policy = fscrypt_get_dummy_policy(sb); /* * Add the test_dummy_encryption key on-demand. In principle, * it should be added at mount time. Do it here instead so that * the individual filesystems don't need to worry about adding * this key at mount time and cleaning up on mount failure. */ if (dummy_policy && fscrypt_policies_equal(dummy_policy, &ci->ci_policy)) { err = fscrypt_add_test_dummy_key(sb, &mk_spec); if (err) return err; mk = fscrypt_find_master_key(sb, &mk_spec); } } if (unlikely(!mk)) { if (ci->ci_policy.version != FSCRYPT_POLICY_V1) return -ENOKEY; err = fscrypt_select_encryption_impl(ci, false); if (err) return err; /* * As a legacy fallback for v1 policies, search for the key in * the current task's subscribed keyrings too. Don't move this * to before the search of ->s_master_keys, since users * shouldn't be able to override filesystem-level keys. */ return fscrypt_setup_v1_file_key_via_subscribed_keyrings(ci); } down_read(&mk->mk_sem); if (!mk->mk_present) { /* FS_IOC_REMOVE_ENCRYPTION_KEY has been executed on this key */ err = -ENOKEY; goto out_release_key; } if (!fscrypt_valid_master_key_size(mk, ci)) { err = -ENOKEY; goto out_release_key; } err = fscrypt_select_encryption_impl(ci, mk->mk_secret.is_hw_wrapped); if (err) goto out_release_key; switch (ci->ci_policy.version) { case FSCRYPT_POLICY_V1: if (WARN_ON_ONCE(mk->mk_secret.is_hw_wrapped)) { /* * This should never happen, as adding a v1 policy key * that is hardware-wrapped isn't allowed. */ err = -EINVAL; goto out_release_key; } err = fscrypt_setup_v1_file_key(ci, mk->mk_secret.bytes); break; case FSCRYPT_POLICY_V2: err = fscrypt_setup_v2_file_key(ci, mk, need_dirhash_key); break; default: WARN_ON_ONCE(1); err = -EINVAL; break; } if (err) goto out_release_key; *mk_ret = mk; return 0; out_release_key: up_read(&mk->mk_sem); fscrypt_put_master_key(mk); return err; } static void put_crypt_info(struct fscrypt_inode_info *ci) { struct fscrypt_master_key *mk; if (!ci) return; if (ci->ci_direct_key) fscrypt_put_direct_key(ci->ci_direct_key); else if (ci->ci_owns_key) fscrypt_destroy_prepared_key(ci->ci_inode->i_sb, &ci->ci_enc_key); mk = ci->ci_master_key; if (mk) { /* * Remove this inode from the list of inodes that were unlocked * with the master key. In addition, if we're removing the last * inode from an incompletely removed key, then complete the * full removal of the key. 
*/ spin_lock(&mk->mk_decrypted_inodes_lock); list_del(&ci->ci_master_key_link); spin_unlock(&mk->mk_decrypted_inodes_lock); fscrypt_put_master_key_activeref(ci->ci_inode->i_sb, mk); } memzero_explicit(ci, sizeof(*ci)); kmem_cache_free(fscrypt_inode_info_cachep, ci); } static int fscrypt_setup_encryption_info(struct inode *inode, const union fscrypt_policy *policy, const u8 nonce[FSCRYPT_FILE_NONCE_SIZE], bool need_dirhash_key) { struct fscrypt_inode_info *crypt_info; struct fscrypt_mode *mode; struct fscrypt_master_key *mk = NULL; int res; res = fscrypt_initialize(inode->i_sb); if (res) return res; crypt_info = kmem_cache_zalloc(fscrypt_inode_info_cachep, GFP_KERNEL); if (!crypt_info) return -ENOMEM; crypt_info->ci_inode = inode; crypt_info->ci_policy = *policy; memcpy(crypt_info->ci_nonce, nonce, FSCRYPT_FILE_NONCE_SIZE); mode = select_encryption_mode(&crypt_info->ci_policy, inode); if (IS_ERR(mode)) { res = PTR_ERR(mode); goto out; } WARN_ON_ONCE(mode->ivsize > FSCRYPT_MAX_IV_SIZE); crypt_info->ci_mode = mode; crypt_info->ci_data_unit_bits = fscrypt_policy_du_bits(&crypt_info->ci_policy, inode); res = setup_file_encryption_key(crypt_info, need_dirhash_key, &mk); if (res) goto out; /* * For existing inodes, multiple tasks may race to set the inode's * fscrypt info pointer. So use cmpxchg_release(). This pairs with the * smp_load_acquire() in fscrypt_get_inode_info(). I.e., publish the * pointer with a RELEASE barrier so that other tasks can ACQUIRE it. */ if (cmpxchg_release(fscrypt_inode_info_addr(inode), NULL, crypt_info) == NULL) { /* * We won the race and set the inode's fscrypt info to our * crypt_info. Now link it into the master key's inode list. */ if (mk) { crypt_info->ci_master_key = mk; refcount_inc(&mk->mk_active_refs); spin_lock(&mk->mk_decrypted_inodes_lock); list_add(&crypt_info->ci_master_key_link, &mk->mk_decrypted_inodes); spin_unlock(&mk->mk_decrypted_inodes_lock); } crypt_info = NULL; } res = 0; out: if (mk) { up_read(&mk->mk_sem); fscrypt_put_master_key(mk); } put_crypt_info(crypt_info); return res; } /** * fscrypt_get_encryption_info() - set up an inode's encryption key * @inode: the inode to set up the key for. Must be encrypted. * @allow_unsupported: if %true, treat an unsupported encryption policy (or * unrecognized encryption context) the same way as the key * being unavailable, instead of returning an error. Use * %false unless the operation being performed is needed in * order for files (or directories) to be deleted. * * Set up the inode's encryption key, if it hasn't already been done. * * Note: unless the key setup was already done, this isn't %GFP_NOFS-safe. So * generally this shouldn't be called from within a filesystem transaction. * * Return: 0 if the key is now set up, *or* if it couldn't be set up because the * needed master key is absent. (Use fscrypt_has_encryption_key() to * distinguish these cases.) Also can return another -errno code. 
*/ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) { int res; union fscrypt_context ctx; union fscrypt_policy policy; if (fscrypt_has_encryption_key(inode)) return 0; res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { if (res == -ERANGE && allow_unsupported) return 0; fscrypt_warn(inode, "Error %d getting encryption context", res); return res; } res = fscrypt_policy_from_context(&policy, &ctx, res); if (res) { if (allow_unsupported) return 0; fscrypt_warn(inode, "Unrecognized or corrupt encryption context"); return res; } if (!fscrypt_supported_policy(&policy, inode)) { if (allow_unsupported) return 0; return -EINVAL; } res = fscrypt_setup_encryption_info(inode, &policy, fscrypt_context_nonce(&ctx), IS_CASEFOLDED(inode) && S_ISDIR(inode->i_mode)); if (res == -ENOPKG && allow_unsupported) /* Algorithm unavailable? */ res = 0; if (res == -ENOKEY) res = 0; return res; } /** * fscrypt_prepare_new_inode() - prepare to create a new inode in a directory * @dir: a possibly-encrypted directory * @inode: the new inode. ->i_mode and ->i_blkbits must be set already. * ->i_ino doesn't need to be set yet. * @encrypt_ret: (output) set to %true if the new inode will be encrypted * * If the directory is encrypted, set up its encryption key in preparation for * encrypting the name of the new file. Also, if the new inode will be * encrypted, set up its encryption key too and set *encrypt_ret=true. * * This isn't %GFP_NOFS-safe, and therefore it should be called before starting * any filesystem transaction to create the inode. For this reason, ->i_ino * isn't required to be set yet, as the filesystem may not have set it yet. * * This doesn't persist the new inode's encryption context. That still needs to * be done later by calling fscrypt_set_context(). * * Return: 0 on success, -ENOKEY if a key needs to be set up for @dir or @inode * but the needed master key is absent, or another -errno code */ int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, bool *encrypt_ret) { const union fscrypt_policy *policy; u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; policy = fscrypt_policy_to_inherit(dir); if (policy == NULL) return 0; if (IS_ERR(policy)) return PTR_ERR(policy); if (WARN_ON_ONCE(inode->i_blkbits == 0)) return -EINVAL; if (WARN_ON_ONCE(inode->i_mode == 0)) return -EINVAL; /* * Only regular files, directories, and symlinks are encrypted. * Special files like device nodes and named pipes aren't. */ if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && !S_ISLNK(inode->i_mode)) return 0; *encrypt_ret = true; get_random_bytes(nonce, FSCRYPT_FILE_NONCE_SIZE); return fscrypt_setup_encryption_info(inode, policy, nonce, IS_CASEFOLDED(dir) && S_ISDIR(inode->i_mode)); } EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode); /** * fscrypt_put_encryption_info() - free most of an inode's fscrypt data * @inode: an inode being evicted * * Free the inode's fscrypt_inode_info. Filesystems must call this when the * inode is being evicted. An RCU grace period need not have elapsed yet. */ void fscrypt_put_encryption_info(struct inode *inode) { /* * Ideally we'd start with a lightweight IS_ENCRYPTED() check here * before proceeding to retrieve and check the pointer. However, during * inode creation, the fscrypt_inode_info is set before S_ENCRYPTED. If * an error occurs, it needs to be cleaned up regardless. 
*/ struct fscrypt_inode_info **ci_addr = fscrypt_inode_info_addr(inode); put_crypt_info(*ci_addr); *ci_addr = NULL; } EXPORT_SYMBOL(fscrypt_put_encryption_info); /** * fscrypt_free_inode() - free an inode's fscrypt data requiring RCU delay * @inode: an inode being freed * * Free the inode's cached decrypted symlink target, if any. Filesystems must * call this after an RCU grace period, just before they free the inode. */ void fscrypt_free_inode(struct inode *inode) { if (IS_ENCRYPTED(inode) && S_ISLNK(inode->i_mode)) { kfree(inode->i_link); inode->i_link = NULL; } } EXPORT_SYMBOL(fscrypt_free_inode); /** * fscrypt_drop_inode() - check whether the inode's master key has been removed * @inode: an inode being considered for eviction * * Filesystems supporting fscrypt must call this from their ->drop_inode() * method so that encrypted inodes are evicted as soon as they're no longer in * use and their master key has been removed. * * Return: 1 if fscrypt wants the inode to be evicted now, otherwise 0 */ int fscrypt_drop_inode(struct inode *inode) { const struct fscrypt_inode_info *ci = fscrypt_get_inode_info(inode); /* * If ci is NULL, then the inode doesn't have an encryption key set up * so it's irrelevant. If ci_master_key is NULL, then the master key * was provided via the legacy mechanism of the process-subscribed * keyrings, so we don't know whether it's been removed or not. */ if (!ci || !ci->ci_master_key) return 0; /* * With proper, non-racy use of FS_IOC_REMOVE_ENCRYPTION_KEY, all inodes * protected by the key were cleaned by sync_filesystem(). But if * userspace is still using the files, inodes can be dirtied between * then and now. We mustn't lose any writes, so skip dirty inodes here. */ if (inode_state_read(inode) & I_DIRTY_ALL) return 0; /* * We can't take ->mk_sem here, since this runs in atomic context. * Therefore, ->mk_present can change concurrently, and our result may * immediately become outdated. But there's no correctness problem with * unnecessarily evicting. Nor is there a correctness problem with not * evicting while iput() is racing with the key being removed, since * then the thread removing the key will either evict the inode itself * or will correctly detect that it wasn't evicted due to the race. */ return !READ_ONCE(ci->ci_master_key->mk_present); } EXPORT_SYMBOL_GPL(fscrypt_drop_inode); |
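The exported helpers at the end of this file are intended to be wired into a filesystem's inode lifecycle. The sketch below shows where such calls would typically land; it is illustrative only, the examplefs_* names are made up, and the fs-private allocation and error handling are elided, while fscrypt_prepare_new_inode(), fscrypt_set_context(), fscrypt_put_encryption_info(), fscrypt_free_inode(), and fscrypt_drop_inode() are the interfaces documented above.

#include <linux/fs.h>
#include <linux/fscrypt.h>

/* Creation path: choose the policy and nonce before the fs transaction starts,
 * then persist the encryption context once the on-disk inode exists. */
static int examplefs_init_new_inode(struct inode *dir, struct inode *inode)
{
	bool encrypt = false;
	int err;

	err = fscrypt_prepare_new_inode(dir, inode, &encrypt);
	if (err)
		return err;
	/* ... allocate the on-disk inode within the fs transaction ... */
	if (encrypt)
		err = fscrypt_set_context(inode, NULL);	/* persist the context */
	return err;
}

/* Eviction path: drop unused inodes promptly once their master key is gone. */
static int examplefs_drop_inode(struct inode *inode)
{
	return generic_drop_inode(inode) || fscrypt_drop_inode(inode);
}

static void examplefs_evict_inode(struct inode *inode)
{
	truncate_inode_pages_final(&inode->i_data);
	clear_inode(inode);
	fscrypt_put_encryption_info(inode);	/* frees the fscrypt_inode_info */
}

/* ->free_inode() runs after an RCU grace period, as fscrypt_free_inode() requires. */
static void examplefs_free_inode(struct inode *inode)
{
	fscrypt_free_inode(inode);
	/* ... free the fs-private inode structure ... */
}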
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2008 IBM Corporation * Author: Mimi Zohar <zohar@us.ibm.com> * * ima_policy.c * - initialize default measure policy rules */ #include <linux/init.h> #include <linux/list.h> #include <linux/kernel_read_file.h> #include <linux/fs.h> #include <linux/security.h> #include <linux/magic.h> #include <linux/parser.h> #include <linux/slab.h> #include <linux/rculist.h> #include <linux/seq_file.h> #include <linux/ima.h> #include "ima.h" /* flags definitions */ #define IMA_FUNC 0x0001 #define IMA_MASK 0x0002 #define IMA_FSMAGIC 0x0004 #define IMA_UID 0x0008 #define IMA_FOWNER 0x0010 #define IMA_FSUUID 0x0020 #define IMA_INMASK 0x0040 #define IMA_EUID 0x0080 #define IMA_PCR 0x0100 #define IMA_FSNAME 0x0200 #define IMA_KEYRINGS 0x0400 #define IMA_LABEL 0x0800 #define IMA_VALIDATE_ALGOS 0x1000 #define IMA_GID 0x2000 #define IMA_EGID 0x4000 #define IMA_FGROUP 0x8000 #define IMA_FS_SUBTYPE 0x10000 #define UNKNOWN 0 #define MEASURE 0x0001 /* same as IMA_MEASURE */ #define DONT_MEASURE 0x0002 #define APPRAISE 0x0004 /* same as IMA_APPRAISE */ #define DONT_APPRAISE 0x0008 #define AUDIT 0x0040 #define DONT_AUDIT 0x0080 #define HASH 0x0100 #define DONT_HASH 0x0200 #define INVALID_PCR(a) (((a) < 0) || \ (a) >= (sizeof_field(struct ima_iint_cache, measured_pcrs) * 8)) int ima_policy_flag; static int temp_ima_appraise; static int build_ima_appraise __ro_after_init; atomic_t ima_setxattr_allowed_hash_algorithms; #define MAX_LSM_RULES 6 enum lsm_rule_types { LSM_OBJ_USER, LSM_OBJ_ROLE, LSM_OBJ_TYPE, LSM_SUBJ_USER, LSM_SUBJ_ROLE, LSM_SUBJ_TYPE }; enum policy_types { ORIGINAL_TCB = 1, DEFAULT_TCB }; enum policy_rule_list { IMA_DEFAULT_POLICY = 1, IMA_CUSTOM_POLICY }; struct ima_rule_opt_list { size_t count; char *items[] __counted_by(count); }; /* * These comparators are needed nowhere outside of ima so just define them here. * This pattern should hopefully never be needed outside of ima.
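 * (They back the fowner>/fowner< and fgroup>/fgroup< operators that
 * ima_parse_rule() below wires up as fowner_op/fgroup_op handlers.)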
*/ static inline bool vfsuid_gt_kuid(vfsuid_t vfsuid, kuid_t kuid) { return __vfsuid_val(vfsuid) > __kuid_val(kuid); } static inline bool vfsgid_gt_kgid(vfsgid_t vfsgid, kgid_t kgid) { return __vfsgid_val(vfsgid) > __kgid_val(kgid); } static inline bool vfsuid_lt_kuid(vfsuid_t vfsuid, kuid_t kuid) { return __vfsuid_val(vfsuid) < __kuid_val(kuid); } static inline bool vfsgid_lt_kgid(vfsgid_t vfsgid, kgid_t kgid) { return __vfsgid_val(vfsgid) < __kgid_val(kgid); } struct ima_rule_entry { struct list_head list; int action; unsigned int flags; enum ima_hooks func; int mask; unsigned long fsmagic; uuid_t fsuuid; kuid_t uid; kgid_t gid; kuid_t fowner; kgid_t fgroup; bool (*uid_op)(kuid_t cred_uid, kuid_t rule_uid); /* Handlers for operators */ bool (*gid_op)(kgid_t cred_gid, kgid_t rule_gid); bool (*fowner_op)(vfsuid_t vfsuid, kuid_t rule_uid); /* vfsuid_eq_kuid(), vfsuid_gt_kuid(), vfsuid_lt_kuid() */ bool (*fgroup_op)(vfsgid_t vfsgid, kgid_t rule_gid); /* vfsgid_eq_kgid(), vfsgid_gt_kgid(), vfsgid_lt_kgid() */ int pcr; unsigned int allowed_algos; /* bitfield of allowed hash algorithms */ struct { void *rule; /* LSM file metadata specific */ char *args_p; /* audit value */ int type; /* audit type */ } lsm[MAX_LSM_RULES]; char *fsname; char *fs_subtype; struct ima_rule_opt_list *keyrings; /* Measure keys added to these keyrings */ struct ima_rule_opt_list *label; /* Measure data grouped under this label */ struct ima_template_desc *template; }; /* * sanity check in case the kernels gains more hash algorithms that can * fit in an unsigned int */ static_assert( 8 * sizeof(unsigned int) >= HASH_ALGO__LAST, "The bitfield allowed_algos in ima_rule_entry is too small to contain all the supported hash algorithms, consider using a bigger type"); /* * Without LSM specific knowledge, the default policy can only be * written in terms of .action, .func, .mask, .fsmagic, .uid, .gid, * .fowner, and .fgroup */ /* * The minimum rule set to allow for full TCB coverage. Measures all files * opened or mmap for exec and everything read by root. Dangerous because * normal users can easily run the machine out of memory simply building * and running executables. 
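 * The dont_measure_rules below carve pseudo-filesystems (procfs, sysfs,
 * debugfs, selinuxfs, the cgroup filesystems, etc.) out of that coverage.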
*/ static struct ima_rule_entry dont_measure_rules[] __ro_after_init = { {.action = DONT_MEASURE, .fsmagic = PROC_SUPER_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = SYSFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = DEBUGFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = TMPFS_MAGIC, .func = FILE_CHECK, .flags = IMA_FSMAGIC | IMA_FUNC}, {.action = DONT_MEASURE, .fsmagic = DEVPTS_SUPER_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = BINFMTFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = SECURITYFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = SELINUX_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = SMACK_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = CGROUP_SUPER_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = CGROUP2_SUPER_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = NSFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = EFIVARFS_MAGIC, .flags = IMA_FSMAGIC} }; static struct ima_rule_entry original_measurement_rules[] __ro_after_init = { {.action = MEASURE, .func = MMAP_CHECK, .mask = MAY_EXEC, .flags = IMA_FUNC | IMA_MASK}, {.action = MEASURE, .func = BPRM_CHECK, .mask = MAY_EXEC, .flags = IMA_FUNC | IMA_MASK}, {.action = MEASURE, .func = FILE_CHECK, .mask = MAY_READ, .uid = GLOBAL_ROOT_UID, .uid_op = &uid_eq, .flags = IMA_FUNC | IMA_MASK | IMA_UID}, {.action = MEASURE, .func = MODULE_CHECK, .flags = IMA_FUNC}, {.action = MEASURE, .func = FIRMWARE_CHECK, .flags = IMA_FUNC}, }; static struct ima_rule_entry default_measurement_rules[] __ro_after_init = { {.action = MEASURE, .func = MMAP_CHECK, .mask = MAY_EXEC, .flags = IMA_FUNC | IMA_MASK}, {.action = MEASURE, .func = BPRM_CHECK, .mask = MAY_EXEC, .flags = IMA_FUNC | IMA_MASK}, {.action = MEASURE, .func = FILE_CHECK, .mask = MAY_READ, .uid = GLOBAL_ROOT_UID, .uid_op = &uid_eq, .flags = IMA_FUNC | IMA_INMASK | IMA_EUID}, {.action = MEASURE, .func = FILE_CHECK, .mask = MAY_READ, .uid = GLOBAL_ROOT_UID, .uid_op = &uid_eq, .flags = IMA_FUNC | IMA_INMASK | IMA_UID}, {.action = MEASURE, .func = MODULE_CHECK, .flags = IMA_FUNC}, {.action = MEASURE, .func = FIRMWARE_CHECK, .flags = IMA_FUNC}, {.action = MEASURE, .func = POLICY_CHECK, .flags = IMA_FUNC}, }; static struct ima_rule_entry default_appraise_rules[] __ro_after_init = { {.action = DONT_APPRAISE, .fsmagic = PROC_SUPER_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = SYSFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = DEBUGFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = TMPFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = RAMFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = DEVPTS_SUPER_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = BINFMTFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = SECURITYFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = SELINUX_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = SMACK_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = NSFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = EFIVARFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = CGROUP_SUPER_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = CGROUP2_SUPER_MAGIC, .flags = IMA_FSMAGIC}, #ifdef CONFIG_IMA_WRITE_POLICY {.action = APPRAISE, .func = 
POLICY_CHECK, .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED}, #endif #ifndef CONFIG_IMA_APPRAISE_SIGNED_INIT {.action = APPRAISE, .fowner = GLOBAL_ROOT_UID, .fowner_op = &vfsuid_eq_kuid, .flags = IMA_FOWNER}, #else /* force signature */ {.action = APPRAISE, .fowner = GLOBAL_ROOT_UID, .fowner_op = &vfsuid_eq_kuid, .flags = IMA_FOWNER | IMA_DIGSIG_REQUIRED}, #endif }; static struct ima_rule_entry build_appraise_rules[] __ro_after_init = { #ifdef CONFIG_IMA_APPRAISE_REQUIRE_MODULE_SIGS {.action = APPRAISE, .func = MODULE_CHECK, .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED}, #endif #ifdef CONFIG_IMA_APPRAISE_REQUIRE_FIRMWARE_SIGS {.action = APPRAISE, .func = FIRMWARE_CHECK, .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED}, #endif #ifdef CONFIG_IMA_APPRAISE_REQUIRE_KEXEC_SIGS {.action = APPRAISE, .func = KEXEC_KERNEL_CHECK, .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED}, #endif #ifdef CONFIG_IMA_APPRAISE_REQUIRE_POLICY_SIGS {.action = APPRAISE, .func = POLICY_CHECK, .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED}, #endif }; static struct ima_rule_entry secure_boot_rules[] __ro_after_init = { {.action = APPRAISE, .func = MODULE_CHECK, .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED | IMA_MODSIG_ALLOWED | IMA_CHECK_BLACKLIST}, {.action = APPRAISE, .func = FIRMWARE_CHECK, .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED}, {.action = APPRAISE, .func = KEXEC_KERNEL_CHECK, .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED}, {.action = APPRAISE, .func = POLICY_CHECK, .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED}, }; static struct ima_rule_entry critical_data_rules[] __ro_after_init = { {.action = MEASURE, .func = CRITICAL_DATA, .flags = IMA_FUNC}, }; /* An array of architecture specific rules */ static struct ima_rule_entry *arch_policy_entry __ro_after_init; static LIST_HEAD(ima_default_rules); static LIST_HEAD(ima_policy_rules); static LIST_HEAD(ima_temp_rules); static struct list_head __rcu *ima_rules = (struct list_head __rcu *)(&ima_default_rules); static int ima_policy __initdata; static int __init default_measure_policy_setup(char *str) { if (ima_policy) return 1; ima_policy = ORIGINAL_TCB; return 1; } __setup("ima_tcb", default_measure_policy_setup); static bool ima_use_appraise_tcb __initdata; static bool ima_use_secure_boot __initdata; static bool ima_use_critical_data __initdata; static bool ima_fail_unverifiable_sigs __ro_after_init; static int __init policy_setup(char *str) { char *p; while ((p = strsep(&str, " |\n")) != NULL) { if (*p == ' ') continue; if ((strcmp(p, "tcb") == 0) && !ima_policy) ima_policy = DEFAULT_TCB; else if (strcmp(p, "appraise_tcb") == 0) ima_use_appraise_tcb = true; else if (strcmp(p, "secure_boot") == 0) ima_use_secure_boot = true; else if (strcmp(p, "critical_data") == 0) ima_use_critical_data = true; else if (strcmp(p, "fail_securely") == 0) ima_fail_unverifiable_sigs = true; else pr_err("policy \"%s\" not found", p); } return 1; } __setup("ima_policy=", policy_setup); static int __init default_appraise_policy_setup(char *str) { ima_use_appraise_tcb = true; return 1; } __setup("ima_appraise_tcb", default_appraise_policy_setup); static struct ima_rule_opt_list *ima_alloc_rule_opt_list(const substring_t *src) { struct ima_rule_opt_list *opt_list; size_t count = 0; char *src_copy; char *cur, *next; size_t i; src_copy = match_strdup(src); if (!src_copy) return ERR_PTR(-ENOMEM); next = src_copy; while ((cur = strsep(&next, "|"))) { /* Don't accept an empty list item */ if (!(*cur)) { kfree(src_copy); return ERR_PTR(-EINVAL); } count++; } /* Don't accept an empty list */ if (!count) { kfree(src_copy); return ERR_PTR(-EINVAL); } 
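	/*
	 * Allocate the option-list header together with room for 'count' item
	 * pointers in one zeroed allocation; the strings themselves stay in
	 * src_copy and are owned through items[0], as described below.
	 */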
opt_list = kzalloc_flex(*opt_list, items, count); if (!opt_list) { kfree(src_copy); return ERR_PTR(-ENOMEM); } opt_list->count = count; /* * strsep() has already replaced all instances of '|' with '\0', * leaving a byte sequence of NUL-terminated strings. Reference each * string with the array of items. * * IMPORTANT: Ownership of the allocated buffer is transferred from * src_copy to the first element in the items array. To free the * buffer, kfree() must only be called on the first element of the * array. */ for (i = 0, cur = src_copy; i < count; i++) { opt_list->items[i] = cur; cur = strchr(cur, '\0') + 1; } return opt_list; } static void ima_free_rule_opt_list(struct ima_rule_opt_list *opt_list) { if (!opt_list) return; if (opt_list->count) { kfree(opt_list->items[0]); opt_list->count = 0; } kfree(opt_list); } static void ima_lsm_free_rule(struct ima_rule_entry *entry) { int i; for (i = 0; i < MAX_LSM_RULES; i++) { ima_filter_rule_free(entry->lsm[i].rule); kfree(entry->lsm[i].args_p); } } static void ima_free_rule(struct ima_rule_entry *entry) { if (!entry) return; /* * entry->template->fields may be allocated in ima_parse_rule() but that * reference is owned by the corresponding ima_template_desc element in * the defined_templates list and cannot be freed here */ kfree(entry->fsname); kfree(entry->fs_subtype); ima_free_rule_opt_list(entry->keyrings); ima_lsm_free_rule(entry); kfree(entry); } static struct ima_rule_entry *ima_lsm_copy_rule(struct ima_rule_entry *entry, gfp_t gfp) { struct ima_rule_entry *nentry; int i; /* * Immutable elements are copied over as pointers and data; only * lsm rules can change */ nentry = kmemdup(entry, sizeof(*nentry), gfp); if (!nentry) return NULL; memset(nentry->lsm, 0, sizeof_field(struct ima_rule_entry, lsm)); for (i = 0; i < MAX_LSM_RULES; i++) { if (!entry->lsm[i].args_p) continue; nentry->lsm[i].type = entry->lsm[i].type; nentry->lsm[i].args_p = entry->lsm[i].args_p; ima_filter_rule_init(nentry->lsm[i].type, Audit_equal, nentry->lsm[i].args_p, &nentry->lsm[i].rule, gfp); if (!nentry->lsm[i].rule) pr_warn("rule for LSM \'%s\' is undefined\n", nentry->lsm[i].args_p); } return nentry; } static int ima_lsm_update_rule(struct ima_rule_entry *entry) { int i; struct ima_rule_entry *nentry; nentry = ima_lsm_copy_rule(entry, GFP_KERNEL); if (!nentry) return -ENOMEM; list_replace_rcu(&entry->list, &nentry->list); synchronize_rcu(); /* * ima_lsm_copy_rule() shallow copied all references, except for the * LSM references, from entry to nentry so we only want to free the LSM * references and the entry itself. All other memory references will now * be owned by nentry. */ for (i = 0; i < MAX_LSM_RULES; i++) ima_filter_rule_free(entry->lsm[i].rule); kfree(entry); return 0; } static bool ima_rule_contains_lsm_cond(struct ima_rule_entry *entry) { int i; for (i = 0; i < MAX_LSM_RULES; i++) if (entry->lsm[i].args_p) return true; return false; } /* * The LSM policy can be reloaded, leaving the IMA LSM based rules referring * to the old, stale LSM policy. Update the IMA LSM based rules to reflect * the reloaded LSM policy. 
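 * (For example, after a SELinux policy reload, the compiled references behind
 * obj_type=/subj_type= conditions go stale and must be re-initialized.)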
*/ static void ima_lsm_update_rules(void) { struct ima_rule_entry *entry, *e; int result; list_for_each_entry_safe(entry, e, &ima_policy_rules, list) { if (!ima_rule_contains_lsm_cond(entry)) continue; result = ima_lsm_update_rule(entry); if (result) { pr_err("lsm rule update error %d\n", result); return; } } } int ima_lsm_policy_change(struct notifier_block *nb, unsigned long event, void *lsm_data) { if (event != LSM_POLICY_CHANGE) return NOTIFY_DONE; ima_lsm_update_rules(); return NOTIFY_OK; } /** * ima_match_rule_data - determine whether func_data matches the policy rule * @rule: a pointer to a rule * @func_data: data to match against the measure rule data * @cred: a pointer to a credentials structure for user validation * * Returns true if func_data matches one in the rule, false otherwise. */ static bool ima_match_rule_data(struct ima_rule_entry *rule, const char *func_data, const struct cred *cred) { const struct ima_rule_opt_list *opt_list = NULL; bool matched = false; size_t i; if ((rule->flags & IMA_UID) && !rule->uid_op(cred->uid, rule->uid)) return false; switch (rule->func) { case KEY_CHECK: if (!rule->keyrings) return true; opt_list = rule->keyrings; break; case CRITICAL_DATA: if (!rule->label) return true; opt_list = rule->label; break; default: return false; } if (!func_data) return false; for (i = 0; i < opt_list->count; i++) { if (!strcmp(opt_list->items[i], func_data)) { matched = true; break; } } return matched; } /** * ima_match_rules - determine whether an inode matches the policy rule. * @rule: a pointer to a rule * @idmap: idmap of the mount the inode was found from * @inode: a pointer to an inode * @cred: a pointer to a credentials structure for user validation * @prop: LSM properties of the task to be validated * @func: LIM hook identifier * @mask: requested action (MAY_READ | MAY_WRITE | MAY_APPEND | MAY_EXEC) * @func_data: func specific data, may be NULL * * Returns true on rule match, false on failure. 
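 *
 * For example, the builtin rule 'dont_measure fsmagic=0x9fa0' sets only the
 * IMA_FSMAGIC condition, so for file accesses the s_magic comparison below is
 * the only test that has to pass.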
*/ static bool ima_match_rules(struct ima_rule_entry *rule, struct mnt_idmap *idmap, struct inode *inode, const struct cred *cred, struct lsm_prop *prop, enum ima_hooks func, int mask, const char *func_data) { int i; bool result = false; struct ima_rule_entry *lsm_rule = rule; bool rule_reinitialized = false; if ((rule->flags & IMA_FUNC) && (rule->func != func && func != POST_SETATTR)) return false; switch (func) { case KEY_CHECK: case CRITICAL_DATA: return ((rule->func == func) && ima_match_rule_data(rule, func_data, cred)); default: break; } if ((rule->flags & IMA_MASK) && (rule->mask != mask && func != POST_SETATTR)) return false; if ((rule->flags & IMA_INMASK) && (!(rule->mask & mask) && func != POST_SETATTR)) return false; if ((rule->flags & IMA_FSMAGIC) && rule->fsmagic != inode->i_sb->s_magic) return false; if ((rule->flags & IMA_FSNAME) && strcmp(rule->fsname, inode->i_sb->s_type->name)) return false; if (rule->flags & IMA_FS_SUBTYPE) { if (!inode->i_sb->s_subtype) return false; if (strcmp(rule->fs_subtype, inode->i_sb->s_subtype)) return false; } if ((rule->flags & IMA_FSUUID) && !uuid_equal(&rule->fsuuid, &inode->i_sb->s_uuid)) return false; if ((rule->flags & IMA_UID) && !rule->uid_op(cred->uid, rule->uid)) return false; if (rule->flags & IMA_EUID) { if (has_capability_noaudit(current, CAP_SETUID)) { if (!rule->uid_op(cred->euid, rule->uid) && !rule->uid_op(cred->suid, rule->uid) && !rule->uid_op(cred->uid, rule->uid)) return false; } else if (!rule->uid_op(cred->euid, rule->uid)) return false; } if ((rule->flags & IMA_GID) && !rule->gid_op(cred->gid, rule->gid)) return false; if (rule->flags & IMA_EGID) { if (has_capability_noaudit(current, CAP_SETGID)) { if (!rule->gid_op(cred->egid, rule->gid) && !rule->gid_op(cred->sgid, rule->gid) && !rule->gid_op(cred->gid, rule->gid)) return false; } else if (!rule->gid_op(cred->egid, rule->gid)) return false; } if ((rule->flags & IMA_FOWNER) && !rule->fowner_op(i_uid_into_vfsuid(idmap, inode), rule->fowner)) return false; if ((rule->flags & IMA_FGROUP) && !rule->fgroup_op(i_gid_into_vfsgid(idmap, inode), rule->fgroup)) return false; for (i = 0; i < MAX_LSM_RULES; i++) { int rc = 0; struct lsm_prop inode_prop = { }; if (!lsm_rule->lsm[i].rule) { if (!lsm_rule->lsm[i].args_p) continue; else return false; } retry: switch (i) { case LSM_OBJ_USER: case LSM_OBJ_ROLE: case LSM_OBJ_TYPE: security_inode_getlsmprop(inode, &inode_prop); rc = ima_filter_rule_match(&inode_prop, lsm_rule->lsm[i].type, Audit_equal, lsm_rule->lsm[i].rule); break; case LSM_SUBJ_USER: case LSM_SUBJ_ROLE: case LSM_SUBJ_TYPE: rc = ima_filter_rule_match(prop, lsm_rule->lsm[i].type, Audit_equal, lsm_rule->lsm[i].rule); break; default: break; } if (rc == -ESTALE && !rule_reinitialized) { lsm_rule = ima_lsm_copy_rule(rule, GFP_ATOMIC); if (lsm_rule) { rule_reinitialized = true; goto retry; } } if (rc <= 0) { result = false; goto out; } } result = true; out: if (rule_reinitialized) { for (i = 0; i < MAX_LSM_RULES; i++) ima_filter_rule_free(lsm_rule->lsm[i].rule); kfree(lsm_rule); } return result; } /* * In addition to knowing that we need to appraise the file in general, * we need to differentiate between calling hooks, for hook specific rules. 
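 * For example, an 'appraise func=BPRM_CHECK' rule yields IMA_BPRM_APPRAISE,
 * while a plain 'appraise' rule without func= falls back to IMA_FILE_APPRAISE.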
*/ static int get_subaction(struct ima_rule_entry *rule, enum ima_hooks func) { if (!(rule->flags & IMA_FUNC)) return IMA_FILE_APPRAISE; switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: return IMA_MMAP_APPRAISE; case BPRM_CHECK: return IMA_BPRM_APPRAISE; case CREDS_CHECK: return IMA_CREDS_APPRAISE; case FILE_CHECK: case POST_SETATTR: return IMA_FILE_APPRAISE; case MODULE_CHECK ... MAX_CHECK - 1: default: return IMA_READ_APPRAISE; } } /** * ima_match_policy - decision based on LSM and other conditions * @idmap: idmap of the mount the inode was found from * @inode: pointer to an inode for which the policy decision is being made * @cred: pointer to a credentials structure for which the policy decision is * being made * @prop: LSM properties of the task to be validated * @func: IMA hook identifier * @mask: requested action (MAY_READ | MAY_WRITE | MAY_APPEND | MAY_EXEC) * @flags: IMA actions to consider (e.g. IMA_MEASURE | IMA_APPRAISE) * @pcr: set the pcr to extend * @template_desc: the template that should be used for this rule * @func_data: func specific data, may be NULL * @allowed_algos: allowlist of hash algorithms for the IMA xattr * * Measure decision based on func/mask/fsmagic and LSM(subj/obj/type) * conditions. * * Since the IMA policy may be updated multiple times we need to lock the * list when walking it. Reads are many orders of magnitude more numerous * than writes so ima_match_policy() is classical RCU candidate. */ int ima_match_policy(struct mnt_idmap *idmap, struct inode *inode, const struct cred *cred, struct lsm_prop *prop, enum ima_hooks func, int mask, int flags, int *pcr, struct ima_template_desc **template_desc, const char *func_data, unsigned int *allowed_algos) { struct ima_rule_entry *entry; int action = 0, actmask = flags | (flags << 1); struct list_head *ima_rules_tmp; if (template_desc && !*template_desc) *template_desc = ima_template_desc_current(); rcu_read_lock(); ima_rules_tmp = rcu_dereference(ima_rules); list_for_each_entry_rcu(entry, ima_rules_tmp, list) { if (!(entry->action & actmask)) continue; if (!ima_match_rules(entry, idmap, inode, cred, prop, func, mask, func_data)) continue; action |= entry->flags & IMA_NONACTION_FLAGS; action |= entry->action & IMA_DO_MASK; if (entry->action & IMA_APPRAISE) { action |= get_subaction(entry, func); action &= ~IMA_HASH; if (ima_fail_unverifiable_sigs) action |= IMA_FAIL_UNVERIFIABLE_SIGS; if (allowed_algos && entry->flags & IMA_VALIDATE_ALGOS) *allowed_algos = entry->allowed_algos; } if (entry->action & IMA_DO_MASK) actmask &= ~(entry->action | entry->action << 1); else actmask &= ~(entry->action | entry->action >> 1); if ((pcr) && (entry->flags & IMA_PCR)) *pcr = entry->pcr; if (template_desc && entry->template) *template_desc = entry->template; if (!actmask) break; } rcu_read_unlock(); return action; } /** * ima_update_policy_flags() - Update global IMA variables * * Update ima_policy_flag and ima_setxattr_allowed_hash_algorithms * based on the currently loaded policy. * * With ima_policy_flag, the decision to short circuit out of a function * or not call the function in the first place can be made earlier. * * With ima_setxattr_allowed_hash_algorithms, the policy can restrict the * set of hash algorithms accepted when updating the security.ima xattr of * a file. * * Context: called after a policy update and at system initialization. 
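 *
 * For example, if the loaded policy contains no 'measure' rules, the
 * IMA_MEASURE bit is absent from ima_policy_flag, so callers can rule out
 * measurement before walking the rule list.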
*/ void ima_update_policy_flags(void) { struct ima_rule_entry *entry; int new_policy_flag = 0; struct list_head *ima_rules_tmp; rcu_read_lock(); ima_rules_tmp = rcu_dereference(ima_rules); list_for_each_entry_rcu(entry, ima_rules_tmp, list) { /* * SETXATTR_CHECK rules do not implement a full policy check * because rule checking would probably have an important * performance impact on setxattr(). As a consequence, only one * SETXATTR_CHECK can be active at a given time. * Because we want to preserve that property, we set out to use * atomic_cmpxchg. Either: * - the atomic was non-zero: a setxattr hash policy is * already enforced, we do nothing * - the atomic was zero: no setxattr policy was set, enable * the setxattr hash policy */ if (entry->func == SETXATTR_CHECK) { atomic_cmpxchg(&ima_setxattr_allowed_hash_algorithms, 0, entry->allowed_algos); /* SETXATTR_CHECK doesn't impact ima_policy_flag */ continue; } if (entry->action & IMA_DO_MASK) new_policy_flag |= entry->action; } rcu_read_unlock(); ima_appraise |= (build_ima_appraise | temp_ima_appraise); if (!ima_appraise) new_policy_flag &= ~IMA_APPRAISE; ima_policy_flag = new_policy_flag; } static int ima_appraise_flag(enum ima_hooks func) { if (func == MODULE_CHECK) return IMA_APPRAISE_MODULES; else if (func == FIRMWARE_CHECK) return IMA_APPRAISE_FIRMWARE; else if (func == POLICY_CHECK) return IMA_APPRAISE_POLICY; else if (func == KEXEC_KERNEL_CHECK) return IMA_APPRAISE_KEXEC; return 0; } static void add_rules(struct ima_rule_entry *entries, int count, enum policy_rule_list policy_rule) { int i = 0; for (i = 0; i < count; i++) { struct ima_rule_entry *entry; if (policy_rule & IMA_DEFAULT_POLICY) list_add_tail(&entries[i].list, &ima_default_rules); if (policy_rule & IMA_CUSTOM_POLICY) { entry = kmemdup(&entries[i], sizeof(*entry), GFP_KERNEL); if (!entry) continue; list_add_tail(&entry->list, &ima_policy_rules); } if (entries[i].action == APPRAISE) { if (entries != build_appraise_rules) temp_ima_appraise |= ima_appraise_flag(entries[i].func); else build_ima_appraise |= ima_appraise_flag(entries[i].func); } } } static int ima_parse_rule(char *rule, struct ima_rule_entry *entry); static int __init ima_init_arch_policy(void) { const char * const *arch_rules; const char * const *rules; int arch_entries = 0; int i = 0; arch_rules = arch_get_ima_policy(); if (!arch_rules) return arch_entries; /* Get number of rules */ for (rules = arch_rules; *rules != NULL; rules++) arch_entries++; arch_policy_entry = kzalloc_objs(*arch_policy_entry, arch_entries + 1); if (!arch_policy_entry) return 0; /* Convert each policy string rules to struct ima_rule_entry format */ for (rules = arch_rules, i = 0; *rules != NULL; rules++) { char rule[255]; int result; result = strscpy(rule, *rules, sizeof(rule)); INIT_LIST_HEAD(&arch_policy_entry[i].list); result = ima_parse_rule(rule, &arch_policy_entry[i]); if (result) { pr_warn("Skipping unknown architecture policy rule: %s\n", rule); memset(&arch_policy_entry[i], 0, sizeof(*arch_policy_entry)); continue; } i++; } return i; } /** * ima_init_policy - initialize the default measure rules. * * ima_rules points to either the ima_default_rules or the new ima_policy_rules. 
*/ void __init ima_init_policy(void) { int build_appraise_entries, arch_entries; /* if !ima_policy, we load NO default rules */ if (ima_policy) add_rules(dont_measure_rules, ARRAY_SIZE(dont_measure_rules), IMA_DEFAULT_POLICY); switch (ima_policy) { case ORIGINAL_TCB: add_rules(original_measurement_rules, ARRAY_SIZE(original_measurement_rules), IMA_DEFAULT_POLICY); break; case DEFAULT_TCB: add_rules(default_measurement_rules, ARRAY_SIZE(default_measurement_rules), IMA_DEFAULT_POLICY); break; default: break; } /* * Based on runtime secure boot flags, insert arch specific measurement * and appraise rules requiring file signatures for both the initial * and custom policies, prior to other appraise rules. * (Highest priority) */ arch_entries = ima_init_arch_policy(); if (!arch_entries) pr_info("No architecture policies found\n"); else add_rules(arch_policy_entry, arch_entries, IMA_DEFAULT_POLICY | IMA_CUSTOM_POLICY); /* * Insert the builtin "secure_boot" policy rules requiring file * signatures, prior to other appraise rules. */ if (ima_use_secure_boot) add_rules(secure_boot_rules, ARRAY_SIZE(secure_boot_rules), IMA_DEFAULT_POLICY); /* * Insert the build time appraise rules requiring file signatures * for both the initial and custom policies, prior to other appraise * rules. As the secure boot rules includes all of the build time * rules, include either one or the other set of rules, but not both. */ build_appraise_entries = ARRAY_SIZE(build_appraise_rules); if (build_appraise_entries) { if (ima_use_secure_boot) add_rules(build_appraise_rules, build_appraise_entries, IMA_CUSTOM_POLICY); else add_rules(build_appraise_rules, build_appraise_entries, IMA_DEFAULT_POLICY | IMA_CUSTOM_POLICY); } if (ima_use_appraise_tcb) add_rules(default_appraise_rules, ARRAY_SIZE(default_appraise_rules), IMA_DEFAULT_POLICY); if (ima_use_critical_data) add_rules(critical_data_rules, ARRAY_SIZE(critical_data_rules), IMA_DEFAULT_POLICY); atomic_set(&ima_setxattr_allowed_hash_algorithms, 0); ima_update_policy_flags(); } /* Make sure we have a valid policy, at least containing some rules. */ int ima_check_policy(void) { if (list_empty(&ima_temp_rules)) return -EINVAL; return 0; } /** * ima_update_policy - update default_rules with new measure rules * * Called on file .release to update the default rules with a complete new * policy. What we do here is to splice ima_policy_rules and ima_temp_rules so * they make a queue. The policy may be updated multiple times and this is the * RCU updater. * * Policy rules are never deleted so ima_policy_flag gets zeroed only once when * we switch from the default policy to user defined. */ void ima_update_policy(void) { struct list_head *policy = &ima_policy_rules; list_splice_tail_init_rcu(&ima_temp_rules, policy, synchronize_rcu); if (ima_rules != (struct list_head __rcu *)policy) { ima_policy_flag = 0; rcu_assign_pointer(ima_rules, policy); /* * IMA architecture specific policy rules are specified * as strings and converted to an array of ima_entry_rules * on boot. After loading a custom policy, free the * architecture specific rules stored as an array. */ kfree(arch_policy_entry); } ima_update_policy_flags(); /* Custom IMA policy has been loaded */ ima_process_queued_keys(); } /* Keep the enumeration in sync with the policy_tokens! 
*/ enum policy_opt { Opt_measure, Opt_dont_measure, Opt_appraise, Opt_dont_appraise, Opt_audit, Opt_dont_audit, Opt_hash, Opt_dont_hash, Opt_obj_user, Opt_obj_role, Opt_obj_type, Opt_subj_user, Opt_subj_role, Opt_subj_type, Opt_func, Opt_mask, Opt_fsmagic, Opt_fsname, Opt_fs_subtype, Opt_fsuuid, Opt_uid_eq, Opt_euid_eq, Opt_gid_eq, Opt_egid_eq, Opt_fowner_eq, Opt_fgroup_eq, Opt_uid_gt, Opt_euid_gt, Opt_gid_gt, Opt_egid_gt, Opt_fowner_gt, Opt_fgroup_gt, Opt_uid_lt, Opt_euid_lt, Opt_gid_lt, Opt_egid_lt, Opt_fowner_lt, Opt_fgroup_lt, Opt_digest_type, Opt_appraise_type, Opt_appraise_flag, Opt_appraise_algos, Opt_permit_directio, Opt_pcr, Opt_template, Opt_keyrings, Opt_label, Opt_err }; static const match_table_t policy_tokens = { {Opt_measure, "measure"}, {Opt_dont_measure, "dont_measure"}, {Opt_appraise, "appraise"}, {Opt_dont_appraise, "dont_appraise"}, {Opt_audit, "audit"}, {Opt_dont_audit, "dont_audit"}, {Opt_hash, "hash"}, {Opt_dont_hash, "dont_hash"}, {Opt_obj_user, "obj_user=%s"}, {Opt_obj_role, "obj_role=%s"}, {Opt_obj_type, "obj_type=%s"}, {Opt_subj_user, "subj_user=%s"}, {Opt_subj_role, "subj_role=%s"}, {Opt_subj_type, "subj_type=%s"}, {Opt_func, "func=%s"}, {Opt_mask, "mask=%s"}, {Opt_fsmagic, "fsmagic=%s"}, {Opt_fsname, "fsname=%s"}, {Opt_fs_subtype, "fs_subtype=%s"}, {Opt_fsuuid, "fsuuid=%s"}, {Opt_uid_eq, "uid=%s"}, {Opt_euid_eq, "euid=%s"}, {Opt_gid_eq, "gid=%s"}, {Opt_egid_eq, "egid=%s"}, {Opt_fowner_eq, "fowner=%s"}, {Opt_fgroup_eq, "fgroup=%s"}, {Opt_uid_gt, "uid>%s"}, {Opt_euid_gt, "euid>%s"}, {Opt_gid_gt, "gid>%s"}, {Opt_egid_gt, "egid>%s"}, {Opt_fowner_gt, "fowner>%s"}, {Opt_fgroup_gt, "fgroup>%s"}, {Opt_uid_lt, "uid<%s"}, {Opt_euid_lt, "euid<%s"}, {Opt_gid_lt, "gid<%s"}, {Opt_egid_lt, "egid<%s"}, {Opt_fowner_lt, "fowner<%s"}, {Opt_fgroup_lt, "fgroup<%s"}, {Opt_digest_type, "digest_type=%s"}, {Opt_appraise_type, "appraise_type=%s"}, {Opt_appraise_flag, "appraise_flag=%s"}, {Opt_appraise_algos, "appraise_algos=%s"}, {Opt_permit_directio, "permit_directio"}, {Opt_pcr, "pcr=%s"}, {Opt_template, "template=%s"}, {Opt_keyrings, "keyrings=%s"}, {Opt_label, "label=%s"}, {Opt_err, NULL} }; static int ima_lsm_rule_init(struct ima_rule_entry *entry, substring_t *args, int lsm_rule, int audit_type) { int result; if (entry->lsm[lsm_rule].rule) return -EINVAL; entry->lsm[lsm_rule].args_p = match_strdup(args); if (!entry->lsm[lsm_rule].args_p) return -ENOMEM; entry->lsm[lsm_rule].type = audit_type; result = ima_filter_rule_init(entry->lsm[lsm_rule].type, Audit_equal, entry->lsm[lsm_rule].args_p, &entry->lsm[lsm_rule].rule, GFP_KERNEL); if (!entry->lsm[lsm_rule].rule) { pr_warn("rule for LSM \'%s\' is undefined\n", entry->lsm[lsm_rule].args_p); if (ima_rules == (struct list_head __rcu *)(&ima_default_rules)) { kfree(entry->lsm[lsm_rule].args_p); entry->lsm[lsm_rule].args_p = NULL; result = -EINVAL; } else result = 0; } return result; } static void ima_log_string_op(struct audit_buffer *ab, char *key, char *value, enum policy_opt rule_operator) { if (!ab) return; switch (rule_operator) { case Opt_uid_gt: case Opt_euid_gt: case Opt_gid_gt: case Opt_egid_gt: case Opt_fowner_gt: case Opt_fgroup_gt: audit_log_format(ab, "%s>", key); break; case Opt_uid_lt: case Opt_euid_lt: case Opt_gid_lt: case Opt_egid_lt: case Opt_fowner_lt: case Opt_fgroup_lt: audit_log_format(ab, "%s<", key); break; default: audit_log_format(ab, "%s=", key); } audit_log_format(ab, "%s ", value); } static void ima_log_string(struct audit_buffer *ab, char *key, char *value) { ima_log_string_op(ab, key, value, Opt_err); } 
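/*
 * Illustrative sketch (not part of ima_policy.c): the policy_tokens table
 * above is consumed by match_token() from <linux/parser.h>.  Each
 * whitespace-separated word of a rule is matched against the patterns, and
 * every "%s" capture lands in args[] as a substring_t that helpers such as
 * match_strdup() can copy out.  demo_parse_word() is a hypothetical name;
 * the real dispatch is the large switch in ima_parse_rule() below.
 */
static int demo_parse_word(char *p)
{
	substring_t args[MAX_OPT_ARGS];
	char *name;

	switch (match_token(p, policy_tokens, args)) {
	case Opt_measure:			/* bare keyword: "measure" */
		pr_info("action: measure\n");
		return 0;
	case Opt_fsname:			/* pattern "fsname=%s" */
		name = match_strdup(&args[0]);	/* copy the captured text */
		if (!name)
			return -ENOMEM;
		pr_info("fsname: %s\n", name);
		kfree(name);
		return 0;
	default:
		return -EINVAL;
	}
}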
/* * Validating the appended signature included in the measurement list requires * the file hash calculated without the appended signature (i.e., the 'd-modsig' * field). Therefore, notify the user if they have the 'modsig' field but not * the 'd-modsig' field in the template. */ static void check_template_modsig(const struct ima_template_desc *template) { #define MSG "template with 'modsig' field also needs 'd-modsig' field\n" bool has_modsig, has_dmodsig; static bool checked; int i; /* We only need to notify the user once. */ if (checked) return; has_modsig = has_dmodsig = false; for (i = 0; i < template->num_fields; i++) { if (!strcmp(template->fields[i]->field_id, "modsig")) has_modsig = true; else if (!strcmp(template->fields[i]->field_id, "d-modsig")) has_dmodsig = true; } if (has_modsig && !has_dmodsig) pr_notice(MSG); checked = true; #undef MSG } /* * Warn if the template does not contain the given field. */ static void check_template_field(const struct ima_template_desc *template, const char *field, const char *msg) { int i; for (i = 0; i < template->num_fields; i++) if (!strcmp(template->fields[i]->field_id, field)) return; pr_notice_once("%s", msg); } static bool ima_validate_rule(struct ima_rule_entry *entry) { /* Ensure that the action is set and is compatible with the flags */ if (entry->action == UNKNOWN) return false; if (entry->action != MEASURE && entry->flags & IMA_PCR) return false; if (entry->action != APPRAISE && entry->flags & (IMA_DIGSIG_REQUIRED | IMA_MODSIG_ALLOWED | IMA_CHECK_BLACKLIST | IMA_VALIDATE_ALGOS)) return false; /* * The IMA_FUNC bit must be set if and only if there's a valid hook * function specified, and vice versa. Enforcing this property allows * for the NONE case below to validate a rule without an explicit hook * function. 
*/ if (((entry->flags & IMA_FUNC) && entry->func == NONE) || (!(entry->flags & IMA_FUNC) && entry->func != NONE)) return false; /* * Ensure that the hook function is compatible with the other * components of the rule */ switch (entry->func) { case NONE: case FILE_CHECK: case MMAP_CHECK: case MMAP_CHECK_REQPROT: case BPRM_CHECK: case CREDS_CHECK: case POST_SETATTR: case FIRMWARE_CHECK: case POLICY_CHECK: if (entry->flags & ~(IMA_FUNC | IMA_MASK | IMA_FSMAGIC | IMA_UID | IMA_FOWNER | IMA_FSUUID | IMA_INMASK | IMA_EUID | IMA_PCR | IMA_FSNAME | IMA_FS_SUBTYPE | IMA_GID | IMA_EGID | IMA_FGROUP | IMA_DIGSIG_REQUIRED | IMA_PERMIT_DIRECTIO | IMA_VALIDATE_ALGOS | IMA_CHECK_BLACKLIST | IMA_VERITY_REQUIRED | IMA_SIGV3_REQUIRED)) return false; break; case MODULE_CHECK: case KEXEC_KERNEL_CHECK: case KEXEC_INITRAMFS_CHECK: if (entry->flags & ~(IMA_FUNC | IMA_MASK | IMA_FSMAGIC | IMA_UID | IMA_FOWNER | IMA_FSUUID | IMA_INMASK | IMA_EUID | IMA_PCR | IMA_FSNAME | IMA_FS_SUBTYPE | IMA_GID | IMA_EGID | IMA_FGROUP | IMA_DIGSIG_REQUIRED | IMA_PERMIT_DIRECTIO | IMA_MODSIG_ALLOWED | IMA_CHECK_BLACKLIST | IMA_VALIDATE_ALGOS)) return false; break; case KEXEC_CMDLINE: if (entry->action & ~(MEASURE | DONT_MEASURE)) return false; if (entry->flags & ~(IMA_FUNC | IMA_FSMAGIC | IMA_UID | IMA_FOWNER | IMA_FSUUID | IMA_EUID | IMA_PCR | IMA_FSNAME | IMA_FS_SUBTYPE | IMA_GID | IMA_EGID | IMA_FGROUP)) return false; break; case KEY_CHECK: if (entry->action & ~(MEASURE | DONT_MEASURE)) return false; if (entry->flags & ~(IMA_FUNC | IMA_UID | IMA_GID | IMA_PCR | IMA_KEYRINGS)) return false; if (ima_rule_contains_lsm_cond(entry)) return false; break; case CRITICAL_DATA: if (entry->action & ~(MEASURE | DONT_MEASURE)) return false; if (entry->flags & ~(IMA_FUNC | IMA_UID | IMA_GID | IMA_PCR | IMA_LABEL)) return false; if (ima_rule_contains_lsm_cond(entry)) return false; break; case SETXATTR_CHECK: /* any action other than APPRAISE is unsupported */ if (entry->action != APPRAISE) return false; /* SETXATTR_CHECK requires an appraise_algos parameter */ if (!(entry->flags & IMA_VALIDATE_ALGOS)) return false; /* * full policies are not supported, they would have too * much of a performance impact */ if (entry->flags & ~(IMA_FUNC | IMA_VALIDATE_ALGOS)) return false; break; default: return false; } /* Ensure that combinations of flags are compatible with each other */ if (entry->flags & IMA_CHECK_BLACKLIST && !(entry->flags & IMA_DIGSIG_REQUIRED)) return false; /* * Unlike for regular IMA 'appraise' policy rules where security.ima * xattr may contain either a file hash or signature, the security.ima * xattr for fsverity must contain a file signature (sigv3). Ensure * that 'appraise' rules for fsverity require file signatures by * checking the IMA_DIGSIG_REQUIRED flag is set. 
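 * For example, 'appraise func=FILE_CHECK digest_type=verity
 * appraise_type=sigv3' passes this check, while 'appraise func=FILE_CHECK
 * digest_type=verity' on its own is rejected.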
*/ if (entry->action == APPRAISE && (entry->flags & IMA_VERITY_REQUIRED) && !(entry->flags & IMA_DIGSIG_REQUIRED)) return false; return true; } static unsigned int ima_parse_appraise_algos(char *arg) { unsigned int res = 0; int idx; char *token; while ((token = strsep(&arg, ",")) != NULL) { idx = match_string(hash_algo_name, HASH_ALGO__LAST, token); if (idx < 0) { pr_err("unknown hash algorithm \"%s\"", token); return 0; } if (!crypto_has_alg(hash_algo_name[idx], 0, 0)) { pr_err("unavailable hash algorithm \"%s\", check your kernel configuration", token); return 0; } /* Add the hash algorithm to the 'allowed' bitfield */ res |= (1U << idx); } return res; } static int ima_parse_rule(char *rule, struct ima_rule_entry *entry) { struct audit_buffer *ab; char *from; char *p; bool eid_token; /* either euid or egid */ struct ima_template_desc *template_desc; int result = 0; ab = integrity_audit_log_start(audit_context(), GFP_KERNEL, AUDIT_INTEGRITY_POLICY_RULE); entry->uid = INVALID_UID; entry->gid = INVALID_GID; entry->fowner = INVALID_UID; entry->fgroup = INVALID_GID; entry->uid_op = &uid_eq; entry->gid_op = &gid_eq; entry->fowner_op = &vfsuid_eq_kuid; entry->fgroup_op = &vfsgid_eq_kgid; entry->action = UNKNOWN; while ((p = strsep(&rule, " \t")) != NULL) { substring_t args[MAX_OPT_ARGS]; int token; unsigned long lnum; if (result < 0 || *p == '#') /* ignore suffixed comment */ break; if ((*p == '\0') || (*p == ' ') || (*p == '\t')) continue; token = match_token(p, policy_tokens, args); switch (token) { case Opt_measure: ima_log_string(ab, "action", "measure"); if (entry->action != UNKNOWN) result = -EINVAL; entry->action = MEASURE; break; case Opt_dont_measure: ima_log_string(ab, "action", "dont_measure"); if (entry->action != UNKNOWN) result = -EINVAL; entry->action = DONT_MEASURE; break; case Opt_appraise: ima_log_string(ab, "action", "appraise"); if (entry->action != UNKNOWN) result = -EINVAL; entry->action = APPRAISE; break; case Opt_dont_appraise: ima_log_string(ab, "action", "dont_appraise"); if (entry->action != UNKNOWN) result = -EINVAL; entry->action = DONT_APPRAISE; break; case Opt_audit: ima_log_string(ab, "action", "audit"); if (entry->action != UNKNOWN) result = -EINVAL; entry->action = AUDIT; break; case Opt_dont_audit: ima_log_string(ab, "action", "dont_audit"); if (entry->action != UNKNOWN) result = -EINVAL; entry->action = DONT_AUDIT; break; case Opt_hash: ima_log_string(ab, "action", "hash"); if (entry->action != UNKNOWN) result = -EINVAL; entry->action = HASH; break; case Opt_dont_hash: ima_log_string(ab, "action", "dont_hash"); if (entry->action != UNKNOWN) result = -EINVAL; entry->action = DONT_HASH; break; case Opt_func: ima_log_string(ab, "func", args[0].from); if (entry->func) result = -EINVAL; if (strcmp(args[0].from, "FILE_CHECK") == 0) entry->func = FILE_CHECK; /* PATH_CHECK is for backwards compat */ else if (strcmp(args[0].from, "PATH_CHECK") == 0) entry->func = FILE_CHECK; else if (strcmp(args[0].from, "MODULE_CHECK") == 0) entry->func = MODULE_CHECK; else if (strcmp(args[0].from, "FIRMWARE_CHECK") == 0) entry->func = FIRMWARE_CHECK; else if ((strcmp(args[0].from, "FILE_MMAP") == 0) || (strcmp(args[0].from, "MMAP_CHECK") == 0)) entry->func = MMAP_CHECK; else if ((strcmp(args[0].from, "MMAP_CHECK_REQPROT") == 0)) entry->func = MMAP_CHECK_REQPROT; else if (strcmp(args[0].from, "BPRM_CHECK") == 0) entry->func = BPRM_CHECK; else if (strcmp(args[0].from, "CREDS_CHECK") == 0) entry->func = CREDS_CHECK; else if (strcmp(args[0].from, "KEXEC_KERNEL_CHECK") == 0) entry->func 
= KEXEC_KERNEL_CHECK; else if (strcmp(args[0].from, "KEXEC_INITRAMFS_CHECK") == 0) entry->func = KEXEC_INITRAMFS_CHECK; else if (strcmp(args[0].from, "POLICY_CHECK") == 0) entry->func = POLICY_CHECK; else if (strcmp(args[0].from, "KEXEC_CMDLINE") == 0) entry->func = KEXEC_CMDLINE; else if (IS_ENABLED(CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS) && strcmp(args[0].from, "KEY_CHECK") == 0) entry->func = KEY_CHECK; else if (strcmp(args[0].from, "CRITICAL_DATA") == 0) entry->func = CRITICAL_DATA; else if (strcmp(args[0].from, "SETXATTR_CHECK") == 0) entry->func = SETXATTR_CHECK; else result = -EINVAL; if (!result) entry->flags |= IMA_FUNC; break; case Opt_mask: ima_log_string(ab, "mask", args[0].from); if (entry->mask) result = -EINVAL; from = args[0].from; if (*from == '^') from++; if ((strcmp(from, "MAY_EXEC")) == 0) entry->mask = MAY_EXEC; else if (strcmp(from, "MAY_WRITE") == 0) entry->mask = MAY_WRITE; else if (strcmp(from, "MAY_READ") == 0) entry->mask = MAY_READ; else if (strcmp(from, "MAY_APPEND") == 0) entry->mask = MAY_APPEND; else result = -EINVAL; if (!result) entry->flags |= (*args[0].from == '^') ? IMA_INMASK : IMA_MASK; break; case Opt_fsmagic: ima_log_string(ab, "fsmagic", args[0].from); if (entry->fsmagic) { result = -EINVAL; break; } result = kstrtoul(args[0].from, 16, &entry->fsmagic); if (!result) entry->flags |= IMA_FSMAGIC; break; case Opt_fsname: ima_log_string(ab, "fsname", args[0].from); entry->fsname = kstrdup(args[0].from, GFP_KERNEL); if (!entry->fsname) { result = -ENOMEM; break; } result = 0; entry->flags |= IMA_FSNAME; break; case Opt_fs_subtype: ima_log_string(ab, "fs_subtype", args[0].from); if (entry->fs_subtype) { result = -EINVAL; break; } entry->fs_subtype = kstrdup(args[0].from, GFP_KERNEL); if (!entry->fs_subtype) { result = -ENOMEM; break; } result = 0; entry->flags |= IMA_FS_SUBTYPE; break; case Opt_keyrings: ima_log_string(ab, "keyrings", args[0].from); if (!IS_ENABLED(CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS) || entry->keyrings) { result = -EINVAL; break; } entry->keyrings = ima_alloc_rule_opt_list(args); if (IS_ERR(entry->keyrings)) { result = PTR_ERR(entry->keyrings); entry->keyrings = NULL; break; } entry->flags |= IMA_KEYRINGS; break; case Opt_label: ima_log_string(ab, "label", args[0].from); if (entry->label) { result = -EINVAL; break; } entry->label = ima_alloc_rule_opt_list(args); if (IS_ERR(entry->label)) { result = PTR_ERR(entry->label); entry->label = NULL; break; } entry->flags |= IMA_LABEL; break; case Opt_fsuuid: ima_log_string(ab, "fsuuid", args[0].from); if (!uuid_is_null(&entry->fsuuid)) { result = -EINVAL; break; } result = uuid_parse(args[0].from, &entry->fsuuid); if (!result) entry->flags |= IMA_FSUUID; break; case Opt_uid_gt: case Opt_euid_gt: entry->uid_op = &uid_gt; fallthrough; case Opt_uid_lt: case Opt_euid_lt: if ((token == Opt_uid_lt) || (token == Opt_euid_lt)) entry->uid_op = &uid_lt; fallthrough; case Opt_uid_eq: case Opt_euid_eq: eid_token = (token == Opt_euid_eq) || (token == Opt_euid_gt) || (token == Opt_euid_lt); ima_log_string_op(ab, eid_token ? "euid" : "uid", args[0].from, token); if (uid_valid(entry->uid)) { result = -EINVAL; break; } result = kstrtoul(args[0].from, 10, &lnum); if (!result) { entry->uid = make_kuid(current_user_ns(), (uid_t) lnum); if (!uid_valid(entry->uid) || (uid_t)lnum != lnum) result = -EINVAL; else entry->flags |= eid_token ? 
IMA_EUID : IMA_UID; } break; case Opt_gid_gt: case Opt_egid_gt: entry->gid_op = &gid_gt; fallthrough; case Opt_gid_lt: case Opt_egid_lt: if ((token == Opt_gid_lt) || (token == Opt_egid_lt)) entry->gid_op = &gid_lt; fallthrough; case Opt_gid_eq: case Opt_egid_eq: eid_token = (token == Opt_egid_eq) || (token == Opt_egid_gt) || (token == Opt_egid_lt); ima_log_string_op(ab, eid_token ? "egid" : "gid", args[0].from, token); if (gid_valid(entry->gid)) { result = -EINVAL; break; } result = kstrtoul(args[0].from, 10, &lnum); if (!result) { entry->gid = make_kgid(current_user_ns(), (gid_t)lnum); if (!gid_valid(entry->gid) || (((gid_t)lnum) != lnum)) result = -EINVAL; else entry->flags |= eid_token ? IMA_EGID : IMA_GID; } break; case Opt_fowner_gt: entry->fowner_op = &vfsuid_gt_kuid; fallthrough; case Opt_fowner_lt: if (token == Opt_fowner_lt) entry->fowner_op = &vfsuid_lt_kuid; fallthrough; case Opt_fowner_eq: ima_log_string_op(ab, "fowner", args[0].from, token); if (uid_valid(entry->fowner)) { result = -EINVAL; break; } result = kstrtoul(args[0].from, 10, &lnum); if (!result) { entry->fowner = make_kuid(current_user_ns(), (uid_t)lnum); if (!uid_valid(entry->fowner) || (((uid_t)lnum) != lnum)) result = -EINVAL; else entry->flags |= IMA_FOWNER; } break; case Opt_fgroup_gt: entry->fgroup_op = &vfsgid_gt_kgid; fallthrough; case Opt_fgroup_lt: if (token == Opt_fgroup_lt) entry->fgroup_op = &vfsgid_lt_kgid; fallthrough; case Opt_fgroup_eq: ima_log_string_op(ab, "fgroup", args[0].from, token); if (gid_valid(entry->fgroup)) { result = -EINVAL; break; } result = kstrtoul(args[0].from, 10, &lnum); if (!result) { entry->fgroup = make_kgid(current_user_ns(), (gid_t)lnum); if (!gid_valid(entry->fgroup) || (((gid_t)lnum) != lnum)) result = -EINVAL; else entry->flags |= IMA_FGROUP; } break; case Opt_obj_user: ima_log_string(ab, "obj_user", args[0].from); result = ima_lsm_rule_init(entry, args, LSM_OBJ_USER, AUDIT_OBJ_USER); break; case Opt_obj_role: ima_log_string(ab, "obj_role", args[0].from); result = ima_lsm_rule_init(entry, args, LSM_OBJ_ROLE, AUDIT_OBJ_ROLE); break; case Opt_obj_type: ima_log_string(ab, "obj_type", args[0].from); result = ima_lsm_rule_init(entry, args, LSM_OBJ_TYPE, AUDIT_OBJ_TYPE); break; case Opt_subj_user: ima_log_string(ab, "subj_user", args[0].from); result = ima_lsm_rule_init(entry, args, LSM_SUBJ_USER, AUDIT_SUBJ_USER); break; case Opt_subj_role: ima_log_string(ab, "subj_role", args[0].from); result = ima_lsm_rule_init(entry, args, LSM_SUBJ_ROLE, AUDIT_SUBJ_ROLE); break; case Opt_subj_type: ima_log_string(ab, "subj_type", args[0].from); result = ima_lsm_rule_init(entry, args, LSM_SUBJ_TYPE, AUDIT_SUBJ_TYPE); break; case Opt_digest_type: ima_log_string(ab, "digest_type", args[0].from); if ((strcmp(args[0].from, "verity")) == 0) entry->flags |= IMA_VERITY_REQUIRED; else result = -EINVAL; break; case Opt_appraise_type: ima_log_string(ab, "appraise_type", args[0].from); if ((strcmp(args[0].from, "imasig")) == 0) { if (entry->flags & IMA_VERITY_REQUIRED) result = -EINVAL; else entry->flags |= IMA_DIGSIG_REQUIRED | IMA_CHECK_BLACKLIST; } else if (strcmp(args[0].from, "sigv3") == 0) { entry->flags |= IMA_SIGV3_REQUIRED | IMA_DIGSIG_REQUIRED | IMA_CHECK_BLACKLIST; } else if (IS_ENABLED(CONFIG_IMA_APPRAISE_MODSIG) && strcmp(args[0].from, "imasig|modsig") == 0) { if ((entry->flags & IMA_VERITY_REQUIRED) || (entry->flags & IMA_SIGV3_REQUIRED)) result = -EINVAL; else entry->flags |= IMA_DIGSIG_REQUIRED | IMA_MODSIG_ALLOWED | IMA_CHECK_BLACKLIST; } else { result = -EINVAL; } break; case 
Opt_appraise_flag: ima_log_string(ab, "appraise_flag", args[0].from); break; case Opt_appraise_algos: ima_log_string(ab, "appraise_algos", args[0].from); if (entry->allowed_algos) { result = -EINVAL; break; } entry->allowed_algos = ima_parse_appraise_algos(args[0].from); /* invalid or empty list of algorithms */ if (!entry->allowed_algos) { result = -EINVAL; break; } entry->flags |= IMA_VALIDATE_ALGOS; break; case Opt_permit_directio: entry->flags |= IMA_PERMIT_DIRECTIO; break; case Opt_pcr: ima_log_string(ab, "pcr", args[0].from); result = kstrtoint(args[0].from, 10, &entry->pcr); if (result || INVALID_PCR(entry->pcr)) result = -EINVAL; else entry->flags |= IMA_PCR; break; case Opt_template: ima_log_string(ab, "template", args[0].from); if (entry->action != MEASURE) { result = -EINVAL; break; } template_desc = lookup_template_desc(args[0].from); if (!template_desc || entry->template) { result = -EINVAL; break; } /* * template_desc_init_fields() does nothing if * the template is already initialised, so * it's safe to do this unconditionally */ template_desc_init_fields(template_desc->fmt, &(template_desc->fields), &(template_desc->num_fields)); entry->template = template_desc; break; case Opt_err: ima_log_string(ab, "UNKNOWN", p); result = -EINVAL; break; } } if (!result && !ima_validate_rule(entry)) result = -EINVAL; else if (entry->action == APPRAISE) temp_ima_appraise |= ima_appraise_flag(entry->func); if (!result && entry->flags & IMA_MODSIG_ALLOWED) { template_desc = entry->template ? entry->template : ima_template_desc_current(); check_template_modsig(template_desc); } /* d-ngv2 template field recommended for unsigned fs-verity digests */ if (!result && entry->action == MEASURE && (entry->flags & IMA_VERITY_REQUIRED)) { template_desc = entry->template ? entry->template : ima_template_desc_current(); check_template_field(template_desc, "d-ngv2", "verity rules should include d-ngv2"); } audit_log_format(ab, "res=%d", !result); audit_log_end(ab); return result; } /** * ima_parse_add_rule - add a rule to ima_policy_rules * @rule: ima measurement policy rule * * Avoid locking by allowing just one writer at a time in ima_write_policy() * Returns the length of the rule parsed, an error code on failure */ ssize_t ima_parse_add_rule(char *rule) { static const char op[] = "update_policy"; char *p; struct ima_rule_entry *entry; ssize_t result, len; int audit_info = 0; p = strsep(&rule, "\n"); len = strlen(p) + 1; p += strspn(p, " \t"); if (*p == '#' || *p == '\0') return len; entry = kzalloc_obj(*entry); if (!entry) { integrity_audit_msg(AUDIT_INTEGRITY_STATUS, NULL, NULL, op, "-ENOMEM", -ENOMEM, audit_info); return -ENOMEM; } INIT_LIST_HEAD(&entry->list); result = ima_parse_rule(p, entry); if (result) { ima_free_rule(entry); integrity_audit_msg(AUDIT_INTEGRITY_STATUS, NULL, NULL, op, "invalid-policy", result, audit_info); return result; } list_add_tail(&entry->list, &ima_temp_rules); return len; } /** * ima_delete_rules() - called to cleanup invalid in-flight policy. * * We don't need locking as we operate on the temp list, which is * different from the active one. There is also only one user of * ima_delete_rules() at a time. 
*/ void ima_delete_rules(void) { struct ima_rule_entry *entry, *tmp; temp_ima_appraise = 0; list_for_each_entry_safe(entry, tmp, &ima_temp_rules, list) { list_del(&entry->list); ima_free_rule(entry); } } #define __ima_hook_stringify(func, str) (#func), const char *const func_tokens[] = { __ima_hooks(__ima_hook_stringify) }; #ifdef CONFIG_IMA_READ_POLICY enum { mask_exec = 0, mask_write, mask_read, mask_append }; static const char *const mask_tokens[] = { "^MAY_EXEC", "^MAY_WRITE", "^MAY_READ", "^MAY_APPEND" }; void *ima_policy_start(struct seq_file *m, loff_t *pos) { loff_t l = *pos; struct ima_rule_entry *entry; struct list_head *ima_rules_tmp; rcu_read_lock(); ima_rules_tmp = rcu_dereference(ima_rules); list_for_each_entry_rcu(entry, ima_rules_tmp, list) { if (!l--) { rcu_read_unlock(); return entry; } } rcu_read_unlock(); return NULL; } void *ima_policy_next(struct seq_file *m, void *v, loff_t *pos) { struct ima_rule_entry *entry = v; rcu_read_lock(); entry = list_entry_rcu(entry->list.next, struct ima_rule_entry, list); rcu_read_unlock(); (*pos)++; return (&entry->list == &ima_default_rules || &entry->list == &ima_policy_rules) ? NULL : entry; } void ima_policy_stop(struct seq_file *m, void *v) { } #define pt(token) policy_tokens[token].pattern #define mt(token) mask_tokens[token] /* * policy_func_show - display the ima_hooks policy rule */ static void policy_func_show(struct seq_file *m, enum ima_hooks func) { if (func > 0 && func < MAX_CHECK) seq_printf(m, "func=%s ", func_tokens[func]); else seq_printf(m, "func=%d ", func); } static void ima_show_rule_opt_list(struct seq_file *m, const struct ima_rule_opt_list *opt_list) { size_t i; for (i = 0; i < opt_list->count; i++) seq_printf(m, "%s%s", i ? "|" : "", opt_list->items[i]); } static void ima_policy_show_appraise_algos(struct seq_file *m, unsigned int allowed_hashes) { int idx, list_size = 0; for (idx = 0; idx < HASH_ALGO__LAST; idx++) { if (!(allowed_hashes & (1U << idx))) continue; /* only add commas if the list contains multiple entries */ if (list_size++) seq_puts(m, ","); seq_puts(m, hash_algo_name[idx]); } } int ima_policy_show(struct seq_file *m, void *v) { struct ima_rule_entry *entry = v; int i; char tbuf[64] = {0,}; int offset = 0; rcu_read_lock(); /* Do not print rules with inactive LSM labels */ for (i = 0; i < MAX_LSM_RULES; i++) { if (entry->lsm[i].args_p && !entry->lsm[i].rule) { rcu_read_unlock(); return 0; } } if (entry->action & MEASURE) seq_puts(m, pt(Opt_measure)); if (entry->action & DONT_MEASURE) seq_puts(m, pt(Opt_dont_measure)); if (entry->action & APPRAISE) seq_puts(m, pt(Opt_appraise)); if (entry->action & DONT_APPRAISE) seq_puts(m, pt(Opt_dont_appraise)); if (entry->action & AUDIT) seq_puts(m, pt(Opt_audit)); if (entry->action & DONT_AUDIT) seq_puts(m, pt(Opt_dont_audit)); if (entry->action & HASH) seq_puts(m, pt(Opt_hash)); if (entry->action & DONT_HASH) seq_puts(m, pt(Opt_dont_hash)); seq_puts(m, " "); if (entry->flags & IMA_FUNC) policy_func_show(m, entry->func); if ((entry->flags & IMA_MASK) || (entry->flags & IMA_INMASK)) { if (entry->flags & IMA_MASK) offset = 1; if (entry->mask & MAY_EXEC) seq_printf(m, pt(Opt_mask), mt(mask_exec) + offset); if (entry->mask & MAY_WRITE) seq_printf(m, pt(Opt_mask), mt(mask_write) + offset); if (entry->mask & MAY_READ) seq_printf(m, pt(Opt_mask), mt(mask_read) + offset); if (entry->mask & MAY_APPEND) seq_printf(m, pt(Opt_mask), mt(mask_append) + offset); seq_puts(m, " "); } if (entry->flags & IMA_FSMAGIC) { snprintf(tbuf, sizeof(tbuf), "0x%lx", entry->fsmagic); 
seq_printf(m, pt(Opt_fsmagic), tbuf); seq_puts(m, " "); } if (entry->flags & IMA_FSNAME) { snprintf(tbuf, sizeof(tbuf), "%s", entry->fsname); seq_printf(m, pt(Opt_fsname), tbuf); seq_puts(m, " "); } if (entry->flags & IMA_FS_SUBTYPE) { snprintf(tbuf, sizeof(tbuf), "%s", entry->fs_subtype); seq_printf(m, pt(Opt_fs_subtype), tbuf); seq_puts(m, " "); } if (entry->flags & IMA_KEYRINGS) { seq_puts(m, "keyrings="); ima_show_rule_opt_list(m, entry->keyrings); seq_puts(m, " "); } if (entry->flags & IMA_LABEL) { seq_puts(m, "label="); ima_show_rule_opt_list(m, entry->label); seq_puts(m, " "); } if (entry->flags & IMA_PCR) { snprintf(tbuf, sizeof(tbuf), "%d", entry->pcr); seq_printf(m, pt(Opt_pcr), tbuf); seq_puts(m, " "); } if (entry->flags & IMA_FSUUID) { seq_printf(m, "fsuuid=%pU", &entry->fsuuid); seq_puts(m, " "); } if (entry->flags & IMA_UID) { snprintf(tbuf, sizeof(tbuf), "%d", __kuid_val(entry->uid)); if (entry->uid_op == &uid_gt) seq_printf(m, pt(Opt_uid_gt), tbuf); else if (entry->uid_op == &uid_lt) seq_printf(m, pt(Opt_uid_lt), tbuf); else seq_printf(m, pt(Opt_uid_eq), tbuf); seq_puts(m, " "); } if (entry->flags & IMA_EUID) { snprintf(tbuf, sizeof(tbuf), "%d", __kuid_val(entry->uid)); if (entry->uid_op == &uid_gt) seq_printf(m, pt(Opt_euid_gt), tbuf); else if (entry->uid_op == &uid_lt) seq_printf(m, pt(Opt_euid_lt), tbuf); else seq_printf(m, pt(Opt_euid_eq), tbuf); seq_puts(m, " "); } if (entry->flags & IMA_GID) { snprintf(tbuf, sizeof(tbuf), "%d", __kgid_val(entry->gid)); if (entry->gid_op == &gid_gt) seq_printf(m, pt(Opt_gid_gt), tbuf); else if (entry->gid_op == &gid_lt) seq_printf(m, pt(Opt_gid_lt), tbuf); else seq_printf(m, pt(Opt_gid_eq), tbuf); seq_puts(m, " "); } if (entry->flags & IMA_EGID) { snprintf(tbuf, sizeof(tbuf), "%d", __kgid_val(entry->gid)); if (entry->gid_op == &gid_gt) seq_printf(m, pt(Opt_egid_gt), tbuf); else if (entry->gid_op == &gid_lt) seq_printf(m, pt(Opt_egid_lt), tbuf); else seq_printf(m, pt(Opt_egid_eq), tbuf); seq_puts(m, " "); } if (entry->flags & IMA_FOWNER) { snprintf(tbuf, sizeof(tbuf), "%d", __kuid_val(entry->fowner)); if (entry->fowner_op == &vfsuid_gt_kuid) seq_printf(m, pt(Opt_fowner_gt), tbuf); else if (entry->fowner_op == &vfsuid_lt_kuid) seq_printf(m, pt(Opt_fowner_lt), tbuf); else seq_printf(m, pt(Opt_fowner_eq), tbuf); seq_puts(m, " "); } if (entry->flags & IMA_FGROUP) { snprintf(tbuf, sizeof(tbuf), "%d", __kgid_val(entry->fgroup)); if (entry->fgroup_op == &vfsgid_gt_kgid) seq_printf(m, pt(Opt_fgroup_gt), tbuf); else if (entry->fgroup_op == &vfsgid_lt_kgid) seq_printf(m, pt(Opt_fgroup_lt), tbuf); else seq_printf(m, pt(Opt_fgroup_eq), tbuf); seq_puts(m, " "); } if (entry->flags & IMA_VALIDATE_ALGOS) { seq_puts(m, "appraise_algos="); ima_policy_show_appraise_algos(m, entry->allowed_algos); seq_puts(m, " "); } for (i = 0; i < MAX_LSM_RULES; i++) { if (entry->lsm[i].rule) { switch (i) { case LSM_OBJ_USER: seq_printf(m, pt(Opt_obj_user), entry->lsm[i].args_p); break; case LSM_OBJ_ROLE: seq_printf(m, pt(Opt_obj_role), entry->lsm[i].args_p); break; case LSM_OBJ_TYPE: seq_printf(m, pt(Opt_obj_type), entry->lsm[i].args_p); break; case LSM_SUBJ_USER: seq_printf(m, pt(Opt_subj_user), entry->lsm[i].args_p); break; case LSM_SUBJ_ROLE: seq_printf(m, pt(Opt_subj_role), entry->lsm[i].args_p); break; case LSM_SUBJ_TYPE: seq_printf(m, pt(Opt_subj_type), entry->lsm[i].args_p); break; } seq_puts(m, " "); } } if (entry->template) seq_printf(m, "template=%s ", entry->template->name); if (entry->flags & IMA_DIGSIG_REQUIRED) { if (entry->flags & IMA_SIGV3_REQUIRED) 
seq_puts(m, "appraise_type=sigv3 "); else if (entry->flags & IMA_MODSIG_ALLOWED) seq_puts(m, "appraise_type=imasig|modsig "); else seq_puts(m, "appraise_type=imasig "); } if (entry->flags & IMA_VERITY_REQUIRED) seq_puts(m, "digest_type=verity "); if (entry->flags & IMA_PERMIT_DIRECTIO) seq_puts(m, "permit_directio "); rcu_read_unlock(); seq_puts(m, "\n"); return 0; } #endif /* CONFIG_IMA_READ_POLICY */ #if defined(CONFIG_IMA_APPRAISE) && defined(CONFIG_INTEGRITY_TRUSTED_KEYRING) /* * ima_appraise_signature: whether IMA will appraise a given function using * an IMA digital signature. This is restricted to cases where the kernel * has a set of built-in trusted keys in order to avoid an attacker simply * loading additional keys. */ bool ima_appraise_signature(enum kernel_read_file_id id) { struct ima_rule_entry *entry; bool found = false; enum ima_hooks func; struct list_head *ima_rules_tmp; if (id >= READING_MAX_ID) return false; if (id == READING_KEXEC_IMAGE && !(ima_appraise & IMA_APPRAISE_ENFORCE) && security_locked_down(LOCKDOWN_KEXEC)) return false; func = read_idmap[id] ?: FILE_CHECK; rcu_read_lock(); ima_rules_tmp = rcu_dereference(ima_rules); list_for_each_entry_rcu(entry, ima_rules_tmp, list) { if (entry->action != APPRAISE) continue; /* * A generic entry will match, but otherwise require that it * match the func we're looking for */ if (entry->func && entry->func != func) continue; /* * We require this to be a digital signature, not a raw IMA * hash. */ if (entry->flags & IMA_DIGSIG_REQUIRED) found = true; /* * We've found a rule that matches, so break now even if it * didn't require a digital signature - a later rule that does * won't override it, so would be a false positive. */ break; } rcu_read_unlock(); return found; } #endif /* CONFIG_IMA_APPRAISE && CONFIG_INTEGRITY_TRUSTED_KEYRING */ |
| 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 | // SPDX-License-Identifier: GPL-2.0 /* User-mappable watch queue * * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * See Documentation/core-api/watch_queue.rst */ #ifndef _LINUX_WATCH_QUEUE_H #define _LINUX_WATCH_QUEUE_H #include <uapi/linux/watch_queue.h> #include <linux/kref.h> #include <linux/rcupdate.h> #ifdef CONFIG_WATCH_QUEUE struct cred; struct watch_type_filter { enum watch_notification_type type; __u32 subtype_filter[1]; /* Bitmask of subtypes to filter on */ __u32 info_filter; /* Filter on watch_notification::info */ __u32 info_mask; /* Mask of relevant bits in info_filter */ }; struct watch_filter { union { struct rcu_head rcu; /* Bitmask of accepted types */ DECLARE_BITMAP(type_filter, WATCH_TYPE__NR); }; u32 nr_filters; /* Number of filters */ struct watch_type_filter filters[] __counted_by(nr_filters); }; struct watch_queue { struct rcu_head rcu; struct watch_filter __rcu *filter; struct pipe_inode_info *pipe; /* Pipe we use as a buffer, NULL if queue closed */ struct hlist_head watches; /* Contributory watches */ struct page **notes; /* Preallocated notifications */ unsigned long *notes_bitmap; /* Allocation bitmap for notes */ struct kref usage; /* Object usage count */ spinlock_t lock; unsigned int nr_notes; /* Number of notes */ unsigned int nr_pages; /* Number of pages in notes[] */ }; /* * Representation of a watch on an object. */ struct watch { union { struct rcu_head rcu; u32 info_id; /* ID to be OR'd in to info field */ }; struct watch_queue __rcu *queue; /* Queue to post events to */ struct hlist_node queue_node; /* Link in queue->watches */ struct watch_list __rcu *watch_list; struct hlist_node list_node; /* Link in watch_list->watchers */ const struct cred *cred; /* Creds of the owner of the watch */ void *private; /* Private data for the watched object */ u64 id; /* Internal identifier */ struct kref usage; /* Object usage count */ }; /* * List of watches on an object. 
*/ struct watch_list { struct rcu_head rcu; struct hlist_head watchers; void (*release_watch)(struct watch *); spinlock_t lock; }; extern void __post_watch_notification(struct watch_list *, struct watch_notification *, const struct cred *, u64); extern struct watch_queue *get_watch_queue(int); extern void put_watch_queue(struct watch_queue *); extern void init_watch(struct watch *, struct watch_queue *); extern int add_watch_to_object(struct watch *, struct watch_list *); extern int remove_watch_from_object(struct watch_list *, struct watch_queue *, u64, bool); extern long watch_queue_set_size(struct pipe_inode_info *, unsigned int); extern long watch_queue_set_filter(struct pipe_inode_info *, struct watch_notification_filter __user *); extern int watch_queue_init(struct pipe_inode_info *); extern void watch_queue_clear(struct watch_queue *); static inline void init_watch_list(struct watch_list *wlist, void (*release_watch)(struct watch *)) { INIT_HLIST_HEAD(&wlist->watchers); spin_lock_init(&wlist->lock); wlist->release_watch = release_watch; } static inline void post_watch_notification(struct watch_list *wlist, struct watch_notification *n, const struct cred *cred, u64 id) { if (unlikely(wlist)) __post_watch_notification(wlist, n, cred, id); } static inline void remove_watch_list(struct watch_list *wlist, u64 id) { if (wlist) { remove_watch_from_object(wlist, NULL, id, true); kfree_rcu(wlist, rcu); } } /** * watch_sizeof - Calculate the information part of the size of a watch record, * given the structure size. */ #define watch_sizeof(STRUCT) (sizeof(STRUCT) << WATCH_INFO_LENGTH__SHIFT) #else static inline int watch_queue_init(struct pipe_inode_info *pipe) { return -ENOPKG; } #endif #endif /* _LINUX_WATCH_QUEUE_H */ |
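A hedged sketch of how a watched object might consume this header: the object embeds a struct watch_list, initialises it with init_watch_list(), and posts events with post_watch_notification(), which is a no-op when nobody is watching. The object type, the choice of notification type, and the release callback below are illustrative assumptions, not part of this header.

/* Illustrative only: a hypothetical watched object built on watch_queue. */
#include <linux/slab.h>
#include <linux/watch_queue.h>

struct my_object {
	struct watch_list watchers;	/* consumers watching this object */
	u64 id;				/* identifier passed as the watch id */
};

static void my_release_watch(struct watch *watch)
{
	/* Drop any per-watch state the subsystem stored in ->private (may be NULL). */
	kfree(watch->private);
}

static void my_object_init(struct my_object *obj)
{
	init_watch_list(&obj->watchers, my_release_watch);
}

static void my_object_changed(struct my_object *obj, const struct cred *cred)
{
	struct watch_notification n = {
		.type		= WATCH_TYPE_META,	/* hypothetical type choice */
		.subtype	= 0,
		/* Encode the record length in the info field, as watch_sizeof() documents. */
		.info		= watch_sizeof(struct watch_notification),
	};

	/* Cheap when the watch_list is NULL/empty; otherwise queued to each watcher's pipe. */
	post_watch_notification(&obj->watchers, &n, cred, obj->id);
}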
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 | // SPDX-License-Identifier: GPL-2.0-only /* * async.c: Asynchronous function calls for boot performance * * (C) Copyright 2009 Intel Corporation * Author: Arjan van de Ven <arjan@linux.intel.com> */ /* Goals and Theory of Operation The primary goal of this feature is to reduce the kernel boot time, by doing various independent hardware delays and discovery operations decoupled and not strictly serialized. More specifically, the asynchronous function call concept allows certain operations (primarily during system boot) to happen asynchronously, out of order, while these operations still have their externally visible parts happen sequentially and in-order. (not unlike how out-of-order CPUs retire their instructions in order) Key to the asynchronous function call implementation is the concept of a "sequence cookie" (which, although it has an abstracted type, can be thought of as a monotonically incrementing number). The async core will assign each scheduled event such a sequence cookie and pass this to the called functions. The asynchronously called function should before doing a globally visible operation, such as registering device numbers, call the async_synchronize_cookie() function and pass in its own cookie. The async_synchronize_cookie() function will make sure that all asynchronous operations that were scheduled prior to the operation corresponding with the cookie have completed. Subsystem/driver initialization code that scheduled asynchronous probe functions, but which shares global resources with other drivers/subsystems that do not use the asynchronous call feature, need to do a full synchronization with the async_synchronize_full() function, before returning from their init function. This is to maintain strict ordering between the asynchronous and synchronous parts of the kernel. 
*/ #include <linux/async.h> #include <linux/atomic.h> #include <linux/export.h> #include <linux/ktime.h> #include <linux/pid.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/wait.h> #include <linux/workqueue.h> #include "workqueue_internal.h" static async_cookie_t next_cookie = 1; #define MAX_WORK 32768 #define ASYNC_COOKIE_MAX ULLONG_MAX /* infinity cookie */ static LIST_HEAD(async_global_pending); /* pending from all registered doms */ static ASYNC_DOMAIN(async_dfl_domain); static DEFINE_SPINLOCK(async_lock); static struct workqueue_struct *async_wq; struct async_entry { struct list_head domain_list; struct list_head global_list; struct work_struct work; async_cookie_t cookie; async_func_t func; void *data; struct async_domain *domain; }; static DECLARE_WAIT_QUEUE_HEAD(async_done); static atomic_t entry_count; static long long microseconds_since(ktime_t start) { ktime_t now = ktime_get(); return ktime_to_ns(ktime_sub(now, start)) >> 10; } static async_cookie_t lowest_in_progress(struct async_domain *domain) { struct async_entry *first = NULL; async_cookie_t ret = ASYNC_COOKIE_MAX; unsigned long flags; spin_lock_irqsave(&async_lock, flags); if (domain) { if (!list_empty(&domain->pending)) first = list_first_entry(&domain->pending, struct async_entry, domain_list); } else { if (!list_empty(&async_global_pending)) first = list_first_entry(&async_global_pending, struct async_entry, global_list); } if (first) ret = first->cookie; spin_unlock_irqrestore(&async_lock, flags); return ret; } /* * pick the first pending entry and run it */ static void async_run_entry_fn(struct work_struct *work) { struct async_entry *entry = container_of(work, struct async_entry, work); unsigned long flags; ktime_t calltime; /* 1) run (and print duration) */ pr_debug("calling %lli_%pS @ %i\n", (long long)entry->cookie, entry->func, task_pid_nr(current)); calltime = ktime_get(); entry->func(entry->data, entry->cookie); pr_debug("initcall %lli_%pS returned after %lld usecs\n", (long long)entry->cookie, entry->func, microseconds_since(calltime)); /* 2) remove self from the pending queues */ spin_lock_irqsave(&async_lock, flags); list_del_init(&entry->domain_list); list_del_init(&entry->global_list); /* 3) free the entry */ kfree(entry); atomic_dec(&entry_count); spin_unlock_irqrestore(&async_lock, flags); /* 4) wake up any waiters */ wake_up(&async_done); } static async_cookie_t __async_schedule_node_domain(async_func_t func, void *data, int node, struct async_domain *domain, struct async_entry *entry) { async_cookie_t newcookie; unsigned long flags; INIT_LIST_HEAD(&entry->domain_list); INIT_LIST_HEAD(&entry->global_list); INIT_WORK(&entry->work, async_run_entry_fn); entry->func = func; entry->data = data; entry->domain = domain; spin_lock_irqsave(&async_lock, flags); /* allocate cookie and queue */ newcookie = entry->cookie = next_cookie++; list_add_tail(&entry->domain_list, &domain->pending); if (domain->registered) list_add_tail(&entry->global_list, &async_global_pending); atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); /* schedule for execution */ queue_work_node(node, async_wq, &entry->work); return newcookie; } /** * async_schedule_node_domain - NUMA specific version of async_schedule_domain * @func: function to execute asynchronously * @data: data pointer to pass to the function * @node: NUMA node that we want to schedule this on or close to * @domain: the domain * * Returns an async_cookie_t that may be used for checkpointing later. 
* @domain may be used in the async_synchronize_*_domain() functions to * wait within a certain synchronization domain rather than globally. * * Note: This function may be called from atomic or non-atomic contexts. * * The node requested will be honored on a best effort basis. If the node * has no CPUs associated with it then the work is distributed among all * available CPUs. */ async_cookie_t async_schedule_node_domain(async_func_t func, void *data, int node, struct async_domain *domain) { struct async_entry *entry; unsigned long flags; async_cookie_t newcookie; /* allow irq-off callers */ entry = kzalloc_obj(struct async_entry, GFP_ATOMIC); /* * If we're out of memory or if there's too much work * pending already, we execute synchronously. */ if (!entry || atomic_read(&entry_count) > MAX_WORK) { kfree(entry); spin_lock_irqsave(&async_lock, flags); newcookie = next_cookie++; spin_unlock_irqrestore(&async_lock, flags); /* low on memory.. run synchronously */ func(data, newcookie); return newcookie; } return __async_schedule_node_domain(func, data, node, domain, entry); } EXPORT_SYMBOL_GPL(async_schedule_node_domain); /** * async_schedule_node - NUMA specific version of async_schedule * @func: function to execute asynchronously * @data: data pointer to pass to the function * @node: NUMA node that we want to schedule this on or close to * * Returns an async_cookie_t that may be used for checkpointing later. * Note: This function may be called from atomic or non-atomic contexts. * * The node requested will be honored on a best effort basis. If the node * has no CPUs associated with it then the work is distributed among all * available CPUs. */ async_cookie_t async_schedule_node(async_func_t func, void *data, int node) { return async_schedule_node_domain(func, data, node, &async_dfl_domain); } EXPORT_SYMBOL_GPL(async_schedule_node); /** * async_schedule_dev_nocall - A simplified variant of async_schedule_dev() * @func: function to execute asynchronously * @dev: device argument to be passed to function * * @dev is used as both the argument for the function and to provide NUMA * context for where to run the function. * * If the asynchronous execution of @func is scheduled successfully, return * true. Otherwise, do nothing and return false, unlike async_schedule_dev() * that will run the function synchronously then. */ bool async_schedule_dev_nocall(async_func_t func, struct device *dev) { struct async_entry *entry; entry = kzalloc_obj(struct async_entry); /* Give up if there is no memory or too much work. */ if (!entry || atomic_read(&entry_count) > MAX_WORK) { kfree(entry); return false; } __async_schedule_node_domain(func, dev, dev_to_node(dev), &async_dfl_domain, entry); return true; } /** * async_synchronize_full - synchronize all asynchronous function calls * * This function waits until all asynchronous function calls have been done. */ void async_synchronize_full(void) { async_synchronize_full_domain(NULL); } EXPORT_SYMBOL_GPL(async_synchronize_full); /** * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain * @domain: the domain to synchronize * * This function waits until all asynchronous function calls for the * synchronization domain specified by @domain have been done. 
*/ void async_synchronize_full_domain(struct async_domain *domain) { async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain); } EXPORT_SYMBOL_GPL(async_synchronize_full_domain); /** * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing * @cookie: async_cookie_t to use as checkpoint * @domain: the domain to synchronize (%NULL for all registered domains) * * This function waits until all asynchronous function calls for the * synchronization domain specified by @domain submitted prior to @cookie * have been done. */ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain) { ktime_t starttime; pr_debug("async_waiting @ %i\n", task_pid_nr(current)); starttime = ktime_get(); wait_event(async_done, lowest_in_progress(domain) >= cookie); pr_debug("async_continuing @ %i after %lli usec\n", task_pid_nr(current), microseconds_since(starttime)); } EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain); /** * async_synchronize_cookie - synchronize asynchronous function calls with cookie checkpointing * @cookie: async_cookie_t to use as checkpoint * * This function waits until all asynchronous function calls prior to @cookie * have been done. */ void async_synchronize_cookie(async_cookie_t cookie) { async_synchronize_cookie_domain(cookie, &async_dfl_domain); } EXPORT_SYMBOL_GPL(async_synchronize_cookie); /** * current_is_async - is %current an async worker task? * * Returns %true if %current is an async worker task. */ bool current_is_async(void) { struct worker *worker = current_wq_worker(); return worker && worker->current_func == async_run_entry_fn; } EXPORT_SYMBOL_GPL(current_is_async); void __init async_init(void) { /* * Async can schedule a number of interdependent work items. However, * unbound workqueues can handle only upto min_active interdependent * work items. The default min_active of 8 isn't sufficient for async * and can lead to stalls. Let's use a dedicated workqueue with raised * min_active. */ async_wq = alloc_workqueue("async", WQ_UNBOUND, 0); BUG_ON(!async_wq); workqueue_set_min_active(async_wq, WQ_DFL_ACTIVE); } |
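A hedged usage sketch of the API implemented above, following the "Theory of Operation" comment: independent probe work is scheduled asynchronously, and the externally visible step waits on its own cookie so ordering is preserved. The device array, probe helpers, and registration function are illustrative assumptions.

/* Illustrative only: asynchronous probing with cookie-ordered registration. */
#include <linux/async.h>

static void my_probe_one(void *data, async_cookie_t cookie)
{
	struct my_device *dev = data;	/* hypothetical driver structure */

	my_slow_hardware_init(dev);	/* independent work, may run out of order */

	/*
	 * Before the globally visible step (e.g. registering a device
	 * number), wait for everything scheduled before this cookie so
	 * registration still appears sequential and in order.
	 */
	async_synchronize_cookie(cookie);
	my_register_device(dev);
}

static int __init my_driver_init(void)
{
	int i;

	for (i = 0; i < my_device_count; i++)
		async_schedule(my_probe_one, &my_devices[i]);

	/* Shares global resources with synchronous init code, so drain first. */
	async_synchronize_full();
	return 0;
}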
| 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BLK_INTEGRITY_H #define _LINUX_BLK_INTEGRITY_H #include <linux/blk-mq.h> #include <linux/bio-integrity.h> #include <linux/blk-mq-dma.h> struct request; enum blk_integrity_flags { BLK_INTEGRITY_NOVERIFY = 1 << 0, BLK_INTEGRITY_NOGENERATE = 1 << 1, BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2, BLK_INTEGRITY_REF_TAG = 1 << 3, BLK_INTEGRITY_STACKED = 1 << 4, BLK_SPLIT_INTERVAL_CAPABLE = 1 << 5, }; const char *blk_integrity_profile_name(struct blk_integrity *bi); bool queue_limits_stack_integrity(struct queue_limits *t, struct queue_limits *b); static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t, struct block_device *bdev) { return queue_limits_stack_integrity(t, &bdev->bd_disk->queue->limits); } #ifdef CONFIG_BLK_DEV_INTEGRITY int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes); int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd, struct logical_block_metadata_cap __user *argp); bool blk_rq_integrity_dma_map_iter_start(struct request *req, struct device *dma_dev, struct dma_iova_state *state, struct blk_dma_iter *iter); bool blk_rq_integrity_dma_map_iter_next(struct request *req, struct device *dma_dev, struct blk_dma_iter *iter); static inline bool blk_integrity_queue_supports_integrity(struct request_queue *q) { return q->limits.integrity.metadata_size; } static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) { if (!blk_integrity_queue_supports_integrity(disk->queue)) return NULL; return &disk->queue->limits.integrity; } static inline struct blk_integrity * bdev_get_integrity(struct block_device *bdev) { return blk_get_integrity(bdev->bd_disk); } static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) { return q->limits.max_integrity_segments; } /** * bio_integrity_intervals - Return number of integrity intervals for a bio * @bi: blk_integrity profile for device * @sectors: Size of the bio in 512-byte sectors * * Description: The block layer calculates everything in 512 byte * sectors but integrity metadata is done in terms of the data integrity * interval size of the storage device. Convert the block layer sectors * to the appropriate number of integrity intervals. 
*/ static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, unsigned int sectors) { return sectors >> (bi->interval_exp - 9); } static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, unsigned int sectors) { return bio_integrity_intervals(bi, sectors) * bi->metadata_size; } static inline bool blk_integrity_rq(const struct request *rq) { return rq->cmd_flags & REQ_INTEGRITY; } /* * Return the current bvec that contains the integrity data. bip_iter may be * advanced to iterate over the integrity data. */ static inline struct bio_vec rq_integrity_vec(struct request *rq) { return mp_bvec_iter_bvec(rq->bio->bi_integrity->bip_vec, rq->bio->bi_integrity->bip_iter); } #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd, struct logical_block_metadata_cap __user *argp) { return -ENOIOCTLCMD; } static inline int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *b) { return 0; } static inline int blk_rq_map_integrity_sg(struct request *q, struct scatterlist *s) { return 0; } static inline int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes) { return -EINVAL; } static inline bool blk_rq_integrity_dma_map_iter_start(struct request *req, struct device *dma_dev, struct dma_iova_state *state, struct blk_dma_iter *iter) { return false; } static inline bool blk_rq_integrity_dma_map_iter_next(struct request *req, struct device *dma_dev, struct blk_dma_iter *iter) { return false; } static inline struct blk_integrity *bdev_get_integrity(struct block_device *b) { return NULL; } static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) { return NULL; } static inline bool blk_integrity_queue_supports_integrity(struct request_queue *q) { return false; } static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) { return 0; } static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, unsigned int sectors) { return 0; } static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, unsigned int sectors) { return 0; } static inline bool blk_integrity_rq(const struct request *rq) { return false; } static inline struct bio_vec rq_integrity_vec(struct request *rq) { /* the optimizer will remove all calls to this function */ return (struct bio_vec){ }; } #endif /* CONFIG_BLK_DEV_INTEGRITY */ enum bio_integrity_action { BI_ACT_BUFFER = (1u << 0), /* allocate buffer */ BI_ACT_CHECK = (1u << 1), /* generate / verify PI */ BI_ACT_ZERO = (1u << 2), /* zero buffer */ }; /** * bio_integrity_action - return the integrity action needed for a bio * @bio: bio to operate on * * Returns the mask of integrity actions (BI_ACT_*) that need to be performed * for @bio. */ unsigned int __bio_integrity_action(struct bio *bio); static inline unsigned int bio_integrity_action(struct bio *bio) { if (!blk_get_integrity(bio->bi_bdev->bd_disk)) return 0; if (bio_integrity(bio)) return 0; return __bio_integrity_action(bio); } #endif /* _LINUX_BLK_INTEGRITY_H */ |
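To make the interval arithmetic above concrete, a small worked sketch; the profile values (4096-byte intervals, 8 bytes of metadata each) are assumptions chosen for illustration.

static void integrity_interval_example(void)
{
	/* Assumed profile: 4096-byte protection intervals, 8 bytes of PI per interval. */
	struct blk_integrity bi = {
		.interval_exp	= 12,	/* 2^12 = 4096-byte intervals */
		.metadata_size	= 8,
	};

	/*
	 * A 32 KiB bio is 64 sectors of 512 bytes:
	 *   bio_integrity_intervals(): 64 >> (12 - 9) = 8 intervals
	 *   bio_integrity_bytes():     8 * 8           = 64 bytes of metadata
	 */
	unsigned int intervals = bio_integrity_intervals(&bi, 64);	/* 8  */
	unsigned int bytes = bio_integrity_bytes(&bi, 64);		/* 64 */

	(void)intervals;
	(void)bytes;
}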
| 17 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Derived from arch/ppc/mm/extable.c and arch/i386/mm/extable.c. * * Copyright (C) 2004 Paul Mackerras, IBM Corp. */ #include <linux/bsearch.h> #include <linux/module.h> #include <linux/init.h> #include <linux/sort.h> #include <linux/uaccess.h> #include <linux/extable.h> #ifndef ARCH_HAS_RELATIVE_EXTABLE #define ex_to_insn(x) ((x)->insn) #else static inline unsigned long ex_to_insn(const struct exception_table_entry *x) { return (unsigned long)&x->insn + x->insn; } #endif #ifndef ARCH_HAS_RELATIVE_EXTABLE #define swap_ex NULL #else static void swap_ex(void *a, void *b, int size) { struct exception_table_entry *x = a, *y = b, tmp; int delta = b - a; tmp = *x; x->insn = y->insn + delta; y->insn = tmp.insn - delta; #ifdef swap_ex_entry_fixup swap_ex_entry_fixup(x, y, tmp, delta); #else x->fixup = y->fixup + delta; y->fixup = tmp.fixup - delta; #endif } #endif /* ARCH_HAS_RELATIVE_EXTABLE */ /* * The exception table needs to be sorted so that the binary * search that we use to find entries in it works properly. * This is used both for the kernel exception table and for * the exception tables of modules that get loaded. */ static int cmp_ex_sort(const void *a, const void *b) { const struct exception_table_entry *x = a, *y = b; /* avoid overflow */ if (ex_to_insn(x) > ex_to_insn(y)) return 1; if (ex_to_insn(x) < ex_to_insn(y)) return -1; return 0; } void sort_extable(struct exception_table_entry *start, struct exception_table_entry *finish) { sort(start, finish - start, sizeof(struct exception_table_entry), cmp_ex_sort, swap_ex); } #ifdef CONFIG_MODULES /* * If the exception table is sorted, any referring to the module init * will be at the beginning or the end. */ void trim_init_extable(struct module *m) { /*trim the beginning*/ while (m->num_exentries && within_module_init(ex_to_insn(&m->extable[0]), m)) { m->extable++; m->num_exentries--; } /*trim the end*/ while (m->num_exentries && within_module_init(ex_to_insn(&m->extable[m->num_exentries - 1]), m)) m->num_exentries--; } #endif /* CONFIG_MODULES */ static int cmp_ex_search(const void *key, const void *elt) { const struct exception_table_entry *_elt = elt; unsigned long _key = *(unsigned long *)key; /* avoid overflow */ if (_key > ex_to_insn(_elt)) return 1; if (_key < ex_to_insn(_elt)) return -1; return 0; } /* * Search one exception table for an entry corresponding to the * given instruction address, and return the address of the entry, * or NULL if none is found. * We use a binary search, and thus we assume that the table is * already sorted. */ const struct exception_table_entry * search_extable(const struct exception_table_entry *base, const size_t num, unsigned long value) { return bsearch(&value, base, num, sizeof(struct exception_table_entry), cmp_ex_search); } |
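A hedged sketch of how the lookup side above is typically consumed: a fault handler searches the sorted tables for the faulting instruction address and, if an entry exists, redirects execution to its fixup. The helper below is illustrative; real architectures do this in their fixup_exception() with arch-specific entry layouts and helpers.

/* Illustrative only: using the sorted-table lookup from a fault path. */
static bool try_exception_fixup(struct pt_regs *regs, unsigned long fault_ip)
{
	const struct exception_table_entry *fixup;

	/* Binary search over the sorted kernel (and module) exception tables. */
	fixup = search_exception_tables(fault_ip);
	if (!fixup)
		return false;	/* genuine fault: let the caller oops or signal */

	/*
	 * With the simple absolute layout (!ARCH_HAS_RELATIVE_EXTABLE),
	 * ->fixup is the address to resume at; relative layouts decode it
	 * the same way ex_to_insn() decodes ->insn above.
	 */
	instruction_pointer_set(regs, fixup->fixup);
	return true;
}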
889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 | // SPDX-License-Identifier: GPL-2.0-only /* * Generic helpers for smp ipi calls * * (C) Jens Axboe <jens.axboe@oracle.com> 2008 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/irq_work.h> #include <linux/rcupdate.h> #include <linux/rculist.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/gfp.h> #include <linux/smp.h> #include <linux/cpu.h> #include <linux/sched.h> #include <linux/sched/idle.h> #include <linux/hypervisor.h> #include <linux/sched/clock.h> #include <linux/nmi.h> #include <linux/sched/debug.h> #include <linux/jump_label.h> #include <linux/string_choices.h> #include <trace/events/ipi.h> #define CREATE_TRACE_POINTS #include <trace/events/csd.h> #undef CREATE_TRACE_POINTS #include "smpboot.h" #include "sched/smp.h" #define CSD_TYPE(_csd) ((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK) struct call_function_data { call_single_data_t __percpu *csd; cpumask_var_t cpumask; cpumask_var_t cpumask_ipi; }; static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data); static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); static DEFINE_PER_CPU(atomic_t, trigger_backtrace) = ATOMIC_INIT(1); static void __flush_smp_call_function_queue(bool warn_cpu_offline); int smpcfd_prepare_cpu(unsigned int cpu) { struct call_function_data *cfd = &per_cpu(cfd_data, cpu); if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, cpu_to_node(cpu))) return -ENOMEM; if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, cpu_to_node(cpu))) { free_cpumask_var(cfd->cpumask); return -ENOMEM; } cfd->csd = alloc_percpu(call_single_data_t); if (!cfd->csd) { free_cpumask_var(cfd->cpumask); free_cpumask_var(cfd->cpumask_ipi); return -ENOMEM; } return 0; } int smpcfd_dead_cpu(unsigned int cpu) { struct call_function_data *cfd = &per_cpu(cfd_data, cpu); free_cpumask_var(cfd->cpumask); free_cpumask_var(cfd->cpumask_ipi); free_percpu(cfd->csd); return 0; } int smpcfd_dying_cpu(unsigned int cpu) { /* * The IPIs for the smp-call-function callbacks queued by other CPUs * might arrive late, 
either due to hardware latencies or because this * CPU disabled interrupts (inside stop-machine) before the IPIs were * sent. So flush out any pending callbacks explicitly (without waiting * for the IPIs to arrive), to ensure that the outgoing CPU doesn't go * offline with work still pending. * * This runs with interrupts disabled inside the stopper task invoked by * stop_machine(), ensuring mutually exclusive CPU offlining and IPI flush. */ __flush_smp_call_function_queue(false); irq_work_run(); return 0; } void __init call_function_init(void) { int i; for_each_possible_cpu(i) init_llist_head(&per_cpu(call_single_queue, i)); smpcfd_prepare_cpu(smp_processor_id()); } static __always_inline void send_call_function_single_ipi(int cpu) { if (call_function_single_prep_ipi(cpu)) { trace_ipi_send_cpu(cpu, _RET_IP_, generic_smp_call_function_single_interrupt); arch_send_call_function_single_ipi(cpu); } } static __always_inline void send_call_function_ipi_mask(struct cpumask *mask) { trace_ipi_send_cpumask(mask, _RET_IP_, generic_smp_call_function_single_interrupt); arch_send_call_function_ipi_mask(mask); } static __always_inline void csd_do_func(smp_call_func_t func, void *info, call_single_data_t *csd) { trace_csd_function_entry(func, csd); func(info); trace_csd_function_exit(func, csd); } #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG static DEFINE_STATIC_KEY_MAYBE(CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT, csdlock_debug_enabled); /* * Parse the csdlock_debug= kernel boot parameter. * * If you need to restore the old "ext" value that once provided * additional debugging information, reapply the following commits: * * de7b09ef658d ("locking/csd_lock: Prepare more CSD lock debugging") * a5aabace5fb8 ("locking/csd_lock: Add more data to CSD lock debugging") */ static int __init csdlock_debug(char *str) { int ret; unsigned int val = 0; ret = get_option(&str, &val); if (ret) { if (val) static_branch_enable(&csdlock_debug_enabled); else static_branch_disable(&csdlock_debug_enabled); } return 1; } __setup("csdlock_debug=", csdlock_debug); static DEFINE_PER_CPU(call_single_data_t *, cur_csd); static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); static DEFINE_PER_CPU(void *, cur_csd_info); static ulong csd_lock_timeout = 5000; /* CSD lock timeout in milliseconds. */ module_param(csd_lock_timeout, ulong, 0644); static int panic_on_ipistall; /* CSD panic timeout in milliseconds, 300000 for five minutes. */ module_param(panic_on_ipistall, int, 0644); static atomic_t csd_bug_count = ATOMIC_INIT(0); /* Record current CSD work for current CPU, NULL to erase. */ static void __csd_lock_record(call_single_data_t *csd) { if (!csd) { smp_mb(); /* NULL cur_csd after unlock. */ __this_cpu_write(cur_csd, NULL); return; } __this_cpu_write(cur_csd_func, csd->func); __this_cpu_write(cur_csd_info, csd->info); smp_wmb(); /* func and info before csd. */ __this_cpu_write(cur_csd, csd); smp_mb(); /* Update cur_csd before function call. */ /* Or before unlock, as the case may be. */ } static __always_inline void csd_lock_record(call_single_data_t *csd) { if (static_branch_unlikely(&csdlock_debug_enabled)) __csd_lock_record(csd); } static int csd_lock_wait_getcpu(call_single_data_t *csd) { unsigned int csd_type; csd_type = CSD_TYPE(csd); if (csd_type == CSD_TYPE_ASYNC || csd_type == CSD_TYPE_SYNC) return csd->node.dst; /* Other CSD_TYPE_ values might not have ->dst. */ return -1; } static atomic_t n_csd_lock_stuck; /** * csd_lock_is_stuck - Has a CSD-lock acquisition been stuck too long? 
* * Returns: @true if a CSD-lock acquisition is stuck and has been stuck * long enough for a "non-responsive CSD lock" message to be printed. */ bool csd_lock_is_stuck(void) { return !!atomic_read(&n_csd_lock_stuck); } /* * Complain if too much time spent waiting. Note that only * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, * so waiting on other types gets much less information. */ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id, unsigned long *nmessages) { int cpu = -1; int cpux; bool firsttime; u64 ts2, ts_delta; call_single_data_t *cpu_cur_csd; unsigned int flags = READ_ONCE(csd->node.u_flags); unsigned long long csd_lock_timeout_ns = csd_lock_timeout * NSEC_PER_MSEC; if (!(flags & CSD_FLAG_LOCK)) { if (!unlikely(*bug_id)) return true; cpu = csd_lock_wait_getcpu(csd); pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n", *bug_id, raw_smp_processor_id(), cpu); atomic_dec(&n_csd_lock_stuck); return true; } ts2 = ktime_get_mono_fast_ns(); /* How long since we last checked for a stuck CSD lock.*/ ts_delta = ts2 - *ts1; if (likely(ts_delta <= csd_lock_timeout_ns * (*nmessages + 1) * (!*nmessages ? 1 : (ilog2(num_online_cpus()) / 2 + 1)) || csd_lock_timeout_ns == 0)) return false; if (ts0 > ts2) { /* Our own sched_clock went backward; don't blame another CPU. */ ts_delta = ts0 - ts2; pr_alert("sched_clock on CPU %d went backward by %llu ns\n", raw_smp_processor_id(), ts_delta); *ts1 = ts2; return false; } firsttime = !*bug_id; if (firsttime) *bug_id = atomic_inc_return(&csd_bug_count); cpu = csd_lock_wait_getcpu(csd); if (WARN_ONCE(cpu < 0 || cpu >= nr_cpu_ids, "%s: cpu = %d\n", __func__, cpu)) cpux = 0; else cpux = cpu; cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */ /* How long since this CSD lock was stuck. */ ts_delta = ts2 - ts0; pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %lld ns for CPU#%02d %pS(%ps).\n", firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), (s64)ts_delta, cpu, csd->func, csd->info); (*nmessages)++; if (firsttime) atomic_inc(&n_csd_lock_stuck); /* * If the CSD lock is still stuck after 5 minutes, it is unlikely * to become unstuck. Use a signed comparison to avoid triggering * on underflows when the TSC is out of sync between sockets. */ BUG_ON(panic_on_ipistall > 0 && (s64)ts_delta > ((s64)panic_on_ipistall * NSEC_PER_MSEC)); if (cpu_cur_csd && csd != cpu_cur_csd) { pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n", *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)), READ_ONCE(per_cpu(cur_csd_info, cpux))); } else { pr_alert("\tcsd: CSD lock (#%d) %s.\n", *bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request"); } if (cpu >= 0) { if (atomic_cmpxchg_acquire(&per_cpu(trigger_backtrace, cpu), 1, 0)) dump_cpu_task(cpu); if (!cpu_cur_csd) { pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu); arch_send_call_function_single_ipi(cpu); } } if (firsttime) dump_stack(); *ts1 = ts2; return false; } /* * csd_lock/csd_unlock used to serialize access to per-cpu csd resources * * For non-synchronous ipi calls the csd can still be in use by the * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. 
*/ static void __csd_lock_wait(call_single_data_t *csd) { unsigned long nmessages = 0; int bug_id = 0; u64 ts0, ts1; ts1 = ts0 = ktime_get_mono_fast_ns(); for (;;) { if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id, &nmessages)) break; cpu_relax(); } smp_acquire__after_ctrl_dep(); } static __always_inline void csd_lock_wait(call_single_data_t *csd) { if (static_branch_unlikely(&csdlock_debug_enabled)) { __csd_lock_wait(csd); return; } smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); } #else static void csd_lock_record(call_single_data_t *csd) { } static __always_inline void csd_lock_wait(call_single_data_t *csd) { smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); } #endif static __always_inline void csd_lock(call_single_data_t *csd) { csd_lock_wait(csd); csd->node.u_flags |= CSD_FLAG_LOCK; /* * prevent CPU from reordering the above assignment * to ->flags with any subsequent assignments to other * fields of the specified call_single_data_t structure: */ smp_wmb(); } static __always_inline void csd_unlock(call_single_data_t *csd) { WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK)); /* * ensure we're all done before releasing data: */ smp_store_release(&csd->node.u_flags, 0); } static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG static call_single_data_t *get_single_csd_data(int cpu) { if (static_branch_unlikely(&csdlock_debug_enabled)) return per_cpu_ptr(&csd_data, cpu); return this_cpu_ptr(&csd_data); } #else static call_single_data_t *get_single_csd_data(int cpu) { return this_cpu_ptr(&csd_data); } #endif void __smp_call_single_queue(int cpu, struct llist_node *node) { /* * We have to check the type of the CSD before queueing it, because * once queued it can have its flags cleared by * flush_smp_call_function_queue() * even if we haven't sent the smp_call IPI yet (e.g. the stopper * executes migration_cpu_stop() on the remote CPU). */ if (trace_csd_queue_cpu_enabled()) { call_single_data_t *csd; smp_call_func_t func; csd = container_of(node, call_single_data_t, node.llist); func = CSD_TYPE(csd) == CSD_TYPE_TTWU ? sched_ttwu_pending : csd->func; trace_call__csd_queue_cpu(cpu, _RET_IP_, func, csd); } /* * The list addition should be visible to the target CPU when it pops * the head of the list to pull the entry off it in the IPI handler * because of normal cache coherency rules implied by the underlying * llist ops. * * If IPIs can go out of order to the cache coherency protocol * in an architecture, sufficient synchronisation should be added * to arch code to make it appear to obey cache coherency WRT * locking and barrier primitives. Generic code isn't really * equipped to do the right thing... */ if (llist_add(node, &per_cpu(call_single_queue, cpu))) send_call_function_single_ipi(cpu); } /* * Insert a previously allocated call_single_data_t element * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. */ static int generic_exec_single(int cpu, call_single_data_t *csd) { /* * Preemption already disabled here so stopper cannot run on this CPU, * ensuring mutually exclusive CPU offlining and last IPI flush. */ if (cpu == smp_processor_id()) { smp_call_func_t func = csd->func; void *info = csd->info; unsigned long flags; /* * We can unlock early even for the synchronous on-stack case, * since we're doing this from the same CPU.. 
*/ csd_lock_record(csd); csd_unlock(csd); local_irq_save(flags); csd_do_func(func, info, NULL); csd_lock_record(NULL); local_irq_restore(flags); return 0; } if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) { csd_unlock(csd); return -ENXIO; } __smp_call_single_queue(cpu, &csd->node.llist); return 0; } /** * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks * * Invoked by arch to handle an IPI for call function single. * Must be called with interrupts disabled. */ void generic_smp_call_function_single_interrupt(void) { __flush_smp_call_function_queue(true); } /** * __flush_smp_call_function_queue - Flush pending smp-call-function callbacks * * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an * offline CPU. Skip this check if set to 'false'. * * Flush any pending smp-call-function callbacks queued on this CPU. This is * invoked by the generic IPI handler, as well as by a CPU about to go offline, * to ensure that all pending IPI callbacks are run before it goes completely * offline. * * Loop through the call_single_queue and run all the queued callbacks. * Must be called with interrupts disabled. */ static void __flush_smp_call_function_queue(bool warn_cpu_offline) { call_single_data_t *csd, *csd_next; struct llist_node *entry, *prev; struct llist_head *head; static bool warned; atomic_t *tbt; lockdep_assert_irqs_disabled(); /* Allow waiters to send backtrace NMI from here onwards */ tbt = this_cpu_ptr(&trigger_backtrace); atomic_set_release(tbt, 1); head = this_cpu_ptr(&call_single_queue); entry = llist_del_all(head); entry = llist_reverse_order(entry); /* There shouldn't be any pending callbacks on an offline CPU. */ if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) && !warned && entry != NULL)) { warned = true; WARN(1, "IPI on offline CPU %d\n", smp_processor_id()); /* * We don't have to use the _safe() variant here * because we are not invoking the IPI handlers yet. */ llist_for_each_entry(csd, entry, node.llist) { switch (CSD_TYPE(csd)) { case CSD_TYPE_ASYNC: case CSD_TYPE_SYNC: case CSD_TYPE_IRQ_WORK: pr_warn("IPI callback %pS sent to offline CPU\n", csd->func); break; case CSD_TYPE_TTWU: pr_warn("IPI task-wakeup sent to offline CPU\n"); break; default: pr_warn("IPI callback, unknown type %d, sent to offline CPU\n", CSD_TYPE(csd)); break; } } } /* * First; run all SYNC callbacks, people are waiting for us. */ prev = NULL; llist_for_each_entry_safe(csd, csd_next, entry, node.llist) { /* Do we wait until *after* callback? */ if (CSD_TYPE(csd) == CSD_TYPE_SYNC) { smp_call_func_t func = csd->func; void *info = csd->info; if (prev) { prev->next = &csd_next->node.llist; } else { entry = &csd_next->node.llist; } csd_lock_record(csd); csd_do_func(func, info, csd); csd_unlock(csd); csd_lock_record(NULL); } else { prev = &csd->node.llist; } } if (!entry) return; /* * Second; run all !SYNC callbacks. */ prev = NULL; llist_for_each_entry_safe(csd, csd_next, entry, node.llist) { int type = CSD_TYPE(csd); if (type != CSD_TYPE_TTWU) { if (prev) { prev->next = &csd_next->node.llist; } else { entry = &csd_next->node.llist; } if (type == CSD_TYPE_ASYNC) { smp_call_func_t func = csd->func; void *info = csd->info; csd_lock_record(csd); csd_unlock(csd); csd_do_func(func, info, csd); csd_lock_record(NULL); } else if (type == CSD_TYPE_IRQ_WORK) { irq_work_single(csd); } } else { prev = &csd->node.llist; } } /* * Third; only CSD_TYPE_TTWU is left, issue those. 
*/ if (entry) { csd = llist_entry(entry, typeof(*csd), node.llist); csd_do_func(sched_ttwu_pending, entry, csd); } } /** * flush_smp_call_function_queue - Flush pending smp-call-function callbacks * from task context (idle, migration thread) * * When TIF_POLLING_NRFLAG is supported and a CPU is in idle and has it * set, then remote CPUs can avoid sending IPIs and wake the idle CPU by * setting TIF_NEED_RESCHED. The idle task on the woken up CPU has to * handle queued SMP function calls before scheduling. * * The migration thread has to ensure that an eventually pending wakeup has * been handled before it migrates a task. */ void flush_smp_call_function_queue(void) { unsigned int was_pending; unsigned long flags; if (llist_empty(this_cpu_ptr(&call_single_queue))) return; local_irq_save(flags); /* Get the already pending soft interrupts for RT enabled kernels */ was_pending = local_softirq_pending(); __flush_smp_call_function_queue(true); if (local_softirq_pending()) do_softirq_post_smp_call_flush(was_pending); local_irq_restore(flags); } /** * smp_call_function_single - Run a function on a specific CPU * @cpu: Specific target CPU for this function. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait until function has completed on other CPUs. * * Returns: %0 on success, else a negative status code. */ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, int wait) { call_single_data_t *csd; call_single_data_t csd_stack = { .node = { .u_flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, }, }; int this_cpu; int err; /* * Prevent preemption and reschedule on another CPU, as well as CPU * removal. This prevents stopper from running on this CPU, thus * providing mutual exclusion of the below cpu_online() check and * IPI sending ensuring IPI are not missed by CPU going offline. */ this_cpu = get_cpu(); /* * Can deadlock when called with interrupts disabled. * We allow cpu's that are not yet online though, as no one else can * send smp call function interrupt to this cpu and as such deadlocks * can't happen. */ WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() && !oops_in_progress); /* * When @wait we can deadlock when we interrupt between llist_add() and * arch_send_call_function_ipi*(); when !@wait we can deadlock due to * csd_lock() on because the interrupt context uses the same csd * storage. */ WARN_ON_ONCE(!in_task()); csd = &csd_stack; if (!wait) { csd = get_single_csd_data(cpu); csd_lock(csd); } csd->func = func; csd->info = info; #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG csd->node.src = this_cpu; csd->node.dst = cpu; #endif err = generic_exec_single(cpu, csd); if (wait) csd_lock_wait(csd); put_cpu(); return err; } EXPORT_SYMBOL(smp_call_function_single); /** * smp_call_function_single_async() - Run an asynchronous function on a * specific CPU. * @cpu: The CPU to run on. * @csd: Pre-allocated and setup data structure * * Like smp_call_function_single(), but the call is asynchonous and * can thus be done from contexts with disabled interrupts. * * The caller passes his own pre-allocated data structure * (ie: embedded in an object) and is responsible for synchronizing it * such that the IPIs performed on the @csd are strictly serialized. * * If the function is called with one csd which has not yet been * processed by previous call to smp_call_function_single_async(), the * function will return immediately with -EBUSY showing that the csd * object is still in progress. 
* * NOTE: Be careful, there is unfortunately no current debugging facility to * validate the correctness of this serialization. * * Return: %0 on success or negative errno value on error */ int smp_call_function_single_async(int cpu, call_single_data_t *csd) { int err = 0; preempt_disable(); if (csd->node.u_flags & CSD_FLAG_LOCK) { err = -EBUSY; goto out; } csd->node.u_flags = CSD_FLAG_LOCK; smp_wmb(); err = generic_exec_single(cpu, csd); out: preempt_enable(); return err; } EXPORT_SYMBOL_GPL(smp_call_function_single_async); /** * smp_call_function_any - Run a function on any of the given cpus * @mask: The mask of cpus it can run on. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait until function has completed. * * Selection preference: * 1) current cpu if in @mask * 2) nearest cpu in @mask, based on NUMA topology * * Returns: %0 on success, else a negative status code (if no cpus were online). */ int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait) { unsigned int cpu; int ret; /* Try for same CPU (cheapest) */ cpu = get_cpu(); if (!cpumask_test_cpu(cpu, mask)) cpu = sched_numa_find_nth_cpu(mask, 0, cpu_to_node(cpu)); ret = smp_call_function_single(cpu, func, info, wait); put_cpu(); return ret; } EXPORT_SYMBOL_GPL(smp_call_function_any); /* * Flags to be used as scf_flags argument of smp_call_function_many_cond(). * * %SCF_WAIT: Wait until function execution is completed * %SCF_RUN_LOCAL: Run also locally if local cpu is set in cpumask */ #define SCF_WAIT (1U << 0) #define SCF_RUN_LOCAL (1U << 1) static void smp_call_function_many_cond(const struct cpumask *mask, smp_call_func_t func, void *info, unsigned int scf_flags, smp_cond_func_t cond_func) { int cpu, last_cpu, this_cpu = smp_processor_id(); struct call_function_data *cfd; bool wait = scf_flags & SCF_WAIT; int nr_cpus = 0; bool run_remote = false; lockdep_assert_preemption_disabled(); /* * Can deadlock when called with interrupts disabled. * We allow cpu's that are not yet online though, as no one else can * send smp call function interrupt to this cpu and as such deadlocks * can't happen. */ if (cpu_online(this_cpu) && !oops_in_progress && !early_boot_irqs_disabled) lockdep_assert_irqs_enabled(); /* * When @wait we can deadlock when we interrupt between llist_add() and * arch_send_call_function_ipi*(); when !@wait we can deadlock due to * csd_lock() on because the interrupt context uses the same csd * storage. */ WARN_ON_ONCE(!in_task()); /* Check if we need remote execution, i.e., any CPU excluding this one. */ if (cpumask_any_and_but(mask, cpu_online_mask, this_cpu) < nr_cpu_ids) { cfd = this_cpu_ptr(&cfd_data); cpumask_and(cfd->cpumask, mask, cpu_online_mask); __cpumask_clear_cpu(this_cpu, cfd->cpumask); cpumask_clear(cfd->cpumask_ipi); for_each_cpu(cpu, cfd->cpumask) { call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu); if (cond_func && !cond_func(cpu, info)) { __cpumask_clear_cpu(cpu, cfd->cpumask); continue; } /* Work is enqueued on a remote CPU. */ run_remote = true; csd_lock(csd); if (wait) csd->node.u_flags |= CSD_TYPE_SYNC; csd->func = func; csd->info = info; #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG csd->node.src = this_cpu; csd->node.dst = cpu; #endif trace_csd_queue_cpu(cpu, _RET_IP_, func, csd); /* * Kick the remote CPU if this is the first work * item enqueued. 
*/ if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) { __cpumask_set_cpu(cpu, cfd->cpumask_ipi); nr_cpus++; last_cpu = cpu; } } /* * Choose the most efficient way to send an IPI. Note that the * number of CPUs might be zero due to concurrent changes to the * provided mask. */ if (nr_cpus == 1) send_call_function_single_ipi(last_cpu); else if (likely(nr_cpus > 1)) send_call_function_ipi_mask(cfd->cpumask_ipi); } /* Check if we need local execution. */ if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask) && (!cond_func || cond_func(this_cpu, info))) { unsigned long flags; local_irq_save(flags); csd_do_func(func, info, NULL); local_irq_restore(flags); } if (run_remote && wait) { for_each_cpu(cpu, cfd->cpumask) { call_single_data_t *csd; csd = per_cpu_ptr(cfd->csd, cpu); csd_lock_wait(csd); } } } /** * smp_call_function_many() - Run a function on a set of CPUs. * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait (atomically) until function has completed * on other CPUs. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. Preemption * must be disabled when calling this function. * * @func is not called on the local CPU even if @mask contains it. Consider * using on_each_cpu_cond_mask() instead if this is not desirable. */ void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) { smp_call_function_many_cond(mask, func, info, wait * SCF_WAIT, NULL); } EXPORT_SYMBOL(smp_call_function_many); /** * smp_call_function() - Run a function on all other CPUs. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait (atomically) until function has completed * on other CPUs. * * If @wait is true, then returns once @func has returned; otherwise * it returns just before the target cpu calls @func. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. */ void smp_call_function(smp_call_func_t func, void *info, int wait) { preempt_disable(); smp_call_function_many(cpu_online_mask, func, info, wait); preempt_enable(); } EXPORT_SYMBOL(smp_call_function); /* Setup configured maximum number of CPUs to activate */ unsigned int setup_max_cpus = NR_CPUS; EXPORT_SYMBOL(setup_max_cpus); /* * Setup routine for controlling SMP activation * * Command-line option of "nosmp" or "maxcpus=0" will disable SMP * activation entirely (the MPS table probe still happens, though). * * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer * greater than 0, limits the maximum number of CPUs activated in * SMP mode to <NUM>. 
*/ void __weak __init arch_disable_smp_support(void) { } static int __init nosmp(char *str) { setup_max_cpus = 0; arch_disable_smp_support(); return 0; } early_param("nosmp", nosmp); /* this is hard limit */ static int __init nrcpus(char *str) { int nr_cpus; if (get_option(&str, &nr_cpus) && nr_cpus > 0 && nr_cpus < nr_cpu_ids) set_nr_cpu_ids(nr_cpus); return 0; } early_param("nr_cpus", nrcpus); static int __init maxcpus(char *str) { get_option(&str, &setup_max_cpus); if (setup_max_cpus == 0) arch_disable_smp_support(); return 0; } early_param("maxcpus", maxcpus); #if (NR_CPUS > 1) && !defined(CONFIG_FORCE_NR_CPUS) /* Setup number of possible processor ids */ unsigned int nr_cpu_ids __read_mostly = NR_CPUS; EXPORT_SYMBOL(nr_cpu_ids); #endif /* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ void __init setup_nr_cpu_ids(void) { set_nr_cpu_ids(find_last_bit(cpumask_bits(cpu_possible_mask), NR_CPUS) + 1); } /* Called by boot processor to activate the rest. */ void __init smp_init(void) { int num_nodes, num_cpus; idle_threads_init(); cpuhp_threads_init(); pr_info("Bringing up secondary CPUs ...\n"); bringup_nonboot_cpus(setup_max_cpus); num_nodes = num_online_nodes(); num_cpus = num_online_cpus(); pr_info("Brought up %d node%s, %d CPU%s\n", num_nodes, str_plural(num_nodes), num_cpus, str_plural(num_cpus)); /* Any cleanup work */ smp_cpus_done(setup_max_cpus); } /** * on_each_cpu_cond_mask() - Call a function on each processor for which * the supplied function cond_func returns true, optionally waiting * for all the required CPUs to finish. This may include the local * processor. * @cond_func: A callback function that is passed a cpu id and * the info parameter. The function is called * with preemption disabled. The function should * return a boolean value indicating whether to IPI * the specified CPU. * @func: The function to run on all applicable CPUs. * This must be fast and non-blocking. * @info: An arbitrary pointer to pass to both functions. * @wait: If true, wait (atomically) until function has * completed on other CPUs. * @mask: The set of cpus to run on (only runs on online subset). * * Preemption is disabled to protect against CPUs going offline but not online. * CPUs going online during the call will not be seen or sent an IPI. * * You must not call this function with disabled interrupts or * from a hardware interrupt handler or from a bottom half handler. */ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, void *info, bool wait, const struct cpumask *mask) { unsigned int scf_flags = SCF_RUN_LOCAL; if (wait) scf_flags |= SCF_WAIT; preempt_disable(); smp_call_function_many_cond(mask, func, info, scf_flags, cond_func); preempt_enable(); } EXPORT_SYMBOL(on_each_cpu_cond_mask); static void do_nothing(void *unused) { } /** * kick_all_cpus_sync - Force all cpus out of idle * * Used to synchronize the update of pm_idle function pointer. It's * called after the pointer is updated and returns after the dummy * callback function has been executed on all cpus. The execution of * the function can only happen on the remote cpus after they have * left the idle function which had been called via pm_idle function * pointer. So it's guaranteed that nothing uses the previous pointer * anymore. 
*/ void kick_all_cpus_sync(void) { /* Make sure the change is visible before we kick the cpus */ smp_mb(); smp_call_function(do_nothing, NULL, 1); } EXPORT_SYMBOL_GPL(kick_all_cpus_sync); /** * wake_up_all_idle_cpus - break all cpus out of idle * wake_up_all_idle_cpus try to break all cpus which is in idle state even * including idle polling cpus, for non-idle cpus, we will do nothing * for them. */ void wake_up_all_idle_cpus(void) { int cpu; for_each_possible_cpu(cpu) { preempt_disable(); if (cpu != smp_processor_id() && cpu_online(cpu)) wake_up_if_idle(cpu); preempt_enable(); } } EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); /** * cpus_peek_for_pending_ipi - Check for pending IPI for CPUs * @mask: The CPU mask for the CPUs to check. * * This function walks through the @mask to check if there are any pending IPIs * scheduled, for any of the CPUs in the @mask. It does not guarantee * correctness as it only provides a racy snapshot. * * Returns: true if there is a pending IPI scheduled and false otherwise. */ bool cpus_peek_for_pending_ipi(const struct cpumask *mask) { unsigned int cpu; for_each_cpu(cpu, mask) { if (!llist_empty(per_cpu_ptr(&call_single_queue, cpu))) return true; } return false; } /** * struct smp_call_on_cpu_struct - Call a function on a specific CPU * @work: &work_struct * @done: &completion to signal * @func: function to call * @data: function's data argument * @ret: return value from @func * @cpu: target CPU (%-1 for any CPU) * * Used to call a function on a specific cpu and wait for it to return. * Optionally make sure the call is done on a specified physical cpu via vcpu * pinning in order to support virtualized environments. */ struct smp_call_on_cpu_struct { struct work_struct work; struct completion done; int (*func)(void *); void *data; int ret; int cpu; }; static void smp_call_on_cpu_callback(struct work_struct *work) { struct smp_call_on_cpu_struct *sscs; sscs = container_of(work, struct smp_call_on_cpu_struct, work); if (sscs->cpu >= 0) hypervisor_pin_vcpu(sscs->cpu); sscs->ret = sscs->func(sscs->data); if (sscs->cpu >= 0) hypervisor_pin_vcpu(-1); complete(&sscs->done); } /** * smp_call_on_cpu() - Call a function on a specific CPU and wait * for it to return. * @cpu: The CPU to run on. * @func: The function to run * @par: An arbitrary pointer parameter for @func. * @phys: If @true, force to run on physical @cpu. See * &struct smp_call_on_cpu_struct for more info. * * Returns: %-ENXIO if the @cpu is invalid; otherwise the return value * from @func. */ int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) { struct smp_call_on_cpu_struct sscs = { .done = COMPLETION_INITIALIZER_ONSTACK(sscs.done), .func = func, .data = par, .cpu = phys ? cpu : -1, }; INIT_WORK_ONSTACK(&sscs.work, smp_call_on_cpu_callback); if (cpu >= nr_cpu_ids || !cpu_online(cpu)) return -ENXIO; queue_work_on(cpu, system_percpu_wq, &sscs.work); wait_for_completion(&sscs.done); destroy_work_on_stack(&sscs.work); return sscs.ret; } EXPORT_SYMBOL_GPL(smp_call_on_cpu); |
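/*
 * Illustrative sketch, not part of kernel/smp.c above: a hypothetical
 * caller of smp_call_function_single(). The names record_cpu_id() and
 * demo_cross_cpu_call() are invented for this example only.
 */
#include <linux/smp.h>
#include <linux/printk.h>

/* Runs on the target CPU, in IPI context, with interrupts disabled. */
static void record_cpu_id(void *info)
{
        *(int *)info = smp_processor_id();
}

static void demo_cross_cpu_call(void)
{
        int executing_cpu = -1;
        int ret;

        /*
         * Run record_cpu_id() on CPU 1 and wait for it to complete
         * (wait == 1), so executing_cpu is valid once the call returns.
         * Must be called from task context with interrupts enabled.
         */
        ret = smp_call_function_single(1, record_cpu_id, &executing_cpu, 1);
        if (ret)
                pr_warn("demo: CPU 1 offline or invalid (%d)\n", ret);
        else
                pr_info("demo: callback ran on CPU %d\n", executing_cpu);
}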
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FORTIFY_STRING_H_ #define _LINUX_FORTIFY_STRING_H_ #include <linux/bug.h> #include <linux/const.h> #include <linux/limits.h> #define __FORTIFY_INLINE extern __always_inline __gnu_inline __overloadable #define __RENAME(x) __asm__(#x) #define FORTIFY_REASON_DIR(r) ((r) & 1) #define FORTIFY_REASON_FUNC(r) ((r) >> 1) #define
FORTIFY_REASON(func, write) ((func) << 1 | (write)) /* Overridden by KUnit tests. */ #ifndef fortify_panic # define fortify_panic(func, write, avail, size, retfail) \ __fortify_panic(FORTIFY_REASON(func, write), avail, size) #endif #ifndef fortify_warn_once # define fortify_warn_once(x...) WARN_ONCE(x) #endif #define FORTIFY_READ 0 #define FORTIFY_WRITE 1 #define EACH_FORTIFY_FUNC(macro) \ macro(strncpy), \ macro(strnlen), \ macro(strlen), \ macro(strscpy), \ macro(strlcat), \ macro(strcat), \ macro(strncat), \ macro(memset), \ macro(memcpy), \ macro(memmove), \ macro(memscan), \ macro(memcmp), \ macro(memchr), \ macro(memchr_inv), \ macro(kmemdup), \ macro(strcpy), \ macro(UNKNOWN), #define MAKE_FORTIFY_FUNC(func) FORTIFY_FUNC_##func enum fortify_func { EACH_FORTIFY_FUNC(MAKE_FORTIFY_FUNC) }; void __fortify_report(const u8 reason, const size_t avail, const size_t size); void __fortify_panic(const u8 reason, const size_t avail, const size_t size) __cold __noreturn; void __read_overflow(void) __compiletime_error("detected read beyond size of object (1st parameter)"); void __read_overflow2(void) __compiletime_error("detected read beyond size of object (2nd parameter)"); void __read_overflow2_field(size_t avail, size_t wanted) __compiletime_warning("detected read beyond size of field (2nd parameter); maybe use struct_group()?"); void __write_overflow(void) __compiletime_error("detected write beyond size of object (1st parameter)"); void __write_overflow_field(size_t avail, size_t wanted) __compiletime_warning("detected write beyond size of field (1st parameter); maybe use struct_group()?"); #define __compiletime_strlen(p) \ ({ \ char *__p = (char *)(p); \ size_t __ret = SIZE_MAX; \ const size_t __p_size = __member_size(p); \ if (__p_size != SIZE_MAX && \ __builtin_constant_p(*__p)) { \ size_t __p_len = __p_size - 1; \ if (__builtin_constant_p(__p[__p_len]) && \ __p[__p_len] == '\0') \ __ret = __builtin_strlen(__p); \ } \ __ret; \ }) #if defined(__SANITIZE_ADDRESS__) #if !defined(CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX) && !defined(CONFIG_GENERIC_ENTRY) extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset); extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove); extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy); #elif defined(CONFIG_KASAN_GENERIC) extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(__asan_memset); extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(__asan_memmove); extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(__asan_memcpy); #else /* CONFIG_KASAN_SW_TAGS */ extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(__hwasan_memset); extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(__hwasan_memmove); extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(__hwasan_memcpy); #endif extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr); extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp); extern char *__underlying_strcat(char *p, const char *q) __RENAME(strcat); extern char *__underlying_strcpy(char *p, const char *q) __RENAME(strcpy); extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen); extern char *__underlying_strncat(char *p, const char *q, 
__kernel_size_t count) __RENAME(strncat); extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy); #else #if defined(__SANITIZE_MEMORY__) /* * For KMSAN builds all memcpy/memset/memmove calls should be replaced by the * corresponding __msan_XXX functions. */ #include <linux/kmsan_string.h> #define __underlying_memcpy __msan_memcpy #define __underlying_memmove __msan_memmove #define __underlying_memset __msan_memset #else #define __underlying_memcpy __builtin_memcpy #define __underlying_memmove __builtin_memmove #define __underlying_memset __builtin_memset #endif #define __underlying_memchr __builtin_memchr #define __underlying_memcmp __builtin_memcmp #define __underlying_strcat __builtin_strcat #define __underlying_strcpy __builtin_strcpy #define __underlying_strlen __builtin_strlen #define __underlying_strncat __builtin_strncat #define __underlying_strncpy __builtin_strncpy #endif /** * unsafe_memcpy - memcpy implementation with no FORTIFY bounds checking * * @dst: Destination memory address to write to * @src: Source memory address to read from * @bytes: How many bytes to write to @dst from @src * @justification: Free-form text or comment describing why the use is needed * * This should be used for corner cases where the compiler cannot do the * right thing, or during transitions between APIs, etc. It should be used * very rarely, and includes a place for justification detailing where bounds * checking has happened, and why existing solutions cannot be employed. */ #define unsafe_memcpy(dst, src, bytes, justification) \ __underlying_memcpy(dst, src, bytes) /* * Clang's use of __builtin_*object_size() within inlines needs hinting via * __pass_*object_size(). The preference is to only ever use type 1 (member * size, rather than struct size), but there remain some stragglers using * type 0 that will be converted in the future. */ #if __has_builtin(__builtin_dynamic_object_size) #define POS __pass_dynamic_object_size(1) #define POS0 __pass_dynamic_object_size(0) #else #define POS __pass_object_size(1) #define POS0 __pass_object_size(0) #endif #define __compiletime_lessthan(bounds, length) ( \ __builtin_constant_p((bounds) < (length)) && \ (bounds) < (length) \ ) /** * strncpy - Copy a string to memory with non-guaranteed NUL padding * * @p: pointer to destination of copy * @q: pointer to NUL-terminated source string to copy * @size: bytes to write at @p * * If strlen(@q) >= @size, the copy of @q will stop after @size bytes, * and @p will NOT be NUL-terminated * * If strlen(@q) < @size, following the copy of @q, trailing NUL bytes * will be written to @p until @size total bytes have been written. * * Do not use this function. While FORTIFY_SOURCE tries to avoid * over-reads of @q, it cannot defend against writing unterminated * results to @p. Using strncpy() remains ambiguous and fragile. 
* Instead, please choose an alternative, so that the expectation * of @p's contents is unambiguous: * * +--------------------+--------------------+------------+ * | **p** needs to be: | padded to **size** | not padded | * +====================+====================+============+ * | NUL-terminated | strscpy_pad() | strscpy() | * +--------------------+--------------------+------------+ * | not NUL-terminated | strtomem_pad() | strtomem() | * +--------------------+--------------------+------------+ * * Note strscpy*()'s differing return values for detecting truncation, * and strtomem*()'s expectation that the destination is marked with * __nonstring when it is a character array. * */ __FORTIFY_INLINE __diagnose_as(__builtin_strncpy, 1, 2, 3) char *strncpy(char * const POS p, const char *q, __kernel_size_t size) { const size_t p_size = __member_size(p); if (__compiletime_lessthan(p_size, size)) __write_overflow(); if (p_size < size) fortify_panic(FORTIFY_FUNC_strncpy, FORTIFY_WRITE, p_size, size, p); return __underlying_strncpy(p, q, size); } extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); /** * strnlen - Return bounded count of characters in a NUL-terminated string * * @p: pointer to NUL-terminated string to count. * @maxlen: maximum number of characters to count. * * Returns number of characters in @p (NOT including the final NUL), or * @maxlen, if no NUL has been found up to there. * */ __FORTIFY_INLINE __kernel_size_t strnlen(const char * const POS p, __kernel_size_t maxlen) { const size_t p_size = __member_size(p); const size_t p_len = __compiletime_strlen(p); size_t ret; /* We can take compile-time actions when maxlen is const. */ if (__builtin_constant_p(maxlen) && p_len != SIZE_MAX) { /* If p is const, we can use its compile-time-known len. */ if (maxlen >= p_size) return p_len; } /* Do not check characters beyond the end of p. */ ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size); if (p_size <= ret && maxlen != ret) fortify_panic(FORTIFY_FUNC_strnlen, FORTIFY_READ, p_size, ret + 1, ret); return ret; } /* * Defined after fortified strnlen to reuse it. However, it must still be * possible for strlen() to be used on compile-time strings for use in * static initializers (i.e. as a constant expression). */ /** * strlen - Return count of characters in a NUL-terminated string * * @p: pointer to NUL-terminated string to count. * * Do not use this function unless the string length is known at * compile-time. When @p is unterminated, this function may crash * or return unexpected counts that could lead to memory content * exposures. Prefer strnlen(). * * Returns number of characters in @p (NOT including the final NUL). * */ #define strlen(p) \ __builtin_choose_expr(__is_constexpr(__builtin_strlen(p)), \ __builtin_strlen(p), __fortify_strlen(p)) __FORTIFY_INLINE __diagnose_as(__builtin_strlen, 1) __kernel_size_t __fortify_strlen(const char * const POS p) { const size_t p_size = __member_size(p); __kernel_size_t ret; /* Give up if we don't know how large p is. */ if (p_size == SIZE_MAX) return __underlying_strlen(p); ret = strnlen(p, p_size); if (p_size <= ret) fortify_panic(FORTIFY_FUNC_strlen, FORTIFY_READ, p_size, ret + 1, ret); return ret; } /* Defined after fortified strnlen() to reuse it. */ extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(sized_strscpy); __FORTIFY_INLINE ssize_t sized_strscpy(char * const POS p, const char * const POS q, size_t size) { /* Use string size rather than possible enclosing struct size. 
*/ const size_t p_size = __member_size(p); const size_t q_size = __member_size(q); size_t len; /* If we cannot get size of p and q default to call strscpy. */ if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __real_strscpy(p, q, size); /* * If size can be known at compile time and is greater than * p_size, generate a compile time write overflow error. */ if (__compiletime_lessthan(p_size, size)) __write_overflow(); /* Short-circuit for compile-time known-safe lengths. */ if (__compiletime_lessthan(p_size, SIZE_MAX)) { len = __compiletime_strlen(q); if (len < SIZE_MAX && __compiletime_lessthan(len, size)) { __underlying_memcpy(p, q, len + 1); return len; } } /* * This call protects from read overflow, because len will default to q * length if it smaller than size. */ len = strnlen(q, size); /* * If len equals size, we will copy only size bytes which leads to * -E2BIG being returned. * Otherwise we will copy len + 1 because of the final '\O'. */ len = len == size ? size : len + 1; /* * Generate a runtime write overflow error if len is greater than * p_size. */ if (p_size < len) fortify_panic(FORTIFY_FUNC_strscpy, FORTIFY_WRITE, p_size, len, -E2BIG); /* * We can now safely call vanilla strscpy because we are protected from: * 1. Read overflow thanks to call to strnlen(). * 2. Write overflow thanks to above ifs. */ return __real_strscpy(p, q, len); } /* Defined after fortified strlen() to reuse it. */ extern size_t __real_strlcat(char *p, const char *q, size_t avail) __RENAME(strlcat); /** * strlcat - Append a string to an existing string * * @p: pointer to %NUL-terminated string to append to * @q: pointer to %NUL-terminated string to append from * @avail: Maximum bytes available in @p * * Appends %NUL-terminated string @q after the %NUL-terminated * string at @p, but will not write beyond @avail bytes total, * potentially truncating the copy from @q. @p will stay * %NUL-terminated only if a %NUL already existed within * the @avail bytes of @p. If so, the resulting number of * bytes copied from @q will be at most "@avail - strlen(@p) - 1". * * Do not use this function. While FORTIFY_SOURCE tries to avoid * read and write overflows, this is only possible when the sizes * of @p and @q are known to the compiler. Prefer building the * string with formatting, via scnprintf(), seq_buf, or similar. * * Returns total bytes that _would_ have been contained by @p * regardless of truncation, similar to snprintf(). If return * value is >= @avail, the string has been truncated. * */ __FORTIFY_INLINE size_t strlcat(char * const POS p, const char * const POS q, size_t avail) { const size_t p_size = __member_size(p); const size_t q_size = __member_size(q); size_t p_len, copy_len; size_t actual, wanted; /* Give up immediately if both buffer sizes are unknown. */ if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __real_strlcat(p, q, avail); p_len = strnlen(p, avail); copy_len = strlen(q); wanted = actual = p_len + copy_len; /* Cannot append any more: report truncation. */ if (avail <= p_len) return wanted; /* Give up if string is already overflowed. */ if (p_size <= p_len) fortify_panic(FORTIFY_FUNC_strlcat, FORTIFY_READ, p_size, p_len + 1, wanted); if (actual >= avail) { copy_len = avail - p_len - 1; actual = p_len + copy_len; } /* Give up if copy will overflow. */ if (p_size <= actual) fortify_panic(FORTIFY_FUNC_strlcat, FORTIFY_WRITE, p_size, actual + 1, wanted); __underlying_memcpy(p + p_len, q, copy_len); p[actual] = '\0'; return wanted; } /* Defined after fortified strlcat() to reuse it. 
*/ /** * strcat - Append a string to an existing string * * @p: pointer to NUL-terminated string to append to * @q: pointer to NUL-terminated source string to append from * * Do not use this function. While FORTIFY_SOURCE tries to avoid * read and write overflows, this is only possible when the * destination buffer size is known to the compiler. Prefer * building the string with formatting, via scnprintf() or similar. * At the very least, use strncat(). * * Returns @p. * */ __FORTIFY_INLINE __diagnose_as(__builtin_strcat, 1, 2) char *strcat(char * const POS p, const char *q) { const size_t p_size = __member_size(p); const size_t wanted = strlcat(p, q, p_size); if (p_size <= wanted) fortify_panic(FORTIFY_FUNC_strcat, FORTIFY_WRITE, p_size, wanted + 1, p); return p; } /** * strncat - Append a string to an existing string * * @p: pointer to NUL-terminated string to append to * @q: pointer to source string to append from * @count: Maximum bytes to read from @q * * Appends at most @count bytes from @q (stopping at the first * NUL byte) after the NUL-terminated string at @p. @p will be * NUL-terminated. * * Do not use this function. While FORTIFY_SOURCE tries to avoid * read and write overflows, this is only possible when the sizes * of @p and @q are known to the compiler. Prefer building the * string with formatting, via scnprintf() or similar. * * Returns @p. * */ /* Defined after fortified strlen() and strnlen() to reuse them. */ __FORTIFY_INLINE __diagnose_as(__builtin_strncat, 1, 2, 3) char *strncat(char * const POS p, const char * const POS q, __kernel_size_t count) { const size_t p_size = __member_size(p); const size_t q_size = __member_size(q); size_t p_len, copy_len, total; if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __underlying_strncat(p, q, count); p_len = strlen(p); copy_len = strnlen(q, count); total = p_len + copy_len + 1; if (p_size < total) fortify_panic(FORTIFY_FUNC_strncat, FORTIFY_WRITE, p_size, total, p); __underlying_memcpy(p + p_len, q, copy_len); p[p_len + copy_len] = '\0'; return p; } __FORTIFY_INLINE bool fortify_memset_chk(__kernel_size_t size, const size_t p_size, const size_t p_size_field) { if (__builtin_constant_p(size)) { /* * Length argument is a constant expression, so we * can perform compile-time bounds checking where * buffer sizes are also known at compile time. */ /* Error when size is larger than enclosing struct. */ if (__compiletime_lessthan(p_size_field, p_size) && __compiletime_lessthan(p_size, size)) __write_overflow(); /* Warn when write size is larger than dest field. */ if (__compiletime_lessthan(p_size_field, size)) __write_overflow_field(p_size_field, size); } /* * At this point, length argument may not be a constant expression, * so run-time bounds checking can be done where buffer sizes are * known. (This is not an "else" because the above checks may only * be compile-time warnings, and we want to still warn for run-time * overflows.) */ /* * Always stop accesses beyond the struct that contains the * field, when the buffer's remaining size is known. * (The SIZE_MAX test is to optimize away checks where the buffer * lengths are unknown.) 
*/ if (p_size != SIZE_MAX && p_size < size) fortify_panic(FORTIFY_FUNC_memset, FORTIFY_WRITE, p_size, size, true); return false; } #define __fortify_memset_chk(p, c, size, p_size, p_size_field) ({ \ size_t __fortify_size = (size_t)(size); \ fortify_memset_chk(__fortify_size, p_size, p_size_field), \ __underlying_memset(p, c, __fortify_size); \ }) /* * __struct_size() vs __member_size() must be captured here to avoid * evaluating argument side-effects further into the macro layers. */ #ifndef CONFIG_KMSAN #define memset(p, c, s) __fortify_memset_chk(p, c, s, \ __struct_size(p), __member_size(p)) #endif /* * To make sure the compiler can enforce protection against buffer overflows, * memcpy(), memmove(), and memset() must not be used beyond individual * struct members. If you need to copy across multiple members, please use * struct_group() to create a named mirror of an anonymous struct union. * (e.g. see struct sk_buff.) Read overflow checking is currently only * done when a write overflow is also present, or when building with W=1. * * Mitigation coverage matrix * Bounds checking at: * +-------+-------+-------+-------+ * | Compile time | Run time | * memcpy() argument sizes: | write | read | write | read | * dest source length +-------+-------+-------+-------+ * memcpy(known, known, constant) | y | y | n/a | n/a | * memcpy(known, unknown, constant) | y | n | n/a | V | * memcpy(known, known, dynamic) | n | n | B | B | * memcpy(known, unknown, dynamic) | n | n | B | V | * memcpy(unknown, known, constant) | n | y | V | n/a | * memcpy(unknown, unknown, constant) | n | n | V | V | * memcpy(unknown, known, dynamic) | n | n | V | B | * memcpy(unknown, unknown, dynamic) | n | n | V | V | * +-------+-------+-------+-------+ * * y = perform deterministic compile-time bounds checking * n = cannot perform deterministic compile-time bounds checking * n/a = no run-time bounds checking needed since compile-time deterministic * B = can perform run-time bounds checking (currently unimplemented) * V = vulnerable to run-time overflow (will need refactoring to solve) * */ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, const size_t p_size, const size_t q_size, const size_t p_size_field, const size_t q_size_field, const u8 func) { if (__builtin_constant_p(size)) { /* * Length argument is a constant expression, so we * can perform compile-time bounds checking where * buffer sizes are also known at compile time. */ /* Error when size is larger than enclosing struct. */ if (__compiletime_lessthan(p_size_field, p_size) && __compiletime_lessthan(p_size, size)) __write_overflow(); if (__compiletime_lessthan(q_size_field, q_size) && __compiletime_lessthan(q_size, size)) __read_overflow2(); /* Warn when write size argument larger than dest field. */ if (__compiletime_lessthan(p_size_field, size)) __write_overflow_field(p_size_field, size); /* * Warn for source field over-read when building with W=1 * or when an over-write happened, so both can be fixed at * the same time. */ if ((IS_ENABLED(KBUILD_EXTRA_WARN1) || __compiletime_lessthan(p_size_field, size)) && __compiletime_lessthan(q_size_field, size)) __read_overflow2_field(q_size_field, size); } /* * At this point, length argument may not be a constant expression, * so run-time bounds checking can be done where buffer sizes are * known. (This is not an "else" because the above checks may only * be compile-time warnings, and we want to still warn for run-time * overflows.) 
*/ /* * Always stop accesses beyond the struct that contains the * field, when the buffer's remaining size is known. * (The SIZE_MAX test is to optimize away checks where the buffer * lengths are unknown.) */ if (p_size != SIZE_MAX && p_size < size) fortify_panic(func, FORTIFY_WRITE, p_size, size, true); else if (q_size != SIZE_MAX && q_size < size) fortify_panic(func, FORTIFY_READ, q_size, size, true); /* * Warn when writing beyond destination field size. * * Note the implementation of __builtin_*object_size() behaves * like sizeof() when not directly referencing a flexible * array member, which means there will be many bounds checks * that will appear at run-time, without a way for them to be * detected at compile-time (as can be done when the destination * is specifically the flexible array member). * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101832 */ if (p_size_field != SIZE_MAX && p_size != p_size_field && p_size_field < size) return true; return false; } /* * To work around what seems to be an optimizer bug, the macro arguments * need to have const copies or the values end up changed by the time they * reach fortify_warn_once(). See commit 6f7630b1b5bc ("fortify: Capture * __bos() results in const temp vars") for more details. */ #define __fortify_memcpy_chk(p, q, size, p_size, q_size, \ p_size_field, q_size_field, op) ({ \ const size_t __fortify_size = (size_t)(size); \ const size_t __p_size = (p_size); \ const size_t __q_size = (q_size); \ const size_t __p_size_field = (p_size_field); \ const size_t __q_size_field = (q_size_field); \ /* Keep a mutable version of the size for the final copy. */ \ size_t __copy_size = __fortify_size; \ fortify_warn_once(fortify_memcpy_chk(__fortify_size, __p_size, \ __q_size, __p_size_field, \ __q_size_field, FORTIFY_FUNC_ ##op), \ #op ": detected field-spanning write (size %zu) of single %s (size %zu)\n", \ __fortify_size, \ "field \"" #p "\" at " FILE_LINE, \ __p_size_field); \ /* Hide only the run-time size from value range tracking to */ \ /* silence compile-time false positive bounds warnings. */ \ if (!__builtin_constant_p(__copy_size)) \ OPTIMIZER_HIDE_VAR(__copy_size); \ __underlying_##op(p, q, __copy_size); \ }) /* * Notes about compile-time buffer size detection: * * With these types... * * struct middle { * u16 a; * u8 middle_buf[16]; * int b; * }; * struct end { * u16 a; * u8 end_buf[16]; * }; * struct flex { * int a; * u8 flex_buf[]; * }; * * void func(TYPE *ptr) { ... } * * Cases where destination size cannot be currently detected: * - the size of ptr's object (seemingly by design, gcc & clang fail): * __builtin_object_size(ptr, 1) == SIZE_MAX * - the size of flexible arrays in ptr's obj (by design, dynamic size): * __builtin_object_size(ptr->flex_buf, 1) == SIZE_MAX * - the size of ANY array at the end of ptr's obj (gcc and clang bug): * __builtin_object_size(ptr->end_buf, 1) == SIZE_MAX * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101836 * * Cases where destination size is currently detected: * - the size of non-array members within ptr's object: * __builtin_object_size(ptr->a, 1) == 2 * - the size of non-flexible-array in the middle of ptr's obj: * __builtin_object_size(ptr->middle_buf, 1) == 16 * */ /* * __struct_size() vs __member_size() must be captured here to avoid * evaluating argument side-effects further into the macro layers. 
*/ #define memcpy(p, q, s) __fortify_memcpy_chk(p, q, s, \ __struct_size(p), __struct_size(q), \ __member_size(p), __member_size(q), \ memcpy) #define memmove(p, q, s) __fortify_memcpy_chk(p, q, s, \ __struct_size(p), __struct_size(q), \ __member_size(p), __member_size(q), \ memmove) extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan); __FORTIFY_INLINE void *memscan(void * const POS0 p, int c, __kernel_size_t size) { const size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(FORTIFY_FUNC_memscan, FORTIFY_READ, p_size, size, NULL); return __real_memscan(p, c, size); } __FORTIFY_INLINE __diagnose_as(__builtin_memcmp, 1, 2, 3) int memcmp(const void * const POS0 p, const void * const POS0 q, __kernel_size_t size) { const size_t p_size = __struct_size(p); const size_t q_size = __struct_size(q); if (__builtin_constant_p(size)) { if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (__compiletime_lessthan(q_size, size)) __read_overflow2(); } if (p_size < size) fortify_panic(FORTIFY_FUNC_memcmp, FORTIFY_READ, p_size, size, INT_MIN); else if (q_size < size) fortify_panic(FORTIFY_FUNC_memcmp, FORTIFY_READ, q_size, size, INT_MIN); return __underlying_memcmp(p, q, size); } __FORTIFY_INLINE __diagnose_as(__builtin_memchr, 1, 2, 3) void *memchr(const void * const POS0 p, int c, __kernel_size_t size) { const size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(FORTIFY_FUNC_memchr, FORTIFY_READ, p_size, size, NULL); return __underlying_memchr(p, c, size); } void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv); __FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size) { const size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(FORTIFY_FUNC_memchr_inv, FORTIFY_READ, p_size, size, NULL); return __real_memchr_inv(p, c, size); } extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup_noprof) __realloc_size(2); __FORTIFY_INLINE void *kmemdup_noprof(const void * const POS0 p, size_t size, gfp_t gfp) { const size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(FORTIFY_FUNC_kmemdup, FORTIFY_READ, p_size, size, __real_kmemdup(p, 0, gfp)); return __real_kmemdup(p, size, gfp); } #define kmemdup(...) alloc_hooks(kmemdup_noprof(__VA_ARGS__)) /** * strcpy - Copy a string into another string buffer * * @p: pointer to destination of copy * @q: pointer to NUL-terminated source string to copy * * Do not use this function. While FORTIFY_SOURCE tries to avoid * overflows, this is only possible when the sizes of @q and @p are * known to the compiler. Prefer strscpy(), though note its different * return values for detecting truncation. * * Returns @p. * */ /* Defined after fortified strlen to reuse it. */ __FORTIFY_INLINE __diagnose_as(__builtin_strcpy, 1, 2) char *strcpy(char * const POS p, const char * const POS q) { const size_t p_size = __member_size(p); const size_t q_size = __member_size(q); size_t size; /* If neither buffer size is known, immediately give up. */ if (__builtin_constant_p(p_size) && __builtin_constant_p(q_size) && p_size == SIZE_MAX && q_size == SIZE_MAX) return __underlying_strcpy(p, q); size = strlen(q) + 1; /* Compile-time check for const size overflow. 
*/ if (__compiletime_lessthan(p_size, size)) __write_overflow(); /* Run-time check for dynamic size overflow. */ if (p_size < size) fortify_panic(FORTIFY_FUNC_strcpy, FORTIFY_WRITE, p_size, size, p); __underlying_memcpy(p, q, size); return p; } /* Don't use these outside the FORITFY_SOURCE implementation */ #undef __underlying_memchr #undef __underlying_memcmp #undef __underlying_strcat #undef __underlying_strcpy #undef __underlying_strlen #undef __underlying_strncat #undef __underlying_strncpy #undef POS #undef POS0 #endif /* _LINUX_FORTIFY_STRING_H_ */ |
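/*
 * Illustrative sketch, not part of the fortify-string.h header above:
 * hypothetical code showing what the fortified memcpy() wrapper catches
 * when CONFIG_FORTIFY_SOURCE is enabled. struct demo_frame and
 * demo_fill_payload() are invented names for this example only.
 */
#include <linux/string.h>
#include <linux/types.h>

struct demo_frame {
        u16 len;
        u8  payload[16];
        u32 crc;
};

static void demo_fill_payload(struct demo_frame *f, const u8 *src, size_t n)
{
        /*
         * A constant size larger than the 16-byte field would be rejected
         * at compile time via __write_overflow()/__write_overflow_field():
         *
         *      memcpy(f->payload, src, 32);
         *
         * A runtime size is checked against __builtin_object_size(): if n
         * reaches beyond the enclosing struct, fortify_panic() fires
         * instead of silently corrupting the neighbouring 'crc' field.
         */
        memcpy(f->payload, src, n);
}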
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM rpm

#if !defined(_TRACE_RUNTIME_POWER_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_RUNTIME_POWER_H

#include <linux/ktime.h>
#include <linux/tracepoint.h>

struct device;

/*
 * The rpm_internal events are used for tracing some important
 * runtime pm internal functions.
 */
DECLARE_EVENT_CLASS(rpm_internal,

        TP_PROTO(struct device *dev, int flags),

        TP_ARGS(dev, flags),

        TP_STRUCT__entry(
                __string(name, dev_name(dev))
                __field(int, flags)
                __field(int, usage_count)
                __field(int, disable_depth)
                __field(int, runtime_auto)
                __field(int, request_pending)
                __field(int, irq_safe)
                __field(int, child_count)
        ),

        TP_fast_assign(
                __assign_str(name);
                __entry->flags = flags;
                __entry->usage_count = atomic_read(&dev->power.usage_count);
                __entry->disable_depth = dev->power.disable_depth;
                __entry->runtime_auto = dev->power.runtime_auto;
                __entry->request_pending = dev->power.request_pending;
                __entry->irq_safe = dev->power.irq_safe;
                __entry->child_count = atomic_read(&dev->power.child_count);
        ),

        TP_printk("%s flags-%x cnt-%-2d dep-%-2d auto-%-1d p-%-1d"
                        " irq-%-1d child-%d",
                        __get_str(name), __entry->flags,
                        __entry->usage_count,
                        __entry->disable_depth,
                        __entry->runtime_auto,
                        __entry->request_pending,
                        __entry->irq_safe,
                        __entry->child_count)
);

DEFINE_EVENT(rpm_internal, rpm_suspend,

        TP_PROTO(struct device *dev, int flags),

        TP_ARGS(dev, flags)
);

DEFINE_EVENT(rpm_internal, rpm_resume,

        TP_PROTO(struct device *dev, int flags),

        TP_ARGS(dev, flags)
);

DEFINE_EVENT(rpm_internal, rpm_idle,

        TP_PROTO(struct device *dev, int flags),

        TP_ARGS(dev, flags)
);

DEFINE_EVENT(rpm_internal, rpm_usage,

        TP_PROTO(struct device *dev, int flags),

        TP_ARGS(dev, flags)
);

TRACE_EVENT(rpm_return_int,
        TP_PROTO(struct device *dev, unsigned long ip, int ret),
        TP_ARGS(dev, ip, ret),

        TP_STRUCT__entry(
                __string(name, dev_name(dev))
                __field(unsigned long, ip)
                __field(int, ret)
        ),

        TP_fast_assign(
                __assign_str(name);
                __entry->ip = ip;
                __entry->ret = ret;
        ),

        TP_printk("%pS:%s ret=%d", (void *)__entry->ip, __get_str(name),
                __entry->ret)
);

#define RPM_STATUS_STRINGS \
        EM(RPM_INVALID, "RPM_INVALID") \
        EM(RPM_ACTIVE, "RPM_ACTIVE") \
        EM(RPM_RESUMING, "RPM_RESUMING") \
        EM(RPM_SUSPENDED, "RPM_SUSPENDED") \
        EMe(RPM_SUSPENDING, "RPM_SUSPENDING")

/* Enums require being exported to userspace, for user tool parsing. */
#undef EM
#undef EMe
#define EM(a, b)        TRACE_DEFINE_ENUM(a);
#define EMe(a, b)       TRACE_DEFINE_ENUM(a);

RPM_STATUS_STRINGS

/*
 * Now redefine the EM() and EMe() macros to map the enums to the strings that
 * will be printed in the output.
 */
#undef EM
#undef EMe
#define EM(a, b)        { a, b },
#define EMe(a, b)       { a, b }

TRACE_EVENT(rpm_status,
        TP_PROTO(struct device *dev, enum rpm_status status),
        TP_ARGS(dev, status),

        TP_STRUCT__entry(
                __string(name, dev_name(dev))
                __field(int, status)
        ),

        TP_fast_assign(
                __assign_str(name);
                __entry->status = status;
        ),

        TP_printk("%s status=%s", __get_str(name),
                __print_symbolic(__entry->status, RPM_STATUS_STRINGS))
);

#endif /* _TRACE_RUNTIME_POWER_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
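/*
 * Illustrative sketch, not part of the trace header above: how a .c file
 * typically instantiates these tracepoints (CREATE_TRACE_POINTS must be
 * defined in exactly one translation unit before the include) and fires
 * the inline stubs generated by DEFINE_EVENT()/TRACE_EVENT().
 * demo_rpm_suspend_path() is an invented name for this example only.
 */
#include <linux/device.h>
#include <linux/kernel.h>

#define CREATE_TRACE_POINTS
#include <trace/events/rpm.h>

static int demo_rpm_suspend_path(struct device *dev, int rpmflags)
{
        int ret = 0;

        /* Emit the rpm_suspend event with the device name and flags. */
        trace_rpm_suspend(dev, rpmflags);

        /* ... the actual suspend work would happen here ... */

        /* Record the result, keyed by this call site's instruction pointer. */
        trace_rpm_return_int(dev, _THIS_IP_, ret);

        return ret;
}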
3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 
3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 
4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 
5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 *
 * Architecture independence:
 *   Copyright (c) 2005, Bull S.A.
 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
 */

/*
 * Extents support for EXT4
 *
 * TODO:
 *   - ext4*_error() should be used in some situations
 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
 *   - smart tree reduction
 */

#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/fiemap.h>
#include <linux/iomap.h>
#include <linux/sched/mm.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "xattr.h"

#include <kunit/static_stub.h>

#include <trace/events/ext4.h>

/*
 * used by extent splitting.
 */
#define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \
					due to ENOSPC */

static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle,
						struct inode *inode,
						struct ext4_map_blocks *map,
						struct ext4_ext_path *path,
						int flags,
						unsigned int *allocated);

static __le32 ext4_extent_block_csum(struct inode *inode,
				     struct ext4_extent_header *eh)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	__u32 csum;

	csum = ext4_chksum(ei->i_csum_seed, (__u8 *)eh,
			   EXT4_EXTENT_TAIL_OFFSET(eh));
	return cpu_to_le32(csum);
}

static int ext4_extent_block_csum_verify(struct inode *inode,
					 struct ext4_extent_header *eh)
{
	struct ext4_extent_tail *et;

	if (!ext4_has_feature_metadata_csum(inode->i_sb))
		return 1;

	et = find_ext4_extent_tail(eh);
	if (et->et_checksum != ext4_extent_block_csum(inode, eh))
		return 0;
	return 1;
}

static void ext4_extent_block_csum_set(struct inode *inode,
				       struct ext4_extent_header *eh)
{
	struct ext4_extent_tail *et;

	if (!ext4_has_feature_metadata_csum(inode->i_sb))
		return;

	et = find_ext4_extent_tail(eh);
	et->et_checksum = ext4_extent_block_csum(inode, eh);
}

static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
						  struct inode *inode,
						  struct ext4_ext_path *path,
						  ext4_lblk_t split,
						  int flags);

static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
{
	/*
	 * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
	 * moment, get_block can be called only for blocks inside i_size since
	 * page cache has been already dropped and writes are blocked by
	 * i_rwsem. So we can safely drop the i_data_sem here.
	 */
	BUG_ON(EXT4_JOURNAL(inode) == NULL);
	ext4_discard_preallocations(inode);
	up_write(&EXT4_I(inode)->i_data_sem);
	*dropped = 1;
	return 0;
}

static inline void ext4_ext_path_brelse(struct ext4_ext_path *path)
{
	brelse(path->p_bh);
	path->p_bh = NULL;
}

static void ext4_ext_drop_refs(struct ext4_ext_path *path)
{
	int depth, i;

	if (IS_ERR_OR_NULL(path))
		return;
	depth = path->p_depth;
	for (i = 0; i <= depth; i++, path++)
		ext4_ext_path_brelse(path);
}

void ext4_free_ext_path(struct ext4_ext_path *path)
{
	if (IS_ERR_OR_NULL(path))
		return;
	ext4_ext_drop_refs(path);
	kfree(path);
}

/*
 * Make sure 'handle' has at least 'check_cred' credits. If not, restart
 * transaction with 'restart_cred' credits. The function drops i_data_sem
 * when restarting transaction and gets it after transaction is restarted.
 *
 * The function returns 0 on success, 1 if transaction had to be restarted,
 * and < 0 in case of fatal error.
 */
int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
				int check_cred, int restart_cred,
				int revoke_cred)
{
	int ret;
	int dropped = 0;

	ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred,
		revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped));
	if (dropped)
		down_write(&EXT4_I(inode)->i_data_sem);
	return ret;
}

/*
 * could return:
 *  - EROFS
 *  - ENOMEM
 */
static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path)
{
	int err = 0;

	if (path->p_bh) {
		/* path points to block */
		BUFFER_TRACE(path->p_bh, "get_write_access");
		err = ext4_journal_get_write_access(handle, inode->i_sb,
						    path->p_bh, EXT4_JTR_NONE);
		/*
		 * The extent buffer's verified bit will be set again in
		 * __ext4_ext_dirty(). We could leave an inconsistent
		 * buffer if the extents updating procedure breaks off due
		 * to some error, so force it to be checked again.
		 */
		if (!err)
			clear_buffer_verified(path->p_bh);
	}
	/* path points to leaf/index in inode body */
	/* we use in-core data, no need to protect them */
	return err;
}

/*
 * could return:
 *  - EROFS
 *  - ENOMEM
 *  - EIO
 */
int __ext4_ext_dirty(const char *where, unsigned int line,
		     handle_t *handle, struct inode *inode,
		     struct ext4_ext_path *path)
{
	int err;

	KUNIT_STATIC_STUB_REDIRECT(__ext4_ext_dirty,
				   where, line, handle, inode, path);
	WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
	if (path->p_bh) {
		ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
		/* path points to block */
		err = __ext4_handle_dirty_metadata(where, line, handle,
						   inode, path->p_bh);
		/* Extents updating done, re-set verified flag */
		if (!err)
			set_buffer_verified(path->p_bh);
	} else {
		/* path points to leaf/index in inode body */
		err = ext4_mark_inode_dirty(handle, inode);
	}
	return err;
}

#define ext4_ext_dirty(handle, inode, path) \
		__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))

static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
			      struct ext4_ext_path *path,
			      ext4_lblk_t block)
{
	if (path) {
		int depth = path->p_depth;
		struct ext4_extent *ex;

		/*
		 * Try to predict block placement assuming that we are
		 * filling in a file which will eventually be
		 * non-sparse --- i.e., in the case of libbfd writing
		 * ELF object sections out-of-order but in a way that
		 * eventually results in a contiguous object or
		 * executable file, or some database extending a table
		 * space file. However, this is actually somewhat
		 * non-ideal if we are writing a sparse file such as
		 * qemu or KVM writing a raw image file that is going
		 * to stay fairly sparse, since it will end up
		 * fragmenting the file system's free space. Maybe we
		 * should have some heuristics or some way to allow
		 * userspace to pass a hint to the file system,
		 * especially if the latter case turns out to be
		 * common.
		 */
		ex = path[depth].p_ext;
		if (ex) {
			ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
			ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);

			if (block > ext_block)
				return ext_pblk + (block - ext_block);
			else
				return ext_pblk - (ext_block - block);
		}

		/* it looks like index is empty;
		 * try to find starting block from index itself */
		if (path[depth].p_bh)
			return path[depth].p_bh->b_blocknr;
	}

	/* OK.
use inode's group */ return ext4_inode_to_goal_block(inode); } /* * Allocation for a meta data block */ static ext4_fsblk_t ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex, int *err, unsigned int flags) { ext4_fsblk_t goal, newblock; goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); newblock = ext4_new_meta_blocks(handle, inode, goal, flags, NULL, err); return newblock; } static inline int ext4_ext_space_block(struct inode *inode, int check) { int size; size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent); #ifdef AGGRESSIVE_TEST if (!check && size > 6) size = 6; #endif return size; } static inline int ext4_ext_space_block_idx(struct inode *inode, int check) { int size; size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent_idx); #ifdef AGGRESSIVE_TEST if (!check && size > 5) size = 5; #endif return size; } static inline int ext4_ext_space_root(struct inode *inode, int check) { int size; size = sizeof(EXT4_I(inode)->i_data); size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent); #ifdef AGGRESSIVE_TEST if (!check && size > 3) size = 3; #endif return size; } static inline int ext4_ext_space_root_idx(struct inode *inode, int check) { int size; size = sizeof(EXT4_I(inode)->i_data); size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent_idx); #ifdef AGGRESSIVE_TEST if (!check && size > 4) size = 4; #endif return size; } static inline struct ext4_ext_path * ext4_force_split_extent_at(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t lblk, int nofail) { int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_SPLIT_NOMERGE; if (nofail) flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL; return ext4_split_extent_at(handle, inode, path, lblk, flags); } static int ext4_ext_max_entries(struct inode *inode, int depth) { int max; if (depth == ext_depth(inode)) { if (depth == 0) max = ext4_ext_space_root(inode, 1); else max = ext4_ext_space_root_idx(inode, 1); } else { if (depth == 0) max = ext4_ext_space_block(inode, 1); else max = ext4_ext_space_block_idx(inode, 1); } return max; } static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) { ext4_fsblk_t block = ext4_ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); ext4_lblk_t lblock = le32_to_cpu(ext->ee_block); /* * We allow neither: * - zero length * - overflow/wrap-around */ if (lblock + len <= lblock) return 0; return ext4_inode_block_valid(inode, block, len); } static int ext4_valid_extent_idx(struct inode *inode, struct ext4_extent_idx *ext_idx) { ext4_fsblk_t block = ext4_idx_pblock(ext_idx); return ext4_inode_block_valid(inode, block, 1); } static int ext4_valid_extent_entries(struct inode *inode, struct ext4_extent_header *eh, ext4_lblk_t lblk, ext4_fsblk_t *pblk, int depth) { unsigned short entries; ext4_lblk_t lblock = 0; ext4_lblk_t cur = 0; if (eh->eh_entries == 0) return 1; entries = le16_to_cpu(eh->eh_entries); if (depth == 0) { /* leaf entries */ struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); /* * The logical block in the first entry should equal to * the number in the index block. 
*/ if (depth != ext_depth(inode) && lblk != le32_to_cpu(ext->ee_block)) return 0; while (entries) { if (!ext4_valid_extent(inode, ext)) return 0; /* Check for overlapping extents */ lblock = le32_to_cpu(ext->ee_block); if (lblock < cur) { *pblk = ext4_ext_pblock(ext); return 0; } cur = lblock + ext4_ext_get_actual_len(ext); ext++; entries--; } } else { struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); /* * The logical block in the first entry should equal to * the number in the parent index block. */ if (depth != ext_depth(inode) && lblk != le32_to_cpu(ext_idx->ei_block)) return 0; while (entries) { if (!ext4_valid_extent_idx(inode, ext_idx)) return 0; /* Check for overlapping index extents */ lblock = le32_to_cpu(ext_idx->ei_block); if (lblock < cur) { *pblk = ext4_idx_pblock(ext_idx); return 0; } ext_idx++; entries--; cur = lblock + 1; } } return 1; } static int __ext4_ext_check(const char *function, unsigned int line, struct inode *inode, struct ext4_extent_header *eh, int depth, ext4_fsblk_t pblk, ext4_lblk_t lblk) { const char *error_msg; int max = 0, err = -EFSCORRUPTED; if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) { error_msg = "invalid magic"; goto corrupted; } if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) { error_msg = "unexpected eh_depth"; goto corrupted; } if (unlikely(eh->eh_max == 0)) { error_msg = "invalid eh_max"; goto corrupted; } max = ext4_ext_max_entries(inode, depth); if (unlikely(le16_to_cpu(eh->eh_max) > max)) { error_msg = "too large eh_max"; goto corrupted; } if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) { error_msg = "invalid eh_entries"; goto corrupted; } if (unlikely((eh->eh_entries == 0) && (depth > 0))) { error_msg = "eh_entries is 0 but eh_depth is > 0"; goto corrupted; } if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) { error_msg = "invalid extent entries"; goto corrupted; } if (unlikely(depth > 32)) { error_msg = "too large eh_depth"; goto corrupted; } /* Verify checksum on non-root extent tree nodes */ if (ext_depth(inode) != depth && !ext4_extent_block_csum_verify(inode, eh)) { error_msg = "extent tree corrupted"; err = -EFSBADCRC; goto corrupted; } return 0; corrupted: ext4_error_inode_err(inode, function, line, 0, -err, "pblk %llu bad header/extent: %s - magic %x, " "entries %u, max %u(%u), depth %u(%u)", (unsigned long long) pblk, error_msg, le16_to_cpu(eh->eh_magic), le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), max, le16_to_cpu(eh->eh_depth), depth); return err; } #define ext4_ext_check(inode, eh, depth, pblk) \ __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk), 0) int ext4_ext_check_inode(struct inode *inode) { return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0); } static void ext4_cache_extents(struct inode *inode, struct ext4_extent_header *eh) { struct ext4_extent *ex = EXT_FIRST_EXTENT(eh); ext4_lblk_t prev = 0; int i; KUNIT_STATIC_STUB_REDIRECT(ext4_cache_extents, inode, eh); for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) { unsigned int status = EXTENT_STATUS_WRITTEN; ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); int len = ext4_ext_get_actual_len(ex); if (prev && (prev != lblk)) ext4_es_cache_extent(inode, prev, lblk - prev, ~0, EXTENT_STATUS_HOLE); if (ext4_ext_is_unwritten(ex)) status = EXTENT_STATUS_UNWRITTEN; ext4_es_cache_extent(inode, lblk, len, ext4_ext_pblock(ex), status); prev = lblk + len; } } static struct buffer_head * __read_extent_tree_block(const char *function, unsigned int line, struct inode *inode, struct ext4_extent_idx 
*idx, int depth, int flags) { struct buffer_head *bh; int err; gfp_t gfp_flags = __GFP_MOVABLE | GFP_NOFS; ext4_fsblk_t pblk; if (flags & EXT4_EX_NOFAIL) gfp_flags |= __GFP_NOFAIL; pblk = ext4_idx_pblock(idx); bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); if (!bh_uptodate_or_lock(bh)) { trace_ext4_ext_load_extent(inode, pblk, _RET_IP_); err = ext4_read_bh(bh, 0, NULL, false); if (err < 0) goto errout; } if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) return bh; err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh), depth, pblk, le32_to_cpu(idx->ei_block)); if (err) goto errout; set_buffer_verified(bh); /* * If this is a leaf block, cache all of its entries */ if (!(flags & EXT4_EX_NOCACHE) && depth == 0) { struct ext4_extent_header *eh = ext_block_hdr(bh); ext4_cache_extents(inode, eh); } return bh; errout: put_bh(bh); return ERR_PTR(err); } #define read_extent_tree_block(inode, idx, depth, flags) \ __read_extent_tree_block(__func__, __LINE__, (inode), (idx), \ (depth), (flags)) /* * This function is called to cache a file's extent information in the * extent status tree */ int ext4_ext_precache(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_ext_path *path = NULL; struct buffer_head *bh; int i = 0, depth, ret = 0; if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return 0; /* not an extent-mapped inode */ ext4_check_map_extents_env(inode); down_read(&ei->i_data_sem); depth = ext_depth(inode); /* Don't cache anything if there are no external extent blocks */ if (!depth) { up_read(&ei->i_data_sem); return ret; } path = kzalloc_objs(struct ext4_ext_path, depth + 1, GFP_NOFS); if (path == NULL) { up_read(&ei->i_data_sem); return -ENOMEM; } path[0].p_hdr = ext_inode_hdr(inode); ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0); if (ret) goto out; path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr); while (i >= 0) { /* * If this is a leaf block or we've reached the end of * the index block, go up */ if ((i == depth) || path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) { ext4_ext_path_brelse(path + i); i--; continue; } bh = read_extent_tree_block(inode, path[i].p_idx++, depth - i - 1, EXT4_EX_FORCE_CACHE); if (IS_ERR(bh)) { ret = PTR_ERR(bh); break; } i++; path[i].p_bh = bh; path[i].p_hdr = ext_block_hdr(bh); path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr); } ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED); out: up_read(&ei->i_data_sem); ext4_free_ext_path(path); return ret; } #ifdef EXT_DEBUG static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) { int k, l = path->p_depth; ext_debug(inode, "path:"); for (k = 0; k <= l; k++, path++) { if (path->p_idx) { ext_debug(inode, " %d->%llu", le32_to_cpu(path->p_idx->ei_block), ext4_idx_pblock(path->p_idx)); } else if (path->p_ext) { ext_debug(inode, " %d:[%d]%d:%llu ", le32_to_cpu(path->p_ext->ee_block), ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext), ext4_ext_pblock(path->p_ext)); } else ext_debug(inode, " []"); } ext_debug(inode, "\n"); } static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) { int depth = ext_depth(inode); struct ext4_extent_header *eh; struct ext4_extent *ex; int i; if (IS_ERR_OR_NULL(path)) return; eh = path[depth].p_hdr; ex = EXT_FIRST_EXTENT(eh); ext_debug(inode, "Displaying leaf extents\n"); for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { ext_debug(inode, "%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), ext4_ext_is_unwritten(ex), 
ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); } ext_debug(inode, "\n"); } static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, ext4_fsblk_t newblock, int level) { int depth = ext_depth(inode); struct ext4_extent *ex; if (depth != level) { struct ext4_extent_idx *idx; idx = path[level].p_idx; while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) { ext_debug(inode, "%d: move %d:%llu in new index %llu\n", level, le32_to_cpu(idx->ei_block), ext4_idx_pblock(idx), newblock); idx++; } return; } ex = path[depth].p_ext; while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) { ext_debug(inode, "move %d:%llu:[%d]%d in new leaf %llu\n", le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), newblock); ex++; } } #else #define ext4_ext_show_path(inode, path) #define ext4_ext_show_leaf(inode, path) #define ext4_ext_show_move(inode, path, newblock, level) #endif /* * ext4_ext_binsearch_idx: * binary search for the closest index of the given block * the header must be checked before calling this */ static void ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { struct ext4_extent_header *eh = path->p_hdr; struct ext4_extent_idx *r, *l, *m; ext_debug(inode, "binsearch for %u(idx): ", block); l = EXT_FIRST_INDEX(eh) + 1; r = EXT_LAST_INDEX(eh); while (l <= r) { m = l + (r - l) / 2; ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block), m, le32_to_cpu(m->ei_block), r, le32_to_cpu(r->ei_block)); if (block < le32_to_cpu(m->ei_block)) r = m - 1; else l = m + 1; } path->p_idx = l - 1; ext_debug(inode, " -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block), ext4_idx_pblock(path->p_idx)); #ifdef CHECK_BINSEARCH { struct ext4_extent_idx *chix, *ix; int k; chix = ix = EXT_FIRST_INDEX(eh); for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { if (k != 0 && le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { printk(KERN_DEBUG "k=%d, ix=0x%p, " "first=0x%p\n", k, ix, EXT_FIRST_INDEX(eh)); printk(KERN_DEBUG "%u <= %u\n", le32_to_cpu(ix->ei_block), le32_to_cpu(ix[-1].ei_block)); } BUG_ON(k && le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)); if (block < le32_to_cpu(ix->ei_block)) break; chix = ix; } BUG_ON(chix != path->p_idx); } #endif } /* * ext4_ext_binsearch: * binary search for closest extent of the given block * the header must be checked before calling this */ static void ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { struct ext4_extent_header *eh = path->p_hdr; struct ext4_extent *r, *l, *m; if (eh->eh_entries == 0) { /* * this leaf is empty: * we get such a leaf in split/add case */ return; } ext_debug(inode, "binsearch for %u: ", block); l = EXT_FIRST_EXTENT(eh) + 1; r = EXT_LAST_EXTENT(eh); while (l <= r) { m = l + (r - l) / 2; ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block), m, le32_to_cpu(m->ee_block), r, le32_to_cpu(r->ee_block)); if (block < le32_to_cpu(m->ee_block)) r = m - 1; else l = m + 1; } path->p_ext = l - 1; ext_debug(inode, " -> %d:%llu:[%d]%d ", le32_to_cpu(path->p_ext->ee_block), ext4_ext_pblock(path->p_ext), ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext)); #ifdef CHECK_BINSEARCH { struct ext4_extent *chex, *ex; int k; chex = ex = EXT_FIRST_EXTENT(eh); for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) { BUG_ON(k && le32_to_cpu(ex->ee_block) <= le32_to_cpu(ex[-1].ee_block)); if (block < le32_to_cpu(ex->ee_block)) break; chex = ex; } BUG_ON(chex != path->p_ext); 
} #endif } void ext4_ext_tree_init(handle_t *handle, struct inode *inode) { struct ext4_extent_header *eh; eh = ext_inode_hdr(inode); eh->eh_depth = 0; eh->eh_entries = 0; eh->eh_magic = EXT4_EXT_MAGIC; eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); eh->eh_generation = 0; ext4_mark_inode_dirty(handle, inode); } struct ext4_ext_path * ext4_find_extent(struct inode *inode, ext4_lblk_t block, struct ext4_ext_path *path, int flags) { struct ext4_extent_header *eh; struct buffer_head *bh; short int depth, i, ppos = 0; int ret; gfp_t gfp_flags = GFP_NOFS; KUNIT_STATIC_STUB_REDIRECT(ext4_find_extent, inode, block, path, flags); if (flags & EXT4_EX_NOFAIL) gfp_flags |= __GFP_NOFAIL; eh = ext_inode_hdr(inode); depth = ext_depth(inode); if (depth < 0 || depth > EXT4_MAX_EXTENT_DEPTH) { EXT4_ERROR_INODE(inode, "inode has invalid extent depth: %d", depth); ret = -EFSCORRUPTED; goto err; } if (path) { ext4_ext_drop_refs(path); if (depth > path[0].p_maxdepth) { kfree(path); path = NULL; } } if (!path) { /* account possible depth increase */ path = kzalloc_objs(struct ext4_ext_path, depth + 2, gfp_flags); if (unlikely(!path)) return ERR_PTR(-ENOMEM); path[0].p_maxdepth = depth + 1; } path[0].p_hdr = eh; path[0].p_bh = NULL; i = depth; if (!(flags & EXT4_EX_NOCACHE) && depth == 0) ext4_cache_extents(inode, eh); /* walk through the tree */ while (i) { ext_debug(inode, "depth %d: num %d, max %d\n", ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); ext4_ext_binsearch_idx(inode, path + ppos, block); path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); path[ppos].p_depth = i; path[ppos].p_ext = NULL; bh = read_extent_tree_block(inode, path[ppos].p_idx, --i, flags); if (IS_ERR(bh)) { ret = PTR_ERR(bh); goto err; } eh = ext_block_hdr(bh); ppos++; path[ppos].p_bh = bh; path[ppos].p_hdr = eh; } path[ppos].p_depth = i; path[ppos].p_ext = NULL; path[ppos].p_idx = NULL; /* find extent */ ext4_ext_binsearch(inode, path + ppos, block); /* if not an empty leaf */ if (path[ppos].p_ext) path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); ext4_ext_show_path(inode, path); return path; err: ext4_free_ext_path(path); return ERR_PTR(ret); } /* * ext4_ext_insert_index: * insert new index [@logical;@ptr] into the block at @curp; * check where to insert: before @curp or after @curp */ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, struct ext4_ext_path *curp, int logical, ext4_fsblk_t ptr) { struct ext4_extent_idx *ix; int len, err; err = ext4_ext_get_access(handle, inode, curp); if (err) return err; if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) { EXT4_ERROR_INODE(inode, "logical %d == ei_block %d!", logical, le32_to_cpu(curp->p_idx->ei_block)); return -EFSCORRUPTED; } if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) >= le16_to_cpu(curp->p_hdr->eh_max))) { EXT4_ERROR_INODE(inode, "eh_entries %d >= eh_max %d!", le16_to_cpu(curp->p_hdr->eh_entries), le16_to_cpu(curp->p_hdr->eh_max)); return -EFSCORRUPTED; } if (logical > le32_to_cpu(curp->p_idx->ei_block)) { /* insert after */ ext_debug(inode, "insert new index %d after: %llu\n", logical, ptr); ix = curp->p_idx + 1; } else { /* insert before */ ext_debug(inode, "insert new index %d before: %llu\n", logical, ptr); ix = curp->p_idx; } if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); return -EFSCORRUPTED; } len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1; BUG_ON(len < 0); if (len > 0) { ext_debug(inode, "insert new index %d: " "move %d indices from 0x%p to 0x%p\n", logical, 
len, ix, ix + 1); memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx)); } ix->ei_block = cpu_to_le32(logical); ext4_idx_store_pblock(ix, ptr); le16_add_cpu(&curp->p_hdr->eh_entries, 1); if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); return -EFSCORRUPTED; } err = ext4_ext_dirty(handle, inode, curp); ext4_std_error(inode->i_sb, err); return err; } /* * ext4_ext_split: * inserts new subtree into the path, using free index entry * at depth @at: * - allocates all needed blocks (new leaf and all intermediate index blocks) * - makes decision where to split * - moves remaining extents and index entries (right to the split point) * into the newly allocated blocks * - initializes subtree */ static int ext4_ext_split(handle_t *handle, struct inode *inode, unsigned int flags, struct ext4_ext_path *path, struct ext4_extent *newext, int at) { struct buffer_head *bh = NULL; int depth = ext_depth(inode); struct ext4_extent_header *neh; struct ext4_extent_idx *fidx; int i = at, k, m, a; ext4_fsblk_t newblock, oldblock; __le32 border; ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */ gfp_t gfp_flags = GFP_NOFS; int err = 0; size_t ext_size = 0; if (flags & EXT4_EX_NOFAIL) gfp_flags |= __GFP_NOFAIL; /* make decision: where to split? */ /* FIXME: now decision is simplest: at current extent */ /* if current leaf will be split, then we should use * border from split point */ if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) { EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!"); return -EFSCORRUPTED; } if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { border = path[depth].p_ext[1].ee_block; ext_debug(inode, "leaf will be split." " next leaf starts at %d\n", le32_to_cpu(border)); } else { border = newext->ee_block; ext_debug(inode, "leaf will be added." " next leaf starts at %d\n", le32_to_cpu(border)); } /* * If error occurs, then we break processing * and mark filesystem read-only. index won't * be inserted and tree will be in consistent * state. Next mount will repair buffers too. */ /* * Get array to track all allocated blocks. * We need this to handle errors and free blocks * upon them. 
*/ ablocks = kzalloc_objs(ext4_fsblk_t, depth, gfp_flags); if (!ablocks) return -ENOMEM; /* allocate all needed blocks */ ext_debug(inode, "allocate %d blocks for indexes/leaf\n", depth - at); for (a = 0; a < depth - at; a++) { newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err, flags); if (newblock == 0) goto cleanup; ablocks[a] = newblock; } /* initialize new leaf */ newblock = ablocks[--a]; if (unlikely(newblock == 0)) { EXT4_ERROR_INODE(inode, "newblock == 0!"); err = -EFSCORRUPTED; goto cleanup; } bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); if (unlikely(!bh)) { err = -ENOMEM; goto cleanup; } lock_buffer(bh); err = ext4_journal_get_create_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) goto cleanup; neh = ext_block_hdr(bh); neh->eh_entries = 0; neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; neh->eh_depth = 0; neh->eh_generation = 0; /* move remainder of path[depth] to the new leaf */ if (unlikely(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max)) { EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!", path[depth].p_hdr->eh_entries, path[depth].p_hdr->eh_max); err = -EFSCORRUPTED; goto cleanup; } /* start copy from next extent */ m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++; ext4_ext_show_move(inode, path, newblock, depth); if (m) { struct ext4_extent *ex; ex = EXT_FIRST_EXTENT(neh); memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m); le16_add_cpu(&neh->eh_entries, m); } /* zero out unused area in the extent block */ ext_size = sizeof(struct ext4_extent_header) + sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries); memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size); ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto cleanup; brelse(bh); bh = NULL; /* correct old leaf */ if (m) { err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto cleanup; le16_add_cpu(&path[depth].p_hdr->eh_entries, -m); err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto cleanup; } /* create intermediate indexes */ k = depth - at - 1; if (unlikely(k < 0)) { EXT4_ERROR_INODE(inode, "k %d < 0!", k); err = -EFSCORRUPTED; goto cleanup; } if (k) ext_debug(inode, "create %d intermediate indices\n", k); /* insert new index into current index block */ /* current depth stored in i var */ i = depth - 1; while (k--) { oldblock = newblock; newblock = ablocks[--a]; bh = sb_getblk(inode->i_sb, newblock); if (unlikely(!bh)) { err = -ENOMEM; goto cleanup; } lock_buffer(bh); err = ext4_journal_get_create_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) goto cleanup; neh = ext_block_hdr(bh); neh->eh_entries = cpu_to_le16(1); neh->eh_magic = EXT4_EXT_MAGIC; neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); neh->eh_depth = cpu_to_le16(depth - i); neh->eh_generation = 0; fidx = EXT_FIRST_INDEX(neh); fidx->ei_block = border; ext4_idx_store_pblock(fidx, oldblock); ext_debug(inode, "int.index at %d (block %llu): %u -> %llu\n", i, newblock, le32_to_cpu(border), oldblock); /* move remainder of path[i] to the new index block */ if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) != EXT_LAST_INDEX(path[i].p_hdr))) { EXT4_ERROR_INODE(inode, "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!", le32_to_cpu(path[i].p_ext->ee_block)); err = -EFSCORRUPTED; goto cleanup; } /* start copy indexes */ m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++; 
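		/* added note: m is the number of index entries to the right of
		 * the split point at this level; they are copied into the new
		 * index block below (when m is non-zero) */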
ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx, EXT_MAX_INDEX(path[i].p_hdr)); ext4_ext_show_move(inode, path, newblock, i); if (m) { memmove(++fidx, path[i].p_idx, sizeof(struct ext4_extent_idx) * m); le16_add_cpu(&neh->eh_entries, m); } /* zero out unused area in the extent block */ ext_size = sizeof(struct ext4_extent_header) + (sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries)); memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size); ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto cleanup; brelse(bh); bh = NULL; /* correct old index */ if (m) { err = ext4_ext_get_access(handle, inode, path + i); if (err) goto cleanup; le16_add_cpu(&path[i].p_hdr->eh_entries, -m); err = ext4_ext_dirty(handle, inode, path + i); if (err) goto cleanup; } i--; } /* insert new index */ err = ext4_ext_insert_index(handle, inode, path + at, le32_to_cpu(border), newblock); cleanup: if (bh) { if (buffer_locked(bh)) unlock_buffer(bh); brelse(bh); } if (err) { /* free all allocated blocks in error case */ for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; ext4_free_blocks(handle, inode, NULL, ablocks[i], 1, EXT4_FREE_BLOCKS_METADATA); } } kfree(ablocks); return err; } /* * ext4_ext_grow_indepth: * implements tree growing procedure: * - allocates new block * - moves top-level data (index block or leaf) into the new block * - initializes new top-level, creating index that points to the * just created block */ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, unsigned int flags) { struct ext4_extent_header *neh; struct buffer_head *bh; ext4_fsblk_t newblock, goal = 0; struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; int err = 0; size_t ext_size = 0; /* Try to prepend new index to old one */ if (ext_depth(inode)) goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode))); if (goal > le32_to_cpu(es->s_first_data_block)) { flags |= EXT4_MB_HINT_TRY_GOAL; goal--; } else goal = ext4_inode_to_goal_block(inode); newblock = ext4_new_meta_blocks(handle, inode, goal, flags, NULL, &err); if (newblock == 0) return err; bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); if (unlikely(!bh)) return -ENOMEM; lock_buffer(bh); err = ext4_journal_get_create_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) { unlock_buffer(bh); goto out; } ext_size = sizeof(EXT4_I(inode)->i_data); /* move top-level index/leaf into new block */ memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size); /* zero out unused area in the extent block */ memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size); /* set size of new block */ neh = ext_block_hdr(bh); /* old root could have indexes or leaves * so calculate e_max right way */ if (ext_depth(inode)) neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); else neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); set_buffer_verified(bh); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto out; /* Update top-level index: num,max,pointer */ neh = ext_inode_hdr(inode); neh->eh_entries = cpu_to_le16(1); ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock); if (neh->eh_depth == 0) { /* Root extent block becomes index block */ neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); EXT_FIRST_INDEX(neh)->ei_block = EXT_FIRST_EXTENT(neh)->ee_block; } 
ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %llu\n", le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), ext4_idx_pblock(EXT_FIRST_INDEX(neh))); le16_add_cpu(&neh->eh_depth, 1); err = ext4_mark_inode_dirty(handle, inode); out: brelse(bh); return err; } /* * ext4_ext_create_new_leaf: * finds empty index and adds new leaf. * if no free index is found, then it requests in-depth growing. */ static struct ext4_ext_path * ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, unsigned int mb_flags, unsigned int gb_flags, struct ext4_ext_path *path, struct ext4_extent *newext) { struct ext4_ext_path *curp; int depth, i, err = 0; ext4_lblk_t ee_block = le32_to_cpu(newext->ee_block); repeat: i = depth = ext_depth(inode); /* walk up to the tree and look for free index entry */ curp = path + depth; while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { i--; curp--; } /* we use already allocated block for index block, * so subsequent data blocks should be contiguous */ if (EXT_HAS_FREE_INDEX(curp)) { /* if we found index with free entry, then use that * entry: create all needed subtree and add new leaf */ err = ext4_ext_split(handle, inode, mb_flags, path, newext, i); if (err) goto errout; /* refill path */ path = ext4_find_extent(inode, ee_block, path, gb_flags); return path; } /* tree is full, time to grow in depth */ err = ext4_ext_grow_indepth(handle, inode, mb_flags); if (err) goto errout; /* refill path */ path = ext4_find_extent(inode, ee_block, path, gb_flags); if (IS_ERR(path)) return path; /* * only first (depth 0 -> 1) produces free space; * in all other cases we have to split the grown tree */ depth = ext_depth(inode); if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { /* now we need to split */ goto repeat; } return path; errout: ext4_free_ext_path(path); return ERR_PTR(err); } /* * search the closest allocated block to the left for *logical * and returns it at @logical + it's physical address at @phys * if *logical is the smallest allocated block, the function * returns 0 at @phys * return value contains 0 (success) or error code */ static int ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t *logical, ext4_fsblk_t *phys) { struct ext4_extent_idx *ix; struct ext4_extent *ex; int depth, ee_len; if (unlikely(path == NULL)) { EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); return -EFSCORRUPTED; } depth = path->p_depth; *phys = 0; if (depth == 0 && path->p_ext == NULL) return 0; /* usually extent in the path covers blocks smaller * then *logical, but it can be that extent is the * first one in the file */ ex = path[depth].p_ext; ee_len = ext4_ext_get_actual_len(ex); if (*logical < le32_to_cpu(ex->ee_block)) { if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { EXT4_ERROR_INODE(inode, "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!", *logical, le32_to_cpu(ex->ee_block)); return -EFSCORRUPTED; } while (--depth >= 0) { ix = path[depth].p_idx; if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { EXT4_ERROR_INODE(inode, "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", ix != NULL ? 
le32_to_cpu(ix->ei_block) : 0, le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block), depth); return -EFSCORRUPTED; } } return 0; } if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { EXT4_ERROR_INODE(inode, "logical %d < ee_block %d + ee_len %d!", *logical, le32_to_cpu(ex->ee_block), ee_len); return -EFSCORRUPTED; } *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; *phys = ext4_ext_pblock(ex) + ee_len - 1; return 0; } /* * Search the closest allocated block to the right for *logical * and returns it at @logical + it's physical address at @phys. * If not exists, return 0 and @phys is set to 0. We will return * 1 which means we found an allocated block and ret_ex is valid. * Or return a (< 0) error code. */ static int ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t *logical, ext4_fsblk_t *phys, struct ext4_extent *ret_ex, int flags) { struct buffer_head *bh = NULL; struct ext4_extent_header *eh; struct ext4_extent_idx *ix; struct ext4_extent *ex; int depth; /* Note, NOT eh_depth; depth from top of tree */ int ee_len; if (unlikely(path == NULL)) { EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); return -EFSCORRUPTED; } depth = path->p_depth; *phys = 0; if (depth == 0 && path->p_ext == NULL) return 0; /* usually extent in the path covers blocks smaller * then *logical, but it can be that extent is the * first one in the file */ ex = path[depth].p_ext; ee_len = ext4_ext_get_actual_len(ex); if (*logical < le32_to_cpu(ex->ee_block)) { if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { EXT4_ERROR_INODE(inode, "first_extent(path[%d].p_hdr) != ex", depth); return -EFSCORRUPTED; } while (--depth >= 0) { ix = path[depth].p_idx; if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { EXT4_ERROR_INODE(inode, "ix != EXT_FIRST_INDEX *logical %d!", *logical); return -EFSCORRUPTED; } } goto found_extent; } if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { EXT4_ERROR_INODE(inode, "logical %d < ee_block %d + ee_len %d!", *logical, le32_to_cpu(ex->ee_block), ee_len); return -EFSCORRUPTED; } if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { /* next allocated block in this leaf */ ex++; goto found_extent; } /* go up and search for index to the right */ while (--depth >= 0) { ix = path[depth].p_idx; if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) goto got_index; } /* we've gone up to the root and found no index to the right */ return 0; got_index: /* we've found index to the right, let's * follow it and find the closest allocated * block to the right */ ix++; while (++depth < path->p_depth) { /* subtract from p_depth to get proper eh_depth */ bh = read_extent_tree_block(inode, ix, path->p_depth - depth, flags); if (IS_ERR(bh)) return PTR_ERR(bh); eh = ext_block_hdr(bh); ix = EXT_FIRST_INDEX(eh); put_bh(bh); } bh = read_extent_tree_block(inode, ix, path->p_depth - depth, flags); if (IS_ERR(bh)) return PTR_ERR(bh); eh = ext_block_hdr(bh); ex = EXT_FIRST_EXTENT(eh); found_extent: *logical = le32_to_cpu(ex->ee_block); *phys = ext4_ext_pblock(ex); if (ret_ex) *ret_ex = *ex; if (bh) put_bh(bh); return 1; } /* * ext4_ext_next_allocated_block: * returns allocated block in subsequent extent or EXT_MAX_BLOCKS. * NOTE: it considers block number from index entry as * allocated block. Thus, index entries have to be consistent * with leaves. 
*/ ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path) { int depth; BUG_ON(path == NULL); depth = path->p_depth; if (depth == 0 && path->p_ext == NULL) return EXT_MAX_BLOCKS; while (depth >= 0) { struct ext4_ext_path *p = &path[depth]; if (depth == path->p_depth) { /* leaf */ if (p->p_ext && p->p_ext != EXT_LAST_EXTENT(p->p_hdr)) return le32_to_cpu(p->p_ext[1].ee_block); } else { /* index */ if (p->p_idx != EXT_LAST_INDEX(p->p_hdr)) return le32_to_cpu(p->p_idx[1].ei_block); } depth--; } return EXT_MAX_BLOCKS; } /* * ext4_ext_next_leaf_block: * returns first allocated block from next leaf or EXT_MAX_BLOCKS */ static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path) { int depth; BUG_ON(path == NULL); depth = path->p_depth; /* zero-tree has no leaf blocks at all */ if (depth == 0) return EXT_MAX_BLOCKS; /* go to index block */ depth--; while (depth >= 0) { if (path[depth].p_idx != EXT_LAST_INDEX(path[depth].p_hdr)) return (ext4_lblk_t) le32_to_cpu(path[depth].p_idx[1].ei_block); depth--; } return EXT_MAX_BLOCKS; } /* * ext4_ext_correct_indexes: * if leaf gets modified and modified extent is first in the leaf, * then we have to correct all indexes above. * TODO: do we need to correct tree in all cases? */ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { struct ext4_extent_header *eh; int depth = ext_depth(inode); struct ext4_extent *ex; __le32 border; int k, err = 0; eh = path[depth].p_hdr; ex = path[depth].p_ext; if (unlikely(ex == NULL || eh == NULL)) { EXT4_ERROR_INODE(inode, "ex %p == NULL or eh %p == NULL", ex, eh); return -EFSCORRUPTED; } if (depth == 0) { /* there is no tree at all */ return 0; } if (ex != EXT_FIRST_EXTENT(eh)) { /* we correct tree if first leaf got modified only */ return 0; } /* * TODO: we need correction if border is smaller than current one */ k = depth - 1; border = path[depth].p_ext->ee_block; err = ext4_ext_get_access(handle, inode, path + k); if (err) return err; if (unlikely(path[k].p_idx > EXT_LAST_INDEX(path[k].p_hdr))) { EXT4_ERROR_INODE(inode, "path[%d].p_idx %p > EXT_LAST_INDEX %p", k, path[k].p_idx, EXT_LAST_INDEX(path[k].p_hdr)); return -EFSCORRUPTED; } path[k].p_idx->ei_block = border; err = ext4_ext_dirty(handle, inode, path + k); if (err) return err; while (k--) { /* change all left-side indexes */ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) break; err = ext4_ext_get_access(handle, inode, path + k); if (err) goto clean; if (unlikely(path[k].p_idx > EXT_LAST_INDEX(path[k].p_hdr))) { EXT4_ERROR_INODE(inode, "path[%d].p_idx %p > EXT_LAST_INDEX %p", k, path[k].p_idx, EXT_LAST_INDEX(path[k].p_hdr)); err = -EFSCORRUPTED; goto clean; } path[k].p_idx->ei_block = border; err = ext4_ext_dirty(handle, inode, path + k); if (err) goto clean; } return 0; clean: /* * The path[k].p_bh is either unmodified or with no verified bit * set (see ext4_ext_get_access()). So just clear the verified bit * of the successfully modified extents buffers, which will force * these extents to be checked to avoid using inconsistent data. 
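 *
 * A worked example with hypothetical numbers: if depth is 3, the code
 * above first updates the index at level 2, then level 1, and then fails
 * at level 0, so k is 0 on arrival here; the ++k loop below clears the
 * verified bit of path[1].p_bh and path[2].p_bh, exactly the buffers whose
 * ei_block has already been rewritten, while level 0 is left as is.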
*/ while (++k < depth) clear_buffer_verified(path[k].p_bh); return err; } static int ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, struct ext4_extent *ex2) { unsigned short ext1_ee_len, ext2_ee_len; if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2)) return 0; ext1_ee_len = ext4_ext_get_actual_len(ex1); ext2_ee_len = ext4_ext_get_actual_len(ex2); if (le32_to_cpu(ex1->ee_block) + ext1_ee_len != le32_to_cpu(ex2->ee_block)) return 0; if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) return 0; if (ext4_ext_is_unwritten(ex1) && ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN) return 0; #ifdef AGGRESSIVE_TEST if (ext1_ee_len >= 4) return 0; #endif if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2)) return 1; return 0; } /* * This function tries to merge the "ex" extent to the next extent in the tree. * It always tries to merge towards right. If you want to merge towards * left, pass "ex - 1" as argument instead of "ex". * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns * 1 if they got merged. */ static int ext4_ext_try_to_merge_right(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex) { struct ext4_extent_header *eh; unsigned int depth, len; int merge_done = 0, unwritten; depth = ext_depth(inode); BUG_ON(path[depth].p_hdr == NULL); eh = path[depth].p_hdr; while (ex < EXT_LAST_EXTENT(eh)) { if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) break; /* merge with next extent! */ unwritten = ext4_ext_is_unwritten(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(ex + 1)); if (unwritten) ext4_ext_mark_unwritten(ex); if (ex + 1 < EXT_LAST_EXTENT(eh)) { len = (EXT_LAST_EXTENT(eh) - ex - 1) * sizeof(struct ext4_extent); memmove(ex + 1, ex + 2, len); } le16_add_cpu(&eh->eh_entries, -1); merge_done = 1; WARN_ON(eh->eh_entries == 0); if (!eh->eh_entries) EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!"); } return merge_done; } /* * This function does a very simple check to see if we can collapse * an extent tree with a single extent tree leaf block into the inode. */ static void ext4_ext_try_to_merge_up(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { size_t s; unsigned max_root = ext4_ext_space_root(inode, 0); ext4_fsblk_t blk; if ((path[0].p_depth != 1) || (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) || (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root)) return; /* * We need to modify the block allocation bitmap and the block * group descriptor to release the extent tree block. If we * can't get the journal credits, give up. */ if (ext4_journal_extend(handle, 2, ext4_free_metadata_revoke_credits(inode->i_sb, 1))) return; /* * Copy the extent data up to the inode */ blk = ext4_idx_pblock(path[0].p_idx); s = le16_to_cpu(path[1].p_hdr->eh_entries) * sizeof(struct ext4_extent_idx); s += sizeof(struct ext4_extent_header); path[1].p_maxdepth = path[0].p_maxdepth; memcpy(path[0].p_hdr, path[1].p_hdr, s); path[0].p_depth = 0; path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) + (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr)); path[0].p_hdr->eh_max = cpu_to_le16(max_root); ext4_ext_path_brelse(path + 1); ext4_free_blocks(handle, inode, NULL, blk, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } /* * This function tries to merge the @ex extent to neighbours in the tree, then * tries to collapse the extent tree into the inode. 
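 */

/*
 * A minimal sketch of the contiguity test applied by
 * ext4_can_extents_be_merged() above, written against plain host-endian
 * integers and ignoring the unwritten-state and AGGRESSIVE_TEST checks.
 * The helper name and the literal 32768 standing in for EXT_INIT_MAX_LEN
 * are assumptions made only for this illustration.
 */
static inline int sketch_extents_contiguous(unsigned int lblk1, unsigned long long pblk1,
					    unsigned int len1,
					    unsigned int lblk2, unsigned long long pblk2,
					    unsigned int len2)
{
	if (lblk1 + len1 != lblk2)	/* logically back-to-back? */
		return 0;
	if (pblk1 + len1 != pblk2)	/* physically back-to-back? */
		return 0;
	return len1 + len2 <= 32768;	/* merged length still representable */
}

/*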
*/ static void ext4_ext_try_to_merge(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex) { struct ext4_extent_header *eh; unsigned int depth; int merge_done = 0; depth = ext_depth(inode); BUG_ON(path[depth].p_hdr == NULL); eh = path[depth].p_hdr; if (ex > EXT_FIRST_EXTENT(eh)) merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); if (!merge_done) (void) ext4_ext_try_to_merge_right(inode, path, ex); ext4_ext_try_to_merge_up(handle, inode, path); } /* * check if a portion of the "newext" extent overlaps with an * existing extent. * * If there is an overlap discovered, it updates the length of the newext * such that there will be no overlap, and then returns 1. * If there is no overlap found, it returns 0. */ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, struct inode *inode, struct ext4_extent *newext, struct ext4_ext_path *path) { ext4_lblk_t b1, b2; unsigned int depth, len1; unsigned int ret = 0; b1 = le32_to_cpu(newext->ee_block); len1 = ext4_ext_get_actual_len(newext); depth = ext_depth(inode); if (!path[depth].p_ext) goto out; b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block)); /* * get the next allocated block if the extent in the path * is before the requested block(s) */ if (b2 < b1) { b2 = ext4_ext_next_allocated_block(path); if (b2 == EXT_MAX_BLOCKS) goto out; b2 = EXT4_LBLK_CMASK(sbi, b2); } /* check for wrap through zero on extent logical start block*/ if (b1 + len1 < b1) { len1 = EXT_MAX_BLOCKS - b1; newext->ee_len = cpu_to_le16(len1); ret = 1; } /* check for overlap */ if (b1 + len1 > b2) { newext->ee_len = cpu_to_le16(b2 - b1); ret = 1; } out: return ret; } /* * ext4_ext_insert_extent: * tries to merge requested extent into the existing extent or * inserts requested extent as new one into the tree, * creating new leaf in the no-space case. */ struct ext4_ext_path * ext4_ext_insert_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *newext, int gb_flags) { struct ext4_extent_header *eh; struct ext4_extent *ex, *fex; struct ext4_extent *nearex; /* nearest extent */ int depth, len, err = 0; ext4_lblk_t next; int mb_flags = 0, unwritten; KUNIT_STATIC_STUB_REDIRECT(ext4_ext_insert_extent, handle, inode, path, newext, gb_flags); if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) mb_flags |= EXT4_MB_DELALLOC_RESERVED; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); err = -EFSCORRUPTED; goto errout; } depth = ext_depth(inode); ex = path[depth].p_ext; eh = path[depth].p_hdr; if (unlikely(path[depth].p_hdr == NULL)) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); err = -EFSCORRUPTED; goto errout; } /* try to insert block into found extent and return */ if (ex && !(gb_flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE)) { /* * Try to see whether we should rather test the extent on * right from ex, or from the left of ex. This is because * ext4_find_extent() can return either extent on the * left, or on the right from the searched position. This * will make merging more effective. 
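 *
 * A worked example with hypothetical numbers: if the leaf holds extents
 * A = 100..109 and B = 200..209 and newext is the single block 199,
 * ext4_find_extent() may hand back A, the extent on the left; since A ends
 * well before block 199, the code below steps to B and tries the prepend
 * case instead, which succeeds when block 199 is also physically adjacent
 * to B's first block and both share the same unwritten state.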
*/ if (ex < EXT_LAST_EXTENT(eh) && (le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex) < le32_to_cpu(newext->ee_block))) { ex += 1; goto prepend; } else if ((ex > EXT_FIRST_EXTENT(eh)) && (le32_to_cpu(newext->ee_block) + ext4_ext_get_actual_len(newext) < le32_to_cpu(ex->ee_block))) ex -= 1; /* Try to append newex to the ex */ if (ext4_can_extents_be_merged(inode, ex, newext)) { ext_debug(inode, "append [%d]%d block to %u:[%d]%d" "(from %llu)\n", ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto errout; unwritten = ext4_ext_is_unwritten(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); if (unwritten) ext4_ext_mark_unwritten(ex); nearex = ex; goto merge; } prepend: /* Try to prepend newex to the ex */ if (ext4_can_extents_be_merged(inode, newext, ex)) { ext_debug(inode, "prepend %u[%d]%d block to %u:[%d]%d" "(from %llu)\n", le32_to_cpu(newext->ee_block), ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto errout; unwritten = ext4_ext_is_unwritten(ex); ex->ee_block = newext->ee_block; ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); if (unwritten) ext4_ext_mark_unwritten(ex); nearex = ex; goto merge; } } depth = ext_depth(inode); eh = path[depth].p_hdr; if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) goto has_space; /* probably next leaf has space for us? */ fex = EXT_LAST_EXTENT(eh); next = EXT_MAX_BLOCKS; if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) next = ext4_ext_next_leaf_block(path); if (next != EXT_MAX_BLOCKS) { struct ext4_ext_path *npath; ext_debug(inode, "next leaf block - %u\n", next); npath = ext4_find_extent(inode, next, NULL, gb_flags); if (IS_ERR(npath)) { err = PTR_ERR(npath); goto errout; } BUG_ON(npath->p_depth != path->p_depth); eh = npath[depth].p_hdr; if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { ext_debug(inode, "next leaf isn't full(%d)\n", le16_to_cpu(eh->eh_entries)); ext4_free_ext_path(path); path = npath; goto has_space; } ext_debug(inode, "next leaf has no free space(%d,%d)\n", le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); ext4_free_ext_path(npath); } /* * There is no free space in the found leaf. * We're gonna add a new leaf in the tree. 
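 *
 * The fallback order used by ext4_ext_create_new_leaf(), in rough
 * pseudo-code (an illustration, not the function body):
 *
 *	if (some ancestor index block has a free slot)
 *		split the subtree from that level;	// cheapest option
 *	else {
 *		grow the tree one level deeper;		// new root index
 *		if (the target leaf is still full)
 *			split again;			// a free slot now exists above
 *	}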
*/ if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) mb_flags |= EXT4_MB_USE_RESERVED; path = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, path, newext); if (IS_ERR(path)) return path; depth = ext_depth(inode); eh = path[depth].p_hdr; has_space: nearex = path[depth].p_ext; err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto errout; if (!nearex) { /* there is no extent in this leaf, create first one */ ext_debug(inode, "first extent in the leaf: %u:%llu:[%d]%d\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext)); nearex = EXT_FIRST_EXTENT(eh); } else { if (le32_to_cpu(newext->ee_block) > le32_to_cpu(nearex->ee_block)) { /* Insert after */ ext_debug(inode, "insert %u:%llu:[%d]%d before: " "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), nearex); nearex++; } else { /* Insert before */ BUG_ON(newext->ee_block == nearex->ee_block); ext_debug(inode, "insert %u:%llu:[%d]%d after: " "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), nearex); } len = EXT_LAST_EXTENT(eh) - nearex + 1; if (len > 0) { ext_debug(inode, "insert %u:%llu:[%d]%d: " "move %d extents from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), len, nearex, nearex + 1); memmove(nearex + 1, nearex, len * sizeof(struct ext4_extent)); } } le16_add_cpu(&eh->eh_entries, 1); path[depth].p_ext = nearex; nearex->ee_block = newext->ee_block; ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext)); nearex->ee_len = newext->ee_len; merge: /* try to merge extents */ if (!(gb_flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE)) ext4_ext_try_to_merge(handle, inode, path, nearex); /* time to correct all indexes above */ err = ext4_ext_correct_indexes(handle, inode, path); if (err) goto errout; err = ext4_ext_dirty(handle, inode, path + path->p_depth); if (err) goto errout; return path; errout: ext4_free_ext_path(path); return ERR_PTR(err); } static int ext4_fill_es_cache_info(struct inode *inode, ext4_lblk_t block, ext4_lblk_t num, struct fiemap_extent_info *fieinfo) { ext4_lblk_t next, end = block + num - 1; struct extent_status es; unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; unsigned int flags; int err; while (block <= end) { next = 0; flags = 0; if (!ext4_es_lookup_extent(inode, block, &next, &es, NULL)) break; if (ext4_es_is_unwritten(&es)) flags |= FIEMAP_EXTENT_UNWRITTEN; if (ext4_es_is_delayed(&es)) flags |= (FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN); if (ext4_es_is_hole(&es)) flags |= EXT4_FIEMAP_EXTENT_HOLE; if (next == 0) flags |= FIEMAP_EXTENT_LAST; if (flags & (FIEMAP_EXTENT_DELALLOC| EXT4_FIEMAP_EXTENT_HOLE)) es.es_pblk = 0; else es.es_pblk = ext4_es_pblock(&es); err = fiemap_fill_next_extent(fieinfo, (__u64)es.es_lblk << blksize_bits, (__u64)es.es_pblk << blksize_bits, (__u64)es.es_len << blksize_bits, flags); if (next == 0) break; block = next; if (err < 0) return err; if (err == 1) return 0; } return 0; } /* * ext4_ext_find_hole - find hole around given block according to the given path * @inode: inode we lookup in * @path: path in extent tree to @lblk * @lblk: pointer to logical block around which we want to determine hole * * Determine hole length (and start if easily possible) around given logical * block. 
We don't try too hard to find the beginning of the hole but @path * actually points to extent before @lblk, we provide it. * * The function returns the length of a hole starting at @lblk. We update @lblk * to the beginning of the hole if we managed to find it. */ static ext4_lblk_t ext4_ext_find_hole(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t *lblk) { int depth = ext_depth(inode); struct ext4_extent *ex; ext4_lblk_t len; ex = path[depth].p_ext; if (ex == NULL) { /* there is no extent yet, so gap is [0;-] */ *lblk = 0; len = EXT_MAX_BLOCKS; } else if (*lblk < le32_to_cpu(ex->ee_block)) { len = le32_to_cpu(ex->ee_block) - *lblk; } else if (*lblk >= le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)) { ext4_lblk_t next; *lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); next = ext4_ext_next_allocated_block(path); BUG_ON(next == *lblk); len = next - *lblk; } else { BUG(); } return len; } /* * ext4_ext_rm_idx: * removes index from the index block. */ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, int depth) { int err; ext4_fsblk_t leaf; int k = depth - 1; /* free index block */ leaf = ext4_idx_pblock(path[k].p_idx); if (unlikely(path[k].p_hdr->eh_entries == 0)) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr->eh_entries == 0", k); return -EFSCORRUPTED; } err = ext4_ext_get_access(handle, inode, path + k); if (err) return err; if (path[k].p_idx != EXT_LAST_INDEX(path[k].p_hdr)) { int len = EXT_LAST_INDEX(path[k].p_hdr) - path[k].p_idx; len *= sizeof(struct ext4_extent_idx); memmove(path[k].p_idx, path[k].p_idx + 1, len); } le16_add_cpu(&path[k].p_hdr->eh_entries, -1); err = ext4_ext_dirty(handle, inode, path + k); if (err) return err; ext_debug(inode, "index is empty, remove it, free block %llu\n", leaf); trace_ext4_ext_rm_idx(inode, leaf); ext4_free_blocks(handle, inode, NULL, leaf, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); while (--k >= 0) { if (path[k + 1].p_idx != EXT_FIRST_INDEX(path[k + 1].p_hdr)) break; err = ext4_ext_get_access(handle, inode, path + k); if (err) goto clean; path[k].p_idx->ei_block = path[k + 1].p_idx->ei_block; err = ext4_ext_dirty(handle, inode, path + k); if (err) goto clean; } return 0; clean: /* * The path[k].p_bh is either unmodified or with no verified bit * set (see ext4_ext_get_access()). So just clear the verified bit * of the successfully modified extents buffers, which will force * these extents to be checked to avoid using inconsistent data. */ while (++k < depth) clear_buffer_verified(path[k].p_bh); return err; } /* * ext4_ext_calc_credits_for_single_extent: * This routine returns max. credits that needed to insert an extent * to the extent tree. * When pass the actual path, the caller should calculate credits * under i_data_sem. */ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, struct ext4_ext_path *path) { if (path) { int depth = ext_depth(inode); int ret = 0; /* probably there is space in leaf? */ if (le16_to_cpu(path[depth].p_hdr->eh_entries) < le16_to_cpu(path[depth].p_hdr->eh_max)) { /* * There are some space in the leaf tree, no * need to account for leaf block credit * * bitmaps and block group descriptor blocks * and other metadata blocks still need to be * accounted. */ /* 1 bitmap, 1 block group descriptor */ ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); return ret; } } return ext4_chunk_trans_blocks(inode, nrblocks); } /* * How many index/leaf blocks need to change/allocate to add @extents extents? 
* * If we add a single extent, then in the worse case, each tree level * index/leaf need to be changed in case of the tree split. * * If more extents are inserted, they could cause the whole tree split more * than once, but this is really rare. */ int ext4_ext_index_trans_blocks(struct inode *inode, int extents) { int index; /* If we are converting the inline data, only one is needed here. */ if (ext4_has_inline_data(inode)) return 1; /* * Extent tree can change between the time we estimate credits and * the time we actually modify the tree. Assume the worst case. */ if (extents <= 1) index = (EXT4_MAX_EXTENT_DEPTH * 2) + extents; else index = (EXT4_MAX_EXTENT_DEPTH * 3) + DIV_ROUND_UP(extents, ext4_ext_space_block(inode, 0)); return index; } static inline int get_default_free_blocks_flags(struct inode *inode) { if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE)) return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; else if (ext4_should_journal_data(inode)) return EXT4_FREE_BLOCKS_FORGET; return 0; } /* * ext4_rereserve_cluster - increment the reserved cluster count when * freeing a cluster with a pending reservation * * @inode - file containing the cluster * @lblk - logical block in cluster to be reserved * * Increments the reserved cluster count and adjusts quota in a bigalloc * file system when freeing a partial cluster containing at least one * delayed and unwritten block. A partial cluster meeting that * requirement will have a pending reservation. If so, the * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to * defer reserved and allocated space accounting to a subsequent call * to this function. */ static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); dquot_reclaim_block(inode, EXT4_C2B(sbi, 1)); spin_lock(&ei->i_block_reservation_lock); ei->i_reserved_data_blocks++; percpu_counter_add(&sbi->s_dirtyclusters_counter, 1); spin_unlock(&ei->i_block_reservation_lock); percpu_counter_add(&sbi->s_freeclusters_counter, 1); ext4_remove_pending(inode, lblk); } static int ext4_remove_blocks(handle_t *handle, struct inode *inode, struct ext4_extent *ex, struct partial_cluster *partial, ext4_lblk_t from, ext4_lblk_t to) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned short ee_len = ext4_ext_get_actual_len(ex); ext4_fsblk_t last_pblk, pblk; ext4_lblk_t num; int flags; /* only extent tail removal is allowed */ if (from < le32_to_cpu(ex->ee_block) || to != le32_to_cpu(ex->ee_block) + ee_len - 1) { ext4_error(sbi->s_sb, "strange request: removal(2) %u-%u from %u:%u", from, to, le32_to_cpu(ex->ee_block), ee_len); return 0; } #ifdef EXTENTS_STATS spin_lock(&sbi->s_ext_stats_lock); sbi->s_ext_blocks += ee_len; sbi->s_ext_extents++; if (ee_len < sbi->s_ext_min) sbi->s_ext_min = ee_len; if (ee_len > sbi->s_ext_max) sbi->s_ext_max = ee_len; if (ext_depth(inode) > sbi->s_depth_max) sbi->s_depth_max = ext_depth(inode); spin_unlock(&sbi->s_ext_stats_lock); #endif trace_ext4_remove_blocks(inode, ex, from, to, partial); /* * if we have a partial cluster, and it's different from the * cluster of the last block in the extent, we free it */ last_pblk = ext4_ext_pblock(ex) + ee_len - 1; if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, last_pblk)) { if (partial->state == tofree) { flags = get_default_free_blocks_flags(inode); if (ext4_is_pending(inode, partial->lblk)) flags |= 
EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, EXT4_C2B(sbi, partial->pclu), sbi->s_cluster_ratio, flags); if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) ext4_rereserve_cluster(inode, partial->lblk); } partial->state = initial; } num = le32_to_cpu(ex->ee_block) + ee_len - from; pblk = ext4_ext_pblock(ex) + ee_len - num; /* * We free the partial cluster at the end of the extent (if any), * unless the cluster is used by another extent (partial_cluster * state is nofree). If a partial cluster exists here, it must be * shared with the last block in the extent. */ flags = get_default_free_blocks_flags(inode); /* partial, left end cluster aligned, right end unaligned */ if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) && (EXT4_LBLK_CMASK(sbi, to) >= from) && (partial->state != nofree)) { if (ext4_is_pending(inode, to)) flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, EXT4_PBLK_CMASK(sbi, last_pblk), sbi->s_cluster_ratio, flags); if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) ext4_rereserve_cluster(inode, to); partial->state = initial; flags = get_default_free_blocks_flags(inode); } flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; /* * For bigalloc file systems, we never free a partial cluster * at the beginning of the extent. Instead, we check to see if we * need to free it on a subsequent call to ext4_remove_blocks, * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. */ flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; ext4_free_blocks(handle, inode, NULL, pblk, num, flags); /* reset the partial cluster if we've freed past it */ if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk)) partial->state = initial; /* * If we've freed the entire extent but the beginning is not left * cluster aligned and is not marked as ineligible for freeing we * record the partial cluster at the beginning of the extent. It * wasn't freed by the preceding ext4_free_blocks() call, and we * need to look farther to the left to determine if it's to be freed * (not shared with another extent). Else, reset the partial * cluster - we're either done freeing or the beginning of the * extent is left cluster aligned. */ if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) { if (partial->state == initial) { partial->pclu = EXT4_B2C(sbi, pblk); partial->lblk = from; partial->state = tofree; } } else { partial->state = initial; } return 0; } /* * ext4_ext_rm_leaf() Removes the extents associated with the * blocks appearing between "start" and "end". Both "start" * and "end" must appear in the same extent or EIO is returned. * * @handle: The journal handle * @inode: The files inode * @path: The path to the leaf * @partial_cluster: The cluster which we'll have to free if all extents * has been released from it. However, if this value is * negative, it's a cluster just to the right of the * punched region and it must not be freed. 
* @start: The first block to remove * @end: The last block to remove */ static int ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct partial_cluster *partial, ext4_lblk_t start, ext4_lblk_t end) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int err = 0, correct_index = 0; int depth = ext_depth(inode), credits, revoke_credits; struct ext4_extent_header *eh; ext4_lblk_t a, b; unsigned num; ext4_lblk_t ex_ee_block; unsigned short ex_ee_len; unsigned unwritten = 0; struct ext4_extent *ex; ext4_fsblk_t pblk; /* the header must be checked already in ext4_ext_remove_space() */ ext_debug(inode, "truncate since %u in leaf to %u\n", start, end); if (!path[depth].p_hdr) path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); eh = path[depth].p_hdr; if (unlikely(path[depth].p_hdr == NULL)) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); return -EFSCORRUPTED; } /* find where to start removing */ ex = path[depth].p_ext; if (!ex) ex = EXT_LAST_EXTENT(eh); ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); trace_ext4_ext_rm_leaf(inode, start, ex, partial); while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { if (ext4_ext_is_unwritten(ex)) unwritten = 1; else unwritten = 0; ext_debug(inode, "remove ext %u:[%d]%d\n", ex_ee_block, unwritten, ex_ee_len); path[depth].p_ext = ex; a = max(ex_ee_block, start); b = min(ex_ee_block + ex_ee_len - 1, end); ext_debug(inode, " border %u:%u\n", a, b); /* If this extent is beyond the end of the hole, skip it */ if (end < ex_ee_block) { /* * We're going to skip this extent and move to another, * so note that its first cluster is in use to avoid * freeing it when removing blocks. Eventually, the * right edge of the truncated/punched region will * be just to the left. */ if (sbi->s_cluster_ratio > 1) { pblk = ext4_ext_pblock(ex); partial->pclu = EXT4_B2C(sbi, pblk); partial->state = nofree; } ex--; ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); continue; } else if (b != ex_ee_block + ex_ee_len - 1) { EXT4_ERROR_INODE(inode, "can not handle truncate %u:%u " "on extent %u:%u", start, end, ex_ee_block, ex_ee_block + ex_ee_len - 1); err = -EFSCORRUPTED; goto out; } else if (a != ex_ee_block) { /* remove tail of the extent */ num = a - ex_ee_block; } else { /* remove whole extent: excellent! */ num = 0; } /* * 3 for leaf, sb, and inode plus 2 (bmap and group * descriptor) for each block group; assume two block * groups plus ex_ee_len/blocks_per_block_group for * the worst case */ credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); if (ex == EXT_FIRST_EXTENT(eh)) { correct_index = 1; credits += (ext_depth(inode)) + 1; } credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); /* * We may end up freeing some index blocks and data from the * punched range. Note that partial clusters are accounted for * by ext4_free_data_revoke_credits(). 
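 *
 * A worked example with hypothetical numbers: with 4 KiB blocks a block
 * group spans 32768 blocks, so truncating a 70000-block extent tail gives
 * credits = 7 + 2 * (70000 / 32768) = 11 before the quota and
 * correct_index additions above, while the revoke credits below also have
 * to cover the b - a + 1 data blocks freed from the punched range.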
*/ revoke_credits = ext4_free_metadata_revoke_credits(inode->i_sb, ext_depth(inode)) + ext4_free_data_revoke_credits(inode, b - a + 1); err = ext4_datasem_ensure_credits(handle, inode, credits, credits, revoke_credits); if (err) { if (err > 0) err = -EAGAIN; goto out; } err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; err = ext4_remove_blocks(handle, inode, ex, partial, a, b); if (err) goto out; if (num == 0) /* this extent is removed; mark slot entirely unused */ ext4_ext_store_pblock(ex, 0); ex->ee_len = cpu_to_le16(num); /* * Do not mark unwritten if all the blocks in the * extent have been removed. */ if (unwritten && num) ext4_ext_mark_unwritten(ex); /* * If the extent was completely released, * we need to remove it from the leaf */ if (num == 0) { if (end != EXT_MAX_BLOCKS - 1) { /* * For hole punching, we need to scoot all the * extents up when an extent is removed so that * we dont have blank extents in the middle */ memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) * sizeof(struct ext4_extent)); /* Now get rid of the one at the end */ memset(EXT_LAST_EXTENT(eh), 0, sizeof(struct ext4_extent)); } le16_add_cpu(&eh->eh_entries, -1); } err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; ext_debug(inode, "new extent: %u:%u:%llu\n", ex_ee_block, num, ext4_ext_pblock(ex)); ex--; ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); } if (correct_index && eh->eh_entries) err = ext4_ext_correct_indexes(handle, inode, path); /* * If there's a partial cluster and at least one extent remains in * the leaf, free the partial cluster if it isn't shared with the * current extent. If it is shared with the current extent * we reset the partial cluster because we've reached the start of the * truncated/punched region and we're done removing blocks. 
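 *
 * A worked example assuming a bigalloc cluster ratio of 16: if the
 * recorded partial cluster is cluster 25 (physical blocks 400..415) and
 * the extent we stopped at ends in physical block 399 (cluster 24), the
 * clusters differ and cluster 25 is freed below; had the extent ended
 * anywhere in blocks 400..415 the cluster would be shared, so only the
 * partial-cluster state is reset.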
*/ if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) { pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; if (partial->pclu != EXT4_B2C(sbi, pblk)) { int flags = get_default_free_blocks_flags(inode); if (ext4_is_pending(inode, partial->lblk)) flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, EXT4_C2B(sbi, partial->pclu), sbi->s_cluster_ratio, flags); if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) ext4_rereserve_cluster(inode, partial->lblk); } partial->state = initial; } /* if this leaf is free, then we should * remove it from index block above */ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) err = ext4_ext_rm_idx(handle, inode, path, depth); out: return err; } /* * ext4_ext_more_to_rm: * returns 1 if current index has to be freed (even partial) */ static int ext4_ext_more_to_rm(struct ext4_ext_path *path) { BUG_ON(path->p_idx == NULL); if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) return 0; /* * if truncate on deeper level happened, it wasn't partial, * so we have to consider current index for truncation */ if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block) return 0; return 1; } int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int depth = ext_depth(inode); struct ext4_ext_path *path = NULL; struct partial_cluster partial; handle_t *handle; int i = 0, err = 0; int flags = EXT4_EX_NOCACHE | EXT4_EX_NOFAIL; partial.pclu = 0; partial.lblk = 0; partial.state = initial; ext_debug(inode, "truncate since %u to %u\n", start, end); /* probably first extent we're gonna free will be last in block */ handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE, depth + 1, ext4_free_metadata_revoke_credits(inode->i_sb, depth)); if (IS_ERR(handle)) return PTR_ERR(handle); again: trace_ext4_ext_remove_space(inode, start, end, depth); /* * Check if we are removing extents inside the extent tree. If that * is the case, we are going to punch a hole inside the extent tree * so we have to check whether we need to split the extent covering * the last block to remove so we can easily remove the part of it * in ext4_ext_rm_leaf(). */ if (end < EXT_MAX_BLOCKS - 1) { struct ext4_extent *ex; ext4_lblk_t ee_block, ex_end, lblk; ext4_fsblk_t pblk; /* find extent for or closest extent to this block */ path = ext4_find_extent(inode, end, NULL, flags); if (IS_ERR(path)) { ext4_journal_stop(handle); return PTR_ERR(path); } depth = ext_depth(inode); /* Leaf not may not exist only if inode has no blocks at all */ ex = path[depth].p_ext; if (!ex) { if (depth) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); err = -EFSCORRUPTED; } goto out; } ee_block = le32_to_cpu(ex->ee_block); ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1; /* * See if the last block is inside the extent, if so split * the extent at 'end' block so we can easily remove the * tail of the first part of the split extent in * ext4_ext_rm_leaf(). */ if (end >= ee_block && end < ex_end) { /* * If we're going to split the extent, note that * the cluster containing the block after 'end' is * in use to avoid freeing it when removing blocks. */ if (sbi->s_cluster_ratio > 1) { pblk = ext4_ext_pblock(ex) + end - ee_block + 1; partial.pclu = EXT4_B2C(sbi, pblk); partial.state = nofree; } /* * Split the extent in two so that 'end' is the last * block in the first new extent. Also we should not * fail removing space due to ENOSPC so try to use * reserved block if that happens. 
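 *
 * A worked example with hypothetical numbers: punching blocks 100..149
 * out of an extent covering 80..199 splits it at block 150 into 80..149
 * and 150..199; ext4_ext_rm_leaf() can then trim the first piece down to
 * 80..99 as an ordinary remove-the-tail operation while the second piece
 * survives untouched.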
*/ path = ext4_force_split_extent_at(handle, inode, path, end + 1, 1); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; } } else if (sbi->s_cluster_ratio > 1 && end >= ex_end && partial.state == initial) { /* * If we're punching, there's an extent to the right. * If the partial cluster hasn't been set, set it to * that extent's first cluster and its state to nofree * so it won't be freed should it contain blocks to be * removed. If it's already set (tofree/nofree), we're * retrying and keep the original partial cluster info * so a cluster marked tofree as a result of earlier * extent removal is not lost. */ lblk = ex_end + 1; err = ext4_ext_search_right(inode, path, &lblk, &pblk, NULL, flags); if (err < 0) goto out; if (pblk) { partial.pclu = EXT4_B2C(sbi, pblk); partial.state = nofree; } } } /* * We start scanning from right side, freeing all the blocks * after i_size and walking into the tree depth-wise. */ depth = ext_depth(inode); if (path) { int k = i = depth; while (--k > 0) path[k].p_block = le16_to_cpu(path[k].p_hdr->eh_entries)+1; } else { path = kzalloc_objs(struct ext4_ext_path, depth + 1, GFP_NOFS | __GFP_NOFAIL); path[0].p_maxdepth = path[0].p_depth = depth; path[0].p_hdr = ext_inode_hdr(inode); i = 0; if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) { err = -EFSCORRUPTED; goto out; } } err = 0; while (i >= 0 && err == 0) { if (i == depth) { /* this is leaf block */ err = ext4_ext_rm_leaf(handle, inode, path, &partial, start, end); /* root level has p_bh == NULL, brelse() eats this */ ext4_ext_path_brelse(path + i); i--; continue; } /* this is index block */ if (!path[i].p_hdr) { ext_debug(inode, "initialize header\n"); path[i].p_hdr = ext_block_hdr(path[i].p_bh); } if (!path[i].p_idx) { /* this level hasn't been touched yet */ path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n", path[i].p_hdr, le16_to_cpu(path[i].p_hdr->eh_entries)); } else { /* we were already here, see at next index */ path[i].p_idx--; } ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n", i, EXT_FIRST_INDEX(path[i].p_hdr), path[i].p_idx); if (ext4_ext_more_to_rm(path + i)) { struct buffer_head *bh; /* go to the next level */ ext_debug(inode, "move to level %d (block %llu)\n", i + 1, ext4_idx_pblock(path[i].p_idx)); memset(path + i + 1, 0, sizeof(*path)); bh = read_extent_tree_block(inode, path[i].p_idx, depth - i - 1, flags); if (IS_ERR(bh)) { /* should we reset i_size? */ err = PTR_ERR(bh); break; } /* Yield here to deal with large extent trees. * Should be a no-op if we did IO above. 
*/ cond_resched(); if (WARN_ON(i + 1 > depth)) { err = -EFSCORRUPTED; break; } path[i + 1].p_bh = bh; /* save actual number of indexes since this * number is changed at the next iteration */ path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries); i++; } else { /* we finished processing this index, go up */ if (path[i].p_hdr->eh_entries == 0 && i > 0) { /* index is empty, remove it; * handle must be already prepared by the * truncatei_leaf() */ err = ext4_ext_rm_idx(handle, inode, path, i); } /* root level has p_bh == NULL, brelse() eats this */ ext4_ext_path_brelse(path + i); i--; ext_debug(inode, "return to level %d\n", i); } } trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial, path->p_hdr->eh_entries); /* * if there's a partial cluster and we have removed the first extent * in the file, then we also free the partial cluster, if any */ if (partial.state == tofree && err == 0) { int flags = get_default_free_blocks_flags(inode); if (ext4_is_pending(inode, partial.lblk)) flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, EXT4_C2B(sbi, partial.pclu), sbi->s_cluster_ratio, flags); if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) ext4_rereserve_cluster(inode, partial.lblk); partial.state = initial; } /* TODO: flexible tree reduction should be here */ if (path->p_hdr->eh_entries == 0) { /* * truncate to zero freed all the tree, * so we need to correct eh_depth */ err = ext4_ext_get_access(handle, inode, path); if (err == 0) { ext_inode_hdr(inode)->eh_depth = 0; ext_inode_hdr(inode)->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); err = ext4_ext_dirty(handle, inode, path); } } out: ext4_free_ext_path(path); path = NULL; if (err == -EAGAIN) goto again; ext4_journal_stop(handle); return err; } /* * called at mount time */ void ext4_ext_init(struct super_block *sb) { /* * possible initialization would be here */ if (ext4_has_feature_extents(sb)) { #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) printk(KERN_INFO "EXT4-fs: file extents enabled" #ifdef AGGRESSIVE_TEST ", aggressive tests" #endif #ifdef CHECK_BINSEARCH ", check binsearch" #endif #ifdef EXTENTS_STATS ", stats" #endif "\n"); #endif #ifdef EXTENTS_STATS spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); EXT4_SB(sb)->s_ext_min = 1 << 30; EXT4_SB(sb)->s_ext_max = 0; #endif } } /* * called at umount time */ void ext4_ext_release(struct super_block *sb) { if (!ext4_has_feature_extents(sb)) return; #ifdef EXTENTS_STATS if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) { struct ext4_sb_info *sbi = EXT4_SB(sb); printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n", sbi->s_ext_blocks, sbi->s_ext_extents, sbi->s_ext_blocks / sbi->s_ext_extents); printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n", sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max); } #endif } static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) { ext4_lblk_t ee_block; ext4_fsblk_t ee_pblock; unsigned int ee_len; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); ee_pblock = ext4_ext_pblock(ex); if (ee_len == 0) return; ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, EXTENT_STATUS_WRITTEN, false); } /* FIXME!! 
we need to try to merge to left or right after zero-out */ int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) { ext4_fsblk_t ee_pblock; unsigned int ee_len; KUNIT_STATIC_STUB_REDIRECT(ext4_ext_zeroout, inode, ex); ee_len = ext4_ext_get_actual_len(ex); ee_pblock = ext4_ext_pblock(ex); return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock, ee_len); } /* * ext4_split_extent_at() splits an extent at given block. * * @handle: the journal handle * @inode: the file inode * @path: the path to the extent * @split: the logical block where the extent is splitted. * @flags: flags used to insert new extent to extent tree. * * * Splits extent [a, b] into two extents [a, @split) and [@split, b], states * of which are same as the original extent. No conversion is performed. * * Return an extent path pointer on success, or an error pointer on failure. On * failure, the extent is restored to original state. */ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t split, int flags) { ext4_fsblk_t newblock; ext4_lblk_t ee_block; struct ext4_extent *ex, newex, orig_ex; struct ext4_extent *ex2 = NULL; unsigned int ee_len, depth; int err = 0, insert_err = 0, is_unwrit = 0; /* Do not cache extents that are in the process of being modified. */ flags |= EXT4_EX_NOCACHE; ext_debug(inode, "logical block %llu\n", (unsigned long long)split); ext4_ext_show_leaf(inode, path); depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); newblock = split - ee_block + ext4_ext_pblock(ex); is_unwrit = ext4_ext_is_unwritten(ex); BUG_ON(split < ee_block || split >= (ee_block + ee_len)); /* * No split needed */ if (split == ee_block) goto out; err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; /* case a */ memcpy(&orig_ex, ex, sizeof(orig_ex)); ex->ee_len = cpu_to_le16(split - ee_block); if (is_unwrit) ext4_ext_mark_unwritten(ex); /* * path may lead to new leaf, not to original leaf any more * after ext4_ext_insert_extent() returns, */ err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto fix_extent_len; ex2 = &newex; ex2->ee_block = cpu_to_le32(split); ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); ext4_ext_store_pblock(ex2, newblock); if (is_unwrit) ext4_ext_mark_unwritten(ex2); path = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (!IS_ERR(path)) return path; insert_err = PTR_ERR(path); err = 0; if (insert_err != -ENOSPC && insert_err != -EDQUOT && insert_err != -ENOMEM) goto out_path; /* * Get a new path to try to zeroout or fix the extent length. * Using EXT4_EX_NOFAIL guarantees that ext4_find_extent() * will not return -ENOMEM, otherwise -ENOMEM will cause a * retry in do_writepages(), and a WARN_ON may be triggered * in ext4_da_update_reserve_space() due to an incorrect * ee_len causing the i_reserved_data_blocks exception. 
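 *
 * The "incorrect ee_len" is the already-shrunken first half. A worked
 * example with hypothetical numbers: splitting an extent covering logical
 * blocks 100..149 (physical 5000..5049) at block 120 first shrinks it to
 * 100..119 and then inserts 120..149 at physical 5020..5049; if that
 * insert fails, the on-disk extent is still the shrunken 100..119, so the
 * code below re-reads the path and restores the original 50-block length
 * from orig_ex.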
*/ path = ext4_find_extent(inode, ee_block, NULL, flags | EXT4_EX_NOFAIL); if (IS_ERR(path)) { EXT4_ERROR_INODE(inode, "Failed split extent on %u, err %ld", split, PTR_ERR(path)); goto out_path; } depth = ext_depth(inode); ex = path[depth].p_ext; if (!ex) { EXT4_ERROR_INODE(inode, "bad extent address lblock: %lu, depth: %d pblock %llu", (unsigned long)ee_block, depth, path[depth].p_block); err = -EFSCORRUPTED; goto out; } err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; fix_extent_len: ex->ee_len = orig_ex.ee_len; err = ext4_ext_dirty(handle, inode, path + path->p_depth); out: if (err || insert_err) { ext4_free_ext_path(path); path = err ? ERR_PTR(err) : ERR_PTR(insert_err); } out_path: if (IS_ERR(path)) /* Remove all remaining potentially stale extents. */ ext4_es_remove_extent(inode, ee_block, ee_len); ext4_ext_show_leaf(inode, path); return path; } static int ext4_split_extent_zeroout(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct ext4_map_blocks *map, int flags) { struct ext4_extent *ex; unsigned int ee_len, depth; ext4_lblk_t ee_block; uint64_t lblk, pblk, len; int is_unwrit; int err = 0; depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); is_unwrit = ext4_ext_is_unwritten(ex); if (flags & EXT4_GET_BLOCKS_CONVERT) { /* * EXT4_GET_BLOCKS_CONVERT: Caller wants the range specified by * map to be initialized. Zeroout everything except the map * range. */ loff_t map_end = (loff_t) map->m_lblk + map->m_len; loff_t ex_end = (loff_t) ee_block + ee_len; if (!is_unwrit) /* Shouldn't happen. Just exit */ return -EINVAL; /* zeroout left */ if (map->m_lblk > ee_block) { lblk = ee_block; len = map->m_lblk - ee_block; pblk = ext4_ext_pblock(ex); err = ext4_issue_zeroout(inode, lblk, pblk, len); if (err) /* ZEROOUT failed, just return original error */ return err; } /* zeroout right */ if (map_end < ex_end) { lblk = map_end; len = ex_end - map_end; pblk = ext4_ext_pblock(ex) + (map_end - ee_block); err = ext4_issue_zeroout(inode, lblk, pblk, len); if (err) /* ZEROOUT failed, just return original error */ return err; } } else if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { /* * EXT4_GET_BLOCKS_CONVERT_UNWRITTEN: Caller wants the * range specified by map to be marked unwritten. * Zeroout the map range leaving rest as it is. */ if (is_unwrit) /* Shouldn't happen. Just exit */ return -EINVAL; lblk = map->m_lblk; len = map->m_len; pblk = ext4_ext_pblock(ex) + (map->m_lblk - ee_block); err = ext4_issue_zeroout(inode, lblk, pblk, len); if (err) /* ZEROOUT failed, just return original error */ return err; } else { /* * We no longer perform unwritten to unwritten splits in IO paths. * Hence this should not happen. 
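 *
 * A worked example for the EXT4_GET_BLOCKS_CONVERT branch above, with
 * hypothetical numbers: given an unwritten extent 100..149 and a map of
 * 110..119, the fallback zeroes blocks 100..109 and 120..149 on disk and
 * then marks the whole extent initialized below, so the only blocks left
 * unzeroed are exactly the 110..119 range the caller is about to write.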
*/ WARN_ON_ONCE(true); return -EINVAL; } err = ext4_ext_get_access(handle, inode, path + depth); if (err) return err; ext4_ext_mark_initialized(ex); err = ext4_ext_dirty(handle, inode, path + depth); if (err) return err; return 0; } /* * ext4_split_extent() splits an extent and mark extent which is covered * by @map as split_flags indicates * * It may result in splitting the extent into multiple extents (up to three) * There are three possibilities: * a> There is no split required * b> Splits in two extents: Split is happening at either end of the extent * c> Splits in three extents: Somone is splitting in middle of the extent * */ static struct ext4_ext_path *ext4_split_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct ext4_map_blocks *map, int split_flag, int flags, unsigned int *allocated, bool *did_zeroout) { ext4_lblk_t ee_block, orig_ee_block; struct ext4_extent *ex; unsigned int ee_len, orig_ee_len, depth; int unwritten, orig_unwritten; int orig_err = 0; depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); unwritten = ext4_ext_is_unwritten(ex); orig_ee_block = ee_block; orig_ee_len = ee_len; orig_unwritten = unwritten; /* Do not cache extents that are in the process of being modified. */ flags |= EXT4_EX_NOCACHE; if (map->m_lblk + map->m_len < ee_block + ee_len) { path = ext4_split_extent_at(handle, inode, path, map->m_lblk + map->m_len, flags); if (IS_ERR(path)) goto try_zeroout; /* * Update path is required because previous ext4_split_extent_at * may result in split of original leaf or extent zeroout. */ path = ext4_find_extent(inode, map->m_lblk, path, flags); if (IS_ERR(path)) goto try_zeroout; depth = ext_depth(inode); ex = path[depth].p_ext; if (!ex) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", (unsigned long) map->m_lblk); ext4_free_ext_path(path); return ERR_PTR(-EFSCORRUPTED); } /* extent would have changed so update original values */ orig_ee_block = le32_to_cpu(ex->ee_block); orig_ee_len = ext4_ext_get_actual_len(ex); orig_unwritten = ext4_ext_is_unwritten(ex); } if (map->m_lblk >= ee_block) { path = ext4_split_extent_at(handle, inode, path, map->m_lblk, flags); if (IS_ERR(path)) goto try_zeroout; } goto success; try_zeroout: /* * There was an error in splitting the extent. So instead, just zeroout * unwritten portions and convert it to initialized as a last resort. If * there is any failure here we just return the original error */ orig_err = PTR_ERR(path); if (orig_err != -ENOSPC && orig_err != -EDQUOT && orig_err != -ENOMEM) goto out_orig_err; /* we can't zeroout? 
just return the original err */ if (!(split_flag & EXT4_EXT_MAY_ZEROOUT)) goto out_orig_err; if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { int max_zeroout_blks = EXT4_SB(inode->i_sb)->s_extent_max_zeroout_kb >> (inode->i_sb->s_blocksize_bits - 10); if (map->m_len > max_zeroout_blks) goto out_orig_err; } path = ext4_find_extent(inode, map->m_lblk, NULL, flags); if (IS_ERR(path)) goto out_orig_err; depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); unwritten = ext4_ext_is_unwritten(ex); /* extent to zeroout should have been unchanged but its not */ if (WARN_ON(ee_block != orig_ee_block || ee_len != orig_ee_len || unwritten != orig_unwritten)) goto out_free_path; if (ext4_split_extent_zeroout(handle, inode, path, map, flags)) goto out_free_path; /* zeroout succeeded */ if (did_zeroout) *did_zeroout = true; success: if (allocated) { if (map->m_lblk + map->m_len > ee_block + ee_len) *allocated = ee_len - (map->m_lblk - ee_block); else *allocated = map->m_len; } ext4_ext_show_leaf(inode, path); return path; out_free_path: ext4_free_ext_path(path); out_orig_err: return ERR_PTR(orig_err); } /* * This function is called by ext4_ext_map_blocks() if someone tries to write * to an unwritten extent. It may result in splitting the unwritten * extent into multiple extents (up to three - one initialized and two * unwritten). * There are three possibilities: * a> There is no split required: Entire extent should be initialized * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * * Pre-conditions: * - The extent pointed to by 'path' is unwritten. * - The extent pointed to by 'path' contains a superset * of the logical span [map->m_lblk, map->m_lblk + map->m_len). * * Post-conditions on success: * - the returned value is the number of blocks beyond map->l_lblk * that are allocated and initialized. * It is guaranteed to be >= map->m_len. */ static struct ext4_ext_path * ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path, int flags, unsigned int *allocated) { struct ext4_sb_info *sbi; struct ext4_extent_header *eh; struct ext4_map_blocks split_map; struct ext4_extent zero_ex1, zero_ex2; struct ext4_extent *ex, *abut_ex; ext4_lblk_t ee_block, eof_block; unsigned int ee_len, depth, map_len = map->m_len; int err = 0; unsigned int max_zeroout = 0; ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)map->m_lblk, map_len); sbi = EXT4_SB(inode->i_sb); eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; if (eof_block < map->m_lblk + map_len) eof_block = map->m_lblk + map_len; depth = ext_depth(inode); eh = path[depth].p_hdr; ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); zero_ex1.ee_len = 0; zero_ex2.ee_len = 0; trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); /* Pre-conditions */ BUG_ON(!ext4_ext_is_unwritten(ex)); BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); /* * Attempt to transfer newly initialized blocks from the currently * unwritten extent to its neighbor. This is much cheaper * than an insertion followed by a merge as those involve costly * memmove() calls. Transferring to the left is the common case in * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE) * followed by append writes. 
* * Limitations of the current logic: * - L1: we do not deal with writes covering the whole extent. * This would require removing the extent if the transfer * is possible. * - L2: we only attempt to merge with an extent stored in the * same extent tree node. */ *allocated = 0; if ((map->m_lblk == ee_block) && /* See if we can merge left */ (map_len < ee_len) && /*L1*/ (ex > EXT_FIRST_EXTENT(eh))) { /*L2*/ ext4_lblk_t prev_lblk; ext4_fsblk_t prev_pblk, ee_pblk; unsigned int prev_len; abut_ex = ex - 1; prev_lblk = le32_to_cpu(abut_ex->ee_block); prev_len = ext4_ext_get_actual_len(abut_ex); prev_pblk = ext4_ext_pblock(abut_ex); ee_pblk = ext4_ext_pblock(ex); /* * A transfer of blocks from 'ex' to 'abut_ex' is allowed * upon those conditions: * - C1: abut_ex is initialized, * - C2: abut_ex is logically abutting ex, * - C3: abut_ex is physically abutting ex, * - C4: abut_ex can receive the additional blocks without * overflowing the (initialized) length limit. */ if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ ((prev_lblk + prev_len) == ee_block) && /*C2*/ ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto errout; trace_ext4_ext_convert_to_initialized_fastpath(inode, map, ex, abut_ex); /* Shift the start of ex by 'map_len' blocks */ ex->ee_block = cpu_to_le32(ee_block + map_len); ext4_ext_store_pblock(ex, ee_pblk + map_len); ex->ee_len = cpu_to_le16(ee_len - map_len); ext4_ext_mark_unwritten(ex); /* Restore the flag */ /* Extend abut_ex by 'map_len' blocks */ abut_ex->ee_len = cpu_to_le16(prev_len + map_len); /* Result: number of initialized blocks past m_lblk */ *allocated = map_len; } } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) && (map_len < ee_len) && /*L1*/ ex < EXT_LAST_EXTENT(eh)) { /*L2*/ /* See if we can merge right */ ext4_lblk_t next_lblk; ext4_fsblk_t next_pblk, ee_pblk; unsigned int next_len; abut_ex = ex + 1; next_lblk = le32_to_cpu(abut_ex->ee_block); next_len = ext4_ext_get_actual_len(abut_ex); next_pblk = ext4_ext_pblock(abut_ex); ee_pblk = ext4_ext_pblock(ex); /* * A transfer of blocks from 'ex' to 'abut_ex' is allowed * upon those conditions: * - C1: abut_ex is initialized, * - C2: abut_ex is logically abutting ex, * - C3: abut_ex is physically abutting ex, * - C4: abut_ex can receive the additional blocks without * overflowing the (initialized) length limit. 
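 *
 * A worked example with hypothetical numbers: ex is unwritten and covers
 * 100..119 at physical 5000..5019, abut_ex is an initialized extent
 * 120..139 at physical 5020..5039, and the write maps 115..119. C1..C4
 * all hold, so abut_ex grows to 115..139 (physical 5015..5039), ex shrinks
 * to the still-unwritten 100..114, and *allocated becomes 5 without any
 * extent insertion or memmove().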
*/ if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ ((map->m_lblk + map_len) == next_lblk) && /*C2*/ ((ee_pblk + ee_len) == next_pblk) && /*C3*/ (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto errout; trace_ext4_ext_convert_to_initialized_fastpath(inode, map, ex, abut_ex); /* Shift the start of abut_ex by 'map_len' blocks */ abut_ex->ee_block = cpu_to_le32(next_lblk - map_len); ext4_ext_store_pblock(abut_ex, next_pblk - map_len); ex->ee_len = cpu_to_le16(ee_len - map_len); ext4_ext_mark_unwritten(ex); /* Restore the flag */ /* Extend abut_ex by 'map_len' blocks */ abut_ex->ee_len = cpu_to_le16(next_len + map_len); /* Result: number of initialized blocks past m_lblk */ *allocated = map_len; } } if (*allocated) { /* Mark the block containing both extents as dirty */ err = ext4_ext_dirty(handle, inode, path + depth); /* Update path to point to the right extent */ path[depth].p_ext = abut_ex; if (err) goto errout; goto out; } else *allocated = ee_len - (map->m_lblk - ee_block); WARN_ON(map->m_lblk < ee_block); /* * It is safe to convert extent to initialized via explicit * zeroout only if extent is fully inside i_size or new_size. */ if (ee_block + ee_len <= eof_block) max_zeroout = sbi->s_extent_max_zeroout_kb >> (inode->i_sb->s_blocksize_bits - 10); /* * five cases: * 1. split the extent into three extents. * 2. split the extent into two extents, zeroout the head of the first * extent. * 3. split the extent into two extents, zeroout the tail of the second * extent. * 4. split the extent into two extents with out zeroout. * 5. no splitting needed, just possibly zeroout the head and / or the * tail of the extent. */ split_map.m_lblk = map->m_lblk; split_map.m_len = map->m_len; if (max_zeroout && (*allocated > split_map.m_len)) { if (*allocated <= max_zeroout) { /* case 3 or 5 */ zero_ex1.ee_block = cpu_to_le32(split_map.m_lblk + split_map.m_len); zero_ex1.ee_len = cpu_to_le16(*allocated - split_map.m_len); ext4_ext_store_pblock(&zero_ex1, ext4_ext_pblock(ex) + split_map.m_lblk + split_map.m_len - ee_block); err = ext4_ext_zeroout(inode, &zero_ex1); if (err) goto fallback; split_map.m_len = *allocated; } if (split_map.m_lblk - ee_block + split_map.m_len < max_zeroout) { /* case 2 or 5 */ if (split_map.m_lblk != ee_block) { zero_ex2.ee_block = ex->ee_block; zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk - ee_block); ext4_ext_store_pblock(&zero_ex2, ext4_ext_pblock(ex)); err = ext4_ext_zeroout(inode, &zero_ex2); if (err) goto fallback; } split_map.m_len += split_map.m_lblk - ee_block; split_map.m_lblk = ee_block; *allocated = map->m_len; } } fallback: path = ext4_split_convert_extents(handle, inode, &split_map, path, flags | EXT4_GET_BLOCKS_CONVERT, NULL); if (IS_ERR(path)) return path; out: /* If we have gotten a failure, don't zero out status tree */ ext4_zeroout_es(inode, &zero_ex1); ext4_zeroout_es(inode, &zero_ex2); return path; errout: ext4_free_ext_path(path); return ERR_PTR(err); } /* * This function is called by ext4_ext_map_blocks() from * ext4_get_blocks_dio_write() when DIO to write * to an unwritten extent. 
* * Writing to an unwritten extent may result in splitting the unwritten * extent into multiple initialized/unwritten extents (up to three) * There are three possibilities: * a> There is no split required: Entire extent should be unwritten * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * * This works the same way in the case of initialized -> unwritten conversion. * * One of more index blocks maybe needed if the extent tree grow after * the unwritten extent split. To prevent ENOSPC occur at the IO * complete, we need to split the unwritten extent before DIO submit * the IO. The unwritten extent called at this time will be split * into three unwritten extent(at most). After IO complete, the part * being filled will be convert to initialized by the end_io callback function * via ext4_convert_unwritten_extents(). * * The size of unwritten extent to be written is passed to the caller via the * allocated pointer. Return an extent path pointer on success, or an error * pointer on failure. */ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path, int flags, unsigned int *allocated) { ext4_lblk_t eof_block; ext4_lblk_t ee_block; struct ext4_extent *ex; unsigned int ee_len; int split_flag = 0, depth, err = 0; bool did_zeroout = false; ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)map->m_lblk, map->m_len); eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; if (eof_block < map->m_lblk + map->m_len) eof_block = map->m_lblk + map->m_len; depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); /* No split needed */ if (ee_block == map->m_lblk && ee_len == map->m_len) goto convert; /* * It is only safe to convert extent to initialized via explicit * zeroout only if extent is fully inside i_size or new_size. */ split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; /* * pass SPLIT_NOMERGE explicitly so we don't end up merging extents we * just split. */ path = ext4_split_extent(handle, inode, path, map, split_flag, flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE, allocated, &did_zeroout); if (IS_ERR(path)) return path; convert: path = ext4_find_extent(inode, map->m_lblk, path, flags); if (IS_ERR(path)) return path; depth = ext_depth(inode); ex = path[depth].p_ext; /* * Conversion is already handled in case of zeroout */ if (!did_zeroout) { err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto err; if (flags & EXT4_GET_BLOCKS_CONVERT) ext4_ext_mark_initialized(ex); else if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) ext4_ext_mark_unwritten(ex); if (!(flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE)) /* * note: ext4_ext_correct_indexes() isn't needed here because * borders are not changed */ ext4_ext_try_to_merge(handle, inode, path, ex); err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto err; } /* Lets update the extent status tree after conversion */ if (!(flags & EXT4_EX_NOCACHE)) ext4_es_insert_extent(inode, le32_to_cpu(ex->ee_block), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex), ext4_ext_is_unwritten(ex) ? 
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN, false); err: if (err) { ext4_free_ext_path(path); return ERR_PTR(err); } return path; } static struct ext4_ext_path * ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path, int flags) { struct ext4_extent *ex; ext4_lblk_t ee_block; unsigned int ee_len; int depth; depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)ee_block, ee_len); return ext4_split_convert_extents(handle, inode, map, path, flags, NULL); } static struct ext4_ext_path * convert_initialized_extent(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path, int flags, unsigned int *allocated) { struct ext4_extent *ex; ext4_lblk_t ee_block; unsigned int ee_len; int depth; /* * Make sure that the extent is no bigger than we support with * unwritten extent */ if (map->m_len > EXT_UNWRITTEN_MAX_LEN) map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)ee_block, ee_len); path = ext4_split_convert_extents(handle, inode, map, path, flags, NULL); if (IS_ERR(path)) return path; ext4_ext_show_leaf(inode, path); ext4_update_inode_fsync_trans(handle, inode, 1); /* * The extent might be initialized in case of zeroout. */ path = ext4_find_extent(inode, map->m_lblk, path, flags); if (IS_ERR(path)) return path; depth = ext_depth(inode); ex = path[depth].p_ext; if (ext4_ext_is_unwritten(ex)) map->m_flags |= EXT4_MAP_UNWRITTEN; else map->m_flags |= EXT4_MAP_MAPPED; if (*allocated > map->m_len) *allocated = map->m_len; map->m_len = *allocated; return path; } static struct ext4_ext_path * ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path, int flags, unsigned int *allocated, ext4_fsblk_t newblock) { int err = 0; ext_debug(inode, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n", (unsigned long long)map->m_lblk, map->m_len, flags, *allocated); ext4_ext_show_leaf(inode, path); /* * When writing into unwritten space, we should not fail to * allocate metadata blocks for the new extent block if needed. */ flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; trace_ext4_ext_handle_unwritten_extents(inode, map, flags, *allocated, newblock); /* IO end_io complete, convert the filled extent to written */ if (flags & EXT4_GET_BLOCKS_CONVERT) { path = ext4_convert_unwritten_extents_endio(handle, inode, map, path, flags); if (IS_ERR(path)) return path; ext4_update_inode_fsync_trans(handle, inode, 1); goto map_out; } /* buffered IO cases */ /* * repeat fallocate creation request * we already have an unwritten extent */ if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { map->m_flags |= EXT4_MAP_UNWRITTEN; goto map_out; } /* buffered READ or buffered write_begin() lookup */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* * We have blocks reserved already. We * return allocated blocks so that delalloc * won't do block reservation for us. But * the buffer head will be unmapped so that * a read from the block returns 0s. */ map->m_flags |= EXT4_MAP_UNWRITTEN; goto out1; } /* * Default case when (flags & EXT4_GET_BLOCKS_CREATE) == 1. * For buffered writes, at writepage time, etc. 
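 * (Side note, added for clarity: EXT4_GET_BLOCKS_METADATA_NOFAIL was OR-ed
 * into the flags above, so the conversion below may dip into reserved space
 * for any extent tree blocks it has to allocate.)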
Convert a * discovered unwritten extent to written. */ path = ext4_ext_convert_to_initialized(handle, inode, map, path, flags, allocated); if (IS_ERR(path)) return path; ext4_update_inode_fsync_trans(handle, inode, 1); /* * shouldn't get a 0 allocated when converting an unwritten extent * unless m_len is 0 (bug) or extent has been corrupted */ if (unlikely(*allocated == 0)) { EXT4_ERROR_INODE(inode, "unexpected allocated == 0, m_len = %u", map->m_len); err = -EFSCORRUPTED; goto errout; } map->m_flags |= EXT4_MAP_NEW; map_out: map->m_flags |= EXT4_MAP_MAPPED; out1: map->m_pblk = newblock; if (*allocated > map->m_len) *allocated = map->m_len; map->m_len = *allocated; ext4_ext_show_leaf(inode, path); return path; errout: ext4_free_ext_path(path); return ERR_PTR(err); } /* * get_implied_cluster_alloc - check to see if the requested * allocation (in the map structure) overlaps with a cluster already * allocated in an extent. * @sb The filesystem superblock structure * @map The requested lblk->pblk mapping * @ex The extent structure which might contain an implied * cluster allocation * * This function is called by ext4_ext_map_blocks() after we failed to * find blocks that were already in the inode's extent tree. Hence, * we know that the beginning of the requested region cannot overlap * the extent from the inode's extent tree. There are three cases we * want to catch. The first is this case: * * |--- cluster # N--| * |--- extent ---| |---- requested region ---| * |==========| * * The second case that we need to test for is this one: * * |--------- cluster # N ----------------| * |--- requested region --| |------- extent ----| * |=======================| * * The third case is when the requested region lies between two extents * within the same cluster: * |------------- cluster # N-------------| * |----- ex -----| |---- ex_right ----| * |------ requested region ------| * |================| * * In each of the above cases, we need to set the map->m_pblk and * map->m_len so it corresponds to the return the extent labelled as * "|====|" from cluster #N, since it is already in use for data in * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to * signal to ext4_ext_map_blocks() that map->m_pblk should be treated * as a new "allocated" block region. Otherwise, we will return 0 and * ext4_ext_map_blocks() will then allocate one or more new clusters * by calling ext4_mb_new_blocks(). 
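 * Worked example (illustrative figures, assuming bigalloc with a cluster
 * ratio of 16 blocks per cluster): suppose an extent ends at logical block
 * 70 (cluster 4 covers blocks 64..79) and the requested region starts at
 * block 73. The request then shares cluster 4 with the extent's tail, so
 * m_pblk is taken from the physical cluster already backing block 70 plus
 * the in-cluster offset 73 % 16 = 9, and m_len is clamped so the implied
 * mapping stays inside that cluster and before any extent to its right.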
*/ static int get_implied_cluster_alloc(struct super_block *sb, struct ext4_map_blocks *map, struct ext4_extent *ex, struct ext4_ext_path *path) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); ext4_lblk_t ex_cluster_start, ex_cluster_end; ext4_lblk_t rr_cluster_start; ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); ext4_fsblk_t ee_start = ext4_ext_pblock(ex); unsigned short ee_len = ext4_ext_get_actual_len(ex); /* The extent passed in that we are trying to match */ ex_cluster_start = EXT4_B2C(sbi, ee_block); ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1); /* The requested region passed into ext4_map_blocks() */ rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); if ((rr_cluster_start == ex_cluster_end) || (rr_cluster_start == ex_cluster_start)) { if (rr_cluster_start == ex_cluster_end) ee_start += ee_len - 1; map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset; map->m_len = min(map->m_len, (unsigned) sbi->s_cluster_ratio - c_offset); /* * Check for and handle this case: * * |--------- cluster # N-------------| * |------- extent ----| * |--- requested region ---| * |===========| */ if (map->m_lblk < ee_block) map->m_len = min(map->m_len, ee_block - map->m_lblk); /* * Check for the case where there is already another allocated * block to the right of 'ex' but before the end of the cluster. * * |------------- cluster # N-------------| * |----- ex -----| |---- ex_right ----| * |------ requested region ------| * |================| */ if (map->m_lblk > ee_block) { ext4_lblk_t next = ext4_ext_next_allocated_block(path); map->m_len = min(map->m_len, next - map->m_lblk); } trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1); return 1; } trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0); return 0; } /* * Determine hole length around the given logical block, first try to * locate and expand the hole from the given @path, and then adjust it * if it's partially or completely converted to delayed extents, insert * it into the extent cache tree if it's indeed a hole, finally return * the length of the determined extent. */ static ext4_lblk_t ext4_ext_determine_insert_hole(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t lblk) { ext4_lblk_t hole_start, len; struct extent_status es; hole_start = lblk; len = ext4_ext_find_hole(inode, path, &hole_start); again: ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start, hole_start + len - 1, &es); if (!es.es_len) goto insert_hole; /* * There's a delalloc extent in the hole, handle it if the delalloc * extent is in front of, behind and straddle the queried range. */ if (lblk >= es.es_lblk + es.es_len) { /* * The delalloc extent is in front of the queried range, * find again from the queried start block. */ len -= lblk - hole_start; hole_start = lblk; goto again; } else if (in_range(lblk, es.es_lblk, es.es_len)) { /* * The delalloc extent containing lblk, it must have been * added after ext4_map_blocks() checked the extent status * tree so we are not holding i_rwsem and delalloc info is * only stabilized by i_data_sem we are going to release * soon. Don't modify the extent status tree and report * extent as a hole, just adjust the length to the delalloc * extent's after lblk. */ len = es.es_lblk + es.es_len - lblk; return len; } else { /* * The delalloc extent is partially or completely behind * the queried range, update hole length until the * beginning of the delalloc extent. 
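 * (Clarification, added: in this branch es.es_lblk is strictly greater than
 * lblk, so the hole length is trimmed below so that it does not extend into
 * the delalloc extent.)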
*/ len = min(es.es_lblk - hole_start, len); } insert_hole: /* Put just found gap into cache to speed up subsequent requests */ ext_debug(inode, " -> %u:%u\n", hole_start, len); ext4_es_cache_extent(inode, hole_start, len, ~0, EXTENT_STATUS_HOLE); /* Update hole_len to reflect hole size after lblk */ if (hole_start != lblk) len -= lblk - hole_start; return len; } /* * Block allocation/map/preallocation routine for extents based files * * * Need to be called with * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block * (ie, flags is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) * * return > 0, number of blocks already mapped/allocated * if flags doesn't contain EXT4_GET_BLOCKS_CREATE and these are pre-allocated blocks * buffer head is unmapped * otherwise blocks are mapped * * return = 0, if plain look up failed (blocks have not been allocated) * buffer head is unmapped * * return < 0, error case. */ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct ext4_ext_path *path = NULL; struct ext4_extent newex, *ex, ex2; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_fsblk_t newblock = 0, pblk; int err = 0, depth; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; ext4_lblk_t cluster_offset; ext_debug(inode, "blocks %u/%u requested\n", map->m_lblk, map->m_len); trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* find extent for this block */ path = ext4_find_extent(inode, map->m_lblk, NULL, flags); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; } depth = ext_depth(inode); /* * consistent leaf must not be empty; * this situation is possible, though, _during_ tree modification; * this is why assert can't be put in ext4_find_extent() */ if (unlikely(path[depth].p_ext == NULL && depth != 0)) { EXT4_ERROR_INODE(inode, "bad extent address " "lblock: %lu, depth: %d pblock %lld", (unsigned long) map->m_lblk, depth, path[depth].p_block); err = -EFSCORRUPTED; goto out; } ex = path[depth].p_ext; if (ex) { ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); ext4_fsblk_t ee_start = ext4_ext_pblock(ex); unsigned short ee_len; /* * unwritten extents are treated as holes, except that * we split out initialized portions during a write. */ ee_len = ext4_ext_get_actual_len(ex); trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len); /* if found extent covers block, simply return it */ if (in_range(map->m_lblk, ee_block, ee_len)) { newblock = map->m_lblk - ee_block + ee_start; /* number of remaining blocks in the extent */ allocated = ee_len - (map->m_lblk - ee_block); ext_debug(inode, "%u fit into %u:%d -> %llu\n", map->m_lblk, ee_block, ee_len, newblock); /* * If the extent is initialized check whether the * caller wants to convert it to unwritten. 
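 * (Added note: EXT4_GET_BLOCKS_CONVERT_UNWRITTEN is passed, for instance,
 * by the zero-range path further down in this file; written extents that do
 * not ask for conversion are simply returned as a plain mapping below.)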
*/ if ((!ext4_ext_is_unwritten(ex)) && (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { path = convert_initialized_extent(handle, inode, map, path, flags, &allocated); if (IS_ERR(path)) err = PTR_ERR(path); goto out; } else if (!ext4_ext_is_unwritten(ex)) { map->m_flags |= EXT4_MAP_MAPPED; map->m_pblk = newblock; if (allocated > map->m_len) allocated = map->m_len; map->m_len = allocated; ext4_ext_show_leaf(inode, path); goto out; } path = ext4_ext_handle_unwritten_extents( handle, inode, map, path, flags, &allocated, newblock); if (IS_ERR(path)) err = PTR_ERR(path); goto out; } } /* * requested block isn't allocated yet; * we couldn't try to create block if flags doesn't contain EXT4_GET_BLOCKS_CREATE */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { ext4_lblk_t len; len = ext4_ext_determine_insert_hole(inode, path, map->m_lblk); map->m_pblk = 0; map->m_len = min_t(unsigned int, map->m_len, len); goto out; } /* * Okay, we need to do block allocation. */ newex.ee_block = cpu_to_le32(map->m_lblk); cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); /* * If we are doing bigalloc, check to see if the extent returned * by ext4_find_extent() implies a cluster we can use. */ if (cluster_offset && ex && get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; goto got_allocated_blocks; } /* find neighbour allocated blocks */ ar.lleft = map->m_lblk; err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); if (err) goto out; ar.lright = map->m_lblk; err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2, flags); if (err < 0) goto out; /* Check if the extent after searching to the right implies a * cluster we can use. */ if ((sbi->s_cluster_ratio > 1) && err && get_implied_cluster_alloc(inode->i_sb, map, &ex2, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; err = 0; goto got_allocated_blocks; } /* * See if request is beyond maximum number of blocks we can have in * a single extent. For an initialized extent this limit is * EXT_INIT_MAX_LEN and for an unwritten extent this limit is * EXT_UNWRITTEN_MAX_LEN. */ if (map->m_len > EXT_INIT_MAX_LEN && !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) map->m_len = EXT_INIT_MAX_LEN; else if (map->m_len > EXT_UNWRITTEN_MAX_LEN && (flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) map->m_len = EXT_UNWRITTEN_MAX_LEN; /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ newex.ee_len = cpu_to_le16(map->m_len); err = ext4_ext_check_overlap(sbi, inode, &newex, path); if (err) allocated = ext4_ext_get_actual_len(&newex); else allocated = map->m_len; /* allocate new block */ ar.inode = inode; ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); ar.logical = map->m_lblk; /* * We calculate the offset from the beginning of the cluster * for the logical block number, since when we allocate a * physical cluster, the physical block should start at the * same offset from the beginning of the cluster. This is * needed so that future calls to get_implied_cluster_alloc() * work correctly. 
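 * Illustrative example (numbers assumed, cluster ratio of 16): for a
 * request at logical block 35 the in-cluster offset is 35 % 16 = 3, so
 * ar.goal and ar.logical are pulled back by 3 blocks to the cluster
 * boundary at block 32 and ar.len is rounded up to whole clusters; the
 * physical block reported back to the caller is then newblock + 3 (see
 * 'pblk' at got_allocated_blocks below).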
*/ offset = EXT4_LBLK_COFF(sbi, map->m_lblk); ar.len = EXT4_NUM_B2C(sbi, offset+allocated); ar.goal -= offset; ar.logical -= offset; if (S_ISREG(inode->i_mode)) ar.flags = EXT4_MB_HINT_DATA; else /* disable in-core preallocation for non-regular files */ ar.flags = 0; if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) ar.flags |= EXT4_MB_HINT_NOPREALLOC; if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ar.flags |= EXT4_MB_DELALLOC_RESERVED; if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) ar.flags |= EXT4_MB_USE_RESERVED; newblock = ext4_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out; allocated_clusters = ar.len; ar.len = EXT4_C2B(sbi, ar.len) - offset; ext_debug(inode, "allocate new block: goal %llu, found %llu/%u, requested %u\n", ar.goal, newblock, ar.len, allocated); if (ar.len > allocated) ar.len = allocated; got_allocated_blocks: /* try to insert new extent into found leaf and return */ pblk = newblock + offset; ext4_ext_store_pblock(&newex, pblk); newex.ee_len = cpu_to_le16(ar.len); /* Mark unwritten */ if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { ext4_ext_mark_unwritten(&newex); map->m_flags |= EXT4_MAP_UNWRITTEN; } path = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (IS_ERR(path)) { err = PTR_ERR(path); /* * Gracefully handle out of space conditions. If the filesystem * is inconsistent, we'll just leak allocated blocks to avoid * causing even more damage. */ if (allocated_clusters && (err == -EDQUOT || err == -ENOSPC)) { int fb_flags = 0; /* * free data blocks we just allocated. * not a good idea to call discard here directly, * but otherwise we'd need to call it every free(). */ ext4_discard_preallocations(inode); if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE; ext4_free_blocks(handle, inode, NULL, newblock, EXT4_C2B(sbi, allocated_clusters), fb_flags); } goto out; } /* * Cache the extent and update transaction to commit on fdatasync only * when it is _not_ an unwritten extent. */ if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0) ext4_update_inode_fsync_trans(handle, inode, 1); else ext4_update_inode_fsync_trans(handle, inode, 0); map->m_flags |= (EXT4_MAP_NEW | EXT4_MAP_MAPPED); map->m_pblk = pblk; map->m_len = ar.len; allocated = map->m_len; ext4_ext_show_leaf(inode, path); out: /* * We never use EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF with CREATE flag. * So we know that the depth used here is correct, since there was no * block allocation done if EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF is set. * If tomorrow we start using this QUERY flag with CREATE, then we will * need to re-calculate the depth as it might have changed due to block * allocation. */ if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) { WARN_ON_ONCE(flags & EXT4_GET_BLOCKS_CREATE); if (!err && ex && (ex == EXT_LAST_EXTENT(path[depth].p_hdr))) map->m_flags |= EXT4_MAP_QUERY_LAST_IN_LEAF; } ext4_free_ext_path(path); trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated); return err ? err : allocated; } int ext4_ext_truncate(handle_t *handle, struct inode *inode) { struct super_block *sb = inode->i_sb; ext4_lblk_t last_block; int err = 0; /* * TODO: optimization is possible here. * Probably we need not scan at all, * because page truncation is enough. 
*/ /* we have to know where to truncate from in crash case */ EXT4_I(inode)->i_disksize = inode->i_size; err = ext4_mark_inode_dirty(handle, inode); if (err) return err; last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); retry_remove_space: err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); if (err == -ENOMEM) { memalloc_retry_wait(GFP_ATOMIC); goto retry_remove_space; } return err; } static int ext4_alloc_file_blocks(struct file *file, loff_t offset, loff_t len, loff_t new_size, int flags) { struct inode *inode = file_inode(file); handle_t *handle; int ret = 0, ret2 = 0, ret3 = 0; int retries = 0; int depth = 0; ext4_lblk_t len_lblk; struct ext4_map_blocks map; unsigned int credits; loff_t epos = 0, old_size = i_size_read(inode); unsigned int blkbits = inode->i_blkbits; bool alloc_zero = false; BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)); map.m_lblk = offset >> blkbits; map.m_len = len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits); /* * Don't normalize the request if it can fit in one extent so * that it doesn't get unnecessarily split into multiple * extents. */ if (len_lblk <= EXT_UNWRITTEN_MAX_LEN) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; /* * Do the actual write zero during a running journal transaction * costs a lot. First allocate an unwritten extent and then * convert it to written after zeroing it out. */ if (flags & EXT4_GET_BLOCKS_ZERO) { flags &= ~EXT4_GET_BLOCKS_ZERO; flags |= EXT4_GET_BLOCKS_UNWRIT_EXT; alloc_zero = true; } /* * credits to insert 1 extent into extent tree */ credits = ext4_chunk_trans_blocks(inode, len_lblk); depth = ext_depth(inode); /* Zero to the end of the block containing i_size */ if (new_size > old_size) { ret = ext4_block_zero_eof(inode, old_size, LLONG_MAX); if (ret) return ret; } retry: while (len_lblk) { /* * Recalculate credits when extent tree depth changes. */ if (depth != ext_depth(inode)) { credits = ext4_chunk_trans_blocks(inode, len_lblk); depth = ext_depth(inode); } handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); break; } ret = ext4_map_blocks(handle, inode, &map, flags); if (ret <= 0) { ext4_debug("inode #%llu: block %u: len %u: " "ext4_ext_map_blocks returned %d", inode->i_ino, map.m_lblk, map.m_len, ret); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); break; } ext4_update_inode_fsync_trans(handle, inode, 1); ret = ext4_journal_stop(handle); if (unlikely(ret)) break; /* * allow a full retry cycle for any remaining allocations */ retries = 0; if (alloc_zero && (map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN))) { ret = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk, map.m_len); if (likely(!ret)) ret = ext4_convert_unwritten_extents(NULL, inode, (loff_t)map.m_lblk << blkbits, (loff_t)map.m_len << blkbits); if (ret) break; } map.m_lblk += map.m_len; map.m_len = len_lblk = len_lblk - map.m_len; epos = EXT4_LBLK_TO_B(inode, map.m_lblk); } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; if (!epos || !new_size) return ret; /* * Allocate blocks, update the file size to match the size of the * already successfully allocated blocks. */ if (epos > new_size) epos = new_size; handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); if (IS_ERR(handle)) return ret ? 
ret : PTR_ERR(handle); ext4_update_inode_size(inode, epos); ret2 = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); ret3 = ext4_journal_stop(handle); ret2 = ret3 ? ret3 : ret2; if (epos > old_size) pagecache_isize_extended(inode, old_size, epos); return ret ? ret : ret2; } static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len); static int ext4_insert_range(struct file *file, loff_t offset, loff_t len); static long ext4_zero_range(struct file *file, loff_t offset, loff_t len, int mode) { struct inode *inode = file_inode(file); handle_t *handle = NULL; loff_t align_start, align_end, new_size = 0; loff_t end = offset + len; unsigned int blocksize = i_blocksize(inode); bool partial_zeroed = false; int ret, flags; trace_ext4_zero_range(inode, offset, len, mode); WARN_ON_ONCE(!inode_is_locked(inode)); /* Indirect files do not support unwritten extents */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; if (!(mode & FALLOC_FL_KEEP_SIZE) && (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) { new_size = end; ret = inode_newsize_ok(inode, new_size); if (ret) return ret; } flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; /* Preallocate the range including the unaligned edges */ if (!IS_ALIGNED(offset | end, blocksize)) { ret = ext4_alloc_file_blocks(file, offset, len, new_size, flags); if (ret) return ret; } ret = ext4_update_disksize_before_punch(inode, offset, len); if (ret) return ret; /* Now release the pages and zero block aligned part of pages */ ret = ext4_truncate_page_cache_block_range(inode, offset, end); if (ret) return ret; /* Zero range excluding the unaligned edges */ align_start = round_up(offset, blocksize); align_end = round_down(end, blocksize); if (align_end > align_start) { if (mode & FALLOC_FL_WRITE_ZEROES) flags = EXT4_GET_BLOCKS_CREATE_ZERO | EXT4_EX_NOCACHE; else flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE); ret = ext4_alloc_file_blocks(file, align_start, align_end - align_start, new_size, flags); if (ret) return ret; } /* Finish zeroing out if it doesn't contain partial block */ if (IS_ALIGNED(offset | end, blocksize)) return ret; /* Zero out partial block at the edges of the range */ ret = ext4_zero_partial_blocks(inode, offset, len, &partial_zeroed); if (ret) return ret; if (((file->f_flags & O_SYNC) || IS_SYNC(inode)) && partial_zeroed) { ret = filemap_write_and_wait_range(inode->i_mapping, offset, end - 1); if (ret) return ret; } handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(inode->i_sb, ret); return ret; } if (new_size) ext4_update_inode_size(inode, new_size); ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) goto out_handle; ext4_update_inode_fsync_trans(handle, inode, 1); if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) ext4_handle_sync(handle); out_handle: ext4_journal_stop(handle); return ret; } static long ext4_do_fallocate(struct file *file, loff_t offset, loff_t len, int mode) { struct inode *inode = file_inode(file); loff_t end = offset + len; loff_t new_size = 0; int ret; trace_ext4_fallocate_enter(inode, offset, len, mode); WARN_ON_ONCE(!inode_is_locked(inode)); /* We only support preallocation for extent-based files only. 
*/ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { ret = -EOPNOTSUPP; goto out; } if (!(mode & FALLOC_FL_KEEP_SIZE) && (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) { new_size = end; ret = inode_newsize_ok(inode, new_size); if (ret) goto out; } ret = ext4_alloc_file_blocks(file, offset, len, new_size, EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); if (ret) goto out; if (((file->f_flags & O_SYNC) || IS_SYNC(inode)) && EXT4_SB(inode->i_sb)->s_journal) { ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, EXT4_I(inode)->i_sync_tid); } out: trace_ext4_fallocate_exit(inode, offset, EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits), ret); return ret; } /* * preallocate space for a file. This implements ext4's fallocate file * operation, which gets called from sys_fallocate system call. * For block-mapped files, posix_fallocate should fall back to the method * of writing zeroes to the required new blocks (the same behavior which is * expected for file systems which do not support fallocate() system call). */ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct address_space *mapping = file->f_mapping; int ret; /* * Encrypted inodes can't handle collapse range or insert * range since we would need to re-encrypt blocks with a * different IV or XTS tweak (which are based on the logical * block number). */ if (IS_ENCRYPTED(inode) && (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; /* * Don't allow writing zeroes if the underlying device does not * enable the unmap write zeroes operation. */ if ((mode & FALLOC_FL_WRITE_ZEROES) && !bdev_write_zeroes_unmap_sectors(inode->i_sb->s_bdev)) return -EOPNOTSUPP; /* Return error if mode is not supported */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE | FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE | FALLOC_FL_WRITE_ZEROES)) return -EOPNOTSUPP; inode_lock(inode); ret = ext4_convert_inline_data(inode); if (ret) goto out_inode_lock; /* Wait all existing dio workers, newcomers will block on i_rwsem */ inode_dio_wait(inode); ret = file_modified(file); if (ret) goto out_inode_lock; if ((mode & FALLOC_FL_MODE_MASK) == FALLOC_FL_ALLOCATE_RANGE) { ret = ext4_do_fallocate(file, offset, len, mode); goto out_inode_lock; } /* * Follow-up operations will drop page cache, hold invalidate lock * to prevent page faults from reinstantiating pages we have * released from page cache. */ filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) goto out_invalidate_lock; switch (mode & FALLOC_FL_MODE_MASK) { case FALLOC_FL_PUNCH_HOLE: ret = ext4_punch_hole(file, offset, len); break; case FALLOC_FL_COLLAPSE_RANGE: ret = ext4_collapse_range(file, offset, len); break; case FALLOC_FL_INSERT_RANGE: ret = ext4_insert_range(file, offset, len); break; case FALLOC_FL_ZERO_RANGE: case FALLOC_FL_WRITE_ZEROES: ret = ext4_zero_range(file, offset, len, mode); break; default: ret = -EOPNOTSUPP; } out_invalidate_lock: filemap_invalidate_unlock(mapping); out_inode_lock: inode_unlock(inode); return ret; } /* * This function converts a range of blocks to written extents. The caller of * this function will pass the start offset and the size. all unwritten extents * within this range will be converted to written extents. * * This function is called from the direct IO end io call back function for * atomic writes, to convert the unwritten extents after IO is completed. 
* * Note that the requirement for atomic writes is that all conversion should * happen atomically in a single fs journal transaction. We mainly only allocate * unwritten extents either on a hole on a pre-exiting unwritten extent range in * ext4_map_blocks_atomic_write(). The only case where we can have multiple * unwritten extents in a range [offset, offset+len) is when there is a split * unwritten extent between two leaf nodes which was cached in extent status * cache during ext4_iomap_alloc() time. That will allow * ext4_map_blocks_atomic_write() to return the unwritten extent range w/o going * into the slow path. That means we might need a loop for conversion of this * unwritten extent split across leaf block within a single journal transaction. * Split extents across leaf nodes is a rare case, but let's still handle that * to meet the requirements of multi-fsblock atomic writes. * * Returns 0 on success. */ int ext4_convert_unwritten_extents_atomic(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len) { unsigned int max_blocks; int ret = 0, ret2 = 0, ret3 = 0; struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; unsigned int credits = 0; int flags = EXT4_GET_BLOCKS_IO_CONVERT_EXT | EXT4_EX_NOCACHE; map.m_lblk = offset >> blkbits; max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); if (!handle) { /* * TODO: An optimization can be added later by having an extent * status flag e.g. EXTENT_STATUS_SPLIT_LEAF. If we query that * it can tell if the extent in the cache is a split extent. * But for now let's assume pextents as 2 always. */ credits = ext4_meta_trans_blocks(inode, max_blocks, 2); } if (credits) { handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); return ret; } } while (ret >= 0 && ret < max_blocks) { map.m_lblk += ret; map.m_len = (max_blocks -= ret); ret = ext4_map_blocks(handle, inode, &map, flags); if (ret != max_blocks) ext4_msg(inode->i_sb, KERN_INFO, "inode #%llu: block %u: len %u: " "split block mapping found for atomic write, " "ret = %d", inode->i_ino, map.m_lblk, map.m_len, ret); if (ret <= 0) break; } ret2 = ext4_mark_inode_dirty(handle, inode); if (credits) { ret3 = ext4_journal_stop(handle); if (unlikely(ret3)) ret2 = ret3; } if (ret <= 0 || ret2) ext4_warning(inode->i_sb, "inode #%llu: block %u: len %u: " "returned %d or %d", inode->i_ino, map.m_lblk, map.m_len, ret, ret2); return ret > 0 ? ret2 : ret; } /* * This function convert a range of blocks to written extents * The caller of this function will pass the start offset and the size. * all unwritten extents within this range will be converted to * written extents. * * This function is called from the direct IO end io call back * function, to convert the fallocated extents after IO is completed. * Returns 0 on success. 
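 * (Added clarification: unlike the _atomic variant above, which keeps one
 * journal handle across the whole range, this helper may start and stop a
 * separate handle for each mapped chunk when no handle is supplied, since
 * ordinary end-io conversion does not have to be a single transaction.)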
*/ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len) { unsigned int max_blocks; int ret = 0, ret2 = 0, ret3 = 0; struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; unsigned int credits = 0; map.m_lblk = offset >> blkbits; max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); if (!handle) { /* * credits to insert 1 extent into extent tree */ credits = ext4_chunk_trans_blocks(inode, max_blocks); } while (ret >= 0 && ret < max_blocks) { map.m_lblk += ret; map.m_len = (max_blocks -= ret); if (credits) { handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); break; } } /* * Do not cache any unrelated extents, as it does not hold the * i_rwsem or invalidate_lock, which could corrupt the extent * status tree. */ ret = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_IO_CONVERT_EXT | EXT4_EX_NOCACHE); if (ret <= 0) ext4_warning(inode->i_sb, "inode #%llu: block %u: len %u: " "ext4_ext_map_blocks returned %d", inode->i_ino, map.m_lblk, map.m_len, ret); ret2 = ext4_mark_inode_dirty(handle, inode); if (credits) { ret3 = ext4_journal_stop(handle); if (unlikely(ret3)) ret2 = ret3; } if (ret <= 0 || ret2) break; } return ret > 0 ? ret2 : ret; } int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end) { int ret = 0, err = 0; struct ext4_io_end_vec *io_end_vec; /* * This is somewhat ugly but the idea is clear: When transaction is * reserved, everything goes into it. Otherwise we rather start several * smaller transactions for conversion of each extent separately. */ if (handle) { handle = ext4_journal_start_reserved(handle, EXT4_HT_EXT_CONVERT); if (IS_ERR(handle)) return PTR_ERR(handle); } list_for_each_entry(io_end_vec, &io_end->list_vec, list) { ret = ext4_convert_unwritten_extents(handle, io_end->inode, io_end_vec->offset, io_end_vec->size); if (ret) break; } if (handle) err = ext4_journal_stop(handle); return ret < 0 ? ret : err; } static int ext4_iomap_xattr_fiemap(struct inode *inode, struct iomap *iomap) { __u64 physical = 0; __u64 length = 0; int blockbits = inode->i_sb->s_blocksize_bits; int error = 0; u16 iomap_type; /* in-inode? 
*/ if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { struct ext4_iloc iloc; int offset; /* offset of xattr in inode */ error = ext4_get_inode_loc(inode, &iloc); if (error) return error; physical = (__u64)iloc.bh->b_blocknr << blockbits; offset = EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize; physical += offset; length = EXT4_SB(inode->i_sb)->s_inode_size - offset; brelse(iloc.bh); iomap_type = IOMAP_INLINE; } else if (EXT4_I(inode)->i_file_acl) { /* external block */ physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits; length = inode->i_sb->s_blocksize; iomap_type = IOMAP_MAPPED; } else { /* no in-inode or external block for xattr, so return -ENOENT */ error = -ENOENT; goto out; } iomap->addr = physical; iomap->offset = 0; iomap->length = length; iomap->type = iomap_type; iomap->flags = 0; out: return error; } static int ext4_iomap_xattr_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { int error; error = ext4_iomap_xattr_fiemap(inode, iomap); if (error == 0 && (offset >= iomap->length)) error = -ENOENT; return error; } static const struct iomap_ops ext4_iomap_xattr_ops = { .iomap_begin = ext4_iomap_xattr_begin, }; static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len) { u64 maxbytes = ext4_get_maxbytes(inode); if (*len == 0) return -EINVAL; if (start > maxbytes) return -EFBIG; /* * Shrink request scope to what the fs can actually handle. */ if (*len > maxbytes || (maxbytes - *len) < start) *len = maxbytes - start; return 0; } int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { int error = 0; inode_lock_shared(inode); if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { error = ext4_ext_precache(inode); if (error) goto unlock; fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } /* * For bitmap files the maximum size limit could be smaller than * s_maxbytes, so check len here manually instead of just relying on the * generic check. */ error = ext4_fiemap_check_ranges(inode, start, &len); if (error) goto unlock; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR; error = iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_xattr_ops); } else { error = iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops); } unlock: inode_unlock_shared(inode); return error; } int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { ext4_lblk_t start_blk, len_blks; __u64 last_blk; int error = 0; if (ext4_has_inline_data(inode)) { int has_inline; down_read(&EXT4_I(inode)->xattr_sem); has_inline = ext4_has_inline_data(inode); up_read(&EXT4_I(inode)->xattr_sem); if (has_inline) return 0; } if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { inode_lock_shared(inode); error = ext4_ext_precache(inode); inode_unlock_shared(inode); if (error) return error; fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } error = fiemap_prep(inode, fieinfo, start, &len, 0); if (error) return error; error = ext4_fiemap_check_ranges(inode, start, &len); if (error) return error; start_blk = start >> inode->i_sb->s_blocksize_bits; last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; if (last_blk >= EXT_MAX_BLOCKS) last_blk = EXT_MAX_BLOCKS-1; len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; /* * Walk the extent tree gathering extent information * and pushing extents back to the user. 
*/ return ext4_fill_es_cache_info(inode, start_blk, len_blks, fieinfo); } /* * ext4_ext_shift_path_extents: * Shift the extents of a path structure lying between path[depth].p_ext * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells * if it is right shift or left shift operation. */ static int ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, struct inode *inode, handle_t *handle, enum SHIFT_DIRECTION SHIFT) { int depth, err = 0; struct ext4_extent *ex_start, *ex_last; bool update = false; int credits, restart_credits; depth = path->p_depth; while (depth >= 0) { if (depth == path->p_depth) { ex_start = path[depth].p_ext; if (!ex_start) return -EFSCORRUPTED; ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); /* leaf + sb + inode */ credits = 3; if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) { update = true; /* extent tree + sb + inode */ credits = depth + 2; } restart_credits = ext4_chunk_trans_extent(inode, 0); err = ext4_datasem_ensure_credits(handle, inode, credits, restart_credits, 0); if (err) { if (err > 0) err = -EAGAIN; goto out; } err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; while (ex_start <= ex_last) { if (SHIFT == SHIFT_LEFT) { le32_add_cpu(&ex_start->ee_block, -shift); /* Try to merge to the left. */ if ((ex_start > EXT_FIRST_EXTENT(path[depth].p_hdr)) && ext4_ext_try_to_merge_right(inode, path, ex_start - 1)) ex_last--; else ex_start++; } else { le32_add_cpu(&ex_last->ee_block, shift); ext4_ext_try_to_merge_right(inode, path, ex_last); ex_last--; } } err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; if (--depth < 0 || !update) break; } /* Update index too */ err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; if (SHIFT == SHIFT_LEFT) le32_add_cpu(&path[depth].p_idx->ei_block, -shift); else le32_add_cpu(&path[depth].p_idx->ei_block, shift); err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; /* we are done if current index is not a starting index */ if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr)) break; depth--; } out: return err; } /* * ext4_ext_shift_extents: * All the extents which lies in the range from @start to the last allocated * block for the @inode are shifted either towards left or right (depending * upon @SHIFT) by @shift blocks. * On success, 0 is returned, error otherwise. */ static int ext4_ext_shift_extents(struct inode *inode, handle_t *handle, ext4_lblk_t start, ext4_lblk_t shift, enum SHIFT_DIRECTION SHIFT) { struct ext4_ext_path *path; int ret = 0, depth; struct ext4_extent *extent; ext4_lblk_t stop, *iterator, ex_start, ex_end; ext4_lblk_t tmp = EXT_MAX_BLOCKS; /* Let path point to the last extent */ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; if (!extent) goto out; stop = le32_to_cpu(extent->ee_block); /* * For left shifts, make sure the hole on the left is big enough to * accommodate the shift. For right shifts, make sure the last extent * won't be shifted beyond EXT_MAX_BLOCKS. 
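 * Example (illustrative numbers): if the extent to the left of @start ends
 * just before block 950 and @start is 1000, only a 50 block hole precedes
 * the range, so a left shift by 100 blocks is rejected with -EINVAL; a
 * right shift is likewise rejected if it would push the last extent past
 * EXT_MAX_BLOCKS.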
*/ if (SHIFT == SHIFT_LEFT) { path = ext4_find_extent(inode, start - 1, path, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; if (extent) { ex_start = le32_to_cpu(extent->ee_block); ex_end = le32_to_cpu(extent->ee_block) + ext4_ext_get_actual_len(extent); } else { ex_start = 0; ex_end = 0; } if ((start == ex_start && shift > ex_start) || (shift > start - ex_end)) { ret = -EINVAL; goto out; } } else { if (shift > EXT_MAX_BLOCKS - (stop + ext4_ext_get_actual_len(extent))) { ret = -EINVAL; goto out; } } /* * In case of left shift, iterator points to start and it is increased * till we reach stop. In case of right shift, iterator points to stop * and it is decreased till we reach start. */ again: ret = 0; if (SHIFT == SHIFT_LEFT) iterator = &start; else iterator = &stop; if (tmp != EXT_MAX_BLOCKS) *iterator = tmp; /* * Its safe to start updating extents. Start and stop are unsigned, so * in case of right shift if extent with 0 block is reached, iterator * becomes NULL to indicate the end of the loop. */ while (iterator && start <= stop) { path = ext4_find_extent(inode, *iterator, path, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; if (!extent) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", (unsigned long) *iterator); ret = -EFSCORRUPTED; goto out; } if (SHIFT == SHIFT_LEFT && *iterator > le32_to_cpu(extent->ee_block)) { /* Hole, move to the next extent */ if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) { path[depth].p_ext++; } else { *iterator = ext4_ext_next_allocated_block(path); continue; } } tmp = *iterator; if (SHIFT == SHIFT_LEFT) { extent = EXT_LAST_EXTENT(path[depth].p_hdr); *iterator = le32_to_cpu(extent->ee_block) + ext4_ext_get_actual_len(extent); } else { extent = EXT_FIRST_EXTENT(path[depth].p_hdr); if (le32_to_cpu(extent->ee_block) > start) *iterator = le32_to_cpu(extent->ee_block) - 1; else if (le32_to_cpu(extent->ee_block) == start) iterator = NULL; else { extent = EXT_LAST_EXTENT(path[depth].p_hdr); while (le32_to_cpu(extent->ee_block) >= start) extent--; if (extent == EXT_LAST_EXTENT(path[depth].p_hdr)) break; extent++; iterator = NULL; } path[depth].p_ext = extent; } ret = ext4_ext_shift_path_extents(path, shift, inode, handle, SHIFT); /* iterator can be NULL which means we should break */ if (ret == -EAGAIN) goto again; if (ret) break; } out: ext4_free_ext_path(path); return ret; } /* * ext4_collapse_range: * This implements the fallocate's collapse range functionality for ext4 * Returns: 0 and non-zero on error. */ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; loff_t end = offset + len; ext4_lblk_t start_lblk, end_lblk; handle_t *handle; unsigned int credits; loff_t start, new_size; int ret; trace_ext4_collapse_range(inode, offset, len); WARN_ON_ONCE(!inode_is_locked(inode)); /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; /* Collapse range works only on fs cluster size aligned regions. 
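 * (Added note: both the start offset and the length are checked below, so
 * with, say, a 4k cluster size an 8k collapse at offset 12k is accepted
 * while a 6k collapse at any offset fails with -EINVAL.)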
*/ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; /* * There is no need to overlap collapse range with EOF, in which case * it is effectively a truncate operation */ if (end >= inode->i_size) return -EINVAL; /* * Write tail of the last page before removed range and data that * will be shifted since they will get removed from the page cache * below. We are also protected from pages becoming dirty by * i_rwsem and invalidate_lock. * Need to round down offset to be aligned with page size boundary * for page size > block size. */ start = round_down(offset, PAGE_SIZE); ret = filemap_write_and_wait_range(mapping, start, offset); if (!ret) ret = filemap_write_and_wait_range(mapping, end, LLONG_MAX); if (ret) return ret; truncate_pagecache(inode, start); credits = ext4_chunk_trans_extent(inode, 0); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); start_lblk = offset >> inode->i_blkbits; end_lblk = (offset + len) >> inode->i_blkbits; ext4_check_map_extents_env(inode); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk); ret = ext4_ext_remove_space(inode, start_lblk, end_lblk - 1); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_handle; } ext4_discard_preallocations(inode); ret = ext4_ext_shift_extents(inode, handle, end_lblk, end_lblk - start_lblk, SHIFT_LEFT); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_handle; } new_size = inode->i_size - len; i_size_write(inode, new_size); EXT4_I(inode)->i_disksize = new_size; up_write(&EXT4_I(inode)->i_data_sem); ret = ext4_mark_inode_dirty(handle, inode); if (ret) goto out_handle; ext4_update_inode_fsync_trans(handle, inode, 1); if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) ext4_handle_sync(handle); out_handle: ext4_journal_stop(handle); return ret; } /* * ext4_insert_range: * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate. * The data blocks starting from @offset to the EOF are shifted by @len * towards right to create a hole in the @inode. Inode size is increased * by len bytes. * Returns 0 on success, error otherwise. */ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; handle_t *handle; struct ext4_ext_path *path; struct ext4_extent *extent; ext4_lblk_t start_lblk, len_lblk, ee_start_lblk = 0; unsigned int credits, ee_len; int ret, depth; loff_t start; trace_ext4_insert_range(inode, offset, len); WARN_ON_ONCE(!inode_is_locked(inode)); /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; /* Insert range works only on fs cluster size aligned regions. */ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; /* Offset must be less than i_size */ if (offset >= inode->i_size) return -EINVAL; /* Check whether the maximum file size would be exceeded */ if (len > inode->i_sb->s_maxbytes - inode->i_size) return -EFBIG; /* * Write out all dirty pages. Need to round down to align start offset * to page size boundary for page size > block size. 
*/ start = round_down(offset, PAGE_SIZE); ret = filemap_write_and_wait_range(mapping, start, LLONG_MAX); if (ret) return ret; truncate_pagecache(inode, start); credits = ext4_chunk_trans_extent(inode, 0); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; EXT4_I(inode)->i_disksize += len; ret = ext4_mark_inode_dirty(handle, inode); if (ret) goto out_handle; start_lblk = offset >> inode->i_blkbits; len_lblk = len >> inode->i_blkbits; ext4_check_map_extents_env(inode); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); path = ext4_find_extent(inode, start_lblk, NULL, 0); if (IS_ERR(path)) { up_write(&EXT4_I(inode)->i_data_sem); ret = PTR_ERR(path); goto out_handle; } depth = ext_depth(inode); extent = path[depth].p_ext; if (extent) { ee_start_lblk = le32_to_cpu(extent->ee_block); ee_len = ext4_ext_get_actual_len(extent); /* * If start_lblk is not the starting block of extent, split * the extent @start_lblk */ if ((start_lblk > ee_start_lblk) && (start_lblk < (ee_start_lblk + ee_len))) { path = ext4_split_extent_at(handle, inode, path, start_lblk, EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_SPLIT_NOMERGE | EXT4_GET_BLOCKS_METADATA_NOFAIL); } if (IS_ERR(path)) { up_write(&EXT4_I(inode)->i_data_sem); ret = PTR_ERR(path); goto out_handle; } } ext4_free_ext_path(path); ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk); /* * if start_lblk lies in a hole which is at start of file, use * ee_start_lblk to shift extents */ ret = ext4_ext_shift_extents(inode, handle, max(ee_start_lblk, start_lblk), len_lblk, SHIFT_RIGHT); up_write(&EXT4_I(inode)->i_data_sem); if (ret) goto out_handle; ext4_update_inode_fsync_trans(handle, inode, 1); if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) ext4_handle_sync(handle); out_handle: ext4_journal_stop(handle); return ret; } /** * ext4_swap_extents() - Swap extents between two inodes * @handle: handle for this transaction * @inode1: First inode * @inode2: Second inode * @lblk1: Start block for first inode * @lblk2: Start block for second inode * @count: Number of blocks to swap * @unwritten: Mark second inode's extents as unwritten after swap * @erp: Pointer to save error value * * This helper routine does exactly what is promise "swap extents". All other * stuff such as page-cache locking consistency, bh mapping consistency or * extent's data copying must be performed by caller. 
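 * (Added note: the return value is the number of blocks actually swapped;
 * errors are reported through @erp rather than the return value, so callers
 * need to check both.)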
* Locking: * i_rwsem is held for both inodes * i_data_sem is locked for write for both inodes * Assumptions: * All pages from requested range are locked for both inodes */ int ext4_swap_extents(handle_t *handle, struct inode *inode1, struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, ext4_lblk_t count, int unwritten, int *erp) { struct ext4_ext_path *path1 = NULL; struct ext4_ext_path *path2 = NULL; int replaced_count = 0; BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem)); BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem)); BUG_ON(!inode_is_locked(inode1)); BUG_ON(!inode_is_locked(inode2)); ext4_es_remove_extent(inode1, lblk1, count); ext4_es_remove_extent(inode2, lblk2, count); while (count) { struct ext4_extent *ex1, *ex2, tmp_ex; ext4_lblk_t e1_blk, e2_blk; int e1_len, e2_len, len; int split = 0; path1 = ext4_find_extent(inode1, lblk1, path1, EXT4_EX_NOCACHE); if (IS_ERR(path1)) { *erp = PTR_ERR(path1); goto errout; } path2 = ext4_find_extent(inode2, lblk2, path2, EXT4_EX_NOCACHE); if (IS_ERR(path2)) { *erp = PTR_ERR(path2); goto errout; } ex1 = path1[path1->p_depth].p_ext; ex2 = path2[path2->p_depth].p_ext; /* Do we have something to swap ? */ if (unlikely(!ex2 || !ex1)) goto errout; e1_blk = le32_to_cpu(ex1->ee_block); e2_blk = le32_to_cpu(ex2->ee_block); e1_len = ext4_ext_get_actual_len(ex1); e2_len = ext4_ext_get_actual_len(ex2); /* Hole handling */ if (!in_range(lblk1, e1_blk, e1_len) || !in_range(lblk2, e2_blk, e2_len)) { ext4_lblk_t next1, next2; /* if hole after extent, then go to next extent */ next1 = ext4_ext_next_allocated_block(path1); next2 = ext4_ext_next_allocated_block(path2); /* If hole before extent, then shift to that extent */ if (e1_blk > lblk1) next1 = e1_blk; if (e2_blk > lblk2) next2 = e2_blk; /* Do we have something to swap */ if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS) goto errout; /* Move to the rightest boundary */ len = next1 - lblk1; if (len < next2 - lblk2) len = next2 - lblk2; if (len > count) len = count; lblk1 += len; lblk2 += len; count -= len; continue; } /* Prepare left boundary */ if (e1_blk < lblk1) { split = 1; path1 = ext4_force_split_extent_at(handle, inode1, path1, lblk1, 0); if (IS_ERR(path1)) { *erp = PTR_ERR(path1); goto errout; } } if (e2_blk < lblk2) { split = 1; path2 = ext4_force_split_extent_at(handle, inode2, path2, lblk2, 0); if (IS_ERR(path2)) { *erp = PTR_ERR(path2); goto errout; } } /* ext4_split_extent_at() may result in leaf extent split, * path must to be revalidated. */ if (split) continue; /* Prepare right boundary */ len = count; if (len > e1_blk + e1_len - lblk1) len = e1_blk + e1_len - lblk1; if (len > e2_blk + e2_len - lblk2) len = e2_blk + e2_len - lblk2; if (len != e1_len) { split = 1; path1 = ext4_force_split_extent_at(handle, inode1, path1, lblk1 + len, 0); if (IS_ERR(path1)) { *erp = PTR_ERR(path1); goto errout; } } if (len != e2_len) { split = 1; path2 = ext4_force_split_extent_at(handle, inode2, path2, lblk2 + len, 0); if (IS_ERR(path2)) { *erp = PTR_ERR(path2); goto errout; } } /* ext4_split_extent_at() may result in leaf extent split, * path must to be revalidated. */ if (split) continue; BUG_ON(e2_len != e1_len); *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth); if (unlikely(*erp)) goto errout; *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth); if (unlikely(*erp)) goto errout; /* Both extents are fully inside boundaries. 
Swap it now */ tmp_ex = *ex1; ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2)); ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex)); ex1->ee_len = cpu_to_le16(e2_len); ex2->ee_len = cpu_to_le16(e1_len); if (unwritten) ext4_ext_mark_unwritten(ex2); if (ext4_ext_is_unwritten(&tmp_ex)) ext4_ext_mark_unwritten(ex1); ext4_ext_try_to_merge(handle, inode2, path2, ex2); ext4_ext_try_to_merge(handle, inode1, path1, ex1); *erp = ext4_ext_dirty(handle, inode2, path2 + path2->p_depth); if (unlikely(*erp)) goto errout; *erp = ext4_ext_dirty(handle, inode1, path1 + path1->p_depth); /* * Looks scarry ah..? second inode already points to new blocks, * and it was successfully dirtied. But luckily error may happen * only due to journal error, so full transaction will be * aborted anyway. */ if (unlikely(*erp)) goto errout; lblk1 += len; lblk2 += len; replaced_count += len; count -= len; } errout: ext4_free_ext_path(path1); ext4_free_ext_path(path2); return replaced_count; } /* * ext4_clu_mapped - determine whether any block in a logical cluster has * been mapped to a physical cluster * * @inode - file containing the logical cluster * @lclu - logical cluster of interest * * Returns 1 if any block in the logical cluster is mapped, signifying * that a physical cluster has been allocated for it. Otherwise, * returns 0. Can also return negative error codes. Derived from * ext4_ext_map_blocks(). */ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_ext_path *path; int depth, mapped = 0, err = 0; struct ext4_extent *extent; ext4_lblk_t first_lblk, first_lclu, last_lclu; /* * if data can be stored inline, the logical cluster isn't * mapped - no physical clusters have been allocated, and the * file has no extents */ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) || ext4_has_inline_data(inode)) return 0; /* search for the extent closest to the first block in the cluster */ path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); /* * A consistent leaf must not be empty. This situation is possible, * though, _during_ tree modification, and it's why an assert can't * be put in ext4_find_extent(). */ if (unlikely(path[depth].p_ext == NULL && depth != 0)) { EXT4_ERROR_INODE(inode, "bad extent address - lblock: %lu, depth: %d, pblock: %lld", (unsigned long) EXT4_C2B(sbi, lclu), depth, path[depth].p_block); err = -EFSCORRUPTED; goto out; } extent = path[depth].p_ext; /* can't be mapped if the extent tree is empty */ if (extent == NULL) goto out; first_lblk = le32_to_cpu(extent->ee_block); first_lclu = EXT4_B2C(sbi, first_lblk); /* * Three possible outcomes at this point - found extent spanning * the target cluster, to the left of the target cluster, or to the * right of the target cluster. The first two cases are handled here. * The last case indicates the target cluster is not mapped. */ if (lclu >= first_lclu) { last_lclu = EXT4_B2C(sbi, first_lblk + ext4_ext_get_actual_len(extent) - 1); if (lclu <= last_lclu) { mapped = 1; } else { first_lblk = ext4_ext_next_allocated_block(path); first_lclu = EXT4_B2C(sbi, first_lblk); if (lclu == first_lclu) mapped = 1; } } out: ext4_free_ext_path(path); return err ? err : mapped; } /* * Updates physical block address and unwritten status of extent * starting at lblk start and of len. If such an extent doesn't exist, * this function splits the extent tree appropriately to create an * extent like this. 
This function is called in the fast commit * replay path. Returns 0 on success and error on failure. */ int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, int len, int unwritten, ext4_fsblk_t pblk) { struct ext4_ext_path *path; struct ext4_extent *ex; int ret; path = ext4_find_extent(inode, start, NULL, 0); if (IS_ERR(path)) return PTR_ERR(path); ex = path[path->p_depth].p_ext; if (!ex) { ret = -EFSCORRUPTED; goto out; } if (le32_to_cpu(ex->ee_block) != start || ext4_ext_get_actual_len(ex) != len) { /* We need to split this extent to match our extent first */ down_write(&EXT4_I(inode)->i_data_sem); path = ext4_force_split_extent_at(NULL, inode, path, start, 1); up_write(&EXT4_I(inode)->i_data_sem); if (IS_ERR(path)) { ret = PTR_ERR(path); goto out; } path = ext4_find_extent(inode, start, path, 0); if (IS_ERR(path)) return PTR_ERR(path); ex = path[path->p_depth].p_ext; WARN_ON(le32_to_cpu(ex->ee_block) != start); if (ext4_ext_get_actual_len(ex) != len) { down_write(&EXT4_I(inode)->i_data_sem); path = ext4_force_split_extent_at(NULL, inode, path, start + len, 1); up_write(&EXT4_I(inode)->i_data_sem); if (IS_ERR(path)) { ret = PTR_ERR(path); goto out; } path = ext4_find_extent(inode, start, path, 0); if (IS_ERR(path)) return PTR_ERR(path); ex = path[path->p_depth].p_ext; } } if (unwritten) ext4_ext_mark_unwritten(ex); else ext4_ext_mark_initialized(ex); ext4_ext_store_pblock(ex, pblk); down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]); up_write(&EXT4_I(inode)->i_data_sem); out: ext4_free_ext_path(path); ext4_mark_inode_dirty(NULL, inode); return ret; } /* Try to shrink the extent tree */ void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end) { struct ext4_ext_path *path = NULL; struct ext4_extent *ex; ext4_lblk_t old_cur, cur = 0; while (cur < end) { path = ext4_find_extent(inode, cur, NULL, 0); if (IS_ERR(path)) return; ex = path[path->p_depth].p_ext; if (!ex) { ext4_free_ext_path(path); ext4_mark_inode_dirty(NULL, inode); return; } old_cur = cur; cur = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); if (cur <= old_cur) cur = old_cur + 1; ext4_ext_try_to_merge(NULL, inode, path, ex); down_write(&EXT4_I(inode)->i_data_sem); ext4_ext_dirty(NULL, inode, &path[path->p_depth]); up_write(&EXT4_I(inode)->i_data_sem); ext4_mark_inode_dirty(NULL, inode); ext4_free_ext_path(path); } } /* Check if *cur is a hole and if it is, skip it */ static int skip_hole(struct inode *inode, ext4_lblk_t *cur) { int ret; struct ext4_map_blocks map; map.m_lblk = *cur; map.m_len = ((inode->i_size) >> inode->i_sb->s_blocksize_bits) - *cur; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) return ret; if (ret != 0) return 0; *cur = *cur + map.m_len; return 0; } /* Count number of blocks used by this inode and update i_blocks */ int ext4_ext_replay_set_iblocks(struct inode *inode) { struct ext4_ext_path *path = NULL, *path2 = NULL; struct ext4_extent *ex; ext4_lblk_t cur = 0, end; int numblks = 0, i, ret = 0; ext4_fsblk_t cmp1, cmp2; struct ext4_map_blocks map; /* Determin the size of the file first */ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); ex = path[path->p_depth].p_ext; if (!ex) goto out; end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); /* Count the number of data blocks */ cur = 0; while (cur < end) { map.m_lblk = cur; map.m_len = end - cur; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) break; if (ret > 0) numblks += ret; 
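/* ext4_map_blocks() trims map.m_len to the length of the range it resolved
 * (the mapped extent, or the hole when it returns 0), so advancing cur by
 * map.m_len steps over mapped runs and holes alike. */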
cur = cur + map.m_len; } /* * Count the number of extent tree blocks. We do it by looking up * two successive extents and determining the difference between * their paths. When path is different for 2 successive extents * we compare the blocks in the path at each level and increment * iblocks by total number of differences found. */ cur = 0; ret = skip_hole(inode, &cur); if (ret < 0) goto out; path = ext4_find_extent(inode, cur, path, 0); if (IS_ERR(path)) goto out; numblks += path->p_depth; while (cur < end) { path = ext4_find_extent(inode, cur, path, 0); if (IS_ERR(path)) break; ex = path[path->p_depth].p_ext; if (!ex) goto cleanup; cur = max(cur + 1, le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)); ret = skip_hole(inode, &cur); if (ret < 0) break; path2 = ext4_find_extent(inode, cur, path2, 0); if (IS_ERR(path2)) break; for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) { cmp1 = cmp2 = 0; if (i <= path->p_depth) cmp1 = path[i].p_bh ? path[i].p_bh->b_blocknr : 0; if (i <= path2->p_depth) cmp2 = path2[i].p_bh ? path2[i].p_bh->b_blocknr : 0; if (cmp1 != cmp2 && cmp2 != 0) numblks++; } } out: inode->i_blocks = numblks << (inode->i_sb->s_blocksize_bits - 9); ext4_mark_inode_dirty(NULL, inode); cleanup: ext4_free_ext_path(path); ext4_free_ext_path(path2); return 0; } int ext4_ext_clear_bb(struct inode *inode) { struct ext4_ext_path *path = NULL; struct ext4_extent *ex; ext4_lblk_t cur = 0, end; int j, ret = 0; struct ext4_map_blocks map; if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) return 0; /* Determin the size of the file first */ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); ex = path[path->p_depth].p_ext; if (!ex) goto out; end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); cur = 0; while (cur < end) { map.m_lblk = cur; map.m_len = end - cur; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) break; if (ret > 0) { path = ext4_find_extent(inode, map.m_lblk, path, 0); if (!IS_ERR(path)) { for (j = 0; j < path->p_depth; j++) { ext4_mb_mark_bb(inode->i_sb, path[j].p_block, 1, false); ext4_fc_record_regions(inode->i_sb, inode->i_ino, 0, path[j].p_block, 1, 1); } } else { path = NULL; } ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); ext4_fc_record_regions(inode->i_sb, inode->i_ino, map.m_lblk, map.m_pblk, map.m_len, 1); } cur = cur + map.m_len; } out: ext4_free_ext_path(path); return 0; } #if IS_ENABLED(CONFIG_EXT4_KUNIT_TESTS) int ext4_ext_space_root_idx_test(struct inode *inode, int check) { return ext4_ext_space_root_idx(inode, check); } EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_ext_space_root_idx_test); struct ext4_ext_path *ext4_split_convert_extents_test(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path, int flags, unsigned int *allocated) { return ext4_split_convert_extents(handle, inode, map, path, flags, allocated); } EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_split_convert_extents_test); EXPORT_SYMBOL_FOR_EXT4_TEST(__ext4_ext_dirty); EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_ext_zeroout); EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_es_register_shrinker); EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_es_unregister_shrinker); EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_map_create_blocks); EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_es_init_tree); EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_es_lookup_extent); EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_es_insert_extent); EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_ext_insert_extent); EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_find_extent); 
EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_issue_zeroout); EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_map_query_blocks); #endif
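/*
 * ext4_clu_mapped() above leans on ext4's bigalloc arithmetic: EXT4_B2C()
 * and EXT4_C2B() are plain shifts by s_cluster_bits (a cluster is a
 * power-of-two group of blocks).  A minimal userspace sketch of that math,
 * with local stand-in macros and an assumed 16-blocks-per-cluster ratio:
 */
#include <stdio.h>

#define CLUSTER_BITS	4			/* assumed ratio: 2^4 blocks per cluster */
#define B2C(blk)	((blk) >> CLUSTER_BITS)	/* logical block -> logical cluster */
#define C2B(clu)	((clu) << CLUSTER_BITS)	/* logical cluster -> its first block */

int main(void)
{
	unsigned long lblk = 37;

	/* block 37 falls in cluster 2, whose first block is 32 */
	printf("block %lu -> cluster %lu (cluster starts at block %lu)\n",
	       lblk, B2C(lblk), C2B(B2C(lblk)));
	return 0;
}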
| 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 | /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _LINUX_RSTREASON_H #define _LINUX_RSTREASON_H #include <net/dropreason-core.h> #include <uapi/linux/mptcp.h> #define DEFINE_RST_REASON(FN, FNe) \ FN(NOT_SPECIFIED) \ FN(NO_SOCKET) \ FN(TCP_INVALID_ACK_SEQUENCE) \ FN(TCP_RFC7323_PAWS) \ FN(TCP_TOO_OLD_ACK) \ FN(TCP_ACK_UNSENT_DATA) \ FN(TCP_FLAGS) \ FN(TCP_OLD_ACK) \ FN(TCP_ABORT_ON_DATA) \ FN(TCP_TIMEWAIT_SOCKET) \ FN(INVALID_SYN) \ FN(TCP_ABORT_ON_CLOSE) \ FN(TCP_ABORT_ON_LINGER) \ FN(TCP_ABORT_ON_MEMORY) \ FN(TCP_STATE) \ FN(TCP_KEEPALIVE_TIMEOUT) \ FN(TCP_DISCONNECT_WITH_DATA) \ FN(MPTCP_RST_EUNSPEC) \ FN(MPTCP_RST_EMPTCP) \ FN(MPTCP_RST_ERESOURCE) \ FN(MPTCP_RST_EPROHIBIT) \ FN(MPTCP_RST_EWQ2BIG) \ FN(MPTCP_RST_EBADPERF) \ FN(MPTCP_RST_EMIDDLEBOX) \ FN(ERROR) \ FNe(MAX) /** * enum sk_rst_reason - the reasons of socket reset * * The reasons of sk reset, which are used in TCP/MPTCP protocols. * * There are three parts in order: * 1) skb drop reasons: relying on drop reasons for such as passive reset * 2) independent reset reasons: such as active reset reasons * 3) reset reasons in MPTCP: only for MPTCP use */ enum sk_rst_reason { /* Refer to include/net/dropreason-core.h * Rely on skb drop reasons because it indicates exactly why RST * could happen. 
*/ /** @SK_RST_REASON_NOT_SPECIFIED: reset reason is not specified */ SK_RST_REASON_NOT_SPECIFIED, /** @SK_RST_REASON_NO_SOCKET: no valid socket that can be used */ SK_RST_REASON_NO_SOCKET, /** * @SK_RST_REASON_TCP_INVALID_ACK_SEQUENCE: Not acceptable ACK SEQ * field because ack sequence is not in the window between snd_una * and snd_nxt */ SK_RST_REASON_TCP_INVALID_ACK_SEQUENCE, /** * @SK_RST_REASON_TCP_RFC7323_PAWS: PAWS check, corresponding to * LINUX_MIB_PAWSESTABREJECTED, LINUX_MIB_PAWSACTIVEREJECTED */ SK_RST_REASON_TCP_RFC7323_PAWS, /** @SK_RST_REASON_TCP_TOO_OLD_ACK: TCP ACK is too old */ SK_RST_REASON_TCP_TOO_OLD_ACK, /** * @SK_RST_REASON_TCP_ACK_UNSENT_DATA: TCP ACK for data we haven't * sent yet */ SK_RST_REASON_TCP_ACK_UNSENT_DATA, /** @SK_RST_REASON_TCP_FLAGS: TCP flags invalid */ SK_RST_REASON_TCP_FLAGS, /** @SK_RST_REASON_TCP_OLD_ACK: TCP ACK is old, but in window */ SK_RST_REASON_TCP_OLD_ACK, /** * @SK_RST_REASON_TCP_ABORT_ON_DATA: abort on data * corresponding to LINUX_MIB_TCPABORTONDATA */ SK_RST_REASON_TCP_ABORT_ON_DATA, /* Here start with the independent reasons */ /** @SK_RST_REASON_TCP_TIMEWAIT_SOCKET: happen on the timewait socket */ SK_RST_REASON_TCP_TIMEWAIT_SOCKET, /** * @SK_RST_REASON_INVALID_SYN: receive bad syn packet * RFC 793 says if the state is not CLOSED/LISTEN/SYN-SENT then * "fourth, check the SYN bit,...If the SYN is in the window it is * an error, send a reset" */ SK_RST_REASON_INVALID_SYN, /** * @SK_RST_REASON_TCP_ABORT_ON_CLOSE: abort on close * corresponding to LINUX_MIB_TCPABORTONCLOSE */ SK_RST_REASON_TCP_ABORT_ON_CLOSE, /** * @SK_RST_REASON_TCP_ABORT_ON_LINGER: abort on linger * corresponding to LINUX_MIB_TCPABORTONLINGER */ SK_RST_REASON_TCP_ABORT_ON_LINGER, /** * @SK_RST_REASON_TCP_ABORT_ON_MEMORY: abort on memory * corresponding to LINUX_MIB_TCPABORTONMEMORY */ SK_RST_REASON_TCP_ABORT_ON_MEMORY, /** * @SK_RST_REASON_TCP_STATE: abort on tcp state * Please see RFC 9293 for all possible reset conditions */ SK_RST_REASON_TCP_STATE, /** * @SK_RST_REASON_TCP_KEEPALIVE_TIMEOUT: time to timeout * When we have already run out of all the chances, which means * keepalive timeout, we have to reset the connection */ SK_RST_REASON_TCP_KEEPALIVE_TIMEOUT, /** * @SK_RST_REASON_TCP_DISCONNECT_WITH_DATA: disconnect when write * queue is not empty * It means user has written data into the write queue when doing * disconnecting, so we have to send an RST. */ SK_RST_REASON_TCP_DISCONNECT_WITH_DATA, /* Copy from include/uapi/linux/mptcp.h. * These reset fields will not be changed since they adhere to * RFC 8684. So do not touch them. I'm going to list each definition * of them respectively. */ /** * @SK_RST_REASON_MPTCP_RST_EUNSPEC: Unspecified error. * This is the default error; it implies that the subflow is no * longer available. The presence of this option shows that the * RST was generated by an MPTCP-aware device. */ SK_RST_REASON_MPTCP_RST_EUNSPEC, /** * @SK_RST_REASON_MPTCP_RST_EMPTCP: MPTCP-specific error. * An error has been detected in the processing of MPTCP options. * This is the usual reason code to return in the cases where a RST * is being sent to close a subflow because of an invalid response. */ SK_RST_REASON_MPTCP_RST_EMPTCP, /** * @SK_RST_REASON_MPTCP_RST_ERESOURCE: Lack of resources. * This code indicates that the sending host does not have enough * resources to support the terminated subflow. */ SK_RST_REASON_MPTCP_RST_ERESOURCE, /** * @SK_RST_REASON_MPTCP_RST_EPROHIBIT: Administratively prohibited. 
* This code indicates that the requested subflow is prohibited by * the policies of the sending host. */ SK_RST_REASON_MPTCP_RST_EPROHIBIT, /** * @SK_RST_REASON_MPTCP_RST_EWQ2BIG: Too much outstanding data. * This code indicates that there is an excessive amount of data * that needs to be transmitted over the terminated subflow while * having already been acknowledged over one or more other subflows. * This may occur if a path has been unavailable for a short period * and it is more efficient to reset and start again than it is to * retransmit the queued data. */ SK_RST_REASON_MPTCP_RST_EWQ2BIG, /** * @SK_RST_REASON_MPTCP_RST_EBADPERF: Unacceptable performance. * This code indicates that the performance of this subflow was * too low compared to the other subflows of this Multipath TCP * connection. */ SK_RST_REASON_MPTCP_RST_EBADPERF, /** * @SK_RST_REASON_MPTCP_RST_EMIDDLEBOX: Middlebox interference. * Middlebox interference has been detected over this subflow, * making MPTCP signaling invalid. For example, this may be sent * if the checksum does not validate. */ SK_RST_REASON_MPTCP_RST_EMIDDLEBOX, /** @SK_RST_REASON_ERROR: unexpected error happens */ SK_RST_REASON_ERROR, /** * @SK_RST_REASON_MAX: Maximum of socket reset reasons. * It shouldn't be used as a real 'reason'. */ SK_RST_REASON_MAX, }; /* Convert skb drop reasons to enum sk_rst_reason type */ static inline enum sk_rst_reason sk_rst_convert_drop_reason(enum skb_drop_reason reason) { switch (reason) { case SKB_DROP_REASON_NOT_SPECIFIED: return SK_RST_REASON_NOT_SPECIFIED; case SKB_DROP_REASON_NO_SOCKET: return SK_RST_REASON_NO_SOCKET; case SKB_DROP_REASON_TCP_INVALID_ACK_SEQUENCE: return SK_RST_REASON_TCP_INVALID_ACK_SEQUENCE; case SKB_DROP_REASON_TCP_RFC7323_PAWS: return SK_RST_REASON_TCP_RFC7323_PAWS; case SKB_DROP_REASON_TCP_TOO_OLD_ACK: return SK_RST_REASON_TCP_TOO_OLD_ACK; case SKB_DROP_REASON_TCP_ACK_UNSENT_DATA: return SK_RST_REASON_TCP_ACK_UNSENT_DATA; case SKB_DROP_REASON_TCP_FLAGS: return SK_RST_REASON_TCP_FLAGS; case SKB_DROP_REASON_TCP_OLD_ACK: return SK_RST_REASON_TCP_OLD_ACK; case SKB_DROP_REASON_TCP_ABORT_ON_DATA: return SK_RST_REASON_TCP_ABORT_ON_DATA; default: /* If we don't have our own corresponding reason */ return SK_RST_REASON_NOT_SPECIFIED; } } #endif |
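/*
 * DEFINE_RST_REASON() is an X-macro: the same list that generated the enum
 * above can generate other tables.  A sketch of one possible use - building
 * a reason-name lookup for tracing - where the array and helper below are
 * illustrative, not part of this header:
 */
#define RST_REASON_NAME(name)		[SK_RST_REASON_##name] = #name,
#define RST_REASON_NAME_MAX(name)	/* skip the MAX sentinel */

static const char * const sk_rst_reason_names[] = {
	DEFINE_RST_REASON(RST_REASON_NAME, RST_REASON_NAME_MAX)
};

static inline const char *sk_rst_reason_str(enum sk_rst_reason reason)
{
	if (reason >= SK_RST_REASON_MAX)
		return "RST_REASON_UNKNOWN";
	return sk_rst_reason_names[reason];
}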
| 2 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* fs/ internal definitions * * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ struct super_block; struct file_system_type; struct iomap; struct iomap_ops; struct linux_binprm; struct path; struct mount; struct shrink_control; struct fs_context; struct pipe_inode_info; struct iov_iter; struct mnt_idmap; struct ns_common; /* * block/bdev.c */ #ifdef CONFIG_BLOCK extern void __init bdev_cache_init(void); #else static inline void bdev_cache_init(void) { } #endif /* CONFIG_BLOCK */ /* * buffer.c */ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block, const struct iomap *iomap); /* * char_dev.c */ extern void __init chrdev_init(void); /* * fs_context.c */ extern int parse_monolithic_mount_data(struct fs_context *, void *); extern void vfs_clean_context(struct fs_context *fc); extern int finish_clean_context(struct fs_context *fc); /* * namei.c */ extern int filename_lookup(int dfd, struct filename *name, unsigned flags, struct path *path, const struct path *root); int filename_rmdir(int dfd, struct filename *name); int filename_unlinkat(int dfd, struct filename *name); int may_linkat(struct mnt_idmap *idmap, const struct path *link); int filename_renameat2(int olddfd, struct filename *oldname, int newdfd, struct filename *newname, unsigned int flags); int filename_mkdirat(int dfd, struct filename *name, umode_t mode); int filename_mknodat(int dfd, struct filename *name, umode_t mode, unsigned int dev); int filename_symlinkat(struct filename *from, int newdfd, struct filename *to); int filename_linkat(int olddfd, struct filename *old, int newdfd, struct filename *new, int flags); int vfs_tmpfile(struct mnt_idmap *idmap, const struct path *parentpath, struct file *file, umode_t mode); struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *); struct dentry *start_dirop(struct dentry *parent, struct qstr *name, unsigned int lookup_flags); int lookup_noperm_common(struct qstr *qname, struct dentry *base); void __init filename_init(void); /* * namespace.c */ extern struct vfsmount *lookup_mnt(const struct 
path *); extern int finish_automount(struct vfsmount *, const struct path *); extern int sb_prepare_remount_readonly(struct super_block *); extern void __init mnt_init(void); int mnt_get_write_access_file(struct file *file); void mnt_put_write_access_file(struct file *file); extern void dissolve_on_fput(struct vfsmount *); extern bool may_mount(void); int path_mount(const char *dev_name, const struct path *path, const char *type_page, unsigned long flags, void *data_page); int path_umount(const struct path *path, int flags); int path_pivot_root(struct path *new, struct path *old); int show_path(struct seq_file *m, struct dentry *root); /* * fs_struct.c */ extern void chroot_fs_refs(const struct path *, const struct path *); /* * file_table.c */ struct file *alloc_empty_file(int flags, const struct cred *cred); struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred); struct file *alloc_empty_backing_file(int flags, const struct cred *cred, const struct file *user_file); void backing_file_set_user_path(struct file *f, const struct path *path); static inline void file_put_write_access(struct file *file) { put_write_access(file->f_inode); mnt_put_write_access(file->f_path.mnt); if (unlikely(file->f_mode & FMODE_BACKING)) mnt_put_write_access(backing_file_user_path(file)->mnt); } static inline void put_file_access(struct file *file) { if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { i_readcount_dec(file->f_inode); } else if (file->f_mode & FMODE_WRITER) { file_put_write_access(file); } } void fput_close_sync(struct file *); void fput_close(struct file *); /* * super.c */ extern int reconfigure_super(struct fs_context *); extern bool super_trylock_shared(struct super_block *sb); struct super_block *user_get_super(dev_t, bool excl); void put_super(struct super_block *sb); extern bool mount_capable(struct fs_context *); int sb_init_dio_done_wq(struct super_block *sb); /* * Prepare superblock for changing its read-only state (i.e., either remount * read-write superblock read-only or vice versa). After this function returns * mnt_is_readonly() will return true for any mount of the superblock if its * caller is able to observe any changes done by the remount. This holds until * sb_end_ro_state_change() is called. */ static inline void sb_start_ro_state_change(struct super_block *sb) { WRITE_ONCE(sb->s_readonly_remount, 1); /* * For RO->RW transition, the barrier pairs with the barrier in * mnt_is_readonly() making sure if mnt_is_readonly() sees SB_RDONLY * cleared, it will see s_readonly_remount set. * For RW->RO transition, the barrier pairs with the barrier in * mnt_get_write_access() before the mnt_is_readonly() check. * The barrier makes sure if mnt_get_write_access() sees MNT_WRITE_HOLD * already cleared, it will see s_readonly_remount set. */ smp_wmb(); } /* * Ends section changing read-only state of the superblock. After this function * returns if mnt_is_readonly() returns false, the caller will be able to * observe all the changes remount did to the superblock. */ static inline void sb_end_ro_state_change(struct super_block *sb) { /* * This barrier provides release semantics that pairs with * the smp_rmb() acquire semantics in mnt_is_readonly(). * This barrier pair ensure that when mnt_is_readonly() sees * 0 for sb->s_readonly_remount, it will also see all the * preceding flag changes that were made during the RO state * change. 
*/ smp_wmb(); WRITE_ONCE(sb->s_readonly_remount, 0); } /* * open.c */ struct open_flags { int open_flag; umode_t mode; int acc_mode; int intent; int lookup_flags; }; extern struct file *do_file_open(int dfd, struct filename *pathname, const struct open_flags *op); extern struct file *do_file_open_root(const struct path *, const char *, const struct open_flags *); extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); struct file *file_close_fd_locked(struct files_struct *files, unsigned fd); int do_ftruncate(struct file *file, loff_t length, unsigned int flags); int chmod_common(const struct path *path, umode_t mode); int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag); int chown_common(const struct path *path, uid_t user, gid_t group); extern int vfs_open(const struct path *, struct file *); /* * inode.c */ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); int dentry_needs_remove_privs(struct mnt_idmap *, struct dentry *dentry); bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid); /* * fs-writeback.c */ long get_nr_dirty_inodes(void); bool sync_lazytime(struct inode *inode); /* * dcache.c */ extern int d_set_mounted(struct dentry *dentry); extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc); extern struct dentry *d_alloc_cursor(struct dentry *); extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *); extern char *simple_dname(struct dentry *, char *, int); extern void dput_to_list(struct dentry *, struct list_head *); extern void shrink_dentry_list(struct list_head *); extern void shrink_dcache_for_umount(struct super_block *); extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *); extern struct dentry *__d_lookup_rcu(const struct dentry *parent, const struct qstr *name, unsigned *seq); /* * pipe.c */ extern const struct file_operations pipefifo_fops; /* * fs_pin.c */ extern void group_pin_kill(struct hlist_head *p); extern void mnt_pin_kill(struct mount *m); /* * fs/nsfs.c */ extern const struct dentry_operations ns_dentry_operations; int open_namespace(struct ns_common *ns); struct file *open_namespace_file(struct ns_common *ns); /* * fs/stat.c: */ int do_statx(int dfd, struct filename *filename, unsigned int flags, unsigned int mask, struct statx __user *buffer); int do_statx_fd(int fd, unsigned int flags, unsigned int mask, struct statx __user *buffer); /* * fs/splice.c: */ ssize_t splice_file_to_pipe(struct file *in, struct pipe_inode_info *opipe, loff_t *offset, size_t len, unsigned int flags); /* * fs/xattr.c: */ struct xattr_name { char name[XATTR_NAME_MAX + 1]; }; struct kernel_xattr_ctx { /* Value of attribute */ union { const void __user *cvalue; void __user *value; }; void *kvalue; size_t size; /* Attribute name */ struct xattr_name *kname; unsigned int flags; }; ssize_t file_getxattr(struct file *file, struct kernel_xattr_ctx *ctx); ssize_t filename_getxattr(int dfd, struct filename *filename, unsigned int lookup_flags, struct kernel_xattr_ctx *ctx); int file_setxattr(struct file *file, struct kernel_xattr_ctx *ctx); int filename_setxattr(int dfd, struct filename *filename, unsigned int lookup_flags, struct kernel_xattr_ctx *ctx); int setxattr_copy(const char __user *name, struct kernel_xattr_ctx *ctx); int import_xattr_name(struct xattr_name *kname, const char __user *name); int may_write_xattr(struct 
mnt_idmap *idmap, struct inode *inode); #ifdef CONFIG_FS_POSIX_ACL int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, const void *kvalue, size_t size); ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, void *kvalue, size_t size); #else static inline int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, const void *kvalue, size_t size) { return -EOPNOTSUPP; } static inline ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, void *kvalue, size_t size) { return -EOPNOTSUPP; } #endif ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos); /* * fs/attr.c */ struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns); struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap); void mnt_idmap_put(struct mnt_idmap *idmap); struct stashed_operations { struct dentry *(*stash_dentry)(struct dentry **stashed, struct dentry *dentry); void (*put_data)(void *data); int (*init_inode)(struct inode *inode, void *data); }; int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, struct path *path); void stashed_dentry_prune(struct dentry *dentry); struct dentry *stash_dentry(struct dentry **stashed, struct dentry *dentry); struct dentry *stashed_dentry_get(struct dentry **stashed); /** * path_mounted - check whether path is mounted * @path: path to check * * Determine whether @path refers to the root of a mount. * * Return: true if @path is the root of a mount, false if not. */ static inline bool path_mounted(const struct path *path) { return path->mnt->mnt_root == path->dentry; } void file_f_owner_release(struct file *file); bool file_seek_cur_needs_f_lock(struct file *file); int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map); struct dentry *find_next_child(struct dentry *parent, struct dentry *prev); int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); void pidfs_get_root(struct path *path); void nsfs_get_root(struct path *path); |
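/*
 * The sb_start_ro_state_change()/sb_end_ro_state_change() helpers above are
 * meant to bracket the window in which a remount flips SB_RDONLY, so that
 * mnt_get_write_access()/mnt_is_readonly() either observe the old state or
 * see s_readonly_remount set.  A hedged sketch of the intended call pattern
 * (the surrounding remount logic is simplified and purely illustrative):
 */
static int example_remount_readonly(struct super_block *sb)
{
	int err = 0;

	sb_start_ro_state_change(sb);	/* write paths now notice the transition */

	/* ...flush pending writes, fail if writers still hold access... */
	if (!err)
		sb->s_flags |= SB_RDONLY;

	sb_end_ro_state_change(sb);	/* publish the new state, then clear the flag */
	return err;
}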
/* SPDX-License-Identifier: GPL-2.0 */ /* * Resizable, Scalable, Concurrent Hash Table * * Copyright (c) 2015-2016 Herbert Xu <herbert@gondor.apana.org.au> * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch> * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net> * * Code partially derived from nft_hash * Rewritten with rehash code from br_multicast plus single list * pointer as suggested by Josh Triplett * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #ifndef _LINUX_RHASHTABLE_H #define _LINUX_RHASHTABLE_H #include <linux/err.h> #include <linux/errno.h> #include <linux/jhash.h> #include <linux/list_nulls.h> #include <linux/workqueue.h> #include <linux/rculist.h> #include <linux/bit_spinlock.h> #include <linux/rhashtable-types.h> /* * Objects in an rhashtable have an embedded struct rhash_head * which is linked into a hash chain from the hash table - or one * of two or more hash tables when the rhashtable is being resized. * The end of the chain is marked with a special nulls marker which has * the least significant bit set but otherwise stores the address of * the hash bucket. This allows us to be sure we've found the end * of the right list. * The value stored in the hash bucket has BIT(0) used as a lock bit. * This bit must be atomically set before any changes are made to * the chain.
To avoid dereferencing this pointer without clearing * the bit first, we use an opaque 'struct rhash_lock_head *' for the * pointer stored in the bucket. This struct needs to be defined so * that rcu_dereference() works on it, but it has no content so a * cast is needed for it to be useful. This ensures it isn't * used by mistake with clearing the lock bit first. */ struct rhash_lock_head {}; /* Maximum chain length before rehash * * The maximum (not average) chain length grows with the size of the hash * table, at a rate of (log N)/(log log N). * * The value of 16 is selected so that even if the hash table grew to * 2^32 you would not expect the maximum chain length to exceed it * unless we are under attack (or extremely unlucky). * * As this limit is only to detect attacks, we don't need to set it to a * lower value as you'd need the chain length to vastly exceed 16 to have * any real effect on the system. */ #define RHT_ELASTICITY 16u /** * struct bucket_table - Table of hash buckets * @size: Number of hash buckets * @nest: Number of bits of first-level nested table. * @rehash: Current bucket being rehashed * @hash_rnd: Random seed to fold into hash * @walkers: List of active walkers * @rcu: RCU structure for freeing the table * @future_tbl: Table under construction during rehashing * @ntbl: Nested table used when out of memory. * @buckets: size * hash buckets */ struct bucket_table { unsigned int size; unsigned int nest; u32 hash_rnd; struct list_head walkers; struct rcu_head rcu; struct bucket_table __rcu *future_tbl; struct lockdep_map dep_map; struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; /* * NULLS_MARKER() expects a hash value with the low * bits mostly likely to be significant, and it discards * the msb. * We give it an address, in which the bottom bit is * always 0, and the msb might be significant. * So we shift the address down one bit to align with * expectations and avoid losing a significant bit. * * We never store the NULLS_MARKER in the hash table * itself as we need the lsb for locking. * Instead we store a NULL */ #define RHT_NULLS_MARKER(ptr) \ ((void *)NULLS_MARKER(((unsigned long) (ptr)) >> 1)) #define INIT_RHT_NULLS_HEAD(ptr) \ ((ptr) = NULL) static inline bool rht_is_a_nulls(const struct rhash_head *ptr) { return ((unsigned long) ptr & 1); } static inline void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he) { return (char *)he - ht->p.head_offset; } static inline unsigned int rht_bucket_index(const struct bucket_table *tbl, unsigned int hash) { return hash & (tbl->size - 1); } static __always_inline unsigned int rht_key_get_hash(struct rhashtable *ht, const void *key, const struct rhashtable_params params, unsigned int hash_rnd) { unsigned int hash; /* params must be equal to ht->p if it isn't constant. */ if (!__builtin_constant_p(params.key_len)) { hash = ht->p.hashfn(key, ht->key_len, hash_rnd); } else { unsigned int key_len = params.key_len ? 
: ht->p.key_len; if (params.hashfn) hash = params.hashfn(key, key_len, hash_rnd); else if (key_len & (sizeof(u32) - 1)) hash = jhash(key, key_len, hash_rnd); else hash = jhash2(key, key_len / sizeof(u32), hash_rnd); } return hash; } static __always_inline unsigned int rht_key_hashfn( struct rhashtable *ht, const struct bucket_table *tbl, const void *key, const struct rhashtable_params params) { unsigned int hash = rht_key_get_hash(ht, key, params, tbl->hash_rnd); return rht_bucket_index(tbl, hash); } static __always_inline unsigned int rht_head_hashfn( struct rhashtable *ht, const struct bucket_table *tbl, const struct rhash_head *he, const struct rhashtable_params params) { const char *ptr = rht_obj(ht, he); return likely(params.obj_hashfn) ? rht_bucket_index(tbl, params.obj_hashfn(ptr, params.key_len ?: ht->p.key_len, tbl->hash_rnd)) : rht_key_hashfn(ht, tbl, ptr + params.key_offset, params); } /** * rht_grow_above_75 - returns true if nelems > 0.75 * table-size * @ht: hash table * @tbl: current table */ static inline bool rht_grow_above_75(const struct rhashtable *ht, const struct bucket_table *tbl) { /* Expand table when exceeding 75% load */ return atomic_read(&ht->nelems) > (tbl->size / 4 * 3) && (!ht->p.max_size || tbl->size < ht->p.max_size); } /** * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size * @ht: hash table * @tbl: current table */ static inline bool rht_shrink_below_30(const struct rhashtable *ht, const struct bucket_table *tbl) { /* Shrink table beneath 30% load */ return atomic_read(&ht->nelems) < (tbl->size * 3 / 10) && tbl->size > ht->p.min_size; } /** * rht_grow_above_100 - returns true if nelems > table-size * @ht: hash table * @tbl: current table */ static inline bool rht_grow_above_100(const struct rhashtable *ht, const struct bucket_table *tbl) { return atomic_read(&ht->nelems) > tbl->size && (!ht->p.max_size || tbl->size < ht->p.max_size); } /** * rht_grow_above_max - returns true if table is above maximum * @ht: hash table * @tbl: current table */ static inline bool rht_grow_above_max(const struct rhashtable *ht, const struct bucket_table *tbl) { return atomic_read(&ht->nelems) >= ht->max_elems; } #ifdef CONFIG_PROVE_LOCKING int lockdep_rht_mutex_is_held(struct rhashtable *ht); int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash); #else static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht) { return 1; } static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) { return 1; } #endif /* CONFIG_PROVE_LOCKING */ void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, struct rhash_head *obj); void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter); void rhashtable_walk_exit(struct rhashtable_iter *iter); int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires_shared(RCU); static inline void rhashtable_walk_start(struct rhashtable_iter *iter) __acquires_shared(RCU) { (void)rhashtable_walk_start_check(iter); } void *rhashtable_walk_next(struct rhashtable_iter *iter); void *rhashtable_walk_peek(struct rhashtable_iter *iter); void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases_shared(RCU); void rhashtable_free_and_destroy(struct rhashtable *ht, void (*free_fn)(void *ptr, void *arg), void *arg); void rhashtable_destroy(struct rhashtable *ht); struct rhash_lock_head __rcu **rht_bucket_nested( const struct bucket_table *tbl, unsigned int hash); struct rhash_lock_head __rcu **__rht_bucket_nested( const struct bucket_table *tbl, 
unsigned int hash); struct rhash_lock_head __rcu **rht_bucket_nested_insert( struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash); #define rht_dereference(p, ht) \ rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht)) #define rht_dereference_rcu(p, ht) \ rcu_dereference_all_check(p, lockdep_rht_mutex_is_held(ht)) #define rht_dereference_bucket(p, tbl, hash) \ rcu_dereference_protected(p, lockdep_rht_bucket_is_held(tbl, hash)) #define rht_dereference_bucket_rcu(p, tbl, hash) \ rcu_dereference_all_check(p, lockdep_rht_bucket_is_held(tbl, hash)) #define rht_entry(tpos, pos, member) \ ({ tpos = container_of(pos, typeof(*tpos), member); 1; }) static inline struct rhash_lock_head __rcu *const *rht_bucket( const struct bucket_table *tbl, unsigned int hash) { return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) : &tbl->buckets[hash]; } static inline struct rhash_lock_head __rcu **rht_bucket_var( struct bucket_table *tbl, unsigned int hash) { return unlikely(tbl->nest) ? __rht_bucket_nested(tbl, hash) : &tbl->buckets[hash]; } static inline struct rhash_lock_head __rcu **rht_bucket_insert( struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) { return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) : &tbl->buckets[hash]; } /* * We lock a bucket by setting BIT(0) in the pointer - this is always * zero in real pointers. The NULLS mark is never stored in the bucket, * rather we store NULL if the bucket is empty. * bit_spin_locks do not handle contention well, but the whole point * of the hashtable design is to achieve minimum per-bucket contention. * A nested hash table might not have a bucket pointer. In that case * we cannot get a lock. For remove and replace the bucket cannot be * interesting and doesn't need locking. * For insert we allocate the bucket if this is the last bucket_table, * and then take the lock. * Sometimes we unlock a bucket by writing a new pointer there. In that * case we don't need to unlock, but we do need to reset state such as * local_bh. For that we have rht_assign_unlock(). As rcu_assign_pointer() * provides the same release semantics that bit_spin_unlock() provides, * this is safe. * When we write to a bucket without unlocking, we use rht_assign_locked(). */ static inline unsigned long rht_lock(struct bucket_table *tbl, struct rhash_lock_head __rcu **bkt) __acquires(__bitlock(0, bkt)) { unsigned long flags; local_irq_save(flags); bit_spin_lock(0, (unsigned long *)bkt); lock_map_acquire(&tbl->dep_map); return flags; } static inline unsigned long rht_lock_nested(struct bucket_table *tbl, struct rhash_lock_head __rcu **bucket, unsigned int subclass) __acquires(__bitlock(0, bucket)) { unsigned long flags; local_irq_save(flags); bit_spin_lock(0, (unsigned long *)bucket); lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_); return flags; } static inline void rht_unlock(struct bucket_table *tbl, struct rhash_lock_head __rcu **bkt, unsigned long flags) __releases(__bitlock(0, bkt)) { lock_map_release(&tbl->dep_map); bit_spin_unlock(0, (unsigned long *)bkt); local_irq_restore(flags); } enum rht_lookup_freq { RHT_LOOKUP_NORMAL, RHT_LOOKUP_LIKELY, }; static __always_inline struct rhash_head *__rht_ptr( struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt, const enum rht_lookup_freq freq) { unsigned long p_val = (unsigned long)p & ~BIT(0); BUILD_BUG_ON(!__builtin_constant_p(freq)); if (freq == RHT_LOOKUP_LIKELY) return (struct rhash_head *) (likely(p_val) ? 
p_val : (unsigned long)RHT_NULLS_MARKER(bkt)); else return (struct rhash_head *) (p_val ?: (unsigned long)RHT_NULLS_MARKER(bkt)); } /* * Where 'bkt' is a bucket and might be locked: * rht_ptr_rcu() dereferences that pointer and clears the lock bit. * rht_ptr() dereferences in a context where the bucket is locked. * rht_ptr_exclusive() dereferences in a context where exclusive * access is guaranteed, such as when destroying the table. */ static __always_inline struct rhash_head *__rht_ptr_rcu( struct rhash_lock_head __rcu *const *bkt, const enum rht_lookup_freq freq) { return __rht_ptr(rcu_dereference_all(*bkt), bkt, freq); } static inline struct rhash_head *rht_ptr_rcu( struct rhash_lock_head __rcu *const *bkt) { return __rht_ptr_rcu(bkt, RHT_LOOKUP_NORMAL); } static inline struct rhash_head *rht_ptr( struct rhash_lock_head __rcu *const *bkt, struct bucket_table *tbl, unsigned int hash) { return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt, RHT_LOOKUP_NORMAL); } static inline struct rhash_head *rht_ptr_exclusive( struct rhash_lock_head __rcu *const *bkt) { return __rht_ptr(rcu_dereference_protected(*bkt, 1), bkt, RHT_LOOKUP_NORMAL); } static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt, struct rhash_head *obj) { if (rht_is_a_nulls(obj)) obj = NULL; rcu_assign_pointer(*bkt, (void *)((unsigned long)obj | BIT(0))); } static inline void rht_assign_unlock(struct bucket_table *tbl, struct rhash_lock_head __rcu **bkt, struct rhash_head *obj, unsigned long flags) __releases(__bitlock(0, bkt)) { if (rht_is_a_nulls(obj)) obj = NULL; lock_map_release(&tbl->dep_map); rcu_assign_pointer(*bkt, (void *)obj); preempt_enable(); __release(__bitlock(0, bkt)); local_irq_restore(flags); } /** * rht_for_each_from - iterate over hash chain from given head * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index */ #define rht_for_each_from(pos, head, tbl, hash) \ for (pos = head; \ !rht_is_a_nulls(pos); \ pos = rht_dereference_bucket((pos)->next, tbl, hash)) /** * rht_for_each - iterate over hash chain * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index */ #define rht_for_each(pos, tbl, hash) \ rht_for_each_from(pos, rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ tbl, hash) /** * rht_for_each_entry_from - iterate over hash chain from given head * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. */ #define rht_for_each_entry_from(tpos, pos, head, tbl, hash, member) \ for (pos = head; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = rht_dereference_bucket((pos)->next, tbl, hash)) /** * rht_for_each_entry - iterate over hash chain of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. 
*/ #define rht_for_each_entry(tpos, pos, tbl, hash, member) \ rht_for_each_entry_from(tpos, pos, \ rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ tbl, hash, member) /** * rht_for_each_entry_safe - safely iterate over hash chain of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @next: the &struct rhash_head to use as next in loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. * * This hash chain list-traversal primitive allows for the looped code to * remove the loop cursor from the list. */ #define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \ for (pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ next = !rht_is_a_nulls(pos) ? \ rht_dereference_bucket(pos->next, tbl, hash) : NULL; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = next, \ next = !rht_is_a_nulls(pos) ? \ rht_dereference_bucket(pos->next, tbl, hash) : NULL) /** * rht_for_each_rcu_from - iterate over rcu hash chain from given head * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_rcu_from(pos, head, tbl, hash) \ for (({barrier(); }), \ pos = head; \ !rht_is_a_nulls(pos); \ pos = rcu_dereference_all(pos->next)) /** * rht_for_each_rcu - iterate over rcu hash chain * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_rcu(pos, tbl, hash) \ for (({barrier(); }), \ pos = rht_ptr_rcu(rht_bucket(tbl, hash)); \ !rht_is_a_nulls(pos); \ pos = rcu_dereference_all(pos->next)) /** * rht_for_each_entry_rcu_from - iterated over rcu hash chain from given head * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_entry_rcu_from(tpos, pos, head, tbl, hash, member) \ for (({barrier(); }), \ pos = head; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = rht_dereference_bucket_rcu(pos->next, tbl, hash)) /** * rht_for_each_entry_rcu - iterate over rcu hash chain of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). 
*/ #define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \ rht_for_each_entry_rcu_from(tpos, pos, \ rht_ptr_rcu(rht_bucket(tbl, hash)), \ tbl, hash, member) /** * rhl_for_each_rcu - iterate over rcu hash table list * @pos: the &struct rlist_head to use as a loop cursor. * @list: the head of the list * * This hash chain list-traversal primitive should be used on the * list returned by rhltable_lookup. */ #define rhl_for_each_rcu(pos, list) \ for (pos = list; pos; pos = rcu_dereference_all(pos->next)) /** * rhl_for_each_entry_rcu - iterate over rcu hash table list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rlist_head to use as a loop cursor. * @list: the head of the list * @member: name of the &struct rlist_head within the hashable struct. * * This hash chain list-traversal primitive should be used on the * list returned by rhltable_lookup. */ #define rhl_for_each_entry_rcu(tpos, pos, list, member) \ for (pos = list; pos && rht_entry(tpos, pos, member); \ pos = rcu_dereference_all(pos->next)) static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, const void *obj) { struct rhashtable *ht = arg->ht; const char *ptr = obj; return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len); } /* Internal function, do not use. */ static __always_inline struct rhash_head *__rhashtable_lookup( struct rhashtable *ht, const void *key, const struct rhashtable_params params, const enum rht_lookup_freq freq) __must_hold_shared(RCU) { struct rhashtable_compare_arg arg = { .ht = ht, .key = key, }; struct rhash_lock_head __rcu *const *bkt; struct bucket_table *tbl; struct rhash_head *he; unsigned int hash; BUILD_BUG_ON(!__builtin_constant_p(freq)); tbl = rht_dereference_rcu(ht->tbl, ht); restart: hash = rht_key_hashfn(ht, tbl, key, params); bkt = rht_bucket(tbl, hash); do { rht_for_each_rcu_from(he, __rht_ptr_rcu(bkt, freq), tbl, hash) { if (params.obj_cmpfn ? params.obj_cmpfn(&arg, rht_obj(ht, he)) : rhashtable_compare(&arg, rht_obj(ht, he))) continue; return he; } /* An object might have been moved to a different hash chain, * while we walk along it - better check and retry. */ } while (he != RHT_NULLS_MARKER(bkt)); /* Ensure we see any new tables. */ smp_rmb(); tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (unlikely(tbl)) goto restart; return NULL; } /** * rhashtable_lookup - search hash table * @ht: hash table * @key: the pointer to the key * @params: hash table parameters * * Computes the hash value for the key and traverses the bucket chain looking * for an entry with an identical key. The first matching entry is returned. * * This must only be called under the RCU read lock. * * Returns the first entry on which the compare function returned true. */ static __always_inline void *rhashtable_lookup( struct rhashtable *ht, const void *key, const struct rhashtable_params params) __must_hold_shared(RCU) { struct rhash_head *he = __rhashtable_lookup(ht, key, params, RHT_LOOKUP_NORMAL); return he ? rht_obj(ht, he) : NULL; } static __always_inline void *rhashtable_lookup_likely( struct rhashtable *ht, const void *key, const struct rhashtable_params params) __must_hold_shared(RCU) { struct rhash_head *he = __rhashtable_lookup(ht, key, params, RHT_LOOKUP_LIKELY); return likely(he) ? 
rht_obj(ht, he) : NULL; } /** * rhashtable_lookup_fast - search hash table, without RCU read lock * @ht: hash table * @key: the pointer to the key * @params: hash table parameters * * Computes the hash value for the key and traverses the bucket chain looking * for an entry with an identical key. The first matching entry is returned. * * Only use this function when you have other mechanisms guaranteeing * that the object won't go away after the RCU read lock is released. * * Returns the first entry on which the compare function returned true. */ static __always_inline void *rhashtable_lookup_fast( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { void *obj; rcu_read_lock(); obj = rhashtable_lookup(ht, key, params); rcu_read_unlock(); return obj; } /** * rhltable_lookup - search hash list table * @hlt: hash table * @key: the pointer to the key * @params: hash table parameters * * Computes the hash value for the key and traverses the bucket chain looking * for an entry with an identical key. All matching entries are returned * in a list. * * This must only be called under the RCU read lock. * * Returns the list of entries that match the given key. */ static __always_inline struct rhlist_head *rhltable_lookup( struct rhltable *hlt, const void *key, const struct rhashtable_params params) __must_hold_shared(RCU) { struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params, RHT_LOOKUP_NORMAL); return he ? container_of(he, struct rhlist_head, rhead) : NULL; } static __always_inline struct rhlist_head *rhltable_lookup_likely( struct rhltable *hlt, const void *key, const struct rhashtable_params params) __must_hold_shared(RCU) { struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params, RHT_LOOKUP_LIKELY); return likely(he) ? container_of(he, struct rhlist_head, rhead) : NULL; } /* Internal function, please use rhashtable_insert_fast() instead. This * function returns the existing element already in hashes if there is a clash, * otherwise it returns an error via ERR_PTR(). */ static __always_inline void *__rhashtable_insert_fast( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) { struct rhashtable_compare_arg arg = { .ht = ht, .key = key, }; struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct bucket_table *tbl; struct rhash_head *head; unsigned long flags; unsigned int hash; int elasticity; void *data; rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); hash = rht_head_hashfn(ht, tbl, obj, params); elasticity = RHT_ELASTICITY; bkt = rht_bucket_insert(ht, tbl, hash); data = ERR_PTR(-ENOMEM); if (!bkt) goto out; pprev = NULL; flags = rht_lock(tbl, bkt); if (unlikely(rcu_access_pointer(tbl->future_tbl))) { slow_path: rht_unlock(tbl, bkt, flags); rcu_read_unlock(); return rhashtable_insert_slow(ht, key, obj); } rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) { struct rhlist_head *plist; struct rhlist_head *list; elasticity--; if (!key || (params.obj_cmpfn ? 
params.obj_cmpfn(&arg, rht_obj(ht, head)) : rhashtable_compare(&arg, rht_obj(ht, head)))) { pprev = &head->next; continue; } data = rht_obj(ht, head); if (!rhlist) goto out_unlock; list = container_of(obj, struct rhlist_head, rhead); plist = container_of(head, struct rhlist_head, rhead); RCU_INIT_POINTER(list->next, plist); head = rht_dereference_bucket(head->next, tbl, hash); RCU_INIT_POINTER(list->rhead.next, head); if (pprev) { rcu_assign_pointer(*pprev, obj); rht_unlock(tbl, bkt, flags); } else rht_assign_unlock(tbl, bkt, obj, flags); data = NULL; goto out; } if (elasticity <= 0) goto slow_path; data = ERR_PTR(-E2BIG); if (unlikely(rht_grow_above_max(ht, tbl))) goto out_unlock; if (unlikely(rht_grow_above_100(ht, tbl))) goto slow_path; /* Inserting at head of list makes unlocking free. */ head = rht_ptr(bkt, tbl, hash); RCU_INIT_POINTER(obj->next, head); if (rhlist) { struct rhlist_head *list; list = container_of(obj, struct rhlist_head, rhead); RCU_INIT_POINTER(list->next, NULL); } atomic_inc(&ht->nelems); rht_assign_unlock(tbl, bkt, obj, flags); if (rht_grow_above_75(ht, tbl)) schedule_work(&ht->run_work); data = NULL; out: rcu_read_unlock(); return data; out_unlock: rht_unlock(tbl, bkt, flags); goto out; } /** * rhashtable_insert_fast - insert object into hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * Will take the per bucket bitlock to protect against mutual mutations * on the same bucket. Multiple insertions may occur in parallel unless * they map to the same bucket. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ static __always_inline int rhashtable_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { void *ret; ret = __rhashtable_insert_fast(ht, NULL, obj, params, false); if (IS_ERR(ret)) return PTR_ERR(ret); return ret == NULL ? 0 : -EEXIST; } /** * rhltable_insert_key - insert object into hash list table * @hlt: hash list table * @key: the pointer to the key * @list: pointer to hash list head inside object * @params: hash table parameters * * Will take the per bucket bitlock to protect against mutual mutations * on the same bucket. Multiple insertions may occur in parallel unless * they map to the same bucket. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ static __always_inline int rhltable_insert_key( struct rhltable *hlt, const void *key, struct rhlist_head *list, const struct rhashtable_params params) { return PTR_ERR(__rhashtable_insert_fast(&hlt->ht, key, &list->rhead, params, true)); } /** * rhltable_insert - insert object into hash list table * @hlt: hash list table * @list: pointer to hash list head inside object * @params: hash table parameters * * Will take the per bucket bitlock to protect against mutual mutations * on the same bucket. Multiple insertions may occur in parallel unless * they map to the same bucket. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. 
*/ static __always_inline int rhltable_insert( struct rhltable *hlt, struct rhlist_head *list, const struct rhashtable_params params) { const char *key = rht_obj(&hlt->ht, &list->rhead); key += params.key_offset; return rhltable_insert_key(hlt, key, list, params); } /** * rhashtable_lookup_insert_fast - lookup and insert object into hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * This lookup function may only be used for fixed key hash table (key_len * parameter set). It will BUG() if used inappropriately. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ static __always_inline int rhashtable_lookup_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { const char *key = rht_obj(ht, obj); void *ret; BUG_ON(ht->p.obj_hashfn); ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params, false); if (IS_ERR(ret)) return PTR_ERR(ret); return ret == NULL ? 0 : -EEXIST; } /** * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * Just like rhashtable_lookup_insert_fast(), but this function returns the * object if it exists, NULL if it did not and the insertion was successful, * and an ERR_PTR otherwise. */ static __always_inline void *rhashtable_lookup_get_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { const char *key = rht_obj(ht, obj); BUG_ON(ht->p.obj_hashfn); return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params, false); } /** * rhashtable_lookup_insert_key - search and insert object to hash table * with explicit key * @ht: hash table * @key: key * @obj: pointer to hash head inside object * @params: hash table parameters * * Lookups may occur in parallel with hashtable mutations and resizing. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. * * Returns zero on success. */ static __always_inline int rhashtable_lookup_insert_key( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params) { void *ret; BUG_ON(!ht->p.obj_hashfn || !key); ret = __rhashtable_insert_fast(ht, key, obj, params, false); if (IS_ERR(ret)) return PTR_ERR(ret); return ret == NULL ? 0 : -EEXIST; } /** * rhashtable_lookup_get_insert_key - lookup and insert object into hash table * @ht: hash table * @key: key * @obj: pointer to hash head inside object * @params: hash table parameters * * Just like rhashtable_lookup_insert_key(), but this function returns the * object if it exists, NULL if it does not and the insertion was successful, * and an ERR_PTR otherwise. 
*/ static __always_inline void *rhashtable_lookup_get_insert_key( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params) { BUG_ON(!ht->p.obj_hashfn || !key); return __rhashtable_insert_fast(ht, key, obj, params, false); } /* Internal function, please use rhashtable_remove_fast() instead */ static __always_inline int __rhashtable_remove_fast_one( struct rhashtable *ht, struct bucket_table *tbl, struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) { struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; unsigned long flags; unsigned int hash; int err = -ENOENT; hash = rht_head_hashfn(ht, tbl, obj, params); bkt = rht_bucket_var(tbl, hash); if (!bkt) return -ENOENT; pprev = NULL; flags = rht_lock(tbl, bkt); rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { struct rhlist_head *list; list = container_of(he, struct rhlist_head, rhead); if (he != obj) { struct rhlist_head __rcu **lpprev; pprev = &he->next; if (!rhlist) continue; do { lpprev = &list->next; list = rht_dereference_bucket(list->next, tbl, hash); } while (list && obj != &list->rhead); if (!list) continue; list = rht_dereference_bucket(list->next, tbl, hash); RCU_INIT_POINTER(*lpprev, list); err = 0; break; } obj = rht_dereference_bucket(obj->next, tbl, hash); err = 1; if (rhlist) { list = rht_dereference_bucket(list->next, tbl, hash); if (list) { RCU_INIT_POINTER(list->rhead.next, obj); obj = &list->rhead; err = 0; } } if (pprev) { rcu_assign_pointer(*pprev, obj); rht_unlock(tbl, bkt, flags); } else { rht_assign_unlock(tbl, bkt, obj, flags); } goto unlocked; } rht_unlock(tbl, bkt, flags); unlocked: if (err > 0) { atomic_dec(&ht->nelems); if (unlikely(ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))) schedule_work(&ht->run_work); err = 0; } return err; } /* Internal function, please use rhashtable_remove_fast() instead */ static __always_inline int __rhashtable_remove_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) { struct bucket_table *tbl; int err; rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); /* Because we have already taken (and released) the bucket * lock in old_tbl, if we find that future_tbl is not yet * visible then that guarantees the entry to still be in * the old tbl if it exists. */ while ((err = __rhashtable_remove_fast_one(ht, tbl, obj, params, rhlist)) && (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) ; rcu_read_unlock(); return err; } /** * rhashtable_remove_fast - remove object from hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * Since the hash chain is single linked, the removal operation needs to * walk the bucket chain upon removal. The removal operation is thus * considerable slow if the hash table is not correctly sized. * * Will automatically shrink the table if permitted when residency drops * below 30%. * * Returns zero on success, -ENOENT if the entry could not be found. */ static __always_inline int rhashtable_remove_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { return __rhashtable_remove_fast(ht, obj, params, false); } /** * rhltable_remove - remove object from hash list table * @hlt: hash list table * @list: pointer to hash list head inside object * @params: hash table parameters * * Since the hash chain is single linked, the removal operation needs to * walk the bucket chain upon removal. 
The removal operation is thus * considerably slower if the hash table is not correctly sized. * * Will automatically shrink the table if permitted when residency drops * below 30% * * Returns zero on success, -ENOENT if the entry could not be found. */ static __always_inline int rhltable_remove( struct rhltable *hlt, struct rhlist_head *list, const struct rhashtable_params params) { return __rhashtable_remove_fast(&hlt->ht, &list->rhead, params, true); } /* Internal function, please use rhashtable_replace_fast() instead */ static __always_inline int __rhashtable_replace_fast( struct rhashtable *ht, struct bucket_table *tbl, struct rhash_head *obj_old, struct rhash_head *obj_new, const struct rhashtable_params params) { struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; unsigned long flags; unsigned int hash; int err = -ENOENT; /* Minimally, the old and new objects must have same hash * (which should mean identifiers are the same). */ hash = rht_head_hashfn(ht, tbl, obj_old, params); if (hash != rht_head_hashfn(ht, tbl, obj_new, params)) return -EINVAL; bkt = rht_bucket_var(tbl, hash); if (!bkt) return -ENOENT; pprev = NULL; flags = rht_lock(tbl, bkt); rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { if (he != obj_old) { pprev = &he->next; continue; } rcu_assign_pointer(obj_new->next, obj_old->next); if (pprev) { rcu_assign_pointer(*pprev, obj_new); rht_unlock(tbl, bkt, flags); } else { rht_assign_unlock(tbl, bkt, obj_new, flags); } err = 0; goto unlocked; } rht_unlock(tbl, bkt, flags); unlocked: return err; } /** * rhashtable_replace_fast - replace an object in hash table * @ht: hash table * @obj_old: pointer to hash head inside object being replaced * @obj_new: pointer to hash head inside object which is new * @params: hash table parameters * * Replacing an object doesn't affect the number of elements in the hash table * or bucket, so we don't need to worry about shrinking or expanding the * table here. * * Returns zero on success, -ENOENT if the entry could not be found, * -EINVAL if hash is not the same for the old and new objects. */ static __always_inline int rhashtable_replace_fast( struct rhashtable *ht, struct rhash_head *obj_old, struct rhash_head *obj_new, const struct rhashtable_params params) { struct bucket_table *tbl; int err; rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); /* Because we have already taken (and released) the bucket * lock in old_tbl, if we find that future_tbl is not yet * visible then that guarantees the entry to still be in * the old tbl if it exists. */ while ((err = __rhashtable_replace_fast(ht, tbl, obj_old, obj_new, params)) && (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) ; rcu_read_unlock(); return err; } /** * rhltable_walk_enter - Initialise an iterator * @hlt: Table to walk over * @iter: Hash table Iterator * * This function prepares a hash table walk. * * Note that if you restart a walk after rhashtable_walk_stop you * may see the same object twice. Also, you may miss objects if * there are removals in between rhashtable_walk_stop and the next * call to rhashtable_walk_start. * * For a completely stable walk you should construct your own data * structure outside the hash table. * * This function may be called from any process context, including * non-preemptable context, but cannot be called from softirq or * hardirq context. * * You must call rhashtable_walk_exit after this function returns. 
*/ static inline void rhltable_walk_enter(struct rhltable *hlt, struct rhashtable_iter *iter) { rhashtable_walk_enter(&hlt->ht, iter); } /** * rhltable_free_and_destroy - free elements and destroy hash list table * @hlt: the hash list table to destroy * @free_fn: callback to release resources of element * @arg: pointer passed to free_fn * * See documentation for rhashtable_free_and_destroy. */ static inline void rhltable_free_and_destroy(struct rhltable *hlt, void (*free_fn)(void *ptr, void *arg), void *arg) { rhashtable_free_and_destroy(&hlt->ht, free_fn, arg); } static inline void rhltable_destroy(struct rhltable *hlt) { rhltable_free_and_destroy(hlt, NULL, NULL); } #endif /* _LINUX_RHASHTABLE_H */ |
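The fast rhashtable API above is easiest to follow end to end. The following is a minimal usage sketch under stated assumptions: the demo_obj structure, demo_params layout and demo() caller are illustrative and not part of the header; error handling and RCU-deferred freeing are only hinted at in the comments.

#include <linux/rhashtable.h>
#include <linux/slab.h>

struct demo_obj {
	u32 key;			/* hashed key */
	struct rhash_head node;		/* bucket chain linkage */
};

static const struct rhashtable_params demo_params = {
	.key_len	     = sizeof(u32),
	.key_offset	     = offsetof(struct demo_obj, key),
	.head_offset	     = offsetof(struct demo_obj, node),
	.automatic_shrinking = true,
};

static int demo(void)
{
	struct rhashtable ht;
	struct demo_obj *obj, *found;
	int err;

	err = rhashtable_init(&ht, &demo_params);
	if (err)
		return err;

	obj = kzalloc(sizeof(*obj), GFP_KERNEL);
	if (!obj) {
		rhashtable_destroy(&ht);
		return -ENOMEM;
	}
	obj->key = 42;

	/* Insert; returns -EEXIST if an object with this key already exists. */
	err = rhashtable_insert_fast(&ht, &obj->node, demo_params);
	if (err)
		goto out;

	/* rhashtable_lookup_fast() takes the RCU read lock internally. */
	found = rhashtable_lookup_fast(&ht, &obj->key, demo_params);
	WARN_ON(found != obj);

	/* Removal walks the singly linked bucket chain for this hash. */
	err = rhashtable_remove_fast(&ht, &obj->node, demo_params);
out:
	/* Real users must defer freeing past an RCU grace period (kfree_rcu()). */
	kfree(obj);
	rhashtable_destroy(&ht);
	return err;
}

For keys that may occur more than once, the rhltable_* variants above keep all matches on an rhlist and hand the whole list back through rhltable_lookup().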
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MMAP_LOCK_H #define _LINUX_MMAP_LOCK_H /* Avoid a dependency loop by declaring here. 
*/ extern int rcuwait_wake_up(struct rcuwait *w); #include <linux/lockdep.h> #include <linux/mm_types.h> #include <linux/mmdebug.h> #include <linux/rwsem.h> #include <linux/tracepoint-defs.h> #include <linux/types.h> #include <linux/cleanup.h> #include <linux/sched/mm.h> #define MMAP_LOCK_INITIALIZER(name) \ .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock), DECLARE_TRACEPOINT(mmap_lock_start_locking); DECLARE_TRACEPOINT(mmap_lock_acquire_returned); DECLARE_TRACEPOINT(mmap_lock_released); #ifdef CONFIG_TRACING void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write); void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write, bool success); void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write); static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm, bool write) { if (tracepoint_enabled(mmap_lock_start_locking)) __mmap_lock_do_trace_start_locking(mm, write); } static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm, bool write, bool success) { if (tracepoint_enabled(mmap_lock_acquire_returned)) __mmap_lock_do_trace_acquire_returned(mm, write, success); } static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write) { if (tracepoint_enabled(mmap_lock_released)) __mmap_lock_do_trace_released(mm, write); } #else /* !CONFIG_TRACING */ static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm, bool write) { } static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm, bool write, bool success) { } static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write) { } #endif /* CONFIG_TRACING */ static inline void mmap_assert_locked(const struct mm_struct *mm) { rwsem_assert_held(&mm->mmap_lock); } static inline void mmap_assert_write_locked(const struct mm_struct *mm) { rwsem_assert_held_write(&mm->mmap_lock); } #ifdef CONFIG_PER_VMA_LOCK #ifdef CONFIG_LOCKDEP #define __vma_lockdep_map(vma) (&vma->vmlock_dep_map) #else #define __vma_lockdep_map(vma) NULL #endif /* * VMA locks do not behave like most ordinary locks found in the kernel, so we * cannot quite have full lockdep tracking in the way we would ideally prefer. * * Read locks act as shared locks which exclude an exclusive lock being * taken. We therefore mark these accordingly on read lock acquire/release. * * Write locks are acquired exclusively per-VMA, but released in a shared * fashion, that is upon vma_end_write_all(), we update the mmap's seqcount such * that write lock is released. * * We therefore cannot track write locks per-VMA, nor do we try. Mitigating this * is the fact that, of course, we do lockdep-track the mmap lock rwsem which * must be held when taking a VMA write lock. * * We do, however, want to indicate that during either acquisition of a VMA * write lock or detachment of a VMA that we require the lock held be exclusive, * so we utilise lockdep to do so. */ #define __vma_lockdep_acquire_read(vma) \ lock_acquire_shared(__vma_lockdep_map(vma), 0, 1, NULL, _RET_IP_) #define __vma_lockdep_release_read(vma) \ lock_release(__vma_lockdep_map(vma), _RET_IP_) #define __vma_lockdep_acquire_exclusive(vma) \ lock_acquire_exclusive(__vma_lockdep_map(vma), 0, 0, NULL, _RET_IP_) #define __vma_lockdep_release_exclusive(vma) \ lock_release(__vma_lockdep_map(vma), _RET_IP_) /* Only meaningful if CONFIG_LOCK_STAT is defined. 
*/ #define __vma_lockdep_stat_mark_acquired(vma) \ lock_acquired(__vma_lockdep_map(vma), _RET_IP_) static inline void mm_lock_seqcount_init(struct mm_struct *mm) { seqcount_init(&mm->mm_lock_seq); } static inline void mm_lock_seqcount_begin(struct mm_struct *mm) { do_raw_write_seqcount_begin(&mm->mm_lock_seq); } static inline void mm_lock_seqcount_end(struct mm_struct *mm) { ASSERT_EXCLUSIVE_WRITER(mm->mm_lock_seq); do_raw_write_seqcount_end(&mm->mm_lock_seq); } static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) { /* * Since mmap_lock is a sleeping lock, and waiting for it to become * unlocked is more or less equivalent with taking it ourselves, don't * bother with the speculative path if mmap_lock is already write-locked * and take the slow path, which takes the lock. */ return raw_seqcount_try_begin(&mm->mm_lock_seq, *seq); } static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) { return read_seqcount_retry(&mm->mm_lock_seq, seq); } static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) { #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key lockdep_key; lockdep_init_map(__vma_lockdep_map(vma), "vm_lock", &lockdep_key, 0); #endif if (reset_refcnt) refcount_set(&vma->vm_refcnt, 0); vma->vm_lock_seq = UINT_MAX; } /* * This function determines whether the input VMA reference count describes a * VMA which has excluded all VMA read locks. * * In the case of a detached VMA, we may incorrectly indicate that readers are * excluded when one remains, because in that scenario we target a refcount of * VM_REFCNT_EXCLUDE_READERS_FLAG, rather than the attached target of * VM_REFCNT_EXCLUDE_READERS_FLAG + 1. * * However, the race window for that is very small so it is unlikely. * * Returns: true if readers are excluded, false otherwise. */ static inline bool __vma_are_readers_excluded(int refcnt) { /* * See the comment describing the vm_area_struct->vm_refcnt field for * details of possible refcnt values. */ return (refcnt & VM_REFCNT_EXCLUDE_READERS_FLAG) && refcnt <= VM_REFCNT_EXCLUDE_READERS_FLAG + 1; } /* * Actually decrement the VMA reference count. * * The function returns the reference count as it was immediately after the * decrement took place. If it returns zero, the VMA is now detached. */ static inline __must_check unsigned int __vma_refcount_put_return(struct vm_area_struct *vma) { int oldcnt; if (__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) return 0; return oldcnt - 1; } /** * vma_refcount_put() - Drop reference count in VMA vm_refcnt field due to a * read-lock being dropped. * @vma: The VMA whose reference count we wish to decrement. * * If we were the last reader, wake up threads waiting to obtain an exclusive * lock. */ static inline void vma_refcount_put(struct vm_area_struct *vma) { /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt. */ struct mm_struct *mm = vma->vm_mm; int newcnt; __vma_lockdep_release_read(vma); newcnt = __vma_refcount_put_return(vma); /* * __vma_start_exclude_readers() may be sleeping waiting for readers to * drop their reference count, so wake it up if we were the last reader * blocking it from being acquired. * * We may be raced by other readers temporarily incrementing the * reference count, though the race window is very small, this might * cause spurious wakeups. 
*/ if (newcnt && __vma_are_readers_excluded(newcnt)) rcuwait_wake_up(&mm->vma_writer_wait); } /* * Use only while holding mmap read lock which guarantees that locking will not * fail (nobody can concurrently write-lock the vma). vma_start_read() should * not be used in such cases because it might fail due to mm_lock_seq overflow. * This functionality is used to obtain vma read lock and drop the mmap read lock. */ static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) { int oldcnt; mmap_assert_locked(vma->vm_mm); if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, VM_REFCNT_LIMIT))) return false; __vma_lockdep_acquire_read(vma); return true; } /* * Use only while holding mmap read lock which guarantees that locking will not * fail (nobody can concurrently write-lock the vma). vma_start_read() should * not be used in such cases because it might fail due to mm_lock_seq overflow. * This functionality is used to obtain vma read lock and drop the mmap read lock. */ static inline bool vma_start_read_locked(struct vm_area_struct *vma) { return vma_start_read_locked_nested(vma, 0); } static inline void vma_end_read(struct vm_area_struct *vma) { vma_refcount_put(vma); } static inline unsigned int __vma_raw_mm_seqnum(struct vm_area_struct *vma) { const struct mm_struct *mm = vma->vm_mm; /* We must hold an exclusive write lock for this access to be valid. */ mmap_assert_write_locked(vma->vm_mm); return mm->mm_lock_seq.sequence; } /* * Determine whether a VMA is write-locked. Must be invoked ONLY if the mmap * write lock is held. * * Returns true if write-locked, otherwise false. */ static inline bool __is_vma_write_locked(struct vm_area_struct *vma) { /* * current task is holding mmap_write_lock, both vma->vm_lock_seq and * mm->mm_lock_seq can't be concurrently modified. */ return vma->vm_lock_seq == __vma_raw_mm_seqnum(vma); } int __vma_start_write(struct vm_area_struct *vma, int state); /* * Begin writing to a VMA. * Exclude concurrent readers under the per-VMA lock until the currently * write-locked mmap_lock is dropped or downgraded. */ static inline void vma_start_write(struct vm_area_struct *vma) { if (__is_vma_write_locked(vma)) return; __vma_start_write(vma, TASK_UNINTERRUPTIBLE); } /** * vma_start_write_killable - Begin writing to a VMA. * @vma: The VMA we are going to modify. * * Exclude concurrent readers under the per-VMA lock until the currently * write-locked mmap_lock is dropped or downgraded. * * Context: May sleep while waiting for readers to drop the vma read lock. * Caller must already hold the mmap_lock for write. * * Return: 0 for a successful acquisition. -EINTR if a fatal signal was * received. */ static inline __must_check int vma_start_write_killable(struct vm_area_struct *vma) { if (__is_vma_write_locked(vma)) return 0; return __vma_start_write(vma, TASK_KILLABLE); } /** * vma_assert_write_locked() - assert that @vma holds a VMA write lock. * @vma: The VMA to assert. */ static inline void vma_assert_write_locked(struct vm_area_struct *vma) { VM_WARN_ON_ONCE_VMA(!__is_vma_write_locked(vma), vma); } /** * vma_assert_locked() - assert that @vma holds either a VMA read or a VMA write * lock and is not detached. * @vma: The VMA to assert. 
*/ static inline void vma_assert_locked(struct vm_area_struct *vma) { unsigned int refcnt; if (IS_ENABLED(CONFIG_LOCKDEP)) { if (!lock_is_held(__vma_lockdep_map(vma))) vma_assert_write_locked(vma); return; } /* * See the comment describing the vm_area_struct->vm_refcnt field for * details of possible refcnt values. */ refcnt = refcount_read(&vma->vm_refcnt); /* * In this case we're either read-locked, write-locked with temporary * readers, or in the midst of excluding readers, all of which means * we're locked. */ if (refcnt > 1) return; /* It is a bug for the VMA to be detached here. */ VM_WARN_ON_ONCE_VMA(!refcnt, vma); /* * OK, the VMA has a reference count of 1 which means it is either * unlocked and attached or write-locked, so assert that it is * write-locked. */ vma_assert_write_locked(vma); } /** * vma_assert_stabilised() - assert that this VMA cannot be changed from * underneath us either by having a VMA or mmap lock held. * @vma: The VMA whose stability we wish to assess. * * If lockdep is enabled we can precisely ensure stability via either an mmap * lock owned by us or a specific VMA lock. * * With lockdep disabled we may sometimes race with other threads acquiring the * mmap read lock simultaneous with our VMA read lock. */ static inline void vma_assert_stabilised(struct vm_area_struct *vma) { /* * If another thread owns an mmap lock, it may go away at any time, and * thus is no guarantee of stability. * * If lockdep is enabled we can accurately determine if an mmap lock is * held and owned by us. Otherwise we must approximate. * * It doesn't necessarily mean we are not stabilised however, as we may * hold a VMA read lock (not a write lock as this would require an owned * mmap lock). * * If (assuming lockdep is not enabled) we were to assert a VMA read * lock first we may also run into issues, as other threads can hold VMA * read locks simlutaneous to us. * * Therefore if lockdep is not enabled we risk a false negative (i.e. no * assert fired). If accurate checking is required, enable lockdep. */ if (IS_ENABLED(CONFIG_LOCKDEP)) { if (lockdep_is_held(&vma->vm_mm->mmap_lock)) return; } else { if (rwsem_is_locked(&vma->vm_mm->mmap_lock)) return; } /* * We're not stabilised by the mmap lock, so assert that we're * stabilised by a VMA lock. */ vma_assert_locked(vma); } static inline bool vma_is_attached(struct vm_area_struct *vma) { return refcount_read(&vma->vm_refcnt); } /* * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these * assertions should be made either under mmap_write_lock or when the object * has been isolated under mmap_write_lock, ensuring no competing writers. */ static inline void vma_assert_attached(struct vm_area_struct *vma) { WARN_ON_ONCE(!vma_is_attached(vma)); } static inline void vma_assert_detached(struct vm_area_struct *vma) { WARN_ON_ONCE(vma_is_attached(vma)); } static inline void vma_mark_attached(struct vm_area_struct *vma) { vma_assert_write_locked(vma); vma_assert_detached(vma); refcount_set_release(&vma->vm_refcnt, 1); } void __vma_exclude_readers_for_detach(struct vm_area_struct *vma); static inline void vma_mark_detached(struct vm_area_struct *vma) { vma_assert_write_locked(vma); vma_assert_attached(vma); /* * The VMA still being attached (refcnt > 0) - is unlikely, because the * vma has been already write-locked and readers can increment vm_refcnt * only temporarily before they check vm_lock_seq, realize the vma is * locked and drop back the vm_refcnt. That is a narrow window for * observing a raised vm_refcnt. 
* * See the comment describing the vm_area_struct->vm_refcnt field for * details of possible refcnt values. */ if (likely(!__vma_refcount_put_return(vma))) return; __vma_exclude_readers_for_detach(vma); } struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address); /* * Locks next vma pointed by the iterator. Confirms the locked vma has not * been modified and will retry under mmap_lock protection if modification * was detected. Should be called from read RCU section. * Returns either a valid locked VMA, NULL if no more VMAs or -EINTR if the * process was interrupted. */ struct vm_area_struct *lock_next_vma(struct mm_struct *mm, struct vma_iterator *iter, unsigned long address); #else /* CONFIG_PER_VMA_LOCK */ static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {} static inline void mm_lock_seqcount_end(struct mm_struct *mm) {} static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) { return false; } static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) { return true; } static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} static inline __must_check int vma_start_write_killable(struct vm_area_struct *vma) { return 0; } static inline void vma_assert_write_locked(struct vm_area_struct *vma) { mmap_assert_write_locked(vma->vm_mm); } static inline void vma_assert_attached(struct vm_area_struct *vma) {} static inline void vma_assert_detached(struct vm_area_struct *vma) {} static inline void vma_mark_attached(struct vm_area_struct *vma) {} static inline void vma_mark_detached(struct vm_area_struct *vma) {} static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address) { return NULL; } static inline void vma_assert_locked(struct vm_area_struct *vma) { mmap_assert_locked(vma->vm_mm); } static inline void vma_assert_stabilised(struct vm_area_struct *vma) { /* If no VMA locks, then either mmap lock suffices to stabilise. */ mmap_assert_locked(vma->vm_mm); } #endif /* CONFIG_PER_VMA_LOCK */ static inline void mmap_write_lock(struct mm_struct *mm) { __mmap_lock_trace_start_locking(mm, true); down_write(&mm->mmap_lock); mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, true); } static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) { __mmap_lock_trace_start_locking(mm, true); down_write_nested(&mm->mmap_lock, subclass); mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, true); } static inline int __must_check mmap_write_lock_killable(struct mm_struct *mm) { int ret; __mmap_lock_trace_start_locking(mm, true); ret = down_write_killable(&mm->mmap_lock); if (!ret) mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, ret == 0); return ret; } /* * Drop all currently-held per-VMA locks. * This is called from the mmap_lock implementation directly before releasing * a write-locked mmap_lock (or downgrading it to read-locked). * This should normally NOT be called manually from other places. * If you want to call this manually anyway, keep in mind that this will release * *all* VMA write locks, including ones from further up the stack. 
*/ static inline void vma_end_write_all(struct mm_struct *mm) { mmap_assert_write_locked(mm); mm_lock_seqcount_end(mm); } static inline void mmap_write_unlock(struct mm_struct *mm) { __mmap_lock_trace_released(mm, true); vma_end_write_all(mm); up_write(&mm->mmap_lock); } static inline void mmap_write_downgrade(struct mm_struct *mm) { __mmap_lock_trace_acquire_returned(mm, false, true); vma_end_write_all(mm); downgrade_write(&mm->mmap_lock); } static inline void mmap_read_lock(struct mm_struct *mm) { __mmap_lock_trace_start_locking(mm, false); down_read(&mm->mmap_lock); __mmap_lock_trace_acquire_returned(mm, false, true); } static inline int __must_check mmap_read_lock_killable(struct mm_struct *mm) { int ret; __mmap_lock_trace_start_locking(mm, false); ret = down_read_killable(&mm->mmap_lock); __mmap_lock_trace_acquire_returned(mm, false, ret == 0); return ret; } static inline bool __must_check mmap_read_trylock(struct mm_struct *mm) { bool ret; __mmap_lock_trace_start_locking(mm, false); ret = down_read_trylock(&mm->mmap_lock) != 0; __mmap_lock_trace_acquire_returned(mm, false, ret); return ret; } static inline void mmap_read_unlock(struct mm_struct *mm) { __mmap_lock_trace_released(mm, false); up_read(&mm->mmap_lock); } DEFINE_GUARD(mmap_read_lock, struct mm_struct *, mmap_read_lock(_T), mmap_read_unlock(_T)) static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) { __mmap_lock_trace_released(mm, false); up_read_non_owner(&mm->mmap_lock); } static inline int mmap_lock_is_contended(struct mm_struct *mm) { return rwsem_is_contended(&mm->mmap_lock); } #endif /* _LINUX_MMAP_LOCK_H */ |
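To make the intended interplay between the mmap lock and the per-VMA lock concrete, here is a minimal sketch under stated assumptions: demo_fault() and demo_modify() are illustrative helpers, not existing kernel functions, and real fault handling does far more than the placeholder comments suggest.

#include <linux/mm.h>
#include <linux/mmap_lock.h>

/* Fault-style reader: try the per-VMA read lock first, then fall back. */
static int demo_fault(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma;

	/* Returns a read-locked, attached VMA, or NULL if we must fall back. */
	vma = lock_vma_under_rcu(mm, addr);
	if (vma) {
		/* ... service the access under the VMA read lock ... */
		vma_end_read(vma);
		return 0;
	}

	mmap_read_lock(mm);
	vma = find_vma(mm, addr);
	if (vma) {
		/* ... service the access under the mmap read lock ... */
	}
	mmap_read_unlock(mm);
	return vma ? 0 : -EFAULT;
}

/* Writer: VMA write locks are only ever taken under the mmap write lock. */
static void demo_modify(struct mm_struct *mm, struct vm_area_struct *vma)
{
	mmap_write_lock(mm);
	vma_start_write(vma);	/* exclude per-VMA readers */
	/* ... modify the VMA ... */
	mmap_write_unlock(mm);	/* vma_end_write_all() drops the VMA write locks */
}

Note there is no per-VMA vma_end_write(): write locks are released collectively by vma_end_write_all() when the mmap write lock is dropped or downgraded, which is why vma_start_write() only requires that the mmap write lock be held.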
// SPDX-License-Identifier: GPL-2.0 /* * drivers/usb/core/sysfs.c * * (C) Copyright 2002 David Brownell * (C) Copyright 2002,2004 Greg Kroah-Hartman * (C) Copyright 2002,2004 IBM Corp. * * All of the sysfs file attributes for usb devices and interfaces. * * Released under the GPLv2 only. 
*/ #include <linux/kernel.h> #include <linux/kstrtox.h> #include <linux/string.h> #include <linux/usb.h> #include <linux/usb/hcd.h> #include <linux/usb/quirks.h> #include <linux/of.h> #include "usb.h" /* Active configuration fields */ #define usb_actconfig_show(field, format_string) \ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct usb_device *udev; \ struct usb_host_config *actconfig; \ ssize_t rc; \ \ udev = to_usb_device(dev); \ rc = usb_lock_device_interruptible(udev); \ if (rc < 0) \ return -EINTR; \ actconfig = udev->actconfig; \ if (actconfig) \ rc = sysfs_emit(buf, format_string, \ actconfig->desc.field); \ usb_unlock_device(udev); \ return rc; \ } \ #define usb_actconfig_attr(field, format_string) \ usb_actconfig_show(field, format_string) \ static DEVICE_ATTR_RO(field) usb_actconfig_attr(bNumInterfaces, "%2d\n"); usb_actconfig_attr(bmAttributes, "%2x\n"); static ssize_t bMaxPower_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_device *udev; struct usb_host_config *actconfig; ssize_t rc; udev = to_usb_device(dev); rc = usb_lock_device_interruptible(udev); if (rc < 0) return -EINTR; actconfig = udev->actconfig; if (actconfig) rc = sysfs_emit(buf, "%dmA\n", usb_get_max_power(udev, actconfig)); usb_unlock_device(udev); return rc; } static DEVICE_ATTR_RO(bMaxPower); static ssize_t configuration_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_device *udev; struct usb_host_config *actconfig; ssize_t rc; udev = to_usb_device(dev); rc = usb_lock_device_interruptible(udev); if (rc < 0) return -EINTR; actconfig = udev->actconfig; if (actconfig && actconfig->string) rc = sysfs_emit(buf, "%s\n", actconfig->string); usb_unlock_device(udev); return rc; } static DEVICE_ATTR_RO(configuration); /* configuration value is always present, and r/w */ usb_actconfig_show(bConfigurationValue, "%u\n"); static ssize_t bConfigurationValue_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_device *udev = to_usb_device(dev); int config, value, rc; if (sscanf(buf, "%d", &config) != 1 || config < -1 || config > 255) return -EINVAL; rc = usb_lock_device_interruptible(udev); if (rc < 0) return -EINTR; value = usb_set_configuration(udev, config); usb_unlock_device(udev); return (value < 0) ? 
value : count; } static DEVICE_ATTR_IGNORE_LOCKDEP(bConfigurationValue, S_IRUGO | S_IWUSR, bConfigurationValue_show, bConfigurationValue_store); #ifdef CONFIG_OF static ssize_t devspec_show(struct device *dev, struct device_attribute *attr, char *buf) { struct device_node *of_node = dev->of_node; return sysfs_emit(buf, "%pOF\n", of_node); } static DEVICE_ATTR_RO(devspec); #endif /* String fields */ #define usb_string_attr(name) \ static ssize_t name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct usb_device *udev; \ int retval; \ \ udev = to_usb_device(dev); \ retval = usb_lock_device_interruptible(udev); \ if (retval < 0) \ return -EINTR; \ retval = sysfs_emit(buf, "%s\n", udev->name); \ usb_unlock_device(udev); \ return retval; \ } \ static DEVICE_ATTR_RO(name) usb_string_attr(product); usb_string_attr(manufacturer); usb_string_attr(serial); static ssize_t speed_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_device *udev; char *speed; udev = to_usb_device(dev); switch (udev->speed) { case USB_SPEED_LOW: speed = "1.5"; break; case USB_SPEED_UNKNOWN: case USB_SPEED_FULL: speed = "12"; break; case USB_SPEED_HIGH: speed = "480"; break; case USB_SPEED_SUPER: speed = "5000"; break; case USB_SPEED_SUPER_PLUS: if (udev->ssp_rate == USB_SSP_GEN_2x2) speed = "20000"; else speed = "10000"; break; default: speed = "unknown"; } return sysfs_emit(buf, "%s\n", speed); } static DEVICE_ATTR_RO(speed); static ssize_t rx_lanes_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_device *udev; udev = to_usb_device(dev); return sysfs_emit(buf, "%d\n", udev->rx_lanes); } static DEVICE_ATTR_RO(rx_lanes); static ssize_t tx_lanes_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_device *udev; udev = to_usb_device(dev); return sysfs_emit(buf, "%d\n", udev->tx_lanes); } static DEVICE_ATTR_RO(tx_lanes); static ssize_t busnum_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_device *udev; udev = to_usb_device(dev); return sysfs_emit(buf, "%d\n", udev->bus->busnum); } static DEVICE_ATTR_RO(busnum); static ssize_t devnum_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_device *udev; udev = to_usb_device(dev); return sysfs_emit(buf, "%d\n", udev->devnum); } static DEVICE_ATTR_RO(devnum); static ssize_t devpath_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_device *udev; udev = to_usb_device(dev); return sysfs_emit(buf, "%s\n", udev->devpath); } static DEVICE_ATTR_RO(devpath); static ssize_t version_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_device *udev; u16 bcdUSB; udev = to_usb_device(dev); bcdUSB = le16_to_cpu(udev->descriptor.bcdUSB); return sysfs_emit(buf, "%2x.%02x\n", bcdUSB >> 8, bcdUSB & 0xff); } static DEVICE_ATTR_RO(version); static ssize_t maxchild_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_device *udev; udev = to_usb_device(dev); return sysfs_emit(buf, "%d\n", udev->maxchild); } static DEVICE_ATTR_RO(maxchild); static ssize_t quirks_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_device *udev; udev = to_usb_device(dev); return sysfs_emit(buf, "0x%x\n", udev->quirks); } static DEVICE_ATTR_RO(quirks); static ssize_t avoid_reset_quirk_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_device *udev; udev = to_usb_device(dev); return sysfs_emit(buf, "%d\n", 
!!(udev->quirks & USB_QUIRK_RESET)); } static ssize_t avoid_reset_quirk_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_device *udev = to_usb_device(dev); bool val; int rc; if (kstrtobool(buf, &val) != 0) return -EINVAL; rc = usb_lock_device_interruptible(udev); if (rc < 0) return -EINTR; if (val) udev->quirks |= USB_QUIRK_RESET; else udev->quirks &= ~USB_QUIRK_RESET; usb_unlock_device(udev); return count; } static DEVICE_ATTR_RW(avoid_reset_quirk); static ssize_t urbnum_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_device *udev; udev = to_usb_device(dev); return sysfs_emit(buf, "%d\n", atomic_read(&udev->urbnum)); } static DEVICE_ATTR_RO(urbnum); static ssize_t ltm_capable_show(struct device *dev, struct device_attribute *attr, char *buf) { if (usb_device_supports_ltm(to_usb_device(dev))) return sysfs_emit(buf, "%s\n", "yes"); return sysfs_emit(buf, "%s\n", "no"); } static DEVICE_ATTR_RO(ltm_capable); #ifdef CONFIG_PM static ssize_t persist_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_device *udev = to_usb_device(dev); return sysfs_emit(buf, "%d\n", udev->persist_enabled); } static ssize_t persist_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_device *udev = to_usb_device(dev); bool value; int rc; /* Hubs are always enabled for USB_PERSIST */ if (udev->descriptor.bDeviceClass == USB_CLASS_HUB) return -EPERM; if (kstrtobool(buf, &value) != 0) return -EINVAL; rc = usb_lock_device_interruptible(udev); if (rc < 0) return -EINTR; udev->persist_enabled = !!value; usb_unlock_device(udev); return count; } static DEVICE_ATTR_RW(persist); static int add_persist_attributes(struct device *dev) { int rc = 0; if (is_usb_device(dev)) { struct usb_device *udev = to_usb_device(dev); /* Hubs are automatically enabled for USB_PERSIST, * no point in creating the attribute file. */ if (udev->descriptor.bDeviceClass != USB_CLASS_HUB) rc = sysfs_add_file_to_group(&dev->kobj, &dev_attr_persist.attr, power_group_name); } return rc; } static void remove_persist_attributes(struct device *dev) { sysfs_remove_file_from_group(&dev->kobj, &dev_attr_persist.attr, power_group_name); } static ssize_t connected_duration_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_device *udev = to_usb_device(dev); return sysfs_emit(buf, "%u\n", jiffies_to_msecs(jiffies - udev->connect_time)); } static DEVICE_ATTR_RO(connected_duration); /* * If the device is resumed, the last time the device was suspended has * been pre-subtracted from active_duration. We add the current time to * get the duration that the device was actually active. * * If the device is suspended, the active_duration is up-to-date. 
*/ static ssize_t active_duration_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_device *udev = to_usb_device(dev); int duration; if (udev->state != USB_STATE_SUSPENDED) duration = jiffies_to_msecs(jiffies + udev->active_duration); else duration = jiffies_to_msecs(udev->active_duration); return sysfs_emit(buf, "%u\n", duration); } static DEVICE_ATTR_RO(active_duration); static ssize_t autosuspend_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", dev->power.autosuspend_delay / 1000); } static ssize_t autosuspend_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int value; if (sscanf(buf, "%d", &value) != 1 || value >= INT_MAX/1000 || value <= -INT_MAX/1000) return -EINVAL; pm_runtime_set_autosuspend_delay(dev, value * 1000); return count; } static DEVICE_ATTR_RW(autosuspend); static const char on_string[] = "on"; static const char auto_string[] = "auto"; static void warn_level(void) { static int level_warned; if (!level_warned) { level_warned = 1; printk(KERN_WARNING "WARNING! power/level is deprecated; " "use power/control instead\n"); } } static ssize_t level_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_device *udev = to_usb_device(dev); const char *p = auto_string; warn_level(); if (udev->state != USB_STATE_SUSPENDED && !udev->dev.power.runtime_auto) p = on_string; return sysfs_emit(buf, "%s\n", p); } static ssize_t level_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_device *udev = to_usb_device(dev); int len = count; char *cp; int rc = count; int rv; warn_level(); cp = memchr(buf, '\n', count); if (cp) len = cp - buf; rv = usb_lock_device_interruptible(udev); if (rv < 0) return -EINTR; if (len == sizeof on_string - 1 && strncmp(buf, on_string, len) == 0) usb_disable_autosuspend(udev); else if (len == sizeof auto_string - 1 && strncmp(buf, auto_string, len) == 0) usb_enable_autosuspend(udev); else rc = -EINVAL; usb_unlock_device(udev); return rc; } static DEVICE_ATTR_RW(level); static ssize_t usb2_hardware_lpm_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_device *udev = to_usb_device(dev); const char *p; if (udev->usb2_hw_lpm_allowed == 1) p = "enabled"; else p = "disabled"; return sysfs_emit(buf, "%s\n", p); } static ssize_t usb2_hardware_lpm_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_device *udev = to_usb_device(dev); bool value; int ret; ret = usb_lock_device_interruptible(udev); if (ret < 0) return -EINTR; ret = kstrtobool(buf, &value); if (!ret) { udev->usb2_hw_lpm_allowed = value; if (value) ret = usb_enable_usb2_hardware_lpm(udev); else ret = usb_disable_usb2_hardware_lpm(udev); } usb_unlock_device(udev); if (!ret) return count; return ret; } static DEVICE_ATTR_RW(usb2_hardware_lpm); static ssize_t usb2_lpm_l1_timeout_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_device *udev = to_usb_device(dev); return sysfs_emit(buf, "%d\n", udev->l1_params.timeout); } static ssize_t usb2_lpm_l1_timeout_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_device *udev = to_usb_device(dev); u16 timeout; if (kstrtou16(buf, 0, &timeout)) return -EINVAL; udev->l1_params.timeout = timeout; return count; } static DEVICE_ATTR_RW(usb2_lpm_l1_timeout); static ssize_t usb2_lpm_besl_show(struct device *dev, struct 
device_attribute *attr, char *buf) { struct usb_device *udev = to_usb_device(dev); return sysfs_emit(buf, "%d\n", udev->l1_params.besl); } static ssize_t usb2_lpm_besl_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_device *udev = to_usb_device(dev); u8 besl; if (kstrtou8(buf, 0, &besl) || besl > 15) return -EINVAL; udev->l1_params.besl = besl; return count; } static DEVICE_ATTR_RW(usb2_lpm_besl); static ssize_t usb3_hardware_lpm_u1_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_device *udev = to_usb_device(dev); const char *p; int rc; rc = usb_lock_device_interruptible(udev); if (rc < 0) return -EINTR; if (udev->usb3_lpm_u1_enabled) p = "enabled"; else p = "disabled"; usb_unlock_device(udev); return sysfs_emit(buf, "%s\n", p); } static DEVICE_ATTR_RO(usb3_hardware_lpm_u1); static ssize_t usb3_hardware_lpm_u2_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_device *udev = to_usb_device(dev); const char *p; int rc; rc = usb_lock_device_interruptible(udev); if (rc < 0) return -EINTR; if (udev->usb3_lpm_u2_enabled) p = "enabled"; else p = "disabled"; usb_unlock_device(udev); return sysfs_emit(buf, "%s\n", p); } static DEVICE_ATTR_RO(usb3_hardware_lpm_u2); static struct attribute *usb2_hardware_lpm_attr[] = { &dev_attr_usb2_hardware_lpm.attr, &dev_attr_usb2_lpm_l1_timeout.attr, &dev_attr_usb2_lpm_besl.attr, NULL, }; static const struct attribute_group usb2_hardware_lpm_attr_group = { .name = power_group_name, .attrs = usb2_hardware_lpm_attr, }; static struct attribute *usb3_hardware_lpm_attr[] = { &dev_attr_usb3_hardware_lpm_u1.attr, &dev_attr_usb3_hardware_lpm_u2.attr, NULL, }; static const struct attribute_group usb3_hardware_lpm_attr_group = { .name = power_group_name, .attrs = usb3_hardware_lpm_attr, }; static struct attribute *power_attrs[] = { &dev_attr_autosuspend.attr, &dev_attr_level.attr, &dev_attr_connected_duration.attr, &dev_attr_active_duration.attr, NULL, }; static const struct attribute_group power_attr_group = { .name = power_group_name, .attrs = power_attrs, }; static int add_power_attributes(struct device *dev) { int rc = 0; if (is_usb_device(dev)) { struct usb_device *udev = to_usb_device(dev); rc = sysfs_merge_group(&dev->kobj, &power_attr_group); if (udev->usb2_hw_lpm_capable == 1) rc = sysfs_merge_group(&dev->kobj, &usb2_hardware_lpm_attr_group); if ((udev->speed == USB_SPEED_SUPER || udev->speed == USB_SPEED_SUPER_PLUS) && udev->lpm_capable == 1) rc = sysfs_merge_group(&dev->kobj, &usb3_hardware_lpm_attr_group); } return rc; } static void remove_power_attributes(struct device *dev) { sysfs_unmerge_group(&dev->kobj, &usb3_hardware_lpm_attr_group); sysfs_unmerge_group(&dev->kobj, &usb2_hardware_lpm_attr_group); sysfs_unmerge_group(&dev->kobj, &power_attr_group); } #else #define add_persist_attributes(dev) 0 #define remove_persist_attributes(dev) do {} while (0) #define add_power_attributes(dev) 0 #define remove_power_attributes(dev) do {} while (0) #endif /* CONFIG_PM */ /* Descriptor fields */ #define usb_descriptor_attr_le16(field, format_string) \ static ssize_t \ field##_show(struct device *dev, struct device_attribute *attr, \ char *buf) \ { \ struct usb_device *udev; \ \ udev = to_usb_device(dev); \ return sysfs_emit(buf, format_string, \ le16_to_cpu(udev->descriptor.field)); \ } \ static DEVICE_ATTR_RO(field) usb_descriptor_attr_le16(idVendor, "%04x\n"); usb_descriptor_attr_le16(idProduct, "%04x\n"); usb_descriptor_attr_le16(bcdDevice, "%04x\n"); 
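The descriptor attributes generated above surface as plain-text files under sysfs. For reference, a userspace read might look like the sketch below; the device path "1-1" is a hypothetical example, since real paths depend on bus topology, and this snippet is not part of the driver itself.

#include <stdio.h>

int main(void)
{
	/* Hypothetical device path; enumerate /sys/bus/usb/devices/ in practice. */
	FILE *f = fopen("/sys/bus/usb/devices/1-1/idVendor", "r");
	char buf[16];

	if (f && fgets(buf, sizeof(buf), f))
		printf("idVendor: %s", buf);	/* four hex digits, e.g. "1d6b" */
	if (f)
		fclose(f);
	return 0;
}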
#define usb_descriptor_attr(field, format_string) \ static ssize_t \ field##_show(struct device *dev, struct device_attribute *attr, \ char *buf) \ { \ struct usb_device *udev; \ \ udev = to_usb_device(dev); \ return sysfs_emit(buf, format_string, udev->descriptor.field); \ } \ static DEVICE_ATTR_RO(field) usb_descriptor_attr(bDeviceClass, "%02x\n"); usb_descriptor_attr(bDeviceSubClass, "%02x\n"); usb_descriptor_attr(bDeviceProtocol, "%02x\n"); usb_descriptor_attr(bNumConfigurations, "%d\n"); usb_descriptor_attr(bMaxPacketSize0, "%d\n"); /* show if the device is authorized (1) or not (0) */ static ssize_t authorized_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_device *usb_dev = to_usb_device(dev); return sysfs_emit(buf, "%u\n", usb_dev->authorized); } /* * Authorize a device to be used in the system * * Writing a 0 deauthorizes the device, writing a 1 authorizes it. */ static ssize_t authorized_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t size) { ssize_t result; struct usb_device *usb_dev = to_usb_device(dev); bool val; if (kstrtobool(buf, &val) != 0) result = -EINVAL; else if (val) result = usb_authorize_device(usb_dev); else result = usb_deauthorize_device(usb_dev); return result < 0 ? result : size; } static DEVICE_ATTR_IGNORE_LOCKDEP(authorized, S_IRUGO | S_IWUSR, authorized_show, authorized_store); /* "Safely remove a device" */ static ssize_t remove_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_device *udev = to_usb_device(dev); int rc = 0; usb_lock_device(udev); if (udev->state != USB_STATE_NOTATTACHED) { /* To avoid races, first unconfigure and then remove */ usb_set_configuration(udev, -1); rc = usb_remove_device(udev); } if (rc == 0) rc = count; usb_unlock_device(udev); return rc; } static DEVICE_ATTR_IGNORE_LOCKDEP(remove, S_IWUSR, NULL, remove_store); static struct attribute *dev_attrs[] = { /* current configuration's attributes */ &dev_attr_configuration.attr, &dev_attr_bNumInterfaces.attr, &dev_attr_bConfigurationValue.attr, &dev_attr_bmAttributes.attr, &dev_attr_bMaxPower.attr, /* device attributes */ &dev_attr_urbnum.attr, &dev_attr_idVendor.attr, &dev_attr_idProduct.attr, &dev_attr_bcdDevice.attr, &dev_attr_bDeviceClass.attr, &dev_attr_bDeviceSubClass.attr, &dev_attr_bDeviceProtocol.attr, &dev_attr_bNumConfigurations.attr, &dev_attr_bMaxPacketSize0.attr, &dev_attr_speed.attr, &dev_attr_rx_lanes.attr, &dev_attr_tx_lanes.attr, &dev_attr_busnum.attr, &dev_attr_devnum.attr, &dev_attr_devpath.attr, &dev_attr_version.attr, &dev_attr_maxchild.attr, &dev_attr_quirks.attr, &dev_attr_avoid_reset_quirk.attr, &dev_attr_authorized.attr, &dev_attr_remove.attr, &dev_attr_ltm_capable.attr, #ifdef CONFIG_OF &dev_attr_devspec.attr, #endif NULL, }; static const struct attribute_group dev_attr_grp = { .attrs = dev_attrs, }; /* When modifying this list, be sure to modify dev_string_attrs_are_visible() * accordingly. 
*/ static struct attribute *dev_string_attrs[] = { &dev_attr_manufacturer.attr, &dev_attr_product.attr, &dev_attr_serial.attr, NULL }; static umode_t dev_string_attrs_are_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = kobj_to_dev(kobj); struct usb_device *udev = to_usb_device(dev); if (a == &dev_attr_manufacturer.attr) { if (udev->manufacturer == NULL) return 0; } else if (a == &dev_attr_product.attr) { if (udev->product == NULL) return 0; } else if (a == &dev_attr_serial.attr) { if (udev->serial == NULL) return 0; } return a->mode; } static const struct attribute_group dev_string_attr_grp = { .attrs = dev_string_attrs, .is_visible = dev_string_attrs_are_visible, }; /* Binary descriptors */ static ssize_t descriptors_read(struct file *filp, struct kobject *kobj, const struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct device *dev = kobj_to_dev(kobj); struct usb_device *udev = to_usb_device(dev); size_t nleft = count; size_t srclen, n; int cfgno; void *src; /* The binary attribute begins with the device descriptor. * Following that are the raw descriptor entries for all the * configurations (config plus subsidiary descriptors). */ for (cfgno = -1; cfgno < udev->descriptor.bNumConfigurations && nleft > 0; ++cfgno) { if (cfgno < 0) { src = &udev->descriptor; srclen = sizeof(struct usb_device_descriptor); } else { src = udev->rawdescriptors[cfgno]; srclen = le16_to_cpu(udev->config[cfgno].desc. wTotalLength); } if (off < srclen) { n = min(nleft, srclen - (size_t) off); memcpy(buf, src + off, n); nleft -= n; buf += n; off = 0; } else { off -= srclen; } } return count - nleft; } static const BIN_ATTR_RO(descriptors, 18 + 65535); /* dev descr + max-size raw descriptor */ static ssize_t bos_descriptors_read(struct file *filp, struct kobject *kobj, const struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct device *dev = kobj_to_dev(kobj); struct usb_device *udev = to_usb_device(dev); struct usb_host_bos *bos = udev->bos; struct usb_bos_descriptor *desc; size_t desclen, n = 0; if (bos) { desc = bos->desc; desclen = le16_to_cpu(desc->wTotalLength); if (off < desclen) { n = min(count, desclen - (size_t) off); memcpy(buf, (void *) desc + off, n); } } return n; } static const BIN_ATTR_RO(bos_descriptors, 65535); /* max-size BOS */ /* When modifying this list, be sure to modify dev_bin_attrs_are_visible() * accordingly. */ static const struct bin_attribute *const dev_bin_attrs[] = { &bin_attr_descriptors, &bin_attr_bos_descriptors, NULL }; static umode_t dev_bin_attrs_are_visible(struct kobject *kobj, const struct bin_attribute *a, int n) { struct device *dev = kobj_to_dev(kobj); struct usb_device *udev = to_usb_device(dev); /* * There's no need to check if the descriptors attribute should * be visible because all devices have a device descriptor. The * bos_descriptors attribute should be visible if and only if * the device has a BOS, so check if it exists here. 
*/ if (a == &bin_attr_bos_descriptors) { if (udev->bos == NULL) return 0; } return a->attr.mode; } static const struct attribute_group dev_bin_attr_grp = { .bin_attrs = dev_bin_attrs, .is_bin_visible = dev_bin_attrs_are_visible, }; const struct attribute_group *usb_device_groups[] = { &dev_attr_grp, &dev_string_attr_grp, &dev_bin_attr_grp, NULL }; /* * Show & store the current value of authorized_default */ static ssize_t authorized_default_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_device *rh_usb_dev = to_usb_device(dev); struct usb_bus *usb_bus = rh_usb_dev->bus; struct usb_hcd *hcd; hcd = bus_to_hcd(usb_bus); return sysfs_emit(buf, "%u\n", hcd->dev_policy); } static ssize_t authorized_default_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t size) { ssize_t result; unsigned int val; struct usb_device *rh_usb_dev = to_usb_device(dev); struct usb_bus *usb_bus = rh_usb_dev->bus; struct usb_hcd *hcd; hcd = bus_to_hcd(usb_bus); result = sscanf(buf, "%u\n", &val); if (result == 1) { hcd->dev_policy = val <= USB_DEVICE_AUTHORIZE_INTERNAL ? val : USB_DEVICE_AUTHORIZE_ALL; result = size; } else { result = -EINVAL; } return result; } static DEVICE_ATTR_RW(authorized_default); /* * interface_authorized_default_show - show default authorization status * for USB interfaces * * note: interface_authorized_default is the default value * for initializing the authorized attribute of interfaces */ static ssize_t interface_authorized_default_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_device *usb_dev = to_usb_device(dev); struct usb_hcd *hcd = bus_to_hcd(usb_dev->bus); return sysfs_emit(buf, "%u\n", !!HCD_INTF_AUTHORIZED(hcd)); } /* * interface_authorized_default_store - store default authorization status * for USB interfaces * * note: interface_authorized_default is the default value * for initializing the authorized attribute of interfaces */ static ssize_t interface_authorized_default_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_device *usb_dev = to_usb_device(dev); struct usb_hcd *hcd = bus_to_hcd(usb_dev->bus); int rc = count; bool val; if (kstrtobool(buf, &val) != 0) return -EINVAL; if (val) set_bit(HCD_FLAG_INTF_AUTHORIZED, &hcd->flags); else clear_bit(HCD_FLAG_INTF_AUTHORIZED, &hcd->flags); return rc; } static DEVICE_ATTR_RW(interface_authorized_default); /* Group all the USB bus attributes */ static struct attribute *usb_bus_attrs[] = { &dev_attr_authorized_default.attr, &dev_attr_interface_authorized_default.attr, NULL, }; static const struct attribute_group usb_bus_attr_group = { .name = NULL, /* we want them in the same directory */ .attrs = usb_bus_attrs, }; static int add_default_authorized_attributes(struct device *dev) { int rc = 0; if (is_usb_device(dev)) rc = sysfs_create_group(&dev->kobj, &usb_bus_attr_group); return rc; } static void remove_default_authorized_attributes(struct device *dev) { if (is_usb_device(dev)) { sysfs_remove_group(&dev->kobj, &usb_bus_attr_group); } } int usb_create_sysfs_dev_files(struct usb_device *udev) { struct device *dev = &udev->dev; int retval; retval = add_persist_attributes(dev); if (retval) goto error; retval = add_power_attributes(dev); if (retval) goto error; if (is_root_hub(udev)) { retval = add_default_authorized_attributes(dev); if (retval) goto error; } return retval; error: usb_remove_sysfs_dev_files(udev); return retval; } void usb_remove_sysfs_dev_files(struct usb_device *udev) { struct 
device *dev = &udev->dev; if (is_root_hub(udev)) remove_default_authorized_attributes(dev); remove_power_attributes(dev); remove_persist_attributes(dev); } /* Interface Association Descriptor fields */ #define usb_intf_assoc_attr(field, format_string) \ static ssize_t \ iad_##field##_show(struct device *dev, struct device_attribute *attr, \ char *buf) \ { \ struct usb_interface *intf = to_usb_interface(dev); \ \ return sysfs_emit(buf, format_string, \ intf->intf_assoc->field); \ } \ static DEVICE_ATTR_RO(iad_##field) usb_intf_assoc_attr(bFirstInterface, "%02x\n"); usb_intf_assoc_attr(bInterfaceCount, "%02d\n"); usb_intf_assoc_attr(bFunctionClass, "%02x\n"); usb_intf_assoc_attr(bFunctionSubClass, "%02x\n"); usb_intf_assoc_attr(bFunctionProtocol, "%02x\n"); /* Interface fields */ #define usb_intf_attr(field, format_string) \ static ssize_t \ field##_show(struct device *dev, struct device_attribute *attr, \ char *buf) \ { \ struct usb_interface *intf = to_usb_interface(dev); \ \ return sysfs_emit(buf, format_string, \ intf->cur_altsetting->desc.field); \ } \ static DEVICE_ATTR_RO(field) usb_intf_attr(bInterfaceNumber, "%02x\n"); usb_intf_attr(bAlternateSetting, "%2d\n"); usb_intf_attr(bNumEndpoints, "%02x\n"); usb_intf_attr(bInterfaceClass, "%02x\n"); usb_intf_attr(bInterfaceSubClass, "%02x\n"); usb_intf_attr(bInterfaceProtocol, "%02x\n"); static ssize_t interface_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_interface *intf; char *string; intf = to_usb_interface(dev); string = READ_ONCE(intf->cur_altsetting->string); if (!string) return 0; return sysfs_emit(buf, "%s\n", string); } static DEVICE_ATTR_RO(interface); static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_interface *intf; struct usb_device *udev; struct usb_host_interface *alt; intf = to_usb_interface(dev); udev = interface_to_usbdev(intf); alt = READ_ONCE(intf->cur_altsetting); return sysfs_emit(buf, "usb:v%04Xp%04Xd%04Xdc%02Xdsc%02Xdp%02X" "ic%02Xisc%02Xip%02Xin%02X\n", le16_to_cpu(udev->descriptor.idVendor), le16_to_cpu(udev->descriptor.idProduct), le16_to_cpu(udev->descriptor.bcdDevice), udev->descriptor.bDeviceClass, udev->descriptor.bDeviceSubClass, udev->descriptor.bDeviceProtocol, alt->desc.bInterfaceClass, alt->desc.bInterfaceSubClass, alt->desc.bInterfaceProtocol, alt->desc.bInterfaceNumber); } static DEVICE_ATTR_RO(modalias); static ssize_t supports_autosuspend_show(struct device *dev, struct device_attribute *attr, char *buf) { int s; s = device_lock_interruptible(dev); if (s < 0) return -EINTR; /* Devices will be autosuspended even when an interface isn't claimed */ s = (!dev->driver || to_usb_driver(dev->driver)->supports_autosuspend); device_unlock(dev); return sysfs_emit(buf, "%u\n", s); } static DEVICE_ATTR_RO(supports_autosuspend); /* * interface_authorized_show - show authorization status of an USB interface * 1 is authorized, 0 is deauthorized */ static ssize_t interface_authorized_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_interface *intf = to_usb_interface(dev); return sysfs_emit(buf, "%u\n", intf->authorized); } /* * interface_authorized_store - authorize or deauthorize an USB interface */ static ssize_t interface_authorized_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_interface *intf = to_usb_interface(dev); bool val; struct kernfs_node *kn; if (kstrtobool(buf, &val) != 0) return -EINVAL; if (val) { usb_authorize_interface(intf); } 
else { /* * Prevent deadlock if another process is concurrently * trying to unregister intf. */ kn = sysfs_break_active_protection(&dev->kobj, &attr->attr); if (kn) { usb_deauthorize_interface(intf); sysfs_unbreak_active_protection(kn); } } return count; } static struct device_attribute dev_attr_interface_authorized = __ATTR(authorized, S_IRUGO | S_IWUSR, interface_authorized_show, interface_authorized_store); static struct attribute *intf_attrs[] = { &dev_attr_bInterfaceNumber.attr, &dev_attr_bAlternateSetting.attr, &dev_attr_bNumEndpoints.attr, &dev_attr_bInterfaceClass.attr, &dev_attr_bInterfaceSubClass.attr, &dev_attr_bInterfaceProtocol.attr, &dev_attr_modalias.attr, &dev_attr_supports_autosuspend.attr, &dev_attr_interface_authorized.attr, NULL, }; static const struct attribute_group intf_attr_grp = { .attrs = intf_attrs, }; static struct attribute *intf_assoc_attrs[] = { &dev_attr_iad_bFirstInterface.attr, &dev_attr_iad_bInterfaceCount.attr, &dev_attr_iad_bFunctionClass.attr, &dev_attr_iad_bFunctionSubClass.attr, &dev_attr_iad_bFunctionProtocol.attr, NULL, }; static umode_t intf_assoc_attrs_are_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = kobj_to_dev(kobj); struct usb_interface *intf = to_usb_interface(dev); if (intf->intf_assoc == NULL) return 0; return a->mode; } static const struct attribute_group intf_assoc_attr_grp = { .attrs = intf_assoc_attrs, .is_visible = intf_assoc_attrs_are_visible, }; static ssize_t wireless_status_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_interface *intf; intf = to_usb_interface(dev); if (intf->wireless_status == USB_WIRELESS_STATUS_DISCONNECTED) return sysfs_emit(buf, "%s\n", "disconnected"); return sysfs_emit(buf, "%s\n", "connected"); } static DEVICE_ATTR_RO(wireless_status); static struct attribute *intf_wireless_status_attrs[] = { &dev_attr_wireless_status.attr, NULL }; static umode_t intf_wireless_status_attr_is_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = kobj_to_dev(kobj); struct usb_interface *intf = to_usb_interface(dev); if (a != &dev_attr_wireless_status.attr || intf->wireless_status != USB_WIRELESS_STATUS_NA) return a->mode; return 0; } static const struct attribute_group intf_wireless_status_attr_grp = { .attrs = intf_wireless_status_attrs, .is_visible = intf_wireless_status_attr_is_visible, }; int usb_update_wireless_status_attr(struct usb_interface *intf) { struct device *dev = &intf->dev; int ret; ret = sysfs_update_group(&dev->kobj, &intf_wireless_status_attr_grp); if (ret < 0) return ret; sysfs_notify(&dev->kobj, NULL, "wireless_status"); kobject_uevent(&dev->kobj, KOBJ_CHANGE); return 0; } const struct attribute_group *usb_interface_groups[] = { &intf_attr_grp, &intf_assoc_attr_grp, &intf_wireless_status_attr_grp, NULL }; void usb_create_sysfs_intf_files(struct usb_interface *intf) { struct usb_device *udev = interface_to_usbdev(intf); struct usb_host_interface *alt = intf->cur_altsetting; if (intf->sysfs_files_created || intf->unregistering) return; if (!alt->string && !(udev->quirks & USB_QUIRK_CONFIG_INTF_STRINGS)) alt->string = usb_cache_string(udev, alt->desc.iInterface); if (alt->string && device_create_file(&intf->dev, &dev_attr_interface)) { /* This is not a serious error */ dev_dbg(&intf->dev, "interface string descriptor file not created\n"); } intf->sysfs_files_created = 1; } void usb_remove_sysfs_intf_files(struct usb_interface *intf) { if (!intf->sysfs_files_created) return; device_remove_file(&intf->dev, 
&dev_attr_interface); intf->sysfs_files_created = 0; }
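/*
 * Illustrative userspace sketch (not part of this kernel file): reading the
 * raw device descriptor served by the descriptors_read() handler defined
 * above. The "1-1" device path is only an example; real paths depend on the
 * bus topology.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	unsigned char desc[18];	/* a USB device descriptor is 18 bytes */
	int fd = open("/sys/bus/usb/devices/1-1/descriptors", O_RDONLY);

	if (fd < 0)
		return 1;
	/* the binary attribute begins with the device descriptor */
	if (read(fd, desc, sizeof(desc)) == (ssize_t)sizeof(desc))
		printf("bLength=%u bDescriptorType=%u idVendor=%02x%02x\n",
		       desc[0], desc[1], desc[9], desc[8]);
	close(fd);
	return 0;
}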
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* memcontrol.h - Memory Controller * * Copyright IBM Corporation, 2007 * Author Balbir Singh <balbir@linux.vnet.ibm.com> * * Copyright 2007 OpenVZ SWsoft Inc * Author: Pavel Emelianov <xemul@openvz.org> */ #ifndef _LINUX_MEMCONTROL_H #define _LINUX_MEMCONTROL_H #include <linux/cgroup.h> #include <linux/vm_event_item.h> #include <linux/hardirq.h> #include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/page_counter.h> #include <linux/vmpressure.h> #include <linux/eventfd.h> #include <linux/mm.h> #include <linux/vmstat.h> #include <linux/writeback.h> #include <linux/page-flags.h> #include <linux/shrinker.h> struct mem_cgroup; struct obj_cgroup; struct page; struct mm_struct; struct kmem_cache; /* Cgroup-specific page state, on top of universal node page state */ enum memcg_stat_item { MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS, MEMCG_SOCK, MEMCG_PERCPU_B, MEMCG_KMEM, MEMCG_ZSWAP_B, MEMCG_ZSWAPPED, MEMCG_ZSWAP_INCOMP, MEMCG_NR_STAT, }; enum memcg_memory_event { MEMCG_LOW, MEMCG_HIGH, MEMCG_MAX, MEMCG_OOM, MEMCG_OOM_KILL, MEMCG_OOM_GROUP_KILL, MEMCG_SWAP_HIGH, MEMCG_SWAP_MAX, MEMCG_SWAP_FAIL, MEMCG_SOCK_THROTTLED, MEMCG_NR_MEMORY_EVENTS, }; struct mem_cgroup_reclaim_cookie { pg_data_t *pgdat; int generation; }; #ifdef CONFIG_MEMCG #define MEM_CGROUP_ID_SHIFT 16 struct mem_cgroup_private_id { int id; refcount_t ref; }; struct memcg_vmstats_percpu; struct memcg1_events_percpu; struct memcg_vmstats; struct lruvec_stats_percpu; struct lruvec_stats; struct mem_cgroup_reclaim_iter { struct mem_cgroup *position; /* scan generation, increased every round-trip */ atomic_t generation; }; /* * per-node information in memory controller.
*/ struct mem_cgroup_per_node { /* Keep the read-only fields at the start */ struct mem_cgroup *memcg; /* Back pointer, we cannot */ /* use container_of */ struct lruvec_stats_percpu __percpu *lruvec_stats_percpu; struct lruvec_stats *lruvec_stats; struct shrinker_info __rcu *shrinker_info; #ifdef CONFIG_MEMCG_V1 /* * Memcg-v1 only stuff in middle as buffer between read mostly fields * and update often fields to avoid false sharing. If v1 stuff is * not present, an explicit padding is needed. */ struct rb_node tree_node; /* RB tree node */ unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ bool on_tree; #else CACHELINE_PADDING(_pad1_); #endif /* Fields which get updated often at the end. */ struct lruvec lruvec; CACHELINE_PADDING(_pad2_); unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter; /* * objcg is wiped out as a part of the objcg repaprenting process. * orig_objcg preserves a pointer (and a reference) to the original * objcg until the end of live of memcg. */ struct obj_cgroup __rcu *objcg; struct obj_cgroup *orig_objcg; /* list of inherited objcgs, protected by objcg_lock */ struct list_head objcg_list; #ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC /* slab stats for nmi context */ atomic_t slab_reclaimable; atomic_t slab_unreclaimable; #endif }; struct mem_cgroup_threshold { struct eventfd_ctx *eventfd; unsigned long threshold; }; /* For threshold */ struct mem_cgroup_threshold_ary { /* An array index points to threshold just below or equal to usage. */ int current_threshold; /* Size of entries[] */ unsigned int size; /* Array of thresholds */ struct mem_cgroup_threshold entries[] __counted_by(size); }; struct mem_cgroup_thresholds { /* Primary thresholds array */ struct mem_cgroup_threshold_ary *primary; /* * Spare threshold array. * This is needed to make mem_cgroup_unregister_event() "never fail". * It must be able to store at least primary->size - 1 entries. */ struct mem_cgroup_threshold_ary *spare; }; /* * Remember four most recent foreign writebacks with dirty pages in this * cgroup. Inode sharing is expected to be uncommon and, even if we miss * one in a given round, we're likely to catch it later if it keeps * foreign-dirtying, so a fairly low count should be enough. * * See mem_cgroup_track_foreign_dirty_slowpath() for details. */ #define MEMCG_CGWB_FRN_CNT 4 struct memcg_cgwb_frn { u64 bdi_id; /* bdi->id of the foreign inode */ int memcg_id; /* memcg->css.id of foreign inode */ u64 at; /* jiffies_64 at the time of dirtying */ struct wb_completion done; /* tracks in-flight foreign writebacks */ }; /* * Bucket for arbitrarily byte-sized objects charged to a memory * cgroup. The bucket can be reparented in one piece when the cgroup * is destroyed, without having to round up the individual references * of all live memory objects in the wild. */ struct obj_cgroup { struct percpu_ref refcnt; struct mem_cgroup *memcg; atomic_t nr_charged_bytes; union { struct list_head list; /* protected by objcg_lock */ struct rcu_head rcu; }; bool is_root; }; /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide * statistics based on the statistics developed by Rik Van Riel for clock-pro, * to help the administrator determine what knobs to tune. */ struct mem_cgroup { struct cgroup_subsys_state css; /* Private memcg ID. 
Used to ID objects that outlive the cgroup */ struct mem_cgroup_private_id id; /* Accounted resources */ struct page_counter memory; /* Both v1 & v2 */ union { struct page_counter swap; /* v2 only */ struct page_counter memsw; /* v1 only */ }; /* registered local peak watchers */ struct list_head memory_peaks; struct list_head swap_peaks; spinlock_t peaks_lock; /* Range enforcement for interrupt charges */ struct work_struct high_work; #ifdef CONFIG_ZSWAP unsigned long zswap_max; /* * Prevent pages from this memcg from being written back from zswap to * swap, and from being swapped out on zswap store failures. */ bool zswap_writeback; #endif /* vmpressure notifications */ struct vmpressure vmpressure; /* * Should the OOM killer kill all belonging tasks, had it kill one? */ bool oom_group; int swappiness; /* memory.events and memory.events.local */ struct cgroup_file events_file; struct cgroup_file events_local_file; /* handle for "memory.swap.events" */ struct cgroup_file swap_events_file; /* memory.stat */ struct memcg_vmstats *vmstats; /* memory.events */ atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; atomic_long_t memory_events_local[MEMCG_NR_MEMORY_EVENTS]; #ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC /* MEMCG_KMEM for nmi context */ atomic_t kmem_stat; #endif /* * Hint of reclaim pressure for socket memroy management. Note * that this indicator should NOT be used in legacy cgroup mode * where socket memory is accounted/charged separately. */ u64 socket_pressure; #if BITS_PER_LONG < 64 seqlock_t socket_pressure_seqlock; #endif int kmemcg_id; struct memcg_vmstats_percpu __percpu *vmstats_percpu; #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; struct wb_domain cgwb_domain; struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT]; #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split deferred_split_queue; #endif #ifdef CONFIG_LRU_GEN_WALKS_MMU /* per-memcg mm_struct list */ struct lru_gen_mm_list mm_list; #endif #ifdef CONFIG_MEMCG_V1 /* Legacy consumer-oriented counters */ struct page_counter kmem; /* v1 only */ struct page_counter tcpmem; /* v1 only */ struct memcg1_events_percpu __percpu *events_percpu; unsigned long soft_limit; /* protected by memcg_oom_lock */ bool oom_lock; int under_oom; /* OOM-Killer disable */ int oom_kill_disable; /* protect arrays of thresholds */ struct mutex thresholds_lock; /* thresholds for memory usage. RCU-protected */ struct mem_cgroup_thresholds thresholds; /* thresholds for mem+swap usage. RCU-protected */ struct mem_cgroup_thresholds memsw_thresholds; /* For oom notifier event fd */ struct list_head oom_notify; /* Legacy tcp memory accounting */ bool tcpmem_active; int tcpmem_pressure; /* List of events which userspace want to receive */ struct list_head event_list; spinlock_t event_list_lock; #endif /* CONFIG_MEMCG_V1 */ struct mem_cgroup_per_node *nodeinfo[]; }; /* * size of first charge trial. * TODO: maybe necessary to use big numbers in big irons or dynamic based of the * workload. 
*/ #define MEMCG_CHARGE_BATCH 64U extern struct mem_cgroup *root_mem_cgroup; enum page_memcg_data_flags { /* page->memcg_data is a pointer to an slabobj_ext vector */ MEMCG_DATA_OBJEXTS = (1UL << 0), /* page has been accounted as a non-slab kernel page */ MEMCG_DATA_KMEM = (1UL << 1), /* the next bit after the last actual flag */ __NR_MEMCG_DATA_FLAGS = (1UL << 2), }; #define __OBJEXTS_ALLOC_FAIL MEMCG_DATA_OBJEXTS #define __FIRST_OBJEXT_FLAG __NR_MEMCG_DATA_FLAGS #else /* CONFIG_MEMCG */ #define __OBJEXTS_ALLOC_FAIL (1UL << 0) #define __FIRST_OBJEXT_FLAG (1UL << 0) #endif /* CONFIG_MEMCG */ enum objext_flags { /* * Use bit 0 with zero other bits to signal that slabobj_ext vector * failed to allocate. The same bit 0 with valid upper bits means * MEMCG_DATA_OBJEXTS. */ OBJEXTS_ALLOC_FAIL = __OBJEXTS_ALLOC_FAIL, __OBJEXTS_FLAG_UNUSED = __FIRST_OBJEXT_FLAG, /* the next bit after the last actual flag */ __NR_OBJEXTS_FLAGS = (__FIRST_OBJEXT_FLAG << 1), }; #define OBJEXTS_FLAGS_MASK (__NR_OBJEXTS_FLAGS - 1) #ifdef CONFIG_MEMCG /* * After the initialization objcg->memcg is always pointing at * a valid memcg, but can be atomically swapped to the parent memcg. * * The caller must ensure that the returned memcg won't be released. */ static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg) { lockdep_assert_once(rcu_read_lock_held() || lockdep_is_held(&cgroup_mutex)); return READ_ONCE(objcg->memcg); } /* * folio_objcg - get the object cgroup associated with a folio. * @folio: Pointer to the folio. * * Returns a pointer to the object cgroup associated with the folio, * or NULL. This function assumes that the folio is known to have a * proper object cgroup pointer. */ static inline struct obj_cgroup *folio_objcg(struct folio *folio) { unsigned long memcg_data = folio->memcg_data; VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio); return (struct obj_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); } /* * folio_memcg - Get the memory cgroup associated with a folio. * @folio: Pointer to the folio. * * Returns a pointer to the memory cgroup associated with the folio, * or NULL. This function assumes that the folio is known to have a * proper memory cgroup pointer. It's not safe to call this function * against some type of folios, e.g. slab folios or ex-slab folios. * * For a folio any of the following ensures folio and objcg binding stability: * * - the folio lock * - LRU isolation * - exclusive reference * * Based on the stable binding of folio and objcg, for a folio any of the * following ensures folio and memcg binding stability: * * - cgroup_mutex * - the lruvec lock * * If the caller only want to ensure that the page counters of memcg are * updated correctly, ensure that the binding stability of folio and objcg * is sufficient. * * Note: The caller should hold an rcu read lock or cgroup_mutex to protect * memcg associated with a folio from being released. */ static inline struct mem_cgroup *folio_memcg(struct folio *folio) { struct obj_cgroup *objcg = folio_objcg(folio); return objcg ? obj_cgroup_memcg(objcg) : NULL; } /* * folio_memcg_charged - If a folio is charged to a memory cgroup. * @folio: Pointer to the folio. * * Returns true if folio is charged to a memory cgroup, otherwise returns false. */ static inline bool folio_memcg_charged(struct folio *folio) { return folio->memcg_data != 0; } /* * folio_memcg_check - Get the memory cgroup associated with a folio. * @folio: Pointer to the folio. 
* * Returns a pointer to the memory cgroup associated with the folio, * or NULL. This function unlike folio_memcg() can take any folio * as an argument. It has to be used in cases when it's not known if a folio * has an associated memory cgroup pointer or an object cgroups vector or * an object cgroup. * * The page and objcg or memcg binding rules can refer to folio_memcg(). * * A caller should hold an rcu read lock to protect memcg associated with a * page from being released. */ static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) { /* * Because folio->memcg_data might be changed asynchronously * for slabs, READ_ONCE() should be used here. */ unsigned long memcg_data = READ_ONCE(folio->memcg_data); struct obj_cgroup *objcg; if (memcg_data & MEMCG_DATA_OBJEXTS) return NULL; objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK); return objcg ? obj_cgroup_memcg(objcg) : NULL; } static inline struct mem_cgroup *page_memcg_check(struct page *page) { if (PageTail(page)) return NULL; return folio_memcg_check((struct folio *)page); } static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) { struct mem_cgroup *memcg; rcu_read_lock(); retry: memcg = obj_cgroup_memcg(objcg); if (unlikely(!css_tryget(&memcg->css))) goto retry; rcu_read_unlock(); return memcg; } /* * folio_memcg_kmem - Check if the folio has the memcg_kmem flag set. * @folio: Pointer to the folio. * * Checks if the folio has MemcgKmem flag set. The caller must ensure * that the folio has an associated memory cgroup. It's not safe to call * this function against some types of folios, e.g. slab folios. */ static inline bool folio_memcg_kmem(struct folio *folio) { VM_BUG_ON_PGFLAGS(PageTail(&folio->page), &folio->page); VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJEXTS, folio); return folio->memcg_data & MEMCG_DATA_KMEM; } static inline bool PageMemcgKmem(struct page *page) { return folio_memcg_kmem(page_folio(page)); } static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return (memcg == root_mem_cgroup); } static inline bool obj_cgroup_is_root(const struct obj_cgroup *objcg) { return objcg->is_root; } static inline bool mem_cgroup_disabled(void) { return !cgroup_subsys_enabled(memory_cgrp_subsys); } static inline void mem_cgroup_protection(struct mem_cgroup *root, struct mem_cgroup *memcg, unsigned long *min, unsigned long *low, unsigned long *usage) { *min = *low = *usage = 0; if (mem_cgroup_disabled()) return; *usage = page_counter_read(&memcg->memory); /* * There is no reclaim protection applied to a targeted reclaim. * We are special casing this specific case here because * mem_cgroup_calculate_protection is not robust enough to keep * the protection invariant for calculated effective values for * parallel reclaimers with different reclaim target. This is * especially a problem for tail memcgs (as they have pages on LRU) * which would want to have effective values 0 for targeted reclaim * but a different value for external reclaim. 
* * Example * Let's have global and A's reclaim in parallel: * | * A (low=2G, usage = 3G, max = 3G, children_low_usage = 1.5G) * |\ * | C (low = 1G, usage = 2.5G) * B (low = 1G, usage = 0.5G) * * For the global reclaim * A.elow = A.low * B.elow = min(B.usage, B.low) because children_low_usage <= A.elow * C.elow = min(C.usage, C.low) * * With the effective values resetting we have A reclaim * A.elow = 0 * B.elow = B.low * C.elow = C.low * * If the global reclaim races with A's reclaim then * B.elow = C.elow = 0 because children_low_usage > A.elow) * is possible and reclaiming B would be violating the protection. * */ if (root == memcg) return; *min = READ_ONCE(memcg->memory.emin); *low = READ_ONCE(memcg->memory.elow); } void mem_cgroup_calculate_protection(struct mem_cgroup *root, struct mem_cgroup *memcg); static inline bool mem_cgroup_unprotected(struct mem_cgroup *target, struct mem_cgroup *memcg) { /* * The root memcg doesn't account charges, and doesn't support * protection. The target memcg's protection is ignored, see * mem_cgroup_calculate_protection() and mem_cgroup_protection() */ return mem_cgroup_disabled() || mem_cgroup_is_root(memcg) || memcg == target; } static inline bool mem_cgroup_below_low(struct mem_cgroup *target, struct mem_cgroup *memcg) { if (mem_cgroup_unprotected(target, memcg)) return false; return READ_ONCE(memcg->memory.elow) >= page_counter_read(&memcg->memory); } static inline bool mem_cgroup_below_min(struct mem_cgroup *target, struct mem_cgroup *memcg) { if (mem_cgroup_unprotected(target, memcg)) return false; return READ_ONCE(memcg->memory.emin) >= page_counter_read(&memcg->memory); } int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp); /** * mem_cgroup_charge - Charge a newly allocated folio to a cgroup. * @folio: Folio to charge. * @mm: mm context of the allocating task. * @gfp: Reclaim mode. * * Try to charge @folio to the memcg that @mm belongs to, reclaiming * pages according to @gfp if necessary. If @mm is NULL, try to * charge to the active memcg. * * Do not use this for folios allocated for swapin. * * Return: 0 on success. Otherwise, an error code is returned. */ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) { if (mem_cgroup_disabled()) return 0; return __mem_cgroup_charge(folio, mm, gfp); } int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp); int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); void __mem_cgroup_uncharge(struct folio *folio); /** * mem_cgroup_uncharge - Uncharge a folio. * @folio: Folio to uncharge. * * Uncharge a folio previously charged with mem_cgroup_charge(). */ static inline void mem_cgroup_uncharge(struct folio *folio) { if (mem_cgroup_disabled()) return; __mem_cgroup_uncharge(folio); } void __mem_cgroup_uncharge_folios(struct folio_batch *folios); static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) { if (mem_cgroup_disabled()) return; __mem_cgroup_uncharge_folios(folios); } void mem_cgroup_replace_folio(struct folio *old, struct folio *new); void mem_cgroup_migrate(struct folio *old, struct folio *new); /** * mem_cgroup_lruvec - get the lru list vector for a memcg & node * @memcg: memcg of the wanted lruvec * @pgdat: pglist_data * * Returns the lru list vector holding pages for a given @memcg & * @pgdat combination. This can be the node lruvec, if the memory * controller is disabled. 
*/ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, struct pglist_data *pgdat) { struct mem_cgroup_per_node *mz; struct lruvec *lruvec; if (mem_cgroup_disabled()) { lruvec = &pgdat->__lruvec; goto out; } if (!memcg) memcg = root_mem_cgroup; mz = memcg->nodeinfo[pgdat->node_id]; lruvec = &mz->lruvec; out: /* * Since a node can be onlined after the mem_cgroup was created, * we have to be prepared to initialize lruvec->pgdat here; * and if offlined then reonlined, we need to reinitialize it. */ if (unlikely(lruvec->pgdat != pgdat)) lruvec->pgdat = pgdat; return lruvec; } /** * folio_lruvec - return lruvec for isolating/putting an LRU folio * @folio: Pointer to the folio. * * Call with rcu_read_lock() held to ensure the lifetime of the returned lruvec. * Note that this alone will NOT guarantee the stability of the folio->lruvec * association; the folio can be reparented to an ancestor if this races with * cgroup deletion. * * Use folio_lruvec_lock() to ensure both lifetime and stability of the binding. * Once a lruvec is locked, folio_lruvec() can be called on other folios, and * their binding is stable if the returned lruvec matches the one the caller has * locked. Useful for lock batching. */ static inline struct lruvec *folio_lruvec(struct folio *folio) { struct mem_cgroup *memcg = folio_memcg(folio); VM_WARN_ON_ONCE_FOLIO(!memcg && !mem_cgroup_disabled(), folio); return mem_cgroup_lruvec(memcg, folio_pgdat(folio)); } struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); struct mem_cgroup *get_mem_cgroup_from_current(void); struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio); struct lruvec *folio_lruvec_lock(struct folio *folio); struct lruvec *folio_lruvec_lock_irq(struct folio *folio); struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags); static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? 
container_of(css, struct mem_cgroup, css) : NULL; } static inline bool obj_cgroup_tryget(struct obj_cgroup *objcg) { if (obj_cgroup_is_root(objcg)) return true; return percpu_ref_tryget(&objcg->refcnt); } static inline void obj_cgroup_get_many(struct obj_cgroup *objcg, unsigned long nr) { if (!obj_cgroup_is_root(objcg)) percpu_ref_get_many(&objcg->refcnt, nr); } static inline void obj_cgroup_get(struct obj_cgroup *objcg) { obj_cgroup_get_many(objcg, 1); } static inline void obj_cgroup_put(struct obj_cgroup *objcg) { if (objcg && !obj_cgroup_is_root(objcg)) percpu_ref_put(&objcg->refcnt); } static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) { return !memcg || css_tryget(&memcg->css); } static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg) { return !memcg || css_tryget_online(&memcg->css); } static inline void mem_cgroup_put(struct mem_cgroup *memcg) { if (memcg) css_put(&memcg->css); } #define mem_cgroup_from_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, struct mem_cgroup *, struct mem_cgroup_reclaim_cookie *); void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, int (*)(struct task_struct *, void *), void *arg); static inline unsigned short mem_cgroup_private_id(struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) return 0; return memcg->id.id; } struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id); static inline u64 mem_cgroup_id(struct mem_cgroup *memcg) { return memcg ? cgroup_id(memcg->css.cgroup) : 0; } struct mem_cgroup *mem_cgroup_get_from_id(u64 id); static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { return mem_cgroup_from_css(seq_css(m)); } static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) { struct mem_cgroup_per_node *mz; if (mem_cgroup_disabled()) return NULL; mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); return mz->memcg; } /** * parent_mem_cgroup - find the accounting parent of a memcg * @memcg: memcg whose parent to find * * Returns the parent memcg, or NULL if this is the root. 
*/ static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { return mem_cgroup_from_css(memcg->css.parent); } static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root) { if (root == memcg) return true; return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup); } static inline bool mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *memcg) { struct mem_cgroup *task_memcg; bool match = false; rcu_read_lock(); task_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (task_memcg) match = mem_cgroup_is_descendant(task_memcg, memcg); rcu_read_unlock(); return match; } struct cgroup_subsys_state *get_mem_cgroup_css_from_folio(struct folio *folio); ino_t page_cgroup_ino(struct page *page); static inline bool mem_cgroup_online(struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) return true; return css_is_online(&memcg->css); } void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int zid, long nr_pages); static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { struct mem_cgroup_per_node *mz; mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); return READ_ONCE(mz->lru_zone_size[zone_idx][lru]); } void __mem_cgroup_handle_over_high(gfp_t gfp_mask); static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) { if (unlikely(current->memcg_nr_pages_over_high)) __mem_cgroup_handle_over_high(gfp_mask); } unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p); void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg); struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, struct mem_cgroup *oom_domain); void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); /* idx can be of type enum memcg_stat_item or node_stat_item */ void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int val); static inline void mod_memcg_page_state(struct page *page, enum memcg_stat_item idx, int val) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; rcu_read_lock(); memcg = folio_memcg(page_folio(page)); if (memcg) mod_memcg_state(memcg, idx, val); rcu_read_unlock(); } unsigned long memcg_events(struct mem_cgroup *memcg, int event); unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx); unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item); bool memcg_stat_item_valid(int idx); bool memcg_vm_event_item_valid(enum vm_event_item idx); unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx); unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx); void mem_cgroup_flush_stats(struct mem_cgroup *memcg); void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg); void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count); static inline void count_memcg_folio_events(struct folio *folio, enum vm_event_item idx, unsigned long nr) { struct mem_cgroup *memcg; if (!folio_memcg_charged(folio)) return; rcu_read_lock(); memcg = folio_memcg(folio); count_memcg_events(memcg, idx, nr); rcu_read_unlock(); } static inline void count_memcg_events_mm(struct mm_struct *mm, enum vm_event_item idx, unsigned long count) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; rcu_read_lock(); memcg = 
mem_cgroup_from_task(rcu_dereference(mm->owner)); if (likely(memcg)) count_memcg_events(memcg, idx, count); rcu_read_unlock(); } static inline void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { count_memcg_events_mm(mm, idx, 1); } void __memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event, bool allow_spinning); static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { __memcg_memory_event(memcg, event, true); } static inline void memcg_memory_event_mm(struct mm_struct *mm, enum memcg_memory_event event) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (likely(memcg)) memcg_memory_event(memcg, event); rcu_read_unlock(); } void split_page_memcg(struct page *first, unsigned order); void folio_split_memcg_refs(struct folio *folio, unsigned old_order, unsigned new_order); static inline u64 cgroup_id_from_mm(struct mm_struct *mm) { struct mem_cgroup *memcg; u64 id; if (mem_cgroup_disabled()) return 0; rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (!memcg) memcg = root_mem_cgroup; id = cgroup_id(memcg->css.cgroup); rcu_read_unlock(); return id; } void mem_cgroup_flush_workqueue(void); extern int mem_cgroup_init(void); #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 #define root_mem_cgroup (NULL) static inline struct mem_cgroup *folio_memcg(struct folio *folio) { return NULL; } static inline bool folio_memcg_charged(struct folio *folio) { return false; } static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) { return NULL; } static inline struct mem_cgroup *page_memcg_check(struct page *page) { return NULL; } static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) { return NULL; } static inline bool folio_memcg_kmem(struct folio *folio) { return false; } static inline bool PageMemcgKmem(struct page *page) { return false; } static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return true; } static inline bool obj_cgroup_is_root(const struct obj_cgroup *objcg) { return true; } static inline bool mem_cgroup_disabled(void) { return true; } static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { } static inline void memcg_memory_event_mm(struct mm_struct *mm, enum memcg_memory_event event) { } static inline void mem_cgroup_protection(struct mem_cgroup *root, struct mem_cgroup *memcg, unsigned long *min, unsigned long *low, unsigned long *usage) { *min = *low = *usage = 0; } static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root, struct mem_cgroup *memcg) { } static inline bool mem_cgroup_unprotected(struct mem_cgroup *target, struct mem_cgroup *memcg) { return true; } static inline bool mem_cgroup_below_low(struct mem_cgroup *target, struct mem_cgroup *memcg) { return false; } static inline bool mem_cgroup_below_min(struct mem_cgroup *target, struct mem_cgroup *memcg) { return false; } static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) { return 0; } static inline int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp) { return 0; } static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { return 0; } static inline void mem_cgroup_uncharge(struct folio *folio) { } static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) { } static inline void 
mem_cgroup_replace_folio(struct folio *old, struct folio *new) { } static inline void mem_cgroup_migrate(struct folio *old, struct folio *new) { } static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, struct pglist_data *pgdat) { return &pgdat->__lruvec; } static inline struct lruvec *folio_lruvec(struct folio *folio) { struct pglist_data *pgdat = folio_pgdat(folio); return &pgdat->__lruvec; } static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { return NULL; } static inline bool mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *memcg) { return true; } static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { return NULL; } static inline struct mem_cgroup *get_mem_cgroup_from_current(void) { return NULL; } static inline struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio) { return NULL; } static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css) { return NULL; } static inline void obj_cgroup_get(struct obj_cgroup *objcg) { } static inline void obj_cgroup_put(struct obj_cgroup *objcg) { } static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) { return true; } static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg) { return true; } static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } static inline struct lruvec *folio_lruvec_lock(struct folio *folio) { struct pglist_data *pgdat = folio_pgdat(folio); rcu_read_lock(); spin_lock(&pgdat->__lruvec.lru_lock); return &pgdat->__lruvec; } static inline struct lruvec *folio_lruvec_lock_irq(struct folio *folio) { struct pglist_data *pgdat = folio_pgdat(folio); rcu_read_lock(); spin_lock_irq(&pgdat->__lruvec.lru_lock); return &pgdat->__lruvec; } static inline struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flagsp) { struct pglist_data *pgdat = folio_pgdat(folio); rcu_read_lock(); spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp); return &pgdat->__lruvec; } static inline struct mem_cgroup * mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, struct mem_cgroup_reclaim_cookie *reclaim) { return NULL; } static inline void mem_cgroup_iter_break(struct mem_cgroup *root, struct mem_cgroup *prev) { } static inline void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, int (*fn)(struct task_struct *, void *), void *arg) { } static inline unsigned short mem_cgroup_private_id(struct mem_cgroup *memcg) { return 0; } static inline struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id) { WARN_ON_ONCE(id); /* XXX: This should always return root_mem_cgroup */ return NULL; } static inline u64 mem_cgroup_id(struct mem_cgroup *memcg) { return 0; } static inline struct mem_cgroup *mem_cgroup_get_from_id(u64 id) { return NULL; } static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { return NULL; } static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) { return NULL; } static inline bool mem_cgroup_online(struct mem_cgroup *memcg) { return true; } static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { return 0; } static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { return 0; } static inline void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) { } static inline void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { } static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) { } static inline struct mem_cgroup 
static inline struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
							   struct mem_cgroup *oom_domain)
{
	return NULL;
}

static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) { }
static inline void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int nr) { }
static inline void mod_memcg_page_state(struct page *page, enum memcg_stat_item idx, int val) { }
static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { return 0; }
static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item) { return 0; }
static inline bool memcg_stat_item_valid(int idx) { return false; }
static inline bool memcg_vm_event_item_valid(enum vm_event_item idx) { return false; }

static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
{
	return node_page_state(lruvec_pgdat(lruvec), idx);
}

static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx)
{
	return node_page_state(lruvec_pgdat(lruvec), idx);
}

static inline void mem_cgroup_flush_stats(struct mem_cgroup *memcg) { }
static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg) { }

static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
{
	struct page *page = virt_to_head_page(p);

	mod_node_page_state(page_pgdat(page), idx, val);
}

static inline void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { }
static inline void count_memcg_folio_events(struct folio *folio, enum vm_event_item idx, unsigned long nr) { }
static inline void count_memcg_events_mm(struct mm_struct *mm, enum vm_event_item idx, unsigned long count) { }
static inline void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { }
static inline void split_page_memcg(struct page *first, unsigned order) { }
static inline void folio_split_memcg_refs(struct folio *folio, unsigned old_order, unsigned new_order) { }
static inline u64 cgroup_id_from_mm(struct mm_struct *mm) { return 0; }
static inline void mem_cgroup_flush_workqueue(void) { }
static inline int mem_cgroup_init(void) { return 0; }
#endif /* CONFIG_MEMCG */
/*
 * Extended information for slab objects stored as an array in page->memcg_data
 * if MEMCG_DATA_OBJEXTS is set.
 */
struct slabobj_ext {
#ifdef CONFIG_MEMCG
	struct obj_cgroup *objcg;
#endif
#ifdef CONFIG_MEM_ALLOC_PROFILING
	union codetag_ref ref;
#endif
} __aligned(8);

static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
{
	struct mem_cgroup *memcg;

	memcg = lruvec_memcg(lruvec);
	if (!memcg)
		return NULL;
	memcg = parent_mem_cgroup(memcg);
	if (!memcg)
		return NULL;
	return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec));
}

static inline void lruvec_lock_irq(struct lruvec *lruvec)
{
	rcu_read_lock();
	spin_lock_irq(&lruvec->lru_lock);
}

static inline void lruvec_unlock(struct lruvec *lruvec)
{
	spin_unlock(&lruvec->lru_lock);
	rcu_read_unlock();
}

static inline void lruvec_unlock_irq(struct lruvec *lruvec)
{
	spin_unlock_irq(&lruvec->lru_lock);
	rcu_read_unlock();
}

static inline void lruvec_unlock_irqrestore(struct lruvec *lruvec, unsigned long flags)
{
	spin_unlock_irqrestore(&lruvec->lru_lock, flags);
	rcu_read_unlock();
}

/* Test requires a stable folio->memcg binding, see folio_memcg() */
static inline bool folio_matches_lruvec(struct folio *folio, struct lruvec *lruvec)
{
	return lruvec_pgdat(lruvec) == folio_pgdat(folio) &&
	       lruvec_memcg(lruvec) == folio_memcg(folio);
}

/* Don't lock again iff page's lruvec locked */
static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio,
						     struct lruvec *locked_lruvec)
{
	if (locked_lruvec) {
		if (folio_matches_lruvec(folio, locked_lruvec))
			return locked_lruvec;

		lruvec_unlock_irq(locked_lruvec);
	}

	return folio_lruvec_lock_irq(folio);
}

/* Don't lock again iff folio's lruvec locked */
static inline void folio_lruvec_relock_irqsave(struct folio *folio,
					       struct lruvec **lruvecp,
					       unsigned long *flags)
{
	if (*lruvecp) {
		if (folio_matches_lruvec(folio, *lruvecp))
			return;

		lruvec_unlock_irqrestore(*lruvecp, *flags);
	}

	*lruvecp = folio_lruvec_lock_irqsave(folio, flags);
}
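/*
 * Illustrative usage sketch (assumed caller pattern, not normative): the
 * relock helpers above are meant for batched LRU walks where consecutive
 * folios usually share a lruvec, so the lock is only dropped and retaken
 * when the lruvec actually changes:
 *
 *	struct lruvec *lruvec = NULL;
 *	unsigned long flags;
 *	unsigned int i;
 *
 *	for (i = 0; i < folio_batch_count(fbatch); i++) {
 *		struct folio *folio = fbatch->folios[i];
 *
 *		folio_lruvec_relock_irqsave(folio, &lruvec, &flags);
 *		... operate on folio while its lruvec is locked ...
 *	}
 *	if (lruvec)
 *		lruvec_unlock_irqrestore(lruvec, flags);
 */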
#ifdef CONFIG_CGROUP_WRITEBACK

struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
			 unsigned long *pheadroom, unsigned long *pdirty,
			 unsigned long *pwriteback);

void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
					     struct bdi_writeback *wb);

static inline void mem_cgroup_track_foreign_dirty(struct folio *folio,
						  struct bdi_writeback *wb)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return;

	if (!folio_memcg_charged(folio))
		return;

	rcu_read_lock();
	memcg = folio_memcg(folio);
	if (unlikely(&memcg->css != wb->memcg_css))
		mem_cgroup_track_foreign_dirty_slowpath(folio, wb);
	rcu_read_unlock();
}

void mem_cgroup_flush_foreign(struct bdi_writeback *wb);

#else	/* CONFIG_CGROUP_WRITEBACK */

static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) { return NULL; }

static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
				       unsigned long *pfilepages,
				       unsigned long *pheadroom,
				       unsigned long *pdirty,
				       unsigned long *pwriteback)
{
}

static inline void mem_cgroup_track_foreign_dirty(struct folio *folio, struct bdi_writeback *wb) { }
static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb) { }

#endif	/* CONFIG_CGROUP_WRITEBACK */

struct sock;

#ifdef CONFIG_MEMCG
extern struct static_key_false memcg_sockets_enabled_key;
#define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key)
void mem_cgroup_sk_alloc(struct sock *sk);
void mem_cgroup_sk_free(struct sock *sk);
void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk);
bool mem_cgroup_sk_charge(const struct sock *sk, unsigned int nr_pages,
			  gfp_t gfp_mask);
void mem_cgroup_sk_uncharge(const struct sock *sk, unsigned int nr_pages);

#if BITS_PER_LONG < 64
static inline void mem_cgroup_set_socket_pressure(struct mem_cgroup *memcg)
{
	u64 val = get_jiffies_64() + HZ;
	unsigned long flags;

	write_seqlock_irqsave(&memcg->socket_pressure_seqlock, flags);
	memcg->socket_pressure = val;
	write_sequnlock_irqrestore(&memcg->socket_pressure_seqlock, flags);
}

static inline u64 mem_cgroup_get_socket_pressure(struct mem_cgroup *memcg)
{
	unsigned int seq;
	u64 val;

	do {
		seq = read_seqbegin(&memcg->socket_pressure_seqlock);
		val = memcg->socket_pressure;
	} while (read_seqretry(&memcg->socket_pressure_seqlock, seq));

	return val;
}
#else
static inline void mem_cgroup_set_socket_pressure(struct mem_cgroup *memcg)
{
	WRITE_ONCE(memcg->socket_pressure, jiffies + HZ);
}

static inline u64 mem_cgroup_get_socket_pressure(struct mem_cgroup *memcg)
{
	return READ_ONCE(memcg->socket_pressure);
}
#endif

int alloc_shrinker_info(struct mem_cgroup *memcg);
void free_shrinker_info(struct mem_cgroup *memcg);
void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id);
void reparent_shrinker_deferred(struct mem_cgroup *memcg);

static inline int shrinker_id(struct shrinker *shrinker)
{
	return shrinker->id;
}
#else
#define mem_cgroup_sockets_enabled 0

static inline void mem_cgroup_sk_alloc(struct sock *sk) { }
static inline void mem_cgroup_sk_free(struct sock *sk) { }
static inline void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk) { }

static inline bool mem_cgroup_sk_charge(const struct sock *sk, unsigned int nr_pages,
					gfp_t gfp_mask)
{
	return false;
}

static inline void mem_cgroup_sk_uncharge(const struct sock *sk, unsigned int nr_pages) { }
static inline void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { }
static inline int shrinker_id(struct shrinker *shrinker) { return -1; }
#endif

#ifdef CONFIG_MEMCG
bool mem_cgroup_kmem_disabled(void);
int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order);
void __memcg_kmem_uncharge_page(struct page *page, int order);

/*
 * The returned objcg pointer is safe to use without additional
 * protection within a scope. The scope is defined either by
 * the current task (similar to the "current" global variable)
 * or by set_active_memcg() pair.
 * Please, use obj_cgroup_get() to get a reference if the pointer
 * needs to be used outside of the local scope.
 */
struct obj_cgroup *current_obj_cgroup(void);
struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio);

static inline struct obj_cgroup *get_obj_cgroup_from_current(void)
{
	struct obj_cgroup *objcg = current_obj_cgroup();

	if (objcg)
		obj_cgroup_get(objcg);

	return objcg;
}

int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size);
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size);

extern struct static_key_false memcg_bpf_enabled_key;
static inline bool memcg_bpf_enabled(void)
{
	return static_branch_likely(&memcg_bpf_enabled_key);
}

extern struct static_key_false memcg_kmem_online_key;
static inline bool memcg_kmem_online(void)
{
	return static_branch_likely(&memcg_kmem_online_key);
}

static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
{
	if (memcg_kmem_online())
		return __memcg_kmem_charge_page(page, gfp, order);
	return 0;
}

static inline void memcg_kmem_uncharge_page(struct page *page, int order)
{
	if (memcg_kmem_online())
		__memcg_kmem_uncharge_page(page, order);
}
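/*
 * Illustrative pairing sketch (assumption, not a definitive in-tree caller):
 * code that charges kernel pages explicitly is expected to pair
 * memcg_kmem_charge_page() with memcg_kmem_uncharge_page() for the same
 * page and order, e.g.:
 *
 *	struct page *page = alloc_pages(gfp, order);
 *
 *	if (page && memcg_kmem_charge_page(page, gfp, order)) {
 *		__free_pages(page, order);
 *		page = NULL;
 *	}
 *	...
 *	memcg_kmem_uncharge_page(page, order);
 *	__free_pages(page, order);
 *
 * Most kernel allocations get this behaviour implicitly by passing
 * __GFP_ACCOUNT to the allocator rather than calling these helpers directly.
 */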
/*
 * A helper for accessing memcg's kmem_id, used for getting
 * corresponding LRU lists.
 */
static inline int memcg_kmem_id(struct mem_cgroup *memcg)
{
	return memcg ? memcg->kmemcg_id : -1;
}

struct mem_cgroup *mem_cgroup_from_virt(void *p);

static inline void count_objcg_events(struct obj_cgroup *objcg,
				      enum vm_event_item idx,
				      unsigned long count)
{
	struct mem_cgroup *memcg;

	if (!memcg_kmem_online())
		return;

	rcu_read_lock();
	memcg = obj_cgroup_memcg(objcg);
	count_memcg_events(memcg, idx, count);
	rcu_read_unlock();
}

void mem_cgroup_node_filter_allowed(struct mem_cgroup *memcg, nodemask_t *mask);

void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg);

static inline bool memcg_is_dying(struct mem_cgroup *memcg)
{
	return memcg ? css_is_dying(&memcg->css) : false;
}
#else
static inline bool mem_cgroup_kmem_disabled(void) { return true; }
static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { return 0; }
static inline void memcg_kmem_uncharge_page(struct page *page, int order) { }
static inline int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { return 0; }
static inline void __memcg_kmem_uncharge_page(struct page *page, int order) { }
static inline struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio) { return NULL; }
static inline bool memcg_bpf_enabled(void) { return false; }
static inline bool memcg_kmem_online(void) { return false; }
static inline int memcg_kmem_id(struct mem_cgroup *memcg) { return -1; }
static inline struct mem_cgroup *mem_cgroup_from_virt(void *p) { return NULL; }
static inline void count_objcg_events(struct obj_cgroup *objcg, enum vm_event_item idx, unsigned long count) { }
static inline ino_t page_cgroup_ino(struct page *page) { return 0; }
static inline void mem_cgroup_node_filter_allowed(struct mem_cgroup *memcg, nodemask_t *mask) { }
static inline void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg) { }
static inline bool memcg_is_dying(struct mem_cgroup *memcg) { return false; }
#endif /* CONFIG_MEMCG */

#if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP)
bool obj_cgroup_may_zswap(struct obj_cgroup *objcg);
void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size);
void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size);
bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg);
#else
static inline bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) { return true; }
static inline void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size) { }
static inline void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) { }

static inline bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg)
{
	/* if zswap is disabled, do not block pages going to the swapping device */
	return true;
}
#endif

/* Cgroup v1-related declarations */

#ifdef CONFIG_MEMCG_V1
unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
					gfp_t gfp_mask,
					unsigned long *total_scanned);

bool mem_cgroup_oom_synchronize(bool wait);

static inline bool task_in_memcg_oom(struct task_struct *p)
{
	return p->memcg_in_oom;
}

static inline void mem_cgroup_enter_user_fault(void)
{
	WARN_ON(current->in_user_fault);
	current->in_user_fault = 1;
}

static inline void mem_cgroup_exit_user_fault(void)
{
	WARN_ON(!current->in_user_fault);
	current->in_user_fault = 0;
}

void memcg1_swapout(struct folio *folio, swp_entry_t entry);
void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages);
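/*
 * Expected pairing in the page fault path (sketch based on how these hooks
 * are typically used; assumed, not normative): user faults are bracketed by
 * mem_cgroup_enter_user_fault()/mem_cgroup_exit_user_fault(), and a pending
 * memcg OOM is settled afterwards via mem_cgroup_oom_synchronize():
 *
 *	mem_cgroup_enter_user_fault();
 *	ret = __handle_mm_fault(vma, address, flags);
 *	mem_cgroup_exit_user_fault();
 *
 *	if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
 *		mem_cgroup_oom_synchronize(false);
 */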
#else	/* CONFIG_MEMCG_V1 */
static inline unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
						      gfp_t gfp_mask,
						      unsigned long *total_scanned)
{
	return 0;
}

static inline bool task_in_memcg_oom(struct task_struct *p) { return false; }
static inline bool mem_cgroup_oom_synchronize(bool wait) { return false; }
static inline void mem_cgroup_enter_user_fault(void) { }
static inline void mem_cgroup_exit_user_fault(void) { }
static inline void memcg1_swapout(struct folio *folio, swp_entry_t entry) { }
static inline void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages) { }
#endif	/* CONFIG_MEMCG_V1 */

#endif /* _LINUX_MEMCONTROL_H */
10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710 10711 10712 10713 10714 10715 10716 10717 10718 10719 10720 10721 10722 10723 10724 10725 10726 10727 10728 10729 10730 10731 10732 10733 10734 10735 
10736 10737 10738 10739 10740 10741 10742 10743 10744 10745 10746 10747 10748 10749 10750 10751 10752 10753 10754 10755 10756 10757 10758 10759 10760 10761 10762 10763 10764 10765 10766 10767 10768 10769 10770 10771 10772 10773 10774 10775 10776 10777 10778 10779 10780 10781 10782 10783 10784 10785 10786 10787 10788 10789 10790 10791 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 10802 10803 10804 10805 10806 10807 10808 10809 10810 10811 10812 10813 10814 10815 10816 10817 10818 10819 10820 10821 10822 10823 10824 10825 10826 10827 10828 10829 10830 10831 10832 10833 10834 10835 10836 10837 10838 10839 10840 10841 10842 10843 10844 10845 10846 10847 10848 10849 10850 10851 10852 10853 10854 10855 10856 10857 10858 10859 10860 10861 10862 10863 10864 10865 10866 10867 10868 10869 10870 10871 10872 10873 10874 10875 10876 10877 10878 10879 10880 10881 10882 10883 10884 10885 10886 10887 10888 10889 10890 10891 10892 10893 10894 10895 10896 10897 10898 10899 10900 10901 10902 10903 10904 10905 10906 10907 10908 10909 10910 10911 10912 10913 10914 10915 10916 10917 10918 10919 10920 10921 10922 10923 10924 10925 10926 10927 10928 10929 10930 10931 10932 10933 10934 10935 10936 10937 10938 10939 10940 10941 10942 10943 10944 10945 10946 10947 10948 10949 10950 10951 10952 10953 10954 10955 10956 10957 10958 10959 10960 10961 10962 10963 10964 10965 10966 10967 10968 10969 10970 10971 10972 10973 10974 10975 10976 10977 10978 10979 10980 10981 10982 10983 10984 10985 10986 10987 10988 10989 10990 10991 10992 10993 10994 10995 10996 10997 10998 10999 11000 11001 11002 11003 11004 11005 11006 11007 11008 11009 11010 11011 11012 11013 11014 11015 11016 11017 11018 11019 11020 11021 11022 11023 11024 11025 11026 11027 11028 11029 11030 11031 11032 11033 11034 11035 11036 11037 11038 11039 11040 11041 11042 11043 11044 11045 11046 11047 11048 11049 11050 11051 11052 11053 11054 11055 11056 11057 11058 11059 11060 11061 11062 11063 11064 11065 11066 11067 11068 11069 11070 11071 11072 11073 11074 11075 11076 11077 11078 11079 11080 11081 11082 11083 11084 11085 11086 11087 11088 11089 11090 11091 11092 11093 11094 11095 11096 11097 11098 11099 11100 11101 11102 11103 11104 11105 11106 11107 11108 11109 11110 11111 11112 11113 11114 11115 11116 11117 11118 11119 11120 11121 11122 11123 11124 11125 11126 11127 11128 11129 11130 11131 11132 11133 11134 11135 11136 11137 11138 11139 11140 11141 11142 11143 11144 11145 11146 11147 11148 11149 11150 11151 11152 11153 11154 11155 11156 11157 11158 11159 11160 11161 11162 11163 11164 11165 11166 11167 11168 11169 11170 11171 11172 11173 11174 11175 11176 11177 11178 11179 11180 11181 11182 11183 11184 11185 11186 11187 11188 11189 11190 11191 11192 11193 11194 11195 11196 11197 11198 11199 11200 11201 11202 11203 11204 11205 11206 11207 11208 11209 11210 11211 11212 11213 11214 11215 11216 11217 11218 11219 11220 11221 11222 11223 11224 11225 11226 11227 11228 11229 11230 11231 11232 11233 11234 11235 11236 11237 11238 11239 11240 11241 11242 11243 11244 11245 11246 11247 11248 11249 11250 11251 11252 11253 11254 11255 11256 11257 11258 11259 11260 11261 11262 11263 11264 11265 11266 11267 11268 11269 11270 11271 11272 11273 11274 11275 11276 11277 11278 11279 11280 11281 11282 11283 11284 11285 11286 11287 11288 11289 11290 11291 11292 11293 11294 11295 11296 11297 11298 11299 11300 11301 11302 11303 11304 11305 11306 11307 11308 11309 11310 11311 11312 11313 11314 11315 11316 11317 11318 11319 11320 11321 11322 11323 11324 11325 11326 11327 
11328 11329 11330 11331 11332 11333 11334 11335 11336 11337 11338 11339 11340 11341 11342 11343 11344 11345 11346 11347 11348 11349 11350 11351 11352 11353 11354 11355 11356 11357 11358 11359 11360 11361 11362 11363 11364 11365 11366 11367 11368 11369 11370 11371 11372 11373 11374 11375 11376 11377 11378 11379 11380 11381 11382 11383 11384 11385 11386 11387 11388 11389 11390 11391 11392 11393 11394 11395 11396 11397 11398 11399 11400 11401 11402 11403 11404 11405 11406 11407 11408 11409 11410 11411 11412 11413 11414 11415 11416 11417 11418 11419 11420 11421 11422 11423 11424 11425 11426 11427 11428 11429 11430 11431 11432 11433 11434 11435 11436 11437 11438 11439 11440 11441 11442 11443 11444 11445 11446 11447 11448 11449 11450 11451 11452 11453 11454 11455 11456 11457 11458 11459 11460 11461 11462 11463 11464 11465 11466 11467 11468 11469 11470 11471 11472 11473 11474 11475 11476 11477 11478 11479 11480 11481 11482 11483 11484 11485 11486 11487 11488 11489 11490 11491 11492 11493 11494 11495 11496 11497 11498 11499 11500 11501 11502 11503 11504 11505 11506 11507 11508 11509 11510 11511 11512 11513 11514 11515 11516 11517 11518 11519 11520 11521 11522 11523 11524 11525 11526 11527 11528 11529 11530 11531 11532 11533 11534 11535 11536 11537 11538 11539 11540 11541 11542 11543 11544 11545 11546 11547 11548 11549 11550 11551 11552 11553 11554 11555 11556 11557 11558 11559 11560 11561 11562 11563 11564 11565 11566 11567 11568 11569 11570 11571 11572 11573 11574 11575 11576 11577 11578 11579 11580 11581 11582 11583 11584 11585 11586 11587 11588 11589 11590 11591 11592 11593 11594 11595 11596 11597 11598 11599 11600 11601 11602 11603 11604 11605 11606 11607 11608 11609 11610 11611 11612 11613 11614 11615 11616 11617 11618 11619 11620 11621 11622 11623 11624 11625 11626 11627 11628 11629 11630 11631 11632 11633 11634 11635 11636 11637 11638 11639 11640 11641 11642 11643 11644 11645 11646 11647 11648 11649 11650 11651 11652 11653 11654 11655 11656 11657 11658 11659 11660 11661 11662 11663 11664 11665 11666 11667 11668 11669 11670 11671 11672 11673 11674 11675 11676 11677 11678 11679 11680 11681 11682 11683 11684 11685 11686 11687 11688 11689 11690 11691 11692 11693 11694 11695 11696 11697 11698 11699 11700 11701 11702 11703 11704 11705 11706 11707 11708 11709 11710 11711 11712 11713 11714 11715 11716 11717 11718 11719 11720 11721 11722 11723 11724 11725 11726 11727 11728 11729 11730 11731 11732 11733 11734 11735 11736 11737 11738 11739 11740 11741 11742 11743 11744 11745 11746 11747 11748 11749 11750 11751 11752 11753 11754 11755 11756 11757 11758 11759 11760 11761 11762 11763 11764 11765 11766 11767 11768 11769 11770 11771 11772 11773 11774 11775 11776 11777 11778 11779 11780 11781 11782 11783 11784 11785 11786 11787 11788 11789 11790 11791 11792 11793 11794 11795 11796 11797 11798 11799 11800 11801 11802 11803 11804 11805 11806 11807 11808 11809 11810 11811 11812 11813 11814 11815 11816 11817 11818 11819 11820 11821 11822 11823 11824 11825 11826 11827 11828 11829 11830 11831 11832 11833 11834 11835 11836 11837 11838 11839 11840 11841 11842 11843 11844 11845 11846 11847 11848 11849 11850 11851 11852 11853 11854 11855 11856 11857 11858 11859 11860 11861 11862 11863 11864 11865 11866 11867 11868 11869 11870 11871 11872 11873 11874 11875 11876 11877 11878 11879 11880 11881 11882 11883 11884 11885 11886 11887 11888 11889 11890 11891 11892 11893 11894 11895 11896 11897 11898 11899 11900 11901 11902 11903 11904 11905 11906 11907 11908 11909 11910 11911 11912 11913 11914 11915 11916 11917 11918 11919 
11920 11921 11922 11923 11924 11925 11926 11927 11928 11929 11930 11931 11932 11933 11934 11935 11936 11937 11938 11939 11940 11941 11942 11943 11944 11945 11946 11947 11948 11949 11950 11951 11952 11953 11954 11955 11956 11957 11958 11959 11960 11961 11962 11963 11964 11965 11966 11967 11968 11969 11970 11971 11972 11973 11974 11975 11976 11977 11978 11979 11980 11981 11982 11983 11984 11985 11986 11987 11988 11989 11990 11991 11992 11993 11994 11995 11996 11997 11998 11999 12000 12001 12002 12003 12004 12005 12006 12007 12008 12009 12010 12011 12012 12013 12014 12015 12016 12017 12018 12019 12020 12021 12022 12023 12024 12025 12026 12027 12028 12029 12030 12031 12032 12033 12034 12035 12036 12037 12038 12039 12040 12041 12042 12043 12044 12045 12046 12047 12048 12049 12050 12051 12052 12053 12054 12055 12056 12057 12058 12059 12060 12061 12062 12063 12064 12065 12066 12067 12068 12069 12070 12071 12072 12073 12074 12075 12076 12077 12078 12079 12080 12081 12082 12083 12084 12085 12086 12087 12088 12089 12090 12091 12092 12093 12094 12095 12096 12097 12098 12099 12100 12101 12102 12103 12104 12105 12106 12107 12108 12109 12110 12111 12112 12113 12114 12115 12116 12117 12118 12119 12120 12121 12122 12123 12124 12125 12126 12127 12128 12129 12130 12131 12132 12133 12134 12135 12136 12137 12138 12139 12140 12141 12142 12143 12144 12145 12146 12147 12148 12149 12150 12151 12152 12153 12154 12155 12156 12157 12158 12159 12160 12161 12162 12163 12164 12165 12166 12167 12168 12169 12170 12171 12172 12173 12174 12175 12176 12177 12178 12179 12180 12181 12182 12183 12184 12185 12186 12187 12188 12189 12190 12191 12192 12193 12194 12195 12196 12197 12198 12199 12200 12201 12202 12203 12204 12205 12206 12207 12208 12209 12210 12211 12212 12213 12214 12215 12216 12217 12218 12219 12220 12221 12222 12223 12224 12225 12226 12227 12228 12229 12230 12231 12232 12233 12234 12235 12236 12237 12238 12239 12240 12241 12242 12243 12244 12245 12246 12247 12248 12249 12250 12251 12252 12253 12254 12255 12256 12257 12258 12259 12260 12261 12262 12263 12264 12265 12266 12267 12268 12269 12270 12271 12272 12273 12274 12275 12276 12277 12278 12279 12280 12281 12282 12283 12284 12285 12286 12287 12288 12289 12290 12291 12292 12293 12294 12295 12296 12297 12298 12299 12300 12301 12302 12303 12304 12305 12306 12307 12308 12309 12310 12311 12312 12313 12314 12315 12316 12317 12318 12319 12320 12321 12322 12323 12324 12325 12326 12327 12328 12329 12330 12331 12332 12333 12334 12335 12336 12337 12338 12339 12340 12341 12342 12343 12344 12345 12346 12347 12348 12349 12350 12351 12352 12353 12354 12355 12356 12357 12358 12359 12360 12361 12362 12363 12364 12365 12366 12367 12368 12369 12370 12371 12372 12373 12374 12375 12376 12377 12378 12379 12380 12381 12382 12383 12384 12385 12386 12387 12388 12389 12390 12391 12392 12393 12394 12395 12396 12397 12398 12399 12400 12401 12402 12403 12404 12405 12406 12407 12408 12409 12410 12411 12412 12413 12414 12415 12416 12417 12418 12419 12420 12421 12422 12423 12424 12425 12426 12427 12428 12429 12430 12431 12432 12433 12434 12435 12436 12437 12438 12439 12440 12441 12442 12443 12444 12445 12446 12447 12448 12449 12450 12451 12452 12453 12454 12455 12456 12457 12458 12459 12460 12461 12462 12463 12464 12465 12466 12467 12468 12469 12470 12471 12472 12473 12474 12475 12476 12477 12478 12479 12480 12481 12482 12483 12484 12485 12486 12487 12488 12489 12490 12491 12492 12493 12494 12495 12496 12497 12498 12499 12500 12501 12502 12503 12504 12505 12506 12507 12508 12509 12510 12511 
12512 12513 12514 12515 12516 12517 12518 12519 12520 12521 12522 12523 12524 12525 12526 12527 12528 12529 12530 12531 12532 12533 12534 12535 12536 12537 12538 12539 12540 12541 12542 12543 12544 12545 12546 12547 12548 12549 12550 12551 12552 12553 12554 12555 12556 12557 12558 12559 12560 12561 12562 12563 12564 12565 12566 12567 12568 12569 12570 12571 12572 12573 12574 12575 12576 12577 12578 12579 12580 12581 12582 12583 12584 12585 12586 12587 12588 12589 12590 12591 12592 12593 12594 12595 12596 12597 12598 12599 12600 12601 12602 12603 12604 12605 12606 12607 12608 12609 12610 12611 12612 12613 12614 12615 12616 12617 12618 12619 12620 12621 12622 12623 12624 12625 12626 12627 12628 12629 12630 12631 12632 12633 12634 12635 12636 12637 12638 12639 12640 12641 12642 12643 12644 12645 12646 12647 12648 12649 12650 12651 12652 12653 12654 12655 12656 12657 12658 12659 12660 12661 12662 12663 12664 12665 12666 12667 12668 12669 12670 12671 12672 12673 12674 12675 12676 12677 12678 12679 12680 12681 12682 12683 12684 12685 12686 12687 12688 12689 12690 12691 12692 12693 12694 12695 12696 12697 12698 12699 12700 12701 12702 12703 12704 12705 12706 12707 12708 12709 12710 12711 12712 12713 12714 12715 12716 12717 12718 12719 12720 12721 12722 12723 12724 12725 12726 12727 12728 12729 12730 12731 12732 12733 12734 12735 12736 12737 12738 12739 12740 12741 12742 12743 12744 12745 12746 12747 12748 12749 12750 12751 12752 12753 12754 12755 12756 12757 12758 12759 12760 12761 12762 12763 12764 12765 12766 12767 12768 12769 12770 12771 12772 12773 12774 12775 12776 12777 12778 12779 12780 12781 12782 12783 12784 12785 12786 12787 12788 12789 12790 12791 12792 12793 12794 12795 12796 12797 12798 12799 12800 12801 12802 12803 12804 12805 12806 12807 12808 12809 12810 12811 12812 12813 12814 12815 12816 12817 12818 12819 12820 12821 12822 12823 12824 12825 12826 12827 12828 12829 12830 12831 12832 12833 12834 12835 12836 12837 12838 12839 12840 12841 12842 12843 12844 12845 12846 12847 12848 12849 12850 12851 12852 12853 12854 12855 12856 12857 12858 12859 12860 12861 12862 12863 12864 12865 12866 12867 12868 12869 12870 12871 12872 12873 12874 12875 12876 12877 12878 12879 12880 12881 12882 12883 12884 12885 12886 12887 12888 12889 12890 12891 12892 12893 12894 12895 12896 12897 12898 12899 12900 12901 12902 12903 12904 12905 12906 12907 12908 12909 12910 12911 12912 12913 12914 12915 12916 12917 12918 12919 12920 12921 12922 12923 12924 12925 12926 12927 12928 12929 12930 12931 12932 12933 12934 12935 12936 12937 12938 12939 12940 12941 12942 12943 12944 12945 12946 12947 12948 12949 12950 12951 12952 12953 12954 12955 12956 12957 12958 12959 12960 12961 12962 12963 12964 12965 12966 12967 12968 12969 12970 12971 12972 12973 12974 12975 12976 12977 12978 12979 12980 12981 12982 12983 12984 12985 12986 12987 12988 12989 12990 12991 12992 12993 12994 12995 12996 12997 12998 12999 13000 13001 13002 13003 13004 13005 13006 13007 13008 13009 13010 13011 13012 13013 13014 13015 13016 13017 13018 13019 13020 13021 13022 13023 13024 13025 13026 13027 13028 13029 13030 13031 13032 13033 13034 13035 13036 13037 13038 13039 13040 13041 13042 13043 13044 13045 13046 13047 13048 13049 13050 13051 13052 13053 13054 13055 13056 13057 13058 13059 13060 13061 13062 13063 13064 13065 13066 13067 13068 13069 13070 13071 13072 13073 13074 13075 13076 13077 13078 13079 13080 13081 13082 13083 13084 13085 13086 13087 13088 13089 13090 13091 13092 13093 13094 13095 13096 13097 13098 13099 13100 13101 13102 13103 
13104 13105 13106 13107 13108 13109 13110 13111 13112 13113 13114 13115 13116 13117 13118 13119 13120 13121 13122 13123 13124 13125 13126 13127 13128 13129 13130 13131 13132 13133 13134 13135 13136 13137 13138 13139 13140 13141 13142 13143 13144 13145 13146 13147 13148 13149 13150 13151 13152 13153 13154 13155 13156 13157 13158 13159 13160 13161 13162 13163 13164 13165 13166 13167 13168 13169 13170 13171 13172 13173 13174 13175 13176 13177 13178 13179 13180 13181 13182 13183 13184 13185 13186 13187 13188 13189 13190 13191 13192 13193 13194 13195 13196 13197 13198 13199 13200 13201 13202 13203 13204 13205 13206 13207 13208 13209 13210 13211 13212 13213 13214 13215 13216 13217 13218 13219 13220 13221 13222 13223 13224 13225 13226 13227 13228 13229 13230 13231 13232 13233 13234 13235 13236 13237 13238 13239 13240 13241 13242 13243 13244 13245 13246 13247 13248 13249 13250 13251 13252 13253 13254 13255 13256 13257 13258 13259 13260 13261 13262 13263 13264 13265 13266 13267 13268 13269 13270 13271 13272 13273 13274 13275 13276 13277 13278 13279 13280 13281 13282 13283 13284 13285 13286 13287 13288 13289 13290 13291 13292 13293 13294 13295 13296 13297 13298 13299 13300 13301 13302 13303 13304 13305 13306 13307 13308 13309 13310 13311 13312 13313 13314 13315 13316 13317 13318 13319 13320 13321 13322 13323 13324 13325 13326 13327 13328 13329 13330 13331 13332 13333 13334 13335 13336 13337 13338 13339 13340 13341 13342 13343 13344 13345 13346 13347 13348 13349 13350 13351 13352 13353 13354 13355 13356 13357 13358 13359 13360 13361 13362 13363 13364 13365 13366 13367 13368 13369 13370 13371 13372 13373 13374 13375 13376 13377 13378 13379 13380 13381 13382 13383 13384 13385 13386 13387 13388 13389 13390 13391 13392 13393 13394 13395 13396 13397 13398 13399 13400 13401 13402 13403 13404 13405 13406 13407 13408 13409 13410 13411 13412 13413 13414 13415 13416 13417 13418 13419 13420 13421 13422 13423 13424 13425 13426 13427 13428 13429 13430 13431 13432 13433 13434 13435 13436 13437 13438 13439 13440 13441 13442 13443 13444 13445 13446 13447 13448 13449 13450 13451 13452 13453 13454 13455 13456 13457 13458 13459 13460 13461 13462 13463 13464 13465 13466 13467 13468 13469 13470 13471 13472 13473 13474 13475 13476 13477 13478 13479 13480 13481 13482 13483 13484 13485 13486 13487 13488 13489 13490 13491 13492 13493 13494 13495 13496 13497 13498 13499 13500 13501 13502 13503 13504 13505 13506 13507 13508 13509 13510 13511 13512 13513 13514 13515 13516 13517 13518 13519 13520 13521 13522 13523 13524 13525 13526 13527 13528 13529 13530 13531 13532 13533 13534 13535 13536 13537 13538 13539 13540 13541 13542 13543 13544 13545 13546 13547 13548 13549 13550 13551 13552 13553 13554 13555 13556 13557 13558 13559 13560 13561 13562 13563 13564 13565 13566 13567 13568 13569 13570 13571 13572 13573 13574 13575 13576 13577 13578 13579 13580 13581 13582 13583 13584 13585 13586 13587 13588 13589 13590 13591 13592 13593 13594 13595 13596 13597 13598 13599 13600 13601 13602 13603 13604 13605 13606 13607 13608 13609 13610 13611 13612 13613 13614 13615 13616 13617 13618 13619 13620 13621 13622 13623 13624 13625 13626 13627 13628 13629 13630 13631 13632 13633 13634 13635 13636 13637 13638 13639 13640 13641 13642 13643 13644 13645 13646 13647 13648 13649 13650 13651 13652 13653 13654 13655 13656 13657 13658 13659 13660 13661 13662 13663 13664 13665 13666 13667 13668 13669 13670 13671 13672 13673 13674 13675 13676 13677 13678 13679 13680 13681 13682 13683 13684 13685 13686 13687 13688 13689 13690 13691 13692 13693 13694 13695 
13696 13697 13698 13699 13700 13701 13702 13703 13704 13705 13706 13707 13708 13709 13710 13711 13712 13713 13714 13715 13716 13717 13718 13719 13720 13721 13722 13723 13724 13725 13726 13727 13728 13729 13730 13731 13732 13733 13734 13735 13736 13737 13738 13739 13740 13741 13742 13743 13744 13745 13746 13747 13748 13749 13750 13751 13752 13753 13754 13755 13756 13757 13758 13759 13760 13761 13762 13763 13764 13765 13766 13767 13768 13769 13770 13771 13772 13773 13774 13775 13776 13777 13778 13779 13780 13781 13782 13783 13784 13785 13786 13787 13788 13789 13790 13791 13792 13793 13794 13795 13796 13797 13798 13799 13800 13801 13802 13803 13804 13805 13806 13807 13808 13809 13810 13811 13812 13813 13814 13815 13816 13817 13818 13819 13820 13821 13822 13823 13824 13825 13826 13827 13828 13829 13830 13831 13832 13833 13834 13835 13836 13837 13838 13839 13840 13841 13842 13843 13844 13845 13846 13847 13848 13849 13850 13851 13852 13853 13854 13855 13856 13857 13858 13859 13860 13861 13862 13863 13864 13865 13866 13867 13868 13869 13870 13871 13872 13873 13874 13875 13876 13877 13878 13879 13880 13881 13882 13883 13884 13885 13886 13887 13888 13889 13890 13891 13892 13893 13894 13895 13896 13897 13898 13899 13900 13901 13902 13903 13904 13905 13906 13907 13908 13909 13910 13911 13912 13913 13914 13915 13916 13917 13918 13919 13920 13921 13922 13923 13924 13925 13926 13927 13928 13929 13930 13931 13932 13933 13934 13935 13936 13937 13938 13939 13940 13941 13942 13943 13944 13945 13946 13947 13948 13949 13950 13951 13952 13953 13954 13955 13956 13957 13958 13959 13960 13961 13962 13963 13964 13965 13966 13967 13968 13969 13970 13971 13972 13973 13974 13975 13976 13977 13978 13979 13980 13981 13982 13983 13984 13985 13986 13987 13988 13989 13990 13991 13992 13993 13994 13995 13996 13997 13998 13999 14000 14001 14002 14003 14004 14005 14006 14007 14008 14009 14010 14011 14012 14013 14014 14015 14016 14017 14018 14019 14020 14021 14022 14023 14024 14025 14026 14027 14028 14029 14030 14031 14032 14033 14034 14035 14036 14037 14038 14039 14040 14041 14042 14043 14044 14045 14046 14047 14048 14049 14050 14051 14052 14053 14054 14055 14056 14057 14058 14059 14060 14061 14062 14063 14064 14065 14066 14067 14068 14069 14070 14071 14072 14073 14074 14075 14076 14077 14078 14079 14080 14081 14082 14083 14084 14085 14086 14087 14088 14089 14090 14091 14092 14093 14094 14095 14096 14097 14098 14099 14100 14101 14102 14103 14104 14105 14106 14107 14108 14109 14110 14111 14112 14113 14114 14115 14116 14117 14118 14119 14120 14121 14122 14123 14124 14125 14126 14127 14128 14129 14130 14131 14132 14133 14134 14135 14136 14137 14138 14139 14140 14141 14142 14143 14144 14145 14146 14147 14148 14149 14150 14151 14152 14153 14154 14155 14156 14157 14158 14159 14160 14161 14162 14163 14164 14165 14166 14167 14168 14169 14170 14171 14172 14173 14174 14175 14176 14177 14178 14179 14180 14181 14182 14183 14184 14185 14186 14187 14188 14189 14190 14191 14192 14193 14194 14195 14196 14197 14198 14199 14200 14201 14202 14203 14204 14205 14206 14207 14208 14209 14210 14211 14212 14213 14214 14215 14216 14217 14218 14219 14220 14221 14222 14223 14224 14225 14226 14227 14228 14229 14230 14231 14232 14233 14234 14235 14236 14237 14238 14239 14240 14241 14242 14243 14244 14245 14246 14247 14248 14249 14250 14251 14252 14253 14254 14255 14256 14257 14258 14259 14260 14261 14262 14263 14264 14265 14266 14267 14268 14269 14270 14271 14272 14273 14274 14275 14276 14277 14278 14279 14280 14281 14282 14283 14284 14285 14286 14287 
14288 14289 14290 14291 14292 14293 14294 14295 14296 14297 14298 14299 14300 14301 14302 14303 14304 14305 14306 14307 14308 14309 14310 14311 14312 14313 14314 14315 14316 14317 14318 14319 14320 14321 14322 14323 14324 14325 14326 14327 14328 14329 14330 14331 14332 14333 14334 14335 14336 14337 14338 14339 14340 14341 14342 14343 14344 14345 14346 14347 14348 14349 14350 14351 14352 14353 14354 14355 14356 14357 14358 14359 14360 14361 14362 14363 14364 14365 14366 14367 14368 14369 14370 14371 14372 14373 14374 14375 14376 14377 14378 14379 14380 14381 14382 14383 14384 14385 14386 14387 14388 14389 14390 14391 14392 14393 14394 14395 14396 14397 14398 14399 14400 14401 14402 14403 14404 14405 14406 14407 14408 14409 14410 14411 14412 14413 14414 14415 14416 14417 14418 14419 14420 14421 14422 14423 14424 14425 14426 14427 14428 14429 14430 14431 14432 14433 14434 14435 14436 14437 14438 14439 14440 14441 14442 14443 14444 14445 14446 14447 14448 14449 14450 14451 14452 14453 14454 14455 14456 14457 14458 14459 14460 14461 14462 14463 14464 14465 14466 14467 14468 14469 14470 14471 14472 14473 14474 14475 14476 14477 14478 14479 14480 14481 14482 14483 14484 14485 14486 14487 14488 14489 14490 14491 14492 14493 14494 14495 14496 14497 14498 14499 14500 14501 14502 14503 14504 14505 14506 14507 14508 14509 14510 14511 14512 14513 14514 14515 14516 14517 14518 14519 14520 14521 14522 14523 14524 14525 14526 14527 14528 14529 14530 14531 14532 14533 14534 14535 14536 14537 14538 14539 14540 14541 14542 14543 14544 14545 14546 14547 14548 14549 14550 14551 14552 14553 14554 14555 14556 14557 14558 14559 14560 14561 14562 14563 14564 14565 14566 14567 14568 14569 14570 14571 14572 14573 14574 14575 14576 14577 14578 14579 14580 14581 14582 14583 14584 14585 14586 14587 14588 14589 14590 14591 14592 14593 14594 14595 14596 14597 14598 14599 14600 14601 14602 14603 14604 14605 14606 14607 14608 14609 14610 14611 14612 14613 14614 14615 14616 14617 14618 14619 14620 14621 14622 14623 14624 14625 14626 14627 14628 14629 14630 14631 14632 14633 14634 14635 14636 14637 14638 14639 14640 14641 14642 14643 14644 14645 14646 14647 14648 14649 14650 14651 14652 14653 14654 14655 14656 14657 14658 14659 14660 14661 14662 14663 14664 14665 14666 14667 14668 14669 14670 14671 14672 14673 14674 14675 14676 14677 14678 14679 14680 14681 14682 14683 14684 14685 14686 14687 14688 14689 14690 14691 14692 14693 14694 14695 14696 14697 14698 14699 14700 14701 14702 14703 14704 14705 14706 14707 14708 14709 14710 14711 14712 14713 14714 14715 14716 14717 14718 14719 14720 14721 14722 14723 14724 14725 14726 14727 14728 14729 14730 14731 14732 14733 14734 14735 14736 14737 14738 14739 14740 14741 14742 14743 14744 14745 14746 14747 14748 14749 14750 14751 14752 14753 14754 14755 14756 14757 14758 14759 14760 14761 14762 14763 14764 14765 14766 14767 14768 14769 14770 14771 14772 14773 14774 14775 14776 14777 14778 14779 14780 14781 14782 14783 14784 14785 14786 14787 14788 14789 14790 14791 14792 14793 14794 14795 14796 14797 14798 14799 14800 14801 14802 14803 14804 14805 14806 14807 14808 14809 14810 14811 14812 14813 14814 14815 14816 14817 14818 14819 14820 14821 14822 14823 14824 14825 14826 14827 14828 14829 14830 14831 14832 14833 14834 14835 14836 14837 14838 14839 14840 14841 14842 14843 14844 14845 14846 14847 14848 14849 14850 14851 14852 14853 14854 14855 14856 14857 14858 14859 14860 14861 14862 14863 14864 14865 14866 14867 14868 14869 14870 14871 14872 14873 14874 14875 14876 14877 14878 14879 
// SPDX-License-Identifier: GPL-2.0
/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Linutronix GmbH, Thomas Gleixner <tglx@kernel.org>
 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
 *  Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/reboot.h>
#include <linux/vmstat.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/hugetlb.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
#include <linux/trace_events.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/compat.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/namei.h>
#include <linux/parser.h>
#include <linux/sched/clock.h>
#include <linux/sched/mm.h>
#include <linux/proc_ns.h>
#include <linux/mount.h>
#include <linux/min_heap.h>
#include <linux/highmem.h>
#include <linux/pgtable.h>
#include <linux/buildid.h>
#include <linux/task_work.h>
#include <linux/percpu-rwsem.h>
#include <linux/unwind_deferred.h>
#include <linux/kvm_types.h>

#include "internal.h"

#include <asm/irq_regs.h>

typedef int (*remote_function_f)(void *);

struct remote_function_call {
	struct task_struct	*p;
	remote_function_f	func;
	void			*info;
	int			ret;
};

static void remote_function(void *data)
{
	struct remote_function_call *tfc = data;
	struct task_struct *p = tfc->p;

	if (p) {
		/* -EAGAIN */
		if (task_cpu(p) != smp_processor_id())
			return;

		/*
		 * Now that we're on right CPU with IRQs disabled, we can test
		 * if we hit the right task without races.
		 */
		tfc->ret = -ESRCH; /* No such (running) process */
		if (p != current)
			return;
	}

	tfc->ret = tfc->func(tfc->info);
}

/**
 * task_function_call - call a function on the cpu on which a task runs
 * @p:		the task to evaluate
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func when the task is currently running. This might
 * be on the current CPU, which just calls the function directly.  This will
 * retry due to any failures in smp_call_function_single(), such as if the
 * task_cpu() goes offline concurrently.
 *
 * returns @func return value or -ESRCH or -ENXIO when the process isn't running
 */
static int
task_function_call(struct task_struct *p, remote_function_f func, void *info)
{
	struct remote_function_call data = {
		.p	= p,
		.func	= func,
		.info	= info,
		.ret	= -EAGAIN,
	};
	int ret;

	for (;;) {
		ret = smp_call_function_single(task_cpu(p), remote_function,
					       &data, 1);
		if (!ret)
			ret = data.ret;

		if (ret != -EAGAIN)
			break;

		cond_resched();
	}

	return ret;
}
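/*
 * Illustrative sketch (editorial, not part of the original source): how the
 * cross-call helpers above are typically used.  The callback runs with IRQs
 * disabled on the CPU where @p is currently running; "read_some_counter",
 * "some_percpu_counter" and "mi" are hypothetical names used purely for
 * illustration.
 *
 *	static int read_some_counter(void *info)
 *	{
 *		struct my_info *mi = info;	// runs on task_cpu(p), IRQs off
 *
 *		mi->val = this_cpu_read(some_percpu_counter);
 *		return 0;
 *	}
 *
 *	err = task_function_call(p, read_some_counter, &mi);
 *
 * If @p turns out not to be current on that CPU, remote_function() leaves
 * -ESRCH in the result; event_function_call() below shows the usual pattern
 * of retrying or falling back under ctx->lock.
 */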
/**
 * cpu_function_call - call a function on the cpu
 * @cpu:	target cpu to queue this function
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func on the remote cpu.
 *
 * returns: @func return value or -ENXIO when the cpu is offline
 */
static int cpu_function_call(int cpu, remote_function_f func, void *info)
{
	struct remote_function_call data = {
		.p	= NULL,
		.func	= func,
		.info	= info,
		.ret	= -ENXIO, /* No such CPU */
	};

	smp_call_function_single(cpu, remote_function, &data, 1);

	return data.ret;
}

enum event_type_t {
	EVENT_FLEXIBLE	= 0x01,
	EVENT_PINNED	= 0x02,
	EVENT_TIME	= 0x04,
	EVENT_FROZEN	= 0x08, /* see ctx_resched() for details */
	EVENT_CPU	= 0x10,
	EVENT_CGROUP	= 0x20,

	/*
	 * EVENT_GUEST is set when scheduling in/out events between the host
	 * and a guest with a mediated vPMU.  Among other things, EVENT_GUEST
	 * is used:
	 *
	 *  - In for_each_epc() to skip PMUs that don't support events in a
	 *    MEDIATED_VPMU guest, i.e. don't need to be context switched.
	 *  - To indicate the start/end point of the events in a guest.  Guest
	 *    running time is deducted for host-only (exclude_guest) events.
	 */
	EVENT_GUEST	= 0x40,

	EVENT_FLAGS	= EVENT_CGROUP | EVENT_GUEST,

	/* compound helpers */
	EVENT_ALL	= EVENT_FLEXIBLE | EVENT_PINNED,
	EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN,
};

static inline void __perf_ctx_lock(struct perf_event_context *ctx)
{
	raw_spin_lock(&ctx->lock);
	WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN);
}

static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
			  struct perf_event_context *ctx)
{
	__perf_ctx_lock(&cpuctx->ctx);
	if (ctx)
		__perf_ctx_lock(ctx);
}

static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
{
	/*
	 * If ctx_sched_in() didn't again set any ALL flags, clean up
	 * after ctx_sched_out() by clearing is_active.
	 */
	if (ctx->is_active & EVENT_FROZEN) {
		if (!(ctx->is_active & EVENT_ALL))
			ctx->is_active = 0;
		else
			ctx->is_active &= ~EVENT_FROZEN;
	}
	raw_spin_unlock(&ctx->lock);
}

static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
			    struct perf_event_context *ctx)
{
	if (ctx)
		__perf_ctx_unlock(ctx);
	__perf_ctx_unlock(&cpuctx->ctx);
}

typedef struct {
	struct perf_cpu_context *cpuctx;
	struct perf_event_context *ctx;
} class_perf_ctx_lock_t;

static inline void class_perf_ctx_lock_destructor(class_perf_ctx_lock_t *_T)
{
	perf_ctx_unlock(_T->cpuctx, _T->ctx);
}

static inline class_perf_ctx_lock_t
class_perf_ctx_lock_constructor(struct perf_cpu_context *cpuctx,
				struct perf_event_context *ctx)
{
	perf_ctx_lock(cpuctx, ctx);
	return (class_perf_ctx_lock_t){ cpuctx, ctx };
}

#define TASK_TOMBSTONE ((void *)-1L)

static bool is_kernel_event(struct perf_event *event)
{
	return READ_ONCE(event->owner) == TASK_TOMBSTONE;
}

static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);

struct perf_event_context *perf_cpu_task_ctx(void)
{
	lockdep_assert_irqs_disabled();

	return this_cpu_ptr(&perf_cpu_context)->task_ctx;
}

/*
 * On task ctx scheduling...
 *
 * When !ctx->nr_events a task context will not be scheduled.  This means
 * we can disable the scheduler hooks (for performance) without leaving
 * pending task ctx state.
 *
 * This however results in two special cases:
 *
 *  - removing the last event from a task ctx; this is relatively straight
 *    forward and is done in __perf_remove_from_context.
 *
 *  - adding the first event to a task ctx; this is tricky because we cannot
 *    rely on ctx->is_active and therefore cannot use event_function_call().
 *    See perf_install_in_context().
 *
 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
 */
typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
			struct perf_event_context *, void *);

struct event_function_struct {
	struct perf_event *event;
	event_f func;
	void *data;
};

static int event_function(void *info)
{
	struct event_function_struct *efs = info;
	struct perf_event *event = efs->event;
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
	struct perf_event_context *task_ctx = cpuctx->task_ctx;
	int ret = 0;

	lockdep_assert_irqs_disabled();

	perf_ctx_lock(cpuctx, task_ctx);
	/*
	 * Since we do the IPI call without holding ctx->lock things can have
	 * changed, double check we hit the task we set out to hit.
	 */
	if (ctx->task) {
		if (ctx->task != current) {
			ret = -ESRCH;
			goto unlock;
		}

		/*
		 * We only use event_function_call() on established contexts,
		 * and event_function() is only ever called when active (or
		 * rather, we'll have bailed in task_function_call() or the
		 * above ctx->task != current test), therefore we must have
		 * ctx->is_active here.
		 */
		WARN_ON_ONCE(!ctx->is_active);
		/*
		 * And since we have ctx->is_active, cpuctx->task_ctx must
		 * match.
		 */
		WARN_ON_ONCE(task_ctx != ctx);
	} else {
		WARN_ON_ONCE(&cpuctx->ctx != ctx);
	}

	efs->func(event, cpuctx, ctx, efs->data);
unlock:
	perf_ctx_unlock(cpuctx, task_ctx);

	return ret;
}

static void event_function_call(struct perf_event *event, event_f func, void *data)
{
	struct perf_event_context *ctx = event->ctx;
	struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
	struct perf_cpu_context *cpuctx;
	struct event_function_struct efs = {
		.event = event,
		.func = func,
		.data = data,
	};

	if (!event->parent) {
		/*
		 * If this is a !child event, we must hold ctx::mutex to
		 * stabilize the event->ctx relation. See
		 * perf_event_ctx_lock().
		 */
		lockdep_assert_held(&ctx->mutex);
	}

	if (!task) {
		cpu_function_call(event->cpu, event_function, &efs);
		return;
	}

	if (task == TASK_TOMBSTONE)
		return;

again:
	if (!task_function_call(task, event_function, &efs))
		return;

	local_irq_disable();
	cpuctx = this_cpu_ptr(&perf_cpu_context);
	perf_ctx_lock(cpuctx, ctx);
	/*
	 * Reload the task pointer, it might have been changed by
	 * a concurrent perf_event_context_sched_out().
	 */
	task = ctx->task;
	if (task == TASK_TOMBSTONE)
		goto unlock;
	if (ctx->is_active) {
		perf_ctx_unlock(cpuctx, ctx);
		local_irq_enable();
		goto again;
	}
	func(event, NULL, ctx, data);
unlock:
	perf_ctx_unlock(cpuctx, ctx);
	local_irq_enable();
}
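/*
 * Illustrative sketch (editorial; the callback name below is hypothetical):
 * modifications of event/context state are written as an event_f callback
 * and routed through event_function_call() above, which guarantees the
 * callback runs with ctx->lock held, either via IPI on the CPU that owns an
 * active context or directly under ctx->lock when the context is inactive.
 *
 *	static void __hypothetical_event_op(struct perf_event *event,
 *					    struct perf_cpu_context *cpuctx,
 *					    struct perf_event_context *ctx,
 *					    void *data)
 *	{
 *		// ctx->lock is held here; safe to modify event/ctx state
 *	}
 *
 *	event_function_call(event, __hypothetical_event_op, NULL);
 */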
/*
 * Similar to event_function_call() + event_function(), but hard assumes IRQs
 * are already disabled and we're on the right CPU.
 */
static void event_function_local(struct perf_event *event, event_f func, void *data)
{
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
	struct task_struct *task = READ_ONCE(ctx->task);
	struct perf_event_context *task_ctx = NULL;

	lockdep_assert_irqs_disabled();

	if (task) {
		if (task == TASK_TOMBSTONE)
			return;

		task_ctx = ctx;
	}

	perf_ctx_lock(cpuctx, task_ctx);

	task = ctx->task;
	if (task == TASK_TOMBSTONE)
		goto unlock;

	if (task) {
		/*
		 * We must be either inactive or active and the right task,
		 * otherwise we're screwed, since we cannot IPI to somewhere
		 * else.
		 */
		if (ctx->is_active) {
			if (WARN_ON_ONCE(task != current))
				goto unlock;
			if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
				goto unlock;
		}
	} else {
		WARN_ON_ONCE(&cpuctx->ctx != ctx);
	}

	func(event, cpuctx, ctx, data);
unlock:
	perf_ctx_unlock(cpuctx, task_ctx);
}

#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
		       PERF_FLAG_FD_OUTPUT  |\
		       PERF_FLAG_PID_CGROUP |\
		       PERF_FLAG_FD_CLOEXEC)

/*
 * branch priv levels that need permission checks
 */
#define PERF_SAMPLE_BRANCH_PERM_PLM \
	(PERF_SAMPLE_BRANCH_KERNEL |\
	 PERF_SAMPLE_BRANCH_HV)

/*
 * perf_sched_events : >0 events exist
 */

static void perf_sched_delayed(struct work_struct *work);
DEFINE_STATIC_KEY_FALSE(perf_sched_events);
static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;

static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);

static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_namespaces_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;
static atomic_t nr_ksymbol_events __read_mostly;
static atomic_t nr_bpf_events __read_mostly;
static atomic_t nr_cgroup_events __read_mostly;
static atomic_t nr_text_poke_events __read_mostly;
static atomic_t nr_build_id_events __read_mostly;

static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
static cpumask_var_t perf_online_mask;
static cpumask_var_t perf_online_core_mask;
static cpumask_var_t perf_online_die_mask;
static cpumask_var_t perf_online_cluster_mask;
static cpumask_var_t perf_online_pkg_mask;
static cpumask_var_t perf_online_sys_mask;
static struct kmem_cache *perf_event_cache;

#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU
static DEFINE_PER_CPU(bool, guest_ctx_loaded);

static __always_inline bool is_guest_mediated_pmu_loaded(void)
{
	return __this_cpu_read(guest_ctx_loaded);
}
#else
static __always_inline bool is_guest_mediated_pmu_loaded(void)
{
	return false;
}
#endif

/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 2;
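/*
 * Illustrative note (editorial, not from the original source): with the
 * default of 2 above, an unprivileged perf_event_open() that asks for
 * kernel profiling is rejected unless the caller has CAP_PERFMON or
 * CAP_SYS_ADMIN.  Administrators typically relax this via the sysctl
 * exposed below, e.g.:
 *
 *	# sysctl kernel.perf_event_paranoid=1
 */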
/*
 * Minimum for 512 kiB + 1 user control page. 'free' kiB per user.
 */
static int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024);

/*
 * max perf event sample rate
 */
#define DEFAULT_MAX_SAMPLE_RATE		100000
#define DEFAULT_SAMPLE_PERIOD_NS	(NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
#define DEFAULT_CPU_TIME_MAX_PERCENT	25

int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
static int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;

static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;

static int perf_sample_allowed_ns __read_mostly =
	DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;

static void update_perf_cpu_limits(void)
{
	u64 tmp = perf_sample_period_ns;

	tmp *= sysctl_perf_cpu_time_max_percent;
	tmp = div_u64(tmp, 100);
	if (!tmp)
		tmp = 1;

	WRITE_ONCE(perf_sample_allowed_ns, tmp);
}

static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);

static int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write,
					      void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;
	int perf_cpu = sysctl_perf_cpu_time_max_percent;
	/*
	 * If throttling is disabled don't allow the write:
	 */
	if (write && (perf_cpu == 100 || perf_cpu == 0))
		return -EINVAL;

	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (ret || !write)
		return ret;

	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
	update_perf_cpu_limits();

	return 0;
}

static int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write,
					     void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	if (ret || !write)
		return ret;

	if (sysctl_perf_cpu_time_max_percent == 100 ||
	    sysctl_perf_cpu_time_max_percent == 0) {
		printk(KERN_WARNING
		       "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
		WRITE_ONCE(perf_sample_allowed_ns, 0);
	} else {
		update_perf_cpu_limits();
	}

	return 0;
}

static const struct ctl_table events_core_sysctl_table[] = {
	/*
	 * User-space relies on this file as a feature check for
	 * perf_events being enabled. It's an ABI, do not remove!
	 */
	{
		.procname	= "perf_event_paranoid",
		.data		= &sysctl_perf_event_paranoid,
		.maxlen		= sizeof(sysctl_perf_event_paranoid),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "perf_event_mlock_kb",
		.data		= &sysctl_perf_event_mlock,
		.maxlen		= sizeof(sysctl_perf_event_mlock),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "perf_event_max_sample_rate",
		.data		= &sysctl_perf_event_sample_rate,
		.maxlen		= sizeof(sysctl_perf_event_sample_rate),
		.mode		= 0644,
		.proc_handler	= perf_event_max_sample_rate_handler,
		.extra1		= SYSCTL_ONE,
	},
	{
		.procname	= "perf_cpu_time_max_percent",
		.data		= &sysctl_perf_cpu_time_max_percent,
		.maxlen		= sizeof(sysctl_perf_cpu_time_max_percent),
		.mode		= 0644,
		.proc_handler	= perf_cpu_time_max_percent_handler,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE_HUNDRED,
	},
};

static int __init init_events_core_sysctls(void)
{
	register_sysctl_init("kernel", events_core_sysctl_table);
	return 0;
}
core_initcall(init_events_core_sysctls);

/*
 * perf samples are done in some very critical code paths (NMIs).
 * If they take too much CPU time, the system can lock up and not
 * get any real work done.  This will drop the sample rate when
 * we detect that events are taking too long.
 */
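/*
 * Worked example (editorial, using the defaults defined above): with
 * DEFAULT_MAX_SAMPLE_RATE = 100000 the sample period is
 * NSEC_PER_SEC / 100000 = 10000 ns, and with
 * DEFAULT_CPU_TIME_MAX_PERCENT = 25 update_perf_cpu_limits() allows
 * 10000 * 25 / 100 = 2500 ns of handler time per sample before the
 * machinery below starts lowering kernel.perf_event_max_sample_rate.
 */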
#define NR_ACCUMULATED_SAMPLES 128
static DEFINE_PER_CPU(u64, running_sample_length);

static u64 __report_avg;
static u64 __report_allowed;

static void perf_duration_warn(struct irq_work *w)
{
	printk_ratelimited(KERN_INFO
		"perf: interrupt took too long (%lld > %lld), lowering "
		"kernel.perf_event_max_sample_rate to %d\n",
		__report_avg, __report_allowed,
		sysctl_perf_event_sample_rate);
}

static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);

void perf_sample_event_took(u64 sample_len_ns)
{
	u64 max_len = READ_ONCE(perf_sample_allowed_ns);
	u64 running_len;
	u64 avg_len;
	u32 max;

	if (max_len == 0)
		return;

	/* Decay the counter by 1 average sample. */
	running_len = __this_cpu_read(running_sample_length);
	running_len -= running_len/NR_ACCUMULATED_SAMPLES;
	running_len += sample_len_ns;
	__this_cpu_write(running_sample_length, running_len);

	/*
	 * Note: this will be biased artificially low until we have
	 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
	 * from having to maintain a count.
	 */
	avg_len = running_len/NR_ACCUMULATED_SAMPLES;
	if (avg_len <= max_len)
		return;

	__report_avg = avg_len;
	__report_allowed = max_len;

	/*
	 * Compute a throttle threshold 25% below the current duration.
	 */
	avg_len += avg_len / 4;
	max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
	if (avg_len < max)
		max /= (u32)avg_len;
	else
		max = 1;

	WRITE_ONCE(perf_sample_allowed_ns, avg_len);
	WRITE_ONCE(max_samples_per_tick, max);

	sysctl_perf_event_sample_rate = max * HZ;
	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;

	if (!irq_work_queue(&perf_duration_work)) {
		early_printk("perf: interrupt took too long (%lld > %lld), lowering "
			     "kernel.perf_event_max_sample_rate to %d\n",
			     __report_avg, __report_allowed,
			     sysctl_perf_event_sample_rate);
	}
}

static atomic64_t perf_event_id;

static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);

void __weak perf_event_print_debug(void)	{ }

static inline u64 perf_clock(void)
{
	return local_clock();
}

static inline u64 perf_event_clock(struct perf_event *event)
{
	return event->clock();
}

/*
 * State based event timekeeping...
 *
 * The basic idea is to use event->state to determine which (if any) time
 * fields to increment with the current delta. This means we only need to
 * update timestamps when we change state or when they are explicitly requested
 * (read).
 *
 * Event groups make things a little more complicated, but not terribly so. The
 * rules for a group are that if the group leader is OFF the entire group is
 * OFF, irrespective of what the group member states are. This results in
 * __perf_effective_state().
 *
 * A further ramification is that when a group leader flips between OFF and
 * !OFF, we need to update all group member times.
 *
 *
 * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
 * need to make sure the relevant context time is updated before we try and
 * update our timestamps.
 */
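/*
 * Worked example (editorial): consider an event that becomes INACTIVE
 * (enabled) at t=0, is scheduled onto hardware (ACTIVE) at t=4 and is read
 * at t=10, with the timestamp refreshed at each transition.  The helpers
 * below then report total_time_enabled = 10 (INACTIVE and above accrue the
 * delta) and total_time_running = 6 (only ACTIVE accrues it), which is the
 * enabled/running pair user space sees via
 * PERF_FORMAT_TOTAL_TIME_ENABLED/RUNNING.
 */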
static __always_inline enum perf_event_state
__perf_effective_state(struct perf_event *event)
{
	struct perf_event *leader = event->group_leader;

	if (leader->state <= PERF_EVENT_STATE_OFF)
		return leader->state;

	return event->state;
}

static __always_inline void
__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
{
	enum perf_event_state state = __perf_effective_state(event);
	u64 delta = now - event->tstamp;

	*enabled = event->total_time_enabled;
	if (state >= PERF_EVENT_STATE_INACTIVE)
		*enabled += delta;

	*running = event->total_time_running;
	if (state >= PERF_EVENT_STATE_ACTIVE)
		*running += delta;
}

static void perf_event_update_time(struct perf_event *event)
{
	u64 now = perf_event_time(event);

	__perf_update_times(event, now, &event->total_time_enabled,
					&event->total_time_running);
	event->tstamp = now;
}

static void perf_event_update_sibling_time(struct perf_event *leader)
{
	struct perf_event *sibling;

	for_each_sibling_event(sibling, leader)
		perf_event_update_time(sibling);
}

static void
perf_event_set_state(struct perf_event *event, enum perf_event_state state)
{
	if (event->state == state)
		return;

	perf_event_update_time(event);
	/*
	 * If a group leader gets enabled/disabled all its siblings
	 * are affected too.
	 */
	if ((event->state < 0) ^ (state < 0))
		perf_event_update_sibling_time(event);

	WRITE_ONCE(event->state, state);
}

/*
 * UP store-release, load-acquire
 */

#define __store_release(ptr, val)					\
do {									\
	barrier();							\
	WRITE_ONCE(*(ptr), (val));					\
} while (0)

#define __load_acquire(ptr)						\
({									\
	__unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr));	\
	barrier();							\
	___p;								\
})

static bool perf_skip_pmu_ctx(struct perf_event_pmu_context *pmu_ctx,
			      enum event_type_t event_type)
{
	if ((event_type & EVENT_CGROUP) && !pmu_ctx->nr_cgroups)
		return true;
	if ((event_type & EVENT_GUEST) &&
	    !(pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU))
		return true;
	return false;
}

#define for_each_epc(_epc, _ctx, _pmu, _event_type)			\
	list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \
		if (perf_skip_pmu_ctx(_epc, _event_type))		\
			continue;					\
		else if (_pmu && _epc->pmu != _pmu)			\
			continue;					\
		else

static void perf_ctx_disable(struct perf_event_context *ctx, enum event_type_t event_type)
{
	struct perf_event_pmu_context *pmu_ctx;

	for_each_epc(pmu_ctx, ctx, NULL, event_type)
		perf_pmu_disable(pmu_ctx->pmu);
}

static void perf_ctx_enable(struct perf_event_context *ctx, enum event_type_t event_type)
{
	struct perf_event_pmu_context *pmu_ctx;

	for_each_epc(pmu_ctx, ctx, NULL, event_type)
		perf_pmu_enable(pmu_ctx->pmu);
}

static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu,
			  enum event_type_t event_type);

static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu,
			 enum event_type_t event_type);

static inline void update_perf_time_ctx(struct perf_time_ctx *time, u64 now, bool adv)
{
	if (adv)
		time->time += now - time->stamp;
	time->stamp = now;

	/*
	 * The above: time' = time + (now - timestamp), can be re-arranged
	 * into: time` = now + (time - timestamp), which gives a single value
	 * offset to compute future time without locks on.
	 *
	 * See perf_event_time_now(), which can be used from NMI context where
	 * it's (obviously) not possible to acquire ctx->lock in order to read
	 * both the above values in a consistent manner.
	 */
	WRITE_ONCE(time->offset, time->time - time->stamp);
}
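/*
 * Worked example (editorial): if time = 700 had been accumulated when
 * stamp = 1000 was taken, the store above records offset = 700 - 1000
 * = -300.  A later lock-free reader at now = 1250 computes
 * now + offset = 1250 - 300 = 950, the same value the locked path would
 * produce via time + (now - stamp) = 700 + 250 = 950.
 */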
*/ WRITE_ONCE(time->offset, time->time - time->stamp); } static_assert(offsetof(struct perf_event_context, timeguest) - offsetof(struct perf_event_context, time) == sizeof(struct perf_time_ctx)); #define T_TOTAL 0 #define T_GUEST 1 static inline u64 __perf_event_time_ctx(struct perf_event *event, struct perf_time_ctx *times) { u64 time = times[T_TOTAL].time; if (event->attr.exclude_guest) time -= times[T_GUEST].time; return time; } static inline u64 __perf_event_time_ctx_now(struct perf_event *event, struct perf_time_ctx *times, u64 now) { if (is_guest_mediated_pmu_loaded() && event->attr.exclude_guest) { /* * (now + times[total].offset) - (now + times[guest].offset) := * times[total].offset - times[guest].offset */ return READ_ONCE(times[T_TOTAL].offset) - READ_ONCE(times[T_GUEST].offset); } return now + READ_ONCE(times[T_TOTAL].offset); } #ifdef CONFIG_CGROUP_PERF static inline bool perf_cgroup_match(struct perf_event *event) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); /* @event doesn't care about cgroup */ if (!event->cgrp) return true; /* wants specific cgroup scope but @cpuctx isn't associated with any */ if (!cpuctx->cgrp) return false; /* * Cgroup scoping is recursive. An event enabled for a cgroup is * also enabled for all its descendant cgroups. If @cpuctx's * cgroup is a descendant of @event's (the test covers identity * case), it's a match. */ return cgroup_is_descendant(cpuctx->cgrp->css.cgroup, event->cgrp->css.cgroup); } static inline void perf_detach_cgroup(struct perf_event *event) { css_put(&event->cgrp->css); event->cgrp = NULL; } static inline int is_cgroup_event(struct perf_event *event) { return event->cgrp != NULL; } static_assert(offsetof(struct perf_cgroup_info, timeguest) - offsetof(struct perf_cgroup_info, time) == sizeof(struct perf_time_ctx)); static inline u64 perf_cgroup_event_time(struct perf_event *event) { struct perf_cgroup_info *t; t = per_cpu_ptr(event->cgrp->info, event->cpu); return __perf_event_time_ctx(event, &t->time); } static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { struct perf_cgroup_info *t; t = per_cpu_ptr(event->cgrp->info, event->cpu); if (!__load_acquire(&t->active)) return __perf_event_time_ctx(event, &t->time); return __perf_event_time_ctx_now(event, &t->time, now); } static inline void __update_cgrp_guest_time(struct perf_cgroup_info *info, u64 now, bool adv) { update_perf_time_ctx(&info->timeguest, now, adv); } static inline void update_cgrp_time(struct perf_cgroup_info *info, u64 now) { update_perf_time_ctx(&info->time, now, true); if (is_guest_mediated_pmu_loaded()) __update_cgrp_guest_time(info, now, true); } static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) { struct perf_cgroup *cgrp = cpuctx->cgrp; struct cgroup_subsys_state *css; struct perf_cgroup_info *info; if (cgrp) { u64 now = perf_clock(); for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); update_cgrp_time(info, now); if (final) __store_release(&info->active, 0); } } } static inline void update_cgrp_time_from_event(struct perf_event *event) { struct perf_cgroup_info *info; /* * ensure we access cgroup data only when needed and * when we know the cgroup is pinned (css_get) */ if (!is_cgroup_event(event)) return; info = this_cpu_ptr(event->cgrp->info); /* * Do not update time when cgroup is not active */ if (info->active) update_cgrp_time(info, perf_clock()); } static inline void 
perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest) { struct perf_event_context *ctx = &cpuctx->ctx; struct perf_cgroup *cgrp = cpuctx->cgrp; struct perf_cgroup_info *info; struct cgroup_subsys_state *css; /* * ctx->lock held by caller * ensure we do not access cgroup data * unless we have the cgroup pinned (css_get) */ if (!cgrp) return; WARN_ON_ONCE(!ctx->nr_cgroups); for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); if (guest) { __update_cgrp_guest_time(info, ctx->time.stamp, false); } else { update_perf_time_ctx(&info->time, ctx->time.stamp, false); __store_release(&info->active, 1); } } } /* * reschedule events based on the cgroup constraint of task. */ static void perf_cgroup_switch(struct task_struct *task) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_cgroup *cgrp; /* * cpuctx->cgrp is set when the first cgroup event enabled, * and is cleared when the last cgroup event disabled. */ if (READ_ONCE(cpuctx->cgrp) == NULL) return; cgrp = perf_cgroup_from_task(task, NULL); if (READ_ONCE(cpuctx->cgrp) == cgrp) return; guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx); /* * Re-check, could've raced vs perf_remove_from_context(). */ if (READ_ONCE(cpuctx->cgrp) == NULL) return; WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); perf_ctx_disable(&cpuctx->ctx, EVENT_CGROUP); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); /* * must not be done before ctxswout due * to update_cgrp_time_from_cpuctx() in * ctx_sched_out() */ cpuctx->cgrp = cgrp; /* * set cgrp before ctxsw in to allow * perf_cgroup_set_timestamp() in ctx_sched_in() * to not have to pass task around */ ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); perf_ctx_enable(&cpuctx->ctx, EVENT_CGROUP); } static int perf_cgroup_ensure_storage(struct perf_event *event, struct cgroup_subsys_state *css) { struct perf_cpu_context *cpuctx; struct perf_event **storage; int cpu, heap_size, ret = 0; /* * Allow storage to have sufficient space for an iterator for each * possibly nested cgroup plus an iterator for events with no cgroup. 
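 *
 * For example (hypothetical hierarchy): an event attached to cgroup
 * /sys/fs/cgroup/a/b/c walks the css chain c -> b -> a -> root, so the
 * loop below yields heap_size = 1 + 4 = 5: one slot for each cgroup on
 * that path (including the root) plus the extra slot for events without
 * a cgroup.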
*/ for (heap_size = 1; css; css = css->parent) heap_size++; for_each_possible_cpu(cpu) { cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); if (heap_size <= cpuctx->heap_size) continue; storage = kmalloc_node(heap_size * sizeof(struct perf_event *), GFP_KERNEL, cpu_to_node(cpu)); if (!storage) { ret = -ENOMEM; break; } raw_spin_lock_irq(&cpuctx->ctx.lock); if (cpuctx->heap_size < heap_size) { swap(cpuctx->heap, storage); if (storage == cpuctx->heap_default) storage = NULL; cpuctx->heap_size = heap_size; } raw_spin_unlock_irq(&cpuctx->ctx.lock); kfree(storage); } return ret; } static inline int perf_cgroup_connect(int fd, struct perf_event *event, struct perf_event_attr *attr, struct perf_event *group_leader) { struct perf_cgroup *cgrp; struct cgroup_subsys_state *css; CLASS(fd, f)(fd); int ret = 0; if (fd_empty(f)) return -EBADF; css = css_tryget_online_from_dir(fd_file(f)->f_path.dentry, &perf_event_cgrp_subsys); if (IS_ERR(css)) return PTR_ERR(css); ret = perf_cgroup_ensure_storage(event, css); if (ret) return ret; cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; /* * all events in a group must monitor * the same cgroup because a task belongs * to only one perf cgroup at a time */ if (group_leader && group_leader->cgrp != cgrp) { perf_detach_cgroup(event); ret = -EINVAL; } return ret; } static inline void perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; if (!is_cgroup_event(event)) return; event->pmu_ctx->nr_cgroups++; /* * Because cgroup events are always per-cpu events, * @ctx == &cpuctx->ctx. */ cpuctx = container_of(ctx, struct perf_cpu_context, ctx); if (ctx->nr_cgroups++) return; cpuctx->cgrp = perf_cgroup_from_task(current, ctx); } static inline void perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; if (!is_cgroup_event(event)) return; event->pmu_ctx->nr_cgroups--; /* * Because cgroup events are always per-cpu events, * @ctx == &cpuctx->ctx. 
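 *
 * This mirrors perf_cgroup_event_enable() above: roughly, the first
 * cgroup event on a CPU (ctx->nr_cgroups 0 -> 1) latches cpuctx->cgrp
 * and the last one going away (1 -> 0) clears it again. E.g. for two
 * hypothetical cgroup events A and B:
 *
 *	perf_cgroup_event_enable(A, ctx);	// nr_cgroups 0 -> 1, cgrp set
 *	perf_cgroup_event_enable(B, ctx);	// nr_cgroups 1 -> 2
 *	perf_cgroup_event_disable(B, ctx);	// nr_cgroups 2 -> 1
 *	perf_cgroup_event_disable(A, ctx);	// nr_cgroups 1 -> 0, cgrp cleared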
*/ cpuctx = container_of(ctx, struct perf_cpu_context, ctx); if (--ctx->nr_cgroups) return; cpuctx->cgrp = NULL; } #else /* !CONFIG_CGROUP_PERF */ static inline bool perf_cgroup_match(struct perf_event *event) { return true; } static inline void perf_detach_cgroup(struct perf_event *event) {} static inline int is_cgroup_event(struct perf_event *event) { return 0; } static inline void update_cgrp_time_from_event(struct perf_event *event) { } static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) { } static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, struct perf_event_attr *attr, struct perf_event *group_leader) { return -EINVAL; } static inline void perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest) { } static inline u64 perf_cgroup_event_time(struct perf_event *event) { return 0; } static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { return 0; } static inline void perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { } static inline void perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) { } static void perf_cgroup_switch(struct task_struct *task) { } #endif /* * set default to be dependent on timer tick just * like original code */ #define PERF_CPU_HRTIMER (1000 / HZ) /* * function must be called with interrupts disabled */ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) { struct perf_cpu_pmu_context *cpc; bool rotations; lockdep_assert_irqs_disabled(); cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer); rotations = perf_rotate_context(cpc); raw_spin_lock(&cpc->hrtimer_lock); if (rotations) hrtimer_forward_now(hr, cpc->hrtimer_interval); else cpc->hrtimer_active = 0; raw_spin_unlock(&cpc->hrtimer_lock); return rotations ? 
HRTIMER_RESTART : HRTIMER_NORESTART; } static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu) { struct hrtimer *timer = &cpc->hrtimer; struct pmu *pmu = cpc->epc.pmu; u64 interval; /* * check default is sane, if not set then force to * default interval (1/tick) */ interval = pmu->hrtimer_interval_ms; if (interval < 1) interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); raw_spin_lock_init(&cpc->hrtimer_lock); hrtimer_setup(timer, perf_mux_hrtimer_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); } static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc) { struct hrtimer *timer = &cpc->hrtimer; unsigned long flags; raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags); if (!cpc->hrtimer_active) { cpc->hrtimer_active = 1; hrtimer_forward_now(timer, cpc->hrtimer_interval); hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); } raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags); return 0; } static int perf_mux_hrtimer_restart_ipi(void *arg) { return perf_mux_hrtimer_restart(arg); } static __always_inline struct perf_cpu_pmu_context *this_cpc(struct pmu *pmu) { return *this_cpu_ptr(pmu->cpu_pmu_context); } void perf_pmu_disable(struct pmu *pmu) { int *count = &this_cpc(pmu)->pmu_disable_count; if (!(*count)++) pmu->pmu_disable(pmu); } void perf_pmu_enable(struct pmu *pmu) { int *count = &this_cpc(pmu)->pmu_disable_count; if (!--(*count)) pmu->pmu_enable(pmu); } static void perf_assert_pmu_disabled(struct pmu *pmu) { int *count = &this_cpc(pmu)->pmu_disable_count; WARN_ON_ONCE(*count == 0); } static inline void perf_pmu_read(struct perf_event *event) { if (event->state == PERF_EVENT_STATE_ACTIVE) event->pmu->read(event); } static void get_ctx(struct perf_event_context *ctx) { refcount_inc(&ctx->refcount); } static void free_ctx(struct rcu_head *head) { struct perf_event_context *ctx; ctx = container_of(head, struct perf_event_context, rcu_head); kfree(ctx); } static void put_ctx(struct perf_event_context *ctx) { if (refcount_dec_and_test(&ctx->refcount)) { if (ctx->parent_ctx) put_ctx(ctx->parent_ctx); if (ctx->task && ctx->task != TASK_TOMBSTONE) put_task_struct(ctx->task); call_rcu(&ctx->rcu_head, free_ctx); } else { smp_mb__after_atomic(); /* pairs with wait_var_event() */ if (ctx->task == TASK_TOMBSTONE) wake_up_var(&ctx->refcount); } } /* * Because of perf_event::ctx migration in sys_perf_event_open::move_group and * perf_pmu_migrate_context() we need some magic. * * Those places that change perf_event::ctx will hold both * perf_event_ctx::mutex of the 'old' and 'new' ctx value. * * Lock ordering is by mutex address. There are two other sites where * perf_event_context::mutex nests and those are: * * - perf_event_exit_task_context() [ child , 0 ] * perf_event_exit_event() * put_event() [ parent, 1 ] * * - perf_event_init_context() [ parent, 0 ] * inherit_task_group() * inherit_group() * inherit_event() * perf_event_alloc() * perf_init_event() * perf_try_init_event() [ child , 1 ] * * While it appears there is an obvious deadlock here -- the parent and child * nesting levels are inverted between the two. This is in fact safe because * life-time rules separate them. That is an exiting task cannot fork, and a * spawning task cannot (yet) exit. * * But remember that these are parent<->child context relations, and * migration does not affect children, therefore these two orderings should not * interact. 
* * The change in perf_event::ctx does not affect children (as claimed above) * because the sys_perf_event_open() case will install a new event and break * the ctx parent<->child relation, and perf_pmu_migrate_context() is only * concerned with cpuctx and that doesn't have children. * * The places that change perf_event::ctx will issue: * * perf_remove_from_context(); * synchronize_rcu(); * perf_install_in_context(); * * to affect the change. The remove_from_context() + synchronize_rcu() should * quiesce the event, after which we can install it in the new location. This * means that only external vectors (perf_fops, prctl) can perturb the event * while in transit. Therefore all such accessors should also acquire * perf_event_context::mutex to serialize against this. * * However; because event->ctx can change while we're waiting to acquire * ctx->mutex we must be careful and use the below perf_event_ctx_lock() * function. * * Lock order: * exec_update_lock * task_struct::perf_event_mutex * perf_event_context::mutex * perf_event::child_mutex; * perf_event_context::lock * mmap_lock * perf_event::mmap_mutex * perf_buffer::aux_mutex * perf_addr_filters_head::lock * * cpu_hotplug_lock * pmus_lock * cpuctx->mutex / perf_event_context::mutex */ static struct perf_event_context * perf_event_ctx_lock_nested(struct perf_event *event, int nesting) { struct perf_event_context *ctx; again: rcu_read_lock(); ctx = READ_ONCE(event->ctx); if (!refcount_inc_not_zero(&ctx->refcount)) { rcu_read_unlock(); goto again; } rcu_read_unlock(); mutex_lock_nested(&ctx->mutex, nesting); if (event->ctx != ctx) { mutex_unlock(&ctx->mutex); put_ctx(ctx); goto again; } return ctx; } static inline struct perf_event_context * perf_event_ctx_lock(struct perf_event *event) { return perf_event_ctx_lock_nested(event, 0); } static void perf_event_ctx_unlock(struct perf_event *event, struct perf_event_context *ctx) { mutex_unlock(&ctx->mutex); put_ctx(ctx); } /* * This must be done under the ctx->lock, such as to serialize against * context_equiv(), therefore we cannot call put_ctx() since that might end up * calling scheduler related locks and ctx->lock nests inside those. */ static __must_check struct perf_event_context * unclone_ctx(struct perf_event_context *ctx) { struct perf_event_context *parent_ctx = ctx->parent_ctx; lockdep_assert_held(&ctx->lock); if (parent_ctx) ctx->parent_ctx = NULL; ctx->generation++; return parent_ctx; } static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p, enum pid_type type) { u32 nr; /* * only top level events have the pid namespace they were created in */ if (event->parent) event = event->parent; nr = __task_pid_nr_ns(p, type, event->ns); /* avoid -1 if it is idle thread or runs in another ns */ if (!nr && !pid_alive(p)) nr = -1; return nr; } static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) { return perf_event_pid_type(event, p, PIDTYPE_TGID); } static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) { return perf_event_pid_type(event, p, PIDTYPE_PID); } /* * If we inherit events we want to return the parent event id * to userspace. */ static u64 primary_event_id(struct perf_event *event) { u64 id = event->id; if (event->parent) id = event->parent->id; return id; } /* * Get the perf_event_context for a task and lock it. * * This has to cope with the fact that until it is locked, * the context could get moved to another task. 
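 *
 * A rough usage sketch (see perf_pin_task_context() below for the real
 * caller):
 *
 *	unsigned long flags;
 *	struct perf_event_context *ctx;
 *
 *	ctx = perf_lock_task_context(task, &flags);
 *	if (ctx) {
 *		// ctx->lock held, irqs off, refcount already bumped
 *		raw_spin_unlock_irqrestore(&ctx->lock, flags);
 *	}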
*/ static struct perf_event_context * perf_lock_task_context(struct task_struct *task, unsigned long *flags) { struct perf_event_context *ctx; retry: /* * One of the few rules of preemptible RCU is that one cannot do * rcu_read_unlock() while holding a scheduler (or nested) lock when * part of the read side critical section was irqs-enabled -- see * rcu_read_unlock_special(). * * Since ctx->lock nests under rq->lock we must ensure the entire read * side critical section has interrupts disabled. */ local_irq_save(*flags); rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp); if (ctx) { /* * If this context is a clone of another, it might * get swapped for another underneath us by * perf_event_task_sched_out, though the * rcu_read_lock() protects us from any context * getting freed. Lock the context and check if it * got swapped before we could get the lock, and retry * if so. If we locked the right context, then it * can't get swapped on us any more. */ raw_spin_lock(&ctx->lock); if (ctx != rcu_dereference(task->perf_event_ctxp)) { raw_spin_unlock(&ctx->lock); rcu_read_unlock(); local_irq_restore(*flags); goto retry; } if (ctx->task == TASK_TOMBSTONE || !refcount_inc_not_zero(&ctx->refcount)) { raw_spin_unlock(&ctx->lock); ctx = NULL; } else { WARN_ON_ONCE(ctx->task != task); } } rcu_read_unlock(); if (!ctx) local_irq_restore(*flags); return ctx; } /* * Get the context for a task and increment its pin_count so it * can't get swapped to another task. This also increments its * reference count so that the context can't get freed. */ static struct perf_event_context * perf_pin_task_context(struct task_struct *task) { struct perf_event_context *ctx; unsigned long flags; ctx = perf_lock_task_context(task, &flags); if (ctx) { ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); } return ctx; } static void perf_unpin_context(struct perf_event_context *ctx) { unsigned long flags; raw_spin_lock_irqsave(&ctx->lock, flags); --ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); } /* * Update the record of the current time in a context. 
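 *
 * ctx->time and ctx->timeguest are advanced in lock-step below; events
 * with attr.exclude_guest then see the difference, see
 * __perf_event_time_ctx(). Illustrative numbers: if 100us of context
 * time accumulated, 30us of which while a mediated-vPMU guest was
 * running, an exclude_guest event reads 100 - 30 = 70us of context time.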
*/ static void __update_context_time(struct perf_event_context *ctx, bool adv) { lockdep_assert_held(&ctx->lock); update_perf_time_ctx(&ctx->time, perf_clock(), adv); } static void __update_context_guest_time(struct perf_event_context *ctx, bool adv) { lockdep_assert_held(&ctx->lock); /* must be called after __update_context_time(); */ update_perf_time_ctx(&ctx->timeguest, ctx->time.stamp, adv); } static void update_context_time(struct perf_event_context *ctx) { __update_context_time(ctx, true); if (is_guest_mediated_pmu_loaded()) __update_context_guest_time(ctx, true); } static u64 perf_event_time(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; if (unlikely(!ctx)) return 0; if (is_cgroup_event(event)) return perf_cgroup_event_time(event); return __perf_event_time_ctx(event, &ctx->time); } static u64 perf_event_time_now(struct perf_event *event, u64 now) { struct perf_event_context *ctx = event->ctx; if (unlikely(!ctx)) return 0; if (is_cgroup_event(event)) return perf_cgroup_event_time_now(event, now); if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) return __perf_event_time_ctx(event, &ctx->time); return __perf_event_time_ctx_now(event, &ctx->time, now); } static enum event_type_t get_event_type(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; enum event_type_t event_type; lockdep_assert_held(&ctx->lock); /* * It's 'group type', really, because if our group leader is * pinned, so are we. */ if (event->group_leader != event) event = event->group_leader; event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE; if (!ctx->task) event_type |= EVENT_CPU; return event_type; } /* * Helper function to initialize event group nodes. */ static void init_event_group(struct perf_event *event) { RB_CLEAR_NODE(&event->group_node); event->group_index = 0; } /* * Extract pinned or flexible groups from the context * based on event attrs bits. */ static struct perf_event_groups * get_event_groups(struct perf_event *event, struct perf_event_context *ctx) { if (event->attr.pinned) return &ctx->pinned_groups; else return &ctx->flexible_groups; } /* * Helper function to initializes perf_event_group trees. */ static void perf_event_groups_init(struct perf_event_groups *groups) { groups->tree = RB_ROOT; groups->index = 0; } static inline struct cgroup *event_cgroup(const struct perf_event *event) { struct cgroup *cgroup = NULL; #ifdef CONFIG_CGROUP_PERF if (event->cgrp) cgroup = event->cgrp->css.cgroup; #endif return cgroup; } /* * Compare function for event groups; * * Implements complex key that first sorts by CPU and then by virtual index * which provides ordering when rotating groups for the same CPU. */ static __always_inline int perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu, const struct cgroup *left_cgroup, const u64 left_group_index, const struct perf_event *right) { if (left_cpu < right->cpu) return -1; if (left_cpu > right->cpu) return 1; if (left_pmu) { if (left_pmu < right->pmu_ctx->pmu) return -1; if (left_pmu > right->pmu_ctx->pmu) return 1; } #ifdef CONFIG_CGROUP_PERF { const struct cgroup *right_cgroup = event_cgroup(right); if (left_cgroup != right_cgroup) { if (!left_cgroup) { /* * Left has no cgroup but right does, no * cgroups come first. */ return -1; } if (!right_cgroup) { /* * Right has no cgroup but left does, no * cgroups come first. */ return 1; } /* Two dissimilar cgroups, order by id. 
*/ if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup)) return -1; return 1; } } #endif if (left_group_index < right->group_index) return -1; if (left_group_index > right->group_index) return 1; return 0; } #define __node_2_pe(node) \ rb_entry((node), struct perf_event, group_node) static inline bool __group_less(struct rb_node *a, const struct rb_node *b) { struct perf_event *e = __node_2_pe(a); return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e), e->group_index, __node_2_pe(b)) < 0; } struct __group_key { int cpu; struct pmu *pmu; struct cgroup *cgroup; }; static inline int __group_cmp(const void *key, const struct rb_node *node) { const struct __group_key *a = key; const struct perf_event *b = __node_2_pe(node); /* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */ return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b); } static inline int __group_cmp_ignore_cgroup(const void *key, const struct rb_node *node) { const struct __group_key *a = key; const struct perf_event *b = __node_2_pe(node); /* partial/subtree match: @cpu, @pmu, ignore: @cgroup, @group_index */ return perf_event_groups_cmp(a->cpu, a->pmu, event_cgroup(b), b->group_index, b); } /* * Insert @event into @groups' tree; using * {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index} * as key. This places it last inside the {cpu,pmu,cgroup} subtree. */ static void perf_event_groups_insert(struct perf_event_groups *groups, struct perf_event *event) { event->group_index = ++groups->index; rb_add(&event->group_node, &groups->tree, __group_less); } /* * Helper function to insert event into the pinned or flexible groups. */ static void add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_groups *groups; groups = get_event_groups(event, ctx); perf_event_groups_insert(groups, event); } /* * Delete a group from a tree. */ static void perf_event_groups_delete(struct perf_event_groups *groups, struct perf_event *event) { WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) || RB_EMPTY_ROOT(&groups->tree)); rb_erase(&event->group_node, &groups->tree); init_event_group(event); } /* * Helper function to delete event from its groups. */ static void del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_groups *groups; groups = get_event_groups(event, ctx); perf_event_groups_delete(groups, event); } /* * Get the leftmost event in the {cpu,pmu,cgroup} subtree. */ static struct perf_event * perf_event_groups_first(struct perf_event_groups *groups, int cpu, struct pmu *pmu, struct cgroup *cgrp) { struct __group_key key = { .cpu = cpu, .pmu = pmu, .cgroup = cgrp, }; struct rb_node *node; node = rb_find_first(&key, &groups->tree, __group_cmp); if (node) return __node_2_pe(node); return NULL; } static struct perf_event * perf_event_groups_next(struct perf_event *event, struct pmu *pmu) { struct __group_key key = { .cpu = event->cpu, .pmu = pmu, .cgroup = event_cgroup(event), }; struct rb_node *next; next = rb_next_match(&key, &event->group_node, __group_cmp); if (next) return __node_2_pe(next); return NULL; } #define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) \ for (event = perf_event_groups_first(groups, cpu, pmu, NULL); \ event; event = perf_event_groups_next(event, pmu)) /* * Iterate through the whole groups tree. 
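 *
 * A minimal usage sketch (hypothetical caller, ctx->lock held):
 *
 *	struct perf_event *event;
 *
 *	perf_event_groups_for_each(event, &ctx->pinned_groups)
 *		pr_debug("pinned event %llu on cpu %d\n", event->id, event->cpu);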
*/ #define perf_event_groups_for_each(event, groups) \ for (event = rb_entry_safe(rb_first(&((groups)->tree)), \ typeof(*event), group_node); event; \ event = rb_entry_safe(rb_next(&event->group_node), \ typeof(*event), group_node)) /* * Does the event attribute request inherit with PERF_SAMPLE_READ */ static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr) { return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ); } /* * Add an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { lockdep_assert_held(&ctx->lock); WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); event->attach_state |= PERF_ATTACH_CONTEXT; event->tstamp = perf_event_time(event); /* * If we're a stand alone event or group leader, we go to the context * list, group events are kept attached to the group so that * perf_group_detach can, at all times, locate all siblings. */ if (event->group_leader == event) { event->group_caps = event->event_caps; add_event_to_groups(event, ctx); } list_add_rcu(&event->event_entry, &ctx->event_list); ctx->nr_events++; if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT) ctx->nr_user++; if (event->attr.inherit_stat) ctx->nr_stat++; if (has_inherit_and_sample_read(&event->attr)) local_inc(&ctx->nr_no_switch_fast); if (event->state > PERF_EVENT_STATE_OFF) perf_cgroup_event_enable(event, ctx); ctx->generation++; event->pmu_ctx->nr_events++; } /* * Initialize event state based on the perf_event_attr::disabled. */ static inline void perf_event__state_init(struct perf_event *event) { event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF : PERF_EVENT_STATE_INACTIVE; } static int __perf_event_read_size(u64 read_format, int nr_siblings) { int entry = sizeof(u64); /* value */ int size = 0; int nr = 1; if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) size += sizeof(u64); if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) size += sizeof(u64); if (read_format & PERF_FORMAT_ID) entry += sizeof(u64); if (read_format & PERF_FORMAT_LOST) entry += sizeof(u64); if (read_format & PERF_FORMAT_GROUP) { nr += nr_siblings; size += sizeof(u64); } /* * Since perf_event_validate_size() limits this to 16k and inhibits * adding more siblings, this will never overflow. */ return size + nr * entry; } static void __perf_event_header_size(struct perf_event *event, u64 sample_type) { struct perf_sample_data *data; u16 size = 0; if (sample_type & PERF_SAMPLE_IP) size += sizeof(data->ip); if (sample_type & PERF_SAMPLE_ADDR) size += sizeof(data->addr); if (sample_type & PERF_SAMPLE_PERIOD) size += sizeof(data->period); if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) size += sizeof(data->weight.full); if (sample_type & PERF_SAMPLE_READ) size += event->read_size; if (sample_type & PERF_SAMPLE_DATA_SRC) size += sizeof(data->data_src.val); if (sample_type & PERF_SAMPLE_TRANSACTION) size += sizeof(data->txn); if (sample_type & PERF_SAMPLE_PHYS_ADDR) size += sizeof(data->phys_addr); if (sample_type & PERF_SAMPLE_CGROUP) size += sizeof(data->cgroup); if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) size += sizeof(data->data_page_size); if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) size += sizeof(data->code_page_size); event->header_size = size; } /* * Called at perf_event creation and when events are attached/detached from a * group. 
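 *
 * For example (hypothetical attribute values): a group leader with two
 * siblings and read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID makes
 * __perf_event_read_size() return 8 + 3 * (8 + 8) = 56 bytes: one u64
 * for the member count plus a {value, id} pair for each of the three
 * events.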
*/ static void perf_event__header_size(struct perf_event *event) { event->read_size = __perf_event_read_size(event->attr.read_format, event->group_leader->nr_siblings); __perf_event_header_size(event, event->attr.sample_type); } static void perf_event__id_header_size(struct perf_event *event) { struct perf_sample_data *data; u64 sample_type = event->attr.sample_type; u16 size = 0; if (sample_type & PERF_SAMPLE_TID) size += sizeof(data->tid_entry); if (sample_type & PERF_SAMPLE_TIME) size += sizeof(data->time); if (sample_type & PERF_SAMPLE_IDENTIFIER) size += sizeof(data->id); if (sample_type & PERF_SAMPLE_ID) size += sizeof(data->id); if (sample_type & PERF_SAMPLE_STREAM_ID) size += sizeof(data->stream_id); if (sample_type & PERF_SAMPLE_CPU) size += sizeof(data->cpu_entry); event->id_header_size = size; } /* * Check that adding an event to the group does not result in anybody * overflowing the 64k event limit imposed by the output buffer. * * Specifically, check that the read_size for the event does not exceed 16k, * read_size being the one term that grows with groups size. Since read_size * depends on per-event read_format, also (re)check the existing events. * * This leaves 48k for the constant size fields and things like callchains, * branch stacks and register sets. */ static bool perf_event_validate_size(struct perf_event *event) { struct perf_event *sibling, *group_leader = event->group_leader; if (__perf_event_read_size(event->attr.read_format, group_leader->nr_siblings + 1) > 16*1024) return false; if (__perf_event_read_size(group_leader->attr.read_format, group_leader->nr_siblings + 1) > 16*1024) return false; /* * When creating a new group leader, group_leader->ctx is initialized * after the size has been validated, but we cannot safely use * for_each_sibling_event() until group_leader->ctx is set. A new group * leader cannot have any siblings yet, so we can safely skip checking * the non-existent siblings. */ if (event == group_leader) return true; for_each_sibling_event(sibling, group_leader) { if (__perf_event_read_size(sibling->attr.read_format, group_leader->nr_siblings + 1) > 16*1024) return false; } return true; } static void perf_group_attach(struct perf_event *event) { struct perf_event *group_leader = event->group_leader, *pos; lockdep_assert_held(&event->ctx->lock); /* * We can have double attach due to group movement (move_group) in * perf_event_open(). */ if (event->attach_state & PERF_ATTACH_GROUP) return; event->attach_state |= PERF_ATTACH_GROUP; if (group_leader == event) return; WARN_ON_ONCE(group_leader->ctx != event->ctx); group_leader->group_caps &= event->event_caps; list_add_tail(&event->sibling_list, &group_leader->sibling_list); group_leader->nr_siblings++; group_leader->group_generation++; perf_event__header_size(group_leader); for_each_sibling_event(pos, group_leader) perf_event__header_size(pos); } /* * Remove an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); /* * We can have double detach due to exit/hot-unplug + close. 
*/ if (!(event->attach_state & PERF_ATTACH_CONTEXT)) return; event->attach_state &= ~PERF_ATTACH_CONTEXT; ctx->nr_events--; if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT) ctx->nr_user--; if (event->attr.inherit_stat) ctx->nr_stat--; if (has_inherit_and_sample_read(&event->attr)) local_dec(&ctx->nr_no_switch_fast); list_del_rcu(&event->event_entry); if (event->group_leader == event) del_event_from_groups(event, ctx); ctx->generation++; event->pmu_ctx->nr_events--; } static int perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event) { if (!has_aux(aux_event)) return 0; if (!event->pmu->aux_output_match) return 0; return event->pmu->aux_output_match(aux_event); } static void put_event(struct perf_event *event); static void __event_disable(struct perf_event *event, struct perf_event_context *ctx, enum perf_event_state state); static void perf_put_aux_event(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct perf_event *iter; /* * If event uses aux_event tear down the link */ if (event->aux_event) { iter = event->aux_event; event->aux_event = NULL; put_event(iter); return; } /* * If the event is an aux_event, tear down all links to * it from other events. */ for_each_sibling_event(iter, event) { if (iter->aux_event != event) continue; iter->aux_event = NULL; put_event(event); /* * If it's ACTIVE, schedule it out and put it into ERROR * state so that we don't try to schedule it again. Note * that perf_event_enable() will clear the ERROR status. */ __event_disable(iter, ctx, PERF_EVENT_STATE_ERROR); } } static bool perf_need_aux_event(struct perf_event *event) { return event->attr.aux_output || has_aux_action(event); } static int perf_get_aux_event(struct perf_event *event, struct perf_event *group_leader) { /* * Our group leader must be an aux event if we want to be * an aux_output. This way, the aux event will precede its * aux_output events in the group, and therefore will always * schedule first. */ if (!group_leader) return 0; /* * aux_output and aux_sample_size are mutually exclusive. */ if (event->attr.aux_output && event->attr.aux_sample_size) return 0; if (event->attr.aux_output && !perf_aux_output_match(event, group_leader)) return 0; if ((event->attr.aux_pause || event->attr.aux_resume) && !(group_leader->pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) return 0; if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux) return 0; if (!atomic_long_inc_not_zero(&group_leader->refcount)) return 0; /* * Link aux_outputs to their aux event; this is undone in * perf_group_detach() by perf_put_aux_event(). When the * group in torn down, the aux_output events loose their * link to the aux_event and can't schedule any more. */ event->aux_event = group_leader; return 1; } static inline struct list_head *get_event_list(struct perf_event *event) { return event->attr.pinned ? &event->pmu_ctx->pinned_active : &event->pmu_ctx->flexible_active; } static void perf_group_detach(struct perf_event *event) { struct perf_event *leader = event->group_leader; struct perf_event *sibling, *tmp; struct perf_event_context *ctx = event->ctx; lockdep_assert_held(&ctx->lock); /* * We can have double detach due to exit/hot-unplug + close. */ if (!(event->attach_state & PERF_ATTACH_GROUP)) return; event->attach_state &= ~PERF_ATTACH_GROUP; perf_put_aux_event(event); /* * If this is a sibling, remove it from its group. 
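 *
 * For illustration (hypothetical group): detaching a sibling from a
 * group {L, S1, S2} just unlinks it and leaves {L, S1}; detaching the
 * leader L instead promotes S1 and S2 to singleton events below (or
 * moves them to ERROR if they carry PERF_EV_CAP_SIBLING).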
*/ if (leader != event) { list_del_init(&event->sibling_list); event->group_leader->nr_siblings--; event->group_leader->group_generation++; goto out; } /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them * to whatever list we are on. */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { /* * Events that have PERF_EV_CAP_SIBLING require being part of * a group and cannot exist on their own, schedule them out * and move them into the ERROR state. Also see * _perf_event_enable(), it will not be able to recover this * ERROR state. */ if (sibling->event_caps & PERF_EV_CAP_SIBLING) __event_disable(sibling, ctx, PERF_EVENT_STATE_ERROR); sibling->group_leader = sibling; list_del_init(&sibling->sibling_list); /* Inherit group flags from the previous leader */ sibling->group_caps = event->group_caps; if (sibling->attach_state & PERF_ATTACH_CONTEXT) { add_event_to_groups(sibling, event->ctx); if (sibling->state == PERF_EVENT_STATE_ACTIVE) list_add_tail(&sibling->active_list, get_event_list(sibling)); } WARN_ON_ONCE(sibling->ctx != event->ctx); } out: for_each_sibling_event(tmp, leader) perf_event__header_size(tmp); perf_event__header_size(leader); } static void perf_child_detach(struct perf_event *event) { struct perf_event *parent_event = event->parent; if (!(event->attach_state & PERF_ATTACH_CHILD)) return; event->attach_state &= ~PERF_ATTACH_CHILD; if (WARN_ON_ONCE(!parent_event)) return; /* * Can't check this from an IPI, the holder is likely another CPU. * lockdep_assert_held(&parent_event->child_mutex); */ list_del_init(&event->child_list); } static bool is_orphaned_event(struct perf_event *event) { return event->state == PERF_EVENT_STATE_DEAD; } static inline int event_filter_match(struct perf_event *event) { return (event->cpu == -1 || event->cpu == smp_processor_id()) && perf_cgroup_match(event); } static inline bool is_event_in_freq_mode(struct perf_event *event) { return event->attr.freq && event->attr.sample_freq; } static void event_sched_out(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_pmu_context *epc = event->pmu_ctx; struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); enum perf_event_state state = PERF_EVENT_STATE_INACTIVE; // XXX cpc serialization, probably per-cpu IRQ disabled WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); if (event->state != PERF_EVENT_STATE_ACTIVE) return; /* * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but * we can schedule events _OUT_ individually through things like * __perf_remove_from_context(). 
*/ list_del_init(&event->active_list); perf_pmu_disable(event->pmu); event->pmu->del(event, 0); event->oncpu = -1; if (event->pending_disable) { event->pending_disable = 0; perf_cgroup_event_disable(event, ctx); state = PERF_EVENT_STATE_OFF; } perf_event_set_state(event, state); if (!is_software_event(event)) cpc->active_oncpu--; if (is_event_in_freq_mode(event)) { ctx->nr_freq--; epc->nr_freq--; } if (event->attr.exclusive || !cpc->active_oncpu) cpc->exclusive = 0; perf_pmu_enable(event->pmu); } static void group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event; if (group_event->state != PERF_EVENT_STATE_ACTIVE) return; perf_assert_pmu_disabled(group_event->pmu_ctx->pmu); event_sched_out(group_event, ctx); /* * Schedule out siblings (if any): */ for_each_sibling_event(event, group_event) event_sched_out(event, ctx); } static inline void __ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final, enum event_type_t event_type) { if (ctx->is_active & EVENT_TIME) { if (ctx->is_active & EVENT_FROZEN) return; update_context_time(ctx); /* vPMU should not stop time */ update_cgrp_time_from_cpuctx(cpuctx, !(event_type & EVENT_GUEST) && final); } } static inline void ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { __ctx_time_update(cpuctx, ctx, false, 0); } /* * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock(). */ static inline void ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { ctx_time_update(cpuctx, ctx); if (ctx->is_active & EVENT_TIME) ctx->is_active |= EVENT_FROZEN; } static inline void ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event) { if (ctx->is_active & EVENT_TIME) { if (ctx->is_active & EVENT_FROZEN) return; update_context_time(ctx); update_cgrp_time_from_event(event); } } #define DETACH_GROUP 0x01UL #define DETACH_CHILD 0x02UL #define DETACH_EXIT 0x04UL #define DETACH_REVOKE 0x08UL #define DETACH_DEAD 0x10UL /* * Cross CPU call to remove a performance event * * We disable the event on the hardware level first. After that we * remove it from the context list. */ static void __perf_remove_from_context(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx; enum perf_event_state state = PERF_EVENT_STATE_OFF; unsigned long flags = (unsigned long)info; ctx_time_update(cpuctx, ctx); /* * Ensure event_sched_out() switches to OFF, at the very least * this avoids raising perf_pending_task() at this time. 
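 *
 * Roughly, the detach flags passed in via @info select a target state
 * (the event then ends up at min(current state, target)):
 *
 *	DETACH_EXIT	-> PERF_EVENT_STATE_EXIT
 *	DETACH_REVOKE	-> PERF_EVENT_STATE_REVOKED
 *	DETACH_DEAD	-> PERF_EVENT_STATE_DEAD
 *	(none of them)	-> PERF_EVENT_STATE_OFF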
*/ if (flags & DETACH_EXIT) state = PERF_EVENT_STATE_EXIT; if (flags & DETACH_REVOKE) state = PERF_EVENT_STATE_REVOKED; if (flags & DETACH_DEAD) state = PERF_EVENT_STATE_DEAD; event_sched_out(event, ctx); if (event->state > PERF_EVENT_STATE_OFF) perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, min(event->state, state)); if (flags & DETACH_GROUP) perf_group_detach(event); if (flags & DETACH_CHILD) perf_child_detach(event); list_del_event(event, ctx); if (!pmu_ctx->nr_events) { pmu_ctx->rotate_necessary = 0; if (ctx->task && ctx->is_active) { struct perf_cpu_pmu_context *cpc = this_cpc(pmu_ctx->pmu); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = NULL; } } if (!ctx->nr_events && ctx->is_active) { if (ctx == &cpuctx->ctx) update_cgrp_time_from_cpuctx(cpuctx, true); ctx->is_active = 0; if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); cpuctx->task_ctx = NULL; } } } /* * Remove the event from a task's (or a CPU's) list of events. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This is OK when called from perf_release since * that only calls us on the top-level context, which can't be a clone. * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ static void perf_remove_from_context(struct perf_event *event, unsigned long flags) { struct perf_event_context *ctx = event->ctx; lockdep_assert_held(&ctx->mutex); /* * Because of perf_event_exit_task(), perf_remove_from_context() ought * to work in the face of TASK_TOMBSTONE, unlike every other * event_function_call() user. */ raw_spin_lock_irq(&ctx->lock); if (!ctx->is_active) { __perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context), ctx, (void *)flags); raw_spin_unlock_irq(&ctx->lock); return; } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_remove_from_context, (void *)flags); } static void __event_disable(struct perf_event *event, struct perf_event_context *ctx, enum perf_event_state state) { event_sched_out(event, ctx); perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, state); } /* * Cross CPU call to disable a performance event */ static void __perf_event_disable(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { if (event->state < PERF_EVENT_STATE_INACTIVE) return; perf_pmu_disable(event->pmu_ctx->pmu); ctx_time_update_event(ctx, event); /* * When disabling a group leader, the whole group becomes ineligible * to run, so schedule out the full group. */ if (event == event->group_leader) group_sched_out(event, ctx); /* * But only mark the leader OFF; the siblings will remain * INACTIVE. */ __event_disable(event, ctx, PERF_EVENT_STATE_OFF); perf_pmu_enable(event->pmu_ctx->pmu); } /* * Disable an event. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This condition is satisfied when called through * perf_event_for_each_child or perf_event_for_each because they * hold the top-level event's child_mutex, so any descendant that * goes to exit will block in perf_event_exit_event(). * * When called from perf_pending_disable it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. 
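 *
 * A minimal in-kernel usage sketch (hypothetical caller owning a counter
 * created with perf_event_create_kernel_counter()):
 *
 *	perf_event_disable(event);
 *	// reconfigure or read out the counter while it is quiesced
 *	perf_event_enable(event);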
*/ static void _perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; raw_spin_lock_irq(&ctx->lock); if (event->state <= PERF_EVENT_STATE_OFF) { raw_spin_unlock_irq(&ctx->lock); return; } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_event_disable, NULL); } void perf_event_disable_local(struct perf_event *event) { event_function_local(event, __perf_event_disable, NULL); } /* * Strictly speaking kernel users cannot create groups and therefore this * interface does not need the perf_event_ctx_lock() magic. */ void perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx; ctx = perf_event_ctx_lock(event); _perf_event_disable(event); perf_event_ctx_unlock(event, ctx); } EXPORT_SYMBOL_GPL(perf_event_disable); void perf_event_disable_inatomic(struct perf_event *event) { event->pending_disable = 1; irq_work_queue(&event->pending_disable_irq); } #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_event *event, int enable); static void perf_log_itrace_start(struct perf_event *event); static void perf_event_unthrottle(struct perf_event *event, bool start) { if (event->state != PERF_EVENT_STATE_ACTIVE) return; event->hw.interrupts = 0; if (start) event->pmu->start(event, 0); if (event == event->group_leader) perf_log_throttle(event, 1); } static void perf_event_throttle(struct perf_event *event) { if (event->state != PERF_EVENT_STATE_ACTIVE) return; event->hw.interrupts = MAX_INTERRUPTS; event->pmu->stop(event, 0); if (event == event->group_leader) perf_log_throttle(event, 0); } static void perf_event_unthrottle_group(struct perf_event *event, bool skip_start_event) { struct perf_event *sibling, *leader = event->group_leader; perf_event_unthrottle(leader, skip_start_event ? leader != event : true); for_each_sibling_event(sibling, leader) perf_event_unthrottle(sibling, skip_start_event ? sibling != event : true); } static void perf_event_throttle_group(struct perf_event *event) { struct perf_event *sibling, *leader = event->group_leader; perf_event_throttle(leader); for_each_sibling_event(sibling, leader) perf_event_throttle(sibling); } static int event_sched_in(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_pmu_context *epc = event->pmu_ctx; struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); int ret = 0; WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); if (event->state <= PERF_EVENT_STATE_OFF) return 0; WRITE_ONCE(event->oncpu, smp_processor_id()); /* * Order event::oncpu write to happen before the ACTIVE state is * visible. This allows perf_event_{stop,read}() to observe the correct * ->oncpu if it sees ACTIVE. */ smp_wmb(); perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE); /* * Unthrottle events, since we scheduled we might have missed several * ticks already, also for a heavily scheduling task there is little * guarantee it'll get a tick in a timely manner. 
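 *
 * For illustration: perf_event_throttle() stops the event and marks it
 * with hw.interrupts = MAX_INTERRUPTS; if the task is scheduled out and
 * back in before the unthrottling tick, the check below is what restarts
 * it, e.g.:
 *
 *	perf_event_throttle(event);	// hw.interrupts = MAX_INTERRUPTS
 *	event_sched_out(event, ctx);	// task switched out
 *	event_sched_in(event, ctx);	// -> perf_event_unthrottle(event, false)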
*/ if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) perf_event_unthrottle(event, false); perf_pmu_disable(event->pmu); perf_log_itrace_start(event); if (event->pmu->add(event, PERF_EF_START)) { perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); event->oncpu = -1; ret = -EAGAIN; goto out; } if (!is_software_event(event)) cpc->active_oncpu++; if (is_event_in_freq_mode(event)) { ctx->nr_freq++; epc->nr_freq++; } if (event->attr.exclusive) cpc->exclusive = 1; out: perf_pmu_enable(event->pmu); return ret; } static int group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event, *partial_group = NULL; struct pmu *pmu = group_event->pmu_ctx->pmu; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; pmu->start_txn(pmu, PERF_PMU_TXN_ADD); if (event_sched_in(group_event, ctx)) goto error; /* * Schedule in siblings as one group (if any): */ for_each_sibling_event(event, group_event) { if (event_sched_in(event, ctx)) { partial_group = event; goto group_error; } } if (!pmu->commit_txn(pmu)) return 0; group_error: /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: * The events up to the failed event are scheduled out normally. */ for_each_sibling_event(event, group_event) { if (event == partial_group) break; event_sched_out(event, ctx); } event_sched_out(group_event, ctx); error: pmu->cancel_txn(pmu); return -EAGAIN; } /* * Work out whether we can put this event group on the CPU now. */ static int group_can_go_on(struct perf_event *event, int can_add_hw) { struct perf_event_pmu_context *epc = event->pmu_ctx; struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); /* * Groups consisting entirely of software events can always go on. */ if (event->group_caps & PERF_EV_CAP_SOFTWARE) return 1; /* * If an exclusive group is already on, no other hardware * events can go on. */ if (cpc->exclusive) return 0; /* * If this group is exclusive and there are already * events on the CPU, it can't go on. */ if (event->attr.exclusive && !list_empty(get_event_list(event))) return 0; /* * Otherwise, try to add it if all previous groups were able * to go on. */ return can_add_hw; } static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { list_add_event(event, ctx); perf_group_attach(event); } static void task_ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); if (!cpuctx->task_ctx) return; if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; ctx_sched_out(ctx, pmu, event_type); } static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED | event_type); if (ctx) ctx_sched_in(ctx, pmu, EVENT_PINNED | event_type); ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE | event_type); if (ctx) ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE | event_type); } /* * We want to maintain the following priority of scheduling: * - CPU pinned (EVENT_CPU | EVENT_PINNED) * - task pinned (EVENT_PINNED) * - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE) * - task flexible (EVENT_FLEXIBLE). * * In order to avoid unscheduling and scheduling back in everything every * time an event is added, only do it for the groups of equal priority and * below. * * This can be called after a batch operation on task events, in which case * event_type is a bit mask of the types of events involved. 
For CPU events, * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE. */ static void ctx_resched(struct perf_cpu_context *cpuctx, struct perf_event_context *task_ctx, struct pmu *pmu, enum event_type_t event_type) { bool cpu_event = !!(event_type & EVENT_CPU); struct perf_event_pmu_context *epc; /* * If pinned groups are involved, flexible groups also need to be * scheduled out. */ if (event_type & EVENT_PINNED) event_type |= EVENT_FLEXIBLE; event_type &= EVENT_ALL; for_each_epc(epc, &cpuctx->ctx, pmu, 0) perf_pmu_disable(epc->pmu); if (task_ctx) { for_each_epc(epc, task_ctx, pmu, 0) perf_pmu_disable(epc->pmu); task_ctx_sched_out(task_ctx, pmu, event_type); } /* * Decide which cpu ctx groups to schedule out based on the types * of events that caused rescheduling: * - EVENT_CPU: schedule out corresponding groups; * - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups; * - otherwise, do nothing more. */ if (cpu_event) ctx_sched_out(&cpuctx->ctx, pmu, event_type); else if (event_type & EVENT_PINNED) ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, task_ctx, pmu, 0); for_each_epc(epc, &cpuctx->ctx, pmu, 0) perf_pmu_enable(epc->pmu); if (task_ctx) { for_each_epc(epc, task_ctx, pmu, 0) perf_pmu_enable(epc->pmu); } } void perf_pmu_resched(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; perf_ctx_lock(cpuctx, task_ctx); ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU); perf_ctx_unlock(cpuctx, task_ctx); } /* * Cross CPU call to install and enable a performance event * * Very similar to remote_function() + event_function() but cannot assume that * things like ctx->is_active and cpuctx->task_ctx are set. */ static int __perf_install_in_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; bool reprogram = true; int ret = 0; raw_spin_lock(&cpuctx->ctx.lock); if (ctx->task) { raw_spin_lock(&ctx->lock); task_ctx = ctx; reprogram = (ctx->task == current); /* * If the task is running, it must be running on this CPU, * otherwise we cannot reprogram things. * * If its not running, we don't care, ctx->lock will * serialize against it becoming runnable. */ if (task_curr(ctx->task) && !reprogram) { ret = -ESRCH; goto unlock; } WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx); } else if (task_ctx) { raw_spin_lock(&task_ctx->lock); } #ifdef CONFIG_CGROUP_PERF if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) { /* * If the current cgroup doesn't match the event's * cgroup, we should not try to schedule it. */ struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); reprogram = cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup); } #endif if (reprogram) { ctx_time_freeze(cpuctx, ctx); add_event_to_ctx(event, ctx); ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event)); } else { add_event_to_ctx(event, ctx); } unlock: perf_ctx_unlock(cpuctx, task_ctx); return ret; } static bool exclusive_event_installable(struct perf_event *event, struct perf_event_context *ctx); /* * Attach a performance event to a context. * * Very similar to event_function_call, see comment there. 
*/ static void perf_install_in_context(struct perf_event_context *ctx, struct perf_event *event, int cpu) { struct task_struct *task = READ_ONCE(ctx->task); lockdep_assert_held(&ctx->mutex); WARN_ON_ONCE(!exclusive_event_installable(event, ctx)); if (event->cpu != -1) WARN_ON_ONCE(event->cpu != cpu); /* * Ensures that if we can observe event->ctx, both the event and ctx * will be 'complete'. See perf_iterate_sb_cpu(). */ smp_store_release(&event->ctx, ctx); /* * perf_event_attr::disabled events will not run and can be initialized * without IPI. Except when this is the first event for the context, in * that case we need the magic of the IPI to set ctx->is_active. * * The IOC_ENABLE that is sure to follow the creation of a disabled * event will issue the IPI and reprogram the hardware. */ if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events && !is_cgroup_event(event)) { raw_spin_lock_irq(&ctx->lock); if (ctx->task == TASK_TOMBSTONE) { raw_spin_unlock_irq(&ctx->lock); return; } add_event_to_ctx(event, ctx); raw_spin_unlock_irq(&ctx->lock); return; } if (!task) { cpu_function_call(cpu, __perf_install_in_context, event); return; } /* * Should not happen, we validate the ctx is still alive before calling. */ if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) return; /* * Installing events is tricky because we cannot rely on ctx->is_active * to be set in case this is the nr_events 0 -> 1 transition. * * Instead we use task_curr(), which tells us if the task is running. * However, since we use task_curr() outside of rq::lock, we can race * against the actual state. This means the result can be wrong. * * If we get a false positive, we retry, this is harmless. * * If we get a false negative, things are complicated. If we are after * perf_event_context_sched_in() ctx::lock will serialize us, and the * value must be correct. If we're before, it doesn't matter since * perf_event_context_sched_in() will program the counter. * * However, this hinges on the remote context switch having observed * our task->perf_event_ctxp[] store, such that it will in fact take * ctx::lock in perf_event_context_sched_in(). * * We do this by task_function_call(), if the IPI fails to hit the task * we know any future context switch of task must see the * perf_event_ctpx[] store. */ /* * This smp_mb() orders the task->perf_event_ctxp[] store with the * task_cpu() load, such that if the IPI then does not find the task * running, a future context switch of that task must observe the * store. */ smp_mb(); again: if (!task_function_call(task, __perf_install_in_context, event)) return; raw_spin_lock_irq(&ctx->lock); task = ctx->task; if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) { /* * Cannot happen because we already checked above (which also * cannot happen), and we hold ctx->mutex, which serializes us * against perf_event_exit_task_context(). */ raw_spin_unlock_irq(&ctx->lock); return; } /* * If the task is not running, ctx->lock will avoid it becoming so, * thus we can safely install the event. 
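 *
 * Illustrative interleaving of the retry loop above (CPU0 installs, CPU1
 * runs the target task; details such as intermediate error codes elided):
 *
 *	CPU0					CPU1
 *	task_function_call() -> IPI misses	task migrates / starts running
 *	raw_spin_lock_irq(&ctx->lock)
 *	task_curr(task) == true -> retry	...
 *	task_function_call() -> IPI hits	__perf_install_in_context()
 *
 * Eventually either the IPI lands on the running task, or the task is
 * seen !task_curr() under ctx->lock and the event is added directly.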
*/ if (task_curr(task)) { raw_spin_unlock_irq(&ctx->lock); goto again; } add_event_to_ctx(event, ctx); raw_spin_unlock_irq(&ctx->lock); } /* * Cross CPU call to enable a performance event */ static void __perf_event_enable(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { struct perf_event *leader = event->group_leader; struct perf_event_context *task_ctx; if (event->state >= PERF_EVENT_STATE_INACTIVE || event->state <= PERF_EVENT_STATE_ERROR) return; ctx_time_freeze(cpuctx, ctx); perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); perf_cgroup_event_enable(event, ctx); if (!ctx->is_active) return; if (!event_filter_match(event)) return; /* * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. */ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) return; task_ctx = cpuctx->task_ctx; if (ctx->task) WARN_ON_ONCE(task_ctx != ctx); ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event)); } /* * Enable an event. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This condition is satisfied when called through * perf_event_for_each_child or perf_event_for_each as described * for perf_event_disable. */ static void _perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; raw_spin_lock_irq(&ctx->lock); if (event->state >= PERF_EVENT_STATE_INACTIVE || event->state < PERF_EVENT_STATE_ERROR) { out: raw_spin_unlock_irq(&ctx->lock); return; } /* * If the event is in error state, clear that first. * * That way, if we see the event in error state below, we know that it * has gone back into error state, as distinct from the task having * been scheduled away before the cross-call arrived. */ if (event->state == PERF_EVENT_STATE_ERROR) { /* * Detached SIBLING events cannot leave ERROR state. */ if (event->event_caps & PERF_EV_CAP_SIBLING && event->group_leader == event) goto out; event->state = PERF_EVENT_STATE_OFF; } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_event_enable, NULL); } /* * See perf_event_disable(); */ void perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx; ctx = perf_event_ctx_lock(event); _perf_event_enable(event); perf_event_ctx_unlock(event, ctx); } EXPORT_SYMBOL_GPL(perf_event_enable); struct stop_event_data { struct perf_event *event; unsigned int restart; }; static int __perf_event_stop(void *info) { struct stop_event_data *sd = info; struct perf_event *event = sd->event; /* if it's already INACTIVE, do nothing */ if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) return 0; /* matches smp_wmb() in event_sched_in() */ smp_rmb(); /* * There is a window with interrupts enabled before we get here, * so we need to check again lest we try to stop another CPU's event. */ if (READ_ONCE(event->oncpu) != smp_processor_id()) return -EAGAIN; event->pmu->stop(event, PERF_EF_UPDATE); /* * May race with the actual stop (through perf_pmu_output_stop()), * but it is only used for events with AUX ring buffer, and such * events will refuse to restart because of rb::aux_mmap_count==0, * see comments in perf_aux_output_begin(). * * Since this is happening on an event-local CPU, no trace is lost * while restarting. 
*/ if (sd->restart) event->pmu->start(event, 0); return 0; } static int perf_event_stop(struct perf_event *event, int restart) { struct stop_event_data sd = { .event = event, .restart = restart, }; int ret = 0; do { if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) return 0; /* matches smp_wmb() in event_sched_in() */ smp_rmb(); /* * We only want to restart ACTIVE events, so if the event goes * inactive here (event->oncpu==-1), there's nothing more to do; * fall through with ret==-ENXIO. */ ret = cpu_function_call(READ_ONCE(event->oncpu), __perf_event_stop, &sd); } while (ret == -EAGAIN); return ret; } /* * In order to contain the amount of racy and tricky in the address filter * configuration management, it is a two part process: * * (p1) when userspace mappings change as a result of (1) or (2) or (3) below, * we update the addresses of corresponding vmas in * event::addr_filter_ranges array and bump the event::addr_filters_gen; * (p2) when an event is scheduled in (pmu::add), it calls * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync() * if the generation has changed since the previous call. * * If (p1) happens while the event is active, we restart it to force (p2). * * (1) perf_addr_filters_apply(): adjusting filters' offsets based on * pre-existing mappings, called once when new filters arrive via SET_FILTER * ioctl; * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly * registered mapping, called for every new mmap(), with mm::mmap_lock down * for reading; * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process * of exec. */ void perf_event_addr_filters_sync(struct perf_event *event) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); if (!has_addr_filter(event)) return; raw_spin_lock(&ifh->lock); if (event->addr_filters_gen != event->hw.addr_filters_gen) { event->pmu->addr_filters_sync(event); event->hw.addr_filters_gen = event->addr_filters_gen; } raw_spin_unlock(&ifh->lock); } EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync); static int _perf_event_refresh(struct perf_event *event, int refresh) { /* * not supported on inherited events */ if (event->attr.inherit || !is_sampling_event(event)) return -EINVAL; atomic_add(refresh, &event->event_limit); _perf_event_enable(event); return 0; } /* * See perf_event_disable() */ int perf_event_refresh(struct perf_event *event, int refresh) { struct perf_event_context *ctx; int ret; ctx = perf_event_ctx_lock(event); ret = _perf_event_refresh(event, refresh); perf_event_ctx_unlock(event, ctx); return ret; } EXPORT_SYMBOL_GPL(perf_event_refresh); static int perf_event_modify_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { int err; _perf_event_disable(bp); err = modify_user_hw_breakpoint_check(bp, attr, true); if (!bp->attr.disabled) _perf_event_enable(bp); return err; } /* * Copy event-type-independent attributes that may be modified. */ static void perf_event_modify_copy_attr(struct perf_event_attr *to, const struct perf_event_attr *from) { to->sig_data = from->sig_data; } static int perf_event_modify_attr(struct perf_event *event, struct perf_event_attr *attr) { int (*func)(struct perf_event *, struct perf_event_attr *); struct perf_event *child; int err; if (event->attr.type != attr->type) return -EINVAL; switch (event->attr.type) { case PERF_TYPE_BREAKPOINT: func = perf_event_modify_breakpoint; break; default: /* Place holder for future additions. 
*/ return -EOPNOTSUPP; } WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->child_mutex); /* * Event-type-independent attributes must be copied before event-type * modification, which will validate that final attributes match the * source attributes after all relevant attributes have been copied. */ perf_event_modify_copy_attr(&event->attr, attr); err = func(event, attr); if (err) goto out; list_for_each_entry(child, &event->child_list, child_list) { perf_event_modify_copy_attr(&child->attr, attr); err = func(child, attr); if (err) goto out; } out: mutex_unlock(&event->child_mutex); return err; } static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx, enum event_type_t event_type) { struct perf_event_context *ctx = pmu_ctx->ctx; struct perf_event *event, *tmp; struct pmu *pmu = pmu_ctx->pmu; if (ctx->task && !(ctx->is_active & EVENT_ALL)) { struct perf_cpu_pmu_context *cpc = this_cpc(pmu); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = NULL; } if (!(event_type & EVENT_ALL)) return; perf_pmu_disable(pmu); if (event_type & EVENT_PINNED) { list_for_each_entry_safe(event, tmp, &pmu_ctx->pinned_active, active_list) group_sched_out(event, ctx); } if (event_type & EVENT_FLEXIBLE) { list_for_each_entry_safe(event, tmp, &pmu_ctx->flexible_active, active_list) group_sched_out(event, ctx); /* * Since we cleared EVENT_FLEXIBLE, also clear * rotate_necessary, is will be reset by * ctx_flexible_sched_in() when needed. */ pmu_ctx->rotate_necessary = 0; } perf_pmu_enable(pmu); } /* * Be very careful with the @pmu argument since this will change ctx state. * The @pmu argument works for ctx_resched(), because that is symmetric in * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant. * * However, if you were to be asymmetrical, you could end up with messed up * state, eg. ctx->is_active cleared even though most EPCs would still actually * be active. */ static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); enum event_type_t active_type = event_type & ~EVENT_FLAGS; struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; lockdep_assert_held(&ctx->lock); if (likely(!ctx->nr_events)) { /* * See __perf_remove_from_context(). */ WARN_ON_ONCE(ctx->is_active); if (ctx->task) WARN_ON_ONCE(cpuctx->task_ctx); return; } /* * Always update time if it was set; not only when it changes. * Otherwise we can 'forget' to update time for any but the last * context we sched out. For example: * * ctx_sched_out(.event_type = EVENT_FLEXIBLE) * ctx_sched_out(.event_type = EVENT_PINNED) * * would only update time for the pinned events. */ __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx, event_type); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() */ barrier(); ctx->is_active &= ~active_type; if (!(ctx->is_active & EVENT_ALL)) { /* * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now() * does not observe a hole. perf_ctx_unlock() will clean up. */ if (ctx->is_active & EVENT_FROZEN) ctx->is_active &= EVENT_TIME_FROZEN; else ctx->is_active = 0; } if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); if (!(ctx->is_active & EVENT_ALL)) cpuctx->task_ctx = NULL; } if (event_type & EVENT_GUEST) { /* * Schedule out all exclude_guest events of PMU * with PERF_PMU_CAP_MEDIATED_VPMU. 
*/ is_active = EVENT_ALL; __update_context_guest_time(ctx, false); perf_cgroup_set_timestamp(cpuctx, true); barrier(); } else { is_active ^= ctx->is_active; /* changed bits */ } for_each_epc(pmu_ctx, ctx, pmu, event_type) __pmu_ctx_sched_out(pmu_ctx, is_active); } /* * Test whether two contexts are equivalent, i.e. whether they have both been * cloned from the same version of the same context. * * Equivalence is measured using a generation number in the context that is * incremented on each modification to it; see unclone_ctx(), list_add_event() * and list_del_event(). */ static int context_equiv(struct perf_event_context *ctx1, struct perf_event_context *ctx2) { lockdep_assert_held(&ctx1->lock); lockdep_assert_held(&ctx2->lock); /* Pinning disables the swap optimization */ if (ctx1->pin_count || ctx2->pin_count) return 0; /* If ctx1 is the parent of ctx2 */ if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen) return 1; /* If ctx2 is the parent of ctx1 */ if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation) return 1; /* * If ctx1 and ctx2 have the same parent; we flatten the parent * hierarchy, see perf_event_init_context(). */ if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx && ctx1->parent_gen == ctx2->parent_gen) return 1; /* Unmatched */ return 0; } static void __perf_event_sync_stat(struct perf_event *event, struct perf_event *next_event) { u64 value; if (!event->attr.inherit_stat) return; /* * Update the event value, we cannot use perf_event_read() * because we're in the middle of a context switch and have IRQs * disabled, which upsets smp_call_function_single(), however * we know the event must be on the current CPU, therefore we * don't need to use it. */ perf_pmu_read(event); perf_event_update_time(event); /* * In order to keep per-task stats reliable we need to flip the event * values when we flip the contexts. */ value = local64_read(&next_event->count); value = local64_xchg(&event->count, value); local64_set(&next_event->count, value); swap(event->total_time_enabled, next_event->total_time_enabled); swap(event->total_time_running, next_event->total_time_running); /* * Since we swizzled the values, update the user visible data too. 
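 *
 * Editorial note (not in the original source): both userpages need the
 * refresh because either event may be mmap()ed independently by userspace;
 * after the swap above, a reader of either mapping would otherwise still
 * see the pre-swap totals.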
*/ perf_event_update_userpage(event); perf_event_update_userpage(next_event); } static void perf_event_sync_stat(struct perf_event_context *ctx, struct perf_event_context *next_ctx) { struct perf_event *event, *next_event; if (!ctx->nr_stat) return; update_context_time(ctx); event = list_first_entry(&ctx->event_list, struct perf_event, event_entry); next_event = list_first_entry(&next_ctx->event_list, struct perf_event, event_entry); while (&event->event_entry != &ctx->event_list && &next_event->event_entry != &next_ctx->event_list) { __perf_event_sync_stat(event, next_event); event = list_next_entry(event, event_entry); next_event = list_next_entry(next_event, event_entry); } } static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, struct task_struct *task, bool sched_in) { struct perf_event_pmu_context *pmu_ctx; struct perf_cpu_pmu_context *cpc; list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { cpc = this_cpc(pmu_ctx->pmu); if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task) pmu_ctx->pmu->sched_task(pmu_ctx, task, sched_in); } } static void perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) { struct perf_event_context *ctx = task->perf_event_ctxp; struct perf_event_context *next_ctx; struct perf_event_context *parent, *next_parent; int do_switch = 1; if (likely(!ctx)) return; rcu_read_lock(); next_ctx = rcu_dereference(next->perf_event_ctxp); if (!next_ctx) goto unlock; parent = rcu_dereference(ctx->parent_ctx); next_parent = rcu_dereference(next_ctx->parent_ctx); /* If neither context have a parent context; they cannot be clones. */ if (!parent && !next_parent) goto unlock; if (next_parent == ctx || next_ctx == parent || next_parent == parent) { /* * Looks like the two contexts are clones, so we might be * able to optimize the context switch. We lock both * contexts and check that they are clones under the * lock (including re-checking that neither has been * uncloned in the meantime). It doesn't matter which * order we take the locks because no other cpu could * be trying to lock both of these tasks. */ raw_spin_lock(&ctx->lock); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { perf_ctx_disable(ctx, 0); /* PMIs are disabled; ctx->nr_no_switch_fast is stable. */ if (local_read(&ctx->nr_no_switch_fast) || local_read(&next_ctx->nr_no_switch_fast)) { /* * Must not swap out ctx when there's pending * events that rely on the ctx->task relation. * * Likewise, when a context contains inherit + * SAMPLE_READ events they should be switched * out using the slow path so that they are * treated as if they were distinct contexts. */ raw_spin_unlock(&next_ctx->lock); rcu_read_unlock(); goto inside_switch; } WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); perf_ctx_sched_task_cb(ctx, task, false); perf_ctx_enable(ctx, 0); /* * RCU_INIT_POINTER here is safe because we've not * modified the ctx and the above modification of * ctx->task is immaterial since this value is * always verified under ctx->lock which we're now * holding. 
*/ RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx); RCU_INIT_POINTER(next->perf_event_ctxp, ctx); do_switch = 0; perf_event_sync_stat(ctx, next_ctx); } raw_spin_unlock(&next_ctx->lock); raw_spin_unlock(&ctx->lock); } unlock: rcu_read_unlock(); if (do_switch) { raw_spin_lock(&ctx->lock); perf_ctx_disable(ctx, 0); inside_switch: perf_ctx_sched_task_cb(ctx, task, false); task_ctx_sched_out(ctx, NULL, EVENT_ALL); perf_ctx_enable(ctx, 0); raw_spin_unlock(&ctx->lock); } } static DEFINE_PER_CPU(struct list_head, sched_cb_list); static DEFINE_PER_CPU(int, perf_sched_cb_usages); void perf_sched_cb_dec(struct pmu *pmu) { struct perf_cpu_pmu_context *cpc = this_cpc(pmu); this_cpu_dec(perf_sched_cb_usages); barrier(); if (!--cpc->sched_cb_usage) list_del(&cpc->sched_cb_entry); } void perf_sched_cb_inc(struct pmu *pmu) { struct perf_cpu_pmu_context *cpc = this_cpc(pmu); if (!cpc->sched_cb_usage++) list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); barrier(); this_cpu_inc(perf_sched_cb_usages); } /* * This function provides the context switch callback to the lower code * layer. It is invoked ONLY when the context switch callback is enabled. * * This callback is relevant even to per-cpu events; for example multi event * PEBS requires this to provide PID/TID information. This requires we flush * all queued PEBS records before we context switch to a new task. */ static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, struct task_struct *task, bool sched_in) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct pmu *pmu; pmu = cpc->epc.pmu; /* software PMUs will not have sched_task */ if (WARN_ON_ONCE(!pmu->sched_task)) return; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(pmu); pmu->sched_task(cpc->task_epc, task, sched_in); perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } static void perf_pmu_sched_task(struct task_struct *prev, struct task_struct *next, bool sched_in) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_cpu_pmu_context *cpc; /* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */ if (prev == next || cpuctx->task_ctx) return; list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry) __perf_pmu_sched_task(cpc, sched_in ? next : prev, sched_in); } static void perf_event_switch(struct task_struct *task, struct task_struct *next_prev, bool sched_in); /* * Called from scheduler to remove the events of the current task, * with interrupts disabled. * * We stop each event and update the event value in event->count. * * This does not protect us against NMI, but disable() * sets the disabled bit in the control field of event _before_ * accessing the event control register. If a NMI hits, then it will * not restart the event. */ void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) { if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(task, next, false); if (atomic_read(&nr_switch_events)) perf_event_switch(task, next, false); perf_event_context_sched_out(task, next); /* * if cgroup events exist on this CPU, then we need * to check if we have to switch out PMU state. 
* cgroup event are system-wide mode only */ perf_cgroup_switch(next); } static bool perf_less_group_idx(const void *l, const void *r, void __always_unused *args) { const struct perf_event *le = *(const struct perf_event **)l; const struct perf_event *re = *(const struct perf_event **)r; return le->group_index < re->group_index; } DEFINE_MIN_HEAP(struct perf_event *, perf_event_min_heap); static const struct min_heap_callbacks perf_min_heap = { .less = perf_less_group_idx, .swp = NULL, }; static void __heap_add(struct perf_event_min_heap *heap, struct perf_event *event) { struct perf_event **itrs = heap->data; if (event) { itrs[heap->nr] = event; heap->nr++; } } static void __link_epc(struct perf_event_pmu_context *pmu_ctx) { struct perf_cpu_pmu_context *cpc; if (!pmu_ctx->ctx->task) return; cpc = this_cpc(pmu_ctx->pmu); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = pmu_ctx; } static noinline int visit_groups_merge(struct perf_event_context *ctx, struct perf_event_groups *groups, int cpu, struct pmu *pmu, int (*func)(struct perf_event *, void *), void *data) { #ifdef CONFIG_CGROUP_PERF struct cgroup_subsys_state *css = NULL; #endif struct perf_cpu_context *cpuctx = NULL; /* Space for per CPU and/or any CPU event iterators. */ struct perf_event *itrs[2]; struct perf_event_min_heap event_heap; struct perf_event **evt; int ret; if (pmu->filter && pmu->filter(pmu, cpu)) return 0; if (!ctx->task) { cpuctx = this_cpu_ptr(&perf_cpu_context); event_heap = (struct perf_event_min_heap){ .data = cpuctx->heap, .nr = 0, .size = cpuctx->heap_size, }; lockdep_assert_held(&cpuctx->ctx.lock); #ifdef CONFIG_CGROUP_PERF if (cpuctx->cgrp) css = &cpuctx->cgrp->css; #endif } else { event_heap = (struct perf_event_min_heap){ .data = itrs, .nr = 0, .size = ARRAY_SIZE(itrs), }; /* Events not within a CPU context may be on any CPU. */ __heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL)); } evt = event_heap.data; __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL)); #ifdef CONFIG_CGROUP_PERF for (; css; css = css->parent) __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup)); #endif if (event_heap.nr) { __link_epc((*evt)->pmu_ctx); perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu); } min_heapify_all_inline(&event_heap, &perf_min_heap, NULL); while (event_heap.nr) { ret = func(*evt, data); if (ret) return ret; *evt = perf_event_groups_next(*evt, pmu); if (*evt) min_heap_sift_down_inline(&event_heap, 0, &perf_min_heap, NULL); else min_heap_pop_inline(&event_heap, &perf_min_heap, NULL); } return 0; } /* * Because the userpage is strictly per-event (there is no concept of context, * so there cannot be a context indirection), every userpage must be updated * when context time starts :-( * * IOW, we must not miss EVENT_TIME edges. 
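 */

/*
 * Editorial illustration, not part of the original source: the idea behind
 * visit_groups_merge() above, reduced from a min-heap over several iterators
 * to a plain two-way merge. Each iterator walks one sorted set of groups in
 * ascending group_index order, and the smaller head is always visited first
 * so the combined visit order stays sorted. The function name is
 * hypothetical; perf_event_groups_next() is the real iterator used above.
 */
static int example_two_way_merge(struct perf_event *a, struct perf_event *b,
				 int (*func)(struct perf_event *, void *),
				 void *data)
{
	int ret;

	while (a || b) {
		struct perf_event **evt;

		/* pick the iterator whose head has the smaller group_index */
		if (!b || (a && a->group_index < b->group_index))
			evt = &a;
		else
			evt = &b;

		ret = func(*evt, data);
		if (ret)
			return ret;

		/* advance the chosen iterator within its own PMU's groups */
		*evt = perf_event_groups_next(*evt, (*evt)->pmu_ctx->pmu);
	}

	return 0;
}

/*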
*/ static inline bool event_update_userpage(struct perf_event *event) { if (likely(!refcount_read(&event->mmap_count))) return false; perf_event_update_time(event); perf_event_update_userpage(event); return true; } static inline void group_update_userpage(struct perf_event *group_event) { struct perf_event *event; if (!event_update_userpage(group_event)) return; for_each_sibling_event(event, group_event) event_update_userpage(event); } struct merge_sched_data { int can_add_hw; enum event_type_t event_type; }; static int merge_sched_in(struct perf_event *event, void *data) { struct perf_event_context *ctx = event->ctx; struct merge_sched_data *msd = data; if (event->state <= PERF_EVENT_STATE_OFF) return 0; if (!event_filter_match(event)) return 0; /* * Don't schedule in any host events from PMU with * PERF_PMU_CAP_MEDIATED_VPMU, while a guest is running. */ if (is_guest_mediated_pmu_loaded() && event->pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU && !(msd->event_type & EVENT_GUEST)) return 0; if (group_can_go_on(event, msd->can_add_hw)) { if (!group_sched_in(event, ctx)) list_add_tail(&event->active_list, get_event_list(event)); } if (event->state == PERF_EVENT_STATE_INACTIVE) { msd->can_add_hw = 0; if (event->attr.pinned) { perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); if (*perf_event_fasync(event)) event->pending_kill = POLL_ERR; event->pending_wakeup = 1; irq_work_queue(&event->pending_irq); } else { struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu); event->pmu_ctx->rotate_necessary = 1; perf_mux_hrtimer_restart(cpc); group_update_userpage(event); } } return 0; } static void pmu_groups_sched_in(struct perf_event_context *ctx, struct perf_event_groups *groups, struct pmu *pmu, enum event_type_t event_type) { struct merge_sched_data msd = { .can_add_hw = 1, .event_type = event_type, }; visit_groups_merge(ctx, groups, smp_processor_id(), pmu, merge_sched_in, &msd); } static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx, enum event_type_t event_type) { struct perf_event_context *ctx = pmu_ctx->ctx; if (event_type & EVENT_PINNED) pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu, event_type); if (event_type & EVENT_FLEXIBLE) pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu, event_type); } static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); enum event_type_t active_type = event_type & ~EVENT_FLAGS; struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; lockdep_assert_held(&ctx->lock); if (likely(!ctx->nr_events)) return; if (!(is_active & EVENT_TIME)) { /* EVENT_TIME should be active while the guest runs */ WARN_ON_ONCE(event_type & EVENT_GUEST); /* start ctx time */ __update_context_time(ctx, false); perf_cgroup_set_timestamp(cpuctx, false); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() */ barrier(); } ctx->is_active |= active_type | EVENT_TIME; if (ctx->task) { if (!(is_active & EVENT_ALL)) cpuctx->task_ctx = ctx; else WARN_ON_ONCE(cpuctx->task_ctx != ctx); } if (event_type & EVENT_GUEST) { /* * Schedule in the required exclude_guest events of PMU * with PERF_PMU_CAP_MEDIATED_VPMU. */ is_active = event_type & EVENT_ALL; /* * Update ctx time to set the new start time for * the exclude_guest events. 
*/ update_context_time(ctx); update_cgrp_time_from_cpuctx(cpuctx, false); barrier(); } else { is_active ^= ctx->is_active; /* changed bits */ } /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ if (is_active & EVENT_PINNED) { for_each_epc(pmu_ctx, ctx, pmu, event_type) __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED | (event_type & EVENT_GUEST)); } /* Then walk through the lower prio flexible groups */ if (is_active & EVENT_FLEXIBLE) { for_each_epc(pmu_ctx, ctx, pmu, event_type) __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE | (event_type & EVENT_GUEST)); } } static void perf_event_context_sched_in(struct task_struct *task) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *ctx; rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp); if (!ctx) goto rcu_unlock; if (cpuctx->task_ctx == ctx) { perf_ctx_lock(cpuctx, ctx); perf_ctx_disable(ctx, 0); perf_ctx_sched_task_cb(ctx, task, true); perf_ctx_enable(ctx, 0); perf_ctx_unlock(cpuctx, ctx); goto rcu_unlock; } perf_ctx_lock(cpuctx, ctx); /* * We must check ctx->nr_events while holding ctx->lock, such * that we serialize against perf_install_in_context(). */ if (!ctx->nr_events) goto unlock; perf_ctx_disable(ctx, 0); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, * cpu flexible, task flexible. * * However, if task's ctx is not carrying any pinned * events, no need to flip the cpuctx's events around. */ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { perf_ctx_disable(&cpuctx->ctx, 0); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE); } perf_event_sched_in(cpuctx, ctx, NULL, 0); perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true); if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) perf_ctx_enable(&cpuctx->ctx, 0); perf_ctx_enable(ctx, 0); unlock: perf_ctx_unlock(cpuctx, ctx); rcu_unlock: rcu_read_unlock(); } /* * Called from scheduler to add the events of the current task * with interrupts disabled. * * We restore the event value and then enable it. * * This does not protect us against NMI, but enable() * sets the enabled bit in the control field of event _before_ * accessing the event control register. If a NMI hits, then it will * keep the event running. */ void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { perf_event_context_sched_in(task); if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(prev, task, true); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) { u64 frequency = event->attr.sample_freq; u64 sec = NSEC_PER_SEC; u64 divisor, dividend; int count_fls, nsec_fls, frequency_fls, sec_fls; count_fls = fls64(count); nsec_fls = fls64(nsec); frequency_fls = fls64(frequency); sec_fls = 30; /* * We got @count in @nsec, with a target of sample_freq HZ * the target period becomes: * * @count * 10^9 * period = ------------------- * @nsec * sample_freq * */ /* * Reduce accuracy by one bit such that @a and @b converge * to a similar magnitude. */ #define REDUCE_FLS(a, b) \ do { \ if (a##_fls > b##_fls) { \ a >>= 1; \ a##_fls--; \ } else { \ b >>= 1; \ b##_fls--; \ } \ } while (0) /* * Reduce accuracy until either term fits in a u64, then proceed with * the other, so that finally we can do a u64/u64 division. 
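 *
 * Editorial worked example (not in the original source): with
 * sample_freq = 1000 and @count = 2,000,000 events observed over
 * @nsec = 10,000,000 (10ms), the formula above gives
 * 2e6 * 1e9 / (1e7 * 1e3) = 200,000 events per sample, i.e. roughly
 * 1000 samples per second at the observed event rate.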
*/ while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) { REDUCE_FLS(nsec, frequency); REDUCE_FLS(sec, count); } if (count_fls + sec_fls > 64) { divisor = nsec * frequency; while (count_fls + sec_fls > 64) { REDUCE_FLS(count, sec); divisor >>= 1; } dividend = count * sec; } else { dividend = count * sec; while (nsec_fls + frequency_fls > 64) { REDUCE_FLS(nsec, frequency); dividend >>= 1; } divisor = nsec * frequency; } if (!divisor) return dividend; return div64_u64(dividend, divisor); } static DEFINE_PER_CPU(int, perf_throttled_count); static DEFINE_PER_CPU(u64, perf_throttled_seq); static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable) { struct hw_perf_event *hwc = &event->hw; s64 period, sample_period; s64 delta; period = perf_calculate_period(event, nsec, count); delta = (s64)(period - hwc->sample_period); if (delta >= 0) delta += 7; else delta -= 7; delta /= 8; /* low pass filter */ sample_period = hwc->sample_period + delta; if (!sample_period) sample_period = 1; hwc->sample_period = sample_period; if (local64_read(&hwc->period_left) > 8*sample_period) { if (disable) event->pmu->stop(event, PERF_EF_UPDATE); local64_set(&hwc->period_left, 0); if (disable) event->pmu->start(event, PERF_EF_RELOAD); } } static void perf_adjust_freq_unthr_events(struct list_head *event_list) { struct perf_event *event; struct hw_perf_event *hwc; u64 now, period = TICK_NSEC; s64 delta; list_for_each_entry(event, event_list, active_list) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; // XXX use visit thingy to avoid the -1,cpu match if (!event_filter_match(event)) continue; hwc = &event->hw; if (hwc->interrupts == MAX_INTERRUPTS) perf_event_unthrottle_group(event, is_event_in_freq_mode(event)); if (!is_event_in_freq_mode(event)) continue; /* * stop the event and update event->count */ event->pmu->stop(event, PERF_EF_UPDATE); now = local64_read(&event->count); delta = now - hwc->freq_count_stamp; hwc->freq_count_stamp = now; /* * restart the event * reload only if value has changed * we have stopped the event so tell that * to perf_adjust_period() to avoid stopping it * twice. */ if (delta > 0) perf_adjust_period(event, period, delta, false); event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); } } /* * combine freq adjustment with unthrottling to avoid two passes over the * events. At the same time, make sure, having freq events does not change * the rate of unthrottling as that would introduce bias. */ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle) { struct perf_event_pmu_context *pmu_ctx; /* * only need to iterate over all events iff: * - context have events in frequency mode (needs freq adjust) * - there are events to unthrottle on this cpu */ if (!(ctx->nr_freq || unthrottle)) return; raw_spin_lock(&ctx->lock); list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { if (!(pmu_ctx->nr_freq || unthrottle)) continue; if (!perf_pmu_ctx_is_active(pmu_ctx)) continue; if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) continue; perf_pmu_disable(pmu_ctx->pmu); perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active); perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active); perf_pmu_enable(pmu_ctx->pmu); } raw_spin_unlock(&ctx->lock); } /* * Move @event to the tail of the @ctx's elegible events. */ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event) { /* * Rotate the first entry last of non-pinned groups. Rotation might be * disabled by the inheritance code. 
*/ if (ctx->rotate_disable) return; perf_event_groups_delete(&ctx->flexible_groups, event); perf_event_groups_insert(&ctx->flexible_groups, event); } /* pick an event from the flexible_groups to rotate */ static inline struct perf_event * ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx) { struct perf_event *event; struct rb_node *node; struct rb_root *tree; struct __group_key key = { .pmu = pmu_ctx->pmu, }; /* pick the first active flexible event */ event = list_first_entry_or_null(&pmu_ctx->flexible_active, struct perf_event, active_list); if (event) goto out; /* if no active flexible event, pick the first event */ tree = &pmu_ctx->ctx->flexible_groups.tree; if (!pmu_ctx->ctx->task) { key.cpu = smp_processor_id(); node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); if (node) event = __node_2_pe(node); goto out; } key.cpu = -1; node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); if (node) { event = __node_2_pe(node); goto out; } key.cpu = smp_processor_id(); node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); if (node) event = __node_2_pe(node); out: /* * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in() * finds there are unschedulable events, it will set it again. */ pmu_ctx->rotate_necessary = 0; return event; } static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_pmu_context *cpu_epc, *task_epc = NULL; struct perf_event *cpu_event = NULL, *task_event = NULL; int cpu_rotate, task_rotate; struct pmu *pmu; /* * Since we run this from IRQ context, nobody can install new * events, thus the event count values are stable. */ cpu_epc = &cpc->epc; pmu = cpu_epc->pmu; task_epc = cpc->task_epc; cpu_rotate = cpu_epc->rotate_necessary; task_rotate = task_epc ? task_epc->rotate_necessary : 0; if (!(cpu_rotate || task_rotate)) return false; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(pmu); if (task_rotate) task_event = ctx_event_to_rotate(task_epc); if (cpu_rotate) cpu_event = ctx_event_to_rotate(cpu_epc); /* * As per the order given at ctx_resched() first 'pop' task flexible * and then, if needed CPU flexible. 
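 *
 * Editorial note (not in the original source): the task PMU context is
 * popped even when only the CPU side rotates (task_epc && cpu_event), so
 * that flexible task events are re-inserted only after the rotated CPU
 * events and the "cpu flexible before task flexible" priority order is
 * preserved.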
*/ if (task_event || (task_epc && cpu_event)) { update_context_time(task_epc->ctx); __pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE); } if (cpu_event) { update_context_time(&cpuctx->ctx); __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE); rotate_ctx(&cpuctx->ctx, cpu_event); __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE); } if (task_event) rotate_ctx(task_epc->ctx, task_event); if (task_event || (task_epc && cpu_event)) __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE); perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); return true; } void perf_event_task_tick(void) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *ctx; int throttled; lockdep_assert_irqs_disabled(); __this_cpu_inc(perf_throttled_seq); throttled = __this_cpu_xchg(perf_throttled_count, 0); tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled); rcu_read_lock(); ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_adjust_freq_unthr_context(ctx, !!throttled); rcu_read_unlock(); } static int event_enable_on_exec(struct perf_event *event, struct perf_event_context *ctx) { if (!event->attr.enable_on_exec) return 0; event->attr.enable_on_exec = 0; if (event->state >= PERF_EVENT_STATE_INACTIVE) return 0; perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); return 1; } /* * Enable all of a task's events that have been marked enable-on-exec. * This expects task == current. */ static void perf_event_enable_on_exec(struct perf_event_context *ctx) { struct perf_event_context *clone_ctx = NULL; enum event_type_t event_type = 0; struct perf_cpu_context *cpuctx; struct perf_event *event; unsigned long flags; int enabled = 0; local_irq_save(flags); if (WARN_ON_ONCE(current->perf_event_ctxp != ctx)) goto out; if (!ctx->nr_events) goto out; cpuctx = this_cpu_ptr(&perf_cpu_context); perf_ctx_lock(cpuctx, ctx); ctx_time_freeze(cpuctx, ctx); list_for_each_entry(event, &ctx->event_list, event_entry) { enabled |= event_enable_on_exec(event, ctx); event_type |= get_event_type(event); } /* * Unclone and reschedule this context if we enabled any event. */ if (enabled) { clone_ctx = unclone_ctx(ctx); ctx_resched(cpuctx, ctx, NULL, event_type); } perf_ctx_unlock(cpuctx, ctx); out: local_irq_restore(flags); if (clone_ctx) put_ctx(clone_ctx); } static void perf_remove_from_owner(struct perf_event *event); static void perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx, struct task_struct *task, bool revoke); /* * Removes all events from the current task that have been marked * remove-on-exec, and feeds their values back to parent events. 
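 */

/*
 * Editorial illustration, not part of the original source: the effect of
 * rotate_ctx() above, shown on a plain list instead of the flexible-groups
 * RB-tree. Deleting and re-inserting the chosen event pushes it behind its
 * peers, so a different group gets the first chance at the hardware on the
 * next tick. The helper name is hypothetical.
 */
static inline void example_rotate_list(struct list_head *head)
{
	/* move the current first entry to the tail; the next entry goes first now */
	if (!list_empty(head))
		list_move_tail(head->next, head);
}

/*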
*/ static void perf_event_remove_on_exec(struct perf_event_context *ctx) { struct perf_event_context *clone_ctx = NULL; struct perf_event *event, *next; unsigned long flags; bool modified = false; mutex_lock(&ctx->mutex); if (WARN_ON_ONCE(ctx->task != current)) goto unlock; list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) { if (!event->attr.remove_on_exec) continue; if (!is_kernel_event(event)) perf_remove_from_owner(event); modified = true; perf_event_exit_event(event, ctx, ctx->task, false); } raw_spin_lock_irqsave(&ctx->lock, flags); if (modified) clone_ctx = unclone_ctx(ctx); raw_spin_unlock_irqrestore(&ctx->lock, flags); unlock: mutex_unlock(&ctx->mutex); if (clone_ctx) put_ctx(clone_ctx); } struct perf_read_data { struct perf_event *event; bool group; int ret; }; static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu); static int __perf_event_read_cpu(struct perf_event *event, int event_cpu) { int local_cpu = smp_processor_id(); u16 local_pkg, event_pkg; if ((unsigned)event_cpu >= nr_cpu_ids) return event_cpu; if (event->group_caps & PERF_EV_CAP_READ_SCOPE) { const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(event->pmu->scope, event_cpu); if (cpumask && cpumask_test_cpu(local_cpu, cpumask)) return local_cpu; } if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { event_pkg = topology_physical_package_id(event_cpu); local_pkg = topology_physical_package_id(local_cpu); if (event_pkg == local_pkg) return local_cpu; } return event_cpu; } /* * Cross CPU call to read the hardware event */ static void __perf_event_read(void *info) { struct perf_read_data *data = info; struct perf_event *sub, *event = data->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct pmu *pmu; /* * If this is a task context, we need to check whether it is * the current task context of this cpu. If not it has been * scheduled out before the smp call arrived. In that case * event->count would have been updated to a recent sample * when the event was scheduled out. 
*/ if (ctx->task && cpuctx->task_ctx != ctx) return; guard(raw_spinlock)(&ctx->lock); ctx_time_update_event(ctx, event); perf_event_update_time(event); if (data->group) perf_event_update_sibling_time(event); if (event->state != PERF_EVENT_STATE_ACTIVE) return; if (!data->group) { perf_pmu_read(event); data->ret = 0; return; } pmu = event->pmu_ctx->pmu; pmu->start_txn(pmu, PERF_PMU_TXN_READ); perf_pmu_read(event); for_each_sibling_event(sub, event) perf_pmu_read(sub); data->ret = pmu->commit_txn(pmu); } static inline u64 perf_event_count(struct perf_event *event, bool self) { if (self) return local64_read(&event->count); return local64_read(&event->count) + atomic64_read(&event->child_count); } static void calc_timer_values(struct perf_event *event, u64 *now, u64 *enabled, u64 *running) { u64 ctx_time; *now = perf_clock(); ctx_time = perf_event_time_now(event, *now); __perf_update_times(event, ctx_time, enabled, running); } /* * NMI-safe method to read a local event, that is an event that * is: * - either for the current task, or for this CPU * - does not have inherit set, for inherited task events * will not be local and we cannot read them atomically * - must not have a pmu::count method */ int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running) { unsigned long flags; int event_oncpu; int event_cpu; int ret = 0; /* * Disabling interrupts avoids all counter scheduling (context * switches, timer based rotation and IPIs). */ local_irq_save(flags); /* * It must not be an event with inherit set, we cannot read * all child counters from atomic context. */ if (event->attr.inherit) { ret = -EOPNOTSUPP; goto out; } /* If this is a per-task event, it must be for current */ if ((event->attach_state & PERF_ATTACH_TASK) && event->hw.target != current) { ret = -EINVAL; goto out; } /* * Get the event CPU numbers, and adjust them to local if the event is * a per-package event that can be read locally */ event_oncpu = __perf_event_read_cpu(event, event->oncpu); event_cpu = __perf_event_read_cpu(event, event->cpu); /* If this is a per-CPU event, it must be for this CPU */ if (!(event->attach_state & PERF_ATTACH_TASK) && event_cpu != smp_processor_id()) { ret = -EINVAL; goto out; } /* If this is a pinned event it must be running on this CPU */ if (event->attr.pinned && event_oncpu != smp_processor_id()) { ret = -EBUSY; goto out; } /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise * oncpu == -1). */ if (event_oncpu == smp_processor_id()) event->pmu->read(event); *value = local64_read(&event->count); if (enabled || running) { u64 __enabled, __running, __now; calc_timer_values(event, &__now, &__enabled, &__running); if (enabled) *enabled = __enabled; if (running) *running = __running; } out: local_irq_restore(flags); return ret; } static int perf_event_read(struct perf_event *event, bool group) { enum perf_event_state state = READ_ONCE(event->state); int event_cpu, ret = 0; /* * If event is enabled and currently active on a CPU, update the * value in the event structure: */ again: if (state == PERF_EVENT_STATE_ACTIVE) { struct perf_read_data data; /* * Orders the ->state and ->oncpu loads such that if we see * ACTIVE we must also see the right ->oncpu. * * Matches the smp_wmb() from event_sched_in(). 
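 *
 * Editorial note (not in the original source): event_sched_in() publishes
 * ->oncpu first, issues the smp_wmb(), and only then sets ->state to
 * ACTIVE; together with the smp_rmb() below this guarantees that once
 * ACTIVE is observed here, the ->oncpu load cannot return a stale value.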
*/ smp_rmb(); event_cpu = READ_ONCE(event->oncpu); if ((unsigned)event_cpu >= nr_cpu_ids) return 0; data = (struct perf_read_data){ .event = event, .group = group, .ret = 0, }; preempt_disable(); event_cpu = __perf_event_read_cpu(event, event_cpu); /* * Purposely ignore the smp_call_function_single() return * value. * * If event_cpu isn't a valid CPU it means the event got * scheduled out and that will have updated the event count. * * Therefore, either way, we'll have an up-to-date event count * after this. */ (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1); preempt_enable(); ret = data.ret; } else if (state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; unsigned long flags; raw_spin_lock_irqsave(&ctx->lock, flags); state = event->state; if (state != PERF_EVENT_STATE_INACTIVE) { raw_spin_unlock_irqrestore(&ctx->lock, flags); goto again; } /* * May read while context is not active (e.g., thread is * blocked), in that case we cannot update context time */ ctx_time_update_event(ctx, event); perf_event_update_time(event); if (group) perf_event_update_sibling_time(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } return ret; } /* * Initialize the perf_event context in a task_struct: */ static void __perf_event_init_context(struct perf_event_context *ctx) { raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->pmu_ctx_list); perf_event_groups_init(&ctx->pinned_groups); perf_event_groups_init(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); refcount_set(&ctx->refcount, 1); } static void __perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu) { epc->pmu = pmu; INIT_LIST_HEAD(&epc->pmu_ctx_entry); INIT_LIST_HEAD(&epc->pinned_active); INIT_LIST_HEAD(&epc->flexible_active); atomic_set(&epc->refcount, 1); } static struct perf_event_context * alloc_perf_context(struct task_struct *task) { struct perf_event_context *ctx; ctx = kzalloc_obj(struct perf_event_context); if (!ctx) return NULL; __perf_event_init_context(ctx); if (task) ctx->task = get_task_struct(task); return ctx; } static struct task_struct * find_lively_task_by_vpid(pid_t vpid) { struct task_struct *task; rcu_read_lock(); if (!vpid) task = current; else task = find_task_by_vpid(vpid); if (task) get_task_struct(task); rcu_read_unlock(); if (!task) return ERR_PTR(-ESRCH); return task; } /* * Returns a matching context with refcount and pincount. */ static struct perf_event_context * find_get_context(struct task_struct *task, struct perf_event *event) { struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_cpu_context *cpuctx; unsigned long flags; int err; if (!task) { /* Must be root to operate on a CPU event: */ err = perf_allow_cpu(); if (err) return ERR_PTR(err); cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu); ctx = &cpuctx->ctx; get_ctx(ctx); raw_spin_lock_irqsave(&ctx->lock, flags); ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); return ctx; } err = -EINVAL; retry: ctx = perf_lock_task_context(task, &flags); if (ctx) { clone_ctx = unclone_ctx(ctx); ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); if (clone_ctx) put_ctx(clone_ctx); } else { ctx = alloc_perf_context(task); err = -ENOMEM; if (!ctx) goto errout; err = 0; mutex_lock(&task->perf_event_mutex); /* * If it has already passed perf_event_exit_task(). * we must see PF_EXITING, it takes this mutex too. 
*/ if (task->flags & PF_EXITING) err = -ESRCH; else if (task->perf_event_ctxp) err = -EAGAIN; else { get_ctx(ctx); ++ctx->pin_count; rcu_assign_pointer(task->perf_event_ctxp, ctx); } mutex_unlock(&task->perf_event_mutex); if (unlikely(err)) { put_ctx(ctx); if (err == -EAGAIN) goto retry; goto errout; } } return ctx; errout: return ERR_PTR(err); } static struct perf_event_pmu_context * find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, struct perf_event *event) { struct perf_event_pmu_context *new = NULL, *pos = NULL, *epc; if (!ctx->task) { /* * perf_pmu_migrate_context() / __perf_pmu_install_event() * relies on the fact that find_get_pmu_context() cannot fail * for CPU contexts. */ struct perf_cpu_pmu_context *cpc; cpc = *per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); epc = &cpc->epc; raw_spin_lock_irq(&ctx->lock); if (!epc->ctx) { /* * One extra reference for the pmu; see perf_pmu_free(). */ atomic_set(&epc->refcount, 2); epc->embedded = 1; list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); epc->ctx = ctx; } else { WARN_ON_ONCE(epc->ctx != ctx); atomic_inc(&epc->refcount); } raw_spin_unlock_irq(&ctx->lock); return epc; } new = kzalloc_obj(*epc); if (!new) return ERR_PTR(-ENOMEM); __perf_init_event_pmu_context(new, pmu); /* * XXX * * lockdep_assert_held(&ctx->mutex); * * can't because perf_event_init_task() doesn't actually hold the * child_ctx->mutex. */ raw_spin_lock_irq(&ctx->lock); list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) { if (epc->pmu == pmu) { WARN_ON_ONCE(epc->ctx != ctx); atomic_inc(&epc->refcount); goto found_epc; } /* Make sure the pmu_ctx_list is sorted by PMU type: */ if (!pos && epc->pmu->type > pmu->type) pos = epc; } epc = new; new = NULL; if (!pos) list_add_tail(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); else list_add(&epc->pmu_ctx_entry, pos->pmu_ctx_entry.prev); epc->ctx = ctx; found_epc: raw_spin_unlock_irq(&ctx->lock); kfree(new); return epc; } static void get_pmu_ctx(struct perf_event_pmu_context *epc) { WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount)); } static void free_cpc_rcu(struct rcu_head *head) { struct perf_cpu_pmu_context *cpc = container_of(head, typeof(*cpc), epc.rcu_head); kfree(cpc); } static void free_epc_rcu(struct rcu_head *head) { struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head); kfree(epc); } static void put_pmu_ctx(struct perf_event_pmu_context *epc) { struct perf_event_context *ctx = epc->ctx; unsigned long flags; /* * XXX * * lockdep_assert_held(&ctx->mutex); * * can't because of the call-site in _free_event()/put_event() * which isn't always called under ctx->mutex. 
*/ if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags)) return; WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry)); list_del_init(&epc->pmu_ctx_entry); epc->ctx = NULL; WARN_ON_ONCE(!list_empty(&epc->pinned_active)); WARN_ON_ONCE(!list_empty(&epc->flexible_active)); raw_spin_unlock_irqrestore(&ctx->lock, flags); if (epc->embedded) { call_rcu(&epc->rcu_head, free_cpc_rcu); return; } call_rcu(&epc->rcu_head, free_epc_rcu); } static void perf_event_free_filter(struct perf_event *event); static void free_event_rcu(struct rcu_head *head) { struct perf_event *event = container_of(head, typeof(*event), rcu_head); if (event->ns) put_pid_ns(event->ns); perf_event_free_filter(event); kmem_cache_free(perf_event_cache, event); } static void ring_buffer_attach(struct perf_event *event, struct perf_buffer *rb); static void detach_sb_event(struct perf_event *event) { struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); raw_spin_lock(&pel->lock); list_del_rcu(&event->sb_list); raw_spin_unlock(&pel->lock); } static bool is_sb_event(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; if (event->parent) return false; if (event->attach_state & PERF_ATTACH_TASK) return false; if (attr->mmap || attr->mmap_data || attr->mmap2 || attr->comm || attr->comm_exec || attr->task || attr->ksymbol || attr->context_switch || attr->text_poke || attr->bpf_event) return true; return false; } static void unaccount_pmu_sb_event(struct perf_event *event) { if (is_sb_event(event)) detach_sb_event(event); } #ifdef CONFIG_NO_HZ_FULL static DEFINE_SPINLOCK(nr_freq_lock); #endif static void unaccount_freq_event_nohz(void) { #ifdef CONFIG_NO_HZ_FULL spin_lock(&nr_freq_lock); if (atomic_dec_and_test(&nr_freq_events)) tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS); spin_unlock(&nr_freq_lock); #endif } static void unaccount_freq_event(void) { if (tick_nohz_full_enabled()) unaccount_freq_event_nohz(); else atomic_dec(&nr_freq_events); } static struct perf_ctx_data * alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global, gfp_t gfp_flags) { struct perf_ctx_data *cd; cd = kzalloc_obj(*cd, gfp_flags); if (!cd) return NULL; cd->data = kmem_cache_zalloc(ctx_cache, gfp_flags); if (!cd->data) { kfree(cd); return NULL; } cd->global = global; cd->ctx_cache = ctx_cache; refcount_set(&cd->refcount, 1); return cd; } static void free_perf_ctx_data(struct perf_ctx_data *cd) { kmem_cache_free(cd->ctx_cache, cd->data); kfree(cd); } static void __free_perf_ctx_data_rcu(struct rcu_head *rcu_head) { struct perf_ctx_data *cd; cd = container_of(rcu_head, struct perf_ctx_data, rcu_head); free_perf_ctx_data(cd); } static inline void perf_free_ctx_data_rcu(struct perf_ctx_data *cd) { call_rcu(&cd->rcu_head, __free_perf_ctx_data_rcu); } static int attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache, bool global, gfp_t gfp_flags) { struct perf_ctx_data *cd, *old = NULL; cd = alloc_perf_ctx_data(ctx_cache, global, gfp_flags); if (!cd) return -ENOMEM; for (;;) { if (try_cmpxchg(&task->perf_ctx_data, &old, cd)) { if (old) perf_free_ctx_data_rcu(old); /* * Above try_cmpxchg() pairs with try_cmpxchg() from * detach_task_ctx_data() such that * if we race with perf_event_exit_task(), we must * observe PF_EXITING. */ if (task->flags & PF_EXITING) { /* detach_task_ctx_data() may free it already */ if (try_cmpxchg(&task->perf_ctx_data, &cd, NULL)) perf_free_ctx_data_rcu(cd); } return 0; } if (!old) { /* * After seeing a dead @old, we raced with * removal and lost, try again to install @cd. 
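 *
 * Editorial note (not in the original source): try_cmpxchg() updates @old
 * with the current pointer on failure, so reaching this point with a NULL
 * @old means the dead object seen on the previous iteration has since been
 * cleared by its remover; retrying the installation of @cd is the only
 * option left.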
*/ continue; } if (refcount_inc_not_zero(&old->refcount)) { free_perf_ctx_data(cd); /* unused */ return 0; } /* * @old is a dead object, refcount==0 is stable, try and * replace it with @cd. */ } return 0; } static void __detach_global_ctx_data(void); DEFINE_STATIC_PERCPU_RWSEM(global_ctx_data_rwsem); static refcount_t global_ctx_data_ref; static int attach_global_ctx_data(struct kmem_cache *ctx_cache) { struct task_struct *g, *p; struct perf_ctx_data *cd; int ret; if (refcount_inc_not_zero(&global_ctx_data_ref)) return 0; guard(percpu_write)(&global_ctx_data_rwsem); if (refcount_inc_not_zero(&global_ctx_data_ref)) return 0; again: /* Allocate everything */ scoped_guard (rcu) { for_each_process_thread(g, p) { if (p->flags & PF_EXITING) continue; cd = rcu_dereference(p->perf_ctx_data); if (cd && !cd->global) { cd->global = 1; if (!refcount_inc_not_zero(&cd->refcount)) cd = NULL; } if (!cd) { /* * Try to allocate context quickly before * traversing the whole thread list again. */ if (!attach_task_ctx_data(p, ctx_cache, true, GFP_NOWAIT)) continue; get_task_struct(p); goto alloc; } } } refcount_set(&global_ctx_data_ref, 1); return 0; alloc: ret = attach_task_ctx_data(p, ctx_cache, true, GFP_KERNEL); put_task_struct(p); if (ret) { __detach_global_ctx_data(); return ret; } goto again; } static int attach_perf_ctx_data(struct perf_event *event) { struct task_struct *task = event->hw.target; struct kmem_cache *ctx_cache = event->pmu->task_ctx_cache; int ret; if (!ctx_cache) return -ENOMEM; if (task) return attach_task_ctx_data(task, ctx_cache, false, GFP_KERNEL); ret = attach_global_ctx_data(ctx_cache); if (ret) return ret; event->attach_state |= PERF_ATTACH_GLOBAL_DATA; return 0; } static void detach_task_ctx_data(struct task_struct *p) { struct perf_ctx_data *cd; scoped_guard (rcu) { cd = rcu_dereference(p->perf_ctx_data); if (!cd || !refcount_dec_and_test(&cd->refcount)) return; } /* * The old ctx_data may be lost because of the race. * Nothing is required to do for the case. * See attach_task_ctx_data(). 
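 *
 * Editorial note (not in the original source): if the try_cmpxchg() below
 * fails, a concurrent attach already replaced the pointer and, per the
 * logic in attach_task_ctx_data(), also took over freeing the old object,
 * so returning without freeing here is correct.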
*/ if (try_cmpxchg((struct perf_ctx_data **)&p->perf_ctx_data, &cd, NULL)) perf_free_ctx_data_rcu(cd); } static void __detach_global_ctx_data(void) { struct task_struct *g, *p; struct perf_ctx_data *cd; scoped_guard (rcu) { for_each_process_thread(g, p) { cd = rcu_dereference(p->perf_ctx_data); if (cd && cd->global) { cd->global = 0; detach_task_ctx_data(p); } } } } static void detach_global_ctx_data(void) { if (refcount_dec_not_one(&global_ctx_data_ref)) return; guard(percpu_write)(&global_ctx_data_rwsem); if (!refcount_dec_and_test(&global_ctx_data_ref)) return; /* remove everything */ __detach_global_ctx_data(); } static void detach_perf_ctx_data(struct perf_event *event) { struct task_struct *task = event->hw.target; event->attach_state &= ~PERF_ATTACH_TASK_DATA; if (task) return detach_task_ctx_data(task); if (event->attach_state & PERF_ATTACH_GLOBAL_DATA) { detach_global_ctx_data(); event->attach_state &= ~PERF_ATTACH_GLOBAL_DATA; } } static void unaccount_event(struct perf_event *event) { bool dec = false; if (event->parent) return; if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) dec = true; if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.build_id) atomic_dec(&nr_build_id_events); if (event->attr.comm) atomic_dec(&nr_comm_events); if (event->attr.namespaces) atomic_dec(&nr_namespaces_events); if (event->attr.cgroup) atomic_dec(&nr_cgroup_events); if (event->attr.task) atomic_dec(&nr_task_events); if (event->attr.freq) unaccount_freq_event(); if (event->attr.context_switch) { dec = true; atomic_dec(&nr_switch_events); } if (is_cgroup_event(event)) dec = true; if (has_branch_stack(event)) dec = true; if (event->attr.ksymbol) atomic_dec(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_dec(&nr_bpf_events); if (event->attr.text_poke) atomic_dec(&nr_text_poke_events); if (dec) { if (!atomic_add_unless(&perf_sched_count, -1, 1)) schedule_delayed_work(&perf_sched_work, HZ); } unaccount_pmu_sb_event(event); } static void perf_sched_delayed(struct work_struct *work) { mutex_lock(&perf_sched_mutex); if (atomic_dec_and_test(&perf_sched_count)) static_branch_disable(&perf_sched_events); mutex_unlock(&perf_sched_mutex); } /* * The following implement mutual exclusion of events on "exclusive" pmus * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled * at a time, so we disallow creating events that might conflict, namely: * * 1) cpu-wide events in the presence of per-task events, * 2) per-task events in the presence of cpu-wide events, * 3) two matching events on the same perf_event_context. * * The former two cases are handled in the allocation path (perf_event_alloc(), * _free_event()), the latter -- before the first perf_install_in_context(). */ static int exclusive_event_init(struct perf_event *event) { struct pmu *pmu = event->pmu; if (!is_exclusive_pmu(pmu)) return 0; /* * Prevent co-existence of per-task and cpu-wide events on the * same exclusive pmu. * * Negative pmu::exclusive_cnt means there are cpu-wide * events on this "exclusive" pmu, positive means there are * per-task events. * * Since this is called in perf_event_alloc() path, event::ctx * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK * to mean "per-task event", because unlike other attach states it * never gets cleared. 
*/ if (event->attach_state & PERF_ATTACH_TASK) { if (!atomic_inc_unless_negative(&pmu->exclusive_cnt)) return -EBUSY; } else { if (!atomic_dec_unless_positive(&pmu->exclusive_cnt)) return -EBUSY; } event->attach_state |= PERF_ATTACH_EXCLUSIVE; return 0; } static void exclusive_event_destroy(struct perf_event *event) { struct pmu *pmu = event->pmu; /* see comment in exclusive_event_init() */ if (event->attach_state & PERF_ATTACH_TASK) atomic_dec(&pmu->exclusive_cnt); else atomic_inc(&pmu->exclusive_cnt); event->attach_state &= ~PERF_ATTACH_EXCLUSIVE; } static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) { if ((e1->pmu == e2->pmu) && (e1->cpu == e2->cpu || e1->cpu == -1 || e2->cpu == -1)) return true; return false; } static bool exclusive_event_installable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event *iter_event; struct pmu *pmu = event->pmu; lockdep_assert_held(&ctx->mutex); if (!is_exclusive_pmu(pmu)) return true; list_for_each_entry(iter_event, &ctx->event_list, event_entry) { if (exclusive_event_match(iter_event, event)) return false; } return true; } static void perf_free_addr_filters(struct perf_event *event); /* vs perf_event_alloc() error */ static void __free_event(struct perf_event *event) { struct pmu *pmu = event->pmu; security_perf_event_free(event); if (event->attach_state & PERF_ATTACH_CALLCHAIN) put_callchain_buffers(); kfree(event->addr_filter_ranges); if (event->attach_state & PERF_ATTACH_EXCLUSIVE) exclusive_event_destroy(event); if (is_cgroup_event(event)) perf_detach_cgroup(event); if (event->attach_state & PERF_ATTACH_TASK_DATA) detach_perf_ctx_data(event); if (event->destroy) event->destroy(event); /* * Must be after ->destroy(), due to uprobe_perf_close() using * hw.target. */ if (event->hw.target) put_task_struct(event->hw.target); if (event->pmu_ctx) { /* * put_pmu_ctx() needs an event->ctx reference, because of * epc->ctx. */ WARN_ON_ONCE(!pmu); WARN_ON_ONCE(!event->ctx); WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx); put_pmu_ctx(event->pmu_ctx); } /* * perf_event_free_task() relies on put_ctx() being 'last', in * particular all task references must be cleaned up. */ if (event->ctx) put_ctx(event->ctx); if (pmu) { module_put(pmu->module); scoped_guard (spinlock, &pmu->events_lock) { list_del(&event->pmu_list); wake_up_var(pmu); } } call_rcu(&event->rcu_head, free_event_rcu); } static void mediated_pmu_unaccount_event(struct perf_event *event); DEFINE_FREE(__free_event, struct perf_event *, if (_T) __free_event(_T)) /* vs perf_event_alloc() success */ static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending_irq); irq_work_sync(&event->pending_disable_irq); unaccount_event(event); mediated_pmu_unaccount_event(event); if (event->rb) { /* * Can happen when we close an event with re-directed output. * * Since we have a 0 refcount, perf_mmap_close() will skip * over us; possibly making our ring_buffer_put() the last. */ mutex_lock(&event->mmap_mutex); ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); } perf_event_free_bpf_prog(event); perf_free_addr_filters(event); __free_event(event); } /* * Used to free events which have a known refcount of 1, such as in error paths * of inherited events. 
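 *
 * Everything else must drop its reference through put_event(); if the
 * refcount is not exactly 1 here, free_event() WARNs and deliberately
 * leaks the event rather than risk a use-after-free.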
*/ static void free_event(struct perf_event *event) { if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, "unexpected event refcount: %ld; ptr=%p\n", atomic_long_read(&event->refcount), event)) { /* leak to avoid use-after-free */ return; } _free_event(event); } /* * Remove user event from the owner task. */ static void perf_remove_from_owner(struct perf_event *event) { struct task_struct *owner; rcu_read_lock(); /* * Matches the smp_store_release() in perf_event_exit_task(). If we * observe !owner it means the list deletion is complete and we can * indeed free this event, otherwise we need to serialize on * owner->perf_event_mutex. */ owner = READ_ONCE(event->owner); if (owner) { /* * Since delayed_put_task_struct() also drops the last * task reference we can safely take a new reference * while holding the rcu_read_lock(). */ get_task_struct(owner); } rcu_read_unlock(); if (owner) { /* * If we're here through perf_event_exit_task() we're already * holding ctx->mutex which would be an inversion wrt. the * normal lock order. * * However we can safely take this lock because its the child * ctx->mutex. */ mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING); /* * We have to re-check the event->owner field, if it is cleared * we raced with perf_event_exit_task(), acquiring the mutex * ensured they're done, and we can proceed with freeing the * event. */ if (event->owner) { list_del_init(&event->owner_entry); smp_store_release(&event->owner, NULL); } mutex_unlock(&owner->perf_event_mutex); put_task_struct(owner); } } static void put_event(struct perf_event *event) { struct perf_event *parent; if (!atomic_long_dec_and_test(&event->refcount)) return; parent = event->parent; _free_event(event); /* Matches the refcount bump in inherit_event() */ if (parent) put_event(parent); } /* * Kill an event dead; while event:refcount will preserve the event * object, it will not preserve its functionality. Once the last 'user' * gives up the object, we'll destroy the thing. */ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct perf_event *child, *tmp; /* * If we got here through err_alloc: free_event(event); we will not * have attached to a context yet. */ if (!ctx) { WARN_ON_ONCE(event->attach_state & (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP)); goto no_ctx; } if (!is_kernel_event(event)) perf_remove_from_owner(event); ctx = perf_event_ctx_lock(event); WARN_ON_ONCE(ctx->parent_ctx); /* * Mark this event as STATE_DEAD, there is no external reference to it * anymore. * * Anybody acquiring event->child_mutex after the below loop _must_ * also see this, most importantly inherit_event() which will avoid * placing more children on the list. * * Thus this guarantees that we will in fact observe and kill _ALL_ * child events. */ if (event->state > PERF_EVENT_STATE_REVOKED) { perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD); } else { event->state = PERF_EVENT_STATE_DEAD; } perf_event_ctx_unlock(event, ctx); again: mutex_lock(&event->child_mutex); list_for_each_entry(child, &event->child_list, child_list) { /* * Cannot change, child events are not migrated, see the * comment with perf_event_ctx_lock_nested(). */ ctx = READ_ONCE(child->ctx); /* * Since child_mutex nests inside ctx::mutex, we must jump * through hoops. We start by grabbing a reference on the ctx. * * Since the event cannot get freed while we hold the * child_mutex, the context must also exist and have a !0 * reference count. 
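 *
 * The dance below is then, roughly:
 *
 *   get_ctx(ctx);
 *   mutex_unlock(&event->child_mutex);
 *   mutex_lock(&ctx->mutex);
 *   mutex_lock(&event->child_mutex);
 *   <revalidate that @child is still first on the child_list>
 *
 * which respects the "child_mutex nests inside ctx::mutex" lock order
 * while still letting us detach the child.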
*/ get_ctx(ctx); /* * Now that we have a ctx ref, we can drop child_mutex, and * acquire ctx::mutex without fear of it going away. Then we * can re-acquire child_mutex. */ mutex_unlock(&event->child_mutex); mutex_lock(&ctx->mutex); mutex_lock(&event->child_mutex); /* * Now that we hold ctx::mutex and child_mutex, revalidate our * state, if child is still the first entry, it didn't get freed * and we can continue doing so. */ tmp = list_first_entry_or_null(&event->child_list, struct perf_event, child_list); if (tmp == child) { perf_remove_from_context(child, DETACH_GROUP | DETACH_CHILD); } else { child = NULL; } mutex_unlock(&event->child_mutex); mutex_unlock(&ctx->mutex); if (child) { /* Last reference unless ->pending_task work is pending */ put_event(child); } put_ctx(ctx); goto again; } mutex_unlock(&event->child_mutex); no_ctx: /* * Last reference unless ->pending_task work is pending on this event * or any of its children. */ put_event(event); return 0; } EXPORT_SYMBOL_GPL(perf_event_release_kernel); /* * Called when the last reference to the file is gone. */ static int perf_release(struct inode *inode, struct file *file) { perf_event_release_kernel(file->private_data); return 0; } static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; u64 total = 0; *enabled = 0; *running = 0; mutex_lock(&event->child_mutex); (void)perf_event_read(event, false); total += perf_event_count(event, false); *enabled += event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); *running += event->total_time_running + atomic64_read(&event->child_total_time_running); list_for_each_entry(child, &event->child_list, child_list) { (void)perf_event_read(child, false); total += perf_event_count(child, false); *enabled += child->total_time_enabled; *running += child->total_time_running; } mutex_unlock(&event->child_mutex); return total; } u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event_context *ctx; u64 count; ctx = perf_event_ctx_lock(event); count = __perf_event_read_value(event, enabled, running); perf_event_ctx_unlock(event, ctx); return count; } EXPORT_SYMBOL_GPL(perf_event_read_value); static int __perf_read_group_add(struct perf_event *leader, u64 read_format, u64 *values) { struct perf_event_context *ctx = leader->ctx; struct perf_event *sub, *parent; unsigned long flags; int n = 1; /* skip @nr */ int ret; ret = perf_event_read(leader, true); if (ret) return ret; raw_spin_lock_irqsave(&ctx->lock, flags); /* * Verify the grouping between the parent and child (inherited) * events is still in tact. * * Specifically: * - leader->ctx->lock pins leader->sibling_list * - parent->child_mutex pins parent->child_list * - parent->ctx->mutex pins parent->sibling_list * * Because parent->ctx != leader->ctx (and child_list nests inside * ctx->mutex), group destruction is not atomic between children, also * see perf_event_release_kernel(). Additionally, parent can grow the * group. * * Therefore it is possible to have parent and child groups in a * different configuration and summing over such a beast makes no sense * what so ever. * * Reject this. */ parent = leader->parent; if (parent && (parent->group_generation != leader->group_generation || parent->nr_siblings != leader->nr_siblings)) { ret = -ECHILD; goto unlock; } /* * Since we co-schedule groups, {enabled,running} times of siblings * will be identical to those of the leader, so we only publish one * set. 
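 *
 * The resulting layout accumulated into @values (and later copied out
 * by perf_read_group()) is thus:
 *
 *   { nr,
 *     time_enabled, time_running,                <-- if requested
 *     cnt_leader  [, id_leader]  [, lost_leader],
 *     cnt_sibling [, id_sibling] [, lost_sibling], ... }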
*/ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { values[n++] += leader->total_time_enabled + atomic64_read(&leader->child_total_time_enabled); } if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { values[n++] += leader->total_time_running + atomic64_read(&leader->child_total_time_running); } /* * Write {count,id} tuples for every sibling. */ values[n++] += perf_event_count(leader, false); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&leader->lost_samples); for_each_sibling_event(sub, leader) { values[n++] += perf_event_count(sub, false); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&sub->lost_samples); } unlock: raw_spin_unlock_irqrestore(&ctx->lock, flags); return ret; } static int perf_read_group(struct perf_event *event, u64 read_format, char __user *buf) { struct perf_event *leader = event->group_leader, *child; struct perf_event_context *ctx = leader->ctx; int ret; u64 *values; lockdep_assert_held(&ctx->mutex); values = kzalloc(event->read_size, GFP_KERNEL); if (!values) return -ENOMEM; values[0] = 1 + leader->nr_siblings; mutex_lock(&leader->child_mutex); ret = __perf_read_group_add(leader, read_format, values); if (ret) goto unlock; list_for_each_entry(child, &leader->child_list, child_list) { ret = __perf_read_group_add(child, read_format, values); if (ret) goto unlock; } mutex_unlock(&leader->child_mutex); ret = event->read_size; if (copy_to_user(buf, values, event->read_size)) ret = -EFAULT; goto out; unlock: mutex_unlock(&leader->child_mutex); out: kfree(values); return ret; } static int perf_read_one(struct perf_event *event, u64 read_format, char __user *buf) { u64 enabled, running; u64 values[5]; int n = 0; values[n++] = __perf_event_read_value(event, &enabled, &running); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) values[n++] = enabled; if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&event->lost_samples); if (copy_to_user(buf, values, n * sizeof(u64))) return -EFAULT; return n * sizeof(u64); } static bool is_event_hup(struct perf_event *event) { bool no_children; if (event->state > PERF_EVENT_STATE_EXIT) return false; mutex_lock(&event->child_mutex); no_children = list_empty(&event->child_list); mutex_unlock(&event->child_mutex); return no_children; } /* * Read the performance event - simple non blocking version for now */ static ssize_t __perf_read(struct perf_event *event, char __user *buf, size_t count) { u64 read_format = event->attr.read_format; int ret; /* * Return end-of-file for a read on an event that is in * error state (i.e. because it was pinned but it couldn't be * scheduled on to the CPU at some point). 
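 *
 * To user space this shows up as a 0-byte read(2), i.e. plain EOF
 * rather than an error code.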
*/ if (event->state == PERF_EVENT_STATE_ERROR) return 0; if (count < event->read_size) return -ENOSPC; WARN_ON_ONCE(event->ctx->parent_ctx); if (read_format & PERF_FORMAT_GROUP) ret = perf_read_group(event, read_format, buf); else ret = perf_read_one(event, read_format, buf); return ret; } static ssize_t perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct perf_event *event = file->private_data; struct perf_event_context *ctx; int ret; ret = security_perf_event_read(event); if (ret) return ret; ctx = perf_event_ctx_lock(event); ret = __perf_read(event, buf, count); perf_event_ctx_unlock(event, ctx); return ret; } static __poll_t perf_poll(struct file *file, poll_table *wait) { struct perf_event *event = file->private_data; struct perf_buffer *rb; __poll_t events = EPOLLHUP; if (event->state <= PERF_EVENT_STATE_REVOKED) return EPOLLERR; poll_wait(file, &event->waitq, wait); if (event->state <= PERF_EVENT_STATE_REVOKED) return EPOLLERR; if (is_event_hup(event)) return events; if (unlikely(READ_ONCE(event->state) == PERF_EVENT_STATE_ERROR && event->attr.pinned)) return EPOLLERR; /* * Pin the event->rb by taking event->mmap_mutex; otherwise * perf_event_set_output() can swizzle our rb and make us miss wakeups. */ mutex_lock(&event->mmap_mutex); rb = event->rb; if (rb) events = atomic_xchg(&rb->poll, 0); mutex_unlock(&event->mmap_mutex); return events; } static void _perf_event_reset(struct perf_event *event) { (void)perf_event_read(event, false); local64_set(&event->count, 0); perf_event_update_userpage(event); } /* Assume it's not an event with inherit set. */ u64 perf_event_pause(struct perf_event *event, bool reset) { struct perf_event_context *ctx; u64 count; ctx = perf_event_ctx_lock(event); WARN_ON_ONCE(event->attr.inherit); _perf_event_disable(event); count = local64_read(&event->count); if (reset) local64_set(&event->count, 0); perf_event_ctx_unlock(event, ctx); return count; } EXPORT_SYMBOL_GPL(perf_event_pause); #ifdef CONFIG_PERF_GUEST_MEDIATED_PMU static atomic_t nr_include_guest_events __read_mostly; static atomic_t nr_mediated_pmu_vms __read_mostly; static DEFINE_MUTEX(perf_mediated_pmu_mutex); /* !exclude_guest event of PMU with PERF_PMU_CAP_MEDIATED_VPMU */ static inline bool is_include_guest_event(struct perf_event *event) { if ((event->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU) && !event->attr.exclude_guest) return true; return false; } static int mediated_pmu_account_event(struct perf_event *event) { if (!is_include_guest_event(event)) return 0; if (atomic_inc_not_zero(&nr_include_guest_events)) return 0; guard(mutex)(&perf_mediated_pmu_mutex); if (atomic_read(&nr_mediated_pmu_vms)) return -EOPNOTSUPP; atomic_inc(&nr_include_guest_events); return 0; } static void mediated_pmu_unaccount_event(struct perf_event *event) { if (!is_include_guest_event(event)) return; if (WARN_ON_ONCE(!atomic_read(&nr_include_guest_events))) return; atomic_dec(&nr_include_guest_events); } /* * Currently invoked at VM creation to * - Check whether there are existing !exclude_guest events of PMU with * PERF_PMU_CAP_MEDIATED_VPMU * - Set nr_mediated_pmu_vms to prevent !exclude_guest event creation on * PMUs with PERF_PMU_CAP_MEDIATED_VPMU * * No impact for the PMU without PERF_PMU_CAP_MEDIATED_VPMU. The perf * still owns all the PMU resources. 
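 *
 * A hypervisor (e.g. KVM, which these symbols are exported for) would
 * pair the calls roughly like:
 *
 *	ret = perf_create_mediated_pmu();	// at VM creation
 *	if (ret)
 *		return ret;
 *	...
 *	perf_release_mediated_pmu();		// at VM destruction
 *
 * Creation fails with -EBUSY while !exclude_guest events exist on a
 * PERF_PMU_CAP_MEDIATED_VPMU pmu; conversely such events fail to be
 * created (-EOPNOTSUPP) while any mediated-PMU VM exists.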
*/ int perf_create_mediated_pmu(void) { if (atomic_inc_not_zero(&nr_mediated_pmu_vms)) return 0; guard(mutex)(&perf_mediated_pmu_mutex); if (atomic_read(&nr_include_guest_events)) return -EBUSY; atomic_inc(&nr_mediated_pmu_vms); return 0; } EXPORT_SYMBOL_FOR_KVM(perf_create_mediated_pmu); void perf_release_mediated_pmu(void) { if (WARN_ON_ONCE(!atomic_read(&nr_mediated_pmu_vms))) return; atomic_dec(&nr_mediated_pmu_vms); } EXPORT_SYMBOL_FOR_KVM(perf_release_mediated_pmu); /* When loading a guest's mediated PMU, schedule out all exclude_guest events. */ void perf_load_guest_context(void) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); lockdep_assert_irqs_disabled(); guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx); if (WARN_ON_ONCE(__this_cpu_read(guest_ctx_loaded))) return; perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_GUEST); if (cpuctx->task_ctx) { perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST); task_ctx_sched_out(cpuctx->task_ctx, NULL, EVENT_GUEST); } perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST); if (cpuctx->task_ctx) perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST); __this_cpu_write(guest_ctx_loaded, true); } EXPORT_SYMBOL_GPL(perf_load_guest_context); void perf_put_guest_context(void) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); lockdep_assert_irqs_disabled(); guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx); if (WARN_ON_ONCE(!__this_cpu_read(guest_ctx_loaded))) return; perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST); if (cpuctx->task_ctx) perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST); perf_event_sched_in(cpuctx, cpuctx->task_ctx, NULL, EVENT_GUEST); if (cpuctx->task_ctx) perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST); perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST); __this_cpu_write(guest_ctx_loaded, false); } EXPORT_SYMBOL_GPL(perf_put_guest_context); #else static int mediated_pmu_account_event(struct perf_event *event) { return 0; } static void mediated_pmu_unaccount_event(struct perf_event *event) {} #endif /* * Holding the top-level event's child_mutex means that any * descendant process that has inherited this event will block * in perf_event_exit_event() if it goes to exit, thus satisfying the * task existence requirements of perf_event_enable/disable. 
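 *
 * perf_event_for_each_child() below relies on this: it takes
 * child_mutex and applies @func to the event itself and then to every
 * inherited child; perf_event_for_each() extends that to every
 * sibling in the group, so e.g. PERF_EVENT_IOC_ENABLE on a parent
 * acts on the whole family.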
*/ static void perf_event_for_each_child(struct perf_event *event, void (*func)(struct perf_event *)) { struct perf_event *child; WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->child_mutex); func(event); list_for_each_entry(child, &event->child_list, child_list) func(child); mutex_unlock(&event->child_mutex); } static void perf_event_for_each(struct perf_event *event, void (*func)(struct perf_event *)) { struct perf_event_context *ctx = event->ctx; struct perf_event *sibling; lockdep_assert_held(&ctx->mutex); event = event->group_leader; perf_event_for_each_child(event, func); for_each_sibling_event(sibling, event) perf_event_for_each_child(sibling, func); } static void __perf_event_period(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { u64 value = *((u64 *)info); bool active; if (event->attr.freq) { event->attr.sample_freq = value; } else { event->attr.sample_period = value; event->hw.sample_period = value; } active = (event->state == PERF_EVENT_STATE_ACTIVE); if (active) { perf_pmu_disable(event->pmu); event->pmu->stop(event, PERF_EF_UPDATE); } local64_set(&event->hw.period_left, 0); if (active) { event->pmu->start(event, PERF_EF_RELOAD); /* * Once the period is force-reset, the event starts immediately. * But the event/group could be throttled. Unthrottle the * event/group now to avoid the next tick trying to unthrottle * while we already re-started the event/group. */ if (event->hw.interrupts == MAX_INTERRUPTS) perf_event_unthrottle_group(event, true); perf_pmu_enable(event->pmu); } } static int perf_event_check_period(struct perf_event *event, u64 value) { return event->pmu->check_period(event, value); } static int _perf_event_period(struct perf_event *event, u64 value) { if (!is_sampling_event(event)) return -EINVAL; if (!value) return -EINVAL; if (event->attr.freq) { if (value > sysctl_perf_event_sample_rate) return -EINVAL; } else { if (perf_event_check_period(event, value)) return -EINVAL; if (value & (1ULL << 63)) return -EINVAL; } event_function_call(event, __perf_event_period, &value); return 0; } int perf_event_period(struct perf_event *event, u64 value) { struct perf_event_context *ctx; int ret; ctx = perf_event_ctx_lock(event); ret = _perf_event_period(event, value); perf_event_ctx_unlock(event, ctx); return ret; } EXPORT_SYMBOL_GPL(perf_event_period); static const struct file_operations perf_fops; static inline bool is_perf_file(struct fd f) { return !fd_empty(f) && fd_file(f)->f_op == &perf_fops; } static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); static int perf_copy_attr(struct perf_event_attr __user *uattr, struct perf_event_attr *attr); static int __perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie); static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { void (*func)(struct perf_event *); u32 flags = arg; if (event->state <= PERF_EVENT_STATE_REVOKED) return -ENODEV; switch (cmd) { case PERF_EVENT_IOC_ENABLE: func = _perf_event_enable; break; case PERF_EVENT_IOC_DISABLE: func = _perf_event_disable; break; case PERF_EVENT_IOC_RESET: func = _perf_event_reset; break; case PERF_EVENT_IOC_REFRESH: return _perf_event_refresh(event, arg); case PERF_EVENT_IOC_PERIOD: { u64 value; if (copy_from_user(&value, (u64 __user *)arg, sizeof(value))) return -EFAULT; return _perf_event_period(event, value); } case PERF_EVENT_IOC_ID: 
{ u64 id = primary_event_id(event); if (copy_to_user((void __user *)arg, &id, sizeof(id))) return -EFAULT; return 0; } case PERF_EVENT_IOC_SET_OUTPUT: { CLASS(fd, output)(arg); // arg == -1 => empty struct perf_event *output_event = NULL; if (arg != -1) { if (!is_perf_file(output)) return -EBADF; output_event = fd_file(output)->private_data; } return perf_event_set_output(event, output_event); } case PERF_EVENT_IOC_SET_FILTER: return perf_event_set_filter(event, (void __user *)arg); case PERF_EVENT_IOC_SET_BPF: { struct bpf_prog *prog; int err; prog = bpf_prog_get(arg); if (IS_ERR(prog)) return PTR_ERR(prog); err = __perf_event_set_bpf_prog(event, prog, 0); if (err) { bpf_prog_put(prog); return err; } return 0; } case PERF_EVENT_IOC_PAUSE_OUTPUT: { struct perf_buffer *rb; rcu_read_lock(); rb = rcu_dereference(event->rb); if (!rb || !rb->nr_pages) { rcu_read_unlock(); return -EINVAL; } rb_toggle_paused(rb, !!arg); rcu_read_unlock(); return 0; } case PERF_EVENT_IOC_QUERY_BPF: return perf_event_query_prog_array(event, (void __user *)arg); case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: { struct perf_event_attr new_attr; int err = perf_copy_attr((struct perf_event_attr __user *)arg, &new_attr); if (err) return err; return perf_event_modify_attr(event, &new_attr); } default: return -ENOTTY; } if (flags & PERF_IOC_FLAG_GROUP) perf_event_for_each(event, func); else perf_event_for_each_child(event, func); return 0; } static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct perf_event *event = file->private_data; struct perf_event_context *ctx; long ret; /* Treat ioctl like writes as it is likely a mutating operation. */ ret = security_perf_event_write(event); if (ret) return ret; ctx = perf_event_ctx_lock(event); ret = _perf_ioctl(event, cmd, arg); perf_event_ctx_unlock(event, ctx); return ret; } #ifdef CONFIG_COMPAT static long perf_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { switch (_IOC_NR(cmd)) { case _IOC_NR(PERF_EVENT_IOC_SET_FILTER): case _IOC_NR(PERF_EVENT_IOC_ID): case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF): case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES): /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case) */ if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) { cmd &= ~IOCSIZE_MASK; cmd |= sizeof(void *) << IOCSIZE_SHIFT; } break; } return perf_ioctl(file, cmd, arg); } #else # define perf_compat_ioctl NULL #endif int perf_event_task_enable(void) { struct perf_event_context *ctx; struct perf_event *event; mutex_lock(&current->perf_event_mutex); list_for_each_entry(event, &current->perf_event_list, owner_entry) { ctx = perf_event_ctx_lock(event); perf_event_for_each_child(event, _perf_event_enable); perf_event_ctx_unlock(event, ctx); } mutex_unlock(&current->perf_event_mutex); return 0; } int perf_event_task_disable(void) { struct perf_event_context *ctx; struct perf_event *event; mutex_lock(&current->perf_event_mutex); list_for_each_entry(event, &current->perf_event_list, owner_entry) { ctx = perf_event_ctx_lock(event); perf_event_for_each_child(event, _perf_event_disable); perf_event_ctx_unlock(event, ctx); } mutex_unlock(&current->perf_event_mutex); return 0; } static int perf_event_index(struct perf_event *event) { if (event->hw.state & PERF_HES_STOPPED) return 0; if (event->state != PERF_EVENT_STATE_ACTIVE) return 0; return event->pmu->event_idx(event); } static void perf_event_init_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; struct perf_buffer *rb; rcu_read_lock(); rb = rcu_dereference(event->rb); if (!rb) goto unlock; userpg = rb->user_page; /*
Allow new userspace to detect that bit 0 is deprecated */ userpg->cap_bit0_is_deprecated = 1; userpg->size = offsetof(struct perf_event_mmap_page, __reserved); userpg->data_offset = PAGE_SIZE; userpg->data_size = perf_data_size(rb); unlock: rcu_read_unlock(); } void __weak arch_perf_update_userpage( struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) { } /* * Callers need to ensure there can be no nesting of this function, otherwise * the seqlock logic goes bad. We can not serialize this because the arch * code calls this from NMI context. */ void perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; struct perf_buffer *rb; u64 enabled, running, now; rcu_read_lock(); rb = rcu_dereference(event->rb); if (!rb) goto unlock; /* * Disable preemption to guarantee consistent time stamps are stored to * the user page. */ preempt_disable(); /* * Compute total_time_enabled, total_time_running based on snapshot * values taken when the event was last scheduled in. * * We cannot simply call update_context_time() because doing so would * lead to deadlock when called from NMI context. */ calc_timer_values(event, &now, &enabled, &running); userpg = rb->user_page; ++userpg->lock; barrier(); userpg->index = perf_event_index(event); userpg->offset = perf_event_count(event, false); if (userpg->index) userpg->offset -= local64_read(&event->hw.prev_count); userpg->time_enabled = enabled + atomic64_read(&event->child_total_time_enabled); userpg->time_running = running + atomic64_read(&event->child_total_time_running); arch_perf_update_userpage(event, userpg, now); barrier(); ++userpg->lock; preempt_enable(); unlock: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(perf_event_update_userpage); static void ring_buffer_attach(struct perf_event *event, struct perf_buffer *rb) { struct perf_buffer *old_rb = NULL; unsigned long flags; WARN_ON_ONCE(event->parent); if (event->rb) { /* * Should be impossible, we set this when removing * event->rb_entry and wait/clear when adding event->rb_entry. */ WARN_ON_ONCE(event->rcu_pending); old_rb = event->rb; spin_lock_irqsave(&old_rb->event_lock, flags); list_del_rcu(&event->rb_entry); spin_unlock_irqrestore(&old_rb->event_lock, flags); event->rcu_batches = get_state_synchronize_rcu(); event->rcu_pending = 1; } if (rb) { if (event->rcu_pending) { cond_synchronize_rcu(event->rcu_batches); event->rcu_pending = 0; } spin_lock_irqsave(&rb->event_lock, flags); list_add_rcu(&event->rb_entry, &rb->event_list); spin_unlock_irqrestore(&rb->event_lock, flags); } /* * Avoid racing with perf_mmap_close(AUX): stop the event * before swizzling the event::rb pointer; if it's getting * unmapped, its aux_mmap_count will be 0 and it won't * restart. See the comment in __perf_pmu_output_stop(). * * Data will inevitably be lost when set_output is done in * mid-air, but then again, whoever does it like this is * not in for the data anyway. */ if (has_aux(event)) perf_event_stop(event, 0); rcu_assign_pointer(event->rb, rb); if (old_rb) { ring_buffer_put(old_rb); /* * Since we detached before setting the new rb, so that we * could attach the new rb, we could have missed a wakeup. * Provide it now. 
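 *
 * That is, a ring_buffer_wakeup() issued against the old buffer after
 * the list_del_rcu() above would no longer find this event on that
 * buffer's event_list; the unconditional wakeup here covers that
 * window.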
*/ wake_up_all(&event->waitq); } } static void ring_buffer_wakeup(struct perf_event *event) { struct perf_buffer *rb; if (event->parent) event = event->parent; rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { list_for_each_entry_rcu(event, &rb->event_list, rb_entry) wake_up_all(&event->waitq); } rcu_read_unlock(); } struct perf_buffer *ring_buffer_get(struct perf_event *event) { struct perf_buffer *rb; if (event->parent) event = event->parent; rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { if (!refcount_inc_not_zero(&rb->refcount)) rb = NULL; } rcu_read_unlock(); return rb; } void ring_buffer_put(struct perf_buffer *rb) { if (!refcount_dec_and_test(&rb->refcount)) return; WARN_ON_ONCE(!list_empty(&rb->event_list)); call_rcu(&rb->rcu_head, rb_free_rcu); } typedef void (*mapped_f)(struct perf_event *event, struct mm_struct *mm); #define get_mapped(event, func) \ ({ struct pmu *pmu; \ mapped_f f = NULL; \ guard(rcu)(); \ pmu = READ_ONCE(event->pmu); \ if (pmu) \ f = pmu->func; \ f; \ }) static void perf_mmap_open(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; mapped_f mapped = get_mapped(event, event_mapped); refcount_inc(&event->mmap_count); refcount_inc(&event->rb->mmap_count); if (vma->vm_pgoff) refcount_inc(&event->rb->aux_mmap_count); if (mapped) mapped(event, vma->vm_mm); } static void perf_pmu_output_stop(struct perf_event *event); /* * A buffer can be mmap()ed multiple times; either directly through the same * event, or through other events by use of perf_event_set_output(). * * In order to undo the VM accounting done by perf_mmap() we need to destroy * the buffer here, where we still have a VM context. This means we need * to detach all events redirecting to us. */ static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; mapped_f unmapped = get_mapped(event, event_unmapped); struct perf_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; int mmap_locked = rb->mmap_locked; unsigned long size = perf_data_size(rb); bool detach_rest = false; /* FIXIES vs perf_pmu_unregister() */ if (unmapped) unmapped(event, vma->vm_mm); /* * The AUX buffer is strictly a sub-buffer, serialize using aux_mutex * to avoid complications. */ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && refcount_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) { /* * Stop all AUX events that are writing to this buffer, * so that we can free its AUX pages and corresponding PMU * data. Note that after rb::aux_mmap_count dropped to zero, * they won't start any more (see perf_aux_output_begin()). */ perf_pmu_output_stop(event); /* now it's safe to free the pages */ atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm); atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); /* this has to be the last one */ rb_free_aux(rb); WARN_ON_ONCE(refcount_read(&rb->aux_refcount)); mutex_unlock(&rb->aux_mutex); } if (refcount_dec_and_test(&rb->mmap_count)) detach_rest = true; if (!refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) goto out_put; ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); /* If there's still other mmap()s of this buffer, we're done. */ if (!detach_rest) goto out_put; /* * No other mmap()s, detach from all other events that might redirect * into the now unreachable buffer. Somewhat complicated by the * fact that rb::event_lock otherwise nests inside mmap_mutex. 
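 *
 * Concretely, for each event still on rb->event_list the loop below:
 *
 *  1) takes a reference so the event cannot be freed under us,
 *  2) drops the RCU read lock and takes that event's mmap_mutex,
 *  3) re-checks event->rb == rb before detaching, since the rb may
 *     have been swizzled while we waited for the mutex,
 *  4) restarts the whole iteration from scratch.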
*/ again: rcu_read_lock(); list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { if (!atomic_long_inc_not_zero(&event->refcount)) { /* * This event is en-route to free_event() which will * detach it and remove it from the list. */ continue; } rcu_read_unlock(); mutex_lock(&event->mmap_mutex); /* * Check we didn't race with perf_event_set_output() which can * swizzle the rb from under us while we were waiting to * acquire mmap_mutex. * * If we find a different rb; ignore this event, a next * iteration will no longer find it on the list. We have to * still restart the iteration to make sure we're not now * iterating the wrong list. */ if (event->rb == rb) ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); put_event(event); /* * Restart the iteration; either we're on the wrong list or * destroyed its integrity by doing a deletion. */ goto again; } rcu_read_unlock(); /* * It could be there's still a few 0-ref events on the list; they'll * get cleaned up by free_event() -- they'll also still have their * ref on the rb and will free it whenever they are done with it. * * Aside from that, this buffer is 'fully' detached and unmapped, * undo the VM accounting. */ atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked, &mmap_user->locked_vm); atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm); free_uid(mmap_user); out_put: ring_buffer_put(rb); /* could be last */ } static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf) { /* The first page is the user control page, others are read-only. */ return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS; } static int perf_mmap_may_split(struct vm_area_struct *vma, unsigned long addr) { /* * Forbid splitting perf mappings to prevent refcount leaks due to * the resulting non-matching offsets and sizes. See open()/close(). */ return -EINVAL; } static const struct vm_operations_struct perf_mmap_vmops = { .open = perf_mmap_open, .close = perf_mmap_close, /* non mergeable */ .pfn_mkwrite = perf_mmap_pfn_mkwrite, .may_split = perf_mmap_may_split, }; static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma) { unsigned long nr_pages = vma_pages(vma); int err = 0; unsigned long pagenum; /* * We map this as a VM_PFNMAP VMA. * * This is not ideal as this is designed broadly for mappings of PFNs * referencing memory-mapped I/O ranges or non-system RAM i.e. for which * !pfn_valid(pfn). * * We are mapping kernel-allocated memory (memory we manage ourselves) * which would more ideally be mapped using vm_insert_page() or a * similar mechanism, that is as a VM_MIXEDMAP mapping. * * However this won't work here, because: * * 1. It uses vma->vm_page_prot, but this field has not been completely * setup at the point of the f_op->mmp() hook, so we are unable to * indicate that this should be mapped CoW in order that the * mkwrite() hook can be invoked to make the first page R/W and the * rest R/O as desired. * * 2. Anything other than a VM_PFNMAP of valid PFNs will result in * vm_normal_page() returning a struct page * pointer, which means * vm_ops->page_mkwrite() will be invoked rather than * vm_ops->pfn_mkwrite(), and this means we have to set page->mapping * to work around retry logic in the fault handler, however this * field is no longer allowed to be used within struct page. * * 3. Having a struct page * made available in the fault logic also * means that the page gets put on the rmap and becomes * inappropriately accessible and subject to map and ref counting. 
* * Ideally we would have a mechanism that could explicitly express our * desires, but this is not currently the case, so we instead use * VM_PFNMAP. * * We manage the lifetime of these mappings with internal refcounts (see * perf_mmap_open() and perf_mmap_close()) so we ensure the lifetime of * this mapping is maintained correctly. */ for (pagenum = 0; pagenum < nr_pages; pagenum++) { unsigned long va = vma->vm_start + PAGE_SIZE * pagenum; struct page *page = perf_mmap_to_page(rb, vma->vm_pgoff + pagenum); if (page == NULL) { err = -EINVAL; break; } /* Map readonly, perf_mmap_pfn_mkwrite() called on write fault. */ err = remap_pfn_range(vma, va, page_to_pfn(page), PAGE_SIZE, vm_get_page_prot(vma->vm_flags & ~VM_SHARED)); if (err) break; } #ifdef CONFIG_MMU /* Clear any partial mappings on error. */ if (err) zap_vma_range(vma, vma->vm_start, nr_pages * PAGE_SIZE); #endif return err; } static bool perf_mmap_calc_limits(struct vm_area_struct *vma, long *user_extra, long *extra) { unsigned long user_locked, user_lock_limit, locked, lock_limit; struct user_struct *user = current_user(); user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); /* Increase the limit linearly with more CPUs */ user_lock_limit *= num_online_cpus(); user_locked = atomic_long_read(&user->locked_vm); /* * sysctl_perf_event_mlock may have changed, so that * user->locked_vm > user_lock_limit */ if (user_locked > user_lock_limit) user_locked = user_lock_limit; user_locked += *user_extra; if (user_locked > user_lock_limit) { /* * charge locked_vm until it hits user_lock_limit; * charge the rest from pinned_vm */ *extra = user_locked - user_lock_limit; *user_extra -= *extra; } lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; locked = atomic64_read(&vma->vm_mm->pinned_vm) + *extra; return locked <= lock_limit || !perf_is_paranoid() || capable(CAP_IPC_LOCK); } static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long extra) { struct user_struct *user = current_user(); atomic_long_add(user_extra, &user->locked_vm); atomic64_add(extra, &vma->vm_mm->pinned_vm); } static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, unsigned long nr_pages) { long extra = 0, user_extra = nr_pages; struct perf_buffer *rb; int rb_flags = 0; nr_pages -= 1; /* * If we have rb pages ensure they're a power-of-two number, so we * can do bitmasks instead of modulo. */ if (nr_pages != 0 && !is_power_of_2(nr_pages)) return -EINVAL; WARN_ON_ONCE(event->ctx->parent_ctx); if (event->rb) { if (data_page_nr(event->rb) != nr_pages) return -EINVAL; /* * If this event doesn't have mmap_count, we're attempting to * create an alias of another event's mmap(); this would mean * both events will end up scribbling the same user_page; * which makes no sense. */ if (!refcount_read(&event->mmap_count)) return -EBUSY; if (refcount_inc_not_zero(&event->rb->mmap_count)) { /* * Success -- managed to mmap() the same buffer * multiple times. */ perf_mmap_account(vma, user_extra, extra); refcount_inc(&event->mmap_count); return 0; } /* * Raced against perf_mmap_close()'s * refcount_dec_and_mutex_lock() remove the * event and continue as if !event->rb */ ring_buffer_attach(event, NULL); } if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) return -EPERM; if (vma->vm_flags & VM_WRITE) rb_flags |= RING_BUFFER_WRITABLE; rb = rb_alloc(nr_pages, event->attr.watermark ? 
event->attr.wakeup_watermark : 0, event->cpu, rb_flags); if (!rb) return -ENOMEM; refcount_set(&rb->mmap_count, 1); rb->mmap_user = get_current_user(); rb->mmap_locked = extra; ring_buffer_attach(event, rb); perf_event_update_time(event); perf_event_init_userpage(event); perf_event_update_userpage(event); perf_mmap_account(vma, user_extra, extra); refcount_set(&event->mmap_count, 1); return 0; } static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event, unsigned long nr_pages) { long extra = 0, user_extra = nr_pages; u64 aux_offset, aux_size; struct perf_buffer *rb; int ret, rb_flags = 0; rb = event->rb; if (!rb) return -EINVAL; guard(mutex)(&rb->aux_mutex); /* * AUX area mapping: if rb->aux_nr_pages != 0, it's already * mapped, all subsequent mappings should have the same size * and offset. Must be above the normal perf buffer. */ aux_offset = READ_ONCE(rb->user_page->aux_offset); aux_size = READ_ONCE(rb->user_page->aux_size); if (aux_offset < perf_data_size(rb) + PAGE_SIZE) return -EINVAL; if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) return -EINVAL; /* already mapped with a different offset */ if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) return -EINVAL; if (aux_size != nr_pages * PAGE_SIZE) return -EINVAL; /* already mapped with a different size */ if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) return -EINVAL; if (!is_power_of_2(nr_pages)) return -EINVAL; if (!refcount_inc_not_zero(&rb->mmap_count)) return -EINVAL; if (rb_has_aux(rb)) { refcount_inc(&rb->aux_mmap_count); } else { if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { refcount_dec(&rb->mmap_count); return -EPERM; } WARN_ON(!rb && event->rb); if (vma->vm_flags & VM_WRITE) rb_flags |= RING_BUFFER_WRITABLE; ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, event->attr.aux_watermark, rb_flags); if (ret) { refcount_dec(&rb->mmap_count); return ret; } refcount_set(&rb->aux_mmap_count, 1); rb->aux_mmap_locked = extra; } perf_mmap_account(vma, user_extra, extra); refcount_inc(&event->mmap_count); return 0; } static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_event *event = file->private_data; unsigned long vma_size, nr_pages; mapped_f mapped; int ret; /* * Don't allow mmap() of inherited per-task counters. This would * create a performance issue due to all children writing to the * same rb. */ if (event->cpu == -1 && event->attr.inherit) return -EINVAL; if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; ret = security_perf_event_read(event); if (ret) return ret; vma_size = vma->vm_end - vma->vm_start; nr_pages = vma_size / PAGE_SIZE; if (nr_pages > INT_MAX) return -ENOMEM; if (vma_size != PAGE_SIZE * nr_pages) return -EINVAL; scoped_guard (mutex, &event->mmap_mutex) { /* * This relies on __pmu_detach_event() taking mmap_mutex after marking * the event REVOKED. Either we observe the state, or __pmu_detach_event() * will detach the rb created here. */ if (event->state <= PERF_EVENT_STATE_REVOKED) return -ENODEV; if (vma->vm_pgoff == 0) ret = perf_mmap_rb(vma, event, nr_pages); else ret = perf_mmap_aux(vma, event, nr_pages); if (ret) return ret; /* * Since pinned accounting is per vm we cannot allow fork() to copy our * vma. */ vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &perf_mmap_vmops; mapped = get_mapped(event, event_mapped); if (mapped) mapped(event, vma->vm_mm); /* * Try to map it into the page table. 
On fail, invoke * perf_mmap_close() to undo the above, as the callsite expects * full cleanup in this case and therefore does not invoke * vmops::close(). */ ret = map_range(event->rb, vma); if (ret) perf_mmap_close(vma); } return ret; } static int perf_fasync(int fd, struct file *filp, int on) { struct inode *inode = file_inode(filp); struct perf_event *event = filp->private_data; int retval; if (event->state <= PERF_EVENT_STATE_REVOKED) return -ENODEV; inode_lock(inode); retval = fasync_helper(fd, filp, on, &event->fasync); inode_unlock(inode); if (retval < 0) return retval; return 0; } static const struct file_operations perf_fops = { .release = perf_release, .read = perf_read, .poll = perf_poll, .unlocked_ioctl = perf_ioctl, .compat_ioctl = perf_compat_ioctl, .mmap = perf_mmap, .fasync = perf_fasync, }; /* * Perf event wakeup * * If there's data, ensure we set the poll() state and publish everything * to user-space before waking everybody up. */ void perf_event_wakeup(struct perf_event *event) { ring_buffer_wakeup(event); if (event->pending_kill) { kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill); event->pending_kill = 0; } } static void perf_sigtrap(struct perf_event *event) { /* * Both perf_pending_task() and perf_pending_irq() can race with the * task exiting. */ if (current->flags & PF_EXITING) return; /* * We'd expect this to only occur if the irq_work is delayed and either * ctx->task or current has changed in the meantime. This can be the * case on architectures that do not implement arch_irq_work_raise(). */ if (WARN_ON_ONCE(event->ctx->task != current)) return; send_sig_perf((void __user *)event->pending_addr, event->orig_type, event->attr.sig_data); } /* * Deliver the pending work in-event-context or follow the context. */ static void __perf_pending_disable(struct perf_event *event) { int cpu = READ_ONCE(event->oncpu); /* * If the event isn't running; we done. event_sched_out() will have * taken care of things. */ if (cpu < 0) return; /* * Yay, we hit home and are in the context of the event. */ if (cpu == smp_processor_id()) { if (event->pending_disable) { event->pending_disable = 0; perf_event_disable_local(event); } return; } /* * CPU-A CPU-B * * perf_event_disable_inatomic() * @pending_disable = 1; * irq_work_queue(); * * sched-out * @pending_disable = 0; * * sched-in * perf_event_disable_inatomic() * @pending_disable = 1; * irq_work_queue(); // FAILS * * irq_work_run() * perf_pending_disable() * * But the event runs on CPU-B and wants disabling there. */ irq_work_queue_on(&event->pending_disable_irq, cpu); } static void perf_pending_disable(struct irq_work *entry) { struct perf_event *event = container_of(entry, struct perf_event, pending_disable_irq); int rctx; /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. */ rctx = perf_swevent_get_recursion_context(); __perf_pending_disable(event); if (rctx >= 0) perf_swevent_put_recursion_context(rctx); } static void perf_pending_irq(struct irq_work *entry) { struct perf_event *event = container_of(entry, struct perf_event, pending_irq); int rctx; /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. */ rctx = perf_swevent_get_recursion_context(); /* * The wakeup isn't bound to the context of the event -- it can happen * irrespective of where the event is. 
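 *
 * Contrast with perf_pending_disable() above, which does have to
 * chase the event to the CPU it is active on via irq_work_queue_on().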
*/ if (event->pending_wakeup) { event->pending_wakeup = 0; perf_event_wakeup(event); } if (rctx >= 0) perf_swevent_put_recursion_context(rctx); } static void perf_pending_task(struct callback_head *head) { struct perf_event *event = container_of(head, struct perf_event, pending_task); int rctx; /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. */ rctx = perf_swevent_get_recursion_context(); if (event->pending_work) { event->pending_work = 0; perf_sigtrap(event); local_dec(&event->ctx->nr_no_switch_fast); } put_event(event); if (rctx >= 0) perf_swevent_put_recursion_context(rctx); } #ifdef CONFIG_GUEST_PERF_EVENTS struct perf_guest_info_callbacks __rcu *perf_guest_cbs; DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state); DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip); DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); DEFINE_STATIC_CALL_RET0(__perf_guest_handle_mediated_pmi, *perf_guest_cbs->handle_mediated_pmi); void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs))) return; rcu_assign_pointer(perf_guest_cbs, cbs); static_call_update(__perf_guest_state, cbs->state); static_call_update(__perf_guest_get_ip, cbs->get_ip); /* Implementing ->handle_intel_pt_intr is optional. */ if (cbs->handle_intel_pt_intr) static_call_update(__perf_guest_handle_intel_pt_intr, cbs->handle_intel_pt_intr); if (cbs->handle_mediated_pmi) static_call_update(__perf_guest_handle_mediated_pmi, cbs->handle_mediated_pmi); } EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs)) return; rcu_assign_pointer(perf_guest_cbs, NULL); static_call_update(__perf_guest_state, (void *)&__static_call_return0); static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0); static_call_update(__perf_guest_handle_intel_pt_intr, (void *)&__static_call_return0); static_call_update(__perf_guest_handle_mediated_pmi, (void *)&__static_call_return0); synchronize_rcu(); } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); #endif static bool should_sample_guest(struct perf_event *event) { return !event->attr.exclude_guest && perf_guest_state(); } unsigned long perf_misc_flags(struct perf_event *event, struct pt_regs *regs) { if (should_sample_guest(event)) return perf_arch_guest_misc_flags(regs); return perf_arch_misc_flags(regs); } unsigned long perf_instruction_pointer(struct perf_event *event, struct pt_regs *regs) { if (should_sample_guest(event)) return perf_guest_get_ip(); return perf_arch_instruction_pointer(regs); } static void perf_output_sample_regs(struct perf_output_handle *handle, struct pt_regs *regs, u64 mask) { int bit; DECLARE_BITMAP(_mask, 64); bitmap_from_u64(_mask, mask); for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) { u64 val; val = perf_reg_value(regs, bit); perf_output_put(handle, val); } } static void perf_sample_regs_user(struct perf_regs *regs_user, struct pt_regs *regs) { if (user_mode(regs)) { regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; } else if (is_user_task(current)) { perf_get_regs_user(regs_user, regs); } else { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; regs_user->regs = NULL; } } static void perf_sample_regs_intr(struct perf_regs *regs_intr, struct pt_regs *regs) { regs_intr->regs = regs; 
regs_intr->abi = perf_reg_abi(current); } /* * Get remaining task size from user stack pointer. * * It'd be better to take stack vma map and limit this more * precisely, but there's no way to get it safely under interrupt, * so using TASK_SIZE as limit. */ static u64 perf_ustack_task_size(struct pt_regs *regs) { unsigned long addr = perf_user_stack_pointer(regs); if (!addr || addr >= TASK_SIZE) return 0; return TASK_SIZE - addr; } static u16 perf_sample_ustack_size(u16 stack_size, u16 header_size, struct pt_regs *regs) { u64 task_size; /* No regs, no stack pointer, no dump. */ if (!regs) return 0; /* No mm, no stack, no dump. */ if (!current->mm) return 0; /* * Check if we fit in with the requested stack size into the: * - TASK_SIZE * If we don't, we limit the size to the TASK_SIZE. * * - remaining sample size * If we don't, we customize the stack size to * fit in to the remaining sample size. */ task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs)); stack_size = min(stack_size, (u16) task_size); /* Current header size plus static size and dynamic size. */ header_size += 2 * sizeof(u64); /* Do we fit in with the current stack dump size? */ if ((u16) (header_size + stack_size) < header_size) { /* * If we overflow the maximum size for the sample, * we customize the stack dump size to fit in. */ stack_size = USHRT_MAX - header_size - sizeof(u64); stack_size = round_up(stack_size, sizeof(u64)); } return stack_size; } static void perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, struct pt_regs *regs) { /* Case of a kernel thread, nothing to dump */ if (!regs) { u64 size = 0; perf_output_put(handle, size); } else { unsigned long sp; unsigned int rem; u64 dyn_size; /* * We dump: * static size * - the size requested by user or the best one we can fit * in to the sample max size * data * - user stack dump data * dynamic size * - the actual dumped size */ /* Static size. */ perf_output_put(handle, dump_size); /* Data. */ sp = perf_user_stack_pointer(regs); rem = __output_copy_user(handle, (void *) sp, dump_size); dyn_size = dump_size - rem; perf_output_skip(handle, rem); /* Dynamic size. */ perf_output_put(handle, dyn_size); } } static unsigned long perf_prepare_sample_aux(struct perf_event *event, struct perf_sample_data *data, size_t size) { struct perf_event *sampler = event->aux_event; struct perf_buffer *rb; data->aux_size = 0; if (!sampler) goto out; if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE)) goto out; if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id())) goto out; rb = ring_buffer_get(sampler); if (!rb) goto out; /* * If this is an NMI hit inside sampling code, don't take * the sample. See also perf_aux_sample_output(). */ if (READ_ONCE(rb->aux_in_sampling)) { data->aux_size = 0; } else { size = min_t(size_t, size, perf_aux_size(rb)); data->aux_size = ALIGN(size, sizeof(u64)); } ring_buffer_put(rb); out: return data->aux_size; } static long perf_pmu_snapshot_aux(struct perf_buffer *rb, struct perf_event *event, struct perf_output_handle *handle, unsigned long size) { unsigned long flags; long ret; /* * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler * paths. If we start calling them in NMI context, they may race with * the IRQ ones, that is, for example, re-starting an event that's just * been stopped, which is why we're using a separate callback that * doesn't change the event state. * * IRQs need to be disabled to prevent IPIs from racing with us. 
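 *
 * The rb->aux_in_sampling flag set around the ->snapshot_aux() call
 * below additionally makes perf_prepare_sample_aux() refuse AUX
 * sampling for any NMI that hits while the snapshot is in progress.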
*/ local_irq_save(flags); /* * Guard against NMI hits inside the critical section; * see also perf_prepare_sample_aux(). */ WRITE_ONCE(rb->aux_in_sampling, 1); barrier(); ret = event->pmu->snapshot_aux(event, handle, size); barrier(); WRITE_ONCE(rb->aux_in_sampling, 0); local_irq_restore(flags); return ret; } static void perf_aux_sample_output(struct perf_event *event, struct perf_output_handle *handle, struct perf_sample_data *data) { struct perf_event *sampler = event->aux_event; struct perf_buffer *rb; unsigned long pad; long size; if (WARN_ON_ONCE(!sampler || !data->aux_size)) return; rb = ring_buffer_get(sampler); if (!rb) return; size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size); /* * An error here means that perf_output_copy() failed (returned a * non-zero surplus that it didn't copy), which in its current * enlightened implementation is not possible. If that changes, we'd * like to know. */ if (WARN_ON_ONCE(size < 0)) goto out_put; /* * The pad comes from ALIGN()ing data->aux_size up to u64 in * perf_prepare_sample_aux(), so should not be more than that. */ pad = data->aux_size - size; if (WARN_ON_ONCE(pad >= sizeof(u64))) pad = 8; if (pad) { u64 zero = 0; perf_output_copy(handle, &zero, pad); } out_put: ring_buffer_put(rb); } /* * A set of common sample data types saved even for non-sample records * when event->attr.sample_id_all is set. */ #define PERF_SAMPLE_ID_ALL (PERF_SAMPLE_TID | PERF_SAMPLE_TIME | \ PERF_SAMPLE_ID | PERF_SAMPLE_STREAM_ID | \ PERF_SAMPLE_CPU | PERF_SAMPLE_IDENTIFIER) static void __perf_event_header__init_id(struct perf_sample_data *data, struct perf_event *event, u64 sample_type) { data->type = event->attr.sample_type; data->sample_flags |= data->type & PERF_SAMPLE_ID_ALL; if (sample_type & PERF_SAMPLE_TID) { /* namespace issues */ data->tid_entry.pid = perf_event_pid(event, current); data->tid_entry.tid = perf_event_tid(event, current); } if (sample_type & PERF_SAMPLE_TIME) data->time = perf_event_clock(event); if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) data->id = primary_event_id(event); if (sample_type & PERF_SAMPLE_STREAM_ID) data->stream_id = event->id; if (sample_type & PERF_SAMPLE_CPU) { data->cpu_entry.cpu = raw_smp_processor_id(); data->cpu_entry.reserved = 0; } } void perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) { if (event->attr.sample_id_all) { header->size += event->id_header_size; __perf_event_header__init_id(data, event, event->attr.sample_type); } } static void __perf_event__output_id_sample(struct perf_output_handle *handle, struct perf_sample_data *data) { u64 sample_type = data->type; if (sample_type & PERF_SAMPLE_TID) perf_output_put(handle, data->tid_entry); if (sample_type & PERF_SAMPLE_TIME) perf_output_put(handle, data->time); if (sample_type & PERF_SAMPLE_ID) perf_output_put(handle, data->id); if (sample_type & PERF_SAMPLE_STREAM_ID) perf_output_put(handle, data->stream_id); if (sample_type & PERF_SAMPLE_CPU) perf_output_put(handle, data->cpu_entry); if (sample_type & PERF_SAMPLE_IDENTIFIER) perf_output_put(handle, data->id); } void perf_event__output_id_sample(struct perf_event *event, struct perf_output_handle *handle, struct perf_sample_data *sample) { if (event->attr.sample_id_all) __perf_event__output_id_sample(handle, sample); } static void perf_output_read_one(struct perf_output_handle *handle, struct perf_event *event, u64 enabled, u64 running) { u64 read_format = event->attr.read_format; u64 values[5]; int n = 0; 
values[n++] = perf_event_count(event, has_inherit_and_sample_read(&event->attr)); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { values[n++] = enabled + atomic64_read(&event->child_total_time_enabled); } if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { values[n++] = running + atomic64_read(&event->child_total_time_running); } if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&event->lost_samples); __output_copy(handle, values, n * sizeof(u64)); } static void perf_output_read_group(struct perf_output_handle *handle, struct perf_event *event, u64 enabled, u64 running) { struct perf_event *leader = event->group_leader, *sub; u64 read_format = event->attr.read_format; unsigned long flags; u64 values[6]; int n = 0; bool self = has_inherit_and_sample_read(&event->attr); /* * Disabling interrupts avoids all counter scheduling * (context switches, timer based rotation and IPIs). */ local_irq_save(flags); values[n++] = 1 + leader->nr_siblings; if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) values[n++] = enabled; if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; if ((leader != event) && !handle->skip_read) perf_pmu_read(leader); values[n++] = perf_event_count(leader, self); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&leader->lost_samples); __output_copy(handle, values, n * sizeof(u64)); for_each_sibling_event(sub, leader) { n = 0; if ((sub != event) && !handle->skip_read) perf_pmu_read(sub); values[n++] = perf_event_count(sub, self); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&sub->lost_samples); __output_copy(handle, values, n * sizeof(u64)); } local_irq_restore(flags); } #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ PERF_FORMAT_TOTAL_TIME_RUNNING) /* * XXX PERF_SAMPLE_READ vs inherited events seems difficult. * * The problem is that its both hard and excessively expensive to iterate the * child list, not to mention that its impossible to IPI the children running * on another CPU, from interrupt/NMI context. * * Instead the combination of PERF_SAMPLE_READ and inherit will track per-thread * counts rather than attempting to accumulate some value across all children on * all cores. */ static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) { u64 enabled = 0, running = 0, now; u64 read_format = event->attr.read_format; /* * Compute total_time_enabled, total_time_running based on snapshot * values taken when the event was last scheduled in. * * We cannot simply call update_context_time() because doing so would * lead to deadlock when called from NMI context. 
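 *
 * calc_timer_values() below derives both values from those snapshots
 * instead, which is safe to do from NMI context.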
*/ if (read_format & PERF_FORMAT_TOTAL_TIMES) calc_timer_values(event, &now, &enabled, &running); if (event->attr.read_format & PERF_FORMAT_GROUP) perf_output_read_group(handle, event, enabled, running); else perf_output_read_one(handle, event, enabled, running); } void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) { u64 sample_type = data->type; if (data->sample_flags & PERF_SAMPLE_READ) handle->skip_read = 1; perf_output_put(handle, *header); if (sample_type & PERF_SAMPLE_IDENTIFIER) perf_output_put(handle, data->id); if (sample_type & PERF_SAMPLE_IP) perf_output_put(handle, data->ip); if (sample_type & PERF_SAMPLE_TID) perf_output_put(handle, data->tid_entry); if (sample_type & PERF_SAMPLE_TIME) perf_output_put(handle, data->time); if (sample_type & PERF_SAMPLE_ADDR) perf_output_put(handle, data->addr); if (sample_type & PERF_SAMPLE_ID) perf_output_put(handle, data->id); if (sample_type & PERF_SAMPLE_STREAM_ID) perf_output_put(handle, data->stream_id); if (sample_type & PERF_SAMPLE_CPU) perf_output_put(handle, data->cpu_entry); if (sample_type & PERF_SAMPLE_PERIOD) perf_output_put(handle, data->period); if (sample_type & PERF_SAMPLE_READ) perf_output_read(handle, event); if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; size += data->callchain->nr; size *= sizeof(u64); __output_copy(handle, data->callchain, size); } if (sample_type & PERF_SAMPLE_RAW) { struct perf_raw_record *raw = data->raw; if (raw) { struct perf_raw_frag *frag = &raw->frag; perf_output_put(handle, raw->size); do { if (frag->copy) { __output_custom(handle, frag->copy, frag->data, frag->size); } else { __output_copy(handle, frag->data, frag->size); } if (perf_raw_frag_last(frag)) break; frag = frag->next; } while (1); if (frag->pad) __output_skip(handle, NULL, frag->pad); } else { struct { u32 size; u32 data; } raw = { .size = sizeof(u32), .data = 0, }; perf_output_put(handle, raw); } } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { if (data->br_stack) { size_t size; size = data->br_stack->nr * sizeof(struct perf_branch_entry); perf_output_put(handle, data->br_stack->nr); if (branch_sample_hw_index(event)) perf_output_put(handle, data->br_stack->hw_idx); perf_output_copy(handle, data->br_stack->entries, size); /* * Add the extension space which is appended * right after the struct perf_branch_stack. */ if (data->br_stack_cntr) { size = data->br_stack->nr * sizeof(u64); perf_output_copy(handle, data->br_stack_cntr, size); } } else { /* * we always store at least the value of nr */ u64 nr = 0; perf_output_put(handle, nr); } } if (sample_type & PERF_SAMPLE_REGS_USER) { u64 abi = data->regs_user.abi; /* * If there are no regs to dump, notice it through * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). */ perf_output_put(handle, abi); if (abi) { u64 mask = event->attr.sample_regs_user; perf_output_sample_regs(handle, data->regs_user.regs, mask); } } if (sample_type & PERF_SAMPLE_STACK_USER) { perf_output_sample_ustack(handle, data->stack_user_size, data->regs_user.regs); } if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) perf_output_put(handle, data->weight.full); if (sample_type & PERF_SAMPLE_DATA_SRC) perf_output_put(handle, data->data_src.val); if (sample_type & PERF_SAMPLE_TRANSACTION) perf_output_put(handle, data->txn); if (sample_type & PERF_SAMPLE_REGS_INTR) { u64 abi = data->regs_intr.abi; /* * If there are no regs to dump, notice it through * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). 
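 *
 * The record body emitted here is, roughly:
 *   u64 abi;
 *   u64 regs[hweight64(attr.sample_regs_intr)];	(only if abi != 0)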
*/ perf_output_put(handle, abi); if (abi) { u64 mask = event->attr.sample_regs_intr; perf_output_sample_regs(handle, data->regs_intr.regs, mask); } } if (sample_type & PERF_SAMPLE_PHYS_ADDR) perf_output_put(handle, data->phys_addr); if (sample_type & PERF_SAMPLE_CGROUP) perf_output_put(handle, data->cgroup); if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) perf_output_put(handle, data->data_page_size); if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) perf_output_put(handle, data->code_page_size); if (sample_type & PERF_SAMPLE_AUX) { perf_output_put(handle, data->aux_size); if (data->aux_size) perf_aux_sample_output(event, handle, data); } if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; if (wakeup_events) { struct perf_buffer *rb = handle->rb; int events = local_inc_return(&rb->events); if (events >= wakeup_events) { local_sub(wakeup_events, &rb->events); local_inc(&rb->wakeup); } } } } static u64 perf_virt_to_phys(u64 virt) { u64 phys_addr = 0; if (!virt) return 0; if (virt >= TASK_SIZE) { /* If it's vmalloc()d memory, leave phys_addr as 0 */ if (virt_addr_valid((void *)(uintptr_t)virt) && !(virt >= VMALLOC_START && virt < VMALLOC_END)) phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt); } else { /* * Walking the pages tables for user address. * Interrupts are disabled, so it prevents any tear down * of the page tables. * Try IRQ-safe get_user_page_fast_only first. * If failed, leave phys_addr as 0. */ if (is_user_task(current)) { struct page *p; pagefault_disable(); if (get_user_page_fast_only(virt, 0, &p)) { phys_addr = page_to_phys(p) + virt % PAGE_SIZE; put_page(p); } pagefault_enable(); } } return phys_addr; } /* * Return the pagetable size of a given virtual address. */ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr) { u64 size = 0; #ifdef CONFIG_HAVE_GUP_FAST pgd_t *pgdp, pgd; p4d_t *p4dp, p4d; pud_t *pudp, pud; pmd_t *pmdp, pmd; pte_t *ptep, pte; pgdp = pgd_offset(mm, addr); pgd = pgdp_get(pgdp); if (pgd_none(pgd)) return 0; if (pgd_leaf(pgd)) return pgd_leaf_size(pgd); p4dp = p4d_offset_lockless(pgdp, pgd, addr); p4d = p4dp_get(p4dp); if (!p4d_present(p4d)) return 0; if (p4d_leaf(p4d)) return p4d_leaf_size(p4d); pudp = pud_offset_lockless(p4dp, p4d, addr); pud = pudp_get(pudp); if (!pud_present(pud)) return 0; if (pud_leaf(pud)) return pud_leaf_size(pud); pmdp = pmd_offset_lockless(pudp, pud, addr); again: pmd = pmdp_get_lockless(pmdp); if (!pmd_present(pmd)) return 0; if (pmd_leaf(pmd)) return pmd_leaf_size(pmd); ptep = pte_offset_map(&pmd, addr); if (!ptep) goto again; pte = ptep_get_lockless(ptep); if (pte_present(pte)) size = __pte_leaf_size(pmd, pte); pte_unmap(ptep); #endif /* CONFIG_HAVE_GUP_FAST */ return size; } static u64 perf_get_page_size(unsigned long addr) { struct mm_struct *mm; unsigned long flags; u64 size; if (!addr) return 0; /* * Software page-table walkers must disable IRQs, * which prevents any tear down of the page tables. */ local_irq_save(flags); mm = current->mm; if (!mm) { /* * For kernel threads and the like, use init_mm so that * we can find kernel memory. 
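 * init_mm carries the kernel page tables, so kernel addresses can
 * still be resolved to a mapping size.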
*/ mm = &init_mm; } size = perf_get_pgtable_size(mm, addr); local_irq_restore(flags); return size; } static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; static struct unwind_work perf_unwind_work; struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { bool kernel = !event->attr.exclude_callchain_kernel; bool user = !event->attr.exclude_callchain_user && is_user_task(current); /* Disallow cross-task user callchains. */ bool crosstask = event->ctx->task && event->ctx->task != current; bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user && event->attr.defer_callchain; const u32 max_stack = event->attr.sample_max_stack; struct perf_callchain_entry *callchain; u64 defer_cookie; if (!current->mm) user = false; if (!kernel && !user) return &__empty_callchain; if (!(user && defer_user && !crosstask && unwind_deferred_request(&perf_unwind_work, &defer_cookie) >= 0)) defer_cookie = 0; callchain = get_perf_callchain(regs, kernel, user, max_stack, crosstask, true, defer_cookie); return callchain ?: &__empty_callchain; } static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d) { return d * !!(flags & s); } void perf_prepare_sample(struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs) { u64 sample_type = event->attr.sample_type; u64 filtered_sample_type; /* * Add the sample flags that are dependent to others. And clear the * sample flags that have already been done by the PMU driver. */ filtered_sample_type = sample_type; filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_CODE_PAGE_SIZE, PERF_SAMPLE_IP); filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_DATA_PAGE_SIZE | PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR); filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_STACK_USER, PERF_SAMPLE_REGS_USER); filtered_sample_type &= ~data->sample_flags; if (filtered_sample_type == 0) { /* Make sure it has the correct data->type for output */ data->type = event->attr.sample_type; return; } __perf_event_header__init_id(data, event, filtered_sample_type); if (filtered_sample_type & PERF_SAMPLE_IP) { data->ip = perf_instruction_pointer(event, regs); data->sample_flags |= PERF_SAMPLE_IP; } if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) perf_sample_save_callchain(data, event, regs); if (filtered_sample_type & PERF_SAMPLE_RAW) { data->raw = NULL; data->dyn_size += sizeof(u64); data->sample_flags |= PERF_SAMPLE_RAW; } if (filtered_sample_type & PERF_SAMPLE_BRANCH_STACK) { data->br_stack = NULL; data->dyn_size += sizeof(u64); data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; } if (filtered_sample_type & PERF_SAMPLE_REGS_USER) perf_sample_regs_user(&data->regs_user, regs); /* * It cannot use the filtered_sample_type here as REGS_USER can be set * by STACK_USER (using __cond_set() above) and we don't want to update * the dyn_size if it's not requested by users. */ if ((sample_type & ~data->sample_flags) & PERF_SAMPLE_REGS_USER) { /* regs dump ABI info */ int size = sizeof(u64); if (data->regs_user.regs) { u64 mask = event->attr.sample_regs_user; size += hweight64(mask) * sizeof(u64); } data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_REGS_USER; } if (filtered_sample_type & PERF_SAMPLE_STACK_USER) { /* * Either we need PERF_SAMPLE_STACK_USER bit to be always * processed as the last one or have additional check added * in case new sample type is added, because we could eat * up the rest of the sample size. 
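 *
 * The space accounted for here mirrors what perf_output_sample_ustack()
 * emits, roughly:
 *   u64  size;		(static size reserved here)
 *   char data[size];	(user stack dump)
 *   u64  dyn_size;	(bytes actually copied)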
*/ u16 stack_size = event->attr.sample_stack_user; u16 header_size = perf_sample_data_size(data, event); u16 size = sizeof(u64); stack_size = perf_sample_ustack_size(stack_size, header_size, data->regs_user.regs); /* * If there is something to dump, add space for the dump * itself and for the field that tells the dynamic size, * which is how many have been actually dumped. */ if (stack_size) size += sizeof(u64) + stack_size; data->stack_user_size = stack_size; data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_STACK_USER; } if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) { data->weight.full = 0; data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) { data->data_src.val = PERF_MEM_NA; data->sample_flags |= PERF_SAMPLE_DATA_SRC; } if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) { data->txn = 0; data->sample_flags |= PERF_SAMPLE_TRANSACTION; } if (filtered_sample_type & PERF_SAMPLE_ADDR) { data->addr = 0; data->sample_flags |= PERF_SAMPLE_ADDR; } if (filtered_sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); perf_sample_regs_intr(&data->regs_intr, regs); if (data->regs_intr.regs) { u64 mask = event->attr.sample_regs_intr; size += hweight64(mask) * sizeof(u64); } data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_REGS_INTR; } if (filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) { data->phys_addr = perf_virt_to_phys(data->addr); data->sample_flags |= PERF_SAMPLE_PHYS_ADDR; } #ifdef CONFIG_CGROUP_PERF if (filtered_sample_type & PERF_SAMPLE_CGROUP) { struct cgroup *cgrp; /* protected by RCU */ cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup; data->cgroup = cgroup_id(cgrp); data->sample_flags |= PERF_SAMPLE_CGROUP; } #endif /* * PERF_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user doesn't * require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr, * but the value will not dump to the userspace. */ if (filtered_sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) { data->data_page_size = perf_get_page_size(data->addr); data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE; } if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) { data->code_page_size = perf_get_page_size(data->ip); data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE; } if (filtered_sample_type & PERF_SAMPLE_AUX) { u64 size; u16 header_size = perf_sample_data_size(data, event); header_size += sizeof(u64); /* size */ /* * Given the 16bit nature of header::size, an AUX sample can * easily overflow it, what with all the preceding sample bits. * Make sure this doesn't happen by using up to U16_MAX bytes * per sample in total (rounded down to 8 byte boundary). */ size = min_t(size_t, U16_MAX - header_size, event->attr.aux_sample_size); size = rounddown(size, 8); size = perf_prepare_sample_aux(event, data, size); WARN_ON_ONCE(size + header_size > U16_MAX); data->dyn_size += size + sizeof(u64); /* size above */ data->sample_flags |= PERF_SAMPLE_AUX; } } void perf_prepare_header(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs) { header->type = PERF_RECORD_SAMPLE; header->size = perf_sample_data_size(data, event); header->misc = perf_misc_flags(event, regs); /* * If you're adding more sample types here, you likely need to do * something about the overflowing header::size, like repurpose the * lowest 3 bits of size, which should be always zero at the moment. 
* This raises a more important question, do we really need 512k sized * samples and why, so good argumentation is in order for whatever you * do here next. */ WARN_ON_ONCE(header->size & 7); } static void __perf_event_aux_pause(struct perf_event *event, bool pause) { if (pause) { if (!event->hw.aux_paused) { event->hw.aux_paused = 1; event->pmu->stop(event, PERF_EF_PAUSE); } } else { if (event->hw.aux_paused) { event->hw.aux_paused = 0; event->pmu->start(event, PERF_EF_RESUME); } } } static void perf_event_aux_pause(struct perf_event *event, bool pause) { struct perf_buffer *rb; if (WARN_ON_ONCE(!event)) return; rb = ring_buffer_get(event); if (!rb) return; scoped_guard (irqsave) { /* * Guard against self-recursion here. Another event could trip * this same from NMI context. */ if (READ_ONCE(rb->aux_in_pause_resume)) break; WRITE_ONCE(rb->aux_in_pause_resume, 1); barrier(); __perf_event_aux_pause(event, pause); barrier(); WRITE_ONCE(rb->aux_in_pause_resume, 0); } ring_buffer_put(rb); } static __always_inline int __perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs, int (*output_begin)(struct perf_output_handle *, struct perf_sample_data *, struct perf_event *, unsigned int)) { struct perf_output_handle handle; struct perf_event_header header; int err; /* protect the callchain buffers */ rcu_read_lock(); perf_prepare_sample(data, event, regs); perf_prepare_header(&header, data, event, regs); err = output_begin(&handle, data, event, header.size); if (err) goto exit; perf_output_sample(&handle, &header, data, event); perf_output_end(&handle); exit: rcu_read_unlock(); return err; } void perf_event_output_forward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { __perf_event_output(event, data, regs, perf_output_begin_forward); } void perf_event_output_backward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { __perf_event_output(event, data, regs, perf_output_begin_backward); } int perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { return __perf_event_output(event, data, regs, perf_output_begin); } /* * read event_id */ struct perf_read_event { struct perf_event_header header; u32 pid; u32 tid; }; static void perf_event_read_event(struct perf_event *event, struct task_struct *task) { struct perf_output_handle handle; struct perf_sample_data sample; struct perf_read_event read_event = { .header = { .type = PERF_RECORD_READ, .misc = 0, .size = sizeof(read_event) + event->read_size, }, .pid = perf_event_pid(event, task), .tid = perf_event_tid(event, task), }; int ret; perf_event_header__init_id(&read_event.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, read_event.header.size); if (ret) return; perf_output_put(&handle, read_event); perf_output_read(&handle, event); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } typedef void (perf_iterate_f)(struct perf_event *event, void *data); static void perf_iterate_ctx(struct perf_event_context *ctx, perf_iterate_f output, void *data, bool all) { struct perf_event *event; list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (!all) { if (event->state < PERF_EVENT_STATE_INACTIVE) continue; if (!event_filter_match(event)) continue; } output(event, data); } } static void perf_iterate_sb_cpu(perf_iterate_f output, void *data) { struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events); struct perf_event *event; 
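	/*
	 * Walk this CPU's list of events that asked for side-band records;
	 * events end up on pmu_sb_events via account_pmu_sb_event().
	 */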
list_for_each_entry_rcu(event, &pel->list, sb_list) { /* * Skip events that are not fully formed yet; ensure that * if we observe event->ctx, both event and ctx will be * complete enough. See perf_install_in_context(). */ if (!smp_load_acquire(&event->ctx)) continue; if (event->state < PERF_EVENT_STATE_INACTIVE) continue; if (!event_filter_match(event)) continue; output(event, data); } } /* * Iterate all events that need to receive side-band events. * * For new callers; ensure that account_pmu_sb_event() includes * your event, otherwise it might not get delivered. */ static void perf_iterate_sb(perf_iterate_f output, void *data, struct perf_event_context *task_ctx) { struct perf_event_context *ctx; rcu_read_lock(); preempt_disable(); /* * If we have task_ctx != NULL we only notify the task context itself. * The task_ctx is set only for EXIT events before releasing task * context. */ if (task_ctx) { perf_iterate_ctx(task_ctx, output, data, false); goto done; } perf_iterate_sb_cpu(output, data); ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_iterate_ctx(ctx, output, data, false); done: preempt_enable(); rcu_read_unlock(); } /* * Clear all file-based filters at exec, they'll have to be * re-instated when/if these objects are mmapped again. */ static void perf_event_addr_filters_exec(struct perf_event *event, void *data) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); struct perf_addr_filter *filter; unsigned int restart = 0, count = 0; unsigned long flags; if (!has_addr_filter(event)) return; raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { if (filter->path.dentry) { event->addr_filter_ranges[count].start = 0; event->addr_filter_ranges[count].size = 0; restart++; } count++; } if (restart) event->addr_filters_gen++; raw_spin_unlock_irqrestore(&ifh->lock, flags); if (restart) perf_event_stop(event, 1); } void perf_event_exec(void) { struct perf_event_context *ctx; ctx = perf_pin_task_context(current); if (!ctx) return; perf_event_enable_on_exec(ctx); perf_event_remove_on_exec(ctx); scoped_guard(rcu) perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); perf_unpin_context(ctx); put_ctx(ctx); } struct remote_output { struct perf_buffer *rb; int err; }; static void __perf_event_output_stop(struct perf_event *event, void *data) { struct perf_event *parent = event->parent; struct remote_output *ro = data; struct perf_buffer *rb = ro->rb; struct stop_event_data sd = { .event = event, }; if (!has_aux(event)) return; if (!parent) parent = event; /* * In case of inheritance, it will be the parent that links to the * ring-buffer, but it will be the child that's actually using it. * * We are using event::rb to determine if the event should be stopped, * however this may race with ring_buffer_attach() (through set_output), * which will make us skip the event that actually needs to be stopped. * So ring_buffer_attach() has to stop an aux event before re-assigning * its rb pointer. 
*/ if (rcu_dereference(parent->rb) == rb) ro->err = __perf_event_stop(&sd); } static int __perf_pmu_output_stop(void *info) { struct perf_event *event = info; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct remote_output ro = { .rb = event->rb, }; rcu_read_lock(); perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); if (cpuctx->task_ctx) perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop, &ro, false); rcu_read_unlock(); return ro.err; } static void perf_pmu_output_stop(struct perf_event *event) { struct perf_event *iter; int err, cpu; restart: rcu_read_lock(); list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) { /* * For per-CPU events, we need to make sure that neither they * nor their children are running; for cpu==-1 events it's * sufficient to stop the event itself if it's active, since * it can't have children. */ cpu = iter->cpu; if (cpu == -1) cpu = READ_ONCE(iter->oncpu); if (cpu == -1) continue; err = cpu_function_call(cpu, __perf_pmu_output_stop, event); if (err == -EAGAIN) { rcu_read_unlock(); goto restart; } } rcu_read_unlock(); } /* * task tracking -- fork/exit * * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task */ struct perf_task_event { struct task_struct *task; struct perf_event_context *task_ctx; struct { struct perf_event_header header; u32 pid; u32 ppid; u32 tid; u32 ptid; u64 time; } event_id; }; static int perf_event_task_match(struct perf_event *event) { return event->attr.comm || event->attr.mmap || event->attr.mmap2 || event->attr.mmap_data || event->attr.task; } static void perf_event_task_output(struct perf_event *event, void *data) { struct perf_task_event *task_event = data; struct perf_output_handle handle; struct perf_sample_data sample; struct task_struct *task = task_event->task; int ret, size = task_event->event_id.header.size; if (!perf_event_task_match(event)) return; perf_event_header__init_id(&task_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, task_event->event_id.header.size); if (ret) goto out; task_event->event_id.pid = perf_event_pid(event, task); task_event->event_id.tid = perf_event_tid(event, task); if (task_event->event_id.header.type == PERF_RECORD_EXIT) { task_event->event_id.ppid = perf_event_pid(event, task->real_parent); task_event->event_id.ptid = perf_event_pid(event, task->real_parent); } else { /* PERF_RECORD_FORK */ task_event->event_id.ppid = perf_event_pid(event, current); task_event->event_id.ptid = perf_event_tid(event, current); } task_event->event_id.time = perf_event_clock(event); perf_output_put(&handle, task_event->event_id); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: task_event->event_id.header.size = size; } static void perf_event_task(struct task_struct *task, struct perf_event_context *task_ctx, int new) { struct perf_task_event task_event; if (!atomic_read(&nr_comm_events) && !atomic_read(&nr_mmap_events) && !atomic_read(&nr_task_events)) return; task_event = (struct perf_task_event){ .task = task, .task_ctx = task_ctx, .event_id = { .header = { .type = new ? 
PERF_RECORD_FORK : PERF_RECORD_EXIT, .misc = 0, .size = sizeof(task_event.event_id), }, /* .pid */ /* .ppid */ /* .tid */ /* .ptid */ /* .time */ }, }; perf_iterate_sb(perf_event_task_output, &task_event, task_ctx); } /* * Allocate data for a new task when profiling system-wide * events which require PMU specific data */ static void perf_event_alloc_task_data(struct task_struct *child, struct task_struct *parent) { struct kmem_cache *ctx_cache = NULL; struct perf_ctx_data *cd; if (!refcount_read(&global_ctx_data_ref)) return; scoped_guard (rcu) { cd = rcu_dereference(parent->perf_ctx_data); if (cd) ctx_cache = cd->ctx_cache; } if (!ctx_cache) return; guard(percpu_read)(&global_ctx_data_rwsem); scoped_guard (rcu) { cd = rcu_dereference(child->perf_ctx_data); if (!cd) { /* * A system-wide event may be unaccount, * when attaching the perf_ctx_data. */ if (!refcount_read(&global_ctx_data_ref)) return; goto attach; } if (!cd->global) { cd->global = 1; refcount_inc(&cd->refcount); } } return; attach: attach_task_ctx_data(child, ctx_cache, true, GFP_KERNEL); } void perf_event_fork(struct task_struct *task) { perf_event_task(task, NULL, 1); perf_event_namespaces(task); perf_event_alloc_task_data(task, current); } /* * comm tracking */ struct perf_comm_event { struct task_struct *task; char *comm; int comm_size; struct { struct perf_event_header header; u32 pid; u32 tid; } event_id; }; static int perf_event_comm_match(struct perf_event *event) { return event->attr.comm; } static void perf_event_comm_output(struct perf_event *event, void *data) { struct perf_comm_event *comm_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int size = comm_event->event_id.header.size; int ret; if (!perf_event_comm_match(event)) return; perf_event_header__init_id(&comm_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, comm_event->event_id.header.size); if (ret) goto out; comm_event->event_id.pid = perf_event_pid(event, comm_event->task); comm_event->event_id.tid = perf_event_tid(event, comm_event->task); perf_output_put(&handle, comm_event->event_id); __output_copy(&handle, comm_event->comm, comm_event->comm_size); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: comm_event->event_id.header.size = size; } static void perf_event_comm_event(struct perf_comm_event *comm_event) { char comm[TASK_COMM_LEN]; unsigned int size; memset(comm, 0, sizeof(comm)); strscpy(comm, comm_event->task->comm); size = ALIGN(strlen(comm)+1, sizeof(u64)); comm_event->comm = comm; comm_event->comm_size = size; comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; perf_iterate_sb(perf_event_comm_output, comm_event, NULL); } void perf_event_comm(struct task_struct *task, bool exec) { struct perf_comm_event comm_event; if (!atomic_read(&nr_comm_events)) return; comm_event = (struct perf_comm_event){ .task = task, /* .comm */ /* .comm_size */ .event_id = { .header = { .type = PERF_RECORD_COMM, .misc = exec ? 
PERF_RECORD_MISC_COMM_EXEC : 0, /* .size */ }, /* .pid */ /* .tid */ }, }; perf_event_comm_event(&comm_event); } /* * namespaces tracking */ struct perf_namespaces_event { struct task_struct *task; struct { struct perf_event_header header; u32 pid; u32 tid; u64 nr_namespaces; struct perf_ns_link_info link_info[NR_NAMESPACES]; } event_id; }; static int perf_event_namespaces_match(struct perf_event *event) { return event->attr.namespaces; } static void perf_event_namespaces_output(struct perf_event *event, void *data) { struct perf_namespaces_event *namespaces_event = data; struct perf_output_handle handle; struct perf_sample_data sample; u16 header_size = namespaces_event->event_id.header.size; int ret; if (!perf_event_namespaces_match(event)) return; perf_event_header__init_id(&namespaces_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, namespaces_event->event_id.header.size); if (ret) goto out; namespaces_event->event_id.pid = perf_event_pid(event, namespaces_event->task); namespaces_event->event_id.tid = perf_event_tid(event, namespaces_event->task); perf_output_put(&handle, namespaces_event->event_id); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: namespaces_event->event_id.header.size = header_size; } static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, struct task_struct *task, const struct proc_ns_operations *ns_ops) { struct path ns_path; struct inode *ns_inode; int error; error = ns_get_path(&ns_path, task, ns_ops); if (!error) { ns_inode = ns_path.dentry->d_inode; ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev); ns_link_info->ino = ns_inode->i_ino; path_put(&ns_path); } } void perf_event_namespaces(struct task_struct *task) { struct perf_namespaces_event namespaces_event; struct perf_ns_link_info *ns_link_info; if (!atomic_read(&nr_namespaces_events)) return; namespaces_event = (struct perf_namespaces_event){ .task = task, .event_id = { .header = { .type = PERF_RECORD_NAMESPACES, .misc = 0, .size = sizeof(namespaces_event.event_id), }, /* .pid */ /* .tid */ .nr_namespaces = NR_NAMESPACES, /* .link_info[NR_NAMESPACES] */ }, }; ns_link_info = namespaces_event.event_id.link_info; perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX], task, &mntns_operations); #ifdef CONFIG_USER_NS perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX], task, &userns_operations); #endif #ifdef CONFIG_NET_NS perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX], task, &netns_operations); #endif #ifdef CONFIG_UTS_NS perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX], task, &utsns_operations); #endif #ifdef CONFIG_IPC_NS perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX], task, &ipcns_operations); #endif #ifdef CONFIG_PID_NS perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX], task, &pidns_operations); #endif #ifdef CONFIG_CGROUPS perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX], task, &cgroupns_operations); #endif perf_iterate_sb(perf_event_namespaces_output, &namespaces_event, NULL); } /* * cgroup tracking */ #ifdef CONFIG_CGROUP_PERF struct perf_cgroup_event { char *path; int path_size; struct { struct perf_event_header header; u64 id; char path[]; } event_id; }; static int perf_event_cgroup_match(struct perf_event *event) { return event->attr.cgroup; } static void perf_event_cgroup_output(struct perf_event *event, void *data) { struct perf_cgroup_event *cgroup_event = data; struct perf_output_handle handle; struct perf_sample_data sample; u16 header_size = cgroup_event->event_id.header.size; 
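	/*
	 * header.size is grown by perf_event_header__init_id() below and
	 * restored at 'out:' so the same template can be reused for the
	 * next event in the perf_iterate_sb() walk.
	 */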
int ret; if (!perf_event_cgroup_match(event)) return; perf_event_header__init_id(&cgroup_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, cgroup_event->event_id.header.size); if (ret) goto out; perf_output_put(&handle, cgroup_event->event_id); __output_copy(&handle, cgroup_event->path, cgroup_event->path_size); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: cgroup_event->event_id.header.size = header_size; } static void perf_event_cgroup(struct cgroup *cgrp) { struct perf_cgroup_event cgroup_event; char path_enomem[16] = "//enomem"; char *pathname; size_t size; if (!atomic_read(&nr_cgroup_events)) return; cgroup_event = (struct perf_cgroup_event){ .event_id = { .header = { .type = PERF_RECORD_CGROUP, .misc = 0, .size = sizeof(cgroup_event.event_id), }, .id = cgroup_id(cgrp), }, }; pathname = kmalloc(PATH_MAX, GFP_KERNEL); if (pathname == NULL) { cgroup_event.path = path_enomem; } else { /* just to be sure to have enough space for alignment */ cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64)); cgroup_event.path = pathname; } /* * Since our buffer works in 8 byte units we need to align our string * size to a multiple of 8. However, we must guarantee the tail end is * zero'd out to avoid leaking random bits to userspace. */ size = strlen(cgroup_event.path) + 1; while (!IS_ALIGNED(size, sizeof(u64))) cgroup_event.path[size++] = '\0'; cgroup_event.event_id.header.size += size; cgroup_event.path_size = size; perf_iterate_sb(perf_event_cgroup_output, &cgroup_event, NULL); kfree(pathname); } #endif /* * mmap tracking */ struct perf_mmap_event { struct vm_area_struct *vma; const char *file_name; int file_size; int maj, min; u64 ino; u64 ino_generation; u32 prot, flags; u8 build_id[BUILD_ID_SIZE_MAX]; u32 build_id_size; struct { struct perf_event_header header; u32 pid; u32 tid; u64 start; u64 len; u64 pgoff; } event_id; }; static int perf_event_mmap_match(struct perf_event *event, void *data) { struct perf_mmap_event *mmap_event = data; struct vm_area_struct *vma = mmap_event->vma; int executable = vma->vm_flags & VM_EXEC; return (!executable && event->attr.mmap_data) || (executable && (event->attr.mmap || event->attr.mmap2)); } static void perf_event_mmap_output(struct perf_event *event, void *data) { struct perf_mmap_event *mmap_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int size = mmap_event->event_id.header.size; u32 type = mmap_event->event_id.header.type; bool use_build_id; int ret; if (!perf_event_mmap_match(event, data)) return; if (event->attr.mmap2) { mmap_event->event_id.header.type = PERF_RECORD_MMAP2; mmap_event->event_id.header.size += sizeof(mmap_event->maj); mmap_event->event_id.header.size += sizeof(mmap_event->min); mmap_event->event_id.header.size += sizeof(mmap_event->ino); mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation); mmap_event->event_id.header.size += sizeof(mmap_event->prot); mmap_event->event_id.header.size += sizeof(mmap_event->flags); } perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, mmap_event->event_id.header.size); if (ret) goto out; mmap_event->event_id.pid = perf_event_pid(event, current); mmap_event->event_id.tid = perf_event_tid(event, current); use_build_id = event->attr.build_id && mmap_event->build_id_size; if (event->attr.mmap2 && use_build_id) mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID; perf_output_put(&handle, 
mmap_event->event_id); if (event->attr.mmap2) { if (use_build_id) { u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 }; __output_copy(&handle, size, 4); __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX); } else { perf_output_put(&handle, mmap_event->maj); perf_output_put(&handle, mmap_event->min); perf_output_put(&handle, mmap_event->ino); perf_output_put(&handle, mmap_event->ino_generation); } perf_output_put(&handle, mmap_event->prot); perf_output_put(&handle, mmap_event->flags); } __output_copy(&handle, mmap_event->file_name, mmap_event->file_size); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: mmap_event->event_id.header.size = size; mmap_event->event_id.header.type = type; } static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) { struct vm_area_struct *vma = mmap_event->vma; struct file *file = vma->vm_file; int maj = 0, min = 0; u64 ino = 0, gen = 0; u32 prot = 0, flags = 0; unsigned int size; char tmp[16]; char *buf = NULL; char *name = NULL; if (vma->vm_flags & VM_READ) prot |= PROT_READ; if (vma->vm_flags & VM_WRITE) prot |= PROT_WRITE; if (vma->vm_flags & VM_EXEC) prot |= PROT_EXEC; if (vma->vm_flags & VM_MAYSHARE) flags = MAP_SHARED; else flags = MAP_PRIVATE; if (vma->vm_flags & VM_LOCKED) flags |= MAP_LOCKED; if (is_vm_hugetlb_page(vma)) flags |= MAP_HUGETLB; if (file) { const struct inode *inode; dev_t dev; buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) { name = "//enomem"; goto cpy_name; } /* * d_path() works from the end of the rb backwards, so we * need to add enough zero bytes after the string to handle * the 64bit alignment we do later. */ name = d_path(file_user_path(file), buf, PATH_MAX - sizeof(u64)); if (IS_ERR(name)) { name = "//toolong"; goto cpy_name; } inode = file_user_inode(vma->vm_file); dev = inode->i_sb->s_dev; ino = inode->i_ino; gen = inode->i_generation; maj = MAJOR(dev); min = MINOR(dev); goto got_name; } else { if (vma->vm_ops && vma->vm_ops->name) name = (char *) vma->vm_ops->name(vma); if (!name) name = (char *)arch_vma_name(vma); if (!name) { if (vma_is_initial_heap(vma)) name = "[heap]"; else if (vma_is_initial_stack(vma)) name = "[stack]"; else name = "//anon"; } } cpy_name: strscpy(tmp, name); name = tmp; got_name: /* * Since our buffer works in 8 byte units we need to align our string * size to a multiple of 8. However, we must guarantee the tail end is * zero'd out to avoid leaking random bits to userspace. */ size = strlen(name)+1; while (!IS_ALIGNED(size, sizeof(u64))) name[size++] = '\0'; mmap_event->file_name = name; mmap_event->file_size = size; mmap_event->maj = maj; mmap_event->min = min; mmap_event->ino = ino; mmap_event->ino_generation = gen; mmap_event->prot = prot; mmap_event->flags = flags; if (!(vma->vm_flags & VM_EXEC)) mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; if (atomic_read(&nr_build_id_events)) build_id_parse_nofault(vma, mmap_event->build_id, &mmap_event->build_id_size); perf_iterate_sb(perf_event_mmap_output, mmap_event, NULL); kfree(buf); } /* * Check whether inode and address range match filter criteria. 
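 * A filter matches when it refers to the same file (by inode) and its
 * [offset, offset + size) byte range overlaps the mapped file range
 * being checked.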
*/ static bool perf_addr_filter_match(struct perf_addr_filter *filter, struct file *file, unsigned long offset, unsigned long size) { /* d_inode(NULL) won't be equal to any mapped user-space file */ if (!filter->path.dentry) return false; if (d_inode(filter->path.dentry) != file_user_inode(file)) return false; if (filter->offset > offset + size) return false; if (filter->offset + filter->size < offset) return false; return true; } static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter, struct vm_area_struct *vma, struct perf_addr_filter_range *fr) { unsigned long vma_size = vma->vm_end - vma->vm_start; unsigned long off = vma->vm_pgoff << PAGE_SHIFT; struct file *file = vma->vm_file; if (!perf_addr_filter_match(filter, file, off, vma_size)) return false; if (filter->offset < off) { fr->start = vma->vm_start; fr->size = min(vma_size, filter->size - (off - filter->offset)); } else { fr->start = vma->vm_start + filter->offset - off; fr->size = min(vma->vm_end - fr->start, filter->size); } return true; } static void __perf_addr_filters_adjust(struct perf_event *event, void *data) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); struct vm_area_struct *vma = data; struct perf_addr_filter *filter; unsigned int restart = 0, count = 0; unsigned long flags; if (!has_addr_filter(event)) return; if (!vma->vm_file) return; raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { if (perf_addr_filter_vma_adjust(filter, vma, &event->addr_filter_ranges[count])) restart++; count++; } if (restart) event->addr_filters_gen++; raw_spin_unlock_irqrestore(&ifh->lock, flags); if (restart) perf_event_stop(event, 1); } /* * Adjust all task's events' filters to the new vma */ static void perf_addr_filters_adjust(struct vm_area_struct *vma) { struct perf_event_context *ctx; /* * Data tracing isn't supported yet and as such there is no need * to keep track of anything that isn't related to executable code: */ if (!(vma->vm_flags & VM_EXEC)) return; rcu_read_lock(); ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true); rcu_read_unlock(); } void perf_event_mmap(struct vm_area_struct *vma) { struct perf_mmap_event mmap_event; if (!atomic_read(&nr_mmap_events)) return; mmap_event = (struct perf_mmap_event){ .vma = vma, /* .file_name */ /* .file_size */ .event_id = { .header = { .type = PERF_RECORD_MMAP, .misc = PERF_RECORD_MISC_USER, /* .size */ }, /* .pid */ /* .tid */ .start = vma->vm_start, .len = vma->vm_end - vma->vm_start, .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, }, /* .maj (attr_mmap2 only) */ /* .min (attr_mmap2 only) */ /* .ino (attr_mmap2 only) */ /* .ino_generation (attr_mmap2 only) */ /* .prot (attr_mmap2 only) */ /* .flags (attr_mmap2 only) */ }; perf_addr_filters_adjust(vma); perf_event_mmap_event(&mmap_event); } void perf_event_aux_event(struct perf_event *event, unsigned long head, unsigned long size, u64 flags) { struct perf_output_handle handle; struct perf_sample_data sample; struct perf_aux_event { struct perf_event_header header; u64 offset; u64 size; u64 flags; } rec = { .header = { .type = PERF_RECORD_AUX, .misc = 0, .size = sizeof(rec), }, .offset = head, .size = size, .flags = flags, }; int ret; perf_event_header__init_id(&rec.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; perf_output_put(&handle, rec); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } /* * 
Lost/dropped samples logging */ void perf_log_lost_samples(struct perf_event *event, u64 lost) { struct perf_output_handle handle; struct perf_sample_data sample; int ret; struct { struct perf_event_header header; u64 lost; } lost_samples_event = { .header = { .type = PERF_RECORD_LOST_SAMPLES, .misc = 0, .size = sizeof(lost_samples_event), }, .lost = lost, }; perf_event_header__init_id(&lost_samples_event.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, lost_samples_event.header.size); if (ret) return; perf_output_put(&handle, lost_samples_event); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } /* * context_switch tracking */ struct perf_switch_event { struct task_struct *task; struct task_struct *next_prev; struct { struct perf_event_header header; u32 next_prev_pid; u32 next_prev_tid; } event_id; }; static int perf_event_switch_match(struct perf_event *event) { return event->attr.context_switch; } static void perf_event_switch_output(struct perf_event *event, void *data) { struct perf_switch_event *se = data; struct perf_output_handle handle; struct perf_sample_data sample; int ret; if (!perf_event_switch_match(event)) return; /* Only CPU-wide events are allowed to see next/prev pid/tid */ if (event->ctx->task) { se->event_id.header.type = PERF_RECORD_SWITCH; se->event_id.header.size = sizeof(se->event_id.header); } else { se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE; se->event_id.header.size = sizeof(se->event_id); se->event_id.next_prev_pid = perf_event_pid(event, se->next_prev); se->event_id.next_prev_tid = perf_event_tid(event, se->next_prev); } perf_event_header__init_id(&se->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size); if (ret) return; if (event->ctx->task) perf_output_put(&handle, se->event_id.header); else perf_output_put(&handle, se->event_id); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } static void perf_event_switch(struct task_struct *task, struct task_struct *next_prev, bool sched_in) { struct perf_switch_event switch_event; /* N.B. caller checks nr_switch_events != 0 */ switch_event = (struct perf_switch_event){ .task = task, .next_prev = next_prev, .event_id = { .header = { /* .type */ .misc = sched_in ? 
0 : PERF_RECORD_MISC_SWITCH_OUT, /* .size */ }, /* .next_prev_pid */ /* .next_prev_tid */ }, }; if (!sched_in && task_is_runnable(task)) { switch_event.event_id.header.misc |= PERF_RECORD_MISC_SWITCH_OUT_PREEMPT; } perf_iterate_sb(perf_event_switch_output, &switch_event, NULL); } /* * IRQ throttle logging */ static void perf_log_throttle(struct perf_event *event, int enable) { struct perf_output_handle handle; struct perf_sample_data sample; int ret; struct { struct perf_event_header header; u64 time; u64 id; u64 stream_id; } throttle_event = { .header = { .type = PERF_RECORD_THROTTLE, .misc = 0, .size = sizeof(throttle_event), }, .time = perf_event_clock(event), .id = primary_event_id(event), .stream_id = event->id, }; if (enable) throttle_event.header.type = PERF_RECORD_UNTHROTTLE; perf_event_header__init_id(&throttle_event.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, throttle_event.header.size); if (ret) return; perf_output_put(&handle, throttle_event); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } /* * ksymbol register/unregister tracking */ struct perf_ksymbol_event { const char *name; int name_len; struct { struct perf_event_header header; u64 addr; u32 len; u16 ksym_type; u16 flags; } event_id; }; static int perf_event_ksymbol_match(struct perf_event *event) { return event->attr.ksymbol; } static void perf_event_ksymbol_output(struct perf_event *event, void *data) { struct perf_ksymbol_event *ksymbol_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int ret; if (!perf_event_ksymbol_match(event)) return; perf_event_header__init_id(&ksymbol_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, ksymbol_event->event_id.header.size); if (ret) return; perf_output_put(&handle, ksymbol_event->event_id); __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym) { struct perf_ksymbol_event ksymbol_event; char name[KSYM_NAME_LEN]; u16 flags = 0; int name_len; if (!atomic_read(&nr_ksymbol_events)) return; if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX || ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) goto err; strscpy(name, sym); name_len = strlen(name) + 1; while (!IS_ALIGNED(name_len, sizeof(u64))) name[name_len++] = '\0'; BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64)); if (unregister) flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER; ksymbol_event = (struct perf_ksymbol_event){ .name = name, .name_len = name_len, .event_id = { .header = { .type = PERF_RECORD_KSYMBOL, .size = sizeof(ksymbol_event.event_id) + name_len, }, .addr = addr, .len = len, .ksym_type = ksym_type, .flags = flags, }, }; perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL); return; err: WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type); } /* * bpf program load/unload tracking */ struct perf_bpf_event { struct bpf_prog *prog; struct { struct perf_event_header header; u16 type; u16 flags; u32 id; u8 tag[BPF_TAG_SIZE]; } event_id; }; static int perf_event_bpf_match(struct perf_event *event) { return event->attr.bpf_event; } static void perf_event_bpf_output(struct perf_event *event, void *data) { struct perf_bpf_event *bpf_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int ret; if (!perf_event_bpf_match(event)) return; 
perf_event_header__init_id(&bpf_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, bpf_event->event_id.header.size); if (ret) return; perf_output_put(&handle, bpf_event->event_id); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog, enum perf_bpf_event_type type) { bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD; int i; perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, (u64)(unsigned long)prog->bpf_func, prog->jited_len, unregister, prog->aux->ksym.name); for (i = 1; i < prog->aux->func_cnt; i++) { struct bpf_prog *subprog = prog->aux->func[i]; perf_event_ksymbol( PERF_RECORD_KSYMBOL_TYPE_BPF, (u64)(unsigned long)subprog->bpf_func, subprog->jited_len, unregister, subprog->aux->ksym.name); } } void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags) { struct perf_bpf_event bpf_event; switch (type) { case PERF_BPF_EVENT_PROG_LOAD: case PERF_BPF_EVENT_PROG_UNLOAD: if (atomic_read(&nr_ksymbol_events)) perf_event_bpf_emit_ksymbols(prog, type); break; default: return; } if (!atomic_read(&nr_bpf_events)) return; bpf_event = (struct perf_bpf_event){ .prog = prog, .event_id = { .header = { .type = PERF_RECORD_BPF_EVENT, .size = sizeof(bpf_event.event_id), }, .type = type, .flags = flags, .id = prog->aux->id, }, }; BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64)); memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE); perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); } struct perf_callchain_deferred_event { struct unwind_stacktrace *trace; struct { struct perf_event_header header; u64 cookie; u64 nr; u64 ips[]; } event; }; static void perf_callchain_deferred_output(struct perf_event *event, void *data) { struct perf_callchain_deferred_event *deferred_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int ret, size = deferred_event->event.header.size; if (!event->attr.defer_output) return; /* XXX do we really need sample_id_all for this ??? 
*/ perf_event_header__init_id(&deferred_event->event.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, deferred_event->event.header.size); if (ret) goto out; perf_output_put(&handle, deferred_event->event); for (int i = 0; i < deferred_event->trace->nr; i++) { u64 entry = deferred_event->trace->entries[i]; perf_output_put(&handle, entry); } perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: deferred_event->event.header.size = size; } static void perf_unwind_deferred_callback(struct unwind_work *work, struct unwind_stacktrace *trace, u64 cookie) { struct perf_callchain_deferred_event deferred_event = { .trace = trace, .event = { .header = { .type = PERF_RECORD_CALLCHAIN_DEFERRED, .misc = PERF_RECORD_MISC_USER, .size = sizeof(deferred_event.event) + (trace->nr * sizeof(u64)), }, .cookie = cookie, .nr = trace->nr, }, }; perf_iterate_sb(perf_callchain_deferred_output, &deferred_event, NULL); } struct perf_text_poke_event { const void *old_bytes; const void *new_bytes; size_t pad; u16 old_len; u16 new_len; struct { struct perf_event_header header; u64 addr; } event_id; }; static int perf_event_text_poke_match(struct perf_event *event) { return event->attr.text_poke; } static void perf_event_text_poke_output(struct perf_event *event, void *data) { struct perf_text_poke_event *text_poke_event = data; struct perf_output_handle handle; struct perf_sample_data sample; u64 padding = 0; int ret; if (!perf_event_text_poke_match(event)) return; perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, text_poke_event->event_id.header.size); if (ret) return; perf_output_put(&handle, text_poke_event->event_id); perf_output_put(&handle, text_poke_event->old_len); perf_output_put(&handle, text_poke_event->new_len); __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len); __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len); if (text_poke_event->pad) __output_copy(&handle, &padding, text_poke_event->pad); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } void perf_event_text_poke(const void *addr, const void *old_bytes, size_t old_len, const void *new_bytes, size_t new_len) { struct perf_text_poke_event text_poke_event; size_t tot, pad; if (!atomic_read(&nr_text_poke_events)) return; tot = sizeof(text_poke_event.old_len) + old_len; tot += sizeof(text_poke_event.new_len) + new_len; pad = ALIGN(tot, sizeof(u64)) - tot; text_poke_event = (struct perf_text_poke_event){ .old_bytes = old_bytes, .new_bytes = new_bytes, .pad = pad, .old_len = old_len, .new_len = new_len, .event_id = { .header = { .type = PERF_RECORD_TEXT_POKE, .misc = PERF_RECORD_MISC_KERNEL, .size = sizeof(text_poke_event.event_id) + tot + pad, }, .addr = (unsigned long)addr, }, }; perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL); } void perf_event_itrace_started(struct perf_event *event) { WRITE_ONCE(event->attach_state, event->attach_state | PERF_ATTACH_ITRACE); } static void perf_log_itrace_start(struct perf_event *event) { struct perf_output_handle handle; struct perf_sample_data sample; struct perf_aux_event { struct perf_event_header header; u32 pid; u32 tid; } rec; int ret; if (event->parent) event = event->parent; if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) || event->attach_state & PERF_ATTACH_ITRACE) return; rec.header.type = PERF_RECORD_ITRACE_START; rec.header.misc = 0; rec.header.size = sizeof(rec); 
rec.pid = perf_event_pid(event, current); rec.tid = perf_event_tid(event, current); perf_event_header__init_id(&rec.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; perf_output_put(&handle, rec); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } void perf_report_aux_output_id(struct perf_event *event, u64 hw_id) { struct perf_output_handle handle; struct perf_sample_data sample; struct perf_aux_event { struct perf_event_header header; u64 hw_id; } rec; int ret; if (event->parent) event = event->parent; rec.header.type = PERF_RECORD_AUX_OUTPUT_HW_ID; rec.header.misc = 0; rec.header.size = sizeof(rec); rec.hw_id = hw_id; perf_event_header__init_id(&rec.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; perf_output_put(&handle, rec); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } EXPORT_SYMBOL_GPL(perf_report_aux_output_id); static int __perf_event_account_interrupt(struct perf_event *event, int throttle) { struct hw_perf_event *hwc = &event->hw; int ret = 0; u64 seq; seq = __this_cpu_read(perf_throttled_seq); if (seq != hwc->interrupts_seq) { hwc->interrupts_seq = seq; hwc->interrupts = 1; } else { hwc->interrupts++; } if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) { __this_cpu_inc(perf_throttled_count); tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); perf_event_throttle_group(event); ret = 1; } if (event->attr.freq) { u64 now = perf_clock(); s64 delta = now - hwc->freq_time_stamp; hwc->freq_time_stamp = now; if (delta > 0 && delta < 2*TICK_NSEC) perf_adjust_period(event, delta, hwc->last_period, true); } return ret; } int perf_event_account_interrupt(struct perf_event *event) { return __perf_event_account_interrupt(event, 1); } static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs) { /* * Due to interrupt latency (AKA "skid"), we may enter the * kernel before taking an overflow, even if the PMU is only * counting user events. */ if (event->attr.exclude_kernel && !user_mode(regs)) return false; return true; } #ifdef CONFIG_BPF_SYSCALL static int bpf_overflow_handler(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { struct bpf_perf_event_data_kern ctx = { .data = data, .event = event, }; struct bpf_prog *prog; int ret = 0; ctx.regs = perf_arch_bpf_user_pt_regs(regs); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) goto out; rcu_read_lock(); prog = READ_ONCE(event->prog); if (prog) { perf_prepare_sample(data, event, regs); ret = bpf_prog_run(prog, &ctx); } rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); return ret; } static inline int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { if (event->overflow_handler_context) /* hw breakpoint or kernel counter */ return -EINVAL; if (event->prog) return -EEXIST; if (prog->type != BPF_PROG_TYPE_PERF_EVENT) return -EINVAL; if (event->attr.precise_ip && prog->call_get_stack && (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) || event->attr.exclude_callchain_kernel || event->attr.exclude_callchain_user)) { /* * On perf_event with precise_ip, calling bpf_get_stack() * may trigger unwinder warnings and occasional crashes. * bpf_get_[stack|stackid] works around this issue by using * callchain attached to perf_sample_data. 
If the * perf_event does not full (kernel and user) callchain * attached to perf_sample_data, do not allow attaching BPF * program that calls bpf_get_[stack|stackid]. */ return -EPROTO; } event->prog = prog; event->bpf_cookie = bpf_cookie; return 0; } static inline void perf_event_free_bpf_handler(struct perf_event *event) { struct bpf_prog *prog = event->prog; if (!prog) return; event->prog = NULL; bpf_prog_put(prog); } #else static inline int bpf_overflow_handler(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { return 1; } static inline int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { return -EOPNOTSUPP; } static inline void perf_event_free_bpf_handler(struct perf_event *event) { } #endif /* * Generic event overflow handling, sampling. */ static int __perf_event_overflow(struct perf_event *event, int throttle, struct perf_sample_data *data, struct pt_regs *regs) { int events = atomic_read(&event->event_limit); int ret = 0; /* * Non-sampling counters might still use the PMI to fold short * hardware counters, ignore those. */ if (unlikely(!is_sampling_event(event))) return 0; ret = __perf_event_account_interrupt(event, throttle); if (event->attr.aux_pause) perf_event_aux_pause(event->aux_event, true); if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT && !bpf_overflow_handler(event, data, regs)) goto out; /* * XXX event_limit might not quite work as expected on inherited * events */ event->pending_kill = POLL_IN; if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; perf_event_disable_inatomic(event); event->pmu->stop(event, 0); } if (event->attr.sigtrap) { /* * The desired behaviour of sigtrap vs invalid samples is a bit * tricky; on the one hand, one should not loose the SIGTRAP if * it is the first event, on the other hand, we should also not * trigger the WARN or override the data address. */ bool valid_sample = sample_is_allowed(event, regs); unsigned int pending_id = 1; enum task_work_notify_mode notify_mode; if (regs) pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1; notify_mode = in_nmi() ? TWA_NMI_CURRENT : TWA_RESUME; if (!event->pending_work && !task_work_add(current, &event->pending_task, notify_mode)) { event->pending_work = pending_id; local_inc(&event->ctx->nr_no_switch_fast); WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount)); event->pending_addr = 0; if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) event->pending_addr = data->addr; } else if (event->attr.exclude_kernel && valid_sample) { /* * Should not be able to return to user space without * consuming pending_work; with exceptions: * * 1. Where !exclude_kernel, events can overflow again * in the kernel without returning to user space. * * 2. Events that can overflow again before the IRQ- * work without user space progress (e.g. hrtimer). * To approximate progress (with false negatives), * check 32-bit hash of the current IP. */ WARN_ON_ONCE(event->pending_work != pending_id); } } READ_ONCE(event->overflow_handler)(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; irq_work_queue(&event->pending_irq); } out: if (event->attr.aux_resume) perf_event_aux_pause(event->aux_event, false); return ret; } int perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { /* * Entry point from hardware PMI, interrupts should be disabled here. 
* This serializes us against perf_event_remove_from_context() in * things like perf_event_release_kernel(). */ lockdep_assert_irqs_disabled(); return __perf_event_overflow(event, 1, data, regs); } /* * Generic software event infrastructure */ struct swevent_htable { struct swevent_hlist *swevent_hlist; struct mutex hlist_mutex; int hlist_refcount; }; static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); /* * We directly increment event->count and keep a second value in * event->hw.period_left to count intervals. This period event * is kept in the range [-sample_period, 0] so that we can use the * sign as trigger. */ u64 perf_swevent_set_period(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; u64 period = hwc->last_period; u64 nr, offset; s64 old, val; hwc->last_period = hwc->sample_period; old = local64_read(&hwc->period_left); do { val = old; if (val < 0) return 0; nr = div64_u64(period + val, period); offset = nr * period; val -= offset; } while (!local64_try_cmpxchg(&hwc->period_left, &old, val)); return nr; } static void perf_swevent_overflow(struct perf_event *event, u64 overflow, struct perf_sample_data *data, struct pt_regs *regs) { struct hw_perf_event *hwc = &event->hw; int throttle = 0; if (!overflow) overflow = perf_swevent_set_period(event); if (hwc->interrupts == MAX_INTERRUPTS) return; for (; overflow; overflow--) { if (__perf_event_overflow(event, throttle, data, regs)) { /* * We inhibit the overflow from happening when * hwc->interrupts == MAX_INTERRUPTS. */ break; } throttle = 1; } } static void perf_swevent_event(struct perf_event *event, u64 nr, struct perf_sample_data *data, struct pt_regs *regs) { struct hw_perf_event *hwc = &event->hw; /* * This is: * - software preempt * - tracepoint preempt * - tp_target_task irq (ctx->lock) * - uprobes preempt/irq * - kprobes preempt/irq * - hw_breakpoint irq * * Any of these are sufficient to hold off RCU and thus ensure @event * exists. */ lockdep_assert_preemption_disabled(); local64_add(nr, &event->count); if (!regs) return; if (!is_sampling_event(event)) return; /* * Serialize against event_function_call() IPIs like normal overflow * event handling. Specifically, must not allow * perf_event_release_kernel() -> perf_remove_from_context() to make * progress and 'release' the event from under us. 
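	 *
	 * (Illustrative note: guard(irqsave)() below is the scope-based
	 * local_irq_save()/local_irq_restore() guard from the kernel's
	 * cleanup/guard infrastructure, so interrupts stay disabled for the
	 * rest of this function and are restored automatically on return.)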
*/ guard(irqsave)(); if (event->state != PERF_EVENT_STATE_ACTIVE) return; if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) { data->period = nr; return perf_swevent_overflow(event, 1, data, regs); } else data->period = event->hw.last_period; if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) return perf_swevent_overflow(event, 1, data, regs); if (local64_add_negative(nr, &hwc->period_left)) return; perf_swevent_overflow(event, 0, data, regs); } int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) return 1; if (regs) { if (event->attr.exclude_user && user_mode(regs)) return 1; if (event->attr.exclude_kernel && !user_mode(regs)) return 1; } return 0; } static int perf_swevent_match(struct perf_event *event, enum perf_type_id type, u32 event_id, struct perf_sample_data *data, struct pt_regs *regs) { if (event->attr.type != type) return 0; if (event->attr.config != event_id) return 0; if (perf_exclude_event(event, regs)) return 0; return 1; } static inline u64 swevent_hash(u64 type, u32 event_id) { u64 val = event_id | (type << 32); return hash_64(val, SWEVENT_HLIST_BITS); } static inline struct hlist_head * __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) { u64 hash = swevent_hash(type, event_id); return &hlist->heads[hash]; } /* For the read side: events when they trigger */ static inline struct hlist_head * find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) { struct swevent_hlist *hlist; hlist = rcu_dereference(swhash->swevent_hlist); if (!hlist) return NULL; return __find_swevent_head(hlist, type, event_id); } /* For the event head insertion and removal in the hlist */ static inline struct hlist_head * find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) { struct swevent_hlist *hlist; u32 event_id = event->attr.config; u64 type = event->attr.type; /* * Event scheduling is always serialized against hlist allocation * and release. Which makes the protected version suitable here. * The context lock guarantees that. 
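 *
 * (Illustrative note: "the protected version" is rcu_dereference_protected()
 *  just below; its lockdep_is_held(&event->ctx->lock) condition documents
 *  which lock makes this lockless-looking access safe on this path.)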
*/ hlist = rcu_dereference_protected(swhash->swevent_hlist, lockdep_is_held(&event->ctx->lock)); if (!hlist) return NULL; return __find_swevent_head(hlist, type, event_id); } static void do_perf_sw_event(enum perf_type_id type, u32 event_id, u64 nr, struct perf_sample_data *data, struct pt_regs *regs) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); struct perf_event *event; struct hlist_head *head; rcu_read_lock(); head = find_swevent_head_rcu(swhash, type, event_id); if (!head) goto end; hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_swevent_match(event, type, event_id, data, regs)) perf_swevent_event(event, nr, data, regs); } end: rcu_read_unlock(); } DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]); int perf_swevent_get_recursion_context(void) { return get_recursion_context(current->perf_recursion); } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); void perf_swevent_put_recursion_context(int rctx) { put_recursion_context(current->perf_recursion, rctx); } void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { struct perf_sample_data data; if (WARN_ON_ONCE(!regs)) return; perf_sample_data_init(&data, addr, 0); do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); } void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { int rctx; preempt_disable_notrace(); rctx = perf_swevent_get_recursion_context(); if (unlikely(rctx < 0)) goto fail; ___perf_sw_event(event_id, nr, regs, addr); perf_swevent_put_recursion_context(rctx); fail: preempt_enable_notrace(); } static void perf_swevent_read(struct perf_event *event) { } static int perf_swevent_add(struct perf_event *event, int flags) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); struct hw_perf_event *hwc = &event->hw; struct hlist_head *head; if (is_sampling_event(event)) { hwc->last_period = hwc->sample_period; perf_swevent_set_period(event); } hwc->state = !(flags & PERF_EF_START); head = find_swevent_head(swhash, event); if (WARN_ON_ONCE(!head)) return -EINVAL; hlist_add_head_rcu(&event->hlist_entry, head); perf_event_update_userpage(event); return 0; } static void perf_swevent_del(struct perf_event *event, int flags) { hlist_del_rcu(&event->hlist_entry); } static void perf_swevent_start(struct perf_event *event, int flags) { event->hw.state = 0; } static void perf_swevent_stop(struct perf_event *event, int flags) { event->hw.state = PERF_HES_STOPPED; } /* Deref the hlist from the update side */ static inline struct swevent_hlist * swevent_hlist_deref(struct swevent_htable *swhash) { return rcu_dereference_protected(swhash->swevent_hlist, lockdep_is_held(&swhash->hlist_mutex)); } static void swevent_hlist_release(struct swevent_htable *swhash) { struct swevent_hlist *hlist = swevent_hlist_deref(swhash); if (!hlist) return; RCU_INIT_POINTER(swhash->swevent_hlist, NULL); kfree_rcu(hlist, rcu_head); } static void swevent_hlist_put_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); if (!--swhash->hlist_refcount) swevent_hlist_release(swhash); mutex_unlock(&swhash->hlist_mutex); } static void swevent_hlist_put(void) { int cpu; for_each_possible_cpu(cpu) swevent_hlist_put_cpu(cpu); } static int swevent_hlist_get_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); int err = 0; mutex_lock(&swhash->hlist_mutex); if (!swevent_hlist_deref(swhash) && cpumask_test_cpu(cpu, perf_online_mask)) { struct swevent_hlist *hlist; hlist = kzalloc_obj(*hlist); if (!hlist) 
{ err = -ENOMEM; goto exit; } rcu_assign_pointer(swhash->swevent_hlist, hlist); } swhash->hlist_refcount++; exit: mutex_unlock(&swhash->hlist_mutex); return err; } static int swevent_hlist_get(void) { int err, cpu, failed_cpu; mutex_lock(&pmus_lock); for_each_possible_cpu(cpu) { err = swevent_hlist_get_cpu(cpu); if (err) { failed_cpu = cpu; goto fail; } } mutex_unlock(&pmus_lock); return 0; fail: for_each_possible_cpu(cpu) { if (cpu == failed_cpu) break; swevent_hlist_put_cpu(cpu); } mutex_unlock(&pmus_lock); return err; } struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; static void sw_perf_event_destroy(struct perf_event *event) { u64 event_id = event->attr.config; WARN_ON(event->parent); static_key_slow_dec(&perf_swevent_enabled[event_id]); swevent_hlist_put(); } static struct pmu perf_cpu_clock; /* fwd declaration */ static struct pmu perf_task_clock; static int perf_swevent_init(struct perf_event *event) { u64 event_id = event->attr.config; if (event->attr.type != PERF_TYPE_SOFTWARE) return -ENOENT; /* * no branch sampling for software events */ if (has_branch_stack(event)) return -EOPNOTSUPP; switch (event_id) { case PERF_COUNT_SW_CPU_CLOCK: event->attr.type = perf_cpu_clock.type; return -ENOENT; case PERF_COUNT_SW_TASK_CLOCK: event->attr.type = perf_task_clock.type; return -ENOENT; default: break; } if (event_id >= PERF_COUNT_SW_MAX) return -ENOENT; if (!event->parent) { int err; err = swevent_hlist_get(); if (err) return err; static_key_slow_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; } return 0; } static struct pmu perf_swevent = { .task_ctx_nr = perf_sw_context, .capabilities = PERF_PMU_CAP_NO_NMI, .event_init = perf_swevent_init, .add = perf_swevent_add, .del = perf_swevent_del, .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, }; #ifdef CONFIG_EVENT_TRACING static void tp_perf_event_destroy(struct perf_event *event) { perf_trace_destroy(event); } static int perf_tp_event_init(struct perf_event *event) { int err; if (event->attr.type != PERF_TYPE_TRACEPOINT) return -ENOENT; /* * no branch sampling for tracepoint events */ if (has_branch_stack(event)) return -EOPNOTSUPP; err = perf_trace_init(event); if (err) return err; event->destroy = tp_perf_event_destroy; return 0; } static struct pmu perf_tracepoint = { .task_ctx_nr = perf_sw_context, .event_init = perf_tp_event_init, .add = perf_trace_add, .del = perf_trace_del, .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, }; static int perf_tp_filter_match(struct perf_event *event, struct perf_raw_record *raw) { void *record = raw->frag.data; /* only top level events have filters set */ if (event->parent) event = event->parent; if (likely(!event->filter) || filter_match_preds(event->filter, record)) return 1; return 0; } static int perf_tp_event_match(struct perf_event *event, struct perf_raw_record *raw, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) return 0; /* * If exclude_kernel, only trace user-space tracepoints (uprobes) */ if (event->attr.exclude_kernel && !user_mode(regs)) return 0; if (!perf_tp_filter_match(event, raw)) return 0; return 1; } void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, struct trace_event_call *call, u64 count, struct pt_regs *regs, struct hlist_head *head, struct task_struct *task) { if (bpf_prog_array_valid(call)) { *(struct pt_regs **)raw_data = regs; if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; 
} } perf_tp_event(call->event.type, count, raw_data, size, regs, head, rctx, task); } EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); static void __perf_tp_event_target_task(u64 count, void *record, struct pt_regs *regs, struct perf_sample_data *data, struct perf_raw_record *raw, struct perf_event *event) { struct trace_entry *entry = record; if (event->attr.config != entry->type) return; /* Cannot deliver synchronous signal to other task. */ if (event->attr.sigtrap) return; if (perf_tp_event_match(event, raw, regs)) { perf_sample_data_init(data, 0, 0); perf_sample_save_raw_data(data, event, raw); perf_swevent_event(event, count, data, regs); } } static void perf_tp_event_target_task(u64 count, void *record, struct pt_regs *regs, struct perf_sample_data *data, struct perf_raw_record *raw, struct perf_event_context *ctx) { unsigned int cpu = smp_processor_id(); struct pmu *pmu = &perf_tracepoint; struct perf_event *event, *sibling; perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) { __perf_tp_event_target_task(count, record, regs, data, raw, event); for_each_sibling_event(sibling, event) __perf_tp_event_target_task(count, record, regs, data, raw, sibling); } perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) { __perf_tp_event_target_task(count, record, regs, data, raw, event); for_each_sibling_event(sibling, event) __perf_tp_event_target_task(count, record, regs, data, raw, sibling); } } void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, struct task_struct *task) { struct perf_sample_data data; struct perf_event *event; /* * Per being a tracepoint, this runs with preemption disabled. */ lockdep_assert_preemption_disabled(); struct perf_raw_record raw = { .frag = { .size = entry_size, .data = record, }, }; perf_trace_buf_update(record, event_type); hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_tp_event_match(event, &raw, regs)) { /* * Here use the same on-stack perf_sample_data, * some members in data are event-specific and * need to be re-computed for different sweveents. * Re-initialize data->sample_flags safely to avoid * the problem that next event skips preparing data * because data->sample_flags is set. */ perf_sample_data_init(&data, 0, 0); perf_sample_save_raw_data(&data, event, &raw); perf_swevent_event(event, count, &data, regs); } } /* * If we got specified a target task, also iterate its context and * deliver this event there too. */ if (task && task != current) { struct perf_event_context *ctx; rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp); if (!ctx) goto unlock; raw_spin_lock(&ctx->lock); perf_tp_event_target_task(count, record, regs, &data, &raw, ctx); raw_spin_unlock(&ctx->lock); unlock: rcu_read_unlock(); } perf_swevent_put_recursion_context(rctx); } EXPORT_SYMBOL_GPL(perf_tp_event); #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) /* * Flags in config, used by dynamic PMU kprobe and uprobe * The flags should match following PMU_FORMAT_ATTR(). * * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe * if not set, create kprobe/uprobe * * The following values specify a reference counter (or semaphore in the * terminology of tools like dtrace, systemtap, etc.) Userspace Statically * Defined Tracepoints (USDT). Currently, we use 40 bit for the offset. 
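 *
 * (Illustrative example, offset value made up: a uretprobe whose USDT
 *  reference counter sits at file offset 0x1234 would be opened with
 *
 *	attr.config = (0x1234ULL << PERF_UPROBE_REF_CTR_OFFSET_SHIFT) |
 *		      PERF_PROBE_CONFIG_IS_RETPROBE;
 *
 *  i.e. the offset occupies config bits 32-63, matching the
 *  "ref_ctr_offset" format attribute defined further down.)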
* * PERF_UPROBE_REF_CTR_OFFSET_BITS # of bits in config as th offset * PERF_UPROBE_REF_CTR_OFFSET_SHIFT # of bits to shift left */ enum perf_probe_config { PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */ PERF_UPROBE_REF_CTR_OFFSET_BITS = 32, PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS, }; PMU_FORMAT_ATTR(retprobe, "config:0"); #endif #ifdef CONFIG_KPROBE_EVENTS static struct attribute *kprobe_attrs[] = { &format_attr_retprobe.attr, NULL, }; static struct attribute_group kprobe_format_group = { .name = "format", .attrs = kprobe_attrs, }; static const struct attribute_group *kprobe_attr_groups[] = { &kprobe_format_group, NULL, }; static int perf_kprobe_event_init(struct perf_event *event); static struct pmu perf_kprobe = { .task_ctx_nr = perf_sw_context, .event_init = perf_kprobe_event_init, .add = perf_trace_add, .del = perf_trace_del, .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, .attr_groups = kprobe_attr_groups, }; static int perf_kprobe_event_init(struct perf_event *event) { int err; bool is_retprobe; if (event->attr.type != perf_kprobe.type) return -ENOENT; if (!perfmon_capable()) return -EACCES; /* * no branch sampling for probe events */ if (has_branch_stack(event)) return -EOPNOTSUPP; is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; err = perf_kprobe_init(event, is_retprobe); if (err) return err; event->destroy = perf_kprobe_destroy; return 0; } #endif /* CONFIG_KPROBE_EVENTS */ #ifdef CONFIG_UPROBE_EVENTS PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63"); static struct attribute *uprobe_attrs[] = { &format_attr_retprobe.attr, &format_attr_ref_ctr_offset.attr, NULL, }; static struct attribute_group uprobe_format_group = { .name = "format", .attrs = uprobe_attrs, }; static const struct attribute_group *uprobe_attr_groups[] = { &uprobe_format_group, NULL, }; static int perf_uprobe_event_init(struct perf_event *event); static struct pmu perf_uprobe = { .task_ctx_nr = perf_sw_context, .event_init = perf_uprobe_event_init, .add = perf_trace_add, .del = perf_trace_del, .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, .attr_groups = uprobe_attr_groups, }; static int perf_uprobe_event_init(struct perf_event *event) { int err; unsigned long ref_ctr_offset; bool is_retprobe; if (event->attr.type != perf_uprobe.type) return -ENOENT; if (!capable(CAP_SYS_ADMIN)) return -EACCES; /* * no branch sampling for probe events */ if (has_branch_stack(event)) return -EOPNOTSUPP; is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT; err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe); if (err) return err; event->destroy = perf_uprobe_destroy; return 0; } #endif /* CONFIG_UPROBE_EVENTS */ static inline void perf_tp_register(void) { perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); #ifdef CONFIG_KPROBE_EVENTS perf_pmu_register(&perf_kprobe, "kprobe", -1); #endif #ifdef CONFIG_UPROBE_EVENTS perf_pmu_register(&perf_uprobe, "uprobe", -1); #endif } static void perf_event_free_filter(struct perf_event *event) { ftrace_profile_free_filter(event); } /* * returns true if the event is a tracepoint, or a kprobe/upprobe created * with perf_event_open() */ static inline bool perf_event_is_tracing(struct perf_event *event) { if (event->pmu == &perf_tracepoint) return true; #ifdef CONFIG_KPROBE_EVENTS if (event->pmu == &perf_kprobe) return true; #endif #ifdef CONFIG_UPROBE_EVENTS if 
(event->pmu == &perf_uprobe) return true; #endif return false; } static int __perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp; if (event->state <= PERF_EVENT_STATE_REVOKED) return -ENODEV; if (!perf_event_is_tracing(event)) return perf_event_set_bpf_handler(event, prog, bpf_cookie); is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE; is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; is_syscall_tp = is_syscall_trace_event(event->tp_event); if (!is_kprobe && !is_uprobe && !is_tracepoint && !is_syscall_tp) /* bpf programs can only be attached to u/kprobe or tracepoint */ return -EINVAL; if (((is_kprobe || is_uprobe) && prog->type != BPF_PROG_TYPE_KPROBE) || (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) || (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) return -EINVAL; if (prog->type == BPF_PROG_TYPE_KPROBE && prog->sleepable && !is_uprobe) /* only uprobe programs are allowed to be sleepable */ return -EINVAL; /* Kprobe override only works for kprobes, not uprobes. */ if (prog->kprobe_override && !is_kprobe) return -EINVAL; /* Writing to context allowed only for uprobes. */ if (prog->aux->kprobe_write_ctx && !is_uprobe) return -EINVAL; if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); if (prog->aux->max_ctx_offset > off) return -EACCES; } return perf_event_attach_bpf_prog(event, prog, bpf_cookie); } int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { struct perf_event_context *ctx; int ret; ctx = perf_event_ctx_lock(event); ret = __perf_event_set_bpf_prog(event, prog, bpf_cookie); perf_event_ctx_unlock(event, ctx); return ret; } void perf_event_free_bpf_prog(struct perf_event *event) { if (!event->prog) return; if (!perf_event_is_tracing(event)) { perf_event_free_bpf_handler(event); return; } perf_event_detach_bpf_prog(event); } #else static inline void perf_tp_register(void) { } static void perf_event_free_filter(struct perf_event *event) { } static int __perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { return -ENOENT; } int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { return -ENOENT; } void perf_event_free_bpf_prog(struct perf_event *event) { } #endif /* CONFIG_EVENT_TRACING */ #ifdef CONFIG_HAVE_HW_BREAKPOINT void perf_bp_event(struct perf_event *bp, void *data) { struct perf_sample_data sample; struct pt_regs *regs = data; /* * Exception context, will have interrupts disabled. */ lockdep_assert_irqs_disabled(); perf_sample_data_init(&sample, bp->attr.bp_addr, 0); if (!bp->hw.state && !perf_exclude_event(bp, regs)) perf_swevent_event(bp, 1, &sample, regs); } #endif /* * Allocate a new address filter */ static struct perf_addr_filter * perf_addr_filter_new(struct perf_event *event, struct list_head *filters) { int node = cpu_to_node(event->cpu == -1 ? 
0 : event->cpu); struct perf_addr_filter *filter; filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node); if (!filter) return NULL; INIT_LIST_HEAD(&filter->entry); list_add_tail(&filter->entry, filters); return filter; } static void free_filters_list(struct list_head *filters) { struct perf_addr_filter *filter, *iter; list_for_each_entry_safe(filter, iter, filters, entry) { path_put(&filter->path); list_del(&filter->entry); kfree(filter); } } /* * Free existing address filters and optionally install new ones */ static void perf_addr_filters_splice(struct perf_event *event, struct list_head *head) { unsigned long flags; LIST_HEAD(list); if (!has_addr_filter(event)) return; /* don't bother with children, they don't have their own filters */ if (event->parent) return; raw_spin_lock_irqsave(&event->addr_filters.lock, flags); list_splice_init(&event->addr_filters.list, &list); if (head) list_splice(head, &event->addr_filters.list); raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags); free_filters_list(&list); } static void perf_free_addr_filters(struct perf_event *event) { /* * Used during free paths, there is no concurrency. */ if (list_empty(&event->addr_filters.list)) return; perf_addr_filters_splice(event, NULL); } /* * Scan through mm's vmas and see if one of them matches the * @filter; if so, adjust filter's address range. * Called with mm::mmap_lock down for reading. */ static void perf_addr_filter_apply(struct perf_addr_filter *filter, struct mm_struct *mm, struct perf_addr_filter_range *fr) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) { if (!vma->vm_file) continue; if (perf_addr_filter_vma_adjust(filter, vma, fr)) return; } } /* * Update event's address range filters based on the * task's existing mappings, if any. */ static void perf_event_addr_filters_apply(struct perf_event *event) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); struct task_struct *task = READ_ONCE(event->ctx->task); struct perf_addr_filter *filter; struct mm_struct *mm = NULL; unsigned int count = 0; unsigned long flags; /* * We may observe TASK_TOMBSTONE, which means that the event tear-down * will stop on the parent's child_mutex that our caller is also holding */ if (task == TASK_TOMBSTONE) return; if (ifh->nr_file_filters) { mm = get_task_mm(task); if (!mm) goto restart; mmap_read_lock(mm); } raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { if (filter->path.dentry) { /* * Adjust base offset if the filter is associated to a * binary that needs to be mapped: */ event->addr_filter_ranges[count].start = 0; event->addr_filter_ranges[count].size = 0; perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]); } else { event->addr_filter_ranges[count].start = filter->offset; event->addr_filter_ranges[count].size = filter->size; } count++; } event->addr_filters_gen++; raw_spin_unlock_irqrestore(&ifh->lock, flags); if (ifh->nr_file_filters) { mmap_read_unlock(mm); mmput(mm); } restart: perf_event_stop(event, 1); } /* * Address range filtering: limiting the data to certain * instruction address ranges. Filters are ioctl()ed to us from * userspace as ascii strings. 
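 *
 * (Illustrative examples of accepted strings, with addresses, sizes and
 *  paths made up; the exact grammar is spelled out below:
 *
 *	filter 0x1000/0x2000@/usr/lib/libfoo.so
 *	start 0xffffffff81000000/0x4000
 *	stop 0xffffffff81004000
 * )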
* * Filter string format: * * ACTION RANGE_SPEC * where ACTION is one of the * * "filter": limit the trace to this region * * "start": start tracing from this address * * "stop": stop tracing at this address/region; * RANGE_SPEC is * * for kernel addresses: <start address>[/<size>] * * for object files: <start address>[/<size>]@</path/to/object/file> * * if <size> is not specified or is zero, the range is treated as a single * address; not valid for ACTION=="filter". */ enum { IF_ACT_NONE = -1, IF_ACT_FILTER, IF_ACT_START, IF_ACT_STOP, IF_SRC_FILE, IF_SRC_KERNEL, IF_SRC_FILEADDR, IF_SRC_KERNELADDR, }; enum { IF_STATE_ACTION = 0, IF_STATE_SOURCE, IF_STATE_END, }; static const match_table_t if_tokens = { { IF_ACT_FILTER, "filter" }, { IF_ACT_START, "start" }, { IF_ACT_STOP, "stop" }, { IF_SRC_FILE, "%u/%u@%s" }, { IF_SRC_KERNEL, "%u/%u" }, { IF_SRC_FILEADDR, "%u@%s" }, { IF_SRC_KERNELADDR, "%u" }, { IF_ACT_NONE, NULL }, }; /* * Address filter string parser */ static int perf_event_parse_addr_filter(struct perf_event *event, char *fstr, struct list_head *filters) { struct perf_addr_filter *filter = NULL; char *start, *orig, *filename = NULL; substring_t args[MAX_OPT_ARGS]; int state = IF_STATE_ACTION, token; unsigned int kernel = 0; int ret = -EINVAL; orig = fstr = kstrdup(fstr, GFP_KERNEL); if (!fstr) return -ENOMEM; while ((start = strsep(&fstr, " ,\n")) != NULL) { static const enum perf_addr_filter_action_t actions[] = { [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER, [IF_ACT_START] = PERF_ADDR_FILTER_ACTION_START, [IF_ACT_STOP] = PERF_ADDR_FILTER_ACTION_STOP, }; ret = -EINVAL; if (!*start) continue; /* filter definition begins */ if (state == IF_STATE_ACTION) { filter = perf_addr_filter_new(event, filters); if (!filter) goto fail; } token = match_token(start, if_tokens, args); switch (token) { case IF_ACT_FILTER: case IF_ACT_START: case IF_ACT_STOP: if (state != IF_STATE_ACTION) goto fail; filter->action = actions[token]; state = IF_STATE_SOURCE; break; case IF_SRC_KERNELADDR: case IF_SRC_KERNEL: kernel = 1; fallthrough; case IF_SRC_FILEADDR: case IF_SRC_FILE: if (state != IF_STATE_SOURCE) goto fail; *args[0].to = 0; ret = kstrtoul(args[0].from, 0, &filter->offset); if (ret) goto fail; if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) { *args[1].to = 0; ret = kstrtoul(args[1].from, 0, &filter->size); if (ret) goto fail; } if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { int fpos = token == IF_SRC_FILE ? 2 : 1; kfree(filename); filename = match_strdup(&args[fpos]); if (!filename) { ret = -ENOMEM; goto fail; } } state = IF_STATE_END; break; default: goto fail; } /* * Filter definition is fully parsed, validate and install it. * Make sure that it doesn't contradict itself or the event's * attribute. */ if (state == IF_STATE_END) { ret = -EINVAL; /* * ACTION "filter" must have a non-zero length region * specified. */ if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER && !filter->size) goto fail; if (!kernel) { if (!filename) goto fail; /* * For now, we only support file-based filters * in per-task events; doing so for CPU-wide * events requires additional context switching * trickery, since same object code will be * mapped at different virtual addresses in * different processes. 
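 *
 * (Illustrative note: this restriction is why the check just below
 *  rejects file-based filters on CPU-wide events with -EOPNOTSUPP.)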
*/ ret = -EOPNOTSUPP; if (!event->ctx->task) goto fail; /* look up the path and grab its inode */ ret = kern_path(filename, LOOKUP_FOLLOW, &filter->path); if (ret) goto fail; ret = -EINVAL; if (!filter->path.dentry || !S_ISREG(d_inode(filter->path.dentry) ->i_mode)) goto fail; event->addr_filters.nr_file_filters++; } /* ready to consume more filters */ kfree(filename); filename = NULL; state = IF_STATE_ACTION; filter = NULL; kernel = 0; } } if (state != IF_STATE_ACTION) goto fail; kfree(filename); kfree(orig); return 0; fail: kfree(filename); free_filters_list(filters); kfree(orig); return ret; } static int perf_event_set_addr_filter(struct perf_event *event, char *filter_str) { LIST_HEAD(filters); int ret; /* * Since this is called in perf_ioctl() path, we're already holding * ctx::mutex. */ lockdep_assert_held(&event->ctx->mutex); if (WARN_ON_ONCE(event->parent)) return -EINVAL; ret = perf_event_parse_addr_filter(event, filter_str, &filters); if (ret) goto fail_clear_files; ret = event->pmu->addr_filters_validate(&filters); if (ret) goto fail_free_filters; /* remove existing filters, if any */ perf_addr_filters_splice(event, &filters); /* install new filters */ perf_event_for_each_child(event, perf_event_addr_filters_apply); return ret; fail_free_filters: free_filters_list(&filters); fail_clear_files: event->addr_filters.nr_file_filters = 0; return ret; } static int perf_event_set_filter(struct perf_event *event, void __user *arg) { int ret = -EINVAL; char *filter_str; filter_str = strndup_user(arg, PAGE_SIZE); if (IS_ERR(filter_str)) return PTR_ERR(filter_str); #ifdef CONFIG_EVENT_TRACING if (perf_event_is_tracing(event)) { struct perf_event_context *ctx = event->ctx; /* * Beware, here be dragons!! * * the tracepoint muck will deadlock against ctx->mutex, but * the tracepoint stuff does not actually need it. So * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we * already have a reference on ctx. * * This can result in event getting moved to a different ctx, * but that does not affect the tracepoint state. 
*/ mutex_unlock(&ctx->mutex); ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); mutex_lock(&ctx->mutex); } else #endif if (has_addr_filter(event)) ret = perf_event_set_addr_filter(event, filter_str); kfree(filter_str); return ret; } /* * hrtimer based swevent callback */ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) { enum hrtimer_restart ret = HRTIMER_RESTART; struct perf_sample_data data; struct pt_regs *regs; struct perf_event *event; u64 period; event = container_of(hrtimer, struct perf_event, hw.hrtimer); if (event->state != PERF_EVENT_STATE_ACTIVE || event->hw.state & PERF_HES_STOPPED) return HRTIMER_NORESTART; event->pmu->read(event); perf_sample_data_init(&data, 0, event->hw.last_period); regs = get_irq_regs(); if (regs && !perf_exclude_event(event, regs)) { if (!(event->attr.exclude_idle && is_idle_task(current))) if (perf_event_overflow(event, &data, regs)) ret = HRTIMER_NORESTART; } period = max_t(u64, 10000, event->hw.sample_period); hrtimer_forward_now(hrtimer, ns_to_ktime(period)); return ret; } static void perf_swevent_start_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; s64 period; if (!is_sampling_event(event)) return; period = local64_read(&hwc->period_left); if (period) { if (period < 0) period = 10000; local64_set(&hwc->period_left, 0); } else { period = max_t(u64, 10000, hwc->sample_period); } hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), HRTIMER_MODE_REL_PINNED_HARD); } static void perf_swevent_cancel_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; /* * Careful: this function can be triggered in the hrtimer handler, * for cpu-clock events, so hrtimer_cancel() would cause a * deadlock. * * So use hrtimer_try_to_cancel() to try to stop the hrtimer, * and the cpu-clock handler also sets the PERF_HES_STOPPED flag, * which guarantees that perf_swevent_hrtimer() will stop the * hrtimer once it sees the PERF_HES_STOPPED flag. */ if (is_sampling_event(event) && (hwc->interrupts != MAX_INTERRUPTS)) { ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); local64_set(&hwc->period_left, ktime_to_ns(remaining)); hrtimer_try_to_cancel(&hwc->hrtimer); } } static void perf_swevent_destroy_hrtimer(struct perf_event *event) { hrtimer_cancel(&event->hw.hrtimer); } static void perf_swevent_init_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; if (!is_sampling_event(event)) return; hrtimer_setup(&hwc->hrtimer, perf_swevent_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); event->destroy = perf_swevent_destroy_hrtimer; /* * Since hrtimers have a fixed rate, we can do a static freq->period * mapping and avoid the whole period adjust feedback stuff. 
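	 *
	 * (Worked example, illustrative: attr.sample_freq = 1000 becomes
	 * sample_period = NSEC_PER_SEC / 1000 = 1000000 ns, i.e. the hrtimer
	 * simply fires every millisecond; attr.freq is then cleared so the
	 * dynamic perf_adjust_period() machinery never runs for this event.)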
*/ if (event->attr.freq) { long freq = event->attr.sample_freq; event->attr.sample_period = NSEC_PER_SEC / freq; hwc->sample_period = event->attr.sample_period; local64_set(&hwc->period_left, hwc->sample_period); hwc->last_period = hwc->sample_period; event->attr.freq = 0; } } /* * Software event: cpu wall time clock */ static void cpu_clock_event_update(struct perf_event *event) { s64 prev; u64 now; now = local_clock(); prev = local64_xchg(&event->hw.prev_count, now); local64_add(now - prev, &event->count); } static void cpu_clock_event_start(struct perf_event *event, int flags) { event->hw.state = 0; local64_set(&event->hw.prev_count, local_clock()); perf_swevent_start_hrtimer(event); } static void cpu_clock_event_stop(struct perf_event *event, int flags) { event->hw.state = PERF_HES_STOPPED; perf_swevent_cancel_hrtimer(event); if (flags & PERF_EF_UPDATE) cpu_clock_event_update(event); } static int cpu_clock_event_add(struct perf_event *event, int flags) { if (flags & PERF_EF_START) cpu_clock_event_start(event, flags); perf_event_update_userpage(event); return 0; } static void cpu_clock_event_del(struct perf_event *event, int flags) { cpu_clock_event_stop(event, PERF_EF_UPDATE); } static void cpu_clock_event_read(struct perf_event *event) { cpu_clock_event_update(event); } static int cpu_clock_event_init(struct perf_event *event) { if (event->attr.type != perf_cpu_clock.type) return -ENOENT; if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) return -ENOENT; /* * no branch sampling for software events */ if (has_branch_stack(event)) return -EOPNOTSUPP; perf_swevent_init_hrtimer(event); return 0; } static struct pmu perf_cpu_clock = { .task_ctx_nr = perf_sw_context, .capabilities = PERF_PMU_CAP_NO_NMI, .dev = PMU_NULL_DEV, .event_init = cpu_clock_event_init, .add = cpu_clock_event_add, .del = cpu_clock_event_del, .start = cpu_clock_event_start, .stop = cpu_clock_event_stop, .read = cpu_clock_event_read, }; /* * Software event: task time clock */ static void task_clock_event_update(struct perf_event *event, u64 now) { u64 prev; s64 delta; prev = local64_xchg(&event->hw.prev_count, now); delta = now - prev; local64_add(delta, &event->count); } static void task_clock_event_start(struct perf_event *event, int flags) { event->hw.state = 0; local64_set(&event->hw.prev_count, event->ctx->time.time); perf_swevent_start_hrtimer(event); } static void task_clock_event_stop(struct perf_event *event, int flags) { event->hw.state = PERF_HES_STOPPED; perf_swevent_cancel_hrtimer(event); if (flags & PERF_EF_UPDATE) task_clock_event_update(event, event->ctx->time.time); } static int task_clock_event_add(struct perf_event *event, int flags) { if (flags & PERF_EF_START) task_clock_event_start(event, flags); perf_event_update_userpage(event); return 0; } static void task_clock_event_del(struct perf_event *event, int flags) { task_clock_event_stop(event, PERF_EF_UPDATE); } static void task_clock_event_read(struct perf_event *event) { u64 now = perf_clock(); u64 delta = now - event->ctx->time.stamp; u64 time = event->ctx->time.time + delta; task_clock_event_update(event, time); } static int task_clock_event_init(struct perf_event *event) { if (event->attr.type != perf_task_clock.type) return -ENOENT; if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) return -ENOENT; /* * no branch sampling for software events */ if (has_branch_stack(event)) return -EOPNOTSUPP; perf_swevent_init_hrtimer(event); return 0; } static struct pmu perf_task_clock = { .task_ctx_nr = perf_sw_context, .capabilities = PERF_PMU_CAP_NO_NMI, 
.dev = PMU_NULL_DEV, .event_init = task_clock_event_init, .add = task_clock_event_add, .del = task_clock_event_del, .start = task_clock_event_start, .stop = task_clock_event_stop, .read = task_clock_event_read, }; static void perf_pmu_nop_void(struct pmu *pmu) { } static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags) { } static int perf_pmu_nop_int(struct pmu *pmu) { return 0; } static int perf_event_nop_int(struct perf_event *event, u64 value) { return 0; } static DEFINE_PER_CPU(unsigned int, nop_txn_flags); static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags) { __this_cpu_write(nop_txn_flags, flags); if (flags & ~PERF_PMU_TXN_ADD) return; perf_pmu_disable(pmu); } static int perf_pmu_commit_txn(struct pmu *pmu) { unsigned int flags = __this_cpu_read(nop_txn_flags); __this_cpu_write(nop_txn_flags, 0); if (flags & ~PERF_PMU_TXN_ADD) return 0; perf_pmu_enable(pmu); return 0; } static void perf_pmu_cancel_txn(struct pmu *pmu) { unsigned int flags = __this_cpu_read(nop_txn_flags); __this_cpu_write(nop_txn_flags, 0); if (flags & ~PERF_PMU_TXN_ADD) return; perf_pmu_enable(pmu); } static int perf_event_idx_default(struct perf_event *event) { return 0; } /* * Let userspace know that this PMU supports address range filtering: */ static ssize_t nr_addr_filters_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); return sysfs_emit(page, "%d\n", pmu->nr_addr_filters); } DEVICE_ATTR_RO(nr_addr_filters); static struct idr pmu_idr; static ssize_t type_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); return sysfs_emit(page, "%d\n", pmu->type); } static DEVICE_ATTR_RO(type); static ssize_t perf_event_mux_interval_ms_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); return sysfs_emit(page, "%d\n", pmu->hrtimer_interval_ms); } static DEFINE_MUTEX(mux_interval_mutex); static ssize_t perf_event_mux_interval_ms_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct pmu *pmu = dev_get_drvdata(dev); int timer, cpu, ret; ret = kstrtoint(buf, 0, &timer); if (ret) return ret; if (timer < 1) return -EINVAL; /* same value, noting to do */ if (timer == pmu->hrtimer_interval_ms) return count; mutex_lock(&mux_interval_mutex); pmu->hrtimer_interval_ms = timer; /* update all cpuctx for this PMU */ cpus_read_lock(); for_each_online_cpu(cpu) { struct perf_cpu_pmu_context *cpc; cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu); cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc); } cpus_read_unlock(); mutex_unlock(&mux_interval_mutex); return count; } static DEVICE_ATTR_RW(perf_event_mux_interval_ms); static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu) { switch (scope) { case PERF_PMU_SCOPE_CORE: return topology_sibling_cpumask(cpu); case PERF_PMU_SCOPE_DIE: return topology_die_cpumask(cpu); case PERF_PMU_SCOPE_CLUSTER: return topology_cluster_cpumask(cpu); case PERF_PMU_SCOPE_PKG: return topology_core_cpumask(cpu); case PERF_PMU_SCOPE_SYS_WIDE: return cpu_online_mask; } return NULL; } static inline struct cpumask *perf_scope_cpumask(unsigned int scope) { switch (scope) { case PERF_PMU_SCOPE_CORE: return perf_online_core_mask; case PERF_PMU_SCOPE_DIE: return perf_online_die_mask; case PERF_PMU_SCOPE_CLUSTER: return perf_online_cluster_mask; case PERF_PMU_SCOPE_PKG: return 
perf_online_pkg_mask; case PERF_PMU_SCOPE_SYS_WIDE: return perf_online_sys_mask; } return NULL; } static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) { struct pmu *pmu = dev_get_drvdata(dev); struct cpumask *mask = perf_scope_cpumask(pmu->scope); if (mask) return cpumap_print_to_pagebuf(true, buf, mask); return 0; } static DEVICE_ATTR_RO(cpumask); static struct attribute *pmu_dev_attrs[] = { &dev_attr_type.attr, &dev_attr_perf_event_mux_interval_ms.attr, &dev_attr_nr_addr_filters.attr, &dev_attr_cpumask.attr, NULL, }; static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = kobj_to_dev(kobj); struct pmu *pmu = dev_get_drvdata(dev); if (n == 2 && !pmu->nr_addr_filters) return 0; /* cpumask */ if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE) return 0; return a->mode; } static struct attribute_group pmu_dev_attr_group = { .is_visible = pmu_dev_is_visible, .attrs = pmu_dev_attrs, }; static const struct attribute_group *pmu_dev_groups[] = { &pmu_dev_attr_group, NULL, }; static int pmu_bus_running; static const struct bus_type pmu_bus = { .name = "event_source", .dev_groups = pmu_dev_groups, }; static void pmu_dev_release(struct device *dev) { kfree(dev); } static int pmu_dev_alloc(struct pmu *pmu) { int ret = -ENOMEM; pmu->dev = kzalloc_obj(struct device); if (!pmu->dev) goto out; pmu->dev->groups = pmu->attr_groups; device_initialize(pmu->dev); dev_set_drvdata(pmu->dev, pmu); pmu->dev->bus = &pmu_bus; pmu->dev->parent = pmu->parent; pmu->dev->release = pmu_dev_release; ret = dev_set_name(pmu->dev, "%s", pmu->name); if (ret) goto free_dev; ret = device_add(pmu->dev); if (ret) goto free_dev; if (pmu->attr_update) { ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update); if (ret) goto del_dev; } out: return ret; del_dev: device_del(pmu->dev); free_dev: put_device(pmu->dev); pmu->dev = NULL; goto out; } static struct lock_class_key cpuctx_mutex; static struct lock_class_key cpuctx_lock; static bool idr_cmpxchg(struct idr *idr, unsigned long id, void *old, void *new) { void *tmp, *val = idr_find(idr, id); if (val != old) return false; tmp = idr_replace(idr, new, id); if (IS_ERR(tmp)) return false; WARN_ON_ONCE(tmp != val); return true; } static void perf_pmu_free(struct pmu *pmu) { if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) { if (pmu->nr_addr_filters) device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); device_del(pmu->dev); put_device(pmu->dev); } if (pmu->cpu_pmu_context) { int cpu; for_each_possible_cpu(cpu) { struct perf_cpu_pmu_context *cpc; cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu); if (!cpc) continue; if (cpc->epc.embedded) { /* refcount managed */ put_pmu_ctx(&cpc->epc); continue; } kfree(cpc); } free_percpu(pmu->cpu_pmu_context); } } DEFINE_FREE(pmu_unregister, struct pmu *, if (_T) perf_pmu_free(_T)) int perf_pmu_register(struct pmu *_pmu, const char *name, int type) { int cpu, max = PERF_TYPE_MAX; struct pmu *pmu __free(pmu_unregister) = _pmu; guard(mutex)(&pmus_lock); if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) return -EINVAL; if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) return -EINVAL; pmu->name = name; if (type >= 0) max = type; CLASS(idr_alloc, pmu_type)(&pmu_idr, NULL, max, 0, GFP_KERNEL); if (pmu_type.id < 0) return pmu_type.id; WARN_ON(type >= 0 && pmu_type.id != type); pmu->type = pmu_type.id; atomic_set(&pmu->exclusive_cnt, 0); if (pmu_bus_running && !pmu->dev) { int ret = pmu_dev_alloc(pmu); 
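		/*
		 * Illustrative note: on this and every later error return, the
		 * __free(pmu_unregister) annotation on 'pmu' above runs
		 * perf_pmu_free() automatically; only the success path at the
		 * end hands the pointer back out via no_free_ptr().
		 */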
if (ret) return ret; } pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context *); if (!pmu->cpu_pmu_context) return -ENOMEM; for_each_possible_cpu(cpu) { struct perf_cpu_pmu_context *cpc = kmalloc_node(sizeof(struct perf_cpu_pmu_context), GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu)); if (!cpc) return -ENOMEM; *per_cpu_ptr(pmu->cpu_pmu_context, cpu) = cpc; __perf_init_event_pmu_context(&cpc->epc, pmu); __perf_mux_hrtimer_init(cpc, cpu); } if (!pmu->start_txn) { if (pmu->pmu_enable) { /* * If we have pmu_enable/pmu_disable calls, install * transaction stubs that use that to try and batch * hardware accesses. */ pmu->start_txn = perf_pmu_start_txn; pmu->commit_txn = perf_pmu_commit_txn; pmu->cancel_txn = perf_pmu_cancel_txn; } else { pmu->start_txn = perf_pmu_nop_txn; pmu->commit_txn = perf_pmu_nop_int; pmu->cancel_txn = perf_pmu_nop_void; } } if (!pmu->pmu_enable) { pmu->pmu_enable = perf_pmu_nop_void; pmu->pmu_disable = perf_pmu_nop_void; } if (!pmu->check_period) pmu->check_period = perf_event_nop_int; if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; INIT_LIST_HEAD(&pmu->events); spin_lock_init(&pmu->events_lock); /* * Now that the PMU is complete, make it visible to perf_try_init_event(). */ if (!idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu)) return -EINVAL; list_add_rcu(&pmu->entry, &pmus); take_idr_id(pmu_type); _pmu = no_free_ptr(pmu); // let it rip return 0; } EXPORT_SYMBOL_GPL(perf_pmu_register); static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event, struct perf_event_context *ctx) { /* * De-schedule the event and mark it REVOKED. */ perf_event_exit_event(event, ctx, ctx->task, true); /* * All _free_event() bits that rely on event->pmu: * * Notably, perf_mmap() relies on the ordering here. */ scoped_guard (mutex, &event->mmap_mutex) { WARN_ON_ONCE(pmu->event_unmapped); /* * Mostly an empty lock sequence, such that perf_mmap(), which * relies on mmap_mutex, is sure to observe the state change. */ } perf_event_free_bpf_prog(event); perf_free_addr_filters(event); if (event->destroy) { event->destroy(event); event->destroy = NULL; } if (event->pmu_ctx) { put_pmu_ctx(event->pmu_ctx); event->pmu_ctx = NULL; } exclusive_event_destroy(event); module_put(pmu->module); event->pmu = NULL; /* force fault instead of UAF */ } static void pmu_detach_event(struct pmu *pmu, struct perf_event *event) { struct perf_event_context *ctx; ctx = perf_event_ctx_lock(event); __pmu_detach_event(pmu, event, ctx); perf_event_ctx_unlock(event, ctx); scoped_guard (spinlock, &pmu->events_lock) list_del(&event->pmu_list); } static struct perf_event *pmu_get_event(struct pmu *pmu) { struct perf_event *event; guard(spinlock)(&pmu->events_lock); list_for_each_entry(event, &pmu->events, pmu_list) { if (atomic_long_inc_not_zero(&event->refcount)) return event; } return NULL; } static bool pmu_empty(struct pmu *pmu) { guard(spinlock)(&pmu->events_lock); return list_empty(&pmu->events); } static void pmu_detach_events(struct pmu *pmu) { struct perf_event *event; for (;;) { event = pmu_get_event(pmu); if (!event) break; pmu_detach_event(pmu, event); put_event(event); } /* * wait for pending _free_event()s */ wait_var_event(pmu, pmu_empty(pmu)); } int perf_pmu_unregister(struct pmu *pmu) { scoped_guard (mutex, &pmus_lock) { if (!idr_cmpxchg(&pmu_idr, pmu->type, pmu, NULL)) return -EINVAL; list_del_rcu(&pmu->entry); } /* * We dereference the pmu list under both SRCU and regular RCU, so * synchronize against both of those. 
* * Notably, the entirety of event creation, from perf_init_event() * (which will now fail, because of the above) until * perf_install_in_context() should be under SRCU such that * this synchronizes against event creation. This avoids trying to * detach events that are not fully formed. */ synchronize_srcu(&pmus_srcu); synchronize_rcu(); if (pmu->event_unmapped && !pmu_empty(pmu)) { /* * Can't force remove events when pmu::event_unmapped() * is used in perf_mmap_close(). */ guard(mutex)(&pmus_lock); idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu); list_add_rcu(&pmu->entry, &pmus); return -EBUSY; } scoped_guard (mutex, &pmus_lock) idr_remove(&pmu_idr, pmu->type); /* * PMU is removed from the pmus list, so no new events will * be created, now take care of the existing ones. */ pmu_detach_events(pmu); /* * PMU is unused, make it go away. */ perf_pmu_free(pmu); return 0; } EXPORT_SYMBOL_GPL(perf_pmu_unregister); static inline bool has_extended_regs(struct perf_event *event) { return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) || (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK); } static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) { struct perf_event_context *ctx = NULL; int ret; if (!try_module_get(pmu->module)) return -ENODEV; /* * A number of pmu->event_init() methods iterate the sibling_list to, * for example, validate if the group fits on the PMU. Therefore, * if this is a sibling event, acquire the ctx->mutex to protect * the sibling_list. */ if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) { /* * This ctx->mutex can nest when we're called through * inheritance. See the perf_event_ctx_lock_nested() comment. */ ctx = perf_event_ctx_lock_nested(event->group_leader, SINGLE_DEPTH_NESTING); BUG_ON(!ctx); } event->pmu = pmu; ret = pmu->event_init(event); if (ctx) perf_event_ctx_unlock(event->group_leader, ctx); if (ret) goto err_pmu; if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) && has_extended_regs(event)) { ret = -EOPNOTSUPP; goto err_destroy; } if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && event_has_any_exclude_flag(event)) { ret = -EINVAL; goto err_destroy; } if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) { const struct cpumask *cpumask; struct cpumask *pmu_cpumask; int cpu; cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu); pmu_cpumask = perf_scope_cpumask(pmu->scope); ret = -ENODEV; if (!pmu_cpumask || !cpumask) goto err_destroy; cpu = cpumask_any_and(pmu_cpumask, cpumask); if (cpu >= nr_cpu_ids) goto err_destroy; event->event_caps |= PERF_EV_CAP_READ_SCOPE; } return 0; err_destroy: if (event->destroy) { event->destroy(event); event->destroy = NULL; } err_pmu: event->pmu = NULL; module_put(pmu->module); return ret; } static struct pmu *perf_init_event(struct perf_event *event) { bool extended_type = false; struct pmu *pmu; int type, ret; guard(srcu)(&pmus_srcu); /* pmu idr/list access */ /* * Save original type before calling pmu->event_init() since certain * pmus overwrites event->attr.type to forward event to another pmu. */ event->orig_type = event->attr.type; /* Try parent's PMU first: */ if (event->parent && event->parent->pmu) { pmu = event->parent->pmu; ret = perf_try_init_event(pmu, event); if (!ret) return pmu; } /* * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE * are often aliases for PERF_TYPE_RAW. 
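 *
 * (Illustrative example: on a hybrid system a tool can encode the target
 *  PMU into the upper half of the config, e.g.
 *
 *	attr.type   = PERF_TYPE_HARDWARE;
 *	attr.config = ((u64)pmu_id << PERF_PMU_TYPE_SHIFT) |
 *		      PERF_COUNT_HW_CPU_CYCLES;
 *
 *  where "pmu_id" stands for whichever PMU type id the tool picked, and
 *  the resolved PMU must advertise PERF_PMU_CAP_EXTENDED_HW_TYPE; the
 *  shift and PERF_HW_EVENT_MASK below recover the PMU id and the plain
 *  hardware event id.)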
*/ type = event->attr.type; if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) { type = event->attr.config >> PERF_PMU_TYPE_SHIFT; if (!type) { type = PERF_TYPE_RAW; } else { extended_type = true; event->attr.config &= PERF_HW_EVENT_MASK; } } again: scoped_guard (rcu) pmu = idr_find(&pmu_idr, type); if (pmu) { if (event->attr.type != type && type != PERF_TYPE_RAW && !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE)) return ERR_PTR(-ENOENT); ret = perf_try_init_event(pmu, event); if (ret == -ENOENT && event->attr.type != type && !extended_type) { type = event->attr.type; goto again; } if (ret) return ERR_PTR(ret); return pmu; } list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { ret = perf_try_init_event(pmu, event); if (!ret) return pmu; if (ret != -ENOENT) return ERR_PTR(ret); } return ERR_PTR(-ENOENT); } static void attach_sb_event(struct perf_event *event) { struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); raw_spin_lock(&pel->lock); list_add_rcu(&event->sb_list, &pel->list); raw_spin_unlock(&pel->lock); } /* * We keep a list of all !task (and therefore per-cpu) events * that need to receive side-band records. * * This avoids having to scan all the various PMU per-cpu contexts * looking for them. */ static void account_pmu_sb_event(struct perf_event *event) { if (is_sb_event(event)) attach_sb_event(event); } /* Freq events need the tick to stay alive (see perf_event_task_tick). */ static void account_freq_event_nohz(void) { #ifdef CONFIG_NO_HZ_FULL /* Lock so we don't race with concurrent unaccount */ spin_lock(&nr_freq_lock); if (atomic_inc_return(&nr_freq_events) == 1) tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS); spin_unlock(&nr_freq_lock); #endif } static void account_freq_event(void) { if (tick_nohz_full_enabled()) account_freq_event_nohz(); else atomic_inc(&nr_freq_events); } static void account_event(struct perf_event *event) { bool inc = false; if (event->parent) return; if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) inc = true; if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.build_id) atomic_inc(&nr_build_id_events); if (event->attr.comm) atomic_inc(&nr_comm_events); if (event->attr.namespaces) atomic_inc(&nr_namespaces_events); if (event->attr.cgroup) atomic_inc(&nr_cgroup_events); if (event->attr.task) atomic_inc(&nr_task_events); if (event->attr.freq) account_freq_event(); if (event->attr.context_switch) { atomic_inc(&nr_switch_events); inc = true; } if (has_branch_stack(event)) inc = true; if (is_cgroup_event(event)) inc = true; if (event->attr.ksymbol) atomic_inc(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_inc(&nr_bpf_events); if (event->attr.text_poke) atomic_inc(&nr_text_poke_events); if (inc) { /* * We need the mutex here because static_branch_enable() * must complete *before* the perf_sched_count increment * becomes visible. */ if (atomic_inc_not_zero(&perf_sched_count)) goto enabled; mutex_lock(&perf_sched_mutex); if (!atomic_read(&perf_sched_count)) { static_branch_enable(&perf_sched_events); /* * Guarantee that all CPUs observe they key change and * call the perf scheduling hooks before proceeding to * install events that need them. */ synchronize_rcu(); } /* * Now that we have waited for the sync_sched(), allow further * increments to by-pass the mutex. 
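 *
 * (Illustrative note: this is the usual check/lock/re-check pattern;
 *  once perf_sched_count is non-zero, the atomic_inc_not_zero() fast
 *  path above succeeds and later callers skip both the mutex and the
 *  synchronize_rcu() wait.)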
*/ atomic_inc(&perf_sched_count); mutex_unlock(&perf_sched_mutex); } enabled: account_pmu_sb_event(event); } /* * Allocate and initialize an event structure */ static struct perf_event * perf_event_alloc(struct perf_event_attr *attr, int cpu, struct task_struct *task, struct perf_event *group_leader, struct perf_event *parent_event, perf_overflow_handler_t overflow_handler, void *context, int cgroup_fd) { struct pmu *pmu; struct hw_perf_event *hwc; long err = -EINVAL; int node; if ((unsigned)cpu >= nr_cpu_ids) { if (!task || cpu != -1) return ERR_PTR(-EINVAL); } if (attr->sigtrap && !task) { /* Requires a task: avoid signalling random tasks. */ return ERR_PTR(-EINVAL); } node = (cpu >= 0) ? cpu_to_node(cpu) : -1; struct perf_event *event __free(__free_event) = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, node); if (!event) return ERR_PTR(-ENOMEM); /* * Single events are their own group leaders, with an * empty sibling list: */ if (!group_leader) group_leader = event; mutex_init(&event->child_mutex); INIT_LIST_HEAD(&event->child_list); INIT_LIST_HEAD(&event->event_entry); INIT_LIST_HEAD(&event->sibling_list); INIT_LIST_HEAD(&event->active_list); init_event_group(event); INIT_LIST_HEAD(&event->rb_entry); INIT_LIST_HEAD(&event->active_entry); INIT_LIST_HEAD(&event->addr_filters.list); INIT_HLIST_NODE(&event->hlist_entry); INIT_LIST_HEAD(&event->pmu_list); init_waitqueue_head(&event->waitq); init_irq_work(&event->pending_irq, perf_pending_irq); event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable); init_task_work(&event->pending_task, perf_pending_task); mutex_init(&event->mmap_mutex); raw_spin_lock_init(&event->addr_filters.lock); atomic_long_set(&event->refcount, 1); event->cpu = cpu; event->attr = *attr; event->group_leader = group_leader; event->pmu = NULL; event->oncpu = -1; event->parent = parent_event; event->ns = get_pid_ns(task_active_pid_ns(current)); event->id = atomic64_inc_return(&perf_event_id); event->state = PERF_EVENT_STATE_INACTIVE; if (parent_event) event->event_caps = parent_event->event_caps; if (task) { event->attach_state = PERF_ATTACH_TASK; /* * XXX pmu::event_init needs to know what task to account to * and we cannot use the ctx information because we need the * pmu before we get a ctx. */ event->hw.target = get_task_struct(task); } event->clock = &local_clock; if (parent_event) event->clock = parent_event->clock; if (!overflow_handler && parent_event) { overflow_handler = parent_event->overflow_handler; context = parent_event->overflow_handler_context; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) if (parent_event->prog) { struct bpf_prog *prog = parent_event->prog; bpf_prog_inc(prog); event->prog = prog; } #endif } if (overflow_handler) { event->overflow_handler = overflow_handler; event->overflow_handler_context = context; } else if (is_write_backward(event)){ event->overflow_handler = perf_event_output_backward; event->overflow_handler_context = NULL; } else { event->overflow_handler = perf_event_output_forward; event->overflow_handler_context = NULL; } perf_event__state_init(event); pmu = NULL; hwc = &event->hw; hwc->sample_period = attr->sample_period; if (is_event_in_freq_mode(event)) hwc->sample_period = 1; hwc->last_period = hwc->sample_period; local64_set(&hwc->period_left, hwc->sample_period); /* * We do not support PERF_SAMPLE_READ on inherited events unless * PERF_SAMPLE_TID is also selected, which allows inherited events to * collect per-thread samples. * See perf_output_read(). 
*/ if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID)) return ERR_PTR(-EINVAL); if (!has_branch_stack(event)) event->attr.branch_sample_type = 0; pmu = perf_init_event(event); if (IS_ERR(pmu)) return (void*)pmu; /* * The PERF_ATTACH_TASK_DATA is set in the event_init()->hw_config(). * The attach should be right after the perf_init_event(). * Otherwise, the __free_event() would mistakenly detach the non-exist * perf_ctx_data because of the other errors between them. */ if (event->attach_state & PERF_ATTACH_TASK_DATA) { err = attach_perf_ctx_data(event); if (err) return ERR_PTR(err); } /* * Disallow uncore-task events. Similarly, disallow uncore-cgroup * events (they don't make sense as the cgroup will be different * on other CPUs in the uncore mask). */ if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) return ERR_PTR(-EINVAL); if (event->attr.aux_output && (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) || event->attr.aux_pause || event->attr.aux_resume)) return ERR_PTR(-EOPNOTSUPP); if (event->attr.aux_pause && event->attr.aux_resume) return ERR_PTR(-EINVAL); if (event->attr.aux_start_paused) { if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) return ERR_PTR(-EOPNOTSUPP); event->hw.aux_paused = 1; } if (cgroup_fd != -1) { err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); if (err) return ERR_PTR(err); } err = exclusive_event_init(event); if (err) return ERR_PTR(err); if (has_addr_filter(event)) { event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters, sizeof(struct perf_addr_filter_range), GFP_KERNEL); if (!event->addr_filter_ranges) return ERR_PTR(-ENOMEM); /* * Clone the parent's vma offsets: they are valid until exec() * even if the mm is not shared with the parent. */ if (event->parent) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); raw_spin_lock_irq(&ifh->lock); memcpy(event->addr_filter_ranges, event->parent->addr_filter_ranges, pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range)); raw_spin_unlock_irq(&ifh->lock); } /* force hw sync on the address filters */ event->addr_filters_gen = 1; } if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { err = get_callchain_buffers(attr->sample_max_stack); if (err) return ERR_PTR(err); event->attach_state |= PERF_ATTACH_CALLCHAIN; } } err = security_perf_event_alloc(event); if (err) return ERR_PTR(err); err = mediated_pmu_account_event(event); if (err) return ERR_PTR(err); /* symmetric to unaccount_event() in _free_event() */ account_event(event); /* * Event creation should be under SRCU, see perf_pmu_unregister(). */ lockdep_assert_held(&pmus_srcu); scoped_guard (spinlock, &pmu->events_lock) list_add(&event->pmu_list, &pmu->events); return_ptr(event); } static int perf_copy_attr(struct perf_event_attr __user *uattr, struct perf_event_attr *attr) { u32 size; int ret; /* Zero the full structure, so that a short copy will be nice. 
*/ memset(attr, 0, sizeof(*attr)); ret = get_user(size, &uattr->size); if (ret) return ret; /* ABI compatibility quirk: */ if (!size) size = PERF_ATTR_SIZE_VER0; if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE) goto err_size; ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); if (ret) { if (ret == -E2BIG) goto err_size; return ret; } attr->size = size; if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) return -EINVAL; if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) return -EINVAL; if (attr->read_format & ~(PERF_FORMAT_MAX-1)) return -EINVAL; if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) { u64 mask = attr->branch_sample_type; /* only using defined bits */ if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1)) return -EINVAL; /* at least one branch bit must be set */ if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) return -EINVAL; /* propagate priv level, when not set for branch */ if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { /* exclude_kernel checked on syscall entry */ if (!attr->exclude_kernel) mask |= PERF_SAMPLE_BRANCH_KERNEL; if (!attr->exclude_user) mask |= PERF_SAMPLE_BRANCH_USER; if (!attr->exclude_hv) mask |= PERF_SAMPLE_BRANCH_HV; /* * adjust user setting (for HW filter setup) */ attr->branch_sample_type = mask; } /* privileged levels capture (kernel, hv): check permissions */ if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) { ret = perf_allow_kernel(); if (ret) return ret; } } if (attr->sample_type & PERF_SAMPLE_REGS_USER) { ret = perf_reg_validate(attr->sample_regs_user); if (ret) return ret; } if (attr->sample_type & PERF_SAMPLE_STACK_USER) { if (!arch_perf_have_user_stack_dump()) return -ENOSYS; /* * We have __u32 type for the size, but so far * we can only use __u16 as maximum due to the * __u16 sample size limit. */ if (attr->sample_stack_user >= USHRT_MAX) return -EINVAL; else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64))) return -EINVAL; } if (!attr->sample_max_stack) attr->sample_max_stack = sysctl_perf_event_max_stack; if (attr->sample_type & PERF_SAMPLE_REGS_INTR) ret = perf_reg_validate(attr->sample_regs_intr); #ifndef CONFIG_CGROUP_PERF if (attr->sample_type & PERF_SAMPLE_CGROUP) return -EINVAL; #endif if ((attr->sample_type & PERF_SAMPLE_WEIGHT) && (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT)) return -EINVAL; if (!attr->inherit && attr->inherit_thread) return -EINVAL; if (attr->remove_on_exec && attr->enable_on_exec) return -EINVAL; if (attr->sigtrap && !attr->remove_on_exec) return -EINVAL; out: return ret; err_size: put_user(sizeof(*attr), &uattr->size); ret = -E2BIG; goto out; } static void mutex_lock_double(struct mutex *a, struct mutex *b) { if (b < a) swap(a, b); mutex_lock(a); mutex_lock_nested(b, SINGLE_DEPTH_NESTING); } static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { struct perf_buffer *rb = NULL; int ret = -EINVAL; if (!output_event) { mutex_lock(&event->mmap_mutex); goto set; } /* don't allow circular references */ if (event == output_event) goto out; /* * Don't allow cross-cpu buffers */ if (output_event->cpu != event->cpu) goto out; /* * If its not a per-cpu rb, it must be the same task. */ if (output_event->cpu == -1 && output_event->hw.target != event->hw.target) goto out; /* * Mixing clocks in the same buffer is trouble you don't need. */ if (output_event->clock != event->clock) goto out; /* * Either writing ring buffer from beginning or from end. * Mixing is not allowed. 
*/ if (is_write_backward(output_event) != is_write_backward(event)) goto out; /* * If both events generate aux data, they must be on the same PMU */ if (has_aux(event) && has_aux(output_event) && event->pmu != output_event->pmu) goto out; /* * Hold both mmap_mutex to serialize against perf_mmap_close(). Since * output_event is already on rb->event_list, and the list iteration * restarts after every removal, it is guaranteed this new event is * observed *OR* if output_event is already removed, it's guaranteed we * observe !rb->mmap_count. */ mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex); set: /* Can't redirect output if we've got an active mmap() */ if (refcount_read(&event->mmap_count)) goto unlock; if (output_event) { if (output_event->state <= PERF_EVENT_STATE_REVOKED) goto unlock; /* get the rb we want to redirect to */ rb = ring_buffer_get(output_event); if (!rb) goto unlock; /* did we race against perf_mmap_close() */ if (!refcount_read(&rb->mmap_count)) { ring_buffer_put(rb); goto unlock; } } ring_buffer_attach(event, rb); ret = 0; unlock: mutex_unlock(&event->mmap_mutex); if (output_event) mutex_unlock(&output_event->mmap_mutex); out: return ret; } static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) { bool nmi_safe = false; switch (clk_id) { case CLOCK_MONOTONIC: event->clock = &ktime_get_mono_fast_ns; nmi_safe = true; break; case CLOCK_MONOTONIC_RAW: event->clock = &ktime_get_raw_fast_ns; nmi_safe = true; break; case CLOCK_REALTIME: event->clock = &ktime_get_real_ns; break; case CLOCK_BOOTTIME: event->clock = &ktime_get_boottime_ns; break; case CLOCK_TAI: event->clock = &ktime_get_clocktai_ns; break; default: return -EINVAL; } if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI)) return -EINVAL; return 0; } static bool perf_check_permission(struct perf_event_attr *attr, struct task_struct *task) { unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS; bool is_capable = perfmon_capable(); if (attr->sigtrap) { /* * perf_event_attr::sigtrap sends signals to the other task. * Require the current task to also have CAP_KILL. */ rcu_read_lock(); is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL); rcu_read_unlock(); /* * If the required capabilities aren't available, checks for * ptrace permissions: upgrade to ATTACH, since sending signals * can effectively change the target task. */ ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS; } /* * Preserve ptrace permission check for backwards compatibility. The * ptrace check also includes checks that the current task and other * task have matching uids, and is therefore not done here explicitly. */ return is_capable || ptrace_may_access(task, ptrace_mode); } /** * sys_perf_event_open - open a performance event, associate it to a task/cpu * * @attr_uptr: event_id type attributes for monitoring/sampling * @pid: target pid * @cpu: target cpu * @group_fd: group leader event fd * @flags: perf event open flags */ SYSCALL_DEFINE5(perf_event_open, struct perf_event_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { struct perf_event *group_leader = NULL, *output_event = NULL; struct perf_event_pmu_context *pmu_ctx; struct perf_event *event, *sibling; struct perf_event_attr attr; struct perf_event_context *ctx; struct file *event_file = NULL; struct task_struct *task = NULL; struct pmu *pmu; int event_fd; int move_group = 0; int err; int f_flags = O_RDWR; int cgroup_fd = -1; /* for future expandability... 
*/ if (flags & ~PERF_FLAG_ALL) return -EINVAL; err = perf_copy_attr(attr_uptr, &attr); if (err) return err; /* Do we allow access to perf_event_open(2) ? */ err = security_perf_event_open(PERF_SECURITY_OPEN); if (err) return err; if (!attr.exclude_kernel) { err = perf_allow_kernel(); if (err) return err; } if (attr.namespaces) { if (!perfmon_capable()) return -EACCES; } if (attr.freq) { if (attr.sample_freq > sysctl_perf_event_sample_rate) return -EINVAL; } else { if (attr.sample_period & (1ULL << 63)) return -EINVAL; } /* Only privileged users can get physical addresses */ if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) { err = perf_allow_kernel(); if (err) return err; } /* REGS_INTR can leak data, lockdown must prevent this */ if (attr.sample_type & PERF_SAMPLE_REGS_INTR) { err = security_locked_down(LOCKDOWN_PERF); if (err) return err; } /* * In cgroup mode, the pid argument is used to pass the fd * opened to the cgroup directory in cgroupfs. The cpu argument * designates the cpu on which to monitor threads from that * cgroup. */ if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) return -EINVAL; if (flags & PERF_FLAG_FD_CLOEXEC) f_flags |= O_CLOEXEC; event_fd = get_unused_fd_flags(f_flags); if (event_fd < 0) return event_fd; /* * Event creation should be under SRCU, see perf_pmu_unregister(). */ guard(srcu)(&pmus_srcu); CLASS(fd, group)(group_fd); // group_fd == -1 => empty if (group_fd != -1) { if (!is_perf_file(group)) { err = -EBADF; goto err_fd; } group_leader = fd_file(group)->private_data; if (group_leader->state <= PERF_EVENT_STATE_REVOKED) { err = -ENODEV; goto err_fd; } if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; if (flags & PERF_FLAG_FD_NO_GROUP) group_leader = NULL; } if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { task = find_lively_task_by_vpid(pid); if (IS_ERR(task)) { err = PTR_ERR(task); goto err_fd; } } if (task && group_leader && group_leader->attr.inherit != attr.inherit) { err = -EINVAL; goto err_task; } if (flags & PERF_FLAG_PID_CGROUP) cgroup_fd = pid; event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL, NULL, cgroup_fd); if (IS_ERR(event)) { err = PTR_ERR(event); goto err_task; } if (is_sampling_event(event)) { if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { err = -EOPNOTSUPP; goto err_alloc; } } /* * Special case software events and allow them to be part of * any hardware group. */ pmu = event->pmu; if (attr.use_clockid) { err = perf_event_set_clock(event, attr.clockid); if (err) goto err_alloc; } if (pmu->task_ctx_nr == perf_sw_context) event->event_caps |= PERF_EV_CAP_SOFTWARE; if (task) { err = down_read_interruptible(&task->signal->exec_update_lock); if (err) goto err_alloc; /* * We must hold exec_update_lock across this and any potential * perf_install_in_context() call for this new event to * serialize against exec() altering our credentials (and the * perf_event_exit_task() that could imply). */ err = -EACCES; if (!perf_check_permission(&attr, task)) goto err_cred; } /* * Get the target context (task or percpu): */ ctx = find_get_context(task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_cred; } mutex_lock(&ctx->mutex); if (ctx->task == TASK_TOMBSTONE) { err = -ESRCH; goto err_locked; } if (!task) { /* * Check if the @cpu we're creating an event for is online. * * We use the perf_cpu_context::ctx::mutex to serialize against * the hotplug notifiers. See perf_event_{init,exit}_cpu(). 
*/ struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu); if (!cpuctx->online) { err = -ENODEV; goto err_locked; } } if (group_leader) { err = -EINVAL; /* * Do not allow a recursive hierarchy (this new sibling * becoming part of another group-sibling): */ if (group_leader->group_leader != group_leader) goto err_locked; /* All events in a group should have the same clock */ if (group_leader->clock != event->clock) goto err_locked; /* * Make sure we're both events for the same CPU; * grouping events for different CPUs is broken; since * you can never concurrently schedule them anyhow. */ if (group_leader->cpu != event->cpu) goto err_locked; /* * Make sure we're both on the same context; either task or cpu. */ if (group_leader->ctx != ctx) goto err_locked; /* * Only a group leader can be exclusive or pinned */ if (attr.exclusive || attr.pinned) goto err_locked; if (is_software_event(event) && !in_software_context(group_leader)) { /* * If the event is a sw event, but the group_leader * is on hw context. * * Allow the addition of software events to hw * groups, this is safe because software events * never fail to schedule. * * Note the comment that goes with struct * perf_event_pmu_context. */ pmu = group_leader->pmu_ctx->pmu; } else if (!is_software_event(event)) { if (is_software_event(group_leader) && (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { /* * In case the group is a pure software group, and we * try to add a hardware event, move the whole group to * the hardware context. */ move_group = 1; } /* Don't allow group of multiple hw events from different pmus */ if (!in_software_context(group_leader) && group_leader->pmu_ctx->pmu != pmu) goto err_locked; } } /* * Now that we're certain of the pmu; find the pmu_ctx. */ pmu_ctx = find_get_pmu_context(pmu, ctx, event); if (IS_ERR(pmu_ctx)) { err = PTR_ERR(pmu_ctx); goto err_locked; } event->pmu_ctx = pmu_ctx; if (output_event) { err = perf_event_set_output(event, output_event); if (err) goto err_context; } if (!perf_event_validate_size(event)) { err = -E2BIG; goto err_context; } if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) { err = -EINVAL; goto err_context; } /* * Must be under the same ctx::mutex as perf_install_in_context(), * because we need to serialize with concurrent event creation. */ if (!exclusive_event_installable(event, ctx)) { err = -EBUSY; goto err_context; } WARN_ON_ONCE(ctx->parent_ctx); event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags); if (IS_ERR(event_file)) { err = PTR_ERR(event_file); event_file = NULL; goto err_context; } /* * This is the point on no return; we cannot fail hereafter. This is * where we start modifying current state. */ if (move_group) { perf_remove_from_context(group_leader, 0); put_pmu_ctx(group_leader->pmu_ctx); for_each_sibling_event(sibling, group_leader) { perf_remove_from_context(sibling, 0); put_pmu_ctx(sibling->pmu_ctx); } /* * Install the group siblings before the group leader. * * Because a group leader will try and install the entire group * (through the sibling list, which is still in-tact), we can * end up with siblings installed in the wrong context. * * By installing siblings first we NO-OP because they're not * reachable through the group lists. */ for_each_sibling_event(sibling, group_leader) { sibling->pmu_ctx = pmu_ctx; get_pmu_ctx(pmu_ctx); perf_event__state_init(sibling); perf_install_in_context(ctx, sibling, sibling->cpu); } /* * Removing from the context ends up with disabled * event. 
What we want here is event in the initial * startup state, ready to be add into new context. */ group_leader->pmu_ctx = pmu_ctx; get_pmu_ctx(pmu_ctx); perf_event__state_init(group_leader); perf_install_in_context(ctx, group_leader, group_leader->cpu); } /* * Precalculate sample_data sizes; do while holding ctx::mutex such * that we're serialized against further additions and before * perf_install_in_context() which is the point the event is active and * can use these values. */ perf_event__header_size(event); perf_event__id_header_size(event); event->owner = current; perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); if (task) { up_read(&task->signal->exec_update_lock); put_task_struct(task); } mutex_lock(¤t->perf_event_mutex); list_add_tail(&event->owner_entry, ¤t->perf_event_list); mutex_unlock(¤t->perf_event_mutex); /* * File reference in group guarantees that group_leader has been * kept alive until we place the new event on the sibling_list. * This ensures destruction of the group leader will find * the pointer to itself in perf_group_detach(). */ fd_install(event_fd, event_file); return event_fd; err_context: put_pmu_ctx(event->pmu_ctx); event->pmu_ctx = NULL; /* _free_event() */ err_locked: mutex_unlock(&ctx->mutex); perf_unpin_context(ctx); put_ctx(ctx); err_cred: if (task) up_read(&task->signal->exec_update_lock); err_alloc: put_event(event); err_task: if (task) put_task_struct(task); err_fd: put_unused_fd(event_fd); return err; } /** * perf_event_create_kernel_counter * * @attr: attributes of the counter to create * @cpu: cpu in which the counter is bound * @task: task to profile (NULL for percpu) * @overflow_handler: callback to trigger when we hit the event * @context: context data could be used in overflow_handler callback */ struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct task_struct *task, perf_overflow_handler_t overflow_handler, void *context) { struct perf_event_pmu_context *pmu_ctx; struct perf_event_context *ctx; struct perf_event *event; struct pmu *pmu; int err; /* * Grouping is not supported for kernel events, neither is 'AUX', * make sure the caller's intentions are adjusted. */ if (attr->aux_output || attr->aux_action) return ERR_PTR(-EINVAL); /* * Event creation should be under SRCU, see perf_pmu_unregister(). */ guard(srcu)(&pmus_srcu); event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler, context, -1); if (IS_ERR(event)) { err = PTR_ERR(event); goto err; } /* Mark owner so we could distinguish it from user events. */ event->owner = TASK_TOMBSTONE; pmu = event->pmu; if (pmu->task_ctx_nr == perf_sw_context) event->event_caps |= PERF_EV_CAP_SOFTWARE; /* * Get the target context (task or percpu): */ ctx = find_get_context(task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_alloc; } WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); if (ctx->task == TASK_TOMBSTONE) { err = -ESRCH; goto err_unlock; } pmu_ctx = find_get_pmu_context(pmu, ctx, event); if (IS_ERR(pmu_ctx)) { err = PTR_ERR(pmu_ctx); goto err_unlock; } event->pmu_ctx = pmu_ctx; if (!task) { /* * Check if the @cpu we're creating an event for is online. * * We use the perf_cpu_context::ctx::mutex to serialize against * the hotplug notifiers. See perf_event_{init,exit}_cpu(). 
*/ struct perf_cpu_context *cpuctx = container_of(ctx, struct perf_cpu_context, ctx); if (!cpuctx->online) { err = -ENODEV; goto err_pmu_ctx; } } if (!exclusive_event_installable(event, ctx)) { err = -EBUSY; goto err_pmu_ctx; } perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); return event; err_pmu_ctx: put_pmu_ctx(pmu_ctx); event->pmu_ctx = NULL; /* _free_event() */ err_unlock: mutex_unlock(&ctx->mutex); perf_unpin_context(ctx); put_ctx(ctx); err_alloc: put_event(event); err: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); static void __perf_pmu_remove(struct perf_event_context *ctx, int cpu, struct pmu *pmu, struct perf_event_groups *groups, struct list_head *events) { struct perf_event *event, *sibling; perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) { perf_remove_from_context(event, 0); put_pmu_ctx(event->pmu_ctx); list_add(&event->migrate_entry, events); for_each_sibling_event(sibling, event) { perf_remove_from_context(sibling, 0); put_pmu_ctx(sibling->pmu_ctx); list_add(&sibling->migrate_entry, events); } } } static void __perf_pmu_install_event(struct pmu *pmu, struct perf_event_context *ctx, int cpu, struct perf_event *event) { struct perf_event_pmu_context *epc; struct perf_event_context *old_ctx = event->ctx; get_ctx(ctx); /* normally find_get_context() */ event->cpu = cpu; epc = find_get_pmu_context(pmu, ctx, event); event->pmu_ctx = epc; if (event->state >= PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_INACTIVE; perf_install_in_context(ctx, event, cpu); /* * Now that event->ctx is updated and visible, put the old ctx. */ put_ctx(old_ctx); } static void __perf_pmu_install(struct perf_event_context *ctx, int cpu, struct pmu *pmu, struct list_head *events) { struct perf_event *event, *tmp; /* * Re-instate events in 2 passes. * * Skip over group leaders and only install siblings on this first * pass, siblings will not get enabled without a leader, however a * leader will enable its siblings, even if those are still on the old * context. */ list_for_each_entry_safe(event, tmp, events, migrate_entry) { if (event->group_leader == event) continue; list_del(&event->migrate_entry); __perf_pmu_install_event(pmu, ctx, cpu, event); } /* * Once all the siblings are setup properly, install the group leaders * to make it go. */ list_for_each_entry_safe(event, tmp, events, migrate_entry) { list_del(&event->migrate_entry); __perf_pmu_install_event(pmu, ctx, cpu, event); } } void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) { struct perf_event_context *src_ctx, *dst_ctx; LIST_HEAD(events); /* * Since per-cpu context is persistent, no need to grab an extra * reference. */ src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx; dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx; /* * See perf_event_ctx_lock() for comments on the details * of swizzling perf_event::ctx. */ mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events); __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events); if (!list_empty(&events)) { /* * Wait for the events to quiesce before re-instating them. 
*/ synchronize_rcu(); __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events); } mutex_unlock(&dst_ctx->mutex); mutex_unlock(&src_ctx->mutex); } EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); static void sync_child_event(struct perf_event *child_event, struct task_struct *task) { struct perf_event *parent_event = child_event->parent; u64 child_val; if (child_event->attr.inherit_stat) { if (task && task != TASK_TOMBSTONE) perf_event_read_event(child_event, task); } child_val = perf_event_count(child_event, false); /* * Add back the child's count to the parent's count: */ atomic64_add(child_val, &parent_event->child_count); atomic64_add(child_event->total_time_enabled, &parent_event->child_total_time_enabled); atomic64_add(child_event->total_time_running, &parent_event->child_total_time_running); } static void perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx, struct task_struct *task, bool revoke) { struct perf_event *parent_event = event->parent; unsigned long detach_flags = DETACH_EXIT; unsigned int attach_state; if (parent_event) { /* * Do not destroy the 'original' grouping; because of the * context switch optimization the original events could've * ended up in a random child task. * * If we were to destroy the original group, all group related * operations would cease to function properly after this * random child dies. * * Do destroy all inherited groups, we don't care about those * and being thorough is better. */ detach_flags |= DETACH_GROUP | DETACH_CHILD; mutex_lock(&parent_event->child_mutex); /* PERF_ATTACH_ITRACE might be set concurrently */ attach_state = READ_ONCE(event->attach_state); if (attach_state & PERF_ATTACH_CHILD) sync_child_event(event, task); } if (revoke) detach_flags |= DETACH_GROUP | DETACH_REVOKE; perf_remove_from_context(event, detach_flags); /* * Child events can be freed. */ if (parent_event) { mutex_unlock(&parent_event->child_mutex); /* * Match the refcount initialization. Make sure it doesn't happen * twice if pmu_detach_event() calls it on an already exited task. */ if (attach_state & PERF_ATTACH_CHILD) { /* * Kick perf_poll() for is_event_hup(); */ perf_event_wakeup(parent_event); /* * pmu_detach_event() will have an extra refcount. * perf_pending_task() might have one too. */ put_event(event); } return; } /* * Parent events are governed by their filedesc, retain them. */ perf_event_wakeup(event); } static void perf_event_exit_task_context(struct task_struct *task, bool exit) { struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_event *child_event, *next; ctx = perf_pin_task_context(task); if (!ctx) return; /* * In order to reduce the amount of tricky in ctx tear-down, we hold * ctx::mutex over the entire thing. This serializes against almost * everything that wants to access the ctx. * * The exception is sys_perf_event_open() / * perf_event_create_kernel_count() which does find_get_context() * without ctx::mutex (it cannot because of the move_group double mutex * lock thing). See the comments in perf_install_in_context(). */ mutex_lock(&ctx->mutex); /* * In a single ctx::lock section, de-schedule the events and detach the * context from the task such that we cannot ever get it scheduled back * in. */ raw_spin_lock_irq(&ctx->lock); if (exit) task_ctx_sched_out(ctx, NULL, EVENT_ALL); /* * Now that the context is inactive, destroy the task <-> ctx relation * and mark the context dead. 
*/ RCU_INIT_POINTER(task->perf_event_ctxp, NULL); put_ctx(ctx); /* cannot be last */ WRITE_ONCE(ctx->task, TASK_TOMBSTONE); put_task_struct(task); /* cannot be last */ clone_ctx = unclone_ctx(ctx); raw_spin_unlock_irq(&ctx->lock); if (clone_ctx) put_ctx(clone_ctx); /* * Report the task dead after unscheduling the events so that we * won't get any samples after PERF_RECORD_EXIT. We can however still * get a few PERF_RECORD_READ events. */ if (exit) perf_event_task(task, ctx, 0); list_for_each_entry_safe(child_event, next, &ctx->event_list, event_entry) perf_event_exit_event(child_event, ctx, exit ? task : NULL, false); mutex_unlock(&ctx->mutex); if (!exit) { /* * perf_event_release_kernel() could still have a reference on * this context. In that case we must wait for these events to * have been freed (in particular all their references to this * task must've been dropped). * * Without this copy_process() will unconditionally free this * task (irrespective of its reference count) and * _free_event()'s put_task_struct(event->hw.target) will be a * use-after-free. * * Wait for all events to drop their context reference. */ wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1); } put_ctx(ctx); } /* * When a task exits, feed back event values to parent events. * * Can be called with exec_update_lock held when called from * setup_new_exec(). */ void perf_event_exit_task(struct task_struct *task) { struct perf_event *event, *tmp; WARN_ON_ONCE(task != current); mutex_lock(&task->perf_event_mutex); list_for_each_entry_safe(event, tmp, &task->perf_event_list, owner_entry) { list_del_init(&event->owner_entry); /* * Ensure the list deletion is visible before we clear * the owner, closes a race against perf_release() where * we need to serialize on the owner->perf_event_mutex. */ smp_store_release(&event->owner, NULL); } mutex_unlock(&task->perf_event_mutex); perf_event_exit_task_context(task, true); /* * The perf_event_exit_task_context calls perf_event_task * with task's task_ctx, which generates EXIT events for * task contexts and sets task->perf_event_ctxp[] to NULL. * At this point we need to send EXIT events to cpu contexts. */ perf_event_task(task, NULL, 0); /* * Detach the perf_ctx_data for the system-wide event. * * Done without holding global_ctx_data_rwsem; typically * attach_global_ctx_data() will skip over this task, but otherwise * attach_task_ctx_data() will observe PF_EXITING. */ detach_task_ctx_data(task); } /* * Free a context as created by inheritance by perf_event_init_task() below, * used by fork() in case of fail. * * Even though the task has never lived, the context and events have been * exposed through the child_list, so we must take care tearing it all down. 
*/ void perf_event_free_task(struct task_struct *task) { perf_event_exit_task_context(task, false); } void perf_event_delayed_put(struct task_struct *task) { WARN_ON_ONCE(task->perf_event_ctxp); } struct file *perf_event_get(unsigned int fd) { struct file *file = fget(fd); if (!file) return ERR_PTR(-EBADF); if (file->f_op != &perf_fops) { fput(file); return ERR_PTR(-EBADF); } return file; } const struct perf_event *perf_get_event(struct file *file) { if (file->f_op != &perf_fops) return ERR_PTR(-EINVAL); return file->private_data; } const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { if (!event) return ERR_PTR(-EINVAL); return &event->attr; } int perf_allow_kernel(void) { if (sysctl_perf_event_paranoid > 1 && !perfmon_capable()) return -EACCES; return security_perf_event_open(PERF_SECURITY_KERNEL); } EXPORT_SYMBOL_GPL(perf_allow_kernel); /* * Inherit an event from parent task to child task. * * Returns: * - valid pointer on success * - NULL for orphaned events * - IS_ERR() on error */ static struct perf_event * inherit_event(struct perf_event *parent_event, struct task_struct *parent, struct perf_event_context *parent_ctx, struct task_struct *child, struct perf_event *group_leader, struct perf_event_context *child_ctx) { enum perf_event_state parent_state = parent_event->state; struct perf_event_pmu_context *pmu_ctx; struct perf_event *child_event; unsigned long flags; /* * Instead of creating recursive hierarchies of events, * we link inherited events back to the original parent, * which has a filp for sure, which we use as the reference * count: */ if (parent_event->parent) parent_event = parent_event->parent; if (parent_event->state <= PERF_EVENT_STATE_REVOKED) return NULL; /* * Event creation should be under SRCU, see perf_pmu_unregister(). */ guard(srcu)(&pmus_srcu); child_event = perf_event_alloc(&parent_event->attr, parent_event->cpu, child, group_leader, parent_event, NULL, NULL, -1); if (IS_ERR(child_event)) return child_event; get_ctx(child_ctx); child_event->ctx = child_ctx; pmu_ctx = find_get_pmu_context(parent_event->pmu_ctx->pmu, child_ctx, child_event); if (IS_ERR(pmu_ctx)) { free_event(child_event); return ERR_CAST(pmu_ctx); } child_event->pmu_ctx = pmu_ctx; /* * is_orphaned_event() and list_add_tail(&parent_event->child_list) * must be under the same lock in order to serialize against * perf_event_release_kernel(), such that either we must observe * is_orphaned_event() or they will observe us on the child_list. */ mutex_lock(&parent_event->child_mutex); if (is_orphaned_event(parent_event) || !atomic_long_inc_not_zero(&parent_event->refcount)) { mutex_unlock(&parent_event->child_mutex); free_event(child_event); return NULL; } /* * Make the child state follow the state of the parent event, * not its attr.disabled bit. We hold the parent's mutex, * so we won't race with perf_event_{en, dis}able_family. 
*/ if (parent_state >= PERF_EVENT_STATE_INACTIVE) child_event->state = PERF_EVENT_STATE_INACTIVE; else child_event->state = PERF_EVENT_STATE_OFF; if (parent_event->attr.freq) { u64 sample_period = parent_event->hw.sample_period; struct hw_perf_event *hwc = &child_event->hw; hwc->sample_period = sample_period; hwc->last_period = sample_period; local64_set(&hwc->period_left, sample_period); } child_event->overflow_handler = parent_event->overflow_handler; child_event->overflow_handler_context = parent_event->overflow_handler_context; /* * Precalculate sample_data sizes */ perf_event__header_size(child_event); perf_event__id_header_size(child_event); /* * Link it up in the child's context: */ raw_spin_lock_irqsave(&child_ctx->lock, flags); add_event_to_ctx(child_event, child_ctx); child_event->attach_state |= PERF_ATTACH_CHILD; raw_spin_unlock_irqrestore(&child_ctx->lock, flags); /* * Link this into the parent event's child list */ list_add_tail(&child_event->child_list, &parent_event->child_list); mutex_unlock(&parent_event->child_mutex); return child_event; } /* * Inherits an event group. * * This will quietly suppress orphaned events; !inherit_event() is not an error. * This matches with perf_event_release_kernel() removing all child events. * * Returns: * - 0 on success * - <0 on error */ static int inherit_group(struct perf_event *parent_event, struct task_struct *parent, struct perf_event_context *parent_ctx, struct task_struct *child, struct perf_event_context *child_ctx) { struct perf_event *leader; struct perf_event *sub; struct perf_event *child_ctr; leader = inherit_event(parent_event, parent, parent_ctx, child, NULL, child_ctx); if (IS_ERR(leader)) return PTR_ERR(leader); /* * @leader can be NULL here because of is_orphaned_event(). In this * case inherit_event() will create individual events, similar to what * perf_group_detach() would do anyway. */ for_each_sibling_event(sub, parent_event) { child_ctr = inherit_event(sub, parent, parent_ctx, child, leader, child_ctx); if (IS_ERR(child_ctr)) return PTR_ERR(child_ctr); if (sub->aux_event == parent_event && child_ctr && !perf_get_aux_event(child_ctr, leader)) return -EINVAL; } if (leader) leader->group_generation = parent_event->group_generation; return 0; } /* * Creates the child task context and tries to inherit the event-group. * * Clears @inherited_all on !attr.inherited or error. Note that we'll leave * inherited_all set when we 'fail' to inherit an orphaned event; this is * consistent with perf_event_release_kernel() removing all child events. * * Returns: * - 0 on success * - <0 on error */ static int inherit_task_group(struct perf_event *event, struct task_struct *parent, struct perf_event_context *parent_ctx, struct task_struct *child, u64 clone_flags, int *inherited_all) { struct perf_event_context *child_ctx; int ret; if (!event->attr.inherit || (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) || /* Do not inherit if sigtrap and signal handlers were cleared. */ (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))) { *inherited_all = 0; return 0; } child_ctx = child->perf_event_ctxp; if (!child_ctx) { /* * This is executed from the parent task context, so * inherit events that have been marked for cloning. * First allocate and initialize a context for the * child. 
*/ child_ctx = alloc_perf_context(child); if (!child_ctx) return -ENOMEM; child->perf_event_ctxp = child_ctx; } ret = inherit_group(event, parent, parent_ctx, child, child_ctx); if (ret) *inherited_all = 0; return ret; } /* * Initialize the perf_event context in task_struct */ static int perf_event_init_context(struct task_struct *child, u64 clone_flags) { struct perf_event_context *child_ctx, *parent_ctx; struct perf_event_context *cloned_ctx; struct perf_event *event; struct task_struct *parent = current; int inherited_all = 1; unsigned long flags; int ret = 0; if (likely(!parent->perf_event_ctxp)) return 0; /* * If the parent's context is a clone, pin it so it won't get * swapped under us. */ parent_ctx = perf_pin_task_context(parent); if (!parent_ctx) return 0; /* * No need to check if parent_ctx != NULL here; since we saw * it non-NULL earlier, the only reason for it to become NULL * is if we exit, and since we're currently in the middle of * a fork we can't be exiting at the same time. */ /* * Lock the parent list. No need to lock the child - not PID * hashed yet and not running, so nobody can access it. */ mutex_lock(&parent_ctx->mutex); /* * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ perf_event_groups_for_each(event, &parent_ctx->pinned_groups) { ret = inherit_task_group(event, parent, parent_ctx, child, clone_flags, &inherited_all); if (ret) goto out_unlock; } /* * We can't hold ctx->lock when iterating the ->flexible_group list due * to allocations, but we need to prevent rotation because * rotate_ctx() will change the list from interrupt context. */ raw_spin_lock_irqsave(&parent_ctx->lock, flags); parent_ctx->rotate_disable = 1; raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); perf_event_groups_for_each(event, &parent_ctx->flexible_groups) { ret = inherit_task_group(event, parent, parent_ctx, child, clone_flags, &inherited_all); if (ret) goto out_unlock; } raw_spin_lock_irqsave(&parent_ctx->lock, flags); parent_ctx->rotate_disable = 0; child_ctx = child->perf_event_ctxp; if (child_ctx && inherited_all) { /* * Mark the child context as a clone of the parent * context, or of whatever the parent is a clone of. * * Note that if the parent is a clone, the holding of * parent_ctx->lock avoids it from being uncloned. 
*/ cloned_ctx = parent_ctx->parent_ctx; if (cloned_ctx) { child_ctx->parent_ctx = cloned_ctx; child_ctx->parent_gen = parent_ctx->parent_gen; } else { child_ctx->parent_ctx = parent_ctx; child_ctx->parent_gen = parent_ctx->generation; } get_ctx(child_ctx->parent_ctx); } raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); out_unlock: mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); put_ctx(parent_ctx); return ret; } /* * Initialize the perf_event context in task_struct */ int perf_event_init_task(struct task_struct *child, u64 clone_flags) { int ret; memset(child->perf_recursion, 0, sizeof(child->perf_recursion)); child->perf_event_ctxp = NULL; mutex_init(&child->perf_event_mutex); INIT_LIST_HEAD(&child->perf_event_list); child->perf_ctx_data = NULL; ret = perf_event_init_context(child, clone_flags); if (ret) { perf_event_free_task(child); return ret; } return 0; } static void __init perf_event_init_all_cpus(void) { struct swevent_htable *swhash; struct perf_cpu_context *cpuctx; int cpu; zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL); zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL); zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL); zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL); zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL); zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL); for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); __perf_event_init_context(&cpuctx->ctx); lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default); cpuctx->heap = cpuctx->heap_default; } } static void perf_swevent_init_cpu(unsigned int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) { struct swevent_hlist *hlist; hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); WARN_ON(!hlist); rcu_assign_pointer(swhash->swevent_hlist, hlist); } mutex_unlock(&swhash->hlist_mutex); } #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE static void __perf_event_exit_context(void *__info) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *ctx = __info; struct perf_event *event; raw_spin_lock(&ctx->lock); ctx_sched_out(ctx, NULL, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); raw_spin_unlock(&ctx->lock); } static void perf_event_clear_cpumask(unsigned int cpu) { int target[PERF_PMU_MAX_SCOPE]; unsigned int scope; struct pmu *pmu; cpumask_clear_cpu(cpu, perf_online_mask); for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu); struct cpumask *pmu_cpumask = perf_scope_cpumask(scope); target[scope] = -1; if (WARN_ON_ONCE(!pmu_cpumask || !cpumask)) continue; if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask)) continue; target[scope] = cpumask_any_but(cpumask, cpu); if (target[scope] < nr_cpu_ids) cpumask_set_cpu(target[scope], pmu_cpumask); } /* migrate */ list_for_each_entry(pmu, &pmus, entry) { if 
(pmu->scope == PERF_PMU_SCOPE_NONE || WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE)) continue; if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids) perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]); } } static void perf_event_exit_cpu_context(int cpu) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; // XXX simplify cpuctx->online mutex_lock(&pmus_lock); /* * Clear the cpumasks, and migrate to other CPUs if possible. * Must be invoked before the __perf_event_exit_context. */ perf_event_clear_cpumask(cpu); cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); ctx = &cpuctx->ctx; mutex_lock(&ctx->mutex); if (ctx->nr_events) smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); cpuctx->online = 0; mutex_unlock(&ctx->mutex); mutex_unlock(&pmus_lock); } #else static void perf_event_exit_cpu_context(int cpu) { } #endif static void perf_event_setup_cpumask(unsigned int cpu) { struct cpumask *pmu_cpumask; unsigned int scope; /* * Early boot stage, the cpumask hasn't been set yet. * The perf_online_<domain>_masks includes the first CPU of each domain. * Always unconditionally set the boot CPU for the perf_online_<domain>_masks. */ if (cpumask_empty(perf_online_mask)) { for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { pmu_cpumask = perf_scope_cpumask(scope); if (WARN_ON_ONCE(!pmu_cpumask)) continue; cpumask_set_cpu(cpu, pmu_cpumask); } goto end; } for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu); pmu_cpumask = perf_scope_cpumask(scope); if (WARN_ON_ONCE(!pmu_cpumask || !cpumask)) continue; if (!cpumask_empty(cpumask) && cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids) cpumask_set_cpu(cpu, pmu_cpumask); } end: cpumask_set_cpu(cpu, perf_online_mask); } int perf_event_init_cpu(unsigned int cpu) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; perf_swevent_init_cpu(cpu); mutex_lock(&pmus_lock); perf_event_setup_cpumask(cpu); cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); ctx = &cpuctx->ctx; mutex_lock(&ctx->mutex); cpuctx->online = 1; mutex_unlock(&ctx->mutex); mutex_unlock(&pmus_lock); return 0; } int perf_event_exit_cpu(unsigned int cpu) { perf_event_exit_cpu_context(cpu); return 0; } static int perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) { int cpu; for_each_online_cpu(cpu) perf_event_exit_cpu(cpu); return NOTIFY_OK; } /* * Run the perf reboot notifier at the very last possible moment so that * the generic watchdog code runs as long as possible. */ static struct notifier_block perf_reboot_notifier = { .notifier_call = perf_reboot, .priority = INT_MIN, }; void __init perf_event_init(void) { int ret; idr_init(&pmu_idr); unwind_deferred_init(&perf_unwind_work, perf_unwind_deferred_callback); perf_event_init_all_cpus(); init_srcu_struct(&pmus_srcu); perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); perf_pmu_register(&perf_cpu_clock, "cpu_clock", -1); perf_pmu_register(&perf_task_clock, "task_clock", -1); perf_tp_register(); perf_event_init_cpu(smp_processor_id()); register_reboot_notifier(&perf_reboot_notifier); ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); perf_event_cache = KMEM_CACHE(perf_event, SLAB_PANIC); /* * Build time assertion that we keep the data_head at the intended * location. IOW, validation we got the __reserved[] size right. 
*/ BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head)) != 1024); } ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_attr *pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); if (pmu_attr->event_str) return sprintf(page, "%s\n", pmu_attr->event_str); return 0; } EXPORT_SYMBOL_GPL(perf_event_sysfs_show); static int __init perf_event_sysfs_init(void) { struct pmu *pmu; int ret; mutex_lock(&pmus_lock); ret = bus_register(&pmu_bus); if (ret) goto unlock; list_for_each_entry(pmu, &pmus, entry) { if (pmu->dev) continue; ret = pmu_dev_alloc(pmu); WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret); } pmu_bus_running = 1; ret = 0; unlock: mutex_unlock(&pmus_lock); return ret; } device_initcall(perf_event_sysfs_init); #ifdef CONFIG_CGROUP_PERF static struct cgroup_subsys_state * perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct perf_cgroup *jc; jc = kzalloc_obj(*jc); if (!jc) return ERR_PTR(-ENOMEM); jc->info = alloc_percpu(struct perf_cgroup_info); if (!jc->info) { kfree(jc); return ERR_PTR(-ENOMEM); } return &jc->css; } static void perf_cgroup_css_free(struct cgroup_subsys_state *css) { struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css); free_percpu(jc->info); kfree(jc); } static int perf_cgroup_css_online(struct cgroup_subsys_state *css) { perf_event_cgroup(css->cgroup); return 0; } static int __perf_cgroup_move(void *info) { struct task_struct *task = info; preempt_disable(); perf_cgroup_switch(task); preempt_enable(); return 0; } static void perf_cgroup_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *css; cgroup_taskset_for_each(task, css, tset) task_function_call(task, __perf_cgroup_move, task); } struct cgroup_subsys perf_event_cgrp_subsys = { .css_alloc = perf_cgroup_css_alloc, .css_free = perf_cgroup_css_free, .css_online = perf_cgroup_css_online, .attach = perf_cgroup_attach, /* * Implicitly enable on dfl hierarchy so that perf events can * always be filtered by cgroup2 path as long as perf_event * controller is not mounted on a legacy hierarchy. */ .implicit_on_dfl = true, .threaded = true, }; #endif /* CONFIG_CGROUP_PERF */ DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t); |
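/*
 * Editor's illustrative sketch, not part of the perf core code above: a
 * minimal user-space caller of the sys_perf_event_open() path implemented in
 * this file. It counts retired instructions for the calling thread using the
 * raw syscall(2) invocation, since libc provides no wrapper. All identifiers
 * are standard uapi/libc names except the local perf_event_open() helper and
 * main(), which exist only for this example.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                            int cpu, int group_fd, unsigned long flags)
{
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
        struct perf_event_attr attr;
        long long count;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);       /* versioned ABI, see perf_copy_attr() */
        attr.type = PERF_TYPE_HARDWARE;
        attr.config = PERF_COUNT_HW_INSTRUCTIONS;
        attr.disabled = 1;              /* start stopped, enable via ioctl */
        attr.exclude_kernel = 1;
        attr.exclude_hv = 1;

        /* current thread (pid 0), any CPU (-1), no group leader */
        fd = perf_event_open(&attr, 0, -1, -1, PERF_FLAG_FD_CLOEXEC);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        ioctl(fd, PERF_EVENT_IOC_RESET, 0);
        ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);

        printf("measuring this printf call\n");

        ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
        if (read(fd, &count, sizeof(count)) == sizeof(count))
                printf("instructions: %lld\n", count);

        close(fd);
        return 0;
}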
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Common values and helper functions for the ChaCha and XChaCha stream ciphers.
 *
 * XChaCha extends ChaCha's nonce to 192 bits, while provably retaining ChaCha's
 * security. Here they share the same key size, tfm context, and setkey
 * function; only their IV size and encrypt/decrypt function differ.
 *
 * The ChaCha paper specifies 20, 12, and 8-round variants. In general, it is
 * recommended to use the 20-round variant ChaCha20. However, the other
 * variants can be needed in some performance-sensitive scenarios. The generic
 * ChaCha code currently allows only the 20 and 12-round variants.
 */

#ifndef _CRYPTO_CHACHA_H
#define _CRYPTO_CHACHA_H

#include <linux/unaligned.h>
#include <linux/string.h>
#include <linux/types.h>

/* 32-bit stream position, then 96-bit nonce (RFC7539 convention) */
#define CHACHA_IV_SIZE          16

#define CHACHA_KEY_SIZE         32
#define CHACHA_BLOCK_SIZE       64
#define CHACHAPOLY_IV_SIZE      12

#define CHACHA_KEY_WORDS        8
#define CHACHA_STATE_WORDS      16
#define HCHACHA_OUT_WORDS       8

/* 192-bit nonce, then 64-bit stream position */
#define XCHACHA_IV_SIZE         32

struct chacha_state {
        u32 x[CHACHA_STATE_WORDS];
};

void chacha_block_generic(struct chacha_state *state,
                          u8 out[at_least CHACHA_BLOCK_SIZE], int nrounds);

static inline void chacha20_block(struct chacha_state *state,
                                  u8 out[at_least CHACHA_BLOCK_SIZE])
{
        chacha_block_generic(state, out, 20);
}

void hchacha_block_generic(const struct chacha_state *state,
                           u32 out[at_least HCHACHA_OUT_WORDS], int nrounds);
void hchacha_block(const struct chacha_state *state,
                   u32 out[at_least HCHACHA_OUT_WORDS], int nrounds);

enum chacha_constants { /* expand 32-byte k */
        CHACHA_CONSTANT_EXPA = 0x61707865U,
        CHACHA_CONSTANT_ND_3 = 0x3320646eU,
        CHACHA_CONSTANT_2_BY = 0x79622d32U,
        CHACHA_CONSTANT_TE_K = 0x6b206574U
};

static inline void chacha_init_consts(struct chacha_state *state)
{
        state->x[0] = CHACHA_CONSTANT_EXPA;
        state->x[1] = CHACHA_CONSTANT_ND_3;
        state->x[2] = CHACHA_CONSTANT_2_BY;
        state->x[3] = CHACHA_CONSTANT_TE_K;
}

static inline void chacha_init(struct chacha_state *state,
                               const u32 key[at_least CHACHA_KEY_WORDS],
                               const u8 iv[at_least CHACHA_IV_SIZE])
{
        chacha_init_consts(state);
        state->x[4]  = key[0];
        state->x[5]  = key[1];
        state->x[6]  = key[2];
        state->x[7]  = key[3];
        state->x[8]  = key[4];
        state->x[9]  = key[5];
        state->x[10] = key[6];
        state->x[11] = key[7];
        state->x[12] = get_unaligned_le32(iv + 0);
        state->x[13] = get_unaligned_le32(iv + 4);
        state->x[14] = get_unaligned_le32(iv + 8);
        state->x[15] = get_unaligned_le32(iv + 12);
}

void chacha_crypt(struct chacha_state *state, u8 *dst, const u8 *src,
                  unsigned int bytes, int nrounds);

static inline void chacha20_crypt(struct chacha_state *state, u8 *dst,
                                  const u8 *src, unsigned int bytes)
{
        chacha_crypt(state, dst, src, bytes, 20);
}

static inline void chacha_zeroize_state(struct chacha_state *state)
{
        memzero_explicit(state, sizeof(*state));
}

#endif /* _CRYPTO_CHACHA_H */
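/*
 * Editor's illustrative sketch, not part of the header above: minimal use of
 * the library interface declared in <crypto/chacha.h>. The key is supplied as
 * bytes and converted to the u32 words chacha_init() expects; the IV layout is
 * the RFC7539 one noted above (32-bit stream position followed by the 96-bit
 * nonce). The function name example_chacha20_encrypt is made up for this
 * sketch; everything else comes from the header or the headers it pulls in.
 */
static void example_chacha20_encrypt(u8 *dst, const u8 *src, unsigned int len,
                                     const u8 key[CHACHA_KEY_SIZE],
                                     const u8 iv[CHACHA_IV_SIZE])
{
        struct chacha_state state;
        u32 key_words[CHACHA_KEY_WORDS];
        int i;

        /* chacha_init() takes the key as eight little-endian 32-bit words */
        for (i = 0; i < CHACHA_KEY_WORDS; i++)
                key_words[i] = get_unaligned_le32(key + 4 * i);

        chacha_init(&state, key_words, iv);
        chacha20_crypt(&state, dst, src, len);

        /* don't leave key material behind in the expired state */
        chacha_zeroize_state(&state);
        memzero_explicit(key_words, sizeof(key_words));
}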
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/ext4/sysfs.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Theodore Ts'o (tytso@mit.edu)
 *
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/part_stat.h>

#include "ext4.h"
#include "ext4_jbd2.h"

typedef enum {
        attr_noop,
        attr_delayed_allocation_blocks,
        attr_session_write_kbytes,
        attr_lifetime_write_kbytes,
        attr_reserved_clusters,
        attr_sra_exceeded_retry_limit,
        attr_inode_readahead,
        attr_trigger_test_error,
        attr_first_error_time,
        attr_last_error_time,
        attr_clusters_in_group,
        attr_mb_order,
        attr_feature,
        attr_pointer_pi,
        attr_pointer_ui,
        attr_pointer_ul,
        attr_pointer_u64,
        attr_pointer_u8,
        attr_pointer_string,
        attr_pointer_atomic,
        attr_journal_task,
        attr_err_report_sec,
}
attr_id_t; typedef enum { ptr_explicit, ptr_ext4_sb_info_offset, ptr_ext4_super_block_offset, } attr_ptr_t; static const char proc_dirname[] = "fs/ext4"; static struct proc_dir_entry *ext4_proc_root; struct ext4_attr { struct attribute attr; short attr_id; short attr_ptr; unsigned short attr_size; union { int offset; void *explicit_ptr; } u; }; static ssize_t session_write_kbytes_show(struct ext4_sb_info *sbi, char *buf) { struct super_block *sb = sbi->s_buddy_cache->i_sb; return sysfs_emit(buf, "%lu\n", (part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - sbi->s_sectors_written_start) >> 1); } static ssize_t lifetime_write_kbytes_show(struct ext4_sb_info *sbi, char *buf) { struct super_block *sb = sbi->s_buddy_cache->i_sb; return sysfs_emit(buf, "%llu\n", (unsigned long long)(sbi->s_kbytes_written + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - EXT4_SB(sb)->s_sectors_written_start) >> 1))); } static ssize_t inode_readahead_blks_store(struct ext4_sb_info *sbi, const char *buf, size_t count) { unsigned long t; int ret; ret = kstrtoul(skip_spaces(buf), 0, &t); if (ret) return ret; if (t && (!is_power_of_2(t) || t > 0x40000000)) return -EINVAL; sbi->s_inode_readahead_blks = t; return count; } static ssize_t reserved_clusters_store(struct ext4_sb_info *sbi, const char *buf, size_t count) { unsigned long long val; ext4_fsblk_t clusters = (ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits); int ret; ret = kstrtoull(skip_spaces(buf), 0, &val); if (ret || val >= clusters || (s64)val < 0) return -EINVAL; atomic64_set(&sbi->s_resv_clusters, val); return count; } static ssize_t trigger_test_error(struct ext4_sb_info *sbi, const char *buf, size_t count) { int len = count; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (len && buf[len-1] == '\n') len--; if (len) ext4_error(sbi->s_sb, "%.*s", len, buf); return count; } static ssize_t err_report_sec_store(struct ext4_sb_info *sbi, const char *buf, size_t count) { unsigned long t; int ret; ret = kstrtoul(skip_spaces(buf), 0, &t); if (ret) return ret; /*the maximum time interval must not exceed one year.*/ if (t > (365*24*60*60)) return -EINVAL; if (sbi->s_err_report_sec == t) /*nothing to do*/ goto out; else if (!sbi->s_err_report_sec && t) { timer_setup(&sbi->s_err_report, print_daily_error_info, 0); } else if (sbi->s_err_report_sec && !t) { timer_delete_sync(&sbi->s_err_report); goto out; } sbi->s_err_report_sec = t; mod_timer(&sbi->s_err_report, jiffies + secs_to_jiffies(sbi->s_err_report_sec)); out: return count; } static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf) { if (!sbi->s_journal) return sysfs_emit(buf, "<none>\n"); return sysfs_emit(buf, "%d\n", task_pid_vnr(sbi->s_journal->j_task)); } #define EXT4_ATTR(_name,_mode,_id) \ static struct ext4_attr ext4_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ .attr_id = attr_##_id, \ } #define EXT4_ATTR_FUNC(_name,_mode) EXT4_ATTR(_name,_mode,_name) #define EXT4_ATTR_FEATURE(_name) EXT4_ATTR(_name, 0444, feature) #define EXT4_ATTR_OFFSET(_name,_mode,_id,_struct,_elname) \ static struct ext4_attr ext4_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ .attr_id = attr_##_id, \ .attr_ptr = ptr_##_struct##_offset, \ .u = { \ .offset = offsetof(struct _struct, _elname),\ }, \ } #define EXT4_ATTR_STRING(_name,_mode,_size,_struct,_elname) \ static struct ext4_attr ext4_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ .attr_id = attr_pointer_string, \ .attr_size = _size, \ .attr_ptr = ptr_##_struct##_offset, \ .u 
= { \ .offset = offsetof(struct _struct, _elname),\ }, \ } #define EXT4_RO_ATTR_ES_UI(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0444, pointer_ui, ext4_super_block, _elname) #define EXT4_RO_ATTR_ES_U8(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0444, pointer_u8, ext4_super_block, _elname) #define EXT4_RO_ATTR_ES_U64(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0444, pointer_u64, ext4_super_block, _elname) #define EXT4_RO_ATTR_ES_STRING(_name,_elname,_size) \ EXT4_ATTR_STRING(_name, 0444, _size, ext4_super_block, _elname) #define EXT4_RW_ATTR_SBI_PI(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0644, pointer_pi, ext4_sb_info, _elname) #define EXT4_RW_ATTR_SBI_UI(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname) #define EXT4_RW_ATTR_SBI_UL(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0644, pointer_ul, ext4_sb_info, _elname) #define EXT4_RO_ATTR_SBI_ATOMIC(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0444, pointer_atomic, ext4_sb_info, _elname) #define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \ static struct ext4_attr ext4_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ .attr_id = attr_##_id, \ .attr_ptr = ptr_explicit, \ .u = { \ .explicit_ptr = _ptr, \ }, \ } #define ATTR_LIST(name) &ext4_attr_##name.attr EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444); EXT4_ATTR_FUNC(session_write_kbytes, 0444); EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444); EXT4_ATTR_FUNC(reserved_clusters, 0644); EXT4_ATTR_FUNC(sra_exceeded_retry_limit, 0444); EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead, ext4_sb_info, s_inode_readahead_blks); EXT4_ATTR_OFFSET(mb_group_prealloc, 0644, clusters_in_group, ext4_sb_info, s_mb_group_prealloc); EXT4_ATTR_OFFSET(mb_best_avail_max_trim_order, 0644, mb_order, ext4_sb_info, s_mb_best_avail_max_trim_order); EXT4_ATTR_OFFSET(err_report_sec, 0644, err_report_sec, ext4_sb_info, s_err_report_sec); EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups); EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); EXT4_RW_ATTR_SBI_PI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); EXT4_RW_ATTR_SBI_PI(err_ratelimit_burst, s_err_ratelimit_state.burst); EXT4_RW_ATTR_SBI_PI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval); EXT4_RW_ATTR_SBI_PI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); EXT4_RW_ATTR_SBI_PI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); EXT4_RW_ATTR_SBI_PI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); #ifdef CONFIG_EXT4_DEBUG EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail); #endif EXT4_RO_ATTR_SBI_ATOMIC(warning_count, s_warning_count); EXT4_RO_ATTR_SBI_ATOMIC(msg_count, s_msg_count); EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); EXT4_RO_ATTR_ES_U8(first_error_errcode, s_first_error_errcode); EXT4_RO_ATTR_ES_U8(last_error_errcode, s_last_error_errcode); EXT4_RO_ATTR_ES_UI(first_error_ino, s_first_error_ino); EXT4_RO_ATTR_ES_UI(last_error_ino, s_last_error_ino); EXT4_RO_ATTR_ES_U64(first_error_block, s_first_error_block); EXT4_RO_ATTR_ES_U64(last_error_block, s_last_error_block); EXT4_RO_ATTR_ES_UI(first_error_line, s_first_error_line); EXT4_RO_ATTR_ES_UI(last_error_line, 
s_last_error_line); EXT4_RO_ATTR_ES_STRING(first_error_func, s_first_error_func, 32); EXT4_RO_ATTR_ES_STRING(last_error_func, s_last_error_func, 32); EXT4_ATTR(first_error_time, 0444, first_error_time); EXT4_ATTR(last_error_time, 0444, last_error_time); EXT4_ATTR(journal_task, 0444, journal_task); EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch); EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit); EXT4_RW_ATTR_SBI_UL(last_trim_minblks, s_last_trim_minblks); EXT4_RW_ATTR_SBI_UI(sb_update_sec, s_sb_update_sec); EXT4_RW_ATTR_SBI_UI(sb_update_kb, s_sb_update_kb); static unsigned int old_bump_val = 128; EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val); static struct attribute *ext4_attrs[] = { ATTR_LIST(delayed_allocation_blocks), ATTR_LIST(session_write_kbytes), ATTR_LIST(lifetime_write_kbytes), ATTR_LIST(reserved_clusters), ATTR_LIST(sra_exceeded_retry_limit), ATTR_LIST(inode_readahead_blks), ATTR_LIST(inode_goal), ATTR_LIST(mb_stats), ATTR_LIST(mb_max_to_scan), ATTR_LIST(mb_min_to_scan), ATTR_LIST(mb_order2_req), ATTR_LIST(mb_stream_req), ATTR_LIST(mb_group_prealloc), ATTR_LIST(mb_max_linear_groups), ATTR_LIST(max_writeback_mb_bump), ATTR_LIST(extent_max_zeroout_kb), ATTR_LIST(trigger_fs_error), ATTR_LIST(err_ratelimit_interval_ms), ATTR_LIST(err_ratelimit_burst), ATTR_LIST(warning_ratelimit_interval_ms), ATTR_LIST(warning_ratelimit_burst), ATTR_LIST(msg_ratelimit_interval_ms), ATTR_LIST(msg_ratelimit_burst), ATTR_LIST(mb_best_avail_max_trim_order), ATTR_LIST(errors_count), ATTR_LIST(warning_count), ATTR_LIST(msg_count), ATTR_LIST(first_error_ino), ATTR_LIST(last_error_ino), ATTR_LIST(first_error_block), ATTR_LIST(last_error_block), ATTR_LIST(first_error_line), ATTR_LIST(last_error_line), ATTR_LIST(first_error_func), ATTR_LIST(last_error_func), ATTR_LIST(first_error_errcode), ATTR_LIST(last_error_errcode), ATTR_LIST(first_error_time), ATTR_LIST(last_error_time), ATTR_LIST(journal_task), #ifdef CONFIG_EXT4_DEBUG ATTR_LIST(simulate_fail), #endif ATTR_LIST(mb_prefetch), ATTR_LIST(mb_prefetch_limit), ATTR_LIST(last_trim_minblks), ATTR_LIST(sb_update_sec), ATTR_LIST(sb_update_kb), ATTR_LIST(err_report_sec), NULL, }; ATTRIBUTE_GROUPS(ext4); /* Features this copy of ext4 supports */ EXT4_ATTR_FEATURE(lazy_itable_init); EXT4_ATTR_FEATURE(batched_discard); EXT4_ATTR_FEATURE(meta_bg_resize); #ifdef CONFIG_FS_ENCRYPTION EXT4_ATTR_FEATURE(encryption); EXT4_ATTR_FEATURE(test_dummy_encryption_v2); #endif #if IS_ENABLED(CONFIG_UNICODE) EXT4_ATTR_FEATURE(casefold); #endif #ifdef CONFIG_FS_VERITY EXT4_ATTR_FEATURE(verity); #endif EXT4_ATTR_FEATURE(metadata_csum_seed); EXT4_ATTR_FEATURE(fast_commit); #if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) EXT4_ATTR_FEATURE(encrypted_casefold); #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE EXT4_ATTR_FEATURE(blocksize_gt_pagesize); #endif static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(lazy_itable_init), ATTR_LIST(batched_discard), ATTR_LIST(meta_bg_resize), #ifdef CONFIG_FS_ENCRYPTION ATTR_LIST(encryption), ATTR_LIST(test_dummy_encryption_v2), #endif #if IS_ENABLED(CONFIG_UNICODE) ATTR_LIST(casefold), #endif #ifdef CONFIG_FS_VERITY ATTR_LIST(verity), #endif ATTR_LIST(metadata_csum_seed), ATTR_LIST(fast_commit), #if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) ATTR_LIST(encrypted_casefold), #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE ATTR_LIST(blocksize_gt_pagesize), #endif NULL, }; ATTRIBUTE_GROUPS(ext4_feat); static void *calc_ptr(struct ext4_attr *a, struct ext4_sb_info *sbi) { switch (a->attr_ptr) { 
case ptr_explicit: return a->u.explicit_ptr; case ptr_ext4_sb_info_offset: return (void *) (((char *) sbi) + a->u.offset); case ptr_ext4_super_block_offset: return (void *) (((char *) sbi->s_es) + a->u.offset); } return NULL; } static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi) { return sysfs_emit(buf, "%lld\n", ((time64_t)hi << 32) + le32_to_cpu(lo)); } #define print_tstamp(buf, es, tstamp) \ __print_tstamp(buf, (es)->tstamp, (es)->tstamp ## _hi) static ssize_t ext4_generic_attr_show(struct ext4_attr *a, struct ext4_sb_info *sbi, char *buf) { void *ptr = calc_ptr(a, sbi); if (!ptr) return 0; switch (a->attr_id) { case attr_inode_readahead: case attr_clusters_in_group: case attr_mb_order: case attr_pointer_pi: case attr_pointer_ui: if (a->attr_ptr == ptr_ext4_super_block_offset) return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr)); return sysfs_emit(buf, "%u\n", *((unsigned int *) ptr)); case attr_pointer_ul: case attr_err_report_sec: return sysfs_emit(buf, "%lu\n", *((unsigned long *) ptr)); case attr_pointer_u8: return sysfs_emit(buf, "%u\n", *((unsigned char *) ptr)); case attr_pointer_u64: if (a->attr_ptr == ptr_ext4_super_block_offset) return sysfs_emit(buf, "%llu\n", le64_to_cpup(ptr)); return sysfs_emit(buf, "%llu\n", *((unsigned long long *) ptr)); case attr_pointer_string: return sysfs_emit(buf, "%.*s\n", a->attr_size, (char *) ptr); case attr_pointer_atomic: return sysfs_emit(buf, "%d\n", atomic_read((atomic_t *) ptr)); } return 0; } static ssize_t ext4_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, s_kobj); struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); switch (a->attr_id) { case attr_delayed_allocation_blocks: return sysfs_emit(buf, "%llu\n", (s64) EXT4_C2B(sbi, percpu_counter_sum(&sbi->s_dirtyclusters_counter))); case attr_session_write_kbytes: return session_write_kbytes_show(sbi, buf); case attr_lifetime_write_kbytes: return lifetime_write_kbytes_show(sbi, buf); case attr_reserved_clusters: return sysfs_emit(buf, "%llu\n", (unsigned long long) atomic64_read(&sbi->s_resv_clusters)); case attr_sra_exceeded_retry_limit: return sysfs_emit(buf, "%llu\n", (unsigned long long) percpu_counter_sum(&sbi->s_sra_exceeded_retry_limit)); case attr_feature: return sysfs_emit(buf, "supported\n"); case attr_first_error_time: return print_tstamp(buf, sbi->s_es, s_first_error_time); case attr_last_error_time: return print_tstamp(buf, sbi->s_es, s_last_error_time); case attr_journal_task: return journal_task_show(sbi, buf); default: return ext4_generic_attr_show(a, sbi, buf); } } static ssize_t ext4_generic_attr_store(struct ext4_attr *a, struct ext4_sb_info *sbi, const char *buf, size_t len) { int ret; unsigned int t; unsigned long lt; void *ptr = calc_ptr(a, sbi); if (!ptr) return 0; switch (a->attr_id) { case attr_pointer_pi: ret = kstrtouint(skip_spaces(buf), 0, &t); if (ret) return ret; if ((int)t < 0) return -EINVAL; *((unsigned int *) ptr) = t; return len; case attr_pointer_ui: ret = kstrtouint(skip_spaces(buf), 0, &t); if (ret) return ret; if (a->attr_ptr == ptr_ext4_super_block_offset) *((__le32 *) ptr) = cpu_to_le32(t); else *((unsigned int *) ptr) = t; return len; case attr_mb_order: ret = kstrtouint(skip_spaces(buf), 0, &t); if (ret) return ret; if (t > 64) return -EINVAL; *((unsigned int *) ptr) = t; return len; case attr_clusters_in_group: ret = kstrtouint(skip_spaces(buf), 0, &t); if (ret) return ret; if (t > sbi->s_clusters_per_group) return -EINVAL; *((unsigned int 
*) ptr) = t; return len; case attr_pointer_ul: ret = kstrtoul(skip_spaces(buf), 0, &lt); if (ret) return ret; *((unsigned long *) ptr) = lt; return len; } return 0; } static ssize_t ext4_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t len) { struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, s_kobj); struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); switch (a->attr_id) { case attr_reserved_clusters: return reserved_clusters_store(sbi, buf, len); case attr_inode_readahead: return inode_readahead_blks_store(sbi, buf, len); case attr_trigger_test_error: return trigger_test_error(sbi, buf, len); case attr_err_report_sec: return err_report_sec_store(sbi, buf, len); default: return ext4_generic_attr_store(a, sbi, buf, len); } } static void ext4_sb_release(struct kobject *kobj) { struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, s_kobj); complete(&sbi->s_kobj_unregister); } static void ext4_feat_release(struct kobject *kobj) { kfree(kobj); } static const struct sysfs_ops ext4_attr_ops = { .show = ext4_attr_show, .store = ext4_attr_store, }; static const struct kobj_type ext4_sb_ktype = { .default_groups = ext4_groups, .sysfs_ops = &ext4_attr_ops, .release = ext4_sb_release, }; static const struct kobj_type ext4_feat_ktype = { .default_groups = ext4_feat_groups, .sysfs_ops = &ext4_attr_ops, .release = ext4_feat_release, }; void ext4_notify_error_sysfs(struct ext4_sb_info *sbi) { mutex_lock(&sbi->s_error_notify_mutex); if (sbi->s_kobj.state_in_sysfs) sysfs_notify(&sbi->s_kobj, NULL, "errors_count"); mutex_unlock(&sbi->s_error_notify_mutex); } static struct kobject *ext4_root; static struct kobject *ext4_feat; int ext4_register_sysfs(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); int err; init_completion(&sbi->s_kobj_unregister); mutex_lock(&sbi->s_error_notify_mutex); err = kobject_init_and_add(&sbi->s_kobj, &ext4_sb_ktype, ext4_root, "%s", sb->s_id); mutex_unlock(&sbi->s_error_notify_mutex); if (err) { kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); return err; } if (ext4_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); if (sbi->s_proc) { proc_create_single_data("options", S_IRUGO, sbi->s_proc, ext4_seq_options_show, sb); proc_create_single_data("es_shrinker_info", S_IRUGO, sbi->s_proc, ext4_seq_es_shrinker_info_show, sb); proc_create_single_data("fc_info", 0444, sbi->s_proc, ext4_fc_info_show, sb); proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc, &ext4_mb_seq_groups_ops, sb); proc_create_single_data("mb_stats", 0444, sbi->s_proc, ext4_seq_mb_stats_show, sb); proc_create_seq_data("mb_structs_summary", 0444, sbi->s_proc, &ext4_mb_seq_structs_summary_ops, sb); } return 0; } void ext4_unregister_sysfs(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (sbi->s_proc) remove_proc_subtree(sb->s_id, ext4_proc_root); mutex_lock(&sbi->s_error_notify_mutex); kobject_del(&sbi->s_kobj); mutex_unlock(&sbi->s_error_notify_mutex); } int __init ext4_init_sysfs(void) { int ret; ext4_root = kobject_create_and_add("ext4", fs_kobj); if (!ext4_root) return -ENOMEM; ext4_feat = kzalloc_obj(*ext4_feat); if (!ext4_feat) { ret = -ENOMEM; goto root_err; } ret = kobject_init_and_add(ext4_feat, &ext4_feat_ktype, ext4_root, "features"); if (ret) goto feat_err; ext4_proc_root = proc_mkdir(proc_dirname, NULL); return ret; feat_err: kobject_put(ext4_feat); ext4_feat = NULL; root_err: kobject_put(ext4_root); ext4_root = NULL; return ret; } void ext4_exit_sysfs(void) {
kobject_put(ext4_feat); ext4_feat = NULL; kobject_put(ext4_root); ext4_root = NULL; remove_proc_entry(proc_dirname, NULL); ext4_proc_root = NULL; }
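/*
 * Illustrative sketch (not part of sysfs.c above): roughly what
 * EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal) expands to, given the
 * EXT4_ATTR_OFFSET() definition earlier in the file:
 *
 *	static struct ext4_attr ext4_attr_inode_goal = {
 *		.attr	  = { .name = "inode_goal", .mode = 0644 },
 *		.attr_id  = attr_pointer_ui,
 *		.attr_ptr = ptr_ext4_sb_info_offset,
 *		.u = {
 *			.offset = offsetof(struct ext4_sb_info, s_inode_goal),
 *		},
 *	};
 *
 * ext4_attr_show()/ext4_attr_store() then resolve the value's address via
 * calc_ptr(), i.e. (char *)sbi + u.offset, and format or parse it according
 * to attr_id.
 */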
/* SPDX-License-Identifier: GPL-2.0 */ /* * Linux Socket Filter Data Structures */ #ifndef __LINUX_FILTER_H__ #define __LINUX_FILTER_H__ #include <linux/atomic.h> #include <linux/bpf.h> #include <linux/refcount.h> #include <linux/compat.h> #include <linux/skbuff.h> #include <linux/linkage.h> #include <linux/printk.h> #include <linux/workqueue.h> #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/capability.h> #include <linux/set_memory.h> #include <linux/kallsyms.h> #include <linux/if_vlan.h> #include <linux/vmalloc.h> #include <linux/sockptr.h> #include <linux/u64_stats_sync.h> #include <net/sch_generic.h> #include <asm/byteorder.h> #include <uapi/linux/filter.h> struct sk_buff; struct sock; struct seccomp_data; struct bpf_prog_aux; struct xdp_rxq_info; struct xdp_buff; struct sock_reuseport; struct ctl_table; struct ctl_table_header; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function * calls in BPF_CALL instruction. */ #define BPF_REG_ARG1 BPF_REG_1 #define BPF_REG_ARG2 BPF_REG_2 #define BPF_REG_ARG3 BPF_REG_3 #define BPF_REG_ARG4 BPF_REG_4 #define BPF_REG_ARG5 BPF_REG_5 #define BPF_REG_CTX BPF_REG_6 #define BPF_REG_FP BPF_REG_10 /* Additional register mappings for converted user programs. */ #define BPF_REG_A BPF_REG_0 #define BPF_REG_X BPF_REG_7 #define BPF_REG_TMP BPF_REG_2 /* scratch reg */ #define BPF_REG_D BPF_REG_8 /* data, callee-saved */ #define BPF_REG_H BPF_REG_9 /* hlen, callee-saved */ /* Kernel hidden auxiliary/helper register. */ #define BPF_REG_AX MAX_BPF_REG #define MAX_BPF_EXT_REG (MAX_BPF_REG + 1) #define MAX_BPF_JIT_REG MAX_BPF_EXT_REG /* unused opcode to mark special call to bpf_tail_call() helper */ #define BPF_TAIL_CALL 0xf0 /* unused opcode to mark special load instruction. Same as BPF_ABS */ #define BPF_PROBE_MEM 0x20 /* unused opcode to mark special ldsx instruction. Same as BPF_IND */ #define BPF_PROBE_MEMSX 0x40 /* unused opcode to mark special load instruction. Same as BPF_MSH */ #define BPF_PROBE_MEM32 0xa0 /* unused opcode to mark special atomic instruction */ #define BPF_PROBE_ATOMIC 0xe0 /* unused opcode to mark special ldsx instruction. 
Same as BPF_NOSPEC */ #define BPF_PROBE_MEM32SX 0xc0 /* unused opcode to mark call to interpreter with arguments */ #define BPF_CALL_ARGS 0xe0 /* unused opcode to mark speculation barrier for mitigating * Spectre v1 and v4 */ #define BPF_NOSPEC 0xc0 /* As per nm, we expose JITed images as text (code) section for * kallsyms. That way, tools like perf can find it to match * addresses. */ #define BPF_SYM_ELF_TYPE 't' /* BPF program can access up to 512 bytes of stack space. */ #define MAX_BPF_STACK 512 /* Helper macros for filter block array initializers. */ /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ #define BPF_ALU64_REG_OFF(OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) #define BPF_ALU64_REG(OP, DST, SRC) \ BPF_ALU64_REG_OFF(OP, DST, SRC, 0) #define BPF_ALU32_REG_OFF(OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_OP(OP) | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) #define BPF_ALU32_REG(OP, DST, SRC) \ BPF_ALU32_REG_OFF(OP, DST, SRC, 0) /* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */ #define BPF_ALU64_IMM_OFF(OP, DST, IMM, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) #define BPF_ALU64_IMM(OP, DST, IMM) \ BPF_ALU64_IMM_OFF(OP, DST, IMM, 0) #define BPF_ALU32_IMM_OFF(OP, DST, IMM, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) #define BPF_ALU32_IMM(OP, DST, IMM) \ BPF_ALU32_IMM_OFF(OP, DST, IMM, 0) /* Endianess conversion, cpu_to_{l,b}e(), {l,b}e_to_cpu() */ #define BPF_ENDIAN(TYPE, DST, LEN) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_END | BPF_SRC(TYPE), \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = LEN }) /* Byte Swap, bswap16/32/64 */ #define BPF_BSWAP(DST, LEN) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_END | BPF_SRC(BPF_TO_LE), \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = LEN }) /* Short form of mov, dst_reg = src_reg */ #define BPF_MOV64_REG(DST, SRC) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = 0 }) #define BPF_MOV32_REG(DST, SRC) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = 0 }) /* Special (internal-only) form of mov, used to resolve per-CPU addrs: * dst_reg = src_reg + <percpu_base_off> * BPF_ADDR_PERCPU is used as a special insn->off value. 
*/ #define BPF_ADDR_PERCPU (-1) #define BPF_MOV64_PERCPU_REG(DST, SRC) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = BPF_ADDR_PERCPU, \ .imm = 0 }) static inline bool insn_is_mov_percpu_addr(const struct bpf_insn *insn) { return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_PERCPU; } /* Short form of mov, dst_reg = imm32 */ #define BPF_MOV64_IMM(DST, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = IMM }) #define BPF_MOV32_IMM(DST, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = IMM }) /* Short form of movsx, dst_reg = (s8,s16,s32)src_reg */ #define BPF_MOVSX64_REG(DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) #define BPF_MOVSX32_REG(DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Special form of mov32, used for doing explicit zero extension on dst. */ #define BPF_ZEXT_REG(DST) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = DST, \ .off = 0, \ .imm = 1 }) static inline bool insn_is_zext(const struct bpf_insn *insn) { return insn->code == (BPF_ALU | BPF_MOV | BPF_X) && insn->imm == 1; } /* addr_space_cast from as(0) to as(1) is for converting bpf arena pointers * to pointers in user vma. */ static inline bool insn_is_cast_user(const struct bpf_insn *insn) { return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_SPACE_CAST && insn->imm == 1U << 16; } /* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ #define BPF_LD_IMM64(DST, IMM) \ BPF_LD_IMM64_RAW(DST, 0, IMM) #define BPF_LD_IMM64_RAW(DST, SRC, IMM) \ ((struct bpf_insn) { \ .code = BPF_LD | BPF_DW | BPF_IMM, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = (__u32) (IMM) }), \ ((struct bpf_insn) { \ .code = 0, /* zero is reserved opcode */ \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = ((__u64) (IMM)) >> 32 }) /* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */ #define BPF_LD_MAP_FD(DST, MAP_FD) \ BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD) /* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32 */ #define BPF_MOV64_RAW(TYPE, DST, SRC, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_SRC(TYPE), \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = IMM }) #define BPF_MOV32_RAW(TYPE, DST, SRC, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_SRC(TYPE), \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = IMM }) /* Direct packet access, R0 = *(uint *) (skb->data + imm32) */ #define BPF_LD_ABS(SIZE, IMM) \ ((struct bpf_insn) { \ .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = IMM }) /* Indirect packet access, R0 = *(uint *) (skb->data + src_reg + imm32) */ #define BPF_LD_IND(SIZE, SRC, IMM) \ ((struct bpf_insn) { \ .code = BPF_LD | BPF_SIZE(SIZE) | BPF_IND, \ .dst_reg = 0, \ .src_reg = SRC, \ .off = 0, \ .imm = IMM }) /* Memory load, dst_reg = *(uint *) (src_reg + off16) */ #define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Memory load, dst_reg = *(signed size *) (src_reg + off16) */ 
#define BPF_LDX_MEMSX(SIZE, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEMSX, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Memory store, *(uint *) (dst_reg + off16) = src_reg */ #define BPF_STX_MEM(SIZE, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* * Atomic operations: * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg * BPF_AND *(uint *) (dst_reg + off16) &= src_reg * BPF_OR *(uint *) (dst_reg + off16) |= src_reg * BPF_XOR *(uint *) (dst_reg + off16) ^= src_reg * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); * BPF_AND | BPF_FETCH src_reg = atomic_fetch_and(dst_reg + off16, src_reg); * BPF_OR | BPF_FETCH src_reg = atomic_fetch_or(dst_reg + off16, src_reg); * BPF_XOR | BPF_FETCH src_reg = atomic_fetch_xor(dst_reg + off16, src_reg); * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) * BPF_LOAD_ACQ dst_reg = smp_load_acquire(src_reg + off16) * BPF_STORE_REL smp_store_release(dst_reg + off16, src_reg) */ #define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = OP }) /* Legacy alias */ #define BPF_STX_XADD(SIZE, DST, SRC, OFF) BPF_ATOMIC_OP(SIZE, BPF_ADD, DST, SRC, OFF) /* Memory store, *(uint *) (dst_reg + off16) = imm32 */ #define BPF_ST_MEM(SIZE, DST, OFF, IMM) \ ((struct bpf_insn) { \ .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) /* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */ #define BPF_JMP_REG(OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_OP(OP) | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ #define BPF_JMP_IMM(OP, DST, IMM, OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) /* Like BPF_JMP_REG, but with 32-bit wide operands for comparison. */ #define BPF_JMP32_REG(OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP32 | BPF_OP(OP) | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Like BPF_JMP_IMM, but with 32-bit wide operands for comparison. 
*/ #define BPF_JMP32_IMM(OP, DST, IMM, OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP32 | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) /* Unconditional jumps, goto pc + off16 */ #define BPF_JMP_A(OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_JA, \ .dst_reg = 0, \ .src_reg = 0, \ .off = OFF, \ .imm = 0 }) /* Unconditional jumps, gotol pc + imm32 */ #define BPF_JMP32_A(IMM) \ ((struct bpf_insn) { \ .code = BPF_JMP32 | BPF_JA, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = IMM }) /* Relative call */ #define BPF_CALL_REL(TGT) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_CALL, \ .dst_reg = 0, \ .src_reg = BPF_PSEUDO_CALL, \ .off = 0, \ .imm = TGT }) /* Convert function address to BPF immediate */ #define BPF_CALL_IMM(x) ((void *)(x) - (void *)__bpf_call_base) #define BPF_EMIT_CALL(FUNC) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_CALL, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = BPF_CALL_IMM(FUNC) }) /* Kfunc call */ #define BPF_CALL_KFUNC(OFF, IMM) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_CALL, \ .dst_reg = 0, \ .src_reg = BPF_PSEUDO_KFUNC_CALL, \ .off = OFF, \ .imm = IMM }) /* Raw code statement block */ #define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \ ((struct bpf_insn) { \ .code = CODE, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = IMM }) /* Program exit */ #define BPF_EXIT_INSN() \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_EXIT, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = 0 }) /* Speculation barrier */ #define BPF_ST_NOSPEC() \ ((struct bpf_insn) { \ .code = BPF_ST | BPF_NOSPEC, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = 0 }) /* Internal classic blocks for direct assignment */ #define __BPF_STMT(CODE, K) \ ((struct sock_filter) BPF_STMT(CODE, K)) #define __BPF_JUMP(CODE, K, JT, JF) \ ((struct sock_filter) BPF_JUMP(CODE, K, JT, JF)) #define bytes_to_bpf_size(bytes) \ ({ \ int bpf_size = -EINVAL; \ \ if (bytes == sizeof(u8)) \ bpf_size = BPF_B; \ else if (bytes == sizeof(u16)) \ bpf_size = BPF_H; \ else if (bytes == sizeof(u32)) \ bpf_size = BPF_W; \ else if (bytes == sizeof(u64)) \ bpf_size = BPF_DW; \ \ bpf_size; \ }) #define bpf_size_to_bytes(bpf_size) \ ({ \ int bytes = -EINVAL; \ \ if (bpf_size == BPF_B) \ bytes = sizeof(u8); \ else if (bpf_size == BPF_H) \ bytes = sizeof(u16); \ else if (bpf_size == BPF_W) \ bytes = sizeof(u32); \ else if (bpf_size == BPF_DW) \ bytes = sizeof(u64); \ \ bytes; \ }) #define BPF_SIZEOF(type) \ ({ \ const int __size = bytes_to_bpf_size(sizeof(type)); \ BUILD_BUG_ON(__size < 0); \ __size; \ }) #define BPF_FIELD_SIZEOF(type, field) \ ({ \ const int __size = bytes_to_bpf_size(sizeof_field(type, field)); \ BUILD_BUG_ON(__size < 0); \ __size; \ }) #define BPF_LDST_BYTES(insn) \ ({ \ const int __size = bpf_size_to_bytes(BPF_SIZE((insn)->code)); \ WARN_ON(__size < 0); \ __size; \ }) #define __BPF_MAP_0(m, v, ...) v #define __BPF_MAP_1(m, v, t, a, ...) m(t, a) #define __BPF_MAP_2(m, v, t, a, ...) m(t, a), __BPF_MAP_1(m, v, __VA_ARGS__) #define __BPF_MAP_3(m, v, t, a, ...) m(t, a), __BPF_MAP_2(m, v, __VA_ARGS__) #define __BPF_MAP_4(m, v, t, a, ...) m(t, a), __BPF_MAP_3(m, v, __VA_ARGS__) #define __BPF_MAP_5(m, v, t, a, ...) m(t, a), __BPF_MAP_4(m, v, __VA_ARGS__) #define __BPF_REG_0(...) __BPF_PAD(5) #define __BPF_REG_1(...) __BPF_MAP(1, __VA_ARGS__), __BPF_PAD(4) #define __BPF_REG_2(...) __BPF_MAP(2, __VA_ARGS__), __BPF_PAD(3) #define __BPF_REG_3(...) __BPF_MAP(3, __VA_ARGS__), __BPF_PAD(2) #define __BPF_REG_4(...) 
__BPF_MAP(4, __VA_ARGS__), __BPF_PAD(1) #define __BPF_REG_5(...) __BPF_MAP(5, __VA_ARGS__) #define __BPF_MAP(n, ...) __BPF_MAP_##n(__VA_ARGS__) #define __BPF_REG(n, ...) __BPF_REG_##n(__VA_ARGS__) #define __BPF_CAST(t, a) \ (__force t) \ (__force \ typeof(__builtin_choose_expr(sizeof(t) == sizeof(unsigned long), \ (unsigned long)0, (t)0))) a #define __BPF_V void #define __BPF_N #define __BPF_DECL_ARGS(t, a) t a #define __BPF_DECL_REGS(t, a) u64 a #define __BPF_PAD(n) \ __BPF_MAP(n, __BPF_DECL_ARGS, __BPF_N, u64, __ur_1, u64, __ur_2, \ u64, __ur_3, u64, __ur_4, u64, __ur_5) #define BPF_CALL_x(x, attr, name, ...) \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ typedef u64 (*btf_##name)(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ attr u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)); \ attr u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)) \ { \ return ((btf_##name)____##name)(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ } \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)) #define __NOATTR #define BPF_CALL_0(name, ...) BPF_CALL_x(0, __NOATTR, name, __VA_ARGS__) #define BPF_CALL_1(name, ...) BPF_CALL_x(1, __NOATTR, name, __VA_ARGS__) #define BPF_CALL_2(name, ...) BPF_CALL_x(2, __NOATTR, name, __VA_ARGS__) #define BPF_CALL_3(name, ...) BPF_CALL_x(3, __NOATTR, name, __VA_ARGS__) #define BPF_CALL_4(name, ...) BPF_CALL_x(4, __NOATTR, name, __VA_ARGS__) #define BPF_CALL_5(name, ...) BPF_CALL_x(5, __NOATTR, name, __VA_ARGS__) #define NOTRACE_BPF_CALL_1(name, ...) BPF_CALL_x(1, notrace, name, __VA_ARGS__) #define bpf_ctx_range(TYPE, MEMBER) \ offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1 #define bpf_ctx_range_till(TYPE, MEMBER1, MEMBER2) \ offsetof(TYPE, MEMBER1) ... offsetofend(TYPE, MEMBER2) - 1 #if BITS_PER_LONG == 64 # define bpf_ctx_range_ptr(TYPE, MEMBER) \ offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1 #else # define bpf_ctx_range_ptr(TYPE, MEMBER) \ offsetof(TYPE, MEMBER) ... offsetof(TYPE, MEMBER) + 8 - 1 #endif /* BITS_PER_LONG == 64 */ #define bpf_target_off(TYPE, MEMBER, SIZE, PTR_SIZE) \ ({ \ BUILD_BUG_ON(sizeof_field(TYPE, MEMBER) != (SIZE)); \ *(PTR_SIZE) = (SIZE); \ offsetof(TYPE, MEMBER); \ }) /* A struct sock_filter is architecture independent. 
*/ struct compat_sock_fprog { u16 len; compat_uptr_t filter; /* struct sock_filter * */ }; struct sock_fprog_kern { u16 len; struct sock_filter *filter; }; /* Some arches need doubleword alignment for their instructions and/or data */ #define BPF_IMAGE_ALIGNMENT 8 struct bpf_binary_header { u32 size; u8 image[] __aligned(BPF_IMAGE_ALIGNMENT); }; struct bpf_prog_stats { u64_stats_t cnt; u64_stats_t nsecs; u64_stats_t misses; struct u64_stats_sync syncp; } __aligned(2 * sizeof(u64)); struct bpf_timed_may_goto { u64 count; u64 timestamp; }; struct sk_filter { refcount_t refcnt; struct rcu_head rcu; struct bpf_prog *prog; }; DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); extern struct mutex nf_conn_btf_access_lock; extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size); typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx, const struct bpf_insn *insnsi, unsigned int (*bpf_func)(const void *, const struct bpf_insn *)); static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog, const void *ctx, bpf_dispatcher_fn dfunc) { u32 ret; cant_migrate(); if (static_branch_unlikely(&bpf_stats_enabled_key)) { struct bpf_prog_stats *stats; u64 duration, start = sched_clock(); unsigned long flags; ret = dfunc(ctx, prog->insnsi, prog->bpf_func); duration = sched_clock() - start; if (likely(prog->stats)) { stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->cnt); u64_stats_add(&stats->nsecs, duration); u64_stats_update_end_irqrestore(&stats->syncp, flags); } } else { ret = dfunc(ctx, prog->insnsi, prog->bpf_func); } return ret; } static __always_inline u32 bpf_prog_run(const struct bpf_prog *prog, const void *ctx) { return __bpf_prog_run(prog, ctx, bpf_dispatcher_nop_func); } /* * Use in preemptible and therefore migratable context to make sure that * the execution of the BPF program runs on one CPU. * * This uses migrate_disable/enable() explicitly to document that the * invocation of a BPF program does not require reentrancy protection * against a BPF program which is invoked from a preempting task. 
*/ static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog, const void *ctx) { u32 ret; migrate_disable(); ret = bpf_prog_run(prog, ctx); migrate_enable(); return ret; } #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN struct bpf_skb_data_end { struct qdisc_skb_cb qdisc_cb; void *data_meta; void *data_end; }; struct bpf_nh_params { u32 nh_family; union { u32 ipv4_nh; struct in6_addr ipv6_nh; }; }; /* flags for bpf_redirect_info kern_flags */ #define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */ #define BPF_RI_F_RI_INIT BIT(1) #define BPF_RI_F_CPU_MAP_INIT BIT(2) #define BPF_RI_F_DEV_MAP_INIT BIT(3) #define BPF_RI_F_XSK_MAP_INIT BIT(4) struct bpf_redirect_info { u64 tgt_index; void *tgt_value; struct bpf_map *map; u32 flags; u32 map_id; enum bpf_map_type map_type; struct bpf_nh_params nh; u32 kern_flags; }; struct bpf_net_context { struct bpf_redirect_info ri; struct list_head cpu_map_flush_list; struct list_head dev_map_flush_list; struct list_head xskmap_map_flush_list; }; static inline struct bpf_net_context *bpf_net_ctx_set(struct bpf_net_context *bpf_net_ctx) { struct task_struct *tsk = current; if (tsk->bpf_net_context != NULL) return NULL; bpf_net_ctx->ri.kern_flags = 0; tsk->bpf_net_context = bpf_net_ctx; return bpf_net_ctx; } static inline void bpf_net_ctx_clear(struct bpf_net_context *bpf_net_ctx) { if (bpf_net_ctx) current->bpf_net_context = NULL; } static inline struct bpf_net_context *bpf_net_ctx_get(void) { return current->bpf_net_context; } static inline struct bpf_redirect_info *bpf_net_ctx_get_ri(void) { struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_RI_INIT)) { memset(&bpf_net_ctx->ri, 0, offsetof(struct bpf_net_context, ri.nh)); bpf_net_ctx->ri.kern_flags |= BPF_RI_F_RI_INIT; } return &bpf_net_ctx->ri; } static inline struct list_head *bpf_net_ctx_get_cpu_map_flush_list(void) { struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_CPU_MAP_INIT)) { INIT_LIST_HEAD(&bpf_net_ctx->cpu_map_flush_list); bpf_net_ctx->ri.kern_flags |= BPF_RI_F_CPU_MAP_INIT; } return &bpf_net_ctx->cpu_map_flush_list; } static inline struct list_head *bpf_net_ctx_get_dev_flush_list(void) { struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_DEV_MAP_INIT)) { INIT_LIST_HEAD(&bpf_net_ctx->dev_map_flush_list); bpf_net_ctx->ri.kern_flags |= BPF_RI_F_DEV_MAP_INIT; } return &bpf_net_ctx->dev_map_flush_list; } static inline struct list_head *bpf_net_ctx_get_xskmap_flush_list(void) { struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_XSK_MAP_INIT)) { INIT_LIST_HEAD(&bpf_net_ctx->xskmap_map_flush_list); bpf_net_ctx->ri.kern_flags |= BPF_RI_F_XSK_MAP_INIT; } return &bpf_net_ctx->xskmap_map_flush_list; } static inline void bpf_net_ctx_get_all_used_flush_lists(struct list_head **lh_map, struct list_head **lh_dev, struct list_head **lh_xsk) { struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); u32 kern_flags = bpf_net_ctx->ri.kern_flags; struct list_head *lh; *lh_map = *lh_dev = *lh_xsk = NULL; if (!IS_ENABLED(CONFIG_BPF_SYSCALL)) return; lh = &bpf_net_ctx->dev_map_flush_list; if (kern_flags & BPF_RI_F_DEV_MAP_INIT && !list_empty(lh)) *lh_dev = lh; lh = &bpf_net_ctx->cpu_map_flush_list; if (kern_flags & BPF_RI_F_CPU_MAP_INIT && !list_empty(lh)) *lh_map = lh; lh = &bpf_net_ctx->xskmap_map_flush_list; if (IS_ENABLED(CONFIG_XDP_SOCKETS) && kern_flags & BPF_RI_F_XSK_MAP_INIT && 
!list_empty(lh)) *lh_xsk = lh; } /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). Subsystems allowing direct data access must (!) * ensure that cb[] area can be written to when BPF program is * invoked (otherwise cb[] save/restore is necessary). */ static inline void bpf_compute_data_pointers(struct sk_buff *skb) { struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; BUILD_BUG_ON(sizeof(*cb) > sizeof_field(struct sk_buff, cb)); cb->data_meta = skb->data - skb_metadata_len(skb); cb->data_end = skb->data + skb_headlen(skb); } static inline int bpf_prog_run_data_pointers( const struct bpf_prog *prog, struct sk_buff *skb) { struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; void *save_data_meta, *save_data_end; int res; save_data_meta = cb->data_meta; save_data_end = cb->data_end; bpf_compute_data_pointers(skb); res = bpf_prog_run(prog, skb); cb->data_meta = save_data_meta; cb->data_end = save_data_end; return res; } /* Similar to bpf_compute_data_pointers(), except that save orginal * data in cb->data and cb->meta_data for restore. */ static inline void bpf_compute_and_save_data_end( struct sk_buff *skb, void **saved_data_end) { struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; *saved_data_end = cb->data_end; cb->data_end = skb->data + skb_headlen(skb); } /* Restore data saved by bpf_compute_and_save_data_end(). */ static inline void bpf_restore_data_end( struct sk_buff *skb, void *saved_data_end) { struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; cb->data_end = saved_data_end; } static inline u8 *bpf_skb_cb(const struct sk_buff *skb) { /* eBPF programs may read/write skb->cb[] area to transfer meta * data between tail calls. Since this also needs to work with * tc, that scratch memory is mapped to qdisc_skb_cb's data area. * * In some socket filter cases, the cb unfortunately needs to be * saved/restored so that protocol specific skb->cb[] data won't * be lost. In any case, due to unpriviledged eBPF programs * attached to sockets, we need to clear the bpf_skb_cb() area * to not leak previous contents to user space. 
*/ BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) != BPF_SKB_CB_LEN); BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) != sizeof_field(struct qdisc_skb_cb, data)); return qdisc_skb_cb(skb)->data; } /* Must be invoked with migration disabled */ static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog, const void *ctx) { const struct sk_buff *skb = ctx; u8 *cb_data = bpf_skb_cb(skb); u8 cb_saved[BPF_SKB_CB_LEN]; u32 res; if (unlikely(prog->cb_access)) { memcpy(cb_saved, cb_data, sizeof(cb_saved)); memset(cb_data, 0, sizeof(cb_saved)); } res = bpf_prog_run(prog, skb); if (unlikely(prog->cb_access)) memcpy(cb_data, cb_saved, sizeof(cb_saved)); return res; } static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog, struct sk_buff *skb) { u32 res; migrate_disable(); res = __bpf_prog_run_save_cb(prog, skb); migrate_enable(); return res; } static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog, struct sk_buff *skb) { u8 *cb_data = bpf_skb_cb(skb); u32 res; if (unlikely(prog->cb_access)) memset(cb_data, 0, BPF_SKB_CB_LEN); res = bpf_prog_run_pin_on_cpu(prog, skb); return res; } DECLARE_BPF_DISPATCHER(xdp) DECLARE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key); u32 xdp_master_redirect(struct xdp_buff *xdp); void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog); static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog) { return prog->len * sizeof(struct bpf_insn); } static inline unsigned int bpf_prog_size(unsigned int proglen) { return max(sizeof(struct bpf_prog), offsetof(struct bpf_prog, insns[proglen])); } static inline bool bpf_prog_was_classic(const struct bpf_prog *prog) { /* When classic BPF programs have been loaded and the arch * does not have a classic BPF JIT (anymore), they have been * converted via bpf_migrate_filter() to eBPF and thus always * have an unspec program type. */ return prog->type == BPF_PROG_TYPE_UNSPEC; } static inline u32 bpf_ctx_off_adjust_machine(u32 size) { const u32 size_machine = sizeof(unsigned long); if (size > size_machine && size % size_machine == 0) size = size_machine; return size; } static inline bool bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) { return size <= size_default && (size & (size - 1)) == 0; } static inline u8 bpf_ctx_narrow_access_offset(u32 off, u32 size, u32 size_default) { u8 access_off = off & (size_default - 1); #ifdef __LITTLE_ENDIAN return access_off; #else return size_default - (access_off + size); #endif } #define bpf_ctx_wide_access_ok(off, size, type, field) \ (size == sizeof(__u64) && \ off >= offsetof(type, field) && \ off + sizeof(__u64) <= offsetofend(type, field) && \ off % sizeof(__u64) == 0) #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) static inline int __must_check bpf_prog_lock_ro(struct bpf_prog *fp) { #ifndef CONFIG_BPF_JIT_ALWAYS_ON if (!fp->jited) { set_vm_flush_reset_perms(fp); return set_memory_ro((unsigned long)fp, fp->pages); } #endif return 0; } static inline int __must_check bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) { set_vm_flush_reset_perms(hdr); return set_memory_rox((unsigned long)hdr, hdr->size >> PAGE_SHIFT); } enum skb_drop_reason sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); static inline int sk_filter(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason drop_reason; drop_reason = sk_filter_trim_cap(sk, skb, 1); return drop_reason ? 
-EPERM : 0; } static inline enum skb_drop_reason sk_filter_reason(struct sock *sk, struct sk_buff *skb) { return sk_filter_trim_cap(sk, skb, 1); } struct bpf_prog *__bpf_prog_select_runtime(struct bpf_verifier_env *env, struct bpf_prog *fp, int *err); struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err); void bpf_prog_free(struct bpf_prog *fp); bool bpf_opcode_in_insntable(u8 code); void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, const u32 *insn_to_jit_off); int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog); void bpf_prog_jit_attempt_done(struct bpf_prog *prog); struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags); struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); void __bpf_prog_free(struct bpf_prog *fp); static inline void bpf_prog_unlock_free(struct bpf_prog *fp) { __bpf_prog_free(fp); } typedef int (*bpf_aux_classic_check_t)(struct sock_filter *filter, unsigned int flen); int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog); int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, bpf_aux_classic_check_t trans, bool save_orig); void bpf_prog_destroy(struct bpf_prog *fp); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk); void sk_reuseport_prog_free(struct bpf_prog *prog); int sk_detach_filter(struct sock *sk); int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len); bool sk_filter_charge(struct sock *sk, struct sk_filter *fp); void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); #define __bpf_call_base_args \ ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \ (void *)__bpf_call_base) struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); bool bpf_jit_needs_zext(void); bool bpf_jit_inlines_helper_call(s32 imm); bool bpf_jit_supports_subprog_tailcalls(void); bool bpf_jit_supports_percpu_insn(void); bool bpf_jit_supports_kfunc_call(void); bool bpf_jit_supports_far_kfunc_call(void); bool bpf_jit_supports_exceptions(void); bool bpf_jit_supports_ptr_xchg(void); bool bpf_jit_supports_arena(void); bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena); bool bpf_jit_supports_private_stack(void); bool bpf_jit_supports_timed_may_goto(void); bool bpf_jit_supports_fsession(void); u64 bpf_arch_uaddress_limit(void); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); u64 arch_bpf_timed_may_goto(void); u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *); bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id); static inline bool bpf_dump_raw_ok(const struct cred *cred) { /* Reconstruction of call-sites is dependent on kallsyms, * thus make dump the same restriction. 
*/ return kallsyms_show_value(cred); } struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); #ifdef CONFIG_BPF_SYSCALL struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, const struct bpf_insn *patch, u32 len); struct bpf_insn_aux_data *bpf_dup_insn_aux_data(struct bpf_verifier_env *env); void bpf_restore_insn_aux_data(struct bpf_verifier_env *env, struct bpf_insn_aux_data *orig_insn_aux); #else static inline struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, const struct bpf_insn *patch, u32 len) { return ERR_PTR(-ENOTSUPP); } static inline struct bpf_insn_aux_data *bpf_dup_insn_aux_data(struct bpf_verifier_env *env) { return NULL; } static inline void bpf_restore_insn_aux_data(struct bpf_verifier_env *env, struct bpf_insn_aux_data *orig_insn_aux) { } #endif /* CONFIG_BPF_SYSCALL */ int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt); static inline bool xdp_return_frame_no_direct(void) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT; } static inline void xdp_set_return_frame_no_direct(void) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT; } static inline void xdp_clear_return_frame_no_direct(void) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT; } static inline int xdp_ok_fwd_dev(const struct net_device *fwd, unsigned int pktlen) { unsigned int len; if (unlikely(!(fwd->flags & IFF_UP))) return -ENETDOWN; len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; if (pktlen > len) return -EMSGSIZE; return 0; } /* The pair of xdp_do_redirect and xdp_do_flush MUST be called in the * same cpu context. Further for best results no more than a single map * for the do_redirect/do_flush pair should be used. This limitation is * because we only track one map and force a flush when the map changes. * This does not appear to be a real limitation for existing software. 
*/ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, const struct bpf_prog *prog); int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, const struct bpf_prog *prog); int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, struct xdp_frame *xdpf, const struct bpf_prog *prog); void xdp_do_flush(void); void bpf_warn_invalid_xdp_action(const struct net_device *dev, const struct bpf_prog *prog, u32 act); #ifdef CONFIG_INET struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, struct sock *migrating_sk, u32 hash); #else static inline struct sock * bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, struct sock *migrating_sk, u32 hash) { return NULL; } #endif #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; extern int bpf_jit_harden; extern int bpf_jit_kallsyms; extern long bpf_jit_limit; extern long bpf_jit_limit_max; typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); void bpf_jit_fill_hole_with_zero(void *area, unsigned int size); struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, bpf_jit_fill_hole_t bpf_fill_ill_insns); void bpf_jit_binary_free(struct bpf_binary_header *hdr); u64 bpf_jit_alloc_exec_limit(void); void *bpf_jit_alloc_exec(unsigned long size); void bpf_jit_free_exec(void *addr); void bpf_jit_free(struct bpf_prog *fp); struct bpf_binary_header * bpf_jit_binary_pack_hdr(const struct bpf_prog *fp); void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns); void bpf_prog_pack_free(void *ptr, u32 size); static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) { return list_empty(&fp->aux->ksym.lnode) || fp->aux->ksym.lnode.prev == LIST_POISON2; } struct bpf_binary_header * bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **ro_image, unsigned int alignment, struct bpf_binary_header **rw_hdr, u8 **rw_image, bpf_jit_fill_hole_t bpf_fill_ill_insns); int bpf_jit_binary_pack_finalize(struct bpf_binary_header *ro_header, struct bpf_binary_header *rw_header); void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, struct bpf_binary_header *rw_header); int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, struct bpf_jit_poke_descriptor *poke); int bpf_jit_get_func_addr(const struct bpf_prog *prog, const struct bpf_insn *insn, bool extra_pass, u64 *func_addr, bool *func_addr_fixed); const char *bpf_jit_get_prog_name(struct bpf_prog *prog); struct bpf_prog *bpf_jit_blind_constants(struct bpf_verifier_env *env, struct bpf_prog *prog); void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other); static inline bool bpf_prog_need_blind(const struct bpf_prog *prog) { return prog->blinding_requested && !prog->blinded; } static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, u32 pass, void *image) { pr_err("flen=%u proglen=%u pass=%u image=%p from=%s pid=%d\n", flen, proglen, pass, image, current->comm, task_pid_nr(current)); if (image) print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET, 16, 1, image, proglen, false); } static inline bool bpf_jit_is_ebpf(void) { # ifdef CONFIG_HAVE_EBPF_JIT return true; # else return false; # endif } static inline bool ebpf_jit_enabled(void) { return bpf_jit_enable && bpf_jit_is_ebpf(); } static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) { return fp->jited && bpf_jit_is_ebpf(); } 
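/*
 * Editor's sketch (not kernel code): a condensed illustration of how an
 * architecture JIT typically strings together the allocation and dump
 * helpers declared above. It is modelled loosely on the in-tree arch JITs;
 * the jit_fill_hole()/emit_*() names, the ctx variable and the single-pass
 * structure are placeholders, not real kernel symbols.
 *
 *	struct bpf_binary_header *hdr;
 *	u8 *image;
 *
 *	hdr = bpf_jit_binary_alloc(proglen, &image, 4, jit_fill_hole);
 *	if (!hdr)
 *		return orig_prog;
 *
 *	emit_prologue(&ctx);
 *	emit_body(&ctx, prog);		// usually several passes to settle offsets
 *	emit_epilogue(&ctx);
 *
 *	if (bpf_jit_enable > 1)
 *		bpf_jit_dump(prog->len, proglen, pass, image);
 *
 *	if (bpf_jit_binary_lock_ro(hdr)) {
 *		bpf_jit_binary_free(hdr);
 *		return orig_prog;
 *	}
 *	prog->bpf_func = (void *)image;
 *	prog->jited = 1;
 *	prog->jited_len = proglen;
 */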
static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) { /* These are the prerequisites, should someone ever have the * idea to call blinding outside of them, we make sure to * bail out. */ if (!bpf_jit_is_ebpf()) return false; if (!prog->jit_requested) return false; if (!bpf_jit_harden) return false; if (bpf_jit_harden == 1 && bpf_token_capable(prog->aux->token, CAP_BPF)) return false; return true; } static inline bool bpf_jit_kallsyms_enabled(void) { /* There are a couple of corner cases where kallsyms should * not be enabled f.e. on hardening. */ if (bpf_jit_harden) return false; if (!bpf_jit_kallsyms) return false; if (bpf_jit_kallsyms == 1) return true; return false; } int bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym); bool is_bpf_text_address(unsigned long addr); int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym); struct bpf_prog *bpf_prog_ksym_find(unsigned long addr); void bpf_prog_kallsyms_add(struct bpf_prog *fp); void bpf_prog_kallsyms_del(struct bpf_prog *fp); #else /* CONFIG_BPF_JIT */ static inline bool ebpf_jit_enabled(void) { return false; } static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) { return false; } static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) { return false; } static inline int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, struct bpf_jit_poke_descriptor *poke) { return -ENOTSUPP; } static inline void bpf_jit_free(struct bpf_prog *fp) { bpf_prog_unlock_free(fp); } static inline bool bpf_jit_kallsyms_enabled(void) { return false; } static inline int bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym) { return 0; } static inline bool is_bpf_text_address(unsigned long addr) { return false; } static inline int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym) { return -ERANGE; } static inline struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) { return NULL; } static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp) { } static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp) { } static inline bool bpf_prog_need_blind(const struct bpf_prog *prog) { return false; } static inline struct bpf_prog *bpf_jit_blind_constants(struct bpf_verifier_env *env, struct bpf_prog *prog) { return prog; } static inline void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other) { } #endif /* CONFIG_BPF_JIT */ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp); #define BPF_ANC BIT(15) static inline bool bpf_needs_clear_a(const struct sock_filter *first) { switch (first->code) { case BPF_RET | BPF_K: case BPF_LD | BPF_W | BPF_LEN: return false; case BPF_LD | BPF_W | BPF_ABS: case BPF_LD | BPF_H | BPF_ABS: case BPF_LD | BPF_B | BPF_ABS: if (first->k == SKF_AD_OFF + SKF_AD_ALU_XOR_X) return true; return false; default: return true; } } static inline u16 bpf_anc_helper(const struct sock_filter *ftest) { BUG_ON(ftest->code & BPF_ANC); switch (ftest->code) { case BPF_LD | BPF_W | BPF_ABS: case BPF_LD | BPF_H | BPF_ABS: case BPF_LD | BPF_B | BPF_ABS: #define BPF_ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE: \ return BPF_ANC | SKF_AD_##CODE switch (ftest->k) { BPF_ANCILLARY(PROTOCOL); BPF_ANCILLARY(PKTTYPE); BPF_ANCILLARY(IFINDEX); BPF_ANCILLARY(NLATTR); BPF_ANCILLARY(NLATTR_NEST); BPF_ANCILLARY(MARK); BPF_ANCILLARY(QUEUE); BPF_ANCILLARY(HATYPE); BPF_ANCILLARY(RXHASH); BPF_ANCILLARY(CPU); BPF_ANCILLARY(ALU_XOR_X); BPF_ANCILLARY(VLAN_TAG); 
BPF_ANCILLARY(VLAN_TAG_PRESENT); BPF_ANCILLARY(PAY_OFFSET); BPF_ANCILLARY(RANDOM); BPF_ANCILLARY(VLAN_TPID); } fallthrough; default: return ftest->code; } } void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size); static inline int bpf_tell_extensions(void) { return SKF_AD_MAX; } struct bpf_sock_addr_kern { struct sock *sk; struct sockaddr_unsized *uaddr; /* Temporary "register" to make indirect stores to nested structures * defined above. We need three registers to make such a store, but * only two (src and dst) are available at convert_ctx_access time */ u64 tmp_reg; void *t_ctx; /* Attach type specific context. */ u32 uaddrlen; }; struct bpf_sock_ops_kern { struct sock *sk; union { u32 args[4]; u32 reply; u32 replylong[4]; }; struct sk_buff *syn_skb; struct sk_buff *skb; void *skb_data_end; u8 op; u8 is_fullsock; u8 is_locked_tcp_sock; u8 remaining_opt_len; u64 temp; /* temp and everything after is not * initialized to 0 before calling * the BPF program. New fields that * should be initialized to 0 should * be inserted before temp. * temp is scratch storage used by * sock_ops_convert_ctx_access * as temporary storage of a register. */ }; struct bpf_sysctl_kern { struct ctl_table_header *head; const struct ctl_table *table; void *cur_val; size_t cur_len; void *new_val; size_t new_len; int new_updated; int write; loff_t *ppos; /* Temporary "register" for indirect stores to ppos. */ u64 tmp_reg; }; #define BPF_SOCKOPT_KERN_BUF_SIZE 32 struct bpf_sockopt_buf { u8 data[BPF_SOCKOPT_KERN_BUF_SIZE]; }; struct bpf_sockopt_kern { struct sock *sk; u8 *optval; u8 *optval_end; s32 level; s32 optname; s32 optlen; /* for retval in struct bpf_cg_run_ctx */ struct task_struct *current_task; /* Temporary "register" for indirect stores to ppos. */ u64 tmp_reg; }; int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len); struct bpf_sk_lookup_kern { u16 family; u16 protocol; __be16 sport; u16 dport; struct { __be32 saddr; __be32 daddr; } v4; struct { const struct in6_addr *saddr; const struct in6_addr *daddr; } v6; struct sock *selected_sk; u32 ingress_ifindex; bool no_reuseport; }; extern struct static_key_false bpf_sk_lookup_enabled; /* Runners for BPF_SK_LOOKUP programs to invoke on socket lookup. * * Allowed return values for a BPF SK_LOOKUP program are SK_PASS and * SK_DROP. Their meaning is as follows: * * SK_PASS && ctx.selected_sk != NULL: use selected_sk as lookup result * SK_PASS && ctx.selected_sk == NULL: continue to htable-based socket lookup * SK_DROP : terminate lookup with -ECONNREFUSED * * This macro aggregates return values and selected sockets from * multiple BPF programs according to following rules in order: * * 1. If any program returned SK_PASS and a non-NULL ctx.selected_sk, * macro result is SK_PASS and last ctx.selected_sk is used. * 2. If any program returned SK_DROP return value, * macro result is SK_DROP. * 3. Otherwise result is SK_PASS and ctx.selected_sk is NULL. * * Caller must ensure that the prog array is non-NULL, and that the * array as well as the programs it contains remain valid. 
*/ #define BPF_PROG_SK_LOOKUP_RUN_ARRAY(array, ctx, func) \ ({ \ struct bpf_sk_lookup_kern *_ctx = &(ctx); \ struct bpf_prog_array_item *_item; \ struct sock *_selected_sk = NULL; \ bool _no_reuseport = false; \ struct bpf_prog *_prog; \ bool _all_pass = true; \ u32 _ret; \ \ migrate_disable(); \ _item = &(array)->items[0]; \ while ((_prog = READ_ONCE(_item->prog))) { \ /* restore most recent selection */ \ _ctx->selected_sk = _selected_sk; \ _ctx->no_reuseport = _no_reuseport; \ \ _ret = func(_prog, _ctx); \ if (_ret == SK_PASS && _ctx->selected_sk) { \ /* remember last non-NULL socket */ \ _selected_sk = _ctx->selected_sk; \ _no_reuseport = _ctx->no_reuseport; \ } else if (_ret == SK_DROP && _all_pass) { \ _all_pass = false; \ } \ _item++; \ } \ _ctx->selected_sk = _selected_sk; \ _ctx->no_reuseport = _no_reuseport; \ migrate_enable(); \ _all_pass || _selected_sk ? SK_PASS : SK_DROP; \ }) static inline bool bpf_sk_lookup_run_v4(const struct net *net, int protocol, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 dport, const int ifindex, struct sock **psk) { struct bpf_prog_array *run_array; struct sock *selected_sk = NULL; bool no_reuseport = false; rcu_read_lock(); run_array = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]); if (run_array) { struct bpf_sk_lookup_kern ctx = { .family = AF_INET, .protocol = protocol, .v4.saddr = saddr, .v4.daddr = daddr, .sport = sport, .dport = dport, .ingress_ifindex = ifindex, }; u32 act; act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, bpf_prog_run); if (act == SK_PASS) { selected_sk = ctx.selected_sk; no_reuseport = ctx.no_reuseport; } else { selected_sk = ERR_PTR(-ECONNREFUSED); } } rcu_read_unlock(); *psk = selected_sk; return no_reuseport; } #if IS_ENABLED(CONFIG_IPV6) static inline bool bpf_sk_lookup_run_v6(const struct net *net, int protocol, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const u16 dport, const int ifindex, struct sock **psk) { struct bpf_prog_array *run_array; struct sock *selected_sk = NULL; bool no_reuseport = false; rcu_read_lock(); run_array = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]); if (run_array) { struct bpf_sk_lookup_kern ctx = { .family = AF_INET6, .protocol = protocol, .v6.saddr = saddr, .v6.daddr = daddr, .sport = sport, .dport = dport, .ingress_ifindex = ifindex, }; u32 act; act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, bpf_prog_run); if (act == SK_PASS) { selected_sk = ctx.selected_sk; no_reuseport = ctx.no_reuseport; } else { selected_sk = ERR_PTR(-ECONNREFUSED); } } rcu_read_unlock(); *psk = selected_sk; return no_reuseport; } #endif /* IS_ENABLED(CONFIG_IPV6) */ static __always_inline long __bpf_xdp_redirect_map(struct bpf_map *map, u64 index, u64 flags, const u64 flag_mask, void *lookup_elem(struct bpf_map *map, u32 key)) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX; /* Lower bits of the flags are used as return code on lookup failure */ if (unlikely(flags & ~(action_mask | flag_mask))) return XDP_ABORTED; ri->tgt_value = lookup_elem(map, index); if (unlikely(!ri->tgt_value) && !(flags & BPF_F_BROADCAST)) { /* If the lookup fails we want to clear out the state in the * redirect_info struct completely, so that if an eBPF program * performs multiple lookups, the last one always takes * precedence. 
*/ ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */ ri->map_type = BPF_MAP_TYPE_UNSPEC; return flags & action_mask; } ri->tgt_index = index; ri->map_id = map->id; ri->map_type = map->map_type; if (flags & BPF_F_BROADCAST) { WRITE_ONCE(ri->map, map); ri->flags = flags; } else { WRITE_ONCE(ri->map, NULL); ri->flags = 0; } return XDP_REDIRECT; } #ifdef CONFIG_NET int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len); int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags); int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len); void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf, unsigned long len, bool flush); int __bpf_skb_meta_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags); void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset); #else /* CONFIG_NET */ static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) { return -EOPNOTSUPP; } static inline int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) { return -EOPNOTSUPP; } static inline int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) { return -EOPNOTSUPP; } static inline int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) { return -EOPNOTSUPP; } static inline void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len) { return NULL; } static inline void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf, unsigned long len, bool flush) { } static inline int __bpf_skb_meta_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) { return -EOPNOTSUPP; } static inline void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset) { return ERR_PTR(-EOPNOTSUPP); } #endif /* CONFIG_NET */ #endif /* __LINUX_FILTER_H__ */ |
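A note on the flags handling in __bpf_xdp_redirect_map() above: because the low bits of the flags argument are handed back verbatim when the map lookup fails, a BPF program can choose its own fallback action for unpopulated map slots. Below is a minimal BPF-side sketch of that behaviour, assuming a hypothetical devmap named tx_ports and standard libbpf conventions; it is illustrative only and not part of either header.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_DEVMAP);
	__uint(max_entries, 64);
	__type(key, __u32);
	__type(value, __u32);
} tx_ports SEC(".maps");

SEC("xdp")
int xdp_redirect_or_pass(struct xdp_md *ctx)
{
	__u32 key = ctx->rx_queue_index;

	/* On a missing tx_ports entry the helper returns XDP_PASS (taken
	 * from the low flag bits) instead of XDP_ABORTED, so unconfigured
	 * queues fall back to the regular network stack.
	 */
	return bpf_redirect_map(&tx_ports, key, XDP_PASS);
}

char _license[] SEC("license") = "GPL";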
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#ifndef __XFS_FORMAT_H__
#define __XFS_FORMAT_H__

/*
 * XFS On Disk Format Definitions
 *
 * This header file defines all the on-disk format definitions for
 * general XFS objects. Directory and attribute related objects are defined in
 * xfs_da_format.h, which log and log item formats are defined in
 * xfs_log_format.h. Everything else goes here.
 */

struct xfs_mount;
struct xfs_trans;
struct xfs_inode;
struct xfs_buf;
struct xfs_ifork;

/*
 * Super block
 * Fits into a sector-sized buffer at address 0 of each allocation group.
 * Only the first of these is ever updated except during growfs.
 */
#define XFS_SB_MAGIC		0x58465342	/* 'XFSB' */
#define XFS_SB_VERSION_1	1		/* 5.3, 6.0.1, 6.1 */
#define XFS_SB_VERSION_2	2		/* 6.2 - attributes */
#define XFS_SB_VERSION_3	3		/* 6.2 - new inode version */
#define XFS_SB_VERSION_4	4		/* 6.2+ - bitmask version */
#define XFS_SB_VERSION_5	5		/* CRC enabled filesystem */
#define XFS_SB_VERSION_NUMBITS		0x000f
#define XFS_SB_VERSION_ALLFBITS		0xfff0
#define XFS_SB_VERSION_ATTRBIT		0x0010
#define XFS_SB_VERSION_NLINKBIT		0x0020
#define XFS_SB_VERSION_QUOTABIT		0x0040
#define XFS_SB_VERSION_ALIGNBIT		0x0080
#define XFS_SB_VERSION_DALIGNBIT	0x0100
#define XFS_SB_VERSION_SHAREDBIT	0x0200
#define XFS_SB_VERSION_LOGV2BIT		0x0400
#define XFS_SB_VERSION_SECTORBIT	0x0800
#define XFS_SB_VERSION_EXTFLGBIT	0x1000
#define XFS_SB_VERSION_DIRV2BIT		0x2000
#define XFS_SB_VERSION_BORGBIT		0x4000	/* ASCII only case-insens.
*/ #define XFS_SB_VERSION_MOREBITSBIT 0x8000 /* * The size of a single extended attribute on disk is limited by * the size of index values within the attribute entries themselves. * These are be16 fields, so we can only support attribute data * sizes up to 2^16 bytes in length. */ #define XFS_XATTR_SIZE_MAX (1 << 16) /* * Supported feature bit list is just all bits in the versionnum field because * we've used them all up and understand them all. Except, of course, for the * shared superblock bit, which nobody knows what it does and so is unsupported. */ #define XFS_SB_VERSION_OKBITS \ ((XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALLFBITS) & \ ~XFS_SB_VERSION_SHAREDBIT) /* * There are two words to hold XFS "feature" bits: the original * word, sb_versionnum, and sb_features2. Whenever a bit is set in * sb_features2, the feature bit XFS_SB_VERSION_MOREBITSBIT must be set. * * These defines represent bits in sb_features2. */ #define XFS_SB_VERSION2_RESERVED1BIT 0x00000001 #define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */ #define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 #define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ #define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ #define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ #define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */ #define XFS_SB_VERSION2_FTYPE 0x00000200 /* inode type in dir */ #define XFS_SB_VERSION2_OKBITS \ (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ XFS_SB_VERSION2_ATTR2BIT | \ XFS_SB_VERSION2_PROJID32BIT | \ XFS_SB_VERSION2_FTYPE) /* Maximum size of the xfs filesystem label, no terminating NULL */ #define XFSLABEL_MAX 12 /* * Superblock - in core version. Must be padded to 64 bit alignment. */ typedef struct xfs_sb { uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */ uint32_t sb_blocksize; /* logical block size, bytes */ xfs_rfsblock_t sb_dblocks; /* number of data blocks */ xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */ xfs_rtbxlen_t sb_rextents; /* number of realtime extents */ uuid_t sb_uuid; /* user-visible file system unique id */ xfs_fsblock_t sb_logstart; /* starting block of log if internal */ xfs_ino_t sb_rootino; /* root inode number */ xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */ xfs_ino_t sb_rsumino; /* summary inode for rt bitmap */ xfs_agblock_t sb_rextsize; /* realtime extent size, blocks */ xfs_agblock_t sb_agblocks; /* size of an allocation group */ xfs_agnumber_t sb_agcount; /* number of allocation groups */ xfs_extlen_t sb_rbmblocks; /* number of rt bitmap blocks */ xfs_extlen_t sb_logblocks; /* number of log blocks */ uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */ uint16_t sb_sectsize; /* volume sector size, bytes */ uint16_t sb_inodesize; /* inode size, bytes */ uint16_t sb_inopblock; /* inodes per block */ char sb_fname[XFSLABEL_MAX] __nonstring; /* file system name */ uint8_t sb_blocklog; /* log2 of sb_blocksize */ uint8_t sb_sectlog; /* log2 of sb_sectsize */ uint8_t sb_inodelog; /* log2 of sb_inodesize */ uint8_t sb_inopblog; /* log2 of sb_inopblock */ uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */ uint8_t sb_rextslog; /* log2 of sb_rextents */ uint8_t sb_inprogress; /* mkfs is in progress, don't mount */ uint8_t sb_imax_pct; /* max % of fs for inode space */ /* statistics */ /* * These fields must remain contiguous. If you really * want to change their layout, make sure you fix the * code in xfs_trans_apply_sb_deltas(). 
*/ uint64_t sb_icount; /* allocated inodes */ uint64_t sb_ifree; /* free inodes */ uint64_t sb_fdblocks; /* free data blocks */ uint64_t sb_frextents; /* free realtime extents */ /* * End contiguous fields. */ xfs_ino_t sb_uquotino; /* user quota inode */ xfs_ino_t sb_gquotino; /* group quota inode */ uint16_t sb_qflags; /* quota flags */ uint8_t sb_flags; /* misc. flags */ uint8_t sb_shared_vn; /* shared version number */ xfs_extlen_t sb_inoalignmt; /* inode chunk alignment, fsblocks */ uint32_t sb_unit; /* stripe or raid unit */ uint32_t sb_width; /* stripe or raid width */ uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */ uint8_t sb_logsectlog; /* log2 of the log sector size */ uint16_t sb_logsectsize; /* sector size for the log, bytes */ uint32_t sb_logsunit; /* stripe unit size for the log */ uint32_t sb_features2; /* additional feature bits */ /* * bad features2 field as a result of failing to pad the sb structure to * 64 bits. Some machines will be using this field for features2 bits. * Easiest just to mark it bad and not use it for anything else. * * This is not kept up to date in memory; it is always overwritten by * the value in sb_features2 when formatting the incore superblock to * the disk buffer. */ uint32_t sb_bad_features2; /* version 5 superblock fields start here */ /* feature masks */ uint32_t sb_features_compat; uint32_t sb_features_ro_compat; uint32_t sb_features_incompat; uint32_t sb_features_log_incompat; uint32_t sb_crc; /* superblock crc */ xfs_extlen_t sb_spino_align; /* sparse inode chunk alignment */ xfs_ino_t sb_pquotino; /* project quota inode */ xfs_lsn_t sb_lsn; /* last write sequence */ uuid_t sb_meta_uuid; /* metadata file system unique id */ xfs_ino_t sb_metadirino; /* metadata directory tree root */ xfs_rgnumber_t sb_rgcount; /* number of realtime groups */ xfs_rtxlen_t sb_rgextents; /* size of a realtime group in rtx */ uint8_t sb_rgblklog; /* rt group number shift */ uint8_t sb_pad[7]; /* zeroes */ xfs_rfsblock_t sb_rtstart; /* start of internal RT section (FSB) */ xfs_filblks_t sb_rtreserved; /* reserved (zoned) RT blocks */ /* must be padded to 64 bit alignment */ } xfs_sb_t; /* * Superblock - on disk version. * Must be padded to 64 bit alignment. 
*/ struct xfs_dsb { __be32 sb_magicnum; /* magic number == XFS_SB_MAGIC */ __be32 sb_blocksize; /* logical block size, bytes */ __be64 sb_dblocks; /* number of data blocks */ __be64 sb_rblocks; /* number of realtime blocks */ __be64 sb_rextents; /* number of realtime extents */ uuid_t sb_uuid; /* user-visible file system unique id */ __be64 sb_logstart; /* starting block of log if internal */ __be64 sb_rootino; /* root inode number */ __be64 sb_rbmino; /* bitmap inode for realtime extents */ __be64 sb_rsumino; /* summary inode for rt bitmap */ __be32 sb_rextsize; /* realtime extent size, blocks */ __be32 sb_agblocks; /* size of an allocation group */ __be32 sb_agcount; /* number of allocation groups */ __be32 sb_rbmblocks; /* number of rt bitmap blocks */ __be32 sb_logblocks; /* number of log blocks */ __be16 sb_versionnum; /* header version == XFS_SB_VERSION */ __be16 sb_sectsize; /* volume sector size, bytes */ __be16 sb_inodesize; /* inode size, bytes */ __be16 sb_inopblock; /* inodes per block */ char sb_fname[XFSLABEL_MAX]; /* file system name */ __u8 sb_blocklog; /* log2 of sb_blocksize */ __u8 sb_sectlog; /* log2 of sb_sectsize */ __u8 sb_inodelog; /* log2 of sb_inodesize */ __u8 sb_inopblog; /* log2 of sb_inopblock */ __u8 sb_agblklog; /* log2 of sb_agblocks (rounded up) */ __u8 sb_rextslog; /* log2 of sb_rextents */ __u8 sb_inprogress; /* mkfs is in progress, don't mount */ __u8 sb_imax_pct; /* max % of fs for inode space */ /* statistics */ /* * These fields must remain contiguous. If you really * want to change their layout, make sure you fix the * code in xfs_trans_apply_sb_deltas(). */ __be64 sb_icount; /* allocated inodes */ __be64 sb_ifree; /* free inodes */ __be64 sb_fdblocks; /* free data blocks */ __be64 sb_frextents; /* free realtime extents */ /* * End contiguous fields. */ __be64 sb_uquotino; /* user quota inode */ __be64 sb_gquotino; /* group quota inode */ __be16 sb_qflags; /* quota flags */ __u8 sb_flags; /* misc. flags */ __u8 sb_shared_vn; /* shared version number */ __be32 sb_inoalignmt; /* inode chunk alignment, fsblocks */ __be32 sb_unit; /* stripe or raid unit */ __be32 sb_width; /* stripe or raid width */ __u8 sb_dirblklog; /* log2 of dir block size (fsbs) */ __u8 sb_logsectlog; /* log2 of the log sector size */ __be16 sb_logsectsize; /* sector size for the log, bytes */ __be32 sb_logsunit; /* stripe unit size for the log */ __be32 sb_features2; /* additional feature bits */ /* * bad features2 field as a result of failing to pad the sb * structure to 64 bits. Some machines will be using this field * for features2 bits. Easiest just to mark it bad and not use * it for anything else. */ __be32 sb_bad_features2; /* version 5 superblock fields start here */ /* feature masks */ __be32 sb_features_compat; __be32 sb_features_ro_compat; __be32 sb_features_incompat; __be32 sb_features_log_incompat; __le32 sb_crc; /* superblock crc */ __be32 sb_spino_align; /* sparse inode chunk alignment */ __be64 sb_pquotino; /* project quota inode */ __be64 sb_lsn; /* last write sequence */ uuid_t sb_meta_uuid; /* metadata file system unique id */ __be64 sb_metadirino; /* metadata directory tree root */ __be32 sb_rgcount; /* # of realtime groups */ __be32 sb_rgextents; /* size of rtgroup in rtx */ __u8 sb_rgblklog; /* rt group number shift */ __u8 sb_pad[7]; /* zeroes */ __be64 sb_rtstart; /* start of internal RT section (FSB) */ __be64 sb_rtreserved; /* reserved (zoned) RT blocks */ /* * The size of this structure must be padded to 64 bit alignment. 
* * NOTE: Don't forget to update secondary_sb_whack in xfs_repair when * adding new fields here. */ }; #define XFS_SB_CRC_OFF offsetof(struct xfs_dsb, sb_crc) /* * Misc. Flags - warning - these will be cleared by xfs_repair unless * a feature bit is set when the flag is used. */ #define XFS_SBF_NOFLAGS 0x00 /* no flags set */ #define XFS_SBF_READONLY 0x01 /* only read-only mounts allowed */ /* * define max. shared version we can interoperate with */ #define XFS_SB_MAX_SHARED_VN 0 #define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS) static inline bool xfs_sb_is_v5(const struct xfs_sb *sbp) { return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; } /* * Detect a mismatched features2 field. Older kernels read/wrote * this into the wrong slot, so to be safe we keep them in sync. */ static inline bool xfs_sb_has_mismatched_features2(const struct xfs_sb *sbp) { return sbp->sb_bad_features2 != sbp->sb_features2; } static inline bool xfs_sb_version_hasmorebits(const struct xfs_sb *sbp) { return xfs_sb_is_v5(sbp) || (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); } static inline void xfs_sb_version_addattr(struct xfs_sb *sbp) { sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT; } static inline void xfs_sb_version_addquota(struct xfs_sb *sbp) { sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT; } static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp) { sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT; } static inline void xfs_sb_version_addprojid32(struct xfs_sb *sbp) { sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT; } /* * Extended v5 superblock feature masks. These are to be used for new v5 * superblock features only. * * Compat features are new features that old kernels will not notice or affect * and so can mount read-write without issues. * * RO-Compat (read only) are features that old kernels can read but will break * if they write. Hence only read-only mounts of such filesystems are allowed on * kernels that don't support the feature bit. * * InCompat features are features which old kernels will not understand and so * must not mount. * * Log-InCompat features are for changes to log formats or new transactions that * can't be replayed on older kernels. The fields are set when the filesystem is * mounted, and a clean unmount clears the fields. 
*/ #define XFS_SB_FEAT_COMPAT_ALL 0 #define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL static inline bool xfs_sb_has_compat_feature( const struct xfs_sb *sbp, uint32_t feature) { return (sbp->sb_features_compat & feature) != 0; } #define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */ #define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */ #define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */ #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */ #define XFS_SB_FEAT_RO_COMPAT_ALL \ (XFS_SB_FEAT_RO_COMPAT_FINOBT | \ XFS_SB_FEAT_RO_COMPAT_RMAPBT | \ XFS_SB_FEAT_RO_COMPAT_REFLINK| \ XFS_SB_FEAT_RO_COMPAT_INOBTCNT) #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL static inline bool xfs_sb_has_ro_compat_feature( const struct xfs_sb *sbp, uint32_t feature) { return (sbp->sb_features_ro_compat & feature) != 0; } #define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ #define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */ #define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */ #define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */ #define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4) /* needs xfs_repair */ #define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */ #define XFS_SB_FEAT_INCOMPAT_EXCHRANGE (1 << 6) /* exchangerange supported */ #define XFS_SB_FEAT_INCOMPAT_PARENT (1 << 7) /* parent pointers */ #define XFS_SB_FEAT_INCOMPAT_METADIR (1 << 8) /* metadata dir tree */ #define XFS_SB_FEAT_INCOMPAT_ZONED (1 << 9) /* zoned RT allocator */ #define XFS_SB_FEAT_INCOMPAT_ZONE_GAPS (1 << 10) /* RTGs have LBA gaps */ #define XFS_SB_FEAT_INCOMPAT_ALL \ (XFS_SB_FEAT_INCOMPAT_FTYPE | \ XFS_SB_FEAT_INCOMPAT_SPINODES | \ XFS_SB_FEAT_INCOMPAT_META_UUID | \ XFS_SB_FEAT_INCOMPAT_BIGTIME | \ XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR | \ XFS_SB_FEAT_INCOMPAT_NREXT64 | \ XFS_SB_FEAT_INCOMPAT_EXCHRANGE | \ XFS_SB_FEAT_INCOMPAT_PARENT | \ XFS_SB_FEAT_INCOMPAT_METADIR | \ XFS_SB_FEAT_INCOMPAT_ZONED | \ XFS_SB_FEAT_INCOMPAT_ZONE_GAPS) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool xfs_sb_has_incompat_feature( const struct xfs_sb *sbp, uint32_t feature) { return (sbp->sb_features_incompat & feature) != 0; } #define XFS_SB_FEAT_INCOMPAT_LOG_XATTRS (1 << 0) /* Delayed Attributes */ #define XFS_SB_FEAT_INCOMPAT_LOG_ALL \ (XFS_SB_FEAT_INCOMPAT_LOG_XATTRS) #define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL static inline bool xfs_sb_has_incompat_log_feature( const struct xfs_sb *sbp, uint32_t feature) { return (sbp->sb_features_log_incompat & feature) != 0; } static inline void xfs_sb_remove_incompat_log_features( struct xfs_sb *sbp) { sbp->sb_features_log_incompat &= ~XFS_SB_FEAT_INCOMPAT_LOG_ALL; } static inline void xfs_sb_add_incompat_log_features( struct xfs_sb *sbp, unsigned int features) { sbp->sb_features_log_incompat |= features; } static inline bool xfs_sb_version_haslogxattrs(const struct xfs_sb *sbp) { return xfs_sb_is_v5(sbp) && (sbp->sb_features_log_incompat & XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); } static inline bool xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) { return (ino == sbp->sb_uquotino || ino == sbp->sb_gquotino || ino == sbp->sb_pquotino); } #define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */ #define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR) #define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d)) #define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \ 
xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d)) #define XFS_FSB_TO_DADDR(mp,fsbno) XFS_AGB_TO_DADDR(mp, \ XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno)) /* * File system sector to basic block conversions. */ #define XFS_FSS_TO_BB(mp,sec) ((sec) << (mp)->m_sectbb_log) /* * File system block to basic block conversions. */ #define XFS_FSB_TO_BB(mp,fsbno) ((fsbno) << (mp)->m_blkbb_log) #define XFS_BB_TO_FSB(mp,bb) \ (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log) #define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log) /* * File system block to byte conversions. */ #define XFS_FSB_TO_B(mp,fsbno) ((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog) #define XFS_B_TO_FSB(mp,b) \ ((((uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog) #define XFS_B_TO_FSBT(mp,b) (((uint64_t)(b)) >> (mp)->m_sb.sb_blocklog) /* * Allocation group header * * This is divided into three structures, placed in sequential 512-byte * buffers after a copy of the superblock (also in a 512-byte buffer). */ #define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */ #define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */ #define XFS_AGFL_MAGIC 0x5841464c /* 'XAFL' */ #define XFS_AGF_VERSION 1 #define XFS_AGI_VERSION 1 #define XFS_AGF_GOOD_VERSION(v) ((v) == XFS_AGF_VERSION) #define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION) /* * agf_cnt_level in the first AGF overlaps the EFS superblock's magic number. * Since the magic numbers valid for EFS are > 64k, our value cannot be confused * for an EFS superblock. */ typedef struct xfs_agf { /* * Common allocation group header information */ __be32 agf_magicnum; /* magic number == XFS_AGF_MAGIC */ __be32 agf_versionnum; /* header version == XFS_AGF_VERSION */ __be32 agf_seqno; /* sequence # starting from 0 */ __be32 agf_length; /* size in blocks of a.g. */ /* * Freespace and rmap information */ __be32 agf_bno_root; /* bnobt root block */ __be32 agf_cnt_root; /* cntbt root block */ __be32 agf_rmap_root; /* rmapbt root block */ __be32 agf_bno_level; /* bnobt btree levels */ __be32 agf_cnt_level; /* cntbt btree levels */ __be32 agf_rmap_level; /* rmapbt btree levels */ __be32 agf_flfirst; /* first freelist block's index */ __be32 agf_fllast; /* last freelist block's index */ __be32 agf_flcount; /* count of blocks in freelist */ __be32 agf_freeblks; /* total free blocks */ __be32 agf_longest; /* longest free space */ __be32 agf_btreeblks; /* # of blocks held in AGF btrees */ uuid_t agf_uuid; /* uuid of filesystem */ __be32 agf_rmap_blocks; /* rmapbt blocks used */ __be32 agf_refcount_blocks; /* refcountbt blocks used */ __be32 agf_refcount_root; /* refcount tree root block */ __be32 agf_refcount_level; /* refcount btree levels */ /* * reserve some contiguous space for future logged fields before we add * the unlogged fields. This makes the range logging via flags and * structure offsets much simpler. */ __be64 agf_spare64[14]; /* unlogged fields, written during buffer writeback. 
*/ __be64 agf_lsn; /* last write sequence */ __be32 agf_crc; /* crc of agf sector */ __be32 agf_spare2; /* structure must be padded to 64 bit alignment */ } xfs_agf_t; #define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc) #define XFS_AGF_MAGICNUM (1u << 0) #define XFS_AGF_VERSIONNUM (1u << 1) #define XFS_AGF_SEQNO (1u << 2) #define XFS_AGF_LENGTH (1u << 3) #define XFS_AGF_ROOTS (1u << 4) #define XFS_AGF_LEVELS (1u << 5) #define XFS_AGF_FLFIRST (1u << 6) #define XFS_AGF_FLLAST (1u << 7) #define XFS_AGF_FLCOUNT (1u << 8) #define XFS_AGF_FREEBLKS (1u << 9) #define XFS_AGF_LONGEST (1u << 10) #define XFS_AGF_BTREEBLKS (1u << 11) #define XFS_AGF_UUID (1u << 12) #define XFS_AGF_RMAP_BLOCKS (1u << 13) #define XFS_AGF_REFCOUNT_BLOCKS (1u << 14) #define XFS_AGF_REFCOUNT_ROOT (1u << 15) #define XFS_AGF_REFCOUNT_LEVEL (1u << 16) #define XFS_AGF_SPARE64 (1u << 17) #define XFS_AGF_NUM_BITS 18 #define XFS_AGF_ALL_BITS ((1u << XFS_AGF_NUM_BITS) - 1) #define XFS_AGF_FLAGS \ { XFS_AGF_MAGICNUM, "MAGICNUM" }, \ { XFS_AGF_VERSIONNUM, "VERSIONNUM" }, \ { XFS_AGF_SEQNO, "SEQNO" }, \ { XFS_AGF_LENGTH, "LENGTH" }, \ { XFS_AGF_ROOTS, "ROOTS" }, \ { XFS_AGF_LEVELS, "LEVELS" }, \ { XFS_AGF_FLFIRST, "FLFIRST" }, \ { XFS_AGF_FLLAST, "FLLAST" }, \ { XFS_AGF_FLCOUNT, "FLCOUNT" }, \ { XFS_AGF_FREEBLKS, "FREEBLKS" }, \ { XFS_AGF_LONGEST, "LONGEST" }, \ { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \ { XFS_AGF_UUID, "UUID" }, \ { XFS_AGF_RMAP_BLOCKS, "RMAP_BLOCKS" }, \ { XFS_AGF_REFCOUNT_BLOCKS, "REFCOUNT_BLOCKS" }, \ { XFS_AGF_REFCOUNT_ROOT, "REFCOUNT_ROOT" }, \ { XFS_AGF_REFCOUNT_LEVEL, "REFCOUNT_LEVEL" }, \ { XFS_AGF_SPARE64, "SPARE64" } /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) #define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) /* * Size of the unlinked inode hash table in the agi. */ #define XFS_AGI_UNLINKED_BUCKETS 64 typedef struct xfs_agi { /* * Common allocation group header information */ __be32 agi_magicnum; /* magic number == XFS_AGI_MAGIC */ __be32 agi_versionnum; /* header version == XFS_AGI_VERSION */ __be32 agi_seqno; /* sequence # starting from 0 */ __be32 agi_length; /* size in blocks of a.g. */ /* * Inode information * Inodes are mapped by interpreting the inode number, so no * mapping data is needed here. */ __be32 agi_count; /* count of allocated inodes */ __be32 agi_root; /* root of inode btree */ __be32 agi_level; /* levels in inode btree */ __be32 agi_freecount; /* number of free inodes */ __be32 agi_newino; /* new inode just allocated */ __be32 agi_dirino; /* last directory inode chunk */ /* * Hash table of inodes which have been unlinked but are * still being referenced. */ __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS]; /* * This marks the end of logging region 1 and start of logging region 2. 
*/ uuid_t agi_uuid; /* uuid of filesystem */ __be32 agi_crc; /* crc of agi sector */ __be32 agi_pad32; __be64 agi_lsn; /* last write sequence */ __be32 agi_free_root; /* root of the free inode btree */ __be32 agi_free_level;/* levels in free inode btree */ __be32 agi_iblocks; /* inobt blocks used */ __be32 agi_fblocks; /* finobt blocks used */ /* structure must be padded to 64 bit alignment */ } xfs_agi_t; #define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc) #define XFS_AGI_MAGICNUM (1u << 0) #define XFS_AGI_VERSIONNUM (1u << 1) #define XFS_AGI_SEQNO (1u << 2) #define XFS_AGI_LENGTH (1u << 3) #define XFS_AGI_COUNT (1u << 4) #define XFS_AGI_ROOT (1u << 5) #define XFS_AGI_LEVEL (1u << 6) #define XFS_AGI_FREECOUNT (1u << 7) #define XFS_AGI_NEWINO (1u << 8) #define XFS_AGI_DIRINO (1u << 9) #define XFS_AGI_UNLINKED (1u << 10) #define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */ #define XFS_AGI_ALL_BITS_R1 ((1u << XFS_AGI_NUM_BITS_R1) - 1) #define XFS_AGI_FREE_ROOT (1u << 11) #define XFS_AGI_FREE_LEVEL (1u << 12) #define XFS_AGI_IBLOCKS (1u << 13) /* both inobt/finobt block counters */ #define XFS_AGI_NUM_BITS_R2 14 /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) #define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) /* * The third a.g. block contains the a.g. freelist, an array * of block pointers to blocks owned by the allocation btree code. */ #define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) #define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) #define XFS_BUF_TO_AGFL(bp) ((struct xfs_agfl *)((bp)->b_addr)) struct xfs_agfl { __be32 agfl_magicnum; __be32 agfl_seqno; uuid_t agfl_uuid; __be64 agfl_lsn; __be32 agfl_crc; } __attribute__((packed)); #define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc) #define XFS_AGB_TO_FSB(mp,agno,agbno) \ (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno)) #define XFS_FSB_TO_AGNO(mp,fsbno) \ ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog)) #define XFS_FSB_TO_AGBNO(mp,fsbno) \ ((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog))) #define XFS_AGB_TO_DADDR(mp,agno,agbno) \ ((xfs_daddr_t)XFS_FSB_TO_BB(mp, \ (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno))) #define XFS_AG_DADDR(mp,agno,d) (XFS_AGB_TO_DADDR(mp, agno, 0) + (d)) /* * For checking for bad ranges of xfs_daddr_t's, covering multiple * allocation groups or a single xfs_daddr_t that's a superblock copy. */ #define XFS_AG_CHECK_DADDR(mp,d,len) \ ((len) == 1 ? \ ASSERT((d) == XFS_SB_DADDR || \ xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \ ASSERT(xfs_daddr_to_agno(mp, d) == \ xfs_daddr_to_agno(mp, (d) + (len) - 1))) /* * Realtime bitmap information is accessed by the word, which is currently * stored in host-endian format. Starting with the realtime groups feature, * the words are stored in be32 ondisk. */ union xfs_rtword_raw { __u32 old; __be32 rtg; }; /* * Realtime summary counts are accessed by the word, which is currently * stored in host-endian format. Starting with the realtime groups feature, * the words are stored in be32 ondisk. */ union xfs_suminfo_raw { __u32 old; __be32 rtg; }; /* * Realtime allocation groups break the rt section into multiple pieces that * could be locked independently. Realtime block group numbers are 32-bit * quantities. Block numbers within a group are also 32-bit quantities, but * the upper bit must never be set. rtgroup 0 might have a superblock in it, * so the minimum size of an rtgroup is 2 rtx. 
*/ #define XFS_MAX_RGBLOCKS ((xfs_rgblock_t)(1U << 31) - 1) #define XFS_MIN_RGEXTENTS ((xfs_rtxlen_t)2) #define XFS_MAX_RGNUMBER ((xfs_rgnumber_t)(-1U)) #define XFS_RTSB_MAGIC 0x46726F67 /* 'Frog' */ /* * Realtime superblock - on disk version. Must be padded to 64 bit alignment. * The first block of the realtime volume contains this superblock. */ struct xfs_rtsb { __be32 rsb_magicnum; /* magic number == XFS_RTSB_MAGIC */ __le32 rsb_crc; /* superblock crc */ __be32 rsb_pad; /* zero */ unsigned char rsb_fname[XFSLABEL_MAX]; /* file system name */ uuid_t rsb_uuid; /* user-visible file system unique id */ uuid_t rsb_meta_uuid; /* metadata file system unique id */ /* must be padded to 64 bit alignment */ }; #define XFS_RTSB_CRC_OFF offsetof(struct xfs_rtsb, rsb_crc) #define XFS_RTSB_DADDR ((xfs_daddr_t)0) /* daddr in rt section */ /* * XFS Timestamps * ============== * * Traditional ondisk inode timestamps consist of signed 32-bit counters for * seconds and nanoseconds; time zero is the Unix epoch, Jan 1 00:00:00 UTC * 1970, which means that the timestamp epoch is the same as the Unix epoch. * Therefore, the ondisk min and max defined here can be used directly to * constrain the incore timestamps on a Unix system. Note that we actually * encode a __be64 value on disk. * * When the bigtime feature is enabled, ondisk inode timestamps become an * unsigned 64-bit nanoseconds counter. This means that the bigtime inode * timestamp epoch is the start of the classic timestamp range, which is * Dec 13 20:45:52 UTC 1901. Because the epochs are not the same, callers * /must/ use the bigtime conversion functions when encoding and decoding raw * timestamps. */ typedef __be64 xfs_timestamp_t; /* Legacy timestamp encoding format. */ struct xfs_legacy_timestamp { __be32 t_sec; /* timestamp seconds */ __be32 t_nsec; /* timestamp nanoseconds */ }; /* * Smallest possible ondisk seconds value with traditional timestamps. This * corresponds exactly with the incore timestamp Dec 13 20:45:52 UTC 1901. */ #define XFS_LEGACY_TIME_MIN ((int64_t)S32_MIN) /* * Largest possible ondisk seconds value with traditional timestamps. This * corresponds exactly with the incore timestamp Jan 19 03:14:07 UTC 2038. */ #define XFS_LEGACY_TIME_MAX ((int64_t)S32_MAX) /* * Smallest possible ondisk seconds value with bigtime timestamps. This * corresponds (after conversion to a Unix timestamp) with the traditional * minimum timestamp of Dec 13 20:45:52 UTC 1901. */ #define XFS_BIGTIME_TIME_MIN ((int64_t)0) /* * Largest supported ondisk seconds value with bigtime timestamps. This * corresponds (after conversion to a Unix timestamp) with an incore timestamp * of Jul 2 20:20:24 UTC 2486. * * We round down the ondisk limit so that the bigtime quota and inode max * timestamps will be the same. */ #define XFS_BIGTIME_TIME_MAX ((int64_t)((-1ULL / NSEC_PER_SEC) & ~0x3ULL)) /* * Bigtime epoch is set exactly to the minimum time value that a traditional * 32-bit timestamp can represent when using the Unix epoch as a reference. * Hence the Unix epoch is at a fixed offset into the supported bigtime * timestamp range. * * The bigtime epoch also matches the minimum value an on-disk 32-bit XFS * timestamp can represent so we will not lose any fidelity in converting * to/from unix and bigtime timestamps. * * The following conversion factor converts a seconds counter from the Unix * epoch to the bigtime epoch. */ #define XFS_BIGTIME_EPOCH_OFFSET (-(int64_t)S32_MIN) /* Convert a timestamp from the Unix epoch to the bigtime epoch. 
*/ static inline uint64_t xfs_unix_to_bigtime(time64_t unix_seconds) { return (uint64_t)unix_seconds + XFS_BIGTIME_EPOCH_OFFSET; } /* Convert a timestamp from the bigtime epoch to the Unix epoch. */ static inline time64_t xfs_bigtime_to_unix(uint64_t ondisk_seconds) { return (time64_t)ondisk_seconds - XFS_BIGTIME_EPOCH_OFFSET; } enum xfs_metafile_type { XFS_METAFILE_UNKNOWN, /* unknown */ XFS_METAFILE_DIR, /* metadir directory */ XFS_METAFILE_USRQUOTA, /* user quota */ XFS_METAFILE_GRPQUOTA, /* group quota */ XFS_METAFILE_PRJQUOTA, /* project quota */ XFS_METAFILE_RTBITMAP, /* rt bitmap */ XFS_METAFILE_RTSUMMARY, /* rt summary */ XFS_METAFILE_RTRMAP, /* rt rmap */ XFS_METAFILE_RTREFCOUNT, /* rt refcount */ XFS_METAFILE_MAX } __packed; #define XFS_METAFILE_TYPE_STR \ { XFS_METAFILE_UNKNOWN, "unknown" }, \ { XFS_METAFILE_DIR, "dir" }, \ { XFS_METAFILE_USRQUOTA, "usrquota" }, \ { XFS_METAFILE_GRPQUOTA, "grpquota" }, \ { XFS_METAFILE_PRJQUOTA, "prjquota" }, \ { XFS_METAFILE_RTBITMAP, "rtbitmap" }, \ { XFS_METAFILE_RTSUMMARY, "rtsummary" }, \ { XFS_METAFILE_RTRMAP, "rtrmap" }, \ { XFS_METAFILE_RTREFCOUNT, "rtrefcount" } /* * On-disk inode structure. * * This is just the header or "dinode core", the inode is expanded to fill a * variable size the leftover area split into a data and an attribute fork. * The format of the data and attribute fork depends on the format of the * inode as indicated by di_format and di_aformat. To access the data and * attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros * below. * * There is a very similar struct xfs_log_dinode which matches the layout of * this structure, but is kept in native format instead of big endian. * * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed * padding field for v3 inodes. */ #define XFS_DINODE_MAGIC 0x494e /* 'IN' */ struct xfs_dinode { __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ __be16 di_mode; /* mode and type of file */ __u8 di_version; /* inode version */ __u8 di_format; /* format of di_c data */ __be16 di_metatype; /* XFS_METAFILE_*; was di_onlink */ __be32 di_uid; /* owner's user id */ __be32 di_gid; /* owner's group id */ __be32 di_nlink; /* number of links to file */ __be16 di_projid_lo; /* lower part of owner's project id */ __be16 di_projid_hi; /* higher part owner's project id */ union { /* Number of data fork extents if NREXT64 is set */ __be64 di_big_nextents; /* Padding for V3 inodes without NREXT64 set. */ __be64 di_v3_pad; /* Padding and inode flush counter for V2 inodes. */ struct { __u8 di_v2_pad[6]; __be16 di_flushiter; }; }; xfs_timestamp_t di_atime; /* time last accessed */ xfs_timestamp_t di_mtime; /* time last modified */ xfs_timestamp_t di_ctime; /* time created/inode modified */ __be64 di_size; /* number of bytes in file */ __be64 di_nblocks; /* # of direct & btree blocks used */ __be32 di_extsize; /* basic/minimum extent size for file */ union { /* * For V2 inodes and V3 inodes without NREXT64 set, this * is the number of data and attr fork extents. */ struct { __be32 di_nextents; __be16 di_anextents; } __packed; /* Number of attr fork extents if NREXT64 is set. */ struct { __be32 di_big_anextents; __be16 di_nrext64_pad; } __packed; } __packed; __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */ __s8 di_aformat; /* format of attr fork's data */ __be32 di_dmevmask; /* DMIG event mask */ __be16 di_dmstate; /* DMIG state info */ __be16 di_flags; /* random flags, XFS_DIFLAG_... 
*/ __be32 di_gen; /* generation number */ /* di_next_unlinked is the only non-core field in the old dinode */ __be32 di_next_unlinked;/* agi unlinked list ptr */ /* start of the extended dinode, writable fields */ __le32 di_crc; /* CRC of the inode */ __be64 di_changecount; /* number of attribute changes */ __be64 di_lsn; /* flush sequence */ __be64 di_flags2; /* more random flags */ union { /* basic cow extent size for (regular) file */ __be32 di_cowextsize; /* used blocks in RTG for (zoned) rtrmap inode */ __be32 di_used_blocks; }; __u8 di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ xfs_timestamp_t di_crtime; /* time created */ __be64 di_ino; /* inode number */ uuid_t di_uuid; /* UUID of the filesystem */ /* structure must be padded to 64 bit alignment */ }; #define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc) #define DI_MAX_FLUSH 0xffff /* * Size of the core inode on disk. Version 1 and 2 inodes have * the same size, but version 3 has grown a few additional fields. */ static inline uint xfs_dinode_size(int version) { if (version == 3) return sizeof(struct xfs_dinode); return offsetof(struct xfs_dinode, di_crc); } /* * The 32 bit link count in the inode theoretically maxes out at UINT_MAX. * Since the pathconf interface is signed, we use 2^31 - 1 instead. */ #define XFS_MAXLINK ((1U << 31) - 1U) /* * Any file that hits the maximum ondisk link count should be pinned to avoid * a use-after-free situation. */ #define XFS_NLINK_PINNED (~0U) /* * Values for di_format * * This enum is used in string mapping in xfs_trace.h; please keep the * TRACE_DEFINE_ENUMs for it up to date. */ enum xfs_dinode_fmt { XFS_DINODE_FMT_DEV, /* xfs_dev_t */ XFS_DINODE_FMT_LOCAL, /* bulk data */ XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */ XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */ XFS_DINODE_FMT_UUID, /* added long ago, but never used */ XFS_DINODE_FMT_META_BTREE, /* metadata btree */ }; #define XFS_INODE_FORMAT_STR \ { XFS_DINODE_FMT_DEV, "dev" }, \ { XFS_DINODE_FMT_LOCAL, "local" }, \ { XFS_DINODE_FMT_EXTENTS, "extent" }, \ { XFS_DINODE_FMT_BTREE, "btree" }, \ { XFS_DINODE_FMT_UUID, "uuid" }, \ { XFS_DINODE_FMT_META_BTREE, "meta_btree" } /* * Max values for extnum and aextnum. * * The original on-disk extent counts were held in signed fields, resulting in * maximum extent counts of 2^31 and 2^15 for the data and attr forks * respectively. Similarly the maximum extent length is limited to 2^21 blocks * by the 21-bit wide blockcount field of a BMBT extent record. * * The newly introduced data fork extent counter can hold a 64-bit value, * however the maximum number of extents in a file is also limited to 2^54 * extents by the 54-bit wide startoff field of a BMBT extent record. * * It is further limited by the maximum supported file size of 2^63 * *bytes*. This leads to a maximum extent count for maximally sized filesystem * blocks (64kB) of: * * 2^63 bytes / 2^16 bytes per block = 2^47 blocks * * Rounding up 47 to the nearest multiple of bits-per-byte results in 48. Hence * 2^48 was chosen as the maximum data fork extent count. * * The maximum file size that can be represented by the data fork extent counter * in the worst case occurs when all extents are 1 block in length and each * block is 1KB in size. * * With XFS_MAX_EXTCNT_DATA_FORK_SMALL representing maximum extent count and * with 1KB sized blocks, a file can reach upto, * 1KB * (2^31) = 2TB * * This is much larger than the theoretical maximum size of a directory * i.e. 
XFS_DIR2_SPACE_SIZE * XFS_DIR2_MAX_SPACES = ~96GB. * * Hence, a directory inode can never overflow its data fork extent counter. */ #define XFS_MAX_EXTCNT_DATA_FORK_LARGE ((xfs_extnum_t)((1ULL << 48) - 1)) #define XFS_MAX_EXTCNT_ATTR_FORK_LARGE ((xfs_extnum_t)((1ULL << 32) - 1)) #define XFS_MAX_EXTCNT_DATA_FORK_SMALL ((xfs_extnum_t)((1ULL << 31) - 1)) #define XFS_MAX_EXTCNT_ATTR_FORK_SMALL ((xfs_extnum_t)((1ULL << 15) - 1)) /* * When we upgrade an inode to the large extent counts, the maximum value by * which the extent count can increase is bound by the change in size of the * on-disk field. No upgrade operation should ever be adding more than a few * tens of extents, so if we get a really large value it is a sign of a code bug * or corruption. */ #define XFS_MAX_EXTCNT_UPGRADE_NR \ min(XFS_MAX_EXTCNT_ATTR_FORK_LARGE - XFS_MAX_EXTCNT_ATTR_FORK_SMALL, \ XFS_MAX_EXTCNT_DATA_FORK_LARGE - XFS_MAX_EXTCNT_DATA_FORK_SMALL) /* * Inode minimum and maximum sizes. */ #define XFS_DINODE_MIN_LOG 8 #define XFS_DINODE_MAX_LOG 11 #define XFS_DINODE_MIN_SIZE (1 << XFS_DINODE_MIN_LOG) #define XFS_DINODE_MAX_SIZE (1 << XFS_DINODE_MAX_LOG) /* * Inode size for given fs. */ #define XFS_DINODE_SIZE(mp) \ (xfs_has_v3inodes(mp) ? \ sizeof(struct xfs_dinode) : \ offsetof(struct xfs_dinode, di_crc)) #define XFS_LITINO(mp) \ ((mp)->m_sb.sb_inodesize - XFS_DINODE_SIZE(mp)) /* * Inode data & attribute fork sizes, per inode. */ #define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3)) #define XFS_DFORK_DSIZE(dip,mp) \ ((dip)->di_forkoff ? XFS_DFORK_BOFF(dip) : XFS_LITINO(mp)) #define XFS_DFORK_ASIZE(dip,mp) \ ((dip)->di_forkoff ? XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : 0) #define XFS_DFORK_SIZE(dip,mp,w) \ ((w) == XFS_DATA_FORK ? \ XFS_DFORK_DSIZE(dip, mp) : \ XFS_DFORK_ASIZE(dip, mp)) #define XFS_DFORK_MAXEXT(dip, mp, w) \ (XFS_DFORK_SIZE(dip, mp, w) / sizeof(struct xfs_bmbt_rec)) /* * Return pointers to the data or attribute forks. */ #define XFS_DFORK_DPTR(dip) \ ((void *)dip + xfs_dinode_size(dip->di_version)) #define XFS_DFORK_APTR(dip) \ (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip)) #define XFS_DFORK_PTR(dip,w) \ ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip)) #define XFS_DFORK_FORMAT(dip,w) \ ((w) == XFS_DATA_FORK ? \ (dip)->di_format : \ (dip)->di_aformat) /* * For block and character special files the 32bit dev_t is stored at the * beginning of the data fork. 
*/ static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip) { return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip)); } static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) { *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev); } /* * Values for di_flags */ #define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */ #define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */ #define XFS_DIFLAG_NEWRTBM_BIT 2 /* for rtbitmap inode, new format */ #define XFS_DIFLAG_IMMUTABLE_BIT 3 /* inode is immutable */ #define XFS_DIFLAG_APPEND_BIT 4 /* inode is append-only */ #define XFS_DIFLAG_SYNC_BIT 5 /* inode is written synchronously */ #define XFS_DIFLAG_NOATIME_BIT 6 /* do not update atime */ #define XFS_DIFLAG_NODUMP_BIT 7 /* do not dump */ #define XFS_DIFLAG_RTINHERIT_BIT 8 /* create with realtime bit set */ #define XFS_DIFLAG_PROJINHERIT_BIT 9 /* create with parents projid */ #define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */ #define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */ #define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */ #define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */ #define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */ /* Do not use bit 15, di_flags is legacy and unchanging now */ #define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) #define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) #define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) #define XFS_DIFLAG_IMMUTABLE (1 << XFS_DIFLAG_IMMUTABLE_BIT) #define XFS_DIFLAG_APPEND (1 << XFS_DIFLAG_APPEND_BIT) #define XFS_DIFLAG_SYNC (1 << XFS_DIFLAG_SYNC_BIT) #define XFS_DIFLAG_NOATIME (1 << XFS_DIFLAG_NOATIME_BIT) #define XFS_DIFLAG_NODUMP (1 << XFS_DIFLAG_NODUMP_BIT) #define XFS_DIFLAG_RTINHERIT (1 << XFS_DIFLAG_RTINHERIT_BIT) #define XFS_DIFLAG_PROJINHERIT (1 << XFS_DIFLAG_PROJINHERIT_BIT) #define XFS_DIFLAG_NOSYMLINKS (1 << XFS_DIFLAG_NOSYMLINKS_BIT) #define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT) #define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT) #define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT) #define XFS_DIFLAG_FILESTREAM (1 << XFS_DIFLAG_FILESTREAM_BIT) #define XFS_DIFLAG_ANY \ (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \ XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \ XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \ XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \ XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM) /* * Values for di_flags2 These start by being exposed to userspace in the upper * 16 bits of the XFS_XFLAG_s range. */ /* use DAX for this inode */ #define XFS_DIFLAG2_DAX_BIT 0 /* file's blocks may be shared */ #define XFS_DIFLAG2_REFLINK_BIT 1 /* copy on write extent size hint */ #define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* big timestamps */ #define XFS_DIFLAG2_BIGTIME_BIT 3 /* large extent counters */ #define XFS_DIFLAG2_NREXT64_BIT 4 /* * The inode contains filesystem metadata and can be found through the metadata * directory tree. Metadata inodes must satisfy the following constraints: * * - V5 filesystem (and ftype) are enabled; * - The only valid modes are regular files and directories; * - The access bits must be zero; * - DMAPI event and state masks are zero; * - The user and group IDs must be zero; * - The project ID can be used as a u32 annotation; * - The immutable, sync, noatime, nodump, nodefrag flags must be set. 
* - The dax flag must not be set. * - Directories must have nosymlinks set. * * These requirements are chosen defensively to minimize the ability of * userspace to read or modify the contents, should a metadata file ever * escape to userspace. * * There are further constraints on the directory tree itself: * * - Metadata inodes must never be resolvable through the root directory; * - They must never be accessed by userspace; * - Metadata directory entries must have correct ftype. * * Superblock-rooted metadata files must have the METADATA iflag set even * though they do not have a parent directory. */ #define XFS_DIFLAG2_METADATA_BIT 5 #define XFS_DIFLAG2_DAX (1ULL << XFS_DIFLAG2_DAX_BIT) #define XFS_DIFLAG2_REFLINK (1ULL << XFS_DIFLAG2_REFLINK_BIT) #define XFS_DIFLAG2_COWEXTSIZE (1ULL << XFS_DIFLAG2_COWEXTSIZE_BIT) #define XFS_DIFLAG2_BIGTIME (1ULL << XFS_DIFLAG2_BIGTIME_BIT) #define XFS_DIFLAG2_NREXT64 (1ULL << XFS_DIFLAG2_NREXT64_BIT) #define XFS_DIFLAG2_METADATA (1ULL << XFS_DIFLAG2_METADATA_BIT) #define XFS_DIFLAG2_ANY \ (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \ XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_METADATA) static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) { return dip->di_version >= 3 && (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_BIGTIME)); } static inline bool xfs_dinode_has_large_extent_counts( const struct xfs_dinode *dip) { return dip->di_version >= 3 && (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_NREXT64)); } static inline bool xfs_dinode_is_metadir(const struct xfs_dinode *dip) { return dip->di_version >= 3 && (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADATA)); } /* * Inode number format: * low inopblog bits - offset in block * next agblklog bits - block number in ag * next agno_log bits - ag number * high agno_log-agblklog-inopblog bits - 0 */ #define XFS_INO_MASK(k) (uint32_t)((1ULL << (k)) - 1) #define XFS_INO_OFFSET_BITS(mp) (mp)->m_sb.sb_inopblog #define XFS_INO_AGBNO_BITS(mp) (mp)->m_sb.sb_agblklog #define XFS_INO_AGINO_BITS(mp) ((mp)->m_ino_geo.agino_log) #define XFS_INO_AGNO_BITS(mp) (mp)->m_agno_log #define XFS_INO_BITS(mp) \ XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp) #define XFS_INO_TO_AGNO(mp,i) \ ((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp))) #define XFS_INO_TO_AGINO(mp,i) \ ((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp))) #define XFS_INO_TO_AGBNO(mp,i) \ (((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \ XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp))) #define XFS_INO_TO_OFFSET(mp,i) \ ((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp))) #define XFS_INO_TO_FSB(mp,i) \ XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i)) #define XFS_AGINO_TO_INO(mp,a,i) \ (((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i)) #define XFS_AGINO_TO_AGBNO(mp,i) ((i) >> XFS_INO_OFFSET_BITS(mp)) #define XFS_AGINO_TO_OFFSET(mp,i) \ ((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp))) #define XFS_OFFBNO_TO_AGINO(mp,b,o) \ ((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o))) #define XFS_FSB_TO_INO(mp, b) ((xfs_ino_t)((b) << XFS_INO_OFFSET_BITS(mp))) #define XFS_AGB_TO_AGINO(mp, b) ((xfs_agino_t)((b) << XFS_INO_OFFSET_BITS(mp))) #define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL)) #define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL)) /* * RealTime Device format definitions */ /* Min and max rt extent sizes, specified in bytes */ #define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */ #define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */ #define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */ /* * RT bit manipulation 
macros. */ #define XFS_RTBITMAP_MAGIC 0x424D505A /* BMPZ */ #define XFS_RTSUMMARY_MAGIC 0x53554D59 /* SUMY */ struct xfs_rtbuf_blkinfo { __be32 rt_magic; /* validity check on block */ __be32 rt_crc; /* CRC of block */ __be64 rt_owner; /* inode that owns the block */ __be64 rt_blkno; /* first block of the buffer */ __be64 rt_lsn; /* sequence number of last write */ uuid_t rt_uuid; /* filesystem we belong to */ }; #define XFS_RTBUF_CRC_OFF \ offsetof(struct xfs_rtbuf_blkinfo, rt_crc) /* * Dquot and dquot block format definitions */ #define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ #define XFS_DQUOT_VERSION (uint8_t)0x01 /* latest version number */ #define XFS_DQTYPE_USER (1u << 0) /* user dquot record */ #define XFS_DQTYPE_PROJ (1u << 1) /* project dquot record */ #define XFS_DQTYPE_GROUP (1u << 2) /* group dquot record */ #define XFS_DQTYPE_BIGTIME (1u << 7) /* large expiry timestamps */ /* bitmask to determine if this is a user/group/project dquot */ #define XFS_DQTYPE_REC_MASK (XFS_DQTYPE_USER | \ XFS_DQTYPE_PROJ | \ XFS_DQTYPE_GROUP) #define XFS_DQTYPE_ANY (XFS_DQTYPE_REC_MASK | \ XFS_DQTYPE_BIGTIME) /* * XFS Quota Timers * ================ * * Traditional quota grace period expiration timers are an unsigned 32-bit * seconds counter; time zero is the Unix epoch, Jan 1 00:00:01 UTC 1970. * Note that an expiration value of zero means that the quota limit has not * been reached, and therefore no expiration has been set. Therefore, the * ondisk min and max defined here can be used directly to constrain the incore * quota expiration timestamps on a Unix system. * * When bigtime is enabled, we trade two bits of precision to expand the * expiration timeout range to match that of big inode timestamps. The min and * max recorded here are the on-disk limits, not a Unix timestamp. * * The grace period for each quota type is stored in the root dquot (id = 0) * and is applied to a non-root dquot when it exceeds the soft or hard limits. * The length of quota grace periods are unsigned 32-bit quantities measured in * units of seconds. A value of zero means to use the default period. */ /* * Smallest possible ondisk quota expiration value with traditional timestamps. * This corresponds exactly with the incore expiration Jan 1 00:00:01 UTC 1970. */ #define XFS_DQ_LEGACY_EXPIRY_MIN ((int64_t)1) /* * Largest possible ondisk quota expiration value with traditional timestamps. * This corresponds exactly with the incore expiration Feb 7 06:28:15 UTC 2106. */ #define XFS_DQ_LEGACY_EXPIRY_MAX ((int64_t)U32_MAX) /* * Smallest possible ondisk quota expiration value with bigtime timestamps. * This corresponds (after conversion to a Unix timestamp) with the incore * expiration of Jan 1 00:00:04 UTC 1970. */ #define XFS_DQ_BIGTIME_EXPIRY_MIN (XFS_DQ_LEGACY_EXPIRY_MIN) /* * Largest supported ondisk quota expiration value with bigtime timestamps. * This corresponds (after conversion to a Unix timestamp) with an incore * expiration of Jul 2 20:20:24 UTC 2486. * * The ondisk field supports values up to -1U, which corresponds to an incore * expiration in 2514. This is beyond the maximum the bigtime inode timestamp, * so we cap the maximum bigtime quota expiration to the max inode timestamp. */ #define XFS_DQ_BIGTIME_EXPIRY_MAX ((int64_t)4074815106U) /* * The following conversion factors assist in converting a quota expiration * timestamp between the incore and ondisk formats. 
*/ #define XFS_DQ_BIGTIME_SHIFT (2) #define XFS_DQ_BIGTIME_SLACK ((int64_t)(1ULL << XFS_DQ_BIGTIME_SHIFT) - 1) /* Convert an incore quota expiration timestamp to an ondisk bigtime value. */ static inline uint32_t xfs_dq_unix_to_bigtime(time64_t unix_seconds) { /* * Round the expiration timestamp up to the nearest bigtime timestamp * that we can store, to give users the most time to fix problems. */ return ((uint64_t)unix_seconds + XFS_DQ_BIGTIME_SLACK) >> XFS_DQ_BIGTIME_SHIFT; } /* Convert an ondisk bigtime quota expiration value to an incore timestamp. */ static inline time64_t xfs_dq_bigtime_to_unix(uint32_t ondisk_seconds) { return (time64_t)ondisk_seconds << XFS_DQ_BIGTIME_SHIFT; } /* * Default quota grace periods, ranging from zero (use the compiled defaults) * to ~136 years. These are applied to a non-root dquot that has exceeded * either limit. */ #define XFS_DQ_GRACE_MIN ((int64_t)0) #define XFS_DQ_GRACE_MAX ((int64_t)U32_MAX) /* Maximum id value for a quota record */ #define XFS_DQ_ID_MAX (U32_MAX) /* * This is the main portion of the on-disk representation of quota information * for a user. We pad this with some more expansion room to construct the on * disk structure. */ struct xfs_disk_dquot { __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */ __u8 d_version; /* dquot version */ __u8 d_type; /* XFS_DQTYPE_USER/PROJ/GROUP */ __be32 d_id; /* user,project,group id */ __be64 d_blk_hardlimit;/* absolute limit on disk blks */ __be64 d_blk_softlimit;/* preferred limit on disk blks */ __be64 d_ino_hardlimit;/* maximum # allocated inodes */ __be64 d_ino_softlimit;/* preferred inode limit */ __be64 d_bcount; /* disk blocks owned by the user */ __be64 d_icount; /* inodes owned by the user */ __be32 d_itimer; /* zero if within inode limits if not, this is when we refuse service */ __be32 d_btimer; /* similar to above; for disk blocks */ __be16 d_iwarns; /* warnings issued wrt num inodes */ __be16 d_bwarns; /* warnings issued wrt disk blocks */ __be32 d_pad0; /* 64 bit align */ __be64 d_rtb_hardlimit;/* absolute limit on realtime blks */ __be64 d_rtb_softlimit;/* preferred limit on RT disk blks */ __be64 d_rtbcount; /* realtime blocks owned */ __be32 d_rtbtimer; /* similar to above; for RT disk blocks */ __be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */ __be16 d_pad; }; /* * This is what goes on disk. This is separated from the xfs_disk_dquot because * carrying the unnecessary padding would be a waste of memory. */ struct xfs_dqblk { struct xfs_disk_dquot dd_diskdq; /* portion living incore as well */ char dd_fill[4];/* filling for posterity */ /* * These two are only present on filesystems with the CRC bits set. */ __be32 dd_crc; /* checksum */ __be64 dd_lsn; /* last modification in log */ uuid_t dd_uuid; /* location information */ }; #define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc) /* * This defines the unit of allocation of dquots. * * Currently, it is just one file system block, and a 4K blk contains 30 * (136 * 30 = 4080) dquots. It's probably not worth trying to make * this more dynamic. * * However, if this number is changed, we have to make sure that we don't * implicitly assume that we do allocations in chunks of a single filesystem * block in the dquot/xqm code. * * This is part of the ondisk format because the structure size is not a power * of two, which leaves slack at the end of the disk block. */ #define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 /* * Remote symlink format and access functions. 
*/ #define XFS_SYMLINK_MAGIC 0x58534c4d /* XSLM */ struct xfs_dsymlink_hdr { __be32 sl_magic; __be32 sl_offset; __be32 sl_bytes; __be32 sl_crc; uuid_t sl_uuid; __be64 sl_owner; __be64 sl_blkno; __be64 sl_lsn; }; #define XFS_SYMLINK_CRC_OFF offsetof(struct xfs_dsymlink_hdr, sl_crc) #define XFS_SYMLINK_MAXLEN 1024 /* * The maximum pathlen is 1024 bytes. Since the minimum file system * blocksize is 512 bytes, we can get a max of 3 extents back from * bmapi when crc headers are taken into account. */ #define XFS_SYMLINK_MAPS 3 #define XFS_SYMLINK_BUF_SPACE(mp, bufsize) \ ((bufsize) - (xfs_has_crc((mp)) ? \ sizeof(struct xfs_dsymlink_hdr) : 0)) /* * Allocation Btree format definitions * * There are two on-disk btrees, one sorted by blockno and one sorted * by blockcount and blockno. All blocks look the same to make the code * simpler; if we have time later, we'll make the optimizations. */ #define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */ #define XFS_ABTB_CRC_MAGIC 0x41423342 /* 'AB3B' */ #define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */ #define XFS_ABTC_CRC_MAGIC 0x41423343 /* 'AB3C' */ /* * Data record/key structure */ typedef struct xfs_alloc_rec { __be32 ar_startblock; /* starting block number */ __be32 ar_blockcount; /* count of free blocks */ } xfs_alloc_rec_t, xfs_alloc_key_t; typedef struct xfs_alloc_rec_incore { xfs_agblock_t ar_startblock; /* starting block number */ xfs_extlen_t ar_blockcount; /* count of free blocks */ } xfs_alloc_rec_incore_t; /* btree pointer type */ typedef __be32 xfs_alloc_ptr_t; /* * Block numbers in the AG: * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3. */ #define XFS_BNO_BLOCK(mp) ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1)) #define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1)) /* * Inode Allocation Btree format definitions * * There is a btree for the inode map per allocation group. */ #define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ #define XFS_IBT_CRC_MAGIC 0x49414233 /* 'IAB3' */ #define XFS_FIBT_MAGIC 0x46494254 /* 'FIBT' */ #define XFS_FIBT_CRC_MAGIC 0x46494233 /* 'FIB3' */ typedef uint64_t xfs_inofree_t; #define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) #define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3) #define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) #define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i)) #define XFS_INOBT_HOLEMASK_FULL 0 /* holemask for full chunk */ #define XFS_INOBT_HOLEMASK_BITS (NBBY * sizeof(uint16_t)) #define XFS_INODES_PER_HOLEMASK_BIT \ (XFS_INODES_PER_CHUNK / (NBBY * sizeof(uint16_t))) static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) { return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i; } /* * The on-disk inode record structure has two formats. The original "full" * format uses a 4-byte freecount. The "sparse" format uses a 1-byte freecount * and replaces the 3 high-order freecount bytes wth the holemask and inode * count. * * The holemask of the sparse record format allows an inode chunk to have holes * that refer to blocks not owned by the inode record. This facilitates inode * allocation in the event of severe free space fragmentation. 
*/ typedef struct xfs_inobt_rec { __be32 ir_startino; /* starting inode number */ union { struct { __be32 ir_freecount; /* count of free inodes */ } f; struct { __be16 ir_holemask;/* hole mask for sparse chunks */ __u8 ir_count; /* total inode count */ __u8 ir_freecount; /* count of free inodes */ } sp; } ir_u; __be64 ir_free; /* free inode mask */ } xfs_inobt_rec_t; typedef struct xfs_inobt_rec_incore { xfs_agino_t ir_startino; /* starting inode number */ uint16_t ir_holemask; /* hole mask for sparse chunks */ uint8_t ir_count; /* total inode count */ uint8_t ir_freecount; /* count of free inodes (set bits) */ xfs_inofree_t ir_free; /* free inode mask */ } xfs_inobt_rec_incore_t; static inline bool xfs_inobt_issparse(uint16_t holemask) { /* non-zero holemask represents a sparse rec. */ return holemask; } /* * Key structure */ typedef struct xfs_inobt_key { __be32 ir_startino; /* starting inode number */ } xfs_inobt_key_t; /* btree pointer type */ typedef __be32 xfs_inobt_ptr_t; /* * block numbers in the AG. */ #define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1)) #define XFS_FIBT_BLOCK(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1)) /* * Reverse mapping btree format definitions * * There is a btree for the reverse map per allocation group */ #define XFS_RMAP_CRC_MAGIC 0x524d4233 /* 'RMB3' */ /* * Ownership info for an extent. This is used to create reverse-mapping * entries. */ #define XFS_OWNER_INFO_ATTR_FORK (1 << 0) #define XFS_OWNER_INFO_BMBT_BLOCK (1 << 1) struct xfs_owner_info { uint64_t oi_owner; xfs_fileoff_t oi_offset; unsigned int oi_flags; }; /* * Special owner types. * * Seeing as we only support up to 8EB, we have the upper bit of the owner field * to tell us we have a special owner value. We use these for static metadata * allocated at mkfs/growfs time, as well as for freespace management metadata. 
*/ #define XFS_RMAP_OWN_NULL (-1ULL) /* No owner, for growfs */ #define XFS_RMAP_OWN_UNKNOWN (-2ULL) /* Unknown owner, for EFI recovery */ #define XFS_RMAP_OWN_FS (-3ULL) /* static fs metadata */ #define XFS_RMAP_OWN_LOG (-4ULL) /* static fs metadata */ #define XFS_RMAP_OWN_AG (-5ULL) /* AG freespace btree blocks */ #define XFS_RMAP_OWN_INOBT (-6ULL) /* Inode btree blocks */ #define XFS_RMAP_OWN_INODES (-7ULL) /* Inode chunk */ #define XFS_RMAP_OWN_REFC (-8ULL) /* refcount tree */ #define XFS_RMAP_OWN_COW (-9ULL) /* cow allocations */ #define XFS_RMAP_OWN_MIN (-10ULL) /* guard */ #define XFS_RMAP_NON_INODE_OWNER(owner) (!!((owner) & (1ULL << 63))) /* * Data record structure */ struct xfs_rmap_rec { __be32 rm_startblock; /* extent start block */ __be32 rm_blockcount; /* extent length */ __be64 rm_owner; /* extent owner */ __be64 rm_offset; /* offset within the owner */ }; /* * rmap btree record * rm_offset:63 is the attribute fork flag * rm_offset:62 is the bmbt block flag * rm_offset:61 is the unwritten extent flag (same as l0:63 in bmbt) * rm_offset:54-60 aren't used and should be zero * rm_offset:0-53 is the block offset within the inode */ #define XFS_RMAP_OFF_ATTR_FORK ((uint64_t)1ULL << 63) #define XFS_RMAP_OFF_BMBT_BLOCK ((uint64_t)1ULL << 62) #define XFS_RMAP_OFF_UNWRITTEN ((uint64_t)1ULL << 61) #define XFS_RMAP_LEN_MAX ((uint32_t)~0U) #define XFS_RMAP_OFF_FLAGS (XFS_RMAP_OFF_ATTR_FORK | \ XFS_RMAP_OFF_BMBT_BLOCK | \ XFS_RMAP_OFF_UNWRITTEN) #define XFS_RMAP_OFF_MASK ((uint64_t)0x3FFFFFFFFFFFFFULL) #define XFS_RMAP_OFF(off) ((off) & XFS_RMAP_OFF_MASK) #define XFS_RMAP_IS_BMBT_BLOCK(off) (!!((off) & XFS_RMAP_OFF_BMBT_BLOCK)) #define XFS_RMAP_IS_ATTR_FORK(off) (!!((off) & XFS_RMAP_OFF_ATTR_FORK)) #define XFS_RMAP_IS_UNWRITTEN(len) (!!((len) & XFS_RMAP_OFF_UNWRITTEN)) #define RMAPBT_STARTBLOCK_BITLEN 32 #define RMAPBT_BLOCKCOUNT_BITLEN 32 #define RMAPBT_OWNER_BITLEN 64 #define RMAPBT_ATTRFLAG_BITLEN 1 #define RMAPBT_BMBTFLAG_BITLEN 1 #define RMAPBT_EXNTFLAG_BITLEN 1 #define RMAPBT_UNUSED_OFFSET_BITLEN 7 #define RMAPBT_OFFSET_BITLEN 54 /* * Key structure * * We don't use the length for lookups */ struct xfs_rmap_key { __be32 rm_startblock; /* extent start block */ __be64 rm_owner; /* extent owner */ __be64 rm_offset; /* offset within the owner */ } __attribute__((packed)); /* btree pointer type */ typedef __be32 xfs_rmap_ptr_t; #define XFS_RMAP_BLOCK(mp) \ (xfs_has_finobt(((mp))) ? \ XFS_FIBT_BLOCK(mp) + 1 : \ XFS_IBT_BLOCK(mp) + 1) /* * Realtime Reverse mapping btree format definitions * * This is a btree for reverse mapping records for realtime volumes */ #define XFS_RTRMAP_CRC_MAGIC 0x4d415052 /* 'MAPR' */ /* * rtrmap root header, on-disk form only. */ struct xfs_rtrmap_root { __be16 bb_level; /* 0 is a leaf */ __be16 bb_numrecs; /* current # of data records */ }; /* inode-based btree pointer type */ typedef __be64 xfs_rtrmap_ptr_t; /* * Reference Count Btree format definitions * */ #define XFS_REFC_CRC_MAGIC 0x52334643 /* 'R3FC' */ unsigned int xfs_refc_block(struct xfs_mount *mp); /* * Data record/key structure * * Each record associates a range of physical blocks (starting at * rc_startblock and ending rc_blockcount blocks later) with a reference * count (rc_refcount). Extents that are being used to stage a copy on * write (CoW) operation are recorded in the refcount btree with a * refcount of 1. All other records must have a refcount > 1 and must * track an extent mapped only by file data forks.
* * Extents with a single owner (attributes, metadata, non-shared file * data) are not tracked here. Free space is also not tracked here. * This is consistent with pre-reflink XFS. */ /* * Extents that are being used to stage a copy on write are stored * in the refcount btree with a refcount of 1 and the upper bit set * on the startblock. This speeds up mount time deletion of stale * staging extents because they're all at the right side of the tree. */ #define XFS_REFC_COWFLAG (1U << 31) #define REFCNTBT_COWFLAG_BITLEN 1 #define REFCNTBT_AGBLOCK_BITLEN 31 struct xfs_refcount_rec { __be32 rc_startblock; /* starting block number */ __be32 rc_blockcount; /* count of blocks */ __be32 rc_refcount; /* number of inodes linked here */ }; struct xfs_refcount_key { __be32 rc_startblock; /* starting block number */ }; #define XFS_REFC_REFCOUNT_MAX ((xfs_nlink_t)~0U) #define XFS_REFC_LEN_MAX ((xfs_extlen_t)~0U) /* btree pointer type */ typedef __be32 xfs_refcount_ptr_t; /* * Realtime Reference Count btree format definitions * * This is a btree for reference count records for realtime volumes */ #define XFS_RTREFC_CRC_MAGIC 0x52434e54 /* 'RCNT' */ /* * rt refcount root header, on-disk form only. */ struct xfs_rtrefcount_root { __be16 bb_level; /* 0 is a leaf */ __be16 bb_numrecs; /* current # of data records */ }; /* inode-rooted btree pointer type */ typedef __be64 xfs_rtrefcount_ptr_t; /* * BMAP Btree format definitions * * This includes both the root block definition that sits inside an inode fork * and the record/pointer formats for the leaf/node in the blocks. */ #define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */ #define XFS_BMAP_CRC_MAGIC 0x424d4133 /* 'BMA3' */ /* * Bmap root header, on-disk form only. */ typedef struct xfs_bmdr_block { __be16 bb_level; /* 0 is a leaf */ __be16 bb_numrecs; /* current # of data records */ } xfs_bmdr_block_t; /* * Bmap btree record and extent descriptor. * l0:63 is an extent flag (value 1 indicates non-normal). * l0:9-62 are startoff. * l0:0-8 and l1:21-63 are startblock. * l1:0-20 are blockcount. */ #define BMBT_EXNTFLAG_BITLEN 1 #define BMBT_STARTOFF_BITLEN 54 #define BMBT_STARTBLOCK_BITLEN 52 #define BMBT_BLOCKCOUNT_BITLEN 21 #define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1) #define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1) #define XFS_MAX_BMBT_EXTLEN ((xfs_extlen_t)(BMBT_BLOCKCOUNT_MASK)) /* * bmbt records have a file offset (block) field that is 54 bits wide, so this * is the largest xfs_fileoff_t that we ever expect to see. */ #define XFS_MAX_FILEOFF (BMBT_STARTOFF_MASK + BMBT_BLOCKCOUNT_MASK) typedef struct xfs_bmbt_rec { __be64 l0, l1; } xfs_bmbt_rec_t; typedef uint64_t xfs_bmbt_rec_base_t; /* use this for casts */ typedef xfs_bmbt_rec_t xfs_bmdr_rec_t; /* * Values and macros for delayed-allocation startblock fields. */ #define STARTBLOCKVALBITS 17 #define STARTBLOCKMASKBITS (15 + 20) #define STARTBLOCKMASK \ (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) static inline int isnullstartblock(xfs_fsblock_t x) { return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK; } static inline xfs_fsblock_t nullstartblock(int k) { ASSERT(k < (1 << STARTBLOCKVALBITS)); return STARTBLOCKMASK | (k); } static inline xfs_filblks_t startblockval(xfs_fsblock_t x) { return (xfs_filblks_t)((x) & ~STARTBLOCKMASK); } /* * Key structure for non-leaf levels of the tree. 
*/ typedef struct xfs_bmbt_key { __be64 br_startoff; /* starting file offset */ } xfs_bmbt_key_t, xfs_bmdr_key_t; /* btree pointer type */ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; /* * Generic Btree block format definitions * * This is a combination of the actual format used on disk for short and long * format btrees. The first three fields are shared by both format, but the * pointers are different and should be used with care. * * To get the size of the actual short or long form headers please use the size * macros below. Never use sizeof(xfs_btree_block). * * The blkno, crc, lsn, owner and uuid fields are only available in filesystems * with the crc feature bit, and all accesses to them must be conditional on * that flag. */ /* short form block header */ struct xfs_btree_block_shdr { __be32 bb_leftsib; __be32 bb_rightsib; __be64 bb_blkno; __be64 bb_lsn; uuid_t bb_uuid; __be32 bb_owner; __le32 bb_crc; }; /* long form block header */ struct xfs_btree_block_lhdr { __be64 bb_leftsib; __be64 bb_rightsib; __be64 bb_blkno; __be64 bb_lsn; uuid_t bb_uuid; __be64 bb_owner; __le32 bb_crc; __be32 bb_pad; /* padding for alignment */ }; struct xfs_btree_block { __be32 bb_magic; /* magic number for block type */ __be16 bb_level; /* 0 is a leaf */ __be16 bb_numrecs; /* current # of data records */ union { struct xfs_btree_block_shdr s; struct xfs_btree_block_lhdr l; } bb_u; /* rest */ }; /* size of a short form block */ #define XFS_BTREE_SBLOCK_LEN \ (offsetof(struct xfs_btree_block, bb_u) + \ offsetof(struct xfs_btree_block_shdr, bb_blkno)) /* size of a long form block */ #define XFS_BTREE_LBLOCK_LEN \ (offsetof(struct xfs_btree_block, bb_u) + \ offsetof(struct xfs_btree_block_lhdr, bb_blkno)) /* sizes of CRC enabled btree blocks */ #define XFS_BTREE_SBLOCK_CRC_LEN \ (offsetof(struct xfs_btree_block, bb_u) + \ sizeof(struct xfs_btree_block_shdr)) #define XFS_BTREE_LBLOCK_CRC_LEN \ (offsetof(struct xfs_btree_block, bb_u) + \ sizeof(struct xfs_btree_block_lhdr)) #define XFS_BTREE_SBLOCK_CRC_OFF \ offsetof(struct xfs_btree_block, bb_u.s.bb_crc) #define XFS_BTREE_LBLOCK_CRC_OFF \ offsetof(struct xfs_btree_block, bb_u.l.bb_crc) /* * On-disk XFS access control list structure. */ struct xfs_acl_entry { __be32 ae_tag; __be32 ae_id; __be16 ae_perm; __be16 ae_pad; /* fill the implicit hole in the structure */ }; struct xfs_acl { __be32 acl_cnt; struct xfs_acl_entry acl_entry[]; }; /* * The number of ACL entries allowed is defined by the on-disk format. * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is * limited only by the maximum size of the xattr that stores the information. */ #define XFS_ACL_MAX_ENTRIES(mp) \ (xfs_has_crc(mp) \ ? (XFS_XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \ sizeof(struct xfs_acl_entry) \ : 25) #define XFS_ACL_SIZE(cnt) \ (sizeof(struct xfs_acl) + \ sizeof(struct xfs_acl_entry) * cnt) #define XFS_ACL_MAX_SIZE(mp) \ XFS_ACL_SIZE(XFS_ACL_MAX_ENTRIES((mp))) /* On-disk XFS extended attribute names */ #define SGI_ACL_FILE "SGI_ACL_FILE" #define SGI_ACL_DEFAULT "SGI_ACL_DEFAULT" #define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1) #define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) #endif /* __XFS_FORMAT_H__ */ |
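The bigtime comments above only spell out the seconds conversion; the full ondisk value is a single unsigned 64-bit nanosecond counter. The following standalone sketch is not part of xfs_format.h: all SKETCH_* names are invented here, and the seconds/nanoseconds split is an assumption based on the description of the bigtime counter, not the kernel's own encode/decode helpers.

#include <stdint.h>
#include <stdio.h>

#define SKETCH_NSEC_PER_SEC         1000000000ULL
#define SKETCH_BIGTIME_EPOCH_OFFSET 2147483648LL   /* -(int64_t)S32_MIN */

/* Unix seconds/nanoseconds -> ondisk bigtime nanosecond counter. */
static uint64_t sketch_time_to_bigtime(int64_t sec, uint32_t nsec)
{
	uint64_t bigtime_sec = (uint64_t)(sec + SKETCH_BIGTIME_EPOCH_OFFSET);

	return bigtime_sec * SKETCH_NSEC_PER_SEC + nsec;
}

/* Ondisk bigtime nanosecond counter -> Unix seconds/nanoseconds. */
static void sketch_bigtime_to_time(uint64_t ondisk, int64_t *sec, uint32_t *nsec)
{
	*nsec = (uint32_t)(ondisk % SKETCH_NSEC_PER_SEC);
	*sec = (int64_t)(ondisk / SKETCH_NSEC_PER_SEC) - SKETCH_BIGTIME_EPOCH_OFFSET;
}

int main(void)
{
	/* The Unix epoch sits SKETCH_BIGTIME_EPOCH_OFFSET seconds into the bigtime range. */
	uint64_t ondisk = sketch_time_to_bigtime(0, 0);
	int64_t sec;
	uint32_t nsec;

	sketch_bigtime_to_time(ondisk, &sec, &nsec);
	printf("ondisk=%llu sec=%lld nsec=%u\n",
	       (unsigned long long)ondisk, (long long)sec, nsec);
	return 0;
}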
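XFS_DFORK_DSIZE/XFS_DFORK_ASIZE split the inode's literal area at di_forkoff, which is counted in 8-byte units. A toy calculation, using example numbers only (the 176-byte core size below is a hypothetical stand-in for sizeof(struct xfs_dinode) on a v3 filesystem), illustrates the arithmetic:

#include <stdint.h>
#include <stdio.h>

#define SKETCH_INODE_SIZE 512u	/* sb_inodesize, example value */
#define SKETCH_CORE_SIZE  176u	/* stand-in for sizeof(struct xfs_dinode) */
#define SKETCH_LITINO     (SKETCH_INODE_SIZE - SKETCH_CORE_SIZE)

int main(void)
{
	uint8_t di_forkoff = 15;		/* attr fork offset, 8-byte units */
	unsigned int boff  = di_forkoff << 3;	/* byte offset of attr fork */
	unsigned int dsize = di_forkoff ? boff : SKETCH_LITINO;
	unsigned int asize = di_forkoff ? SKETCH_LITINO - boff : 0;

	/* Data fork gets [0, boff), attr fork gets [boff, LITINO). */
	printf("literal area = %u bytes\n", (unsigned int)SKETCH_LITINO);
	printf("data fork    = %u bytes\n", dsize);
	printf("attr fork    = %u bytes\n", asize);
	return 0;
}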
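The inode number layout described above XFS_INO_MASK can be exercised with made-up geometry. The SKETCH_* values below are illustrative stand-ins for sb_inopblog and sb_agblklog, not real superblock fields; the bit shifts mirror the XFS_INO_TO_* macros.

#include <stdint.h>
#include <stdio.h>

#define SKETCH_INOPBLOG   4u	/* log2(inodes per filesystem block), example */
#define SKETCH_AGBLKLOG   16u	/* log2(filesystem blocks per AG), example */
#define SKETCH_AGINO_BITS (SKETCH_INOPBLOG + SKETCH_AGBLKLOG)

#define SKETCH_MASK(k)    ((1ULL << (k)) - 1)

int main(void)
{
	/* AG 3, block 0x1234 within the AG, inode slot 5 within the block. */
	uint64_t ino = ((uint64_t)3 << SKETCH_AGINO_BITS) |
		       (0x1234ULL << SKETCH_INOPBLOG) | 5;

	printf("agno   = %llu\n", (unsigned long long)(ino >> SKETCH_AGINO_BITS));
	printf("agino  = 0x%llx\n", (unsigned long long)(ino & SKETCH_MASK(SKETCH_AGINO_BITS)));
	printf("agbno  = 0x%llx\n", (unsigned long long)((ino >> SKETCH_INOPBLOG) & SKETCH_MASK(SKETCH_AGBLKLOG)));
	printf("offset = %llu\n", (unsigned long long)(ino & SKETCH_MASK(SKETCH_INOPBLOG)));
	return 0;
}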
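The quota helpers xfs_dq_unix_to_bigtime()/xfs_dq_bigtime_to_unix() above trade two low bits of precision for range. A minimal userspace restatement, with locally defined SKETCH_* constants, makes the round-up-to-4-seconds behaviour easy to see:

#include <stdint.h>
#include <stdio.h>

#define SKETCH_DQ_BIGTIME_SHIFT 2
#define SKETCH_DQ_BIGTIME_SLACK ((int64_t)(1ULL << SKETCH_DQ_BIGTIME_SHIFT) - 1)

/* Incore expiry (Unix seconds) -> ondisk value, rounding up to 4s granularity. */
static uint32_t sketch_dq_unix_to_bigtime(int64_t unix_seconds)
{
	return (uint32_t)(((uint64_t)unix_seconds + SKETCH_DQ_BIGTIME_SLACK) >>
			  SKETCH_DQ_BIGTIME_SHIFT);
}

/* Ondisk value -> incore expiry. */
static int64_t sketch_dq_bigtime_to_unix(uint32_t ondisk_seconds)
{
	return (int64_t)ondisk_seconds << SKETCH_DQ_BIGTIME_SHIFT;
}

int main(void)
{
	int64_t expiry = 1000000001;	/* arbitrary grace expiry, in seconds */
	uint32_t ondisk = sketch_dq_unix_to_bigtime(expiry);

	/* Round-tripping lands on the next 4-second boundary, never earlier. */
	printf("in=%lld ondisk=%u out=%lld\n", (long long)expiry, ondisk,
	       (long long)sketch_dq_bigtime_to_unix(ondisk));
	return 0;
}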
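The rm_offset layout documented with the rmap btree record packs three flag bits above a 54-bit file offset. This sketch, using locally defined copies of the masks (not the kernel macros themselves), shows one plausible way to pack and test such a value:

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define SKETCH_RMAP_OFF_ATTR_FORK  (1ULL << 63)
#define SKETCH_RMAP_OFF_BMBT_BLOCK (1ULL << 62)
#define SKETCH_RMAP_OFF_UNWRITTEN  (1ULL << 61)
#define SKETCH_RMAP_OFF_MASK       0x3FFFFFFFFFFFFFULL	/* low 54 bits */

/* Fold the file offset and the three flag bits into one rm_offset value. */
static uint64_t sketch_rmap_pack_offset(uint64_t fileoff, bool attr_fork,
					bool bmbt_block, bool unwritten)
{
	uint64_t off = fileoff & SKETCH_RMAP_OFF_MASK;

	if (attr_fork)
		off |= SKETCH_RMAP_OFF_ATTR_FORK;
	if (bmbt_block)
		off |= SKETCH_RMAP_OFF_BMBT_BLOCK;
	if (unwritten)
		off |= SKETCH_RMAP_OFF_UNWRITTEN;
	return off;
}

int main(void)
{
	uint64_t off = sketch_rmap_pack_offset(0x1000, false, false, true);

	printf("fileoff   = 0x%llx\n", (unsigned long long)(off & SKETCH_RMAP_OFF_MASK));
	printf("unwritten = %d\n", !!(off & SKETCH_RMAP_OFF_UNWRITTEN));
	printf("attr fork = %d\n", !!(off & SKETCH_RMAP_OFF_ATTR_FORK));
	return 0;
}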
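Finally, the BMBT extent record packs four fields into l0/l1 as described next to BMBT_EXNTFLAG_BITLEN. The decoder below follows that bit layout directly; it assumes the two words have already been converted from big-endian to host order (in kernel code that would be be64_to_cpu()), and all names are local to this sketch.

#include <stdint.h>
#include <stdio.h>

struct sketch_bmbt_irec {
	uint64_t startoff;	/* file offset, in blocks */
	uint64_t startblock;	/* filesystem block number */
	uint32_t blockcount;	/* length, in blocks */
	int	 unwritten;	/* extent state flag (l0:63) */
};

/* Decode l0/l1 (host order) per the documented bit layout. */
static struct sketch_bmbt_irec sketch_bmbt_unpack(uint64_t l0, uint64_t l1)
{
	struct sketch_bmbt_irec irec;

	irec.unwritten  = (int)(l0 >> 63);
	irec.startoff   = (l0 & ((1ULL << 63) - 1)) >> 9;	 /* l0:9-62 */
	irec.startblock = ((l0 & 0x1FFULL) << 43) | (l1 >> 21); /* 52 bits */
	irec.blockcount = (uint32_t)(l1 & ((1ULL << 21) - 1));	 /* l1:0-20 */
	return irec;
}

int main(void)
{
	/* Pack a record by hand: startoff=100, startblock=2000, len=8, written. */
	uint64_t startoff = 100, startblock = 2000, len = 8;
	uint64_t l0 = (startoff << 9) | (startblock >> 43);
	uint64_t l1 = (startblock << 21) | len;
	struct sketch_bmbt_irec irec = sketch_bmbt_unpack(l0, l1);

	printf("off=%llu block=%llu len=%u unwritten=%d\n",
	       (unsigned long long)irec.startoff,
	       (unsigned long long)irec.startblock,
	       irec.blockcount, irec.unwritten);
	return 0;
}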
// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * The User Datagram Protocol (UDP). * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Alan Cox, <alan@lxorguk.ukuu.org.uk> * Hirokazu Takahashi, <taka@valinux.co.jp> * * Fixes: * Alan Cox : verify_area() calls * Alan Cox : stopped close while in use off icmp * messages. Not a fix but a botch that * for udp at least is 'valid'. * Alan Cox : Fixed icmp handling properly * Alan Cox : Correct error for oversized datagrams * Alan Cox : Tidied select() semantics. * Alan Cox : udp_err() fixed properly, also now * select and read wake correctly on errors * Alan Cox : udp_send verify_area moved to avoid mem leak * Alan Cox : UDP can count its memory * Alan Cox : send to an unknown connection causes * an ECONNREFUSED off the icmp, but * does NOT close. * Alan Cox : Switched to new sk_buff handlers. No more backlog! * Alan Cox : Using generic datagram code. Even smaller and the PEEK * bug no longer crashes it. * Fred Van Kempen : Net2e support for sk->broadcast. * Alan Cox : Uses skb_free_datagram * Alan Cox : Added get/set sockopt support. * Alan Cox : Broadcasting without option set returns EACCES. * Alan Cox : No wakeup calls. Instead we now use the callbacks. * Alan Cox : Use ip_tos and ip_ttl * Alan Cox : SNMP Mibs * Alan Cox : MSG_DONTROUTE, and 0.0.0.0 support. * Matt Dillon : UDP length checks. * Alan Cox : Smarter af_inet used properly. * Alan Cox : Use new kernel side addressing. * Alan Cox : Incorrect return on truncated datagram receive. * Arnt Gulbrandsen : New udp_send and stuff * Alan Cox : Cache last socket * Alan Cox : Route cache * Jon Peatfield : Minor efficiency fix to sendto(). * Mike Shaver : RFC1122 checks. * Alan Cox : Nonblocking error fix. * Willy Konynenberg : Transparent proxying support. * Mike McLagan : Routing by source * David S. Miller : New socket lookup architecture. * Last socket cache retained as it * does have a high hit rate. * Olaf Kirch : Don't linearise iovec on sendmsg. * Andi Kleen : Some cleanups, cache destination entry * for connect. * Vitaly E. Lavrov : Transparent proxy revived after year coma. * Melvin Smith : Check msg_name not msg_namelen in sendto(), * return ENOTCONN for unconnected sockets (POSIX) * Janos Farkas : don't deliver multi/broadcasts to a different * bound-to-device socket * Hirokazu Takahashi : HW checksumming for outgoing UDP * datagrams. * Hirokazu Takahashi : sendfile() on UDP works now. * Arnaldo C. Melo : convert /proc/net/udp to seq_file * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which * Alexey Kuznetsov: allow both IPv4 and IPv6 sockets to bind * a single port at the same time.
* Derek Atkins <derek@ihtfp.com>: Add Encapsulation Support * James Chapman : Add L2TP encapsulation type. */ #define pr_fmt(fmt) "UDP: " fmt #include <linux/bpf-cgroup.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <linux/memblock.h> #include <linux/highmem.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/module.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/igmp.h> #include <linux/inetdevice.h> #include <linux/in.h> #include <linux/errno.h> #include <linux/timer.h> #include <linux/mm.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/sock_diag.h> #include <net/tcp_states.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <net/aligned_data.h> #include <net/net_namespace.h> #include <net/icmp.h> #include <net/inet_common.h> #include <net/inet_hashtables.h> #include <net/ip.h> #include <net/ip_tunnels.h> #include <net/route.h> #include <net/checksum.h> #include <net/gso.h> #include <net/xfrm.h> #include <trace/events/udp.h> #include <linux/static_key.h> #include <linux/btf_ids.h> #include <trace/events/skb.h> #include <net/busy_poll.h> #include <net/sock_reuseport.h> #include <net/addrconf.h> #include <net/udp_tunnel.h> #include <net/gro.h> #include <net/rps.h> struct udp_table udp_table __read_mostly; long sysctl_udp_mem[3] __read_mostly; DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc); EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc); #define MAX_UDP_PORTS 65536 #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN_PERNET) static int udp_lib_lport_inuse(struct net *net, __u16 num, const struct udp_hslot *hslot, unsigned long *bitmap, struct sock *sk, unsigned int log) { kuid_t uid = sk_uid(sk); struct sock *sk2; sk_for_each(sk2, &hslot->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && (bitmap || udp_sk(sk2)->udp_port_hash == num) && (!sk2->sk_reuse || !sk->sk_reuse) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && inet_rcv_saddr_equal(sk, sk2, true)) { if (sk2->sk_reuseport && sk->sk_reuseport && !rcu_access_pointer(sk->sk_reuseport_cb) && uid_eq(uid, sk_uid(sk2))) { if (!bitmap) return 0; } else { if (!bitmap) return 1; __set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap); } } } return 0; } /* * Note: we still hold spinlock of primary hash chain, so no other writer * can insert/delete a socket with local_port == num */ static int udp_lib_lport_inuse2(struct net *net, __u16 num, struct udp_hslot *hslot2, struct sock *sk) { kuid_t uid = sk_uid(sk); struct sock *sk2; int res = 0; spin_lock(&hslot2->lock); udp_portaddr_for_each_entry(sk2, &hslot2->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && (udp_sk(sk2)->udp_port_hash == num) && (!sk2->sk_reuse || !sk->sk_reuse) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && inet_rcv_saddr_equal(sk, sk2, true)) { if (sk2->sk_reuseport && sk->sk_reuseport && !rcu_access_pointer(sk->sk_reuseport_cb) && uid_eq(uid, sk_uid(sk2))) { res = 0; } else { res = 1; } break; } } spin_unlock(&hslot2->lock); return res; } static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot) { struct net *net = sock_net(sk); kuid_t uid = sk_uid(sk); struct sock *sk2; sk_for_each(sk2, &hslot->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && sk2->sk_family == sk->sk_family && ipv6_only_sock(sk2) == ipv6_only_sock(sk) && (udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) && 
(sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && sk2->sk_reuseport && uid_eq(uid, sk_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) { return reuseport_add_sock(sk, sk2, inet_rcv_saddr_any(sk)); } } return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } /** * udp_lib_get_port - UDP port lookup for IPv4 and IPv6 * * @sk: socket struct in question * @snum: port number to look up * @hash2_nulladdr: AF-dependent hash value in secondary hash chains, * with NULL address */ int udp_lib_get_port(struct sock *sk, unsigned short snum, unsigned int hash2_nulladdr) { struct udp_hslot *hslot, *hslot2; struct net *net = sock_net(sk); struct udp_table *udptable; int error = -EADDRINUSE; udptable = net->ipv4.udp_table; if (!snum) { DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN); unsigned short first, last; int low, high, remaining; unsigned int rand; inet_sk_get_local_port_range(sk, &low, &high); remaining = (high - low) + 1; rand = get_random_u32(); first = reciprocal_scale(rand, remaining) + low; /* * force rand to be an odd multiple of UDP_HTABLE_SIZE */ rand = (rand | 1) * (udptable->mask + 1); last = first + udptable->mask + 1; do { hslot = udp_hashslot(udptable, net, first); bitmap_zero(bitmap, PORTS_PER_CHAIN); spin_lock_bh(&hslot->lock); udp_lib_lport_inuse(net, snum, hslot, bitmap, sk, udptable->log); snum = first; /* * Iterate on all possible values of snum for this hash. * Using steps of an odd multiple of UDP_HTABLE_SIZE * give us randomization and full range coverage. */ do { if (low <= snum && snum <= high && !test_bit(snum >> udptable->log, bitmap) && !inet_is_local_reserved_port(net, snum)) goto found; snum += rand; } while (snum != first); spin_unlock_bh(&hslot->lock); cond_resched(); } while (++first != last); goto fail; } else { hslot = udp_hashslot(udptable, net, snum); spin_lock_bh(&hslot->lock); if (inet_use_hash2_on_bind(sk) && hslot->count > 10) { int exist; unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum; slot2 &= udptable->mask; hash2_nulladdr &= udptable->mask; hslot2 = udp_hashslot2(udptable, slot2); if (hslot->count < hslot2->count) goto scan_primary_hash; exist = udp_lib_lport_inuse2(net, snum, hslot2, sk); if (!exist && (hash2_nulladdr != slot2)) { hslot2 = udp_hashslot2(udptable, hash2_nulladdr); exist = udp_lib_lport_inuse2(net, snum, hslot2, sk); } if (exist) goto fail_unlock; else goto found; } scan_primary_hash: if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, 0)) goto fail_unlock; } found: inet_sk(sk)->inet_num = snum; udp_sk(sk)->udp_port_hash = snum; udp_sk(sk)->udp_portaddr_hash ^= snum; if (sk_unhashed(sk)) { if (sk->sk_reuseport && udp_reuseport_add_sock(sk, hslot)) { inet_sk(sk)->inet_num = 0; udp_sk(sk)->udp_port_hash = 0; udp_sk(sk)->udp_portaddr_hash ^= snum; goto fail_unlock; } sock_set_flag(sk, SOCK_RCU_FREE); sk_add_node_rcu(sk, &hslot->head); hslot->count++; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); spin_lock(&hslot2->lock); if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && sk->sk_family == AF_INET6) hlist_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node, &hslot2->head); else hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, &hslot2->head); hslot2->count++; spin_unlock(&hslot2->lock); } error = 0; fail_unlock: spin_unlock_bh(&hslot->lock); fail: return error; } static int udp_v4_get_port(struct sock *sk, unsigned short snum) { unsigned int hash2_nulladdr = ipv4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum); unsigned int hash2_partial = ipv4_portaddr_hash(sock_net(sk), 
inet_sk(sk)->inet_rcv_saddr, 0); /* precompute partial secondary hash */ udp_sk(sk)->udp_portaddr_hash = hash2_partial; return udp_lib_get_port(sk, snum, hash2_nulladdr); } static __always_inline int compute_score(struct sock *sk, const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned short hnum, int dif, int sdif) { int score; struct inet_sock *inet; bool dev_match; if (!net_eq(sock_net(sk), net) || udp_sk(sk)->udp_port_hash != hnum || ipv6_only_sock(sk)) return -1; if (sk->sk_rcv_saddr != daddr) return -1; score = (sk->sk_family == PF_INET) ? 2 : 1; inet = inet_sk(sk); if (inet->inet_daddr) { if (inet->inet_daddr != saddr) return -1; score += 4; } if (inet->inet_dport) { if (inet->inet_dport != sport) return -1; score += 4; } dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif); if (!dev_match) return -1; if (sk->sk_bound_dev_if) score += 4; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; return score; } u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport) { net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret)); return __inet_ehashfn(laddr, lport, faddr, fport, udp_ehash_secret + net_hash_mix(net)); } /** * udp4_lib_lookup1() - Simplified lookup using primary hash (destination port) * @net: Network namespace * @saddr: Source address, network order * @sport: Source port, network order * @daddr: Destination address, network order * @hnum: Destination port, host order * @dif: Destination interface index * @sdif: Destination bridge port index, if relevant * @udptable: Set of UDP hash tables * * Simplified lookup to be used as fallback if no sockets are found due to a * potential race between (receive) address change, and lookup happening before * the rehash operation. This function ignores SO_REUSEPORT groups while scoring * result sockets, because if we have one, we don't need the fallback at all. * * Called under rcu_read_lock(). * * Return: socket with highest matching score if any, NULL if none */ static struct sock *udp4_lib_lookup1(const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, int dif, int sdif, const struct udp_table *udptable) { unsigned int slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot = &udptable->hash[slot]; struct sock *sk, *result = NULL; int score, badness = 0; sk_for_each_rcu(sk, &hslot->head) { score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { result = sk; badness = score; } } return result; } /* called with rcu_read_lock() */ static struct sock *udp4_lib_lookup2(const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, int dif, int sdif, struct udp_hslot *hslot2, struct sk_buff *skb) { struct sock *sk, *result; int score, badness; bool need_rescore; result = NULL; badness = 0; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { need_rescore = false; rescore: score = compute_score(need_rescore ? result : sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { badness = score; if (need_rescore) continue; if (sk->sk_state == TCP_ESTABLISHED) { result = sk; continue; } result = inet_lookup_reuseport(net, sk, skb, sizeof(struct udphdr), saddr, sport, daddr, hnum, udp_ehashfn); if (!result) { result = sk; continue; } /* Fall back to scoring if group has connections */ if (!reuseport_has_conns(sk)) return result; /* Reuseport logic returned an error, keep original score. 
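 * We then keep scanning this chain: a later entry may still score higher.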
*/ if (IS_ERR(result)) continue; /* compute_score is too long of a function to be * inlined twice here, and calling it uninlined * here yields measurable overhead for some * workloads. Work around it by jumping * backwards to rescore 'result'. */ need_rescore = true; goto rescore; } } return result; } #if IS_ENABLED(CONFIG_BASE_SMALL) static struct sock *udp4_lib_lookup4(const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, int dif, int sdif, struct udp_table *udptable) { return NULL; } static void udp_rehash4(struct udp_table *udptable, struct sock *sk, u16 newhash4) { } static void udp_unhash4(struct udp_table *udptable, struct sock *sk) { } #else /* !CONFIG_BASE_SMALL */ static struct sock *udp4_lib_lookup4(const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, int dif, int sdif, struct udp_table *udptable) { const __portpair ports = INET_COMBINED_PORTS(sport, hnum); const struct hlist_nulls_node *node; struct udp_hslot *hslot4; unsigned int hash4, slot; struct udp_sock *up; struct sock *sk; hash4 = udp_ehashfn(net, daddr, hnum, saddr, sport); slot = hash4 & udptable->mask; hslot4 = &udptable->hash4[slot]; INET_ADDR_COOKIE(acookie, saddr, daddr); begin: /* SLAB_TYPESAFE_BY_RCU not used, so we don't need to touch sk_refcnt */ udp_lrpa_for_each_entry_rcu(up, node, &hslot4->nulls_head) { sk = (struct sock *)up; if (inet_match(net, sk, acookie, ports, dif, sdif)) return sk; } /* if the nulls value we got at the end of this lookup is not the * expected one, we must restart lookup. We probably met an item that * was moved to another chain due to rehash. */ if (get_nulls_value(node) != slot) goto begin; return NULL; } /* udp_rehash4() only checks hslot4, and hash4_cnt is not processed. */ static void udp_rehash4(struct udp_table *udptable, struct sock *sk, u16 newhash4) { struct udp_hslot *hslot4, *nhslot4; hslot4 = udp_hashslot4(udptable, udp_sk(sk)->udp_lrpa_hash); nhslot4 = udp_hashslot4(udptable, newhash4); udp_sk(sk)->udp_lrpa_hash = newhash4; if (hslot4 != nhslot4) { spin_lock_bh(&hslot4->lock); hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_lrpa_node); hslot4->count--; spin_unlock_bh(&hslot4->lock); spin_lock_bh(&nhslot4->lock); hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_lrpa_node, &nhslot4->nulls_head); nhslot4->count++; spin_unlock_bh(&nhslot4->lock); } } static void udp_unhash4(struct udp_table *udptable, struct sock *sk) { struct udp_hslot *hslot2, *hslot4; if (udp_hashed4(sk)) { hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); hslot4 = udp_hashslot4(udptable, udp_sk(sk)->udp_lrpa_hash); spin_lock(&hslot4->lock); hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_lrpa_node); hslot4->count--; spin_unlock(&hslot4->lock); spin_lock(&hslot2->lock); udp_hash4_dec(hslot2); spin_unlock(&hslot2->lock); } } void udp_lib_hash4(struct sock *sk, u16 hash) { struct udp_hslot *hslot, *hslot2, *hslot4; struct net *net = sock_net(sk); struct udp_table *udptable; /* Connected udp socket can re-connect to another remote address, which * will be handled by rehash. Thus no need to redo hash4 here. 
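 * (udp_lib_rehash() moves the socket to the new hslot4 in that case.)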
*/ if (udp_hashed4(sk)) return; udptable = net->ipv4.udp_table; hslot = udp_hashslot(udptable, net, udp_sk(sk)->udp_port_hash); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); hslot4 = udp_hashslot4(udptable, hash); udp_sk(sk)->udp_lrpa_hash = hash; spin_lock_bh(&hslot->lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); spin_lock(&hslot4->lock); hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_lrpa_node, &hslot4->nulls_head); hslot4->count++; spin_unlock(&hslot4->lock); spin_lock(&hslot2->lock); udp_hash4_inc(hslot2); spin_unlock(&hslot2->lock); spin_unlock_bh(&hslot->lock); } /* call with sock lock */ void udp4_hash4(struct sock *sk) { struct net *net = sock_net(sk); unsigned int hash; if (sk_unhashed(sk) || sk->sk_rcv_saddr == htonl(INADDR_ANY)) return; hash = udp_ehashfn(net, sk->sk_rcv_saddr, sk->sk_num, sk->sk_daddr, sk->sk_dport); udp_lib_hash4(sk, hash); } #endif /* CONFIG_BASE_SMALL */ /* UDP is nearly always wildcards out the wazoo, it makes no sense to try * harder than this. -DaveM */ struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif, int sdif, struct sk_buff *skb) { struct udp_table *udptable = net->ipv4.udp_table; unsigned short hnum = ntohs(dport); struct udp_hslot *hslot2; struct sock *result, *sk; unsigned int hash2; hash2 = ipv4_portaddr_hash(net, daddr, hnum); hslot2 = udp_hashslot2(udptable, hash2); if (udp_has_hash4(hslot2)) { result = udp4_lib_lookup4(net, saddr, sport, daddr, hnum, dif, sdif, udptable); if (result) /* udp4_lib_lookup4 return sk or NULL */ return result; } /* Lookup connected or non-wildcard socket */ result = udp4_lib_lookup2(net, saddr, sport, daddr, hnum, dif, sdif, hslot2, skb); if (!IS_ERR_OR_NULL(result) && result->sk_state == TCP_ESTABLISHED) goto done; /* Lookup redirect from BPF */ if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { sk = inet_lookup_run_sk_lookup(net, IPPROTO_UDP, skb, sizeof(struct udphdr), saddr, sport, daddr, hnum, dif, udp_ehashfn); if (sk) { result = sk; goto done; } } /* Got non-wildcard socket or error on first lookup */ if (result) goto done; /* Lookup wildcard sockets */ hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); hslot2 = udp_hashslot2(udptable, hash2); result = udp4_lib_lookup2(net, saddr, sport, htonl(INADDR_ANY), hnum, dif, sdif, hslot2, skb); if (!IS_ERR_OR_NULL(result)) goto done; /* Primary hash (destination port) lookup as fallback for this race: * 1. __ip4_datagram_connect() sets sk_rcv_saddr * 2. lookup (this function): new sk_rcv_saddr, hashes not updated yet * 3. rehash operation updating _secondary and four-tuple_ hashes * The primary hash doesn't need an update after 1., so, thanks to this * further step, 1. and 3. don't need to be atomic against the lookup. 
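 * The fallback uses only the primary (port-only) hash and ignores reuseport
 * groups, see udp4_lib_lookup1().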
*/ result = udp4_lib_lookup1(net, saddr, sport, daddr, hnum, dif, sdif, udptable); done: if (IS_ERR(result)) return NULL; return result; } EXPORT_SYMBOL_GPL(__udp4_lib_lookup); static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport) { const struct iphdr *iph = ip_hdr(skb); return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport, iph->daddr, dport, inet_iif(skb), inet_sdif(skb), skb); } struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport) { const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; const struct iphdr *iph = (struct iphdr *)(skb->data + offset); int iif, sdif; inet_get_iif_sdif(skb, &iif, &sdif); return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport, iph->daddr, dport, iif, sdif, NULL); } /* Must be called under rcu_read_lock(). * Does increment socket refcount. */ #if IS_ENABLED(CONFIG_NF_TPROXY_IPV4) || IS_ENABLED(CONFIG_NF_SOCKET_IPV4) struct sock *udp4_lib_lookup(const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif) { struct sock *sk; sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, 0, NULL); if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; } EXPORT_SYMBOL_GPL(udp4_lib_lookup); #endif static inline bool __udp_is_mcast_sock(struct net *net, const struct sock *sk, __be16 loc_port, __be32 loc_addr, __be16 rmt_port, __be32 rmt_addr, int dif, int sdif, unsigned short hnum) { const struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net) || udp_sk(sk)->udp_port_hash != hnum || (inet->inet_daddr && inet->inet_daddr != rmt_addr) || (inet->inet_dport != rmt_port && inet->inet_dport) || (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) || ipv6_only_sock(sk) || !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return false; if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif, sdif)) return false; return true; } DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); #if IS_ENABLED(CONFIG_IPV6) DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); #endif void udp_encap_enable(void) { static_branch_inc(&udp_encap_needed_key); } EXPORT_SYMBOL(udp_encap_enable); void udp_encap_disable(void) { static_branch_dec(&udp_encap_needed_key); } EXPORT_SYMBOL(udp_encap_disable); /* Handler for tunnels with arbitrary destination ports: no socket lookup, go * through error handlers in encapsulations looking for a match. */ static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info) { int i; for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) { int (*handler)(struct sk_buff *skb, u32 info); const struct ip_tunnel_encap_ops *encap; encap = rcu_dereference(iptun_encaps[i]); if (!encap) continue; handler = encap->err_handler; if (handler && !handler(skb, info)) return 0; } return -ENOENT; } /* Try to match ICMP errors to UDP tunnels by looking up a socket without * reversing source and destination port: this will match tunnels that force the * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that * lwtunnels might actually break this assumption by being configured with * different destination ports on endpoints, in this case we won't be able to * trace ICMP messages back to them. * * If this doesn't match any socket, probe tunnels with arbitrary destination * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port * we've sent packets to won't necessarily match the local destination port. * * Then ask the tunnel implementation to match the error against a valid * association. 
* * Return an error if we can't find a match, the socket if we need further * processing, zero otherwise. */ static struct sock *__udp4_lib_err_encap(struct net *net, const struct iphdr *iph, struct udphdr *uh, struct sock *sk, struct sk_buff *skb, u32 info) { int (*lookup)(struct sock *sk, struct sk_buff *skb); int network_offset, transport_offset; struct udp_sock *up; network_offset = skb_network_offset(skb); transport_offset = skb_transport_offset(skb); /* Network header needs to point to the outer IPv4 header inside ICMP */ skb_reset_network_header(skb); /* Transport header needs to point to the UDP header */ skb_set_transport_header(skb, iph->ihl << 2); if (sk) { up = udp_sk(sk); lookup = READ_ONCE(up->encap_err_lookup); if (lookup && lookup(sk, skb)) sk = NULL; goto out; } sk = __udp4_lib_lookup(net, iph->daddr, uh->source, iph->saddr, uh->dest, skb->dev->ifindex, 0, NULL); if (sk) { up = udp_sk(sk); lookup = READ_ONCE(up->encap_err_lookup); if (!lookup || lookup(sk, skb)) sk = NULL; } out: if (!sk) sk = ERR_PTR(__udp4_lib_err_encap_no_sk(skb, info)); skb_set_transport_header(skb, transport_offset); skb_set_network_header(skb, network_offset); return sk; } /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should * be closed and the error returned to the user. If err > 0 * it's just the icmp type << 8 | icmp code. * Header points to the ip header of the error packet. We move * on past this. Then (as it used to claim before adjustment) * header points to the first 8 bytes of the udp header. We need * to find the appropriate port. */ int udp_err(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct net *net = dev_net(skb->dev); struct inet_sock *inet; bool tunnel = false; struct udphdr *uh; struct sock *sk; int harderr; int err; uh = (struct udphdr *)(skb->data + (iph->ihl << 2)); sk = __udp4_lib_lookup(net, iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex, inet_sdif(skb), NULL); if (!sk || READ_ONCE(udp_sk(sk)->encap_type)) { /* No socket for error: try tunnels before discarding */ if (static_branch_unlikely(&udp_encap_needed_key)) { sk = __udp4_lib_err_encap(net, iph, uh, sk, skb, info); if (!sk) return 0; } else sk = ERR_PTR(-ENOENT); if (IS_ERR(sk)) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return PTR_ERR(sk); } tunnel = true; } err = 0; harderr = 0; inet = inet_sk(sk); switch (type) { default: case ICMP_TIME_EXCEEDED: err = EHOSTUNREACH; break; case ICMP_SOURCE_QUENCH: goto out; case ICMP_PARAMETERPROB: err = EPROTO; harderr = 1; break; case ICMP_DEST_UNREACH: if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ ipv4_sk_update_pmtu(skb, sk, info); if (READ_ONCE(inet->pmtudisc) != IP_PMTUDISC_DONT) { err = EMSGSIZE; harderr = 1; break; } goto out; } err = EHOSTUNREACH; if (code <= NR_ICMP_UNREACH) { harderr = icmp_err_convert[code].fatal; err = icmp_err_convert[code].errno; } break; case ICMP_REDIRECT: ipv4_sk_redirect(skb, sk); goto out; } /* * RFC1122: OK. Passes ICMP errors back to application, as per * 4.1.3.3. 
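 * The error is only reported if IP_RECVERR is set, or if it is a hard
 * error on a connected socket.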
*/ if (tunnel) { /* ...not for tunnels though: we don't have a sending socket */ if (udp_sk(sk)->encap_err_rcv) udp_sk(sk)->encap_err_rcv(sk, skb, err, uh->dest, info, (u8 *)(uh+1)); goto out; } if (!inet_test_bit(RECVERR, sk)) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; } else ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1)); sk->sk_err = err; sk_error_report(sk); out: return 0; } /* * Throw away all pending data and cancel the corking. Socket is locked. */ void udp_flush_pending_frames(struct sock *sk) { struct udp_sock *up = udp_sk(sk); if (up->pending) { up->len = 0; WRITE_ONCE(up->pending, 0); ip_flush_pending_frames(sk); } } /** * udp4_hwcsum - handle outgoing HW checksumming * @skb: sk_buff containing the filled-in UDP header * (checksum field must be zeroed out) * @src: source IP address * @dst: destination IP address */ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) { struct udphdr *uh = udp_hdr(skb); int offset = skb_transport_offset(skb); int len = skb->len - offset; int hlen = len; __wsum csum = 0; if (!skb_has_frag_list(skb)) { /* * Only one fragment on the socket. */ skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0); } else { struct sk_buff *frags; /* * HW-checksum won't work as there are two or more * fragments on the socket so that all csums of sk_buffs * should be together */ skb_walk_frags(skb, frags) { csum = csum_add(csum, frags->csum); hlen -= frags->len; } csum = skb_checksum(skb, offset, hlen, csum); skb->ip_summed = CHECKSUM_NONE; uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum); if (uh->check == 0) uh->check = CSUM_MANGLED_0; } } EXPORT_SYMBOL_GPL(udp4_hwcsum); /* Function to set UDP checksum for an IPv4 UDP packet. This is intended * for the simple case like when setting the checksum for a UDP tunnel. 
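 * For GSO skbs only the pseudo-header checksum is stored here, matching
 * what CHECKSUM_PARTIAL expects.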
*/ void udp_set_csum(bool nocheck, struct sk_buff *skb, __be32 saddr, __be32 daddr, int len) { struct udphdr *uh = udp_hdr(skb); if (nocheck) { uh->check = 0; } else if (skb_is_gso(skb)) { uh->check = ~udp_v4_check(len, saddr, daddr, 0); } else if (skb->ip_summed == CHECKSUM_PARTIAL) { uh->check = 0; uh->check = udp_v4_check(len, saddr, daddr, lco_csum(skb)); if (uh->check == 0) uh->check = CSUM_MANGLED_0; } else { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); uh->check = ~udp_v4_check(len, saddr, daddr, 0); } } EXPORT_SYMBOL(udp_set_csum); static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, struct inet_cork *cork) { struct sock *sk = skb->sk; int offset, len, datalen; struct udphdr *uh; int err; offset = skb_transport_offset(skb); len = skb->len - offset; datalen = len - sizeof(*uh); /* * Create a UDP header */ uh = udp_hdr(skb); uh->source = inet_sk(sk)->inet_sport; uh->dest = fl4->fl4_dport; uh->len = htons(len); uh->check = 0; if (cork->gso_size) { const int hlen = skb_network_header_len(skb) + sizeof(struct udphdr); if (hlen + min(datalen, cork->gso_size) > cork->fragsize) { kfree_skb(skb); return -EMSGSIZE; } if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) { kfree_skb(skb); return -EINVAL; } if (sk->sk_no_check_tx) { kfree_skb(skb); return -EINVAL; } if (dst_xfrm(skb_dst(skb))) { kfree_skb(skb); return -EIO; } if (datalen > cork->gso_size) { skb_shinfo(skb)->gso_size = cork->gso_size; skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen, cork->gso_size); /* Don't checksum the payload, skb will get segmented */ goto csum_partial; } } if (sk->sk_no_check_tx) { /* UDP csum off */ skb->ip_summed = CHECKSUM_NONE; goto send; } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ csum_partial: udp4_hwcsum(skb, fl4->saddr, fl4->daddr); goto send; } /* add protocol-dependent pseudo-header */ uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len, IPPROTO_UDP, udp_csum(skb)); if (uh->check == 0) uh->check = CSUM_MANGLED_0; send: err = ip_send_skb(sock_net(sk), skb); if (unlikely(err)) { if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk)) { UDP_INC_STATS(sock_net(sk), UDP_MIB_SNDBUFERRORS); err = 0; } } else { UDP_INC_STATS(sock_net(sk), UDP_MIB_OUTDATAGRAMS); } return err; } /* * Push out all pending data as one UDP datagram. Socket is locked. 
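 * The pending state is cleared even if building or sending the skb fails.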
*/ int udp_push_pending_frames(struct sock *sk) { struct udp_sock *up = udp_sk(sk); struct inet_sock *inet = inet_sk(sk); struct flowi4 *fl4 = &inet->cork.fl.u.ip4; struct sk_buff *skb; int err = 0; skb = ip_finish_skb(sk, fl4); if (!skb) goto out; err = udp_send_skb(skb, fl4, &inet->cork.base); out: up->len = 0; WRITE_ONCE(up->pending, 0); return err; } static int __udp_cmsg_send(struct cmsghdr *cmsg, u16 *gso_size) { switch (cmsg->cmsg_type) { case UDP_SEGMENT: if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u16))) return -EINVAL; *gso_size = *(__u16 *)CMSG_DATA(cmsg); return 0; default: return -EINVAL; } } int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size) { struct cmsghdr *cmsg; bool need_ip = false; int err; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_UDP) { need_ip = true; continue; } err = __udp_cmsg_send(cmsg, gso_size); if (err) return err; } return need_ip; } int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE; DEFINE_RAW_FLEX(struct ip_options_rcu, opt_copy, opt.__data, IP_OPTIONS_DATA_FIXED_SIZE); DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); int ulen = len, free = 0, connected = 0; struct inet_sock *inet = inet_sk(sk); struct udp_sock *up = udp_sk(sk); __be32 daddr, faddr, saddr; struct rtable *rt = NULL; struct flowi4 fl4_stack; struct ipcm_cookie ipc; struct sk_buff *skb; struct flowi4 *fl4; __be16 dport; int uc_index; u8 scope; int err; if (len > 0xFFFF) return -EMSGSIZE; /* * Check the flags. */ if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */ return -EOPNOTSUPP; fl4 = &inet->cork.fl.u.ip4; if (READ_ONCE(up->pending)) { /* * There are pending frames. * The socket lock must be held while it's corked. */ lock_sock(sk); if (likely(up->pending)) { if (unlikely(up->pending != AF_INET)) { release_sock(sk); return -EINVAL; } goto do_append_data; } release_sock(sk); } ulen += sizeof(struct udphdr); /* * Get and verify the address. */ if (usin) { if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) { if (usin->sin_family != AF_UNSPEC) return -EAFNOSUPPORT; } daddr = usin->sin_addr.s_addr; dport = usin->sin_port; if (dport == 0) return -EINVAL; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; daddr = inet->inet_daddr; dport = inet->inet_dport; /* Open fast path for connected socket. Route will not be used, if at least one option is set. */ connected = 1; } ipcm_init_sk(&ipc, inet); ipc.gso_size = READ_ONCE(up->gso_size); if (msg->msg_controllen) { err = udp_cmsg_send(sk, msg, &ipc.gso_size); if (err > 0) { err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6); connected = 0; } if (unlikely(err < 0)) { kfree(ipc.opt); return err; } if (ipc.opt) free = 1; } if (!ipc.opt) { struct ip_options_rcu *inet_opt; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { memcpy(opt_copy, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); ipc.opt = opt_copy; } rcu_read_unlock(); } if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, (struct sockaddr *)usin, &msg->msg_namelen, &ipc.addr); if (err) goto out_free; if (usin) { if (usin->sin_port == 0) { /* BPF program set invalid port. Reject it. 
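 * (Port 0 is never a valid UDP destination, just as in the non-BPF path above.)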
*/ err = -EINVAL; goto out_free; } daddr = usin->sin_addr.s_addr; dport = usin->sin_port; } } saddr = ipc.addr; ipc.addr = faddr = daddr; if (ipc.opt && ipc.opt->opt.srr) { if (!daddr) { err = -EINVAL; goto out_free; } faddr = ipc.opt->opt.faddr; connected = 0; } scope = ip_sendmsg_scope(inet, &ipc, msg); if (scope == RT_SCOPE_LINK) connected = 0; uc_index = READ_ONCE(inet->uc_index); if (ipv4_is_multicast(daddr)) { if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = READ_ONCE(inet->mc_index); if (!saddr) saddr = READ_ONCE(inet->mc_addr); connected = 0; } else if (!ipc.oif) { ipc.oif = uc_index; } else if (ipv4_is_lbcast(daddr) && uc_index) { /* oif is set, packet is to local broadcast and * uc_index is set. oif is most likely set * by sk_bound_dev_if. If uc_index != oif check if the * oif is an L3 master and uc_index is an L3 slave. * If so, we want to allow the send using the uc_index. */ if (ipc.oif != uc_index && ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk), uc_index)) { ipc.oif = uc_index; } } if (connected) rt = dst_rtable(sk_dst_check(sk, 0)); if (!rt) { struct net *net = sock_net(sk); __u8 flow_flags = inet_sk_flowi_flags(sk); fl4 = &fl4_stack; flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, ipc.tos & INET_DSCP_MASK, scope, IPPROTO_UDP, flow_flags, faddr, saddr, dport, inet->inet_sport, sk_uid(sk)); security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; if (err == -ENETUNREACH) IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); goto out; } err = -EACCES; if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) goto out; if (connected) sk_dst_set(sk, dst_clone(&rt->dst)); } if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; back_from_confirm: saddr = fl4->saddr; if (!ipc.addr) daddr = ipc.addr = fl4->daddr; /* Lockless fast path for the non-corking case. */ if (!corkreq) { struct inet_cork cork; skb = ip_make_skb(sk, fl4, ip_generic_getfrag, msg, ulen, sizeof(struct udphdr), &ipc, &rt, &cork, msg->msg_flags); err = PTR_ERR(skb); if (!IS_ERR_OR_NULL(skb)) err = udp_send_skb(skb, fl4, &cork); goto out; } lock_sock(sk); if (unlikely(up->pending)) { /* The socket is already corked while preparing it. */ /* ... which is an evident application bug. --ANK */ release_sock(sk); net_dbg_ratelimited("socket already corked\n"); err = -EINVAL; goto out; } /* * Now cork the socket to pend data. */ fl4 = &inet->cork.fl.u.ip4; fl4->daddr = daddr; fl4->saddr = saddr; fl4->fl4_dport = dport; fl4->fl4_sport = inet->inet_sport; WRITE_ONCE(up->pending, AF_INET); do_append_data: up->len += ulen; err = ip_append_data(sk, fl4, ip_generic_getfrag, msg, ulen, sizeof(struct udphdr), &ipc, &rt, corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); if (err) udp_flush_pending_frames(sk); else if (!corkreq) err = udp_push_pending_frames(sk); else if (unlikely(skb_queue_empty(&sk->sk_write_queue))) WRITE_ONCE(up->pending, 0); release_sock(sk); out: ip_rt_put(rt); out_free: if (free) kfree(ipc.opt); if (!err) return len; /* * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. Reporting * ENOBUFS might not be good (it's not tunable per se), but otherwise * we don't have a good statistic (IpOutDiscards but it can be too many * things). We could add another new stat but at least for now that * seems like overkill. 
*/ if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) UDP_INC_STATS(sock_net(sk), UDP_MIB_SNDBUFERRORS); return err; do_confirm: if (msg->msg_flags & MSG_PROBE) dst_confirm_neigh(&rt->dst, &fl4->daddr); if (!(msg->msg_flags&MSG_PROBE) || len) goto back_from_confirm; err = 0; goto out; } EXPORT_SYMBOL(udp_sendmsg); void udp_splice_eof(struct socket *sock) { struct sock *sk = sock->sk; struct udp_sock *up = udp_sk(sk); if (!READ_ONCE(up->pending) || udp_test_bit(CORK, sk)) return; lock_sock(sk); if (up->pending && !udp_test_bit(CORK, sk)) udp_push_pending_frames(sk); release_sock(sk); } #define UDP_SKB_IS_STATELESS 0x80000000 /* all head states (dst, sk, nf conntrack) except skb extensions are * cleared by udp_rcv(). * * We need to preserve secpath, if present, to eventually process * IP_CMSG_PASSSEC at recvmsg() time. * * Other extensions can be cleared. */ static bool udp_try_make_stateless(struct sk_buff *skb) { if (!skb_has_extensions(skb)) return true; if (!secpath_exists(skb)) { skb_ext_reset(skb); return true; } return false; } static void udp_set_dev_scratch(struct sk_buff *skb) { struct udp_dev_scratch *scratch = udp_skb_scratch(skb); BUILD_BUG_ON(sizeof(struct udp_dev_scratch) > sizeof(long)); scratch->_tsize_state = skb->truesize; #if BITS_PER_LONG == 64 scratch->len = skb->len; scratch->csum_unnecessary = !!skb_csum_unnecessary(skb); scratch->is_linear = !skb_is_nonlinear(skb); #endif if (udp_try_make_stateless(skb)) scratch->_tsize_state |= UDP_SKB_IS_STATELESS; } static void udp_skb_csum_unnecessary_set(struct sk_buff *skb) { /* We come here after udp_lib_checksum_complete() returned 0. * This means that __skb_checksum_complete() might have * set skb->csum_valid to 1. * On 64bit platforms, we can set csum_unnecessary * to true, but only if the skb is not shared. */ #if BITS_PER_LONG == 64 if (!skb_shared(skb)) udp_skb_scratch(skb)->csum_unnecessary = true; #endif } static int udp_skb_truesize(struct sk_buff *skb) { return udp_skb_scratch(skb)->_tsize_state & ~UDP_SKB_IS_STATELESS; } static bool udp_skb_has_head_state(struct sk_buff *skb) { return !(udp_skb_scratch(skb)->_tsize_state & UDP_SKB_IS_STATELESS); } /* fully reclaim rmem/fwd memory allocated for skb */ static void udp_rmem_release(struct sock *sk, unsigned int size, int partial, bool rx_queue_lock_held) { struct udp_sock *up = udp_sk(sk); struct sk_buff_head *sk_queue; unsigned int amt; if (likely(partial)) { up->forward_deficit += size; size = up->forward_deficit; if (size < READ_ONCE(up->forward_threshold) && !skb_queue_empty(&up->reader_queue)) return; } else { size += up->forward_deficit; } up->forward_deficit = 0; /* acquire the sk_receive_queue for fwd allocated memory scheduling, * if the called don't held it already */ sk_queue = &sk->sk_receive_queue; if (!rx_queue_lock_held) spin_lock(&sk_queue->lock); amt = (size + sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1); sk_forward_alloc_add(sk, size - amt); if (amt) __sk_mem_reduce_allocated(sk, amt >> PAGE_SHIFT); atomic_sub(size, &sk->sk_rmem_alloc); /* this can save us from acquiring the rx queue lock on next receive */ skb_queue_splice_tail_init(sk_queue, &up->reader_queue); if (!rx_queue_lock_held) spin_unlock(&sk_queue->lock); } /* Note: called with reader_queue.lock held. * Instead of using skb->truesize here, find a copy of it in skb->dev_scratch * This avoids a cache line miss while receive_queue lock is held. * Look at __udp_enqueue_schedule_skb() to find where this copy is done. 
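 * (udp_set_dev_scratch() saves skb->truesize there when the packet is enqueued.)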
*/ void udp_skb_destructor(struct sock *sk, struct sk_buff *skb) { prefetch(&skb->data); udp_rmem_release(sk, udp_skb_truesize(skb), 1, false); } /* as above, but the caller held the rx queue lock, too */ static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb) { prefetch(&skb->data); udp_rmem_release(sk, udp_skb_truesize(skb), 1, true); } static int udp_rmem_schedule(struct sock *sk, int size) { int delta; delta = size - sk->sk_forward_alloc; if (delta > 0 && !__sk_mem_schedule(sk, delta, SK_MEM_RECV)) return -ENOBUFS; return 0; } int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) { struct sk_buff_head *list = &sk->sk_receive_queue; struct udp_prod_queue *udp_prod_queue; struct sk_buff *next, *to_drop = NULL; struct llist_node *ll_list; unsigned int rmem, rcvbuf; int size, err = -ENOMEM; int total_size = 0; int q_size = 0; int dropcount; int nb = 0; rmem = atomic_read(&sk->sk_rmem_alloc); rcvbuf = READ_ONCE(sk->sk_rcvbuf); size = skb->truesize; udp_prod_queue = &udp_sk(sk)->udp_prod_queue[numa_node_id()]; rmem += atomic_read(&udp_prod_queue->rmem_alloc); /* Immediately drop when the receive queue is full. * Cast to unsigned int performs the boundary check for INT_MAX. */ if (rmem + size > rcvbuf) { if (rcvbuf > INT_MAX >> 1) goto drop; /* Accept the packet if queue is empty. */ if (rmem) goto drop; } /* Under mem pressure, it might be helpful to help udp_recvmsg() * having linear skbs : * - Reduce memory overhead and thus increase receive queue capacity * - Less cache line misses at copyout() time * - Less work at consume_skb() (less alien page frag freeing) */ if (rmem > (rcvbuf >> 1)) { skb_condense(skb); size = skb->truesize; } udp_set_dev_scratch(skb); atomic_add(size, &udp_prod_queue->rmem_alloc); if (!llist_add(&skb->ll_node, &udp_prod_queue->ll_root)) return 0; dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? sk_drops_read(sk) : 0; spin_lock(&list->lock); ll_list = llist_del_all(&udp_prod_queue->ll_root); ll_list = llist_reverse_order(ll_list); llist_for_each_entry_safe(skb, next, ll_list, ll_node) { size = udp_skb_truesize(skb); total_size += size; err = udp_rmem_schedule(sk, size); if (unlikely(err)) { /* Free the skbs outside of locked section. */ skb->next = to_drop; to_drop = skb; continue; } q_size += size; sk_forward_alloc_add(sk, -size); /* no need to setup a destructor, we will explicitly release the * forward allocated memory on dequeue */ SOCK_SKB_CB(skb)->dropcount = dropcount; nb++; __skb_queue_tail(list, skb); } atomic_add(q_size, &sk->sk_rmem_alloc); spin_unlock(&list->lock); if (!sock_flag(sk, SOCK_DEAD)) { /* Multiple threads might be blocked in recvmsg(), * using prepare_to_wait_exclusive(). 
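 * Exclusive waiters are woken one at a time, so signal readiness once per
 * skb queued above.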
*/ while (nb) { INDIRECT_CALL_1(READ_ONCE(sk->sk_data_ready), sock_def_readable, sk); nb--; } } if (unlikely(to_drop)) { int err_ipv4 = 0; int err_ipv6 = 0; for (nb = 0; to_drop != NULL; nb++) { skb = to_drop; if (skb->protocol == htons(ETH_P_IP)) err_ipv4++; else err_ipv6++; to_drop = skb->next; skb_mark_not_on_list(skb); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_PROTO_MEM); } numa_drop_add(&udp_sk(sk)->drop_counters, nb); if (err_ipv4 > 0) { SNMP_ADD_STATS(__UDPX_MIB(sk, true), UDP_MIB_MEMERRORS, err_ipv4); SNMP_ADD_STATS(__UDPX_MIB(sk, true), UDP_MIB_INERRORS, err_ipv4); } if (err_ipv6 > 0) { SNMP_ADD_STATS(__UDPX_MIB(sk, false), UDP_MIB_MEMERRORS, err_ipv6); SNMP_ADD_STATS(__UDPX_MIB(sk, false), UDP_MIB_INERRORS, err_ipv6); } } atomic_sub(total_size, &udp_prod_queue->rmem_alloc); return 0; drop: udp_drops_inc(sk); return err; } void udp_destruct_common(struct sock *sk) { /* reclaim completely the forward allocated memory */ struct udp_sock *up = udp_sk(sk); unsigned int total = 0; struct sk_buff *skb; skb_queue_splice_tail_init(&sk->sk_receive_queue, &up->reader_queue); while ((skb = __skb_dequeue(&up->reader_queue)) != NULL) { total += skb->truesize; kfree_skb(skb); } udp_rmem_release(sk, total, 0, true); kfree(up->udp_prod_queue); } static void udp_destruct_sock(struct sock *sk) { udp_destruct_common(sk); inet_sock_destruct(sk); } static int udp_init_sock(struct sock *sk) { int res = udp_lib_init_sock(sk); sk->sk_destruct = udp_destruct_sock; set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); return res; } void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) { if (unlikely(READ_ONCE(udp_sk(sk)->peeking_with_offset))) sk_peek_offset_bwd(sk, len); if (!skb_shared(skb)) { skb_orphan(skb); skb_attempt_defer_free(skb); return; } if (!skb_unref(skb)) return; /* In the more common cases we cleared the head states previously, * see __udp_queue_rcv_skb(). */ if (unlikely(udp_skb_has_head_state(skb))) skb_release_head_state(skb); __consume_stateless_skb(skb); } static struct sk_buff *__first_packet_length(struct sock *sk, struct sk_buff_head *rcvq, unsigned int *total) { struct sk_buff *skb; while ((skb = skb_peek(rcvq)) != NULL) { if (udp_lib_checksum_complete(skb)) { struct net *net = sock_net(sk); __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS); __UDP_INC_STATS(net, UDP_MIB_INERRORS); udp_drops_inc(sk); __skb_unlink(skb, rcvq); *total += skb->truesize; kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); } else { udp_skb_csum_unnecessary_set(skb); break; } } return skb; } /** * first_packet_length - return length of first packet in receive queue * @sk: socket * * Drops all bad checksum frames, until a valid one is found. * Returns the length of found skb, or -1 if none is found. */ static int first_packet_length(struct sock *sk) { struct sk_buff_head *rcvq = &udp_sk(sk)->reader_queue; struct sk_buff_head *sk_queue = &sk->sk_receive_queue; unsigned int total = 0; struct sk_buff *skb; int res; spin_lock_bh(&rcvq->lock); skb = __first_packet_length(sk, rcvq, &total); if (!skb && !skb_queue_empty_lockless(sk_queue)) { spin_lock(&sk_queue->lock); skb_queue_splice_tail_init(sk_queue, rcvq); spin_unlock(&sk_queue->lock); skb = __first_packet_length(sk, rcvq, &total); } res = skb ? 
skb->len : -1; if (total) udp_rmem_release(sk, total, 1, false); spin_unlock_bh(&rcvq->lock); return res; } /* * IOCTL requests applicable to the UDP protocol */ int udp_ioctl(struct sock *sk, int cmd, int *karg) { switch (cmd) { case SIOCOUTQ: { *karg = sk_wmem_alloc_get(sk); return 0; } case SIOCINQ: { *karg = max_t(int, 0, first_packet_length(sk)); return 0; } default: return -ENOIOCTLCMD; } return 0; } struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, int *off, int *err) { struct sk_buff_head *sk_queue = &sk->sk_receive_queue; struct sk_buff_head *queue; struct sk_buff *last; long timeo; int error; queue = &udp_sk(sk)->reader_queue; timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { struct sk_buff *skb; error = sock_error(sk); if (error) break; error = -EAGAIN; do { spin_lock_bh(&queue->lock); skb = __skb_try_recv_from_queue(queue, flags, off, err, &last); if (skb) { if (!(flags & MSG_PEEK)) udp_skb_destructor(sk, skb); spin_unlock_bh(&queue->lock); return skb; } if (skb_queue_empty_lockless(sk_queue)) { spin_unlock_bh(&queue->lock); goto busy_check; } /* refill the reader queue and walk it again * keep both queues locked to avoid re-acquiring * the sk_receive_queue lock if fwd memory scheduling * is needed. */ spin_lock(&sk_queue->lock); skb_queue_splice_tail_init(sk_queue, queue); skb = __skb_try_recv_from_queue(queue, flags, off, err, &last); if (skb && !(flags & MSG_PEEK)) udp_skb_dtor_locked(sk, skb); spin_unlock(&sk_queue->lock); spin_unlock_bh(&queue->lock); if (skb) return skb; busy_check: if (!sk_can_busy_loop(sk)) break; sk_busy_loop(sk, flags & MSG_DONTWAIT); } while (!skb_queue_empty_lockless(sk_queue)); /* sk_queue is empty, reader_queue may contain peeked packets */ } while (timeo && !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, &error, &timeo, (struct sk_buff *)sk_queue)); *err = error; return NULL; } EXPORT_SYMBOL(__skb_recv_udp); int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { struct sk_buff *skb; int err; try_again: skb = skb_recv_udp(sk, MSG_DONTWAIT, &err); if (!skb) return err; if (udp_lib_checksum_complete(skb)) { struct net *net = sock_net(sk); __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS); __UDP_INC_STATS(net, UDP_MIB_INERRORS); udp_drops_inc(sk); kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); goto try_again; } WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk)); return recv_actor(sk, skb); } /* * This should be easy, if there is something there we * return it, otherwise we block. */ INDIRECT_CALLABLE_SCOPE int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); int off, err, peeking = flags & MSG_PEEK; struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); bool checksum_valid = false; unsigned int ulen, copied; struct sk_buff *skb; if (flags & MSG_ERRQUEUE) return ip_recv_error(sk, msg, len); try_again: off = sk_peek_offset(sk, flags); skb = __skb_recv_udp(sk, flags, &off, &err); if (!skb) return err; ulen = udp_skb_len(skb); copied = len; if (copied > ulen - off) copied = ulen - off; else if (copied < ulen) msg->msg_flags |= MSG_TRUNC; /* If checksum is needed at all, try to do it while copying the * data. If the data is truncated, do it before the copy. 
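 * Checksum-while-copying only works when the whole payload is consumed, so
 * peeked or truncated reads verify the checksum up front.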
*/ if (copied < ulen || peeking) { checksum_valid = udp_skb_csum_unnecessary(skb) || !__udp_lib_checksum_complete(skb); if (!checksum_valid) goto csum_copy_err; } if (checksum_valid || udp_skb_csum_unnecessary(skb)) { if (udp_skb_is_linear(skb)) err = copy_linear_skb(skb, copied, off, &msg->msg_iter); else err = skb_copy_datagram_msg(skb, off, msg, copied); } else { err = skb_copy_and_csum_datagram_msg(skb, off, msg); if (err == -EINVAL) goto csum_copy_err; } if (unlikely(err)) { if (!peeking) { udp_drops_inc(sk); UDP_INC_STATS(net, UDP_MIB_INERRORS); } kfree_skb(skb); return err; } if (!peeking) UDP_INC_STATS(net, UDP_MIB_INDATAGRAMS); sock_recv_cmsgs(msg, sk, skb); /* Copy the address. */ if (sin) { sin->sin_family = AF_INET; sin->sin_port = udp_hdr(skb)->source; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); msg->msg_namelen = sizeof(*sin); BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, (struct sockaddr *)sin, &msg->msg_namelen); } if (udp_test_bit(GRO_ENABLED, sk)) udp_cmsg_recv(msg, sk, skb); if (inet_cmsg_flags(inet)) ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off); err = copied; if (flags & MSG_TRUNC) err = ulen; skb_consume_udp(sk, skb, peeking ? -err : err); return err; csum_copy_err: if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags, udp_skb_destructor)) { UDP_INC_STATS(net, UDP_MIB_CSUMERRORS); UDP_INC_STATS(net, UDP_MIB_INERRORS); } kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); /* starting over for a new packet, but check if we need to yield */ cond_resched(); msg->msg_flags &= ~MSG_TRUNC; goto try_again; } int udp_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { /* This check is replicated from __ip4_datagram_connect() and * intended to prevent BPF program called below from accessing bytes * that are out of the bound specified by user in addr_len. */ if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len); } static int udp_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { int res; lock_sock(sk); res = __ip4_datagram_connect(sk, uaddr, addr_len); if (!res) udp4_hash4(sk); release_sock(sk); return res; } int __udp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); /* * 1003.1g - break association. 
*/ sk->sk_state = TCP_CLOSE; inet->inet_daddr = 0; inet->inet_dport = 0; sock_rps_reset_rxhash(sk); sk->sk_bound_dev_if = 0; if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) { inet_reset_saddr(sk); if (sk->sk_prot->rehash && (sk->sk_userlocks & SOCK_BINDPORT_LOCK)) sk->sk_prot->rehash(sk); } if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) { sk->sk_prot->unhash(sk); inet->inet_sport = 0; } sk_dst_reset(sk); return 0; } EXPORT_SYMBOL(__udp_disconnect); int udp_disconnect(struct sock *sk, int flags) { lock_sock(sk); __udp_disconnect(sk, flags); release_sock(sk); return 0; } void udp_lib_unhash(struct sock *sk) { if (sk_hashed(sk)) { struct udp_hslot *hslot, *hslot2; struct net *net = sock_net(sk); struct udp_table *udptable; sock_rps_delete_flow(sk); udptable = net->ipv4.udp_table; hslot = udp_hashslot(udptable, net, udp_sk(sk)->udp_port_hash); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); spin_lock_bh(&hslot->lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); if (sk_del_node_init_rcu(sk)) { hslot->count--; inet_sk(sk)->inet_num = 0; sock_prot_inuse_add(net, sk->sk_prot, -1); spin_lock(&hslot2->lock); hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); hslot2->count--; spin_unlock(&hslot2->lock); udp_unhash4(udptable, sk); } spin_unlock_bh(&hslot->lock); } } /* * inet_rcv_saddr was changed, we must rehash secondary hash */ void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4) { if (sk_hashed(sk)) { struct udp_hslot *hslot, *hslot2, *nhslot2; struct net *net = sock_net(sk); struct udp_table *udptable; udptable = net->ipv4.udp_table; hslot = udp_hashslot(udptable, net, udp_sk(sk)->udp_port_hash); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); nhslot2 = udp_hashslot2(udptable, newhash); if (hslot2 != nhslot2 || rcu_access_pointer(sk->sk_reuseport_cb)) { /* we must lock primary chain too */ spin_lock_bh(&hslot->lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); if (hslot2 != nhslot2) { spin_lock(&hslot2->lock); hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); hslot2->count--; spin_unlock(&hslot2->lock); spin_lock(&nhslot2->lock); hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, &nhslot2->head); nhslot2->count++; spin_unlock(&nhslot2->lock); } spin_unlock_bh(&hslot->lock); } /* Now process hash4 if necessary: * (1) update hslot4; * (2) update hslot2->hash4_cnt. * Note that hslot2/hslot4 should be checked separately, as * either of them may change with the other unchanged. 
*/ if (udp_hashed4(sk)) { spin_lock_bh(&hslot->lock); if (inet_rcv_saddr_any(sk)) { udp_unhash4(udptable, sk); } else { udp_rehash4(udptable, sk, newhash4); if (hslot2 != nhslot2) { spin_lock(&hslot2->lock); udp_hash4_dec(hslot2); spin_unlock(&hslot2->lock); spin_lock(&nhslot2->lock); udp_hash4_inc(nhslot2); spin_unlock(&nhslot2->lock); } } spin_unlock_bh(&hslot->lock); } udp_sk(sk)->udp_portaddr_hash = newhash; } } static void udp_v4_rehash(struct sock *sk) { u16 new_hash = ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, inet_sk(sk)->inet_num); u16 new_hash4 = udp_ehashfn(sock_net(sk), sk->sk_rcv_saddr, sk->sk_num, sk->sk_daddr, sk->sk_dport); udp_lib_rehash(sk, new_hash, new_hash4); } static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int rc; if (inet_sk(sk)->inet_daddr) { sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); sk_incoming_cpu_update(sk); } else { sk_mark_napi_id_once(sk, skb); } rc = __udp_enqueue_schedule_skb(sk, skb); if (rc < 0) { struct net *net = sock_net(sk); int drop_reason; /* Note that an ENOMEM error is charged twice */ if (rc == -ENOMEM) { UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS); drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF; } else { UDP_INC_STATS(net, UDP_MIB_MEMERRORS); drop_reason = SKB_DROP_REASON_PROTO_MEM; } UDP_INC_STATS(net, UDP_MIB_INERRORS); trace_udp_fail_queue_rcv_skb(rc, sk, skb); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } return 0; } /* returns: * -1: error * 0: success * >0: "udp encap" protocol resubmission * * Note that in the success and error cases, the skb is assumed to * have either been requeued or freed. */ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; struct udp_sock *up = udp_sk(sk); struct net *net = sock_net(sk); /* * Charge it to the socket, dropping if the queue is full. */ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { drop_reason = SKB_DROP_REASON_XFRM_POLICY; goto drop; } nf_reset_ct(skb); if (static_branch_unlikely(&udp_encap_needed_key) && READ_ONCE(up->encap_type)) { int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); /* * This is an encapsulation socket so pass the skb to * the socket's udp_encap_rcv() hook. Otherwise, just * fall through and pass this up the UDP socket. * up->encap_rcv() returns the following value: * =0 if skb was successfully passed to the encap * handler or was discarded by it. * >0 if skb should be passed on to UDP. 
* <0 if skb should be resubmitted as proto -N */ /* if we're overly short, let UDP handle it */ encap_rcv = READ_ONCE(up->encap_rcv); if (encap_rcv) { int ret; /* Verify checksum before giving to encap */ if (udp_lib_checksum_complete(skb)) goto csum_error; ret = encap_rcv(sk, skb); if (ret <= 0) { __UDP_INC_STATS(net, UDP_MIB_INDATAGRAMS); return -ret; } } /* FALLTHROUGH -- it's a UDP Packet */ } prefetch(&sk->sk_rmem_alloc); if (rcu_access_pointer(sk->sk_filter) && udp_lib_checksum_complete(skb)) goto csum_error; drop_reason = sk_filter_trim_cap(sk, skb, sizeof(struct udphdr)); if (drop_reason) goto drop; udp_csum_pull_header(skb); ipv4_pktinfo_prepare(sk, skb, true); return __udp_queue_rcv_skb(sk, skb); csum_error: drop_reason = SKB_DROP_REASON_UDP_CSUM; __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS); drop: __UDP_INC_STATS(net, UDP_MIB_INERRORS); udp_drops_inc(sk); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct sk_buff *next, *segs; int ret; if (likely(!udp_unexpected_gso(sk, skb))) return udp_queue_rcv_one_skb(sk, skb); BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_GSO_CB_OFFSET); __skb_push(skb, -skb_mac_offset(skb)); segs = udp_rcv_segment(sk, skb, true); skb_list_walk_safe(segs, skb, next) { __skb_pull(skb, skb_transport_offset(skb)); udp_post_segment_fix_csum(skb); ret = udp_queue_rcv_one_skb(sk, skb); if (ret > 0) ip_protocol_deliver_rcu(dev_net(skb->dev), skb, ret); } return 0; } /* For TCP sockets, sk_rx_dst is protected by socket lock * For UDP, we use xchg() to guard against concurrent changes. */ bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old; if (dst_hold_safe(dst)) { old = unrcu_pointer(xchg(&sk->sk_rx_dst, RCU_INITIALIZER(dst))); dst_release(old); return old != dst; } return false; } /* * Multicasts and broadcasts go to each listener. * * Note: called only from the BH handler context. */ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct udphdr *uh, __be32 saddr, __be32 daddr) { struct udp_table *udptable = net->ipv4.udp_table; unsigned int hash2, hash2_any, offset; unsigned short hnum = ntohs(uh->dest); struct sock *sk, *first = NULL; int dif = skb->dev->ifindex; int sdif = inet_sdif(skb); struct hlist_node *node; struct udp_hslot *hslot; struct sk_buff *nskb; bool use_hash2; hash2_any = 0; hash2 = 0; hslot = udp_hashslot(udptable, net, hnum); use_hash2 = hslot->count > 10; offset = offsetof(typeof(*sk), sk_node); if (use_hash2) { hash2_any = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & udptable->mask; hash2 = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: hslot = &udptable->hash2[hash2].hslot; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); } sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) { if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr, uh->source, saddr, dif, sdif, hnum)) continue; if (!first) { first = sk; continue; } nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { udp_drops_inc(sk); __UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS); __UDP_INC_STATS(net, UDP_MIB_INERRORS); continue; } if (udp_queue_rcv_skb(sk, nskb) > 0) consume_skb(nskb); } /* Also lookup *:port if we are using hash2 and haven't done so yet. 
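 * This second pass delivers to sockets bound to the wildcard address as well.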
*/ if (use_hash2 && hash2 != hash2_any) { hash2 = hash2_any; goto start_lookup; } if (first) { if (udp_queue_rcv_skb(first, skb) > 0) consume_skb(skb); } else { kfree_skb(skb); __UDP_INC_STATS(net, UDP_MIB_IGNOREDMULTI); } return 0; } /* Initialize UDP checksum. If exited with zero value (success), * CHECKSUM_UNNECESSARY means, that no more checks are required. * Otherwise, csum completion requires checksumming packet body, * including udp header and folding it to skb->csum. */ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh) { int err; /* Note, we are only interested in != 0 or == 0, thus the * force to int. */ err = (__force int)skb_checksum_init_zero_check(skb, IPPROTO_UDP, uh->check, inet_compute_pseudo); if (err) return err; if (skb->ip_summed == CHECKSUM_COMPLETE && !skb->csum_valid) { /* If SW calculated the value, we know it's bad */ if (skb->csum_complete_sw) return 1; /* HW says the value is bad. Let's validate that. * skb->csum is no longer the full packet checksum, * so don't treat it as such. */ skb_checksum_complete_unset(skb); } return 0; } /* wrapper for udp_queue_rcv_skb taking care of csum conversion and * return code conversion for ip layer consumption */ static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, struct udphdr *uh) { int ret; if (inet_get_convert_csum(sk) && uh->check) skb_checksum_try_convert(skb, IPPROTO_UDP, inet_compute_pseudo); ret = udp_queue_rcv_skb(sk, skb); /* a return value > 0 means to resubmit the input, but * it wants the return to be -protocol, or 0 */ if (ret > 0) return -ret; return 0; } /* * All we need to do is get the socket, and then do a checksum. */ int udp_rcv(struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct net *net = dev_net(skb->dev); struct sock *sk = NULL; unsigned short ulen; __be32 saddr, daddr; struct udphdr *uh; bool refcounted; int drop_reason; drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; /* * Validate the packet. */ if (!pskb_may_pull(skb, sizeof(struct udphdr))) goto drop; /* No space for header. */ uh = udp_hdr(skb); ulen = ntohs(uh->len); saddr = ip_hdr(skb)->saddr; daddr = ip_hdr(skb)->daddr; if (ulen > skb->len) goto short_packet; if (ulen < sizeof(*uh)) goto short_packet; if (ulen < skb->len) { if (pskb_trim_rcsum(skb, ulen)) goto short_packet; uh = udp_hdr(skb); } if (udp4_csum_init(skb, uh)) goto csum_error; sk = inet_steal_sock(net, skb, sizeof(struct udphdr), saddr, uh->source, daddr, uh->dest, &refcounted, udp_ehashfn); if (IS_ERR(sk)) goto no_sk; if (sk) { struct dst_entry *dst = skb_dst(skb); int ret; if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst)) udp_sk_rx_dst_set(sk, dst); ret = udp_unicast_rcv_skb(sk, skb, uh); if (refcounted) sock_put(sk); return ret; } if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return __udp4_lib_mcast_deliver(net, skb, uh, saddr, daddr); sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest); if (sk) return udp_unicast_rcv_skb(sk, skb, uh); no_sk: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; nf_reset_ct(skb); /* No socket. Drop packet silently, if checksum is wrong */ if (udp_lib_checksum_complete(skb)) goto csum_error; drop_reason = SKB_DROP_REASON_NO_SOCKET; __UDP_INC_STATS(net, UDP_MIB_NOPORTS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); /* * Hmm. We got an UDP packet to a port to which we * don't wanna listen. Ignore it. 
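 * (The ICMP port-unreachable reply was already sent above.)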
*/ sk_skb_reason_drop(sk, skb, drop_reason); return 0; short_packet: drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; net_dbg_ratelimited("UDP: short packet: From %pI4:%u %d/%d to %pI4:%u\n", &saddr, ntohs(uh->source), ulen, skb->len, &daddr, ntohs(uh->dest)); goto drop; csum_error: /* * RFC1122: OK. Discards the bad packet silently (as far as * the network is concerned, anyway) as per 4.1.3.4 (MUST). */ drop_reason = SKB_DROP_REASON_UDP_CSUM; net_dbg_ratelimited("UDP: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n", &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest), ulen); __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS); drop: __UDP_INC_STATS(net, UDP_MIB_INERRORS); sk_skb_reason_drop(sk, skb, drop_reason); return 0; } /* We can only early demux multicast if there is a single matching socket. * If more than one socket found returns NULL */ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, __be16 loc_port, __be32 loc_addr, __be16 rmt_port, __be32 rmt_addr, int dif, int sdif) { struct udp_table *udptable = net->ipv4.udp_table; unsigned short hnum = ntohs(loc_port); struct sock *sk, *result; struct udp_hslot *hslot; unsigned int slot; slot = udp_hashfn(net, hnum, udptable->mask); hslot = &udptable->hash[slot]; /* Do not bother scanning a too big list */ if (hslot->count > 10) return NULL; result = NULL; sk_for_each_rcu(sk, &hslot->head) { if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr, rmt_port, rmt_addr, dif, sdif, hnum)) { if (result) return NULL; result = sk; } } return result; } /* For unicast we should only early demux connected sockets or we can * break forwarding setups. The chains here can be long so only check * if the first socket is an exact match and if not move on. */ static struct sock *__udp4_lib_demux_lookup(struct net *net, __be16 loc_port, __be32 loc_addr, __be16 rmt_port, __be32 rmt_addr, int dif, int sdif) { struct udp_table *udptable = net->ipv4.udp_table; INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); unsigned short hnum = ntohs(loc_port); struct udp_hslot *hslot2; unsigned int hash2; __portpair ports; struct sock *sk; hash2 = ipv4_portaddr_hash(net, loc_addr, hnum); hslot2 = udp_hashslot2(udptable, hash2); ports = INET_COMBINED_PORTS(rmt_port, hnum); udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { if (inet_match(net, sk, acookie, ports, dif, sdif)) return sk; /* Only check first socket in chain */ break; } return NULL; } enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct in_device *in_dev = NULL; const struct iphdr *iph; const struct udphdr *uh; struct sock *sk = NULL; struct dst_entry *dst; int dif = skb->dev->ifindex; int sdif = inet_sdif(skb); int ours; /* validate the packet */ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) return SKB_NOT_DROPPED_YET; iph = ip_hdr(skb); uh = udp_hdr(skb); if (skb->pkt_type == PACKET_MULTICAST) { in_dev = __in_dev_get_rcu(skb->dev); if (!in_dev) return SKB_NOT_DROPPED_YET; ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, iph->protocol); if (!ours) return SKB_NOT_DROPPED_YET; sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, dif, sdif); } else if (skb->pkt_type == PACKET_HOST) { sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, dif, sdif); } if (!sk) return SKB_NOT_DROPPED_YET; skb->sk = sk; DEBUG_NET_WARN_ON_ONCE(sk_is_refcounted(sk)); skb->destructor = sock_pfree; dst = rcu_dereference(sk->sk_rx_dst); if (dst) dst = dst_check(dst, 0); if (dst) { u32 itag = 0; 
/* set noref for now. * any place which wants to hold dst has to call * dst_hold_safe() */ skb_dst_set_noref(skb, dst); /* for unconnected multicast sockets we need to validate * the source on each packet */ if (!inet_sk(sk)->inet_daddr && in_dev) return ip_mc_validate_source(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), skb->dev, in_dev, &itag); } return SKB_NOT_DROPPED_YET; } static void udp_destroy_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); bool slow = lock_sock_fast(sk); /* protects from races with udp_abort() */ sock_set_flag(sk, SOCK_DEAD); udp_flush_pending_frames(sk); unlock_sock_fast(sk, slow); if (static_branch_unlikely(&udp_encap_needed_key)) { if (up->encap_type) { void (*encap_destroy)(struct sock *sk); encap_destroy = READ_ONCE(up->encap_destroy); if (encap_destroy) encap_destroy(sk); } if (udp_test_bit(ENCAP_ENABLED, sk)) { static_branch_dec(&udp_encap_needed_key); udp_tunnel_cleanup_gro(sk); } } } typedef struct sk_buff *(*udp_gro_receive_t)(struct sock *sk, struct list_head *head, struct sk_buff *skb); static void set_xfrm_gro_udp_encap_rcv(__u16 encap_type, unsigned short family, struct sock *sk) { #ifdef CONFIG_XFRM udp_gro_receive_t new_gro_receive; if (udp_test_bit(GRO_ENABLED, sk) && encap_type == UDP_ENCAP_ESPINUDP) { if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) new_gro_receive = xfrm6_gro_udp_encap_rcv; else new_gro_receive = xfrm4_gro_udp_encap_rcv; if (udp_sk(sk)->gro_receive != new_gro_receive) { /* * With IPV6_ADDRFORM the gro callback could change * after being set, unregister the old one, if valid. */ if (udp_sk(sk)->gro_receive) udp_tunnel_update_gro_rcv(sk, false); WRITE_ONCE(udp_sk(sk)->gro_receive, new_gro_receive); udp_tunnel_update_gro_rcv(sk, true); } } #endif } /* * Socket option code for UDP */ int udp_lib_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen, int (*push_pending_frames)(struct sock *)) { struct udp_sock *up = udp_sk(sk); int val, valbool; int err = 0; if (level == SOL_SOCKET) { err = sk_setsockopt(sk, level, optname, optval, optlen); if (optname == SO_RCVBUF || optname == SO_RCVBUFFORCE) { sockopt_lock_sock(sk); /* paired with READ_ONCE in udp_rmem_release() */ WRITE_ONCE(up->forward_threshold, sk->sk_rcvbuf >> 2); sockopt_release_sock(sk); } return err; } if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; valbool = val ? 
1 : 0; switch (optname) { case UDP_CORK: if (val != 0) { udp_set_bit(CORK, sk); } else { udp_clear_bit(CORK, sk); lock_sock(sk); push_pending_frames(sk); release_sock(sk); } break; case UDP_ENCAP: sockopt_lock_sock(sk); switch (val) { case 0: #ifdef CONFIG_XFRM case UDP_ENCAP_ESPINUDP: set_xfrm_gro_udp_encap_rcv(val, sk->sk_family, sk); #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) WRITE_ONCE(up->encap_rcv, xfrm6_udp_encap_rcv); else #endif WRITE_ONCE(up->encap_rcv, xfrm4_udp_encap_rcv); #endif fallthrough; case UDP_ENCAP_L2TPINUDP: WRITE_ONCE(up->encap_type, val); udp_tunnel_encap_enable(sk); break; default: err = -ENOPROTOOPT; break; } sockopt_release_sock(sk); break; case UDP_NO_CHECK6_TX: udp_set_no_check6_tx(sk, valbool); break; case UDP_NO_CHECK6_RX: udp_set_no_check6_rx(sk, valbool); break; case UDP_SEGMENT: if (val < 0 || val > USHRT_MAX) return -EINVAL; WRITE_ONCE(up->gso_size, val); break; case UDP_GRO: sockopt_lock_sock(sk); /* when enabling GRO, accept the related GSO packet type */ if (valbool) udp_tunnel_encap_enable(sk); udp_assign_bit(GRO_ENABLED, sk, valbool); udp_assign_bit(ACCEPT_L4, sk, valbool); set_xfrm_gro_udp_encap_rcv(up->encap_type, sk->sk_family, sk); sockopt_release_sock(sk); break; default: err = -ENOPROTOOPT; break; } return err; } static int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { if (level == SOL_UDP || level == SOL_SOCKET) return udp_lib_setsockopt(sk, level, optname, optval, optlen, udp_push_pending_frames); return ip_setsockopt(sk, level, optname, optval, optlen); } int udp_lib_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct udp_sock *up = udp_sk(sk); int val, len; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; len = min_t(unsigned int, len, sizeof(int)); switch (optname) { case UDP_CORK: val = udp_test_bit(CORK, sk); break; case UDP_ENCAP: val = READ_ONCE(up->encap_type); break; case UDP_NO_CHECK6_TX: val = udp_get_no_check6_tx(sk); break; case UDP_NO_CHECK6_RX: val = udp_get_no_check6_rx(sk); break; case UDP_SEGMENT: val = READ_ONCE(up->gso_size); break; case UDP_GRO: val = udp_test_bit(GRO_ENABLED, sk); break; default: return -ENOPROTOOPT; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static int udp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { if (level == SOL_UDP) return udp_lib_getsockopt(sk, level, optname, optval, optlen); return ip_getsockopt(sk, level, optname, optval, optlen); } /** * udp_poll - wait for a UDP event. * @file: - file struct * @sock: - socket * @wait: - poll table * * This is same as datagram poll, except for the special case of * blocking sockets. If application is using a blocking fd * and a packet with checksum error is in the queue; * then it could get return from select indicating data available * but then block when reading it. Add special case code * to work around these arguably broken applications. 
*/ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait) { __poll_t mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; if (!skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Check for false positives due to checksum errors */ if ((mask & EPOLLRDNORM) && !(file->f_flags & O_NONBLOCK) && !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1) mask &= ~(EPOLLIN | EPOLLRDNORM); /* psock ingress_msg queue should not contain any bad checksum frames */ if (sk_is_readable(sk)) mask |= EPOLLIN | EPOLLRDNORM; return mask; } int udp_abort(struct sock *sk, int err) { if (!has_current_bpf_ctx()) lock_sock(sk); /* udp{v6}_destroy_sock() sets it under the sk lock, avoid racing * with close() */ if (sock_flag(sk, SOCK_DEAD)) goto out; sk->sk_err = err; sk_error_report(sk); __udp_disconnect(sk, 0); out: if (!has_current_bpf_ctx()) release_sock(sk); return 0; } struct proto udp_prot = { .name = "UDP", .owner = THIS_MODULE, .close = udp_lib_close, .pre_connect = udp_pre_connect, .connect = udp_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, .init = udp_init_sock, .destroy = udp_destroy_sock, .setsockopt = udp_setsockopt, .getsockopt = udp_getsockopt, .sendmsg = udp_sendmsg, .recvmsg = udp_recvmsg, .splice_eof = udp_splice_eof, .release_cb = ip4_datagram_release_cb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, .rehash = udp_v4_rehash, .get_port = udp_v4_get_port, .put_port = udp_lib_unhash, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = udp_bpf_update_proto, #endif .memory_allocated = &net_aligned_data.udp_memory_allocated, .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, .sysctl_mem = sysctl_udp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp_sock), .diag_destroy = udp_abort, }; EXPORT_SYMBOL(udp_prot); /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS static unsigned short seq_file_family(const struct seq_file *seq); static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) { unsigned short family = seq_file_family(seq); /* AF_UNSPEC is used as a match all */ return ((family == AF_UNSPEC || family == sk->sk_family) && net_eq(sock_net(sk), seq_file_net(seq))); } #ifdef CONFIG_BPF_SYSCALL static const struct seq_operations bpf_iter_udp_seq_ops; #endif static struct sock *udp_get_first(struct seq_file *seq, int start) { struct udp_iter_state *state = seq->private; struct net *net = seq_file_net(seq); struct udp_table *udptable; struct sock *sk; udptable = net->ipv4.udp_table; for (state->bucket = start; state->bucket <= udptable->mask; ++state->bucket) { struct udp_hslot *hslot = &udptable->hash[state->bucket]; if (hlist_empty(&hslot->head)) continue; spin_lock_bh(&hslot->lock); sk_for_each(sk, &hslot->head) { if (seq_sk_match(seq, sk)) goto found; } spin_unlock_bh(&hslot->lock); } sk = NULL; found: return sk; } static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) { struct udp_iter_state *state = seq->private; struct net *net = seq_file_net(seq); struct udp_table *udptable; do { sk = sk_next(sk); } while (sk && !seq_sk_match(seq, sk)); if (!sk) { udptable = net->ipv4.udp_table; if (state->bucket <= udptable->mask) spin_unlock_bh(&udptable->hash[state->bucket].lock); return udp_get_first(seq, state->bucket + 1); } return sk; } static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos) { struct 
sock *sk = udp_get_first(seq, 0); if (sk) while (pos && (sk = udp_get_next(seq, sk)) != NULL) --pos; return pos ? NULL : sk; } void *udp_seq_start(struct seq_file *seq, loff_t *pos) { struct udp_iter_state *state = seq->private; state->bucket = MAX_UDP_PORTS; return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN; } void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock *sk; if (v == SEQ_START_TOKEN) sk = udp_get_idx(seq, 0); else sk = udp_get_next(seq, v); ++*pos; return sk; } void udp_seq_stop(struct seq_file *seq, void *v) { struct udp_iter_state *state = seq->private; struct udp_table *udptable; udptable = seq_file_net(seq)->ipv4.udp_table; if (state->bucket <= udptable->mask) spin_unlock_bh(&udptable->hash[state->bucket].lock); } /* ------------------------------------------------------------------------ */ static void udp4_format_sock(struct sock *sp, struct seq_file *f, int bucket) { struct inet_sock *inet = inet_sk(sp); __be32 dest = inet->inet_daddr; __be32 src = inet->inet_rcv_saddr; __u16 destp = ntohs(inet->inet_dport); __u16 srcp = ntohs(inet->inet_sport); seq_printf(f, "%5d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5u %8d %llu %d %pK %u", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), udp_rqueue_get(sp), 0, 0L, 0, from_kuid_munged(seq_user_ns(f), sk_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, sk_drops_read(sp)); } static int udp4_seq_show(struct seq_file *seq, void *v) { seq_setwidth(seq, 127); if (v == SEQ_START_TOKEN) seq_puts(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode ref pointer drops"); else { struct udp_iter_state *state = seq->private; udp4_format_sock(v, seq, state->bucket); } seq_pad(seq, '\n'); return 0; } #ifdef CONFIG_BPF_SYSCALL struct bpf_iter__udp { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct udp_sock *, udp_sk); uid_t uid __aligned(8); int bucket __aligned(8); }; union bpf_udp_iter_batch_item { struct sock *sk; __u64 cookie; }; struct bpf_udp_iter_state { struct udp_iter_state state; unsigned int cur_sk; unsigned int end_sk; unsigned int max_sk; union bpf_udp_iter_batch_item *batch; }; static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter, unsigned int new_batch_sz, gfp_t flags); static struct sock *bpf_iter_udp_resume(struct sock *first_sk, union bpf_udp_iter_batch_item *cookies, int n_cookies) { struct sock *sk = NULL; int i; for (i = 0; i < n_cookies; i++) { sk = first_sk; udp_portaddr_for_each_entry_from(sk) if (cookies[i].cookie == atomic64_read(&sk->sk_cookie)) goto done; } done: return sk; } static struct sock *bpf_iter_udp_batch(struct seq_file *seq) { struct bpf_udp_iter_state *iter = seq->private; struct udp_iter_state *state = &iter->state; unsigned int find_cookie, end_cookie; struct net *net = seq_file_net(seq); struct udp_table *udptable; unsigned int batch_sks = 0; int resume_bucket; int resizes = 0; struct sock *sk; int err = 0; resume_bucket = state->bucket; /* The current batch is done, so advance the bucket. */ if (iter->cur_sk == iter->end_sk) state->bucket++; udptable = net->ipv4.udp_table; again: /* New batch for the next bucket. * Iterate over the hash table to find a bucket with sockets matching * the iterator attributes, and return the first matching socket from * the bucket. The remaining matched sockets from the bucket are batched * before releasing the bucket lock. This allows BPF programs that are * called in seq_show to acquire the bucket lock if needed. 
*/ find_cookie = iter->cur_sk; end_cookie = iter->end_sk; iter->cur_sk = 0; iter->end_sk = 0; batch_sks = 0; for (; state->bucket <= udptable->mask; state->bucket++) { struct udp_hslot *hslot2 = &udptable->hash2[state->bucket].hslot; if (hlist_empty(&hslot2->head)) goto next_bucket; spin_lock_bh(&hslot2->lock); sk = hlist_entry_safe(hslot2->head.first, struct sock, __sk_common.skc_portaddr_node); /* Resume from the first (in iteration order) unseen socket from * the last batch that still exists in resume_bucket. Most of * the time this will just be where the last iteration left off * in resume_bucket unless that socket disappeared between * reads. */ if (state->bucket == resume_bucket) sk = bpf_iter_udp_resume(sk, &iter->batch[find_cookie], end_cookie - find_cookie); fill_batch: udp_portaddr_for_each_entry_from(sk) { if (seq_sk_match(seq, sk)) { if (iter->end_sk < iter->max_sk) { sock_hold(sk); iter->batch[iter->end_sk++].sk = sk; } batch_sks++; } } /* Allocate a larger batch and try again. */ if (unlikely(resizes <= 1 && iter->end_sk && iter->end_sk != batch_sks)) { resizes++; /* First, try with GFP_USER to maximize the chances of * grabbing more memory. */ if (resizes == 1) { spin_unlock_bh(&hslot2->lock); err = bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2, GFP_USER); if (err) return ERR_PTR(err); /* Start over. */ goto again; } /* Next, hold onto the lock, so the bucket doesn't * change while we get the rest of the sockets. */ err = bpf_iter_udp_realloc_batch(iter, batch_sks, GFP_NOWAIT); if (err) { spin_unlock_bh(&hslot2->lock); return ERR_PTR(err); } /* Pick up where we left off. */ sk = iter->batch[iter->end_sk - 1].sk; sk = hlist_entry_safe(sk->__sk_common.skc_portaddr_node.next, struct sock, __sk_common.skc_portaddr_node); batch_sks = iter->end_sk; goto fill_batch; } spin_unlock_bh(&hslot2->lock); if (iter->end_sk) break; next_bucket: resizes = 0; } WARN_ON_ONCE(iter->end_sk != batch_sks); return iter->end_sk ? iter->batch[0].sk : NULL; } static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bpf_udp_iter_state *iter = seq->private; struct sock *sk; /* Whenever seq_next() is called, the iter->cur_sk is * done with seq_show(), so unref the iter->cur_sk. */ if (iter->cur_sk < iter->end_sk) sock_put(iter->batch[iter->cur_sk++].sk); /* After updating iter->cur_sk, check if there are more sockets * available in the current bucket batch. */ if (iter->cur_sk < iter->end_sk) sk = iter->batch[iter->cur_sk].sk; else /* Prepare a new batch. */ sk = bpf_iter_udp_batch(seq); ++*pos; return sk; } static void *bpf_iter_udp_seq_start(struct seq_file *seq, loff_t *pos) { /* bpf iter does not support lseek, so it always * continue from where it was stop()-ped. 
*/ if (*pos) return bpf_iter_udp_batch(seq); return SEQ_START_TOKEN; } static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, struct udp_sock *udp_sk, uid_t uid, int bucket) { struct bpf_iter__udp ctx; meta->seq_num--; /* skip SEQ_START_TOKEN */ ctx.meta = meta; ctx.udp_sk = udp_sk; ctx.uid = uid; ctx.bucket = bucket; return bpf_iter_run_prog(prog, &ctx); } static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v) { struct udp_iter_state *state = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; struct sock *sk = v; uid_t uid; int ret; if (v == SEQ_START_TOKEN) return 0; lock_sock(sk); if (unlikely(sk_unhashed(sk))) { ret = SEQ_SKIP; goto unlock; } uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk)); meta.seq = seq; prog = bpf_iter_get_info(&meta, false); ret = udp_prog_seq_show(prog, &meta, v, uid, state->bucket); unlock: release_sock(sk); return ret; } static void bpf_iter_udp_put_batch(struct bpf_udp_iter_state *iter) { union bpf_udp_iter_batch_item *item; unsigned int cur_sk = iter->cur_sk; __u64 cookie; /* Remember the cookies of the sockets we haven't seen yet, so we can * pick up where we left off next time around. */ while (cur_sk < iter->end_sk) { item = &iter->batch[cur_sk++]; cookie = sock_gen_cookie(item->sk); sock_put(item->sk); item->cookie = cookie; } } static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v) { struct bpf_udp_iter_state *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)udp_prog_seq_show(prog, &meta, v, 0, 0); } if (iter->cur_sk < iter->end_sk) bpf_iter_udp_put_batch(iter); } static const struct seq_operations bpf_iter_udp_seq_ops = { .start = bpf_iter_udp_seq_start, .next = bpf_iter_udp_seq_next, .stop = bpf_iter_udp_seq_stop, .show = bpf_iter_udp_seq_show, }; #endif static unsigned short seq_file_family(const struct seq_file *seq) { const struct udp_seq_afinfo *afinfo; #ifdef CONFIG_BPF_SYSCALL /* BPF iterator: bpf programs to filter sockets. 
*/ if (seq->op == &bpf_iter_udp_seq_ops) return AF_UNSPEC; #endif /* Proc fs iterator */ afinfo = pde_data(file_inode(seq->file)); return afinfo->family; } static const struct seq_operations udp_seq_ops = { .start = udp_seq_start, .next = udp_seq_next, .stop = udp_seq_stop, .show = udp4_seq_show, }; static struct udp_seq_afinfo udp4_seq_afinfo = { .family = AF_INET, }; static int __net_init udp4_proc_init_net(struct net *net) { if (!proc_create_net_data("udp", 0444, net->proc_net, &udp_seq_ops, sizeof(struct udp_iter_state), &udp4_seq_afinfo)) return -ENOMEM; return 0; } static void __net_exit udp4_proc_exit_net(struct net *net) { remove_proc_entry("udp", net->proc_net); } static struct pernet_operations udp4_net_ops = { .init = udp4_proc_init_net, .exit = udp4_proc_exit_net, }; int __init udp4_proc_init(void) { return register_pernet_subsys(&udp4_net_ops); } void udp4_proc_exit(void) { unregister_pernet_subsys(&udp4_net_ops); } #endif /* CONFIG_PROC_FS */ static __initdata unsigned long uhash_entries; static int __init set_uhash_entries(char *str) { ssize_t ret; if (!str) return 0; ret = kstrtoul(str, 0, &uhash_entries); if (ret) return 0; if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN) uhash_entries = UDP_HTABLE_SIZE_MIN; return 1; } __setup("uhash_entries=", set_uhash_entries); static void __init udp_table_init(struct udp_table *table, const char *name) { unsigned int i, slot_size; slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main) + udp_hash4_slot_size(); table->hash = alloc_large_system_hash(name, slot_size, uhash_entries, 21, /* one slot per 2 MB */ 0, &table->log, &table->mask, UDP_HTABLE_SIZE_MIN, UDP_HTABLE_SIZE_MAX); table->hash2 = (void *)(table->hash + (table->mask + 1)); for (i = 0; i <= table->mask; i++) { INIT_HLIST_HEAD(&table->hash[i].head); table->hash[i].count = 0; spin_lock_init(&table->hash[i].lock); } for (i = 0; i <= table->mask; i++) { INIT_HLIST_HEAD(&table->hash2[i].hslot.head); table->hash2[i].hslot.count = 0; spin_lock_init(&table->hash2[i].hslot.lock); } udp_table_hash4_init(table); } u32 udp_flow_hashrnd(void) { static u32 hashrnd __read_mostly; net_get_random_once(&hashrnd, sizeof(hashrnd)); return hashrnd; } EXPORT_SYMBOL(udp_flow_hashrnd); static void __net_init udp_sysctl_init(struct net *net) { net->ipv4.sysctl_udp_rmem_min = PAGE_SIZE; net->ipv4.sysctl_udp_wmem_min = PAGE_SIZE; #ifdef CONFIG_NET_L3_MASTER_DEV net->ipv4.sysctl_udp_l3mdev_accept = 0; #endif } static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_entries) { struct udp_table *udptable; unsigned int slot_size; int i; udptable = kmalloc_obj(*udptable); if (!udptable) goto out; slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main) + udp_hash4_slot_size(); udptable->hash = vmalloc_huge(hash_entries * slot_size, GFP_KERNEL_ACCOUNT); if (!udptable->hash) goto free_table; udptable->hash2 = (void *)(udptable->hash + hash_entries); udptable->mask = hash_entries - 1; udptable->log = ilog2(hash_entries); for (i = 0; i < hash_entries; i++) { INIT_HLIST_HEAD(&udptable->hash[i].head); udptable->hash[i].count = 0; spin_lock_init(&udptable->hash[i].lock); INIT_HLIST_HEAD(&udptable->hash2[i].hslot.head); udptable->hash2[i].hslot.count = 0; spin_lock_init(&udptable->hash2[i].hslot.lock); } udp_table_hash4_init(udptable); return udptable; free_table: kfree(udptable); out: return NULL; } static void __net_exit udp_pernet_table_free(struct net *net) { struct udp_table *udptable = net->ipv4.udp_table; if (udptable == &udp_table) return; 
kvfree(udptable->hash); kfree(udptable); } static void __net_init udp_set_table(struct net *net) { struct udp_table *udptable; unsigned int hash_entries; struct net *old_net; if (net_eq(net, &init_net)) goto fallback; old_net = current->nsproxy->net_ns; hash_entries = READ_ONCE(old_net->ipv4.sysctl_udp_child_hash_entries); if (!hash_entries) goto fallback; /* Set min to keep the bitmap on stack in udp_lib_get_port() */ if (hash_entries < UDP_HTABLE_SIZE_MIN_PERNET) hash_entries = UDP_HTABLE_SIZE_MIN_PERNET; else hash_entries = roundup_pow_of_two(hash_entries); udptable = udp_pernet_table_alloc(hash_entries); if (udptable) { net->ipv4.udp_table = udptable; } else { pr_warn("Failed to allocate UDP hash table (entries: %u) " "for a netns, fallback to the global one\n", hash_entries); fallback: net->ipv4.udp_table = &udp_table; } } static int __net_init udp_pernet_init(struct net *net) { #if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) int i; /* No tunnel is configured */ for (i = 0; i < ARRAY_SIZE(net->ipv4.udp_tunnel_gro); ++i) { INIT_HLIST_HEAD(&net->ipv4.udp_tunnel_gro[i].list); RCU_INIT_POINTER(net->ipv4.udp_tunnel_gro[i].sk, NULL); } #endif udp_sysctl_init(net); udp_set_table(net); return 0; } static void __net_exit udp_pernet_exit(struct net *net) { udp_pernet_table_free(net); } static struct pernet_operations __net_initdata udp_sysctl_ops = { .init = udp_pernet_init, .exit = udp_pernet_exit, }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta, struct udp_sock *udp_sk, uid_t uid, int bucket) static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter, unsigned int new_batch_sz, gfp_t flags) { union bpf_udp_iter_batch_item *new_batch; new_batch = kvmalloc_objs(*new_batch, new_batch_sz, flags | __GFP_NOWARN); if (!new_batch) return -ENOMEM; if (flags != GFP_NOWAIT) bpf_iter_udp_put_batch(iter); memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk); kvfree(iter->batch); iter->batch = new_batch; iter->max_sk = new_batch_sz; return 0; } #define INIT_BATCH_SZ 16 static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux) { struct bpf_udp_iter_state *iter = priv_data; int ret; ret = bpf_iter_init_seq_net(priv_data, aux); if (ret) return ret; ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER); if (ret) bpf_iter_fini_seq_net(priv_data); iter->state.bucket = -1; return ret; } static void bpf_iter_fini_udp(void *priv_data) { struct bpf_udp_iter_state *iter = priv_data; bpf_iter_fini_seq_net(priv_data); kvfree(iter->batch); } static const struct bpf_iter_seq_info udp_seq_info = { .seq_ops = &bpf_iter_udp_seq_ops, .init_seq_private = bpf_iter_init_udp, .fini_seq_private = bpf_iter_fini_udp, .seq_priv_size = sizeof(struct bpf_udp_iter_state), }; static struct bpf_iter_reg udp_reg_info = { .target = "udp", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__udp, udp_sk), PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, }, .seq_info = &udp_seq_info, }; static void __init bpf_iter_register(void) { udp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UDP]; if (bpf_iter_reg_target(&udp_reg_info)) pr_warn("Warning: could not register bpf iterator udp\n"); } #endif void __init udp_init(void) { unsigned long limit; udp_table_init(&udp_table, "UDP"); limit = nr_free_buffer_pages() / 8; limit = max(limit, 128UL); sysctl_udp_mem[0] = limit / 4 * 3; sysctl_udp_mem[1] = limit; sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2; if (register_pernet_subsys(&udp_sysctl_ops)) panic("UDP: failed to init 
sysctl parameters.\n"); #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) bpf_iter_register(); #endif }
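/*
 * Appended editorial example -- NOT part of net/ipv4/udp.c. A minimal
 * userspace sketch showing how the UDP_SEGMENT and UDP_GRO socket options
 * handled by udp_lib_setsockopt() above are exercised from an application.
 * The option values mirror include/uapi/linux/udp.h; the addresses and
 * payload size are illustrative assumptions only.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef UDP_SEGMENT
#define UDP_SEGMENT	103	/* per-socket GSO segment size */
#endif
#ifndef UDP_GRO
#define UDP_GRO		104	/* accept GRO-aggregated datagrams on rx */
#endif

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int gso_size = 1400;	/* kernel splits sends into 1400-byte datagrams */
	int on = 1;

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* Level SOL_UDP == IPPROTO_UDP; udp_lib_setsockopt() stores this in
	 * up->gso_size (must be 0..USHRT_MAX per the UDP_SEGMENT case above).
	 */
	if (setsockopt(fd, IPPROTO_UDP, UDP_SEGMENT, &gso_size, sizeof(gso_size)))
		perror("setsockopt(UDP_SEGMENT)");

	/* Sets the GRO_ENABLED/ACCEPT_L4 bits and enables the tunnel encap
	 * static key, as in the UDP_GRO case of udp_lib_setsockopt().
	 */
	if (setsockopt(fd, IPPROTO_UDP, UDP_GRO, &on, sizeof(on)))
		perror("setsockopt(UDP_GRO)");

	close(fd);
	return 0;
}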
7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 
7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 
8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2018 Facebook */

#include <uapi/linux/btf.h>
#include <uapi/linux/bpf.h>
#include <uapi/linux/bpf_perf_event.h>
#include <uapi/linux/types.h>
#include <linux/seq_file.h>
#include <linux/compiler.h>
#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/idr.h>
#include <linux/sort.h>
#include <linux/bpf_verifier.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/bpf.h>
#include <linux/bpf_lsm.h>
#include <linux/skmsg.h>
#include <linux/perf_event.h>
#include <linux/bsearch.h>
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/sysfs.h>
#include <linux/overflow.h>
#include <net/netfilter/nf_bpf_link.h>
#include <net/sock.h>
#include <net/xdp.h>
#include "../tools/lib/bpf/relo_core.h"

/* BTF (BPF Type Format) is the metadata format which describes
 * the data types of BPF programs/maps. Hence, it basically focuses
 * on the C programming language, which modern BPF primarily uses.
 *
 * ELF Section:
 * ~~~~~~~~~~~
 * The BTF data is stored under the ".BTF" ELF section.
 *
 * struct btf_type:
 * ~~~~~~~~~~~~~~~
 * Each 'struct btf_type' object describes a C data type.
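 *
 * For reference, a sketch of its layout (see uapi/linux/btf.h for the
 * authoritative definition):
 *
 *	struct btf_type {
 *		__u32 name_off;		// offset into the string section
 *		__u32 info;		// vlen (bits 0-15), kind (bits 24-28),
 *					// kind_flag (bit 31)
 *		union {
 *			__u32 size;	// used by INT, ENUM, STRUCT, ...
 *			__u32 type;	// type_id, used by PTR, TYPEDEF, CONST, ...
 *		};
 *	};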
* Depending on the type it is describing, a 'struct btf_type' * object may be followed by more data. F.e. * To describe an array, 'struct btf_type' is followed by * 'struct btf_array'. * * 'struct btf_type' and any extra data following it are * 4 bytes aligned. * * Type section: * ~~~~~~~~~~~~~ * The BTF type section contains a list of 'struct btf_type' objects. * Each one describes a C type. Recall from the above section * that a 'struct btf_type' object could be immediately followed by extra * data in order to describe some particular C types. * * type_id: * ~~~~~~~ * Each btf_type object is identified by a type_id. The type_id * is implicitly implied by the location of the btf_type object in * the BTF type section. The first one has type_id 1. The second * one has type_id 2...etc. Hence, an earlier btf_type has * a smaller type_id. * * A btf_type object may refer to another btf_type object by using * type_id (i.e. the "type" in the "struct btf_type"). * * NOTE that we cannot assume any reference-order. * A btf_type object can refer to an earlier btf_type object * but it can also refer to a later btf_type object. * * For example, to describe "const void *". A btf_type * object describing "const" may refer to another btf_type * object describing "void *". This type-reference is done * by specifying type_id: * * [1] CONST (anon) type_id=2 * [2] PTR (anon) type_id=0 * * The above is the btf_verifier debug log: * - Each line started with "[?]" is a btf_type object * - [?] is the type_id of the btf_type object. * - CONST/PTR is the BTF_KIND_XXX * - "(anon)" is the name of the type. It just * happens that CONST and PTR has no name. * - type_id=XXX is the 'u32 type' in btf_type * * NOTE: "void" has type_id 0 * * String section: * ~~~~~~~~~~~~~~ * The BTF string section contains the names used by the type section. * Each string is referred by an "offset" from the beginning of the * string section. * * Each string is '\0' terminated. * * The first character in the string section must be '\0' * which is used to mean 'anonymous'. Some btf_type may not * have a name. */ /* BTF verification: * * To verify BTF data, two passes are needed. * * Pass #1 * ~~~~~~~ * The first pass is to collect all btf_type objects to * an array: "btf->types". * * Depending on the C type that a btf_type is describing, * a btf_type may be followed by extra data. We don't know * how many btf_type is there, and more importantly we don't * know where each btf_type is located in the type section. * * Without knowing the location of each type_id, most verifications * cannot be done. e.g. an earlier btf_type may refer to a later * btf_type (recall the "const void *" above), so we cannot * check this type-reference in the first pass. * * In the first pass, it still does some verifications (e.g. * checking the name is a valid offset to the string section). * * Pass #2 * ~~~~~~~ * The main focus is to resolve a btf_type that is referring * to another type. * * We have to ensure the referring type: * 1) does exist in the BTF (i.e. in btf->types[]) * 2) does not cause a loop: * struct A { * struct B b; * }; * * struct B { * struct A a; * }; * * btf_type_needs_resolve() decides if a btf_type needs * to be resolved. * * The needs_resolve type implements the "resolve()" ops which * essentially does a DFS and detects backedge. * * During resolve (or DFS), different C types have different * "RESOLVED" conditions. 
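 *
 * (Roughly speaking, the DFS tracks a per-type visit state,
 *  NOT_VISITED -> VISITED -> RESOLVED (see enum visit_state below);
 *  running into a type that is still VISITED while following its
 *  references indicates a back edge, i.e. a loop that cannot be
 *  resolved.)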
* * When resolving a BTF_KIND_STRUCT, we need to resolve all its * members because a member is always referring to another * type. A struct's member can be treated as "RESOLVED" if * it is referring to a BTF_KIND_PTR. Otherwise, the * following valid C struct would be rejected: * * struct A { * int m; * struct A *a; * }; * * When resolving a BTF_KIND_PTR, it needs to keep resolving if * it is referring to another BTF_KIND_PTR. Otherwise, we cannot * detect a pointer loop, e.g.: * BTF_KIND_CONST -> BTF_KIND_PTR -> BTF_KIND_CONST -> BTF_KIND_PTR + * ^ | * +-----------------------------------------+ * */ #define BITS_PER_U128 (sizeof(u64) * BITS_PER_BYTE * 2) #define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1) #define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK) #define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3) #define BITS_ROUNDUP_BYTES(bits) \ (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits)) #define BTF_INFO_MASK 0x9f00ffff #define BTF_INT_MASK 0x0fffffff #define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE) #define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET) /* 16MB for 64k structs and each has 16 members and * a few MB spaces for the string section. * The hard limit is S32_MAX. */ #define BTF_MAX_SIZE (16 * 1024 * 1024) #define for_each_member_from(i, from, struct_type, member) \ for (i = from, member = btf_type_member(struct_type) + from; \ i < btf_type_vlen(struct_type); \ i++, member++) #define for_each_vsi_from(i, from, struct_type, member) \ for (i = from, member = btf_type_var_secinfo(struct_type) + from; \ i < btf_type_vlen(struct_type); \ i++, member++) DEFINE_IDR(btf_idr); DEFINE_SPINLOCK(btf_idr_lock); enum btf_kfunc_hook { BTF_KFUNC_HOOK_COMMON, BTF_KFUNC_HOOK_XDP, BTF_KFUNC_HOOK_TC, BTF_KFUNC_HOOK_STRUCT_OPS, BTF_KFUNC_HOOK_TRACING, BTF_KFUNC_HOOK_SYSCALL, BTF_KFUNC_HOOK_FMODRET, BTF_KFUNC_HOOK_CGROUP, BTF_KFUNC_HOOK_SCHED_ACT, BTF_KFUNC_HOOK_SK_SKB, BTF_KFUNC_HOOK_SOCKET_FILTER, BTF_KFUNC_HOOK_LWT, BTF_KFUNC_HOOK_NETFILTER, BTF_KFUNC_HOOK_KPROBE, BTF_KFUNC_HOOK_MAX, }; enum { BTF_KFUNC_SET_MAX_CNT = 256, BTF_DTOR_KFUNC_MAX_CNT = 256, BTF_KFUNC_FILTER_MAX_CNT = 16, }; struct btf_kfunc_hook_filter { btf_kfunc_filter_t filters[BTF_KFUNC_FILTER_MAX_CNT]; u32 nr_filters; }; struct btf_kfunc_set_tab { struct btf_id_set8 *sets[BTF_KFUNC_HOOK_MAX]; struct btf_kfunc_hook_filter hook_filters[BTF_KFUNC_HOOK_MAX]; }; struct btf_id_dtor_kfunc_tab { u32 cnt; struct btf_id_dtor_kfunc dtors[]; }; struct btf_struct_ops_tab { u32 cnt; u32 capacity; struct bpf_struct_ops_desc ops[]; }; struct btf { void *data; struct btf_type **types; u32 *resolved_ids; u32 *resolved_sizes; const char *strings; void *nohdr_data; struct btf_header hdr; u32 nr_types; /* includes VOID for base BTF */ u32 named_start_id; u32 types_size; u32 data_size; refcount_t refcnt; u32 id; struct rcu_head rcu; struct btf_kfunc_set_tab *kfunc_set_tab; struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab; struct btf_struct_metas *struct_meta_tab; struct btf_struct_ops_tab *struct_ops_tab; struct btf_layout *layout; /* split BTF support */ struct btf *base_btf; u32 start_id; /* first type ID in this BTF (0 for base BTF) */ u32 start_str_off; /* first string offset (0 for base BTF) */ char name[MODULE_NAME_LEN]; bool kernel_btf; __u32 *base_id_map; /* map from distilled base BTF -> vmlinux BTF ids */ }; enum verifier_phase { CHECK_META, CHECK_TYPE, }; struct resolve_vertex { const struct btf_type *t; u32 type_id; u16 next_member; }; enum visit_state { NOT_VISITED, VISITED, RESOLVED, }; enum 
resolve_mode { RESOLVE_TBD, /* To Be Determined */ RESOLVE_PTR, /* Resolving for Pointer */ RESOLVE_STRUCT_OR_ARRAY, /* Resolving for struct/union * or array */ }; #define MAX_RESOLVE_DEPTH 32 struct btf_sec_info { u32 off; u32 len; }; struct btf_verifier_env { struct btf *btf; u8 *visit_states; struct resolve_vertex stack[MAX_RESOLVE_DEPTH]; struct bpf_verifier_log log; u32 log_type_id; u32 top_stack; enum verifier_phase phase; enum resolve_mode resolve_mode; }; static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_UNKN] = "UNKNOWN", [BTF_KIND_INT] = "INT", [BTF_KIND_PTR] = "PTR", [BTF_KIND_ARRAY] = "ARRAY", [BTF_KIND_STRUCT] = "STRUCT", [BTF_KIND_UNION] = "UNION", [BTF_KIND_ENUM] = "ENUM", [BTF_KIND_FWD] = "FWD", [BTF_KIND_TYPEDEF] = "TYPEDEF", [BTF_KIND_VOLATILE] = "VOLATILE", [BTF_KIND_CONST] = "CONST", [BTF_KIND_RESTRICT] = "RESTRICT", [BTF_KIND_FUNC] = "FUNC", [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", [BTF_KIND_VAR] = "VAR", [BTF_KIND_DATASEC] = "DATASEC", [BTF_KIND_FLOAT] = "FLOAT", [BTF_KIND_DECL_TAG] = "DECL_TAG", [BTF_KIND_TYPE_TAG] = "TYPE_TAG", [BTF_KIND_ENUM64] = "ENUM64", }; const char *btf_type_str(const struct btf_type *t) { return btf_kind_str[BTF_INFO_KIND(t->info)]; } /* Chunk size we use in safe copy of data to be shown. */ #define BTF_SHOW_OBJ_SAFE_SIZE 32 /* * This is the maximum size of a base type value (equivalent to a * 128-bit int); if we are at the end of our safe buffer and have * less than 16 bytes space we can't be assured of being able * to copy the next type safely, so in such cases we will initiate * a new copy. */ #define BTF_SHOW_OBJ_BASE_TYPE_SIZE 16 /* Type name size */ #define BTF_SHOW_NAME_SIZE 80 /* * The suffix of a type that indicates it cannot alias another type when * comparing BTF IDs for kfunc invocations. */ #define NOCAST_ALIAS_SUFFIX "___init" /* * Common data to all BTF show operations. Private show functions can add * their own data to a structure containing a struct btf_show and consult it * in the show callback. See btf_type_show() below. * * One challenge with showing nested data is we want to skip 0-valued * data, but in order to figure out whether a nested object is all zeros * we need to walk through it. As a result, we need to make two passes * when handling structs, unions and arrays; the first path simply looks * for nonzero data, while the second actually does the display. The first * pass is signalled by show->state.depth_check being set, and if we * encounter a non-zero value we set show->state.depth_to_show to * the depth at which we encountered it. When we have completed the * first pass, we will know if anything needs to be displayed if * depth_to_show > depth. See btf_[struct,array]_show() for the * implementation of this. * * Another problem is we want to ensure the data for display is safe to * access. To support this, the anonymous "struct {} obj" tracks the data * object and our safe copy of it. We copy portions of the data needed * to the object "copy" buffer, but because its size is limited to * BTF_SHOW_OBJ_COPY_LEN bytes, multiple copies may be required as we * traverse larger objects for display. * * The various data type show functions all start with a call to * btf_show_start_type() which returns a pointer to the safe copy * of the data needed (or if BTF_SHOW_UNSAFE is specified, to the * raw data itself). btf_show_obj_safe() is responsible for * using copy_from_kernel_nofault() to update the safe data if necessary * as we traverse the object's data. 
skbuff-like semantics are * used: * * - obj.head points to the start of the toplevel object for display * - obj.size is the size of the toplevel object * - obj.data points to the current point in the original data at * which our safe data starts. obj.data will advance as we copy * portions of the data. * * In most cases a single copy will suffice, but larger data structures * such as "struct task_struct" will require many copies. The logic in * btf_show_obj_safe() handles the logic that determines if a new * copy_from_kernel_nofault() is needed. */ struct btf_show { u64 flags; void *target; /* target of show operation (seq file, buffer) */ __printf(2, 0) void (*showfn)(struct btf_show *show, const char *fmt, va_list args); const struct btf *btf; /* below are used during iteration */ struct { u8 depth; u8 depth_to_show; u8 depth_check; u8 array_member:1, array_terminated:1; u16 array_encoding; u32 type_id; int status; /* non-zero for error */ const struct btf_type *type; const struct btf_member *member; char name[BTF_SHOW_NAME_SIZE]; /* space for member name/type */ } state; struct { u32 size; void *head; void *data; u8 safe[BTF_SHOW_OBJ_SAFE_SIZE]; } obj; }; struct btf_kind_operations { s32 (*check_meta)(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left); int (*resolve)(struct btf_verifier_env *env, const struct resolve_vertex *v); int (*check_member)(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type); int (*check_kflag_member)(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type); void (*log_details)(struct btf_verifier_env *env, const struct btf_type *t); void (*show)(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offsets, struct btf_show *show); }; static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS]; static struct btf_type btf_void; static int btf_resolve(struct btf_verifier_env *env, const struct btf_type *t, u32 type_id); static int btf_func_check(struct btf_verifier_env *env, const struct btf_type *t); static bool btf_type_is_modifier(const struct btf_type *t) { /* Some of them is not strictly a C modifier * but they are grouped into the same bucket * for BTF concern: * A type (t) that refers to another * type through t->type AND its size cannot * be determined without following the t->type. * * ptr does not fall into this bucket * because its size is always sizeof(void *). */ switch (BTF_INFO_KIND(t->info)) { case BTF_KIND_TYPEDEF: case BTF_KIND_VOLATILE: case BTF_KIND_CONST: case BTF_KIND_RESTRICT: case BTF_KIND_TYPE_TAG: return true; } return false; } static int btf_start_id(const struct btf *btf) { return btf->start_id + (btf->base_btf ? 
0 : 1); } bool btf_type_is_void(const struct btf_type *t) { return t == &btf_void; } static bool btf_type_is_datasec(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; } static bool btf_type_is_decl_tag(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_DECL_TAG; } static bool btf_type_nosize(const struct btf_type *t) { return btf_type_is_void(t) || btf_type_is_fwd(t) || btf_type_is_func(t) || btf_type_is_func_proto(t) || btf_type_is_decl_tag(t); } static bool btf_type_nosize_or_null(const struct btf_type *t) { return !t || btf_type_nosize(t); } static bool btf_type_is_decl_tag_target(const struct btf_type *t) { return btf_type_is_func(t) || btf_type_is_struct(t) || btf_type_is_var(t) || btf_type_is_typedef(t); } bool btf_is_vmlinux(const struct btf *btf) { return btf->kernel_btf && !btf->base_btf; } u32 btf_nr_types(const struct btf *btf) { u32 total = 0; while (btf) { total += btf->nr_types; btf = btf->base_btf; } return total; } /* * Note that vmlinux and kernel module BTFs are always sorted * during the building phase. */ static void btf_check_sorted(struct btf *btf) { u32 i, n, named_start_id = 0; n = btf_nr_types(btf); if (btf_is_vmlinux(btf)) { for (i = btf_start_id(btf); i < n; i++) { const struct btf_type *t = btf_type_by_id(btf, i); const char *n = btf_name_by_offset(btf, t->name_off); if (n[0] != '\0') { btf->named_start_id = i; return; } } return; } for (i = btf_start_id(btf) + 1; i < n; i++) { const struct btf_type *ta = btf_type_by_id(btf, i - 1); const struct btf_type *tb = btf_type_by_id(btf, i); const char *na = btf_name_by_offset(btf, ta->name_off); const char *nb = btf_name_by_offset(btf, tb->name_off); if (strcmp(na, nb) > 0) return; if (named_start_id == 0 && na[0] != '\0') named_start_id = i - 1; if (named_start_id == 0 && nb[0] != '\0') named_start_id = i; } if (named_start_id) btf->named_start_id = named_start_id; } /* * btf_named_start_id - Get the named starting ID for the BTF * @btf: Pointer to the target BTF object * @own: Flag indicating whether to query only the current BTF (true = current BTF only, * false = recursively traverse the base BTF chain) * * Return value rules: * 1. For a sorted btf, return its named_start_id * 2. Else for a split BTF, return its start_id * 3. 
Else for a base BTF, return 1 */ u32 btf_named_start_id(const struct btf *btf, bool own) { const struct btf *base_btf = btf; while (!own && base_btf->base_btf) base_btf = base_btf->base_btf; return base_btf->named_start_id ?: (base_btf->start_id ?: 1); } static s32 btf_find_by_name_kind_bsearch(const struct btf *btf, const char *name) { const struct btf_type *t; const char *tname; s32 l, r, m; l = btf_named_start_id(btf, true); r = btf_nr_types(btf) - 1; while (l <= r) { m = l + (r - l) / 2; t = btf_type_by_id(btf, m); tname = btf_name_by_offset(btf, t->name_off); if (strcmp(tname, name) >= 0) { if (l == r) return r; r = m; } else { l = m + 1; } } return btf_nr_types(btf); } s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) { const struct btf *base_btf = btf_base_btf(btf); const struct btf_type *t; const char *tname; s32 id, total; if (base_btf) { id = btf_find_by_name_kind(base_btf, name, kind); if (id > 0) return id; } total = btf_nr_types(btf); if (btf->named_start_id > 0 && name[0]) { id = btf_find_by_name_kind_bsearch(btf, name); for (; id < total; id++) { t = btf_type_by_id(btf, id); tname = btf_name_by_offset(btf, t->name_off); if (strcmp(tname, name) != 0) return -ENOENT; if (BTF_INFO_KIND(t->info) == kind) return id; } } else { for (id = btf_start_id(btf); id < total; id++) { t = btf_type_by_id(btf, id); if (BTF_INFO_KIND(t->info) != kind) continue; tname = btf_name_by_offset(btf, t->name_off); if (strcmp(tname, name) == 0) return id; } } return -ENOENT; } s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p) { struct btf *btf; s32 ret; int id; btf = bpf_get_btf_vmlinux(); if (IS_ERR(btf)) return PTR_ERR(btf); if (!btf) return -EINVAL; ret = btf_find_by_name_kind(btf, name, kind); /* ret is never zero, since btf_find_by_name_kind returns * positive btf_id or negative error. */ if (ret > 0) { btf_get(btf); *btf_p = btf; return ret; } /* If name is not found in vmlinux's BTF then search in module's BTFs */ spin_lock_bh(&btf_idr_lock); idr_for_each_entry(&btf_idr, btf, id) { if (!btf_is_module(btf)) continue; /* linear search could be slow hence unlock/lock * the IDR to avoiding holding it for too long */ btf_get(btf); spin_unlock_bh(&btf_idr_lock); ret = btf_find_by_name_kind(btf, name, kind); if (ret > 0) { *btf_p = btf; return ret; } btf_put(btf); spin_lock_bh(&btf_idr_lock); } spin_unlock_bh(&btf_idr_lock); return ret; } EXPORT_SYMBOL_GPL(bpf_find_btf_id); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, u32 id, u32 *res_id) { const struct btf_type *t = btf_type_by_id(btf, id); while (btf_type_is_modifier(t)) { id = t->type; t = btf_type_by_id(btf, t->type); } if (res_id) *res_id = id; return t; } const struct btf_type *btf_type_resolve_ptr(const struct btf *btf, u32 id, u32 *res_id) { const struct btf_type *t; t = btf_type_skip_modifiers(btf, id, NULL); if (!btf_type_is_ptr(t)) return NULL; return btf_type_skip_modifiers(btf, t->type, res_id); } const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf, u32 id, u32 *res_id) { const struct btf_type *ptype; ptype = btf_type_resolve_ptr(btf, id, res_id); if (ptype && btf_type_is_func_proto(ptype)) return ptype; return NULL; } /* Types that act only as a source, not sink or intermediate * type when resolving. */ static bool btf_type_is_resolve_source_only(const struct btf_type *t) { return btf_type_is_var(t) || btf_type_is_decl_tag(t) || btf_type_is_datasec(t); } /* What types need to be resolved? * * btf_type_is_modifier() is an obvious one. 
* * btf_type_is_struct() because its member refers to * another type (through member->type). * * btf_type_is_var() because the variable refers to * another type. btf_type_is_datasec() holds multiple * btf_type_is_var() types that need resolving. * * btf_type_is_array() because its element (array->type) * refers to another type. Array can be thought of a * special case of struct while array just has the same * member-type repeated by array->nelems of times. */ static bool btf_type_needs_resolve(const struct btf_type *t) { return btf_type_is_modifier(t) || btf_type_is_ptr(t) || btf_type_is_struct(t) || btf_type_is_array(t) || btf_type_is_var(t) || btf_type_is_func(t) || btf_type_is_decl_tag(t) || btf_type_is_datasec(t); } /* t->size can be used */ static bool btf_type_has_size(const struct btf_type *t) { switch (BTF_INFO_KIND(t->info)) { case BTF_KIND_INT: case BTF_KIND_STRUCT: case BTF_KIND_UNION: case BTF_KIND_ENUM: case BTF_KIND_DATASEC: case BTF_KIND_FLOAT: case BTF_KIND_ENUM64: return true; } return false; } static const char *btf_int_encoding_str(u8 encoding) { if (encoding == 0) return "(none)"; else if (encoding == BTF_INT_SIGNED) return "SIGNED"; else if (encoding == BTF_INT_CHAR) return "CHAR"; else if (encoding == BTF_INT_BOOL) return "BOOL"; else return "UNKN"; } static u32 btf_type_int(const struct btf_type *t) { return *(u32 *)(t + 1); } static const struct btf_array *btf_type_array(const struct btf_type *t) { return (const struct btf_array *)(t + 1); } static const struct btf_enum *btf_type_enum(const struct btf_type *t) { return (const struct btf_enum *)(t + 1); } static const struct btf_var *btf_type_var(const struct btf_type *t) { return (const struct btf_var *)(t + 1); } static const struct btf_decl_tag *btf_type_decl_tag(const struct btf_type *t) { return (const struct btf_decl_tag *)(t + 1); } static const struct btf_enum64 *btf_type_enum64(const struct btf_type *t) { return (const struct btf_enum64 *)(t + 1); } static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) { return kind_ops[BTF_INFO_KIND(t->info)]; } static bool btf_name_offset_valid(const struct btf *btf, u32 offset) { if (!BTF_STR_OFFSET_VALID(offset)) return false; while (offset < btf->start_str_off) btf = btf->base_btf; offset -= btf->start_str_off; return offset < btf->hdr.str_len; } static bool __btf_name_char_ok(char c, bool first) { if ((first ? 
!isalpha(c) : !isalnum(c)) && c != '_' && c != '.') return false; return true; } const char *btf_str_by_offset(const struct btf *btf, u32 offset) { while (offset < btf->start_str_off) btf = btf->base_btf; offset -= btf->start_str_off; if (offset < btf->hdr.str_len) return &btf->strings[offset]; return NULL; } static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) { /* offset must be valid */ const char *src = btf_str_by_offset(btf, offset); const char *src_limit; if (!__btf_name_char_ok(*src, true)) return false; /* set a limit on identifier length */ src_limit = src + KSYM_NAME_LEN; src++; while (*src && src < src_limit) { if (!__btf_name_char_ok(*src, false)) return false; src++; } return !*src; } /* Allow any printable character in DATASEC names */ static bool btf_name_valid_section(const struct btf *btf, u32 offset) { /* offset must be valid */ const char *src = btf_str_by_offset(btf, offset); const char *src_limit; if (!*src) return false; /* set a limit on identifier length */ src_limit = src + KSYM_NAME_LEN; while (*src && src < src_limit) { if (!isprint(*src)) return false; src++; } return !*src; } static const char *__btf_name_by_offset(const struct btf *btf, u32 offset) { const char *name; if (!offset) return "(anon)"; name = btf_str_by_offset(btf, offset); return name ?: "(invalid-name-offset)"; } const char *btf_name_by_offset(const struct btf *btf, u32 offset) { return btf_str_by_offset(btf, offset); } const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) { while (type_id < btf->start_id) btf = btf->base_btf; type_id -= btf->start_id; if (type_id >= btf->nr_types) return NULL; return btf->types[type_id]; } EXPORT_SYMBOL_GPL(btf_type_by_id); /* * Check that the type @t is a regular int. This means that @t is not * a bit field and it has the same size as either of u8/u16/u32/u64 * or __int128. If @expected_size is not zero, then size of @t should * be the same. A caller should already have checked that the type @t * is an integer. */ static bool __btf_type_int_is_regular(const struct btf_type *t, size_t expected_size) { u32 int_data = btf_type_int(t); u8 nr_bits = BTF_INT_BITS(int_data); u8 nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); return BITS_PER_BYTE_MASKED(nr_bits) == 0 && BTF_INT_OFFSET(int_data) == 0 && (nr_bytes <= 16 && is_power_of_2(nr_bytes)) && (expected_size == 0 || nr_bytes == expected_size); } static bool btf_type_int_is_regular(const struct btf_type *t) { return __btf_type_int_is_regular(t, 0); } bool btf_type_is_i32(const struct btf_type *t) { return btf_type_is_int(t) && __btf_type_int_is_regular(t, 4); } bool btf_type_is_i64(const struct btf_type *t) { return btf_type_is_int(t) && __btf_type_int_is_regular(t, 8); } bool btf_type_is_primitive(const struct btf_type *t) { return (btf_type_is_int(t) && btf_type_int_is_regular(t)) || btf_is_any_enum(t); } /* * Check that given struct member is a regular int with expected * offset and size. */ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, const struct btf_member *m, u32 expected_offset, u32 expected_size) { const struct btf_type *t; u32 id, int_data; u8 nr_bits; id = m->type; t = btf_type_id_size(btf, &id, NULL); if (!t || !btf_type_is_int(t)) return false; int_data = btf_type_int(t); nr_bits = BTF_INT_BITS(int_data); if (btf_type_kflag(s)) { u32 bitfield_size = BTF_MEMBER_BITFIELD_SIZE(m->offset); u32 bit_offset = BTF_MEMBER_BIT_OFFSET(m->offset); /* if kflag set, int should be a regular int and * bit offset should be at byte boundary. 
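 *
 * (For reference: with kind_flag set on the struct, member->offset
 *  packs the bitfield size in its top 8 bits and the bit offset in
 *  its low 24 bits; BTF_MEMBER_BITFIELD_SIZE() and
 *  BTF_MEMBER_BIT_OFFSET() above extract those two fields.)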
*/ return !bitfield_size && BITS_ROUNDUP_BYTES(bit_offset) == expected_offset && BITS_ROUNDUP_BYTES(nr_bits) == expected_size; } if (BTF_INT_OFFSET(int_data) || BITS_PER_BYTE_MASKED(m->offset) || BITS_ROUNDUP_BYTES(m->offset) != expected_offset || BITS_PER_BYTE_MASKED(nr_bits) || BITS_ROUNDUP_BYTES(nr_bits) != expected_size) return false; return true; } /* Similar to btf_type_skip_modifiers() but does not skip typedefs. */ static const struct btf_type *btf_type_skip_qualifiers(const struct btf *btf, u32 id) { const struct btf_type *t = btf_type_by_id(btf, id); while (btf_type_is_modifier(t) && BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) { t = btf_type_by_id(btf, t->type); } return t; } #define BTF_SHOW_MAX_ITER 10 #define BTF_KIND_BIT(kind) (1ULL << kind) /* * Populate show->state.name with type name information. * Format of type name is * * [.member_name = ] (type_name) */ static const char *btf_show_name(struct btf_show *show) { /* BTF_MAX_ITER array suffixes "[]" */ const char *array_suffixes = "[][][][][][][][][][]"; const char *array_suffix = &array_suffixes[strlen(array_suffixes)]; /* BTF_MAX_ITER pointer suffixes "*" */ const char *ptr_suffixes = "**********"; const char *ptr_suffix = &ptr_suffixes[strlen(ptr_suffixes)]; const char *name = NULL, *prefix = "", *parens = ""; const struct btf_member *m = show->state.member; const struct btf_type *t; const struct btf_array *array; u32 id = show->state.type_id; const char *member = NULL; bool show_member = false; u64 kinds = 0; int i; show->state.name[0] = '\0'; /* * Don't show type name if we're showing an array member; * in that case we show the array type so don't need to repeat * ourselves for each member. */ if (show->state.array_member) return ""; /* Retrieve member name, if any. */ if (m) { member = btf_name_by_offset(show->btf, m->name_off); show_member = strlen(member) > 0; id = m->type; } /* * Start with type_id, as we have resolved the struct btf_type * * via btf_modifier_show() past the parent typedef to the child * struct, int etc it is defined as. In such cases, the type_id * still represents the starting type while the struct btf_type * * in our show->state points at the resolved type of the typedef. */ t = btf_type_by_id(show->btf, id); if (!t) return ""; /* * The goal here is to build up the right number of pointer and * array suffixes while ensuring the type name for a typedef * is represented. Along the way we accumulate a list of * BTF kinds we have encountered, since these will inform later * display; for example, pointer types will not require an * opening "{" for struct, we will just display the pointer value. * * We also want to accumulate the right number of pointer or array * indices in the format string while iterating until we get to * the typedef/pointee/array member target type. * * We start by pointing at the end of pointer and array suffix * strings; as we accumulate pointers and arrays we move the pointer * or array string backwards so it will show the expected number of * '*' or '[]' for the type. BTF_SHOW_MAX_ITER of nesting of pointers * and/or arrays and typedefs are supported as a precaution. * * We also want to get typedef name while proceeding to resolve * type it points to so that we can add parentheses if it is a * "typedef struct" etc. 
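 *
 * As a purely illustrative example, a member declared as
 * 'struct foo **p' ends up rendered as ".p = (struct foo **)"
 * followed by its pointer value; the typedef/pointer/array walk
 * above is what accumulates the "**" (or "[]") suffixes.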
*/ for (i = 0; i < BTF_SHOW_MAX_ITER; i++) { switch (BTF_INFO_KIND(t->info)) { case BTF_KIND_TYPEDEF: if (!name) name = btf_name_by_offset(show->btf, t->name_off); kinds |= BTF_KIND_BIT(BTF_KIND_TYPEDEF); id = t->type; break; case BTF_KIND_ARRAY: kinds |= BTF_KIND_BIT(BTF_KIND_ARRAY); parens = "["; if (!t) return ""; array = btf_type_array(t); if (array_suffix > array_suffixes) array_suffix -= 2; id = array->type; break; case BTF_KIND_PTR: kinds |= BTF_KIND_BIT(BTF_KIND_PTR); if (ptr_suffix > ptr_suffixes) ptr_suffix -= 1; id = t->type; break; default: id = 0; break; } if (!id) break; t = btf_type_skip_qualifiers(show->btf, id); } /* We may not be able to represent this type; bail to be safe */ if (i == BTF_SHOW_MAX_ITER) return ""; if (!name) name = btf_name_by_offset(show->btf, t->name_off); switch (BTF_INFO_KIND(t->info)) { case BTF_KIND_STRUCT: case BTF_KIND_UNION: prefix = BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT ? "struct" : "union"; /* if it's an array of struct/union, parens is already set */ if (!(kinds & (BTF_KIND_BIT(BTF_KIND_ARRAY)))) parens = "{"; break; case BTF_KIND_ENUM: case BTF_KIND_ENUM64: prefix = "enum"; break; default: break; } /* pointer does not require parens */ if (kinds & BTF_KIND_BIT(BTF_KIND_PTR)) parens = ""; /* typedef does not require struct/union/enum prefix */ if (kinds & BTF_KIND_BIT(BTF_KIND_TYPEDEF)) prefix = ""; if (!name) name = ""; /* Even if we don't want type name info, we want parentheses etc */ if (show->flags & BTF_SHOW_NONAME) snprintf(show->state.name, sizeof(show->state.name), "%s", parens); else snprintf(show->state.name, sizeof(show->state.name), "%s%s%s(%s%s%s%s%s%s)%s", /* first 3 strings comprise ".member = " */ show_member ? "." : "", show_member ? member : "", show_member ? " = " : "", /* ...next is our prefix (struct, enum, etc) */ prefix, strlen(prefix) > 0 && strlen(name) > 0 ? " " : "", /* ...this is the type name itself */ name, /* ...suffixed by the appropriate '*', '[]' suffixes */ strlen(ptr_suffix) > 0 ? " " : "", ptr_suffix, array_suffix, parens); return show->state.name; } static const char *__btf_show_indent(struct btf_show *show) { const char *indents = " "; const char *indent = &indents[strlen(indents)]; if ((indent - show->state.depth) >= indents) return indent - show->state.depth; return indents; } static const char *btf_show_indent(struct btf_show *show) { return show->flags & BTF_SHOW_COMPACT ? "" : __btf_show_indent(show); } static const char *btf_show_newline(struct btf_show *show) { return show->flags & BTF_SHOW_COMPACT ? "" : "\n"; } static const char *btf_show_delim(struct btf_show *show) { if (show->state.depth == 0) return ""; if ((show->flags & BTF_SHOW_COMPACT) && show->state.type && BTF_INFO_KIND(show->state.type->info) == BTF_KIND_UNION) return "|"; return ","; } __printf(2, 3) static void btf_show(struct btf_show *show, const char *fmt, ...) { va_list args; if (!show->state.depth_check) { va_start(args, fmt); show->showfn(show, fmt, args); va_end(args); } } /* Macros are used here as btf_show_type_value[s]() prepends and appends * format specifiers to the format specifier passed in; these do the work of * adding indentation, delimiters etc while the caller simply has to specify * the type value(s) in the format specifier + value(s). 
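 *
 * For example (a sketch of the expansion, not verbatim),
 * btf_show_type_value(show, "%d", v) becomes a zero-value-filtered
 * call along the lines of
 *	btf_show(show, "%s%s" "%d" "%s%s", btf_show_indent(show),
 *		 btf_show_name(show), v, btf_show_delim(show),
 *		 btf_show_newline(show));
 * plus the show->state.depth_to_show bookkeeping.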
*/ #define btf_show_type_value(show, fmt, value) \ do { \ if ((value) != (__typeof__(value))0 || \ (show->flags & BTF_SHOW_ZERO) || \ show->state.depth == 0) { \ btf_show(show, "%s%s" fmt "%s%s", \ btf_show_indent(show), \ btf_show_name(show), \ value, btf_show_delim(show), \ btf_show_newline(show)); \ if (show->state.depth > show->state.depth_to_show) \ show->state.depth_to_show = show->state.depth; \ } \ } while (0) #define btf_show_type_values(show, fmt, ...) \ do { \ btf_show(show, "%s%s" fmt "%s%s", btf_show_indent(show), \ btf_show_name(show), \ __VA_ARGS__, btf_show_delim(show), \ btf_show_newline(show)); \ if (show->state.depth > show->state.depth_to_show) \ show->state.depth_to_show = show->state.depth; \ } while (0) /* How much is left to copy to safe buffer after @data? */ static int btf_show_obj_size_left(struct btf_show *show, void *data) { return show->obj.head + show->obj.size - data; } /* Is object pointed to by @data of @size already copied to our safe buffer? */ static bool btf_show_obj_is_safe(struct btf_show *show, void *data, int size) { return data >= show->obj.data && (data + size) < (show->obj.data + BTF_SHOW_OBJ_SAFE_SIZE); } /* * If object pointed to by @data of @size falls within our safe buffer, return * the equivalent pointer to the same safe data. Assumes * copy_from_kernel_nofault() has already happened and our safe buffer is * populated. */ static void *__btf_show_obj_safe(struct btf_show *show, void *data, int size) { if (btf_show_obj_is_safe(show, data, size)) return show->obj.safe + (data - show->obj.data); return NULL; } /* * Return a safe-to-access version of data pointed to by @data. * We do this by copying the relevant amount of information * to the struct btf_show obj.safe buffer using copy_from_kernel_nofault(). * * If BTF_SHOW_UNSAFE is specified, just return data as-is; no * safe copy is needed. * * Otherwise we need to determine if we have the required amount * of data (determined by the @data pointer and the size of the * largest base type we can encounter (represented by * BTF_SHOW_OBJ_BASE_TYPE_SIZE). Having that much data ensures * that we will be able to print some of the current object, * and if more is needed a copy will be triggered. * Some objects such as structs will not fit into the buffer; * in such cases additional copies when we iterate over their * members may be needed. * * btf_show_obj_safe() is used to return a safe buffer for * btf_show_start_type(); this ensures that as we recurse into * nested types we always have safe data for the given type. * This approach is somewhat wasteful; it's possible for example * that when iterating over a large union we'll end up copying the * same data repeatedly, but the goal is safety not performance. * We use stack data as opposed to per-CPU buffers because the * iteration over a type can take some time, and preemption handling * would greatly complicate use of the safe buffer. */ static void *btf_show_obj_safe(struct btf_show *show, const struct btf_type *t, void *data) { const struct btf_type *rt; int size_left, size; void *safe = NULL; if (show->flags & BTF_SHOW_UNSAFE) return data; rt = btf_resolve_size(show->btf, t, &size); if (IS_ERR(rt)) { show->state.status = PTR_ERR(rt); return NULL; } /* * Is this toplevel object? If so, set total object size and * initialize pointers. Otherwise check if we still fall within * our safe object data. 
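 *
 * A worked example (sizes are illustrative): with the 32-byte safe
 * buffer and an 8k 'struct task_struct', the toplevel copy covers only
 * the first 32 bytes; once we descend to a member that starts beyond
 * show->obj.data + 32, __btf_show_obj_safe() returns NULL and a fresh
 * copy_from_kernel_nofault() is made starting at that member.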
*/ if (show->state.depth == 0) { show->obj.size = size; show->obj.head = data; } else { /* * If the size of the current object is > our remaining * safe buffer we _may_ need to do a new copy. However * consider the case of a nested struct; it's size pushes * us over the safe buffer limit, but showing any individual * struct members does not. In such cases, we don't need * to initiate a fresh copy yet; however we definitely need * at least BTF_SHOW_OBJ_BASE_TYPE_SIZE bytes left * in our buffer, regardless of the current object size. * The logic here is that as we resolve types we will * hit a base type at some point, and we need to be sure * the next chunk of data is safely available to display * that type info safely. We cannot rely on the size of * the current object here because it may be much larger * than our current buffer (e.g. task_struct is 8k). * All we want to do here is ensure that we can print the * next basic type, which we can if either * - the current type size is within the safe buffer; or * - at least BTF_SHOW_OBJ_BASE_TYPE_SIZE bytes are left in * the safe buffer. */ safe = __btf_show_obj_safe(show, data, min(size, BTF_SHOW_OBJ_BASE_TYPE_SIZE)); } /* * We need a new copy to our safe object, either because we haven't * yet copied and are initializing safe data, or because the data * we want falls outside the boundaries of the safe object. */ if (!safe) { size_left = btf_show_obj_size_left(show, data); if (size_left > BTF_SHOW_OBJ_SAFE_SIZE) size_left = BTF_SHOW_OBJ_SAFE_SIZE; show->state.status = copy_from_kernel_nofault(show->obj.safe, data, size_left); if (!show->state.status) { show->obj.data = data; safe = show->obj.safe; } } return safe; } /* * Set the type we are starting to show and return a safe data pointer * to be used for showing the associated data. 
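 *
 * A kind's show() implementation typically uses this in roughly the
 * following pattern (a sketch only; types and format are illustrative):
 *
 *	safe_data = btf_show_start_type(show, t, type_id, data);
 *	if (safe_data)
 *		btf_show_type_value(show, "%d", *(int *)safe_data);
 *	btf_show_end_type(show);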
*/ static void *btf_show_start_type(struct btf_show *show, const struct btf_type *t, u32 type_id, void *data) { show->state.type = t; show->state.type_id = type_id; show->state.name[0] = '\0'; return btf_show_obj_safe(show, t, data); } static void btf_show_end_type(struct btf_show *show) { show->state.type = NULL; show->state.type_id = 0; show->state.name[0] = '\0'; } static void *btf_show_start_aggr_type(struct btf_show *show, const struct btf_type *t, u32 type_id, void *data) { void *safe_data = btf_show_start_type(show, t, type_id, data); if (!safe_data) return safe_data; btf_show(show, "%s%s%s", btf_show_indent(show), btf_show_name(show), btf_show_newline(show)); show->state.depth++; return safe_data; } static void btf_show_end_aggr_type(struct btf_show *show, const char *suffix) { show->state.depth--; btf_show(show, "%s%s%s%s", btf_show_indent(show), suffix, btf_show_delim(show), btf_show_newline(show)); btf_show_end_type(show); } static void btf_show_start_member(struct btf_show *show, const struct btf_member *m) { show->state.member = m; } static void btf_show_start_array_member(struct btf_show *show) { show->state.array_member = 1; btf_show_start_member(show, NULL); } static void btf_show_end_member(struct btf_show *show) { show->state.member = NULL; } static void btf_show_end_array_member(struct btf_show *show) { show->state.array_member = 0; btf_show_end_member(show); } static void *btf_show_start_array_type(struct btf_show *show, const struct btf_type *t, u32 type_id, u16 array_encoding, void *data) { show->state.array_encoding = array_encoding; show->state.array_terminated = 0; return btf_show_start_aggr_type(show, t, type_id, data); } static void btf_show_end_array_type(struct btf_show *show) { show->state.array_encoding = 0; show->state.array_terminated = 0; btf_show_end_aggr_type(show, "]"); } static void *btf_show_start_struct_type(struct btf_show *show, const struct btf_type *t, u32 type_id, void *data) { return btf_show_start_aggr_type(show, t, type_id, data); } static void btf_show_end_struct_type(struct btf_show *show) { btf_show_end_aggr_type(show, "}"); } __printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log, const char *fmt, ...) { va_list args; va_start(args, fmt); bpf_verifier_vlog(log, fmt, args); va_end(args); } __printf(2, 3) static void btf_verifier_log(struct btf_verifier_env *env, const char *fmt, ...) { struct bpf_verifier_log *log = &env->log; va_list args; if (!bpf_verifier_log_needed(log)) return; va_start(args, fmt); bpf_verifier_vlog(log, fmt, args); va_end(args); } __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, const struct btf_type *t, bool log_details, const char *fmt, ...) { struct bpf_verifier_log *log = &env->log; struct btf *btf = env->btf; va_list args; if (!bpf_verifier_log_needed(log)) return; if (log->level == BPF_LOG_KERNEL) { /* btf verifier prints all types it is processing via * btf_verifier_log_type(..., fmt = NULL). * Skip those prints for in-kernel BTF verification. */ if (!fmt) return; /* Skip logging when loading module BTF with mismatches permitted */ if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) return; } __btf_verifier_log(log, "[%u] %s %s%s", env->log_type_id, btf_type_str(t), __btf_name_by_offset(btf, t->name_off), log_details ? 
" " : ""); if (log_details) btf_type_ops(t)->log_details(env, t); if (fmt && *fmt) { __btf_verifier_log(log, " "); va_start(args, fmt); bpf_verifier_vlog(log, fmt, args); va_end(args); } __btf_verifier_log(log, "\n"); } #define btf_verifier_log_type(env, t, ...) \ __btf_verifier_log_type((env), (t), true, __VA_ARGS__) #define btf_verifier_log_basic(env, t, ...) \ __btf_verifier_log_type((env), (t), false, __VA_ARGS__) __printf(4, 5) static void btf_verifier_log_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const char *fmt, ...) { struct bpf_verifier_log *log = &env->log; struct btf *btf = env->btf; va_list args; if (!bpf_verifier_log_needed(log)) return; if (log->level == BPF_LOG_KERNEL) { if (!fmt) return; /* Skip logging when loading module BTF with mismatches permitted */ if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) return; } /* The CHECK_META phase already did a btf dump. * * If member is logged again, it must hit an error in * parsing this member. It is useful to print out which * struct this member belongs to. */ if (env->phase != CHECK_META) btf_verifier_log_type(env, struct_type, NULL); if (btf_type_kflag(struct_type)) __btf_verifier_log(log, "\t%s type_id=%u bitfield_size=%u bits_offset=%u", __btf_name_by_offset(btf, member->name_off), member->type, BTF_MEMBER_BITFIELD_SIZE(member->offset), BTF_MEMBER_BIT_OFFSET(member->offset)); else __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u", __btf_name_by_offset(btf, member->name_off), member->type, member->offset); if (fmt && *fmt) { __btf_verifier_log(log, " "); va_start(args, fmt); bpf_verifier_vlog(log, fmt, args); va_end(args); } __btf_verifier_log(log, "\n"); } __printf(4, 5) static void btf_verifier_log_vsi(struct btf_verifier_env *env, const struct btf_type *datasec_type, const struct btf_var_secinfo *vsi, const char *fmt, ...) 
{ struct bpf_verifier_log *log = &env->log; va_list args; if (!bpf_verifier_log_needed(log)) return; if (log->level == BPF_LOG_KERNEL && !fmt) return; if (env->phase != CHECK_META) btf_verifier_log_type(env, datasec_type, NULL); __btf_verifier_log(log, "\t type_id=%u offset=%u size=%u", vsi->type, vsi->offset, vsi->size); if (fmt && *fmt) { __btf_verifier_log(log, " "); va_start(args, fmt); bpf_verifier_vlog(log, fmt, args); va_end(args); } __btf_verifier_log(log, "\n"); } static void btf_verifier_log_hdr(struct btf_verifier_env *env, u32 btf_data_size) { struct bpf_verifier_log *log = &env->log; const struct btf *btf = env->btf; const struct btf_header *hdr; if (!bpf_verifier_log_needed(log)) return; if (log->level == BPF_LOG_KERNEL) return; hdr = &btf->hdr; __btf_verifier_log(log, "magic: 0x%x\n", hdr->magic); __btf_verifier_log(log, "version: %u\n", hdr->version); __btf_verifier_log(log, "flags: 0x%x\n", hdr->flags); __btf_verifier_log(log, "hdr_len: %u\n", hdr->hdr_len); __btf_verifier_log(log, "type_off: %u\n", hdr->type_off); __btf_verifier_log(log, "type_len: %u\n", hdr->type_len); __btf_verifier_log(log, "str_off: %u\n", hdr->str_off); __btf_verifier_log(log, "str_len: %u\n", hdr->str_len); if (hdr->hdr_len >= sizeof(struct btf_header) && btf_data_size >= hdr->hdr_len) { __btf_verifier_log(log, "layout_off: %u\n", hdr->layout_off); __btf_verifier_log(log, "layout_len: %u\n", hdr->layout_len); } __btf_verifier_log(log, "btf_total_size: %u\n", btf_data_size); } static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) { struct btf *btf = env->btf; if (btf->types_size == btf->nr_types) { /* Expand 'types' array */ struct btf_type **new_types; u32 expand_by, new_size; if (btf->start_id + btf->types_size == BTF_MAX_TYPE) { btf_verifier_log(env, "Exceeded max num of types"); return -E2BIG; } expand_by = max_t(u32, btf->types_size >> 2, 16); new_size = min_t(u32, BTF_MAX_TYPE, btf->types_size + expand_by); new_types = kvzalloc_objs(*new_types, new_size, GFP_KERNEL | __GFP_NOWARN); if (!new_types) return -ENOMEM; if (btf->nr_types == 0) { if (!btf->base_btf) { /* lazily init VOID type */ new_types[0] = &btf_void; btf->nr_types++; } } else { memcpy(new_types, btf->types, sizeof(*btf->types) * btf->nr_types); } kvfree(btf->types); btf->types = new_types; btf->types_size = new_size; } btf->types[btf->nr_types++] = t; return 0; } static int btf_alloc_id(struct btf *btf) { int id; idr_preload(GFP_KERNEL); spin_lock_bh(&btf_idr_lock); id = idr_alloc_cyclic(&btf_idr, btf, 1, INT_MAX, GFP_ATOMIC); if (id > 0) btf->id = id; spin_unlock_bh(&btf_idr_lock); idr_preload_end(); if (WARN_ON_ONCE(!id)) return -ENOSPC; return id > 0 ? 0 : id; } static void btf_free_id(struct btf *btf) { unsigned long flags; /* * In map-in-map, calling map_delete_elem() on outer * map will call bpf_map_put on the inner map. * It will then eventually call btf_free_id() * on the inner map. Some of the map_delete_elem() * implementation may have irq disabled, so * we need to use the _irqsave() version instead * of the _bh() version. */ spin_lock_irqsave(&btf_idr_lock, flags); if (btf->id) { idr_remove(&btf_idr, btf->id); /* * Clear the id here to make this function idempotent, since it will get * called a couple of times for module BTFs: on module unload, and then * the final btf_put(). btf_alloc_id() starts IDs with 1, so we can use * 0 as sentinel value. 
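 * A repeated call then finds btf->id == 0 and skips the idr_remove() below.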
*/ WRITE_ONCE(btf->id, 0); } spin_unlock_irqrestore(&btf_idr_lock, flags); } static void btf_free_kfunc_set_tab(struct btf *btf) { struct btf_kfunc_set_tab *tab = btf->kfunc_set_tab; int hook; if (!tab) return; for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++) kfree(tab->sets[hook]); kfree(tab); btf->kfunc_set_tab = NULL; } static void btf_free_dtor_kfunc_tab(struct btf *btf) { struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab; if (!tab) return; kfree(tab); btf->dtor_kfunc_tab = NULL; } static void btf_struct_metas_free(struct btf_struct_metas *tab) { int i; if (!tab) return; for (i = 0; i < tab->cnt; i++) btf_record_free(tab->types[i].record); kfree(tab); } static void btf_free_struct_meta_tab(struct btf *btf) { struct btf_struct_metas *tab = btf->struct_meta_tab; btf_struct_metas_free(tab); btf->struct_meta_tab = NULL; } static void btf_free_struct_ops_tab(struct btf *btf) { struct btf_struct_ops_tab *tab = btf->struct_ops_tab; u32 i; if (!tab) return; for (i = 0; i < tab->cnt; i++) bpf_struct_ops_desc_release(&tab->ops[i]); kfree(tab); btf->struct_ops_tab = NULL; } static void btf_free(struct btf *btf) { btf_free_struct_meta_tab(btf); btf_free_dtor_kfunc_tab(btf); btf_free_kfunc_set_tab(btf); btf_free_struct_ops_tab(btf); kvfree(btf->types); kvfree(btf->resolved_sizes); kvfree(btf->resolved_ids); /* vmlinux does not allocate btf->data, it simply points it at * __start_BTF. */ if (!btf_is_vmlinux(btf)) kvfree(btf->data); kvfree(btf->base_id_map); kfree(btf); } static void btf_free_rcu(struct rcu_head *rcu) { struct btf *btf = container_of(rcu, struct btf, rcu); btf_free(btf); } const char *btf_get_name(const struct btf *btf) { return btf->name; } void btf_get(struct btf *btf) { refcount_inc(&btf->refcnt); } void btf_put(struct btf *btf) { if (btf && refcount_dec_and_test(&btf->refcnt)) { btf_free_id(btf); call_rcu(&btf->rcu, btf_free_rcu); } } struct btf *btf_base_btf(const struct btf *btf) { return btf->base_btf; } const struct btf_header *btf_header(const struct btf *btf) { return &btf->hdr; } void btf_set_base_btf(struct btf *btf, const struct btf *base_btf) { btf->base_btf = (struct btf *)base_btf; btf->start_id = btf_nr_types(base_btf); btf->start_str_off = base_btf->hdr.str_len; } static int env_resolve_init(struct btf_verifier_env *env) { struct btf *btf = env->btf; u32 nr_types = btf->nr_types; u32 *resolved_sizes = NULL; u32 *resolved_ids = NULL; u8 *visit_states = NULL; resolved_sizes = kvcalloc(nr_types, sizeof(*resolved_sizes), GFP_KERNEL | __GFP_NOWARN); if (!resolved_sizes) goto nomem; resolved_ids = kvcalloc(nr_types, sizeof(*resolved_ids), GFP_KERNEL | __GFP_NOWARN); if (!resolved_ids) goto nomem; visit_states = kvcalloc(nr_types, sizeof(*visit_states), GFP_KERNEL | __GFP_NOWARN); if (!visit_states) goto nomem; btf->resolved_sizes = resolved_sizes; btf->resolved_ids = resolved_ids; env->visit_states = visit_states; return 0; nomem: kvfree(resolved_sizes); kvfree(resolved_ids); kvfree(visit_states); return -ENOMEM; } static void btf_verifier_env_free(struct btf_verifier_env *env) { kvfree(env->visit_states); kfree(env); } static bool env_type_is_resolve_sink(const struct btf_verifier_env *env, const struct btf_type *next_type) { switch (env->resolve_mode) { case RESOLVE_TBD: /* int, enum or void is a sink */ return !btf_type_needs_resolve(next_type); case RESOLVE_PTR: /* int, enum, void, struct, array, func or func_proto is a sink * for ptr */ return !btf_type_is_modifier(next_type) && !btf_type_is_ptr(next_type); case RESOLVE_STRUCT_OR_ARRAY: /* int, enum, void, 
ptr, func or func_proto is a sink * for struct and array */ return !btf_type_is_modifier(next_type) && !btf_type_is_array(next_type) && !btf_type_is_struct(next_type); default: BUG(); } } static bool env_type_is_resolved(const struct btf_verifier_env *env, u32 type_id) { /* base BTF types should be resolved by now */ if (type_id < env->btf->start_id) return true; return env->visit_states[type_id - env->btf->start_id] == RESOLVED; } static int env_stack_push(struct btf_verifier_env *env, const struct btf_type *t, u32 type_id) { const struct btf *btf = env->btf; struct resolve_vertex *v; if (env->top_stack == MAX_RESOLVE_DEPTH) return -E2BIG; if (type_id < btf->start_id || env->visit_states[type_id - btf->start_id] != NOT_VISITED) return -EEXIST; env->visit_states[type_id - btf->start_id] = VISITED; v = &env->stack[env->top_stack++]; v->t = t; v->type_id = type_id; v->next_member = 0; if (env->resolve_mode == RESOLVE_TBD) { if (btf_type_is_ptr(t)) env->resolve_mode = RESOLVE_PTR; else if (btf_type_is_struct(t) || btf_type_is_array(t)) env->resolve_mode = RESOLVE_STRUCT_OR_ARRAY; } return 0; } static void env_stack_set_next_member(struct btf_verifier_env *env, u16 next_member) { env->stack[env->top_stack - 1].next_member = next_member; } static void env_stack_pop_resolved(struct btf_verifier_env *env, u32 resolved_type_id, u32 resolved_size) { u32 type_id = env->stack[--(env->top_stack)].type_id; struct btf *btf = env->btf; type_id -= btf->start_id; /* adjust to local type id */ btf->resolved_sizes[type_id] = resolved_size; btf->resolved_ids[type_id] = resolved_type_id; env->visit_states[type_id] = RESOLVED; } static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env) { return env->top_stack ? &env->stack[env->top_stack - 1] : NULL; } /* Resolve the size of a passed-in "type" * * type: is an array (e.g. u32 array[x][y]) * return type: type "u32[x][y]", i.e. BTF_KIND_ARRAY, * *type_size: (x * y * sizeof(u32)). Hence, *type_size always * corresponds to the return type. * *elem_type: u32 * *elem_id: id of u32 * *total_nelems: (x * y). Hence, individual elem size is * (*type_size / *total_nelems) * *type_id: id of type if it's changed within the function, 0 if not * * type: is not an array (e.g. 
const struct X) * return type: type "struct X" * *type_size: sizeof(struct X) * *elem_type: same as return type ("struct X") * *elem_id: 0 * *total_nelems: 1 * *type_id: id of type if it's changed within the function, 0 if not */ static const struct btf_type * __btf_resolve_size(const struct btf *btf, const struct btf_type *type, u32 *type_size, const struct btf_type **elem_type, u32 *elem_id, u32 *total_nelems, u32 *type_id) { const struct btf_type *array_type = NULL; const struct btf_array *array = NULL; u32 i, size, nelems = 1, id = 0; for (i = 0; i < MAX_RESOLVE_DEPTH; i++) { switch (BTF_INFO_KIND(type->info)) { /* type->size can be used */ case BTF_KIND_INT: case BTF_KIND_STRUCT: case BTF_KIND_UNION: case BTF_KIND_ENUM: case BTF_KIND_FLOAT: case BTF_KIND_ENUM64: size = type->size; goto resolved; case BTF_KIND_PTR: size = sizeof(void *); goto resolved; /* Modifiers */ case BTF_KIND_TYPEDEF: case BTF_KIND_VOLATILE: case BTF_KIND_CONST: case BTF_KIND_RESTRICT: case BTF_KIND_TYPE_TAG: id = type->type; type = btf_type_by_id(btf, type->type); break; case BTF_KIND_ARRAY: if (!array_type) array_type = type; array = btf_type_array(type); if (nelems && array->nelems > U32_MAX / nelems) return ERR_PTR(-EINVAL); nelems *= array->nelems; type = btf_type_by_id(btf, array->type); break; /* type without size */ default: return ERR_PTR(-EINVAL); } } return ERR_PTR(-EINVAL); resolved: if (nelems && size > U32_MAX / nelems) return ERR_PTR(-EINVAL); *type_size = nelems * size; if (total_nelems) *total_nelems = nelems; if (elem_type) *elem_type = type; if (elem_id) *elem_id = array ? array->type : 0; if (type_id && id) *type_id = id; return array_type ? : type; } const struct btf_type * btf_resolve_size(const struct btf *btf, const struct btf_type *type, u32 *type_size) { return __btf_resolve_size(btf, type, type_size, NULL, NULL, NULL, NULL); } static u32 btf_resolved_type_id(const struct btf *btf, u32 type_id) { while (type_id < btf->start_id) btf = btf->base_btf; return btf->resolved_ids[type_id - btf->start_id]; } /* The input param "type_id" must point to a needs_resolve type */ static const struct btf_type *btf_type_id_resolve(const struct btf *btf, u32 *type_id) { *type_id = btf_resolved_type_id(btf, *type_id); return btf_type_by_id(btf, *type_id); } static u32 btf_resolved_type_size(const struct btf *btf, u32 type_id) { while (type_id < btf->start_id) btf = btf->base_btf; return btf->resolved_sizes[type_id - btf->start_id]; } const struct btf_type *btf_type_id_size(const struct btf *btf, u32 *type_id, u32 *ret_size) { const struct btf_type *size_type; u32 size_type_id = *type_id; u32 size = 0; size_type = btf_type_by_id(btf, size_type_id); if (btf_type_nosize_or_null(size_type)) return NULL; if (btf_type_has_size(size_type)) { size = size_type->size; } else if (btf_type_is_array(size_type)) { size = btf_resolved_type_size(btf, size_type_id); } else if (btf_type_is_ptr(size_type)) { size = sizeof(void *); } else { if (WARN_ON_ONCE(!btf_type_is_modifier(size_type) && !btf_type_is_var(size_type))) return NULL; size_type_id = btf_resolved_type_id(btf, size_type_id); size_type = btf_type_by_id(btf, size_type_id); if (btf_type_nosize_or_null(size_type)) return NULL; else if (btf_type_has_size(size_type)) size = size_type->size; else if (btf_type_is_array(size_type)) size = btf_resolved_type_size(btf, size_type_id); else if (btf_type_is_ptr(size_type)) size = sizeof(void *); else return NULL; } *type_id = size_type_id; if (ret_size) *ret_size = size; return size_type; } static int 
btf_df_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { btf_verifier_log_basic(env, struct_type, "Unsupported check_member"); return -EINVAL; } static int btf_df_check_kflag_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { btf_verifier_log_basic(env, struct_type, "Unsupported check_kflag_member"); return -EINVAL; } /* Used for ptr, array struct/union and float type members. * int, enum and modifier types have their specific callback functions. */ static int btf_generic_check_kflag_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { if (BTF_MEMBER_BITFIELD_SIZE(member->offset)) { btf_verifier_log_member(env, struct_type, member, "Invalid member bitfield_size"); return -EINVAL; } /* bitfield size is 0, so member->offset represents bit offset only. * It is safe to call non kflag check_member variants. */ return btf_type_ops(member_type)->check_member(env, struct_type, member, member_type); } static int btf_df_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { btf_verifier_log_basic(env, v->t, "Unsupported resolve"); return -EINVAL; } static void btf_df_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offsets, struct btf_show *show) { btf_show(show, "<unsupported kind:%u>", BTF_INFO_KIND(t->info)); } static int btf_int_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { u32 int_data = btf_type_int(member_type); u32 struct_bits_off = member->offset; u32 struct_size = struct_type->size; u32 nr_copy_bits; u32 bytes_offset; if (U32_MAX - struct_bits_off < BTF_INT_OFFSET(int_data)) { btf_verifier_log_member(env, struct_type, member, "bits_offset exceeds U32_MAX"); return -EINVAL; } struct_bits_off += BTF_INT_OFFSET(int_data); bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); nr_copy_bits = BTF_INT_BITS(int_data) + BITS_PER_BYTE_MASKED(struct_bits_off); if (nr_copy_bits > BITS_PER_U128) { btf_verifier_log_member(env, struct_type, member, "nr_copy_bits exceeds 128"); return -EINVAL; } if (struct_size < bytes_offset || struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; } return 0; } static int btf_int_check_kflag_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { u32 struct_bits_off, nr_bits, nr_int_data_bits, bytes_offset; u32 int_data = btf_type_int(member_type); u32 struct_size = struct_type->size; u32 nr_copy_bits; /* a regular int type is required for the kflag int member */ if (!btf_type_int_is_regular(member_type)) { btf_verifier_log_member(env, struct_type, member, "Invalid member base type"); return -EINVAL; } /* check sanity of bitfield size */ nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset); struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset); nr_int_data_bits = BTF_INT_BITS(int_data); if (!nr_bits) { /* Not a bitfield member, member offset must be at byte * boundary. 
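 * Illustrative example (hypothetical member, not from the original source):
 * a kflag member described as a plain "int x" with bitfield_size == 0 takes
 * nr_bits from the int type itself (nr_int_data_bits) and its bit offset
 * must be a multiple of BITS_PER_BYTE.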
*/ if (BITS_PER_BYTE_MASKED(struct_bits_off)) { btf_verifier_log_member(env, struct_type, member, "Invalid member offset"); return -EINVAL; } nr_bits = nr_int_data_bits; } else if (nr_bits > nr_int_data_bits) { btf_verifier_log_member(env, struct_type, member, "Invalid member bitfield_size"); return -EINVAL; } bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); nr_copy_bits = nr_bits + BITS_PER_BYTE_MASKED(struct_bits_off); if (nr_copy_bits > BITS_PER_U128) { btf_verifier_log_member(env, struct_type, member, "nr_copy_bits exceeds 128"); return -EINVAL; } if (struct_size < bytes_offset || struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; } return 0; } static s32 btf_int_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { u32 int_data, nr_bits, meta_needed = sizeof(int_data); u16 encoding; if (meta_left < meta_needed) { btf_verifier_log_basic(env, t, "meta_left:%u meta_needed:%u", meta_left, meta_needed); return -EINVAL; } if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; } if (btf_type_kflag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } int_data = btf_type_int(t); if (int_data & ~BTF_INT_MASK) { btf_verifier_log_basic(env, t, "Invalid int_data:%x", int_data); return -EINVAL; } nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data); if (nr_bits > BITS_PER_U128) { btf_verifier_log_type(env, t, "nr_bits exceeds %zu", BITS_PER_U128); return -EINVAL; } if (BITS_ROUNDUP_BYTES(nr_bits) > t->size) { btf_verifier_log_type(env, t, "nr_bits exceeds type_size"); return -EINVAL; } /* * Only one of the encoding bits is allowed and it * should be sufficient for the pretty print purpose (i.e. decoding). * Multiple bits can be allowed later if it is found * to be insufficient. */ encoding = BTF_INT_ENCODING(int_data); if (encoding && encoding != BTF_INT_SIGNED && encoding != BTF_INT_CHAR && encoding != BTF_INT_BOOL) { btf_verifier_log_type(env, t, "Unsupported encoding"); return -ENOTSUPP; } btf_verifier_log_type(env, t, NULL); return meta_needed; } static void btf_int_log(struct btf_verifier_env *env, const struct btf_type *t) { int int_data = btf_type_int(t); btf_verifier_log(env, "size=%u bits_offset=%u nr_bits=%u encoding=%s", t->size, BTF_INT_OFFSET(int_data), BTF_INT_BITS(int_data), btf_int_encoding_str(BTF_INT_ENCODING(int_data))); } static void btf_int128_print(struct btf_show *show, void *data) { /* data points to a __int128 number. 
* Suppose * int128_num = *(__int128 *)data; * The below formulas shows what upper_num and lower_num represents: * upper_num = int128_num >> 64; * lower_num = int128_num & 0xffffffffFFFFFFFFULL; */ u64 upper_num, lower_num; #ifdef __BIG_ENDIAN_BITFIELD upper_num = *(u64 *)data; lower_num = *(u64 *)(data + 8); #else upper_num = *(u64 *)(data + 8); lower_num = *(u64 *)data; #endif if (upper_num == 0) btf_show_type_value(show, "0x%llx", lower_num); else btf_show_type_values(show, "0x%llx%016llx", upper_num, lower_num); } static void btf_int128_shift(u64 *print_num, u16 left_shift_bits, u16 right_shift_bits) { u64 upper_num, lower_num; #ifdef __BIG_ENDIAN_BITFIELD upper_num = print_num[0]; lower_num = print_num[1]; #else upper_num = print_num[1]; lower_num = print_num[0]; #endif /* shake out un-needed bits by shift/or operations */ if (left_shift_bits >= 64) { upper_num = lower_num << (left_shift_bits - 64); lower_num = 0; } else { upper_num = (upper_num << left_shift_bits) | (lower_num >> (64 - left_shift_bits)); lower_num = lower_num << left_shift_bits; } if (right_shift_bits >= 64) { lower_num = upper_num >> (right_shift_bits - 64); upper_num = 0; } else { lower_num = (lower_num >> right_shift_bits) | (upper_num << (64 - right_shift_bits)); upper_num = upper_num >> right_shift_bits; } #ifdef __BIG_ENDIAN_BITFIELD print_num[0] = upper_num; print_num[1] = lower_num; #else print_num[0] = lower_num; print_num[1] = upper_num; #endif } static void btf_bitfield_show(void *data, u8 bits_offset, u8 nr_bits, struct btf_show *show) { u16 left_shift_bits, right_shift_bits; u8 nr_copy_bytes; u8 nr_copy_bits; u64 print_num[2] = {}; nr_copy_bits = nr_bits + bits_offset; nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); memcpy(print_num, data, nr_copy_bytes); #ifdef __BIG_ENDIAN_BITFIELD left_shift_bits = bits_offset; #else left_shift_bits = BITS_PER_U128 - nr_copy_bits; #endif right_shift_bits = BITS_PER_U128 - nr_bits; btf_int128_shift(print_num, left_shift_bits, right_shift_bits); btf_int128_print(show, print_num); } static void btf_int_bits_show(const struct btf *btf, const struct btf_type *t, void *data, u8 bits_offset, struct btf_show *show) { u32 int_data = btf_type_int(t); u8 nr_bits = BTF_INT_BITS(int_data); u8 total_bits_offset; /* * bits_offset is at most 7. * BTF_INT_OFFSET() cannot exceed 128 bits. 
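 * Worked example (hypothetical values, not from the original source): with
 * bits_offset == 3 and BTF_INT_OFFSET(int_data) == 12, total_bits_offset is
 * 15, so data is advanced by BITS_ROUNDDOWN_BYTES(15) == 1 byte and the
 * remaining in-byte offset becomes BITS_PER_BYTE_MASKED(15) == 7.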
*/ total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); data += BITS_ROUNDDOWN_BYTES(total_bits_offset); bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset); btf_bitfield_show(data, bits_offset, nr_bits, show); } static void btf_int_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { u32 int_data = btf_type_int(t); u8 encoding = BTF_INT_ENCODING(int_data); bool sign = encoding & BTF_INT_SIGNED; u8 nr_bits = BTF_INT_BITS(int_data); void *safe_data; safe_data = btf_show_start_type(show, t, type_id, data); if (!safe_data) return; if (bits_offset || BTF_INT_OFFSET(int_data) || BITS_PER_BYTE_MASKED(nr_bits)) { btf_int_bits_show(btf, t, safe_data, bits_offset, show); goto out; } switch (nr_bits) { case 128: btf_int128_print(show, safe_data); break; case 64: if (sign) btf_show_type_value(show, "%lld", *(s64 *)safe_data); else btf_show_type_value(show, "%llu", *(u64 *)safe_data); break; case 32: if (sign) btf_show_type_value(show, "%d", *(s32 *)safe_data); else btf_show_type_value(show, "%u", *(u32 *)safe_data); break; case 16: if (sign) btf_show_type_value(show, "%d", *(s16 *)safe_data); else btf_show_type_value(show, "%u", *(u16 *)safe_data); break; case 8: if (show->state.array_encoding == BTF_INT_CHAR) { /* check for null terminator */ if (show->state.array_terminated) break; if (*(char *)data == '\0') { show->state.array_terminated = 1; break; } if (isprint(*(char *)data)) { btf_show_type_value(show, "'%c'", *(char *)safe_data); break; } } if (sign) btf_show_type_value(show, "%d", *(s8 *)safe_data); else btf_show_type_value(show, "%u", *(u8 *)safe_data); break; default: btf_int_bits_show(btf, t, safe_data, bits_offset, show); break; } out: btf_show_end_type(show); } static const struct btf_kind_operations int_ops = { .check_meta = btf_int_check_meta, .resolve = btf_df_resolve, .check_member = btf_int_check_member, .check_kflag_member = btf_int_check_kflag_member, .log_details = btf_int_log, .show = btf_int_show, }; static int btf_modifier_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { const struct btf_type *resolved_type; u32 resolved_type_id = member->type; struct btf_member resolved_member; struct btf *btf = env->btf; resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL); if (!resolved_type) { btf_verifier_log_member(env, struct_type, member, "Invalid member"); return -EINVAL; } resolved_member = *member; resolved_member.type = resolved_type_id; return btf_type_ops(resolved_type)->check_member(env, struct_type, &resolved_member, resolved_type); } static int btf_modifier_check_kflag_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { const struct btf_type *resolved_type; u32 resolved_type_id = member->type; struct btf_member resolved_member; struct btf *btf = env->btf; resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL); if (!resolved_type) { btf_verifier_log_member(env, struct_type, member, "Invalid member"); return -EINVAL; } resolved_member = *member; resolved_member.type = resolved_type_id; return btf_type_ops(resolved_type)->check_kflag_member(env, struct_type, &resolved_member, resolved_type); } static int btf_ptr_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { u32 struct_size, struct_bits_off, 
bytes_offset; struct_size = struct_type->size; struct_bits_off = member->offset; bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); if (BITS_PER_BYTE_MASKED(struct_bits_off)) { btf_verifier_log_member(env, struct_type, member, "Member is not byte aligned"); return -EINVAL; } if (struct_size - bytes_offset < sizeof(void *)) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; } return 0; } static int btf_ref_type_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { const char *value; if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; } if (btf_type_kflag(t) && !btf_type_is_type_tag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } if (!BTF_TYPE_ID_VALID(t->type)) { btf_verifier_log_type(env, t, "Invalid type_id"); return -EINVAL; } /* typedef/type_tag type must have a valid name, and other ref types, * volatile, const, restrict, should have a null name. */ if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF) { if (!t->name_off || !btf_name_valid_identifier(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } } else if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPE_TAG) { value = btf_name_by_offset(env->btf, t->name_off); if (!value || !value[0]) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } } else { if (t->name_off) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } } btf_verifier_log_type(env, t, NULL); return 0; } static int btf_modifier_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_type *t = v->t; const struct btf_type *next_type; u32 next_type_id = t->type; struct btf *btf = env->btf; next_type = btf_type_by_id(btf, next_type_id); if (!next_type || btf_type_is_resolve_source_only(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } if (!env_type_is_resolve_sink(env, next_type) && !env_type_is_resolved(env, next_type_id)) return env_stack_push(env, next_type, next_type_id); /* Figure out the resolved next_type_id with size. * They will be stored in the current modifier's * resolved_ids and resolved_sizes such that it can * save us a few type-following when we use it later (e.g. in * pretty print). 
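 * Illustrative example (hypothetical types, not from the original source):
 * for "typedef int pid_t" the typedef's resolved_ids slot ends up referring
 * to the underlying int type, so later users such as pretty printing can
 * skip the modifier walk.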
*/ if (!btf_type_id_size(btf, &next_type_id, NULL)) { if (env_type_is_resolved(env, next_type_id)) next_type = btf_type_id_resolve(btf, &next_type_id); /* "typedef void new_void", "const void"...etc */ if (!btf_type_is_void(next_type) && !btf_type_is_fwd(next_type) && !btf_type_is_func_proto(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } } env_stack_pop_resolved(env, next_type_id, 0); return 0; } static int btf_var_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_type *next_type; const struct btf_type *t = v->t; u32 next_type_id = t->type; struct btf *btf = env->btf; next_type = btf_type_by_id(btf, next_type_id); if (!next_type || btf_type_is_resolve_source_only(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } if (!env_type_is_resolve_sink(env, next_type) && !env_type_is_resolved(env, next_type_id)) return env_stack_push(env, next_type, next_type_id); if (btf_type_is_modifier(next_type)) { const struct btf_type *resolved_type; u32 resolved_type_id; resolved_type_id = next_type_id; resolved_type = btf_type_id_resolve(btf, &resolved_type_id); if (btf_type_is_ptr(resolved_type) && !env_type_is_resolve_sink(env, resolved_type) && !env_type_is_resolved(env, resolved_type_id)) return env_stack_push(env, resolved_type, resolved_type_id); } /* We must resolve to something concrete at this point, no * forward types or similar that would resolve to a size of * zero are allowed. */ if (!btf_type_id_size(btf, &next_type_id, NULL)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } env_stack_pop_resolved(env, next_type_id, 0); return 0; } static int btf_ptr_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_type *next_type; const struct btf_type *t = v->t; u32 next_type_id = t->type; struct btf *btf = env->btf; next_type = btf_type_by_id(btf, next_type_id); if (!next_type || btf_type_is_resolve_source_only(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } if (!env_type_is_resolve_sink(env, next_type) && !env_type_is_resolved(env, next_type_id)) return env_stack_push(env, next_type, next_type_id); /* If the modifier was RESOLVED during RESOLVE_STRUCT_OR_ARRAY, * the modifier may have stopped resolving when it was resolved * to a ptr (last-resolved-ptr). * * We now need to continue from the last-resolved-ptr to * ensure the last-resolved-ptr does not refer back to * the current ptr (t).
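 * Illustrative example (hypothetical types, not from the original source):
 * given
 *	typedef struct foo *foo_ptr;
 *	foo_ptr *p;
 * resolving the outer pointer reaches the typedef (a modifier) whose
 * resolved type is itself a pointer; that resolved pointer is pushed so
 * resolution continues from it rather than from the typedef.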
*/ if (btf_type_is_modifier(next_type)) { const struct btf_type *resolved_type; u32 resolved_type_id; resolved_type_id = next_type_id; resolved_type = btf_type_id_resolve(btf, &resolved_type_id); if (btf_type_is_ptr(resolved_type) && !env_type_is_resolve_sink(env, resolved_type) && !env_type_is_resolved(env, resolved_type_id)) return env_stack_push(env, resolved_type, resolved_type_id); } if (!btf_type_id_size(btf, &next_type_id, NULL)) { if (env_type_is_resolved(env, next_type_id)) next_type = btf_type_id_resolve(btf, &next_type_id); if (!btf_type_is_void(next_type) && !btf_type_is_fwd(next_type) && !btf_type_is_func_proto(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } } env_stack_pop_resolved(env, next_type_id, 0); return 0; } static void btf_modifier_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { if (btf->resolved_ids) t = btf_type_id_resolve(btf, &type_id); else t = btf_type_skip_modifiers(btf, type_id, NULL); btf_type_ops(t)->show(btf, t, type_id, data, bits_offset, show); } static void btf_var_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { t = btf_type_id_resolve(btf, &type_id); btf_type_ops(t)->show(btf, t, type_id, data, bits_offset, show); } static void btf_ptr_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { void *safe_data; safe_data = btf_show_start_type(show, t, type_id, data); if (!safe_data) return; /* It is a hashed value unless BTF_SHOW_PTR_RAW is specified */ if (show->flags & BTF_SHOW_PTR_RAW) btf_show_type_value(show, "0x%px", *(void **)safe_data); else btf_show_type_value(show, "0x%p", *(void **)safe_data); btf_show_end_type(show); } static void btf_ref_type_log(struct btf_verifier_env *env, const struct btf_type *t) { btf_verifier_log(env, "type_id=%u", t->type); } static const struct btf_kind_operations modifier_ops = { .check_meta = btf_ref_type_check_meta, .resolve = btf_modifier_resolve, .check_member = btf_modifier_check_member, .check_kflag_member = btf_modifier_check_kflag_member, .log_details = btf_ref_type_log, .show = btf_modifier_show, }; static const struct btf_kind_operations ptr_ops = { .check_meta = btf_ref_type_check_meta, .resolve = btf_ptr_resolve, .check_member = btf_ptr_check_member, .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_ref_type_log, .show = btf_ptr_show, }; static s32 btf_fwd_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; } if (t->type) { btf_verifier_log_type(env, t, "type != 0"); return -EINVAL; } /* fwd type must have a valid name */ if (!t->name_off || !btf_name_valid_identifier(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); return 0; } static void btf_fwd_type_log(struct btf_verifier_env *env, const struct btf_type *t) { btf_verifier_log(env, "%s", btf_type_kflag(t) ? 
"union" : "struct"); } static const struct btf_kind_operations fwd_ops = { .check_meta = btf_fwd_check_meta, .resolve = btf_df_resolve, .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_fwd_type_log, .show = btf_df_show, }; static int btf_array_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { u32 struct_bits_off = member->offset; u32 struct_size, bytes_offset; u32 array_type_id, array_size; struct btf *btf = env->btf; if (BITS_PER_BYTE_MASKED(struct_bits_off)) { btf_verifier_log_member(env, struct_type, member, "Member is not byte aligned"); return -EINVAL; } array_type_id = member->type; btf_type_id_size(btf, &array_type_id, &array_size); struct_size = struct_type->size; bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); if (struct_size - bytes_offset < array_size) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; } return 0; } static s32 btf_array_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { const struct btf_array *array = btf_type_array(t); u32 meta_needed = sizeof(*array); if (meta_left < meta_needed) { btf_verifier_log_basic(env, t, "meta_left:%u meta_needed:%u", meta_left, meta_needed); return -EINVAL; } /* array type should not have a name */ if (t->name_off) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; } if (btf_type_kflag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } if (t->size) { btf_verifier_log_type(env, t, "size != 0"); return -EINVAL; } /* Array elem type and index type cannot be in type void, * so !array->type and !array->index_type are not allowed. 
*/ if (!array->type || !BTF_TYPE_ID_VALID(array->type)) { btf_verifier_log_type(env, t, "Invalid elem"); return -EINVAL; } if (!array->index_type || !BTF_TYPE_ID_VALID(array->index_type)) { btf_verifier_log_type(env, t, "Invalid index"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); return meta_needed; } static int btf_array_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_array *array = btf_type_array(v->t); const struct btf_type *elem_type, *index_type; u32 elem_type_id, index_type_id; struct btf *btf = env->btf; u32 elem_size; /* Check array->index_type */ index_type_id = array->index_type; index_type = btf_type_by_id(btf, index_type_id); if (btf_type_nosize_or_null(index_type) || btf_type_is_resolve_source_only(index_type)) { btf_verifier_log_type(env, v->t, "Invalid index"); return -EINVAL; } if (!env_type_is_resolve_sink(env, index_type) && !env_type_is_resolved(env, index_type_id)) return env_stack_push(env, index_type, index_type_id); index_type = btf_type_id_size(btf, &index_type_id, NULL); if (!index_type || !btf_type_is_int(index_type) || !btf_type_int_is_regular(index_type)) { btf_verifier_log_type(env, v->t, "Invalid index"); return -EINVAL; } /* Check array->type */ elem_type_id = array->type; elem_type = btf_type_by_id(btf, elem_type_id); if (btf_type_nosize_or_null(elem_type) || btf_type_is_resolve_source_only(elem_type)) { btf_verifier_log_type(env, v->t, "Invalid elem"); return -EINVAL; } if (!env_type_is_resolve_sink(env, elem_type) && !env_type_is_resolved(env, elem_type_id)) return env_stack_push(env, elem_type, elem_type_id); elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); if (!elem_type) { btf_verifier_log_type(env, v->t, "Invalid elem"); return -EINVAL; } if (btf_type_is_int(elem_type) && !btf_type_int_is_regular(elem_type)) { btf_verifier_log_type(env, v->t, "Invalid array of int"); return -EINVAL; } if (array->nelems && elem_size > U32_MAX / array->nelems) { btf_verifier_log_type(env, v->t, "Array size overflows U32_MAX"); return -EINVAL; } env_stack_pop_resolved(env, elem_type_id, elem_size * array->nelems); return 0; } static void btf_array_log(struct btf_verifier_env *env, const struct btf_type *t) { const struct btf_array *array = btf_type_array(t); btf_verifier_log(env, "type_id=%u index_type_id=%u nr_elems=%u", array->type, array->index_type, array->nelems); } static void __btf_array_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { const struct btf_array *array = btf_type_array(t); const struct btf_kind_operations *elem_ops; const struct btf_type *elem_type; u32 i, elem_size = 0, elem_type_id; u16 encoding = 0; elem_type_id = array->type; elem_type = btf_type_skip_modifiers(btf, elem_type_id, NULL); if (elem_type && btf_type_has_size(elem_type)) elem_size = elem_type->size; if (elem_type && btf_type_is_int(elem_type)) { u32 int_type = btf_type_int(elem_type); encoding = BTF_INT_ENCODING(int_type); /* * BTF_INT_CHAR encoding never seems to be set for * char arrays, so if size is 1 and element is * printable as a char, we'll do that. 
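 * Illustrative example (hypothetical data, not from the original source):
 * a member "char name[8]" holding "tcp" followed by a NUL is shown as the
 * characters 't', 'c', 'p', with array_terminated stopping the walk at the
 * terminator.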
*/ if (elem_size == 1) encoding = BTF_INT_CHAR; } if (!btf_show_start_array_type(show, t, type_id, encoding, data)) return; if (!elem_type) goto out; elem_ops = btf_type_ops(elem_type); for (i = 0; i < array->nelems; i++) { btf_show_start_array_member(show); elem_ops->show(btf, elem_type, elem_type_id, data, bits_offset, show); data += elem_size; btf_show_end_array_member(show); if (show->state.array_terminated) break; } out: btf_show_end_array_type(show); } static void btf_array_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { const struct btf_member *m = show->state.member; /* * First check if any members would be shown (are non-zero). * See comments above "struct btf_show" definition for more * details on how this works at a high-level. */ if (show->state.depth > 0 && !(show->flags & BTF_SHOW_ZERO)) { if (!show->state.depth_check) { show->state.depth_check = show->state.depth + 1; show->state.depth_to_show = 0; } __btf_array_show(btf, t, type_id, data, bits_offset, show); show->state.member = m; if (show->state.depth_check != show->state.depth + 1) return; show->state.depth_check = 0; if (show->state.depth_to_show <= show->state.depth) return; /* * Reaching here indicates we have recursed and found * non-zero array member(s). */ } __btf_array_show(btf, t, type_id, data, bits_offset, show); } static const struct btf_kind_operations array_ops = { .check_meta = btf_array_check_meta, .resolve = btf_array_resolve, .check_member = btf_array_check_member, .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_array_log, .show = btf_array_show, }; static int btf_struct_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { u32 struct_bits_off = member->offset; u32 struct_size, bytes_offset; if (BITS_PER_BYTE_MASKED(struct_bits_off)) { btf_verifier_log_member(env, struct_type, member, "Member is not byte aligned"); return -EINVAL; } struct_size = struct_type->size; bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); if (struct_size - bytes_offset < member_type->size) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; } return 0; } static s32 btf_struct_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { bool is_union = BTF_INFO_KIND(t->info) == BTF_KIND_UNION; const struct btf_member *member; u32 meta_needed, last_offset; struct btf *btf = env->btf; u32 struct_size = t->size; u32 offset; u16 i; meta_needed = btf_type_vlen(t) * sizeof(*member); if (meta_left < meta_needed) { btf_verifier_log_basic(env, t, "meta_left:%u meta_needed:%u", meta_left, meta_needed); return -EINVAL; } /* struct type either no name or a valid one */ if (t->name_off && !btf_name_valid_identifier(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); last_offset = 0; for_each_member(i, t, member) { if (!btf_name_offset_valid(btf, member->name_off)) { btf_verifier_log_member(env, t, member, "Invalid member name_offset:%u", member->name_off); return -EINVAL; } /* struct member either no name or a valid one */ if (member->name_off && !btf_name_valid_identifier(btf, member->name_off)) { btf_verifier_log_member(env, t, member, "Invalid name"); return -EINVAL; } /* A member cannot be in type void */ if (!member->type || !BTF_TYPE_ID_VALID(member->type)) { btf_verifier_log_member(env, t, member, 
"Invalid type_id"); return -EINVAL; } offset = __btf_member_bit_offset(t, member); if (is_union && offset) { btf_verifier_log_member(env, t, member, "Invalid member bits_offset"); return -EINVAL; } /* * ">" instead of ">=" because the last member could be * "char a[0];" */ if (last_offset > offset) { btf_verifier_log_member(env, t, member, "Invalid member bits_offset"); return -EINVAL; } if (BITS_ROUNDUP_BYTES(offset) > struct_size) { btf_verifier_log_member(env, t, member, "Member bits_offset exceeds its struct size"); return -EINVAL; } btf_verifier_log_member(env, t, member, NULL); last_offset = offset; } return meta_needed; } static int btf_struct_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_member *member; int err; u16 i; /* Before continue resolving the next_member, * ensure the last member is indeed resolved to a * type with size info. */ if (v->next_member) { const struct btf_type *last_member_type; const struct btf_member *last_member; u32 last_member_type_id; last_member = btf_type_member(v->t) + v->next_member - 1; last_member_type_id = last_member->type; if (WARN_ON_ONCE(!env_type_is_resolved(env, last_member_type_id))) return -EINVAL; last_member_type = btf_type_by_id(env->btf, last_member_type_id); if (btf_type_kflag(v->t)) err = btf_type_ops(last_member_type)->check_kflag_member(env, v->t, last_member, last_member_type); else err = btf_type_ops(last_member_type)->check_member(env, v->t, last_member, last_member_type); if (err) return err; } for_each_member_from(i, v->next_member, v->t, member) { u32 member_type_id = member->type; const struct btf_type *member_type = btf_type_by_id(env->btf, member_type_id); if (btf_type_nosize_or_null(member_type) || btf_type_is_resolve_source_only(member_type)) { btf_verifier_log_member(env, v->t, member, "Invalid member"); return -EINVAL; } if (!env_type_is_resolve_sink(env, member_type) && !env_type_is_resolved(env, member_type_id)) { env_stack_set_next_member(env, i + 1); return env_stack_push(env, member_type, member_type_id); } if (btf_type_kflag(v->t)) err = btf_type_ops(member_type)->check_kflag_member(env, v->t, member, member_type); else err = btf_type_ops(member_type)->check_member(env, v->t, member, member_type); if (err) return err; } env_stack_pop_resolved(env, 0, 0); return 0; } static void btf_struct_log(struct btf_verifier_env *env, const struct btf_type *t) { btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } enum { BTF_FIELD_IGNORE = 0, BTF_FIELD_FOUND = 1, }; struct btf_field_info { enum btf_field_type type; u32 off; union { struct { u32 type_id; } kptr; struct { const char *node_name; u32 value_btf_id; } graph_root; }; }; static int btf_find_struct(const struct btf *btf, const struct btf_type *t, u32 off, int sz, enum btf_field_type field_type, struct btf_field_info *info) { if (!__btf_type_is_struct(t)) return BTF_FIELD_IGNORE; if (t->size != sz) return BTF_FIELD_IGNORE; info->type = field_type; info->off = off; return BTF_FIELD_FOUND; } static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, u32 off, int sz, struct btf_field_info *info, u32 field_mask) { enum btf_field_type type; const char *tag_value; bool is_type_tag; u32 res_id; /* Permit modifiers on the pointer itself */ if (btf_type_is_volatile(t)) t = btf_type_by_id(btf, t->type); /* For PTR, sz is always == 8 */ if (!btf_type_is_ptr(t)) return BTF_FIELD_IGNORE; t = btf_type_by_id(btf, t->type); is_type_tag = btf_type_is_type_tag(t) && !btf_type_kflag(t); if (!is_type_tag) return 
BTF_FIELD_IGNORE; /* Reject extra tags */ if (btf_type_is_type_tag(btf_type_by_id(btf, t->type))) return -EINVAL; tag_value = __btf_name_by_offset(btf, t->name_off); if (!strcmp("kptr_untrusted", tag_value)) type = BPF_KPTR_UNREF; else if (!strcmp("kptr", tag_value)) type = BPF_KPTR_REF; else if (!strcmp("percpu_kptr", tag_value)) type = BPF_KPTR_PERCPU; else if (!strcmp("uptr", tag_value)) type = BPF_UPTR; else return -EINVAL; if (!(type & field_mask)) return BTF_FIELD_IGNORE; /* Get the base type */ t = btf_type_skip_modifiers(btf, t->type, &res_id); /* Only pointer to struct is allowed */ if (!__btf_type_is_struct(t)) return -EINVAL; info->type = type; info->off = off; info->kptr.type_id = res_id; return BTF_FIELD_FOUND; } int btf_find_next_decl_tag(const struct btf *btf, const struct btf_type *pt, int comp_idx, const char *tag_key, int last_id) { int len = strlen(tag_key); int i, n; for (i = last_id + 1, n = btf_nr_types(btf); i < n; i++) { const struct btf_type *t = btf_type_by_id(btf, i); if (!btf_type_is_decl_tag(t)) continue; if (pt != btf_type_by_id(btf, t->type)) continue; if (btf_type_decl_tag(t)->component_idx != comp_idx) continue; if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len)) continue; return i; } return -ENOENT; } const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt, int comp_idx, const char *tag_key) { const char *value = NULL; const struct btf_type *t; int len, id; id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key, btf_named_start_id(btf, false) - 1); if (id < 0) return ERR_PTR(id); t = btf_type_by_id(btf, id); len = strlen(tag_key); value = __btf_name_by_offset(btf, t->name_off) + len; /* Prevent duplicate entries for same type */ id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key, id); if (id >= 0) return ERR_PTR(-EEXIST); return value; } static int btf_find_graph_root(const struct btf *btf, const struct btf_type *pt, const struct btf_type *t, int comp_idx, u32 off, int sz, struct btf_field_info *info, enum btf_field_type head_type) { const char *node_field_name; const char *value_type; s32 id; if (!__btf_type_is_struct(t)) return BTF_FIELD_IGNORE; if (t->size != sz) return BTF_FIELD_IGNORE; value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:"); if (IS_ERR(value_type)) return -EINVAL; node_field_name = strstr(value_type, ":"); if (!node_field_name) return -EINVAL; value_type = kstrndup(value_type, node_field_name - value_type, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!value_type) return -ENOMEM; id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT); kfree(value_type); if (id < 0) return id; node_field_name++; if (str_is_empty(node_field_name)) return -EINVAL; info->type = head_type; info->off = off; info->graph_root.value_btf_id = id; info->graph_root.node_name = node_field_name; return BTF_FIELD_FOUND; } static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_type, u32 field_mask, u32 *seen_mask, int *align, int *sz) { const struct { enum btf_field_type type; const char *const name; const bool is_unique; } field_types[] = { { BPF_SPIN_LOCK, "bpf_spin_lock", true }, { BPF_RES_SPIN_LOCK, "bpf_res_spin_lock", true }, { BPF_TIMER, "bpf_timer", true }, { BPF_WORKQUEUE, "bpf_wq", true }, { BPF_TASK_WORK, "bpf_task_work", true }, { BPF_LIST_HEAD, "bpf_list_head", false }, { BPF_LIST_NODE, "bpf_list_node", false }, { BPF_RB_ROOT, "bpf_rb_root", false }, { BPF_RB_NODE, "bpf_rb_node", false }, { BPF_REFCOUNT, "bpf_refcount", false }, }; int type = 0, i; const char *name = 
__btf_name_by_offset(btf, var_type->name_off); const char *field_type_name; enum btf_field_type field_type; bool is_unique; for (i = 0; i < ARRAY_SIZE(field_types); ++i) { field_type = field_types[i].type; field_type_name = field_types[i].name; is_unique = field_types[i].is_unique; if (!(field_mask & field_type) || strcmp(name, field_type_name)) continue; if (is_unique) { if (*seen_mask & field_type) return -E2BIG; *seen_mask |= field_type; } type = field_type; goto end; } /* Only return BPF_KPTR when all other types with matchable names fail */ if (field_mask & (BPF_KPTR | BPF_UPTR) && !__btf_type_is_struct(var_type)) { type = BPF_KPTR_REF; goto end; } return 0; end: *sz = btf_field_type_size(type); *align = btf_field_type_align(type); return type; } /* Repeat a number of fields for a specified number of times. * * Copy the fields starting from the first field and repeat them for * repeat_cnt times. The fields are repeated by adding the offset of each * field with * (i + 1) * elem_size * where i is the repeat index and elem_size is the size of an element. */ static int btf_repeat_fields(struct btf_field_info *info, int info_cnt, u32 field_cnt, u32 repeat_cnt, u32 elem_size) { u32 i, j; u32 cur; /* Ensure not repeating fields that should not be repeated. */ for (i = 0; i < field_cnt; i++) { switch (info[i].type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: case BPF_LIST_HEAD: case BPF_RB_ROOT: break; default: return -EINVAL; } } /* The type of struct size or variable size is u32, * so the multiplication will not overflow. */ if (field_cnt * (repeat_cnt + 1) > info_cnt) return -E2BIG; cur = field_cnt; for (i = 0; i < repeat_cnt; i++) { memcpy(&info[cur], &info[0], field_cnt * sizeof(info[0])); for (j = 0; j < field_cnt; j++) info[cur++].off += (i + 1) * elem_size; } return 0; } static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t, u32 field_mask, struct btf_field_info *info, int info_cnt, u32 level); /* Find special fields in the struct type of a field. * * This function is used to find fields of special types that is not a * global variable or a direct field of a struct type. It also handles the * repetition if it is the element type of an array. */ static int btf_find_nested_struct(const struct btf *btf, const struct btf_type *t, u32 off, u32 nelems, u32 field_mask, struct btf_field_info *info, int info_cnt, u32 level) { int ret, err, i; level++; if (level >= MAX_RESOLVE_DEPTH) return -E2BIG; ret = btf_find_struct_field(btf, t, field_mask, info, info_cnt, level); if (ret <= 0) return ret; /* Shift the offsets of the nested struct fields to the offsets * related to the container. */ for (i = 0; i < ret; i++) info[i].off += off; if (nelems > 1) { err = btf_repeat_fields(info, info_cnt, ret, nelems - 1, t->size); if (err == 0) ret *= nelems; else ret = err; } return ret; } static int btf_find_field_one(const struct btf *btf, const struct btf_type *var, const struct btf_type *var_type, int var_idx, u32 off, u32 expected_size, u32 field_mask, u32 *seen_mask, struct btf_field_info *info, int info_cnt, u32 level) { int ret, align, sz, field_type; struct btf_field_info tmp; const struct btf_array *array; u32 i, nelems = 1; /* Walk into array types to find the element type and the number of * elements in the (flattened) array. 
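 * Illustrative example (hypothetical field, not from the original source):
 * for a member declared as
 *	struct bpf_list_node nodes[2][3];
 * the loop below leaves var_type pointing at struct bpf_list_node and
 * nelems == 6.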
*/ for (i = 0; i < MAX_RESOLVE_DEPTH && btf_type_is_array(var_type); i++) { array = btf_array(var_type); nelems *= array->nelems; var_type = btf_type_by_id(btf, array->type); } if (i == MAX_RESOLVE_DEPTH) return -E2BIG; if (nelems == 0) return 0; field_type = btf_get_field_type(btf, var_type, field_mask, seen_mask, &align, &sz); /* Look into variables of struct types */ if (!field_type && __btf_type_is_struct(var_type)) { sz = var_type->size; if (expected_size && expected_size != sz * nelems) return 0; ret = btf_find_nested_struct(btf, var_type, off, nelems, field_mask, &info[0], info_cnt, level); return ret; } if (field_type == 0) return 0; if (field_type < 0) return field_type; if (expected_size && expected_size != sz * nelems) return 0; if (off % align) return 0; switch (field_type) { case BPF_SPIN_LOCK: case BPF_RES_SPIN_LOCK: case BPF_TIMER: case BPF_WORKQUEUE: case BPF_LIST_NODE: case BPF_RB_NODE: case BPF_REFCOUNT: case BPF_TASK_WORK: ret = btf_find_struct(btf, var_type, off, sz, field_type, info_cnt ? &info[0] : &tmp); if (ret < 0) return ret; break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: ret = btf_find_kptr(btf, var_type, off, sz, info_cnt ? &info[0] : &tmp, field_mask); if (ret < 0) return ret; break; case BPF_LIST_HEAD: case BPF_RB_ROOT: ret = btf_find_graph_root(btf, var, var_type, var_idx, off, sz, info_cnt ? &info[0] : &tmp, field_type); if (ret < 0) return ret; break; default: return -EFAULT; } if (ret == BTF_FIELD_IGNORE) return 0; if (!info_cnt) return -E2BIG; if (nelems > 1) { ret = btf_repeat_fields(info, info_cnt, 1, nelems - 1, sz); if (ret < 0) return ret; } return nelems; } static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t, u32 field_mask, struct btf_field_info *info, int info_cnt, u32 level) { int ret, idx = 0; const struct btf_member *member; u32 i, off, seen_mask = 0; for_each_member(i, t, member) { const struct btf_type *member_type = btf_type_by_id(btf, member->type); off = __btf_member_bit_offset(t, member); if (off % 8) /* valid C code cannot generate such BTF */ return -EINVAL; off /= 8; ret = btf_find_field_one(btf, t, member_type, i, off, 0, field_mask, &seen_mask, &info[idx], info_cnt - idx, level); if (ret < 0) return ret; idx += ret; } return idx; } static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, u32 field_mask, struct btf_field_info *info, int info_cnt, u32 level) { int ret, idx = 0; const struct btf_var_secinfo *vsi; u32 i, off, seen_mask = 0; for_each_vsi(i, t, vsi) { const struct btf_type *var = btf_type_by_id(btf, vsi->type); const struct btf_type *var_type = btf_type_by_id(btf, var->type); off = vsi->offset; ret = btf_find_field_one(btf, var, var_type, -1, off, vsi->size, field_mask, &seen_mask, &info[idx], info_cnt - idx, level); if (ret < 0) return ret; idx += ret; } return idx; } static int btf_find_field(const struct btf *btf, const struct btf_type *t, u32 field_mask, struct btf_field_info *info, int info_cnt) { if (__btf_type_is_struct(t)) return btf_find_struct_field(btf, t, field_mask, info, info_cnt, 0); else if (btf_type_is_datasec(t)) return btf_find_datasec_var(btf, t, field_mask, info, info_cnt, 0); return -EINVAL; } /* Callers have to ensure the life cycle of btf if it is program BTF */ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, struct btf_field_info *info) { struct module *mod = NULL; const struct btf_type *t; /* If a matching btf type is found in kernel or module BTFs, kptr_ref * is that BTF, otherwise 
it's program BTF */ struct btf *kptr_btf; int ret; s32 id; /* Find type in map BTF, and use it to look up the matching type * in vmlinux or module BTFs, by name and kind. */ t = btf_type_by_id(btf, info->kptr.type_id); id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info), &kptr_btf); if (id == -ENOENT) { /* btf_parse_kptr should only be called w/ btf = program BTF */ WARN_ON_ONCE(btf_is_kernel(btf)); /* Type exists only in program BTF. Assume that it's a MEM_ALLOC * kptr allocated via bpf_obj_new */ field->kptr.dtor = NULL; id = info->kptr.type_id; kptr_btf = (struct btf *)btf; goto found_dtor; } if (id < 0) return id; /* Find and stash the function pointer for the destruction function that * needs to be eventually invoked from the map free path. */ if (info->type == BPF_KPTR_REF) { const struct btf_type *dtor_func; const char *dtor_func_name; unsigned long addr; s32 dtor_btf_id; /* This call also serves as a whitelist of allowed objects that * can be used as a referenced pointer and be stored in a map at * the same time. */ dtor_btf_id = btf_find_dtor_kfunc(kptr_btf, id); if (dtor_btf_id < 0) { ret = dtor_btf_id; goto end_btf; } dtor_func = btf_type_by_id(kptr_btf, dtor_btf_id); if (!dtor_func) { ret = -ENOENT; goto end_btf; } if (btf_is_module(kptr_btf)) { mod = btf_try_get_module(kptr_btf); if (!mod) { ret = -ENXIO; goto end_btf; } } /* We already verified dtor_func to be btf_type_is_func * in register_btf_id_dtor_kfuncs. */ dtor_func_name = __btf_name_by_offset(kptr_btf, dtor_func->name_off); addr = kallsyms_lookup_name(dtor_func_name); if (!addr) { ret = -EINVAL; goto end_mod; } field->kptr.dtor = (void *)addr; } found_dtor: field->kptr.btf_id = id; field->kptr.btf = kptr_btf; field->kptr.module = mod; return 0; end_mod: module_put(mod); end_btf: btf_put(kptr_btf); return ret; } static int btf_parse_graph_root(const struct btf *btf, struct btf_field *field, struct btf_field_info *info, const char *node_type_name, size_t node_type_align) { const struct btf_type *t, *n = NULL; const struct btf_member *member; u32 offset; int i; t = btf_type_by_id(btf, info->graph_root.value_btf_id); /* We've already checked that value_btf_id is a struct type. We * just need to figure out the offset of the list_node, and * verify its type. 
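 *
 * As an illustration only (struct foo below is hypothetical, not a type
 * the kernel defines), for a value type like
 *
 *	struct foo {
 *		long data;
 *		struct bpf_rb_node node;
 *	};
 *
 * with node_name "node" and node_type_name "bpf_rb_node", the loop below
 * finds the "node" member, verifies that it really is a struct bpf_rb_node
 * with suitable alignment, and records its byte offset in
 * field->graph_root.node_offset.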
*/ for_each_member(i, t, member) { if (strcmp(info->graph_root.node_name, __btf_name_by_offset(btf, member->name_off))) continue; /* Invalid BTF, two members with same name */ if (n) return -EINVAL; n = btf_type_by_id(btf, member->type); if (!__btf_type_is_struct(n)) return -EINVAL; if (strcmp(node_type_name, __btf_name_by_offset(btf, n->name_off))) return -EINVAL; offset = __btf_member_bit_offset(n, member); if (offset % 8) return -EINVAL; offset /= 8; if (offset % node_type_align) return -EINVAL; field->graph_root.btf = (struct btf *)btf; field->graph_root.value_btf_id = info->graph_root.value_btf_id; field->graph_root.node_offset = offset; } if (!n) return -ENOENT; return 0; } static int btf_parse_list_head(const struct btf *btf, struct btf_field *field, struct btf_field_info *info) { return btf_parse_graph_root(btf, field, info, "bpf_list_node", __alignof__(struct bpf_list_node)); } static int btf_parse_rb_root(const struct btf *btf, struct btf_field *field, struct btf_field_info *info) { return btf_parse_graph_root(btf, field, info, "bpf_rb_node", __alignof__(struct bpf_rb_node)); } static int btf_field_cmp(const void *_a, const void *_b, const void *priv) { const struct btf_field *a = (const struct btf_field *)_a; const struct btf_field *b = (const struct btf_field *)_b; if (a->offset < b->offset) return -1; else if (a->offset > b->offset) return 1; return 0; } struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, u32 field_mask, u32 value_size) { struct btf_field_info info_arr[BTF_FIELDS_MAX]; u32 next_off = 0, field_type_size; struct btf_record *rec; int ret, i, cnt; ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr)); if (ret < 0) return ERR_PTR(ret); if (!ret) return NULL; cnt = ret; /* This needs to be kzalloc to zero out padding and unused fields, see * comment in btf_record_equal. 
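 * Roughly speaking, btf_record_equal() compares two records bytewise, so
 * any non-zero padding left behind by a plain kmalloc could make two
 * otherwise identical records compare as different.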
*/ rec = kzalloc_flex(*rec, fields, cnt, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!rec) return ERR_PTR(-ENOMEM); rec->spin_lock_off = -EINVAL; rec->res_spin_lock_off = -EINVAL; rec->timer_off = -EINVAL; rec->wq_off = -EINVAL; rec->refcount_off = -EINVAL; rec->task_work_off = -EINVAL; for (i = 0; i < cnt; i++) { field_type_size = btf_field_type_size(info_arr[i].type); if (info_arr[i].off + field_type_size > value_size) { WARN_ONCE(1, "verifier bug off %d size %d", info_arr[i].off, value_size); ret = -EFAULT; goto end; } if (info_arr[i].off < next_off) { ret = -EEXIST; goto end; } next_off = info_arr[i].off + field_type_size; rec->field_mask |= info_arr[i].type; rec->fields[i].offset = info_arr[i].off; rec->fields[i].type = info_arr[i].type; rec->fields[i].size = field_type_size; switch (info_arr[i].type) { case BPF_SPIN_LOCK: WARN_ON_ONCE(rec->spin_lock_off >= 0); /* Cache offset for faster lookup at runtime */ rec->spin_lock_off = rec->fields[i].offset; break; case BPF_RES_SPIN_LOCK: WARN_ON_ONCE(rec->spin_lock_off >= 0); /* Cache offset for faster lookup at runtime */ rec->res_spin_lock_off = rec->fields[i].offset; break; case BPF_TIMER: WARN_ON_ONCE(rec->timer_off >= 0); /* Cache offset for faster lookup at runtime */ rec->timer_off = rec->fields[i].offset; break; case BPF_WORKQUEUE: WARN_ON_ONCE(rec->wq_off >= 0); /* Cache offset for faster lookup at runtime */ rec->wq_off = rec->fields[i].offset; break; case BPF_TASK_WORK: WARN_ON_ONCE(rec->task_work_off >= 0); rec->task_work_off = rec->fields[i].offset; break; case BPF_REFCOUNT: WARN_ON_ONCE(rec->refcount_off >= 0); /* Cache offset for faster lookup at runtime */ rec->refcount_off = rec->fields[i].offset; break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]); if (ret < 0) goto end; break; case BPF_LIST_HEAD: ret = btf_parse_list_head(btf, &rec->fields[i], &info_arr[i]); if (ret < 0) goto end; break; case BPF_RB_ROOT: ret = btf_parse_rb_root(btf, &rec->fields[i], &info_arr[i]); if (ret < 0) goto end; break; case BPF_LIST_NODE: case BPF_RB_NODE: break; default: ret = -EFAULT; goto end; } rec->cnt++; } if (rec->spin_lock_off >= 0 && rec->res_spin_lock_off >= 0) { ret = -EINVAL; goto end; } /* bpf_{list_head, rb_node} require bpf_spin_lock */ if ((btf_record_has_field(rec, BPF_LIST_HEAD) || btf_record_has_field(rec, BPF_RB_ROOT)) && (rec->spin_lock_off < 0 && rec->res_spin_lock_off < 0)) { ret = -EINVAL; goto end; } if (rec->refcount_off < 0 && btf_record_has_field(rec, BPF_LIST_NODE) && btf_record_has_field(rec, BPF_RB_NODE)) { ret = -EINVAL; goto end; } sort_r(rec->fields, rec->cnt, sizeof(struct btf_field), btf_field_cmp, NULL, rec); return rec; end: btf_record_free(rec); return ERR_PTR(ret); } int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) { int i; /* There are three types that signify ownership of some other type: * kptr_ref, bpf_list_head, bpf_rb_root. * kptr_ref only supports storing kernel types, which can't store * references to program allocated local types. * * Hence we only need to ensure that bpf_{list_head,rb_root} ownership * does not form cycles. 
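 *
 * As an illustration only (the struct below is hypothetical), a type such as
 *
 *	struct b { struct bpf_rb_node node; struct bpf_rb_root root; };
 *
 * is both a node and a root; the value type stored under b's root must
 * therefore not be a root type itself, otherwise a chain of such types
 * could close back on itself. The -ELOOP check below enforces this.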
*/ if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & (BPF_GRAPH_ROOT | BPF_UPTR))) return 0; for (i = 0; i < rec->cnt; i++) { struct btf_struct_meta *meta; const struct btf_type *t; u32 btf_id; if (rec->fields[i].type == BPF_UPTR) { /* The uptr only supports pinning one page and cannot * point to a kernel struct */ if (btf_is_kernel(rec->fields[i].kptr.btf)) return -EINVAL; t = btf_type_by_id(rec->fields[i].kptr.btf, rec->fields[i].kptr.btf_id); if (!t->size) return -EINVAL; if (t->size > PAGE_SIZE) return -E2BIG; continue; } if (!(rec->fields[i].type & BPF_GRAPH_ROOT)) continue; btf_id = rec->fields[i].graph_root.value_btf_id; meta = btf_find_struct_meta(btf, btf_id); if (!meta) return -EFAULT; rec->fields[i].graph_root.value_rec = meta->record; /* We need to set value_rec for all root types, but no need * to check ownership cycle for a type unless it's also a * node type. */ if (!(rec->field_mask & BPF_GRAPH_NODE)) continue; /* We need to ensure ownership acyclicity among all types. The * proper way to do it would be to topologically sort all BTF * IDs based on the ownership edges, since there can be multiple * bpf_{list_head,rb_node} in a type. Instead, we use the * following reasoning: * * - A type can only be owned by another type in user BTF if it * has a bpf_{list,rb}_node. Let's call these node types. * - A type can only _own_ another type in user BTF if it has a * bpf_{list_head,rb_root}. Let's call these root types. * * We ensure that if a type is both a root and node, its * element types cannot be root types. * * To ensure acyclicity: * * When A is a root type but not a node, its ownership * chain can be: * A -> B -> C * Where: * - A is a root, e.g. has bpf_rb_root. * - B is both a root and node, e.g. has bpf_rb_node and * bpf_list_head. * - C is only a node, e.g. has bpf_list_node. * * When A is both a root and node, some other type already * owns it in the BTF domain, hence it cannot own * another root type through any of the ownership edges. * A -> B * Where: * - A is both a root and node. * - B is only a node. */ if (meta->record->field_mask & BPF_GRAPH_ROOT) return -ELOOP; } return 0; } static void __btf_struct_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { const struct btf_member *member; void *safe_data; u32 i; safe_data = btf_show_start_struct_type(show, t, type_id, data); if (!safe_data) return; for_each_member(i, t, member) { const struct btf_type *member_type = btf_type_by_id(btf, member->type); const struct btf_kind_operations *ops; u32 member_offset, bitfield_size; u32 bytes_offset; u8 bits8_offset; btf_show_start_member(show, member); member_offset = __btf_member_bit_offset(t, member); bitfield_size = __btf_member_bitfield_size(t, member); bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); bits8_offset = BITS_PER_BYTE_MASKED(member_offset); if (bitfield_size) { safe_data = btf_show_start_type(show, member_type, member->type, data + bytes_offset); if (safe_data) btf_bitfield_show(safe_data, bits8_offset, bitfield_size, show); btf_show_end_type(show); } else { ops = btf_type_ops(member_type); ops->show(btf, member_type, member->type, data + bytes_offset, bits8_offset, show); } btf_show_end_member(show); } btf_show_end_struct_type(show); } static void btf_struct_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { const struct btf_member *m = show->state.member; /* * First check if any members would be shown (are non-zero). 
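 * In other words, the first __btf_struct_show() call below runs purely as
 * a "depth check" pass to learn whether any member is non-zero; only when
 * one is found does the second call actually emit output.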
* See comments above "struct btf_show" definition for more * details on how this works at a high-level. */ if (show->state.depth > 0 && !(show->flags & BTF_SHOW_ZERO)) { if (!show->state.depth_check) { show->state.depth_check = show->state.depth + 1; show->state.depth_to_show = 0; } __btf_struct_show(btf, t, type_id, data, bits_offset, show); /* Restore saved member data here */ show->state.member = m; if (show->state.depth_check != show->state.depth + 1) return; show->state.depth_check = 0; if (show->state.depth_to_show <= show->state.depth) return; /* * Reaching here indicates we have recursed and found * non-zero child values. */ } __btf_struct_show(btf, t, type_id, data, bits_offset, show); } static const struct btf_kind_operations struct_ops = { .check_meta = btf_struct_check_meta, .resolve = btf_struct_resolve, .check_member = btf_struct_check_member, .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_struct_log, .show = btf_struct_show, }; static int btf_enum_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { u32 struct_bits_off = member->offset; u32 struct_size, bytes_offset; if (BITS_PER_BYTE_MASKED(struct_bits_off)) { btf_verifier_log_member(env, struct_type, member, "Member is not byte aligned"); return -EINVAL; } struct_size = struct_type->size; bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); if (struct_size - bytes_offset < member_type->size) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; } return 0; } static int btf_enum_check_kflag_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { u32 struct_bits_off, nr_bits, bytes_end, struct_size; u32 int_bitsize = sizeof(int) * BITS_PER_BYTE; struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset); nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset); if (!nr_bits) { if (BITS_PER_BYTE_MASKED(struct_bits_off)) { btf_verifier_log_member(env, struct_type, member, "Member is not byte aligned"); return -EINVAL; } nr_bits = int_bitsize; } else if (nr_bits > int_bitsize) { btf_verifier_log_member(env, struct_type, member, "Invalid member bitfield_size"); return -EINVAL; } struct_size = struct_type->size; bytes_end = BITS_ROUNDUP_BYTES(struct_bits_off + nr_bits); if (struct_size < bytes_end) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; } return 0; } static s32 btf_enum_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { const struct btf_enum *enums = btf_type_enum(t); struct btf *btf = env->btf; const char *fmt_str; u16 i, nr_enums; u32 meta_needed; nr_enums = btf_type_vlen(t); meta_needed = nr_enums * sizeof(*enums); if (meta_left < meta_needed) { btf_verifier_log_basic(env, t, "meta_left:%u meta_needed:%u", meta_left, meta_needed); return -EINVAL; } if (t->size > 8 || !is_power_of_2(t->size)) { btf_verifier_log_type(env, t, "Unexpected size"); return -EINVAL; } /* enum type either no name or a valid one */ if (t->name_off && !btf_name_valid_identifier(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); for (i = 0; i < nr_enums; i++) { if (!btf_name_offset_valid(btf, enums[i].name_off)) { btf_verifier_log(env, "\tInvalid name_offset:%u", enums[i].name_off); return -EINVAL; } /* enum member must have a valid name */ if 
(!enums[i].name_off || !btf_name_valid_identifier(btf, enums[i].name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } if (env->log.level == BPF_LOG_KERNEL) continue; fmt_str = btf_type_kflag(t) ? "\t%s val=%d\n" : "\t%s val=%u\n"; btf_verifier_log(env, fmt_str, __btf_name_by_offset(btf, enums[i].name_off), enums[i].val); } return meta_needed; } static void btf_enum_log(struct btf_verifier_env *env, const struct btf_type *t) { btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } static void btf_enum_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { const struct btf_enum *enums = btf_type_enum(t); u32 i, nr_enums = btf_type_vlen(t); void *safe_data; int v; safe_data = btf_show_start_type(show, t, type_id, data); if (!safe_data) return; v = *(int *)safe_data; for (i = 0; i < nr_enums; i++) { if (v != enums[i].val) continue; btf_show_type_value(show, "%s", __btf_name_by_offset(btf, enums[i].name_off)); btf_show_end_type(show); return; } if (btf_type_kflag(t)) btf_show_type_value(show, "%d", v); else btf_show_type_value(show, "%u", v); btf_show_end_type(show); } static const struct btf_kind_operations enum_ops = { .check_meta = btf_enum_check_meta, .resolve = btf_df_resolve, .check_member = btf_enum_check_member, .check_kflag_member = btf_enum_check_kflag_member, .log_details = btf_enum_log, .show = btf_enum_show, }; static s32 btf_enum64_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { const struct btf_enum64 *enums = btf_type_enum64(t); struct btf *btf = env->btf; const char *fmt_str; u16 i, nr_enums; u32 meta_needed; nr_enums = btf_type_vlen(t); meta_needed = nr_enums * sizeof(*enums); if (meta_left < meta_needed) { btf_verifier_log_basic(env, t, "meta_left:%u meta_needed:%u", meta_left, meta_needed); return -EINVAL; } if (t->size > 8 || !is_power_of_2(t->size)) { btf_verifier_log_type(env, t, "Unexpected size"); return -EINVAL; } /* enum type either no name or a valid one */ if (t->name_off && !btf_name_valid_identifier(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); for (i = 0; i < nr_enums; i++) { if (!btf_name_offset_valid(btf, enums[i].name_off)) { btf_verifier_log(env, "\tInvalid name_offset:%u", enums[i].name_off); return -EINVAL; } /* enum member must have a valid name */ if (!enums[i].name_off || !btf_name_valid_identifier(btf, enums[i].name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } if (env->log.level == BPF_LOG_KERNEL) continue; fmt_str = btf_type_kflag(t) ? 
"\t%s val=%lld\n" : "\t%s val=%llu\n"; btf_verifier_log(env, fmt_str, __btf_name_by_offset(btf, enums[i].name_off), btf_enum64_value(enums + i)); } return meta_needed; } static void btf_enum64_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { const struct btf_enum64 *enums = btf_type_enum64(t); u32 i, nr_enums = btf_type_vlen(t); void *safe_data; s64 v; safe_data = btf_show_start_type(show, t, type_id, data); if (!safe_data) return; v = *(u64 *)safe_data; for (i = 0; i < nr_enums; i++) { if (v != btf_enum64_value(enums + i)) continue; btf_show_type_value(show, "%s", __btf_name_by_offset(btf, enums[i].name_off)); btf_show_end_type(show); return; } if (btf_type_kflag(t)) btf_show_type_value(show, "%lld", v); else btf_show_type_value(show, "%llu", v); btf_show_end_type(show); } static const struct btf_kind_operations enum64_ops = { .check_meta = btf_enum64_check_meta, .resolve = btf_df_resolve, .check_member = btf_enum_check_member, .check_kflag_member = btf_enum_check_kflag_member, .log_details = btf_enum_log, .show = btf_enum64_show, }; static s32 btf_func_proto_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { u32 meta_needed = btf_type_vlen(t) * sizeof(struct btf_param); if (meta_left < meta_needed) { btf_verifier_log_basic(env, t, "meta_left:%u meta_needed:%u", meta_left, meta_needed); return -EINVAL; } if (t->name_off) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } if (btf_type_kflag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); return meta_needed; } static void btf_func_proto_log(struct btf_verifier_env *env, const struct btf_type *t) { const struct btf_param *args = (const struct btf_param *)(t + 1); u16 nr_args = btf_type_vlen(t), i; btf_verifier_log(env, "return=%u args=(", t->type); if (!nr_args) { btf_verifier_log(env, "void"); goto done; } if (nr_args == 1 && !args[0].type) { /* Only one vararg */ btf_verifier_log(env, "vararg"); goto done; } btf_verifier_log(env, "%u %s", args[0].type, __btf_name_by_offset(env->btf, args[0].name_off)); for (i = 1; i < nr_args - 1; i++) btf_verifier_log(env, ", %u %s", args[i].type, __btf_name_by_offset(env->btf, args[i].name_off)); if (nr_args > 1) { const struct btf_param *last_arg = &args[nr_args - 1]; if (last_arg->type) btf_verifier_log(env, ", %u %s", last_arg->type, __btf_name_by_offset(env->btf, last_arg->name_off)); else btf_verifier_log(env, ", vararg"); } done: btf_verifier_log(env, ")"); } static const struct btf_kind_operations func_proto_ops = { .check_meta = btf_func_proto_check_meta, .resolve = btf_df_resolve, /* * BTF_KIND_FUNC_PROTO cannot be directly referred by * a struct's member. * * It should be a function pointer instead. * (i.e. struct's member -> BTF_KIND_PTR -> BTF_KIND_FUNC_PROTO) * * Hence, there is no btf_func_check_member(). 
*/ .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_func_proto_log, .show = btf_df_show, }; static s32 btf_func_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { if (!t->name_off || !btf_name_valid_identifier(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } if (btf_type_vlen(t) > BTF_FUNC_GLOBAL) { btf_verifier_log_type(env, t, "Invalid func linkage"); return -EINVAL; } if (btf_type_kflag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); return 0; } static int btf_func_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_type *t = v->t; u32 next_type_id = t->type; int err; err = btf_func_check(env, t); if (err) return err; env_stack_pop_resolved(env, next_type_id, 0); return 0; } static const struct btf_kind_operations func_ops = { .check_meta = btf_func_check_meta, .resolve = btf_func_resolve, .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_ref_type_log, .show = btf_df_show, }; static s32 btf_var_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { const struct btf_var *var; u32 meta_needed = sizeof(*var); if (meta_left < meta_needed) { btf_verifier_log_basic(env, t, "meta_left:%u meta_needed:%u", meta_left, meta_needed); return -EINVAL; } if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; } if (btf_type_kflag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } if (!t->name_off || !btf_name_valid_identifier(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } /* A var cannot be in type void */ if (!t->type || !BTF_TYPE_ID_VALID(t->type)) { btf_verifier_log_type(env, t, "Invalid type_id"); return -EINVAL; } var = btf_type_var(t); if (var->linkage != BTF_VAR_STATIC && var->linkage != BTF_VAR_GLOBAL_ALLOCATED) { btf_verifier_log_type(env, t, "Linkage not supported"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); return meta_needed; } static void btf_var_log(struct btf_verifier_env *env, const struct btf_type *t) { const struct btf_var *var = btf_type_var(t); btf_verifier_log(env, "type_id=%u linkage=%u", t->type, var->linkage); } static const struct btf_kind_operations var_ops = { .check_meta = btf_var_check_meta, .resolve = btf_var_resolve, .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_var_log, .show = btf_var_show, }; static s32 btf_datasec_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { const struct btf_var_secinfo *vsi; u64 last_vsi_end_off = 0, sum = 0; u32 i, meta_needed; meta_needed = btf_type_vlen(t) * sizeof(*vsi); if (meta_left < meta_needed) { btf_verifier_log_basic(env, t, "meta_left:%u meta_needed:%u", meta_left, meta_needed); return -EINVAL; } if (!t->size) { btf_verifier_log_type(env, t, "size == 0"); return -EINVAL; } if (btf_type_kflag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } if (!t->name_off || !btf_name_valid_section(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); for_each_vsi(i, t, vsi) { /* A var cannot be in type void */ if (!vsi->type || !BTF_TYPE_ID_VALID(vsi->type)) { btf_verifier_log_vsi(env, t, vsi, "Invalid 
type_id"); return -EINVAL; } if (vsi->offset < last_vsi_end_off || vsi->offset >= t->size) { btf_verifier_log_vsi(env, t, vsi, "Invalid offset"); return -EINVAL; } if (!vsi->size || vsi->size > t->size) { btf_verifier_log_vsi(env, t, vsi, "Invalid size"); return -EINVAL; } last_vsi_end_off = vsi->offset + vsi->size; if (last_vsi_end_off > t->size) { btf_verifier_log_vsi(env, t, vsi, "Invalid offset+size"); return -EINVAL; } btf_verifier_log_vsi(env, t, vsi, NULL); sum += vsi->size; } if (t->size < sum) { btf_verifier_log_type(env, t, "Invalid btf_info size"); return -EINVAL; } return meta_needed; } static int btf_datasec_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_var_secinfo *vsi; struct btf *btf = env->btf; u16 i; env->resolve_mode = RESOLVE_TBD; for_each_vsi_from(i, v->next_member, v->t, vsi) { u32 var_type_id = vsi->type, type_id, type_size = 0; const struct btf_type *var_type = btf_type_by_id(env->btf, var_type_id); if (!var_type || !btf_type_is_var(var_type)) { btf_verifier_log_vsi(env, v->t, vsi, "Not a VAR kind member"); return -EINVAL; } if (!env_type_is_resolve_sink(env, var_type) && !env_type_is_resolved(env, var_type_id)) { env_stack_set_next_member(env, i + 1); return env_stack_push(env, var_type, var_type_id); } type_id = var_type->type; if (!btf_type_id_size(btf, &type_id, &type_size)) { btf_verifier_log_vsi(env, v->t, vsi, "Invalid type"); return -EINVAL; } if (vsi->size < type_size) { btf_verifier_log_vsi(env, v->t, vsi, "Invalid size"); return -EINVAL; } } env_stack_pop_resolved(env, 0, 0); return 0; } static void btf_datasec_log(struct btf_verifier_env *env, const struct btf_type *t) { btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } static void btf_datasec_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) { const struct btf_var_secinfo *vsi; const struct btf_type *var; u32 i; if (!btf_show_start_type(show, t, type_id, data)) return; btf_show_type_value(show, "section (\"%s\") = {", __btf_name_by_offset(btf, t->name_off)); for_each_vsi(i, t, vsi) { var = btf_type_by_id(btf, vsi->type); if (i) btf_show(show, ","); btf_type_ops(var)->show(btf, var, vsi->type, data + vsi->offset, bits_offset, show); } btf_show_end_type(show); } static const struct btf_kind_operations datasec_ops = { .check_meta = btf_datasec_check_meta, .resolve = btf_datasec_resolve, .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_datasec_log, .show = btf_datasec_show, }; static s32 btf_float_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; } if (btf_type_kflag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } if (t->size != 2 && t->size != 4 && t->size != 8 && t->size != 12 && t->size != 16) { btf_verifier_log_type(env, t, "Invalid type_size"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); return 0; } static int btf_float_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type) { u64 start_offset_bytes; u64 end_offset_bytes; u64 misalign_bits; u64 align_bytes; u64 align_bits; /* Different architectures have different alignment requirements, so * here we check only for the reasonable minimum. This way we ensure * that types after CO-RE can pass the kernel BTF verifier. 
*/ align_bytes = min_t(u64, sizeof(void *), member_type->size); align_bits = align_bytes * BITS_PER_BYTE; div64_u64_rem(member->offset, align_bits, &misalign_bits); if (misalign_bits) { btf_verifier_log_member(env, struct_type, member, "Member is not properly aligned"); return -EINVAL; } start_offset_bytes = member->offset / BITS_PER_BYTE; end_offset_bytes = start_offset_bytes + member_type->size; if (end_offset_bytes > struct_type->size) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; } return 0; } static void btf_float_log(struct btf_verifier_env *env, const struct btf_type *t) { btf_verifier_log(env, "size=%u", t->size); } static const struct btf_kind_operations float_ops = { .check_meta = btf_float_check_meta, .resolve = btf_df_resolve, .check_member = btf_float_check_member, .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_float_log, .show = btf_df_show, }; static s32 btf_decl_tag_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { const struct btf_decl_tag *tag; u32 meta_needed = sizeof(*tag); s32 component_idx; const char *value; if (meta_left < meta_needed) { btf_verifier_log_basic(env, t, "meta_left:%u meta_needed:%u", meta_left, meta_needed); return -EINVAL; } value = btf_name_by_offset(env->btf, t->name_off); if (!value || !value[0]) { btf_verifier_log_type(env, t, "Invalid value"); return -EINVAL; } if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; } component_idx = btf_type_decl_tag(t)->component_idx; if (component_idx < -1) { btf_verifier_log_type(env, t, "Invalid component_idx"); return -EINVAL; } btf_verifier_log_type(env, t, NULL); return meta_needed; } static int btf_decl_tag_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_type *next_type; const struct btf_type *t = v->t; u32 next_type_id = t->type; struct btf *btf = env->btf; s32 component_idx; u32 vlen; next_type = btf_type_by_id(btf, next_type_id); if (!next_type || !btf_type_is_decl_tag_target(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } if (!env_type_is_resolve_sink(env, next_type) && !env_type_is_resolved(env, next_type_id)) return env_stack_push(env, next_type, next_type_id); component_idx = btf_type_decl_tag(t)->component_idx; if (component_idx != -1) { if (btf_type_is_var(next_type) || btf_type_is_typedef(next_type)) { btf_verifier_log_type(env, v->t, "Invalid component_idx"); return -EINVAL; } if (btf_type_is_struct(next_type)) { vlen = btf_type_vlen(next_type); } else { /* next_type should be a function */ next_type = btf_type_by_id(btf, next_type->type); vlen = btf_type_vlen(next_type); } if ((u32)component_idx >= vlen) { btf_verifier_log_type(env, v->t, "Invalid component_idx"); return -EINVAL; } } env_stack_pop_resolved(env, next_type_id, 0); return 0; } static void btf_decl_tag_log(struct btf_verifier_env *env, const struct btf_type *t) { btf_verifier_log(env, "type=%u component_idx=%d", t->type, btf_type_decl_tag(t)->component_idx); } static const struct btf_kind_operations decl_tag_ops = { .check_meta = btf_decl_tag_check_meta, .resolve = btf_decl_tag_resolve, .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_decl_tag_log, .show = btf_df_show, }; static int btf_func_proto_check(struct btf_verifier_env *env, const struct btf_type *t) { const struct btf_type *ret_type; const struct btf_param *args; const struct btf *btf; u16 nr_args, 
i; int err; btf = env->btf; args = (const struct btf_param *)(t + 1); nr_args = btf_type_vlen(t); /* Check func return type which could be "void" (t->type == 0) */ if (t->type) { u32 ret_type_id = t->type; ret_type = btf_type_by_id(btf, ret_type_id); if (!ret_type) { btf_verifier_log_type(env, t, "Invalid return type"); return -EINVAL; } if (btf_type_is_resolve_source_only(ret_type)) { btf_verifier_log_type(env, t, "Invalid return type"); return -EINVAL; } if (btf_type_needs_resolve(ret_type) && !env_type_is_resolved(env, ret_type_id)) { err = btf_resolve(env, ret_type, ret_type_id); if (err) return err; } /* Ensure the return type is a type that has a size */ if (!btf_type_id_size(btf, &ret_type_id, NULL)) { btf_verifier_log_type(env, t, "Invalid return type"); return -EINVAL; } } if (!nr_args) return 0; /* Last func arg type_id could be 0 if it is a vararg */ if (!args[nr_args - 1].type) { if (args[nr_args - 1].name_off) { btf_verifier_log_type(env, t, "Invalid arg#%u", nr_args); return -EINVAL; } nr_args--; } for (i = 0; i < nr_args; i++) { const struct btf_type *arg_type; u32 arg_type_id; arg_type_id = args[i].type; arg_type = btf_type_by_id(btf, arg_type_id); if (!arg_type) { btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); return -EINVAL; } if (btf_type_is_resolve_source_only(arg_type)) { btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); return -EINVAL; } if (args[i].name_off && (!btf_name_offset_valid(btf, args[i].name_off) || !btf_name_valid_identifier(btf, args[i].name_off))) { btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); return -EINVAL; } if (btf_type_needs_resolve(arg_type) && !env_type_is_resolved(env, arg_type_id)) { err = btf_resolve(env, arg_type, arg_type_id); if (err) return err; } if (!btf_type_id_size(btf, &arg_type_id, NULL)) { btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); return -EINVAL; } } return 0; } static int btf_func_check(struct btf_verifier_env *env, const struct btf_type *t) { const struct btf_type *proto_type; const struct btf_param *args; const struct btf *btf; u16 nr_args, i; btf = env->btf; proto_type = btf_type_by_id(btf, t->type); if (!proto_type || !btf_type_is_func_proto(proto_type)) { btf_verifier_log_type(env, t, "Invalid type_id"); return -EINVAL; } args = (const struct btf_param *)(proto_type + 1); nr_args = btf_type_vlen(proto_type); for (i = 0; i < nr_args; i++) { if (!args[i].name_off && args[i].type) { btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); return -EINVAL; } } return 0; } static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_INT] = &int_ops, [BTF_KIND_PTR] = &ptr_ops, [BTF_KIND_ARRAY] = &array_ops, [BTF_KIND_STRUCT] = &struct_ops, [BTF_KIND_UNION] = &struct_ops, [BTF_KIND_ENUM] = &enum_ops, [BTF_KIND_FWD] = &fwd_ops, [BTF_KIND_TYPEDEF] = &modifier_ops, [BTF_KIND_VOLATILE] = &modifier_ops, [BTF_KIND_CONST] = &modifier_ops, [BTF_KIND_RESTRICT] = &modifier_ops, [BTF_KIND_FUNC] = &func_ops, [BTF_KIND_FUNC_PROTO] = &func_proto_ops, [BTF_KIND_VAR] = &var_ops, [BTF_KIND_DATASEC] = &datasec_ops, [BTF_KIND_FLOAT] = &float_ops, [BTF_KIND_DECL_TAG] = &decl_tag_ops, [BTF_KIND_TYPE_TAG] = &modifier_ops, [BTF_KIND_ENUM64] = &enum64_ops, }; static s32 btf_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { u32 saved_meta_left = meta_left; s32 var_meta_size; if (meta_left < sizeof(*t)) { btf_verifier_log(env, "[%u] meta_left:%u meta_needed:%zu", env->log_type_id, meta_left, sizeof(*t)); return -EINVAL; } meta_left -= sizeof(*t); if (t->info & 
~BTF_INFO_MASK) { btf_verifier_log(env, "[%u] Invalid btf_info:%x", env->log_type_id, t->info); return -EINVAL; } if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX || BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) { btf_verifier_log(env, "[%u] Invalid kind:%u", env->log_type_id, BTF_INFO_KIND(t->info)); return -EINVAL; } if (!btf_name_offset_valid(env->btf, t->name_off)) { btf_verifier_log(env, "[%u] Invalid name_offset:%u", env->log_type_id, t->name_off); return -EINVAL; } var_meta_size = btf_type_ops(t)->check_meta(env, t, meta_left); if (var_meta_size < 0) return var_meta_size; meta_left -= var_meta_size; return saved_meta_left - meta_left; } static int btf_check_all_metas(struct btf_verifier_env *env) { struct btf *btf = env->btf; struct btf_header *hdr; void *cur, *end; hdr = &btf->hdr; cur = btf->nohdr_data + hdr->type_off; end = cur + hdr->type_len; env->log_type_id = btf->base_btf ? btf->start_id : 1; while (cur < end) { struct btf_type *t = cur; s32 meta_size; meta_size = btf_check_meta(env, t, end - cur); if (meta_size < 0) return meta_size; btf_add_type(env, t); cur += meta_size; env->log_type_id++; } return 0; } static bool btf_resolve_valid(struct btf_verifier_env *env, const struct btf_type *t, u32 type_id) { struct btf *btf = env->btf; if (!env_type_is_resolved(env, type_id)) return false; if (btf_type_is_struct(t) || btf_type_is_datasec(t)) return !btf_resolved_type_id(btf, type_id) && !btf_resolved_type_size(btf, type_id); if (btf_type_is_decl_tag(t) || btf_type_is_func(t)) return btf_resolved_type_id(btf, type_id) && !btf_resolved_type_size(btf, type_id); if (btf_type_is_modifier(t) || btf_type_is_ptr(t) || btf_type_is_var(t)) { t = btf_type_id_resolve(btf, &type_id); return t && !btf_type_is_modifier(t) && !btf_type_is_var(t) && !btf_type_is_datasec(t); } if (btf_type_is_array(t)) { const struct btf_array *array = btf_type_array(t); const struct btf_type *elem_type; u32 elem_type_id = array->type; u32 elem_size; elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); return elem_type && !btf_type_is_modifier(elem_type) && (array->nelems * elem_size == btf_resolved_type_size(btf, type_id)); } return false; } static int btf_resolve(struct btf_verifier_env *env, const struct btf_type *t, u32 type_id) { u32 save_log_type_id = env->log_type_id; const struct resolve_vertex *v; int err = 0; env->resolve_mode = RESOLVE_TBD; env_stack_push(env, t, type_id); while (!err && (v = env_stack_peak(env))) { env->log_type_id = v->type_id; err = btf_type_ops(v->t)->resolve(env, v); } env->log_type_id = type_id; if (err == -E2BIG) { btf_verifier_log_type(env, t, "Exceeded max resolving depth:%u", MAX_RESOLVE_DEPTH); } else if (err == -EEXIST) { btf_verifier_log_type(env, t, "Loop detected"); } /* Final sanity check */ if (!err && !btf_resolve_valid(env, t, type_id)) { btf_verifier_log_type(env, t, "Invalid resolve state"); err = -EINVAL; } env->log_type_id = save_log_type_id; return err; } static int btf_check_all_types(struct btf_verifier_env *env) { struct btf *btf = env->btf; const struct btf_type *t; u32 type_id, i; int err; err = env_resolve_init(env); if (err) return err; env->phase++; for (i = btf->base_btf ? 
0 : 1; i < btf->nr_types; i++) { type_id = btf->start_id + i; t = btf_type_by_id(btf, type_id); env->log_type_id = type_id; if (btf_type_needs_resolve(t) && !env_type_is_resolved(env, type_id)) { err = btf_resolve(env, t, type_id); if (err) return err; } if (btf_type_is_func_proto(t)) { err = btf_func_proto_check(env, t); if (err) return err; } } return 0; } static int btf_parse_type_sec(struct btf_verifier_env *env) { const struct btf_header *hdr = &env->btf->hdr; int err; /* Type section must align to 4 bytes */ if (hdr->type_off & (sizeof(u32) - 1)) { btf_verifier_log(env, "Unaligned type_off"); return -EINVAL; } if (!env->btf->base_btf && !hdr->type_len) { btf_verifier_log(env, "No type found"); return -EINVAL; } err = btf_check_all_metas(env); if (err) return err; return btf_check_all_types(env); } static int btf_parse_str_sec(struct btf_verifier_env *env) { const struct btf_header *hdr; struct btf *btf = env->btf; const char *start, *end; hdr = &btf->hdr; start = btf->nohdr_data + hdr->str_off; end = start + hdr->str_len; if (hdr->hdr_len < sizeof(struct btf_header) && end != btf->data + btf->data_size) { btf_verifier_log(env, "String section is not at the end"); return -EINVAL; } btf->strings = start; if (btf->base_btf && !hdr->str_len) return 0; if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || end[-1]) { btf_verifier_log(env, "Invalid string section"); return -EINVAL; } if (!btf->base_btf && start[0]) { btf_verifier_log(env, "Invalid string section"); return -EINVAL; } return 0; } static int btf_parse_layout_sec(struct btf_verifier_env *env) { const struct btf_header *hdr = &env->btf->hdr; struct btf *btf = env->btf; void *start, *end; if (hdr->hdr_len < sizeof(struct btf_header) || hdr->layout_len == 0) return 0; /* Layout section must align to 4 bytes */ if (hdr->layout_off & (sizeof(u32) - 1)) { btf_verifier_log(env, "Unaligned layout_off"); return -EINVAL; } start = btf->nohdr_data + hdr->layout_off; end = start + hdr->layout_len; if (hdr->layout_len < sizeof(struct btf_layout)) { btf_verifier_log(env, "Layout section is too small"); return -EINVAL; } if (hdr->layout_len % sizeof(struct btf_layout) != 0) { btf_verifier_log(env, "layout_len is not multiple of %zu", sizeof(struct btf_layout)); return -EINVAL; } if (end > btf->data + btf->data_size) { btf_verifier_log(env, "Layout section is too big"); return -EINVAL; } btf->layout = start; return 0; } static const size_t btf_sec_info_offset[] = { offsetof(struct btf_header, type_off), offsetof(struct btf_header, str_off), offsetof(struct btf_header, layout_off) }; static int btf_sec_info_cmp(const void *a, const void *b) { const struct btf_sec_info *x = a; const struct btf_sec_info *y = b; return (int)(x->off - y->off) ? 
: (int)(x->len - y->len); } static int btf_check_sec_info(struct btf_verifier_env *env, u32 btf_data_size) { struct btf_sec_info secs[ARRAY_SIZE(btf_sec_info_offset)]; u32 total, expected_total, i; u32 nr_secs = ARRAY_SIZE(btf_sec_info_offset); const struct btf_header *hdr; const struct btf *btf; btf = env->btf; hdr = &btf->hdr; if (hdr->hdr_len < sizeof(struct btf_header) || hdr->layout_len == 0) nr_secs--; /* Populate the secs from hdr */ for (i = 0; i < nr_secs; i++) secs[i] = *(struct btf_sec_info *)((void *)hdr + btf_sec_info_offset[i]); sort(secs, nr_secs, sizeof(struct btf_sec_info), btf_sec_info_cmp, NULL); /* Check for gaps and overlap among sections */ total = 0; expected_total = btf_data_size - hdr->hdr_len; for (i = 0; i < nr_secs; i++) { if (expected_total < secs[i].off) { btf_verifier_log(env, "Invalid section offset"); return -EINVAL; } if (total < secs[i].off) { /* gap */ btf_verifier_log(env, "Unsupported section found"); return -EINVAL; } if (total > secs[i].off) { btf_verifier_log(env, "Section overlap found"); return -EINVAL; } if (expected_total - total < secs[i].len) { btf_verifier_log(env, "Total section length too long"); return -EINVAL; } total += secs[i].len; } /* There is data other than hdr and known sections */ if (expected_total != total) { btf_verifier_log(env, "Unsupported section found"); return -EINVAL; } return 0; } static int btf_parse_hdr(struct btf_verifier_env *env) { u32 hdr_len, hdr_copy, btf_data_size; const struct btf_header *hdr; struct btf *btf; btf = env->btf; btf_data_size = btf->data_size; if (btf_data_size < offsetofend(struct btf_header, hdr_len)) { btf_verifier_log(env, "hdr_len not found"); return -EINVAL; } hdr = btf->data; hdr_len = hdr->hdr_len; if (btf_data_size < hdr_len) { btf_verifier_log(env, "btf_header not found"); return -EINVAL; } /* Ensure the unsupported header fields are zero */ if (hdr_len > sizeof(btf->hdr)) { u8 *expected_zero = btf->data + sizeof(btf->hdr); u8 *end = btf->data + hdr_len; for (; expected_zero < end; expected_zero++) { if (*expected_zero) { btf_verifier_log(env, "Unsupported btf_header"); return -E2BIG; } } } hdr_copy = min_t(u32, hdr_len, sizeof(btf->hdr)); memcpy(&btf->hdr, btf->data, hdr_copy); hdr = &btf->hdr; btf_verifier_log_hdr(env, btf_data_size); if (hdr->magic != BTF_MAGIC) { btf_verifier_log(env, "Invalid magic"); return -EINVAL; } if (hdr->version != BTF_VERSION) { btf_verifier_log(env, "Unsupported version"); return -ENOTSUPP; } if (hdr->flags) { btf_verifier_log(env, "Unsupported flags"); return -ENOTSUPP; } if (!btf->base_btf && btf_data_size == hdr->hdr_len) { btf_verifier_log(env, "No data"); return -EINVAL; } return btf_check_sec_info(env, btf_data_size); } static const char *alloc_obj_fields[] = { "bpf_spin_lock", "bpf_list_head", "bpf_list_node", "bpf_rb_root", "bpf_rb_node", "bpf_refcount", }; static struct btf_struct_metas * btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) { struct btf_struct_metas *tab = NULL; struct btf_id_set *aof; int i, n, id, ret; BUILD_BUG_ON(offsetof(struct btf_id_set, cnt) != 0); BUILD_BUG_ON(sizeof(struct btf_id_set) != sizeof(u32)); aof = kmalloc_obj(*aof, GFP_KERNEL | __GFP_NOWARN); if (!aof) return ERR_PTR(-ENOMEM); aof->cnt = 0; for (i = 0; i < ARRAY_SIZE(alloc_obj_fields); i++) { /* Try to find whether this special type exists in user BTF, and * if so remember its ID so we can easily find it among members * of structs that we iterate in the next loop. 
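 *
 * After this loop, aof->ids holds the user BTF type ids of the special
 * structs named in alloc_obj_fields (e.g. "bpf_list_head", "bpf_rb_node"),
 * so the per-struct scan further down only needs a btf_id_set_contains()
 * lookup per member instead of repeated string comparisons.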
*/ struct btf_id_set *new_aof; id = btf_find_by_name_kind(btf, alloc_obj_fields[i], BTF_KIND_STRUCT); if (id < 0) continue; new_aof = krealloc(aof, struct_size(new_aof, ids, aof->cnt + 1), GFP_KERNEL | __GFP_NOWARN); if (!new_aof) { ret = -ENOMEM; goto free_aof; } aof = new_aof; aof->ids[aof->cnt++] = id; } n = btf_nr_types(btf); for (i = 1; i < n; i++) { /* Try to find if there are kptrs in user BTF and remember their ID */ struct btf_id_set *new_aof; struct btf_field_info tmp; const struct btf_type *t; t = btf_type_by_id(btf, i); if (!t) { ret = -EINVAL; goto free_aof; } ret = btf_find_kptr(btf, t, 0, 0, &tmp, BPF_KPTR); if (ret != BTF_FIELD_FOUND) continue; new_aof = krealloc(aof, struct_size(new_aof, ids, aof->cnt + 1), GFP_KERNEL | __GFP_NOWARN); if (!new_aof) { ret = -ENOMEM; goto free_aof; } aof = new_aof; aof->ids[aof->cnt++] = i; } if (!aof->cnt) { kfree(aof); return NULL; } sort(&aof->ids, aof->cnt, sizeof(aof->ids[0]), btf_id_cmp_func, NULL); for (i = 1; i < n; i++) { struct btf_struct_metas *new_tab; const struct btf_member *member; struct btf_struct_meta *type; struct btf_record *record; const struct btf_type *t; int j, tab_cnt; t = btf_type_by_id(btf, i); if (!__btf_type_is_struct(t)) continue; cond_resched(); for_each_member(j, t, member) { if (btf_id_set_contains(aof, member->type)) goto parse; } continue; parse: tab_cnt = tab ? tab->cnt : 0; new_tab = krealloc(tab, struct_size(new_tab, types, tab_cnt + 1), GFP_KERNEL | __GFP_NOWARN); if (!new_tab) { ret = -ENOMEM; goto free; } if (!tab) new_tab->cnt = 0; tab = new_tab; type = &tab->types[tab->cnt]; type->btf_id = i; record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE | BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT | BPF_KPTR, t->size); /* The record cannot be unset, treat it as an error if so */ if (IS_ERR_OR_NULL(record)) { ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT; goto free; } type->record = record; tab->cnt++; } kfree(aof); return tab; free: btf_struct_metas_free(tab); free_aof: kfree(aof); return ERR_PTR(ret); } struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id) { struct btf_struct_metas *tab; BUILD_BUG_ON(offsetof(struct btf_struct_meta, btf_id) != 0); tab = btf->struct_meta_tab; if (!tab) return NULL; return bsearch(&btf_id, tab->types, tab->cnt, sizeof(tab->types[0]), btf_id_cmp_func); } static int btf_check_type_tags(struct btf_verifier_env *env, struct btf *btf, int start_id) { int i, n, good_id = start_id - 1; bool in_tags; n = btf_nr_types(btf); for (i = start_id; i < n; i++) { const struct btf_type *t; int chain_limit = 32; u32 cur_id = i; t = btf_type_by_id(btf, i); if (!t) return -EINVAL; if (!btf_type_is_modifier(t)) continue; cond_resched(); in_tags = btf_type_is_type_tag(t); while (btf_type_is_modifier(t)) { if (!chain_limit--) { btf_verifier_log(env, "Max chain length or cycle detected"); return -ELOOP; } if (btf_type_is_type_tag(t)) { if (!in_tags) { btf_verifier_log(env, "Type tags don't precede modifiers"); return -EINVAL; } } else if (in_tags) { in_tags = false; } if (cur_id <= good_id) break; /* Move to next type */ cur_id = t->type; t = btf_type_by_id(btf, cur_id); if (!t) return -EINVAL; } good_id = i; } return 0; } static int finalize_log(struct bpf_verifier_log *log, bpfptr_t uattr, u32 uattr_size) { u32 log_true_size; int err; err = bpf_vlog_finalize(log, &log_true_size); if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) && copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size), 
&log_true_size, sizeof(log_true_size))) err = -EFAULT; return err; } static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { bpfptr_t btf_data = make_bpfptr(attr->btf, uattr.is_kernel); char __user *log_ubuf = u64_to_user_ptr(attr->btf_log_buf); struct btf_struct_metas *struct_meta_tab; struct btf_verifier_env *env = NULL; struct btf *btf = NULL; u8 *data; int err, ret; if (attr->btf_size > BTF_MAX_SIZE) return ERR_PTR(-E2BIG); env = kzalloc_obj(*env, GFP_KERNEL | __GFP_NOWARN); if (!env) return ERR_PTR(-ENOMEM); /* user could have requested verbose verifier output * and supplied buffer to store the verification trace */ err = bpf_vlog_init(&env->log, attr->btf_log_level, log_ubuf, attr->btf_log_size); if (err) goto errout_free; btf = kzalloc_obj(*btf, GFP_KERNEL | __GFP_NOWARN); if (!btf) { err = -ENOMEM; goto errout; } env->btf = btf; btf->named_start_id = 0; data = kvmalloc(attr->btf_size, GFP_KERNEL | __GFP_NOWARN); if (!data) { err = -ENOMEM; goto errout; } btf->data = data; btf->data_size = attr->btf_size; if (copy_from_bpfptr(data, btf_data, attr->btf_size)) { err = -EFAULT; goto errout; } err = btf_parse_hdr(env); if (err) goto errout; btf->nohdr_data = btf->data + btf->hdr.hdr_len; err = btf_parse_str_sec(env); if (err) goto errout; err = btf_parse_layout_sec(env); if (err) goto errout; err = btf_parse_type_sec(env); if (err) goto errout; err = btf_check_type_tags(env, btf, 1); if (err) goto errout; struct_meta_tab = btf_parse_struct_metas(&env->log, btf); if (IS_ERR(struct_meta_tab)) { err = PTR_ERR(struct_meta_tab); goto errout; } btf->struct_meta_tab = struct_meta_tab; if (struct_meta_tab) { int i; for (i = 0; i < struct_meta_tab->cnt; i++) { err = btf_check_and_fixup_fields(btf, struct_meta_tab->types[i].record); if (err < 0) goto errout_meta; } } err = finalize_log(&env->log, uattr, uattr_size); if (err) goto errout_free; btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); return btf; errout_meta: btf_free_struct_meta_tab(btf); errout: /* overwrite err with -ENOSPC or -EFAULT */ ret = finalize_log(&env->log, uattr, uattr_size); if (ret) err = ret; errout_free: btf_verifier_env_free(env); if (btf) btf_free(btf); return ERR_PTR(err); } extern char __start_BTF[]; extern char __stop_BTF[]; extern struct btf *btf_vmlinux; #define BPF_MAP_TYPE(_id, _ops) #define BPF_LINK_TYPE(_id, _name) static union { struct bpf_ctx_convert { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ prog_ctx_type _id##_prog; \ kern_ctx_type _id##_kern; #include <linux/bpf_types.h> #undef BPF_PROG_TYPE } *__t; /* 't' is written once under lock. Read many times. */ const struct btf_type *t; } bpf_ctx_convert; enum { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ __ctx_convert##_id, #include <linux/bpf_types.h> #undef BPF_PROG_TYPE __ctx_convert_unused, /* to avoid empty enum in extreme .config */ }; static u8 bpf_ctx_convert_map[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ [_id] = __ctx_convert##_id, #include <linux/bpf_types.h> #undef BPF_PROG_TYPE 0, /* avoid empty array */ }; #undef BPF_MAP_TYPE #undef BPF_LINK_TYPE static const struct btf_type *find_canonical_prog_ctx_type(enum bpf_prog_type prog_type) { const struct btf_type *conv_struct; const struct btf_member *ctx_type; conv_struct = bpf_ctx_convert.t; if (!conv_struct) return NULL; /* prog_type is valid bpf program type. No need for bounds check. 
*/ ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2; /* ctx_type is a pointer to prog_ctx_type in vmlinux. * Like 'struct __sk_buff' */ return btf_type_by_id(btf_vmlinux, ctx_type->type); } static int find_kern_ctx_type_id(enum bpf_prog_type prog_type) { const struct btf_type *conv_struct; const struct btf_member *ctx_type; conv_struct = bpf_ctx_convert.t; if (!conv_struct) return -EFAULT; /* prog_type is valid bpf program type. No need for bounds check. */ ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1; /* ctx_type is a pointer to prog_ctx_type in vmlinux. * Like 'struct sk_buff' */ return ctx_type->type; } bool btf_is_projection_of(const char *pname, const char *tname) { if (strcmp(pname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0) return true; if (strcmp(pname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0) return true; return false; } bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, enum bpf_prog_type prog_type, int arg) { const struct btf_type *ctx_type; const char *tname, *ctx_tname; t = btf_type_by_id(btf, t->type); /* KPROBE programs allow bpf_user_pt_regs_t typedef, which we need to * check before we skip all the typedef below. */ if (prog_type == BPF_PROG_TYPE_KPROBE) { while (btf_type_is_modifier(t) && !btf_type_is_typedef(t)) t = btf_type_by_id(btf, t->type); if (btf_type_is_typedef(t)) { tname = btf_name_by_offset(btf, t->name_off); if (tname && strcmp(tname, "bpf_user_pt_regs_t") == 0) return true; } } while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (!btf_type_is_struct(t)) { /* Only pointer to struct is supported for now. * That means that BPF_PROG_TYPE_TRACEPOINT with BTF * is not supported yet. * BPF_PROG_TYPE_RAW_TRACEPOINT is fine. */ return false; } tname = btf_name_by_offset(btf, t->name_off); if (!tname) { bpf_log(log, "arg#%d struct doesn't have a name\n", arg); return false; } ctx_type = find_canonical_prog_ctx_type(prog_type); if (!ctx_type) { bpf_log(log, "btf_vmlinux is malformed\n"); /* should not happen */ return false; } again: ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off); if (!ctx_tname) { /* should not happen */ bpf_log(log, "Please fix kernel include/linux/bpf_types.h\n"); return false; } /* program types without named context types work only with arg:ctx tag */ if (ctx_tname[0] == '\0') return false; /* only compare that prog's ctx type name is the same as * kernel expects. No need to compare field by field. 
* It's ok for bpf prog to do: * struct __sk_buff {}; * int socket_filter_bpf_prog(struct __sk_buff *skb) * { // no fields of skb are ever used } */ if (btf_is_projection_of(ctx_tname, tname)) return true; if (strcmp(ctx_tname, tname)) { /* bpf_user_pt_regs_t is a typedef, so resolve it to * underlying struct and check name again */ if (!btf_type_is_modifier(ctx_type)) return false; while (btf_type_is_modifier(ctx_type)) ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type); goto again; } return true; } /* forward declarations for arch-specific underlying types of * bpf_user_pt_regs_t; this avoids the need for arch-specific #ifdef * compilation guards below for BPF_PROG_TYPE_PERF_EVENT checks, but still * works correctly with __builtin_types_compatible_p() on respective * architectures */ struct user_regs_struct; struct user_pt_regs; static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int arg, enum bpf_prog_type prog_type, enum bpf_attach_type attach_type) { const struct btf_type *ctx_type; const char *tname, *ctx_tname; if (!btf_is_ptr(t)) { bpf_log(log, "arg#%d type isn't a pointer\n", arg); return -EINVAL; } t = btf_type_by_id(btf, t->type); /* KPROBE and PERF_EVENT programs allow bpf_user_pt_regs_t typedef */ if (prog_type == BPF_PROG_TYPE_KPROBE || prog_type == BPF_PROG_TYPE_PERF_EVENT) { while (btf_type_is_modifier(t) && !btf_type_is_typedef(t)) t = btf_type_by_id(btf, t->type); if (btf_type_is_typedef(t)) { tname = btf_name_by_offset(btf, t->name_off); if (tname && strcmp(tname, "bpf_user_pt_regs_t") == 0) return 0; } } /* all other program types don't use typedefs for context type */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); /* `void *ctx __arg_ctx` is always valid */ if (btf_type_is_void(t)) return 0; tname = btf_name_by_offset(btf, t->name_off); if (str_is_empty(tname)) { bpf_log(log, "arg#%d type doesn't have a name\n", arg); return -EINVAL; } /* special cases */ switch (prog_type) { case BPF_PROG_TYPE_KPROBE: if (__btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0) return 0; break; case BPF_PROG_TYPE_PERF_EVENT: if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct pt_regs) && __btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0) return 0; if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_pt_regs) && __btf_type_is_struct(t) && strcmp(tname, "user_pt_regs") == 0) return 0; if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_regs_struct) && __btf_type_is_struct(t) && strcmp(tname, "user_regs_struct") == 0) return 0; break; case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: /* allow u64* as ctx */ if (btf_is_int(t) && t->size == 8) return 0; break; case BPF_PROG_TYPE_TRACING: switch (attach_type) { case BPF_TRACE_RAW_TP: /* tp_btf program is TRACING, so need special case here */ if (__btf_type_is_struct(t) && strcmp(tname, "bpf_raw_tracepoint_args") == 0) return 0; /* allow u64* as ctx */ if (btf_is_int(t) && t->size == 8) return 0; break; case BPF_TRACE_ITER: /* allow struct bpf_iter__xxx types only */ if (__btf_type_is_struct(t) && strncmp(tname, "bpf_iter__", sizeof("bpf_iter__") - 1) == 0) return 0; break; case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: case BPF_TRACE_FSESSION: /* allow u64* as ctx */ if (btf_is_int(t) && t->size == 8) return 0; break; default: break; } break; case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_STRUCT_OPS: /* allow u64* as ctx */ if (btf_is_int(t) && t->size == 8) return 0; 
break; case BPF_PROG_TYPE_TRACEPOINT: case BPF_PROG_TYPE_SYSCALL: case BPF_PROG_TYPE_EXT: return 0; /* anything goes */ default: break; } ctx_type = find_canonical_prog_ctx_type(prog_type); if (!ctx_type) { /* should not happen */ bpf_log(log, "btf_vmlinux is malformed\n"); return -EINVAL; } /* resolve typedefs and check that underlying structs are matching as well */ while (btf_type_is_modifier(ctx_type)) ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type); /* if program type doesn't have distinctly named struct type for * context, then __arg_ctx argument can only be `void *`, which we * already checked above */ if (!__btf_type_is_struct(ctx_type)) { bpf_log(log, "arg#%d should be void pointer\n", arg); return -EINVAL; } ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off); if (!__btf_type_is_struct(t) || strcmp(ctx_tname, tname) != 0) { bpf_log(log, "arg#%d should be `struct %s *`\n", arg, ctx_tname); return -EINVAL; } return 0; } static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, struct btf *btf, const struct btf_type *t, enum bpf_prog_type prog_type, int arg) { if (!btf_is_prog_ctx_type(log, btf, t, prog_type, arg)) return -ENOENT; return find_kern_ctx_type_id(prog_type); } int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type) { const struct btf_member *kctx_member; const struct btf_type *conv_struct; const struct btf_type *kctx_type; u32 kctx_type_id; conv_struct = bpf_ctx_convert.t; /* get member for kernel ctx type */ kctx_member = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1; kctx_type_id = kctx_member->type; kctx_type = btf_type_by_id(btf_vmlinux, kctx_type_id); if (!btf_type_is_struct(kctx_type)) { bpf_log(log, "kern ctx type id %u is not a struct\n", kctx_type_id); return -EINVAL; } return kctx_type_id; } BTF_ID_LIST_SINGLE(bpf_ctx_convert_btf_id, struct, bpf_ctx_convert) static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name, void *data, unsigned int data_size) { struct btf *btf = NULL; int err; if (!IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) return ERR_PTR(-ENOENT); btf = kzalloc_obj(*btf, GFP_KERNEL | __GFP_NOWARN); if (!btf) { err = -ENOMEM; goto errout; } env->btf = btf; btf->data = data; btf->data_size = data_size; btf->kernel_btf = true; btf->named_start_id = 0; strscpy(btf->name, name); err = btf_parse_hdr(env); if (err) goto errout; btf->nohdr_data = btf->data + btf->hdr.hdr_len; err = btf_parse_str_sec(env); if (err) goto errout; err = btf_check_all_metas(env); if (err) goto errout; err = btf_check_type_tags(env, btf, 1); if (err) goto errout; btf_check_sorted(btf); refcount_set(&btf->refcnt, 1); return btf; errout: if (btf) { kvfree(btf->types); kfree(btf); } return ERR_PTR(err); } struct btf *btf_parse_vmlinux(void) { struct btf_verifier_env *env = NULL; struct bpf_verifier_log *log; struct btf *btf; int err; env = kzalloc_obj(*env, GFP_KERNEL | __GFP_NOWARN); if (!env) return ERR_PTR(-ENOMEM); log = &env->log; log->level = BPF_LOG_KERNEL; btf = btf_parse_base(env, "vmlinux", __start_BTF, __stop_BTF - __start_BTF); if (IS_ERR(btf)) goto err_out; /* btf_parse_vmlinux() runs under bpf_verifier_lock */ bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]); err = btf_alloc_id(btf); if (err) { btf_free(btf); btf = ERR_PTR(err); } err_out: btf_verifier_env_free(env); return btf; } /* If .BTF_ids section was created with distilled base BTF, both base and * split BTF ids will need to be mapped to actual base/split ids for * BTF now that it has been relocated. 
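 *
 * The mapping itself is just a table lookup: ids that were assigned
 * against the distilled .BTF.base are translated through base_id_map[],
 * and when no relocation took place (base_id_map is NULL) the id is
 * already final and is returned unchanged.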
*/ static __u32 btf_relocate_id(const struct btf *btf, __u32 id) { if (!btf->base_btf || !btf->base_id_map) return id; return btf->base_id_map[id]; } #ifdef CONFIG_DEBUG_INFO_BTF_MODULES static struct btf *btf_parse_module(const char *module_name, const void *data, unsigned int data_size, void *base_data, unsigned int base_data_size) { struct btf *btf = NULL, *vmlinux_btf, *base_btf = NULL; struct btf_verifier_env *env = NULL; struct bpf_verifier_log *log; int err = 0; vmlinux_btf = bpf_get_btf_vmlinux(); if (IS_ERR(vmlinux_btf)) return vmlinux_btf; if (!vmlinux_btf) return ERR_PTR(-EINVAL); env = kzalloc_obj(*env, GFP_KERNEL | __GFP_NOWARN); if (!env) return ERR_PTR(-ENOMEM); log = &env->log; log->level = BPF_LOG_KERNEL; if (base_data) { base_btf = btf_parse_base(env, ".BTF.base", base_data, base_data_size); if (IS_ERR(base_btf)) { err = PTR_ERR(base_btf); goto errout; } } else { base_btf = vmlinux_btf; } btf = kzalloc_obj(*btf, GFP_KERNEL | __GFP_NOWARN); if (!btf) { err = -ENOMEM; goto errout; } env->btf = btf; btf->base_btf = base_btf; btf->start_id = base_btf->nr_types; btf->start_str_off = base_btf->hdr.str_len; btf->kernel_btf = true; btf->named_start_id = 0; strscpy(btf->name, module_name); btf->data = kvmemdup(data, data_size, GFP_KERNEL | __GFP_NOWARN); if (!btf->data) { err = -ENOMEM; goto errout; } btf->data_size = data_size; err = btf_parse_hdr(env); if (err) goto errout; btf->nohdr_data = btf->data + btf->hdr.hdr_len; err = btf_parse_str_sec(env); if (err) goto errout; err = btf_check_all_metas(env); if (err) goto errout; err = btf_check_type_tags(env, btf, btf_nr_types(base_btf)); if (err) goto errout; if (base_btf != vmlinux_btf) { err = btf_relocate(btf, vmlinux_btf, &btf->base_id_map); if (err) goto errout; btf_free(base_btf); base_btf = vmlinux_btf; } btf_verifier_env_free(env); btf_check_sorted(btf); refcount_set(&btf->refcnt, 1); return btf; errout: btf_verifier_env_free(env); if (!IS_ERR(base_btf) && base_btf != vmlinux_btf) btf_free(base_btf); if (btf) { kvfree(btf->data); kvfree(btf->types); kfree(btf); } return ERR_PTR(err); } #endif /* CONFIG_DEBUG_INFO_BTF_MODULES */ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) { struct bpf_prog *tgt_prog = prog->aux->dst_prog; if (tgt_prog) return tgt_prog->aux->btf; else return prog->aux->attach_btf; } u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, int off) { const struct btf_param *args; const struct btf_type *t; u32 offset = 0, nr_args; int i; if (!func_proto) return off / 8; nr_args = btf_type_vlen(func_proto); args = (const struct btf_param *)(func_proto + 1); for (i = 0; i < nr_args; i++) { t = btf_type_skip_modifiers(btf, args[i].type, NULL); offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8); if (off < offset) return i; } t = btf_type_skip_modifiers(btf, func_proto->type, NULL); offset += btf_type_is_ptr(t) ? 
8 : roundup(t->size, 8); if (off < offset) return nr_args; return nr_args + 1; } static bool prog_args_trusted(const struct bpf_prog *prog) { enum bpf_attach_type atype = prog->expected_attach_type; switch (prog->type) { case BPF_PROG_TYPE_TRACING: return atype == BPF_TRACE_RAW_TP || atype == BPF_TRACE_ITER; case BPF_PROG_TYPE_LSM: return bpf_lsm_is_trusted(prog); case BPF_PROG_TYPE_STRUCT_OPS: return true; default: return false; } } int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto, u32 arg_no) { const struct btf_param *args; const struct btf_type *t; int off = 0, i; u32 sz; args = btf_params(func_proto); for (i = 0; i < arg_no; i++) { t = btf_type_by_id(btf, args[i].type); t = btf_resolve_size(btf, t, &sz); if (IS_ERR(t)) return PTR_ERR(t); off += roundup(sz, 8); } return off; } struct bpf_raw_tp_null_args { const char *func; u64 mask; }; static const struct bpf_raw_tp_null_args raw_tp_null_args[] = { /* sched */ { "sched_pi_setprio", 0x10 }, /* ... from sched_numa_pair_template event class */ { "sched_stick_numa", 0x100 }, { "sched_swap_numa", 0x100 }, /* afs */ { "afs_make_fs_call", 0x10 }, { "afs_make_fs_calli", 0x10 }, { "afs_make_fs_call1", 0x10 }, { "afs_make_fs_call2", 0x10 }, { "afs_protocol_error", 0x1 }, { "afs_flock_ev", 0x10 }, /* cachefiles */ { "cachefiles_lookup", 0x1 | 0x200 }, { "cachefiles_unlink", 0x1 }, { "cachefiles_rename", 0x1 }, { "cachefiles_prep_read", 0x1 }, { "cachefiles_mark_active", 0x1 }, { "cachefiles_mark_failed", 0x1 }, { "cachefiles_mark_inactive", 0x1 }, { "cachefiles_vfs_error", 0x1 }, { "cachefiles_io_error", 0x1 }, { "cachefiles_ondemand_open", 0x1 }, { "cachefiles_ondemand_copen", 0x1 }, { "cachefiles_ondemand_close", 0x1 }, { "cachefiles_ondemand_read", 0x1 }, { "cachefiles_ondemand_cread", 0x1 }, { "cachefiles_ondemand_fd_write", 0x1 }, { "cachefiles_ondemand_fd_release", 0x1 }, /* ext4, from ext4__mballoc event class */ { "ext4_mballoc_discard", 0x10 }, { "ext4_mballoc_free", 0x10 }, /* fib */ { "fib_table_lookup", 0x100 }, /* filelock */ /* ... from filelock_lock event class */ { "posix_lock_inode", 0x10 }, { "fcntl_setlk", 0x10 }, { "locks_remove_posix", 0x10 }, { "flock_lock_inode", 0x10 }, /* ... from filelock_lease event class */ { "break_lease_noblock", 0x10 }, { "break_lease_block", 0x10 }, { "break_lease_unblock", 0x10 }, { "generic_delete_lease", 0x10 }, { "time_out_leases", 0x10 }, /* host1x */ { "host1x_cdma_push_gather", 0x10000 }, /* huge_memory */ { "mm_khugepaged_scan_pmd", 0x10 }, { "mm_collapse_huge_page_isolate", 0x1 }, { "mm_khugepaged_scan_file", 0x10 }, { "mm_khugepaged_collapse_file", 0x10 }, /* kmem */ { "mm_page_alloc", 0x1 }, { "mm_page_pcpu_drain", 0x1 }, /* .. from mm_page event class */ { "mm_page_alloc_zone_locked", 0x1 }, /* netfs */ { "netfs_failure", 0x10 }, /* power */ { "device_pm_callback_start", 0x10 }, /* qdisc */ { "qdisc_dequeue", 0x1000 }, /* rxrpc */ { "rxrpc_recvdata", 0x1 }, { "rxrpc_resend", 0x10 }, { "rxrpc_tq", 0x10 }, { "rxrpc_client", 0x1 }, /* skb */ {"kfree_skb", 0x1000}, /* sunrpc */ { "xs_stream_read_data", 0x1 }, /* ... 
from xprt_cong_event event class */ { "xprt_reserve_cong", 0x10 }, { "xprt_release_cong", 0x10 }, { "xprt_get_cong", 0x10 }, { "xprt_put_cong", 0x10 }, /* tcp */ { "tcp_send_reset", 0x11 }, { "tcp_sendmsg_locked", 0x100 }, /* tegra_apb_dma */ { "tegra_dma_tx_status", 0x100 }, /* timer_migration */ { "tmigr_update_events", 0x1 }, /* writeback, from writeback_folio_template event class */ { "writeback_dirty_folio", 0x10 }, { "folio_wait_writeback", 0x10 }, /* rdma */ { "mr_integ_alloc", 0x2000 }, /* bpf_testmod */ { "bpf_testmod_test_read", 0x0 }, /* amdgpu */ { "amdgpu_vm_bo_map", 0x1 }, { "amdgpu_vm_bo_unmap", 0x1 }, /* netfs */ { "netfs_folioq", 0x1 }, /* xfs from xfs_defer_pending_class */ { "xfs_defer_create_intent", 0x1 }, { "xfs_defer_cancel_list", 0x1 }, { "xfs_defer_pending_finish", 0x1 }, { "xfs_defer_pending_abort", 0x1 }, { "xfs_defer_relog_intent", 0x1 }, { "xfs_defer_isolate_paused", 0x1 }, { "xfs_defer_item_pause", 0x1 }, { "xfs_defer_item_unpause", 0x1 }, /* xfs from xfs_defer_pending_item_class */ { "xfs_defer_add_item", 0x1 }, { "xfs_defer_cancel_item", 0x1 }, { "xfs_defer_finish_item", 0x1 }, /* xfs from xfs_icwalk_class */ { "xfs_ioc_free_eofblocks", 0x10 }, { "xfs_blockgc_free_space", 0x10 }, /* xfs from xfs_btree_cur_class */ { "xfs_btree_updkeys", 0x100 }, { "xfs_btree_overlapped_query_range", 0x100 }, /* xfs from xfs_imap_class*/ { "xfs_map_blocks_found", 0x10000 }, { "xfs_map_blocks_alloc", 0x10000 }, { "xfs_iomap_alloc", 0x1000 }, { "xfs_iomap_found", 0x1000 }, /* xfs from xfs_fs_class */ { "xfs_inodegc_flush", 0x1 }, { "xfs_inodegc_push", 0x1 }, { "xfs_inodegc_start", 0x1 }, { "xfs_inodegc_stop", 0x1 }, { "xfs_inodegc_queue", 0x1 }, { "xfs_inodegc_throttle", 0x1 }, { "xfs_fs_sync_fs", 0x1 }, { "xfs_blockgc_start", 0x1 }, { "xfs_blockgc_stop", 0x1 }, { "xfs_blockgc_worker", 0x1 }, { "xfs_blockgc_flush_all", 0x1 }, /* xfs_scrub */ { "xchk_nlinks_live_update", 0x10 }, /* xfs_scrub from xchk_metapath_class */ { "xchk_metapath_lookup", 0x100 }, /* nfsd */ { "nfsd_dirent", 0x1 }, { "nfsd_file_acquire", 0x1001 }, { "nfsd_file_insert_err", 0x1 }, { "nfsd_file_cons_err", 0x1 }, /* nfs4 */ { "nfs4_setup_sequence", 0x1 }, { "pnfs_update_layout", 0x10000 }, { "nfs4_inode_callback_event", 0x200 }, { "nfs4_inode_stateid_callback_event", 0x200 }, /* nfs from pnfs_layout_event */ { "pnfs_mds_fallback_pg_init_read", 0x10000 }, { "pnfs_mds_fallback_pg_init_write", 0x10000 }, { "pnfs_mds_fallback_pg_get_mirror_count", 0x10000 }, { "pnfs_mds_fallback_read_done", 0x10000 }, { "pnfs_mds_fallback_write_done", 0x10000 }, { "pnfs_mds_fallback_read_pagelist", 0x10000 }, { "pnfs_mds_fallback_write_pagelist", 0x10000 }, /* coda */ { "coda_dec_pic_run", 0x10 }, { "coda_dec_pic_done", 0x10 }, /* cfg80211 */ { "cfg80211_scan_done", 0x11 }, { "rdev_set_coalesce", 0x10 }, { "cfg80211_report_wowlan_wakeup", 0x100 }, { "cfg80211_inform_bss_frame", 0x100 }, { "cfg80211_michael_mic_failure", 0x10000 }, /* cfg80211 from wiphy_work_event */ { "wiphy_work_queue", 0x10 }, { "wiphy_work_run", 0x10 }, { "wiphy_work_cancel", 0x10 }, { "wiphy_work_flush", 0x10 }, /* hugetlbfs */ { "hugetlbfs_alloc_inode", 0x10 }, /* spufs */ { "spufs_context", 0x10 }, /* kvm_hv */ { "kvm_page_fault_enter", 0x100 }, /* dpu */ { "dpu_crtc_setup_mixer", 0x100 }, /* binder */ { "binder_transaction", 0x100 }, /* bcachefs */ { "btree_path_free", 0x100 }, /* hfi1_tx */ { "hfi1_sdma_progress", 0x1000 }, /* iptfs */ { "iptfs_ingress_postq_event", 0x1000 }, /* neigh */ { "neigh_update", 0x10 }, /* snd_firewire_lib */ { "amdtp_packet", 
0x100 }, }; bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const struct btf_type *t = prog->aux->attach_func_proto; struct bpf_prog *tgt_prog = prog->aux->dst_prog; struct btf *btf = bpf_prog_get_target_btf(prog); const char *tname = prog->aux->attach_func_name; struct bpf_verifier_log *log = info->log; const struct btf_param *args; bool ptr_err_raw_tp = false; const char *tag_value; u32 nr_args, arg; int i, ret; if (off % 8) { bpf_log(log, "func '%s' offset %d is not multiple of 8\n", tname, off); return false; } arg = btf_ctx_arg_idx(btf, t, off); args = (const struct btf_param *)(t + 1); /* if (t == NULL) Fall back to default BPF prog with * MAX_BPF_FUNC_REG_ARGS u64 arguments. */ nr_args = t ? btf_type_vlen(t) : MAX_BPF_FUNC_REG_ARGS; if (prog->aux->attach_btf_trace) { /* skip first 'void *__data' argument in btf_trace_##name typedef */ args++; nr_args--; } if (arg > nr_args) { bpf_log(log, "func '%s' doesn't have %d-th argument\n", tname, arg + 1); return false; } if (arg == nr_args) { switch (prog->expected_attach_type) { case BPF_LSM_MAC: /* mark we are accessing the return value */ info->is_retval = true; fallthrough; case BPF_LSM_CGROUP: case BPF_TRACE_FEXIT: case BPF_TRACE_FSESSION: /* When LSM programs are attached to void LSM hooks * they use FEXIT trampolines and when attached to * int LSM hooks, they use MODIFY_RETURN trampolines. * * While the LSM programs are BPF_MODIFY_RETURN-like * the check: * * if (ret_type != 'int') * return -EINVAL; * * is _not_ done here. This is still safe as LSM hooks * have only void and int return types. */ if (!t) return true; t = btf_type_by_id(btf, t->type); break; case BPF_MODIFY_RETURN: /* For now the BPF_MODIFY_RETURN can only be attached to * functions that return an int. */ if (!t) return false; t = btf_type_skip_modifiers(btf, t->type, NULL); if (!btf_type_is_small_int(t)) { bpf_log(log, "ret type %s not allowed for fmod_ret\n", btf_type_str(t)); return false; } break; default: bpf_log(log, "func '%s' doesn't have %d-th argument\n", tname, arg + 1); return false; } } else { if (!t) /* Default prog with MAX_BPF_FUNC_REG_ARGS args */ return true; t = btf_type_by_id(btf, args[arg].type); } /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (btf_type_is_small_int(t) || btf_is_any_enum(t) || btf_type_is_struct(t)) /* accessing a scalar */ return true; if (!btf_type_is_ptr(t)) { bpf_log(log, "func '%s' arg%d '%s' has type %s. Only pointer access is allowed\n", tname, arg, __btf_name_by_offset(btf, t->name_off), btf_type_str(t)); return false; } if (size != sizeof(u64)) { bpf_log(log, "func '%s' size %d must be 8\n", tname, size); return false; } /* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */ for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; u32 type, flag; type = base_type(ctx_arg_info->reg_type); flag = type_flag(ctx_arg_info->reg_type); if (ctx_arg_info->offset == off && type == PTR_TO_BUF && (flag & PTR_MAYBE_NULL)) { info->reg_type = ctx_arg_info->reg_type; return true; } } /* * If it's a single or multilevel pointer, except a pointer * to a structure, it's the same as scalar from the verifier * safety POV. Multilevel pointers to structures are treated as * scalars. The verifier lacks the context to infer the size of * their target memory regions. Either way, no further pointer * walking is allowed. 
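 *
 * For example, an "int *" or a "struct sk_buff **" argument is tracked as
 * a scalar value; only a direct "struct sk_buff *" style pointer allows
 * further BTF-guided dereferencing below.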
*/ if (!btf_type_is_struct_ptr(btf, t)) return true; /* this is a pointer to another type */ for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; if (ctx_arg_info->offset == off) { if (!ctx_arg_info->btf_id) { bpf_log(log,"invalid btf_id for context argument offset %u\n", off); return false; } info->reg_type = ctx_arg_info->reg_type; info->btf = ctx_arg_info->btf ? : btf_vmlinux; info->btf_id = ctx_arg_info->btf_id; info->ref_obj_id = ctx_arg_info->ref_obj_id; return true; } } info->reg_type = PTR_TO_BTF_ID; if (prog_args_trusted(prog)) info->reg_type |= PTR_TRUSTED; if (btf_param_match_suffix(btf, &args[arg], "__nullable")) info->reg_type |= PTR_MAYBE_NULL; if (prog->expected_attach_type == BPF_TRACE_RAW_TP) { struct btf *btf = prog->aux->attach_btf; const struct btf_type *t; const char *tname; /* BTF lookups cannot fail, return false on error */ t = btf_type_by_id(btf, prog->aux->attach_btf_id); if (!t) return false; tname = btf_name_by_offset(btf, t->name_off); if (!tname) return false; /* Checked by bpf_check_attach_target */ tname += sizeof("btf_trace_") - 1; for (i = 0; i < ARRAY_SIZE(raw_tp_null_args); i++) { /* Is this a func with potential NULL args? */ if (strcmp(tname, raw_tp_null_args[i].func)) continue; if (raw_tp_null_args[i].mask & (0x1ULL << (arg * 4))) info->reg_type |= PTR_MAYBE_NULL; /* Is the current arg IS_ERR? */ if (raw_tp_null_args[i].mask & (0x2ULL << (arg * 4))) ptr_err_raw_tp = true; break; } /* If we don't know NULL-ness specification and the tracepoint * is coming from a loadable module, be conservative and mark * argument as PTR_MAYBE_NULL. */ if (i == ARRAY_SIZE(raw_tp_null_args) && btf_is_module(btf)) info->reg_type |= PTR_MAYBE_NULL; } if (tgt_prog) { enum bpf_prog_type tgt_type; if (tgt_prog->type == BPF_PROG_TYPE_EXT) tgt_type = tgt_prog->aux->saved_dst_prog_type; else tgt_type = tgt_prog->type; ret = btf_translate_to_vmlinux(log, btf, t, tgt_type, arg); if (ret > 0) { info->btf = btf_vmlinux; info->btf_id = ret; return true; } else { return false; } } info->btf = btf; info->btf_id = t->type; t = btf_type_by_id(btf, t->type); if (btf_type_is_type_tag(t) && !btf_type_kflag(t)) { tag_value = __btf_name_by_offset(btf, t->name_off); if (strcmp(tag_value, "user") == 0) info->reg_type |= MEM_USER; if (strcmp(tag_value, "percpu") == 0) info->reg_type |= MEM_PERCPU; } /* skip modifiers */ while (btf_type_is_modifier(t)) { info->btf_id = t->type; t = btf_type_by_id(btf, t->type); } if (!btf_type_is_struct(t)) { bpf_log(log, "func '%s' arg%d type %s is not a struct\n", tname, arg, btf_type_str(t)); return false; } bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n", tname, arg, info->btf_id, btf_type_str(t), __btf_name_by_offset(btf, t->name_off)); /* Perform all checks on the validity of type for this argument, but if * we know it can be IS_ERR at runtime, scrub pointer type and mark as * scalar. 
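 *
 * The raw_tp_null_args[].mask encoding above uses one nibble per
 * argument: bit 0 (0x1 << (arg * 4)) means the argument may be NULL,
 * bit 1 (0x2 << (arg * 4)) means it may carry an IS_ERR() value.
 * E.g. { "kfree_skb", 0x1000 } marks argument #3 as possibly NULL.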
*/ if (ptr_err_raw_tp) { bpf_log(log, "marking pointer arg%d as scalar as it may encode error", arg); info->reg_type = SCALAR_VALUE; } return true; } EXPORT_SYMBOL_GPL(btf_ctx_access); enum bpf_struct_walk_result { /* < 0 error */ WALK_SCALAR = 0, WALK_PTR, WALK_PTR_UNTRUSTED, WALK_STRUCT, }; static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, u32 *next_btf_id, enum bpf_type_flag *flag, const char **field_name) { u32 i, moff, mtrue_end, msize = 0, total_nelems = 0; const struct btf_type *mtype, *elem_type = NULL; const struct btf_member *member; const char *tname, *mname, *tag_value; u32 vlen, elem_id, mid; again: if (btf_type_is_modifier(t)) t = btf_type_skip_modifiers(btf, t->type, NULL); tname = __btf_name_by_offset(btf, t->name_off); if (!btf_type_is_struct(t)) { bpf_log(log, "Type '%s' is not a struct\n", tname); return -EINVAL; } vlen = btf_type_vlen(t); if (BTF_INFO_KIND(t->info) == BTF_KIND_UNION && vlen != 1 && !(*flag & PTR_UNTRUSTED)) /* * walking unions yields untrusted pointers * with exception of __bpf_md_ptr and other * unions with a single member */ *flag |= PTR_UNTRUSTED; if (off + size > t->size) { /* If the last element is a variable size array, we may * need to relax the rule. */ struct btf_array *array_elem; if (vlen == 0) goto error; member = btf_type_member(t) + vlen - 1; mtype = btf_type_skip_modifiers(btf, member->type, NULL); if (!btf_type_is_array(mtype)) goto error; array_elem = (struct btf_array *)(mtype + 1); if (array_elem->nelems != 0) goto error; moff = __btf_member_bit_offset(t, member) / 8; if (off < moff) goto error; /* allow structure and integer */ t = btf_type_skip_modifiers(btf, array_elem->type, NULL); if (btf_type_is_int(t)) return WALK_SCALAR; if (!btf_type_is_struct(t)) goto error; off = (off - moff) % t->size; goto again; error: bpf_log(log, "access beyond struct %s at off %u size %u\n", tname, off, size); return -EACCES; } for_each_member(i, t, member) { /* offset of the field in bytes */ moff = __btf_member_bit_offset(t, member) / 8; if (off + size <= moff) /* won't find anything, field is already too far */ break; if (__btf_member_bitfield_size(t, member)) { u32 end_bit = __btf_member_bit_offset(t, member) + __btf_member_bitfield_size(t, member); /* off <= moff instead of off == moff because clang * does not generate a BTF member for anonymous * bitfield like the ":16" here: * struct { * int :16; * int x:8; * }; */ if (off <= moff && BITS_ROUNDUP_BYTES(end_bit) <= off + size) return WALK_SCALAR; /* off may be accessing a following member * * or * * Doing partial access at either end of this * bitfield. Continue on this case also to * treat it as not accessing this bitfield * and eventually error out as field not * found to keep it simple. * It could be relaxed if there was a legit * partial access case later. */ continue; } /* In case of "off" is pointing to holes of a struct */ if (off < moff) break; /* type of the field */ mid = member->type; mtype = btf_type_by_id(btf, member->type); mname = __btf_name_by_offset(btf, member->name_off); mtype = __btf_resolve_size(btf, mtype, &msize, &elem_type, &elem_id, &total_nelems, &mid); if (IS_ERR(mtype)) { bpf_log(log, "field %s doesn't have size\n", mname); return -EFAULT; } mtrue_end = moff + msize; if (off >= mtrue_end) /* no overlap with member, keep iterating */ continue; if (btf_type_is_array(mtype)) { u32 elem_idx; /* __btf_resolve_size() above helps to * linearize a multi-dimensional array. 
* * The logic here is treating an array * in a struct as the following way: * * struct outer { * struct inner array[2][2]; * }; * * looks like: * * struct outer { * struct inner array_elem0; * struct inner array_elem1; * struct inner array_elem2; * struct inner array_elem3; * }; * * When accessing outer->array[1][0], it moves * moff to "array_elem2", set mtype to * "struct inner", and msize also becomes * sizeof(struct inner). Then most of the * remaining logic will fall through without * caring the current member is an array or * not. * * Unlike mtype/msize/moff, mtrue_end does not * change. The naming difference ("_true") tells * that it is not always corresponding to * the current mtype/msize/moff. * It is the true end of the current * member (i.e. array in this case). That * will allow an int array to be accessed like * a scratch space, * i.e. allow access beyond the size of * the array's element as long as it is * within the mtrue_end boundary. */ /* skip empty array */ if (moff == mtrue_end) continue; msize /= total_nelems; elem_idx = (off - moff) / msize; moff += elem_idx * msize; mtype = elem_type; mid = elem_id; } /* the 'off' we're looking for is either equal to start * of this field or inside of this struct */ if (btf_type_is_struct(mtype)) { /* our field must be inside that union or struct */ t = mtype; /* return if the offset matches the member offset */ if (off == moff) { *next_btf_id = mid; return WALK_STRUCT; } /* adjust offset we're looking for */ off -= moff; goto again; } if (btf_type_is_ptr(mtype)) { const struct btf_type *stype, *t; enum bpf_type_flag tmp_flag = 0; u32 id; if (msize != size || off != moff) { bpf_log(log, "cannot access ptr member %s with moff %u in struct %s with off %u size %u\n", mname, moff, tname, off, size); return -EACCES; } /* check type tag */ t = btf_type_by_id(btf, mtype->type); if (btf_type_is_type_tag(t) && !btf_type_kflag(t)) { tag_value = __btf_name_by_offset(btf, t->name_off); /* check __user tag */ if (strcmp(tag_value, "user") == 0) tmp_flag = MEM_USER; /* check __percpu tag */ if (strcmp(tag_value, "percpu") == 0) tmp_flag = MEM_PERCPU; /* check __rcu tag */ if (strcmp(tag_value, "rcu") == 0) tmp_flag = MEM_RCU; } stype = btf_type_skip_modifiers(btf, mtype->type, &id); if (btf_type_is_struct(stype)) { *next_btf_id = id; *flag |= tmp_flag; if (field_name) *field_name = mname; return WALK_PTR; } return WALK_PTR_UNTRUSTED; } /* Allow more flexible access within an int as long as * it is within mtrue_end. * Since mtrue_end could be the end of an array, * that also allows using an array of int as a scratch * space. e.g. skb->cb[]. 
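 *
 * Concretely (assuming skb->cb is declared as "char cb[48]"), an 8-byte
 * load that starts in the middle of cb[] is accepted as long as it ends
 * within the 48-byte member, even though no single element is 8 bytes wide.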
*/ if (off + size > mtrue_end && !(*flag & PTR_UNTRUSTED)) { bpf_log(log, "access beyond the end of member %s (mend:%u) in struct %s with off %u size %u\n", mname, mtrue_end, tname, off, size); return -EACCES; } return WALK_SCALAR; } bpf_log(log, "struct %s doesn't have field at offset %d\n", tname, off); return -EINVAL; } int btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size, enum bpf_access_type atype __maybe_unused, u32 *next_btf_id, enum bpf_type_flag *flag, const char **field_name) { const struct btf *btf = reg->btf; enum bpf_type_flag tmp_flag = 0; const struct btf_type *t; u32 id = reg->btf_id; int err; while (type_is_alloc(reg->type)) { struct btf_struct_meta *meta; struct btf_record *rec; int i; meta = btf_find_struct_meta(btf, id); if (!meta) break; rec = meta->record; for (i = 0; i < rec->cnt; i++) { struct btf_field *field = &rec->fields[i]; u32 offset = field->offset; if (off < offset + field->size && offset < off + size) { bpf_log(log, "direct access to %s is disallowed\n", btf_field_type_name(field->type)); return -EACCES; } } break; } t = btf_type_by_id(btf, id); do { err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag, field_name); switch (err) { case WALK_PTR: /* For local types, the destination register cannot * become a pointer again. */ if (type_is_alloc(reg->type)) return SCALAR_VALUE; /* If we found the pointer or scalar on t+off, * we're done. */ *next_btf_id = id; *flag = tmp_flag; return PTR_TO_BTF_ID; case WALK_PTR_UNTRUSTED: *flag = MEM_RDONLY | PTR_UNTRUSTED; return PTR_TO_MEM; case WALK_SCALAR: return SCALAR_VALUE; case WALK_STRUCT: /* We found nested struct, so continue the search * by diving in it. At this point the offset is * aligned with the new type, so set it to 0. */ t = btf_type_by_id(btf, id); off = 0; break; default: /* It's either error or unknown return value.. * scream and leave. */ if (WARN_ONCE(err > 0, "unknown btf_struct_walk return value")) return -EINVAL; return err; } } while (t); return -EINVAL; } /* Check that two BTF types, each specified as an BTF object + id, are exactly * the same. Trivial ID check is not enough due to module BTFs, because we can * end up with two different module BTFs, but IDs point to the common type in * vmlinux BTF. */ bool btf_types_are_same(const struct btf *btf1, u32 id1, const struct btf *btf2, u32 id2) { if (id1 != id2) return false; if (btf1 == btf2) return true; return btf_type_by_id(btf1, id1) == btf_type_by_id(btf2, id2); } bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *btf, u32 id, int off, const struct btf *need_btf, u32 need_type_id, bool strict) { const struct btf_type *type; enum bpf_type_flag flag = 0; int err; /* Are we already done? */ if (off == 0 && btf_types_are_same(btf, id, need_btf, need_type_id)) return true; /* In case of strict type match, we do not walk struct, the top level * type match must succeed. When strict is true, off should have already * been 0. */ if (strict) return false; again: type = btf_type_by_id(btf, id); if (!type) return false; err = btf_struct_walk(log, btf, type, off, 1, &id, &flag, NULL); if (err != WALK_STRUCT) return false; /* We found nested struct object. If it matches * the requested ID, we're done. Otherwise let's * continue the search with offset 0 in the new * type. 
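 *
 * E.g. looking for "struct sock" at offset 0 of "struct tcp_sock" walks
 * through the embedded "struct inet_connection_sock" and "struct
 * inet_sock" first, restarting at offset 0 in each, until the match.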
*/ if (!btf_types_are_same(btf, id, need_btf, need_type_id)) { off = 0; goto again; } return true; } static int __get_type_size(struct btf *btf, u32 btf_id, const struct btf_type **ret_type) { const struct btf_type *t; *ret_type = btf_type_by_id(btf, 0); if (!btf_id) /* void */ return 0; t = btf_type_by_id(btf, btf_id); while (t && btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (!t) return -EINVAL; *ret_type = t; if (btf_type_is_ptr(t)) /* kernel size of pointer. Not BPF's size of pointer*/ return sizeof(void *); if (btf_type_is_int(t) || btf_is_any_enum(t) || btf_type_is_struct(t)) return t->size; return -EINVAL; } static u8 __get_type_fmodel_flags(const struct btf_type *t) { u8 flags = 0; if (btf_type_is_struct(t)) flags |= BTF_FMODEL_STRUCT_ARG; if (btf_type_is_signed_int(t)) flags |= BTF_FMODEL_SIGNED_ARG; return flags; } int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf *btf, const struct btf_type *func, const char *tname, struct btf_func_model *m) { const struct btf_param *args; const struct btf_type *t; u32 i, nargs; int ret; if (!func) { /* BTF function prototype doesn't match the verifier types. * Fall back to MAX_BPF_FUNC_REG_ARGS u64 args. */ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { m->arg_size[i] = 8; m->arg_flags[i] = 0; } m->ret_size = 8; m->ret_flags = 0; m->nr_args = MAX_BPF_FUNC_REG_ARGS; return 0; } args = (const struct btf_param *)(func + 1); nargs = btf_type_vlen(func); if (nargs > MAX_BPF_FUNC_ARGS) { bpf_log(log, "The function %s has %d arguments. Too many.\n", tname, nargs); return -EINVAL; } ret = __get_type_size(btf, func->type, &t); if (ret < 0 || btf_type_is_struct(t)) { bpf_log(log, "The function %s return type %s is unsupported.\n", tname, btf_type_str(t)); return -EINVAL; } m->ret_size = ret; m->ret_flags = __get_type_fmodel_flags(t); for (i = 0; i < nargs; i++) { if (i == nargs - 1 && args[i].type == 0) { bpf_log(log, "The function %s with variable args is unsupported.\n", tname); return -EINVAL; } ret = __get_type_size(btf, args[i].type, &t); /* No support of struct argument size greater than 16 bytes */ if (ret < 0 || ret > 16) { bpf_log(log, "The function %s arg%d type %s is unsupported.\n", tname, i, btf_type_str(t)); return -EINVAL; } if (ret == 0) { bpf_log(log, "The function %s has malformed void argument.\n", tname); return -EINVAL; } m->arg_size[i] = ret; m->arg_flags[i] = __get_type_fmodel_flags(t); } m->nr_args = nargs; return 0; } /* Compare BTFs of two functions assuming only scalars and pointers to context. * t1 points to BTF_KIND_FUNC in btf1 * t2 points to BTF_KIND_FUNC in btf2 * Returns: * EINVAL - function prototype mismatch * EFAULT - verifier bug * 0 - 99% match. The last 1% is validated by the verifier. 
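 *
 * Sketch (function name is illustrative): when a freplace program targets
 * a global function "int f(struct xdp_md *ctx, u64 flags)", its
 * replacement must also be a global function taking a "struct xdp_md *"
 * and a scalar of the same size; struct names must match, while scalars
 * only need matching kind and size.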
*/ static int btf_check_func_type_match(struct bpf_verifier_log *log, struct btf *btf1, const struct btf_type *t1, struct btf *btf2, const struct btf_type *t2) { const struct btf_param *args1, *args2; const char *fn1, *fn2, *s1, *s2; u32 nargs1, nargs2, i; fn1 = btf_name_by_offset(btf1, t1->name_off); fn2 = btf_name_by_offset(btf2, t2->name_off); if (btf_func_linkage(t1) != BTF_FUNC_GLOBAL) { bpf_log(log, "%s() is not a global function\n", fn1); return -EINVAL; } if (btf_func_linkage(t2) != BTF_FUNC_GLOBAL) { bpf_log(log, "%s() is not a global function\n", fn2); return -EINVAL; } t1 = btf_type_by_id(btf1, t1->type); if (!t1 || !btf_type_is_func_proto(t1)) return -EFAULT; t2 = btf_type_by_id(btf2, t2->type); if (!t2 || !btf_type_is_func_proto(t2)) return -EFAULT; args1 = (const struct btf_param *)(t1 + 1); nargs1 = btf_type_vlen(t1); args2 = (const struct btf_param *)(t2 + 1); nargs2 = btf_type_vlen(t2); if (nargs1 != nargs2) { bpf_log(log, "%s() has %d args while %s() has %d args\n", fn1, nargs1, fn2, nargs2); return -EINVAL; } t1 = btf_type_skip_modifiers(btf1, t1->type, NULL); t2 = btf_type_skip_modifiers(btf2, t2->type, NULL); if (t1->info != t2->info) { bpf_log(log, "Return type %s of %s() doesn't match type %s of %s()\n", btf_type_str(t1), fn1, btf_type_str(t2), fn2); return -EINVAL; } for (i = 0; i < nargs1; i++) { t1 = btf_type_skip_modifiers(btf1, args1[i].type, NULL); t2 = btf_type_skip_modifiers(btf2, args2[i].type, NULL); if (t1->info != t2->info) { bpf_log(log, "arg%d in %s() is %s while %s() has %s\n", i, fn1, btf_type_str(t1), fn2, btf_type_str(t2)); return -EINVAL; } if (btf_type_has_size(t1) && t1->size != t2->size) { bpf_log(log, "arg%d in %s() has size %d while %s() has %d\n", i, fn1, t1->size, fn2, t2->size); return -EINVAL; } /* global functions are validated with scalars and pointers * to context only. And only global functions can be replaced. * Hence type check only those types. */ if (btf_type_is_int(t1) || btf_is_any_enum(t1)) continue; if (!btf_type_is_ptr(t1)) { bpf_log(log, "arg%d in %s() has unrecognized type\n", i, fn1); return -EINVAL; } t1 = btf_type_skip_modifiers(btf1, t1->type, NULL); t2 = btf_type_skip_modifiers(btf2, t2->type, NULL); if (!btf_type_is_struct(t1)) { bpf_log(log, "arg%d in %s() is not a pointer to context\n", i, fn1); return -EINVAL; } if (!btf_type_is_struct(t2)) { bpf_log(log, "arg%d in %s() is not a pointer to context\n", i, fn2); return -EINVAL; } /* This is an optional check to make program writing easier. * Compare names of structs and report an error to the user. * btf_prepare_func_args() already checked that t2 struct * is a context type. btf_prepare_func_args() will check * later that t1 struct is a context type as well. 
*/ s1 = btf_name_by_offset(btf1, t1->name_off); s2 = btf_name_by_offset(btf2, t2->name_off); if (strcmp(s1, s2)) { bpf_log(log, "arg%d %s(struct %s *) doesn't match %s(struct %s *)\n", i, fn1, s1, fn2, s2); return -EINVAL; } } return 0; } /* Compare BTFs of given program with BTF of target program */ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, struct btf *btf2, const struct btf_type *t2) { struct btf *btf1 = prog->aux->btf; const struct btf_type *t1; u32 btf_id = 0; if (!prog->aux->func_info) { bpf_log(log, "Program extension requires BTF\n"); return -EINVAL; } btf_id = prog->aux->func_info[0].type_id; if (!btf_id) return -EFAULT; t1 = btf_type_by_id(btf1, btf_id); if (!t1 || !btf_type_is_func(t1)) return -EFAULT; return btf_check_func_type_match(log, btf1, t1, btf2, t2); } static bool btf_is_dynptr_ptr(const struct btf *btf, const struct btf_type *t) { const char *name; t = btf_type_by_id(btf, t->type); /* skip PTR */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); /* allow either struct or struct forward declaration */ if (btf_type_is_struct(t) || (btf_type_is_fwd(t) && btf_type_kflag(t) == 0)) { name = btf_str_by_offset(btf, t->name_off); return name && strcmp(name, "bpf_dynptr") == 0; } return false; } struct bpf_cand_cache { const char *name; u32 name_len; u16 kind; u16 cnt; struct { const struct btf *btf; u32 id; } cands[]; }; static DEFINE_MUTEX(cand_cache_mutex); static struct bpf_cand_cache * bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id); static int btf_get_ptr_to_btf_id(struct bpf_verifier_log *log, int arg_idx, const struct btf *btf, const struct btf_type *t) { struct bpf_cand_cache *cc; struct bpf_core_ctx ctx = { .btf = btf, .log = log, }; u32 kern_type_id, type_id; int err = 0; /* skip PTR and modifiers */ type_id = t->type; t = btf_type_by_id(btf, t->type); while (btf_type_is_modifier(t)) { type_id = t->type; t = btf_type_by_id(btf, t->type); } mutex_lock(&cand_cache_mutex); cc = bpf_core_find_cands(&ctx, type_id); if (IS_ERR(cc)) { err = PTR_ERR(cc); bpf_log(log, "arg#%d reference type('%s %s') candidate matching error: %d\n", arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off), err); goto cand_cache_unlock; } if (cc->cnt != 1) { bpf_log(log, "arg#%d reference type('%s %s') %s\n", arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off), cc->cnt == 0 ? "has no matches" : "is ambiguous"); err = cc->cnt == 0 ? -ENOENT : -ESRCH; goto cand_cache_unlock; } if (btf_is_module(cc->cands[0].btf)) { bpf_log(log, "arg#%d reference type('%s %s') points to kernel module type (unsupported)\n", arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off)); err = -EOPNOTSUPP; goto cand_cache_unlock; } kern_type_id = cc->cands[0].id; cand_cache_unlock: mutex_unlock(&cand_cache_mutex); if (err) return err; return kern_type_id; } enum btf_arg_tag { ARG_TAG_CTX = BIT_ULL(0), ARG_TAG_NONNULL = BIT_ULL(1), ARG_TAG_TRUSTED = BIT_ULL(2), ARG_TAG_UNTRUSTED = BIT_ULL(3), ARG_TAG_NULLABLE = BIT_ULL(4), ARG_TAG_ARENA = BIT_ULL(5), }; /* Process BTF of a function to produce high-level expectation of function * arguments (like ARG_PTR_TO_CTX, or ARG_PTR_TO_MEM, etc). This information * is cached in subprog info for reuse. * Returns: * EFAULT - there is a verifier bug. Abort verification. * EINVAL - cannot convert BTF. * 0 - Successfully processed BTF and constructed argument expectations. 
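 *
 * For illustration, a global subprog in an XDP program, using libbpf-style
 * __arg_* decl tags ("struct my_data" is hypothetical):
 *
 *   __noinline int handle(struct xdp_md *ctx __arg_ctx,
 *                         struct task_struct *task __arg_trusted __arg_nullable,
 *                         struct my_data *d, u64 flags)
 *
 * would yield ARG_PTR_TO_CTX, ARG_PTR_TO_BTF_ID | PTR_TRUSTED |
 * PTR_MAYBE_NULL, ARG_PTR_TO_MEM | PTR_MAYBE_NULL (with mem_size =
 * sizeof(struct my_data)) and ARG_ANYTHING, respectively.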
*/ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) { bool is_global = subprog_aux(env, subprog)->linkage == BTF_FUNC_GLOBAL; struct bpf_subprog_info *sub = subprog_info(env, subprog); struct bpf_verifier_log *log = &env->log; struct bpf_prog *prog = env->prog; enum bpf_prog_type prog_type = prog->type; struct btf *btf = prog->aux->btf; const struct btf_param *args; const struct btf_type *t, *ref_t, *fn_t; u32 i, nargs, btf_id; const char *tname; if (sub->args_cached) return 0; if (!prog->aux->func_info) { verifier_bug(env, "func_info undefined"); return -EFAULT; } btf_id = prog->aux->func_info[subprog].type_id; if (!btf_id) { if (!is_global) /* not fatal for static funcs */ return -EINVAL; bpf_log(log, "Global functions need valid BTF\n"); return -EFAULT; } fn_t = btf_type_by_id(btf, btf_id); if (!fn_t || !btf_type_is_func(fn_t)) { /* These checks were already done by the verifier while loading * struct bpf_func_info */ bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n", subprog); return -EFAULT; } tname = btf_name_by_offset(btf, fn_t->name_off); if (prog->aux->func_info_aux[subprog].unreliable) { verifier_bug(env, "unreliable BTF for function %s()", tname); return -EFAULT; } if (prog_type == BPF_PROG_TYPE_EXT) prog_type = prog->aux->dst_prog->type; t = btf_type_by_id(btf, fn_t->type); if (!t || !btf_type_is_func_proto(t)) { bpf_log(log, "Invalid type of function %s()\n", tname); return -EFAULT; } args = (const struct btf_param *)(t + 1); nargs = btf_type_vlen(t); if (nargs > MAX_BPF_FUNC_REG_ARGS) { if (!is_global) return -EINVAL; bpf_log(log, "Global function %s() with %d > %d args. Buggy compiler.\n", tname, nargs, MAX_BPF_FUNC_REG_ARGS); return -EINVAL; } /* check that function is void or returns int, exception cb also requires this */ t = btf_type_by_id(btf, t->type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (!btf_type_is_void(t) && !btf_type_is_int(t) && !btf_is_any_enum(t)) { if (!is_global) return -EINVAL; bpf_log(log, "Global function %s() return value not void or scalar. " "Only those are supported.\n", tname); return -EINVAL; } /* Convert BTF function arguments into verifier types. * Only PTR_TO_CTX and SCALAR are supported atm. 
*/ for (i = 0; i < nargs; i++) { u32 tags = 0; int id = btf_named_start_id(btf, false) - 1; /* 'arg:<tag>' decl_tag takes precedence over derivation of * register type from BTF type itself */ while ((id = btf_find_next_decl_tag(btf, fn_t, i, "arg:", id)) > 0) { const struct btf_type *tag_t = btf_type_by_id(btf, id); const char *tag = __btf_name_by_offset(btf, tag_t->name_off) + 4; /* disallow arg tags in static subprogs */ if (!is_global) { bpf_log(log, "arg#%d type tag is not supported in static functions\n", i); return -EOPNOTSUPP; } if (strcmp(tag, "ctx") == 0) { tags |= ARG_TAG_CTX; } else if (strcmp(tag, "trusted") == 0) { tags |= ARG_TAG_TRUSTED; } else if (strcmp(tag, "untrusted") == 0) { tags |= ARG_TAG_UNTRUSTED; } else if (strcmp(tag, "nonnull") == 0) { tags |= ARG_TAG_NONNULL; } else if (strcmp(tag, "nullable") == 0) { tags |= ARG_TAG_NULLABLE; } else if (strcmp(tag, "arena") == 0) { tags |= ARG_TAG_ARENA; } else { bpf_log(log, "arg#%d has unsupported set of tags\n", i); return -EOPNOTSUPP; } } if (id != -ENOENT) { bpf_log(log, "arg#%d type tag fetching failure: %d\n", i, id); return id; } t = btf_type_by_id(btf, args[i].type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (!btf_type_is_ptr(t)) goto skip_pointer; if ((tags & ARG_TAG_CTX) || btf_is_prog_ctx_type(log, btf, t, prog_type, i)) { if (tags & ~ARG_TAG_CTX) { bpf_log(log, "arg#%d has invalid combination of tags\n", i); return -EINVAL; } if ((tags & ARG_TAG_CTX) && btf_validate_prog_ctx_type(log, btf, t, i, prog_type, prog->expected_attach_type)) return -EINVAL; sub->args[i].arg_type = ARG_PTR_TO_CTX; continue; } if (btf_is_dynptr_ptr(btf, t)) { if (tags) { bpf_log(log, "arg#%d has invalid combination of tags\n", i); return -EINVAL; } sub->args[i].arg_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY; continue; } if (tags & ARG_TAG_TRUSTED) { int kern_type_id; if (tags & ARG_TAG_NONNULL) { bpf_log(log, "arg#%d has invalid combination of tags\n", i); return -EINVAL; } kern_type_id = btf_get_ptr_to_btf_id(log, i, btf, t); if (kern_type_id < 0) return kern_type_id; sub->args[i].arg_type = ARG_PTR_TO_BTF_ID | PTR_TRUSTED; if (tags & ARG_TAG_NULLABLE) sub->args[i].arg_type |= PTR_MAYBE_NULL; sub->args[i].btf_id = kern_type_id; continue; } if (tags & ARG_TAG_UNTRUSTED) { struct btf *vmlinux_btf; int kern_type_id; if (tags & ~ARG_TAG_UNTRUSTED) { bpf_log(log, "arg#%d untrusted cannot be combined with any other tags\n", i); return -EINVAL; } ref_t = btf_type_skip_modifiers(btf, t->type, NULL); if (btf_type_is_void(ref_t) || btf_type_is_primitive(ref_t)) { sub->args[i].arg_type = ARG_PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED; sub->args[i].mem_size = 0; continue; } kern_type_id = btf_get_ptr_to_btf_id(log, i, btf, t); if (kern_type_id < 0) return kern_type_id; vmlinux_btf = bpf_get_btf_vmlinux(); ref_t = btf_type_by_id(vmlinux_btf, kern_type_id); if (!btf_type_is_struct(ref_t)) { tname = __btf_name_by_offset(vmlinux_btf, t->name_off); bpf_log(log, "arg#%d has type %s '%s', but only struct or primitive types are allowed\n", i, btf_type_str(ref_t), tname); return -EINVAL; } sub->args[i].arg_type = ARG_PTR_TO_BTF_ID | PTR_UNTRUSTED; sub->args[i].btf_id = kern_type_id; continue; } if (tags & ARG_TAG_ARENA) { if (tags & ~ARG_TAG_ARENA) { bpf_log(log, "arg#%d arena cannot be combined with any other tags\n", i); return -EINVAL; } sub->args[i].arg_type = ARG_PTR_TO_ARENA; continue; } if (is_global) { /* generic user data pointer */ u32 mem_size; if (tags & ARG_TAG_NULLABLE) { bpf_log(log, "arg#%d has invalid combination of 
tags\n", i); return -EINVAL; } t = btf_type_skip_modifiers(btf, t->type, NULL); ref_t = btf_resolve_size(btf, t, &mem_size); if (IS_ERR(ref_t)) { bpf_log(log, "arg#%d reference type('%s %s') size cannot be determined: %ld\n", i, btf_type_str(t), btf_name_by_offset(btf, t->name_off), PTR_ERR(ref_t)); return -EINVAL; } sub->args[i].arg_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL; if (tags & ARG_TAG_NONNULL) sub->args[i].arg_type &= ~PTR_MAYBE_NULL; sub->args[i].mem_size = mem_size; continue; } skip_pointer: if (tags) { bpf_log(log, "arg#%d has pointer tag, but is not a pointer type\n", i); return -EINVAL; } if (btf_type_is_int(t) || btf_is_any_enum(t)) { sub->args[i].arg_type = ARG_ANYTHING; continue; } if (!is_global) return -EINVAL; bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n", i, btf_type_str(t), tname); return -EINVAL; } sub->arg_cnt = nargs; sub->args_cached = true; return 0; } static void btf_type_show(const struct btf *btf, u32 type_id, void *obj, struct btf_show *show) { const struct btf_type *t = btf_type_by_id(btf, type_id); show->btf = btf; memset(&show->state, 0, sizeof(show->state)); memset(&show->obj, 0, sizeof(show->obj)); btf_type_ops(t)->show(btf, t, type_id, obj, 0, show); } __printf(2, 0) static void btf_seq_show(struct btf_show *show, const char *fmt, va_list args) { seq_vprintf((struct seq_file *)show->target, fmt, args); } int btf_type_seq_show_flags(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m, u64 flags) { struct btf_show sseq; sseq.target = m; sseq.showfn = btf_seq_show; sseq.flags = flags; btf_type_show(btf, type_id, obj, &sseq); return sseq.state.status; } void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m) { (void) btf_type_seq_show_flags(btf, type_id, obj, m, BTF_SHOW_NONAME | BTF_SHOW_COMPACT | BTF_SHOW_ZERO | BTF_SHOW_UNSAFE); } struct btf_show_snprintf { struct btf_show show; int len_left; /* space left in string */ int len; /* length we would have written */ }; __printf(2, 0) static void btf_snprintf_show(struct btf_show *show, const char *fmt, va_list args) { struct btf_show_snprintf *ssnprintf = (struct btf_show_snprintf *)show; int len; len = vsnprintf(show->target, ssnprintf->len_left, fmt, args); if (len < 0) { ssnprintf->len_left = 0; ssnprintf->len = len; } else if (len >= ssnprintf->len_left) { /* no space, drive on to get length we would have written */ ssnprintf->len_left = 0; ssnprintf->len += len; } else { ssnprintf->len_left -= len; ssnprintf->len += len; show->target += len; } } int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj, char *buf, int len, u64 flags) { struct btf_show_snprintf ssnprintf; ssnprintf.show.target = buf; ssnprintf.show.flags = flags; ssnprintf.show.showfn = btf_snprintf_show; ssnprintf.len_left = len; ssnprintf.len = 0; btf_type_show(btf, type_id, obj, (struct btf_show *)&ssnprintf); /* If we encountered an error, return it. 
*/ if (ssnprintf.show.state.status) return ssnprintf.show.state.status; /* Otherwise return length we would have written */ return ssnprintf.len; } #ifdef CONFIG_PROC_FS static void bpf_btf_show_fdinfo(struct seq_file *m, struct file *filp) { const struct btf *btf = filp->private_data; seq_printf(m, "btf_id:\t%u\n", READ_ONCE(btf->id)); } #endif static int btf_release(struct inode *inode, struct file *filp) { btf_put(filp->private_data); return 0; } const struct file_operations btf_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_btf_show_fdinfo, #endif .release = btf_release, }; static int __btf_new_fd(struct btf *btf) { return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC); } int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { struct btf *btf; int ret; btf = btf_parse(attr, uattr, uattr_size); if (IS_ERR(btf)) return PTR_ERR(btf); ret = btf_alloc_id(btf); if (ret) { btf_free(btf); return ret; } /* * The BTF ID is published to the userspace. * All BTF free must go through call_rcu() from * now on (i.e. free by calling btf_put()). */ ret = __btf_new_fd(btf); if (ret < 0) btf_put(btf); return ret; } struct btf *btf_get_by_fd(int fd) { struct btf *btf; CLASS(fd, f)(fd); btf = __btf_get_by_fd(f); if (!IS_ERR(btf)) refcount_inc(&btf->refcnt); return btf; } int btf_get_info_by_fd(const struct btf *btf, const union bpf_attr *attr, union bpf_attr __user *uattr) { struct bpf_btf_info __user *uinfo; struct bpf_btf_info info; u32 info_copy, btf_copy; void __user *ubtf; char __user *uname; u32 uinfo_len, uname_len, name_len; int ret = 0; uinfo = u64_to_user_ptr(attr->info.info); uinfo_len = attr->info.info_len; info_copy = min_t(u32, uinfo_len, sizeof(info)); memset(&info, 0, sizeof(info)); if (copy_from_user(&info, uinfo, info_copy)) return -EFAULT; info.id = READ_ONCE(btf->id); ubtf = u64_to_user_ptr(info.btf); btf_copy = min_t(u32, btf->data_size, info.btf_size); if (copy_to_user(ubtf, btf->data, btf_copy)) return -EFAULT; info.btf_size = btf->data_size; info.kernel_btf = btf->kernel_btf; uname = u64_to_user_ptr(info.name); uname_len = info.name_len; if (!uname ^ !uname_len) return -EINVAL; name_len = strlen(btf->name); info.name_len = name_len; if (uname) { if (uname_len >= name_len + 1) { if (copy_to_user(uname, btf->name, name_len + 1)) return -EFAULT; } else { char zero = '\0'; if (copy_to_user(uname, btf->name, uname_len - 1)) return -EFAULT; if (put_user(zero, uname + uname_len - 1)) return -EFAULT; /* let user-space know about too short buffer */ ret = -ENOSPC; } } if (copy_to_user(uinfo, &info, info_copy) || put_user(info_copy, &uattr->info.info_len)) return -EFAULT; return ret; } int btf_get_fd_by_id(u32 id) { struct btf *btf; int fd; rcu_read_lock(); btf = idr_find(&btf_idr, id); if (!btf || !refcount_inc_not_zero(&btf->refcnt)) btf = ERR_PTR(-ENOENT); rcu_read_unlock(); if (IS_ERR(btf)) return PTR_ERR(btf); fd = __btf_new_fd(btf); if (fd < 0) btf_put(btf); return fd; } u32 btf_obj_id(const struct btf *btf) { return READ_ONCE(btf->id); } bool btf_is_kernel(const struct btf *btf) { return btf->kernel_btf; } bool btf_is_module(const struct btf *btf) { return btf->kernel_btf && strcmp(btf->name, "vmlinux") != 0; } enum { BTF_MODULE_F_LIVE = (1 << 0), }; #ifdef CONFIG_DEBUG_INFO_BTF_MODULES struct btf_module { struct list_head list; struct module *module; struct btf *btf; struct bin_attribute *sysfs_attr; int flags; }; static LIST_HEAD(btf_modules); static DEFINE_MUTEX(btf_module_mutex); static void purge_cand_cache(struct btf *btf); static int 
btf_module_notify(struct notifier_block *nb, unsigned long op, void *module) { struct btf_module *btf_mod, *tmp; struct module *mod = module; struct btf *btf; int err = 0; if (mod->btf_data_size == 0 || (op != MODULE_STATE_COMING && op != MODULE_STATE_LIVE && op != MODULE_STATE_GOING)) goto out; switch (op) { case MODULE_STATE_COMING: btf_mod = kzalloc_obj(*btf_mod); if (!btf_mod) { err = -ENOMEM; goto out; } btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size, mod->btf_base_data, mod->btf_base_data_size); if (IS_ERR(btf)) { kfree(btf_mod); if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) { pr_warn("failed to validate module [%s] BTF: %ld\n", mod->name, PTR_ERR(btf)); err = PTR_ERR(btf); } else { pr_warn_once("Kernel module BTF mismatch detected, BTF debug info may be unavailable for some modules\n"); } goto out; } err = btf_alloc_id(btf); if (err) { btf_free(btf); kfree(btf_mod); goto out; } purge_cand_cache(NULL); mutex_lock(&btf_module_mutex); btf_mod->module = module; btf_mod->btf = btf; list_add(&btf_mod->list, &btf_modules); mutex_unlock(&btf_module_mutex); if (IS_ENABLED(CONFIG_SYSFS)) { struct bin_attribute *attr; attr = kzalloc_obj(*attr); if (!attr) goto out; sysfs_bin_attr_init(attr); attr->attr.name = btf->name; attr->attr.mode = 0444; attr->size = btf->data_size; attr->private = btf->data; attr->read = sysfs_bin_attr_simple_read; err = sysfs_create_bin_file(btf_kobj, attr); if (err) { pr_warn("failed to register module [%s] BTF in sysfs: %d\n", mod->name, err); kfree(attr); err = 0; goto out; } btf_mod->sysfs_attr = attr; } break; case MODULE_STATE_LIVE: mutex_lock(&btf_module_mutex); list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { if (btf_mod->module != module) continue; btf_mod->flags |= BTF_MODULE_F_LIVE; break; } mutex_unlock(&btf_module_mutex); break; case MODULE_STATE_GOING: mutex_lock(&btf_module_mutex); list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { if (btf_mod->module != module) continue; /* * For modules, we do the freeing of BTF IDR as soon as * module goes away to disable BTF discovery, since the * btf_try_get_module() on such BTFs will fail. This may * be called again on btf_put(), but it's ok to do so. */ btf_free_id(btf_mod->btf); list_del(&btf_mod->list); if (btf_mod->sysfs_attr) sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr); purge_cand_cache(btf_mod->btf); btf_put(btf_mod->btf); kfree(btf_mod->sysfs_attr); kfree(btf_mod); break; } mutex_unlock(&btf_module_mutex); break; } out: return notifier_from_errno(err); } static struct notifier_block btf_module_nb = { .notifier_call = btf_module_notify, }; static int __init btf_module_init(void) { register_module_notifier(&btf_module_nb); return 0; } fs_initcall(btf_module_init); #endif /* CONFIG_DEBUG_INFO_BTF_MODULES */ struct module *btf_try_get_module(const struct btf *btf) { struct module *res = NULL; #ifdef CONFIG_DEBUG_INFO_BTF_MODULES struct btf_module *btf_mod, *tmp; mutex_lock(&btf_module_mutex); list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { if (btf_mod->btf != btf) continue; /* We must only consider module whose __init routine has * finished, hence we must check for BTF_MODULE_F_LIVE flag, * which is set from the notifier callback for * MODULE_STATE_LIVE. */ if ((btf_mod->flags & BTF_MODULE_F_LIVE) && try_module_get(btf_mod->module)) res = btf_mod->module; break; } mutex_unlock(&btf_module_mutex); #endif return res; } /* Returns struct btf corresponding to the struct module. * This function can return NULL or ERR_PTR. 
*/ static struct btf *btf_get_module_btf(const struct module *module) { #ifdef CONFIG_DEBUG_INFO_BTF_MODULES struct btf_module *btf_mod, *tmp; #endif struct btf *btf = NULL; if (!module) { btf = bpf_get_btf_vmlinux(); if (!IS_ERR_OR_NULL(btf)) btf_get(btf); return btf; } #ifdef CONFIG_DEBUG_INFO_BTF_MODULES mutex_lock(&btf_module_mutex); list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { if (btf_mod->module != module) continue; btf_get(btf_mod->btf); btf = btf_mod->btf; break; } mutex_unlock(&btf_module_mutex); #endif return btf; } static int check_btf_kconfigs(const struct module *module, const char *feature) { if (!module && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { pr_err("missing vmlinux BTF, cannot register %s\n", feature); return -ENOENT; } if (module && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) pr_warn("missing module BTF, cannot register %s\n", feature); return 0; } BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags) { struct btf *btf = NULL; int btf_obj_fd = 0; long ret; if (flags) return -EINVAL; if (name_sz <= 1 || name[name_sz - 1]) return -EINVAL; ret = bpf_find_btf_id(name, kind, &btf); if (ret > 0 && btf_is_module(btf)) { btf_obj_fd = __btf_new_fd(btf); if (btf_obj_fd < 0) { btf_put(btf); return btf_obj_fd; } return ret | (((u64)btf_obj_fd) << 32); } if (ret > 0) btf_put(btf); return ret; } const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = { .func = bpf_btf_find_by_name_kind, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE) #define BTF_TRACING_TYPE(name, type) BTF_ID(struct, type) BTF_TRACING_TYPE_xxx #undef BTF_TRACING_TYPE /* Validate well-formedness of iter argument type. * On success, return positive BTF ID of iter state's STRUCT type. * On error, negative error is returned. 
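 *
 * E.g. for the kfunc bpf_iter_num_new(struct bpf_iter_num *it, int start,
 * int end), arg_idx 0 resolves to "struct bpf_iter_num" and that struct's
 * BTF ID is returned; the ITER_PREFIX ("bpf_iter_") naming is mandatory.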
*/ int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx) { const struct btf_param *arg; const struct btf_type *t; const char *name; int btf_id; if (btf_type_vlen(func) <= arg_idx) return -EINVAL; arg = &btf_params(func)[arg_idx]; t = btf_type_skip_modifiers(btf, arg->type, NULL); if (!t || !btf_type_is_ptr(t)) return -EINVAL; t = btf_type_skip_modifiers(btf, t->type, &btf_id); if (!t || !__btf_type_is_struct(t)) return -EINVAL; name = btf_name_by_offset(btf, t->name_off); if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1)) return -EINVAL; return btf_id; } static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name, const struct btf_type *func, u32 func_flags) { u32 flags = func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY); const char *sfx, *iter_name; const struct btf_type *t; char exp_name[128]; u32 nr_args; int btf_id; /* exactly one of KF_ITER_{NEW,NEXT,DESTROY} can be set */ if (!flags || (flags & (flags - 1))) return -EINVAL; /* any BPF iter kfunc should have `struct bpf_iter_<type> *` first arg */ nr_args = btf_type_vlen(func); if (nr_args < 1) return -EINVAL; btf_id = btf_check_iter_arg(btf, func, 0); if (btf_id < 0) return btf_id; /* sizeof(struct bpf_iter_<type>) should be a multiple of 8 to * fit nicely in stack slots */ t = btf_type_by_id(btf, btf_id); if (t->size == 0 || (t->size % 8)) return -EINVAL; /* validate bpf_iter_<type>_{new,next,destroy}(struct bpf_iter_<type> *) * naming pattern */ iter_name = btf_name_by_offset(btf, t->name_off) + sizeof(ITER_PREFIX) - 1; if (flags & KF_ITER_NEW) sfx = "new"; else if (flags & KF_ITER_NEXT) sfx = "next"; else /* (flags & KF_ITER_DESTROY) */ sfx = "destroy"; snprintf(exp_name, sizeof(exp_name), "bpf_iter_%s_%s", iter_name, sfx); if (strcmp(func_name, exp_name)) return -EINVAL; /* only iter constructor should have extra arguments */ if (!(flags & KF_ITER_NEW) && nr_args != 1) return -EINVAL; if (flags & KF_ITER_NEXT) { /* bpf_iter_<type>_next() should return pointer */ t = btf_type_skip_modifiers(btf, func->type, NULL); if (!t || !btf_type_is_ptr(t)) return -EINVAL; } if (flags & KF_ITER_DESTROY) { /* bpf_iter_<type>_destroy() should return void */ t = btf_type_by_id(btf, func->type); if (!t || !btf_type_is_void(t)) return -EINVAL; } return 0; } static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags) { const struct btf_type *func; const char *func_name; int err; /* any kfunc should be FUNC -> FUNC_PROTO */ func = btf_type_by_id(btf, func_id); if (!func || !btf_type_is_func(func)) return -EINVAL; /* sanity check kfunc name */ func_name = btf_name_by_offset(btf, func->name_off); if (!func_name || !func_name[0]) return -EINVAL; func = btf_type_by_id(btf, func->type); if (!func || !btf_type_is_func_proto(func)) return -EINVAL; if (func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY)) { err = btf_check_iter_kfuncs(btf, func_name, func, func_flags); if (err) return err; } return 0; } /* Kernel Function (kfunc) BTF ID set registration API */ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, const struct btf_kfunc_id_set *kset) { struct btf_kfunc_hook_filter *hook_filter; struct btf_id_set8 *add_set = kset->set; bool vmlinux_set = !btf_is_module(btf); bool add_filter = !!kset->filter; struct btf_kfunc_set_tab *tab; struct btf_id_set8 *set; u32 set_cnt, i; int ret; if (hook >= BTF_KFUNC_HOOK_MAX) { ret = -EINVAL; goto end; } if (!add_set->cnt) return 0; tab = btf->kfunc_set_tab; if (tab && add_filter) { u32 i; hook_filter = 
&tab->hook_filters[hook]; for (i = 0; i < hook_filter->nr_filters; i++) { if (hook_filter->filters[i] == kset->filter) { add_filter = false; break; } } if (add_filter && hook_filter->nr_filters == BTF_KFUNC_FILTER_MAX_CNT) { ret = -E2BIG; goto end; } } if (!tab) { tab = kzalloc_obj(*tab, GFP_KERNEL | __GFP_NOWARN); if (!tab) return -ENOMEM; btf->kfunc_set_tab = tab; } set = tab->sets[hook]; /* Warn when register_btf_kfunc_id_set is called twice for the same hook * for module sets. */ if (WARN_ON_ONCE(set && !vmlinux_set)) { ret = -EINVAL; goto end; } /* In case of vmlinux sets, there may be more than one set being * registered per hook. To create a unified set, we allocate a new set * and concatenate all individual sets being registered. While each set * is individually sorted, they may become unsorted when concatenated, * hence re-sorting the final set again is required to make binary * searching the set using btf_id_set8_contains function work. * * For module sets, we need to allocate as we may need to relocate * BTF ids. */ set_cnt = set ? set->cnt : 0; if (set_cnt > U32_MAX - add_set->cnt) { ret = -EOVERFLOW; goto end; } if (set_cnt + add_set->cnt > BTF_KFUNC_SET_MAX_CNT) { ret = -E2BIG; goto end; } /* Grow set */ set = krealloc(tab->sets[hook], struct_size(set, pairs, set_cnt + add_set->cnt), GFP_KERNEL | __GFP_NOWARN); if (!set) { ret = -ENOMEM; goto end; } /* For newly allocated set, initialize set->cnt to 0 */ if (!tab->sets[hook]) set->cnt = 0; tab->sets[hook] = set; /* Concatenate the two sets */ memcpy(set->pairs + set->cnt, add_set->pairs, add_set->cnt * sizeof(set->pairs[0])); /* Now that the set is copied, update with relocated BTF ids */ for (i = set->cnt; i < set->cnt + add_set->cnt; i++) set->pairs[i].id = btf_relocate_id(btf, set->pairs[i].id); set->cnt += add_set->cnt; sort(set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func, NULL); if (add_filter) { hook_filter = &tab->hook_filters[hook]; hook_filter->filters[hook_filter->nr_filters++] = kset->filter; } return 0; end: btf_free_kfunc_set_tab(btf); return ret; } static u32 *btf_kfunc_id_set_contains(const struct btf *btf, enum btf_kfunc_hook hook, u32 kfunc_btf_id) { struct btf_id_set8 *set; u32 *id; if (hook >= BTF_KFUNC_HOOK_MAX) return NULL; if (!btf->kfunc_set_tab) return NULL; set = btf->kfunc_set_tab->sets[hook]; if (!set) return NULL; id = btf_id_set8_contains(set, kfunc_btf_id); if (!id) return NULL; /* The flags for BTF ID are located next to it */ return id + 1; } static bool __btf_kfunc_is_allowed(const struct btf *btf, enum btf_kfunc_hook hook, u32 kfunc_btf_id, const struct bpf_prog *prog) { struct btf_kfunc_hook_filter *hook_filter; int i; if (hook >= BTF_KFUNC_HOOK_MAX) return false; if (!btf->kfunc_set_tab) return false; hook_filter = &btf->kfunc_set_tab->hook_filters[hook]; for (i = 0; i < hook_filter->nr_filters; i++) { if (hook_filter->filters[i](prog, kfunc_btf_id)) return false; } return true; } static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) { switch (prog_type) { case BPF_PROG_TYPE_UNSPEC: return BTF_KFUNC_HOOK_COMMON; case BPF_PROG_TYPE_XDP: return BTF_KFUNC_HOOK_XDP; case BPF_PROG_TYPE_SCHED_CLS: return BTF_KFUNC_HOOK_TC; case BPF_PROG_TYPE_STRUCT_OPS: return BTF_KFUNC_HOOK_STRUCT_OPS; case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_LSM: return BTF_KFUNC_HOOK_TRACING; case BPF_PROG_TYPE_SYSCALL: return BTF_KFUNC_HOOK_SYSCALL; case BPF_PROG_TYPE_CGROUP_SKB: case 
BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: return BTF_KFUNC_HOOK_CGROUP; case BPF_PROG_TYPE_SCHED_ACT: return BTF_KFUNC_HOOK_SCHED_ACT; case BPF_PROG_TYPE_SK_SKB: return BTF_KFUNC_HOOK_SK_SKB; case BPF_PROG_TYPE_SOCKET_FILTER: return BTF_KFUNC_HOOK_SOCKET_FILTER; case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_LWT_SEG6LOCAL: return BTF_KFUNC_HOOK_LWT; case BPF_PROG_TYPE_NETFILTER: return BTF_KFUNC_HOOK_NETFILTER; case BPF_PROG_TYPE_KPROBE: return BTF_KFUNC_HOOK_KPROBE; default: return BTF_KFUNC_HOOK_MAX; } } bool btf_kfunc_is_allowed(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog) { enum bpf_prog_type prog_type = resolve_prog_type(prog); enum btf_kfunc_hook hook; u32 *kfunc_flags; kfunc_flags = btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id); if (kfunc_flags && __btf_kfunc_is_allowed(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id, prog)) return true; hook = bpf_prog_type_to_kfunc_hook(prog_type); kfunc_flags = btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id); if (kfunc_flags && __btf_kfunc_is_allowed(btf, hook, kfunc_btf_id, prog)) return true; return false; } /* Caution: * Reference to the module (obtained using btf_try_get_module) corresponding to * the struct btf *MUST* be held when calling this function from verifier * context. This is usually true as we stash references in prog's kfunc_btf_tab; * keeping the reference for the duration of the call provides the necessary * protection for looking up a well-formed btf->kfunc_set_tab. */ u32 *btf_kfunc_flags(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog) { enum bpf_prog_type prog_type = resolve_prog_type(prog); enum btf_kfunc_hook hook; u32 *kfunc_flags; kfunc_flags = btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id); if (kfunc_flags) return kfunc_flags; hook = bpf_prog_type_to_kfunc_hook(prog_type); return btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id); } u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog) { if (!__btf_kfunc_is_allowed(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id, prog)) return NULL; return btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id); } static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook, const struct btf_kfunc_id_set *kset) { struct btf *btf; int ret, i; btf = btf_get_module_btf(kset->owner); if (!btf) return check_btf_kconfigs(kset->owner, "kfunc"); if (IS_ERR(btf)) return PTR_ERR(btf); for (i = 0; i < kset->set->cnt; i++) { ret = btf_check_kfunc_protos(btf, btf_relocate_id(btf, kset->set->pairs[i].id), kset->set->pairs[i].flags); if (ret) goto err_out; } ret = btf_populate_kfunc_set(btf, hook, kset); err_out: btf_put(btf); return ret; } /* This function must be invoked only from initcalls/module init functions */ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, const struct btf_kfunc_id_set *kset) { enum btf_kfunc_hook hook; /* All kfuncs need to be tagged as such in BTF. * WARN() for initcall registrations that do not check errors. 
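 * In practice this means the id set should be built with the
 * BTF_KFUNCS_START()/BTF_KFUNCS_END() macros, which mark the resulting
 * struct btf_id_set8 with the BTF_SET8_KFUNCS flag checked below.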
*/ if (!(kset->set->flags & BTF_SET8_KFUNCS)) { WARN_ON(!kset->owner); return -EINVAL; } hook = bpf_prog_type_to_kfunc_hook(prog_type); return __register_btf_kfunc_id_set(hook, kset); } EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set); /* This function must be invoked only from initcalls/module init functions */ int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset) { return __register_btf_kfunc_id_set(BTF_KFUNC_HOOK_FMODRET, kset); } EXPORT_SYMBOL_GPL(register_btf_fmodret_id_set); s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id) { struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab; struct btf_id_dtor_kfunc *dtor; if (!tab) return -ENOENT; /* Even though the size of tab->dtors[0] is > sizeof(u32), we only need * to compare the first u32 with btf_id, so we can reuse btf_id_cmp_func. */ BUILD_BUG_ON(offsetof(struct btf_id_dtor_kfunc, btf_id) != 0); dtor = bsearch(&btf_id, tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func); if (!dtor) return -ENOENT; return dtor->kfunc_btf_id; } static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc *dtors, u32 cnt) { const struct btf_type *dtor_func, *dtor_func_proto, *t; const struct btf_param *args; s32 dtor_btf_id; u32 nr_args, i; for (i = 0; i < cnt; i++) { dtor_btf_id = btf_relocate_id(btf, dtors[i].kfunc_btf_id); dtor_func = btf_type_by_id(btf, dtor_btf_id); if (!dtor_func || !btf_type_is_func(dtor_func)) return -EINVAL; dtor_func_proto = btf_type_by_id(btf, dtor_func->type); if (!dtor_func_proto || !btf_type_is_func_proto(dtor_func_proto)) return -EINVAL; /* Make sure the prototype of the destructor kfunc is 'void func(type *)' */ t = btf_type_by_id(btf, dtor_func_proto->type); if (!t || !btf_type_is_void(t)) return -EINVAL; nr_args = btf_type_vlen(dtor_func_proto); if (nr_args != 1) return -EINVAL; args = btf_params(dtor_func_proto); t = btf_type_by_id(btf, args[0].type); /* Allow any pointer type, as width on targets Linux supports * will be same for all pointer types (i.e. sizeof(void *)) */ if (!t || !btf_type_is_ptr(t)) return -EINVAL; if (IS_ENABLED(CONFIG_CFI)) { /* Ensure the destructor kfunc type matches btf_dtor_kfunc_t */ t = btf_type_by_id(btf, t->type); if (!btf_type_is_void(t)) return -EINVAL; } } return 0; } /* This function must be invoked only from initcalls/module init functions */ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner) { struct btf_id_dtor_kfunc_tab *tab; struct btf *btf; u32 tab_cnt, i; int ret; btf = btf_get_module_btf(owner); if (!btf) return check_btf_kconfigs(owner, "dtor kfuncs"); if (IS_ERR(btf)) return PTR_ERR(btf); if (add_cnt >= BTF_DTOR_KFUNC_MAX_CNT) { pr_err("cannot register more than %d kfunc destructors\n", BTF_DTOR_KFUNC_MAX_CNT); ret = -E2BIG; goto end; } /* Ensure that the prototype of dtor kfuncs being registered is sane */ ret = btf_check_dtor_kfuncs(btf, dtors, add_cnt); if (ret < 0) goto end; tab = btf->dtor_kfunc_tab; /* Only one call allowed for modules */ if (WARN_ON_ONCE(tab && btf_is_module(btf))) { ret = -EINVAL; goto end; } tab_cnt = tab ? 
tab->cnt : 0; if (tab_cnt > U32_MAX - add_cnt) { ret = -EOVERFLOW; goto end; } if (tab_cnt + add_cnt >= BTF_DTOR_KFUNC_MAX_CNT) { pr_err("cannot register more than %d kfunc destructors\n", BTF_DTOR_KFUNC_MAX_CNT); ret = -E2BIG; goto end; } tab = krealloc(btf->dtor_kfunc_tab, struct_size(tab, dtors, tab_cnt + add_cnt), GFP_KERNEL | __GFP_NOWARN); if (!tab) { ret = -ENOMEM; goto end; } if (!btf->dtor_kfunc_tab) tab->cnt = 0; btf->dtor_kfunc_tab = tab; memcpy(tab->dtors + tab->cnt, dtors, add_cnt * sizeof(tab->dtors[0])); /* remap BTF ids based on BTF relocation (if any) */ for (i = tab_cnt; i < tab_cnt + add_cnt; i++) { tab->dtors[i].btf_id = btf_relocate_id(btf, tab->dtors[i].btf_id); tab->dtors[i].kfunc_btf_id = btf_relocate_id(btf, tab->dtors[i].kfunc_btf_id); } tab->cnt += add_cnt; sort(tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func, NULL); end: if (ret) btf_free_dtor_kfunc_tab(btf); btf_put(btf); return ret; } EXPORT_SYMBOL_GPL(register_btf_id_dtor_kfuncs); #define MAX_TYPES_ARE_COMPAT_DEPTH 2 /* Check local and target types for compatibility. This check is used for * type-based CO-RE relocations and follow slightly different rules than * field-based relocations. This function assumes that root types were already * checked for name match. Beyond that initial root-level name check, names * are completely ignored. Compatibility rules are as follows: * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs/ENUM64s are considered compatible, but * kind should match for local and target types (i.e., STRUCT is not * compatible with UNION); * - for ENUMs/ENUM64s, the size is ignored; * - for INT, size and signedness are ignored; * - for ARRAY, dimensionality is ignored, element types are checked for * compatibility recursively; * - CONST/VOLATILE/RESTRICT modifiers are ignored; * - TYPEDEFs/PTRs are compatible if types they pointing to are compatible; * - FUNC_PROTOs are compatible if they have compatible signature: same * number of input args and compatible return and argument types. * These rules are not set in stone and probably will be adjusted as we get * more experience with using BPF CO-RE relocations. 
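 * For example, under these rules a local 'struct foo' with fewer members
 * than the target 'struct foo' is still compatible (member layout is not
 * inspected here), and a local 'int[4]' is compatible with a target
 * 'long[16]', since INT size/signedness and ARRAY dimensionality are ignored.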
*/ int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id) { return __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id, MAX_TYPES_ARE_COMPAT_DEPTH); } #define MAX_TYPES_MATCH_DEPTH 2 int bpf_core_types_match(const struct btf *local_btf, u32 local_id, const struct btf *targ_btf, u32 targ_id) { return __bpf_core_types_match(local_btf, local_id, targ_btf, targ_id, false, MAX_TYPES_MATCH_DEPTH); } static bool bpf_core_is_flavor_sep(const char *s) { /* check X___Y name pattern, where X and Y are not underscores */ return s[0] != '_' && /* X */ s[1] == '_' && s[2] == '_' && s[3] == '_' && /* ___ */ s[4] != '_'; /* Y */ } size_t bpf_core_essential_name_len(const char *name) { size_t n = strlen(name); int i; for (i = n - 5; i >= 0; i--) { if (bpf_core_is_flavor_sep(name + i)) return i + 1; } return n; } static void bpf_free_cands(struct bpf_cand_cache *cands) { if (!cands->cnt) /* empty candidate array was allocated on stack */ return; kfree(cands); } static void bpf_free_cands_from_cache(struct bpf_cand_cache *cands) { kfree(cands->name); kfree(cands); } #define VMLINUX_CAND_CACHE_SIZE 31 static struct bpf_cand_cache *vmlinux_cand_cache[VMLINUX_CAND_CACHE_SIZE]; #define MODULE_CAND_CACHE_SIZE 31 static struct bpf_cand_cache *module_cand_cache[MODULE_CAND_CACHE_SIZE]; static void __print_cand_cache(struct bpf_verifier_log *log, struct bpf_cand_cache **cache, int cache_size) { struct bpf_cand_cache *cc; int i, j; for (i = 0; i < cache_size; i++) { cc = cache[i]; if (!cc) continue; bpf_log(log, "[%d]%s(", i, cc->name); for (j = 0; j < cc->cnt; j++) { bpf_log(log, "%d", cc->cands[j].id); if (j < cc->cnt - 1) bpf_log(log, " "); } bpf_log(log, "), "); } } static void print_cand_cache(struct bpf_verifier_log *log) { mutex_lock(&cand_cache_mutex); bpf_log(log, "vmlinux_cand_cache:"); __print_cand_cache(log, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE); bpf_log(log, "\nmodule_cand_cache:"); __print_cand_cache(log, module_cand_cache, MODULE_CAND_CACHE_SIZE); bpf_log(log, "\n"); mutex_unlock(&cand_cache_mutex); } static u32 hash_cands(struct bpf_cand_cache *cands) { return jhash(cands->name, cands->name_len, 0); } static struct bpf_cand_cache *check_cand_cache(struct bpf_cand_cache *cands, struct bpf_cand_cache **cache, int cache_size) { struct bpf_cand_cache *cc = cache[hash_cands(cands) % cache_size]; if (cc && cc->name_len == cands->name_len && !strncmp(cc->name, cands->name, cands->name_len)) return cc; return NULL; } static size_t sizeof_cands(int cnt) { return offsetof(struct bpf_cand_cache, cands[cnt]); } static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands, struct bpf_cand_cache **cache, int cache_size) { struct bpf_cand_cache **cc = &cache[hash_cands(cands) % cache_size], *new_cands; if (*cc) { bpf_free_cands_from_cache(*cc); *cc = NULL; } new_cands = kmemdup(cands, sizeof_cands(cands->cnt), GFP_KERNEL_ACCOUNT); if (!new_cands) { bpf_free_cands(cands); return ERR_PTR(-ENOMEM); } /* strdup the name, since it will stay in cache. * the cands->name points to strings in prog's BTF and the prog can be unloaded. 
*/ new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL_ACCOUNT); bpf_free_cands(cands); if (!new_cands->name) { kfree(new_cands); return ERR_PTR(-ENOMEM); } *cc = new_cands; return new_cands; } #ifdef CONFIG_DEBUG_INFO_BTF_MODULES static void __purge_cand_cache(struct btf *btf, struct bpf_cand_cache **cache, int cache_size) { struct bpf_cand_cache *cc; int i, j; for (i = 0; i < cache_size; i++) { cc = cache[i]; if (!cc) continue; if (!btf) { /* when new module is loaded purge all of module_cand_cache, * since new module might have candidates with the name * that matches cached cands. */ bpf_free_cands_from_cache(cc); cache[i] = NULL; continue; } /* when module is unloaded purge cache entries * that match module's btf */ for (j = 0; j < cc->cnt; j++) if (cc->cands[j].btf == btf) { bpf_free_cands_from_cache(cc); cache[i] = NULL; break; } } } static void purge_cand_cache(struct btf *btf) { mutex_lock(&cand_cache_mutex); __purge_cand_cache(btf, module_cand_cache, MODULE_CAND_CACHE_SIZE); mutex_unlock(&cand_cache_mutex); } #endif static struct bpf_cand_cache * bpf_core_add_cands(struct bpf_cand_cache *cands, const struct btf *targ_btf, int targ_start_id) { struct bpf_cand_cache *new_cands; const struct btf_type *t; const char *targ_name; size_t targ_essent_len; int n, i; n = btf_nr_types(targ_btf); for (i = targ_start_id; i < n; i++) { t = btf_type_by_id(targ_btf, i); if (btf_kind(t) != cands->kind) continue; targ_name = btf_name_by_offset(targ_btf, t->name_off); if (!targ_name) continue; /* the resched point is before strncmp to make sure that search * for non-existing name will have a chance to schedule(). */ cond_resched(); if (strncmp(cands->name, targ_name, cands->name_len) != 0) continue; targ_essent_len = bpf_core_essential_name_len(targ_name); if (targ_essent_len != cands->name_len) continue; /* most of the time there is only one candidate for a given kind+name pair */ new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL_ACCOUNT); if (!new_cands) { bpf_free_cands(cands); return ERR_PTR(-ENOMEM); } memcpy(new_cands, cands, sizeof_cands(cands->cnt)); bpf_free_cands(cands); cands = new_cands; cands->cands[cands->cnt].btf = targ_btf; cands->cands[cands->cnt].id = i; cands->cnt++; } return cands; } static struct bpf_cand_cache * bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id) { struct bpf_cand_cache *cands, *cc, local_cand = {}; const struct btf *local_btf = ctx->btf; const struct btf_type *local_type; const struct btf *main_btf; size_t local_essent_len; struct btf *mod_btf; const char *name; int id; main_btf = bpf_get_btf_vmlinux(); if (IS_ERR(main_btf)) return ERR_CAST(main_btf); if (!main_btf) return ERR_PTR(-EINVAL); local_type = btf_type_by_id(local_btf, local_type_id); if (!local_type) return ERR_PTR(-EINVAL); name = btf_name_by_offset(local_btf, local_type->name_off); if (str_is_empty(name)) return ERR_PTR(-EINVAL); local_essent_len = bpf_core_essential_name_len(name); cands = &local_cand; cands->name = name; cands->kind = btf_kind(local_type); cands->name_len = local_essent_len; cc = check_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE); /* cands is a pointer to stack here */ if (cc) { if (cc->cnt) return cc; goto check_modules; } /* Attempt to find target candidates in vmlinux BTF first */ cands = bpf_core_add_cands(cands, main_btf, btf_named_start_id(main_btf, true)); if (IS_ERR(cands)) return ERR_CAST(cands); /* cands is a pointer to kmalloced memory here if cands->cnt > 0 */ /* populate cache even when cands->cnt == 0 */ 
cc = populate_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE); if (IS_ERR(cc)) return ERR_CAST(cc); /* if vmlinux BTF has any candidate, don't go for module BTFs */ if (cc->cnt) return cc; check_modules: /* cands is a pointer to stack here and cands->cnt == 0 */ cc = check_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE); if (cc) /* if cache has it return it even if cc->cnt == 0 */ return cc; /* If candidate is not found in vmlinux's BTF then search in module's BTFs */ spin_lock_bh(&btf_idr_lock); idr_for_each_entry(&btf_idr, mod_btf, id) { if (!btf_is_module(mod_btf)) continue; /* linear search could be slow hence unlock/lock * the IDR to avoiding holding it for too long */ btf_get(mod_btf); spin_unlock_bh(&btf_idr_lock); cands = bpf_core_add_cands(cands, mod_btf, btf_named_start_id(mod_btf, true)); btf_put(mod_btf); if (IS_ERR(cands)) return ERR_CAST(cands); spin_lock_bh(&btf_idr_lock); } spin_unlock_bh(&btf_idr_lock); /* cands is a pointer to kmalloced memory here if cands->cnt > 0 * or pointer to stack if cands->cnd == 0. * Copy it into the cache even when cands->cnt == 0 and * return the result. */ return populate_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE); } int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, int relo_idx, void *insn) { bool need_cands = relo->kind != BPF_CORE_TYPE_ID_LOCAL; struct bpf_core_cand_list cands = {}; struct bpf_core_relo_res targ_res; struct bpf_core_spec *specs; const struct btf_type *type; int err; /* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5" * into arrays of btf_ids of struct fields and array indices. */ specs = kzalloc_objs(*specs, 3, GFP_KERNEL_ACCOUNT); if (!specs) return -ENOMEM; type = btf_type_by_id(ctx->btf, relo->type_id); if (!type) { bpf_log(ctx->log, "relo #%u: bad type id %u\n", relo_idx, relo->type_id); kfree(specs); return -EINVAL; } if (need_cands) { struct bpf_cand_cache *cc; int i; mutex_lock(&cand_cache_mutex); cc = bpf_core_find_cands(ctx, relo->type_id); if (IS_ERR(cc)) { bpf_log(ctx->log, "target candidate search failed for %d\n", relo->type_id); err = PTR_ERR(cc); goto out; } if (cc->cnt) { cands.cands = kzalloc_objs(*cands.cands, cc->cnt, GFP_KERNEL_ACCOUNT); if (!cands.cands) { err = -ENOMEM; goto out; } } for (i = 0; i < cc->cnt; i++) { bpf_log(ctx->log, "CO-RE relocating %s %s: found target candidate [%d]\n", btf_kind_str[cc->kind], cc->name, cc->cands[i].id); cands.cands[i].btf = cc->cands[i].btf; cands.cands[i].id = cc->cands[i].id; } cands.len = cc->cnt; /* cand_cache_mutex needs to span the cache lookup and * copy of btf pointer into bpf_core_cand_list, * since module can be unloaded while bpf_core_calc_relo_insn * is working with module's btf. 
*/ } err = bpf_core_calc_relo_insn((void *)ctx->log, relo, relo_idx, ctx->btf, &cands, specs, &targ_res); if (err) goto out; err = bpf_core_patch_insn((void *)ctx->log, insn, relo->insn_off / 8, relo, relo_idx, &targ_res); out: kfree(specs); if (need_cands) { kfree(cands.cands); mutex_unlock(&cand_cache_mutex); if (ctx->log->level & BPF_LOG_LEVEL2) print_cand_cache(ctx->log); } return err; } bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, const char *field_name, u32 btf_id, const char *suffix) { struct btf *btf = reg->btf; const struct btf_type *walk_type, *safe_type; const char *tname; char safe_tname[64]; long ret, safe_id; const struct btf_member *member; u32 i; walk_type = btf_type_by_id(btf, reg->btf_id); if (!walk_type) return false; tname = btf_name_by_offset(btf, walk_type->name_off); ret = snprintf(safe_tname, sizeof(safe_tname), "%s%s", tname, suffix); if (ret >= sizeof(safe_tname)) return false; safe_id = btf_find_by_name_kind(btf, safe_tname, BTF_INFO_KIND(walk_type->info)); if (safe_id < 0) return false; safe_type = btf_type_by_id(btf, safe_id); if (!safe_type) return false; for_each_member(i, safe_type, member) { const char *m_name = __btf_name_by_offset(btf, member->name_off); const struct btf_type *mtype = btf_type_by_id(btf, member->type); u32 id; if (!btf_type_is_ptr(mtype)) continue; btf_type_skip_modifiers(btf, mtype->type, &id); /* If we match on both type and name, the field is considered trusted. */ if (btf_id == id && !strcmp(field_name, m_name)) return true; } return false; } bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log, const struct btf *reg_btf, u32 reg_id, const struct btf *arg_btf, u32 arg_id) { const char *reg_name, *arg_name, *search_needle; const struct btf_type *reg_type, *arg_type; int reg_len, arg_len, cmp_len; size_t pattern_len = sizeof(NOCAST_ALIAS_SUFFIX) - sizeof(char); reg_type = btf_type_by_id(reg_btf, reg_id); if (!reg_type) return false; arg_type = btf_type_by_id(arg_btf, arg_id); if (!arg_type) return false; reg_name = btf_name_by_offset(reg_btf, reg_type->name_off); arg_name = btf_name_by_offset(arg_btf, arg_type->name_off); reg_len = strlen(reg_name); arg_len = strlen(arg_name); /* Exactly one of the two type names may be suffixed with ___init, so * if the strings are the same size, they can't possibly be no-cast * aliases of one another. If you have two of the same type names, e.g. * they're both nf_conn___init, it would be improper to return true * because they are _not_ no-cast aliases, they are the same type. */ if (reg_len == arg_len) return false; /* Either of the two names must be the other name, suffixed with ___init. 
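 * For example, "nf_conn" and "nf_conn___init" form such a pair.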
*/ if ((reg_len != arg_len + pattern_len) && (arg_len != reg_len + pattern_len)) return false; if (reg_len < arg_len) { search_needle = strstr(arg_name, NOCAST_ALIAS_SUFFIX); cmp_len = reg_len; } else { search_needle = strstr(reg_name, NOCAST_ALIAS_SUFFIX); cmp_len = arg_len; } if (!search_needle) return false; /* ___init suffix must come at the end of the name */ if (*(search_needle + pattern_len) != '\0') return false; return !strncmp(reg_name, arg_name, cmp_len); } #ifdef CONFIG_BPF_JIT static int btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops, struct bpf_verifier_log *log) { struct btf_struct_ops_tab *tab, *new_tab; int i, err; tab = btf->struct_ops_tab; if (!tab) { tab = kzalloc_flex(*tab, ops, 4); if (!tab) return -ENOMEM; tab->capacity = 4; btf->struct_ops_tab = tab; } for (i = 0; i < tab->cnt; i++) if (tab->ops[i].st_ops == st_ops) return -EEXIST; if (tab->cnt == tab->capacity) { new_tab = krealloc(tab, struct_size(tab, ops, tab->capacity * 2), GFP_KERNEL); if (!new_tab) return -ENOMEM; tab = new_tab; tab->capacity *= 2; btf->struct_ops_tab = tab; } tab->ops[btf->struct_ops_tab->cnt].st_ops = st_ops; err = bpf_struct_ops_desc_init(&tab->ops[btf->struct_ops_tab->cnt], btf, log); if (err) return err; btf->struct_ops_tab->cnt++; return 0; } const struct bpf_struct_ops_desc * bpf_struct_ops_find_value(struct btf *btf, u32 value_id) { const struct bpf_struct_ops_desc *st_ops_list; unsigned int i; u32 cnt; if (!value_id) return NULL; if (!btf->struct_ops_tab) return NULL; cnt = btf->struct_ops_tab->cnt; st_ops_list = btf->struct_ops_tab->ops; for (i = 0; i < cnt; i++) { if (st_ops_list[i].value_id == value_id) return &st_ops_list[i]; } return NULL; } const struct bpf_struct_ops_desc * bpf_struct_ops_find(struct btf *btf, u32 type_id) { const struct bpf_struct_ops_desc *st_ops_list; unsigned int i; u32 cnt; if (!type_id) return NULL; if (!btf->struct_ops_tab) return NULL; cnt = btf->struct_ops_tab->cnt; st_ops_list = btf->struct_ops_tab->ops; for (i = 0; i < cnt; i++) { if (st_ops_list[i].type_id == type_id) return &st_ops_list[i]; } return NULL; } int __register_bpf_struct_ops(struct bpf_struct_ops *st_ops) { struct bpf_verifier_log *log; struct btf *btf; int err = 0; btf = btf_get_module_btf(st_ops->owner); if (!btf) return check_btf_kconfigs(st_ops->owner, "struct_ops"); if (IS_ERR(btf)) return PTR_ERR(btf); log = kzalloc_obj(*log, GFP_KERNEL | __GFP_NOWARN); if (!log) { err = -ENOMEM; goto errout; } log->level = BPF_LOG_KERNEL; err = btf_add_struct_ops(btf, st_ops, log); errout: kfree(log); btf_put(btf); return err; } EXPORT_SYMBOL_GPL(__register_bpf_struct_ops); #endif bool btf_param_match_suffix(const struct btf *btf, const struct btf_param *arg, const char *suffix) { int suffix_len = strlen(suffix), len; const char *param_name; /* In the future, this can be ported to use BTF tagging */ param_name = btf_name_by_offset(btf, arg->name_off); if (str_is_empty(param_name)) return false; len = strlen(param_name); if (len <= suffix_len) return false; param_name += len - suffix_len; return !strncmp(param_name, suffix, suffix_len); } |
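/*
 * Illustrative sketch (not part of the file above): a minimal open-coded
 * iterator kfunc triple that would satisfy btf_check_iter_kfuncs(), plus its
 * registration.  "foo", struct bpf_iter_foo and the bodies below are
 * hypothetical; only the naming pattern, prototypes and registration macros
 * follow what the checks above require.
 */
struct bpf_iter_foo {
	u64 cursor;	/* sizeof() must be non-zero and a multiple of 8 */
};

__bpf_kfunc int bpf_iter_foo_new(struct bpf_iter_foo *it, u64 start)
{
	/* only the constructor may take arguments beyond the iterator */
	it->cursor = start;
	return 0;
}

__bpf_kfunc u64 *bpf_iter_foo_next(struct bpf_iter_foo *it)
{
	/* _next() must return a pointer; NULL terminates the loop */
	if (it->cursor >= 10)
		return NULL;
	it->cursor++;
	return &it->cursor;
}

__bpf_kfunc void bpf_iter_foo_destroy(struct bpf_iter_foo *it)
{
	/* _destroy() must return void and take only the iterator */
}

BTF_KFUNCS_START(foo_iter_kfunc_ids)
BTF_ID_FLAGS(func, bpf_iter_foo_new, KF_ITER_NEW)
BTF_ID_FLAGS(func, bpf_iter_foo_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_foo_destroy, KF_ITER_DESTROY)
BTF_KFUNCS_END(foo_iter_kfunc_ids)

static const struct btf_kfunc_id_set foo_iter_kfunc_set = {
	.owner	= THIS_MODULE,
	.set	= &foo_iter_kfunc_ids,
};

/* called from an initcall, as required by register_btf_kfunc_id_set() */
static int __init foo_iter_init(void)
{
	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
					 &foo_iter_kfunc_set);
}
late_initcall(foo_iter_init);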
/* SPDX-License-Identifier: GPL-2.0+ */ /* * NILFS local header file. * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * * Written by Koji Sato and Ryusuke Konishi. */ #ifndef _NILFS_H #define _NILFS_H #include <linux/kernel.h> #include <linux/buffer_head.h> #include <linux/spinlock.h> #include <linux/blkdev.h> #include <linux/fs_struct.h> #include <linux/nilfs2_api.h> #include <linux/nilfs2_ondisk.h> #include "the_nilfs.h" #include "bmap.h" /** * struct nilfs_inode_info - nilfs inode data in memory * @i_flags: inode flags * @i_type: inode type (combination of flags that indicate usage) * @i_state: dynamic state flags * @i_bmap: pointer to i_bmap_data * @i_bmap_data: raw block mapping * @i_xattr: <TODO> * @i_dir_start_lookup: page index of last successful search * @i_cno: checkpoint number for GC inode * @i_assoc_inode: associated inode (B-tree node cache holder or back pointer) * @i_dirty: list for connecting dirty files * @xattr_sem: semaphore for extended attributes processing * @i_bh: buffer contains disk inode * @i_root: root object of the current filesystem tree * @vfs_inode: VFS inode object */ struct nilfs_inode_info { __u32 i_flags; unsigned int i_type; unsigned long i_state; /* Dynamic state flags */ struct nilfs_bmap *i_bmap; struct nilfs_bmap i_bmap_data; __u64 i_xattr; /* sector_t ??? */ __u32 i_dir_start_lookup; __u64 i_cno; /* check point number for GC inode */ struct inode *i_assoc_inode; struct list_head i_dirty; /* List for connecting dirty files */ #ifdef CONFIG_NILFS_XATTR /* * Extended attributes can be read independently of the main file * data. Taking i_sem even when reading would cause contention * between readers of EAs and writers of regular file data, so * instead we synchronize on xattr_sem when reading or changing * EAs. */ struct rw_semaphore xattr_sem; #endif struct buffer_head *i_bh; /* * i_bh contains a new or dirty * disk inode.
*/ struct nilfs_root *i_root; struct inode vfs_inode; }; static inline struct nilfs_inode_info *NILFS_I(const struct inode *inode) { return container_of(inode, struct nilfs_inode_info, vfs_inode); } static inline struct nilfs_inode_info * NILFS_BMAP_I(const struct nilfs_bmap *bmap) { return container_of(bmap, struct nilfs_inode_info, i_bmap_data); } /* * Dynamic state flags of NILFS on-memory inode (i_state) */ enum { NILFS_I_NEW = 0, /* Inode is newly created */ NILFS_I_DIRTY, /* The file is dirty */ NILFS_I_QUEUED, /* inode is in dirty_files list */ NILFS_I_BUSY, /* * Inode is grabbed by a segment * constructor */ NILFS_I_COLLECTED, /* All dirty blocks are collected */ NILFS_I_UPDATED, /* The file has been written back */ NILFS_I_INODE_SYNC, /* dsync is not allowed for inode */ NILFS_I_BMAP, /* has bmap and btnode_cache */ }; /* * Flags to identify the usage of on-memory inodes (i_type) */ enum { NILFS_I_TYPE_NORMAL = 0, NILFS_I_TYPE_GC = 0x0001, /* For data caching during GC */ NILFS_I_TYPE_BTNC = 0x0002, /* For btree node cache */ NILFS_I_TYPE_SHADOW = 0x0004, /* For shadowed page cache */ }; /* * commit flags for nilfs_commit_super and nilfs_sync_super */ enum { NILFS_SB_COMMIT = 0, /* Commit a super block alternately */ NILFS_SB_COMMIT_ALL /* Commit both super blocks */ }; /** * define NILFS_MAX_VOLUME_NAME - maximum number of characters (bytes) in a * file system volume name * * Defined by the size of the volume name field in the on-disk superblocks. * This volume name does not include the terminating NULL byte if the string * length matches the field size, so use (NILFS_MAX_VOLUME_NAME + 1) for the * size of the buffer that requires a NULL byte termination. */ #define NILFS_MAX_VOLUME_NAME \ sizeof_field(struct nilfs_super_block, s_volume_name) /* * Macros to check inode numbers */ #define NILFS_MDT_INO_BITS \ (BIT(NILFS_DAT_INO) | BIT(NILFS_CPFILE_INO) | \ BIT(NILFS_SUFILE_INO) | BIT(NILFS_IFILE_INO) | \ BIT(NILFS_ATIME_INO) | BIT(NILFS_SKETCH_INO)) #define NILFS_SYS_INO_BITS (BIT(NILFS_ROOT_INO) | NILFS_MDT_INO_BITS) #define NILFS_FIRST_INO(sb) (((struct the_nilfs *)sb->s_fs_info)->ns_first_ino) #define NILFS_MDT_INODE(sb, ino) \ ((ino) < NILFS_USER_INO && (NILFS_MDT_INO_BITS & BIT(ino))) #define NILFS_VALID_INODE(sb, ino) \ ((ino) >= NILFS_FIRST_INO(sb) || \ ((ino) < NILFS_USER_INO && (NILFS_SYS_INO_BITS & BIT(ino)))) #define NILFS_PRIVATE_INODE(ino) ({ \ ino_t __ino = (ino); \ ((__ino) < NILFS_USER_INO && (__ino) != NILFS_ROOT_INO && \ (__ino) != NILFS_SKETCH_INO); }) /** * struct nilfs_transaction_info: context information for synchronization * @ti_magic: Magic number * @ti_save: Backup of journal_info field of task_struct * @ti_flags: Flags * @ti_count: Nest level */ struct nilfs_transaction_info { u32 ti_magic; void *ti_save; /* * This should never be used. If it happens, * one of other filesystems has a bug. */ unsigned short ti_flags; unsigned short ti_count; }; /* ti_magic */ #define NILFS_TI_MAGIC 0xd9e392fb /* ti_flags */ #define NILFS_TI_DYNAMIC_ALLOC 0x0001 /* Allocated from slab */ #define NILFS_TI_SYNC 0x0002 /* * Force to construct segment at the * end of transaction. 
*/ #define NILFS_TI_GC 0x0004 /* GC context */ #define NILFS_TI_COMMIT 0x0008 /* Change happened or not */ #define NILFS_TI_WRITER 0x0010 /* Constructor context */ int nilfs_transaction_begin(struct super_block *, struct nilfs_transaction_info *, int); int nilfs_transaction_commit(struct super_block *); void nilfs_transaction_abort(struct super_block *); static inline void nilfs_set_transaction_flag(unsigned int flag) { struct nilfs_transaction_info *ti = current->journal_info; ti->ti_flags |= flag; } static inline int nilfs_test_transaction_flag(unsigned int flag) { struct nilfs_transaction_info *ti = current->journal_info; if (ti == NULL || ti->ti_magic != NILFS_TI_MAGIC) return 0; return !!(ti->ti_flags & flag); } static inline int nilfs_doing_gc(void) { return nilfs_test_transaction_flag(NILFS_TI_GC); } static inline int nilfs_doing_construction(void) { return nilfs_test_transaction_flag(NILFS_TI_WRITER); } /* * function prototype */ #ifdef CONFIG_NILFS_POSIX_ACL #error "NILFS: not yet supported POSIX ACL" extern int nilfs_acl_chmod(struct inode *); extern int nilfs_init_acl(struct inode *, struct inode *); #else static inline int nilfs_acl_chmod(struct inode *inode) { return 0; } static inline int nilfs_init_acl(struct inode *inode, struct inode *dir) { if (S_ISLNK(inode->i_mode)) return 0; inode->i_mode &= ~current_umask(); return 0; } #endif #define NILFS_ATIME_DISABLE /* Flags that should be inherited by new inodes from their parent. */ #define NILFS_FL_INHERITED \ (FS_SECRM_FL | FS_UNRM_FL | FS_COMPR_FL | FS_SYNC_FL | \ FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL |\ FS_COMPRBLK_FL | FS_NOCOMP_FL | FS_NOTAIL_FL | FS_DIRSYNC_FL) /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 nilfs_mask_flags(umode_t mode, __u32 flags) { if (S_ISDIR(mode)) return flags; else if (S_ISREG(mode)) return flags & ~(FS_DIRSYNC_FL | FS_TOPDIR_FL); else return flags & (FS_NODUMP_FL | FS_NOATIME_FL); } /* dir.c */ int nilfs_add_link(struct dentry *, struct inode *); int nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr, ino_t *ino); int nilfs_make_empty(struct inode *, struct inode *); struct nilfs_dir_entry *nilfs_find_entry(struct inode *, const struct qstr *, struct folio **); int nilfs_delete_entry(struct nilfs_dir_entry *, struct folio *); int nilfs_empty_dir(struct inode *); struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct folio **); int nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, struct folio *folio, struct inode *inode); /* file.c */ extern int nilfs_sync_file(struct file *, loff_t, loff_t, int); /* ioctl.c */ int nilfs_fileattr_get(struct dentry *dentry, struct file_kattr *m); int nilfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct file_kattr *fa); long nilfs_ioctl(struct file *, unsigned int, unsigned long); long nilfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *, void **); /* inode.c */ void nilfs_inode_add_blocks(struct inode *inode, int n); void nilfs_inode_sub_blocks(struct inode *inode, int n); extern struct inode *nilfs_new_inode(struct inode *, umode_t); extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int); extern void nilfs_set_inode_flags(struct inode *); extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *); void nilfs_write_inode_common(struct inode *inode, struct nilfs_inode *raw_inode); struct inode 
*nilfs_ilookup(struct super_block *sb, struct nilfs_root *root, unsigned long ino); struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root, unsigned long ino); struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root, unsigned long ino); extern struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, __u64 cno); int nilfs_attach_btree_node_cache(struct inode *inode); void nilfs_detach_btree_node_cache(struct inode *inode); struct inode *nilfs_iget_for_shadow(struct inode *inode); extern void nilfs_update_inode(struct inode *, struct buffer_head *, int); extern void nilfs_truncate(struct inode *); extern void nilfs_evict_inode(struct inode *); extern int nilfs_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern void nilfs_write_failed(struct address_space *mapping, loff_t to); int nilfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh); extern int nilfs_inode_dirty(struct inode *); int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty); extern int __nilfs_mark_inode_dirty(struct inode *, int); extern void nilfs_dirty_inode(struct inode *, int flags); int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); static inline int nilfs_mark_inode_dirty(struct inode *inode) { return __nilfs_mark_inode_dirty(inode, I_DIRTY); } static inline int nilfs_mark_inode_dirty_sync(struct inode *inode) { return __nilfs_mark_inode_dirty(inode, I_DIRTY_SYNC); } /* super.c */ extern struct inode *nilfs_alloc_inode(struct super_block *); __printf(2, 3) void __nilfs_msg(struct super_block *sb, const char *fmt, ...); extern __printf(3, 4) void __nilfs_error(struct super_block *sb, const char *function, const char *fmt, ...); #ifdef CONFIG_PRINTK #define nilfs_msg(sb, level, fmt, ...) \ __nilfs_msg(sb, level fmt, ##__VA_ARGS__) #define nilfs_error(sb, fmt, ...) \ __nilfs_error(sb, __func__, fmt, ##__VA_ARGS__) #else #define nilfs_msg(sb, level, fmt, ...) \ do { \ no_printk(level fmt, ##__VA_ARGS__); \ (void)(sb); \ } while (0) #define nilfs_error(sb, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __nilfs_error(sb, "", " "); \ } while (0) #endif /* CONFIG_PRINTK */ #define nilfs_crit(sb, fmt, ...) \ nilfs_msg(sb, KERN_CRIT, fmt, ##__VA_ARGS__) #define nilfs_err(sb, fmt, ...) \ nilfs_msg(sb, KERN_ERR, fmt, ##__VA_ARGS__) #define nilfs_warn(sb, fmt, ...) \ nilfs_msg(sb, KERN_WARNING, fmt, ##__VA_ARGS__) #define nilfs_info(sb, fmt, ...) 
\ nilfs_msg(sb, KERN_INFO, fmt, ##__VA_ARGS__) extern struct nilfs_super_block * nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **); extern int nilfs_store_magic(struct super_block *sb, struct nilfs_super_block *sbp); extern int nilfs_check_feature_compatibility(struct super_block *, struct nilfs_super_block *); extern void nilfs_set_log_cursor(struct nilfs_super_block *, struct the_nilfs *); struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb, int flip); int nilfs_commit_super(struct super_block *sb, int flag); int nilfs_cleanup_super(struct super_block *sb); int nilfs_resize_fs(struct super_block *sb, __u64 newsize); int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt, struct nilfs_root **root); int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno); /* gcinode.c */ int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64, struct buffer_head **); int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64, struct buffer_head **); int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *); int nilfs_init_gcinode(struct inode *inode); void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs); /* sysfs.c */ int __init nilfs_sysfs_init(void); void nilfs_sysfs_exit(void); int nilfs_sysfs_create_device_group(struct super_block *); void nilfs_sysfs_delete_device_group(struct the_nilfs *); int nilfs_sysfs_create_snapshot_group(struct nilfs_root *); void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *); /* * Inodes and files operations */ extern const struct file_operations nilfs_dir_operations; extern const struct inode_operations nilfs_file_inode_operations; extern const struct file_operations nilfs_file_operations; extern const struct address_space_operations nilfs_aops; extern const struct address_space_operations nilfs_buffer_cache_aops; extern const struct inode_operations nilfs_dir_inode_operations; extern const struct inode_operations nilfs_special_inode_operations; extern const struct inode_operations nilfs_symlink_inode_operations; /* * filesystem type */ extern struct file_system_type nilfs_fs_type; #endif /* _NILFS_H */ |
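/*
 * Illustrative sketch (not part of nilfs.h above): how a typical NILFS
 * operation brackets its updates with the transaction helpers declared
 * above.  nilfs_do_something() and its body are made up; the third
 * argument of nilfs_transaction_begin() is assumed to be the vacancy
 * check flag used by existing callers.
 */
static int nilfs_do_something(struct inode *inode)
{
	struct nilfs_transaction_info ti;
	int err;

	/* ti lives on the stack; begin() stashes it in current->journal_info */
	err = nilfs_transaction_begin(inode->i_sb, &ti, 1);
	if (err)
		return err;

	err = nilfs_mark_inode_dirty(inode);	/* any dirtying work goes here */
	if (unlikely(err)) {
		nilfs_transaction_abort(inode->i_sb);
		return err;
	}

	/* commit may trigger segment construction if NILFS_TI_SYNC is set */
	return nilfs_transaction_commit(inode->i_sb);
}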
// SPDX-License-Identifier: GPL-2.0

// Generated by scripts/atomic/gen-atomic-fallback.sh
// DO NOT MODIFY THIS FILE DIRECTLY

#ifndef _LINUX_ATOMIC_FALLBACK_H
#define _LINUX_ATOMIC_FALLBACK_H

#include <linux/compiler.h>

#if defined(arch_xchg)
#define raw_xchg arch_xchg
#elif defined(arch_xchg_relaxed)
#define raw_xchg(...) \
	__atomic_op_fence(arch_xchg, __VA_ARGS__)
#else
extern void raw_xchg_not_implemented(void);
#define raw_xchg(...) raw_xchg_not_implemented()
#endif

#if defined(arch_xchg_acquire)
#define raw_xchg_acquire arch_xchg_acquire
#elif defined(arch_xchg_relaxed)
#define raw_xchg_acquire(...) \
	__atomic_op_acquire(arch_xchg, __VA_ARGS__)
#elif defined(arch_xchg)
#define raw_xchg_acquire arch_xchg
#else
extern void raw_xchg_acquire_not_implemented(void);
#define raw_xchg_acquire(...) raw_xchg_acquire_not_implemented()
#endif

#if defined(arch_xchg_release)
#define raw_xchg_release arch_xchg_release
#elif defined(arch_xchg_relaxed)
#define raw_xchg_release(...) \
	__atomic_op_release(arch_xchg, __VA_ARGS__)
#elif defined(arch_xchg)
#define raw_xchg_release arch_xchg
#else
extern void raw_xchg_release_not_implemented(void);
#define raw_xchg_release(...) raw_xchg_release_not_implemented()
#endif

#if defined(arch_xchg_relaxed)
#define raw_xchg_relaxed arch_xchg_relaxed
#elif defined(arch_xchg)
#define raw_xchg_relaxed arch_xchg
#else
extern void raw_xchg_relaxed_not_implemented(void);
#define raw_xchg_relaxed(...) raw_xchg_relaxed_not_implemented()
#endif

#if defined(arch_cmpxchg)
#define raw_cmpxchg arch_cmpxchg
#elif defined(arch_cmpxchg_relaxed)
#define raw_cmpxchg(...) \
	__atomic_op_fence(arch_cmpxchg, __VA_ARGS__)
#else
extern void raw_cmpxchg_not_implemented(void);
#define raw_cmpxchg(...) raw_cmpxchg_not_implemented()
#endif

#if defined(arch_cmpxchg_acquire)
#define raw_cmpxchg_acquire arch_cmpxchg_acquire
#elif defined(arch_cmpxchg_relaxed)
#define raw_cmpxchg_acquire(...) \
	__atomic_op_acquire(arch_cmpxchg, __VA_ARGS__)
#elif defined(arch_cmpxchg)
#define raw_cmpxchg_acquire arch_cmpxchg
#else
extern void raw_cmpxchg_acquire_not_implemented(void);
#define raw_cmpxchg_acquire(...) raw_cmpxchg_acquire_not_implemented()
#endif

#if defined(arch_cmpxchg_release)
#define raw_cmpxchg_release arch_cmpxchg_release
#elif defined(arch_cmpxchg_relaxed)
#define raw_cmpxchg_release(...)
\ __atomic_op_release(arch_cmpxchg, __VA_ARGS__) #elif defined(arch_cmpxchg) #define raw_cmpxchg_release arch_cmpxchg #else extern void raw_cmpxchg_release_not_implemented(void); #define raw_cmpxchg_release(...) raw_cmpxchg_release_not_implemented() #endif #if defined(arch_cmpxchg_relaxed) #define raw_cmpxchg_relaxed arch_cmpxchg_relaxed #elif defined(arch_cmpxchg) #define raw_cmpxchg_relaxed arch_cmpxchg #else extern void raw_cmpxchg_relaxed_not_implemented(void); #define raw_cmpxchg_relaxed(...) raw_cmpxchg_relaxed_not_implemented() #endif #if defined(arch_cmpxchg64) #define raw_cmpxchg64 arch_cmpxchg64 #elif defined(arch_cmpxchg64_relaxed) #define raw_cmpxchg64(...) \ __atomic_op_fence(arch_cmpxchg64, __VA_ARGS__) #else extern void raw_cmpxchg64_not_implemented(void); #define raw_cmpxchg64(...) raw_cmpxchg64_not_implemented() #endif #if defined(arch_cmpxchg64_acquire) #define raw_cmpxchg64_acquire arch_cmpxchg64_acquire #elif defined(arch_cmpxchg64_relaxed) #define raw_cmpxchg64_acquire(...) \ __atomic_op_acquire(arch_cmpxchg64, __VA_ARGS__) #elif defined(arch_cmpxchg64) #define raw_cmpxchg64_acquire arch_cmpxchg64 #else extern void raw_cmpxchg64_acquire_not_implemented(void); #define raw_cmpxchg64_acquire(...) raw_cmpxchg64_acquire_not_implemented() #endif #if defined(arch_cmpxchg64_release) #define raw_cmpxchg64_release arch_cmpxchg64_release #elif defined(arch_cmpxchg64_relaxed) #define raw_cmpxchg64_release(...) \ __atomic_op_release(arch_cmpxchg64, __VA_ARGS__) #elif defined(arch_cmpxchg64) #define raw_cmpxchg64_release arch_cmpxchg64 #else extern void raw_cmpxchg64_release_not_implemented(void); #define raw_cmpxchg64_release(...) raw_cmpxchg64_release_not_implemented() #endif #if defined(arch_cmpxchg64_relaxed) #define raw_cmpxchg64_relaxed arch_cmpxchg64_relaxed #elif defined(arch_cmpxchg64) #define raw_cmpxchg64_relaxed arch_cmpxchg64 #else extern void raw_cmpxchg64_relaxed_not_implemented(void); #define raw_cmpxchg64_relaxed(...) raw_cmpxchg64_relaxed_not_implemented() #endif #if defined(arch_cmpxchg128) #define raw_cmpxchg128 arch_cmpxchg128 #elif defined(arch_cmpxchg128_relaxed) #define raw_cmpxchg128(...) \ __atomic_op_fence(arch_cmpxchg128, __VA_ARGS__) #else extern void raw_cmpxchg128_not_implemented(void); #define raw_cmpxchg128(...) raw_cmpxchg128_not_implemented() #endif #if defined(arch_cmpxchg128_acquire) #define raw_cmpxchg128_acquire arch_cmpxchg128_acquire #elif defined(arch_cmpxchg128_relaxed) #define raw_cmpxchg128_acquire(...) \ __atomic_op_acquire(arch_cmpxchg128, __VA_ARGS__) #elif defined(arch_cmpxchg128) #define raw_cmpxchg128_acquire arch_cmpxchg128 #else extern void raw_cmpxchg128_acquire_not_implemented(void); #define raw_cmpxchg128_acquire(...) raw_cmpxchg128_acquire_not_implemented() #endif #if defined(arch_cmpxchg128_release) #define raw_cmpxchg128_release arch_cmpxchg128_release #elif defined(arch_cmpxchg128_relaxed) #define raw_cmpxchg128_release(...) \ __atomic_op_release(arch_cmpxchg128, __VA_ARGS__) #elif defined(arch_cmpxchg128) #define raw_cmpxchg128_release arch_cmpxchg128 #else extern void raw_cmpxchg128_release_not_implemented(void); #define raw_cmpxchg128_release(...) raw_cmpxchg128_release_not_implemented() #endif #if defined(arch_cmpxchg128_relaxed) #define raw_cmpxchg128_relaxed arch_cmpxchg128_relaxed #elif defined(arch_cmpxchg128) #define raw_cmpxchg128_relaxed arch_cmpxchg128 #else extern void raw_cmpxchg128_relaxed_not_implemented(void); #define raw_cmpxchg128_relaxed(...) 
raw_cmpxchg128_relaxed_not_implemented() #endif #if defined(arch_try_cmpxchg) #define raw_try_cmpxchg arch_try_cmpxchg #elif defined(arch_try_cmpxchg_relaxed) #define raw_try_cmpxchg(...) \ __atomic_op_fence(arch_try_cmpxchg, __VA_ARGS__) #else #define raw_try_cmpxchg(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg_acquire) #define raw_try_cmpxchg_acquire arch_try_cmpxchg_acquire #elif defined(arch_try_cmpxchg_relaxed) #define raw_try_cmpxchg_acquire(...) \ __atomic_op_acquire(arch_try_cmpxchg, __VA_ARGS__) #elif defined(arch_try_cmpxchg) #define raw_try_cmpxchg_acquire arch_try_cmpxchg #else #define raw_try_cmpxchg_acquire(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg_acquire((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg_release) #define raw_try_cmpxchg_release arch_try_cmpxchg_release #elif defined(arch_try_cmpxchg_relaxed) #define raw_try_cmpxchg_release(...) \ __atomic_op_release(arch_try_cmpxchg, __VA_ARGS__) #elif defined(arch_try_cmpxchg) #define raw_try_cmpxchg_release arch_try_cmpxchg #else #define raw_try_cmpxchg_release(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg_release((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg_relaxed) #define raw_try_cmpxchg_relaxed arch_try_cmpxchg_relaxed #elif defined(arch_try_cmpxchg) #define raw_try_cmpxchg_relaxed arch_try_cmpxchg #else #define raw_try_cmpxchg_relaxed(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg_relaxed((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg64) #define raw_try_cmpxchg64 arch_try_cmpxchg64 #elif defined(arch_try_cmpxchg64_relaxed) #define raw_try_cmpxchg64(...) \ __atomic_op_fence(arch_try_cmpxchg64, __VA_ARGS__) #else #define raw_try_cmpxchg64(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg64((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg64_acquire) #define raw_try_cmpxchg64_acquire arch_try_cmpxchg64_acquire #elif defined(arch_try_cmpxchg64_relaxed) #define raw_try_cmpxchg64_acquire(...) \ __atomic_op_acquire(arch_try_cmpxchg64, __VA_ARGS__) #elif defined(arch_try_cmpxchg64) #define raw_try_cmpxchg64_acquire arch_try_cmpxchg64 #else #define raw_try_cmpxchg64_acquire(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg64_acquire((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg64_release) #define raw_try_cmpxchg64_release arch_try_cmpxchg64_release #elif defined(arch_try_cmpxchg64_relaxed) #define raw_try_cmpxchg64_release(...) 
\ __atomic_op_release(arch_try_cmpxchg64, __VA_ARGS__) #elif defined(arch_try_cmpxchg64) #define raw_try_cmpxchg64_release arch_try_cmpxchg64 #else #define raw_try_cmpxchg64_release(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg64_release((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg64_relaxed) #define raw_try_cmpxchg64_relaxed arch_try_cmpxchg64_relaxed #elif defined(arch_try_cmpxchg64) #define raw_try_cmpxchg64_relaxed arch_try_cmpxchg64 #else #define raw_try_cmpxchg64_relaxed(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg64_relaxed((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg128) #define raw_try_cmpxchg128 arch_try_cmpxchg128 #elif defined(arch_try_cmpxchg128_relaxed) #define raw_try_cmpxchg128(...) \ __atomic_op_fence(arch_try_cmpxchg128, __VA_ARGS__) #else #define raw_try_cmpxchg128(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg128((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg128_acquire) #define raw_try_cmpxchg128_acquire arch_try_cmpxchg128_acquire #elif defined(arch_try_cmpxchg128_relaxed) #define raw_try_cmpxchg128_acquire(...) \ __atomic_op_acquire(arch_try_cmpxchg128, __VA_ARGS__) #elif defined(arch_try_cmpxchg128) #define raw_try_cmpxchg128_acquire arch_try_cmpxchg128 #else #define raw_try_cmpxchg128_acquire(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg128_acquire((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg128_release) #define raw_try_cmpxchg128_release arch_try_cmpxchg128_release #elif defined(arch_try_cmpxchg128_relaxed) #define raw_try_cmpxchg128_release(...) 
\ __atomic_op_release(arch_try_cmpxchg128, __VA_ARGS__) #elif defined(arch_try_cmpxchg128) #define raw_try_cmpxchg128_release arch_try_cmpxchg128 #else #define raw_try_cmpxchg128_release(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg128_release((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #if defined(arch_try_cmpxchg128_relaxed) #define raw_try_cmpxchg128_relaxed arch_try_cmpxchg128_relaxed #elif defined(arch_try_cmpxchg128) #define raw_try_cmpxchg128_relaxed arch_try_cmpxchg128 #else #define raw_try_cmpxchg128_relaxed(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg128_relaxed((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #define raw_cmpxchg_local arch_cmpxchg_local #ifdef arch_try_cmpxchg_local #define raw_try_cmpxchg_local arch_try_cmpxchg_local #else #define raw_try_cmpxchg_local(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg_local((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #define raw_cmpxchg64_local arch_cmpxchg64_local #ifdef arch_try_cmpxchg64_local #define raw_try_cmpxchg64_local arch_try_cmpxchg64_local #else #define raw_try_cmpxchg64_local(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg64_local((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #define raw_cmpxchg128_local arch_cmpxchg128_local #ifdef arch_try_cmpxchg128_local #define raw_try_cmpxchg128_local arch_try_cmpxchg128_local #else #define raw_try_cmpxchg128_local(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_cmpxchg128_local((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif #define raw_sync_cmpxchg arch_sync_cmpxchg #ifdef arch_sync_try_cmpxchg #define raw_sync_try_cmpxchg arch_sync_try_cmpxchg #else #define raw_sync_try_cmpxchg(_ptr, _oldp, _new) \ ({ \ typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ ___r = raw_sync_cmpxchg((_ptr), ___o, (_new)); \ if (unlikely(___r != ___o)) \ *___op = ___r; \ likely(___r == ___o); \ }) #endif /** * raw_atomic_read() - atomic load with relaxed ordering * @v: pointer to atomic_t * * Atomically loads the value of @v with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_read() elsewhere. * * Return: The value loaded from @v. */ static __always_inline int raw_atomic_read(const atomic_t *v) { return arch_atomic_read(v); } /** * raw_atomic_read_acquire() - atomic load with acquire ordering * @v: pointer to atomic_t * * Atomically loads the value of @v with acquire ordering. * * Safe to use in noinstr code; prefer atomic_read_acquire() elsewhere. * * Return: The value loaded from @v. */ static __always_inline int raw_atomic_read_acquire(const atomic_t *v) { #if defined(arch_atomic_read_acquire) return arch_atomic_read_acquire(v); #else int ret; if (__native_word(atomic_t)) { ret = smp_load_acquire(&(v)->counter); } else { ret = raw_atomic_read(v); __atomic_acquire_fence(); } return ret; #endif } /** * raw_atomic_set() - atomic set with relaxed ordering * @v: pointer to atomic_t * @i: int value to assign * * Atomically sets @v to @i with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_set() elsewhere. 
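 *
 * Example (an illustrative sketch, not part of the generated file; ordinary
 * code would use atomic_set()): a relaxed set is typically used to
 * initialise a counter before the object is visible to other CPUs.
 * "obj" and its "users" field are hypothetical.
 *
 *	obj = kmalloc(sizeof(*obj), GFP_KERNEL);
 *	raw_atomic_set(&obj->users, 1);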
* * Return: Nothing. */ static __always_inline void raw_atomic_set(atomic_t *v, int i) { arch_atomic_set(v, i); } /** * raw_atomic_set_release() - atomic set with release ordering * @v: pointer to atomic_t * @i: int value to assign * * Atomically sets @v to @i with release ordering. * * Safe to use in noinstr code; prefer atomic_set_release() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_set_release(atomic_t *v, int i) { #if defined(arch_atomic_set_release) arch_atomic_set_release(v, i); #else if (__native_word(atomic_t)) { smp_store_release(&(v)->counter, i); } else { __atomic_release_fence(); raw_atomic_set(v, i); } #endif } /** * raw_atomic_add() - atomic add with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_add() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_add(int i, atomic_t *v) { arch_atomic_add(i, v); } /** * raw_atomic_add_return() - atomic add with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_add_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_add_return(int i, atomic_t *v) { #if defined(arch_atomic_add_return) return arch_atomic_add_return(i, v); #elif defined(arch_atomic_add_return_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_add_return_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_add_return" #endif } /** * raw_atomic_add_return_acquire() - atomic add with acquire ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_add_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_add_return_acquire(int i, atomic_t *v) { #if defined(arch_atomic_add_return_acquire) return arch_atomic_add_return_acquire(i, v); #elif defined(arch_atomic_add_return_relaxed) int ret = arch_atomic_add_return_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_add_return) return arch_atomic_add_return(i, v); #else #error "Unable to define raw_atomic_add_return_acquire" #endif } /** * raw_atomic_add_return_release() - atomic add with release ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_add_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_add_return_release(int i, atomic_t *v) { #if defined(arch_atomic_add_return_release) return arch_atomic_add_return_release(i, v); #elif defined(arch_atomic_add_return_relaxed) __atomic_release_fence(); return arch_atomic_add_return_relaxed(i, v); #elif defined(arch_atomic_add_return) return arch_atomic_add_return(i, v); #else #error "Unable to define raw_atomic_add_return_release" #endif } /** * raw_atomic_add_return_relaxed() - atomic add with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_add_return_relaxed() elsewhere. * * Return: The updated value of @v. 
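 *
 * Example (an illustrative sketch, not part of the generated file): a caller
 * that needs the updated total but requires no ordering, such as a
 * statistics counter; "queued_bytes" and "len" are hypothetical.
 *
 *	int total = raw_atomic_add_return_relaxed(len, &queued_bytes);
 *
 * The returned total could then be compared against a high-water mark.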
*/ static __always_inline int raw_atomic_add_return_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_add_return_relaxed) return arch_atomic_add_return_relaxed(i, v); #elif defined(arch_atomic_add_return) return arch_atomic_add_return(i, v); #else #error "Unable to define raw_atomic_add_return_relaxed" #endif } /** * raw_atomic_fetch_add() - atomic add with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_add() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_add(int i, atomic_t *v) { #if defined(arch_atomic_fetch_add) return arch_atomic_fetch_add(i, v); #elif defined(arch_atomic_fetch_add_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_add_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_fetch_add" #endif } /** * raw_atomic_fetch_add_acquire() - atomic add with acquire ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_add_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_add_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_add_acquire) return arch_atomic_fetch_add_acquire(i, v); #elif defined(arch_atomic_fetch_add_relaxed) int ret = arch_atomic_fetch_add_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_add) return arch_atomic_fetch_add(i, v); #else #error "Unable to define raw_atomic_fetch_add_acquire" #endif } /** * raw_atomic_fetch_add_release() - atomic add with release ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_add_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_add_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_add_release) return arch_atomic_fetch_add_release(i, v); #elif defined(arch_atomic_fetch_add_relaxed) __atomic_release_fence(); return arch_atomic_fetch_add_relaxed(i, v); #elif defined(arch_atomic_fetch_add) return arch_atomic_fetch_add(i, v); #else #error "Unable to define raw_atomic_fetch_add_release" #endif } /** * raw_atomic_fetch_add_relaxed() - atomic add with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_add_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_add_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_add_relaxed) return arch_atomic_fetch_add_relaxed(i, v); #elif defined(arch_atomic_fetch_add) return arch_atomic_fetch_add(i, v); #else #error "Unable to define raw_atomic_fetch_add_relaxed" #endif } /** * raw_atomic_sub() - atomic subtract with relaxed ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_sub() elsewhere. * * Return: Nothing. 
*/ static __always_inline void raw_atomic_sub(int i, atomic_t *v) { arch_atomic_sub(i, v); } /** * raw_atomic_sub_return() - atomic subtract with full ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_sub_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_sub_return(int i, atomic_t *v) { #if defined(arch_atomic_sub_return) return arch_atomic_sub_return(i, v); #elif defined(arch_atomic_sub_return_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_sub_return_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_sub_return" #endif } /** * raw_atomic_sub_return_acquire() - atomic subtract with acquire ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_sub_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_sub_return_acquire(int i, atomic_t *v) { #if defined(arch_atomic_sub_return_acquire) return arch_atomic_sub_return_acquire(i, v); #elif defined(arch_atomic_sub_return_relaxed) int ret = arch_atomic_sub_return_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_sub_return) return arch_atomic_sub_return(i, v); #else #error "Unable to define raw_atomic_sub_return_acquire" #endif } /** * raw_atomic_sub_return_release() - atomic subtract with release ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_sub_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_sub_return_release(int i, atomic_t *v) { #if defined(arch_atomic_sub_return_release) return arch_atomic_sub_return_release(i, v); #elif defined(arch_atomic_sub_return_relaxed) __atomic_release_fence(); return arch_atomic_sub_return_relaxed(i, v); #elif defined(arch_atomic_sub_return) return arch_atomic_sub_return(i, v); #else #error "Unable to define raw_atomic_sub_return_release" #endif } /** * raw_atomic_sub_return_relaxed() - atomic subtract with relaxed ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_sub_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_sub_return_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_sub_return_relaxed) return arch_atomic_sub_return_relaxed(i, v); #elif defined(arch_atomic_sub_return) return arch_atomic_sub_return(i, v); #else #error "Unable to define raw_atomic_sub_return_relaxed" #endif } /** * raw_atomic_fetch_sub() - atomic subtract with full ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_sub() elsewhere. * * Return: The original value of @v. 
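 *
 * Example (an illustrative sketch, not part of the generated file): the
 * returned value is the counter as it was before the subtraction, which
 * lets the caller detect underflow; "active" is hypothetical.
 *
 *	int before = raw_atomic_fetch_sub(1, &active);
 *	WARN_ON(before <= 0);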
*/ static __always_inline int raw_atomic_fetch_sub(int i, atomic_t *v) { #if defined(arch_atomic_fetch_sub) return arch_atomic_fetch_sub(i, v); #elif defined(arch_atomic_fetch_sub_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_sub_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_fetch_sub" #endif } /** * raw_atomic_fetch_sub_acquire() - atomic subtract with acquire ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_sub_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_sub_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_sub_acquire) return arch_atomic_fetch_sub_acquire(i, v); #elif defined(arch_atomic_fetch_sub_relaxed) int ret = arch_atomic_fetch_sub_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_sub) return arch_atomic_fetch_sub(i, v); #else #error "Unable to define raw_atomic_fetch_sub_acquire" #endif } /** * raw_atomic_fetch_sub_release() - atomic subtract with release ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_sub_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_sub_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_sub_release) return arch_atomic_fetch_sub_release(i, v); #elif defined(arch_atomic_fetch_sub_relaxed) __atomic_release_fence(); return arch_atomic_fetch_sub_relaxed(i, v); #elif defined(arch_atomic_fetch_sub) return arch_atomic_fetch_sub(i, v); #else #error "Unable to define raw_atomic_fetch_sub_release" #endif } /** * raw_atomic_fetch_sub_relaxed() - atomic subtract with relaxed ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_sub_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_sub_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_sub_relaxed) return arch_atomic_fetch_sub_relaxed(i, v); #elif defined(arch_atomic_fetch_sub) return arch_atomic_fetch_sub(i, v); #else #error "Unable to define raw_atomic_fetch_sub_relaxed" #endif } /** * raw_atomic_inc() - atomic increment with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_inc() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_inc(atomic_t *v) { #if defined(arch_atomic_inc) arch_atomic_inc(v); #else raw_atomic_add(1, v); #endif } /** * raw_atomic_inc_return() - atomic increment with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_inc_return() elsewhere. * * Return: The updated value of @v. 
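 *
 * Example (an illustrative sketch, not part of the generated file):
 * handing out a monotonically increasing id; "next_id" is a hypothetical
 * atomic_t.
 *
 *	int id = raw_atomic_inc_return(&next_id);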
*/ static __always_inline int raw_atomic_inc_return(atomic_t *v) { #if defined(arch_atomic_inc_return) return arch_atomic_inc_return(v); #elif defined(arch_atomic_inc_return_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_inc_return_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic_add_return(1, v); #endif } /** * raw_atomic_inc_return_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_inc_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_inc_return_acquire(atomic_t *v) { #if defined(arch_atomic_inc_return_acquire) return arch_atomic_inc_return_acquire(v); #elif defined(arch_atomic_inc_return_relaxed) int ret = arch_atomic_inc_return_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_inc_return) return arch_atomic_inc_return(v); #else return raw_atomic_add_return_acquire(1, v); #endif } /** * raw_atomic_inc_return_release() - atomic increment with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_inc_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_inc_return_release(atomic_t *v) { #if defined(arch_atomic_inc_return_release) return arch_atomic_inc_return_release(v); #elif defined(arch_atomic_inc_return_relaxed) __atomic_release_fence(); return arch_atomic_inc_return_relaxed(v); #elif defined(arch_atomic_inc_return) return arch_atomic_inc_return(v); #else return raw_atomic_add_return_release(1, v); #endif } /** * raw_atomic_inc_return_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_inc_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_inc_return_relaxed(atomic_t *v) { #if defined(arch_atomic_inc_return_relaxed) return arch_atomic_inc_return_relaxed(v); #elif defined(arch_atomic_inc_return) return arch_atomic_inc_return(v); #else return raw_atomic_add_return_relaxed(1, v); #endif } /** * raw_atomic_fetch_inc() - atomic increment with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_inc() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_inc(atomic_t *v) { #if defined(arch_atomic_fetch_inc) return arch_atomic_fetch_inc(v); #elif defined(arch_atomic_fetch_inc_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_inc_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic_fetch_add(1, v); #endif } /** * raw_atomic_fetch_inc_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_inc_acquire() elsewhere. * * Return: The original value of @v. 
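 *
 * Example (an illustrative sketch, not part of the generated file): the
 * acquire ordering keeps the loads that follow from being reordered before
 * the increment; "readers" and "shared_cfg" are hypothetical.
 *
 *	raw_atomic_fetch_inc_acquire(&readers);
 *	cfg = READ_ONCE(shared_cfg);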
*/ static __always_inline int raw_atomic_fetch_inc_acquire(atomic_t *v) { #if defined(arch_atomic_fetch_inc_acquire) return arch_atomic_fetch_inc_acquire(v); #elif defined(arch_atomic_fetch_inc_relaxed) int ret = arch_atomic_fetch_inc_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_inc) return arch_atomic_fetch_inc(v); #else return raw_atomic_fetch_add_acquire(1, v); #endif } /** * raw_atomic_fetch_inc_release() - atomic increment with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_inc_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_inc_release(atomic_t *v) { #if defined(arch_atomic_fetch_inc_release) return arch_atomic_fetch_inc_release(v); #elif defined(arch_atomic_fetch_inc_relaxed) __atomic_release_fence(); return arch_atomic_fetch_inc_relaxed(v); #elif defined(arch_atomic_fetch_inc) return arch_atomic_fetch_inc(v); #else return raw_atomic_fetch_add_release(1, v); #endif } /** * raw_atomic_fetch_inc_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_inc_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_inc_relaxed(atomic_t *v) { #if defined(arch_atomic_fetch_inc_relaxed) return arch_atomic_fetch_inc_relaxed(v); #elif defined(arch_atomic_fetch_inc) return arch_atomic_fetch_inc(v); #else return raw_atomic_fetch_add_relaxed(1, v); #endif } /** * raw_atomic_dec() - atomic decrement with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_dec() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_dec(atomic_t *v) { #if defined(arch_atomic_dec) arch_atomic_dec(v); #else raw_atomic_sub(1, v); #endif } /** * raw_atomic_dec_return() - atomic decrement with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_dec_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_dec_return(atomic_t *v) { #if defined(arch_atomic_dec_return) return arch_atomic_dec_return(v); #elif defined(arch_atomic_dec_return_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_dec_return_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic_sub_return(1, v); #endif } /** * raw_atomic_dec_return_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_dec_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_dec_return_acquire(atomic_t *v) { #if defined(arch_atomic_dec_return_acquire) return arch_atomic_dec_return_acquire(v); #elif defined(arch_atomic_dec_return_relaxed) int ret = arch_atomic_dec_return_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_dec_return) return arch_atomic_dec_return(v); #else return raw_atomic_sub_return_acquire(1, v); #endif } /** * raw_atomic_dec_return_release() - atomic decrement with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with release ordering. 
* * Safe to use in noinstr code; prefer atomic_dec_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_dec_return_release(atomic_t *v) { #if defined(arch_atomic_dec_return_release) return arch_atomic_dec_return_release(v); #elif defined(arch_atomic_dec_return_relaxed) __atomic_release_fence(); return arch_atomic_dec_return_relaxed(v); #elif defined(arch_atomic_dec_return) return arch_atomic_dec_return(v); #else return raw_atomic_sub_return_release(1, v); #endif } /** * raw_atomic_dec_return_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_dec_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline int raw_atomic_dec_return_relaxed(atomic_t *v) { #if defined(arch_atomic_dec_return_relaxed) return arch_atomic_dec_return_relaxed(v); #elif defined(arch_atomic_dec_return) return arch_atomic_dec_return(v); #else return raw_atomic_sub_return_relaxed(1, v); #endif } /** * raw_atomic_fetch_dec() - atomic decrement with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_dec() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_dec(atomic_t *v) { #if defined(arch_atomic_fetch_dec) return arch_atomic_fetch_dec(v); #elif defined(arch_atomic_fetch_dec_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_dec_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic_fetch_sub(1, v); #endif } /** * raw_atomic_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_dec_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_dec_acquire(atomic_t *v) { #if defined(arch_atomic_fetch_dec_acquire) return arch_atomic_fetch_dec_acquire(v); #elif defined(arch_atomic_fetch_dec_relaxed) int ret = arch_atomic_fetch_dec_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_dec) return arch_atomic_fetch_dec(v); #else return raw_atomic_fetch_sub_acquire(1, v); #endif } /** * raw_atomic_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_dec_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_dec_release(atomic_t *v) { #if defined(arch_atomic_fetch_dec_release) return arch_atomic_fetch_dec_release(v); #elif defined(arch_atomic_fetch_dec_relaxed) __atomic_release_fence(); return arch_atomic_fetch_dec_relaxed(v); #elif defined(arch_atomic_fetch_dec) return arch_atomic_fetch_dec(v); #else return raw_atomic_fetch_sub_release(1, v); #endif } /** * raw_atomic_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_dec_relaxed() elsewhere. * * Return: The original value of @v. 
*/ static __always_inline int raw_atomic_fetch_dec_relaxed(atomic_t *v) { #if defined(arch_atomic_fetch_dec_relaxed) return arch_atomic_fetch_dec_relaxed(v); #elif defined(arch_atomic_fetch_dec) return arch_atomic_fetch_dec(v); #else return raw_atomic_fetch_sub_relaxed(1, v); #endif } /** * raw_atomic_and() - atomic bitwise AND with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_and() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_and(int i, atomic_t *v) { arch_atomic_and(i, v); } /** * raw_atomic_fetch_and() - atomic bitwise AND with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_and() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_and(int i, atomic_t *v) { #if defined(arch_atomic_fetch_and) return arch_atomic_fetch_and(i, v); #elif defined(arch_atomic_fetch_and_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_and_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_fetch_and" #endif } /** * raw_atomic_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_and_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_and_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_and_acquire) return arch_atomic_fetch_and_acquire(i, v); #elif defined(arch_atomic_fetch_and_relaxed) int ret = arch_atomic_fetch_and_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_and) return arch_atomic_fetch_and(i, v); #else #error "Unable to define raw_atomic_fetch_and_acquire" #endif } /** * raw_atomic_fetch_and_release() - atomic bitwise AND with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_and_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_and_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_and_release) return arch_atomic_fetch_and_release(i, v); #elif defined(arch_atomic_fetch_and_relaxed) __atomic_release_fence(); return arch_atomic_fetch_and_relaxed(i, v); #elif defined(arch_atomic_fetch_and) return arch_atomic_fetch_and(i, v); #else #error "Unable to define raw_atomic_fetch_and_release" #endif } /** * raw_atomic_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_and_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_and_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_and_relaxed) return arch_atomic_fetch_and_relaxed(i, v); #elif defined(arch_atomic_fetch_and) return arch_atomic_fetch_and(i, v); #else #error "Unable to define raw_atomic_fetch_and_relaxed" #endif } /** * raw_atomic_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. 
* * Safe to use in noinstr code; prefer atomic_andnot() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_andnot(int i, atomic_t *v) { #if defined(arch_atomic_andnot) arch_atomic_andnot(i, v); #else raw_atomic_and(~i, v); #endif } /** * raw_atomic_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_andnot() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_andnot(int i, atomic_t *v) { #if defined(arch_atomic_fetch_andnot) return arch_atomic_fetch_andnot(i, v); #elif defined(arch_atomic_fetch_andnot_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_andnot_relaxed(i, v); __atomic_post_full_fence(); return ret; #else return raw_atomic_fetch_and(~i, v); #endif } /** * raw_atomic_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_andnot_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_andnot_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_andnot_acquire) return arch_atomic_fetch_andnot_acquire(i, v); #elif defined(arch_atomic_fetch_andnot_relaxed) int ret = arch_atomic_fetch_andnot_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_andnot) return arch_atomic_fetch_andnot(i, v); #else return raw_atomic_fetch_and_acquire(~i, v); #endif } /** * raw_atomic_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_andnot_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_andnot_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_andnot_release) return arch_atomic_fetch_andnot_release(i, v); #elif defined(arch_atomic_fetch_andnot_relaxed) __atomic_release_fence(); return arch_atomic_fetch_andnot_relaxed(i, v); #elif defined(arch_atomic_fetch_andnot) return arch_atomic_fetch_andnot(i, v); #else return raw_atomic_fetch_and_release(~i, v); #endif } /** * raw_atomic_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_andnot_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_andnot_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_andnot_relaxed) return arch_atomic_fetch_andnot_relaxed(i, v); #elif defined(arch_atomic_fetch_andnot) return arch_atomic_fetch_andnot(i, v); #else return raw_atomic_fetch_and_relaxed(~i, v); #endif } /** * raw_atomic_or() - atomic bitwise OR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_or() elsewhere. * * Return: Nothing. 
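 *
 * Example (an illustrative sketch, not part of the generated file): setting
 * flag bits that another path tests with raw_atomic_read(); "FLAG_DIRTY"
 * and "state" are hypothetical.
 *
 *	raw_atomic_or(FLAG_DIRTY, &state);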
*/ static __always_inline void raw_atomic_or(int i, atomic_t *v) { arch_atomic_or(i, v); } /** * raw_atomic_fetch_or() - atomic bitwise OR with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_or() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_or(int i, atomic_t *v) { #if defined(arch_atomic_fetch_or) return arch_atomic_fetch_or(i, v); #elif defined(arch_atomic_fetch_or_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_or_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_fetch_or" #endif } /** * raw_atomic_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_or_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_or_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_or_acquire) return arch_atomic_fetch_or_acquire(i, v); #elif defined(arch_atomic_fetch_or_relaxed) int ret = arch_atomic_fetch_or_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_or) return arch_atomic_fetch_or(i, v); #else #error "Unable to define raw_atomic_fetch_or_acquire" #endif } /** * raw_atomic_fetch_or_release() - atomic bitwise OR with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_or_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_or_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_or_release) return arch_atomic_fetch_or_release(i, v); #elif defined(arch_atomic_fetch_or_relaxed) __atomic_release_fence(); return arch_atomic_fetch_or_relaxed(i, v); #elif defined(arch_atomic_fetch_or) return arch_atomic_fetch_or(i, v); #else #error "Unable to define raw_atomic_fetch_or_release" #endif } /** * raw_atomic_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_or_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_or_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_or_relaxed) return arch_atomic_fetch_or_relaxed(i, v); #elif defined(arch_atomic_fetch_or) return arch_atomic_fetch_or(i, v); #else #error "Unable to define raw_atomic_fetch_or_relaxed" #endif } /** * raw_atomic_xor() - atomic bitwise XOR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_xor() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_xor(int i, atomic_t *v) { arch_atomic_xor(i, v); } /** * raw_atomic_fetch_xor() - atomic bitwise XOR with full ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_fetch_xor() elsewhere. * * Return: The original value of @v. 
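 *
 * Example (an illustrative sketch, not part of the generated file): toggling
 * a bit and learning its previous state from the returned value;
 * "PARITY_BIT" and "flags" are hypothetical.
 *
 *	int was_set = raw_atomic_fetch_xor(PARITY_BIT, &flags) & PARITY_BIT;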
*/ static __always_inline int raw_atomic_fetch_xor(int i, atomic_t *v) { #if defined(arch_atomic_fetch_xor) return arch_atomic_fetch_xor(i, v); #elif defined(arch_atomic_fetch_xor_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_xor_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic_fetch_xor" #endif } /** * raw_atomic_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_fetch_xor_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_xor_acquire(int i, atomic_t *v) { #if defined(arch_atomic_fetch_xor_acquire) return arch_atomic_fetch_xor_acquire(i, v); #elif defined(arch_atomic_fetch_xor_relaxed) int ret = arch_atomic_fetch_xor_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_fetch_xor) return arch_atomic_fetch_xor(i, v); #else #error "Unable to define raw_atomic_fetch_xor_acquire" #endif } /** * raw_atomic_fetch_xor_release() - atomic bitwise XOR with release ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_fetch_xor_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_xor_release(int i, atomic_t *v) { #if defined(arch_atomic_fetch_xor_release) return arch_atomic_fetch_xor_release(i, v); #elif defined(arch_atomic_fetch_xor_relaxed) __atomic_release_fence(); return arch_atomic_fetch_xor_relaxed(i, v); #elif defined(arch_atomic_fetch_xor) return arch_atomic_fetch_xor(i, v); #else #error "Unable to define raw_atomic_fetch_xor_release" #endif } /** * raw_atomic_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering * @i: int value * @v: pointer to atomic_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_fetch_xor_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_xor_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_fetch_xor_relaxed) return arch_atomic_fetch_xor_relaxed(i, v); #elif defined(arch_atomic_fetch_xor) return arch_atomic_fetch_xor(i, v); #else #error "Unable to define raw_atomic_fetch_xor_relaxed" #endif } /** * raw_atomic_xchg() - atomic exchange with full ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with full ordering. * * Safe to use in noinstr code; prefer atomic_xchg() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_xchg(atomic_t *v, int new) { #if defined(arch_atomic_xchg) return arch_atomic_xchg(v, new); #elif defined(arch_atomic_xchg_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_xchg_relaxed(v, new); __atomic_post_full_fence(); return ret; #else return raw_xchg(&v->counter, new); #endif } /** * raw_atomic_xchg_acquire() - atomic exchange with acquire ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with acquire ordering. * * Safe to use in noinstr code; prefer atomic_xchg_acquire() elsewhere. * * Return: The original value of @v. 
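 *
 * Example (an illustrative sketch, not part of the generated file): a
 * minimal test-and-set lock, where the acquire ordering keeps the critical
 * section from moving above the successful exchange; "locked" is
 * hypothetical.
 *
 *	while (raw_atomic_xchg_acquire(&locked, 1))
 *		cpu_relax();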
*/ static __always_inline int raw_atomic_xchg_acquire(atomic_t *v, int new) { #if defined(arch_atomic_xchg_acquire) return arch_atomic_xchg_acquire(v, new); #elif defined(arch_atomic_xchg_relaxed) int ret = arch_atomic_xchg_relaxed(v, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_xchg) return arch_atomic_xchg(v, new); #else return raw_xchg_acquire(&v->counter, new); #endif } /** * raw_atomic_xchg_release() - atomic exchange with release ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with release ordering. * * Safe to use in noinstr code; prefer atomic_xchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_xchg_release(atomic_t *v, int new) { #if defined(arch_atomic_xchg_release) return arch_atomic_xchg_release(v, new); #elif defined(arch_atomic_xchg_relaxed) __atomic_release_fence(); return arch_atomic_xchg_relaxed(v, new); #elif defined(arch_atomic_xchg) return arch_atomic_xchg(v, new); #else return raw_xchg_release(&v->counter, new); #endif } /** * raw_atomic_xchg_relaxed() - atomic exchange with relaxed ordering * @v: pointer to atomic_t * @new: int value to assign * * Atomically updates @v to @new with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_xchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_xchg_relaxed(atomic_t *v, int new) { #if defined(arch_atomic_xchg_relaxed) return arch_atomic_xchg_relaxed(v, new); #elif defined(arch_atomic_xchg) return arch_atomic_xchg(v, new); #else return raw_xchg_relaxed(&v->counter, new); #endif } /** * raw_atomic_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_cmpxchg() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_cmpxchg(atomic_t *v, int old, int new) { #if defined(arch_atomic_cmpxchg) return arch_atomic_cmpxchg(v, old, new); #elif defined(arch_atomic_cmpxchg_relaxed) int ret; __atomic_pre_full_fence(); ret = arch_atomic_cmpxchg_relaxed(v, old, new); __atomic_post_full_fence(); return ret; #else return raw_cmpxchg(&v->counter, old, new); #endif } /** * raw_atomic_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_cmpxchg_acquire() elsewhere. * * Return: The original value of @v. 
*/ static __always_inline int raw_atomic_cmpxchg_acquire(atomic_t *v, int old, int new) { #if defined(arch_atomic_cmpxchg_acquire) return arch_atomic_cmpxchg_acquire(v, old, new); #elif defined(arch_atomic_cmpxchg_relaxed) int ret = arch_atomic_cmpxchg_relaxed(v, old, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_cmpxchg) return arch_atomic_cmpxchg(v, old, new); #else return raw_cmpxchg_acquire(&v->counter, old, new); #endif } /** * raw_atomic_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_cmpxchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_cmpxchg_release(atomic_t *v, int old, int new) { #if defined(arch_atomic_cmpxchg_release) return arch_atomic_cmpxchg_release(v, old, new); #elif defined(arch_atomic_cmpxchg_relaxed) __atomic_release_fence(); return arch_atomic_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic_cmpxchg) return arch_atomic_cmpxchg(v, old, new); #else return raw_cmpxchg_release(&v->counter, old, new); #endif } /** * raw_atomic_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_t * @old: int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_cmpxchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_cmpxchg_relaxed(atomic_t *v, int old, int new) { #if defined(arch_atomic_cmpxchg_relaxed) return arch_atomic_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic_cmpxchg) return arch_atomic_cmpxchg(v, old, new); #else return raw_cmpxchg_relaxed(&v->counter, old, new); #endif } /** * raw_atomic_try_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_try_cmpxchg() elsewhere. * * Return: @true if the exchange occurred, @false otherwise. */ static __always_inline bool raw_atomic_try_cmpxchg(atomic_t *v, int *old, int new) { #if defined(arch_atomic_try_cmpxchg) return arch_atomic_try_cmpxchg(v, old, new); #elif defined(arch_atomic_try_cmpxchg_relaxed) bool ret; __atomic_pre_full_fence(); ret = arch_atomic_try_cmpxchg_relaxed(v, old, new); __atomic_post_full_fence(); return ret; #else int r, o = *old; r = raw_atomic_cmpxchg(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_try_cmpxchg_acquire() elsewhere. 
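 *
 * Example (an illustrative sketch, not part of the generated file): a
 * typical compare-and-swap loop; because @old is updated on failure, the
 * value does not need to be re-read on each iteration. "FLAG_CLAIMED" is
 * hypothetical.
 *
 *	int new, old = raw_atomic_read(v);
 *
 *	do {
 *		new = old | FLAG_CLAIMED;
 *	} while (!raw_atomic_try_cmpxchg_acquire(v, &old, new));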
* * Return: @true if the exchange occurred, @false otherwise. */ static __always_inline bool raw_atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) { #if defined(arch_atomic_try_cmpxchg_acquire) return arch_atomic_try_cmpxchg_acquire(v, old, new); #elif defined(arch_atomic_try_cmpxchg_relaxed) bool ret = arch_atomic_try_cmpxchg_relaxed(v, old, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_try_cmpxchg) return arch_atomic_try_cmpxchg(v, old, new); #else int r, o = *old; r = raw_atomic_cmpxchg_acquire(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic_try_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_try_cmpxchg_release() elsewhere. * * Return: @true if the exchange occurred, @false otherwise. */ static __always_inline bool raw_atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) { #if defined(arch_atomic_try_cmpxchg_release) return arch_atomic_try_cmpxchg_release(v, old, new); #elif defined(arch_atomic_try_cmpxchg_relaxed) __atomic_release_fence(); return arch_atomic_try_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic_try_cmpxchg) return arch_atomic_try_cmpxchg(v, old, new); #else int r, o = *old; r = raw_atomic_cmpxchg_release(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_t * @old: pointer to int value to compare with * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_try_cmpxchg_relaxed() elsewhere. * * Return: @true if the exchange occurred, @false otherwise. */ static __always_inline bool raw_atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) { #if defined(arch_atomic_try_cmpxchg_relaxed) return arch_atomic_try_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic_try_cmpxchg) return arch_atomic_try_cmpxchg(v, old, new); #else int r, o = *old; r = raw_atomic_cmpxchg_relaxed(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic_sub_and_test() - atomic subtract and test if zero with full ordering * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_sub_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_sub_and_test(int i, atomic_t *v) { #if defined(arch_atomic_sub_and_test) return arch_atomic_sub_and_test(i, v); #else return raw_atomic_sub_return(i, v) == 0; #endif } /** * raw_atomic_dec_and_test() - atomic decrement and test if zero with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_dec_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. 
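 *
 * Illustrative sketch only: a reference-count style put, where @obj and
 * free_obj() are hypothetical; the full ordering ensures all prior
 * accesses to the object happen before it is freed:
 *
 *	if (raw_atomic_dec_and_test(&obj->refs))
 *		free_obj(obj);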
*/ static __always_inline bool raw_atomic_dec_and_test(atomic_t *v) { #if defined(arch_atomic_dec_and_test) return arch_atomic_dec_and_test(v); #else return raw_atomic_dec_return(v) == 0; #endif } /** * raw_atomic_inc_and_test() - atomic increment and test if zero with full ordering * @v: pointer to atomic_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_inc_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_inc_and_test(atomic_t *v) { #if defined(arch_atomic_inc_and_test) return arch_atomic_inc_and_test(v); #else return raw_atomic_inc_return(v) == 0; #endif } /** * raw_atomic_add_negative() - atomic add and test if negative with full ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_add_negative() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_add_negative(int i, atomic_t *v) { #if defined(arch_atomic_add_negative) return arch_atomic_add_negative(i, v); #elif defined(arch_atomic_add_negative_relaxed) bool ret; __atomic_pre_full_fence(); ret = arch_atomic_add_negative_relaxed(i, v); __atomic_post_full_fence(); return ret; #else return raw_atomic_add_return(i, v) < 0; #endif } /** * raw_atomic_add_negative_acquire() - atomic add and test if negative with acquire ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_add_negative_acquire() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_add_negative_acquire(int i, atomic_t *v) { #if defined(arch_atomic_add_negative_acquire) return arch_atomic_add_negative_acquire(i, v); #elif defined(arch_atomic_add_negative_relaxed) bool ret = arch_atomic_add_negative_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic_add_negative) return arch_atomic_add_negative(i, v); #else return raw_atomic_add_return_acquire(i, v) < 0; #endif } /** * raw_atomic_add_negative_release() - atomic add and test if negative with release ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_add_negative_release() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_add_negative_release(int i, atomic_t *v) { #if defined(arch_atomic_add_negative_release) return arch_atomic_add_negative_release(i, v); #elif defined(arch_atomic_add_negative_relaxed) __atomic_release_fence(); return arch_atomic_add_negative_relaxed(i, v); #elif defined(arch_atomic_add_negative) return arch_atomic_add_negative(i, v); #else return raw_atomic_add_return_release(i, v) < 0; #endif } /** * raw_atomic_add_negative_relaxed() - atomic add and test if negative with relaxed ordering * @i: int value to add * @v: pointer to atomic_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_add_negative_relaxed() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. 
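 *
 * Illustrative sketch only: consuming from a hypothetical signed credit
 * counter (@credits, an atomic_t) where only the sign of the result
 * matters and no ordering against other accesses is needed, so the
 * relaxed form suffices (refill_credits() is a hypothetical helper):
 *
 *	if (raw_atomic_add_negative_relaxed(-1, &credits))
 *		refill_credits();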
*/ static __always_inline bool raw_atomic_add_negative_relaxed(int i, atomic_t *v) { #if defined(arch_atomic_add_negative_relaxed) return arch_atomic_add_negative_relaxed(i, v); #elif defined(arch_atomic_add_negative) return arch_atomic_add_negative(i, v); #else return raw_atomic_add_return_relaxed(i, v) < 0; #endif } /** * raw_atomic_fetch_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_t * @a: int value to add * @u: int value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_fetch_add_unless() elsewhere. * * Return: The original value of @v. */ static __always_inline int raw_atomic_fetch_add_unless(atomic_t *v, int a, int u) { #if defined(arch_atomic_fetch_add_unless) return arch_atomic_fetch_add_unless(v, a, u); #else int c = raw_atomic_read(v); do { if (unlikely(c == u)) break; } while (!raw_atomic_try_cmpxchg(v, &c, c + a)); return c; #endif } /** * raw_atomic_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_t * @a: int value to add * @u: int value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_add_unless() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_add_unless(atomic_t *v, int a, int u) { #if defined(arch_atomic_add_unless) return arch_atomic_add_unless(v, a, u); #else return raw_atomic_fetch_add_unless(v, a, u) != u; #endif } /** * raw_atomic_inc_not_zero() - atomic increment unless zero with full ordering * @v: pointer to atomic_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_inc_not_zero() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_inc_not_zero(atomic_t *v) { #if defined(arch_atomic_inc_not_zero) return arch_atomic_inc_not_zero(v); #else return raw_atomic_add_unless(v, 1, 0); #endif } /** * raw_atomic_inc_unless_negative() - atomic increment unless negative with full ordering * @v: pointer to atomic_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_inc_unless_negative() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_inc_unless_negative(atomic_t *v) { #if defined(arch_atomic_inc_unless_negative) return arch_atomic_inc_unless_negative(v); #else int c = raw_atomic_read(v); do { if (unlikely(c < 0)) return false; } while (!raw_atomic_try_cmpxchg(v, &c, c + 1)); return true; #endif } /** * raw_atomic_dec_unless_positive() - atomic decrement unless positive with full ordering * @v: pointer to atomic_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_dec_unless_positive() elsewhere. * * Return: @true if @v was updated, @false otherwise. 
*/ static __always_inline bool raw_atomic_dec_unless_positive(atomic_t *v) { #if defined(arch_atomic_dec_unless_positive) return arch_atomic_dec_unless_positive(v); #else int c = raw_atomic_read(v); do { if (unlikely(c > 0)) return false; } while (!raw_atomic_try_cmpxchg(v, &c, c - 1)); return true; #endif } /** * raw_atomic_dec_if_positive() - atomic decrement if positive with full ordering * @v: pointer to atomic_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_dec_if_positive() elsewhere. * * Return: The old value of (@v - 1), regardless of whether @v was updated. */ static __always_inline int raw_atomic_dec_if_positive(atomic_t *v) { #if defined(arch_atomic_dec_if_positive) return arch_atomic_dec_if_positive(v); #else int dec, c = raw_atomic_read(v); do { dec = c - 1; if (unlikely(dec < 0)) break; } while (!raw_atomic_try_cmpxchg(v, &c, dec)); return dec; #endif } #ifdef CONFIG_GENERIC_ATOMIC64 #include <asm-generic/atomic64.h> #endif /** * raw_atomic64_read() - atomic load with relaxed ordering * @v: pointer to atomic64_t * * Atomically loads the value of @v with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_read() elsewhere. * * Return: The value loaded from @v. */ static __always_inline s64 raw_atomic64_read(const atomic64_t *v) { return arch_atomic64_read(v); } /** * raw_atomic64_read_acquire() - atomic load with acquire ordering * @v: pointer to atomic64_t * * Atomically loads the value of @v with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_read_acquire() elsewhere. * * Return: The value loaded from @v. */ static __always_inline s64 raw_atomic64_read_acquire(const atomic64_t *v) { #if defined(arch_atomic64_read_acquire) return arch_atomic64_read_acquire(v); #else s64 ret; if (__native_word(atomic64_t)) { ret = smp_load_acquire(&(v)->counter); } else { ret = raw_atomic64_read(v); __atomic_acquire_fence(); } return ret; #endif } /** * raw_atomic64_set() - atomic set with relaxed ordering * @v: pointer to atomic64_t * @i: s64 value to assign * * Atomically sets @v to @i with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_set() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_set(atomic64_t *v, s64 i) { arch_atomic64_set(v, i); } /** * raw_atomic64_set_release() - atomic set with release ordering * @v: pointer to atomic64_t * @i: s64 value to assign * * Atomically sets @v to @i with release ordering. * * Safe to use in noinstr code; prefer atomic64_set_release() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_set_release(atomic64_t *v, s64 i) { #if defined(arch_atomic64_set_release) arch_atomic64_set_release(v, i); #else if (__native_word(atomic64_t)) { smp_store_release(&(v)->counter, i); } else { __atomic_release_fence(); raw_atomic64_set(v, i); } #endif } /** * raw_atomic64_add() - atomic add with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_add() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_add(s64 i, atomic64_t *v) { arch_atomic64_add(i, v); } /** * raw_atomic64_add_return() - atomic add with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with full ordering. 
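 *
 * Illustrative sketch only: accumulating into a hypothetical atomic64_t
 * usage counter and acting on the new total returned by the operation
 * (@used, @limit, @nbytes and report_over_limit() are hypothetical):
 *
 *	if (raw_atomic64_add_return(nbytes, &used) > limit)
 *		report_over_limit();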
* * Safe to use in noinstr code; prefer atomic64_add_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_add_return(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_return) return arch_atomic64_add_return(i, v); #elif defined(arch_atomic64_add_return_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_add_return_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_add_return" #endif } /** * raw_atomic64_add_return_acquire() - atomic add with acquire ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_add_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_add_return_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_return_acquire) return arch_atomic64_add_return_acquire(i, v); #elif defined(arch_atomic64_add_return_relaxed) s64 ret = arch_atomic64_add_return_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_add_return) return arch_atomic64_add_return(i, v); #else #error "Unable to define raw_atomic64_add_return_acquire" #endif } /** * raw_atomic64_add_return_release() - atomic add with release ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_add_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_add_return_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_return_release) return arch_atomic64_add_return_release(i, v); #elif defined(arch_atomic64_add_return_relaxed) __atomic_release_fence(); return arch_atomic64_add_return_relaxed(i, v); #elif defined(arch_atomic64_add_return) return arch_atomic64_add_return(i, v); #else #error "Unable to define raw_atomic64_add_return_release" #endif } /** * raw_atomic64_add_return_relaxed() - atomic add with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_add_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_add_return_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_return_relaxed) return arch_atomic64_add_return_relaxed(i, v); #elif defined(arch_atomic64_add_return) return arch_atomic64_add_return(i, v); #else #error "Unable to define raw_atomic64_add_return_relaxed" #endif } /** * raw_atomic64_fetch_add() - atomic add with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_add() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_add(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_add) return arch_atomic64_fetch_add(i, v); #elif defined(arch_atomic64_fetch_add_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_add_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_fetch_add" #endif } /** * raw_atomic64_fetch_add_acquire() - atomic add with acquire ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with acquire ordering. 
* * Safe to use in noinstr code; prefer atomic64_fetch_add_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_add_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_add_acquire) return arch_atomic64_fetch_add_acquire(i, v); #elif defined(arch_atomic64_fetch_add_relaxed) s64 ret = arch_atomic64_fetch_add_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_add) return arch_atomic64_fetch_add(i, v); #else #error "Unable to define raw_atomic64_fetch_add_acquire" #endif } /** * raw_atomic64_fetch_add_release() - atomic add with release ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_add_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_add_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_add_release) return arch_atomic64_fetch_add_release(i, v); #elif defined(arch_atomic64_fetch_add_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_add_relaxed(i, v); #elif defined(arch_atomic64_fetch_add) return arch_atomic64_fetch_add(i, v); #else #error "Unable to define raw_atomic64_fetch_add_release" #endif } /** * raw_atomic64_fetch_add_relaxed() - atomic add with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_add_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_add_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_add_relaxed) return arch_atomic64_fetch_add_relaxed(i, v); #elif defined(arch_atomic64_fetch_add) return arch_atomic64_fetch_add(i, v); #else #error "Unable to define raw_atomic64_fetch_add_relaxed" #endif } /** * raw_atomic64_sub() - atomic subtract with relaxed ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_sub() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_sub(s64 i, atomic64_t *v) { arch_atomic64_sub(i, v); } /** * raw_atomic64_sub_return() - atomic subtract with full ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_sub_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_sub_return(s64 i, atomic64_t *v) { #if defined(arch_atomic64_sub_return) return arch_atomic64_sub_return(i, v); #elif defined(arch_atomic64_sub_return_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_sub_return_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_sub_return" #endif } /** * raw_atomic64_sub_return_acquire() - atomic subtract with acquire ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_sub_return_acquire() elsewhere. * * Return: The updated value of @v. 
*/ static __always_inline s64 raw_atomic64_sub_return_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_sub_return_acquire) return arch_atomic64_sub_return_acquire(i, v); #elif defined(arch_atomic64_sub_return_relaxed) s64 ret = arch_atomic64_sub_return_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_sub_return) return arch_atomic64_sub_return(i, v); #else #error "Unable to define raw_atomic64_sub_return_acquire" #endif } /** * raw_atomic64_sub_return_release() - atomic subtract with release ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_sub_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_sub_return_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_sub_return_release) return arch_atomic64_sub_return_release(i, v); #elif defined(arch_atomic64_sub_return_relaxed) __atomic_release_fence(); return arch_atomic64_sub_return_relaxed(i, v); #elif defined(arch_atomic64_sub_return) return arch_atomic64_sub_return(i, v); #else #error "Unable to define raw_atomic64_sub_return_release" #endif } /** * raw_atomic64_sub_return_relaxed() - atomic subtract with relaxed ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_sub_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_sub_return_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_sub_return_relaxed) return arch_atomic64_sub_return_relaxed(i, v); #elif defined(arch_atomic64_sub_return) return arch_atomic64_sub_return(i, v); #else #error "Unable to define raw_atomic64_sub_return_relaxed" #endif } /** * raw_atomic64_fetch_sub() - atomic subtract with full ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_sub() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_sub(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_sub) return arch_atomic64_fetch_sub(i, v); #elif defined(arch_atomic64_fetch_sub_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_sub_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_fetch_sub" #endif } /** * raw_atomic64_fetch_sub_acquire() - atomic subtract with acquire ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_sub_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_sub_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_sub_acquire) return arch_atomic64_fetch_sub_acquire(i, v); #elif defined(arch_atomic64_fetch_sub_relaxed) s64 ret = arch_atomic64_fetch_sub_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_sub) return arch_atomic64_fetch_sub(i, v); #else #error "Unable to define raw_atomic64_fetch_sub_acquire" #endif } /** * raw_atomic64_fetch_sub_release() - atomic subtract with release ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with release ordering. 
* * Safe to use in noinstr code; prefer atomic64_fetch_sub_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_sub_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_sub_release) return arch_atomic64_fetch_sub_release(i, v); #elif defined(arch_atomic64_fetch_sub_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_sub_relaxed(i, v); #elif defined(arch_atomic64_fetch_sub) return arch_atomic64_fetch_sub(i, v); #else #error "Unable to define raw_atomic64_fetch_sub_release" #endif } /** * raw_atomic64_fetch_sub_relaxed() - atomic subtract with relaxed ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_sub_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_sub_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_sub_relaxed) return arch_atomic64_fetch_sub_relaxed(i, v); #elif defined(arch_atomic64_fetch_sub) return arch_atomic64_fetch_sub(i, v); #else #error "Unable to define raw_atomic64_fetch_sub_relaxed" #endif } /** * raw_atomic64_inc() - atomic increment with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_inc() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_inc(atomic64_t *v) { #if defined(arch_atomic64_inc) arch_atomic64_inc(v); #else raw_atomic64_add(1, v); #endif } /** * raw_atomic64_inc_return() - atomic increment with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_inc_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_inc_return(atomic64_t *v) { #if defined(arch_atomic64_inc_return) return arch_atomic64_inc_return(v); #elif defined(arch_atomic64_inc_return_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_inc_return_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_add_return(1, v); #endif } /** * raw_atomic64_inc_return_acquire() - atomic increment with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_inc_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_inc_return_acquire(atomic64_t *v) { #if defined(arch_atomic64_inc_return_acquire) return arch_atomic64_inc_return_acquire(v); #elif defined(arch_atomic64_inc_return_relaxed) s64 ret = arch_atomic64_inc_return_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_inc_return) return arch_atomic64_inc_return(v); #else return raw_atomic64_add_return_acquire(1, v); #endif } /** * raw_atomic64_inc_return_release() - atomic increment with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic64_inc_return_release() elsewhere. * * Return: The updated value of @v. 
*/ static __always_inline s64 raw_atomic64_inc_return_release(atomic64_t *v) { #if defined(arch_atomic64_inc_return_release) return arch_atomic64_inc_return_release(v); #elif defined(arch_atomic64_inc_return_relaxed) __atomic_release_fence(); return arch_atomic64_inc_return_relaxed(v); #elif defined(arch_atomic64_inc_return) return arch_atomic64_inc_return(v); #else return raw_atomic64_add_return_release(1, v); #endif } /** * raw_atomic64_inc_return_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_inc_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_inc_return_relaxed(atomic64_t *v) { #if defined(arch_atomic64_inc_return_relaxed) return arch_atomic64_inc_return_relaxed(v); #elif defined(arch_atomic64_inc_return) return arch_atomic64_inc_return(v); #else return raw_atomic64_add_return_relaxed(1, v); #endif } /** * raw_atomic64_fetch_inc() - atomic increment with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_inc() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_inc(atomic64_t *v) { #if defined(arch_atomic64_fetch_inc) return arch_atomic64_fetch_inc(v); #elif defined(arch_atomic64_fetch_inc_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_inc_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_fetch_add(1, v); #endif } /** * raw_atomic64_fetch_inc_acquire() - atomic increment with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_inc_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_inc_acquire(atomic64_t *v) { #if defined(arch_atomic64_fetch_inc_acquire) return arch_atomic64_fetch_inc_acquire(v); #elif defined(arch_atomic64_fetch_inc_relaxed) s64 ret = arch_atomic64_fetch_inc_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_inc) return arch_atomic64_fetch_inc(v); #else return raw_atomic64_fetch_add_acquire(1, v); #endif } /** * raw_atomic64_fetch_inc_release() - atomic increment with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_inc_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_inc_release(atomic64_t *v) { #if defined(arch_atomic64_fetch_inc_release) return arch_atomic64_fetch_inc_release(v); #elif defined(arch_atomic64_fetch_inc_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_inc_relaxed(v); #elif defined(arch_atomic64_fetch_inc) return arch_atomic64_fetch_inc(v); #else return raw_atomic64_fetch_add_release(1, v); #endif } /** * raw_atomic64_fetch_inc_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_inc_relaxed() elsewhere. * * Return: The original value of @v. 
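 *
 * Illustrative sketch only: handing out monotonically increasing ids
 * from a hypothetical atomic64_t counter (@next_id); the id itself does
 * not order any other memory accesses, so the relaxed form is enough:
 *
 *	s64 id = raw_atomic64_fetch_inc_relaxed(&next_id);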
*/ static __always_inline s64 raw_atomic64_fetch_inc_relaxed(atomic64_t *v) { #if defined(arch_atomic64_fetch_inc_relaxed) return arch_atomic64_fetch_inc_relaxed(v); #elif defined(arch_atomic64_fetch_inc) return arch_atomic64_fetch_inc(v); #else return raw_atomic64_fetch_add_relaxed(1, v); #endif } /** * raw_atomic64_dec() - atomic decrement with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_dec() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_dec(atomic64_t *v) { #if defined(arch_atomic64_dec) arch_atomic64_dec(v); #else raw_atomic64_sub(1, v); #endif } /** * raw_atomic64_dec_return() - atomic decrement with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_dec_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_dec_return(atomic64_t *v) { #if defined(arch_atomic64_dec_return) return arch_atomic64_dec_return(v); #elif defined(arch_atomic64_dec_return_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_dec_return_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_sub_return(1, v); #endif } /** * raw_atomic64_dec_return_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_dec_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_dec_return_acquire(atomic64_t *v) { #if defined(arch_atomic64_dec_return_acquire) return arch_atomic64_dec_return_acquire(v); #elif defined(arch_atomic64_dec_return_relaxed) s64 ret = arch_atomic64_dec_return_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_dec_return) return arch_atomic64_dec_return(v); #else return raw_atomic64_sub_return_acquire(1, v); #endif } /** * raw_atomic64_dec_return_release() - atomic decrement with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic64_dec_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_dec_return_release(atomic64_t *v) { #if defined(arch_atomic64_dec_return_release) return arch_atomic64_dec_return_release(v); #elif defined(arch_atomic64_dec_return_relaxed) __atomic_release_fence(); return arch_atomic64_dec_return_relaxed(v); #elif defined(arch_atomic64_dec_return) return arch_atomic64_dec_return(v); #else return raw_atomic64_sub_return_release(1, v); #endif } /** * raw_atomic64_dec_return_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_dec_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline s64 raw_atomic64_dec_return_relaxed(atomic64_t *v) { #if defined(arch_atomic64_dec_return_relaxed) return arch_atomic64_dec_return_relaxed(v); #elif defined(arch_atomic64_dec_return) return arch_atomic64_dec_return(v); #else return raw_atomic64_sub_return_relaxed(1, v); #endif } /** * raw_atomic64_fetch_dec() - atomic decrement with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with full ordering. 
* * Safe to use in noinstr code; prefer atomic64_fetch_dec() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_dec(atomic64_t *v) { #if defined(arch_atomic64_fetch_dec) return arch_atomic64_fetch_dec(v); #elif defined(arch_atomic64_fetch_dec_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_dec_relaxed(v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_fetch_sub(1, v); #endif } /** * raw_atomic64_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_dec_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_dec_acquire(atomic64_t *v) { #if defined(arch_atomic64_fetch_dec_acquire) return arch_atomic64_fetch_dec_acquire(v); #elif defined(arch_atomic64_fetch_dec_relaxed) s64 ret = arch_atomic64_fetch_dec_relaxed(v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_dec) return arch_atomic64_fetch_dec(v); #else return raw_atomic64_fetch_sub_acquire(1, v); #endif } /** * raw_atomic64_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_dec_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_dec_release(atomic64_t *v) { #if defined(arch_atomic64_fetch_dec_release) return arch_atomic64_fetch_dec_release(v); #elif defined(arch_atomic64_fetch_dec_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_dec_relaxed(v); #elif defined(arch_atomic64_fetch_dec) return arch_atomic64_fetch_dec(v); #else return raw_atomic64_fetch_sub_release(1, v); #endif } /** * raw_atomic64_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_dec_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_dec_relaxed(atomic64_t *v) { #if defined(arch_atomic64_fetch_dec_relaxed) return arch_atomic64_fetch_dec_relaxed(v); #elif defined(arch_atomic64_fetch_dec) return arch_atomic64_fetch_dec(v); #else return raw_atomic64_fetch_sub_relaxed(1, v); #endif } /** * raw_atomic64_and() - atomic bitwise AND with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_and() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_and(s64 i, atomic64_t *v) { arch_atomic64_and(i, v); } /** * raw_atomic64_fetch_and() - atomic bitwise AND with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_and() elsewhere. * * Return: The original value of @v. 
*/ static __always_inline s64 raw_atomic64_fetch_and(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_and) return arch_atomic64_fetch_and(i, v); #elif defined(arch_atomic64_fetch_and_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_and_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_fetch_and" #endif } /** * raw_atomic64_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_and_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_and_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_and_acquire) return arch_atomic64_fetch_and_acquire(i, v); #elif defined(arch_atomic64_fetch_and_relaxed) s64 ret = arch_atomic64_fetch_and_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_and) return arch_atomic64_fetch_and(i, v); #else #error "Unable to define raw_atomic64_fetch_and_acquire" #endif } /** * raw_atomic64_fetch_and_release() - atomic bitwise AND with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_and_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_and_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_and_release) return arch_atomic64_fetch_and_release(i, v); #elif defined(arch_atomic64_fetch_and_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_and_relaxed(i, v); #elif defined(arch_atomic64_fetch_and) return arch_atomic64_fetch_and(i, v); #else #error "Unable to define raw_atomic64_fetch_and_release" #endif } /** * raw_atomic64_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_and_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_and_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_and_relaxed) return arch_atomic64_fetch_and_relaxed(i, v); #elif defined(arch_atomic64_fetch_and) return arch_atomic64_fetch_and(i, v); #else #error "Unable to define raw_atomic64_fetch_and_relaxed" #endif } /** * raw_atomic64_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_andnot() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_andnot(s64 i, atomic64_t *v) { #if defined(arch_atomic64_andnot) arch_atomic64_andnot(i, v); #else raw_atomic64_and(~i, v); #endif } /** * raw_atomic64_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_andnot() elsewhere. * * Return: The original value of @v. 
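 *
 * Illustrative sketch only: atomically clearing a flag bit in a 64-bit
 * mask and checking whether it had been set (FLAG_PENDING, @flags and
 * handle_pending() are hypothetical):
 *
 *	s64 old = raw_atomic64_fetch_andnot(FLAG_PENDING, &flags);
 *
 *	if (old & FLAG_PENDING)
 *		handle_pending();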
*/ static __always_inline s64 raw_atomic64_fetch_andnot(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_andnot) return arch_atomic64_fetch_andnot(i, v); #elif defined(arch_atomic64_fetch_andnot_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_andnot_relaxed(i, v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_fetch_and(~i, v); #endif } /** * raw_atomic64_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_andnot_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_andnot_acquire) return arch_atomic64_fetch_andnot_acquire(i, v); #elif defined(arch_atomic64_fetch_andnot_relaxed) s64 ret = arch_atomic64_fetch_andnot_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_andnot) return arch_atomic64_fetch_andnot(i, v); #else return raw_atomic64_fetch_and_acquire(~i, v); #endif } /** * raw_atomic64_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_andnot_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_andnot_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_andnot_release) return arch_atomic64_fetch_andnot_release(i, v); #elif defined(arch_atomic64_fetch_andnot_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_andnot_relaxed(i, v); #elif defined(arch_atomic64_fetch_andnot) return arch_atomic64_fetch_andnot(i, v); #else return raw_atomic64_fetch_and_release(~i, v); #endif } /** * raw_atomic64_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_andnot_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_andnot_relaxed) return arch_atomic64_fetch_andnot_relaxed(i, v); #elif defined(arch_atomic64_fetch_andnot) return arch_atomic64_fetch_andnot(i, v); #else return raw_atomic64_fetch_and_relaxed(~i, v); #endif } /** * raw_atomic64_or() - atomic bitwise OR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_or() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_or(s64 i, atomic64_t *v) { arch_atomic64_or(i, v); } /** * raw_atomic64_fetch_or() - atomic bitwise OR with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_or() elsewhere. * * Return: The original value of @v. 
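 *
 * Illustrative sketch only: a test-and-set style use, setting a flag bit
 * and detecting whether this caller was the one that set it
 * (FLAG_ACTIVE, @flags and first_activation() are hypothetical):
 *
 *	if (!(raw_atomic64_fetch_or(FLAG_ACTIVE, &flags) & FLAG_ACTIVE))
 *		first_activation();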
*/ static __always_inline s64 raw_atomic64_fetch_or(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_or) return arch_atomic64_fetch_or(i, v); #elif defined(arch_atomic64_fetch_or_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_or_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_fetch_or" #endif } /** * raw_atomic64_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_or_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_or_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_or_acquire) return arch_atomic64_fetch_or_acquire(i, v); #elif defined(arch_atomic64_fetch_or_relaxed) s64 ret = arch_atomic64_fetch_or_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_or) return arch_atomic64_fetch_or(i, v); #else #error "Unable to define raw_atomic64_fetch_or_acquire" #endif } /** * raw_atomic64_fetch_or_release() - atomic bitwise OR with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_or_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_or_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_or_release) return arch_atomic64_fetch_or_release(i, v); #elif defined(arch_atomic64_fetch_or_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_or_relaxed(i, v); #elif defined(arch_atomic64_fetch_or) return arch_atomic64_fetch_or(i, v); #else #error "Unable to define raw_atomic64_fetch_or_release" #endif } /** * raw_atomic64_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_or_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_or_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_or_relaxed) return arch_atomic64_fetch_or_relaxed(i, v); #elif defined(arch_atomic64_fetch_or) return arch_atomic64_fetch_or(i, v); #else #error "Unable to define raw_atomic64_fetch_or_relaxed" #endif } /** * raw_atomic64_xor() - atomic bitwise XOR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_xor() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic64_xor(s64 i, atomic64_t *v) { arch_atomic64_xor(i, v); } /** * raw_atomic64_fetch_xor() - atomic bitwise XOR with full ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_xor() elsewhere. * * Return: The original value of @v. 
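 *
 * Illustrative sketch only: toggling a bit and observing the value it
 * had before the toggle (PARITY_BIT and @state are hypothetical):
 *
 *	bool was_set = raw_atomic64_fetch_xor(PARITY_BIT, &state) & PARITY_BIT;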
*/ static __always_inline s64 raw_atomic64_fetch_xor(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_xor) return arch_atomic64_fetch_xor(i, v); #elif defined(arch_atomic64_fetch_xor_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_xor_relaxed(i, v); __atomic_post_full_fence(); return ret; #else #error "Unable to define raw_atomic64_fetch_xor" #endif } /** * raw_atomic64_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_xor_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_xor_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_xor_acquire) return arch_atomic64_fetch_xor_acquire(i, v); #elif defined(arch_atomic64_fetch_xor_relaxed) s64 ret = arch_atomic64_fetch_xor_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_fetch_xor) return arch_atomic64_fetch_xor(i, v); #else #error "Unable to define raw_atomic64_fetch_xor_acquire" #endif } /** * raw_atomic64_fetch_xor_release() - atomic bitwise XOR with release ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_xor_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_xor_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_xor_release) return arch_atomic64_fetch_xor_release(i, v); #elif defined(arch_atomic64_fetch_xor_relaxed) __atomic_release_fence(); return arch_atomic64_fetch_xor_relaxed(i, v); #elif defined(arch_atomic64_fetch_xor) return arch_atomic64_fetch_xor(i, v); #else #error "Unable to define raw_atomic64_fetch_xor_release" #endif } /** * raw_atomic64_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering * @i: s64 value * @v: pointer to atomic64_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_fetch_xor_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_fetch_xor_relaxed) return arch_atomic64_fetch_xor_relaxed(i, v); #elif defined(arch_atomic64_fetch_xor) return arch_atomic64_fetch_xor(i, v); #else #error "Unable to define raw_atomic64_fetch_xor_relaxed" #endif } /** * raw_atomic64_xchg() - atomic exchange with full ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with full ordering. * * Safe to use in noinstr code; prefer atomic64_xchg() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_xchg(atomic64_t *v, s64 new) { #if defined(arch_atomic64_xchg) return arch_atomic64_xchg(v, new); #elif defined(arch_atomic64_xchg_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_xchg_relaxed(v, new); __atomic_post_full_fence(); return ret; #else return raw_xchg(&v->counter, new); #endif } /** * raw_atomic64_xchg_acquire() - atomic exchange with acquire ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_xchg_acquire() elsewhere. * * Return: The original value of @v. 
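 *
 * Illustrative sketch only: draining a hypothetical atomic64_t
 * accumulator, taking its current contents and leaving zero behind; the
 * acquire ordering orders the subsequent processing after the exchange
 * (@accumulated and process() are hypothetical):
 *
 *	s64 pending = raw_atomic64_xchg_acquire(&accumulated, 0);
 *
 *	if (pending)
 *		process(pending);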
*/ static __always_inline s64 raw_atomic64_xchg_acquire(atomic64_t *v, s64 new) { #if defined(arch_atomic64_xchg_acquire) return arch_atomic64_xchg_acquire(v, new); #elif defined(arch_atomic64_xchg_relaxed) s64 ret = arch_atomic64_xchg_relaxed(v, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_xchg) return arch_atomic64_xchg(v, new); #else return raw_xchg_acquire(&v->counter, new); #endif } /** * raw_atomic64_xchg_release() - atomic exchange with release ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with release ordering. * * Safe to use in noinstr code; prefer atomic64_xchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_xchg_release(atomic64_t *v, s64 new) { #if defined(arch_atomic64_xchg_release) return arch_atomic64_xchg_release(v, new); #elif defined(arch_atomic64_xchg_relaxed) __atomic_release_fence(); return arch_atomic64_xchg_relaxed(v, new); #elif defined(arch_atomic64_xchg) return arch_atomic64_xchg(v, new); #else return raw_xchg_release(&v->counter, new); #endif } /** * raw_atomic64_xchg_relaxed() - atomic exchange with relaxed ordering * @v: pointer to atomic64_t * @new: s64 value to assign * * Atomically updates @v to @new with relaxed ordering. * * Safe to use in noinstr code; prefer atomic64_xchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_xchg_relaxed(atomic64_t *v, s64 new) { #if defined(arch_atomic64_xchg_relaxed) return arch_atomic64_xchg_relaxed(v, new); #elif defined(arch_atomic64_xchg) return arch_atomic64_xchg(v, new); #else return raw_xchg_relaxed(&v->counter, new); #endif } /** * raw_atomic64_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_cmpxchg() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { #if defined(arch_atomic64_cmpxchg) return arch_atomic64_cmpxchg(v, old, new); #elif defined(arch_atomic64_cmpxchg_relaxed) s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_cmpxchg_relaxed(v, old, new); __atomic_post_full_fence(); return ret; #else return raw_cmpxchg(&v->counter, old, new); #endif } /** * raw_atomic64_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_cmpxchg_acquire() elsewhere. * * Return: The original value of @v. 
*/ static __always_inline s64 raw_atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new) { #if defined(arch_atomic64_cmpxchg_acquire) return arch_atomic64_cmpxchg_acquire(v, old, new); #elif defined(arch_atomic64_cmpxchg_relaxed) s64 ret = arch_atomic64_cmpxchg_relaxed(v, old, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_cmpxchg) return arch_atomic64_cmpxchg(v, old, new); #else return raw_cmpxchg_acquire(&v->counter, old, new); #endif } /** * raw_atomic64_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_cmpxchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new) { #if defined(arch_atomic64_cmpxchg_release) return arch_atomic64_cmpxchg_release(v, old, new); #elif defined(arch_atomic64_cmpxchg_relaxed) __atomic_release_fence(); return arch_atomic64_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic64_cmpxchg) return arch_atomic64_cmpxchg(v, old, new); #else return raw_cmpxchg_release(&v->counter, old, new); #endif } /** * raw_atomic64_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic64_t * @old: s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_cmpxchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new) { #if defined(arch_atomic64_cmpxchg_relaxed) return arch_atomic64_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic64_cmpxchg) return arch_atomic64_cmpxchg(v, old, new); #else return raw_cmpxchg_relaxed(&v->counter, old, new); #endif } /** * raw_atomic64_try_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_try_cmpxchg() elsewhere. * * Return: @true if the exchange occurred, @false otherwise. */ static __always_inline bool raw_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { #if defined(arch_atomic64_try_cmpxchg) return arch_atomic64_try_cmpxchg(v, old, new); #elif defined(arch_atomic64_try_cmpxchg_relaxed) bool ret; __atomic_pre_full_fence(); ret = arch_atomic64_try_cmpxchg_relaxed(v, old, new); __atomic_post_full_fence(); return ret; #else s64 r, o = *old; r = raw_atomic64_cmpxchg(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic64_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. 
* * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_acquire() elsewhere. * * Return: @true if the exchange occurred, @false otherwise. */ static __always_inline bool raw_atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) { #if defined(arch_atomic64_try_cmpxchg_acquire) return arch_atomic64_try_cmpxchg_acquire(v, old, new); #elif defined(arch_atomic64_try_cmpxchg_relaxed) bool ret = arch_atomic64_try_cmpxchg_relaxed(v, old, new); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_try_cmpxchg) return arch_atomic64_try_cmpxchg(v, old, new); #else s64 r, o = *old; r = raw_atomic64_cmpxchg_acquire(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic64_try_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_release() elsewhere. * * Return: @true if the exchange occurred, @false otherwise. */ static __always_inline bool raw_atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) { #if defined(arch_atomic64_try_cmpxchg_release) return arch_atomic64_try_cmpxchg_release(v, old, new); #elif defined(arch_atomic64_try_cmpxchg_relaxed) __atomic_release_fence(); return arch_atomic64_try_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic64_try_cmpxchg) return arch_atomic64_try_cmpxchg(v, old, new); #else s64 r, o = *old; r = raw_atomic64_cmpxchg_release(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic64_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic64_t * @old: pointer to s64 value to compare with * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_relaxed() elsewhere. * * Return: @true if the exchange occurred, @false otherwise. */ static __always_inline bool raw_atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) { #if defined(arch_atomic64_try_cmpxchg_relaxed) return arch_atomic64_try_cmpxchg_relaxed(v, old, new); #elif defined(arch_atomic64_try_cmpxchg) return arch_atomic64_try_cmpxchg(v, old, new); #else s64 r, o = *old; r = raw_atomic64_cmpxchg_relaxed(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); #endif } /** * raw_atomic64_sub_and_test() - atomic subtract and test if zero with full ordering * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_sub_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic64_sub_and_test(s64 i, atomic64_t *v) { #if defined(arch_atomic64_sub_and_test) return arch_atomic64_sub_and_test(i, v); #else return raw_atomic64_sub_return(i, v) == 0; #endif } /** * raw_atomic64_dec_and_test() - atomic decrement and test if zero with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v - 1) with full ordering. 
* * Safe to use in noinstr code; prefer atomic64_dec_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic64_dec_and_test(atomic64_t *v) { #if defined(arch_atomic64_dec_and_test) return arch_atomic64_dec_and_test(v); #else return raw_atomic64_dec_return(v) == 0; #endif } /** * raw_atomic64_inc_and_test() - atomic increment and test if zero with full ordering * @v: pointer to atomic64_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic64_inc_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic64_inc_and_test(atomic64_t *v) { #if defined(arch_atomic64_inc_and_test) return arch_atomic64_inc_and_test(v); #else return raw_atomic64_inc_return(v) == 0; #endif } /** * raw_atomic64_add_negative() - atomic add and test if negative with full ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic64_add_negative() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic64_add_negative(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_negative) return arch_atomic64_add_negative(i, v); #elif defined(arch_atomic64_add_negative_relaxed) bool ret; __atomic_pre_full_fence(); ret = arch_atomic64_add_negative_relaxed(i, v); __atomic_post_full_fence(); return ret; #else return raw_atomic64_add_return(i, v) < 0; #endif } /** * raw_atomic64_add_negative_acquire() - atomic add and test if negative with acquire ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic64_add_negative_acquire() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic64_add_negative_acquire(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_negative_acquire) return arch_atomic64_add_negative_acquire(i, v); #elif defined(arch_atomic64_add_negative_relaxed) bool ret = arch_atomic64_add_negative_relaxed(i, v); __atomic_acquire_fence(); return ret; #elif defined(arch_atomic64_add_negative) return arch_atomic64_add_negative(i, v); #else return raw_atomic64_add_return_acquire(i, v) < 0; #endif } /** * raw_atomic64_add_negative_release() - atomic add and test if negative with release ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic64_add_negative_release() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic64_add_negative_release(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_negative_release) return arch_atomic64_add_negative_release(i, v); #elif defined(arch_atomic64_add_negative_relaxed) __atomic_release_fence(); return arch_atomic64_add_negative_relaxed(i, v); #elif defined(arch_atomic64_add_negative) return arch_atomic64_add_negative(i, v); #else return raw_atomic64_add_return_release(i, v) < 0; #endif } /** * raw_atomic64_add_negative_relaxed() - atomic add and test if negative with relaxed ordering * @i: s64 value to add * @v: pointer to atomic64_t * * Atomically updates @v to (@v + @i) with relaxed ordering. 
* * Safe to use in noinstr code; prefer atomic64_add_negative_relaxed() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic64_add_negative_relaxed(s64 i, atomic64_t *v) { #if defined(arch_atomic64_add_negative_relaxed) return arch_atomic64_add_negative_relaxed(i, v); #elif defined(arch_atomic64_add_negative) return arch_atomic64_add_negative(i, v); #else return raw_atomic64_add_return_relaxed(i, v) < 0; #endif } /** * raw_atomic64_fetch_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic64_t * @a: s64 value to add * @u: s64 value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_fetch_add_unless() elsewhere. * * Return: The original value of @v. */ static __always_inline s64 raw_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) { #if defined(arch_atomic64_fetch_add_unless) return arch_atomic64_fetch_add_unless(v, a, u); #else s64 c = raw_atomic64_read(v); do { if (unlikely(c == u)) break; } while (!raw_atomic64_try_cmpxchg(v, &c, c + a)); return c; #endif } /** * raw_atomic64_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic64_t * @a: s64 value to add * @u: s64 value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_add_unless() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic64_add_unless(atomic64_t *v, s64 a, s64 u) { #if defined(arch_atomic64_add_unless) return arch_atomic64_add_unless(v, a, u); #else return raw_atomic64_fetch_add_unless(v, a, u) != u; #endif } /** * raw_atomic64_inc_not_zero() - atomic increment unless zero with full ordering * @v: pointer to atomic64_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_inc_not_zero() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic64_inc_not_zero(atomic64_t *v) { #if defined(arch_atomic64_inc_not_zero) return arch_atomic64_inc_not_zero(v); #else return raw_atomic64_add_unless(v, 1, 0); #endif } /** * raw_atomic64_inc_unless_negative() - atomic increment unless negative with full ordering * @v: pointer to atomic64_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_inc_unless_negative() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic64_inc_unless_negative(atomic64_t *v) { #if defined(arch_atomic64_inc_unless_negative) return arch_atomic64_inc_unless_negative(v); #else s64 c = raw_atomic64_read(v); do { if (unlikely(c < 0)) return false; } while (!raw_atomic64_try_cmpxchg(v, &c, c + 1)); return true; #endif } /** * raw_atomic64_dec_unless_positive() - atomic decrement unless positive with full ordering * @v: pointer to atomic64_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_dec_unless_positive() elsewhere. 
* * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic64_dec_unless_positive(atomic64_t *v) { #if defined(arch_atomic64_dec_unless_positive) return arch_atomic64_dec_unless_positive(v); #else s64 c = raw_atomic64_read(v); do { if (unlikely(c > 0)) return false; } while (!raw_atomic64_try_cmpxchg(v, &c, c - 1)); return true; #endif } /** * raw_atomic64_dec_if_positive() - atomic decrement if positive with full ordering * @v: pointer to atomic64_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_dec_if_positive() elsewhere. * * Return: The old value of (@v - 1), regardless of whether @v was updated. */ static __always_inline s64 raw_atomic64_dec_if_positive(atomic64_t *v) { #if defined(arch_atomic64_dec_if_positive) return arch_atomic64_dec_if_positive(v); #else s64 dec, c = raw_atomic64_read(v); do { dec = c - 1; if (unlikely(dec < 0)) break; } while (!raw_atomic64_try_cmpxchg(v, &c, dec)); return dec; #endif } #endif /* _LINUX_ATOMIC_FALLBACK_H */ // 206314f82b8b73a5c3aa69cf7f35ac9e7b5d6b58 |
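The fallback ladders in the header above derive acquire/release variants from a relaxed primitive plus an explicit fence whenever an architecture only provides arch_atomic64_*_relaxed(). The following is an illustrative user-space sketch of that pattern using C11 atomics; the function names are invented for the example, and the kernel's __atomic_acquire_fence()/__atomic_release_fence() stand in for arch-specific barriers.

#include <stdatomic.h>
#include <stdint.h>

/* xchg with acquire semantics built from a relaxed exchange plus an
 * acquire fence, mirroring the arch_atomic64_xchg_relaxed #elif branch. */
static int64_t xchg_acquire_from_relaxed(_Atomic int64_t *v, int64_t new_val)
{
	int64_t old = atomic_exchange_explicit(v, new_val, memory_order_relaxed);

	atomic_thread_fence(memory_order_acquire);
	return old;
}

/* xchg with release semantics: release fence first, then the relaxed RMW. */
static int64_t xchg_release_from_relaxed(_Atomic int64_t *v, int64_t new_val)
{
	atomic_thread_fence(memory_order_release);
	return atomic_exchange_explicit(v, new_val, memory_order_relaxed);
}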
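Likewise, raw_atomic64_try_cmpxchg() above falls back to the value-returning cmpxchg() (writing the observed value back through *old on failure), and the conditional helpers such as raw_atomic64_fetch_add_unless() are retry loops built on top of it. A hedged user-space sketch of both patterns with C11 atomics, where atomic_compare_exchange_strong() already provides the write-back-on-failure contract:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/* try_cmpxchg on top of compare-exchange: returns true on success and,
 * on failure, leaves the value actually observed in *old (the same
 * contract the kernel fallback implements with "if (r != o) *old = r;"). */
static bool try_cmpxchg64(_Atomic int64_t *v, int64_t *old, int64_t new_val)
{
	return atomic_compare_exchange_strong(v, old, new_val);
}

/* Retry loop in the style of raw_atomic64_fetch_add_unless(): add @a
 * unless the current value equals @u; returns the value seen before any
 * update was attempted. */
static int64_t fetch_add_unless64(_Atomic int64_t *v, int64_t a, int64_t u)
{
	int64_t c = atomic_load_explicit(v, memory_order_relaxed);

	do {
		if (c == u)
			break;
	} while (!try_cmpxchg64(v, &c, c + a));

	return c;
}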
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/btree.c * * Copyright (C) 2001 * Brad Boyer (flar@allandria.com) * (C) 2003 Ardis Technologies <roman@ardistech.com> * * Handle opening/closing btree */ #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/log2.h> #include "hfsplus_fs.h" #include "hfsplus_raw.h" /* * The initial clump size calculation code was taken * from http://opensource.apple.com/tarballs/diskdev_cmds/ */ #define CLUMP_ENTRIES 15 static short clumptbl[CLUMP_ENTRIES * 3] = { /* * Volume Attributes Catalog Extents * Size Clump (MB) Clump (MB) Clump (MB) */ /* 1GB */ 4, 4, 4, /* 2GB */ 6, 6, 4, /* 4GB */ 8, 8, 4, /* 8GB */ 11, 11, 5, /* * For volumes 16GB and larger, we want to make sure that a full OS * install won't require fragmentation of the Catalog or Attributes * B-trees. We do this by making the clump sizes sufficiently large, * and by leaving a gap after the B-trees for them to grow into.
* * For SnowLeopard 10A298, a FullNetInstall with all packages selected * results in: * Catalog B-tree Header * nodeSize: 8192 * totalNodes: 31616 * freeNodes: 1978 * (used = 231.55 MB) * Attributes B-tree Header * nodeSize: 8192 * totalNodes: 63232 * freeNodes: 958 * (used = 486.52 MB) * * We also want Time Machine backup volumes to have a sufficiently * large clump size to reduce fragmentation. * * The series of numbers for Catalog and Attribute form a geometric * series. For Catalog (16GB to 512GB), each term is 8**(1/5) times * the previous term. For Attributes (16GB to 512GB), each term is * 4**(1/5) times the previous term. For 1TB to 16TB, each term is * 2**(1/5) times the previous term. */ /* 16GB */ 64, 32, 5, /* 32GB */ 84, 49, 6, /* 64GB */ 111, 74, 7, /* 128GB */ 147, 111, 8, /* 256GB */ 194, 169, 9, /* 512GB */ 256, 256, 11, /* 1TB */ 294, 294, 14, /* 2TB */ 338, 338, 16, /* 4TB */ 388, 388, 20, /* 8TB */ 446, 446, 25, /* 16TB */ 512, 512, 32 }; u32 hfsplus_calc_btree_clump_size(u32 block_size, u32 node_size, u64 sectors, int file_id) { u32 mod = max(node_size, block_size); u32 clump_size; int column; int i; /* Figure out which column of the above table to use for this file. */ switch (file_id) { case HFSPLUS_ATTR_CNID: column = 0; break; case HFSPLUS_CAT_CNID: column = 1; break; default: column = 2; break; } /* * The default clump size is 0.8% of the volume size. And * it must also be a multiple of the node and block size. */ if (sectors < 0x200000) { clump_size = sectors << 2; /* 0.8 % */ if (clump_size < (8 * node_size)) clump_size = 8 * node_size; } else { /* turn exponent into table index... */ for (i = 0, sectors = sectors >> 22; sectors && (i < CLUMP_ENTRIES - 1); ++i, sectors = sectors >> 1) { /* empty body */ } clump_size = clumptbl[column + (i) * 3] * 1024 * 1024; } /* * Round the clump size to a multiple of node and block size. * NOTE: This rounds down. */ clump_size /= mod; clump_size *= mod; /* * Rounding down could have rounded down to 0 if the block size was * greater than the clump size. If so, just use one block or node. */ if (clump_size == 0) clump_size = mod; return clump_size; } /* Context for iterating b-tree map pages * @page_idx: The index of the page within the b-node's page array * @off: The byte offset within the mapped page * @len: The remaining length of the map record */ struct hfs_bmap_ctx { unsigned int page_idx; unsigned int off; u16 len; }; /* * Finds the specific page containing the requested byte offset within the map * record. Automatically handles the difference between header and map nodes. * Returns the struct page pointer, or an ERR_PTR on failure. * Note: The caller is responsible for mapping/unmapping the returned page. 
*/ static struct page *hfs_bmap_get_map_page(struct hfs_bnode *node, struct hfs_bmap_ctx *ctx, u32 byte_offset) { u16 rec_idx, off16; unsigned int page_off; if (node->this == HFSPLUS_TREE_HEAD) { if (node->type != HFS_NODE_HEADER) { pr_err("hfsplus: invalid btree header node\n"); return ERR_PTR(-EIO); } rec_idx = HFSPLUS_BTREE_HDR_MAP_REC_INDEX; } else { if (node->type != HFS_NODE_MAP) { pr_err("hfsplus: invalid btree map node\n"); return ERR_PTR(-EIO); } rec_idx = HFSPLUS_BTREE_MAP_NODE_REC_INDEX; } ctx->len = hfs_brec_lenoff(node, rec_idx, &off16); if (!ctx->len) return ERR_PTR(-ENOENT); if (!is_bnode_offset_valid(node, off16)) return ERR_PTR(-EIO); ctx->len = check_and_correct_requested_length(node, off16, ctx->len); if (byte_offset >= ctx->len) return ERR_PTR(-EINVAL); page_off = (u32)off16 + node->page_offset + byte_offset; ctx->page_idx = page_off >> PAGE_SHIFT; ctx->off = page_off & ~PAGE_MASK; return node->page[ctx->page_idx]; } /** * hfs_bmap_test_bit - test a bit in the b-tree map * @node: the b-tree node containing the map record * @node_bit_idx: the relative bit index within the node's map record * * Returns true if set, false if clear or on failure. */ static bool hfs_bmap_test_bit(struct hfs_bnode *node, u32 node_bit_idx) { struct hfs_bmap_ctx ctx; struct page *page; u8 *bmap, byte, mask; page = hfs_bmap_get_map_page(node, &ctx, node_bit_idx / BITS_PER_BYTE); if (IS_ERR(page)) return false; bmap = kmap_local_page(page); byte = bmap[ctx.off]; kunmap_local(bmap); mask = 1 << (7 - (node_bit_idx % BITS_PER_BYTE)); return (byte & mask) != 0; } /** * hfs_bmap_clear_bit - clear a bit in the b-tree map * @node: the b-tree node containing the map record * @node_bit_idx: the relative bit index within the node's map record * * Returns 0 on success, -EINVAL if already clear, or negative error code. 
*/ static int hfs_bmap_clear_bit(struct hfs_bnode *node, u32 node_bit_idx) { struct hfs_bmap_ctx ctx; struct page *page; u8 *bmap, mask; page = hfs_bmap_get_map_page(node, &ctx, node_bit_idx / BITS_PER_BYTE); if (IS_ERR(page)) return PTR_ERR(page); bmap = kmap_local_page(page); mask = 1 << (7 - (node_bit_idx % BITS_PER_BYTE)); if (!(bmap[ctx.off] & mask)) { kunmap_local(bmap); return -EINVAL; } bmap[ctx.off] &= ~mask; set_page_dirty(page); kunmap_local(bmap); return 0; } #define HFS_EXTENT_TREE_NAME "Extents Overflow File" #define HFS_CATALOG_TREE_NAME "Catalog File" #define HFS_ATTR_TREE_NAME "Attributes File" #define HFS_UNKNOWN_TREE_NAME "Unknown B-tree" static const char *hfs_btree_name(u32 cnid) { switch (cnid) { case HFSPLUS_EXT_CNID: return HFS_EXTENT_TREE_NAME; case HFSPLUS_CAT_CNID: return HFS_CATALOG_TREE_NAME; case HFSPLUS_ATTR_CNID: return HFS_ATTR_TREE_NAME; default: return HFS_UNKNOWN_TREE_NAME; } } /* Get a reference to a B*Tree and do some initial checks */ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) { struct hfs_btree *tree; struct hfs_btree_header_rec *head; struct address_space *mapping; struct hfs_bnode *node; struct inode *inode; struct page *page; unsigned int size; tree = kzalloc_obj(*tree); if (!tree) return NULL; mutex_init(&tree->tree_lock); spin_lock_init(&tree->hash_lock); tree->sb = sb; tree->cnid = id; inode = hfsplus_iget(sb, id); if (IS_ERR(inode)) goto free_tree; tree->inode = inode; if (!HFSPLUS_I(tree->inode)->first_blocks) { pr_err("invalid btree extent records (0 size)\n"); goto free_inode; } mapping = tree->inode->i_mapping; page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) goto free_inode; /* Load the header */ head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + sizeof(struct hfs_bnode_desc)); tree->root = be32_to_cpu(head->root); tree->leaf_count = be32_to_cpu(head->leaf_count); tree->leaf_head = be32_to_cpu(head->leaf_head); tree->leaf_tail = be32_to_cpu(head->leaf_tail); tree->node_count = be32_to_cpu(head->node_count); tree->free_nodes = be32_to_cpu(head->free_nodes); tree->attributes = be32_to_cpu(head->attributes); tree->node_size = be16_to_cpu(head->node_size); tree->max_key_len = be16_to_cpu(head->max_key_len); tree->depth = be16_to_cpu(head->depth); /* Verify the tree and set the correct compare function */ switch (id) { case HFSPLUS_EXT_CNID: if (tree->max_key_len != HFSPLUS_EXT_KEYLEN - sizeof(u16)) { pr_err("invalid extent max_key_len %d\n", tree->max_key_len); goto fail_page; } if (tree->attributes & HFS_TREE_VARIDXKEYS) { pr_err("invalid extent btree flag\n"); goto fail_page; } tree->keycmp = hfsplus_ext_cmp_key; break; case HFSPLUS_CAT_CNID: if (tree->max_key_len != HFSPLUS_CAT_KEYLEN - sizeof(u16)) { pr_err("invalid catalog max_key_len %d\n", tree->max_key_len); goto fail_page; } if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) { pr_err("invalid catalog btree flag\n"); goto fail_page; } if (test_bit(HFSPLUS_SB_HFSX, &HFSPLUS_SB(sb)->flags) && (head->key_type == HFSPLUS_KEY_BINARY)) tree->keycmp = hfsplus_cat_bin_cmp_key; else { tree->keycmp = hfsplus_cat_case_cmp_key; set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); } break; case HFSPLUS_ATTR_CNID: if (tree->max_key_len != HFSPLUS_ATTR_KEYLEN - sizeof(u16)) { pr_err("invalid attributes max_key_len %d\n", tree->max_key_len); goto fail_page; } tree->keycmp = hfsplus_attr_bin_cmp_key; break; default: pr_err("unknown B*Tree requested\n"); goto fail_page; } if (!(tree->attributes & HFS_TREE_BIGKEYS)) { pr_err("invalid btree flag\n"); goto 
fail_page; } size = tree->node_size; if (!is_power_of_2(size)) goto fail_page; if (!tree->node_count) goto fail_page; tree->node_size_shift = ffs(size) - 1; tree->pages_per_bnode = (tree->node_size + PAGE_SIZE - 1) >> PAGE_SHIFT; kunmap_local(head); put_page(page); node = hfs_bnode_find(tree, HFSPLUS_TREE_HEAD); if (IS_ERR(node)) goto free_inode; if (!hfs_bmap_test_bit(node, 0)) { pr_warn("(%s): %s (cnid 0x%x) map record invalid or bitmap corruption detected, forcing read-only.\n", sb->s_id, hfs_btree_name(id), id); pr_warn("Run fsck.hfsplus to repair.\n"); sb->s_flags |= SB_RDONLY; } hfs_bnode_put(node); return tree; fail_page: kunmap_local(head); put_page(page); free_inode: tree->inode->i_mapping->a_ops = &hfsplus_aops; iput(tree->inode); free_tree: kfree(tree); return NULL; } /* Release resources used by a btree */ void hfs_btree_close(struct hfs_btree *tree) { struct hfs_bnode *node; int i; if (!tree) return; for (i = 0; i < NODE_HASH_SIZE; i++) { while ((node = tree->node_hash[i])) { tree->node_hash[i] = node->next_hash; if (atomic_read(&node->refcnt)) pr_crit("node %d:%d " "still has %d user(s)!\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); hfs_bnode_free(node); tree->node_hash_cnt--; } } iput(tree->inode); kfree(tree); } int hfs_btree_write(struct hfs_btree *tree) { struct hfs_btree_header_rec *head; struct hfs_bnode *node; struct page *page; node = hfs_bnode_find(tree, 0); if (IS_ERR(node)) /* panic? */ return -EIO; /* Load the header */ page = node->page[0]; head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + sizeof(struct hfs_bnode_desc)); head->root = cpu_to_be32(tree->root); head->leaf_count = cpu_to_be32(tree->leaf_count); head->leaf_head = cpu_to_be32(tree->leaf_head); head->leaf_tail = cpu_to_be32(tree->leaf_tail); head->node_count = cpu_to_be32(tree->node_count); head->free_nodes = cpu_to_be32(tree->free_nodes); head->attributes = cpu_to_be32(tree->attributes); head->depth = cpu_to_be16(tree->depth); kunmap_local(head); set_page_dirty(page); hfs_bnode_put(node); return 0; } static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx) { struct hfs_btree *tree = prev->tree; struct hfs_bnode *node; struct hfs_bnode_desc desc; __be32 cnid; node = hfs_bnode_create(tree, idx); if (IS_ERR(node)) return node; tree->free_nodes--; prev->next = idx; cnid = cpu_to_be32(idx); hfs_bnode_write(prev, &cnid, offsetof(struct hfs_bnode_desc, next), 4); node->type = HFS_NODE_MAP; node->num_recs = 1; hfs_bnode_clear(node, 0, tree->node_size); desc.next = 0; desc.prev = 0; desc.type = HFS_NODE_MAP; desc.height = 0; desc.num_recs = cpu_to_be16(1); desc.reserved = 0; hfs_bnode_write(node, &desc, 0, sizeof(desc)); hfs_bnode_write_u16(node, 14, 0x8000); hfs_bnode_write_u16(node, tree->node_size - 2, 14); hfs_bnode_write_u16(node, tree->node_size - 4, tree->node_size - 6); return node; } /* Make sure @tree has enough space for the @rsvd_nodes */ int hfs_bmap_reserve(struct hfs_btree *tree, u32 rsvd_nodes) { struct inode *inode = tree->inode; struct hfsplus_inode_info *hip = HFSPLUS_I(inode); u32 count; int res; lockdep_assert_held(&tree->tree_lock); if (rsvd_nodes <= 0) return 0; while (tree->free_nodes < rsvd_nodes) { res = hfsplus_file_extend(inode, hfs_bnode_need_zeroout(tree)); if (res) return res; hip->phys_size = inode->i_size = (loff_t)hip->alloc_blocks << HFSPLUS_SB(tree->sb)->alloc_blksz_shift; hip->fs_blocks = hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift; inode_set_bytes(inode, inode->i_size); count = inode->i_size >> tree->node_size_shift; 
tree->free_nodes += count - tree->node_count; tree->node_count = count; } return 0; } struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) { struct hfs_bnode *node, *next_node; struct hfs_bmap_ctx ctx; struct page *page; u32 nidx, idx; u8 *data, byte, m; int i, res; lockdep_assert_held(&tree->tree_lock); res = hfs_bmap_reserve(tree, 1); if (res) return ERR_PTR(res); nidx = 0; node = hfs_bnode_find(tree, nidx); if (IS_ERR(node)) return node; page = hfs_bmap_get_map_page(node, &ctx, 0); if (IS_ERR(page)) { res = PTR_ERR(page); hfs_bnode_put(node); return ERR_PTR(res); } data = kmap_local_page(page); idx = 0; for (;;) { while (ctx.len) { byte = data[ctx.off]; if (byte != 0xff) { for (m = 0x80, i = 0; i < 8; m >>= 1, i++) { if (!(byte & m)) { idx += i; data[ctx.off] |= m; set_page_dirty(page); kunmap_local(data); tree->free_nodes--; hfs_btree_write(tree); mark_inode_dirty(tree->inode); hfs_bnode_put(node); return hfs_bnode_create(tree, idx); } } } if (++ctx.off >= PAGE_SIZE) { kunmap_local(data); page = node->page[++ctx.page_idx]; data = kmap_local_page(page); ctx.off = 0; } idx += 8; ctx.len--; } kunmap_local(data); nidx = node->next; if (!nidx) { hfs_dbg("create new bmap node\n"); next_node = hfs_bmap_new_bmap(node, idx); hfs_btree_write(tree); } else next_node = hfs_bnode_find(tree, nidx); hfs_bnode_put(node); if (IS_ERR(next_node)) return next_node; node = next_node; page = hfs_bmap_get_map_page(node, &ctx, 0); if (IS_ERR(page)) { res = PTR_ERR(page); hfs_bnode_put(node); return ERR_PTR(res); } data = kmap_local_page(page); } } void hfs_bmap_free(struct hfs_bnode *node) { struct hfs_btree *tree; u16 off, len; u32 nidx; int res; hfs_dbg("node %u\n", node->this); BUG_ON(!node->this); tree = node->tree; lockdep_assert_held(&tree->tree_lock); nidx = node->this; node = hfs_bnode_find(tree, 0); if (IS_ERR(node)) return; len = hfs_brec_lenoff(node, 2, &off); while (nidx >= len * 8) { u32 i; nidx -= len * 8; i = node->next; if (!i) { /* panic */; pr_crit("unable to free bnode %u. " "bmap not found!\n", node->this); hfs_bnode_put(node); return; } hfs_bnode_put(node); node = hfs_bnode_find(tree, i); if (IS_ERR(node)) return; if (node->type != HFS_NODE_MAP) { /* panic */; pr_crit("invalid bmap found! " "(%u,%d)\n", node->this, node->type); hfs_bnode_put(node); return; } len = hfs_brec_lenoff(node, 0, &off); } res = hfs_bmap_clear_bit(node, nidx); if (res == -EINVAL) { pr_crit("trying to free the freed bnode %u(%d)\n", nidx, node->type); } else if (res) { pr_crit("fail to free bnode %u(%d)\n", nidx, node->type); } else { tree->free_nodes++; hfs_btree_write(tree); mark_inode_dirty(tree->inode); } hfs_bnode_put(node); } |
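For reference, here is a stand-alone user-space sketch of the clump-size selection performed by hfsplus_calc_btree_clump_size() above: pick the table column by file (0 attributes, 1 catalog, 2 extents), index the table row by volume size, then round down to a multiple of max(node_size, block_size). The constants mirror clumptbl[]; this is an illustration with invented names, not the kernel function, and the file-id switch is simplified to a plain column parameter.

#include <stdint.h>
#include <stdio.h>

#define CLUMP_ENTRIES 15

/* Columns: attributes, catalog, extents -- clump sizes in MB,
 * same values as clumptbl[] above. */
static const short clumptbl[CLUMP_ENTRIES * 3] = {
	/*   1GB */   4,   4,  4,
	/*   2GB */   6,   6,  4,
	/*   4GB */   8,   8,  4,
	/*   8GB */  11,  11,  5,
	/*  16GB */  64,  32,  5,
	/*  32GB */  84,  49,  6,
	/*  64GB */ 111,  74,  7,
	/* 128GB */ 147, 111,  8,
	/* 256GB */ 194, 169,  9,
	/* 512GB */ 256, 256, 11,
	/*   1TB */ 294, 294, 14,
	/*   2TB */ 338, 338, 16,
	/*   4TB */ 388, 388, 20,
	/*   8TB */ 446, 446, 25,
	/*  16TB */ 512, 512, 32
};

static uint32_t calc_clump_size(uint32_t block_size, uint32_t node_size,
				uint64_t sectors, int column)
{
	uint32_t mod = node_size > block_size ? node_size : block_size;
	uint32_t clump_size;
	int i;

	if (sectors < 0x200000) {
		/* Small volume: roughly 0.8% of the volume size,
		 * but at least eight nodes. */
		clump_size = (uint32_t)(sectors << 2);
		if (clump_size < 8 * node_size)
			clump_size = 8 * node_size;
	} else {
		/* Turn the size exponent into a table row index. */
		for (i = 0, sectors >>= 22;
		     sectors && i < CLUMP_ENTRIES - 1;
		     ++i, sectors >>= 1)
			; /* empty body */
		clump_size = clumptbl[column + i * 3] * 1024 * 1024;
	}

	/* Round down to a multiple of node and block size, falling back
	 * to a single unit if rounding would reach zero. */
	clump_size /= mod;
	clump_size *= mod;
	if (clump_size == 0)
		clump_size = mod;
	return clump_size;
}

int main(void)
{
	/* Example: 64GB volume (512-byte sectors), catalog column (1). */
	printf("%u\n", calc_clump_size(4096, 8192, 64ULL << 21, 1));
	return 0;
}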
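The map-record helpers above (hfs_bmap_test_bit(), hfs_bmap_clear_bit() and the byte-at-a-time scan in hfs_bmap_alloc()) all treat the allocation bitmap as MSB-first: bit 0 is the most significant bit of byte 0. A small user-space sketch of that convention, with invented helper names:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Bit @idx counts from the MSB of byte 0, as in the HFS+ b-tree map. */
static bool bmap_test_bit(const uint8_t *map, uint32_t idx)
{
	uint8_t mask = 1u << (7 - (idx % 8));

	return (map[idx / 8] & mask) != 0;
}

static void bmap_clear_bit(uint8_t *map, uint32_t idx)
{
	map[idx / 8] &= ~(1u << (7 - (idx % 8)));
}

/* First-fit scan in the style of hfs_bmap_alloc(): walk the map a byte
 * at a time, and only inspect individual bits once a byte is not 0xff. */
static int32_t bmap_alloc(uint8_t *map, size_t len)
{
	for (size_t i = 0; i < len; i++) {
		uint8_t byte = map[i];

		if (byte == 0xff)
			continue;
		for (uint8_t m = 0x80, b = 0; b < 8; m >>= 1, b++) {
			if (!(byte & m)) {
				map[i] |= m;
				return (int32_t)(i * 8 + b);
			}
		}
	}
	return -1;	/* map full */
}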
// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2022 Christian Brauner <brauner@kernel.org> */ #include <linux/cred.h> #include <linux/fs.h> #include <linux/mnt_idmapping.h> #include <linux/slab.h> #include <linux/user_namespace.h> #include <linux/seq_file.h> #include "internal.h" /* * Outside of this file vfs{g,u}id_t are always created from k{g,u}id_t, * never from raw values. These are just internal helpers. */ #define VFSUIDT_INIT_RAW(val) (vfsuid_t){ val } #define VFSGIDT_INIT_RAW(val) (vfsgid_t){ val } struct mnt_idmap { struct uid_gid_map uid_map; struct uid_gid_map gid_map; refcount_t count; }; /* * Carries the initial idmapping of 0:0:4294967295 which is an identity * mapping. This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...]. */ struct mnt_idmap nop_mnt_idmap = { .count = REFCOUNT_INIT(1), }; EXPORT_SYMBOL_GPL(nop_mnt_idmap); /* * Carries the invalid idmapping of a full 0-4294967295 {g,u}id range. * This means that all {g,u}ids are mapped to INVALID_VFS{G,U}ID. */ struct mnt_idmap invalid_mnt_idmap = { .count = REFCOUNT_INIT(1), }; EXPORT_SYMBOL_GPL(invalid_mnt_idmap); /** * initial_idmapping - check whether this is the initial mapping * @ns: idmapping to check * * Check whether this is the initial mapping, mapping 0 to 0, 1 to 1, * [...], 1000 to 1000 [...]. * * Return: true if this is the initial mapping, false if not. */ static inline bool initial_idmapping(const struct user_namespace *ns) { return ns == &init_user_ns; } /** * make_vfsuid - map a filesystem kuid according to an idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @kuid : kuid to be mapped * * Take a @kuid and remap it from @fs_userns into @idmap. Use this * function when preparing a @kuid to be reported to userspace. * * If initial_idmapping() determines that this is not an idmapped mount * we can simply return @kuid unchanged.
* If initial_idmapping() tells us that the filesystem is not mounted with an * idmapping we know the value of @kuid won't change when calling * from_kuid() so we can simply retrieve the value via __kuid_val() * directly. * * Return: @kuid mapped according to @idmap. * If @kuid has no mapping in either @idmap or @fs_userns INVALID_UID is * returned. */ vfsuid_t make_vfsuid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, kuid_t kuid) { uid_t uid; if (idmap == &nop_mnt_idmap) return VFSUIDT_INIT(kuid); if (idmap == &invalid_mnt_idmap) return INVALID_VFSUID; if (initial_idmapping(fs_userns)) uid = __kuid_val(kuid); else uid = from_kuid(fs_userns, kuid); if (uid == (uid_t)-1) return INVALID_VFSUID; return VFSUIDT_INIT_RAW(map_id_down(&idmap->uid_map, uid)); } EXPORT_SYMBOL_GPL(make_vfsuid); /** * make_vfsgid - map a filesystem kgid according to an idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @kgid : kgid to be mapped * * Take a @kgid and remap it from @fs_userns into @idmap. Use this * function when preparing a @kgid to be reported to userspace. * * If initial_idmapping() determines that this is not an idmapped mount * we can simply return @kgid unchanged. * If initial_idmapping() tells us that the filesystem is not mounted with an * idmapping we know the value of @kgid won't change when calling * from_kgid() so we can simply retrieve the value via __kgid_val() * directly. * * Return: @kgid mapped according to @idmap. * If @kgid has no mapping in either @idmap or @fs_userns INVALID_GID is * returned. */ vfsgid_t make_vfsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, kgid_t kgid) { gid_t gid; if (idmap == &nop_mnt_idmap) return VFSGIDT_INIT(kgid); if (idmap == &invalid_mnt_idmap) return INVALID_VFSGID; if (initial_idmapping(fs_userns)) gid = __kgid_val(kgid); else gid = from_kgid(fs_userns, kgid); if (gid == (gid_t)-1) return INVALID_VFSGID; return VFSGIDT_INIT_RAW(map_id_down(&idmap->gid_map, gid)); } EXPORT_SYMBOL_GPL(make_vfsgid); /** * from_vfsuid - map a vfsuid into the filesystem idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @vfsuid : vfsuid to be mapped * * Map @vfsuid into the filesystem idmapping. This function has to be used in * order to e.g. write @vfsuid to inode->i_uid. * * Return: @vfsuid mapped into the filesystem idmapping */ kuid_t from_vfsuid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsuid_t vfsuid) { uid_t uid; if (idmap == &nop_mnt_idmap) return AS_KUIDT(vfsuid); if (idmap == &invalid_mnt_idmap) return INVALID_UID; uid = map_id_up(&idmap->uid_map, __vfsuid_val(vfsuid)); if (uid == (uid_t)-1) return INVALID_UID; if (initial_idmapping(fs_userns)) return KUIDT_INIT(uid); return make_kuid(fs_userns, uid); } EXPORT_SYMBOL_GPL(from_vfsuid); /** * from_vfsgid - map a vfsgid into the filesystem idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @vfsgid : vfsgid to be mapped * * Map @vfsgid into the filesystem idmapping. This function has to be used in * order to e.g. write @vfsgid to inode->i_gid. 
* * Return: @vfsgid mapped into the filesystem idmapping */ kgid_t from_vfsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsgid_t vfsgid) { gid_t gid; if (idmap == &nop_mnt_idmap) return AS_KGIDT(vfsgid); if (idmap == &invalid_mnt_idmap) return INVALID_GID; gid = map_id_up(&idmap->gid_map, __vfsgid_val(vfsgid)); if (gid == (gid_t)-1) return INVALID_GID; if (initial_idmapping(fs_userns)) return KGIDT_INIT(gid); return make_kgid(fs_userns, gid); } EXPORT_SYMBOL_GPL(from_vfsgid); #ifdef CONFIG_MULTIUSER /** * vfsgid_in_group_p() - check whether a vfsuid matches the caller's groups * @vfsgid: the mnt gid to match * * This function can be used to determine whether @vfsuid matches any of the * caller's groups. * * Return: 1 if vfsuid matches caller's groups, 0 if not. */ int vfsgid_in_group_p(vfsgid_t vfsgid) { return in_group_p(AS_KGIDT(vfsgid)); } #else int vfsgid_in_group_p(vfsgid_t vfsgid) { return 1; } #endif EXPORT_SYMBOL_GPL(vfsgid_in_group_p); static int copy_mnt_idmap(struct uid_gid_map *map_from, struct uid_gid_map *map_to) { struct uid_gid_extent *forward, *reverse; u32 nr_extents = READ_ONCE(map_from->nr_extents); /* Pairs with smp_wmb() when writing the idmapping. */ smp_rmb(); /* * Don't blindly copy @map_to into @map_from if nr_extents is * smaller or equal to UID_GID_MAP_MAX_BASE_EXTENTS. Since we * read @nr_extents someone could have written an idmapping and * then we might end up with inconsistent data. So just don't do * anything at all. */ if (nr_extents == 0) return -EINVAL; /* * Here we know that nr_extents is greater than zero which means * a map has been written. Since idmappings can't be changed * once they have been written we know that we can safely copy * from @map_to into @map_from. */ if (nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { *map_to = *map_from; return 0; } forward = kmemdup_array(map_from->forward, nr_extents, sizeof(struct uid_gid_extent), GFP_KERNEL_ACCOUNT); if (!forward) return -ENOMEM; reverse = kmemdup_array(map_from->reverse, nr_extents, sizeof(struct uid_gid_extent), GFP_KERNEL_ACCOUNT); if (!reverse) { kfree(forward); return -ENOMEM; } /* * The idmapping isn't exposed anywhere so we don't need to care * about ordering between extent pointers and @nr_extents * initialization. */ map_to->forward = forward; map_to->reverse = reverse; map_to->nr_extents = nr_extents; return 0; } static void free_mnt_idmap(struct mnt_idmap *idmap) { if (idmap->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(idmap->uid_map.forward); kfree(idmap->uid_map.reverse); } if (idmap->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(idmap->gid_map.forward); kfree(idmap->gid_map.reverse); } kfree(idmap); } struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns) { struct mnt_idmap *idmap; int ret; idmap = kzalloc_obj(struct mnt_idmap, GFP_KERNEL_ACCOUNT); if (!idmap) return ERR_PTR(-ENOMEM); refcount_set(&idmap->count, 1); ret = copy_mnt_idmap(&mnt_userns->uid_map, &idmap->uid_map); if (!ret) ret = copy_mnt_idmap(&mnt_userns->gid_map, &idmap->gid_map); if (ret) { free_mnt_idmap(idmap); idmap = ERR_PTR(ret); } return idmap; } /** * mnt_idmap_get - get a reference to an idmapping * @idmap: the idmap to bump the reference on * * If @idmap is not the @nop_mnt_idmap bump the reference count. * * Return: @idmap with reference count bumped if @not_mnt_idmap isn't passed. 
*/ struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap) { if (idmap != &nop_mnt_idmap && idmap != &invalid_mnt_idmap) refcount_inc(&idmap->count); return idmap; } EXPORT_SYMBOL_GPL(mnt_idmap_get); /** * mnt_idmap_put - put a reference to an idmapping * @idmap: the idmap to put the reference on * * If this is a non-initial idmapping, put the reference count when a mount is * released and free it if we're the last user. */ void mnt_idmap_put(struct mnt_idmap *idmap) { if (idmap != &nop_mnt_idmap && idmap != &invalid_mnt_idmap && refcount_dec_and_test(&idmap->count)) free_mnt_idmap(idmap); } EXPORT_SYMBOL_GPL(mnt_idmap_put); int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map) { struct uid_gid_map *map, *map_up; u32 idx, nr_mappings; if (!is_valid_mnt_idmap(idmap)) return 0; /* * Idmappings are shown relative to the caller's idmapping. * This is both the most intuitive and most useful solution. */ if (uid_map) { map = &idmap->uid_map; map_up = ¤t_user_ns()->uid_map; } else { map = &idmap->gid_map; map_up = ¤t_user_ns()->gid_map; } for (idx = 0, nr_mappings = 0; idx < map->nr_extents; idx++) { uid_t lower; struct uid_gid_extent *extent; if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = &map->extent[idx]; else extent = &map->forward[idx]; /* * Verify that the whole range of the mapping can be * resolved in the caller's idmapping. If it cannot be * resolved skip the mapping. */ lower = map_id_range_up(map_up, extent->lower_first, extent->count); if (lower == (uid_t) -1) continue; seq_printf(seq, "%u %u %u", extent->first, lower, extent->count); seq->count++; /* mappings are separated by \0 */ if (seq_has_overflowed(seq)) return -EAGAIN; nr_mappings++; } return nr_mappings; } |
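make_vfsuid()/make_vfsgid() above compose two translations: first resolve the kuid/kgid via from_kuid()/from_kgid() in the filesystem's user namespace (or take the raw value when that namespace is the initial one), then map the result down through the mount's idmapping; from_vfsuid()/from_vfsgid() go back the other way with map_id_up(). A purely numeric user-space sketch of that composition, reduced to a single extent and using invented names:

#include <stdint.h>
#include <stdio.h>

/* One idmapping extent: [lower_first, lower_first + count) in the lower
 * (filesystem) view corresponds to [first, first + count) in the mapped
 * (mount) view. */
struct extent {
	uint32_t first;
	uint32_t lower_first;
	uint32_t count;
};

#define INVALID_ID ((uint32_t)-1)

/* Analogue of map_id_down(): lower (filesystem) id -> mapped (mount) id. */
static uint32_t id_down(const struct extent *e, uint32_t id)
{
	if (id >= e->lower_first && id - e->lower_first < e->count)
		return e->first + (id - e->lower_first);
	return INVALID_ID;
}

/* Analogue of map_id_up(): the inverse, as used by from_vfsuid(). */
static uint32_t id_up(const struct extent *e, uint32_t id)
{
	if (id >= e->first && id - e->first < e->count)
		return e->lower_first + (id - e->first);
	return INVALID_ID;
}

int main(void)
{
	/* Example mount idmapping: on-disk id 0..999 shown as 10000..10999. */
	struct extent mnt = { .first = 10000, .lower_first = 0, .count = 1000 };

	printf("%u\n", id_down(&mnt, 5));	/* 10005: reported to userspace */
	printf("%u\n", id_up(&mnt, 10005));	/* 5: written back to the fs    */
	printf("%u\n", id_down(&mnt, 4000));	/* 4294967295: no mapping       */
	return 0;
}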
// SPDX-License-Identifier: GPL-2.0-or-later /* Request a key from userspace * * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com) * * See Documentation/security/keys/request-key.rst */ #include <linux/export.h> #include <linux/sched.h> #include <linux/kmod.h> #include <linux/err.h> #include <linux/keyctl.h> #include <linux/slab.h> #include <net/net_namespace.h> #include "internal.h" #include <keys/request_key_auth-type.h> #define key_negative_timeout 60 /* default timeout on a negative key's existence */ static struct key *check_cached_key(struct keyring_search_context *ctx) { #ifdef CONFIG_KEYS_REQUEST_CACHE struct key *key = current->cached_requested_key; if (key && ctx->match_data.cmp(key, &ctx->match_data) && !(key->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED)))) return key_get(key); #endif return NULL; } static void cache_requested_key(struct key *key) { #ifdef CONFIG_KEYS_REQUEST_CACHE struct task_struct *t = current; /* Do not cache key if it is a kernel thread */ if (!(t->flags & PF_KTHREAD)) { key_put(t->cached_requested_key); t->cached_requested_key = key_get(key); set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); } #endif } /** * complete_request_key - Complete the construction of a key. * @authkey: The authorisation key. * @error: The success or failure of the construction. * * Complete the attempt to construct a key. The key will be negated * if an error is indicated. The authorisation key will be revoked * unconditionally. */ void complete_request_key(struct key *authkey, int error) { struct request_key_auth *rka = get_request_key_auth(authkey); struct key *key = rka->target_key; kenter("%d{%d},%d", authkey->serial, key->serial, error); if (error < 0) key_negate_and_link(key, key_negative_timeout, NULL, authkey); else key_revoke(authkey); } EXPORT_SYMBOL(complete_request_key); /* * Initialise a usermode helper that is going to have a specific session * keyring. * * This is called in the context of a freshly forked kthread before kernel_execve(), * so we can simply install the desired session_keyring at this point. */ static int umh_keys_init(struct subprocess_info *info, struct cred *cred) { struct key *keyring = info->data; return install_session_keyring_to_cred(cred, keyring); } /* * Clean up a usermode helper with session keyring. */ static void umh_keys_cleanup(struct subprocess_info *info) { struct key *keyring = info->data; key_put(keyring); } /* * Call a usermode helper with a specific session keyring.
*/ static int call_usermodehelper_keys(const char *path, char **argv, char **envp, struct key *session_keyring, int wait) { struct subprocess_info *info; info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL, umh_keys_init, umh_keys_cleanup, session_keyring); if (!info) return -ENOMEM; key_get(session_keyring); return call_usermodehelper_exec(info, wait); } /* * Request userspace finish the construction of a key * - execute "/sbin/request-key <op> <key> <uid> <gid> <keyring> <keyring> <keyring>" */ static int call_sbin_request_key(struct key *authkey, void *aux) { static char const request_key[] = "/sbin/request-key"; struct request_key_auth *rka = get_request_key_auth(authkey); const struct cred *cred = current_cred(); key_serial_t prkey, sskey; struct key *key = rka->target_key, *keyring, *session, *user_session; char *argv[9], *envp[3], uid_str[12], gid_str[12]; char key_str[12], keyring_str[3][12]; char desc[20]; int ret, i; kenter("{%d},{%d},%s", key->serial, authkey->serial, rka->op); ret = look_up_user_keyrings(NULL, &user_session); if (ret < 0) goto error_us; /* allocate a new session keyring */ sprintf(desc, "_req.%u", key->serial); cred = get_current_cred(); keyring = keyring_alloc(desc, cred->fsuid, cred->fsgid, cred, KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); put_cred(cred); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error_alloc; } /* attach the auth key to the session keyring */ ret = key_link(keyring, authkey); if (ret < 0) goto error_link; /* record the UID and GID */ sprintf(uid_str, "%d", from_kuid(&init_user_ns, cred->fsuid)); sprintf(gid_str, "%d", from_kgid(&init_user_ns, cred->fsgid)); /* we say which key is under construction */ sprintf(key_str, "%d", key->serial); /* we specify the process's default keyrings */ sprintf(keyring_str[0], "%d", cred->thread_keyring ? cred->thread_keyring->serial : 0); prkey = 0; if (cred->process_keyring) prkey = cred->process_keyring->serial; sprintf(keyring_str[1], "%d", prkey); session = cred->session_keyring; if (!session) session = user_session; sskey = session->serial; sprintf(keyring_str[2], "%d", sskey); /* set up a minimal environment */ i = 0; envp[i++] = "HOME=/"; envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[i] = NULL; /* set up the argument list */ i = 0; argv[i++] = (char *)request_key; argv[i++] = (char *)rka->op; argv[i++] = key_str; argv[i++] = uid_str; argv[i++] = gid_str; argv[i++] = keyring_str[0]; argv[i++] = keyring_str[1]; argv[i++] = keyring_str[2]; argv[i] = NULL; /* do it */ ret = call_usermodehelper_keys(request_key, argv, envp, keyring, UMH_WAIT_PROC); kdebug("usermode -> 0x%x", ret); if (ret >= 0) { /* ret is the exit/wait code */ if (test_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags) || key_validate(key) < 0) ret = -ENOKEY; else /* ignore any errors from userspace if the key was * instantiated */ ret = 0; } error_link: key_put(keyring); error_alloc: key_put(user_session); error_us: complete_request_key(authkey, ret); kleave(" = %d", ret); return ret; } /* * Call out to userspace for key construction. * * Program failure is ignored in favour of key status. 
*/ static int construct_key(struct key *key, const void *callout_info, size_t callout_len, void *aux, struct key *dest_keyring) { request_key_actor_t actor; struct key *authkey; int ret; kenter("%d,%p,%zu,%p", key->serial, callout_info, callout_len, aux); /* allocate an authorisation key */ authkey = request_key_auth_new(key, "create", callout_info, callout_len, dest_keyring); if (IS_ERR(authkey)) return PTR_ERR(authkey); /* Make the call */ actor = call_sbin_request_key; if (key->type->request_key) actor = key->type->request_key; ret = actor(authkey, aux); /* check that the actor called complete_request_key() prior to * returning an error */ WARN_ON(ret < 0 && !test_bit(KEY_FLAG_INVALIDATED, &authkey->flags)); key_put(authkey); kleave(" = %d", ret); return ret; } /* * Get the appropriate destination keyring for the request. * * The keyring selected is returned with an extra reference upon it which the * caller must release. */ static int construct_get_dest_keyring(struct key **_dest_keyring) { struct request_key_auth *rka; const struct cred *cred = current_cred(); struct key *dest_keyring = *_dest_keyring, *authkey; int ret; kenter("%p", dest_keyring); /* find the appropriate keyring */ if (dest_keyring) { /* the caller supplied one */ key_get(dest_keyring); } else { bool do_perm_check = true; /* use a default keyring; falling through the cases until we * find one that we actually have */ switch (cred->jit_keyring) { case KEY_REQKEY_DEFL_DEFAULT: case KEY_REQKEY_DEFL_REQUESTOR_KEYRING: if (cred->request_key_auth) { authkey = cred->request_key_auth; down_read(&authkey->sem); rka = get_request_key_auth(authkey); if (!test_bit(KEY_FLAG_REVOKED, &authkey->flags)) dest_keyring = key_get(rka->dest_keyring); up_read(&authkey->sem); if (dest_keyring) { do_perm_check = false; break; } } fallthrough; case KEY_REQKEY_DEFL_THREAD_KEYRING: dest_keyring = key_get(cred->thread_keyring); if (dest_keyring) break; fallthrough; case KEY_REQKEY_DEFL_PROCESS_KEYRING: dest_keyring = key_get(cred->process_keyring); if (dest_keyring) break; fallthrough; case KEY_REQKEY_DEFL_SESSION_KEYRING: dest_keyring = key_get(cred->session_keyring); if (dest_keyring) break; fallthrough; case KEY_REQKEY_DEFL_USER_SESSION_KEYRING: ret = look_up_user_keyrings(NULL, &dest_keyring); if (ret < 0) return ret; break; case KEY_REQKEY_DEFL_USER_KEYRING: ret = look_up_user_keyrings(&dest_keyring, NULL); if (ret < 0) return ret; break; case KEY_REQKEY_DEFL_GROUP_KEYRING: default: BUG(); } /* * Require Write permission on the keyring. This is essential * because the default keyring may be the session keyring, and * joining a keyring only requires Search permission. * * However, this check is skipped for the "requestor keyring" so * that /sbin/request-key can itself use request_key() to add * keys to the original requestor's destination keyring. */ if (dest_keyring && do_perm_check) { ret = key_permission(make_key_ref(dest_keyring, 1), KEY_NEED_WRITE); if (ret) { key_put(dest_keyring); return ret; } } } *_dest_keyring = dest_keyring; kleave(" [dk %d]", key_serial(dest_keyring)); return 0; } /* * Allocate a new key in under-construction state and attempt to link it in to * the requested keyring. * * May return a key that's already under construction instead if there was a * race between two thread calling request_key(). 
*/ static int construct_alloc_key(struct keyring_search_context *ctx, struct key *dest_keyring, unsigned long flags, struct key_user *user, struct key **_key) { struct assoc_array_edit *edit = NULL; struct key *key; key_perm_t perm; key_ref_t key_ref; int ret; kenter("%s,%s,,,", ctx->index_key.type->name, ctx->index_key.description); *_key = NULL; mutex_lock(&user->cons_lock); perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR; perm |= KEY_USR_VIEW; if (ctx->index_key.type->read) perm |= KEY_POS_READ; if (ctx->index_key.type == &key_type_keyring || ctx->index_key.type->update) perm |= KEY_POS_WRITE; key = key_alloc(ctx->index_key.type, ctx->index_key.description, ctx->cred->fsuid, ctx->cred->fsgid, ctx->cred, perm, flags, NULL); if (IS_ERR(key)) goto alloc_failed; set_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags); if (dest_keyring) { ret = __key_link_lock(dest_keyring, &key->index_key); if (ret < 0) goto link_lock_failed; } /* * Attach the key to the destination keyring under lock, but we do need * to do another check just in case someone beat us to it whilst we * waited for locks. * * The caller might specify a comparison function which looks for keys * that do not exactly match but are still equivalent from the caller's * perspective. The __key_link_begin() operation must be done only after * an actual key is determined. */ mutex_lock(&key_construction_mutex); rcu_read_lock(); key_ref = search_process_keyrings_rcu(ctx); rcu_read_unlock(); if (!IS_ERR(key_ref)) goto key_already_present; if (dest_keyring) { ret = __key_link_begin(dest_keyring, &key->index_key, &edit); if (ret < 0) goto link_alloc_failed; __key_link(dest_keyring, key, &edit); } mutex_unlock(&key_construction_mutex); if (dest_keyring) __key_link_end(dest_keyring, &key->index_key, edit); mutex_unlock(&user->cons_lock); *_key = key; kleave(" = 0 [%d]", key_serial(key)); return 0; /* the key is now present - we tell the caller that we found it by * returning -EINPROGRESS */ key_already_present: key_put(key); mutex_unlock(&key_construction_mutex); key = key_ref_to_ptr(key_ref); if (dest_keyring) { ret = __key_link_begin(dest_keyring, &key->index_key, &edit); if (ret < 0) goto link_alloc_failed_unlocked; ret = __key_link_check_live_key(dest_keyring, key); if (ret == 0) __key_link(dest_keyring, key, &edit); __key_link_end(dest_keyring, &key->index_key, edit); if (ret < 0) goto link_check_failed; } mutex_unlock(&user->cons_lock); *_key = key; kleave(" = -EINPROGRESS [%d]", key_serial(key)); return -EINPROGRESS; link_check_failed: mutex_unlock(&user->cons_lock); key_put(key); kleave(" = %d [linkcheck]", ret); return ret; link_alloc_failed: mutex_unlock(&key_construction_mutex); link_alloc_failed_unlocked: __key_link_end(dest_keyring, &key->index_key, edit); link_lock_failed: mutex_unlock(&user->cons_lock); key_put(key); kleave(" = %d [prelink]", ret); return ret; alloc_failed: mutex_unlock(&user->cons_lock); kleave(" = %ld", PTR_ERR(key)); return PTR_ERR(key); } /* * Commence key construction. 
*/ static struct key *construct_key_and_link(struct keyring_search_context *ctx, const char *callout_info, size_t callout_len, void *aux, struct key *dest_keyring, unsigned long flags) { struct key_user *user; struct key *key; int ret; kenter(""); if (ctx->index_key.type == &key_type_keyring) return ERR_PTR(-EPERM); ret = construct_get_dest_keyring(&dest_keyring); if (ret) goto error; user = key_user_lookup(current_fsuid()); if (!user) { ret = -ENOMEM; goto error_put_dest_keyring; } ret = construct_alloc_key(ctx, dest_keyring, flags, user, &key); key_user_put(user); if (ret == 0) { ret = construct_key(key, callout_info, callout_len, aux, dest_keyring); if (ret < 0) { kdebug("cons failed"); goto construction_failed; } } else if (ret == -EINPROGRESS) { ret = 0; } else { goto error_put_dest_keyring; } key_put(dest_keyring); kleave(" = key %d", key_serial(key)); return key; construction_failed: key_negate_and_link(key, key_negative_timeout, NULL, NULL); key_put(key); error_put_dest_keyring: key_put(dest_keyring); error: kleave(" = %d", ret); return ERR_PTR(ret); } /** * request_key_and_link - Request a key and cache it in a keyring. * @type: The type of key we want. * @description: The searchable description of the key. * @domain_tag: The domain in which the key operates. * @callout_info: The data to pass to the instantiation upcall (or NULL). * @callout_len: The length of callout_info. * @aux: Auxiliary data for the upcall. * @dest_keyring: Where to cache the key. * @flags: Flags to key_alloc(). * * A key matching the specified criteria (type, description, domain_tag) is * searched for in the process's keyrings and returned with its usage count * incremented if found. Otherwise, if callout_info is not NULL, a key will be * allocated and some service (probably in userspace) will be asked to * instantiate it. * * If successfully found or created, the key will be linked to the destination * keyring if one is provided. * * Returns a pointer to the key if successful; -EACCES, -ENOKEY, -EKEYREVOKED * or -EKEYEXPIRED if an inaccessible, negative, revoked or expired key was * found; -ENOKEY if no key was found and no @callout_info was given; -EDQUOT * if insufficient key quota was available to create a new key; or -ENOMEM if * insufficient memory was available. * * If the returned key was created, then it may still be under construction, * and wait_for_key_construction() should be used to wait for that to complete. 
*/ struct key *request_key_and_link(struct key_type *type, const char *description, struct key_tag *domain_tag, const void *callout_info, size_t callout_len, void *aux, struct key *dest_keyring, unsigned long flags) { struct keyring_search_context ctx = { .index_key.type = type, .index_key.domain_tag = domain_tag, .index_key.description = description, .index_key.desc_len = strlen(description), .cred = current_cred(), .match_data.cmp = key_default_cmp, .match_data.raw_data = description, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = (KEYRING_SEARCH_DO_STATE_CHECK | KEYRING_SEARCH_SKIP_EXPIRED | KEYRING_SEARCH_RECURSE), }; struct key *key; key_ref_t key_ref; int ret; kenter("%s,%s,%p,%zu,%p,%p,%lx", ctx.index_key.type->name, ctx.index_key.description, callout_info, callout_len, aux, dest_keyring, flags); if (type->match_preparse) { ret = type->match_preparse(&ctx.match_data); if (ret < 0) { key = ERR_PTR(ret); goto error; } } key = check_cached_key(&ctx); if (key) goto error_free; /* search all the process keyrings for a key */ rcu_read_lock(); key_ref = search_process_keyrings_rcu(&ctx); rcu_read_unlock(); if (!IS_ERR(key_ref)) { if (dest_keyring) { ret = key_task_permission(key_ref, current_cred(), KEY_NEED_LINK); if (ret < 0) { key_ref_put(key_ref); key = ERR_PTR(ret); goto error_free; } } key = key_ref_to_ptr(key_ref); if (dest_keyring) { ret = key_link(dest_keyring, key); if (ret < 0) { key_put(key); key = ERR_PTR(ret); goto error_free; } } /* Only cache the key on immediate success */ cache_requested_key(key); } else if (PTR_ERR(key_ref) != -EAGAIN) { key = ERR_CAST(key_ref); } else { /* the search failed, but the keyrings were searchable, so we * should consult userspace if we can */ key = ERR_PTR(-ENOKEY); if (!callout_info) goto error_free; key = construct_key_and_link(&ctx, callout_info, callout_len, aux, dest_keyring, flags); } error_free: if (type->match_free) type->match_free(&ctx.match_data); error: kleave(" = %p", key); return key; } /** * wait_for_key_construction - Wait for construction of a key to complete * @key: The key being waited for. * @intr: Whether to wait interruptibly. * * Wait for a key to finish being constructed. * * Returns 0 if successful; -ERESTARTSYS if the wait was interrupted; -ENOKEY * if the key was negated; or -EKEYREVOKED or -EKEYEXPIRED if the key was * revoked or expired. */ int wait_for_key_construction(struct key *key, bool intr) { int ret; ret = wait_on_bit(&key->flags, KEY_FLAG_USER_CONSTRUCT, intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); if (ret) return -ERESTARTSYS; ret = key_read_state(key); if (ret < 0) return ret; return key_validate(key); } EXPORT_SYMBOL(wait_for_key_construction); /** * request_key_tag - Request a key and wait for construction * @type: Type of key. * @description: The searchable description of the key. * @domain_tag: The domain in which the key operates. * @callout_info: The data to pass to the instantiation upcall (or NULL). * * As for request_key_and_link() except that it does not add the returned key * to a keyring if found, new keys are always allocated in the user's quota, * the callout_info must be a NUL-terminated string and no auxiliary data can * be passed. * * Furthermore, it then works as wait_for_key_construction() to wait for the * completion of keys undergoing construction with a non-interruptible wait. 
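 *
 * A minimal usage sketch (the key type and strings are placeholders only):
 *
 *	struct key *key;
 *
 *	key = request_key_tag(&key_type_user, "example:desc", NULL,
 *			      "callout data");
 *	if (IS_ERR(key))
 *		return PTR_ERR(key);
 *	... use the key ...
 *	key_put(key);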
*/ struct key *request_key_tag(struct key_type *type, const char *description, struct key_tag *domain_tag, const char *callout_info) { struct key *key; size_t callout_len = 0; int ret; if (callout_info) callout_len = strlen(callout_info); key = request_key_and_link(type, description, domain_tag, callout_info, callout_len, NULL, NULL, KEY_ALLOC_IN_QUOTA); if (!IS_ERR(key)) { ret = wait_for_key_construction(key, false); if (ret < 0) { key_put(key); return ERR_PTR(ret); } } return key; } EXPORT_SYMBOL(request_key_tag); /** * request_key_with_auxdata - Request a key with auxiliary data for the upcaller * @type: The type of key we want. * @description: The searchable description of the key. * @domain_tag: The domain in which the key operates. * @callout_info: The data to pass to the instantiation upcall (or NULL). * @callout_len: The length of callout_info. * @aux: Auxiliary data for the upcall. * * As for request_key_and_link() except that it does not add the returned key * to a keyring if found and new keys are always allocated in the user's quota. * * Furthermore, it then works as wait_for_key_construction() to wait for the * completion of keys undergoing construction with a non-interruptible wait. */ struct key *request_key_with_auxdata(struct key_type *type, const char *description, struct key_tag *domain_tag, const void *callout_info, size_t callout_len, void *aux) { struct key *key; int ret; key = request_key_and_link(type, description, domain_tag, callout_info, callout_len, aux, NULL, KEY_ALLOC_IN_QUOTA); if (!IS_ERR(key)) { ret = wait_for_key_construction(key, false); if (ret < 0) { key_put(key); return ERR_PTR(ret); } } return key; } EXPORT_SYMBOL(request_key_with_auxdata); /** * request_key_rcu - Request key from RCU-read-locked context * @type: The type of key we want. * @description: The name of the key we want. * @domain_tag: The domain in which the key operates. * * Request a key from a context that we may not sleep in (such as RCU-mode * pathwalk). Keys under construction are ignored. * * Return a pointer to the found key if successful, -ENOKEY if we couldn't find * a key or some other error if the key found was unsuitable or inaccessible. */ struct key *request_key_rcu(struct key_type *type, const char *description, struct key_tag *domain_tag) { struct keyring_search_context ctx = { .index_key.type = type, .index_key.domain_tag = domain_tag, .index_key.description = description, .index_key.desc_len = strlen(description), .cred = current_cred(), .match_data.cmp = key_default_cmp, .match_data.raw_data = description, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = (KEYRING_SEARCH_DO_STATE_CHECK | KEYRING_SEARCH_SKIP_EXPIRED), }; struct key *key; key_ref_t key_ref; kenter("%s,%s", type->name, description); key = check_cached_key(&ctx); if (key) return key; /* search all the process keyrings for a key */ key_ref = search_process_keyrings_rcu(&ctx); if (IS_ERR(key_ref)) { key = ERR_CAST(key_ref); if (PTR_ERR(key_ref) == -EAGAIN) key = ERR_PTR(-ENOKEY); } else { key = key_ref_to_ptr(key_ref); cache_requested_key(key); } kleave(" = %p", key); return key; } EXPORT_SYMBOL(request_key_rcu); |
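/*
 * A sketch of how request_key_rcu() might be called from a non-sleeping,
 * RCU-read-locked context (the key type and description are placeholders;
 * no particular subsystem is implied):
 *
 *	struct key *key;
 *
 *	rcu_read_lock();
 *	key = request_key_rcu(&key_type_user, "example:desc", NULL);
 *	if (!IS_ERR(key)) {
 *		... inspect the key ...
 *		key_put(key);
 *	}
 *	rcu_read_unlock();
 *
 * Unlike request_key_tag(), no upcall is made and keys still under
 * construction are ignored, so an absent or negative key simply yields an
 * error pointer.
 */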
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 Nicira, Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/in6.h> #include <linux/inetdevice.h> #include <linux/netfilter_ipv4.h> #include <linux/etherdevice.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/static_key.h> #include <net/ip.h> #include <net/icmp.h> #include <net/protocol.h> #include <net/ip_tunnels.h> #include <net/ip6_tunnel.h> #include <net/ip6_checksum.h> #include <net/arp.h> #include <net/checksum.h> #include <net/dsfield.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/dst_metadata.h> #include <net/geneve.h> #include <net/vxlan.h> #include <net/erspan.h> const struct ip_tunnel_encap_ops __rcu * iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly; EXPORT_SYMBOL(iptun_encaps); const struct ip6_tnl_encap_ops __rcu * ip6tun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly; EXPORT_SYMBOL(ip6tun_encaps); void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl, __be16 df, bool xnet, u16 ipcb_flags) { int pkt_len = skb->len - skb_inner_network_offset(skb); struct net *net = dev_net(rt->dst.dev); struct net_device *dev = skb->dev; struct iphdr *iph; int err; if (unlikely(dev_recursion_level() > IP_TUNNEL_RECURSION_LIMIT)) { if (dev) { net_crit_ratelimited("Dead loop on virtual device %s (net %llu), fix it urgently!\n", dev->name, dev_net(dev)->net_cookie); DEV_STATS_INC(dev, tx_errors); } ip_rt_put(rt); kfree_skb_reason(skb, SKB_DROP_REASON_RECURSION_LIMIT); return; } dev_xmit_recursion_inc(); skb_scrub_packet(skb, xnet); skb_clear_hash_if_not_l4(skb); skb_dst_set(skb, &rt->dst); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); IPCB(skb)->flags = ipcb_flags; /* Push down and install the IP header.
*/ skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); iph = ip_hdr(skb); iph->version = 4; iph->ihl = sizeof(struct iphdr) >> 2; iph->frag_off = ip_mtu_locked(&rt->dst) ? 0 : df; iph->protocol = proto; iph->tos = tos; iph->daddr = dst; iph->saddr = src; iph->ttl = ttl; __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1); err = ip_local_out(net, sk, skb); if (dev) { if (unlikely(net_xmit_eval(err))) pkt_len = 0; iptunnel_xmit_stats(dev, pkt_len); } dev_xmit_recursion_dec(); } EXPORT_SYMBOL_GPL(iptunnel_xmit); int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, bool raw_proto, bool xnet) { if (unlikely(!pskb_may_pull(skb, hdr_len))) return -ENOMEM; skb_pull_rcsum(skb, hdr_len); if (!raw_proto && inner_proto == htons(ETH_P_TEB)) { struct ethhdr *eh; if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) return -ENOMEM; eh = (struct ethhdr *)skb->data; if (likely(eth_proto_is_802_3(eh->h_proto))) skb->protocol = eh->h_proto; else skb->protocol = htons(ETH_P_802_2); } else { skb->protocol = inner_proto; } skb_clear_hash_if_not_l4(skb); __vlan_hwaccel_clear_tag(skb); skb_set_queue_mapping(skb, 0); skb_scrub_packet(skb, xnet); return iptunnel_pull_offloads(skb); } EXPORT_SYMBOL_GPL(__iptunnel_pull_header); struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, gfp_t flags) { IP_TUNNEL_DECLARE_FLAGS(tun_flags) = { }; struct metadata_dst *res; struct ip_tunnel_info *dst, *src; if (!md || md->type != METADATA_IP_TUNNEL || md->u.tun_info.mode & IP_TUNNEL_INFO_TX) return NULL; src = &md->u.tun_info; res = metadata_dst_alloc(src->options_len, METADATA_IP_TUNNEL, flags); if (!res) return NULL; dst = &res->u.tun_info; dst->key.tun_id = src->key.tun_id; if (src->mode & IP_TUNNEL_INFO_IPV6) memcpy(&dst->key.u.ipv6.dst, &src->key.u.ipv6.src, sizeof(struct in6_addr)); else dst->key.u.ipv4.dst = src->key.u.ipv4.src; ip_tunnel_flags_copy(dst->key.tun_flags, src->key.tun_flags); dst->mode = src->mode | IP_TUNNEL_INFO_TX; ip_tunnel_info_opts_set(dst, ip_tunnel_info_opts(src), src->options_len, tun_flags); return res; } EXPORT_SYMBOL_GPL(iptunnel_metadata_reply); int iptunnel_handle_offloads(struct sk_buff *skb, int gso_type_mask) { int err; if (likely(!skb->encapsulation)) { skb_reset_inner_headers(skb); skb->encapsulation = 1; } if (skb_is_gso(skb)) { err = skb_header_unclone(skb, GFP_ATOMIC); if (unlikely(err)) return err; skb_shinfo(skb)->gso_type |= gso_type_mask; return 0; } if (skb->ip_summed != CHECKSUM_PARTIAL) { skb->ip_summed = CHECKSUM_NONE; /* We clear encapsulation here to prevent badly-written * drivers potentially deciding to offload an inner checksum * if we set CHECKSUM_PARTIAL on the outer header. * This should go away when the drivers are all fixed. */ skb->encapsulation = 0; } return 0; } EXPORT_SYMBOL_GPL(iptunnel_handle_offloads); /** * iptunnel_pmtud_build_icmp() - Build ICMP error message for PMTUD * @skb: Original packet with L2 header * @mtu: MTU value for ICMP error * * Return: length on success, negative error code if message couldn't be built. 
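 *
 * (The 576-byte budget used below comes from the minimum datagram size an
 * IPv4 host must be able to receive, per RFC 791: the quoted original
 * packet is trimmed so the generated ICMP error never exceeds that size.)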
*/ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu) { const struct iphdr *iph = ip_hdr(skb); struct icmphdr *icmph; struct iphdr *niph; struct ethhdr eh; int len, err; if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr))) return -EINVAL; if (skb_is_gso(skb)) skb_gso_reset(skb); skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN); pskb_pull(skb, ETH_HLEN); skb_reset_network_header(skb); err = pskb_trim(skb, 576 - sizeof(*niph) - sizeof(*icmph)); if (err) return err; len = skb->len + sizeof(*icmph); err = skb_cow(skb, sizeof(*niph) + sizeof(*icmph) + ETH_HLEN); if (err) return err; icmph = skb_push(skb, sizeof(*icmph)); *icmph = (struct icmphdr) { .type = ICMP_DEST_UNREACH, .code = ICMP_FRAG_NEEDED, .checksum = 0, .un.frag.__unused = 0, .un.frag.mtu = htons(mtu), }; icmph->checksum = csum_fold(skb_checksum(skb, 0, len, 0)); skb_reset_transport_header(skb); niph = skb_push(skb, sizeof(*niph)); *niph = (struct iphdr) { .ihl = sizeof(*niph) / 4u, .version = 4, .tos = 0, .tot_len = htons(len + sizeof(*niph)), .id = 0, .frag_off = htons(IP_DF), .ttl = iph->ttl, .protocol = IPPROTO_ICMP, .saddr = iph->daddr, .daddr = iph->saddr, }; ip_send_check(niph); skb_reset_network_header(skb); skb->ip_summed = CHECKSUM_NONE; eth_header(skb, skb->dev, ntohs(eh.h_proto), eh.h_source, eh.h_dest, 0); skb_reset_mac_header(skb); return skb->len; } /** * iptunnel_pmtud_check_icmp() - Trigger ICMP reply if needed and allowed * @skb: Buffer being sent by encapsulation, L2 headers expected * @mtu: Network MTU for path * * Return: 0 for no ICMP reply, length if built, negative value on error. */ static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu) { const struct icmphdr *icmph = icmp_hdr(skb); const struct iphdr *iph = ip_hdr(skb); if (mtu < 576 || iph->frag_off != htons(IP_DF)) return 0; if (ipv4_is_lbcast(iph->daddr) || ipv4_is_multicast(iph->daddr) || ipv4_is_zeronet(iph->saddr) || ipv4_is_loopback(iph->saddr) || ipv4_is_lbcast(iph->saddr) || ipv4_is_multicast(iph->saddr)) return 0; if (iph->protocol == IPPROTO_ICMP && icmp_is_err(icmph->type)) return 0; return iptunnel_pmtud_build_icmp(skb, mtu); } #if IS_ENABLED(CONFIG_IPV6) /** * iptunnel_pmtud_build_icmpv6() - Build ICMPv6 error message for PMTUD * @skb: Original packet with L2 header * @mtu: MTU value for ICMPv6 error * * Return: length on success, negative error code if message couldn't be built. 
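 *
 * (Analogous to the IPv4 helper above, but the budget is IPV6_MIN_MTU,
 * 1280 bytes, the minimum link MTU required of every IPv6 link by RFC 8200,
 * so the generated Packet Too Big message always fits the path.)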
*/ static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct icmp6hdr *icmp6h; struct ipv6hdr *nip6h; struct ethhdr eh; int len, err; __wsum csum; if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr))) return -EINVAL; if (skb_is_gso(skb)) skb_gso_reset(skb); skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN); pskb_pull(skb, ETH_HLEN); skb_reset_network_header(skb); err = pskb_trim(skb, IPV6_MIN_MTU - sizeof(*nip6h) - sizeof(*icmp6h)); if (err) return err; len = skb->len + sizeof(*icmp6h); err = skb_cow(skb, sizeof(*nip6h) + sizeof(*icmp6h) + ETH_HLEN); if (err) return err; icmp6h = skb_push(skb, sizeof(*icmp6h)); *icmp6h = (struct icmp6hdr) { .icmp6_type = ICMPV6_PKT_TOOBIG, .icmp6_code = 0, .icmp6_cksum = 0, .icmp6_mtu = htonl(mtu), }; skb_reset_transport_header(skb); nip6h = skb_push(skb, sizeof(*nip6h)); *nip6h = (struct ipv6hdr) { .priority = 0, .version = 6, .flow_lbl = { 0 }, .payload_len = htons(len), .nexthdr = IPPROTO_ICMPV6, .hop_limit = ip6h->hop_limit, .saddr = ip6h->daddr, .daddr = ip6h->saddr, }; skb_reset_network_header(skb); csum = skb_checksum(skb, skb_transport_offset(skb), len, 0); icmp6h->icmp6_cksum = csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr, len, IPPROTO_ICMPV6, csum); skb->ip_summed = CHECKSUM_NONE; eth_header(skb, skb->dev, ntohs(eh.h_proto), eh.h_source, eh.h_dest, 0); skb_reset_mac_header(skb); return skb->len; } /** * iptunnel_pmtud_check_icmpv6() - Trigger ICMPv6 reply if needed and allowed * @skb: Buffer being sent by encapsulation, L2 headers expected * @mtu: Network MTU for path * * Return: 0 for no ICMPv6 reply, length if built, negative value on error. */ static int iptunnel_pmtud_check_icmpv6(struct sk_buff *skb, int mtu) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); int stype = ipv6_addr_type(&ip6h->saddr); u8 proto = ip6h->nexthdr; __be16 frag_off; int offset; if (mtu < IPV6_MIN_MTU) return 0; if (stype == IPV6_ADDR_ANY || stype == IPV6_ADDR_MULTICAST || stype == IPV6_ADDR_LOOPBACK) return 0; offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &proto, &frag_off); if (offset < 0 || (frag_off & htons(~0x7))) return 0; if (proto == IPPROTO_ICMPV6) { struct icmp6hdr *icmp6h; if (!pskb_may_pull(skb, skb_network_header(skb) + offset + 1 - skb->data)) return 0; icmp6h = (struct icmp6hdr *)(skb_network_header(skb) + offset); if (icmpv6_is_err(icmp6h->icmp6_type) || icmp6h->icmp6_type == NDISC_REDIRECT) return 0; } return iptunnel_pmtud_build_icmpv6(skb, mtu); } #endif /* IS_ENABLED(CONFIG_IPV6) */ /** * skb_tunnel_check_pmtu() - Check, update PMTU and trigger ICMP reply as needed * @skb: Buffer being sent by encapsulation, L2 headers expected * @encap_dst: Destination for tunnel encapsulation (outer IP) * @headroom: Encapsulation header size, bytes * @reply: Build matching ICMP or ICMPv6 message as a result * * L2 tunnel implementations that can carry IP and can be directly bridged * (currently UDP tunnels) can't always rely on IP forwarding paths to handle * PMTU discovery. In the bridged case, ICMP or ICMPv6 messages need to be built * based on payload and sent back by the encapsulation itself. * * For routable interfaces, we just need to update the PMTU for the destination. 
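 *
 * A rough sketch of the calling convention in an encapsulating transmit
 * path (the names below are placeholders rather than any specific driver):
 *
 *	ret = skb_tunnel_check_pmtu(skb, &rt->dst, tunnel_headroom,
 *				    netif_is_any_bridge_port(dev));
 *	if (ret < 0)
 *		goto tx_error;
 *	else if (ret)
 *		goto reply_built;	(an ICMP/ICMPv6 reply now sits in skb)
 *	... otherwise carry on with encapsulation ...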
* * Return: 0 if ICMP error not needed, length if built, negative value on error */ int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, int headroom, bool reply) { u32 mtu = dst_mtu(encap_dst) - headroom; if ((skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) || (!skb_is_gso(skb) && (skb->len - skb_network_offset(skb)) <= mtu)) return 0; skb_dst_update_pmtu_no_confirm(skb, mtu); if (!reply) return 0; if (skb->protocol == htons(ETH_P_IP)) return iptunnel_pmtud_check_icmp(skb, mtu); #if IS_ENABLED(CONFIG_IPV6) if (skb->protocol == htons(ETH_P_IPV6)) return iptunnel_pmtud_check_icmpv6(skb, mtu); #endif return 0; } EXPORT_SYMBOL(skb_tunnel_check_pmtu); static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = { [LWTUNNEL_IP_UNSPEC] = { .strict_start_type = LWTUNNEL_IP_OPTS }, [LWTUNNEL_IP_ID] = { .type = NLA_U64 }, [LWTUNNEL_IP_DST] = { .type = NLA_U32 }, [LWTUNNEL_IP_SRC] = { .type = NLA_U32 }, [LWTUNNEL_IP_TTL] = { .type = NLA_U8 }, [LWTUNNEL_IP_TOS] = { .type = NLA_U8 }, [LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 }, [LWTUNNEL_IP_OPTS] = { .type = NLA_NESTED }, }; static const struct nla_policy ip_opts_policy[LWTUNNEL_IP_OPTS_MAX + 1] = { [LWTUNNEL_IP_OPTS_GENEVE] = { .type = NLA_NESTED }, [LWTUNNEL_IP_OPTS_VXLAN] = { .type = NLA_NESTED }, [LWTUNNEL_IP_OPTS_ERSPAN] = { .type = NLA_NESTED }, }; static const struct nla_policy geneve_opt_policy[LWTUNNEL_IP_OPT_GENEVE_MAX + 1] = { [LWTUNNEL_IP_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, [LWTUNNEL_IP_OPT_GENEVE_TYPE] = { .type = NLA_U8 }, [LWTUNNEL_IP_OPT_GENEVE_DATA] = { .type = NLA_BINARY, .len = 127 }, }; static const struct nla_policy vxlan_opt_policy[LWTUNNEL_IP_OPT_VXLAN_MAX + 1] = { [LWTUNNEL_IP_OPT_VXLAN_GBP] = { .type = NLA_U32 }, }; static const struct nla_policy erspan_opt_policy[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1] = { [LWTUNNEL_IP_OPT_ERSPAN_VER] = { .type = NLA_U8 }, [LWTUNNEL_IP_OPT_ERSPAN_INDEX] = { .type = NLA_U32 }, [LWTUNNEL_IP_OPT_ERSPAN_DIR] = { .type = NLA_U8 }, [LWTUNNEL_IP_OPT_ERSPAN_HWID] = { .type = NLA_U8 }, }; static int ip_tun_parse_opts_geneve(struct nlattr *attr, struct ip_tunnel_info *info, int opts_len, struct netlink_ext_ack *extack) { struct nlattr *tb[LWTUNNEL_IP_OPT_GENEVE_MAX + 1]; int data_len, err; err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_GENEVE_MAX, attr, geneve_opt_policy, extack); if (err) return err; if (!tb[LWTUNNEL_IP_OPT_GENEVE_CLASS] || !tb[LWTUNNEL_IP_OPT_GENEVE_TYPE] || !tb[LWTUNNEL_IP_OPT_GENEVE_DATA]) return -EINVAL; attr = tb[LWTUNNEL_IP_OPT_GENEVE_DATA]; data_len = nla_len(attr); if (data_len % 4) return -EINVAL; if (info) { struct geneve_opt *opt = ip_tunnel_info_opts(info) + opts_len; memcpy(opt->opt_data, nla_data(attr), data_len); opt->length = data_len / 4; attr = tb[LWTUNNEL_IP_OPT_GENEVE_CLASS]; opt->opt_class = nla_get_be16(attr); attr = tb[LWTUNNEL_IP_OPT_GENEVE_TYPE]; opt->type = nla_get_u8(attr); __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags); } return sizeof(struct geneve_opt) + data_len; } static int ip_tun_parse_opts_vxlan(struct nlattr *attr, struct ip_tunnel_info *info, int opts_len, struct netlink_ext_ack *extack) { struct nlattr *tb[LWTUNNEL_IP_OPT_VXLAN_MAX + 1]; int err; err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_VXLAN_MAX, attr, vxlan_opt_policy, extack); if (err) return err; if (!tb[LWTUNNEL_IP_OPT_VXLAN_GBP]) return -EINVAL; if (info) { struct vxlan_metadata *md = ip_tunnel_info_opts(info) + opts_len; attr = tb[LWTUNNEL_IP_OPT_VXLAN_GBP]; md->gbp = nla_get_u32(attr); md->gbp &= VXLAN_GBP_MASK; __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, 
info->key.tun_flags); } return sizeof(struct vxlan_metadata); } static int ip_tun_parse_opts_erspan(struct nlattr *attr, struct ip_tunnel_info *info, int opts_len, struct netlink_ext_ack *extack) { struct nlattr *tb[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1]; int err; u8 ver; err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_ERSPAN_MAX, attr, erspan_opt_policy, extack); if (err) return err; if (!tb[LWTUNNEL_IP_OPT_ERSPAN_VER]) return -EINVAL; ver = nla_get_u8(tb[LWTUNNEL_IP_OPT_ERSPAN_VER]); if (ver == 1) { if (!tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX]) return -EINVAL; } else if (ver == 2) { if (!tb[LWTUNNEL_IP_OPT_ERSPAN_DIR] || !tb[LWTUNNEL_IP_OPT_ERSPAN_HWID]) return -EINVAL; } else { return -EINVAL; } if (info) { struct erspan_metadata *md = ip_tunnel_info_opts(info) + opts_len; md->version = ver; if (ver == 1) { attr = tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX]; md->u.index = nla_get_be32(attr); } else { attr = tb[LWTUNNEL_IP_OPT_ERSPAN_DIR]; md->u.md2.dir = nla_get_u8(attr); attr = tb[LWTUNNEL_IP_OPT_ERSPAN_HWID]; set_hwid(&md->u.md2, nla_get_u8(attr)); } __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags); } return sizeof(struct erspan_metadata); } static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info, struct netlink_ext_ack *extack) { int err, rem, opt_len, opts_len = 0; struct nlattr *nla; u32 type = 0; if (!attr) return 0; err = nla_validate(nla_data(attr), nla_len(attr), LWTUNNEL_IP_OPTS_MAX, ip_opts_policy, extack); if (err) return err; nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) { switch (nla_type(nla)) { case LWTUNNEL_IP_OPTS_GENEVE: if (type && type != IP_TUNNEL_GENEVE_OPT_BIT) return -EINVAL; opt_len = ip_tun_parse_opts_geneve(nla, info, opts_len, extack); if (opt_len < 0) return opt_len; opts_len += opt_len; if (opts_len > IP_TUNNEL_OPTS_MAX) return -EINVAL; type = IP_TUNNEL_GENEVE_OPT_BIT; break; case LWTUNNEL_IP_OPTS_VXLAN: if (type) return -EINVAL; opt_len = ip_tun_parse_opts_vxlan(nla, info, opts_len, extack); if (opt_len < 0) return opt_len; opts_len += opt_len; type = IP_TUNNEL_VXLAN_OPT_BIT; break; case LWTUNNEL_IP_OPTS_ERSPAN: if (type) return -EINVAL; opt_len = ip_tun_parse_opts_erspan(nla, info, opts_len, extack); if (opt_len < 0) return opt_len; opts_len += opt_len; type = IP_TUNNEL_ERSPAN_OPT_BIT; break; default: return -EINVAL; } } return opts_len; } static int ip_tun_get_optlen(struct nlattr *attr, struct netlink_ext_ack *extack) { return ip_tun_parse_opts(attr, NULL, extack); } static int ip_tun_set_opts(struct nlattr *attr, struct ip_tunnel_info *info, struct netlink_ext_ack *extack) { return ip_tun_parse_opts(attr, info, extack); } static int ip_tun_build_state(struct net *net, struct nlattr *attr, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct nlattr *tb[LWTUNNEL_IP_MAX + 1]; struct lwtunnel_state *new_state; struct ip_tunnel_info *tun_info; int err, opt_len; err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy, extack); if (err < 0) return err; opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP_OPTS], extack); if (opt_len < 0) return opt_len; new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len); if (!new_state) return -ENOMEM; new_state->type = LWTUNNEL_ENCAP_IP; tun_info = lwt_tun_info(new_state); err = ip_tun_set_opts(tb[LWTUNNEL_IP_OPTS], tun_info, extack); if (err < 0) { lwtstate_free(new_state); return err; } #ifdef CONFIG_DST_CACHE err = dst_cache_init(&tun_info->dst_cache, GFP_KERNEL); if (err) { lwtstate_free(new_state); return err; } #endif if 
(tb[LWTUNNEL_IP_ID]) tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP_ID]); if (tb[LWTUNNEL_IP_DST]) tun_info->key.u.ipv4.dst = nla_get_in_addr(tb[LWTUNNEL_IP_DST]); if (tb[LWTUNNEL_IP_SRC]) tun_info->key.u.ipv4.src = nla_get_in_addr(tb[LWTUNNEL_IP_SRC]); if (tb[LWTUNNEL_IP_TTL]) tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]); if (tb[LWTUNNEL_IP_TOS]) tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]); if (tb[LWTUNNEL_IP_FLAGS]) { IP_TUNNEL_DECLARE_FLAGS(flags); ip_tunnel_flags_from_be16(flags, nla_get_be16(tb[LWTUNNEL_IP_FLAGS])); ip_tunnel_clear_options_present(flags); ip_tunnel_flags_or(tun_info->key.tun_flags, tun_info->key.tun_flags, flags); } tun_info->mode = IP_TUNNEL_INFO_TX; tun_info->options_len = opt_len; *ts = new_state; return 0; } static void ip_tun_destroy_state(struct lwtunnel_state *lwtstate) { #ifdef CONFIG_DST_CACHE struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); dst_cache_destroy(&tun_info->dst_cache); #endif } static int ip_tun_fill_encap_opts_geneve(struct sk_buff *skb, struct ip_tunnel_info *tun_info) { struct geneve_opt *opt; struct nlattr *nest; int offset = 0; nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_GENEVE); if (!nest) return -ENOMEM; while (tun_info->options_len > offset) { opt = ip_tunnel_info_opts(tun_info) + offset; if (nla_put_be16(skb, LWTUNNEL_IP_OPT_GENEVE_CLASS, opt->opt_class) || nla_put_u8(skb, LWTUNNEL_IP_OPT_GENEVE_TYPE, opt->type) || nla_put(skb, LWTUNNEL_IP_OPT_GENEVE_DATA, opt->length * 4, opt->opt_data)) { nla_nest_cancel(skb, nest); return -ENOMEM; } offset += sizeof(*opt) + opt->length * 4; } nla_nest_end(skb, nest); return 0; } static int ip_tun_fill_encap_opts_vxlan(struct sk_buff *skb, struct ip_tunnel_info *tun_info) { struct vxlan_metadata *md; struct nlattr *nest; nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_VXLAN); if (!nest) return -ENOMEM; md = ip_tunnel_info_opts(tun_info); if (nla_put_u32(skb, LWTUNNEL_IP_OPT_VXLAN_GBP, md->gbp)) { nla_nest_cancel(skb, nest); return -ENOMEM; } nla_nest_end(skb, nest); return 0; } static int ip_tun_fill_encap_opts_erspan(struct sk_buff *skb, struct ip_tunnel_info *tun_info) { struct erspan_metadata *md; struct nlattr *nest; nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_ERSPAN); if (!nest) return -ENOMEM; md = ip_tunnel_info_opts(tun_info); if (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_VER, md->version)) goto err; if (md->version == 1 && nla_put_be32(skb, LWTUNNEL_IP_OPT_ERSPAN_INDEX, md->u.index)) goto err; if (md->version == 2 && (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_DIR, md->u.md2.dir) || nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_HWID, get_hwid(&md->u.md2)))) goto err; nla_nest_end(skb, nest); return 0; err: nla_nest_cancel(skb, nest); return -ENOMEM; } static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type, struct ip_tunnel_info *tun_info) { struct nlattr *nest; int err = 0; if (!ip_tunnel_is_options_present(tun_info->key.tun_flags)) return 0; nest = nla_nest_start_noflag(skb, type); if (!nest) return -ENOMEM; if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_info->key.tun_flags)) err = ip_tun_fill_encap_opts_geneve(skb, tun_info); else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, tun_info->key.tun_flags)) err = ip_tun_fill_encap_opts_vxlan(skb, tun_info); else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_info->key.tun_flags)) err = ip_tun_fill_encap_opts_erspan(skb, tun_info); if (err) { nla_nest_cancel(skb, nest); return err; } nla_nest_end(skb, nest); return 0; } static int ip_tun_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { struct 
ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); if (nla_put_be64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id, LWTUNNEL_IP_PAD) || nla_put_in_addr(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) || nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) || nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) || nla_put_be16(skb, LWTUNNEL_IP_FLAGS, ip_tunnel_flags_to_be16(tun_info->key.tun_flags)) || ip_tun_fill_encap_opts(skb, LWTUNNEL_IP_OPTS, tun_info)) return -ENOMEM; return 0; } static int ip_tun_opts_nlsize(struct ip_tunnel_info *info) { int opt_len; if (!ip_tunnel_is_options_present(info->key.tun_flags)) return 0; opt_len = nla_total_size(0); /* LWTUNNEL_IP_OPTS */ if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags)) { struct geneve_opt *opt; int offset = 0; opt_len += nla_total_size(0); /* LWTUNNEL_IP_OPTS_GENEVE */ while (info->options_len > offset) { opt = ip_tunnel_info_opts(info) + offset; opt_len += nla_total_size(2) /* OPT_GENEVE_CLASS */ + nla_total_size(1) /* OPT_GENEVE_TYPE */ + nla_total_size(opt->length * 4); /* OPT_GENEVE_DATA */ offset += sizeof(*opt) + opt->length * 4; } } else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags)) { opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_VXLAN */ + nla_total_size(4); /* OPT_VXLAN_GBP */ } else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags)) { struct erspan_metadata *md = ip_tunnel_info_opts(info); opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_ERSPAN */ + nla_total_size(1) /* OPT_ERSPAN_VER */ + (md->version == 1 ? nla_total_size(4) /* OPT_ERSPAN_INDEX (v1) */ : nla_total_size(1) + nla_total_size(1)); /* OPT_ERSPAN_DIR + HWID (v2) */ } return opt_len; } static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) { return nla_total_size_64bit(8) /* LWTUNNEL_IP_ID */ + nla_total_size(4) /* LWTUNNEL_IP_DST */ + nla_total_size(4) /* LWTUNNEL_IP_SRC */ + nla_total_size(1) /* LWTUNNEL_IP_TOS */ + nla_total_size(1) /* LWTUNNEL_IP_TTL */ + nla_total_size(2) /* LWTUNNEL_IP_FLAGS */ + ip_tun_opts_nlsize(lwt_tun_info(lwtstate)); /* LWTUNNEL_IP_OPTS */ } static int ip_tun_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) { struct ip_tunnel_info *info_a = lwt_tun_info(a); struct ip_tunnel_info *info_b = lwt_tun_info(b); return memcmp(info_a, info_b, sizeof(info_a->key)) || info_a->mode != info_b->mode || info_a->options_len != info_b->options_len || memcmp(ip_tunnel_info_opts(info_a), ip_tunnel_info_opts(info_b), info_a->options_len); } static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { .build_state = ip_tun_build_state, .destroy_state = ip_tun_destroy_state, .fill_encap = ip_tun_fill_encap_info, .get_encap_size = ip_tun_encap_nlsize, .cmp_encap = ip_tun_cmp_encap, .owner = THIS_MODULE, }; static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = { [LWTUNNEL_IP6_UNSPEC] = { .strict_start_type = LWTUNNEL_IP6_OPTS }, [LWTUNNEL_IP6_ID] = { .type = NLA_U64 }, [LWTUNNEL_IP6_DST] = { .len = sizeof(struct in6_addr) }, [LWTUNNEL_IP6_SRC] = { .len = sizeof(struct in6_addr) }, [LWTUNNEL_IP6_HOPLIMIT] = { .type = NLA_U8 }, [LWTUNNEL_IP6_TC] = { .type = NLA_U8 }, [LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 }, [LWTUNNEL_IP6_OPTS] = { .type = NLA_NESTED }, }; static int ip6_tun_build_state(struct net *net, struct nlattr *attr, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct nlattr *tb[LWTUNNEL_IP6_MAX + 1]; struct lwtunnel_state *new_state; struct ip_tunnel_info *tun_info; 
int err, opt_len; err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy, extack); if (err < 0) return err; opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP6_OPTS], extack); if (opt_len < 0) return opt_len; new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len); if (!new_state) return -ENOMEM; new_state->type = LWTUNNEL_ENCAP_IP6; tun_info = lwt_tun_info(new_state); err = ip_tun_set_opts(tb[LWTUNNEL_IP6_OPTS], tun_info, extack); if (err < 0) { lwtstate_free(new_state); return err; } if (tb[LWTUNNEL_IP6_ID]) tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP6_ID]); if (tb[LWTUNNEL_IP6_DST]) tun_info->key.u.ipv6.dst = nla_get_in6_addr(tb[LWTUNNEL_IP6_DST]); if (tb[LWTUNNEL_IP6_SRC]) tun_info->key.u.ipv6.src = nla_get_in6_addr(tb[LWTUNNEL_IP6_SRC]); if (tb[LWTUNNEL_IP6_HOPLIMIT]) tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP6_HOPLIMIT]); if (tb[LWTUNNEL_IP6_TC]) tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]); if (tb[LWTUNNEL_IP6_FLAGS]) { IP_TUNNEL_DECLARE_FLAGS(flags); __be16 data; data = nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]); ip_tunnel_flags_from_be16(flags, data); ip_tunnel_clear_options_present(flags); ip_tunnel_flags_or(tun_info->key.tun_flags, tun_info->key.tun_flags, flags); } tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6; tun_info->options_len = opt_len; *ts = new_state; return 0; } static int ip6_tun_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); if (nla_put_be64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id, LWTUNNEL_IP6_PAD) || nla_put_in6_addr(skb, LWTUNNEL_IP6_DST, &tun_info->key.u.ipv6.dst) || nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) || nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) || nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, ip_tunnel_flags_to_be16(tun_info->key.tun_flags)) || ip_tun_fill_encap_opts(skb, LWTUNNEL_IP6_OPTS, tun_info)) return -ENOMEM; return 0; } static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate) { return nla_total_size_64bit(8) /* LWTUNNEL_IP6_ID */ + nla_total_size(16) /* LWTUNNEL_IP6_DST */ + nla_total_size(16) /* LWTUNNEL_IP6_SRC */ + nla_total_size(1) /* LWTUNNEL_IP6_HOPLIMIT */ + nla_total_size(1) /* LWTUNNEL_IP6_TC */ + nla_total_size(2) /* LWTUNNEL_IP6_FLAGS */ + ip_tun_opts_nlsize(lwt_tun_info(lwtstate)); /* LWTUNNEL_IP6_OPTS */ } static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = { .build_state = ip6_tun_build_state, .fill_encap = ip6_tun_fill_encap_info, .get_encap_size = ip6_tun_encap_nlsize, .cmp_encap = ip_tun_cmp_encap, .owner = THIS_MODULE, }; void __init ip_tunnel_core_init(void) { /* If you land here, make sure whether increasing ip_tunnel_info's * options_len is a reasonable choice with its usage in front ends * (f.e., it's part of flow keys, etc). */ BUILD_BUG_ON(IP_TUNNEL_OPTS_MAX != 255); lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP); lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6); } DEFINE_STATIC_KEY_FALSE(ip_tunnel_metadata_cnt); EXPORT_SYMBOL(ip_tunnel_metadata_cnt); void ip_tunnel_need_metadata(void) { static_branch_inc(&ip_tunnel_metadata_cnt); } EXPORT_SYMBOL_GPL(ip_tunnel_need_metadata); void ip_tunnel_unneed_metadata(void) { static_branch_dec(&ip_tunnel_metadata_cnt); } EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata); /* Returns either the correct skb->protocol value, or 0 if invalid. 
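 *
 * Commonly wired up through ip_tunnel_header_ops below; a layer-3 tunnel
 * device's setup callback may simply do
 *	dev->header_ops = &ip_tunnel_header_ops;
 * (a sketch of typical usage, not a requirement imposed by this helper).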
*/ __be16 ip_tunnel_parse_protocol(const struct sk_buff *skb) { if (skb_network_header(skb) >= skb->head && (skb_network_header(skb) + sizeof(struct iphdr)) <= skb_tail_pointer(skb) && ip_hdr(skb)->version == 4) return htons(ETH_P_IP); if (skb_network_header(skb) >= skb->head && (skb_network_header(skb) + sizeof(struct ipv6hdr)) <= skb_tail_pointer(skb) && ipv6_hdr(skb)->version == 6) return htons(ETH_P_IPV6); return 0; } EXPORT_SYMBOL(ip_tunnel_parse_protocol); const struct header_ops ip_tunnel_header_ops = { .parse_protocol = ip_tunnel_parse_protocol }; EXPORT_SYMBOL(ip_tunnel_header_ops); /* This function returns true when ENCAP attributes are present in the nl msg */ bool ip_tunnel_netlink_encap_parms(struct nlattr *data[], struct ip_tunnel_encap *encap) { bool ret = false; memset(encap, 0, sizeof(*encap)); if (!data) return ret; if (data[IFLA_IPTUN_ENCAP_TYPE]) { ret = true; encap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]); } if (data[IFLA_IPTUN_ENCAP_FLAGS]) { ret = true; encap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]); } if (data[IFLA_IPTUN_ENCAP_SPORT]) { ret = true; encap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]); } if (data[IFLA_IPTUN_ENCAP_DPORT]) { ret = true; encap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]); } return ret; } EXPORT_SYMBOL_GPL(ip_tunnel_netlink_encap_parms); void ip_tunnel_netlink_parms(struct nlattr *data[], struct ip_tunnel_parm_kern *parms) { if (data[IFLA_IPTUN_LINK]) parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); if (data[IFLA_IPTUN_LOCAL]) parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]); if (data[IFLA_IPTUN_REMOTE]) parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]); if (data[IFLA_IPTUN_TTL]) { parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]); if (parms->iph.ttl) parms->iph.frag_off = htons(IP_DF); } if (data[IFLA_IPTUN_TOS]) parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]); if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC])) parms->iph.frag_off = htons(IP_DF); if (data[IFLA_IPTUN_FLAGS]) { __be16 flags; flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]); ip_tunnel_flags_from_be16(parms->i_flags, flags); } if (data[IFLA_IPTUN_PROTO]) parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]); } EXPORT_SYMBOL_GPL(ip_tunnel_netlink_parms); |
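/*
 * A sketch of how an ipip/gre-style driver's ->newlink() callback commonly
 * consumes the two netlink helpers above (variable names are placeholders):
 *
 *	struct ip_tunnel_encap ipencap;
 *	struct ip_tunnel_parm_kern p;
 *
 *	if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
 *		err = ip_tunnel_encap_setup(tunnel, &ipencap);
 *		if (err < 0)
 *			return err;
 *	}
 *	ip_tunnel_netlink_parms(data, &p);
 *	... then hand p to the generic tunnel newlink path ...
 */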
// SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/file.c - kernfs file implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/pagemap.h> #include <linux/sched/mm.h> #include <linux/fsnotify.h> #include <linux/uio.h> #include "kernfs-internal.h" struct kernfs_open_node { struct rcu_head rcu_head; atomic_t event; wait_queue_head_t poll; struct list_head files; /* goes through kernfs_open_file.list */ unsigned int nr_mmapped; unsigned int nr_to_release; }; /* * kernfs_notify() may be called from any context and bounces notifications * through a work item. To minimize space overhead in kernfs_node, the * pending queue is implemented as a singly linked list of kernfs_nodes. * The list is terminated with the self pointer so that whether a * kernfs_node is on the list or not can be determined by testing the next * pointer for %NULL. */ #define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list) static DEFINE_SPINLOCK(kernfs_notify_lock); static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL; static inline struct mutex *kernfs_open_file_mutex_ptr(struct kernfs_node *kn) { int idx = hash_ptr(kn, NR_KERNFS_LOCK_BITS); return &kernfs_locks->open_file_mutex[idx]; } static inline struct mutex *kernfs_open_file_mutex_lock(struct kernfs_node *kn) { struct mutex *lock; lock = kernfs_open_file_mutex_ptr(kn); mutex_lock(lock); return lock; } /** * of_on - Get the kernfs_open_node of the specified kernfs_open_file * @of: target kernfs_open_file * * Return: the kernfs_open_node of the kernfs_open_file */ static struct kernfs_open_node *of_on(struct kernfs_open_file *of) { return rcu_dereference_protected(of->kn->attr.open, !list_empty(&of->list)); } /* Get active reference to kernfs node for an open file */ static struct kernfs_open_file *kernfs_get_active_of(struct kernfs_open_file *of) { /* Skip if file was already released */ if (unlikely(of->released)) return NULL; if (!kernfs_get_active(of->kn)) return NULL; return of; } static void kernfs_put_active_of(struct kernfs_open_file *of) { return kernfs_put_active(of->kn); } /** * kernfs_deref_open_node_locked - Get kernfs_open_node corresponding to @kn * * @kn: target kernfs_node. * * Fetch and return ->attr.open of @kn when caller holds the * kernfs_open_file_mutex_ptr(kn). * * Update of ->attr.open happens under kernfs_open_file_mutex_ptr(kn).
So when * the caller guarantees that this mutex is being held, other updaters can't * change ->attr.open and this means that we can safely deref ->attr.open * outside RCU read-side critical section. * * The caller needs to make sure that kernfs_open_file_mutex is held. * * Return: @kn->attr.open when kernfs_open_file_mutex is held. */ static struct kernfs_open_node * kernfs_deref_open_node_locked(struct kernfs_node *kn) { return rcu_dereference_protected(kn->attr.open, lockdep_is_held(kernfs_open_file_mutex_ptr(kn))); } static struct kernfs_open_file *kernfs_of(struct file *file) { return ((struct seq_file *)file->private_data)->private; } /* * Determine the kernfs_ops for the given kernfs_node. This function must * be called while holding an active reference. */ static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn) { if (kn->flags & KERNFS_LOCKDEP) lockdep_assert_held(kn); return kn->attr.ops; } /* * As kernfs_seq_stop() is also called after kernfs_seq_start() or * kernfs_seq_next() failure, it needs to distinguish whether it's stopping * a seq_file iteration which is fully initialized with an active reference * or an aborted kernfs_seq_start() due to get_active failure. The * position pointer is the only context for each seq_file iteration and * thus the stop condition should be encoded in it. As the return value is * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable * choice to indicate get_active failure. * * Unfortunately, this is complicated due to the optional custom seq_file * operations which may return ERR_PTR(-ENODEV) too. kernfs_seq_stop() * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or * custom seq_file operations and thus can't decide whether put_active * should be performed or not only on ERR_PTR(-ENODEV). * * This is worked around by factoring out the custom seq_stop() and * put_active part into kernfs_seq_stop_active(), skipping it from * kernfs_seq_stop() if ERR_PTR(-ENODEV) while invoking it directly after * custom seq_file operations fail with ERR_PTR(-ENODEV) - this ensures * that kernfs_seq_stop_active() is skipped only after get_active failure. */ static void kernfs_seq_stop_active(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops = kernfs_ops(of->kn); if (ops->seq_stop) ops->seq_stop(sf, v); kernfs_put_active_of(of); } static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) { struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops; /* * @of->mutex nests outside active ref and is primarily to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active_of(of)) return ERR_PTR(-ENODEV); ops = kernfs_ops(of->kn); if (ops->seq_start) { void *next = ops->seq_start(sf, ppos); /* see the comment above kernfs_seq_stop_active() */ if (next == ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, next); return next; } return single_start(sf, ppos); } static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) { struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops = kernfs_ops(of->kn); if (ops->seq_next) { void *next = ops->seq_next(sf, v, ppos); /* see the comment above kernfs_seq_stop_active() */ if (next == ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, next); return next; } else { /* * The same behavior and code as single_open(), always * terminate after the initial read. 
*/ ++*ppos; return NULL; } } static void kernfs_seq_stop(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; if (v != ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, v); mutex_unlock(&of->mutex); } static int kernfs_seq_show(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; of->event = atomic_read(&of_on(of)->event); return of->kn->attr.ops->seq_show(sf, v); } static const struct seq_operations kernfs_seq_ops = { .start = kernfs_seq_start, .next = kernfs_seq_next, .stop = kernfs_seq_stop, .show = kernfs_seq_show, }; /* * As reading a bin file can have side-effects, the exact offset and bytes * specified in read(2) call should be passed to the read callback making * it difficult to use seq_file. Implement simplistic custom buffering for * bin files. */ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct kernfs_open_file *of = kernfs_of(iocb->ki_filp); ssize_t len = min_t(size_t, iov_iter_count(iter), PAGE_SIZE); const struct kernfs_ops *ops; char *buf; buf = of->prealloc_buf; if (buf) mutex_lock(&of->prealloc_mutex); else buf = kmalloc(len, GFP_KERNEL); if (!buf) return -ENOMEM; /* * @of->mutex nests outside active ref and is used both to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active_of(of)) { len = -ENODEV; mutex_unlock(&of->mutex); goto out_free; } of->event = atomic_read(&of_on(of)->event); ops = kernfs_ops(of->kn); if (ops->read) len = ops->read(of, buf, len, iocb->ki_pos); else len = -EINVAL; kernfs_put_active_of(of); mutex_unlock(&of->mutex); if (len < 0) goto out_free; if (copy_to_iter(buf, len, iter) != len) { len = -EFAULT; goto out_free; } iocb->ki_pos += len; out_free: if (buf == of->prealloc_buf) mutex_unlock(&of->prealloc_mutex); else kfree(buf); return len; } static ssize_t kernfs_fop_read_iter(struct kiocb *iocb, struct iov_iter *iter) { if (kernfs_of(iocb->ki_filp)->kn->flags & KERNFS_HAS_SEQ_SHOW) return seq_read_iter(iocb, iter); return kernfs_file_read_iter(iocb, iter); } /* * Copy data in from userland and pass it to the matching kernfs write * operation. * * There is no easy way for us to know if userspace is only doing a partial * write, so we don't support them. We expect the entire buffer to come on * the first write. Hint: if you're writing a value, first read the file, * modify only the value you're changing, then write entire buffer * back. */ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter) { struct kernfs_open_file *of = kernfs_of(iocb->ki_filp); ssize_t len = iov_iter_count(iter); const struct kernfs_ops *ops; char *buf; if (of->atomic_write_len) { if (len > of->atomic_write_len) return -E2BIG; } else { len = min_t(size_t, len, PAGE_SIZE); } buf = of->prealloc_buf; if (buf) mutex_lock(&of->prealloc_mutex); else buf = kmalloc(len + 1, GFP_KERNEL); if (!buf) return -ENOMEM; if (copy_from_iter(buf, len, iter) != len) { len = -EFAULT; goto out_free; } buf[len] = '\0'; /* guarantee string termination */ /* * @of->mutex nests outside active ref and is used both to ensure that * the ops aren't called concurrently for the same open file. 
*/ mutex_lock(&of->mutex); if (!kernfs_get_active_of(of)) { mutex_unlock(&of->mutex); len = -ENODEV; goto out_free; } ops = kernfs_ops(of->kn); if (ops->write) len = ops->write(of, buf, len, iocb->ki_pos); else len = -EINVAL; kernfs_put_active_of(of); mutex_unlock(&of->mutex); if (len > 0) iocb->ki_pos += len; out_free: if (buf == of->prealloc_buf) mutex_unlock(&of->prealloc_mutex); else kfree(buf); return len; } static void kernfs_vma_open(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); if (!of->vm_ops) return; if (!kernfs_get_active_of(of)) return; if (of->vm_ops->open) of->vm_ops->open(vma); kernfs_put_active_of(of); } static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); vm_fault_t ret; if (!of->vm_ops) return VM_FAULT_SIGBUS; if (!kernfs_get_active_of(of)) return VM_FAULT_SIGBUS; ret = VM_FAULT_SIGBUS; if (of->vm_ops->fault) ret = of->vm_ops->fault(vmf); kernfs_put_active_of(of); return ret; } static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); vm_fault_t ret; if (!of->vm_ops) return VM_FAULT_SIGBUS; if (!kernfs_get_active_of(of)) return VM_FAULT_SIGBUS; ret = 0; if (of->vm_ops->page_mkwrite) ret = of->vm_ops->page_mkwrite(vmf); else file_update_time(file); kernfs_put_active_of(of); return ret; } static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write) { struct file *file = vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); int ret; if (!of->vm_ops) return -EINVAL; if (!kernfs_get_active_of(of)) return -EINVAL; ret = -EINVAL; if (of->vm_ops->access) ret = of->vm_ops->access(vma, addr, buf, len, write); kernfs_put_active_of(of); return ret; } static const struct vm_operations_struct kernfs_vm_ops = { .open = kernfs_vma_open, .fault = kernfs_vma_fault, .page_mkwrite = kernfs_vma_page_mkwrite, .access = kernfs_vma_access, }; static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) { struct kernfs_open_file *of = kernfs_of(file); const struct kernfs_ops *ops; int rc; /* * mmap path and of->mutex are prone to triggering spurious lockdep * warnings and we don't want to add spurious locking dependency * between the two. Check whether mmap is actually implemented * without grabbing @of->mutex by testing HAS_MMAP flag. See the * comment in kernfs_fop_open() for more details. */ if (!(of->kn->flags & KERNFS_HAS_MMAP)) return -ENODEV; mutex_lock(&of->mutex); rc = -ENODEV; if (!kernfs_get_active_of(of)) goto out_unlock; ops = kernfs_ops(of->kn); rc = ops->mmap(of, vma); if (rc) goto out_put; /* * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() * to satisfy versions of X which crash if the mmap fails: that * substitutes a new vm_file, and we don't then want bin_vm_ops. */ if (vma->vm_file != file) goto out_put; rc = -EINVAL; if (of->mmapped && of->vm_ops != vma->vm_ops) goto out_put; /* * It is not possible to successfully wrap close. * So error if someone is trying to use close. 
*/ if (vma->vm_ops && vma->vm_ops->close) goto out_put; rc = 0; if (!of->mmapped) { of->mmapped = true; of_on(of)->nr_mmapped++; of->vm_ops = vma->vm_ops; } vma->vm_ops = &kernfs_vm_ops; out_put: kernfs_put_active_of(of); out_unlock: mutex_unlock(&of->mutex); return rc; } /** * kernfs_get_open_node - get or create kernfs_open_node * @kn: target kernfs_node * @of: kernfs_open_file for this instance of open * * If @kn->attr.open exists, increment its reference count; otherwise, * create one. @of is chained to the files list. * * Locking: * Kernel thread context (may sleep). * * Return: * %0 on success, -errno on failure. */ static int kernfs_get_open_node(struct kernfs_node *kn, struct kernfs_open_file *of) { struct kernfs_open_node *on; struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); on = kernfs_deref_open_node_locked(kn); if (!on) { /* not there, initialize a new one */ on = kzalloc_obj(*on); if (!on) { mutex_unlock(mutex); return -ENOMEM; } atomic_set(&on->event, 1); init_waitqueue_head(&on->poll); INIT_LIST_HEAD(&on->files); rcu_assign_pointer(kn->attr.open, on); } list_add_tail(&of->list, &on->files); if (kn->flags & KERNFS_HAS_RELEASE) on->nr_to_release++; mutex_unlock(mutex); return 0; } /** * kernfs_unlink_open_file - Unlink @of from @kn. * * @kn: target kernfs_node * @of: associated kernfs_open_file * @open_failed: ->open() failed, cancel ->release() * * Unlink @of from list of @kn's associated open files. If list of * associated open files becomes empty, disassociate and free * kernfs_open_node. * * LOCKING: * None. */ static void kernfs_unlink_open_file(struct kernfs_node *kn, struct kernfs_open_file *of, bool open_failed) { struct kernfs_open_node *on; struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); on = kernfs_deref_open_node_locked(kn); if (!on) { mutex_unlock(mutex); return; } if (of) { if (kn->flags & KERNFS_HAS_RELEASE) { WARN_ON_ONCE(of->released == open_failed); if (open_failed) on->nr_to_release--; } if (of->mmapped) on->nr_mmapped--; list_del(&of->list); } if (list_empty(&on->files)) { rcu_assign_pointer(kn->attr.open, NULL); kfree_rcu(on, rcu_head); } mutex_unlock(mutex); } static int kernfs_fop_open(struct inode *inode, struct file *file) { struct kernfs_node *kn = inode->i_private; struct kernfs_root *root = kernfs_root(kn); const struct kernfs_ops *ops; struct kernfs_open_file *of; bool has_read, has_write, has_mmap; int error = -EACCES; if (!kernfs_get_active(kn)) return -ENODEV; ops = kernfs_ops(kn); has_read = ops->seq_show || ops->read || ops->mmap; has_write = ops->write || ops->mmap; has_mmap = ops->mmap; /* see the flag definition for details */ if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) { if ((file->f_mode & FMODE_WRITE) && (!(inode->i_mode & S_IWUGO) || !has_write)) goto err_out; if ((file->f_mode & FMODE_READ) && (!(inode->i_mode & S_IRUGO) || !has_read)) goto err_out; } /* allocate a kernfs_open_file for the file */ error = -ENOMEM; of = kzalloc_obj(struct kernfs_open_file); if (!of) goto err_out; /* * The following is done to give a different lockdep key to * @of->mutex for files which implement mmap. This is a rather * crude way to avoid false positive lockdep warning around * mm->mmap_lock - mmap nests @of->mutex under mm->mmap_lock and * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under * which mm->mmap_lock nests, while holding @of->mutex. As each * open file has a separate mutex, it's okay as long as those don't * happen on the same file. 
At this point, we can't easily give * each file a separate locking class. Let's differentiate on * whether the file has mmap or not for now. * * For similar reasons, writable and readonly files are given different * lockdep key, because the writable file /sys/power/resume may call vfs * lookup helpers for arbitrary paths and readonly files can be read by * overlayfs from vfs helpers when sysfs is a lower layer of overalyfs. * * All three cases look the same. They're supposed to * look that way and give @of->mutex different static lockdep keys. */ if (has_mmap) mutex_init(&of->mutex); else if (file->f_mode & FMODE_WRITE) mutex_init(&of->mutex); else mutex_init(&of->mutex); of->kn = kn; of->file = file; /* * Write path needs to atomic_write_len outside active reference. * Cache it in open_file. See kernfs_fop_write_iter() for details. */ of->atomic_write_len = ops->atomic_write_len; error = -EINVAL; /* * ->seq_show is incompatible with ->prealloc, * as seq_read does its own allocation. * ->read must be used instead. */ if (ops->prealloc && ops->seq_show) goto err_free; if (ops->prealloc) { int len = of->atomic_write_len ?: PAGE_SIZE; of->prealloc_buf = kmalloc(len + 1, GFP_KERNEL); error = -ENOMEM; if (!of->prealloc_buf) goto err_free; mutex_init(&of->prealloc_mutex); } /* * Always instantiate seq_file even if read access doesn't use * seq_file or is not requested. This unifies private data access * and readable regular files are the vast majority anyway. */ if (ops->seq_show) error = seq_open(file, &kernfs_seq_ops); else error = seq_open(file, NULL); if (error) goto err_free; of->seq_file = file->private_data; of->seq_file->private = of; /* seq_file clears PWRITE unconditionally, restore it if WRITE */ if (file->f_mode & FMODE_WRITE) file->f_mode |= FMODE_PWRITE; /* make sure we have open node struct */ error = kernfs_get_open_node(kn, of); if (error) goto err_seq_release; if (ops->open) { /* nobody has access to @of yet, skip @of->mutex */ error = ops->open(of); if (error) goto err_put_node; } /* open succeeded, put active references */ kernfs_put_active(kn); return 0; err_put_node: kernfs_unlink_open_file(kn, of, true); err_seq_release: seq_release(inode, file); err_free: kfree(of->prealloc_buf); kfree(of); err_out: kernfs_put_active(kn); return error; } /* used from release/drain to ensure that ->release() is called exactly once */ static void kernfs_release_file(struct kernfs_node *kn, struct kernfs_open_file *of) { /* * @of is guaranteed to have no other file operations in flight and * we just want to synchronize release and drain paths. * @kernfs_open_file_mutex_ptr(kn) is enough. @of->mutex can't be used * here because drain path may be called from places which can * cause circular dependency. */ lockdep_assert_held(kernfs_open_file_mutex_ptr(kn)); if (!of->released) { /* * A file is never detached without being released and we * need to be able to release files which are deactivated * and being drained. Don't use kernfs_ops(). 
*/ kn->attr.ops->release(of); of->released = true; of_on(of)->nr_to_release--; } } static int kernfs_fop_release(struct inode *inode, struct file *filp) { struct kernfs_node *kn = inode->i_private; struct kernfs_open_file *of = kernfs_of(filp); if (kn->flags & KERNFS_HAS_RELEASE) { struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); kernfs_release_file(kn, of); mutex_unlock(mutex); } kernfs_unlink_open_file(kn, of, false); seq_release(inode, filp); kfree(of->prealloc_buf); kfree(of); return 0; } bool kernfs_should_drain_open_files(struct kernfs_node *kn) { struct kernfs_open_node *on; bool ret; /* * @kn being deactivated guarantees that @kn->attr.open can't change * beneath us making the lockless test below safe. * Callers post kernfs_unbreak_active_protection may be counted in * kn->active by now, do not WARN_ON because of them. */ rcu_read_lock(); on = rcu_dereference(kn->attr.open); ret = on && (on->nr_mmapped || on->nr_to_release); rcu_read_unlock(); return ret; } void kernfs_drain_open_files(struct kernfs_node *kn) { struct kernfs_open_node *on; struct kernfs_open_file *of; struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); on = kernfs_deref_open_node_locked(kn); if (!on) { mutex_unlock(mutex); return; } list_for_each_entry(of, &on->files, list) { struct inode *inode = file_inode(of->file); if (of->mmapped) { unmap_mapping_range(inode->i_mapping, 0, 0, 1); of->mmapped = false; on->nr_mmapped--; } if (kn->flags & KERNFS_HAS_RELEASE) kernfs_release_file(kn, of); } WARN_ON_ONCE(on->nr_mmapped || on->nr_to_release); mutex_unlock(mutex); } /* * Kernfs attribute files are pollable. The idea is that you read * the content and then you use 'poll' or 'select' to wait for * the content to change. When the content changes (assuming the * manager for the kobject supports notification), poll will * return EPOLLERR|EPOLLPRI, and select will return the fd whether * it is waiting for read, write, or exceptions. * Once poll/select indicates that the value has changed, you * need to close and re-open the file, or seek to 0 and read again. * Reminder: this only works for attributes which actively support * it, and it is not possible to test an attribute from userspace * to see if it supports poll (Neither 'poll' nor 'select' return * an appropriate error code). When in doubt, set a suitable timeout value. */ __poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait) { struct kernfs_open_node *on = of_on(of); poll_wait(of->file, &on->poll, wait); if (of->event != atomic_read(&on->event)) return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; return DEFAULT_POLLMASK; } static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) { struct kernfs_open_file *of = kernfs_of(filp); struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); __poll_t ret; if (!kernfs_get_active_of(of)) return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; if (kn->attr.ops->poll) ret = kn->attr.ops->poll(of, wait); else ret = kernfs_generic_poll(of, wait); kernfs_put_active_of(of); return ret; } static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence) { struct kernfs_open_file *of = kernfs_of(file); const struct kernfs_ops *ops; loff_t ret; /* * @of->mutex nests outside active ref and is primarily to ensure that * the ops aren't called concurrently for the same open file. 
*/ mutex_lock(&of->mutex); if (!kernfs_get_active_of(of)) { mutex_unlock(&of->mutex); return -ENODEV; } ops = kernfs_ops(of->kn); if (ops->llseek) ret = ops->llseek(of, offset, whence); else ret = generic_file_llseek(file, offset, whence); kernfs_put_active_of(of); mutex_unlock(&of->mutex); return ret; } static void kernfs_notify_workfn(struct work_struct *work) { struct kernfs_node *kn; struct kernfs_super_info *info; struct kernfs_root *root; repeat: /* pop one off the notify_list */ spin_lock_irq(&kernfs_notify_lock); kn = kernfs_notify_list; if (kn == KERNFS_NOTIFY_EOL) { spin_unlock_irq(&kernfs_notify_lock); return; } kernfs_notify_list = kn->attr.notify_next; kn->attr.notify_next = NULL; spin_unlock_irq(&kernfs_notify_lock); root = kernfs_root(kn); /* kick fsnotify */ down_read(&root->kernfs_supers_rwsem); down_read(&root->kernfs_rwsem); list_for_each_entry(info, &kernfs_root(kn)->supers, node) { struct kernfs_node *parent; struct inode *p_inode = NULL; const char *kn_name; struct inode *inode; struct qstr name; /* * We want fsnotify_modify() on @kn but as the * modifications aren't originating from userland don't * have the matching @file available. Look up the inodes * and generate the events manually. */ inode = ilookup(info->sb, kernfs_ino(kn)); if (!inode) continue; kn_name = kernfs_rcu_name(kn); name = QSTR(kn_name); parent = kernfs_get_parent(kn); if (parent) { p_inode = ilookup(info->sb, kernfs_ino(parent)); if (p_inode) { fsnotify(FS_MODIFY | FS_EVENT_ON_CHILD, inode, FSNOTIFY_EVENT_INODE, p_inode, &name, inode, 0); iput(p_inode); } kernfs_put(parent); } if (!p_inode) fsnotify_inode(inode, FS_MODIFY); iput(inode); } up_read(&root->kernfs_rwsem); up_read(&root->kernfs_supers_rwsem); kernfs_put(kn); goto repeat; } /** * kernfs_notify - notify a kernfs file * @kn: file to notify * * Notify @kn such that poll(2) on @kn wakes up. Maybe be called from any * context. */ void kernfs_notify(struct kernfs_node *kn) { static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn); unsigned long flags; struct kernfs_open_node *on; if (WARN_ON(kernfs_type(kn) != KERNFS_FILE)) return; /* kick poll immediately */ rcu_read_lock(); on = rcu_dereference(kn->attr.open); if (on) { atomic_inc(&on->event); wake_up_interruptible(&on->poll); } rcu_read_unlock(); /* schedule work to kick fsnotify */ spin_lock_irqsave(&kernfs_notify_lock, flags); if (!kn->attr.notify_next) { kernfs_get(kn); kn->attr.notify_next = kernfs_notify_list; kernfs_notify_list = kn; schedule_work(&kernfs_notify_work); } spin_unlock_irqrestore(&kernfs_notify_lock, flags); } EXPORT_SYMBOL_GPL(kernfs_notify); const struct file_operations kernfs_file_fops = { .read_iter = kernfs_fop_read_iter, .write_iter = kernfs_fop_write_iter, .llseek = kernfs_fop_llseek, .mmap = kernfs_fop_mmap, .open = kernfs_fop_open, .release = kernfs_fop_release, .poll = kernfs_fop_poll, .fsync = noop_fsync, .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, }; /** * __kernfs_create_file - kernfs internal function to create a file * @parent: directory to create the file in * @name: name of the file * @mode: mode of the file * @uid: uid of the file * @gid: gid of the file * @size: size of the file * @ops: kernfs operations for the file * @priv: private data for the file * @ns: optional namespace tag of the file * @key: lockdep key for the file's active_ref, %NULL to disable lockdep * * Return: the created node on success, ERR_PTR() value on error. 
*/ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, loff_t size, const struct kernfs_ops *ops, void *priv, const struct ns_common *ns, struct lock_class_key *key) { struct kernfs_node *kn; unsigned flags; int rc; flags = KERNFS_FILE; kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, uid, gid, flags); if (!kn) return ERR_PTR(-ENOMEM); kn->attr.ops = ops; kn->attr.size = size; kn->ns = ns; kn->priv = priv; #ifdef CONFIG_DEBUG_LOCK_ALLOC if (key) { lockdep_init_map(&kn->dep_map, "kn->active", key, 0); kn->flags |= KERNFS_LOCKDEP; } #endif /* * kn->attr.ops is accessible only while holding active ref. We * need to know whether some ops are implemented outside active * ref. Cache their existence in flags. */ if (ops->seq_show) kn->flags |= KERNFS_HAS_SEQ_SHOW; if (ops->mmap) kn->flags |= KERNFS_HAS_MMAP; if (ops->release) kn->flags |= KERNFS_HAS_RELEASE; rc = kernfs_add_one(kn); if (rc) { kernfs_put(kn); return ERR_PTR(rc); } return kn; } |
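The comment above kernfs_generic_poll() spells out the userspace contract: read the attribute, wait in poll(2)/select(2), and when the wakeup arrives (EPOLLERR|EPOLLPRI) seek back to offset 0, or reopen, and read again. Below is a minimal userspace sketch of that loop. The attribute path is only a placeholder, and the loop only makes progress for attributes whose owners actually call kernfs_notify()/sysfs_notify().

/* Userspace sketch: poll a kernfs/sysfs attribute for changes. */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	/* Placeholder path; pass any notify-capable attribute as argv[1]. */
	const char *path = argc > 1 ? argv[1] : "/sys/class/power_supply/BAT0/status";
	char buf[256];
	int fd = open(path, O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	for (;;) {
		/* Re-read from offset 0 each time, as the comment instructs. */
		ssize_t n = pread(fd, buf, sizeof(buf) - 1, 0);

		if (n < 0) {
			perror("pread");
			break;
		}
		buf[n] = '\0';
		printf("value: %s", buf);

		struct pollfd pfd = { .fd = fd, .events = POLLPRI };

		/* kernfs_notify() wakes us with EPOLLERR|EPOLLPRI; then loop. */
		if (poll(&pfd, 1, -1) < 0) {
			perror("poll");
			break;
		}
	}
	close(fd);
	return 0;
}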
| 1 1 1 1 1 1 1 1 1 1 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 | // SPDX-License-Identifier: GPL-2.0 /* * linux/drivers/base/map.c * * (C) Copyright Al Viro 2002,2003 * * NOTE: data structure needs to be changed. It works, but for large dev_t * it will be too slow. It is isolated, though, so these changes will be * local to that file. */ #include <linux/module.h> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/kdev_t.h> #include <linux/kobject.h> #include <linux/kobj_map.h> struct kobj_map { struct probe { struct probe *next; dev_t dev; unsigned long range; struct module *owner; kobj_probe_t *get; int (*lock)(dev_t, void *); void *data; } *probes[255]; struct mutex *lock; }; int kobj_map(struct kobj_map *domain, dev_t dev, unsigned long range, struct module *module, kobj_probe_t *probe, int (*lock)(dev_t, void *), void *data) { unsigned int n = MAJOR(dev + range - 1) - MAJOR(dev) + 1; unsigned int index = MAJOR(dev); unsigned int i; struct probe *p; if (n > 255) n = 255; p = kmalloc_objs(struct probe, n); if (p == NULL) return -ENOMEM; for (i = 0; i < n; i++, p++) { p->owner = module; p->get = probe; p->lock = lock; p->dev = dev; p->range = range; p->data = data; } mutex_lock(domain->lock); for (i = 0, p -= n; i < n; i++, p++, index++) { struct probe **s = &domain->probes[index % 255]; while (*s && (*s)->range < range) s = &(*s)->next; p->next = *s; *s = p; } mutex_unlock(domain->lock); return 0; } void kobj_unmap(struct kobj_map *domain, dev_t dev, unsigned long range) { unsigned int n = MAJOR(dev + range - 1) - MAJOR(dev) + 1; unsigned int index = MAJOR(dev); unsigned int i; struct probe *found = NULL; if (n > 255) n = 255; mutex_lock(domain->lock); for (i = 0; i < n; i++, index++) { struct probe **s; for (s = &domain->probes[index % 255]; *s; s = &(*s)->next) { struct probe *p = *s; if (p->dev == dev && p->range == range) { *s = p->next; if (!found) found = p; break; } } } mutex_unlock(domain->lock); kfree(found); } struct kobject *kobj_lookup(struct kobj_map *domain, dev_t dev, int *index) { struct kobject *kobj; struct probe *p; unsigned long best = ~0UL; retry: mutex_lock(domain->lock); for (p = domain->probes[MAJOR(dev) % 255]; p; p = p->next) { struct kobject *(*probe)(dev_t, int *, void *); struct module *owner; void *data; if (p->dev > dev || p->dev + p->range - 1 < dev) continue; if (p->range - 1 >= best) break; if (!try_module_get(p->owner)) continue; owner = p->owner; data = p->data; probe = p->get; best = p->range - 1; *index = dev - p->dev; if (p->lock && p->lock(dev, data) < 0) { module_put(owner); continue; } mutex_unlock(domain->lock); kobj = probe(dev, index, data); /* Currently ->owner protects _only_ ->probe() itself. 
*/ module_put(owner); if (kobj) return kobj; goto retry; } mutex_unlock(domain->lock); return NULL; } struct kobj_map *kobj_map_init(kobj_probe_t *base_probe, struct mutex *lock) { struct kobj_map *p = kmalloc_obj(struct kobj_map); struct probe *base = kzalloc_obj(*base); int i; if ((p == NULL) || (base == NULL)) { kfree(p); kfree(base); return NULL; } base->dev = 1; base->range = ~0; base->get = base_probe; for (i = 0; i < 255; i++) p->probes[i] = base; p->lock = lock; return p; } |
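As a rough model of what kobj_map()/kobj_lookup() above do, here is a minimal userspace sketch: probes are hashed by MAJOR(dev) % 255 and each chain is kept sorted by ascending range, so the most specific (smallest-range) mapping is found first. The struct and function names are illustrative only; locking, module reference counting and the real probe callback are all omitted.

/* Userspace sketch of the kobj_map hashing and sorted-chain lookup. */
#include <stdio.h>
#include <stdlib.h>

#define MINORBITS	20
#define MKDEV(ma, mi)	(((unsigned int)(ma) << MINORBITS) | (mi))
#define MAJOR(dev)	((unsigned int)((dev) >> MINORBITS))
#define MINORMASK	((1u << MINORBITS) - 1)

struct probe {
	struct probe *next;
	unsigned int dev;	/* first dev covered */
	unsigned long range;	/* number of minors covered */
	const char *name;	/* stands in for the probe callback/data */
};

static struct probe *probes[255];

/* Insert keeping each chain sorted by ascending range (most specific first). */
static void map(unsigned int dev, unsigned long range, const char *name)
{
	struct probe *p = malloc(sizeof(*p));
	struct probe **s = &probes[MAJOR(dev) % 255];

	if (!p)
		return;
	p->dev = dev;
	p->range = range;
	p->name = name;
	while (*s && (*s)->range < range)
		s = &(*s)->next;
	p->next = *s;
	*s = p;
}

static const char *lookup(unsigned int dev)
{
	struct probe *p;

	for (p = probes[MAJOR(dev) % 255]; p; p = p->next)
		if (p->dev <= dev && dev <= p->dev + p->range - 1)
			return p->name;	/* first hit has the smallest range */
	return "(none)";
}

int main(void)
{
	map(MKDEV(8, 0), MINORMASK, "whole-major fallback");
	map(MKDEV(8, 0), 16, "sda partitions");

	printf("8:3  -> %s\n", lookup(MKDEV(8, 3)));	/* sda partitions */
	printf("8:64 -> %s\n", lookup(MKDEV(8, 64)));	/* whole-major fallback */
	return 0;
}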
| 44 44 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 | /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef LLIST_H #define LLIST_H /* * Lock-less NULL terminated single linked list * * Cases where locking is not needed: * If there are multiple producers and multiple consumers, llist_add can be * used in producers and llist_del_all can be used in consumers simultaneously * without locking. Also a single consumer can use llist_del_first while * multiple producers simultaneously use llist_add, without any locking. * * Cases where locking is needed: * If we have multiple consumers with llist_del_first used in one consumer, and * llist_del_first or llist_del_all used in other consumers, then a lock is * needed. This is because llist_del_first depends on list->first->next not * changing, but without lock protection, there's no way to be sure about that * if a preemption happens in the middle of the delete operation and on being * preempted back, the list->first is the same as before causing the cmpxchg in * llist_del_first to succeed. For example, while a llist_del_first operation * is in progress in one consumer, then a llist_del_first, llist_add, * llist_add (or llist_del_all, llist_add, llist_add) sequence in another * consumer may cause violations. * * This can be summarized as follows: * * | add | del_first | del_all * add | - | - | - * del_first | | L | L * del_all | | | - * * Where, a particular row's operation can happen concurrently with a column's * operation, with "-" being no lock needed, while "L" being lock is needed. * * The list entries deleted via llist_del_all can be traversed with * traversing function such as llist_for_each etc. But the list * entries can not be traversed safely before deleted from the list. * The order of deleted entries is from the newest to the oldest added * one. If you want to traverse from the oldest to the newest, you * must reverse the order by yourself before traversing. * * The basic atomic operation of this list is cmpxchg on long. On * architectures that don't have NMI-safe cmpxchg implementation, the * list can NOT be used in NMI handlers. So code that uses the list in * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. * * Copyright 2010,2011 Intel Corp. 
* Author: Huang Ying <ying.huang@intel.com> */ #include <linux/atomic.h> #include <linux/container_of.h> #include <linux/stddef.h> #include <linux/types.h> struct llist_head { struct llist_node *first; }; struct llist_node { struct llist_node *next; }; #define LLIST_HEAD_INIT(name) { NULL } #define LLIST_HEAD(name) struct llist_head name = LLIST_HEAD_INIT(name) /** * init_llist_head - initialize lock-less list head * @head: the head for your lock-less list */ static inline void init_llist_head(struct llist_head *list) { list->first = NULL; } /** * init_llist_node - initialize lock-less list node * @node: the node to be initialised * * In cases where there is a need to test if a node is on * a list or not, this initialises the node to clearly * not be on any list. */ static inline void init_llist_node(struct llist_node *node) { WRITE_ONCE(node->next, node); } /** * llist_on_list - test if a lock-list list node is on a list * @node: the node to test * * When a node is on a list the ->next pointer will be NULL or * some other node. It can never point to itself. We use that * in init_llist_node() to record that a node is not on any list, * and here to test whether it is on any list. */ static inline bool llist_on_list(const struct llist_node *node) { return READ_ONCE(node->next) != node; } /** * llist_entry - get the struct of this entry * @ptr: the &struct llist_node pointer. * @type: the type of the struct this is embedded in. * @member: the name of the llist_node within the struct. */ #define llist_entry(ptr, type, member) \ container_of(ptr, type, member) /** * member_address_is_nonnull - check whether the member address is not NULL * @ptr: the object pointer (struct type * that contains the llist_node) * @member: the name of the llist_node within the struct. * * This macro is conceptually the same as * &ptr->member != NULL * but it works around the fact that compilers can decide that taking a member * address is never a NULL pointer. * * Real objects that start at a high address and have a member at NULL are * unlikely to exist, but such pointers may be returned e.g. by the * container_of() macro. */ #define member_address_is_nonnull(ptr, member) \ ((uintptr_t)(ptr) + offsetof(typeof(*(ptr)), member) != 0) /** * llist_for_each - iterate over some deleted entries of a lock-less list * @pos: the &struct llist_node to use as a loop cursor * @node: the first entry of deleted list entries * * In general, some entries of the lock-less list can be traversed * safely only after being deleted from list, so start with an entry * instead of list head. * * If being used on entries deleted from lock-less list directly, the * traverse order is from the newest to the oldest added entry. If * you want to traverse from the oldest to the newest, you must * reverse the order by yourself before traversing. */ #define llist_for_each(pos, node) \ for ((pos) = (node); pos; (pos) = (pos)->next) /** * llist_for_each_safe - iterate over some deleted entries of a lock-less list * safe against removal of list entry * @pos: the &struct llist_node to use as a loop cursor * @n: another &struct llist_node to use as temporary storage * @node: the first entry of deleted list entries * * In general, some entries of the lock-less list can be traversed * safely only after being deleted from list, so start with an entry * instead of list head. * * If being used on entries deleted from lock-less list directly, the * traverse order is from the newest to the oldest added entry. 
If * you want to traverse from the oldest to the newest, you must * reverse the order by yourself before traversing. */ #define llist_for_each_safe(pos, n, node) \ for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n)) /** * llist_for_each_entry - iterate over some deleted entries of lock-less list of given type * @pos: the type * to use as a loop cursor. * @node: the fist entry of deleted list entries. * @member: the name of the llist_node with the struct. * * In general, some entries of the lock-less list can be traversed * safely only after being removed from list, so start with an entry * instead of list head. * * If being used on entries deleted from lock-less list directly, the * traverse order is from the newest to the oldest added entry. If * you want to traverse from the oldest to the newest, you must * reverse the order by yourself before traversing. */ #define llist_for_each_entry(pos, node, member) \ for ((pos) = llist_entry((node), typeof(*(pos)), member); \ member_address_is_nonnull(pos, member); \ (pos) = llist_entry((pos)->member.next, typeof(*(pos)), member)) /** * llist_for_each_entry_safe - iterate over some deleted entries of lock-less list of given type * safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @node: the first entry of deleted list entries. * @member: the name of the llist_node with the struct. * * In general, some entries of the lock-less list can be traversed * safely only after being removed from list, so start with an entry * instead of list head. * * If being used on entries deleted from lock-less list directly, the * traverse order is from the newest to the oldest added entry. If * you want to traverse from the oldest to the newest, you must * reverse the order by yourself before traversing. */ #define llist_for_each_entry_safe(pos, n, node, member) \ for (pos = llist_entry((node), typeof(*pos), member); \ member_address_is_nonnull(pos, member) && \ (n = llist_entry(pos->member.next, typeof(*n), member), true); \ pos = n) /** * llist_empty - tests whether a lock-less list is empty * @head: the list to test * * Not guaranteed to be accurate or up to date. Just a quick way to * test whether the list is empty without deleting something from the * list. */ static inline bool llist_empty(const struct llist_head *head) { return READ_ONCE(head->first) == NULL; } static inline struct llist_node *llist_next(struct llist_node *node) { return READ_ONCE(node->next); } /** * llist_add_batch - add several linked entries in batch * @new_first: first entry in batch to be added * @new_last: last entry in batch to be added * @head: the head for your lock-less list * * Return whether list is empty before adding. */ static inline bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, struct llist_head *head) { struct llist_node *first = READ_ONCE(head->first); do { new_last->next = first; } while (!try_cmpxchg(&head->first, &first, new_first)); return !first; } static inline bool __llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, struct llist_head *head) { new_last->next = head->first; head->first = new_first; return new_last->next == NULL; } /** * llist_add - add a new entry * @new: new entry to be added * @head: the head for your lock-less list * * Returns true if the list was empty prior to adding this entry. 
*/ static inline bool llist_add(struct llist_node *new, struct llist_head *head) { return llist_add_batch(new, new, head); } static inline bool __llist_add(struct llist_node *new, struct llist_head *head) { return __llist_add_batch(new, new, head); } /** * llist_del_all - delete all entries from lock-less list * @head: the head of lock-less list to delete all entries * * If list is empty, return NULL, otherwise, delete all entries and * return the pointer to the first entry. The order of entries * deleted is from the newest to the oldest added one. */ static inline struct llist_node *llist_del_all(struct llist_head *head) { return xchg(&head->first, NULL); } static inline struct llist_node *__llist_del_all(struct llist_head *head) { struct llist_node *first = head->first; head->first = NULL; return first; } extern struct llist_node *llist_del_first(struct llist_head *head); /** * llist_del_first_init - delete first entry from lock-list and mark is as being off-list * @head: the head of lock-less list to delete from. * * This behave the same as llist_del_first() except that llist_init_node() is called * on the returned node so that llist_on_list() will report false for the node. */ static inline struct llist_node *llist_del_first_init(struct llist_head *head) { struct llist_node *n = llist_del_first(head); if (n) init_llist_node(n); return n; } extern bool llist_del_first_this(struct llist_head *head, struct llist_node *this); struct llist_node *llist_reverse_order(struct llist_node *head); #endif /* LLIST_H */ |
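For illustration, the add/del_all pattern above can be mimicked in userspace with C11 <stdatomic.h>: push with a compare-and-exchange loop, take the whole list with an atomic exchange, then reverse it to get oldest-first order. This is only an analogue of llist_add_batch()/llist_del_all()/llist_reverse_order(), not the kernel implementation, which uses try_cmpxchg() and cares about NMI safety.

/* Userspace sketch of the lock-less list push / take-all / reverse pattern. */
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct node {
	struct node *next;
	int val;
};

static _Atomic(struct node *) head;

/* llist_add analogue: lock-free push, returns nonzero if the list was empty. */
static int push(struct node *n)
{
	struct node *first = atomic_load(&head);

	do {
		n->next = first;
	} while (!atomic_compare_exchange_weak(&head, &first, n));
	return first == NULL;
}

/* llist_del_all analogue: atomically take the whole list (newest first). */
static struct node *del_all(void)
{
	return atomic_exchange(&head, NULL);
}

/* llist_reverse_order analogue: flip the chain to oldest-first. */
static struct node *reverse(struct node *n)
{
	struct node *prev = NULL;

	while (n) {
		struct node *next = n->next;

		n->next = prev;
		prev = n;
		n = next;
	}
	return prev;
}

int main(void)
{
	struct node a = { .val = 1 }, b = { .val = 2 }, c = { .val = 3 };
	struct node *p;

	push(&a);
	push(&b);
	push(&c);

	for (p = reverse(del_all()); p; p = p->next)
		printf("%d\n", p->val);	/* prints 1 2 3 (oldest to newest) */
	return 0;
}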
| 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/bitmap.h> #include <linux/ctype.h> #include <linux/errno.h> #include <linux/err.h> #include <linux/export.h> #include <linux/hex.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include "kstrtox.h" /** * bitmap_parse_user - convert an ASCII hex string in a user buffer into a bitmap * * @ubuf: pointer to user buffer containing string. * @ulen: buffer size in bytes. If string is smaller than this * then it must be terminated with a \0. * @maskp: pointer to bitmap array that will contain result. * @nmaskbits: size of bitmap, in bits. */ int bitmap_parse_user(const char __user *ubuf, unsigned int ulen, unsigned long *maskp, int nmaskbits) { char *buf; int ret; buf = memdup_user_nul(ubuf, ulen); if (IS_ERR(buf)) return PTR_ERR(buf); ret = bitmap_parse(buf, UINT_MAX, maskp, nmaskbits); kfree(buf); return ret; } EXPORT_SYMBOL(bitmap_parse_user); /** * bitmap_print_to_pagebuf - convert bitmap to list or hex format ASCII string * @list: indicates whether the bitmap must be list * @buf: page aligned buffer into which string is placed * @maskp: pointer to bitmap to convert * @nmaskbits: size of bitmap, in bits * * Output format is a comma-separated list of decimal numbers and * ranges if list is specified or hex digits grouped into comma-separated * sets of 8 digits/set. Returns the number of characters written to buf. * * It is assumed that @buf is a pointer into a PAGE_SIZE, page-aligned * area and that sufficient storage remains at @buf to accommodate the * bitmap_print_to_pagebuf() output. 
Returns the number of characters * actually printed to @buf, excluding terminating '\0'. */ int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, int nmaskbits) { ptrdiff_t len = PAGE_SIZE - offset_in_page(buf); return list ? scnprintf(buf, len, "%*pbl\n", nmaskbits, maskp) : scnprintf(buf, len, "%*pb\n", nmaskbits, maskp); } EXPORT_SYMBOL(bitmap_print_to_pagebuf); /** * bitmap_print_to_buf - convert bitmap to list or hex format ASCII string * @list: indicates whether the bitmap must be list * true: print in decimal list format * false: print in hexadecimal bitmask format * @buf: buffer into which string is placed * @maskp: pointer to bitmap to convert * @nmaskbits: size of bitmap, in bits * @off: in the string from which we are copying, We copy to @buf * @count: the maximum number of bytes to print */ static int bitmap_print_to_buf(bool list, char *buf, const unsigned long *maskp, int nmaskbits, loff_t off, size_t count) { const char *fmt = list ? "%*pbl\n" : "%*pb\n"; ssize_t size; void *data; data = kasprintf(GFP_KERNEL, fmt, nmaskbits, maskp); if (!data) return -ENOMEM; size = memory_read_from_buffer(buf, count, &off, data, strlen(data) + 1); kfree(data); return size; } /** * bitmap_print_bitmask_to_buf - convert bitmap to hex bitmask format ASCII string * @buf: buffer into which string is placed * @maskp: pointer to bitmap to convert * @nmaskbits: size of bitmap, in bits * @off: in the string from which we are copying, We copy to @buf * @count: the maximum number of bytes to print * * The bitmap_print_to_pagebuf() is used indirectly via its cpumap wrapper * cpumap_print_to_pagebuf() or directly by drivers to export hexadecimal * bitmask and decimal list to userspace by sysfs ABI. * Drivers might be using a normal attribute for this kind of ABIs. A * normal attribute typically has show entry as below:: * * static ssize_t example_attribute_show(struct device *dev, * struct device_attribute *attr, char *buf) * { * ... * return bitmap_print_to_pagebuf(true, buf, &mask, nr_trig_max); * } * * show entry of attribute has no offset and count parameters and this * means the file is limited to one page only. * bitmap_print_to_pagebuf() API works terribly well for this kind of * normal attribute with buf parameter and without offset, count:: * * bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, * int nmaskbits) * { * } * * The problem is once we have a large bitmap, we have a chance to get a * bitmask or list more than one page. Especially for list, it could be * as complex as 0,3,5,7,9,... We have no simple way to know it exact size. * It turns out bin_attribute is a way to break this limit. bin_attribute * has show entry as below:: * * static ssize_t * example_bin_attribute_show(struct file *filp, struct kobject *kobj, * struct bin_attribute *attr, char *buf, * loff_t offset, size_t count) * { * ... * } * * With the new offset and count parameters, this makes sysfs ABI be able * to support file size more than one page. For example, offset could be * >= 4096. * bitmap_print_bitmask_to_buf(), bitmap_print_list_to_buf() wit their * cpumap wrapper cpumap_print_bitmask_to_buf(), cpumap_print_list_to_buf() * make those drivers be able to support large bitmask and list after they * move to use bin_attribute. In result, we have to pass the corresponding * parameters such as off, count from bin_attribute show entry to this API. 
* * The role of cpumap_print_bitmask_to_buf() and cpumap_print_list_to_buf() * is similar with cpumap_print_to_pagebuf(), the difference is that * bitmap_print_to_pagebuf() mainly serves sysfs attribute with the assumption * the destination buffer is exactly one page and won't be more than one page. * cpumap_print_bitmask_to_buf() and cpumap_print_list_to_buf(), on the other * hand, mainly serves bin_attribute which doesn't work with exact one page, * and it can break the size limit of converted decimal list and hexadecimal * bitmask. * * WARNING! * * This function is not a replacement for sprintf() or bitmap_print_to_pagebuf(). * It is intended to workaround sysfs limitations discussed above and should be * used carefully in general case for the following reasons: * * - Time complexity is O(nbits^2/count), comparing to O(nbits) for snprintf(). * - Memory complexity is O(nbits), comparing to O(1) for snprintf(). * - @off and @count are NOT offset and number of bits to print. * - If printing part of bitmap as list, the resulting string is not a correct * list representation of bitmap. Particularly, some bits within or out of * related interval may be erroneously set or unset. The format of the string * may be broken, so bitmap_parselist-like parser may fail parsing it. * - If printing the whole bitmap as list by parts, user must ensure the order * of calls of the function such that the offset is incremented linearly. * - If printing the whole bitmap as list by parts, user must keep bitmap * unchanged between the very first and very last call. Otherwise concatenated * result may be incorrect, and format may be broken. * * Returns the number of characters actually printed to @buf */ int bitmap_print_bitmask_to_buf(char *buf, const unsigned long *maskp, int nmaskbits, loff_t off, size_t count) { return bitmap_print_to_buf(false, buf, maskp, nmaskbits, off, count); } EXPORT_SYMBOL(bitmap_print_bitmask_to_buf); /** * bitmap_print_list_to_buf - convert bitmap to decimal list format ASCII string * @buf: buffer into which string is placed * @maskp: pointer to bitmap to convert * @nmaskbits: size of bitmap, in bits * @off: in the string from which we are copying, We copy to @buf * @count: the maximum number of bytes to print * * Everything is same with the above bitmap_print_bitmask_to_buf() except * the print format. */ int bitmap_print_list_to_buf(char *buf, const unsigned long *maskp, int nmaskbits, loff_t off, size_t count) { return bitmap_print_to_buf(true, buf, maskp, nmaskbits, off, count); } EXPORT_SYMBOL(bitmap_print_list_to_buf); /* * Region 9-38:4/10 describes the following bitmap structure: * 0 9 12 18 38 N * .........****......****......****.................. 
* ^ ^ ^ ^ ^ * start off group_len end nbits */ struct region { unsigned int start; unsigned int off; unsigned int group_len; unsigned int end; unsigned int nbits; }; static void bitmap_set_region(const struct region *r, unsigned long *bitmap) { unsigned int start; for (start = r->start; start <= r->end; start += r->group_len) bitmap_set(bitmap, start, min(r->end - start + 1, r->off)); } static int bitmap_check_region(const struct region *r) { if (r->start > r->end || r->group_len == 0 || r->off > r->group_len) return -EINVAL; if (r->end >= r->nbits) return -ERANGE; return 0; } static const char *bitmap_getnum(const char *str, unsigned int *num, unsigned int lastbit) { unsigned long long n; unsigned int len; if (str[0] == 'N') { *num = lastbit; return str + 1; } len = _parse_integer(str, 10, &n); if (!len) return ERR_PTR(-EINVAL); if (len & KSTRTOX_OVERFLOW || n != (unsigned int)n) return ERR_PTR(-EOVERFLOW); *num = n; return str + len; } static inline bool end_of_str(char c) { return c == '\0' || c == '\n'; } static inline bool __end_of_region(char c) { return isspace(c) || c == ','; } static inline bool end_of_region(char c) { return __end_of_region(c) || end_of_str(c); } /* * The format allows commas and whitespaces at the beginning * of the region. */ static const char *bitmap_find_region(const char *str) { while (__end_of_region(*str)) str++; return end_of_str(*str) ? NULL : str; } static const char *bitmap_find_region_reverse(const char *start, const char *end) { while (start <= end && __end_of_region(*end)) end--; return end; } static const char *bitmap_parse_region(const char *str, struct region *r) { unsigned int lastbit = r->nbits - 1; if (!strncasecmp(str, "all", 3)) { r->start = 0; r->end = lastbit; str += 3; goto check_pattern; } str = bitmap_getnum(str, &r->start, lastbit); if (IS_ERR(str)) return str; if (end_of_region(*str)) goto no_end; if (*str != '-') return ERR_PTR(-EINVAL); str = bitmap_getnum(str + 1, &r->end, lastbit); if (IS_ERR(str)) return str; check_pattern: if (end_of_region(*str)) goto no_pattern; if (*str != ':') return ERR_PTR(-EINVAL); str = bitmap_getnum(str + 1, &r->off, lastbit); if (IS_ERR(str)) return str; if (*str != '/') return ERR_PTR(-EINVAL); return bitmap_getnum(str + 1, &r->group_len, lastbit); no_end: r->end = r->start; no_pattern: r->off = r->end + 1; r->group_len = r->end + 1; return end_of_str(*str) ? NULL : str; } /** * bitmap_parselist - convert list format ASCII string to bitmap * @buf: read user string from this buffer; must be terminated * with a \0 or \n. * @maskp: write resulting mask here * @nmaskbits: number of bits in mask to be written * * Input format is a comma-separated list of decimal numbers and * ranges. Consecutively set bits are shown as two hyphen-separated * decimal numbers, the smallest and largest bit numbers set in * the range. * Optionally each range can be postfixed to denote that only parts of it * should be set. The range will divided to groups of specific size. * From each group will be used only defined amount of bits. * Syntax: range:used_size/group_size * Example: 0-1023:2/256 ==> 0,1,256,257,512,513,768,769 * The value 'N' can be used as a dynamically substituted token for the * maximum allowed value; i.e (nmaskbits - 1). Keep in mind that it is * dynamic, so if system changes cause the bitmap width to change, such * as more cores in a CPU list, then any ranges using N will also change. * * Returns: 0 on success, -errno on invalid input strings. 
Error values: * * - ``-EINVAL``: wrong region format * - ``-EINVAL``: invalid character in string * - ``-ERANGE``: bit number specified too large for mask * - ``-EOVERFLOW``: integer overflow in the input parameters */ int bitmap_parselist(const char *buf, unsigned long *maskp, int nmaskbits) { struct region r; long ret; r.nbits = nmaskbits; bitmap_zero(maskp, r.nbits); while (buf) { buf = bitmap_find_region(buf); if (buf == NULL) return 0; buf = bitmap_parse_region(buf, &r); if (IS_ERR(buf)) return PTR_ERR(buf); ret = bitmap_check_region(&r); if (ret) return ret; bitmap_set_region(&r, maskp); } return 0; } EXPORT_SYMBOL(bitmap_parselist); /** * bitmap_parselist_user() - convert user buffer's list format ASCII * string to bitmap * * @ubuf: pointer to user buffer containing string. * @ulen: buffer size in bytes. If string is smaller than this * then it must be terminated with a \0. * @maskp: pointer to bitmap array that will contain result. * @nmaskbits: size of bitmap, in bits. * * Wrapper for bitmap_parselist(), providing it with user buffer. */ int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen, unsigned long *maskp, int nmaskbits) { char *buf; int ret; buf = memdup_user_nul(ubuf, ulen); if (IS_ERR(buf)) return PTR_ERR(buf); ret = bitmap_parselist(buf, maskp, nmaskbits); kfree(buf); return ret; } EXPORT_SYMBOL(bitmap_parselist_user); static const char *bitmap_get_x32_reverse(const char *start, const char *end, u32 *num) { u32 ret = 0; int c, i; for (i = 0; i < 32; i += 4) { c = hex_to_bin(*end--); if (c < 0) return ERR_PTR(-EINVAL); ret |= c << i; if (start > end || __end_of_region(*end)) goto out; } if (hex_to_bin(*end--) >= 0) return ERR_PTR(-EOVERFLOW); out: *num = ret; return end; } /** * bitmap_parse - convert an ASCII hex string into a bitmap. * @start: pointer to buffer containing string. * @buflen: buffer size in bytes. If string is smaller than this * then it must be terminated with a \0 or \n. In that case, * UINT_MAX may be provided instead of string length. * @maskp: pointer to bitmap array that will contain result. * @nmaskbits: size of bitmap, in bits. * * Commas group hex digits into chunks. Each chunk defines exactly 32 * bits of the resultant bitmask. No chunk may specify a value larger * than 32 bits (%-EOVERFLOW), and if a chunk specifies a smaller value * then leading 0-bits are prepended. %-EINVAL is returned for illegal * characters. Grouping such as "1,,5", ",44", "," or "" is allowed. * Leading, embedded and trailing whitespace accepted. */ int bitmap_parse(const char *start, unsigned int buflen, unsigned long *maskp, int nmaskbits) { const char *end = strnchrnul(start, buflen, '\n') - 1; int chunks = BITS_TO_U32(nmaskbits); u32 *bitmap = (u32 *)maskp; int unset_bit; int chunk; for (chunk = 0; ; chunk++) { end = bitmap_find_region_reverse(start, end); if (start > end) break; if (!chunks--) return -EOVERFLOW; #if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN) end = bitmap_get_x32_reverse(start, end, &bitmap[chunk ^ 1]); #else end = bitmap_get_x32_reverse(start, end, &bitmap[chunk]); #endif if (IS_ERR(end)) return PTR_ERR(end); } unset_bit = (BITS_TO_U32(nmaskbits) - chunks) * 32; if (unset_bit < nmaskbits) { bitmap_clear(maskp, unset_bit, nmaskbits - unset_bit); return 0; } if (find_next_bit(maskp, unset_bit, nmaskbits) != unset_bit) return -EOVERFLOW; return 0; } EXPORT_SYMBOL(bitmap_parse); |
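The "start-end:used_size/group_size" pattern handled by bitmap_set_region() above is easy to misread, so here is a small userspace sketch that expands a region the same way: step through the range in group_len strides and take the first off bits of each group. The function and parameter names are made up for the example; the expected output matches the 0-1023:2/256 example from the bitmap_parselist() comment.

/* Userspace sketch of the region expansion done by bitmap_set_region(). */
#include <stdio.h>

static void expand_region(unsigned int start, unsigned int end,
			  unsigned int used, unsigned int group)
{
	unsigned int s, i;

	for (s = start; s <= end; s += group)
		for (i = s; i < s + used && i <= end; i++)
			printf("%u ", i);
	printf("\n");
}

int main(void)
{
	/* "0-1023:2/256": take 2 bits out of every group of 256 across 0..1023 */
	expand_region(0, 1023, 2, 256);	/* 0 1 256 257 512 513 768 769 */
	return 0;
}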
| 54 21 21 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PAGE_REF_H #define _LINUX_PAGE_REF_H #include <linux/atomic.h> #include <linux/mm_types.h> #include <linux/page-flags.h> #include <linux/tracepoint-defs.h> DECLARE_TRACEPOINT(page_ref_set); DECLARE_TRACEPOINT(page_ref_mod); DECLARE_TRACEPOINT(page_ref_mod_and_test); DECLARE_TRACEPOINT(page_ref_mod_and_return); DECLARE_TRACEPOINT(page_ref_mod_unless); DECLARE_TRACEPOINT(page_ref_freeze); DECLARE_TRACEPOINT(page_ref_unfreeze); #ifdef CONFIG_DEBUG_PAGE_REF /* * Ideally we would want to use the trace_<tracepoint>_enabled() helper * functions. But due to include header file issues, that is not * feasible. Instead we have to open code the static key functions. * * See trace_##name##_enabled(void) in include/linux/tracepoint.h */ #define page_ref_tracepoint_active(t) tracepoint_enabled(t) extern void __page_ref_set(struct page *page, int v); extern void __page_ref_mod(struct page *page, int v); extern void __page_ref_mod_and_test(struct page *page, int v, int ret); extern void __page_ref_mod_and_return(struct page *page, int v, int ret); extern void __page_ref_mod_unless(struct page *page, int v, int u); extern void __page_ref_freeze(struct page *page, int v, int ret); extern void __page_ref_unfreeze(struct page *page, int v); #else #define page_ref_tracepoint_active(t) false static inline void __page_ref_set(struct page *page, int v) { } static inline void __page_ref_mod(struct page *page, int v) { } static inline void __page_ref_mod_and_test(struct page *page, int v, int ret) { } static inline void __page_ref_mod_and_return(struct page *page, int v, int ret) { } static inline void __page_ref_mod_unless(struct page *page, int v, int u) { } static inline void __page_ref_freeze(struct page *page, int v, int ret) { } static inline void __page_ref_unfreeze(struct page *page, int v) { } #endif static inline int page_ref_count(const struct page *page) { return atomic_read(&page->_refcount); } /** * folio_ref_count - The reference count on this folio. * @folio: The folio. * * The refcount is usually incremented by calls to folio_get() and * decremented by calls to folio_put(). Some typical users of the * folio refcount: * * - Each reference from a page table * - The page cache * - Filesystem private data * - The LRU list * - Pipes * - Direct IO which references this page in the process address space * * Return: The number of references to this folio. 
*/ static inline int folio_ref_count(const struct folio *folio) { return page_ref_count(&folio->page); } static inline int page_count(const struct page *page) { return folio_ref_count(page_folio(page)); } static inline void set_page_count(struct page *page, int v) { atomic_set(&page->_refcount, v); if (page_ref_tracepoint_active(page_ref_set)) __page_ref_set(page, v); } static inline void folio_set_count(struct folio *folio, int v) { set_page_count(&folio->page, v); } /* * Setup the page count before being freed into the page allocator for * the first time (boot or memory hotplug) */ static inline void init_page_count(struct page *page) { set_page_count(page, 1); } static inline void page_ref_add(struct page *page, int nr) { atomic_add(nr, &page->_refcount); if (page_ref_tracepoint_active(page_ref_mod)) __page_ref_mod(page, nr); } static inline void folio_ref_add(struct folio *folio, int nr) { page_ref_add(&folio->page, nr); } static inline void page_ref_sub(struct page *page, int nr) { atomic_sub(nr, &page->_refcount); if (page_ref_tracepoint_active(page_ref_mod)) __page_ref_mod(page, -nr); } static inline void folio_ref_sub(struct folio *folio, int nr) { page_ref_sub(&folio->page, nr); } static inline int folio_ref_sub_return(struct folio *folio, int nr) { int ret = atomic_sub_return(nr, &folio->_refcount); if (page_ref_tracepoint_active(page_ref_mod_and_return)) __page_ref_mod_and_return(&folio->page, -nr, ret); return ret; } static inline void page_ref_inc(struct page *page) { atomic_inc(&page->_refcount); if (page_ref_tracepoint_active(page_ref_mod)) __page_ref_mod(page, 1); } static inline void folio_ref_inc(struct folio *folio) { page_ref_inc(&folio->page); } static inline void page_ref_dec(struct page *page) { atomic_dec(&page->_refcount); if (page_ref_tracepoint_active(page_ref_mod)) __page_ref_mod(page, -1); } static inline void folio_ref_dec(struct folio *folio) { page_ref_dec(&folio->page); } static inline int page_ref_sub_and_test(struct page *page, int nr) { int ret = atomic_sub_and_test(nr, &page->_refcount); if (page_ref_tracepoint_active(page_ref_mod_and_test)) __page_ref_mod_and_test(page, -nr, ret); return ret; } static inline int folio_ref_sub_and_test(struct folio *folio, int nr) { return page_ref_sub_and_test(&folio->page, nr); } static inline int page_ref_inc_return(struct page *page) { int ret = atomic_inc_return(&page->_refcount); if (page_ref_tracepoint_active(page_ref_mod_and_return)) __page_ref_mod_and_return(page, 1, ret); return ret; } static inline int folio_ref_inc_return(struct folio *folio) { return page_ref_inc_return(&folio->page); } static inline int page_ref_dec_and_test(struct page *page) { int ret = atomic_dec_and_test(&page->_refcount); if (page_ref_tracepoint_active(page_ref_mod_and_test)) __page_ref_mod_and_test(page, -1, ret); return ret; } static inline int folio_ref_dec_and_test(struct folio *folio) { return page_ref_dec_and_test(&folio->page); } static inline int page_ref_dec_return(struct page *page) { int ret = atomic_dec_return(&page->_refcount); if (page_ref_tracepoint_active(page_ref_mod_and_return)) __page_ref_mod_and_return(page, -1, ret); return ret; } static inline int folio_ref_dec_return(struct folio *folio) { return page_ref_dec_return(&folio->page); } static inline bool page_ref_add_unless_zero(struct page *page, int nr) { bool ret = atomic_add_unless(&page->_refcount, nr, 0); if (page_ref_tracepoint_active(page_ref_mod_unless)) __page_ref_mod_unless(page, nr, ret); return ret; } static inline bool folio_ref_add_unless_zero(struct 
folio *folio, int nr) { return page_ref_add_unless_zero(&folio->page, nr); } /** * folio_try_get - Attempt to increase the refcount on a folio. * @folio: The folio. * * If you do not already have a reference to a folio, you can attempt to * get one using this function. It may fail if, for example, the folio * has been freed since you found a pointer to it, or it is frozen for * the purposes of splitting or migration. * * Return: True if the reference count was successfully incremented. */ static inline bool folio_try_get(struct folio *folio) { return folio_ref_add_unless_zero(folio, 1); } static inline bool folio_ref_try_add(struct folio *folio, int count) { return folio_ref_add_unless_zero(folio, count); } static inline int page_ref_freeze(struct page *page, int count) { int ret = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count); if (page_ref_tracepoint_active(page_ref_freeze)) __page_ref_freeze(page, count, ret); return ret; } static inline int folio_ref_freeze(struct folio *folio, int count) { return page_ref_freeze(&folio->page, count); } static inline void page_ref_unfreeze(struct page *page, int count) { VM_BUG_ON_PAGE(page_count(page) != 0, page); VM_BUG_ON(count == 0); atomic_set_release(&page->_refcount, count); if (page_ref_tracepoint_active(page_ref_unfreeze)) __page_ref_unfreeze(page, count); } static inline void folio_ref_unfreeze(struct folio *folio, int count) { page_ref_unfreeze(&folio->page, count); } #endif |
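To make the freeze/unfreeze contract above concrete, here is a userspace sketch with C11 atomics: freezing succeeds only when the count is exactly the expected value (it is swapped to 0), a concurrent try-get then fails because it refuses to increment a zero count, and unfreeze republishes the count with release ordering. The names are illustrative; the kernel versions additionally emit tracepoints and assert the frozen state.

/* Userspace sketch of page_ref_freeze()/unfreeze() and add-unless-zero. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int refcount;

/* page_ref_freeze() analogue: only succeeds if the count is exactly @expected. */
static bool ref_freeze(int expected)
{
	return atomic_compare_exchange_strong(&refcount, &expected, 0);
}

/* page_ref_unfreeze() analogue: publish the new count with release semantics. */
static void ref_unfreeze(int count)
{
	atomic_store_explicit(&refcount, count, memory_order_release);
}

/* folio_try_get()/page_ref_add_unless_zero() analogue: never resurrect zero. */
static bool ref_try_get(void)
{
	int old = atomic_load(&refcount);

	do {
		if (old == 0)
			return false;
	} while (!atomic_compare_exchange_weak(&refcount, &old, old + 1));
	return true;
}

int main(void)
{
	atomic_init(&refcount, 1);

	printf("freeze(1): %d\n", ref_freeze(1));	/* 1: sole ref, freeze works */
	printf("try_get  : %d\n", ref_try_get());	/* 0: frozen, cannot take a ref */
	ref_unfreeze(1);
	printf("try_get  : %d\n", ref_try_get());	/* 1: unfrozen, count is now 2 */
	return 0;
}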
// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IPv4 Forwarding Information Base: semantics.
* * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/netlink.h> #include <linux/hash.h> #include <linux/nospec.h> #include <net/arp.h> #include <net/inet_dscp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/tcp.h> #include <net/sock.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/nexthop.h> #include <net/netlink.h> #include <net/rtnh.h> #include <net/lwtunnel.h> #include <net/fib_notifier.h> #include <net/addrconf.h> #include "fib_lookup.h" /* for_nexthops and change_nexthops only used when nexthop object * is not set in a fib_info. The logic within can reference fib_nh. */ #ifdef CONFIG_IP_ROUTE_MULTIPATH #define for_nexthops(fi) { \ int nhsel; const struct fib_nh *nh; \ for (nhsel = 0, nh = (fi)->fib_nh; \ nhsel < fib_info_num_path((fi)); \ nh++, nhsel++) #define change_nexthops(fi) { \ int nhsel; struct fib_nh *nexthop_nh; \ for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ nhsel < fib_info_num_path((fi)); \ nexthop_nh++, nhsel++) #else /* CONFIG_IP_ROUTE_MULTIPATH */ /* Hope, that gcc will optimize it to get rid of dummy loop */ #define for_nexthops(fi) { \ int nhsel; const struct fib_nh *nh = (fi)->fib_nh; \ for (nhsel = 0; nhsel < 1; nhsel++) #define change_nexthops(fi) { \ int nhsel; \ struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ for (nhsel = 0; nhsel < 1; nhsel++) #endif /* CONFIG_IP_ROUTE_MULTIPATH */ #define endfor_nexthops(fi) } const struct fib_prop fib_props[RTN_MAX + 1] = { [RTN_UNSPEC] = { .error = 0, .scope = RT_SCOPE_NOWHERE, }, [RTN_UNICAST] = { .error = 0, .scope = RT_SCOPE_UNIVERSE, }, [RTN_LOCAL] = { .error = 0, .scope = RT_SCOPE_HOST, }, [RTN_BROADCAST] = { .error = 0, .scope = RT_SCOPE_LINK, }, [RTN_ANYCAST] = { .error = 0, .scope = RT_SCOPE_LINK, }, [RTN_MULTICAST] = { .error = 0, .scope = RT_SCOPE_UNIVERSE, }, [RTN_BLACKHOLE] = { .error = -EINVAL, .scope = RT_SCOPE_UNIVERSE, }, [RTN_UNREACHABLE] = { .error = -EHOSTUNREACH, .scope = RT_SCOPE_UNIVERSE, }, [RTN_PROHIBIT] = { .error = -EACCES, .scope = RT_SCOPE_UNIVERSE, }, [RTN_THROW] = { .error = -EAGAIN, .scope = RT_SCOPE_UNIVERSE, }, [RTN_NAT] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE, }, [RTN_XRESOLVE] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE, }, }; static void rt_fibinfo_free(struct rtable __rcu **rtp) { struct rtable *rt = rcu_dereference_protected(*rtp, 1); if (!rt) return; /* Not even needed : RCU_INIT_POINTER(*rtp, NULL); * because we waited an RCU grace period before calling * free_fib_info_rcu() */ dst_dev_put(&rt->dst); dst_release_immediate(&rt->dst); } static void free_nh_exceptions(struct fib_nh_common *nhc) { struct fnhe_hash_bucket *hash; int i; hash = rcu_dereference_protected(nhc->nhc_exceptions, 1); if (!hash) return; for (i = 0; i < FNHE_HASH_SIZE; i++) { struct fib_nh_exception *fnhe; fnhe = rcu_dereference_protected(hash[i].chain, 1); while (fnhe) { struct fib_nh_exception *next; next = rcu_dereference_protected(fnhe->fnhe_next, 1); rt_fibinfo_free(&fnhe->fnhe_rth_input); 
rt_fibinfo_free(&fnhe->fnhe_rth_output); kfree(fnhe); fnhe = next; } } kfree(hash); } static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) { int cpu; if (!rtp) return; for_each_possible_cpu(cpu) { struct rtable *rt; rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1); if (rt) { dst_dev_put(&rt->dst); dst_release_immediate(&rt->dst); } } free_percpu(rtp); } void fib_nh_common_release(struct fib_nh_common *nhc) { netdev_put(nhc->nhc_dev, &nhc->nhc_dev_tracker); lwtstate_put(nhc->nhc_lwtstate); rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output); rt_fibinfo_free(&nhc->nhc_rth_input); free_nh_exceptions(nhc); } EXPORT_SYMBOL_GPL(fib_nh_common_release); void fib_nh_release(struct net *net, struct fib_nh *fib_nh) { #ifdef CONFIG_IP_ROUTE_CLASSID if (fib_nh->nh_tclassid) atomic_dec(&net->ipv4.fib_num_tclassid_users); #endif fib_nh_common_release(&fib_nh->nh_common); } /* Release a nexthop info record */ static void free_fib_info_rcu(struct rcu_head *head) { struct fib_info *fi = container_of(head, struct fib_info, rcu); if (fi->nh) { nexthop_put(fi->nh); } else { change_nexthops(fi) { fib_nh_release(fi->fib_net, nexthop_nh); } endfor_nexthops(fi); } ip_fib_metrics_put(fi->fib_metrics); kfree(fi); } void free_fib_info(struct fib_info *fi) { if (fi->fib_dead == 0) { pr_warn("Freeing alive fib_info %p\n", fi); return; } call_rcu_hurry(&fi->rcu, free_fib_info_rcu); } EXPORT_SYMBOL_GPL(free_fib_info); void fib_release_info(struct fib_info *fi) { ASSERT_RTNL(); if (fi && refcount_dec_and_test(&fi->fib_treeref)) { hlist_del(&fi->fib_hash); fi->fib_net->ipv4.fib_info_cnt--; if (fi->fib_prefsrc) hlist_del(&fi->fib_lhash); if (fi->nh) { list_del(&fi->nh_list); } else { change_nexthops(fi) { if (!nexthop_nh->fib_nh_dev) continue; hlist_del_rcu(&nexthop_nh->nh_hash); } endfor_nexthops(fi) } /* Paired with READ_ONCE() from fib_table_lookup() */ WRITE_ONCE(fi->fib_dead, 1); fib_info_put(fi); } } static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi) { const struct fib_nh *onh; if (fi->nh || ofi->nh) return nexthop_cmp(fi->nh, ofi->nh) ? 
0 : -1; if (ofi->fib_nhs == 0) return 0; for_nexthops(fi) { onh = fib_info_nh(ofi, nhsel); if (nh->fib_nh_oif != onh->fib_nh_oif || nh->fib_nh_gw_family != onh->fib_nh_gw_family || nh->fib_nh_scope != onh->fib_nh_scope || #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->fib_nh_weight != onh->fib_nh_weight || #endif #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid != onh->nh_tclassid || #endif lwtunnel_cmp_encap(nh->fib_nh_lws, onh->fib_nh_lws) || ((nh->fib_nh_flags ^ onh->fib_nh_flags) & ~RTNH_COMPARE_MASK)) return -1; if (nh->fib_nh_gw_family == AF_INET && nh->fib_nh_gw4 != onh->fib_nh_gw4) return -1; if (nh->fib_nh_gw_family == AF_INET6 && ipv6_addr_cmp(&nh->fib_nh_gw6, &onh->fib_nh_gw6)) return -1; } endfor_nexthops(fi); return 0; } static struct hlist_head *fib_nh_head(struct net_device *dev) { return &dev->fib_nh_head; } static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope, u32 prefsrc, u32 priority) { unsigned int val = init_val; val ^= (protocol << 8) | scope; val ^= prefsrc; val ^= priority; return val; } static unsigned int fib_info_hashfn_result(const struct net *net, unsigned int val) { return hash_32(val ^ net_hash_mix(net), net->ipv4.fib_info_hash_bits); } static struct hlist_head *fib_info_hash_bucket(struct fib_info *fi) { struct net *net = fi->fib_net; unsigned int val; val = fib_info_hashfn_1(fi->fib_nhs, fi->fib_protocol, fi->fib_scope, (__force u32)fi->fib_prefsrc, fi->fib_priority); if (fi->nh) { val ^= fi->nh->id; } else { for_nexthops(fi) { val ^= nh->fib_nh_oif; } endfor_nexthops(fi) } return &net->ipv4.fib_info_hash[fib_info_hashfn_result(net, val)]; } static struct hlist_head *fib_info_laddrhash_bucket(const struct net *net, __be32 val) { unsigned int hash_bits = net->ipv4.fib_info_hash_bits; u32 slot; slot = hash_32(net_hash_mix(net) ^ (__force u32)val, hash_bits); return &net->ipv4.fib_info_hash[(1 << hash_bits) + slot]; } static struct hlist_head *fib_info_hash_alloc(unsigned int hash_bits) { /* The second half is used for prefsrc */ return kvzalloc_objs(struct hlist_head, (1 << hash_bits) * 2); } static void fib_info_hash_free(struct hlist_head *head) { kvfree(head); } static void fib_info_hash_grow(struct net *net) { unsigned int old_size = 1 << net->ipv4.fib_info_hash_bits; struct hlist_head *new_info_hash, *old_info_hash; unsigned int i; if (net->ipv4.fib_info_cnt < old_size) return; new_info_hash = fib_info_hash_alloc(net->ipv4.fib_info_hash_bits + 1); if (!new_info_hash) return; old_info_hash = net->ipv4.fib_info_hash; net->ipv4.fib_info_hash = new_info_hash; net->ipv4.fib_info_hash_bits += 1; for (i = 0; i < old_size; i++) { struct hlist_head *head = &old_info_hash[i]; struct hlist_node *n; struct fib_info *fi; hlist_for_each_entry_safe(fi, n, head, fib_hash) hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi)); } for (i = 0; i < old_size; i++) { struct hlist_head *lhead = &old_info_hash[old_size + i]; struct hlist_node *n; struct fib_info *fi; hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) hlist_add_head(&fi->fib_lhash, fib_info_laddrhash_bucket(fi->fib_net, fi->fib_prefsrc)); } fib_info_hash_free(old_info_hash); } /* no metrics, only nexthop id */ static struct fib_info *fib_find_info_nh(struct net *net, const struct fib_config *cfg) { struct hlist_head *head; struct fib_info *fi; unsigned int hash; hash = fib_info_hashfn_1(cfg->fc_nh_id, cfg->fc_protocol, cfg->fc_scope, (__force u32)cfg->fc_prefsrc, cfg->fc_priority); hash = fib_info_hashfn_result(net, hash); head = &net->ipv4.fib_info_hash[hash]; hlist_for_each_entry(fi, head, fib_hash) 
{ if (!fi->nh || fi->nh->id != cfg->fc_nh_id) continue; if (cfg->fc_protocol == fi->fib_protocol && cfg->fc_scope == fi->fib_scope && cfg->fc_prefsrc == fi->fib_prefsrc && cfg->fc_priority == fi->fib_priority && cfg->fc_type == fi->fib_type && cfg->fc_table == fi->fib_tb_id && !((cfg->fc_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK)) return fi; } return NULL; } static struct fib_info *fib_find_info(struct fib_info *nfi) { struct hlist_head *head = fib_info_hash_bucket(nfi); struct fib_info *fi; hlist_for_each_entry(fi, head, fib_hash) { if (fi->fib_nhs != nfi->fib_nhs) continue; if (nfi->fib_protocol == fi->fib_protocol && nfi->fib_scope == fi->fib_scope && nfi->fib_prefsrc == fi->fib_prefsrc && nfi->fib_priority == fi->fib_priority && nfi->fib_type == fi->fib_type && nfi->fib_tb_id == fi->fib_tb_id && memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(u32) * RTAX_MAX) == 0 && !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) && nh_comp(fi, nfi) == 0) return fi; } return NULL; } /* Check, that the gateway is already configured. * Used only by redirect accept routine, under rcu_read_lock(); */ int ip_fib_check_default(__be32 gw, struct net_device *dev) { struct hlist_head *head; struct fib_nh *nh; head = fib_nh_head(dev); hlist_for_each_entry_rcu(nh, head, nh_hash) { DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev); if (nh->fib_nh_gw4 == gw && !(nh->fib_nh_flags & RTNH_F_DEAD)) { return 0; } } return -1; } size_t fib_nlmsg_size(struct fib_info *fi) { size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(4) /* RTA_TABLE */ + nla_total_size(4) /* RTA_DST */ + nla_total_size(4) /* RTA_PRIORITY */ + nla_total_size(4) /* RTA_PREFSRC */ + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */ unsigned int nhs = fib_info_num_path(fi); /* space for nested metrics */ payload += nla_total_size((RTAX_MAX * nla_total_size(4))); if (fi->nh) payload += nla_total_size(4); /* RTA_NH_ID */ if (nhs) { size_t nh_encapsize = 0; /* Also handles the special case nhs == 1 */ /* each nexthop is packed in an attribute */ size_t nhsize = nla_total_size(sizeof(struct rtnexthop)); unsigned int i; /* may contain flow and gateway attribute */ nhsize += 2 * nla_total_size(4); /* grab encap info */ for (i = 0; i < fib_info_num_path(fi); i++) { struct fib_nh_common *nhc = fib_info_nhc(fi, i); if (nhc->nhc_lwtstate) { /* RTA_ENCAP_TYPE */ nh_encapsize += lwtunnel_get_encap_size( nhc->nhc_lwtstate); /* RTA_ENCAP */ nh_encapsize += nla_total_size(2); } } /* all nexthops are packed in a nested attribute */ payload += nla_total_size((nhs * nhsize) + nh_encapsize); } return payload; } void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, u32 tb_id, const struct nl_info *info, unsigned int nlm_flags) { struct fib_rt_info fri; struct sk_buff *skb; u32 seq = info->nlh ? 
info->nlh->nlmsg_seq : 0; int err = -ENOBUFS; skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL); if (!skb) goto errout; fri.fi = fa->fa_info; fri.tb_id = tb_id; fri.dst = key; fri.dst_len = dst_len; fri.dscp = fa->fa_dscp; fri.type = fa->fa_type; fri.offload = READ_ONCE(fa->offload); fri.trap = READ_ONCE(fa->trap); fri.offload_failed = READ_ONCE(fa->offload_failed); err = fib_dump_info(skb, info->portid, seq, event, &fri, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE, info->nlh, GFP_KERNEL); return; errout: rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err); } static int fib_detect_death(struct fib_info *fi, int order, struct fib_info **last_resort, int *last_idx, int dflt) { const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); struct neighbour *n; int state = NUD_NONE; if (likely(nhc->nhc_gw_family == AF_INET)) n = neigh_lookup(&arp_tbl, &nhc->nhc_gw.ipv4, nhc->nhc_dev); else if (IS_ENABLED(CONFIG_IPV6) && nhc->nhc_gw_family == AF_INET6) n = neigh_lookup(&nd_tbl, &nhc->nhc_gw.ipv6, nhc->nhc_dev); else n = NULL; if (n) { state = READ_ONCE(n->nud_state); neigh_release(n); } else { return 0; } if (state == NUD_REACHABLE) return 0; if ((state & NUD_VALID) && order != dflt) return 0; if ((state & NUD_VALID) || (*last_idx < 0 && order > dflt && state != NUD_INCOMPLETE)) { *last_resort = fi; *last_idx = order; } return 1; } int fib_nh_common_init(struct net *net, struct fib_nh_common *nhc, struct nlattr *encap, u16 encap_type, void *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { int err; nhc->nhc_pcpu_rth_output = alloc_percpu_gfp(struct rtable __rcu *, gfp_flags); if (!nhc->nhc_pcpu_rth_output) return -ENOMEM; if (encap) { struct lwtunnel_state *lwtstate; err = lwtunnel_build_state(net, encap_type, encap, nhc->nhc_family, cfg, &lwtstate, extack); if (err) goto lwt_failure; nhc->nhc_lwtstate = lwtstate_get(lwtstate); } return 0; lwt_failure: rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output); nhc->nhc_pcpu_rth_output = NULL; return err; } EXPORT_SYMBOL_GPL(fib_nh_common_init); int fib_nh_init(struct net *net, struct fib_nh *nh, struct fib_config *cfg, int nh_weight, struct netlink_ext_ack *extack) { int err; nh->fib_nh_family = AF_INET; err = fib_nh_common_init(net, &nh->nh_common, cfg->fc_encap, cfg->fc_encap_type, cfg, GFP_KERNEL, extack); if (err) return err; nh->fib_nh_oif = cfg->fc_oif; nh->fib_nh_gw_family = cfg->fc_gw_family; if (cfg->fc_gw_family == AF_INET) nh->fib_nh_gw4 = cfg->fc_gw4; else if (cfg->fc_gw_family == AF_INET6) nh->fib_nh_gw6 = cfg->fc_gw6; nh->fib_nh_flags = cfg->fc_flags; #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid = cfg->fc_flow; if (nh->nh_tclassid) atomic_inc(&net->ipv4.fib_num_tclassid_users); #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->fib_nh_weight = nh_weight; #endif return 0; } #ifdef CONFIG_IP_ROUTE_MULTIPATH static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining, struct netlink_ext_ack *extack) { int nhs = 0; while (rtnh_ok(rtnh, remaining)) { nhs++; rtnh = rtnh_next(rtnh, &remaining); } /* leftover implies invalid nexthop configuration, discard it */ if (remaining > 0) { NL_SET_ERR_MSG(extack, "Invalid nexthop configuration - extra data after nexthops"); nhs = 0; } return nhs; } static int fib_gw_from_attr(__be32 *gw, struct nlattr *nla, struct netlink_ext_ack *extack) { if (nla_len(nla) < sizeof(*gw)) { NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_GATEWAY"); return 
-EINVAL; } *gw = nla_get_in_addr(nla); return 0; } /* only called when fib_nh is integrated into fib_info */ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, int remaining, struct fib_config *cfg, struct netlink_ext_ack *extack) { struct net *net = fi->fib_net; struct fib_config fib_cfg; struct fib_nh *nh; int ret; change_nexthops(fi) { int attrlen; memset(&fib_cfg, 0, sizeof(fib_cfg)); if (!rtnh_ok(rtnh, remaining)) { NL_SET_ERR_MSG(extack, "Invalid nexthop configuration - extra data after nexthop"); return -EINVAL; } if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) { NL_SET_ERR_MSG(extack, "Invalid flags for nexthop - can not contain DEAD or LINKDOWN"); return -EINVAL; } fib_cfg.fc_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; fib_cfg.fc_oif = rtnh->rtnh_ifindex; attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); nlav = nla_find(attrs, attrlen, RTA_VIA); if (nla && nlav) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); return -EINVAL; } if (nla) { ret = fib_gw_from_attr(&fib_cfg.fc_gw4, nla, extack); if (ret) goto errout; if (fib_cfg.fc_gw4) fib_cfg.fc_gw_family = AF_INET; } else if (nlav) { ret = fib_gw_from_via(&fib_cfg, nlav, extack); if (ret) goto errout; } nla = nla_find(attrs, attrlen, RTA_FLOW); if (nla) { if (nla_len(nla) < sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid RTA_FLOW"); return -EINVAL; } fib_cfg.fc_flow = nla_get_u32(nla); } fib_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); /* RTA_ENCAP_TYPE length checked in * lwtunnel_valid_encap_type_attr */ nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); if (nla) fib_cfg.fc_encap_type = nla_get_u16(nla); } ret = fib_nh_init(net, nexthop_nh, &fib_cfg, rtnh->rtnh_hops + 1, extack); if (ret) goto errout; rtnh = rtnh_next(rtnh, &remaining); } endfor_nexthops(fi); ret = -EINVAL; nh = fib_info_nh(fi, 0); if (cfg->fc_oif && nh->fib_nh_oif != cfg->fc_oif) { NL_SET_ERR_MSG(extack, "Nexthop device index does not match RTA_OIF"); goto errout; } if (cfg->fc_gw_family) { if (cfg->fc_gw_family != nh->fib_nh_gw_family || (cfg->fc_gw_family == AF_INET && nh->fib_nh_gw4 != cfg->fc_gw4) || (cfg->fc_gw_family == AF_INET6 && ipv6_addr_cmp(&nh->fib_nh_gw6, &cfg->fc_gw6))) { NL_SET_ERR_MSG(extack, "Nexthop gateway does not match RTA_GATEWAY or RTA_VIA"); goto errout; } } #ifdef CONFIG_IP_ROUTE_CLASSID if (cfg->fc_flow && nh->nh_tclassid != cfg->fc_flow) { NL_SET_ERR_MSG(extack, "Nexthop class id does not match RTA_FLOW"); goto errout; } #endif ret = 0; errout: return ret; } /* only called when fib_nh is integrated into fib_info */ static void fib_rebalance(struct fib_info *fi) { int total; int w; if (fib_info_num_path(fi) < 2) return; total = 0; for_nexthops(fi) { if (nh->fib_nh_flags & RTNH_F_DEAD) continue; if (ip_ignore_linkdown(nh->fib_nh_dev) && nh->fib_nh_flags & RTNH_F_LINKDOWN) continue; total += nh->fib_nh_weight; } endfor_nexthops(fi); w = 0; change_nexthops(fi) { int upper_bound; if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD) { upper_bound = -1; } else if (ip_ignore_linkdown(nexthop_nh->fib_nh_dev) && nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN) { upper_bound = -1; } else { w += nexthop_nh->fib_nh_weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; } atomic_set(&nexthop_nh->fib_nh_upper_bound, upper_bound); } endfor_nexthops(fi); } #else /* CONFIG_IP_ROUTE_MULTIPATH */ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, int remaining, struct 
fib_config *cfg, struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "Multipath support not enabled in kernel"); return -EINVAL; } #define fib_rebalance(fi) do { } while (0) #endif /* CONFIG_IP_ROUTE_MULTIPATH */ static int fib_encap_match(struct net *net, u16 encap_type, struct nlattr *encap, const struct fib_nh *nh, const struct fib_config *cfg, struct netlink_ext_ack *extack) { struct lwtunnel_state *lwtstate; int ret, result = 0; if (encap_type == LWTUNNEL_ENCAP_NONE) return 0; ret = lwtunnel_build_state(net, encap_type, encap, AF_INET, cfg, &lwtstate, extack); if (!ret) { result = lwtunnel_cmp_encap(lwtstate, nh->fib_nh_lws); lwtstate_free(lwtstate); } return result; } int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi, struct netlink_ext_ack *extack) { #ifdef CONFIG_IP_ROUTE_MULTIPATH struct rtnexthop *rtnh; int remaining; #endif if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority) return 1; if (cfg->fc_nh_id) { if (fi->nh && cfg->fc_nh_id == fi->nh->id) return 0; return 1; } if (fi->nh) { if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_mp) return 1; return 0; } if (cfg->fc_oif || cfg->fc_gw_family) { struct fib_nh *nh; nh = fib_info_nh(fi, 0); if (cfg->fc_encap) { if (fib_encap_match(net, cfg->fc_encap_type, cfg->fc_encap, nh, cfg, extack)) return 1; } #ifdef CONFIG_IP_ROUTE_CLASSID if (cfg->fc_flow && cfg->fc_flow != nh->nh_tclassid) return 1; #endif if ((cfg->fc_oif && cfg->fc_oif != nh->fib_nh_oif) || (cfg->fc_gw_family && cfg->fc_gw_family != nh->fib_nh_gw_family)) return 1; if (cfg->fc_gw_family == AF_INET && cfg->fc_gw4 != nh->fib_nh_gw4) return 1; if (cfg->fc_gw_family == AF_INET6 && ipv6_addr_cmp(&cfg->fc_gw6, &nh->fib_nh_gw6)) return 1; return 0; } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (!cfg->fc_mp) return 0; rtnh = cfg->fc_mp; remaining = cfg->fc_mp_len; for_nexthops(fi) { int attrlen; if (!rtnh_ok(rtnh, remaining)) return -EINVAL; if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->fib_nh_oif) return 1; attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh); int err; nla = nla_find(attrs, attrlen, RTA_GATEWAY); nlav = nla_find(attrs, attrlen, RTA_VIA); if (nla && nlav) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); return -EINVAL; } if (nla) { __be32 gw; err = fib_gw_from_attr(&gw, nla, extack); if (err) return err; if (nh->fib_nh_gw_family != AF_INET || gw != nh->fib_nh_gw4) return 1; } else if (nlav) { struct fib_config cfg2; err = fib_gw_from_via(&cfg2, nlav, extack); if (err) return err; switch (nh->fib_nh_gw_family) { case AF_INET: if (cfg2.fc_gw_family != AF_INET || cfg2.fc_gw4 != nh->fib_nh_gw4) return 1; break; case AF_INET6: if (cfg2.fc_gw_family != AF_INET6 || ipv6_addr_cmp(&cfg2.fc_gw6, &nh->fib_nh_gw6)) return 1; break; } } #ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); if (nla) { if (nla_len(nla) < sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid RTA_FLOW"); return -EINVAL; } if (nla_get_u32(nla) != nh->nh_tclassid) return 1; } #endif } rtnh = rtnh_next(rtnh, &remaining); } endfor_nexthops(fi); #endif return 0; } bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) { struct nlattr *nla; int remaining; if (!cfg->fc_mx) return true; nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { int type = nla_type(nla); u32 fi_val, val; if (!type) continue; if (type > RTAX_MAX) return false; type = array_index_nospec(type, RTAX_MAX + 1); if (type == RTAX_CC_ALGO) { char tmp[TCP_CA_NAME_MAX]; bool 
ecn_ca = false; nla_strscpy(tmp, nla, sizeof(tmp)); val = tcp_ca_get_key_by_name(tmp, &ecn_ca); } else { if (nla_len(nla) != sizeof(u32)) return false; val = nla_get_u32(nla); } fi_val = fi->fib_metrics->metrics[type - 1]; if (type == RTAX_FEATURES) fi_val &= ~DST_FEATURE_ECN_CA; if (fi_val != val) return false; } return true; } static int fib_check_nh_v6_gw(struct net *net, struct fib_nh *nh, u32 table, struct netlink_ext_ack *extack) { struct fib6_config cfg = { .fc_table = table, .fc_flags = nh->fib_nh_flags | RTF_GATEWAY, .fc_ifindex = nh->fib_nh_oif, .fc_gateway = nh->fib_nh_gw6, }; struct fib6_nh fib6_nh = {}; int err; err = fib6_nh_init(net, &fib6_nh, &cfg, GFP_KERNEL, extack); if (!err) { nh->fib_nh_dev = fib6_nh.fib_nh_dev; netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_KERNEL); nh->fib_nh_oif = nh->fib_nh_dev->ifindex; nh->fib_nh_scope = RT_SCOPE_LINK; fib6_nh_release(&fib6_nh); } return err; } /* * Picture * ------- * * Semantics of nexthop is very messy by historical reasons. * We have to take into account, that: * a) gateway can be actually local interface address, * so that gatewayed route is direct. * b) gateway must be on-link address, possibly * described not by an ifaddr, but also by a direct route. * c) If both gateway and interface are specified, they should not * contradict. * d) If we use tunnel routes, gateway could be not on-link. * * Attempt to reconcile all of these (alas, self-contradictory) conditions * results in pretty ugly and hairy code with obscure logic. * * I chose to generalized it instead, so that the size * of code does not increase practically, but it becomes * much more general. * Every prefix is assigned a "scope" value: "host" is local address, * "link" is direct route, * [ ... "site" ... "interior" ... ] * and "universe" is true gateway route with global meaning. * * Every prefix refers to a set of "nexthop"s (gw, oif), * where gw must have narrower scope. This recursion stops * when gw has LOCAL scope or if "nexthop" is declared ONLINK, * which means that gw is forced to be on link. * * Code is still hairy, but now it is apparently logically * consistent and very flexible. F.e. as by-product it allows * to co-exists in peace independent exterior and interior * routing processes. * * Normally it looks as following. 
* * {universe prefix} -> (gw, oif) [scope link] * | * |-> {link prefix} -> (gw, oif) [scope local] * | * |-> {local prefix} (terminal node) */ static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table, u8 scope, struct netlink_ext_ack *extack) { struct net_device *dev; struct fib_result res; int err = 0; if (nh->fib_nh_flags & RTNH_F_ONLINK) { unsigned int addr_type; if (scope >= RT_SCOPE_LINK) { NL_SET_ERR_MSG(extack, "Nexthop has invalid scope"); return -EINVAL; } dev = __dev_get_by_index(net, nh->fib_nh_oif); if (!dev) { NL_SET_ERR_MSG(extack, "Nexthop device required for onlink"); return -ENODEV; } if (!(dev->flags & IFF_UP)) { NL_SET_ERR_MSG(extack, "Nexthop device is not up"); return -ENETDOWN; } addr_type = inet_addr_type_dev_table(net, dev, nh->fib_nh_gw4); if (addr_type != RTN_UNICAST) { NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); return -EINVAL; } if (!netif_carrier_ok(dev)) nh->fib_nh_flags |= RTNH_F_LINKDOWN; nh->fib_nh_dev = dev; netdev_hold(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC); nh->fib_nh_scope = RT_SCOPE_LINK; return 0; } rcu_read_lock(); { struct fib_table *tbl = NULL; struct flowi4 fl4 = { .daddr = nh->fib_nh_gw4, .flowi4_scope = scope + 1, .flowi4_oif = nh->fib_nh_oif, .flowi4_iif = LOOPBACK_IFINDEX, }; /* It is not necessary, but requires a bit of thinking */ if (fl4.flowi4_scope < RT_SCOPE_LINK) fl4.flowi4_scope = RT_SCOPE_LINK; if (table && table != RT_TABLE_MAIN) tbl = fib_get_table(net, table); if (tbl) err = fib_table_lookup(tbl, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE | FIB_LOOKUP_NOREF); /* on error or if no table given do full lookup. This * is needed for example when nexthops are in the local * table rather than the given table */ if (!tbl || err) { err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE); } if (err) { NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); goto out; } } err = -EINVAL; if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) { NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); goto out; } nh->fib_nh_scope = res.scope; nh->fib_nh_oif = FIB_RES_OIF(res); nh->fib_nh_dev = dev = FIB_RES_DEV(res); if (!dev) { NL_SET_ERR_MSG(extack, "No egress device for nexthop gateway"); goto out; } netdev_hold(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC); if (!netif_carrier_ok(dev)) nh->fib_nh_flags |= RTNH_F_LINKDOWN; err = (dev->flags & IFF_UP) ? 
0 : -ENETDOWN; out: rcu_read_unlock(); return err; } static int fib_check_nh_nongw(struct net *net, struct fib_nh *nh, struct netlink_ext_ack *extack) { struct in_device *in_dev; int err; if (nh->fib_nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) { NL_SET_ERR_MSG(extack, "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set"); return -EINVAL; } rcu_read_lock(); err = -ENODEV; in_dev = inetdev_by_index(net, nh->fib_nh_oif); if (!in_dev) goto out; err = -ENETDOWN; if (!(in_dev->dev->flags & IFF_UP)) { NL_SET_ERR_MSG(extack, "Device for nexthop is not up"); goto out; } nh->fib_nh_dev = in_dev->dev; netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC); nh->fib_nh_scope = RT_SCOPE_HOST; if (!netif_carrier_ok(nh->fib_nh_dev)) nh->fib_nh_flags |= RTNH_F_LINKDOWN; err = 0; out: rcu_read_unlock(); return err; } int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, struct netlink_ext_ack *extack) { int err; if (nh->fib_nh_gw_family == AF_INET) err = fib_check_nh_v4_gw(net, nh, table, scope, extack); else if (nh->fib_nh_gw_family == AF_INET6) err = fib_check_nh_v6_gw(net, nh, table, extack); else err = fib_check_nh_nongw(net, nh, extack); return err; } __be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc, unsigned char scope) { struct fib_nh *nh; __be32 saddr; if (nhc->nhc_family != AF_INET) return inet_select_addr(nhc->nhc_dev, 0, scope); nh = container_of(nhc, struct fib_nh, nh_common); saddr = inet_select_addr(nh->fib_nh_dev, nh->fib_nh_gw4, scope); WRITE_ONCE(nh->nh_saddr, saddr); WRITE_ONCE(nh->nh_saddr_genid, atomic_read(&net->ipv4.dev_addr_genid)); return saddr; } __be32 fib_result_prefsrc(struct net *net, struct fib_result *res) { struct fib_nh_common *nhc = res->nhc; if (res->fi->fib_prefsrc) return res->fi->fib_prefsrc; if (nhc->nhc_family == AF_INET) { struct fib_nh *nh; nh = container_of(nhc, struct fib_nh, nh_common); if (READ_ONCE(nh->nh_saddr_genid) == atomic_read(&net->ipv4.dev_addr_genid)) return READ_ONCE(nh->nh_saddr); } return fib_info_update_nhc_saddr(net, nhc, res->fi->fib_scope); } static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) { if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst || fib_prefsrc != cfg->fc_dst) { u32 tb_id = cfg->fc_table; int rc; if (tb_id == RT_TABLE_MAIN) tb_id = RT_TABLE_LOCAL; rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net, fib_prefsrc, tb_id); if (rc != RTN_LOCAL && tb_id != RT_TABLE_LOCAL) { rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net, fib_prefsrc, RT_TABLE_LOCAL); } if (rc != RTN_LOCAL) return false; } return true; } struct fib_info *fib_create_info(struct fib_config *cfg, struct netlink_ext_ack *extack) { int err; struct fib_info *fi = NULL; struct nexthop *nh = NULL; struct fib_info *ofi; int nhs = 1; struct net *net = cfg->fc_nlinfo.nl_net; ASSERT_RTNL(); if (cfg->fc_type > RTN_MAX) goto err_inval; /* Fast check to catch the most weird cases */ if (fib_props[cfg->fc_type].scope > cfg->fc_scope) { NL_SET_ERR_MSG(extack, "Invalid scope"); goto err_inval; } if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) { NL_SET_ERR_MSG(extack, "Invalid rtm_flags - can not contain DEAD or LINKDOWN"); goto err_inval; } if (cfg->fc_nh_id) { if (!cfg->fc_mx) { fi = fib_find_info_nh(net, cfg); if (fi) { refcount_inc(&fi->fib_treeref); return fi; } } nh = nexthop_find_by_id(net, cfg->fc_nh_id); if (!nh) { NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); goto err_inval; } nhs = 0; } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (cfg->fc_mp) { nhs = fib_count_nexthops(cfg->fc_mp, 
cfg->fc_mp_len, extack); if (nhs == 0) goto err_inval; } #endif fib_info_hash_grow(net); fi = kzalloc_flex(*fi, fib_nh, nhs); if (!fi) { err = -ENOBUFS; goto failure; } fi->fib_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack); if (IS_ERR(fi->fib_metrics)) { err = PTR_ERR(fi->fib_metrics); kfree(fi); return ERR_PTR(err); } fi->fib_net = net; fi->fib_protocol = cfg->fc_protocol; fi->fib_scope = cfg->fc_scope; fi->fib_flags = cfg->fc_flags; fi->fib_priority = cfg->fc_priority; fi->fib_prefsrc = cfg->fc_prefsrc; fi->fib_type = cfg->fc_type; fi->fib_tb_id = cfg->fc_table; fi->fib_nhs = nhs; if (nh) { if (!nexthop_get(nh)) { NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); err = -EINVAL; } else { err = 0; fi->nh = nh; } } else { change_nexthops(fi) { nexthop_nh->nh_parent = fi; } endfor_nexthops(fi) if (cfg->fc_mp) err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack); else err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack); } if (err != 0) goto failure; if (fib_props[cfg->fc_type].error) { if (cfg->fc_gw_family || cfg->fc_oif || cfg->fc_mp) { NL_SET_ERR_MSG(extack, "Gateway, device and multipath can not be specified for this route type"); goto err_inval; } goto link_it; } else { switch (cfg->fc_type) { case RTN_UNICAST: case RTN_LOCAL: case RTN_BROADCAST: case RTN_ANYCAST: case RTN_MULTICAST: break; default: NL_SET_ERR_MSG(extack, "Invalid route type"); goto err_inval; } } if (cfg->fc_scope > RT_SCOPE_HOST) { NL_SET_ERR_MSG(extack, "Invalid scope"); goto err_inval; } if (fi->nh) { err = fib_check_nexthop(fi->nh, cfg->fc_scope, extack); if (err) goto failure; } else if (cfg->fc_scope == RT_SCOPE_HOST) { struct fib_nh *nh = fi->fib_nh; /* Local address is added. */ if (nhs != 1) { NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops"); goto err_inval; } if (nh->fib_nh_gw_family) { NL_SET_ERR_MSG(extack, "Route with host scope can not have a gateway"); goto err_inval; } nh->fib_nh_scope = RT_SCOPE_NOWHERE; nh->fib_nh_dev = dev_get_by_index(net, nh->fib_nh_oif); err = -ENODEV; if (!nh->fib_nh_dev) goto failure; netdev_tracker_alloc(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_KERNEL); } else { int linkdown = 0; change_nexthops(fi) { err = fib_check_nh(cfg->fc_nlinfo.nl_net, nexthop_nh, cfg->fc_table, cfg->fc_scope, extack); if (err != 0) goto failure; if (nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN) linkdown++; } endfor_nexthops(fi) if (linkdown == fi->fib_nhs) fi->fib_flags |= RTNH_F_LINKDOWN; } if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc)) { NL_SET_ERR_MSG(extack, "Invalid prefsrc address"); goto err_inval; } if (!fi->nh) { change_nexthops(fi) { fib_info_update_nhc_saddr(net, &nexthop_nh->nh_common, fi->fib_scope); if (nexthop_nh->fib_nh_gw_family == AF_INET6) fi->fib_nh_is_v6 = true; } endfor_nexthops(fi) fib_rebalance(fi); } link_it: ofi = fib_find_info(fi); if (ofi) { /* fib_table_lookup() should not see @fi yet. 
*/ fi->fib_dead = 1; free_fib_info(fi); refcount_inc(&ofi->fib_treeref); return ofi; } refcount_set(&fi->fib_treeref, 1); refcount_set(&fi->fib_clntref, 1); net->ipv4.fib_info_cnt++; hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi)); if (fi->fib_prefsrc) { struct hlist_head *head; head = fib_info_laddrhash_bucket(net, fi->fib_prefsrc); hlist_add_head(&fi->fib_lhash, head); } if (fi->nh) { list_add(&fi->nh_list, &nh->fi_list); } else { change_nexthops(fi) { struct hlist_head *head; if (!nexthop_nh->fib_nh_dev) continue; head = fib_nh_head(nexthop_nh->fib_nh_dev); hlist_add_head_rcu(&nexthop_nh->nh_hash, head); } endfor_nexthops(fi) } return fi; err_inval: err = -EINVAL; failure: if (fi) { /* fib_table_lookup() should not see @fi yet. */ fi->fib_dead = 1; free_fib_info(fi); } return ERR_PTR(err); } int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc, u8 rt_family, unsigned char *flags, bool skip_oif) { if (nhc->nhc_flags & RTNH_F_DEAD) *flags |= RTNH_F_DEAD; if (nhc->nhc_flags & RTNH_F_LINKDOWN) { *flags |= RTNH_F_LINKDOWN; rcu_read_lock(); switch (nhc->nhc_family) { case AF_INET: if (ip_ignore_linkdown(nhc->nhc_dev)) *flags |= RTNH_F_DEAD; break; case AF_INET6: if (ip6_ignore_linkdown(nhc->nhc_dev)) *flags |= RTNH_F_DEAD; break; } rcu_read_unlock(); } switch (nhc->nhc_gw_family) { case AF_INET: if (nla_put_in_addr(skb, RTA_GATEWAY, nhc->nhc_gw.ipv4)) goto nla_put_failure; break; case AF_INET6: /* if gateway family does not match nexthop family * gateway is encoded as RTA_VIA */ if (rt_family != nhc->nhc_gw_family) { int alen = sizeof(struct in6_addr); struct nlattr *nla; struct rtvia *via; nla = nla_reserve(skb, RTA_VIA, alen + 2); if (!nla) goto nla_put_failure; via = nla_data(nla); via->rtvia_family = AF_INET6; memcpy(via->rtvia_addr, &nhc->nhc_gw.ipv6, alen); } else if (nla_put_in6_addr(skb, RTA_GATEWAY, &nhc->nhc_gw.ipv6) < 0) { goto nla_put_failure; } break; } *flags |= (nhc->nhc_flags & (RTNH_F_ONLINK | RTNH_F_OFFLOAD | RTNH_F_TRAP)); if (!skip_oif && nhc->nhc_dev && nla_put_u32(skb, RTA_OIF, nhc->nhc_dev->ifindex)) goto nla_put_failure; if (lwtunnel_fill_encap(skb, nhc->nhc_lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } EXPORT_SYMBOL_GPL(fib_nexthop_info); #if IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) || IS_ENABLED(CONFIG_IPV6) int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc, int nh_weight, u8 rt_family, u32 nh_tclassid) { const struct net_device *dev = nhc->nhc_dev; struct rtnexthop *rtnh; unsigned char flags = 0; rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); if (!rtnh) goto nla_put_failure; rtnh->rtnh_hops = nh_weight - 1; rtnh->rtnh_ifindex = dev ? 
dev->ifindex : 0; if (fib_nexthop_info(skb, nhc, rt_family, &flags, true) < 0) goto nla_put_failure; rtnh->rtnh_flags = flags; if (nh_tclassid && nla_put_u32(skb, RTA_FLOW, nh_tclassid)) goto nla_put_failure; /* length of rtnetlink header + attributes */ rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; return 0; nla_put_failure: return -EMSGSIZE; } EXPORT_SYMBOL_GPL(fib_add_nexthop); #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) { struct nlattr *mp; mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); if (!mp) goto nla_put_failure; if (unlikely(fi->nh)) { if (nexthop_mpath_fill_node(skb, fi->nh, AF_INET) < 0) goto nla_put_failure; goto mp_end; } for_nexthops(fi) { u32 nh_tclassid = 0; #ifdef CONFIG_IP_ROUTE_CLASSID nh_tclassid = nh->nh_tclassid; #endif if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight, AF_INET, nh_tclassid) < 0) goto nla_put_failure; } endfor_nexthops(fi); mp_end: nla_nest_end(skb, mp); return 0; nla_put_failure: return -EMSGSIZE; } #else static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) { return 0; } #endif int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, const struct fib_rt_info *fri, unsigned int flags) { unsigned int nhs = fib_info_num_path(fri->fi); struct fib_info *fi = fri->fi; u32 tb_id = fri->tb_id; struct nlmsghdr *nlh; struct rtmsg *rtm; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags); if (!nlh) return -EMSGSIZE; rtm = nlmsg_data(nlh); rtm->rtm_family = AF_INET; rtm->rtm_dst_len = fri->dst_len; rtm->rtm_src_len = 0; rtm->rtm_tos = inet_dscp_to_dsfield(fri->dscp); if (tb_id < 256) rtm->rtm_table = tb_id; else rtm->rtm_table = RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, tb_id)) goto nla_put_failure; rtm->rtm_type = fri->type; rtm->rtm_flags = fi->fib_flags; rtm->rtm_scope = fi->fib_scope; rtm->rtm_protocol = fi->fib_protocol; if (rtm->rtm_dst_len && nla_put_in_addr(skb, RTA_DST, fri->dst)) goto nla_put_failure; if (fi->fib_priority && nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority)) goto nla_put_failure; if (rtnetlink_put_metrics(skb, fi->fib_metrics->metrics) < 0) goto nla_put_failure; if (fi->fib_prefsrc && nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc)) goto nla_put_failure; if (fi->nh) { if (nla_put_u32(skb, RTA_NH_ID, fi->nh->id)) goto nla_put_failure; if (nexthop_is_blackhole(fi->nh)) rtm->rtm_type = RTN_BLACKHOLE; if (!READ_ONCE(fi->fib_net->ipv4.sysctl_nexthop_compat_mode)) goto offload; } if (nhs == 1) { const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); unsigned char flags = 0; if (fib_nexthop_info(skb, nhc, AF_INET, &flags, false) < 0) goto nla_put_failure; rtm->rtm_flags = flags; #ifdef CONFIG_IP_ROUTE_CLASSID if (nhc->nhc_family == AF_INET) { struct fib_nh *nh; nh = container_of(nhc, struct fib_nh, nh_common); if (nh->nh_tclassid && nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) goto nla_put_failure; } #endif } else { if (fib_add_multipath(skb, fi) < 0) goto nla_put_failure; } offload: if (fri->offload) rtm->rtm_flags |= RTM_F_OFFLOAD; if (fri->trap) rtm->rtm_flags |= RTM_F_TRAP; if (fri->offload_failed) rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } /* * Update FIB if: * - local address disappeared -> we must delete all the entries * referring to it. * - device went down -> we must shutdown all nexthops going via it. */ int fib_sync_down_addr(struct net_device *dev, __be32 local) { int tb_id = l3mdev_fib_table(dev) ? 
: RT_TABLE_MAIN; struct net *net = dev_net(dev); struct hlist_head *head; struct fib_info *fi; int ret = 0; if (!local) return 0; head = fib_info_laddrhash_bucket(net, local); hlist_for_each_entry(fi, head, fib_lhash) { if (!net_eq(fi->fib_net, net) || fi->fib_tb_id != tb_id) continue; if (fi->fib_prefsrc == local) { fi->fib_flags |= RTNH_F_DEAD; fi->pfsrc_removed = true; ret++; } } return ret; } static int call_fib_nh_notifiers(struct fib_nh *nh, enum fib_event_type event_type) { bool ignore_link_down = ip_ignore_linkdown(nh->fib_nh_dev); struct fib_nh_notifier_info info = { .fib_nh = nh, }; switch (event_type) { case FIB_EVENT_NH_ADD: if (nh->fib_nh_flags & RTNH_F_DEAD) break; if (ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN) break; return call_fib4_notifiers(dev_net(nh->fib_nh_dev), event_type, &info.info); case FIB_EVENT_NH_DEL: if ((ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN) || (nh->fib_nh_flags & RTNH_F_DEAD)) return call_fib4_notifiers(dev_net(nh->fib_nh_dev), event_type, &info.info); break; default: break; } return NOTIFY_DONE; } /* Update the PMTU of exceptions when: * - the new MTU of the first hop becomes smaller than the PMTU * - the old MTU was the same as the PMTU, and it limited discovery of * larger MTUs on the path. With that limit raised, we can now * discover larger MTUs * A special case is locked exceptions, for which the PMTU is smaller * than the minimal accepted PMTU: * - if the new MTU is greater than the PMTU, don't make any change * - otherwise, unlock and set PMTU */ void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig) { struct fnhe_hash_bucket *bucket; int i; bucket = rcu_dereference_protected(nhc->nhc_exceptions, 1); if (!bucket) return; for (i = 0; i < FNHE_HASH_SIZE; i++) { struct fib_nh_exception *fnhe; for (fnhe = rcu_dereference_protected(bucket[i].chain, 1); fnhe; fnhe = rcu_dereference_protected(fnhe->fnhe_next, 1)) { if (fnhe->fnhe_mtu_locked) { if (new <= fnhe->fnhe_pmtu) { fnhe->fnhe_pmtu = new; fnhe->fnhe_mtu_locked = false; } } else if (new < fnhe->fnhe_pmtu || orig == fnhe->fnhe_pmtu) { fnhe->fnhe_pmtu = new; } } } } void fib_sync_mtu(struct net_device *dev, u32 orig_mtu) { struct hlist_head *head = fib_nh_head(dev); struct fib_nh *nh; hlist_for_each_entry(nh, head, nh_hash) { DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev); fib_nhc_update_mtu(&nh->nh_common, dev->mtu, orig_mtu); } } /* Event force Flags Description * NETDEV_CHANGE 0 LINKDOWN Carrier OFF, not for scope host * NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host * NETDEV_DOWN 1 LINKDOWN|DEAD Last address removed * NETDEV_UNREGISTER 1 LINKDOWN|DEAD Device removed * * only used when fib_nh is built into fib_info */ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) { struct hlist_head *head = fib_nh_head(dev); struct fib_info *prev_fi = NULL; int scope = RT_SCOPE_NOWHERE; struct fib_nh *nh; int ret = 0; if (force) scope = -1; hlist_for_each_entry(nh, head, nh_hash) { struct fib_info *fi = nh->nh_parent; int dead; BUG_ON(!fi->fib_nhs); DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev); if (fi == prev_fi) continue; prev_fi = fi; dead = 0; change_nexthops(fi) { if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD) dead++; else if (nexthop_nh->fib_nh_dev == dev && nexthop_nh->fib_nh_scope != scope) { switch (event) { case NETDEV_DOWN: case NETDEV_UNREGISTER: nexthop_nh->fib_nh_flags |= RTNH_F_DEAD; fallthrough; case NETDEV_CHANGE: nexthop_nh->fib_nh_flags |= RTNH_F_LINKDOWN; break; } call_fib_nh_notifiers(nexthop_nh, 
FIB_EVENT_NH_DEL); dead++; } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (event == NETDEV_UNREGISTER && nexthop_nh->fib_nh_dev == dev) { dead = fi->fib_nhs; break; } #endif } endfor_nexthops(fi) if (dead == fi->fib_nhs) { switch (event) { case NETDEV_DOWN: case NETDEV_UNREGISTER: fi->fib_flags |= RTNH_F_DEAD; fallthrough; case NETDEV_CHANGE: fi->fib_flags |= RTNH_F_LINKDOWN; break; } ret++; } fib_rebalance(fi); } return ret; } /* Must be invoked inside of an RCU protected region. */ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res) { struct fib_info *fi = NULL, *last_resort = NULL; struct hlist_head *fa_head = res->fa_head; struct fib_table *tb = res->table; u8 slen = 32 - res->prefixlen; int order = -1, last_idx = -1; struct fib_alias *fa, *fa1 = NULL; u32 last_prio = res->fi->fib_priority; dscp_t last_dscp = 0; hlist_for_each_entry_rcu(fa, fa_head, fa_list) { struct fib_info *next_fi = fa->fa_info; struct fib_nh_common *nhc; if (fa->fa_slen != slen) continue; if (fa->fa_dscp && !fib_dscp_masked_match(fa->fa_dscp, flp)) continue; if (fa->tb_id != tb->tb_id) continue; if (next_fi->fib_priority > last_prio && fa->fa_dscp == last_dscp) { if (last_dscp) continue; break; } if (next_fi->fib_flags & RTNH_F_DEAD) continue; last_dscp = fa->fa_dscp; last_prio = next_fi->fib_priority; if (next_fi->fib_scope != res->scope || fa->fa_type != RTN_UNICAST) continue; nhc = fib_info_nhc(next_fi, 0); if (!nhc->nhc_gw_family || nhc->nhc_scope != RT_SCOPE_LINK) continue; fib_alias_accessed(fa); if (!fi) { if (next_fi != res->fi) break; fa1 = fa; } else if (!fib_detect_death(fi, order, &last_resort, &last_idx, fa1->fa_default)) { fib_result_assign(res, fi); fa1->fa_default = order; goto out; } fi = next_fi; order++; } if (order <= 0 || !fi) { if (fa1) fa1->fa_default = -1; goto out; } if (!fib_detect_death(fi, order, &last_resort, &last_idx, fa1->fa_default)) { fib_result_assign(res, fi); fa1->fa_default = order; goto out; } if (last_idx >= 0) fib_result_assign(res, last_resort); fa1->fa_default = last_idx; out: return; } /* * Dead device goes up. We wake up dead nexthops. * It takes sense only on multipath routes. 
* * only used when fib_nh is built into fib_info */ int fib_sync_up(struct net_device *dev, unsigned char nh_flags) { struct fib_info *prev_fi; struct hlist_head *head; struct fib_nh *nh; int ret; if (!(dev->flags & IFF_UP)) return 0; if (nh_flags & RTNH_F_DEAD) { unsigned int flags = netif_get_flags(dev); if (flags & (IFF_RUNNING | IFF_LOWER_UP)) nh_flags |= RTNH_F_LINKDOWN; } prev_fi = NULL; head = fib_nh_head(dev); ret = 0; hlist_for_each_entry(nh, head, nh_hash) { struct fib_info *fi = nh->nh_parent; int alive; BUG_ON(!fi->fib_nhs); DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev); if (fi == prev_fi) continue; prev_fi = fi; alive = 0; change_nexthops(fi) { if (!(nexthop_nh->fib_nh_flags & nh_flags)) { alive++; continue; } if (!nexthop_nh->fib_nh_dev || !(nexthop_nh->fib_nh_dev->flags & IFF_UP)) continue; if (nexthop_nh->fib_nh_dev != dev || !__in_dev_get_rtnl(dev)) continue; alive++; nexthop_nh->fib_nh_flags &= ~nh_flags; call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_ADD); } endfor_nexthops(fi) if (alive > 0) { fi->fib_flags &= ~nh_flags; ret++; } fib_rebalance(fi); } return ret; } #ifdef CONFIG_IP_ROUTE_MULTIPATH static bool fib_good_nh(const struct fib_nh *nh) { int state = NUD_REACHABLE; if (nh->fib_nh_scope == RT_SCOPE_LINK) { struct neighbour *n; rcu_read_lock(); if (likely(nh->fib_nh_gw_family == AF_INET)) n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev, (__force u32)nh->fib_nh_gw4); else if (IS_ENABLED(CONFIG_IPV6) && nh->fib_nh_gw_family == AF_INET6) n = __ipv6_neigh_lookup_noref(nh->fib_nh_dev, &nh->fib_nh_gw6); else n = NULL; if (n) state = READ_ONCE(n->nud_state); rcu_read_unlock(); } return !!(state & NUD_VALID); } void fib_select_multipath(struct fib_result *res, int hash, const struct flowi4 *fl4) { struct fib_info *fi = res->fi; struct net *net = fi->fib_net; bool use_neigh; int score = -1; __be32 saddr; if (unlikely(res->fi->nh)) { nexthop_path_fib_result(res, hash); return; } use_neigh = READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh); saddr = fl4 ? fl4->saddr : 0; change_nexthops(fi) { int nh_upper_bound, nh_score = 0; /* Nexthops without a carrier are assigned an upper bound of * minus one when "ignore_routes_with_linkdown" is set. 
*/ nh_upper_bound = atomic_read(&nexthop_nh->fib_nh_upper_bound); if (nh_upper_bound == -1 || (use_neigh && !fib_good_nh(nexthop_nh))) continue; if (saddr && nexthop_nh->nh_saddr == saddr) nh_score += 2; if (hash <= nh_upper_bound) nh_score++; if (score < nh_score) { res->nh_sel = nhsel; res->nhc = &nexthop_nh->nh_common; if (nh_score == 3 || (!saddr && nh_score == 1)) return; score = nh_score; } } endfor_nexthops(fi); } #endif void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, const struct sk_buff *skb) { if (fl4->flowi4_oif) goto check_saddr; #ifdef CONFIG_IP_ROUTE_MULTIPATH if (fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(net, fl4, skb, NULL); fib_select_multipath(res, h, fl4); } else #endif if (!res->prefixlen && res->table->tb_num_default > 1 && res->type == RTN_UNICAST) fib_select_default(fl4, res); check_saddr: if (!fl4->saddr) { struct net_device *l3mdev; l3mdev = dev_get_by_index_rcu(net, fl4->flowi4_l3mdev); if (!l3mdev || l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) == l3mdev) fl4->saddr = fib_result_prefsrc(net, res); else fl4->saddr = inet_select_addr(l3mdev, 0, RT_SCOPE_LINK); } } int __net_init fib4_semantics_init(struct net *net) { unsigned int hash_bits = 4; net->ipv4.fib_info_hash = fib_info_hash_alloc(hash_bits); if (!net->ipv4.fib_info_hash) return -ENOMEM; net->ipv4.fib_info_hash_bits = hash_bits; net->ipv4.fib_info_cnt = 0; return 0; } void __net_exit fib4_semantics_exit(struct net *net) { fib_info_hash_free(net->ipv4.fib_info_hash); } |
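/*
 * Editor's illustrative sketch -- not part of fib_semantics.c.  It is a
 * small standalone user-space program that reproduces the weighted
 * hash-threshold arithmetic used by fib_rebalance() and consumed by
 * fib_select_multipath() above: each live nexthop is given an upper
 * bound proportional to its cumulative weight within the 31-bit hash
 * space, and a flow hash picks the first nexthop whose bound it does
 * not exceed.  The weights below are made-up example values.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t weight[] = { 1, 2, 5 };	/* example nexthop weights */
	const int nhs = sizeof(weight) / sizeof(weight[0]);
	uint64_t total = 0, w = 0;
	int i;

	for (i = 0; i < nhs; i++)
		total += weight[i];

	for (i = 0; i < nhs; i++) {
		uint64_t bound;

		w += weight[i];
		/* same rounding as DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1 */
		bound = ((w << 31) + total / 2) / total - 1;
		printf("nexthop %d: weight %llu -> upper_bound %llu (%.1f%% of hash space)\n",
		       i, (unsigned long long)weight[i],
		       (unsigned long long)bound,
		       100.0 * (double)weight[i] / (double)total);
	}
	return 0;
}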
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1995 Linus Torvalds * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar */ #include <linux/sched.h> /* test_thread_flag(), ... */ #include <linux/sched/task_stack.h> /* task_stack_*(), ... */ #include <linux/kdebug.h> /* oops_begin/end, ...
*/ #include <linux/memblock.h> /* max_low_pfn */ #include <linux/kfence.h> /* kfence_handle_page_fault */ #include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */ #include <linux/mmiotrace.h> /* kmmio_handler, ... */ #include <linux/perf_event.h> /* perf_sw_event */ #include <linux/hugetlb.h> /* hstate_index_to_shift */ #include <linux/context_tracking.h> /* exception_enter(), ... */ #include <linux/uaccess.h> /* faulthandler_disabled() */ #include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/ #include <linux/mm_types.h> #include <linux/mm.h> /* find_and_lock_vma() */ #include <linux/vmalloc.h> #include <asm/cpufeature.h> /* boot_cpu_has, ... */ #include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/fixmap.h> /* VSYSCALL_ADDR */ #include <asm/vsyscall.h> /* emulate_vsyscall */ #include <asm/vm86.h> /* struct vm86 */ #include <asm/mmu_context.h> /* vma_pkey() */ #include <asm/efi.h> /* efi_crash_gracefully_on_page_fault()*/ #include <asm/desc.h> /* store_idt(), ... */ #include <asm/cpu_entry_area.h> /* exception stack */ #include <asm/pgtable_areas.h> /* VMALLOC_START, ... */ #include <asm/kvm_para.h> /* kvm_handle_async_pf */ #include <asm/vdso.h> /* fixup_vdso_exception() */ #include <asm/irq_stack.h> #include <asm/fred.h> #include <asm/sev.h> /* snp_dump_hva_rmpentry() */ #define CREATE_TRACE_POINTS #include <trace/events/exceptions.h> /* * Returns 0 if mmiotrace is disabled, or if the fault is not * handled by mmiotrace: */ static nokprobe_inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { if (unlikely(is_kmmio_active())) if (kmmio_handler(regs, addr) == 1) return -1; return 0; } /* * Prefetch quirks: * * 32-bit mode: * * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. * Check that here and ignore it. This is AMD erratum #91. * * 64-bit mode: * * Sometimes the CPU reports invalid exceptions on prefetch. * Check that here and ignore it. * * Opcode checker based on code by Richard Brunner. */ static inline int check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, unsigned char opcode, int *prefetch) { unsigned char instr_hi = opcode & 0xf0; unsigned char instr_lo = opcode & 0x0f; switch (instr_hi) { case 0x20: case 0x30: /* * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. * In X86_64 long mode, the CPU will signal invalid * opcode if some of these prefixes are present so * X86_64 will never get here anyway */ return ((instr_lo & 7) == 0x6); #ifdef CONFIG_X86_64 case 0x40: /* * In 64-bit mode 0x40..0x4F are valid REX prefixes */ return (!user_mode(regs) || user_64bit_mode(regs)); #endif case 0x60: /* 0x64 thru 0x67 are valid prefixes in all modes. */ return (instr_lo & 0xC) == 0x4; case 0xF0: /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. 
*/ return !instr_lo || (instr_lo>>1) == 1; case 0x00: /* Prefetch instruction is 0x0F0D or 0x0F18 */ if (get_kernel_nofault(opcode, instr)) return 0; *prefetch = (instr_lo == 0xF) && (opcode == 0x0D || opcode == 0x18); return 0; default: return 0; } } static bool is_amd_k8_pre_npt(void) { struct cpuinfo_x86 *c = &boot_cpu_data; return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) && c->x86_vendor == X86_VENDOR_AMD && c->x86 == 0xf && c->x86_model < 0x40); } static int is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) { unsigned char *max_instr; unsigned char *instr; int prefetch = 0; /* Erratum #91 affects AMD K8, pre-NPT CPUs */ if (!is_amd_k8_pre_npt()) return 0; /* * If it was a exec (instruction fetch) fault on NX page, then * do not ignore the fault: */ if (error_code & X86_PF_INSTR) return 0; instr = (void *)convert_ip_to_linear(current, regs); max_instr = instr + 15; /* * This code has historically always bailed out if IP points to a * not-present page (e.g. due to a race). No one has ever * complained about this. */ pagefault_disable(); while (instr < max_instr) { unsigned char opcode; if (user_mode(regs)) { if (get_user(opcode, (unsigned char __user *) instr)) break; } else { if (get_kernel_nofault(opcode, instr)) break; } instr++; if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) break; } pagefault_enable(); return prefetch; } DEFINE_SPINLOCK(pgd_lock); LIST_HEAD(pgd_list); #ifdef CONFIG_X86_32 static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) { unsigned index = pgd_index(address); pgd_t *pgd_k; p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; pgd += index; pgd_k = init_mm.pgd + index; if (!pgd_present(*pgd_k)) return NULL; /* * set_pgd(pgd, *pgd_k); here would be useless on PAE * and redundant with the set_pmd() on non-PAE. As would * set_p4d/set_pud. */ p4d = p4d_offset(pgd, address); p4d_k = p4d_offset(pgd_k, address); if (!p4d_present(*p4d_k)) return NULL; pud = pud_offset(p4d, address); pud_k = pud_offset(p4d_k, address); if (!pud_present(*pud_k)) return NULL; pmd = pmd_offset(pud, address); pmd_k = pmd_offset(pud_k, address); if (pmd_present(*pmd) != pmd_present(*pmd_k)) set_pmd(pmd, *pmd_k); if (!pmd_present(*pmd_k)) return NULL; else BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k)); return pmd_k; } /* * Handle a fault on the vmalloc or module mapping area * * This is needed because there is a race condition between the time * when the vmalloc mapping code updates the PMD to the point in time * where it synchronizes this update with the other page-tables in the * system. * * In this race window another thread/CPU can map an area on the same * PMD, finds it already present and does not synchronize it with the * rest of the system yet. As a result v[mz]alloc might return areas * which are not mapped in every page-table in the system, causing an * unhandled page-fault when they are accessed. */ static noinline int vmalloc_fault(unsigned long address) { unsigned long pgd_paddr; pmd_t *pmd_k; pte_t *pte_k; /* Make sure we are in vmalloc area: */ if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; /* * Synchronize this task's top level page-table * with the 'reference' page table. * * Do _not_ use "current" here. We might be inside * an interrupt in the middle of a task switch.. 
*/ pgd_paddr = read_cr3_pa(); pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); if (!pmd_k) return -1; if (pmd_leaf(*pmd_k)) return 0; pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) return -1; return 0; } NOKPROBE_SYMBOL(vmalloc_fault); void arch_sync_kernel_mappings(unsigned long start, unsigned long end) { unsigned long addr; for (addr = start & PMD_MASK; addr >= TASK_SIZE_MAX && addr < VMALLOC_END; addr += PMD_SIZE) { struct page *page; spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { spinlock_t *pgt_lock; /* the pgt_lock only for Xen */ pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); vmalloc_sync_one(page_address(page), addr); spin_unlock(pgt_lock); } spin_unlock(&pgd_lock); } } static bool low_pfn(unsigned long pfn) { return pfn < max_low_pfn; } static void dump_pagetable(unsigned long address) { pgd_t *base = __va(read_cr3_pa()); pgd_t *pgd = &base[pgd_index(address)]; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; #ifdef CONFIG_X86_PAE pr_info("*pdpt = %016Lx ", pgd_val(*pgd)); if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) goto out; #define pr_pde pr_cont #else #define pr_pde pr_info #endif p4d = p4d_offset(pgd, address); pud = pud_offset(p4d, address); pmd = pmd_offset(pud, address); pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); #undef pr_pde /* * We must not directly access the pte in the highpte * case if the page table is located in highmem. * And let's rather not kmap-atomic the pte, just in case * it's allocated already: */ if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_leaf(*pmd)) goto out; pte = pte_offset_kernel(pmd, address); pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); out: pr_cont("\n"); } #else /* CONFIG_X86_64: */ #ifdef CONFIG_CPU_SUP_AMD static const char errata93_warning[] = KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" "******* Working around it, but it may cause SEGVs or burn power.\n" "******* Please consider a BIOS update.\n" "******* Disabling USB legacy in the BIOS may also help.\n"; #endif static int bad_address(void *p) { unsigned long dummy; return get_kernel_nofault(dummy, (unsigned long *)p); } static void dump_pagetable(unsigned long address) { pgd_t *base = __va(read_cr3_pa()); pgd_t *pgd = base + pgd_index(address); p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; if (bad_address(pgd)) goto bad; pr_info("PGD %lx ", pgd_val(*pgd)); if (!pgd_present(*pgd)) goto out; p4d = p4d_offset(pgd, address); if (bad_address(p4d)) goto bad; pr_cont("P4D %lx ", p4d_val(*p4d)); if (!p4d_present(*p4d) || p4d_leaf(*p4d)) goto out; pud = pud_offset(p4d, address); if (bad_address(pud)) goto bad; pr_cont("PUD %lx ", pud_val(*pud)); if (!pud_present(*pud) || pud_leaf(*pud)) goto out; pmd = pmd_offset(pud, address); if (bad_address(pmd)) goto bad; pr_cont("PMD %lx ", pmd_val(*pmd)); if (!pmd_present(*pmd) || pmd_leaf(*pmd)) goto out; pte = pte_offset_kernel(pmd, address); if (bad_address(pte)) goto bad; pr_cont("PTE %lx", pte_val(*pte)); out: pr_cont("\n"); return; bad: pr_info("BAD\n"); } #endif /* CONFIG_X86_64 */ /* * Workaround for K8 erratum #93 & buggy BIOS. * * BIOS SMM functions are required to use a specific workaround * to avoid corruption of the 64bit RIP register on C stepping K8. * * A lot of BIOS that didn't get tested properly miss this. * * The OS sees this as a page fault with the upper 32bits of RIP cleared. * Try to work around it here. * * Note we only handle faults in kernel here. 
* Does nothing on 32-bit. */ static int is_errata93(struct pt_regs *regs, unsigned long address) { #if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD || boot_cpu_data.x86 != 0xf) return 0; if (user_mode(regs)) return 0; if (address != regs->ip) return 0; if ((address >> 32) != 0) return 0; address |= 0xffffffffUL << 32; if ((address >= (u64)_stext && address <= (u64)_etext) || (address >= MODULES_VADDR && address <= MODULES_END)) { printk_once(errata93_warning); regs->ip = address; return 1; } #endif return 0; } /* * Work around K8 erratum #100 K8 in compat mode occasionally jumps * to illegal addresses >4GB. * * We catch this in the page fault handler because these addresses * are not reachable. Just detect this case and return. Any code * segment in LDT is compatibility mode. */ static int is_errata100(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_64 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32)) return 1; #endif return 0; } /* Pentium F0 0F C7 C8 bug workaround: */ static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code, unsigned long address) { #ifdef CONFIG_X86_F00F_BUG if (boot_cpu_has_bug(X86_BUG_F00F) && !(error_code & X86_PF_USER) && idt_is_f00f_address(address)) { handle_invalid_op(regs); return 1; } #endif return 0; } static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index) { u32 offset = (index >> 3) * sizeof(struct desc_struct); unsigned long addr; struct ldttss_desc desc; if (index == 0) { pr_alert("%s: NULL\n", name); return; } if (offset + sizeof(struct ldttss_desc) >= gdt->size) { pr_alert("%s: 0x%hx -- out of bounds\n", name, index); return; } if (copy_from_kernel_nofault(&desc, (void *)(gdt->address + offset), sizeof(struct ldttss_desc))) { pr_alert("%s: 0x%hx -- GDT entry is not readable\n", name, index); return; } addr = desc.base0 | (desc.base1 << 16) | ((unsigned long)desc.base2 << 24); #ifdef CONFIG_X86_64 addr |= ((u64)desc.base3 << 32); #endif pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n", name, index, addr, (desc.limit0 | (desc.limit1 << 16))); } static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) { if (!oops_may_print()) return; if (error_code & X86_PF_INSTR) { unsigned int level; bool nx, rw; pgd_t *pgd; pte_t *pte; pgd = __va(read_cr3_pa()); pgd += pgd_index(address); pte = lookup_address_in_pgd_attr(pgd, address, &level, &nx, &rw); if (pte && pte_present(*pte) && (!pte_exec(*pte) || nx)) pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", from_kuid(&init_user_ns, current_uid())); if (pte && pte_present(*pte) && pte_exec(*pte) && !nx && (pgd_flags(*pgd) & _PAGE_USER) && (__read_cr4() & X86_CR4_SMEP)) pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n", from_kuid(&init_user_ns, current_uid())); } if (address < PAGE_SIZE && !user_mode(regs)) pr_alert("BUG: kernel NULL pointer dereference, address: %px\n", (void *)address); else pr_alert("BUG: unable to handle page fault for address: %px\n", (void *)address); pr_alert("#PF: %s %s in %s mode\n", (error_code & X86_PF_USER) ? "user" : "supervisor", (error_code & X86_PF_INSTR) ? "instruction fetch" : (error_code & X86_PF_WRITE) ? "write access" : "read access", user_mode(regs) ? "user" : "kernel"); pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code, !(error_code & X86_PF_PROT) ? "not-present page" : (error_code & X86_PF_RSVD) ? "reserved bit violation" : (error_code & X86_PF_PK) ? 
"protection keys violation" : (error_code & X86_PF_RMP) ? "RMP violation" : "permissions violation"); if (!(error_code & X86_PF_USER) && user_mode(regs)) { struct desc_ptr idt, gdt; u16 ldtr, tr; /* * This can happen for quite a few reasons. The more obvious * ones are faults accessing the GDT, or LDT. Perhaps * surprisingly, if the CPU tries to deliver a benign or * contributory exception from user code and gets a page fault * during delivery, the page fault can be delivered as though * it originated directly from user code. This could happen * due to wrong permissions on the IDT, GDT, LDT, TSS, or * kernel or IST stack. */ store_idt(&idt); /* Usable even on Xen PV -- it's just slow. */ native_store_gdt(&gdt); pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n", idt.address, idt.size, gdt.address, gdt.size); store_ldt(ldtr); show_ldttss(&gdt, "LDTR", ldtr); store_tr(tr); show_ldttss(&gdt, "TR", tr); } dump_pagetable(address); if (error_code & X86_PF_RMP) snp_dump_hva_rmpentry(address); } static noinline void pgtable_bad(struct pt_regs *regs, unsigned long error_code, unsigned long address) { struct task_struct *tsk; unsigned long flags; int sig; flags = oops_begin(); tsk = current; sig = SIGKILL; printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", tsk->comm, address); dump_pagetable(address); if (__die("Bad pagetable", regs, error_code)) sig = 0; oops_end(flags, regs, sig); } static void sanitize_error_code(unsigned long address, unsigned long *error_code) { /* * To avoid leaking information about the kernel page * table layout, pretend that user-mode accesses to * kernel addresses are always protection faults. * * NB: This means that failed vsyscalls with vsyscall=none * will have the PROT bit. This doesn't leak any * information and does not appear to cause any problems. */ if (address >= TASK_SIZE_MAX) *error_code |= X86_PF_PROT; } static void set_signal_archinfo(unsigned long address, unsigned long error_code) { struct task_struct *tsk = current; tsk->thread.trap_nr = X86_TRAP_PF; tsk->thread.error_code = error_code | X86_PF_USER; tsk->thread.cr2 = address; } static noinline void page_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) { #ifdef CONFIG_VMAP_STACK struct stack_info info; #endif unsigned long flags; int sig; if (user_mode(regs)) { /* * Implicit kernel access from user mode? Skip the stack * overflow and EFI special cases. */ goto oops; } #ifdef CONFIG_VMAP_STACK /* * Stack overflow? During boot, we can fault near the initial * stack in the direct map, but that's not an overflow -- check * that we're in vmalloc space to avoid this. */ if (is_vmalloc_addr((void *)address) && get_stack_guard_info((void *)address, &info)) { /* * We're likely to be running with very little stack space * left. It's plausible that we'd hit this condition but * double-fault even before we get this far, in which case * we're fine: the double-fault handler will deal with it. * * We don't want to make it all the way into the oops code * and then double-fault, though, because we're likely to * break the console driver and lose most of the stack dump. */ call_on_stack(__this_cpu_ist_top_va(DF) - sizeof(void*), handle_stack_overflow, ASM_CALL_ARG3, , [arg1] "r" (regs), [arg2] "r" (address), [arg3] "r" (&info)); BUG(); } #endif /* * Buggy firmware could access regions which might page fault. If * this happens, EFI has a special OOPS path that will try to * avoid hanging the system. 
*/ if (IS_ENABLED(CONFIG_EFI)) efi_crash_gracefully_on_page_fault(address); /* Only not-present faults should be handled by KFENCE. */ if (!(error_code & X86_PF_PROT) && kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs)) return; oops: /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice: */ flags = oops_begin(); show_fault_oops(regs, error_code, address); if (task_stack_end_corrupted(current)) printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); sig = SIGKILL; if (__die("Oops", regs, error_code)) sig = 0; /* Executive summary in case the body of the oops scrolled away */ printk(KERN_DEFAULT "CR2: %016lx\n", address); oops_end(flags, regs, sig); } static noinline void kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address, int signal, int si_code, u32 pkey) { WARN_ON_ONCE(user_mode(regs)); /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) return; /* * AMD erratum #91 manifests as a spurious page fault on a PREFETCH * instruction. */ if (is_prefetch(regs, error_code, address)) return; page_fault_oops(regs, error_code, address); } /* * Print out info about fatal segfaults, if the show_unhandled_signals * sysctl is set: */ static inline void show_signal_msg(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct task_struct *tsk) { const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG; /* This is a racy snapshot, but it's better than nothing. */ int cpu = raw_smp_processor_id(); if (!unhandled_signal(tsk, SIGSEGV)) return; if (!printk_ratelimit()) return; printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx", loglvl, tsk->comm, task_pid_nr(tsk), address, (void *)regs->ip, (void *)regs->sp, error_code); print_vma_addr(KERN_CONT " in ", regs->ip); /* * Dump the likely CPU where the fatal segfault happened. * This can help identify faulty hardware. */ printk(KERN_CONT " likely on CPU %d (core %d, socket %d)", cpu, topology_core_id(cpu), topology_physical_package_id(cpu)); printk(KERN_CONT "\n"); show_opcodes(regs, loglvl); } static void __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, unsigned long address, u32 pkey, int si_code) { struct task_struct *tsk = current; if (!user_mode(regs)) { kernelmode_fixup_or_oops(regs, error_code, address, SIGSEGV, si_code, pkey); return; } if (!(error_code & X86_PF_USER)) { /* Implicit user access to kernel memory -- just oops */ page_fault_oops(regs, error_code, address); return; } /* * User mode accesses just cause a SIGSEGV. 
* It's possible to have interrupts off here: */ local_irq_enable(); /* * Valid to do another page fault here because this one came * from user space: */ if (is_prefetch(regs, error_code, address)) return; if (is_errata100(regs, address)) return; sanitize_error_code(address, &error_code); if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address)) return; if (likely(show_unhandled_signals)) show_signal_msg(regs, error_code, address, tsk); set_signal_archinfo(address, error_code); if (si_code == SEGV_PKUERR) force_sig_pkuerr((void __user *)address, pkey); else force_sig_fault(SIGSEGV, si_code, (void __user *)address); } static noinline void bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, unsigned long address) { __bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR); } static void __bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct mm_struct *mm, struct vm_area_struct *vma, u32 pkey, int si_code) { /* * Something tried to access memory that isn't in our memory map.. * Fix it, but check if it's kernel or user first.. */ if (mm) mmap_read_unlock(mm); else vma_end_read(vma); __bad_area_nosemaphore(regs, error_code, address, pkey, si_code); } static inline bool bad_area_access_from_pkeys(unsigned long error_code, struct vm_area_struct *vma) { /* This code is always called on the current mm */ bool foreign = false; if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return false; if (error_code & X86_PF_PK) return true; /* this checks permission keys on the VMA: */ if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), (error_code & X86_PF_INSTR), foreign)) return true; return false; } static noinline void bad_area_access_error(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct mm_struct *mm, struct vm_area_struct *vma) { /* * This OSPKE check is not strictly necessary at runtime. * But, doing it this way allows compiler optimizations * if pkeys are compiled out. */ if (bad_area_access_from_pkeys(error_code, vma)) { /* * A protection key fault means that the PKRU value did not allow * access to some PTE. Userspace can figure out what PKRU was * from the XSAVE state. This function captures the pkey from * the vma and passes it to userspace so userspace can discover * which protection key was set on the PTE. * * If we get here, we know that the hardware signaled a X86_PF_PK * fault and that there was a VMA once we got in the fault * handler. It does *not* guarantee that the VMA we find here * was the one that we faulted on. * * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4); * 2. T1 : set PKRU to deny access to pkey=4, touches page * 3. T1 : faults... * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5); * 5. T1 : enters fault handler, takes mmap_lock, etc... * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really * faulted on a pte with its pkey=4. */ u32 pkey = vma_pkey(vma); __bad_area(regs, error_code, address, mm, vma, pkey, SEGV_PKUERR); } else { __bad_area(regs, error_code, address, mm, vma, 0, SEGV_ACCERR); } } static void do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, vm_fault_t fault) { /* Kernel mode? 
Handle exceptions or die: */ if (!user_mode(regs)) { kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY); return; } /* User-space => ok to do another page fault: */ if (is_prefetch(regs, error_code, address)) return; sanitize_error_code(address, &error_code); if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address)) return; set_signal_archinfo(address, error_code); #ifdef CONFIG_MEMORY_FAILURE if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { struct task_struct *tsk = current; unsigned lsb = 0; pr_err( "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", tsk->comm, tsk->pid, address); if (fault & VM_FAULT_HWPOISON_LARGE) lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); if (fault & VM_FAULT_HWPOISON) lsb = PAGE_SHIFT; force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb); return; } #endif force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); } static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte) { if ((error_code & X86_PF_WRITE) && !pte_write(*pte)) return 0; if ((error_code & X86_PF_INSTR) && !pte_exec(*pte)) return 0; return 1; } /* * Handle a spurious fault caused by a stale TLB entry. * * This allows us to lazily refresh the TLB when increasing the * permissions of a kernel page (RO -> RW or NX -> X). Doing it * eagerly is very expensive since that implies doing a full * cross-processor TLB flush, even if no stale TLB entries exist * on other processors. * * Spurious faults may only occur if the TLB contains an entry with * fewer permission than the page table entry. Non-present (P = 0) * and reserved bit (R = 1) faults are never spurious. * * There are no security implications to leaving a stale TLB when * increasing the permissions on a page. * * Returns non-zero if a spurious fault was handled, zero otherwise. * * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3 * (Optional Invalidation). */ static noinline int spurious_kernel_fault(unsigned long error_code, unsigned long address) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; int ret; /* * Only writes to RO or instruction fetches from NX may cause * spurious faults. * * These could be from user or supervisor accesses but the TLB * is only lazily flushed after a kernel mapping protection * change, so user accesses are not expected to cause spurious * faults. */ if (error_code != (X86_PF_WRITE | X86_PF_PROT) && error_code != (X86_PF_INSTR | X86_PF_PROT)) return 0; pgd = init_mm.pgd + pgd_index(address); if (!pgd_present(*pgd)) return 0; p4d = p4d_offset(pgd, address); if (!p4d_present(*p4d)) return 0; if (p4d_leaf(*p4d)) return spurious_kernel_fault_check(error_code, (pte_t *) p4d); pud = pud_offset(p4d, address); if (!pud_present(*pud)) return 0; if (pud_leaf(*pud)) return spurious_kernel_fault_check(error_code, (pte_t *) pud); pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return 0; if (pmd_leaf(*pmd)) return spurious_kernel_fault_check(error_code, (pte_t *) pmd); pte = pte_offset_kernel(pmd, address); if (!pte_present(*pte)) return 0; ret = spurious_kernel_fault_check(error_code, pte); if (!ret) return 0; /* * Make sure we have permissions in PMD. 
* If not, then there's a bug in the page tables: */ ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd); WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); return ret; } NOKPROBE_SYMBOL(spurious_kernel_fault); int show_unhandled_signals = 1; static inline int access_error(unsigned long error_code, struct vm_area_struct *vma) { /* This is only called for the current mm, so: */ bool foreign = false; /* * Read or write was blocked by protection keys. This is * always an unconditional error and can never result in * a follow-up action to resolve the fault, like a COW. */ if (error_code & X86_PF_PK) return 1; /* * SGX hardware blocked the access. This usually happens * when the enclave memory contents have been destroyed, like * after a suspend/resume cycle. In any case, the kernel can't * fix the cause of the fault. Handle the fault as an access * error even in cases where no actual access violation * occurred. This allows userspace to rebuild the enclave in * response to the signal. */ if (unlikely(error_code & X86_PF_SGX)) return 1; /* * Make sure to check the VMA so that we do not perform * faults just to hit a X86_PF_PK as soon as we fill in a * page. */ if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), (error_code & X86_PF_INSTR), foreign)) return 1; /* * Shadow stack accesses (PF_SHSTK=1) are only permitted to * shadow stack VMAs. All other accesses result in an error. */ if (error_code & X86_PF_SHSTK) { if (unlikely(!(vma->vm_flags & VM_SHADOW_STACK))) return 1; if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; return 0; } if (error_code & X86_PF_WRITE) { /* write, present and write, not present: */ if (unlikely(vma->vm_flags & VM_SHADOW_STACK)) return 1; if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; return 0; } /* read, present: */ if (unlikely(error_code & X86_PF_PROT)) return 1; /* read, not present: */ if (unlikely(!vma_is_accessible(vma))) return 1; return 0; } bool fault_in_kernel_space(unsigned long address) { /* * On 64-bit systems, the vsyscall page is at an address above * TASK_SIZE_MAX, but is not considered part of the kernel * address space. */ if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address)) return false; return address >= TASK_SIZE_MAX; } /* * Called for all faults where 'address' is part of the kernel address * space. Might get called for faults that originate from *code* that * ran in userspace or the kernel. */ static void do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, unsigned long address) { /* * Protection keys exceptions only happen on user pages. We * have no user pages in the kernel portion of the address * space, so do not expect them here. */ WARN_ON_ONCE(hw_error_code & X86_PF_PK); #ifdef CONFIG_X86_32 /* * We can fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. * * NOTE! We MUST NOT take any locks for this case. We may * be in an interrupt or a critical region, and should * only copy the information from the master page table, * nothing more. * * Before doing this on-demand faulting, ensure that the * fault is not any of the following: * 1. A fault on a PTE with a reserved bit set. * 2. A fault caused by a user-mode access. (Do not demand- * fault kernel memory due to user-mode accesses). * 3. A fault caused by a page-level protection violation. * (A demand fault would be on a non-present page which * would have X86_PF_PROT==0). * * This is only needed to close a race condition on x86-32 in * the vmalloc mapping/unmapping code. 
See the comment above * vmalloc_fault() for details. On x86-64 the race does not * exist as the vmalloc mappings don't need to be synchronized * there. */ if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { if (vmalloc_fault(address) >= 0) return; } #endif if (is_f00f_bug(regs, hw_error_code, address)) return; /* Was the fault spurious, caused by lazy TLB invalidation? */ if (spurious_kernel_fault(hw_error_code, address)) return; /* kprobes don't want to hook the spurious faults: */ if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF))) return; /* * Note, despite being a "bad area", there are quite a few * acceptable reasons to get here, such as erratum fixups * and handling kernel code that can fault, like get_user(). * * Don't take the mm semaphore here. If we fixup a prefetch * fault we could otherwise deadlock: */ bad_area_nosemaphore(regs, hw_error_code, address); } NOKPROBE_SYMBOL(do_kern_addr_fault); /* * Handle faults in the user portion of the address space. Nothing in here * should check X86_PF_USER without a specific justification: for almost * all purposes, we should treat a normal kernel access to user memory * (e.g. get_user(), put_user(), etc.) the same as the WRUSS instruction. * The one exception is AC flag handling, which is, per the x86 * architecture, special for WRUSS. */ static inline void do_user_addr_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) { struct vm_area_struct *vma; struct task_struct *tsk; struct mm_struct *mm; vm_fault_t fault; unsigned int flags = FAULT_FLAG_DEFAULT; tsk = current; mm = tsk->mm; if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) { /* * Whoops, this is kernel mode code trying to execute from * user memory. Unless this is AMD erratum #93, which * corrupts RIP such that it looks like a user address, * this is unrecoverable. Don't even try to look up the * VMA or look for extable entries. */ if (is_errata93(regs, address)) return; page_fault_oops(regs, error_code, address); return; } /* kprobes don't want to hook the spurious faults: */ if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF))) return; /* * Reserved bits are never expected to be set on * entries in the user portion of the page tables. */ if (unlikely(error_code & X86_PF_RSVD)) pgtable_bad(regs, error_code, address); /* * If SMAP is on, check for invalid kernel (supervisor) access to user * pages in the user address space. The odd case here is WRUSS, * which, according to the preliminary documentation, does not respect * SMAP and will have the USER bit set so, in all cases, SMAP * enforcement appears to be consistent with the USER bit. */ if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) && !(error_code & X86_PF_USER) && !(regs->flags & X86_EFLAGS_AC))) { /* * No extable entry here. This was a kernel access to an * invalid pointer. get_kernel_nofault() will not get here. */ page_fault_oops(regs, error_code, address); return; } /* * If we're in an interrupt, have no user context or are running * in a region with pagefaults disabled then we must not take the fault */ if (unlikely(faulthandler_disabled() || !mm)) { bad_area_nosemaphore(regs, error_code, address); return; } /* Legacy check - remove this after verifying that it doesn't trigger */ if (WARN_ON_ONCE(!(regs->flags & X86_EFLAGS_IF))) { bad_area_nosemaphore(regs, error_code, address); return; } local_irq_enable(); perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* * Read-only permissions can not be expressed in shadow stack PTEs. 
* Treat all shadow stack accesses as WRITE faults. This ensures * that the MM will prepare everything (e.g., break COW) such that * maybe_mkwrite() can create a proper shadow stack PTE. */ if (error_code & X86_PF_SHSTK) flags |= FAULT_FLAG_WRITE; if (error_code & X86_PF_WRITE) flags |= FAULT_FLAG_WRITE; if (error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; /* * We set FAULT_FLAG_USER based on the register state, not * based on X86_PF_USER. User space accesses that cause * system page faults are still user accesses. */ if (user_mode(regs)) flags |= FAULT_FLAG_USER; #ifdef CONFIG_X86_64 /* * Faults in the vsyscall page might need emulation. The * vsyscall page is at a high address (>PAGE_OFFSET), but is * considered to be part of the user address space. * * The vsyscall page does not have a "real" VMA, so do this * emulation before we go searching for VMAs. * * PKRU never rejects instruction fetches, so we don't need * to consider the PF_PK bit. */ if (is_vsyscall_vaddr(address)) { if (emulate_vsyscall_pf(error_code, regs, address)) return; } #endif if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; vma = lock_vma_under_rcu(mm, address); if (!vma) goto lock_mmap; if (unlikely(access_error(error_code, vma))) { bad_area_access_error(regs, error_code, address, NULL, vma); count_vm_vma_lock_event(VMA_LOCK_SUCCESS); return; } fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); if (fault & VM_FAULT_MAJOR) flags |= FAULT_FLAG_TRIED; /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { if (!user_mode(regs)) kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY); return; } lock_mmap: retry: vma = lock_mm_and_find_vma(mm, address, regs); if (unlikely(!vma)) { bad_area_nosemaphore(regs, error_code, address); return; } /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ if (unlikely(access_error(error_code, vma))) { bad_area_access_error(regs, error_code, address, mm, vma); return; } /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if * we get VM_FAULT_RETRY back, the mmap_lock has been unlocked. * * Note that handle_userfault() may also release and reacquire mmap_lock * (and not return with VM_FAULT_RETRY), when returning to userland to * repeat the page fault later with a VM_FAULT_NOPAGE retval * (potentially after handling any pending signal during the return to * userland). The return to userland is identified whenever * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags. */ fault = handle_mm_fault(vma, address, flags, regs); if (fault_signal_pending(fault, regs)) { /* * Quick path to respond to signals. The core mm code * has unlocked the mm for us if we get here. */ if (!user_mode(regs)) kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY); return; } /* The fault is fully completed (including releasing mmap lock) */ if (fault & VM_FAULT_COMPLETED) return; /* * If we need to retry the mmap_lock has already been released, * and if there is a fatal signal pending there is no guarantee * that we made any progress. Handle this case first. 
*/ if (unlikely(fault & VM_FAULT_RETRY)) { flags |= FAULT_FLAG_TRIED; goto retry; } mmap_read_unlock(mm); done: if (likely(!(fault & VM_FAULT_ERROR))) return; if (fatal_signal_pending(current) && !user_mode(regs)) { kernelmode_fixup_or_oops(regs, error_code, address, 0, 0, ARCH_DEFAULT_PKEY); return; } if (fault & VM_FAULT_OOM) { /* Kernel mode? Handle exceptions or die: */ if (!user_mode(regs)) { kernelmode_fixup_or_oops(regs, error_code, address, SIGSEGV, SEGV_MAPERR, ARCH_DEFAULT_PKEY); return; } /* * We ran out of memory, call the OOM killer, and return the * userspace (which will retry the fault, or kill us if we got * oom-killed): */ pagefault_out_of_memory(); } else { if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| VM_FAULT_HWPOISON_LARGE)) do_sigbus(regs, error_code, address, fault); else if (fault & VM_FAULT_SIGSEGV) bad_area_nosemaphore(regs, error_code, address); else BUG(); } } NOKPROBE_SYMBOL(do_user_addr_fault); static __always_inline void trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code, unsigned long address) { if (user_mode(regs)) trace_page_fault_user(address, regs, error_code); else trace_page_fault_kernel(address, regs, error_code); } static __always_inline void handle_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) { trace_page_fault_entries(regs, error_code, address); if (unlikely(kmmio_fault(regs, address))) return; /* Was the fault on kernel-controlled part of the address space? */ if (unlikely(fault_in_kernel_space(address))) { do_kern_addr_fault(regs, error_code, address); } else { do_user_addr_fault(regs, error_code, address); } /* * page fault handling might have reenabled interrupts, * make sure to disable them again. */ local_irq_disable(); } DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) { irqentry_state_t state; unsigned long address; address = cpu_feature_enabled(X86_FEATURE_FRED) ? fred_event_data(regs) : read_cr2(); /* * KVM uses #PF vector to deliver 'page not present' events to guests * (asynchronous page fault mechanism). The event happens when a * userspace task is trying to access some valid (from guest's point of * view) memory which is not currently mapped by the host (e.g. the * memory is swapped out). Note, the corresponding "page ready" event * which is injected when the memory becomes available, is delivered via * an interrupt mechanism and not a #PF exception * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()). * * We are relying on the interrupted context being sane (valid RSP, * relevant locks not held, etc.), which is fine as long as the * interrupted context had IF=1. We are also relying on the KVM * async pf type field and CR2 being read consistently instead of * getting values from real and async page faults mixed up. * * Fingers crossed. * * The async #PF handling code takes care of idtentry handling * itself. */ if (kvm_handle_async_pf(regs, (u32)address)) return; /* * Entry handling for valid #PF from kernel mode is slightly * different: RCU is already watching and ct_irq_enter() must not * be invoked because a kernel fault on a user space address might * sleep. * * In case the fault hit a RCU idle region the conditional entry * code reenabled RCU to avoid subsequent wreckage which helps * debuggability. */ state = irqentry_enter(regs); instrumentation_begin(); handle_page_fault(regs, error_code, address); instrumentation_end(); irqentry_exit(regs, state); } |
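When do_user_addr_fault() above cannot resolve a user-mode fault, the path converges on force_sig_fault(SIGSEGV, ...), with si_code distinguishing an unmapped address (SEGV_MAPERR) from a permission failure (SEGV_ACCERR, or SEGV_PKUERR for protection keys). The small user-space program below is a sketch of what that delivery looks like from the receiving end; it is illustrative only and not part of the kernel sources.

/*
 * Observe the SIGSEGV produced by the fault path above: install an
 * SA_SIGINFO handler, touch an unmapped address, and print the
 * faulting address and si_code the kernel filled in.
 */
#define _POSIX_C_SOURCE 200809L
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void segv_handler(int sig, siginfo_t *info, void *ucontext)
{
        (void)sig;
        (void)ucontext;
        /* Async-signal-safety is ignored here for brevity of the demo. */
        fprintf(stderr, "SIGSEGV at %p, si_code=%d (%s)\n",
                info->si_addr, info->si_code,
                info->si_code == SEGV_MAPERR ? "address not mapped" :
                info->si_code == SEGV_ACCERR ? "access not permitted" :
                "other");
        _exit(1);
}

int main(void)
{
        struct sigaction sa;
        volatile int *bad = (volatile int *)0xdeadbeefUL;

        memset(&sa, 0, sizeof(sa));
        sa.sa_sigaction = segv_handler;
        sa.sa_flags = SA_SIGINFO;
        sigaction(SIGSEGV, &sa, NULL);

        /* Touch an unmapped address; the kernel path above turns the
         * resulting page fault into the SIGSEGV seen by the handler. */
        *bad = 1;
        return 0;
}

Running it typically reports si_code=1 (SEGV_MAPERR), which corresponds to the bad_area_nosemaphore() branch in the handler above; writing to a read-only mapping instead would surface as SEGV_ACCERR via bad_area_access_error().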
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Definitions for the TCP module.
 *
 * Version:	@(#)tcp.h	1.0.5	05/23/93
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 */
#ifndef _TCP_H
#define _TCP_H

#define FASTRETRANS_DEBUG 1

#include <linux/list.h>
#include <linux/tcp.h>
#include <linux/bug.h>
#include <linux/slab.h>
#include <linux/cache.h>
#include <linux/percpu.h>
#include <linux/skbuff.h>
#include <linux/kref.h>
#include <linux/ktime.h>
#include <linux/indirect_call_wrapper.h>
#include <linux/bits.h>

#include <net/inet_connection_sock.h>
#include <net/inet_timewait_sock.h>
#include <net/inet_hashtables.h>
#include <net/checksum.h>
#include <net/request_sock.h>
#include <net/sock_reuseport.h>
#include <net/sock.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/tcp_states.h>
#include <net/tcp_ao.h>
#include <net/inet_ecn.h>
#include <net/dst.h>
#include <net/mptcp.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>

#include <linux/seq_file.h>
#include <linux/memcontrol.h>
#include <linux/bpf-cgroup.h>
#include <linux/siphash.h>

extern struct inet_hashinfo tcp_hashinfo;

DECLARE_PER_CPU(unsigned int, tcp_orphan_count);
int tcp_orphan_count_sum(void);

static inline void tcp_orphan_count_inc(void)
{
	this_cpu_inc(tcp_orphan_count);
}

static inline void tcp_orphan_count_dec(void)
{
	this_cpu_dec(tcp_orphan_count);
}

DECLARE_PER_CPU(u32, tcp_tw_isn);

void tcp_time_wait(struct sock *sk, int state, int timeo);

#define MAX_TCP_HEADER	L1_CACHE_ALIGN(128 + MAX_HEADER)
#define MAX_TCP_OPTION_SPACE 40
#define TCP_MIN_SND_MSS		48
#define TCP_MIN_GSO_SIZE	(TCP_MIN_SND_MSS - MAX_TCP_OPTION_SPACE)

/*
 * Never offer a window over 32767 without using window scaling. Some
 * poor stacks do signed 16bit maths!
 */
#define MAX_TCP_WINDOW		32767U

/* Minimal accepted MSS. It is (60+60+8) - (20+20). */
#define TCP_MIN_MSS		88U

/* The initial MTU to use for probing */
#define TCP_BASE_MSS		1024

/* probing interval, default to 10 minutes as per RFC4821 */
#define TCP_PROBE_INTERVAL	600

/* Specify interval when tcp mtu probing will stop */
#define TCP_PROBE_THRESHOLD	8

/* After receiving this amount of duplicate ACKs fast retransmit starts. */
#define TCP_FASTRETRANS_THRESH 3

/* Maximal number of ACKs sent quickly to accelerate slow-start. */
#define TCP_MAX_QUICKACKS	16U

/* Maximal number of window scale according to RFC1323 */
#define TCP_MAX_WSCALE		14U

/* Default sending frequency of accurate ECN option per RTT */
#define TCP_ACCECN_OPTION_BEACON	3

/* urg_data states */
#define TCP_URG_VALID	0x0100
#define TCP_URG_NOTYET	0x0200
#define TCP_URG_READ	0x0400

#define TCP_RETR1	3	/*
				 * This is how many retries it does before it
				 * tries to figure out if the gateway is
				 * down. Minimal RFC value is 3; it corresponds
				 * to ~3sec-8min depending on RTO.
				 */

#define TCP_RETR2	15	/*
				 * This should take at least
				 * 90 minutes to time out.
				 * RFC1122 says that the limit is 100 sec.
				 * 15 is ~13-30min depending on RTO.
				 */

#define TCP_SYN_RETRIES	 6	/* This is how many retries are done
				 * when active opening a connection.
				 * RFC1122 says the minimum retry MUST
				 * be at least 180secs.
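				 * (Worked example, for clarity: with the
				 * 1 second initial RTO and exponential
				 * backoff, the retransmissions are spaced
				 * 1, 2, 4, 8, 16 and 32 seconds apart, so
				 * the sixth and last one is sent
				 * 1 + 2 + 4 + 8 + 16 + 32 = 63 seconds after
				 * the initial SYN; that is the 63 second
				 * figure mentioned next.)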
Nevertheless * this value is corresponding to * 63secs of retransmission with the * current initial RTO. */ #define TCP_SYNACK_RETRIES 5 /* This is how may retries are done * when passive opening a connection. * This is corresponding to 31secs of * retransmission with the current * initial RTO. */ #define TCP_TIMEWAIT_LEN (60*HZ) /* how long to wait to destroy TIME-WAIT * state, about 60 seconds */ #define TCP_FIN_TIMEOUT TCP_TIMEWAIT_LEN /* BSD style FIN_WAIT2 deadlock breaker. * It used to be 3min, new value is 60sec, * to combine FIN-WAIT-2 timeout with * TIME-WAIT timer. */ #define TCP_FIN_TIMEOUT_MAX (120 * HZ) /* max TCP_LINGER2 value (two minutes) */ #define TCP_DELACK_MAX ((unsigned)(HZ/5)) /* maximal time to delay before sending an ACK */ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #if HZ >= 100 #define TCP_DELACK_MIN ((unsigned)(HZ/25)) /* minimal time to delay before sending an ACK */ #define TCP_ATO_MIN ((unsigned)(HZ/25)) #else #define TCP_DELACK_MIN 4U #define TCP_ATO_MIN 4U #endif #define TCP_RTO_MAX_SEC 120 #define TCP_RTO_MAX ((unsigned)(TCP_RTO_MAX_SEC * HZ)) #define TCP_RTO_MIN ((unsigned)(HZ / 5)) #define TCP_TIMEOUT_MIN (2U) /* Min timeout for TCP timers in jiffies */ #define TCP_TIMEOUT_MIN_US (2*USEC_PER_MSEC) /* Min TCP timeout in microsecs */ #define TCP_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */ #define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value, now * used as a fallback RTO for the * initial data transmission if no * valid RTT sample has been acquired, * most likely due to retrans in 3WHS. */ #define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes * for local resources. */ #define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */ #define TCP_KEEPALIVE_PROBES 9 /* Max of 9 keepalive probes */ #define TCP_KEEPALIVE_INTVL (75*HZ) #define MAX_TCP_KEEPIDLE 32767 #define MAX_TCP_KEEPINTVL 32767 #define MAX_TCP_KEEPCNT 127 #define MAX_TCP_SYNCNT 127 /* Ensure that TCP PAWS checks are relaxed after ~2147 seconds * to avoid overflows. This assumes a clock smaller than 1 Mhz. * Default clock is 1 Khz, tcp_usec_ts uses 1 Mhz. */ #define TCP_PAWS_WRAP (INT_MAX / USEC_PER_SEC) #define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated * after this time. It should be equal * (or greater than) TCP_TIMEWAIT_LEN * to provide reliability equal to one * provided by timewait state. */ #define TCP_PAWS_WINDOW 1 /* Replay window for per-host * timestamps. It must be less than * minimal timewait lifetime. */ /* * TCP option */ #define TCPOPT_NOP 1 /* Padding */ #define TCPOPT_EOL 0 /* End of options */ #define TCPOPT_MSS 2 /* Segment size negotiating */ #define TCPOPT_WINDOW 3 /* Window scaling */ #define TCPOPT_SACK_PERM 4 /* SACK Permitted */ #define TCPOPT_SACK 5 /* SACK Block */ #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */ #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */ #define TCPOPT_AO 29 /* Authentication Option (RFC5925) */ #define TCPOPT_MPTCP 30 /* Multipath TCP (RFC6824) */ #define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */ #define TCPOPT_ACCECN0 172 /* 0xAC: Accurate ECN Order 0 */ #define TCPOPT_ACCECN1 174 /* 0xAE: Accurate ECN Order 1 */ #define TCPOPT_EXP 254 /* Experimental */ /* Magic number to be after the option value for sharing TCP * experimental options. 
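 *
 * As an illustration (the cookie length here is an example only), a SYN
 * carrying the experimental form of the Fast Open option with an 8 byte
 * cookie is laid out on the wire as:
 *
 *	kind = 254 (TCPOPT_EXP), length = 12,
 *	magic = 0xF989 (TCPOPT_FASTOPEN_MAGIC), then the 8 cookie bytes
 *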
See draft-ietf-tcpm-experimental-options-00.txt */ #define TCPOPT_FASTOPEN_MAGIC 0xF989 #define TCPOPT_SMC_MAGIC 0xE2D4C3D9 /* * TCP option lengths */ #define TCPOLEN_MSS 4 #define TCPOLEN_WINDOW 3 #define TCPOLEN_SACK_PERM 2 #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_MD5SIG 18 #define TCPOLEN_FASTOPEN_BASE 2 #define TCPOLEN_ACCECN_BASE 2 #define TCPOLEN_EXP_FASTOPEN_BASE 4 #define TCPOLEN_EXP_SMC_BASE 6 /* But this is what stacks really send out. */ #define TCPOLEN_TSTAMP_ALIGNED 12 #define TCPOLEN_WSCALE_ALIGNED 4 #define TCPOLEN_SACKPERM_ALIGNED 4 #define TCPOLEN_SACK_BASE 2 #define TCPOLEN_SACK_BASE_ALIGNED 4 #define TCPOLEN_SACK_PERBLOCK 8 #define TCPOLEN_MD5SIG_ALIGNED 20 #define TCPOLEN_MSS_ALIGNED 4 #define TCPOLEN_EXP_SMC_BASE_ALIGNED 8 #define TCPOLEN_ACCECN_PERFIELD 3 /* Maximum number of byte counters in AccECN option + size */ #define TCP_ACCECN_NUMFIELDS 3 #define TCP_ACCECN_MAXSIZE (TCPOLEN_ACCECN_BASE + \ TCPOLEN_ACCECN_PERFIELD * \ TCP_ACCECN_NUMFIELDS) #define TCP_ACCECN_SAFETY_SHIFT 1 /* SAFETY_FACTOR in accecn draft */ /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ #define TCP_NAGLE_CORK 2 /* Socket is corked */ #define TCP_NAGLE_PUSH 4 /* Cork is overridden for already queued data */ /* TCP thin-stream limits */ #define TCP_THIN_LINEAR_RETRIES 6 /* After 6 linear retries, do exp. backoff */ /* TCP initial congestion window as per rfc6928 */ #define TCP_INIT_CWND 10 /* Bit Flags for sysctl_tcp_fastopen */ #define TFO_CLIENT_ENABLE 1 #define TFO_SERVER_ENABLE 2 #define TFO_CLIENT_NO_COOKIE 4 /* Data in SYN w/o cookie option */ /* Accept SYN data w/o any cookie option */ #define TFO_SERVER_COOKIE_NOT_REQD 0x200 /* Force enable TFO on all listeners, i.e., not requiring the * TCP_FASTOPEN socket option. */ #define TFO_SERVER_WO_SOCKOPT1 0x400 /* sysctl variables for tcp */ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ #define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */ #define TCP_RACK_NO_DUPTHRESH 0x4 /* Do not use DUPACK threshold in RACK */ DECLARE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc); extern struct percpu_counter tcp_sockets_allocated; extern unsigned long tcp_memory_pressure; /* optimized version of sk_under_memory_pressure() for TCP sockets */ static inline bool tcp_under_memory_pressure(const struct sock *sk) { if (mem_cgroup_sk_enabled(sk) && mem_cgroup_sk_under_memory_pressure(sk)) return true; if (sk->sk_bypass_prot_mem) return false; return READ_ONCE(tcp_memory_pressure); } /* * The next routines deal with comparing 32 bit unsigned ints * and worry about wraparound (automatic with unsigned arithmetic). */ static inline bool before(__u32 seq1, __u32 seq2) { return (__s32)(seq1-seq2) < 0; } #define after(seq2, seq1) before(seq1, seq2) /* is s2<=s1<=s3 ? 
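 *
 * Illustrative values, since wraparound handling is the whole point of
 * these helpers:
 *
 *	before(0xfffffff0U, 0x10U) == true	(0xfffffff0 precedes 0x10 across the wrap)
 *	after(0x10U, 0xfffffff0U) == true
 *	between(5U, 0xfffffffeU, 10U) == true	(5 lies inside the wrapped range)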
*/ static inline bool between(__u32 seq1, __u32 seq2, __u32 seq3) { return seq3 - seq2 >= seq1 - seq2; } static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb) { sk_wmem_queued_add(sk, -skb->truesize); if (!skb_zcopy_pure(skb)) sk_mem_uncharge(sk, skb->truesize); else sk_mem_uncharge(sk, SKB_TRUESIZE(skb_end_offset(skb))); __kfree_skb(skb); } void sk_forced_mem_schedule(struct sock *sk, int size); bool tcp_check_oom(const struct sock *sk, int shift); extern struct proto tcp_prot; #define TCP_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.tcp_statistics, field) #define __TCP_INC_STATS(net, field) __SNMP_INC_STATS((net)->mib.tcp_statistics, field) #define TCP_DEC_STATS(net, field) SNMP_DEC_STATS((net)->mib.tcp_statistics, field) #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val) /* * TCP splice context */ struct tcp_splice_state { struct pipe_inode_info *pipe; size_t len; unsigned int flags; }; void tcp_tsq_work_init(void); int tcp_v4_err(struct sk_buff *skb, u32); void tcp_shutdown(struct sock *sk, int how); int tcp_v4_rcv(struct sk_buff *skb); void tcp_remove_empty_skb(struct sock *sk); int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, size_t size, struct ubuf_info *uarg); void tcp_splice_eof(struct socket *sock); int tcp_send_mss(struct sock *sk, int *size_goal, int flags); int tcp_wmem_schedule(struct sock *sk, int copy); void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle, int size_goal); void tcp_release_cb(struct sock *sk); static inline bool tcp_release_cb_cond(struct sock *sk) { #ifdef CONFIG_INET if (likely(sk->sk_prot->release_cb == tcp_release_cb)) { if (unlikely(smp_load_acquire(&sk->sk_tsq_flags) & TCP_DEFERRED_ALL)) tcp_release_cb(sk); return true; } #endif return false; } void tcp_wfree(struct sk_buff *skb); void tcp_write_timer_handler(struct sock *sk); void tcp_delack_timer_handler(struct sock *sk); int tcp_ioctl(struct sock *sk, int cmd, int *karg); enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); void tcp_rcvbuf_grow(struct sock *sk, u32 newval); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); void tcp_twsk_purge(struct list_head *net_exit_list); int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len); ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, bool force_schedule); static inline void tcp_dec_quickack_mode(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ack.quick) { /* How many ACKs S/ACKing new data have we sent? */ const unsigned int pkts = inet_csk_ack_scheduled(sk) ? 1 : 0; if (pkts >= icsk->icsk_ack.quick) { icsk->icsk_ack.quick = 0; /* Leaving quickack mode we deflate ATO. 
*/ icsk->icsk_ack.ato = TCP_ATO_MIN; } else icsk->icsk_ack.quick -= pkts; } } #define TCP_ECN_MODE_RFC3168 BIT(0) #define TCP_ECN_QUEUE_CWR BIT(1) #define TCP_ECN_DEMAND_CWR BIT(2) #define TCP_ECN_SEEN BIT(3) #define TCP_ECN_MODE_ACCECN BIT(4) #define TCP_ECN_DISABLED 0 #define TCP_ECN_MODE_PENDING (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) #define TCP_ECN_MODE_ANY (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) static inline bool tcp_ecn_mode_any(const struct tcp_sock *tp) { return tp->ecn_flags & TCP_ECN_MODE_ANY; } static inline bool tcp_ecn_mode_rfc3168(const struct tcp_sock *tp) { return (tp->ecn_flags & TCP_ECN_MODE_ANY) == TCP_ECN_MODE_RFC3168; } static inline bool tcp_ecn_mode_accecn(const struct tcp_sock *tp) { return (tp->ecn_flags & TCP_ECN_MODE_ANY) == TCP_ECN_MODE_ACCECN; } static inline bool tcp_ecn_disabled(const struct tcp_sock *tp) { return !tcp_ecn_mode_any(tp); } static inline bool tcp_ecn_mode_pending(const struct tcp_sock *tp) { return (tp->ecn_flags & TCP_ECN_MODE_PENDING) == TCP_ECN_MODE_PENDING; } static inline void tcp_ecn_mode_set(struct tcp_sock *tp, u8 mode) { tp->ecn_flags &= ~TCP_ECN_MODE_ANY; tp->ecn_flags |= mode; } enum tcp_tw_status { TCP_TW_SUCCESS = 0, TCP_TW_RST = 1, TCP_TW_ACK = 2, TCP_TW_SYN = 3, TCP_TW_ACK_OOW = 4 }; enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th, u32 *tw_isn, enum skb_drop_reason *drop_reason); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, bool *lost_race, enum skb_drop_reason *drop_reason); enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, struct sk_buff *skb); void tcp_enter_loss(struct sock *sk); void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag); void tcp_clear_retrans(struct tcp_sock *tp); void tcp_update_pacing_rate(struct sock *sk); void tcp_set_rto(struct sock *sk); void tcp_update_metrics(struct sock *sk); void tcp_init_metrics(struct sock *sk); void tcp_metrics_init(void); bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); void __tcp_close(struct sock *sk, long timeout); void tcp_close(struct sock *sk, long timeout); void tcp_init_sock(struct sock *sk); void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb); __poll_t tcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int do_tcp_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); bool tcp_bpf_bypass_getsockopt(int level, int optname); int do_tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); void tcp_reset_keepalive_timer(struct sock *sk, unsigned long timeout); void tcp_set_keepalive(struct sock *sk, int val); void tcp_syn_ack_timeout(const struct request_sock *req); int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags); int tcp_set_rcvlowat(struct sock *sk, int val); void tcp_set_rcvbuf(struct sock *sk, int val); int tcp_set_window_clamp(struct sock *sk, int val); static inline void tcp_update_recv_tstamps(struct sk_buff *skb, struct scm_timestamping_internal *tss) { tss->ts[0] = skb->tstamp; tss->ts[2] = skb_hwtstamps(skb)->hwtstamp; } void tcp_recv_timestamp(struct msghdr *msg, const struct sock 
*sk, struct scm_timestamping_internal *tss); void tcp_data_ready(struct sock *sk); #ifdef CONFIG_MMU int tcp_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma); #endif void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); /* * BPF SKB-less helpers */ u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, struct tcphdr *th, u32 *cookie); u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph, struct tcphdr *th, u32 *cookie); u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss); u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct tcphdr *th); /* * TCP v4 functions exported for the inet6 API */ void tcp_v4_mtu_reduced(struct sock *sk); void tcp_req_err(struct sock *sk, u32 seq, bool abort); void tcp_ld_RTO_revert(struct sock *sk, u32 seq); int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb); struct sock *tcp_create_openreq_child(const struct sock *sk, struct request_sock *req, struct sk_buff *skb); void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst); struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req, void (*opt_child_init)(struct sock *newsk, const struct sock *sk)); int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len); int tcp_connect(struct sock *sk); enum tcp_synack_type { TCP_SYNACK_NORMAL, TCP_SYNACK_FASTOPEN, TCP_SYNACK_COOKIE, TCP_SYNACK_RETRANS, }; struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb); int tcp_disconnect(struct sock *sk, int flags); void tcp_finish_connect(struct sock *sk, struct sk_buff *skb); int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size); void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb); /* From syncookies.c */ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst); int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th); struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk, struct sk_buff *skb, struct tcp_options_received *tcp_opt, int mss, u32 tsoff); #if IS_ENABLED(CONFIG_BPF) struct bpf_tcp_req_attrs { u32 rcv_tsval; u32 rcv_tsecr; u16 mss; u8 rcv_wscale; u8 snd_wscale; u8 ecn_ok; u8 wscale_ok; u8 sack_ok; u8 tstamp_ok; u8 usec_ts_ok; u8 reserved[3]; }; #endif #ifdef CONFIG_SYN_COOKIES /* Syncookies use a monotonic timer which increments every 60 seconds. * This counter is used both as a hash input and partially encoded into * the cookie value. A cookie is only validated further if the delta * between the current counter value and the encoded one is less than this, * i.e. a sent cookie is valid only at most for 2*60 seconds (or less if * the counter advances immediately after a cookie is generated). 
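 *
 * Worked example with the values defined below: the counter ticks once per
 * TCP_SYNCOOKIE_PERIOD (60 seconds) and a cookie is accepted while the
 * counter has advanced by fewer than MAX_SYNCOOKIE_AGE = 2 ticks, so a
 * cookie generated one jiffy before a tick stays valid for just over
 * 60 seconds, while one generated right after a tick stays valid for just
 * under 120 seconds.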
*/ #define MAX_SYNCOOKIE_AGE 2 #define TCP_SYNCOOKIE_PERIOD (60 * HZ) #define TCP_SYNCOOKIE_VALID (MAX_SYNCOOKIE_AGE * TCP_SYNCOOKIE_PERIOD) /* syncookies: remember time of last synqueue overflow * But do not dirty this field too often (once per second is enough) * It is racy as we do not hold a lock, but race is very minor. */ static inline void tcp_synq_overflow(const struct sock *sk) { unsigned int last_overflow; unsigned int now = jiffies; if (sk->sk_reuseport) { struct sock_reuseport *reuse; reuse = rcu_dereference(sk->sk_reuseport_cb); if (likely(reuse)) { last_overflow = READ_ONCE(reuse->synq_overflow_ts); if (!time_between32(now, last_overflow, last_overflow + HZ)) WRITE_ONCE(reuse->synq_overflow_ts, now); return; } } last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp); if (!time_between32(now, last_overflow, last_overflow + HZ)) WRITE_ONCE(tcp_sk_rw(sk)->rx_opt.ts_recent_stamp, now); } /* syncookies: no recent synqueue overflow on this listening socket? */ static inline bool tcp_synq_no_recent_overflow(const struct sock *sk) { unsigned int last_overflow; unsigned int now = jiffies; if (sk->sk_reuseport) { struct sock_reuseport *reuse; reuse = rcu_dereference(sk->sk_reuseport_cb); if (likely(reuse)) { last_overflow = READ_ONCE(reuse->synq_overflow_ts); return !time_between32(now, last_overflow - HZ, last_overflow + TCP_SYNCOOKIE_VALID); } } last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp); /* If last_overflow <= jiffies <= last_overflow + TCP_SYNCOOKIE_VALID, * then we're under synflood. However, we have to use * 'last_overflow - HZ' as lower bound. That's because a concurrent * tcp_synq_overflow() could update .ts_recent_stamp after we read * jiffies but before we store .ts_recent_stamp into last_overflow, * which could lead to rejecting a valid syncookie. 
*/ return !time_between32(now, last_overflow - HZ, last_overflow + TCP_SYNCOOKIE_VALID); } static inline u32 tcp_cookie_time(void) { u64 val = get_jiffies_64(); do_div(val, TCP_SYNCOOKIE_PERIOD); return val; } /* Convert one nsec 64bit timestamp to ts (ms or usec resolution) */ static inline u64 tcp_ns_to_ts(bool usec_ts, u64 val) { if (usec_ts) return div_u64(val, NSEC_PER_USEC); return div_u64(val, NSEC_PER_MSEC); } u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, u16 *mssp); __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss); u64 cookie_init_timestamp(struct request_sock *req, u64 now); bool cookie_timestamp_decode(const struct net *net, struct tcp_options_received *opt); static inline bool cookie_ecn_ok(const struct net *net, const struct dst_entry *dst) { return READ_ONCE(net->ipv4.sysctl_tcp_ecn) || dst_feature(dst, RTAX_FEATURE_ECN); } #if IS_ENABLED(CONFIG_BPF) static inline bool cookie_bpf_ok(struct sk_buff *skb) { return skb->sk; } struct request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb); #else static inline bool cookie_bpf_ok(struct sk_buff *skb) { return false; } static inline struct request_sock *cookie_bpf_check(struct net *net, struct sock *sk, struct sk_buff *skb) { return NULL; } #endif /* From net/ipv6/syncookies.c */ int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th); struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb); u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, const struct tcphdr *th, u16 *mssp); __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss); #endif /* tcp_output.c */ void tcp_skb_entail(struct sock *sk, struct sk_buff *skb); void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb); void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle); int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); void tcp_retransmit_timer(struct sock *sk); void tcp_xmit_retransmit_queue(struct sock *); void tcp_simple_retransmit(struct sock *); void tcp_enter_recovery(struct sock *sk, bool ece_ack); int tcp_trim_head(struct sock *, struct sk_buff *, u32); enum tcp_queue { TCP_FRAG_IN_WRITE_QUEUE, TCP_FRAG_IN_RTX_QUEUE, }; int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, struct sk_buff *skb, u32 len, unsigned int mss_now, gfp_t gfp); void tcp_send_probe0(struct sock *); int tcp_write_wakeup(struct sock *, int mib); void tcp_send_fin(struct sock *sk); void tcp_send_active_reset(struct sock *sk, gfp_t priority, enum sk_rst_reason reason); int tcp_send_synack(struct sock *); void tcp_push_one(struct sock *, unsigned int mss_now); void __tcp_send_ack(struct sock *sk, u32 rcv_nxt, u16 flags); void tcp_send_ack(struct sock *sk); void tcp_send_delayed_ack(struct sock *sk); void tcp_send_loss_probe(struct sock *sk); bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto); void tcp_skb_collapse_tstamp(struct sk_buff *skb, const struct sk_buff *next_skb); /* tcp_input.c */ void tcp_rearm_rto(struct sock *sk); void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); void tcp_done_with_error(struct sock *sk, int err); void tcp_reset(struct sock *sk, struct sk_buff *skb); void tcp_fin(struct sock *sk); void __tcp_check_space(struct sock *sk); static inline void tcp_check_space(struct sock *sk) { /* pairs with tcp_poll() */ smp_mb(); if (sk->sk_socket && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) 
__tcp_check_space(sk); } void tcp_sack_compress_send_ack(struct sock *sk); static inline void tcp_cleanup_skb(struct sk_buff *skb) { skb_dst_drop(skb); secpath_reset(skb); } static inline void tcp_add_receive_queue(struct sock *sk, struct sk_buff *skb) { DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); DEBUG_NET_WARN_ON_ONCE(secpath_exists(skb)); __skb_queue_tail(&sk->sk_receive_queue, skb); } /* tcp_timer.c */ void tcp_init_xmit_timers(struct sock *); static inline void tcp_clear_xmit_timers(struct sock *sk) { if (hrtimer_try_to_cancel(&tcp_sk(sk)->pacing_timer) == 1) __sock_put(sk); if (hrtimer_try_to_cancel(&tcp_sk(sk)->compressed_ack_timer) == 1) __sock_put(sk); inet_csk_clear_xmit_timers(sk); } unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu); unsigned int tcp_current_mss(struct sock *sk); u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when); /* Bound MSS / TSO packet size with the half of the window */ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) { int cutoff; /* When peer uses tiny windows, there is no use in packetizing * to sub-MSS pieces for the sake of SWS or making sure there * are enough packets in the pipe for fast recovery. * * On the other hand, for extremely large MSS devices, handling * smaller than MSS windows in this way does make sense. */ if (tp->max_window > TCP_MSS_DEFAULT) cutoff = (tp->max_window >> 1); else cutoff = tp->max_window; if (cutoff && pktsize > cutoff) return max_t(int, cutoff, 68U - tp->tcp_header_len); else return pktsize; } /* tcp.c */ void tcp_get_info(struct sock *, struct tcp_info *); void tcp_rate_check_app_limited(struct sock *sk); /* Read 'sendfile()'-style from a TCP socket */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor, bool noack, u32 *copied_seq); int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor); struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off); void tcp_read_done(struct sock *sk, size_t len); void tcp_initialize_rcv_mss(struct sock *sk); int tcp_mtu_to_mss(struct sock *sk, int pmtu); int tcp_mss_to_mtu(struct sock *sk, int mss); void tcp_mtup_init(struct sock *sk); static inline unsigned int tcp_rto_max(const struct sock *sk) { return READ_ONCE(inet_csk(sk)->icsk_rto_max); } static inline void tcp_bound_rto(struct sock *sk) { inet_csk(sk)->icsk_rto = min(inet_csk(sk)->icsk_rto, tcp_rto_max(sk)); } static inline u32 __tcp_set_rto(const struct tcp_sock *tp) { return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us); } static inline unsigned long tcp_reqsk_timeout(struct request_sock *req) { u64 timeout = (u64)req->timeout << req->num_timeout; return (unsigned long)min_t(u64, timeout, tcp_rto_max(req->rsk_listener)); } u32 tcp_delack_max(const struct sock *sk); /* Compute the actual rto_min value */ static inline u32 tcp_rto_min(const struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); u32 rto_min = READ_ONCE(inet_csk(sk)->icsk_rto_min); if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN); return rto_min; } static inline u32 tcp_rto_min_us(const struct sock *sk) { return jiffies_to_usecs(tcp_rto_min(sk)); } static inline bool tcp_ca_dst_locked(const struct dst_entry *dst) { return dst_metric_locked(dst, RTAX_CC_ALGO); } /* Minimum RTT in usec. ~0 means not available. 
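 *
 * Example (illustrative only; consider_flow_local() and the 10 ms threshold
 * are made up for this sketch): callers must treat ~0 as "no sample yet":
 *
 *	u32 min_rtt = tcp_min_rtt(tp);
 *
 *	if (min_rtt != ~0U && min_rtt < 10 * USEC_PER_MSEC)
 *		consider_flow_local(tp);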
*/ static inline u32 tcp_min_rtt(const struct tcp_sock *tp) { return minmax_get(&tp->rtt_min); } /* Compute the actual receive window we are currently advertising. * Rcv_nxt can be after the window if our peer push more data * than the offered window. */ static inline u32 tcp_receive_window(const struct tcp_sock *tp) { s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt; if (win < 0) win = 0; return (u32) win; } /* Compute the maximum receive window we ever advertised. * Rcv_nxt can be after the window if our peer push more data * than the offered window. */ static inline u32 tcp_max_receive_window(const struct tcp_sock *tp) { s32 win = tp->rcv_mwnd_seq - tp->rcv_nxt; if (win < 0) win = 0; return (u32) win; } /* Check if we need to update the maximum receive window sequence number */ static inline void tcp_update_max_rcv_wnd_seq(struct tcp_sock *tp) { u32 wre = tp->rcv_wup + tp->rcv_wnd; if (after(wre, tp->rcv_mwnd_seq)) tp->rcv_mwnd_seq = wre; } /* Choose a new window, without checks for shrinking, and without * scaling applied to the result. The caller does these things * if necessary. This is a "raw" window selection. */ u32 __tcp_select_window(struct sock *sk); void tcp_send_window_probe(struct sock *sk); /* TCP uses 32bit jiffies to save some space. * Note that this is different from tcp_time_stamp, which * historically has been the same until linux-4.13. */ #define tcp_jiffies32 ((u32)jiffies) /* * Deliver a 32bit value for TCP timestamp option (RFC 7323) * It is no longer tied to jiffies, but to 1 ms clock. * Note: double check if you want to use tcp_jiffies32 instead of this. */ #define TCP_TS_HZ 1000 static inline u64 tcp_clock_ns(void) { return ktime_get_ns(); } static inline u64 tcp_clock_us(void) { return div_u64(tcp_clock_ns(), NSEC_PER_USEC); } static inline u64 tcp_clock_ms(void) { return div_u64(tcp_clock_ns(), NSEC_PER_MSEC); } /* TCP Timestamp included in TS option (RFC 1323) can either use ms * or usec resolution. Each socket carries a flag to select one or other * resolution, as the route attribute could change anytime. * Each flow must stick to initial resolution. */ static inline u32 tcp_clock_ts(bool usec_ts) { return usec_ts ? 
tcp_clock_us() : tcp_clock_ms(); } static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp) { return div_u64(tp->tcp_mstamp, USEC_PER_MSEC); } static inline u32 tcp_time_stamp_ts(const struct tcp_sock *tp) { if (tp->tcp_usec_ts) return tp->tcp_mstamp; return tcp_time_stamp_ms(tp); } void tcp_mstamp_refresh(struct tcp_sock *tp); static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) { return max_t(s64, t1 - t0, 0); } /* provide the departure time in us unit */ static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) { return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC); } /* Provide skb TSval in usec or ms unit */ static inline u32 tcp_skb_timestamp_ts(bool usec_ts, const struct sk_buff *skb) { if (usec_ts) return tcp_skb_timestamp_us(skb); return div_u64(skb->skb_mstamp_ns, NSEC_PER_MSEC); } static inline u32 tcp_tw_tsval(const struct tcp_timewait_sock *tcptw) { return tcp_clock_ts(tcptw->tw_sk.tw_usec_ts) + tcptw->tw_ts_offset; } static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) { return tcp_clock_ts(treq->req_usec_ts) + treq->ts_off; } #define tcp_flag_byte(th) (((u_int8_t *)th)[13]) #define TCPHDR_FIN BIT(0) #define TCPHDR_SYN BIT(1) #define TCPHDR_RST BIT(2) #define TCPHDR_PSH BIT(3) #define TCPHDR_ACK BIT(4) #define TCPHDR_URG BIT(5) #define TCPHDR_ECE BIT(6) #define TCPHDR_CWR BIT(7) #define TCPHDR_AE BIT(8) #define TCPHDR_FLAGS_MASK (TCPHDR_FIN | TCPHDR_SYN | TCPHDR_RST | \ TCPHDR_PSH | TCPHDR_ACK | TCPHDR_URG | \ TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE) #define tcp_flags_ntohs(th) (ntohs(*(__be16 *)&tcp_flag_word(th)) & \ TCPHDR_FLAGS_MASK) #define TCPHDR_ACE (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE) #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) #define TCPHDR_SYNACK_ACCECN (TCPHDR_SYN | TCPHDR_ACK | TCPHDR_CWR) #define TCP_ACCECN_CEP_ACE_MASK 0x7 #define TCP_ACCECN_ACE_MAX_DELTA 6 /* To avoid/detect middlebox interference, not all counters start at 0. * See draft-ietf-tcpm-accurate-ecn for the latest values. */ #define TCP_ACCECN_CEP_INIT_OFFSET 5 #define TCP_ACCECN_E1B_INIT_OFFSET 1 #define TCP_ACCECN_E0B_INIT_OFFSET 1 #define TCP_ACCECN_CEB_INIT_OFFSET 0 /* State flags for sacked in struct tcp_skb_cb */ enum tcp_skb_cb_sacked_flags { TCPCB_SACKED_ACKED = (1 << 0), /* SKB ACK'd by a SACK block */ TCPCB_SACKED_RETRANS = (1 << 1), /* SKB retransmitted */ TCPCB_LOST = (1 << 2), /* SKB is lost */ TCPCB_TAGBITS = (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS | TCPCB_LOST), /* All tag bits */ TCPCB_REPAIRED = (1 << 4), /* SKB repaired (no skb_mstamp_ns) */ TCPCB_EVER_RETRANS = (1 << 7), /* Ever retransmitted frame */ TCPCB_RETRANS = (TCPCB_SACKED_RETRANS | TCPCB_EVER_RETRANS | TCPCB_REPAIRED), }; /* This is what the send packet queuing engine uses to pass * TCP per-packet control information to the transmission code. * We also store the host-order sequence numbers in here too. * This is 44 bytes if IPV6 is enabled. * If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately. */ struct tcp_skb_cb { __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ union { /* Note : * tcp_gso_segs/size are used in write queue only, * cf tcp_skb_pcount()/tcp_skb_mss() */ struct { u16 tcp_gso_segs; u16 tcp_gso_size; }; }; __u16 tcp_flags; /* TCP header flags (tcp[12-13])*/ __u8 sacked; /* State flags for SACK. */ __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ #define TSTAMP_ACK_SK 0x1 #define TSTAMP_ACK_BPF 0x2 __u8 txstamp_ack:2, /* Record TX timestamp for ack? */ eor:1, /* Is skb MSG_EOR marked? 
*/ has_rxtstamp:1, /* SKB has a RX timestamp */ unused:4; __u32 ack_seq; /* Sequence number ACK'd */ union { struct { #define TCPCB_DELIVERED_CE_MASK ((1U<<20) - 1) /* There is space for up to 24 bytes */ __u32 is_app_limited:1, /* cwnd not fully used? */ delivered_ce:20, unused:11; /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ u64 first_tx_mstamp; /* when we reached the "delivered" count */ u64 delivered_mstamp; } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; #if IS_ENABLED(CONFIG_IPV6) struct inet6_skb_parm h6; #endif } header; /* For incoming skbs */ }; }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) extern const struct inet_connection_sock_af_ops ipv4_specific; #if IS_ENABLED(CONFIG_IPV6) /* This is the variant of inet6_iif() that must be used by TCP, * as TCP moves IP6CB into a different location in skb->cb[] */ static inline int tcp_v6_iif(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->header.h6.iif; } static inline int tcp_v6_iif_l3_slave(const struct sk_buff *skb) { bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags); return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif; } /* TCP_SKB_CB reference means this can not be used from early demux */ static inline int tcp_v6_sdif(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (skb && ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags)) return TCP_SKB_CB(skb)->header.h6.iif; #endif return 0; } extern const struct inet_connection_sock_af_ops ipv6_specific; INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *skb)); #endif /* TCP_SKB_CB reference means this can not be used from early demux */ static inline int tcp_v4_sdif(struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (skb && ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags)) return TCP_SKB_CB(skb)->header.h4.iif; #endif return 0; } /* Due to TSO, an SKB can be composed of multiple actual * packets. To keep these tracked properly, we use this. */ static inline int tcp_skb_pcount(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->tcp_gso_segs; } static inline void tcp_skb_pcount_set(struct sk_buff *skb, int segs) { TCP_SKB_CB(skb)->tcp_gso_segs = segs; } static inline void tcp_skb_pcount_add(struct sk_buff *skb, int segs) { TCP_SKB_CB(skb)->tcp_gso_segs += segs; } /* This is valid iff skb is in write queue and tcp_skb_pcount() > 1. 
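 *
 * Illustrative numbers: a TSO skb holding 30000 bytes of payload with
 * tcp_skb_mss(skb) == 1500 has tcp_skb_pcount(skb) == 20, i.e. it stands
 * in for 20 on-the-wire segments.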
*/ static inline int tcp_skb_mss(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->tcp_gso_size; } static inline bool tcp_skb_can_collapse_to(const struct sk_buff *skb) { return likely(!TCP_SKB_CB(skb)->eor); } static inline bool tcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { /* skb_cmp_decrypted() not needed, use tcp_write_collapse_fence() */ return likely(tcp_skb_can_collapse_to(to) && mptcp_skb_can_collapse(to, from) && skb_pure_zcopy_same(to, from) && skb_frags_readable(to) == skb_frags_readable(from)); } static inline bool tcp_skb_can_collapse_rx(const struct sk_buff *to, const struct sk_buff *from) { return likely(mptcp_skb_can_collapse(to, from) && !skb_cmp_decrypted(to, from)); } /* Events passed to congestion control interface */ enum tcp_ca_event { CA_EVENT_TX_START, /* first transmit when no packets in flight */ CA_EVENT_CWND_RESTART, /* congestion window restart */ CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ CA_EVENT_LOSS, /* loss timeout */ CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ }; /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ enum tcp_ca_ack_event_flags { CA_ACK_SLOWPATH = (1 << 0), /* In slow path processing */ CA_ACK_WIN_UPDATE = (1 << 1), /* ACK updated window */ CA_ACK_ECE = (1 << 2), /* ECE bit is set on ack */ }; /* * Interface for adding new TCP congestion control handlers */ #define TCP_CA_NAME_MAX 16 #define TCP_CA_MAX 128 #define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX) #define TCP_CA_UNSPEC 0 /* Algorithm can be set on socket without CAP_NET_ADMIN privileges */ #define TCP_CONG_NON_RESTRICTED BIT(0) /* Requires ECN/ECT set on all packets */ #define TCP_CONG_NEEDS_ECN BIT(1) /* Require successfully negotiated AccECN capability */ #define TCP_CONG_NEEDS_ACCECN BIT(2) /* Use ECT(1) instead of ECT(0) while the CA is uninitialized */ #define TCP_CONG_ECT_1_NEGOTIATION BIT(3) /* Cannot fallback to RFC3168 during AccECN negotiation */ #define TCP_CONG_NO_FALLBACK_RFC3168 BIT(4) #define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN | \ TCP_CONG_NEEDS_ACCECN | TCP_CONG_ECT_1_NEGOTIATION | \ TCP_CONG_NO_FALLBACK_RFC3168) union tcp_cc_info; struct ack_sample { u32 pkts_acked; s32 rtt_us; u32 in_flight; }; /* A rate sample measures the number of (original/retransmitted) data * packets delivered "delivered" over an interval of time "interval_us". * The tcp_rate.c code fills in the rate sample, and congestion * control modules that define a cong_control function to run at the end * of ACK processing can optionally chose to consult this sample when * setting cwnd and pacing rate. * A sample is invalid if "delivered" or "interval_us" is negative. 
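 *
 * As a rough sketch (not a requirement on any module), a cong_control hook
 * could turn a valid sample into a bandwidth estimate in bytes per second:
 *
 *	const struct tcp_sock *tp = tcp_sk(sk);
 *	u64 bw;
 *
 *	if (rs->delivered > 0 && rs->interval_us > 0)
 *		bw = (u64)rs->delivered * tp->mss_cache * USEC_PER_SEC / rs->interval_us;
 *
 * (a real implementation would use the div64 helpers for the division).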
*/ struct rate_sample { u64 prior_mstamp; /* starting timestamp for interval */ u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ s32 delivered; /* number of packets delivered over interval */ s32 delivered_ce; /* number of packets delivered w/ CE marks*/ long interval_us; /* time for tp->delivered to incr "delivered" */ u32 snd_interval_us; /* snd interval for delivered packets */ u32 rcv_interval_us; /* rcv interval for delivered packets */ long rtt_us; /* RTT of last (S)ACKed packet (or -1) */ int losses; /* number of packets marked lost upon ACK */ u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */ u32 prior_in_flight; /* in flight before this ACK */ u32 last_end_seq; /* end_seq of most recently ACKed packet */ bool is_app_limited; /* is sample from packet with bubble in pipe? */ bool is_retrans; /* is sample from retransmission? */ bool is_ack_delayed; /* is this (likely) a delayed ACK? */ }; struct tcp_congestion_ops { /* fast path fields are put first to fill one cache line */ /* A congestion control (CC) must provide one of either: * * (a) a cong_avoid function, if the CC wants to use the core TCP * stack's default functionality to implement a "classic" * (Reno/CUBIC-style) response to packet loss, RFC3168 ECN, * idle periods, pacing rate computations, etc. * * (b) a cong_control function, if the CC wants custom behavior and * complete control of all congestion control behaviors. */ /* (a) "classic" response: calculate new cwnd. */ void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked); /* (b) "custom" response: call when packets are delivered to update * cwnd and pacing rate, after all the ca_state processing. */ void (*cong_control)(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs); /* return slow start threshold (required) */ u32 (*ssthresh)(struct sock *sk); /* call before changing ca_state (optional) */ void (*set_state)(struct sock *sk, u8 new_state); /* call when cwnd event occurs (optional) */ void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); /* call when CA_EVENT_TX_START cwnd event occurs (optional) */ void (*cwnd_event_tx_start)(struct sock *sk); /* call when ack arrives (optional) */ void (*in_ack_event)(struct sock *sk, u32 flags); /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); /* override sysctl_tcp_min_tso_segs (optional) */ u32 (*min_tso_segs)(struct sock *sk); /* new value of cwnd after loss (required) */ u32 (*undo_cwnd)(struct sock *sk); /* returns the multiplier used in tcp_sndbuf_expand (optional) */ u32 (*sndbuf_expand)(struct sock *sk); /* control/slow paths put last */ /* get info for inet_diag (optional) */ size_t (*get_info)(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info); char name[TCP_CA_NAME_MAX]; struct module *owner; struct list_head list; u32 key; u32 flags; /* initialize private data (optional) */ void (*init)(struct sock *sk); /* cleanup private data (optional) */ void (*release)(struct sock *sk); } ____cacheline_aligned_in_smp; int tcp_register_congestion_control(struct tcp_congestion_ops *type); void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); int tcp_update_congestion_control(struct tcp_congestion_ops *type, struct tcp_congestion_ops *old_type); int tcp_validate_congestion_control(struct tcp_congestion_ops *ca); void tcp_assign_congestion_control(struct sock *sk); void tcp_init_congestion_control(struct sock *sk); void 
tcp_cleanup_congestion_control(struct sock *sk); int tcp_set_default_congestion_control(struct net *net, const char *name); void tcp_get_default_congestion_control(struct net *net, char *name); void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool cap_net_admin); u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); u32 tcp_reno_ssthresh(struct sock *sk); u32 tcp_reno_undo_cwnd(struct sock *sk); void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; struct tcp_congestion_ops *tcp_ca_find(const char *name); struct tcp_congestion_ops *tcp_ca_find_key(u32 key); u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca); #ifdef CONFIG_INET char *tcp_ca_get_name_by_key(u32 key, char *buffer); #else static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) { return NULL; } #endif static inline bool tcp_ca_needs_ecn(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN; } static inline bool tcp_ca_needs_accecn(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ACCECN; } static inline bool tcp_ca_ect_1_negotiation(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); return icsk->icsk_ca_ops->flags & TCP_CONG_ECT_1_NEGOTIATION; } static inline bool tcp_ca_no_fallback_rfc3168(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); return icsk->icsk_ca_ops->flags & TCP_CONG_NO_FALLBACK_RFC3168; } static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) { const struct inet_connection_sock *icsk = inet_csk(sk); if (event == CA_EVENT_TX_START) { if (icsk->icsk_ca_ops->cwnd_event_tx_start) icsk->icsk_ca_ops->cwnd_event_tx_start(sk); return; } if (icsk->icsk_ca_ops->cwnd_event) icsk->icsk_ca_ops->cwnd_event(sk, event); } /* From tcp_cong.c */ void tcp_set_ca_state(struct sock *sk, const u8 ca_state); static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) { return t1 > t2 || (t1 == t2 && after(seq1, seq2)); } /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary * between different flows. * * tcp_is_sack - SACK enabled * tcp_is_reno - No SACK */ static inline int tcp_is_sack(const struct tcp_sock *tp) { return likely(tp->rx_opt.sack_ok); } static inline bool tcp_is_reno(const struct tcp_sock *tp) { return !tcp_is_sack(tp); } static inline unsigned int tcp_left_out(const struct tcp_sock *tp) { return tp->sacked_out + tp->lost_out; } /* This determines how many packets are "in the network" to the best * of our knowledge. In many cases it is conservative, but where * detailed information is available from the receiver (via SACK * blocks etc.) we can make more aggressive calculations. * * Use this for decisions involving congestion control, use just * tp->packets_out to determine if the send queue is empty or not. 
* * Read this equation as: * * "Packets sent once on transmission queue" MINUS * "Packets left network, but not honestly ACKed yet" PLUS * "Packets fast retransmitted" */ static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) { return tp->packets_out - tcp_left_out(tp) + tp->retrans_out; } #define TCP_INFINITE_SSTHRESH 0x7fffffff static inline u32 tcp_snd_cwnd(const struct tcp_sock *tp) { return tp->snd_cwnd; } static inline void tcp_snd_cwnd_set(struct tcp_sock *tp, u32 val) { WARN_ON_ONCE((int)val <= 0); tp->snd_cwnd = val; } static inline bool tcp_in_slow_start(const struct tcp_sock *tp) { return tcp_snd_cwnd(tp) < tp->snd_ssthresh; } static inline bool tcp_in_initial_slowstart(const struct tcp_sock *tp) { return tp->snd_ssthresh >= TCP_INFINITE_SSTHRESH; } static inline bool tcp_in_cwnd_reduction(const struct sock *sk) { return (TCPF_CA_CWR | TCPF_CA_Recovery) & (1 << inet_csk(sk)->icsk_ca_state); } /* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd. * The exception is cwnd reduction phase, when cwnd is decreasing towards * ssthresh. */ static inline __u32 tcp_current_ssthresh(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); if (tcp_in_cwnd_reduction(sk)) return tp->snd_ssthresh; else return max(tp->snd_ssthresh, ((tcp_snd_cwnd(tp) >> 1) + (tcp_snd_cwnd(tp) >> 2))); } /* Use define here intentionally to get WARN_ON location shown at the caller */ #define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out) void tcp_enter_cwr(struct sock *sk); __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst); /* The maximum number of MSS of available cwnd for which TSO defers * sending if not using sysctl_tcp_tso_win_divisor. */ static inline __u32 tcp_max_tso_deferred_mss(const struct tcp_sock *tp) { return 3; } /* Returns end sequence number of the receiver's advertised window */ static inline u32 tcp_wnd_end(const struct tcp_sock *tp) { return tp->snd_una + tp->snd_wnd; } /* We follow the spirit of RFC2861 to validate cwnd but implement a more * flexible approach. The RFC suggests cwnd should not be raised unless * it was fully used previously. And that's exactly what we do in * congestion avoidance mode. But in slow start we allow cwnd to grow * as long as the application has used half the cwnd. * Example : * cwnd is 10 (IW10), but application sends 9 frames. * We allow cwnd to reach 18 when all frames are ACKed. * This check is safe because it's as aggressive as slow start which already * risks 100% overshoot. The advantage is that we discourage application to * either send more filler packets or data to artificially blow up the cwnd * usage, and allow application-limited process to probe bw more aggressively. */ static inline bool tcp_is_cwnd_limited(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); if (tp->is_cwnd_limited) return true; /* If in slow start, ensure cwnd grows to twice what was ACKed. */ if (tcp_in_slow_start(tp)) return tcp_snd_cwnd(tp) < 2 * tp->max_packets_out; return false; } /* BBR congestion control needs pacing. * Same remark for SO_MAX_PACING_RATE. * sch_fq packet scheduler is efficiently handling pacing, * but is not always installed/used. * Return true if TCP stack should pace packets itself. */ static inline bool tcp_needs_internal_pacing(const struct sock *sk) { return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED; } /* Estimates in how many jiffies next packet for this flow can be sent. * Scheduling a retransmit timer too early would be silly. 
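 *
 * Example (illustrative): if the pacing code last set tp->tcp_wstamp_ns
 * 3 ms beyond tp->tcp_clock_cache, tcp_pacing_delay() returns roughly 3 ms
 * worth of jiffies, and tcp_reset_xmit_timer() below adds that delay on top
 * of "when" whenever pace_delay is true.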
*/ static inline unsigned long tcp_pacing_delay(const struct sock *sk) { s64 delay = tcp_sk(sk)->tcp_wstamp_ns - tcp_sk(sk)->tcp_clock_cache; return delay > 0 ? nsecs_to_jiffies(delay) : 0; } static inline void tcp_reset_xmit_timer(struct sock *sk, const int what, unsigned long when, bool pace_delay) { if (pace_delay) when += tcp_pacing_delay(sk); inet_csk_reset_xmit_timer(sk, what, when, tcp_rto_max(sk)); } /* Something is really bad, we could not queue an additional packet, * because qdisc is full or receiver sent a 0 window, or we are paced. * We do not want to add fuel to the fire, or abort too early, * so make sure the timer we arm now is at least 200ms in the future, * regardless of current icsk_rto value (as it could be ~2ms) */ static inline unsigned long tcp_probe0_base(const struct sock *sk) { return max_t(unsigned long, inet_csk(sk)->icsk_rto, TCP_RTO_MIN); } /* Variant of inet_csk_rto_backoff() used for zero window probes */ static inline unsigned long tcp_probe0_when(const struct sock *sk, unsigned long max_when) { u8 backoff = min_t(u8, ilog2(TCP_RTO_MAX / TCP_RTO_MIN) + 1, inet_csk(sk)->icsk_backoff); u64 when = (u64)tcp_probe0_base(sk) << backoff; return (unsigned long)min_t(u64, when, max_when); } static inline void tcp_check_probe_timer(struct sock *sk) { if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending) tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, tcp_probe0_base(sk), true); } static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq) { tp->snd_wl1 = seq; } static inline void tcp_update_wl(struct tcp_sock *tp, u32 seq) { tp->snd_wl1 = seq; } /* * Calculate(/check) TCP checksum */ static inline __sum16 tcp_v4_check(int len, __be32 saddr, __be32 daddr, __wsum base) { return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_TCP, base); } static inline bool tcp_checksum_complete(struct sk_buff *skb) { return !skb_csum_unnecessary(skb) && __skb_checksum_complete(skb); } enum skb_drop_reason tcp_add_backlog(struct sock *sk, struct sk_buff *skb); static inline enum skb_drop_reason tcp_filter(struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = (const struct tcphdr *)skb->data; return sk_filter_trim_cap(sk, skb, __tcp_hdrlen(th)); } void tcp_set_state(struct sock *sk, int state); void tcp_done(struct sock *sk); int tcp_abort(struct sock *sk, int err); static inline void tcp_sack_reset(struct tcp_options_received *rx_opt) { rx_opt->dsack = 0; rx_opt->num_sacks = 0; } void tcp_cwnd_restart(struct sock *sk, s32 delta); static inline void tcp_slow_start_after_idle_check(struct sock *sk) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; struct tcp_sock *tp = tcp_sk(sk); s32 delta; if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) || tp->packets_out || ca_ops->cong_control) return; delta = tcp_jiffies32 - tp->lsndtime; if (delta > inet_csk(sk)->icsk_rto) tcp_cwnd_restart(sk, delta); } /* Determine a window scaling and initial window to offer. 
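 *
 * Background arithmetic, for illustration: the on-the-wire window field is
 * 16 bits, so with a window scale of 7 a receiver can advertise up to
 * 65535 << 7 bytes (about 8 MB), and TCP_MAX_WSCALE (14) caps the advertised
 * window at 65535 << 14 bytes (about 1 GB).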
*/ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd); static inline int __tcp_win_from_space(u8 scaling_ratio, int space) { s64 scaled_space = (s64)space * scaling_ratio; return scaled_space >> TCP_RMEM_TO_WIN_SCALE; } static inline int tcp_win_from_space(const struct sock *sk, int space) { return __tcp_win_from_space(tcp_sk(sk)->scaling_ratio, space); } /* inverse of __tcp_win_from_space() */ static inline int __tcp_space_from_win(u8 scaling_ratio, int win) { u64 val = (u64)win << TCP_RMEM_TO_WIN_SCALE; do_div(val, scaling_ratio); return val; } static inline int tcp_space_from_win(const struct sock *sk, int win) { return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win); } /* Assume a 50% default for skb->len/skb->truesize ratio. * This may be adjusted later in tcp_measure_rcv_mss(). */ #define TCP_DEFAULT_SCALING_RATIO (1 << (TCP_RMEM_TO_WIN_SCALE - 1)) static inline void tcp_scaling_ratio_init(struct sock *sk) { tcp_sk(sk)->scaling_ratio = TCP_DEFAULT_SCALING_RATIO; } /* Note: caller must be prepared to deal with negative returns */ static inline int tcp_space(const struct sock *sk) { return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - READ_ONCE(sk->sk_backlog.len) - atomic_read(&sk->sk_rmem_alloc)); } static inline int tcp_full_space(const struct sock *sk) { return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); } static inline void __tcp_adjust_rcv_ssthresh(struct sock *sk, u32 new_ssthresh) { int unused_mem = sk_unused_reserved_mem(sk); struct tcp_sock *tp = tcp_sk(sk); tp->rcv_ssthresh = min(tp->rcv_ssthresh, new_ssthresh); if (unused_mem) tp->rcv_ssthresh = max_t(u32, tp->rcv_ssthresh, tcp_win_from_space(sk, unused_mem)); } static inline void tcp_adjust_rcv_ssthresh(struct sock *sk) { __tcp_adjust_rcv_ssthresh(sk, 4U * tcp_sk(sk)->advmss); } void tcp_cleanup_rbuf(struct sock *sk, int copied); void __tcp_cleanup_rbuf(struct sock *sk, int copied); /* We provision sk_rcvbuf around 200% of sk_rcvlowat. * If 87.5 % (7/8) of the space has been consumed, we want to override * SO_RCVLOWAT constraint, since we are receiving skbs with too small * len/truesize ratio. */ static inline bool tcp_rmem_pressure(const struct sock *sk) { int rcvbuf, threshold; if (tcp_under_memory_pressure(sk)) return true; rcvbuf = READ_ONCE(sk->sk_rcvbuf); threshold = rcvbuf - (rcvbuf >> 3); return atomic_read(&sk->sk_rmem_alloc) > threshold; } static inline bool tcp_epollin_ready(const struct sock *sk, int target) { const struct tcp_sock *tp = tcp_sk(sk); int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq); if (avail <= 0) return false; return (avail >= target) || tcp_rmem_pressure(sk) || (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss); } extern void tcp_openreq_init_rwin(struct request_sock *req, const struct sock *sk_listener, const struct dst_entry *dst); void tcp_enter_memory_pressure(struct sock *sk); void tcp_leave_memory_pressure(struct sock *sk); static inline int keepalive_intvl_when(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); int val; /* Paired with WRITE_ONCE() in tcp_sock_set_keepintvl() * and do_tcp_setsockopt(). */ val = READ_ONCE(tp->keepalive_intvl); return val ? 
: READ_ONCE(net->ipv4.sysctl_tcp_keepalive_intvl); } static inline int keepalive_time_when(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); int val; /* Paired with WRITE_ONCE() in tcp_sock_set_keepidle_locked() */ val = READ_ONCE(tp->keepalive_time); return val ? : READ_ONCE(net->ipv4.sysctl_tcp_keepalive_time); } static inline int keepalive_probes(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); int val; /* Paired with WRITE_ONCE() in tcp_sock_set_keepcnt() * and do_tcp_setsockopt(). */ val = READ_ONCE(tp->keepalive_probes); return val ? : READ_ONCE(net->ipv4.sysctl_tcp_keepalive_probes); } static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp) { const struct inet_connection_sock *icsk = &tp->inet_conn; return min_t(u32, tcp_jiffies32 - icsk->icsk_ack.lrcvtime, tcp_jiffies32 - tp->rcv_tstamp); } static inline int tcp_fin_time(const struct sock *sk) { int fin_timeout = tcp_sk(sk)->linger2 ? : READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fin_timeout); const int rto = inet_csk(sk)->icsk_rto; if (fin_timeout < (rto << 2) - (rto >> 1)) fin_timeout = (rto << 2) - (rto >> 1); return fin_timeout; } static inline bool tcp_paws_check(const struct tcp_options_received *rx_opt, int paws_win) { if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win) return true; if (unlikely(!time_before32(ktime_get_seconds(), rx_opt->ts_recent_stamp + TCP_PAWS_WRAP))) return true; /* * Some OSes send SYN and SYNACK messages with tsval=0 tsecr=0, * then following tcp messages have valid values. Ignore 0 value, * or else 'negative' tsval might forbid us to accept their packets. */ if (!rx_opt->ts_recent) return true; return false; } static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt, int rst) { if (tcp_paws_check(rx_opt, 0)) return false; /* RST segments are not recommended to carry timestamp, and, if they do, it is recommended to ignore PAWS because "their cleanup function should take precedence over timestamps." Certainly, it is mistake. It is necessary to understand the reasons of this constraint to relax it: if peer reboots, clock may go out-of-sync and half-open connections will not be reset. Actually, the problem would be not existing if all the implementations followed draft about maintaining clock via reboots. Linux-2.2 DOES NOT! However, we can relax time bounds for RST segments to MSL. */ if (rst && !time_before32(ktime_get_seconds(), rx_opt->ts_recent_stamp + TCP_PAWS_MSL)) return false; return true; } static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) { u32 ace; /* mptcp hooks are only on the slow path */ if (sk_is_mptcp((struct sock *)tp)) return; ace = tcp_ecn_mode_accecn(tp) ? 
((tp->delivered_ce + TCP_ACCECN_CEP_INIT_OFFSET) & TCP_ACCECN_CEP_ACE_MASK) : 0; tp->pred_flags = htonl((tp->tcp_header_len << 26) | (ace << 22) | ntohl(TCP_FLAG_ACK) | snd_wnd); } static inline void tcp_fast_path_on(struct tcp_sock *tp) { __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale); } static inline void tcp_fast_path_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && tp->rcv_wnd && atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && !tp->urg_data) tcp_fast_path_on(tp); } bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, int mib_idx, u32 *last_oow_ack_time); static inline void tcp_mib_init(struct net *net) { /* See RFC 2012 */ TCP_ADD_STATS(net, TCP_MIB_RTOALGORITHM, 1); TCP_ADD_STATS(net, TCP_MIB_RTOMIN, TCP_RTO_MIN*1000/HZ); TCP_ADD_STATS(net, TCP_MIB_RTOMAX, TCP_RTO_MAX*1000/HZ); TCP_ADD_STATS(net, TCP_MIB_MAXCONN, -1); } /* from STCP */ static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) { tp->retransmit_skb_hint = NULL; } #define tcp_md5_addr tcp_ao_addr /* - key database */ struct tcp_md5sig_key { struct hlist_node node; u8 keylen; u8 family; /* AF_INET or AF_INET6 */ u8 prefixlen; u8 flags; union tcp_md5_addr addr; int l3index; /* set if key added with L3 scope */ u8 key[TCP_MD5SIG_MAXKEYLEN]; struct rcu_head rcu; }; /* - sock block */ struct tcp_md5sig_info { struct hlist_head head; struct rcu_head rcu; }; /* - pseudo header */ struct tcp4_pseudohdr { __be32 saddr; __be32 daddr; __u8 pad; __u8 protocol; __be16 len; }; struct tcp6_pseudohdr { struct in6_addr saddr; struct in6_addr daddr; __be32 len; __be32 protocol; /* including padding */ }; /* * struct tcp_sigpool - per-CPU pool of ahash_requests * @scratch: per-CPU temporary area, that can be used between * tcp_sigpool_start() and tcp_sigpool_end() to perform * crypto request * @req: pre-allocated ahash request */ struct tcp_sigpool { void *scratch; struct ahash_request *req; }; int tcp_sigpool_alloc_ahash(const char *alg, size_t scratch_size); void tcp_sigpool_get(unsigned int id); void tcp_sigpool_release(unsigned int id); int tcp_sigpool_hash_skb_data(struct tcp_sigpool *hp, const struct sk_buff *skb, unsigned int header_len); /** * tcp_sigpool_start - disable bh and start using tcp_sigpool_ahash * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash() * @c: returned tcp_sigpool for usage (uninitialized on failure) * * Returns: 0 on success, error otherwise. 
*/ int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c); /** * tcp_sigpool_end - enable bh and stop using tcp_sigpool * @c: tcp_sigpool context that was returned by tcp_sigpool_start() */ void tcp_sigpool_end(struct tcp_sigpool *c); size_t tcp_sigpool_algo(unsigned int id, char *buf, size_t buf_len); /* - functions */ void tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, const struct sock *sk, const struct sk_buff *skb); int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags, const u8 *newkey, u8 newkeylen); int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, struct tcp_md5sig_key *key); int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags); void tcp_clear_md5_list(struct sock *sk); struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk); #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family, bool any_l3index); static inline struct tcp_md5sig_key * tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family) { if (!static_branch_unlikely(&tcp_md5_needed.key)) return NULL; return __tcp_md5_do_lookup(sk, l3index, addr, family, false); } static inline struct tcp_md5sig_key * tcp_md5_do_lookup_any_l3index(const struct sock *sk, const union tcp_md5_addr *addr, int family) { if (!static_branch_unlikely(&tcp_md5_needed.key)) return NULL; return __tcp_md5_do_lookup(sk, 0, addr, family, true); } #define tcp_twsk_md5_key(twsk) ((twsk)->tw_md5_key) void tcp_md5_destruct_sock(struct sock *sk); #else static inline struct tcp_md5sig_key * tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family) { return NULL; } static inline struct tcp_md5sig_key * tcp_md5_do_lookup_any_l3index(const struct sock *sk, const union tcp_md5_addr *addr, int family) { return NULL; } #define tcp_twsk_md5_key(twsk) NULL static inline void tcp_md5_destruct_sock(struct sock *sk) { } #endif struct md5_ctx; void tcp_md5_hash_skb_data(struct md5_ctx *ctx, const struct sk_buff *skb, unsigned int header_len); void tcp_md5_hash_key(struct md5_ctx *ctx, const struct tcp_md5sig_key *key); /* From tcp_fastopen.c */ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); void tcp_fastopen_cache_set(struct sock *sk, u16 mss, struct tcp_fastopen_cookie *cookie, bool syn_lost, u16 try_exp); struct tcp_fastopen_request { /* Fast Open cookie. 
Size 0 means a cookie request */ struct tcp_fastopen_cookie cookie; struct msghdr *data; /* data in MSG_FASTOPEN */ size_t size; int copied; /* queued in tcp_connect() */ struct ubuf_info *uarg; }; void tcp_free_fastopen_req(struct tcp_sock *tp); void tcp_fastopen_destroy_cipher(struct sock *sk); void tcp_fastopen_ctx_destroy(struct net *net); int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, void *primary_key, void *backup_key); int tcp_fastopen_get_cipher(struct net *net, struct inet_connection_sock *icsk, u64 *key); void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc, const struct dst_entry *dst); void tcp_fastopen_init_key_once(struct net *net); bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); bool tcp_fastopen_defer_connect(struct sock *sk, int *err); #define TCP_FASTOPEN_KEY_LENGTH sizeof(siphash_key_t) #define TCP_FASTOPEN_KEY_MAX 2 #define TCP_FASTOPEN_KEY_BUF_LENGTH \ (TCP_FASTOPEN_KEY_LENGTH * TCP_FASTOPEN_KEY_MAX) /* Fastopen key context */ struct tcp_fastopen_context { siphash_key_t key[TCP_FASTOPEN_KEY_MAX]; int num; struct rcu_head rcu; }; void tcp_fastopen_active_disable(struct sock *sk); bool tcp_fastopen_active_should_disable(struct sock *sk); void tcp_fastopen_active_disable_ofo_check(struct sock *sk); void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired); /* Caller needs to wrap with rcu_read_(un)lock() */ static inline struct tcp_fastopen_context *tcp_fastopen_get_ctx(const struct sock *sk) { struct tcp_fastopen_context *ctx; ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx); if (!ctx) ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx); return ctx; } static inline bool tcp_fastopen_cookie_match(const struct tcp_fastopen_cookie *foc, const struct tcp_fastopen_cookie *orig) { if (orig->len == TCP_FASTOPEN_COOKIE_SIZE && orig->len == foc->len && !memcmp(orig->val, foc->val, foc->len)) return true; return false; } static inline int tcp_fastopen_context_len(const struct tcp_fastopen_context *ctx) { return ctx->num; } /* Latencies incurred by various limits for a sender. They are * chronograph-like stats that are mutually exclusive. */ enum tcp_chrono { TCP_CHRONO_UNSPEC, TCP_CHRONO_BUSY, /* Actively sending data (non-empty write queue) */ TCP_CHRONO_RWND_LIMITED, /* Stalled by insufficient receive window */ TCP_CHRONO_SNDBUF_LIMITED, /* Stalled by insufficient send buffer */ __TCP_CHRONO_MAX, }; static inline void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new) { const u32 now = tcp_jiffies32; enum tcp_chrono old = tp->chrono_type; if (old > TCP_CHRONO_UNSPEC) tp->chrono_stat[old - 1] += now - tp->chrono_start; tp->chrono_start = now; tp->chrono_type = new; } static inline void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type) { struct tcp_sock *tp = tcp_sk(sk); /* If there are multiple conditions worthy of tracking in a * chronograph then the highest priority enum takes precedence * over the other conditions. So that if something "more interesting" * starts happening, stop the previous chrono and start a new one. 
*/ if (type > tp->chrono_type) tcp_chrono_set(tp, type); } void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type); /* This helper is needed, because skb->tcp_tsorted_anchor uses * the same memory storage than skb->destructor/_skb_refdst */ static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb) { skb->destructor = NULL; skb->_skb_refdst = 0UL; } #define tcp_skb_tsorted_save(skb) { \ unsigned long _save = skb->_skb_refdst; \ skb->_skb_refdst = 0UL; #define tcp_skb_tsorted_restore(skb) \ skb->_skb_refdst = _save; \ } void tcp_write_queue_purge(struct sock *sk); static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk) { return skb_rb_first(&sk->tcp_rtx_queue); } static inline struct sk_buff *tcp_rtx_queue_tail(const struct sock *sk) { return skb_rb_last(&sk->tcp_rtx_queue); } static inline struct sk_buff *tcp_write_queue_tail(const struct sock *sk) { return skb_peek_tail(&sk->sk_write_queue); } #define tcp_for_write_queue_from_safe(skb, tmp, sk) \ skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp) static inline struct sk_buff *tcp_send_head(const struct sock *sk) { return skb_peek(&sk->sk_write_queue); } static inline bool tcp_skb_is_last(const struct sock *sk, const struct sk_buff *skb) { return skb_queue_is_last(&sk->sk_write_queue, skb); } /** * tcp_write_queue_empty - test if any payload (or FIN) is available in write queue * @sk: socket * * Since the write queue can have a temporary empty skb in it, * we must not use "return skb_queue_empty(&sk->sk_write_queue)" */ static inline bool tcp_write_queue_empty(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); return tp->write_seq == tp->snd_nxt; } static inline bool tcp_rtx_queue_empty(const struct sock *sk) { return RB_EMPTY_ROOT(&sk->tcp_rtx_queue); } static inline bool tcp_rtx_and_write_queues_empty(const struct sock *sk) { return tcp_rtx_queue_empty(sk) && tcp_write_queue_empty(sk); } static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb) { __skb_queue_tail(&sk->sk_write_queue, skb); /* Queue it, remembering where we must start sending. */ if (sk->sk_write_queue.next == skb) tcp_chrono_start(sk, TCP_CHRONO_BUSY); } /* Insert new before skb on the write queue of sk. */ static inline void tcp_insert_write_queue_before(struct sk_buff *new, struct sk_buff *skb, struct sock *sk) { __skb_queue_before(&sk->sk_write_queue, skb, new); } static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk) { tcp_skb_tsorted_anchor_cleanup(skb); __skb_unlink(skb, &sk->sk_write_queue); } void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb); static inline void tcp_rtx_queue_unlink(struct sk_buff *skb, struct sock *sk) { tcp_skb_tsorted_anchor_cleanup(skb); rb_erase(&skb->rbnode, &sk->tcp_rtx_queue); } static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct sock *sk) { list_del(&skb->tcp_tsorted_anchor); tcp_rtx_queue_unlink(skb, sk); tcp_wmem_free_skb(sk, skb); } static inline void tcp_write_collapse_fence(struct sock *sk) { struct sk_buff *skb = tcp_write_queue_tail(sk); if (skb) TCP_SKB_CB(skb)->eor = 1; } static inline void tcp_push_pending_frames(struct sock *sk) { if (tcp_send_head(sk)) { struct tcp_sock *tp = tcp_sk(sk); __tcp_push_pending_frames(sk, tcp_current_mss(sk), tp->nonagle); } } /* Start sequence of the skb just after the highest skb with SACKed * bit, valid only if sacked_out > 0 or when the caller has ensured * validity by itself. 
*/ static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp) { if (!tp->sacked_out) return tp->snd_una; if (tp->highest_sack == NULL) return tp->snd_nxt; return TCP_SKB_CB(tp->highest_sack)->seq; } static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb) { tcp_sk(sk)->highest_sack = skb_rb_next(skb); } static inline struct sk_buff *tcp_highest_sack(struct sock *sk) { return tcp_sk(sk)->highest_sack; } static inline void tcp_highest_sack_reset(struct sock *sk) { tcp_sk(sk)->highest_sack = tcp_rtx_queue_head(sk); } /* Called when old skb is about to be deleted and replaced by new skb */ static inline void tcp_highest_sack_replace(struct sock *sk, struct sk_buff *old, struct sk_buff *new) { if (old == tcp_highest_sack(sk)) tcp_sk(sk)->highest_sack = new; } /* This helper checks if socket has IP_TRANSPARENT set */ static inline bool inet_sk_transparent(const struct sock *sk) { switch (sk->sk_state) { case TCP_TIME_WAIT: return inet_twsk(sk)->tw_transparent; case TCP_NEW_SYN_RECV: return inet_rsk(inet_reqsk(sk))->no_srccheck; } return inet_test_bit(TRANSPARENT, sk); } /* Determines whether this is a thin stream (which may suffer from * increased latency). Used to trigger latency-reducing mechanisms. */ static inline bool tcp_stream_is_thin(struct tcp_sock *tp) { return tp->packets_out < 4 && !tcp_in_initial_slowstart(tp); } /* /proc */ enum tcp_seq_states { TCP_SEQ_STATE_LISTENING, TCP_SEQ_STATE_ESTABLISHED, }; void *tcp_seq_start(struct seq_file *seq, loff_t *pos); void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos); void tcp_seq_stop(struct seq_file *seq, void *v); struct tcp_seq_afinfo { sa_family_t family; }; struct tcp_iter_state { struct seq_net_private p; enum tcp_seq_states state; struct sock *syn_wait_sk; int bucket, offset, sbucket, num; loff_t last_pos; }; extern struct request_sock_ops tcp_request_sock_ops; extern struct request_sock_ops tcp6_request_sock_ops; void tcp_v4_destroy_sock(struct sock *sk); struct sk_buff *tcp_gso_segment(struct sk_buff *skb, netdev_features_t features); struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th); struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb, struct tcphdr *th); INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)); #ifdef CONFIG_INET void tcp_gro_complete(struct sk_buff *skb); #else static inline void tcp_gro_complete(struct sk_buff *skb) { } #endif static inline void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) { struct tcphdr *th = tcp_hdr(skb); th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); } static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); u32 val; val = READ_ONCE(tp->notsent_lowat); return val ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat); } bool tcp_stream_memory_free(const struct sock *sk, int wake); #ifdef CONFIG_PROC_FS int tcp4_proc_init(void); void tcp4_proc_exit(void); #endif int tcp_rtx_synack(const struct sock *sk, struct request_sock *req); int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb); /* TCP af-specific functions */ struct tcp_sock_af_ops { #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *(*md5_lookup) (const struct 
sock *sk, const struct sock *addr_sk); void (*calc_md5_hash)(char *location, const struct tcp_md5sig_key *md5, const struct sock *sk, const struct sk_buff *skb); int (*md5_parse)(struct sock *sk, int optname, sockptr_t optval, int optlen); #endif #ifdef CONFIG_TCP_AO int (*ao_parse)(struct sock *sk, int optname, sockptr_t optval, int optlen); struct tcp_ao_key *(*ao_lookup)(const struct sock *sk, struct sock *addr_sk, int sndid, int rcvid); int (*ao_calc_key_sk)(struct tcp_ao_key *mkt, u8 *key, const struct sock *sk, __be32 sisn, __be32 disn, bool send); int (*calc_ao_hash)(char *location, struct tcp_ao_key *ao, const struct sock *sk, const struct sk_buff *skb, const u8 *tkey, int hash_offset, u32 sne); #endif }; struct tcp_request_sock_ops { u16 mss_clamp; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *(*req_md5_lookup)(const struct sock *sk, const struct sock *addr_sk); void (*calc_md5_hash) (char *location, const struct tcp_md5sig_key *md5, const struct sock *sk, const struct sk_buff *skb); #endif #ifdef CONFIG_TCP_AO struct tcp_ao_key *(*ao_lookup)(const struct sock *sk, struct request_sock *req, int sndid, int rcvid); int (*ao_calc_key)(struct tcp_ao_key *mkt, u8 *key, struct request_sock *sk); int (*ao_synack_hash)(char *ao_hash, struct tcp_ao_key *mkt, struct request_sock *req, const struct sk_buff *skb, int hash_offset, u32 sne); #endif #ifdef CONFIG_SYN_COOKIES __u32 (*cookie_init_seq)(const struct sk_buff *skb, __u16 *mss); #endif struct dst_entry *(*route_req)(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, struct request_sock *req, u32 tw_isn); union tcp_seq_and_ts_off (*init_seq_and_ts_off)( const struct net *net, const struct sk_buff *skb); int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb); }; extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops; #if IS_ENABLED(CONFIG_IPV6) extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops; #endif #ifdef CONFIG_SYN_COOKIES static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops, const struct sock *sk, struct sk_buff *skb, __u16 *mss) { tcp_synq_overflow(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); return ops->cookie_init_seq(skb, mss); } #else static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops, const struct sock *sk, struct sk_buff *skb, __u16 *mss) { return 0; } #endif struct tcp_key { union { struct { struct tcp_ao_key *ao_key; char *traffic_key; u32 sne; u8 rcv_next; }; struct tcp_md5sig_key *md5_key; }; enum { TCP_KEY_NONE = 0, TCP_KEY_MD5, TCP_KEY_AO, } type; }; static inline void tcp_get_current_key(const struct sock *sk, struct tcp_key *out) { #if defined(CONFIG_TCP_AO) || defined(CONFIG_TCP_MD5SIG) const struct tcp_sock *tp = tcp_sk(sk); #endif #ifdef CONFIG_TCP_AO if (static_branch_unlikely(&tcp_ao_needed.key)) { struct tcp_ao_info *ao; ao = rcu_dereference_protected(tp->ao_info, lockdep_sock_is_held(sk)); if (ao) { out->ao_key = READ_ONCE(ao->current_key); out->type = TCP_KEY_AO; return; } } #endif #ifdef CONFIG_TCP_MD5SIG if (static_branch_unlikely(&tcp_md5_needed.key) && rcu_access_pointer(tp->md5sig_info)) { out->md5_key = tp->af_specific->md5_lookup(sk, sk); if (out->md5_key) { out->type = TCP_KEY_MD5; return; } } #endif out->type = TCP_KEY_NONE; } static inline bool tcp_key_is_md5(const struct tcp_key *key) { if (static_branch_tcp_md5()) return key->type == 
TCP_KEY_MD5; return false; } static inline bool tcp_key_is_ao(const struct tcp_key *key) { if (static_branch_tcp_ao()) return key->type == TCP_KEY_AO; return false; } int tcpv4_offload_init(void); void tcp_v4_init(void); void tcp_init(void); /* tcp_recovery.c */ void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb); void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced); extern s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd); extern bool tcp_rack_mark_lost(struct sock *sk); extern void tcp_rack_reo_timeout(struct sock *sk); /* tcp_plb.c */ /* * Scaling factor for fractions in PLB. For example, tcp_plb_update_state * expects cong_ratio which represents fraction of traffic that experienced * congestion over a single RTT. In order to avoid floating point operations, * this fraction should be mapped to (1 << TCP_PLB_SCALE) and passed in. */ #define TCP_PLB_SCALE 8 /* State for PLB (Protective Load Balancing) for a single TCP connection. */ struct tcp_plb_state { u8 consec_cong_rounds:5, /* consecutive congested rounds */ unused:3; u32 pause_until; /* jiffies32 when PLB can resume rerouting */ }; static inline void tcp_plb_init(const struct sock *sk, struct tcp_plb_state *plb) { plb->consec_cong_rounds = 0; plb->pause_until = 0; } void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb, const int cong_ratio); void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb); void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb); static inline void tcp_warn_once(const struct sock *sk, bool cond, const char *str) { WARN_ONCE(cond, "%scwn:%u out:%u sacked:%u lost:%u retrans:%u tlp_high_seq:%u sk_state:%u ca_state:%u advmss:%u mss_cache:%u pmtu:%u\n", str, tcp_snd_cwnd(tcp_sk(sk)), tcp_sk(sk)->packets_out, tcp_sk(sk)->sacked_out, tcp_sk(sk)->lost_out, tcp_sk(sk)->retrans_out, tcp_sk(sk)->tlp_high_seq, sk->sk_state, inet_csk(sk)->icsk_ca_state, tcp_sk(sk)->advmss, tcp_sk(sk)->mss_cache, inet_csk(sk)->icsk_pmtu_cookie); } /* At how many usecs into the future should the RTO fire? */ static inline s64 tcp_rto_delta_us(const struct sock *sk) { const struct sk_buff *skb = tcp_rtx_queue_head(sk); u32 rto = inet_csk(sk)->icsk_rto; if (likely(skb)) { u64 rto_time_stamp_us = tcp_skb_timestamp_us(skb) + jiffies_to_usecs(rto); return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp; } else { tcp_warn_once(sk, 1, "rtx queue empty: "); return jiffies_to_usecs(rto); } } /* * Save and compile IPv4 options, return a pointer to it */ static inline struct ip_options_rcu *tcp_v4_save_options(struct net *net, struct sk_buff *skb) { const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; struct ip_options_rcu *dopt = NULL; if (opt->optlen) { int opt_size = sizeof(*dopt) + opt->optlen; dopt = kmalloc(opt_size, GFP_ATOMIC); if (dopt && __ip_options_echo(net, &dopt->opt, skb, opt)) { kfree(dopt); dopt = NULL; } } return dopt; } /* locally generated TCP pure ACKs have skb->truesize == 2 * (check tcp_send_ack() in net/ipv4/tcp_output.c ) * This is much faster than dissecting the packet to find out. * (Think of GRE encapsulations, IPv4, IPv6, ...) 
*/ static inline bool skb_is_tcp_pure_ack(const struct sk_buff *skb) { return skb->truesize == 2; } static inline void skb_set_tcp_pure_ack(struct sk_buff *skb) { skb->truesize = 2; } static inline int tcp_inq(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); int answ; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { answ = 0; } else if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data || before(tp->urg_seq, tp->copied_seq) || !before(tp->urg_seq, tp->rcv_nxt)) { answ = tp->rcv_nxt - tp->copied_seq; /* Subtract 1, if FIN was received */ if (answ && sock_flag(sk, SOCK_DONE)) answ--; } else { answ = tp->urg_seq - tp->copied_seq; } return answ; } int tcp_peek_len(struct socket *sock); static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb) { u16 segs_in; segs_in = max_t(u16, 1, skb_shinfo(skb)->gso_segs); /* We update these fields while other threads might * read them from tcp_get_info() */ WRITE_ONCE(tp->segs_in, tp->segs_in + segs_in); if (skb->len > tcp_hdrlen(skb)) WRITE_ONCE(tp->data_segs_in, tp->data_segs_in + segs_in); } /* * TCP listen path runs lockless. * We forced "struct sock" to be const qualified to make sure * we don't modify one of its field by mistake. * Here, we increment sk_drops which is an atomic_t, so we can safely * make sock writable again. */ static inline void tcp_listendrop(const struct sock *sk) { sk_drops_inc((struct sock *)sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); } enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer); /* * Interface for adding Upper Level Protocols over TCP */ #define TCP_ULP_NAME_MAX 16 #define TCP_ULP_MAX 128 #define TCP_ULP_BUF_MAX (TCP_ULP_NAME_MAX*TCP_ULP_MAX) struct tcp_ulp_ops { struct list_head list; /* initialize ulp */ int (*init)(struct sock *sk); /* update ulp */ void (*update)(struct sock *sk, struct proto *p, void (*write_space)(struct sock *sk)); /* cleanup ulp */ void (*release)(struct sock *sk); /* diagnostic */ int (*get_info)(struct sock *sk, struct sk_buff *skb, bool net_admin); size_t (*get_info_size)(const struct sock *sk, bool net_admin); /* clone ulp */ void (*clone)(const struct request_sock *req, struct sock *newsk, const gfp_t priority); char name[TCP_ULP_NAME_MAX]; struct module *owner; }; int tcp_register_ulp(struct tcp_ulp_ops *type); void tcp_unregister_ulp(struct tcp_ulp_ops *type); int tcp_set_ulp(struct sock *sk, const char *name); void tcp_get_available_ulp(char *buf, size_t len); void tcp_cleanup_ulp(struct sock *sk); void tcp_update_ulp(struct sock *sk, struct proto *p, void (*write_space)(struct sock *sk)); #define MODULE_ALIAS_TCP_ULP(name) \ MODULE_INFO(alias, name); \ MODULE_INFO(alias, "tcp-ulp-" name) #ifdef CONFIG_NET_SOCK_MSG struct sk_msg; struct sk_psock; #ifdef CONFIG_BPF_SYSCALL int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); #ifdef CONFIG_BPF_STREAM_PARSER struct strparser; int tcp_bpf_strp_read_sock(struct strparser *strp, read_descriptor_t *desc, sk_read_actor_t recv_actor); #endif /* CONFIG_BPF_STREAM_PARSER */ #endif /* CONFIG_BPF_SYSCALL */ #ifdef CONFIG_INET void tcp_eat_skb(struct sock *sk, struct sk_buff *skb); #else static inline void tcp_eat_skb(struct sock *sk, struct sk_buff *skb) { } #endif int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress, struct sk_msg *msg, u32 bytes, int flags); #endif /* CONFIG_NET_SOCK_MSG */ #if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG) static inline void tcp_bpf_clone(const struct 
sock *sk, struct sock *newsk) { } #endif #ifdef CONFIG_CGROUP_BPF static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, struct sk_buff *skb, unsigned int end_offset) { skops->skb = skb; skops->skb_data_end = skb->data + end_offset; } #else static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, struct sk_buff *skb, unsigned int end_offset) { } #endif /* Call BPF_SOCK_OPS program that returns an int. If the return value * is < 0, then the BPF op failed (for example if the loaded BPF * program does not support the chosen operation or there is no BPF * program loaded). */ #ifdef CONFIG_BPF static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) { struct bpf_sock_ops_kern sock_ops; int ret; memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); if (sk_fullsock(sk)) { sock_ops.is_fullsock = 1; sock_ops.is_locked_tcp_sock = 1; sock_owned_by_me(sk); } sock_ops.sk = sk; sock_ops.op = op; if (nargs > 0) memcpy(sock_ops.args, args, nargs * sizeof(*args)); ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); if (ret == 0) ret = sock_ops.reply; else ret = -1; return ret; } static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2) { u32 args[2] = {arg1, arg2}; return tcp_call_bpf(sk, op, 2, args); } static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2, u32 arg3) { u32 args[3] = {arg1, arg2, arg3}; return tcp_call_bpf(sk, op, 3, args); } #else static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) { return -EPERM; } static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2) { return -EPERM; } static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2, u32 arg3) { return -EPERM; } #endif static inline u32 tcp_timeout_init(struct sock *sk) { int timeout; timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT, 0, NULL); if (timeout <= 0) timeout = TCP_TIMEOUT_INIT; return min_t(int, timeout, TCP_RTO_MAX); } static inline u32 tcp_rwnd_init_bpf(struct sock *sk) { int rwnd; rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT, 0, NULL); if (rwnd < 0) rwnd = 0; return rwnd; } static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) { return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1); } static inline void tcp_bpf_rtt(struct sock *sk, long mrtt, u32 srtt) { if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RTT_CB_FLAG)) tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_RTT_CB, mrtt, srtt); } #if IS_ENABLED(CONFIG_SMC) extern struct static_key_false tcp_have_smc; #endif #if IS_ENABLED(CONFIG_TLS_DEVICE) void clean_acked_data_enable(struct tcp_sock *tp, void (*cad)(struct sock *sk, u32 ack_seq)); void clean_acked_data_disable(struct tcp_sock *tp); void clean_acked_data_flush(void); #endif DECLARE_STATIC_KEY_FALSE(tcp_tx_delay_enabled); static inline void tcp_add_tx_delay(struct sk_buff *skb, const struct tcp_sock *tp) { if (static_branch_unlikely(&tcp_tx_delay_enabled)) skb->skb_mstamp_ns += (u64)tp->tcp_tx_delay * NSEC_PER_USEC; } /* Compute Earliest Departure Time for some control packets * like ACK or RST for TIME_WAIT or non ESTABLISHED sockets. */ static inline u64 tcp_transmit_time(const struct sock *sk) { if (static_branch_unlikely(&tcp_tx_delay_enabled)) { u32 delay = (sk->sk_state == TCP_TIME_WAIT) ? 
tcp_twsk(sk)->tw_tx_delay : tcp_sk(sk)->tcp_tx_delay; return tcp_clock_ns() + (u64)delay * NSEC_PER_USEC; } return 0; } static inline int tcp_parse_auth_options(const struct tcphdr *th, const u8 **md5_hash, const struct tcp_ao_hdr **aoh) { const u8 *md5_tmp, *ao_tmp; int ret; ret = tcp_do_parse_auth_options(th, &md5_tmp, &ao_tmp); if (ret) return ret; if (md5_hash) *md5_hash = md5_tmp; if (aoh) { if (!ao_tmp) *aoh = NULL; else *aoh = (struct tcp_ao_hdr *)(ao_tmp - 2); } return 0; } static inline bool tcp_ao_required(struct sock *sk, const void *saddr, int family, int l3index, bool stat_inc) { #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao_info; struct tcp_ao_key *ao_key; if (!static_branch_unlikely(&tcp_ao_needed.key)) return false; ao_info = rcu_dereference_check(tcp_sk(sk)->ao_info, lockdep_sock_is_held(sk)); if (!ao_info) return false; ao_key = tcp_ao_do_lookup(sk, l3index, saddr, family, -1, -1); if (ao_info->ao_required || ao_key) { if (stat_inc) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOREQUIRED); atomic64_inc(&ao_info->counters.ao_required); } return true; } #endif return false; } enum skb_drop_reason tcp_inbound_hash(struct sock *sk, const struct request_sock *req, const struct sk_buff *skb, const void *saddr, const void *daddr, int family, int dif, int sdif); static inline int tcp_recv_should_stop(struct sock *sk) { return sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current); } INDIRECT_CALLABLE_DECLARE(union tcp_seq_and_ts_off tcp_v4_init_seq_and_ts_off(const struct net *net, const struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(union tcp_seq_and_ts_off tcp_v6_init_seq_and_ts_off(const struct net *net, const struct sk_buff *skb)); #endif /* _TCP_H */ |
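A minimal, self-contained sketch of the receive-window scaling arithmetic used by __tcp_win_from_space() and __tcp_space_from_win() in the header above. This is plain userspace C, not kernel code; the value of TCP_RMEM_TO_WIN_SCALE is defined earlier in the header and is assumed to be 8 here purely for illustration, which makes TCP_DEFAULT_SCALING_RATIO correspond to offering roughly 50% of the buffer as window.

#include <stdio.h>
#include <stdint.h>

/* Assumed value for illustration; the real definition lives earlier in the header. */
#define TCP_RMEM_TO_WIN_SCALE	8
#define TCP_DEFAULT_SCALING_RATIO (1 << (TCP_RMEM_TO_WIN_SCALE - 1))	/* ~50% */

/* Same arithmetic as __tcp_win_from_space(): window = space * ratio / 2^SCALE */
static int win_from_space(uint8_t scaling_ratio, int space)
{
	return ((int64_t)space * scaling_ratio) >> TCP_RMEM_TO_WIN_SCALE;
}

/* Same arithmetic as __tcp_space_from_win(): the inverse mapping */
static int space_from_win(uint8_t scaling_ratio, int win)
{
	return ((uint64_t)win << TCP_RMEM_TO_WIN_SCALE) / scaling_ratio;
}

int main(void)
{
	int rcvbuf = 1 << 20;	/* pretend sk_rcvbuf is 1 MiB */
	int win = win_from_space(TCP_DEFAULT_SCALING_RATIO, rcvbuf);

	/* With the default ratio, only about half the buffer is offered as window,
	 * and space_from_win() maps the window back to the buffer size. */
	printf("rcvbuf=%d win=%d space_back=%d\n",
	       rcvbuf, win, space_from_win(TCP_DEFAULT_SCALING_RATIO, win));
	return 0;
}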
| 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 | // SPDX-License-Identifier: GPL-2.0 /* * Device physical location support * * Author: Won Chung <wonchung@google.com> */ #include <linux/acpi.h> #include <linux/sysfs.h> #include <linux/string_choices.h> #include "physical_location.h" bool dev_add_physical_location(struct device *dev) { struct acpi_pld_info *pld; if (!has_acpi_companion(dev)) return false; if (!acpi_get_physical_device_location(ACPI_HANDLE(dev), &pld)) return false; dev->physical_location = kzalloc_obj(*dev->physical_location); if (!dev->physical_location) { ACPI_FREE(pld); return false; } dev->physical_location->panel = pld->panel; dev->physical_location->vertical_position = pld->vertical_position; dev->physical_location->horizontal_position = pld->horizontal_position; dev->physical_location->dock = pld->dock; dev->physical_location->lid = pld->lid; ACPI_FREE(pld); return true; } static ssize_t panel_show(struct device *dev, struct device_attribute *attr, char *buf) { const char *panel; switch (dev->physical_location->panel) { case DEVICE_PANEL_TOP: panel = "top"; break; case DEVICE_PANEL_BOTTOM: panel = "bottom"; break; case DEVICE_PANEL_LEFT: panel = "left"; break; case DEVICE_PANEL_RIGHT: panel = "right"; break; case DEVICE_PANEL_FRONT: panel = "front"; break; case DEVICE_PANEL_BACK: panel = "back"; break; default: panel = "unknown"; } return sysfs_emit(buf, "%s\n", panel); } static DEVICE_ATTR_RO(panel); static ssize_t vertical_position_show(struct device *dev, struct device_attribute *attr, char *buf) { const char *vertical_position; switch (dev->physical_location->vertical_position) { case DEVICE_VERT_POS_UPPER: vertical_position = "upper"; break; case DEVICE_VERT_POS_CENTER: vertical_position = "center"; break; case DEVICE_VERT_POS_LOWER: vertical_position = "lower"; break; default: vertical_position = "unknown"; } return sysfs_emit(buf, "%s\n", vertical_position); } static DEVICE_ATTR_RO(vertical_position); static ssize_t horizontal_position_show(struct device *dev, struct device_attribute *attr, char *buf) { const char *horizontal_position; switch (dev->physical_location->horizontal_position) { case DEVICE_HORI_POS_LEFT: horizontal_position = "left"; break; case DEVICE_HORI_POS_CENTER: horizontal_position = "center"; break; case DEVICE_HORI_POS_RIGHT: horizontal_position = "right"; break; default: horizontal_position = "unknown"; } return sysfs_emit(buf, "%s\n", horizontal_position); } static DEVICE_ATTR_RO(horizontal_position); static ssize_t dock_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", str_yes_no(dev->physical_location->dock)); } static DEVICE_ATTR_RO(dock); static ssize_t lid_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", str_yes_no(dev->physical_location->lid)); } static DEVICE_ATTR_RO(lid); static struct attribute *dev_attr_physical_location[] = { &dev_attr_panel.attr, &dev_attr_vertical_position.attr, &dev_attr_horizontal_position.attr, &dev_attr_dock.attr, &dev_attr_lid.attr, NULL, }; const struct attribute_group 
dev_attr_physical_location_group = {
	.name	= "physical_location",
	.attrs	= dev_attr_physical_location,
};
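The attribute group above exposes the ACPI _PLD data as a read-only "physical_location" directory in sysfs (panel, vertical_position, horizontal_position, dock, lid). A small userspace sketch that dumps those attributes for one device might look like the following; the device path is hypothetical and depends on which bus and device the firmware actually describes.

#include <stdio.h>
#include <string.h>

/* Hypothetical device path chosen only for the example. */
#define DEV_SYSFS "/sys/bus/usb/devices/1-2"

static void show(const char *attr)
{
	char path[256], val[64] = "";
	FILE *f;

	snprintf(path, sizeof(path), DEV_SYSFS "/physical_location/%s", attr);
	f = fopen(path, "r");
	if (!f)
		return;		/* device has no physical_location group */
	if (fgets(val, sizeof(val), f))
		printf("%s: %s", attr, val);	/* sysfs value already ends in '\n' */
	fclose(f);
}

int main(void)
{
	const char *attrs[] = {
		"panel", "vertical_position", "horizontal_position",
		"dock", "lid",
	};

	for (size_t i = 0; i < sizeof(attrs) / sizeof(attrs[0]); i++)
		show(attrs[i]);
	return 0;
}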
| 13 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * fs/kernfs/kernfs-internal.h - kernfs internal header file * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de> */ #ifndef __KERNFS_INTERNAL_H #define __KERNFS_INTERNAL_H #include <linux/lockdep.h> #include <linux/fs.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/xattr.h> #include <linux/kernfs.h> #include <linux/fs_context.h> struct kernfs_iattrs { kuid_t ia_uid; kgid_t ia_gid; struct timespec64 ia_atime; struct timespec64 ia_mtime; struct timespec64 ia_ctime; struct simple_xattrs *xattrs; struct simple_xattr_limits xattr_limits; }; struct kernfs_root { /* published fields */ struct kernfs_node *kn; unsigned int flags; /* KERNFS_ROOT_* flags */ /* private fields, do not use outside kernfs proper */ struct idr ino_idr; spinlock_t kernfs_idr_lock; /* root->ino_idr */ u32 last_id_lowbits; u32 id_highbits; struct kernfs_syscall_ops *syscall_ops; /* list of kernfs_super_info of this root, protected by kernfs_rwsem */ struct list_head supers; wait_queue_head_t deactivate_waitq; struct rw_semaphore kernfs_rwsem; struct rw_semaphore kernfs_iattr_rwsem; struct rw_semaphore kernfs_supers_rwsem; /* kn->parent and kn->name */ rwlock_t kernfs_rename_lock; struct rcu_head rcu; }; /* +1 to avoid triggering overflow warning when negating it */ #define KN_DEACTIVATED_BIAS (INT_MIN + 1) /* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */ /** * kernfs_root - find out the kernfs_root a kernfs_node belongs to * @kn: kernfs_node of interest * * Return: the kernfs_root @kn belongs to. */ static inline struct kernfs_root *kernfs_root(const struct kernfs_node *kn) { const struct kernfs_node *knp; /* if parent exists, it's always a dir; otherwise, @sd is a dir */ guard(rcu)(); knp = rcu_dereference(kn->__parent); if (knp) kn = knp; return kn->dir.root; } /* * mount.c */ struct kernfs_super_info { struct super_block *sb; /* * The root associated with this super_block. Each super_block is * identified by the root and ns it's associated with. */ struct kernfs_root *root; /* * Each sb is associated with one namespace tag, currently the * network namespace of the task which mounted this kernfs * instance. If multiple tags become necessary, make the following * an array and compare kernfs_node tag against every entry. 
*/ const struct ns_common *ns; /* anchored at kernfs_root->supers, protected by kernfs_rwsem */ struct list_head node; }; #define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) static inline bool kernfs_root_is_locked(const struct kernfs_node *kn) { return lockdep_is_held(&kernfs_root(kn)->kernfs_rwsem); } static inline bool kernfs_rename_is_locked(const struct kernfs_node *kn) { return lockdep_is_held(&kernfs_root(kn)->kernfs_rename_lock); } static inline const char *kernfs_rcu_name(const struct kernfs_node *kn) { return rcu_dereference_check(kn->name, kernfs_root_is_locked(kn)); } static inline struct kernfs_node *kernfs_parent(const struct kernfs_node *kn) { /* * The kernfs_node::__parent remains valid within a RCU section. The kn * can be reparented (and renamed) which changes the entry. This can be * avoided by locking kernfs_root::kernfs_rwsem or * kernfs_root::kernfs_rename_lock. * Both locks can be used to obtain a reference on __parent. Once the * reference count reaches 0 then the node is about to be freed * and can not be renamed (or become a different parent) anymore. */ return rcu_dereference_check(kn->__parent, kernfs_root_is_locked(kn) || kernfs_rename_is_locked(kn) || !atomic_read(&kn->count)); } static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry) { if (d_really_is_negative(dentry)) return NULL; return d_inode(dentry)->i_private; } static inline void kernfs_set_rev(struct kernfs_node *parent, struct dentry *dentry) { dentry->d_time = parent->dir.rev; } static inline void kernfs_inc_rev(struct kernfs_node *parent) { parent->dir.rev++; } static inline bool kernfs_dir_changed(struct kernfs_node *parent, struct dentry *dentry) { if (parent->dir.rev != dentry->d_time) return true; return false; } extern const struct super_operations kernfs_sops; extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; /* * inode.c */ extern const struct xattr_handler * const kernfs_xattr_handlers[]; void kernfs_evict_inode(struct inode *inode); int kernfs_iop_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr); int kernfs_iop_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); /* * dir.c */ extern const struct dentry_operations kernfs_dops; extern const struct file_operations kernfs_dir_fops; extern const struct inode_operations kernfs_dir_iops; struct kernfs_node *kernfs_get_active(struct kernfs_node *kn); void kernfs_put_active(struct kernfs_node *kn); int kernfs_add_one(struct kernfs_node *kn); struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, unsigned flags); /* * file.c */ extern const struct file_operations kernfs_file_fops; bool kernfs_should_drain_open_files(struct kernfs_node *kn); void kernfs_drain_open_files(struct kernfs_node *kn); /* * symlink.c */ extern const struct inode_operations kernfs_symlink_iops; /* * kernfs locks */ extern struct kernfs_global_locks *kernfs_locks; #endif /* __KERNFS_INTERNAL_H */ |
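The kernfs_set_rev()/kernfs_inc_rev()/kernfs_dir_changed() helpers above implement a simple revision scheme for validating cached lookups: the parent's dir.rev is bumped whenever its children change, and a dentry remembers in d_time the revision it observed, so a mismatch means the cached result may be stale. The following is a minimal userspace model of that idea; all names are invented for illustration and it is not kernel code.

#include <stdbool.h>
#include <stdio.h>

struct parent { unsigned long rev; };		/* models kernfs_node::dir.rev */
struct cached_lookup { unsigned long seen_rev; };	/* models dentry::d_time */

static void cache_result(struct parent *p, struct cached_lookup *c)
{
	c->seen_rev = p->rev;			/* like kernfs_set_rev() */
}

static void dir_changed(struct parent *p)
{
	p->rev++;				/* like kernfs_inc_rev() */
}

static bool cache_stale(const struct parent *p, const struct cached_lookup *c)
{
	return p->rev != c->seen_rev;		/* like kernfs_dir_changed() */
}

int main(void)
{
	struct parent dir = { .rev = 0 };
	struct cached_lookup miss;

	cache_result(&dir, &miss);
	printf("stale before change: %d\n", cache_stale(&dir, &miss));	/* 0 */
	dir_changed(&dir);
	printf("stale after change: %d\n", cache_stale(&dir, &miss));	/* 1 */
	return 0;
}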
| 1 19 22 20 22 20 20 20 21 23 20 23 20 2 21 1 21 18 19 19 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 | /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/syscalls.h> #include <linux/export.h> #include <linux/uaccess.h> #include <linux/fs_struct.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/prefetch.h> #include "mount.h" #include "internal.h" struct prepend_buffer { char *buf; int len; }; #define DECLARE_BUFFER(__name, __buf, __len) \ struct prepend_buffer __name = {.buf = __buf + __len, .len = __len} static char *extract_string(struct prepend_buffer *p) { if (likely(p->len >= 0)) return p->buf; return ERR_PTR(-ENAMETOOLONG); } static bool prepend_char(struct prepend_buffer *p, unsigned char c) { if (likely(p->len > 0)) { p->len--; *--p->buf = c; return true; } p->len = -1; return false; } /* * The source of the prepend data can be an optimistic load * of a dentry name and length. And because we don't hold any * locks, the length and the pointer to the name may not be * in sync if a concurrent rename happens, and the kernel * copy might fault as a result. * * The end result will correct itself when we check the * rename sequence count, but we need to be able to handle * the fault gracefully. */ static bool prepend_copy(void *dst, const void *src, int len) { if (unlikely(copy_from_kernel_nofault(dst, src, len))) { memset(dst, 'x', len); return false; } return true; } static bool prepend(struct prepend_buffer *p, const char *str, int namelen) { // Already overflowed? if (p->len < 0) return false; // Will overflow? 
if (p->len < namelen) { // Fill as much as possible from the end of the name str += namelen - p->len; p->buf -= p->len; prepend_copy(p->buf, str, p->len); p->len = -1; return false; } // Fits fully p->len -= namelen; p->buf -= namelen; return prepend_copy(p->buf, str, namelen); } /** * prepend_name - prepend a pathname in front of current buffer pointer * @p: prepend buffer which contains buffer pointer and allocated length * @name: name string and length qstr structure * * With RCU path tracing, it may race with d_move(). Use READ_ONCE() to * make sure that either the old or the new name pointer and length are * fetched. However, there may be mismatch between length and pointer. * But since the length cannot be trusted, we need to copy the name very * carefully when doing the prepend_copy(). It also prepends "/" at * the beginning of the name. The sequence number check at the caller will * retry it again when a d_move() does happen. So any garbage in the buffer * due to mismatched pointer and length will be discarded. * * Load acquire is needed to make sure that we see the new name data even * if we might get the length wrong. */ static bool prepend_name(struct prepend_buffer *p, const struct qstr *name) { const char *dname = smp_load_acquire(&name->name); /* ^^^ */ u32 dlen = READ_ONCE(name->len); return prepend(p, dname, dlen) && prepend_char(p, '/'); } static int __prepend_path(const struct dentry *dentry, const struct mount *mnt, const struct path *root, struct prepend_buffer *p) { while (dentry != root->dentry || &mnt->mnt != root->mnt) { const struct dentry *parent = READ_ONCE(dentry->d_parent); if (dentry == mnt->mnt.mnt_root) { struct mount *m = READ_ONCE(mnt->mnt_parent); struct mnt_namespace *mnt_ns; if (likely(mnt != m)) { dentry = READ_ONCE(mnt->mnt_mountpoint); mnt = m; continue; } /* Global root */ mnt_ns = READ_ONCE(mnt->mnt_ns); /* open-coded is_mounted() to use local mnt_ns */ if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns)) return 1; // absolute root else return 2; // detached or not attached yet } if (unlikely(dentry == parent)) /* Escaped? */ return 3; prefetch(parent); if (!prepend_name(p, &dentry->d_name)) break; dentry = parent; } return 0; } /** * prepend_path - Prepend path string to a buffer * @path: the dentry/vfsmount to report * @root: root vfsmnt/dentry * @p: prepend buffer which contains buffer pointer and allocated length * * The function will first try to write out the pathname without taking any * lock other than the RCU read lock to make sure that dentries won't go away. * It only checks the sequence number of the global rename_lock as any change * in the dentry's d_seq will be preceded by changes in the rename_lock * sequence number. If the sequence number had been changed, it will restart * the whole pathname back-tracing sequence again by taking the rename_lock. * In this case, there is no need to take the RCU read lock as the recursive * parent pointer references will keep the dentry chain alive as long as no * rename operation is performed. 
*/ static int prepend_path(const struct path *path, const struct path *root, struct prepend_buffer *p) { unsigned seq, m_seq = 0; struct prepend_buffer b; int error; rcu_read_lock(); restart_mnt: read_seqbegin_or_lock(&mount_lock, &m_seq); seq = 0; rcu_read_lock(); restart: b = *p; read_seqbegin_or_lock(&rename_lock, &seq); error = __prepend_path(path->dentry, real_mount(path->mnt), root, &b); if (!(seq & 1)) rcu_read_unlock(); if (need_seqretry(&rename_lock, seq)) { seq = 1; goto restart; } done_seqretry(&rename_lock, seq); if (!(m_seq & 1)) rcu_read_unlock(); if (need_seqretry(&mount_lock, m_seq)) { m_seq = 1; goto restart_mnt; } done_seqretry(&mount_lock, m_seq); if (unlikely(error == 3)) b = *p; if (b.len == p->len) prepend_char(&b, '/'); *p = b; return error; } /** * __d_path - return the path of a dentry * @path: the dentry/vfsmount to report * @root: root vfsmnt/dentry * @buf: buffer to return value in * @buflen: buffer length * * Convert a dentry into an ASCII path name. * * Returns a pointer into the buffer or an error code if the * path was too long. * * "buflen" should be positive. * * If the path is not reachable from the supplied root, return %NULL. */ char *__d_path(const struct path *path, const struct path *root, char *buf, int buflen) { DECLARE_BUFFER(b, buf, buflen); prepend_char(&b, 0); if (unlikely(prepend_path(path, root, &b) > 0)) return NULL; return extract_string(&b); } char *d_absolute_path(const struct path *path, char *buf, int buflen) { struct path root = {}; DECLARE_BUFFER(b, buf, buflen); prepend_char(&b, 0); if (unlikely(prepend_path(path, &root, &b) > 1)) return ERR_PTR(-EINVAL); return extract_string(&b); } static void get_fs_root_rcu(struct fs_struct *fs, struct path *root) { unsigned seq; do { seq = read_seqbegin(&fs->seq); *root = fs->root; } while (read_seqretry(&fs->seq, seq)); } /** * d_path - return the path of a dentry * @path: path to report * @buf: buffer to return value in * @buflen: buffer length * * Convert a dentry into an ASCII path name. If the entry has been deleted * the string " (deleted)" is appended. Note that this is ambiguous. * * Returns a pointer into the buffer or an error code if the path was * too long. Note: Callers should use the returned pointer, not the passed * in buffer, to use the name! The implementation often starts at an offset * into the buffer, and may leave 0 bytes at the start. * * "buflen" should be positive. */ char *d_path(const struct path *path, char *buf, int buflen) { DECLARE_BUFFER(b, buf, buflen); struct path root; /* * We have various synthetic filesystems that never get mounted. On * these filesystems dentries are never used for lookup purposes, and * thus don't need to be hashed. They also don't need a name until a * user wants to identify the object in /proc/pid/fd/. The little hack * below allows us to generate a name for these objects on demand: * * Some pseudo inodes are mountable. When they are mounted * path->dentry == path->mnt->mnt_root. In that case don't call d_dname * and instead have d_path return the mounted path. 
*/ if (path->dentry->d_op && path->dentry->d_op->d_dname && (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root)) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); rcu_read_lock(); get_fs_root_rcu(current->fs, &root); if (unlikely(d_unlinked(path->dentry))) prepend(&b, " (deleted)", 11); else prepend_char(&b, 0); prepend_path(path, &root, &b); rcu_read_unlock(); return extract_string(&b); } EXPORT_SYMBOL(d_path); /* * Helper function for dentry_operations.d_dname() members */ char *dynamic_dname(char *buffer, int buflen, const char *fmt, ...) { va_list args; char *start; int sz; va_start(args, fmt); sz = vsnprintf(buffer, buflen, fmt, args) + 1; va_end(args); if (sz > NAME_MAX || sz > buflen) return ERR_PTR(-ENAMETOOLONG); /* Move the formatted d_name to the end of the buffer. */ start = buffer + (buflen - sz); return memmove(start, buffer, sz); } char *simple_dname(struct dentry *dentry, char *buffer, int buflen) { DECLARE_BUFFER(b, buffer, buflen); /* these dentries are never renamed, so d_lock is not needed */ prepend(&b, " (deleted)", 11); prepend(&b, dentry->d_name.name, dentry->d_name.len); prepend_char(&b, '/'); return extract_string(&b); } /* * Write full pathname from the root of the filesystem into the buffer. */ static char *__dentry_path(const struct dentry *d, struct prepend_buffer *p) { const struct dentry *dentry; struct prepend_buffer b; int seq = 0; rcu_read_lock(); restart: dentry = d; b = *p; read_seqbegin_or_lock(&rename_lock, &seq); while (!IS_ROOT(dentry)) { const struct dentry *parent = dentry->d_parent; prefetch(parent); if (!prepend_name(&b, &dentry->d_name)) break; dentry = parent; } if (!(seq & 1)) rcu_read_unlock(); if (need_seqretry(&rename_lock, seq)) { seq = 1; goto restart; } done_seqretry(&rename_lock, seq); if (b.len == p->len) prepend_char(&b, '/'); return extract_string(&b); } char *dentry_path_raw(const struct dentry *dentry, char *buf, int buflen) { DECLARE_BUFFER(b, buf, buflen); prepend_char(&b, 0); return __dentry_path(dentry, &b); } EXPORT_SYMBOL(dentry_path_raw); char *dentry_path(const struct dentry *dentry, char *buf, int buflen) { DECLARE_BUFFER(b, buf, buflen); if (unlikely(d_unlinked(dentry))) prepend(&b, "//deleted", 10); else prepend_char(&b, 0); return __dentry_path(dentry, &b); } static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root, struct path *pwd) { unsigned seq; do { seq = read_seqbegin(&fs->seq); *root = fs->root; *pwd = fs->pwd; } while (read_seqretry(&fs->seq, seq)); } /* * NOTE! The user-level library version returns a * character pointer. The kernel system call just * returns the length of the buffer filled (which * includes the ending '\0' character), or a negative * error value. 
So libc would do something like * * char *getcwd(char * buf, size_t size) * { * int retval; * * retval = sys_getcwd(buf, size); * if (retval >= 0) * return buf; * errno = -retval; * return NULL; * } */ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) { int error; struct path pwd, root; char *page = __getname(); if (!page) return -ENOMEM; rcu_read_lock(); get_fs_root_and_pwd_rcu(current->fs, &root, &pwd); if (unlikely(d_unlinked(pwd.dentry))) { rcu_read_unlock(); error = -ENOENT; } else { unsigned len; DECLARE_BUFFER(b, page, PATH_MAX); prepend_char(&b, 0); if (unlikely(prepend_path(&pwd, &root, &b) > 0)) prepend(&b, "(unreachable)", 13); rcu_read_unlock(); len = PATH_MAX - b.len; if (unlikely(len > PATH_MAX)) error = -ENAMETOOLONG; else if (unlikely(len > size)) error = -ERANGE; else if (copy_to_user(buf, b.buf, len)) error = -EFAULT; else error = len; } __putname(page); return error; } |
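getcwd() above, like d_path(), fills its result from the end of the buffer toward the front using the prepend_buffer helpers, and signals overflow by letting the remaining length go negative. Here is a stripped-down userspace model of that technique; the path components and buffer size are made up for the example, and the kernel's seqlock retry and copy_from_kernel_nofault() handling are intentionally omitted.

#include <stdio.h>
#include <string.h>

struct prepend_buffer { char *buf; int len; };

/* Copy n bytes in front of the current position, or poison the buffer
 * (len = -1) on overflow, mirroring prepend()/prepend_char() above. */
static int prepend(struct prepend_buffer *p, const char *s, int n)
{
	if (p->len < n) {
		p->len = -1;
		return 0;
	}
	p->len -= n;
	p->buf -= n;
	memcpy(p->buf, s, n);
	return 1;
}

int main(void)
{
	char storage[64];
	struct prepend_buffer b = { storage + sizeof(storage), sizeof(storage) };
	const char *components[] = { "passwd", "etc" };	/* leaf first, root last */

	prepend(&b, "", 1);	/* trailing NUL, like prepend_char(&b, 0) */
	for (int i = 0; i < 2; i++) {
		prepend(&b, components[i], strlen(components[i]));
		prepend(&b, "/", 1);
	}
	if (b.len >= 0)
		printf("%s\n", b.buf);	/* prints "/etc/passwd" */
	return 0;
}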
| 1 8 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_WAIT_BIT_H #define _LINUX_WAIT_BIT_H /* * Linux wait-bit related types and methods: */ #include <linux/wait.h> struct wait_bit_key { unsigned long *flags; int bit_nr; unsigned long timeout; }; struct wait_bit_queue_entry { struct wait_bit_key key; struct wait_queue_entry wq_entry; }; #define __WAIT_BIT_KEY_INITIALIZER(word, bit) \ { .flags = word, .bit_nr = bit, } typedef int wait_bit_action_f(struct wait_bit_key *key, int mode); void __wake_up_bit(struct wait_queue_head *wq_head, unsigned long *word, int bit); int __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode); int __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode); void wake_up_bit(unsigned long *word, int bit); int out_of_line_wait_on_bit(unsigned long *word, int, wait_bit_action_f *action, unsigned int mode); int out_of_line_wait_on_bit_timeout(unsigned long *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout); int out_of_line_wait_on_bit_lock(unsigned 
long *word, int, wait_bit_action_f *action, unsigned int mode); struct wait_queue_head *bit_waitqueue(unsigned long *word, int bit); extern void __init wait_bit_init(void); int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); #define DEFINE_WAIT_BIT(name, word, bit) \ struct wait_bit_queue_entry name = { \ .key = __WAIT_BIT_KEY_INITIALIZER(word, bit), \ .wq_entry = { \ .private = current, \ .func = wake_bit_function, \ .entry = \ LIST_HEAD_INIT((name).wq_entry.entry), \ }, \ } extern int bit_wait(struct wait_bit_key *key, int mode); extern int bit_wait_io(struct wait_bit_key *key, int mode); extern int bit_wait_timeout(struct wait_bit_key *key, int mode); /** * wait_on_bit - wait for a bit to be cleared * @word: the address containing the bit being waited on * @bit: the bit at that address being waited on * @mode: the task state to sleep in * * Wait for the given bit in an unsigned long or bitmap (see DECLARE_BITMAP()) * to be cleared. The clearing of the bit must be signalled with * wake_up_bit(), often as clear_and_wake_up_bit(). * * The process will wait on a waitqueue selected by hash from a shared * pool. It will only be woken on a wake_up for the target bit, even * if other processes on the same queue are waiting for other bits. * * Returned value will be zero if the bit was cleared in which case the * call has ACQUIRE semantics, or %-EINTR if the process received a * signal and the mode permitted wake up on that signal. */ static inline int wait_on_bit(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, bit_wait, mode); } /** * wait_on_bit_io - wait for a bit to be cleared * @word: the address containing the bit being waited on * @bit: the bit at that address being waited on * @mode: the task state to sleep in * * Wait for the given bit in an unsigned long or bitmap (see DECLARE_BITMAP()) * to be cleared. The clearing of the bit must be signalled with * wake_up_bit(), often as clear_and_wake_up_bit(). * * This is similar to wait_on_bit(), but calls io_schedule() instead of * schedule() for the actual waiting. * * Returned value will be zero if the bit was cleared in which case the * call has ACQUIRE semantics, or %-EINTR if the process received a * signal and the mode permitted wake up on that signal. */ static inline int wait_on_bit_io(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, bit_wait_io, mode); } /** * wait_on_bit_timeout - wait for a bit to be cleared or a timeout to elapse * @word: the address containing the bit being waited on * @bit: the bit at that address being waited on * @mode: the task state to sleep in * @timeout: timeout, in jiffies * * Wait for the given bit in an unsigned long or bitmap (see * DECLARE_BITMAP()) to be cleared, or for a timeout to expire. The * clearing of the bit must be signalled with wake_up_bit(), often as * clear_and_wake_up_bit(). * * This is similar to wait_on_bit(), except it also takes a timeout * parameter. * * Returned value will be zero if the bit was cleared in which case the * call has ACQUIRE semantics, or %-EINTR if the process received a * signal and the mode permitted wake up on that signal, or %-EAGAIN if the * timeout elapsed. 
*/ static inline int wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode, unsigned long timeout) { might_sleep(); if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit_timeout(word, bit, bit_wait_timeout, mode, timeout); } /** * wait_on_bit_action - wait for a bit to be cleared * @word: the address containing the bit waited on * @bit: the bit at that address being waited on * @action: the function used to sleep, which may take special actions * @mode: the task state to sleep in * * Wait for the given bit in an unsigned long or bitmap (see DECLARE_BITMAP()) * to be cleared. The clearing of the bit must be signalled with * wake_up_bit(), often as clear_and_wake_up_bit(). * * This is similar to wait_on_bit(), but calls @action() instead of * schedule() for the actual waiting. * * Returned value will be zero if the bit was cleared in which case the * call has ACQUIRE semantics, or the error code returned by @action if * that call returned non-zero. */ static inline int wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode) { might_sleep(); if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, action, mode); } /** * wait_on_bit_lock - wait for a bit to be cleared, then set it * @word: the address containing the bit being waited on * @bit: the bit of the word being waited on and set * @mode: the task state to sleep in * * Wait for the given bit in an unsigned long or bitmap (see * DECLARE_BITMAP()) to be cleared. The clearing of the bit must be * signalled with wake_up_bit(), often as clear_and_wake_up_bit(). As * soon as it is clear, atomically set it and return. * * This is similar to wait_on_bit(), but sets the bit before returning. * * Returned value will be zero if the bit was successfully set in which * case the call has the same memory sequencing semantics as * test_and_clear_bit(), or %-EINTR if the process received a signal and * the mode permitted wake up on that signal. */ static inline int wait_on_bit_lock(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) return 0; return out_of_line_wait_on_bit_lock(word, bit, bit_wait, mode); } /** * wait_on_bit_lock_io - wait for a bit to be cleared, then set it * @word: the address containing the bit being waited on * @bit: the bit of the word being waited on and set * @mode: the task state to sleep in * * Wait for the given bit in an unsigned long or bitmap (see * DECLARE_BITMAP()) to be cleared. The clearing of the bit must be * signalled with wake_up_bit(), often as clear_and_wake_up_bit(). As * soon as it is clear, atomically set it and return. * * This is similar to wait_on_bit_lock(), but calls io_schedule() instead * of schedule(). * * Returns zero if the bit was (eventually) found to be clear and was * set. Returns non-zero if a signal was delivered to the process and * the @mode allows that signal to wake the process. 
*/ static inline int wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) return 0; return out_of_line_wait_on_bit_lock(word, bit, bit_wait_io, mode); } /** * wait_on_bit_lock_action - wait for a bit to be cleared, then set it * @word: the address containing the bit being waited on * @bit: the bit of the word being waited on and set * @action: the function used to sleep, which may take special actions * @mode: the task state to sleep in * * This is similar to wait_on_bit_lock(), but calls @action() instead of * schedule() for the actual waiting. * * Returned value will be zero if the bit was successfully set in which * case the call has the same memory sequencing semantics as * test_and_clear_bit(), or the error code returned by @action if that * call returned non-zero. */ static inline int wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) return 0; return out_of_line_wait_on_bit_lock(word, bit, action, mode); } extern void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags); extern void wake_up_var(void *var); extern wait_queue_head_t *__var_waitqueue(void *p); #define ___wait_var_event(var, condition, state, exclusive, ret, cmd) \ ({ \ __label__ __out; \ struct wait_queue_head *__wq_head = __var_waitqueue(var); \ struct wait_bit_queue_entry __wbq_entry; \ long __ret = ret; /* explicit shadow */ \ \ init_wait_var_entry(&__wbq_entry, var, \ exclusive ? WQ_FLAG_EXCLUSIVE : 0); \ for (;;) { \ long __int = prepare_to_wait_event(__wq_head, \ &__wbq_entry.wq_entry, \ state); \ if (condition) \ break; \ \ if (___wait_is_interruptible(state) && __int) { \ __ret = __int; \ goto __out; \ } \ \ cmd; \ } \ finish_wait(__wq_head, &__wbq_entry.wq_entry); \ __out: __ret; \ }) #define __wait_var_event(var, condition) \ ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ schedule()) #define __wait_var_event_io(var, condition) \ ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ io_schedule()) /** * wait_var_event - wait for a variable to be updated and notified * @var: the address of variable being waited on * @condition: the condition to wait for * * Wait for a @condition to be true, only re-checking when a wake up is * received for the given @var (an arbitrary kernel address which need * not be directly related to the given condition, but usually is). * * The process will wait on a waitqueue selected by hash from a shared * pool. It will only be woken on a wake_up for the given address. * * The condition should normally use smp_load_acquire() or a similarly * ordered access to ensure that any changes to memory made before the * condition became true will be visible after the wait completes. */ #define wait_var_event(var, condition) \ do { \ might_sleep(); \ if (condition) \ break; \ __wait_var_event(var, condition); \ } while (0) /** * wait_var_event_io - wait for a variable to be updated and notified * @var: the address of variable being waited on * @condition: the condition to wait for * * Wait for an IO related @condition to be true, only re-checking when a * wake up is received for the given @var (an arbitrary kernel address * which need not be directly related to the given condition, but * usually is). * * The process will wait on a waitqueue selected by hash from a shared * pool. It will only be woken on a wake_up for the given address. 
* * This is similar to wait_var_event(), but calls io_schedule() instead * of schedule(). * * The condition should normally use smp_load_acquire() or a similarly * ordered access to ensure that any changes to memory made before the * condition became true will be visible after the wait completes. */ #define wait_var_event_io(var, condition) \ do { \ might_sleep(); \ if (condition) \ break; \ __wait_var_event_io(var, condition); \ } while (0) #define __wait_var_event_killable(var, condition) \ ___wait_var_event(var, condition, TASK_KILLABLE, 0, 0, \ schedule()) /** * wait_var_event_killable - wait for a variable to be updated and notified * @var: the address of variable being waited on * @condition: the condition to wait for * * Wait for a @condition to be true or a fatal signal to be received, * only re-checking the condition when a wake up is received for the given * @var (an arbitrary kernel address which need not be directly related * to the given condition, but usually is). * * This is similar to wait_var_event() but returns a value which is * 0 if the condition became true, or %-ERESTARTSYS if a fatal signal * was received. * * The condition should normally use smp_load_acquire() or a similarly * ordered access to ensure that any changes to memory made before the * condition became true will be visible after the wait completes. */ #define wait_var_event_killable(var, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_var_event_killable(var, condition); \ __ret; \ }) #define __wait_var_event_timeout(var, condition, timeout) \ ___wait_var_event(var, ___wait_cond_timeout(condition), \ TASK_UNINTERRUPTIBLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_var_event_timeout - wait for a variable to be updated or a timeout to expire * @var: the address of variable being waited on * @condition: the condition to wait for * @timeout: maximum time to wait in jiffies * * Wait for a @condition to be true or a timeout to expire, only * re-checking the condition when a wake up is received for the given * @var (an arbitrary kernel address which need not be directly related * to the given condition, but usually is). * * This is similar to wait_var_event() but returns a value which is 0 if * the timeout expired and the condition was still false, or the * remaining time left in the timeout (but at least 1) if the condition * was found to be true. * * The condition should normally use smp_load_acquire() or a similarly * ordered access to ensure that any changes to memory made before the * condition became true will be visible after the wait completes. */ #define wait_var_event_timeout(var, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_var_event_timeout(var, condition, timeout); \ __ret; \ }) #define __wait_var_event_interruptible(var, condition) \ ___wait_var_event(var, condition, TASK_INTERRUPTIBLE, 0, 0, \ schedule()) /** * wait_var_event_interruptible - wait for a variable to be updated and notified * @var: the address of variable being waited on * @condition: the condition to wait for * * Wait for a @condition to be true or a signal to be received, only * re-checking the condition when a wake up is received for the given * @var (an arbitrary kernel address which need not be directly related * to the given condition, but usually is). * * This is similar to wait_var_event() but returns a value which is 0 if * the condition became true, or %-ERESTARTSYS if a signal was received. 
 *
 * The condition should normally use smp_load_acquire() or a similarly
 * ordered access to ensure that any changes to memory made before the
 * condition became true will be visible after the wait completes.
 */
#define wait_var_event_interruptible(var, condition)			\
({									\
	int __ret = 0;							\
	might_sleep();							\
	if (!(condition))						\
		__ret = __wait_var_event_interruptible(var, condition);	\
	__ret;								\
})

/**
 * wait_var_event_any_lock - wait for a variable to be updated under a lock
 * @var: the address of the variable being waited on
 * @condition: condition to wait for
 * @lock: the object that is locked to protect updates to the variable
 * @type: prefix on lock and unlock operations
 * @state: waiting state, %TASK_UNINTERRUPTIBLE etc.
 *
 * Wait for a condition which can only be reliably tested while holding
 * a lock. The variables assessed in the condition will normally be updated
 * under the same lock, and the wake up should be signalled with
 * wake_up_var_locked() under the same lock.
 *
 * This is similar to wait_var_event(), but assumes a lock is held
 * while calling this function and while updating the variable.
 *
 * This must be called while the given lock is held and the lock will be
 * dropped when schedule() is called to wait for a wake up, and will be
 * reclaimed before testing the condition again. The functions used to
 * unlock and lock the object are constructed by appending _unlock and _lock
 * to @type.
 *
 * Return %-ERESTARTSYS if a signal arrives which is allowed to interrupt
 * the wait according to @state.
 */
#define wait_var_event_any_lock(var, condition, lock, type, state)	\
({									\
	int __ret = 0;							\
	if (!(condition))						\
		__ret = ___wait_var_event(var, condition, state, 0, 0,	\
					  type ## _unlock(lock);	\
					  schedule();			\
					  type ## _lock(lock));		\
	__ret;								\
})

/**
 * wait_var_event_spinlock - wait for a variable to be updated under a spinlock
 * @var: the address of the variable being waited on
 * @condition: condition to wait for
 * @lock: the spinlock which protects updates to the variable
 *
 * Wait for a condition which can only be reliably tested while holding
 * a spinlock. The variables assessed in the condition will normally be updated
 * under the same spinlock, and the wake up should be signalled with
 * wake_up_var_locked() under the same spinlock.
 *
 * This is similar to wait_var_event(), but assumes a spinlock is held
 * while calling this function and while updating the variable.
 *
 * This must be called while the given lock is held and the lock will be
 * dropped when schedule() is called to wait for a wake up, and will be
 * reclaimed before testing the condition again.
 */
#define wait_var_event_spinlock(var, condition, lock)			\
	wait_var_event_any_lock(var, condition, lock, spin, TASK_UNINTERRUPTIBLE)

/**
 * wait_var_event_mutex - wait for a variable to be updated under a mutex
 * @var: the address of the variable being waited on
 * @condition: condition to wait for
 * @lock: the mutex which protects updates to the variable
 *
 * Wait for a condition which can only be reliably tested while holding
 * a mutex. The variables assessed in the condition will normally be
 * updated under the same mutex, and the wake up should be signalled
 * with wake_up_var_locked() under the same mutex.
 *
 * This is similar to wait_var_event(), but assumes a mutex is held
 * while calling this function and while updating the variable.
 *
 * This must be called while the given mutex is held and the mutex will be
 * dropped when schedule() is called to wait for a wake up, and will be
 * reclaimed before testing the condition again.
 */
#define wait_var_event_mutex(var, condition, lock)			\
	wait_var_event_any_lock(var, condition, lock, mutex, TASK_UNINTERRUPTIBLE)

/**
 * wake_up_var_protected - wake up waiters for a variable asserting that it is safe
 * @var: the address of the variable being waited on
 * @cond: the condition which affirms this is safe
 *
 * When waking waiters which use wait_var_event_any_lock() the waker must be
 * holding the relevant lock to avoid races. This version of wake_up_var()
 * asserts that the relevant lock is held and so no barrier is needed.
 * The @cond is only tested when CONFIG_LOCKDEP is enabled.
 */
#define wake_up_var_protected(var, cond)				\
do {									\
	lockdep_assert(cond);						\
	wake_up_var(var);						\
} while (0)

/**
 * wake_up_var_locked - wake up waiters for a variable while holding a spinlock or mutex
 * @var: the address of the variable being waited on
 * @lock: The spinlock or mutex that protects the variable
 *
 * Send a wake up for the given variable which should be waited for with
 * wait_var_event_spinlock() or wait_var_event_mutex(). Unlike wake_up_var(),
 * no extra barriers are needed as the locking provides sufficient sequencing.
 */
#define wake_up_var_locked(var, lock)					\
	wake_up_var_protected(var, lockdep_is_held(lock))

/**
 * clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit
 * @bit: the bit of the word being waited on
 * @word: the address containing the bit being waited on
 *
 * The designated bit is cleared and any tasks waiting in wait_on_bit()
 * or similar will be woken. This call has RELEASE semantics so that
 * any changes to memory made before this call are guaranteed to be visible
 * after the corresponding wait_on_bit() completes.
 */
static inline void clear_and_wake_up_bit(int bit, unsigned long *word)
{
	clear_bit_unlock(bit, word);
	/* See wake_up_bit() for which memory barrier you need to use. */
	smp_mb__after_atomic();
	wake_up_bit(word, bit);
}

/**
 * test_and_clear_wake_up_bit - clear a bit if it was set: wake up anyone waiting on that bit
 * @bit: the bit of the word being waited on
 * @word: the address of memory containing that bit
 *
 * If the bit is set and can be atomically cleared, any tasks waiting in
 * wait_on_bit() or similar will be woken. This call has the same
 * complete ordering semantics as test_and_clear_bit(). Any changes to
 * memory made before this call are guaranteed to be visible after the
 * corresponding wait_on_bit() completes.
 *
 * Returns %true if the bit was successfully cleared and the wake up was sent.
 */
static inline bool test_and_clear_wake_up_bit(int bit, unsigned long *word)
{
	if (!test_and_clear_bit(bit, word))
		return false;
	/* no extra barrier required */
	wake_up_bit(word, bit);
	return true;
}

/**
 * atomic_dec_and_wake_up - decrement an atomic_t and if zero, wake up waiters
 * @var: the variable to dec and test
 *
 * Decrements the atomic variable and if it reaches zero, sends a wake_up to any
 * processes waiting on the variable.
 *
 * This function has the same complete ordering semantics as atomic_dec_and_test.
 *
 * Returns %true if the variable reaches zero and the wake up was sent.
 */
static inline bool atomic_dec_and_wake_up(atomic_t *var)
{
	if (!atomic_dec_and_test(var))
		return false;
	/* No extra barrier required */
	wake_up_var(var);
	return true;
}

/**
 * store_release_wake_up - update a variable and send a wake_up
 * @var: the address of the variable to be updated and woken
 * @val: the value to store in the variable.
 *
 * Store the given value in the variable and send a wake up to any tasks
 * waiting on the variable.
All necessary barriers are included to ensure * the task calling wait_var_event() sees the new value and all values * written to memory before this call. */ #define store_release_wake_up(var, val) \ do { \ smp_store_release(var, val); \ smp_mb(); \ wake_up_var(var); \ } while (0) #endif /* _LINUX_WAIT_BIT_H */ |
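/*
 * Editor's illustrative sketch (not part of wait_bit.h): typical use of the
 * bit-wait and var-wait APIs declared above. The struct, the MY_FLAG_BUSY bit
 * number and the helper names are assumptions made up for this example; only
 * the wait_on_bit()/clear_and_wake_up_bit() and wait_var_event()/
 * atomic_dec_and_wake_up() calls come from the header.
 */
#include <linux/wait_bit.h>
#include <linux/atomic.h>
#include <linux/sched.h>

#define MY_FLAG_BUSY	0	/* bit number inside ->flags */

struct my_object {
	unsigned long	flags;
	atomic_t	pending_io;
};

/* Waiter: sleep until another context clears MY_FLAG_BUSY. */
static int my_wait_until_idle(struct my_object *obj)
{
	return wait_on_bit(&obj->flags, MY_FLAG_BUSY, TASK_INTERRUPTIBLE);
}

/* Completer: clear the bit and wake anyone sleeping in wait_on_bit(). */
static void my_mark_idle(struct my_object *obj)
{
	clear_and_wake_up_bit(MY_FLAG_BUSY, &obj->flags);
}

/* Waiter on an arbitrary variable: wait for all pending I/O to drain. */
static void my_wait_for_io(struct my_object *obj)
{
	wait_var_event(&obj->pending_io, atomic_read(&obj->pending_io) == 0);
}

/* Completer side: drop one reference, waking waiters when it hits zero. */
static bool my_io_done(struct my_object *obj)
{
	return atomic_dec_and_wake_up(&obj->pending_io);
}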
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Handle firewalling
 * Linux ethernet bridge
 *
 * Authors:
 * Lennert Buytenhek <buytenh@gnu.org>
 * Bart De Schuymer <bdschuym@pandora.be>
 *
 * Lennert dedicates this file to Kerstin Wurdinger.
*/ #include <linux/module.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/ip.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/if_pppox.h> #include <linux/ppp_defs.h> #include <linux/netfilter_bridge.h> #include <uapi/linux/netfilter_bridge.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_arp.h> #include <linux/in_route.h> #include <linux/rculist.h> #include <linux/inetdevice.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/dst_metadata.h> #include <net/route.h> #include <net/netfilter/br_netfilter.h> #include <net/netns/generic.h> #include <net/inet_dscp.h> #include <linux/uaccess.h> #include "br_private.h" #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack_core.h> #endif static unsigned int brnf_net_id __read_mostly; struct brnf_net { bool enabled; #ifdef CONFIG_SYSCTL struct ctl_table_header *ctl_hdr; #endif /* default value is 1 */ int call_iptables; int call_ip6tables; int call_arptables; /* default value is 0 */ int filter_vlan_tagged; int filter_pppoe_tagged; int pass_vlan_indev; }; #define IS_IP(skb) \ (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP)) #define IS_IPV6(skb) \ (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IPV6)) #define IS_ARP(skb) \ (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_ARP)) static inline __be16 vlan_proto(const struct sk_buff *skb) { if (skb_vlan_tag_present(skb)) return skb->protocol; else if (skb->protocol == htons(ETH_P_8021Q)) return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto; else return 0; } static inline bool is_vlan_ip(const struct sk_buff *skb, const struct net *net) { struct brnf_net *brnet = net_generic(net, brnf_net_id); return vlan_proto(skb) == htons(ETH_P_IP) && brnet->filter_vlan_tagged; } static inline bool is_vlan_ipv6(const struct sk_buff *skb, const struct net *net) { struct brnf_net *brnet = net_generic(net, brnf_net_id); return vlan_proto(skb) == htons(ETH_P_IPV6) && brnet->filter_vlan_tagged; } static inline bool is_vlan_arp(const struct sk_buff *skb, const struct net *net) { struct brnf_net *brnet = net_generic(net, brnf_net_id); return vlan_proto(skb) == htons(ETH_P_ARP) && brnet->filter_vlan_tagged; } static inline __be16 pppoe_proto(const struct sk_buff *skb) { return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN + sizeof(struct pppoe_hdr))); } static inline bool is_pppoe_ip(const struct sk_buff *skb, const struct net *net) { struct brnf_net *brnet = net_generic(net, brnf_net_id); return skb->protocol == htons(ETH_P_PPP_SES) && pppoe_proto(skb) == htons(PPP_IP) && brnet->filter_pppoe_tagged; } static inline bool is_pppoe_ipv6(const struct sk_buff *skb, const struct net *net) { struct brnf_net *brnet = net_generic(net, brnf_net_id); return skb->protocol == htons(ETH_P_PPP_SES) && pppoe_proto(skb) == htons(PPP_IPV6) && brnet->filter_pppoe_tagged; } /* largest possible L2 header, see br_nf_dev_queue_xmit() */ #define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN) struct brnf_frag_data { local_lock_t bh_lock; char mac[NF_BRIDGE_MAX_MAC_HEADER_LENGTH]; u8 encap_size; u8 size; u16 vlan_tci; __be16 vlan_proto; }; static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; static void nf_bridge_info_free(struct sk_buff *skb) 
{ skb_ext_del(skb, SKB_EXT_BRIDGE_NF); } static inline struct net_device *bridge_parent(const struct net_device *dev) { struct net_bridge_port *port; port = br_port_get_rcu(dev); return port ? port->br->dev : NULL; } static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb) { return skb_ext_add(skb, SKB_EXT_BRIDGE_NF); } unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb) { switch (skb->protocol) { case __cpu_to_be16(ETH_P_8021Q): return VLAN_HLEN; case __cpu_to_be16(ETH_P_PPP_SES): return PPPOE_SES_HLEN; default: return 0; } } static inline void nf_bridge_pull_encap_header(struct sk_buff *skb) { unsigned int len = nf_bridge_encap_header_len(skb); skb_pull(skb, len); skb->network_header += len; } static inline void nf_bridge_pull_encap_header_rcsum(struct sk_buff *skb) { unsigned int len = nf_bridge_encap_header_len(skb); skb_pull_rcsum(skb, len); skb->network_header += len; } /* When handing a packet over to the IP layer * check whether we have a skb that is in the * expected format */ static int br_validate_ipv4(struct net *net, struct sk_buff *skb) { const struct iphdr *iph; u32 len; if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto inhdr_error; iph = ip_hdr(skb); /* Basic sanity checks */ if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error; iph = ip_hdr(skb); if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto csum_error; len = skb_ip_totlen(skb); if (skb->len < len) { __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error; if (pskb_trim_rcsum(skb, len)) { __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto drop; } memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); /* We should really parse IP options here but until * somebody who actually uses IP options complains to * us we'll just silently ignore the options because * we're lazy! */ return 0; csum_error: __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS); inhdr_error: __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); drop: return -1; } void nf_bridge_update_protocol(struct sk_buff *skb) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); switch (nf_bridge->orig_proto) { case BRNF_PROTO_8021Q: skb->protocol = htons(ETH_P_8021Q); break; case BRNF_PROTO_PPPOE: skb->protocol = htons(ETH_P_PPP_SES); break; case BRNF_PROTO_UNCHANGED: break; } } /* Obtain the correct destination MAC address, while preserving the original * source MAC address. If we already know this address, we just copy it. If we * don't, we use the neighbour framework to find out. In both cases, we make * sure that br_handle_frame_finish() is called afterwards. */ int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_buff *skb) { struct neighbour *neigh; struct dst_entry *dst; skb->dev = bridge_parent(skb->dev); if (!skb->dev) goto free_skb; dst = skb_dst(skb); neigh = dst_neigh_lookup_skb(dst, skb); if (neigh) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); int ret; if ((READ_ONCE(neigh->nud_state) & NUD_CONNECTED) && READ_ONCE(neigh->hh.hh_len)) { struct net_device *br_indev; br_indev = nf_bridge_get_physindev(skb, net); if (!br_indev) { neigh_release(neigh); goto free_skb; } neigh_hh_bridge(&neigh->hh, skb); skb->dev = br_indev; ret = br_handle_frame_finish(net, sk, skb); } else { /* the neighbour function below overwrites the complete * MAC header, so we save the Ethernet source address and * protocol number. 
*/ skb_copy_from_linear_data_offset(skb, -(ETH_HLEN-ETH_ALEN), nf_bridge->neigh_header, ETH_HLEN-ETH_ALEN); /* tell br_dev_xmit to continue with forwarding */ nf_bridge->bridged_dnat = 1; /* FIXME Need to refragment */ ret = READ_ONCE(neigh->output)(neigh, skb); } neigh_release(neigh); return ret; } free_skb: kfree_skb(skb); return 0; } static inline bool br_nf_ipv4_daddr_was_changed(const struct sk_buff *skb, const struct nf_bridge_info *nf_bridge) { return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr; } /* This requires some explaining. If DNAT has taken place, * we will need to fix up the destination Ethernet address. * This is also true when SNAT takes place (for the reply direction). * * There are two cases to consider: * 1. The packet was DNAT'ed to a device in the same bridge * port group as it was received on. We can still bridge * the packet. * 2. The packet was DNAT'ed to a different device, either * a non-bridged device or another bridge port group. * The packet will need to be routed. * * The correct way of distinguishing between these two cases is to * call ip_route_input() and to look at skb->dst->dev, which is * changed to the destination device if ip_route_input() succeeds. * * Let's first consider the case that ip_route_input() succeeds: * * If the output device equals the logical bridge device the packet * came in on, we can consider this bridging. The corresponding MAC * address will be obtained in br_nf_pre_routing_finish_bridge. * Otherwise, the packet is considered to be routed and we just * change the destination MAC address so that the packet will * later be passed up to the IP stack to be routed. For a redirected * packet, ip_route_input() will give back the localhost as output device, * which differs from the bridge device. * * Let's now consider the case that ip_route_input() fails: * * This can be because the destination address is martian, in which case * the packet will be dropped. * If IP forwarding is disabled, ip_route_input() will fail, while * ip_route_output_key() can return success. The source * address for ip_route_output_key() is set to zero, so ip_route_output_key() * thinks we're handling a locally generated packet and won't care * if IP forwarding is enabled. If the output device equals the logical bridge * device, we proceed as if ip_route_input() succeeded. If it differs from the * logical bridge port or if ip_route_output_key() fails we drop the packet. 
*/ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct net_device *dev = skb->dev, *br_indev; const struct iphdr *iph = ip_hdr(skb); enum skb_drop_reason reason; struct rtable *rt; br_indev = nf_bridge_get_physindev(skb, net); if (!br_indev) { kfree_skb(skb); return 0; } nf_bridge->frag_max_size = IPCB(skb)->frag_max_size; if (nf_bridge->pkt_otherhost) { skb->pkt_type = PACKET_OTHERHOST; nf_bridge->pkt_otherhost = false; } nf_bridge->in_prerouting = 0; if (br_nf_ipv4_daddr_was_changed(skb, nf_bridge)) { reason = ip_route_input(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), dev); if (reason) { kfree_skb_reason(skb, reason); return 0; } else { if (skb_dst(skb)->dev == dev) { skb->dev = br_indev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL, br_nf_pre_routing_finish_bridge); return 0; } ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); skb->pkt_type = PACKET_HOST; } } else { rt = bridge_parent_rtable(br_indev); if (!rt) { kfree_skb(skb); return 0; } skb_dst_drop(skb); skb_dst_set_noref(skb, &rt->dst); } skb->dev = br_indev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL, br_handle_frame_finish); return 0; } static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct net_device *dev, const struct net *net) { struct net_device *vlan, *br; struct brnf_net *brnet = net_generic(net, brnf_net_id); br = bridge_parent(dev); if (brnet->pass_vlan_indev == 0 || !skb_vlan_tag_present(skb)) return br; vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto, skb_vlan_tag_get(skb) & VLAN_VID_MASK); return vlan ? vlan : br; } /* Some common code for IPv4/IPv6 */ struct net_device *setup_pre_routing(struct sk_buff *skb, const struct net *net) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); if (skb->pkt_type == PACKET_OTHERHOST) { skb->pkt_type = PACKET_HOST; nf_bridge->pkt_otherhost = true; } nf_bridge->in_prerouting = 1; nf_bridge->physinif = skb->dev->ifindex; skb->dev = brnf_get_logical_dev(skb, skb->dev, net); if (skb->protocol == htons(ETH_P_8021Q)) nf_bridge->orig_proto = BRNF_PROTO_8021Q; else if (skb->protocol == htons(ETH_P_PPP_SES)) nf_bridge->orig_proto = BRNF_PROTO_PPPOE; /* Must drop socket now because of tproxy. */ skb_orphan(skb); return skb->dev; } /* Direct IPv6 traffic to br_nf_pre_routing_ipv6. * Replicate the checks that IPv4 does on packet reception. * Set skb->dev to the bridge device (i.e. parent of the * receiving device) to make netfilter happy, the REDIRECT * target in particular. Save the original destination IP * address to be able to detect DNAT afterwards. 
*/ static unsigned int br_nf_pre_routing(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_bridge_info *nf_bridge; struct net_bridge_port *p; struct net_bridge *br; __u32 len = nf_bridge_encap_header_len(skb); struct brnf_net *brnet; if (unlikely(!pskb_may_pull(skb, len))) return NF_DROP_REASON(skb, SKB_DROP_REASON_PKT_TOO_SMALL, 0); p = br_port_get_rcu(state->in); if (p == NULL) return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0); br = p->br; brnet = net_generic(state->net, brnf_net_id); if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) || is_pppoe_ipv6(skb, state->net)) { if (!brnet->call_ip6tables && !br_opt_get(br, BROPT_NF_CALL_IP6TABLES)) return NF_ACCEPT; if (!ipv6_mod_enabled()) { pr_warn_once("Module ipv6 is disabled, so call_ip6tables is not supported."); return NF_DROP_REASON(skb, SKB_DROP_REASON_IPV6DISABLED, 0); } nf_bridge_pull_encap_header_rcsum(skb); return br_nf_pre_routing_ipv6(priv, skb, state); } if (!brnet->call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES)) return NF_ACCEPT; if (!IS_IP(skb) && !is_vlan_ip(skb, state->net) && !is_pppoe_ip(skb, state->net)) return NF_ACCEPT; nf_bridge_pull_encap_header_rcsum(skb); if (br_validate_ipv4(state->net, skb)) return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0); if (!nf_bridge_alloc(skb)) return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0); if (!setup_pre_routing(skb, state->net)) return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0); nf_bridge = nf_bridge_info_get(skb); nf_bridge->ipv4_daddr = ip_hdr(skb)->daddr; skb->protocol = htons(ETH_P_IP); skb->transport_header = skb->network_header + ip_hdr(skb)->ihl * 4; NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->net, state->sk, skb, skb->dev, NULL, br_nf_pre_routing_finish); return NF_STOLEN; } #if IS_ENABLED(CONFIG_NF_CONNTRACK) /* conntracks' nf_confirm logic cannot handle cloned skbs referencing * the same nf_conn entry, which will happen for multicast (broadcast) * Frames on bridges. * * Example: * macvlan0 * br0 * ethX ethY * * ethX (or Y) receives multicast or broadcast packet containing * an IP packet, not yet in conntrack table. * * 1. skb passes through bridge and fake-ip (br_netfilter)Prerouting. * -> skb->_nfct now references a unconfirmed entry * 2. skb is broad/mcast packet. bridge now passes clones out on each bridge * interface. * 3. skb gets passed up the stack. * 4. In macvlan case, macvlan driver retains clone(s) of the mcast skb * and schedules a work queue to send them out on the lower devices. * * The clone skb->_nfct is not a copy, it is the same entry as the * original skb. The macvlan rx handler then returns RX_HANDLER_PASS. * 5. Normal conntrack hooks (in NF_INET_LOCAL_IN) confirm the orig skb. * * The Macvlan broadcast worker and normal confirm path will race. * * This race will not happen if step 2 already confirmed a clone. In that * case later steps perform skb_clone() with skb->_nfct already confirmed (in * hash table). This works fine. * * But such confirmation won't happen when eb/ip/nftables rules dropped the * packets before they reached the nf_confirm step in postrouting. * * Work around this problem by explicit confirmation of the entry at * LOCAL_IN time, before upper layer has a chance to clone the unconfirmed * entry. 
* */ static unsigned int br_nf_local_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { bool promisc = BR_INPUT_SKB_CB(skb)->promisc; struct nf_conntrack *nfct = skb_nfct(skb); const struct nf_ct_hook *ct_hook; struct nf_conn *ct; int ret; if (promisc) { nf_reset_ct(skb); return NF_ACCEPT; } if (!nfct || skb->pkt_type == PACKET_HOST) return NF_ACCEPT; ct = container_of(nfct, struct nf_conn, ct_general); if (likely(nf_ct_is_confirmed(ct))) return NF_ACCEPT; if (WARN_ON_ONCE(refcount_read(&nfct->use) != 1)) { nf_reset_ct(skb); return NF_ACCEPT; } WARN_ON_ONCE(skb_shared(skb)); /* We can't call nf_confirm here, it would create a dependency * on nf_conntrack module. */ ct_hook = rcu_dereference(nf_ct_hook); if (!ct_hook) { skb->_nfct = 0ul; nf_conntrack_put(nfct); return NF_ACCEPT; } nf_bridge_pull_encap_header(skb); ret = ct_hook->confirm(skb); switch (ret & NF_VERDICT_MASK) { case NF_STOLEN: return NF_STOLEN; default: nf_bridge_push_encap_header(skb); break; } return ret; } #endif /* PF_BRIDGE/FORWARD *************************************************/ static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct net_device *in; if (!IS_ARP(skb) && !is_vlan_arp(skb, net)) { if (skb->protocol == htons(ETH_P_IP)) nf_bridge->frag_max_size = IPCB(skb)->frag_max_size; if (skb->protocol == htons(ETH_P_IPV6)) nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size; in = nf_bridge_get_physindev(skb, net); if (!in) { kfree_skb(skb); return 0; } if (nf_bridge->pkt_otherhost) { skb->pkt_type = PACKET_OTHERHOST; nf_bridge->pkt_otherhost = false; } nf_bridge_update_protocol(skb); } else { in = *((struct net_device **)(skb->cb)); } nf_bridge_push_encap_header(skb); br_nf_hook_thresh(NF_BR_FORWARD, net, sk, skb, in, skb->dev, br_forward_finish); return 0; } static unsigned int br_nf_forward_ip(struct sk_buff *skb, const struct nf_hook_state *state, u8 pf) { struct nf_bridge_info *nf_bridge; struct net_device *parent; nf_bridge = nf_bridge_info_get(skb); if (!nf_bridge) return NF_ACCEPT; /* Need exclusive nf_bridge_info since we might have multiple * different physoutdevs. 
*/ if (!nf_bridge_unshare(skb)) return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0); nf_bridge = nf_bridge_info_get(skb); if (!nf_bridge) return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0); parent = bridge_parent(state->out); if (!parent) return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0); nf_bridge_pull_encap_header(skb); if (skb->pkt_type == PACKET_OTHERHOST) { skb->pkt_type = PACKET_HOST; nf_bridge->pkt_otherhost = true; } if (pf == NFPROTO_IPV4) { if (br_validate_ipv4(state->net, skb)) return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0); IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; skb->protocol = htons(ETH_P_IP); } else if (pf == NFPROTO_IPV6) { if (br_validate_ipv6(state->net, skb)) return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0); IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; skb->protocol = htons(ETH_P_IPV6); } else { WARN_ON_ONCE(1); return NF_DROP; } nf_bridge->physoutdev = skb->dev; NF_HOOK(pf, NF_INET_FORWARD, state->net, NULL, skb, brnf_get_logical_dev(skb, state->in, state->net), parent, br_nf_forward_finish); return NF_STOLEN; } static unsigned int br_nf_forward_arp(struct sk_buff *skb, const struct nf_hook_state *state) { struct net_bridge_port *p; struct net_bridge *br; struct net_device **d = (struct net_device **)(skb->cb); struct brnf_net *brnet; p = br_port_get_rcu(state->out); if (p == NULL) return NF_ACCEPT; br = p->br; brnet = net_generic(state->net, brnf_net_id); if (!brnet->call_arptables && !br_opt_get(br, BROPT_NF_CALL_ARPTABLES)) return NF_ACCEPT; if (is_vlan_arp(skb, state->net)) nf_bridge_pull_encap_header(skb); if (unlikely(!pskb_may_pull(skb, sizeof(struct arphdr)))) return NF_DROP_REASON(skb, SKB_DROP_REASON_PKT_TOO_SMALL, 0); if (arp_hdr(skb)->ar_pln != 4) { if (is_vlan_arp(skb, state->net)) nf_bridge_push_encap_header(skb); return NF_ACCEPT; } *d = state->in; NF_HOOK(NFPROTO_ARP, NF_ARP_FORWARD, state->net, state->sk, skb, state->in, state->out, br_nf_forward_finish); return NF_STOLEN; } /* This is the 'purely bridged' case. For IP, we pass the packet to * netfilter with indev and outdev set to the bridge device, * but we are still able to filter on the 'real' indev/outdev * because of the physdev module. For ARP, indev and outdev are the * bridge ports. 
*/ static unsigned int br_nf_forward(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { if (IS_IP(skb) || is_vlan_ip(skb, state->net) || is_pppoe_ip(skb, state->net)) return br_nf_forward_ip(skb, state, NFPROTO_IPV4); if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) || is_pppoe_ipv6(skb, state->net)) return br_nf_forward_ip(skb, state, NFPROTO_IPV6); if (IS_ARP(skb) || is_vlan_arp(skb, state->net)) return br_nf_forward_arp(skb, state); return NF_ACCEPT; } static int br_nf_push_frag_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { struct brnf_frag_data *data; int err; data = this_cpu_ptr(&brnf_frag_data_storage); err = skb_cow_head(skb, data->size); if (err) { kfree_skb(skb); return 0; } if (data->vlan_proto) __vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci); skb_copy_to_linear_data_offset(skb, -data->size, data->mac, data->size); __skb_push(skb, data->encap_size); nf_bridge_info_free(skb); return br_dev_queue_push_xmit(net, sk, skb); } static int br_nf_ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { unsigned int mtu = ip_skb_dst_mtu(sk, skb); struct iphdr *iph = ip_hdr(skb); if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size > mtu))) { IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return -EMSGSIZE; } return ip_do_fragment(net, sk, skb, output); } static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); if (nf_bridge->orig_proto == BRNF_PROTO_PPPOE) return PPPOE_SES_HLEN; return 0; } static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); unsigned int mtu, mtu_reserved; int ret; mtu_reserved = nf_bridge_mtu_reduction(skb); mtu = skb->dev->mtu; if (nf_bridge->pkt_otherhost) { skb->pkt_type = PACKET_OTHERHOST; nf_bridge->pkt_otherhost = false; } if (nf_bridge->frag_max_size && nf_bridge->frag_max_size < mtu) mtu = nf_bridge->frag_max_size; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); if (skb_is_gso(skb) || skb->len + mtu_reserved <= mtu) { nf_bridge_info_free(skb); return br_dev_queue_push_xmit(net, sk, skb); } /* Fragmentation on metadata/template dst is not supported */ if (unlikely(!skb_valid_dst(skb))) goto drop; /* This is wrong! We should preserve the original fragment * boundaries by preserving frag_list rather than refragmenting. 
*/ if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) && skb->protocol == htons(ETH_P_IP)) { struct brnf_frag_data *data; if (br_validate_ipv4(net, skb)) goto drop; IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; local_lock_nested_bh(&brnf_frag_data_storage.bh_lock); data = this_cpu_ptr(&brnf_frag_data_storage); if (skb_vlan_tag_present(skb)) { data->vlan_tci = skb->vlan_tci; data->vlan_proto = skb->vlan_proto; } else { data->vlan_proto = 0; } data->encap_size = nf_bridge_encap_header_len(skb); data->size = ETH_HLEN + data->encap_size; skb_copy_from_linear_data_offset(skb, -data->size, data->mac, data->size); ret = br_nf_ip_fragment(net, sk, skb, br_nf_push_frag_xmit); local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock); return ret; } if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) && skb->protocol == htons(ETH_P_IPV6)) { struct brnf_frag_data *data; if (br_validate_ipv6(net, skb)) goto drop; IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; local_lock_nested_bh(&brnf_frag_data_storage.bh_lock); data = this_cpu_ptr(&brnf_frag_data_storage); data->encap_size = nf_bridge_encap_header_len(skb); data->size = ETH_HLEN + data->encap_size; skb_copy_from_linear_data_offset(skb, -data->size, data->mac, data->size); ret = ip6_fragment(net, sk, skb, br_nf_push_frag_xmit); local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock); return ret; } nf_bridge_info_free(skb); return br_dev_queue_push_xmit(net, sk, skb); drop: kfree_skb(skb); return 0; } /* PF_BRIDGE/POST_ROUTING ********************************************/ static unsigned int br_nf_post_routing(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct net_device *realoutdev = bridge_parent(skb->dev); u_int8_t pf; /* if nf_bridge is set, but ->physoutdev is NULL, this packet came in * on a bridge, but was delivered locally and is now being routed: * * POST_ROUTING was already invoked from the ip stack. */ if (!nf_bridge || !nf_bridge->physoutdev) return NF_ACCEPT; if (!realoutdev) return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0); if (IS_IP(skb) || is_vlan_ip(skb, state->net) || is_pppoe_ip(skb, state->net)) pf = NFPROTO_IPV4; else if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) || is_pppoe_ipv6(skb, state->net)) pf = NFPROTO_IPV6; else return NF_ACCEPT; if (skb->pkt_type == PACKET_OTHERHOST) { skb->pkt_type = PACKET_HOST; nf_bridge->pkt_otherhost = true; } nf_bridge_pull_encap_header(skb); if (pf == NFPROTO_IPV4) skb->protocol = htons(ETH_P_IP); else skb->protocol = htons(ETH_P_IPV6); NF_HOOK(pf, NF_INET_POST_ROUTING, state->net, state->sk, skb, NULL, realoutdev, br_nf_dev_queue_xmit); return NF_STOLEN; } /* IP/SABOTAGE *****************************************************/ /* Don't hand locally destined packets to PF_INET(6)/PRE_ROUTING * for the second time. */ static unsigned int ip_sabotage_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); if (nf_bridge) { if (nf_bridge->sabotage_in_done) return NF_ACCEPT; if (!nf_bridge->in_prerouting && !netif_is_l3_master(skb->dev) && !netif_is_l3_slave(skb->dev)) { nf_bridge->sabotage_in_done = 1; state->okfn(state->net, state->sk, skb); return NF_STOLEN; } } return NF_ACCEPT; } /* This is called when br_netfilter has called into iptables/netfilter, * and DNAT has taken place on a bridge-forwarded packet. * * neigh->output has created a new MAC header, with local br0 MAC * as saddr. 
* * This restores the original MAC saddr of the bridged packet * before invoking bridge forward logic to transmit the packet. */ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct net_device *br_indev; br_indev = nf_bridge_get_physindev(skb, dev_net(skb->dev)); if (!br_indev) { kfree_skb(skb); return; } skb_pull(skb, ETH_HLEN); nf_bridge->bridged_dnat = 0; BUILD_BUG_ON(sizeof(nf_bridge->neigh_header) != (ETH_HLEN - ETH_ALEN)); skb_copy_to_linear_data_offset(skb, -(ETH_HLEN - ETH_ALEN), nf_bridge->neigh_header, ETH_HLEN - ETH_ALEN); skb->dev = br_indev; nf_bridge->physoutdev = NULL; br_handle_frame_finish(dev_net(skb->dev), NULL, skb); } static int br_nf_dev_xmit(struct sk_buff *skb) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); if (nf_bridge && nf_bridge->bridged_dnat) { br_nf_pre_routing_finish_bridge_slow(skb); return 1; } return 0; } static const struct nf_br_ops br_ops = { .br_dev_xmit_hook = br_nf_dev_xmit, }; /* For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because * br_dev_queue_push_xmit is called afterwards */ static const struct nf_hook_ops br_nf_ops[] = { { .hook = br_nf_pre_routing, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_PRE_ROUTING, .priority = NF_BR_PRI_BRNF, }, #if IS_ENABLED(CONFIG_NF_CONNTRACK) { .hook = br_nf_local_in, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_LOCAL_IN, .priority = NF_BR_PRI_LAST, }, #endif { .hook = br_nf_forward, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_FORWARD, .priority = NF_BR_PRI_BRNF, }, { .hook = br_nf_post_routing, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_POST_ROUTING, .priority = NF_BR_PRI_LAST, }, { .hook = ip_sabotage_in, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_FIRST, }, { .hook = ip_sabotage_in, .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_FIRST, }, }; static int brnf_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct brnf_net *brnet; struct net *net; int ret; if (event != NETDEV_REGISTER || !netif_is_bridge_master(dev)) return NOTIFY_DONE; ASSERT_RTNL(); net = dev_net(dev); brnet = net_generic(net, brnf_net_id); if (brnet->enabled) return NOTIFY_OK; ret = nf_register_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops)); if (ret) return NOTIFY_BAD; brnet->enabled = true; return NOTIFY_OK; } static struct notifier_block brnf_notifier __read_mostly = { .notifier_call = brnf_device_event, }; /* recursively invokes nf_hook_slow (again), skipping already-called * hooks (< NF_BR_PRI_BRNF). * * Called with rcu read lock held. */ int br_nf_hook_thresh(unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { const struct nf_hook_entries *e; struct nf_hook_state state; struct nf_hook_ops **ops; unsigned int i; int ret; e = rcu_dereference(net->nf.hooks_bridge[hook]); if (!e) return okfn(net, sk, skb); ops = nf_hook_entries_get_hook_ops(e); for (i = 0; i < e->num_hook_entries; i++) { /* These hooks have already been called */ if (ops[i]->priority < NF_BR_PRI_BRNF) continue; /* These hooks have not been called yet, run them. */ if (ops[i]->priority > NF_BR_PRI_BRNF) break; /* take a closer look at NF_BR_PRI_BRNF. */ if (ops[i]->hook == br_nf_pre_routing) { /* This hook diverted the skb to this function, * hooks after this have not been run yet. 
*/ i++; break; } } nf_hook_state_init(&state, hook, NFPROTO_BRIDGE, indev, outdev, sk, net, okfn); ret = nf_hook_slow(skb, &state, e, i); if (ret == 1) ret = okfn(net, sk, skb); return ret; } #ifdef CONFIG_SYSCTL static int brnf_sysctl_call_tables(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (write && *(int *)(ctl->data)) *(int *)(ctl->data) = 1; return ret; } static struct ctl_table brnf_table[] = { { .procname = "bridge-nf-call-arptables", .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-call-iptables", .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-call-ip6tables", .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-filter-vlan-tagged", .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-filter-pppoe-tagged", .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-pass-vlan-input-dev", .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, }; static inline void br_netfilter_sysctl_default(struct brnf_net *brnf) { brnf->call_iptables = 1; brnf->call_ip6tables = 1; brnf->call_arptables = 1; brnf->filter_vlan_tagged = 0; brnf->filter_pppoe_tagged = 0; brnf->pass_vlan_indev = 0; } static int br_netfilter_sysctl_init_net(struct net *net) { struct ctl_table *table = brnf_table; struct brnf_net *brnet; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(brnf_table), GFP_KERNEL); if (!table) return -ENOMEM; } brnet = net_generic(net, brnf_net_id); table[0].data = &brnet->call_arptables; table[1].data = &brnet->call_iptables; table[2].data = &brnet->call_ip6tables; table[3].data = &brnet->filter_vlan_tagged; table[4].data = &brnet->filter_pppoe_tagged; table[5].data = &brnet->pass_vlan_indev; br_netfilter_sysctl_default(brnet); brnet->ctl_hdr = register_net_sysctl_sz(net, "net/bridge", table, ARRAY_SIZE(brnf_table)); if (!brnet->ctl_hdr) { if (!net_eq(net, &init_net)) kfree(table); return -ENOMEM; } return 0; } static void br_netfilter_sysctl_exit_net(struct net *net, struct brnf_net *brnet) { const struct ctl_table *table = brnet->ctl_hdr->ctl_table_arg; unregister_net_sysctl_table(brnet->ctl_hdr); if (!net_eq(net, &init_net)) kfree(table); } static int __net_init brnf_init_net(struct net *net) { return br_netfilter_sysctl_init_net(net); } #endif static void __net_exit brnf_exit_net(struct net *net) { struct brnf_net *brnet; brnet = net_generic(net, brnf_net_id); if (brnet->enabled) { nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops)); brnet->enabled = false; } #ifdef CONFIG_SYSCTL br_netfilter_sysctl_exit_net(net, brnet); #endif } static struct pernet_operations brnf_net_ops __read_mostly = { #ifdef CONFIG_SYSCTL .init = brnf_init_net, #endif .exit = brnf_exit_net, .id = &brnf_net_id, .size = sizeof(struct brnf_net), }; static int __init br_netfilter_init(void) { int ret; ret = register_pernet_subsys(&brnf_net_ops); if (ret < 0) return ret; ret = register_netdevice_notifier(&brnf_notifier); if (ret < 0) { unregister_pernet_subsys(&brnf_net_ops); return ret; } RCU_INIT_POINTER(nf_br_ops, &br_ops); printk(KERN_NOTICE "Bridge firewalling registered\n"); return 0; } static void __exit br_netfilter_fini(void) { RCU_INIT_POINTER(nf_br_ops, NULL); 
unregister_netdevice_notifier(&brnf_notifier); unregister_pernet_subsys(&brnf_net_ops); } module_init(br_netfilter_init); module_exit(br_netfilter_fini); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Lennert Buytenhek <buytenh@gnu.org>"); MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>"); MODULE_DESCRIPTION("Linux ethernet netfilter firewall bridge"); |
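/*
 * Illustrative userspace sketch (not part of the kernel sources above): the
 * resume-from-priority pattern that br_nf_hook_thresh() relies on.  Hook
 * entries are kept sorted by ascending priority; on re-entry, everything
 * below a threshold priority is treated as already executed and only the
 * remaining entries run.  All names here (hook_entry, run_hooks_from,
 * accept_all) are hypothetical and the handling is simplified.
 */
#include <stdio.h>

struct hook_entry {
	int priority;
	int (*fn)(void *ctx);
};

static int accept_all(void *ctx)
{
	(void)ctx;
	return 1;
}

/* Run hooks whose priority is >= thresh, in array order (ascending priority). */
static int run_hooks_from(const struct hook_entry *hooks, unsigned int n,
			  int thresh, void *ctx)
{
	unsigned int i = 0;

	while (i < n && hooks[i].priority < thresh)
		i++;			/* already-called entries are skipped */
	for (; i < n; i++)
		if (!hooks[i].fn(ctx))
			return 0;	/* a hook dropped the packet */
	return 1;
}

int main(void)
{
	struct hook_entry hooks[] = {
		{ -200, accept_all }, { 0, accept_all }, { 300, accept_all },
	};

	/* resume above priority 0, mirroring the "< NF_BR_PRI_BRNF" skip */
	printf("verdict: %d\n", run_hooks_from(hooks, 3, 1, NULL));
	return 0;
}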
// SPDX-License-Identifier: GPL-2.0
/*
 * Dynamic byte queue limits. See include/linux/dynamic_queue_limits.h
 *
 * Copyright (c) 2011, Tom Herbert <therbert@google.com>
 */
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/dynamic_queue_limits.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <trace/events/napi.h>

#define POSDIFF(A, B) ((int)((A) - (B)) > 0 ? (A) - (B) : 0)
#define AFTER_EQ(A, B) ((int)((A) - (B)) >= 0)

static void dql_check_stall(struct dql *dql, unsigned short stall_thrs)
{
	unsigned long now;

	if (!stall_thrs)
		return;

	now = jiffies;
	/* Check for a potential stall */
	if (time_after_eq(now, dql->last_reap + stall_thrs)) {
		unsigned long hist_head, t, start, end;

		/* We are trying to detect a period of at least @stall_thrs
		 * jiffies without any Tx completions, but during first half
		 * of which some Tx was posted.
		 */
dqs_again:
		hist_head = READ_ONCE(dql->history_head);
		/* pairs with smp_wmb() in dql_queued() */
		smp_rmb();

		/* Get the previous entry in the ring buffer, which is the
		 * oldest sample.
		 */
		start = (hist_head - DQL_HIST_LEN + 1) * BITS_PER_LONG;

		/* Advance start to continue from the last reap time */
		if (time_before(start, dql->last_reap + 1))
			start = dql->last_reap + 1;

		/* Newest sample we should have already seen a completion for */
		end = hist_head * BITS_PER_LONG + (BITS_PER_LONG - 1);

		/* Shrink the search space to [start, (now - stall_thrs/2)] if
		 * `end` is beyond the stall zone
		 */
		if (time_before(now, end + stall_thrs / 2))
			end = now - stall_thrs / 2;

		/* Search for the queued time in [t, end] */
		for (t = start; time_before_eq(t, end); t++)
			if (test_bit(t % (DQL_HIST_LEN * BITS_PER_LONG),
				     dql->history))
				break;

		/* Variable t contains the time of the queue */
		if (!time_before_eq(t, end))
			goto no_stall;

		/* The ring buffer was modified in the meantime, retry */
		if (hist_head != READ_ONCE(dql->history_head))
			goto dqs_again;

		dql->stall_cnt++;
		dql->stall_max = max_t(unsigned short, dql->stall_max, now - t);

		trace_dql_stall_detected(dql->stall_thrs, now - t,
					 dql->last_reap, dql->history_head,
					 now, dql->history);
	}
no_stall:
	dql->last_reap = now;
}

/* Records completed count and recalculates the queue limit */
void dql_completed(struct dql *dql, unsigned int count)
{
	unsigned int inprogress, prev_inprogress, limit;
	unsigned int ovlimit, completed, num_queued;
	unsigned short stall_thrs;
	bool all_prev_completed;

	num_queued = READ_ONCE(dql->num_queued);

	/* Read stall_thrs in advance since it belongs to the same (first)
	 * cache line as ->num_queued. This way, dql_check_stall() does not
	 * need to touch the first cache line again later, reducing the window
	 * of possible false sharing.
*/ stall_thrs = READ_ONCE(dql->stall_thrs); /* Can't complete more than what's in queue */ BUG_ON(count > num_queued - dql->num_completed); completed = dql->num_completed + count; limit = dql->limit; ovlimit = POSDIFF(num_queued - dql->num_completed, limit); inprogress = num_queued - completed; prev_inprogress = dql->prev_num_queued - dql->num_completed; all_prev_completed = AFTER_EQ(completed, dql->prev_num_queued); if ((ovlimit && !inprogress) || (dql->prev_ovlimit && all_prev_completed)) { /* * Queue considered starved if: * - The queue was over-limit in the last interval, * and there is no more data in the queue. * OR * - The queue was over-limit in the previous interval and * when enqueuing it was possible that all queued data * had been consumed. This covers the case when queue * may have becomes starved between completion processing * running and next time enqueue was scheduled. * * When queue is starved increase the limit by the amount * of bytes both sent and completed in the last interval, * plus any previous over-limit. */ limit += POSDIFF(completed, dql->prev_num_queued) + dql->prev_ovlimit; dql->slack_start_time = jiffies; dql->lowest_slack = UINT_MAX; } else if (inprogress && prev_inprogress && !all_prev_completed) { /* * Queue was not starved, check if the limit can be decreased. * A decrease is only considered if the queue has been busy in * the whole interval (the check above). * * If there is slack, the amount of excess data queued above * the amount needed to prevent starvation, the queue limit * can be decreased. To avoid hysteresis we consider the * minimum amount of slack found over several iterations of the * completion routine. */ unsigned int slack, slack_last_objs; /* * Slack is the maximum of * - The queue limit plus previous over-limit minus twice * the number of objects completed. Note that two times * number of completed bytes is a basis for an upper bound * of the limit. * - Portion of objects in the last queuing operation that * was not part of non-zero previous over-limit. That is * "round down" by non-overlimit portion of the last * queueing operation. */ slack = POSDIFF(limit + dql->prev_ovlimit, 2 * (completed - dql->num_completed)); slack_last_objs = dql->prev_ovlimit ? 
POSDIFF(dql->prev_last_obj_cnt, dql->prev_ovlimit) : 0; slack = max(slack, slack_last_objs); if (slack < dql->lowest_slack) dql->lowest_slack = slack; if (time_after(jiffies, dql->slack_start_time + dql->slack_hold_time)) { limit = POSDIFF(limit, dql->lowest_slack); dql->slack_start_time = jiffies; dql->lowest_slack = UINT_MAX; } } /* Enforce bounds on limit */ limit = clamp(limit, dql->min_limit, dql->max_limit); if (limit != dql->limit) { dql->limit = limit; ovlimit = 0; } dql->adj_limit = limit + completed; dql->prev_ovlimit = ovlimit; dql->prev_last_obj_cnt = READ_ONCE(dql->last_obj_cnt); dql->num_completed = completed; dql->prev_num_queued = num_queued; dql_check_stall(dql, stall_thrs); } EXPORT_SYMBOL(dql_completed); void dql_reset(struct dql *dql) { /* Reset all dynamic values */ dql->limit = dql->min_limit; dql->num_queued = 0; dql->num_completed = 0; dql->last_obj_cnt = 0; dql->prev_num_queued = 0; dql->prev_last_obj_cnt = 0; dql->prev_ovlimit = 0; dql->lowest_slack = UINT_MAX; dql->slack_start_time = jiffies; dql->last_reap = jiffies; dql->history_head = jiffies / BITS_PER_LONG; memset(dql->history, 0, sizeof(dql->history)); } EXPORT_SYMBOL(dql_reset); void dql_init(struct dql *dql, unsigned int hold_time) { dql->max_limit = DQL_MAX_LIMIT; dql->min_limit = 0; dql->slack_hold_time = hold_time; dql->stall_thrs = 0; dql_reset(dql); } EXPORT_SYMBOL(dql_init); |
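/*
 * Illustrative userspace sketch (not kernel code) of the idea behind the
 * dql->history bitmap walked by dql_check_stall() above: time is mapped onto
 * a fixed-size bit ring, a bit is set when something is queued at that tick,
 * and stall detection scans a window of ticks for any set bit.  The real code
 * tracks one bit per BITS_PER_LONG jiffies and rechecks history_head for
 * concurrent updates; this sketch (HIST_LONGS, mark_queued, queued_between
 * are hypothetical names) keeps only the modular-indexing core.
 */
#include <stdio.h>
#include <string.h>

#define HIST_LONGS	4
#define LONG_BITS	(8 * (int)sizeof(unsigned long))
#define HIST_BITS	(HIST_LONGS * LONG_BITS)

static unsigned long history[HIST_LONGS];

static void mark_queued(unsigned long tick)
{
	unsigned long b = tick % HIST_BITS;

	history[b / LONG_BITS] |= 1UL << (b % LONG_BITS);
}

static int queued_between(unsigned long start, unsigned long end)
{
	unsigned long t;

	for (t = start; t <= end; t++) {
		unsigned long b = t % HIST_BITS;

		if (history[b / LONG_BITS] & (1UL << (b % LONG_BITS)))
			return 1;
	}
	return 0;
}

int main(void)
{
	memset(history, 0, sizeof(history));
	mark_queued(1000);
	printf("stall window hit: %d\n", queued_between(990, 1010));
	return 0;
}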
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 * Copyright (c) 2018 Covalent IO, Inc.
http://covalent.io */ #include <uapi/linux/btf.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/bpf.h> #include <linux/bpf_verifier.h> #include <linux/math64.h> #include <linux/string.h> #define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args) static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) { /* ubuf and len_total should both be specified (or not) together */ if (!!log->ubuf != !!log->len_total) return false; /* log buf without log_level is meaningless */ if (log->ubuf && log->level == 0) return false; if (log->level & ~BPF_LOG_MASK) return false; if (log->len_total > UINT_MAX >> 2) return false; return true; } int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level, char __user *log_buf, u32 log_size) { log->level = log_level; log->ubuf = log_buf; log->len_total = log_size; /* log attributes have to be sane */ if (!bpf_verifier_log_attr_valid(log)) return -EINVAL; return 0; } static void bpf_vlog_update_len_max(struct bpf_verifier_log *log, u32 add_len) { /* add_len includes terminal \0, so no need for +1. */ u64 len = log->end_pos + add_len; /* log->len_max could be larger than our current len due to * bpf_vlog_reset() calls, so we maintain the max of any length at any * previous point */ if (len > UINT_MAX) log->len_max = UINT_MAX; else if (len > log->len_max) log->len_max = len; } void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, va_list args) { u64 cur_pos; u32 new_n, n; n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); if (log->level == BPF_LOG_KERNEL) { bool newline = n > 0 && log->kbuf[n - 1] == '\n'; pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n"); return; } n += 1; /* include terminating zero */ bpf_vlog_update_len_max(log, n); if (log->level & BPF_LOG_FIXED) { /* check if we have at least something to put into user buf */ new_n = 0; if (log->end_pos < log->len_total) { new_n = min_t(u32, log->len_total - log->end_pos, n); log->kbuf[new_n - 1] = '\0'; } cur_pos = log->end_pos; log->end_pos += n - 1; /* don't count terminating '\0' */ if (log->ubuf && new_n && copy_to_user(log->ubuf + cur_pos, log->kbuf, new_n)) goto fail; } else { u64 new_end, new_start; u32 buf_start, buf_end; new_end = log->end_pos + n; if (new_end - log->start_pos >= log->len_total) new_start = new_end - log->len_total; else new_start = log->start_pos; log->start_pos = new_start; log->end_pos = new_end - 1; /* don't count terminating '\0' */ if (!log->ubuf) return; new_n = min(n, log->len_total); cur_pos = new_end - new_n; div_u64_rem(cur_pos, log->len_total, &buf_start); div_u64_rem(new_end, log->len_total, &buf_end); /* new_end and buf_end are exclusive indices, so if buf_end is * exactly zero, then it actually points right to the end of * ubuf and there is no wrap around */ if (buf_end == 0) buf_end = log->len_total; /* if buf_start > buf_end, we wrapped around; * if buf_start == buf_end, then we fill ubuf completely; we * can't have buf_start == buf_end to mean that there is * nothing to write, because we always write at least * something, even if terminal '\0' */ if (buf_start < buf_end) { /* message fits within contiguous chunk of ubuf */ if (copy_to_user(log->ubuf + buf_start, log->kbuf + n - new_n, buf_end - buf_start)) goto fail; } else { /* message wraps around the end of ubuf, copy in two chunks */ if (copy_to_user(log->ubuf + buf_start, log->kbuf + n - new_n, log->len_total - buf_start)) goto fail; if (copy_to_user(log->ubuf, log->kbuf + n - buf_end, buf_end)) goto fail; } } return; 
fail: log->ubuf = NULL; } void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos) { char zero = 0; u32 pos; if (WARN_ON_ONCE(new_pos > log->end_pos)) return; if (!bpf_verifier_log_needed(log) || log->level == BPF_LOG_KERNEL) return; /* if position to which we reset is beyond current log window, * then we didn't preserve any useful content and should adjust * start_pos to end up with an empty log (start_pos == end_pos) */ log->end_pos = new_pos; if (log->end_pos < log->start_pos) log->start_pos = log->end_pos; if (!log->ubuf) return; if (log->level & BPF_LOG_FIXED) pos = log->end_pos + 1; else div_u64_rem(new_pos, log->len_total, &pos); if (pos < log->len_total && put_user(zero, log->ubuf + pos)) log->ubuf = NULL; } static void bpf_vlog_reverse_kbuf(char *buf, int len) { int i, j; for (i = 0, j = len - 1; i < j; i++, j--) swap(buf[i], buf[j]); } static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int end) { /* we split log->kbuf into two equal parts for both ends of array */ int n = sizeof(log->kbuf) / 2, nn; char *lbuf = log->kbuf, *rbuf = log->kbuf + n; /* Read ubuf's section [start, end) two chunks at a time, from left * and right side; within each chunk, swap all the bytes; after that * reverse the order of lbuf and rbuf and write result back to ubuf. * This way we'll end up with swapped contents of specified * [start, end) ubuf segment. */ while (end - start > 1) { nn = min(n, (end - start ) / 2); if (copy_from_user(lbuf, log->ubuf + start, nn)) return -EFAULT; if (copy_from_user(rbuf, log->ubuf + end - nn, nn)) return -EFAULT; bpf_vlog_reverse_kbuf(lbuf, nn); bpf_vlog_reverse_kbuf(rbuf, nn); /* we write lbuf to the right end of ubuf, while rbuf to the * left one to end up with properly reversed overall ubuf */ if (copy_to_user(log->ubuf + start, rbuf, nn)) return -EFAULT; if (copy_to_user(log->ubuf + end - nn, lbuf, nn)) return -EFAULT; start += nn; end -= nn; } return 0; } int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual) { u32 sublen; int err; *log_size_actual = 0; if (!log || log->level == 0 || log->level == BPF_LOG_KERNEL) return 0; if (!log->ubuf) goto skip_log_rotate; /* If we never truncated log, there is nothing to move around. */ if (log->start_pos == 0) goto skip_log_rotate; /* Otherwise we need to rotate log contents to make it start from the * buffer beginning and be a continuous zero-terminated string. Note * that if log->start_pos != 0 then we definitely filled up entire log * buffer with no gaps, and we just need to shift buffer contents to * the left by (log->start_pos % log->len_total) bytes. * * Unfortunately, user buffer could be huge and we don't want to * allocate temporary kernel memory of the same size just to shift * contents in a straightforward fashion. Instead, we'll be clever and * do in-place array rotation. This is a leetcode-style problem, which * could be solved by three rotations. * * Let's say we have log buffer that has to be shifted left by 7 bytes * (spaces and vertical bar is just for demonstrative purposes): * E F G H I J K | A B C D * * First, we reverse entire array: * D C B A | K J I H G F E * * Then we rotate first 4 bytes (DCBA) and separately last 7 bytes * (KJIHGFE), resulting in a properly rotated array: * A B C D | E F G H I J K * * We'll utilize log->kbuf to read user memory chunk by chunk, swap * bytes, and write them back. Doing it byte-by-byte would be * unnecessarily inefficient. 
Altogether we are going to read and * write each byte twice, for total 4 memory copies between kernel and * user space. */ /* length of the chopped off part that will be the beginning; * len(ABCD) in the example above */ div_u64_rem(log->start_pos, log->len_total, &sublen); sublen = log->len_total - sublen; err = bpf_vlog_reverse_ubuf(log, 0, log->len_total); err = err ?: bpf_vlog_reverse_ubuf(log, 0, sublen); err = err ?: bpf_vlog_reverse_ubuf(log, sublen, log->len_total); if (err) log->ubuf = NULL; skip_log_rotate: *log_size_actual = log->len_max; /* properly initialized log has either both ubuf!=NULL and len_total>0 * or ubuf==NULL and len_total==0, so if this condition doesn't hold, * we got a fault somewhere along the way, so report it back */ if (!!log->ubuf != !!log->len_total) return -EFAULT; /* did truncation actually happen? */ if (log->ubuf && log->len_max > log->len_total) return -ENOSPC; return 0; } /* log_level controls verbosity level of eBPF verifier. * bpf_verifier_log_write() is used to dump the verification trace to the log, * so the user can figure out what's wrong with the program */ __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, const char *fmt, ...) { va_list args; if (!bpf_verifier_log_needed(&env->log)) return; va_start(args, fmt); bpf_verifier_vlog(&env->log, fmt, args); va_end(args); } EXPORT_SYMBOL_GPL(bpf_verifier_log_write); __printf(2, 3) void bpf_log(struct bpf_verifier_log *log, const char *fmt, ...) { va_list args; if (!bpf_verifier_log_needed(log)) return; va_start(args, fmt); bpf_verifier_vlog(log, fmt, args); va_end(args); } EXPORT_SYMBOL_GPL(bpf_log); static const char *ltrim(const char *s) { while (isspace(*s)) s++; return s; } __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env, u32 insn_off, const char *prefix_fmt, ...) { const struct bpf_line_info *linfo, *prev_linfo; const struct btf *btf; const char *s, *fname; if (!bpf_verifier_log_needed(&env->log)) return; prev_linfo = env->prev_linfo; linfo = bpf_find_linfo(env->prog, insn_off); if (!linfo || linfo == prev_linfo) return; /* It often happens that two separate linfo records point to the same * source code line, but have differing column numbers. Given verifier * log doesn't emit column information, from user perspective we just * end up emitting the same source code line twice unnecessarily. * So instead check that previous and current linfo record point to * the same file (file_name_offs match) and the same line number, and * avoid emitting duplicated source code line in such case. */ if (prev_linfo && linfo->file_name_off == prev_linfo->file_name_off && BPF_LINE_INFO_LINE_NUM(linfo->line_col) == BPF_LINE_INFO_LINE_NUM(prev_linfo->line_col)) return; if (prefix_fmt) { va_list args; va_start(args, prefix_fmt); bpf_verifier_vlog(&env->log, prefix_fmt, args); va_end(args); } btf = env->prog->aux->btf; s = ltrim(btf_name_by_offset(btf, linfo->line_off)); verbose(env, "%s", s); /* source code line */ s = btf_name_by_offset(btf, linfo->file_name_off); /* leave only file name */ fname = strrchr(s, '/'); fname = fname ? fname + 1 : s; verbose(env, " @ %s:%u\n", fname, BPF_LINE_INFO_LINE_NUM(linfo->line_col)); env->prev_linfo = linfo; } static const char *btf_type_name(const struct btf *btf, u32 id) { return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); } /* string representation of 'enum bpf_reg_type' * * Note that reg_type_str() can not appear more than once in a single verbose() * statement. 
*/ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) { char postfix[16] = {0}, prefix[64] = {0}; static const char * const str[] = { [NOT_INIT] = "?", [SCALAR_VALUE] = "scalar", [PTR_TO_CTX] = "ctx", [CONST_PTR_TO_MAP] = "map_ptr", [PTR_TO_MAP_VALUE] = "map_value", [PTR_TO_STACK] = "fp", [PTR_TO_PACKET] = "pkt", [PTR_TO_PACKET_META] = "pkt_meta", [PTR_TO_PACKET_END] = "pkt_end", [PTR_TO_FLOW_KEYS] = "flow_keys", [PTR_TO_SOCKET] = "sock", [PTR_TO_SOCK_COMMON] = "sock_common", [PTR_TO_TCP_SOCK] = "tcp_sock", [PTR_TO_TP_BUFFER] = "tp_buffer", [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", [PTR_TO_MEM] = "mem", [PTR_TO_ARENA] = "arena", [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", [PTR_TO_INSN] = "insn", [PTR_TO_MAP_KEY] = "map_key", [CONST_PTR_TO_DYNPTR] = "dynptr_ptr", }; if (type & PTR_MAYBE_NULL) { if (base_type(type) == PTR_TO_BTF_ID) strscpy(postfix, "or_null_"); else strscpy(postfix, "_or_null"); } snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s", type & MEM_RDONLY ? "rdonly_" : "", type & MEM_RINGBUF ? "ringbuf_" : "", type & MEM_USER ? "user_" : "", type & MEM_PERCPU ? "percpu_" : "", type & MEM_RCU ? "rcu_" : "", type & PTR_UNTRUSTED ? "untrusted_" : "", type & PTR_TRUSTED ? "trusted_" : "" ); snprintf(env->tmp_str_buf, TMP_STR_BUF_LEN, "%s%s%s", prefix, str[base_type(type)], postfix); return env->tmp_str_buf; } const char *dynptr_type_str(enum bpf_dynptr_type type) { switch (type) { case BPF_DYNPTR_TYPE_LOCAL: return "local"; case BPF_DYNPTR_TYPE_RINGBUF: return "ringbuf"; case BPF_DYNPTR_TYPE_SKB: return "skb"; case BPF_DYNPTR_TYPE_XDP: return "xdp"; case BPF_DYNPTR_TYPE_SKB_META: return "skb_meta"; case BPF_DYNPTR_TYPE_FILE: return "file"; case BPF_DYNPTR_TYPE_INVALID: return "<invalid>"; default: WARN_ONCE(1, "unknown dynptr type %d\n", type); return "<unknown>"; } } const char *iter_type_str(const struct btf *btf, u32 btf_id) { if (!btf || btf_id == 0) return "<invalid>"; /* we already validated that type is valid and has conforming name */ return btf_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1; } const char *iter_state_str(enum bpf_iter_state state) { switch (state) { case BPF_ITER_STATE_ACTIVE: return "active"; case BPF_ITER_STATE_DRAINED: return "drained"; case BPF_ITER_STATE_INVALID: return "<invalid>"; default: WARN_ONCE(1, "unknown iter state %d\n", state); return "<unknown>"; } } static char slot_type_char[] = { [STACK_INVALID] = '?', [STACK_SPILL] = 'r', [STACK_MISC] = 'm', [STACK_ZERO] = '0', [STACK_DYNPTR] = 'd', [STACK_ITER] = 'i', [STACK_IRQ_FLAG] = 'f', [STACK_POISON] = 'p', }; #define UNUM_MAX_DECIMAL U16_MAX #define SNUM_MAX_DECIMAL S16_MAX #define SNUM_MIN_DECIMAL S16_MIN static bool is_unum_decimal(u64 num) { return num <= UNUM_MAX_DECIMAL; } static bool is_snum_decimal(s64 num) { return num >= SNUM_MIN_DECIMAL && num <= SNUM_MAX_DECIMAL; } static void verbose_unum(struct bpf_verifier_env *env, u64 num) { if (is_unum_decimal(num)) verbose(env, "%llu", num); else verbose(env, "%#llx", num); } static void verbose_snum(struct bpf_verifier_env *env, s64 num) { if (is_snum_decimal(num)) verbose(env, "%lld", num); else verbose(env, "%#llx", num); } int tnum_strn(char *str, size_t size, struct tnum a) { /* print as a constant, if tnum is fully known */ if (a.mask == 0) { if (is_unum_decimal(a.value)) return snprintf(str, size, "%llu", a.value); if (is_snum_decimal(a.value)) return snprintf(str, size, "%lld", a.value); else return snprintf(str, size, "%#llx", a.value); } return snprintf(str, size, "(%#llx; %#llx)", a.value, 
a.mask); } EXPORT_SYMBOL_GPL(tnum_strn); static void print_scalar_ranges(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, const char **sep) { /* For signed ranges, we want to unify 64-bit and 32-bit values in the * output as much as possible, but there is a bit of a complication. * If we choose to print values as decimals, this is natural to do, * because negative 64-bit and 32-bit values >= -S32_MIN have the same * representation due to sign extension. But if we choose to print * them in hex format (see is_snum_decimal()), then sign extension is * misleading. * E.g., smin=-2 and smin32=-2 are exactly the same in decimal, but in * hex they will be smin=0xfffffffffffffffe and smin32=0xfffffffe, two * very different numbers. * So we avoid sign extension if we choose to print values in hex. */ struct { const char *name; u64 val; bool omit; } minmaxs[] = { {"smin", reg->smin_value, reg->smin_value == S64_MIN}, {"smax", reg->smax_value, reg->smax_value == S64_MAX}, {"umin", reg->umin_value, reg->umin_value == 0}, {"umax", reg->umax_value, reg->umax_value == U64_MAX}, {"smin32", is_snum_decimal((s64)reg->s32_min_value) ? (s64)reg->s32_min_value : (u32)reg->s32_min_value, reg->s32_min_value == S32_MIN}, {"smax32", is_snum_decimal((s64)reg->s32_max_value) ? (s64)reg->s32_max_value : (u32)reg->s32_max_value, reg->s32_max_value == S32_MAX}, {"umin32", reg->u32_min_value, reg->u32_min_value == 0}, {"umax32", reg->u32_max_value, reg->u32_max_value == U32_MAX}, }, *m1, *m2, *mend = &minmaxs[ARRAY_SIZE(minmaxs)]; bool neg1, neg2; for (m1 = &minmaxs[0]; m1 < mend; m1++) { if (m1->omit) continue; neg1 = m1->name[0] == 's' && (s64)m1->val < 0; verbose(env, "%s%s=", *sep, m1->name); *sep = ","; for (m2 = m1 + 2; m2 < mend; m2 += 2) { if (m2->omit || m2->val != m1->val) continue; /* don't mix negatives with positives */ neg2 = m2->name[0] == 's' && (s64)m2->val < 0; if (neg2 != neg1) continue; m2->omit = true; verbose(env, "%s=", m2->name); } if (m1->name[0] == 's') verbose_snum(env, m1->val); else verbose_unum(env, m1->val); } } static bool type_is_map_ptr(enum bpf_reg_type t) { switch (base_type(t)) { case CONST_PTR_TO_MAP: case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: return true; default: return false; } } /* * _a stands for append, was shortened to avoid multiline statements below. * This macro is used to output a comma separated list of attributes. */ #define verbose_a(fmt, ...) 
({ verbose(env, "%s" fmt, sep, ##__VA_ARGS__); sep = ","; }) static void print_reg_state(struct bpf_verifier_env *env, const struct bpf_func_state *state, const struct bpf_reg_state *reg) { enum bpf_reg_type t; const char *sep = ""; t = reg->type; if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) { verbose_snum(env, reg->var_off.value); return; } verbose(env, "%s", reg_type_str(env, t)); if (t == PTR_TO_ARENA) return; if (t == PTR_TO_STACK) { if (state->frameno != reg->frameno) verbose(env, "[%d]", reg->frameno); if (tnum_is_const(reg->var_off)) { verbose_snum(env, reg->var_off.value + reg->delta); return; } } if (base_type(t) == PTR_TO_BTF_ID) verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id)); verbose(env, "("); if (reg->id) verbose_a("id=%d", reg->id & ~BPF_ADD_CONST); if (reg->id & BPF_ADD_CONST) verbose(env, "%+d", reg->delta); if (reg->ref_obj_id) verbose_a("ref_obj_id=%d", reg->ref_obj_id); if (type_is_non_owning_ref(reg->type)) verbose_a("%s", "non_own_ref"); if (type_is_map_ptr(t)) { if (reg->map_ptr->name[0]) verbose_a("map=%s", reg->map_ptr->name); verbose_a("ks=%d,vs=%d", reg->map_ptr->key_size, reg->map_ptr->value_size); } if (t != SCALAR_VALUE && reg->delta) { verbose_a("off="); verbose_snum(env, reg->delta); } if (type_is_pkt_pointer(t)) { verbose_a("r="); verbose_unum(env, reg->range); } if (base_type(t) == PTR_TO_MEM) { verbose_a("sz="); verbose_unum(env, reg->mem_size); } if (t == CONST_PTR_TO_DYNPTR) verbose_a("type=%s", dynptr_type_str(reg->dynptr.type)); if (tnum_is_const(reg->var_off)) { /* a pointer register with fixed offset */ if (reg->var_off.value) { verbose_a("imm="); verbose_snum(env, reg->var_off.value); } } else { print_scalar_ranges(env, reg, &sep); if (!tnum_is_unknown(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose_a("var_off=%s", tn_buf); } } verbose(env, ")"); } void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, u32 frameno, bool print_all) { const struct bpf_func_state *state = vstate->frame[frameno]; const struct bpf_reg_state *reg; int i; if (state->frameno) verbose(env, " frame%d:", state->frameno); for (i = 0; i < MAX_BPF_REG; i++) { reg = &state->regs[i]; if (reg->type == NOT_INIT) continue; if (!print_all && !reg_scratched(env, i)) continue; verbose(env, " R%d", i); verbose(env, "="); print_reg_state(env, state, reg); } for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { char types_buf[BPF_REG_SIZE + 1]; const char *sep = ""; bool valid = false; u8 slot_type; int j; if (!print_all && !stack_slot_scratched(env, i)) continue; for (j = 0; j < BPF_REG_SIZE; j++) { slot_type = state->stack[i].slot_type[j]; if (slot_type != STACK_INVALID && slot_type != STACK_POISON) valid = true; types_buf[j] = slot_type_char[slot_type]; } types_buf[BPF_REG_SIZE] = 0; if (!valid) continue; reg = &state->stack[i].spilled_ptr; switch (state->stack[i].slot_type[BPF_REG_SIZE - 1]) { case STACK_SPILL: /* print MISC/ZERO/INVALID slots above subreg spill */ for (j = 0; j < BPF_REG_SIZE; j++) if (state->stack[i].slot_type[j] == STACK_SPILL) break; types_buf[j] = '\0'; verbose(env, " fp%d=%s", (-i - 1) * BPF_REG_SIZE, types_buf); print_reg_state(env, state, reg); break; case STACK_DYNPTR: /* skip to main dynptr slot */ i += BPF_DYNPTR_NR_SLOTS - 1; reg = &state->stack[i].spilled_ptr; verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); verbose(env, "=dynptr_%s(", dynptr_type_str(reg->dynptr.type)); if (reg->id) 
verbose_a("id=%d", reg->id); if (reg->ref_obj_id) verbose_a("ref_id=%d", reg->ref_obj_id); if (reg->dynptr_id) verbose_a("dynptr_id=%d", reg->dynptr_id); verbose(env, ")"); break; case STACK_ITER: /* only main slot has ref_obj_id set; skip others */ if (!reg->ref_obj_id) continue; verbose(env, " fp%d=iter_%s(ref_id=%d,state=%s,depth=%u)", (-i - 1) * BPF_REG_SIZE, iter_type_str(reg->iter.btf, reg->iter.btf_id), reg->ref_obj_id, iter_state_str(reg->iter.state), reg->iter.depth); break; case STACK_MISC: case STACK_ZERO: default: verbose(env, " fp%d=%s", (-i - 1) * BPF_REG_SIZE, types_buf); break; } } if (vstate->acquired_refs && vstate->refs[0].id) { verbose(env, " refs=%d", vstate->refs[0].id); for (i = 1; i < vstate->acquired_refs; i++) if (vstate->refs[i].id) verbose(env, ",%d", vstate->refs[i].id); } if (state->in_callback_fn) verbose(env, " cb"); if (state->in_async_callback_fn) verbose(env, " async_cb"); verbose(env, "\n"); if (!print_all) mark_verifier_state_clean(env); } u32 bpf_vlog_alignment(u32 pos) { return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT), BPF_LOG_MIN_ALIGNMENT) - pos - 1; } void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, u32 frameno) { if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) { /* remove new line character */ bpf_vlog_reset(&env->log, env->prev_log_pos - 1); verbose(env, "%*c;", bpf_vlog_alignment(env->prev_insn_print_pos), ' '); } else { verbose(env, "%d:", env->insn_idx); } print_verifier_state(env, vstate, frameno, false); } |
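/*
 * Illustrative userspace sketch (not kernel code) of the "rotate in place by
 * three reversals" trick that the comment in bpf_vlog_finalize() above
 * describes: to shift a buffer left by k bytes, reverse the whole buffer,
 * then reverse the first len-k bytes and the last k bytes separately.
 */
#include <stdio.h>
#include <string.h>

static void reverse(char *buf, size_t len)
{
	if (len < 2)
		return;
	for (size_t i = 0, j = len - 1; i < j; i++, j--) {
		char tmp = buf[i];

		buf[i] = buf[j];
		buf[j] = tmp;
	}
}

/* Rotate buf left by k positions, in place, O(len) time, O(1) extra space. */
static void rotate_left(char *buf, size_t len, size_t k)
{
	k %= len;
	reverse(buf, len);
	reverse(buf, len - k);
	reverse(buf + len - k, k);
}

int main(void)
{
	char buf[] = "EFGHIJKABCD";	/* the example from the comment above */

	rotate_left(buf, strlen(buf), 7);
	printf("%s\n", buf);		/* prints ABCDEFGHIJK */
	return 0;
}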
| 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 * Phillip Lougher <phillip@squashfs.org.uk> * * zlib_wrapper.c */ #include <linux/mutex.h> #include <linux/bio.h> #include <linux/slab.h> #include <linux/zlib.h> #include <linux/vmalloc.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs.h" #include "decompressor.h" #include "page_actor.h" static void *zlib_init(struct squashfs_sb_info *dummy, void *buff) { z_stream *stream = kmalloc_obj(z_stream); if (stream == NULL) goto failed; stream->workspace = vmalloc(zlib_inflate_workspacesize()); if (stream->workspace == NULL) goto failed; return stream; failed: ERROR("Failed to allocate zlib workspace\n"); kfree(stream); return ERR_PTR(-ENOMEM); } static void zlib_free(void *strm) { z_stream *stream = strm; if (stream) vfree(stream->workspace); kfree(stream); } static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { struct bvec_iter_all iter_all = {}; struct bio_vec *bvec = bvec_init_iter_all(&iter_all); int zlib_init = 0, error = 0; z_stream *stream = strm; stream->avail_out = PAGE_SIZE; stream->next_out = squashfs_first_page(output); stream->avail_in = 0; if (IS_ERR(stream->next_out)) { error = PTR_ERR(stream->next_out); goto finish; } for (;;) { int zlib_err; if (stream->avail_in == 0) { const void *data; int avail; if (!bio_next_segment(bio, &iter_all)) { /* Z_STREAM_END must be reached. */ error = -EIO; break; } avail = min(length, ((int)bvec->bv_len) - offset); data = bvec_virt(bvec); length -= avail; stream->next_in = data + offset; stream->avail_in = avail; offset = 0; } if (stream->avail_out == 0) { stream->next_out = squashfs_next_page(output); if (IS_ERR(stream->next_out)) { error = PTR_ERR(stream->next_out); break; } else if (stream->next_out != NULL) stream->avail_out = PAGE_SIZE; } if (!zlib_init) { zlib_err = zlib_inflateInit(stream); if (zlib_err != Z_OK) { error = -EIO; break; } zlib_init = 1; } zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH); if (zlib_err == Z_STREAM_END) break; if (zlib_err != Z_OK) { error = -EIO; break; } } finish: squashfs_finish_page(output); if (!error) if (zlib_inflateEnd(stream) != Z_OK) error = -EIO; return error ? error : stream->total_out; } const struct squashfs_decompressor squashfs_zlib_comp_ops = { .init = zlib_init, .free = zlib_free, .decompress = zlib_uncompress, .id = ZLIB_COMPRESSION, .name = "zlib", .alloc_buffer = 1, .supported = 1 }; |
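/*
 * Illustrative userspace sketch (not kernel code) of the streaming-inflate
 * loop that zlib_uncompress() above is built around: input is fed in small
 * chunks and inflate() is called repeatedly until Z_STREAM_END.  It uses the
 * ordinary userspace zlib API (compress2/inflateInit/inflate/inflateEnd);
 * build with "cc demo.c -lz".  Chunk and buffer sizes are arbitrary choices.
 */
#include <stdio.h>
#include <string.h>
#include <zlib.h>

int main(void)
{
	const char msg[] = "squashfs-style streaming inflate demo, repeated "
			   "a few times to give zlib something to chew on. "
			   "squashfs-style streaming inflate demo.";
	unsigned char comp[512], out[512];
	uLongf comp_len = sizeof(comp);
	z_stream strm;
	uLong off = 0, got;
	int err;

	/* produce a zlib-wrapped blob to decompress */
	if (compress2(comp, &comp_len, (const Bytef *)msg, sizeof(msg), 9) != Z_OK)
		return 1;

	memset(&strm, 0, sizeof(strm));
	if (inflateInit(&strm) != Z_OK)
		return 1;

	strm.next_out = out;
	strm.avail_out = sizeof(out);

	for (;;) {
		if (strm.avail_in == 0 && off < comp_len) {
			/* refill input with the next (at most) 16-byte chunk */
			uLong chunk = comp_len - off > 16 ? 16 : comp_len - off;

			strm.next_in = comp + off;
			strm.avail_in = chunk;
			off += chunk;
		}
		err = inflate(&strm, Z_SYNC_FLUSH);
		if (err == Z_STREAM_END)
			break;
		if (err != Z_OK) {
			inflateEnd(&strm);
			return 1;
		}
	}
	got = strm.total_out;
	inflateEnd(&strm);
	printf("inflated %lu bytes: %s\n", got, out);
	return 0;
}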
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_JUMP_LABEL_H
#define _ASM_X86_JUMP_LABEL_H

#define HAVE_JUMP_LABEL_BATCH

#include <asm/asm.h>
#include <asm/nops.h>

#ifndef __ASSEMBLER__

#include <linux/stringify.h>
#include <linux/types.h>

#define JUMP_TABLE_ENTRY(key, label)			\
	".pushsection __jump_table, \"aw\" \n\t"	\
	_ASM_ALIGN "\n\t"				\
	ANNOTATE_DATA_SPECIAL "\n"			\
	".long 1b - . \n\t"				\
	".long " label " - . \n\t"			\
	_ASM_PTR " " key " - . \n\t"			\
	".popsection \n\t"

/* This macro is also expanded on the Rust side. */
#ifdef CONFIG_HAVE_JUMP_LABEL_HACK
#define ARCH_STATIC_BRANCH_ASM(key, label)		\
	"1: jmp " label " # objtool NOPs this \n\t"	\
	JUMP_TABLE_ENTRY(key " + 2", label)
#else /* !CONFIG_HAVE_JUMP_LABEL_HACK */
#define ARCH_STATIC_BRANCH_ASM(key, label)		\
	"1: .byte " __stringify(BYTES_NOP5) "\n\t"	\
	JUMP_TABLE_ENTRY(key, label)
#endif /* CONFIG_HAVE_JUMP_LABEL_HACK */

static __always_inline bool arch_static_branch(struct static_key * const key,
					       const bool branch)
{
	asm goto(ARCH_STATIC_BRANCH_ASM("%c0 + %c1", "%l[l_yes]")
		 : : "i" (key), "i" (branch) : : l_yes);

	return false;
l_yes:
	return true;
}

static __always_inline bool arch_static_branch_jump(struct static_key * const key,
						    const bool branch)
{
	asm goto("1:"
		 "jmp %l[l_yes]\n\t"
		 JUMP_TABLE_ENTRY("%c0 + %c1", "%l[l_yes]")
		 : : "i" (key), "i" (branch) : : l_yes);

	return false;
l_yes:
	return true;
}

extern int arch_jump_entry_size(struct jump_entry *entry);

#endif /* __ASSEMBLER__ */

#endif
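/*
 * Illustrative userspace sketch (not kernel code): a minimal 'asm goto'
 * example, the compiler feature arch_static_branch() above builds on.  The
 * kernel additionally records the branch site in __jump_table and patches the
 * emitted NOP/JMP at runtime; this sketch only shows the control transfer
 * from inline assembly to a C label.  x86-64 with GCC or Clang is assumed.
 */
#include <stdio.h>

static int is_nonzero(int x)
{
	asm goto("testl %0, %0\n\t"
		 "jnz %l[l_yes]"
		 : /* asm goto takes no outputs here */
		 : "r" (x)
		 : "cc"
		 : l_yes);
	return 0;
l_yes:
	return 1;
}

int main(void)
{
	printf("%d %d\n", is_nonzero(0), is_nonzero(42));	/* prints 0 1 */
	return 0;
}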
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/kdebug.h>
#include <linux/kprobes.h>
#include <linux/export.h>
#include <linux/notifier.h>
#include <linux/rcupdate.h>
#include <linux/vmalloc.h>

#define CREATE_TRACE_POINTS
#include <trace/events/notifier.h>

/*
 * Notifier chain core routines. The exported routines below
 * are layered on top of these, with appropriate locking added.
*/ static int notifier_chain_register(struct notifier_block **nl, struct notifier_block *n, bool unique_priority) { while ((*nl) != NULL) { if (unlikely((*nl) == n)) { WARN(1, "notifier callback %ps already registered", n->notifier_call); return -EEXIST; } if (n->priority > (*nl)->priority) break; if (n->priority == (*nl)->priority && unique_priority) return -EBUSY; nl = &((*nl)->next); } n->next = *nl; rcu_assign_pointer(*nl, n); trace_notifier_register((void *)n->notifier_call); return 0; } static int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n) { while ((*nl) != NULL) { if ((*nl) == n) { rcu_assign_pointer(*nl, n->next); trace_notifier_unregister((void *)n->notifier_call); return 0; } nl = &((*nl)->next); } return -ENOENT; } /** * notifier_call_chain - Informs the registered notifiers about an event. * @nl: Pointer to head of the blocking notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * @nr_to_call: Number of notifier functions to be called. Don't care * value of this parameter is -1. * @nr_calls: Records the number of notifications sent. Don't care * value of this field is NULL. * Return: notifier_call_chain returns the value returned by the * last notifier function called. */ static int notifier_call_chain(struct notifier_block **nl, unsigned long val, void *v, int nr_to_call, int *nr_calls) { int ret = NOTIFY_DONE; struct notifier_block *nb, *next_nb; nb = rcu_dereference_raw(*nl); while (nb && nr_to_call) { next_nb = rcu_dereference_raw(nb->next); #ifdef CONFIG_DEBUG_NOTIFIERS if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { WARN(1, "Invalid notifier called!"); nb = next_nb; continue; } #endif trace_notifier_run((void *)nb->notifier_call); ret = nb->notifier_call(nb, val, v); if (nr_calls) (*nr_calls)++; if (ret & NOTIFY_STOP_MASK) break; nb = next_nb; nr_to_call--; } return ret; } NOKPROBE_SYMBOL(notifier_call_chain); /** * notifier_call_chain_robust - Inform the registered notifiers about an event * and rollback on error. * @nl: Pointer to head of the blocking notifier chain * @val_up: Value passed unmodified to the notifier function * @val_down: Value passed unmodified to the notifier function when recovering * from an error on @val_up * @v: Pointer passed unmodified to the notifier function * * NOTE: It is important the @nl chain doesn't change between the two * invocations of notifier_call_chain() such that we visit the * exact same notifier callbacks; this rules out any RCU usage. * * Return: the return value of the @val_up call. */ static int notifier_call_chain_robust(struct notifier_block **nl, unsigned long val_up, unsigned long val_down, void *v) { int ret, nr = 0; ret = notifier_call_chain(nl, val_up, v, -1, &nr); if (ret & NOTIFY_STOP_MASK) notifier_call_chain(nl, val_down, v, nr-1, NULL); return ret; } /* * Atomic notifier chain routines. Registration and unregistration * use a spinlock, and call_chain is synchronized by RCU (no locks). */ /** * atomic_notifier_chain_register - Add notifier to an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @n: New entry in notifier chain * * Adds a notifier to an atomic notifier chain. * * Returns 0 on success, %-EEXIST on error. 
*/ int atomic_notifier_chain_register(struct atomic_notifier_head *nh, struct notifier_block *n) { unsigned long flags; int ret; spin_lock_irqsave(&nh->lock, flags); ret = notifier_chain_register(&nh->head, n, false); spin_unlock_irqrestore(&nh->lock, flags); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_chain_register); /** * atomic_notifier_chain_register_unique_prio - Add notifier to an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @n: New entry in notifier chain * * Adds a notifier to an atomic notifier chain if there is no other * notifier registered using the same priority. * * Returns 0 on success, %-EEXIST or %-EBUSY on error. */ int atomic_notifier_chain_register_unique_prio(struct atomic_notifier_head *nh, struct notifier_block *n) { unsigned long flags; int ret; spin_lock_irqsave(&nh->lock, flags); ret = notifier_chain_register(&nh->head, n, true); spin_unlock_irqrestore(&nh->lock, flags); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_chain_register_unique_prio); /** * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from an atomic notifier chain. * * Returns zero on success or %-ENOENT on failure. */ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, struct notifier_block *n) { unsigned long flags; int ret; spin_lock_irqsave(&nh->lock, flags); ret = notifier_chain_unregister(&nh->head, n); spin_unlock_irqrestore(&nh->lock, flags); synchronize_rcu(); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); /** * atomic_notifier_call_chain - Call functions in an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * * Calls each function in a notifier chain in turn. The functions * run in an atomic context, so they must not block. * This routine uses RCU to synchronize with changes to the chain. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. */ int atomic_notifier_call_chain(struct atomic_notifier_head *nh, unsigned long val, void *v) { int ret; rcu_read_lock(); ret = notifier_call_chain(&nh->head, val, v, -1, NULL); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); NOKPROBE_SYMBOL(atomic_notifier_call_chain); /** * atomic_notifier_call_chain_is_empty - Check whether notifier chain is empty * @nh: Pointer to head of the atomic notifier chain * * Checks whether notifier chain is empty. * * Returns true is notifier chain is empty, false otherwise. */ bool atomic_notifier_call_chain_is_empty(struct atomic_notifier_head *nh) { return !rcu_access_pointer(nh->head); } /* * Blocking notifier chain routines. All access to the chain is * synchronized by an rwsem. */ static int __blocking_notifier_chain_register(struct blocking_notifier_head *nh, struct notifier_block *n, bool unique_priority) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call down_write(). 
*/ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_register(&nh->head, n, unique_priority); down_write(&nh->rwsem); ret = notifier_chain_register(&nh->head, n, unique_priority); up_write(&nh->rwsem); return ret; } /** * blocking_notifier_chain_register - Add notifier to a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @n: New entry in notifier chain * * Adds a notifier to a blocking notifier chain. * Must be called in process context. * * Returns 0 on success, %-EEXIST on error. */ int blocking_notifier_chain_register(struct blocking_notifier_head *nh, struct notifier_block *n) { return __blocking_notifier_chain_register(nh, n, false); } EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); /** * blocking_notifier_chain_register_unique_prio - Add notifier to a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @n: New entry in notifier chain * * Adds a notifier to an blocking notifier chain if there is no other * notifier registered using the same priority. * * Returns 0 on success, %-EEXIST or %-EBUSY on error. */ int blocking_notifier_chain_register_unique_prio(struct blocking_notifier_head *nh, struct notifier_block *n) { return __blocking_notifier_chain_register(nh, n, true); } EXPORT_SYMBOL_GPL(blocking_notifier_chain_register_unique_prio); /** * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from a blocking notifier chain. * Must be called from process context. * * Returns zero on success or %-ENOENT on failure. */ int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, struct notifier_block *n) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call down_write(). */ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_unregister(&nh->head, n); down_write(&nh->rwsem); ret = notifier_chain_unregister(&nh->head, n); up_write(&nh->rwsem); return ret; } EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh, unsigned long val_up, unsigned long val_down, void *v) { int ret = NOTIFY_DONE; /* * We check the head outside the lock, but if this access is * racy then it does not matter what the result of the test * is, we re-check the list after having taken the lock anyway: */ if (rcu_access_pointer(nh->head)) { down_read(&nh->rwsem); ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v); up_read(&nh->rwsem); } return ret; } EXPORT_SYMBOL_GPL(blocking_notifier_call_chain_robust); /** * blocking_notifier_call_chain - Call functions in a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * * Calls each function in a notifier chain in turn. The functions * run in a process context, so they are allowed to block. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. 
*/ int blocking_notifier_call_chain(struct blocking_notifier_head *nh, unsigned long val, void *v) { int ret = NOTIFY_DONE; /* * We check the head outside the lock, but if this access is * racy then it does not matter what the result of the test * is, we re-check the list after having taken the lock anyway: */ if (rcu_access_pointer(nh->head)) { down_read(&nh->rwsem); ret = notifier_call_chain(&nh->head, val, v, -1, NULL); up_read(&nh->rwsem); } return ret; } EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); /* * Raw notifier chain routines. There is no protection; * the caller must provide it. Use at your own risk! */ /** * raw_notifier_chain_register - Add notifier to a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @n: New entry in notifier chain * * Adds a notifier to a raw notifier chain. * All locking must be provided by the caller. * * Returns 0 on success, %-EEXIST on error. */ int raw_notifier_chain_register(struct raw_notifier_head *nh, struct notifier_block *n) { return notifier_chain_register(&nh->head, n, false); } EXPORT_SYMBOL_GPL(raw_notifier_chain_register); /** * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from a raw notifier chain. * All locking must be provided by the caller. * * Returns zero on success or %-ENOENT on failure. */ int raw_notifier_chain_unregister(struct raw_notifier_head *nh, struct notifier_block *n) { return notifier_chain_unregister(&nh->head, n); } EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); int raw_notifier_call_chain_robust(struct raw_notifier_head *nh, unsigned long val_up, unsigned long val_down, void *v) { return notifier_call_chain_robust(&nh->head, val_up, val_down, v); } EXPORT_SYMBOL_GPL(raw_notifier_call_chain_robust); /** * raw_notifier_call_chain - Call functions in a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * * Calls each function in a notifier chain in turn. The functions * run in an undefined context. * All locking must be provided by the caller. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then raw_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. */ int raw_notifier_call_chain(struct raw_notifier_head *nh, unsigned long val, void *v) { return notifier_call_chain(&nh->head, val, v, -1, NULL); } EXPORT_SYMBOL_GPL(raw_notifier_call_chain); /* * SRCU notifier chain routines. Registration and unregistration * use a mutex, and call_chain is synchronized by SRCU (no locks). */ /** * srcu_notifier_chain_register - Add notifier to an SRCU notifier chain * @nh: Pointer to head of the SRCU notifier chain * @n: New entry in notifier chain * * Adds a notifier to an SRCU notifier chain. * Must be called in process context. * * Returns 0 on success, %-EEXIST on error. */ int srcu_notifier_chain_register(struct srcu_notifier_head *nh, struct notifier_block *n) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call mutex_lock(). 
*/ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_register(&nh->head, n, false); mutex_lock(&nh->mutex); ret = notifier_chain_register(&nh->head, n, false); mutex_unlock(&nh->mutex); return ret; } EXPORT_SYMBOL_GPL(srcu_notifier_chain_register); /** * srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain * @nh: Pointer to head of the SRCU notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from an SRCU notifier chain. * Must be called from process context. * * Returns zero on success or %-ENOENT on failure. */ int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh, struct notifier_block *n) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call mutex_lock(). */ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_unregister(&nh->head, n); mutex_lock(&nh->mutex); ret = notifier_chain_unregister(&nh->head, n); mutex_unlock(&nh->mutex); synchronize_srcu(&nh->srcu); return ret; } EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); /** * srcu_notifier_call_chain - Call functions in an SRCU notifier chain * @nh: Pointer to head of the SRCU notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * * Calls each function in a notifier chain in turn. The functions * run in a process context, so they are allowed to block. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. */ int srcu_notifier_call_chain(struct srcu_notifier_head *nh, unsigned long val, void *v) { int ret; int idx; idx = srcu_read_lock(&nh->srcu); ret = notifier_call_chain(&nh->head, val, v, -1, NULL); srcu_read_unlock(&nh->srcu, idx); return ret; } EXPORT_SYMBOL_GPL(srcu_notifier_call_chain); /** * srcu_init_notifier_head - Initialize an SRCU notifier head * @nh: Pointer to head of the srcu notifier chain * * Unlike other sorts of notifier heads, SRCU notifier heads require * dynamic initialization. Be sure to call this routine before * calling any of the other SRCU notifier routines for this head. * * If an SRCU notifier head is deallocated, it must first be cleaned * up by calling srcu_cleanup_notifier_head(). Otherwise the head's * per-cpu data (used by the SRCU mechanism) will leak. */ void srcu_init_notifier_head(struct srcu_notifier_head *nh) { mutex_init(&nh->mutex); if (init_srcu_struct(&nh->srcu) < 0) BUG(); nh->head = NULL; } EXPORT_SYMBOL_GPL(srcu_init_notifier_head); static ATOMIC_NOTIFIER_HEAD(die_chain); int notrace notify_die(enum die_val val, const char *str, struct pt_regs *regs, long err, int trap, int sig) { struct die_args args = { .regs = regs, .str = str, .err = err, .trapnr = trap, .signr = sig, }; RCU_LOCKDEP_WARN(!rcu_is_watching(), "notify_die called but RCU thinks we're quiescent"); return atomic_notifier_call_chain(&die_chain, val, &args); } NOKPROBE_SYMBOL(notify_die); int register_die_notifier(struct notifier_block *nb) { return atomic_notifier_chain_register(&die_chain, nb); } EXPORT_SYMBOL_GPL(register_die_notifier); int unregister_die_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(&die_chain, nb); } EXPORT_SYMBOL_GPL(unregister_die_notifier); |
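/*
 * Illustrative userspace sketch (not kernel code) of the core pattern in
 * notifier_chain_register() and notifier_call_chain() above: a singly linked
 * list kept sorted by descending priority, walked until a callback asks to
 * stop.  The kernel variants layer spinlocks, rwsems or (S)RCU on top; all
 * demo_* names here are hypothetical.
 */
#include <stdio.h>

#define DEMO_NOTIFY_DONE	0
#define DEMO_NOTIFY_STOP	1

struct demo_notifier {
	int (*call)(unsigned long event, void *data);
	int priority;
	struct demo_notifier *next;
};

static void demo_register(struct demo_notifier **nl, struct demo_notifier *n)
{
	while (*nl && (*nl)->priority >= n->priority)
		nl = &(*nl)->next;	/* insert after higher/equal priorities */
	n->next = *nl;
	*nl = n;
}

static int demo_call_chain(struct demo_notifier *nl, unsigned long event, void *data)
{
	int ret = DEMO_NOTIFY_DONE;

	for (; nl; nl = nl->next) {
		ret = nl->call(event, data);
		if (ret == DEMO_NOTIFY_STOP)
			break;
	}
	return ret;
}

static int log_event(unsigned long event, void *data)
{
	(void)data;
	printf("saw event %lu\n", event);
	return DEMO_NOTIFY_DONE;
}

int main(void)
{
	struct demo_notifier a = { log_event, 0, NULL };
	struct demo_notifier b = { log_event, 10, NULL };
	struct demo_notifier *chain = NULL;

	demo_register(&chain, &a);
	demo_register(&chain, &b);	/* b runs first: higher priority */
	demo_call_chain(chain, 1, NULL);
	return 0;
}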
/* * linux/fs/hfs/extent.c * * Copyright (C) 1995-1997 Paul H. Hargrove * (C) 2003 Ardis Technologies <roman@ardistech.com> * This file may be distributed under the terms of the GNU General Public License. * * This file contains the functions related to the extents B-tree. */ #include <linux/pagemap.h> #include "hfs_fs.h" #include "btree.h" /*================ File-local functions ================*/ /* * build_key */ static void hfs_ext_build_key(hfs_btree_key *key, u32 cnid, u16 block, u8 type) { key->key_len = 7; key->ext.FkType = type; key->ext.FNum = cpu_to_be32(cnid); key->ext.FABN = cpu_to_be16(block); } /* * hfs_ext_compare() * * Description: * This is the comparison function used for the extents B-tree. In * comparing extent B-tree entries, the file id is the most * significant field (compared as unsigned ints); the fork type is * the second most significant field (compared as unsigned chars); * and the allocation block number field is the least significant * (compared as unsigned ints). * Input Variable(s): * struct hfs_ext_key *key1: pointer to the first key to compare * struct hfs_ext_key *key2: pointer to the second key to compare * Output Variable(s): * NONE * Returns: * int: negative if key1<key2, positive if key1>key2, and 0 if key1==key2 * Preconditions: * key1 and key2 point to "valid" (struct hfs_ext_key)s.
* Postconditions: * This function has no side-effects */ int hfs_ext_keycmp(const btree_key *key1, const btree_key *key2) { __be32 fnum1, fnum2; __be16 block1, block2; fnum1 = key1->ext.FNum; fnum2 = key2->ext.FNum; if (fnum1 != fnum2) return be32_to_cpu(fnum1) < be32_to_cpu(fnum2) ? -1 : 1; if (key1->ext.FkType != key2->ext.FkType) return key1->ext.FkType < key2->ext.FkType ? -1 : 1; block1 = key1->ext.FABN; block2 = key2->ext.FABN; if (block1 == block2) return 0; return be16_to_cpu(block1) < be16_to_cpu(block2) ? -1 : 1; } /* * hfs_ext_find_block * * Find a block within an extent record */ u16 hfs_ext_find_block(struct hfs_extent *ext, u16 off) { int i; u16 count; for (i = 0; i < 3; ext++, i++) { count = be16_to_cpu(ext->count); if (off < count) return be16_to_cpu(ext->block) + off; off -= count; } /* panic? */ return 0; } static int hfs_ext_block_count(struct hfs_extent *ext) { int i; u16 count = 0; for (i = 0; i < 3; ext++, i++) count += be16_to_cpu(ext->count); return count; } static u16 hfs_ext_lastblock(struct hfs_extent *ext) { int i; ext += 2; for (i = 0; i < 2; ext--, i++) if (ext->count) break; return be16_to_cpu(ext->block) + be16_to_cpu(ext->count); } static int __hfs_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) { int res; hfs_ext_build_key(fd->search_key, inode->i_ino, HFS_I(inode)->cached_start, HFS_IS_RSRC(inode) ? HFS_FK_RSRC : HFS_FK_DATA); res = hfs_brec_find(fd); if (HFS_I(inode)->flags & HFS_FLG_EXT_NEW) { if (res != -ENOENT) return res; /* Fail early and avoid ENOSPC during the btree operation */ res = hfs_bmap_reserve(fd->tree, fd->tree->depth + 1); if (res) return res; hfs_brec_insert(fd, HFS_I(inode)->cached_extents, sizeof(hfs_extent_rec)); HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW); } else { if (res) return res; hfs_bnode_write(fd->bnode, HFS_I(inode)->cached_extents, fd->entryoffset, fd->entrylength); HFS_I(inode)->flags &= ~HFS_FLG_EXT_DIRTY; } return 0; } int hfs_ext_write_extent(struct inode *inode) { struct hfs_find_data fd; int res = 0; if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY) { res = hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd); if (res) return res; res = __hfs_ext_write_extent(inode, &fd); hfs_find_exit(&fd); } return res; } static inline int __hfs_ext_read_extent(struct hfs_find_data *fd, struct hfs_extent *extent, u32 cnid, u32 block, u8 type) { int res; hfs_ext_build_key(fd->search_key, cnid, block, type); fd->key->ext.FNum = 0; res = hfs_brec_find(fd); if (res && res != -ENOENT) return res; if (fd->key->ext.FNum != fd->search_key->ext.FNum || fd->key->ext.FkType != fd->search_key->ext.FkType) return -ENOENT; if (fd->entrylength != sizeof(hfs_extent_rec)) return -EIO; hfs_bnode_read(fd->bnode, extent, fd->entryoffset, sizeof(hfs_extent_rec)); return 0; } static inline int __hfs_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block) { int res; if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY) { res = __hfs_ext_write_extent(inode, fd); if (res) return res; } res = __hfs_ext_read_extent(fd, HFS_I(inode)->cached_extents, inode->i_ino, block, HFS_IS_RSRC(inode) ? 
HFS_FK_RSRC : HFS_FK_DATA); if (!res) { HFS_I(inode)->cached_start = be16_to_cpu(fd->key->ext.FABN); HFS_I(inode)->cached_blocks = hfs_ext_block_count(HFS_I(inode)->cached_extents); } else { HFS_I(inode)->cached_start = HFS_I(inode)->cached_blocks = 0; HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW); } return res; } static int hfs_ext_read_extent(struct inode *inode, u16 block) { struct hfs_find_data fd; int res; if (block >= HFS_I(inode)->cached_start && block < HFS_I(inode)->cached_start + HFS_I(inode)->cached_blocks) return 0; res = hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd); if (!res) { res = __hfs_ext_cache_extent(&fd, inode, block); hfs_find_exit(&fd); } return res; } static void hfs_dump_extent(struct hfs_extent *extent) { int i; hfs_dbg("extent: "); for (i = 0; i < 3; i++) hfs_dbg(" block %u, count %u", be16_to_cpu(extent[i].block), be16_to_cpu(extent[i].count)); hfs_dbg("\n"); } static int hfs_add_extent(struct hfs_extent *extent, u16 offset, u16 alloc_block, u16 block_count) { u16 count, start; int i; hfs_dump_extent(extent); for (i = 0; i < 3; extent++, i++) { count = be16_to_cpu(extent->count); if (offset == count) { start = be16_to_cpu(extent->block); if (alloc_block != start + count) { if (++i >= 3) return -ENOSPC; extent++; extent->block = cpu_to_be16(alloc_block); } else block_count += count; extent->count = cpu_to_be16(block_count); return 0; } else if (offset < count) break; offset -= count; } /* panic? */ return -EIO; } static int hfs_free_extents(struct super_block *sb, struct hfs_extent *extent, u16 offset, u16 block_nr) { u16 count, start; int i; hfs_dump_extent(extent); for (i = 0; i < 3; extent++, i++) { count = be16_to_cpu(extent->count); if (offset == count) goto found; else if (offset < count) break; offset -= count; } /* panic? 
*/ return -EIO; found: for (;;) { start = be16_to_cpu(extent->block); if (count <= block_nr) { hfs_clear_vbm_bits(sb, start, count); extent->block = 0; extent->count = 0; block_nr -= count; } else { count -= block_nr; hfs_clear_vbm_bits(sb, start + count, block_nr); extent->count = cpu_to_be16(count); block_nr = 0; } if (!block_nr || !i) return 0; i--; extent--; count = be16_to_cpu(extent->count); } } int hfs_free_fork(struct super_block *sb, struct hfs_cat_file *file, int type) { struct hfs_find_data fd; u32 total_blocks, blocks, start; u32 cnid = be32_to_cpu(file->FlNum); struct hfs_extent *extent; int res, i; if (type == HFS_FK_DATA) { total_blocks = be32_to_cpu(file->PyLen); extent = file->ExtRec; } else { total_blocks = be32_to_cpu(file->RPyLen); extent = file->RExtRec; } total_blocks /= HFS_SB(sb)->alloc_blksz; if (!total_blocks) return 0; blocks = 0; for (i = 0; i < 3; i++) blocks += be16_to_cpu(extent[i].count); res = hfs_free_extents(sb, extent, blocks, blocks); if (res) return res; if (total_blocks == blocks) return 0; res = hfs_find_init(HFS_SB(sb)->ext_tree, &fd); if (res) return res; do { res = __hfs_ext_read_extent(&fd, extent, cnid, total_blocks, type); if (res) break; start = be16_to_cpu(fd.key->ext.FABN); hfs_free_extents(sb, extent, total_blocks - start, total_blocks); hfs_brec_remove(&fd); total_blocks = start; } while (total_blocks > blocks); hfs_find_exit(&fd); return res; } /* * hfs_get_block */ int hfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create) { struct super_block *sb; u16 dblock, ablock; int res; sb = inode->i_sb; /* Convert inode block to disk allocation block */ ablock = (u32)block / HFS_SB(sb)->fs_div; if (block >= HFS_I(inode)->fs_blocks) { if (!create) return 0; if (block > HFS_I(inode)->fs_blocks) return -EIO; if (ablock >= HFS_I(inode)->alloc_blocks) { res = hfs_extend_file(inode); if (res) return res; } } else create = 0; if (ablock < HFS_I(inode)->first_blocks) { dblock = hfs_ext_find_block(HFS_I(inode)->first_extents, ablock); goto done; } mutex_lock(&HFS_I(inode)->extents_lock); res = hfs_ext_read_extent(inode, ablock); if (!res) dblock = hfs_ext_find_block(HFS_I(inode)->cached_extents, ablock - HFS_I(inode)->cached_start); else { mutex_unlock(&HFS_I(inode)->extents_lock); return -EIO; } mutex_unlock(&HFS_I(inode)->extents_lock); done: map_bh(bh_result, sb, HFS_SB(sb)->fs_start + dblock * HFS_SB(sb)->fs_div + (u32)block % HFS_SB(sb)->fs_div); if (create) { set_buffer_new(bh_result); HFS_I(inode)->phys_size += sb->s_blocksize; HFS_I(inode)->fs_blocks++; inode_add_bytes(inode, sb->s_blocksize); mark_inode_dirty(inode); } return 0; } int hfs_extend_file(struct inode *inode) { struct super_block *sb = inode->i_sb; u32 start, len, goal; int res; mutex_lock(&HFS_I(inode)->extents_lock); if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) goal = hfs_ext_lastblock(HFS_I(inode)->first_extents); else { res = hfs_ext_read_extent(inode, HFS_I(inode)->alloc_blocks); if (res) goto out; goal = hfs_ext_lastblock(HFS_I(inode)->cached_extents); } len = HFS_I(inode)->clump_blocks; start = hfs_vbm_search_free(sb, goal, &len); if (!len) { res = -ENOSPC; goto out; } hfs_dbg("ino %llu, start %u, len %u\n", inode->i_ino, start, len); if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) { if (!HFS_I(inode)->first_blocks) { hfs_dbg("first_extent: start %u, len %u\n", start, len); /* no extents yet */ HFS_I(inode)->first_extents[0].block = cpu_to_be16(start); HFS_I(inode)->first_extents[0].count = cpu_to_be16(len); 
res = 0; } else { /* try to append to extents in inode */ res = hfs_add_extent(HFS_I(inode)->first_extents, HFS_I(inode)->alloc_blocks, start, len); if (res == -ENOSPC) goto insert_extent; } if (!res) { hfs_dump_extent(HFS_I(inode)->first_extents); HFS_I(inode)->first_blocks += len; } } else { res = hfs_add_extent(HFS_I(inode)->cached_extents, HFS_I(inode)->alloc_blocks - HFS_I(inode)->cached_start, start, len); if (!res) { hfs_dump_extent(HFS_I(inode)->cached_extents); HFS_I(inode)->flags |= HFS_FLG_EXT_DIRTY; HFS_I(inode)->cached_blocks += len; } else if (res == -ENOSPC) goto insert_extent; } out: mutex_unlock(&HFS_I(inode)->extents_lock); if (!res) { HFS_I(inode)->alloc_blocks += len; mark_inode_dirty(inode); if (inode->i_ino < HFS_FIRSTUSER_CNID) set_bit(HFS_FLG_ALT_MDB_DIRTY, &HFS_SB(sb)->flags); set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); hfs_mark_mdb_dirty(sb); } return res; insert_extent: hfs_dbg("insert new extent\n"); res = hfs_ext_write_extent(inode); if (res) goto out; memset(HFS_I(inode)->cached_extents, 0, sizeof(hfs_extent_rec)); HFS_I(inode)->cached_extents[0].block = cpu_to_be16(start); HFS_I(inode)->cached_extents[0].count = cpu_to_be16(len); hfs_dump_extent(HFS_I(inode)->cached_extents); HFS_I(inode)->flags |= HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW; HFS_I(inode)->cached_start = HFS_I(inode)->alloc_blocks; HFS_I(inode)->cached_blocks = len; res = 0; goto out; } void hfs_file_truncate(struct inode *inode) { struct super_block *sb = inode->i_sb; struct hfs_find_data fd; u16 blk_cnt, alloc_cnt, start; u32 size; int res; hfs_dbg("ino %llu, phys_size %llu -> i_size %llu\n", inode->i_ino, (long long)HFS_I(inode)->phys_size, inode->i_size); if (inode->i_size > HFS_I(inode)->phys_size) { struct address_space *mapping = inode->i_mapping; void *fsdata = NULL; struct folio *folio; /* XXX: Can use generic_cont_expand? 
*/ size = inode->i_size - 1; res = hfs_write_begin(NULL, mapping, size + 1, 0, &folio, &fsdata); if (!res) { res = generic_write_end(NULL, mapping, size + 1, 0, 0, folio, fsdata); } if (res) inode->i_size = HFS_I(inode)->phys_size; return; } else if (inode->i_size == HFS_I(inode)->phys_size) return; size = inode->i_size + HFS_SB(sb)->alloc_blksz - 1; blk_cnt = size / HFS_SB(sb)->alloc_blksz; alloc_cnt = HFS_I(inode)->alloc_blocks; if (blk_cnt == alloc_cnt) goto out; mutex_lock(&HFS_I(inode)->extents_lock); res = hfs_find_init(HFS_SB(sb)->ext_tree, &fd); if (res) { mutex_unlock(&HFS_I(inode)->extents_lock); /* XXX: We lack error handling of hfs_file_truncate() */ return; } while (1) { if (alloc_cnt == HFS_I(inode)->first_blocks) { hfs_free_extents(sb, HFS_I(inode)->first_extents, alloc_cnt, alloc_cnt - blk_cnt); hfs_dump_extent(HFS_I(inode)->first_extents); HFS_I(inode)->first_blocks = blk_cnt; break; } res = __hfs_ext_cache_extent(&fd, inode, alloc_cnt); if (res) break; start = HFS_I(inode)->cached_start; hfs_free_extents(sb, HFS_I(inode)->cached_extents, alloc_cnt - start, alloc_cnt - blk_cnt); hfs_dump_extent(HFS_I(inode)->cached_extents); if (blk_cnt > start) { HFS_I(inode)->flags |= HFS_FLG_EXT_DIRTY; break; } alloc_cnt = start; HFS_I(inode)->cached_start = HFS_I(inode)->cached_blocks = 0; HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW); hfs_brec_remove(&fd); } hfs_find_exit(&fd); mutex_unlock(&HFS_I(inode)->extents_lock); HFS_I(inode)->alloc_blocks = blk_cnt; out: HFS_I(inode)->phys_size = inode->i_size; HFS_I(inode)->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; inode_set_bytes(inode, HFS_I(inode)->fs_blocks << sb->s_blocksize_bits); mark_inode_dirty(inode); } |
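The core of the mapping logic above is the three-slot extent record: each slot describes count consecutive allocation blocks starting at block, and a file-relative block offset is resolved by skipping whole slots until it falls inside one, exactly as hfs_ext_find_block() does. The standalone, host-endian sketch below (not kernel code; the structure and example values are illustrative only) restates that walk and shows one worked lookup.

#include <stdio.h>

struct ext_slot {
	unsigned short block;	/* first allocation block of this run */
	unsigned short count;	/* number of consecutive blocks in the run */
};

/* Host-endian restatement of the hfs_ext_find_block() walk. */
static unsigned short find_block(const struct ext_slot *ext, unsigned short off)
{
	int i;

	for (i = 0; i < 3; i++, ext++) {
		if (off < ext->count)
			return ext->block + off;
		off -= ext->count;
	}
	return 0;	/* offset lies beyond this extent record */
}

int main(void)
{
	/* Hypothetical record: blocks 100..103 followed by 210..211. */
	struct ext_slot rec[3] = { { 100, 4 }, { 210, 2 }, { 0, 0 } };

	/* File block 5 skips the 4-block run, so it maps to 210 + 1 = 211. */
	printf("%u\n", find_block(rec, 5));
	return 0;
}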
/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (c) 2018 Facebook */ #ifndef _LINUX_BTF_H #define _LINUX_BTF_H 1 #include <linux/types.h> #include <linux/bpfptr.h> #include <linux/bsearch.h> #include <linux/btf_ids.h> #include <uapi/linux/btf.h> #include <uapi/linux/bpf.h> #define BTF_TYPE_EMIT(type) ((void)(type *)0) #define BTF_TYPE_EMIT_ENUM(enum_val) ((void)enum_val) /* These need to be macros, as the expressions are used in assembler input */ #define KF_ACQUIRE (1 << 0) /* kfunc is an acquire function */ #define KF_RELEASE (1 << 1) /* kfunc is a release function */ #define KF_RET_NULL (1 << 2) /* kfunc returns a pointer that may be NULL */ /* Trusted arguments are those which are guaranteed to be valid when passed to * the kfunc.
It is used to enforce that pointers obtained from either acquire * kfuncs, or from the main kernel on a tracepoint or struct_ops callback * invocation, remain unmodified when being passed to helpers taking trusted * args. * * Consider, for example, the following new task tracepoint: * * SEC("tp_btf/task_newtask") * int BPF_PROG(new_task_tp, struct task_struct *task, u64 clone_flags) * { * ... * } * * And the following kfunc: * * BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE) * * All invocations to the kfunc must pass the unmodified, unwalked task: * * bpf_task_acquire(task); // Allowed * bpf_task_acquire(task->last_wakee); // Rejected, walked task * * Programs may also pass referenced tasks directly to the kfunc: * * struct task_struct *acquired; * * acquired = bpf_task_acquire(task); // Allowed, same as above * bpf_task_acquire(acquired); // Allowed * bpf_task_acquire(task); // Allowed * bpf_task_acquire(acquired->last_wakee); // Rejected, walked task * * Programs may _not_, however, pass a task from an arbitrary fentry/fexit, or * kprobe/kretprobe to the kfunc, as BPF cannot guarantee that all of these * pointers are guaranteed to be safe. For example, the following BPF program * would be rejected: * * SEC("kretprobe/free_task") * int BPF_PROG(free_task_probe, struct task_struct *tsk) * { * struct task_struct *acquired; * * acquired = bpf_task_acquire(acquired); // Rejected, not a trusted pointer * bpf_task_release(acquired); * * return 0; * } */ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ #define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ #define KF_RCU (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */ /* only one of KF_ITER_{NEW,NEXT,DESTROY} could be specified per kfunc */ #define KF_ITER_NEW (1 << 8) /* kfunc implements BPF iter constructor */ #define KF_ITER_NEXT (1 << 9) /* kfunc implements BPF iter next method */ #define KF_ITER_DESTROY (1 << 10) /* kfunc implements BPF iter destructor */ #define KF_RCU_PROTECTED (1 << 11) /* kfunc should be protected by rcu cs when they are invoked */ #define KF_FASTCALL (1 << 12) /* kfunc supports bpf_fastcall protocol */ #define KF_ARENA_RET (1 << 13) /* kfunc returns an arena pointer */ #define KF_ARENA_ARG1 (1 << 14) /* kfunc takes an arena pointer as its first argument */ #define KF_ARENA_ARG2 (1 << 15) /* kfunc takes an arena pointer as its second argument */ #define KF_IMPLICIT_ARGS (1 << 16) /* kfunc has implicit arguments supplied by the verifier */ /* * Tag marking a kernel function as a kfunc. This is meant to minimize the * amount of copy-paste that kfunc authors have to include for correctness so * as to avoid issues such as the compiler inlining or eliding either a static * kfunc, or a global kfunc in an LTO build. */ #define __bpf_kfunc __used __retain __noclone noinline #define __bpf_kfunc_start_defs() \ __diag_push(); \ __diag_ignore_all("-Wmissing-declarations", \ "Global kfuncs as their definitions will be in BTF");\ __diag_ignore_all("-Wmissing-prototypes", \ "Global kfuncs as their definitions will be in BTF") #define __bpf_kfunc_end_defs() __diag_pop() #define __bpf_hook_start() __bpf_kfunc_start_defs() #define __bpf_hook_end() __bpf_kfunc_end_defs() /* * Return the name of the passed struct, if exists, or halt the build if for * example the structure gets renamed. In this way, developers have to revisit * the code using that structure name, and update it accordingly. 
*/ #define stringify_struct(x) \ ({ BUILD_BUG_ON(sizeof(struct x) < 0); \ __stringify(x); }) struct btf; struct btf_member; struct btf_type; union bpf_attr; struct btf_show; struct btf_id_set; struct bpf_prog; typedef int (*btf_kfunc_filter_t)(const struct bpf_prog *prog, u32 kfunc_id); struct btf_kfunc_id_set { struct module *owner; struct btf_id_set8 *set; btf_kfunc_filter_t filter; }; struct btf_id_dtor_kfunc { u32 btf_id; u32 kfunc_btf_id; }; struct btf_struct_meta { u32 btf_id; struct btf_record *record; }; struct btf_struct_metas { u32 cnt; struct btf_struct_meta types[]; }; extern const struct file_operations btf_fops; const char *btf_get_name(const struct btf *btf); void btf_get(struct btf *btf); void btf_put(struct btf *btf); const struct btf_header *btf_header(const struct btf *btf); int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_sz); struct btf *btf_get_by_fd(int fd); int btf_get_info_by_fd(const struct btf *btf, const union bpf_attr *attr, union bpf_attr __user *uattr); /* Figure out the size of a type_id. If type_id is a modifier * (e.g. const), it will be resolved to find out the type with size. * * For example: * In describing "const void *", type_id is "const" and "const" * refers to "void *". The return type will be "void *". * * If type_id is a simple "int", then return type will be "int". * * @btf: struct btf object * @type_id: Find out the size of type_id. The type_id of the return * type is set to *type_id. * @ret_size: It can be NULL. If not NULL, the size of the return * type is set to *ret_size. * Return: The btf_type (resolved to another type with size info if needed). * NULL is returned if type_id itself does not have size info * (e.g. void) or it cannot be resolved to another type that * has size info. * *type_id and *ret_size will not be changed in the * NULL return case. */ const struct btf_type *btf_type_id_size(const struct btf *btf, u32 *type_id, u32 *ret_size); /* * Options to control show behaviour. * - BTF_SHOW_COMPACT: no formatting around type information * - BTF_SHOW_NONAME: no struct/union member names/types * - BTF_SHOW_PTR_RAW: show raw (unobfuscated) pointer values; * equivalent to %px. * - BTF_SHOW_ZERO: show zero-valued struct/union members; they * are not displayed by default * - BTF_SHOW_UNSAFE: skip use of bpf_probe_read() to safely read * data before displaying it. */ #define BTF_SHOW_COMPACT BTF_F_COMPACT #define BTF_SHOW_NONAME BTF_F_NONAME #define BTF_SHOW_PTR_RAW BTF_F_PTR_RAW #define BTF_SHOW_ZERO BTF_F_ZERO #define BTF_SHOW_UNSAFE (1ULL << 4) void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m); int btf_type_seq_show_flags(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m, u64 flags); /* * Copy len bytes of string representation of obj of BTF type_id into buf. * * @btf: struct btf object * @type_id: type id of type obj points to * @obj: pointer to typed data * @buf: buffer to write to * @len: maximum length to write to buf * @flags: show options (see above) * * Return: length that would have been/was copied as per snprintf, or * negative error. 
*/ int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj, char *buf, int len, u64 flags); int btf_get_fd_by_id(u32 id); u32 btf_obj_id(const struct btf *btf); bool btf_is_kernel(const struct btf *btf); bool btf_is_module(const struct btf *btf); bool btf_is_vmlinux(const struct btf *btf); struct module *btf_try_get_module(const struct btf *btf); u32 btf_nr_types(const struct btf *btf); u32 btf_named_start_id(const struct btf *btf, bool own); struct btf *btf_base_btf(const struct btf *btf); bool btf_type_is_i32(const struct btf_type *t); bool btf_type_is_i64(const struct btf_type *t); bool btf_type_is_primitive(const struct btf_type *t); bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, const struct btf_member *m, u32 expected_offset, u32 expected_size); struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, u32 field_mask, u32 value_size); int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, u32 id, u32 *res_id); const struct btf_type *btf_type_resolve_ptr(const struct btf *btf, u32 id, u32 *res_id); const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf, u32 id, u32 *res_id); const struct btf_type * btf_resolve_size(const struct btf *btf, const struct btf_type *type, u32 *type_size); const char *btf_type_str(const struct btf_type *t); #define for_each_member(i, struct_type, member) \ for (i = 0, member = btf_type_member(struct_type); \ i < btf_type_vlen(struct_type); \ i++, member++) #define for_each_vsi(i, datasec_type, member) \ for (i = 0, member = btf_type_var_secinfo(datasec_type); \ i < btf_type_vlen(datasec_type); \ i++, member++) static inline bool btf_type_is_ptr(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_PTR; } static inline bool btf_type_is_int(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_INT; } static inline bool btf_type_is_small_int(const struct btf_type *t) { return btf_type_is_int(t) && t->size <= sizeof(u64); } static inline u8 btf_int_encoding(const struct btf_type *t) { return BTF_INT_ENCODING(*(u32 *)(t + 1)); } static inline bool btf_type_is_signed_int(const struct btf_type *t) { return btf_type_is_int(t) && (btf_int_encoding(t) & BTF_INT_SIGNED); } static inline bool btf_type_is_enum(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM; } static inline bool btf_is_any_enum(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM || BTF_INFO_KIND(t->info) == BTF_KIND_ENUM64; } static inline bool btf_kind_core_compat(const struct btf_type *t1, const struct btf_type *t2) { return BTF_INFO_KIND(t1->info) == BTF_INFO_KIND(t2->info) || (btf_is_any_enum(t1) && btf_is_any_enum(t2)); } static inline bool str_is_empty(const char *s) { return !s || !s[0]; } static inline u16 btf_kind(const struct btf_type *t) { return BTF_INFO_KIND(t->info); } static inline bool btf_is_enum(const struct btf_type *t) { return btf_kind(t) == BTF_KIND_ENUM; } static inline bool btf_is_enum64(const struct btf_type *t) { return btf_kind(t) == BTF_KIND_ENUM64; } static inline u64 btf_enum64_value(const struct btf_enum64 *e) { return ((u64)e->val_hi32 << 32) | e->val_lo32; } static inline bool btf_is_composite(const struct btf_type *t) { 
u16 kind = btf_kind(t); return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; } static inline bool btf_is_array(const struct btf_type *t) { return btf_kind(t) == BTF_KIND_ARRAY; } static inline bool btf_is_int(const struct btf_type *t) { return btf_kind(t) == BTF_KIND_INT; } static inline bool btf_is_ptr(const struct btf_type *t) { return btf_kind(t) == BTF_KIND_PTR; } static inline u8 btf_int_offset(const struct btf_type *t) { return BTF_INT_OFFSET(*(u32 *)(t + 1)); } static inline __u8 btf_int_bits(const struct btf_type *t) { return BTF_INT_BITS(*(__u32 *)(t + 1)); } static inline bool btf_type_is_scalar(const struct btf_type *t) { return btf_type_is_int(t) || btf_type_is_enum(t); } static inline bool btf_type_is_fwd(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_FWD; } static inline bool btf_type_is_typedef(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF; } static inline bool btf_type_is_volatile(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_VOLATILE; } static inline bool btf_type_is_func(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC; } static inline bool btf_type_is_func_proto(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO; } static inline bool btf_type_is_var(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; } static inline bool btf_type_is_type_tag(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_TYPE_TAG; } /* union is only a special case of struct: * all its offsetof(member) == 0 */ static inline bool btf_type_is_struct(const struct btf_type *t) { u8 kind = BTF_INFO_KIND(t->info); return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; } static inline bool __btf_type_is_struct(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT; } static inline bool btf_type_is_array(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; } static inline u16 btf_type_vlen(const struct btf_type *t) { return BTF_INFO_VLEN(t->info); } static inline u16 btf_vlen(const struct btf_type *t) { return btf_type_vlen(t); } static inline u16 btf_func_linkage(const struct btf_type *t) { return BTF_INFO_VLEN(t->info); } static inline bool btf_type_kflag(const struct btf_type *t) { return BTF_INFO_KFLAG(t->info); } static inline u32 __btf_member_bit_offset(const struct btf_type *struct_type, const struct btf_member *member) { return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset) : member->offset; } static inline u32 __btf_member_bitfield_size(const struct btf_type *struct_type, const struct btf_member *member) { return btf_type_kflag(struct_type) ? 
BTF_MEMBER_BITFIELD_SIZE(member->offset) : 0; } static inline struct btf_member *btf_members(const struct btf_type *t) { return (struct btf_member *)(t + 1); } static inline u32 btf_member_bit_offset(const struct btf_type *t, u32 member_idx) { const struct btf_member *m = btf_members(t) + member_idx; return __btf_member_bit_offset(t, m); } static inline u32 btf_member_bitfield_size(const struct btf_type *t, u32 member_idx) { const struct btf_member *m = btf_members(t) + member_idx; return __btf_member_bitfield_size(t, m); } static inline const struct btf_member *btf_type_member(const struct btf_type *t) { return (const struct btf_member *)(t + 1); } static inline struct btf_array *btf_array(const struct btf_type *t) { return (struct btf_array *)(t + 1); } static inline struct btf_enum *btf_enum(const struct btf_type *t) { return (struct btf_enum *)(t + 1); } static inline struct btf_enum64 *btf_enum64(const struct btf_type *t) { return (struct btf_enum64 *)(t + 1); } static inline const struct btf_var_secinfo *btf_type_var_secinfo( const struct btf_type *t) { return (const struct btf_var_secinfo *)(t + 1); } static inline struct btf_param *btf_params(const struct btf_type *t) { return (struct btf_param *)(t + 1); } static inline struct btf_decl_tag *btf_decl_tag(const struct btf_type *t) { return (struct btf_decl_tag *)(t + 1); } static inline int btf_id_cmp_func(const void *a, const void *b) { const int *pa = a, *pb = b; return *pa - *pb; } static inline bool btf_id_set_contains(const struct btf_id_set *set, u32 id) { return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; } static inline void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id) { return bsearch(&id, set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func); } bool btf_param_match_suffix(const struct btf *btf, const struct btf_param *arg, const char *suffix); int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto, u32 arg_no); u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, int off); struct bpf_verifier_log; #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) struct bpf_struct_ops; int __register_bpf_struct_ops(struct bpf_struct_ops *st_ops); const struct bpf_struct_ops_desc *bpf_struct_ops_find_value(struct btf *btf, u32 value_id); const struct bpf_struct_ops_desc *bpf_struct_ops_find(struct btf *btf, u32 type_id); #else static inline const struct bpf_struct_ops_desc *bpf_struct_ops_find(struct btf *btf, u32 type_id) { return NULL; } #endif enum btf_field_iter_kind { BTF_FIELD_ITER_IDS, BTF_FIELD_ITER_STRS, }; struct btf_field_desc { /* once-per-type offsets */ int t_off_cnt, t_offs[2]; /* member struct size, or zero, if no members */ int m_sz; /* repeated per-member offsets */ int m_off_cnt, m_offs[1]; }; struct btf_field_iter { struct btf_field_desc desc; void *p; int m_idx; int off_idx; int vlen; }; #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); void btf_set_base_btf(struct btf *btf, const struct btf *base_btf); int btf_relocate(struct btf *btf, const struct btf *base_btf, __u32 **map_ids); int btf_field_iter_init(struct btf_field_iter *it, struct btf_type *t, enum btf_field_iter_kind iter_kind); __u32 *btf_field_iter_next(struct btf_field_iter *it); const char *btf_name_by_offset(const struct btf *btf, u32 offset); const char *btf_str_by_offset(const struct btf *btf, u32 offset); struct btf *btf_parse_vmlinux(void); struct btf *bpf_prog_get_target_btf(const struct bpf_prog 
*prog); u32 *btf_kfunc_flags(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog); bool btf_kfunc_is_allowed(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog); u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog); int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, const struct btf_kfunc_id_set *s); int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset); s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id); int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner); struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id); bool btf_is_projection_of(const char *pname, const char *tname); bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, enum bpf_prog_type prog_type, int arg); int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type); bool btf_types_are_same(const struct btf *btf1, u32 id1, const struct btf *btf2, u32 id2); int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx); static inline bool btf_type_is_struct_ptr(struct btf *btf, const struct btf_type *t) { if (!btf_type_is_ptr(t)) return false; t = btf_type_skip_modifiers(btf, t->type, NULL); return btf_type_is_struct(t); } #else static inline const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) { return NULL; } static inline void btf_set_base_btf(struct btf *btf, const struct btf *base_btf) { } static inline int btf_relocate(void *log, struct btf *btf, const struct btf *base_btf, __u32 **map_ids) { return -EOPNOTSUPP; } static inline int btf_field_iter_init(struct btf_field_iter *it, struct btf_type *t, enum btf_field_iter_kind iter_kind) { return -EOPNOTSUPP; } static inline __u32 *btf_field_iter_next(struct btf_field_iter *it) { return NULL; } static inline const char *btf_name_by_offset(const struct btf *btf, u32 offset) { return NULL; } static inline u32 *btf_kfunc_id_set_contains(const struct btf *btf, u32 kfunc_btf_id, struct bpf_prog *prog) { return NULL; } static inline int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, const struct btf_kfunc_id_set *s) { return 0; } static inline s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id) { return -ENOENT; } static inline int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner) { return 0; } static inline struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id) { return NULL; } static inline bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, enum bpf_prog_type prog_type, int arg) { return false; } static inline int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type) { return -EINVAL; } static inline bool btf_types_are_same(const struct btf *btf1, u32 id1, const struct btf *btf2, u32 id2) { return false; } static inline int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx) { return -EOPNOTSUPP; } #endif #endif |
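The kfunc-related pieces of this header (the KF_* flags, struct btf_kfunc_id_set and register_btf_kfunc_id_set()) are normally combined with the BTF_KFUNCS_START/BTF_ID_FLAGS/BTF_KFUNCS_END macros from linux/btf_ids.h. A minimal sketch of that pattern follows; the bpf_foo_sum kfunc and the choice of BPF_PROG_TYPE_TRACING are assumptions made purely for illustration, and the exact set-building macro names can differ between kernel versions.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>

__bpf_kfunc_start_defs();

/* Hypothetical kfunc exposed to BPF programs. */
__bpf_kfunc int bpf_foo_sum(int a, int b)
{
	return a + b;
}

__bpf_kfunc_end_defs();

/* Build the BTF id set describing the kfunc (and any KF_* flags). */
BTF_KFUNCS_START(foo_kfunc_ids)
BTF_ID_FLAGS(func, bpf_foo_sum)
BTF_KFUNCS_END(foo_kfunc_ids)

static const struct btf_kfunc_id_set foo_kfunc_set = {
	.owner = THIS_MODULE,
	.set   = &foo_kfunc_ids,
};

static int __init foo_kfunc_init(void)
{
	/* Make the kfunc callable from tracing programs. */
	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &foo_kfunc_set);
}
late_initcall(foo_kfunc_init);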
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/bpf-cgroup.h>
#include <linux/cgroup.h>
#include <linux/rcupdate.h>
#include <linux/random.h>
#include <linux/smp.h>
#include <linux/topology.h>
#include <linux/ktime.h>
#include <linux/sched.h>
#include <linux/uidgid.h>
#include <linux/filter.h>
#include <linux/ctype.h>
#include <linux/jiffies.h>
#include <linux/pid_namespace.h>
#include <linux/poison.h>
#include <linux/proc_ns.h>
#include <linux/sched/task.h>
#include <linux/security.h>
#include <linux/btf_ids.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/kasan.h>
#include <linux/bpf_verifier.h>
#include <linux/uaccess.h>
#include <linux/verification.h>
#include <linux/task_work.h>
#include <linux/irq_work.h>
#include <linux/buildid.h>

#include "../../lib/kstrtox.h"

/* If kernel subsystem is allowing eBPF programs to call this function,
 * inside its own verifier_ops->get_func_proto() callback it should return
 * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments
 *
 * Different map implementations will rely on rcu in map methods
 * lookup/update/delete, therefore eBPF programs must run under rcu lock
 * if program is allowed to access maps, so check rcu_read_lock_held() or
rcu_read_lock_trace_held() in all three functions. */ BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) { WARN_ON_ONCE(!bpf_rcu_lock_held()); return (unsigned long) map->ops->map_lookup_elem(map, key); } const struct bpf_func_proto bpf_map_lookup_elem_proto = { .func = bpf_map_lookup_elem, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, }; BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, void *, value, u64, flags) { WARN_ON_ONCE(!bpf_rcu_lock_held()); return map->ops->map_update_elem(map, key, value, flags); } const struct bpf_func_proto bpf_map_update_elem_proto = { .func = bpf_map_update_elem, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, .arg3_type = ARG_PTR_TO_MAP_VALUE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) { WARN_ON_ONCE(!bpf_rcu_lock_held()); return map->ops->map_delete_elem(map, key); } const struct bpf_func_proto bpf_map_delete_elem_proto = { .func = bpf_map_delete_elem, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, }; BPF_CALL_3(bpf_map_push_elem, struct bpf_map *, map, void *, value, u64, flags) { return map->ops->map_push_elem(map, value, flags); } const struct bpf_func_proto bpf_map_push_elem_proto = { .func = bpf_map_push_elem, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_VALUE, .arg3_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value) { return map->ops->map_pop_elem(map, value); } const struct bpf_func_proto bpf_map_pop_elem_proto = { .func = bpf_map_pop_elem, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) { return map->ops->map_peek_elem(map, value); } const struct bpf_func_proto bpf_map_peek_elem_proto = { .func = bpf_map_peek_elem, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu) { WARN_ON_ONCE(!bpf_rcu_lock_held()); return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu); } const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto = { .func = bpf_map_lookup_percpu_elem, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, .arg3_type = ARG_ANYTHING, }; const struct bpf_func_proto bpf_get_prandom_u32_proto = { .func = bpf_user_rnd_u32, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_get_smp_processor_id) { return smp_processor_id(); } const struct bpf_func_proto bpf_get_smp_processor_id_proto = { .func = bpf_get_smp_processor_id, .gpl_only = false, .ret_type = RET_INTEGER, .allow_fastcall = true, }; BPF_CALL_0(bpf_get_numa_node_id) { return numa_node_id(); } const struct bpf_func_proto bpf_get_numa_node_id_proto = { .func = bpf_get_numa_node_id, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_ktime_get_ns) { /* NMI safe access to clock monotonic */ return ktime_get_mono_fast_ns(); } const struct bpf_func_proto bpf_ktime_get_ns_proto = { 
.func = bpf_ktime_get_ns, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_ktime_get_boot_ns) { /* NMI safe access to clock boottime */ return ktime_get_boot_fast_ns(); } const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = { .func = bpf_ktime_get_boot_ns, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_ktime_get_coarse_ns) { return ktime_get_coarse_ns(); } const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = { .func = bpf_ktime_get_coarse_ns, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_ktime_get_tai_ns) { /* NMI safe access to clock tai */ return ktime_get_tai_fast_ns(); } const struct bpf_func_proto bpf_ktime_get_tai_ns_proto = { .func = bpf_ktime_get_tai_ns, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_get_current_pid_tgid) { struct task_struct *task = current; if (unlikely(!task)) return -EINVAL; return (u64) task->tgid << 32 | task->pid; } const struct bpf_func_proto bpf_get_current_pid_tgid_proto = { .func = bpf_get_current_pid_tgid, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_get_current_uid_gid) { struct task_struct *task = current; kuid_t uid; kgid_t gid; if (unlikely(!task)) return -EINVAL; current_uid_gid(&uid, &gid); return (u64) from_kgid(&init_user_ns, gid) << 32 | from_kuid(&init_user_ns, uid); } const struct bpf_func_proto bpf_get_current_uid_gid_proto = { .func = bpf_get_current_uid_gid, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size) { struct task_struct *task = current; if (unlikely(!task)) goto err_clear; /* Verifier guarantees that size > 0 */ strscpy_pad(buf, task->comm, size); return 0; err_clear: memset(buf, 0, size); return -EINVAL; } const struct bpf_func_proto bpf_get_current_comm_proto = { .func = bpf_get_current_comm, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE, }; #if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK) static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) { arch_spinlock_t *l = (void *)lock; union { __u32 val; arch_spinlock_t lock; } u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED }; compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0"); BUILD_BUG_ON(sizeof(*l) != sizeof(__u32)); BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32)); preempt_disable(); arch_spin_lock(l); } static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) { arch_spinlock_t *l = (void *)lock; arch_spin_unlock(l); preempt_enable(); } #else static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) { atomic_t *l = (void *)lock; BUILD_BUG_ON(sizeof(*l) != sizeof(*lock)); do { atomic_cond_read_relaxed(l, !VAL); } while (atomic_xchg(l, 1)); } static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) { atomic_t *l = (void *)lock; atomic_set_release(l, 0); } #endif static DEFINE_PER_CPU(unsigned long, irqsave_flags); static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock) { unsigned long flags; local_irq_save(flags); __bpf_spin_lock(lock); __this_cpu_write(irqsave_flags, flags); } NOTRACE_BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) { __bpf_spin_lock_irqsave(lock); return 0; } const struct bpf_func_proto bpf_spin_lock_proto = { .func = bpf_spin_lock, .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_SPIN_LOCK, .arg1_btf_id = BPF_PTR_POISON, }; static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock) { unsigned long flags; flags = __this_cpu_read(irqsave_flags); 
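	/*
	 * Undo __bpf_spin_lock_irqsave(): drop the lock first, then restore
	 * the IRQ state that was stashed in the per-CPU irqsave_flags slot.
	 */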
__bpf_spin_unlock(lock); local_irq_restore(flags); } NOTRACE_BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) { __bpf_spin_unlock_irqrestore(lock); return 0; } const struct bpf_func_proto bpf_spin_unlock_proto = { .func = bpf_spin_unlock, .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_SPIN_LOCK, .arg1_btf_id = BPF_PTR_POISON, }; void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src) { struct bpf_spin_lock *lock; if (lock_src) lock = src + map->record->spin_lock_off; else lock = dst + map->record->spin_lock_off; preempt_disable(); __bpf_spin_lock_irqsave(lock); copy_map_value(map, dst, src); __bpf_spin_unlock_irqrestore(lock); preempt_enable(); } BPF_CALL_0(bpf_jiffies64) { return get_jiffies_64(); } const struct bpf_func_proto bpf_jiffies64_proto = { .func = bpf_jiffies64, .gpl_only = false, .ret_type = RET_INTEGER, }; #ifdef CONFIG_CGROUPS BPF_CALL_0(bpf_get_current_cgroup_id) { struct cgroup *cgrp; u64 cgrp_id; rcu_read_lock(); cgrp = task_dfl_cgroup(current); cgrp_id = cgroup_id(cgrp); rcu_read_unlock(); return cgrp_id; } const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { .func = bpf_get_current_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_1(bpf_get_current_ancestor_cgroup_id, int, ancestor_level) { struct cgroup *cgrp; struct cgroup *ancestor; u64 cgrp_id; rcu_read_lock(); cgrp = task_dfl_cgroup(current); ancestor = cgroup_ancestor(cgrp, ancestor_level); cgrp_id = ancestor ? cgroup_id(ancestor) : 0; rcu_read_unlock(); return cgrp_id; } const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = { .func = bpf_get_current_ancestor_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, }; #endif /* CONFIG_CGROUPS */ #define BPF_STRTOX_BASE_MASK 0x1F static int __bpf_strtoull(const char *buf, size_t buf_len, u64 flags, unsigned long long *res, bool *is_negative) { unsigned int base = flags & BPF_STRTOX_BASE_MASK; const char *cur_buf = buf; size_t cur_len = buf_len; unsigned int consumed; size_t val_len; char str[64]; if (!buf || !buf_len || !res || !is_negative) return -EINVAL; if (base != 0 && base != 8 && base != 10 && base != 16) return -EINVAL; if (flags & ~BPF_STRTOX_BASE_MASK) return -EINVAL; while (cur_buf < buf + buf_len && isspace(*cur_buf)) ++cur_buf; *is_negative = (cur_buf < buf + buf_len && *cur_buf == '-'); if (*is_negative) ++cur_buf; consumed = cur_buf - buf; cur_len -= consumed; if (!cur_len) return -EINVAL; cur_len = min(cur_len, sizeof(str) - 1); memcpy(str, cur_buf, cur_len); str[cur_len] = '\0'; cur_buf = str; cur_buf = _parse_integer_fixup_radix(cur_buf, &base); val_len = _parse_integer(cur_buf, base, res); if (val_len & KSTRTOX_OVERFLOW) return -ERANGE; if (val_len == 0) return -EINVAL; cur_buf += val_len; consumed += cur_buf - str; return consumed; } static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags, long long *res) { unsigned long long _res; bool is_negative; int err; err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); if (err < 0) return err; if (is_negative) { if ((long long)-_res > 0) return -ERANGE; *res = -_res; } else { if ((long long)_res < 0) return -ERANGE; *res = _res; } return err; } BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags, s64 *, res) { long long _res; int err; *res = 0; err = __bpf_strtoll(buf, buf_len, flags, &_res); if (err < 0) return err; *res = _res; return err; } const struct bpf_func_proto bpf_strtol_proto = { .func = bpf_strtol, .gpl_only = false, .ret_type 
= RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = sizeof(s64), }; BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags, u64 *, res) { unsigned long long _res; bool is_negative; int err; *res = 0; err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); if (err < 0) return err; if (is_negative) return -EINVAL; *res = _res; return err; } const struct bpf_func_proto bpf_strtoul_proto = { .func = bpf_strtoul, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = sizeof(u64), }; BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2) { return strncmp(s1, s2, s1_sz); } static const struct bpf_func_proto bpf_strncmp_proto = { .func = bpf_strncmp, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_PTR_TO_CONST_STR, }; BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino, struct bpf_pidns_info *, nsdata, u32, size) { struct task_struct *task = current; struct pid_namespace *pidns; int err = -EINVAL; if (unlikely(size != sizeof(struct bpf_pidns_info))) goto clear; if (unlikely((u64)(dev_t)dev != dev)) goto clear; if (unlikely(!task)) goto clear; pidns = task_active_pid_ns(task); if (unlikely(!pidns)) { err = -ENOENT; goto clear; } if (!ns_match(&pidns->ns, (dev_t)dev, ino)) goto clear; nsdata->pid = task_pid_nr_ns(task, pidns); nsdata->tgid = task_tgid_nr_ns(task, pidns); return 0; clear: memset((void *)nsdata, 0, (size_t) size); return err; } const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = { .func = bpf_get_ns_current_pid_tgid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { .func = bpf_get_raw_cpu_id, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, u64, flags, void *, data, u64, size) { if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; return bpf_event_output(map, flags, data, size, NULL, 0, NULL); } const struct bpf_func_proto bpf_event_output_data_proto = { .func = bpf_event_output_data, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size, const void __user *, user_ptr) { int ret = copy_from_user(dst, user_ptr, size); if (unlikely(ret)) { memset(dst, 0, size); ret = -EFAULT; } return ret; } const struct bpf_func_proto bpf_copy_from_user_proto = { .func = bpf_copy_from_user, .gpl_only = false, .might_sleep = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size, const void __user *, user_ptr, struct task_struct *, tsk, u64, flags) { int ret; /* flags is not used yet */ if (unlikely(flags)) return -EINVAL; if (unlikely(!size)) return 0; ret = access_process_vm(tsk, (unsigned long)user_ptr, dst, size, 0); if (ret == 
size) return 0; memset(dst, 0, size); /* Return -EFAULT for partial read */ return ret < 0 ? ret : -EFAULT; } const struct bpf_func_proto bpf_copy_from_user_task_proto = { .func = bpf_copy_from_user_task, .gpl_only = true, .might_sleep = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_BTF_ID, .arg4_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .arg5_type = ARG_ANYTHING }; BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) { if (cpu >= nr_cpu_ids) return (unsigned long)NULL; return (unsigned long)per_cpu_ptr((const void __percpu *)(const uintptr_t)ptr, cpu); } const struct bpf_func_proto bpf_per_cpu_ptr_proto = { .func = bpf_per_cpu_ptr, .gpl_only = false, .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, .arg2_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr) { return (unsigned long)this_cpu_ptr((const void __percpu *)(const uintptr_t)percpu_ptr); } const struct bpf_func_proto bpf_this_cpu_ptr_proto = { .func = bpf_this_cpu_ptr, .gpl_only = false, .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, }; static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, size_t bufsz) { void __user *user_ptr = (__force void __user *)unsafe_ptr; buf[0] = 0; switch (fmt_ptype) { case 's': #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE if ((unsigned long)unsafe_ptr < TASK_SIZE) return strncpy_from_user_nofault(buf, user_ptr, bufsz); fallthrough; #endif case 'k': return strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz); case 'u': return strncpy_from_user_nofault(buf, user_ptr, bufsz); } return -EINVAL; } /* Support executing three nested bprintf helper calls on a given CPU */ #define MAX_BPRINTF_NEST_LEVEL 3 static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs); static DEFINE_PER_CPU(int, bpf_bprintf_nest_level); int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs) { int nest_level; preempt_disable(); nest_level = this_cpu_inc_return(bpf_bprintf_nest_level); if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) { this_cpu_dec(bpf_bprintf_nest_level); preempt_enable(); return -EBUSY; } *bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]); return 0; } void bpf_put_buffers(void) { if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0)) return; this_cpu_dec(bpf_bprintf_nest_level); preempt_enable(); } void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) { if (!data->bin_args && !data->buf) return; bpf_put_buffers(); } /* * bpf_bprintf_prepare - Generic pass on format strings for bprintf-like helpers * * Returns a negative value if fmt is an invalid format string or 0 otherwise. * * This can be used in two ways: * - Format string verification only: when data->get_bin_args is false * - Arguments preparation: in addition to the above verification, it writes in * data->bin_args a binary representation of arguments usable by bstr_printf * where pointers from BPF have been sanitized. * * In argument preparation mode, if 0 is returned, safe temporary buffers are * allocated and bpf_bprintf_cleanup should be called to free them after use. 
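 *
 * A minimal caller sketch for the argument-preparation mode (this mirrors
 * what bpf_snprintf() below does; the buffer names are illustrative):
 *
 *	struct bpf_bprintf_data data = { .get_bin_args = true };
 *	int err;
 *
 *	err = bpf_bprintf_prepare(fmt, fmt_size, raw_args, num_args, &data);
 *	if (err < 0)
 *		return err;
 *	err = bstr_printf(buf, buf_size, fmt, data.bin_args);
 *	bpf_bprintf_cleanup(&data);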
*/ int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args, u32 num_args, struct bpf_bprintf_data *data) { bool get_buffers = (data->get_bin_args && num_args) || data->get_buf; char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end; struct bpf_bprintf_buffers *buffers = NULL; size_t sizeof_cur_arg, sizeof_cur_ip; int err, i, num_spec = 0; u64 cur_arg; char fmt_ptype, cur_ip[16], ip_spec[] = "%pXX"; fmt_end = strnchr(fmt, fmt_size, 0); if (!fmt_end) return -EINVAL; fmt_size = fmt_end - fmt; if (get_buffers && bpf_try_get_buffers(&buffers)) return -EBUSY; if (data->get_bin_args) { if (num_args) tmp_buf = buffers->bin_args; tmp_buf_end = tmp_buf + MAX_BPRINTF_BIN_ARGS; data->bin_args = (u32 *)tmp_buf; } if (data->get_buf) data->buf = buffers->buf; for (i = 0; i < fmt_size; i++) { unsigned char c = fmt[i]; /* * Permit bytes >= 0x80 in plain text so UTF-8 literals can pass * through unchanged, while still rejecting ASCII control bytes. */ if (isascii(c) && !isprint(c) && !isspace(c)) { err = -EINVAL; goto out; } if (fmt[i] != '%') continue; if (fmt[i + 1] == '%') { i++; continue; } if (num_spec >= num_args) { err = -EINVAL; goto out; } /* The string is zero-terminated so if fmt[i] != 0, we can * always access fmt[i + 1], in the worst case it will be a 0 */ i++; c = fmt[i]; /* * The format parser below only understands ASCII conversion * specifiers and modifiers, so reject non-ASCII after '%'. */ if (!isascii(c)) { err = -EINVAL; goto out; } /* skip optional "[0 +-][num]" width formatting field */ while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || fmt[i] == ' ') i++; if (fmt[i] >= '1' && fmt[i] <= '9') { i++; while (fmt[i] >= '0' && fmt[i] <= '9') i++; } if (fmt[i] == 'p') { sizeof_cur_arg = sizeof(long); if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) || ispunct(fmt[i + 1])) { if (tmp_buf) cur_arg = raw_args[num_spec]; goto nocopy_fmt; } if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') && fmt[i + 2] == 's') { fmt_ptype = fmt[i + 1]; i += 2; goto fmt_str; } if (fmt[i + 1] == 'K' || fmt[i + 1] == 'x' || fmt[i + 1] == 's' || fmt[i + 1] == 'S') { if (tmp_buf) cur_arg = raw_args[num_spec]; i++; goto nocopy_fmt; } if (fmt[i + 1] == 'B') { if (tmp_buf) { err = snprintf(tmp_buf, (tmp_buf_end - tmp_buf), "%pB", (void *)(long)raw_args[num_spec]); tmp_buf += (err + 1); } i++; num_spec++; continue; } /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */ if ((fmt[i + 1] != 'i' && fmt[i + 1] != 'I') || (fmt[i + 2] != '4' && fmt[i + 2] != '6')) { err = -EINVAL; goto out; } i += 2; if (!tmp_buf) goto nocopy_fmt; sizeof_cur_ip = (fmt[i] == '4') ? 4 : 16; if (tmp_buf_end - tmp_buf < sizeof_cur_ip) { err = -ENOSPC; goto out; } unsafe_ptr = (char *)(long)raw_args[num_spec]; err = copy_from_kernel_nofault(cur_ip, unsafe_ptr, sizeof_cur_ip); if (err < 0) memset(cur_ip, 0, sizeof_cur_ip); /* hack: bstr_printf expects IP addresses to be * pre-formatted as strings, ironically, the easiest way * to do that is to call snprintf. 
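 * E.g. for a "%pI4" spec the four address bytes were copied into cur_ip
 * above and are rendered into tmp_buf here as text, so bstr_printf() only
 * ever consumes the pre-formatted string, never the BPF-supplied pointer.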
*/ ip_spec[2] = fmt[i - 1]; ip_spec[3] = fmt[i]; err = snprintf(tmp_buf, tmp_buf_end - tmp_buf, ip_spec, &cur_ip); tmp_buf += err + 1; num_spec++; continue; } else if (fmt[i] == 's') { fmt_ptype = fmt[i]; fmt_str: if (fmt[i + 1] != 0 && !isspace(fmt[i + 1]) && !ispunct(fmt[i + 1])) { err = -EINVAL; goto out; } if (!tmp_buf) goto nocopy_fmt; if (tmp_buf_end == tmp_buf) { err = -ENOSPC; goto out; } unsafe_ptr = (char *)(long)raw_args[num_spec]; err = bpf_trace_copy_string(tmp_buf, unsafe_ptr, fmt_ptype, tmp_buf_end - tmp_buf); if (err < 0) { tmp_buf[0] = '\0'; err = 1; } tmp_buf += err; num_spec++; continue; } else if (fmt[i] == 'c') { if (!tmp_buf) goto nocopy_fmt; if (tmp_buf_end == tmp_buf) { err = -ENOSPC; goto out; } *tmp_buf = raw_args[num_spec]; tmp_buf++; num_spec++; continue; } sizeof_cur_arg = sizeof(int); if (fmt[i] == 'l') { sizeof_cur_arg = sizeof(long); i++; } if (fmt[i] == 'l') { sizeof_cur_arg = sizeof(long long); i++; } if (fmt[i] != 'i' && fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x' && fmt[i] != 'X') { err = -EINVAL; goto out; } if (tmp_buf) cur_arg = raw_args[num_spec]; nocopy_fmt: if (tmp_buf) { tmp_buf = PTR_ALIGN(tmp_buf, sizeof(u32)); if (tmp_buf_end - tmp_buf < sizeof_cur_arg) { err = -ENOSPC; goto out; } if (sizeof_cur_arg == 8) { *(u32 *)tmp_buf = *(u32 *)&cur_arg; *(u32 *)(tmp_buf + 4) = *((u32 *)&cur_arg + 1); } else { *(u32 *)tmp_buf = (u32)(long)cur_arg; } tmp_buf += sizeof_cur_arg; } num_spec++; } err = 0; out: if (err) bpf_bprintf_cleanup(data); return err; } BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt, const void *, args, u32, data_len) { struct bpf_bprintf_data data = { .get_bin_args = true, }; int err, num_args; if (data_len % 8 || data_len > MAX_BPRINTF_VARARGS * 8 || (data_len && !args)) return -EINVAL; num_args = data_len / 8; /* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we * can safely give an unbounded size. */ err = bpf_bprintf_prepare(fmt, UINT_MAX, args, num_args, &data); if (err < 0) return err; err = bstr_printf(str, str_size, fmt, data.bin_args); bpf_bprintf_cleanup(&data); return err + 1; } const struct bpf_func_proto bpf_snprintf_proto = { .func = bpf_snprintf, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM_OR_NULL | MEM_WRITE, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_PTR_TO_CONST_STR, .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx) { if (map->map_type == BPF_MAP_TYPE_ARRAY) { struct bpf_array *array = container_of(map, struct bpf_array, map); *arr_idx = ((char *)value - array->value) / array->elem_size; return arr_idx; } return (void *)value - round_up(map->key_size, 8); } enum bpf_async_type { BPF_ASYNC_TYPE_TIMER = 0, BPF_ASYNC_TYPE_WQ, }; enum bpf_async_op { BPF_ASYNC_START, BPF_ASYNC_CANCEL }; struct bpf_async_cmd { struct llist_node node; u64 nsec; u32 mode; enum bpf_async_op op; }; struct bpf_async_cb { struct bpf_map *map; struct bpf_prog *prog; void __rcu *callback_fn; void *value; struct rcu_head rcu; u64 flags; struct irq_work worker; refcount_t refcnt; enum bpf_async_type type; struct llist_head async_cmds; }; /* BPF map elements can contain 'struct bpf_timer'. * Such map owns all of its BPF timers. * 'struct bpf_timer' is allocated as part of map element allocation * and it's zero initialized. * That space is used to keep 'struct bpf_async_kern'. 
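 *
 * From the BPF program side the expected sequence is roughly (illustrative
 * sketch; 'val' is a map value embedding a struct bpf_timer and 'timer_cb'
 * is the program's callback):
 *
 *	bpf_timer_init(&val->t, &map, CLOCK_MONOTONIC);
 *	bpf_timer_set_callback(&val->t, timer_cb);
 *	bpf_timer_start(&val->t, nsecs, 0);
 *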
* bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and * remembers 'struct bpf_map *' pointer it's part of. * bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn. * bpf_timer_start() arms the timer. * If user space reference to a map goes to zero at this point * ops->map_release_uref callback is responsible for cancelling the timers, * freeing their memory, and decrementing prog's refcnts. * bpf_timer_cancel() cancels the timer and decrements prog's refcnt. * Inner maps can contain bpf timers as well. ops->map_release_uref is * freeing the timers when inner map is replaced or deleted by user space. */ struct bpf_hrtimer { struct bpf_async_cb cb; struct hrtimer timer; atomic_t cancelling; }; struct bpf_work { struct bpf_async_cb cb; struct work_struct work; }; /* the actual struct hidden inside uapi struct bpf_timer and bpf_wq */ struct bpf_async_kern { union { struct bpf_async_cb *cb; struct bpf_hrtimer *timer; struct bpf_work *work; }; } __attribute__((aligned(8))); static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running); static void bpf_async_refcount_put(struct bpf_async_cb *cb); static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) { struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer); struct bpf_map *map = t->cb.map; void *value = t->cb.value; bpf_callback_t callback_fn; void *key; u32 idx; BTF_TYPE_EMIT(struct bpf_timer); callback_fn = rcu_dereference_check(t->cb.callback_fn, rcu_read_lock_bh_held()); if (!callback_fn) goto out; /* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and * cannot be preempted by another bpf_timer_cb() on the same cpu. * Remember the timer this callback is servicing to prevent * deadlock if callback_fn() calls bpf_timer_cancel() or * bpf_map_delete_elem() on the same timer. */ this_cpu_write(hrtimer_running, t); key = map_key_from_value(map, value, &idx); callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0); /* The verifier checked that return value is zero. */ this_cpu_write(hrtimer_running, NULL); out: return HRTIMER_NORESTART; } static void bpf_wq_work(struct work_struct *work) { struct bpf_work *w = container_of(work, struct bpf_work, work); struct bpf_async_cb *cb = &w->cb; struct bpf_map *map = cb->map; bpf_callback_t callback_fn; void *value = cb->value; void *key; u32 idx; BTF_TYPE_EMIT(struct bpf_wq); callback_fn = READ_ONCE(cb->callback_fn); if (!callback_fn) return; key = map_key_from_value(map, value, &idx); rcu_read_lock_trace(); migrate_disable(); callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0); migrate_enable(); rcu_read_unlock_trace(); } static void bpf_async_cb_rcu_free(struct rcu_head *rcu) { struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu); /* * Drop the last reference to prog only after RCU GP, as set_callback() * may race with cancel_and_free() */ if (cb->prog) bpf_prog_put(cb->prog); kfree_nolock(cb); } /* Callback from call_rcu_tasks_trace, chains to call_rcu for final free */ static void bpf_async_cb_rcu_tasks_trace_free(struct rcu_head *rcu) { struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu); struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb); struct bpf_work *w = container_of(cb, struct bpf_work, cb); bool retry = false; /* * bpf_async_cancel_and_free() tried to cancel timer/wq, but it * could have raced with timer/wq_start. Now refcnt is zero and * srcu/rcu GP completed. Cancel timer/wq again. 
*/ switch (cb->type) { case BPF_ASYNC_TYPE_TIMER: if (hrtimer_try_to_cancel(&t->timer) < 0) retry = true; break; case BPF_ASYNC_TYPE_WQ: if (!cancel_work(&w->work) && work_busy(&w->work)) retry = true; break; } if (retry) { /* * hrtimer or wq callback may still be running. It must be * in rcu_tasks_trace or rcu CS, so wait for GP again. * It won't retry forever, since refcnt zero prevents all * operations on timer/wq. */ call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free); return; } /* RCU Tasks Trace grace period implies RCU grace period. */ bpf_async_cb_rcu_free(rcu); } static void worker_for_call_rcu(struct irq_work *work) { struct bpf_async_cb *cb = container_of(work, struct bpf_async_cb, worker); call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free); } static void bpf_async_refcount_put(struct bpf_async_cb *cb) { if (!refcount_dec_and_test(&cb->refcnt)) return; if (irqs_disabled()) { cb->worker = IRQ_WORK_INIT(worker_for_call_rcu); irq_work_queue(&cb->worker); } else { call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free); } } static void bpf_async_cancel_and_free(struct bpf_async_kern *async); static void bpf_async_irq_worker(struct irq_work *work); static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags, enum bpf_async_type type) { struct bpf_async_cb *cb, *old_cb; struct bpf_hrtimer *t; struct bpf_work *w; clockid_t clockid; size_t size; switch (type) { case BPF_ASYNC_TYPE_TIMER: size = sizeof(struct bpf_hrtimer); break; case BPF_ASYNC_TYPE_WQ: size = sizeof(struct bpf_work); break; default: return -EINVAL; } old_cb = READ_ONCE(async->cb); if (old_cb) return -EBUSY; cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node); if (!cb) return -ENOMEM; switch (type) { case BPF_ASYNC_TYPE_TIMER: clockid = flags & (MAX_CLOCKS - 1); t = (struct bpf_hrtimer *)cb; atomic_set(&t->cancelling, 0); hrtimer_setup(&t->timer, bpf_timer_cb, clockid, HRTIMER_MODE_REL_SOFT); cb->value = (void *)async - map->record->timer_off; break; case BPF_ASYNC_TYPE_WQ: w = (struct bpf_work *)cb; INIT_WORK(&w->work, bpf_wq_work); cb->value = (void *)async - map->record->wq_off; break; } cb->map = map; cb->prog = NULL; cb->flags = flags; cb->worker = IRQ_WORK_INIT(bpf_async_irq_worker); init_llist_head(&cb->async_cmds); refcount_set(&cb->refcnt, 1); /* map's reference */ cb->type = type; rcu_assign_pointer(cb->callback_fn, NULL); old_cb = cmpxchg(&async->cb, NULL, cb); if (old_cb) { /* Lost the race to initialize this bpf_async_kern, drop the allocated object */ kfree_nolock(cb); return -EBUSY; } /* Guarantee the order between async->cb and map->usercnt. So * when there are concurrent uref release and bpf timer init, either * bpf_timer_cancel_and_free() called by uref release reads a no-NULL * timer or atomic64_read() below returns a zero usercnt. */ smp_mb(); if (!atomic64_read(&map->usercnt)) { /* maps with timers must be either held by user space * or pinned in bpffs. 
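 * If the map's usercnt has already dropped to zero, no such reference can
 * exist anymore, so undo the initialization and fail with -EPERM.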
*/ bpf_async_cancel_and_free(async); return -EPERM; } return 0; } BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map, u64, flags) { clock_t clockid = flags & (MAX_CLOCKS - 1); BUILD_BUG_ON(MAX_CLOCKS != 16); BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer)); BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer)); if (flags >= MAX_CLOCKS || /* similar to timerfd except _ALARM variants are not supported */ (clockid != CLOCK_MONOTONIC && clockid != CLOCK_REALTIME && clockid != CLOCK_BOOTTIME)) return -EINVAL; return __bpf_async_init(timer, map, flags, BPF_ASYNC_TYPE_TIMER); } static const struct bpf_func_proto bpf_timer_init_proto = { .func = bpf_timer_init, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_TIMER, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; static int bpf_async_update_prog_callback(struct bpf_async_cb *cb, struct bpf_prog *prog, void *callback_fn) { struct bpf_prog *prev; /* Acquire a guard reference on prog to prevent it from being freed during the loop */ if (prog) { prog = bpf_prog_inc_not_zero(prog); if (IS_ERR(prog)) return PTR_ERR(prog); } do { if (prog) prog = bpf_prog_inc_not_zero(prog); prev = xchg(&cb->prog, prog); rcu_assign_pointer(cb->callback_fn, callback_fn); /* * Release previous prog, make sure that if other CPU is contending, * to set bpf_prog, references are not leaked as each iteration acquires and * releases one reference. */ if (prev) bpf_prog_put(prev); } while (READ_ONCE(cb->prog) != prog || (void __force *)READ_ONCE(cb->callback_fn) != callback_fn); if (prog) bpf_prog_put(prog); return 0; } static DEFINE_PER_CPU(struct bpf_async_cb *, async_cb_running); static int bpf_async_schedule_op(struct bpf_async_cb *cb, enum bpf_async_op op, u64 nsec, u32 timer_mode) { /* * Do not schedule another operation on this cpu if it's in irq_work * callback that is processing async_cmds queue. Otherwise the following * loop is possible: * bpf_timer_start() -> bpf_async_schedule_op() -> irq_work_queue(). * irqrestore -> bpf_async_irq_worker() -> tracepoint -> bpf_timer_start(). 
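 * Returning -EDEADLK below breaks that potential recursion.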
*/ if (this_cpu_read(async_cb_running) == cb) { bpf_async_refcount_put(cb); return -EDEADLK; } struct bpf_async_cmd *cmd = kmalloc_nolock(sizeof(*cmd), 0, NUMA_NO_NODE); if (!cmd) { bpf_async_refcount_put(cb); return -ENOMEM; } init_llist_node(&cmd->node); cmd->nsec = nsec; cmd->mode = timer_mode; cmd->op = op; if (llist_add(&cmd->node, &cb->async_cmds)) irq_work_queue(&cb->worker); return 0; } static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn, struct bpf_prog *prog) { struct bpf_async_cb *cb; cb = READ_ONCE(async->cb); if (!cb) return -EINVAL; return bpf_async_update_prog_callback(cb, prog, callback_fn); } BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn, struct bpf_prog_aux *, aux) { return __bpf_async_set_callback(timer, callback_fn, aux->prog); } static const struct bpf_func_proto bpf_timer_set_callback_proto = { .func = bpf_timer_set_callback, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_TIMER, .arg2_type = ARG_PTR_TO_FUNC, }; static bool defer_timer_wq_op(void) { return in_hardirq() || irqs_disabled(); } BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, async, u64, nsecs, u64, flags) { struct bpf_hrtimer *t; u32 mode; if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN)) return -EINVAL; t = READ_ONCE(async->timer); if (!t || !READ_ONCE(t->cb.prog)) return -EINVAL; if (flags & BPF_F_TIMER_ABS) mode = HRTIMER_MODE_ABS_SOFT; else mode = HRTIMER_MODE_REL_SOFT; if (flags & BPF_F_TIMER_CPU_PIN) mode |= HRTIMER_MODE_PINNED; /* * bpf_async_cancel_and_free() could have dropped refcnt to zero. In * such case BPF progs are not allowed to arm the timer to prevent UAF. */ if (!refcount_inc_not_zero(&t->cb.refcnt)) return -ENOENT; if (!defer_timer_wq_op()) { hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode); bpf_async_refcount_put(&t->cb); return 0; } else { return bpf_async_schedule_op(&t->cb, BPF_ASYNC_START, nsecs, mode); } } static const struct bpf_func_proto bpf_timer_start_proto = { .func = bpf_timer_start, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_TIMER, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, async) { struct bpf_hrtimer *t, *cur_t; bool inc = false; int ret = 0; if (defer_timer_wq_op()) return -EOPNOTSUPP; t = READ_ONCE(async->timer); if (!t) return -EINVAL; cur_t = this_cpu_read(hrtimer_running); if (cur_t == t) { /* If bpf callback_fn is trying to bpf_timer_cancel() * its own timer the hrtimer_cancel() will deadlock * since it waits for callback_fn to finish. */ return -EDEADLK; } /* Only account in-flight cancellations when invoked from a timer * callback, since we want to avoid waiting only if other _callbacks_ * are waiting on us, to avoid introducing lockups. Non-callback paths * are ok, since nobody would synchronously wait for their completion. */ if (!cur_t) goto drop; atomic_inc(&t->cancelling); /* Need full barrier after relaxed atomic_inc */ smp_mb__after_atomic(); inc = true; if (atomic_read(&cur_t->cancelling)) { /* We're cancelling timer t, while some other timer callback is * attempting to cancel us. In such a case, it might be possible * that timer t belongs to the other callback, or some other * callback waiting upon it (creating transitive dependencies * upon us), and we will enter a deadlock if we continue * cancelling and waiting for it synchronously, since it might * do the same. Bail! 
*/ atomic_dec(&t->cancelling); return -EDEADLK; } drop: bpf_async_update_prog_callback(&t->cb, NULL, NULL); /* Cancel the timer and wait for associated callback to finish * if it was running. */ ret = hrtimer_cancel(&t->timer); if (inc) atomic_dec(&t->cancelling); return ret; } static const struct bpf_func_proto bpf_timer_cancel_proto = { .func = bpf_timer_cancel, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_TIMER, }; static void bpf_async_process_op(struct bpf_async_cb *cb, u32 op, u64 timer_nsec, u32 timer_mode) { switch (cb->type) { case BPF_ASYNC_TYPE_TIMER: { struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb); switch (op) { case BPF_ASYNC_START: hrtimer_start(&t->timer, ns_to_ktime(timer_nsec), timer_mode); break; case BPF_ASYNC_CANCEL: hrtimer_try_to_cancel(&t->timer); break; } break; } case BPF_ASYNC_TYPE_WQ: { struct bpf_work *w = container_of(cb, struct bpf_work, cb); switch (op) { case BPF_ASYNC_START: schedule_work(&w->work); break; case BPF_ASYNC_CANCEL: cancel_work(&w->work); break; } break; } } bpf_async_refcount_put(cb); } static void bpf_async_irq_worker(struct irq_work *work) { struct bpf_async_cb *cb = container_of(work, struct bpf_async_cb, worker); struct llist_node *pos, *n, *list; list = llist_del_all(&cb->async_cmds); if (!list) return; list = llist_reverse_order(list); this_cpu_write(async_cb_running, cb); llist_for_each_safe(pos, n, list) { struct bpf_async_cmd *cmd; cmd = container_of(pos, struct bpf_async_cmd, node); bpf_async_process_op(cb, cmd->op, cmd->nsec, cmd->mode); kfree_nolock(cmd); } this_cpu_write(async_cb_running, NULL); } static void bpf_async_cancel_and_free(struct bpf_async_kern *async) { struct bpf_async_cb *cb; if (!READ_ONCE(async->cb)) return; cb = xchg(&async->cb, NULL); if (!cb) return; bpf_async_update_prog_callback(cb, NULL, NULL); /* * No refcount_inc_not_zero(&cb->refcnt) here. Dropping the last * refcnt. Either synchronously or asynchronously in irq_work. */ if (!defer_timer_wq_op()) { bpf_async_process_op(cb, BPF_ASYNC_CANCEL, 0, 0); } else { (void)bpf_async_schedule_op(cb, BPF_ASYNC_CANCEL, 0, 0); /* * bpf_async_schedule_op() either enqueues allocated cmd into llist * or fails with ENOMEM and drop the last refcnt. * This is unlikely, but safe, since bpf_async_cb_rcu_tasks_trace_free() * callback will do additional timer/wq_cancel due to races anyway. */ } } /* * This function is called by map_delete/update_elem for individual element and * by ops->map_release_uref when the user space reference to a map reaches zero. */ void bpf_timer_cancel_and_free(void *val) { bpf_async_cancel_and_free(val); } /* * This function is called by map_delete/update_elem for individual element and * by ops->map_release_uref when the user space reference to a map reaches zero. */ void bpf_wq_cancel_and_free(void *val) { bpf_async_cancel_and_free(val); } BPF_CALL_2(bpf_kptr_xchg, void *, dst, void *, ptr) { unsigned long *kptr = dst; /* This helper may be inlined by verifier. */ return xchg(kptr, (unsigned long)ptr); } /* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg() * helper is determined dynamically by the verifier. Use BPF_PTR_POISON to * denote type that verifier will determine. 
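 *
 * Typical BPF-side usage (illustrative sketch; 'val' is a map value with a
 * field declared as a __kptr and 'obj' was obtained e.g. from bpf_obj_new()):
 *
 *	old = bpf_kptr_xchg(&val->ptr, obj);
 *	if (old)
 *		bpf_obj_drop(old);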
*/ static const struct bpf_func_proto bpf_kptr_xchg_proto = { .func = bpf_kptr_xchg, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .ret_btf_id = BPF_PTR_POISON, .arg1_type = ARG_KPTR_XCHG_DEST, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE, .arg2_btf_id = BPF_PTR_POISON, }; struct bpf_dynptr_file_impl { struct freader freader; /* 64 bit offset and size overriding 32 bit ones in bpf_dynptr_kern */ u64 offset; u64 size; }; /* Since the upper 8 bits of dynptr->size is reserved, the * maximum supported size is 2^24 - 1. */ #define DYNPTR_MAX_SIZE ((1UL << 24) - 1) #define DYNPTR_TYPE_SHIFT 28 #define DYNPTR_SIZE_MASK 0xFFFFFF #define DYNPTR_RDONLY_BIT BIT(31) bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_RDONLY_BIT; } void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr) { ptr->size |= DYNPTR_RDONLY_BIT; } static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type) { ptr->size |= type << DYNPTR_TYPE_SHIFT; } static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *ptr) { return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT; } u64 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr) { if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) { struct bpf_dynptr_file_impl *df = ptr->data; return df->size; } return ptr->size & DYNPTR_SIZE_MASK; } static void bpf_dynptr_advance_offset(struct bpf_dynptr_kern *ptr, u64 off) { if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) { struct bpf_dynptr_file_impl *df = ptr->data; df->offset += off; return; } ptr->offset += off; } static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u64 new_size) { u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK; if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) { struct bpf_dynptr_file_impl *df = ptr->data; df->size = new_size; return; } ptr->size = (u32)new_size | metadata; } int bpf_dynptr_check_size(u64 size) { return size > DYNPTR_MAX_SIZE ? 
-E2BIG : 0; } static int bpf_file_fetch_bytes(struct bpf_dynptr_file_impl *df, u64 offset, void *buf, u64 len) { const void *ptr; if (!buf) return -EINVAL; df->freader.buf = buf; df->freader.buf_sz = len; ptr = freader_fetch(&df->freader, offset + df->offset, len); if (!ptr) return df->freader.err; if (ptr != buf) /* Force copying into the buffer */ memcpy(buf, ptr, len); return 0; } void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size) { ptr->data = data; ptr->offset = offset; ptr->size = size; bpf_dynptr_set_type(ptr, type); } void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) { memset(ptr, 0, sizeof(*ptr)); } BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u64, size, u64, flags, struct bpf_dynptr_kern *, ptr) { int err; BTF_TYPE_EMIT(struct bpf_dynptr); err = bpf_dynptr_check_size(size); if (err) goto error; /* flags is currently unsupported */ if (flags) { err = -EINVAL; goto error; } bpf_dynptr_init(ptr, data, BPF_DYNPTR_TYPE_LOCAL, 0, size); return 0; error: bpf_dynptr_set_null(ptr); return err; } static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .func = bpf_dynptr_from_mem, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE, }; static int __bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr_kern *src, u64 offset, u64 flags) { enum bpf_dynptr_type type; int err; if (!src->data || flags) return -EINVAL; err = bpf_dynptr_check_off_len(src, offset, len); if (err) return err; type = bpf_dynptr_get_type(src); switch (type) { case BPF_DYNPTR_TYPE_LOCAL: case BPF_DYNPTR_TYPE_RINGBUF: /* Source and destination may possibly overlap, hence use memmove to * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr * pointing to overlapping PTR_TO_MAP_VALUE regions. */ memmove(dst, src->data + src->offset + offset, len); return 0; case BPF_DYNPTR_TYPE_SKB: return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len); case BPF_DYNPTR_TYPE_XDP: return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len); case BPF_DYNPTR_TYPE_SKB_META: memmove(dst, bpf_skb_meta_pointer(src->data, src->offset + offset), len); return 0; case BPF_DYNPTR_TYPE_FILE: return bpf_file_fetch_bytes(src->data, offset, dst, len); default: WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type); return -EFAULT; } } BPF_CALL_5(bpf_dynptr_read, void *, dst, u64, len, const struct bpf_dynptr_kern *, src, u64, offset, u64, flags) { return __bpf_dynptr_read(dst, len, src, offset, flags); } static const struct bpf_func_proto bpf_dynptr_read_proto = { .func = bpf_dynptr_read, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, void *src, u64 len, u64 flags) { enum bpf_dynptr_type type; int err; if (!dst->data || __bpf_dynptr_is_rdonly(dst)) return -EINVAL; err = bpf_dynptr_check_off_len(dst, offset, len); if (err) return err; type = bpf_dynptr_get_type(dst); switch (type) { case BPF_DYNPTR_TYPE_LOCAL: case BPF_DYNPTR_TYPE_RINGBUF: if (flags) return -EINVAL; /* Source and destination may possibly overlap, hence use memmove to * copy the data. E.g. 
bpf_dynptr_from_mem may create two dynptr * pointing to overlapping PTR_TO_MAP_VALUE regions. */ memmove(dst->data + dst->offset + offset, src, len); return 0; case BPF_DYNPTR_TYPE_SKB: return __bpf_skb_store_bytes(dst->data, dst->offset + offset, src, len, flags); case BPF_DYNPTR_TYPE_XDP: if (flags) return -EINVAL; return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len); case BPF_DYNPTR_TYPE_SKB_META: return __bpf_skb_meta_store_bytes(dst->data, dst->offset + offset, src, len, flags); default: WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type); return -EFAULT; } } BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u64, offset, void *, src, u64, len, u64, flags) { return __bpf_dynptr_write(dst, offset, src, len, flags); } static const struct bpf_func_proto bpf_dynptr_write_proto = { .func = bpf_dynptr_write, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u64, offset, u64, len) { enum bpf_dynptr_type type; int err; if (!ptr->data) return 0; err = bpf_dynptr_check_off_len(ptr, offset, len); if (err) return 0; if (__bpf_dynptr_is_rdonly(ptr)) return 0; type = bpf_dynptr_get_type(ptr); switch (type) { case BPF_DYNPTR_TYPE_LOCAL: case BPF_DYNPTR_TYPE_RINGBUF: return (unsigned long)(ptr->data + ptr->offset + offset); case BPF_DYNPTR_TYPE_SKB: case BPF_DYNPTR_TYPE_XDP: case BPF_DYNPTR_TYPE_SKB_META: /* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */ return 0; default: WARN_ONCE(true, "bpf_dynptr_data: unknown dynptr type %d\n", type); return 0; } } static const struct bpf_func_proto bpf_dynptr_data_proto = { .func = bpf_dynptr_data, .gpl_only = false, .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL, .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, }; const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_get_current_task_btf_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; const struct bpf_func_proto bpf_task_pt_regs_proto __weak; const struct bpf_func_proto bpf_perf_event_read_proto __weak; const struct bpf_func_proto bpf_send_signal_proto __weak; const struct bpf_func_proto bpf_send_signal_thread_proto __weak; const struct bpf_func_proto bpf_get_task_stack_sleepable_proto __weak; const struct bpf_func_proto bpf_get_task_stack_proto __weak; const struct bpf_func_proto bpf_get_branch_snapshot_proto __weak; const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: return &bpf_map_lookup_elem_proto; case BPF_FUNC_map_update_elem: return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; case BPF_FUNC_map_push_elem: return &bpf_map_push_elem_proto; case BPF_FUNC_map_pop_elem: return &bpf_map_pop_elem_proto; case BPF_FUNC_map_peek_elem: return &bpf_map_peek_elem_proto; case BPF_FUNC_map_lookup_percpu_elem: return &bpf_map_lookup_percpu_elem_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case 
BPF_FUNC_get_smp_processor_id: return &bpf_get_raw_smp_processor_id_proto; case BPF_FUNC_get_numa_node_id: return &bpf_get_numa_node_id_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_ktime_get_tai_ns: return &bpf_ktime_get_tai_ns_proto; case BPF_FUNC_ringbuf_output: return &bpf_ringbuf_output_proto; case BPF_FUNC_ringbuf_reserve: return &bpf_ringbuf_reserve_proto; case BPF_FUNC_ringbuf_submit: return &bpf_ringbuf_submit_proto; case BPF_FUNC_ringbuf_discard: return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; case BPF_FUNC_strncmp: return &bpf_strncmp_proto; case BPF_FUNC_strtol: return &bpf_strtol_proto; case BPF_FUNC_strtoul: return &bpf_strtoul_proto; case BPF_FUNC_get_current_pid_tgid: return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_ns_current_pid_tgid: return &bpf_get_ns_current_pid_tgid_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; default: break; } if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return NULL; switch (func_id) { case BPF_FUNC_spin_lock: return &bpf_spin_lock_proto; case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; case BPF_FUNC_per_cpu_ptr: return &bpf_per_cpu_ptr_proto; case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; case BPF_FUNC_timer_init: return &bpf_timer_init_proto; case BPF_FUNC_timer_set_callback: return &bpf_timer_set_callback_proto; case BPF_FUNC_timer_start: return &bpf_timer_start_proto; case BPF_FUNC_timer_cancel: return &bpf_timer_cancel_proto; case BPF_FUNC_kptr_xchg: return &bpf_kptr_xchg_proto; case BPF_FUNC_for_each_map_elem: return &bpf_for_each_map_elem_proto; case BPF_FUNC_loop: return &bpf_loop_proto; case BPF_FUNC_user_ringbuf_drain: return &bpf_user_ringbuf_drain_proto; case BPF_FUNC_ringbuf_reserve_dynptr: return &bpf_ringbuf_reserve_dynptr_proto; case BPF_FUNC_ringbuf_submit_dynptr: return &bpf_ringbuf_submit_dynptr_proto; case BPF_FUNC_ringbuf_discard_dynptr: return &bpf_ringbuf_discard_dynptr_proto; case BPF_FUNC_dynptr_from_mem: return &bpf_dynptr_from_mem_proto; case BPF_FUNC_dynptr_read: return &bpf_dynptr_read_proto; case BPF_FUNC_dynptr_write: return &bpf_dynptr_write_proto; case BPF_FUNC_dynptr_data: return &bpf_dynptr_data_proto; #ifdef CONFIG_CGROUPS case BPF_FUNC_cgrp_storage_get: return &bpf_cgrp_storage_get_proto; case BPF_FUNC_cgrp_storage_delete: return &bpf_cgrp_storage_delete_proto; case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_get_current_ancestor_cgroup_id: return &bpf_get_current_ancestor_cgroup_id_proto; case BPF_FUNC_current_task_under_cgroup: return &bpf_current_task_under_cgroup_proto; #endif #ifdef CONFIG_CGROUP_NET_CLASSID case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_curr_proto; #endif case BPF_FUNC_task_storage_get: return &bpf_task_storage_get_proto; case BPF_FUNC_task_storage_delete: return &bpf_task_storage_delete_proto; default: break; } if (!bpf_token_capable(prog->aux->token, CAP_PERFMON)) return NULL; switch (func_id) { case BPF_FUNC_trace_printk: return bpf_get_trace_printk_proto(); case BPF_FUNC_get_current_task: return &bpf_get_current_task_proto; case BPF_FUNC_get_current_task_btf: return &bpf_get_current_task_btf_proto; case BPF_FUNC_get_current_comm: return &bpf_get_current_comm_proto; case BPF_FUNC_probe_read_user: 
return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_kernel_proto; case BPF_FUNC_probe_read_user_str: return &bpf_probe_read_user_str_proto; case BPF_FUNC_probe_read_kernel_str: return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_kernel_str_proto; case BPF_FUNC_copy_from_user: return &bpf_copy_from_user_proto; case BPF_FUNC_copy_from_user_task: return &bpf_copy_from_user_task_proto; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; case BPF_FUNC_snprintf: return &bpf_snprintf_proto; case BPF_FUNC_task_pt_regs: return &bpf_task_pt_regs_proto; case BPF_FUNC_trace_vprintk: return bpf_get_trace_vprintk_proto(); case BPF_FUNC_perf_event_read_value: return bpf_get_perf_event_read_value_proto(); case BPF_FUNC_perf_event_read: return &bpf_perf_event_read_proto; case BPF_FUNC_send_signal: return &bpf_send_signal_proto; case BPF_FUNC_send_signal_thread: return &bpf_send_signal_thread_proto; case BPF_FUNC_get_task_stack: return prog->sleepable ? &bpf_get_task_stack_sleepable_proto : &bpf_get_task_stack_proto; case BPF_FUNC_get_branch_snapshot: return &bpf_get_branch_snapshot_proto; case BPF_FUNC_find_vma: return &bpf_find_vma_proto; default: return NULL; } } EXPORT_SYMBOL_GPL(bpf_base_func_proto); void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock) { struct list_head *head = list_head, *orig_head = list_head; BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head)); BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head)); /* Do the actual list draining outside the lock to not hold the lock for * too long, and also prevent deadlocks if tracing programs end up * executing on entry/exit of functions called inside the critical * section, and end up doing map ops that call bpf_list_head_free for * the same map value again. */ __bpf_spin_lock_irqsave(spin_lock); if (!head->next || list_empty(head)) goto unlock; head = head->next; unlock: INIT_LIST_HEAD(orig_head); __bpf_spin_unlock_irqrestore(spin_lock); while (head != orig_head) { void *obj = head; obj -= field->graph_root.node_offset; head = head->next; /* The contained type can also have resources, including a * bpf_list_head which needs to be freed. */ __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); } } /* Like rbtree_postorder_for_each_entry_safe, but 'pos' and 'n' are * 'rb_node *', so field name of rb_node within containing struct is not * needed. 
* * Since bpf_rb_tree's node type has a corresponding struct btf_field with * graph_root.node_offset, it's not necessary to know field name * or type of node struct */ #define bpf_rbtree_postorder_for_each_entry_safe(pos, n, root) \ for (pos = rb_first_postorder(root); \ pos && ({ n = rb_next_postorder(pos); 1; }); \ pos = n) void bpf_rb_root_free(const struct btf_field *field, void *rb_root, struct bpf_spin_lock *spin_lock) { struct rb_root_cached orig_root, *root = rb_root; struct rb_node *pos, *n; void *obj; BUILD_BUG_ON(sizeof(struct rb_root_cached) > sizeof(struct bpf_rb_root)); BUILD_BUG_ON(__alignof__(struct rb_root_cached) > __alignof__(struct bpf_rb_root)); __bpf_spin_lock_irqsave(spin_lock); orig_root = *root; *root = RB_ROOT_CACHED; __bpf_spin_unlock_irqrestore(spin_lock); bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) { obj = pos; obj -= field->graph_root.node_offset; __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); } } __bpf_kfunc_start_defs(); /** * bpf_obj_new() - allocate an object described by program BTF * @local_type_id__k: type ID in program BTF * @meta: verifier-supplied struct metadata * * Allocate an object of the type identified by @local_type_id__k and * initialize its special fields. BPF programs can use * bpf_core_type_id_local() to provide @local_type_id__k. The verifier * rewrites @meta; BPF programs do not set it. * * Return: Pointer to the allocated object, or %NULL on failure. */ __bpf_kfunc void *bpf_obj_new(u64 local_type_id__k, struct btf_struct_meta *meta) { u64 size = local_type_id__k; void *p; p = bpf_mem_alloc(&bpf_global_ma, size); if (!p) return NULL; if (meta) bpf_obj_init(meta->record, p); return p; } __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) { return bpf_obj_new(local_type_id__k, meta__ign); } /** * bpf_percpu_obj_new() - allocate a percpu object described by program BTF * @local_type_id__k: type ID in program BTF * @meta: verifier-supplied struct metadata * * Allocate a percpu object of the type identified by @local_type_id__k. BPF * programs can use bpf_core_type_id_local() to provide @local_type_id__k. * The verifier rewrites @meta; BPF programs do not set it. * * Return: Pointer to the allocated percpu object, or %NULL on failure. */ __bpf_kfunc void *bpf_percpu_obj_new(u64 local_type_id__k, struct btf_struct_meta *meta) { u64 size = local_type_id__k; /* The verifier has ensured that meta must be NULL */ return bpf_mem_alloc(&bpf_global_percpu_ma, size); } __bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign) { return bpf_percpu_obj_new(local_type_id__k, meta__ign); } /* Must be called under migrate_disable(), as required by bpf_mem_free */ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu) { struct bpf_mem_alloc *ma; if (rec && rec->refcount_off >= 0 && !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) { /* Object is refcounted and refcount_dec didn't result in 0 * refcount. Return without freeing the object */ return; } if (rec) bpf_obj_free_fields(rec, p); if (percpu) ma = &bpf_global_percpu_ma; else ma = &bpf_global_ma; bpf_mem_free_rcu(ma, p); } /** * bpf_obj_drop() - drop a previously allocated object * @p__alloc: object to free * @meta: verifier-supplied struct metadata * * Destroy special fields in @p__alloc as needed and free the object. The * verifier rewrites @meta; BPF programs do not set it. 
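 *
 * A minimal allocate/free sketch from the BPF program side, assuming the
 * bpf_obj_new()/bpf_obj_drop() convenience macros from bpf_experimental.h
 * and a hypothetical program-local type "struct foo":
 *
 *	struct foo *f = bpf_obj_new(struct foo);
 *
 *	if (!f)
 *		return 0;
 *	... use the object ...
 *	bpf_obj_drop(f);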
*/ __bpf_kfunc void bpf_obj_drop(void *p__alloc, struct btf_struct_meta *meta) { void *p = p__alloc; __bpf_obj_drop_impl(p, meta ? meta->record : NULL, false); } __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) { return bpf_obj_drop(p__alloc, meta__ign); } /** * bpf_percpu_obj_drop() - drop a previously allocated percpu object * @p__alloc: percpu object to free * @meta: verifier-supplied struct metadata * * Free @p__alloc. The verifier rewrites @meta; BPF programs do not set it. */ __bpf_kfunc void bpf_percpu_obj_drop(void *p__alloc, struct btf_struct_meta *meta) { /* The verifier has ensured that meta must be NULL */ bpf_mem_free_rcu(&bpf_global_percpu_ma, p__alloc); } __bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign) { bpf_percpu_obj_drop(p__alloc, meta__ign); } /** * bpf_refcount_acquire() - turn a local kptr into an owning reference * @p__refcounted_kptr: non-owning local kptr * @meta: verifier-supplied struct metadata * * Increment the refcount for @p__refcounted_kptr. The verifier rewrites * @meta; BPF programs do not set it. * * Return: Owning reference to @p__refcounted_kptr, or %NULL on failure. */ __bpf_kfunc void *bpf_refcount_acquire(void *p__refcounted_kptr, struct btf_struct_meta *meta) { struct bpf_refcount *ref; /* Could just cast directly to refcount_t *, but need some code using * bpf_refcount type so that it is emitted in vmlinux BTF */ ref = (struct bpf_refcount *)(p__refcounted_kptr + meta->record->refcount_off); if (!refcount_inc_not_zero((refcount_t *)ref)) return NULL; /* Verifier strips KF_RET_NULL if input is owned ref, see is_kfunc_ret_null * in verifier.c */ return (void *)p__refcounted_kptr; } __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign) { return bpf_refcount_acquire(p__refcounted_kptr, meta__ign); } static int __bpf_list_add(struct bpf_list_node_kern *node, struct bpf_list_head *head, bool tail, struct btf_record *rec, u64 off) { struct list_head *n = &node->list_head, *h = (void *)head; /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't * called on its fields, so init here */ if (unlikely(!h->next)) INIT_LIST_HEAD(h); /* node->owner != NULL implies !list_empty(n), no need to separately * check the latter */ if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) { /* Only called from BPF prog, no need to migrate_disable */ __bpf_obj_drop_impl((void *)n - off, rec, false); return -EINVAL; } tail ? list_add_tail(n, h) : list_add(n, h); WRITE_ONCE(node->owner, head); return 0; } /** * bpf_list_push_front() - add a node to the front of a BPF linked list * @head: list head * @node: node to insert * @meta: verifier-supplied struct metadata * @off: verifier-supplied offset of @node within the containing object * * Insert @node at the front of @head. The verifier rewrites @meta and @off; * BPF programs do not set them. * * Return: 0 on success, or %-EINVAL if @node is already linked. */ __bpf_kfunc int bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node, struct btf_struct_meta *meta, u64 off) { struct bpf_list_node_kern *n = (void *)node; return __bpf_list_add(n, head, false, meta ? 
meta->record : NULL, off); } __bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head, struct bpf_list_node *node, void *meta__ign, u64 off) { return bpf_list_push_front(head, node, meta__ign, off); } /** * bpf_list_push_back() - add a node to the back of a BPF linked list * @head: list head * @node: node to insert * @meta: verifier-supplied struct metadata * @off: verifier-supplied offset of @node within the containing object * * Insert @node at the back of @head. The verifier rewrites @meta and @off; * BPF programs do not set them. * * Return: 0 on success, or %-EINVAL if @node is already linked. */ __bpf_kfunc int bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node, struct btf_struct_meta *meta, u64 off) { struct bpf_list_node_kern *n = (void *)node; return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off); } __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head, struct bpf_list_node *node, void *meta__ign, u64 off) { return bpf_list_push_back(head, node, meta__ign, off); } static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail) { struct list_head *n, *h = (void *)head; struct bpf_list_node_kern *node; /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't * called on its fields, so init here */ if (unlikely(!h->next)) INIT_LIST_HEAD(h); if (list_empty(h)) return NULL; n = tail ? h->prev : h->next; node = container_of(n, struct bpf_list_node_kern, list_head); if (WARN_ON_ONCE(READ_ONCE(node->owner) != head)) return NULL; list_del_init(n); WRITE_ONCE(node->owner, NULL); return (struct bpf_list_node *)n; } __bpf_kfunc struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) { return __bpf_list_del(head, false); } __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) { return __bpf_list_del(head, true); } __bpf_kfunc struct bpf_list_node *bpf_list_front(struct bpf_list_head *head) { struct list_head *h = (struct list_head *)head; if (list_empty(h) || unlikely(!h->next)) return NULL; return (struct bpf_list_node *)h->next; } __bpf_kfunc struct bpf_list_node *bpf_list_back(struct bpf_list_head *head) { struct list_head *h = (struct list_head *)head; if (list_empty(h) || unlikely(!h->next)) return NULL; return (struct bpf_list_node *)h->prev; } __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, struct bpf_rb_node *node) { struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node; struct rb_root_cached *r = (struct rb_root_cached *)root; struct rb_node *n = &node_internal->rb_node; /* node_internal->owner != root implies either RB_EMPTY_NODE(n) or * n is owned by some other tree. 
No need to check RB_EMPTY_NODE(n) */ if (READ_ONCE(node_internal->owner) != root) return NULL; rb_erase_cached(n, r); RB_CLEAR_NODE(n); WRITE_ONCE(node_internal->owner, NULL); return (struct bpf_rb_node *)n; } /* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF * program */ static int __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node_kern *node, void *less, struct btf_record *rec, u64 off) { struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node; struct rb_node *parent = NULL, *n = &node->rb_node; bpf_callback_t cb = (bpf_callback_t)less; bool leftmost = true; /* node->owner != NULL implies !RB_EMPTY_NODE(n), no need to separately * check the latter */ if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) { /* Only called from BPF prog, no need to migrate_disable */ __bpf_obj_drop_impl((void *)n - off, rec, false); return -EINVAL; } while (*link) { parent = *link; if (cb((uintptr_t)node, (uintptr_t)parent, 0, 0, 0)) { link = &parent->rb_left; } else { link = &parent->rb_right; leftmost = false; } } rb_link_node(n, parent, link); rb_insert_color_cached(n, (struct rb_root_cached *)root, leftmost); WRITE_ONCE(node->owner, root); return 0; } /** * bpf_rbtree_add() - add a node to a BPF rbtree * @root: tree root * @node: node to insert * @less: comparator used to order nodes * @meta: verifier-supplied struct metadata * @off: verifier-supplied offset of @node within the containing object * * Insert @node into @root using @less. The verifier rewrites @meta and @off; * BPF programs do not set them. * * Return: 0 on success, or %-EINVAL if @node is already linked in a tree. */ __bpf_kfunc int bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), struct btf_struct_meta *meta, u64 off) { struct bpf_rb_node_kern *n = (void *)node; return __bpf_rbtree_add(root, n, (void *)less, meta ? meta->record : NULL, off); } __bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), void *meta__ign, u64 off) { return bpf_rbtree_add(root, node, less, meta__ign, off); } __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) { struct rb_root_cached *r = (struct rb_root_cached *)root; return (struct bpf_rb_node *)rb_first_cached(r); } __bpf_kfunc struct bpf_rb_node *bpf_rbtree_root(struct bpf_rb_root *root) { struct rb_root_cached *r = (struct rb_root_cached *)root; return (struct bpf_rb_node *)r->rb_root.rb_node; } __bpf_kfunc struct bpf_rb_node *bpf_rbtree_left(struct bpf_rb_root *root, struct bpf_rb_node *node) { struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node; if (READ_ONCE(node_internal->owner) != root) return NULL; return (struct bpf_rb_node *)node_internal->rb_node.rb_left; } __bpf_kfunc struct bpf_rb_node *bpf_rbtree_right(struct bpf_rb_root *root, struct bpf_rb_node *node) { struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node; if (READ_ONCE(node_internal->owner) != root) return NULL; return (struct bpf_rb_node *)node_internal->rb_node.rb_right; } /** * bpf_task_acquire - Acquire a reference to a task. A task acquired by this * kfunc which is not stored in a map as a kptr, must be released by calling * bpf_task_release(). * @p: The task on which a reference is being acquired. 
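 *
 * A minimal acquire/use/release sketch from a BPF program, assuming @p is a
 * valid task pointer supplied by the program's context (e.g. a tracepoint
 * argument):
 *
 *	struct task_struct *t = bpf_task_acquire(p);
 *
 *	if (!t)
 *		return 0;
 *	... use the acquired task pointer ...
 *	bpf_task_release(t);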
*/ __bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p) { if (refcount_inc_not_zero(&p->rcu_users)) return p; return NULL; } /** * bpf_task_release - Release the reference acquired on a task. * @p: The task on which a reference is being released. */ __bpf_kfunc void bpf_task_release(struct task_struct *p) { put_task_struct_rcu_user(p); } __bpf_kfunc void bpf_task_release_dtor(void *p) { put_task_struct_rcu_user(p); } CFI_NOSEAL(bpf_task_release_dtor); #ifdef CONFIG_CGROUPS /** * bpf_cgroup_acquire - Acquire a reference to a cgroup. A cgroup acquired by * this kfunc which is not stored in a map as a kptr, must be released by * calling bpf_cgroup_release(). * @cgrp: The cgroup on which a reference is being acquired. */ __bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp) { return cgroup_tryget(cgrp) ? cgrp : NULL; } /** * bpf_cgroup_release - Release the reference acquired on a cgroup. * If this kfunc is invoked in an RCU read region, the cgroup is guaranteed to * not be freed until the current grace period has ended, even if its refcount * drops to 0. * @cgrp: The cgroup on which a reference is being released. */ __bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp) { cgroup_put(cgrp); } __bpf_kfunc void bpf_cgroup_release_dtor(void *cgrp) { cgroup_put(cgrp); } CFI_NOSEAL(bpf_cgroup_release_dtor); /** * bpf_cgroup_ancestor - Perform a lookup on an entry in a cgroup's ancestor * array. A cgroup returned by this kfunc which is not subsequently stored in a * map, must be released by calling bpf_cgroup_release(). * @cgrp: The cgroup for which we're performing a lookup. * @level: The level of ancestor to look up. */ __bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) { struct cgroup *ancestor; if (level > cgrp->level || level < 0) return NULL; /* cgrp's refcnt could be 0 here, but ancestors can still be accessed */ ancestor = cgrp->ancestors[level]; if (!cgroup_tryget(ancestor)) return NULL; return ancestor; } /** * bpf_cgroup_from_id - Find a cgroup from its ID. A cgroup returned by this * kfunc which is not subsequently stored in a map, must be released by calling * bpf_cgroup_release(). * @cgid: cgroup id. */ __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid) { struct cgroup *cgrp; cgrp = __cgroup_get_from_id(cgid); if (IS_ERR(cgrp)) return NULL; return cgrp; } /** * bpf_task_under_cgroup - wrap task_under_cgroup_hierarchy() as a kfunc, test * task's membership of cgroup ancestry. * @task: the task to be tested * @ancestor: possible ancestor of @task's cgroup * * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor. * It follows all the same rules as cgroup_is_descendant, and only applies * to the default hierarchy. 
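 *
 * A usage sketch pairing this kfunc with bpf_cgroup_from_id(), where
 * CGROUP_ID is an assumed, program-supplied cgroup ID:
 *
 *	struct cgroup *cgrp = bpf_cgroup_from_id(CGROUP_ID);
 *
 *	if (cgrp) {
 *		if (bpf_task_under_cgroup(task, cgrp))
 *			... task is within the hierarchy ...
 *		bpf_cgroup_release(cgrp);
 *	}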
*/ __bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task, struct cgroup *ancestor) { long ret; rcu_read_lock(); ret = task_under_cgroup_hierarchy(task, ancestor); rcu_read_unlock(); return ret; } BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct cgroup *cgrp; if (unlikely(idx >= array->map.max_entries)) return -E2BIG; cgrp = READ_ONCE(array->ptrs[idx]); if (unlikely(!cgrp)) return -EAGAIN; return task_under_cgroup_hierarchy(current, cgrp); } const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { .func = bpf_current_task_under_cgroup, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, }; /** * bpf_task_get_cgroup1 - Acquires the associated cgroup of a task within a * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its * hierarchy ID. * @task: The target task * @hierarchy_id: The ID of a cgroup1 hierarchy * * On success, the cgroup is returen. On failure, NULL is returned. */ __bpf_kfunc struct cgroup * bpf_task_get_cgroup1(struct task_struct *task, int hierarchy_id) { struct cgroup *cgrp = task_get_cgroup1(task, hierarchy_id); if (IS_ERR(cgrp)) return NULL; return cgrp; } #endif /* CONFIG_CGROUPS */ /** * bpf_task_from_pid - Find a struct task_struct from its pid by looking it up * in the root pid namespace idr. If a task is returned, it must either be * stored in a map, or released with bpf_task_release(). * @pid: The pid of the task being looked up. */ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid) { struct task_struct *p; rcu_read_lock(); p = find_task_by_pid_ns(pid, &init_pid_ns); if (p) p = bpf_task_acquire(p); rcu_read_unlock(); return p; } /** * bpf_task_from_vpid - Find a struct task_struct from its vpid by looking it up * in the pid namespace of the current task. If a task is returned, it must * either be stored in a map, or released with bpf_task_release(). * @vpid: The vpid of the task being looked up. */ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid) { struct task_struct *p; rcu_read_lock(); p = find_task_by_vpid(vpid); if (p) p = bpf_task_acquire(p); rcu_read_unlock(); return p; } /** * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data. * @p: The dynptr whose data slice to retrieve * @offset: Offset into the dynptr * @buffer__nullable: User-provided buffer to copy contents into. May be NULL * @buffer__szk: Size (in bytes) of the buffer if present. This is the * length of the requested slice. This must be a constant. * * For non-skb and non-xdp type dynptrs, there is no difference between * bpf_dynptr_slice and bpf_dynptr_data. * * If buffer__nullable is NULL, the call will fail if buffer_opt was needed. * * If the intention is to write to the data slice, please use * bpf_dynptr_slice_rdwr. * * The user must check that the returned pointer is not null before using it. * * Please note that in the case of skb and xdp dynptrs, bpf_dynptr_slice * does not change the underlying packet data pointers, so a call to * bpf_dynptr_slice will not invalidate any ctx->data/data_end pointers in * the bpf program. 
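 *
 * A typical read-only access pattern, mirroring the bpf_dynptr_slice_rdwr()
 * example below (sketch; "buffer" is a program-local array sized for the
 * requested slice):
 *
 *	struct eth_hdr *eth = bpf_dynptr_slice(&dynptr, 0, buffer, sizeof(buffer));
 *
 *	if (!eth)
 *		return TC_ACT_SHOT;
 *	... read the header fields; treat the returned memory as read-only ...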
* * Return: NULL if the call failed (eg invalid dynptr), pointer to a read-only * data slice (can be either direct pointer to the data or a pointer to the user * provided buffer, with its contents containing the data, if unable to obtain * direct pointer) */ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, void *buffer__nullable, u64 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; enum bpf_dynptr_type type; u64 len = buffer__szk; int err; if (!ptr->data) return NULL; err = bpf_dynptr_check_off_len(ptr, offset, len); if (err) return NULL; type = bpf_dynptr_get_type(ptr); switch (type) { case BPF_DYNPTR_TYPE_LOCAL: case BPF_DYNPTR_TYPE_RINGBUF: return ptr->data + ptr->offset + offset; case BPF_DYNPTR_TYPE_SKB: if (buffer__nullable) return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__nullable); else return skb_pointer_if_linear(ptr->data, ptr->offset + offset, len); case BPF_DYNPTR_TYPE_XDP: { void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len); if (!IS_ERR_OR_NULL(xdp_ptr)) return xdp_ptr; if (!buffer__nullable) return NULL; bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__nullable, len, false); return buffer__nullable; } case BPF_DYNPTR_TYPE_SKB_META: return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset); case BPF_DYNPTR_TYPE_FILE: err = bpf_file_fetch_bytes(ptr->data, offset, buffer__nullable, buffer__szk); return err ? NULL : buffer__nullable; default: WARN_ONCE(true, "unknown dynptr type %d\n", type); return NULL; } } /** * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data. * @p: The dynptr whose data slice to retrieve * @offset: Offset into the dynptr * @buffer__nullable: User-provided buffer to copy contents into. May be NULL * @buffer__szk: Size (in bytes) of the buffer if present. This is the * length of the requested slice. This must be a constant. * * For non-skb and non-xdp type dynptrs, there is no difference between * bpf_dynptr_slice and bpf_dynptr_data. * * If buffer__nullable is NULL, the call will fail if buffer_opt was needed. * * The returned pointer is writable and may point to either directly the dynptr * data at the requested offset or to the buffer if unable to obtain a direct * data pointer to (example: the requested slice is to the paged area of an skb * packet). In the case where the returned pointer is to the buffer, the user * is responsible for persisting writes through calling bpf_dynptr_write(). This * usually looks something like this pattern: * * struct eth_hdr *eth = bpf_dynptr_slice_rdwr(&dynptr, 0, buffer, sizeof(buffer)); * if (!eth) * return TC_ACT_SHOT; * * // mutate eth header // * * if (eth == buffer) * bpf_dynptr_write(&ptr, 0, buffer, sizeof(buffer), 0); * * Please note that, as in the example above, the user must check that the * returned pointer is not null before using it. * * Please also note that in the case of skb and xdp dynptrs, bpf_dynptr_slice_rdwr * does not change the underlying packet data pointers, so a call to * bpf_dynptr_slice_rdwr will not invalidate any ctx->data/data_end pointers in * the bpf program. 
* * Return: NULL if the call failed (eg invalid dynptr), pointer to a * data slice (can be either direct pointer to the data or a pointer to the user * provided buffer, with its contents containing the data, if unable to obtain * direct pointer) */ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, void *buffer__nullable, u64 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; if (!ptr->data || __bpf_dynptr_is_rdonly(ptr)) return NULL; /* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice. * * For skb-type dynptrs, it is safe to write into the returned pointer * if the bpf program allows skb data writes. There are two possibilities * that may occur when calling bpf_dynptr_slice_rdwr: * * 1) The requested slice is in the head of the skb. In this case, the * returned pointer is directly to skb data, and if the skb is cloned, the * verifier will have uncloned it (see bpf_unclone_prologue()) already. * The pointer can be directly written into. * * 2) Some portion of the requested slice is in the paged buffer area. * In this case, the requested data will be copied out into the buffer * and the returned pointer will be a pointer to the buffer. The skb * will not be pulled. To persist the write, the user will need to call * bpf_dynptr_write(), which will pull the skb and commit the write. * * Similarly for xdp programs, if the requested slice is not across xdp * fragments, then a direct pointer will be returned, otherwise the data * will be copied out into the buffer and the user will need to call * bpf_dynptr_write() to commit changes. */ return bpf_dynptr_slice(p, offset, buffer__nullable, buffer__szk); } __bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; u64 size; if (!ptr->data || start > end) return -EINVAL; size = __bpf_dynptr_size(ptr); if (start > size || end > size) return -ERANGE; bpf_dynptr_advance_offset(ptr, start); bpf_dynptr_set_size(ptr, end - start); return 0; } __bpf_kfunc bool bpf_dynptr_is_null(const struct bpf_dynptr *p) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; return !ptr->data; } __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; if (!ptr->data) return false; return __bpf_dynptr_is_rdonly(ptr); } __bpf_kfunc u64 bpf_dynptr_size(const struct bpf_dynptr *p) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; if (!ptr->data) return -EINVAL; return __bpf_dynptr_size(ptr); } __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p, struct bpf_dynptr *clone__uninit) { struct bpf_dynptr_kern *clone = (struct bpf_dynptr_kern *)clone__uninit; struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; if (!ptr->data) { bpf_dynptr_set_null(clone); return -EINVAL; } *clone = *ptr; return 0; } /** * bpf_dynptr_copy() - Copy data from one dynptr to another. * @dst_ptr: Destination dynptr - where data should be copied to * @dst_off: Offset into the destination dynptr * @src_ptr: Source dynptr - where data should be copied from * @src_off: Offset into the source dynptr * @size: Length of the data to copy from source to destination * * Copies data from source dynptr to destination dynptr. * Returns 0 on success; negative error, otherwise. 
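 *
 * For example (sketch; both dynptrs are assumed to have been initialized
 * elsewhere, e.g. via bpf_ringbuf_reserve_dynptr() or bpf_dynptr_from_mem()):
 *
 *	int err = bpf_dynptr_copy(&dst, 0, &src, 0, 64);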
*/ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off, struct bpf_dynptr *src_ptr, u64 src_off, u64 size) { struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr; struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr; void *src_slice, *dst_slice; char buf[256]; u64 off; src_slice = bpf_dynptr_slice(src_ptr, src_off, NULL, size); dst_slice = bpf_dynptr_slice_rdwr(dst_ptr, dst_off, NULL, size); if (src_slice && dst_slice) { memmove(dst_slice, src_slice, size); return 0; } if (src_slice) return __bpf_dynptr_write(dst, dst_off, src_slice, size, 0); if (dst_slice) return __bpf_dynptr_read(dst_slice, size, src, src_off, 0); if (bpf_dynptr_check_off_len(dst, dst_off, size) || bpf_dynptr_check_off_len(src, src_off, size)) return -E2BIG; off = 0; while (off < size) { u64 chunk_sz = min_t(u64, sizeof(buf), size - off); int err; err = __bpf_dynptr_read(buf, chunk_sz, src, src_off + off, 0); if (err) return err; err = __bpf_dynptr_write(dst, dst_off + off, buf, chunk_sz, 0); if (err) return err; off += chunk_sz; } return 0; } /** * bpf_dynptr_memset() - Fill dynptr memory with a constant byte. * @p: Destination dynptr - where data will be filled * @offset: Offset into the dynptr to start filling from * @size: Number of bytes to fill * @val: Constant byte to fill the memory with * * Fills the @size bytes of the memory area pointed to by @p * at @offset with the constant byte @val. * Returns 0 on success; negative error, otherwise. */ __bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u64 offset, u64 size, u8 val) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; u64 chunk_sz, write_off; char buf[256]; void* slice; int err; slice = bpf_dynptr_slice_rdwr(p, offset, NULL, size); if (likely(slice)) { memset(slice, val, size); return 0; } if (__bpf_dynptr_is_rdonly(ptr)) return -EINVAL; err = bpf_dynptr_check_off_len(ptr, offset, size); if (err) return err; /* Non-linear data under the dynptr, write from a local buffer */ chunk_sz = min_t(u64, sizeof(buf), size); memset(buf, val, chunk_sz); for (write_off = 0; write_off < size; write_off += chunk_sz) { chunk_sz = min_t(u64, sizeof(buf), size - write_off); err = __bpf_dynptr_write(ptr, offset + write_off, buf, chunk_sz, 0); if (err) return err; } return 0; } __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj) { return obj; } __bpf_kfunc void *bpf_rdonly_cast(const void *obj__ign, u32 btf_id__k) { return (void *)obj__ign; } __bpf_kfunc void bpf_rcu_read_lock(void) { rcu_read_lock(); } __bpf_kfunc void bpf_rcu_read_unlock(void) { rcu_read_unlock(); } struct bpf_throw_ctx { struct bpf_prog_aux *aux; u64 sp; u64 bp; int cnt; }; static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp) { struct bpf_throw_ctx *ctx = cookie; struct bpf_prog *prog; /* * The RCU read lock is held to safely traverse the latch tree, but we * don't need its protection when accessing the prog, since it has an * active stack frame on the current stack trace, and won't disappear. 
 */
*/ rcu_read_lock(); prog = bpf_prog_ksym_find(ip); rcu_read_unlock(); if (!prog) return !ctx->cnt; ctx->cnt++; if (bpf_is_subprog(prog)) return true; ctx->aux = prog->aux; ctx->sp = sp; ctx->bp = bp; return false; } __bpf_kfunc void bpf_throw(u64 cookie) { struct bpf_throw_ctx ctx = {}; arch_bpf_stack_walk(bpf_stack_walker, &ctx); WARN_ON_ONCE(!ctx.aux); if (ctx.aux) WARN_ON_ONCE(!ctx.aux->exception_boundary); WARN_ON_ONCE(!ctx.bp); WARN_ON_ONCE(!ctx.cnt); /* Prevent KASAN false positives for CONFIG_KASAN_STACK by unpoisoning * deeper stack depths than ctx.sp as we do not return from bpf_throw, * which skips compiler generated instrumentation to do the same. */ kasan_unpoison_task_stack_below((void *)(long)ctx.sp); ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp, 0, 0); WARN(1, "A call to BPF exception callback should never return\n"); } __bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) { struct bpf_async_kern *async = (struct bpf_async_kern *)wq; struct bpf_map *map = p__map; BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_wq)); BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_wq)); if (flags) return -EINVAL; return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ); } __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) { struct bpf_async_kern *async = (struct bpf_async_kern *)wq; struct bpf_work *w; if (flags) return -EINVAL; w = READ_ONCE(async->work); if (!w || !READ_ONCE(w->cb.prog)) return -EINVAL; if (!refcount_inc_not_zero(&w->cb.refcnt)) return -ENOENT; if (!defer_timer_wq_op()) { schedule_work(&w->work); bpf_async_refcount_put(&w->cb); return 0; } else { return bpf_async_schedule_op(&w->cb, BPF_ASYNC_START, 0, 0); } } __bpf_kfunc int bpf_wq_set_callback(struct bpf_wq *wq, int (callback_fn)(void *map, int *key, void *value), unsigned int flags, struct bpf_prog_aux *aux) { struct bpf_async_kern *async = (struct bpf_async_kern *)wq; if (flags) return -EINVAL; return __bpf_async_set_callback(async, callback_fn, aux->prog); } __bpf_kfunc void bpf_preempt_disable(void) { preempt_disable(); } __bpf_kfunc void bpf_preempt_enable(void) { preempt_enable(); } struct bpf_iter_bits { __u64 __opaque[2]; } __aligned(8); #define BITS_ITER_NR_WORDS_MAX 511 struct bpf_iter_bits_kern { union { __u64 *bits; __u64 bits_copy; }; int nr_bits; int bit; } __aligned(8); /* On 64-bit hosts, unsigned long and u64 have the same size, so passing * a u64 pointer and an unsigned long pointer to find_next_bit() will * return the same result, as both point to the same 8-byte area. * * For 32-bit little-endian hosts, using a u64 pointer or unsigned long * pointer also makes no difference. This is because the first iterated * unsigned long is composed of bits 0-31 of the u64 and the second unsigned * long is composed of bits 32-63 of the u64. * * However, for 32-bit big-endian hosts, this is not the case. The first * iterated unsigned long will be bits 32-63 of the u64, so swap these two * ulong values within the u64. */ static void swap_ulong_in_u64(u64 *bits, unsigned int nr) { #if (BITS_PER_LONG == 32) && defined(__BIG_ENDIAN) unsigned int i; for (i = 0; i < nr; i++) bits[i] = (bits[i] >> 32) | ((u64)(u32)bits[i] << 32); #endif } /** * bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area * @it: The new bpf_iter_bits to be created * @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over * @nr_words: The size of the specified memory area, measured in 8-byte units. 
* The maximum value of @nr_words is @BITS_ITER_NR_WORDS_MAX. This limit may be * further reduced by the BPF memory allocator implementation. * * This function initializes a new bpf_iter_bits structure for iterating over * a memory area which is specified by the @unsafe_ptr__ign and @nr_words. It * copies the data of the memory area to the newly created bpf_iter_bits @it for * subsequent iteration operations. * * On success, 0 is returned. On failure, ERR is returned. */ __bpf_kfunc int bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_words) { struct bpf_iter_bits_kern *kit = (void *)it; u32 nr_bytes = nr_words * sizeof(u64); u32 nr_bits = BYTES_TO_BITS(nr_bytes); int err; BUILD_BUG_ON(sizeof(struct bpf_iter_bits_kern) != sizeof(struct bpf_iter_bits)); BUILD_BUG_ON(__alignof__(struct bpf_iter_bits_kern) != __alignof__(struct bpf_iter_bits)); kit->nr_bits = 0; kit->bits_copy = 0; kit->bit = -1; if (!unsafe_ptr__ign || !nr_words) return -EINVAL; if (nr_words > BITS_ITER_NR_WORDS_MAX) return -E2BIG; /* Optimization for u64 mask */ if (nr_bits == 64) { err = bpf_probe_read_kernel_common(&kit->bits_copy, nr_bytes, unsafe_ptr__ign); if (err) return -EFAULT; swap_ulong_in_u64(&kit->bits_copy, nr_words); kit->nr_bits = nr_bits; return 0; } if (bpf_mem_alloc_check_size(false, nr_bytes)) return -E2BIG; /* Fallback to memalloc */ kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes); if (!kit->bits) return -ENOMEM; err = bpf_probe_read_kernel_common(kit->bits, nr_bytes, unsafe_ptr__ign); if (err) { bpf_mem_free(&bpf_global_ma, kit->bits); return err; } swap_ulong_in_u64(kit->bits, nr_words); kit->nr_bits = nr_bits; return 0; } /** * bpf_iter_bits_next() - Get the next bit in a bpf_iter_bits * @it: The bpf_iter_bits to be checked * * This function returns a pointer to a number representing the value of the * next bit in the bits. * * If there are no further bits available, it returns NULL. */ __bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it) { struct bpf_iter_bits_kern *kit = (void *)it; int bit = kit->bit, nr_bits = kit->nr_bits; const void *bits; if (!nr_bits || bit >= nr_bits) return NULL; bits = nr_bits == 64 ? &kit->bits_copy : kit->bits; bit = find_next_bit(bits, nr_bits, bit + 1); if (bit >= nr_bits) { kit->bit = bit; return NULL; } kit->bit = bit; return &kit->bit; } /** * bpf_iter_bits_destroy() - Destroy a bpf_iter_bits * @it: The bpf_iter_bits to be destroyed * * Destroy the resource associated with the bpf_iter_bits. */ __bpf_kfunc void bpf_iter_bits_destroy(struct bpf_iter_bits *it) { struct bpf_iter_bits_kern *kit = (void *)it; if (kit->nr_bits <= 64) return; bpf_mem_free(&bpf_global_ma, kit->bits); } /** * bpf_copy_from_user_str() - Copy a string from an unsafe user address * @dst: Destination address, in kernel space. This buffer must be * at least @dst__sz bytes long. * @dst__sz: Maximum number of bytes to copy, includes the trailing NUL. * @unsafe_ptr__ign: Source address, in user space. * @flags: The only supported flag is BPF_F_PAD_ZEROS * * Copies a NUL-terminated string from userspace to BPF space. If user string is * too long this will still ensure zero termination in the dst buffer unless * buffer size is 0. * * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst to 0 on success and * memset all of @dst on failure. 
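 *
 * Return: The number of copied bytes on success, including the trailing NUL,
 * 0 if @dst__sz is 0, or a negative error code on failure.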
*/ __bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user *unsafe_ptr__ign, u64 flags) { int ret; if (unlikely(flags & ~BPF_F_PAD_ZEROS)) return -EINVAL; if (unlikely(!dst__sz)) return 0; ret = strncpy_from_user(dst, unsafe_ptr__ign, dst__sz - 1); if (ret < 0) { if (flags & BPF_F_PAD_ZEROS) memset((char *)dst, 0, dst__sz); return ret; } if (flags & BPF_F_PAD_ZEROS) memset((char *)dst + ret, 0, dst__sz - ret); else ((char *)dst)[ret] = '\0'; return ret + 1; } /** * bpf_copy_from_user_task_str() - Copy a string from an task's address space * @dst: Destination address, in kernel space. This buffer must be * at least @dst__sz bytes long. * @dst__sz: Maximum number of bytes to copy, includes the trailing NUL. * @unsafe_ptr__ign: Source address in the task's address space. * @tsk: The task whose address space will be used * @flags: The only supported flag is BPF_F_PAD_ZEROS * * Copies a NUL terminated string from a task's address space to @dst__sz * buffer. If user string is too long this will still ensure zero termination * in the @dst__sz buffer unless buffer size is 0. * * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst__sz to 0 on success * and memset all of @dst__sz on failure. * * Return: The number of copied bytes on success including the NUL terminator. * A negative error code on failure. */ __bpf_kfunc int bpf_copy_from_user_task_str(void *dst, u32 dst__sz, const void __user *unsafe_ptr__ign, struct task_struct *tsk, u64 flags) { int ret; if (unlikely(flags & ~BPF_F_PAD_ZEROS)) return -EINVAL; if (unlikely(dst__sz == 0)) return 0; ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_ptr__ign, dst, dst__sz, 0); if (ret < 0) { if (flags & BPF_F_PAD_ZEROS) memset(dst, 0, dst__sz); return ret; } if (flags & BPF_F_PAD_ZEROS) memset(dst + ret, 0, dst__sz - ret); return ret + 1; } /* Keep unsinged long in prototype so that kfunc is usable when emitted to * vmlinux.h in BPF programs directly, but note that while in BPF prog, the * unsigned long always points to 8-byte region on stack, the kernel may only * read and write the 4-bytes on 32-bit. */ __bpf_kfunc void bpf_local_irq_save(unsigned long *flags__irq_flag) { local_irq_save(*flags__irq_flag); } __bpf_kfunc void bpf_local_irq_restore(unsigned long *flags__irq_flag) { local_irq_restore(*flags__irq_flag); } __bpf_kfunc void __bpf_trap(void) { } /* * Kfuncs for string operations. * * Since strings are not necessarily %NUL-terminated, we cannot directly call * in-kernel implementations. Instead, we open-code the implementations using * __get_kernel_nofault instead of plain dereference to make them safe. */ static int __bpf_strncasecmp(const char *s1, const char *s2, bool ignore_case, size_t len) { char c1, c2; int i; if (!copy_from_kernel_nofault_allowed(s1, 1) || !copy_from_kernel_nofault_allowed(s2, 1)) { return -ERANGE; } guard(pagefault)(); for (i = 0; i < len && i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&c1, s1, char, err_out); __get_kernel_nofault(&c2, s2, char, err_out); if (ignore_case) { c1 = tolower(c1); c2 = tolower(c2); } if (c1 != c2) return c1 < c2 ? -1 : 1; if (c1 == '\0') return 0; s1++; s2++; } return i == XATTR_SIZE_MAX ? 
-E2BIG : 0; err_out: return -EFAULT; } /** * bpf_strcmp - Compare two strings * @s1__ign: One string * @s2__ign: Another string * * Return: * * %0 - Strings are equal * * %-1 - @s1__ign is smaller * * %1 - @s2__ign is smaller * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of strings is too large * * %-ERANGE - One of strings is outside of kernel address space */ __bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign) { return __bpf_strncasecmp(s1__ign, s2__ign, false, XATTR_SIZE_MAX); } /** * bpf_strcasecmp - Compare two strings, ignoring the case of the characters * @s1__ign: One string * @s2__ign: Another string * * Return: * * %0 - Strings are equal * * %-1 - @s1__ign is smaller * * %1 - @s2__ign is smaller * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of strings is too large * * %-ERANGE - One of strings is outside of kernel address space */ __bpf_kfunc int bpf_strcasecmp(const char *s1__ign, const char *s2__ign) { return __bpf_strncasecmp(s1__ign, s2__ign, true, XATTR_SIZE_MAX); } /* * bpf_strncasecmp - Compare two length-limited strings, ignoring case * @s1__ign: One string * @s2__ign: Another string * @len: The maximum number of characters to compare * * Return: * * %0 - Strings are equal * * %-1 - @s1__ign is smaller * * %1 - @s2__ign is smaller * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of strings is too large * * %-ERANGE - One of strings is outside of kernel address space */ __bpf_kfunc int bpf_strncasecmp(const char *s1__ign, const char *s2__ign, size_t len) { return __bpf_strncasecmp(s1__ign, s2__ign, true, len); } /** * bpf_strnchr - Find a character in a length limited string * @s__ign: The string to be searched * @count: The number of characters to be searched * @c: The character to search for * * Note that the %NUL-terminator is considered part of the string, and can * be searched for. * * Return: * * >=0 - Index of the first occurrence of @c within @s__ign * * %-ENOENT - @c not found in the first @count characters of @s__ign * * %-EFAULT - Cannot read @s__ign * * %-E2BIG - @s__ign is too large * * %-ERANGE - @s__ign is outside of kernel address space */ __bpf_kfunc int bpf_strnchr(const char *s__ign, size_t count, char c) { char sc; int i; if (!copy_from_kernel_nofault_allowed(s__ign, 1)) return -ERANGE; guard(pagefault)(); for (i = 0; i < count && i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&sc, s__ign, char, err_out); if (sc == c) return i; if (sc == '\0') return -ENOENT; s__ign++; } return i == XATTR_SIZE_MAX ? -E2BIG : -ENOENT; err_out: return -EFAULT; } /** * bpf_strchr - Find the first occurrence of a character in a string * @s__ign: The string to be searched * @c: The character to search for * * Note that the %NUL-terminator is considered part of the string, and can * be searched for. 
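 *
 * For example, bpf_strchr("a/b/c", '/') returns 1, and searching for '\0'
 * returns the index of the terminator (sketch; the string must be readable
 * kernel memory).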
* * Return: * * >=0 - The index of the first occurrence of @c within @s__ign * * %-ENOENT - @c not found in @s__ign * * %-EFAULT - Cannot read @s__ign * * %-E2BIG - @s__ign is too large * * %-ERANGE - @s__ign is outside of kernel address space */ __bpf_kfunc int bpf_strchr(const char *s__ign, char c) { return bpf_strnchr(s__ign, XATTR_SIZE_MAX, c); } /** * bpf_strchrnul - Find and return a character in a string, or end of string * @s__ign: The string to be searched * @c: The character to search for * * Return: * * >=0 - Index of the first occurrence of @c within @s__ign or index of * the null byte at the end of @s__ign when @c is not found * * %-EFAULT - Cannot read @s__ign * * %-E2BIG - @s__ign is too large * * %-ERANGE - @s__ign is outside of kernel address space */ __bpf_kfunc int bpf_strchrnul(const char *s__ign, char c) { char sc; int i; if (!copy_from_kernel_nofault_allowed(s__ign, 1)) return -ERANGE; guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&sc, s__ign, char, err_out); if (sc == '\0' || sc == c) return i; s__ign++; } return -E2BIG; err_out: return -EFAULT; } /** * bpf_strrchr - Find the last occurrence of a character in a string * @s__ign: The string to be searched * @c: The character to search for * * Return: * * >=0 - Index of the last occurrence of @c within @s__ign * * %-ENOENT - @c not found in @s__ign * * %-EFAULT - Cannot read @s__ign * * %-E2BIG - @s__ign is too large * * %-ERANGE - @s__ign is outside of kernel address space */ __bpf_kfunc int bpf_strrchr(const char *s__ign, int c) { char sc; int i, last = -ENOENT; if (!copy_from_kernel_nofault_allowed(s__ign, 1)) return -ERANGE; guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&sc, s__ign, char, err_out); if (sc == c) last = i; if (sc == '\0') return last; s__ign++; } return -E2BIG; err_out: return -EFAULT; } /** * bpf_strnlen - Calculate the length of a length-limited string * @s__ign: The string * @count: The maximum number of characters to count * * Return: * * >=0 - The length of @s__ign * * %-EFAULT - Cannot read @s__ign * * %-E2BIG - @s__ign is too large * * %-ERANGE - @s__ign is outside of kernel address space */ __bpf_kfunc int bpf_strnlen(const char *s__ign, size_t count) { char c; int i; if (!copy_from_kernel_nofault_allowed(s__ign, 1)) return -ERANGE; guard(pagefault)(); for (i = 0; i < count && i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&c, s__ign, char, err_out); if (c == '\0') return i; s__ign++; } return i == XATTR_SIZE_MAX ? 
-E2BIG : i; err_out: return -EFAULT; } /** * bpf_strlen - Calculate the length of a string * @s__ign: The string * * Return: * * >=0 - The length of @s__ign * * %-EFAULT - Cannot read @s__ign * * %-E2BIG - @s__ign is too large * * %-ERANGE - @s__ign is outside of kernel address space */ __bpf_kfunc int bpf_strlen(const char *s__ign) { return bpf_strnlen(s__ign, XATTR_SIZE_MAX); } /** * bpf_strspn - Calculate the length of the initial substring of @s__ign which * only contains letters in @accept__ign * @s__ign: The string to be searched * @accept__ign: The string to search for * * Return: * * >=0 - The length of the initial substring of @s__ign which only * contains letters from @accept__ign * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of the strings is too large * * %-ERANGE - One of the strings is outside of kernel address space */ __bpf_kfunc int bpf_strspn(const char *s__ign, const char *accept__ign) { char cs, ca; int i, j; if (!copy_from_kernel_nofault_allowed(s__ign, 1) || !copy_from_kernel_nofault_allowed(accept__ign, 1)) { return -ERANGE; } guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&cs, s__ign, char, err_out); if (cs == '\0') return i; for (j = 0; j < XATTR_SIZE_MAX; j++) { __get_kernel_nofault(&ca, accept__ign + j, char, err_out); if (cs == ca || ca == '\0') break; } if (j == XATTR_SIZE_MAX) return -E2BIG; if (ca == '\0') return i; s__ign++; } return -E2BIG; err_out: return -EFAULT; } /** * bpf_strcspn - Calculate the length of the initial substring of @s__ign which * does not contain letters in @reject__ign * @s__ign: The string to be searched * @reject__ign: The string to search for * * Return: * * >=0 - The length of the initial substring of @s__ign which does not * contain letters from @reject__ign * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of the strings is too large * * %-ERANGE - One of the strings is outside of kernel address space */ __bpf_kfunc int bpf_strcspn(const char *s__ign, const char *reject__ign) { char cs, cr; int i, j; if (!copy_from_kernel_nofault_allowed(s__ign, 1) || !copy_from_kernel_nofault_allowed(reject__ign, 1)) { return -ERANGE; } guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&cs, s__ign, char, err_out); if (cs == '\0') return i; for (j = 0; j < XATTR_SIZE_MAX; j++) { __get_kernel_nofault(&cr, reject__ign + j, char, err_out); if (cs == cr || cr == '\0') break; } if (j == XATTR_SIZE_MAX) return -E2BIG; if (cr != '\0') return i; s__ign++; } return -E2BIG; err_out: return -EFAULT; } static int __bpf_strnstr(const char *s1, const char *s2, size_t len, bool ignore_case) { char c1, c2; int i, j; if (!copy_from_kernel_nofault_allowed(s1, 1) || !copy_from_kernel_nofault_allowed(s2, 1)) { return -ERANGE; } guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) { __get_kernel_nofault(&c2, s2 + j, char, err_out); if (c2 == '\0') return i; /* * We allow reading an extra byte from s2 (note the * `i + j <= len` above) to cover the case when s2 is * a suffix of the first len chars of s1. 
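 * For example, with s1 = "abcd", s2 = "cd" and len = 4, the match at
 * i = 2 is only detected because s2[2] (the terminating NUL) is read
 * while i + j == len; without that extra byte the suffix match would
 * be reported as -ENOENT.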
*/ if (i + j == len) break; __get_kernel_nofault(&c1, s1 + j, char, err_out); if (ignore_case) { c1 = tolower(c1); c2 = tolower(c2); } if (c1 == '\0') return -ENOENT; if (c1 != c2) break; } if (j == XATTR_SIZE_MAX) return -E2BIG; if (i + j == len) return -ENOENT; s1++; } return -E2BIG; err_out: return -EFAULT; } /** * bpf_strstr - Find the first substring in a string * @s1__ign: The string to be searched * @s2__ign: The string to search for * * Return: * * >=0 - Index of the first character of the first occurrence of @s2__ign * within @s1__ign * * %-ENOENT - @s2__ign is not a substring of @s1__ign * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of the strings is too large * * %-ERANGE - One of the strings is outside of kernel address space */ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign) { return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, false); } /** * bpf_strcasestr - Find the first substring in a string, ignoring the case of * the characters * @s1__ign: The string to be searched * @s2__ign: The string to search for * * Return: * * >=0 - Index of the first character of the first occurrence of @s2__ign * within @s1__ign * * %-ENOENT - @s2__ign is not a substring of @s1__ign * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of the strings is too large * * %-ERANGE - One of the strings is outside of kernel address space */ __bpf_kfunc int bpf_strcasestr(const char *s1__ign, const char *s2__ign) { return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, true); } /** * bpf_strnstr - Find the first substring in a length-limited string * @s1__ign: The string to be searched * @s2__ign: The string to search for * @len: the maximum number of characters to search * * Return: * * >=0 - Index of the first character of the first occurrence of @s2__ign * within the first @len characters of @s1__ign * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of the strings is too large * * %-ERANGE - One of the strings is outside of kernel address space */ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len) { return __bpf_strnstr(s1__ign, s2__ign, len, false); } /** * bpf_strncasestr - Find the first substring in a length-limited string, * ignoring the case of the characters * @s1__ign: The string to be searched * @s2__ign: The string to search for * @len: the maximum number of characters to search * * Return: * * >=0 - Index of the first character of the first occurrence of @s2__ign * within the first @len characters of @s1__ign * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of the strings is too large * * %-ERANGE - One of the strings is outside of kernel address space */ __bpf_kfunc int bpf_strncasestr(const char *s1__ign, const char *s2__ign, size_t len) { return __bpf_strnstr(s1__ign, s2__ign, len, true); } #ifdef CONFIG_KEYS /** * bpf_lookup_user_key - lookup a key by its serial * @serial: key handle serial number * @flags: lookup-specific flags * * Search a key with a given *serial* and the provided *flags*. * If found, increment the reference count of the key by one, and * return it in the bpf_key structure. * * The bpf_key structure must be passed to bpf_key_put() when done * with it, so that the key reference count is decremented and the * bpf_key structure is freed. 
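 *
 * A minimal lookup/put pairing sketch, where KEY_SERIAL is an assumed,
 * program-supplied serial number:
 *
 *	struct bpf_key *bkey = bpf_lookup_user_key(KEY_SERIAL, 0);
 *
 *	if (bkey) {
 *		... e.g. pass bkey to bpf_verify_pkcs7_signature() ...
 *		bpf_key_put(bkey);
 *	}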
* * Permission checks are deferred to the time the key is used by * one of the available key-specific kfuncs. * * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested * special keyring (e.g. session keyring), if it doesn't yet exist. * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting * for the key construction, and to retrieve uninstantiated keys (keys * without data attached to them). * * Return: a bpf_key pointer with a valid key pointer if the key is found, a * NULL pointer otherwise. */ __bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags) { key_ref_t key_ref; struct bpf_key *bkey; if (flags & ~KEY_LOOKUP_ALL) return NULL; /* * Permission check is deferred until the key is used, as the * intent of the caller is unknown here. */ key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK); if (IS_ERR(key_ref)) return NULL; bkey = kmalloc_obj(*bkey); if (!bkey) { key_put(key_ref_to_ptr(key_ref)); return NULL; } bkey->key = key_ref_to_ptr(key_ref); bkey->has_ref = true; return bkey; } /** * bpf_lookup_system_key - lookup a key by a system-defined ID * @id: key ID * * Obtain a bpf_key structure with a key pointer set to the passed key ID. * The key pointer is marked as invalid, to prevent bpf_key_put() from * attempting to decrement the key reference count on that pointer. The key * pointer set in such way is currently understood only by * verify_pkcs7_signature(). * * Set *id* to one of the values defined in include/linux/verification.h: * 0 for the primary keyring (immutable keyring of system keys); * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring * (where keys can be added only if they are vouched for by existing keys * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform * keyring (primarily used by the integrity subsystem to verify a kexec'ed * kerned image and, possibly, the initramfs signature). * * Return: a bpf_key pointer with an invalid key pointer set from the * pre-determined ID on success, a NULL pointer otherwise */ __bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id) { struct bpf_key *bkey; if (system_keyring_id_check(id) < 0) return NULL; bkey = kmalloc_obj(*bkey, GFP_ATOMIC); if (!bkey) return NULL; bkey->key = (struct key *)(unsigned long)id; bkey->has_ref = false; return bkey; } /** * bpf_key_put - decrement key reference count if key is valid and free bpf_key * @bkey: bpf_key structure * * Decrement the reference count of the key inside *bkey*, if the pointer * is valid, and free *bkey*. */ __bpf_kfunc void bpf_key_put(struct bpf_key *bkey) { if (bkey->has_ref) key_put(bkey->key); kfree(bkey); } /** * bpf_verify_pkcs7_signature - verify a PKCS#7 signature * @data_p: data to verify * @sig_p: signature of the data * @trusted_keyring: keyring with keys trusted for signature verification * * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr* * with keys in a keyring referenced by *trusted_keyring*. * * Return: 0 on success, a negative value on error. */ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, struct bpf_dynptr *sig_p, struct bpf_key *trusted_keyring) { #ifdef CONFIG_SYSTEM_DATA_VERIFICATION struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p; struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p; const void *data, *sig; u32 data_len, sig_len; int ret; if (trusted_keyring->has_ref) { /* * Do the permission check deferred in bpf_lookup_user_key(). * See bpf_lookup_user_key() for more details. 
* * A call to key_task_permission() here would be redundant, as * it is already done by keyring_search() called by * find_asymmetric_key(). */ ret = key_validate(trusted_keyring->key); if (ret < 0) return ret; } data_len = __bpf_dynptr_size(data_ptr); data = __bpf_dynptr_data(data_ptr, data_len); sig_len = __bpf_dynptr_size(sig_ptr); sig = __bpf_dynptr_data(sig_ptr, sig_len); return verify_pkcs7_signature(data, data_len, sig, sig_len, trusted_keyring->key, VERIFYING_BPF_SIGNATURE, NULL, NULL); #else return -EOPNOTSUPP; #endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ } #endif /* CONFIG_KEYS */ typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value); enum bpf_task_work_state { /* bpf_task_work is ready to be used */ BPF_TW_STANDBY = 0, /* irq work scheduling in progress */ BPF_TW_PENDING, /* task work scheduling in progress */ BPF_TW_SCHEDULING, /* task work is scheduled successfully */ BPF_TW_SCHEDULED, /* callback is running */ BPF_TW_RUNNING, /* associated BPF map value is deleted */ BPF_TW_FREED, }; struct bpf_task_work_ctx { enum bpf_task_work_state state; refcount_t refcnt; struct callback_head work; struct irq_work irq_work; /* bpf_prog that schedules task work */ struct bpf_prog *prog; /* task for which callback is scheduled */ struct task_struct *task; /* the map and map value associated with this context */ struct bpf_map *map; void *map_val; enum task_work_notify_mode mode; bpf_task_work_callback_t callback_fn; struct rcu_head rcu; } __aligned(8); /* Actual type for struct bpf_task_work */ struct bpf_task_work_kern { struct bpf_task_work_ctx *ctx; }; static void bpf_task_work_ctx_reset(struct bpf_task_work_ctx *ctx) { if (ctx->prog) { bpf_prog_put(ctx->prog); ctx->prog = NULL; } if (ctx->task) { bpf_task_release(ctx->task); ctx->task = NULL; } } static bool bpf_task_work_ctx_tryget(struct bpf_task_work_ctx *ctx) { return refcount_inc_not_zero(&ctx->refcnt); } static void bpf_task_work_destroy(struct irq_work *irq_work) { struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work); bpf_task_work_ctx_reset(ctx); kfree_rcu(ctx, rcu); } static void bpf_task_work_ctx_put(struct bpf_task_work_ctx *ctx) { if (!refcount_dec_and_test(&ctx->refcnt)) return; if (irqs_disabled()) { ctx->irq_work = IRQ_WORK_INIT(bpf_task_work_destroy); irq_work_queue(&ctx->irq_work); } else { bpf_task_work_destroy(&ctx->irq_work); } } static void bpf_task_work_cancel(struct bpf_task_work_ctx *ctx) { /* * Scheduled task_work callback holds ctx ref, so if we successfully * cancelled, we put that ref on callback's behalf. If we couldn't * cancel, callback will inevitably run or has already completed * running, and it would have taken care of its ctx ref itself. */ if (task_work_cancel(ctx->task, &ctx->work)) bpf_task_work_ctx_put(ctx); } static void bpf_task_work_callback(struct callback_head *cb) { struct bpf_task_work_ctx *ctx = container_of(cb, struct bpf_task_work_ctx, work); enum bpf_task_work_state state; u32 idx; void *key; /* Read lock is needed to protect ctx and map key/value access */ guard(rcu_tasks_trace)(); /* * This callback may start running before bpf_task_work_irq() switched to * SCHEDULED state, so handle both transition variants SCHEDULING|SCHEDULED -> RUNNING. 
 */
*/ state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_RUNNING); if (state == BPF_TW_SCHEDULED) state = cmpxchg(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_RUNNING); if (state == BPF_TW_FREED) { bpf_task_work_ctx_put(ctx); return; } key = (void *)map_key_from_value(ctx->map, ctx->map_val, &idx); migrate_disable(); ctx->callback_fn(ctx->map, key, ctx->map_val); migrate_enable(); bpf_task_work_ctx_reset(ctx); (void)cmpxchg(&ctx->state, BPF_TW_RUNNING, BPF_TW_STANDBY); bpf_task_work_ctx_put(ctx); } static void bpf_task_work_irq(struct irq_work *irq_work) { struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work); enum bpf_task_work_state state; int err; guard(rcu)(); if (cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING) != BPF_TW_PENDING) { bpf_task_work_ctx_put(ctx); return; } err = task_work_add(ctx->task, &ctx->work, ctx->mode); if (err) { bpf_task_work_ctx_reset(ctx); /* * try to switch back to STANDBY for another task_work reuse, but we might have * gone to FREED already, which is fine as we already cleaned up after ourselves */ (void)cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_STANDBY); bpf_task_work_ctx_put(ctx); return; } /* * It's technically possible for just scheduled task_work callback to * complete running by now, going SCHEDULING -> RUNNING and then * dropping its ctx refcount. Instead of capturing an extra ref just * to protect below ctx->state access, we rely on rcu_read_lock * above to prevent kfree_rcu from freeing ctx before we return. */ state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED); if (state == BPF_TW_FREED) bpf_task_work_cancel(ctx); /* clean up if we switched into FREED state */ } static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *tw, struct bpf_map *map) { struct bpf_task_work_kern *twk = (void *)tw; struct bpf_task_work_ctx *ctx, *old_ctx; ctx = READ_ONCE(twk->ctx); if (ctx) return ctx; ctx = bpf_map_kmalloc_nolock(map, sizeof(*ctx), 0, NUMA_NO_NODE); if (!ctx) return ERR_PTR(-ENOMEM); memset(ctx, 0, sizeof(*ctx)); refcount_set(&ctx->refcnt, 1); /* map's own ref */ ctx->state = BPF_TW_STANDBY; old_ctx = cmpxchg(&twk->ctx, NULL, ctx); if (old_ctx) { /* * tw->ctx is set by concurrent BPF program, release allocated * memory and try to reuse already set context. */ kfree_nolock(ctx); return old_ctx; } return ctx; /* Success */ } static struct bpf_task_work_ctx *bpf_task_work_acquire_ctx(struct bpf_task_work *tw, struct bpf_map *map) { struct bpf_task_work_ctx *ctx; /* * Sleepable BPF programs hold rcu_read_lock_trace but not * regular rcu_read_lock. Since kfree_rcu waits for regular * RCU GP, the ctx can be freed while we're between reading * the pointer and incrementing the refcount. Take regular * rcu_read_lock to prevent kfree_rcu from freeing the ctx * before we can tryget it. */ scoped_guard(rcu) { ctx = bpf_task_work_fetch_ctx(tw, map); if (IS_ERR(ctx)) return ctx; /* try to get ref for task_work callback to hold */ if (!bpf_task_work_ctx_tryget(ctx)) return ERR_PTR(-EBUSY); } if (cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) { /* lost acquiring race or map_release_uref() stole it from us, put ref and bail */ bpf_task_work_ctx_put(ctx); return ERR_PTR(-EBUSY); } /* * If no process or bpffs is holding a reference to the map, no new callbacks should be * scheduled. This does not address any race or correctness issue, but rather is a policy * choice: dropping user references should stop everything. 
*/ if (!atomic64_read(&map->usercnt)) { /* drop ref we just got for task_work callback itself */ bpf_task_work_ctx_put(ctx); /* transfer map's ref into cancel_and_free() */ bpf_task_work_cancel_and_free(tw); return ERR_PTR(-EBUSY); } return ctx; } static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work *tw, struct bpf_map *map, bpf_task_work_callback_t callback_fn, struct bpf_prog_aux *aux, enum task_work_notify_mode mode) { struct bpf_prog *prog; struct bpf_task_work_ctx *ctx; int err; BTF_TYPE_EMIT(struct bpf_task_work); prog = bpf_prog_inc_not_zero(aux->prog); if (IS_ERR(prog)) return -EBADF; task = bpf_task_acquire(task); if (!task) { err = -EBADF; goto release_prog; } ctx = bpf_task_work_acquire_ctx(tw, map); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto release_all; } ctx->task = task; ctx->callback_fn = callback_fn; ctx->prog = prog; ctx->mode = mode; ctx->map = map; ctx->map_val = (void *)tw - map->record->task_work_off; init_task_work(&ctx->work, bpf_task_work_callback); init_irq_work(&ctx->irq_work, bpf_task_work_irq); irq_work_queue(&ctx->irq_work); return 0; release_all: bpf_task_release(task); release_prog: bpf_prog_put(prog); return err; } /** * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values * @callback: pointer to BPF subprogram to call * @aux: pointer to bpf_prog_aux of the caller BPF program, implicitly set by the verifier * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ __bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw, void *map__map, bpf_task_work_callback_t callback, struct bpf_prog_aux *aux) { return bpf_task_work_schedule(task, tw, map__map, callback, aux, TWA_SIGNAL); } /** * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values * @callback: pointer to BPF subprogram to call * @aux: pointer to bpf_prog_aux of the caller BPF program, implicitly set by the verifier * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ __bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw, void *map__map, bpf_task_work_callback_t callback, struct bpf_prog_aux *aux) { return bpf_task_work_schedule(task, tw, map__map, callback, aux, TWA_RESUME); } static int make_file_dynptr(struct file *file, u32 flags, bool may_sleep, struct bpf_dynptr_kern *ptr) { struct bpf_dynptr_file_impl *state; /* flags is currently unsupported */ if (flags) { bpf_dynptr_set_null(ptr); return -EINVAL; } state = kmalloc_nolock(sizeof(*state), 0, NUMA_NO_NODE); if (!state) { bpf_dynptr_set_null(ptr); return -ENOMEM; } state->offset = 0; state->size = U64_MAX; /* Don't restrict size, as file may change anyways */ freader_init_from_file(&state->freader, NULL, 0, file, may_sleep); bpf_dynptr_init(ptr, state, BPF_DYNPTR_TYPE_FILE, 0, 0); bpf_dynptr_set_rdonly(ptr); return 0; } __bpf_kfunc int bpf_dynptr_from_file(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit) { return make_file_dynptr(file, flags, false, 
(struct bpf_dynptr_kern *)ptr__uninit); } int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit) { return make_file_dynptr(file, flags, true, (struct bpf_dynptr_kern *)ptr__uninit); } __bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)dynptr; struct bpf_dynptr_file_impl *df = ptr->data; if (!df) return 0; freader_cleanup(&df->freader); kfree_nolock(df); bpf_dynptr_set_null(ptr); return 0; } /** * bpf_timer_cancel_async - try to deactivate a timer * @timer: bpf_timer to stop * * Returns: * * * 0 when the timer was not active * * 1 when the timer was active * * -1 when the timer is currently executing the callback function and * cannot be stopped * * -ECANCELED when the timer will be cancelled asynchronously * * -ENOMEM when out of memory * * -EINVAL when the timer was not initialized * * -ENOENT when this kfunc is racing with timer deletion */ __bpf_kfunc int bpf_timer_cancel_async(struct bpf_timer *timer) { struct bpf_async_kern *async = (void *)timer; struct bpf_async_cb *cb; int ret; cb = READ_ONCE(async->cb); if (!cb) return -EINVAL; /* * Unlike hrtimer_start() it's ok to synchronously call * hrtimer_try_to_cancel() when refcnt reached zero, but deferring to * irq_work is not, since irq callback may execute after RCU GP and * cb could be freed at that time. Check for refcnt zero for * consistency. */ if (!refcount_inc_not_zero(&cb->refcnt)) return -ENOENT; if (!defer_timer_wq_op()) { struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb); ret = hrtimer_try_to_cancel(&t->timer); bpf_async_refcount_put(cb); return ret; } else { ret = bpf_async_schedule_op(cb, BPF_ASYNC_CANCEL, 0, 0); return ret ? ret : -ECANCELED; } } __bpf_kfunc_end_defs(); static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work) { struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work); bpf_task_work_cancel(ctx); /* this might put task_work callback's ref */ bpf_task_work_ctx_put(ctx); /* and here we put map's own ref that was transferred to us */ } void bpf_task_work_cancel_and_free(void *val) { struct bpf_task_work_kern *twk = val; struct bpf_task_work_ctx *ctx; enum bpf_task_work_state state; ctx = xchg(&twk->ctx, NULL); if (!ctx) return; state = xchg(&ctx->state, BPF_TW_FREED); if (state == BPF_TW_SCHEDULED) { /* run in irq_work to avoid locks in NMI */ init_irq_work(&ctx->irq_work, bpf_task_work_cancel_scheduled); irq_work_queue(&ctx->irq_work); return; } bpf_task_work_ctx_put(ctx); /* put bpf map's ref */ } BTF_KFUNCS_START(generic_btf_ids) #ifdef CONFIG_CRASH_DUMP BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) #endif BTF_ID_FLAGS(func, bpf_obj_new, KF_ACQUIRE | KF_RET_NULL | KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_percpu_obj_new, KF_ACQUIRE | KF_RET_NULL | KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_obj_drop, KF_RELEASE | KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE) BTF_ID_FLAGS(func, bpf_percpu_obj_drop, KF_RELEASE | KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE) BTF_ID_FLAGS(func, bpf_refcount_acquire, KF_ACQUIRE | KF_RET_NULL | KF_RCU | KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL | KF_RCU) BTF_ID_FLAGS(func, bpf_list_push_front, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_list_push_front_impl) BTF_ID_FLAGS(func, 
bpf_list_push_back, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_list_push_back_impl) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_front, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_rbtree_add, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_rbtree_add_impl) BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_rbtree_root, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_rbtree_left, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_rbtree_right, KF_RET_NULL) #ifdef CONFIG_CGROUPS BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU) BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL) #endif BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_throw) #ifdef CONFIG_BPF_EVENTS BTF_ID_FLAGS(func, bpf_send_signal_task) #endif #ifdef CONFIG_KEYS BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) #ifdef CONFIG_SYSTEM_DATA_VERIFICATION BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE) #endif #endif #ifdef CONFIG_S390 BTF_ID_FLAGS(func, bpf_get_lowcore) #endif BTF_KFUNCS_END(generic_btf_ids) static const struct btf_kfunc_id_set generic_kfunc_set = { .owner = THIS_MODULE, .set = &generic_btf_ids, }; BTF_ID_LIST(generic_dtor_ids) BTF_ID(struct, task_struct) BTF_ID(func, bpf_task_release_dtor) #ifdef CONFIG_CGROUPS BTF_ID(struct, cgroup) BTF_ID(func, bpf_cgroup_release_dtor) #endif BTF_KFUNCS_START(common_btf_ids) BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx, KF_FASTCALL) BTF_ID_FLAGS(func, bpf_rdonly_cast, KF_FASTCALL) BTF_ID_FLAGS(func, bpf_rcu_read_lock) BTF_ID_FLAGS(func, bpf_rcu_read_unlock) BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU) BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY) #ifdef CONFIG_CGROUPS BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY) #endif BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_dynptr_adjust) BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, 
bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) BTF_ID_FLAGS(func, bpf_dynptr_copy) BTF_ID_FLAGS(func, bpf_dynptr_memset) #ifdef CONFIG_NET BTF_ID_FLAGS(func, bpf_modify_return_test_tp) #endif BTF_ID_FLAGS(func, bpf_wq_init) BTF_ID_FLAGS(func, bpf_wq_set_callback, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_wq_start) BTF_ID_FLAGS(func, bpf_preempt_disable) BTF_ID_FLAGS(func, bpf_preempt_enable) BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_copy_from_user_task_str, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_get_kmem_cache) BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_local_irq_save) BTF_ID_FLAGS(func, bpf_local_irq_restore) #ifdef CONFIG_BPF_EVENTS BTF_ID_FLAGS(func, bpf_probe_read_user_dynptr) BTF_ID_FLAGS(func, bpf_probe_read_kernel_dynptr) BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr) BTF_ID_FLAGS(func, bpf_probe_read_kernel_str_dynptr) BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE) #endif #ifdef CONFIG_DMA_SHARED_BUFFER BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_iter_dmabuf_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) #endif BTF_ID_FLAGS(func, __bpf_trap) BTF_ID_FLAGS(func, bpf_strcmp); BTF_ID_FLAGS(func, bpf_strcasecmp); BTF_ID_FLAGS(func, bpf_strncasecmp); BTF_ID_FLAGS(func, bpf_strchr); BTF_ID_FLAGS(func, bpf_strchrnul); BTF_ID_FLAGS(func, bpf_strnchr); BTF_ID_FLAGS(func, bpf_strrchr); BTF_ID_FLAGS(func, bpf_strlen); BTF_ID_FLAGS(func, bpf_strnlen); BTF_ID_FLAGS(func, bpf_strspn); BTF_ID_FLAGS(func, bpf_strcspn); BTF_ID_FLAGS(func, bpf_strstr); BTF_ID_FLAGS(func, bpf_strcasestr); BTF_ID_FLAGS(func, bpf_strnstr); BTF_ID_FLAGS(func, bpf_strncasestr); #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_stream_print_stack, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_dynptr_from_file) BTF_ID_FLAGS(func, bpf_dynptr_file_discard) BTF_ID_FLAGS(func, bpf_timer_cancel_async) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { .owner = THIS_MODULE, .set = &common_btf_ids, }; static int __init kfunc_init(void) { int ret; const struct btf_id_dtor_kfunc generic_dtors[] = { { .btf_id = generic_dtor_ids[0], .kfunc_btf_id = generic_dtor_ids[1] }, #ifdef CONFIG_CGROUPS { .btf_id = generic_dtor_ids[2], .kfunc_btf_id = generic_dtor_ids[3] }, #endif }; ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set); ret = ret ?: 
register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &generic_kfunc_set); ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors, ARRAY_SIZE(generic_dtors), THIS_MODULE); return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set); } late_initcall(kfunc_init); /* Get a pointer to dynptr data up to len bytes for read only access. If * the dynptr doesn't have continuous data up to len bytes, return NULL. */ const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u64 len) { const struct bpf_dynptr *p = (struct bpf_dynptr *)ptr; return bpf_dynptr_slice(p, 0, NULL, len); } /* Get a pointer to dynptr data up to len bytes for read write access. If * the dynptr doesn't have continuous data up to len bytes, or the dynptr * is read only, return NULL. */ void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u64 len) { if (__bpf_dynptr_is_rdonly(ptr)) return NULL; return (void *)__bpf_dynptr_data(ptr, len); } void bpf_map_free_internal_structs(struct bpf_map *map, void *val) { if (btf_record_has_field(map->record, BPF_TIMER)) bpf_obj_free_timer(map->record, val); if (btf_record_has_field(map->record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(map->record, val); if (btf_record_has_field(map->record, BPF_TASK_WORK)) bpf_obj_free_task_work(map->record, val); } |
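Illustrative sketch only (not part of the kernel sources above): how a sleepable BPF program might chain the key-lookup and PKCS#7 kfuncs documented above, using the usual libbpf __ksym extern declarations. The map layout, the key_serial variable and the LSM hook chosen are assumptions for the example; real code would track actual payload/signature lengths instead of fixed-size buffers.

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* extern declarations mirroring the kfunc signatures above */
extern struct bpf_key *bpf_lookup_user_key(__s32 serial, __u64 flags) __ksym;
extern void bpf_key_put(struct bpf_key *bkey) __ksym;
extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p,
				      struct bpf_dynptr *sig_p,
				      struct bpf_key *trusted_keyring) __ksym;

/* hypothetical blob layout filled in by user space */
struct blob {
	__u8 data[256];		/* payload to authenticate */
	__u8 sig[512];		/* detached PKCS#7 signature over the payload */
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, struct blob);
} blob_map SEC(".maps");

const volatile __s32 key_serial = 0;	/* keyring serial, set by user space */

SEC("lsm.s/bpf")	/* sleepable hook: bpf_lookup_user_key() is KF_SLEEPABLE */
int BPF_PROG(verify_blob)
{
	struct bpf_dynptr data_ptr, sig_ptr;
	struct bpf_key *trusted;
	struct blob *b;
	__u32 zero = 0;
	int ret;

	b = bpf_map_lookup_elem(&blob_map, &zero);
	if (!b)
		return 0;

	/* permission checks on the key are deferred until it is used below */
	trusted = bpf_lookup_user_key(key_serial, 0);
	if (!trusted)
		return 0;

	/* fixed sizes keep the sketch verifier-friendly */
	bpf_dynptr_from_mem(b->data, sizeof(b->data), 0, &data_ptr);
	bpf_dynptr_from_mem(b->sig, sizeof(b->sig), 0, &sig_ptr);

	ret = bpf_verify_pkcs7_signature(&data_ptr, &sig_ptr, trusted);

	bpf_key_put(trusted);	/* release the reference taken by the lookup */
	return ret;
}

char _license[] SEC("license") = "GPL";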
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. */ #ifndef __XFS_BUF_H__ #define __XFS_BUF_H__ #include <linux/list.h> #include <linux/types.h> #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/fs.h> #include <linux/dax.h> #include <linux/uio.h> #include <linux/list_lru.h> #include <linux/lockref.h> extern struct kmem_cache *xfs_buf_cache; /* * Base types */ struct xfs_buf; #define XFS_BUF_DADDR_MAX ((xfs_daddr_t) S64_MAX) #define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) #define XBF_READ (1u << 0) /* buffer intended for reading from device */ #define XBF_WRITE (1u << 1) /* buffer intended for writing to device */ #define XBF_READ_AHEAD (1u << 2) /* asynchronous read-ahead */ #define XBF_ASYNC (1u << 4) /* initiator will not wait for completion */ #define XBF_DONE (1u << 5) /* all pages in the buffer uptodate */ #define XBF_STALE (1u << 6) /* buffer has been staled, do not find it */ #define XBF_WRITE_FAIL (1u << 7) /* async writes have failed on this buffer */ /* buffer type flags for write callbacks */ #define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */ /* flags used only internally */ #define _XBF_KMEM (1u << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */ /* flags used only as arguments to access routines */ /* * Online fsck is scanning the buffer cache for live buffers. Do not warn * about length mismatches during lookups and do not return stale buffers. 
*/ #define XBF_LIVESCAN (1u << 28) #define XBF_INCORE (1u << 29)/* lookup only, return if found in cache */ #define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */ typedef unsigned int xfs_buf_flags_t; #define XFS_BUF_FLAGS \ { XBF_READ, "READ" }, \ { XBF_WRITE, "WRITE" }, \ { XBF_READ_AHEAD, "READ_AHEAD" }, \ { XBF_ASYNC, "ASYNC" }, \ { XBF_DONE, "DONE" }, \ { XBF_STALE, "STALE" }, \ { XBF_WRITE_FAIL, "WRITE_FAIL" }, \ { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ /* The following interface flags should never be set */ \ { XBF_LIVESCAN, "LIVESCAN" }, \ { XBF_INCORE, "INCORE" }, \ { XBF_TRYLOCK, "TRYLOCK" } /* * The xfs_buftarg contains 2 notions of "sector size" - * * 1) The metadata sector size, which is the minimum unit and * alignment of IO which will be performed by metadata operations. * 2) The device logical sector size * * The first is specified at mkfs time, and is stored on-disk in the * superblock's sb_sectsize. * * The latter is derived from the underlying device, and controls direct IO * alignment constraints. */ struct xfs_buftarg { dev_t bt_dev; struct block_device *bt_bdev; struct dax_device *bt_daxdev; struct file *bt_file; u64 bt_dax_part_off; struct xfs_mount *bt_mount; unsigned int bt_meta_sectorsize; size_t bt_meta_sectormask; size_t bt_logical_sectorsize; size_t bt_logical_sectormask; xfs_daddr_t bt_nr_sectors; /* LRU control structures */ struct shrinker *bt_shrinker; struct list_lru bt_lru; struct percpu_counter bt_readahead_count; struct ratelimit_state bt_ioerror_rl; /* Hardware atomic write unit values, bytes */ unsigned int bt_awu_min; unsigned int bt_awu_max; struct rhashtable bt_hash; }; struct xfs_buf_map { xfs_daddr_t bm_bn; /* block number for I/O */ int bm_len; /* size of I/O */ unsigned int bm_flags; }; /* * Online fsck is scanning the buffer cache for live buffers. Do not warn * about length mismatches during lookups and do not return stale buffers. */ #define XBM_LIVESCAN (1U << 0) #define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \ struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; struct xfs_buf_ops { char *name; union { __be32 magic[2]; /* v4 and v5 on disk magic values */ __be16 magic16[2]; /* v4 and v5 on disk magic values */ }; void (*verify_read)(struct xfs_buf *); void (*verify_write)(struct xfs_buf *); xfs_failaddr_t (*verify_struct)(struct xfs_buf *bp); }; struct xfs_buf { /* * first cacheline holds all the fields needed for an uncontended cache * hit to be fully processed. The semaphore straddles the cacheline * boundary, but the counter and lock sits on the first cacheline, * which is the only bit that is touched if we hit the semaphore * fast-path on locking. 
*/ struct rhash_head b_rhash_head; /* pag buffer hash node */ xfs_daddr_t b_rhash_key; /* buffer cache index */ int b_length; /* size of buffer in BBs */ struct lockref b_lockref; /* refcount + lock */ atomic_t b_lru_ref; /* lru reclaim ref count */ xfs_buf_flags_t b_flags; /* status flags */ struct semaphore b_sema; /* semaphore for lockables */ /* * concurrent access to b_lru and b_lru_flags are protected by * bt_lru_lock and not by b_sema */ struct list_head b_lru; /* lru list */ wait_queue_head_t b_waiters; /* unpin waiters */ struct list_head b_list; struct xfs_perag *b_pag; struct xfs_mount *b_mount; struct xfs_buftarg *b_target; /* buffer target (device) */ void *b_addr; /* virtual address of buffer */ struct work_struct b_ioend_work; struct completion b_iowait; /* queue for I/O waiters */ struct xfs_buf_log_item *b_log_item; struct list_head b_li_list; /* Log items list head */ struct xfs_trans *b_transp; struct xfs_buf_map *b_maps; /* compound buffer map */ struct xfs_buf_map __b_map; /* inline compound buffer map */ int b_map_count; atomic_t b_pin_count; /* pin count */ int b_error; /* error code on I/O */ void (*b_iodone)(struct xfs_buf *bp); /* * async write failure retry count. Initialised to zero on the first * failure, then when it exceeds the maximum configured without a * success the write is considered to be failed permanently and the * iodone handler will take appropriate action. * * For retry timeouts, we record the jiffy of the first failure. This * means that we can change the retry timeout for buffers already under * I/O and thus avoid getting stuck in a retry loop with a long timeout. * * last_error is used to ensure that we are getting repeated errors, not * different errors. e.g. a block device might change ENOSPC to EIO when * a failure timeout occurs, so we want to re-initialise the error * retry behaviour appropriately when that happens. 
*/ int b_retries; unsigned long b_first_retry_time; /* in jiffies */ int b_last_error; const struct xfs_buf_ops *b_ops; struct rcu_head b_rcu; }; /* Finding and Reading Buffers */ int xfs_buf_get_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp); int xfs_buf_read_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp, const struct xfs_buf_ops *ops, xfs_failaddr_t fa); void xfs_buf_readahead_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, const struct xfs_buf_ops *ops); static inline int xfs_buf_incore( struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); return xfs_buf_get_map(target, &map, 1, XBF_INCORE | flags, bpp); } static inline int xfs_buf_get( struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, struct xfs_buf **bpp) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); return xfs_buf_get_map(target, &map, 1, 0, bpp); } static inline int xfs_buf_read( struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); return xfs_buf_read_map(target, &map, 1, flags, bpp, ops, __builtin_return_address(0)); } static inline void xfs_buf_readahead( struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, const struct xfs_buf_ops *ops) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); return xfs_buf_readahead_map(target, &map, 1, ops); } int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, struct xfs_buf **bpp); int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, struct xfs_buf **bpp, const struct xfs_buf_ops *ops); int _xfs_buf_read(struct xfs_buf *bp); void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ extern void xfs_buf_rele(struct xfs_buf *); /* Locking and Unlocking Buffers */ extern int xfs_buf_trylock(struct xfs_buf *); extern void xfs_buf_lock(struct xfs_buf *); extern void xfs_buf_unlock(struct xfs_buf *); #define xfs_buf_islocked(bp) \ ((bp)->b_sema.count <= 0) static inline void xfs_buf_relse(struct xfs_buf *bp) { xfs_buf_unlock(bp); xfs_buf_rele(bp); } /* Buffer Read and Write Routines */ extern int xfs_bwrite(struct xfs_buf *bp); extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr); #define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address) extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa); void xfs_buf_ioend_fail(struct xfs_buf *); void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa); #define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address) /* Buffer Utility Routines */ static inline void *xfs_buf_offset(struct xfs_buf *bp, size_t offset) { return bp->b_addr + offset; } static inline void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize) { memset(bp->b_addr + boff, 0, bsize); } extern void xfs_buf_stale(struct xfs_buf *bp); /* Delayed Write Buffer Routines */ extern void xfs_buf_delwri_cancel(struct list_head *); extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); void xfs_buf_delwri_queue_here(struct xfs_buf *bp, struct list_head *bl); extern int xfs_buf_delwri_submit(struct list_head *); extern int xfs_buf_delwri_submit_nowait(struct list_head *); static inline xfs_daddr_t xfs_buf_daddr(struct xfs_buf *bp) { 
return bp->b_maps[0].bm_bn; } void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref); /* * If the buffer is already on the LRU, do nothing. Otherwise set the buffer * up with a reference count of 0 so it will be tossed from the cache when * released. */ static inline void xfs_buf_oneshot(struct xfs_buf *bp) { if (!list_empty(&bp->b_lru) || atomic_read(&bp->b_lru_ref) > 1) return; atomic_set(&bp->b_lru_ref, 0); } static inline int xfs_buf_ispinned(struct xfs_buf *bp) { return atomic_read(&bp->b_pin_count); } static inline int xfs_buf_verify_cksum(struct xfs_buf *bp, unsigned long cksum_offset) { return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), cksum_offset); } static inline void xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) { xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), cksum_offset); } /* * Handling of buftargs. */ struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp, struct file *bdev_file); extern void xfs_free_buftarg(struct xfs_buftarg *); extern void xfs_buftarg_wait(struct xfs_buftarg *); extern void xfs_buftarg_drain(struct xfs_buftarg *); int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize, xfs_fsblock_t nr_blocks); #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic); bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic); /* for xfs_buf_mem.c only: */ int xfs_init_buftarg(struct xfs_buftarg *btp, size_t logical_sectorsize, const char *descr); void xfs_destroy_buftarg(struct xfs_buftarg *btp); #endif /* __XFS_BUF_H__ */ |
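Illustrative sketch only (not part of xfs_buf.h): the common lock-read-modify-write-release pattern built on the wrappers declared above. It assumes the usual xfs.h includes and struct xfs_mount's data-device buftarg (m_ddev_targp); the function name and arguments are placeholders for the example.

static int
xfs_example_update_block(
	struct xfs_mount	*mp,
	xfs_daddr_t		daddr,
	size_t			numblks,
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf		*bp;
	int			error;

	/* cache lookup; on a miss the buffer is read in from the data device */
	error = xfs_buf_read(mp->m_ddev_targp, daddr, numblks, 0, &bp, ops);
	if (error)
		return error;

	/* the buffer is returned locked and held; b_addr is its payload */
	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));

	/* synchronous write back, then drop the lock and the hold */
	error = xfs_bwrite(bp);
	xfs_buf_relse(bp);
	return error;
}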
// SPDX-License-Identifier: GPL-2.0-only /* * linux/lib/cmdline.c * Helper functions generally used for parsing kernel command line * and module options. * * Code and copyrights come from init/main.c and arch/i386/kernel/setup.c. * * GNU Indent formatting options for this file: -kr -i8 -npsl -pcs */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/ctype.h> /* * If a hyphen was found in get_option, this will handle the * range of numbers, M-N. This will expand the range and insert * the values[M, M+1, ..., N] into the ints array in get_options. */ static int get_range(char **str, int *pint, int n) { int x, inc_counter, upper_range; (*str)++; upper_range = simple_strtol((*str), NULL, 0); inc_counter = upper_range - *pint; for (x = *pint; n && x < upper_range; x++, n--) *pint++ = x; return inc_counter; } /** * get_option - Parse integer from an option string * @str: option string * @pint: (optional output) integer value parsed from @str * * Read an int from an option string; if available accept a subsequent * comma as well. * * When @pint is NULL the function can be used as a validator of * the current option in the string. * * Return values: * 0 - no int in string * 1 - int found, no subsequent comma * 2 - int found including a subsequent comma * 3 - hyphen found to denote a range * * Leading hyphen without integer is no integer case, but we consume it * for the sake of simplification. */ int get_option(char **str, int *pint) { char *cur = *str; int value; if (!cur || !(*cur)) return 0; if (*cur == '-') value = -simple_strtoull(++cur, str, 0); else value = simple_strtoull(cur, str, 0); if (pint) *pint = value; if (cur == *str) return 0; if (**str == ',') { (*str)++; return 2; } if (**str == '-') return 3; return 1; } EXPORT_SYMBOL(get_option); /** * get_options - Parse a string into a list of integers * @str: String to be parsed * @nints: size of integer array * @ints: integer array (must have room for at least one element) * * This function parses a string containing a comma-separated * list of integers, a hyphen-separated range of _positive_ integers, * or a combination of both. The parse halts when the array is * full, or when no more numbers can be retrieved from the * string. * * When @nints is 0, the function just validates the given @str and * returns the amount of parseable integers as described below. * * Returns: * * The first element is filled by the number of collected integers * in the range. The rest is what was parsed from the @str. 
* * Return value is the character in the string which caused * the parse to end (typically a null terminator, if @str is * completely parseable). */ char *get_options(const char *str, int nints, int *ints) { bool validate = (nints == 0); int res, i = 1; while (i < nints || validate) { int *pint = validate ? ints : ints + i; res = get_option((char **)&str, pint); if (res == 0) break; if (res == 3) { int n = validate ? 0 : nints - i; int range_nums; range_nums = get_range((char **)&str, pint, n); if (range_nums < 0) break; /* * Decrement the result by one to leave out the * last number in the range. The next iteration * will handle the upper number in the range */ i += (range_nums - 1); } i++; if (res == 1) break; } ints[0] = i - 1; return (char *)str; } EXPORT_SYMBOL(get_options); /** * memparse - parse a string with mem suffixes into a number * @ptr: Where parse begins * @retptr: (output) Optional pointer to next char after parse completes * * Parses a string into a number. The number stored at @ptr is * potentially suffixed with K, M, G, T, P, E. */ unsigned long long memparse(const char *ptr, char **retptr) { char *endptr; /* local pointer to end of parsed string */ unsigned long long ret = simple_strtoull(ptr, &endptr, 0); switch (*endptr) { case 'E': case 'e': ret <<= 10; fallthrough; case 'P': case 'p': ret <<= 10; fallthrough; case 'T': case 't': ret <<= 10; fallthrough; case 'G': case 'g': ret <<= 10; fallthrough; case 'M': case 'm': ret <<= 10; fallthrough; case 'K': case 'k': ret <<= 10; endptr++; fallthrough; default: break; } if (retptr) *retptr = endptr; return ret; } EXPORT_SYMBOL(memparse); /** * parse_option_str - Parse a string and check an option is set or not * @str: String to be parsed * @option: option name * * This function parses a string containing a comma-separated list of * strings like a=b,c. * * Return true if there's such option in the string, or return false. */ bool parse_option_str(const char *str, const char *option) { while (*str) { if (!strncmp(str, option, strlen(option))) { str += strlen(option); if (!*str || *str == ',') return true; } while (*str && *str != ',') str++; if (*str == ',') str++; } return false; } /* * Parse a string to get a param value pair. * You can use " around spaces, but can't escape ". * Hyphens and underscores equivalent in parameter names. */ char *next_arg(char *args, char **param, char **val) { unsigned int i, equals = 0; int in_quote = 0, quoted = 0; if (*args == '"') { args++; in_quote = 1; quoted = 1; } for (i = 0; args[i]; i++) { if (isspace(args[i]) && !in_quote) break; if (equals == 0) { if (args[i] == '=') equals = i; } if (args[i] == '"') in_quote = !in_quote; } *param = args; if (!equals) *val = NULL; else { args[equals] = '\0'; *val = args + equals + 1; /* Don't include quotes in value. */ if (**val == '"') { (*val)++; if (args[i-1] == '"') args[i-1] = '\0'; } } if (quoted && i > 0 && args[i-1] == '"') args[i-1] = '\0'; if (args[i]) { args[i] = '\0'; args += i + 1; } else args += i; /* Chew up trailing spaces. */ return skip_spaces(args); } EXPORT_SYMBOL(next_arg); |
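Illustrative sketch only (not part of lib/cmdline.c): how a driver might combine the helpers above to parse a writable option string such as "irq=5,7-9 mem=64M". The option names are invented for the example; next_arg() modifies the string in place, so it must not be const.

#include <linux/kernel.h>
#include <linux/string.h>

static void example_parse_options(char *cmdline)
{
	int irqs[8];	/* irqs[0] receives the count of integers collected */
	char *param, *val;

	cmdline = skip_spaces(cmdline);
	while (*cmdline) {
		/* splits off one "param=val" token, NUL-terminating it */
		cmdline = next_arg(cmdline, &param, &val);
		if (!val)
			continue;
		if (!strcmp(param, "irq")) {
			/* "5,7-9" expands to 5, 7, 8, 9 in irqs[1..] */
			get_options(val, ARRAY_SIZE(irqs), irqs);
		} else if (!strcmp(param, "mem")) {
			/* K/M/G/T/P/E suffixes scale the parsed value */
			unsigned long long bytes = memparse(val, NULL);

			pr_info("mem option: %llu bytes\n", bytes);
		}
	}
}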
// SPDX-License-Identifier: GPL-2.0-only /* * AppArmor security module * * This file contains AppArmor mediation of files * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2010 Canonical Ltd. 
*/ #include <linux/tty.h> #include <linux/fdtable.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/mount.h> #include "include/af_unix.h" #include "include/apparmor.h" #include "include/audit.h" #include "include/cred.h" #include "include/file.h" #include "include/match.h" #include "include/net.h" #include "include/path.h" #include "include/policy.h" #include "include/label.h" static u32 map_mask_to_chr_mask(u32 mask) { u32 m = mask & PERMS_CHRS_MASK; if (mask & AA_MAY_GETATTR) m |= MAY_READ; if (mask & (AA_MAY_SETATTR | AA_MAY_CHMOD | AA_MAY_CHOWN)) m |= MAY_WRITE; return m; } /** * file_audit_cb - call back for file specific audit fields * @ab: audit_buffer (NOT NULL) * @va: audit struct to audit values of (NOT NULL) */ static void file_audit_cb(struct audit_buffer *ab, void *va) { struct common_audit_data *sa = va; struct apparmor_audit_data *ad = aad(sa); kuid_t fsuid = ad->subj_cred ? ad->subj_cred->fsuid : current_fsuid(); char str[10]; if (ad->request & AA_AUDIT_FILE_MASK) { aa_perm_mask_to_str(str, sizeof(str), aa_file_perm_chrs, map_mask_to_chr_mask(ad->request)); audit_log_format(ab, " requested_mask=\"%s\"", str); } if (ad->denied & AA_AUDIT_FILE_MASK) { aa_perm_mask_to_str(str, sizeof(str), aa_file_perm_chrs, map_mask_to_chr_mask(ad->denied)); audit_log_format(ab, " denied_mask=\"%s\"", str); } if (ad->request & AA_AUDIT_FILE_MASK) { audit_log_format(ab, " fsuid=%d", from_kuid(&init_user_ns, fsuid)); audit_log_format(ab, " ouid=%d", from_kuid(&init_user_ns, ad->fs.ouid)); } if (ad->peer) { audit_log_format(ab, " target="); aa_label_xaudit(ab, labels_ns(ad->subj_label), ad->peer, FLAG_VIEW_SUBNS, GFP_KERNEL); } else if (ad->fs.target) { audit_log_format(ab, " target="); audit_log_untrustedstring(ab, ad->fs.target); } } /** * aa_audit_file - handle the auditing of file operations * @subj_cred: cred of the subject * @profile: the profile being enforced (NOT NULL) * @perms: the permissions computed for the request (NOT NULL) * @op: operation being mediated * @request: permissions requested * @name: name of object being mediated (MAYBE NULL) * @target: name of target (MAYBE NULL) * @tlabel: target label (MAY BE NULL) * @ouid: object uid * @info: extra information message (MAYBE NULL) * @error: 0 if operation allowed else failure error code * * Returns: %0 or error on failure */ int aa_audit_file(const struct cred *subj_cred, struct aa_profile *profile, struct aa_perms *perms, const char *op, u32 request, const char *name, const char *target, struct aa_label *tlabel, kuid_t ouid, const char *info, int error) { int type = AUDIT_APPARMOR_AUTO; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_TASK, AA_CLASS_FILE, op); ad.subj_cred = subj_cred; ad.request = request; ad.tags = perms->tag; ad.name = name; ad.fs.target = target; ad.peer = tlabel; ad.fs.ouid = ouid; ad.info = info; ad.error = error; ad.common.u.tsk = NULL; if (likely(!ad.error)) { u32 mask = perms->audit; if (unlikely(AUDIT_MODE(profile) == AUDIT_ALL)) mask = 0xffff; /* mask off perms that are not being force audited */ ad.request &= mask; if (likely(!ad.request)) return 0; type = AUDIT_APPARMOR_AUDIT; } else { /* only report permissions that were denied */ ad.request = ad.request & ~perms->allow; AA_BUG(!ad.request); if (ad.request & perms->kill) type = AUDIT_APPARMOR_KILL; /* quiet known rejects, assumes quiet and kill do not overlap */ if ((ad.request & perms->quiet) && AUDIT_MODE(profile) != AUDIT_NOQUIET && AUDIT_MODE(profile) != AUDIT_ALL) ad.request &= ~perms->quiet; if (!ad.request) return ad.error; } ad.denied = 
ad.request & ~perms->allow; return aa_audit(type, profile, &ad, file_audit_cb); } static int path_name(const char *op, const struct cred *subj_cred, struct aa_label *label, const struct path *path, int flags, char *buffer, const char **name, struct path_cond *cond, u32 request) { struct aa_profile *profile; const char *info = NULL; int error; /* don't reaudit files closed during inheritance */ if (unlikely(path->dentry == aa_null.dentry)) error = -EACCES; else error = aa_path_name(path, flags, buffer, name, &info, labels_profile(label)->disconnected); if (error) { fn_for_each_confined(label, profile, aa_audit_file(subj_cred, profile, &nullperms, op, request, *name, NULL, NULL, cond->uid, info, error)); return error; } return 0; } struct aa_perms default_perms = {}; /** * aa_lookup_condperms - convert dfa compressed perms to internal perms * @subj_uid: uid to use for subject owner test * @rules: the aa_policydb to lookup perms for (NOT NULL) * @state: state in dfa * @cond: conditions to consider (NOT NULL) * * TODO: convert from dfa + state to permission entry * * Returns: a pointer to a file permission set */ struct aa_perms *aa_lookup_condperms(kuid_t subj_uid, struct aa_policydb *rules, aa_state_t state, struct path_cond *cond) { unsigned int index = ACCEPT_TABLE(rules->dfa)[state]; if (!(rules->perms)) return &default_perms; if ((ACCEPT_TABLE2(rules->dfa)[state] & ACCEPT_FLAG_OWNER)) { if (uid_eq(subj_uid, cond->uid)) return &(rules->perms[index]); return &(rules->perms[index + 1]); } return &(rules->perms[index]); } /** * aa_str_perms - find permission that match @name * @file_rules: the aa_policydb to match against (NOT NULL) * @start: state to start matching in * @name: string to match against dfa (NOT NULL) * @cond: conditions to consider for permission set computation (NOT NULL) * @perms: Returns - the permissions found when matching @name * * Returns: the final state in @dfa when beginning @start and walking @name */ aa_state_t aa_str_perms(struct aa_policydb *file_rules, aa_state_t start, const char *name, struct path_cond *cond, struct aa_perms *perms) { aa_state_t state; state = aa_dfa_match(file_rules->dfa, start, name); *perms = *(aa_lookup_condperms(current_fsuid(), file_rules, state, cond)); return state; } int __aa_path_perm(const char *op, const struct cred *subj_cred, struct aa_profile *profile, const char *name, u32 request, struct path_cond *cond, int flags, struct aa_perms *perms) { struct aa_ruleset *rules = profile->label.rules[0]; int e = 0; if (profile_unconfined(profile) || ((flags & PATH_SOCK_COND) && !RULE_MEDIATES_v9NET(rules))) return 0; aa_str_perms(rules->file, rules->file->start[AA_CLASS_FILE], name, cond, perms); if (request & ~perms->allow) e = -EACCES; return aa_audit_file(subj_cred, profile, perms, op, request, name, NULL, NULL, cond->uid, NULL, e); } static int profile_path_perm(const char *op, const struct cred *subj_cred, struct aa_profile *profile, const struct path *path, char *buffer, u32 request, struct path_cond *cond, int flags, struct aa_perms *perms) { const char *name; int error; if (profile_unconfined(profile)) return 0; error = path_name(op, subj_cred, &profile->label, path, flags | profile->path_flags, buffer, &name, cond, request); if (error) return error; return __aa_path_perm(op, subj_cred, profile, name, request, cond, flags, perms); } /** * aa_path_perm - do permissions check & audit for @path * @op: operation being checked * @subj_cred: subject cred * @label: profile being enforced (NOT NULL) * @path: path to check permissions of 
(NOT NULL) * @flags: any additional path flags beyond what the profile specifies * @request: requested permissions * @cond: conditional info for this request (NOT NULL) * * Returns: %0 else error if access denied or other error */ int aa_path_perm(const char *op, const struct cred *subj_cred, struct aa_label *label, const struct path *path, int flags, u32 request, struct path_cond *cond) { struct aa_perms perms = {}; struct aa_profile *profile; char *buffer = NULL; int error; flags |= PATH_DELEGATE_DELETED | (S_ISDIR(cond->mode) ? PATH_IS_DIR : 0); buffer = aa_get_buffer(false); if (!buffer) return -ENOMEM; error = fn_for_each_confined(label, profile, profile_path_perm(op, subj_cred, profile, path, buffer, request, cond, flags, &perms)); aa_put_buffer(buffer); return error; } /** * xindex_is_subset - helper for aa_path_link * @link: link permission set * @target: target permission set * * test target x permissions are equal OR a subset of link x permissions * this is done as part of the subset test, where a hardlink must have * a subset of permissions that the target has. * * Returns: true if subset else false */ static inline bool xindex_is_subset(u32 link, u32 target) { if (((link & ~AA_X_UNSAFE) != (target & ~AA_X_UNSAFE)) || ((link & AA_X_UNSAFE) && !(target & AA_X_UNSAFE))) return false; return true; } static int profile_path_link(const struct cred *subj_cred, struct aa_profile *profile, const struct path *link, char *buffer, const struct path *target, char *buffer2, struct path_cond *cond) { struct aa_ruleset *rules = profile->label.rules[0]; const char *lname, *tname = NULL; struct aa_perms lperms = {}, perms; const char *info = NULL; u32 request = AA_MAY_LINK; aa_state_t state; int error; error = path_name(OP_LINK, subj_cred, &profile->label, link, profile->path_flags, buffer, &lname, cond, AA_MAY_LINK); if (error) goto audit; /* buffer2 freed below, tname is pointer in buffer2 */ error = path_name(OP_LINK, subj_cred, &profile->label, target, profile->path_flags, buffer2, &tname, cond, AA_MAY_LINK); if (error) goto audit; error = -EACCES; /* aa_str_perms - handles the case of the dfa being NULL */ state = aa_str_perms(rules->file, rules->file->start[AA_CLASS_FILE], lname, cond, &lperms); if (!(lperms.allow & AA_MAY_LINK)) goto audit; /* test to see if target can be paired with link */ state = aa_dfa_null_transition(rules->file->dfa, state); aa_str_perms(rules->file, state, tname, cond, &perms); /* force audit/quiet masks for link are stored in the second entry * in the link pair. */ lperms.audit = perms.audit; lperms.quiet = perms.quiet; lperms.kill = perms.kill; if (!(perms.allow & AA_MAY_LINK)) { info = "target restricted"; lperms = perms; goto audit; } /* done if link subset test is not required */ if (!(perms.allow & AA_LINK_SUBSET)) goto done_tests; /* Do link perm subset test requiring allowed permission on link are * a subset of the allowed permissions on target. 
*/ aa_str_perms(rules->file, rules->file->start[AA_CLASS_FILE], tname, cond, &perms); /* AA_MAY_LINK is not considered in the subset test */ request = lperms.allow & ~AA_MAY_LINK; lperms.allow &= perms.allow | AA_MAY_LINK; request |= AA_AUDIT_FILE_MASK & (lperms.allow & ~perms.allow); if (request & ~lperms.allow) { goto audit; } else if ((lperms.allow & MAY_EXEC) && !xindex_is_subset(lperms.xindex, perms.xindex)) { lperms.allow &= ~MAY_EXEC; request |= MAY_EXEC; info = "link not subset of target"; goto audit; } done_tests: error = 0; audit: return aa_audit_file(subj_cred, profile, &lperms, OP_LINK, request, lname, tname, NULL, cond->uid, info, error); } /** * aa_path_link - Handle hard link permission check * @subj_cred: subject cred * @label: the label being enforced (NOT NULL) * @old_dentry: the target dentry (NOT NULL) * @new_dir: directory the new link will be created in (NOT NULL) * @new_dentry: the link being created (NOT NULL) * * Handle the permission test for a link & target pair. Permission * is encoded as a pair where the link permission is determined * first, and if allowed, the target is tested. The target test * is done from the point of the link match (not start of DFA) * making the target permission dependent on the link permission match. * * The subset test if required forces that permissions granted * on link are a subset of the permission granted to target. * * Returns: %0 if allowed else error */ int aa_path_link(const struct cred *subj_cred, struct aa_label *label, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) { struct path link = { .mnt = new_dir->mnt, .dentry = new_dentry }; struct path target = { .mnt = new_dir->mnt, .dentry = old_dentry }; struct inode *inode = d_backing_inode(old_dentry); vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_idmap(target.mnt), inode); struct path_cond cond = { .uid = vfsuid_into_kuid(vfsuid), .mode = inode->i_mode, }; char *buffer = NULL, *buffer2 = NULL; struct aa_profile *profile; int error; /* buffer freed below, lname is pointer in buffer */ buffer = aa_get_buffer(false); buffer2 = aa_get_buffer(false); error = -ENOMEM; if (!buffer || !buffer2) goto out; error = fn_for_each_confined(label, profile, profile_path_link(subj_cred, profile, &link, buffer, &target, buffer2, &cond)); out: aa_put_buffer(buffer); aa_put_buffer(buffer2); return error; } static void update_file_ctx(struct aa_file_ctx *fctx, struct aa_label *label, u32 request) { struct aa_label *l, *old; /* update caching of label on file_ctx */ spin_lock(&fctx->lock); old = rcu_dereference_protected(fctx->label, lockdep_is_held(&fctx->lock)); l = aa_label_merge(old, label, GFP_ATOMIC); if (l) { if (l != old) { rcu_assign_pointer(fctx->label, l); aa_put_label(old); } else aa_put_label(l); fctx->allow |= request; } spin_unlock(&fctx->lock); } static int __file_path_perm(const char *op, const struct cred *subj_cred, struct aa_label *label, struct aa_label *flabel, struct file *file, u32 request, u32 denied, bool in_atomic) { struct aa_profile *profile; struct aa_perms perms = {}; vfsuid_t vfsuid = i_uid_into_vfsuid(file_mnt_idmap(file), file_inode(file)); struct path_cond cond = { .uid = vfsuid_into_kuid(vfsuid), .mode = file_inode(file)->i_mode }; char *buffer; int flags, error; /* revalidation due to label out of date. No revocation at this time */ if (!denied && aa_label_is_subset(flabel, label)) /* TODO: check for revocation on stale profiles */ return 0; flags = PATH_DELEGATE_DELETED | (S_ISDIR(cond.mode) ? 
PATH_IS_DIR : 0); buffer = aa_get_buffer(in_atomic); if (!buffer) return -ENOMEM; /* check every profile in task label not in current cache */ error = fn_for_each_not_in_set(flabel, label, profile, profile_path_perm(op, subj_cred, profile, &file->f_path, buffer, request, &cond, flags, &perms)); if (denied && !error) { /* * check every profile in file label that was not tested * in the initial check above. * * TODO: cache full perms so this only happens because of * conditionals * TODO: don't audit here */ if (label == flabel) error = fn_for_each(label, profile, profile_path_perm(op, subj_cred, profile, &file->f_path, buffer, request, &cond, flags, &perms)); else error = fn_for_each_not_in_set(label, flabel, profile, profile_path_perm(op, subj_cred, profile, &file->f_path, buffer, request, &cond, flags, &perms)); } if (!error) update_file_ctx(file_ctx(file), label, request); aa_put_buffer(buffer); return error; } static int __file_sock_perm(const char *op, const struct cred *subj_cred, struct aa_label *label, struct aa_label *flabel, struct file *file, u32 request, u32 denied) { int error; /* revalidation due to label out of date. No revocation at this time */ if (!denied && aa_label_is_subset(flabel, label)) return 0; /* TODO: improve to skip profiles cached in flabel */ error = aa_sock_file_perm(subj_cred, label, op, request, file); if (denied) { /* TODO: improve to skip profiles checked above */ /* check every profile in file label to is cached */ last_error(error, aa_sock_file_perm(subj_cred, flabel, op, request, file)); } if (!error) update_file_ctx(file_ctx(file), label, request); return error; } /* for now separate fn to indicate semantics of the check */ static bool __file_is_delegated(struct aa_label *obj_label) { return unconfined(obj_label); } static bool __is_unix_file(struct file *file) { struct socket *sock = (struct socket *) file->private_data; lockdep_assert_in_rcu_read_lock(); if (!S_ISSOCK(file_inode(file)->i_mode)) return false; /* sock and sock->sk can be NULL for sockets being set up or torn down */ if (!sock || !sock->sk) return false; if (sock->sk->sk_family == PF_UNIX) return true; return false; } static bool __unix_needs_revalidation(struct file *file, struct aa_label *label, u32 request) { struct socket *sock = (struct socket *) file->private_data; AA_BUG(!__is_unix_file(file)); lockdep_assert_in_rcu_read_lock(); struct aa_sk_ctx *skctx = aa_sock(sock->sk); if (rcu_access_pointer(skctx->peer) != rcu_access_pointer(skctx->peer_lastupdate)) return true; return !__aa_subj_label_is_cached(rcu_dereference(skctx->label), label); } /** * aa_file_perm - do permission revalidation check & audit for @file * @op: operation being checked * @subj_cred: subject cred * @label: label being enforced (NOT NULL) * @file: file to revalidate access permissions on (NOT NULL) * @request: requested permissions * @in_atomic: whether allocations need to be done in atomic context * * Returns: %0 if access allowed else error */ int aa_file_perm(const char *op, const struct cred *subj_cred, struct aa_label *label, struct file *file, u32 request, bool in_atomic) { struct aa_file_ctx *fctx; struct aa_label *flabel; u32 denied; int error = 0; AA_BUG(!label); AA_BUG(!file); /* don't reaudit files closed during inheritance */ if (unlikely(file->f_path.dentry == aa_null.dentry)) return -EACCES; fctx = file_ctx(file); rcu_read_lock(); flabel = rcu_dereference(fctx->label); AA_BUG(!flabel); /* revalidate access, if task is unconfined, or the cached cred * doesn't match or if the request is for more 
permissions than * was granted. * * Note: the test for !unconfined(flabel) is to handle file * delegation from unconfined tasks */ denied = request & ~fctx->allow; if (unconfined(label) || __file_is_delegated(flabel) || (!denied && __is_unix_file(file) && !__unix_needs_revalidation(file, label, request)) || (!denied && __aa_subj_label_is_cached(label, flabel))) { rcu_read_unlock(); goto done; } /* slow path - revalidate access */ flabel = aa_get_newest_label(flabel); rcu_read_unlock(); if (path_mediated_fs(file->f_path.dentry)) error = __file_path_perm(op, subj_cred, label, flabel, file, request, denied, in_atomic); else if (S_ISSOCK(file_inode(file)->i_mode)) error = __file_sock_perm(op, subj_cred, label, flabel, file, request, denied); aa_put_label(flabel); done: return error; } static void revalidate_tty(const struct cred *subj_cred, struct aa_label *label) { struct tty_struct *tty; int drop_tty = 0; tty = get_current_tty(); if (!tty) return; spin_lock(&tty->files_lock); if (!list_empty(&tty->tty_files)) { struct tty_file_private *file_priv; struct file *file; /* TODO: Revalidate access to controlling tty. */ file_priv = list_first_entry(&tty->tty_files, struct tty_file_private, list); file = file_priv->file; if (aa_file_perm(OP_INHERIT, subj_cred, label, file, MAY_READ | MAY_WRITE, IN_ATOMIC)) drop_tty = 1; } spin_unlock(&tty->files_lock); tty_kref_put(tty); if (drop_tty) no_tty(); } struct cred_label { const struct cred *cred; struct aa_label *label; }; static int match_file(const void *p, struct file *file, unsigned int fd) { struct cred_label *cl = (struct cred_label *)p; if (aa_file_perm(OP_INHERIT, cl->cred, cl->label, file, aa_map_file_to_perms(file), IN_ATOMIC)) return fd + 1; return 0; } /* based on selinux's flush_unauthorized_files */ void aa_inherit_files(const struct cred *cred, struct files_struct *files) { struct aa_label *label = aa_get_newest_cred_label(cred); struct cred_label cl = { .cred = cred, .label = label, }; struct file *devnull = NULL; unsigned int n; revalidate_tty(cred, label); /* Revalidate access to inherited open files. */ n = iterate_fd(files, 0, match_file, &cl); if (!n) /* none found? */ goto out; devnull = dentry_open(&aa_null, O_RDWR, cred); if (IS_ERR(devnull)) devnull = NULL; /* replace all the matching ones with this */ do { replace_fd(n - 1, devnull, 0); } while ((n = iterate_fd(files, n, match_file, &cl)) != 0); if (devnull) fput(devnull); out: aa_put_label(label); } |
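
The cached-permission fast path in aa_file_perm() above comes down to one bitmask test: any requested permission bit not already present in the cached allow mask (denied = request & ~fctx->allow) forces the slow revalidation path. Below is a minimal, self-contained sketch of that test in plain userspace C; the EX_MAY_* values and function names are invented for illustration and are not the kernel's MAY_* definitions.

#include <stdio.h>
#include <stdint.h>

/* Illustrative permission bits only; not the kernel's MAY_* values. */
#define EX_MAY_READ  0x1u
#define EX_MAY_WRITE 0x2u
#define EX_MAY_EXEC  0x4u

/* Nonzero when the cached grant does not cover the request,
 * i.e. the caller would have to take the slow revalidation path. */
static uint32_t needs_revalidation(uint32_t request, uint32_t cached_allow)
{
	return request & ~cached_allow;
}

int main(void)
{
	uint32_t cached = EX_MAY_READ | EX_MAY_WRITE;

	/* Read+write is fully covered by the cache: fast path (prints 0). */
	printf("denied=0x%x\n",
	       (unsigned)needs_revalidation(EX_MAY_READ | EX_MAY_WRITE, cached));

	/* Exec was never granted: the denied bit forces revalidation. */
	printf("denied=0x%x\n",
	       (unsigned)needs_revalidation(EX_MAY_READ | EX_MAY_EXEC, cached));
	return 0;
}
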
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TTY_PORT_H #define _LINUX_TTY_PORT_H #include <linux/kfifo.h> #include <linux/kref.h> #include <linux/mutex.h> #include <linux/tty_buffer.h> #include <linux/wait.h> struct attribute_group; struct tty_driver; struct tty_port; struct tty_struct; /** * struct tty_port_operations -- operations on tty_port * @carrier_raised: return true if the carrier is raised on @port * @dtr_rts: raise the DTR line if @active is true, otherwise lower DTR * @shutdown: called when the last close completes or a hangup finishes IFF the * port was initialized. Do not use to free resources. Turn off the device * only. Called under the port mutex to serialize against @activate and * @shutdown. * @activate: called under the port mutex from tty_port_open(), serialized using * the port mutex. Supposed to turn on the device. * * FIXME: long term getting the tty argument *out* of this would be good * for consoles. * * @destruct: called on the final put of a port. Free resources, possibly incl. * the port itself. */ struct tty_port_operations { bool (*carrier_raised)(struct tty_port *port); void (*dtr_rts)(struct tty_port *port, bool active); void (*shutdown)(struct tty_port *port); int (*activate)(struct tty_port *port, struct tty_struct *tty); void (*destruct)(struct tty_port *port); }; struct tty_port_client_operations { size_t (*receive_buf)(struct tty_port *port, const u8 *cp, const u8 *fp, size_t count); void (*lookahead_buf)(struct tty_port *port, const u8 *cp, const u8 *fp, size_t count); void (*write_wakeup)(struct tty_port *port); }; extern const struct tty_port_client_operations tty_port_default_client_ops; /** * struct tty_port -- port level information * * @buf: buffer for this port, locked internally * @tty: back pointer to &struct tty_struct, valid only if the tty is open. Use * tty_port_tty_get() to obtain it (and tty_kref_put() to release). * @itty: internal back pointer to &struct tty_struct. Avoid this. It should be * eliminated in the long term. * @ops: tty port operations (like activate, shutdown), see &struct * tty_port_operations * @client_ops: tty port client operations (like receive_buf, write_wakeup). * By default, tty_port_default_client_ops is used. * @lock: lock protecting @tty * @blocked_open: # of procs waiting for open in tty_port_block_til_ready() * @count: usage count * @open_wait: open waiters queue (waiting e.g. 
for a carrier) * @delta_msr_wait: modem status change queue (waiting for MSR changes) * @flags: user TTY flags (%ASYNC_) * @iflags: internal flags (%TTY_PORT_) * @console: when set, the port is a console * @mutex: locking, for open, shutdown and other port operations * @buf_mutex: @xmit_buf alloc lock * @xmit_buf: optional xmit buffer used by some drivers * @xmit_fifo: optional xmit buffer used by some drivers * @close_delay: delay in jiffies to wait when closing the port * @closing_wait: delay in jiffies for output to be sent before closing * @drain_delay: set to zero if no pure time based drain is needed else set to * size of fifo * @kref: references counter. Reaching zero calls @ops->destruct() if non-%NULL * or frees the port otherwise. * @client_data: pointer to private data, for @client_ops * * Each device keeps its own port level information. &struct tty_port was * introduced as a common structure for such information. As every TTY device * shall have a backing tty_port structure, every driver can use these members. * * The tty port has a different lifetime to the tty so must be kept apart. * In addition be careful as tty -> port mappings are valid for the life * of the tty object but in many cases port -> tty mappings are valid only * until a hangup so don't use the wrong path. * * Tty port shall be initialized by tty_port_init() and shut down either by * tty_port_destroy() (refcounting not used), or tty_port_put() (refcounting). * * There is a lot of helpers around &struct tty_port too. To name the most * significant ones: tty_port_open(), tty_port_close() (or * tty_port_close_start() and tty_port_close_end() separately if need be), and * tty_port_hangup(). These call @ops->activate() and @ops->shutdown() as * needed. */ struct tty_port { struct tty_bufhead buf; struct tty_struct *tty; struct tty_struct *itty; const struct tty_port_operations *ops; const struct tty_port_client_operations *client_ops; spinlock_t lock; int blocked_open; int count; wait_queue_head_t open_wait; wait_queue_head_t delta_msr_wait; unsigned long flags; unsigned long iflags; unsigned char console:1; struct mutex mutex; struct mutex buf_mutex; u8 *xmit_buf; DECLARE_KFIFO_PTR(xmit_fifo, u8); unsigned int close_delay; unsigned int closing_wait; int drain_delay; struct kref kref; void *client_data; }; /* tty_port::iflags bits -- use atomic bit ops */ #define TTY_PORT_INITIALIZED 0 /* device is initialized */ #define TTY_PORT_SUSPENDED 1 /* device is suspended */ #define TTY_PORT_ACTIVE 2 /* device is open */ /* * uart drivers: use the uart_port::status field and the UPSTAT_* defines * for s/w-based flow control steering and carrier detection status */ #define TTY_PORT_CTS_FLOW 3 /* h/w flow control enabled */ #define TTY_PORT_CHECK_CD 4 /* carrier detect enabled */ #define TTY_PORT_KOPENED 5 /* device exclusively opened by kernel */ void tty_port_init(struct tty_port *port); void tty_port_link_wq(struct tty_port *port, struct workqueue_struct *flip_wq); void tty_port_link_device(struct tty_port *port, struct tty_driver *driver, unsigned index); struct device *tty_port_register_device(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device); struct device *tty_port_register_device_attr(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp); struct device *tty_port_register_device_attr_serdev(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *host, struct device 
*parent, void *drvdata, const struct attribute_group **attr_grp); void tty_port_unregister_device(struct tty_port *port, struct tty_driver *driver, unsigned index); int tty_port_alloc_xmit_buf(struct tty_port *port); void tty_port_free_xmit_buf(struct tty_port *port); void tty_port_destroy(struct tty_port *port); void tty_port_put(struct tty_port *port); static inline struct tty_port *tty_port_get(struct tty_port *port) { if (port && kref_get_unless_zero(&port->kref)) return port; return NULL; } /* * Never overwrite the workqueue set by tty_port_link_wq(). * No effect when %TTY_DRIVER_NO_WORKQUEUE is set, as driver->flip_wq is * %NULL. */ static inline void tty_port_link_driver_wq(struct tty_port *port, struct tty_driver *driver) { if (!port->buf.flip_wq) tty_port_link_wq(port, driver->flip_wq); } /* If the cts flow control is enabled, return true. */ static inline bool tty_port_cts_enabled(const struct tty_port *port) { return test_bit(TTY_PORT_CTS_FLOW, &port->iflags); } static inline void tty_port_set_cts_flow(struct tty_port *port, bool val) { assign_bit(TTY_PORT_CTS_FLOW, &port->iflags, val); } static inline bool tty_port_active(const struct tty_port *port) { return test_bit(TTY_PORT_ACTIVE, &port->iflags); } static inline void tty_port_set_active(struct tty_port *port, bool val) { assign_bit(TTY_PORT_ACTIVE, &port->iflags, val); } static inline bool tty_port_check_carrier(const struct tty_port *port) { return test_bit(TTY_PORT_CHECK_CD, &port->iflags); } static inline void tty_port_set_check_carrier(struct tty_port *port, bool val) { assign_bit(TTY_PORT_CHECK_CD, &port->iflags, val); } static inline bool tty_port_suspended(const struct tty_port *port) { return test_bit(TTY_PORT_SUSPENDED, &port->iflags); } static inline void tty_port_set_suspended(struct tty_port *port, bool val) { assign_bit(TTY_PORT_SUSPENDED, &port->iflags, val); } static inline bool tty_port_initialized(const struct tty_port *port) { return test_bit(TTY_PORT_INITIALIZED, &port->iflags); } static inline void tty_port_set_initialized(struct tty_port *port, bool val) { assign_bit(TTY_PORT_INITIALIZED, &port->iflags, val); } static inline bool tty_port_kopened(const struct tty_port *port) { return test_bit(TTY_PORT_KOPENED, &port->iflags); } static inline void tty_port_set_kopened(struct tty_port *port, bool val) { assign_bit(TTY_PORT_KOPENED, &port->iflags, val); } struct tty_struct *tty_port_tty_get(struct tty_port *port); void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty); bool tty_port_carrier_raised(struct tty_port *port); void tty_port_raise_dtr_rts(struct tty_port *port); void tty_port_lower_dtr_rts(struct tty_port *port); void tty_port_hangup(struct tty_port *port); void __tty_port_tty_hangup(struct tty_port *port, bool check_clocal, bool async); void tty_port_tty_wakeup(struct tty_port *port); int tty_port_block_til_ready(struct tty_port *port, struct tty_struct *tty, struct file *filp); int tty_port_close_start(struct tty_port *port, struct tty_struct *tty, struct file *filp); void tty_port_close_end(struct tty_port *port, struct tty_struct *tty); void tty_port_close(struct tty_port *port, struct tty_struct *tty, struct file *filp); int tty_port_install(struct tty_port *port, struct tty_driver *driver, struct tty_struct *tty); int tty_port_open(struct tty_port *port, struct tty_struct *tty, struct file *filp); static inline int tty_port_users(struct tty_port *port) { return port->count + port->blocked_open; } /** * tty_port_tty_hangup - helper to hang up a tty asynchronously * @port: tty 
port * @check_clocal: hang only ttys with %CLOCAL unset? */ static inline void tty_port_tty_hangup(struct tty_port *port, bool check_clocal) { __tty_port_tty_hangup(port, check_clocal, true); } /** * tty_port_tty_vhangup - helper to hang up a tty synchronously * @port: tty port */ static inline void tty_port_tty_vhangup(struct tty_port *port) { __tty_port_tty_hangup(port, false, false); } #ifdef CONFIG_TTY void tty_kref_put(struct tty_struct *tty); __DEFINE_CLASS_IS_CONDITIONAL(tty_port_tty, true); __DEFINE_UNLOCK_GUARD(tty_port_tty, struct tty_struct, tty_kref_put(_T->lock)); static inline class_tty_port_tty_t class_tty_port_tty_constructor(struct tty_port *tport) { class_tty_port_tty_t _t = { .lock = tty_port_tty_get(tport), }; return _t; } #define scoped_tty() ((struct tty_struct *)(__guard_ptr(tty_port_tty)(&scope))) #endif #endif |
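
The header above documents the tty_port lifecycle: initialize with tty_port_init(), wire up a struct tty_port_operations, and tear down via tty_port_put() so @destruct runs on the final reference. The following is a hypothetical, minimal sketch of that pattern, not taken from any real driver; the demo_* names and the always-true carrier callback are invented purely for illustration.

/* Hypothetical sketch of the documented tty_port lifecycle. */
#include <linux/tty.h>
#include <linux/tty_port.h>
#include <linux/slab.h>
#include <linux/container_of.h>

struct demo_port {
	struct tty_port port;
	/* driver-private state would live here */
};

static bool demo_carrier_raised(struct tty_port *port)
{
	return true;			/* pretend carrier is always present */
}

static void demo_destruct(struct tty_port *port)
{
	/* Final put: free the embedding structure, per @destruct's contract. */
	kfree(container_of(port, struct demo_port, port));
}

static const struct tty_port_operations demo_port_ops = {
	.carrier_raised	= demo_carrier_raised,
	.destruct	= demo_destruct,
};

static struct demo_port *demo_port_create(void)
{
	struct demo_port *dp = kzalloc(sizeof(*dp), GFP_KERNEL);

	if (!dp)
		return NULL;
	tty_port_init(&dp->port);	/* refcount starts at 1 */
	dp->port.ops = &demo_port_ops;
	return dp;
}

static void demo_port_destroy(struct demo_port *dp)
{
	tty_port_put(&dp->port);	/* drops the last ref; @destruct frees */
}
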
//
SPDX-License-Identifier: GPL-2.0-only /* * Copyright 1993 by Theodore Ts'o. */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/sched.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/file.h> #include <linux/stat.h> #include <linux/errno.h> #include <linux/major.h> #include <linux/wait.h> #include <linux/blkpg.h> #include <linux/init.h> #include <linux/swap.h> #include <linux/slab.h> #include <linux/compat.h> #include <linux/suspend.h> #include <linux/freezer.h> #include <linux/mutex.h> #include <linux/writeback.h> #include <linux/completion.h> #include <linux/highmem.h> #include <linux/splice.h> #include <linux/sysfs.h> #include <linux/miscdevice.h> #include <linux/falloc.h> #include <linux/uio.h> #include <linux/ioprio.h> #include <linux/blk-cgroup.h> #include <linux/sched/mm.h> #include <linux/statfs.h> #include <linux/uaccess.h> #include <linux/blk-mq.h> #include <linux/spinlock.h> #include <uapi/linux/loop.h> /* Possible states of device */ enum { Lo_unbound, Lo_bound, Lo_rundown, Lo_deleting, }; struct loop_device { int lo_number; loff_t lo_offset; loff_t lo_sizelimit; int lo_flags; char lo_file_name[LO_NAME_SIZE]; struct file *lo_backing_file; unsigned int lo_min_dio_size; struct block_device *lo_device; gfp_t old_gfp_mask; spinlock_t lo_lock; int lo_state; spinlock_t lo_work_lock; struct workqueue_struct *workqueue; struct work_struct rootcg_work; struct list_head rootcg_cmd_list; struct list_head idle_worker_list; struct rb_root worker_tree; struct timer_list timer; bool sysfs_inited; struct request_queue *lo_queue; struct blk_mq_tag_set tag_set; struct gendisk *lo_disk; struct mutex lo_mutex; bool idr_visible; }; struct loop_cmd { struct list_head list_entry; bool use_aio; /* use AIO interface to handle I/O */ atomic_t ref; /* only for aio */ long ret; struct kiocb iocb; struct bio_vec *bvec; struct cgroup_subsys_state *blkcg_css; struct cgroup_subsys_state *memcg_css; }; #define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ) #define LOOP_DEFAULT_HW_Q_DEPTH 128 static DEFINE_IDR(loop_index_idr); static DEFINE_MUTEX(loop_ctl_mutex); static DEFINE_MUTEX(loop_validate_mutex); /** * loop_global_lock_killable() - take locks for safe loop_validate_file() test * * @lo: struct loop_device * @global: true if @lo is about to bind another "struct loop_device", false otherwise * * Returns 0 on success, -EINTR otherwise. * * Since loop_validate_file() traverses on other "struct loop_device" if * is_loop_device() is true, we need a global lock for serializing concurrent * loop_configure()/loop_change_fd()/__loop_clr_fd() calls. */ static int loop_global_lock_killable(struct loop_device *lo, bool global) { int err; if (global) { err = mutex_lock_killable(&loop_validate_mutex); if (err) return err; } err = mutex_lock_killable(&lo->lo_mutex); if (err && global) mutex_unlock(&loop_validate_mutex); return err; } /** * loop_global_unlock() - release locks taken by loop_global_lock_killable() * * @lo: struct loop_device * @global: true if @lo was about to bind another "struct loop_device", false otherwise */ static void loop_global_unlock(struct loop_device *lo, bool global) { mutex_unlock(&lo->lo_mutex); if (global) mutex_unlock(&loop_validate_mutex); } static int max_part; static int part_shift; static loff_t lo_calculate_size(struct loop_device *lo, struct file *file) { loff_t loopsize; int ret; if (S_ISBLK(file_inode(file)->i_mode)) { loopsize = i_size_read(file->f_mapping->host); } else { struct kstat stat; /* * Get the accurate file size. 
This provides better results than * cached inode data, particularly for network filesystems where * metadata may be stale. */ ret = vfs_getattr_nosec(&file->f_path, &stat, STATX_SIZE, 0); if (ret) return 0; loopsize = stat.size; } if (lo->lo_offset > 0) loopsize -= lo->lo_offset; /* offset is beyond i_size, weird but possible */ if (loopsize < 0) return 0; if (lo->lo_sizelimit > 0 && lo->lo_sizelimit < loopsize) loopsize = lo->lo_sizelimit; /* * Unfortunately, if we want to do I/O on the device, * the number of 512-byte sectors has to fit into a sector_t. */ return loopsize >> 9; } /* * We support direct I/O only if lo_offset is aligned with the logical I/O size * of backing device, and the logical block size of loop is bigger than that of * the backing device. */ static bool lo_can_use_dio(struct loop_device *lo) { if (!(lo->lo_backing_file->f_mode & FMODE_CAN_ODIRECT)) return false; if (queue_logical_block_size(lo->lo_queue) < lo->lo_min_dio_size) return false; if (lo->lo_offset & (lo->lo_min_dio_size - 1)) return false; return true; } /* * Direct I/O can be enabled either by using an O_DIRECT file descriptor, or by * passing in the LO_FLAGS_DIRECT_IO flag from userspace. It will be silently * disabled when the device block size is too small or the offset is unaligned. * * loop_get_status will always report the effective LO_FLAGS_DIRECT_IO flag and * not the originally passed in one. */ static inline void loop_update_dio(struct loop_device *lo) { lockdep_assert_held(&lo->lo_mutex); WARN_ON_ONCE(lo->lo_state == Lo_bound && lo->lo_queue->mq_freeze_depth == 0); if ((lo->lo_flags & LO_FLAGS_DIRECT_IO) && !lo_can_use_dio(lo)) lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; } /** * loop_set_size() - sets device size and notifies userspace * @lo: struct loop_device to set the size for * @size: new size of the loop device * * Callers must validate that the size passed into this function fits into * a sector_t, eg using loop_validate_size() */ static void loop_set_size(struct loop_device *lo, loff_t size) { if (!set_capacity_and_notify(lo->lo_disk, size)) kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); } static void loop_clear_limits(struct loop_device *lo, int mode) { struct queue_limits lim = queue_limits_start_update(lo->lo_queue); if (mode & FALLOC_FL_ZERO_RANGE) lim.max_write_zeroes_sectors = 0; if (mode & FALLOC_FL_PUNCH_HOLE) { lim.max_hw_discard_sectors = 0; lim.discard_granularity = 0; } /* * XXX: this updates the queue limits without freezing the queue, which * is against the locking protocol and dangerous. But we can't just * freeze the queue as we're inside the ->queue_rq method here. So this * should move out into a workqueue unless we get the file operations to * advertise if they support specific fallocate operations. */ queue_limits_commit_update(lo->lo_queue, &lim); } static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos, int mode) { /* * We use fallocate to manipulate the space mappings used by the image * a.k.a. discard/zerorange. */ struct file *file = lo->lo_backing_file; int ret; mode |= FALLOC_FL_KEEP_SIZE; if (!bdev_max_discard_sectors(lo->lo_device)) return -EOPNOTSUPP; ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq)); if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP)) return -EIO; /* * We initially configure the limits in a hope that fallocate is * supported and clear them here if that turns out not to be true. 
*/ if (unlikely(ret == -EOPNOTSUPP)) loop_clear_limits(lo, mode); return ret; } static int lo_req_flush(struct loop_device *lo, struct request *rq) { int ret = vfs_fsync(lo->lo_backing_file, 0); if (unlikely(ret && ret != -EINVAL)) ret = -EIO; return ret; } static void lo_complete_rq(struct request *rq) { struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); blk_status_t ret = BLK_STS_OK; if (cmd->ret < 0 || cmd->ret == blk_rq_bytes(rq) || req_op(rq) != REQ_OP_READ) { if (cmd->ret < 0) ret = errno_to_blk_status(cmd->ret); goto end_io; } /* * Short READ - if we got some data, advance our request and * retry it. If we got no data, end the rest with EIO. */ if (cmd->ret) { blk_update_request(rq, BLK_STS_OK, cmd->ret); cmd->ret = 0; blk_mq_requeue_request(rq, true); } else { struct bio *bio = rq->bio; while (bio) { zero_fill_bio(bio); bio = bio->bi_next; } ret = BLK_STS_IOERR; end_io: blk_mq_end_request(rq, ret); } } static void lo_rw_aio_do_completion(struct loop_cmd *cmd) { struct request *rq = blk_mq_rq_from_pdu(cmd); if (!atomic_dec_and_test(&cmd->ref)) return; kfree(cmd->bvec); cmd->bvec = NULL; if (req_op(rq) == REQ_OP_WRITE) kiocb_end_write(&cmd->iocb); if (likely(!blk_should_fake_timeout(rq->q))) blk_mq_complete_request(rq); } static void lo_rw_aio_complete(struct kiocb *iocb, long ret) { struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb); cmd->ret = ret; lo_rw_aio_do_completion(cmd); } static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos, int rw) { struct iov_iter iter; struct req_iterator rq_iter; struct bio_vec *bvec; struct request *rq = blk_mq_rq_from_pdu(cmd); struct bio *bio = rq->bio; struct file *file = lo->lo_backing_file; struct bio_vec tmp; unsigned int offset; unsigned int nr_bvec; int ret; nr_bvec = blk_rq_nr_bvec(rq); if (rq->bio != rq->biotail) { bvec = kmalloc_objs(struct bio_vec, nr_bvec, GFP_NOIO); if (!bvec) return -EIO; cmd->bvec = bvec; /* * The bios of the request may be started from the middle of * the 'bvec' because of bio splitting, so we can't directly * copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec * API will take care of all details for us. */ rq_for_each_bvec(tmp, rq, rq_iter) { *bvec = tmp; bvec++; } bvec = cmd->bvec; offset = 0; } else { /* * Same here, this bio may be started from the middle of the * 'bvec' because of bio splitting, so offset from the bvec * must be passed to iov iterator */ offset = bio->bi_iter.bi_bvec_done; bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); } atomic_set(&cmd->ref, 2); iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq)); iter.iov_offset = offset; cmd->iocb.ki_pos = pos; cmd->iocb.ki_filp = file; cmd->iocb.ki_ioprio = req_get_ioprio(rq); if (cmd->use_aio) { cmd->iocb.ki_complete = lo_rw_aio_complete; cmd->iocb.ki_flags = IOCB_DIRECT; } else { cmd->iocb.ki_complete = NULL; cmd->iocb.ki_flags = 0; } if (rw == ITER_SOURCE) { kiocb_start_write(&cmd->iocb); ret = file->f_op->write_iter(&cmd->iocb, &iter); } else ret = file->f_op->read_iter(&cmd->iocb, &iter); lo_rw_aio_do_completion(cmd); if (ret != -EIOCBQUEUED) lo_rw_aio_complete(&cmd->iocb, ret); return -EIOCBQUEUED; } static int do_req_filebacked(struct loop_device *lo, struct request *rq) { struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; switch (req_op(rq)) { case REQ_OP_FLUSH: return lo_req_flush(lo, rq); case REQ_OP_WRITE_ZEROES: /* * If the caller doesn't want deallocation, call zeroout to * write zeroes the range. Otherwise, punch them out. 
*/ return lo_fallocate(lo, rq, pos, (rq->cmd_flags & REQ_NOUNMAP) ? FALLOC_FL_ZERO_RANGE : FALLOC_FL_PUNCH_HOLE); case REQ_OP_DISCARD: return lo_fallocate(lo, rq, pos, FALLOC_FL_PUNCH_HOLE); case REQ_OP_WRITE: return lo_rw_aio(lo, cmd, pos, ITER_SOURCE); case REQ_OP_READ: return lo_rw_aio(lo, cmd, pos, ITER_DEST); default: WARN_ON_ONCE(1); return -EIO; } } static void loop_reread_partitions(struct loop_device *lo) { int rc; mutex_lock(&lo->lo_disk->open_mutex); rc = bdev_disk_changed(lo->lo_disk, false); mutex_unlock(&lo->lo_disk->open_mutex); if (rc) pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n", __func__, lo->lo_number, lo->lo_file_name, rc); } static unsigned int loop_query_min_dio_size(struct loop_device *lo) { struct file *file = lo->lo_backing_file; struct block_device *sb_bdev = file->f_mapping->host->i_sb->s_bdev; struct kstat st; /* * Use the minimal dio alignment of the file system if provided. */ if (!vfs_getattr(&file->f_path, &st, STATX_DIOALIGN, 0) && (st.result_mask & STATX_DIOALIGN)) return st.dio_offset_align; /* * In a perfect world this wouldn't be needed, but as of Linux 6.13 only * a handful of file systems support the STATX_DIOALIGN flag. */ if (sb_bdev) return bdev_logical_block_size(sb_bdev); return SECTOR_SIZE; } static inline int is_loop_device(struct file *file) { struct inode *i = file->f_mapping->host; return i && S_ISBLK(i->i_mode) && imajor(i) == LOOP_MAJOR; } static int loop_validate_file(struct file *file, struct block_device *bdev) { struct inode *inode = file->f_mapping->host; struct file *f = file; /* Avoid recursion */ while (is_loop_device(f)) { struct loop_device *l; lockdep_assert_held(&loop_validate_mutex); if (f->f_mapping->host->i_rdev == bdev->bd_dev) return -EBADF; l = I_BDEV(f->f_mapping->host)->bd_disk->private_data; if (l->lo_state != Lo_bound) return -EINVAL; /* Order wrt setting lo->lo_backing_file in loop_configure(). */ rmb(); f = l->lo_backing_file; } if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) return -EINVAL; return 0; } static void loop_assign_backing_file(struct loop_device *lo, struct file *file) { lo->lo_backing_file = file; lo->old_gfp_mask = mapping_gfp_mask(file->f_mapping); mapping_set_gfp_mask(file->f_mapping, lo->old_gfp_mask & ~(__GFP_IO | __GFP_FS)); if (lo->lo_backing_file->f_flags & O_DIRECT) lo->lo_flags |= LO_FLAGS_DIRECT_IO; lo->lo_min_dio_size = loop_query_min_dio_size(lo); } static int loop_check_backing_file(struct file *file) { if (!file->f_op->read_iter) return -EINVAL; if ((file->f_mode & FMODE_WRITE) && !file->f_op->write_iter) return -EINVAL; return 0; } /* * loop_change_fd switched the backing store of a loopback device to * a new file. This is useful for operating system installers to free up * the original file and in High Availability environments to switch to * an alternative location for the content in case of server meltdown. * This can only work if the loop device is used read-only, and if the * new backing store is the same size and type as the old backing store. 
*/ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, unsigned int arg) { struct file *file = fget(arg); struct file *old_file; unsigned int memflags; int error; bool partscan; bool is_loop; if (!file) return -EBADF; error = loop_check_backing_file(file); if (error) { fput(file); return error; } /* suppress uevents while reconfiguring the device */ dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1); is_loop = is_loop_device(file); error = loop_global_lock_killable(lo, is_loop); if (error) goto out_putf; error = -ENXIO; if (lo->lo_state != Lo_bound) goto out_err; /* the loop device has to be read-only */ error = -EINVAL; if (!(lo->lo_flags & LO_FLAGS_READ_ONLY)) goto out_err; error = loop_validate_file(file, bdev); if (error) goto out_err; old_file = lo->lo_backing_file; error = -EINVAL; /* size of the new backing store needs to be the same */ if (lo_calculate_size(lo, file) != lo_calculate_size(lo, old_file)) goto out_err; /* * We might switch to direct I/O mode for the loop device, write back * all dirty data the page cache now that so that the individual I/O * operations don't have to do that. */ vfs_fsync(file, 0); /* and ... switch */ disk_force_media_change(lo->lo_disk); memflags = blk_mq_freeze_queue(lo->lo_queue); mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); loop_assign_backing_file(lo, file); loop_update_dio(lo); blk_mq_unfreeze_queue(lo->lo_queue, memflags); partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; loop_global_unlock(lo, is_loop); /* * Flush loop_validate_file() before fput(), for l->lo_backing_file * might be pointing at old_file which might be the last reference. */ if (!is_loop) { mutex_lock(&loop_validate_mutex); mutex_unlock(&loop_validate_mutex); } /* * We must drop file reference outside of lo_mutex as dropping * the file ref can take open_mutex which creates circular locking * dependency. 
*/ fput(old_file); dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); if (partscan) loop_reread_partitions(lo); error = 0; done: kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); return error; out_err: loop_global_unlock(lo, is_loop); out_putf: fput(file); dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); goto done; } /* loop sysfs attributes */ static ssize_t loop_attr_show(struct device *dev, char *page, ssize_t (*callback)(struct loop_device *, char *)) { struct gendisk *disk = dev_to_disk(dev); struct loop_device *lo = disk->private_data; return callback(lo, page); } #define LOOP_ATTR_RO(_name) \ static ssize_t loop_attr_##_name##_show(struct loop_device *, char *); \ static ssize_t loop_attr_do_show_##_name(struct device *d, \ struct device_attribute *attr, char *b) \ { \ return loop_attr_show(d, b, loop_attr_##_name##_show); \ } \ static struct device_attribute loop_attr_##_name = \ __ATTR(_name, 0444, loop_attr_do_show_##_name, NULL); static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf) { ssize_t ret; char *p = NULL; spin_lock_irq(&lo->lo_lock); if (lo->lo_backing_file) p = file_path(lo->lo_backing_file, buf, PAGE_SIZE - 1); spin_unlock_irq(&lo->lo_lock); if (IS_ERR_OR_NULL(p)) ret = PTR_ERR(p); else { ret = strlen(p); memmove(buf, p, ret); buf[ret++] = '\n'; buf[ret] = 0; } return ret; } static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf) { return sysfs_emit(buf, "%llu\n", (unsigned long long)lo->lo_offset); } static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf) { return sysfs_emit(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit); } static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf) { int autoclear = (lo->lo_flags & LO_FLAGS_AUTOCLEAR); return sysfs_emit(buf, "%s\n", autoclear ? "1" : "0"); } static ssize_t loop_attr_partscan_show(struct loop_device *lo, char *buf) { int partscan = (lo->lo_flags & LO_FLAGS_PARTSCAN); return sysfs_emit(buf, "%s\n", partscan ? "1" : "0"); } static ssize_t loop_attr_dio_show(struct loop_device *lo, char *buf) { int dio = (lo->lo_flags & LO_FLAGS_DIRECT_IO); return sysfs_emit(buf, "%s\n", dio ? "1" : "0"); } LOOP_ATTR_RO(backing_file); LOOP_ATTR_RO(offset); LOOP_ATTR_RO(sizelimit); LOOP_ATTR_RO(autoclear); LOOP_ATTR_RO(partscan); LOOP_ATTR_RO(dio); static struct attribute *loop_attrs[] = { &loop_attr_backing_file.attr, &loop_attr_offset.attr, &loop_attr_sizelimit.attr, &loop_attr_autoclear.attr, &loop_attr_partscan.attr, &loop_attr_dio.attr, NULL, }; static struct attribute_group loop_attribute_group = { .name = "loop", .attrs= loop_attrs, }; static void loop_sysfs_init(struct loop_device *lo) { lo->sysfs_inited = !sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj, &loop_attribute_group); } static void loop_sysfs_exit(struct loop_device *lo) { if (lo->sysfs_inited) sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj, &loop_attribute_group); } static void loop_get_discard_config(struct loop_device *lo, u32 *granularity, u32 *max_discard_sectors) { struct file *file = lo->lo_backing_file; struct inode *inode = file->f_mapping->host; struct kstatfs sbuf; /* * If the backing device is a block device, mirror its zeroing * capability. Set the discard sectors to the block device's zeroing * capabilities because loop discards result in blkdev_issue_zeroout(), * not blkdev_issue_discard(). This maintains consistent behavior with * file-backed loop devices: discarded regions read back as zero. 
*/ if (S_ISBLK(inode->i_mode)) { struct block_device *bdev = I_BDEV(inode); *max_discard_sectors = bdev_write_zeroes_sectors(bdev); *granularity = bdev_discard_granularity(bdev); /* * We use punch hole to reclaim the free space used by the * image a.k.a. discard. */ } else if (file->f_op->fallocate && !vfs_statfs(&file->f_path, &sbuf)) { *max_discard_sectors = UINT_MAX >> 9; *granularity = sbuf.f_bsize; } } struct loop_worker { struct rb_node rb_node; struct work_struct work; struct list_head cmd_list; struct list_head idle_list; struct loop_device *lo; struct cgroup_subsys_state *blkcg_css; unsigned long last_ran_at; }; static void loop_workfn(struct work_struct *work); #ifdef CONFIG_BLK_CGROUP static inline int queue_on_root_worker(struct cgroup_subsys_state *css) { return !css || css == blkcg_root_css; } #else static inline int queue_on_root_worker(struct cgroup_subsys_state *css) { return !css; } #endif static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd) { struct rb_node **node, *parent = NULL; struct loop_worker *cur_worker, *worker = NULL; struct work_struct *work; struct list_head *cmd_list; spin_lock_irq(&lo->lo_work_lock); if (queue_on_root_worker(cmd->blkcg_css)) goto queue_work; node = &lo->worker_tree.rb_node; while (*node) { parent = *node; cur_worker = container_of(*node, struct loop_worker, rb_node); if (cur_worker->blkcg_css == cmd->blkcg_css) { worker = cur_worker; break; } else if ((long)cur_worker->blkcg_css < (long)cmd->blkcg_css) { node = &(*node)->rb_left; } else { node = &(*node)->rb_right; } } if (worker) goto queue_work; worker = kzalloc_obj(struct loop_worker, GFP_NOWAIT); /* * In the event we cannot allocate a worker, just queue on the * rootcg worker and issue the I/O as the rootcg */ if (!worker) { cmd->blkcg_css = NULL; if (cmd->memcg_css) css_put(cmd->memcg_css); cmd->memcg_css = NULL; goto queue_work; } worker->blkcg_css = cmd->blkcg_css; css_get(worker->blkcg_css); INIT_WORK(&worker->work, loop_workfn); INIT_LIST_HEAD(&worker->cmd_list); INIT_LIST_HEAD(&worker->idle_list); worker->lo = lo; rb_link_node(&worker->rb_node, parent, node); rb_insert_color(&worker->rb_node, &lo->worker_tree); queue_work: if (worker) { /* * We need to remove from the idle list here while * holding the lock so that the idle timer doesn't * free the worker */ if (!list_empty(&worker->idle_list)) list_del_init(&worker->idle_list); work = &worker->work; cmd_list = &worker->cmd_list; } else { work = &lo->rootcg_work; cmd_list = &lo->rootcg_cmd_list; } list_add_tail(&cmd->list_entry, cmd_list); queue_work(lo->workqueue, work); spin_unlock_irq(&lo->lo_work_lock); } static void loop_set_timer(struct loop_device *lo) { timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT); } static void loop_free_idle_workers(struct loop_device *lo, bool delete_all) { struct loop_worker *pos, *worker; spin_lock_irq(&lo->lo_work_lock); list_for_each_entry_safe(worker, pos, &lo->idle_worker_list, idle_list) { if (!delete_all && time_is_after_jiffies(worker->last_ran_at + LOOP_IDLE_WORKER_TIMEOUT)) break; list_del(&worker->idle_list); rb_erase(&worker->rb_node, &lo->worker_tree); css_put(worker->blkcg_css); kfree(worker); } if (!list_empty(&lo->idle_worker_list)) loop_set_timer(lo); spin_unlock_irq(&lo->lo_work_lock); } static void loop_free_idle_workers_timer(struct timer_list *timer) { struct loop_device *lo = container_of(timer, struct loop_device, timer); return loop_free_idle_workers(lo, false); } /** * loop_set_status_from_info - configure device from loop_info * @lo: struct 
loop_device to configure * @info: struct loop_info64 to configure the device with * * Configures the loop device parameters according to the passed * in loop_info64 configuration. */ static int loop_set_status_from_info(struct loop_device *lo, const struct loop_info64 *info) { if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE) return -EINVAL; switch (info->lo_encrypt_type) { case LO_CRYPT_NONE: break; case LO_CRYPT_XOR: pr_warn("support for the xor transformation has been removed.\n"); return -EINVAL; case LO_CRYPT_CRYPTOAPI: pr_warn("support for cryptoloop has been removed. Use dm-crypt instead.\n"); return -EINVAL; default: return -EINVAL; } /* Avoid assigning overflow values */ if (info->lo_offset > LLONG_MAX || info->lo_sizelimit > LLONG_MAX) return -EOVERFLOW; lo->lo_offset = info->lo_offset; lo->lo_sizelimit = info->lo_sizelimit; memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE); lo->lo_file_name[LO_NAME_SIZE-1] = 0; return 0; } static unsigned int loop_default_blocksize(struct loop_device *lo) { /* In case of direct I/O, match underlying minimum I/O size */ if (lo->lo_flags & LO_FLAGS_DIRECT_IO) return lo->lo_min_dio_size; return SECTOR_SIZE; } static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim, unsigned int bsize) { struct file *file = lo->lo_backing_file; struct inode *inode = file->f_mapping->host; struct block_device *backing_bdev = NULL; u32 granularity = 0, max_discard_sectors = 0; if (S_ISBLK(inode->i_mode)) backing_bdev = I_BDEV(inode); else if (inode->i_sb->s_bdev) backing_bdev = inode->i_sb->s_bdev; if (!bsize) bsize = loop_default_blocksize(lo); loop_get_discard_config(lo, &granularity, &max_discard_sectors); lim->logical_block_size = bsize; lim->physical_block_size = bsize; lim->io_min = bsize; lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL); if (file->f_op->fsync && !(lo->lo_flags & LO_FLAGS_READ_ONLY)) lim->features |= BLK_FEAT_WRITE_CACHE; if (backing_bdev && bdev_rot(backing_bdev)) lim->features |= BLK_FEAT_ROTATIONAL; lim->max_hw_discard_sectors = max_discard_sectors; lim->max_write_zeroes_sectors = max_discard_sectors; if (max_discard_sectors) lim->discard_granularity = granularity; else lim->discard_granularity = 0; } static int loop_configure(struct loop_device *lo, blk_mode_t mode, struct block_device *bdev, const struct loop_config *config) { struct file *file = fget(config->fd); struct queue_limits lim; int error; loff_t size; bool partscan; bool is_loop; if (!file) return -EBADF; error = loop_check_backing_file(file); if (error) { fput(file); return error; } is_loop = is_loop_device(file); /* This is safe, since we have a reference from open(). */ __module_get(THIS_MODULE); /* * If we don't hold exclusive handle for the device, upgrade to it * here to avoid changing device under exclusive owner. 
*/ if (!(mode & BLK_OPEN_EXCL)) { error = bd_prepare_to_claim(bdev, loop_configure, NULL); if (error) goto out_putf; } error = loop_global_lock_killable(lo, is_loop); if (error) goto out_bdev; error = -EBUSY; if (lo->lo_state != Lo_unbound) goto out_unlock; error = loop_validate_file(file, bdev); if (error) goto out_unlock; if ((config->info.lo_flags & ~LOOP_CONFIGURE_SETTABLE_FLAGS) != 0) { error = -EINVAL; goto out_unlock; } error = loop_set_status_from_info(lo, &config->info); if (error) goto out_unlock; lo->lo_flags = config->info.lo_flags; if (!(file->f_mode & FMODE_WRITE) || !(mode & BLK_OPEN_WRITE) || !file->f_op->write_iter) lo->lo_flags |= LO_FLAGS_READ_ONLY; if (!lo->workqueue) { lo->workqueue = alloc_workqueue("loop%d", WQ_UNBOUND | WQ_FREEZABLE, 0, lo->lo_number); if (!lo->workqueue) { error = -ENOMEM; goto out_unlock; } } /* suppress uevents while reconfiguring the device */ dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1); disk_force_media_change(lo->lo_disk); set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0); lo->lo_device = bdev; loop_assign_backing_file(lo, file); lim = queue_limits_start_update(lo->lo_queue); loop_update_limits(lo, &lim, config->block_size); /* No need to freeze the queue as the device isn't bound yet. */ error = queue_limits_commit_update(lo->lo_queue, &lim); if (error) goto out_unlock; /* * We might switch to direct I/O mode for the loop device, write back * all dirty data the page cache now that so that the individual I/O * operations don't have to do that. */ vfs_fsync(file, 0); loop_update_dio(lo); loop_sysfs_init(lo); size = lo_calculate_size(lo, file); loop_set_size(lo, size); /* Order wrt reading lo_state in loop_validate_file(). */ wmb(); WRITE_ONCE(lo->lo_state, Lo_bound); if (part_shift) lo->lo_flags |= LO_FLAGS_PARTSCAN; partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; if (partscan) clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); loop_global_unlock(lo, is_loop); if (partscan) loop_reread_partitions(lo); if (!(mode & BLK_OPEN_EXCL)) bd_abort_claiming(bdev, loop_configure); return 0; out_unlock: loop_global_unlock(lo, is_loop); out_bdev: if (!(mode & BLK_OPEN_EXCL)) bd_abort_claiming(bdev, loop_configure); out_putf: fput(file); /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); return error; } static void __loop_clr_fd(struct loop_device *lo) { struct queue_limits lim; struct file *filp; gfp_t gfp = lo->old_gfp_mask; spin_lock_irq(&lo->lo_lock); filp = lo->lo_backing_file; lo->lo_backing_file = NULL; spin_unlock_irq(&lo->lo_lock); lo->lo_device = NULL; lo->lo_offset = 0; lo->lo_sizelimit = 0; memset(lo->lo_file_name, 0, LO_NAME_SIZE); /* * Reset the block size to the default. * * No queue freezing needed because this is called from the final * ->release call only, so there can't be any outstanding I/O. */ lim = queue_limits_start_update(lo->lo_queue); lim.logical_block_size = SECTOR_SIZE; lim.physical_block_size = SECTOR_SIZE; lim.io_min = SECTOR_SIZE; queue_limits_commit_update(lo->lo_queue, &lim); invalidate_disk(lo->lo_disk); loop_sysfs_exit(lo); /* let user-space know about this change */ kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); mapping_set_gfp_mask(filp->f_mapping, gfp); /* This is safe: open() is still holding a reference. 
*/ module_put(THIS_MODULE); disk_force_media_change(lo->lo_disk); if (lo->lo_flags & LO_FLAGS_PARTSCAN) { int err; /* * open_mutex has been held already in release path, so don't * acquire it if this function is called in such case. * * If the reread partition isn't from release path, lo_refcnt * must be at least one and it can only become zero when the * current holder is released. */ err = bdev_disk_changed(lo->lo_disk, false); if (err) pr_warn("%s: partition scan of loop%d failed (rc=%d)\n", __func__, lo->lo_number, err); /* Device is gone, no point in returning error */ } /* * lo->lo_state is set to Lo_unbound here after above partscan has * finished. There cannot be anybody else entering __loop_clr_fd() as * Lo_rundown state protects us from all the other places trying to * change the 'lo' device. */ lo->lo_flags = 0; if (!part_shift) set_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); mutex_lock(&lo->lo_mutex); WRITE_ONCE(lo->lo_state, Lo_unbound); mutex_unlock(&lo->lo_mutex); /* * Need not hold lo_mutex to fput backing file. Calling fput holding * lo_mutex triggers a circular lock dependency possibility warning as * fput can take open_mutex which is usually taken before lo_mutex. */ fput(filp); } static int loop_clr_fd(struct loop_device *lo) { int err; /* * Since lo_ioctl() is called without locks held, it is possible that * loop_configure()/loop_change_fd() and loop_clr_fd() run in parallel. * * Therefore, use global lock when setting Lo_rundown state in order to * make sure that loop_validate_file() will fail if the "struct file" * which loop_configure()/loop_change_fd() found via fget() was this * loop device. */ err = loop_global_lock_killable(lo, true); if (err) return err; if (lo->lo_state != Lo_bound) { loop_global_unlock(lo, true); return -ENXIO; } /* * Mark the device for removing the backing device on last close. * If we are the only opener, also switch the state to roundown here to * prevent new openers from coming in. 
*/ lo->lo_flags |= LO_FLAGS_AUTOCLEAR; if (disk_openers(lo->lo_disk) == 1) WRITE_ONCE(lo->lo_state, Lo_rundown); loop_global_unlock(lo, true); return 0; } static int loop_set_status(struct loop_device *lo, const struct loop_info64 *info) { int err; bool partscan = false; bool size_changed = false; unsigned int memflags; err = mutex_lock_killable(&lo->lo_mutex); if (err) return err; if (lo->lo_state != Lo_bound) { err = -ENXIO; goto out_unlock; } if (lo->lo_offset != info->lo_offset || lo->lo_sizelimit != info->lo_sizelimit) { size_changed = true; sync_blockdev(lo->lo_device); invalidate_bdev(lo->lo_device); } /* I/O needs to be drained before changing lo_offset or lo_sizelimit */ memflags = blk_mq_freeze_queue(lo->lo_queue); err = loop_set_status_from_info(lo, info); if (err) goto out_unfreeze; partscan = !(lo->lo_flags & LO_FLAGS_PARTSCAN) && (info->lo_flags & LO_FLAGS_PARTSCAN); lo->lo_flags &= ~LOOP_SET_STATUS_CLEARABLE_FLAGS; lo->lo_flags |= (info->lo_flags & LOOP_SET_STATUS_SETTABLE_FLAGS); /* update the direct I/O flag if lo_offset changed */ loop_update_dio(lo); out_unfreeze: blk_mq_unfreeze_queue(lo->lo_queue, memflags); if (partscan) clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); if (!err && size_changed) { loff_t new_size = lo_calculate_size(lo, lo->lo_backing_file); loop_set_size(lo, new_size); } out_unlock: mutex_unlock(&lo->lo_mutex); if (partscan) loop_reread_partitions(lo); return err; } static int loop_get_status(struct loop_device *lo, struct loop_info64 *info) { struct path path; struct kstat stat; int ret; ret = mutex_lock_killable(&lo->lo_mutex); if (ret) return ret; if (lo->lo_state != Lo_bound) { mutex_unlock(&lo->lo_mutex); return -ENXIO; } memset(info, 0, sizeof(*info)); info->lo_number = lo->lo_number; info->lo_offset = lo->lo_offset; info->lo_sizelimit = lo->lo_sizelimit; info->lo_flags = lo->lo_flags; memcpy(info->lo_file_name, lo->lo_file_name, LO_NAME_SIZE); /* Drop lo_mutex while we call into the filesystem. 
*/ path = lo->lo_backing_file->f_path; path_get(&path); mutex_unlock(&lo->lo_mutex); ret = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT); if (!ret) { info->lo_device = huge_encode_dev(stat.dev); info->lo_inode = stat.ino; info->lo_rdevice = huge_encode_dev(stat.rdev); } path_put(&path); return ret; } static void loop_info64_from_old(const struct loop_info *info, struct loop_info64 *info64) { memset(info64, 0, sizeof(*info64)); info64->lo_number = info->lo_number; info64->lo_device = info->lo_device; info64->lo_inode = info->lo_inode; info64->lo_rdevice = info->lo_rdevice; info64->lo_offset = info->lo_offset; info64->lo_sizelimit = 0; info64->lo_flags = info->lo_flags; memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE); } static int loop_info64_to_old(const struct loop_info64 *info64, struct loop_info *info) { memset(info, 0, sizeof(*info)); info->lo_number = info64->lo_number; info->lo_device = info64->lo_device; info->lo_inode = info64->lo_inode; info->lo_rdevice = info64->lo_rdevice; info->lo_offset = info64->lo_offset; info->lo_flags = info64->lo_flags; memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE); /* error in case values were truncated */ if (info->lo_device != info64->lo_device || info->lo_rdevice != info64->lo_rdevice || info->lo_inode != info64->lo_inode || info->lo_offset != info64->lo_offset) return -EOVERFLOW; return 0; } static int loop_set_status_old(struct loop_device *lo, const struct loop_info __user *arg) { struct loop_info info; struct loop_info64 info64; if (copy_from_user(&info, arg, sizeof (struct loop_info))) return -EFAULT; loop_info64_from_old(&info, &info64); return loop_set_status(lo, &info64); } static int loop_set_status64(struct loop_device *lo, const struct loop_info64 __user *arg) { struct loop_info64 info64; if (copy_from_user(&info64, arg, sizeof (struct loop_info64))) return -EFAULT; return loop_set_status(lo, &info64); } static int loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg) { struct loop_info info; struct loop_info64 info64; int err; if (!arg) return -EINVAL; err = loop_get_status(lo, &info64); if (!err) err = loop_info64_to_old(&info64, &info); if (!err && copy_to_user(arg, &info, sizeof(info))) err = -EFAULT; return err; } static int loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) { struct loop_info64 info64; int err; if (!arg) return -EINVAL; err = loop_get_status(lo, &info64); if (!err && copy_to_user(arg, &info64, sizeof(info64))) err = -EFAULT; return err; } static int loop_set_capacity(struct loop_device *lo) { loff_t size; if (unlikely(lo->lo_state != Lo_bound)) return -ENXIO; size = lo_calculate_size(lo, lo->lo_backing_file); loop_set_size(lo, size); return 0; } static int loop_set_dio(struct loop_device *lo, unsigned long arg) { bool use_dio = !!arg; unsigned int memflags; if (lo->lo_state != Lo_bound) return -ENXIO; if (use_dio == !!(lo->lo_flags & LO_FLAGS_DIRECT_IO)) return 0; if (use_dio) { if (!lo_can_use_dio(lo)) return -EINVAL; /* flush dirty pages before starting to use direct I/O */ vfs_fsync(lo->lo_backing_file, 0); } memflags = blk_mq_freeze_queue(lo->lo_queue); if (use_dio) lo->lo_flags |= LO_FLAGS_DIRECT_IO; else lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; blk_mq_unfreeze_queue(lo->lo_queue, memflags); return 0; } static int loop_set_block_size(struct loop_device *lo, blk_mode_t mode, struct block_device *bdev, unsigned long arg) { struct queue_limits lim; unsigned int memflags; int err = 0; /* * If we don't hold exclusive handle for the device, 
upgrade to it * here to avoid changing device under exclusive owner. */ if (!(mode & BLK_OPEN_EXCL)) { err = bd_prepare_to_claim(bdev, loop_set_block_size, NULL); if (err) return err; } err = mutex_lock_killable(&lo->lo_mutex); if (err) goto abort_claim; if (lo->lo_state != Lo_bound) { err = -ENXIO; goto unlock; } if (lo->lo_queue->limits.logical_block_size == arg) goto unlock; sync_blockdev(lo->lo_device); invalidate_bdev(lo->lo_device); lim = queue_limits_start_update(lo->lo_queue); loop_update_limits(lo, &lim, arg); memflags = blk_mq_freeze_queue(lo->lo_queue); err = queue_limits_commit_update(lo->lo_queue, &lim); loop_update_dio(lo); blk_mq_unfreeze_queue(lo->lo_queue, memflags); unlock: mutex_unlock(&lo->lo_mutex); abort_claim: if (!(mode & BLK_OPEN_EXCL)) bd_abort_claiming(bdev, loop_set_block_size); return err; } static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd, unsigned long arg) { int err; err = mutex_lock_killable(&lo->lo_mutex); if (err) return err; switch (cmd) { case LOOP_SET_CAPACITY: err = loop_set_capacity(lo); break; case LOOP_SET_DIRECT_IO: err = loop_set_dio(lo, arg); break; default: err = -EINVAL; } mutex_unlock(&lo->lo_mutex); return err; } static int lo_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct loop_device *lo = bdev->bd_disk->private_data; void __user *argp = (void __user *) arg; int err; switch (cmd) { case LOOP_SET_FD: { /* * Legacy case - pass in a zeroed out struct loop_config with * only the file descriptor set , which corresponds with the * default parameters we'd have used otherwise. */ struct loop_config config; memset(&config, 0, sizeof(config)); config.fd = arg; return loop_configure(lo, mode, bdev, &config); } case LOOP_CONFIGURE: { struct loop_config config; if (copy_from_user(&config, argp, sizeof(config))) return -EFAULT; return loop_configure(lo, mode, bdev, &config); } case LOOP_CHANGE_FD: return loop_change_fd(lo, bdev, arg); case LOOP_CLR_FD: return loop_clr_fd(lo); case LOOP_SET_STATUS: err = -EPERM; if ((mode & BLK_OPEN_WRITE) || capable(CAP_SYS_ADMIN)) err = loop_set_status_old(lo, argp); break; case LOOP_GET_STATUS: return loop_get_status_old(lo, argp); case LOOP_SET_STATUS64: err = -EPERM; if ((mode & BLK_OPEN_WRITE) || capable(CAP_SYS_ADMIN)) err = loop_set_status64(lo, argp); break; case LOOP_GET_STATUS64: return loop_get_status64(lo, argp); case LOOP_SET_BLOCK_SIZE: if (!(mode & BLK_OPEN_WRITE) && !capable(CAP_SYS_ADMIN)) return -EPERM; return loop_set_block_size(lo, mode, bdev, arg); case LOOP_SET_CAPACITY: case LOOP_SET_DIRECT_IO: if (!(mode & BLK_OPEN_WRITE) && !capable(CAP_SYS_ADMIN)) return -EPERM; fallthrough; default: err = lo_simple_ioctl(lo, cmd, arg); break; } return err; } #ifdef CONFIG_COMPAT struct compat_loop_info { compat_int_t lo_number; /* ioctl r/o */ compat_dev_t lo_device; /* ioctl r/o */ compat_ulong_t lo_inode; /* ioctl r/o */ compat_dev_t lo_rdevice; /* ioctl r/o */ compat_int_t lo_offset; compat_int_t lo_encrypt_type; /* obsolete, ignored */ compat_int_t lo_encrypt_key_size; /* ioctl w/o */ compat_int_t lo_flags; /* ioctl r/o */ char lo_name[LO_NAME_SIZE]; unsigned char lo_encrypt_key[LO_KEY_SIZE]; /* ioctl w/o */ compat_ulong_t lo_init[2]; char reserved[4]; }; /* * Transfer 32-bit compatibility structure in userspace to 64-bit loop info * - noinlined to reduce stack space usage in main part of driver */ static noinline int loop_info64_from_compat(const struct compat_loop_info __user *arg, struct loop_info64 *info64) { struct compat_loop_info 
info; if (copy_from_user(&info, arg, sizeof(info))) return -EFAULT; memset(info64, 0, sizeof(*info64)); info64->lo_number = info.lo_number; info64->lo_device = info.lo_device; info64->lo_inode = info.lo_inode; info64->lo_rdevice = info.lo_rdevice; info64->lo_offset = info.lo_offset; info64->lo_sizelimit = 0; info64->lo_flags = info.lo_flags; memcpy(info64->lo_file_name, info.lo_name, LO_NAME_SIZE); return 0; } /* * Transfer 64-bit loop info to 32-bit compatibility structure in userspace * - noinlined to reduce stack space usage in main part of driver */ static noinline int loop_info64_to_compat(const struct loop_info64 *info64, struct compat_loop_info __user *arg) { struct compat_loop_info info; memset(&info, 0, sizeof(info)); info.lo_number = info64->lo_number; info.lo_device = info64->lo_device; info.lo_inode = info64->lo_inode; info.lo_rdevice = info64->lo_rdevice; info.lo_offset = info64->lo_offset; info.lo_flags = info64->lo_flags; memcpy(info.lo_name, info64->lo_file_name, LO_NAME_SIZE); /* error in case values were truncated */ if (info.lo_device != info64->lo_device || info.lo_rdevice != info64->lo_rdevice || info.lo_inode != info64->lo_inode || info.lo_offset != info64->lo_offset) return -EOVERFLOW; if (copy_to_user(arg, &info, sizeof(info))) return -EFAULT; return 0; } static int loop_set_status_compat(struct loop_device *lo, const struct compat_loop_info __user *arg) { struct loop_info64 info64; int ret; ret = loop_info64_from_compat(arg, &info64); if (ret < 0) return ret; return loop_set_status(lo, &info64); } static int loop_get_status_compat(struct loop_device *lo, struct compat_loop_info __user *arg) { struct loop_info64 info64; int err; if (!arg) return -EINVAL; err = loop_get_status(lo, &info64); if (!err) err = loop_info64_to_compat(&info64, arg); return err; } static int lo_compat_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct loop_device *lo = bdev->bd_disk->private_data; int err; switch(cmd) { case LOOP_SET_STATUS: err = loop_set_status_compat(lo, (const struct compat_loop_info __user *)arg); break; case LOOP_GET_STATUS: err = loop_get_status_compat(lo, (struct compat_loop_info __user *)arg); break; case LOOP_SET_CAPACITY: case LOOP_CLR_FD: case LOOP_GET_STATUS64: case LOOP_SET_STATUS64: case LOOP_CONFIGURE: arg = (unsigned long) compat_ptr(arg); fallthrough; case LOOP_SET_FD: case LOOP_CHANGE_FD: case LOOP_SET_BLOCK_SIZE: case LOOP_SET_DIRECT_IO: err = lo_ioctl(bdev, mode, cmd, arg); break; default: err = -ENOIOCTLCMD; break; } return err; } #endif static int lo_open(struct gendisk *disk, blk_mode_t mode) { struct loop_device *lo = disk->private_data; int err; err = mutex_lock_killable(&lo->lo_mutex); if (err) return err; if (lo->lo_state == Lo_deleting || lo->lo_state == Lo_rundown) err = -ENXIO; mutex_unlock(&lo->lo_mutex); return err; } static void lo_release(struct gendisk *disk) { struct loop_device *lo = disk->private_data; bool need_clear = false; if (disk_openers(disk) > 0) return; /* * Clear the backing device information if this is the last close of * a device that's been marked for auto clear, or on which LOOP_CLR_FD * has been called. 
*/ mutex_lock(&lo->lo_mutex); if (lo->lo_state == Lo_bound && (lo->lo_flags & LO_FLAGS_AUTOCLEAR)) WRITE_ONCE(lo->lo_state, Lo_rundown); need_clear = (lo->lo_state == Lo_rundown); mutex_unlock(&lo->lo_mutex); if (need_clear) __loop_clr_fd(lo); } static void lo_free_disk(struct gendisk *disk) { struct loop_device *lo = disk->private_data; if (lo->workqueue) destroy_workqueue(lo->workqueue); loop_free_idle_workers(lo, true); timer_shutdown_sync(&lo->timer); mutex_destroy(&lo->lo_mutex); kfree(lo); } static const struct block_device_operations lo_fops = { .owner = THIS_MODULE, .open = lo_open, .release = lo_release, .ioctl = lo_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = lo_compat_ioctl, #endif .free_disk = lo_free_disk, }; /* * And now the modules code and kernel interface. */ /* * If max_loop is specified, create that many devices upfront. * This also becomes a hard limit. If max_loop is not specified, * the default isn't a hard limit (as before commit 85c50197716c * changed the default value from 0 for max_loop=0 reasons), just * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module * init time. Loop devices can be requested on-demand with the * /dev/loop-control interface, or be instantiated by accessing * a 'dead' device node. */ static int max_loop = CONFIG_BLK_DEV_LOOP_MIN_COUNT; #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD static bool max_loop_specified; static int max_loop_param_set_int(const char *val, const struct kernel_param *kp) { int ret; ret = param_set_int(val, kp); if (ret < 0) return ret; max_loop_specified = true; return 0; } static const struct kernel_param_ops max_loop_param_ops = { .set = max_loop_param_set_int, .get = param_get_int, }; module_param_cb(max_loop, &max_loop_param_ops, &max_loop, 0444); MODULE_PARM_DESC(max_loop, "Maximum number of loop devices"); #else module_param(max_loop, int, 0444); MODULE_PARM_DESC(max_loop, "Initial number of loop devices"); #endif module_param(max_part, int, 0444); MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device"); static int hw_queue_depth = LOOP_DEFAULT_HW_Q_DEPTH; static int loop_set_hw_queue_depth(const char *s, const struct kernel_param *p) { int qd, ret; ret = kstrtoint(s, 0, &qd); if (ret < 0) return ret; if (qd < 1) return -EINVAL; hw_queue_depth = qd; return 0; } static const struct kernel_param_ops loop_hw_qdepth_param_ops = { .set = loop_set_hw_queue_depth, .get = param_get_int, }; device_param_cb(hw_queue_depth, &loop_hw_qdepth_param_ops, &hw_queue_depth, 0444); MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. 
Default: " __stringify(LOOP_DEFAULT_HW_Q_DEPTH)); MODULE_DESCRIPTION("Loopback device support"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *rq = bd->rq; struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); struct loop_device *lo = rq->q->queuedata; blk_mq_start_request(rq); if (data_race(READ_ONCE(lo->lo_state)) != Lo_bound) return BLK_STS_IOERR; switch (req_op(rq)) { case REQ_OP_FLUSH: case REQ_OP_DISCARD: case REQ_OP_WRITE_ZEROES: cmd->use_aio = false; break; default: cmd->use_aio = lo->lo_flags & LO_FLAGS_DIRECT_IO; break; } /* always use the first bio's css */ cmd->blkcg_css = NULL; cmd->memcg_css = NULL; #ifdef CONFIG_BLK_CGROUP if (rq->bio) { cmd->blkcg_css = bio_blkcg_css(rq->bio); #ifdef CONFIG_MEMCG if (cmd->blkcg_css) { cmd->memcg_css = cgroup_get_e_css(cmd->blkcg_css->cgroup, &memory_cgrp_subsys); } #endif } #endif loop_queue_work(lo, cmd); return BLK_STS_OK; } static void loop_handle_cmd(struct loop_cmd *cmd) { struct cgroup_subsys_state *cmd_blkcg_css = cmd->blkcg_css; struct cgroup_subsys_state *cmd_memcg_css = cmd->memcg_css; struct request *rq = blk_mq_rq_from_pdu(cmd); const bool write = op_is_write(req_op(rq)); struct loop_device *lo = rq->q->queuedata; int ret = 0; struct mem_cgroup *old_memcg = NULL; if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) { ret = -EIO; goto failed; } /* We can block in this context, so ignore REQ_NOWAIT. */ if (rq->cmd_flags & REQ_NOWAIT) rq->cmd_flags &= ~REQ_NOWAIT; if (cmd_blkcg_css) kthread_associate_blkcg(cmd_blkcg_css); if (cmd_memcg_css) old_memcg = set_active_memcg( mem_cgroup_from_css(cmd_memcg_css)); /* * do_req_filebacked() may call blk_mq_complete_request() synchronously * or asynchronously if using aio. Hence, do not touch 'cmd' after * do_req_filebacked() has returned unless we are sure that 'cmd' has * not yet been completed. */ ret = do_req_filebacked(lo, rq); if (cmd_blkcg_css) kthread_associate_blkcg(NULL); if (cmd_memcg_css) { set_active_memcg(old_memcg); css_put(cmd_memcg_css); } failed: /* complete non-aio request */ if (ret != -EIOCBQUEUED) { if (ret == -EOPNOTSUPP) cmd->ret = ret; else cmd->ret = ret ? 
-EIO : 0; if (likely(!blk_should_fake_timeout(rq->q))) blk_mq_complete_request(rq); } } static void loop_process_work(struct loop_worker *worker, struct list_head *cmd_list, struct loop_device *lo) { int orig_flags = current->flags; struct loop_cmd *cmd; current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; spin_lock_irq(&lo->lo_work_lock); while (!list_empty(cmd_list)) { cmd = container_of( cmd_list->next, struct loop_cmd, list_entry); list_del(cmd_list->next); spin_unlock_irq(&lo->lo_work_lock); loop_handle_cmd(cmd); cond_resched(); spin_lock_irq(&lo->lo_work_lock); } /* * We only add to the idle list if there are no pending cmds * *and* the worker will not run again which ensures that it * is safe to free any worker on the idle list */ if (worker && !work_pending(&worker->work)) { worker->last_ran_at = jiffies; list_add_tail(&worker->idle_list, &lo->idle_worker_list); loop_set_timer(lo); } spin_unlock_irq(&lo->lo_work_lock); current->flags = orig_flags; } static void loop_workfn(struct work_struct *work) { struct loop_worker *worker = container_of(work, struct loop_worker, work); loop_process_work(worker, &worker->cmd_list, worker->lo); } static void loop_rootcg_workfn(struct work_struct *work) { struct loop_device *lo = container_of(work, struct loop_device, rootcg_work); loop_process_work(NULL, &lo->rootcg_cmd_list, lo); } static const struct blk_mq_ops loop_mq_ops = { .queue_rq = loop_queue_rq, .complete = lo_complete_rq, }; static int loop_add(int i) { struct queue_limits lim = { /* * Random number picked from the historic block max_sectors cap. */ .max_hw_sectors = 2560u, }; struct loop_device *lo; struct gendisk *disk; int err; err = -ENOMEM; lo = kzalloc_obj(*lo); if (!lo) goto out; lo->worker_tree = RB_ROOT; INIT_LIST_HEAD(&lo->idle_worker_list); timer_setup(&lo->timer, loop_free_idle_workers_timer, TIMER_DEFERRABLE); WRITE_ONCE(lo->lo_state, Lo_unbound); err = mutex_lock_killable(&loop_ctl_mutex); if (err) goto out_free_dev; /* allocate id, if @id >= 0, we're requesting that specific id */ if (i >= 0) { err = idr_alloc(&loop_index_idr, lo, i, i + 1, GFP_KERNEL); if (err == -ENOSPC) err = -EEXIST; } else { err = idr_alloc(&loop_index_idr, lo, 0, 0, GFP_KERNEL); } mutex_unlock(&loop_ctl_mutex); if (err < 0) goto out_free_dev; i = err; lo->tag_set.ops = &loop_mq_ops; lo->tag_set.nr_hw_queues = 1; lo->tag_set.queue_depth = hw_queue_depth; lo->tag_set.numa_node = NUMA_NO_NODE; lo->tag_set.cmd_size = sizeof(struct loop_cmd); lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT; lo->tag_set.driver_data = lo; err = blk_mq_alloc_tag_set(&lo->tag_set); if (err) goto out_free_idr; disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, &lim, lo); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_cleanup_tags; } lo->lo_queue = lo->lo_disk->queue; /* * Disable partition scanning by default. The in-kernel partition * scanning can be requested individually per-device during its * setup. Userspace can always add and remove partitions from all * devices. The needed partition minors are allocated from the * extended minor space, the main loop device numbers will continue * to match the loop minors, regardless of the number of partitions * used. * * If max_part is given, partition scanning is globally enabled for * all loop devices. The minors for the main loop devices will be * multiples of max_part. 
* * Note: Global-for-all-devices, set-only-at-init, read-only module * parameteters like 'max_loop' and 'max_part' make things needlessly * complicated, are too static, inflexible and may surprise * userspace tools. Parameters like this in general should be avoided. */ if (!part_shift) set_bit(GD_SUPPRESS_PART_SCAN, &disk->state); mutex_init(&lo->lo_mutex); lo->lo_number = i; spin_lock_init(&lo->lo_lock); spin_lock_init(&lo->lo_work_lock); INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn); INIT_LIST_HEAD(&lo->rootcg_cmd_list); disk->major = LOOP_MAJOR; disk->first_minor = i << part_shift; disk->minors = 1 << part_shift; disk->fops = &lo_fops; disk->private_data = lo; disk->queue = lo->lo_queue; disk->events = DISK_EVENT_MEDIA_CHANGE; disk->event_flags = DISK_EVENT_FLAG_UEVENT; sprintf(disk->disk_name, "loop%d", i); /* Make this loop device reachable from pathname. */ err = add_disk(disk); if (err) goto out_cleanup_disk; /* Show this loop device. */ mutex_lock(&loop_ctl_mutex); lo->idr_visible = true; mutex_unlock(&loop_ctl_mutex); return i; out_cleanup_disk: put_disk(disk); out_cleanup_tags: blk_mq_free_tag_set(&lo->tag_set); out_free_idr: mutex_lock(&loop_ctl_mutex); idr_remove(&loop_index_idr, i); mutex_unlock(&loop_ctl_mutex); out_free_dev: kfree(lo); out: return err; } static void loop_remove(struct loop_device *lo) { /* Make this loop device unreachable from pathname. */ del_gendisk(lo->lo_disk); blk_mq_free_tag_set(&lo->tag_set); mutex_lock(&loop_ctl_mutex); idr_remove(&loop_index_idr, lo->lo_number); mutex_unlock(&loop_ctl_mutex); put_disk(lo->lo_disk); } #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD static void loop_probe(dev_t dev) { int idx = MINOR(dev) >> part_shift; if (max_loop_specified && max_loop && idx >= max_loop) return; loop_add(idx); } #else #define loop_probe NULL #endif /* !CONFIG_BLOCK_LEGACY_AUTOLOAD */ static int loop_control_remove(int idx) { struct loop_device *lo; int ret; if (idx < 0) { pr_warn_once("deleting an unspecified loop device is not supported.\n"); return -EINVAL; } /* Hide this loop device for serialization. */ ret = mutex_lock_killable(&loop_ctl_mutex); if (ret) return ret; lo = idr_find(&loop_index_idr, idx); if (!lo || !lo->idr_visible) ret = -ENODEV; else lo->idr_visible = false; mutex_unlock(&loop_ctl_mutex); if (ret) return ret; /* Check whether this loop device can be removed. */ ret = mutex_lock_killable(&lo->lo_mutex); if (ret) goto mark_visible; if (lo->lo_state != Lo_unbound || disk_openers(lo->lo_disk) > 0) { mutex_unlock(&lo->lo_mutex); ret = -EBUSY; goto mark_visible; } /* Mark this loop device as no more bound, but not quite unbound yet */ WRITE_ONCE(lo->lo_state, Lo_deleting); mutex_unlock(&lo->lo_mutex); loop_remove(lo); return 0; mark_visible: /* Show this loop device again. */ mutex_lock(&loop_ctl_mutex); lo->idr_visible = true; mutex_unlock(&loop_ctl_mutex); return ret; } static int loop_control_get_free(int idx) { struct loop_device *lo; int id, ret; ret = mutex_lock_killable(&loop_ctl_mutex); if (ret) return ret; idr_for_each_entry(&loop_index_idr, lo, id) { /* * Hitting a race results in creating a new loop device * which is harmless. 
*/ if (lo->idr_visible && data_race(READ_ONCE(lo->lo_state)) == Lo_unbound) goto found; } mutex_unlock(&loop_ctl_mutex); return loop_add(-1); found: mutex_unlock(&loop_ctl_mutex); return id; } static long loop_control_ioctl(struct file *file, unsigned int cmd, unsigned long parm) { switch (cmd) { case LOOP_CTL_ADD: return loop_add(parm); case LOOP_CTL_REMOVE: return loop_control_remove(parm); case LOOP_CTL_GET_FREE: return loop_control_get_free(parm); default: return -ENOSYS; } } static const struct file_operations loop_ctl_fops = { .open = nonseekable_open, .unlocked_ioctl = loop_control_ioctl, .compat_ioctl = loop_control_ioctl, .owner = THIS_MODULE, .llseek = noop_llseek, }; static struct miscdevice loop_misc = { .minor = LOOP_CTRL_MINOR, .name = "loop-control", .fops = &loop_ctl_fops, }; MODULE_ALIAS_MISCDEV(LOOP_CTRL_MINOR); MODULE_ALIAS("devname:loop-control"); static int __init loop_init(void) { int i; int err; part_shift = 0; if (max_part > 0) { part_shift = fls(max_part); /* * Adjust max_part according to part_shift as it is exported * to user space so that user can decide correct minor number * if [s]he want to create more devices. * * Note that -1 is required because partition 0 is reserved * for the whole disk. */ max_part = (1UL << part_shift) - 1; } if ((1UL << part_shift) > DISK_MAX_PARTS) { err = -EINVAL; goto err_out; } if (max_loop > 1UL << (MINORBITS - part_shift)) { err = -EINVAL; goto err_out; } err = misc_register(&loop_misc); if (err < 0) goto err_out; if (__register_blkdev(LOOP_MAJOR, "loop", loop_probe)) { err = -EIO; goto misc_out; } /* pre-create number of devices given by config or max_loop */ for (i = 0; i < max_loop; i++) loop_add(i); printk(KERN_INFO "loop: module loaded\n"); return 0; misc_out: misc_deregister(&loop_misc); err_out: return err; } static void __exit loop_exit(void) { struct loop_device *lo; int id; unregister_blkdev(LOOP_MAJOR, "loop"); misc_deregister(&loop_misc); /* * There is no need to use loop_ctl_mutex here, for nobody else can * access loop_index_idr when this module is unloading (unless forced * module unloading is requested). If this is not a clean unloading, * we have no means to avoid kernel crash. */ idr_for_each_entry(&loop_index_idr, lo, id) loop_remove(lo); idr_destroy(&loop_index_idr); } module_init(loop_init); module_exit(loop_exit); #ifndef MODULE static int __init max_loop_setup(char *str) { max_loop = simple_strtol(str, NULL, 0); #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD max_loop_specified = true; #endif return 1; } __setup("max_loop=", max_loop_setup); #endif |
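/*
 * Editorial sketch (not part of loop.c): a minimal userspace view of the
 * ioctl interface implemented above. It asks /dev/loop-control for a free
 * device via LOOP_CTL_GET_FREE, then binds a backing file with
 * LOOP_CONFIGURE. Error handling is minimal and the backing-file name
 * "disk.img" is hypothetical.
 */
#include <fcntl.h>
#include <linux/loop.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
        struct loop_config cfg;
        char dev[32];
        int ctl, loopfd, backing, idx;

        ctl = open("/dev/loop-control", O_RDWR);
        if (ctl < 0)
                return 1;

        /* Serviced by loop_control_get_free() above. */
        idx = ioctl(ctl, LOOP_CTL_GET_FREE);
        if (idx < 0)
                return 1;
        snprintf(dev, sizeof(dev), "/dev/loop%d", idx);

        backing = open("disk.img", O_RDWR);     /* hypothetical image */
        loopfd = open(dev, O_RDWR);
        if (backing < 0 || loopfd < 0)
                return 1;

        /* LOOP_CONFIGURE binds the file and sets parameters in one call. */
        memset(&cfg, 0, sizeof(cfg));
        cfg.fd = backing;
        cfg.block_size = 4096;
        cfg.info.lo_flags = LO_FLAGS_PARTSCAN | LO_FLAGS_AUTOCLEAR;
        if (ioctl(loopfd, LOOP_CONFIGURE, &cfg) < 0)
                return 1;

        close(backing);
        close(loopfd);
        close(ctl);
        return 0;
}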
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TTY_H
#define _LINUX_TTY_H

#include <linux/fs.h>
#include <linux/major.h>
#include <linux/termios.h>
#include <linux/workqueue.h>
#include <linux/tty_driver.h>
#include <linux/tty_ldisc.h>
#include <linux/tty_port.h>
#include <linux/mutex.h>
#include <linux/tty_flags.h>
#include <uapi/linux/tty.h>
#include <linux/rwsem.h>
#include <linux/llist.h>

/*
 * (Note: the *_driver.minor_start values 1, 64, 128, 192 are
 * hardcoded at present.)
*/ #define NR_UNIX98_PTY_DEFAULT 4096 /* Default maximum for Unix98 ptys */ #define NR_UNIX98_PTY_RESERVE 1024 /* Default reserve for main devpts */ #define NR_UNIX98_PTY_MAX (1 << MINORBITS) /* Absolute limit */ /* * This character is the same as _POSIX_VDISABLE: it cannot be used as * a c_cc[] character, but indicates that a particular special character * isn't in use (eg VINTR has no character etc) */ #define __DISABLED_CHAR '\0' #define INTR_CHAR(tty) ((tty)->termios.c_cc[VINTR]) #define QUIT_CHAR(tty) ((tty)->termios.c_cc[VQUIT]) #define ERASE_CHAR(tty) ((tty)->termios.c_cc[VERASE]) #define KILL_CHAR(tty) ((tty)->termios.c_cc[VKILL]) #define EOF_CHAR(tty) ((tty)->termios.c_cc[VEOF]) #define TIME_CHAR(tty) ((tty)->termios.c_cc[VTIME]) #define MIN_CHAR(tty) ((tty)->termios.c_cc[VMIN]) #define SWTC_CHAR(tty) ((tty)->termios.c_cc[VSWTC]) #define START_CHAR(tty) ((tty)->termios.c_cc[VSTART]) #define STOP_CHAR(tty) ((tty)->termios.c_cc[VSTOP]) #define SUSP_CHAR(tty) ((tty)->termios.c_cc[VSUSP]) #define EOL_CHAR(tty) ((tty)->termios.c_cc[VEOL]) #define REPRINT_CHAR(tty) ((tty)->termios.c_cc[VREPRINT]) #define DISCARD_CHAR(tty) ((tty)->termios.c_cc[VDISCARD]) #define WERASE_CHAR(tty) ((tty)->termios.c_cc[VWERASE]) #define LNEXT_CHAR(tty) ((tty)->termios.c_cc[VLNEXT]) #define EOL2_CHAR(tty) ((tty)->termios.c_cc[VEOL2]) #define _I_FLAG(tty, f) ((tty)->termios.c_iflag & (f)) #define _O_FLAG(tty, f) ((tty)->termios.c_oflag & (f)) #define _C_FLAG(tty, f) ((tty)->termios.c_cflag & (f)) #define _L_FLAG(tty, f) ((tty)->termios.c_lflag & (f)) #define I_IGNBRK(tty) _I_FLAG((tty), IGNBRK) #define I_BRKINT(tty) _I_FLAG((tty), BRKINT) #define I_IGNPAR(tty) _I_FLAG((tty), IGNPAR) #define I_PARMRK(tty) _I_FLAG((tty), PARMRK) #define I_INPCK(tty) _I_FLAG((tty), INPCK) #define I_ISTRIP(tty) _I_FLAG((tty), ISTRIP) #define I_INLCR(tty) _I_FLAG((tty), INLCR) #define I_IGNCR(tty) _I_FLAG((tty), IGNCR) #define I_ICRNL(tty) _I_FLAG((tty), ICRNL) #define I_IUCLC(tty) _I_FLAG((tty), IUCLC) #define I_IXON(tty) _I_FLAG((tty), IXON) #define I_IXANY(tty) _I_FLAG((tty), IXANY) #define I_IXOFF(tty) _I_FLAG((tty), IXOFF) #define I_IMAXBEL(tty) _I_FLAG((tty), IMAXBEL) #define I_IUTF8(tty) _I_FLAG((tty), IUTF8) #define O_OPOST(tty) _O_FLAG((tty), OPOST) #define O_OLCUC(tty) _O_FLAG((tty), OLCUC) #define O_ONLCR(tty) _O_FLAG((tty), ONLCR) #define O_OCRNL(tty) _O_FLAG((tty), OCRNL) #define O_ONOCR(tty) _O_FLAG((tty), ONOCR) #define O_ONLRET(tty) _O_FLAG((tty), ONLRET) #define O_OFILL(tty) _O_FLAG((tty), OFILL) #define O_OFDEL(tty) _O_FLAG((tty), OFDEL) #define O_NLDLY(tty) _O_FLAG((tty), NLDLY) #define O_CRDLY(tty) _O_FLAG((tty), CRDLY) #define O_TABDLY(tty) _O_FLAG((tty), TABDLY) #define O_BSDLY(tty) _O_FLAG((tty), BSDLY) #define O_VTDLY(tty) _O_FLAG((tty), VTDLY) #define O_FFDLY(tty) _O_FLAG((tty), FFDLY) #define C_BAUD(tty) _C_FLAG((tty), CBAUD) #define C_CSIZE(tty) _C_FLAG((tty), CSIZE) #define C_CSTOPB(tty) _C_FLAG((tty), CSTOPB) #define C_CREAD(tty) _C_FLAG((tty), CREAD) #define C_PARENB(tty) _C_FLAG((tty), PARENB) #define C_PARODD(tty) _C_FLAG((tty), PARODD) #define C_HUPCL(tty) _C_FLAG((tty), HUPCL) #define C_CLOCAL(tty) _C_FLAG((tty), CLOCAL) #define C_CIBAUD(tty) _C_FLAG((tty), CIBAUD) #define C_CRTSCTS(tty) _C_FLAG((tty), CRTSCTS) #define C_CMSPAR(tty) _C_FLAG((tty), CMSPAR) #define L_ISIG(tty) _L_FLAG((tty), ISIG) #define L_ICANON(tty) _L_FLAG((tty), ICANON) #define L_XCASE(tty) _L_FLAG((tty), XCASE) #define L_ECHO(tty) _L_FLAG((tty), ECHO) #define L_ECHOE(tty) _L_FLAG((tty), ECHOE) #define L_ECHOK(tty) 
_L_FLAG((tty), ECHOK) #define L_ECHONL(tty) _L_FLAG((tty), ECHONL) #define L_NOFLSH(tty) _L_FLAG((tty), NOFLSH) #define L_TOSTOP(tty) _L_FLAG((tty), TOSTOP) #define L_ECHOCTL(tty) _L_FLAG((tty), ECHOCTL) #define L_ECHOPRT(tty) _L_FLAG((tty), ECHOPRT) #define L_ECHOKE(tty) _L_FLAG((tty), ECHOKE) #define L_FLUSHO(tty) _L_FLAG((tty), FLUSHO) #define L_PENDIN(tty) _L_FLAG((tty), PENDIN) #define L_IEXTEN(tty) _L_FLAG((tty), IEXTEN) #define L_EXTPROC(tty) _L_FLAG((tty), EXTPROC) struct device; struct signal_struct; struct tty_operations; /** * struct tty_struct - state associated with a tty while open * * @kref: reference counting by tty_kref_get() and tty_kref_put(), reaching zero * frees the structure * @dev: class device or %NULL (e.g. ptys, serdev) * @driver: &struct tty_driver operating this tty * @ops: &struct tty_operations of @driver for this tty (open, close, etc.) * @index: index of this tty (e.g. to construct @name like tty12) * @ldisc_sem: protects line discipline changes (@ldisc) -- lock tty not pty * @ldisc: the current line discipline for this tty (n_tty by default) * @atomic_write_lock: protects against concurrent writers, i.e. locks * @write_cnt, @write_buf and similar * @legacy_mutex: leftover from history (BKL -> BTM -> @legacy_mutex), * protecting several operations on this tty * @throttle_mutex: protects against concurrent tty_throttle_safe() and * tty_unthrottle_safe() (but not tty_unthrottle()) * @termios_rwsem: protects @termios and @termios_locked * @winsize_mutex: protects @winsize * @termios: termios for the current tty, copied from/to @driver.termios * @termios_locked: locked termios (by %TIOCGLCKTRMIOS and %TIOCSLCKTRMIOS * ioctls) * @name: name of the tty constructed by tty_line_name() (e.g. ttyS3) * @flags: bitwise OR of %TTY_THROTTLED, %TTY_IO_ERROR, ... * @count: count of open processes, reaching zero cancels all the work for * this tty and drops a @kref too (but does not free this tty) * @winsize: size of the terminal "window" (cf. @winsize_mutex) * @flow: flow settings grouped together * @flow.lock: lock for @flow members * @flow.stopped: tty stopped/started by stop_tty()/start_tty() * @flow.tco_stopped: tty stopped/started by %TCOOFF/%TCOON ioctls (it has * precedence over @flow.stopped) * @ctrl: control settings grouped together * @ctrl.lock: lock for @ctrl members * @ctrl.pgrp: process group of this tty (setpgrp(2)) * @ctrl.session: session of this tty (setsid(2)). Writes are protected by both * @ctrl.lock and @legacy_mutex, readers must use at least one of * them. * @ctrl.pktstatus: packet mode status (bitwise OR of %TIOCPKT_ constants) * @ctrl.packet: packet mode enabled * @hw_stopped: not controlled by the tty layer, under @driver's control for CTS * handling * @receive_room: bytes permitted to feed to @ldisc without any being lost * @flow_change: controls behavior of throttling, see tty_throttle_safe() and * tty_unthrottle_safe() * @link: link to another pty (master -> slave and vice versa) * @fasync: state for %O_ASYNC (for %SIGIO); managed by fasync_helper() * @write_wait: concurrent writers are waiting in this queue until they are * allowed to write * @read_wait: readers wait for data in this queue * @hangup_work: normally a work to perform a hangup (do_tty_hangup()); while * freeing the tty, (re)used to release_one_tty() * @disc_data: pointer to @ldisc's private data (e.g. to &struct n_tty_data) * @driver_data: pointer to @driver's private data (e.g. 
&struct uart_state) * @files_lock: protects @tty_files list * @tty_files: list of (re)openers of this tty (i.e. linked &struct * tty_file_private) * @closing: when set during close, n_tty processes only START & STOP chars * @write_buf: temporary buffer used during tty_write() to copy user data to * @write_cnt: count of bytes written in tty_write() to @write_buf * @SAK_work: if the tty has a pending do_SAK, it is queued here * @port: persistent storage for this device (i.e. &struct tty_port) * * All of the state associated with a tty while the tty is open. Persistent * storage for tty devices is referenced here as @port and is documented in * &struct tty_port. */ struct tty_struct { struct kref kref; int index; struct device *dev; struct tty_driver *driver; struct tty_port *port; const struct tty_operations *ops; struct tty_ldisc *ldisc; struct ld_semaphore ldisc_sem; struct mutex atomic_write_lock; struct mutex legacy_mutex; struct mutex throttle_mutex; struct rw_semaphore termios_rwsem; struct mutex winsize_mutex; struct ktermios termios, termios_locked; char name[64]; unsigned long flags; int count; unsigned int receive_room; struct winsize winsize; struct { spinlock_t lock; bool stopped; bool tco_stopped; } flow; struct { struct pid *pgrp; struct pid *session; spinlock_t lock; unsigned char pktstatus; bool packet; } ctrl; bool hw_stopped; bool closing; int flow_change; struct tty_struct *link; struct fasync_struct *fasync; wait_queue_head_t write_wait; wait_queue_head_t read_wait; struct work_struct hangup_work; void *disc_data; void *driver_data; spinlock_t files_lock; int write_cnt; u8 *write_buf; struct list_head tty_files; struct work_struct SAK_work; } __randomize_layout; /* Each of a tty's open files has private_data pointing to tty_file_private */ struct tty_file_private { struct tty_struct *tty; struct file *file; struct list_head list; }; /** * enum tty_struct_flags - TTY Struct Flags * * These bits are used in the :c:member:`tty_struct.flags` field. * * So that interrupts won't be able to mess up the queues, * copy_to_cooked must be atomic with respect to itself, as must * tty->write. Thus, you must use the inline functions set_bit() and * clear_bit() to make things atomic. * * @TTY_THROTTLED: * Driver input is throttled. The ldisc should call * :c:member:`tty_driver.unthrottle()` in order to resume reception when * it is ready to process more data (at threshold min). * * @TTY_IO_ERROR: * If set, causes all subsequent userspace read/write calls on the tty to * fail, returning -%EIO. (May be no ldisc too.) * * @TTY_OTHER_CLOSED: * Device is a pty and the other side has closed. * * @TTY_EXCLUSIVE: * Exclusive open mode (a single opener). * * @TTY_DO_WRITE_WAKEUP: * If set, causes the driver to call the * :c:member:`tty_ldisc_ops.write_wakeup()` method in order to resume * transmission when it can accept more data to transmit. * * @TTY_LDISC_OPEN: * Indicates that a line discipline is open. For debugging purposes only. * * @TTY_PTY_LOCK: * A flag private to pty code to implement %TIOCSPTLCK/%TIOCGPTLCK logic. * * @TTY_NO_WRITE_SPLIT: * Prevent driver from splitting up writes into smaller chunks (preserve * write boundaries to driver). * * @TTY_HUPPED: * The TTY was hung up. This is set post :c:member:`tty_driver.hangup()`. * * @TTY_HUPPING: * The TTY is in the process of hanging up to abort potential readers. * * @TTY_LDISC_CHANGING: * Line discipline for this TTY is being changed. I/O should not block * when this is set. Use tty_io_nonblock() to check. 
* * @TTY_LDISC_HALTED: * Line discipline for this TTY was stopped. No work should be queued to * this ldisc. */ enum tty_struct_flags { TTY_THROTTLED, TTY_IO_ERROR, TTY_OTHER_CLOSED, TTY_EXCLUSIVE, TTY_DO_WRITE_WAKEUP, TTY_LDISC_OPEN, TTY_PTY_LOCK, TTY_NO_WRITE_SPLIT, TTY_HUPPED, TTY_HUPPING, TTY_LDISC_CHANGING, TTY_LDISC_HALTED, }; static inline bool tty_io_nonblock(struct tty_struct *tty, struct file *file) { return file->f_flags & O_NONBLOCK || test_bit(TTY_LDISC_CHANGING, &tty->flags); } static inline bool tty_io_error(struct tty_struct *tty) { return test_bit(TTY_IO_ERROR, &tty->flags); } static inline bool tty_throttled(struct tty_struct *tty) { return test_bit(TTY_THROTTLED, &tty->flags); } #ifdef CONFIG_TTY void tty_kref_put(struct tty_struct *tty); struct pid *tty_get_pgrp(struct tty_struct *tty); void tty_vhangup_self(void); void disassociate_ctty(int priv); dev_t tty_devnum(struct tty_struct *tty); void proc_clear_tty(struct task_struct *p); struct tty_struct *get_current_tty(void); /* tty_io.c */ int __init tty_init(void); const char *tty_name(const struct tty_struct *tty); struct tty_struct *tty_kopen_exclusive(dev_t device); struct tty_struct *tty_kopen_shared(dev_t device); void tty_kclose(struct tty_struct *tty); int tty_dev_name_to_number(const char *name, dev_t *number); #else static inline void tty_kref_put(struct tty_struct *tty) { } static inline struct pid *tty_get_pgrp(struct tty_struct *tty) { return NULL; } static inline void tty_vhangup_self(void) { } static inline void disassociate_ctty(int priv) { } static inline dev_t tty_devnum(struct tty_struct *tty) { return 0; } static inline void proc_clear_tty(struct task_struct *p) { } static inline struct tty_struct *get_current_tty(void) { return NULL; } /* tty_io.c */ static inline int __init tty_init(void) { return 0; } static inline const char *tty_name(const struct tty_struct *tty) { return "(none)"; } static inline struct tty_struct *tty_kopen_exclusive(dev_t device) { return ERR_PTR(-ENODEV); } static inline void tty_kclose(struct tty_struct *tty) { } static inline int tty_dev_name_to_number(const char *name, dev_t *number) { return -ENOTSUPP; } #endif extern struct ktermios tty_std_termios; int vcs_init(void); extern const struct class tty_class; /** * tty_kref_get - get a tty reference * @tty: tty device * * Returns: a new reference to a tty object * * Locking: The caller must hold sufficient locks/counts to ensure that their * existing reference cannot go away. 
*/ static inline struct tty_struct *tty_kref_get(struct tty_struct *tty) { if (tty) kref_get(&tty->kref); return tty; } const char *tty_driver_name(const struct tty_struct *tty); void tty_wait_until_sent(struct tty_struct *tty, long timeout); void stop_tty(struct tty_struct *tty); void start_tty(struct tty_struct *tty); void tty_write_message(struct tty_struct *tty, char *msg); int tty_send_xchar(struct tty_struct *tty, u8 ch); int tty_put_char(struct tty_struct *tty, u8 c); unsigned int tty_chars_in_buffer(struct tty_struct *tty); unsigned int tty_write_room(struct tty_struct *tty); void tty_driver_flush_buffer(struct tty_struct *tty); void tty_unthrottle(struct tty_struct *tty); bool tty_throttle_safe(struct tty_struct *tty); bool tty_unthrottle_safe(struct tty_struct *tty); int tty_do_resize(struct tty_struct *tty, struct winsize *ws); int tty_get_icount(struct tty_struct *tty, struct serial_icounter_struct *icount); int tty_get_tiocm(struct tty_struct *tty); int is_current_pgrp_orphaned(void); void tty_hangup(struct tty_struct *tty); void tty_vhangup(struct tty_struct *tty); int tty_hung_up_p(struct file *filp); void do_SAK(struct tty_struct *tty); void __do_SAK(struct tty_struct *tty); void no_tty(void); speed_t tty_termios_baud_rate(const struct ktermios *termios); void tty_termios_encode_baud_rate(struct ktermios *termios, speed_t ibaud, speed_t obaud); void tty_encode_baud_rate(struct tty_struct *tty, speed_t ibaud, speed_t obaud); /** * tty_get_baud_rate - get tty bit rates * @tty: tty to query * * Returns: the baud rate as an integer for this terminal * * Locking: The termios lock must be held by the caller. */ static inline speed_t tty_get_baud_rate(const struct tty_struct *tty) { return tty_termios_baud_rate(&tty->termios); } unsigned char tty_get_char_size(unsigned int cflag); unsigned char tty_get_frame_size(unsigned int cflag); void tty_termios_copy_hw(struct ktermios *new, const struct ktermios *old); bool tty_termios_hw_change(const struct ktermios *a, const struct ktermios *b); int tty_set_termios(struct tty_struct *tty, struct ktermios *kt); void tty_wakeup(struct tty_struct *tty); int tty_mode_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg); int tty_perform_flush(struct tty_struct *tty, unsigned long arg); struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx); void tty_release_struct(struct tty_struct *tty, int idx); void tty_init_termios(struct tty_struct *tty); void tty_save_termios(struct tty_struct *tty); int tty_standard_install(struct tty_driver *driver, struct tty_struct *tty); extern struct mutex tty_mutex; /* n_tty.c */ void n_tty_inherit_ops(struct tty_ldisc_ops *ops); #ifdef CONFIG_TTY void __init n_tty_init(void); #else static inline void n_tty_init(void) { } #endif /* tty_audit.c */ #ifdef CONFIG_AUDIT void tty_audit_exit(void); void tty_audit_fork(struct signal_struct *sig); int tty_audit_push(void); #else static inline void tty_audit_exit(void) { } static inline void tty_audit_fork(struct signal_struct *sig) { } static inline int tty_audit_push(void) { return 0; } #endif /* tty_ioctl.c */ int n_tty_ioctl_helper(struct tty_struct *tty, unsigned int cmd, unsigned long arg); /* vt.c */ int vt_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg); long vt_compat_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg); /* tty_mutex.c */ /* functions for preparation of BKL removal */ void tty_lock(struct tty_struct *tty); int tty_lock_interruptible(struct tty_struct *tty); void tty_unlock(struct 
tty_struct *tty);
void tty_lock_slave(struct tty_struct *tty);
void tty_unlock_slave(struct tty_struct *tty);
void tty_set_lock_subclass(struct tty_struct *tty);

#endif
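/*
 * Editorial sketch (not part of tty.h): hypothetical helpers showing how the
 * termios flag macros and reference-counting helpers declared above are
 * typically used. Function names are illustrative; locking of termios
 * (termios_rwsem) is elided for brevity.
 */
#include <linux/printk.h>
#include <linux/tty.h>

static bool example_tty_wants_echo(struct tty_struct *tty)
{
        /* Give up early if the tty is in an I/O error state. */
        if (tty_io_error(tty))
                return false;

        /* L_ECHO()/L_ICANON()/L_ECHONL() read bits of termios.c_lflag. */
        return L_ECHO(tty) || (L_ICANON(tty) && L_ECHONL(tty));
}

static void example_tty_report(struct tty_struct *tty)
{
        /* Take a reference before using the tty beyond the caller's scope. */
        struct tty_struct *held = tty_kref_get(tty);

        if (!held)
                return;
        pr_info("%s: index %d throttled=%d\n",
                tty_name(held), held->index, tty_throttled(held));
        tty_kref_put(held);
}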
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_BLOCKGROUP_LOCK_H
#define _LINUX_BLOCKGROUP_LOCK_H
/*
 * Per-blockgroup locking for ext2 and ext3.
 *
 * Simple hashed spinlocking.
 */
#include <linux/spinlock.h>
#include <linux/cache.h>

#ifdef CONFIG_SMP
#define NR_BG_LOCKS     (4 << ilog2(NR_CPUS < 32 ? NR_CPUS : 32))
#else
#define NR_BG_LOCKS     1
#endif

struct bgl_lock {
        spinlock_t lock;
} ____cacheline_aligned_in_smp;

struct blockgroup_lock {
        struct bgl_lock locks[NR_BG_LOCKS];
};

static inline void bgl_lock_init(struct blockgroup_lock *bgl)
{
        int i;

        for (i = 0; i < NR_BG_LOCKS; i++)
                spin_lock_init(&bgl->locks[i].lock);
}

static inline spinlock_t *
bgl_lock_ptr(struct blockgroup_lock *bgl, unsigned int block_group)
{
        return &bgl->locks[block_group & (NR_BG_LOCKS-1)].lock;
}

#endif
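/*
 * Editorial sketch (not part of blockgroup_lock.h): how a filesystem might
 * use the hashed per-blockgroup locks above to protect an in-memory
 * free-blocks counter. The surrounding structure and field names are
 * hypothetical.
 */
#include <linux/blockgroup_lock.h>
#include <linux/spinlock.h>

struct example_sb_info {
        struct blockgroup_lock bgl;     /* hashed block-group locks */
        unsigned int *free_blocks;      /* per-group counters (hypothetical) */
};

static void example_sb_init(struct example_sb_info *sbi)
{
        bgl_lock_init(&sbi->bgl);
}

static void example_release_blocks(struct example_sb_info *sbi,
                                   unsigned int group, unsigned int count)
{
        /*
         * Groups hashing to the same slot share one spinlock; with
         * NR_BG_LOCKS slots, unrelated groups usually proceed in parallel.
         */
        spinlock_t *lock = bgl_lock_ptr(&sbi->bgl, group);

        spin_lock(lock);
        sbi->free_blocks[group] += count;
        spin_unlock(lock);
}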
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * xfrm_device.c - IPsec device offloading code.
* * Copyright (c) 2015 secunet Security Networks AG * * Author: * Steffen Klassert <steffen.klassert@secunet.com> */ #include <linux/errno.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <net/dst.h> #include <net/gso.h> #include <net/xfrm.h> #include <linux/notifier.h> #ifdef CONFIG_XFRM_OFFLOAD static void __xfrm_transport_prep(struct xfrm_state *x, struct sk_buff *skb, unsigned int hsize) { struct xfrm_offload *xo = xfrm_offload(skb); skb_reset_mac_len(skb); if (xo->flags & XFRM_GSO_SEGMENT) skb->transport_header -= x->props.header_len; pskb_pull(skb, skb_transport_offset(skb) + x->props.header_len); } static void __xfrm_mode_tunnel_prep(struct xfrm_state *x, struct sk_buff *skb, unsigned int hsize) { struct xfrm_offload *xo = xfrm_offload(skb); if (xo->flags & XFRM_GSO_SEGMENT) skb->transport_header = skb->network_header + hsize; skb_reset_mac_len(skb); pskb_pull(skb, skb->mac_len + x->props.header_len - x->props.enc_hdr_len); } static void __xfrm_mode_beet_prep(struct xfrm_state *x, struct sk_buff *skb, unsigned int hsize) { struct xfrm_offload *xo = xfrm_offload(skb); int phlen = 0; if (xo->flags & XFRM_GSO_SEGMENT) skb->transport_header = skb->network_header + hsize; skb_reset_mac_len(skb); if (x->sel.family != AF_INET6) { phlen = IPV4_BEET_PHMAXLEN; if (x->outer_mode.family == AF_INET6) phlen += sizeof(struct ipv6hdr) - sizeof(struct iphdr); } pskb_pull(skb, skb->mac_len + hsize + (x->props.header_len - phlen)); } /* Adjust pointers into the packet when IPsec is done at layer2 */ static void xfrm_outer_mode_prep(struct xfrm_state *x, struct sk_buff *skb) { switch (x->outer_mode.encap) { case XFRM_MODE_IPTFS: case XFRM_MODE_TUNNEL: if (x->outer_mode.family == AF_INET) return __xfrm_mode_tunnel_prep(x, skb, sizeof(struct iphdr)); if (x->outer_mode.family == AF_INET6) return __xfrm_mode_tunnel_prep(x, skb, sizeof(struct ipv6hdr)); break; case XFRM_MODE_TRANSPORT: if (x->outer_mode.family == AF_INET) return __xfrm_transport_prep(x, skb, sizeof(struct iphdr)); if (x->outer_mode.family == AF_INET6) return __xfrm_transport_prep(x, skb, sizeof(struct ipv6hdr)); break; case XFRM_MODE_BEET: if (x->outer_mode.family == AF_INET) return __xfrm_mode_beet_prep(x, skb, sizeof(struct iphdr)); if (x->outer_mode.family == AF_INET6) return __xfrm_mode_beet_prep(x, skb, sizeof(struct ipv6hdr)); break; case XFRM_MODE_ROUTEOPTIMIZATION: case XFRM_MODE_IN_TRIGGER: break; } } static inline bool xmit_xfrm_check_overflow(struct sk_buff *skb) { struct xfrm_offload *xo = xfrm_offload(skb); __u32 seq = xo->seq.low; seq += skb_shinfo(skb)->gso_segs; if (unlikely(seq < xo->seq.low)) return true; return false; } struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again) { int err; unsigned long flags; struct xfrm_state *x; struct softnet_data *sd; struct sk_buff *skb2, *nskb, *pskb = NULL; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); struct net_device *dev = skb->dev; struct sec_path *sp; if (!xo || (xo->flags & XFRM_XMIT)) return skb; if (!(features & NETIF_F_HW_ESP)) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); sp = skb_sec_path(skb); x = sp->xvec[sp->len - 1]; if (xo->flags & XFRM_GRO || x->xso.dir == XFRM_DEV_OFFLOAD_IN) return skb; /* The packet was sent to HW IPsec packet offload engine, * but to wrong device. Drop the packet, so it won't skip * XFRM stack. 
*/ if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET && x->xso.dev != dev) { kfree_skb(skb); dev_core_stats_tx_dropped_inc(dev); return NULL; } local_irq_save(flags); sd = this_cpu_ptr(&softnet_data); err = !skb_queue_empty(&sd->xfrm_backlog); local_irq_restore(flags); if (err) { *again = true; return skb; } if (skb_is_gso(skb) && (unlikely(x->xso.dev != dev) || unlikely(xmit_xfrm_check_overflow(skb)))) { struct sk_buff *segs; /* Packet got rerouted, fixup features and segment it. */ esp_features = esp_features & ~(NETIF_F_HW_ESP | NETIF_F_GSO_ESP); segs = skb_gso_segment(skb, esp_features); if (IS_ERR(segs)) { kfree_skb(skb); dev_core_stats_tx_dropped_inc(dev); return NULL; } else { consume_skb(skb); skb = segs; } } if (!skb->next) { esp_features |= skb->dev->gso_partial_features; xfrm_outer_mode_prep(x, skb); xo->flags |= XFRM_DEV_RESUME; err = x->type_offload->xmit(x, skb, esp_features); if (err) { if (err == -EINPROGRESS) return NULL; XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR); kfree_skb(skb); return NULL; } skb_push(skb, skb->data - skb_mac_header(skb)); return skb; } skb_list_walk_safe(skb, skb2, nskb) { esp_features |= skb->dev->gso_partial_features; skb_mark_not_on_list(skb2); xo = xfrm_offload(skb2); xo->flags |= XFRM_DEV_RESUME; xfrm_outer_mode_prep(x, skb2); err = x->type_offload->xmit(x, skb2, esp_features); if (!err) { skb2->next = nskb; } else if (err != -EINPROGRESS) { XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR); skb2->next = nskb; kfree_skb_list(skb2); return NULL; } else { if (skb == skb2) skb = nskb; else pskb->next = nskb; continue; } skb_push(skb2, skb2->data - skb_mac_header(skb2)); pskb = skb2; } return skb; } EXPORT_SYMBOL_GPL(validate_xmit_xfrm); int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo, struct netlink_ext_ack *extack) { int err; struct dst_entry *dst; struct net_device *dev; struct xfrm_dev_offload *xso = &x->xso; xfrm_address_t *saddr; xfrm_address_t *daddr; bool is_packet_offload; if (xuo->flags & ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND | XFRM_OFFLOAD_PACKET)) { NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request"); return -EINVAL; } if ((xuo->flags & XFRM_OFFLOAD_INBOUND && x->dir == XFRM_SA_DIR_OUT) || (!(xuo->flags & XFRM_OFFLOAD_INBOUND) && x->dir == XFRM_SA_DIR_IN)) { NL_SET_ERR_MSG(extack, "Mismatched SA and offload direction"); return -EINVAL; } if (xuo->flags & XFRM_OFFLOAD_INBOUND && x->if_id) { NL_SET_ERR_MSG(extack, "XFRM if_id is not supported in RX path"); return -EINVAL; } is_packet_offload = xuo->flags & XFRM_OFFLOAD_PACKET; /* We don't yet support TFC padding. */ if (x->tfcpad) { NL_SET_ERR_MSG(extack, "TFC padding can't be offloaded"); return -EINVAL; } dev = dev_get_by_index(net, xuo->ifindex); if (!dev) { struct xfrm_dst_lookup_params params; if (!(xuo->flags & XFRM_OFFLOAD_INBOUND)) { saddr = &x->props.saddr; daddr = &x->id.daddr; } else { saddr = &x->id.daddr; daddr = &x->props.saddr; } memset(¶ms, 0, sizeof(params)); params.net = net; params.saddr = saddr; params.daddr = daddr; params.mark = xfrm_smark_get(0, x); dst = __xfrm_dst_lookup(x->props.family, ¶ms); if (IS_ERR(dst)) return (is_packet_offload) ? -EINVAL : 0; dev = dst->dev; dev_hold(dev); dst_release(dst); } if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_state_add) { xso->dev = NULL; dev_put(dev); return (is_packet_offload) ? 
-EINVAL : 0; } if (!is_packet_offload && x->props.flags & XFRM_STATE_ESN && !dev->xfrmdev_ops->xdo_dev_state_advance_esn) { NL_SET_ERR_MSG(extack, "Device doesn't support offload with ESN"); xso->dev = NULL; dev_put(dev); return -EINVAL; } if (!x->type_offload) { NL_SET_ERR_MSG(extack, "Type doesn't support offload"); dev_put(dev); return -EINVAL; } xso->dev = dev; netdev_tracker_alloc(dev, &xso->dev_tracker, GFP_ATOMIC); if (xuo->flags & XFRM_OFFLOAD_INBOUND) xso->dir = XFRM_DEV_OFFLOAD_IN; else xso->dir = XFRM_DEV_OFFLOAD_OUT; if (is_packet_offload) xso->type = XFRM_DEV_OFFLOAD_PACKET; else xso->type = XFRM_DEV_OFFLOAD_CRYPTO; err = dev->xfrmdev_ops->xdo_dev_state_add(dev, x, extack); if (err) { xso->dev = NULL; xso->dir = 0; netdev_put(dev, &xso->dev_tracker); xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; xfrm_unset_type_offload(x); /* User explicitly requested packet offload mode and configured * policy in addition to the XFRM state. So be civil to users, * and return an error instead of taking fallback path. */ if ((err != -EOPNOTSUPP && !is_packet_offload) || is_packet_offload) { NL_SET_ERR_MSG_WEAK(extack, "Device failed to offload this state"); return err; } } return 0; } EXPORT_SYMBOL_GPL(xfrm_dev_state_add); int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, struct xfrm_user_offload *xuo, u8 dir, struct netlink_ext_ack *extack) { struct xfrm_dev_offload *xdo = &xp->xdo; struct net_device *dev; int err; if (!xuo->flags || xuo->flags & ~XFRM_OFFLOAD_PACKET) { /* We support only packet offload mode and it means * that user must set XFRM_OFFLOAD_PACKET bit. */ NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request"); return -EINVAL; } dev = dev_get_by_index(net, xuo->ifindex); if (!dev) return -EINVAL; if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_policy_add) { xdo->dev = NULL; dev_put(dev); NL_SET_ERR_MSG(extack, "Policy offload is not supported"); return -EINVAL; } xdo->dev = dev; netdev_tracker_alloc(dev, &xdo->dev_tracker, GFP_ATOMIC); xdo->type = XFRM_DEV_OFFLOAD_PACKET; switch (dir) { case XFRM_POLICY_IN: xdo->dir = XFRM_DEV_OFFLOAD_IN; break; case XFRM_POLICY_OUT: xdo->dir = XFRM_DEV_OFFLOAD_OUT; break; case XFRM_POLICY_FWD: xdo->dir = XFRM_DEV_OFFLOAD_FWD; break; default: xdo->dev = NULL; netdev_put(dev, &xdo->dev_tracker); NL_SET_ERR_MSG(extack, "Unrecognized offload direction"); return -EINVAL; } err = dev->xfrmdev_ops->xdo_dev_policy_add(xp, extack); if (err) { xdo->dev = NULL; xdo->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; xdo->dir = 0; netdev_put(dev, &xdo->dev_tracker); NL_SET_ERR_MSG_WEAK(extack, "Device failed to offload this policy"); return err; } return 0; } EXPORT_SYMBOL_GPL(xfrm_dev_policy_add); bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) { int mtu; struct dst_entry *dst = skb_dst(skb); struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct net_device *dev = x->xso.dev; bool check_tunnel_size; if (!x->type_offload || (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED && x->encap)) return false; if ((!dev || dev == xfrm_dst_path(dst)->dev) && !xdst->child->xfrm) { mtu = xfrm_state_mtu(x, xdst->child_mtu_cached); if (skb->len <= mtu) goto ok; if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) goto ok; } return false; ok: if (!dev) return true; check_tunnel_size = x->xso.type == XFRM_DEV_OFFLOAD_PACKET && x->props.mode == XFRM_MODE_TUNNEL; switch (skb_dst(skb)->ops->family) { case AF_INET: /* Check for IPv4 options */ if (ip_hdr(skb)->ihl != 5) return false; if (check_tunnel_size && xfrm4_tunnel_check_size(skb)) return 
false; break; case AF_INET6: /* Check for IPv6 extensions */ if (ipv6_ext_hdr(ipv6_hdr(skb)->nexthdr)) return false; if (check_tunnel_size && xfrm6_tunnel_check_size(skb)) return false; break; default: break; } if (dev->xfrmdev_ops->xdo_dev_offload_ok) return dev->xfrmdev_ops->xdo_dev_offload_ok(skb, x); return true; } EXPORT_SYMBOL_GPL(xfrm_dev_offload_ok); void xfrm_dev_resume(struct sk_buff *skb) { struct net_device *dev = skb->dev; int ret = NETDEV_TX_BUSY; struct netdev_queue *txq; struct softnet_data *sd; unsigned long flags; rcu_read_lock(); txq = netdev_core_pick_tx(dev, skb, NULL); HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_stopped(txq)) skb = dev_hard_start_xmit(skb, dev, txq, &ret); HARD_TX_UNLOCK(dev, txq); if (!dev_xmit_complete(ret)) { local_irq_save(flags); sd = this_cpu_ptr(&softnet_data); skb_queue_tail(&sd->xfrm_backlog, skb); raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(xfrm_dev_resume); void xfrm_dev_backlog(struct softnet_data *sd) { struct sk_buff_head *xfrm_backlog = &sd->xfrm_backlog; struct sk_buff_head list; struct sk_buff *skb; if (skb_queue_empty(xfrm_backlog)) return; __skb_queue_head_init(&list); spin_lock(&xfrm_backlog->lock); skb_queue_splice_init(xfrm_backlog, &list); spin_unlock(&xfrm_backlog->lock); while (!skb_queue_empty(&list)) { skb = __skb_dequeue(&list); xfrm_dev_resume(skb); } } #endif static int xfrm_api_check(struct net_device *dev) { #ifdef CONFIG_XFRM_OFFLOAD if ((dev->features & NETIF_F_HW_ESP_TX_CSUM) && !(dev->features & NETIF_F_HW_ESP)) return NOTIFY_BAD; if ((dev->features & NETIF_F_HW_ESP) && (!(dev->xfrmdev_ops && dev->xfrmdev_ops->xdo_dev_state_add && dev->xfrmdev_ops->xdo_dev_state_delete))) return NOTIFY_BAD; #else if (dev->features & (NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM)) return NOTIFY_BAD; #endif return NOTIFY_DONE; } static int xfrm_dev_down(struct net_device *dev) { if (dev->features & NETIF_F_HW_ESP) { xfrm_dev_state_flush(dev_net(dev), dev, true); xfrm_dev_policy_flush(dev_net(dev), dev, true); } return NOTIFY_DONE; } static int xfrm_dev_unregister(struct net_device *dev) { xfrm_dev_state_flush(dev_net(dev), dev, true); xfrm_dev_policy_flush(dev_net(dev), dev, true); return NOTIFY_DONE; } static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_REGISTER: return xfrm_api_check(dev); case NETDEV_FEAT_CHANGE: return xfrm_api_check(dev); case NETDEV_DOWN: return xfrm_dev_down(dev); case NETDEV_UNREGISTER: return xfrm_dev_unregister(dev); } return NOTIFY_DONE; } static struct notifier_block xfrm_dev_notifier = { .notifier_call = xfrm_dev_event, }; void __init xfrm_dev_init(void) { register_netdevice_notifier(&xfrm_dev_notifier); } |
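/*
 * Editor's note: a minimal, illustrative sketch (not part of xfrm_device.c)
 * of what a NIC driver provides so the helpers above will accept it for ESP
 * offload.  The foo_* names are hypothetical; the callback signatures follow
 * the calls made in this file and may differ between kernel versions.
 */
#include <linux/netdevice.h>
#include <net/xfrm.h>

static int foo_xdo_dev_state_add(struct net_device *dev, struct xfrm_state *x,
				 struct netlink_ext_ack *extack)
{
	/* Program the SA into hardware.  Returning -EOPNOTSUPP in crypto
	 * mode makes xfrm_dev_state_add() fall back to software; any other
	 * error is reported back to the user. */
	return 0;
}

static void foo_xdo_dev_state_delete(struct net_device *dev,
				     struct xfrm_state *x)
{
	/* remove the SA from hardware */
}

static bool foo_xdo_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
{
	/* per-packet veto, called from xfrm_dev_offload_ok() above */
	return true;
}

static const struct xfrmdev_ops foo_xfrmdev_ops = {
	.xdo_dev_state_add	= foo_xdo_dev_state_add,
	.xdo_dev_state_delete	= foo_xdo_dev_state_delete,
	.xdo_dev_offload_ok	= foo_xdo_dev_offload_ok,
};

static void foo_enable_esp_offload(struct net_device *netdev)
{
	/* xfrm_api_check() rejects the device if NETIF_F_HW_ESP is set
	 * without the state add/delete callbacks being present. */
	netdev->xfrmdev_ops = &foo_xfrmdev_ops;
	netdev->features |= NETIF_F_HW_ESP;
	netdev->hw_features |= NETIF_F_HW_ESP;
}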
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/kernfs/symlink.c - kernfs symlink implementation
 *
 * Copyright (c) 2001-3 Patrick Mochel
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
 */

#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/namei.h>

#include "kernfs-internal.h"

/**
 * kernfs_create_link - create a symlink
 * @parent: directory to create the symlink in
 * @name: name of the symlink
 * @target: target node for the symlink to point to
 *
 * Return: the created node on success, ERR_PTR() value on error.
 * Ownership of the link matches ownership of the target.
 */
struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
				       const char *name,
				       struct kernfs_node *target)
{
	struct kernfs_node *kn;
	int error;
	kuid_t uid = GLOBAL_ROOT_UID;
	kgid_t gid = GLOBAL_ROOT_GID;

	if (target->iattr) {
		uid = target->iattr->ia_uid;
		gid = target->iattr->ia_gid;
	}

	kn = kernfs_new_node(parent, name, S_IFLNK|0777, uid, gid, KERNFS_LINK);
	if (!kn)
		return ERR_PTR(-ENOMEM);

	if (kernfs_ns_enabled(parent))
		kn->ns = target->ns;
	kn->symlink.target_kn = target;
	kernfs_get(target);	/* ref owned by symlink */

	error = kernfs_add_one(kn);
	if (!error)
		return kn;

	kernfs_put(kn);
	return ERR_PTR(error);
}

static int kernfs_get_target_path(struct kernfs_node *parent,
				  struct kernfs_node *target, char *path)
{
	struct kernfs_node *base, *kn;
	char *s = path;
	int len = 0;

	/* go up to the root, stop at the base */
	base = parent;
	while (kernfs_parent(base)) {
		kn = kernfs_parent(target);
		while (kernfs_parent(kn) && base != kn)
			kn = kernfs_parent(kn);

		if (base == kn)
			break;

		if ((s - path) + 3 >= PATH_MAX)
			return -ENAMETOOLONG;

		strcpy(s, "../");
		s += 3;
		base = kernfs_parent(base);
	}

	/* determine end of target string for reverse fillup */
	kn = target;
	while (kernfs_parent(kn) && kn != base) {
		len += strlen(kernfs_rcu_name(kn)) + 1;
		kn = kernfs_parent(kn);
	}

	/* check limits */
	if (len < 2)
		return -EINVAL;
	len--;
	if ((s - path) + len >= PATH_MAX)
		return -ENAMETOOLONG;

	/* reverse fillup of target string from target to base */
	kn = target;
	while (kernfs_parent(kn) && kn != base) {
		const char *name = kernfs_rcu_name(kn);
		int slen = strlen(name);

		len -= slen;
		memcpy(s + len, name, slen);
		if (len)
			s[--len] = '/';

		kn = kernfs_parent(kn);
	}

	return 0;
}

static int kernfs_getlink(struct inode *inode, char *path)
{
	struct kernfs_node *kn = inode->i_private;
	struct kernfs_node *parent;
	struct kernfs_node *target = kn->symlink.target_kn;
	struct kernfs_root *root = kernfs_root(kn);
	int error;

	down_read(&root->kernfs_rwsem);
	parent = kernfs_parent(kn);
	error = kernfs_get_target_path(parent, target, path);
	up_read(&root->kernfs_rwsem);

	return error;
}

static const char *kernfs_iop_get_link(struct dentry *dentry,
				       struct inode *inode,
				       struct delayed_call *done)
{
	char *body;
	int error;

	if (!dentry)
		return ERR_PTR(-ECHILD);
	body = kzalloc(PAGE_SIZE, GFP_KERNEL);
	if (!body)
		return ERR_PTR(-ENOMEM);
	error = kernfs_getlink(inode, body);
	if (unlikely(error < 0)) {
		kfree(body);
		return ERR_PTR(error);
	}
	set_delayed_call(done, kfree_link, body);
	return body;
}

const struct inode_operations kernfs_symlink_iops = {
	.listxattr	= kernfs_iop_listxattr,
	.get_link	= kernfs_iop_get_link,
	.setattr	= kernfs_iop_setattr,
	.getattr	= kernfs_iop_getattr,
	.permission	= kernfs_iop_permission,
};
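/*
 * Editor's note: a small illustrative sketch (not part of symlink.c) showing
 * the intended use of kernfs_create_link(); this is essentially what
 * sysfs_create_link() boils down to.  The two kernfs nodes are assumed to
 * have been created elsewhere.
 */
#include <linux/err.h>
#include <linux/kernfs.h>

static int example_add_link(struct kernfs_node *parent_dir,
			    struct kernfs_node *target)
{
	struct kernfs_node *kn;

	kn = kernfs_create_link(parent_dir, "example-link", target);
	if (IS_ERR(kn))
		return PTR_ERR(kn);	/* e.g. -ENOMEM, or -EEXIST from kernfs_add_one() */
	return 0;
}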
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* delayacct.h - per-task delay accounting * * Copyright (C) Shailabh Nagar, IBM Corp. 2006 */ #ifndef _LINUX_DELAYACCT_H #define _LINUX_DELAYACCT_H #include <uapi/linux/taskstats.h> #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info { raw_spinlock_t lock; /* For each stat XXX, add following, aligned appropriately * * struct timespec XXX_start, XXX_end; * u64 XXX_delay; * u32 XXX_count; * * Atomicity of updates to XXX_delay, XXX_count protected by * single lock above (split into XXX_lock if contention is an issue). */ /* * XXX_count is incremented on every XXX operation, the delay * associated with the operation is added to XXX_delay. * XXX_delay contains the accumulated delay time in nanoseconds.
*/ u64 blkio_start; u64 blkio_delay_max; u64 blkio_delay_min; u64 blkio_delay; /* wait for sync block io completion */ u64 swapin_start; u64 swapin_delay_max; u64 swapin_delay_min; u64 swapin_delay; /* wait for swapin */ u32 blkio_count; /* total count of the number of sync block */ /* io operations performed */ u32 swapin_count; /* total count of swapin */ u64 freepages_start; u64 freepages_delay_max; u64 freepages_delay_min; u64 freepages_delay; /* wait for memory reclaim */ u64 thrashing_start; u64 thrashing_delay_max; u64 thrashing_delay_min; u64 thrashing_delay; /* wait for thrashing page */ u64 compact_start; u64 compact_delay_max; u64 compact_delay_min; u64 compact_delay; /* wait for memory compact */ u64 wpcopy_start; u64 wpcopy_delay_max; u64 wpcopy_delay_min; u64 wpcopy_delay; /* wait for write-protect copy */ u64 irq_delay_max; u64 irq_delay_min; u64 irq_delay; /* wait for IRQ/SOFTIRQ */ u32 freepages_count; /* total count of memory reclaim */ u32 thrashing_count; /* total count of thrash waits */ u32 compact_count; /* total count of memory compact */ u32 wpcopy_count; /* total count of write-protect copy */ u32 irq_count; /* total count of IRQ/SOFTIRQ */ struct timespec64 blkio_delay_max_ts; struct timespec64 swapin_delay_max_ts; struct timespec64 freepages_delay_max_ts; struct timespec64 thrashing_delay_max_ts; struct timespec64 compact_delay_max_ts; struct timespec64 wpcopy_delay_max_ts; struct timespec64 irq_delay_max_ts; }; #endif #include <linux/sched.h> #include <linux/slab.h> #include <linux/jump_label.h> #ifdef CONFIG_TASK_DELAY_ACCT DECLARE_STATIC_KEY_FALSE(delayacct_key); extern int delayacct_on; /* Delay accounting turned on/off */ extern struct kmem_cache *delayacct_cache; extern void delayacct_init(void); extern void __delayacct_tsk_init(struct task_struct *); extern void __delayacct_tsk_exit(struct task_struct *); extern void __delayacct_blkio_start(void); extern void __delayacct_blkio_end(struct task_struct *); extern int delayacct_add_tsk(struct taskstats *, struct task_struct *); extern __u64 __delayacct_blkio_ticks(struct task_struct *); extern void __delayacct_freepages_start(void); extern void __delayacct_freepages_end(void); extern void __delayacct_thrashing_start(bool *in_thrashing); extern void __delayacct_thrashing_end(bool *in_thrashing); extern void __delayacct_swapin_start(void); extern void __delayacct_swapin_end(void); extern void __delayacct_compact_start(void); extern void __delayacct_compact_end(void); extern void __delayacct_wpcopy_start(void); extern void __delayacct_wpcopy_end(void); extern void __delayacct_irq(struct task_struct *task, u32 delta); static inline void delayacct_tsk_init(struct task_struct *tsk) { /* reinitialize in case parent's non-null pointer was dup'ed*/ tsk->delays = NULL; if (delayacct_on) __delayacct_tsk_init(tsk); } /* Free tsk->delays. 
Called from bad fork and __put_task_struct * where there's no risk of tsk->delays being accessed elsewhere */ static inline void delayacct_tsk_free(struct task_struct *tsk) { if (tsk->delays) kmem_cache_free(delayacct_cache, tsk->delays); tsk->delays = NULL; } static inline void delayacct_blkio_start(void) { if (!static_branch_unlikely(&delayacct_key)) return; if (current->delays) __delayacct_blkio_start(); } static inline void delayacct_blkio_end(struct task_struct *p) { if (!static_branch_unlikely(&delayacct_key)) return; if (p->delays) __delayacct_blkio_end(p); } static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk) { if (tsk->delays) return __delayacct_blkio_ticks(tsk); return 0; } static inline void delayacct_freepages_start(void) { if (!static_branch_unlikely(&delayacct_key)) return; if (current->delays) __delayacct_freepages_start(); } static inline void delayacct_freepages_end(void) { if (!static_branch_unlikely(&delayacct_key)) return; if (current->delays) __delayacct_freepages_end(); } static inline void delayacct_thrashing_start(bool *in_thrashing) { if (!static_branch_unlikely(&delayacct_key)) return; if (current->delays) __delayacct_thrashing_start(in_thrashing); } static inline void delayacct_thrashing_end(bool *in_thrashing) { if (!static_branch_unlikely(&delayacct_key)) return; if (current->delays) __delayacct_thrashing_end(in_thrashing); } static inline void delayacct_swapin_start(void) { if (!static_branch_unlikely(&delayacct_key)) return; if (current->delays) __delayacct_swapin_start(); } static inline void delayacct_swapin_end(void) { if (!static_branch_unlikely(&delayacct_key)) return; if (current->delays) __delayacct_swapin_end(); } static inline void delayacct_compact_start(void) { if (!static_branch_unlikely(&delayacct_key)) return; if (current->delays) __delayacct_compact_start(); } static inline void delayacct_compact_end(void) { if (!static_branch_unlikely(&delayacct_key)) return; if (current->delays) __delayacct_compact_end(); } static inline void delayacct_wpcopy_start(void) { if (!static_branch_unlikely(&delayacct_key)) return; if (current->delays) __delayacct_wpcopy_start(); } static inline void delayacct_wpcopy_end(void) { if (!static_branch_unlikely(&delayacct_key)) return; if (current->delays) __delayacct_wpcopy_end(); } static inline void delayacct_irq(struct task_struct *task, u32 delta) { if (!static_branch_unlikely(&delayacct_key)) return; if (task->delays) __delayacct_irq(task, delta); } #else static inline void delayacct_init(void) {} static inline void delayacct_tsk_init(struct task_struct *tsk) {} static inline void delayacct_tsk_free(struct task_struct *tsk) {} static inline void delayacct_blkio_start(void) {} static inline void delayacct_blkio_end(struct task_struct *p) {} static inline int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) { return 0; } static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk) { return 0; } static inline int delayacct_is_task_waiting_on_io(struct task_struct *p) { return 0; } static inline void delayacct_freepages_start(void) {} static inline void delayacct_freepages_end(void) {} static inline void delayacct_thrashing_start(bool *in_thrashing) {} static inline void delayacct_thrashing_end(bool *in_thrashing) {} static inline void delayacct_swapin_start(void) {} static inline void delayacct_swapin_end(void) {} static inline void delayacct_compact_start(void) {} static inline void delayacct_compact_end(void) {} static inline void delayacct_wpcopy_start(void) {} static inline void 
delayacct_wpcopy_end(void) {} static inline void delayacct_irq(struct task_struct *task, u32 delta) {} #endif /* CONFIG_TASK_DELAY_ACCT */ #endif
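/*
 * Editor's note: an illustrative sketch (not part of delayacct.h) of the
 * bracketing the blkio hooks above expect around a synchronous block-I/O
 * wait.  wait_for_io() is a hypothetical stand-in for whatever actually
 * blocks; real call sites live in the scheduler and block layer.
 */
#include <linux/delayacct.h>
#include <linux/sched.h>

void wait_for_io(void);	/* hypothetical blocking call */

static void example_account_blkio_wait(void)
{
	delayacct_blkio_start();	/* stamps delays->blkio_start */
	wait_for_io();			/* task sleeps waiting for completion */
	delayacct_blkio_end(current);	/* folds the wait into blkio_delay, bumps blkio_count */
}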
// SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/sch_generic.c Generic packet
scheduler routines. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * Jamal Hadi Salim, <hadi@cyberus.ca> 990601 * - Ingress support */ #include <linux/bitops.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/if_vlan.h> #include <linux/skb_array.h> #include <linux/if_macvlan.h> #include <linux/bpf.h> #include <trace/events/qdisc.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/dst.h> #include <net/hotdata.h> #include <trace/events/net.h> #include <net/xfrm.h> /* Qdisc to use by default */ const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops; EXPORT_SYMBOL(default_qdisc_ops); void __tcf_kfree_skb_list(struct sk_buff *skb, struct Qdisc *q, struct netdev_queue *txq, struct net_device *dev) { while (skb) { u32 reason = tc_skb_cb(skb)->drop_reason; struct sk_buff *next = skb->next; enum skb_drop_reason skb_reason; prefetch(next); /* TC classifier and qdisc share drop_reason storage. * Check subsystem mask to identify qdisc drop reasons, * else pass through skb_drop_reason set by TC classifier. */ if ((reason & SKB_DROP_REASON_SUBSYS_MASK) == __QDISC_DROP_REASON) { trace_qdisc_drop(q, txq, dev, skb, (enum qdisc_drop_reason)reason); skb_reason = SKB_DROP_REASON_QDISC_DROP; } else { skb_reason = (enum skb_drop_reason)reason; } kfree_skb_reason(skb, skb_reason); skb = next; } } EXPORT_SYMBOL(__tcf_kfree_skb_list); static void qdisc_maybe_clear_missed(struct Qdisc *q, const struct netdev_queue *txq) { clear_bit(__QDISC_STATE_MISSED, &q->state); /* Make sure the below netif_xmit_frozen_or_stopped() * checking happens after clearing STATE_MISSED. */ smp_mb__after_atomic(); /* Checking netif_xmit_frozen_or_stopped() again to * make sure STATE_MISSED is set if the STATE_MISSED * set by netif_tx_wake_queue()'s rescheduling of * net_tx_action() is cleared by the above clear_bit(). */ if (!netif_xmit_frozen_or_stopped(txq)) set_bit(__QDISC_STATE_MISSED, &q->state); else set_bit(__QDISC_STATE_DRAINING, &q->state); } /* Main transmission queue. */ /* Modifications to data participating in scheduling must be protected with * qdisc_lock(qdisc) spinlock. * * The idea is the following: * - enqueue, dequeue are serialized via qdisc root lock * - ingress filtering is also serialized via qdisc root lock * - updates to tree and tree walking are only done under the rtnl mutex. 
*/ #define SKB_XOFF_MAGIC ((struct sk_buff *)1UL) static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q) { const struct netdev_queue *txq = q->dev_queue; spinlock_t *lock = NULL; struct sk_buff *skb; if (q->flags & TCQ_F_NOLOCK) { lock = qdisc_lock(q); spin_lock(lock); } skb = skb_peek(&q->skb_bad_txq); if (skb) { /* check the reason of requeuing without tx lock first */ txq = skb_get_tx_queue(txq->dev, skb); if (!netif_xmit_frozen_or_stopped(txq)) { skb = __skb_dequeue(&q->skb_bad_txq); if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_dec(q, skb); qdisc_qstats_cpu_qlen_dec(q); } else { qdisc_qstats_backlog_dec(q, skb); q->q.qlen--; } } else { skb = SKB_XOFF_MAGIC; qdisc_maybe_clear_missed(q, txq); } } if (lock) spin_unlock(lock); return skb; } static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *q) { struct sk_buff *skb = skb_peek(&q->skb_bad_txq); if (unlikely(skb)) skb = __skb_dequeue_bad_txq(q); return skb; } static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q, struct sk_buff *skb) { spinlock_t *lock = NULL; if (q->flags & TCQ_F_NOLOCK) { lock = qdisc_lock(q); spin_lock(lock); } __skb_queue_tail(&q->skb_bad_txq, skb); if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_inc(q, skb); qdisc_qstats_cpu_qlen_inc(q); } else { qdisc_qstats_backlog_inc(q, skb); q->q.qlen++; } if (lock) spin_unlock(lock); } static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) { spinlock_t *lock = NULL; if (q->flags & TCQ_F_NOLOCK) { lock = qdisc_lock(q); spin_lock(lock); } while (skb) { struct sk_buff *next = skb->next; __skb_queue_tail(&q->gso_skb, skb); /* it's still part of the queue */ if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_requeues_inc(q); qdisc_qstats_cpu_backlog_inc(q, skb); qdisc_qstats_cpu_qlen_inc(q); } else { q->qstats.requeues++; qdisc_qstats_backlog_inc(q, skb); q->q.qlen++; } skb = next; } if (lock) { spin_unlock(lock); set_bit(__QDISC_STATE_MISSED, &q->state); } else { __netif_schedule(q); } } static void try_bulk_dequeue_skb(struct Qdisc *q, struct sk_buff *skb, const struct netdev_queue *txq, int *packets, int budget) { int bytelimit = qdisc_avail_bulklimit(txq) - skb->len; int cnt = 0; while (bytelimit > 0) { struct sk_buff *nskb = q->dequeue(q); if (!nskb) break; bytelimit -= nskb->len; /* covers GSO len */ skb->next = nskb; skb = nskb; if (++cnt >= budget) break; } (*packets) += cnt; skb_mark_not_on_list(skb); } /* This variant of try_bulk_dequeue_skb() makes sure * all skbs in the chain are for the same txq */ static void try_bulk_dequeue_skb_slow(struct Qdisc *q, struct sk_buff *skb, int *packets) { int mapping = skb_get_queue_mapping(skb); struct sk_buff *nskb; int cnt = 0; do { nskb = q->dequeue(q); if (!nskb) break; if (unlikely(skb_get_queue_mapping(nskb) != mapping)) { qdisc_enqueue_skb_bad_txq(q, nskb); break; } skb->next = nskb; skb = nskb; } while (++cnt < 8); (*packets) += cnt; skb_mark_not_on_list(skb); } /* Note that dequeue_skb can possibly return a SKB list (via skb->next). * A requeued skb (via q->gso_skb) can also be a SKB list. */ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, int *packets, int budget) { const struct netdev_queue *txq = q->dev_queue; struct sk_buff *skb = NULL; *packets = 1; if (unlikely(!skb_queue_empty(&q->gso_skb))) { spinlock_t *lock = NULL; if (q->flags & TCQ_F_NOLOCK) { lock = qdisc_lock(q); spin_lock(lock); } skb = skb_peek(&q->gso_skb); /* skb may be null if another cpu pulls gso_skb off in between * empty check and lock. 
*/ if (!skb) { if (lock) spin_unlock(lock); goto validate; } /* skb in gso_skb were already validated */ *validate = false; if (xfrm_offload(skb)) *validate = true; /* check the reason of requeuing without tx lock first */ txq = skb_get_tx_queue(txq->dev, skb); if (!netif_xmit_frozen_or_stopped(txq)) { skb = __skb_dequeue(&q->gso_skb); if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_dec(q, skb); qdisc_qstats_cpu_qlen_dec(q); } else { qdisc_qstats_backlog_dec(q, skb); q->q.qlen--; } } else { skb = NULL; qdisc_maybe_clear_missed(q, txq); } if (lock) spin_unlock(lock); goto trace; } validate: *validate = true; if ((q->flags & TCQ_F_ONETXQUEUE) && netif_xmit_frozen_or_stopped(txq)) { qdisc_maybe_clear_missed(q, txq); return skb; } skb = qdisc_dequeue_skb_bad_txq(q); if (unlikely(skb)) { if (skb == SKB_XOFF_MAGIC) return NULL; goto bulk; } skb = q->dequeue(q); if (skb) { bulk: if (qdisc_may_bulk(q)) try_bulk_dequeue_skb(q, skb, txq, packets, budget); else try_bulk_dequeue_skb_slow(q, skb, packets); } trace: trace_qdisc_dequeue(q, txq, *packets, skb); return skb; } /* * Transmit possibly several skbs, and handle the return status as * required. Owning qdisc running bit guarantees that only one CPU * can execute this function. * * Returns to the caller: * false - hardware queue frozen backoff * true - feel free to send more pkts */ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq, spinlock_t *root_lock, bool validate) { int ret = NETDEV_TX_BUSY; bool again = false; /* And release qdisc */ if (root_lock) spin_unlock(root_lock); /* Note that we validate skb (GSO, checksum, ...) outside of locks */ if (validate) skb = validate_xmit_skb_list(skb, dev, &again); #ifdef CONFIG_XFRM_OFFLOAD if (unlikely(again)) { if (root_lock) spin_lock(root_lock); dev_requeue_skb(skb, q); return false; } #endif if (likely(skb)) { HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_stopped(txq)) skb = dev_hard_start_xmit(skb, dev, txq, &ret); else qdisc_maybe_clear_missed(q, txq); HARD_TX_UNLOCK(dev, txq); } else { if (root_lock) spin_lock(root_lock); return true; } if (root_lock) spin_lock(root_lock); if (!dev_xmit_complete(ret)) { /* Driver returned NETDEV_TX_BUSY - requeue skb */ if (unlikely(ret != NETDEV_TX_BUSY)) net_warn_ratelimited("BUG %s code %d qlen %d\n", dev->name, ret, q->q.qlen); dev_requeue_skb(skb, q); return false; } return true; } /* * NOTE: Called under qdisc_lock(q) with locally disabled BH. * * running seqcount guarantees only one CPU can process * this qdisc at a time. qdisc_lock(q) serializes queue accesses for * this queue. * * netif_tx_lock serializes accesses to device driver. * * qdisc_lock(q) and netif_tx_lock are mutually exclusive, * if one is grabbed, another must be free. * * Note, that this procedure can be called by a watchdog timer * * Returns to the caller: * 0 - queue is empty or throttled. * >0 - queue is not empty. 
* */ static inline bool qdisc_restart(struct Qdisc *q, int *packets, int budget) { spinlock_t *root_lock = NULL; struct netdev_queue *txq; struct net_device *dev; struct sk_buff *skb; bool validate; /* Dequeue packet */ skb = dequeue_skb(q, &validate, packets, budget); if (unlikely(!skb)) return false; if (!(q->flags & TCQ_F_NOLOCK)) root_lock = qdisc_lock(q); dev = qdisc_dev(q); txq = skb_get_tx_queue(dev, skb); return sch_direct_xmit(skb, q, dev, txq, root_lock, validate); } void __qdisc_run(struct Qdisc *q) { int quota = READ_ONCE(net_hotdata.dev_tx_weight); int packets; while (qdisc_restart(q, &packets, quota)) { quota -= packets; if (quota <= 0) { if (q->flags & TCQ_F_NOLOCK) set_bit(__QDISC_STATE_MISSED, &q->state); else __netif_schedule(q); break; } } } unsigned long dev_trans_start(struct net_device *dev) { unsigned long res = READ_ONCE(netdev_get_tx_queue(dev, 0)->trans_start); unsigned long val; unsigned int i; for (i = 1; i < dev->num_tx_queues; i++) { val = READ_ONCE(netdev_get_tx_queue(dev, i)->trans_start); if (val && time_after(val, res)) res = val; } return res; } EXPORT_SYMBOL(dev_trans_start); static void netif_freeze_queues(struct net_device *dev) { unsigned int i; int cpu; cpu = smp_processor_id(); for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); /* We are the only thread of execution doing a * freeze, but we have to grab the _xmit_lock in * order to synchronize with threads which are in * the ->hard_start_xmit() handler and already * checked the frozen bit. */ __netif_tx_lock(txq, cpu); set_bit(__QUEUE_STATE_FROZEN, &txq->state); __netif_tx_unlock(txq); } } void netif_tx_lock(struct net_device *dev) { spin_lock(&dev->tx_global_lock); netif_freeze_queues(dev); } EXPORT_SYMBOL(netif_tx_lock); static void netif_unfreeze_queues(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); /* No need to grab the _xmit_lock here. If the * queue is not stopped for another reason, we * force a schedule. */ clear_bit(__QUEUE_STATE_FROZEN, &txq->state); netif_schedule_queue(txq); } } void netif_tx_unlock(struct net_device *dev) { netif_unfreeze_queues(dev); spin_unlock(&dev->tx_global_lock); } EXPORT_SYMBOL(netif_tx_unlock); static void dev_watchdog(struct timer_list *t) { struct net_device *dev = timer_container_of(dev, t, watchdog_timer); bool release = true; spin_lock(&dev->tx_global_lock); if (!qdisc_tx_is_noop(dev)) { if (netif_device_present(dev) && netif_running(dev) && netif_carrier_ok(dev)) { unsigned int timedout_ms = 0; unsigned int i; unsigned long trans_start; unsigned long oldest_start = jiffies; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq; txq = netdev_get_tx_queue(dev, i); if (!netif_xmit_stopped(txq)) continue; /* Paired with WRITE_ONCE() + smp_mb...() in * netdev_tx_sent_queue() and netif_tx_stop_queue(). 
*/ smp_mb(); trans_start = READ_ONCE(txq->trans_start); if (time_after(jiffies, trans_start + dev->watchdog_timeo)) { timedout_ms = jiffies_to_msecs(jiffies - trans_start); atomic_long_inc(&txq->trans_timeout); break; } if (time_after(oldest_start, trans_start)) oldest_start = trans_start; } if (unlikely(timedout_ms)) { trace_net_dev_xmit_timeout(dev, i); netdev_crit(dev, "NETDEV WATCHDOG: CPU: %d: transmit queue %u timed out %u ms\n", raw_smp_processor_id(), i, timedout_ms); netif_freeze_queues(dev); dev->netdev_ops->ndo_tx_timeout(dev, i); netif_unfreeze_queues(dev); } if (!mod_timer(&dev->watchdog_timer, round_jiffies(oldest_start + dev->watchdog_timeo))) release = false; } } spin_unlock(&dev->tx_global_lock); if (release) netdev_put(dev, &dev->watchdog_dev_tracker); } void netdev_watchdog_up(struct net_device *dev) { if (!dev->netdev_ops->ndo_tx_timeout) return; if (dev->watchdog_timeo <= 0) dev->watchdog_timeo = 5*HZ; if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo))) netdev_hold(dev, &dev->watchdog_dev_tracker, GFP_ATOMIC); } EXPORT_SYMBOL_GPL(netdev_watchdog_up); static void netdev_watchdog_down(struct net_device *dev) { netif_tx_lock_bh(dev); if (timer_delete(&dev->watchdog_timer)) netdev_put(dev, &dev->watchdog_dev_tracker); netif_tx_unlock_bh(dev); } /** * netif_carrier_on - set carrier * @dev: network device * * Device has detected acquisition of carrier. */ void netif_carrier_on(struct net_device *dev) { if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) { if (dev->reg_state == NETREG_UNINITIALIZED) return; atomic_inc(&dev->carrier_up_count); linkwatch_fire_event(dev); if (netif_running(dev)) netdev_watchdog_up(dev); } } EXPORT_SYMBOL(netif_carrier_on); /** * netif_carrier_off - clear carrier * @dev: network device * * Device has detected loss of carrier. */ void netif_carrier_off(struct net_device *dev) { if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) { if (dev->reg_state == NETREG_UNINITIALIZED) return; atomic_inc(&dev->carrier_down_count); linkwatch_fire_event(dev); } } EXPORT_SYMBOL(netif_carrier_off); /** * netif_carrier_event - report carrier state event * @dev: network device * * Device has detected a carrier event but the carrier state wasn't changed. * Use in drivers when querying carrier state asynchronously, to avoid missing * events (link flaps) if link recovers before it's queried. */ void netif_carrier_event(struct net_device *dev) { if (dev->reg_state == NETREG_UNINITIALIZED) return; atomic_inc(&dev->carrier_up_count); atomic_inc(&dev->carrier_down_count); linkwatch_fire_event(dev); } EXPORT_SYMBOL_GPL(netif_carrier_event); /* "NOOP" scheduler: the best scheduler, recommended for all interfaces under all circumstances. It is difficult to invent anything faster or cheaper. 
*/ static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, struct sk_buff **to_free) { dev_core_stats_tx_dropped_inc(skb->dev); __qdisc_drop(skb, to_free); return NET_XMIT_CN; } static struct sk_buff *noop_dequeue(struct Qdisc *qdisc) { return NULL; } struct Qdisc_ops noop_qdisc_ops __read_mostly = { .id = "noop", .priv_size = 0, .enqueue = noop_enqueue, .dequeue = noop_dequeue, .peek = noop_dequeue, .owner = THIS_MODULE, }; static struct netdev_queue noop_netdev_queue = { RCU_POINTER_INITIALIZER(qdisc, &noop_qdisc), RCU_POINTER_INITIALIZER(qdisc_sleeping, &noop_qdisc), }; struct Qdisc noop_qdisc = { .enqueue = noop_enqueue, .dequeue = noop_dequeue, .flags = TCQ_F_BUILTIN, .ops = &noop_qdisc_ops, .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), .dev_queue = &noop_netdev_queue, .gso_skb = { .next = (struct sk_buff *)&noop_qdisc.gso_skb, .prev = (struct sk_buff *)&noop_qdisc.gso_skb, .qlen = 0, .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.gso_skb.lock), }, .skb_bad_txq = { .next = (struct sk_buff *)&noop_qdisc.skb_bad_txq, .prev = (struct sk_buff *)&noop_qdisc.skb_bad_txq, .qlen = 0, .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock), }, }; EXPORT_SYMBOL(noop_qdisc); static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt, struct netlink_ext_ack *extack) { /* register_qdisc() assigns a default of noop_enqueue if unset, * but __dev_queue_xmit() treats noqueue only as such * if this is NULL - so clear it here. */ qdisc->enqueue = NULL; return 0; } struct Qdisc_ops noqueue_qdisc_ops __read_mostly = { .id = "noqueue", .priv_size = 0, .init = noqueue_init, .enqueue = noop_enqueue, .dequeue = noop_dequeue, .peek = noop_dequeue, .owner = THIS_MODULE, }; const u8 sch_default_prio2band[TC_PRIO_MAX + 1] = { 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }; EXPORT_SYMBOL(sch_default_prio2band); /* 3-band FIFO queue: old style, but should be a bit faster than generic prio+fifo combination. */ #define PFIFO_FAST_BANDS 3 /* * Private data for a pfifo_fast scheduler containing: * - rings for priority bands */ struct pfifo_fast_priv { struct skb_array q[PFIFO_FAST_BANDS]; }; static inline struct skb_array *band2list(struct pfifo_fast_priv *priv, int band) { return &priv->q[band]; } static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, struct sk_buff **to_free) { int band = sch_default_prio2band[skb->priority & TC_PRIO_MAX]; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); struct skb_array *q = band2list(priv, band); unsigned int pkt_len = qdisc_pkt_len(skb); int err; err = skb_array_produce(q, skb); if (unlikely(err)) { tcf_set_qdisc_drop_reason(skb, QDISC_DROP_OVERLIMIT); if (qdisc_is_percpu_stats(qdisc)) return qdisc_drop_cpu(skb, qdisc, to_free); else return qdisc_drop(skb, qdisc, to_free); } qdisc_update_stats_at_enqueue(qdisc, pkt_len); return NET_XMIT_SUCCESS; } static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) { struct pfifo_fast_priv *priv = qdisc_priv(qdisc); struct sk_buff *skb = NULL; bool need_retry = true; int band; retry: for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) { struct skb_array *q = band2list(priv, band); if (__skb_array_empty(q)) continue; skb = __skb_array_consume(q); } if (likely(skb)) { qdisc_update_stats_at_dequeue(qdisc, skb); } else if (need_retry && READ_ONCE(qdisc->state) & QDISC_STATE_NON_EMPTY) { /* Delay clearing the STATE_MISSED here to reduce * the overhead of the second spin_trylock() in * qdisc_run_begin() and __netif_schedule() calling * in qdisc_run_end(). 
*/ clear_bit(__QDISC_STATE_MISSED, &qdisc->state); clear_bit(__QDISC_STATE_DRAINING, &qdisc->state); /* Make sure dequeuing happens after clearing * STATE_MISSED. */ smp_mb__after_atomic(); need_retry = false; goto retry; } return skb; } static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc) { struct pfifo_fast_priv *priv = qdisc_priv(qdisc); struct sk_buff *skb = NULL; int band; for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) { struct skb_array *q = band2list(priv, band); skb = __skb_array_peek(q); } return skb; } static void pfifo_fast_reset(struct Qdisc *qdisc) { int i, band; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); for (band = 0; band < PFIFO_FAST_BANDS; band++) { struct skb_array *q = band2list(priv, band); struct sk_buff *skb; /* NULL ring is possible if destroy path is due to a failed * skb_array_init() in pfifo_fast_init() case. */ if (!q->ring.queue) continue; while ((skb = __skb_array_consume(q)) != NULL) rtnl_kfree_skbs(skb, skb); } if (qdisc_is_percpu_stats(qdisc)) { for_each_possible_cpu(i) { struct gnet_stats_queue *q; q = per_cpu_ptr(qdisc->cpu_qstats, i); q->backlog = 0; q->qlen = 0; } } } static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb) { struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS }; memcpy(&opt.priomap, sch_default_prio2band, TC_PRIO_MAX + 1); if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; return skb->len; nla_put_failure: return -1; } static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt, struct netlink_ext_ack *extack) { unsigned int qlen = qdisc_dev(qdisc)->tx_queue_len; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); int prio; /* guard against zero length rings */ if (!qlen) return -EINVAL; for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { struct skb_array *q = band2list(priv, prio); int err; err = skb_array_init(q, qlen, GFP_KERNEL); if (err) return -ENOMEM; } /* Can by-pass the queue discipline */ qdisc->flags |= TCQ_F_CAN_BYPASS; return 0; } static void pfifo_fast_destroy(struct Qdisc *sch) { struct pfifo_fast_priv *priv = qdisc_priv(sch); int prio; for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { struct skb_array *q = band2list(priv, prio); /* NULL ring is possible if destroy path is due to a failed * skb_array_init() in pfifo_fast_init() case. */ if (!q->ring.queue) continue; /* Destroy ring but no need to kfree_skb because a call to * pfifo_fast_reset() has already done that work. 
*/ ptr_ring_cleanup(&q->ring, NULL); } } static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch, unsigned int new_len) { struct pfifo_fast_priv *priv = qdisc_priv(sch); struct skb_array *bands[PFIFO_FAST_BANDS]; int prio; for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { struct skb_array *q = band2list(priv, prio); bands[prio] = q; } return skb_array_resize_multiple_bh(bands, PFIFO_FAST_BANDS, new_len, GFP_KERNEL); } struct Qdisc_ops pfifo_fast_ops __read_mostly = { .id = "pfifo_fast", .priv_size = sizeof(struct pfifo_fast_priv), .enqueue = pfifo_fast_enqueue, .dequeue = pfifo_fast_dequeue, .peek = pfifo_fast_peek, .init = pfifo_fast_init, .destroy = pfifo_fast_destroy, .reset = pfifo_fast_reset, .dump = pfifo_fast_dump, .change_tx_queue_len = pfifo_fast_change_tx_queue_len, .owner = THIS_MODULE, .static_flags = TCQ_F_NOLOCK | TCQ_F_CPUSTATS, }; EXPORT_SYMBOL(pfifo_fast_ops); static struct lock_class_key qdisc_tx_busylock; struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, struct netlink_ext_ack *extack) { struct Qdisc *sch; unsigned int size = sizeof(*sch) + ops->priv_size; int err = -ENOBUFS; struct net_device *dev; if (!dev_queue) { NL_SET_ERR_MSG(extack, "No device queue given"); err = -EINVAL; goto errout; } dev = dev_queue->dev; sch = kzalloc_node(size, GFP_KERNEL, netdev_queue_numa_node_read(dev_queue)); if (!sch) goto errout; __skb_queue_head_init(&sch->gso_skb); __skb_queue_head_init(&sch->skb_bad_txq); gnet_stats_basic_sync_init(&sch->bstats); qdisc_lock_init(sch, ops); if (ops->static_flags & TCQ_F_CPUSTATS) { sch->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); if (!sch->cpu_bstats) goto errout1; sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue); if (!sch->cpu_qstats) { free_percpu(sch->cpu_bstats); goto errout1; } } /* seqlock has the same scope of busylock, for NOLOCK qdisc */ spin_lock_init(&sch->seqlock); lockdep_set_class(&sch->seqlock, dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); sch->ops = ops; sch->flags = ops->static_flags; sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev_queue = dev_queue; netdev_hold(dev, &sch->dev_tracker, GFP_KERNEL); refcount_set(&sch->refcnt, 1); return sch; errout1: qdisc_lock_uninit(sch, ops); kfree(sch); errout: return ERR_PTR(err); } struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, unsigned int parentid, struct netlink_ext_ack *extack) { struct Qdisc *sch; if (!bpf_try_module_get(ops, ops->owner)) { NL_SET_ERR_MSG(extack, "Failed to increase module reference counter"); return NULL; } sch = qdisc_alloc(dev_queue, ops, extack); if (IS_ERR(sch)) { bpf_module_put(ops, ops->owner); return NULL; } sch->parent = parentid; if (!ops->init || ops->init(sch, NULL, extack) == 0) { trace_qdisc_create(ops, dev_queue->dev, parentid); return sch; } qdisc_put(sch); return NULL; } EXPORT_SYMBOL(qdisc_create_dflt); /* Under qdisc_lock(qdisc) and BH! 
*/ void qdisc_reset(struct Qdisc *qdisc) { const struct Qdisc_ops *ops = qdisc->ops; trace_qdisc_reset(qdisc); if (ops->reset) ops->reset(qdisc); __skb_queue_purge(&qdisc->gso_skb); __skb_queue_purge(&qdisc->skb_bad_txq); qdisc->q.qlen = 0; qdisc->qstats.backlog = 0; } EXPORT_SYMBOL(qdisc_reset); void qdisc_free(struct Qdisc *qdisc) { if (qdisc_is_percpu_stats(qdisc)) { free_percpu(qdisc->cpu_bstats); free_percpu(qdisc->cpu_qstats); } kfree(qdisc); } static void qdisc_free_cb(struct rcu_head *head) { struct Qdisc *q = container_of(head, struct Qdisc, rcu); qdisc_free(q); } static void __qdisc_destroy(struct Qdisc *qdisc) { const struct Qdisc_ops *ops = qdisc->ops; struct net_device *dev = qdisc_dev(qdisc); #ifdef CONFIG_NET_SCHED qdisc_hash_del(qdisc); qdisc_put_stab(rtnl_dereference(qdisc->stab)); #endif gen_kill_estimator(&qdisc->rate_est); qdisc_reset(qdisc); if (ops->destroy) ops->destroy(qdisc); qdisc_lock_uninit(qdisc, ops); bpf_module_put(ops, ops->owner); netdev_put(dev, &qdisc->dev_tracker); trace_qdisc_destroy(qdisc); call_rcu(&qdisc->rcu, qdisc_free_cb); } void qdisc_destroy(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN) return; __qdisc_destroy(qdisc); } void qdisc_put(struct Qdisc *qdisc) { if (!qdisc) return; if (qdisc->flags & TCQ_F_BUILTIN || !refcount_dec_and_test(&qdisc->refcnt)) return; __qdisc_destroy(qdisc); } EXPORT_SYMBOL(qdisc_put); /* Version of qdisc_put() that is called with rtnl mutex unlocked. * Intended to be used as optimization, this function only takes rtnl lock if * qdisc reference counter reached zero. */ void qdisc_put_unlocked(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN || !refcount_dec_and_rtnl_lock(&qdisc->refcnt)) return; __qdisc_destroy(qdisc); rtnl_unlock(); } EXPORT_SYMBOL(qdisc_put_unlocked); /* Attach toplevel qdisc to device queue. */ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, struct Qdisc *qdisc) { struct Qdisc *oqdisc = rtnl_dereference(dev_queue->qdisc_sleeping); spinlock_t *root_lock; root_lock = qdisc_lock(oqdisc); spin_lock_bh(root_lock); /* ... 
and graft new one */ if (qdisc == NULL) qdisc = &noop_qdisc; rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc); rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc); spin_unlock_bh(root_lock); return oqdisc; } EXPORT_SYMBOL(dev_graft_qdisc); static void shutdown_scheduler_queue(struct net_device *dev, struct netdev_queue *dev_queue, void *_qdisc_default) { struct Qdisc *qdisc = rtnl_dereference(dev_queue->qdisc_sleeping); struct Qdisc *qdisc_default = _qdisc_default; if (qdisc) { rcu_assign_pointer(dev_queue->qdisc, qdisc_default); rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc_default); qdisc_put(qdisc); } } static void attach_one_default_qdisc(struct net_device *dev, struct netdev_queue *dev_queue, void *_unused) { struct Qdisc *qdisc; const struct Qdisc_ops *ops = default_qdisc_ops; if (dev->priv_flags & IFF_NO_QUEUE) ops = &noqueue_qdisc_ops; else if(dev->type == ARPHRD_CAN) ops = &pfifo_fast_ops; qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL); if (!qdisc) return; if (!netif_is_multiqueue(dev)) qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc); } static void attach_default_qdiscs(struct net_device *dev) { struct netdev_queue *txq; struct Qdisc *qdisc; txq = netdev_get_tx_queue(dev, 0); if (!netif_is_multiqueue(dev) || dev->priv_flags & IFF_NO_QUEUE) { netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); qdisc = rtnl_dereference(txq->qdisc_sleeping); rcu_assign_pointer(dev->qdisc, qdisc); qdisc_refcount_inc(qdisc); } else { qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT, NULL); if (qdisc) { rcu_assign_pointer(dev->qdisc, qdisc); qdisc->ops->attach(qdisc); } } qdisc = rtnl_dereference(dev->qdisc); /* Detect default qdisc setup/init failed and fallback to "noqueue" */ if (qdisc == &noop_qdisc) { netdev_warn(dev, "default qdisc (%s) fail, fallback to %s\n", default_qdisc_ops->id, noqueue_qdisc_ops.id); netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); dev->priv_flags |= IFF_NO_QUEUE; netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); qdisc = rtnl_dereference(txq->qdisc_sleeping); rcu_assign_pointer(dev->qdisc, qdisc); qdisc_refcount_inc(qdisc); dev->priv_flags ^= IFF_NO_QUEUE; } #ifdef CONFIG_NET_SCHED if (qdisc != &noop_qdisc) qdisc_hash_add(qdisc, false); #endif } static void transition_one_qdisc(struct net_device *dev, struct netdev_queue *dev_queue, void *_need_watchdog) { struct Qdisc *new_qdisc = rtnl_dereference(dev_queue->qdisc_sleeping); int *need_watchdog_p = _need_watchdog; if (!(new_qdisc->flags & TCQ_F_BUILTIN)) clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state); rcu_assign_pointer(dev_queue->qdisc, new_qdisc); if (need_watchdog_p) { WRITE_ONCE(dev_queue->trans_start, 0); *need_watchdog_p = 1; } } void dev_activate(struct net_device *dev) { int need_watchdog; /* No queueing discipline is attached to device; * create default one for devices, which need queueing * and noqueue_qdisc for virtual interfaces */ if (rtnl_dereference(dev->qdisc) == &noop_qdisc) attach_default_qdiscs(dev); if (!netif_carrier_ok(dev)) /* Delay activation until next carrier-on event */ return; need_watchdog = 0; netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog); if (dev_ingress_queue(dev)) transition_one_qdisc(dev, dev_ingress_queue(dev), NULL); if (need_watchdog) { netif_trans_update(dev); netdev_watchdog_up(dev); } } EXPORT_SYMBOL(dev_activate); static void qdisc_deactivate(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN) return; 
set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state); } static void dev_deactivate_queue(struct net_device *dev, struct netdev_queue *dev_queue, void *_sync_needed) { bool *sync_needed = _sync_needed; struct Qdisc *qdisc; qdisc = rtnl_dereference(dev_queue->qdisc); if (qdisc) { if (qdisc->enqueue) *sync_needed = true; qdisc_deactivate(qdisc); rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc); } } static bool some_qdisc_is_busy(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *dev_queue; spinlock_t *root_lock; struct Qdisc *q; int val; dev_queue = netdev_get_tx_queue(dev, i); q = rtnl_dereference(dev_queue->qdisc_sleeping); root_lock = qdisc_lock(q); spin_lock_bh(root_lock); val = (qdisc_is_running(q) || test_bit(__QDISC_STATE_SCHED, &q->state)); spin_unlock_bh(root_lock); if (val) return true; } return false; } /** * dev_deactivate_many - deactivate transmissions on several devices * @head: list of devices to deactivate * @reset_needed: qdisc should be reset if true. * * This function returns only when all outstanding transmissions * have completed, unless all devices are in dismantle phase. */ void dev_deactivate_many(struct list_head *head, bool reset_needed) { bool sync_needed = false; struct net_device *dev; list_for_each_entry(dev, head, close_list) { netdev_for_each_tx_queue(dev, dev_deactivate_queue, &sync_needed); if (dev_ingress_queue(dev)) dev_deactivate_queue(dev, dev_ingress_queue(dev), &sync_needed); netdev_watchdog_down(dev); } /* Wait for outstanding qdisc enqueuing calls. */ if (sync_needed) synchronize_net(); if (reset_needed) { list_for_each_entry(dev, head, close_list) { netdev_for_each_tx_queue(dev, dev_reset_queue, NULL); if (dev_ingress_queue(dev)) dev_reset_queue(dev, dev_ingress_queue(dev), NULL); } } /* Wait for outstanding qdisc_run calls. */ list_for_each_entry(dev, head, close_list) { while (some_qdisc_is_busy(dev)) { /* wait_event() would avoid this sleep-loop but would * require expensive checks in the fast paths of packet * processing which isn't worth it. */ schedule_timeout_uninterruptible(1); } } } void dev_deactivate(struct net_device *dev, bool reset_needed) { LIST_HEAD(single); list_add(&dev->close_list, &single); dev_deactivate_many(&single, reset_needed); list_del(&single); } EXPORT_SYMBOL(dev_deactivate); static int qdisc_change_tx_queue_len(struct net_device *dev, struct netdev_queue *dev_queue) { struct Qdisc *qdisc = rtnl_dereference(dev_queue->qdisc_sleeping); const struct Qdisc_ops *ops = qdisc->ops; if (ops->change_tx_queue_len) return ops->change_tx_queue_len(qdisc, dev->tx_queue_len); return 0; } void dev_qdisc_change_real_num_tx(struct net_device *dev, unsigned int new_real_tx) { struct Qdisc *qdisc = rtnl_dereference(dev->qdisc); if (qdisc->ops->change_real_num_tx) qdisc->ops->change_real_num_tx(qdisc, new_real_tx); } void mq_change_real_num_tx(struct Qdisc *sch, unsigned int new_real_tx) { #ifdef CONFIG_NET_SCHED struct net_device *dev = qdisc_dev(sch); struct Qdisc *qdisc; unsigned int i; for (i = new_real_tx; i < dev->real_num_tx_queues; i++) { qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc_sleeping); /* Only update the default qdiscs we created, * qdiscs with handles are always hashed. 
*/ if (qdisc != &noop_qdisc && !qdisc->handle) qdisc_hash_del(qdisc); } for (i = dev->real_num_tx_queues; i < new_real_tx; i++) { qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc_sleeping); if (qdisc != &noop_qdisc && !qdisc->handle) qdisc_hash_add(qdisc, false); } #endif } EXPORT_SYMBOL(mq_change_real_num_tx); int dev_qdisc_change_tx_queue_len(struct net_device *dev) { bool up = dev->flags & IFF_UP; unsigned int i; int ret = 0; if (up) dev_deactivate(dev, false); for (i = 0; i < dev->num_tx_queues; i++) { ret = qdisc_change_tx_queue_len(dev, &dev->_tx[i]); /* TODO: revert changes on a partial failure */ if (ret) break; } if (up) dev_activate(dev); return ret; } static void dev_init_scheduler_queue(struct net_device *dev, struct netdev_queue *dev_queue, void *_qdisc) { struct Qdisc *qdisc = _qdisc; rcu_assign_pointer(dev_queue->qdisc, qdisc); rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc); } void dev_init_scheduler(struct net_device *dev) { rcu_assign_pointer(dev->qdisc, &noop_qdisc); netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc); if (dev_ingress_queue(dev)) dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); timer_setup(&dev->watchdog_timer, dev_watchdog, 0); } void dev_shutdown(struct net_device *dev) { netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); if (dev_ingress_queue(dev)) shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); qdisc_put(rtnl_dereference(dev->qdisc)); rcu_assign_pointer(dev->qdisc, &noop_qdisc); WARN_ON(timer_pending(&dev->watchdog_timer)); } /** * psched_ratecfg_precompute__() - Pre-compute values for reciprocal division * @rate: Rate to compute reciprocal division values of * @mult: Multiplier for reciprocal division * @shift: Shift for reciprocal division * * The multiplier and shift for reciprocal division by rate are stored * in mult and shift. * * The deal here is to replace a divide by a reciprocal one * in fast path (a reciprocal divide is a multiply and a shift) * * Normal formula would be : * time_in_ns = (NSEC_PER_SEC * len) / rate_bps * * We compute mult/shift to use instead : * time_in_ns = (len * mult) >> shift; * * We try to get the highest possible mult value for accuracy, * but have to make sure no overflows will ever happen. * * reciprocal_value() is not used here it doesn't handle 64-bit values. */ static void psched_ratecfg_precompute__(u64 rate, u32 *mult, u8 *shift) { u64 factor = NSEC_PER_SEC; *mult = 1; *shift = 0; if (rate <= 0) return; for (;;) { *mult = div64_u64(factor, rate); if (*mult & (1U << 31) || factor & (1ULL << 63)) break; factor <<= 1; (*shift)++; } } void psched_ratecfg_precompute(struct psched_ratecfg *r, const struct tc_ratespec *conf, u64 rate64) { memset(r, 0, sizeof(*r)); r->overhead = conf->overhead; r->mpu = conf->mpu; r->rate_bytes_ps = max_t(u64, conf->rate, rate64); r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK); psched_ratecfg_precompute__(r->rate_bytes_ps, &r->mult, &r->shift); } EXPORT_SYMBOL(psched_ratecfg_precompute); void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64) { r->rate_pkts_ps = pktrate64; psched_ratecfg_precompute__(r->rate_pkts_ps, &r->mult, &r->shift); } EXPORT_SYMBOL(psched_ppscfg_precompute); void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, struct tcf_proto *tp_head) { /* Protected with chain0->filter_chain_lock. * Can't access chain directly because tp_head can be NULL. 
*/ struct mini_Qdisc *miniq_old = rcu_dereference_protected(*miniqp->p_miniq, 1); struct mini_Qdisc *miniq; if (!tp_head) { RCU_INIT_POINTER(*miniqp->p_miniq, NULL); } else { miniq = miniq_old != &miniqp->miniq1 ? &miniqp->miniq1 : &miniqp->miniq2; /* We need to make sure that readers won't see the miniq * we are about to modify. So ensure that at least one RCU * grace period has elapsed since the miniq was made * inactive. */ if (IS_ENABLED(CONFIG_PREEMPT_RT)) cond_synchronize_rcu(miniq->rcu_state); else if (!poll_state_synchronize_rcu(miniq->rcu_state)) synchronize_rcu_expedited(); miniq->filter_list = tp_head; rcu_assign_pointer(*miniqp->p_miniq, miniq); } if (miniq_old) /* This is counterpart of the rcu sync above. We need to * block potential new user of miniq_old until all readers * are not seeing it. */ miniq_old->rcu_state = start_poll_synchronize_rcu(); } EXPORT_SYMBOL(mini_qdisc_pair_swap); void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp, struct tcf_block *block) { miniqp->miniq1.block = block; miniqp->miniq2.block = block; } EXPORT_SYMBOL(mini_qdisc_pair_block_init); void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc, struct mini_Qdisc __rcu **p_miniq) { miniqp->miniq1.cpu_bstats = qdisc->cpu_bstats; miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats; miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats; miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats; miniqp->miniq1.rcu_state = get_state_synchronize_rcu(); miniqp->miniq2.rcu_state = miniqp->miniq1.rcu_state; miniqp->p_miniq = p_miniq; } EXPORT_SYMBOL(mini_qdisc_pair_init); |
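The psched_ratecfg_precompute__() comment above replaces a division in the packet fast path with a multiply and a shift: instead of time_in_ns = (NSEC_PER_SEC * len) / rate_bps it evaluates time_in_ns = (len * mult) >> shift. The stand-alone userspace sketch below is only an illustration of that idea; the helper name, the test rate, and the use of plain division in place of div64_u64() are assumptions made for the example and are not part of the kernel source.

/*
 * Illustrative userspace sketch (not kernel code): compute a mult/shift
 * pair the same way as the loop above and compare the approximation
 * against the exact division for one 1500-byte frame at 1 Gbit/s.
 */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

static void precompute(uint64_t rate, uint32_t *mult, uint8_t *shift)
{
	uint64_t factor = NSEC_PER_SEC;

	*mult = 1;
	*shift = 0;
	if (!rate)
		return;
	for (;;) {
		*mult = factor / rate;	/* kernel uses div64_u64() here */
		if (*mult & (1U << 31) || factor & (1ULL << 63))
			break;
		factor <<= 1;
		(*shift)++;
	}
}

int main(void)
{
	uint64_t rate = 125000000;	/* 1 Gbit/s expressed in bytes per second */
	uint64_t len = 1500;		/* one full-sized Ethernet frame */
	uint32_t mult;
	uint8_t shift;

	precompute(rate, &mult, &shift);
	printf("exact : %llu ns\n",
	       (unsigned long long)(NSEC_PER_SEC * len / rate));
	printf("approx: %llu ns (mult=%u shift=%u)\n",
	       (unsigned long long)((len * mult) >> shift), mult, shift);
	return 0;
}

Both lines print 12000 ns for this input, which is the point of the precomputation: the fast path pays only for a multiply and a shift.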
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Authentication token and access key management
 *
 * Copyright (C) 2004, 2007 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * See Documentation/security/keys/core.rst for information on keys/keyrings.
 */

#ifndef _LINUX_KEY_H
#define _LINUX_KEY_H

#include <linux/types.h>
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/rcupdate.h>
#include <linux/sysctl.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>
#include <linux/assoc_array.h>
#include <linux/refcount.h>
#include <linux/time64.h>

#ifdef __KERNEL__
#include <linux/uidgid.h>

/* key handle serial number */
typedef int32_t key_serial_t;

/* key handle permissions mask */
typedef uint32_t key_perm_t;

struct key;
struct net;

#ifdef CONFIG_KEYS

#undef KEY_DEBUGGING

#define KEY_POS_VIEW	0x01000000	/* possessor can view a key's attributes */
#define KEY_POS_READ	0x02000000	/* possessor can read key payload / view keyring */
#define KEY_POS_WRITE	0x04000000	/* possessor can update key payload / add link to keyring */
#define KEY_POS_SEARCH	0x08000000	/* possessor can find a key in search / search a keyring */
#define KEY_POS_LINK	0x10000000	/* possessor can create a link to a key/keyring */
#define KEY_POS_SETATTR	0x20000000	/* possessor can set key attributes */
#define KEY_POS_ALL	0x3f000000

#define KEY_USR_VIEW	0x00010000	/* user permissions...
*/ #define KEY_USR_READ 0x00020000 #define KEY_USR_WRITE 0x00040000 #define KEY_USR_SEARCH 0x00080000 #define KEY_USR_LINK 0x00100000 #define KEY_USR_SETATTR 0x00200000 #define KEY_USR_ALL 0x003f0000 #define KEY_GRP_VIEW 0x00000100 /* group permissions... */ #define KEY_GRP_READ 0x00000200 #define KEY_GRP_WRITE 0x00000400 #define KEY_GRP_SEARCH 0x00000800 #define KEY_GRP_LINK 0x00001000 #define KEY_GRP_SETATTR 0x00002000 #define KEY_GRP_ALL 0x00003f00 #define KEY_OTH_VIEW 0x00000001 /* third party permissions... */ #define KEY_OTH_READ 0x00000002 #define KEY_OTH_WRITE 0x00000004 #define KEY_OTH_SEARCH 0x00000008 #define KEY_OTH_LINK 0x00000010 #define KEY_OTH_SETATTR 0x00000020 #define KEY_OTH_ALL 0x0000003f #define KEY_PERM_UNDEF 0xffffffff /* * The permissions required on a key that we're looking up. */ enum key_need_perm { KEY_NEED_UNSPECIFIED, /* Needed permission unspecified */ KEY_NEED_VIEW, /* Require permission to view attributes */ KEY_NEED_READ, /* Require permission to read content */ KEY_NEED_WRITE, /* Require permission to update / modify */ KEY_NEED_SEARCH, /* Require permission to search (keyring) or find (key) */ KEY_NEED_LINK, /* Require permission to link */ KEY_NEED_SETATTR, /* Require permission to change attributes */ KEY_NEED_UNLINK, /* Require permission to unlink key */ KEY_SYSADMIN_OVERRIDE, /* Special: override by CAP_SYS_ADMIN */ KEY_AUTHTOKEN_OVERRIDE, /* Special: override by possession of auth token */ KEY_DEFER_PERM_CHECK, /* Special: permission check is deferred */ }; enum key_lookup_flag { KEY_LOOKUP_CREATE = 0x01, KEY_LOOKUP_PARTIAL = 0x02, KEY_LOOKUP_ALL = (KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL), }; struct seq_file; struct user_struct; struct signal_struct; struct cred; struct key_type; struct key_owner; struct key_tag; struct keyring_list; struct keyring_name; struct key_tag { struct rcu_head rcu; refcount_t usage; bool removed; /* T when subject removed */ }; struct keyring_index_key { /* [!] If this structure is altered, the union in struct key must change too! */ unsigned long hash; /* Hash value */ union { struct { #ifdef __LITTLE_ENDIAN /* Put desc_len at the LSB of x */ u16 desc_len; char desc[sizeof(long) - 2]; /* First few chars of description */ #else char desc[sizeof(long) - 2]; /* First few chars of description */ u16 desc_len; #endif }; unsigned long x; }; struct key_type *type; struct key_tag *domain_tag; /* Domain of operation */ const char *description; }; union key_payload { void __rcu *rcu_data0; void *data[4]; }; /*****************************************************************************/ /* * key reference with possession attribute handling * * NOTE! key_ref_t is a typedef'd pointer to a type that is not actually * defined. This is because we abuse the bottom bit of the reference to carry a * flag to indicate whether the calling process possesses that key in one of * its keyrings. * * the key_ref_t has been made a separate type so that the compiler can reject * attempts to dereference it without proper conversion. 
* * the three functions are used to assemble and disassemble references */ typedef struct __key_reference_with_attributes *key_ref_t; static inline key_ref_t make_key_ref(const struct key *key, bool possession) { return (key_ref_t) ((unsigned long) key | possession); } static inline struct key *key_ref_to_ptr(const key_ref_t key_ref) { return (struct key *) ((unsigned long) key_ref & ~1UL); } static inline bool is_key_possessed(const key_ref_t key_ref) { return (unsigned long) key_ref & 1UL; } typedef int (*key_restrict_link_func_t)(struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *restriction_key); struct key_restriction { key_restrict_link_func_t check; struct key *key; struct key_type *keytype; }; enum key_state { KEY_IS_UNINSTANTIATED, KEY_IS_POSITIVE, /* Positively instantiated */ }; /*****************************************************************************/ /* * authentication token / access credential / keyring * - types of key include: * - keyrings * - disk encryption IDs * - Kerberos TGTs and tickets */ struct key { refcount_t usage; /* number of references */ key_serial_t serial; /* key serial number */ union { struct list_head graveyard_link; struct rb_node serial_node; }; #ifdef CONFIG_KEY_NOTIFICATIONS struct watch_list *watchers; /* Entities watching this key for changes */ #endif struct rw_semaphore sem; /* change vs change sem */ struct key_user *user; /* owner of this key */ void *security; /* security data for this key */ union { time64_t expiry; /* time at which key expires (or 0) */ time64_t revoked_at; /* time at which key was revoked */ }; time64_t last_used_at; /* last time used for LRU keyring discard */ kuid_t uid; kgid_t gid; key_perm_t perm; /* access permissions */ unsigned short quotalen; /* length added to quota */ unsigned short datalen; /* payload data length * - may not match RCU dereferenced payload * - payload should contain own length */ short state; /* Key state (+) or rejection error (-) */ #ifdef KEY_DEBUGGING unsigned magic; #define KEY_DEBUG_MAGIC 0x18273645u #endif unsigned long flags; /* status flags (change with bitops) */ #define KEY_FLAG_DEAD 0 /* set if key type has been deleted */ #define KEY_FLAG_REVOKED 1 /* set if key had been revoked */ #define KEY_FLAG_IN_QUOTA 2 /* set if key consumes quota */ #define KEY_FLAG_USER_CONSTRUCT 3 /* set if key is being constructed in userspace */ #define KEY_FLAG_ROOT_CAN_CLEAR 4 /* set if key can be cleared by root without permission */ #define KEY_FLAG_INVALIDATED 5 /* set if key has been invalidated */ #define KEY_FLAG_BUILTIN 6 /* set if key is built in to the kernel */ #define KEY_FLAG_ROOT_CAN_INVAL 7 /* set if key can be invalidated by root without permission */ #define KEY_FLAG_KEEP 8 /* set if key should not be removed */ #define KEY_FLAG_UID_KEYRING 9 /* set if key is a user or user session keyring */ #define KEY_FLAG_USER_ALIVE 10 /* set if final put has not happened on key yet */ /* the key type and key description string * - the desc is used to match a key against search criteria * - it should be a printable string * - eg: for krb5 AFS, this might be "afs@REDHAT.COM" */ union { struct keyring_index_key index_key; struct { unsigned long hash; unsigned long len_desc; struct key_type *type; /* type of key */ struct key_tag *domain_tag; /* Domain of operation */ char *description; }; }; /* key data * - this is used to hold the data actually used in cryptography or * whatever */ union { union key_payload payload; struct { /* Keyring bits */ 
struct list_head name_link; struct assoc_array keys; }; }; /* This is set on a keyring to restrict the addition of a link to a key * to it. If this structure isn't provided then it is assumed that the * keyring is open to any addition. It is ignored for non-keyring * keys. Only set this value using keyring_restrict(), keyring_alloc(), * or key_alloc(). * * This is intended for use with rings of trusted keys whereby addition * to the keyring needs to be controlled. KEY_ALLOC_BYPASS_RESTRICTION * overrides this, allowing the kernel to add extra keys without * restriction. */ struct key_restriction *restrict_link; }; extern struct key *key_alloc(struct key_type *type, const char *desc, kuid_t uid, kgid_t gid, const struct cred *cred, key_perm_t perm, unsigned long flags, struct key_restriction *restrict_link); #define KEY_ALLOC_IN_QUOTA 0x0000 /* add to quota, reject if would overrun */ #define KEY_ALLOC_QUOTA_OVERRUN 0x0001 /* add to quota, permit even if overrun */ #define KEY_ALLOC_NOT_IN_QUOTA 0x0002 /* not in quota */ #define KEY_ALLOC_BUILT_IN 0x0004 /* Key is built into kernel */ #define KEY_ALLOC_BYPASS_RESTRICTION 0x0008 /* Override the check on restricted keyrings */ #define KEY_ALLOC_UID_KEYRING 0x0010 /* allocating a user or user session keyring */ #define KEY_ALLOC_SET_KEEP 0x0020 /* Set the KEEP flag on the key/keyring */ extern void key_revoke(struct key *key); extern void key_invalidate(struct key *key); extern void key_put(struct key *key); extern bool key_put_tag(struct key_tag *tag); extern void key_remove_domain(struct key_tag *domain_tag); static inline struct key *__key_get(struct key *key) { refcount_inc(&key->usage); return key; } static inline struct key *key_get(struct key *key) { return key ? __key_get(key) : key; } static inline void key_ref_put(key_ref_t key_ref) { key_put(key_ref_to_ptr(key_ref)); } extern struct key *request_key_tag(struct key_type *type, const char *description, struct key_tag *domain_tag, const char *callout_info); extern struct key *request_key_rcu(struct key_type *type, const char *description, struct key_tag *domain_tag); extern struct key *request_key_with_auxdata(struct key_type *type, const char *description, struct key_tag *domain_tag, const void *callout_info, size_t callout_len, void *aux); /** * request_key - Request a key and wait for construction * @type: Type of key. * @description: The searchable description of the key. * @callout_info: The data to pass to the instantiation upcall (or NULL). * * As for request_key_tag(), but with the default global domain tag. */ static inline struct key *request_key(struct key_type *type, const char *description, const char *callout_info) { return request_key_tag(type, description, NULL, callout_info); } #ifdef CONFIG_NET /** * request_key_net - Request a key for a net namespace and wait for construction * @type: Type of key. * @description: The searchable description of the key. * @net: The network namespace that is the key's domain of operation. * @callout_info: The data to pass to the instantiation upcall (or NULL). * * As for request_key() except that it does not add the returned key to a * keyring if found, new keys are always allocated in the user's quota, the * callout_info must be a NUL-terminated string and no auxiliary data can be * passed. Only keys that operate the specified network namespace are used. * * Furthermore, it then works as wait_for_key_construction() to wait for the * completion of keys undergoing construction with a non-interruptible wait. 
*/ #define request_key_net(type, description, net, callout_info) \ request_key_tag(type, description, net->key_domain, callout_info) /** * request_key_net_rcu - Request a key for a net namespace under RCU conditions * @type: Type of key. * @description: The searchable description of the key. * @net: The network namespace that is the key's domain of operation. * * As for request_key_rcu() except that only keys that operate the specified * network namespace are used. */ #define request_key_net_rcu(type, description, net) \ request_key_rcu(type, description, net->key_domain) #endif /* CONFIG_NET */ extern int wait_for_key_construction(struct key *key, bool intr); extern int key_validate(const struct key *key); extern key_ref_t key_create(key_ref_t keyring, const char *type, const char *description, const void *payload, size_t plen, key_perm_t perm, unsigned long flags); extern key_ref_t key_create_or_update(key_ref_t keyring, const char *type, const char *description, const void *payload, size_t plen, key_perm_t perm, unsigned long flags); extern int key_update(key_ref_t key, const void *payload, size_t plen); extern int key_link(struct key *keyring, struct key *key); extern int key_move(struct key *key, struct key *from_keyring, struct key *to_keyring, unsigned int flags); extern int key_unlink(struct key *keyring, struct key *key); extern struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, const struct cred *cred, key_perm_t perm, unsigned long flags, struct key_restriction *restrict_link, struct key *dest); extern int restrict_link_reject(struct key *keyring, const struct key_type *type, const union key_payload *payload, struct key *restriction_key); extern int keyring_clear(struct key *keyring); extern key_ref_t keyring_search(key_ref_t keyring, struct key_type *type, const char *description, bool recurse); extern int keyring_restrict(key_ref_t keyring, const char *type, const char *restriction); extern struct key *key_lookup(key_serial_t id); static inline key_serial_t key_serial(const struct key *key) { return key ? key->serial : 0; } extern void key_set_timeout(struct key *, unsigned); extern key_ref_t lookup_user_key(key_serial_t id, unsigned long flags, enum key_need_perm need_perm); extern void key_free_user_ns(struct user_namespace *); static inline short key_read_state(const struct key *key) { /* Barrier versus mark_key_instantiated(). */ return smp_load_acquire(&key->state); } /** * key_is_positive - Determine if a key has been positively instantiated * @key: The key to check. * * Return true if the specified key has been positively instantiated, false * otherwise. 
*/ static inline bool key_is_positive(const struct key *key) { return key_read_state(key) == KEY_IS_POSITIVE; } static inline bool key_is_negative(const struct key *key) { return key_read_state(key) < 0; } #define dereference_key_rcu(KEY) \ (rcu_dereference((KEY)->payload.rcu_data0)) #define dereference_key_locked(KEY) \ (rcu_dereference_protected((KEY)->payload.rcu_data0, \ rwsem_is_locked(&((struct key *)(KEY))->sem))) #define rcu_assign_keypointer(KEY, PAYLOAD) \ do { \ rcu_assign_pointer((KEY)->payload.rcu_data0, (PAYLOAD)); \ } while (0) /* * the userspace interface */ extern int install_thread_keyring_to_cred(struct cred *cred); extern void key_fsuid_changed(struct cred *new_cred); extern void key_fsgid_changed(struct cred *new_cred); extern void key_init(void); #else /* CONFIG_KEYS */ #define key_validate(k) 0 #define key_serial(k) 0 #define key_get(k) ({ NULL; }) #define key_revoke(k) do { } while(0) #define key_invalidate(k) do { } while(0) #define key_put(k) do { } while(0) #define key_ref_put(k) do { } while(0) #define make_key_ref(k, p) NULL #define key_ref_to_ptr(k) NULL #define is_key_possessed(k) 0 #define key_fsuid_changed(c) do { } while(0) #define key_fsgid_changed(c) do { } while(0) #define key_init() do { } while(0) #define key_free_user_ns(ns) do { } while(0) #define key_remove_domain(d) do { } while(0) #define key_lookup(k) NULL #endif /* CONFIG_KEYS */ #endif /* __KERNEL__ */ #endif /* _LINUX_KEY_H */ |
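The key_ref_t helpers above (make_key_ref(), key_ref_to_ptr(), is_key_possessed()) rely on the bottom bit of a sufficiently aligned struct key pointer to carry the possession flag. The userspace sketch below mirrors only that tagging pattern; the fake_key and tagged_ref_t names are stand-ins invented for the example and are not part of the header.

/*
 * Minimal userspace illustration of low-bit pointer tagging: the flag
 * lives in bit 0 of the aligned pointer and is masked off before use.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct fake_key { int serial; };		/* stand-in for struct key */
typedef struct __tagged_ref *tagged_ref_t;	/* opaque, like key_ref_t */

static tagged_ref_t make_ref(const struct fake_key *k, bool possessed)
{
	/* Relies on struct fake_key being at least 2-byte aligned. */
	return (tagged_ref_t)((uintptr_t)k | possessed);
}

static struct fake_key *ref_to_ptr(tagged_ref_t ref)
{
	/* Mask off the possession bit to recover the real pointer. */
	return (struct fake_key *)((uintptr_t)ref & ~(uintptr_t)1);
}

static bool ref_possessed(tagged_ref_t ref)
{
	return (uintptr_t)ref & 1;
}

int main(void)
{
	struct fake_key key = { .serial = 42 };
	tagged_ref_t ref = make_ref(&key, true);

	assert(ref_to_ptr(ref) == &key);
	printf("serial=%d possessed=%d\n",
	       ref_to_ptr(ref)->serial, ref_possessed(ref));
	return 0;
}

Because the flag is hidden inside an opaque pointer type, the compiler rejects accidental dereferences, which is exactly the motivation stated in the header comment.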
// SPDX-License-Identifier: GPL-2.0
/*
 * hrtimers - High-resolution kernel timers
 *
 * Copyright(C) 2005, Linutronix GmbH, Thomas Gleixner <tglx@kernel.org>
 * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar
 *
 * data type definitions, declarations, prototypes
 *
 * Started by: Thomas Gleixner and Ingo Molnar
 */
#ifndef _LINUX_HRTIMER_H
#define _LINUX_HRTIMER_H

#include <linux/hrtimer_defs.h>
#include <linux/hrtimer_rearm.h>
#include <linux/hrtimer_types.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/percpu-defs.h>
#include <linux/rbtree.h>
#include <linux/timer.h>

/*
 * Mode arguments of xxx_hrtimer functions:
 *
 * HRTIMER_MODE_ABS		- Time value is absolute
 * HRTIMER_MODE_REL		- Time value is relative to now
 * HRTIMER_MODE_PINNED		- Timer is bound to CPU (is only considered
 *				  when starting the timer)
 * HRTIMER_MODE_SOFT		- Timer callback function will be executed in
 *				  soft irq context
 * HRTIMER_MODE_HARD		- Timer callback function will be executed in
 *				  hard irq context even on PREEMPT_RT.
 * HRTIMER_MODE_LAZY_REARM	- Avoid reprogramming if the timer was the
 *				  first expiring timer and is moved into the
 *				  future. Special mode for the HRTICK timer to
 *				  avoid extensive reprogramming of the hardware,
 *				  which is expensive in virtual machines.
Risks * a pointless expiry, but that's better than * reprogramming on every context switch, */ enum hrtimer_mode { HRTIMER_MODE_ABS = 0x00, HRTIMER_MODE_REL = 0x01, HRTIMER_MODE_PINNED = 0x02, HRTIMER_MODE_SOFT = 0x04, HRTIMER_MODE_HARD = 0x08, HRTIMER_MODE_LAZY_REARM = 0x10, HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED, HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED, HRTIMER_MODE_ABS_SOFT = HRTIMER_MODE_ABS | HRTIMER_MODE_SOFT, HRTIMER_MODE_REL_SOFT = HRTIMER_MODE_REL | HRTIMER_MODE_SOFT, HRTIMER_MODE_ABS_PINNED_SOFT = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_SOFT, HRTIMER_MODE_REL_PINNED_SOFT = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_SOFT, HRTIMER_MODE_ABS_HARD = HRTIMER_MODE_ABS | HRTIMER_MODE_HARD, HRTIMER_MODE_REL_HARD = HRTIMER_MODE_REL | HRTIMER_MODE_HARD, HRTIMER_MODE_ABS_PINNED_HARD = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_HARD, HRTIMER_MODE_REL_PINNED_HARD = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_HARD, }; /** * struct hrtimer_sleeper - simple sleeper structure * @timer: embedded timer structure * @task: task to wake up * * task is set to NULL, when the timer expires. */ struct hrtimer_sleeper { struct hrtimer timer; struct task_struct *task; }; static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) { timer->node.expires = time; timer->_softexpires = time; } static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta) { timer->_softexpires = time; timer->node.expires = ktime_add_safe(time, delta); } static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, u64 delta) { timer->_softexpires = time; timer->node.expires = ktime_add_safe(time, ns_to_ktime(delta)); } static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time) { timer->node.expires = ktime_add_safe(timer->node.expires, time); timer->_softexpires = ktime_add_safe(timer->_softexpires, time); } static inline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 ns) { timer->node.expires = ktime_add_ns(timer->node.expires, ns); timer->_softexpires = ktime_add_ns(timer->_softexpires, ns); } static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer) { return timer->node.expires; } static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer) { return timer->_softexpires; } ktime_t hrtimer_cb_get_time(const struct hrtimer *timer); static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) { return ktime_sub(timer->node.expires, hrtimer_cb_get_time(timer)); } #ifdef CONFIG_HIGH_RES_TIMERS extern unsigned int hrtimer_resolution; struct clock_event_device; extern void hrtimer_interrupt(struct clock_event_device *dev); extern struct static_key_false hrtimer_highres_enabled_key; static inline bool hrtimer_highres_enabled(void) { return static_branch_likely(&hrtimer_highres_enabled_key); } #else /* CONFIG_HIGH_RES_TIMERS */ #define hrtimer_resolution (unsigned int)LOW_RES_NSEC static inline bool hrtimer_highres_enabled(void) { return false; } #endif /* !CONFIG_HIGH_RES_TIMERS */ static inline ktime_t __hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now) { ktime_t rem = ktime_sub(timer->node.expires, now); /* * Adjust relative timers for the extra we added in * hrtimer_start_range_ns() to prevent short timeouts. 
*/ if (IS_ENABLED(CONFIG_TIME_LOW_RES) && timer->is_rel) rem -= hrtimer_resolution; return rem; } static inline ktime_t hrtimer_expires_remaining_adjusted(const struct hrtimer *timer) { return __hrtimer_expires_remaining_adjusted(timer, hrtimer_cb_get_time(timer)); } #ifdef CONFIG_TIMERFD extern void timerfd_clock_was_set(void); extern void timerfd_resume(void); #else static inline void timerfd_clock_was_set(void) { } static inline void timerfd_resume(void) { } #endif DECLARE_PER_CPU(struct tick_device, tick_cpu_device); #ifdef CONFIG_PREEMPT_RT void hrtimer_cancel_wait_running(const struct hrtimer *timer); #else static inline void hrtimer_cancel_wait_running(struct hrtimer *timer) { cpu_relax(); } #endif static inline enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused) { return HRTIMER_NORESTART; } /* Exported timer functions: */ /* Initialize timers: */ extern void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode); extern void hrtimer_setup_on_stack(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode); extern void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode); #ifdef CONFIG_DEBUG_OBJECTS_TIMERS extern void destroy_hrtimer_on_stack(struct hrtimer *timer); #else static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { } #endif /* Basic timer operations: */ extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 range_ns, const enum hrtimer_mode mode); /** * hrtimer_start - (re)start an hrtimer * @timer: the timer to be added * @tim: expiry time * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); * softirq based mode is considered for debug purpose only! */ static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { hrtimer_start_range_ns(timer, tim, 0, mode); } extern int hrtimer_cancel(struct hrtimer *timer); extern int hrtimer_try_to_cancel(struct hrtimer *timer); static inline void hrtimer_start_expires(struct hrtimer *timer, enum hrtimer_mode mode) { u64 delta; ktime_t soft, hard; soft = hrtimer_get_softexpires(timer); hard = hrtimer_get_expires(timer); delta = ktime_to_ns(ktime_sub(hard, soft)); hrtimer_start_range_ns(timer, soft, delta, mode); } void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode); static inline void hrtimer_restart(struct hrtimer *timer) { hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } /* Query timers: */ extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust); /** * hrtimer_get_remaining - get remaining time for the timer * @timer: the timer to read */ static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer) { return __hrtimer_get_remaining(timer, false); } extern u64 hrtimer_get_next_event(void); extern u64 hrtimer_next_event_without(const struct hrtimer *exclude); extern bool hrtimer_active(const struct hrtimer *timer); /** * hrtimer_is_queued - check, whether the timer is on one of the queues * @timer: Timer to check * * Returns: True if the timer is queued, false otherwise * * The function can be used lockless, but it gives only a current snapshot. 
*/ static inline bool hrtimer_is_queued(struct hrtimer *timer) { /* The READ_ONCE pairs with the update functions of timer->is_queued */ return READ_ONCE(timer->is_queued); } /* * Helper function to check, whether the timer is running the callback * function */ static inline int hrtimer_callback_running(struct hrtimer *timer) { return timer->base->running == timer; } /** * hrtimer_update_function - Update the timer's callback function * @timer: Timer to update * @function: New callback function * * Only safe to call if the timer is not enqueued. Can be called in the callback function if the * timer is not enqueued at the same time (see the comments above HRTIMER_STATE_ENQUEUED). */ static inline void hrtimer_update_function(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *)) { #ifdef CONFIG_PROVE_LOCKING guard(raw_spinlock_irqsave)(&timer->base->cpu_base->lock); if (WARN_ON_ONCE(hrtimer_is_queued(timer))) return; if (WARN_ON_ONCE(!function)) return; #endif ACCESS_PRIVATE(timer, function) = function; } /* Forward a hrtimer so it expires after now: */ extern u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval); /** * hrtimer_forward_now() - forward the timer expiry so it expires after now * @timer: hrtimer to forward * @interval: the interval to forward * * It is a variant of hrtimer_forward(). The timer will expire after the current * time of the hrtimer clock base. See hrtimer_forward() for details. */ static inline u64 hrtimer_forward_now(struct hrtimer *timer, ktime_t interval) { return hrtimer_forward(timer, hrtimer_cb_get_time(timer), interval); } /* Precise sleep: */ extern int nanosleep_copyout(struct restart_block *, struct timespec64 *); extern long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid); extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode); extern int schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, const enum hrtimer_mode mode, clockid_t clock_id); extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode); /* Soft interrupt function to run the hrtimer queues: */ extern void hrtimer_run_queues(void); /* Bootup initialization: */ extern void __init hrtimers_init(void); /* Show pending timers: */ extern void sysrq_timer_list_show(void); int hrtimers_prepare_cpu(unsigned int cpu); int hrtimers_cpu_starting(unsigned int cpu); #ifdef CONFIG_HOTPLUG_CPU int hrtimers_cpu_dying(unsigned int cpu); #else #define hrtimers_cpu_dying NULL #endif #endif |
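Taken together, hrtimer_setup(), hrtimer_start(), hrtimer_forward_now() and hrtimer_cancel() declared above form the usual periodic-callback idiom: the callback pushes its own expiry forward and returns HRTIMER_RESTART. The module sketch below is a hedged illustration of that idiom only; the demo_ names, the 100 ms period and the module boilerplate are assumptions made for the example and do not come from this header.

/*
 * Sketch of a periodic hrtimer in kernel-module form, built from the
 * declarations above.
 */
#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/module.h>

static struct hrtimer demo_timer;
static ktime_t demo_period;

static enum hrtimer_restart demo_fn(struct hrtimer *t)
{
	pr_info("hrtimer demo tick\n");
	/* Push the expiry forward by one period and keep the timer armed. */
	hrtimer_forward_now(t, demo_period);
	return HRTIMER_RESTART;
}

static int __init demo_init(void)
{
	demo_period = ms_to_ktime(100);
	hrtimer_setup(&demo_timer, demo_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hrtimer_start(&demo_timer, demo_period, HRTIMER_MODE_REL);
	return 0;
}

static void __exit demo_exit(void)
{
	/* Wait for a running callback to finish before the timer goes away. */
	hrtimer_cancel(&demo_timer);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_DESCRIPTION("hrtimer usage sketch");
MODULE_LICENSE("GPL");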
// SPDX-License-Identifier: GPL-2.0
/*
 * drivers/base/devres.c - device resource management
 *
 * Copyright (c) 2006 SUSE Linux Products GmbH
 * Copyright (c) 2006 Tejun Heo <teheo@suse.de>
 */

#include <linux/device.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/percpu.h>

#include <asm/sections.h>

#include "base.h"
#include "trace.h"

struct devres {
	struct devres_node		node;
	dr_release_t			release;
	/*
	 * Some archs want to perform DMA into kmalloc caches
	 * and need a guaranteed alignment larger than
	 * the alignment of a 64-bit integer.
	 * Thus we use ARCH_DMA_MINALIGN for data[] which will force the same
	 * alignment for struct devres when allocated by kmalloc().
*/ u8 __aligned(ARCH_DMA_MINALIGN) data[]; }; struct devres_group { struct devres_node node[2]; void *id; int color; /* -- 8 pointers */ }; void devres_node_init(struct devres_node *node, dr_node_release_t release, dr_node_free_t free_node) { INIT_LIST_HEAD(&node->entry); node->release = release; node->free_node = free_node; } static inline void free_node(struct devres_node *node) { node->free_node(node); } void devres_set_node_dbginfo(struct devres_node *node, const char *name, size_t size) { node->name = name; node->size = size; } #ifdef CONFIG_DEBUG_DEVRES static int log_devres = 0; module_param_named(log, log_devres, int, S_IRUGO | S_IWUSR); static void devres_dbg(struct device *dev, struct devres_node *node, const char *op) { if (unlikely(log_devres)) dev_err(dev, "DEVRES %3s %p %s (%zu bytes)\n", op, node, node->name, node->size); } #else /* CONFIG_DEBUG_DEVRES */ #define devres_dbg(dev, node, op) do {} while (0) #endif /* CONFIG_DEBUG_DEVRES */ static void devres_log(struct device *dev, struct devres_node *node, const char *op) { trace_devres_log(dev, op, node, node->name, node->size); devres_dbg(dev, node, op); } /* * Release functions for devres group. These callbacks are used only * for identification. */ static void group_open_release(struct device *dev, struct devres_node *node) { /* noop */ } static void group_close_release(struct device *dev, struct devres_node *node) { /* noop */ } static struct devres_group *node_to_group(struct devres_node *node) { if (node->release == &group_open_release) return container_of(node, struct devres_group, node[0]); if (node->release == &group_close_release) return container_of(node, struct devres_group, node[1]); return NULL; } static bool check_dr_size(size_t size, size_t *tot_size) { /* We must catch any near-SIZE_MAX cases that could overflow. */ if (unlikely(check_add_overflow(sizeof(struct devres), size, tot_size))) return false; /* Actually allocate the full kmalloc bucket size. */ *tot_size = kmalloc_size_roundup(*tot_size); return true; } static void dr_node_release(struct device *dev, struct devres_node *node) { struct devres *dr = container_of(node, struct devres, node); dr->release(dev, dr->data); } static void dr_node_free(struct devres_node *node) { struct devres *dr = container_of(node, struct devres, node); kfree(dr); } static __always_inline struct devres *alloc_dr(dr_release_t release, size_t size, gfp_t gfp, int nid) { size_t tot_size; struct devres *dr; if (!check_dr_size(size, &tot_size)) return NULL; dr = kmalloc_node_track_caller(tot_size, gfp, nid); if (unlikely(!dr)) return NULL; /* No need to clear memory twice */ if (!(gfp & __GFP_ZERO)) memset(dr, 0, offsetof(struct devres, data)); devres_node_init(&dr->node, dr_node_release, dr_node_free); dr->release = release; return dr; } static void add_dr(struct device *dev, struct devres_node *node) { devres_log(dev, node, "ADD"); BUG_ON(!list_empty(&node->entry)); list_add_tail(&node->entry, &dev->devres_head); } static void replace_dr(struct device *dev, struct devres_node *old, struct devres_node *new) { devres_log(dev, old, "REPLACE"); BUG_ON(!list_empty(&new->entry)); list_replace(&old->entry, &new->entry); } /** * __devres_alloc_node - Allocate device resource data * @release: Release function devres will be associated with * @size: Allocation size * @gfp: Allocation flags * @nid: NUMA node * @name: Name of the resource * * Allocate devres of @size bytes. The allocated area is zeroed, then * associated with @release. 
The returned pointer can be passed to * other devres_*() functions. * * RETURNS: * Pointer to allocated devres on success, NULL on failure. */ void *__devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp, int nid, const char *name) { struct devres *dr; dr = alloc_dr(release, size, gfp | __GFP_ZERO, nid); if (unlikely(!dr)) return NULL; devres_set_node_dbginfo(&dr->node, name, size); return dr->data; } EXPORT_SYMBOL_GPL(__devres_alloc_node); /** * devres_for_each_res - Resource iterator * @dev: Device to iterate resource from * @release: Look for resources associated with this release function * @match: Match function (optional) * @match_data: Data for the match function * @fn: Function to be called for each matched resource. * @data: Data for @fn, the 3rd parameter of @fn * * Call @fn for each devres of @dev which is associated with @release * and for which @match returns 1. * * RETURNS: * void */ void devres_for_each_res(struct device *dev, dr_release_t release, dr_match_t match, void *match_data, void (*fn)(struct device *, void *, void *), void *data) { struct devres_node *node; struct devres_node *tmp; if (!fn) return; guard(spinlock_irqsave)(&dev->devres_lock); list_for_each_entry_safe_reverse(node, tmp, &dev->devres_head, entry) { struct devres *dr = container_of(node, struct devres, node); if (node->release != dr_node_release) continue; if (dr->release != release) continue; if (match && !match(dev, dr->data, match_data)) continue; fn(dev, dr->data, data); } } EXPORT_SYMBOL_GPL(devres_for_each_res); static inline void free_dr(struct devres *dr) { free_node(&dr->node); } /** * devres_free - Free device resource data * @res: Pointer to devres data to free * * Free devres created with devres_alloc(). */ void devres_free(void *res) { if (res) { struct devres *dr = container_of(res, struct devres, data); BUG_ON(!list_empty(&dr->node.entry)); free_dr(dr); } } EXPORT_SYMBOL_GPL(devres_free); void devres_node_add(struct device *dev, struct devres_node *node) { guard(spinlock_irqsave)(&dev->devres_lock); add_dr(dev, node); } /** * devres_add - Register device resource * @dev: Device to add resource to * @res: Resource to register * * Register devres @res to @dev. @res should have been allocated * using devres_alloc(). On driver detach, the associated release * function will be invoked and devres will be freed automatically. */ void devres_add(struct device *dev, void *res) { struct devres *dr = container_of(res, struct devres, data); devres_node_add(dev, &dr->node); } EXPORT_SYMBOL_GPL(devres_add); static struct devres *find_dr(struct device *dev, dr_release_t release, dr_match_t match, void *match_data) { struct devres_node *node; list_for_each_entry_reverse(node, &dev->devres_head, entry) { struct devres *dr = container_of(node, struct devres, node); if (node->release != dr_node_release) continue; if (dr->release != release) continue; if (match && !match(dev, dr->data, match_data)) continue; return dr; } return NULL; } /** * devres_find - Find device resource * @dev: Device to lookup resource from * @release: Look for resources associated with this release function * @match: Match function (optional) * @match_data: Data for the match function * * Find the latest devres of @dev which is associated with @release * and for which @match returns 1. If @match is NULL, it's considered * to match all. * * RETURNS: * Pointer to found devres, NULL if not found. 
*/ void *devres_find(struct device *dev, dr_release_t release, dr_match_t match, void *match_data) { struct devres *dr; guard(spinlock_irqsave)(&dev->devres_lock); dr = find_dr(dev, release, match, match_data); if (dr) return dr->data; return NULL; } EXPORT_SYMBOL_GPL(devres_find); /** * devres_get - Find devres, if non-existent, add one atomically * @dev: Device to lookup or add devres for * @new_res: Pointer to new initialized devres to add if not found * @match: Match function (optional) * @match_data: Data for the match function * * Find the latest devres of @dev which has the same release function * as @new_res and for which @match return 1. If found, @new_res is * freed; otherwise, @new_res is added atomically. * * RETURNS: * Pointer to found or added devres. */ void *devres_get(struct device *dev, void *new_res, dr_match_t match, void *match_data) { struct devres *new_dr = container_of(new_res, struct devres, data); struct devres *dr; unsigned long flags; spin_lock_irqsave(&dev->devres_lock, flags); dr = find_dr(dev, new_dr->release, match, match_data); if (!dr) { add_dr(dev, &new_dr->node); dr = new_dr; new_res = NULL; } spin_unlock_irqrestore(&dev->devres_lock, flags); devres_free(new_res); return dr->data; } EXPORT_SYMBOL_GPL(devres_get); bool devres_node_remove(struct device *dev, struct devres_node *node) { struct devres_node *__node; guard(spinlock_irqsave)(&dev->devres_lock); list_for_each_entry_reverse(__node, &dev->devres_head, entry) { if (__node == node) { list_del_init(&node->entry); devres_log(dev, node, "REM"); return true; } } return false; } /** * devres_remove - Find a device resource and remove it * @dev: Device to find resource from * @release: Look for resources associated with this release function * @match: Match function (optional) * @match_data: Data for the match function * * Find the latest devres of @dev associated with @release and for * which @match returns 1. If @match is NULL, it's considered to * match all. If found, the resource is removed atomically and * returned. * * RETURNS: * Pointer to removed devres on success, NULL if not found. */ void *devres_remove(struct device *dev, dr_release_t release, dr_match_t match, void *match_data) { struct devres *dr; guard(spinlock_irqsave)(&dev->devres_lock); dr = find_dr(dev, release, match, match_data); if (dr) { list_del_init(&dr->node.entry); devres_log(dev, &dr->node, "REM"); return dr->data; } return NULL; } EXPORT_SYMBOL_GPL(devres_remove); /** * devres_destroy - Find a device resource and destroy it * @dev: Device to find resource from * @release: Look for resources associated with this release function * @match: Match function (optional) * @match_data: Data for the match function * * Find the latest devres of @dev associated with @release and for * which @match returns 1. If @match is NULL, it's considered to * match all. If found, the resource is removed atomically and freed. * * Note that the release function for the resource will not be called, * only the devres-allocated data will be freed. The caller becomes * responsible for freeing any other data. * * RETURNS: * 0 if devres is found and freed, -ENOENT if not found. 
*/ int devres_destroy(struct device *dev, dr_release_t release, dr_match_t match, void *match_data) { void *res; res = devres_remove(dev, release, match, match_data); if (unlikely(!res)) return -ENOENT; devres_free(res); return 0; } EXPORT_SYMBOL_GPL(devres_destroy); /** * devres_release - Find a device resource and destroy it, calling release * @dev: Device to find resource from * @release: Look for resources associated with this release function * @match: Match function (optional) * @match_data: Data for the match function * * Find the latest devres of @dev associated with @release and for * which @match returns 1. If @match is NULL, it's considered to * match all. If found, the resource is removed atomically, the * release function called and the resource freed. * * RETURNS: * 0 if devres is found and freed, -ENOENT if not found. */ int devres_release(struct device *dev, dr_release_t release, dr_match_t match, void *match_data) { void *res; res = devres_remove(dev, release, match, match_data); if (unlikely(!res)) return -ENOENT; (*release)(dev, res); devres_free(res); return 0; } EXPORT_SYMBOL_GPL(devres_release); static int remove_nodes(struct device *dev, struct list_head *first, struct list_head *end, struct list_head *todo) { struct devres_node *node, *n; int cnt = 0, nr_groups = 0; /* First pass - move normal devres entries to @todo and clear * devres_group colors. */ node = list_entry(first, struct devres_node, entry); list_for_each_entry_safe_from(node, n, end, entry) { struct devres_group *grp; grp = node_to_group(node); if (grp) { /* clear color of group markers in the first pass */ grp->color = 0; nr_groups++; } else { /* regular devres entry */ if (&node->entry == first) first = first->next; list_move_tail(&node->entry, todo); cnt++; } } if (!nr_groups) return cnt; /* Second pass - Scan groups and color them. A group gets * color value of two iff the group is wholly contained in * [current node, end). That is, for a closed group, both opening * and closing markers should be in the range, while just the * opening marker is enough for an open group. */ node = list_entry(first, struct devres_node, entry); list_for_each_entry_safe_from(node, n, end, entry) { struct devres_group *grp; grp = node_to_group(node); BUG_ON(!grp || list_empty(&grp->node[0].entry)); grp->color++; if (list_empty(&grp->node[1].entry)) grp->color++; BUG_ON(grp->color <= 0 || grp->color > 2); if (grp->color == 2) { /* No need to update current node or end. The removed * nodes are always before both. */ list_move_tail(&grp->node[0].entry, todo); list_del_init(&grp->node[1].entry); } } return cnt; } static void release_nodes(struct device *dev, struct list_head *todo) { struct devres_node *node, *tmp; list_for_each_entry_safe_reverse(node, tmp, todo, entry) { devres_log(dev, node, "REL"); node->release(dev, node); free_node(node); } } /** * devres_release_all - Release all managed resources * @dev: Device to release resources for * * Release all resources associated with @dev. This function is * called on driver detach. 
*/ int devres_release_all(struct device *dev) { unsigned long flags; LIST_HEAD(todo); int cnt; /* Looks like an uninitialized device structure */ if (WARN_ON(dev->devres_head.next == NULL)) return -ENODEV; /* Nothing to release if list is empty */ if (list_empty(&dev->devres_head)) return 0; spin_lock_irqsave(&dev->devres_lock, flags); cnt = remove_nodes(dev, dev->devres_head.next, &dev->devres_head, &todo); spin_unlock_irqrestore(&dev->devres_lock, flags); release_nodes(dev, &todo); return cnt; } static void devres_group_free(struct devres_node *node) { struct devres_group *grp = container_of(node, struct devres_group, node[0]); kfree(grp); } /** * devres_open_group - Open a new devres group * @dev: Device to open devres group for * @id: Separator ID * @gfp: Allocation flags * * Open a new devres group for @dev with @id. For @id, using a * pointer to an object which won't be used for another group is * recommended. If @id is NULL, address-wise unique ID is created. * * RETURNS: * ID of the new group, NULL on failure. */ void *devres_open_group(struct device *dev, void *id, gfp_t gfp) { struct devres_group *grp; grp = kmalloc_obj(*grp, gfp); if (unlikely(!grp)) return NULL; devres_node_init(&grp->node[0], &group_open_release, devres_group_free); devres_node_init(&grp->node[1], &group_close_release, NULL); devres_set_node_dbginfo(&grp->node[0], "grp<", 0); devres_set_node_dbginfo(&grp->node[1], "grp>", 0); grp->id = grp; if (id) grp->id = id; grp->color = 0; devres_node_add(dev, &grp->node[0]); return grp->id; } EXPORT_SYMBOL_GPL(devres_open_group); /* * Find devres group with ID @id. If @id is NULL, look for the latest open * group. */ static struct devres_group *find_group(struct device *dev, void *id) { struct devres_node *node; list_for_each_entry_reverse(node, &dev->devres_head, entry) { struct devres_group *grp; if (node->release != &group_open_release) continue; grp = container_of(node, struct devres_group, node[0]); if (id) { if (grp->id == id) return grp; } else if (list_empty(&grp->node[1].entry)) return grp; } return NULL; } /** * devres_close_group - Close a devres group * @dev: Device to close devres group for * @id: ID of target group, can be NULL * * Close the group identified by @id. If @id is NULL, the latest open * group is selected. */ void devres_close_group(struct device *dev, void *id) { struct devres_group *grp; guard(spinlock_irqsave)(&dev->devres_lock); grp = find_group(dev, id); if (grp) add_dr(dev, &grp->node[1]); else WARN_ON(1); } EXPORT_SYMBOL_GPL(devres_close_group); /** * devres_remove_group - Remove a devres group * @dev: Device to remove group for * @id: ID of target group, can be NULL * * Remove the group identified by @id. If @id is NULL, the latest * open group is selected. Note that removing a group doesn't affect * any other resources. */ void devres_remove_group(struct device *dev, void *id) { struct devres_group *grp; unsigned long flags; spin_lock_irqsave(&dev->devres_lock, flags); grp = find_group(dev, id); if (grp) { list_del_init(&grp->node[0].entry); list_del_init(&grp->node[1].entry); devres_log(dev, &grp->node[0], "REM"); } else WARN_ON(1); spin_unlock_irqrestore(&dev->devres_lock, flags); kfree(grp); } EXPORT_SYMBOL_GPL(devres_remove_group); /** * devres_release_group - Release resources in a devres group * @dev: Device to release group for * @id: ID of target group, can be NULL * * Release all resources in the group identified by @id. If @id is * NULL, the latest open group is selected. 
The selected group and * groups properly nested inside the selected group are removed. * * RETURNS: * The number of released non-group resources. */ int devres_release_group(struct device *dev, void *id) { struct devres_group *grp; unsigned long flags; LIST_HEAD(todo); int cnt = 0; spin_lock_irqsave(&dev->devres_lock, flags); grp = find_group(dev, id); if (grp) { struct list_head *first = &grp->node[0].entry; struct list_head *end = &dev->devres_head; if (!list_empty(&grp->node[1].entry)) end = grp->node[1].entry.next; cnt = remove_nodes(dev, first, end, &todo); } else if (list_empty(&dev->devres_head)) { /* * dev is probably dying via devres_release_all(): groups * have already been removed and are on the process of * being released - don't touch and don't warn. */ } else { WARN_ON(1); } spin_unlock_irqrestore(&dev->devres_lock, flags); release_nodes(dev, &todo); return cnt; } EXPORT_SYMBOL_GPL(devres_release_group); /* * Custom devres actions allow inserting a simple function call * into the teardown sequence. */ struct action_devres { void *data; void (*action)(void *); }; struct devres_action { struct devres_node node; struct action_devres action; }; static int devm_action_match(struct devres_action *devres, struct action_devres *target) { return devres->action.action == target->action && devres->action.data == target->data; } static void devm_action_release(struct device *dev, struct devres_node *node) { struct devres_action *devres = container_of(node, struct devres_action, node); devres->action.action(devres->action.data); } static void devm_action_free(struct devres_node *node) { struct devres_action *action = container_of(node, struct devres_action, node); kfree(action); } /** * __devm_add_action() - add a custom action to list of managed resources * @dev: Device that owns the action * @action: Function that should be called * @data: Pointer to data passed to @action implementation * @name: Name of the resource (for debugging purposes) * * This adds a custom action to the list of managed resources so that * it gets executed as part of standard resource unwinding. 
*/ int __devm_add_action(struct device *dev, void (*action)(void *), void *data, const char *name) { struct devres_action *devres; devres = kzalloc_obj(*devres); if (!devres) return -ENOMEM; devres_node_init(&devres->node, devm_action_release, devm_action_free); devres_set_node_dbginfo(&devres->node, name, sizeof(*devres)); devres->action.data = data; devres->action.action = action; devres_node_add(dev, &devres->node); return 0; } EXPORT_SYMBOL_GPL(__devm_add_action); static struct devres_action *devres_action_find(struct device *dev, void (*action)(void *), void *data) { struct devres_node *node; struct action_devres target = { .data = data, .action = action, }; list_for_each_entry_reverse(node, &dev->devres_head, entry) { struct devres_action *dr = container_of(node, struct devres_action, node); if (node->release != devm_action_release) continue; if (devm_action_match(dr, &target)) return dr; } return NULL; } bool devm_is_action_added(struct device *dev, void (*action)(void *), void *data) { guard(spinlock_irqsave)(&dev->devres_lock); return !!devres_action_find(dev, action, data); } EXPORT_SYMBOL_GPL(devm_is_action_added); static struct devres_action *remove_action(struct device *dev, void (*action)(void *), void *data) { struct devres_action *dr; guard(spinlock_irqsave)(&dev->devres_lock); dr = devres_action_find(dev, action, data); if (!dr) return ERR_PTR(-ENOENT); list_del_init(&dr->node.entry); devres_log(dev, &dr->node, "REM"); return dr; } /** * devm_remove_action_nowarn() - removes previously added custom action * @dev: Device that owns the action * @action: Function implementing the action * @data: Pointer to data passed to @action implementation * * Removes instance of @action previously added by devm_add_action(). * Both action and data should match one of the existing entries. * * In contrast to devm_remove_action(), this function does not WARN() if no * entry could have been found. * * This should only be used if the action is contained in an object with * independent lifetime management, e.g. the Devres rust abstraction. * * Causing the warning from regular driver code most likely indicates an abuse * of the devres API. * * Returns: 0 on success, -ENOENT if no entry could have been found. */ int devm_remove_action_nowarn(struct device *dev, void (*action)(void *), void *data) { struct devres_action *dr; dr = remove_action(dev, action, data); if (IS_ERR(dr)) return PTR_ERR(dr); kfree(dr); return 0; } EXPORT_SYMBOL_GPL(devm_remove_action_nowarn); /** * devm_release_action() - release previously added custom action * @dev: Device that owns the action * @action: Function implementing the action * @data: Pointer to data passed to @action implementation * * Releases and removes instance of @action previously added by * devm_add_action(). Both action and data should match one of the * existing entries. */ void devm_release_action(struct device *dev, void (*action)(void *), void *data) { struct devres_action *dr; dr = remove_action(dev, action, data); if (WARN_ON(IS_ERR(dr))) return; dr->action.action(dr->action.data); kfree(dr); } EXPORT_SYMBOL_GPL(devm_release_action); /* * Managed kmalloc/kfree */ static void devm_kmalloc_release(struct device *dev, void *res) { /* noop */ } static int devm_kmalloc_match(struct device *dev, void *res, void *data) { return res == data; } /** * devm_kmalloc - Resource-managed kmalloc * @dev: Device to allocate memory for * @size: Allocation size * @gfp: Allocation gfp flags * * Managed kmalloc. 
Memory allocated with this function is * automatically freed on driver detach. Like all other devres * resources, guaranteed alignment is unsigned long long. * * RETURNS: * Pointer to allocated memory on success, NULL on failure. */ void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp) { struct devres *dr; if (unlikely(!size)) return ZERO_SIZE_PTR; /* use raw alloc_dr for kmalloc caller tracing */ dr = alloc_dr(devm_kmalloc_release, size, gfp, dev_to_node(dev)); if (unlikely(!dr)) return NULL; /* * This is named devm_kzalloc_release for historical reasons * The initial implementation did not support kmalloc, only kzalloc */ devres_set_node_dbginfo(&dr->node, "devm_kzalloc_release", size); devres_add(dev, dr->data); return dr->data; } EXPORT_SYMBOL_GPL(devm_kmalloc); /** * devm_krealloc - Resource-managed krealloc() * @dev: Device to re-allocate memory for * @ptr: Pointer to the memory chunk to re-allocate * @new_size: New allocation size * @gfp: Allocation gfp flags * * Managed krealloc(). Resizes the memory chunk allocated with devm_kmalloc(). * Behaves similarly to regular krealloc(): if @ptr is NULL or ZERO_SIZE_PTR, * it's the equivalent of devm_kmalloc(). If new_size is zero, it frees the * previously allocated memory and returns ZERO_SIZE_PTR. This function doesn't * change the order in which the release callback for the re-alloc'ed devres * will be called (except when falling back to devm_kmalloc() or when freeing * resources when new_size is zero). The contents of the memory are preserved * up to the lesser of new and old sizes. */ void *devm_krealloc(struct device *dev, void *ptr, size_t new_size, gfp_t gfp) { size_t total_new_size, total_old_size; struct devres *old_dr, *new_dr; unsigned long flags; if (unlikely(!new_size)) { devm_kfree(dev, ptr); return ZERO_SIZE_PTR; } if (unlikely(ZERO_OR_NULL_PTR(ptr))) return devm_kmalloc(dev, new_size, gfp); if (WARN_ON(is_kernel_rodata((unsigned long)ptr))) /* * We cannot reliably realloc a const string returned by * devm_kstrdup_const(). */ return NULL; if (!check_dr_size(new_size, &total_new_size)) return NULL; total_old_size = ksize(container_of(ptr, struct devres, data)); if (total_old_size == 0) { WARN(1, "Pointer doesn't point to dynamically allocated memory."); return NULL; } /* * If new size is smaller or equal to the actual number of bytes * allocated previously - just return the same pointer. */ if (total_new_size <= total_old_size) return ptr; /* * Otherwise: allocate new, larger chunk. We need to allocate before * taking the lock as most probably the caller uses GFP_KERNEL. * alloc_dr() will call check_dr_size() to reserve extra memory * for struct devres automatically, so size @new_size user request * is delivered to it directly as devm_kmalloc() does. */ new_dr = alloc_dr(devm_kmalloc_release, new_size, gfp, dev_to_node(dev)); if (!new_dr) return NULL; devres_set_node_dbginfo(&new_dr->node, "devm_krealloc_release", new_size); /* * The spinlock protects the linked list against concurrent * modifications but not the resource itself. */ spin_lock_irqsave(&dev->devres_lock, flags); old_dr = find_dr(dev, devm_kmalloc_release, devm_kmalloc_match, ptr); if (!old_dr) { spin_unlock_irqrestore(&dev->devres_lock, flags); free_dr(new_dr); WARN(1, "Memory chunk not managed or managed by a different device."); return NULL; } replace_dr(dev, &old_dr->node, &new_dr->node); spin_unlock_irqrestore(&dev->devres_lock, flags); /* * We can copy the memory contents after releasing the lock as we're * no longer modifying the list links. 
*/ memcpy(new_dr->data, old_dr->data, total_old_size - offsetof(struct devres, data)); /* * Same for releasing the old devres - it's now been removed from the * list. This is also the reason why we must not use devm_kfree() - the * links are no longer valid. */ free_dr(old_dr); return new_dr->data; } EXPORT_SYMBOL_GPL(devm_krealloc); /** * devm_kstrdup - Allocate resource managed space and * copy an existing string into that. * @dev: Device to allocate memory for * @s: the string to duplicate * @gfp: the GFP mask used in the devm_kmalloc() call when * allocating memory * RETURNS: * Pointer to allocated string on success, NULL on failure. */ char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp) { if (!s) return NULL; return devm_kmemdup(dev, s, strlen(s) + 1, gfp); } EXPORT_SYMBOL_GPL(devm_kstrdup); /** * devm_kstrdup_const - resource managed conditional string duplication * @dev: device for which to duplicate the string * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Strings allocated by devm_kstrdup_const will be automatically freed when * the associated device is detached. * * RETURNS: * Source string if it is in .rodata section otherwise it falls back to * devm_kstrdup. */ const char *devm_kstrdup_const(struct device *dev, const char *s, gfp_t gfp) { if (is_kernel_rodata((unsigned long)s)) return s; return devm_kstrdup(dev, s, gfp); } EXPORT_SYMBOL_GPL(devm_kstrdup_const); /** * devm_kvasprintf - Allocate resource managed space and format a string * into that. * @dev: Device to allocate memory for * @gfp: the GFP mask used in the devm_kmalloc() call when * allocating memory * @fmt: The printf()-style format string * @ap: Arguments for the format string * RETURNS: * Pointer to allocated string on success, NULL on failure. */ char *devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt, va_list ap) { unsigned int len; char *p; va_list aq; va_copy(aq, ap); len = vsnprintf(NULL, 0, fmt, aq); va_end(aq); p = devm_kmalloc(dev, len+1, gfp); if (!p) return NULL; vsnprintf(p, len+1, fmt, ap); return p; } EXPORT_SYMBOL(devm_kvasprintf); /** * devm_kasprintf - Allocate resource managed space and format a string * into that. * @dev: Device to allocate memory for * @gfp: the GFP mask used in the devm_kmalloc() call when * allocating memory * @fmt: The printf()-style format string * @...: Arguments for the format string * RETURNS: * Pointer to allocated string on success, NULL on failure. */ char *devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...) { va_list ap; char *p; va_start(ap, fmt); p = devm_kvasprintf(dev, gfp, fmt, ap); va_end(ap); return p; } EXPORT_SYMBOL_GPL(devm_kasprintf); /** * devm_kfree - Resource-managed kfree * @dev: Device this memory belongs to * @p: Memory to free * * Free memory allocated with devm_kmalloc(). */ void devm_kfree(struct device *dev, const void *p) { int rc; /* * Special cases: pointer to a string in .rodata returned by * devm_kstrdup_const() or NULL/ZERO ptr. 
*/ if (unlikely(is_kernel_rodata((unsigned long)p) || ZERO_OR_NULL_PTR(p))) return; rc = devres_destroy(dev, devm_kmalloc_release, devm_kmalloc_match, (void *)p); WARN_ON(rc); } EXPORT_SYMBOL_GPL(devm_kfree); /** * devm_kmemdup - Resource-managed kmemdup * @dev: Device this memory belongs to * @src: Memory region to duplicate * @len: Memory region length * @gfp: GFP mask to use * * Duplicate region of a memory using resource managed kmalloc */ void *devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp) { void *p; p = devm_kmalloc(dev, len, gfp); if (p) memcpy(p, src, len); return p; } EXPORT_SYMBOL_GPL(devm_kmemdup); /** * devm_kmemdup_const - conditionally duplicate and manage a region of memory * * @dev: Device this memory belongs to * @src: memory region to duplicate * @len: memory region length, * @gfp: GFP mask to use * * Return: source address if it is in .rodata or the return value of kmemdup() * to which the function falls back otherwise. */ const void * devm_kmemdup_const(struct device *dev, const void *src, size_t len, gfp_t gfp) { if (is_kernel_rodata((unsigned long)src)) return src; return devm_kmemdup(dev, src, len, gfp); } EXPORT_SYMBOL_GPL(devm_kmemdup_const); struct pages_devres { unsigned long addr; unsigned int order; }; static int devm_pages_match(struct device *dev, void *res, void *p) { struct pages_devres *devres = res; struct pages_devres *target = p; return devres->addr == target->addr; } static void devm_pages_release(struct device *dev, void *res) { struct pages_devres *devres = res; free_pages(devres->addr, devres->order); } /** * devm_get_free_pages - Resource-managed __get_free_pages * @dev: Device to allocate memory for * @gfp_mask: Allocation gfp flags * @order: Allocation size is (1 << order) pages * * Managed get_free_pages. Memory allocated with this function is * automatically freed on driver detach. * * RETURNS: * Address of allocated memory on success, 0 on failure. */ unsigned long devm_get_free_pages(struct device *dev, gfp_t gfp_mask, unsigned int order) { struct pages_devres *devres; unsigned long addr; addr = __get_free_pages(gfp_mask, order); if (unlikely(!addr)) return 0; devres = devres_alloc(devm_pages_release, sizeof(struct pages_devres), GFP_KERNEL); if (unlikely(!devres)) { free_pages(addr, order); return 0; } devres->addr = addr; devres->order = order; devres_add(dev, devres); return addr; } EXPORT_SYMBOL_GPL(devm_get_free_pages); /** * devm_free_pages - Resource-managed free_pages * @dev: Device this memory belongs to * @addr: Memory to free * * Free memory allocated with devm_get_free_pages(). Unlike free_pages, * there is no need to supply the @order. */ void devm_free_pages(struct device *dev, unsigned long addr) { struct pages_devres devres = { .addr = addr }; WARN_ON(devres_release(dev, devm_pages_release, devm_pages_match, &devres)); } EXPORT_SYMBOL_GPL(devm_free_pages); static void devm_percpu_release(struct device *dev, void *pdata) { void __percpu *p; p = *(void __percpu **)pdata; free_percpu(p); } /** * __devm_alloc_percpu - Resource-managed alloc_percpu * @dev: Device to allocate per-cpu memory for * @size: Size of per-cpu memory to allocate * @align: Alignment of per-cpu memory to allocate * * Managed alloc_percpu. Per-cpu memory allocated with this function is * automatically freed on driver detach. * * RETURNS: * Pointer to allocated memory on success, NULL on failure. 
*/ void __percpu *__devm_alloc_percpu(struct device *dev, size_t size, size_t align) { void *p; void __percpu *pcpu; pcpu = __alloc_percpu(size, align); if (!pcpu) return NULL; p = devres_alloc(devm_percpu_release, sizeof(void *), GFP_KERNEL); if (!p) { free_percpu(pcpu); return NULL; } *(void __percpu **)p = pcpu; devres_add(dev, p); return pcpu; } EXPORT_SYMBOL_GPL(__devm_alloc_percpu);
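The devres helpers above are consumed from driver probe paths. Below is a minimal, hypothetical sketch of how a driver might combine managed allocation with a custom teardown action; the my_priv structure, the my_hw_disable() callback and my_probe() are illustrative assumptions for the example, not part of the code above.

/*
 * Hypothetical usage sketch: everything registered here is torn down
 * automatically, in reverse order, by devres_release_all() on detach.
 */
#include <linux/device.h>
#include <linux/platform_device.h>
#include <linux/slab.h>

struct my_priv {
	u32 *regs_shadow;	/* managed: freed on driver detach */
	char *label;		/* managed: devm_kasprintf() string */
};

static void my_hw_disable(void *data)
{
	struct my_priv *priv = data;

	/* Custom action run during resource unwinding. */
	pr_debug("disabling %s\n", priv->label);
}

static int my_probe(struct platform_device *pdev)
{
	struct device *dev = &pdev->dev;
	struct my_priv *priv;

	priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
	if (!priv)
		return -ENOMEM;

	priv->regs_shadow = devm_kcalloc(dev, 16, sizeof(u32), GFP_KERNEL);
	if (!priv->regs_shadow)
		return -ENOMEM;

	priv->label = devm_kasprintf(dev, GFP_KERNEL, "%s-shadow",
				     dev_name(dev));
	if (!priv->label)
		return -ENOMEM;

	/* If registration fails, this runs my_hw_disable() immediately. */
	return devm_add_action_or_reset(dev, my_hw_disable, priv);
}

Because every resource is released in reverse registration order, the probe path needs no explicit unwinding labels.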
// SPDX-License-Identifier: GPL-2.0-or-later /* Key permission checking * * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/security.h> #include "internal.h" /** * key_task_permission - Check a key can be used * @key_ref: The key to check. * @cred: The credentials to use. * @need_perm: The permission required. * * Check to see whether permission is granted to use a key in the desired way, * but permit the security modules to override. * * The caller must hold either a ref on cred or must hold the RCU readlock. * * Returns 0 if successful, -EACCES if access is denied based on the * permissions bits or the LSM check. */ int key_task_permission(const key_ref_t key_ref, const struct cred *cred, enum key_need_perm need_perm) { struct key *key; key_perm_t kperm, mask; int ret; switch (need_perm) { default: WARN_ON(1); return -EACCES; case KEY_NEED_UNLINK: case KEY_SYSADMIN_OVERRIDE: case KEY_AUTHTOKEN_OVERRIDE: case KEY_DEFER_PERM_CHECK: goto lsm; case KEY_NEED_VIEW: mask = KEY_OTH_VIEW; break; case KEY_NEED_READ: mask = KEY_OTH_READ; break; case KEY_NEED_WRITE: mask = KEY_OTH_WRITE; break; case KEY_NEED_SEARCH: mask = KEY_OTH_SEARCH; break; case KEY_NEED_LINK: mask = KEY_OTH_LINK; break; case KEY_NEED_SETATTR: mask = KEY_OTH_SETATTR; break; } key = key_ref_to_ptr(key_ref); /* use the second 8-bits of permissions for keys the caller owns */ if (uid_eq(key->uid, cred->fsuid)) { kperm = key->perm >> 16; goto use_these_perms; } /* use the third 8-bits of permissions for keys the caller has a group * membership in common with */ if (gid_valid(key->gid) && key->perm & KEY_GRP_ALL) { if (gid_eq(key->gid, cred->fsgid)) { kperm = key->perm >> 8; goto use_these_perms; } ret = groups_search(cred->group_info, key->gid); if (ret) { kperm = key->perm >> 8; goto use_these_perms; } } /* otherwise use the least-significant 8-bits */ kperm = key->perm; use_these_perms: /* use the top 8-bits of permissions for keys the caller possesses * - possessor permissions are additive with other permissions */ if (is_key_possessed(key_ref)) kperm |= key->perm >> 24; if ((kperm & mask) != mask) return -EACCES; /* let LSM be the final arbiter */ lsm: return security_key_permission(key_ref, cred, need_perm); } EXPORT_SYMBOL(key_task_permission); /** * key_validate - Validate a key. * @key: The key to be validated. * * Check that a key is valid, returning 0 if the key is okay, -ENOKEY if the * key is invalidated, -EKEYREVOKED if the key's type has been removed or if * the key has been revoked or -EKEYEXPIRED if the key has expired. */ int key_validate(const struct key *key) { unsigned long flags = READ_ONCE(key->flags); time64_t expiry = READ_ONCE(key->expiry); if (flags & (1 << KEY_FLAG_INVALIDATED)) return -ENOKEY; /* check it's still accessible */ if (flags & ((1 << KEY_FLAG_REVOKED) | (1 << KEY_FLAG_DEAD))) return -EKEYREVOKED; /* check it hasn't expired */ if (expiry) { if (ktime_get_real_seconds() >= expiry) return -EKEYEXPIRED; } return 0; } EXPORT_SYMBOL(key_validate);
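As a usage illustration, here is a minimal, hypothetical sketch of how an in-kernel caller might combine the two exported helpers above; the can_search_key() wrapper is an assumption made for the example, not an existing kernel function.

#include <linux/cred.h>
#include <linux/key.h>

/* Hypothetical helper: may the current task search through this key? */
static int can_search_key(key_ref_t key_ref)
{
	const struct cred *cred = current_cred();
	struct key *key = key_ref_to_ptr(key_ref);
	int ret;

	/* Reject invalidated, revoked, dead or expired keys first. */
	ret = key_validate(key);
	if (ret < 0)
		return ret;

	/* 0 on success, -EACCES if the permission bits or an LSM deny it. */
	return key_task_permission(key_ref, cred, KEY_NEED_SEARCH);
}

key_task_permission() then selects the possessor, owner, group or other permission byte based on the supplied credentials, exactly as implemented above.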
/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM maple_tree #if !defined(_TRACE_MM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_MM_H #include <linux/tracepoint.h> struct ma_state; TRACE_EVENT(ma_op, TP_PROTO(const char *fn, struct ma_state *mas), TP_ARGS(fn, mas), TP_STRUCT__entry( __field(const char *, fn) __field(unsigned long, min) __field(unsigned long, max) __field(unsigned long, index) __field(unsigned long, last) __field(void *, node) ), TP_fast_assign( __entry->fn = fn; __entry->min = mas->min; __entry->max = mas->max; __entry->index = mas->index; __entry->last = mas->last; __entry->node = mas->node; ), TP_printk("%s\tNode: %p (%lu %lu) range: %lu-%lu", __entry->fn, (void *) __entry->node, (unsigned long) __entry->min, (unsigned long) __entry->max, (unsigned long) __entry->index, (unsigned long) __entry->last ) ) TRACE_EVENT(ma_read, TP_PROTO(const char *fn, struct ma_state *mas), TP_ARGS(fn, mas), TP_STRUCT__entry( __field(const char *, fn) __field(unsigned long, min) __field(unsigned long, max) __field(unsigned long, index) __field(unsigned long, last) __field(void *, node) ), TP_fast_assign( __entry->fn = fn; __entry->min = mas->min; __entry->max = mas->max; __entry->index = mas->index; __entry->last = mas->last; __entry->node = mas->node; ), TP_printk("%s\tNode: %p (%lu %lu) range: %lu-%lu", __entry->fn, (void *) __entry->node, (unsigned long) __entry->min, (unsigned long) __entry->max, (unsigned long) __entry->index, (unsigned long) __entry->last ) ) TRACE_EVENT(ma_write, TP_PROTO(const char *fn, struct ma_state *mas, unsigned long piv, void *val), TP_ARGS(fn, mas, piv, val), TP_STRUCT__entry( __field(const char *, fn) __field(unsigned long, min) __field(unsigned long, max) __field(unsigned long, index) __field(unsigned long, last) __field(unsigned long, piv) __field(void *, val) __field(void *, node) ), TP_fast_assign( __entry->fn = fn; __entry->min = mas->min; __entry->max = mas->max; __entry->index = mas->index; __entry->last = mas->last; __entry->piv = piv; __entry->val = val; __entry->node = mas->node; ), TP_printk("%s\tNode %p (%lu %lu) range:%lu-%lu piv (%lu) val %p", __entry->fn, (void *) __entry->node, (unsigned long) __entry->min, (unsigned long) __entry->max, (unsigned long) __entry->index, (unsigned long) __entry->last, (unsigned long) __entry->piv, (void *) __entry->val ) ) #endif /* _TRACE_MM_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
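For context, the TRACE_EVENT() definitions above only declare the events; they are emitted from the maple tree implementation itself. The sketch below shows the usual pattern; example_store() is a hypothetical caller added purely for illustration.

/* Exactly one .c file defines CREATE_TRACE_POINTS before including the
 * header, which instantiates the trace_ma_*() wrappers used below.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/maple_tree.h>

static void example_store(struct ma_state *mas, unsigned long piv, void *entry)
{
	trace_ma_write(__func__, mas, piv, entry);	/* node, range, pivot, value */
	/* ... perform the write ... */
	trace_ma_op(__func__, mas);			/* generic operation event */
}

Once compiled in, the events can be enabled at run time through tracefs, e.g. echo 1 > /sys/kernel/tracing/events/maple_tree/enable.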
// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP).
* * IPv4 specific functions * * code split from: * linux/ipv4/tcp.c * linux/ipv4/tcp_input.c * linux/ipv4/tcp_output.c * * See tcp.c for author information */ /* * Changes: * David S. Miller : New socket lookup architecture. * This code is dedicated to John Dyson. * David S. Miller : Change semantics of established hash, * half is devoted to TIME_WAIT sockets * and the rest go in the other half. * Andi Kleen : Add support for syncookies and fixed * some bugs: ip options weren't passed to * the TCP layer, missed a check for an * ACK bit. * Andi Kleen : Implemented fast path mtu discovery. * Fixed many serious bugs in the * request_sock handling and moved * most of it into the af independent code. * Added tail drop and some other bugfixes. * Added new listen semantics. * Mike McLagan : Routing by source * Juan Jose Ciarlante: ip_dynaddr bits * Andi Kleen: various fixes. * Vitaly E. Lavrov : Transparent proxy revived after year * coma. * Andi Kleen : Fix new listen. * Andi Kleen : Fix accept error reporting. * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind * a single port at the same time. */ #define pr_fmt(fmt) "TCP: " fmt #include <linux/bottom_half.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/module.h> #include <linux/random.h> #include <linux/cache.h> #include <linux/fips.h> #include <linux/jhash.h> #include <linux/init.h> #include <linux/times.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sock_diag.h> #include <net/aligned_data.h> #include <net/net_namespace.h> #include <net/icmp.h> #include <net/inet_hashtables.h> #include <net/tcp.h> #include <net/tcp_ecn.h> #include <net/transp_v6.h> #include <net/ipv6.h> #include <net/inet_common.h> #include <net/inet_ecn.h> #include <net/timewait_sock.h> #include <net/xfrm.h> #include <net/secure_seq.h> #include <net/busy_poll.h> #include <net/rstreason.h> #include <net/psp.h> #include <linux/inet.h> #include <linux/ipv6.h> #include <linux/stddef.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/inetdevice.h> #include <linux/btf_ids.h> #include <linux/skbuff_ref.h> #include <crypto/md5.h> #include <crypto/utils.h> #include <trace/events/tcp.h> #ifdef CONFIG_TCP_MD5SIG static void tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, __be32 daddr, __be32 saddr, const struct tcphdr *th); #endif struct inet_hashinfo tcp_hashinfo; static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; static DEFINE_MUTEX(tcp_exit_batch_mutex); INDIRECT_CALLABLE_SCOPE union tcp_seq_and_ts_off tcp_v4_init_seq_and_ts_off(const struct net *net, const struct sk_buff *skb) { return secure_tcp_seq_and_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, tcp_hdr(skb)->dest, tcp_hdr(skb)->source); } int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) { int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse); const struct inet_timewait_sock *tw = inet_twsk(sktw); const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); struct tcp_sock *tp = tcp_sk(sk); int ts_recent_stamp; u32 reuse_thresh; if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) reuse = 0; if (reuse == 2) { /* Still does not detect *everything* that goes through * lo, since we require a loopback src or dst address * or direct binding to 'lo' interface. 
*/ bool loopback = false; if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) loopback = true; #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == AF_INET6) { if (ipv6_addr_loopback(&tw->tw_v6_daddr) || ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) loopback = true; } else #endif { if (ipv4_is_loopback(tw->tw_daddr) || ipv4_is_loopback(tw->tw_rcv_saddr)) loopback = true; } if (!loopback) reuse = 0; } /* With PAWS, it is safe from the viewpoint of data integrity. Even without PAWS it is safe provided sequence spaces do not overlap i.e. at data rates <= 80Mbit/sec. Actually, the idea is close to VJ's one, only timestamp cache is held not per host, but per port pair and TW bucket is used as state holder. If TW bucket has been already destroyed we fall back to VJ's scheme and use initial timestamp retrieved from peer table. */ ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); reuse_thresh = READ_ONCE(tw->tw_entry_stamp) + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay); if (ts_recent_stamp && (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) { /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk * and releasing the bucket lock. */ if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt))) return 0; /* In case of repair and re-using TIME-WAIT sockets we still * want to be sure that it is safe as above but honor the * sequence numbers and time stamps set as part of the repair * process. * * Without this check re-using a TIME-WAIT socket with TCP * repair would accumulate a -1 on the repair assigned * sequence number. The first time it is reused the sequence * is -1, the second time -2, etc. This fixes that issue * without appearing to create any others. */ if (likely(!tp->repair)) { u32 seq = tcptw->tw_snd_nxt + 65535 + 2; if (!seq) seq = 1; WRITE_ONCE(tp->write_seq, seq); tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent); tp->rx_opt.ts_recent_stamp = ts_recent_stamp; } return 1; } return 0; } static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { /* This check is replicated from tcp_v4_connect() and intended to * prevent BPF program called below from accessing bytes that are out * of the bound specified by user in addr_len. */ if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; sock_owned_by_me(sk); return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len); } /* This will initiate an outgoing connection. 
*/ int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; struct inet_timewait_death_row *tcp_death_row; struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct ip_options_rcu *inet_opt; struct net *net = sock_net(sk); __be16 orig_sport, orig_dport; __be32 daddr, nexthop; struct flowi4 *fl4; struct rtable *rt; int err; if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; if (usin->sin_family != AF_INET) return -EAFNOSUPPORT; nexthop = daddr = usin->sin_addr.s_addr; inet_opt = rcu_dereference_protected(inet->inet_opt, lockdep_sock_is_held(sk)); if (inet_opt && inet_opt->opt.srr) { if (!daddr) return -EINVAL; nexthop = inet_opt->opt.faddr; } orig_sport = inet->inet_sport; orig_dport = usin->sin_port; fl4 = &inet->cork.fl.u.ip4; rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport, orig_dport, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); if (err == -ENETUNREACH) IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return err; } if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { ip_rt_put(rt); return -ENETUNREACH; } if (!inet_opt || !inet_opt->opt.srr) daddr = fl4->daddr; tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; if (!inet->inet_saddr) { err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); if (err) { ip_rt_put(rt); return err; } } else { sk_rcv_saddr_set(sk, inet->inet_saddr); } if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { /* Reset inherited state */ tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; if (likely(!tp->repair)) WRITE_ONCE(tp->write_seq, 0); } inet->inet_dport = usin->sin_port; sk_daddr_set(sk, daddr); inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk); if (inet_opt) inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen; tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; /* Socket identity is still unknown (sport may be zero). * However we set state to SYN-SENT and not releasing socket * lock select source port, enter ourselves into the hash tables and * complete initialization after this. */ tcp_set_state(sk, TCP_SYN_SENT); err = inet_hash_connect(tcp_death_row, sk); if (err) goto failure; sk_set_txhash(sk); rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, inet->inet_sport, inet->inet_dport, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; goto failure; } tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst); /* OK, now commit destination to socket. */ sk->sk_gso_type = SKB_GSO_TCPV4; sk_setup_caps(sk, &rt->dst); rt = NULL; if (likely(!tp->repair)) { union tcp_seq_and_ts_off st; st = secure_tcp_seq_and_ts_off(net, inet->inet_saddr, inet->inet_daddr, inet->inet_sport, usin->sin_port); if (!tp->write_seq) WRITE_ONCE(tp->write_seq, st.seq); WRITE_ONCE(tp->tsoffset, st.ts_off); } atomic_set(&inet->inet_id, get_random_u16()); if (tcp_fastopen_defer_connect(sk, &err)) return err; if (err) goto failure; err = tcp_connect(sk); if (err) goto failure; return 0; failure: /* * This unhashes the socket and releases the local port, * if necessary. */ tcp_set_state(sk, TCP_CLOSE); inet_bhash2_reset_saddr(sk); ip_rt_put(rt); sk->sk_route_caps = 0; inet->inet_dport = 0; return err; } /* * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. * It can be called through tcp_release_cb() if socket was owned by user * at the time tcp_v4_err() was called to handle ICMP message. 
*/ void tcp_v4_mtu_reduced(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); struct dst_entry *dst; u32 mtu, dmtu; if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) return; mtu = READ_ONCE(tcp_sk(sk)->mtu_info); dst = inet_csk_update_pmtu(sk, mtu); if (!dst) return; /* Something is about to be wrong... Remember soft error * for the case, if this connection will not able to recover. */ dmtu = dst4_mtu(dst); if (mtu < dmtu && ip_dont_fragment(sk, dst)) WRITE_ONCE(sk->sk_err_soft, EMSGSIZE); if (inet->pmtudisc != IP_PMTUDISC_DONT && ip_sk_accept_pmtu(sk) && inet_csk(sk)->icsk_pmtu_cookie > dmtu) { tcp_sync_mss(sk, dmtu); /* Resend the TCP packet because it's * clear that the old packet has been * dropped. This is the new "fast" path mtu * discovery. */ tcp_simple_retransmit(sk); } /* else let the usual retransmit timer handle it */ } static void do_redirect(struct sk_buff *skb, struct sock *sk) { struct dst_entry *dst = __sk_dst_check(sk, 0); if (dst) dst->ops->redirect(dst, sk, skb); } /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ void tcp_req_err(struct sock *sk, u32 seq, bool abort) { struct request_sock *req = inet_reqsk(sk); struct net *net = sock_net(sk); /* ICMPs are not backlogged, hence we cannot get * an established socket here. */ if (seq != tcp_rsk(req)->snt_isn) { __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); } else if (abort) { /* * Still in SYN_RECV, just remove it silently. * There is no good way to pass the error to the newly * created socket, and POSIX does not want network * errors returned from accept(). */ inet_csk_reqsk_queue_drop(req->rsk_listener, req); tcp_listendrop(req->rsk_listener); } reqsk_put(req); } /* TCP-LD (RFC 6069) logic */ void tcp_ld_RTO_revert(struct sock *sk, u32 seq) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; s32 remaining; u32 delta_us; if (sock_owned_by_user(sk)) return; if (seq != tp->snd_una || !icsk->icsk_retransmits || !icsk->icsk_backoff) return; skb = tcp_rtx_queue_head(sk); if (WARN_ON_ONCE(!skb)) return; icsk->icsk_backoff--; icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk)); tcp_mstamp_refresh(tp); delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); if (remaining > 0) { tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false); } else { /* RTO revert clocked out retransmission. * Will retransmit now. */ tcp_retransmit_timer(sk); } } /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should * be closed and the error returned to the user. If err > 0 * it's just the icmp type << 8 | icmp code. After adjustment * header points to the first 8 bytes of the tcp header. We need * to find the appropriate port. * * The locking strategy used here is very "optimistic". When * someone else accesses the socket the ICMP is just dropped * and for some paths there is no check at all. * A more general error queue to queue errors for later handling * is probably better. 
* */ int tcp_v4_err(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); struct net *net = dev_net_rcu(skb->dev); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct request_sock *fastopen; struct tcp_sock *tp; u32 seq, snd_una; struct sock *sk; int err; sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr, ntohs(th->source), inet_iif(skb), 0); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return -ENOENT; } if (sk->sk_state == TCP_TIME_WAIT) { /* To increase the counter of ignored icmps for TCP-AO */ tcp_ao_ignore_icmp(sk, AF_INET, type, code); inet_twsk_put(inet_twsk(sk)); return 0; } seq = ntohl(th->seq); if (sk->sk_state == TCP_NEW_SYN_RECV) { tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || type == ICMP_TIME_EXCEEDED || (type == ICMP_DEST_UNREACH && (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))); return 0; } if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) { sock_put(sk); return 0; } bh_lock_sock(sk); /* If too many ICMPs get dropped on busy * servers this needs to be solved differently. * We do take care of PMTU discovery (RFC1191) special case : * we can receive locally generated ICMP messages while socket is held. */ if (sock_owned_by_user(sk)) { if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); } if (sk->sk_state == TCP_CLOSE) goto out; if (static_branch_unlikely(&ip4_min_ttl)) { /* min_ttl can be changed concurrently from do_ip_setsockopt() */ if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); goto out; } } tp = tcp_sk(sk); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ fastopen = rcu_dereference(tp->fastopen_rsk); snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; if (sk->sk_state != TCP_LISTEN && !between(seq, snd_una, tp->snd_nxt)) { __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } switch (type) { case ICMP_REDIRECT: if (!sock_owned_by_user(sk)) do_redirect(skb, sk); goto out; case ICMP_SOURCE_QUENCH: /* Just silently ignore these. */ goto out; case ICMP_PARAMETERPROB: err = EPROTO; break; case ICMP_DEST_UNREACH: if (code > NR_ICMP_UNREACH) goto out; if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ /* We are not interested in TCP_LISTEN and open_requests * (SYN-ACKs send out by Linux are always <576bytes so * they should go through unfragmented). */ if (sk->sk_state == TCP_LISTEN) goto out; WRITE_ONCE(tp->mtu_info, info); if (!sock_owned_by_user(sk)) { tcp_v4_mtu_reduced(sk); } else { if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); } goto out; } err = icmp_err_convert[code].errno; /* check if this ICMP message allows revert of backoff. * (see RFC 6069) */ if (!fastopen && (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) tcp_ld_RTO_revert(sk, seq); break; case ICMP_TIME_EXCEEDED: err = EHOSTUNREACH; break; default: goto out; } switch (sk->sk_state) { case TCP_SYN_SENT: case TCP_SYN_RECV: /* Only in fast or simultaneous open. If a fast open socket is * already accepted it is treated as a connected one below. 
*/ if (fastopen && !fastopen->sk) break; ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); if (!sock_owned_by_user(sk)) tcp_done_with_error(sk, err); else WRITE_ONCE(sk->sk_err_soft, err); goto out; } /* If we've already connected we will keep trying * until we time out, or the user gives up. * * rfc1122 4.2.3.9 allows to consider as hard errors * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, * but it is obsoleted by pmtu discovery). * * Note, that in modern internet, where routing is unreliable * and in each dark corner broken firewalls sit, sending random * errors ordered by their masters even this two messages finally lose * their original sense (even Linux sends invalid PORT_UNREACHs) * * Now we are in compliance with RFCs. * --ANK (980905) */ if (!sock_owned_by_user(sk) && inet_test_bit(RECVERR, sk)) { WRITE_ONCE(sk->sk_err, err); sk_error_report(sk); } else { /* Only an error on timeout */ WRITE_ONCE(sk->sk_err_soft, err); } out: bh_unlock_sock(sk); sock_put(sk); return 0; } #define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32)) static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb, const struct tcp_ao_hdr *aoh, struct ip_reply_arg *arg, struct tcphdr *reply, __be32 reply_options[REPLY_OPTIONS_LEN]) { #ifdef CONFIG_TCP_AO int sdif = tcp_v4_sdif(skb); int dif = inet_iif(skb); int l3index = sdif ? dif : 0; bool allocated_traffic_key; struct tcp_ao_key *key; char *traffic_key; bool drop = true; u32 ao_sne = 0; u8 keyid; rcu_read_lock(); if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq), &key, &traffic_key, &allocated_traffic_key, &keyid, &ao_sne)) goto out; reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) | (aoh->rnext_keyid << 8) | keyid); arg->iov[0].iov_len += tcp_ao_len_aligned(key); reply->doff = arg->iov[0].iov_len / 4; if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1], key, traffic_key, (union tcp_ao_addr *)&ip_hdr(skb)->saddr, (union tcp_ao_addr *)&ip_hdr(skb)->daddr, reply, ao_sne)) goto out; drop = false; out: rcu_read_unlock(); if (allocated_traffic_key) kfree(traffic_key); return drop; #else return true; #endif } /* * This routine will send an RST to the other tcp. * * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) * for reset. * Answer: if a packet caused RST, it is not for a socket * existing in our system, if it is matched to a socket, * it is just duplicate segment or bug in other side's TCP. * So that we build reply only basing on parameters * arrived with segment. * Exception: precedence violation. We do not implement it in any case. */ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, enum sk_rst_reason reason) { const struct tcphdr *th = tcp_hdr(skb); struct { struct tcphdr th; __be32 opt[REPLY_OPTIONS_LEN]; } rep; const __u8 *md5_hash_location = NULL; const struct tcp_ao_hdr *aoh; struct ip_reply_arg arg; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *key = NULL; unsigned char newhash[16]; struct sock *sk1 = NULL; #endif u64 transmit_time = 0; struct sock *ctl_sk; struct net *net; u32 txhash = 0; /* Never send a reset in response to a reset. */ if (th->rst) return; /* If sk not NULL, it means we did a successful lookup and incoming * route had to be correct. prequeue might have dropped our dst. */ if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) return; /* Swap the send and the receive. 
*/ memset(&rep, 0, sizeof(rep)); rep.th.dest = th->source; rep.th.source = th->dest; rep.th.doff = sizeof(struct tcphdr) / 4; rep.th.rst = 1; if (th->ack) { rep.th.seq = th->ack_seq; } else { rep.th.ack = 1; rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + skb->len - (th->doff << 2)); } memset(&arg, 0, sizeof(arg)); arg.iov[0].iov_base = (unsigned char *)&rep; arg.iov[0].iov_len = sizeof(rep.th); net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb); /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh)) return; if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt)) return; #ifdef CONFIG_TCP_MD5SIG rcu_read_lock(); if (sk && sk_fullsock(sk)) { const union tcp_md5_addr *addr; int l3index; /* sdif set, means packet ingressed via a device * in an L3 domain and inet_iif is set to it. */ l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); } else if (md5_hash_location) { const union tcp_md5_addr *addr; int sdif = tcp_v4_sdif(skb); int dif = inet_iif(skb); int l3index; /* * active side is lost. Try to find listening socket through * source port, and then find md5 key through listening socket. * we are not loose security here: * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. */ sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr, th->source, ip_hdr(skb)->daddr, ntohs(th->source), dif, sdif); /* don't send rst if it can't find key */ if (!sk1) goto out; /* sdif set, means packet ingressed via a device * in an L3 domain and dif is set to it. */ l3index = sdif ? dif : 0; addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); if (!key) goto out; tcp_v4_md5_hash_skb(newhash, key, NULL, skb); if (crypto_memneq(md5_hash_location, newhash, 16)) goto out; } if (key) { rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); /* Update length and the length the header thinks exists */ arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; rep.th.doff = arg.iov[0].iov_len / 4; tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], key, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &rep.th); } #endif /* Can't co-exist with TCPMD5, hence check rep.opt[0] */ if (rep.opt[0] == 0) { __be32 mrst = mptcp_reset_option(skb); if (mrst) { rep.opt[0] = mrst; arg.iov[0].iov_len += sizeof(mrst); rep.th.doff = arg.iov[0].iov_len / 4; } } arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, /* XXX */ arg.iov[0].iov_len, IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; /* When socket is gone, all binding information is lost. * routing might fail in this case. No choice here, if we choose to force * input interface, we will misroute in case of asymmetric route. */ if (sk) arg.bound_dev_if = sk->sk_bound_dev_if; trace_tcp_send_reset(sk, skb, reason); BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != offsetof(struct inet_timewait_sock, tw_bound_dev_if)); /* ECN bits of TW reset are cleared */ arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK; arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); local_bh_disable(); local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); sock_net_set(ctl_sk, net); if (sk) { ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 
inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); transmit_time = tcp_transmit_time(sk); xfrm_sk_clone_policy(ctl_sk, sk); txhash = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_txhash : sk->sk_txhash; } else { ctl_sk->sk_mark = 0; ctl_sk->sk_priority = 0; } ip_send_unicast_reply(ctl_sk, sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len, transmit_time, txhash); xfrm_sk_free_policy(ctl_sk); sock_net_set(ctl_sk, &init_net); __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); local_bh_enable(); #ifdef CONFIG_TCP_MD5SIG out: rcu_read_unlock(); #endif } /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states outside socket context is ugly, certainly. What can I do? */ static void tcp_v4_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_key *key, int reply_flags, u8 tos, u32 txhash) { const struct tcphdr *th = tcp_hdr(skb); struct { struct tcphdr th; __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)]; } rep; struct net *net = sock_net(sk); struct ip_reply_arg arg; struct sock *ctl_sk; u64 transmit_time; memset(&rep.th, 0, sizeof(struct tcphdr)); memset(&arg, 0, sizeof(arg)); arg.iov[0].iov_base = (unsigned char *)&rep; arg.iov[0].iov_len = sizeof(rep.th); if (tsecr) { rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); rep.opt[1] = htonl(tsval); rep.opt[2] = htonl(tsecr); arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; } /* Swap the send and the receive. */ rep.th.dest = th->source; rep.th.source = th->dest; rep.th.doff = arg.iov[0].iov_len / 4; rep.th.seq = htonl(seq); rep.th.ack_seq = htonl(ack); rep.th.ack = 1; rep.th.window = htons(win); #ifdef CONFIG_TCP_MD5SIG if (tcp_key_is_md5(key)) { int offset = (tsecr) ? 3 : 0; rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; rep.th.doff = arg.iov[0].iov_len/4; tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], key->md5_key, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &rep.th); } #endif #ifdef CONFIG_TCP_AO if (tcp_key_is_ao(key)) { int offset = (tsecr) ? 3 : 0; rep.opt[offset++] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key->ao_key) << 16) | (key->ao_key->sndid << 8) | key->rcv_next); arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key); rep.th.doff = arg.iov[0].iov_len / 4; tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset], key->ao_key, key->traffic_key, (union tcp_ao_addr *)&ip_hdr(skb)->saddr, (union tcp_ao_addr *)&ip_hdr(skb)->daddr, &rep.th, key->sne); } #endif arg.flags = reply_flags; arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, /* XXX */ arg.iov[0].iov_len, IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; if (oif) arg.bound_dev_if = oif; arg.tos = tos; arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); local_bh_disable(); local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); sock_net_set(ctl_sk, net); ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 
inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); transmit_time = tcp_transmit_time(sk); ip_send_unicast_reply(ctl_sk, sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len, transmit_time, txhash); sock_net_set(ctl_sk, &init_net); __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); local_bh_enable(); } static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb, enum tcp_tw_status tw_status) { struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); struct tcp_key key = {}; u8 tos = tw->tw_tos; /* Cleaning only ECN bits of TW ACKs of oow data or is paws_reject, * while not cleaning ECN bits of other TW ACKs to avoid these ACKs * being placed in a different service queues (Classic rather than L4S) */ if (tw_status == TCP_TW_ACK_OOW) tos &= ~INET_ECN_MASK; #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao_info; if (static_branch_unlikely(&tcp_ao_needed.key)) { /* FIXME: the segment to-be-acked is not verified yet */ ao_info = rcu_dereference(tcptw->ao_info); if (ao_info) { const struct tcp_ao_hdr *aoh; if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) { inet_twsk_put(tw); return; } if (aoh) key.ao_key = tcp_ao_established_key(sk, ao_info, aoh->rnext_keyid, -1); } } if (key.ao_key) { struct tcp_ao_key *rnext_key; key.traffic_key = snd_other_key(key.ao_key); key.sne = READ_ONCE(ao_info->snd_sne); rnext_key = READ_ONCE(ao_info->rnext_key); key.rcv_next = rnext_key->rcvid; key.type = TCP_KEY_AO; #else if (0) { #endif } else if (static_branch_tcp_md5()) { key.md5_key = tcp_twsk_md5_key(tcptw); if (key.md5_key) key.type = TCP_KEY_MD5; } tcp_v4_send_ack(sk, skb, tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt), tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_tw_tsval(tcptw), READ_ONCE(tcptw->tw_ts_recent), tw->tw_bound_dev_if, &key, tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, tos, tw->tw_txhash); inet_twsk_put(tw); } static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { struct tcp_key key = {}; /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt; #ifdef CONFIG_TCP_AO if (static_branch_unlikely(&tcp_ao_needed.key) && tcp_rsk_used_ao(req)) { const union tcp_md5_addr *addr; const struct tcp_ao_hdr *aoh; int l3index; /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) return; if (!aoh) return; addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, aoh->rnext_keyid, -1); if (unlikely(!key.ao_key)) { /* Send ACK with any matching MKT for the peer */ key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1); /* Matching key disappeared (user removed the key?) * let the handshake timeout. 
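 * Without any MKT left for this peer we cannot sign the ACK, and an
 * unsigned ACK would only fail the peer's TCP-AO verification anyway.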
*/ if (!key.ao_key) { net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n", addr, ntohs(tcp_hdr(skb)->source), &ip_hdr(skb)->daddr, ntohs(tcp_hdr(skb)->dest)); return; } } key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC); if (!key.traffic_key) return; key.type = TCP_KEY_AO; key.rcv_next = aoh->keyid; tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req); #else if (0) { #endif } else if (static_branch_tcp_md5()) { const union tcp_md5_addr *addr; int l3index; addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); if (key.md5_key) key.type = TCP_KEY_MD5; } /* Cleaning ECN bits of TW ACKs of oow data or is paws_reject */ tcp_v4_send_ack(sk, skb, seq, tcp_rsk(req)->rcv_nxt, tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, tcp_rsk_tsval(tcp_rsk(req)), req->ts_recent, 0, &key, inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, ip_hdr(skb)->tos & ~INET_ECN_MASK, READ_ONCE(tcp_rsk(req)->txhash)); if (tcp_key_is_ao(&key)) kfree(key.traffic_key); } /* * Send a SYN-ACK after having received a SYN. * This still operates on a request_sock only, not on a big * socket. */ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; int err = -1; struct sk_buff *skb; u8 tos; /* First, grab a route. */ if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); if (skb) { tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); tos = READ_ONCE(inet_sk(sk)->tos); if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | (tos & INET_ECN_MASK); if (!INET_ECN_is_capable(tos) && tcp_bpf_ca_needs_ecn((struct sock *)req)) tos |= INET_ECN_ECT_0; rcu_read_lock(); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, rcu_dereference(ireq->ireq_opt), tos); rcu_read_unlock(); err = net_xmit_eval(err); } return err; } /* * IPv4 request_sock destructor. */ static void tcp_v4_reqsk_destructor(struct request_sock *req) { kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); } #ifdef CONFIG_TCP_MD5SIG /* * RFC2385 MD5 checksumming requires a mapping of * IP address->MD5 Key. * We need to maintain these in the sk structure. */ DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ); static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) { if (!old) return true; /* l3index always overrides non-l3index */ if (old->l3index && new->l3index == 0) return false; if (old->l3index == 0 && new->l3index) return true; return old->prefixlen < new->prefixlen; } /* Find the Key structure for an address. 
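 * More-specific keys win: a key bound to an L3 domain (VRF) is preferred
 * over an unbound one, and among the remaining candidates the longest
 * matching prefix is chosen (see better_md5_match() above).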
*/ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family, bool any_l3index) { const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; const struct tcp_md5sig_info *md5sig; __be32 mask; struct tcp_md5sig_key *best_match = NULL; bool match; /* caller either holds rcu_read_lock() or socket lock */ md5sig = rcu_dereference_check(tp->md5sig_info, lockdep_sock_is_held(sk)); if (!md5sig) return NULL; hlist_for_each_entry_rcu(key, &md5sig->head, node, lockdep_sock_is_held(sk)) { if (key->family != family) continue; if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index) continue; if (family == AF_INET) { mask = inet_make_mask(key->prefixlen); match = (key->addr.a4.s_addr & mask) == (addr->a4.s_addr & mask); #if IS_ENABLED(CONFIG_IPV6) } else if (family == AF_INET6) { match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, key->prefixlen); #endif } else { match = false; } if (match && better_md5_match(best_match, key)) best_match = key; } return best_match; } static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags) { const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; unsigned int size = sizeof(struct in_addr); const struct tcp_md5sig_info *md5sig; /* caller either holds rcu_read_lock() or socket lock */ md5sig = rcu_dereference_check(tp->md5sig_info, lockdep_sock_is_held(sk)); if (!md5sig) return NULL; #if IS_ENABLED(CONFIG_IPV6) if (family == AF_INET6) size = sizeof(struct in6_addr); #endif hlist_for_each_entry_rcu(key, &md5sig->head, node, lockdep_sock_is_held(sk)) { if (key->family != family) continue; if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX)) continue; if (key->l3index != l3index) continue; if (!memcmp(&key->addr, addr, size) && key->prefixlen == prefixlen) return key; } return NULL; } struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk) { const union tcp_md5_addr *addr; int l3index; l3index = l3mdev_master_ifindex_by_index(sock_net(sk), addr_sk->sk_bound_dev_if); addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); } static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_info *md5sig; md5sig = kmalloc_obj(*md5sig, gfp); if (!md5sig) return -ENOMEM; sk_gso_disable(sk); INIT_HLIST_HEAD(&md5sig->head); rcu_assign_pointer(tp->md5sig_info, md5sig); return 0; } /* This can be called on a newly created socket, from other files */ static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags, const u8 *newkey, u8 newkeylen, gfp_t gfp) { /* Add Key to the list */ struct tcp_md5sig_key *key; struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_info *md5sig; key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); if (key) { /* Pre-existing entry - just update that one. * Note that the key might be used concurrently. * data_race() is telling kcsan that we do not care of * key mismatches, since changing MD5 key on live flows * can lead to packet drops. */ data_race(memcpy(key->key, newkey, newkeylen)); /* Pairs with READ_ONCE() in tcp_md5_hash_key(). * Also note that a reader could catch new key->keylen value * but old key->key[], this is the reason we use __GFP_ZERO * at sock_kmalloc() time below these lines. 
*/ WRITE_ONCE(key->keylen, newkeylen); return 0; } md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); if (!key) return -ENOMEM; memcpy(key->key, newkey, newkeylen); key->keylen = newkeylen; key->family = family; key->prefixlen = prefixlen; key->l3index = l3index; key->flags = flags; memcpy(&key->addr, addr, (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) : sizeof(struct in_addr)); hlist_add_head_rcu(&key->node, &md5sig->head); return 0; } int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags, const u8 *newkey, u8 newkeylen) { struct tcp_sock *tp = tcp_sk(sk); if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { if (fips_enabled) { pr_warn_once("TCP-MD5 support is disabled due to FIPS\n"); return -EOPNOTSUPP; } if (tcp_md5sig_info_add(sk, GFP_KERNEL)) return -ENOMEM; if (!static_branch_inc(&tcp_md5_needed.key)) { struct tcp_md5sig_info *md5sig; md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); rcu_assign_pointer(tp->md5sig_info, NULL); kfree_rcu(md5sig, rcu); return -EUSERS; } } return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags, newkey, newkeylen, GFP_KERNEL); } int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, struct tcp_md5sig_key *key) { struct tcp_sock *tp = tcp_sk(sk); if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) return -ENOMEM; if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) { struct tcp_md5sig_info *md5sig; md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); net_warn_ratelimited("Too many TCP-MD5 keys in the system\n"); rcu_assign_pointer(tp->md5sig_info, NULL); kfree_rcu(md5sig, rcu); return -EUSERS; } } return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, key->flags, key->key, key->keylen, sk_gfp_mask(sk, GFP_ATOMIC)); } int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags) { struct tcp_md5sig_key *key; key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); if (!key) return -ENOENT; hlist_del_rcu(&key->node); atomic_sub(sizeof(*key), &sk->sk_omem_alloc); kfree_rcu(key, rcu); return 0; } void tcp_clear_md5_list(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; struct hlist_node *n; struct tcp_md5sig_info *md5sig; md5sig = rcu_dereference_protected(tp->md5sig_info, 1); hlist_for_each_entry_safe(key, n, &md5sig->head, node) { hlist_del(&key->node); atomic_sub(sizeof(*key), &sk->sk_omem_alloc); kfree(key); } } static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct tcp_md5sig cmd; struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; const union tcp_md5_addr *addr; u8 prefixlen = 32; int l3index = 0; bool l3flag; u8 flags; if (optlen < sizeof(cmd)) return -EINVAL; if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) return -EFAULT; if (sin->sin_family != AF_INET) return -EINVAL; flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; if (optname == TCP_MD5SIG_EXT && cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { prefixlen = cmd.tcpm_prefixlen; if (prefixlen > 32) return -EINVAL; } if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && 
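/* Binding a key to a device needs both a non-zero tcpm_ifindex and the
 * TCP_MD5SIG_FLAG_IFINDEX flag, and the ifindex must resolve to an L3
 * master (VRF) device so that the key is scoped to that L3 domain.
 * Userspace typically requests this via setsockopt(TCP_MD5SIG_EXT) with
 * tcpm_flags including TCP_MD5SIG_FLAG_IFINDEX and tcpm_ifindex set to
 * the VRF device index.
 */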
cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); if (dev && netif_is_l3_master(dev)) l3index = dev->ifindex; rcu_read_unlock(); /* ok to reference set/not set outside of rcu; * right now device MUST be an L3 master */ if (!dev || !l3index) return -EINVAL; } addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; if (!cmd.tcpm_keylen) return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags); if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) return -EINVAL; /* Don't allow keys for peers that have a matching TCP-AO key. * See the comment in tcp_ao_add_cmd() */ if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false)) return -EKEYREJECTED; return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, cmd.tcpm_key, cmd.tcpm_keylen); } static void tcp_v4_md5_hash_headers(struct md5_ctx *ctx, __be32 daddr, __be32 saddr, const struct tcphdr *th, int nbytes) { struct { struct tcp4_pseudohdr ip; struct tcphdr tcp; } h; h.ip.saddr = saddr; h.ip.daddr = daddr; h.ip.pad = 0; h.ip.protocol = IPPROTO_TCP; h.ip.len = cpu_to_be16(nbytes); h.tcp = *th; h.tcp.check = 0; md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp)); } static noinline_for_stack void tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, __be32 daddr, __be32 saddr, const struct tcphdr *th) { struct md5_ctx ctx; md5_init(&ctx); tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2); tcp_md5_hash_key(&ctx, key); md5_final(&ctx, md5_hash); } noinline_for_stack void tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, const struct sock *sk, const struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); __be32 saddr, daddr; struct md5_ctx ctx; if (sk) { /* valid for establish/request sockets */ saddr = sk->sk_rcv_saddr; daddr = sk->sk_daddr; } else { const struct iphdr *iph = ip_hdr(skb); saddr = iph->saddr; daddr = iph->daddr; } md5_init(&ctx); tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, skb->len); tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2); tcp_md5_hash_key(&ctx, key); md5_final(&ctx, md5_hash); } #endif static void tcp_v4_init_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) { struct inet_request_sock *ireq = inet_rsk(req); struct net *net = sock_net(sk_listener); sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); } static struct dst_entry *tcp_v4_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, struct request_sock *req, u32 tw_isn) { tcp_v4_init_req(req, sk, skb); if (security_inet_conn_request(sk, skb, req)) return NULL; return inet_csk_route_req(sk, &fl->u.ip4, req); } struct request_sock_ops tcp_request_sock_ops __read_mostly = { .family = PF_INET, .obj_size = sizeof(struct tcp_request_sock), .send_ack = tcp_v4_reqsk_send_ack, .destructor = tcp_v4_reqsk_destructor, .send_reset = tcp_v4_send_reset, }; const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .mss_clamp = TCP_MSS_DEFAULT, #ifdef CONFIG_TCP_MD5SIG .req_md5_lookup = tcp_v4_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, #endif #ifdef CONFIG_TCP_AO .ao_lookup = tcp_v4_ao_lookup_rsk, .ao_calc_key = tcp_v4_ao_calc_key_rsk, .ao_synack_hash = tcp_v4_ao_synack_hash, #endif #ifdef CONFIG_SYN_COOKIES .cookie_init_seq = cookie_v4_init_sequence, #endif .route_req = tcp_v4_route_req, .init_seq_and_ts_off = 
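/* generates the initial sequence number and timestamp offset for new request sockets */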
tcp_v4_init_seq_and_ts_off, .send_synack = tcp_v4_send_synack, }; int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { /* Never answer to SYNs send to broadcast or multicast */ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) goto drop; return tcp_conn_request(&tcp_request_sock_ops, &tcp_request_sock_ipv4_ops, sk, skb); drop: tcp_listendrop(sk); return 0; } /* * The three way handshake has completed - we got a valid synack - * now create the new socket. */ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req, void (*opt_child_init)(struct sock *newsk, const struct sock *sk)) { struct inet_request_sock *ireq; bool found_dup_sk = false; struct inet_sock *newinet; struct tcp_sock *newtp; struct sock *newsk; #ifdef CONFIG_TCP_MD5SIG const union tcp_md5_addr *addr; struct tcp_md5sig_key *key; int l3index; #endif struct ip_options_rcu *inet_opt; if (sk_acceptq_is_full(sk)) goto exit_overflow; newsk = tcp_create_openreq_child(sk, req, skb); if (!newsk) goto exit_nonewsk; newsk->sk_gso_type = SKB_GSO_TCPV4; inet_sk_rx_dst_set(newsk, skb); newtp = tcp_sk(newsk); newinet = inet_sk(newsk); ireq = inet_rsk(req); inet_opt = rcu_dereference(ireq->ireq_opt); RCU_INIT_POINTER(newinet->inet_opt, inet_opt); newinet->mc_index = inet_iif(skb); newinet->mc_ttl = ip_hdr(skb)->ttl; newinet->rcv_tos = ip_hdr(skb)->tos; inet_csk(newsk)->icsk_ext_hdr_len = 0; if (inet_opt) inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; atomic_set(&newinet->inet_id, get_random_u16()); /* Set ToS of the new socket based upon the value of incoming SYN. * ECT bits are set later in tcp_init_transfer(). */ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; if (!dst) { dst = inet_csk_route_child_sock(sk, newsk, req); if (!dst) goto put_and_exit; } else { /* syncookie case : see end of cookie_v4_check() */ } sk_setup_caps(newsk, dst); #if IS_ENABLED(CONFIG_IPV6) if (opt_child_init) opt_child_init(newsk, sk); #endif tcp_ca_openreq_child(newsk, dst); tcp_sync_mss(newsk, dst4_mtu(dst)); newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); tcp_initialize_rcv_mss(newsk); #ifdef CONFIG_TCP_MD5SIG l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); /* Copy over the MD5 key from the original socket */ addr = (union tcp_md5_addr *)&newinet->inet_daddr; key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); if (key && !tcp_rsk_used_ao(req)) { if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key)) goto put_and_exit; sk_gso_disable(newsk); } #endif #ifdef CONFIG_TCP_AO if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET)) goto put_and_exit; /* OOM, release back memory */ #endif if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), &found_dup_sk); if (likely(*own_req)) { tcp_move_syn(newtp, req); ireq->ireq_opt = NULL; } else { newinet->inet_opt = NULL; if (!req_unhash && found_dup_sk) { /* This code path should only be executed in the * syncookie case only */ bh_unlock_sock(newsk); sock_put(newsk); newsk = NULL; } } return newsk; exit_overflow: NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); exit_nonewsk: dst_release(dst); exit: tcp_listendrop(sk); return NULL; put_and_exit: newinet->inet_opt = NULL; inet_csk_prepare_forced_close(newsk); tcp_done(newsk); goto exit; } static struct sock *tcp_v4_cookie_check(struct sock 
*sk, struct sk_buff *skb) { #ifdef CONFIG_SYN_COOKIES const struct tcphdr *th = tcp_hdr(skb); if (!th->syn) sk = cookie_v4_check(sk, skb); #endif return sk; } u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, struct tcphdr *th, u32 *cookie) { u16 mss = 0; #ifdef CONFIG_SYN_COOKIES mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, &tcp_request_sock_ipv4_ops, sk, th); if (mss) { *cookie = __cookie_v4_init_sequence(iph, th, &mss); tcp_synq_overflow(sk); } #endif return mss; } INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, u32)); /* The socket must have it's spinlock held when we get * here, unless it is a TCP_LISTEN socket. * * We have a potential double-lock case here, so even when * doing backlog processing we use the BH locking scheme. * This is because we cannot sleep with the original spinlock * held. */ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason reason; struct sock *rsk; reason = psp_sk_rx_policy_check(sk, skb); if (reason) goto err_discard; if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ struct dst_entry *dst; dst = rcu_dereference_protected(sk->sk_rx_dst, lockdep_sock_is_held(sk)); sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); if (dst && unlikely(dst != skb_dst(skb))) { if (sk->sk_rx_dst_ifindex != skb->skb_iif || !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, dst, 0)) { RCU_INIT_POINTER(sk->sk_rx_dst, NULL); dst_release(dst); } } tcp_rcv_established(sk, skb); return 0; } if (tcp_checksum_complete(skb)) goto csum_err; if (sk->sk_state == TCP_LISTEN) { struct sock *nsk = tcp_v4_cookie_check(sk, skb); if (!nsk) return 0; if (nsk != sk) { reason = tcp_child_process(sk, nsk, skb); if (reason) { rsk = nsk; goto reset; } return 0; } } else sock_rps_save_rxhash(sk, skb); reason = tcp_rcv_state_process(sk, skb); if (reason) { rsk = sk; goto reset; } return 0; reset: tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason)); discard: sk_skb_reason_drop(sk, skb, reason); /* Be careful here. If this function gets more complicated and * gcc suffers from register pressure on the x86, sk (in %ebx) * might be destroyed here. This current version compiles correctly, * but you have been warned. */ return 0; csum_err: reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); err_discard: TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; } EXPORT_SYMBOL(tcp_v4_do_rcv); enum skb_drop_reason tcp_add_backlog(struct sock *sk, struct sk_buff *skb) { u32 tail_gso_size, tail_gso_segs; struct skb_shared_info *shinfo; const struct tcphdr *th; struct tcphdr *thtail; struct sk_buff *tail; unsigned int hdrlen; bool fragstolen; u32 gso_segs; u32 gso_size; u64 limit; int delta; int err; /* In case all data was pulled from skb frags (in __pskb_pull_tail()), * we can fix skb->truesize to its real value to avoid future drops. * This is valid because skb is not yet charged to the socket. * It has been noticed pure SACK packets were sometimes dropped * (if cooked by drivers without copybreak feature). */ skb_condense(skb); tcp_cleanup_skb(skb); if (unlikely(tcp_checksum_complete(skb))) { bh_unlock_sock(sk); trace_tcp_bad_csum(skb); __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); return SKB_DROP_REASON_TCP_CSUM; } /* Attempt coalescing to last skb in backlog, even if we are * above the limits. * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 
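 * Coalescing is only attempted when the new segment directly follows the
 * backlog tail (tail end_seq == skb seq), carries the same IP dsfield,
 * neither segment has SYN/RST/URG set, both have ACK set, the
 * ECE/CWR/AE bits agree and the TCP header options are byte-identical
 * (among other checks below).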
*/ th = (const struct tcphdr *)skb->data; hdrlen = th->doff * 4; tail = sk->sk_backlog.tail; if (!tail) goto no_coalesce; thtail = (struct tcphdr *)tail->data; if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || ((TCP_SKB_CB(tail)->tcp_flags | TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || !((TCP_SKB_CB(tail)->tcp_flags & TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || ((TCP_SKB_CB(tail)->tcp_flags ^ TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) || !tcp_skb_can_collapse_rx(tail, skb) || thtail->doff != th->doff || memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) || /* prior to PSP Rx policy check, retain exact PSP metadata */ psp_skb_coalesce_diff(tail, skb)) goto no_coalesce; __skb_pull(skb, hdrlen); shinfo = skb_shinfo(skb); gso_size = shinfo->gso_size ?: skb->len; gso_segs = shinfo->gso_segs ?: 1; shinfo = skb_shinfo(tail); tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); tail_gso_segs = shinfo->gso_segs ?: 1; if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; thtail->window = th->window; } /* We have to update both TCP_SKB_CB(tail)->tcp_flags and * thtail->fin, so that the fast path in tcp_rcv_established() * is not entered if we append a packet with a FIN. * SYN, RST, URG are not present. * ACK is set on both packets. * PSH : we do not really care in TCP stack, * at least for 'GRO' packets. */ thtail->fin |= th->fin; TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; if (TCP_SKB_CB(skb)->has_rxtstamp) { TCP_SKB_CB(tail)->has_rxtstamp = true; tail->tstamp = skb->tstamp; skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; } /* Not as strict as GRO. We only need to carry mss max value */ shinfo->gso_size = max(gso_size, tail_gso_size); shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); sk->sk_backlog.len += delta; __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGCOALESCE); kfree_skb_partial(skb, fragstolen); return SKB_NOT_DROPPED_YET; } __skb_push(skb, hdrlen); no_coalesce: /* sk->sk_backlog.len is reset only at the end of __release_sock(). * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach * sk_rcvbuf in normal conditions. */ limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1; limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1; /* Only socket owner can try to collapse/prune rx queues * to reduce memory overhead, so add a little headroom here. * Few sockets backlog are possibly concurrently non empty. */ limit += 64 * 1024; limit = min_t(u64, limit, UINT_MAX); err = sk_add_backlog(sk, skb, limit); if (unlikely(err)) { bh_unlock_sock(sk); if (err == -ENOMEM) { __NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); return SKB_DROP_REASON_PFMEMALLOC; } __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); return SKB_DROP_REASON_SOCKET_BACKLOG; } return SKB_NOT_DROPPED_YET; } static void tcp_v4_restore_cb(struct sk_buff *skb) { memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, sizeof(struct inet_skb_parm)); } static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, const struct tcphdr *th) { /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() * barrier() makes sure compiler wont play fool^Waliasing games. 
*/ memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), sizeof(struct inet_skb_parm)); barrier(); TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff * 4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th); TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->has_rxtstamp = skb->tstamp || skb_hwtstamps(skb)->hwtstamp; } /* * From tcp_input.c */ int tcp_v4_rcv(struct sk_buff *skb) { struct net *net = dev_net_rcu(skb->dev); enum skb_drop_reason drop_reason; enum tcp_tw_status tw_status; int sdif = inet_sdif(skb); int dif = inet_iif(skb); const struct iphdr *iph; const struct tcphdr *th; struct sock *sk = NULL; bool refcounted; int ret; u32 isn; drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (skb->pkt_type != PACKET_HOST) goto discard_it; /* Count it even if it's bad */ __TCP_INC_STATS(net, TCP_MIB_INSEGS); if (!pskb_may_pull(skb, sizeof(struct tcphdr))) goto discard_it; th = (const struct tcphdr *)skb->data; if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; goto bad_packet; } if (!pskb_may_pull(skb, th->doff * 4)) goto discard_it; /* An explanation is required here, I think. * Packet length and doff are validated by header prediction, * provided case of th->doff==0 is eliminated. * So, we defer the checks. */ if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) goto csum_error; th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); lookup: sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source, th->dest, sdif, &refcounted); if (!sk) goto no_tcp_socket; if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; if (sk->sk_state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); bool req_stolen = false; struct sock *nsk; sk = req->rsk_listener; if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) drop_reason = SKB_DROP_REASON_XFRM_POLICY; else drop_reason = tcp_inbound_hash(sk, req, skb, &iph->saddr, &iph->daddr, AF_INET, dif, sdif); if (unlikely(drop_reason)) { sk_drops_skbadd(sk, skb); reqsk_put(req); goto discard_it; } if (tcp_checksum_complete(skb)) { reqsk_put(req); goto csum_error; } if (unlikely(sk->sk_state != TCP_LISTEN)) { nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); if (!nsk) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } sk = nsk; /* reuseport_migrate_sock() has already held one sk_refcnt * before returning. */ } else { /* We own a reference on the listener, increase it again * as we might lose it too soon. */ sock_hold(sk); } refcounted = true; nsk = NULL; drop_reason = tcp_filter(sk, skb); if (!drop_reason) { th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); tcp_v4_fill_cb(skb, iph, th); nsk = tcp_check_req(sk, skb, req, false, &req_stolen, &drop_reason); } if (!nsk) { reqsk_put(req); if (req_stolen) { /* Another cpu got exclusive access to req * and created a full blown socket. * Try to feed this packet to this socket * instead of discarding it. 
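 * Restore the original IP control block and redo the lookup, which
 * should now find the freshly created child socket.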
*/ tcp_v4_restore_cb(skb); sock_put(sk); goto lookup; } goto discard_and_relse; } nf_reset_ct(skb); if (nsk == sk) { reqsk_put(req); tcp_v4_restore_cb(skb); } else { drop_reason = tcp_child_process(sk, nsk, skb); if (drop_reason) { enum sk_rst_reason rst_reason; rst_reason = sk_rst_convert_drop_reason(drop_reason); tcp_v4_send_reset(nsk, skb, rst_reason); goto discard_and_relse; } sock_put(sk); return 0; } } process: if (static_branch_unlikely(&ip4_min_ttl)) { /* min_ttl can be changed concurrently from do_ip_setsockopt() */ if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); drop_reason = SKB_DROP_REASON_TCP_MINTTL; goto discard_and_relse; } } if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { drop_reason = SKB_DROP_REASON_XFRM_POLICY; goto discard_and_relse; } drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr, AF_INET, dif, sdif); if (drop_reason) goto discard_and_relse; nf_reset_ct(skb); drop_reason = tcp_filter(sk, skb); if (drop_reason) goto discard_and_relse; th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); tcp_v4_fill_cb(skb, iph, th); skb->dev = NULL; if (sk->sk_state == TCP_LISTEN) { ret = tcp_v4_do_rcv(sk, skb); goto put_and_return; } sk_incoming_cpu_update(sk); bh_lock_sock_nested(sk); tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { ret = tcp_v4_do_rcv(sk, skb); } else { drop_reason = tcp_add_backlog(sk, skb); if (drop_reason) goto discard_and_relse; } bh_unlock_sock(sk); put_and_return: if (refcounted) sock_put(sk); return ret; no_tcp_socket: drop_reason = SKB_DROP_REASON_NO_SOCKET; if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard_it; tcp_v4_fill_cb(skb, iph, th); if (tcp_checksum_complete(skb)) { csum_error: drop_reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); bad_packet: __TCP_INC_STATS(net, TCP_MIB_INERRS); } else { tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); } discard_it: SKB_DR_OR(drop_reason, NOT_SPECIFIED); /* Discard frame. 
*/ sk_skb_reason_drop(sk, skb, drop_reason); return 0; discard_and_relse: sk_drops_skbadd(sk, skb); if (refcounted) sock_put(sk); goto discard_it; do_time_wait: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { drop_reason = SKB_DROP_REASON_XFRM_POLICY; inet_twsk_put(inet_twsk(sk)); goto discard_it; } tcp_v4_fill_cb(skb, iph, th); if (tcp_checksum_complete(skb)) { inet_twsk_put(inet_twsk(sk)); goto csum_error; } tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, &drop_reason); switch (tw_status) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th), iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb), sdif); if (sk2) { inet_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; tcp_v4_restore_cb(skb); refcounted = false; __this_cpu_write(tcp_tw_isn, isn); goto process; } drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb); if (drop_reason) break; } /* to ACK */ fallthrough; case TCP_TW_ACK: case TCP_TW_ACK_OOW: tcp_v4_timewait_ack(sk, skb, tw_status); break; case TCP_TW_RST: tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); inet_twsk_deschedule_put(inet_twsk(sk)); goto discard_it; case TCP_TW_SUCCESS:; } goto discard_it; } static struct timewait_sock_ops tcp_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp_timewait_sock), }; void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); if (dst && dst_hold_safe(dst)) { rcu_assign_pointer(sk->sk_rx_dst, dst); sk->sk_rx_dst_ifindex = skb->skb_iif; } } const struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, .rebuild_header = inet_sk_rebuild_header, .sk_rx_dst_set = inet_sk_rx_dst_set, .conn_request = tcp_v4_conn_request, .syn_recv_sock = tcp_v4_syn_recv_sock, .net_header_len = sizeof(struct iphdr), .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, .mtu_reduced = tcp_v4_mtu_reduced, }; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { #ifdef CONFIG_TCP_MD5SIG .md5_lookup = tcp_v4_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, .md5_parse = tcp_v4_parse_md5_keys, #endif #ifdef CONFIG_TCP_AO .ao_lookup = tcp_v4_ao_lookup, .calc_ao_hash = tcp_v4_ao_hash_skb, .ao_parse = tcp_v4_parse_ao, .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, #endif }; static void tcp4_destruct_sock(struct sock *sk) { tcp_md5_destruct_sock(sk); tcp_ao_destroy_sock(sk, false); inet_sock_destruct(sk); } #endif /* NOTE: A lot of things set to zero explicitly by call to * sk_alloc() so need not be done here. */ static int tcp_v4_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_init_sock(sk); icsk->icsk_af_ops = &ipv4_specific; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; sk->sk_destruct = tcp4_destruct_sock; #endif return 0; } static void tcp_release_user_frags(struct sock *sk) { #ifdef CONFIG_PAGE_POOL unsigned long index; void *netmem; xa_for_each(&sk->sk_user_frags, index, netmem) WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem)); #endif } void tcp_v4_destroy_sock(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); tcp_release_user_frags(sk); xa_destroy(&sk->sk_user_frags); trace_tcp_destroy_sock(sk); tcp_clear_xmit_timers(sk); tcp_cleanup_congestion_control(sk); tcp_cleanup_ulp(sk); /* Cleanup up the write buffer. 
*/ tcp_write_queue_purge(sk); /* Check if we want to disable active TFO */ tcp_fastopen_active_disable_ofo_check(sk); /* Cleans up our, hopefully empty, out_of_order_queue. */ skb_rbtree_purge(&tp->out_of_order_queue); /* Clean up a referenced TCP bind bucket. */ if (inet_csk(sk)->icsk_bind_hash) inet_put_port(sk); BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); /* If socket is aborted during connect operation */ tcp_free_fastopen_req(tp); tcp_fastopen_destroy_cipher(sk); tcp_saved_syn_free(tp); sk_sockets_allocated_dec(sk); } #ifdef CONFIG_PROC_FS /* Proc filesystem TCP sock list dumping. */ static unsigned short seq_file_family(const struct seq_file *seq); static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) { unsigned short family = seq_file_family(seq); /* AF_UNSPEC is used as a match all */ return ((family == AF_UNSPEC || family == sk->sk_family) && net_eq(sock_net(sk), seq_file_net(seq))); } /* Find a non empty bucket (starting from st->bucket) * and return the first sk from it. */ static void *listening_get_first(struct seq_file *seq) { struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct tcp_iter_state *st = seq->private; st->offset = 0; for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { struct inet_listen_hashbucket *ilb2; struct hlist_nulls_node *node; struct sock *sk; ilb2 = &hinfo->lhash2[st->bucket]; if (hlist_nulls_empty(&ilb2->nulls_head)) continue; spin_lock(&ilb2->lock); sk_nulls_for_each(sk, node, &ilb2->nulls_head) { if (seq_sk_match(seq, sk)) return sk; } spin_unlock(&ilb2->lock); } return NULL; } /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). * If "cur" is the last one in the st->bucket, * call listening_get_first() to return the first sk of the next * non empty bucket. */ static void *listening_get_next(struct seq_file *seq, void *cur) { struct tcp_iter_state *st = seq->private; struct inet_listen_hashbucket *ilb2; struct hlist_nulls_node *node; struct inet_hashinfo *hinfo; struct sock *sk = cur; ++st->num; ++st->offset; sk = sk_nulls_next(sk); sk_nulls_for_each_from(sk, node) { if (seq_sk_match(seq, sk)) return sk; } hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; ilb2 = &hinfo->lhash2[st->bucket]; spin_unlock(&ilb2->lock); ++st->bucket; return listening_get_first(seq); } static void *listening_get_idx(struct seq_file *seq, loff_t *pos) { struct tcp_iter_state *st = seq->private; void *rc; st->bucket = 0; st->offset = 0; rc = listening_get_first(seq); while (rc && *pos) { rc = listening_get_next(seq, rc); --*pos; } return rc; } static inline bool empty_bucket(struct inet_hashinfo *hinfo, const struct tcp_iter_state *st) { return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); } /* * Get first established socket starting from bucket given in st->bucket. * If st->bucket is zero, the very first socket in the hash is returned. 
*/ static void *established_get_first(struct seq_file *seq) { struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct tcp_iter_state *st = seq->private; st->offset = 0; for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { struct sock *sk; struct hlist_nulls_node *node; spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); cond_resched(); /* Lockless fast path for the common case of empty buckets */ if (empty_bucket(hinfo, st)) continue; spin_lock_bh(lock); sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { if (seq_sk_match(seq, sk)) return sk; } spin_unlock_bh(lock); } return NULL; } static void *established_get_next(struct seq_file *seq, void *cur) { struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct tcp_iter_state *st = seq->private; struct hlist_nulls_node *node; struct sock *sk = cur; ++st->num; ++st->offset; sk = sk_nulls_next(sk); sk_nulls_for_each_from(sk, node) { if (seq_sk_match(seq, sk)) return sk; } spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); ++st->bucket; return established_get_first(seq); } static void *established_get_idx(struct seq_file *seq, loff_t pos) { struct tcp_iter_state *st = seq->private; void *rc; st->bucket = 0; rc = established_get_first(seq); while (rc && pos) { rc = established_get_next(seq, rc); --pos; } return rc; } static void *tcp_get_idx(struct seq_file *seq, loff_t pos) { void *rc; struct tcp_iter_state *st = seq->private; st->state = TCP_SEQ_STATE_LISTENING; rc = listening_get_idx(seq, &pos); if (!rc) { st->state = TCP_SEQ_STATE_ESTABLISHED; rc = established_get_idx(seq, pos); } return rc; } static void *tcp_seek_last_pos(struct seq_file *seq) { struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct tcp_iter_state *st = seq->private; int bucket = st->bucket; int offset = st->offset; int orig_num = st->num; void *rc = NULL; switch (st->state) { case TCP_SEQ_STATE_LISTENING: if (st->bucket > hinfo->lhash2_mask) break; rc = listening_get_first(seq); while (offset-- && rc && bucket == st->bucket) rc = listening_get_next(seq, rc); if (rc) break; st->bucket = 0; st->state = TCP_SEQ_STATE_ESTABLISHED; fallthrough; case TCP_SEQ_STATE_ESTABLISHED: if (st->bucket > hinfo->ehash_mask) break; rc = established_get_first(seq); while (offset-- && rc && bucket == st->bucket) rc = established_get_next(seq, rc); } st->num = orig_num; return rc; } void *tcp_seq_start(struct seq_file *seq, loff_t *pos) { struct tcp_iter_state *st = seq->private; void *rc; if (*pos && *pos == st->last_pos) { rc = tcp_seek_last_pos(seq); if (rc) goto out; } st->state = TCP_SEQ_STATE_LISTENING; st->num = 0; st->bucket = 0; st->offset = 0; rc = *pos ? 
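/* position 0 is the SEQ_START_TOKEN header line, so the first real
 * socket corresponds to *pos - 1
 */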
tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; out: st->last_pos = *pos; return rc; } void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct tcp_iter_state *st = seq->private; void *rc = NULL; if (v == SEQ_START_TOKEN) { rc = tcp_get_idx(seq, 0); goto out; } switch (st->state) { case TCP_SEQ_STATE_LISTENING: rc = listening_get_next(seq, v); if (!rc) { st->state = TCP_SEQ_STATE_ESTABLISHED; st->bucket = 0; st->offset = 0; rc = established_get_first(seq); } break; case TCP_SEQ_STATE_ESTABLISHED: rc = established_get_next(seq, v); break; } out: ++*pos; st->last_pos = *pos; return rc; } void tcp_seq_stop(struct seq_file *seq, void *v) { struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct tcp_iter_state *st = seq->private; switch (st->state) { case TCP_SEQ_STATE_LISTENING: if (v != SEQ_START_TOKEN) spin_unlock(&hinfo->lhash2[st->bucket].lock); break; case TCP_SEQ_STATE_ESTABLISHED: if (v) spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); break; } } static void get_openreq4(const struct request_sock *req, struct seq_file *f, int i) { const struct inet_request_sock *ireq = inet_rsk(req); long delta = req->rsk_timer.expires - jiffies; seq_printf(f, "%4d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", i, ireq->ir_loc_addr, ireq->ir_num, ireq->ir_rmt_addr, ntohs(ireq->ir_rmt_port), TCP_SYN_RECV, 0, 0, /* could print option size, but that is af dependent. */ 1, /* timers active (only the expire timer) */ jiffies_delta_to_clock_t(delta), req->num_timeout, from_kuid_munged(seq_user_ns(f), sk_uid(req->rsk_listener)), 0, /* non standard timer */ 0, /* open_requests have no inode */ 0, req); } static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) { int timer_active; unsigned long timer_expires; const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); const struct inet_sock *inet = inet_sk(sk); const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; __be32 dest = inet->inet_daddr; __be32 src = inet->inet_rcv_saddr; __u16 destp = ntohs(inet->inet_dport); __u16 srcp = ntohs(inet->inet_sport); u8 icsk_pending; int rx_queue; int state; icsk_pending = smp_load_acquire(&icsk->icsk_pending); if (icsk_pending == ICSK_TIME_RETRANS || icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; timer_expires = tcp_timeout_expires(sk); } else if (icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; timer_expires = tcp_timeout_expires(sk); } else if (timer_pending(&icsk->icsk_keepalive_timer)) { timer_active = 2; timer_expires = icsk->icsk_keepalive_timer.expires; } else { timer_active = 0; timer_expires = jiffies; } state = inet_sk_state_load(sk); if (state == TCP_LISTEN) rx_queue = READ_ONCE(sk->sk_ack_backlog); else /* Because we don't lock the socket, * we might find a transient negative value. 
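 * hence the clamp to zero below.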
*/ rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq), 0); seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " "%08X %5u %8d %llu %d %pK %lu %lu %u %u %d", i, src, srcp, dest, destp, state, READ_ONCE(tp->write_seq) - tp->snd_una, rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), READ_ONCE(icsk->icsk_retransmits), from_kuid_munged(seq_user_ns(f), sk_uid(sk)), READ_ONCE(icsk->icsk_probes_out), sock_i_ino(sk), refcount_read(&sk->sk_refcnt), sk, jiffies_to_clock_t(icsk->icsk_rto), jiffies_to_clock_t(icsk->icsk_ack.ato), (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), tcp_snd_cwnd(tp), state == TCP_LISTEN ? fastopenq->max_qlen : (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)); } static void get_timewait4_sock(const struct inet_timewait_sock *tw, struct seq_file *f, int i) { long delta = tw->tw_timer.expires - jiffies; __be32 dest, src; __u16 destp, srcp; dest = tw->tw_daddr; src = tw->tw_rcv_saddr; destp = ntohs(tw->tw_dport); srcp = ntohs(tw->tw_sport); seq_printf(f, "%4d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0, 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, refcount_read(&tw->tw_refcnt), tw); } #define TMPSZ 150 static int tcp4_seq_show(struct seq_file *seq, void *v) { struct tcp_iter_state *st; struct sock *sk = v; seq_setwidth(seq, TMPSZ - 1); if (v == SEQ_START_TOKEN) { seq_puts(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode"); goto out; } st = seq->private; if (sk->sk_state == TCP_TIME_WAIT) get_timewait4_sock(v, seq, st->num); else if (sk->sk_state == TCP_NEW_SYN_RECV) get_openreq4(v, seq, st->num); else get_tcp4_sock(v, seq, st->num); out: seq_pad(seq, '\n'); return 0; } #ifdef CONFIG_BPF_SYSCALL union bpf_tcp_iter_batch_item { struct sock *sk; __u64 cookie; }; struct bpf_tcp_iter_state { struct tcp_iter_state state; unsigned int cur_sk; unsigned int end_sk; unsigned int max_sk; union bpf_tcp_iter_batch_item *batch; }; struct bpf_iter__tcp { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct sock_common *, sk_common); uid_t uid __aligned(8); }; static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, struct sock_common *sk_common, uid_t uid) { struct bpf_iter__tcp ctx; meta->seq_num--; /* skip SEQ_START_TOKEN */ ctx.meta = meta; ctx.sk_common = sk_common; ctx.uid = uid; return bpf_iter_run_prog(prog, &ctx); } static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) { union bpf_tcp_iter_batch_item *item; unsigned int cur_sk = iter->cur_sk; __u64 cookie; /* Remember the cookies of the sockets we haven't seen yet, so we can * pick up where we left off next time around. 
*/ while (cur_sk < iter->end_sk) { item = &iter->batch[cur_sk++]; cookie = sock_gen_cookie(item->sk); sock_gen_put(item->sk); item->cookie = cookie; } } static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, unsigned int new_batch_sz, gfp_t flags) { union bpf_tcp_iter_batch_item *new_batch; new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, flags | __GFP_NOWARN); if (!new_batch) return -ENOMEM; memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk); kvfree(iter->batch); iter->batch = new_batch; iter->max_sk = new_batch_sz; return 0; } static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk, union bpf_tcp_iter_batch_item *cookies, int n_cookies) { struct hlist_nulls_node *node; struct sock *sk; int i; for (i = 0; i < n_cookies; i++) { sk = first_sk; sk_nulls_for_each_from(sk, node) if (cookies[i].cookie == atomic64_read(&sk->sk_cookie)) return sk; } return NULL; } static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq) { struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct bpf_tcp_iter_state *iter = seq->private; struct tcp_iter_state *st = &iter->state; unsigned int find_cookie = iter->cur_sk; unsigned int end_cookie = iter->end_sk; int resume_bucket = st->bucket; struct sock *sk; if (end_cookie && find_cookie == end_cookie) ++st->bucket; sk = listening_get_first(seq); iter->cur_sk = 0; iter->end_sk = 0; if (sk && st->bucket == resume_bucket && end_cookie) { sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie], end_cookie - find_cookie); if (!sk) { spin_unlock(&hinfo->lhash2[st->bucket].lock); ++st->bucket; sk = listening_get_first(seq); } } return sk; } static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq) { struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct bpf_tcp_iter_state *iter = seq->private; struct tcp_iter_state *st = &iter->state; unsigned int find_cookie = iter->cur_sk; unsigned int end_cookie = iter->end_sk; int resume_bucket = st->bucket; struct sock *sk; if (end_cookie && find_cookie == end_cookie) ++st->bucket; sk = established_get_first(seq); iter->cur_sk = 0; iter->end_sk = 0; if (sk && st->bucket == resume_bucket && end_cookie) { sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie], end_cookie - find_cookie); if (!sk) { spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); ++st->bucket; sk = established_get_first(seq); } } return sk; } static struct sock *bpf_iter_tcp_resume(struct seq_file *seq) { struct bpf_tcp_iter_state *iter = seq->private; struct tcp_iter_state *st = &iter->state; struct sock *sk = NULL; switch (st->state) { case TCP_SEQ_STATE_LISTENING: sk = bpf_iter_tcp_resume_listening(seq); if (sk) break; st->bucket = 0; st->state = TCP_SEQ_STATE_ESTABLISHED; fallthrough; case TCP_SEQ_STATE_ESTABLISHED: sk = bpf_iter_tcp_resume_established(seq); break; } return sk; } static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, struct sock **start_sk) { struct bpf_tcp_iter_state *iter = seq->private; struct hlist_nulls_node *node; unsigned int expected = 1; struct sock *sk; sock_hold(*start_sk); iter->batch[iter->end_sk++].sk = *start_sk; sk = sk_nulls_next(*start_sk); *start_sk = NULL; sk_nulls_for_each_from(sk, node) { if (seq_sk_match(seq, sk)) { if (iter->end_sk < iter->max_sk) { sock_hold(sk); iter->batch[iter->end_sk++].sk = sk; } else if (!*start_sk) { /* Remember where we left off. 
*/ *start_sk = sk; } expected++; } } return expected; } static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, struct sock **start_sk) { struct bpf_tcp_iter_state *iter = seq->private; struct hlist_nulls_node *node; unsigned int expected = 1; struct sock *sk; sock_hold(*start_sk); iter->batch[iter->end_sk++].sk = *start_sk; sk = sk_nulls_next(*start_sk); *start_sk = NULL; sk_nulls_for_each_from(sk, node) { if (seq_sk_match(seq, sk)) { if (iter->end_sk < iter->max_sk) { sock_hold(sk); iter->batch[iter->end_sk++].sk = sk; } else if (!*start_sk) { /* Remember where we left off. */ *start_sk = sk; } expected++; } } return expected; } static unsigned int bpf_iter_fill_batch(struct seq_file *seq, struct sock **start_sk) { struct bpf_tcp_iter_state *iter = seq->private; struct tcp_iter_state *st = &iter->state; if (st->state == TCP_SEQ_STATE_LISTENING) return bpf_iter_tcp_listening_batch(seq, start_sk); else return bpf_iter_tcp_established_batch(seq, start_sk); } static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq) { struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct bpf_tcp_iter_state *iter = seq->private; struct tcp_iter_state *st = &iter->state; if (st->state == TCP_SEQ_STATE_LISTENING) spin_unlock(&hinfo->lhash2[st->bucket].lock); else spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); } static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) { struct bpf_tcp_iter_state *iter = seq->private; unsigned int expected; struct sock *sk; int err; sk = bpf_iter_tcp_resume(seq); if (!sk) return NULL; /* Done */ expected = bpf_iter_fill_batch(seq, &sk); if (likely(iter->end_sk == expected)) goto done; /* Batch size was too small. */ bpf_iter_tcp_unlock_bucket(seq); bpf_iter_tcp_put_batch(iter); err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2, GFP_USER); if (err) return ERR_PTR(err); sk = bpf_iter_tcp_resume(seq); if (!sk) return NULL; /* Done */ expected = bpf_iter_fill_batch(seq, &sk); if (likely(iter->end_sk == expected)) goto done; /* Batch size was still too small. Hold onto the lock while we try * again with a larger batch to make sure the current bucket's size * does not change in the meantime. */ err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT); if (err) { bpf_iter_tcp_unlock_bucket(seq); return ERR_PTR(err); } expected = bpf_iter_fill_batch(seq, &sk); WARN_ON_ONCE(iter->end_sk != expected); done: bpf_iter_tcp_unlock_bucket(seq); return iter->batch[0].sk; } static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) { /* bpf iter does not support lseek, so it always * continue from where it was stop()-ped. */ if (*pos) return bpf_iter_tcp_batch(seq); return SEQ_START_TOKEN; } static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bpf_tcp_iter_state *iter = seq->private; struct tcp_iter_state *st = &iter->state; struct sock *sk; /* Whenever seq_next() is called, the iter->cur_sk is * done with seq_show(), so advance to the next sk in * the batch. */ if (iter->cur_sk < iter->end_sk) { /* Keeping st->num consistent in tcp_iter_state. * bpf_iter_tcp does not use st->num. * meta.seq_num is used instead. */ st->num++; sock_gen_put(iter->batch[iter->cur_sk++].sk); } if (iter->cur_sk < iter->end_sk) sk = iter->batch[iter->cur_sk].sk; else sk = bpf_iter_tcp_batch(seq); ++*pos; /* Keeping st->last_pos consistent in tcp_iter_state. * bpf iter does not do lseek, so st->last_pos always equals to *pos. 
*/ st->last_pos = *pos; return sk; } static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; struct sock *sk = v; uid_t uid; int ret; if (v == SEQ_START_TOKEN) return 0; if (sk_fullsock(sk)) lock_sock(sk); if (unlikely(sk_unhashed(sk))) { ret = SEQ_SKIP; goto unlock; } if (sk->sk_state == TCP_TIME_WAIT) { uid = 0; } else if (sk->sk_state == TCP_NEW_SYN_RECV) { const struct request_sock *req = v; uid = from_kuid_munged(seq_user_ns(seq), sk_uid(req->rsk_listener)); } else { uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk)); } meta.seq = seq; prog = bpf_iter_get_info(&meta, false); ret = tcp_prog_seq_show(prog, &meta, v, uid); unlock: if (sk_fullsock(sk)) release_sock(sk); return ret; } static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) { struct bpf_tcp_iter_state *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)tcp_prog_seq_show(prog, &meta, v, 0); } if (iter->cur_sk < iter->end_sk) bpf_iter_tcp_put_batch(iter); } static const struct seq_operations bpf_iter_tcp_seq_ops = { .show = bpf_iter_tcp_seq_show, .start = bpf_iter_tcp_seq_start, .next = bpf_iter_tcp_seq_next, .stop = bpf_iter_tcp_seq_stop, }; #endif static unsigned short seq_file_family(const struct seq_file *seq) { const struct tcp_seq_afinfo *afinfo; #ifdef CONFIG_BPF_SYSCALL /* Iterated from bpf_iter. Let the bpf prog to filter instead. */ if (seq->op == &bpf_iter_tcp_seq_ops) return AF_UNSPEC; #endif /* Iterated from proc fs */ afinfo = pde_data(file_inode(seq->file)); return afinfo->family; } static const struct seq_operations tcp4_seq_ops = { .show = tcp4_seq_show, .start = tcp_seq_start, .next = tcp_seq_next, .stop = tcp_seq_stop, }; static struct tcp_seq_afinfo tcp4_seq_afinfo = { .family = AF_INET, }; static int __net_init tcp4_proc_init_net(struct net *net) { if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) return -ENOMEM; return 0; } static void __net_exit tcp4_proc_exit_net(struct net *net) { remove_proc_entry("tcp", net->proc_net); } static struct pernet_operations tcp4_net_ops = { .init = tcp4_proc_init_net, .exit = tcp4_proc_exit_net, }; int __init tcp4_proc_init(void) { return register_pernet_subsys(&tcp4_net_ops); } void tcp4_proc_exit(void) { unregister_pernet_subsys(&tcp4_net_ops); } #endif /* CONFIG_PROC_FS */ struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, .close = tcp_close, .pre_connect = tcp_v4_pre_connect, .connect = tcp_v4_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v4_init_sock, .destroy = tcp_v4_destroy_sock, .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, .splice_eof = tcp_splice_eof, .backlog_rcv = tcp_v4_do_rcv, .release_cb = tcp_release_cb, .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .put_port = inet_put_port, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = tcp_bpf_update_proto, #endif .enter_memory_pressure = tcp_enter_memory_pressure, .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, .memory_allocated = &net_aligned_data.tcp_memory_allocated, .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 
.memory_pressure = &tcp_memory_pressure, .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), .freeptr_offset = offsetof(struct tcp_sock, inet_conn.icsk_inet.sk.sk_freeptr), .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, .h.hashinfo = NULL, .no_autobind = true, .diag_destroy = tcp_abort, }; EXPORT_SYMBOL(tcp_prot); static void __net_exit tcp_sk_exit(struct net *net) { if (net->ipv4.tcp_congestion_control) bpf_module_put(net->ipv4.tcp_congestion_control, net->ipv4.tcp_congestion_control->owner); } static void __net_init tcp_set_hashinfo(struct net *net) { struct inet_hashinfo *hinfo; unsigned int ehash_entries; struct net *old_net; if (net_eq(net, &init_net)) goto fallback; old_net = current->nsproxy->net_ns; ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); if (!ehash_entries) goto fallback; ehash_entries = roundup_pow_of_two(ehash_entries); hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); if (!hinfo) { pr_warn("Failed to allocate TCP ehash (entries: %u) " "for a netns, fallback to the global one\n", ehash_entries); fallback: hinfo = &tcp_hashinfo; ehash_entries = tcp_hashinfo.ehash_mask + 1; } net->ipv4.tcp_death_row.hashinfo = hinfo; net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); } static int __net_init tcp_sk_init(struct net *net) { net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON; net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; net->ipv4.sysctl_tcp_syncookies = 1; net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; net->ipv4.sysctl_tcp_orphan_retries = 0; net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; net->ipv4.sysctl_tcp_tw_reuse = 2; net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC; net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); tcp_set_hashinfo(net); net->ipv4.sysctl_tcp_sack = 1; net->ipv4.sysctl_tcp_window_scaling = 1; net->ipv4.sysctl_tcp_timestamps = 1; net->ipv4.sysctl_tcp_early_retrans = 3; net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. 
*/ net->ipv4.sysctl_tcp_retrans_collapse = 1; net->ipv4.sysctl_tcp_max_reordering = 300; net->ipv4.sysctl_tcp_dsack = 1; net->ipv4.sysctl_tcp_app_win = 31; net->ipv4.sysctl_tcp_adv_win_scale = 1; net->ipv4.sysctl_tcp_frto = 2; net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC; /* This limits the percentage of the congestion window which we * will allow a single TSO frame to consume. Building TSO frames * which are too large can cause TCP streams to be bursty. */ net->ipv4.sysctl_tcp_tso_win_divisor = 3; /* Default TSQ limit of 4 MB */ net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20; /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */ net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; net->ipv4.sysctl_tcp_min_tso_segs = 2; net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ net->ipv4.sysctl_tcp_min_rtt_wlen = 300; net->ipv4.sysctl_tcp_autocorking = 1; net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; if (net != &init_net) { memcpy(net->ipv4.sysctl_tcp_rmem, init_net.ipv4.sysctl_tcp_rmem, sizeof(init_net.ipv4.sysctl_tcp_rmem)); memcpy(net->ipv4.sysctl_tcp_wmem, init_net.ipv4.sysctl_tcp_wmem, sizeof(init_net.ipv4.sysctl_tcp_wmem)); } net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; net->ipv4.sysctl_tcp_comp_sack_slack_ns = 10 * NSEC_PER_USEC; net->ipv4.sysctl_tcp_comp_sack_nr = 44; net->ipv4.sysctl_tcp_comp_sack_rtt_percent = 33; net->ipv4.sysctl_tcp_backlog_ack_defer = 1; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; atomic_set(&net->ipv4.tfo_active_disable_times, 0); /* Set default values for PLB */ net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; /* Default congestion threshold for PLB to mark a round is 50% */ net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; /* Reno is always built in */ if (!net_eq(net, &init_net) && bpf_try_module_get(init_net.ipv4.tcp_congestion_control, init_net.ipv4.tcp_congestion_control->owner)) net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; else net->ipv4.tcp_congestion_control = &tcp_reno; net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; net->ipv4.sysctl_tcp_shrink_window = 0; net->ipv4.sysctl_tcp_pingpong_thresh = 1; net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC; return 0; } static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) { struct net *net; /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work * and failed setup_net error unwinding path are serialized. * * tcp_twsk_purge() handles twsk in any dead netns, not just those in * net_exit_list, the thread that dismantles a particular twsk must * do so without other thread progressing to refcount_dec_and_test() of * tcp_death_row.tw_refcount. 
*/ mutex_lock(&tcp_exit_batch_mutex); tcp_twsk_purge(net_exit_list); list_for_each_entry(net, net_exit_list, exit_list) { inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); tcp_fastopen_ctx_destroy(net); } mutex_unlock(&tcp_exit_batch_mutex); } static struct pernet_operations __net_initdata tcp_sk_ops = { .init = tcp_sk_init, .exit = tcp_sk_exit, .exit_batch = tcp_sk_exit_batch, }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, struct sock_common *sk_common, uid_t uid) #define INIT_BATCH_SZ 16 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) { struct bpf_tcp_iter_state *iter = priv_data; int err; err = bpf_iter_init_seq_net(priv_data, aux); if (err) return err; err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER); if (err) { bpf_iter_fini_seq_net(priv_data); return err; } return 0; } static void bpf_iter_fini_tcp(void *priv_data) { struct bpf_tcp_iter_state *iter = priv_data; bpf_iter_fini_seq_net(priv_data); kvfree(iter->batch); } static const struct bpf_iter_seq_info tcp_seq_info = { .seq_ops = &bpf_iter_tcp_seq_ops, .init_seq_private = bpf_iter_init_tcp, .fini_seq_private = bpf_iter_fini_tcp, .seq_priv_size = sizeof(struct bpf_tcp_iter_state), }; static const struct bpf_func_proto * bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_setsockopt: return &bpf_sk_setsockopt_proto; case BPF_FUNC_getsockopt: return &bpf_sk_getsockopt_proto; default: return NULL; } } static struct bpf_iter_reg tcp_reg_info = { .target = "tcp", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__tcp, sk_common), PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, }, .get_func_proto = bpf_iter_tcp_get_func_proto, .seq_info = &tcp_seq_info, }; static void __init bpf_iter_register(void) { tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; if (bpf_iter_reg_target(&tcp_reg_info)) pr_warn("Warning: could not register bpf iterator tcp\n"); } #endif void __init tcp_v4_init(void) { int cpu, res; for_each_possible_cpu(cpu) { struct sock *sk; res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, IPPROTO_TCP, &init_net); if (res) panic("Failed to create the TCP control socket.\n"); sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); /* Please enforce IP_DF and IPID==0 for RST and * ACK sent in SYN-RECV and TIME-WAIT state. */ inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; sk->sk_clockid = CLOCK_MONOTONIC; per_cpu(ipv4_tcp_sk.sock, cpu) = sk; } if (register_pernet_subsys(&tcp_sk_ops)) panic("Failed to create the TCP control socket.\n"); #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) bpf_iter_register(); #endif } |
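The iterator above batches sockets bucket by bucket and resumes by socket cookie after each stop()/start() cycle; the consumer side is a SEC("iter/tcp") BPF program fed through the bpf_iter__tcp context defined by DEFINE_BPF_ITER_FUNC above. Below is a minimal sketch of such a consumer, assuming the usual libbpf workflow (vmlinux.h, bpf_helpers.h and the BPF_SEQ_PRINTF macro from bpf_tracing.h); file and function names here are illustrative, not taken from this source.

/* dump_tcp.bpf.c -- sketch of an iter/tcp consumer (illustrative names) */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("iter/tcp")
int dump_tcp(struct bpf_iter__tcp *ctx)
{
	struct sock_common *sk_common = ctx->sk_common;
	struct seq_file *seq = ctx->meta->seq;

	/* NULL sk_common corresponds to the final stop() invocation. */
	if (!sk_common)
		return 0;

	/* seq_num is 0 the first time the program is invoked for this read. */
	if (ctx->meta->seq_num == 0)
		BPF_SEQ_PRINTF(seq, "family state uid\n");

	BPF_SEQ_PRINTF(seq, "%d %d %u\n",
		       sk_common->skc_family, sk_common->skc_state, ctx->uid);
	return 0;
}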
/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (c) 2023 Isovalent */ #ifndef __NET_TCX_H #define __NET_TCX_H #include <linux/bpf.h> #include <linux/bpf_mprog.h> #include <net/sch_generic.h> struct mini_Qdisc; struct tcx_entry { struct mini_Qdisc __rcu *miniq; struct bpf_mprog_bundle bundle; u32 miniq_active; struct rcu_head rcu; }; struct tcx_link { struct bpf_link link; struct net_device *dev; }; static inline void tcx_set_ingress(struct sk_buff *skb, bool ingress) { #ifdef CONFIG_NET_XGRESS skb->tc_at_ingress = ingress; #endif } #ifdef CONFIG_NET_XGRESS static inline struct tcx_entry *tcx_entry(struct bpf_mprog_entry *entry) { struct bpf_mprog_bundle *bundle = entry->parent; return container_of(bundle, struct tcx_entry, bundle); } static inline struct tcx_link *tcx_link(const struct bpf_link *link) { return container_of(link, struct tcx_link, link); } void tcx_inc(void); void tcx_dec(void); static inline void tcx_entry_sync(void) { /* bpf_mprog_entry got a/b swapped, therefore ensure that * there are no inflight users on the old one anymore. */ synchronize_rcu(); } static inline void tcx_entry_update(struct net_device *dev, struct bpf_mprog_entry *entry, bool ingress) { ASSERT_RTNL(); if (ingress) rcu_assign_pointer(dev->tcx_ingress, entry); else rcu_assign_pointer(dev->tcx_egress, entry); } static inline struct bpf_mprog_entry * tcx_entry_fetch(struct net_device *dev, bool ingress) { ASSERT_RTNL(); if (ingress) return rcu_dereference_rtnl(dev->tcx_ingress); else return rcu_dereference_rtnl(dev->tcx_egress); } static inline struct bpf_mprog_entry *tcx_entry_create_noprof(void) { struct tcx_entry *tcx = kzalloc_noprof(sizeof(*tcx), GFP_KERNEL); if (tcx) { bpf_mprog_bundle_init(&tcx->bundle); return &tcx->bundle.a; } return NULL; } #define tcx_entry_create(...)
alloc_hooks(tcx_entry_create_noprof(__VA_ARGS__)) static inline void tcx_entry_free(struct bpf_mprog_entry *entry) { kfree_rcu(tcx_entry(entry), rcu); } static inline struct bpf_mprog_entry * tcx_entry_fetch_or_create(struct net_device *dev, bool ingress, bool *created) { struct bpf_mprog_entry *entry = tcx_entry_fetch(dev, ingress); *created = false; if (!entry) { entry = tcx_entry_create(); if (!entry) return NULL; *created = true; } return entry; } static inline void tcx_skeys_inc(bool ingress) { tcx_inc(); if (ingress) net_inc_ingress_queue(); else net_inc_egress_queue(); } static inline void tcx_skeys_dec(bool ingress) { if (ingress) net_dec_ingress_queue(); else net_dec_egress_queue(); tcx_dec(); } static inline void tcx_miniq_inc(struct bpf_mprog_entry *entry) { ASSERT_RTNL(); tcx_entry(entry)->miniq_active++; } static inline void tcx_miniq_dec(struct bpf_mprog_entry *entry) { ASSERT_RTNL(); tcx_entry(entry)->miniq_active--; } static inline bool tcx_entry_is_active(struct bpf_mprog_entry *entry) { ASSERT_RTNL(); return bpf_mprog_total(entry) || tcx_entry(entry)->miniq_active; } static inline enum tcx_action_base tcx_action_code(struct sk_buff *skb, int code) { switch (code) { case TCX_PASS: skb->tc_index = qdisc_skb_cb(skb)->tc_classid; fallthrough; case TCX_DROP: case TCX_REDIRECT: return code; case TCX_NEXT: default: return TCX_NEXT; } } #endif /* CONFIG_NET_XGRESS */ #if defined(CONFIG_NET_XGRESS) && defined(CONFIG_BPF_SYSCALL) int tcx_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); int tcx_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int tcx_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog); void tcx_uninstall(struct net_device *dev, bool ingress); int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); static inline void dev_tcx_uninstall(struct net_device *dev) { ASSERT_RTNL(); tcx_uninstall(dev, true); tcx_uninstall(dev, false); } #else static inline int tcx_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EINVAL; } static inline int tcx_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EINVAL; } static inline int tcx_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EINVAL; } static inline int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { return -EINVAL; } static inline void dev_tcx_uninstall(struct net_device *dev) { } #endif /* CONFIG_NET_XGRESS && CONFIG_BPF_SYSCALL */ #endif /* __NET_TCX_H */ |
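The per-device entries declared in this header back the BPF_TCX_INGRESS/BPF_TCX_EGRESS attach types. As a rough user-space sketch of reaching this path (assuming a libbpf release that provides bpf_program__attach_tcx(), v1.3 or later; the object, program and interface names are invented for illustration):

/* tcx_attach.c -- illustrative loader, not part of this header */
#include <net/if.h>
#include <bpf/libbpf.h>

int attach_tcx_ingress(const char *obj_path, const char *ifname)
{
	struct bpf_object *obj = bpf_object__open_file(obj_path, NULL);
	struct bpf_program *prog;
	struct bpf_link *link;
	int ifindex = if_nametoindex(ifname);

	if (!obj || !ifindex || bpf_object__load(obj))
		return -1;

	/* The program is expected to carry SEC("tcx/ingress") so libbpf
	 * selects BPF_TCX_INGRESS as the attach type. */
	prog = bpf_object__find_program_by_name(obj, "tcx_prog");
	if (!prog)
		return -1;

	/* NULL opts: default mprog placement; bpf_tcx_opts could instead
	 * request BPF_F_BEFORE/BPF_F_AFTER relative ordering. */
	link = bpf_program__attach_tcx(prog, ifindex, NULL);
	return link ? 0 : -1;
}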
// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ #include <linux/bpf.h> #include <linux/bpf_verifier.h> #include <linux/filter.h> #include <linux/bitmap.h> #define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args) /* for any branch, call, exit record the history of jmps in the given state */ int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, int insn_flags, u64 linked_regs) { u32 cnt = cur->jmp_history_cnt; struct bpf_jmp_history_entry *p; size_t alloc_size; /* combine instruction flags if we already recorded this instruction */ if (env->cur_hist_ent) { /* atomic instructions push insn_flags twice, for READ and * WRITE sides, but they should agree on stack slot */ verifier_bug_if((env->cur_hist_ent->flags & insn_flags) && (env->cur_hist_ent->flags & insn_flags) != insn_flags, env, "insn history: insn_idx %d cur flags %x new flags %x", env->insn_idx, env->cur_hist_ent->flags, insn_flags); env->cur_hist_ent->flags |= insn_flags; verifier_bug_if(env->cur_hist_ent->linked_regs != 0, env, "insn history: insn_idx %d linked_regs: %#llx", env->insn_idx, env->cur_hist_ent->linked_regs); env->cur_hist_ent->linked_regs = linked_regs; return 0; } cnt++; alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p))); p = krealloc(cur->jmp_history, alloc_size, GFP_KERNEL_ACCOUNT); if (!p) return -ENOMEM; cur->jmp_history = p; p = &cur->jmp_history[cnt - 1]; p->idx = env->insn_idx; p->prev_idx = env->prev_insn_idx; p->flags = insn_flags; p->linked_regs = linked_regs; cur->jmp_history_cnt = cnt; env->cur_hist_ent = p; return 0; } static bool is_atomic_load_insn(const struct bpf_insn *insn) { return BPF_CLASS(insn->code) == BPF_STX && BPF_MODE(insn->code) == BPF_ATOMIC && insn->imm == BPF_LOAD_ACQ; } static bool is_atomic_fetch_insn(const struct bpf_insn *insn) { return BPF_CLASS(insn->code) == BPF_STX && BPF_MODE(insn->code) == BPF_ATOMIC && (insn->imm & BPF_FETCH); } static int insn_stack_access_spi(int insn_flags) { return (insn_flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK; } static int insn_stack_access_frameno(int insn_flags) { return insn_flags & INSN_F_FRAMENO_MASK; } /* Backtrack one insn at a time. If idx is not at the top of recorded * history then previous instruction came from straight line execution. * Return -ENOENT if we exhausted all instructions within given state. * * It's legal to have a bit of a looping with the same starting and ending * insn index within the same state, e.g.: 3->4->5->3, so just because current * instruction index is the same as state's first_idx doesn't mean we are * done. If there is still some jump history left, we should keep going. We * need to take into account that we might have a jump history between given * state's parent and itself, due to checkpointing. In this case, we'll have * history entry recording a jump from last instruction of parent state and * first instruction of given state.
*/ static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, u32 *history) { u32 cnt = *history; if (i == st->first_insn_idx) { if (cnt == 0) return -ENOENT; if (cnt == 1 && st->jmp_history[0].idx == i) return -ENOENT; } if (cnt && st->jmp_history[cnt - 1].idx == i) { i = st->jmp_history[cnt - 1].prev_idx; (*history)--; } else { i--; } return i; } static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st, u32 hist_end, int insn_idx) { if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx) return &st->jmp_history[hist_end - 1]; return NULL; } static inline void bt_init(struct backtrack_state *bt, u32 frame) { bt->frame = frame; } static inline void bt_reset(struct backtrack_state *bt) { struct bpf_verifier_env *env = bt->env; memset(bt, 0, sizeof(*bt)); bt->env = env; } static inline u32 bt_empty(struct backtrack_state *bt) { u64 mask = 0; int i; for (i = 0; i <= bt->frame; i++) mask |= bt->reg_masks[i] | bt->stack_masks[i]; return mask == 0; } static inline int bt_subprog_enter(struct backtrack_state *bt) { if (bt->frame == MAX_CALL_FRAMES - 1) { verifier_bug(bt->env, "subprog enter from frame %d", bt->frame); return -EFAULT; } bt->frame++; return 0; } static inline int bt_subprog_exit(struct backtrack_state *bt) { if (bt->frame == 0) { verifier_bug(bt->env, "subprog exit from frame 0"); return -EFAULT; } bt->frame--; return 0; } static inline void bt_clear_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg) { bt->reg_masks[frame] &= ~(1 << reg); } static inline void bt_set_reg(struct backtrack_state *bt, u32 reg) { bpf_bt_set_frame_reg(bt, bt->frame, reg); } static inline void bt_clear_reg(struct backtrack_state *bt, u32 reg) { bt_clear_frame_reg(bt, bt->frame, reg); } static inline void bt_clear_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot) { bt->stack_masks[frame] &= ~(1ull << slot); } static inline u32 bt_frame_reg_mask(struct backtrack_state *bt, u32 frame) { return bt->reg_masks[frame]; } static inline u32 bt_reg_mask(struct backtrack_state *bt) { return bt->reg_masks[bt->frame]; } static inline u64 bt_frame_stack_mask(struct backtrack_state *bt, u32 frame) { return bt->stack_masks[frame]; } static inline u64 bt_stack_mask(struct backtrack_state *bt) { return bt->stack_masks[bt->frame]; } static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg) { return bt->reg_masks[bt->frame] & (1 << reg); } /* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask */ static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask) { DECLARE_BITMAP(mask, 64); bool first = true; int i, n; buf[0] = '\0'; bitmap_from_u64(mask, reg_mask); for_each_set_bit(i, mask, 32) { n = snprintf(buf, buf_sz, "%sr%d", first ? "" : ",", i); first = false; buf += n; buf_sz -= n; if (buf_sz < 0) break; } } /* format stack slots bitmask, e.g., "-8,-24,-40" for 0x15 mask */ void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) { DECLARE_BITMAP(mask, 64); bool first = true; int i, n; buf[0] = '\0'; bitmap_from_u64(mask, stack_mask); for_each_set_bit(i, mask, 64) { n = snprintf(buf, buf_sz, "%s%d", first ? "" : ",", -(i + 1) * 8); first = false; buf += n; buf_sz -= n; if (buf_sz < 0) break; } } /* For given verifier state backtrack_insn() is called from the last insn to * the first insn. Its purpose is to compute a bitmask of registers and * stack slots that needs precision in the parent verifier state. 
* * @idx is an index of the instruction we are currently processing; * @subseq_idx is an index of the subsequent instruction that: * - *would be* executed next, if jump history is viewed in forward order; * - *was* processed previously during backtracking. */ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, struct bpf_jmp_history_entry *hist, struct backtrack_state *bt) { struct bpf_insn *insn = env->prog->insnsi + idx; u8 class = BPF_CLASS(insn->code); u8 opcode = BPF_OP(insn->code); u8 mode = BPF_MODE(insn->code); u32 dreg = insn->dst_reg; u32 sreg = insn->src_reg; u32 spi, i, fr; if (insn->code == 0) return 0; if (env->log.level & BPF_LOG_LEVEL2) { fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_reg_mask(bt)); verbose(env, "mark_precise: frame%d: regs=%s ", bt->frame, env->tmp_str_buf); bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt)); verbose(env, "stack=%s before ", env->tmp_str_buf); verbose(env, "%d: ", idx); bpf_verbose_insn(env, insn); } /* If there is a history record that some registers gained range at this insn, * propagate precision marks to those registers, so that bt_is_reg_set() * accounts for these registers. */ bpf_bt_sync_linked_regs(bt, hist); if (class == BPF_ALU || class == BPF_ALU64) { if (!bt_is_reg_set(bt, dreg)) return 0; if (opcode == BPF_END || opcode == BPF_NEG) { /* sreg is reserved and unused * dreg still need precision before this insn */ return 0; } else if (opcode == BPF_MOV) { if (BPF_SRC(insn->code) == BPF_X) { /* dreg = sreg or dreg = (s8, s16, s32)sreg * dreg needs precision after this insn * sreg needs precision before this insn */ bt_clear_reg(bt, dreg); if (sreg != BPF_REG_FP) bt_set_reg(bt, sreg); } else { /* dreg = K * dreg needs precision after this insn. * Corresponding register is already marked * as precise=true in this verifier state. * No further markings in parent are necessary */ bt_clear_reg(bt, dreg); } } else { if (BPF_SRC(insn->code) == BPF_X) { /* dreg += sreg * both dreg and sreg need precision * before this insn */ if (sreg != BPF_REG_FP) bt_set_reg(bt, sreg); } /* else dreg += K * dreg still needs precision before this insn */ } } else if (class == BPF_LDX || is_atomic_load_insn(insn) || is_atomic_fetch_insn(insn)) { u32 load_reg = dreg; /* * Atomic fetch operation writes the old value into * a register (sreg or r0) and if it was tracked for * precision, propagate to the stack slot like we do * in regular ldx. */ if (is_atomic_fetch_insn(insn)) load_reg = insn->imm == BPF_CMPXCHG ? BPF_REG_0 : sreg; if (!bt_is_reg_set(bt, load_reg)) return 0; bt_clear_reg(bt, load_reg); /* scalars can only be spilled into stack w/o losing precision. * Load from any other memory can be zero extended. * The desire to keep that precision is already indicated * by 'precise' mark in corresponding register of this state. * No further tracking necessary. */ if (!hist || !(hist->flags & INSN_F_STACK_ACCESS)) return 0; /* dreg = *(u64 *)[fp - off] was a fill from the stack. * that [fp - off] slot contains scalar that needs to be * tracked with precision */ spi = insn_stack_access_spi(hist->flags); fr = insn_stack_access_frameno(hist->flags); bpf_bt_set_frame_slot(bt, fr, spi); } else if (class == BPF_STX || class == BPF_ST) { if (bt_is_reg_set(bt, dreg)) /* stx & st shouldn't be using _scalar_ dst_reg * to access memory. It means backtracking * encountered a case of pointer subtraction. 
*/ return -ENOTSUPP; /* scalars can only be spilled into stack */ if (!hist || !(hist->flags & INSN_F_STACK_ACCESS)) return 0; spi = insn_stack_access_spi(hist->flags); fr = insn_stack_access_frameno(hist->flags); if (!bt_is_frame_slot_set(bt, fr, spi)) return 0; bt_clear_frame_slot(bt, fr, spi); if (class == BPF_STX) bt_set_reg(bt, sreg); } else if (class == BPF_JMP || class == BPF_JMP32) { if (bpf_pseudo_call(insn)) { int subprog_insn_idx, subprog; subprog_insn_idx = idx + insn->imm + 1; subprog = bpf_find_subprog(env, subprog_insn_idx); if (subprog < 0) return -EFAULT; if (bpf_subprog_is_global(env, subprog)) { /* check that jump history doesn't have any * extra instructions from subprog; the next * instruction after call to global subprog * should be literally next instruction in * caller program */ verifier_bug_if(idx + 1 != subseq_idx, env, "extra insn from subprog"); /* r1-r5 are invalidated after subprog call, * so for global func call it shouldn't be set * anymore */ if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { verifier_bug(env, "global subprog unexpected regs %x", bt_reg_mask(bt)); return -EFAULT; } /* global subprog always sets R0 */ bt_clear_reg(bt, BPF_REG_0); return 0; } else { /* static subprog call instruction, which * means that we are exiting current subprog, * so only r1-r5 could be still requested as * precise, r0 and r6-r10 or any stack slot in * the current frame should be zero by now */ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) { verifier_bug(env, "static subprog unexpected regs %x", bt_reg_mask(bt)); return -EFAULT; } /* we are now tracking register spills correctly, * so any instance of leftover slots is a bug */ if (bt_stack_mask(bt) != 0) { verifier_bug(env, "static subprog leftover stack slots %llx", bt_stack_mask(bt)); return -EFAULT; } /* propagate r1-r5 to the caller */ for (i = BPF_REG_1; i <= BPF_REG_5; i++) { if (bt_is_reg_set(bt, i)) { bt_clear_reg(bt, i); bpf_bt_set_frame_reg(bt, bt->frame - 1, i); } } if (bt_subprog_exit(bt)) return -EFAULT; return 0; } } else if (bpf_is_sync_callback_calling_insn(insn) && idx != subseq_idx - 1) { /* exit from callback subprog to callback-calling helper or * kfunc call. Use idx/subseq_idx check to discern it from * straight line code backtracking. * Unlike the subprog call handling above, we shouldn't * propagate precision of r1-r5 (if any requested), as they are * not actually arguments passed directly to callback subprogs */ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) { verifier_bug(env, "callback unexpected regs %x", bt_reg_mask(bt)); return -EFAULT; } if (bt_stack_mask(bt) != 0) { verifier_bug(env, "callback leftover stack slots %llx", bt_stack_mask(bt)); return -EFAULT; } /* clear r1-r5 in callback subprog's mask */ for (i = BPF_REG_1; i <= BPF_REG_5; i++) bt_clear_reg(bt, i); if (bt_subprog_exit(bt)) return -EFAULT; return 0; } else if (opcode == BPF_CALL) { /* kfunc with imm==0 is invalid and fixup_kfunc_call will * catch this error later. Make backtracking conservative * with ENOTSUPP. */ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0) return -ENOTSUPP; /* regular helper call sets R0 */ bt_clear_reg(bt, BPF_REG_0); if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { /* if backtracking was looking for registers R1-R5 * they should have been found already. 
*/ verifier_bug(env, "backtracking call unexpected regs %x", bt_reg_mask(bt)); return -EFAULT; } if (insn->src_reg == BPF_REG_0 && insn->imm == BPF_FUNC_tail_call && subseq_idx - idx != 1) { if (bt_subprog_enter(bt)) return -EFAULT; } } else if (opcode == BPF_EXIT) { bool r0_precise; /* Backtracking to a nested function call, 'idx' is a part of * the inner frame 'subseq_idx' is a part of the outer frame. * In case of a regular function call, instructions giving * precision to registers R1-R5 should have been found already. * In case of a callback, it is ok to have R1-R5 marked for * backtracking, as these registers are set by the function * invoking callback. */ if (subseq_idx >= 0 && bpf_calls_callback(env, subseq_idx)) for (i = BPF_REG_1; i <= BPF_REG_5; i++) bt_clear_reg(bt, i); if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { verifier_bug(env, "backtracking exit unexpected regs %x", bt_reg_mask(bt)); return -EFAULT; } /* BPF_EXIT in subprog or callback always returns * right after the call instruction, so by checking * whether the instruction at subseq_idx-1 is subprog * call or not we can distinguish actual exit from * *subprog* from exit from *callback*. In the former * case, we need to propagate r0 precision, if * necessary. In the latter we never do that. */ r0_precise = subseq_idx - 1 >= 0 && bpf_pseudo_call(&env->prog->insnsi[subseq_idx - 1]) && bt_is_reg_set(bt, BPF_REG_0); bt_clear_reg(bt, BPF_REG_0); if (bt_subprog_enter(bt)) return -EFAULT; if (r0_precise) bt_set_reg(bt, BPF_REG_0); /* r6-r9 and stack slots will stay set in caller frame * bitmasks until we return back from callee(s) */ return 0; } else if (BPF_SRC(insn->code) == BPF_X) { if (!bt_is_reg_set(bt, dreg) && !bt_is_reg_set(bt, sreg)) return 0; /* dreg <cond> sreg * Both dreg and sreg need precision before * this insn. If only sreg was marked precise * before it would be equally necessary to * propagate it to dreg. */ if (!hist || !(hist->flags & INSN_F_SRC_REG_STACK)) bt_set_reg(bt, sreg); if (!hist || !(hist->flags & INSN_F_DST_REG_STACK)) bt_set_reg(bt, dreg); } else if (BPF_SRC(insn->code) == BPF_K) { /* dreg <cond> K * Only dreg still needs precision before * this insn, so for the K-based conditional * there is nothing new to be marked. */ } } else if (class == BPF_LD) { if (!bt_is_reg_set(bt, dreg)) return 0; bt_clear_reg(bt, dreg); /* It's ld_imm64 or ld_abs or ld_ind. * For ld_imm64 no further tracking of precision * into parent is necessary */ if (mode == BPF_IND || mode == BPF_ABS) /* to be analyzed */ return -ENOTSUPP; } /* Propagate precision marks to linked registers, to account for * registers marked as precise in this function. */ bpf_bt_sync_linked_regs(bt, hist); return 0; } /* the scalar precision tracking algorithm: * . at the start all registers have precise=false. * . scalar ranges are tracked as normal through alu and jmp insns. * . once precise value of the scalar register is used in: * . ptr + scalar alu * . if (scalar cond K|scalar) * . helper_call(.., scalar, ...) where ARG_CONST is expected * backtrack through the verifier states and mark all registers and * stack slots with spilled constants that these scalar registers * should be precise. * . during state pruning two registers (or spilled stack slots) * are equivalent if both are not precise. * * Note the verifier cannot simply walk register parentage chain, * since many different registers and stack slots could have been * used to compute single precise scalar.
* * The approach of starting with precise=true for all registers and then * backtrack to mark a register as not precise when the verifier detects * that program doesn't care about specific value (e.g., when helper * takes register as ARG_ANYTHING parameter) is not safe. * * It's ok to walk single parentage chain of the verifier states. * It's possible that this backtracking will go all the way till 1st insn. * All other branches will be explored for needing precision later. * * The backtracking needs to deal with cases like: * R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0) * r9 -= r8 * r5 = r9 * if r5 > 0x79f goto pc+7 * R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff)) * r5 += 1 * ... * call bpf_perf_event_output#25 * where .arg5_type = ARG_CONST_SIZE_OR_ZERO * * and this case: * r6 = 1 * call foo // uses callee's r6 inside to compute r0 * r0 += r6 * if r0 == 0 goto * * to track above reg_mask/stack_mask needs to be independent for each frame. * * Also if parent's curframe > frame where backtracking started, * the verifier need to mark registers in both frames, otherwise callees * may incorrectly prune callers. This is similar to * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences") * * For now backtracking falls back into conservative marking. */ void bpf_mark_all_scalars_precise(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { struct bpf_func_state *func; struct bpf_reg_state *reg; int i, j; if (env->log.level & BPF_LOG_LEVEL2) { verbose(env, "mark_precise: frame%d: falling back to forcing all scalars precise\n", st->curframe); } /* big hammer: mark all scalars precise in this path. * pop_stack may still get !precise scalars. * We also skip current state and go straight to first parent state, * because precision markings in current non-checkpointed state are * not needed. See why in the comment in __mark_chain_precision below. */ for (st = st->parent; st; st = st->parent) { for (i = 0; i <= st->curframe; i++) { func = st->frame[i]; for (j = 0; j < BPF_REG_FP; j++) { reg = &func->regs[j]; if (reg->type != SCALAR_VALUE || reg->precise) continue; reg->precise = true; if (env->log.level & BPF_LOG_LEVEL2) { verbose(env, "force_precise: frame%d: forcing r%d to be precise\n", i, j); } } for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { if (!bpf_is_spilled_reg(&func->stack[j])) continue; reg = &func->stack[j].spilled_ptr; if (reg->type != SCALAR_VALUE || reg->precise) continue; reg->precise = true; if (env->log.level & BPF_LOG_LEVEL2) { verbose(env, "force_precise: frame%d: forcing fp%d to be precise\n", i, -(j + 1) * 8); } } } } } /* * bpf_mark_chain_precision() backtracks BPF program instruction sequence and * chain of verifier states making sure that register *regno* (if regno >= 0) * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked * SCALARS, as well as any other registers and slots that contribute to * a tracked state of given registers/stack slots, depending on specific BPF * assembly instructions (see backtrack_insns() for exact instruction handling * logic). This backtracking relies on recorded jmp_history and is able to * traverse entire chain of parent states. This process ends only when all the * necessary registers/slots and their transitive dependencies are marked as * precise. * * One important and subtle aspect is that precise marks *do not matter* in * the currently verified state (current state). 
It is important to understand * why this is the case. * * First, note that current state is the state that is not yet "checkpointed", * i.e., it is not yet put into env->explored_states, and it has no children * states as well. It's ephemeral, and can end up either a) being discarded if * compatible explored state is found at some point or BPF_EXIT instruction is * reached or b) checkpointed and put into env->explored_states, branching out * into one or more children states. * * In the former case, precise markings in current state are completely * ignored by state comparison code (see regsafe() for details). Only * checkpointed ("old") state precise markings are important, and if old * state's register/slot is precise, regsafe() assumes current state's * register/slot as precise and checks value ranges exactly and precisely. If * states turn out to be compatible, current state's necessary precise * markings and any required parent states' precise markings are enforced * after the fact with propagate_precision() logic, after the fact. But it's * important to realize that in this case, even after marking current state * registers/slots as precise, we immediately discard current state. So what * actually matters is any of the precise markings propagated into current * state's parent states, which are always checkpointed (due to b) case above). * As such, for scenario a) it doesn't matter if current state has precise * markings set or not. * * Now, for the scenario b), checkpointing and forking into child(ren) * state(s). Note that before current state gets to checkpointing step, any * processed instruction always assumes precise SCALAR register/slot * knowledge: if precise value or range is useful to prune jump branch, BPF * verifier takes this opportunity enthusiastically. Similarly, when * register's value is used to calculate offset or memory address, exact * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to * what we mentioned above about state comparison ignoring precise markings * during state comparison, BPF verifier ignores and also assumes precise * markings *at will* during instruction verification process. But as verifier * assumes precision, it also propagates any precision dependencies across * parent states, which are not yet finalized, so can be further restricted * based on new knowledge gained from restrictions enforced by their children * states. This is so that once those parent states are finalized, i.e., when * they have no more active children state, state comparison logic in * is_state_visited() would enforce strict and precise SCALAR ranges, if * required for correctness. * * To build a bit more intuition, note also that once a state is checkpointed, * the path we took to get to that state is not important. This is crucial * property for state pruning. When state is checkpointed and finalized at * some instruction index, it can be correctly and safely used to "short * circuit" any *compatible* state that reaches exactly the same instruction * index. I.e., if we jumped to that instruction from a completely different * code path than original finalized state was derived from, it doesn't * matter, current state can be discarded because from that instruction * forward having a compatible state will ensure we will safely reach the * exit. States describe preconditions for further exploration, but completely * forget the history of how we got here. 
* * This also means that even if we needed precise SCALAR range to get to * finalized state, but from that point forward *that same* SCALAR register is * never used in a precise context (i.e., it's precise value is not needed for * correctness), it's correct and safe to mark such register as "imprecise" * (i.e., precise marking set to false). This is what we rely on when we do * not set precise marking in current state. If no child state requires * precision for any given SCALAR register, it's safe to dictate that it can * be imprecise. If any child state does require this register to be precise, * we'll mark it precise later retroactively during precise markings * propagation from child state to parent states. * * Skipping precise marking setting in current state is a mild version of * relying on the above observation. But we can utilize this property even * more aggressively by proactively forgetting any precise marking in the * current state (which we inherited from the parent state), right before we * checkpoint it and branch off into new child state. This is done by * mark_all_scalars_imprecise() to hopefully get more permissive and generic * finalized states which help in short circuiting more future states. */ int bpf_mark_chain_precision(struct bpf_verifier_env *env, struct bpf_verifier_state *starting_state, int regno, bool *changed) { struct bpf_verifier_state *st = starting_state; struct backtrack_state *bt = &env->bt; int first_idx = st->first_insn_idx; int last_idx = starting_state->insn_idx; int subseq_idx = -1; struct bpf_func_state *func; bool tmp, skip_first = true; struct bpf_reg_state *reg; int i, fr, err; if (!env->bpf_capable) return 0; changed = changed ?: &tmp; /* set frame number from which we are starting to backtrack */ bt_init(bt, starting_state->curframe); /* Do sanity checks against current state of register and/or stack * slot, but don't set precise flag in current state, as precision * tracking in the current state is unnecessary. 
*/ func = st->frame[bt->frame]; if (regno >= 0) { reg = &func->regs[regno]; if (reg->type != SCALAR_VALUE) { verifier_bug(env, "backtracking misuse"); return -EFAULT; } bt_set_reg(bt, regno); } if (bt_empty(bt)) return 0; for (;;) { DECLARE_BITMAP(mask, 64); u32 history = st->jmp_history_cnt; struct bpf_jmp_history_entry *hist; if (env->log.level & BPF_LOG_LEVEL2) { verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n", bt->frame, last_idx, first_idx, subseq_idx); } if (last_idx < 0) { /* we are at the entry into subprog, which * is expected for global funcs, but only if * requested precise registers are R1-R5 * (which are global func's input arguments) */ if (st->curframe == 0 && st->frame[0]->subprogno > 0 && st->frame[0]->callsite == BPF_MAIN_FUNC && bt_stack_mask(bt) == 0 && (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) == 0) { bitmap_from_u64(mask, bt_reg_mask(bt)); for_each_set_bit(i, mask, 32) { reg = &st->frame[0]->regs[i]; bt_clear_reg(bt, i); if (reg->type == SCALAR_VALUE) { reg->precise = true; *changed = true; } } return 0; } verifier_bug(env, "backtracking func entry subprog %d reg_mask %x stack_mask %llx", st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt)); return -EFAULT; } for (i = last_idx;;) { if (skip_first) { err = 0; skip_first = false; } else { hist = get_jmp_hist_entry(st, history, i); err = backtrack_insn(env, i, subseq_idx, hist, bt); } if (err == -ENOTSUPP) { bpf_mark_all_scalars_precise(env, starting_state); bt_reset(bt); return 0; } else if (err) { return err; } if (bt_empty(bt)) /* Found assignment(s) into tracked register in this state. * Since this state is already marked, just return. * Nothing to be tracked further in the parent state. */ return 0; subseq_idx = i; i = get_prev_insn_idx(st, i, &history); if (i == -ENOENT) break; if (i >= env->prog->len) { /* This can happen if backtracking reached insn 0 * and there are still reg_mask or stack_mask * to backtrack. * It means the backtracking missed the spot where * particular register was initialized with a constant. 
*/ verifier_bug(env, "backtracking idx %d", i); return -EFAULT; } } st = st->parent; if (!st) break; for (fr = bt->frame; fr >= 0; fr--) { func = st->frame[fr]; bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr)); for_each_set_bit(i, mask, 32) { reg = &func->regs[i]; if (reg->type != SCALAR_VALUE) { bt_clear_frame_reg(bt, fr, i); continue; } if (reg->precise) { bt_clear_frame_reg(bt, fr, i); } else { reg->precise = true; *changed = true; } } bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr)); for_each_set_bit(i, mask, 64) { if (verifier_bug_if(i >= func->allocated_stack / BPF_REG_SIZE, env, "stack slot %d, total slots %d", i, func->allocated_stack / BPF_REG_SIZE)) return -EFAULT; if (!bpf_is_spilled_scalar_reg(&func->stack[i])) { bt_clear_frame_slot(bt, fr, i); continue; } reg = &func->stack[i].spilled_ptr; if (reg->precise) { bt_clear_frame_slot(bt, fr, i); } else { reg->precise = true; *changed = true; } } if (env->log.level & BPF_LOG_LEVEL2) { fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_frame_reg_mask(bt, fr)); verbose(env, "mark_precise: frame%d: parent state regs=%s ", fr, env->tmp_str_buf); bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_frame_stack_mask(bt, fr)); verbose(env, "stack=%s: ", env->tmp_str_buf); print_verifier_state(env, st, fr, true); } } if (bt_empty(bt)) return 0; subseq_idx = first_idx; last_idx = st->last_insn_idx; first_idx = st->first_insn_idx; } /* if we still have requested precise regs or slots, we missed * something (e.g., stack access through non-r10 register), so * fallback to marking all precise */ if (!bt_empty(bt)) { bpf_mark_all_scalars_precise(env, starting_state); bt_reset(bt); } return 0; } |
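The comments above describe when precision marks are requested: a scalar used in pointer arithmetic, in a conditional jump, or as a helper argument typed ARG_CONST_SIZE(_OR_ZERO). As a hedged illustration of the last case, the BPF-C sketch below (helper choice, hook point and names are only an example) passes a range-limited scalar as a size argument, which is the situation that makes the verifier invoke bpf_mark_chain_precision() and backtrack the instructions that produced the scalar:

/* precise_len.bpf.c -- sketch of a pattern that triggers precision backtracking */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

SEC("kprobe/do_nanosleep")
int probe(struct pt_regs *ctx)
{
	char buf[64];
	__u32 len = bpf_get_prandom_u32();

	len &= sizeof(buf) - 1;	/* range-limit the scalar to [0, 63] */

	/* 'len' feeds ARG_CONST_SIZE_OR_ZERO, so the verifier backtracks to
	 * prove that the bounded range above is what actually reaches this
	 * call, marking the scalar's whole producer chain as precise rather
	 * than relying on imprecise ranges during state pruning. */
	bpf_probe_read_kernel(buf, len, (const void *)ctx);
	return 0;
}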
// SPDX-License-Identifier: GPL-2.0
/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                    Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *                    Nauman Rafique <nauman@google.com>
 *
 * For policy-specific per-blkcg data:
 * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
 *                    Arianna Avanzini <avanzini.arianna@gmail.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/sched/signal.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/wait_bit.h>
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/part_stat.h>
#include "blk.h"
#include "blk-cgroup.h"
#include "blk-ioprio.h"
#include "blk-throttle.h"

static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);

/*
 * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
 * blkcg_pol_register_mutex nests outside of it and synchronizes entire
 * policy [un]register operations including cgroup file additions /
 * removals. Putting cgroup file registration outside blkcg_pol_mutex
 * allows grabbing it from cgroup callbacks.
 */
static DEFINE_MUTEX(blkcg_pol_register_mutex);
static DEFINE_MUTEX(blkcg_pol_mutex);

struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);

struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
EXPORT_SYMBOL_GPL(blkcg_root_css);

static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];

static LIST_HEAD(all_blkcgs);	/* protected by blkcg_pol_mutex */

bool blkcg_debug_stats = false;

static DEFINE_RAW_SPINLOCK(blkg_stat_lock);

#define BLKG_DESTROY_BATCH_SIZE 64

/*
 * Lockless lists for tracking IO stats update
 *
 * New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg).
 * There are multiple blkg's (one for each block device) attached to each
 * blkcg. The rstat code keeps track of which cpu has IO stats updated,
 * but it doesn't know which blkg has the updated stats. If there are many
 * block devices in a system, the cost of iterating all the blkg's to flush
 * out the IO stats can be high. To reduce such overhead, a set of percpu
 * lockless lists (lhead) per blkcg are used to track the set of recently
 * updated iostat_cpu's since the last flush. An iostat_cpu will be put
 * onto the lockless list on the update side [blk_cgroup_bio_start()] if
 * not there yet and then removed when being flushed [blkcg_rstat_flush()].
 * References to blkg are gotten and then put back in the process to
 * protect against blkg removal.
 *
 * Return: 0 if successful or -ENOMEM if allocation fails.
 */
static int init_blkcg_llists(struct blkcg *blkcg)
{
    int cpu;

    blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL);
    if (!blkcg->lhead)
        return -ENOMEM;

    for_each_possible_cpu(cpu)
        init_llist_head(per_cpu_ptr(blkcg->lhead, cpu));
    return 0;
}

/**
 * blkcg_css - find the current css
 *
 * Find the css associated with either the kthread or the current task.
 * This may return a dying css, so it is up to the caller to use tryget logic
 * to confirm it is alive and well.
*/ static struct cgroup_subsys_state *blkcg_css(void) { struct cgroup_subsys_state *css; css = kthread_blkcg(); if (css) return css; return task_css(current, io_cgrp_id); } static void blkg_free_workfn(struct work_struct *work) { struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, free_work); struct request_queue *q = blkg->q; int i; /* * pd_free_fn() can also be called from blkcg_deactivate_policy(), * in order to make sure pd_free_fn() is called in order, the deletion * of the list blkg->q_node is delayed to here from blkg_destroy(), and * blkcg_mutex is used to synchronize blkg_free_workfn() and * blkcg_deactivate_policy(). */ mutex_lock(&q->blkcg_mutex); for (i = 0; i < BLKCG_MAX_POLS; i++) if (blkg->pd[i]) blkcg_policy[i]->pd_free_fn(blkg->pd[i]); if (blkg->parent) blkg_put(blkg->parent); spin_lock_irq(&q->queue_lock); list_del_init(&blkg->q_node); spin_unlock_irq(&q->queue_lock); mutex_unlock(&q->blkcg_mutex); blk_put_queue(q); free_percpu(blkg->iostat_cpu); percpu_ref_exit(&blkg->refcnt); kfree(blkg); } /** * blkg_free - free a blkg * @blkg: blkg to free * * Free @blkg which may be partially allocated. */ static void blkg_free(struct blkcg_gq *blkg) { if (!blkg) return; /* * Both ->pd_free_fn() and request queue's release handler may * sleep, so free us by scheduling one work func */ INIT_WORK(&blkg->free_work, blkg_free_workfn); schedule_work(&blkg->free_work); } static void __blkg_release(struct rcu_head *rcu) { struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); struct blkcg *blkcg = blkg->blkcg; int cpu; #ifdef CONFIG_BLK_CGROUP_PUNT_BIO WARN_ON(!bio_list_empty(&blkg->async_bios)); #endif /* * Flush all the non-empty percpu lockless lists before releasing * us, given these stat belongs to us. * * blkg_stat_lock is for serializing blkg stat update */ for_each_possible_cpu(cpu) __blkcg_rstat_flush(blkcg, cpu); /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); blkg_free(blkg); } /* * A group is RCU protected, but having an rcu lock does not mean that one * can access all the fields of blkg and assume these are valid. For * example, don't try to follow throtl_data and request queue links. * * Having a reference to blkg under an rcu allows accesses to only values * local to groups like group stats and group rate limits. */ static void blkg_release(struct percpu_ref *ref) { struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt); call_rcu(&blkg->rcu_head, __blkg_release); } #ifdef CONFIG_BLK_CGROUP_PUNT_BIO static struct workqueue_struct *blkcg_punt_bio_wq; static void blkg_async_bio_workfn(struct work_struct *work) { struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, async_bio_work); struct bio_list bios = BIO_EMPTY_LIST; struct bio *bio; struct blk_plug plug; bool need_plug = false; /* as long as there are pending bios, @blkg can't go away */ spin_lock(&blkg->async_bio_lock); bio_list_merge_init(&bios, &blkg->async_bios); spin_unlock(&blkg->async_bio_lock); /* start plug only when bio_list contains at least 2 bios */ if (bios.head && bios.head->bi_next) { need_plug = true; blk_start_plug(&plug); } while ((bio = bio_list_pop(&bios))) submit_bio(bio); if (need_plug) blk_finish_plug(&plug); } /* * When a shared kthread issues a bio for a cgroup, doing so synchronously can * lead to priority inversions as the kthread can be trapped waiting for that * cgroup. Use this helper instead of submit_bio to punt the actual issuing to * a dedicated per-blkcg work item to avoid such priority inversions. 
*/ void blkcg_punt_bio_submit(struct bio *bio) { struct blkcg_gq *blkg = bio->bi_blkg; if (blkg->parent) { spin_lock(&blkg->async_bio_lock); bio_list_add(&blkg->async_bios, bio); spin_unlock(&blkg->async_bio_lock); queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work); } else { /* never bounce for the root cgroup */ submit_bio(bio); } } EXPORT_SYMBOL_GPL(blkcg_punt_bio_submit); static int __init blkcg_punt_bio_init(void) { blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND | WQ_SYSFS, 0); if (!blkcg_punt_bio_wq) return -ENOMEM; return 0; } subsys_initcall(blkcg_punt_bio_init); #endif /* CONFIG_BLK_CGROUP_PUNT_BIO */ /** * bio_blkcg_css - return the blkcg CSS associated with a bio * @bio: target bio * * This returns the CSS for the blkcg associated with a bio, or %NULL if not * associated. Callers are expected to either handle %NULL or know association * has been done prior to calling this. */ struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio) { if (!bio || !bio->bi_blkg) return NULL; return &bio->bi_blkg->blkcg->css; } EXPORT_SYMBOL_GPL(bio_blkcg_css); /** * blkcg_parent - get the parent of a blkcg * @blkcg: blkcg of interest * * Return the parent blkcg of @blkcg. Can be called anytime. */ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) { return css_to_blkcg(blkcg->css.parent); } /** * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with * @disk: gendisk the new blkg is associated with * @gfp_mask: allocation mask to use * * Allocate a new blkg associating @blkcg and @disk. */ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, gfp_t gfp_mask) { struct blkcg_gq *blkg; int i, cpu; /* alloc and init base part */ blkg = kzalloc_node(sizeof(*blkg), gfp_mask, disk->queue->node); if (!blkg) return NULL; if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask)) goto out_free_blkg; blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask); if (!blkg->iostat_cpu) goto out_exit_refcnt; if (!blk_get_queue(disk->queue)) goto out_free_iostat; blkg->q = disk->queue; INIT_LIST_HEAD(&blkg->q_node); blkg->blkcg = blkcg; blkg->iostat.blkg = blkg; #ifdef CONFIG_BLK_CGROUP_PUNT_BIO spin_lock_init(&blkg->async_bio_lock); bio_list_init(&blkg->async_bios); INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn); #endif u64_stats_init(&blkg->iostat.sync); for_each_possible_cpu(cpu) { u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync); per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg; } for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkg_policy_data *pd; if (!blkcg_policy_enabled(disk->queue, pol)) continue; /* alloc per-policy data and attach it to blkg */ pd = pol->pd_alloc_fn(disk, blkcg, gfp_mask); if (!pd) goto out_free_pds; blkg->pd[i] = pd; pd->blkg = blkg; pd->plid = i; pd->online = false; } return blkg; out_free_pds: while (--i >= 0) if (blkg->pd[i]) blkcg_policy[i]->pd_free_fn(blkg->pd[i]); blk_put_queue(disk->queue); out_free_iostat: free_percpu(blkg->iostat_cpu); out_exit_refcnt: percpu_ref_exit(&blkg->refcnt); out_free_blkg: kfree(blkg); return NULL; } /* * If @new_blkg is %NULL, this function tries to allocate a new one as * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. 
*/ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; int i, ret; lockdep_assert_held(&disk->queue->queue_lock); /* request_queue is dying, do not create/recreate a blkg */ if (blk_queue_dying(disk->queue)) { ret = -ENODEV; goto err_free_blkg; } /* blkg holds a reference to blkcg */ if (!css_tryget_online(&blkcg->css)) { ret = -ENODEV; goto err_free_blkg; } /* allocate */ if (!new_blkg) { new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto err_put_css; } } blkg = new_blkg; /* link parent */ if (blkcg_parent(blkcg)) { blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue); if (WARN_ON_ONCE(!blkg->parent)) { ret = -ENODEV; goto err_put_css; } blkg_get(blkg->parent); } /* invoke per-policy init */ for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_init_fn) pol->pd_init_fn(blkg->pd[i]); } /* insert */ spin_lock(&blkcg->lock); ret = radix_tree_insert(&blkcg->blkg_tree, disk->queue->id, blkg); if (likely(!ret)) { hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); list_add(&blkg->q_node, &disk->queue->blkg_list); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i]) { if (pol->pd_online_fn) pol->pd_online_fn(blkg->pd[i]); blkg->pd[i]->online = true; } } } blkg->online = true; spin_unlock(&blkcg->lock); if (!ret) return blkg; /* @blkg failed fully initialized, use the usual release path */ blkg_put(blkg); return ERR_PTR(ret); err_put_css: css_put(&blkcg->css); err_free_blkg: if (new_blkg) blkg_free(new_blkg); return ERR_PTR(ret); } /** * blkg_lookup_create - lookup blkg, try to create one if not there * @blkcg: blkcg of interest * @disk: gendisk of interest * * Lookup blkg for the @blkcg - @disk pair. If it doesn't exist, try to * create one. blkg creation is performed recursively from blkcg_root such * that all non-root blkg's have access to the parent blkg. This function * should be called under RCU read lock and takes @disk->queue->queue_lock. * * Returns the blkg or the closest blkg if blkg_create() fails as it walks * down from root. */ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct gendisk *disk) { struct request_queue *q = disk->queue; struct blkcg_gq *blkg; unsigned long flags; WARN_ON_ONCE(!rcu_read_lock_held()); blkg = blkg_lookup(blkcg, q); if (blkg) return blkg; spin_lock_irqsave(&q->queue_lock, flags); blkg = blkg_lookup(blkcg, q); if (blkg) { if (blkcg != &blkcg_root && blkg != rcu_dereference(blkcg->blkg_hint)) rcu_assign_pointer(blkcg->blkg_hint, blkg); goto found; } /* * Create blkgs walking down from blkcg_root to @blkcg, so that all * non-root blkgs have access to their parents. Returns the closest * blkg to the intended blkg should blkg_create() fail. 
*/ while (true) { struct blkcg *pos = blkcg; struct blkcg *parent = blkcg_parent(blkcg); struct blkcg_gq *ret_blkg = q->root_blkg; while (parent) { blkg = blkg_lookup(parent, q); if (blkg) { /* remember closest blkg */ ret_blkg = blkg; break; } pos = parent; parent = blkcg_parent(parent); } blkg = blkg_create(pos, disk, NULL); if (IS_ERR(blkg)) { blkg = ret_blkg; break; } if (pos == blkcg) break; } found: spin_unlock_irqrestore(&q->queue_lock, flags); return blkg; } static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; int i; lockdep_assert_held(&blkg->q->queue_lock); lockdep_assert_held(&blkcg->lock); /* * blkg stays on the queue list until blkg_free_workfn(), see details in * blkg_free_workfn(), hence this function can be called from * blkcg_destroy_blkgs() first and again from blkg_destroy_all() before * blkg_free_workfn(). */ if (hlist_unhashed(&blkg->blkcg_node)) return; for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && blkg->pd[i]->online) { blkg->pd[i]->online = false; if (pol->pd_offline_fn) pol->pd_offline_fn(blkg->pd[i]); } } blkg->online = false; radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); hlist_del_init_rcu(&blkg->blkcg_node); /* * Both setting lookup hint to and clearing it from @blkg are done * under queue_lock. If it's not pointing to @blkg now, it never * will. Hint assignment itself can race safely. */ if (rcu_access_pointer(blkcg->blkg_hint) == blkg) rcu_assign_pointer(blkcg->blkg_hint, NULL); /* * Put the reference taken at the time of creation so that when all * queues are gone, group can be destroyed. */ percpu_ref_kill(&blkg->refcnt); } static void blkg_destroy_all(struct gendisk *disk) { struct request_queue *q = disk->queue; struct blkcg_gq *blkg; int count = BLKG_DESTROY_BATCH_SIZE; int i; restart: spin_lock_irq(&q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; if (hlist_unhashed(&blkg->blkcg_node)) continue; spin_lock(&blkcg->lock); blkg_destroy(blkg); spin_unlock(&blkcg->lock); /* * in order to avoid holding the spin lock for too long, release * it when a batch of blkgs are destroyed. 
*/ if (!(--count)) { count = BLKG_DESTROY_BATCH_SIZE; spin_unlock_irq(&q->queue_lock); cond_resched(); goto restart; } } /* * Mark policy deactivated since policy offline has been done, and * the free is scheduled, so future blkcg_deactivate_policy() can * be bypassed */ for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (pol) __clear_bit(pol->plid, q->blkcg_pols); } q->root_blkg = NULL; spin_unlock_irq(&q->queue_lock); wake_up_var(&q->root_blkg); } static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) { int i; for (i = 0; i < BLKG_IOSTAT_NR; i++) { dst->bytes[i] = src->bytes[i]; dst->ios[i] = src->ios[i]; } } static void __blkg_clear_stat(struct blkg_iostat_set *bis) { struct blkg_iostat cur = {0}; unsigned long flags; flags = u64_stats_update_begin_irqsave(&bis->sync); blkg_iostat_set(&bis->cur, &cur); blkg_iostat_set(&bis->last, &cur); u64_stats_update_end_irqrestore(&bis->sync, flags); } static void blkg_clear_stat(struct blkcg_gq *blkg) { int cpu; for_each_possible_cpu(cpu) { struct blkg_iostat_set *s = per_cpu_ptr(blkg->iostat_cpu, cpu); __blkg_clear_stat(s); } __blkg_clear_stat(&blkg->iostat); } static int blkcg_reset_stats(struct cgroup_subsys_state *css, struct cftype *cftype, u64 val) { struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; int i; pr_info_once("blkio.%s is deprecated\n", cftype->name); mutex_lock(&blkcg_pol_mutex); spin_lock_irq(&blkcg->lock); /* * Note that stat reset is racy - it doesn't synchronize against * stat updates. This is a debug feature which shouldn't exist * anyway. If you get hit by a race, retry. */ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { blkg_clear_stat(blkg); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_reset_stats_fn) pol->pd_reset_stats_fn(blkg->pd[i]); } } spin_unlock_irq(&blkcg->lock); mutex_unlock(&blkcg_pol_mutex); return 0; } const char *blkg_dev_name(struct blkcg_gq *blkg) { if (!blkg->q->disk) return NULL; return bdi_dev_name(blkg->q->disk->bdi); } /** * blkcg_print_blkgs - helper for printing per-blkg data * @sf: seq_file to print to * @blkcg: blkcg of interest * @prfill: fill function to print out a blkg * @pol: policy in question * @data: data to be passed to @prfill * @show_total: to print out sum of prfill return values or not * * This function invokes @prfill on each blkg of @blkcg if pd for the * policy specified by @pol exists. @prfill is invoked with @sf, the * policy data and @data and the matching queue lock held. If @show_total * is %true, the sum of the return values from @prfill is printed with * "Total" label at the end. * * This is to be used to construct print functions for * cftype->read_seq_string method. 
*/ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int), const struct blkcg_policy *pol, int data, bool show_total) { struct blkcg_gq *blkg; u64 total = 0; rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { spin_lock_irq(&blkg->q->queue_lock); if (blkcg_policy_enabled(blkg->q, pol)) total += prfill(sf, blkg->pd[pol->plid], data); spin_unlock_irq(&blkg->q->queue_lock); } rcu_read_unlock(); if (show_total) seq_printf(sf, "Total %llu\n", (unsigned long long)total); } EXPORT_SYMBOL_GPL(blkcg_print_blkgs); /** * __blkg_prfill_u64 - prfill helper for a single u64 value * @sf: seq_file to print to * @pd: policy private data of interest * @v: value to print * * Print @v to @sf for the device associated with @pd. */ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) { const char *dname = blkg_dev_name(pd->blkg); if (!dname) return 0; seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v); return v; } EXPORT_SYMBOL_GPL(__blkg_prfill_u64); /** * blkg_conf_init - initialize a blkg_conf_ctx * @ctx: blkg_conf_ctx to initialize * @input: input string * * Initialize @ctx which can be used to parse blkg config input string @input. * Once initialized, @ctx can be used with blkg_conf_open_bdev() and * blkg_conf_prep(), and must be cleaned up with blkg_conf_exit(). */ void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input) { *ctx = (struct blkg_conf_ctx){ .input = input }; } EXPORT_SYMBOL_GPL(blkg_conf_init); /** * blkg_conf_open_bdev - parse and open bdev for per-blkg config update * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * * Parse the device node prefix part, MAJ:MIN, of per-blkg config update from * @ctx->input and get and store the matching bdev in @ctx->bdev. @ctx->body is * set to point past the device node prefix. * * This function may be called multiple times on @ctx and the extra calls become * NOOPs. blkg_conf_prep() implicitly calls this function. Use this function * explicitly if bdev access is needed without resolving the blkcg / policy part * of @ctx->input. Returns -errno on error. */ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx) { char *input = ctx->input; unsigned int major, minor; struct block_device *bdev; int key_len; if (ctx->bdev) return 0; if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) return -EINVAL; input += key_len; if (!isspace(*input)) return -EINVAL; input = skip_spaces(input); bdev = blkdev_get_no_open(MKDEV(major, minor), false); if (!bdev) return -ENODEV; if (bdev_is_partition(bdev)) { blkdev_put_no_open(bdev); return -ENODEV; } mutex_lock(&bdev->bd_queue->rq_qos_mutex); if (!disk_live(bdev->bd_disk)) { blkdev_put_no_open(bdev); mutex_unlock(&bdev->bd_queue->rq_qos_mutex); return -ENODEV; } ctx->body = input; ctx->bdev = bdev; return 0; } /* * Similar to blkg_conf_open_bdev, but additionally freezes the queue, * ensures the correct locking order between freeze queue and q->rq_qos_mutex. * * This function returns negative error on failure. On success it returns * memflags which must be saved and later passed to blkg_conf_exit_frozen * for restoring the memalloc scope. 
*/ unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx) { int ret; unsigned long memflags; if (ctx->bdev) return -EINVAL; ret = blkg_conf_open_bdev(ctx); if (ret < 0) return ret; /* * At this point, we haven’t started protecting anything related to QoS, * so we release q->rq_qos_mutex here, which was first acquired in blkg_ * conf_open_bdev. Later, we re-acquire q->rq_qos_mutex after freezing * the queue to maintain the correct locking order. */ mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex); memflags = blk_mq_freeze_queue(ctx->bdev->bd_queue); mutex_lock(&ctx->bdev->bd_queue->rq_qos_mutex); return memflags; } /** * blkg_conf_prep - parse and prepare for per-blkg config update * @blkcg: target block cgroup * @pol: target policy * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * * Parse per-blkg config update from @ctx->input and initialize @ctx * accordingly. On success, @ctx->body points to the part of @ctx->input * following MAJ:MIN, @ctx->bdev points to the target block device and * @ctx->blkg to the blkg being configured. * * blkg_conf_open_bdev() may be called on @ctx beforehand. On success, this * function returns with queue lock held and must be followed by * blkg_conf_exit(). */ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkg_conf_ctx *ctx) __acquires(&bdev->bd_queue->queue_lock) { struct gendisk *disk; struct request_queue *q; struct blkcg_gq *blkg; int ret; ret = blkg_conf_open_bdev(ctx); if (ret) return ret; disk = ctx->bdev->bd_disk; q = disk->queue; /* Prevent concurrent with blkcg_deactivate_policy() */ mutex_lock(&q->blkcg_mutex); spin_lock_irq(&q->queue_lock); if (!blkcg_policy_enabled(q, pol)) { ret = -EOPNOTSUPP; goto fail_unlock; } blkg = blkg_lookup(blkcg, q); if (blkg) goto success; /* * Create blkgs walking down from blkcg_root to @blkcg, so that all * non-root blkgs have access to their parents. */ while (true) { struct blkcg *pos = blkcg; struct blkcg *parent; struct blkcg_gq *new_blkg; parent = blkcg_parent(blkcg); while (parent && !blkg_lookup(parent, q)) { pos = parent; parent = blkcg_parent(parent); } /* Drop locks to do new blkg allocation with GFP_KERNEL. */ spin_unlock_irq(&q->queue_lock); new_blkg = blkg_alloc(pos, disk, GFP_NOIO); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto fail_exit; } if (radix_tree_preload(GFP_KERNEL)) { blkg_free(new_blkg); ret = -ENOMEM; goto fail_exit; } spin_lock_irq(&q->queue_lock); if (!blkcg_policy_enabled(q, pol)) { blkg_free(new_blkg); ret = -EOPNOTSUPP; goto fail_preloaded; } blkg = blkg_lookup(pos, q); if (blkg) { blkg_free(new_blkg); } else { blkg = blkg_create(pos, disk, new_blkg); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); goto fail_preloaded; } } radix_tree_preload_end(); if (pos == blkcg) goto success; } success: mutex_unlock(&q->blkcg_mutex); ctx->blkg = blkg; return 0; fail_preloaded: radix_tree_preload_end(); fail_unlock: spin_unlock_irq(&q->queue_lock); fail_exit: mutex_unlock(&q->blkcg_mutex); /* * If queue was bypassing, we should retry. Do so after a * short msleep(). It isn't strictly necessary but queue * can be bypassing for some time and it's always nice to * avoid busy looping. */ if (ret == -EBUSY) { msleep(10); ret = restart_syscall(); } return ret; } EXPORT_SYMBOL_GPL(blkg_conf_prep); /** * blkg_conf_exit - clean up per-blkg config update * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * * Clean up after per-blkg config update. This function must be called on all * blkg_conf_ctx's initialized with blkg_conf_init(). 
*/ void blkg_conf_exit(struct blkg_conf_ctx *ctx) __releases(&ctx->bdev->bd_queue->queue_lock) __releases(&ctx->bdev->bd_queue->rq_qos_mutex) { if (ctx->blkg) { spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock); ctx->blkg = NULL; } if (ctx->bdev) { mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex); blkdev_put_no_open(ctx->bdev); ctx->body = NULL; ctx->bdev = NULL; } } EXPORT_SYMBOL_GPL(blkg_conf_exit); /* * Similar to blkg_conf_exit, but also unfreezes the queue. Should be used * when blkg_conf_open_bdev_frozen is used to open the bdev. */ void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags) { if (ctx->bdev) { struct request_queue *q = ctx->bdev->bd_queue; blkg_conf_exit(ctx); blk_mq_unfreeze_queue(q, memflags); } } static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) { int i; for (i = 0; i < BLKG_IOSTAT_NR; i++) { dst->bytes[i] += src->bytes[i]; dst->ios[i] += src->ios[i]; } } static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src) { int i; for (i = 0; i < BLKG_IOSTAT_NR; i++) { dst->bytes[i] -= src->bytes[i]; dst->ios[i] -= src->ios[i]; } } static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur, struct blkg_iostat *last) { struct blkg_iostat delta; unsigned long flags; /* propagate percpu delta to global */ flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); blkg_iostat_set(&delta, cur); blkg_iostat_sub(&delta, last); blkg_iostat_add(&blkg->iostat.cur, &delta); blkg_iostat_add(last, &delta); u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); } static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu) { struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu); struct llist_node *lnode; struct blkg_iostat_set *bisc, *next_bisc; unsigned long flags; rcu_read_lock(); lnode = llist_del_all(lhead); if (!lnode) goto out; /* * For covering concurrent parent blkg update from blkg_release(). * * When flushing from cgroup, the subsystem rstat lock is always held, * so this lock won't cause contention most of time. */ raw_spin_lock_irqsave(&blkg_stat_lock, flags); /* * Iterate only the iostat_cpu's queued in the lockless list. */ llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) { struct blkcg_gq *blkg = bisc->blkg; struct blkcg_gq *parent = blkg->parent; struct blkg_iostat cur; unsigned int seq; /* * Order assignment of `next_bisc` from `bisc->lnode.next` in * llist_for_each_entry_safe and clearing `bisc->lqueued` for * avoiding to assign `next_bisc` with new next pointer added * in blk_cgroup_bio_start() in case of re-ordering. * * The pair barrier is implied in llist_add() in blk_cgroup_bio_start(). */ smp_mb(); WRITE_ONCE(bisc->lqueued, false); if (bisc == &blkg->iostat) goto propagate_up; /* propagate up to parent only */ /* fetch the current per-cpu values */ do { seq = u64_stats_fetch_begin(&bisc->sync); blkg_iostat_set(&cur, &bisc->cur); } while (u64_stats_fetch_retry(&bisc->sync, seq)); blkcg_iostat_update(blkg, &cur, &bisc->last); propagate_up: /* propagate global delta to parent (unless that's root) */ if (parent && parent->parent) { blkcg_iostat_update(parent, &blkg->iostat.cur, &blkg->iostat.last); /* * Queue parent->iostat to its blkcg's lockless * list to propagate up to the grandparent if the * iostat hasn't been queued yet. 
*/ if (!parent->iostat.lqueued) { struct llist_head *plhead; plhead = per_cpu_ptr(parent->blkcg->lhead, cpu); llist_add(&parent->iostat.lnode, plhead); parent->iostat.lqueued = true; } } } raw_spin_unlock_irqrestore(&blkg_stat_lock, flags); out: rcu_read_unlock(); } static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) { /* Root-level stats are sourced from system-wide IO stats */ if (cgroup_parent(css->cgroup)) __blkcg_rstat_flush(css_to_blkcg(css), cpu); } /* * We source root cgroup stats from the system-wide stats to avoid * tracking the same information twice and incurring overhead when no * cgroups are defined. For that reason, css_rstat_flush in * blkcg_print_stat does not actually fill out the iostat in the root * cgroup's blkcg_gq. * * However, we would like to re-use the printing code between the root and * non-root cgroups to the extent possible. For that reason, we simulate * flushing the root cgroup's stats by explicitly filling in the iostat * with disk level statistics. */ static void blkcg_fill_root_iostats(void) { struct class_dev_iter iter; struct device *dev; class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { struct block_device *bdev = dev_to_bdev(dev); struct blkcg_gq *blkg = bdev->bd_disk->queue->root_blkg; struct blkg_iostat tmp; int cpu; unsigned long flags; memset(&tmp, 0, sizeof(tmp)); for_each_possible_cpu(cpu) { struct disk_stats *cpu_dkstats; cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu); tmp.ios[BLKG_IOSTAT_READ] += cpu_dkstats->ios[STAT_READ]; tmp.ios[BLKG_IOSTAT_WRITE] += cpu_dkstats->ios[STAT_WRITE]; tmp.ios[BLKG_IOSTAT_DISCARD] += cpu_dkstats->ios[STAT_DISCARD]; // convert sectors to bytes tmp.bytes[BLKG_IOSTAT_READ] += cpu_dkstats->sectors[STAT_READ] << 9; tmp.bytes[BLKG_IOSTAT_WRITE] += cpu_dkstats->sectors[STAT_WRITE] << 9; tmp.bytes[BLKG_IOSTAT_DISCARD] += cpu_dkstats->sectors[STAT_DISCARD] << 9; } flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); blkg_iostat_set(&blkg->iostat.cur, &tmp); u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); } class_dev_iter_exit(&iter); } static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) { struct blkg_iostat_set *bis = &blkg->iostat; u64 rbytes, wbytes, rios, wios, dbytes, dios; const char *dname; unsigned seq; int i; if (!blkg->online) return; dname = blkg_dev_name(blkg); if (!dname) return; seq_printf(s, "%s ", dname); do { seq = u64_stats_fetch_begin(&bis->sync); rbytes = bis->cur.bytes[BLKG_IOSTAT_READ]; wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE]; dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD]; rios = bis->cur.ios[BLKG_IOSTAT_READ]; wios = bis->cur.ios[BLKG_IOSTAT_WRITE]; dios = bis->cur.ios[BLKG_IOSTAT_DISCARD]; } while (u64_stats_fetch_retry(&bis->sync, seq)); if (rbytes || wbytes || rios || wios) { seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", rbytes, wbytes, rios, wios, dbytes, dios); } if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { seq_printf(s, " use_delay=%d delay_nsec=%llu", atomic_read(&blkg->use_delay), atomic64_read(&blkg->delay_nsec)); } for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (!blkg->pd[i] || !pol->pd_stat_fn) continue; pol->pd_stat_fn(blkg->pd[i], s); } seq_puts(s, "\n"); } static int blkcg_print_stat(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct blkcg_gq *blkg; if (!seq_css(sf)->parent) blkcg_fill_root_iostats(); else css_rstat_flush(&blkcg->css); 
rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { spin_lock_irq(&blkg->q->queue_lock); blkcg_print_one_stat(blkg, sf); spin_unlock_irq(&blkg->q->queue_lock); } rcu_read_unlock(); return 0; } static struct cftype blkcg_files[] = { { .name = "stat", .seq_show = blkcg_print_stat, }, { } /* terminate */ }; static struct cftype blkcg_legacy_files[] = { { .name = "reset_stats", .write_u64 = blkcg_reset_stats, }, { } /* terminate */ }; #ifdef CONFIG_CGROUP_WRITEBACK struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css) { return &css_to_blkcg(css)->cgwb_list; } #endif /* * blkcg destruction is a three-stage process. * * 1. Destruction starts. The blkcg_css_offline() callback is invoked * which offlines writeback. Here we tie the next stage of blkg destruction * to the completion of writeback associated with the blkcg. This lets us * avoid punting potentially large amounts of outstanding writeback to root * while maintaining any ongoing policies. The next stage is triggered when * the nr_cgwbs count goes to zero. * * 2. When the nr_cgwbs count goes to zero, blkcg_destroy_blkgs() is called * and handles the destruction of blkgs. Here the css reference held by * the blkg is put back eventually allowing blkcg_css_free() to be called. * This work may occur in cgwb_release_workfn() on the cgwb_release * workqueue. Any submitted ios that fail to get the blkg ref will be * punted to the root_blkg. * * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called. * This finally frees the blkcg. */ /** * blkcg_destroy_blkgs - responsible for shooting down blkgs * @blkcg: blkcg of interest * * blkgs should be removed while holding both q and blkcg locks. As blkcg lock * is nested inside q lock, this function performs reverse double lock dancing. * Destroying the blkgs releases the reference held on the blkcg's css allowing * blkcg_css_free to eventually be called. * * This is the blkcg counterpart of ioc_release_fn(). */ static void blkcg_destroy_blkgs(struct blkcg *blkcg) { might_sleep(); spin_lock_irq(&blkcg->lock); while (!hlist_empty(&blkcg->blkg_list)) { struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first, struct blkcg_gq, blkcg_node); struct request_queue *q = blkg->q; if (need_resched() || !spin_trylock(&q->queue_lock)) { /* * Given that the system can accumulate a huge number * of blkgs in pathological cases, check to see if we * need to rescheduling to avoid softlockup. */ spin_unlock_irq(&blkcg->lock); cond_resched(); spin_lock_irq(&blkcg->lock); continue; } blkg_destroy(blkg); spin_unlock(&q->queue_lock); } spin_unlock_irq(&blkcg->lock); } /** * blkcg_pin_online - pin online state * @blkcg_css: blkcg of interest * * While pinned, a blkcg is kept online. This is primarily used to * impedance-match blkg and cgwb lifetimes so that blkg doesn't go offline * while an associated cgwb is still active. */ void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css) { refcount_inc(&css_to_blkcg(blkcg_css)->online_pin); } /** * blkcg_unpin_online - unpin online state * @blkcg_css: blkcg of interest * * This is primarily used to impedance-match blkg and cgwb lifetimes so * that blkg doesn't go offline while an associated cgwb is still active. * When this count goes to zero, all active cgwbs have finished so the * blkcg can continue destruction by calling blkcg_destroy_blkgs(). 
*/ void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css) { struct blkcg *blkcg = css_to_blkcg(blkcg_css); do { struct blkcg *parent; if (!refcount_dec_and_test(&blkcg->online_pin)) break; parent = blkcg_parent(blkcg); blkcg_destroy_blkgs(blkcg); blkcg = parent; } while (blkcg); } /** * blkcg_css_offline - cgroup css_offline callback * @css: css of interest * * This function is called when @css is about to go away. Here the cgwbs are * offlined first and only once writeback associated with the blkcg has * finished do we start step 2 (see above). */ static void blkcg_css_offline(struct cgroup_subsys_state *css) { /* this prevents anyone from attaching or migrating to this blkcg */ wb_blkcg_offline(css); /* put the base online pin allowing step 2 to be triggered */ blkcg_unpin_online(css); } static void blkcg_css_free(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); int i; mutex_lock(&blkcg_pol_mutex); list_del(&blkcg->all_blkcgs_node); for (i = 0; i < BLKCG_MAX_POLS; i++) if (blkcg->cpd[i]) blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); mutex_unlock(&blkcg_pol_mutex); free_percpu(blkcg->lhead); kfree(blkcg); } static struct cgroup_subsys_state * blkcg_css_alloc(struct cgroup_subsys_state *parent_css) { struct blkcg *blkcg; int i; mutex_lock(&blkcg_pol_mutex); if (!parent_css) { blkcg = &blkcg_root; } else { blkcg = kzalloc_obj(*blkcg); if (!blkcg) goto unlock; } if (init_blkcg_llists(blkcg)) goto free_blkcg; for (i = 0; i < BLKCG_MAX_POLS ; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkcg_policy_data *cpd; /* * If the policy hasn't been attached yet, wait for it * to be attached before doing anything else. Otherwise, * check if the policy requires any specific per-cgroup * data: if it does, allocate and initialize it. */ if (!pol || !pol->cpd_alloc_fn) continue; cpd = pol->cpd_alloc_fn(GFP_KERNEL); if (!cpd) goto free_pd_blkcg; blkcg->cpd[i] = cpd; cpd->blkcg = blkcg; cpd->plid = i; } spin_lock_init(&blkcg->lock); refcount_set(&blkcg->online_pin, 1); INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT); INIT_HLIST_HEAD(&blkcg->blkg_list); #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&blkcg->cgwb_list); #endif list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs); mutex_unlock(&blkcg_pol_mutex); return &blkcg->css; free_pd_blkcg: for (i--; i >= 0; i--) if (blkcg->cpd[i]) blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); free_percpu(blkcg->lhead); free_blkcg: if (blkcg != &blkcg_root) kfree(blkcg); unlock: mutex_unlock(&blkcg_pol_mutex); return ERR_PTR(-ENOMEM); } static int blkcg_css_online(struct cgroup_subsys_state *css) { struct blkcg *parent = blkcg_parent(css_to_blkcg(css)); /* * blkcg_pin_online() is used to delay blkcg offline so that blkgs * don't go offline while cgwbs are still active on them. Pin the * parent so that offline always happens towards the root. */ if (parent) blkcg_pin_online(&parent->css); return 0; } void blkg_init_queue(struct request_queue *q) { INIT_LIST_HEAD(&q->blkg_list); mutex_init(&q->blkcg_mutex); } int blkcg_init_disk(struct gendisk *disk) { struct request_queue *q = disk->queue; struct blkcg_gq *new_blkg, *blkg; bool preloaded; /* * If the queue is shared across disk rebind (e.g., SCSI), the * previous disk's blkcg state is cleaned up asynchronously via * disk_release() -> blkcg_exit_disk(). Wait for that cleanup to * finish (indicated by root_blkg becoming NULL) before setting up * new blkcg state. 
Otherwise, we may overwrite q->root_blkg while * the old one is still alive, and radix_tree_insert() in * blkg_create() will fail with -EEXIST because the old entries * still occupy the same queue id slot in blkcg->blkg_tree. */ wait_var_event(&q->root_blkg, !READ_ONCE(q->root_blkg)); new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL); if (!new_blkg) return -ENOMEM; preloaded = !radix_tree_preload(GFP_KERNEL); /* Make sure the root blkg exists. */ /* spin_lock_irq can serve as RCU read-side critical section. */ spin_lock_irq(&q->queue_lock); blkg = blkg_create(&blkcg_root, disk, new_blkg); if (IS_ERR(blkg)) goto err_unlock; q->root_blkg = blkg; spin_unlock_irq(&q->queue_lock); if (preloaded) radix_tree_preload_end(); return 0; err_unlock: spin_unlock_irq(&q->queue_lock); if (preloaded) radix_tree_preload_end(); return PTR_ERR(blkg); } void blkcg_exit_disk(struct gendisk *disk) { blkg_destroy_all(disk); blk_throtl_exit(disk); } static void blkcg_exit(struct task_struct *tsk) { if (tsk->throttle_disk) put_disk(tsk->throttle_disk); tsk->throttle_disk = NULL; } struct cgroup_subsys io_cgrp_subsys = { .css_alloc = blkcg_css_alloc, .css_online = blkcg_css_online, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .css_rstat_flush = blkcg_rstat_flush, .dfl_cftypes = blkcg_files, .legacy_cftypes = blkcg_legacy_files, .legacy_name = "blkio", .exit = blkcg_exit, #ifdef CONFIG_MEMCG /* * This ensures that, if available, memcg is automatically enabled * together on the default hierarchy so that the owner cgroup can * be retrieved from writeback pages. */ .depends_on = 1 << memory_cgrp_id, #endif }; EXPORT_SYMBOL_GPL(io_cgrp_subsys); /** * blkcg_activate_policy - activate a blkcg policy on a gendisk * @disk: gendisk of interest * @pol: blkcg policy to activate * * Activate @pol on @disk. Requires %GFP_KERNEL context. @disk goes through * bypass mode to populate its blkgs with policy_data for @pol. * * Activation happens with @disk bypassed, so nobody would be accessing blkgs * from IO path. Update of each blkg is protected by both queue and blkcg * locks so that holding either lock and testing blkcg_policy_enabled() is * always enough for dereferencing policy data. * * The caller is responsible for synchronizing [de]activations and policy * [un]registerations. Returns 0 on success, -errno on failure. */ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { struct request_queue *q = disk->queue; struct blkg_policy_data *pd_prealloc = NULL; struct blkcg_gq *blkg, *pinned_blkg = NULL; unsigned int memflags; int ret; if (blkcg_policy_enabled(q, pol)) return 0; /* * Policy is allowed to be registered without pd_alloc_fn/pd_free_fn, * for example, ioprio. Such policy will work on blkcg level, not disk * level, and don't need to be activated. */ if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn)) return -EINVAL; if (queue_is_mq(q)) memflags = blk_mq_freeze_queue(q); retry: spin_lock_irq(&q->queue_lock); /* blkg_list is pushed at the head, reverse walk to initialize parents first */ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { struct blkg_policy_data *pd; if (blkg->pd[pol->plid]) continue; /* If prealloc matches, use it; otherwise try GFP_NOWAIT */ if (blkg == pinned_blkg) { pd = pd_prealloc; pd_prealloc = NULL; } else { pd = pol->pd_alloc_fn(disk, blkg->blkcg, GFP_NOWAIT); } if (!pd) { /* * GFP_NOWAIT failed. Free the existing one and * prealloc for @blkg w/ GFP_KERNEL. 
*/ if (pinned_blkg) blkg_put(pinned_blkg); blkg_get(blkg); pinned_blkg = blkg; spin_unlock_irq(&q->queue_lock); if (pd_prealloc) pol->pd_free_fn(pd_prealloc); pd_prealloc = pol->pd_alloc_fn(disk, blkg->blkcg, GFP_KERNEL); if (pd_prealloc) goto retry; else goto enomem; } spin_lock(&blkg->blkcg->lock); pd->blkg = blkg; pd->plid = pol->plid; blkg->pd[pol->plid] = pd; if (pol->pd_init_fn) pol->pd_init_fn(pd); if (pol->pd_online_fn) pol->pd_online_fn(pd); pd->online = true; spin_unlock(&blkg->blkcg->lock); } __set_bit(pol->plid, q->blkcg_pols); ret = 0; spin_unlock_irq(&q->queue_lock); out: if (queue_is_mq(q)) blk_mq_unfreeze_queue(q, memflags); if (pinned_blkg) blkg_put(pinned_blkg); if (pd_prealloc) pol->pd_free_fn(pd_prealloc); return ret; enomem: /* alloc failed, take down everything */ spin_lock_irq(&q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; struct blkg_policy_data *pd; spin_lock(&blkcg->lock); pd = blkg->pd[pol->plid]; if (pd) { if (pd->online && pol->pd_offline_fn) pol->pd_offline_fn(pd); pd->online = false; pol->pd_free_fn(pd); blkg->pd[pol->plid] = NULL; } spin_unlock(&blkcg->lock); } spin_unlock_irq(&q->queue_lock); ret = -ENOMEM; goto out; } EXPORT_SYMBOL_GPL(blkcg_activate_policy); /** * blkcg_deactivate_policy - deactivate a blkcg policy on a gendisk * @disk: gendisk of interest * @pol: blkcg policy to deactivate * * Deactivate @pol on @disk. Follows the same synchronization rules as * blkcg_activate_policy(). */ void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { struct request_queue *q = disk->queue; struct blkcg_gq *blkg; unsigned int memflags; if (!blkcg_policy_enabled(q, pol)) return; if (queue_is_mq(q)) memflags = blk_mq_freeze_queue(q); mutex_lock(&q->blkcg_mutex); spin_lock_irq(&q->queue_lock); __clear_bit(pol->plid, q->blkcg_pols); list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; spin_lock(&blkcg->lock); if (blkg->pd[pol->plid]) { if (blkg->pd[pol->plid]->online && pol->pd_offline_fn) pol->pd_offline_fn(blkg->pd[pol->plid]); pol->pd_free_fn(blkg->pd[pol->plid]); blkg->pd[pol->plid] = NULL; } spin_unlock(&blkcg->lock); } spin_unlock_irq(&q->queue_lock); mutex_unlock(&q->blkcg_mutex); if (queue_is_mq(q)) blk_mq_unfreeze_queue(q, memflags); } EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); static void blkcg_free_all_cpd(struct blkcg_policy *pol) { struct blkcg *blkcg; list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { if (blkcg->cpd[pol->plid]) { pol->cpd_free_fn(blkcg->cpd[pol->plid]); blkcg->cpd[pol->plid] = NULL; } } } /** * blkcg_policy_register - register a blkcg policy * @pol: blkcg policy to register * * Register @pol with blkcg core. Might sleep and @pol may be modified on * successful registration. Returns 0 on success and -errno on failure. */ int blkcg_policy_register(struct blkcg_policy *pol) { struct blkcg *blkcg; int i, ret; /* * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs, and policy * without pd_alloc_fn/pd_free_fn can't be activated. 
*/ if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) return -EINVAL; mutex_lock(&blkcg_pol_register_mutex); mutex_lock(&blkcg_pol_mutex); /* find an empty slot */ for (i = 0; i < BLKCG_MAX_POLS; i++) if (!blkcg_policy[i]) break; if (i >= BLKCG_MAX_POLS) { pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n"); ret = -ENOSPC; goto err_unlock; } /* register @pol */ pol->plid = i; blkcg_policy[pol->plid] = pol; /* allocate and install cpd's */ if (pol->cpd_alloc_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { struct blkcg_policy_data *cpd; cpd = pol->cpd_alloc_fn(GFP_KERNEL); if (!cpd) { ret = -ENOMEM; goto err_free_cpds; } blkcg->cpd[pol->plid] = cpd; cpd->blkcg = blkcg; cpd->plid = pol->plid; } } mutex_unlock(&blkcg_pol_mutex); /* everything is in place, add intf files for the new policy */ if (pol->dfl_cftypes == pol->legacy_cftypes) { WARN_ON(cgroup_add_cftypes(&io_cgrp_subsys, pol->dfl_cftypes)); } else { WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys, pol->dfl_cftypes)); WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, pol->legacy_cftypes)); } mutex_unlock(&blkcg_pol_register_mutex); return 0; err_free_cpds: if (pol->cpd_free_fn) blkcg_free_all_cpd(pol); blkcg_policy[pol->plid] = NULL; err_unlock: mutex_unlock(&blkcg_pol_mutex); mutex_unlock(&blkcg_pol_register_mutex); return ret; } EXPORT_SYMBOL_GPL(blkcg_policy_register); /** * blkcg_policy_unregister - unregister a blkcg policy * @pol: blkcg policy to unregister * * Undo blkcg_policy_register(@pol). Might sleep. */ void blkcg_policy_unregister(struct blkcg_policy *pol) { mutex_lock(&blkcg_pol_register_mutex); if (WARN_ON(blkcg_policy[pol->plid] != pol)) goto out_unlock; /* kill the intf files first */ if (pol->dfl_cftypes) cgroup_rm_cftypes(pol->dfl_cftypes); if (pol->legacy_cftypes) cgroup_rm_cftypes(pol->legacy_cftypes); /* remove cpds and unregister */ mutex_lock(&blkcg_pol_mutex); if (pol->cpd_free_fn) blkcg_free_all_cpd(pol); blkcg_policy[pol->plid] = NULL; mutex_unlock(&blkcg_pol_mutex); out_unlock: mutex_unlock(&blkcg_pol_register_mutex); } EXPORT_SYMBOL_GPL(blkcg_policy_unregister); /* * Scale the accumulated delay based on how long it has been since we updated * the delay. We only call this when we are adding delay, in case it's been a * while since we added delay, and when we are checking to see if we need to * delay a task, to account for any delays that may have occurred. */ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) { u64 old = atomic64_read(&blkg->delay_start); /* negative use_delay means no scaling, see blkcg_set_delay() */ if (atomic_read(&blkg->use_delay) < 0) return; /* * We only want to scale down every second. The idea here is that we * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain * time window. We only want to throttle tasks for recent delay that * has occurred, in 1 second time windows since that's the maximum * things can be throttled. We save the current delay window in * blkg->last_delay so we know what amount is still left to be charged * to the blkg from this point onward. blkg->last_use keeps track of * the use_delay counter. The idea is if we're unthrottling the blkg we * are ok with whatever is happening now, and we can take away more of * the accumulated delay as we've already throttled enough that * everybody is happy with their IO latencies. 
*/ if (time_before64(old + NSEC_PER_SEC, now) && atomic64_try_cmpxchg(&blkg->delay_start, &old, now)) { u64 cur = atomic64_read(&blkg->delay_nsec); u64 sub = min_t(u64, blkg->last_delay, now - old); int cur_use = atomic_read(&blkg->use_delay); /* * We've been unthrottled, subtract a larger chunk of our * accumulated delay. */ if (cur_use < blkg->last_use) sub = max_t(u64, sub, blkg->last_delay >> 1); /* * This shouldn't happen, but handle it anyway. Our delay_nsec * should only ever be growing except here where we subtract out * min(last_delay, 1 second), but lord knows bugs happen and I'd * rather not end up with negative numbers. */ if (unlikely(cur < sub)) { atomic64_set(&blkg->delay_nsec, 0); blkg->last_delay = 0; } else { atomic64_sub(sub, &blkg->delay_nsec); blkg->last_delay = cur - sub; } blkg->last_use = cur_use; } } /* * This is called when we want to actually walk up the hierarchy and check to * see if we need to throttle, and then actually throttle if there is some * accumulated delay. This should only be called upon return to user space so * we're not holding some lock that would induce a priority inversion. */ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) { unsigned long pflags; bool clamp; u64 now = blk_time_get_ns(); u64 exp; u64 delay_nsec = 0; int tok; while (blkg->parent) { int use_delay = atomic_read(&blkg->use_delay); if (use_delay) { u64 this_delay; blkcg_scale_delay(blkg, now); this_delay = atomic64_read(&blkg->delay_nsec); if (this_delay > delay_nsec) { delay_nsec = this_delay; clamp = use_delay > 0; } } blkg = blkg->parent; } if (!delay_nsec) return; /* * Let's not sleep for all eternity if we've amassed a huge delay. * Swapping or metadata IO can accumulate 10's of seconds worth of * delay, and we want userspace to be able to do _something_ so cap the * delays at 0.25s. If there's 10's of seconds worth of delay then the * tasks will be delayed for 0.25 second for every syscall. If * blkcg_set_delay() was used as indicated by negative use_delay, the * caller is responsible for regulating the range. */ if (clamp) delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); if (use_memdelay) psi_memstall_enter(&pflags); exp = ktime_add_ns(now, delay_nsec); tok = io_schedule_prepare(); do { __set_current_state(TASK_KILLABLE); if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS)) break; } while (!fatal_signal_pending(current)); io_schedule_finish(tok); if (use_memdelay) psi_memstall_leave(&pflags); } /** * blkcg_maybe_throttle_current - throttle the current task if it has been marked * * This is only called if we've been marked with set_notify_resume(). Obviously * we can be set_notify_resume() for reasons other than blkcg throttling, so we * check to see if current->throttle_disk is set and if not this doesn't do * anything. This should only ever be called by the resume code, it's not meant * to be called by people willy-nilly as it will actually do the work to * throttle the task if it is setup for throttling. 
*/ void blkcg_maybe_throttle_current(void) { struct gendisk *disk = current->throttle_disk; struct blkcg *blkcg; struct blkcg_gq *blkg; bool use_memdelay = current->use_memdelay; if (!disk) return; current->throttle_disk = NULL; current->use_memdelay = false; rcu_read_lock(); blkcg = css_to_blkcg(blkcg_css()); if (!blkcg) goto out; blkg = blkg_lookup(blkcg, disk->queue); if (!blkg) goto out; if (!blkg_tryget(blkg)) goto out; rcu_read_unlock(); blkcg_maybe_throttle_blkg(blkg, use_memdelay); blkg_put(blkg); put_disk(disk); return; out: rcu_read_unlock(); put_disk(disk); } /** * blkcg_schedule_throttle - this task needs to check for throttling * @disk: disk to throttle * @use_memdelay: do we charge this to memory delay for PSI * * This is called by the IO controller when we know there's delay accumulated * for the blkg for this task. We do not pass the blkg because there are places * we call this that may not have that information, the swapping code for * instance will only have a block_device at that point. This set's the * notify_resume for the task to check and see if it requires throttling before * returning to user space. * * We will only schedule once per syscall. You can call this over and over * again and it will only do the check once upon return to user space, and only * throttle once. If the task needs to be throttled again it'll need to be * re-set at the next time we see the task. */ void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay) { if (unlikely(current->flags & PF_KTHREAD)) return; if (current->throttle_disk != disk) { if (test_bit(GD_DEAD, &disk->state)) return; get_device(disk_to_dev(disk)); if (current->throttle_disk) put_disk(current->throttle_disk); current->throttle_disk = disk; } if (use_memdelay) current->use_memdelay = use_memdelay; set_notify_resume(current); } /** * blkcg_add_delay - add delay to this blkg * @blkg: blkg of interest * @now: the current time in nanoseconds * @delta: how many nanoseconds of delay to add * * Charge @delta to the blkg's current delay accumulation. This is used to * throttle tasks if an IO controller thinks we need more throttling. */ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) { if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) return; blkcg_scale_delay(blkg, now); atomic64_add(delta, &blkg->delay_nsec); } /** * blkg_tryget_closest - try and get a blkg ref on the closet blkg * @bio: target bio * @css: target css * * As the failure mode here is to walk up the blkg tree, this ensure that the * blkg->parent pointers are always valid. This returns the blkg that it ended * up taking a reference on or %NULL if no reference was taken. */ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio, struct cgroup_subsys_state *css) { struct blkcg_gq *blkg, *ret_blkg = NULL; rcu_read_lock(); blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_bdev->bd_disk); while (blkg) { if (blkg_tryget(blkg)) { ret_blkg = blkg; break; } blkg = blkg->parent; } rcu_read_unlock(); return ret_blkg; } /** * bio_associate_blkg_from_css - associate a bio with a specified css * @bio: target bio * @css: target css * * Associate @bio with the blkg found by combining the css's blkg and the * request_queue of the @bio. An association failure is handled by walking up * the blkg tree. Therefore, the blkg associated can be anything between @blkg * and q->root_blkg. This situation only happens when a cgroup is dying and * then the remaining bios will spill to the closest alive blkg. 
* * A reference will be taken on the blkg and will be released when @bio is * freed. */ void bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css) { if (bio->bi_blkg) blkg_put(bio->bi_blkg); if (css && css->parent) { bio->bi_blkg = blkg_tryget_closest(bio, css); } else { blkg_get(bdev_get_queue(bio->bi_bdev)->root_blkg); bio->bi_blkg = bdev_get_queue(bio->bi_bdev)->root_blkg; } } EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); /** * bio_associate_blkg - associate a bio with a blkg * @bio: target bio * * Associate @bio with the blkg found from the bio's css and request_queue. * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is * already associated, the css is reused and association redone as the * request_queue may have changed. */ void bio_associate_blkg(struct bio *bio) { struct cgroup_subsys_state *css; if (blk_op_is_passthrough(bio->bi_opf)) return; rcu_read_lock(); if (bio->bi_blkg) css = bio_blkcg_css(bio); else css = blkcg_css(); bio_associate_blkg_from_css(bio, css); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(bio_associate_blkg); /** * bio_clone_blkg_association - clone blkg association from src to dst bio * @dst: destination bio * @src: source bio */ void bio_clone_blkg_association(struct bio *dst, struct bio *src) { if (src->bi_blkg) bio_associate_blkg_from_css(dst, bio_blkcg_css(src)); } EXPORT_SYMBOL_GPL(bio_clone_blkg_association); static int blk_cgroup_io_type(struct bio *bio) { if (op_is_discard(bio->bi_opf)) return BLKG_IOSTAT_DISCARD; if (op_is_write(bio->bi_opf)) return BLKG_IOSTAT_WRITE; return BLKG_IOSTAT_READ; } void blk_cgroup_bio_start(struct bio *bio) { struct blkcg *blkcg = bio->bi_blkg->blkcg; int rwd = blk_cgroup_io_type(bio), cpu; struct blkg_iostat_set *bis; unsigned long flags; if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) return; /* Root-level stats are sourced from system-wide IO stats */ if (!cgroup_parent(blkcg->css.cgroup)) return; cpu = get_cpu(); bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu); flags = u64_stats_update_begin_irqsave(&bis->sync); /* * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split * bio and we would have already accounted for the size of the bio. */ if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { bio_set_flag(bio, BIO_CGROUP_ACCT); bis->cur.bytes[rwd] += bio->bi_iter.bi_size; } bis->cur.ios[rwd]++; /* * If the iostat_cpu isn't in a lockless list, put it into the * list to indicate that a stat update is pending. */ if (!READ_ONCE(bis->lqueued)) { struct llist_head *lhead = this_cpu_ptr(blkcg->lhead); llist_add(&bis->lnode, lhead); WRITE_ONCE(bis->lqueued, true); } u64_stats_update_end_irqrestore(&bis->sync, flags); css_rstat_updated(&blkcg->css, cpu); put_cpu(); } bool blk_cgroup_congested(void) { struct blkcg *blkcg; bool ret = false; rcu_read_lock(); for (blkcg = css_to_blkcg(blkcg_css()); blkcg; blkcg = blkcg_parent(blkcg)) { if (atomic_read(&blkcg->congestion_count)) { ret = true; break; } } rcu_read_unlock(); return ret; } module_param(blkcg_debug_stats, bool, 0644); MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not"); |
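/*
 * Illustrative sketch, not part of the file above: roughly how an IO
 * controller is expected to combine blkcg_add_delay() and
 * blkcg_schedule_throttle().  The helper name and the way @delta_ns is
 * computed are assumptions for the example; it also assumes the blk-cgroup
 * context where struct blkcg_gq and blk_time_get_ns() are visible.
 */
static void example_blkcg_penalize(struct blkcg_gq *blkg, struct gendisk *disk,
				   u64 delta_ns)
{
	/* Fold the extra latency into the group's accumulated delay. */
	blkcg_add_delay(blkg, blk_time_get_ns(), delta_ns);

	/*
	 * Ask the current task to check for throttling on its way back to
	 * user space; charge the stall to PSI memory pressure as well.
	 */
	blkcg_schedule_throttle(disk, true);
}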
| 2 2 2 1 1 1 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 | // SPDX-License-Identifier: GPL-2.0-only /* * ratelimit.c - Do something with rate limit. * * Isolated from kernel/printk.c by Dave Young <hidave.darkstar@gmail.com> * * 2008-05-01 rewrite the function and use a ratelimit_state data struct as * parameter. Now every user can use their own standalone ratelimit_state. */ #include <linux/ratelimit.h> #include <linux/jiffies.h> #include <linux/export.h> /* * __ratelimit - rate limiting * @rs: ratelimit_state data * @func: name of calling function * * This enforces a rate limit: not more than @rs->burst callbacks * in every @rs->interval * * RETURNS: * 0 means callbacks will be suppressed. * 1 means go ahead and do it. */ int ___ratelimit(struct ratelimit_state *rs, const char *func) { /* Paired with WRITE_ONCE() in .proc_handler(). * Changing two values separately could be inconsistent * and some message could be lost. (See: net_ratelimit_state). */ int interval = READ_ONCE(rs->interval); int burst = READ_ONCE(rs->burst); unsigned long flags; int ret = 0; /* * Zero interval says never limit, otherwise, non-positive burst * says always limit. */ if (interval <= 0 || burst <= 0) { WARN_ONCE(interval < 0 || burst < 0, "Negative interval (%d) or burst (%d): Uninitialized ratelimit_state structure?\n", interval, burst); ret = interval == 0 || burst > 0; if (!(READ_ONCE(rs->flags) & RATELIMIT_INITIALIZED) || (!interval && !burst) || !raw_spin_trylock_irqsave(&rs->lock, flags)) goto nolock_ret; /* Force re-initialization once re-enabled. */ rs->flags &= ~RATELIMIT_INITIALIZED; goto unlock_ret; } /* * If we contend on this state's lock then just check if * the current burst is used or not. It might cause * false positive when we are past the interval and * the current lock owner is just about to reset it. */ if (!raw_spin_trylock_irqsave(&rs->lock, flags)) { if (READ_ONCE(rs->flags) & RATELIMIT_INITIALIZED && atomic_read(&rs->rs_n_left) > 0 && atomic_dec_return(&rs->rs_n_left) >= 0) ret = 1; goto nolock_ret; } if (!(rs->flags & RATELIMIT_INITIALIZED)) { rs->begin = jiffies; rs->flags |= RATELIMIT_INITIALIZED; atomic_set(&rs->rs_n_left, rs->burst); } if (time_is_before_jiffies(rs->begin + interval)) { int m; /* * Reset rs_n_left ASAP to reduce false positives * in parallel calls, see above. */ atomic_set(&rs->rs_n_left, rs->burst); rs->begin = jiffies; if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) { m = ratelimit_state_reset_miss(rs); if (m) { printk_deferred(KERN_WARNING "%s: %d callbacks suppressed\n", func, m); } } } /* Note that the burst might be taken by a parallel call. */ if (atomic_read(&rs->rs_n_left) > 0 && atomic_dec_return(&rs->rs_n_left) >= 0) ret = 1; unlock_ret: raw_spin_unlock_irqrestore(&rs->lock, flags); nolock_ret: if (!ret) ratelimit_state_inc_miss(rs); return ret; } EXPORT_SYMBOL(___ratelimit); |
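/*
 * Illustrative sketch, not part of the file above: the usual way a caller
 * consumes this rate limit.  DEFINE_RATELIMIT_STATE() declares the state and
 * __ratelimit() expands to ___ratelimit(state, __func__).  The state name,
 * interval/burst values and message are assumptions for the example.
 */
#include <linux/jiffies.h>
#include <linux/printk.h>
#include <linux/ratelimit.h>

static DEFINE_RATELIMIT_STATE(example_rs, 5 * HZ, 10);	/* 10 hits per 5s */

static void example_report_error(int err)
{
	/* ___ratelimit() returns nonzero when the callback may proceed. */
	if (__ratelimit(&example_rs))
		pr_warn("example: operation failed with error %d\n", err);
}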
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 | // SPDX-License-Identifier: GPL-2.0 /* XSKMAP used for AF_XDP sockets * Copyright(c) 2018 Intel Corporation. */ #include <linux/bpf.h> #include <linux/filter.h> #include <net/xdp_sock.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/btf_ids.h> #include "xsk.h" static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, struct xdp_sock __rcu **map_entry) { struct xsk_map_node *node; node = bpf_map_kzalloc(&map->map, sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); if (!node) return ERR_PTR(-ENOMEM); bpf_map_inc(&map->map); atomic_inc(&map->count); node->map = map; node->map_entry = map_entry; return node; } static void xsk_map_node_free(struct xsk_map_node *node) { struct xsk_map *map = node->map; bpf_map_put(&node->map->map); kfree(node); atomic_dec(&map->count); } static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node) { spin_lock_bh(&xs->map_list_lock); list_add_tail(&node->node, &xs->map_list); spin_unlock_bh(&xs->map_list_lock); } static void xsk_map_sock_delete(struct xdp_sock *xs, struct xdp_sock __rcu **map_entry) { struct xsk_map_node *n, *tmp; spin_lock_bh(&xs->map_list_lock); list_for_each_entry_safe(n, tmp, &xs->map_list, node) { if (map_entry == n->map_entry) { list_del(&n->node); xsk_map_node_free(n); } } spin_unlock_bh(&xs->map_list_lock); } static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) { struct xsk_map *m; int numa_node; u64 size; if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size != 4 || attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)) return ERR_PTR(-EINVAL); numa_node = bpf_map_attr_numa_node(attr); size = struct_size(m, xsk_map, attr->max_entries); m = bpf_map_area_alloc(size, numa_node); if (!m) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&m->map, attr); spin_lock_init(&m->lock); return &m->map; } static u64 xsk_map_mem_usage(const struct bpf_map *map) { struct xsk_map *m = container_of(map, struct xsk_map, map); return struct_size(m, xsk_map, map->max_entries) + (u64)atomic_read(&m->count) * sizeof(struct xsk_map_node); } static void xsk_map_free(struct bpf_map *map) { struct xsk_map *m = container_of(map, struct xsk_map, map); synchronize_net(); bpf_map_area_free(m); } static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct xsk_map *m = container_of(map, struct xsk_map, map); u32 index = key ? 
*(u32 *)key : U32_MAX; u32 *next = next_key; if (index >= m->map.max_entries) { *next = 0; return 0; } if (index == m->map.max_entries - 1) return -ENOENT; *next = index + 1; return 0; } static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2; struct bpf_insn *insn = insn_buf; *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *))); *insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map)); *insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0); *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); *insn++ = BPF_MOV64_IMM(ret, 0); return insn - insn_buf; } /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or * by local_bh_disable() (from XDP calls inside NAPI). The * rcu_read_lock_bh_held() below makes lockdep accept both. */ static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) { struct xsk_map *m = container_of(map, struct xsk_map, map); if (key >= map->max_entries) return NULL; return rcu_dereference_check(m->xsk_map[key], rcu_read_lock_bh_held()); } static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) { return __xsk_map_lookup_elem(map, *(u32 *)key); } static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key) { return ERR_PTR(-EOPNOTSUPP); } static long xsk_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct xsk_map *m = container_of(map, struct xsk_map, map); struct xdp_sock __rcu **map_entry; struct xdp_sock *xs, *old_xs; u32 i = *(u32 *)key, fd = *(u32 *)value; struct xsk_map_node *node; struct socket *sock; int err; if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; if (unlikely(i >= m->map.max_entries)) return -E2BIG; sock = sockfd_lookup(fd, &err); if (!sock) return err; if (sock->sk->sk_family != PF_XDP) { sockfd_put(sock); return -EOPNOTSUPP; } xs = (struct xdp_sock *)sock->sk; map_entry = &m->xsk_map[i]; node = xsk_map_node_alloc(m, map_entry); if (IS_ERR(node)) { sockfd_put(sock); return PTR_ERR(node); } spin_lock_bh(&m->lock); old_xs = rcu_dereference_protected(*map_entry, lockdep_is_held(&m->lock)); if (old_xs == xs) { err = 0; goto out; } else if (old_xs && map_flags == BPF_NOEXIST) { err = -EEXIST; goto out; } else if (!old_xs && map_flags == BPF_EXIST) { err = -ENOENT; goto out; } xsk_map_sock_add(xs, node); rcu_assign_pointer(*map_entry, xs); if (old_xs) xsk_map_sock_delete(old_xs, map_entry); spin_unlock_bh(&m->lock); sockfd_put(sock); return 0; out: spin_unlock_bh(&m->lock); sockfd_put(sock); xsk_map_node_free(node); return err; } static long xsk_map_delete_elem(struct bpf_map *map, void *key) { struct xsk_map *m = container_of(map, struct xsk_map, map); struct xdp_sock __rcu **map_entry; struct xdp_sock *old_xs; u32 k = *(u32 *)key; if (k >= map->max_entries) return -EINVAL; spin_lock_bh(&m->lock); map_entry = &m->xsk_map[k]; old_xs = unrcu_pointer(xchg(map_entry, NULL)); if (old_xs) xsk_map_sock_delete(old_xs, map_entry); spin_unlock_bh(&m->lock); return 0; } static long xsk_map_redirect(struct bpf_map *map, u64 index, u64 flags) { return __bpf_xdp_redirect_map(map, index, flags, 0, __xsk_map_lookup_elem); } void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, struct xdp_sock __rcu **map_entry) { spin_lock_bh(&map->lock); if (rcu_access_pointer(*map_entry) == xs) { rcu_assign_pointer(*map_entry, 
NULL); xsk_map_sock_delete(xs, map_entry); } spin_unlock_bh(&map->lock); } static bool xsk_map_meta_equal(const struct bpf_map *meta0, const struct bpf_map *meta1) { return meta0->max_entries == meta1->max_entries && bpf_map_meta_equal(meta0, meta1); } BTF_ID_LIST_SINGLE(xsk_map_btf_ids, struct, xsk_map) const struct bpf_map_ops xsk_map_ops = { .map_meta_equal = xsk_map_meta_equal, .map_alloc = xsk_map_alloc, .map_free = xsk_map_free, .map_get_next_key = xsk_map_get_next_key, .map_lookup_elem = xsk_map_lookup_elem, .map_gen_lookup = xsk_map_gen_lookup, .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only, .map_update_elem = xsk_map_update_elem, .map_delete_elem = xsk_map_delete_elem, .map_check_btf = map_check_no_btf, .map_mem_usage = xsk_map_mem_usage, .map_btf_id = &xsk_map_btf_ids[0], .map_redirect = xsk_map_redirect, }; |
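/*
 * Illustrative sketch, not part of the file above: a minimal XDP program that
 * uses an XSKMAP from the BPF side.  xsk_map_redirect()/__xsk_map_lookup_elem()
 * above back the bpf_redirect_map() helper used here.  The map name, size and
 * section names are assumptions for the example (libbpf-style BPF C).
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_XSKMAP);
	__uint(max_entries, 64);
	__type(key, __u32);
	__type(value, __u32);
} xsks_map SEC(".maps");

SEC("xdp")
int xdp_redirect_xsk(struct xdp_md *ctx)
{
	/*
	 * Redirect the frame to the AF_XDP socket bound to this RX queue;
	 * XDP_PASS is the fallback action if the map slot is empty.
	 */
	return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, XDP_PASS);
}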
| 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Multipath TCP * * Copyright (c) 2017 - 2019, Intel Corporation. */ #ifndef __NET_MPTCP_H #define __NET_MPTCP_H #include <linux/skbuff.h> #include <linux/tcp.h> #include <linux/types.h> struct mptcp_info; struct mptcp_sock; struct mptcp_pm_addr_entry; struct seq_file; /* MPTCP sk_buff extension data */ struct mptcp_ext { union { u64 data_ack; u32 data_ack32; }; u64 data_seq; u32 subflow_seq; u16 data_len; __sum16 csum; u8 use_map:1, dsn64:1, data_fin:1, use_ack:1, ack64:1, mpc_map:1, frozen:1, reset_transient:1; u8 reset_reason:4, csum_reqd:1, infinite_map:1; }; #define MPTCPOPT_HMAC_LEN 20 #define MPTCP_RM_IDS_MAX 8 struct mptcp_rm_list { u8 ids[MPTCP_RM_IDS_MAX]; u8 nr; }; struct mptcp_addr_info { u8 id; sa_family_t family; __be16 port; union { struct in_addr addr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) struct in6_addr addr6; #endif }; }; struct mptcp_out_options { #if IS_ENABLED(CONFIG_MPTCP) u16 suboptions; struct mptcp_rm_list rm_list; u8 join_id; u8 backup; u8 reset_reason:4, reset_transient:1, csum_reqd:1, allow_join_id0:1; union { struct { u64 sndr_key; u64 rcvr_key; u64 data_seq; u32 subflow_seq; u16 data_len; __sum16 csum; }; struct { struct mptcp_addr_info addr; u64 ahmac; }; struct { struct mptcp_ext ext_copy; u64 fail_seq; }; struct { u32 nonce; u32 token; u64 thmac; u8 hmac[MPTCPOPT_HMAC_LEN]; }; }; #endif }; #define MPTCP_SCHED_NAME_MAX 16 #define MPTCP_SCHED_MAX 128 #define MPTCP_SCHED_BUF_MAX (MPTCP_SCHED_NAME_MAX * MPTCP_SCHED_MAX) struct mptcp_sched_ops { int (*get_send)(struct mptcp_sock *msk); int (*get_retrans)(struct mptcp_sock *msk); char name[MPTCP_SCHED_NAME_MAX]; struct module *owner; struct list_head list; void (*init)(struct mptcp_sock *msk); void (*release)(struct mptcp_sock *msk); } ____cacheline_aligned_in_smp; #define MPTCP_PM_NAME_MAX 16 #define MPTCP_PM_MAX 128 #define MPTCP_PM_BUF_MAX (MPTCP_PM_NAME_MAX * MPTCP_PM_MAX) struct mptcp_pm_ops { char name[MPTCP_PM_NAME_MAX]; struct module *owner; struct list_head list; void (*init)(struct mptcp_sock *msk); void (*release)(struct mptcp_sock *msk); } ____cacheline_aligned_in_smp; #ifdef CONFIG_MPTCP void mptcp_init(void); static inline bool sk_is_mptcp(const struct sock *sk) { return tcp_sk(sk)->is_mptcp; } static inline bool 
rsk_is_mptcp(const struct request_sock *req) { return tcp_rsk(req)->is_mptcp; } static inline bool rsk_drop_req(const struct request_sock *req) { return tcp_rsk(req)->is_mptcp && tcp_rsk(req)->drop_req; } void mptcp_space(const struct sock *ssk, int *space, int *full_space); bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, unsigned int *size, struct mptcp_out_options *opts); bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, struct mptcp_out_options *opts); bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts); bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb); void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp, struct mptcp_out_options *opts); void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info); /* move the skb extension owership, with the assumption that 'to' is * newly allocated */ static inline void mptcp_skb_ext_move(struct sk_buff *to, struct sk_buff *from) { if (!skb_ext_exist(from, SKB_EXT_MPTCP)) return; if (WARN_ON_ONCE(to->active_extensions)) skb_ext_put(to); to->active_extensions = from->active_extensions; to->extensions = from->extensions; from->active_extensions = 0; } static inline void mptcp_skb_ext_copy(struct sk_buff *to, struct sk_buff *from) { struct mptcp_ext *from_ext; from_ext = skb_ext_find(from, SKB_EXT_MPTCP); if (!from_ext) return; from_ext->frozen = 1; skb_ext_copy(to, from); } static inline bool mptcp_ext_matches(const struct mptcp_ext *to_ext, const struct mptcp_ext *from_ext) { /* MPTCP always clears the ext when adding it to the skb, so * holes do not bother us here */ return !from_ext || (to_ext && from_ext && !memcmp(from_ext, to_ext, sizeof(struct mptcp_ext))); } /* check if skbs can be collapsed. * MPTCP collapse is allowed if neither @to or @from carry an mptcp data * mapping, or if the extension of @to is the same as @from. * Collapsing is not possible if @to lacks an extension, but @from carries one. 
*/ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { return mptcp_ext_matches(skb_ext_find(to, SKB_EXT_MPTCP), skb_ext_find(from, SKB_EXT_MPTCP)); } void mptcp_seq_show(struct seq_file *seq); int mptcp_subflow_init_cookie_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb); struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener); __be32 mptcp_get_reset_option(const struct sk_buff *skb); static inline __be32 mptcp_reset_option(const struct sk_buff *skb) { if (skb_ext_exist(skb, SKB_EXT_MPTCP)) return mptcp_get_reset_option(skb); return htonl(0u); } void mptcp_active_detect_blackhole(struct sock *sk, bool expired); #else static inline void mptcp_init(void) { } static inline bool sk_is_mptcp(const struct sock *sk) { return false; } static inline bool rsk_is_mptcp(const struct request_sock *req) { return false; } static inline bool rsk_drop_req(const struct request_sock *req) { return false; } static inline bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, unsigned int *size, struct mptcp_out_options *opts) { return false; } static inline bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, struct mptcp_out_options *opts) { return false; } static inline bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { return false; } static inline bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) { return true; } static inline void mptcp_skb_ext_move(struct sk_buff *to, const struct sk_buff *from) { } static inline void mptcp_skb_ext_copy(struct sk_buff *to, struct sk_buff *from) { } static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { return true; } static inline void mptcp_space(const struct sock *ssk, int *s, int *fs) { } static inline void mptcp_seq_show(struct seq_file *seq) { } static inline int mptcp_subflow_init_cookie_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) { return 0; /* TCP fallback */ } static inline struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener) { return NULL; } static inline __be32 mptcp_reset_option(const struct sk_buff *skb) { return htonl(0u); } static inline void mptcp_active_detect_blackhole(struct sock *sk, bool expired) { } #endif /* CONFIG_MPTCP */ #if IS_ENABLED(CONFIG_MPTCP_IPV6) int mptcpv6_init(void); void mptcpv6_handle_mapped(struct sock *sk, bool mapped); #elif IS_ENABLED(CONFIG_IPV6) static inline int mptcpv6_init(void) { return 0; } static inline void mptcpv6_handle_mapped(struct sock *sk, bool mapped) { } #endif #if defined(CONFIG_MPTCP) && defined(CONFIG_BPF_SYSCALL) struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk); #else static inline struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk) { return NULL; } #endif #if !IS_ENABLED(CONFIG_MPTCP) struct mptcp_sock { }; #endif #endif /* __NET_MPTCP_H */ |
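/*
 * Illustrative sketch, not part of the header above: the shape of a caller in
 * the TCP option-writing path.  The wrapper name and surrounding setup are
 * assumptions for the example; only sk_is_mptcp() and
 * mptcp_established_options() are taken from the declarations above.
 */
static unsigned int example_reserve_mptcp_space(struct sock *sk,
						struct sk_buff *skb,
						unsigned int remaining,
						struct mptcp_out_options *opts)
{
	unsigned int size = 0;

	/* Only MPTCP subflows get MPTCP options appended to the TCP header. */
	if (sk_is_mptcp(sk) &&
	    mptcp_established_options(sk, skb, &size, remaining, opts))
		return size;

	return 0;
}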
// SPDX-License-Identifier: GPL-2.0 /* *
HugeTLB Vmemmap Optimization (HVO) * * Copyright (c) 2020, ByteDance. All rights reserved. * * Author: Muchun Song <songmuchun@bytedance.com> * * See Documentation/mm/vmemmap_dedup.rst */ #define pr_fmt(fmt) "HugeTLB: " fmt #include <linux/pgtable.h> #include <linux/moduleparam.h> #include <linux/bootmem_info.h> #include <linux/mmdebug.h> #include <linux/pagewalk.h> #include <linux/pgalloc.h> #include <asm/tlbflush.h> #include "hugetlb_vmemmap.h" #include "internal.h" /** * struct vmemmap_remap_walk - walk vmemmap page table * * @remap_pte: called for each lowest-level entry (PTE). * @nr_walked: the number of walked pte. * @vmemmap_head: the page to be installed as first in the vmemmap range * @vmemmap_tail: the page to be installed as non-first in the vmemmap range * @vmemmap_pages: the list head of the vmemmap pages that can be freed * or is mapped from. * @flags: used to modify behavior in vmemmap page table walking * operations. */ struct vmemmap_remap_walk { void (*remap_pte)(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk); unsigned long nr_walked; struct page *vmemmap_head; struct page *vmemmap_tail; struct list_head *vmemmap_pages; /* Skip the TLB flush when we split the PMD */ #define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0) /* Skip the TLB flush when we remap the PTE */ #define VMEMMAP_REMAP_NO_TLB_FLUSH BIT(1) unsigned long flags; }; static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start, struct vmemmap_remap_walk *walk) { pmd_t __pmd; int i; unsigned long addr = start; pte_t *pgtable; pgtable = pte_alloc_one_kernel(&init_mm); if (!pgtable) return -ENOMEM; pmd_populate_kernel(&init_mm, &__pmd, pgtable); for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { pte_t entry, *pte; pgprot_t pgprot = PAGE_KERNEL; entry = mk_pte(head + i, pgprot); pte = pte_offset_kernel(&__pmd, addr); set_pte_at(&init_mm, addr, pte, entry); } spin_lock(&init_mm.page_table_lock); if (likely(pmd_leaf(*pmd))) { /* * Higher order allocations from buddy allocator must be able to * be treated as independent small pages (as they can be freed * individually). */ if (!PageReserved(head)) split_page(head, get_order(PMD_SIZE)); /* Make pte visible before pmd. See comment in pmd_install(). */ smp_wmb(); pmd_populate_kernel(&init_mm, pmd, pgtable); if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH)) flush_tlb_kernel_range(start, start + PMD_SIZE); } else { pte_free_kernel(&init_mm, pgtable); } spin_unlock(&init_mm.page_table_lock); return 0; } static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { int ret = 0; struct page *head; struct vmemmap_remap_walk *vmemmap_walk = walk->private; /* Only splitting, not remapping the vmemmap pages. */ if (!vmemmap_walk->remap_pte) walk->action = ACTION_CONTINUE; spin_lock(&init_mm.page_table_lock); head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL; /* * Due to HugeTLB alignment requirements and the vmemmap * pages being at the start of the hotplugged memory * region in memory_hotplug.memmap_on_memory case. Checking * the vmemmap page associated with the first vmemmap page * if it is self-hosted is sufficient. * * [ hotplugged memory ] * [ section ][...][ section ] * [ vmemmap ][ usable memory ] * ^ | ^ | * +--+ | | * +------------------------+ */ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) { struct page *page = head ? 
head + pte_index(addr) : pte_page(ptep_get(pte_offset_kernel(pmd, addr))); if (PageVmemmapSelfHosted(page)) ret = -ENOTSUPP; } spin_unlock(&init_mm.page_table_lock); if (!head || ret) return ret; return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk); } static int vmemmap_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { struct vmemmap_remap_walk *vmemmap_walk = walk->private; vmemmap_walk->remap_pte(pte, addr, vmemmap_walk); vmemmap_walk->nr_walked++; return 0; } static const struct mm_walk_ops vmemmap_remap_ops = { .pmd_entry = vmemmap_pmd_entry, .pte_entry = vmemmap_pte_entry, }; static int vmemmap_remap_range(unsigned long start, unsigned long end, struct vmemmap_remap_walk *walk) { int ret; VM_BUG_ON(!PAGE_ALIGNED(start | end)); mmap_read_lock(&init_mm); ret = walk_kernel_page_table_range(start, end, &vmemmap_remap_ops, NULL, walk); mmap_read_unlock(&init_mm); if (ret) return ret; if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH)) flush_tlb_kernel_range(start, end); return 0; } /* * Free a vmemmap page. A vmemmap page can be allocated from the memblock * allocator or buddy allocator. If the PG_reserved flag is set, it means * that it allocated from the memblock allocator, just free it via the * free_bootmem_page(). Otherwise, use __free_page(). */ static inline void free_vmemmap_page(struct page *page) { if (PageReserved(page)) { memmap_boot_pages_add(-1); free_bootmem_page(page); } else { memmap_pages_add(-1); __free_page(page); } } /* Free a list of the vmemmap pages */ static void free_vmemmap_page_list(struct list_head *list) { struct page *page, *next; list_for_each_entry_safe(page, next, list, lru) free_vmemmap_page(page); } static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk) { struct page *page = pte_page(ptep_get(pte)); pte_t entry; /* Remapping the head page requires r/w */ if (unlikely(walk->nr_walked == 0 && walk->vmemmap_head)) { list_del(&walk->vmemmap_head->lru); /* * Makes sure that preceding stores to the page contents from * vmemmap_remap_free() become visible before the set_pte_at() * write. */ smp_wmb(); entry = mk_pte(walk->vmemmap_head, PAGE_KERNEL); } else { /* * Remap the tail pages as read-only to catch illegal write * operation to the tail pages. */ entry = mk_pte(walk->vmemmap_tail, PAGE_KERNEL_RO); } list_add(&page->lru, walk->vmemmap_pages); set_pte_at(&init_mm, addr, pte, entry); } static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk) { struct page *page; struct page *from, *to; page = list_first_entry(walk->vmemmap_pages, struct page, lru); list_del(&page->lru); /* * Initialize tail pages in the newly allocated vmemmap page. * * There is folio-scope metadata that is encoded in the first few * tail pages. * * Use the value last tail page in the page with the head page * to initialize the rest of tail pages. */ from = compound_head((struct page *)addr) + PAGE_SIZE / sizeof(struct page) - 1; to = page_to_virt(page); for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++, to++) *to = *from; /* * Makes sure that preceding stores to the page contents become visible * before the set_pte_at() write. */ smp_wmb(); set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL)); } /** * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end) * backing PMDs of the directmap into PTEs * @start: start address of the vmemmap virtual address range that we want * to remap. 
* @end: end address of the vmemmap virtual address range that we want to * remap. * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_split(unsigned long start, unsigned long end) { struct vmemmap_remap_walk walk = { .remap_pte = NULL, .flags = VMEMMAP_SPLIT_NO_TLB_FLUSH, }; return vmemmap_remap_range(start, end, &walk); } /** * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end) * to use @vmemmap_head/tail, then free vmemmap which * the range are mapped to. * @start: start address of the vmemmap virtual address range that we want * to remap. * @end: end address of the vmemmap virtual address range that we want to * remap. * @vmemmap_head: the page to be installed as first in the vmemmap range * @vmemmap_tail: the page to be installed as non-first in the vmemmap range * @vmemmap_pages: list to deposit vmemmap pages to be freed. It is callers * responsibility to free pages. * @flags: modifications to vmemmap_remap_walk flags * * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_free(unsigned long start, unsigned long end, struct page *vmemmap_head, struct page *vmemmap_tail, struct list_head *vmemmap_pages, unsigned long flags) { int ret; struct vmemmap_remap_walk walk = { .remap_pte = vmemmap_remap_pte, .vmemmap_head = vmemmap_head, .vmemmap_tail = vmemmap_tail, .vmemmap_pages = vmemmap_pages, .flags = flags, }; ret = vmemmap_remap_range(start, end, &walk); if (!ret || !walk.nr_walked) return ret; end = start + walk.nr_walked * PAGE_SIZE; /* * vmemmap_pages contains pages from the previous vmemmap_remap_range() * call which failed. These are pages which were removed from * the vmemmap. They will be restored in the following call. */ walk = (struct vmemmap_remap_walk) { .remap_pte = vmemmap_restore_pte, .vmemmap_pages = vmemmap_pages, .flags = 0, }; vmemmap_remap_range(start, end, &walk); return ret; } static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, struct list_head *list) { gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL; unsigned long nr_pages = (end - start) >> PAGE_SHIFT; int nid = page_to_nid((struct page *)start); struct page *page, *next; int i; for (i = 0; i < nr_pages; i++) { page = alloc_pages_node(nid, gfp_mask, 0); if (!page) goto out; list_add(&page->lru, list); } memmap_pages_add(nr_pages); return 0; out: list_for_each_entry_safe(page, next, list, lru) __free_page(page); return -ENOMEM; } /** * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end) * to the page which is from the @vmemmap_pages * respectively. * @start: start address of the vmemmap virtual address range that we want * to remap. * @end: end address of the vmemmap virtual address range that we want to * remap. * @flags: modifications to vmemmap_remap_walk flags * * Return: %0 on success, negative error code otherwise. 
*/ static int vmemmap_remap_alloc(unsigned long start, unsigned long end, unsigned long flags) { LIST_HEAD(vmemmap_pages); struct vmemmap_remap_walk walk = { .remap_pte = vmemmap_restore_pte, .vmemmap_pages = &vmemmap_pages, .flags = flags, }; if (alloc_vmemmap_page_list(start, end, &vmemmap_pages)) return -ENOMEM; return vmemmap_remap_range(start, end, &walk); } static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); static int __init hugetlb_vmemmap_optimize_param(char *buf) { return kstrtobool(buf, &vmemmap_optimize_enabled); } early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_optimize_param); static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio, unsigned long flags) { int ret; unsigned long vmemmap_start, vmemmap_end; VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio); if (!folio_test_hugetlb_vmemmap_optimized(folio)) return 0; vmemmap_start = (unsigned long)&folio->page; vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* * The pages which the vmemmap virtual address range [@vmemmap_start, * @vmemmap_end) are mapped to are freed to the buddy allocator. * When a HugeTLB page is freed to the buddy allocator, previously * discarded vmemmap pages must be allocated and remapping. */ ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, flags); if (!ret) folio_clear_hugetlb_vmemmap_optimized(folio); return ret; } /** * hugetlb_vmemmap_restore_folio - restore previously optimized (by * hugetlb_vmemmap_optimize_folio()) vmemmap pages which * will be reallocated and remapped. * @h: struct hstate. * @folio: the folio whose vmemmap pages will be restored. * * Return: %0 if @folio's vmemmap pages have been reallocated and remapped, * negative error code otherwise. */ int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio) { return __hugetlb_vmemmap_restore_folio(h, folio, 0); } /** * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list. * @h: hstate. * @folio_list: list of folios. * @non_hvo_folios: Output list of folios for which vmemmap exists. * * Return: number of folios for which vmemmap was restored, or an error code * if an error was encountered restoring vmemmap for a folio. * Folios that have vmemmap are moved to the non_hvo_folios * list. Processing of entries stops when the first error is * encountered. The folio that experienced the error and all * non-processed folios will remain on folio_list. */ long hugetlb_vmemmap_restore_folios(const struct hstate *h, struct list_head *folio_list, struct list_head *non_hvo_folios) { struct folio *folio, *t_folio; long restored = 0; long ret = 0; unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH; list_for_each_entry_safe(folio, t_folio, folio_list, lru) { if (folio_test_hugetlb_vmemmap_optimized(folio)) { ret = __hugetlb_vmemmap_restore_folio(h, folio, flags); if (ret) break; restored++; } /* Add non-optimized folios to output list */ list_move(&folio->lru, non_hvo_folios); } if (restored) flush_tlb_all(); if (!ret) ret = restored; return ret; } /* Return true iff a HugeTLB whose vmemmap should and can be optimized. 
*/ static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio) { if (folio_test_hugetlb_vmemmap_optimized(folio)) return false; if (!READ_ONCE(vmemmap_optimize_enabled)) return false; if (!hugetlb_vmemmap_optimizable(h)) return false; return true; } static struct page *vmemmap_get_tail(unsigned int order, struct zone *zone) { const unsigned int idx = order - VMEMMAP_TAIL_MIN_ORDER; struct page *tail, *p; int node = zone_to_nid(zone); tail = READ_ONCE(zone->vmemmap_tails[idx]); if (likely(tail)) return tail; tail = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!tail) return NULL; p = page_to_virt(tail); for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++) init_compound_tail(p + i, NULL, order, zone); if (cmpxchg(&zone->vmemmap_tails[idx], NULL, tail)) { __free_page(tail); tail = READ_ONCE(zone->vmemmap_tails[idx]); } return tail; } static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio, struct list_head *vmemmap_pages, unsigned long flags) { unsigned long vmemmap_start, vmemmap_end; struct page *vmemmap_head, *vmemmap_tail; int nid, ret = 0; VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio); if (!vmemmap_should_optimize_folio(h, folio)) return ret; nid = folio_nid(folio); vmemmap_tail = vmemmap_get_tail(h->order, folio_zone(folio)); if (!vmemmap_tail) return -ENOMEM; /* * Very Subtle * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed * immediately after remapping. As a result, subsequent accesses * and modifications to struct pages associated with the hugetlb * page could be to the OLD struct pages. Set the vmemmap optimized * flag here so that it is copied to the new head page. This keeps * the old and new struct pages in sync. * If there is an error during optimization, we will immediately FLUSH * the TLB and clear the flag below. */ folio_set_hugetlb_vmemmap_optimized(folio); vmemmap_head = alloc_pages_node(nid, GFP_KERNEL, 0); if (!vmemmap_head) { ret = -ENOMEM; goto out; } copy_page(page_to_virt(vmemmap_head), folio); list_add(&vmemmap_head->lru, vmemmap_pages); memmap_pages_add(1); vmemmap_start = (unsigned long)&folio->page; vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); /* * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end). * Add pages previously mapping the range to vmemmap_pages list so that * they can be freed by the caller. */ ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_head, vmemmap_tail, vmemmap_pages, flags); out: if (ret) folio_clear_hugetlb_vmemmap_optimized(folio); return ret; } /** * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages. * @h: struct hstate. * @folio: the folio whose vmemmap pages will be optimized. * * This function only tries to optimize @folio's vmemmap pages and does not * guarantee that the optimization will succeed after it returns. The caller * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's * vmemmap pages have been optimized. 
*/ void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio) { LIST_HEAD(vmemmap_pages); __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0); free_vmemmap_page_list(&vmemmap_pages); } static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio) { unsigned long vmemmap_start, vmemmap_end; if (!vmemmap_should_optimize_folio(h, folio)) return 0; vmemmap_start = (unsigned long)&folio->page; vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); /* * Split PMDs on the vmemmap virtual address range [@vmemmap_start, * @vmemmap_end] */ return vmemmap_remap_split(vmemmap_start, vmemmap_end); } static void __hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list, bool boot) { struct folio *folio; int nr_to_optimize; LIST_HEAD(vmemmap_pages); unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH; nr_to_optimize = 0; list_for_each_entry(folio, folio_list, lru) { int ret; unsigned long spfn, epfn; if (boot && folio_test_hugetlb_vmemmap_optimized(folio)) { /* * Already optimized by pre-HVO, just map the * mirrored tail page structs RO. */ spfn = (unsigned long)&folio->page; epfn = spfn + pages_per_huge_page(h); vmemmap_wrprotect_hvo(spfn, epfn, folio_nid(folio), HUGETLB_VMEMMAP_RESERVE_SIZE); register_page_bootmem_memmap(pfn_to_section_nr(spfn), &folio->page, HUGETLB_VMEMMAP_RESERVE_SIZE); continue; } nr_to_optimize++; ret = hugetlb_vmemmap_split_folio(h, folio); /* * Splitting the PMD requires allocating a page, thus let's fail * early once we encounter the first OOM. No point in retrying * as it can be dynamically done on remap with the memory * we get back from the vmemmap deduplication. */ if (ret == -ENOMEM) break; } if (!nr_to_optimize) /* * All pre-HVO folios, nothing left to do. It's ok if * there is a mix of pre-HVO and not yet HVO-ed folios * here, as __hugetlb_vmemmap_optimize_folio() will * skip any folios that already have the optimized flag * set, see vmemmap_should_optimize_folio(). */ goto out; flush_tlb_all(); list_for_each_entry(folio, folio_list, lru) { int ret; ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags); /* * Pages to be freed may have been accumulated. If we * encounter an ENOMEM, free what we have and try again. * This can occur in the case that both splitting fails * halfway and head page allocation also failed. In this * case __hugetlb_vmemmap_optimize_folio() would free memory * allowing more vmemmap remaps to occur. 
*/ if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) { flush_tlb_all(); free_vmemmap_page_list(&vmemmap_pages); INIT_LIST_HEAD(&vmemmap_pages); __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags); } } out: flush_tlb_all(); free_vmemmap_page_list(&vmemmap_pages); } void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list) { __hugetlb_vmemmap_optimize_folios(h, folio_list, false); } void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list) { __hugetlb_vmemmap_optimize_folios(h, folio_list, true); } #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT /* Return true of a bootmem allocated HugeTLB page should be pre-HVO-ed */ static bool vmemmap_should_optimize_bootmem_page(struct huge_bootmem_page *m) { unsigned long section_size, psize, pmd_vmemmap_size; phys_addr_t paddr; if (!READ_ONCE(vmemmap_optimize_enabled)) return false; if (!hugetlb_vmemmap_optimizable(m->hstate)) return false; psize = huge_page_size(m->hstate); paddr = virt_to_phys(m); /* * Pre-HVO only works if the bootmem huge page * is aligned to the section size. */ section_size = (1UL << PA_SECTION_SHIFT); if (!IS_ALIGNED(paddr, section_size) || !IS_ALIGNED(psize, section_size)) return false; /* * The pre-HVO code does not deal with splitting PMDS, * so the bootmem page must be aligned to the number * of base pages that can be mapped with one vmemmap PMD. */ pmd_vmemmap_size = (PMD_SIZE / (sizeof(struct page))) << PAGE_SHIFT; if (!IS_ALIGNED(paddr, pmd_vmemmap_size) || !IS_ALIGNED(psize, pmd_vmemmap_size)) return false; return true; } /* * Initialize memmap section for a gigantic page, HVO-style. */ void __init hugetlb_vmemmap_init_early(int nid) { unsigned long psize, paddr, section_size; unsigned long ns, i, pnum, pfn, nr_pages; struct huge_bootmem_page *m = NULL; void *map; if (!READ_ONCE(vmemmap_optimize_enabled)) return; section_size = (1UL << PA_SECTION_SHIFT); list_for_each_entry(m, &huge_boot_pages[nid], list) { if (!vmemmap_should_optimize_bootmem_page(m)) continue; nr_pages = pages_per_huge_page(m->hstate); psize = nr_pages << PAGE_SHIFT; paddr = virt_to_phys(m); pfn = PHYS_PFN(paddr); map = pfn_to_page(pfn); pnum = pfn_to_section_nr(pfn); ns = psize / section_size; for (i = 0; i < ns; i++) { sparse_init_early_section(nid, map, pnum, SECTION_IS_VMEMMAP_PREINIT); map += section_map_size(); pnum++; } m->flags |= HUGE_BOOTMEM_HVO; } } static struct zone *pfn_to_zone(unsigned nid, unsigned long pfn) { struct zone *zone; enum zone_type zone_type; for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { zone = &NODE_DATA(nid)->node_zones[zone_type]; if (zone_spans_pfn(zone, pfn)) return zone; } return NULL; } void __init hugetlb_vmemmap_init_late(int nid) { struct huge_bootmem_page *m, *tm; unsigned long phys, nr_pages, start, end; unsigned long pfn, nr_mmap; struct zone *zone = NULL; struct hstate *h; void *map; if (!READ_ONCE(vmemmap_optimize_enabled)) return; list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) { if (!(m->flags & HUGE_BOOTMEM_HVO)) continue; phys = virt_to_phys(m); h = m->hstate; pfn = PHYS_PFN(phys); nr_pages = pages_per_huge_page(h); map = pfn_to_page(pfn); start = (unsigned long)map; end = start + nr_pages * sizeof(struct page); if (!hugetlb_bootmem_page_zones_valid(nid, m)) { /* * Oops, the hugetlb page spans multiple zones. * Remove it from the list, and populate it normally. 
*/ list_del(&m->list); vmemmap_populate(start, end, nid, NULL); nr_mmap = end - start; memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE)); memblock_phys_free(phys, huge_page_size(h)); continue; } if (!zone || !zone_spans_pfn(zone, pfn)) zone = pfn_to_zone(nid, pfn); if (WARN_ON_ONCE(!zone)) continue; if (vmemmap_populate_hvo(start, end, huge_page_order(h), zone, HUGETLB_VMEMMAP_RESERVE_SIZE) < 0) { /* Fallback if HVO population fails */ vmemmap_populate(start, end, nid, NULL); nr_mmap = end - start; } else { m->flags |= HUGE_BOOTMEM_ZONES_VALID; nr_mmap = HUGETLB_VMEMMAP_RESERVE_SIZE; } memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE)); } } #endif static const struct ctl_table hugetlb_vmemmap_sysctls[] = { { .procname = "hugetlb_optimize_vmemmap", .data = &vmemmap_optimize_enabled, .maxlen = sizeof(vmemmap_optimize_enabled), .mode = 0644, .proc_handler = proc_dobool, }, }; static int __init hugetlb_vmemmap_init(void) { const struct hstate *h; struct zone *zone; /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */ BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES); for_each_zone(zone) { for (int i = 0; i < NR_VMEMMAP_TAILS; i++) { struct page *tail, *p; unsigned int order; tail = zone->vmemmap_tails[i]; if (!tail) continue; order = i + VMEMMAP_TAIL_MIN_ORDER; p = page_to_virt(tail); for (int j = 0; j < PAGE_SIZE / sizeof(struct page); j++) init_compound_tail(p + j, NULL, order, zone); } } for_each_hstate(h) { if (hugetlb_vmemmap_optimizable(h)) { register_sysctl_init("vm", hugetlb_vmemmap_sysctls); break; } } return 0; } late_initcall(hugetlb_vmemmap_init); |
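/*
 * Illustrative sketch, not part of the file above: how the HugeTLB core is
 * expected to pair the two per-folio entry points.  The wrapper and the error
 * message are assumptions for the example; it assumes the hugetlb context
 * where hugetlb_vmemmap_optimize_folio()/hugetlb_vmemmap_restore_folio() are
 * declared (hugetlb_vmemmap.h).
 */
static void example_hvo_lifecycle(const struct hstate *h, struct folio *folio)
{
	/* After the folio enters the HugeTLB pool: free most of its vmemmap. */
	hugetlb_vmemmap_optimize_folio(h, folio);

	/*
	 * Before dissolving the folio back to the buddy allocator, the
	 * discarded vmemmap pages must be reallocated and remapped.
	 */
	if (hugetlb_vmemmap_restore_folio(h, folio))
		pr_warn("example: vmemmap restore failed, cannot free folio\n");
}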
| 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MMAN_H #define _LINUX_MMAN_H #include <linux/fs.h> #include <linux/mm.h> #include <linux/percpu_counter.h> #include <linux/atomic.h> #include <uapi/linux/mman.h> /* * Arrange for legacy / undefined architecture specific flags to be * ignored by mmap handling code. */ #ifndef MAP_32BIT #define MAP_32BIT 0 #endif #ifndef MAP_ABOVE4G #define MAP_ABOVE4G 0 #endif #ifndef MAP_HUGE_2MB #define MAP_HUGE_2MB 0 #endif #ifndef MAP_HUGE_1GB #define MAP_HUGE_1GB 0 #endif #ifndef MAP_UNINITIALIZED #define MAP_UNINITIALIZED 0 #endif #ifndef MAP_SYNC #define MAP_SYNC 0 #endif /* * The historical set of flags that all mmap implementations implicitly * support when a ->mmap_validate() op is not provided in file_operations. * * MAP_EXECUTABLE and MAP_DENYWRITE are completely ignored throughout the * kernel. */ #define LEGACY_MAP_MASK (MAP_SHARED \ | MAP_PRIVATE \ | MAP_FIXED \ | MAP_ANONYMOUS \ | MAP_DENYWRITE \ | MAP_EXECUTABLE \ | MAP_UNINITIALIZED \ | MAP_GROWSDOWN \ | MAP_LOCKED \ | MAP_NORESERVE \ | MAP_POPULATE \ | MAP_NONBLOCK \ | MAP_STACK \ | MAP_HUGETLB \ | MAP_32BIT \ | MAP_ABOVE4G \ | MAP_HUGE_2MB \ | MAP_HUGE_1GB) extern int sysctl_overcommit_memory; extern struct percpu_counter vm_committed_as; #ifdef CONFIG_SMP extern s32 vm_committed_as_batch; extern void mm_compute_batch(int overcommit_policy); #else #define vm_committed_as_batch 0 static inline void mm_compute_batch(int overcommit_policy) { } #endif unsigned long vm_memory_committed(void); static inline void vm_acct_memory(long pages) { percpu_counter_add_batch(&vm_committed_as, pages, vm_committed_as_batch); } static inline void vm_unacct_memory(long pages) { vm_acct_memory(-pages); } /* * Allow architectures to handle additional protection and flag bits. The * overriding macros must be defined in the arch-specific asm/mman.h file. */ #ifndef arch_calc_vm_prot_bits #define arch_calc_vm_prot_bits(prot, pkey) 0 #endif #ifndef arch_calc_vm_flag_bits #define arch_calc_vm_flag_bits(file, flags) 0 #endif #ifndef arch_validate_prot /* * This is called from mprotect(). PROT_GROWSDOWN and PROT_GROWSUP have * already been masked out. * * Returns true if the prot flags are valid */ static inline bool arch_validate_prot(unsigned long prot, unsigned long addr) { return (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM)) == 0; } #define arch_validate_prot arch_validate_prot #endif #ifndef arch_validate_flags /* * This is called from mmap() and mprotect() with the updated vma->vm_flags. * * Returns true if the VM_* flags are valid. */ static inline bool arch_validate_flags(unsigned long flags) { return true; } #define arch_validate_flags arch_validate_flags #endif /* * Optimisation macro. It is equivalent to: * (x & bit1) ? bit2 : 0 * but this version is faster. * ("bit1" and "bit2" must be single bits) */ #define _calc_vm_trans(x, bit1, bit2) \ ((!(bit1) || !(bit2)) ? 0 : \ ((bit1) <= (bit2) ? 
((x) & (bit1)) * ((bit2) / (bit1)) \ : ((x) & (bit1)) / ((bit1) / (bit2)))) /* * Combine the mmap "prot" argument into "vm_flags" used internally. */ static inline vm_flags_t calc_vm_prot_bits(unsigned long prot, unsigned long pkey) { return _calc_vm_trans(prot, PROT_READ, VM_READ ) | _calc_vm_trans(prot, PROT_WRITE, VM_WRITE) | _calc_vm_trans(prot, PROT_EXEC, VM_EXEC) | arch_calc_vm_prot_bits(prot, pkey); } /* * Combine the mmap "flags" argument into "vm_flags" used internally. */ static inline vm_flags_t calc_vm_flag_bits(struct file *file, unsigned long flags) { return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) | _calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) | #ifdef CONFIG_TRANSPARENT_HUGEPAGE _calc_vm_trans(flags, MAP_STACK, VM_NOHUGEPAGE) | #endif arch_calc_vm_flag_bits(file, flags); } unsigned long vm_commit_limit(void); #ifndef arch_memory_deny_write_exec_supported static inline bool arch_memory_deny_write_exec_supported(void) { return true; } #define arch_memory_deny_write_exec_supported arch_memory_deny_write_exec_supported #endif #endif /* _LINUX_MMAN_H */ |
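/*
 * Illustrative sketch, not part of the header above: how an mmap() path can
 * fold the user-supplied prot/flags into vm_flags.  The helper name and the
 * VM_MAY* bits OR-ed in are assumptions for the example; calc_vm_prot_bits()
 * and calc_vm_flag_bits() are the helpers defined above.
 */
static inline vm_flags_t example_mmap_vm_flags(struct file *file,
					       unsigned long prot,
					       unsigned long flags)
{
	/* pkey 0 for the common, non-protection-key case. */
	return calc_vm_prot_bits(prot, 0) |
	       calc_vm_flag_bits(file, flags) |
	       VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
}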
| 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_IP_TUNNELS_H #define __NET_IP_TUNNELS_H 1 #include <linux/if_tunnel.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/types.h> #include <linux/u64_stats_sync.h> #include <linux/bitops.h> #include <net/dsfield.h> #include <net/flow.h> #include <net/gro_cells.h> #include <net/inet_dscp.h> #include <net/inet_ecn.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/lwtunnel.h> #include <net/dst_cache.h> #include <net/netdev_lock.h> #if 
IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #endif /* Recursion limit for tunnel xmit to detect routing loops. * Unlike XMIT_RECURSION_LIMIT (8) used in the no-qdisc path, tunnel * recursion involves route lookups and full IP output, consuming much * more stack per level, so a lower limit is needed. */ #define IP_TUNNEL_RECURSION_LIMIT 5 /* Keep error state on tunnel for 30 sec */ #define IPTUNNEL_ERR_TIMEO (30*HZ) /* Used to memset ip_tunnel padding. */ #define IP_TUNNEL_KEY_SIZE offsetofend(struct ip_tunnel_key, tp_dst) /* Used to memset ipv4 address padding. */ #define IP_TUNNEL_KEY_IPV4_PAD offsetofend(struct ip_tunnel_key, u.ipv4.dst) #define IP_TUNNEL_KEY_IPV4_PAD_LEN \ (sizeof_field(struct ip_tunnel_key, u) - \ sizeof_field(struct ip_tunnel_key, u.ipv4)) #define __ipt_flag_op(op, ...) \ op(__VA_ARGS__, __IP_TUNNEL_FLAG_NUM) #define IP_TUNNEL_DECLARE_FLAGS(...) \ __ipt_flag_op(DECLARE_BITMAP, __VA_ARGS__) #define ip_tunnel_flags_zero(...) __ipt_flag_op(bitmap_zero, __VA_ARGS__) #define ip_tunnel_flags_copy(...) __ipt_flag_op(bitmap_copy, __VA_ARGS__) #define ip_tunnel_flags_and(...) __ipt_flag_op(bitmap_and, __VA_ARGS__) #define ip_tunnel_flags_or(...) __ipt_flag_op(bitmap_or, __VA_ARGS__) #define ip_tunnel_flags_empty(...) \ __ipt_flag_op(bitmap_empty, __VA_ARGS__) #define ip_tunnel_flags_intersect(...) \ __ipt_flag_op(bitmap_intersects, __VA_ARGS__) #define ip_tunnel_flags_subset(...) \ __ipt_flag_op(bitmap_subset, __VA_ARGS__) struct ip_tunnel_key { __be64 tun_id; union { struct { __be32 src; __be32 dst; } ipv4; struct { struct in6_addr src; struct in6_addr dst; } ipv6; } u; IP_TUNNEL_DECLARE_FLAGS(tun_flags); __be32 label; /* Flow Label for IPv6 */ u32 nhid; u8 tos; /* TOS for IPv4, TC for IPv6 */ u8 ttl; /* TTL for IPv4, HL for IPv6 */ __be16 tp_src; __be16 tp_dst; __u8 flow_flags; }; struct ip_tunnel_encap { u16 type; u16 flags; __be16 sport; __be16 dport; }; /* Flags for ip_tunnel_info mode. */ #define IP_TUNNEL_INFO_TX 0x01 /* represents tx tunnel parameters */ #define IP_TUNNEL_INFO_IPV6 0x02 /* key contains IPv6 addresses */ #define IP_TUNNEL_INFO_BRIDGE 0x04 /* represents a bridged tunnel id */ /* Maximum tunnel options length. 
*/ #define IP_TUNNEL_OPTS_MAX \ GENMASK((sizeof_field(struct ip_tunnel_info, \ options_len) * BITS_PER_BYTE) - 1, 0) #define ip_tunnel_info_opts(info) \ _Generic(info, \ const struct ip_tunnel_info * : ((const void *)(info)->options),\ struct ip_tunnel_info * : ((void *)(info)->options)\ ) struct ip_tunnel_info { struct ip_tunnel_key key; struct ip_tunnel_encap encap; #ifdef CONFIG_DST_CACHE struct dst_cache dst_cache; #endif u8 options_len; u8 mode; u8 options[] __aligned_largest __counted_by(options_len); }; /* 6rd prefix/relay information */ #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd_parm { struct in6_addr prefix; __be32 relay_prefix; u16 prefixlen; u16 relay_prefixlen; }; #endif struct ip_tunnel_prl_entry { struct ip_tunnel_prl_entry __rcu *next; __be32 addr; u16 flags; struct rcu_head rcu_head; }; struct metadata_dst; /* Kernel-side variant of ip_tunnel_parm */ struct ip_tunnel_parm_kern { char name[IFNAMSIZ]; IP_TUNNEL_DECLARE_FLAGS(i_flags); IP_TUNNEL_DECLARE_FLAGS(o_flags); __be32 i_key; __be32 o_key; int link; struct iphdr iph; }; struct ip_tunnel { struct ip_tunnel __rcu *next; struct hlist_node hash_node; struct net_device *dev; netdevice_tracker dev_tracker; struct net *net; /* netns for packet i/o */ unsigned long err_time; /* Time when the last ICMP error * arrived */ int err_count; /* Number of arrived ICMP errors */ /* These four fields used only by GRE */ u32 i_seqno; /* The last seen seqno */ atomic_t o_seqno; /* The last output seqno */ int tun_hlen; /* Precalculated header length */ /* These four fields used only by ERSPAN */ u32 index; /* ERSPAN type II index */ u8 erspan_ver; /* ERSPAN version */ u8 dir; /* ERSPAN direction */ u16 hwid; /* ERSPAN hardware ID */ struct dst_cache dst_cache; struct ip_tunnel_parm_kern parms; int mlink; int encap_hlen; /* Encap header length (FOU,GUE) */ int hlen; /* tun_hlen + encap_hlen */ struct ip_tunnel_encap encap; /* for SIT */ #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd_parm ip6rd; #endif struct ip_tunnel_prl_entry __rcu *prl; /* potential router list */ unsigned int prl_count; /* # of entries in PRL */ unsigned int ip_tnl_net_id; struct gro_cells gro_cells; __u32 fwmark; bool collect_md; bool ignore_df; }; struct tnl_ptk_info { IP_TUNNEL_DECLARE_FLAGS(flags); __be16 proto; __be32 key; __be32 seq; int hdr_len; }; #define PACKET_RCVD 0 #define PACKET_REJECT 1 #define PACKET_NEXT 2 #define IP_TNL_HASH_BITS 7 #define IP_TNL_HASH_SIZE (1 << IP_TNL_HASH_BITS) struct ip_tunnel_net { struct net_device *fb_tunnel_dev; struct rtnl_link_ops *rtnl_link_ops; struct hlist_head tunnels[IP_TNL_HASH_SIZE]; struct ip_tunnel __rcu *collect_md_tun; int type; }; static inline void ip_tunnel_set_options_present(unsigned long *flags) { IP_TUNNEL_DECLARE_FLAGS(present) = { }; __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present); __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present); __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present); __set_bit(IP_TUNNEL_GTP_OPT_BIT, present); __set_bit(IP_TUNNEL_PFCP_OPT_BIT, present); ip_tunnel_flags_or(flags, flags, present); } static inline void ip_tunnel_clear_options_present(unsigned long *flags) { IP_TUNNEL_DECLARE_FLAGS(present) = { }; __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present); __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present); __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present); __set_bit(IP_TUNNEL_GTP_OPT_BIT, present); __set_bit(IP_TUNNEL_PFCP_OPT_BIT, present); __ipt_flag_op(bitmap_andnot, flags, flags, present); } static inline bool ip_tunnel_is_options_present(const unsigned long *flags) { IP_TUNNEL_DECLARE_FLAGS(present) = { }; 
__set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present); __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present); __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present); __set_bit(IP_TUNNEL_GTP_OPT_BIT, present); __set_bit(IP_TUNNEL_PFCP_OPT_BIT, present); return ip_tunnel_flags_intersect(flags, present); } static inline bool ip_tunnel_flags_is_be16_compat(const unsigned long *flags) { IP_TUNNEL_DECLARE_FLAGS(supp) = { }; bitmap_set(supp, 0, BITS_PER_TYPE(__be16)); __set_bit(IP_TUNNEL_VTI_BIT, supp); return ip_tunnel_flags_subset(flags, supp); } static inline void ip_tunnel_flags_from_be16(unsigned long *dst, __be16 flags) { ip_tunnel_flags_zero(dst); bitmap_write(dst, be16_to_cpu(flags), 0, BITS_PER_TYPE(__be16)); __assign_bit(IP_TUNNEL_VTI_BIT, dst, flags & VTI_ISVTI); } static inline __be16 ip_tunnel_flags_to_be16(const unsigned long *flags) { __be16 ret; ret = cpu_to_be16(bitmap_read(flags, 0, BITS_PER_TYPE(__be16))); if (test_bit(IP_TUNNEL_VTI_BIT, flags)) ret |= VTI_ISVTI; return ret; } static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, __be32 saddr, __be32 daddr, u8 tos, u8 ttl, __be32 label, __be16 tp_src, __be16 tp_dst, __be64 tun_id, const unsigned long *tun_flags) { key->tun_id = tun_id; key->u.ipv4.src = saddr; key->u.ipv4.dst = daddr; memset((unsigned char *)key + IP_TUNNEL_KEY_IPV4_PAD, 0, IP_TUNNEL_KEY_IPV4_PAD_LEN); key->tos = tos; key->ttl = ttl; key->label = label; ip_tunnel_flags_copy(key->tun_flags, tun_flags); /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of * the upper tunnel are used. * E.g: GRE over IPSEC, the tp_src and tp_port are zero. */ key->tp_src = tp_src; key->tp_dst = tp_dst; /* Clear struct padding. */ if (sizeof(*key) != IP_TUNNEL_KEY_SIZE) memset((unsigned char *)key + IP_TUNNEL_KEY_SIZE, 0, sizeof(*key) - IP_TUNNEL_KEY_SIZE); } static inline bool ip_tunnel_dst_cache_usable(const struct sk_buff *skb, const struct ip_tunnel_info *info) { if (skb->mark) return false; return !info || !test_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags); } static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info *tun_info) { return tun_info->mode & IP_TUNNEL_INFO_IPV6 ? AF_INET6 : AF_INET; } static inline __be64 key32_to_tunnel_id(__be32 key) { #ifdef __BIG_ENDIAN return (__force __be64)key; #else return (__force __be64)((__force u64)key << 32); #endif } /* Returns the least-significant 32 bits of a __be64. */ static inline __be32 tunnel_id_to_key32(__be64 tun_id) { #ifdef __BIG_ENDIAN return (__force __be32)tun_id; #else return (__force __be32)((__force u64)tun_id >> 32); #endif } #ifdef CONFIG_INET static inline void ip_tunnel_init_flow(struct flowi4 *fl4, int proto, __be32 daddr, __be32 saddr, __be32 key, __u8 tos, struct net *net, int oif, __u32 mark, __u32 tun_inner_hash, __u8 flow_flags) { memset(fl4, 0, sizeof(*fl4)); if (oif) { fl4->flowi4_l3mdev = l3mdev_master_upper_ifindex_by_index(net, oif); /* Legacy VRF/l3mdev use case */ fl4->flowi4_oif = fl4->flowi4_l3mdev ? 
0 : oif; } fl4->daddr = daddr; fl4->saddr = saddr; fl4->flowi4_dscp = inet_dsfield_to_dscp(tos); fl4->flowi4_proto = proto; fl4->fl4_gre_key = key; fl4->flowi4_mark = mark; fl4->flowi4_multipath_hash = tun_inner_hash; fl4->flowi4_flags = flow_flags; } int __ip_tunnel_init(struct net_device *dev); #define ip_tunnel_init(DEV) \ ({ \ struct net_device *__dev = (DEV); \ int __res = __ip_tunnel_init(__dev); \ \ if (!__res) \ netdev_lockdep_set_classes(__dev);\ __res; \ }) void ip_tunnel_uninit(struct net_device *dev); void ip_tunnel_dellink(struct net_device *dev, struct list_head *head); struct net *ip_tunnel_get_link_net(const struct net_device *dev); int ip_tunnel_get_iflink(const struct net_device *dev); int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname); void ip_tunnel_delete_net(struct net *net, unsigned int id, struct rtnl_link_ops *ops, struct list_head *dev_to_kill); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const u8 proto, int tunnel_hlen); int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd); bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp, const void __user *data); bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp); int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict); int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu); struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, int link, const unsigned long *flags, __be32 remote, __be32 local, __be32 key); void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info); int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_error); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm_kern *p, __u32 fwmark); int ip_tunnel_newlink(struct net *net, struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm_kern *p, __u32 fwmark); void ip_tunnel_setup(struct net_device *dev, unsigned int net_id); bool ip_tunnel_netlink_encap_parms(struct nlattr *data[], struct ip_tunnel_encap *encap); void ip_tunnel_netlink_parms(struct nlattr *data[], struct ip_tunnel_parm_kern *parms); extern const struct header_ops ip_tunnel_header_ops; __be16 ip_tunnel_parse_protocol(const struct sk_buff *skb); struct ip_tunnel_encap_ops { size_t (*encap_hlen)(struct ip_tunnel_encap *e); int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4); int (*err_handler)(struct sk_buff *skb, u32 info); }; #define MAX_IPTUN_ENCAP_OPS 8 extern const struct ip_tunnel_encap_ops __rcu * iptun_encaps[MAX_IPTUN_ENCAP_OPS]; int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *op, unsigned int num); int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op, unsigned int num); int ip_tunnel_encap_setup(struct ip_tunnel *t, struct ip_tunnel_encap *ipencap); static inline enum skb_drop_reason pskb_inet_may_pull_reason(struct sk_buff *skb) { int nhlen; switch (skb->protocol) { #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): nhlen = sizeof(struct ipv6hdr); break; #endif case htons(ETH_P_IP): nhlen = sizeof(struct iphdr); break; default: nhlen = 0; } return 
pskb_network_may_pull_reason(skb, nhlen); } static inline bool pskb_inet_may_pull(struct sk_buff *skb) { return pskb_inet_may_pull_reason(skb) == SKB_NOT_DROPPED_YET; } /* Variant of pskb_inet_may_pull(). */ static inline enum skb_drop_reason skb_vlan_inet_prepare(struct sk_buff *skb, bool inner_proto_inherit) { int nhlen = 0, maclen = inner_proto_inherit ? 0 : ETH_HLEN; __be16 type = skb->protocol; enum skb_drop_reason reason; /* Essentially this is skb_protocol(skb, true) * And we get MAC len. */ if (eth_type_vlan(type)) type = __vlan_get_protocol(skb, type, &maclen); switch (type) { #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): nhlen = sizeof(struct ipv6hdr); break; #endif case htons(ETH_P_IP): nhlen = sizeof(struct iphdr); break; } /* For ETH_P_IPV6/ETH_P_IP we make sure to pull * a base network header in skb->head. */ reason = pskb_may_pull_reason(skb, maclen + nhlen); if (reason) return reason; skb_set_network_header(skb, maclen); return SKB_NOT_DROPPED_YET; } static inline int ip_encap_hlen(struct ip_tunnel_encap *e) { const struct ip_tunnel_encap_ops *ops; int hlen = -EINVAL; if (e->type == TUNNEL_ENCAP_NONE) return 0; if (e->type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; rcu_read_lock(); ops = rcu_dereference(iptun_encaps[e->type]); if (likely(ops && ops->encap_hlen)) hlen = ops->encap_hlen(e); rcu_read_unlock(); return hlen; } static inline int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4) { const struct ip_tunnel_encap_ops *ops; int ret = -EINVAL; if (e->type == TUNNEL_ENCAP_NONE) return 0; if (e->type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; rcu_read_lock(); ops = rcu_dereference(iptun_encaps[e->type]); if (likely(ops && ops->build_header)) ret = ops->build_header(skb, e, protocol, fl4); rcu_read_unlock(); return ret; } /* Extract dsfield from inner protocol */ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph, const struct sk_buff *skb) { __be16 payload_protocol = skb_protocol(skb, true); if (payload_protocol == htons(ETH_P_IP)) return iph->tos; else if (payload_protocol == htons(ETH_P_IPV6)) return ipv6_get_dsfield((const struct ipv6hdr *)iph); else return 0; } static inline __be32 ip_tunnel_get_flowlabel(const struct iphdr *iph, const struct sk_buff *skb) { __be16 payload_protocol = skb_protocol(skb, true); if (payload_protocol == htons(ETH_P_IPV6)) return ip6_flowlabel((const struct ipv6hdr *)iph); else return 0; } static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph, const struct sk_buff *skb) { __be16 payload_protocol = skb_protocol(skb, true); if (payload_protocol == htons(ETH_P_IP)) return iph->ttl; else if (payload_protocol == htons(ETH_P_IPV6)) return ((const struct ipv6hdr *)iph)->hop_limit; else return 0; } /* Propagate ECN bits out */ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, const struct sk_buff *skb) { u8 inner = ip_tunnel_get_dsfield(iph, skb); return INET_ECN_encapsulate(tos, inner); } int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, bool raw_proto, bool xnet); static inline int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, bool xnet) { return __iptunnel_pull_header(skb, hdr_len, inner_proto, false, xnet); } void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, u8 proto, u8 tos, u8 ttl, __be16 df, bool xnet, u16 ipcb_flags); struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, gfp_t flags); int skb_tunnel_check_pmtu(struct sk_buff *skb, 
struct dst_entry *encap_dst, int headroom, bool reply); static inline void ip_tunnel_adj_headroom(struct net_device *dev, unsigned int headroom) { /* we must cap headroom to some upperlimit, else pskb_expand_head * will overflow header offsets in skb_headers_offset_update(). */ const unsigned int max_allowed = 512; if (headroom > max_allowed) headroom = max_allowed; if (headroom > READ_ONCE(dev->needed_headroom)) WRITE_ONCE(dev->needed_headroom, headroom); } int iptunnel_handle_offloads(struct sk_buff *skb, int gso_type_mask); static inline int iptunnel_pull_offloads(struct sk_buff *skb) { if (skb_is_gso(skb)) { int err; err = skb_unclone(skb, GFP_ATOMIC); if (unlikely(err)) return err; skb_shinfo(skb)->gso_type &= ~(NETIF_F_GSO_ENCAP_ALL >> NETIF_F_GSO_SHIFT); } skb->encapsulation = 0; return 0; } static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len) { if (pkt_len > 0) { if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS) { struct pcpu_dstats *dstats = get_cpu_ptr(dev->dstats); u64_stats_update_begin(&dstats->syncp); u64_stats_add(&dstats->tx_bytes, pkt_len); u64_stats_inc(&dstats->tx_packets); u64_stats_update_end(&dstats->syncp); put_cpu_ptr(dstats); return; } if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) { struct pcpu_sw_netstats *tstats = get_cpu_ptr(dev->tstats); u64_stats_update_begin(&tstats->syncp); u64_stats_add(&tstats->tx_bytes, pkt_len); u64_stats_inc(&tstats->tx_packets); u64_stats_update_end(&tstats->syncp); put_cpu_ptr(tstats); return; } pr_err_once("iptunnel_xmit_stats pcpu_stat_type=%d\n", dev->pcpu_stat_type); WARN_ON_ONCE(1); return; } if (pkt_len < 0) { DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_aborted_errors); } else { DEV_STATS_INC(dev, tx_dropped); } } static inline void ip_tunnel_info_opts_get(void *to, const struct ip_tunnel_info *info) { memcpy(to, ip_tunnel_info_opts(info), info->options_len); } static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, const void *from, int len, const unsigned long *flags) { info->options_len = len; if (len > 0) { memcpy(ip_tunnel_info_opts(info), from, len); ip_tunnel_flags_or(info->key.tun_flags, info->key.tun_flags, flags); } } static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate) { return (struct ip_tunnel_info *)lwtstate->data; } DECLARE_STATIC_KEY_FALSE(ip_tunnel_metadata_cnt); /* Returns > 0 if metadata should be collected */ static inline int ip_tunnel_collect_metadata(void) { return static_branch_unlikely(&ip_tunnel_metadata_cnt); } void __init ip_tunnel_core_init(void); void ip_tunnel_need_metadata(void); void ip_tunnel_unneed_metadata(void); #else /* CONFIG_INET */ static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate) { return NULL; } static inline void ip_tunnel_need_metadata(void) { } static inline void ip_tunnel_unneed_metadata(void) { } static inline void ip_tunnel_info_opts_get(void *to, const struct ip_tunnel_info *info) { } static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, const void *from, int len, const unsigned long *flags) { info->options_len = 0; } #endif /* CONFIG_INET */ #endif /* __NET_IP_TUNNELS_H */ |
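A hedged sketch of how a metadata-based tunnel driver might use the helpers above to fill a TX ip_tunnel_info; it is illustrative only. The option payload layout is hypothetical, IP_TUNNEL_KEY_BIT and IP_TUNNEL_GENEVE_OPT_BIT are assumed to come from the UAPI flag enumeration that __IP_TUNNEL_FLAG_NUM is sized against, and info is assumed to have been allocated with enough room behind options[] (for example via a metadata dst).

/*
 * Hedged kernel-style sketch, not from the source above. The option payload
 * struct is hypothetical; the flag bit names are assumed to come from the
 * UAPI tunnel flag enum.
 */
static void demo_fill_tx_info(struct ip_tunnel_info *info,
			      __be32 saddr, __be32 daddr, __be64 vni)
{
	struct {
		__be16 opt_class;
		u8 type;
		u8 length;
	} opt = { .opt_class = htons(0x0102), .type = 0x80 };
	IP_TUNNEL_DECLARE_FLAGS(flags) = { };
	IP_TUNNEL_DECLARE_FLAGS(opt_flags) = { };

	__set_bit(IP_TUNNEL_KEY_BIT, flags);

	/* tos = 0, ttl = 64, no flow label, no UDP ports yet */
	ip_tunnel_key_init(&info->key, saddr, daddr, 0, 64, 0, 0, 0, vni, flags);
	info->mode = IP_TUNNEL_INFO_TX;

	/* Attach a driver-specific option and record which kind it is. */
	__set_bit(IP_TUNNEL_GENEVE_OPT_BIT, opt_flags);
	ip_tunnel_info_opts_set(info, &opt, sizeof(opt), opt_flags);
}

Note that ip_tunnel_info_opts_set() both copies the payload and ORs the matching *_OPT_BIT into key.tun_flags, which is what ip_tunnel_is_options_present() later tests.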
// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2025 Oracle. All Rights Reserved. * Author: Darrick J. Wong <djwong@kernel.org> */ #include <linux/fs.h> #include <linux/fsnotify.h> #include <linux/mempool.h> #include <linux/fserror.h> #define FSERROR_DEFAULT_EVENT_POOL_SIZE (32) static struct mempool fserror_events_pool; void fserror_mount(struct super_block *sb) { /* * The pending error counter is biased by 1 so that we don't wake_var * until we're actually trying to unmount. */ refcount_set(&sb->s_pending_errors, 1); } void fserror_unmount(struct super_block *sb) { /* * If we don't drop the pending error count to zero, then wait for it * to drop below 1, which means that all pending errors have cleared and * hopefully we didn't saturate with 1 billion+ concurrent events. */ if (!refcount_dec_and_test(&sb->s_pending_errors)) wait_var_event(&sb->s_pending_errors, refcount_read(&sb->s_pending_errors) < 1); } static inline void fserror_pending_dec(struct super_block *sb) { if (refcount_dec_and_test(&sb->s_pending_errors)) wake_up_var(&sb->s_pending_errors); } static inline void fserror_free_event(struct fserror_event *event) { fserror_pending_dec(event->sb); mempool_free(event, &fserror_events_pool); } static void fserror_worker(struct work_struct *work) { struct fserror_event *event = container_of(work, struct fserror_event, work); struct super_block *sb = event->sb; if (sb->s_flags & SB_ACTIVE) { struct fs_error_report report = { /* send positive error number to userspace */ .error = -event->error, .inode = event->inode, .sb = event->sb, }; if (sb->s_op->report_error) sb->s_op->report_error(event); fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR, NULL, NULL, NULL, 0); } iput(event->inode); fserror_free_event(event); } static inline struct fserror_event *fserror_alloc_event(struct super_block *sb, gfp_t gfp_flags) { struct fserror_event *event = NULL; /* * If the pending error count has already dropped to zero or SB_ACTIVE * has been cleared, the superblock is being deactivated so there's no * point in continuing. * * The order of the checks of s_pending_errors and SB_ACTIVE is * mandated by the order of accesses in generic_shutdown_super and * fserror_unmount. Barriers are implicitly provided by the refcount * manipulations in this function and fserror_unmount.
*/ if (!refcount_inc_not_zero(&sb->s_pending_errors)) return NULL; if (!(sb->s_flags & SB_ACTIVE)) goto out_pending; event = mempool_alloc(&fserror_events_pool, gfp_flags); if (!event) goto out_pending; /* mempool_alloc doesn't support GFP_ZERO */ memset(event, 0, sizeof(*event)); event->sb = sb; INIT_WORK(&event->work, fserror_worker); return event; out_pending: fserror_pending_dec(sb); return NULL; } /** * fserror_report - report a filesystem error of some kind * * @sb: superblock of the filesystem * @inode: inode within that filesystem, if applicable * @type: type of error encountered * @pos: start of inode range affected, if applicable * @len: length of inode range affected, if applicable * @error: error number encountered, must be negative * @gfp: memory allocation flags for conveying the event to a worker, * since this function can be called from atomic contexts * * Report details of a filesystem error to the super_operations::report_error * callback if present; and to fsnotify for distribution to userspace. @sb, * @gfp, @type, and @error must all be specified. For file I/O errors, the * @inode, @pos, and @len fields must also be specified. For file metadata * errors, @inode must be specified. If @inode is not NULL, then @inode->i_sb * must point to @sb. * * Reporting work is deferred to a workqueue to ensure that ->report_error is * called from process context without any locks held. An active reference to * the inode is maintained until event handling is complete, and unmount will * wait for queued events to drain. */ void fserror_report(struct super_block *sb, struct inode *inode, enum fserror_type type, loff_t pos, u64 len, int error, gfp_t gfp) { struct fserror_event *event; /* sb and inode must be from the same filesystem */ WARN_ON_ONCE(inode && inode->i_sb != sb); /* error number must be negative */ WARN_ON_ONCE(error >= 0); event = fserror_alloc_event(sb, gfp); if (!event) goto lost; event->type = type; event->pos = pos; event->len = len; event->error = error; /* * Can't iput from non-sleeping context, so grabbing another reference * to the inode must be the last thing before submitting the event. */ if (inode) { event->inode = igrab(inode); if (!event->inode) goto lost_event; } /* * Use schedule_work here even if we're already in process context so * that fsnotify and super_operations::report_error implementations are * guaranteed to run in process context without any locks held. Since * errors are supposed to be rare, the overhead shouldn't kill us any * more than the failing device will. */ schedule_work(&event->work); return; lost_event: fserror_free_event(event); lost: if (inode) pr_err_ratelimited( "%s: lost file I/O error report for ino %llu type %u pos 0x%llx len 0x%llx error %d", sb->s_id, inode->i_ino, type, pos, len, error); else pr_err_ratelimited( "%s: lost filesystem error report for type %u error %d", sb->s_id, type, error); } EXPORT_SYMBOL_GPL(fserror_report); static int __init fserror_init(void) { return mempool_init_kmalloc_pool(&fserror_events_pool, FSERROR_DEFAULT_EVENT_POOL_SIZE, sizeof(struct fserror_event)); } fs_initcall(fserror_init); |
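A hedged sketch of a caller, not part of the source above: a filesystem reporting a failed writeback of part of a file from bio-completion context. FSERROR_WRITEBACK is a hypothetical enum fserror_type value; the real type names are defined in the corresponding header. Per the kerneldoc, @error must be negative, and GFP_ATOMIC is permitted because the report is only queued to a workqueue here.

/*
 * Hedged caller sketch. FSERROR_WRITEBACK is hypothetical; error must be a
 * negative errno, per the fserror_report() kerneldoc.
 */
static void demo_report_writeback_error(struct inode *inode, loff_t pos,
					u64 len, int error)
{
	WARN_ON_ONCE(error >= 0);
	fserror_report(inode->i_sb, inode, FSERROR_WRITEBACK, pos, len,
		       error, GFP_ATOMIC);
}

Because the heavy lifting happens in fserror_worker(), the only cost in the I/O path is a mempool allocation and a schedule_work() call; if the allocation fails, the report is dropped and noted with a rate-limited log message.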
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_TLB_H #define _ASM_X86_TLB_H #define tlb_flush tlb_flush static inline void tlb_flush(struct mmu_gather *tlb); #include <asm-generic/tlb.h> #include <linux/kernel.h> #include <vdso/bits.h> #include <vdso/page.h> static inline void tlb_flush(struct mmu_gather *tlb) { unsigned long start = 0UL, end = TLB_FLUSH_ALL; unsigned int stride_shift = tlb_get_unmap_shift(tlb); if (!tlb->fullmm && !tlb->need_flush_all) { start = tlb->start; end = tlb->end; } flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables); } static inline void invlpg(unsigned long addr) { asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); } enum addr_stride { PTE_STRIDE = 0, PMD_STRIDE = 1 }; /* * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination * of the three. For example: * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address * - FLAG_PCID: invalidate all TLB entries matching the PCID * * The first is used to invalidate (kernel) mappings at a particular * address across all processes. * * The latter invalidates all TLB entries matching a PCID. */ #define INVLPGB_FLAG_VA BIT(0) #define INVLPGB_FLAG_PCID BIT(1) #define INVLPGB_FLAG_ASID BIT(2) #define INVLPGB_FLAG_INCLUDE_GLOBAL BIT(3) #define INVLPGB_FLAG_FINAL_ONLY BIT(4) #define INVLPGB_FLAG_INCLUDE_NESTED BIT(5) /* The implied mode when all bits are clear: */ #define INVLPGB_MODE_ALL_NONGLOBALS 0UL #ifdef CONFIG_BROADCAST_TLB_FLUSH /* * INVLPGB does broadcast TLB invalidation across all the CPUs in the system. * * The INVLPGB instruction is weakly ordered, and a batch of invalidations can * be done in a parallel fashion. * * The instruction takes the number of extra pages to invalidate, beyond the * first page, while __invlpgb gets the more human readable number of pages to * invalidate. * * The bits in rax[0:2] determine respectively which components of the address * (VA, PCID, ASID) get compared when flushing. If none of these bits are set, *any* * address in the specified range matches. * * Since it is desired to only flush TLB entries for the ASID that is executing * the instruction (a host/hypervisor or a guest), the ASID valid bit should * always be set. On a host/hypervisor, the hardware will use the ASID value * specified in EDX[15:0] (which should be 0). On a guest, the hardware will * use the actual ASID value of the guest. * * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from * this CPU have completed. */ static inline void __invlpgb(unsigned long asid, unsigned long pcid, unsigned long addr, u16 nr_pages, enum addr_stride stride, u8 flags) { u64 rax = addr | flags | INVLPGB_FLAG_ASID; u32 ecx = (stride << 31) | (nr_pages - 1); u32 edx = (pcid << 16) | asid; /* The low bits in rax are for flags. Verify addr is clean. */ VM_WARN_ON_ONCE(addr & ~PAGE_MASK); /* INVLPGB; supported in binutils >= 2.36.
*/ asm volatile(".byte 0x0f, 0x01, 0xfe" :: "a" (rax), "c" (ecx), "d" (edx)); } static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags) { __invlpgb(asid, pcid, 0, 1, 0, flags); } static inline void __tlbsync(void) { /* * TLBSYNC waits for INVLPGB instructions originating on the same CPU * to have completed. Print a warning if the task has been migrated, * and might not be waiting on all the INVLPGBs issued during this TLB * invalidation sequence. */ cant_migrate(); /* TLBSYNC: supported in binutils >= 2.36. */ asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory"); } #else /* Some compilers (I'm looking at you clang!) simply can't do DCE */ static inline void __invlpgb(unsigned long asid, unsigned long pcid, unsigned long addr, u16 nr_pages, enum addr_stride s, u8 flags) { } static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags) { } static inline void __tlbsync(void) { } #endif static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid, unsigned long addr, u16 nr, bool stride) { enum addr_stride str = stride ? PMD_STRIDE : PTE_STRIDE; u8 flags = INVLPGB_FLAG_PCID | INVLPGB_FLAG_VA; __invlpgb(0, pcid, addr, nr, str, flags); } /* Flush all mappings for a given PCID, not including globals. */ static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid) { __invlpgb_all(0, pcid, INVLPGB_FLAG_PCID); } /* Flush all mappings, including globals, for all PCIDs. */ static inline void invlpgb_flush_all(void) { /* * TLBSYNC at the end needs to make sure all flushes done on the * current CPU have been executed system-wide. Therefore, make sure * nothing gets migrated in between; disable preemption, as that is * cheaper than disabling only migration. */ guard(preempt)(); __invlpgb_all(0, 0, INVLPGB_FLAG_INCLUDE_GLOBAL); __tlbsync(); } /* Flush addr, including globals, for all PCIDs. */ static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr) { __invlpgb(0, 0, addr, nr, PTE_STRIDE, INVLPGB_FLAG_INCLUDE_GLOBAL); } /* Flush all mappings for all PCIDs except globals. */ static inline void invlpgb_flush_all_nonglobals(void) { guard(preempt)(); __invlpgb_all(0, 0, INVLPGB_MODE_ALL_NONGLOBALS); __tlbsync(); } #endif /* _ASM_X86_TLB_H */
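A hedged sketch of the intended pairing of the *_nosync helpers with __tlbsync(): broadcast-invalidate a run of 4K pages belonging to one PCID, then wait for the invalidations issued by this CPU to complete. The pcid, addr and nr_pages values are placeholders; real callers derive them from the mm's PCID state.

/*
 * Illustrative only: pcid/addr/nr_pages are placeholders. Preemption is
 * disabled because TLBSYNC must run on the CPU that issued the INVLPGBs.
 */
static void demo_broadcast_flush_range(unsigned long pcid, unsigned long addr,
					u16 nr_pages)
{
	guard(preempt)();

	/* 4K stride (stride == false selects PTE_STRIDE) */
	invlpgb_flush_user_nr_nosync(pcid, addr, nr_pages, false);
	__tlbsync();
}

When CONFIG_BROADCAST_TLB_FLUSH is not set, the stubbed-out __invlpgb()/__tlbsync() above make the whole sequence compile away.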
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef INT_BLK_MQ_H #define INT_BLK_MQ_H #include <linux/blk-mq.h> #include "blk-stat.h" struct blk_mq_tag_set; struct elevator_tags; struct blk_mq_ctxs { struct kobject kobj; struct blk_mq_ctx __percpu *queue_ctx; }; /** * struct blk_mq_ctx - State for a software queue facing the submitting CPUs */ struct blk_mq_ctx { struct { spinlock_t lock; struct list_head rq_lists[HCTX_MAX_TYPES]; } ____cacheline_aligned_in_smp; unsigned int cpu; unsigned short index_hw[HCTX_MAX_TYPES]; struct blk_mq_hw_ctx *hctxs[HCTX_MAX_TYPES]; struct request_queue *queue; struct blk_mq_ctxs *ctxs; struct kobject kobj; } ____cacheline_aligned_in_smp; enum { BLK_MQ_NO_TAG = -1U, BLK_MQ_TAG_MIN = 1, BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1, }; #define BLK_MQ_CPU_WORK_BATCH (8) typedef unsigned int __bitwise blk_insert_t; #define BLK_MQ_INSERT_AT_HEAD ((__force blk_insert_t)0x01) void blk_mq_submit_bio(struct bio *bio); int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, unsigned int flags); void blk_mq_exit_queue(struct request_queue *q); struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q, struct elevator_tags *tags, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *, bool); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start); void blk_mq_put_rq_ref(struct request *rq); /* * Internal helpers for allocating/freeing the request map */ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx); void blk_mq_free_rq_map(struct blk_mq_tag_set
*set, struct blk_mq_tags *tags); struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int depth); void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx); /* * CPU -> queue mappings */ extern int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int); /* * blk_mq_map_queue_type() - map (hctx_type,cpu) to hardware queue * @q: request queue * @type: the hctx type index * @cpu: CPU */ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *q, enum hctx_type type, unsigned int cpu) { return queue_hctx((q), (q->tag_set->map[type].mq_map[cpu])); } static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf) { enum hctx_type type = HCTX_TYPE_DEFAULT; /* * The caller ensure that if REQ_POLLED, poll must be enabled. */ if (opf & REQ_POLLED) type = HCTX_TYPE_POLL; else if ((opf & REQ_OP_MASK) == REQ_OP_READ) type = HCTX_TYPE_READ; return type; } /* * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue * @opf: operation type (REQ_OP_*) and flags (e.g. REQ_POLLED). * @ctx: software queue cpu ctx */ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(blk_opf_t opf, struct blk_mq_ctx *ctx) { return ctx->hctxs[blk_mq_get_hctx_type(opf)]; } /* * Default to double of smaller one between hw queue_depth and * 128, since we don't split into sync/async like the old code * did. Additionally, this is a per-hw queue depth. */ static inline unsigned int blk_mq_default_nr_requests( struct blk_mq_tag_set *set) { return 2 * min_t(unsigned int, set->queue_depth, BLKDEV_DEFAULT_RQ); } /* * sysfs helpers */ extern void blk_mq_sysfs_init(struct request_queue *q); extern void blk_mq_sysfs_deinit(struct request_queue *q); int blk_mq_sysfs_register(struct gendisk *disk); void blk_mq_sysfs_unregister(struct gendisk *disk); int blk_mq_sysfs_register_hctxs(struct request_queue *q); void blk_mq_sysfs_unregister_hctxs(struct request_queue *q); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); void blk_mq_free_plug_rqs(struct blk_plug *plug); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_cancel_work_sync(struct request_queue *q); void blk_mq_release(struct request_queue *q); static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, unsigned int cpu) { return per_cpu_ptr(q->queue_ctx, cpu); } /* * This assumes per-cpu software queueing queues. They could be per-node * as well, for instance. For now this is hardcoded as-is. Note that we don't * care about preemption, since we know the ctx's are persistent. This does * mean that we can't rely on ctx always matching the currently running CPU. 
*/ static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) { return __blk_mq_get_ctx(q, raw_smp_processor_id()); } struct blk_mq_alloc_data { /* input parameter */ struct request_queue *q; blk_mq_req_flags_t flags; unsigned int shallow_depth; blk_opf_t cmd_flags; req_flags_t rq_flags; /* allocate multiple requests/tags in one go */ unsigned int nr_tags; struct rq_list *cached_rqs; /* input & output parameter */ struct blk_mq_ctx *ctx; struct blk_mq_hw_ctx *hctx; }; struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, unsigned int flags, int node); void blk_mq_free_tags(struct blk_mq_tag_set *set, struct blk_mq_tags *tags); unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, unsigned int *offset); void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag); void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags); void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size); void blk_mq_tag_update_sched_shared_tags(struct request_queue *q, unsigned int nr); void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv); void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv); static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, struct blk_mq_hw_ctx *hctx) { if (!hctx) return &bt->ws[0]; return sbq_wait_ptr(bt, &hctx->wait_index); } void __blk_mq_tag_busy(struct blk_mq_hw_ctx *); void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); static inline void blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_tag_busy(hctx); } static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_tag_idle(hctx); } static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, unsigned int tag) { return tag < tags->nr_reserved_tags; } static inline bool blk_mq_is_shared_tags(unsigned int flags) { return flags & BLK_MQ_F_TAG_HCTX_SHARED; } static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) { if (data->rq_flags & RQF_SCHED_TAGS) return data->hctx->sched_tags; return data->hctx->tags; } static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) { /* Fast path: hardware queue is not stopped most of the time. */ if (likely(!test_bit(BLK_MQ_S_STOPPED, &hctx->state))) return false; /* * This barrier is used to order adding of dispatch list before and * the test of BLK_MQ_S_STOPPED below. Pairs with the memory barrier * in blk_mq_start_stopped_hw_queue() so that dispatch code could * either see BLK_MQ_S_STOPPED is cleared or dispatch list is not * empty to avoid missing dispatching requests. 
*/ smp_mb(); return test_bit(BLK_MQ_S_STOPPED, &hctx->state); } static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) { return hctx->nr_ctx && hctx->tags; } void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2]); static inline void blk_mq_put_dispatch_budget(struct request_queue *q, int budget_token) { if (q->mq_ops->put_budget) q->mq_ops->put_budget(q, budget_token); } static inline int blk_mq_get_dispatch_budget(struct request_queue *q) { if (q->mq_ops->get_budget) return q->mq_ops->get_budget(q); return 0; } static inline void blk_mq_set_rq_budget_token(struct request *rq, int token) { if (token < 0) return; if (rq->q->mq_ops->set_rq_budget_token) rq->q->mq_ops->set_rq_budget_token(rq, token); } static inline int blk_mq_get_rq_budget_token(struct request *rq) { if (rq->q->mq_ops->get_rq_budget_token) return rq->q->mq_ops->get_rq_budget_token(rq); return -1; } static inline void __blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (blk_mq_is_shared_tags(hctx->flags)) atomic_add(val, &hctx->queue->nr_active_requests_shared_tags); else atomic_add(val, &hctx->nr_active); } static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) { __blk_mq_add_active_requests(hctx, 1); } static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (blk_mq_is_shared_tags(hctx->flags)) atomic_sub(val, &hctx->queue->nr_active_requests_shared_tags); else atomic_sub(val, &hctx->nr_active); } static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) { __blk_mq_sub_active_requests(hctx, 1); } static inline void blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_add_active_requests(hctx, val); } static inline void blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_inc_active_requests(hctx); } static inline void blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_sub_active_requests(hctx, val); } static inline void blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_dec_active_requests(hctx); } static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx) { if (blk_mq_is_shared_tags(hctx->flags)) return atomic_read(&hctx->queue->nr_active_requests_shared_tags); return atomic_read(&hctx->nr_active); } static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq) { blk_mq_dec_active_requests(hctx); blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag); rq->tag = BLK_MQ_NO_TAG; } static inline void blk_mq_put_driver_tag(struct request *rq) { if (rq->tag == BLK_MQ_NO_TAG || rq->internal_tag == BLK_MQ_NO_TAG) return; __blk_mq_put_driver_tag(rq->mq_hctx, rq); } bool __blk_mq_alloc_driver_tag(struct request *rq); static inline bool blk_mq_get_driver_tag(struct request *rq) { if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq)) return false; return true; } static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) { int cpu; for_each_possible_cpu(cpu) qmap->mq_map[cpu] = 0; } /* Free all requests on the list */ static inline void blk_mq_free_requests(struct list_head *list) { while (!list_empty(list)) { struct request *rq = list_entry_rq(list->next); list_del_init(&rq->queuelist); blk_mq_free_request(rq); } } /* * For shared tag users, we track the number of currently active users * and attempt to 
provide a fair share of the tag depth for each of them. */ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt) { unsigned int depth, users; if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) return true; /* * Don't try dividing an ant */ if (bt->sb.depth == 1) return true; if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return true; } else { if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return true; } users = READ_ONCE(hctx->tags->active_queues); if (!users) return true; /* * Allow at least some tags */ depth = max((bt->sb.depth + users - 1) / users, 4U); return __blk_mq_active_requests(hctx) < depth; } /* run the code block in @dispatch_ops with rcu/srcu read lock held */ #define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \ do { \ if ((q)->tag_set->flags & BLK_MQ_F_BLOCKING) { \ struct blk_mq_tag_set *__tag_set = (q)->tag_set; \ int srcu_idx; \ \ might_sleep_if(check_sleep); \ srcu_idx = srcu_read_lock(__tag_set->srcu); \ (dispatch_ops); \ srcu_read_unlock(__tag_set->srcu, srcu_idx); \ } else { \ rcu_read_lock(); \ (dispatch_ops); \ rcu_read_unlock(); \ } \ } while (0) #define blk_mq_run_dispatch_ops(q, dispatch_ops) \ __blk_mq_run_dispatch_ops(q, true, dispatch_ops) \ static inline bool blk_mq_can_poll(struct request_queue *q) { return (q->limits.features & BLK_FEAT_POLL) && q->tag_set->map[HCTX_TYPE_POLL].nr_queues; } #endif |
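Some worked numbers may make the sizing and fair-share logic above easier to follow; the depths and queue counts below are illustrative, not taken from the source.

    blk_mq_default_nr_requests(), with BLKDEV_DEFAULT_RQ at 128:
        hw queue depth 1023  ->  2 * min(1023, 128) = 256 requests per hw queue
        hw queue depth   31  ->  2 * min(31, 128)   =  62 requests per hw queue

    blk_mq_get_hctx_type():
        REQ_OP_READ | REQ_POLLED  ->  HCTX_TYPE_POLL     (the poll flag wins)
        REQ_OP_READ               ->  HCTX_TYPE_READ
        REQ_OP_WRITE              ->  HCTX_TYPE_DEFAULT

    hctx_may_queue() fair share, with bt->sb.depth = 256:
        3 active queues    ->  max((256 + 3 - 1) / 3, 4)     = 86 tags each
        128 active queues  ->  max((256 + 128 - 1) / 128, 4) =  4 tags each
                               (the floor of 4 keeps every queue making progress)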
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HUGETLB_H #define _LINUX_HUGETLB_H #include <linux/mm.h> #include <linux/mm_types.h> #include <linux/mmdebug.h> #include <linux/fs.h> #include <linux/hugetlb_inline.h> #include <linux/cgroup.h> #include <linux/page_ref.h> #include <linux/list.h> #include <linux/kref.h> #include <linux/pgtable.h> #include <linux/gfp.h> #include <linux/userfaultfd_k.h> #include <linux/nodemask.h> struct mmu_gather; struct node; void free_huge_folio(struct folio *folio); #ifdef CONFIG_HUGETLB_PAGE #include <linux/pagemap.h> #include <linux/shm.h> #include <asm/tlbflush.h> /* * For a HugeTLB page, there is more metadata to save in the struct page. But * the head struct page cannot meet our needs, so we have to abuse other tail * struct pages to store the metadata. */ #define __NR_USED_SUBPAGE 3 struct hugepage_subpool { spinlock_t lock; long count; long max_hpages; /* Maximum huge pages or -1 if no maximum. */ long used_hpages; /* Used count against maximum, includes */ /* both allocated and reserved pages. */ struct hstate *hstate; long min_hpages; /* Minimum huge pages or -1 if no minimum. */ long rsv_hpages; /* Pages reserved against global pool to */ /* satisfy minimum size.
*/ }; struct resv_map { struct kref refs; spinlock_t lock; struct list_head regions; long adds_in_progress; struct list_head region_cache; long region_cache_count; struct rw_semaphore rw_sema; #ifdef CONFIG_CGROUP_HUGETLB /* * On private mappings, the counter to uncharge reservations is stored * here. If these fields are 0, then either the mapping is shared, or * cgroup accounting is disabled for this resv_map. */ struct page_counter *reservation_counter; unsigned long pages_per_hpage; struct cgroup_subsys_state *css; #endif }; /* * Region tracking -- allows tracking of reservations and instantiated pages * across the pages in a mapping. * * The region data structures are embedded into a resv_map and protected * by a resv_map's lock. The set of regions within the resv_map represent * reservations for huge pages, or huge pages that have already been * instantiated within the map. The from and to elements are huge page * indices into the associated mapping. from indicates the starting index * of the region. to represents the first index past the end of the region. * * For example, a file region structure with from == 0 and to == 4 represents * four huge pages in a mapping. It is important to note that the to element * represents the first element past the end of the region. This is used in * arithmetic as 4(to) - 0(from) = 4 huge pages in the region. * * Interval notation of the form [from, to) will be used to indicate that * the endpoint from is inclusive and to is exclusive. */ struct file_region { struct list_head link; long from; long to; #ifdef CONFIG_CGROUP_HUGETLB /* * On shared mappings, each reserved region appears as a struct * file_region in resv_map. These fields hold the info needed to * uncharge each reservation. */ struct page_counter *reservation_counter; struct cgroup_subsys_state *css; #endif }; struct hugetlb_vma_lock { struct kref refs; struct rw_semaphore rw_sema; struct vm_area_struct *vma; }; extern struct resv_map *resv_map_alloc(void); void resv_map_release(struct kref *ref); extern spinlock_t hugetlb_lock; extern int hugetlb_max_hstate __read_mostly; #define for_each_hstate(h) \ for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++) struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, long min_hpages); void hugepage_put_subpool(struct hugepage_subpool *spool); void hugetlb_dup_vma_private(struct vm_area_struct *vma); void clear_vma_resv_huge_pages(struct vm_area_struct *vma); int move_hugetlb_page_tables(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long old_addr, unsigned long new_addr, unsigned long len); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *, struct vm_area_struct *); void unmap_hugepage_range(struct vm_area_struct *, unsigned long start, unsigned long end, struct folio *, zap_flags_t); void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct folio *, zap_flags_t zap_flags); void hugetlb_report_meminfo(struct seq_file *); int hugetlb_report_node_meminfo(char *buf, int len, int nid); void hugetlb_show_meminfo_node(int nid); unsigned long hugetlb_total_pages(void); vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); #ifdef CONFIG_USERFAULTFD int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, struct folio **foliop); #endif /* 
CONFIG_USERFAULTFD */ long hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_desc *desc, vma_flags_t vma_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void folio_putback_hugetlb(struct folio *folio); void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx); pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud); bool hugetlbfs_pagecache_present(struct hstate *h, struct vm_area_struct *vma, unsigned long address); struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio); extern int movable_gigantic_pages __read_mostly; extern int sysctl_hugetlb_shm_group __read_mostly; extern struct list_head huge_boot_pages[MAX_NUMNODES]; void hugetlb_bootmem_alloc(void); extern nodemask_t hugetlb_bootmem_nodes; void hugetlb_bootmem_set_nodes(void); /* arch callbacks */ #ifndef CONFIG_HIGHPTE /* * pte_offset_huge() and pte_alloc_huge() are helpers for those architectures * which may go down to the lowest PTE level in their huge_pte_offset() and * huge_pte_alloc(): to avoid reliance on pte_offset_map() without pte_unmap(). */ static inline pte_t *pte_offset_huge(pmd_t *pmd, unsigned long address) { return pte_offset_kernel(pmd, address); } static inline pte_t *pte_alloc_huge(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { return pte_alloc(mm, pmd) ? NULL : pte_offset_huge(pmd, address); } #endif pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz); /* * huge_pte_offset(): Walk the hugetlb pgtable until the last level PTE. * Returns the pte_t* if found, or NULL if the address is not mapped. * * IMPORTANT: we should normally not directly call this function, instead * this is only a common interface to implement arch-specific * walker. Please use hugetlb_walk() instead, because that will attempt to * verify the locking for you. * * Since this function will walk all the pgtable pages (including not only * high-level pgtable page, but also PUD entry that can be unshared * concurrently for VM_SHARED), the caller of this function should be * responsible of its thread safety. One can follow this rule: * * (1) For private mappings: pmd unsharing is not possible, so holding the * mmap_lock for either read or write is sufficient. Most callers * already hold the mmap_lock, so normally, no special action is * required. * * (2) For shared mappings: pmd unsharing is possible (so the PUD-ranged * pgtable page can go away from under us! It can be done by a pmd * unshare with a follow up munmap() on the other process), then we * need either: * * (2.1) hugetlb vma lock read or write held, to make sure pmd unshare * won't happen upon the range (it also makes sure the pte_t we * read is the right and stable one), or, * * (2.2) hugetlb mapping i_mmap_rwsem lock held read or write, to make * sure even if unshare happened the racy unmap() will wait until * i_mmap_rwsem is released. 
* * Option (2.1) is the safest, which guarantees pte stability from pmd * sharing pov, until the vma lock released. Option (2.2) doesn't protect * a concurrent pmd unshare, but it makes sure the pgtable page is safe to * access. */ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz); unsigned long hugetlb_mask_last_page(struct hstate *h); int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma); void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end); extern void __hugetlb_zap_begin(struct vm_area_struct *vma, unsigned long *begin, unsigned long *end); extern void __hugetlb_zap_end(struct vm_area_struct *vma, struct zap_details *details); static inline void hugetlb_zap_begin(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { if (is_vm_hugetlb_page(vma)) __hugetlb_zap_begin(vma, start, end); } static inline void hugetlb_zap_end(struct vm_area_struct *vma, struct zap_details *details) { if (is_vm_hugetlb_page(vma)) __hugetlb_zap_end(vma, details); } void hugetlb_vma_lock_read(struct vm_area_struct *vma); void hugetlb_vma_unlock_read(struct vm_area_struct *vma); void hugetlb_vma_lock_write(struct vm_area_struct *vma); void hugetlb_vma_unlock_write(struct vm_area_struct *vma); int hugetlb_vma_trylock_write(struct vm_area_struct *vma); void hugetlb_vma_assert_locked(struct vm_area_struct *vma); void hugetlb_vma_lock_release(struct kref *kref); long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); void fixup_hugetlb_reservations(struct vm_area_struct *vma); void hugetlb_split(struct vm_area_struct *vma, unsigned long addr); int hugetlb_vma_lock_alloc(struct vm_area_struct *vma); unsigned int arch_hugetlb_cma_order(void); #else /* !CONFIG_HUGETLB_PAGE */ static inline void hugetlb_dup_vma_private(struct vm_area_struct *vma) { } static inline void clear_vma_resv_huge_pages(struct vm_area_struct *vma) { } static inline unsigned long hugetlb_total_pages(void) { return 0; } static inline struct address_space *hugetlb_folio_mapping_lock_write( struct folio *folio) { return NULL; } static inline int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return 0; } static inline void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma) { } static inline void adjust_range_if_pmd_sharing_possible( struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { } static inline void hugetlb_zap_begin( struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { } static inline void hugetlb_zap_end( struct vm_area_struct *vma, struct zap_details *details) { } static inline int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { BUG(); return 0; } static inline int move_hugetlb_page_tables(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long old_addr, unsigned long new_addr, unsigned long len) { BUG(); return 0; } static inline void hugetlb_report_meminfo(struct seq_file *m) { } static inline int hugetlb_report_node_meminfo(char *buf, int len, int nid) { return 0; } static inline void hugetlb_show_meminfo_node(int nid) { } static inline 
void hugetlb_vma_lock_read(struct vm_area_struct *vma) { } static inline void hugetlb_vma_unlock_read(struct vm_area_struct *vma) { } static inline void hugetlb_vma_lock_write(struct vm_area_struct *vma) { } static inline void hugetlb_vma_unlock_write(struct vm_area_struct *vma) { } static inline int hugetlb_vma_trylock_write(struct vm_area_struct *vma) { return 1; } static inline void hugetlb_vma_assert_locked(struct vm_area_struct *vma) { } static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len) { return 0; } #ifdef CONFIG_USERFAULTFD static inline int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, struct folio **foliop) { BUG(); return 0; } #endif /* CONFIG_USERFAULTFD */ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { return NULL; } static inline bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list) { return false; } static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison) { return 0; } static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { return 0; } static inline void folio_putback_hugetlb(struct folio *folio) { } static inline void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason) { } static inline long hugetlb_change_protection( struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { return 0; } static inline void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct folio *folio, zap_flags_t zap_flags) { BUG(); } static inline vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { BUG(); return 0; } static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { } static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma) { } static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {} static inline int hugetlb_vma_lock_alloc(struct vm_area_struct *vma) { return 0; } #endif /* !CONFIG_HUGETLB_PAGE */ #ifndef pgd_write static inline int pgd_write(pgd_t pgd) { BUG(); return 0; } #endif #define HUGETLB_ANON_FILE "anon_hugepage" enum { /* * The file will be used as an shm file so shmfs accounting rules * apply */ HUGETLB_SHMFS_INODE = 1, /* * The file is being created on the internal vfs mount and shmfs * accounting rules do not apply */ HUGETLB_ANONHUGE_INODE = 2, }; #ifdef CONFIG_HUGETLBFS struct hugetlbfs_sb_info { long max_inodes; /* inodes allowed */ long free_inodes; /* inodes free */ spinlock_t stat_lock; struct hstate *hstate; struct hugepage_subpool *spool; kuid_t uid; kgid_t gid; umode_t mode; }; static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) { return sb->s_fs_info; } struct hugetlbfs_inode_info { struct inode vfs_inode; struct resv_map *resv_map; unsigned int seals; }; static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) { return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); } extern const struct vm_operations_struct hugetlb_vm_ops; struct file *hugetlb_file_setup(const char *name, size_t size, vma_flags_t acct, int creat_flags, int page_size_log); static inline bool is_file_hugepages(const struct file *file) { return file->f_op->fop_flags & 
FOP_HUGE_PAGES; } static inline struct hstate *hstate_inode(struct inode *i) { return HUGETLBFS_SB(i->i_sb)->hstate; } #else /* !CONFIG_HUGETLBFS */ #define is_file_hugepages(file) false static inline struct file * hugetlb_file_setup(const char *name, size_t size, vma_flags_t acctflag, int creat_flags, int page_size_log) { return ERR_PTR(-ENOSYS); } static inline struct hstate *hstate_inode(struct inode *i) { return NULL; } #endif /* !CONFIG_HUGETLBFS */ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); /* * huegtlb page specific state flags. These flags are located in page.private * of the hugetlb head page. Functions created via the below macros should be * used to manipulate these flags. * * HPG_restore_reserve - Set when a hugetlb page consumes a reservation at * allocation time. Cleared when page is fully instantiated. Free * routine checks flag to restore a reservation on error paths. * Synchronization: Examined or modified by code that knows it has * the only reference to page. i.e. After allocation but before use * or when the page is being freed. * HPG_migratable - Set after a newly allocated page is added to the page * cache and/or page tables. Indicates the page is a candidate for * migration. * Synchronization: Initially set after new page allocation with no * locking. When examined and modified during migration processing * (isolate, migrate, putback) the hugetlb_lock is held. * HPG_temporary - Set on a page that is temporarily allocated from the buddy * allocator. Typically used for migration target pages when no pages * are available in the pool. The hugetlb free page path will * immediately free pages with this flag set to the buddy allocator. * Synchronization: Can be set after huge page allocation from buddy when * code knows it has only reference. All other examinations and * modifications require hugetlb_lock. * HPG_freed - Set when page is on the free lists. * Synchronization: hugetlb_lock held for examination and modification. * HPG_vmemmap_optimized - Set when the vmemmap pages of the page are freed. * HPG_raw_hwp_unreliable - Set when the hugetlb page has a hwpoison sub-page * that is not tracked by raw_hwp_page list. */ enum hugetlb_page_flags { HPG_restore_reserve = 0, HPG_migratable, HPG_temporary, HPG_freed, HPG_vmemmap_optimized, HPG_raw_hwp_unreliable, HPG_cma, __NR_HPAGEFLAGS, }; /* * Macros to create test, set and clear function definitions for * hugetlb specific page flags. 
*/ #ifdef CONFIG_HUGETLB_PAGE #define TESTHPAGEFLAG(uname, flname) \ static __always_inline \ bool folio_test_hugetlb_##flname(struct folio *folio) \ { void *private = &folio->private; \ return test_bit(HPG_##flname, private); \ } #define SETHPAGEFLAG(uname, flname) \ static __always_inline \ void folio_set_hugetlb_##flname(struct folio *folio) \ { void *private = &folio->private; \ set_bit(HPG_##flname, private); \ } #define CLEARHPAGEFLAG(uname, flname) \ static __always_inline \ void folio_clear_hugetlb_##flname(struct folio *folio) \ { void *private = &folio->private; \ clear_bit(HPG_##flname, private); \ } #else #define TESTHPAGEFLAG(uname, flname) \ static inline bool \ folio_test_hugetlb_##flname(struct folio *folio) \ { return 0; } #define SETHPAGEFLAG(uname, flname) \ static inline void \ folio_set_hugetlb_##flname(struct folio *folio) \ { } #define CLEARHPAGEFLAG(uname, flname) \ static inline void \ folio_clear_hugetlb_##flname(struct folio *folio) \ { } #endif #define HPAGEFLAG(uname, flname) \ TESTHPAGEFLAG(uname, flname) \ SETHPAGEFLAG(uname, flname) \ CLEARHPAGEFLAG(uname, flname) \ /* * Create functions associated with hugetlb page flags */ HPAGEFLAG(RestoreReserve, restore_reserve) HPAGEFLAG(Migratable, migratable) HPAGEFLAG(Temporary, temporary) HPAGEFLAG(Freed, freed) HPAGEFLAG(VmemmapOptimized, vmemmap_optimized) HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable) HPAGEFLAG(Cma, cma) #ifdef CONFIG_HUGETLB_PAGE #define HSTATE_NAME_LEN 32 /* Defines one hugetlb page size */ struct hstate { struct mutex resize_lock; struct lock_class_key resize_key; int next_nid_to_alloc; int next_nid_to_free; unsigned int order; unsigned int demote_order; unsigned long mask; unsigned long max_huge_pages; unsigned long nr_huge_pages; unsigned long free_huge_pages; unsigned long resv_huge_pages; unsigned long surplus_huge_pages; unsigned long nr_overcommit_huge_pages; struct list_head hugepage_activelist; struct list_head hugepage_freelists[MAX_NUMNODES]; unsigned int max_huge_pages_node[MAX_NUMNODES]; unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; char name[HSTATE_NAME_LEN]; }; struct cma; struct huge_bootmem_page { struct list_head list; struct hstate *hstate; unsigned long flags; struct cma *cma; }; #define HUGE_BOOTMEM_HVO 0x0001 #define HUGE_BOOTMEM_ZONES_VALID 0x0002 #define HUGE_BOOTMEM_CMA 0x0004 bool hugetlb_bootmem_page_zones_valid(int nid, struct huge_bootmem_page *m); int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list); int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn); void wait_for_freed_hugetlb_folios(void); struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, bool cow_from_owner); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback); struct folio *alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct folio *folio); /* arch callback */ int __init __alloc_bootmem_huge_page(struct hstate *h, int nid); int __init alloc_bootmem_huge_page(struct hstate *h, int nid); bool __init hugetlb_node_alloc_supported(void); void __init hugetlb_add_hstate(unsigned order); bool 
__init arch_hugetlb_valid_size(unsigned long size); struct hstate *size_to_hstate(unsigned long size); #ifndef HUGE_MAX_HSTATE #define HUGE_MAX_HSTATE 1 #endif extern struct hstate hstates[HUGE_MAX_HSTATE]; extern unsigned int default_hstate_idx; #define default_hstate (hstates[default_hstate_idx]) static inline struct hugepage_subpool *subpool_inode(struct inode *inode) { return HUGETLBFS_SB(inode->i_sb)->spool; } static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio) { return folio->_hugetlb_subpool; } static inline void hugetlb_set_folio_subpool(struct folio *folio, struct hugepage_subpool *subpool) { folio->_hugetlb_subpool = subpool; } static inline struct hstate *hstate_file(struct file *f) { return hstate_inode(file_inode(f)); } static inline struct hstate *hstate_sizelog(int page_size_log) { if (!page_size_log) return &default_hstate; if (page_size_log < BITS_PER_LONG) return size_to_hstate(1UL << page_size_log); return NULL; } static inline struct hstate *hstate_vma(struct vm_area_struct *vma) { return hstate_file(vma->vm_file); } static inline unsigned long huge_page_size(const struct hstate *h) { return (unsigned long)PAGE_SIZE << h->order; } static inline unsigned long huge_page_mask(struct hstate *h) { return h->mask; } static inline unsigned int huge_page_order(struct hstate *h) { return h->order; } static inline unsigned huge_page_shift(struct hstate *h) { return h->order + PAGE_SHIFT; } /** * hugetlb_linear_page_index() - linear_page_index() but in hugetlb * page size granularity. * @vma: the hugetlb VMA * @address: the virtual address within the VMA * * Return: the page offset within the mapping in huge page units. */ static inline pgoff_t hugetlb_linear_page_index(struct vm_area_struct *vma, unsigned long address) { struct hstate *h = hstate_vma(vma); return ((address - vma->vm_start) >> huge_page_shift(h)) + (vma->vm_pgoff >> huge_page_order(h)); } static inline bool order_is_gigantic(unsigned int order) { return order > MAX_PAGE_ORDER; } static inline bool hstate_is_gigantic(struct hstate *h) { return order_is_gigantic(huge_page_order(h)); } static inline unsigned int pages_per_huge_page(const struct hstate *h) { return 1 << h->order; } static inline unsigned int blocks_per_huge_page(struct hstate *h) { return huge_page_size(h) / 512; } static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h, struct address_space *mapping, pgoff_t idx) { return filemap_lock_folio(mapping, idx << huge_page_order(h)); } #include <asm/hugetlb.h> #ifndef is_hugepage_only_range static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len) { return 0; } #define is_hugepage_only_range is_hugepage_only_range #endif #ifndef arch_clear_hugetlb_flags static inline void arch_clear_hugetlb_flags(struct folio *folio) { } #define arch_clear_hugetlb_flags arch_clear_hugetlb_flags #endif #ifndef arch_make_huge_pte static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) { return pte_mkhuge(entry); } #endif #ifndef arch_has_huge_bootmem_alloc /* * Some architectures do their own bootmem allocation, so they can't use * early CMA allocation. 
*/ static inline bool arch_has_huge_bootmem_alloc(void) { return false; } #endif static inline struct hstate *folio_hstate(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); return size_to_hstate(folio_size(folio)); } static inline unsigned hstate_index_to_shift(unsigned index) { return hstates[index].order + PAGE_SHIFT; } static inline int hstate_index(struct hstate *h) { return h - hstates; } int dissolve_free_hugetlb_folio(struct folio *folio); int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn); #ifdef CONFIG_MEMORY_FAILURE extern void folio_clear_hugetlb_hwpoison(struct folio *folio); #else static inline void folio_clear_hugetlb_hwpoison(struct folio *folio) { } #endif #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION #ifndef arch_hugetlb_migration_supported static inline bool arch_hugetlb_migration_supported(struct hstate *h) { if ((huge_page_shift(h) == PMD_SHIFT) || (huge_page_shift(h) == PUD_SHIFT) || (huge_page_shift(h) == PGDIR_SHIFT)) return true; else return false; } #endif #else static inline bool arch_hugetlb_migration_supported(struct hstate *h) { return false; } #endif static inline bool hugepage_migration_supported(struct hstate *h) { return arch_hugetlb_migration_supported(h); } /* * Movability check is different as compared to migration check. * It determines whether or not a huge page should be placed on * movable zone or not. Movability of any huge page should be * required only if huge page size is supported for migration. * There won't be any reason for the huge page to be movable if * it is not migratable to start with. Also the size of the huge * page should be large enough to be placed under a movable zone * and still feasible enough to be migratable. Just the presence * in movable zone does not make the migration feasible. * * So even though large huge page sizes like the gigantic ones * are migratable they should not be movable because its not * feasible to migrate them from movable zone. */ static inline bool hugepage_movable_supported(struct hstate *h) { if (!hugepage_migration_supported(h)) return false; if (hstate_is_gigantic(h) && !movable_gigantic_pages) return false; return true; } /* Movability of hugepages depends on migration support. */ static inline gfp_t htlb_alloc_mask(struct hstate *h) { gfp_t gfp = __GFP_COMP | __GFP_NOWARN; gfp |= hugepage_movable_supported(h) ? GFP_HIGHUSER_MOVABLE : GFP_HIGHUSER; return gfp; } static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask) { gfp_t modified_mask = htlb_alloc_mask(h); /* Some callers might want to enforce node */ modified_mask |= (gfp_mask & __GFP_THISNODE); modified_mask |= (gfp_mask & __GFP_NOWARN); return modified_mask; } static inline bool htlb_allow_alloc_fallback(int reason) { bool allowed_fallback = false; /* * Note: the memory offline, memory failure and migration syscalls will * be allowed to fallback to other nodes due to lack of a better chioce, * that might break the per-node hugetlb pool. While other cases will * set the __GFP_THISNODE to avoid breaking the per-node hugetlb pool. */ switch (reason) { case MR_MEMORY_HOTPLUG: case MR_MEMORY_FAILURE: case MR_SYSCALL: case MR_MEMPOLICY_MBIND: allowed_fallback = true; break; default: break; } return allowed_fallback; } static inline spinlock_t *huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte) { const unsigned long size = huge_page_size(h); VM_WARN_ON(size == PAGE_SIZE); /* * hugetlb must use the exact same PT locks as core-mm page table * walkers would. 
When modifying a PTE table, hugetlb must take the * PTE PT lock, when modifying a PMD table, hugetlb must take the PMD * PT lock etc. * * The expectation is that any hugetlb folio smaller than a PMD is * always mapped into a single PTE table and that any hugetlb folio * smaller than a PUD (but at least as big as a PMD) is always mapped * into a single PMD table. * * If that does not hold for an architecture, then that architecture * must disable split PT locks such that all *_lockptr() functions * will give us the same result: the per-MM PT lock. * * Note that with e.g., CONFIG_PGTABLE_LEVELS=2 where * PGDIR_SIZE==P4D_SIZE==PUD_SIZE==PMD_SIZE, we'd use pud_lockptr() * and core-mm would use pmd_lockptr(). However, in such configurations * split PMD locks are disabled -- they don't make sense on a single * PGDIR page table -- and the end result is the same. */ if (size >= PUD_SIZE) return pud_lockptr(mm, (pud_t *) pte); else if (size >= PMD_SIZE || IS_ENABLED(CONFIG_HIGHPTE)) return pmd_lockptr(mm, (pmd_t *) pte); /* pte_alloc_huge() only applies with !CONFIG_HIGHPTE */ return ptep_lockptr(mm, pte); } #ifndef hugepages_supported /* * Some platform decide whether they support huge pages at boot * time. Some of them, such as powerpc, set HPAGE_SHIFT to 0 * when there is no such support */ #define hugepages_supported() (HPAGE_SHIFT != 0) #endif void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm); static inline void hugetlb_count_init(struct mm_struct *mm) { atomic_long_set(&mm->hugetlb_usage, 0); } static inline void hugetlb_count_add(long l, struct mm_struct *mm) { atomic_long_add(l, &mm->hugetlb_usage); } static inline void hugetlb_count_sub(long l, struct mm_struct *mm) { atomic_long_sub(l, &mm->hugetlb_usage); } #ifndef huge_ptep_modify_prot_start #define huge_ptep_modify_prot_start huge_ptep_modify_prot_start static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { unsigned long psize = huge_page_size(hstate_vma(vma)); return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep, psize); } #endif #ifndef huge_ptep_modify_prot_commit #define huge_ptep_modify_prot_commit huge_ptep_modify_prot_commit static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { unsigned long psize = huge_page_size(hstate_vma(vma)); set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize); } #endif #ifdef CONFIG_NUMA void hugetlb_register_node(struct node *node); void hugetlb_unregister_node(struct node *node); #endif /* * Check if a given raw @page in a hugepage is HWPOISON. 
*/ bool is_raw_hwpoison_page_in_hugepage(struct page *page); static inline unsigned long huge_page_mask_align(struct file *file) { return PAGE_MASK & ~huge_page_mask(hstate_file(file)); } #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; static inline unsigned long huge_page_mask_align(struct file *file) { return 0; } static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio) { return NULL; } static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h, struct address_space *mapping, pgoff_t idx) { return NULL; } static inline int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list) { return -ENOMEM; } static inline int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) { return 0; } static inline void wait_for_freed_hugetlb_folios(void) { } static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, bool cow_from_owner) { return NULL; } static inline struct folio * alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { return NULL; } static inline struct folio * alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback) { return NULL; } static inline int __alloc_bootmem_huge_page(struct hstate *h) { return 0; } static inline struct hstate *hstate_file(struct file *f) { return NULL; } static inline struct hstate *hstate_sizelog(int page_size_log) { return NULL; } static inline struct hstate *hstate_vma(struct vm_area_struct *vma) { return NULL; } static inline struct hstate *folio_hstate(struct folio *folio) { return NULL; } static inline struct hstate *size_to_hstate(unsigned long size) { return NULL; } static inline unsigned long huge_page_size(struct hstate *h) { return PAGE_SIZE; } static inline unsigned long huge_page_mask(struct hstate *h) { return PAGE_MASK; } static inline unsigned int huge_page_order(struct hstate *h) { return 0; } static inline unsigned int huge_page_shift(struct hstate *h) { return PAGE_SHIFT; } static inline bool hstate_is_gigantic(struct hstate *h) { return false; } static inline unsigned int pages_per_huge_page(struct hstate *h) { return 1; } static inline unsigned hstate_index_to_shift(unsigned index) { return 0; } static inline int hstate_index(struct hstate *h) { return 0; } static inline int dissolve_free_hugetlb_folio(struct folio *folio) { return 0; } static inline int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn) { return 0; } static inline bool hugepage_migration_supported(struct hstate *h) { return false; } static inline bool hugepage_movable_supported(struct hstate *h) { return false; } static inline gfp_t htlb_alloc_mask(struct hstate *h) { return 0; } static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask) { return 0; } static inline bool htlb_allow_alloc_fallback(int reason) { return false; } static inline spinlock_t *huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte) { return &mm->page_table_lock; } static inline void hugetlb_count_init(struct mm_struct *mm) { } static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m) { } static inline void hugetlb_count_sub(long l, struct mm_struct *mm) { } static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { #ifdef CONFIG_MMU return ptep_get(ptep); #else return *ptep; #endif } static inline void set_huge_pte_at(struct mm_struct *mm, 
unsigned long addr, pte_t *ptep, pte_t pte, unsigned long sz) { } static inline void hugetlb_register_node(struct node *node) { } static inline void hugetlb_unregister_node(struct node *node) { } static inline bool hugetlbfs_pagecache_present( struct hstate *h, struct vm_area_struct *vma, unsigned long address) { return false; } static inline void hugetlb_bootmem_alloc(void) { } #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, struct mm_struct *mm, pte_t *pte) { spinlock_t *ptl; ptl = huge_pte_lockptr(h, mm, pte); spin_lock(ptl); return ptl; } #if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA) extern void __init hugetlb_cma_reserve(void); #else static inline __init void hugetlb_cma_reserve(void) { } #endif #ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING static inline bool hugetlb_pmd_shared(pte_t *pte) { return ptdesc_pmd_is_shared(virt_to_ptdesc(pte)); } #else static inline bool hugetlb_pmd_shared(pte_t *pte) { return false; } #endif bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); #ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE /* * ARCHes with special requirements for evicting HUGETLB backing TLB entries can * implement this. */ #define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #endif static inline bool __vma_shareable_lock(struct vm_area_struct *vma) { return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data; } bool __vma_private_lock(struct vm_area_struct *vma); /* * Safe version of huge_pte_offset() to check the locks. See comments * above huge_pte_offset(). */ static inline pte_t * hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { #if defined(CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING) && defined(CONFIG_LOCKDEP) struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; /* * If pmd sharing possible, locking needed to safely walk the * hugetlb pgtables. More information can be found at the comment * above huge_pte_offset() in the same file. * * NOTE: lockdep_is_held() is only defined with CONFIG_LOCKDEP. */ if (__vma_shareable_lock(vma)) WARN_ON_ONCE(!lockdep_is_held(&vma_lock->rw_sema) && !lockdep_is_held( &vma->vm_file->f_mapping->i_mmap_rwsem)); #endif return huge_pte_offset(vma->vm_mm, addr, sz); } #endif /* _LINUX_HUGETLB_H */ |
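/*
 * Illustrative sketch (not part of the header above): one way a caller
 * might follow locking rule (2.1) from the huge_pte_offset() comment when
 * walking a hugetlb VMA.  The function name is made up for illustration;
 * everything it calls is declared in the header above.
 */
static bool example_hugetlb_has_pte(struct vm_area_struct *vma,
				    unsigned long addr)
{
	struct hstate *h = hstate_vma(vma);
	spinlock_t *ptl;
	pte_t *ptep;
	bool found = false;

	/*
	 * Rule (2.1): hold the hugetlb vma lock for read so a concurrent
	 * pmd unshare cannot free the page table page while we walk it.
	 */
	hugetlb_vma_lock_read(vma);
	ptep = hugetlb_walk(vma, addr & huge_page_mask(h), huge_page_size(h));
	if (ptep) {
		/* A real caller would examine *ptep here, under the PT lock. */
		ptl = huge_pte_lock(h, vma->vm_mm, ptep);
		found = true;
		spin_unlock(ptl);
	}
	hugetlb_vma_unlock_read(vma);

	return found;
}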
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Joystick device driver for the input driver suite.
 *
 * Copyright (c) 1999-2002 Vojtech Pavlik
 * Copyright (c) 1999 Colin Van Dyke
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <asm/io.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/joystick.h>
#include <linux/input.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/device.h>
#include <linux/cdev.h>

MODULE_AUTHOR("Vojtech Pavlik <vojtech@ucw.cz>");
MODULE_DESCRIPTION("Joystick device interfaces");
MODULE_LICENSE("GPL");

#define JOYDEV_MINOR_BASE	0
#define JOYDEV_MINORS		16
#define JOYDEV_BUFFER_SIZE	64

struct joydev {
	int open;
	struct input_handle handle;
	wait_queue_head_t wait;
	struct list_head client_list;
	spinlock_t client_lock; /* protects client_list */
	struct mutex mutex;
	struct device dev;
	struct cdev cdev;
	bool exist;

	struct js_corr corr[ABS_CNT];
	struct JS_DATA_SAVE_TYPE glue;
	int nabs;
	int nkey;
	__u16 keymap[KEY_MAX - BTN_MISC + 1];
	__u16 keypam[KEY_MAX - BTN_MISC + 1];
	__u8 absmap[ABS_CNT];
	__u8 abspam[ABS_CNT];
	__s16 abs[ABS_CNT];
};

struct joydev_client {
	struct js_event buffer[JOYDEV_BUFFER_SIZE];
	int head;
	int tail;
	int startup;
	spinlock_t buffer_lock; /* protects access to buffer, head and tail */
	struct fasync_struct *fasync;
	struct joydev *joydev;
	struct list_head node;
};

static int joydev_correct(int value, struct js_corr *corr)
{
	switch (corr->type) {

	case JS_CORR_NONE:
		break;

	case JS_CORR_BROKEN:
		value = value > corr->coef[0] ? (value < corr->coef[1] ?
0 : ((corr->coef[3] * (value - corr->coef[1])) >> 14)) : ((corr->coef[2] * (value - corr->coef[0])) >> 14); break; default: return 0; } return clamp(value, -32767, 32767); } static void joydev_pass_event(struct joydev_client *client, struct js_event *event) { struct joydev *joydev = client->joydev; /* * IRQs already disabled, just acquire the lock */ spin_lock(&client->buffer_lock); client->buffer[client->head] = *event; if (client->startup == joydev->nabs + joydev->nkey) { client->head++; client->head &= JOYDEV_BUFFER_SIZE - 1; if (client->tail == client->head) client->startup = 0; } spin_unlock(&client->buffer_lock); kill_fasync(&client->fasync, SIGIO, POLL_IN); } static void joydev_event(struct input_handle *handle, unsigned int type, unsigned int code, int value) { struct joydev *joydev = handle->private; struct joydev_client *client; struct js_event event; switch (type) { case EV_KEY: if (code < BTN_MISC || value == 2) return; event.type = JS_EVENT_BUTTON; event.number = joydev->keymap[code - BTN_MISC]; event.value = value; break; case EV_ABS: event.type = JS_EVENT_AXIS; event.number = joydev->absmap[code]; event.value = joydev_correct(value, &joydev->corr[event.number]); if (event.value == joydev->abs[event.number]) return; joydev->abs[event.number] = event.value; break; default: return; } event.time = jiffies_to_msecs(jiffies); rcu_read_lock(); list_for_each_entry_rcu(client, &joydev->client_list, node) joydev_pass_event(client, &event); rcu_read_unlock(); wake_up_interruptible(&joydev->wait); } static int joydev_fasync(int fd, struct file *file, int on) { struct joydev_client *client = file->private_data; return fasync_helper(fd, file, on, &client->fasync); } static void joydev_free(struct device *dev) { struct joydev *joydev = container_of(dev, struct joydev, dev); input_put_device(joydev->handle.dev); kfree(joydev); } static void joydev_attach_client(struct joydev *joydev, struct joydev_client *client) { spin_lock(&joydev->client_lock); list_add_tail_rcu(&client->node, &joydev->client_list); spin_unlock(&joydev->client_lock); } static void joydev_detach_client(struct joydev *joydev, struct joydev_client *client) { spin_lock(&joydev->client_lock); list_del_rcu(&client->node); spin_unlock(&joydev->client_lock); synchronize_rcu(); } static void joydev_refresh_state(struct joydev *joydev) { struct input_dev *dev = joydev->handle.dev; int i, val; for (i = 0; i < joydev->nabs; i++) { val = input_abs_get_val(dev, joydev->abspam[i]); joydev->abs[i] = joydev_correct(val, &joydev->corr[i]); } } static int joydev_open_device(struct joydev *joydev) { int retval; retval = mutex_lock_interruptible(&joydev->mutex); if (retval) return retval; if (!joydev->exist) retval = -ENODEV; else if (!joydev->open++) { retval = input_open_device(&joydev->handle); if (retval) joydev->open--; else joydev_refresh_state(joydev); } mutex_unlock(&joydev->mutex); return retval; } static void joydev_close_device(struct joydev *joydev) { mutex_lock(&joydev->mutex); if (joydev->exist && !--joydev->open) input_close_device(&joydev->handle); mutex_unlock(&joydev->mutex); } /* * Wake up users waiting for IO so they can disconnect from * dead device. 
*/ static void joydev_hangup(struct joydev *joydev) { struct joydev_client *client; spin_lock(&joydev->client_lock); list_for_each_entry(client, &joydev->client_list, node) kill_fasync(&client->fasync, SIGIO, POLL_HUP); spin_unlock(&joydev->client_lock); wake_up_interruptible(&joydev->wait); } static int joydev_release(struct inode *inode, struct file *file) { struct joydev_client *client = file->private_data; struct joydev *joydev = client->joydev; joydev_detach_client(joydev, client); kfree(client); joydev_close_device(joydev); return 0; } static int joydev_open(struct inode *inode, struct file *file) { struct joydev *joydev = container_of(inode->i_cdev, struct joydev, cdev); struct joydev_client *client; int error; client = kzalloc_obj(struct joydev_client); if (!client) return -ENOMEM; spin_lock_init(&client->buffer_lock); client->joydev = joydev; joydev_attach_client(joydev, client); error = joydev_open_device(joydev); if (error) goto err_free_client; file->private_data = client; stream_open(inode, file); return 0; err_free_client: joydev_detach_client(joydev, client); kfree(client); return error; } static int joydev_generate_startup_event(struct joydev_client *client, struct input_dev *input, struct js_event *event) { struct joydev *joydev = client->joydev; int have_event; spin_lock_irq(&client->buffer_lock); have_event = client->startup < joydev->nabs + joydev->nkey; if (have_event) { event->time = jiffies_to_msecs(jiffies); if (client->startup < joydev->nkey) { event->type = JS_EVENT_BUTTON | JS_EVENT_INIT; event->number = client->startup; event->value = !!test_bit(joydev->keypam[event->number], input->key); } else { event->type = JS_EVENT_AXIS | JS_EVENT_INIT; event->number = client->startup - joydev->nkey; event->value = joydev->abs[event->number]; } client->startup++; } spin_unlock_irq(&client->buffer_lock); return have_event; } static int joydev_fetch_next_event(struct joydev_client *client, struct js_event *event) { int have_event; spin_lock_irq(&client->buffer_lock); have_event = client->head != client->tail; if (have_event) { *event = client->buffer[client->tail++]; client->tail &= JOYDEV_BUFFER_SIZE - 1; } spin_unlock_irq(&client->buffer_lock); return have_event; } /* * Old joystick interface */ static ssize_t joydev_0x_read(struct joydev_client *client, struct input_dev *input, char __user *buf) { struct joydev *joydev = client->joydev; struct JS_DATA_TYPE data; int i; spin_lock_irq(&input->event_lock); /* * Get device state */ for (data.buttons = i = 0; i < 32 && i < joydev->nkey; i++) data.buttons |= test_bit(joydev->keypam[i], input->key) ? 
(1 << i) : 0; data.x = (joydev->abs[0] / 256 + 128) >> joydev->glue.JS_CORR.x; data.y = (joydev->abs[1] / 256 + 128) >> joydev->glue.JS_CORR.y; /* * Reset reader's event queue */ spin_lock(&client->buffer_lock); client->startup = 0; client->tail = client->head; spin_unlock(&client->buffer_lock); spin_unlock_irq(&input->event_lock); if (copy_to_user(buf, &data, sizeof(struct JS_DATA_TYPE))) return -EFAULT; return sizeof(struct JS_DATA_TYPE); } static inline int joydev_data_pending(struct joydev_client *client) { struct joydev *joydev = client->joydev; return client->startup < joydev->nabs + joydev->nkey || client->head != client->tail; } static ssize_t joydev_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct joydev_client *client = file->private_data; struct joydev *joydev = client->joydev; struct input_dev *input = joydev->handle.dev; struct js_event event; int retval; if (!joydev->exist) return -ENODEV; if (count < sizeof(struct js_event)) return -EINVAL; if (count == sizeof(struct JS_DATA_TYPE)) return joydev_0x_read(client, input, buf); if (!joydev_data_pending(client) && (file->f_flags & O_NONBLOCK)) return -EAGAIN; retval = wait_event_interruptible(joydev->wait, !joydev->exist || joydev_data_pending(client)); if (retval) return retval; if (!joydev->exist) return -ENODEV; while (retval + sizeof(struct js_event) <= count && joydev_generate_startup_event(client, input, &event)) { if (copy_to_user(buf + retval, &event, sizeof(struct js_event))) return -EFAULT; retval += sizeof(struct js_event); } while (retval + sizeof(struct js_event) <= count && joydev_fetch_next_event(client, &event)) { if (copy_to_user(buf + retval, &event, sizeof(struct js_event))) return -EFAULT; retval += sizeof(struct js_event); } return retval; } /* No kernel lock - fine */ static __poll_t joydev_poll(struct file *file, poll_table *wait) { struct joydev_client *client = file->private_data; struct joydev *joydev = client->joydev; poll_wait(file, &joydev->wait, wait); return (joydev_data_pending(client) ? (EPOLLIN | EPOLLRDNORM) : 0) | (joydev->exist ? 0 : (EPOLLHUP | EPOLLERR)); } static int joydev_handle_JSIOCSAXMAP(struct joydev *joydev, void __user *argp, size_t len) { __u8 *abspam; int i; int retval = 0; len = min(len, sizeof(joydev->abspam)); /* Validate the map. */ abspam = memdup_user(argp, len); if (IS_ERR(abspam)) return PTR_ERR(abspam); for (i = 0; i < len && i < joydev->nabs; i++) { if (abspam[i] > ABS_MAX) { retval = -EINVAL; goto out; } } memcpy(joydev->abspam, abspam, len); for (i = 0; i < joydev->nabs; i++) joydev->absmap[joydev->abspam[i]] = i; out: kfree(abspam); return retval; } static int joydev_handle_JSIOCSBTNMAP(struct joydev *joydev, void __user *argp, size_t len) { __u16 *keypam; int i; int retval = 0; if (len % sizeof(*keypam)) return -EINVAL; len = min(len, sizeof(joydev->keypam)); /* Validate the map. */ keypam = memdup_user(argp, len); if (IS_ERR(keypam)) return PTR_ERR(keypam); for (i = 0; i < (len / 2) && i < joydev->nkey; i++) { if (keypam[i] > KEY_MAX || keypam[i] < BTN_MISC) { retval = -EINVAL; goto out; } } memcpy(joydev->keypam, keypam, len); for (i = 0; i < joydev->nkey; i++) joydev->keymap[joydev->keypam[i] - BTN_MISC] = i; out: kfree(keypam); return retval; } static int joydev_ioctl_common(struct joydev *joydev, unsigned int cmd, void __user *argp) { struct input_dev *dev = joydev->handle.dev; size_t len; int i; const char *name; /* Process fixed-sized commands. 
*/ switch (cmd) { case JS_SET_CAL: return copy_from_user(&joydev->glue.JS_CORR, argp, sizeof(joydev->glue.JS_CORR)) ? -EFAULT : 0; case JS_GET_CAL: return copy_to_user(argp, &joydev->glue.JS_CORR, sizeof(joydev->glue.JS_CORR)) ? -EFAULT : 0; case JS_SET_TIMEOUT: return get_user(joydev->glue.JS_TIMEOUT, (s32 __user *) argp); case JS_GET_TIMEOUT: return put_user(joydev->glue.JS_TIMEOUT, (s32 __user *) argp); case JSIOCGVERSION: return put_user(JS_VERSION, (__u32 __user *) argp); case JSIOCGAXES: return put_user(joydev->nabs, (__u8 __user *) argp); case JSIOCGBUTTONS: return put_user(joydev->nkey, (__u8 __user *) argp); case JSIOCSCORR: if (copy_from_user(joydev->corr, argp, sizeof(joydev->corr[0]) * joydev->nabs)) return -EFAULT; for (i = 0; i < joydev->nabs; i++) { int val = input_abs_get_val(dev, joydev->abspam[i]); joydev->abs[i] = joydev_correct(val, &joydev->corr[i]); } return 0; case JSIOCGCORR: return copy_to_user(argp, joydev->corr, sizeof(joydev->corr[0]) * joydev->nabs) ? -EFAULT : 0; } /* * Process variable-sized commands (the axis and button map commands * are considered variable-sized to decouple them from the values of * ABS_MAX and KEY_MAX). */ switch (cmd & ~IOCSIZE_MASK) { case (JSIOCSAXMAP & ~IOCSIZE_MASK): return joydev_handle_JSIOCSAXMAP(joydev, argp, _IOC_SIZE(cmd)); case (JSIOCGAXMAP & ~IOCSIZE_MASK): len = min_t(size_t, _IOC_SIZE(cmd), sizeof(joydev->abspam)); return copy_to_user(argp, joydev->abspam, len) ? -EFAULT : len; case (JSIOCSBTNMAP & ~IOCSIZE_MASK): return joydev_handle_JSIOCSBTNMAP(joydev, argp, _IOC_SIZE(cmd)); case (JSIOCGBTNMAP & ~IOCSIZE_MASK): len = min_t(size_t, _IOC_SIZE(cmd), sizeof(joydev->keypam)); return copy_to_user(argp, joydev->keypam, len) ? -EFAULT : len; case JSIOCGNAME(0): name = dev->name; if (!name) return 0; len = min_t(size_t, _IOC_SIZE(cmd), strlen(name) + 1); return copy_to_user(argp, name, len) ? -EFAULT : len; } return -EINVAL; } #ifdef CONFIG_COMPAT static long joydev_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct joydev_client *client = file->private_data; struct joydev *joydev = client->joydev; void __user *argp = (void __user *)arg; s32 tmp32; struct JS_DATA_SAVE_TYPE_32 ds32; int retval; retval = mutex_lock_interruptible(&joydev->mutex); if (retval) return retval; if (!joydev->exist) { retval = -ENODEV; goto out; } switch (cmd) { case JS_SET_TIMELIMIT: retval = get_user(tmp32, (s32 __user *) arg); if (retval == 0) joydev->glue.JS_TIMELIMIT = tmp32; break; case JS_GET_TIMELIMIT: tmp32 = joydev->glue.JS_TIMELIMIT; retval = put_user(tmp32, (s32 __user *) arg); break; case JS_SET_ALL: retval = copy_from_user(&ds32, argp, sizeof(ds32)) ? -EFAULT : 0; if (retval == 0) { joydev->glue.JS_TIMEOUT = ds32.JS_TIMEOUT; joydev->glue.BUSY = ds32.BUSY; joydev->glue.JS_EXPIRETIME = ds32.JS_EXPIRETIME; joydev->glue.JS_TIMELIMIT = ds32.JS_TIMELIMIT; joydev->glue.JS_SAVE = ds32.JS_SAVE; joydev->glue.JS_CORR = ds32.JS_CORR; } break; case JS_GET_ALL: ds32.JS_TIMEOUT = joydev->glue.JS_TIMEOUT; ds32.BUSY = joydev->glue.BUSY; ds32.JS_EXPIRETIME = joydev->glue.JS_EXPIRETIME; ds32.JS_TIMELIMIT = joydev->glue.JS_TIMELIMIT; ds32.JS_SAVE = joydev->glue.JS_SAVE; ds32.JS_CORR = joydev->glue.JS_CORR; retval = copy_to_user(argp, &ds32, sizeof(ds32)) ? 
-EFAULT : 0; break; default: retval = joydev_ioctl_common(joydev, cmd, argp); break; } out: mutex_unlock(&joydev->mutex); return retval; } #endif /* CONFIG_COMPAT */ static long joydev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct joydev_client *client = file->private_data; struct joydev *joydev = client->joydev; void __user *argp = (void __user *)arg; int retval; retval = mutex_lock_interruptible(&joydev->mutex); if (retval) return retval; if (!joydev->exist) { retval = -ENODEV; goto out; } switch (cmd) { case JS_SET_TIMELIMIT: retval = get_user(joydev->glue.JS_TIMELIMIT, (long __user *) arg); break; case JS_GET_TIMELIMIT: retval = put_user(joydev->glue.JS_TIMELIMIT, (long __user *) arg); break; case JS_SET_ALL: retval = copy_from_user(&joydev->glue, argp, sizeof(joydev->glue)) ? -EFAULT : 0; break; case JS_GET_ALL: retval = copy_to_user(argp, &joydev->glue, sizeof(joydev->glue)) ? -EFAULT : 0; break; default: retval = joydev_ioctl_common(joydev, cmd, argp); break; } out: mutex_unlock(&joydev->mutex); return retval; } static const struct file_operations joydev_fops = { .owner = THIS_MODULE, .read = joydev_read, .poll = joydev_poll, .open = joydev_open, .release = joydev_release, .unlocked_ioctl = joydev_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = joydev_compat_ioctl, #endif .fasync = joydev_fasync, }; /* * Mark device non-existent. This disables writes, ioctls and * prevents new users from opening the device. Already posted * blocking reads will stay, however new ones will fail. */ static void joydev_mark_dead(struct joydev *joydev) { mutex_lock(&joydev->mutex); joydev->exist = false; mutex_unlock(&joydev->mutex); } static void joydev_cleanup(struct joydev *joydev) { struct input_handle *handle = &joydev->handle; joydev_mark_dead(joydev); joydev_hangup(joydev); /* joydev is marked dead so no one else accesses joydev->open */ if (joydev->open) input_close_device(handle); } /* * These codes are copied from hid-ids.h, unfortunately there is no common * usb_ids/bt_ids.h header. 
*/ #define USB_VENDOR_ID_SONY 0x054c #define USB_DEVICE_ID_SONY_PS3_CONTROLLER 0x0268 #define USB_DEVICE_ID_SONY_PS4_CONTROLLER 0x05c4 #define USB_DEVICE_ID_SONY_PS4_CONTROLLER_2 0x09cc #define USB_DEVICE_ID_SONY_PS4_CONTROLLER_DONGLE 0x0ba0 #define USB_VENDOR_ID_THQ 0x20d6 #define USB_DEVICE_ID_THQ_PS3_UDRAW 0xcb17 #define USB_VENDOR_ID_NINTENDO 0x057e #define USB_DEVICE_ID_NINTENDO_JOYCONL 0x2006 #define USB_DEVICE_ID_NINTENDO_JOYCONR 0x2007 #define USB_DEVICE_ID_NINTENDO_PROCON 0x2009 #define USB_DEVICE_ID_NINTENDO_CHRGGRIP 0x200E #define ACCEL_DEV(vnd, prd) \ { \ .flags = INPUT_DEVICE_ID_MATCH_VENDOR | \ INPUT_DEVICE_ID_MATCH_PRODUCT | \ INPUT_DEVICE_ID_MATCH_PROPBIT, \ .vendor = (vnd), \ .product = (prd), \ .propbit = { BIT_MASK(INPUT_PROP_ACCELEROMETER) }, \ } static const struct input_device_id joydev_blacklist[] = { /* Avoid touchpads and touchscreens */ { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, .evbit = { BIT_MASK(EV_KEY) }, .keybit = { [BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH) }, }, /* Avoid tablets, digitisers and similar devices */ { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, .evbit = { BIT_MASK(EV_KEY) }, .keybit = { [BIT_WORD(BTN_DIGI)] = BIT_MASK(BTN_DIGI) }, }, /* Disable accelerometers on composite devices */ ACCEL_DEV(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS3_CONTROLLER), ACCEL_DEV(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS4_CONTROLLER), ACCEL_DEV(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS4_CONTROLLER_2), ACCEL_DEV(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS4_CONTROLLER_DONGLE), ACCEL_DEV(USB_VENDOR_ID_THQ, USB_DEVICE_ID_THQ_PS3_UDRAW), ACCEL_DEV(USB_VENDOR_ID_NINTENDO, USB_DEVICE_ID_NINTENDO_PROCON), ACCEL_DEV(USB_VENDOR_ID_NINTENDO, USB_DEVICE_ID_NINTENDO_CHRGGRIP), ACCEL_DEV(USB_VENDOR_ID_NINTENDO, USB_DEVICE_ID_NINTENDO_JOYCONL), ACCEL_DEV(USB_VENDOR_ID_NINTENDO, USB_DEVICE_ID_NINTENDO_JOYCONR), { /* sentinel */ } }; static bool joydev_dev_is_blacklisted(struct input_dev *dev) { const struct input_device_id *id; for (id = joydev_blacklist; id->flags; id++) { if (input_match_device_id(dev, id)) { dev_dbg(&dev->dev, "joydev: blacklisting '%s'\n", dev->name); return true; } } return false; } static bool joydev_dev_is_absolute_mouse(struct input_dev *dev) { DECLARE_BITMAP(jd_scratch, KEY_CNT); bool ev_match = false; BUILD_BUG_ON(ABS_CNT > KEY_CNT || EV_CNT > KEY_CNT); /* * Virtualization (VMware, etc) and remote management (HP * ILO2) solutions use absolute coordinates for their virtual * pointing devices so that there is one-to-one relationship * between pointer position on the host screen and virtual * guest screen, and so their mice use ABS_X, ABS_Y and 3 * primary button events. This clashes with what joydev * considers to be joysticks (a device with at minimum ABS_X * axis). * * Here we are trying to separate absolute mice from * joysticks. A device is, for joystick detection purposes, * considered to be an absolute mouse if the following is * true: * * 1) Event types are exactly * EV_ABS, EV_KEY and EV_SYN * or * EV_ABS, EV_KEY, EV_SYN and EV_MSC * or * EV_ABS, EV_KEY, EV_SYN, EV_MSC and EV_REL. * 2) Absolute events are exactly ABS_X and ABS_Y. * 3) Keys are exactly BTN_LEFT, BTN_RIGHT and BTN_MIDDLE. * 4) Device is not on "Amiga" bus. 
*/ bitmap_zero(jd_scratch, EV_CNT); /* VMware VMMouse, HP ILO2 */ __set_bit(EV_ABS, jd_scratch); __set_bit(EV_KEY, jd_scratch); __set_bit(EV_SYN, jd_scratch); if (bitmap_equal(jd_scratch, dev->evbit, EV_CNT)) ev_match = true; /* HP ILO2, AMI BMC firmware */ __set_bit(EV_MSC, jd_scratch); if (bitmap_equal(jd_scratch, dev->evbit, EV_CNT)) ev_match = true; /* VMware Virtual USB Mouse, QEMU USB Tablet, ATEN BMC firmware */ __set_bit(EV_REL, jd_scratch); if (bitmap_equal(jd_scratch, dev->evbit, EV_CNT)) ev_match = true; if (!ev_match) return false; bitmap_zero(jd_scratch, ABS_CNT); __set_bit(ABS_X, jd_scratch); __set_bit(ABS_Y, jd_scratch); if (!bitmap_equal(dev->absbit, jd_scratch, ABS_CNT)) return false; bitmap_zero(jd_scratch, KEY_CNT); __set_bit(BTN_LEFT, jd_scratch); __set_bit(BTN_RIGHT, jd_scratch); __set_bit(BTN_MIDDLE, jd_scratch); if (!bitmap_equal(dev->keybit, jd_scratch, KEY_CNT)) return false; /* * Amiga joystick (amijoy) historically uses left/middle/right * button events. */ if (dev->id.bustype == BUS_AMIGA) return false; return true; } static bool joydev_match(struct input_handler *handler, struct input_dev *dev) { /* Disable blacklisted devices */ if (joydev_dev_is_blacklisted(dev)) return false; /* Avoid absolute mice */ if (joydev_dev_is_absolute_mouse(dev)) return false; return true; } static int joydev_connect(struct input_handler *handler, struct input_dev *dev, const struct input_device_id *id) { struct joydev *joydev; int i, j, t, minor, dev_no; int error; minor = input_get_new_minor(JOYDEV_MINOR_BASE, JOYDEV_MINORS, true); if (minor < 0) { error = minor; pr_err("failed to reserve new minor: %d\n", error); return error; } joydev = kzalloc_obj(struct joydev); if (!joydev) { error = -ENOMEM; goto err_free_minor; } INIT_LIST_HEAD(&joydev->client_list); spin_lock_init(&joydev->client_lock); mutex_init(&joydev->mutex); init_waitqueue_head(&joydev->wait); joydev->exist = true; dev_no = minor; /* Normalize device number if it falls into legacy range */ if (dev_no < JOYDEV_MINOR_BASE + JOYDEV_MINORS) dev_no -= JOYDEV_MINOR_BASE; dev_set_name(&joydev->dev, "js%d", dev_no); joydev->handle.dev = input_get_device(dev); joydev->handle.name = dev_name(&joydev->dev); joydev->handle.handler = handler; joydev->handle.private = joydev; for_each_set_bit(i, dev->absbit, ABS_CNT) { joydev->absmap[i] = joydev->nabs; joydev->abspam[joydev->nabs] = i; joydev->nabs++; } for (i = BTN_JOYSTICK - BTN_MISC; i < KEY_MAX - BTN_MISC + 1; i++) if (test_bit(i + BTN_MISC, dev->keybit)) { joydev->keymap[i] = joydev->nkey; joydev->keypam[joydev->nkey] = i + BTN_MISC; joydev->nkey++; } for (i = 0; i < BTN_JOYSTICK - BTN_MISC; i++) if (test_bit(i + BTN_MISC, dev->keybit)) { joydev->keymap[i] = joydev->nkey; joydev->keypam[joydev->nkey] = i + BTN_MISC; joydev->nkey++; } for (i = 0; i < joydev->nabs; i++) { j = joydev->abspam[i]; if (input_abs_get_max(dev, j) == input_abs_get_min(dev, j)) { joydev->corr[i].type = JS_CORR_NONE; continue; } joydev->corr[i].type = JS_CORR_BROKEN; joydev->corr[i].prec = input_abs_get_fuzz(dev, j); t = (input_abs_get_max(dev, j) + input_abs_get_min(dev, j)) / 2; joydev->corr[i].coef[0] = t - input_abs_get_flat(dev, j); joydev->corr[i].coef[1] = t + input_abs_get_flat(dev, j); t = (input_abs_get_max(dev, j) - input_abs_get_min(dev, j)) / 2 - 2 * input_abs_get_flat(dev, j); if (t) { joydev->corr[i].coef[2] = (1 << 29) / t; joydev->corr[i].coef[3] = (1 << 29) / t; } } joydev->dev.devt = MKDEV(INPUT_MAJOR, minor); joydev->dev.class = &input_class; joydev->dev.parent = &dev->dev; 
joydev->dev.release = joydev_free; device_initialize(&joydev->dev); error = input_register_handle(&joydev->handle); if (error) goto err_free_joydev; cdev_init(&joydev->cdev, &joydev_fops); error = cdev_device_add(&joydev->cdev, &joydev->dev); if (error) goto err_cleanup_joydev; return 0; err_cleanup_joydev: joydev_cleanup(joydev); input_unregister_handle(&joydev->handle); err_free_joydev: put_device(&joydev->dev); err_free_minor: input_free_minor(minor); return error; } static void joydev_disconnect(struct input_handle *handle) { struct joydev *joydev = handle->private; cdev_device_del(&joydev->cdev, &joydev->dev); joydev_cleanup(joydev); input_free_minor(MINOR(joydev->dev.devt)); input_unregister_handle(handle); put_device(&joydev->dev); } static const struct input_device_id joydev_ids[] = { { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, .evbit = { BIT_MASK(EV_ABS) }, .absbit = { BIT_MASK(ABS_X) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, .evbit = { BIT_MASK(EV_ABS) }, .absbit = { BIT_MASK(ABS_Z) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, .evbit = { BIT_MASK(EV_ABS) }, .absbit = { BIT_MASK(ABS_WHEEL) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, .evbit = { BIT_MASK(EV_ABS) }, .absbit = { BIT_MASK(ABS_THROTTLE) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, .evbit = { BIT_MASK(EV_KEY) }, .keybit = {[BIT_WORD(BTN_JOYSTICK)] = BIT_MASK(BTN_JOYSTICK) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, .evbit = { BIT_MASK(EV_KEY) }, .keybit = { [BIT_WORD(BTN_GAMEPAD)] = BIT_MASK(BTN_GAMEPAD) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, .evbit = { BIT_MASK(EV_KEY) }, .keybit = { [BIT_WORD(BTN_TRIGGER_HAPPY)] = BIT_MASK(BTN_TRIGGER_HAPPY) }, }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(input, joydev_ids); static struct input_handler joydev_handler = { .event = joydev_event, .match = joydev_match, .connect = joydev_connect, .disconnect = joydev_disconnect, .legacy_minors = true, .minor = JOYDEV_MINOR_BASE, .name = "joydev", .id_table = joydev_ids, }; static int __init joydev_init(void) { return input_register_handler(&joydev_handler); } static void __exit joydev_exit(void) { input_unregister_handler(&joydev_handler); } module_init(joydev_init); module_exit(joydev_exit); |
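/*
 * Illustrative sketch (not part of the driver above): a minimal userspace
 * consumer of the interface joydev exposes.  The device path /dev/input/js0
 * is an assumption that depends on udev rules; everything else uses the
 * <linux/joystick.h> API this driver implements.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/joystick.h>

int main(void)
{
	struct js_event e;
	unsigned char axes = 0, buttons = 0;
	char name[128] = "unknown";
	int fd = open("/dev/input/js0", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/input/js0");
		return 1;
	}

	ioctl(fd, JSIOCGAXES, &axes);
	ioctl(fd, JSIOCGBUTTONS, &buttons);
	ioctl(fd, JSIOCGNAME(sizeof(name)), name);
	printf("%s: %u axes, %u buttons\n", name, axes, buttons);

	/*
	 * Each read() returns one struct js_event.  Events carrying
	 * JS_EVENT_INIT are the synthetic startup events produced by
	 * joydev_generate_startup_event(); the rest come from joydev_event().
	 */
	while (read(fd, &e, sizeof(e)) == sizeof(e)) {
		printf("%u ms: %s %u = %d%s\n", e.time,
		       (e.type & JS_EVENT_BUTTON) ? "button" : "axis",
		       e.number, e.value,
		       (e.type & JS_EVENT_INIT) ? " (init)" : "");
	}

	close(fd);
	return 0;
}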
// SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/sched.h> #include
<linux/sched/clock.h> #include <linux/init.h> #include <linux/export.h> #include <linux/timer.h> #include <linux/acpi_pmtmr.h> #include <linux/cpufreq.h> #include <linux/delay.h> #include <linux/clocksource.h> #include <linux/kvm_types.h> #include <linux/percpu.h> #include <linux/timex.h> #include <linux/static_key.h> #include <linux/static_call.h> #include <asm/cpuid/api.h> #include <asm/hpet.h> #include <asm/timer.h> #include <asm/vgtod.h> #include <asm/time.h> #include <asm/delay.h> #include <asm/hypervisor.h> #include <asm/nmi.h> #include <asm/x86_init.h> #include <asm/geode.h> #include <asm/apic.h> #include <asm/cpu_device_id.h> #include <asm/i8259.h> #include <asm/msr.h> #include <asm/topology.h> #include <asm/uv/uv.h> #include <asm/sev.h> unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); unsigned int __read_mostly tsc_khz; EXPORT_SYMBOL(tsc_khz); #define KHZ 1000 /* * TSC can be unstable due to cpufreq or due to unsynced TSCs */ static int __read_mostly tsc_unstable; static unsigned int __initdata tsc_early_khz; static DEFINE_STATIC_KEY_FALSE_RO(__use_tsc); int tsc_clocksource_reliable; static int __read_mostly tsc_force_recalibrate; static struct clocksource_base art_base_clk = { .id = CSID_X86_ART, }; static bool have_art; struct cyc2ns { struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */ seqcount_latch_t seq; /* 32 + 4 = 36 */ }; /* fits one cacheline */ static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); static int __init tsc_early_khz_setup(char *buf) { return kstrtouint(buf, 0, &tsc_early_khz); } early_param("tsc_early_khz", tsc_early_khz_setup); __always_inline void __cyc2ns_read(struct cyc2ns_data *data) { int seq, idx; do { seq = this_cpu_read(cyc2ns.seq.seqcount.sequence); idx = seq & 1; data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset); data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul); data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift); } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence))); } __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) { preempt_disable_notrace(); __cyc2ns_read(data); } __always_inline void cyc2ns_read_end(void) { preempt_enable_notrace(); } /* * Accelerators for sched_clock() * convert from cycles(64bits) => nanoseconds (64bits) * basic equation: * ns = cycles / (freq / ns_per_sec) * ns = cycles * (ns_per_sec / freq) * ns = cycles * (10^9 / (cpu_khz * 10^3)) * ns = cycles * (10^6 / cpu_khz) * * Then we use scaling math (suggested by george@mvista.com) to get: * ns = cycles * (10^6 * SC / cpu_khz) / SC * ns = cycles * cyc2ns_scale / SC * * And since SC is a constant power of two, we can convert the div * into a shift. The larger SC is, the more accurate the conversion, but * cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication * (64-bit result) can be used. * * We can use khz divisor instead of mhz to keep a better precision. * (mathieu.desnoyers@polymtl.ca) * * -johnstul@us.ibm.com "math is hard, lets go shopping!" 
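 *
 * Worked example (illustrative numbers, not taken from this file): for a
 * 3 GHz TSC, cpu_khz = 3000000, so ns = cycles / 3.  With a shift of 31 the
 * multiplier is (1000000 << 31) / 3000000, about 715827882, and
 * (cycles * 715827882) >> 31 is cycles / 3 nanoseconds to within roughly a
 * part per billion.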
*/ static __always_inline unsigned long long __cycles_2_ns(unsigned long long cyc) { struct cyc2ns_data data; unsigned long long ns; __cyc2ns_read(&data); ns = data.cyc2ns_offset; ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift); return ns; } static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc) { unsigned long long ns; preempt_disable_notrace(); ns = __cycles_2_ns(cyc); preempt_enable_notrace(); return ns; } static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) { unsigned long long ns_now; struct cyc2ns_data data; struct cyc2ns *c2n; ns_now = cycles_2_ns(tsc_now); /* * Compute a new multiplier as per the above comment and ensure our * time function is continuous; see the comment near struct * cyc2ns_data. */ clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz, NSEC_PER_MSEC, 0); /* * cyc2ns_shift is exported via arch_perf_update_userpage() where it is * not expected to be greater than 31 due to the original published * conversion algorithm shifting a 32-bit value (now specifies a 64-bit * value) - refer perf_event_mmap_page documentation in perf_event.h. */ if (data.cyc2ns_shift == 32) { data.cyc2ns_shift = 31; data.cyc2ns_mul >>= 1; } data.cyc2ns_offset = ns_now - mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift); c2n = per_cpu_ptr(&cyc2ns, cpu); write_seqcount_latch_begin(&c2n->seq); c2n->data[0] = data; write_seqcount_latch(&c2n->seq); c2n->data[1] = data; write_seqcount_latch_end(&c2n->seq); } static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) { unsigned long flags; local_irq_save(flags); sched_clock_idle_sleep_event(); if (khz) __set_cyc2ns_scale(khz, cpu, tsc_now); sched_clock_idle_wakeup_event(); local_irq_restore(flags); } /* * Initialize cyc2ns for boot cpu */ static void __init cyc2ns_init_boot_cpu(void) { struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); seqcount_latch_init(&c2n->seq); __set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc()); } /* * Secondary CPUs do not run through tsc_init(), so set up * all the scale factors for all CPUs, assuming the same * speed as the bootup CPU. */ static void __init cyc2ns_init_secondary_cpus(void) { unsigned int cpu, this_cpu = smp_processor_id(); struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); struct cyc2ns_data *data = c2n->data; for_each_possible_cpu(cpu) { if (cpu != this_cpu) { seqcount_latch_init(&c2n->seq); c2n = per_cpu_ptr(&cyc2ns, cpu); c2n->data[0] = data[0]; c2n->data[1] = data[1]; } } } /* * Scheduler clock - returns current time in nanosec units. */ noinstr u64 native_sched_clock(void) { if (static_branch_likely(&__use_tsc)) { u64 tsc_now = rdtsc(); /* return the value in ns */ return __cycles_2_ns(tsc_now); } /* * Fall back to jiffies if there's no TSC available: * ( But note that we still use it if the TSC is marked * unstable. We do this because unlike Time Of Day, * the scheduler clock tolerates small errors and it's * very important for it to be as fast as the platform * can achieve it. ) */ /* No locking but a rare wrong value is not a big deal: */ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); } /* * Generate a sched_clock if you already have a TSC value. 
*/ u64 native_sched_clock_from_tsc(u64 tsc) { return cycles_2_ns(tsc); } /* We need to define a real function for sched_clock, to override the weak default version */ #ifdef CONFIG_PARAVIRT DEFINE_STATIC_CALL(pv_sched_clock, native_sched_clock); noinstr u64 sched_clock_noinstr(void) { return static_call(pv_sched_clock)(); } bool using_native_sched_clock(void) { return static_call_query(pv_sched_clock) == native_sched_clock; } void paravirt_set_sched_clock(u64 (*func)(void)) { static_call_update(pv_sched_clock, func); } #else u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock"))); bool using_native_sched_clock(void) { return true; } void paravirt_set_sched_clock(u64 (*func)(void)) { } #endif notrace u64 sched_clock(void) { u64 now; preempt_disable_notrace(); now = sched_clock_noinstr(); preempt_enable_notrace(); return now; } int check_tsc_unstable(void) { return tsc_unstable; } EXPORT_SYMBOL_GPL(check_tsc_unstable); #ifdef CONFIG_X86_TSC int __init notsc_setup(char *str) { mark_tsc_unstable("boot parameter notsc"); return 1; } #else /* * disable flag for tsc. Takes effect by clearing the TSC cpu flag * in cpu/common.c */ int __init notsc_setup(char *str) { setup_clear_cpu_cap(X86_FEATURE_TSC); return 1; } #endif __setup("notsc", notsc_setup); enum { TSC_WATCHDOG_AUTO, TSC_WATCHDOG_OFF, TSC_WATCHDOG_ON, }; static int no_sched_irq_time; static int tsc_watchdog; static int __init tsc_setup(char *str) { if (!strcmp(str, "reliable")) tsc_clocksource_reliable = 1; if (!strncmp(str, "noirqtime", 9)) no_sched_irq_time = 1; if (!strcmp(str, "unstable")) mark_tsc_unstable("boot parameter"); if (!strcmp(str, "nowatchdog")) tsc_watchdog = TSC_WATCHDOG_OFF; if (!strcmp(str, "recalibrate")) tsc_force_recalibrate = 1; if (!strcmp(str, "watchdog")) tsc_watchdog = TSC_WATCHDOG_ON; return 1; } __setup("tsc=", tsc_setup); #define MAX_RETRIES 5 #define TSC_DEFAULT_THRESHOLD 0x20000 /* * Read TSC and the reference counters. Take care of any disturbances */ static u64 tsc_read_refs(u64 *p, int hpet) { u64 t1, t2; u64 thresh = tsc_khz ? tsc_khz >> 5 : TSC_DEFAULT_THRESHOLD; int i; for (i = 0; i < MAX_RETRIES; i++) { t1 = get_cycles(); if (hpet) *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; else *p = acpi_pm_read_early(); t2 = get_cycles(); if ((t2 - t1) < thresh) return t2; } return ULLONG_MAX; } /* * Calculate the TSC frequency from HPET reference */ static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2) { u64 tmp; if (hpet2 < hpet1) hpet2 += 0x100000000ULL; hpet2 -= hpet1; tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); do_div(tmp, 1000000); deltatsc = div64_u64(deltatsc, tmp); return (unsigned long) deltatsc; } /* * Calculate the TSC frequency from PMTimer reference */ static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2) { u64 tmp; if (!pm1 && !pm2) return ULONG_MAX; if (pm2 < pm1) pm2 += (u64)ACPI_PM_OVRRUN; pm2 -= pm1; tmp = pm2 * 1000000000LL; do_div(tmp, PMTMR_TICKS_PER_SEC); do_div(deltatsc, tmp); return (unsigned long) deltatsc; } #define CAL_MS 10 #define CAL_LATCH (PIT_TICK_RATE / (1000 / CAL_MS)) #define CAL_PIT_LOOPS 1000 #define CAL2_MS 50 #define CAL2_LATCH (PIT_TICK_RATE / (1000 / CAL2_MS)) #define CAL2_PIT_LOOPS 5000 /* * Try to calibrate the TSC against the Programmable * Interrupt Timer and return the frequency of the TSC * in kHz. * * Return ULONG_MAX on failure to calibrate. 
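 *
 * (Illustrative arithmetic: the PIT gate is held for 'ms' milliseconds, so
 * the accumulated TSC delta divided by 'ms' is the frequency in kHz
 * directly; e.g. a 3 GHz TSC over a 10 ms window accumulates about
 * 30,000,000 cycles, giving 3,000,000 kHz.)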
*/ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) { u64 tsc, t1, t2, delta; unsigned long tscmin, tscmax; int pitcnt; if (!has_legacy_pic()) { /* * Relies on tsc_early_delay_calibrate() to have given us semi * usable udelay(), wait for the same 50ms we would have with * the PIT loop below. */ udelay(10 * USEC_PER_MSEC); udelay(10 * USEC_PER_MSEC); udelay(10 * USEC_PER_MSEC); udelay(10 * USEC_PER_MSEC); udelay(10 * USEC_PER_MSEC); return ULONG_MAX; } /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); /* * Setup CTC channel 2* for mode 0, (interrupt on terminal * count mode), binary count. Set the latch register to 50ms * (LSB then MSB) to begin countdown. */ outb(0xb0, 0x43); outb(latch & 0xff, 0x42); outb(latch >> 8, 0x42); tsc = t1 = t2 = get_cycles(); pitcnt = 0; tscmax = 0; tscmin = ULONG_MAX; while ((inb(0x61) & 0x20) == 0) { t2 = get_cycles(); delta = t2 - tsc; tsc = t2; if ((unsigned long) delta < tscmin) tscmin = (unsigned int) delta; if ((unsigned long) delta > tscmax) tscmax = (unsigned int) delta; pitcnt++; } /* * Sanity checks: * * If we were not able to read the PIT more than loopmin * times, then we have been hit by a massive SMI * * If the maximum is 10 times larger than the minimum, * then we got hit by an SMI as well. */ if (pitcnt < loopmin || tscmax > 10 * tscmin) return ULONG_MAX; /* Calculate the PIT value */ delta = t2 - t1; do_div(delta, ms); return delta; } /* * This reads the current MSB of the PIT counter, and * checks if we are running on sufficiently fast and * non-virtualized hardware. * * Our expectations are: * * - the PIT is running at roughly 1.19MHz * * - each IO is going to take about 1us on real hardware, * but we allow it to be much faster (by a factor of 10) or * _slightly_ slower (ie we allow up to a 2us read+counter * update - anything else implies a unacceptably slow CPU * or PIT for the fast calibration to work. * * - with 256 PIT ticks to read the value, we have 214us to * see the same MSB (and overhead like doing a single TSC * read per MSB value etc). * * - We're doing 2 reads per loop (LSB, MSB), and we expect * them each to take about a microsecond on real hardware. * So we expect a count value of around 100. But we'll be * generous, and accept anything over 50. * * - if the PIT is stuck, and we see *many* more reads, we * return early (and the next caller of pit_expect_msb() * then consider it a failure when they don't see the * next expected value). * * These expectations mean that we know that we have seen the * transition from one expected value to another with a fairly * high accuracy, and we didn't miss any events. We can thus * use the TSC value at the transitions to calculate a pretty * good value for the TSC frequency. */ static inline int pit_verify_msb(unsigned char val) { /* Ignore LSB */ inb(0x42); return inb(0x42) == val; } static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) { int count; u64 tsc = 0, prev_tsc = 0; for (count = 0; count < 50000; count++) { if (!pit_verify_msb(val)) break; prev_tsc = tsc; tsc = get_cycles(); } *deltap = get_cycles() - prev_tsc; *tscp = tsc; /* * We require _some_ success, but the quality control * will be based on the error terms on the TSC values. */ return count > 5; } /* * How many MSB values do we want to see? We aim for * a maximum error rate of 500ppm (in practice the * real error is much smaller), but refuse to spend * more than 50ms on it. 
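 *
 * With PIT_TICK_RATE = 1193182 Hz this works out to at most
 * 50 * 1193182 / 1000 / 256 ~= 233 MSB transitions, and the
 * "d1 + d2 >= delta >> 11" comparison below accepts a result once the
 * measurement jitter is under 1/2048 ~= 488 ppm of the elapsed cycles.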
*/ #define MAX_QUICK_PIT_MS 50 #define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256) static unsigned long quick_pit_calibrate(void) { int i; u64 tsc, delta; unsigned long d1, d2; if (!has_legacy_pic()) return 0; /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); /* * Counter 2, mode 0 (one-shot), binary count * * NOTE! Mode 2 decrements by two (and then the * output is flipped each time, giving the same * final output frequency as a decrement-by-one), * so mode 0 is much better when looking at the * individual counts. */ outb(0xb0, 0x43); /* Start at 0xffff */ outb(0xff, 0x42); outb(0xff, 0x42); /* * The PIT starts counting at the next edge, so we * need to delay for a microsecond. The easiest way * to do that is to just read back the 16-bit counter * once from the PIT. */ pit_verify_msb(0); if (pit_expect_msb(0xff, &tsc, &d1)) { for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) { if (!pit_expect_msb(0xff-i, &delta, &d2)) break; delta -= tsc; /* * Extrapolate the error and fail fast if the error will * never be below 500 ppm. */ if (i == 1 && d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11) return 0; /* * Iterate until the error is less than 500 ppm */ if (d1+d2 >= delta >> 11) continue; /* * Check the PIT one more time to verify that * all TSC reads were stable wrt the PIT. * * This also guarantees serialization of the * last cycle read ('d2') in pit_expect_msb. */ if (!pit_verify_msb(0xfe - i)) break; goto success; } } pr_info("Fast TSC calibration failed\n"); return 0; success: /* * Ok, if we get here, then we've seen the * MSB of the PIT decrement 'i' times, and the * error has shrunk to less than 500 ppm. * * As a result, we can depend on there not being * any odd delays anywhere, and the TSC reads are * reliable (within the error). * * kHz = ticks / time-in-seconds / 1000; * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000) */ delta *= PIT_TICK_RATE; do_div(delta, i*256*1000); pr_info("Fast TSC calibration using PIT\n"); return delta; } /** * native_calibrate_tsc - determine TSC frequency * Determine TSC frequency via CPUID, else return 0. */ unsigned long native_calibrate_tsc(void) { unsigned int eax_denominator, ebx_numerator, ecx_hz, edx; unsigned int crystal_khz; if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC) return 0; eax_denominator = ebx_numerator = ecx_hz = edx = 0; /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */ cpuid(CPUID_LEAF_TSC, &eax_denominator, &ebx_numerator, &ecx_hz, &edx); if (ebx_numerator == 0 || eax_denominator == 0) return 0; crystal_khz = ecx_hz / 1000; /* * Denverton SoCs don't report crystal clock, and also don't support * CPUID_LEAF_FREQ for the calculation below, so hardcode the 25MHz * crystal clock. */ if (crystal_khz == 0 && boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT_D) crystal_khz = 25000; /* * TSC frequency reported directly by CPUID is a "hardware reported" * frequency and is the most accurate one so far we have. This * is considered a known frequency. */ if (crystal_khz != 0) setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); /* * Some Intel SoCs like Skylake and Kabylake don't report the crystal * clock, but we can easily calculate it to a high degree of accuracy * by considering the crystal ratio and the CPU speed. 
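 *
 * Illustrative example: a part reporting a CPUID.16H base frequency of
 * 2400 MHz and a CPUID.15H ratio of 200/2 gives
 * crystal_khz = 2400 * 1000 * 2 / 200 = 24000 kHz (the common 24 MHz
 * crystal), and the TSC frequency returned below becomes
 * 24000 * 200 / 2 = 2400000 kHz.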
*/ if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= CPUID_LEAF_FREQ) { unsigned int eax_base_mhz, ebx, ecx, edx; cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx, &ecx, &edx); crystal_khz = eax_base_mhz * 1000 * eax_denominator / ebx_numerator; } if (crystal_khz == 0) return 0; /* * For Atom SoCs TSC is the only reliable clocksource. * Mark TSC reliable so no watchdog on it. */ if (boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT) setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); #ifdef CONFIG_X86_LOCAL_APIC /* * The local APIC appears to be fed by the core crystal clock * (which sounds entirely sensible). We can set the global * lapic_timer_period here to avoid having to calibrate the APIC * timer later. */ lapic_timer_period = crystal_khz * 1000 / HZ; #endif return crystal_khz * ebx_numerator / eax_denominator; } static unsigned long cpu_khz_from_cpuid(void) { unsigned int eax_base_mhz, ebx_max_mhz, ecx_bus_mhz, edx; if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; if (boot_cpu_data.cpuid_level < CPUID_LEAF_FREQ) return 0; eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0; cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx); return eax_base_mhz * 1000; } /* * calibrate cpu using pit, hpet, and ptimer methods. They are available * later in boot after acpi is initialized. */ static unsigned long pit_hpet_ptimer_calibrate_cpu(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; unsigned long flags, latch, ms; int hpet = is_hpet_enabled(), i, loopmin; /* * Run 5 calibration loops to get the lowest frequency value * (the best estimate). We use two different calibration modes * here: * * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and * load a timeout of 50ms. We read the time right after we * started the timer and wait until the PIT count down reaches * zero. In each wait loop iteration we read the TSC and check * the delta to the previous read. We keep track of the min * and max values of that delta. The delta is mostly defined * by the IO time of the PIT access, so we can detect when * any disturbance happened between the two reads. If the * maximum time is significantly larger than the minimum time, * then we discard the result and have another try. * * 2) Reference counter. If available we use the HPET or the * PMTIMER as a reference to check the sanity of that value. * We use separate TSC readouts and check inside of the * reference read for any possible disturbance. We discard * disturbed values here as well. We do that around the PIT * calibration delay loop as we have to wait for a certain * amount of time anyway. */ /* Preset PIT loop values */ latch = CAL_LATCH; ms = CAL_MS; loopmin = CAL_PIT_LOOPS; for (i = 0; i < 3; i++) { unsigned long tsc_pit_khz; /* * Read the start value and the reference count of * hpet/pmtimer when available. Then do the PIT * calibration, which will take at least 50ms, and * read the end value. */ local_irq_save(flags); tsc1 = tsc_read_refs(&ref1, hpet); tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin); tsc2 = tsc_read_refs(&ref2, hpet); local_irq_restore(flags); /* Pick the lowest PIT TSC calibration so far */ tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); /* hpet or pmtimer available ? 
*/ if (ref1 == ref2) continue; /* Check, whether the sampling was disturbed */ if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) continue; tsc2 = (tsc2 - tsc1) * 1000000LL; if (hpet) tsc2 = calc_hpet_ref(tsc2, ref1, ref2); else tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2); tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2); /* Check the reference deviation */ delta = ((u64) tsc_pit_min) * 100; do_div(delta, tsc_ref_min); /* * If both calibration results are inside a 10% window * then we can be sure, that the calibration * succeeded. We break out of the loop right away. We * use the reference value, as it is more precise. */ if (delta >= 90 && delta <= 110) { pr_info("PIT calibration matches %s. %d loops\n", hpet ? "HPET" : "PMTIMER", i + 1); return tsc_ref_min; } /* * Check whether PIT failed more than once. This * happens in virtualized environments. We need to * give the virtual PC a slightly longer timeframe for * the HPET/PMTIMER to make the result precise. */ if (i == 1 && tsc_pit_min == ULONG_MAX) { latch = CAL2_LATCH; ms = CAL2_MS; loopmin = CAL2_PIT_LOOPS; } } /* * Now check the results. */ if (tsc_pit_min == ULONG_MAX) { /* PIT gave no useful value */ pr_warn("Unable to calibrate against PIT\n"); /* We don't have an alternative source, disable TSC */ if (!hpet && !ref1 && !ref2) { pr_notice("No reference (HPET/PMTIMER) available\n"); return 0; } /* The alternative source failed as well, disable TSC */ if (tsc_ref_min == ULONG_MAX) { pr_warn("HPET/PMTIMER calibration failed\n"); return 0; } /* Use the alternative source */ pr_info("using %s reference calibration\n", hpet ? "HPET" : "PMTIMER"); return tsc_ref_min; } /* We don't have an alternative source, use the PIT calibration value */ if (!hpet && !ref1 && !ref2) { pr_info("Using PIT calibration value\n"); return tsc_pit_min; } /* The alternative source failed, use the PIT calibration value */ if (tsc_ref_min == ULONG_MAX) { pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n"); return tsc_pit_min; } /* * The calibration values differ too much. In doubt, we use * the PIT value as we know that there are PMTIMERs around * running at double speed. At least we let the user know: */ pr_warn("PIT calibration deviates from %s: %lu %lu\n", hpet ? 
"HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); pr_info("Using PIT calibration value\n"); return tsc_pit_min; } /** * native_calibrate_cpu_early - can calibrate the cpu early in boot */ unsigned long native_calibrate_cpu_early(void) { unsigned long flags, fast_calibrate = cpu_khz_from_cpuid(); if (!fast_calibrate) fast_calibrate = cpu_khz_from_msr(); if (!fast_calibrate) { local_irq_save(flags); fast_calibrate = quick_pit_calibrate(); local_irq_restore(flags); } return fast_calibrate; } /** * native_calibrate_cpu - calibrate the cpu */ static unsigned long native_calibrate_cpu(void) { unsigned long tsc_freq = native_calibrate_cpu_early(); if (!tsc_freq) tsc_freq = pit_hpet_ptimer_calibrate_cpu(); return tsc_freq; } void recalibrate_cpu_khz(void) { #ifndef CONFIG_SMP unsigned long cpu_khz_old = cpu_khz; if (!boot_cpu_has(X86_FEATURE_TSC)) return; cpu_khz = x86_platform.calibrate_cpu(); tsc_khz = x86_platform.calibrate_tsc(); if (tsc_khz == 0) tsc_khz = cpu_khz; else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) cpu_khz = tsc_khz; cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, cpu_khz_old, cpu_khz); #endif } EXPORT_SYMBOL_GPL(recalibrate_cpu_khz); static unsigned long long cyc2ns_suspend; void tsc_save_sched_clock_state(void) { if (!static_branch_likely(&__use_tsc) && !sched_clock_stable()) return; cyc2ns_suspend = sched_clock(); } /* * Even on processors with invariant TSC, TSC gets reset in some the * ACPI system sleep states. And in some systems BIOS seem to reinit TSC to * arbitrary value (still sync'd across cpu's) during resume from such sleep * states. To cope up with this, recompute the cyc2ns_offset for each cpu so * that sched_clock() continues from the point where it was left off during * suspend. */ void tsc_restore_sched_clock_state(void) { unsigned long long offset; unsigned long flags; int cpu; if (!static_branch_likely(&__use_tsc) && !sched_clock_stable()) return; local_irq_save(flags); /* * We're coming out of suspend, there's no concurrency yet; don't * bother being nice about the RCU stuff, just write to both * data fields. */ this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0); this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0); offset = cyc2ns_suspend - sched_clock(); for_each_possible_cpu(cpu) { per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset; per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset; } local_irq_restore(flags); } #ifdef CONFIG_CPU_FREQ /* * Frequency scaling support. Adjust the TSC based timer when the CPU frequency * changes. * * NOTE: On SMP the situation is not fixable in general, so simply mark the TSC * as unstable and give up in those cases. * * Should fix up last_tsc too. Currently gettimeofday in the * first tick after the change will be slightly wrong. 
*/ static unsigned int ref_freq; static unsigned long loops_per_jiffy_ref; static unsigned long tsc_khz_ref; static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = data; if (num_online_cpus() > 1) { mark_tsc_unstable("cpufreq changes on SMP"); return 0; } if (!ref_freq) { ref_freq = freq->old; loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy; tsc_khz_ref = tsc_khz; } if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) { boot_cpu_data.loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) mark_tsc_unstable("cpufreq changes"); set_cyc2ns_scale(tsc_khz, freq->policy->cpu, rdtsc()); } return 0; } static struct notifier_block time_cpufreq_notifier_block = { .notifier_call = time_cpufreq_notifier }; static int __init cpufreq_register_tsc_scaling(void) { if (!boot_cpu_has(X86_FEATURE_TSC)) return 0; if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return 0; cpufreq_register_notifier(&time_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); return 0; } core_initcall(cpufreq_register_tsc_scaling); #endif /* CONFIG_CPU_FREQ */ #define ART_MIN_DENOMINATOR (1) /* * If ART is present detect the numerator:denominator to convert to TSC */ static void __init detect_art(void) { unsigned int unused; if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC) return; /* * Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required, * and the TSC counter resets must not occur asynchronously. */ if (boot_cpu_has(X86_FEATURE_HYPERVISOR) || !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) || !boot_cpu_has(X86_FEATURE_TSC_ADJUST) || tsc_async_resets) return; cpuid(CPUID_LEAF_TSC, &art_base_clk.denominator, &art_base_clk.numerator, &art_base_clk.freq_khz, &unused); art_base_clk.freq_khz /= KHZ; if (art_base_clk.denominator < ART_MIN_DENOMINATOR) return; rdmsrq(MSR_IA32_TSC_ADJUST, art_base_clk.offset); /* Make this sticky over multiple CPU init calls */ setup_force_cpu_cap(X86_FEATURE_ART); } /* clocksource code */ static void tsc_resume(struct clocksource *cs) { tsc_verify_tsc_adjust(true); } /* * We used to compare the TSC to the cycle_last value in the clocksource * structure to avoid a nasty time-warp. This can be observed in a * very small window right after one CPU updated cycle_last under * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which * is smaller than the cycle_last reference value due to a TSC which * is slightly behind. This delta is nowhere else observable, but in * that case it results in a forward time jump in the range of hours * due to the unsigned delta calculation of the time keeping core * code, which is necessary to support wrapping clocksources like pm * timer. * * This sanity check is now done in the core timekeeping code. * checking the result of read_tsc() - cycle_last for being negative. * That works because CLOCKSOURCE_MASK(64) does not mask out any bit. 
*/ static u64 read_tsc(struct clocksource *cs) { return (u64)rdtsc_ordered(); } static void tsc_cs_mark_unstable(struct clocksource *cs) { if (tsc_unstable) return; tsc_unstable = 1; if (using_native_sched_clock()) clear_sched_clock_stable(); pr_info("Marking TSC unstable due to clocksource watchdog\n"); } static void tsc_cs_tick_stable(struct clocksource *cs) { if (tsc_unstable) return; if (using_native_sched_clock()) sched_clock_tick_stable(); } static int tsc_cs_enable(struct clocksource *cs) { vclocks_set_used(VDSO_CLOCKMODE_TSC); return 0; } /* * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc() */ static struct clocksource clocksource_tsc_early = { .name = "tsc-early", .rating = 299, .read = read_tsc, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, .id = CSID_X86_TSC_EARLY, .vdso_clock_mode = VDSO_CLOCKMODE_TSC, .enable = tsc_cs_enable, .resume = tsc_resume, .mark_unstable = tsc_cs_mark_unstable, .tick_stable = tsc_cs_tick_stable, .list = LIST_HEAD_INIT(clocksource_tsc_early.list), }; /* * Must mark VALID_FOR_HRES early such that when we unregister tsc_early * this one will immediately take over. We will only register if TSC has * been found good. */ static struct clocksource clocksource_tsc = { .name = "tsc", .rating = 300, .read = read_tsc, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_CAN_INLINE_READ | CLOCK_SOURCE_MUST_VERIFY | CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT, .id = CSID_X86_TSC, .vdso_clock_mode = VDSO_CLOCKMODE_TSC, .enable = tsc_cs_enable, .resume = tsc_resume, .mark_unstable = tsc_cs_mark_unstable, .tick_stable = tsc_cs_tick_stable, .list = LIST_HEAD_INIT(clocksource_tsc.list), }; void mark_tsc_unstable(char *reason) { if (tsc_unstable) return; tsc_unstable = 1; if (using_native_sched_clock()) clear_sched_clock_stable(); pr_info("Marking TSC unstable due to %s\n", reason); clocksource_mark_unstable(&clocksource_tsc_early); clocksource_mark_unstable(&clocksource_tsc); } EXPORT_SYMBOL_GPL(mark_tsc_unstable); static void __init tsc_disable_clocksource_watchdog(void) { if (tsc_watchdog == TSC_WATCHDOG_ON) return; clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY; clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; } static void __init check_system_tsc_reliable(void) { #if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC) if (is_geode_lx()) { /* RTSC counts during suspend */ #define RTSC_SUSP 0x100 unsigned long res_low, res_high; rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); /* Geode_LX - the OLPC CPU has a very reliable TSC */ if (res_low & RTSC_SUSP) tsc_clocksource_reliable = 1; } #endif if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) tsc_clocksource_reliable = 1; /* * Disable the clocksource watchdog when the system has: * - TSC running at constant frequency * - TSC which does not stop in C-States * - the TSC_ADJUST register which allows to detect even minimal * modifications * - not more than four packages */ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && boot_cpu_has(X86_FEATURE_NONSTOP_TSC) && boot_cpu_has(X86_FEATURE_TSC_ADJUST) && topology_max_packages() <= 4) tsc_disable_clocksource_watchdog(); } /* * Make an educated guess if the TSC is trustworthy and synchronized * over all CPUs. 
*/ int unsynchronized_tsc(void) { if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_unstable) return 1; #ifdef CONFIG_SMP if (apic_is_clustered_box()) return 1; #endif if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return 0; if (tsc_clocksource_reliable) return 0; /* * Intel systems are normally all synchronized. * Exceptions must mark TSC as unstable: */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { /* assume multi socket systems are not synchronized: */ if (topology_max_packages() > 1) return 1; } return 0; } static void tsc_refine_calibration_work(struct work_struct *work); static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); /** * tsc_refine_calibration_work - Further refine tsc freq calibration * @work: ignored. * * This functions uses delayed work over a period of a * second to further refine the TSC freq value. Since this is * timer based, instead of loop based, we don't block the boot * process while this longer calibration is done. * * If there are any calibration anomalies (too many SMIs, etc), * or the refined calibration is off by 1% of the fast early * calibration, we throw out the new calibration and use the * early calibration. */ static void tsc_refine_calibration_work(struct work_struct *work) { static u64 tsc_start = ULLONG_MAX, ref_start; static int hpet; u64 tsc_stop, ref_stop, delta; unsigned long freq; int cpu; /* Don't bother refining TSC on unstable systems */ if (tsc_unstable) goto unreg; /* * Since the work is started early in boot, we may be * delayed the first time we expire. So set the workqueue * again once we know timers are working. */ if (tsc_start == ULLONG_MAX) { restart: /* * Only set hpet once, to avoid mixing hardware * if the hpet becomes enabled later. */ hpet = is_hpet_enabled(); tsc_start = tsc_read_refs(&ref_start, hpet); schedule_delayed_work(&tsc_irqwork, HZ); return; } tsc_stop = tsc_read_refs(&ref_stop, hpet); /* hpet or pmtimer available ? */ if (ref_start == ref_stop) goto out; /* Check, whether the sampling was disturbed */ if (tsc_stop == ULLONG_MAX) goto restart; delta = tsc_stop - tsc_start; delta *= 1000000LL; if (hpet) freq = calc_hpet_ref(delta, ref_start, ref_stop); else freq = calc_pmtimer_ref(delta, ref_start, ref_stop); /* Will hit this only if tsc_force_recalibrate has been set */ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { /* Warn if the deviation exceeds 500 ppm */ if (abs(tsc_khz - freq) > (tsc_khz >> 11)) { pr_warn("Warning: TSC freq calibrated by CPUID/MSR differs from what is calibrated by HW timer, please check with vendor!!\n"); pr_info("Previous calibrated TSC freq:\t %lu.%03lu MHz\n", (unsigned long)tsc_khz / 1000, (unsigned long)tsc_khz % 1000); } pr_info("TSC freq recalibrated by [%s]:\t %lu.%03lu MHz\n", hpet ? 
"HPET" : "PM_TIMER", (unsigned long)freq / 1000, (unsigned long)freq % 1000); return; } /* Make sure we're within 1% */ if (abs(tsc_khz - freq) > tsc_khz/100) goto out; tsc_khz = freq; pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n", (unsigned long)tsc_khz / 1000, (unsigned long)tsc_khz % 1000); clocksource_tsc.flags |= CLOCK_SOURCE_CALIBRATED; /* Inform the TSC deadline clockevent devices about the recalibration */ lapic_update_tsc_freq(); /* Update the sched_clock() rate to match the clocksource one */ for_each_possible_cpu(cpu) set_cyc2ns_scale(tsc_khz, cpu, tsc_stop); out: if (tsc_unstable) goto unreg; if (boot_cpu_has(X86_FEATURE_ART)) { have_art = true; clocksource_tsc.base = &art_base_clk; } /* * Transfer the valid for high resolution flag if it was set on the * early TSC already. That guarantees that there is no intermediate * clocksource selected once the early TSC is unregistered. */ if (clocksource_tsc_early.flags & CLOCK_SOURCE_VALID_FOR_HRES) clocksource_tsc.flags |= CLOCK_SOURCE_VALID_FOR_HRES; clocksource_register_khz(&clocksource_tsc, tsc_khz); unreg: clocksource_unregister(&clocksource_tsc_early); } static int __init init_tsc_clocksource(void) { if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz) return 0; if (tsc_unstable) { clocksource_unregister(&clocksource_tsc_early); return 0; } if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3)) clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP; /* * When TSC frequency is known (retrieved via MSR or CPUID), we skip * the refined calibration and directly register it as a clocksource. */ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { if (boot_cpu_has(X86_FEATURE_ART)) { have_art = true; clocksource_tsc.base = &art_base_clk; } clocksource_register_khz(&clocksource_tsc, tsc_khz); clocksource_unregister(&clocksource_tsc_early); if (!tsc_force_recalibrate) return 0; } schedule_delayed_work(&tsc_irqwork, 0); return 0; } /* * We use device_initcall here, to ensure we run after the hpet * is fully initialized, which may occur at fs_initcall time. */ device_initcall(init_tsc_clocksource); static bool __init determine_cpu_tsc_frequencies(bool early) { /* Make sure that cpu and tsc are not already calibrated */ WARN_ON(cpu_khz || tsc_khz); if (early) { cpu_khz = x86_platform.calibrate_cpu(); if (tsc_early_khz) tsc_khz = tsc_early_khz; else tsc_khz = x86_platform.calibrate_tsc(); } else { /* We should not be here with non-native cpu calibration */ WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu); cpu_khz = pit_hpet_ptimer_calibrate_cpu(); } /* * Trust non-zero tsc_khz as authoritative, * and use it to sanity check cpu_khz, * which will be off if system timer is off. 
*/ if (tsc_khz == 0) tsc_khz = cpu_khz; else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) cpu_khz = tsc_khz; if (tsc_khz == 0) return false; pr_info("Detected %lu.%03lu MHz processor\n", (unsigned long)cpu_khz / KHZ, (unsigned long)cpu_khz % KHZ); if (cpu_khz != tsc_khz) { pr_info("Detected %lu.%03lu MHz TSC", (unsigned long)tsc_khz / KHZ, (unsigned long)tsc_khz % KHZ); } return true; } static unsigned long __init get_loops_per_jiffy(void) { u64 lpj = (u64)tsc_khz * KHZ; do_div(lpj, HZ); return lpj; } static void __init tsc_enable_sched_clock(void) { loops_per_jiffy = get_loops_per_jiffy(); use_tsc_delay(); /* Sanitize TSC ADJUST before cyc2ns gets initialized */ tsc_store_and_check_tsc_adjust(true); cyc2ns_init_boot_cpu(); static_branch_enable(&__use_tsc); } void __init tsc_early_init(void) { if (!boot_cpu_has(X86_FEATURE_TSC)) return; /* Don't change UV TSC multi-chassis synchronization */ if (is_early_uv_system()) return; snp_secure_tsc_init(); if (!determine_cpu_tsc_frequencies(true)) return; tsc_enable_sched_clock(); } void __init tsc_init(void) { if (!cpu_feature_enabled(X86_FEATURE_TSC)) { setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; } /* * native_calibrate_cpu_early can only calibrate using methods that are * available early in boot. */ if (x86_platform.calibrate_cpu == native_calibrate_cpu_early) x86_platform.calibrate_cpu = native_calibrate_cpu; if (!tsc_khz) { /* We failed to determine frequencies earlier, try again */ if (!determine_cpu_tsc_frequencies(false)) { mark_tsc_unstable("could not calculate TSC khz"); setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; } tsc_enable_sched_clock(); } cyc2ns_init_secondary_cpus(); if (!no_sched_irq_time) enable_sched_clock_irqtime(); lpj_fine = get_loops_per_jiffy(); check_system_tsc_reliable(); if (unsynchronized_tsc()) { mark_tsc_unstable("TSCs unsynchronized"); return; } if (tsc_clocksource_reliable || tsc_watchdog == TSC_WATCHDOG_OFF) tsc_disable_clocksource_watchdog(); clocksource_register_khz(&clocksource_tsc_early, tsc_khz); detect_art(); } #ifdef CONFIG_SMP /* * Check whether existing calibration data can be reused. */ unsigned long calibrate_delay_is_known(void) { int sibling, cpu = smp_processor_id(); int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC); const struct cpumask *mask = topology_core_cpumask(cpu); /* * If TSC has constant frequency and TSC is synchronized across * sockets then reuse CPU0 calibration. */ if (constant_tsc && !tsc_unstable) return cpu_data(0).loops_per_jiffy; /* * If TSC has constant frequency and TSC is not synchronized across * sockets and this is not the first CPU in the socket, then reuse * the calibration value of an already online CPU on that socket. * * This assumes that CONSTANT_TSC is consistent for all CPUs in a * socket. */ if (!constant_tsc || !mask) return 0; sibling = cpumask_any_but(mask, cpu); if (sibling < nr_cpu_ids) return cpu_data(sibling).loops_per_jiffy; return 0; } #endif |
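/*
 * A minimal user-space sketch of the mult/shift cycles-to-nanoseconds
 * conversion used by the cyc2ns machinery above. The helper names
 * (calc_mult_shift, cyc_to_ns) and the fixed shift of 31 are local to this
 * illustration; the kernel derives its factors with
 * clocks_calc_mult_shift() and applies them with mul_u64_u32_shr().
 */
#include <stdint.h>
#include <stdio.h>

/*
 * Pick mul so that ns ~= cyc * mul >> shift matches cyc * 10^6 / khz.
 * Assumes khz >= 500000 so the multiplier still fits in 32 bits.
 */
static void calc_mult_shift(uint32_t *mul, uint32_t *shift, uint64_t khz)
{
	*shift = 31;
	*mul = (uint32_t)(((uint64_t)1000000 << *shift) / khz);
}

static uint64_t cyc_to_ns(uint64_t cyc, uint32_t mul, uint32_t shift)
{
	/* 64x32 bit multiply, shifted back down (gcc/clang __int128). */
	return (uint64_t)(((unsigned __int128)cyc * mul) >> shift);
}

int main(void)
{
	uint32_t mul, shift;
	uint64_t khz = 3000000;		/* a 3 GHz TSC */

	calc_mult_shift(&mul, &shift, khz);
	/* 3e9 cycles at 3 GHz should come out close to 10^9 ns. */
	printf("mul=%u shift=%u -> %llu ns\n", mul, shift,
	       (unsigned long long)cyc_to_ns(3000000000ULL, mul, shift));
	return 0;
}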
/* * Ext4 orphan inode handling */ #include <linux/fs.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include "ext4.h" #include "ext4_jbd2.h" #define EXT4_MAX_ORPHAN_FILE_BLOCKS 512 static int ext4_orphan_file_add(handle_t *handle, struct inode *inode) { int i, j, start; struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info; int ret = 0; bool found = false; __le32 *bdata; int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb); int looped = 0; /* * Find block with free orphan entry. Use CPU number for a naive hash * for a search start in the orphan file */ start = raw_smp_processor_id()*13 % oi->of_blocks; i = start; do { if (atomic_dec_if_positive(&oi->of_binfo[i].ob_free_entries) >= 0) { found = true; break; } if (++i >= oi->of_blocks) i = 0; } while (i != start); if (!found) { /* * For now we don't grow or shrink orphan file. We just use * whatever was allocated at mke2fs time.
The additional * credits we would have to reserve for each orphan inode * operation just don't seem worth it. */ return -ENOSPC; } ret = ext4_journal_get_write_access(handle, inode->i_sb, oi->of_binfo[i].ob_bh, EXT4_JTR_ORPHAN_FILE); if (ret) { atomic_inc(&oi->of_binfo[i].ob_free_entries); return ret; } bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); /* Find empty slot in a block */ j = 0; do { if (looped) { /* * Did we walk through the block several times without * finding free entry? It is theoretically possible * if entries get constantly allocated and freed or * if the block is corrupted. Avoid indefinite looping * and bail. We'll use orphan list instead. */ if (looped > 3) { atomic_inc(&oi->of_binfo[i].ob_free_entries); return -ENOSPC; } cond_resched(); } while (bdata[j]) { if (++j >= inodes_per_ob) { j = 0; looped++; } } } while (cmpxchg(&bdata[j], (__le32)0, cpu_to_le32(inode->i_ino)) != (__le32)0); EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j; ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE); return ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[i].ob_bh); } /* * ext4_orphan_add() links an unlinked or truncated inode into a list of * such inodes, starting at the superblock, in case we crash before the * file is closed/deleted, or in case the inode truncate spans multiple * transactions and the last transaction is not recovered after a crash. * * At filesystem recovery time, we walk this list deleting unlinked * inodes and truncating linked inodes in ext4_orphan_cleanup(). * * Orphan list manipulation functions must be called under i_rwsem unless * we are just creating the inode or deleting it. */ int ext4_orphan_add(handle_t *handle, struct inode *inode) { struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_iloc iloc; int err = 0, rc; bool dirty = false; if (!sbi->s_journal || is_bad_inode(inode)) return 0; WARN_ON_ONCE(!(inode_state_read_once(inode) & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); if (ext4_inode_orphan_tracked(inode)) return 0; /* * Orphan handling is only valid for files with data blocks * being truncated, or files being unlinked. Note that we either * hold i_rwsem, or the inode can not be referenced from outside, * so i_nlink should not be bumped due to race */ ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); if (sbi->s_orphan_info.of_blocks) { err = ext4_orphan_file_add(handle, inode); /* * Fallback to normal orphan list of orphan file is * out of space */ if (err != -ENOSPC) return err; } BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, EXT4_JTR_NONE); if (err) goto out; err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto out; mutex_lock(&sbi->s_orphan_lock); /* * Due to previous errors inode may be already a part of on-disk * orphan list. If so skip on-disk list modification. 
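 *
 * (For orientation: the on-disk list is singly linked - s_last_orphan in
 * the superblock names the most recently added orphan and each orphan's
 * NEXT_ORPHAN() field names the next one, with 0 terminating the chain -
 * so the insertion below is a simple push at the head.)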
*/ if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) > (le32_to_cpu(sbi->s_es->s_inodes_count))) { /* Insert this inode at the head of the on-disk orphan list */ NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan); lock_buffer(sbi->s_sbh); sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); dirty = true; } list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan); mutex_unlock(&sbi->s_orphan_lock); if (dirty) { err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) err = rc; if (err) { /* * We have to remove inode from in-memory list if * addition to on disk orphan list failed. Stray orphan * list entries can cause panics at unmount time. */ mutex_lock(&sbi->s_orphan_lock); list_del_init(&EXT4_I(inode)->i_orphan); mutex_unlock(&sbi->s_orphan_lock); } } else brelse(iloc.bh); ext4_debug("superblock will point to %llu\n", inode->i_ino); ext4_debug("orphan inode %llu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); out: ext4_std_error(sb, err); return err; } static int ext4_orphan_file_del(handle_t *handle, struct inode *inode) { struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info; __le32 *bdata; int blk, off; int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb); int ret = 0; if (!handle) goto out; blk = EXT4_I(inode)->i_orphan_idx / inodes_per_ob; off = EXT4_I(inode)->i_orphan_idx % inodes_per_ob; if (WARN_ON_ONCE(blk >= oi->of_blocks)) goto out; ret = ext4_journal_get_write_access(handle, inode->i_sb, oi->of_binfo[blk].ob_bh, EXT4_JTR_ORPHAN_FILE); if (ret) goto out; bdata = (__le32 *)(oi->of_binfo[blk].ob_bh->b_data); bdata[off] = 0; atomic_inc(&oi->of_binfo[blk].ob_free_entries); ret = ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[blk].ob_bh); out: ext4_clear_inode_state(inode, EXT4_STATE_ORPHAN_FILE); INIT_LIST_HEAD(&EXT4_I(inode)->i_orphan); return ret; } /* * ext4_orphan_del() removes an unlinked or truncated inode from the list * of such inodes stored on disk, because it is finally being cleaned up. */ int ext4_orphan_del(handle_t *handle, struct inode *inode) { struct list_head *prev; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 ino_next; struct ext4_iloc iloc; int err = 0; if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS)) return 0; WARN_ON_ONCE(!(inode_state_read_once(inode) & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE)) return ext4_orphan_file_del(handle, inode); /* Do this quick check before taking global s_orphan_lock. */ if (list_empty(&ei->i_orphan)) return 0; if (handle) { /* Grab inode buffer early before taking global s_orphan_lock */ err = ext4_reserve_inode_write(handle, inode, &iloc); } mutex_lock(&sbi->s_orphan_lock); ext4_debug("remove inode %llu from orphan list\n", inode->i_ino); prev = ei->i_orphan.prev; list_del_init(&ei->i_orphan); /* If we're on an error path, we may not have a valid * transaction handle with which to update the orphan list on * disk, but we still need to remove the inode from the linked * list in memory. 
*/ if (!handle || err) { mutex_unlock(&sbi->s_orphan_lock); goto out_err; } ino_next = NEXT_ORPHAN(inode); if (prev == &sbi->s_orphan) { ext4_debug("superblock will point to %u\n", ino_next); BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, sbi->s_sbh, EXT4_JTR_NONE); if (err) { mutex_unlock(&sbi->s_orphan_lock); goto out_brelse; } lock_buffer(sbi->s_sbh); sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); ext4_superblock_csum_set(inode->i_sb); unlock_buffer(sbi->s_sbh); mutex_unlock(&sbi->s_orphan_lock); err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); } else { struct ext4_iloc iloc2; struct inode *i_prev = &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; ext4_debug("orphan inode %llu will point to %u\n", i_prev->i_ino, ino_next); err = ext4_reserve_inode_write(handle, i_prev, &iloc2); if (err) { mutex_unlock(&sbi->s_orphan_lock); goto out_brelse; } NEXT_ORPHAN(i_prev) = ino_next; err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2); mutex_unlock(&sbi->s_orphan_lock); } if (err) goto out_brelse; NEXT_ORPHAN(inode) = 0; err = ext4_mark_iloc_dirty(handle, inode, &iloc); out_err: ext4_std_error(inode->i_sb, err); return err; out_brelse: brelse(iloc.bh); goto out_err; } #ifdef CONFIG_QUOTA static int ext4_quota_on_mount(struct super_block *sb, int type) { return dquot_quota_on_mount(sb, rcu_dereference_protected(EXT4_SB(sb)->s_qf_names[type], lockdep_is_held(&sb->s_umount)), EXT4_SB(sb)->s_jquota_fmt, type); } #endif static void ext4_process_orphan(struct inode *inode, int *nr_truncates, int *nr_orphans) { struct super_block *sb = inode->i_sb; int ret; dquot_initialize(inode); if (inode->i_nlink) { if (test_opt(sb, DEBUG)) ext4_msg(sb, KERN_DEBUG, "%s: truncating inode %llu to %lld bytes", __func__, inode->i_ino, inode->i_size); ext4_debug("truncating inode %llu to %lld bytes\n", inode->i_ino, inode->i_size); inode_lock(inode); truncate_inode_pages(inode->i_mapping, inode->i_size); ret = ext4_truncate(inode); if (ret) { /* * We need to clean up the in-core orphan list * manually if ext4_truncate() failed to get a * transaction handle. */ ext4_orphan_del(NULL, inode); ext4_std_error(inode->i_sb, ret); } inode_unlock(inode); (*nr_truncates)++; } else { if (test_opt(sb, DEBUG)) ext4_msg(sb, KERN_DEBUG, "%s: deleting unreferenced inode %llu", __func__, inode->i_ino); ext4_debug("deleting unreferenced inode %llu\n", inode->i_ino); (*nr_orphans)++; } iput(inode); /* The delete magic happens here! */ } /* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at * the superblock) which were deleted from all directories, but held open by * a process at the time of a crash. We walk the list and try to delete these * inodes at recovery time (only with a read-write filesystem). * * In order to keep the orphan inode chain consistent during traversal (in * case of crash during recovery), we link each inode into the superblock * orphan list_head and handle it the same way as an inode deletion during * normal operation (which journals the operations for us). * * We only do an iget() and an iput() on each inode, which is very safe if we * accidentally point at an in-use or already deleted inode. The worst that * can happen in this case is that we get a "bit already cleared" message from * ext4_free_inode(). The only reason we would point at a wrong inode is if * e2fsck was run on this filesystem, and it must have already done the orphan * inode cleanup for us, so we can safely abort without any further action. 
*/ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) { unsigned int s_flags = sb->s_flags; int nr_orphans = 0, nr_truncates = 0; struct inode *inode; int i, j; #ifdef CONFIG_QUOTA int quota_update = 0; #endif __le32 *bdata; struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); if (!es->s_last_orphan && !oi->of_blocks) { ext4_debug("no orphan inodes to clean up\n"); return; } if (bdev_read_only(sb->s_bdev)) { ext4_msg(sb, KERN_ERR, "write access " "unavailable, skipping orphan cleanup"); return; } /* Check if feature set would not allow a r/w mount */ if (!ext4_feature_set_ok(sb, 0)) { ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " "unknown ROCOMPAT features"); return; } if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { /* don't clear list on RO mount w/ errors */ if (es->s_last_orphan && !(s_flags & SB_RDONLY)) { ext4_msg(sb, KERN_INFO, "Errors on filesystem, " "clearing orphan list."); es->s_last_orphan = 0; } ext4_debug("Skipping orphan recovery on fs with errors.\n"); return; } if (s_flags & SB_RDONLY) { ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); sb->s_flags &= ~SB_RDONLY; } #ifdef CONFIG_QUOTA /* * Turn on quotas which were not enabled for read-only mounts if * filesystem has quota feature, so that they are updated correctly. */ if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) { int ret = ext4_enable_quotas(sb); if (!ret) quota_update = 1; else ext4_msg(sb, KERN_ERR, "Cannot turn on quotas: error %d", ret); } /* Turn on journaled quotas used for old sytle */ for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (EXT4_SB(sb)->s_qf_names[i]) { int ret = ext4_quota_on_mount(sb, i); if (!ret) quota_update = 1; else ext4_msg(sb, KERN_ERR, "Cannot turn on journaled " "quota: type %d: error %d", i, ret); } } #endif while (es->s_last_orphan) { /* * We may have encountered an error during cleanup; if * so, skip the rest. */ if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { ext4_debug("Skipping orphan recovery on fs with errors.\n"); es->s_last_orphan = 0; break; } inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); if (IS_ERR(inode)) { es->s_last_orphan = 0; break; } list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); ext4_process_orphan(inode, &nr_truncates, &nr_orphans); } for (i = 0; i < oi->of_blocks; i++) { bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); for (j = 0; j < inodes_per_ob; j++) { if (!bdata[j]) continue; inode = ext4_orphan_get(sb, le32_to_cpu(bdata[j])); if (IS_ERR(inode)) continue; ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE); EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j; ext4_process_orphan(inode, &nr_truncates, &nr_orphans); } } #define PLURAL(x) (x), ((x) == 1) ? 
"" : "s" if (nr_orphans) ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted", PLURAL(nr_orphans)); if (nr_truncates) ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", PLURAL(nr_truncates)); #ifdef CONFIG_QUOTA /* Turn off quotas if they were enabled for orphan cleanup */ if (quota_update) { for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (sb_dqopt(sb)->files[i]) dquot_quota_off(sb, i); } } #endif sb->s_flags = s_flags; /* Restore SB_RDONLY status */ } void ext4_release_orphan_info(struct super_block *sb) { int i; struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; if (!oi->of_blocks) return; for (i = 0; i < oi->of_blocks; i++) brelse(oi->of_binfo[i].ob_bh); kvfree(oi->of_binfo); } static struct ext4_orphan_block_tail *ext4_orphan_block_tail( struct super_block *sb, struct buffer_head *bh) { return (struct ext4_orphan_block_tail *)(bh->b_data + sb->s_blocksize - sizeof(struct ext4_orphan_block_tail)); } static int ext4_orphan_file_block_csum_verify(struct super_block *sb, struct buffer_head *bh) { __u32 calculated; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; struct ext4_orphan_block_tail *ot; __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); if (!ext4_has_feature_metadata_csum(sb)) return 1; ot = ext4_orphan_block_tail(sb, bh); calculated = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); calculated = ext4_chksum(calculated, (__u8 *)bh->b_data, inodes_per_ob * sizeof(__u32)); return le32_to_cpu(ot->ob_checksum) == calculated; } /* This gets called only when checksumming is enabled */ void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh, void *data, size_t size) { struct super_block *sb = EXT4_TRIGGER(triggers)->sb; __u32 csum; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; struct ext4_orphan_block_tail *ot; __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); csum = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); csum = ext4_chksum(csum, (__u8 *)data, inodes_per_ob * sizeof(__u32)); ot = ext4_orphan_block_tail(sb, bh); ot->ob_checksum = cpu_to_le32(csum); } int ext4_init_orphan_info(struct super_block *sb) { struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; struct inode *inode; int i, j; int ret; int free; __le32 *bdata; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); struct ext4_orphan_block_tail *ot; ino_t orphan_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_orphan_file_inum); if (!ext4_has_feature_orphan_file(sb)) return 0; inode = ext4_iget(sb, orphan_ino, EXT4_IGET_SPECIAL); if (IS_ERR(inode)) { ext4_msg(sb, KERN_ERR, "get orphan inode failed"); return PTR_ERR(inode); } /* * This is just an artificial limit to prevent corrupted fs from * consuming absurd amounts of memory when pinning blocks of orphan * file in memory. 
*/ if (inode->i_size > (EXT4_MAX_ORPHAN_FILE_BLOCKS << inode->i_blkbits)) { ext4_msg(sb, KERN_ERR, "orphan file too big: %llu", (unsigned long long)inode->i_size); ret = -EFSCORRUPTED; goto out_put; } oi->of_blocks = inode->i_size >> sb->s_blocksize_bits; oi->of_csum_seed = EXT4_I(inode)->i_csum_seed; oi->of_binfo = kvmalloc_objs(struct ext4_orphan_block, oi->of_blocks); if (!oi->of_binfo) { ret = -ENOMEM; goto out_put; } for (i = 0; i < oi->of_blocks; i++) { oi->of_binfo[i].ob_bh = ext4_bread(NULL, inode, i, 0); if (IS_ERR(oi->of_binfo[i].ob_bh)) { ret = PTR_ERR(oi->of_binfo[i].ob_bh); goto out_free; } if (!oi->of_binfo[i].ob_bh) { ret = -EIO; goto out_free; } ot = ext4_orphan_block_tail(sb, oi->of_binfo[i].ob_bh); if (le32_to_cpu(ot->ob_magic) != EXT4_ORPHAN_BLOCK_MAGIC) { ext4_error(sb, "orphan file block %d: bad magic", i); ret = -EIO; goto out_free; } if (!ext4_orphan_file_block_csum_verify(sb, oi->of_binfo[i].ob_bh)) { ext4_error(sb, "orphan file block %d: bad checksum", i); ret = -EIO; goto out_free; } bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); free = 0; for (j = 0; j < inodes_per_ob; j++) if (bdata[j] == 0) free++; atomic_set(&oi->of_binfo[i].ob_free_entries, free); } iput(inode); return 0; out_free: for (i--; i >= 0; i--) brelse(oi->of_binfo[i].ob_bh); kvfree(oi->of_binfo); out_put: iput(inode); return ret; } int ext4_orphan_file_empty(struct super_block *sb) { struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; int i; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); if (!ext4_has_feature_orphan_file(sb)) return 1; for (i = 0; i < oi->of_blocks; i++) if (atomic_read(&oi->of_binfo[i].ob_free_entries) != inodes_per_ob) return 0; return 1; } |
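/*
 * Editor's note, an illustrative sketch rather than part of the ext4 sources
 * above: ext4_orphan_add() and ext4_orphan_del() maintain the legacy on-disk
 * orphan list as a singly linked chain of inode numbers.  The superblock's
 * s_last_orphan names the head and each orphan inode stores the number of the
 * next orphan where NEXT_ORPHAN() finds it.  The toy types and the
 * toy_read_inode callback below are hypothetical; they only show the
 * traversal order that ext4_orphan_cleanup() follows.
 */
struct toy_inode {
	unsigned int ino;		/* this inode's number */
	unsigned int next_orphan;	/* next inode in the chain, 0 terminates */
};

struct toy_sb {
	unsigned int last_orphan;	/* head of the on-disk orphan chain */
};

static void toy_walk_orphan_chain(const struct toy_sb *sb,
				  const struct toy_inode *(*toy_read_inode)(unsigned int ino))
{
	unsigned int ino = sb->last_orphan;

	while (ino) {
		const struct toy_inode *inode = toy_read_inode(ino);

		/* an orphan with a non-zero link count is truncated, otherwise deleted */
		ino = inode->next_orphan;
	}
}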
| 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfs/bfind.c * * Copyright (C) 2001 * Brad Boyer (flar@allandria.com) * (C) 2003 Ardis Technologies <roman@ardistech.com> * * Search routines for btrees */ #include <linux/slab.h> #include "btree.h" int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) { void *ptr; if (!tree || !fd) return -EINVAL; fd->tree = tree; fd->bnode = NULL; ptr = kzalloc(tree->max_key_len * 2 + 4, GFP_KERNEL); if (!ptr) return -ENOMEM; fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; hfs_dbg("cnid %d, caller %ps\n", tree->cnid, __builtin_return_address(0)); switch (tree->cnid) { case HFS_CAT_CNID: mutex_lock_nested(&tree->tree_lock, CATALOG_BTREE_MUTEX); break; case HFS_EXT_CNID: mutex_lock_nested(&tree->tree_lock, EXTENTS_BTREE_MUTEX); break; case HFS_ATTR_CNID: mutex_lock_nested(&tree->tree_lock, ATTR_BTREE_MUTEX); break; default: return -EINVAL; } return 0; } void hfs_find_exit(struct hfs_find_data *fd) { hfs_bnode_put(fd->bnode); kfree(fd->search_key); hfs_dbg("cnid %d, caller %ps\n", fd->tree->cnid, __builtin_return_address(0)); mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; } /* Find the record in bnode that best matches key (not greater than...)*/ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd) { int cmpval; u16 off, len, keylen; int rec; int b, e; int res; b = 0; e = bnode->num_recs - 1; res = -ENOENT; do { rec = (e + b) / 2; len = hfs_brec_lenoff(bnode, rec, &off); keylen = hfs_brec_keylen(bnode, rec); if (keylen == 0) { res = -EINVAL; goto fail; } hfs_bnode_read(bnode, fd->key, off, keylen); cmpval = bnode->tree->keycmp(fd->key, fd->search_key); if (!cmpval) { e = rec; res = 0; goto done; } if (cmpval < 0) b = rec + 1; else e = rec - 1; } while (b <= e); if (rec != e && e >= 0) { len = hfs_brec_lenoff(bnode, e, &off); keylen = hfs_brec_keylen(bnode, e); if (keylen == 0) { res = -EINVAL; goto fail; } hfs_bnode_read(bnode, fd->key, off, keylen); } done: fd->record = e; fd->keyoffset = off; fd->keylength = keylen; fd->entryoffset = off + keylen; fd->entrylength = len - keylen; fail: return res; } /* Traverse a B*Tree from the root to a leaf finding best fit to key */ /* Return allocated copy of node found, set recnum to best record */ int hfs_brec_find(struct hfs_find_data *fd) { struct hfs_btree *tree; struct hfs_bnode *bnode; u32 nidx, parent; __be32 data; int height, res; fd->record = -1; fd->keyoffset = -1; fd->keylength = -1; fd->entryoffset = -1; fd->entrylength = -1; tree = fd->tree; if (fd->bnode) hfs_bnode_put(fd->bnode); fd->bnode = NULL; nidx = tree->root; if (!nidx) return -ENOENT; height = tree->depth; res = 0; 
parent = 0; for (;;) { bnode = hfs_bnode_find(tree, nidx); if (IS_ERR(bnode)) { res = PTR_ERR(bnode); bnode = NULL; break; } if (bnode->height != height) goto invalid; if (bnode->type != (--height ? HFS_NODE_INDEX : HFS_NODE_LEAF)) goto invalid; bnode->parent = parent; res = __hfs_brec_find(bnode, fd); if (!height) break; if (fd->record < 0) goto release; parent = nidx; hfs_bnode_read(bnode, &data, fd->entryoffset, 4); nidx = be32_to_cpu(data); hfs_bnode_put(bnode); } fd->bnode = bnode; return res; invalid: pr_err("inconsistency in B*Tree (%d,%d,%d,%u,%u)\n", height, bnode->height, bnode->type, nidx, parent); res = -EIO; release: hfs_bnode_put(bnode); return res; } int hfs_brec_read(struct hfs_find_data *fd, void *rec, u32 rec_len) { int res; res = hfs_brec_find(fd); if (res) return res; if (fd->entrylength > rec_len) return -EINVAL; hfs_bnode_read(fd->bnode, rec, fd->entryoffset, fd->entrylength); return 0; } int hfs_brec_goto(struct hfs_find_data *fd, int cnt) { struct hfs_btree *tree; struct hfs_bnode *bnode; int idx, res = 0; u16 off, len, keylen; bnode = fd->bnode; tree = bnode->tree; if (cnt < 0) { cnt = -cnt; while (cnt > fd->record) { cnt -= fd->record + 1; fd->record = bnode->num_recs - 1; idx = bnode->prev; if (!idx) { res = -ENOENT; goto out; } hfs_bnode_put(bnode); bnode = hfs_bnode_find(tree, idx); if (IS_ERR(bnode)) { res = PTR_ERR(bnode); bnode = NULL; goto out; } } fd->record -= cnt; } else { while (cnt >= bnode->num_recs - fd->record) { cnt -= bnode->num_recs - fd->record; fd->record = 0; idx = bnode->next; if (!idx) { res = -ENOENT; goto out; } hfs_bnode_put(bnode); bnode = hfs_bnode_find(tree, idx); if (IS_ERR(bnode)) { res = PTR_ERR(bnode); bnode = NULL; goto out; } } fd->record += cnt; } len = hfs_brec_lenoff(bnode, fd->record, &off); keylen = hfs_brec_keylen(bnode, fd->record); if (keylen == 0) { res = -EINVAL; goto out; } fd->keyoffset = off; fd->keylength = keylen; fd->entryoffset = off + keylen; fd->entrylength = len - keylen; hfs_bnode_read(bnode, fd->key, off, keylen); out: fd->bnode = bnode; return res; } |
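/*
 * Editor's note, an illustrative sketch rather than part of bfind.c above:
 * __hfs_brec_find() is a binary search that settles on the last record whose
 * key is not greater than the search key ("best fit").  The same idea on a
 * plain sorted integer array, with hypothetical names:
 */
static int sketch_best_fit(const int *keys, int num_recs, int search_key)
{
	int b = 0, e = num_recs - 1;

	while (b <= e) {
		int rec = (b + e) / 2;

		if (keys[rec] == search_key)
			return rec;		/* exact match */
		if (keys[rec] < search_key)
			b = rec + 1;		/* candidate, look right for a closer one */
		else
			e = rec - 1;
	}
	/* e is now the last index whose key does not exceed search_key, or -1 */
	return e;
}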
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 | /* SPDX-License-Identifier: GPL-2.0 */ /* File: linux/xattr.h Extended attributes handling. Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org> Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved. Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> */ #ifndef _LINUX_XATTR_H #define _LINUX_XATTR_H #include <linux/slab.h> #include <linux/types.h> #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/rhashtable-types.h> #include <linux/user_namespace.h> #include <uapi/linux/xattr.h> /* List of all open_how "versions". */ #define XATTR_ARGS_SIZE_VER0 16 /* sizeof first published struct */ #define XATTR_ARGS_SIZE_LATEST XATTR_ARGS_SIZE_VER0 struct inode; struct dentry; static inline bool is_posix_acl_xattr(const char *name) { return (strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0); } /* * struct xattr_handler: When @name is set, match attributes with exactly that * name. When @prefix is set instead, match attributes with that prefix and * with a non-empty suffix. */ struct xattr_handler { const char *name; const char *prefix; int flags; /* fs private flags */ bool (*list)(struct dentry *dentry); int (*get)(const struct xattr_handler *, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size); int (*set)(const struct xattr_handler *, struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, size_t size, int flags); }; /** * xattr_handler_can_list - check whether xattr can be listed * @handler: handler for this type of xattr * @dentry: dentry whose inode xattr to list * * Determine whether the xattr associated with @dentry can be listed given * @handler. * * Return: true if xattr can be listed, false if not. 
*/ static inline bool xattr_handler_can_list(const struct xattr_handler *handler, struct dentry *dentry) { return handler && (!handler->list || handler->list(dentry)); } const char *xattr_full_name(const struct xattr_handler *, const char *); struct xattr { const char *name; void *value; size_t value_len; }; ssize_t __vfs_getxattr(struct dentry *, struct inode *, const char *, void *, size_t); ssize_t vfs_getxattr(struct mnt_idmap *, struct dentry *, const char *, void *, size_t); ssize_t vfs_listxattr(struct dentry *d, char *list, size_t size); int __vfs_setxattr(struct mnt_idmap *, struct dentry *, struct inode *, const char *, const void *, size_t, int); int __vfs_setxattr_noperm(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int); int __vfs_setxattr_locked(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int, struct delegated_inode *); int vfs_setxattr(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int); int __vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *); int __vfs_removexattr_locked(struct mnt_idmap *, struct dentry *, const char *, struct delegated_inode *); int vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *); ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size); int vfs_getxattr_alloc(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, char **xattr_value, size_t size, gfp_t flags); int xattr_supports_user_prefix(struct inode *inode); static inline const char *xattr_prefix(const struct xattr_handler *handler) { return handler->prefix ?: handler->name; } struct simple_xattrs { struct rhashtable ht; }; struct simple_xattr { struct rhash_head hash_node; struct rcu_head rcu; char *name; size_t size; char value[] __counted_by(size); }; #define SIMPLE_XATTR_MAX_NR 128 #define SIMPLE_XATTR_MAX_SIZE (128 << 10) struct simple_xattr_limits { atomic_t nr_xattrs; /* current user.* xattr count */ atomic_t xattr_size; /* current total user.* value bytes */ }; static inline void simple_xattr_limits_init(struct simple_xattr_limits *limits) { atomic_set(&limits->nr_xattrs, 0); atomic_set(&limits->xattr_size, 0); } int simple_xattrs_init(struct simple_xattrs *xattrs); struct simple_xattrs *simple_xattrs_alloc(void); struct simple_xattrs *simple_xattrs_lazy_alloc(struct simple_xattrs **xattrsp, const void *value, int flags); void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space); size_t simple_xattr_space(const char *name, size_t size); struct simple_xattr *simple_xattr_alloc(const void *value, size_t size); void simple_xattr_free(struct simple_xattr *xattr); void simple_xattr_free_rcu(struct simple_xattr *xattr); int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size); struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags); int simple_xattr_set_limited(struct simple_xattrs *xattrs, struct simple_xattr_limits *limits, const char *name, const void *value, size_t size, int flags); ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size); int simple_xattr_add(struct simple_xattrs *xattrs, struct simple_xattr *new_xattr); int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name); DEFINE_CLASS(simple_xattr, struct simple_xattr *, if (!IS_ERR_OR_NULL(_T)) simple_xattr_free(_T), simple_xattr_alloc(value, size), const void *value, size_t size) 
DEFINE_CLASS(simple_xattrs, struct simple_xattrs *, if (!IS_ERR_OR_NULL(_T)) { simple_xattrs_free(_T, NULL); kfree(_T); }, simple_xattrs_alloc(), void) #endif /* _LINUX_XATTR_H */ |
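/*
 * Editor's note, an illustrative sketch rather than part of this header: a
 * minimal xattr_handler using the prefix-matching convention documented for
 * struct xattr_handler above.  "examplefs" and the stub bodies are
 * hypothetical; handlers typically receive @name with the prefix already
 * stripped by the VFS.
 */
static int examplefs_xattr_get(const struct xattr_handler *handler,
			       struct dentry *dentry, struct inode *inode,
			       const char *name, void *buffer, size_t size)
{
	return -EOPNOTSUPP;	/* look up the attribute in the filesystem here */
}

static int examplefs_xattr_set(const struct xattr_handler *handler,
			       struct mnt_idmap *idmap, struct dentry *dentry,
			       struct inode *inode, const char *name,
			       const void *buffer, size_t size, int flags)
{
	return -EOPNOTSUPP;	/* store or remove the attribute here */
}

static const struct xattr_handler examplefs_user_xattr_handler = {
	.prefix	= XATTR_USER_PREFIX,	/* matches "user." attributes with a non-empty suffix */
	.get	= examplefs_xattr_get,
	.set	= examplefs_xattr_set,
};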
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_MSR_H #define _ASM_X86_MSR_H #include "msr-index.h" #ifndef __ASSEMBLER__ #include <asm/asm.h> #include <asm/errno.h> #include <asm/cpumask.h> #include <uapi/asm/msr.h> #include <asm/shared/msr.h> #include <linux/types.h> #include <linux/percpu.h> struct msr_info { u32 msr_no; struct msr reg; struct msr __percpu *msrs; int err; }; struct msr_regs_info { u32 *regs; int err; }; struct saved_msr { bool valid; struct msr_info info; }; struct saved_msrs { unsigned int num; struct saved_msr *array; }; /* * Be very careful with includes. This header is prone to include loops. */ #include <asm/atomic.h> #include <linux/tracepoint-defs.h> #ifdef CONFIG_TRACEPOINTS DECLARE_TRACEPOINT(read_msr); DECLARE_TRACEPOINT(write_msr); DECLARE_TRACEPOINT(rdpmc); extern void do_trace_write_msr(u32 msr, u64 val, int failed); extern void do_trace_read_msr(u32 msr, u64 val, int failed); extern void do_trace_rdpmc(u32 msr, u64 val, int failed); #else static inline void do_trace_write_msr(u32 msr, u64 val, int failed) {} static inline void do_trace_read_msr(u32 msr, u64 val, int failed) {} static inline void do_trace_rdpmc(u32 msr, u64 val, int failed) {} #endif /* * __rdmsr() and __wrmsr() are the two primitives which are the bare minimum MSR * accessors and should not have any tracing or other functionality piggybacking * on them - those are *purely* for accessing MSRs and nothing more. So don't even * think of extending them - you will be slapped with a stinking trout or a frozen * shark will reach you, wherever you are! You've been warned. 
*/ static __always_inline u64 __rdmsr(u32 msr) { EAX_EDX_DECLARE_ARGS(val, low, high); asm volatile("1: rdmsr\n" "2:\n" _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR) : EAX_EDX_RET(val, low, high) : "c" (msr)); return EAX_EDX_VAL(val, low, high); } static __always_inline void __wrmsrq(u32 msr, u64 val) { asm volatile("1: wrmsr\n" "2:\n" _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR) : : "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32)) : "memory"); } #define native_rdmsr(msr, val1, val2) \ do { \ u64 __val = __rdmsr((msr)); \ (void)((val1) = (u32)__val); \ (void)((val2) = (u32)(__val >> 32)); \ } while (0) static __always_inline u64 native_rdmsrq(u32 msr) { return __rdmsr(msr); } #define native_wrmsr(msr, low, high) \ __wrmsrq((msr), (u64)(high) << 32 | (low)) #define native_wrmsrq(msr, val) \ __wrmsrq((msr), (val)) static inline u64 native_read_msr(u32 msr) { u64 val; val = __rdmsr(msr); if (tracepoint_enabled(read_msr)) do_trace_read_msr(msr, val, 0); return val; } static inline int native_read_msr_safe(u32 msr, u64 *p) { int err; EAX_EDX_DECLARE_ARGS(val, low, high); asm volatile("1: rdmsr ; xor %[err],%[err]\n" "2:\n\t" _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %[err]) : [err] "=r" (err), EAX_EDX_RET(val, low, high) : "c" (msr)); if (tracepoint_enabled(read_msr)) do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), err); *p = EAX_EDX_VAL(val, low, high); return err; } /* Can be uninlined because referenced by paravirt */ static inline void notrace native_write_msr(u32 msr, u64 val) { native_wrmsrq(msr, val); if (tracepoint_enabled(write_msr)) do_trace_write_msr(msr, val, 0); } /* Can be uninlined because referenced by paravirt */ static inline int notrace native_write_msr_safe(u32 msr, u64 val) { int err; asm volatile("1: wrmsr ; xor %[err],%[err]\n" "2:\n\t" _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_WRMSR_SAFE, %[err]) : [err] "=a" (err) : "c" (msr), "0" ((u32)val), "d" ((u32)(val >> 32)) : "memory"); if (tracepoint_enabled(write_msr)) do_trace_write_msr(msr, val, err); return err; } extern int rdmsr_safe_regs(u32 regs[8]); extern int wrmsr_safe_regs(u32 regs[8]); static inline u64 native_read_pmc(int counter) { EAX_EDX_DECLARE_ARGS(val, low, high); asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter)); if (tracepoint_enabled(rdpmc)) do_trace_rdpmc(counter, EAX_EDX_VAL(val, low, high), 0); return EAX_EDX_VAL(val, low, high); } #ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else #include <linux/errno.h> /* * Access to machine-specific registers (available on 586 and better only) * Note: the rd* operations modify the parameters directly (without using * pointer indirection), this allows gcc to optimize better */ #define rdmsr(msr, low, high) \ do { \ u64 __val = native_read_msr((msr)); \ (void)((low) = (u32)__val); \ (void)((high) = (u32)(__val >> 32)); \ } while (0) static inline void wrmsr(u32 msr, u32 low, u32 high) { native_write_msr(msr, (u64)high << 32 | low); } #define rdmsrq(msr, val) \ ((val) = native_read_msr((msr))) static inline void wrmsrq(u32 msr, u64 val) { native_write_msr(msr, val); } /* wrmsr with exception handling */ static inline int wrmsrq_safe(u32 msr, u64 val) { return native_write_msr_safe(msr, val); } /* rdmsr with exception handling */ #define rdmsr_safe(msr, low, high) \ ({ \ u64 __val; \ int __err = native_read_msr_safe((msr), &__val); \ (*low) = (u32)__val; \ (*high) = (u32)(__val >> 32); \ __err; \ }) static inline int rdmsrq_safe(u32 msr, u64 *p) { return native_read_msr_safe(msr, p); } static __always_inline u64 rdpmc(int counter) { return 
native_read_pmc(counter); } #endif /* !CONFIG_PARAVIRT_XXL */ /* Instruction opcode for WRMSRNS supported in binutils >= 2.40 */ #define ASM_WRMSRNS _ASM_BYTES(0x0f,0x01,0xc6) /* Non-serializing WRMSR, when available. Falls back to a serializing WRMSR. */ static __always_inline void wrmsrns(u32 msr, u64 val) { /* * WRMSR is 2 bytes. WRMSRNS is 3 bytes. Pad WRMSR with a redundant * DS prefix to avoid a trailing NOP. */ asm volatile("1: " ALTERNATIVE("ds wrmsr", ASM_WRMSRNS, X86_FEATURE_WRMSRNS) "2: " _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR) : : "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32))); } /* * Dual u32 version of wrmsrq_safe(): */ static inline int wrmsr_safe(u32 msr, u32 low, u32 high) { return wrmsrq_safe(msr, (u64)high << 32 | low); } struct msr __percpu *msrs_alloc(void); void msrs_free(struct msr __percpu *msrs); int msr_set_bit(u32 msr, u8 bit); int msr_clear_bit(u32 msr, u8 bit); #ifdef CONFIG_SMP int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); int rdmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 *q); int wrmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 q); void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs); void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs); int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); int rdmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q); int wrmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q); int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]); int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]); #else /* CONFIG_SMP */ static inline int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) { rdmsr(msr_no, *l, *h); return 0; } static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) { wrmsr(msr_no, l, h); return 0; } static inline int rdmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) { rdmsrq(msr_no, *q); return 0; } static inline int wrmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 q) { wrmsrq(msr_no, q); return 0; } static inline void rdmsr_on_cpus(const struct cpumask *m, u32 msr_no, struct msr __percpu *msrs) { rdmsr_on_cpu(0, msr_no, raw_cpu_ptr(&msrs->l), raw_cpu_ptr(&msrs->h)); } static inline void wrmsr_on_cpus(const struct cpumask *m, u32 msr_no, struct msr __percpu *msrs) { wrmsr_on_cpu(0, msr_no, raw_cpu_read(msrs->l), raw_cpu_read(msrs->h)); } static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) { return rdmsr_safe(msr_no, l, h); } static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) { return wrmsr_safe(msr_no, l, h); } static inline int rdmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) { return rdmsrq_safe(msr_no, q); } static inline int wrmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q) { return wrmsrq_safe(msr_no, q); } static inline int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) { return rdmsr_safe_regs(regs); } static inline int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) { return wrmsr_safe_regs(regs); } #endif /* CONFIG_SMP */ /* Compatibility wrappers: */ #define rdmsrl(msr, val) rdmsrq(msr, val) #define wrmsrl(msr, val) wrmsrq(msr, val) #define rdmsrl_on_cpu(cpu, msr, q) rdmsrq_on_cpu(cpu, msr, q) #endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_MSR_H */ |
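/*
 * Editor's note, an illustrative sketch rather than part of this header:
 * typical use of the accessors declared above.  MSR_EXAMPLE is a hypothetical
 * register number; real callers use a constant from msr-index.h.
 */
#define MSR_EXAMPLE	0x00000123

static int msr_usage_sketch(void)
{
	u64 val;
	u32 low, high;

	rdmsrq(MSR_EXAMPLE, val);		/* full 64-bit read */
	rdmsr(MSR_EXAMPLE, low, high);		/* same value split into 32-bit halves */

	val = (u64)high << 32 | low;		/* how the halves recombine */

	/* the _safe variant returns non-zero instead of faulting on a bad MSR */
	return wrmsrq_safe(MSR_EXAMPLE, val);
}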
| 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 | // SPDX-License-Identifier: GPL-2.0-only /* * mm/percpu-vm.c - vmalloc area based chunk allocation * * Copyright (C) 2010 SUSE Linux Products GmbH * Copyright (C) 2010 Tejun Heo <tj@kernel.org> * * Chunks are mapped into vmalloc areas and populated page by page. * This is the default chunk allocator. */ #include "internal.h" static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) { /* must not be used on pre-mapped chunk */ WARN_ON(chunk->immutable); return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx)); } /** * pcpu_get_pages - get temp pages array * * Returns pointer to array of pointers to struct page which can be indexed * with pcpu_page_idx(). Note that there is only one array and accesses * should be serialized by pcpu_alloc_mutex. * * RETURNS: * Pointer to temp pages array on success. */ static struct page **pcpu_get_pages(void) { static struct page **pages; size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); lockdep_assert_held(&pcpu_alloc_mutex); if (!pages) pages = pcpu_mem_zalloc(pages_size, GFP_KERNEL); return pages; } /** * pcpu_free_pages - free pages which were allocated for @chunk * @chunk: chunk pages were allocated for * @pages: array of pages to be freed, indexed by pcpu_page_idx() * @page_start: page index of the first page to be freed * @page_end: page index of the last page to be freed + 1 * * Free pages [@page_start and @page_end) in @pages for all units. * The pages were allocated for @chunk. 
*/ static void pcpu_free_pages(struct pcpu_chunk *chunk, struct page **pages, int page_start, int page_end) { unsigned int cpu; int i; for_each_possible_cpu(cpu) { for (i = page_start; i < page_end; i++) { struct page *page = pages[pcpu_page_idx(cpu, i)]; if (page) __free_page(page); } } } /** * pcpu_alloc_pages - allocates pages for @chunk * @chunk: target chunk * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() * @page_start: page index of the first page to be allocated * @page_end: page index of the last page to be allocated + 1 * @gfp: allocation flags passed to the underlying allocator * * Allocate pages [@page_start,@page_end) into @pages for all units. * The allocation is for @chunk. Percpu core doesn't care about the * content of @pages and will pass it verbatim to pcpu_map_pages(). */ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, struct page **pages, int page_start, int page_end, gfp_t gfp) { unsigned int cpu, tcpu; int i; gfp |= __GFP_HIGHMEM; for_each_possible_cpu(cpu) { for (i = page_start; i < page_end; i++) { struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); if (!*pagep) goto err; } } return 0; err: while (--i >= page_start) __free_page(pages[pcpu_page_idx(cpu, i)]); for_each_possible_cpu(tcpu) { if (tcpu == cpu) break; for (i = page_start; i < page_end; i++) __free_page(pages[pcpu_page_idx(tcpu, i)]); } return -ENOMEM; } /** * pcpu_pre_unmap_flush - flush cache prior to unmapping * @chunk: chunk the regions to be flushed belongs to * @page_start: page index of the first page to be flushed * @page_end: page index of the last page to be flushed + 1 * * Pages in [@page_start,@page_end) of @chunk are about to be * unmapped. Flush cache. As each flushing trial can be very * expensive, issue flush on the whole region at once rather than * doing it for each cpu. This could be an overkill but is more * scalable. */ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_cache_vunmap( pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) { vunmap_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT)); } /** * pcpu_unmap_pages - unmap pages out of a pcpu_chunk * @chunk: chunk of interest * @pages: pages array which can be used to pass information to free * @page_start: page index of the first page to unmap * @page_end: page index of the last page to unmap + 1 * * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. * Corresponding elements in @pages were cleared by the caller and can * be used to carry information to pcpu_free_pages() which will be * called after all unmaps are finished. The caller should call * proper pre/post flush functions. 
*/ static void pcpu_unmap_pages(struct pcpu_chunk *chunk, struct page **pages, int page_start, int page_end) { unsigned int cpu; int i; for_each_possible_cpu(cpu) { for (i = page_start; i < page_end; i++) { struct page *page; page = pcpu_chunk_page(chunk, cpu, i); WARN_ON(!page); pages[pcpu_page_idx(cpu, i)] = page; } __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), page_end - page_start); } } /** * pcpu_post_unmap_tlb_flush - flush TLB after unmapping * @chunk: pcpu_chunk the regions to be flushed belong to * @page_start: page index of the first page to be flushed * @page_end: page index of the last page to be flushed + 1 * * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush * TLB for the regions. This can be skipped if the area is to be * returned to vmalloc as vmalloc will handle TLB flushing lazily. * * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once * for the whole region. */ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_tlb_kernel_range( pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } static int __pcpu_map_pages(unsigned long addr, struct page **pages, int nr_pages) { return vmap_pages_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT), PAGE_KERNEL, pages, PAGE_SHIFT, GFP_KERNEL); } /** * pcpu_map_pages - map pages into a pcpu_chunk * @chunk: chunk of interest * @pages: pages array containing pages to be mapped * @page_start: page index of the first page to map * @page_end: page index of the last page to map + 1 * * For each cpu, map pages [@page_start,@page_end) into @chunk. The * caller is responsible for calling pcpu_post_map_flush() after all * mappings are complete. * * This function is responsible for setting up whatever is necessary for * reverse lookup (addr -> chunk). */ static int pcpu_map_pages(struct pcpu_chunk *chunk, struct page **pages, int page_start, int page_end) { unsigned int cpu, tcpu; int i, err; for_each_possible_cpu(cpu) { err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), &pages[pcpu_page_idx(cpu, page_start)], page_end - page_start); if (err < 0) goto err; for (i = page_start; i < page_end; i++) pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], chunk); } return 0; err: for_each_possible_cpu(tcpu) { __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), page_end - page_start); if (tcpu == cpu) break; } pcpu_post_unmap_tlb_flush(chunk, page_start, page_end); return err; } /** * pcpu_post_map_flush - flush cache after mapping * @chunk: pcpu_chunk the regions to be flushed belong to * @page_start: page index of the first page to be flushed * @page_end: page index of the last page to be flushed + 1 * * Pages [@page_start,@page_end) of @chunk have been mapped. Flush * cache. * * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once * for the whole region. */ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_cache_vmap( pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } /** * pcpu_populate_chunk - populate and map an area of a pcpu_chunk * @chunk: chunk of interest * @page_start: the start page * @page_end: the end page * @gfp: allocation flags passed to the underlying memory allocator * * For each cpu, populate and map pages [@page_start,@page_end) into * @chunk. * * CONTEXT: * pcpu_alloc_mutex, does GFP_KERNEL allocation. 
*/ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end, gfp_t gfp) { struct page **pages; pages = pcpu_get_pages(); if (!pages) return -ENOMEM; if (pcpu_alloc_pages(chunk, pages, page_start, page_end, gfp)) return -ENOMEM; if (pcpu_map_pages(chunk, pages, page_start, page_end)) { pcpu_free_pages(chunk, pages, page_start, page_end); return -ENOMEM; } pcpu_post_map_flush(chunk, page_start, page_end); return 0; } /** * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk * @chunk: chunk to depopulate * @page_start: the start page * @page_end: the end page * * For each cpu, depopulate and unmap pages [@page_start,@page_end) * from @chunk. * * Caller is required to call pcpu_post_unmap_tlb_flush() if not returning the * region back to vmalloc() which will lazily flush the tlb. * * CONTEXT: * pcpu_alloc_mutex. */ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end) { struct page **pages; /* * If control reaches here, there must have been at least one * successful population attempt so the temp pages array must * be available now. */ pages = pcpu_get_pages(); BUG_ON(!pages); /* unmap and free */ pcpu_pre_unmap_flush(chunk, page_start, page_end); pcpu_unmap_pages(chunk, pages, page_start, page_end); pcpu_free_pages(chunk, pages, page_start, page_end); } static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) { struct pcpu_chunk *chunk; struct vm_struct **vms; chunk = pcpu_alloc_chunk(gfp); if (!chunk) return NULL; vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, pcpu_nr_groups, pcpu_atom_size); if (!vms) { pcpu_free_chunk(chunk); return NULL; } chunk->data = vms; chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0]; pcpu_stats_chunk_alloc(); trace_percpu_create_chunk(chunk->base_addr); return chunk; } static void pcpu_destroy_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; pcpu_stats_chunk_dealloc(); trace_percpu_destroy_chunk(chunk->base_addr); if (chunk->data) pcpu_free_vm_areas(chunk->data, pcpu_nr_groups); pcpu_free_chunk(chunk); } static struct page *pcpu_addr_to_page(void *addr) { return vmalloc_to_page(addr); } static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) { /* no extra restriction */ return 0; } /** * pcpu_should_reclaim_chunk - determine if a chunk should go into reclaim * @chunk: chunk of interest * * This is the entry point for percpu reclaim. If a chunk qualifies, it is then * isolated and managed in separate lists at the back of pcpu_slot: sidelined * and to_depopulate respectively. The to_depopulate list holds chunks slated * for depopulation. They no longer contribute to pcpu_nr_empty_pop_pages once * they are on this list. Once depopulated, they are moved onto the sidelined * list which enables them to be pulled back in for allocation if no other chunk * can suffice the allocation. */ static bool pcpu_should_reclaim_chunk(struct pcpu_chunk *chunk) { /* do not reclaim either the first chunk or reserved chunk */ if (chunk == pcpu_first_chunk || chunk == pcpu_reserved_chunk) return false; /* * If it is isolated, it may be on the sidelined list so move it back to * the to_depopulate list. If we hit at least 1/4 pages empty pages AND * there is no system-wide shortage of empty pages aside from this * chunk, move it to the to_depopulate list. */ return ((chunk->isolated && chunk->nr_empty_pop_pages) || (pcpu_nr_empty_pop_pages > (PCPU_EMPTY_POP_PAGES_HIGH + chunk->nr_empty_pop_pages) && chunk->nr_empty_pop_pages >= chunk->nr_pages / 4)); } |
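/*
 * Editor's note, an illustrative sketch rather than part of percpu-vm.c
 * above: pcpu_alloc_pages() uses the classic partial-failure unwind, freeing
 * only what was successfully allocated before the failure.  The same pattern
 * reduced to one flat array, with hypothetical alloc_one()/free_one()
 * helpers:
 */
static int sketch_alloc_all(void **objs, int nr,
			    void *(*alloc_one)(void), void (*free_one)(void *))
{
	int i;

	for (i = 0; i < nr; i++) {
		objs[i] = alloc_one();
		if (!objs[i])
			goto err;
	}
	return 0;

err:
	while (--i >= 0)		/* unwind only the successful allocations */
		free_one(objs[i]);
	return -ENOMEM;
}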
| 1 1 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 | /* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/folio_batch.h * * In many places it is efficient to batch an operation up against multiple * folios. A folio_batch is a container which is used for that. */ #ifndef _LINUX_FOLIO_BATCH_H #define _LINUX_FOLIO_BATCH_H #include <linux/types.h> /* 31 pointers + header align the folio_batch structure to a power of two */ #define FOLIO_BATCH_SIZE 31 struct folio; /** * struct folio_batch - A collection of folios. * * The folio_batch is used to amortise the cost of retrieving and * operating on a set of folios. The order of folios in the batch may be * significant (eg delete_from_page_cache_batch()). Some users of the * folio_batch store "exceptional" entries in it which can be removed * by calling folio_batch_remove_exceptionals(). */ struct folio_batch { unsigned char nr; unsigned char i; bool percpu_pvec_drained; struct folio *folios[FOLIO_BATCH_SIZE]; }; /** * folio_batch_init() - Initialise a batch of folios * @fbatch: The folio batch. * * A freshly initialised folio_batch contains zero folios. */ static inline void folio_batch_init(struct folio_batch *fbatch) { fbatch->nr = 0; fbatch->i = 0; fbatch->percpu_pvec_drained = false; } static inline void folio_batch_reinit(struct folio_batch *fbatch) { fbatch->nr = 0; fbatch->i = 0; } static inline unsigned int folio_batch_count(const struct folio_batch *fbatch) { return fbatch->nr; } static inline unsigned int folio_batch_space(const struct folio_batch *fbatch) { return FOLIO_BATCH_SIZE - fbatch->nr; } /** * folio_batch_add() - Add a folio to a batch. * @fbatch: The folio batch. * @folio: The folio to add. * * The folio is added to the end of the batch. * The batch must have previously been initialised using folio_batch_init(). * * Return: The number of slots still available. */ static inline unsigned folio_batch_add(struct folio_batch *fbatch, struct folio *folio) { fbatch->folios[fbatch->nr++] = folio; return folio_batch_space(fbatch); } /** * folio_batch_next - Return the next folio to process. * @fbatch: The folio batch being processed. * * Use this function to implement a queue of folios. * * Return: The next folio in the queue, or NULL if the queue is empty. */ static inline struct folio *folio_batch_next(struct folio_batch *fbatch) { if (fbatch->i == fbatch->nr) return NULL; return fbatch->folios[fbatch->i++]; } void __folio_batch_release(struct folio_batch *fbatch); static inline void folio_batch_release(struct folio_batch *fbatch) { if (folio_batch_count(fbatch)) __folio_batch_release(fbatch); } void folio_batch_remove_exceptionals(struct folio_batch *fbatch); #endif /* _LINUX_FOLIO_BATCH_H */ |
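/*
 * Editor's note, an illustrative sketch rather than part of this header: the
 * usual fill-then-release cycle for a folio_batch, built only from the
 * helpers declared above.  get_next_folio() is a hypothetical producer.
 */
static void folio_batch_usage_sketch(struct folio *(*get_next_folio)(void))
{
	struct folio_batch fbatch;
	struct folio *folio;

	folio_batch_init(&fbatch);

	while ((folio = get_next_folio()) != NULL) {
		/* folio_batch_add() returns the number of slots still free */
		if (!folio_batch_add(&fbatch, folio)) {
			folio_batch_release(&fbatch);	/* drop the references */
			folio_batch_reinit(&fbatch);	/* start an empty batch again */
		}
	}
	folio_batch_release(&fbatch);
}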
| 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM sock #if !defined(_TRACE_SOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SOCK_H #include <net/sock.h> #include <net/ipv6.h> #include <linux/tracepoint.h> #include <linux/ipv6.h> #include <linux/tcp.h> #include <trace/events/net_probe_common.h> #define family_names \ EM(AF_INET) \ EMe(AF_INET6) /* The protocol traced by inet_sock_set_state */ #define inet_protocol_names \ EM(IPPROTO_TCP) \ EM(IPPROTO_SCTP) \ EMe(IPPROTO_MPTCP) #define tcp_state_names \ EM(TCP_ESTABLISHED) \ EM(TCP_SYN_SENT) \ EM(TCP_SYN_RECV) \ EM(TCP_FIN_WAIT1) \ EM(TCP_FIN_WAIT2) \ EM(TCP_TIME_WAIT) \ EM(TCP_CLOSE) \ EM(TCP_CLOSE_WAIT) \ EM(TCP_LAST_ACK) \ EM(TCP_LISTEN) \ EM(TCP_CLOSING) \ EMe(TCP_NEW_SYN_RECV) #define skmem_kind_names \ EM(SK_MEM_SEND) \ EMe(SK_MEM_RECV) /* enums need to be exported to user space */ #undef EM #undef EMe #define EM(a) TRACE_DEFINE_ENUM(a); #define EMe(a) TRACE_DEFINE_ENUM(a); family_names inet_protocol_names tcp_state_names skmem_kind_names #undef EM #undef EMe #define EM(a) { a, #a }, #define EMe(a) { a, #a } #define show_family_name(val) \ __print_symbolic(val, family_names) #define show_inet_protocol_name(val) \ __print_symbolic(val, inet_protocol_names) #define show_tcp_state_name(val) \ __print_symbolic(val, tcp_state_names) #define show_skmem_kind_names(val) \ __print_symbolic(val, skmem_kind_names) TRACE_EVENT(sock_rcvqueue_full, TP_PROTO(struct sock *sk, struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( __field(int, rmem_alloc) __field(unsigned int, truesize) __field(int, sk_rcvbuf) ), TP_fast_assign( __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); __entry->truesize = skb->truesize; __entry->sk_rcvbuf = READ_ONCE(sk->sk_rcvbuf); ), TP_printk("rmem_alloc=%d truesize=%u sk_rcvbuf=%d", __entry->rmem_alloc, __entry->truesize, __entry->sk_rcvbuf) ); TRACE_EVENT(sock_exceed_buf_limit, TP_PROTO(struct sock *sk, struct proto *prot, long allocated, int kind), TP_ARGS(sk, prot, allocated, kind), TP_STRUCT__entry( __array(char, name, 32) __array(long, sysctl_mem, 3) __field(long, allocated) __field(int, sysctl_rmem) __field(int, rmem_alloc) __field(int, sysctl_wmem) __field(int, wmem_alloc) __field(int, wmem_queued) __field(int, kind) ), TP_fast_assign( strscpy(__entry->name, prot->name, 32); __entry->sysctl_mem[0] = READ_ONCE(prot->sysctl_mem[0]); 
__entry->sysctl_mem[1] = READ_ONCE(prot->sysctl_mem[1]); __entry->sysctl_mem[2] = READ_ONCE(prot->sysctl_mem[2]); __entry->allocated = allocated; __entry->sysctl_rmem = sk_get_rmem0(sk, prot); __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); __entry->sysctl_wmem = sk_get_wmem0(sk, prot); __entry->wmem_alloc = refcount_read(&sk->sk_wmem_alloc); __entry->wmem_queued = READ_ONCE(sk->sk_wmem_queued); __entry->kind = kind; ), TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld sysctl_rmem=%d rmem_alloc=%d sysctl_wmem=%d wmem_alloc=%d wmem_queued=%d kind=%s", __entry->name, __entry->sysctl_mem[0], __entry->sysctl_mem[1], __entry->sysctl_mem[2], __entry->allocated, __entry->sysctl_rmem, __entry->rmem_alloc, __entry->sysctl_wmem, __entry->wmem_alloc, __entry->wmem_queued, show_skmem_kind_names(__entry->kind) ) ); TRACE_EVENT(inet_sock_set_state, TP_PROTO(const struct sock *sk, const int oldstate, const int newstate), TP_ARGS(sk, oldstate, newstate), TP_STRUCT__entry( __field(const void *, skaddr) __field(int, oldstate) __field(int, newstate) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u16, protocol) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->skaddr = sk; __entry->oldstate = oldstate; __entry->newstate = newstate; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); ), TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s", show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, show_tcp_state_name(__entry->oldstate), show_tcp_state_name(__entry->newstate)) ); TRACE_EVENT(inet_sk_error_report, TP_PROTO(const struct sock *sk), TP_ARGS(sk), TP_STRUCT__entry( __field(int, error) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u16, protocol) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->error = sk->sk_err; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); ), TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c error=%d", show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, __entry->error) ); TRACE_EVENT(sk_data_ready, TP_PROTO(const struct sock *sk), TP_ARGS(sk), TP_STRUCT__entry( __field(const void *, skaddr) __field(__u16, family) __field(__u16, protocol) __field(unsigned long, ip) ), TP_fast_assign( __entry->skaddr = sk; __entry->family = sk->sk_family; 
__entry->protocol = sk->sk_protocol; __entry->ip = _RET_IP_; ), TP_printk("family=%u protocol=%u func=%ps", __entry->family, __entry->protocol, (void *)__entry->ip) ); /* * sock send/recv msg length */ DECLARE_EVENT_CLASS(sock_msg_length, TP_PROTO(struct sock *sk, int ret, int flags), TP_ARGS(sk, ret, flags), TP_STRUCT__entry( __field(void *, sk) __field(__u16, family) __field(__u16, protocol) __field(int, ret) __field(int, flags) ), TP_fast_assign( __entry->sk = sk; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->ret = ret; __entry->flags = flags; ), TP_printk("sk address = %p, family = %s protocol = %s, length = %d, error = %d, flags = 0x%x", __entry->sk, show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), !(__entry->flags & MSG_PEEK) ? (__entry->ret > 0 ? __entry->ret : 0) : 0, __entry->ret < 0 ? __entry->ret : 0, __entry->flags) ); DEFINE_EVENT(sock_msg_length, sock_send_length, TP_PROTO(struct sock *sk, int ret, int flags), TP_ARGS(sk, ret, flags) ); DEFINE_EVENT(sock_msg_length, sock_recv_length, TP_PROTO(struct sock *sk, int ret, int flags), TP_ARGS(sk, ret, flags) ); #endif /* _TRACE_SOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
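/*
 * Editor's note, an illustrative sketch rather than part of this header: each
 * TRACE_EVENT(name, ...) above generates a trace_<name>() helper for call
 * sites.  A caller in the networking core would emit sock_rcvqueue_full
 * roughly like this; the surrounding function is hypothetical.
 */
static void sketch_report_rcvqueue_full(struct sock *sk, struct sk_buff *skb)
{
	if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf))
		trace_sock_rcvqueue_full(sk, skb);
}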
906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Macros for manipulating and testing page->flags */ #ifndef PAGE_FLAGS_H #define PAGE_FLAGS_H #include <linux/types.h> #include <linux/bug.h> #include <linux/mmdebug.h> #ifndef __GENERATING_BOUNDS_H #include <linux/mm_types.h> #include <generated/bounds.h> #endif /* !__GENERATING_BOUNDS_H */ /* * Various page->flags bits: * * PG_reserved is set for special pages. The "struct page" of such a page * should in general not be touched (e.g. set dirty) except by its owner. * Pages marked as PG_reserved include: * - Pages part of the kernel image (including vDSO) and similar (e.g. BIOS, * initrd, HW tables) * - Pages reserved or allocated early during boot (before the page allocator * was initialized). This includes (depending on the architecture) the * initial vmemmap, initial page tables, crashkernel, elfcorehdr, and much * much more. Once (if ever) freed, PG_reserved is cleared and they will * be given to the page allocator. * - Pages falling into physical memory gaps - not IORESOURCE_SYSRAM. Trying * to read/write these pages might end badly. Don't touch! * - The zero page(s) * - Pages allocated in the context of kexec/kdump (loaded kernel image, * control pages, vmcoreinfo) * - MMIO/DMA pages. Some architectures don't allow to ioremap pages that are * not marked PG_reserved (as they might be in use by somebody else who does * not respect the caching strategy). * - MCA pages on ia64 * - Pages holding CPU notes for POWER Firmware Assisted Dump * - Device memory (e.g. PMEM, DAX, HMM) * Some PG_reserved pages will be excluded from the hibernation image. * PG_reserved does in general not hinder anybody from dumping or swapping * and is no longer required for remap_pfn_range(). ioremap might require it. * Consequently, PG_reserved for a page mapped into user space can indicate * the zero page, the vDSO, MMIO pages or device memory. 
* * The PG_private bitflag is set on pagecache pages if they contain filesystem * specific data (which is normally at page->private). It can be used by * private allocations for its own usage. * * During initiation of disk I/O, PG_locked is set. This bit is set before I/O * and cleared when writeback _starts_ or when read _completes_. PG_writeback * is set before writeback starts and cleared when it finishes. * * PG_locked also pins a page in pagecache, and blocks truncation of the file * while it is held. * * page_waitqueue(page) is a wait queue of all tasks waiting for the page * to become unlocked. * * PG_swapbacked is set when a page uses swap as a backing storage. This are * usually PageAnon or shmem pages but please note that even anonymous pages * might lose their PG_swapbacked flag when they simply can be dropped (e.g. as * a result of MADV_FREE). * * PG_referenced, PG_reclaim are used for page reclaim for anonymous and * file-backed pagecache (see mm/vmscan.c). * * PG_arch_1 is an architecture specific page state bit. The generic code * guarantees that this bit is cleared for a page when it first is entered into * the page cache. * * PG_hwpoison indicates that a page got corrupted in hardware and contains * data with incorrect ECC bits that triggered a machine check. Accessing is * not safe since it may cause another machine check. Don't touch! */ /* * Don't use the pageflags directly. Use the PageFoo macros. * * The page flags field is split into two parts, the main flags area * which extends from the low bits upwards, and the fields area which * extends from the high bits downwards. * * | FIELD | ... | FLAGS | * N-1 ^ 0 * (NR_PAGEFLAGS) * * The fields area is reserved for fields mapping zone, node (for NUMA) and * SPARSEMEM section (for variants of SPARSEMEM that require section ids like * SPARSEMEM_EXTREME with !SPARSEMEM_VMEMMAP). */ enum pageflags { PG_locked, /* Page is locked. Don't touch. */ PG_writeback, /* Page is under writeback */ PG_referenced, PG_uptodate, PG_dirty, PG_lru, PG_head, /* Must be in bit 6 */ PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ PG_active, PG_workingset, PG_owner_priv_1, /* Owner use. If pagecache, fs may use */ PG_owner_2, /* Owner use. If pagecache, fs may use */ PG_arch_1, PG_reserved, PG_private, /* If pagecache, has fs-private data */ PG_private_2, /* If pagecache, has fs aux data */ PG_reclaim, /* To be reclaimed asap */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ PG_dropbehind, /* drop pages on IO completion */ #ifdef CONFIG_MMU PG_mlocked, /* Page is vma mlocked */ #endif #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. Don't touch */ #endif #if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) PG_young, PG_idle, #endif #ifdef CONFIG_ARCH_USES_PG_ARCH_2 PG_arch_2, #endif #ifdef CONFIG_ARCH_USES_PG_ARCH_3 PG_arch_3, #endif __NR_PAGEFLAGS, PG_readahead = PG_reclaim, /* Anonymous memory (and shmem) */ PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */ /* Some filesystems */ PG_checked = PG_owner_priv_1, /* * Depending on the way an anonymous folio can be mapped into a page * table (e.g., single PMD/PUD/CONT of the head page vs. PTE-mapped * THP), PG_anon_exclusive may be set only for the head page or for * tail pages of an anonymous folio. For now, we only expect it to be * set on tail pages for PTE-mapped THP. 
*/ PG_anon_exclusive = PG_owner_2, /* * Set if all buffer heads in the folio are mapped. * Filesystems which do not use BHs can use it for their own purpose. */ PG_mappedtodisk = PG_owner_2, /* Two page bits are conscripted by FS-Cache to maintain local caching * state. These bits are set on pages belonging to the netfs's inodes * when those inodes are being locally cached. */ PG_fscache = PG_private_2, /* page backed by cache */ /* XEN */ /* Pinned in Xen as a read-only pagetable page. */ PG_pinned = PG_owner_priv_1, /* Pinned as part of domain save (see xen_mm_pin_all()). */ PG_savepinned = PG_dirty, /* Has a grant mapping of another (foreign) domain's page. */ PG_foreign = PG_owner_priv_1, /* Remapped by swiotlb-xen. */ PG_xen_remapped = PG_owner_priv_1, #ifdef CONFIG_MIGRATION /* movable_ops page that is isolated for migration */ PG_movable_ops_isolated = PG_reclaim, /* this is a movable_ops page (for selected typed pages only) */ PG_movable_ops = PG_uptodate, #endif /* Only valid for buddy pages. Used to track pages that are reported */ PG_reported = PG_uptodate, #ifdef CONFIG_MEMORY_HOTPLUG /* For self-hosted memmap pages */ PG_vmemmap_self_hosted = PG_owner_priv_1, #endif /* * Flags only valid for compound pages. Stored in first tail page's * flags word. Cannot use the first 8 flags or any flag marked as * PF_ANY. */ /* At least one page in this folio has the hwpoison flag set */ PG_has_hwpoisoned = PG_active, PG_large_rmappable = PG_workingset, /* anon or file-backed */ PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */ }; #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1) #ifndef __GENERATING_BOUNDS_H /* * For tail pages, if the size of struct page is power-of-2 ->compound_info * encodes the mask that converts the address of the tail page address to * the head page address. * * Otherwise, ->compound_info has direct pointer to head pages. */ static __always_inline bool compound_info_has_mask(void) { /* * Limit mask usage to HugeTLB vmemmap optimization (HVO) where it * makes a difference. * * The approach with mask would work in the wider set of conditions, * but it requires validating that struct pages are naturally aligned * for all orders up to the MAX_FOLIO_ORDER, which can be tricky. */ if (!IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP)) return false; return is_power_of_2(sizeof(struct page)); } static __always_inline unsigned long _compound_head(const struct page *page) { unsigned long info = READ_ONCE(page->compound_info); unsigned long mask; if (!compound_info_has_mask()) { /* Bit 0 encodes PageTail() */ if (info & 1) return info - 1; return (unsigned long)page; } /* * If compound_info_has_mask() is true the rest of the info encodes * the mask that converts the address of the tail page to the head page. * * No need to clear bit 0 in the mask as 'page' always has it clear. * * Let's do it in a branchless manner. */ /* Non-tail: -1UL, Tail: 0 */ mask = (info & 1) - 1; /* Non-tail: -1UL, Tail: info */ mask |= info; return (unsigned long)page & mask; } #define compound_head(page) ((typeof(page))_compound_head(page)) static __always_inline void set_compound_head(struct page *tail, const struct page *head, unsigned int order) { unsigned int shift; unsigned long mask; if (!compound_info_has_mask()) { WRITE_ONCE(tail->compound_info, (unsigned long)head | 1); return; } /* * If the size of struct page is power-of-2, bits [shift:0] of the * virtual address of compound head are zero. 
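 *
 * Illustrative example (editorial, not from the original source): with a
 * 64-byte struct page and order == 9, shift == 9 + 6 == 15, so the mask
 * computed below is GENMASK(63, 15) on a 64-bit kernel.  Assuming the
 * head's struct page is naturally aligned for that order (the reason the
 * mask encoding is restricted to HVO in compound_info_has_mask() above),
 * clearing the low 15 bits of any tail page's address yields the address
 * of the head page.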
* * Calculate mask that can be applied to the virtual address of * the tail page to get address of the head page. */ shift = order + order_base_2(sizeof(struct page)); mask = GENMASK(BITS_PER_LONG - 1, shift); /* Bit 0 encodes PageTail() */ WRITE_ONCE(tail->compound_info, mask | 1); } static __always_inline void clear_compound_head(struct page *page) { WRITE_ONCE(page->compound_info, 0); } /** * page_folio - Converts from page to folio. * @p: The page. * * Every page is part of a folio. This function cannot be called on a * NULL pointer. * * Context: No reference, nor lock is required on @page. If the caller * does not hold a reference, this call may race with a folio split, so * it should re-check the folio still contains this page after gaining * a reference on the folio. * Return: The folio which contains this page. */ #define page_folio(p) (_Generic((p), \ const struct page *: (const struct folio *)_compound_head(p), \ struct page *: (struct folio *)_compound_head(p))) /** * folio_page - Return a page from a folio. * @folio: The folio. * @n: The page number to return. * * @n is relative to the start of the folio. This function does not * check that the page number lies within @folio; the caller is presumed * to have a reference to the page. */ #define folio_page(folio, n) (&(folio)->page + (n)) static __always_inline int PageTail(const struct page *page) { return READ_ONCE(page->compound_info) & 1; } static __always_inline int PageCompound(const struct page *page) { return test_bit(PG_head, &page->flags.f) || READ_ONCE(page->compound_info) & 1; } #define PAGE_POISON_PATTERN -1l static inline int PagePoisoned(const struct page *page) { return READ_ONCE(page->flags.f) == PAGE_POISON_PATTERN; } #ifdef CONFIG_DEBUG_VM void page_init_poison(struct page *page, size_t size); #else static inline void page_init_poison(struct page *page, size_t size) { } #endif static const unsigned long *const_folio_flags(const struct folio *folio, unsigned n) { const struct page *page = &folio->page; VM_BUG_ON_PGFLAGS(page->compound_info & 1, page); VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page); return &page[n].flags.f; } static unsigned long *folio_flags(struct folio *folio, unsigned n) { struct page *page = &folio->page; VM_BUG_ON_PGFLAGS(page->compound_info & 1, page); VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page); return &page[n].flags.f; } /* * Page flags policies wrt compound pages * * PF_POISONED_CHECK * check if this struct page poisoned/uninitialized * * PF_ANY: * the page flag is relevant for small, head and tail pages. * * PF_HEAD: * for compound page all operations related to the page flag applied to * head page. * * PF_NO_TAIL: * modifications of the page flag must be done on small or head pages, * checks can be done on tail pages too. * * PF_NO_COMPOUND: * the page flag is not relevant for compound pages. * * PF_SECOND: * the page flag is stored in the first tail page. 
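 *
 * For example (editorial illustration): PG_locked is declared with
 * PF_NO_TAIL below, so PageLocked() may be called on a tail page (the
 * check is redirected to the head page), while __SetPageLocked() or
 * __ClearPageLocked() on a tail page trips the VM_BUG_ON_PGFLAGS()
 * check in the policy macro when CONFIG_DEBUG_VM_PGFLAGS is enabled.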
*/ #define PF_POISONED_CHECK(page) ({ \ VM_BUG_ON_PGFLAGS(PagePoisoned(page), page); \ page; }) #define PF_ANY(page, enforce) PF_POISONED_CHECK(page) #define PF_HEAD(page, enforce) PF_POISONED_CHECK(compound_head(page)) #define PF_NO_TAIL(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \ PF_POISONED_CHECK(compound_head(page)); }) #define PF_NO_COMPOUND(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page); \ PF_POISONED_CHECK(page); }) #define PF_SECOND(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(!PageHead(page), page); \ PF_POISONED_CHECK(&page[1]); }) /* Which page is the flag stored in */ #define FOLIO_PF_ANY 0 #define FOLIO_PF_HEAD 0 #define FOLIO_PF_NO_TAIL 0 #define FOLIO_PF_NO_COMPOUND 0 #define FOLIO_PF_SECOND 1 #define FOLIO_HEAD_PAGE 0 #define FOLIO_SECOND_PAGE 1 /* * Macros to create function definitions for page flags */ #define FOLIO_TEST_FLAG(name, page) \ static __always_inline bool folio_test_##name(const struct folio *folio) \ { return test_bit(PG_##name, const_folio_flags(folio, page)); } #define FOLIO_SET_FLAG(name, page) \ static __always_inline void folio_set_##name(struct folio *folio) \ { set_bit(PG_##name, folio_flags(folio, page)); } #define FOLIO_CLEAR_FLAG(name, page) \ static __always_inline void folio_clear_##name(struct folio *folio) \ { clear_bit(PG_##name, folio_flags(folio, page)); } #define __FOLIO_SET_FLAG(name, page) \ static __always_inline void __folio_set_##name(struct folio *folio) \ { __set_bit(PG_##name, folio_flags(folio, page)); } #define __FOLIO_CLEAR_FLAG(name, page) \ static __always_inline void __folio_clear_##name(struct folio *folio) \ { __clear_bit(PG_##name, folio_flags(folio, page)); } #define FOLIO_TEST_SET_FLAG(name, page) \ static __always_inline bool folio_test_set_##name(struct folio *folio) \ { return test_and_set_bit(PG_##name, folio_flags(folio, page)); } #define FOLIO_TEST_CLEAR_FLAG(name, page) \ static __always_inline bool folio_test_clear_##name(struct folio *folio) \ { return test_and_clear_bit(PG_##name, folio_flags(folio, page)); } #define FOLIO_FLAG(name, page) \ FOLIO_TEST_FLAG(name, page) \ FOLIO_SET_FLAG(name, page) \ FOLIO_CLEAR_FLAG(name, page) #define TESTPAGEFLAG(uname, lname, policy) \ FOLIO_TEST_FLAG(lname, FOLIO_##policy) \ static __always_inline int Page##uname(const struct page *page) \ { return test_bit(PG_##lname, &policy(page, 0)->flags.f); } #define SETPAGEFLAG(uname, lname, policy) \ FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline void SetPage##uname(struct page *page) \ { set_bit(PG_##lname, &policy(page, 1)->flags.f); } #define CLEARPAGEFLAG(uname, lname, policy) \ FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void ClearPage##uname(struct page *page) \ { clear_bit(PG_##lname, &policy(page, 1)->flags.f); } #define __SETPAGEFLAG(uname, lname, policy) \ __FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline void __SetPage##uname(struct page *page) \ { __set_bit(PG_##lname, &policy(page, 1)->flags.f); } #define __CLEARPAGEFLAG(uname, lname, policy) \ __FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void __ClearPage##uname(struct page *page) \ { __clear_bit(PG_##lname, &policy(page, 1)->flags.f); } #define TESTSETFLAG(uname, lname, policy) \ FOLIO_TEST_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline int TestSetPage##uname(struct page *page) \ { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags.f); } #define TESTCLEARFLAG(uname, lname, policy) \ FOLIO_TEST_CLEAR_FLAG(lname, FOLIO_##policy) \ 
static __always_inline int TestClearPage##uname(struct page *page) \ { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags.f); } #define PAGEFLAG(uname, lname, policy) \ TESTPAGEFLAG(uname, lname, policy) \ SETPAGEFLAG(uname, lname, policy) \ CLEARPAGEFLAG(uname, lname, policy) #define __PAGEFLAG(uname, lname, policy) \ TESTPAGEFLAG(uname, lname, policy) \ __SETPAGEFLAG(uname, lname, policy) \ __CLEARPAGEFLAG(uname, lname, policy) #define TESTSCFLAG(uname, lname, policy) \ TESTSETFLAG(uname, lname, policy) \ TESTCLEARFLAG(uname, lname, policy) #define FOLIO_TEST_FLAG_FALSE(name) \ static inline bool folio_test_##name(const struct folio *folio) \ { return false; } #define FOLIO_SET_FLAG_NOOP(name) \ static inline void folio_set_##name(struct folio *folio) { } #define FOLIO_CLEAR_FLAG_NOOP(name) \ static inline void folio_clear_##name(struct folio *folio) { } #define __FOLIO_SET_FLAG_NOOP(name) \ static inline void __folio_set_##name(struct folio *folio) { } #define __FOLIO_CLEAR_FLAG_NOOP(name) \ static inline void __folio_clear_##name(struct folio *folio) { } #define FOLIO_TEST_SET_FLAG_FALSE(name) \ static inline bool folio_test_set_##name(struct folio *folio) \ { return false; } #define FOLIO_TEST_CLEAR_FLAG_FALSE(name) \ static inline bool folio_test_clear_##name(struct folio *folio) \ { return false; } #define FOLIO_FLAG_FALSE(name) \ FOLIO_TEST_FLAG_FALSE(name) \ FOLIO_SET_FLAG_NOOP(name) \ FOLIO_CLEAR_FLAG_NOOP(name) #define TESTPAGEFLAG_FALSE(uname, lname) \ FOLIO_TEST_FLAG_FALSE(lname) \ static inline int Page##uname(const struct page *page) { return 0; } #define SETPAGEFLAG_NOOP(uname, lname) \ FOLIO_SET_FLAG_NOOP(lname) \ static inline void SetPage##uname(struct page *page) { } #define CLEARPAGEFLAG_NOOP(uname, lname) \ FOLIO_CLEAR_FLAG_NOOP(lname) \ static inline void ClearPage##uname(struct page *page) { } #define __CLEARPAGEFLAG_NOOP(uname, lname) \ __FOLIO_CLEAR_FLAG_NOOP(lname) \ static inline void __ClearPage##uname(struct page *page) { } #define TESTSETFLAG_FALSE(uname, lname) \ FOLIO_TEST_SET_FLAG_FALSE(lname) \ static inline int TestSetPage##uname(struct page *page) { return 0; } #define TESTCLEARFLAG_FALSE(uname, lname) \ FOLIO_TEST_CLEAR_FLAG_FALSE(lname) \ static inline int TestClearPage##uname(struct page *page) { return 0; } #define PAGEFLAG_FALSE(uname, lname) TESTPAGEFLAG_FALSE(uname, lname) \ SETPAGEFLAG_NOOP(uname, lname) CLEARPAGEFLAG_NOOP(uname, lname) #define TESTSCFLAG_FALSE(uname, lname) \ TESTSETFLAG_FALSE(uname, lname) TESTCLEARFLAG_FALSE(uname, lname) __PAGEFLAG(Locked, locked, PF_NO_TAIL) FOLIO_FLAG(waiters, FOLIO_HEAD_PAGE) FOLIO_FLAG(referenced, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(referenced, FOLIO_HEAD_PAGE) __FOLIO_SET_FLAG(referenced, FOLIO_HEAD_PAGE) PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD) PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) TESTCLEARFLAG(LRU, lru, PF_HEAD) FOLIO_FLAG(active, FOLIO_HEAD_PAGE) __FOLIO_CLEAR_FLAG(active, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(active, FOLIO_HEAD_PAGE) PAGEFLAG(Workingset, workingset, PF_HEAD) TESTCLEARFLAG(Workingset, workingset, PF_HEAD) PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ /* Xen */ PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND) TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND) PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND); PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND); PAGEFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND) TESTCLEARFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND) 
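/*
 * Editorial illustration (not part of the original header): the macro
 * machinery above generates the flag accessors.  For instance,
 * FOLIO_FLAG(referenced, FOLIO_HEAD_PAGE) roughly expands to:
 *
 *	static __always_inline bool folio_test_referenced(const struct folio *folio)
 *	{ return test_bit(PG_referenced, const_folio_flags(folio, 0)); }
 *	static __always_inline void folio_set_referenced(struct folio *folio)
 *	{ set_bit(PG_referenced, folio_flags(folio, 0)); }
 *	static __always_inline void folio_clear_referenced(struct folio *folio)
 *	{ clear_bit(PG_referenced, folio_flags(folio, 0)); }
 *
 * while PAGEFLAG(Dirty, dirty, PF_HEAD) additionally provides PageDirty(),
 * SetPageDirty() and ClearPageDirty(), which go through the PF_HEAD policy,
 * i.e. they resolve compound_head(page) (with a poison check) and operate
 * on the head page's flags.
 */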
PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) __SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) FOLIO_FLAG(swapbacked, FOLIO_HEAD_PAGE) __FOLIO_CLEAR_FLAG(swapbacked, FOLIO_HEAD_PAGE) __FOLIO_SET_FLAG(swapbacked, FOLIO_HEAD_PAGE) /* * Private page markings that may be used by the filesystem that owns the page * for its own purposes. * - PG_private and PG_private_2 cause release_folio() and co to be invoked */ PAGEFLAG(Private, private, PF_ANY) FOLIO_FLAG(private_2, FOLIO_HEAD_PAGE) /* owner_2 can be set on tail pages for anon memory */ FOLIO_FLAG(owner_2, FOLIO_HEAD_PAGE) /* * Only test-and-set exist for PG_writeback. The unconditional operators are * risky: they bypass page accounting. */ TESTPAGEFLAG(Writeback, writeback, PF_NO_TAIL) TESTSCFLAG(Writeback, writeback, PF_NO_TAIL) FOLIO_FLAG(mappedtodisk, FOLIO_HEAD_PAGE) /* PG_readahead is only used for reads; PG_reclaim is only for writes */ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL) TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL) FOLIO_FLAG(readahead, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(readahead, FOLIO_HEAD_PAGE) FOLIO_FLAG(dropbehind, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(dropbehind, FOLIO_HEAD_PAGE) __FOLIO_SET_FLAG(dropbehind, FOLIO_HEAD_PAGE) #ifdef CONFIG_HIGHMEM /* * Must use a macro here due to header dependency issues. page_zone() is not * available at this point. */ #define PageHighMem(__p) is_highmem_idx(page_zonenum(__p)) #define folio_test_highmem(__f) is_highmem_idx(folio_zonenum(__f)) #else PAGEFLAG_FALSE(HighMem, highmem) #endif #define PhysHighMem(__p) (PageHighMem(phys_to_page(__p))) /* Does kmap_local_folio() only allow access to one page of the folio? */ #ifdef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP #define folio_test_partial_kmap(f) true #else #define folio_test_partial_kmap(f) folio_test_highmem(f) #endif #ifdef CONFIG_SWAP static __always_inline bool folio_test_swapcache(const struct folio *folio) { return folio_test_swapbacked(folio) && test_bit(PG_swapcache, const_folio_flags(folio, 0)); } FOLIO_SET_FLAG(swapcache, FOLIO_HEAD_PAGE) FOLIO_CLEAR_FLAG(swapcache, FOLIO_HEAD_PAGE) #else FOLIO_FLAG_FALSE(swapcache) #endif FOLIO_FLAG(unevictable, FOLIO_HEAD_PAGE) __FOLIO_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE) #ifdef CONFIG_MMU FOLIO_FLAG(mlocked, FOLIO_HEAD_PAGE) __FOLIO_CLEAR_FLAG(mlocked, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(mlocked, FOLIO_HEAD_PAGE) FOLIO_TEST_SET_FLAG(mlocked, FOLIO_HEAD_PAGE) #else FOLIO_FLAG_FALSE(mlocked) __FOLIO_CLEAR_FLAG_NOOP(mlocked) FOLIO_TEST_CLEAR_FLAG_FALSE(mlocked) FOLIO_TEST_SET_FLAG_FALSE(mlocked) #endif #ifdef CONFIG_MEMORY_FAILURE PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) #else PAGEFLAG_FALSE(HWPoison, hwpoison) #define __PG_HWPOISON 0 #endif #ifdef CONFIG_PAGE_IDLE_FLAG #ifdef CONFIG_64BIT FOLIO_TEST_FLAG(young, FOLIO_HEAD_PAGE) FOLIO_SET_FLAG(young, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(young, FOLIO_HEAD_PAGE) FOLIO_FLAG(idle, FOLIO_HEAD_PAGE) #endif /* See page_idle.h for !64BIT workaround */ #else /* !CONFIG_PAGE_IDLE_FLAG */ FOLIO_FLAG_FALSE(young) FOLIO_TEST_CLEAR_FLAG_FALSE(young) FOLIO_FLAG_FALSE(idle) #endif /* * PageReported() is used to track reported free pages within the Buddy * allocator. We can use the non-atomic version of the test and set * operations as both should be shielded with the zone lock to prevent * any possible races on the setting or clearing of the bit. 
*/ __PAGEFLAG(Reported, reported, PF_NO_COMPOUND) #ifdef CONFIG_MEMORY_HOTPLUG PAGEFLAG(VmemmapSelfHosted, vmemmap_self_hosted, PF_ANY) #else PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) #endif /* * On an anonymous folio mapped into a user virtual memory area, * folio->mapping points to its anon_vma, not to a struct address_space; * with the FOLIO_MAPPING_ANON bit set to distinguish it. See rmap.h. * * On an anonymous folio in a VM_MERGEABLE area, if CONFIG_KSM is enabled, * the FOLIO_MAPPING_ANON_KSM bit may be set along with the FOLIO_MAPPING_ANON * bit; and then folio->mapping points, not to an anon_vma, but to a private * structure which KSM associates with that merged folio. See ksm.h. * * Please note that, confusingly, "folio_mapping" refers to the inode * address_space which maps the folio from disk; whereas "folio_mapped" * refers to user virtual address space into which the folio is mapped. * * For slab pages, since slab reuses the bits in struct page to store its * internal states, the folio->mapping does not exist as such, nor do * these flags below. So in order to avoid testing non-existent bits, * please make sure that folio_test_slab(folio) actually evaluates to * false before calling the following functions (e.g., folio_test_anon). * See mm/slab.h. */ #define FOLIO_MAPPING_ANON 0x1 #define FOLIO_MAPPING_ANON_KSM 0x2 #define FOLIO_MAPPING_KSM (FOLIO_MAPPING_ANON | FOLIO_MAPPING_ANON_KSM) #define FOLIO_MAPPING_FLAGS (FOLIO_MAPPING_ANON | FOLIO_MAPPING_ANON_KSM) static __always_inline bool folio_test_anon(const struct folio *folio) { return ((unsigned long)folio->mapping & FOLIO_MAPPING_ANON) != 0; } static __always_inline bool folio_test_lazyfree(const struct folio *folio) { return folio_test_anon(folio) && !folio_test_swapbacked(folio); } static __always_inline bool PageAnonNotKsm(const struct page *page) { unsigned long flags = (unsigned long)page_folio(page)->mapping; return (flags & FOLIO_MAPPING_FLAGS) == FOLIO_MAPPING_ANON; } static __always_inline bool PageAnon(const struct page *page) { return folio_test_anon(page_folio(page)); } #ifdef CONFIG_KSM /* * A KSM page is one of those write-protected "shared pages" or "merged pages" * which KSM maps into multiple mms, wherever identical anonymous page content * is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any * anon_vma, but to that page's node of the stable tree. */ static __always_inline bool folio_test_ksm(const struct folio *folio) { return ((unsigned long)folio->mapping & FOLIO_MAPPING_FLAGS) == FOLIO_MAPPING_KSM; } #else FOLIO_TEST_FLAG_FALSE(ksm) #endif u64 stable_page_flags(const struct page *page); /** * folio_xor_flags_has_waiters - Change some folio flags. * @folio: The folio. * @mask: Bits set in this word will be changed. * * This must only be used for flags which are changed with the folio * lock held. For example, it is unsafe to use for PG_dirty as that * can be set without the folio lock held. It can also only be used * on flags which are in the range 0-6 as some of the implementations * only affect those bits. * * Return: Whether there are tasks waiting on the folio. */ static inline bool folio_xor_flags_has_waiters(struct folio *folio, unsigned long mask) { return xor_unlock_is_negative_byte(mask, folio_flags(folio, 0)); } /** * folio_test_uptodate - Is this folio up to date? * @folio: The folio. * * The uptodate flag is set on a folio when every byte in the folio is * at least as new as the corresponding bytes on storage. Anonymous * and CoW folios are always uptodate. 
If the folio is not uptodate, * some of the bytes in it may be; see the is_partially_uptodate() * address_space operation. */ static inline bool folio_test_uptodate(const struct folio *folio) { bool ret = test_bit(PG_uptodate, const_folio_flags(folio, 0)); /* * Must ensure that the data we read out of the folio is loaded * _after_ we've loaded folio->flags to check the uptodate bit. * We can skip the barrier if the folio is not uptodate, because * we wouldn't be reading anything from it. * * See folio_mark_uptodate() for the other side of the story. */ if (ret) smp_rmb(); return ret; } static inline bool PageUptodate(const struct page *page) { return folio_test_uptodate(page_folio(page)); } static __always_inline void __folio_mark_uptodate(struct folio *folio) { smp_wmb(); __set_bit(PG_uptodate, folio_flags(folio, 0)); } static __always_inline void folio_mark_uptodate(struct folio *folio) { /* * Memory barrier must be issued before setting the PG_uptodate bit, * so that all previous stores issued in order to bring the folio * uptodate are actually visible before folio_test_uptodate becomes true. */ smp_wmb(); set_bit(PG_uptodate, folio_flags(folio, 0)); } static __always_inline void __SetPageUptodate(struct page *page) { __folio_mark_uptodate((struct folio *)page); } static __always_inline void SetPageUptodate(struct page *page) { folio_mark_uptodate((struct folio *)page); } CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL) void __folio_start_writeback(struct folio *folio, bool keep_write); void set_page_writeback(struct page *page); #define folio_start_writeback(folio) \ __folio_start_writeback(folio, false) static __always_inline bool folio_test_head(const struct folio *folio) { return test_bit(PG_head, const_folio_flags(folio, FOLIO_PF_ANY)); } static __always_inline int PageHead(const struct page *page) { PF_POISONED_CHECK(page); return test_bit(PG_head, &page->flags.f); } __SETPAGEFLAG(Head, head, PF_ANY) __CLEARPAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) /** * folio_test_large() - Does this folio contain more than one page? * @folio: The folio to test. * * Return: True if the folio is larger than one page. */ static inline bool folio_test_large(const struct folio *folio) { return folio_test_head(folio); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void ClearPageCompound(struct page *page) { BUG_ON(!PageHead(page)); ClearPageHead(page); } FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE) FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE) #else FOLIO_FLAG_FALSE(large_rmappable) FOLIO_FLAG_FALSE(partially_mapped) #endif #define PG_head_mask ((1UL << PG_head)) #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * PageTransCompound returns true for both transparent huge pages * and hugetlbfs pages, so it should only be called when it's known * that hugetlbfs pages aren't involved. */ static inline int PageTransCompound(const struct page *page) { return PageCompound(page); } #else TESTPAGEFLAG_FALSE(TransCompound, transcompound) #endif #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE) /* * PageHasHWPoisoned indicates that at least one subpage is hwpoisoned in the * compound page. * * This flag is set by hwpoison handler. Cleared by THP split or free page. */ FOLIO_FLAG(has_hwpoisoned, FOLIO_SECOND_PAGE) #else FOLIO_FLAG_FALSE(has_hwpoisoned) #endif /* * For pages that do not use mapcount, page_type may be used. * The low 24 bits of pagetype may be used for your own purposes, as long * as you are careful to not affect the top 8 bits. 
The low bits of * pagetype will be overwritten when you clear the page_type from the page. */ enum pagetype { /* 0x00-0x7f are positive numbers, ie mapcount */ /* Reserve 0x80-0xef for mapcount overflow. */ PGTY_buddy = 0xf0, PGTY_offline = 0xf1, PGTY_table = 0xf2, PGTY_guard = 0xf3, PGTY_hugetlb = 0xf4, PGTY_slab = 0xf5, PGTY_zsmalloc = 0xf6, PGTY_unaccepted = 0xf7, PGTY_large_kmalloc = 0xf8, PGTY_netpp = 0xf9, PGTY_mapcount_underflow = 0xff }; static inline bool page_type_has_type(int page_type) { return page_type < (PGTY_mapcount_underflow << 24); } /* This takes a mapcount which is one more than page->_mapcount */ static inline bool page_mapcount_is_type(unsigned int mapcount) { return page_type_has_type(mapcount - 1); } static inline bool page_has_type(const struct page *page) { return page_type_has_type(data_race(page->page_type)); } #define FOLIO_TYPE_OPS(lname, fname) \ static __always_inline bool folio_test_##fname(const struct folio *folio) \ { \ return data_race(folio->page.page_type >> 24) == PGTY_##lname; \ } \ static __always_inline void __folio_set_##fname(struct folio *folio) \ { \ if (folio_test_##fname(folio)) \ return; \ VM_BUG_ON_FOLIO(data_race(folio->page.page_type) != UINT_MAX, \ folio); \ folio->page.page_type = (unsigned int)PGTY_##lname << 24; \ } \ static __always_inline void __folio_clear_##fname(struct folio *folio) \ { \ if (folio->page.page_type == UINT_MAX) \ return; \ VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \ folio->page.page_type = UINT_MAX; \ } #define PAGE_TYPE_OPS(uname, lname, fname) \ FOLIO_TYPE_OPS(lname, fname) \ static __always_inline int Page##uname(const struct page *page) \ { \ return data_race(page->page_type >> 24) == PGTY_##lname; \ } \ static __always_inline void __SetPage##uname(struct page *page) \ { \ if (Page##uname(page)) \ return; \ VM_BUG_ON_PAGE(data_race(page->page_type) != UINT_MAX, page); \ page->page_type = (unsigned int)PGTY_##lname << 24; \ } \ static __always_inline void __ClearPage##uname(struct page *page) \ { \ if (page->page_type == UINT_MAX) \ return; \ VM_BUG_ON_PAGE(!Page##uname(page), page); \ page->page_type = UINT_MAX; \ } /* * PageBuddy() indicates that the page is free and in the buddy system * (see mm/page_alloc.c). */ PAGE_TYPE_OPS(Buddy, buddy, buddy) /* * PageOffline() indicates that the page is logically offline although the * containing section is online. (e.g. inflated in a balloon driver or * not onlined when onlining the section). * The content of these pages is effectively stale. Such pages should not * be touched (read/write/dump/save) except by their owner. * * When a memory block gets onlined, all pages are initialized with a * refcount of 1 and PageOffline(). generic_online_page() will * take care of clearing PageOffline(). * * If a driver wants to allow to offline unmovable PageOffline() pages without * putting them back to the buddy, it can do so via the memory notifier by * decrementing the reference count in MEM_GOING_OFFLINE and incrementing the * reference count in MEM_CANCEL_OFFLINE. When offlining, the PageOffline() * pages (now with a reference count of zero) are treated like free (unmanaged) * pages, allowing the containing memory block to get offlined. A driver that * relies on this feature is aware that re-onlining the memory block will * require not giving them to the buddy via generic_online_page(). 
* * Memory offlining code will not adjust the managed page count for any * PageOffline() pages, treating them like they were never exposed to the * buddy using generic_online_page(). * * There are drivers that mark a page PageOffline() and expect there won't be * any further access to page content. PFN walkers that read content of random * pages should check PageOffline() and synchronize with such drivers using * page_offline_freeze()/page_offline_thaw(). */ PAGE_TYPE_OPS(Offline, offline, offline) extern void page_offline_freeze(void); extern void page_offline_thaw(void); extern void page_offline_begin(void); extern void page_offline_end(void); /* * Marks pages in use as page tables. */ PAGE_TYPE_OPS(Table, table, pgtable) /* * Marks guardpages used with debug_pagealloc. */ PAGE_TYPE_OPS(Guard, guard, guard) PAGE_TYPE_OPS(Slab, slab, slab) #ifdef CONFIG_HUGETLB_PAGE FOLIO_TYPE_OPS(hugetlb, hugetlb) #else FOLIO_TEST_FLAG_FALSE(hugetlb) #endif PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) /* * Mark pages that has to be accepted before touched for the first time. * * Serialized with zone lock. */ PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted) PAGE_TYPE_OPS(LargeKmalloc, large_kmalloc, large_kmalloc) /* * Marks page_pool allocated pages. */ PAGE_TYPE_OPS(Netpp, netpp, netpp) /** * PageHuge - Determine if the page belongs to hugetlbfs * @page: The page to test. * * Context: Any context. * Return: True for hugetlbfs pages, false for anon pages or pages * belonging to other filesystems. */ static inline bool PageHuge(const struct page *page) { return folio_test_hugetlb(page_folio(page)); } /* * Check if a page is currently marked HWPoisoned. Note that this check is * best effort only and inherently racy: there is no way to synchronize with * failing hardware. */ static inline bool is_page_hwpoison(const struct page *page) { const struct folio *folio; if (PageHWPoison(page)) return true; folio = page_folio(page); return folio_test_hugetlb(folio) && PageHWPoison(&folio->page); } static inline bool folio_contain_hwpoisoned_page(struct folio *folio) { return folio_test_hwpoison(folio) || (folio_test_large(folio) && folio_test_has_hwpoisoned(folio)); } bool is_free_buddy_page(const struct page *page); #ifdef CONFIG_MIGRATION /* * This page is migratable through movable_ops (for selected typed pages * only). * * Page migration of such pages might fail, for example, if the page is * already isolated by somebody else, or if the page is about to get freed. * * While a subsystem might set selected typed pages that support page migration * as being movable through movable_ops, it must never clear this flag. * * This flag is only cleared when the page is freed back to the buddy. * * Only selected page types support this flag (see page_movable_ops()) and * the flag might be used in other context for other pages. Always use * page_has_movable_ops() instead. */ TESTPAGEFLAG(MovableOps, movable_ops, PF_NO_TAIL); SETPAGEFLAG(MovableOps, movable_ops, PF_NO_TAIL); /* * A movable_ops page has this flag set while it is isolated for migration. * This flag primarily protects against concurrent migration attempts. * * Once migration ended (success or failure), the flag is cleared. The * flag is managed by the migration core. 
*/ PAGEFLAG(MovableOpsIsolated, movable_ops_isolated, PF_NO_TAIL); #else /* !CONFIG_MIGRATION */ TESTPAGEFLAG_FALSE(MovableOps, movable_ops); SETPAGEFLAG_NOOP(MovableOps, movable_ops); PAGEFLAG_FALSE(MovableOpsIsolated, movable_ops_isolated); #endif /* CONFIG_MIGRATION */ /** * page_has_movable_ops - test for a movable_ops page * @page: The page to test. * * Test whether this is a movable_ops page. Such pages will stay that * way until freed. * * Returns true if this is a movable_ops page, otherwise false. */ static inline bool page_has_movable_ops(const struct page *page) { return PageMovableOps(page) && (PageOffline(page) || PageZsmalloc(page)); } static __always_inline int PageAnonExclusive(const struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnon(page), page); /* * HugeTLB stores this information on the head page; THP keeps it per * page */ if (PageHuge(page)) page = compound_head(page); return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } static __always_inline void SetPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } static __always_inline void ClearPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } static __always_inline void __ClearPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnon(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); __clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } #ifdef CONFIG_MMU #define __PG_MLOCKED (1UL << PG_mlocked) #else #define __PG_MLOCKED 0 #endif /* * Flags checked when a page is freed. Pages being freed should not have * these flags set. If they are, there is a problem. */ #define PAGE_FLAGS_CHECK_AT_FREE \ (1UL << PG_lru | 1UL << PG_locked | \ 1UL << PG_private | 1UL << PG_private_2 | \ 1UL << PG_writeback | 1UL << PG_reserved | \ 1UL << PG_active | \ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK) /* * Flags checked when a page is prepped for return by the page allocator. * Pages being prepped should not have these flags set. If they are set, * there has been a kernel bug or struct page corruption. * * __PG_HWPOISON is exceptional because it needs to be kept beyond page's * alloc-free cycle to prevent from reusing the page. */ #define PAGE_FLAGS_CHECK_AT_PREP \ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK) /* * Flags stored in the second page of a compound page. They may overlap * the CHECK_AT_FREE flags above, so need to be cleared. */ #define PAGE_FLAGS_SECOND \ (0xffUL /* order */ | 1UL << PG_has_hwpoisoned | \ 1UL << PG_large_rmappable | 1UL << PG_partially_mapped) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) /** * folio_has_private - Determine if folio has private stuff * @folio: The folio to be checked * * Determine if a folio has private stuff, indicating that release routines * should be invoked upon it. */ static inline int folio_has_private(const struct folio *folio) { return !!(folio->flags.f & PAGE_FLAGS_PRIVATE); } #undef PF_ANY #undef PF_HEAD #undef PF_NO_TAIL #undef PF_NO_COMPOUND #undef PF_SECOND #endif /* !__GENERATING_BOUNDS_H */ #endif /* PAGE_FLAGS_H */ |
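/*
 * Editorial sketch (hypothetical, not part of page-flags.h): how the
 * PG_uptodate barrier pairing described above is typically used.  The
 * names my_read_endio(), my_read() and my_copy_out() are made up for
 * this illustration.
 */
static void my_read_endio(struct folio *folio)
{
        /*
         * All stores that filled the folio with data happen before this
         * call; folio_mark_uptodate() issues smp_wmb() before setting
         * PG_uptodate, so readers cannot observe the bit without also
         * observing the data.
         */
        folio_mark_uptodate(folio);
        folio_unlock(folio);
}

static int my_read(struct folio *folio, void *dst, size_t len)
{
        /* test_bit(), followed by smp_rmb() when the folio is uptodate */
        if (!folio_test_uptodate(folio))
                return -EIO;
        /* ...so the bytes copied here are at least as new as on storage */
        return my_copy_out(dst, folio, len);    /* hypothetical helper */
}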
/* SPDX-License-Identifier: GPL-2.0 */ /* thread_info.h: common low-level thread information accessors * * Copyright (C) 2002 David Howells (dhowells@redhat.com) * - Incorporating suggestions made by Linus Torvalds */ #ifndef _LINUX_THREAD_INFO_H #define _LINUX_THREAD_INFO_H #include <linux/types.h> #include <linux/limits.h> #include <linux/bug.h> #include <linux/restart_block.h> #include <linux/errno.h> #ifdef CONFIG_THREAD_INFO_IN_TASK /* * For CONFIG_THREAD_INFO_IN_TASK kernels we need <asm/current.h> for the * definition of current, but for !CONFIG_THREAD_INFO_IN_TASK kernels, * including <asm/current.h> can cause a circular dependency on some platforms. */ #include <asm/current.h> #define current_thread_info() ((struct thread_info *)current) #endif #include <linux/bitops.h> /* * For per-arch arch_within_stack_frames() implementations, defined in * asm/thread_info.h. */ enum { BAD_STACK = -1, NOT_STACK = 0, GOOD_FRAME, GOOD_STACK, }; #ifdef CONFIG_GENERIC_ENTRY enum syscall_work_bit { SYSCALL_WORK_BIT_SECCOMP, SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT, SYSCALL_WORK_BIT_SYSCALL_TRACE, SYSCALL_WORK_BIT_SYSCALL_EMU, SYSCALL_WORK_BIT_SYSCALL_AUDIT, SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH, SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP, SYSCALL_WORK_BIT_SYSCALL_RSEQ_SLICE, }; #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) #define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT) #define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) #define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU) #define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT) #define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH) #define SYSCALL_WORK_SYSCALL_EXIT_TRAP BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP) #define SYSCALL_WORK_SYSCALL_RSEQ_SLICE BIT(SYSCALL_WORK_BIT_SYSCALL_RSEQ_SLICE) #endif #include <asm/thread_info.h> #ifndef TIF_NEED_RESCHED_LAZY #ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY #error Inconsistent PREEMPT_LAZY #endif #define TIF_NEED_RESCHED_LAZY TIF_NEED_RESCHED #define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED #endif #ifndef TIF_RSEQ # define TIF_RSEQ TIF_NOTIFY_RESUME # define _TIF_RSEQ _TIF_NOTIFY_RESUME #endif #ifdef __KERNEL__ #ifndef arch_set_restart_data #define arch_set_restart_data(restart) do { } while (0) #endif static inline long set_restart_fn(struct restart_block *restart, long (*fn)(struct restart_block *)) { restart->fn = fn; arch_set_restart_data(restart); return -ERESTART_RESTARTBLOCK; } #ifndef THREAD_ALIGN #define THREAD_ALIGN THREAD_SIZE #endif #define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) /* * flag set/clear/test wrappers * - pass TIF_xxxx constants to these
functions */ static inline void set_ti_thread_flag(struct thread_info *ti, int flag) { set_bit(flag, (unsigned long *)&ti->flags); } static inline void clear_ti_thread_flag(struct thread_info *ti, int flag) { clear_bit(flag, (unsigned long *)&ti->flags); } static inline void update_ti_thread_flag(struct thread_info *ti, int flag, bool value) { if (value) set_ti_thread_flag(ti, flag); else clear_ti_thread_flag(ti, flag); } static inline int test_and_set_ti_thread_flag(struct thread_info *ti, int flag) { return test_and_set_bit(flag, (unsigned long *)&ti->flags); } static inline int test_and_clear_ti_thread_flag(struct thread_info *ti, int flag) { return test_and_clear_bit(flag, (unsigned long *)&ti->flags); } static inline int test_ti_thread_flag(struct thread_info *ti, int flag) { return test_bit(flag, (unsigned long *)&ti->flags); } /* * This may be used in noinstr code, and needs to be __always_inline to prevent * inadvertent instrumentation. */ static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti) { return READ_ONCE(ti->flags); } #define set_thread_flag(flag) \ set_ti_thread_flag(current_thread_info(), flag) #define clear_thread_flag(flag) \ clear_ti_thread_flag(current_thread_info(), flag) #define update_thread_flag(flag, value) \ update_ti_thread_flag(current_thread_info(), flag, value) #define test_and_set_thread_flag(flag) \ test_and_set_ti_thread_flag(current_thread_info(), flag) #define test_and_clear_thread_flag(flag) \ test_and_clear_ti_thread_flag(current_thread_info(), flag) #define test_thread_flag(flag) \ test_ti_thread_flag(current_thread_info(), flag) #define read_thread_flags() \ read_ti_thread_flags(current_thread_info()) #define read_task_thread_flags(t) \ read_ti_thread_flags(task_thread_info(t)) #ifdef CONFIG_GENERIC_ENTRY #define set_syscall_work(fl) \ set_bit(SYSCALL_WORK_BIT_##fl, ¤t_thread_info()->syscall_work) #define test_syscall_work(fl) \ test_bit(SYSCALL_WORK_BIT_##fl, ¤t_thread_info()->syscall_work) #define clear_syscall_work(fl) \ clear_bit(SYSCALL_WORK_BIT_##fl, ¤t_thread_info()->syscall_work) #define set_task_syscall_work(t, fl) \ set_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work) #define test_task_syscall_work(t, fl) \ test_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work) #define clear_task_syscall_work(t, fl) \ clear_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work) #else /* CONFIG_GENERIC_ENTRY */ #define set_syscall_work(fl) \ set_ti_thread_flag(current_thread_info(), TIF_##fl) #define test_syscall_work(fl) \ test_ti_thread_flag(current_thread_info(), TIF_##fl) #define clear_syscall_work(fl) \ clear_ti_thread_flag(current_thread_info(), TIF_##fl) #define set_task_syscall_work(t, fl) \ set_ti_thread_flag(task_thread_info(t), TIF_##fl) #define test_task_syscall_work(t, fl) \ test_ti_thread_flag(task_thread_info(t), TIF_##fl) #define clear_task_syscall_work(t, fl) \ clear_ti_thread_flag(task_thread_info(t), TIF_##fl) #endif /* !CONFIG_GENERIC_ENTRY */ #ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H static __always_inline bool tif_test_bit(int bit) { return arch_test_bit(bit, (unsigned long *)(¤t_thread_info()->flags)); } #else static __always_inline bool tif_test_bit(int bit) { return test_bit(bit, (unsigned long *)(¤t_thread_info()->flags)); } #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */ static __always_inline bool tif_need_resched(void) { return tif_test_bit(TIF_NEED_RESCHED); } #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES static inline int 
arch_within_stack_frames(const void * const stack, const void * const stackend, const void *obj, unsigned long len) { return 0; } #endif #ifndef arch_setup_new_exec static inline void arch_setup_new_exec(void) { } #endif void arch_task_cache_init(void); /* for CONFIG_SH */ void arch_release_task_struct(struct task_struct *tsk); int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); #endif /* __KERNEL__ */ #endif /* _LINUX_THREAD_INFO_H */
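/*
 * Editorial sketch (hypothetical, not part of thread_info.h): how a
 * restartable syscall might arm its restart block via set_restart_fn().
 * my_wait() and my_wait_restart() are invented names; real users keep
 * their per-call state in the union members of current->restart_block.
 */
static long my_wait_restart(struct restart_block *restart)
{
        /* Re-issue the wait using the state saved before the interruption. */
        return 0;
}

static long my_wait(void)
{
        struct restart_block *restart = &current->restart_block;

        if (!signal_pending(current))
                return 0;

        /*
         * Interrupted by a signal: register the restart handler and let
         * set_restart_fn() return -ERESTART_RESTARTBLOCK so the syscall
         * is re-entered through my_wait_restart() after signal handling.
         */
        return set_restart_fn(restart, my_wait_restart);
}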
// SPDX-License-Identifier: GPL-2.0+ /* * A wrapper for multiple PHYs which passes all phy_* function calls to * multiple (actual) PHY devices. This comes in handy when initializing * all PHYs on a HCD and keeping them all in the same state. * * Copyright (C) 2018 Martin Blumenstingl <martin.blumenstingl@googlemail.com> */ #include <linux/device.h> #include <linux/list.h> #include <linux/phy/phy.h> #include <linux/of.h> #include "phy.h" struct usb_phy_roothub { struct phy *phy; struct list_head list; }; /* Allocate a roothub_entry for the PHY with the given name */ static int usb_phy_roothub_add_phy_by_name(struct device *dev, const char *name, struct list_head *list) { struct usb_phy_roothub *roothub_entry; struct phy *phy; phy = devm_of_phy_get(dev, dev->of_node, name); if (IS_ERR(phy)) return PTR_ERR(phy); roothub_entry = devm_kzalloc(dev, sizeof(*roothub_entry), GFP_KERNEL); if (!roothub_entry) return -ENOMEM; INIT_LIST_HEAD(&roothub_entry->list); roothub_entry->phy = phy; list_add_tail(&roothub_entry->list, list); return 0; } static int usb_phy_roothub_add_phy(struct device *dev, int index, struct list_head *list) { struct usb_phy_roothub *roothub_entry; struct phy *phy; phy = devm_of_phy_get_by_index(dev, dev->of_node, index); if (IS_ERR(phy)) { if (PTR_ERR(phy) == -ENODEV) return 0; else return PTR_ERR(phy); } roothub_entry = devm_kzalloc(dev, sizeof(*roothub_entry), GFP_KERNEL); if (!roothub_entry) return -ENOMEM; INIT_LIST_HEAD(&roothub_entry->list); roothub_entry->phy = phy; list_add_tail(&roothub_entry->list, list); return 0; } struct usb_phy_roothub *usb_phy_roothub_alloc(struct device *dev) { struct usb_phy_roothub *phy_roothub; int i, num_phys, err; if (!IS_ENABLED(CONFIG_GENERIC_PHY)) return NULL; num_phys = of_count_phandle_with_args(dev->of_node, "phys", "#phy-cells"); if (num_phys <= 0) return NULL; phy_roothub = devm_kzalloc(dev, sizeof(*phy_roothub), GFP_KERNEL); if (!phy_roothub) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&phy_roothub->list); if (!usb_phy_roothub_add_phy_by_name(dev, "usb2-phy", &phy_roothub->list)) return phy_roothub; for (i = 0; i < num_phys; i++) { err = usb_phy_roothub_add_phy(dev, i,
&phy_roothub->list); if (err) return ERR_PTR(err); } return phy_roothub; } EXPORT_SYMBOL_GPL(usb_phy_roothub_alloc); /** * usb_phy_roothub_alloc_usb3_phy - alloc the roothub * @dev: the device of the host controller * * Allocate the usb phy roothub if the host use a generic usb3-phy. * * Return: On success, a pointer to the usb_phy_roothub. Otherwise, * %NULL if no use usb3 phy or %-ENOMEM if out of memory. */ struct usb_phy_roothub *usb_phy_roothub_alloc_usb3_phy(struct device *dev) { struct usb_phy_roothub *phy_roothub; int num_phys, usb2_phy_index; if (!IS_ENABLED(CONFIG_GENERIC_PHY)) return NULL; num_phys = of_count_phandle_with_args(dev->of_node, "phys", "#phy-cells"); if (num_phys <= 0) return NULL; /* * If 'usb2-phy' is not present, usb_phy_roothub_alloc() added * all PHYs to the primary HCD's phy_roothub already, so skip * adding 'usb3-phy' here to avoid double use of that. */ usb2_phy_index = of_property_match_string(dev->of_node, "phy-names", "usb2-phy"); if (usb2_phy_index < 0) return NULL; phy_roothub = devm_kzalloc(dev, sizeof(*phy_roothub), GFP_KERNEL); if (!phy_roothub) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&phy_roothub->list); if (!usb_phy_roothub_add_phy_by_name(dev, "usb3-phy", &phy_roothub->list)) return phy_roothub; return NULL; } EXPORT_SYMBOL_GPL(usb_phy_roothub_alloc_usb3_phy); int usb_phy_roothub_init(struct usb_phy_roothub *phy_roothub) { struct usb_phy_roothub *roothub_entry; struct list_head *head; int err; if (!phy_roothub) return 0; head = &phy_roothub->list; list_for_each_entry(roothub_entry, head, list) { err = phy_init(roothub_entry->phy); if (err) goto err_exit_phys; } return 0; err_exit_phys: list_for_each_entry_continue_reverse(roothub_entry, head, list) phy_exit(roothub_entry->phy); return err; } EXPORT_SYMBOL_GPL(usb_phy_roothub_init); int usb_phy_roothub_exit(struct usb_phy_roothub *phy_roothub) { struct usb_phy_roothub *roothub_entry; struct list_head *head; int err, ret = 0; if (!phy_roothub) return 0; head = &phy_roothub->list; list_for_each_entry(roothub_entry, head, list) { err = phy_exit(roothub_entry->phy); if (err) ret = err; } return ret; } EXPORT_SYMBOL_GPL(usb_phy_roothub_exit); int usb_phy_roothub_set_mode(struct usb_phy_roothub *phy_roothub, enum phy_mode mode) { struct usb_phy_roothub *roothub_entry; struct list_head *head; int err; if (!phy_roothub) return 0; head = &phy_roothub->list; list_for_each_entry(roothub_entry, head, list) { err = phy_set_mode(roothub_entry->phy, mode); if (err) return err; } return 0; } EXPORT_SYMBOL_GPL(usb_phy_roothub_set_mode); int usb_phy_roothub_calibrate(struct usb_phy_roothub *phy_roothub) { struct usb_phy_roothub *roothub_entry; struct list_head *head; int err; if (!phy_roothub) return 0; head = &phy_roothub->list; list_for_each_entry(roothub_entry, head, list) { err = phy_calibrate(roothub_entry->phy); if (err) return err; } return 0; } EXPORT_SYMBOL_GPL(usb_phy_roothub_calibrate); /** * usb_phy_roothub_notify_connect() - connect notification * @phy_roothub: the phy of roothub, if the host use a generic phy. * @port: the port index for connect * * If the phy needs to get connection status, the callback can be used. 
* Returns: %0 if successful, a negative error code otherwise */ int usb_phy_roothub_notify_connect(struct usb_phy_roothub *phy_roothub, int port) { struct usb_phy_roothub *roothub_entry; struct list_head *head; int err; if (!phy_roothub) return 0; head = &phy_roothub->list; list_for_each_entry(roothub_entry, head, list) { err = phy_notify_connect(roothub_entry->phy, port); if (err) return err; } return 0; } EXPORT_SYMBOL_GPL(usb_phy_roothub_notify_connect); /** * usb_phy_roothub_notify_disconnect() - disconnect notification * @phy_roothub: the phy of roothub, if the host use a generic phy. * @port: the port index for disconnect * * If the phy needs to get connection status, the callback can be used. * Returns: %0 if successful, a negative error code otherwise */ int usb_phy_roothub_notify_disconnect(struct usb_phy_roothub *phy_roothub, int port) { struct usb_phy_roothub *roothub_entry; struct list_head *head; int err; if (!phy_roothub) return 0; head = &phy_roothub->list; list_for_each_entry(roothub_entry, head, list) { err = phy_notify_disconnect(roothub_entry->phy, port); if (err) return err; } return 0; } EXPORT_SYMBOL_GPL(usb_phy_roothub_notify_disconnect); int usb_phy_roothub_power_on(struct usb_phy_roothub *phy_roothub) { struct usb_phy_roothub *roothub_entry; struct list_head *head; int err; if (!phy_roothub) return 0; head = &phy_roothub->list; list_for_each_entry(roothub_entry, head, list) { err = phy_power_on(roothub_entry->phy); if (err) goto err_out; } return 0; err_out: list_for_each_entry_continue_reverse(roothub_entry, head, list) phy_power_off(roothub_entry->phy); return err; } EXPORT_SYMBOL_GPL(usb_phy_roothub_power_on); void usb_phy_roothub_power_off(struct usb_phy_roothub *phy_roothub) { struct usb_phy_roothub *roothub_entry; if (!phy_roothub) return; list_for_each_entry_reverse(roothub_entry, &phy_roothub->list, list) phy_power_off(roothub_entry->phy); } EXPORT_SYMBOL_GPL(usb_phy_roothub_power_off); int usb_phy_roothub_suspend(struct device *controller_dev, struct usb_phy_roothub *phy_roothub) { usb_phy_roothub_power_off(phy_roothub); /* keep the PHYs initialized so the device can wake up the system */ if (device_may_wakeup(controller_dev)) return 0; return usb_phy_roothub_exit(phy_roothub); } EXPORT_SYMBOL_GPL(usb_phy_roothub_suspend); int usb_phy_roothub_resume(struct device *controller_dev, struct usb_phy_roothub *phy_roothub) { int err; /* if the device can't wake up the system _exit was called */ if (!device_may_wakeup(controller_dev)) { err = usb_phy_roothub_init(phy_roothub); if (err) return err; } err = usb_phy_roothub_power_on(phy_roothub); /* undo _init if _power_on failed */ if (err && !device_may_wakeup(controller_dev)) usb_phy_roothub_exit(phy_roothub); return err; } EXPORT_SYMBOL_GPL(usb_phy_roothub_resume); |
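/*
 * Editorial sketch (hypothetical, not part of this file): the calling
 * sequence a host controller driver is expected to follow, derived from
 * the API above and the suspend/resume helpers.  my_hcd_phy_setup() and
 * my_hcd_phy_teardown() are invented names.
 */
static int my_hcd_phy_setup(struct device *dev, struct usb_phy_roothub **phys)
{
        struct usb_phy_roothub *roothub;
        int err;

        /* may return NULL when there is nothing to manage; that is fine */
        roothub = usb_phy_roothub_alloc(dev);
        if (IS_ERR(roothub))
                return PTR_ERR(roothub);

        err = usb_phy_roothub_init(roothub);
        if (err)
                return err;

        err = usb_phy_roothub_power_on(roothub);
        if (err) {
                usb_phy_roothub_exit(roothub);
                return err;
        }

        *phys = roothub;
        return 0;
}

static void my_hcd_phy_teardown(struct usb_phy_roothub *phys)
{
        usb_phy_roothub_power_off(phys);
        usb_phy_roothub_exit(phys);
}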
// SPDX-License-Identifier: GPL-2.0 /* * Lockless hierarchical page accounting & limiting * * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner */ #include <linux/page_counter.h> #include <linux/atomic.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/sched.h> #include <linux/bug.h> #include <asm/page.h> static bool track_protection(struct page_counter *c) { return c->protection_support; } static void propagate_protected_usage(struct page_counter *c, unsigned long usage) { unsigned long protected, old_protected; long delta; if (!c->parent) return; protected = min(usage, READ_ONCE(c->min)); old_protected = atomic_long_read(&c->min_usage); if (protected != old_protected) { old_protected = atomic_long_xchg(&c->min_usage, protected); delta = protected - old_protected; if (delta) atomic_long_add(delta, &c->parent->children_min_usage); } protected = min(usage, READ_ONCE(c->low)); old_protected = atomic_long_read(&c->low_usage); if (protected != old_protected) { old_protected = atomic_long_xchg(&c->low_usage, protected); delta = protected - old_protected; if (delta) atomic_long_add(delta, &c->parent->children_low_usage); } } /** * page_counter_cancel - take pages out of the local counter * @counter: counter * @nr_pages: number of pages to cancel */ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) { long new; new = atomic_long_sub_return(nr_pages, &counter->usage); /* More uncharges than charges?
*/ if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n", new, nr_pages)) { new = 0; atomic_long_set(&counter->usage, new); } if (track_protection(counter)) propagate_protected_usage(counter, new); } /** * page_counter_charge - hierarchically charge pages * @counter: counter * @nr_pages: number of pages to charge * * NOTE: This does not consider any configured counter limits. */ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; bool protection = track_protection(counter); for (c = counter; c; c = c->parent) { long new; new = atomic_long_add_return(nr_pages, &c->usage); if (protection) propagate_protected_usage(c, new); /* * This is indeed racy, but we can live with some * inaccuracy in the watermark. * * Notably, we have two watermarks to allow for both a globally * visible peak and one that can be reset at a smaller scope. * * Since we reset both watermarks when the global reset occurs, * we can guarantee that watermark >= local_watermark, so we * don't need to do both comparisons every time. * * On systems with branch predictors, the inner condition should * be almost free. */ if (new > READ_ONCE(c->local_watermark)) { WRITE_ONCE(c->local_watermark, new); if (new > READ_ONCE(c->watermark)) WRITE_ONCE(c->watermark, new); } } } /** * page_counter_try_charge - try to hierarchically charge pages * @counter: counter * @nr_pages: number of pages to charge * @fail: points first counter to hit its limit, if any * * Returns %true on success, or %false and @fail if the counter or one * of its ancestors has hit its configured limit. */ bool page_counter_try_charge(struct page_counter *counter, unsigned long nr_pages, struct page_counter **fail) { struct page_counter *c; bool protection = track_protection(counter); bool track_failcnt = counter->track_failcnt; for (c = counter; c; c = c->parent) { long new; /* * Charge speculatively to avoid an expensive CAS. If * a bigger charge fails, it might falsely lock out a * racing smaller charge and send it into reclaim * early, but the error is limited to the difference * between the two sizes, which is less than 2M/4M in * case of a THP locking out a regular page charge. * * The atomic_long_add_return() implies a full memory * barrier between incrementing the count and reading * the limit. When racing with page_counter_set_max(), * we either see the new limit or the setter sees the * counter has changed and retries. */ new = atomic_long_add_return(nr_pages, &c->usage); if (new > c->max) { atomic_long_sub(nr_pages, &c->usage); /* * This is racy, but we can live with some * inaccuracy in the failcnt which is only used * to report stats. 
*/ if (track_failcnt) data_race(c->failcnt++); *fail = c; goto failed; } if (protection) propagate_protected_usage(c, new); /* see comment on page_counter_charge */ if (new > READ_ONCE(c->local_watermark)) { WRITE_ONCE(c->local_watermark, new); if (new > READ_ONCE(c->watermark)) WRITE_ONCE(c->watermark, new); } } return true; failed: for (c = counter; c != *fail; c = c->parent) page_counter_cancel(c, nr_pages); return false; } /** * page_counter_uncharge - hierarchically uncharge pages * @counter: counter * @nr_pages: number of pages to uncharge */ void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; for (c = counter; c; c = c->parent) page_counter_cancel(c, nr_pages); } /** * page_counter_set_max - set the maximum number of pages allowed * @counter: counter * @nr_pages: limit to set * * Returns 0 on success, -EBUSY if the current number of pages on the * counter already exceeds the specified limit. * * The caller must serialize invocations on the same counter. */ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) { for (;;) { unsigned long old; long usage; /* * Update the limit while making sure that it's not * below the concurrently-changing counter value. * * The xchg implies two full memory barriers before * and after, so the read-swap-read is ordered and * ensures coherency with page_counter_try_charge(): * that function modifies the count before checking * the limit, so if it sees the old limit, we see the * modified counter and retry. */ usage = page_counter_read(counter); if (usage > nr_pages) return -EBUSY; old = xchg(&counter->max, nr_pages); if (page_counter_read(counter) <= usage || nr_pages >= old) return 0; counter->max = old; cond_resched(); } } /** * page_counter_set_min - set the amount of protected memory * @counter: counter * @nr_pages: value to set * * The caller must serialize invocations on the same counter. */ void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; WRITE_ONCE(counter->min, nr_pages); for (c = counter; c; c = c->parent) propagate_protected_usage(c, atomic_long_read(&c->usage)); } /** * page_counter_set_low - set the amount of protected memory * @counter: counter * @nr_pages: value to set * * The caller must serialize invocations on the same counter. */ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; WRITE_ONCE(counter->low, nr_pages); for (c = counter; c; c = c->parent) propagate_protected_usage(c, atomic_long_read(&c->usage)); } /** * page_counter_memparse - memparse() for page counter limits * @buf: string to parse * @max: string meaning maximum possible value * @nr_pages: returns the result in number of pages * * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be * limited to %PAGE_COUNTER_MAX. */ int page_counter_memparse(const char *buf, const char *max, unsigned long *nr_pages) { char *end; u64 bytes; if (!strcmp(buf, max)) { *nr_pages = PAGE_COUNTER_MAX; return 0; } bytes = memparse(buf, &end); if (*end != '\0') return -EINVAL; *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX); return 0; } #if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM) /* * This function calculates an individual page counter's effective * protection which is derived from its own memory.min/low, its * parent's and siblings' settings, as well as the actual memory * distribution in the tree. 
* * The following rules apply to the effective protection values: * * 1. At the first level of reclaim, effective protection is equal to * the declared protection in memory.min and memory.low. * * 2. To enable safe delegation of the protection configuration, at * subsequent levels the effective protection is capped to the * parent's effective protection. * * 3. To make complex and dynamic subtrees easier to configure, the * user is allowed to overcommit the declared protection at a given * level. If that is the case, the parent's effective protection is * distributed to the children in proportion to how much protection * they have declared and how much of it they are utilizing. * * This makes distribution proportional, but also work-conserving: * if one counter claims much more protection than it uses memory, * the unused remainder is available to its siblings. * * 4. Conversely, when the declared protection is undercommitted at a * given level, the distribution of the larger parental protection * budget is NOT proportional. A counter's protection from a sibling * is capped to its own memory.min/low setting. * * 5. However, to allow protecting recursive subtrees from each other * without having to declare each individual counter's fixed share * of the ancestor's claim to protection, any unutilized - * "floating" - protection from up the tree is distributed in * proportion to each counter's *usage*. This makes the protection * neutral wrt sibling cgroups and lets them compete freely over * the shared parental protection budget, but it protects the * subtree as a whole from neighboring subtrees. * * Note that 4. and 5. are not in conflict: 4. is about protecting * against immediate siblings whereas 5. is about protecting against * neighboring subtrees. */ static unsigned long effective_protection(unsigned long usage, unsigned long parent_usage, unsigned long setting, unsigned long parent_effective, unsigned long siblings_protected, bool recursive_protection) { unsigned long protected; unsigned long ep; protected = min(usage, setting); /* * If all cgroups at this level combined claim and use more * protection than what the parent affords them, distribute * shares in proportion to utilization. * * We are using actual utilization rather than the statically * claimed protection in order to be work-conserving: claimed * but unused protection is available to siblings that would * otherwise get a smaller chunk than what they claimed. */ if (siblings_protected > parent_effective) return protected * parent_effective / siblings_protected; /* * Ok, utilized protection of all children is within what the * parent affords them, so we know whatever this child claims * and utilizes is effectively protected. * * If there is unprotected usage beyond this value, reclaim * will apply pressure in proportion to that amount. * * If there is unutilized protection, the cgroup will be fully * shielded from reclaim, but we do return a smaller value for * protection than what the group could enjoy in theory. This * is okay. With the overcommit distribution above, effective * protection is always dependent on how memory is actually * consumed among the siblings anyway. */ ep = protected; /* * If the children aren't claiming (all of) the protection * afforded to them by the parent, distribute the remainder in * proportion to the (unprotected) memory of each cgroup. 
That * way, cgroups that aren't explicitly prioritized wrt each * other compete freely over the allowance, but they are * collectively protected from neighboring trees. * * We're using unprotected memory for the weight so that if * some cgroups DO claim explicit protection, we don't protect * the same bytes twice. * * Check both usage and parent_usage against the respective * protected values. One should imply the other, but they * aren't read atomically - make sure the division is sane. */ if (!recursive_protection) return ep; if (parent_effective > siblings_protected && parent_usage > siblings_protected && usage > protected) { unsigned long unclaimed; unclaimed = parent_effective - siblings_protected; unclaimed *= usage - protected; unclaimed /= parent_usage - siblings_protected; ep += unclaimed; } return ep; } /** * page_counter_calculate_protection - check if memory consumption is in the normal range * @root: the top ancestor of the sub-tree being checked * @counter: the page_counter the counter to update * @recursive_protection: Whether to use memory_recursiveprot behavior. * * Calculates elow/emin thresholds for given page_counter. * * WARNING: This function is not stateless! It can only be used as part * of a top-down tree iteration, not for isolated queries. */ void page_counter_calculate_protection(struct page_counter *root, struct page_counter *counter, bool recursive_protection) { unsigned long usage, parent_usage; struct page_counter *parent = counter->parent; /* * Effective values of the reclaim targets are ignored so they * can be stale. Have a look at mem_cgroup_protection for more * details. * TODO: calculation should be more robust so that we do not need * that special casing. */ if (root == counter) return; usage = page_counter_read(counter); if (!usage) return; if (parent == root) { counter->emin = READ_ONCE(counter->min); counter->elow = READ_ONCE(counter->low); return; } parent_usage = page_counter_read(parent); WRITE_ONCE(counter->emin, effective_protection(usage, parent_usage, READ_ONCE(counter->min), READ_ONCE(parent->emin), atomic_long_read(&parent->children_min_usage), recursive_protection)); WRITE_ONCE(counter->elow, effective_protection(usage, parent_usage, READ_ONCE(counter->low), READ_ONCE(parent->elow), atomic_long_read(&parent->children_low_usage), recursive_protection)); } #endif /* CONFIG_MEMCG || CONFIG_CGROUP_DMEM */ |
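/*
 * Illustrative, userspace-only sketch of the arithmetic implemented by
 * effective_protection() above; it is not part of the kernel sources.
 * It reproduces two scenarios from the comment: (a) siblings overcommit
 * the parent's effective protection, so shares are scaled by actual
 * utilization, and (b) recursive protection, where unclaimed "floating"
 * protection is handed out in proportion to usage.  All numbers are
 * made up for the example.
 */
#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

static unsigned long effective_protection_demo(unsigned long usage,
					       unsigned long parent_usage,
					       unsigned long setting,
					       unsigned long parent_effective,
					       unsigned long siblings_protected,
					       int recursive_protection)
{
	unsigned long protected = min_ul(usage, setting);
	unsigned long ep = protected;

	/* overcommitted: scale each sibling's share by its utilization */
	if (siblings_protected > parent_effective)
		return protected * parent_effective / siblings_protected;

	/* undercommitted: optionally distribute the floating remainder by usage */
	if (recursive_protection && parent_effective > siblings_protected &&
	    parent_usage > siblings_protected && usage > protected)
		ep += (parent_effective - siblings_protected) *
		      (usage - protected) / (parent_usage - siblings_protected);

	return ep;
}

int main(void)
{
	/* (a) overcommit: A(min=80, usage=60), B(min=80, usage=90), parent ep=100 */
	printf("A: %lu\n", effective_protection_demo(60, 150, 80, 100, 140, 0)); /* ~42 */
	printf("B: %lu\n", effective_protection_demo(90, 150, 80, 100, 140, 0)); /* ~57 */

	/* (b) recursive: A(min=20, usage=60), B(min=0, usage=20), parent ep=100 */
	printf("A: %lu\n", effective_protection_demo(60, 80, 20, 100, 20, 1)); /* ~73 */
	printf("B: %lu\n", effective_protection_demo(20, 80, 0, 100, 20, 1));  /* ~26 */
	return 0;
}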
// SPDX-License-Identifier: GPL-2.0-only /* * x86-optimized CRC32 functions * * Copyright (C) 2008 Intel Corporation * Copyright 2012 Xyratex Technology Limited * Copyright 2024 Google LLC */ #include "crc-pclmul-template.h" static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32); static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vpclmul_avx512); DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32); static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) { CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts, have_pclmulqdq); return crc32_le_base(crc, p, len); } #ifdef CONFIG_X86_64 #define CRC32_INST "crc32q %1, %q0" #else #define CRC32_INST "crc32l %1, %0" #endif /* * Use carryless multiply version of crc32c when buffer size is >= 512 to * account for FPU state save/restore overhead. */ #define CRC32C_PCLMUL_BREAKEVEN 512 asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len); static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) { size_t num_longs; if (!static_branch_likely(&have_crc32)) return crc32c_base(crc, p, len); if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN && static_branch_likely(&have_pclmulqdq) && likely(irq_fpu_usable())) { /* * Long length, the vector registers are usable, and the CPU is * 64-bit and supports both CRC32 and PCLMULQDQ instructions. * It is worthwhile to divide the data into multiple streams, * CRC them independently, and combine them using PCLMULQDQ. * crc32c_x86_3way() does this using 3 streams, which is the * most that x86_64 CPUs have traditionally been capable of. * * However, due to improved VPCLMULQDQ performance on newer * CPUs, use crc32_lsb_vpclmul_avx512() instead of * crc32c_x86_3way() when the CPU supports VPCLMULQDQ and has a * "good" implementation of AVX-512. * * Future work: the optimal strategy on Zen 3--5 is actually to * use both crc32q and VPCLMULQDQ in parallel. Unfortunately, * different numbers of streams and vector lengths are optimal * on each CPU microarchitecture, making it challenging to take * advantage of this. (Zen 5 even supports 7 parallel crc32q, a * major upgrade.) For now, just choose between * crc32c_x86_3way() and crc32_lsb_vpclmul_avx512(). The latter * is needed anyway for crc32_le(), so we just reuse it here. */ kernel_fpu_begin(); if (static_branch_likely(&have_vpclmul_avx512)) crc = crc32_lsb_vpclmul_avx512(crc, p, len, crc32_lsb_0x82f63b78_consts.fold_across_128_bits_consts); else crc = crc32c_x86_3way(crc, p, len); kernel_fpu_end(); return crc; } /* * Short length, XMM registers unusable, or the CPU is 32-bit; but the * CPU supports CRC32 instructions. Just issue a single stream of CRC32 * instructions inline. While this doesn't use the CPU's CRC32 * throughput very well, it avoids the need to combine streams. Stream * combination would be inefficient here.
*/ for (num_longs = len / sizeof(unsigned long); num_longs != 0; num_longs--, p += sizeof(unsigned long)) asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p)); if (sizeof(unsigned long) > 4 && (len & 4)) { asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p)); p += 4; } if (len & 2) { asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p)); p += 2; } if (len & 1) asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p)); return crc; } #define crc32_be_arch crc32_be_base /* not implemented on this arch */ #define crc32_mod_init_arch crc32_mod_init_arch static void crc32_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_XMM4_2)) static_branch_enable(&have_crc32); if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { static_branch_enable(&have_pclmulqdq); if (have_vpclmul()) { if (have_avx512()) { static_call_update(crc32_lsb_pclmul, crc32_lsb_vpclmul_avx512); static_branch_enable(&have_vpclmul_avx512); } else { static_call_update(crc32_lsb_pclmul, crc32_lsb_vpclmul_avx2); } } } } static inline u32 crc32_optimizations_arch(void) { u32 optimizations = 0; if (static_key_enabled(&have_crc32)) optimizations |= CRC32C_OPTIMIZATION; if (static_key_enabled(&have_pclmulqdq)) optimizations |= CRC32_LE_OPTIMIZATION; return optimizations; } |
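/*
 * Illustrative, userspace-only sketch, not part of the file above: the
 * same "one unsigned long at a time, then 4/2/1-byte tail" decomposition
 * that crc32c_arch() issues via inline asm, expressed with the SSE4.2
 * CRC32 intrinsics.  It assumes an x86-64 build with -msse4.2 and
 * processes a single stream, so it corresponds to the short-length path
 * only, not to the 3-way or VPCLMULQDQ paths.  The function name is
 * made up for this example.
 */
#include <nmmintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static uint32_t crc32c_single_stream(uint32_t crc, const unsigned char *p,
				     size_t len)
{
	uint64_t crc64 = crc;

	/* 8 bytes per step, the equivalent of one crc32q instruction */
	for (; len >= 8; len -= 8, p += 8) {
		uint64_t v;

		memcpy(&v, p, 8);	/* avoid unaligned dereference */
		crc64 = _mm_crc32_u64(crc64, v);
	}
	crc = (uint32_t)crc64;

	if (len & 4) {
		uint32_t v;

		memcpy(&v, p, 4);
		crc = _mm_crc32_u32(crc, v);
		p += 4;
	}
	if (len & 2) {
		uint16_t v;

		memcpy(&v, p, 2);
		crc = _mm_crc32_u16(crc, v);
		p += 2;
	}
	if (len & 1)
		crc = _mm_crc32_u8(crc, *p);

	return crc;
}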
// SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/filemap.c * * Copyright (C) 1994-1999 Linus Torvalds */ /* * This file handles the generic file mmap semantics used by * most "normal" filesystems (but you don't /have/ to use this: * the NFS filesystem used to do this differently, for example) */ #include <linux/export.h> #include <linux/compiler.h> #include <linux/dax.h> #include <linux/fs.h> #include <linux/sched/signal.h> #include <linux/uaccess.h> #include <linux/capability.h> #include <linux/kernel_stat.h> #include <linux/gfp.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/leafops.h> #include <linux/syscalls.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/file.h> #include <linux/uio.h> #include <linux/error-injection.h> #include <linux/hash.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/folio_batch.h> #include <linux/security.h> #include <linux/cpuset.h> #include <linux/hugetlb.h> #include <linux/memcontrol.h> #include <linux/shmem_fs.h> #include <linux/rmap.h> #include <linux/delayacct.h> #include <linux/psi.h> #include <linux/ramfs.h> #include <linux/page_idle.h> #include <linux/migrate.h> #include <linux/pipe_fs_i.h> #include <linux/splice.h> #include <linux/rcupdate_wait.h> #include <linux/sched/mm.h> #include <linux/sysctl.h> #include <linux/pgalloc.h> #include <asm/tlbflush.h> #include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/filemap.h> /* * FIXME: remove all knowledge of the buffer layer from the core VM */ #include <linux/buffer_head.h> /* for try_to_free_buffers */ #include <asm/mman.h> #include "swap.h" /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. * * Shared mappings now work. 15.8.1995 Bruno.
* * finished 'unifying' the page and buffer cache and SMP-threaded the * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com> * * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de> */ /* * Lock ordering: * * ->i_mmap_rwsem (truncate_pagecache) * ->private_lock (__free_pte->block_dirty_folio) * ->swap_lock (exclusive_swap_page, others) * ->i_pages lock * * ->i_rwsem * ->invalidate_lock (acquired by fs in truncate path) * ->i_mmap_rwsem (truncate->unmap_mapping_range) * * ->mmap_lock * ->i_mmap_rwsem * ->page_table_lock or pte_lock (various, mainly in memory.c) * ->i_pages lock (arch-dependent flush_dcache_mmap_lock) * * ->mmap_lock * ->invalidate_lock (filemap_fault) * ->lock_page (filemap_fault, access_process_vm) * * ->i_rwsem (generic_perform_write) * ->mmap_lock (fault_in_readable->do_page_fault) * * bdi->wb.list_lock * sb_lock (fs/fs-writeback.c) * ->i_pages lock (__sync_single_inode) * * ->i_mmap_rwsem * ->anon_vma.lock (vma_merge) * * ->anon_vma.lock * ->page_table_lock or pte_lock (anon_vma_prepare and various) * * ->page_table_lock or pte_lock * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) * ->i_pages lock (try_to_unmap_one) * ->lruvec->lru_lock (follow_page_mask->mark_page_accessed) * ->lruvec->lru_lock (check_pte_range->folio_isolate_lru) * ->private_lock (folio_remove_rmap_pte->set_page_dirty) * ->i_pages lock (folio_remove_rmap_pte->set_page_dirty) * bdi.wb->list_lock (folio_remove_rmap_pte->set_page_dirty) * ->inode->i_lock (folio_remove_rmap_pte->set_page_dirty) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->block_dirty_folio) */ static void page_cache_delete(struct address_space *mapping, struct folio *folio, void *shadow) { XA_STATE(xas, &mapping->i_pages, folio->index); long nr = 1; mapping_set_update(&xas, mapping); xas_set_order(&xas, folio->index, folio_order(folio)); nr = folio_nr_pages(folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); xas_store(&xas, shadow); xas_init_marks(&xas); folio->mapping = NULL; /* Leave folio->index set: truncation lookup relies upon it */ mapping->nrpages -= nr; } static void filemap_unaccount_folio(struct address_space *mapping, struct folio *folio) { long nr; VM_BUG_ON_FOLIO(folio_mapped(folio), folio); if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) { pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n", current->comm, folio_pfn(folio)); dump_page(&folio->page, "still mapped when deleted"); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); if (mapping_exiting(mapping) && !folio_test_large(folio)) { int mapcount = folio_mapcount(folio); if (folio_ref_count(folio) >= mapcount + 2) { /* * All vmas have already been torn down, so it's * a good bet that actually the page is unmapped * and we'd rather not leak it: if we're wrong, * another bad page check should catch it later. */ atomic_set(&folio->_mapcount, -1); folio_ref_sub(folio, mapcount); } } } /* hugetlb folios do not participate in page cache accounting. 
*/ if (folio_test_hugetlb(folio)) return; nr = folio_nr_pages(folio); lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); if (folio_test_swapbacked(folio)) { lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); if (folio_test_pmd_mappable(folio)) lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); } else if (folio_test_pmd_mappable(folio)) { lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } if (test_bit(AS_KERNEL_FILE, &folio->mapping->flags)) mod_node_page_state(folio_pgdat(folio), NR_KERNEL_FILE_PAGES, -nr); /* * At this point folio must be either written or cleaned by * truncate. Dirty folio here signals a bug and loss of * unwritten data - on ordinary filesystems. * * But it's harmless on in-memory filesystems like tmpfs; and can * occur when a driver which did get_user_pages() sets page dirty * before putting it, while the inode is being finally evicted. * * Below fixes dirty accounting after removing the folio entirely * but leaves the dirty flag set: it has no effect for truncated * folio and anyway will be cleared before returning folio to * buddy allocator. */ if (WARN_ON_ONCE(folio_test_dirty(folio) && mapping_can_writeback(mapping))) folio_account_cleaned(folio, inode_to_wb(mapping->host)); } /* * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage * is safe. The caller must hold the i_pages lock. */ void __filemap_remove_folio(struct folio *folio, void *shadow) { struct address_space *mapping = folio->mapping; trace_mm_filemap_delete_from_page_cache(folio); filemap_unaccount_folio(mapping, folio); page_cache_delete(mapping, folio, shadow); } static void filemap_free_folio(const struct address_space *mapping, struct folio *folio) { void (*free_folio)(struct folio *); free_folio = mapping->a_ops->free_folio; if (free_folio) free_folio(folio); folio_put_refs(folio, folio_nr_pages(folio)); } /** * filemap_remove_folio - Remove folio from page cache. * @folio: The folio. * * This must be called only on folios that are locked and have been * verified to be in the page cache. It will never put the folio into * the free list because the caller has a reference on the page. */ void filemap_remove_folio(struct folio *folio) { struct address_space *mapping = folio->mapping; BUG_ON(!folio_test_locked(folio)); spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); __filemap_remove_folio(folio, NULL); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); filemap_free_folio(mapping, folio); } /* * page_cache_delete_batch - delete several folios from page cache * @mapping: the mapping to which folios belong * @fbatch: batch of folios to delete * * The function walks over mapping->i_pages and removes folios passed in * @fbatch from the mapping. The function expects @fbatch to be sorted * by page index and is optimised for it to be dense. * It tolerates holes in @fbatch (mapping entries at those indices are not * modified). * * The function expects the i_pages lock to be held. */ static void page_cache_delete_batch(struct address_space *mapping, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index); long total_pages = 0; int i = 0; struct folio *folio; mapping_set_update(&xas, mapping); xas_for_each(&xas, folio, ULONG_MAX) { if (i >= folio_batch_count(fbatch)) break; /* A swap/dax/shadow entry got inserted? Skip it. 
*/ if (xa_is_value(folio)) continue; /* * A page got inserted in our range? Skip it. We have our * pages locked so they are protected from being removed. * If we see a page whose index is higher than ours, it * means our page has been removed, which shouldn't be * possible because we're holding the PageLock. */ if (folio != fbatch->folios[i]) { VM_BUG_ON_FOLIO(folio->index > fbatch->folios[i]->index, folio); continue; } WARN_ON_ONCE(!folio_test_locked(folio)); folio->mapping = NULL; /* Leave folio->index set: truncation lookup relies on it */ i++; xas_store(&xas, NULL); total_pages += folio_nr_pages(folio); } mapping->nrpages -= total_pages; } void delete_from_page_cache_batch(struct address_space *mapping, struct folio_batch *fbatch) { int i; if (!folio_batch_count(fbatch)) return; spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; trace_mm_filemap_delete_from_page_cache(folio); filemap_unaccount_folio(mapping, folio); } page_cache_delete_batch(mapping, fbatch); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); for (i = 0; i < folio_batch_count(fbatch); i++) filemap_free_folio(mapping, fbatch->folios[i]); } int filemap_check_errors(struct address_space *mapping) { int ret = 0; /* Check for outstanding write errors */ if (test_bit(AS_ENOSPC, &mapping->flags) && test_and_clear_bit(AS_ENOSPC, &mapping->flags)) ret = -ENOSPC; if (test_bit(AS_EIO, &mapping->flags) && test_and_clear_bit(AS_EIO, &mapping->flags)) ret = -EIO; return ret; } EXPORT_SYMBOL(filemap_check_errors); static int filemap_check_and_keep_errors(struct address_space *mapping) { /* Check for outstanding write errors */ if (test_bit(AS_EIO, &mapping->flags)) return -EIO; if (test_bit(AS_ENOSPC, &mapping->flags)) return -ENOSPC; return 0; } static int filemap_writeback(struct address_space *mapping, loff_t start, loff_t end, enum writeback_sync_modes sync_mode, long *nr_to_write) { struct writeback_control wbc = { .sync_mode = sync_mode, .nr_to_write = nr_to_write ? *nr_to_write : LONG_MAX, .range_start = start, .range_end = end, }; int ret; if (!mapping_can_writeback(mapping) || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; wbc_attach_fdatawrite_inode(&wbc, mapping->host); ret = do_writepages(mapping, &wbc); wbc_detach_inode(&wbc); if (!ret && nr_to_write) *nr_to_write = wbc.nr_to_write; return ret; } /** * filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write * @start: offset in bytes where the range starts * @end: offset in bytes where the range ends (inclusive) * * Start writeback against all of a mapping's dirty pages that lie * within the byte offsets <start, end> inclusive. * * This is a data integrity operation that waits upon dirty or in writeback * pages. * * Return: %0 on success, negative error code otherwise. 
*/ int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end) { return filemap_writeback(mapping, start, end, WB_SYNC_ALL, NULL); } EXPORT_SYMBOL(filemap_fdatawrite_range); int filemap_fdatawrite(struct address_space *mapping) { return filemap_fdatawrite_range(mapping, 0, LLONG_MAX); } EXPORT_SYMBOL(filemap_fdatawrite); /** * filemap_flush_range - start writeback on a range * @mapping: target address_space * @start: index to start writeback on * @end: last (inclusive) index for writeback * * This is a non-integrity writeback helper, to start writing back folios * for the indicated range. * * Return: %0 on success, negative error code otherwise. */ int filemap_flush_range(struct address_space *mapping, loff_t start, loff_t end) { return filemap_writeback(mapping, start, end, WB_SYNC_NONE, NULL); } EXPORT_SYMBOL_GPL(filemap_flush_range); /** * filemap_flush - mostly a non-blocking flush * @mapping: target address_space * * This is a mostly non-blocking flush. Not suitable for data-integrity * purposes - I/O may not be started against all dirty pages. * * Return: %0 on success, negative error code otherwise. */ int filemap_flush(struct address_space *mapping) { return filemap_flush_range(mapping, 0, LLONG_MAX); } EXPORT_SYMBOL(filemap_flush); /* * Start writeback on @nr_to_write pages from @mapping. No one but the existing * btrfs caller should be using this. Talk to linux-mm if you think adding a * new caller is a good idea. */ int filemap_flush_nr(struct address_space *mapping, long *nr_to_write) { return filemap_writeback(mapping, 0, LLONG_MAX, WB_SYNC_NONE, nr_to_write); } EXPORT_SYMBOL_FOR_MODULES(filemap_flush_nr, "btrfs"); /** * filemap_range_has_page - check if a page exists in range. * @mapping: address space within which to check * @start_byte: offset in bytes where the range starts * @end_byte: offset in bytes where the range ends (inclusive) * * Find at least one page in the range supplied, usually used to check if * direct writing in this range will trigger a writeback. * * Return: %true if at least one page exists in the specified range, * %false otherwise. */ bool filemap_range_has_page(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { struct folio *folio; XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); pgoff_t max = end_byte >> PAGE_SHIFT; if (end_byte < start_byte) return false; rcu_read_lock(); for (;;) { folio = xas_find(&xas, max); if (xas_retry(&xas, folio)) continue; /* Shadow entries don't count */ if (xa_is_value(folio)) continue; /* * We don't need to try to pin this page; we're about to * release the RCU lock anyway. It is enough to know that * there was a page here recently. 
*/ break; } rcu_read_unlock(); return folio != NULL; } EXPORT_SYMBOL(filemap_range_has_page); static void __filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { pgoff_t index = start_byte >> PAGE_SHIFT; pgoff_t end = end_byte >> PAGE_SHIFT; struct folio_batch fbatch; unsigned nr_folios; folio_batch_init(&fbatch); while (index <= end) { unsigned i; nr_folios = filemap_get_folios_tag(mapping, &index, end, PAGECACHE_TAG_WRITEBACK, &fbatch); if (!nr_folios) break; for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; folio_wait_writeback(folio); } folio_batch_release(&fbatch); cond_resched(); } } /** * filemap_fdatawait_range - wait for writeback to complete * @mapping: address space structure to wait for * @start_byte: offset in bytes where the range starts * @end_byte: offset in bytes where the range ends (inclusive) * * Walk the list of under-writeback pages of the given address space * in the given range and wait for all of them. Check error status of * the address space and return it. * * Since the error status of the address space is cleared by this function, * callers are responsible for checking the return value and handling and/or * reporting the error. * * Return: error status of the address space. */ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { __filemap_fdatawait_range(mapping, start_byte, end_byte); return filemap_check_errors(mapping); } EXPORT_SYMBOL(filemap_fdatawait_range); /** * filemap_fdatawait_range_keep_errors - wait for writeback to complete * @mapping: address space structure to wait for * @start_byte: offset in bytes where the range starts * @end_byte: offset in bytes where the range ends (inclusive) * * Walk the list of under-writeback pages of the given address space in the * given range and wait for all of them. Unlike filemap_fdatawait_range(), * this function does not clear error status of the address space. * * Use this function if callers don't handle errors themselves. Expected * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), * fsfreeze(8) */ int filemap_fdatawait_range_keep_errors(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { __filemap_fdatawait_range(mapping, start_byte, end_byte); return filemap_check_and_keep_errors(mapping); } EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors); /** * file_fdatawait_range - wait for writeback to complete * @file: file pointing to address space structure to wait for * @start_byte: offset in bytes where the range starts * @end_byte: offset in bytes where the range ends (inclusive) * * Walk the list of under-writeback pages of the address space that file * refers to, in the given range and wait for all of them. Check error * status of the address space vs. the file->f_wb_err cursor and return it. * * Since the error status of the file is advanced by this function, * callers are responsible for checking the return value and handling and/or * reporting the error. * * Return: error status of the address space vs. the file->f_wb_err cursor. 
*/ int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte) { struct address_space *mapping = file->f_mapping; __filemap_fdatawait_range(mapping, start_byte, end_byte); return file_check_and_advance_wb_err(file); } EXPORT_SYMBOL(file_fdatawait_range); /** * filemap_fdatawait_keep_errors - wait for writeback without clearing errors * @mapping: address space structure to wait for * * Walk the list of under-writeback pages of the given address space * and wait for all of them. Unlike filemap_fdatawait(), this function * does not clear error status of the address space. * * Use this function if callers don't handle errors themselves. Expected * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), * fsfreeze(8) * * Return: error status of the address space. */ int filemap_fdatawait_keep_errors(struct address_space *mapping) { __filemap_fdatawait_range(mapping, 0, LLONG_MAX); return filemap_check_and_keep_errors(mapping); } EXPORT_SYMBOL(filemap_fdatawait_keep_errors); /* Returns true if writeback might be needed or already in progress. */ static bool mapping_needs_writeback(struct address_space *mapping) { return mapping->nrpages; } bool filemap_range_has_writeback(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); pgoff_t max = end_byte >> PAGE_SHIFT; struct folio *folio; if (end_byte < start_byte) return false; rcu_read_lock(); xas_for_each(&xas, folio, max) { if (xas_retry(&xas, folio)) continue; if (xa_is_value(folio)) continue; if (folio_test_dirty(folio) || folio_test_locked(folio) || folio_test_writeback(folio)) break; } rcu_read_unlock(); return folio != NULL; } EXPORT_SYMBOL_GPL(filemap_range_has_writeback); /** * filemap_write_and_wait_range - write out & wait on a file range * @mapping: the address_space for the pages * @lstart: offset in bytes where the range starts * @lend: offset in bytes where the range ends (inclusive) * * Write out and wait upon file offsets lstart->lend, inclusive. * * Note that @lend is inclusive (describes the last byte to be written) so * that this function can be used to write to the very end-of-file (end = -1). * * Return: error status of the address space. */ int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend) { int err = 0, err2; if (lend < lstart) return 0; if (mapping_needs_writeback(mapping)) { err = filemap_fdatawrite_range(mapping, lstart, lend); /* * Even if the above returned error, the pages may be * written partially (e.g. -ENOSPC), so we wait for it. * But the -EIO is special case, it may indicate the worst * thing (e.g. bug) happened, so we avoid waiting for it. */ if (err != -EIO) __filemap_fdatawait_range(mapping, lstart, lend); } err2 = filemap_check_errors(mapping); if (!err) err = err2; return err; } EXPORT_SYMBOL(filemap_write_and_wait_range); void __filemap_set_wb_err(struct address_space *mapping, int err) { errseq_t eseq = errseq_set(&mapping->wb_err, err); trace_filemap_set_wb_err(mapping, eseq); } EXPORT_SYMBOL(__filemap_set_wb_err); /** * file_check_and_advance_wb_err - report wb error (if any) that was previously * and advance wb_err to current one * @file: struct file on which the error is being reported * * When userland calls fsync (or something like nfsd does the equivalent), we * want to report any writeback errors that occurred since the last fsync (or * since the file was opened if there haven't been any). * * Grab the wb_err from the mapping. 
If it matches what we have in the file, * then just quickly return 0. The file is all caught up. * * If it doesn't match, then take the mapping value, set the "seen" flag in * it and try to swap it into place. If it works, or another task beat us * to it with the new value, then update the f_wb_err and return the error * portion. The error at this point must be reported via proper channels * (a'la fsync, or NFS COMMIT operation, etc.). * * While we handle mapping->wb_err with atomic operations, the f_wb_err * value is protected by the f_lock since we must ensure that it reflects * the latest value swapped in for this file descriptor. * * Return: %0 on success, negative error code otherwise. */ int file_check_and_advance_wb_err(struct file *file) { int err = 0; errseq_t old = READ_ONCE(file->f_wb_err); struct address_space *mapping = file->f_mapping; /* Locklessly handle the common case where nothing has changed */ if (errseq_check(&mapping->wb_err, old)) { /* Something changed, must use slow path */ spin_lock(&file->f_lock); old = file->f_wb_err; err = errseq_check_and_advance(&mapping->wb_err, &file->f_wb_err); trace_file_check_and_advance_wb_err(file, old); spin_unlock(&file->f_lock); } /* * We're mostly using this function as a drop in replacement for * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect * that the legacy code would have had on these flags. */ clear_bit(AS_EIO, &mapping->flags); clear_bit(AS_ENOSPC, &mapping->flags); return err; } EXPORT_SYMBOL(file_check_and_advance_wb_err); /** * file_write_and_wait_range - write out & wait on a file range * @file: file pointing to address_space with pages * @lstart: offset in bytes where the range starts * @lend: offset in bytes where the range ends (inclusive) * * Write out and wait upon file offsets lstart->lend, inclusive. * * Note that @lend is inclusive (describes the last byte to be written) so * that this function can be used to write to the very end-of-file (end = -1). * * After writing out and waiting on the data, we check and advance the * f_wb_err cursor to the latest value, and return any errors detected there. * * Return: %0 on success, negative error code otherwise. */ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) { int err = 0, err2; struct address_space *mapping = file->f_mapping; if (lend < lstart) return 0; if (mapping_needs_writeback(mapping)) { err = filemap_fdatawrite_range(mapping, lstart, lend); /* See comment of filemap_write_and_wait() */ if (err != -EIO) __filemap_fdatawait_range(mapping, lstart, lend); } err2 = file_check_and_advance_wb_err(file); if (!err) err = err2; return err; } EXPORT_SYMBOL(file_write_and_wait_range); /** * replace_page_cache_folio - replace a pagecache folio with a new one * @old: folio to be replaced * @new: folio to replace with * * This function replaces a folio in the pagecache with a new one. On * success it acquires the pagecache reference for the new folio and * drops it for the old folio. Both the old and new folios must be * locked. This function does not add the new folio to the LRU, the * caller must do that. * * The remove + add is atomic. This function cannot fail. 
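 *
 * A rough calling sketch (illustrative only, assuming @new was freshly
 * allocated and is not yet part of any mapping): both folios are locked
 * around the call, and the caller adds the new folio to the LRU as noted
 * above.
 *
 *	folio_lock(old);
 *	folio_lock(new);
 *	replace_page_cache_folio(old, new);
 *	folio_add_lru(new);
 *	folio_unlock(new);
 *	folio_unlock(old);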
*/ void replace_page_cache_folio(struct folio *old, struct folio *new) { struct address_space *mapping = old->mapping; void (*free_folio)(struct folio *) = mapping->a_ops->free_folio; pgoff_t offset = old->index; XA_STATE(xas, &mapping->i_pages, offset); VM_BUG_ON_FOLIO(!folio_test_locked(old), old); VM_BUG_ON_FOLIO(!folio_test_locked(new), new); VM_BUG_ON_FOLIO(new->mapping, new); folio_get(new); new->mapping = mapping; new->index = offset; mem_cgroup_replace_folio(old, new); xas_lock_irq(&xas); xas_store(&xas, new); old->mapping = NULL; /* hugetlb pages do not participate in page cache accounting. */ if (!folio_test_hugetlb(old)) lruvec_stat_sub_folio(old, NR_FILE_PAGES); if (!folio_test_hugetlb(new)) lruvec_stat_add_folio(new, NR_FILE_PAGES); if (folio_test_swapbacked(old)) lruvec_stat_sub_folio(old, NR_SHMEM); if (folio_test_swapbacked(new)) lruvec_stat_add_folio(new, NR_SHMEM); xas_unlock_irq(&xas); if (free_folio) free_folio(old); folio_put(old); } EXPORT_SYMBOL_GPL(replace_page_cache_folio); noinline int __filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp) { XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); bool huge; long nr; unsigned int forder = folio_order(folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio); VM_BUG_ON_FOLIO(folio_order(folio) < mapping_min_folio_order(mapping), folio); mapping_set_update(&xas, mapping); VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); huge = folio_test_hugetlb(folio); nr = folio_nr_pages(folio); gfp &= GFP_RECLAIM_MASK; folio_ref_add(folio, nr); folio->mapping = mapping; folio->index = xas.xa_index; for (;;) { int order = -1; void *entry, *old = NULL; xas_lock_irq(&xas); xas_for_each_conflict(&xas, entry) { old = entry; if (!xa_is_value(entry)) { xas_set_err(&xas, -EEXIST); goto unlock; } /* * If a larger entry exists, * it will be the first and only entry iterated. */ if (order == -1) order = xas_get_order(&xas); } if (old) { if (order > 0 && order > forder) { unsigned int split_order = max(forder, xas_try_split_min_order(order)); /* How to handle large swap entries? 
*/ BUG_ON(shmem_mapping(mapping)); while (order > forder) { xas_set_order(&xas, index, split_order); xas_try_split(&xas, old, order); if (xas_error(&xas)) goto unlock; order = split_order; split_order = max(xas_try_split_min_order( split_order), forder); } xas_reset(&xas); } if (shadowp) *shadowp = old; } xas_store(&xas, folio); if (xas_error(&xas)) goto unlock; mapping->nrpages += nr; /* hugetlb pages do not participate in page cache accounting */ if (!huge) { lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); if (folio_test_pmd_mappable(folio)) lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr); } unlock: xas_unlock_irq(&xas); if (!xas_nomem(&xas, gfp)) break; } if (xas_error(&xas)) goto error; trace_mm_filemap_add_to_page_cache(folio); return 0; error: folio->mapping = NULL; /* Leave folio->index set: truncation relies upon it */ folio_put_refs(folio, nr); return xas_error(&xas); } ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO); int filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp) { void *shadow = NULL; int ret; struct mem_cgroup *tmp; bool kernel_file = test_bit(AS_KERNEL_FILE, &mapping->flags); if (kernel_file) tmp = set_active_memcg(root_mem_cgroup); ret = mem_cgroup_charge(folio, NULL, gfp); if (kernel_file) set_active_memcg(tmp); if (ret) return ret; __folio_set_locked(folio); ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow); if (unlikely(ret)) { mem_cgroup_uncharge(folio); __folio_clear_locked(folio); } else { /* * The folio might have been evicted from cache only * recently, in which case it should be activated like * any other repeatedly accessed folio. * The exception is folios getting rewritten; evicting other * data from the working set, only to cache data that will * get overwritten with something else, is a waste of memory. */ WARN_ON_ONCE(folio_test_active(folio)); if (!(gfp & __GFP_WRITE) && shadow) workingset_refault(folio, shadow); folio_add_lru(folio); if (kernel_file) mod_node_page_state(folio_pgdat(folio), NR_KERNEL_FILE_PAGES, folio_nr_pages(folio)); } return ret; } EXPORT_SYMBOL_GPL(filemap_add_folio); #ifdef CONFIG_NUMA struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order, struct mempolicy *policy) { int n; struct folio *folio; if (policy) return folio_alloc_mpol_noprof(gfp, order, policy, NO_INTERLEAVE_INDEX, numa_node_id()); if (cpuset_do_page_mem_spread()) { unsigned int cpuset_mems_cookie; do { cpuset_mems_cookie = read_mems_allowed_begin(); n = cpuset_mem_spread_node(); folio = __folio_alloc_node_noprof(gfp, order, n); } while (!folio && read_mems_allowed_retry(cpuset_mems_cookie)); return folio; } return folio_alloc_noprof(gfp, order); } EXPORT_SYMBOL(filemap_alloc_folio_noprof); #endif /* * filemap_invalidate_lock_two - lock invalidate_lock for two mappings * * Lock exclusively invalidate_lock of any passed mapping that is not NULL. * * @mapping1: the first mapping to lock * @mapping2: the second mapping to lock */ void filemap_invalidate_lock_two(struct address_space *mapping1, struct address_space *mapping2) { if (mapping1 > mapping2) swap(mapping1, mapping2); if (mapping1) down_write(&mapping1->invalidate_lock); if (mapping2 && mapping1 != mapping2) down_write_nested(&mapping2->invalidate_lock, 1); } EXPORT_SYMBOL(filemap_invalidate_lock_two); /* * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings * * Unlock exclusive invalidate_lock of any passed mapping that is not NULL. 
* * @mapping1: the first mapping to unlock * @mapping2: the second mapping to unlock */ void filemap_invalidate_unlock_two(struct address_space *mapping1, struct address_space *mapping2) { if (mapping1) up_write(&mapping1->invalidate_lock); if (mapping2 && mapping1 != mapping2) up_write(&mapping2->invalidate_lock); } EXPORT_SYMBOL(filemap_invalidate_unlock_two); /* * In order to wait for pages to become available there must be * waitqueues associated with pages. By using a hash table of * waitqueues where the bucket discipline is to maintain all * waiters on the same queue and wake all when any of the pages * become available, and for the woken contexts to check to be * sure the appropriate page became available, this saves space * at a cost of "thundering herd" phenomena during rare hash * collisions. */ #define PAGE_WAIT_TABLE_BITS 8 #define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS) static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned; static wait_queue_head_t *folio_waitqueue(struct folio *folio) { return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)]; } /* How many times do we accept lock stealing from under a waiter? */ static int sysctl_page_lock_unfairness = 5; static const struct ctl_table filemap_sysctl_table[] = { { .procname = "page_lock_unfairness", .data = &sysctl_page_lock_unfairness, .maxlen = sizeof(sysctl_page_lock_unfairness), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, } }; void __init pagecache_init(void) { int i; for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++) init_waitqueue_head(&folio_wait_table[i]); page_writeback_init(); register_sysctl_init("vm", filemap_sysctl_table); } /* * The page wait code treats the "wait->flags" somewhat unusually, because * we have multiple different kinds of waits, not just the usual "exclusive" * one. * * We have: * * (a) no special bits set: * * We're just waiting for the bit to be released, and when a waker * calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up, * and remove it from the wait queue. * * Simple and straightforward. * * (b) WQ_FLAG_EXCLUSIVE: * * The waiter is waiting to get the lock, and only one waiter should * be woken up to avoid any thundering herd behavior. We'll set the * WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue. * * This is the traditional exclusive wait. * * (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM: * * The waiter is waiting to get the bit, and additionally wants the * lock to be transferred to it for fair lock behavior. If the lock * cannot be taken, we stop walking the wait queue without waking * the waiter. * * This is the "fair lock handoff" case, and in addition to setting * WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see * that it now has the lock. */ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { unsigned int flags; struct wait_page_key *key = arg; struct wait_page_queue *wait_page = container_of(wait, struct wait_page_queue, wait); if (!wake_page_match(wait_page, key)) return 0; /* * If it's a lock handoff wait, we get the bit for it, and * stop walking (and do not wake it up) if we can't. 
*/ flags = wait->flags; if (flags & WQ_FLAG_EXCLUSIVE) { if (test_bit(key->bit_nr, &key->folio->flags.f)) return -1; if (flags & WQ_FLAG_CUSTOM) { if (test_and_set_bit(key->bit_nr, &key->folio->flags.f)) return -1; flags |= WQ_FLAG_DONE; } } /* * We are holding the wait-queue lock, but the waiter that * is waiting for this will be checking the flags without * any locking. * * So update the flags atomically, and wake up the waiter * afterwards to avoid any races. This store-release pairs * with the load-acquire in folio_wait_bit_common(). */ smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN); wake_up_state(wait->private, mode); /* * Ok, we have successfully done what we're waiting for, * and we can unconditionally remove the wait entry. * * Note that this pairs with the "finish_wait()" in the * waiter, and has to be the absolute last thing we do. * After this list_del_init(&wait->entry) the wait entry * might be de-allocated and the process might even have * exited. */ list_del_init_careful(&wait->entry); return (flags & WQ_FLAG_EXCLUSIVE) != 0; } static void folio_wake_bit(struct folio *folio, int bit_nr) { wait_queue_head_t *q = folio_waitqueue(folio); struct wait_page_key key; unsigned long flags; key.folio = folio; key.bit_nr = bit_nr; key.page_match = 0; spin_lock_irqsave(&q->lock, flags); __wake_up_locked_key(q, TASK_NORMAL, &key); /* * It's possible to miss clearing waiters here, when we woke our page * waiters, but the hashed waitqueue has waiters for other pages on it. * That's okay, it's a rare case. The next waker will clear it. * * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE, * other), the flag may be cleared in the course of freeing the page; * but that is not required for correctness. */ if (!waitqueue_active(q) || !key.page_match) folio_clear_waiters(folio); spin_unlock_irqrestore(&q->lock, flags); } /* * A choice of three behaviors for folio_wait_bit_common(): */ enum behavior { EXCLUSIVE, /* Hold ref to page and take the bit when woken, like * __folio_lock() waiting on then setting PG_locked. */ SHARED, /* Hold ref to page and check the bit when woken, like * folio_wait_writeback() waiting on PG_writeback. */ DROP, /* Drop ref to page before wait, no check when woken, * like folio_put_wait_locked() on PG_locked. */ }; /* * Attempt to check (or get) the folio flag, and mark us done * if successful. */ static inline bool folio_trylock_flag(struct folio *folio, int bit_nr, struct wait_queue_entry *wait) { if (wait->flags & WQ_FLAG_EXCLUSIVE) { if (test_and_set_bit(bit_nr, &folio->flags.f)) return false; } else if (test_bit(bit_nr, &folio->flags.f)) return false; wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE; return true; } static inline int folio_wait_bit_common(struct folio *folio, int bit_nr, int state, enum behavior behavior) { wait_queue_head_t *q = folio_waitqueue(folio); int unfairness = sysctl_page_lock_unfairness; struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; bool thrashing = false; unsigned long pflags; bool in_thrashing; if (bit_nr == PG_locked && !folio_test_uptodate(folio) && folio_test_workingset(folio)) { delayacct_thrashing_start(&in_thrashing); psi_memstall_enter(&pflags); thrashing = true; } init_wait(wait); wait->func = wake_page_function; wait_page.folio = folio; wait_page.bit_nr = bit_nr; repeat: wait->flags = 0; if (behavior == EXCLUSIVE) { wait->flags = WQ_FLAG_EXCLUSIVE; if (--unfairness < 0) wait->flags |= WQ_FLAG_CUSTOM; } /* * Do one last check whether we can get the * page bit synchronously. 
* * Do the folio_set_waiters() marking before that * to let any waker we _just_ missed know they * need to wake us up (otherwise they'll never * even go to the slow case that looks at the * page queue), and add ourselves to the wait * queue if we need to sleep. * * This part needs to be done under the queue * lock to avoid races. */ spin_lock_irq(&q->lock); folio_set_waiters(folio); if (!folio_trylock_flag(folio, bit_nr, wait)) __add_wait_queue_entry_tail(q, wait); spin_unlock_irq(&q->lock); /* * From now on, all the logic will be based on * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to * see whether the page bit testing has already * been done by the wake function. * * We can drop our reference to the folio. */ if (behavior == DROP) folio_put(folio); /* * Note that until the "finish_wait()", or until * we see the WQ_FLAG_WOKEN flag, we need to * be very careful with the 'wait->flags', because * we may race with a waker that sets them. */ for (;;) { unsigned int flags; set_current_state(state); /* Loop until we've been woken or interrupted */ flags = smp_load_acquire(&wait->flags); if (!(flags & WQ_FLAG_WOKEN)) { if (signal_pending_state(state, current)) break; io_schedule(); continue; } /* If we were non-exclusive, we're done */ if (behavior != EXCLUSIVE) break; /* If the waker got the lock for us, we're done */ if (flags & WQ_FLAG_DONE) break; /* * Otherwise, if we're getting the lock, we need to * try to get it ourselves. * * And if that fails, we'll have to retry this all. */ if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0)))) goto repeat; wait->flags |= WQ_FLAG_DONE; break; } /* * If a signal happened, this 'finish_wait()' may remove the last * waiter from the wait-queues, but the folio waiters bit will remain * set. That's ok. The next wakeup will take care of it, and trying * to do it here would be difficult and prone to races. */ finish_wait(q, wait); if (thrashing) { delayacct_thrashing_end(&in_thrashing); psi_memstall_leave(&pflags); } /* * NOTE! The wait->flags weren't stable until we've done the * 'finish_wait()', and we could have exited the loop above due * to a signal, and had a wakeup event happen after the signal * test but before the 'finish_wait()'. * * So only after the finish_wait() can we reliably determine * if we got woken up or not, so we can now figure out the final * return value based on that state without races. * * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive * waiter, but an exclusive one requires WQ_FLAG_DONE. */ if (behavior == EXCLUSIVE) return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR; return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR; } #ifdef CONFIG_MIGRATION /** * softleaf_entry_wait_on_locked - Wait for a migration entry or * device_private entry to be removed. * @entry: migration or device_private swap entry. * @ptl: already locked ptl. This function will drop the lock. * * Wait for a migration entry referencing the given page, or device_private * entry referencing a dvice_private page to be unlocked. This is * equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except * this can be called without taking a reference on the page. Instead this * should be called while holding the ptl for @entry referencing * the page. * * Returns after unlocking the ptl. * * This follows the same logic as folio_wait_bit_common() so see the comments * there. 
*/ void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) __releases(ptl) { struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; bool thrashing = false; unsigned long pflags; bool in_thrashing; wait_queue_head_t *q; struct folio *folio = softleaf_to_folio(entry); q = folio_waitqueue(folio); if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) { delayacct_thrashing_start(&in_thrashing); psi_memstall_enter(&pflags); thrashing = true; } init_wait(wait); wait->func = wake_page_function; wait_page.folio = folio; wait_page.bit_nr = PG_locked; wait->flags = 0; spin_lock_irq(&q->lock); folio_set_waiters(folio); if (!folio_trylock_flag(folio, PG_locked, wait)) __add_wait_queue_entry_tail(q, wait); spin_unlock_irq(&q->lock); /* * If a migration entry exists for the page the migration path must hold * a valid reference to the page, and it must take the ptl to remove the * migration entry. So the page is valid until the ptl is dropped. * Similarly any path attempting to drop the last reference to a * device-private page needs to grab the ptl to remove the device-private * entry. */ spin_unlock(ptl); for (;;) { unsigned int flags; set_current_state(TASK_UNINTERRUPTIBLE); /* Loop until we've been woken or interrupted */ flags = smp_load_acquire(&wait->flags); if (!(flags & WQ_FLAG_WOKEN)) { if (signal_pending_state(TASK_UNINTERRUPTIBLE, current)) break; io_schedule(); continue; } break; } finish_wait(q, wait); if (thrashing) { delayacct_thrashing_end(&in_thrashing); psi_memstall_leave(&pflags); } } #endif void folio_wait_bit(struct folio *folio, int bit_nr) { folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED); } EXPORT_SYMBOL(folio_wait_bit); int folio_wait_bit_killable(struct folio *folio, int bit_nr) { return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED); } EXPORT_SYMBOL(folio_wait_bit_killable); /** * folio_put_wait_locked - Drop a reference and wait for it to be unlocked * @folio: The folio to wait for. * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc). * * The caller should hold a reference on @folio. They expect the page to * become unlocked relatively soon, but do not wish to hold up migration * (for example) by holding the reference while waiting for the folio to * come unlocked. After this function returns, the caller should not * dereference @folio. * * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal. */ static int folio_put_wait_locked(struct folio *folio, int state) { return folio_wait_bit_common(folio, PG_locked, state, DROP); } /** * folio_unlock - Unlock a locked folio. * @folio: The folio. * * Unlocks the folio and wakes up any thread sleeping on the page lock. * * Context: May be called from interrupt or process context. May not be * called from NMI context. */ void folio_unlock(struct folio *folio) { /* Bit 7 allows x86 to check the byte's sign bit */ BUILD_BUG_ON(PG_waiters != 7); BUILD_BUG_ON(PG_locked > 7); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (folio_xor_flags_has_waiters(folio, 1 << PG_locked)) folio_wake_bit(folio, PG_locked); } EXPORT_SYMBOL(folio_unlock); /** * folio_end_read - End read on a folio. * @folio: The folio. * @success: True if all reads completed successfully. * * When all reads against a folio have completed, filesystems should * call this function to let the pagecache know that no more reads * are outstanding. This will unlock the folio and wake up any thread * sleeping on the lock. 
The folio will also be marked uptodate if all * reads succeeded. * * Context: May be called from interrupt or process context. May not be * called from NMI context. */ void folio_end_read(struct folio *folio, bool success) { unsigned long mask = 1 << PG_locked; /* Must be in bottom byte for x86 to work */ BUILD_BUG_ON(PG_uptodate > 7); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(success && folio_test_uptodate(folio), folio); if (likely(success)) mask |= 1 << PG_uptodate; if (folio_xor_flags_has_waiters(folio, mask)) folio_wake_bit(folio, PG_locked); } EXPORT_SYMBOL(folio_end_read); /** * folio_end_private_2 - Clear PG_private_2 and wake any waiters. * @folio: The folio. * * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for * it. The folio reference held for PG_private_2 being set is released. * * This is, for example, used when a netfs folio is being written to a local * disk cache, thereby allowing writes to the cache for the same folio to be * serialised. */ void folio_end_private_2(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio); clear_bit_unlock(PG_private_2, folio_flags(folio, 0)); folio_wake_bit(folio, PG_private_2); folio_put(folio); } EXPORT_SYMBOL(folio_end_private_2); /** * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio. * @folio: The folio to wait on. * * Wait for PG_private_2 to be cleared on a folio. */ void folio_wait_private_2(struct folio *folio) { while (folio_test_private_2(folio)) folio_wait_bit(folio, PG_private_2); } EXPORT_SYMBOL(folio_wait_private_2); /** * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio. * @folio: The folio to wait on. * * Wait for PG_private_2 to be cleared on a folio or until a fatal signal is * received by the calling task. * * Return: * - 0 if successful. * - -EINTR if a fatal signal was encountered. */ int folio_wait_private_2_killable(struct folio *folio) { int ret = 0; while (folio_test_private_2(folio)) { ret = folio_wait_bit_killable(folio, PG_private_2); if (ret < 0) break; } return ret; } EXPORT_SYMBOL(folio_wait_private_2_killable); static void filemap_end_dropbehind(struct folio *folio) { struct address_space *mapping = folio->mapping; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (folio_test_writeback(folio) || folio_test_dirty(folio)) return; if (!folio_test_clear_dropbehind(folio)) return; if (mapping) folio_unmap_invalidate(mapping, folio, 0); } /* * If folio was marked as dropbehind, then pages should be dropped when writeback * completes. Do that now. If we fail, it's likely because of a big folio - * just reset dropbehind for that case and latter completions should invalidate. */ void folio_end_dropbehind(struct folio *folio) { if (!folio_test_dropbehind(folio)) return; /* * Hitting !in_task() should not happen off RWF_DONTCACHE writeback, * but can happen if normal writeback just happens to find dirty folios * that were created as part of uncached writeback, and that writeback * would otherwise not need non-IRQ handling. Just skip the * invalidation in that case. */ if (in_task() && folio_trylock(folio)) { filemap_end_dropbehind(folio); folio_unlock(folio); } } EXPORT_SYMBOL_GPL(folio_end_dropbehind); /** * folio_end_writeback_no_dropbehind - End writeback against a folio. * @folio: The folio. * * The folio must actually be under writeback. * This call is intended for filesystems that need to defer dropbehind. * * Context: May be called from process or interrupt context. 
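 *
 * A rough completion sketch (hypothetical filesystem writeback-completion
 * path; example_fs_finish_io() is an illustrative placeholder): end
 * writeback first, do the deferred filesystem work, then handle dropbehind,
 * holding an extra reference so the folio cannot be freed in between.
 *
 *	folio_get(folio);
 *	folio_end_writeback_no_dropbehind(folio);
 *	example_fs_finish_io(folio);
 *	folio_end_dropbehind(folio);
 *	folio_put(folio);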
*/ void folio_end_writeback_no_dropbehind(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio); /* * folio_test_clear_reclaim() could be used here but it is an * atomic operation and overkill in this particular case. Failing * to shuffle a folio marked for immediate reclaim is too mild * a gain to justify taking an atomic operation penalty at the * end of every folio writeback. */ if (folio_test_reclaim(folio)) { folio_clear_reclaim(folio); folio_rotate_reclaimable(folio); } if (__folio_end_writeback(folio)) folio_wake_bit(folio, PG_writeback); acct_reclaim_writeback(folio); } EXPORT_SYMBOL_GPL(folio_end_writeback_no_dropbehind); /** * folio_end_writeback - End writeback against a folio. * @folio: The folio. * * The folio must actually be under writeback. * * Context: May be called from process or interrupt context. */ void folio_end_writeback(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio); /* * Writeback does not hold a folio reference of its own, relying * on truncation to wait for the clearing of PG_writeback. * But here we must make sure that the folio is not freed and * reused before the folio_wake_bit(). */ folio_get(folio); folio_end_writeback_no_dropbehind(folio); folio_end_dropbehind(folio); folio_put(folio); } EXPORT_SYMBOL(folio_end_writeback); /** * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it. * @folio: The folio to lock */ void __folio_lock(struct folio *folio) { folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE, EXCLUSIVE); } EXPORT_SYMBOL(__folio_lock); int __folio_lock_killable(struct folio *folio) { return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE, EXCLUSIVE); } EXPORT_SYMBOL_GPL(__folio_lock_killable); static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait) { struct wait_queue_head *q = folio_waitqueue(folio); int ret; wait->folio = folio; wait->bit_nr = PG_locked; spin_lock_irq(&q->lock); __add_wait_queue_entry_tail(q, &wait->wait); folio_set_waiters(folio); ret = !folio_trylock(folio); /* * If we were successful now, we know we're still on the * waitqueue as we're still under the lock. This means it's * safe to remove and return success, we know the callback * isn't going to trigger. */ if (!ret) __remove_wait_queue(q, &wait->wait); else ret = -EIOCBQUEUED; spin_unlock_irq(&q->lock); return ret; } /* * Return values: * 0 - folio is locked. * non-zero - folio is not locked. * mmap_lock or per-VMA lock has been released (mmap_read_unlock() or * vma_end_read()), unless flags had both FAULT_FLAG_ALLOW_RETRY and * FAULT_FLAG_RETRY_NOWAIT set, in which case the lock is still held. * * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0 * with the folio locked and the mmap_lock/per-VMA lock is left unperturbed. */ vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf) { unsigned int flags = vmf->flags; if (fault_flag_allow_retry_first(flags)) { /* * CAUTION! In this case, mmap_lock/per-VMA lock is not * released even though returning VM_FAULT_RETRY. */ if (flags & FAULT_FLAG_RETRY_NOWAIT) return VM_FAULT_RETRY; release_fault_lock(vmf); if (flags & FAULT_FLAG_KILLABLE) folio_wait_locked_killable(folio); else folio_wait_locked(folio); return VM_FAULT_RETRY; } if (flags & FAULT_FLAG_KILLABLE) { bool ret; ret = __folio_lock_killable(folio); if (ret) { release_fault_lock(vmf); return VM_FAULT_RETRY; } } else { __folio_lock(folio); } return 0; } /** * page_cache_next_miss() - Find the next gap in the page cache. 
* @mapping: Mapping. * @index: Index. * @max_scan: Maximum range to search. * * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the * gap with the lowest index. * * This function may be called under the rcu_read_lock. However, this will * not atomically search a snapshot of the cache at a single point in time. * For example, if a gap is created at index 5, then subsequently a gap is * created at index 10, page_cache_next_miss covering both indices may * return 10 if called under the rcu_read_lock. * * Return: The index of the gap if found, otherwise an index outside the * range specified (in which case 'return - index >= max_scan' will be true). * In the rare case of index wrap-around, 0 will be returned. */ pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { XA_STATE(xas, &mapping->i_pages, index); unsigned long nr = max_scan; while (nr--) { void *entry = xas_next(&xas); if (!entry || xa_is_value(entry)) return xas.xa_index; if (xas.xa_index == 0) return 0; } return index + max_scan; } EXPORT_SYMBOL(page_cache_next_miss); /** * page_cache_prev_miss() - Find the previous gap in the page cache. * @mapping: Mapping. * @index: Index. * @max_scan: Maximum range to search. * * Search the range [max(index - max_scan + 1, 0), index] for the * gap with the highest index. * * This function may be called under the rcu_read_lock. However, this will * not atomically search a snapshot of the cache at a single point in time. * For example, if a gap is created at index 10, then subsequently a gap is * created at index 5, page_cache_prev_miss() covering both indices may * return 5 if called under the rcu_read_lock. * * Return: The index of the gap if found, otherwise an index outside the * range specified (in which case 'index - return >= max_scan' will be true). * In the rare case of wrap-around, ULONG_MAX will be returned. */ pgoff_t page_cache_prev_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { XA_STATE(xas, &mapping->i_pages, index); while (max_scan--) { void *entry = xas_prev(&xas); if (!entry || xa_is_value(entry)) break; if (xas.xa_index == ULONG_MAX) break; } return xas.xa_index; } EXPORT_SYMBOL(page_cache_prev_miss); /* * Lockless page cache protocol: * On the lookup side: * 1. Load the folio from i_pages * 2. Increment the refcount if it's not zero * 3. If the folio is not found by xas_reload(), put the refcount and retry * * On the removal side: * A. Freeze the page (by zeroing the refcount if nobody else has a reference) * B. Remove the page from i_pages * C. Return the page to the page allocator * * This means that any page may have its reference count temporarily * increased by a speculative page cache (or GUP-fast) lookup as it can * be allocated by another user before the RCU grace period expires. * Because the refcount temporarily acquired here may end up being the * last refcount on the page, any page allocation must be freeable by * folio_put(). */ /* * filemap_get_entry - Get a page cache entry. * @mapping: the address_space to search * @index: The page cache index. * * Looks up the page cache entry at @mapping & @index. If it is a folio, * it is returned with an increased refcount. If it is a shadow entry * of a previously evicted folio, or a swap entry from shmem/tmpfs, * it is returned without further action. * * Return: The folio, swap or shadow entry, %NULL if nothing is found. 
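 *
 * A minimal lookup sketch (illustrative only; example_inspect_folio() is a
 * placeholder): callers must check for value entries before treating the
 * result as a folio, since no reference is taken on shadow or swap entries.
 *
 *	void *entry = filemap_get_entry(mapping, index);
 *
 *	if (entry && !xa_is_value(entry)) {
 *		struct folio *folio = entry;
 *
 *		example_inspect_folio(folio);
 *		folio_put(folio);
 *	}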
*/ void *filemap_get_entry(struct address_space *mapping, pgoff_t index) { XA_STATE(xas, &mapping->i_pages, index); struct folio *folio; rcu_read_lock(); repeat: xas_reset(&xas); folio = xas_load(&xas); if (xas_retry(&xas, folio)) goto repeat; /* * A shadow entry of a recently evicted page, or a swap entry from * shmem/tmpfs. Return it without attempting to raise page count. */ if (!folio || xa_is_value(folio)) goto out; if (!folio_try_get(folio)) goto repeat; if (unlikely(folio != xas_reload(&xas))) { folio_put(folio); goto repeat; } out: rcu_read_unlock(); return folio; } /** * __filemap_get_folio_mpol - Find and get a reference to a folio. * @mapping: The address_space to search. * @index: The page index. * @fgp_flags: %FGP flags modify how the folio is returned. * @gfp: Memory allocation flags to use if %FGP_CREAT is specified. * @policy: NUMA memory allocation policy to follow. * * Looks up the page cache entry at @mapping & @index. * * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even * if the %GFP flags specified for %FGP_CREAT are atomic. * * If this function returns a folio, it is returned with an increased refcount. * * Return: The found folio or an ERR_PTR() otherwise. */ struct folio *__filemap_get_folio_mpol(struct address_space *mapping, pgoff_t index, fgf_t fgp_flags, gfp_t gfp, struct mempolicy *policy) { struct folio *folio; repeat: folio = filemap_get_entry(mapping, index); if (xa_is_value(folio)) folio = NULL; if (!folio) goto no_page; if (fgp_flags & FGP_LOCK) { if (fgp_flags & FGP_NOWAIT) { if (!folio_trylock(folio)) { folio_put(folio); return ERR_PTR(-EAGAIN); } } else { folio_lock(folio); } /* Has the page been truncated? */ if (unlikely(folio->mapping != mapping)) { folio_unlock(folio); folio_put(folio); goto repeat; } VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); } if (fgp_flags & FGP_ACCESSED) folio_mark_accessed(folio); else if (fgp_flags & FGP_WRITE) { /* Clear idle flag for buffer write */ if (folio_test_idle(folio)) folio_clear_idle(folio); } if (fgp_flags & FGP_STABLE) folio_wait_stable(folio); no_page: if (!folio && (fgp_flags & FGP_CREAT)) { unsigned int min_order = mapping_min_folio_order(mapping); unsigned int order = max(min_order, FGF_GET_ORDER(fgp_flags)); int err; index = mapping_align_index(mapping, index); if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping)) gfp |= __GFP_WRITE; if (fgp_flags & FGP_NOFS) gfp &= ~__GFP_FS; if (fgp_flags & FGP_NOWAIT) { gfp &= ~GFP_KERNEL; gfp |= GFP_NOWAIT; } if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) fgp_flags |= FGP_LOCK; if (order > mapping_max_folio_order(mapping)) order = mapping_max_folio_order(mapping); /* If we're not aligned, allocate a smaller folio */ if (index & ((1UL << order) - 1)) order = __ffs(index); do { gfp_t alloc_gfp = gfp; err = -ENOMEM; if (order > min_order) alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN; folio = filemap_alloc_folio(alloc_gfp, order, policy); if (!folio) continue; /* Init accessed so avoid atomic mark_page_accessed later */ if (fgp_flags & FGP_ACCESSED) __folio_set_referenced(folio); if (fgp_flags & FGP_DONTCACHE) __folio_set_dropbehind(folio); err = filemap_add_folio(mapping, folio, index, gfp); if (!err) break; folio_put(folio); folio = NULL; } while (order-- > min_order); if (err == -EEXIST) goto repeat; if (err) { /* * When NOWAIT I/O fails to allocate folios this could * be due to a nonblocking memory allocation and not * because the system actually is out of memory. 
* Return -EAGAIN so that there caller retries in a * blocking fashion instead of propagating -ENOMEM * to the application. */ if ((fgp_flags & FGP_NOWAIT) && err == -ENOMEM) err = -EAGAIN; return ERR_PTR(err); } /* * filemap_add_folio locks the page, and for mmap * we expect an unlocked page. */ if (folio && (fgp_flags & FGP_FOR_MMAP)) folio_unlock(folio); } if (!folio) return ERR_PTR(-ENOENT); /* not an uncached lookup, clear uncached if set */ if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE)) folio_clear_dropbehind(folio); return folio; } EXPORT_SYMBOL(__filemap_get_folio_mpol); static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max, xa_mark_t mark) { struct folio *folio; retry: if (mark == XA_PRESENT) folio = xas_find(xas, max); else folio = xas_find_marked(xas, max, mark); if (xas_retry(xas, folio)) goto retry; /* * A shadow entry of a recently evicted page, a swap * entry from shmem/tmpfs or a DAX entry. Return it * without attempting to raise page count. */ if (!folio || xa_is_value(folio)) return folio; if (!folio_try_get(folio)) goto reset; if (unlikely(folio != xas_reload(xas))) { folio_put(folio); goto reset; } return folio; reset: xas_reset(xas); goto retry; } /** * find_get_entries - gang pagecache lookup * @mapping: The address_space to search * @start: The starting page cache index * @end: The final page index (inclusive). * @fbatch: Where the resulting entries are placed. * @indices: The cache indices corresponding to the entries in @entries * * find_get_entries() will search for and return a batch of entries in * the mapping. The entries are placed in @fbatch. find_get_entries() * takes a reference on any actual folios it returns. * * The entries have ascending indexes. The indices may not be consecutive * due to not-present entries or large folios. * * Any shadow entries of evicted folios, or swap entries from * shmem/tmpfs, are included in the returned array. * * Return: The number of entries which were found. */ unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; rcu_read_lock(); while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { indices[fbatch->nr] = xas.xa_index; if (!folio_batch_add(fbatch, folio)) break; } if (folio_batch_count(fbatch)) { unsigned long nr; int idx = folio_batch_count(fbatch) - 1; folio = fbatch->folios[idx]; if (!xa_is_value(folio)) nr = folio_nr_pages(folio); else nr = 1 << xa_get_order(&mapping->i_pages, indices[idx]); *start = round_down(indices[idx] + nr, nr); } rcu_read_unlock(); return folio_batch_count(fbatch); } /** * find_lock_entries - Find a batch of pagecache entries. * @mapping: The address_space to search. * @start: The starting page cache index. * @end: The final page index (inclusive). * @fbatch: Where the resulting entries are placed. * @indices: The cache indices of the entries in @fbatch. * * find_lock_entries() will return a batch of entries from @mapping. * Swap, shadow and DAX entries are included. Folios are returned * locked and with an incremented refcount. Folios which are locked * by somebody else or under writeback are skipped. Folios which are * partially outside the range are not returned. * * The entries have ascending indexes. The indices may not be consecutive * due to not-present entries, large folios, folios which could not be * locked or folios under writeback. * * Return: The number of entries which were found. 
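 *
 * A rough iteration sketch (illustrative only; example_handle_folio() is a
 * placeholder): returned folios are locked and referenced, while value
 * entries carry no reference, so this sketch drops references individually
 * rather than releasing the whole batch.
 *
 *	struct folio_batch fbatch;
 *	pgoff_t indices[PAGEVEC_SIZE];
 *	pgoff_t start = 0;
 *	unsigned i;
 *
 *	folio_batch_init(&fbatch);
 *	while (find_lock_entries(mapping, &start, end, &fbatch, indices)) {
 *		for (i = 0; i < folio_batch_count(&fbatch); i++) {
 *			struct folio *folio = fbatch.folios[i];
 *
 *			if (xa_is_value(folio))
 *				continue;
 *			example_handle_folio(folio);
 *			folio_unlock(folio);
 *			folio_put(folio);
 *		}
 *		folio_batch_reinit(&fbatch);
 *		cond_resched();
 *	}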
*/ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; rcu_read_lock(); while ((folio = find_get_entry(&xas, end, XA_PRESENT))) { unsigned long base; unsigned long nr; if (!xa_is_value(folio)) { nr = folio_nr_pages(folio); base = folio->index; /* Omit large folio which begins before the start */ if (base < *start) goto put; /* Omit large folio which extends beyond the end */ if (base + nr - 1 > end) goto put; if (!folio_trylock(folio)) goto put; if (folio->mapping != mapping || folio_test_writeback(folio)) goto unlock; VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index), folio); } else { nr = 1 << xas_get_order(&xas); base = xas.xa_index & ~(nr - 1); /* Omit order>0 value which begins before the start */ if (base < *start) continue; /* Omit order>0 value which extends beyond the end */ if (base + nr - 1 > end) break; } /* Update start now so that last update is correct on return */ *start = base + nr; indices[fbatch->nr] = xas.xa_index; if (!folio_batch_add(fbatch, folio)) break; continue; unlock: folio_unlock(folio); put: folio_put(folio); } rcu_read_unlock(); return folio_batch_count(fbatch); } /** * filemap_get_folios - Get a batch of folios * @mapping: The address_space to search * @start: The starting page index * @end: The final page index (inclusive) * @fbatch: The batch to fill. * * Search for and return a batch of folios in the mapping starting at * index @start and up to index @end (inclusive). The folios are returned * in @fbatch with an elevated reference count. * * Return: The number of folios which were found. * We also update @start to index the next folio for the traversal. */ unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch) { return filemap_get_folios_tag(mapping, start, end, XA_PRESENT, fbatch); } EXPORT_SYMBOL(filemap_get_folios); /** * filemap_get_folios_contig - Get a batch of contiguous folios * @mapping: The address_space to search * @start: The starting page index * @end: The final page index (inclusive) * @fbatch: The batch to fill * * filemap_get_folios_contig() works exactly like filemap_get_folios(), * except the returned folios are guaranteed to be contiguous. This may * not return all contiguous folios if the batch gets filled up. * * Return: The number of folios found. * Also update @start to be positioned for traversal of the next folio. */ unsigned filemap_get_folios_contig(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, *start); unsigned long nr; struct folio *folio; rcu_read_lock(); for (folio = xas_load(&xas); folio && xas.xa_index <= end; folio = xas_next(&xas)) { if (xas_retry(&xas, folio)) continue; /* * If the entry has been swapped out, we can stop looking. * No current caller is looking for DAX entries. */ if (xa_is_value(folio)) goto update_start; /* If we landed in the middle of a THP, continue at its end. 
*/ if (xa_is_sibling(folio)) goto update_start; if (!folio_try_get(folio)) goto retry; if (unlikely(folio != xas_reload(&xas))) goto put_folio; if (!folio_batch_add(fbatch, folio)) { nr = folio_nr_pages(folio); *start = folio->index + nr; goto out; } xas_advance(&xas, folio_next_index(folio) - 1); continue; put_folio: folio_put(folio); retry: xas_reset(&xas); } update_start: nr = folio_batch_count(fbatch); if (nr) { folio = fbatch->folios[nr - 1]; *start = folio_next_index(folio); } out: rcu_read_unlock(); return folio_batch_count(fbatch); } EXPORT_SYMBOL(filemap_get_folios_contig); /** * filemap_get_folios_tag - Get a batch of folios matching @tag * @mapping: The address_space to search * @start: The starting page index * @end: The final page index (inclusive) * @tag: The tag index * @fbatch: The batch to fill * * The first folio may start before @start; if it does, it will contain * @start. The final folio may extend beyond @end; if it does, it will * contain @end. The folios have ascending indices. There may be gaps * between the folios if there are indices which have no folio in the * page cache. If folios are added to or removed from the page cache * while this is running, they may or may not be found by this call. * Only returns folios that are tagged with @tag. * * Return: The number of folios found. * Also update @start to index the next folio for traversal. */ unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; rcu_read_lock(); while ((folio = find_get_entry(&xas, end, tag)) != NULL) { /* * Shadow entries should never be tagged, but this iteration * is lockless so there is a window for page reclaim to evict * a page we saw tagged. Skip over it. */ if (xa_is_value(folio)) continue; if (!folio_batch_add(fbatch, folio)) { unsigned long nr = folio_nr_pages(folio); *start = folio->index + nr; goto out; } } /* * We come here when there is no page beyond @end. We take care to not * overflow the index @start as it confuses some of the callers. This * breaks the iteration when there is a page at index -1 but that is * already broke anyway. */ if (end == (pgoff_t)-1) *start = (pgoff_t)-1; else *start = end + 1; out: rcu_read_unlock(); return folio_batch_count(fbatch); } EXPORT_SYMBOL(filemap_get_folios_tag); /** * filemap_get_folios_dirty - Get a batch of dirty folios * @mapping: The address_space to search * @start: The starting folio index * @end: The final folio index (inclusive) * @fbatch: The batch to fill * * filemap_get_folios_dirty() works exactly like filemap_get_folios(), except * the returned folios are presumed to be dirty or undergoing writeback. Dirty * state is presumed because we don't block on folio lock nor want to miss * folios. Callers that need to can recheck state upon locking the folio. * * This may not return all dirty folios if the batch gets filled up. * * Return: The number of folios found. * Also update @start to be positioned for traversal of the next folio. 
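 *
 * A rough usage sketch (illustrative only; example_start_writeback() is a
 * placeholder): walk the dirty folios in batches, mirroring the batched
 * waiting loop used earlier in this file.
 *
 *	struct folio_batch fbatch;
 *	pgoff_t start = 0;
 *	unsigned i, nr;
 *
 *	folio_batch_init(&fbatch);
 *	while ((nr = filemap_get_folios_dirty(mapping, &start, end, &fbatch))) {
 *		for (i = 0; i < nr; i++)
 *			example_start_writeback(fbatch.folios[i]);
 *		folio_batch_release(&fbatch);
 *		cond_resched();
 *	}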
*/ unsigned filemap_get_folios_dirty(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; rcu_read_lock(); while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { if (xa_is_value(folio)) continue; if (folio_trylock(folio)) { bool clean = !folio_test_dirty(folio) && !folio_test_writeback(folio); folio_unlock(folio); if (clean) { folio_put(folio); continue; } } if (!folio_batch_add(fbatch, folio)) { unsigned long nr = folio_nr_pages(folio); *start = folio->index + nr; goto out; } } /* * We come here when there is no folio beyond @end. We take care to not * overflow the index @start as it confuses some of the callers. This * breaks the iteration when there is a folio at index -1 but that is * already broke anyway. */ if (end == (pgoff_t)-1) *start = (pgoff_t)-1; else *start = end + 1; out: rcu_read_unlock(); return folio_batch_count(fbatch); } /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request. Imagine the worst scenario: * * ---R__________________________________________B__________ * ^ reading here ^ bad block(assume 4k) * * read(R) => miss => readahead(R...B) => media error => frustrating retries * => failing the whole request => read(R) => read(R+1) => * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) => * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) => * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ...... * * It is going insane. Fix it by quickly scaling down the readahead size. */ static void shrink_readahead_size_eio(struct file_ra_state *ra) { ra->ra_pages /= 4; } /* * filemap_get_read_batch - Get a batch of folios for read * * Get a batch of folios which represent a contiguous range of bytes in * the file. No exceptional entries will be returned. If @index is in * the middle of a folio, the entire folio will be returned. The last * folio in the batch may have the readahead flag set or the uptodate flag * clear so that the caller can take the appropriate action. */ static void filemap_get_read_batch(struct address_space *mapping, pgoff_t index, pgoff_t max, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, index); struct folio *folio; rcu_read_lock(); for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { if (xas_retry(&xas, folio)) continue; if (xas.xa_index > max || xa_is_value(folio)) break; if (xa_is_sibling(folio)) break; if (!folio_try_get(folio)) goto retry; if (unlikely(folio != xas_reload(&xas))) goto put_folio; if (!folio_batch_add(fbatch, folio)) break; if (!folio_test_uptodate(folio)) break; if (folio_test_readahead(folio)) break; xas_advance(&xas, folio_next_index(folio) - 1); continue; put_folio: folio_put(folio); retry: xas_reset(&xas); } rcu_read_unlock(); } static int filemap_read_folio(struct file *file, filler_t filler, struct folio *folio) { bool workingset = folio_test_workingset(folio); unsigned long pflags; int error; /* Start the actual read. The read will unlock the page. 
*/ if (unlikely(workingset)) psi_memstall_enter(&pflags); error = filler(file, folio); if (unlikely(workingset)) psi_memstall_leave(&pflags); if (error) return error; error = folio_wait_locked_killable(folio); if (error) return error; if (folio_test_uptodate(folio)) return 0; if (file) shrink_readahead_size_eio(&file->f_ra); return -EIO; } static bool filemap_range_uptodate(struct address_space *mapping, loff_t pos, size_t count, struct folio *folio, bool need_uptodate) { if (folio_test_uptodate(folio)) return true; /* pipes can't handle partially uptodate pages */ if (need_uptodate) return false; if (!mapping->a_ops->is_partially_uptodate) return false; if (mapping->host->i_blkbits >= folio_shift(folio)) return false; if (folio_pos(folio) > pos) { count -= folio_pos(folio) - pos; pos = 0; } else { pos -= folio_pos(folio); } if (pos == 0 && count >= folio_size(folio)) return false; return mapping->a_ops->is_partially_uptodate(folio, pos, count); } static int filemap_update_page(struct kiocb *iocb, struct address_space *mapping, size_t count, struct folio *folio, bool need_uptodate) { int error; if (iocb->ki_flags & IOCB_NOWAIT) { if (!filemap_invalidate_trylock_shared(mapping)) return -EAGAIN; } else { filemap_invalidate_lock_shared(mapping); } if (!folio_trylock(folio)) { error = -EAGAIN; if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) goto unlock_mapping; if (!(iocb->ki_flags & IOCB_WAITQ)) { filemap_invalidate_unlock_shared(mapping); /* * This is where we usually end up waiting for a * previously submitted readahead to finish. */ folio_put_wait_locked(folio, TASK_KILLABLE); return AOP_TRUNCATED_PAGE; } error = __folio_lock_async(folio, iocb->ki_waitq); if (error) goto unlock_mapping; } error = AOP_TRUNCATED_PAGE; if (!folio->mapping) goto unlock; error = 0; if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio, need_uptodate)) goto unlock; error = -EAGAIN; if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) goto unlock; error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio, folio); goto unlock_mapping; unlock: folio_unlock(folio); unlock_mapping: filemap_invalidate_unlock_shared(mapping); if (error == AOP_TRUNCATED_PAGE) folio_put(folio); return error; } static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct folio *folio; int error; unsigned int min_order = mapping_min_folio_order(mapping); pgoff_t index; if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) return -EAGAIN; folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order, NULL); if (!folio) return -ENOMEM; if (iocb->ki_flags & IOCB_DONTCACHE) __folio_set_dropbehind(folio); /* * Protect against truncate / hole punch. Grabbing invalidate_lock * here assures we cannot instantiate and bring uptodate new * pagecache folios after evicting page cache during truncate * and before actually freeing blocks. Note that we could * release invalidate_lock after inserting the folio into * the page cache as the locked folio would then be enough to * synchronize with hole punching. But there are code paths * such as filemap_update_page() filling in partially uptodate * pages or ->readahead() that need to hold invalidate_lock * while mapping blocks for IO so let's hold the lock here as * well to keep locking rules simple. 
*/ filemap_invalidate_lock_shared(mapping); index = (iocb->ki_pos >> (PAGE_SHIFT + min_order)) << min_order; error = filemap_add_folio(mapping, folio, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error == -EEXIST) error = AOP_TRUNCATED_PAGE; if (error) goto error; error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio, folio); if (error) goto error; filemap_invalidate_unlock_shared(mapping); folio_batch_add(fbatch, folio); return 0; error: filemap_invalidate_unlock_shared(mapping); folio_put(folio); return error; } static int filemap_readahead(struct kiocb *iocb, struct file *file, struct address_space *mapping, struct folio *folio, pgoff_t last_index) { DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index); if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; if (iocb->ki_flags & IOCB_DONTCACHE) ractl.dropbehind = 1; page_cache_async_ra(&ractl, folio, last_index - folio->index); return 0; } static int filemap_get_pages(struct kiocb *iocb, size_t count, struct folio_batch *fbatch, bool need_uptodate) { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; pgoff_t last_index; struct folio *folio; unsigned int flags; int err = 0; /* "last_index" is the index of the folio beyond the end of the read */ last_index = round_up(iocb->ki_pos + count, mapping_min_folio_nrbytes(mapping)) >> PAGE_SHIFT; retry: if (fatal_signal_pending(current)) return -EINTR; filemap_get_read_batch(mapping, index, last_index - 1, fbatch); if (!folio_batch_count(fbatch)) { DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index); if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; if (iocb->ki_flags & IOCB_NOWAIT) flags = memalloc_noio_save(); if (iocb->ki_flags & IOCB_DONTCACHE) ractl.dropbehind = 1; page_cache_sync_ra(&ractl, last_index - index); if (iocb->ki_flags & IOCB_NOWAIT) memalloc_noio_restore(flags); filemap_get_read_batch(mapping, index, last_index - 1, fbatch); } if (!folio_batch_count(fbatch)) { err = filemap_create_folio(iocb, fbatch); if (err == AOP_TRUNCATED_PAGE) goto retry; return err; } folio = fbatch->folios[folio_batch_count(fbatch) - 1]; if (folio_test_readahead(folio)) { err = filemap_readahead(iocb, filp, mapping, folio, last_index); if (err) goto err; } if (!folio_test_uptodate(folio)) { if (folio_batch_count(fbatch) > 1) { err = -EAGAIN; goto err; } err = filemap_update_page(iocb, mapping, count, folio, need_uptodate); if (err) goto err; } trace_mm_filemap_get_pages(mapping, index, last_index - 1); return 0; err: if (err < 0) folio_put(folio); if (likely(--fbatch->nr)) return 0; if (err == AOP_TRUNCATED_PAGE) goto retry; return err; } static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio) { unsigned int shift = folio_shift(folio); return (pos1 >> shift == pos2 >> shift); } static void filemap_end_dropbehind_read(struct folio *folio) { if (!folio_test_dropbehind(folio)) return; if (folio_test_writeback(folio) || folio_test_dirty(folio)) return; if (folio_trylock(folio)) { filemap_end_dropbehind(folio); folio_unlock(folio); } } /** * filemap_read - Read data from the page cache. * @iocb: The iocb to read. * @iter: Destination for the data. * @already_read: Number of bytes already read by the caller. * * Copies data from the page cache. If the data is not currently present, * uses the readahead and read_folio address_space operations to fetch it. * * Return: Total number of bytes copied, including those already read by * the caller. 
If an error happens before any bytes are copied, returns * a negative error number. */ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, ssize_t already_read) { struct file *filp = iocb->ki_filp; struct file_ra_state *ra = &filp->f_ra; struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; struct folio_batch fbatch; int i, error = 0; bool writably_mapped; loff_t isize, end_offset; loff_t last_pos = ra->prev_pos; if (unlikely(iocb->ki_pos < 0)) return -EINVAL; if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) return 0; if (unlikely(!iov_iter_count(iter))) return 0; iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos); folio_batch_init(&fbatch); do { cond_resched(); /* * If we've already successfully copied some data, then we * can no longer safely return -EIOCBQUEUED. Hence mark * an async read NOWAIT at that point. */ if ((iocb->ki_flags & IOCB_WAITQ) && already_read) iocb->ki_flags |= IOCB_NOWAIT; if (unlikely(iocb->ki_pos >= i_size_read(inode))) break; error = filemap_get_pages(iocb, iter->count, &fbatch, false); if (error < 0) break; /* * i_size must be checked after we know the pages are Uptodate. * * Checking i_size after the check allows us to calculate * the correct value for "nr", which means the zero-filled * part of the page is not copied back to userspace (unless * another truncate extends the file - this is desired though). */ isize = i_size_read(inode); if (unlikely(iocb->ki_pos >= isize)) goto put_folios; end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); /* * Once we start copying data, we don't want to be touching any * cachelines that might be contended: */ writably_mapped = mapping_writably_mapped(mapping); /* * When a read accesses the same folio several times, only * mark it as accessed the first time. */ if (!pos_same_folio(iocb->ki_pos, last_pos - 1, fbatch.folios[0])) folio_mark_accessed(fbatch.folios[0]); for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; size_t fsize = folio_size(folio); size_t offset = iocb->ki_pos & (fsize - 1); size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos, fsize - offset); size_t copied; if (end_offset < folio_pos(folio)) break; if (i > 0) folio_mark_accessed(folio); /* * If users can be writing to this folio using arbitrary * virtual addresses, take care of potential aliasing * before reading the folio on the kernel side. */ if (writably_mapped) flush_dcache_folio(folio); copied = copy_folio_to_iter(folio, offset, bytes, iter); already_read += copied; iocb->ki_pos += copied; last_pos = iocb->ki_pos; if (copied < bytes) { error = -EFAULT; break; } } put_folios: for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; filemap_end_dropbehind_read(folio); folio_put(folio); } folio_batch_init(&fbatch); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); file_accessed(filp); ra->prev_pos = last_pos; return already_read ? 
already_read : error; } EXPORT_SYMBOL_GPL(filemap_read); int kiocb_write_and_wait(struct kiocb *iocb, size_t count) { struct address_space *mapping = iocb->ki_filp->f_mapping; loff_t pos = iocb->ki_pos; loff_t end = pos + count - 1; if (iocb->ki_flags & IOCB_NOWAIT) { if (filemap_range_needs_writeback(mapping, pos, end)) return -EAGAIN; return 0; } return filemap_write_and_wait_range(mapping, pos, end); } EXPORT_SYMBOL_GPL(kiocb_write_and_wait); int filemap_invalidate_pages(struct address_space *mapping, loff_t pos, loff_t end, bool nowait) { int ret; if (nowait) { /* we could block if there are any pages in the range */ if (filemap_range_has_page(mapping, pos, end)) return -EAGAIN; } else { ret = filemap_write_and_wait_range(mapping, pos, end); if (ret) return ret; } /* * After a write we want buffered reads to be sure to go to disk to get * the new data. We invalidate clean cached page from the region we're * about to write. We do this *before* the write so that we can return * without clobbering -EIOCBQUEUED from ->direct_IO(). */ return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end >> PAGE_SHIFT); } int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) { struct address_space *mapping = iocb->ki_filp->f_mapping; return filemap_invalidate_pages(mapping, iocb->ki_pos, iocb->ki_pos + count - 1, iocb->ki_flags & IOCB_NOWAIT); } EXPORT_SYMBOL_GPL(kiocb_invalidate_pages); /** * generic_file_read_iter - generic filesystem read routine * @iocb: kernel I/O control block * @iter: destination for the data read * * This is the "read_iter()" routine for all filesystems * that can use the page cache directly. * * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall * be returned when no data can be read without waiting for I/O requests * to complete; it doesn't prevent readahead. * * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O * requests shall be made for the read or for readahead. When no data * can be read, -EAGAIN shall be returned. When readahead would be * triggered, a partial, possibly empty read shall be returned. * * Return: * * number of bytes copied, even for partial reads * * negative error code (or 0 if IOCB_NOIO) if nothing was read */ ssize_t generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { size_t count = iov_iter_count(iter); ssize_t retval = 0; if (!count) return 0; /* skip atime */ if (iocb->ki_flags & IOCB_DIRECT) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; retval = kiocb_write_and_wait(iocb, count); if (retval < 0) return retval; file_accessed(file); retval = mapping->a_ops->direct_IO(iocb, iter); if (retval >= 0) { iocb->ki_pos += retval; count -= retval; } if (retval != -EIOCBQUEUED) iov_iter_revert(iter, count - iov_iter_count(iter)); /* * Btrfs can have a short DIO read if we encounter * compressed extents, so if there was an error, or if * we've already read everything we wanted to, or if * there was a short read because we hit EOF, go ahead * and return. Otherwise fallthrough to buffered io for * the rest of the read. Buffered reads will not work for * DAX files, so don't bother trying. */ if (retval < 0 || !count || IS_DAX(inode)) return retval; if (iocb->ki_pos >= i_size_read(inode)) return retval; } return filemap_read(iocb, iter, retval); } EXPORT_SYMBOL(generic_file_read_iter); /* * Splice subpages from a folio into a pipe. 
*/ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, struct folio *folio, loff_t fpos, size_t size) { struct page *page; size_t spliced = 0, offset = offset_in_folio(folio, fpos); page = folio_page(folio, offset / PAGE_SIZE); size = min(size, folio_size(folio) - offset); offset %= PAGE_SIZE; while (spliced < size && !pipe_is_full(pipe)) { struct pipe_buffer *buf = pipe_head_buf(pipe); size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced); *buf = (struct pipe_buffer) { .ops = &page_cache_pipe_buf_ops, .page = page, .offset = offset, .len = part, }; folio_get(folio); pipe->head++; page++; spliced += part; offset = 0; } return spliced; } /** * filemap_splice_read - Splice data from a file's pagecache into a pipe * @in: The file to read from * @ppos: Pointer to the file position to read from * @pipe: The pipe to splice into * @len: The amount to splice * @flags: The SPLICE_F_* flags * * This function gets folios from a file's pagecache and splices them into the * pipe. Readahead will be called as necessary to fill more folios. This may * be used for blockdevs also. * * Return: On success, the number of bytes read will be returned and *@ppos * will be updated if appropriate; 0 will be returned if there is no more data * to be read; -EAGAIN will be returned if the pipe had no space, and some * other negative error code will be returned on error. A short read may occur * if the pipe has insufficient space, we reach the end of the data or we hit a * hole. */ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct folio_batch fbatch; struct kiocb iocb; size_t total_spliced = 0, used, npages; loff_t isize, end_offset; bool writably_mapped; int i, error = 0; if (unlikely(*ppos >= in->f_mapping->host->i_sb->s_maxbytes)) return 0; init_sync_kiocb(&iocb, in); iocb.ki_pos = *ppos; /* Work out how much data we can actually add into the pipe */ used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); folio_batch_init(&fbatch); do { cond_resched(); if (*ppos >= i_size_read(in->f_mapping->host)) break; iocb.ki_pos = *ppos; error = filemap_get_pages(&iocb, len, &fbatch, true); if (error < 0) break; /* * i_size must be checked after we know the pages are Uptodate. * * Checking i_size after the check allows us to calculate * the correct value for "nr", which means the zero-filled * part of the page is not copied back to userspace (unless * another truncate extends the file - this is desired though). */ isize = i_size_read(in->f_mapping->host); if (unlikely(*ppos >= isize)) break; end_offset = min_t(loff_t, isize, *ppos + len); /* * Once we start copying data, we don't want to be touching any * cachelines that might be contended: */ writably_mapped = mapping_writably_mapped(in->f_mapping); for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; size_t n; if (folio_pos(folio) >= end_offset) goto out; folio_mark_accessed(folio); /* * If users can be writing to this folio using arbitrary * virtual addresses, take care of potential aliasing * before reading the folio on the kernel side. 
*/ if (writably_mapped) flush_dcache_folio(folio); n = min_t(loff_t, len, isize - *ppos); n = splice_folio_into_pipe(pipe, folio, *ppos, n); if (!n) goto out; len -= n; total_spliced += n; *ppos += n; in->f_ra.prev_pos = *ppos; if (pipe_is_full(pipe)) goto out; } folio_batch_release(&fbatch); } while (len); out: folio_batch_release(&fbatch); file_accessed(in); return total_spliced ? total_spliced : error; } EXPORT_SYMBOL(filemap_splice_read); static inline loff_t folio_seek_hole_data(struct xa_state *xas, struct address_space *mapping, struct folio *folio, loff_t start, loff_t end, bool seek_data) { const struct address_space_operations *ops = mapping->a_ops; size_t offset, bsz = i_blocksize(mapping->host); if (xa_is_value(folio) || folio_test_uptodate(folio)) return seek_data ? start : end; if (!ops->is_partially_uptodate) return seek_data ? end : start; xas_pause(xas); rcu_read_unlock(); folio_lock(folio); if (unlikely(folio->mapping != mapping)) goto unlock; offset = offset_in_folio(folio, start) & ~(bsz - 1); do { if (ops->is_partially_uptodate(folio, offset, bsz) == seek_data) break; start = (start + bsz) & ~((u64)bsz - 1); offset += bsz; } while (offset < folio_size(folio)); unlock: folio_unlock(folio); rcu_read_lock(); return start; } static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio) { if (xa_is_value(folio)) return PAGE_SIZE << xas_get_order(xas); return folio_size(folio); } /** * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache. * @mapping: Address space to search. * @start: First byte to consider. * @end: Limit of search (exclusive). * @whence: Either SEEK_HOLE or SEEK_DATA. * * If the page cache knows which blocks contain holes and which blocks * contain data, your filesystem can use this function to implement * SEEK_HOLE and SEEK_DATA. This is useful for filesystems which are * entirely memory-based such as tmpfs, and filesystems which support * unwritten extents. * * Return: The requested offset on success, or -ENXIO if @whence specifies * SEEK_DATA and there is no data after @start. There is an implicit hole * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start * and @end contain data. */ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, loff_t end, int whence) { XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT); pgoff_t max = (end - 1) >> PAGE_SHIFT; bool seek_data = (whence == SEEK_DATA); struct folio *folio; if (end <= start) return -ENXIO; rcu_read_lock(); while ((folio = find_get_entry(&xas, max, XA_PRESENT))) { loff_t pos = (u64)xas.xa_index << PAGE_SHIFT; size_t seek_size; if (start < pos) { if (!seek_data) goto unlock; start = pos; } seek_size = seek_folio_size(&xas, folio); pos = round_up((u64)pos + 1, seek_size); start = folio_seek_hole_data(&xas, mapping, folio, start, pos, seek_data); if (start < pos) goto unlock; if (start >= end) break; if (seek_size > PAGE_SIZE) xas_set(&xas, pos >> PAGE_SHIFT); if (!xa_is_value(folio)) folio_put(folio); } if (seek_data) start = -ENXIO; unlock: rcu_read_unlock(); if (folio && !xa_is_value(folio)) folio_put(folio); if (start > end) return end; return start; } #ifdef CONFIG_MMU #define MMAP_LOTSAMISS (100) /* * lock_folio_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock * @vmf - the vm_fault for this fault. * @folio - the folio to lock. * @fpin - the pointer to the file we may pin (or is already pinned). * * This works similar to lock_folio_or_retry in that it can drop the * mmap_lock. 
It differs in that it actually returns the folio locked * if it returns 1 and 0 if it couldn't lock the folio. If we did have * to drop the mmap_lock then fpin will point to the pinned file and * needs to be fput()'ed at a later point. */ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio, struct file **fpin) { if (folio_trylock(folio)) return 1; /* * NOTE! This will make us return with VM_FAULT_RETRY, but with * the fault lock still held. That's how FAULT_FLAG_RETRY_NOWAIT * is supposed to work. We have way too many special cases.. */ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) return 0; *fpin = maybe_unlock_mmap_for_io(vmf, *fpin); if (vmf->flags & FAULT_FLAG_KILLABLE) { if (__folio_lock_killable(folio)) { /* * We didn't have the right flags to drop the * fault lock, but all fault_handlers only check * for fatal signals if we return VM_FAULT_RETRY, * so we need to drop the fault lock here and * return 0 if we don't have a fpin. */ if (*fpin == NULL) release_fault_lock(vmf); return 0; } } else __folio_lock(folio); return 1; } /* * Synchronous readahead happens when we don't even find a page in the page * cache at all. We don't want to perform IO under the mmap sem, so if we have * to drop the mmap sem we return the file that was pinned in order for us to do * that. If we didn't pin a file then we return NULL. The file that is * returned needs to be fput()'ed when we're done with it. */ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct file_ra_state *ra = &file->f_ra; struct address_space *mapping = file->f_mapping; DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff); struct file *fpin = NULL; vm_flags_t vm_flags = vmf->vma->vm_flags; bool force_thp_readahead = false; unsigned short mmap_miss; /* Use the readahead code, even if readahead is disabled */ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && (vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) force_thp_readahead = true; if (!force_thp_readahead) { /* * If we don't want any read-ahead, don't bother. * VM_EXEC case below is already intended for random access. */ if ((vm_flags & (VM_RAND_READ | VM_EXEC)) == VM_RAND_READ) return fpin; if (!ra->ra_pages) return fpin; if (vm_flags & VM_SEQ_READ) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); page_cache_sync_ra(&ractl, ra->ra_pages); return fpin; } } if (!(vm_flags & VM_SEQ_READ)) { /* Avoid banging the cache line if not needed */ mmap_miss = READ_ONCE(ra->mmap_miss); if (mmap_miss < MMAP_LOTSAMISS * 10) WRITE_ONCE(ra->mmap_miss, ++mmap_miss); /* * Do we miss much more than hit in this file? If so, * stop bothering with read-ahead. It will only hurt. */ if (mmap_miss > MMAP_LOTSAMISS) return fpin; } if (force_thp_readahead) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1); ra->size = HPAGE_PMD_NR; /* * Fetch two PMD folios, so we get the chance to actually * readahead, unless we've been told not to. */ if (!(vm_flags & VM_RAND_READ)) ra->size *= 2; ra->async_size = HPAGE_PMD_NR; ra->order = HPAGE_PMD_ORDER; page_cache_ra_order(&ractl, ra); return fpin; } if (vm_flags & VM_EXEC) { /* * Allow arch to request a preferred minimum folio order for * executable memory. This can often be beneficial to * performance if (e.g.) arm64 can contpte-map the folio. * Executable memory rarely benefits from readahead, due to its * random access nature, so set async_size to 0. 
* * Limit to the boundaries of the VMA to avoid reading in any * pad that might exist between sections, which would be a waste * of memory. */ struct vm_area_struct *vma = vmf->vma; unsigned long start = vma->vm_pgoff; unsigned long end = start + vma_pages(vma); unsigned long ra_end; ra->order = exec_folio_order(); ra->start = round_down(vmf->pgoff, 1UL << ra->order); ra->start = max(ra->start, start); ra_end = round_up(ra->start + ra->ra_pages, 1UL << ra->order); ra_end = min(ra_end, end); ra->size = ra_end - ra->start; ra->async_size = 0; } else { /* * mmap read-around */ ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); ra->size = ra->ra_pages; ra->async_size = ra->ra_pages / 4; ra->order = 0; } fpin = maybe_unlock_mmap_for_io(vmf, fpin); ractl._index = ra->start; page_cache_ra_order(&ractl, ra); return fpin; } /* * Asynchronous readahead happens when we find the page and PG_readahead, * so we want to possibly extend the readahead further. We return the file that * was pinned if we have to drop the mmap_lock in order to do IO. */ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, struct folio *folio) { struct file *file = vmf->vma->vm_file; struct file_ra_state *ra = &file->f_ra; DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff); struct file *fpin = NULL; unsigned short mmap_miss; /* If we don't want any read-ahead, don't bother */ if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages) return fpin; /* * If the folio is locked, we're likely racing against another fault. * Don't touch the mmap_miss counter to avoid decreasing it multiple * times for a single folio and break the balance with mmap_miss * increase in do_sync_mmap_readahead(). */ if (likely(!folio_test_locked(folio))) { mmap_miss = READ_ONCE(ra->mmap_miss); if (mmap_miss) WRITE_ONCE(ra->mmap_miss, --mmap_miss); } if (folio_test_readahead(folio)) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); page_cache_async_ra(&ractl, folio, ra->ra_pages); } return fpin; } static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = 0; pte_t *ptep; /* * We might have COW'ed a pagecache folio and might now have an mlocked * anon folio mapped. The original pagecache folio is not mlocked and * might have been evicted. During a read+clear/modify/write update of * the PTE, such as done in do_numa_page()/change_pte_range(), we * temporarily clear the PTE under PT lock and might detect it here as * "none" when not holding the PT lock. * * Not rechecking the PTE under PT lock could result in an unexpected * major fault in an mlock'ed region. Recheck only for this special * scenario while holding the PT lock, to not degrade non-mlocked * scenarios. Recheck the PTE without PT lock firstly, thereby reducing * the number of times we hold PT lock. */ if (!(vma->vm_flags & VM_LOCKED)) return 0; if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) return 0; ptep = pte_offset_map_ro_nolock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!ptep)) return VM_FAULT_NOPAGE; if (unlikely(!pte_none(ptep_get_lockless(ptep)))) { ret = VM_FAULT_NOPAGE; } else { spin_lock(vmf->ptl); if (unlikely(!pte_none(ptep_get(ptep)))) ret = VM_FAULT_NOPAGE; spin_unlock(vmf->ptl); } pte_unmap(ptep); return ret; } /** * filemap_fault - read in file data for page fault handling * @vmf: struct vm_fault containing details of the fault * * filemap_fault() is invoked via the vma operations vector for a * mapped memory region to read in file data during a page fault. 
* * The goto's are kind of ugly, but this streamlines the normal case of having * it in the page cache, and handles the special cases reasonably without * having a lot of duplicated code. * * vma->vm_mm->mmap_lock must be held on entry. * * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock * may be dropped before doing I/O or by lock_folio_maybe_drop_mmap(). * * If our return value does not have VM_FAULT_RETRY set, the mmap_lock * has not been released. * * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. * * Return: bitwise-OR of %VM_FAULT_ codes. */ vm_fault_t filemap_fault(struct vm_fault *vmf) { int error; struct file *file = vmf->vma->vm_file; struct file *fpin = NULL; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; pgoff_t max_idx, index = vmf->pgoff; struct folio *folio; vm_fault_t ret = 0; bool mapping_locked = false; max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(index >= max_idx)) return VM_FAULT_SIGBUS; trace_mm_filemap_fault(mapping, index); /* * Do we have something in the page cache already? */ folio = filemap_get_folio(mapping, index); if (likely(!IS_ERR(folio))) { /* * We found the page, so try async readahead before waiting for * the lock. */ if (!(vmf->flags & FAULT_FLAG_TRIED)) fpin = do_async_mmap_readahead(vmf, folio); if (unlikely(!folio_test_uptodate(folio))) { filemap_invalidate_lock_shared(mapping); mapping_locked = true; } } else { ret = filemap_fault_recheck_pte_none(vmf); if (unlikely(ret)) return ret; /* No page in the page cache at all */ count_vm_event(PGMAJFAULT); count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; fpin = do_sync_mmap_readahead(vmf); retry_find: /* * See comment in filemap_create_folio() why we need * invalidate_lock */ if (!mapping_locked) { filemap_invalidate_lock_shared(mapping); mapping_locked = true; } folio = __filemap_get_folio(mapping, index, FGP_CREAT|FGP_FOR_MMAP, vmf->gfp_mask); if (IS_ERR(folio)) { if (fpin) goto out_retry; filemap_invalidate_unlock_shared(mapping); return VM_FAULT_OOM; } } if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin)) goto out_retry; /* Did it get truncated? */ if (unlikely(folio->mapping != mapping)) { folio_unlock(folio); folio_put(folio); goto retry_find; } VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); /* * We have a locked folio in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error, * or because readahead was otherwise unable to retrieve it. */ if (unlikely(!folio_test_uptodate(folio))) { /* * If the invalidate lock is not held, the folio was in cache * and uptodate and now it is not. Strange but possible since we * didn't hold the page lock all the time. Let's drop * everything, get the invalidate lock and try again. */ if (!mapping_locked) { folio_unlock(folio); folio_put(folio); goto retry_find; } /* * OK, the folio is really not uptodate. This can be because the * VMA has the VM_RAND_READ flag set, or because an error * arose. Let's read it in directly. */ goto page_not_uptodate; } /* * We've made it this far and we had to drop our mmap_lock, now is the * time to return to the upper layer and have it re-find the vma and * redo the fault. */ if (fpin) { folio_unlock(folio); goto out_retry; } if (mapping_locked) filemap_invalidate_unlock_shared(mapping); /* * Found the page and have a reference on it. * We must recheck i_size under page lock. 
*/ max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(index >= max_idx)) { folio_unlock(folio); folio_put(folio); return VM_FAULT_SIGBUS; } vmf->page = folio_file_page(folio, index); return ret | VM_FAULT_LOCKED; page_not_uptodate: /* * Umm, take care of errors if the page isn't up-to-date. * Try to re-read it _once_. We do this synchronously, * because there really aren't any performance issues here * and we need to check for errors. */ fpin = maybe_unlock_mmap_for_io(vmf, fpin); error = filemap_read_folio(file, mapping->a_ops->read_folio, folio); if (fpin) goto out_retry; folio_put(folio); if (!error || error == AOP_TRUNCATED_PAGE) goto retry_find; filemap_invalidate_unlock_shared(mapping); return VM_FAULT_SIGBUS; out_retry: /* * We dropped the mmap_lock, we need to return to the fault handler to * re-find the vma and come back and find our hopefully still populated * page. */ if (!IS_ERR(folio)) folio_put(folio); if (mapping_locked) filemap_invalidate_unlock_shared(mapping); if (fpin) fput(fpin); return ret | VM_FAULT_RETRY; } EXPORT_SYMBOL(filemap_fault); static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio, pgoff_t start) { struct mm_struct *mm = vmf->vma->vm_mm; /* Huge page is mapped? No need to proceed. */ if (pmd_trans_huge(*vmf->pmd)) { folio_unlock(folio); folio_put(folio); return true; } if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) { struct page *page = folio_file_page(folio, start); vm_fault_t ret = do_set_pmd(vmf, folio, page); if (!ret) { /* The page is mapped successfully, reference consumed. */ folio_unlock(folio); return true; } } if (pmd_none(*vmf->pmd) && vmf->prealloc_pte) pmd_install(mm, vmf->pmd, &vmf->prealloc_pte); return false; } static struct folio *next_uptodate_folio(struct xa_state *xas, struct address_space *mapping, pgoff_t end_pgoff) { struct folio *folio = xas_next_entry(xas, end_pgoff); unsigned long max_idx; do { if (!folio) return NULL; if (xas_retry(xas, folio)) continue; if (xa_is_value(folio)) continue; if (!folio_try_get(folio)) continue; if (folio_test_locked(folio)) goto skip; /* Has the page moved or been split? */ if (unlikely(folio != xas_reload(xas))) goto skip; if (!folio_test_uptodate(folio) || folio_test_readahead(folio)) goto skip; if (!folio_trylock(folio)) goto skip; if (folio->mapping != mapping) goto unlock; if (!folio_test_uptodate(folio)) goto unlock; max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); if (xas->xa_index >= max_idx) goto unlock; return folio; unlock: folio_unlock(folio); skip: folio_put(folio); } while ((folio = xas_next_entry(xas, end_pgoff)) != NULL); return NULL; } /* * Map page range [start_page, start_page + nr_pages) of folio. 
* start_page is gotten from start by folio_page(folio, start) */ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, struct folio *folio, unsigned long start, unsigned long addr, unsigned int nr_pages, unsigned long *rss, unsigned short *mmap_miss, pgoff_t file_end) { struct address_space *mapping = folio->mapping; unsigned int ref_from_caller = 1; vm_fault_t ret = 0; struct page *page = folio_page(folio, start); unsigned int count = 0; pte_t *old_ptep = vmf->pte; unsigned long addr0; /* * Map the large folio fully where possible: * * - The folio is fully within size of the file or belong * to shmem/tmpfs; * - The folio doesn't cross VMA boundary; * - The folio doesn't cross page table boundary; */ addr0 = addr - start * PAGE_SIZE; if ((file_end >= folio_next_index(folio) || shmem_mapping(mapping)) && folio_within_vma(folio, vmf->vma) && (addr0 & PMD_MASK) == ((addr0 + folio_size(folio) - 1) & PMD_MASK)) { vmf->pte -= start; page -= start; addr = addr0; nr_pages = folio_nr_pages(folio); } do { if (PageHWPoison(page + count)) goto skip; /* * If there are too many folios that are recently evicted * in a file, they will probably continue to be evicted. * In such situation, read-ahead is only a waste of IO. * Don't decrease mmap_miss in this scenario to make sure * we can stop read-ahead. */ if (!folio_test_workingset(folio)) (*mmap_miss)++; /* * NOTE: If there're PTE markers, we'll leave them to be * handled in the specific fault path, and it'll prohibit the * fault-around logic. */ if (!pte_none(ptep_get(&vmf->pte[count]))) goto skip; count++; continue; skip: if (count) { set_pte_range(vmf, folio, page, count, addr); *rss += count; folio_ref_add(folio, count - ref_from_caller); ref_from_caller = 0; if (in_range(vmf->address, addr, count * PAGE_SIZE)) ret = VM_FAULT_NOPAGE; } count++; page += count; vmf->pte += count; addr += count * PAGE_SIZE; count = 0; } while (--nr_pages > 0); if (count) { set_pte_range(vmf, folio, page, count, addr); *rss += count; folio_ref_add(folio, count - ref_from_caller); ref_from_caller = 0; if (in_range(vmf->address, addr, count * PAGE_SIZE)) ret = VM_FAULT_NOPAGE; } vmf->pte = old_ptep; if (ref_from_caller) /* Locked folios cannot get truncated. */ folio_ref_dec(folio); return ret; } static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, struct folio *folio, unsigned long addr, unsigned long *rss, unsigned short *mmap_miss) { vm_fault_t ret = 0; struct page *page = &folio->page; if (PageHWPoison(page)) goto out; /* See comment of filemap_map_folio_range() */ if (!folio_test_workingset(folio)) (*mmap_miss)++; /* * NOTE: If there're PTE markers, we'll leave them to be * handled in the specific fault path, and it'll prohibit * the fault-around logic. */ if (!pte_none(ptep_get(vmf->pte))) goto out; if (vmf->address == addr) ret = VM_FAULT_NOPAGE; set_pte_range(vmf, folio, page, 1, addr); (*rss)++; return ret; out: /* Locked folios cannot get truncated. 
*/ folio_ref_dec(folio); return ret; } vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { struct vm_area_struct *vma = vmf->vma; struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t file_end, last_pgoff = start_pgoff; unsigned long addr; XA_STATE(xas, &mapping->i_pages, start_pgoff); struct folio *folio; vm_fault_t ret = 0; unsigned long rss = 0; unsigned int nr_pages = 0, folio_type; unsigned short mmap_miss = 0, mmap_miss_saved; /* * Recalculate end_pgoff based on file_end before calling * next_uptodate_folio() to avoid races with concurrent * truncation. */ file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1; end_pgoff = min(end_pgoff, file_end); rcu_read_lock(); folio = next_uptodate_folio(&xas, mapping, end_pgoff); if (!folio) goto out; /* * Do not allow to map with PMD across i_size to preserve * SIGBUS semantics. * * Make an exception for shmem/tmpfs that for long time * intentionally mapped with PMDs across i_size. */ if ((file_end >= folio_next_index(folio) || shmem_mapping(mapping)) && filemap_map_pmd(vmf, folio, start_pgoff)) { ret = VM_FAULT_NOPAGE; goto out; } addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); if (!vmf->pte) { folio_unlock(folio); folio_put(folio); goto out; } folio_type = mm_counter_file(folio); do { unsigned long end; addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT; vmf->pte += xas.xa_index - last_pgoff; last_pgoff = xas.xa_index; end = folio_next_index(folio) - 1; nr_pages = min(end, end_pgoff) - xas.xa_index + 1; if (!folio_test_large(folio)) ret |= filemap_map_order0_folio(vmf, folio, addr, &rss, &mmap_miss); else ret |= filemap_map_folio_range(vmf, folio, xas.xa_index - folio->index, addr, nr_pages, &rss, &mmap_miss, file_end); folio_unlock(folio); } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL); add_mm_counter(vma->vm_mm, folio_type, rss); pte_unmap_unlock(vmf->pte, vmf->ptl); trace_mm_filemap_map_pages(mapping, start_pgoff, end_pgoff); out: rcu_read_unlock(); mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss); if (mmap_miss >= mmap_miss_saved) WRITE_ONCE(file->f_ra.mmap_miss, 0); else WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss); return ret; } EXPORT_SYMBOL(filemap_map_pages); vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct folio *folio = page_folio(vmf->page); vm_fault_t ret = VM_FAULT_LOCKED; sb_start_pagefault(mapping->host->i_sb); file_update_time(vmf->vma->vm_file); folio_lock(folio); if (folio->mapping != mapping) { folio_unlock(folio); ret = VM_FAULT_NOPAGE; goto out; } /* * We mark the folio dirty already here so that when freeze is in * progress, we are guaranteed that writeback during freezing will * see the dirty folio and writeprotect it again. 
*/ folio_mark_dirty(folio); folio_wait_stable(folio); out: sb_end_pagefault(mapping->host->i_sb); return ret; } const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = filemap_page_mkwrite, }; /* This is used for a general mmap of a disk file */ int generic_file_mmap(struct file *file, struct vm_area_struct *vma) { struct address_space *mapping = file->f_mapping; if (!mapping->a_ops->read_folio) return -ENOEXEC; file_accessed(file); vma->vm_ops = &generic_file_vm_ops; return 0; } int generic_file_mmap_prepare(struct vm_area_desc *desc) { struct file *file = desc->file; struct address_space *mapping = file->f_mapping; if (!mapping->a_ops->read_folio) return -ENOEXEC; file_accessed(file); desc->vm_ops = &generic_file_vm_ops; return 0; } /* * This is for filesystems which do not implement ->writepage. */ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) { if (vma_is_shared_maywrite(vma)) return -EINVAL; return generic_file_mmap(file, vma); } int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc) { if (is_shared_maywrite(&desc->vma_flags)) return -EINVAL; return generic_file_mmap_prepare(desc); } #else vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { return VM_FAULT_SIGBUS; } int generic_file_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } int generic_file_mmap_prepare(struct vm_area_desc *desc) { return -ENOSYS; } int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc) { return -ENOSYS; } #endif /* CONFIG_MMU */ EXPORT_SYMBOL(filemap_page_mkwrite); EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_file_mmap_prepare); EXPORT_SYMBOL(generic_file_readonly_mmap); EXPORT_SYMBOL(generic_file_readonly_mmap_prepare); static struct folio *do_read_cache_folio(struct address_space *mapping, pgoff_t index, filler_t filler, struct file *file, gfp_t gfp) { struct folio *folio; int err; if (!filler) filler = mapping->a_ops->read_folio; repeat: folio = filemap_get_folio(mapping, index); if (IS_ERR(folio)) { folio = filemap_alloc_folio(gfp, mapping_min_folio_order(mapping), NULL); if (!folio) return ERR_PTR(-ENOMEM); index = mapping_align_index(mapping, index); err = filemap_add_folio(mapping, folio, index, gfp); if (unlikely(err)) { folio_put(folio); if (err == -EEXIST) goto repeat; /* Presumably ENOMEM for xarray node */ return ERR_PTR(err); } goto filler; } if (folio_test_uptodate(folio)) goto out; if (!folio_trylock(folio)) { folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE); goto repeat; } /* Folio was truncated from mapping */ if (!folio->mapping) { folio_unlock(folio); folio_put(folio); goto repeat; } /* Someone else locked and filled the page in a very small window */ if (folio_test_uptodate(folio)) { folio_unlock(folio); goto out; } filler: err = filemap_read_folio(file, filler, folio); if (err) { folio_put(folio); if (err == AOP_TRUNCATED_PAGE) goto repeat; return ERR_PTR(err); } out: folio_mark_accessed(folio); return folio; } /** * read_cache_folio - Read into page cache, fill it if needed. * @mapping: The address_space to read from. * @index: The index to read. * @filler: Function to perform the read, or NULL to use aops->read_folio(). * @file: Passed to filler function, may be NULL if not required. * * Read one page into the page cache. If it succeeds, the folio returned * will contain @index, but it may not be the first page of the folio. 
* * If the filler function returns an error, it will be returned to the * caller. * * Context: May sleep. Expects mapping->invalidate_lock to be held. * Return: An uptodate folio on success, ERR_PTR() on failure. */ struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index, filler_t filler, struct file *file) { return do_read_cache_folio(mapping, index, filler, file, mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(read_cache_folio); /** * mapping_read_folio_gfp - Read into page cache, using specified allocation flags. * @mapping: The address_space for the folio. * @index: The index that the allocated folio will contain. * @gfp: The page allocator flags to use if allocating. * * This is the same as "read_cache_folio(mapping, index, NULL, NULL)", but with * any new memory allocations done using the specified allocation flags. * * The most likely error from this function is EIO, but ENOMEM is * possible and so is EINTR. If ->read_folio returns another error, * that will be returned to the caller. * * The function expects mapping->invalidate_lock to be already held. * * Return: Uptodate folio on success, ERR_PTR() on failure. */ struct folio *mapping_read_folio_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { return do_read_cache_folio(mapping, index, NULL, NULL, gfp); } EXPORT_SYMBOL(mapping_read_folio_gfp); static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp) { struct folio *folio; folio = do_read_cache_folio(mapping, index, filler, file, gfp); if (IS_ERR(folio)) return &folio->page; return folio_file_page(folio, index); } struct page *read_cache_page(struct address_space *mapping, pgoff_t index, filler_t *filler, struct file *file) { return do_read_cache_page(mapping, index, filler, file, mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(read_cache_page); /** * read_cache_page_gfp - read into page cache, using specified page allocation flags. * @mapping: the page's address_space * @index: the page index * @gfp: the page allocator flags to use if allocating * * This is the same as "read_mapping_page(mapping, index, NULL)", but with * any new page allocations done using the specified allocation flags. * * If the page does not get brought uptodate, return -EIO. * * The function expects mapping->invalidate_lock to be already held. * * Return: up to date page on success, ERR_PTR() on failure. */ struct page *read_cache_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { return do_read_cache_page(mapping, index, NULL, NULL, gfp); } EXPORT_SYMBOL(read_cache_page_gfp); /* * Warn about a page cache invalidation failure during a direct I/O write. */ static void dio_warn_stale_pagecache(struct file *filp) { static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); char pathname[128]; char *path; errseq_set(&filp->f_mapping->wb_err, -EIO); if (__ratelimit(&_rs)) { path = file_path(filp, pathname, sizeof(pathname)); if (IS_ERR(path)) path = "(unknown)"; pr_crit("Page cache invalidation failure on direct I/O. 
Possible data corruption due to collision with buffered I/O!\n"); pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid, current->comm); } } void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count) { struct address_space *mapping = iocb->ki_filp->f_mapping; if (mapping->nrpages && invalidate_inode_pages2_range(mapping, iocb->ki_pos >> PAGE_SHIFT, (iocb->ki_pos + count - 1) >> PAGE_SHIFT)) dio_warn_stale_pagecache(iocb->ki_filp); } ssize_t generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) { struct address_space *mapping = iocb->ki_filp->f_mapping; size_t write_len = iov_iter_count(from); ssize_t written; /* * If a page can not be invalidated, return 0 to fall back * to buffered write. */ written = kiocb_invalidate_pages(iocb, write_len); if (written) { if (written == -EBUSY) return 0; return written; } written = mapping->a_ops->direct_IO(iocb, from); /* * Finally, try again to invalidate clean pages which might have been * cached by non-direct readahead, or faulted in by get_user_pages() * if the source of the write was an mmap'ed region of the file * we're writing. Either one is a pretty crazy thing to do, * so we don't support it 100%. If this invalidation * fails, tough, the write still worked... * * Most of the time we do not need this since dio_complete() will do * the invalidation for us. However there are some file systems that * do not end up with dio_complete() being called, so let's not break * them by removing it completely. * * Noticeable example is a blkdev_direct_IO(). * * Skip invalidation for async writes or if mapping has no pages. */ if (written > 0) { struct inode *inode = mapping->host; loff_t pos = iocb->ki_pos; kiocb_invalidate_post_direct_write(iocb, written); pos += written; write_len -= written; if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { i_size_write(inode, pos); mark_inode_dirty(inode); } iocb->ki_pos = pos; } if (written != -EIOCBQUEUED) iov_iter_revert(from, write_len - iov_iter_count(from)); return written; } EXPORT_SYMBOL(generic_file_direct_write); ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) { struct file *file = iocb->ki_filp; loff_t pos = iocb->ki_pos; struct address_space *mapping = file->f_mapping; const struct address_space_operations *a_ops = mapping->a_ops; size_t chunk = mapping_max_folio_size(mapping); long status = 0; ssize_t written = 0; do { struct folio *folio; size_t offset; /* Offset into folio */ size_t bytes; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ void *fsdata = NULL; bytes = iov_iter_count(i); retry: offset = pos & (chunk - 1); bytes = min(chunk - offset, bytes); balance_dirty_pages_ratelimited(mapping); if (fatal_signal_pending(current)) { status = -EINTR; break; } status = a_ops->write_begin(iocb, mapping, pos, bytes, &folio, &fsdata); if (unlikely(status < 0)) break; offset = offset_in_folio(folio, pos); if (bytes > folio_size(folio) - offset) bytes = folio_size(folio) - offset; if (mapping_writably_mapped(mapping)) flush_dcache_folio(folio); /* * Faults here on mmap()s can recurse into arbitrary * filesystem code. Lots of locks are held that can * deadlock. Use an atomic copy to avoid deadlocking * in page fault handling. 
*/ copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); flush_dcache_folio(folio); status = a_ops->write_end(iocb, mapping, pos, bytes, copied, folio, fsdata); if (unlikely(status != copied)) { iov_iter_revert(i, copied - max(status, 0L)); if (unlikely(status < 0)) break; } cond_resched(); if (unlikely(status == 0)) { /* * A short copy made ->write_end() reject the * thing entirely. Might be memory poisoning * halfway through, might be a race with munmap, * might be severe memory pressure. */ if (chunk > PAGE_SIZE) chunk /= 2; if (copied) { bytes = copied; goto retry; } /* * 'folio' is now unlocked and faults on it can be * handled. Ensure forward progress by trying to * fault it in now. */ if (fault_in_iov_iter_readable(i, bytes) == bytes) { status = -EFAULT; break; } } else { pos += status; written += status; } } while (iov_iter_count(i)); if (!written) return status; iocb->ki_pos += written; return written; } EXPORT_SYMBOL(generic_perform_write); /** * __generic_file_write_iter - write data to a file * @iocb: IO state structure (file, offset, etc.) * @from: iov_iter with data to write * * This function does all the work needed for actually writing data to a * file. It does all basic checks, removes SUID from the file, updates * modification times and calls proper subroutines depending on whether we * do direct IO or a standard buffered write. * * It expects i_rwsem to be grabbed unless we work on a block device or similar * object which does not need locking at all. * * This function does *not* take care of syncing data in case of O_SYNC write. * A caller has to handle it. This is mainly due to the fact that we want to * avoid syncing under i_rwsem. * * Return: * * number of bytes written, even for truncated writes * * negative error code if no data has been written at all */ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t ret; ret = file_remove_privs(file); if (ret) return ret; ret = file_update_time(file); if (ret) return ret; if (iocb->ki_flags & IOCB_DIRECT) { ret = generic_file_direct_write(iocb, from); /* * If the write stopped short of completing, fall back to * buffered writes. Some filesystems do this for writes to * holes, for example. For DAX files, a buffered write will * not succeed (even if it did, DAX does not handle dirty * page-cache pages correctly). */ if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode)) return ret; return direct_write_fallback(iocb, from, ret, generic_perform_write(iocb, from)); } return generic_perform_write(iocb, from); } EXPORT_SYMBOL(__generic_file_write_iter); /** * generic_file_write_iter - write data to a file * @iocb: IO state structure * @from: iov_iter with data to write * * This is a wrapper around __generic_file_write_iter() to be used by most * filesystems. It takes care of syncing the file in case of O_SYNC file * and acquires i_rwsem as needed. 
* Return: * * negative error code if no data has been written at all of * vfs_fsync_range() failed for a synchronous write * * number of bytes written, even for truncated writes */ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; ssize_t ret; inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) ret = __generic_file_write_iter(iocb, from); inode_unlock(inode); if (ret > 0) ret = generic_write_sync(iocb, ret); return ret; } EXPORT_SYMBOL(generic_file_write_iter); /** * filemap_release_folio() - Release fs-specific metadata on a folio. * @folio: The folio which the kernel is trying to free. * @gfp: Memory allocation flags (and I/O mode). * * The address_space is trying to release any data attached to a folio * (presumably at folio->private). * * This will also be called if the private_2 flag is set on a page, * indicating that the folio has other metadata associated with it. * * The @gfp argument specifies whether I/O may be performed to release * this page (__GFP_IO), and whether the call may block * (__GFP_RECLAIM & __GFP_FS). * * Return: %true if the release was successful, otherwise %false. */ bool filemap_release_folio(struct folio *folio, gfp_t gfp) { struct address_space * const mapping = folio->mapping; BUG_ON(!folio_test_locked(folio)); if (!folio_needs_release(folio)) return true; if (folio_test_writeback(folio)) return false; if (mapping && mapping->a_ops->release_folio) return mapping->a_ops->release_folio(folio, gfp); return try_to_free_buffers(folio); } EXPORT_SYMBOL(filemap_release_folio); /** * filemap_invalidate_inode - Invalidate/forcibly write back a range of an inode's pagecache * @inode: The inode to flush * @flush: Set to write back rather than simply invalidate. * @start: First byte to in range. * @end: Last byte in range (inclusive), or LLONG_MAX for everything from start * onwards. * * Invalidate all the folios on an inode that contribute to the specified * range, possibly writing them back first. Whilst the operation is * undertaken, the invalidate lock is held to prevent new folios from being * installed. */ int filemap_invalidate_inode(struct inode *inode, bool flush, loff_t start, loff_t end) { struct address_space *mapping = inode->i_mapping; pgoff_t first = start >> PAGE_SHIFT; pgoff_t last = end >> PAGE_SHIFT; pgoff_t nr = end == LLONG_MAX ? ULONG_MAX : last - first + 1; if (!mapping || !mapping->nrpages || end < start) goto out; /* Prevent new folios from being added to the inode. */ filemap_invalidate_lock(mapping); if (!mapping->nrpages) goto unlock; unmap_mapping_pages(mapping, first, nr, false); /* Write back the data if we're asked to. */ if (flush) filemap_fdatawrite_range(mapping, start, end); /* Wait for writeback to complete on all folios and discard. */ invalidate_inode_pages2_range(mapping, start / PAGE_SIZE, end / PAGE_SIZE); unlock: filemap_invalidate_unlock(mapping); out: return filemap_check_errors(mapping); } EXPORT_SYMBOL_GPL(filemap_invalidate_inode); #ifdef CONFIG_CACHESTAT_SYSCALL /** * filemap_cachestat() - compute the page cache statistics of a mapping * @mapping: The mapping to compute the statistics for. * @first_index: The starting page cache index. * @last_index: The final page index (inclusive). * @cs: the cachestat struct to write the result to. * * This will query the page cache statistics of a mapping in the * page range of [first_index, last_index] (inclusive). 
The statistics * queried include: number of dirty pages, number of pages marked for * writeback, and the number of (recently) evicted pages. */ static void filemap_cachestat(struct address_space *mapping, pgoff_t first_index, pgoff_t last_index, struct cachestat *cs) { XA_STATE(xas, &mapping->i_pages, first_index); struct folio *folio; /* Flush stats (and potentially sleep) outside the RCU read section. */ mem_cgroup_flush_stats_ratelimited(NULL); rcu_read_lock(); xas_for_each(&xas, folio, last_index) { int order; unsigned long nr_pages; pgoff_t folio_first_index, folio_last_index; /* * Don't deref the folio. It is not pinned, and might * get freed (and reused) underneath us. * * We *could* pin it, but that would be expensive for * what should be a fast and lightweight syscall. * * Instead, derive all information of interest from * the rcu-protected xarray. */ if (xas_retry(&xas, folio)) continue; order = xas_get_order(&xas); nr_pages = 1 << order; folio_first_index = round_down(xas.xa_index, 1 << order); folio_last_index = folio_first_index + nr_pages - 1; /* Folios might straddle the range boundaries, only count covered pages */ if (folio_first_index < first_index) nr_pages -= first_index - folio_first_index; if (folio_last_index > last_index) nr_pages -= folio_last_index - last_index; if (xa_is_value(folio)) { /* page is evicted */ void *shadow = (void *)folio; bool workingset; /* not used */ cs->nr_evicted += nr_pages; #ifdef CONFIG_SWAP /* implies CONFIG_MMU */ if (shmem_mapping(mapping)) { /* shmem file - in swap cache */ swp_entry_t swp = radix_to_swp_entry(folio); /* swapin error results in poisoned entry */ if (!softleaf_is_swap(swp)) goto resched; /* * Getting a swap entry from the shmem * inode means we beat * shmem_unuse(). rcu_read_lock() * ensures swapoff waits for us before * freeing the swapper space. However, * we can race with swapping and * invalidation, so there might not be * a shadow in the swapcache (yet). */ shadow = swap_cache_get_shadow(swp); if (!shadow) goto resched; } #endif if (workingset_test_recent(shadow, true, &workingset, false)) cs->nr_recently_evicted += nr_pages; goto resched; } /* page is in cache */ cs->nr_cache += nr_pages; if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY)) cs->nr_dirty += nr_pages; if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK)) cs->nr_writeback += nr_pages; resched: if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); } } rcu_read_unlock(); } /* * See mincore: reveal pagecache information only for files * that the calling process has write access to, or could (if * tried) open for writing. */ static inline bool can_do_cachestat(struct file *f) { if (f->f_mode & FMODE_WRITE) return true; if (inode_owner_or_capable(file_mnt_idmap(f), file_inode(f))) return true; return file_permission(f, MAY_WRITE) == 0; } /* * The cachestat(2) system call. * * cachestat() returns the page cache statistics of a file in the * bytes range specified by `off` and `len`: number of cached pages, * number of dirty pages, number of pages marked for writeback, * number of evicted pages, and number of recently evicted pages. * * An evicted page is a page that is previously in the page cache * but has been evicted since. A page is recently evicted if its last * eviction was recent enough that its reentry to the cache would * indicate that it is actively being used by the system, and that * there is memory pressure on the system. * * `off` and `len` must be non-negative integers. If `len` > 0, * the queried range is [`off`, `off` + `len`]. 
If `len` == 0, * we will query in the range from `off` to the end of the file. * * The `flags` argument is unused for now, but is included for future * extensibility. User should pass 0 (i.e no flag specified). * * Currently, hugetlbfs is not supported. * * Because the status of a page can change after cachestat() checks it * but before it returns to the application, the returned values may * contain stale information. * * return values: * zero - success * -EFAULT - cstat or cstat_range points to an illegal address * -EINVAL - invalid flags * -EBADF - invalid file descriptor * -EOPNOTSUPP - file descriptor is of a hugetlbfs file */ SYSCALL_DEFINE4(cachestat, unsigned int, fd, struct cachestat_range __user *, cstat_range, struct cachestat __user *, cstat, unsigned int, flags) { CLASS(fd, f)(fd); struct address_space *mapping; struct cachestat_range csr; struct cachestat cs; pgoff_t first_index, last_index; if (fd_empty(f)) return -EBADF; if (copy_from_user(&csr, cstat_range, sizeof(struct cachestat_range))) return -EFAULT; /* hugetlbfs is not supported */ if (is_file_hugepages(fd_file(f))) return -EOPNOTSUPP; if (!can_do_cachestat(fd_file(f))) return -EPERM; if (flags != 0) return -EINVAL; first_index = csr.off >> PAGE_SHIFT; last_index = csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT; memset(&cs, 0, sizeof(struct cachestat)); mapping = fd_file(f)->f_mapping; filemap_cachestat(mapping, first_index, last_index, &cs); if (copy_to_user(cstat, &cs, sizeof(struct cachestat))) return -EFAULT; return 0; } #endif /* CONFIG_CACHESTAT_SYSCALL */ |
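/*
 * Editor's illustration (not part of the kernel sources): a minimal userspace
 * sketch of the cachestat(2) interface documented above. It assumes the
 * installed UAPI headers are recent enough to provide struct cachestat_range
 * and struct cachestat in <linux/mman.h> and __NR_cachestat in the unistd
 * headers; the libc may not have a dedicated wrapper yet, so syscall(2) is
 * used directly.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/mman.h>		/* struct cachestat_range, struct cachestat */

int main(int argc, char **argv)
{
	struct cachestat_range range = { .off = 0, .len = 0 };	/* len == 0: query to EOF */
	struct cachestat cs;
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}

	/* can_do_cachestat() requires write access to, or ownership of, the file. */
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* flags must currently be 0, as the syscall comment above states. */
	if (syscall(__NR_cachestat, fd, &range, &cs, 0)) {
		perror("cachestat");
		close(fd);
		return 1;
	}

	printf("cache %llu dirty %llu writeback %llu evicted %llu recently_evicted %llu\n",
	       (unsigned long long)cs.nr_cache, (unsigned long long)cs.nr_dirty,
	       (unsigned long long)cs.nr_writeback, (unsigned long long)cs.nr_evicted,
	       (unsigned long long)cs.nr_recently_evicted);
	close(fd);
	return 0;
}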
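/*
 * Editor's sketch (not part of this file): how a simple filesystem might wire
 * the generic page-cache helpers above into its operations tables. "myfs" and
 * myfs_read_folio() are hypothetical names. A real filesystem also needs
 * write_begin/write_end (used by generic_perform_write()) and a writeback
 * path; this only shows where filemap_read(), filemap_fault() and friends
 * plug in via the generic_* wrappers.
 */
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>

/* ->read_folio fills the folio, marks it uptodate and unlocks it. */
static int myfs_read_folio(struct file *file, struct folio *folio)
{
	folio_zero_range(folio, 0, folio_size(folio));	/* placeholder for real I/O */
	folio_mark_uptodate(folio);
	folio_unlock(folio);
	return 0;
}

static const struct address_space_operations myfs_aops = {
	.read_folio	= myfs_read_folio,
	.dirty_folio	= filemap_dirty_folio,
};

static const struct file_operations myfs_file_operations = {
	.llseek		= generic_file_llseek,
	.read_iter	= generic_file_read_iter,	/* buffered path is filemap_read() */
	.write_iter	= generic_file_write_iter,	/* takes i_rwsem, handles O_SYNC */
	.mmap		= generic_file_mmap,		/* faults served by filemap_fault() */
	.splice_read	= filemap_splice_read,
	.fsync		= noop_fsync,
};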
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_TASK_H #define _LINUX_SCHED_TASK_H /* * Interface between the scheduler and various task lifetime (fork()/exit()) * functionality: */ #include <linux/rcupdate.h> #include <linux/refcount.h> #include <linux/sched.h> #include <linux/uaccess.h> struct task_struct; struct rusage; union thread_union; struct css_set; /* All the bits taken by the old clone syscall. */ #define CLONE_LEGACY_FLAGS 0xffffffffULL struct kernel_clone_args { u64 flags; int __user *pidfd; int __user *child_tid; int __user *parent_tid; const char *name; int exit_signal; u32 kthread:1; u32 io_thread:1; u32 user_worker:1; u32 no_files:1; unsigned long stack; unsigned long stack_size; unsigned long tls; pid_t *set_tid; /* Number of elements in *set_tid */ size_t set_tid_size; int cgroup; int idle; int (*fn)(void *); void *fn_arg; struct cgroup *cgrp; struct css_set *cset; unsigned int kill_seq; }; /* * This serializes "schedule()" and also protects * the run-queue from deletions/modifications (but * _adding_ to the beginning of the run-queue has * a separate lock).
*/ extern rwlock_t tasklist_lock; extern spinlock_t mmlist_lock; extern union thread_union init_thread_union; extern struct task_struct init_task; extern int lockdep_tasklist_lock_is_held(void); extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(u64 clone_flags, struct task_struct *p); extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void sched_cancel_fork(struct task_struct *p); extern void sched_post_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); void __noreturn do_task_dead(void); void __noreturn make_task_dead(int signr); extern void mm_cache_init(void); extern void proc_caches_init(void); extern void fork_init(void); extern void release_task(struct task_struct * p); extern int copy_thread(struct task_struct *, const struct kernel_clone_args *); extern void flush_thread(void); #ifdef CONFIG_HAVE_EXIT_THREAD extern void exit_thread(struct task_struct *tsk); #else static inline void exit_thread(struct task_struct *tsk) { } #endif extern __noreturn void do_group_exit(int); extern void exit_files(struct task_struct *); extern void exit_itimers(struct task_struct *); extern pid_t kernel_clone(struct kernel_clone_args *kargs); struct task_struct *copy_process(struct pid *pid, int trace, int node, struct kernel_clone_args *args); struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node); struct task_struct *fork_idle(int); extern pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, unsigned long flags); extern pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags); extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); int kernel_wait(pid_t pid, int *stat); extern void free_task(struct task_struct *tsk); /* sched_exec is called by processes performing an exec */ extern void sched_exec(void); static inline struct task_struct *get_task_struct(struct task_struct *t) { refcount_inc(&t->usage); return t; } static inline struct task_struct *tryget_task_struct(struct task_struct *t) { return refcount_inc_not_zero(&t->usage) ? t : NULL; } extern void __put_task_struct(struct task_struct *t); extern void __put_task_struct_rcu_cb(struct rcu_head *rhp); static inline void put_task_struct(struct task_struct *t) { if (!refcount_dec_and_test(&t->usage)) return; /* * Under PREEMPT_RT, we can't call __put_task_struct * in atomic context because it will indirectly * acquire sleeping locks. The same is true if the * current process has a mutex enqueued (blocked on * a PI chain). * * In !RT, it is always safe to call __put_task_struct(). * Though, in order to simplify the code, resort to the * deferred call too. * * call_rcu() will schedule __put_task_struct_rcu_cb() * to be called in process context. * * __put_task_struct() is called when * refcount_dec_and_test(&t->usage) succeeds. * * This means that it can't "conflict" with * put_task_struct_rcu_user() which abuses ->rcu the same * way; rcu_users has a reference so task->usage can't be * zero after rcu_users 1 -> 0 transition. * * delayed_free_task() also uses ->rcu, but it is only called * when it fails to fork a process. Therefore, there is no * way it can conflict with __put_task_struct(). 
*/ call_rcu(&t->rcu, __put_task_struct_rcu_cb); } DEFINE_FREE(put_task, struct task_struct *, if (_T) put_task_struct(_T)) static inline void put_task_struct_many(struct task_struct *t, int nr) { if (refcount_sub_and_test(nr, &t->usage)) __put_task_struct(t); } void put_task_struct_rcu_user(struct task_struct *task); /* Free all architecture-specific resources held by a thread. */ void release_thread(struct task_struct *dead_task); #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT extern int arch_task_struct_size __read_mostly; #else # define arch_task_struct_size (sizeof(struct task_struct)) #endif #ifndef CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST /* * If an architecture has not declared a thread_struct whitelist we * must assume something there may need to be copied to userspace. */ static inline void arch_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { *offset = 0; /* Handle dynamically sized thread_struct. */ *size = arch_task_struct_size - offsetof(struct task_struct, thread); } #endif #ifdef CONFIG_VMAP_STACK static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) { return t->stack_vm_area; } #else static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) { return NULL; } #endif /* * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also * pins the final release of task.io_context. Also protects ->cpuset and * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist. * * Nests inside of read_lock(&tasklist_lock). It must not be nested with * write_lock_irq(&tasklist_lock), neither inside nor outside. */ static inline void task_lock(struct task_struct *p) __acquires(&p->alloc_lock) { spin_lock(&p->alloc_lock); } static inline void task_unlock(struct task_struct *p) __releases(&p->alloc_lock) { spin_unlock(&p->alloc_lock); } DEFINE_LOCK_GUARD_1(task_lock, struct task_struct, task_lock(_T->lock), task_unlock(_T->lock)) DECLARE_LOCK_GUARD_1_ATTRS(task_lock, __acquires(&_T->alloc_lock), __releases(&(*(struct task_struct **)_T)->alloc_lock)) #define class_task_lock_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(task_lock, _T) #endif /* _LINUX_SCHED_TASK_H */ |
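/*
 * Editor's sketch (not part of this header): taking and dropping task_struct
 * references with the helpers above. The function names are hypothetical;
 * the scoped variant relies on the DEFINE_FREE(put_task, ...) declaration
 * above together with __free() from <linux/cleanup.h>.
 */
#include <linux/cleanup.h>
#include <linux/printk.h>
#include <linux/sched/task.h>

static void example_pin_current(void)
{
	/* Pair get_task_struct() with put_task_struct() once done. */
	struct task_struct *tsk = get_task_struct(current);

	pr_info("pinned %s (pid %d)\n", tsk->comm, task_pid_nr(tsk));
	put_task_struct(tsk);
}

static void example_pin_scoped(struct task_struct *candidate)
{
	/*
	 * tryget_task_struct() only succeeds while t->usage is non-zero;
	 * __free(put_task) drops the reference automatically when tsk goes
	 * out of scope, including on early return.
	 */
	struct task_struct *tsk __free(put_task) = tryget_task_struct(candidate);

	if (!tsk)
		return;
	pr_info("%s is still around\n", tsk->comm);
}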
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_INETDEVICE_H #define _LINUX_INETDEVICE_H #ifdef __KERNEL__ #include <linux/bitmap.h> #include <linux/if.h> #include <linux/ip.h> #include <linux/netdevice.h> #include <linux/rcupdate.h> #include <linux/timer.h> #include <linux/sysctl.h> #include <linux/rtnetlink.h> #include <linux/refcount.h> struct ipv4_devconf { void *sysctl; int data[IPV4_DEVCONF_MAX]; DECLARE_BITMAP(state, IPV4_DEVCONF_MAX); }; #define MC_HASH_SZ_LOG 9 struct in_device { struct net_device *dev; netdevice_tracker dev_tracker; refcount_t refcnt; int dead; struct in_ifaddr __rcu *ifa_list;/* IP ifaddr chain */ struct ip_mc_list __rcu *mc_list; /* IP multicast filter chain */ struct ip_mc_list __rcu * __rcu *mc_hash; int mc_count; /* Number of installed mcasts */ spinlock_t mc_tomb_lock; struct ip_mc_list *mc_tomb; unsigned long mr_v1_seen; unsigned long mr_v2_seen; unsigned long mr_qi; /* Query Interval */ unsigned long mr_qri; /* Query Response Interval */ unsigned char mr_qrv; /* Query Robustness Variable */ unsigned char mr_gq_running; u32 mr_maxdelay; u32 mr_ifc_count; struct timer_list mr_gq_timer; /* general query timer */ struct timer_list mr_ifc_timer; /* interface change timer */ struct neigh_parms *arp_parms; struct ipv4_devconf cnf; struct rcu_head rcu_head; }; #define IPV4_DEVCONF(cnf, attr) ((cnf).data[IPV4_DEVCONF_ ## attr - 1]) #define IPV4_DEVCONF_RO(cnf, attr) READ_ONCE(IPV4_DEVCONF(cnf, attr)) #define IPV4_DEVCONF_ALL(net, attr) \ IPV4_DEVCONF((*(net)->ipv4.devconf_all), attr) #define IPV4_DEVCONF_ALL_RO(net, attr) READ_ONCE(IPV4_DEVCONF_ALL(net, attr)) static inline int ipv4_devconf_get(const struct in_device *in_dev, int index) { index--; return READ_ONCE(in_dev->cnf.data[index]); } static inline void ipv4_devconf_set(struct in_device *in_dev, int index, int val) { index--; set_bit(index, in_dev->cnf.state); WRITE_ONCE(in_dev->cnf.data[index], val); } static inline void ipv4_devconf_setall(struct in_device *in_dev) { bitmap_fill(in_dev->cnf.state, IPV4_DEVCONF_MAX); } #define IN_DEV_CONF_GET(in_dev, attr) \ ipv4_devconf_get((in_dev), IPV4_DEVCONF_ ## attr) #define IN_DEV_CONF_SET(in_dev, attr, val) \ ipv4_devconf_set((in_dev), IPV4_DEVCONF_ ## attr, (val)) #define IN_DEV_ANDCONF(in_dev, attr) \ (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), attr) && \ IN_DEV_CONF_GET((in_dev), attr)) #define IN_DEV_NET_ORCONF(in_dev,
net, attr) \ (IPV4_DEVCONF_ALL_RO(net, attr) || \ IN_DEV_CONF_GET((in_dev), attr)) #define IN_DEV_ORCONF(in_dev, attr) \ IN_DEV_NET_ORCONF(in_dev, dev_net(in_dev->dev), attr) #define IN_DEV_MAXCONF(in_dev, attr) \ (max(IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), attr), \ IN_DEV_CONF_GET((in_dev), attr))) #define IN_DEV_FORWARD(in_dev) IN_DEV_CONF_GET((in_dev), FORWARDING) #define IN_DEV_MFORWARD(in_dev) IN_DEV_ANDCONF((in_dev), MC_FORWARDING) #define IN_DEV_BFORWARD(in_dev) IN_DEV_ANDCONF((in_dev), BC_FORWARDING) #define IN_DEV_RPFILTER(in_dev) IN_DEV_MAXCONF((in_dev), RP_FILTER) #define IN_DEV_SRC_VMARK(in_dev) IN_DEV_ORCONF((in_dev), SRC_VMARK) #define IN_DEV_SOURCE_ROUTE(in_dev) IN_DEV_ANDCONF((in_dev), \ ACCEPT_SOURCE_ROUTE) #define IN_DEV_ACCEPT_LOCAL(in_dev) IN_DEV_ORCONF((in_dev), ACCEPT_LOCAL) #define IN_DEV_BOOTP_RELAY(in_dev) IN_DEV_ANDCONF((in_dev), BOOTP_RELAY) #define IN_DEV_LOG_MARTIANS(in_dev) IN_DEV_ORCONF((in_dev), LOG_MARTIANS) #define IN_DEV_PROXY_ARP(in_dev) IN_DEV_ORCONF((in_dev), PROXY_ARP) #define IN_DEV_PROXY_ARP_PVLAN(in_dev) IN_DEV_ORCONF((in_dev), PROXY_ARP_PVLAN) #define IN_DEV_SHARED_MEDIA(in_dev) IN_DEV_ORCONF((in_dev), SHARED_MEDIA) #define IN_DEV_TX_REDIRECTS(in_dev) IN_DEV_ORCONF((in_dev), SEND_REDIRECTS) #define IN_DEV_SEC_REDIRECTS(in_dev) IN_DEV_ORCONF((in_dev), \ SECURE_REDIRECTS) #define IN_DEV_IDTAG(in_dev) IN_DEV_CONF_GET(in_dev, TAG) #define IN_DEV_MEDIUM_ID(in_dev) IN_DEV_CONF_GET(in_dev, MEDIUM_ID) #define IN_DEV_PROMOTE_SECONDARIES(in_dev) \ IN_DEV_ORCONF((in_dev), \ PROMOTE_SECONDARIES) #define IN_DEV_ROUTE_LOCALNET(in_dev) IN_DEV_ORCONF(in_dev, ROUTE_LOCALNET) #define IN_DEV_NET_ROUTE_LOCALNET(in_dev, net) \ IN_DEV_NET_ORCONF(in_dev, net, ROUTE_LOCALNET) #define IN_DEV_RX_REDIRECTS(in_dev) \ ((IN_DEV_FORWARD(in_dev) && \ IN_DEV_ANDCONF((in_dev), ACCEPT_REDIRECTS)) \ || (!IN_DEV_FORWARD(in_dev) && \ IN_DEV_ORCONF((in_dev), ACCEPT_REDIRECTS))) #define IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) \ IN_DEV_ORCONF((in_dev), IGNORE_ROUTES_WITH_LINKDOWN) #define IN_DEV_ARPFILTER(in_dev) IN_DEV_ORCONF((in_dev), ARPFILTER) #define IN_DEV_ARP_ACCEPT(in_dev) IN_DEV_MAXCONF((in_dev), ARP_ACCEPT) #define IN_DEV_ARP_ANNOUNCE(in_dev) IN_DEV_MAXCONF((in_dev), ARP_ANNOUNCE) #define IN_DEV_ARP_IGNORE(in_dev) IN_DEV_MAXCONF((in_dev), ARP_IGNORE) #define IN_DEV_ARP_NOTIFY(in_dev) IN_DEV_MAXCONF((in_dev), ARP_NOTIFY) #define IN_DEV_ARP_EVICT_NOCARRIER(in_dev) IN_DEV_ANDCONF((in_dev), \ ARP_EVICT_NOCARRIER) struct in_ifaddr { struct hlist_node addr_lst; struct in_ifaddr __rcu *ifa_next; struct in_device *ifa_dev; struct rcu_head rcu_head; __be32 ifa_local; __be32 ifa_address; __be32 ifa_mask; __u32 ifa_rt_priority; __be32 ifa_broadcast; unsigned char ifa_scope; unsigned char ifa_prefixlen; unsigned char ifa_proto; __u32 ifa_flags; char ifa_label[IFNAMSIZ]; /* In seconds, relative to tstamp. Expiry is at tstamp + HZ * lft. 
*/ __u32 ifa_valid_lft; __u32 ifa_preferred_lft; unsigned long ifa_cstamp; /* created timestamp */ unsigned long ifa_tstamp; /* updated timestamp */ }; struct in_validator_info { __be32 ivi_addr; struct in_device *ivi_dev; struct netlink_ext_ack *extack; }; int register_inetaddr_notifier(struct notifier_block *nb); int unregister_inetaddr_notifier(struct notifier_block *nb); int register_inetaddr_validator_notifier(struct notifier_block *nb); int unregister_inetaddr_validator_notifier(struct notifier_block *nb); void inet_netconf_notify_devconf(struct net *net, int event, int type, int ifindex, struct ipv4_devconf *devconf); struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref); static inline struct net_device *ip_dev_find(struct net *net, __be32 addr) { return __ip_dev_find(net, addr, true); } int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b); int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *); #ifdef CONFIG_INET int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size); #else static inline int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size) { return 0; } #endif void devinet_init(void); struct in_device *inetdev_by_index(struct net *, int); __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope); __be32 inet_confirm_addr(struct net *net, struct in_device *in_dev, __be32 dst, __be32 local, int scope); struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, __be32 mask); struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr); static inline bool inet_ifa_match(__be32 addr, const struct in_ifaddr *ifa) { return !((addr^ifa->ifa_address)&ifa->ifa_mask); } /* * Check if a mask is acceptable. */ static __inline__ bool bad_mask(__be32 mask, __be32 addr) { __u32 hmask; if (addr & (mask = ~mask)) return true; hmask = ntohl(mask); if (hmask & (hmask+1)) return true; return false; } #define in_dev_for_each_ifa_rtnl(ifa, in_dev) \ for (ifa = rtnl_dereference((in_dev)->ifa_list); ifa; \ ifa = rtnl_dereference(ifa->ifa_next)) #define in_dev_for_each_ifa_rtnl_net(net, ifa, in_dev) \ for (ifa = rtnl_net_dereference(net, (in_dev)->ifa_list); ifa; \ ifa = rtnl_net_dereference(net, ifa->ifa_next)) #define in_dev_for_each_ifa_rcu(ifa, in_dev) \ for (ifa = rcu_dereference((in_dev)->ifa_list); ifa; \ ifa = rcu_dereference(ifa->ifa_next)) static inline struct in_device *__in_dev_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->ip_ptr); } static inline struct in_device *in_dev_get(const struct net_device *dev) { struct in_device *in_dev; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (in_dev) refcount_inc(&in_dev->refcnt); rcu_read_unlock(); return in_dev; } static inline struct in_device *__in_dev_get_rtnl(const struct net_device *dev) { return rtnl_dereference(dev->ip_ptr); } static inline struct in_device *__in_dev_get_rtnl_net(const struct net_device *dev) { return rtnl_net_dereference(dev_net(dev), dev->ip_ptr); } /* called with rcu_read_lock or rtnl held */ static inline bool ip_ignore_linkdown(const struct net_device *dev) { struct in_device *in_dev; bool rc = false; in_dev = rcu_dereference_rtnl(dev->ip_ptr); if (in_dev && IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev)) rc = true; return rc; } static inline struct neigh_parms *__in_dev_arp_parms_get_rcu(const struct net_device *dev) { struct in_device *in_dev = __in_dev_get_rcu(dev); return in_dev ? 
in_dev->arp_parms : NULL; } void in_dev_finish_destroy(struct in_device *idev); static inline void in_dev_put(struct in_device *idev) { if (refcount_dec_and_test(&idev->refcnt)) in_dev_finish_destroy(idev); } #define __in_dev_put(idev) refcount_dec(&(idev)->refcnt) #define in_dev_hold(idev) refcount_inc(&(idev)->refcnt) #endif /* __KERNEL__ */ static __inline__ __be32 inet_make_mask(int logmask) { if (logmask) return htonl(~((1U<<(32-logmask))-1)); return 0; } static __inline__ int inet_mask_len(__be32 mask) { __u32 hmask = ntohl(mask); if (!hmask) return 0; return 32 - ffz(~hmask); } #endif /* _LINUX_INETDEVICE_H */
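/*
 * Usage sketch (not part of the header above): walking a device's IPv4
 * addresses with the RCU accessor and iterator declared above. The function
 * name is illustrative only.
 */
#include <linux/inetdevice.h>
#include <linux/rcupdate.h>

/* Return true if @dev has @addr configured as a local address. */
static bool example_dev_has_addr(struct net_device *dev, __be32 addr)
{
	struct in_device *in_dev;
	struct in_ifaddr *ifa;
	bool found = false;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(dev);
	if (in_dev) {
		in_dev_for_each_ifa_rcu(ifa, in_dev) {
			if (ifa->ifa_local == addr) {
				found = true;
				break;
			}
		}
	}
	rcu_read_unlock();
	return found;
}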
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_SPINLOCK_H #define __LINUX_SPINLOCK_H #define __LINUX_INSIDE_SPINLOCK_H /* * include/linux/spinlock.h - generic spinlock/rwlock declarations * * here's the role of the various spinlock/rwlock related include files: * * on SMP builds: * * asm/spinlock_types.h: contains the arch_spinlock_t/arch_rwlock_t and the * initializers * * linux/spinlock_types_raw: * The raw types and initializers * linux/spinlock_types.h: * defines the generic type and initializers * * asm/spinlock.h: contains the arch_spin_*()/etc. lowlevel * implementations, mostly inline assembly code * * (also included on UP-debug builds:) * * linux/spinlock_api_smp.h: * contains the prototypes for the _spin_*() APIs. * * linux/spinlock.h: builds the final spin_*() APIs. * * on UP builds: * * linux/spinlock_type_up.h: * contains the generic, simplified UP spinlock type.
* (which is an empty structure on non-debug builds) * * linux/spinlock_types_raw: * The raw RT types and initializers * linux/spinlock_types.h: * defines the generic type and initializers * * linux/spinlock_up.h: * contains the arch_spin_*()/etc. version of UP * builds. (which are NOPs on non-debug, non-preempt * builds) * * (included on UP-non-debug builds:) * * linux/spinlock_api_up.h: * builds the _spin_*() APIs. * * linux/spinlock.h: builds the final spin_*() APIs. */ #include <linux/typecheck.h> #include <linux/preempt.h> #include <linux/linkage.h> #include <linux/compiler.h> #include <linux/irqflags.h> #include <linux/thread_info.h> #include <linux/stringify.h> #include <linux/bottom_half.h> #include <linux/lockdep.h> #include <linux/cleanup.h> #include <asm/barrier.h> #include <asm/mmiowb.h> /* * Must define these before including other files, inline functions need them */ #define LOCK_SECTION_NAME ".text..lock."KBUILD_BASENAME #define LOCK_SECTION_START(extra) \ ".subsection 1\n\t" \ extra \ ".ifndef " LOCK_SECTION_NAME "\n\t" \ LOCK_SECTION_NAME ":\n\t" \ ".endif\n" #define LOCK_SECTION_END \ ".previous\n\t" #define __lockfunc __section(".spinlock.text") /* * Pull the arch_spinlock_t and arch_rwlock_t definitions: */ #include <linux/spinlock_types.h> /* * Pull the arch_spin*() functions/declarations (UP-nondebug doesn't need them): */ #ifdef CONFIG_SMP # include <asm/spinlock.h> #else # include <linux/spinlock_up.h> #endif #ifdef CONFIG_DEBUG_SPINLOCK extern void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, struct lock_class_key *key, short inner); # define raw_spin_lock_init(lock) \ do { \ static struct lock_class_key __key; \ \ __raw_spin_lock_init((lock), #lock, &__key, LD_WAIT_SPIN); \ } while (0) #else # define raw_spin_lock_init(lock) \ do { *(lock) = __RAW_SPIN_LOCK_UNLOCKED(lock); } while (0) #endif #define raw_spin_is_locked(lock) arch_spin_is_locked(&(lock)->raw_lock) #ifdef arch_spin_is_contended #define raw_spin_is_contended(lock) arch_spin_is_contended(&(lock)->raw_lock) #else #define raw_spin_is_contended(lock) (((void)(lock), 0)) #endif /*arch_spin_is_contended*/ /* * smp_mb__after_spinlock() provides the equivalent of a full memory barrier * between program-order earlier lock acquisitions and program-order later * memory accesses. * * This guarantees that the following two properties hold: * * 1) Given the snippet: * * { X = 0; Y = 0; } * * CPU0 CPU1 * * WRITE_ONCE(X, 1); WRITE_ONCE(Y, 1); * spin_lock(S); smp_mb(); * smp_mb__after_spinlock(); r1 = READ_ONCE(X); * r0 = READ_ONCE(Y); * spin_unlock(S); * * it is forbidden that CPU0 does not observe CPU1's store to Y (r0 = 0) * and CPU1 does not observe CPU0's store to X (r1 = 0); see the comments * preceding the call to smp_mb__after_spinlock() in __schedule() and in * try_to_wake_up(). * * 2) Given the snippet: * * { X = 0; Y = 0; } * * CPU0 CPU1 CPU2 * * spin_lock(S); spin_lock(S); r1 = READ_ONCE(Y); * WRITE_ONCE(X, 1); smp_mb__after_spinlock(); smp_rmb(); * spin_unlock(S); r0 = READ_ONCE(X); r2 = READ_ONCE(X); * WRITE_ONCE(Y, 1); * spin_unlock(S); * * it is forbidden that CPU0's critical section executes before CPU1's * critical section (r0 = 1), CPU2 observes CPU1's store to Y (r1 = 1) * and CPU2 does not observe CPU0's store to X (r2 = 0); see the comments * preceding the calls to smp_rmb() in try_to_wake_up() for similar * snippets but "projected" onto two CPUs. * * Property (2) upgrades the lock to an RCsc lock. 
* * Since most load-store architectures implement ACQUIRE with an smp_mb() after * the LL/SC loop, they need no further barriers. Similarly all our TSO * architectures imply an smp_mb() for each atomic instruction and equally don't * need more. * * Architectures that can implement ACQUIRE better need to take care. */ #ifndef smp_mb__after_spinlock #define smp_mb__after_spinlock() kcsan_mb() #endif #ifdef CONFIG_DEBUG_SPINLOCK extern void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock); extern int do_raw_spin_trylock(raw_spinlock_t *lock) __cond_acquires(true, lock); extern void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock); #else static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock) { __acquire(lock); arch_spin_lock(&lock->raw_lock); mmiowb_spin_lock(); } static inline int do_raw_spin_trylock(raw_spinlock_t *lock) __cond_acquires(true, lock) { int ret = arch_spin_trylock(&(lock)->raw_lock); if (ret) mmiowb_spin_lock(); return ret; } static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) { mmiowb_spin_unlock(); arch_spin_unlock(&lock->raw_lock); __release(lock); } #endif /* * Define the various spin_lock methods. Note we define these * regardless of whether CONFIG_SMP or CONFIG_PREEMPTION are set. The * various methods are defined as nops in the case they are not * required. */ #define raw_spin_trylock(lock) _raw_spin_trylock(lock) #define raw_spin_lock(lock) _raw_spin_lock(lock) #ifdef CONFIG_DEBUG_LOCK_ALLOC # define raw_spin_lock_nested(lock, subclass) \ _raw_spin_lock_nested(lock, subclass) # define raw_spin_lock_nest_lock(lock, nest_lock) \ do { \ typecheck(struct lockdep_map *, &(nest_lock)->dep_map);\ _raw_spin_lock_nest_lock(lock, &(nest_lock)->dep_map); \ } while (0) #else /* * Always evaluate the 'subclass' argument to avoid that the compiler * warns about set-but-not-used variables when building with * CONFIG_DEBUG_LOCK_ALLOC=n and with W=1. 
*/ # define raw_spin_lock_nested(lock, subclass) \ _raw_spin_lock(((void)(subclass), (lock))) # define raw_spin_lock_nest_lock(lock, nest_lock) _raw_spin_lock(lock) #endif #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) #define raw_spin_lock_irqsave(lock, flags) \ do { \ typecheck(unsigned long, flags); \ flags = _raw_spin_lock_irqsave(lock); \ } while (0) #ifdef CONFIG_DEBUG_LOCK_ALLOC #define raw_spin_lock_irqsave_nested(lock, flags, subclass) \ do { \ typecheck(unsigned long, flags); \ flags = _raw_spin_lock_irqsave_nested(lock, subclass); \ } while (0) #else #define raw_spin_lock_irqsave_nested(lock, flags, subclass) \ do { \ typecheck(unsigned long, flags); \ flags = _raw_spin_lock_irqsave(lock); \ } while (0) #endif #else #define raw_spin_lock_irqsave(lock, flags) \ do { \ typecheck(unsigned long, flags); \ _raw_spin_lock_irqsave(lock, flags); \ } while (0) #define raw_spin_lock_irqsave_nested(lock, flags, subclass) \ raw_spin_lock_irqsave(lock, flags) #endif #define raw_spin_lock_irq(lock) _raw_spin_lock_irq(lock) #define raw_spin_lock_bh(lock) _raw_spin_lock_bh(lock) #define raw_spin_unlock(lock) _raw_spin_unlock(lock) #define raw_spin_unlock_irq(lock) _raw_spin_unlock_irq(lock) #define raw_spin_unlock_irqrestore(lock, flags) \ do { \ typecheck(unsigned long, flags); \ _raw_spin_unlock_irqrestore(lock, flags); \ } while (0) #define raw_spin_unlock_bh(lock) _raw_spin_unlock_bh(lock) #define raw_spin_trylock_bh(lock) _raw_spin_trylock_bh(lock) #define raw_spin_trylock_irq(lock) _raw_spin_trylock_irq(lock) #define raw_spin_trylock_irqsave(lock, flags) _raw_spin_trylock_irqsave(lock, &(flags)) #ifndef CONFIG_PREEMPT_RT /* Include rwlock functions for !RT */ #include <linux/rwlock.h> #endif /* * Pull the _spin_*()/_read_*()/_write_*() functions/declarations: */ #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) # include <linux/spinlock_api_smp.h> #else # include <linux/spinlock_api_up.h> #endif /* Non PREEMPT_RT kernel, map to raw spinlocks: */ #ifndef CONFIG_PREEMPT_RT /* * Map the spin_lock functions to the raw variants for PREEMPT_RT=n */ static __always_inline raw_spinlock_t *spinlock_check(spinlock_t *lock) { return &lock->rlock; } #ifdef CONFIG_DEBUG_SPINLOCK # define spin_lock_init(lock) \ do { \ static struct lock_class_key __key; \ \ __raw_spin_lock_init(spinlock_check(lock), \ #lock, &__key, LD_WAIT_CONFIG); \ } while (0) #else # define spin_lock_init(_lock) \ do { \ spinlock_check(_lock); \ *(_lock) = __SPIN_LOCK_UNLOCKED(_lock); \ } while (0) #endif static __always_inline void spin_lock(spinlock_t *lock) __acquires(lock) __no_context_analysis { raw_spin_lock(&lock->rlock); } static __always_inline void spin_lock_bh(spinlock_t *lock) __acquires(lock) __no_context_analysis { raw_spin_lock_bh(&lock->rlock); } static __always_inline int spin_trylock(spinlock_t *lock) __cond_acquires(true, lock) __no_context_analysis { return raw_spin_trylock(&lock->rlock); } #define spin_lock_nested(lock, subclass) \ do { \ raw_spin_lock_nested(spinlock_check(lock), subclass); \ __release(spinlock_check(lock)); __acquire(lock); \ } while (0) #define spin_lock_nest_lock(lock, nest_lock) \ do { \ raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock); \ __release(spinlock_check(lock)); __acquire(lock); \ } while (0) static __always_inline void spin_lock_irq(spinlock_t *lock) __acquires(lock) __no_context_analysis { raw_spin_lock_irq(&lock->rlock); } #define spin_lock_irqsave(lock, flags) \ do { \ raw_spin_lock_irqsave(spinlock_check(lock), flags); \ 
__release(spinlock_check(lock)); __acquire(lock); \ } while (0) #define spin_lock_irqsave_nested(lock, flags, subclass) \ do { \ raw_spin_lock_irqsave_nested(spinlock_check(lock), flags, subclass); \ __release(spinlock_check(lock)); __acquire(lock); \ } while (0) static __always_inline void spin_unlock(spinlock_t *lock) __releases(lock) __no_context_analysis { raw_spin_unlock(&lock->rlock); } static __always_inline void spin_unlock_bh(spinlock_t *lock) __releases(lock) __no_context_analysis { raw_spin_unlock_bh(&lock->rlock); } static __always_inline void spin_unlock_irq(spinlock_t *lock) __releases(lock) __no_context_analysis { raw_spin_unlock_irq(&lock->rlock); } static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) __releases(lock) __no_context_analysis { raw_spin_unlock_irqrestore(&lock->rlock, flags); } static __always_inline int spin_trylock_bh(spinlock_t *lock) __cond_acquires(true, lock) __no_context_analysis { return raw_spin_trylock_bh(&lock->rlock); } static __always_inline int spin_trylock_irq(spinlock_t *lock) __cond_acquires(true, lock) __no_context_analysis { return raw_spin_trylock_irq(&lock->rlock); } static __always_inline bool _spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags) __cond_acquires(true, lock) __no_context_analysis { return raw_spin_trylock_irqsave(spinlock_check(lock), *flags); } #define spin_trylock_irqsave(lock, flags) _spin_trylock_irqsave(lock, &(flags)) /** * spin_is_locked() - Check whether a spinlock is locked. * @lock: Pointer to the spinlock. * * This function is NOT required to provide any memory ordering * guarantees; it could be used for debugging purposes or, when * additional synchronization is needed, accompanied with other * constructs (memory barriers) enforcing the synchronization. * * Returns: 1 if @lock is locked, 0 otherwise. * * Note that the function only tells you that the spinlock is * seen to be locked, not that it is locked on your CPU. * * Further, on CONFIG_SMP=n builds with CONFIG_DEBUG_SPINLOCK=n, * the return value is always 0 (see include/linux/spinlock_up.h). * Therefore you should not rely heavily on the return value. */ static __always_inline int spin_is_locked(spinlock_t *lock) { return raw_spin_is_locked(&lock->rlock); } static __always_inline int spin_is_contended(spinlock_t *lock) { return raw_spin_is_contended(&lock->rlock); } #define assert_spin_locked(lock) assert_raw_spin_locked(&(lock)->rlock) #else /* !CONFIG_PREEMPT_RT */ # include <linux/spinlock_rt.h> #endif /* CONFIG_PREEMPT_RT */ /* * Does a critical section need to be broken due to another * task waiting?: (technically does not depend on CONFIG_PREEMPTION, * but a general need for low latency) */ static inline int spin_needbreak(spinlock_t *lock) { if (!preempt_model_preemptible()) return 0; return spin_is_contended(lock); } /* * Check if a rwlock is contended. * Returns non-zero if there is another task waiting on the rwlock. * Returns zero if the lock is not contended or the system / underlying * rwlock implementation does not support contention detection. * Technically does not depend on CONFIG_PREEMPTION, but a general need * for low latency. 
*/ static inline int rwlock_needbreak(rwlock_t *lock) { if (!preempt_model_preemptible()) return 0; return rwlock_is_contended(lock); } /* * Pull the atomic_t declaration: * (asm-mips/atomic.h needs above definitions) */ #include <linux/atomic.h> /** * atomic_dec_and_lock - lock on reaching reference count zero * @atomic: the atomic counter * @lock: the spinlock in question * * Decrements @atomic by 1. If the result is 0, returns true and locks * @lock. Returns false for all other cases. */ extern int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) __cond_acquires(true, lock); extern int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock, unsigned long *flags) __cond_acquires(true, lock); #define atomic_dec_and_lock_irqsave(atomic, lock, flags) _atomic_dec_and_lock_irqsave(atomic, lock, &(flags)) extern int atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock) __cond_acquires(true, lock); extern int _atomic_dec_and_raw_lock_irqsave(atomic_t *atomic, raw_spinlock_t *lock, unsigned long *flags) __cond_acquires(true, lock); #define atomic_dec_and_raw_lock_irqsave(atomic, lock, flags) _atomic_dec_and_raw_lock_irqsave(atomic, lock, &(flags)) int __alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *lock_mask, size_t max_size, unsigned int cpu_mult, gfp_t gfp, const char *name, struct lock_class_key *key); #define alloc_bucket_spinlocks(locks, lock_mask, max_size, cpu_mult, gfp) \ ({ \ static struct lock_class_key key; \ int ret; \ \ ret = __alloc_bucket_spinlocks(locks, lock_mask, max_size, \ cpu_mult, gfp, #locks, &key); \ ret; \ }) void free_bucket_spinlocks(spinlock_t *locks); DEFINE_LOCK_GUARD_1(raw_spinlock, raw_spinlock_t, raw_spin_lock(_T->lock), raw_spin_unlock(_T->lock)) DECLARE_LOCK_GUARD_1_ATTRS(raw_spinlock, __acquires(_T), __releases(*(raw_spinlock_t **)_T)) #define class_raw_spinlock_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(raw_spinlock, _T) DEFINE_LOCK_GUARD_1_COND(raw_spinlock, _try, raw_spin_trylock(_T->lock)) DECLARE_LOCK_GUARD_1_ATTRS(raw_spinlock_try, __acquires(_T), __releases(*(raw_spinlock_t **)_T)) #define class_raw_spinlock_try_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(raw_spinlock_try, _T) DEFINE_LOCK_GUARD_1(raw_spinlock_nested, raw_spinlock_t, raw_spin_lock_nested(_T->lock, SINGLE_DEPTH_NESTING), raw_spin_unlock(_T->lock)) DECLARE_LOCK_GUARD_1_ATTRS(raw_spinlock_nested, __acquires(_T), __releases(*(raw_spinlock_t **)_T)) #define class_raw_spinlock_nested_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(raw_spinlock_nested, _T) DEFINE_LOCK_GUARD_1(raw_spinlock_irq, raw_spinlock_t, raw_spin_lock_irq(_T->lock), raw_spin_unlock_irq(_T->lock)) DECLARE_LOCK_GUARD_1_ATTRS(raw_spinlock_irq, __acquires(_T), __releases(*(raw_spinlock_t **)_T)) #define class_raw_spinlock_irq_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(raw_spinlock_irq, _T) DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irq, _try, raw_spin_trylock_irq(_T->lock)) DECLARE_LOCK_GUARD_1_ATTRS(raw_spinlock_irq_try, __acquires(_T), __releases(*(raw_spinlock_t **)_T)) #define class_raw_spinlock_irq_try_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(raw_spinlock_irq_try, _T) DEFINE_LOCK_GUARD_1(raw_spinlock_bh, raw_spinlock_t, raw_spin_lock_bh(_T->lock), raw_spin_unlock_bh(_T->lock)) DECLARE_LOCK_GUARD_1_ATTRS(raw_spinlock_bh, __acquires(_T), __releases(*(raw_spinlock_t **)_T)) #define class_raw_spinlock_bh_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(raw_spinlock_bh, _T) DEFINE_LOCK_GUARD_1_COND(raw_spinlock_bh, _try, raw_spin_trylock_bh(_T->lock)) DECLARE_LOCK_GUARD_1_ATTRS(raw_spinlock_bh_try, __acquires(_T), 
__releases(*(raw_spinlock_t **)_T)) #define class_raw_spinlock_bh_try_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(raw_spinlock_bh_try, _T) DEFINE_LOCK_GUARD_1(raw_spinlock_irqsave, raw_spinlock_t, raw_spin_lock_irqsave(_T->lock, _T->flags), raw_spin_unlock_irqrestore(_T->lock, _T->flags), unsigned long flags) DECLARE_LOCK_GUARD_1_ATTRS(raw_spinlock_irqsave, __acquires(_T), __releases(*(raw_spinlock_t **)_T)) #define class_raw_spinlock_irqsave_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(raw_spinlock_irqsave, _T) DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irqsave, _try, raw_spin_trylock_irqsave(_T->lock, _T->flags)) DECLARE_LOCK_GUARD_1_ATTRS(raw_spinlock_irqsave_try, __acquires(_T), __releases(*(raw_spinlock_t **)_T)) #define class_raw_spinlock_irqsave_try_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(raw_spinlock_irqsave_try, _T) DEFINE_LOCK_GUARD_1(raw_spinlock_init, raw_spinlock_t, raw_spin_lock_init(_T->lock), /* */) DECLARE_LOCK_GUARD_1_ATTRS(raw_spinlock_init, __acquires(_T), __releases(*(raw_spinlock_t **)_T)) #define class_raw_spinlock_init_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(raw_spinlock_init, _T) DEFINE_LOCK_GUARD_1(spinlock, spinlock_t, spin_lock(_T->lock), spin_unlock(_T->lock)) DECLARE_LOCK_GUARD_1_ATTRS(spinlock, __acquires(_T), __releases(*(spinlock_t **)_T)) #define class_spinlock_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(spinlock, _T) DEFINE_LOCK_GUARD_1_COND(spinlock, _try, spin_trylock(_T->lock)) DECLARE_LOCK_GUARD_1_ATTRS(spinlock_try, __acquires(_T), __releases(*(spinlock_t **)_T)) #define class_spinlock_try_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(spinlock_try, _T) DEFINE_LOCK_GUARD_1(spinlock_irq, spinlock_t, spin_lock_irq(_T->lock), spin_unlock_irq(_T->lock)) DECLARE_LOCK_GUARD_1_ATTRS(spinlock_irq, __acquires(_T), __releases(*(spinlock_t **)_T)) #define class_spinlock_irq_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(spinlock_irq, _T) DEFINE_LOCK_GUARD_1_COND(spinlock_irq, _try, spin_trylock_irq(_T->lock)) DECLARE_LOCK_GUARD_1_ATTRS(spinlock_irq_try, __acquires(_T), __releases(*(spinlock_t **)_T)) #define class_spinlock_irq_try_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(spinlock_irq_try, _T) DEFINE_LOCK_GUARD_1(spinlock_bh, spinlock_t, spin_lock_bh(_T->lock), spin_unlock_bh(_T->lock)) DECLARE_LOCK_GUARD_1_ATTRS(spinlock_bh, __acquires(_T), __releases(*(spinlock_t **)_T)) #define class_spinlock_bh_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(spinlock_bh, _T) DEFINE_LOCK_GUARD_1_COND(spinlock_bh, _try, spin_trylock_bh(_T->lock)) DECLARE_LOCK_GUARD_1_ATTRS(spinlock_bh_try, __acquires(_T), __releases(*(spinlock_t **)_T)) #define class_spinlock_bh_try_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(spinlock_bh_try, _T) DEFINE_LOCK_GUARD_1(spinlock_irqsave, spinlock_t, spin_lock_irqsave(_T->lock, _T->flags), spin_unlock_irqrestore(_T->lock, _T->flags), unsigned long flags) DECLARE_LOCK_GUARD_1_ATTRS(spinlock_irqsave, __acquires(_T), __releases(*(spinlock_t **)_T)) #define class_spinlock_irqsave_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(spinlock_irqsave, _T) DEFINE_LOCK_GUARD_1_COND(spinlock_irqsave, _try, spin_trylock_irqsave(_T->lock, _T->flags)) DECLARE_LOCK_GUARD_1_ATTRS(spinlock_irqsave_try, __acquires(_T), __releases(*(spinlock_t **)_T)) #define class_spinlock_irqsave_try_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(spinlock_irqsave_try, _T) DEFINE_LOCK_GUARD_1(spinlock_init, spinlock_t, spin_lock_init(_T->lock), /* */) DECLARE_LOCK_GUARD_1_ATTRS(spinlock_init, __acquires(_T), __releases(*(spinlock_t **)_T)) #define class_spinlock_init_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(spinlock_init, _T) 
DEFINE_LOCK_GUARD_1(read_lock, rwlock_t, read_lock(_T->lock), read_unlock(_T->lock)) DECLARE_LOCK_GUARD_1_ATTRS(read_lock, __acquires(_T), __releases(*(rwlock_t **)_T)) #define class_read_lock_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(read_lock, _T) DEFINE_LOCK_GUARD_1(read_lock_irq, rwlock_t, read_lock_irq(_T->lock), read_unlock_irq(_T->lock)) DECLARE_LOCK_GUARD_1_ATTRS(read_lock_irq, __acquires(_T), __releases(*(rwlock_t **)_T)) #define class_read_lock_irq_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(read_lock_irq, _T) DEFINE_LOCK_GUARD_1(read_lock_irqsave, rwlock_t, read_lock_irqsave(_T->lock, _T->flags), read_unlock_irqrestore(_T->lock, _T->flags), unsigned long flags) DECLARE_LOCK_GUARD_1_ATTRS(read_lock_irqsave, __acquires(_T), __releases(*(rwlock_t **)_T)) #define class_read_lock_irqsave_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(read_lock_irqsave, _T) DEFINE_LOCK_GUARD_1(write_lock, rwlock_t, write_lock(_T->lock), write_unlock(_T->lock)) DECLARE_LOCK_GUARD_1_ATTRS(write_lock, __acquires(_T), __releases(*(rwlock_t **)_T)) #define class_write_lock_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(write_lock, _T) DEFINE_LOCK_GUARD_1(write_lock_irq, rwlock_t, write_lock_irq(_T->lock), write_unlock_irq(_T->lock)) DECLARE_LOCK_GUARD_1_ATTRS(write_lock_irq, __acquires(_T), __releases(*(rwlock_t **)_T)) #define class_write_lock_irq_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(write_lock_irq, _T) DEFINE_LOCK_GUARD_1(write_lock_irqsave, rwlock_t, write_lock_irqsave(_T->lock, _T->flags), write_unlock_irqrestore(_T->lock, _T->flags), unsigned long flags) DECLARE_LOCK_GUARD_1_ATTRS(write_lock_irqsave, __acquires(_T), __releases(*(rwlock_t **)_T)) #define class_write_lock_irqsave_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(write_lock_irqsave, _T) DEFINE_LOCK_GUARD_1(rwlock_init, rwlock_t, rwlock_init(_T->lock), /* */) DECLARE_LOCK_GUARD_1_ATTRS(rwlock_init, __acquires(_T), __releases(*(rwlock_t **)_T)) #define class_rwlock_init_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(rwlock_init, _T) #undef __LINUX_INSIDE_SPINLOCK_H #endif /* __LINUX_SPINLOCK_H */ |
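/*
 * Usage sketch (not part of the header above): the classic
 * spin_lock_irqsave() pattern next to its scope-based guard() equivalent
 * built from the DEFINE_LOCK_GUARD_1() definitions above. The structure and
 * counter are illustrative only; the lock is assumed to have been set up
 * with spin_lock_init() elsewhere.
 */
#include <linux/spinlock.h>

struct example_counter {
	spinlock_t lock;
	unsigned long value;
};

static void example_inc_classic(struct example_counter *c)
{
	unsigned long flags;

	spin_lock_irqsave(&c->lock, flags);	/* also disables local IRQs */
	c->value++;
	spin_unlock_irqrestore(&c->lock, flags);
}

static void example_inc_guarded(struct example_counter *c)
{
	/* The lock is dropped automatically when the scope ends. */
	guard(spinlock_irqsave)(&c->lock);
	c->value++;
}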
// SPDX-License-Identifier: GPL-2.0 /* * Functions to sequence PREFLUSH and FUA writes. * * Copyright (C) 2011 Max Planck Institute for Gravitational Physics * Copyright (C) 2011 Tejun Heo <tj@kernel.org> * * REQ_{PREFLUSH|FUA} requests are decomposed to sequences consisting of three * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request * properties and hardware capability. * * If a request doesn't have data, only REQ_PREFLUSH makes sense, which * indicates a simple flush request. If there is data, REQ_PREFLUSH indicates * that the device cache should be flushed before the data is executed, and * REQ_FUA means that the data must be on non-volatile media on request * completion. * * If the device doesn't have writeback cache, PREFLUSH and FUA don't make any * difference. The requests are either completed immediately if there's no data * or executed as normal requests otherwise. * * If the device has writeback cache and supports FUA, REQ_PREFLUSH is * translated to PREFLUSH but REQ_FUA is passed down directly with DATA. * * If the device has writeback cache and doesn't support FUA, REQ_PREFLUSH * is translated to PREFLUSH and REQ_FUA to POSTFLUSH. * * The actual execution of flush is double buffered. Whenever a request * needs to execute PRE or POSTFLUSH, it queues at * fq->flush_queue[fq->flush_pending_idx]. Once certain criteria are met, a * REQ_OP_FLUSH is issued and the pending_idx is toggled.
When the flush * completes, all the requests which were pending are proceeded to the next * step. This allows arbitrary merging of different types of PREFLUSH/FUA * requests. * * Currently, the following conditions are used to determine when to issue * flush. * * C1. At any given time, only one flush shall be in progress. This makes * double buffering sufficient. * * C2. Flush is deferred if any request is executing DATA of its sequence. * This avoids issuing separate POSTFLUSHes for requests which shared * PREFLUSH. * * C3. The second condition is ignored if there is a request which has * waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid * starvation in the unlikely case where there are continuous stream of * FUA (without PREFLUSH) requests. * * For devices which support FUA, it isn't clear whether C2 (and thus C3) * is beneficial. * * Note that a sequenced PREFLUSH/FUA request with DATA is completed twice. * Once while executing DATA and again after the whole sequence is * complete. The first completion updates the contained bio but doesn't * finish it so that the bio submitter is notified only after the whole * sequence is complete. This is implemented by testing RQF_FLUSH_SEQ in * req_bio_endio(). * * The above peculiarity requires that each PREFLUSH/FUA request has only one * bio attached to it, which is guaranteed as they aren't allowed to be * merged in the usual way. */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/gfp.h> #include <linux/part_stat.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-sched.h" /* PREFLUSH/FUA sequences */ enum { REQ_FSEQ_PREFLUSH = (1 << 0), /* pre-flushing in progress */ REQ_FSEQ_DATA = (1 << 1), /* data write in progress */ REQ_FSEQ_POSTFLUSH = (1 << 2), /* post-flushing in progress */ REQ_FSEQ_DONE = (1 << 3), REQ_FSEQ_ACTIONS = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH, /* * If flush has been pending longer than the following timeout, * it's issued even if flush_data requests are still in flight. */ FLUSH_PENDING_TIMEOUT = 5 * HZ, }; static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, blk_opf_t flags); static inline struct blk_flush_queue * blk_get_flush_queue(struct blk_mq_ctx *ctx) { return blk_mq_map_queue(REQ_OP_FLUSH, ctx)->fq; } static unsigned int blk_flush_cur_seq(struct request *rq) { return 1 << ffz(rq->flush.seq); } static void blk_flush_restore_request(struct request *rq) { /* * After flush data completion, @rq->bio is %NULL but we need to * complete the bio again. @rq->biotail is guaranteed to equal the * original @rq->bio. Restore it. */ rq->bio = rq->biotail; if (rq->bio) rq->__sector = rq->bio->bi_iter.bi_sector; /* make @rq a normal request */ rq->rq_flags &= ~RQF_FLUSH_SEQ; rq->end_io = rq->flush.saved_end_io; } static void blk_account_io_flush(struct request *rq) { struct block_device *part = rq->q->disk->part0; part_stat_lock(); part_stat_inc(part, ios[STAT_FLUSH]); part_stat_add(part, nsecs[STAT_FLUSH], blk_time_get_ns() - rq->start_time_ns); part_stat_unlock(); } /** * blk_flush_complete_seq - complete flush sequence * @rq: PREFLUSH/FUA request being sequenced * @fq: flush queue * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero) * @error: whether an error occurred * * @rq just completed @seq part of its flush sequence, record the * completion and trigger the next step. 
* * CONTEXT: * spin_lock_irq(fq->mq_flush_lock) */ static void blk_flush_complete_seq(struct request *rq, struct blk_flush_queue *fq, unsigned int seq, blk_status_t error) { struct request_queue *q = rq->q; struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; blk_opf_t cmd_flags; BUG_ON(rq->flush.seq & seq); rq->flush.seq |= seq; cmd_flags = rq->cmd_flags; if (likely(!error)) seq = blk_flush_cur_seq(rq); else seq = REQ_FSEQ_DONE; switch (seq) { case REQ_FSEQ_PREFLUSH: case REQ_FSEQ_POSTFLUSH: /* queue for flush */ if (list_empty(pending)) fq->flush_pending_since = jiffies; list_add_tail(&rq->queuelist, pending); break; case REQ_FSEQ_DATA: fq->flush_data_in_flight++; spin_lock(&q->requeue_lock); list_move(&rq->queuelist, &q->requeue_list); spin_unlock(&q->requeue_lock); blk_mq_kick_requeue_list(q); break; case REQ_FSEQ_DONE: /* * @rq was previously adjusted by blk_insert_flush() for * flush sequencing and may already have gone through the * flush data request completion path. Restore @rq for * normal completion and end it. */ list_del_init(&rq->queuelist); blk_flush_restore_request(rq); blk_mq_end_request(rq, error); break; default: BUG(); } blk_kick_flush(q, fq, cmd_flags); } static enum rq_end_io_ret flush_end_io(struct request *flush_rq, blk_status_t error, const struct io_comp_batch *iob) { struct request_queue *q = flush_rq->q; struct list_head *running; struct request *rq, *n; unsigned long flags = 0; struct blk_flush_queue *fq = blk_get_flush_queue(flush_rq->mq_ctx); /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); if (!req_ref_put_and_test(flush_rq)) { fq->rq_status = error; spin_unlock_irqrestore(&fq->mq_flush_lock, flags); return RQ_END_IO_NONE; } blk_account_io_flush(flush_rq); /* * Flush request has to be marked as IDLE when it is really ended * because its .end_io() is called from timeout code path too for * avoiding use-after-free. */ WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE); if (fq->rq_status != BLK_STS_OK) { error = fq->rq_status; fq->rq_status = BLK_STS_OK; } if (!q->elevator) { flush_rq->tag = BLK_MQ_NO_TAG; } else { blk_mq_put_driver_tag(flush_rq); flush_rq->internal_tag = BLK_MQ_NO_TAG; } running = &fq->flush_queue[fq->flush_running_idx]; BUG_ON(fq->flush_pending_idx == fq->flush_running_idx); /* account completion of the flush request */ fq->flush_running_idx ^= 1; /* and push the waiting requests to the next stage */ list_for_each_entry_safe(rq, n, running, queuelist) { unsigned int seq = blk_flush_cur_seq(rq); BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); list_del_init(&rq->queuelist); blk_flush_complete_seq(rq, fq, seq, error); } spin_unlock_irqrestore(&fq->mq_flush_lock, flags); return RQ_END_IO_NONE; } bool is_flush_rq(struct request *rq) { return rq->end_io == flush_end_io; } /** * blk_kick_flush - consider issuing flush request * @q: request_queue being kicked * @fq: flush queue * @flags: cmd_flags of the original request * * Flush related states of @q have changed, consider issuing flush request. * Please read the comment at the top of this file for more info. 
* * CONTEXT: * spin_lock_irq(fq->mq_flush_lock) * */ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, blk_opf_t flags) { struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; struct request *first_rq = list_first_entry(pending, struct request, queuelist); struct request *flush_rq = fq->flush_rq; /* C1 described at the top of this file */ if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending)) return; /* C2 and C3 */ if (fq->flush_data_in_flight && time_before(jiffies, fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) return; /* * Issue flush and toggle pending_idx. This makes pending_idx * different from running_idx, which means flush is in flight. */ fq->flush_pending_idx ^= 1; blk_rq_init(q, flush_rq); /* * In case of none scheduler, borrow tag from the first request * since they can't be in flight at the same time. And acquire * the tag's ownership for flush req. * * In case of IO scheduler, flush rq need to borrow scheduler tag * just for cheating put/get driver tag. */ flush_rq->mq_ctx = first_rq->mq_ctx; flush_rq->mq_hctx = first_rq->mq_hctx; if (!q->elevator) flush_rq->tag = first_rq->tag; else flush_rq->internal_tag = first_rq->internal_tag; flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK); flush_rq->rq_flags |= RQF_FLUSH_SEQ; flush_rq->end_io = flush_end_io; /* * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one * implied in refcount_inc_not_zero() called from * blk_mq_find_and_get_req(), which orders WRITE/READ flush_rq->ref * and READ flush_rq->end_io */ smp_wmb(); req_ref_set(flush_rq, 1); spin_lock(&q->requeue_lock); list_add_tail(&flush_rq->queuelist, &q->flush_list); spin_unlock(&q->requeue_lock); blk_mq_kick_requeue_list(q); } static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, blk_status_t error, const struct io_comp_batch *iob) { struct request_queue *q = rq->q; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct blk_mq_ctx *ctx = rq->mq_ctx; unsigned long flags; struct blk_flush_queue *fq = blk_get_flush_queue(ctx); if (q->elevator) { WARN_ON(rq->tag < 0); blk_mq_put_driver_tag(rq); } /* * After populating an empty queue, kick it to avoid stall. Read * the comment in flush_end_io(). */ spin_lock_irqsave(&fq->mq_flush_lock, flags); fq->flush_data_in_flight--; /* * May have been corrupted by rq->rq_next reuse, we need to * re-initialize rq->queuelist before reusing it here. */ INIT_LIST_HEAD(&rq->queuelist); blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error); spin_unlock_irqrestore(&fq->mq_flush_lock, flags); blk_mq_sched_restart(hctx); return RQ_END_IO_NONE; } static void blk_rq_init_flush(struct request *rq) { rq->flush.seq = 0; rq->rq_flags |= RQF_FLUSH_SEQ; rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ rq->end_io = mq_flush_data_end_io; } /* * Insert a PREFLUSH/FUA request into the flush state machine. * Returns true if the request has been consumed by the flush state machine, * or false if the caller should continue to process it. */ bool blk_insert_flush(struct request *rq) { struct request_queue *q = rq->q; struct blk_flush_queue *fq = blk_get_flush_queue(rq->mq_ctx); bool supports_fua = q->limits.features & BLK_FEAT_FUA; unsigned int policy = 0; /* FLUSH/FUA request must never be merged */ WARN_ON_ONCE(rq->bio != rq->biotail); if (blk_rq_sectors(rq)) policy |= REQ_FSEQ_DATA; /* * Check which flushes we need to sequence for this operation. 
*/ if (blk_queue_write_cache(q)) { if (rq->cmd_flags & REQ_PREFLUSH) policy |= REQ_FSEQ_PREFLUSH; if ((rq->cmd_flags & REQ_FUA) && !supports_fua) policy |= REQ_FSEQ_POSTFLUSH; } /* * @policy now records what operations need to be done. Adjust * REQ_PREFLUSH and FUA for the driver. */ rq->cmd_flags &= ~REQ_PREFLUSH; if (!supports_fua) rq->cmd_flags &= ~REQ_FUA; /* * REQ_PREFLUSH|REQ_FUA implies REQ_SYNC, so if we clear any * of those flags, we have to set REQ_SYNC to avoid skewing * the request accounting. */ rq->cmd_flags |= REQ_SYNC; switch (policy) { case 0: /* * An empty flush handed down from a stacking driver may * translate into nothing if the underlying device does not * advertise a write-back cache. In this case, simply * complete the request. */ blk_mq_end_request(rq, 0); return true; case REQ_FSEQ_DATA: /* * If there's data, but no flush is necessary, the request can * be processed directly without going through flush machinery. * Queue for normal execution. */ return false; case REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH: /* * Initialize the flush fields and completion handler to trigger * the post flush, and then just pass the command on. */ blk_rq_init_flush(rq); rq->flush.seq |= REQ_FSEQ_PREFLUSH; spin_lock_irq(&fq->mq_flush_lock); fq->flush_data_in_flight++; spin_unlock_irq(&fq->mq_flush_lock); return false; default: /* * Mark the request as part of a flush sequence and submit it * for further processing to the flush state machine. */ blk_rq_init_flush(rq); spin_lock_irq(&fq->mq_flush_lock); blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); spin_unlock_irq(&fq->mq_flush_lock); return true; } } /** * blkdev_issue_flush - queue a flush * @bdev: blockdev to issue flush for * * Description: * Issue a flush for the block device in question. */ int blkdev_issue_flush(struct block_device *bdev) { struct bio bio; bio_init(&bio, bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH); return submit_bio_wait(&bio); } EXPORT_SYMBOL(blkdev_issue_flush); struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, gfp_t flags) { struct blk_flush_queue *fq; int rq_sz = sizeof(struct request); fq = kzalloc_node(sizeof(*fq), flags, node); if (!fq) goto fail; spin_lock_init(&fq->mq_flush_lock); rq_sz = round_up(rq_sz + cmd_size, cache_line_size()); fq->flush_rq = kzalloc_node(rq_sz, flags, node); if (!fq->flush_rq) goto fail_rq; INIT_LIST_HEAD(&fq->flush_queue[0]); INIT_LIST_HEAD(&fq->flush_queue[1]); return fq; fail_rq: kfree(fq); fail: return NULL; } void blk_free_flush_queue(struct blk_flush_queue *fq) { /* bio based request queue hasn't flush queue */ if (!fq) return; kfree(fq->flush_rq); kfree(fq); } /* * Allow driver to set its own lock class to fq->mq_flush_lock for * avoiding lockdep complaint. * * flush_end_io() may be called recursively from some driver, such as * nvme-loop, so lockdep may complain 'possible recursive locking' because * all 'struct blk_flush_queue' instance share same mq_flush_lock lock class * key. We need to assign different lock class for these driver's * fq->mq_flush_lock for avoiding the lockdep warning. * * Use dynamically allocated lock class key for each 'blk_flush_queue' * instance is over-kill, and more worse it introduces horrible boot delay * issue because synchronize_rcu() is implied in lockdep_unregister_key which * is called for each hctx release. SCSI probing may synchronously create and * destroy lots of MQ request_queues for non-existent devices, and some robot * test kernel always enable lockdep option. 
It is observed that more than half * an hour is taken during SCSI MQ probe with per-fq lock class. */ void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, struct lock_class_key *key) { lockdep_set_class(&hctx->fq->mq_flush_lock, key); } EXPORT_SYMBOL_GPL(blk_mq_hctx_set_fq_lock_class);
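/*
 * Usage sketch (not part of the file above): issuing an explicit cache flush
 * through the exported blkdev_issue_flush() helper defined above. The calling
 * function is illustrative only.
 */
#include <linux/blkdev.h>
#include <linux/printk.h>

/* Make sure previously written data has reached stable media on @bdev. */
static int example_sync_cache(struct block_device *bdev)
{
	int err = blkdev_issue_flush(bdev);

	if (err)
		pr_warn("cache flush failed: %d\n", err);
	return err;
}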
#ifndef _LINUX_JHASH_H #define _LINUX_JHASH_H /* jhash.h: Jenkins hash support. * * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net) * * https://burtleburtle.net/bob/hash/ * * These are the credits from Bob's sources: * * lookup3.c, by Bob Jenkins, May 2006, Public Domain. * * These are functions for producing 32-bit hashes for hash table lookup. * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() * are externally useful functions. Routines to test the hash are included * if SELF_TEST is defined. You can use this free for any purpose. It's in * the public domain. It has no warranty. * * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@netfilter.org) * * I've modified Bob's hash to be useful in the Linux kernel, and * any bugs present are my fault. * Jozsef */ #include <linux/bitops.h> #include <linux/unaligned.h> /* Best hash sizes are of power of two */ #define jhash_size(n) ((u32)1<<(n)) /* Mask the hash value, i.e (value & jhash_mask(n)) instead of (value % n) */ #define jhash_mask(n) (jhash_size(n)-1) /* __jhash_mix - mix 3 32-bit values reversibly. */ #define __jhash_mix(a, b, c) \ { \ a -= c; a ^= rol32(c, 4); c += b; \ b -= a; b ^= rol32(a, 6); a += c; \ c -= b; c ^= rol32(b, 8); b += a; \ a -= c; a ^= rol32(c, 16); c += b; \ b -= a; b ^= rol32(a, 19); a += c; \ c -= b; c ^= rol32(b, 4); b += a; \ } /* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */ #define __jhash_final(a, b, c) \ { \ c ^= b; c -= rol32(b, 14); \ a ^= c; a -= rol32(c, 11); \ b ^= a; b -= rol32(a, 25); \ c ^= b; c -= rol32(b, 16); \ a ^= c; a -= rol32(c, 4); \ b ^= a; b -= rol32(a, 14); \ c ^= b; c -= rol32(b, 24); \ } /* An arbitrary initial parameter */ #define JHASH_INITVAL 0xdeadbeef /* jhash - hash an arbitrary key * @k: sequence of bytes as key * @length: the length of the key * @initval: the previous hash, or an arbitrary value * * The generic version, hashes an arbitrary sequence of bytes. * No alignment or length assumptions are made about the input key. * * Returns the hash value of the key. The result depends on endianness.
*/ static inline u32 jhash(const void *key, u32 length, u32 initval) { u32 a, b, c; const u8 *k = key; /* Set up the internal state */ a = b = c = JHASH_INITVAL + length + initval; /* All but the last block: affect some 32 bits of (a,b,c) */ while (length > 12) { a += get_unaligned((u32 *)k); b += get_unaligned((u32 *)(k + 4)); c += get_unaligned((u32 *)(k + 8)); __jhash_mix(a, b, c); length -= 12; k += 12; } /* Last block: affect all 32 bits of (c) */ switch (length) { case 12: c += (u32)k[11]<<24; fallthrough; case 11: c += (u32)k[10]<<16; fallthrough; case 10: c += (u32)k[9]<<8; fallthrough; case 9: c += k[8]; fallthrough; case 8: b += (u32)k[7]<<24; fallthrough; case 7: b += (u32)k[6]<<16; fallthrough; case 6: b += (u32)k[5]<<8; fallthrough; case 5: b += k[4]; fallthrough; case 4: a += (u32)k[3]<<24; fallthrough; case 3: a += (u32)k[2]<<16; fallthrough; case 2: a += (u32)k[1]<<8; fallthrough; case 1: a += k[0]; __jhash_final(a, b, c); break; case 0: /* Nothing left to add */ break; } return c; } /* jhash2 - hash an array of u32's * @k: the key which must be an array of u32's * @length: the number of u32's in the key * @initval: the previous hash, or an arbitrary value * * Returns the hash value of the key. */ static inline u32 jhash2(const u32 *k, u32 length, u32 initval) { u32 a, b, c; /* Set up the internal state */ a = b = c = JHASH_INITVAL + (length<<2) + initval; /* Handle most of the key */ while (length > 3) { a += k[0]; b += k[1]; c += k[2]; __jhash_mix(a, b, c); length -= 3; k += 3; } /* Handle the last 3 u32's */ switch (length) { case 3: c += k[2]; fallthrough; case 2: b += k[1]; fallthrough; case 1: a += k[0]; __jhash_final(a, b, c); break; case 0: /* Nothing left to add */ break; } return c; } /* __jhash_nwords - hash exactly 3, 2 or 1 word(s) */ static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval) { a += initval; b += initval; c += initval; __jhash_final(a, b, c); return c; } static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval) { return __jhash_nwords(a, b, c, initval + JHASH_INITVAL + (3 << 2)); } static inline u32 jhash_2words(u32 a, u32 b, u32 initval) { return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2)); } static inline u32 jhash_1word(u32 a, u32 initval) { return __jhash_nwords(a, 0, 0, initval + JHASH_INITVAL + (1 << 2)); } #endif /* _LINUX_JHASH_H */ |
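/*
 * Usage sketch (not part of the header above): hashing a small key into a
 * power-of-two bucket table with jhash() and jhash_mask(). The key layout and
 * table size are illustrative only.
 */
#include <linux/jhash.h>
#include <linux/types.h>

#define EXAMPLE_HASH_BITS	8	/* 256 buckets */

struct example_flow_key {
	__be32 saddr;
	__be32 daddr;
	__be16 sport;
	__be16 dport;
};

static u32 example_flow_hash(const struct example_flow_key *key, u32 seed)
{
	/* jhash_mask() expects the table size to be a power of two. */
	return jhash(key, sizeof(*key), seed) & jhash_mask(EXAMPLE_HASH_BITS);
}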
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/inode.c * * Copyright (C) 2001 * Brad Boyer (flar@allandria.com) * (C) 2003 Ardis Technologies <roman@ardistech.com> * * Inode handling routines */ #include <linux/blkdev.h> #include <linux/mm.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/mpage.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/uio.h> #include <linux/fileattr.h> #include "hfsplus_fs.h" #include "hfsplus_raw.h" #include
"xattr.h" static int hfsplus_read_folio(struct file *file, struct folio *folio) { return block_read_full_folio(folio, hfsplus_get_block); } static void hfsplus_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; if (to > inode->i_size) { truncate_pagecache(inode, inode->i_size); hfsplus_file_truncate(inode); } } int hfsplus_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { int ret; ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata, hfsplus_get_block, &HFSPLUS_I(mapping->host)->phys_size); if (unlikely(ret)) hfsplus_write_failed(mapping, pos + len); return ret; } static sector_t hfsplus_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping, block, hfsplus_get_block); } static bool hfsplus_release_folio(struct folio *folio, gfp_t mask) { struct inode *inode = folio->mapping->host; struct super_block *sb = inode->i_sb; struct hfs_btree *tree; struct hfs_bnode *node; u32 nidx; int i; bool res = true; switch (inode->i_ino) { case HFSPLUS_EXT_CNID: tree = HFSPLUS_SB(sb)->ext_tree; break; case HFSPLUS_CAT_CNID: tree = HFSPLUS_SB(sb)->cat_tree; break; case HFSPLUS_ATTR_CNID: tree = HFSPLUS_SB(sb)->attr_tree; break; default: BUG(); return false; } if (!tree) return false; if (tree->node_size >= PAGE_SIZE) { nidx = folio->index >> (tree->node_size_shift - PAGE_SHIFT); spin_lock(&tree->hash_lock); node = hfs_bnode_findhash(tree, nidx); if (!node) ; else if (atomic_read(&node->refcnt)) res = false; if (res && node) { hfs_bnode_unhash(node); hfs_bnode_free(node); } spin_unlock(&tree->hash_lock); } else { nidx = folio->index << (PAGE_SHIFT - tree->node_size_shift); i = 1 << (PAGE_SHIFT - tree->node_size_shift); spin_lock(&tree->hash_lock); do { node = hfs_bnode_findhash(tree, nidx++); if (!node) continue; if (atomic_read(&node->refcnt)) { res = false; break; } hfs_bnode_unhash(node); hfs_bnode_free(node); } while (--i && nidx < tree->node_count); spin_unlock(&tree->hash_lock); } return res ? try_to_free_buffers(folio) : false; } static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); ssize_t ret; ret = blockdev_direct_IO(iocb, inode, iter, hfsplus_get_block); /* * In case of error extending write may have instantiated a few * blocks outside i_size. Trim these off again. 
*/ if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { loff_t isize = i_size_read(inode); loff_t end = iocb->ki_pos + count; if (end > isize) hfsplus_write_failed(mapping, end); } return ret; } static int hfsplus_writepages(struct address_space *mapping, struct writeback_control *wbc) { return mpage_writepages(mapping, wbc, hfsplus_get_block); } const struct address_space_operations hfsplus_btree_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = hfsplus_read_folio, .writepages = hfsplus_writepages, .write_begin = hfsplus_write_begin, .write_end = generic_write_end, .migrate_folio = buffer_migrate_folio, .bmap = hfsplus_bmap, .release_folio = hfsplus_release_folio, }; const struct address_space_operations hfsplus_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = hfsplus_read_folio, .write_begin = hfsplus_write_begin, .write_end = generic_write_end, .bmap = hfsplus_bmap, .direct_IO = hfsplus_direct_IO, .writepages = hfsplus_writepages, .migrate_folio = buffer_migrate_folio, }; const struct dentry_operations hfsplus_dentry_operations = { .d_hash = hfsplus_hash_dentry, .d_compare = hfsplus_compare_dentry, }; static int hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); u16 mode; mode = be16_to_cpu(perms->mode); if (dir) { if (mode && !S_ISDIR(mode)) goto bad_type; } else if (mode) { switch (mode & S_IFMT) { case S_IFREG: case S_IFLNK: case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: break; default: goto bad_type; } } i_uid_write(inode, be32_to_cpu(perms->owner)); if ((test_bit(HFSPLUS_SB_UID, &sbi->flags)) || (!i_uid_read(inode) && !mode)) inode->i_uid = sbi->uid; i_gid_write(inode, be32_to_cpu(perms->group)); if ((test_bit(HFSPLUS_SB_GID, &sbi->flags)) || (!i_gid_read(inode) && !mode)) inode->i_gid = sbi->gid; if (dir) { mode = mode ? 
(mode & S_IALLUGO) : (S_IRWXUGO & ~(sbi->umask)); mode |= S_IFDIR; } else if (!mode) mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask)); inode->i_mode = mode; HFSPLUS_I(inode)->userflags = perms->userflags; if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE) inode->i_flags |= S_IMMUTABLE; else inode->i_flags &= ~S_IMMUTABLE; if (perms->rootflags & HFSPLUS_FLG_APPEND) inode->i_flags |= S_APPEND; else inode->i_flags &= ~S_APPEND; return 0; bad_type: pr_err("invalid file type 0%04o for inode %llu\n", mode, inode->i_ino); return -EIO; } static int hfsplus_file_open(struct inode *inode, struct file *file) { if (HFSPLUS_IS_RSRC(inode)) inode = HFSPLUS_I(inode)->rsrc_inode; if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) return -EOVERFLOW; atomic_inc(&HFSPLUS_I(inode)->opencnt); return 0; } static int hfsplus_file_release(struct inode *inode, struct file *file) { struct super_block *sb = inode->i_sb; if (HFSPLUS_IS_RSRC(inode)) inode = HFSPLUS_I(inode)->rsrc_inode; if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) { inode_lock(inode); hfsplus_file_truncate(inode); if (inode->i_flags & S_DEAD) { hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb)->hidden_dir, NULL); hfsplus_delete_inode(inode); } inode_unlock(inode); } return 0; } static int hfsplus_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { inode_dio_wait(inode); if (attr->ia_size > inode->i_size) { error = generic_cont_expand_simple(inode, attr->ia_size); if (error) return error; } truncate_setsize(inode, attr->ia_size); hfsplus_file_truncate(inode); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); } setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } int hfsplus_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct hfsplus_inode_info *hip = HFSPLUS_I(inode); if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; stat->btime = hfsp_mt2ut(hip->create_date); } if (inode->i_flags & S_APPEND) stat->attributes |= STATX_ATTR_APPEND; if (inode->i_flags & S_IMMUTABLE) stat->attributes |= STATX_ATTR_IMMUTABLE; if (hip->userflags & HFSPLUS_FLG_NODUMP) stat->attributes |= STATX_ATTR_NODUMP; stat->attributes_mask |= STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP; generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct hfsplus_inode_info *hip = HFSPLUS_I(inode); struct super_block *sb = inode->i_sb; struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); struct hfsplus_vh *vhdr = sbi->s_vhdr; int error = 0, error2; hfs_dbg("inode->i_ino %llu, start %llu, end %llu\n", inode->i_ino, start, end); error = file_write_and_wait_range(file, start, end); if (error) return error; inode_lock(inode); /* * Sync inode metadata into the catalog and extent trees. */ sync_inode_metadata(inode, 1); /* * And explicitly write out the btrees. 
*/ if (test_and_clear_bit(HFSPLUS_I_CAT_DIRTY, &HFSPLUS_I(HFSPLUS_CAT_TREE_I(sb))->flags)) { clear_bit(HFSPLUS_I_CAT_DIRTY, &hip->flags); error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping); } if (test_and_clear_bit(HFSPLUS_I_EXT_DIRTY, &HFSPLUS_I(HFSPLUS_EXT_TREE_I(sb))->flags)) { clear_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags); error2 = filemap_write_and_wait(sbi->ext_tree->inode->i_mapping); if (!error) error = error2; } if (sbi->attr_tree) { if (test_and_clear_bit(HFSPLUS_I_ATTR_DIRTY, &HFSPLUS_I(HFSPLUS_ATTR_TREE_I(sb))->flags)) { clear_bit(HFSPLUS_I_ATTR_DIRTY, &hip->flags); error2 = filemap_write_and_wait( sbi->attr_tree->inode->i_mapping); if (!error) error = error2; } } else { if (test_and_clear_bit(HFSPLUS_I_ATTR_DIRTY, &hip->flags)) pr_err("sync non-existent attributes tree\n"); } if (test_and_clear_bit(HFSPLUS_I_ALLOC_DIRTY, &HFSPLUS_I(sbi->alloc_file)->flags)) { clear_bit(HFSPLUS_I_ALLOC_DIRTY, &hip->flags); error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping); if (!error) error = error2; } mutex_lock(&sbi->vh_mutex); hfsplus_prepare_volume_header_for_commit(vhdr); mutex_unlock(&sbi->vh_mutex); error2 = hfsplus_commit_superblock(inode->i_sb); if (!error) error = error2; if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) blkdev_issue_flush(inode->i_sb->s_bdev); inode_unlock(inode); return error; } static const struct inode_operations hfsplus_file_inode_operations = { .setattr = hfsplus_setattr, .getattr = hfsplus_getattr, .listxattr = hfsplus_listxattr, .fileattr_get = hfsplus_fileattr_get, .fileattr_set = hfsplus_fileattr_set, }; static const struct inode_operations hfsplus_symlink_inode_operations = { .get_link = page_get_link, .setattr = hfsplus_setattr, .getattr = hfsplus_getattr, .listxattr = hfsplus_listxattr, }; static const struct inode_operations hfsplus_special_inode_operations = { .setattr = hfsplus_setattr, .getattr = hfsplus_getattr, .listxattr = hfsplus_listxattr, }; static const struct file_operations hfsplus_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .mmap_prepare = generic_file_mmap_prepare, .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .fsync = hfsplus_file_fsync, .open = hfsplus_file_open, .release = hfsplus_file_release, .unlocked_ioctl = hfsplus_ioctl, }; struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir, umode_t mode) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct inode *inode = new_inode(sb); struct hfsplus_inode_info *hip; if (!inode) return NULL; inode->i_ino = sbi->next_cnid++; inode_init_owner(&nop_mnt_idmap, inode, dir, mode); set_nlink(inode, 1); simple_inode_init_ts(inode); hip = HFSPLUS_I(inode); INIT_LIST_HEAD(&hip->open_dir_list); spin_lock_init(&hip->open_dir_lock); mutex_init(&hip->extents_lock); atomic_set(&hip->opencnt, 0); hip->extent_state = 0; hip->flags = 0; hip->userflags = 0; hip->subfolders = 0; memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec)); memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); hip->alloc_blocks = 0; hip->first_blocks = 0; hip->cached_start = 0; hip->cached_blocks = 0; hip->phys_size = 0; hip->fs_blocks = 0; hip->rsrc_inode = NULL; if (S_ISDIR(inode->i_mode)) { inode->i_size = 2; sbi->folder_count++; inode->i_op = &hfsplus_dir_inode_operations; inode->i_fop = &hfsplus_dir_operations; } else if (S_ISREG(inode->i_mode)) { sbi->file_count++; inode->i_op = &hfsplus_file_inode_operations; inode->i_fop = &hfsplus_file_operations; 
inode->i_mapping->a_ops = &hfsplus_aops; hip->clump_blocks = sbi->data_clump_blocks; } else if (S_ISLNK(inode->i_mode)) { sbi->file_count++; inode->i_op = &hfsplus_symlink_inode_operations; inode_nohighmem(inode); inode->i_mapping->a_ops = &hfsplus_aops; hip->clump_blocks = 1; } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { sbi->file_count++; inode->i_op = &hfsplus_special_inode_operations; } else sbi->file_count++; insert_inode_hash(inode); mark_inode_dirty(inode); hfsplus_mark_mdb_dirty(sb); return inode; } void hfsplus_delete_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; if (S_ISDIR(inode->i_mode)) { HFSPLUS_SB(sb)->folder_count--; hfsplus_mark_mdb_dirty(sb); return; } HFSPLUS_SB(sb)->file_count--; if (S_ISREG(inode->i_mode)) { if (!inode->i_nlink) { inode->i_size = 0; hfsplus_file_truncate(inode); } } else if (S_ISLNK(inode->i_mode)) { inode->i_size = 0; hfsplus_file_truncate(inode); } hfsplus_mark_mdb_dirty(sb); } void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork) { struct super_block *sb = inode->i_sb; struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct hfsplus_inode_info *hip = HFSPLUS_I(inode); u32 count; int i; memcpy(&hip->first_extents, &fork->extents, sizeof(hfsplus_extent_rec)); for (count = 0, i = 0; i < 8; i++) count += be32_to_cpu(fork->extents[i].block_count); hip->first_blocks = count; memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); hip->cached_start = 0; hip->cached_blocks = 0; hip->alloc_blocks = be32_to_cpu(fork->total_blocks); hip->phys_size = inode->i_size = be64_to_cpu(fork->total_size); hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits); hip->clump_blocks = be32_to_cpu(fork->clump_size) >> sbi->alloc_blksz_shift; if (!hip->clump_blocks) { hip->clump_blocks = HFSPLUS_IS_RSRC(inode) ? 
sbi->rsrc_clump_blocks : sbi->data_clump_blocks; } } void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork) { memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents, sizeof(hfsplus_extent_rec)); fork->total_size = cpu_to_be64(inode->i_size); fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode)->alloc_blocks); } int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) { hfsplus_cat_entry entry; int res = 0; u16 type; type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset); HFSPLUS_I(inode)->linkid = 0; if (type == HFSPLUS_FOLDER) { struct hfsplus_cat_folder *folder = &entry.folder; if (fd->entrylength < sizeof(struct hfsplus_cat_folder)) { pr_err("bad catalog folder entry\n"); res = -EIO; goto out; } hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, sizeof(struct hfsplus_cat_folder)); res = hfsplus_get_perms(inode, &folder->permissions, 1); if (res) goto out; set_nlink(inode, 1); inode->i_size = 2 + be32_to_cpu(folder->valence); inode_set_atime_to_ts(inode, hfsp_mt2ut(folder->access_date)); inode_set_mtime_to_ts(inode, hfsp_mt2ut(folder->content_mod_date)); inode_set_ctime_to_ts(inode, hfsp_mt2ut(folder->attribute_mod_date)); HFSPLUS_I(inode)->create_date = folder->create_date; HFSPLUS_I(inode)->fs_blocks = 0; if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) { HFSPLUS_I(inode)->subfolders = be32_to_cpu(folder->subfolders); } inode->i_op = &hfsplus_dir_inode_operations; inode->i_fop = &hfsplus_dir_operations; } else if (type == HFSPLUS_FILE) { struct hfsplus_cat_file *file = &entry.file; if (fd->entrylength < sizeof(struct hfsplus_cat_file)) { pr_err("bad catalog file entry\n"); res = -EIO; goto out; } hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, sizeof(struct hfsplus_cat_file)); hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ? &file->rsrc_fork : &file->data_fork); res = hfsplus_get_perms(inode, &file->permissions, 0); if (res) goto out; set_nlink(inode, 1); if (S_ISREG(inode->i_mode)) { if (file->permissions.dev) set_nlink(inode, be32_to_cpu(file->permissions.dev)); inode->i_op = &hfsplus_file_inode_operations; inode->i_fop = &hfsplus_file_operations; inode->i_mapping->a_ops = &hfsplus_aops; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &hfsplus_symlink_inode_operations; inode_nohighmem(inode); inode->i_mapping->a_ops = &hfsplus_aops; } else { inode->i_op = &hfsplus_special_inode_operations; init_special_inode(inode, inode->i_mode, be32_to_cpu(file->permissions.dev)); } inode_set_atime_to_ts(inode, hfsp_mt2ut(file->access_date)); inode_set_mtime_to_ts(inode, hfsp_mt2ut(file->content_mod_date)); inode_set_ctime_to_ts(inode, hfsp_mt2ut(file->attribute_mod_date)); HFSPLUS_I(inode)->create_date = file->create_date; } else { pr_err("bad catalog entry used to create inode\n"); res = -EIO; } out: return res; } int hfsplus_cat_write_inode(struct inode *inode) { struct inode *main_inode = inode; struct hfs_btree *tree = HFSPLUS_SB(inode->i_sb)->cat_tree; struct hfs_find_data fd; hfsplus_cat_entry entry; int res = 0; hfs_dbg("inode->i_ino %llu\n", inode->i_ino); if (HFSPLUS_IS_RSRC(inode)) main_inode = HFSPLUS_I(inode)->rsrc_inode; if (!main_inode->i_nlink) return 0; if (hfs_find_init(tree, &fd)) /* panic? */ return -EIO; if (hfsplus_find_cat(main_inode->i_sb, main_inode->i_ino, &fd)) /* panic? 
*/ goto out; if (S_ISDIR(main_inode->i_mode)) { struct hfsplus_cat_folder *folder = &entry.folder; if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) { pr_err("bad catalog folder entry\n"); res = -EIO; goto out; } hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_folder)); /* simple node checks? */ hfsplus_cat_set_perms(inode, &folder->permissions); folder->access_date = hfsp_ut2mt(inode_get_atime(inode)); folder->content_mod_date = hfsp_ut2mt(inode_get_mtime(inode)); folder->attribute_mod_date = hfsp_ut2mt(inode_get_ctime(inode)); folder->valence = cpu_to_be32(inode->i_size - 2); if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) { folder->subfolders = cpu_to_be32(HFSPLUS_I(inode)->subfolders); } hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_folder)); } else if (HFSPLUS_IS_RSRC(inode)) { struct hfsplus_cat_file *file = &entry.file; hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_file)); hfsplus_inode_write_fork(inode, &file->rsrc_fork); hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_file)); } else { struct hfsplus_cat_file *file = &entry.file; if (fd.entrylength < sizeof(struct hfsplus_cat_file)) { pr_err("bad catalog file entry\n"); res = -EIO; goto out; } hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_file)); hfsplus_inode_write_fork(inode, &file->data_fork); hfsplus_cat_set_perms(inode, &file->permissions); if (HFSPLUS_FLG_IMMUTABLE & (file->permissions.rootflags | file->permissions.userflags)) file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); else file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED); file->access_date = hfsp_ut2mt(inode_get_atime(inode)); file->content_mod_date = hfsp_ut2mt(inode_get_mtime(inode)); file->attribute_mod_date = hfsp_ut2mt(inode_get_ctime(inode)); hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_file)); } res = hfs_btree_write(tree); if (res) { pr_err("b-tree write err: %d, ino %llu\n", res, inode->i_ino); goto out; } set_bit(HFSPLUS_I_CAT_DIRTY, &HFSPLUS_I(HFSPLUS_CAT_TREE_I(inode->i_sb))->flags); set_bit(HFSPLUS_I_CAT_DIRTY, &HFSPLUS_I(inode)->flags); out: hfs_find_exit(&fd); return res; } int hfsplus_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct hfsplus_inode_info *hip = HFSPLUS_I(inode); unsigned int flags = 0; if (inode->i_flags & S_IMMUTABLE) flags |= FS_IMMUTABLE_FL; if (inode->i_flags & S_APPEND) flags |= FS_APPEND_FL; if (hip->userflags & HFSPLUS_FLG_NODUMP) flags |= FS_NODUMP_FL; fileattr_fill_flags(fa, flags); return 0; } int hfsplus_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct hfsplus_inode_info *hip = HFSPLUS_I(inode); unsigned int new_fl = 0; if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; /* don't silently ignore unsupported ext2 flags */ if (fa->flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) return -EOPNOTSUPP; if (fa->flags & FS_IMMUTABLE_FL) new_fl |= S_IMMUTABLE; if (fa->flags & FS_APPEND_FL) new_fl |= S_APPEND; inode_set_flags(inode, new_fl, S_IMMUTABLE | S_APPEND); if (fa->flags & FS_NODUMP_FL) hip->userflags |= HFSPLUS_FLG_NODUMP; else hip->userflags &= ~HFSPLUS_FLG_NODUMP; inode_set_ctime_current(inode); mark_inode_dirty(inode); return 0; } |
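hfsplus_fileattr_get() and hfsplus_fileattr_set() above back the generic per-file attribute interface, so the three flags they translate (immutable, append-only, nodump) are exactly what user space sees through the ext2-style flag ioctls. The following user-space sketch is illustrative only: the mount path is hypothetical, and flipping the immutable or append bits additionally requires CAP_LINUX_IMMUTABLE.

/*
 * Illustrative user-space sketch: read and update the attribute flags that
 * hfsplus_fileattr_get()/hfsplus_fileattr_set() service for an HFS+ file.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>	/* FS_IOC_GETFLAGS, FS_IMMUTABLE_FL, ... */

int main(void)
{
	int fd = open("/mnt/hfsplus/somefile", O_RDONLY);	/* hypothetical path */
	int flags;

	if (fd < 0)
		return 1;

	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == 0) {
		printf("immutable=%d append=%d nodump=%d\n",
		       !!(flags & FS_IMMUTABLE_FL),
		       !!(flags & FS_APPEND_FL),
		       !!(flags & FS_NODUMP_FL));

		/* hfsplus honours only these three bits; others get -EOPNOTSUPP. */
		flags |= FS_NODUMP_FL;
		if (ioctl(fd, FS_IOC_SETFLAGS, &flags) != 0)
			perror("FS_IOC_SETFLAGS");
	}

	close(fd);
	return 0;
}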
// SPDX-License-Identifier: GPL-2.0+ /* * Maple Tree implementation * Copyright (c) 2018-2022 Oracle Corporation * Authors: Liam R. Howlett <Liam.Howlett@oracle.com> * Matthew Wilcox <willy@infradead.org> * Copyright (c) 2023 ByteDance * Author: Peng Zhang <zhangpeng.00@bytedance.com> */ /* * DOC: Interesting implementation details of the Maple Tree * * Each node type has a number of slots for entries and a number of slots for * pivots. In the case of dense nodes, the pivots are implied by the position * and are simply the slot index + the minimum of the node. * * In regular B-Tree terms, pivots are called keys. The term pivot is used to * indicate that the tree is specifying ranges. Pivots may appear in the * subtree with an entry attached to the value whereas keys are unique to a * specific position of a B-tree. Pivot values are inclusive of the slot with * the same index. * * * The following illustrates the layout of a range64 node's slots and pivots. * * * Slots -> | 0 | 1 | 2 | ...
| 12 | 13 | 14 | 15 | * ┬ ┬ ┬ ┬ ┬ ┬ ┬ ┬ ┬ * │ │ │ │ │ │ │ │ └─ Implied maximum * │ │ │ │ │ │ │ └─ Pivot 14 * │ │ │ │ │ │ └─ Pivot 13 * │ │ │ │ │ └─ Pivot 12 * │ │ │ │ └─ Pivot 11 * │ │ │ └─ Pivot 2 * │ │ └─ Pivot 1 * │ └─ Pivot 0 * └─ Implied minimum * * Slot contents: * Internal (non-leaf) nodes contain pointers to other nodes. * Leaf nodes contain entries. * * The location of interest is often referred to as an offset. All offsets have * a slot, but the last offset has an implied pivot from the node above (or * UINT_MAX for the root node. * * Ranges complicate certain write activities. When modifying any of * the B-tree variants, it is known that one entry will either be added or * deleted. When modifying the Maple Tree, one store operation may overwrite * the entire data set, or one half of the tree, or the middle half of the tree. * */ #include <linux/maple_tree.h> #include <linux/xarray.h> #include <linux/types.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/limits.h> #include <asm/barrier.h> #define CREATE_TRACE_POINTS #include <trace/events/maple_tree.h> #define TP_FCT tracepoint_string(__func__) /* * Kernel pointer hashing renders much of the maple tree dump useless as tagged * pointers get hashed to arbitrary values. * * If CONFIG_DEBUG_VM_MAPLE_TREE is set we are in a debug mode where it is * permissible to bypass this. Otherwise remain cautious and retain the hashing. * * Userland doesn't know about %px so also use %p there. */ #if defined(__KERNEL__) && defined(CONFIG_DEBUG_VM_MAPLE_TREE) #define PTR_FMT "%px" #else #define PTR_FMT "%p" #endif #define MA_ROOT_PARENT 1 /* * Maple state flags * * MA_STATE_PREALLOC - Preallocated nodes, WARN_ON allocation */ #define MA_STATE_PREALLOC 1 #define ma_parent_ptr(x) ((struct maple_pnode *)(x)) #define mas_tree_parent(x) ((unsigned long)(x->tree) | MA_ROOT_PARENT) #define ma_mnode_ptr(x) ((struct maple_node *)(x)) #define ma_enode_ptr(x) ((struct maple_enode *)(x)) static struct kmem_cache *maple_node_cache; #ifdef CONFIG_DEBUG_MAPLE_TREE static const unsigned long mt_max[] = { [maple_dense] = MAPLE_NODE_SLOTS, [maple_leaf_64] = ULONG_MAX, [maple_range_64] = ULONG_MAX, [maple_arange_64] = ULONG_MAX, [maple_copy] = ULONG_MAX, }; #define mt_node_max(x) mt_max[mte_node_type(x)] #endif static const unsigned char mt_slots[] = { [maple_dense] = MAPLE_NODE_SLOTS, [maple_leaf_64] = MAPLE_RANGE64_SLOTS, [maple_range_64] = MAPLE_RANGE64_SLOTS, [maple_arange_64] = MAPLE_ARANGE64_SLOTS, [maple_copy] = 3, }; #define mt_slot_count(x) mt_slots[mte_node_type(x)] static const unsigned char mt_pivots[] = { [maple_dense] = 0, [maple_leaf_64] = MAPLE_RANGE64_SLOTS - 1, [maple_range_64] = MAPLE_RANGE64_SLOTS - 1, [maple_arange_64] = MAPLE_ARANGE64_SLOTS - 1, [maple_copy] = 3, }; #define mt_pivot_count(x) mt_pivots[mte_node_type(x)] static const unsigned char mt_min_slots[] = { [maple_dense] = MAPLE_NODE_SLOTS / 2, [maple_leaf_64] = (MAPLE_RANGE64_SLOTS / 2) - 2, [maple_range_64] = (MAPLE_RANGE64_SLOTS / 2) - 2, [maple_arange_64] = (MAPLE_ARANGE64_SLOTS / 2) - 1, [maple_copy] = 1, /* Should never be used */ }; #define mt_min_slot_count(x) mt_min_slots[mte_node_type(x)] /* Functions */ static inline struct maple_node *mt_alloc_one(gfp_t gfp) { return kmem_cache_alloc(maple_node_cache, gfp); } static inline void mt_free_bulk(size_t size, void __rcu **nodes) { kmem_cache_free_bulk(maple_node_cache, size, (void **)nodes); } static void mt_return_sheaf(struct slab_sheaf *sheaf) { kmem_cache_return_sheaf(maple_node_cache, GFP_NOWAIT, sheaf); } 
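/*
 * An illustrative example of the slot and pivot layout described in the DOC
 * comment at the top of this file (the values are chosen arbitrarily): a
 * maple_leaf_64 node spanning 0 to its implied maximum that holds a single
 * entry for the range 10-20 would look like:
 *
 *   slot[0] = NULL    pivot[0] = 9     covers  0 .. 9
 *   slot[1] = entry   pivot[1] = 20    covers 10 .. 20 (pivots are inclusive)
 *   slot[2] = NULL                     covers 21 .. implied maximum
 *
 * Each pivot is inclusive of the slot with the same index, and the minimum of
 * an offset is the previous pivot + 1 (or the node minimum for offset 0).
 */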
static struct slab_sheaf *mt_get_sheaf(gfp_t gfp, int count) { return kmem_cache_prefill_sheaf(maple_node_cache, gfp, count); } static int mt_refill_sheaf(gfp_t gfp, struct slab_sheaf **sheaf, unsigned int size) { return kmem_cache_refill_sheaf(maple_node_cache, gfp, sheaf, size); } /* * ma_free_rcu() - Use rcu callback to free a maple node * @node: The node to free * * The maple tree uses the parent pointer to indicate this node is no longer in * use and will be freed. */ static void ma_free_rcu(struct maple_node *node) { WARN_ON(node->parent != ma_parent_ptr(node)); kfree_rcu(node, rcu); } static void mt_set_height(struct maple_tree *mt, unsigned char height) { unsigned int new_flags = mt->ma_flags; new_flags &= ~MT_FLAGS_HEIGHT_MASK; MT_BUG_ON(mt, height > MAPLE_HEIGHT_MAX); new_flags |= height << MT_FLAGS_HEIGHT_OFFSET; mt->ma_flags = new_flags; } static unsigned int mas_mt_height(struct ma_state *mas) { return mt_height(mas->tree); } static inline unsigned int mt_attr(struct maple_tree *mt) { return mt->ma_flags & ~MT_FLAGS_HEIGHT_MASK; } static __always_inline enum maple_type mte_node_type( const struct maple_enode *entry) { return ((unsigned long)entry >> MAPLE_NODE_TYPE_SHIFT) & MAPLE_NODE_TYPE_MASK; } static __always_inline bool ma_is_dense(const enum maple_type type) { return type < maple_leaf_64; } static __always_inline bool ma_is_leaf(const enum maple_type type) { return type < maple_range_64; } static __always_inline bool mte_is_leaf(const struct maple_enode *entry) { return ma_is_leaf(mte_node_type(entry)); } /* * We also reserve values with the bottom two bits set to '10' which are * below 4096 */ static __always_inline bool mt_is_reserved(const void *entry) { return ((unsigned long)entry < MAPLE_RESERVED_RANGE) && xa_is_internal(entry); } static __always_inline void mas_set_err(struct ma_state *mas, long err) { mas->node = MA_ERROR(err); mas->status = ma_error; } static __always_inline bool mas_is_ptr(const struct ma_state *mas) { return mas->status == ma_root; } static __always_inline bool mas_is_start(const struct ma_state *mas) { return mas->status == ma_start; } static __always_inline bool mas_is_none(const struct ma_state *mas) { return mas->status == ma_none; } static __always_inline bool mas_is_paused(const struct ma_state *mas) { return mas->status == ma_pause; } static __always_inline bool mas_is_overflow(struct ma_state *mas) { return mas->status == ma_overflow; } static inline bool mas_is_underflow(struct ma_state *mas) { return mas->status == ma_underflow; } static __always_inline struct maple_node *mte_to_node( const struct maple_enode *entry) { return (struct maple_node *)((unsigned long)entry & ~MAPLE_NODE_MASK); } /* * mte_to_mat() - Convert a maple encoded node to a maple topiary node. * @entry: The maple encoded node * * Return: a maple topiary pointer */ static inline struct maple_topiary *mte_to_mat(const struct maple_enode *entry) { return (struct maple_topiary *) ((unsigned long)entry & ~MAPLE_NODE_MASK); } /* * mas_mn() - Get the maple state node. * @mas: The maple state * * Return: the maple node (not encoded - bare pointer). */ static inline struct maple_node *mas_mn(const struct ma_state *mas) { return mte_to_node(mas->node); } /* * mte_set_node_dead() - Set a maple encoded node as dead. * @mn: The maple encoded node. 
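 *
 * A dead node is marked by making its parent pointer refer back to the node
 * itself; ma_dead_node() tests for exactly this, and ma_free_rcu() warns if a
 * node is freed without the marker set.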
*/ static inline void mte_set_node_dead(struct maple_enode *mn) { mte_to_node(mn)->parent = ma_parent_ptr(mte_to_node(mn)); smp_wmb(); /* Needed for RCU */ } /* Bit 1 indicates the root is a node */ #define MAPLE_ROOT_NODE 0x02 /* maple_type stored bit 3-6 */ #define MAPLE_ENODE_TYPE_SHIFT 0x03 /* Bit 2 means a NULL somewhere below */ #define MAPLE_ENODE_NULL 0x04 static inline struct maple_enode *mt_mk_node(const struct maple_node *node, enum maple_type type) { return (void *)((unsigned long)node | (type << MAPLE_ENODE_TYPE_SHIFT) | MAPLE_ENODE_NULL); } static inline void ma_init_slot(void __rcu **slot, const struct maple_node *mn, const enum maple_type mt) { /* WARNING: this is unsafe if the slot is exposed to readers. */ RCU_INIT_POINTER(*slot, (void *)mt_mk_node(mn, mt)); } static inline void *mte_mk_root(const struct maple_enode *node) { return (void *)((unsigned long)node | MAPLE_ROOT_NODE); } static inline void *mte_safe_root(const struct maple_enode *node) { return (void *)((unsigned long)node & ~MAPLE_ROOT_NODE); } static inline void __maybe_unused *mte_set_full(const struct maple_enode *node) { return (void *)((unsigned long)node & ~MAPLE_ENODE_NULL); } static inline void __maybe_unused *mte_clear_full(const struct maple_enode *node) { return (void *)((unsigned long)node | MAPLE_ENODE_NULL); } static inline bool __maybe_unused mte_has_null(const struct maple_enode *node) { return (unsigned long)node & MAPLE_ENODE_NULL; } static __always_inline bool ma_is_root(struct maple_node *node) { return ((unsigned long)node->parent & MA_ROOT_PARENT); } static __always_inline bool mte_is_root(const struct maple_enode *node) { return ma_is_root(mte_to_node(node)); } static inline bool mas_is_root_limits(const struct ma_state *mas) { return !mas->min && mas->max == ULONG_MAX; } static __always_inline bool mt_is_alloc(struct maple_tree *mt) { return (mt->ma_flags & MT_FLAGS_ALLOC_RANGE); } /* * The Parent Pointer * Excluding root, the parent pointer is 256B aligned like all other tree nodes. * When storing a 32 or 64 bit values, the offset can fit into 5 bits. The 16 * bit values need an extra bit to store the offset. This extra bit comes from * a reuse of the last bit in the node type. This is possible by using bit 1 to * indicate if bit 2 is part of the type or the slot. * * Node types: * 0b??1 = Root * 0b?00 = 16 bit nodes * 0b010 = 32 bit nodes * 0b110 = 64 bit nodes * * Slot size and alignment * 0b??1 : Root * 0b?00 : 16 bit values, type in 0-1, slot in 2-7 * 0b010 : 32 bit values, type in 0-2, slot in 3-7 * 0b110 : 64 bit values, type in 0-2, slot in 3-7 */ #define MAPLE_PARENT_ROOT 0x01 #define MAPLE_PARENT_SLOT_SHIFT 0x03 #define MAPLE_PARENT_SLOT_MASK 0xF8 #define MAPLE_PARENT_16B_SLOT_SHIFT 0x02 #define MAPLE_PARENT_16B_SLOT_MASK 0xFC #define MAPLE_PARENT_RANGE64 0x06 #define MAPLE_PARENT_RANGE32 0x02 #define MAPLE_PARENT_NOT_RANGE16 0x02 /* * mte_parent_shift() - Get the parent shift for the slot storage. * @parent: The parent pointer cast as an unsigned long * Return: The shift into that pointer to the star to of the slot */ static inline unsigned long mte_parent_shift(unsigned long parent) { /* Note bit 1 == 0 means 16B */ if (likely(parent & MAPLE_PARENT_NOT_RANGE16)) return MAPLE_PARENT_SLOT_SHIFT; return MAPLE_PARENT_16B_SLOT_SHIFT; } /* * mte_parent_slot_mask() - Get the slot mask for the parent. * @parent: The parent pointer cast as an unsigned long. * Return: The slot mask for that parent. 
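 *
 * For example, a maple_range_64 parent keeps the slot in bits 3-7, so the
 * mask is MAPLE_PARENT_SLOT_MASK (0xF8); 16 bit nodes keep the slot in bits
 * 2-7 and use MAPLE_PARENT_16B_SLOT_MASK (0xFC).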
*/ static inline unsigned long mte_parent_slot_mask(unsigned long parent) { /* Note bit 1 == 0 means 16B */ if (likely(parent & MAPLE_PARENT_NOT_RANGE16)) return MAPLE_PARENT_SLOT_MASK; return MAPLE_PARENT_16B_SLOT_MASK; } /* * mas_parent_type() - Return the maple_type of the parent from the stored * parent type. * @mas: The maple state * @enode: The maple_enode to extract the parent's enum * Return: The node->parent maple_type */ static inline enum maple_type mas_parent_type(struct ma_state *mas, struct maple_enode *enode) { unsigned long p_type; p_type = (unsigned long)mte_to_node(enode)->parent; if (WARN_ON(p_type & MAPLE_PARENT_ROOT)) return 0; p_type &= MAPLE_NODE_MASK; p_type &= ~mte_parent_slot_mask(p_type); switch (p_type) { case MAPLE_PARENT_RANGE64: /* or MAPLE_PARENT_ARANGE64 */ if (mt_is_alloc(mas->tree)) return maple_arange_64; return maple_range_64; } return 0; } /* * mas_set_parent() - Set the parent node and encode the slot * @mas: The maple state * @enode: The encoded maple node. * @parent: The encoded maple node that is the parent of @enode. * @slot: The slot that @enode resides in @parent. * * Slot number is encoded in the enode->parent bit 3-6 or 2-6, depending on the * parent type. */ static inline void mas_set_parent(struct ma_state *mas, struct maple_enode *enode, const struct maple_enode *parent, unsigned char slot) { unsigned long val = (unsigned long)parent; unsigned long shift; unsigned long type; enum maple_type p_type = mte_node_type(parent); MAS_BUG_ON(mas, p_type == maple_dense); MAS_BUG_ON(mas, p_type == maple_leaf_64); switch (p_type) { case maple_range_64: case maple_arange_64: shift = MAPLE_PARENT_SLOT_SHIFT; type = MAPLE_PARENT_RANGE64; break; default: case maple_dense: case maple_leaf_64: shift = type = 0; break; } val &= ~MAPLE_NODE_MASK; /* Clear all node metadata in parent */ val |= (slot << shift) | type; mte_to_node(enode)->parent = ma_parent_ptr(val); } /* * mte_parent_slot() - get the parent slot of @enode. * @enode: The encoded maple node. * * Return: The slot in the parent node where @enode resides. */ static __always_inline unsigned int mte_parent_slot(const struct maple_enode *enode) { unsigned long val = (unsigned long)mte_to_node(enode)->parent; if (unlikely(val & MA_ROOT_PARENT)) return 0; /* * Okay to use MAPLE_PARENT_16B_SLOT_MASK as the last bit will be lost * by shift if the parent shift is MAPLE_PARENT_SLOT_SHIFT */ return (val & MAPLE_PARENT_16B_SLOT_MASK) >> mte_parent_shift(val); } /* * mte_parent() - Get the parent of @node. * @enode: The encoded maple node. * * Return: The parent maple node. */ static __always_inline struct maple_node *mte_parent(const struct maple_enode *enode) { return (void *)((unsigned long) (mte_to_node(enode)->parent) & ~MAPLE_NODE_MASK); } /* * ma_dead_node() - check if the @enode is dead. * @enode: The encoded maple node * * Return: true if dead, false otherwise. */ static __always_inline bool ma_dead_node(const struct maple_node *node) { struct maple_node *parent; /* Do not reorder reads from the node prior to the parent check */ smp_rmb(); parent = (void *)((unsigned long) node->parent & ~MAPLE_NODE_MASK); return (parent == node); } /* * mte_dead_node() - check if the @enode is dead. * @enode: The encoded maple node * * Return: true if dead, false otherwise. */ static __always_inline bool mte_dead_node(const struct maple_enode *enode) { struct maple_node *node; node = mte_to_node(enode); return ma_dead_node(node); } /* * ma_pivots() - Get a pointer to the maple node pivots. 
* @node: the maple node * @type: the node type * * In the event of a dead node, this array may be %NULL * * Return: A pointer to the maple node pivots */ static inline unsigned long *ma_pivots(struct maple_node *node, enum maple_type type) { switch (type) { case maple_arange_64: return node->ma64.pivot; case maple_range_64: case maple_leaf_64: return node->mr64.pivot; case maple_copy: return node->cp.pivot; case maple_dense: return NULL; } return NULL; } /* * ma_gaps() - Get a pointer to the maple node gaps. * @node: the maple node * @type: the node type * * Return: A pointer to the maple node gaps */ static inline unsigned long *ma_gaps(struct maple_node *node, enum maple_type type) { switch (type) { case maple_arange_64: return node->ma64.gap; case maple_copy: return node->cp.gap; case maple_range_64: case maple_leaf_64: case maple_dense: return NULL; } return NULL; } /* * mas_safe_pivot() - get the pivot at @piv or mas->max. * @mas: The maple state * @pivots: The pointer to the maple node pivots * @piv: The pivot to fetch * @type: The maple node type * * Return: The pivot at @piv within the limit of the @pivots array, @mas->max * otherwise. */ static __always_inline unsigned long mas_safe_pivot(const struct ma_state *mas, unsigned long *pivots, unsigned char piv, enum maple_type type) { if (piv >= mt_pivots[type]) return mas->max; return pivots[piv]; } /* * mas_safe_min() - Return the minimum for a given offset. * @mas: The maple state * @pivots: The pointer to the maple node pivots * @offset: The offset into the pivot array * * Return: The minimum range value that is contained in @offset. */ static inline unsigned long mas_safe_min(struct ma_state *mas, unsigned long *pivots, unsigned char offset) { if (likely(offset)) return pivots[offset - 1] + 1; return mas->min; } /* * mte_set_pivot() - Set a pivot to a value in an encoded maple node. * @mn: The encoded maple node * @piv: The pivot offset * @val: The value of the pivot */ static inline void mte_set_pivot(struct maple_enode *mn, unsigned char piv, unsigned long val) { struct maple_node *node = mte_to_node(mn); enum maple_type type = mte_node_type(mn); BUG_ON(piv >= mt_pivots[type]); switch (type) { case maple_range_64: case maple_leaf_64: node->mr64.pivot[piv] = val; break; case maple_arange_64: node->ma64.pivot[piv] = val; break; case maple_copy: case maple_dense: break; } } /* * ma_slots() - Get a pointer to the maple node slots. * @mn: The maple node * @mt: The maple node type * * Return: A pointer to the maple node slots */ static inline void __rcu **ma_slots(struct maple_node *mn, enum maple_type mt) { switch (mt) { case maple_arange_64: return mn->ma64.slot; case maple_range_64: case maple_leaf_64: return mn->mr64.slot; case maple_copy: return mn->cp.slot; case maple_dense: return mn->slot; } return NULL; } static inline bool mt_write_locked(const struct maple_tree *mt) { return mt_external_lock(mt) ? mt_write_lock_is_held(mt) : lockdep_is_held(&mt->ma_lock); } static __always_inline bool mt_locked(const struct maple_tree *mt) { return mt_external_lock(mt) ? 
mt_lock_is_held(mt) : lockdep_is_held(&mt->ma_lock); } static __always_inline void *mt_slot(const struct maple_tree *mt, void __rcu **slots, unsigned char offset) { return rcu_dereference_check(slots[offset], mt_locked(mt)); } static __always_inline void *mt_slot_locked(struct maple_tree *mt, void __rcu **slots, unsigned char offset) { return rcu_dereference_protected(slots[offset], mt_write_locked(mt)); } /* * mas_slot_locked() - Get the slot value when holding the maple tree lock. * @mas: The maple state * @slots: The pointer to the slots * @offset: The offset into the slots array to fetch * * Return: The entry stored in @slots at the @offset. */ static __always_inline void *mas_slot_locked(struct ma_state *mas, void __rcu **slots, unsigned char offset) { return mt_slot_locked(mas->tree, slots, offset); } /* * mas_slot() - Get the slot value when not holding the maple tree lock. * @mas: The maple state * @slots: The pointer to the slots * @offset: The offset into the slots array to fetch * * Return: The entry stored in @slots at the @offset */ static __always_inline void *mas_slot(struct ma_state *mas, void __rcu **slots, unsigned char offset) { return mt_slot(mas->tree, slots, offset); } /* * mas_root() - Get the maple tree root. * @mas: The maple state. * * Return: The pointer to the root of the tree */ static __always_inline void *mas_root(struct ma_state *mas) { return rcu_dereference_check(mas->tree->ma_root, mt_locked(mas->tree)); } static inline void *mt_root_locked(struct maple_tree *mt) { return rcu_dereference_protected(mt->ma_root, mt_write_locked(mt)); } /* * mas_root_locked() - Get the maple tree root when holding the maple tree lock. * @mas: The maple state. * * Return: The pointer to the root of the tree */ static inline void *mas_root_locked(struct ma_state *mas) { return mt_root_locked(mas->tree); } static inline struct maple_metadata *ma_meta(struct maple_node *mn, enum maple_type mt) { switch (mt) { case maple_arange_64: return &mn->ma64.meta; default: return &mn->mr64.meta; } } /* * ma_set_meta() - Set the metadata information of a node. * @mn: The maple node * @mt: The maple node type * @offset: The offset of the highest sub-gap in this node. * @end: The end of the data in this node. 
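 *
 * For example, ma_set_meta(mn, maple_arange_64, 2, 9) records that the
 * largest gap is found at offset 2 and that offset 9 holds the last entry.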
*/ static inline void ma_set_meta(struct maple_node *mn, enum maple_type mt, unsigned char offset, unsigned char end) { struct maple_metadata *meta = ma_meta(mn, mt); meta->gap = offset; meta->end = end; } /* * mt_clear_meta() - clear the metadata information of a node, if it exists * @mt: The maple tree * @mn: The maple node * @type: The maple node type */ static inline void mt_clear_meta(struct maple_tree *mt, struct maple_node *mn, enum maple_type type) { struct maple_metadata *meta; unsigned long *pivots; void __rcu **slots; void *next; switch (type) { case maple_range_64: pivots = mn->mr64.pivot; if (unlikely(pivots[MAPLE_RANGE64_SLOTS - 2])) { slots = mn->mr64.slot; next = mt_slot_locked(mt, slots, MAPLE_RANGE64_SLOTS - 1); if (unlikely((mte_to_node(next) && mte_node_type(next)))) return; /* no metadata, could be node */ } fallthrough; case maple_arange_64: meta = ma_meta(mn, type); break; default: return; } meta->gap = 0; meta->end = 0; } /* * ma_meta_end() - Get the data end of a node from the metadata * @mn: The maple node * @mt: The maple node type */ static inline unsigned char ma_meta_end(struct maple_node *mn, enum maple_type mt) { struct maple_metadata *meta = ma_meta(mn, mt); return meta->end; } /* * ma_meta_gap() - Get the largest gap location of a node from the metadata * @mn: The maple node */ static inline unsigned char ma_meta_gap(struct maple_node *mn) { return mn->ma64.meta.gap; } /* * ma_set_meta_gap() - Set the largest gap location in a nodes metadata * @mn: The maple node * @mt: The maple node type * @offset: The location of the largest gap. */ static inline void ma_set_meta_gap(struct maple_node *mn, enum maple_type mt, unsigned char offset) { struct maple_metadata *meta = ma_meta(mn, mt); meta->gap = offset; } /* * mat_add() - Add a @dead_enode to the ma_topiary of a list of dead nodes. * @mat: the ma_topiary, a linked list of dead nodes. * @dead_enode: the node to be marked as dead and added to the tail of the list * * Add the @dead_enode to the linked list in @mat. */ static inline void mat_add(struct ma_topiary *mat, struct maple_enode *dead_enode) { mte_set_node_dead(dead_enode); mte_to_mat(dead_enode)->next = NULL; if (!mat->tail) { mat->tail = mat->head = dead_enode; return; } mte_to_mat(mat->tail)->next = dead_enode; mat->tail = dead_enode; } static void mt_free_walk(struct rcu_head *head); static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt, bool free); /* * mas_mat_destroy() - Free all nodes and subtrees in a dead list. * @mas: the maple state * @mat: the ma_topiary linked list of dead nodes to free. * * Destroy walk a dead list. */ static void mas_mat_destroy(struct ma_state *mas, struct ma_topiary *mat) { struct maple_enode *next; struct maple_node *node; bool in_rcu = mt_in_rcu(mas->tree); while (mat->head) { next = mte_to_mat(mat->head)->next; node = mte_to_node(mat->head); mt_destroy_walk(mat->head, mas->tree, !in_rcu); if (in_rcu) call_rcu(&node->rcu, mt_free_walk); mat->head = next; } } /* * mas_descend() - Descend into the slot stored in the ma_state. * @mas: the maple state. * * Note: Not RCU safe, only use in write side or debug code. 
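 *
 * For example, descending through offset 2 of a node with pivots {10, 20, 30}
 * narrows the state to mas->min = 21 and mas->max = 30 before loading the
 * child from slot 2.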
*/ static inline void mas_descend(struct ma_state *mas) { enum maple_type type; unsigned long *pivots; struct maple_node *node; void __rcu **slots; node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); slots = ma_slots(node, type); if (mas->offset) mas->min = pivots[mas->offset - 1] + 1; mas->max = mas_safe_pivot(mas, pivots, mas->offset, type); mas->node = mas_slot(mas, slots, mas->offset); } /* * mas_ascend() - Walk up a level of the tree. * @mas: The maple state * * Sets the @mas->max and @mas->min for the parent node of mas->node. This * may cause several levels of walking up to find the correct min and max. * May find a dead node which will cause a premature return. * Return: 1 on dead node, 0 otherwise */ static int mas_ascend(struct ma_state *mas) { struct maple_enode *p_enode; /* parent enode. */ struct maple_enode *a_enode; /* ancestor enode. */ struct maple_node *a_node; /* ancestor node. */ struct maple_node *p_node; /* parent node. */ unsigned char a_slot; enum maple_type a_type; unsigned long min, max; unsigned long *pivots; bool set_max = false, set_min = false; a_node = mas_mn(mas); if (ma_is_root(a_node)) { mas->offset = 0; return 0; } p_node = mte_parent(mas->node); if (unlikely(a_node == p_node)) return 1; a_type = mas_parent_type(mas, mas->node); mas->offset = mte_parent_slot(mas->node); a_enode = mt_mk_node(p_node, a_type); /* Check to make sure all parent information is still accurate */ if (p_node != mte_parent(mas->node)) return 1; mas->node = a_enode; if (mte_is_root(a_enode)) { mas->max = ULONG_MAX; mas->min = 0; return 0; } min = 0; max = ULONG_MAX; /* * !mas->offset implies that parent node min == mas->min. * mas->offset > 0 implies that we need to walk up to find the * implied pivot min. */ if (!mas->offset) { min = mas->min; set_min = true; } if (mas->max == ULONG_MAX) set_max = true; do { p_enode = a_enode; a_type = mas_parent_type(mas, p_enode); a_node = mte_parent(p_enode); a_slot = mte_parent_slot(p_enode); a_enode = mt_mk_node(a_node, a_type); pivots = ma_pivots(a_node, a_type); if (unlikely(ma_dead_node(a_node))) return 1; if (!set_min && a_slot) { set_min = true; min = pivots[a_slot - 1] + 1; } if (!set_max && a_slot < mt_pivots[a_type]) { set_max = true; max = pivots[a_slot]; } if (unlikely(ma_dead_node(a_node))) return 1; if (unlikely(ma_is_root(a_node))) break; } while (!set_min || !set_max); mas->max = max; mas->min = min; return 0; } /* * mas_pop_node() - Get a previously allocated maple node from the maple state. * @mas: The maple state * * Return: A pointer to a maple node. 
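 *
 * The node comes from mas->alloc when a single node was preallocated,
 * otherwise it is taken from the prefilled sheaf; in either case the node is
 * returned zeroed.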
*/ static __always_inline struct maple_node *mas_pop_node(struct ma_state *mas) { struct maple_node *ret; if (mas->alloc) { ret = mas->alloc; mas->alloc = NULL; goto out; } if (WARN_ON_ONCE(!mas->sheaf)) return NULL; ret = kmem_cache_alloc_from_sheaf(maple_node_cache, GFP_NOWAIT, mas->sheaf); out: memset(ret, 0, sizeof(*ret)); return ret; } /* * mas_alloc_nodes() - Allocate nodes into a maple state * @mas: The maple state * @gfp: The GFP Flags */ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) { if (!mas->node_request) return; if (mas->node_request == 1) { if (mas->sheaf) goto use_sheaf; if (mas->alloc) return; mas->alloc = mt_alloc_one(gfp); if (!mas->alloc) goto error; mas->node_request = 0; return; } use_sheaf: if (unlikely(mas->alloc)) { kfree(mas->alloc); mas->alloc = NULL; } if (mas->sheaf) { unsigned long refill; refill = mas->node_request; if (kmem_cache_sheaf_size(mas->sheaf) >= refill) { mas->node_request = 0; return; } if (mt_refill_sheaf(gfp, &mas->sheaf, refill)) goto error; mas->node_request = 0; return; } mas->sheaf = mt_get_sheaf(gfp, mas->node_request); if (likely(mas->sheaf)) { mas->node_request = 0; return; } error: mas_set_err(mas, -ENOMEM); } static inline void mas_empty_nodes(struct ma_state *mas) { mas->node_request = 0; if (mas->sheaf) { mt_return_sheaf(mas->sheaf); mas->sheaf = NULL; } if (mas->alloc) { kfree(mas->alloc); mas->alloc = NULL; } } /* * mas_free() - Free an encoded maple node * @mas: The maple state * @used: The encoded maple node to free. * * Uses rcu free if necessary, pushes @used back on the maple state allocations * otherwise. */ static inline void mas_free(struct ma_state *mas, struct maple_enode *used) { ma_free_rcu(mte_to_node(used)); } /* * mas_start() - Sets up maple state for operations. * @mas: The maple state. * * If mas->status == ma_start, then set the min, max and depth to * defaults. * * Return: * - If mas->node is an error or not mas_start, return NULL. * - If it's an empty tree: NULL & mas->status == ma_none * - If it's a single entry: The entry & mas->status == ma_root * - If it's a tree: NULL & mas->status == ma_active */ static inline struct maple_enode *mas_start(struct ma_state *mas) { if (likely(mas_is_start(mas))) { struct maple_enode *root; mas->min = 0; mas->max = ULONG_MAX; retry: mas->depth = 0; root = mas_root(mas); /* Tree with nodes */ if (likely(xa_is_node(root))) { mas->depth = 0; mas->status = ma_active; mas->node = mte_safe_root(root); mas->offset = 0; if (mte_dead_node(mas->node)) goto retry; return NULL; } mas->node = NULL; /* empty tree */ if (unlikely(!root)) { mas->status = ma_none; mas->offset = MAPLE_NODE_SLOTS; return NULL; } /* Single entry tree */ mas->status = ma_root; mas->offset = MAPLE_NODE_SLOTS; /* Single entry tree. */ if (mas->index > 0) return NULL; return root; } return NULL; } /* * ma_data_end() - Find the end of the data in a node. * @node: The maple node * @type: The maple node type * @pivots: The array of pivots in the node * @max: The maximum value in the node * * Uses metadata to find the end of the data when possible. * Return: The zero indexed last slot with data (may be null). 
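 *
 * The last pivot is the tell: when it is zero the node is not full and the
 * metadata end is used; when it equals @max the data ends at that pivot's
 * slot; otherwise the node is full and the data ends at the final slot, which
 * is covered by the implied maximum.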
*/ static __always_inline unsigned char ma_data_end(struct maple_node *node, enum maple_type type, unsigned long *pivots, unsigned long max) { unsigned char offset; if (!pivots) return 0; if (type == maple_arange_64) return ma_meta_end(node, type); offset = mt_pivots[type] - 1; if (likely(!pivots[offset])) return ma_meta_end(node, type); if (likely(pivots[offset] == max)) return offset; return mt_pivots[type]; } /* * mas_data_end() - Find the end of the data (slot). * @mas: the maple state * * This method is optimized to check the metadata of a node if the node type * supports data end metadata. * * Return: The zero indexed last slot with data (may be null). */ static inline unsigned char mas_data_end(struct ma_state *mas) { enum maple_type type; struct maple_node *node; unsigned char offset; unsigned long *pivots; type = mte_node_type(mas->node); node = mas_mn(mas); if (type == maple_arange_64) return ma_meta_end(node, type); pivots = ma_pivots(node, type); if (unlikely(ma_dead_node(node))) return 0; offset = mt_pivots[type] - 1; if (likely(!pivots[offset])) return ma_meta_end(node, type); if (likely(pivots[offset] == mas->max)) return offset; return mt_pivots[type]; } static inline void wr_mas_setup(struct ma_wr_state *wr_mas, struct ma_state *mas) { wr_mas->node = mas_mn(mas); wr_mas->type = mte_node_type(mas->node); wr_mas->pivots = ma_pivots(wr_mas->node, wr_mas->type); wr_mas->slots = ma_slots(wr_mas->node, wr_mas->type); wr_mas->r_min = mas_safe_min(mas, wr_mas->pivots, mas->offset); wr_mas->r_max = mas_safe_pivot(mas, wr_mas->pivots, mas->offset, wr_mas->type); } static inline void wr_mas_ascend(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; mas_ascend(mas); wr_mas_setup(wr_mas, mas); mas->end = ma_data_end(wr_mas->node, wr_mas->type, wr_mas->pivots, mas->max); /* Careful, this may be wrong.. */ wr_mas->end_piv = wr_mas->r_max; wr_mas->offset_end = mas->offset; } static inline unsigned long ma_leaf_max_gap(struct maple_node *mn, enum maple_type mt, unsigned long min, unsigned long max, unsigned long *pivots, void __rcu **slots) { unsigned long pstart, gap, max_gap; unsigned char i; unsigned char max_piv; max_gap = 0; if (unlikely(ma_is_dense(mt))) { gap = 0; for (i = 0; i < mt_slots[mt]; i++) { if (slots[i]) { if (gap > max_gap) max_gap = gap; gap = 0; } else { gap++; } } if (gap > max_gap) max_gap = gap; return max_gap; } /* * Check the first implied pivot optimizes the loop below and slot 1 may * be skipped if there is a gap in slot 0. */ if (likely(!slots[0])) { max_gap = pivots[0] - min + 1; i = 2; } else { i = 1; } /* reduce max_piv as the special case is checked before the loop */ max_piv = ma_data_end(mn, mt, pivots, max) - 1; /* * Check end implied pivot which can only be a gap on the right most * node. */ if (unlikely(max == ULONG_MAX) && !slots[max_piv + 1]) { gap = ULONG_MAX - pivots[max_piv]; if (gap > max_gap) max_gap = gap; if (max_gap > pivots[max_piv] - min) return max_gap; } for (; i <= max_piv; i++) { /* data == no gap. */ if (likely(slots[i])) continue; pstart = pivots[i - 1]; gap = pivots[i] - pstart; if (gap > max_gap) max_gap = gap; /* There cannot be two gaps in a row. */ i++; } return max_gap; } /* * mas_leaf_max_gap() - Returns the largest gap in a leaf node * @mas: the maple state * * Return: The maximum gap in the leaf. 
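 *
 * This helper only gathers the node's slots and pivots and hands them to
 * ma_leaf_max_gap(), which scans the empty slots (including the implied ends
 * of the node) for the widest range that holds no entry.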
*/ static inline unsigned long mas_leaf_max_gap(struct ma_state *mas) { enum maple_type mt; struct maple_node *mn; unsigned long *pivots; void __rcu **slots; mn = mas_mn(mas); mt = mte_node_type(mas->node); slots = ma_slots(mn, mt); pivots = ma_pivots(mn, mt); return ma_leaf_max_gap(mn, mt, mas->min, mas->max, pivots, slots); } /* * ma_max_gap() - Get the maximum gap in a maple node (non-leaf) * @node: The maple node * @gaps: The pointer to the gaps * @mt: The maple node type * @off: Pointer to store the offset location of the gap. * * Uses the metadata data end to scan backwards across set gaps. * * Return: The maximum gap value */ static inline unsigned long ma_max_gap(struct maple_node *node, unsigned long *gaps, enum maple_type mt, unsigned char *off) { unsigned char offset, i; unsigned long max_gap = 0; i = offset = ma_meta_end(node, mt); do { if (gaps[i] > max_gap) { max_gap = gaps[i]; offset = i; } } while (i--); *off = offset; return max_gap; } /* * mas_max_gap() - find the largest gap in a non-leaf node and set the slot. * @mas: The maple state. * * Return: The gap value. */ static inline unsigned long mas_max_gap(struct ma_state *mas) { unsigned long *gaps; unsigned char offset; enum maple_type mt; struct maple_node *node; mt = mte_node_type(mas->node); if (ma_is_leaf(mt)) return mas_leaf_max_gap(mas); node = mas_mn(mas); MAS_BUG_ON(mas, mt != maple_arange_64); offset = ma_meta_gap(node); gaps = ma_gaps(node, mt); return gaps[offset]; } /* * mas_parent_gap() - Set the parent gap and any gaps above, as needed * @mas: The maple state * @offset: The gap offset in the parent to set * @new: The new gap value. * * Set the parent gap then continue to set the gap upwards, using the metadata * of the parent to see if it is necessary to check the node above. */ static inline void mas_parent_gap(struct ma_state *mas, unsigned char offset, unsigned long new) { unsigned long meta_gap = 0; struct maple_node *pnode; struct maple_enode *penode; unsigned long *pgaps; unsigned char meta_offset; enum maple_type pmt; pnode = mte_parent(mas->node); pmt = mas_parent_type(mas, mas->node); penode = mt_mk_node(pnode, pmt); pgaps = ma_gaps(pnode, pmt); ascend: MAS_BUG_ON(mas, pmt != maple_arange_64); meta_offset = ma_meta_gap(pnode); meta_gap = pgaps[meta_offset]; pgaps[offset] = new; if (meta_gap == new) return; if (offset != meta_offset) { if (meta_gap > new) return; ma_set_meta_gap(pnode, pmt, offset); } else if (new < meta_gap) { new = ma_max_gap(pnode, pgaps, pmt, &meta_offset); ma_set_meta_gap(pnode, pmt, meta_offset); } if (ma_is_root(pnode)) return; /* Go to the parent node. */ pnode = mte_parent(penode); pmt = mas_parent_type(mas, penode); pgaps = ma_gaps(pnode, pmt); offset = mte_parent_slot(penode); penode = mt_mk_node(pnode, pmt); goto ascend; } /* * mas_update_gap() - Update a nodes gaps and propagate up if necessary. * @mas: the maple state. */ static inline void mas_update_gap(struct ma_state *mas) { unsigned char pslot; unsigned long p_gap; unsigned long max_gap; if (!mt_is_alloc(mas->tree)) return; if (mte_is_root(mas->node)) return; max_gap = mas_max_gap(mas); pslot = mte_parent_slot(mas->node); p_gap = ma_gaps(mte_parent(mas->node), mas_parent_type(mas, mas->node))[pslot]; if (p_gap != max_gap) mas_parent_gap(mas, pslot, max_gap); } /* * mas_adopt_children() - Set the parent pointer of all nodes in @parent to * @parent with the slot encoded. * @mas: the maple state (for the tree) * @parent: the maple encoded node containing the children. 
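 *
 * Walks from the data end of @parent down to offset 0, rewriting each child's
 * parent pointer to @parent with the matching slot encoded.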
*/ static inline void mas_adopt_children(struct ma_state *mas, struct maple_enode *parent) { enum maple_type type = mte_node_type(parent); struct maple_node *node = mte_to_node(parent); void __rcu **slots = ma_slots(node, type); unsigned long *pivots = ma_pivots(node, type); struct maple_enode *child; unsigned char offset; offset = ma_data_end(node, type, pivots, mas->max); do { child = mas_slot_locked(mas, slots, offset); mas_set_parent(mas, child, parent, offset); } while (offset--); } /* * mas_put_in_tree() - Put a new node in the tree, smp_wmb(), and mark the old * node as dead. * @mas: the maple state with the new node * @old_enode: The old maple encoded node to replace. * @new_height: if we are inserting a root node, update the height of the tree */ static inline void mas_put_in_tree(struct ma_state *mas, struct maple_enode *old_enode, char new_height) __must_hold(mas->tree->ma_lock) { unsigned char offset; void __rcu **slots; if (mte_is_root(mas->node)) { mas_mn(mas)->parent = ma_parent_ptr(mas_tree_parent(mas)); rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); mt_set_height(mas->tree, new_height); } else { offset = mte_parent_slot(mas->node); slots = ma_slots(mte_parent(mas->node), mas_parent_type(mas, mas->node)); rcu_assign_pointer(slots[offset], mas->node); } mte_set_node_dead(old_enode); } /* * mas_replace_node() - Replace a node by putting it in the tree, marking it * dead, and freeing it. * the parent encoding to locate the maple node in the tree. * @mas: the ma_state with @mas->node pointing to the new node. * @old_enode: The old maple encoded node. * @new_height: The new height of the tree as a result of the operation */ static inline void mas_replace_node(struct ma_state *mas, struct maple_enode *old_enode, unsigned char new_height) __must_hold(mas->tree->ma_lock) { mas_put_in_tree(mas, old_enode, new_height); mas_free(mas, old_enode); } /* * mas_find_child() - Find a child who has the parent @mas->node. * @mas: the maple state with the parent. * @child: the maple state to store the child. */ static inline bool mas_find_child(struct ma_state *mas, struct ma_state *child) __must_hold(mas->tree->ma_lock) { enum maple_type mt; unsigned char offset; unsigned char end; unsigned long *pivots; struct maple_enode *entry; struct maple_node *node; void __rcu **slots; mt = mte_node_type(mas->node); node = mas_mn(mas); slots = ma_slots(node, mt); pivots = ma_pivots(node, mt); end = ma_data_end(node, mt, pivots, mas->max); for (offset = mas->offset; offset <= end; offset++) { entry = mas_slot_locked(mas, slots, offset); if (mte_parent(entry) == node) { *child = *mas; mas->offset = offset + 1; child->offset = offset; mas_descend(child); child->offset = 0; return true; } } return false; } /* * mas_leaf_set_meta() - Set the metadata of a leaf if possible. * @node: The maple node * @mt: The maple type * @end: The node end */ static inline void mas_leaf_set_meta(struct maple_node *node, enum maple_type mt, unsigned char end) { if (end < mt_slots[mt] - 1) ma_set_meta(node, mt, 0, end); } /* * mas_prev_sibling() - Find the previous node with the same parent. * @mas: the maple state * * Return: True if there is a previous sibling, false otherwise. */ static inline bool mas_prev_sibling(struct ma_state *mas) { unsigned int p_slot = mte_parent_slot(mas->node); /* For root node, p_slot is set to 0 by mte_parent_slot(). 
*/ if (!p_slot) return false; mas_ascend(mas); mas->offset = p_slot - 1; mas_descend(mas); return true; } /* * mas_next_sibling() - Find the next node with the same parent. * @mas: the maple state * * Return: true if there is a next sibling, false otherwise. */ static inline bool mas_next_sibling(struct ma_state *mas) { MA_STATE(parent, mas->tree, mas->index, mas->last); if (mte_is_root(mas->node)) return false; parent = *mas; mas_ascend(&parent); parent.offset = mte_parent_slot(mas->node) + 1; if (parent.offset > mas_data_end(&parent)) return false; *mas = parent; mas_descend(mas); return true; } /* * mas_wr_node_walk() - Find the correct offset for the index in the @mas. * If @mas->index cannot be found within the containing * node, we traverse to the last entry in the node. * @wr_mas: The maple write state * * Uses mas_slot_locked() and does not need to worry about dead nodes. */ static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char count, offset; if (unlikely(ma_is_dense(wr_mas->type))) { wr_mas->r_max = wr_mas->r_min = mas->index; mas->offset = mas->index = mas->min; return; } wr_mas->node = mas_mn(wr_mas->mas); wr_mas->pivots = ma_pivots(wr_mas->node, wr_mas->type); count = mas->end = ma_data_end(wr_mas->node, wr_mas->type, wr_mas->pivots, mas->max); offset = mas->offset; while (offset < count && mas->index > wr_mas->pivots[offset]) offset++; wr_mas->r_max = offset < count ? wr_mas->pivots[offset] : mas->max; wr_mas->r_min = mas_safe_min(mas, wr_mas->pivots, offset); wr_mas->offset_end = mas->offset = offset; } static inline void rebalance_sib(struct ma_state *parent, struct ma_state *sib) { *sib = *parent; /* Prioritize move right to pull data left */ if (sib->offset < sib->end) sib->offset++; else sib->offset--; mas_descend(sib); sib->end = mas_data_end(sib); } static inline void spanning_sib(struct ma_wr_state *l_wr_mas, struct ma_wr_state *r_wr_mas, struct ma_state *nneighbour) { struct ma_state l_tmp = *l_wr_mas->mas; struct ma_state r_tmp = *r_wr_mas->mas; unsigned char depth = 0; do { mas_ascend(&r_tmp); mas_ascend(&l_tmp); depth++; if (r_tmp.offset < mas_data_end(&r_tmp)) { r_tmp.offset++; mas_descend(&r_tmp); r_tmp.offset = 0; while (--depth) mas_descend(&r_tmp); r_tmp.end = mas_data_end(&r_tmp); *nneighbour = r_tmp; return; } else if (l_tmp.offset) { l_tmp.offset--; do { mas_descend(&l_tmp); l_tmp.offset = mas_data_end(&l_tmp); } while (--depth); l_tmp.end = l_tmp.offset; *nneighbour = l_tmp; return; } } while (!mte_is_root(r_tmp.node)); WARN_ON_ONCE(1); } /* * mas_topiary_node() - Dispose of a single node * @mas: The maple state for pushing nodes * @in_rcu: If the tree is in rcu mode * * The node will either be RCU freed or pushed back on the maple state. */ static inline void mas_topiary_node(struct ma_state *mas, struct ma_state *tmp_mas, bool in_rcu) { struct maple_node *tmp; struct maple_enode *enode; if (mas_is_none(tmp_mas)) return; enode = tmp_mas->node; tmp = mte_to_node(enode); mte_set_node_dead(enode); ma_free_rcu(tmp); } /* * mas_topiary_replace() - Replace the data with new data, then repair the * parent links within the new tree. Iterate over the dead sub-tree and collect * the dead subtrees and topiary the nodes that are no longer of use. * * The new tree will have up to three children with the correct parent. Keep * track of the new entries as they need to be followed to find the next level * of new entries. * * The old tree will have up to three children with the old parent. 
Keep track * of the old entries as they may have more nodes below replaced. Nodes within * [index, last] are dead subtrees, others need to be freed and followed. * * @mas: The maple state pointing at the new data * @old_enode: The maple encoded node being replaced * @new_height: The new height of the tree as a result of the operation * */ static inline void mas_topiary_replace(struct ma_state *mas, struct maple_enode *old_enode, unsigned char new_height) { struct ma_state tmp[3], tmp_next[3]; MA_TOPIARY(subtrees, mas->tree); bool in_rcu; int i, n; /* Place data in tree & then mark node as old */ mas_put_in_tree(mas, old_enode, new_height); /* Update the parent pointers in the tree */ tmp[0] = *mas; tmp[0].offset = 0; tmp[1].status = ma_none; tmp[2].status = ma_none; while (!mte_is_leaf(tmp[0].node)) { n = 0; for (i = 0; i < 3; i++) { if (mas_is_none(&tmp[i])) continue; while (n < 3) { if (!mas_find_child(&tmp[i], &tmp_next[n])) break; n++; } mas_adopt_children(&tmp[i], tmp[i].node); } if (MAS_WARN_ON(mas, n == 0)) break; while (n < 3) tmp_next[n++].status = ma_none; for (i = 0; i < 3; i++) tmp[i] = tmp_next[i]; } /* Collect the old nodes that need to be discarded */ if (mte_is_leaf(old_enode)) return mas_free(mas, old_enode); tmp[0] = *mas; tmp[0].offset = 0; tmp[0].node = old_enode; tmp[1].status = ma_none; tmp[2].status = ma_none; in_rcu = mt_in_rcu(mas->tree); do { n = 0; for (i = 0; i < 3; i++) { if (mas_is_none(&tmp[i])) continue; while (n < 3) { if (!mas_find_child(&tmp[i], &tmp_next[n])) break; if ((tmp_next[n].min >= tmp_next->index) && (tmp_next[n].max <= tmp_next->last)) { mat_add(&subtrees, tmp_next[n].node); tmp_next[n].status = ma_none; } else { n++; } } } if (MAS_WARN_ON(mas, n == 0)) break; while (n < 3) tmp_next[n++].status = ma_none; for (i = 0; i < 3; i++) { mas_topiary_node(mas, &tmp[i], in_rcu); tmp[i] = tmp_next[i]; } } while (!mte_is_leaf(tmp[0].node)); for (i = 0; i < 3; i++) mas_topiary_node(mas, &tmp[i], in_rcu); mas_mat_destroy(mas, &subtrees); } /* * node_copy() - Copy from one node to another. 
* * @mas: The maple state * @src: The source node * @start: The offset into the src to start copying * @size: The size to copy (non-zero) * @s_max: The source node max * @s_mt: The source maple node type * @dst: The destination * @d_start: The start location in the destination node * @d_mt: The destination maple node type */ static inline unsigned long node_copy(struct ma_state *mas, struct maple_node *src, unsigned char start, unsigned char size, unsigned long s_max, enum maple_type s_mt, struct maple_node *dst, unsigned char d_start, enum maple_type d_mt) { unsigned long *s_pivots, *d_pivots; void __rcu **s_slots, **d_slots; unsigned long *s_gaps, *d_gaps; unsigned long d_max; d_slots = ma_slots(dst, d_mt) + d_start; d_pivots = ma_pivots(dst, d_mt) + d_start; s_slots = ma_slots(src, s_mt) + start; s_pivots = ma_pivots(src, s_mt) + start; memcpy(d_slots, s_slots, size * sizeof(void __rcu *)); if (!ma_is_leaf(d_mt) && s_mt == maple_copy) { struct maple_enode *edst = mt_mk_node(dst, d_mt); for (int i = 0; i < size; i++) mas_set_parent(mas, mt_slot_locked(mas->tree, d_slots, i), edst, d_start + i); } d_gaps = ma_gaps(dst, d_mt); if (d_gaps) { s_gaps = ma_gaps(src, s_mt) + start; d_gaps += d_start; memcpy(d_gaps, s_gaps, size * sizeof(unsigned long)); } if (start + size - 1 < mt_pivots[s_mt]) d_max = s_pivots[size - 1]; else d_max = s_max; if (d_start + size <= mt_pivots[d_mt]) d_pivots[size - 1] = d_max; size--; if (size) memcpy(d_pivots, s_pivots, size * sizeof(unsigned long)); return d_max; } /* * node_finalise() - Zero out unused area and populate metadata * @node: The maple node * @mt: The maple node type * @end: The end of the used area */ static inline void node_finalise(struct maple_node *node, enum maple_type mt, unsigned char end) { unsigned char max_end = mt_slots[mt]; unsigned char size; unsigned long *gaps; unsigned char gap_slot; gaps = ma_gaps(node, mt); if (end < max_end - 1) { size = max_end - end; memset(ma_slots(node, mt) + end, 0, size * sizeof(void *)); if (gaps) memset(gaps + end, 0, size * sizeof(unsigned long)); if (--size) memset(ma_pivots(node, mt) + end, 0, size * sizeof(unsigned long)); } gap_slot = 0; if (gaps && !ma_is_leaf(mt)) { unsigned long max_gap; max_gap = 0; for (int i = 0; i <= end; i++) if (gaps[i] > max_gap) { gap_slot = i; max_gap = gaps[i]; } } if (mt == maple_arange_64) ma_set_meta(node, mt, gap_slot, end - 1); else if (end <= max_end - 1) ma_set_meta(node, mt, gap_slot, end - 1); } static inline void *mtree_range_walk(struct ma_state *mas) { unsigned long *pivots; unsigned char offset; struct maple_node *node; struct maple_enode *next, *last; enum maple_type type; void __rcu **slots; unsigned char end; unsigned long max, min; unsigned long prev_max, prev_min; next = mas->node; min = mas->min; max = mas->max; do { last = next; node = mte_to_node(next); type = mte_node_type(next); pivots = ma_pivots(node, type); end = ma_data_end(node, type, pivots, max); prev_min = min; prev_max = max; if (pivots[0] >= mas->index) { offset = 0; max = pivots[0]; goto next; } offset = 1; while (offset < end) { if (pivots[offset] >= mas->index) { max = pivots[offset]; break; } offset++; } min = pivots[offset - 1] + 1; next: slots = ma_slots(node, type); next = mt_slot(mas->tree, slots, offset); if (unlikely(ma_dead_node(node))) goto dead_node; } while (!ma_is_leaf(type)); mas->end = end; mas->offset = offset; mas->index = min; mas->last = max; mas->min = prev_min; mas->max = prev_max; mas->node = last; return (void *)next; dead_node: mas_reset(mas); return NULL; } /* * 
mas_wmb_replace() - Write memory barrier and replace * @mas: The maple state * @cp: The maple copy node * * Updates gap as necessary. */ static inline void mas_wmb_replace(struct ma_state *mas, struct maple_copy *cp) { struct maple_enode *old_enode; old_enode = mas->node; mas->node = mt_slot_locked(mas->tree, cp->slot, 0); /* Insert the new data in the tree */ mas_topiary_replace(mas, old_enode, cp->height); if (!mte_is_leaf(mas->node)) mas_update_gap(mas); mtree_range_walk(mas); } /* * cp_leaf_init() - Initialize a maple_copy node for the leaf level of a * spanning store * @cp: The maple copy node * @mas: The maple state * @l_wr_mas: The left write state of the spanning store * @r_wr_mas: The right write state of the spanning store */ static inline void cp_leaf_init(struct maple_copy *cp, struct ma_state *mas, struct ma_wr_state *l_wr_mas, struct ma_wr_state *r_wr_mas) { unsigned char end = 0; /* * WARNING: The use of RCU_INIT_POINTER() makes it extremely important * to not expose the maple_copy node to any readers. Exposure may * result in buggy code when a compiler reorders the instructions. */ cp->height = 1; /* Create entries to insert including split entries to left and right */ if (l_wr_mas->r_min < mas->index) { end++; RCU_INIT_POINTER(cp->slot[0], l_wr_mas->content); cp->pivot[0] = mas->index - 1; } RCU_INIT_POINTER(cp->slot[end], l_wr_mas->entry); cp->pivot[end] = mas->last; if (r_wr_mas->end_piv > mas->last) { end++; RCU_INIT_POINTER(cp->slot[end], r_wr_mas->slots[r_wr_mas->offset_end]); cp->pivot[end] = r_wr_mas->end_piv; } cp->min = l_wr_mas->r_min; cp->max = cp->pivot[end]; cp->end = end; } /* * cp_data_calc() - Calculate the size of the data (1 indexed). * @cp: The maple copy struct with the new data populated. * @l_wr_mas: The maple write state containing the data to the left of the write * @r_wr_mas: The maple write state containing the data to the right of the * write * * cp->data is a size (not indexed by 0). */ static inline void cp_data_calc(struct maple_copy *cp, struct ma_wr_state *l_wr_mas, struct ma_wr_state *r_wr_mas) { /* Add 1 every time for the 0th element */ cp->data = l_wr_mas->mas->offset; /* Add the new data and any partial overwrites */ cp->data += cp->end + 1; /* Data from right (offset + 1 to end), +1 for zero */ cp->data += r_wr_mas->mas->end - r_wr_mas->offset_end; } static bool data_fits(struct ma_state *sib, struct ma_state *mas, struct maple_copy *cp) { unsigned char new_data; enum maple_type type; unsigned char space; unsigned char end; type = mte_node_type(mas->node); space = 2 * mt_slots[type]; end = sib->end; new_data = end + 1 + cp->data; if (new_data > space) return false; /* * This is off by one by design. The extra space is left to reduce * jitter in operations that add then remove two entries. * * end is an index while new space and data are both sizes. Adding one * to end to convert the index to a size means that the below * calculation should be <=, but we want to keep an extra space in nodes * to reduce jitter. * * Note that it is still possible to get a full node on the left by the * NULL landing exactly on the split. The NULL ending of a node happens * in the dst_setup() function, where we will either increase the split * by one or decrease it by one, if possible. In the case of split * (this case), it is always possible to shift the spilt by one - again * because there is at least one slot free by the below checking. 
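 *
 * As an example, assuming MAPLE_RANGE64_SLOTS is 16, the combined space is
 * 32; a combined size of exactly 32 is rejected so that at least one slot is
 * always left free after the operation.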
*/ if (new_data < space) return true; return false; } static inline void push_data_sib(struct maple_copy *cp, struct ma_state *mas, struct ma_state *sib, struct ma_state *parent) { if (mte_is_root(mas->node)) goto no_push; *sib = *parent; if (sib->offset) { sib->offset--; mas_descend(sib); sib->end = mas_data_end(sib); if (data_fits(sib, mas, cp)) /* Push left */ return; *sib = *parent; } if (sib->offset >= sib->end) goto no_push; sib->offset++; mas_descend(sib); sib->end = mas_data_end(sib); if (data_fits(sib, mas, cp)) /* Push right*/ return; no_push: sib->end = 0; } /* * rebalance_data() - Calculate the @cp data, populate @sib if insufficient or * if the data can be pushed into a sibling. * @cp: The maple copy node * @wr_mas: The left write maple state * @sib: The maple state of the sibling. * * Note: @cp->data is a size and not indexed by 0. @sib->end may be set to 0 to * indicate it will not be used. * */ static inline void rebalance_data(struct maple_copy *cp, struct ma_wr_state *wr_mas, struct ma_state *sib, struct ma_state *parent) { cp_data_calc(cp, wr_mas, wr_mas); sib->end = 0; if (cp->data > mt_slots[wr_mas->type]) { push_data_sib(cp, wr_mas->mas, sib, parent); if (sib->end) goto use_sib; } else if (cp->data <= mt_min_slots[wr_mas->type]) { if ((wr_mas->mas->min != 0) || (wr_mas->mas->max != ULONG_MAX)) { rebalance_sib(parent, sib); goto use_sib; } } return; use_sib: cp->data += sib->end + 1; } /* * spanning_data() - Calculate the @cp data and populate @sib if insufficient * @cp: The maple copy node * @l_wr_mas: The left write maple state * @r_wr_mas: The right write maple state * @sib: The maple state of the sibling. * * Note: @cp->data is a size and not indexed by 0. @sib->end may be set to 0 to * indicate it will not be used. */ static inline void spanning_data(struct maple_copy *cp, struct ma_wr_state *l_wr_mas, struct ma_wr_state *r_wr_mas, struct ma_state *sib) { cp_data_calc(cp, l_wr_mas, r_wr_mas); if (((l_wr_mas->mas->min != 0) || (r_wr_mas->mas->max != ULONG_MAX)) && (cp->data <= mt_min_slots[l_wr_mas->type])) { spanning_sib(l_wr_mas, r_wr_mas, sib); cp->data += sib->end + 1; } else { sib->end = 0; } } /* * dst_setup() - Set up one or more destinations for the new data. * @cp: The maple copy node * @mas: The maple state * @mt: The source node type */ static inline void dst_setup(struct maple_copy *cp, struct ma_state *mas, enum maple_type mt) { /* Data is 1 indexed, every src has +1 added. */ if (cp->data <= mt_slots[mt]) { cp->split = cp->data - 1; cp->d_count = 1; goto node_setup; } cp->split = (cp->data - 1) / 2; cp->d_count = 2; if (cp->data < mt_slots[mt] * 2) goto node_setup; if (cp->data == mt_slots[mt] * 2) { unsigned char off; unsigned char s; if (!ma_is_leaf(mt)) goto node_setup; /* * Leaf nodes are a bit tricky because we cannot assume the data * can fit due to the NULL limitation on node ends. 
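 *
 * The walk below locates the source slot that would become the last slot of
 * the left destination; if that slot holds a NULL, the split is moved up or
 * down by one (when the result still fits) so the node does not end in NULL.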
*/ off = cp->split; for (s = 0; s < cp->s_count; s++) { unsigned char s_off; s_off = cp->src[s].end - cp->src[s].start; if (s_off >= off) break; s_off++; off -= s_off; } off += cp->src[s].start; if (ma_slots(cp->src[s].node, cp->src[s].mt)[off]) goto node_setup; cp->split++; if (cp->split < mt_slots[mt]) goto node_setup; cp->split -= 2; if (cp->data - 2 - cp->split < mt_slots[mt]) goto node_setup; } /* No other choice but to 3-way split the data */ cp->split = (cp->data + 2) / 3; cp->d_count = 3; node_setup: for (int i = 0; i < cp->d_count; i++) { cp->dst[i].mt = mt; cp->dst[i].node = ma_mnode_ptr(mas_pop_node(mas)); } } static inline void append_mas_cp(struct maple_copy *cp, struct ma_state *mas, unsigned char start, unsigned char end) { struct maple_node *node; enum maple_type mt; unsigned char count; count = cp->s_count; node = mas_mn(mas); mt = mte_node_type(mas->node); cp->src[count].node = node; cp->src[count].mt = mt; if (mas->end <= end) cp->src[count].max = mas->max; else cp->src[count].max = ma_pivots(node, mt)[end]; cp->src[count].start = start; cp->src[count].end = end; cp->s_count++; } static inline void append_wr_mas_cp(struct maple_copy *cp, struct ma_wr_state *wr_mas, unsigned char start, unsigned char end) { unsigned char count; count = cp->s_count; cp->src[count].node = wr_mas->node; cp->src[count].mt = wr_mas->type; if (wr_mas->mas->end <= end) cp->src[count].max = wr_mas->mas->max; else cp->src[count].max = wr_mas->pivots[end]; cp->src[count].start = start; cp->src[count].end = end; cp->s_count++; } static inline void init_cp_src(struct maple_copy *cp) { cp->src[cp->s_count].node = ma_mnode_ptr(cp); cp->src[cp->s_count].mt = maple_copy; cp->src[cp->s_count].max = cp->max; cp->src[cp->s_count].start = 0; cp->src[cp->s_count].end = cp->end; cp->s_count++; } /* * multi_src_setup() - Set the @cp node up with multiple sources to copy from. * @cp: The maple copy node * @l_wr_mas: The left write maple state * @r_wr_mas: The right write maple state * @sib: The sibling maple state * * Note: @sib->end == 0 indicates no sibling will be used. 
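 *
 * Sources are appended in tree order: an optional left sibling, the retained
 * left portion, the new entries held in @cp itself, the retained right
 * portion, and an optional right sibling.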
*/ static inline void multi_src_setup(struct maple_copy *cp, struct ma_wr_state *l_wr_mas, struct ma_wr_state *r_wr_mas, struct ma_state *sib) { cp->s_count = 0; if (sib->end && sib->max < l_wr_mas->mas->min) append_mas_cp(cp, sib, 0, sib->end); /* Copy left 0 - offset */ if (l_wr_mas->mas->offset) { unsigned char off = l_wr_mas->mas->offset - 1; append_wr_mas_cp(cp, l_wr_mas, 0, off); cp->src[cp->s_count - 1].max = cp->min - 1; } init_cp_src(cp); /* Copy right either from offset or offset + 1 pending on r_max */ if (r_wr_mas->mas->end != r_wr_mas->offset_end) append_wr_mas_cp(cp, r_wr_mas, r_wr_mas->offset_end + 1, r_wr_mas->mas->end); if (sib->end && sib->min > r_wr_mas->mas->max) append_mas_cp(cp, sib, 0, sib->end); } static inline void cp_data_write(struct maple_copy *cp, struct ma_state *mas) { struct maple_node *dst, *src; unsigned char s, d; unsigned char dst_offset; unsigned char data_offset; unsigned char src_end, s_offset; unsigned char split; unsigned long s_max, d_max; unsigned char dst_size; enum maple_type s_mt, d_mt; data_offset = 0; s = d = 0; /* Readability help */ src = cp->src[s].node; dst = cp->dst[d].node; s_offset = cp->src[s].start; src_end = cp->src[s].end; split = cp->split; s_max = cp->src[s].max; s_mt = cp->src[s].mt; d_mt = cp->dst[d].mt; do { dst_offset = 0; d_max = 0; dst = cp->dst[d].node; d_mt = cp->dst[d].mt; dst_size = split + 1; while (dst_size) { unsigned char size; if (src_end - s_offset + 1 < dst_size) size = src_end - s_offset + 1; else size = dst_size; d_max = node_copy(mas, src, s_offset, size, s_max, s_mt, dst, dst_offset, d_mt); dst_offset += size; s_offset += size; if (s_offset > src_end) { /* This source is exhausted */ s++; if (s >= cp->s_count) { cp->dst[d].max = d_max; node_finalise(dst, d_mt, dst_offset); return; } /* Reset local src */ src = cp->src[s].node; s_offset = cp->src[s].start; src_end = cp->src[s].end; s_max = cp->src[s].max; s_mt = cp->src[s].mt; } dst_size -= size; data_offset += size; } split = cp->split; cp->dst[d].max = d_max; /* Handle null entries */ if (cp->dst[d].max != ULONG_MAX && !ma_slots(dst, d_mt)[dst_offset - 1]) { if (s_offset == cp->src[s].start) { s--; src = cp->src[s].node; src_end = cp->src[s].end; s_max = cp->src[s].max; s_mt = cp->src[s].mt; s_offset = src_end; } else { s_offset--; } /* Set dst max and clear pivot */ split++; data_offset--; dst_offset--; cp->dst[d].max = ma_pivots(dst, d_mt)[dst_offset - 1]; } node_finalise(dst, d_mt, dst_offset); ++d; /* Next destination */ if (d == cp->d_count - 1) split = cp->data - data_offset; if (d >= cp->d_count) { WARN_ON(data_offset < cp->data); return; } } while (data_offset <= cp->data); } /* * cp_dst_to_slots() - Migrate the maple copy destination to the maple copy * slots * @cp: The maple copy node * @min: The minimal value represented * @max: The maximum value represented * @mas: The maple state */ static inline void cp_dst_to_slots(struct maple_copy *cp, unsigned long min, unsigned long max, struct ma_state *mas) { unsigned char d; unsigned long slot_min = min; for (d = 0; d < cp->d_count; d++) { struct maple_node *mn = cp->dst[d].node; enum maple_type mt = cp->dst[d].mt; unsigned long slot_max = cp->dst[d].max; /* * Warning, see cp_leaf_init() comment and rcu_assign_pointer() * documentation. Since these are new nodes, there are no * read-side operations that can view them until they are * inserted into the tree after an rcu_assign_pointer() call. 
*/ ma_init_slot(&cp->slot[d], mn, mt); cp->pivot[d] = slot_max; if (mt_is_alloc(mas->tree)) { if (ma_is_leaf(mt)) { cp->gap[d] = ma_leaf_max_gap(mn, mt, slot_min, slot_max, ma_pivots(mn, mt), ma_slots(mn, mt)); } else { unsigned long *gaps = ma_gaps(mn, mt); if (gaps) { unsigned char gap_slot; gap_slot = ma_meta_gap(mn); cp->gap[d] = gaps[gap_slot]; } } } slot_min = slot_max + 1; } cp->end = cp->d_count - 1; cp->min = min; cp->max = max; } static inline bool cp_is_new_root(struct maple_copy *cp, struct ma_state *mas) { if (cp->min || cp->max != ULONG_MAX) return false; if (cp->d_count != 1) { enum maple_type mt = maple_arange_64; if (!mt_is_alloc(mas->tree)) mt = maple_range_64; cp->data = cp->d_count; cp->s_count = 0; dst_setup(cp, mas, mt); init_cp_src(cp); node_copy(mas, cp->src[0].node, 0, cp->data, cp->max, maple_copy, cp->dst[0].node, 0, mt); node_finalise(cp->dst[0].node, mt, cp->end + 1); /* * Warning, see cp_leaf_init() comment and rcu_assign_pointer() * documentation. Since this is a new root, there are no * read-side operations that can view it until it is insert into * the tree after an rcu_assign_pointer() call. */ ma_init_slot(&cp->slot[0], cp->dst[0].node, mt); cp->height++; } WARN_ON_ONCE(cp->dst[0].node != mte_to_node( mt_slot_locked(mas->tree, cp->slot, 0))); cp->dst[0].node->parent = ma_parent_ptr(mas_tree_parent(mas)); mas->min = 0; mas->max = ULONG_MAX; mas->depth = 0; mas->node = mas_root_locked(mas); return true; } static inline bool cp_converged(struct maple_copy *cp, struct ma_state *mas, struct ma_state *sib) { if (cp->d_count != 1 || sib->end) return false; cp->dst[0].node->parent = ma_parent_ptr(mas_mn(mas)->parent); return true; } /* * spanning_ascend() - See if a spanning store operation has to keep walking up * the tree * @cp: The maple_copy node * @l_wr_mas: The left maple write state * @r_wr_mas: The right maple write state * @sib: the maple state of the sibling * * Returns: True if another iteration is necessary. */ static bool spanning_ascend(struct maple_copy *cp, struct ma_state *mas, struct ma_wr_state *l_wr_mas, struct ma_wr_state *r_wr_mas, struct ma_state *sib) { if (sib->end) { if (sib->max < l_wr_mas->mas->min) *l_wr_mas->mas = *sib; else *r_wr_mas->mas = *sib; } cp_dst_to_slots(cp, l_wr_mas->mas->min, r_wr_mas->mas->max, mas); if (cp_is_new_root(cp, mas)) return false; /* Converged and has a single destination */ if ((cp->d_count == 1) && (l_wr_mas->mas->node == r_wr_mas->mas->node)) { cp->dst[0].node->parent = ma_parent_ptr(mas_mn(mas)->parent); return false; } cp->height++; wr_mas_ascend(l_wr_mas); wr_mas_ascend(r_wr_mas); return true; } static inline void copy_tree_location(const struct ma_state *src, struct ma_state *dst) { dst->node = src->node; dst->offset = src->offset; dst->min = src->min; dst->max = src->max; dst->end = src->end; dst->depth = src->depth; } /* * rebalance_ascend() - Ascend the tree and set up for the next loop - if * necessary * * Return: True if there another rebalancing operation on the next level is * needed, false otherwise. 
*/ static inline bool rebalance_ascend(struct maple_copy *cp, struct ma_wr_state *wr_mas, struct ma_state *sib, struct ma_state *parent) { struct ma_state *mas; unsigned long min, max; mas = wr_mas->mas; if (!sib->end) { min = mas->min; max = mas->max; } else if (sib->min > mas->max) { /* Move right succeeded */ min = mas->min; max = sib->max; wr_mas->offset_end = parent->offset + 1; } else { min = sib->min; max = mas->max; wr_mas->offset_end = parent->offset; parent->offset--; } cp_dst_to_slots(cp, min, max, mas); if (cp_is_new_root(cp, mas)) return false; if (cp_converged(cp, mas, sib)) return false; cp->height++; copy_tree_location(parent, mas); wr_mas_setup(wr_mas, mas); return true; } /* * mas_root_expand() - Expand a root to a node * @mas: The maple state * @entry: The entry to store into the tree */ static inline void mas_root_expand(struct ma_state *mas, void *entry) { void *contents = mas_root_locked(mas); enum maple_type type = maple_leaf_64; struct maple_node *node; void __rcu **slots; unsigned long *pivots; int slot = 0; node = mas_pop_node(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); node->parent = ma_parent_ptr(mas_tree_parent(mas)); mas->node = mt_mk_node(node, type); mas->status = ma_active; if (mas->index) { if (contents) { rcu_assign_pointer(slots[slot], contents); if (likely(mas->index > 1)) slot++; } pivots[slot++] = mas->index - 1; } rcu_assign_pointer(slots[slot], entry); mas->offset = slot; pivots[slot] = mas->last; if (mas->last != ULONG_MAX) pivots[++slot] = ULONG_MAX; mt_set_height(mas->tree, 1); ma_set_meta(node, maple_leaf_64, 0, slot); /* swap the new root into the tree */ rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); } /* * mas_store_root() - Storing value into root. * @mas: The maple state * @entry: The entry to store. * * There is no root node now and we are storing a value into the root - this * function either assigns the pointer or expands into a node. */ static inline void mas_store_root(struct ma_state *mas, void *entry) { if (!entry) { if (!mas->index) rcu_assign_pointer(mas->tree->ma_root, NULL); } else if (likely((mas->last != 0) || (mas->index != 0))) mas_root_expand(mas, entry); else if (((unsigned long) (entry) & 3) == 2) mas_root_expand(mas, entry); else { rcu_assign_pointer(mas->tree->ma_root, entry); mas->status = ma_start; } } /* * mas_is_span_wr() - Check if the write needs to be treated as a write that * spans the node. * @wr_mas: The maple write state * * Spanning writes are writes that start in one node and end in another OR if * the write of a %NULL will cause the node to end with a %NULL. * * Return: True if this is a spanning write, false otherwise. */ static bool mas_is_span_wr(struct ma_wr_state *wr_mas) { unsigned long max = wr_mas->r_max; unsigned long last = wr_mas->mas->last; enum maple_type type = wr_mas->type; void *entry = wr_mas->entry; /* Contained in this pivot, fast path */ if (last < max) return false; if (ma_is_leaf(type)) { max = wr_mas->mas->max; if (last < max) return false; } if (last == max) { /* * The last entry of leaf node cannot be NULL unless it is the * rightmost node (writing ULONG_MAX), otherwise it spans slots. 
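 * For example (made-up bounds): if a leaf covers 0-100 and a NULL is
 * written over 90-100, then last == max but the entry is NULL, so the
 * write is treated as spanning and is rebalanced with the next node.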
*/ if (entry || last == ULONG_MAX) return false; } trace_ma_write(TP_FCT, wr_mas->mas, wr_mas->r_max, entry); return true; } static inline void mas_wr_walk_descend(struct ma_wr_state *wr_mas) { wr_mas->type = mte_node_type(wr_mas->mas->node); mas_wr_node_walk(wr_mas); wr_mas->slots = ma_slots(wr_mas->node, wr_mas->type); } static inline void mas_wr_walk_traverse(struct ma_wr_state *wr_mas) { wr_mas->mas->max = wr_mas->r_max; wr_mas->mas->min = wr_mas->r_min; wr_mas->mas->node = wr_mas->content; wr_mas->mas->offset = 0; wr_mas->mas->depth++; } /* * mas_wr_walk() - Walk the tree for a write. * @wr_mas: The maple write state * * Uses mas_slot_locked() and does not need to worry about dead nodes. * * Return: True if it's contained in a node, false on spanning write. */ static bool mas_wr_walk(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; while (true) { mas_wr_walk_descend(wr_mas); if (unlikely(mas_is_span_wr(wr_mas))) return false; wr_mas->content = mas_slot_locked(mas, wr_mas->slots, mas->offset); if (ma_is_leaf(wr_mas->type)) return true; if (mas->end < mt_slots[wr_mas->type] - 1) wr_mas->vacant_height = mas->depth + 1; if (ma_is_root(mas_mn(mas))) { /* root needs more than 2 entries to be sufficient + 1 */ if (mas->end > 2) wr_mas->sufficient_height = 1; } else if (mas->end > mt_min_slots[wr_mas->type] + 1) wr_mas->sufficient_height = mas->depth + 1; mas_wr_walk_traverse(wr_mas); } return true; } static void mas_wr_walk_index(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; while (true) { mas_wr_walk_descend(wr_mas); wr_mas->content = mas_slot_locked(mas, wr_mas->slots, mas->offset); if (ma_is_leaf(wr_mas->type)) return; mas_wr_walk_traverse(wr_mas); } } /* * mas_extend_spanning_null() - Extend a store of a %NULL to include surrounding %NULLs. * @l_wr_mas: The left maple write state * @r_wr_mas: The right maple write state */ static inline void mas_extend_spanning_null(struct ma_wr_state *l_wr_mas, struct ma_wr_state *r_wr_mas) { struct ma_state *r_mas = r_wr_mas->mas; struct ma_state *l_mas = l_wr_mas->mas; unsigned char l_slot; l_slot = l_mas->offset; if (!l_wr_mas->content) l_mas->index = l_wr_mas->r_min; if ((l_mas->index == l_wr_mas->r_min) && (l_slot && !mas_slot_locked(l_mas, l_wr_mas->slots, l_slot - 1))) { if (l_slot > 1) l_mas->index = l_wr_mas->pivots[l_slot - 2] + 1; else l_mas->index = l_mas->min; l_mas->offset = l_slot - 1; l_wr_mas->r_min = l_mas->index; } if (!r_wr_mas->content) { if (r_mas->last < r_wr_mas->r_max) r_mas->last = r_wr_mas->r_max; r_mas->offset++; } else if ((r_mas->last == r_wr_mas->r_max) && (r_mas->last < r_mas->max) && !mas_slot_locked(r_mas, r_wr_mas->slots, r_mas->offset + 1)) { r_mas->last = mas_safe_pivot(r_mas, r_wr_mas->pivots, r_wr_mas->type, r_mas->offset + 1); r_mas->offset++; r_wr_mas->r_max = r_mas->last; } } static inline void *mas_state_walk(struct ma_state *mas) { void *entry; entry = mas_start(mas); if (mas_is_none(mas)) return NULL; if (mas_is_ptr(mas)) return entry; return mtree_range_walk(mas); } /* * mtree_lookup_walk() - Internal quick lookup that does not keep maple state up * to date. * * @mas: The maple state. * * Note: Leaves mas in undesirable state. * Return: The entry for @mas->index or %NULL on dead node. 
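 *
 * Illustrative use via the public lookup API (tree and pointer names
 * are made up); mtree_load() performs a walk like this one:
 *
 *   mtree_store_range(&my_tree, 10, 20, my_ptr, GFP_KERNEL);
 *   entry = mtree_load(&my_tree, 15);
 *
 * which returns my_ptr.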
*/ static inline void *mtree_lookup_walk(struct ma_state *mas) { unsigned long *pivots; unsigned char offset; struct maple_node *node; struct maple_enode *next; enum maple_type type; void __rcu **slots; unsigned char end; next = mas->node; do { node = mte_to_node(next); type = mte_node_type(next); pivots = ma_pivots(node, type); end = mt_pivots[type]; offset = 0; do { if (pivots[offset] >= mas->index) break; } while (++offset < end); slots = ma_slots(node, type); next = mt_slot(mas->tree, slots, offset); if (unlikely(ma_dead_node(node))) goto dead_node; } while (!ma_is_leaf(type)); return (void *)next; dead_node: mas_reset(mas); return NULL; } static void mte_destroy_walk(struct maple_enode *, struct maple_tree *); /* * mas_new_root() - Create a new root node that only contains the entry passed * in. * @mas: The maple state * @entry: The entry to store. * * Only valid when the index == 0 and the last == ULONG_MAX */ static inline void mas_new_root(struct ma_state *mas, void *entry) { struct maple_enode *root = mas_root_locked(mas); enum maple_type type = maple_leaf_64; struct maple_node *node; void __rcu **slots; unsigned long *pivots; WARN_ON_ONCE(mas->index || mas->last != ULONG_MAX); if (!entry) { mt_set_height(mas->tree, 0); rcu_assign_pointer(mas->tree->ma_root, entry); mas->status = ma_start; goto done; } node = mas_pop_node(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); node->parent = ma_parent_ptr(mas_tree_parent(mas)); mas->node = mt_mk_node(node, type); mas->status = ma_active; rcu_assign_pointer(slots[0], entry); pivots[0] = mas->last; mt_set_height(mas->tree, 1); rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); done: if (xa_is_node(root)) mte_destroy_walk(root, mas->tree); } /* * mas_wr_spanning_store() - Create a subtree with the store operation completed * and new nodes where necessary, then place the sub-tree in the actual tree. * Note that mas is expected to point to the node which caused the store to * span. * @wr_mas: The maple write state */ static void mas_wr_spanning_store(struct ma_wr_state *wr_mas) { struct maple_copy cp; struct ma_state *mas; struct ma_state sib; /* Left and Right side of spanning store */ MA_STATE(r_mas, NULL, 0, 0); MA_WR_STATE(r_wr_mas, &r_mas, wr_mas->entry); /* * A store operation that spans multiple nodes is called a spanning * store and is handled early in the store call stack by the function * mas_is_span_wr(). When a spanning store is identified, the maple * state is duplicated. The first maple state walks the left tree path * to ``index``, the duplicate walks the right tree path to ``last``. * The data in the two nodes are combined into a single node, two nodes, * or possibly three nodes (see the 3-way split above). A ``NULL`` * written to the last entry of a node is considered a spanning store as * a rebalance is required for the operation to complete and an overflow * of data may happen. */ mas = wr_mas->mas; trace_ma_op(TP_FCT, mas); if (unlikely(!mas->index && mas->last == ULONG_MAX)) return mas_new_root(mas, wr_mas->entry); /* * Node rebalancing may occur due to this store, so there may be three new * entries per level plus a new root. */ /* * Set up right side. Need to get to the next offset after the spanning * store to ensure it's not NULL and to combine both the next node and * the node with the start together. */ r_mas = *mas; /* Avoid overflow, walk to next slot in the tree. 
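 * (r_mas.last + 1 evaluates to 0 only when last is already ULONG_MAX;
 * in that case the increment is skipped rather than wrapping around.)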
*/ if (r_mas.last + 1) r_mas.last++; r_mas.index = r_mas.last; mas_wr_walk_index(&r_wr_mas); r_mas.last = r_mas.index = mas->last; r_wr_mas.end_piv = r_wr_mas.r_max; /* Set up left side. */ mas_wr_walk_index(wr_mas); if (!wr_mas->entry) { mas_extend_spanning_null(wr_mas, &r_wr_mas); mas->last = r_mas.last; } /* expanding NULLs may make this cover the entire range */ if (!mas->index && r_mas.last == ULONG_MAX) { mas_set_range(mas, 0, ULONG_MAX); return mas_new_root(mas, wr_mas->entry); } cp_leaf_init(&cp, mas, wr_mas, &r_wr_mas); do { spanning_data(&cp, wr_mas, &r_wr_mas, &sib); multi_src_setup(&cp, wr_mas, &r_wr_mas, &sib); dst_setup(&cp, mas, wr_mas->type); cp_data_write(&cp, mas); } while (spanning_ascend(&cp, mas, wr_mas, &r_wr_mas, &sib)); mas_wmb_replace(mas, &cp); } /* * mas_wr_node_store() - Attempt to store the value in a node * @wr_mas: The maple write state * * Attempts to reuse the node, but may allocate. */ static inline void mas_wr_node_store(struct ma_wr_state *wr_mas) { unsigned char dst_offset, offset_end; unsigned char copy_size, node_pivots; struct maple_node reuse, *newnode; unsigned long *dst_pivots; void __rcu **dst_slots; unsigned char new_end; struct ma_state *mas; bool in_rcu; mas = wr_mas->mas; trace_ma_op(TP_FCT, mas); in_rcu = mt_in_rcu(mas->tree); offset_end = wr_mas->offset_end; node_pivots = mt_pivots[wr_mas->type]; /* Assume last adds an entry */ new_end = mas->end + 1 - offset_end + mas->offset; if (mas->last == wr_mas->end_piv) { offset_end++; /* don't copy this offset */ new_end--; } /* set up node. */ if (in_rcu) { newnode = mas_pop_node(mas); } else { memset(&reuse, 0, sizeof(struct maple_node)); newnode = &reuse; } newnode->parent = mas_mn(mas)->parent; dst_pivots = ma_pivots(newnode, wr_mas->type); dst_slots = ma_slots(newnode, wr_mas->type); /* Copy from start to insert point */ if (mas->offset) { memcpy(dst_pivots, wr_mas->pivots, sizeof(unsigned long) * mas->offset); memcpy(dst_slots, wr_mas->slots, sizeof(void __rcu *) * mas->offset); } /* Handle insert of new range starting after old range */ if (wr_mas->r_min < mas->index) { rcu_assign_pointer(dst_slots[mas->offset], wr_mas->content); dst_pivots[mas->offset++] = mas->index - 1; new_end++; } /* Store the new entry and range end. */ if (mas->offset < node_pivots) dst_pivots[mas->offset] = mas->last; rcu_assign_pointer(dst_slots[mas->offset], wr_mas->entry); /* * this range wrote to the end of the node or it overwrote the rest of * the data */ if (offset_end > mas->end) goto done; dst_offset = mas->offset + 1; /* Copy to the end of node if necessary. */ copy_size = mas->end - offset_end + 1; memcpy(dst_slots + dst_offset, wr_mas->slots + offset_end, sizeof(void __rcu *) * copy_size); memcpy(dst_pivots + dst_offset, wr_mas->pivots + offset_end, sizeof(unsigned long) * (copy_size - 1)); if (new_end < node_pivots) dst_pivots[new_end] = mas->max; done: mas_leaf_set_meta(newnode, maple_leaf_64, new_end); if (in_rcu) { struct maple_enode *old_enode = mas->node; mas->node = mt_mk_node(newnode, wr_mas->type); mas_replace_node(mas, old_enode, mas_mt_height(mas)); } else { memcpy(wr_mas->node, newnode, sizeof(struct maple_node)); } trace_ma_write(TP_FCT, mas, 0, wr_mas->entry); mas_update_gap(mas); mas->end = new_end; } /* * mas_wr_slot_store: Attempt to store a value in a slot. 
* @wr_mas: the maple write state */ static inline void mas_wr_slot_store(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char offset = mas->offset; void __rcu **slots = wr_mas->slots; bool gap = false; gap |= !mt_slot_locked(mas->tree, slots, offset); gap |= !mt_slot_locked(mas->tree, slots, offset + 1); if (wr_mas->offset_end - offset == 1) { if (mas->index == wr_mas->r_min) { /* Overwriting the range and a part of the next one */ rcu_assign_pointer(slots[offset], wr_mas->entry); wr_mas->pivots[offset] = mas->last; } else { /* Overwriting a part of the range and the next one */ rcu_assign_pointer(slots[offset + 1], wr_mas->entry); wr_mas->pivots[offset] = mas->index - 1; mas->offset++; /* Keep mas accurate. */ } } else { WARN_ON_ONCE(mt_in_rcu(mas->tree)); /* * Expand the range, only partially overwriting the previous and * next ranges */ gap |= !mt_slot_locked(mas->tree, slots, offset + 2); rcu_assign_pointer(slots[offset + 1], wr_mas->entry); wr_mas->pivots[offset] = mas->index - 1; wr_mas->pivots[offset + 1] = mas->last; mas->offset++; /* Keep mas accurate. */ } trace_ma_write(TP_FCT, mas, 0, wr_mas->entry); /* * Only update gap when the new entry is empty or there is an empty * entry in the original two ranges. */ if (!wr_mas->entry || gap) mas_update_gap(mas); } static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; if (!wr_mas->slots[wr_mas->offset_end]) { /* If this one is null, the next and prev are not */ mas->last = wr_mas->end_piv; } else { /* Check next slot(s) if we are overwriting the end */ if ((mas->last == wr_mas->end_piv) && (mas->end != wr_mas->offset_end) && !wr_mas->slots[wr_mas->offset_end + 1]) { wr_mas->offset_end++; if (wr_mas->offset_end == mas->end) mas->last = mas->max; else mas->last = wr_mas->pivots[wr_mas->offset_end]; wr_mas->end_piv = mas->last; } } if (!wr_mas->content) { /* If this one is null, the next and prev are not */ mas->index = wr_mas->r_min; } else { /* Check prev slot if we are overwriting the start */ if (mas->index == wr_mas->r_min && mas->offset && !wr_mas->slots[mas->offset - 1]) { mas->offset--; wr_mas->r_min = mas->index = mas_safe_min(mas, wr_mas->pivots, mas->offset); wr_mas->r_max = wr_mas->pivots[mas->offset]; } } } static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas) { while ((wr_mas->offset_end < wr_mas->mas->end) && (wr_mas->mas->last > wr_mas->pivots[wr_mas->offset_end])) wr_mas->offset_end++; if (wr_mas->offset_end < wr_mas->mas->end) wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end]; else wr_mas->end_piv = wr_mas->mas->max; } static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char new_end = mas->end + 2; new_end -= wr_mas->offset_end - mas->offset; if (wr_mas->r_min == mas->index) new_end--; if (wr_mas->end_piv == mas->last) new_end--; return new_end; } /* * mas_wr_append: Attempt to append * @wr_mas: the maple write state * * This is currently unsafe in rcu mode since the end of the node may be cached * by readers while the node contents may be updated which could result in * inaccurate information. 
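 *
 * Illustrative append (made-up indices): a leaf whose final range is
 * 50-100 -> old_entry, with new_entry stored over 80-100, becomes
 * 50-79 -> old_entry, 80-100 -> new_entry; the node is reused and a
 * single slot is added at the end.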
*/ static inline void mas_wr_append(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; void __rcu **slots; unsigned char end = mas->end; unsigned char new_end = mas_wr_new_end(wr_mas); if (new_end < mt_pivots[wr_mas->type]) { wr_mas->pivots[new_end] = wr_mas->pivots[end]; ma_set_meta(wr_mas->node, wr_mas->type, 0, new_end); } slots = wr_mas->slots; if (new_end == end + 1) { if (mas->last == wr_mas->r_max) { /* Append to end of range */ rcu_assign_pointer(slots[new_end], wr_mas->entry); wr_mas->pivots[end] = mas->index - 1; mas->offset = new_end; } else { /* Append to start of range */ rcu_assign_pointer(slots[new_end], wr_mas->content); wr_mas->pivots[end] = mas->last; rcu_assign_pointer(slots[end], wr_mas->entry); } } else { /* Append to the range without touching any boundaries. */ rcu_assign_pointer(slots[new_end], wr_mas->content); wr_mas->pivots[end + 1] = mas->last; rcu_assign_pointer(slots[end + 1], wr_mas->entry); wr_mas->pivots[end] = mas->index - 1; mas->offset = end + 1; } if (!wr_mas->content || !wr_mas->entry) mas_update_gap(mas); mas->end = new_end; trace_ma_write(TP_FCT, mas, new_end, wr_mas->entry); } /* * split_ascend() - See if a split operation has to keep walking up the tree * @cp: The maple_copy node * @wr_mas: The maple write state * @sib: the maple state of the sibling * * Return: true if another split operation on the next level is needed, false * otherwise */ static inline bool split_ascend(struct maple_copy *cp, struct ma_wr_state *wr_mas, struct ma_state *sib, struct ma_state *parent) { struct ma_state *mas; unsigned long min, max; mas = wr_mas->mas; min = mas->min; /* push right, or normal split */ max = mas->max; wr_mas->offset_end = parent->offset; if (sib->end) { if (sib->max < mas->min) { min = sib->min; /* push left */ parent->offset--; } else { max = sib->max; /* push right */ wr_mas->offset_end++; } } cp_dst_to_slots(cp, min, max, mas); if (cp_is_new_root(cp, mas)) return false; if (cp_converged(cp, mas, sib)) return false; cp->height++; copy_tree_location(parent, mas); wr_mas_setup(wr_mas, mas); return true; } /* * split_data() - Calculate the @cp data, populate @sib if the data can be * pushed into a sibling. * @cp: The maple copy node * @wr_mas: The left write maple state * @sib: The maple state of the sibling. * * Note: @cp->data is a size and not indexed by 0. @sib->end may be set to 0 to * indicate it will not be used. * */ static inline void split_data(struct maple_copy *cp, struct ma_wr_state *wr_mas, struct ma_state *sib, struct ma_state *parent) { cp_data_calc(cp, wr_mas, wr_mas); if (cp->data <= mt_slots[wr_mas->type]) { sib->end = 0; return; } push_data_sib(cp, wr_mas->mas, sib, parent); if (sib->end) cp->data += sib->end + 1; } /* * mas_wr_split() - Expand one node into two * @wr_mas: The write maple state */ static void mas_wr_split(struct ma_wr_state *wr_mas) { struct ma_state parent; struct ma_state *mas; struct maple_copy cp; struct ma_state sib; mas = wr_mas->mas; trace_ma_write(TP_FCT, wr_mas->mas, 0, wr_mas->entry); parent = *mas; cp_leaf_init(&cp, mas, wr_mas, wr_mas); do { if (!mte_is_root(parent.node)) { mas_ascend(&parent); parent.end = mas_data_end(&parent); } split_data(&cp, wr_mas, &sib, &parent); multi_src_setup(&cp, wr_mas, wr_mas, &sib); dst_setup(&cp, mas, wr_mas->type); cp_data_write(&cp, mas); } while (split_ascend(&cp, wr_mas, &sib, &parent)); mas_wmb_replace(mas, &cp); } /* * mas_wr_rebalance() - Insufficient data in one node needs to either get data * from a sibling or absorb a sibling all together. 
* @wr_mas: The write maple state * * Rebalance is different than a spanning store in that the write state is * already at the leaf node that's being altered. */ static void mas_wr_rebalance(struct ma_wr_state *wr_mas) { struct ma_state parent; struct ma_state *mas; struct maple_copy cp; struct ma_state sib; /* * Rebalancing occurs if a node is insufficient. Data is rebalanced * against the node to the right if it exists, otherwise the node to the * left of this node is rebalanced against this node. If rebalancing * causes just one node to be produced instead of two, then the parent * is also examined and rebalanced if it is insufficient. Every level * tries to combine the data in the same way. If one node contains the * entire range of the tree, then that node is used as a new root node. */ mas = wr_mas->mas; trace_ma_op(TP_FCT, mas); parent = *mas; cp_leaf_init(&cp, mas, wr_mas, wr_mas); do { if (!mte_is_root(parent.node)) { mas_ascend(&parent); parent.end = mas_data_end(&parent); } rebalance_data(&cp, wr_mas, &sib, &parent); multi_src_setup(&cp, wr_mas, wr_mas, &sib); dst_setup(&cp, mas, wr_mas->type); cp_data_write(&cp, mas); } while (rebalance_ascend(&cp, wr_mas, &sib, &parent)); mas_wmb_replace(mas, &cp); } /* * mas_wr_store_entry() - Internal call to store a value * @wr_mas: The maple write state */ static inline void mas_wr_store_entry(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; switch (mas->store_type) { case wr_exact_fit: rcu_assign_pointer(wr_mas->slots[mas->offset], wr_mas->entry); if (!!wr_mas->entry ^ !!wr_mas->content) mas_update_gap(mas); break; case wr_append: mas_wr_append(wr_mas); break; case wr_slot_store: mas_wr_slot_store(wr_mas); break; case wr_node_store: mas_wr_node_store(wr_mas); break; case wr_spanning_store: mas_wr_spanning_store(wr_mas); break; case wr_split_store: mas_wr_split(wr_mas); break; case wr_rebalance: mas_wr_rebalance(wr_mas); break; case wr_new_root: mas_new_root(mas, wr_mas->entry); break; case wr_store_root: mas_store_root(mas, wr_mas->entry); break; case wr_invalid: MT_BUG_ON(mas->tree, 1); } } static inline void mas_wr_prealloc_setup(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; if (!mas_is_active(mas)) { if (mas_is_start(mas)) goto set_content; if (unlikely(mas_is_paused(mas))) goto reset; if (unlikely(mas_is_none(mas))) goto reset; if (unlikely(mas_is_overflow(mas))) goto reset; if (unlikely(mas_is_underflow(mas))) goto reset; } /* * A less strict version of mas_is_span_wr() where we allow spanning * writes within this node. This is to stop partial walks in * mas_prealloc() from being reset. */ if (mas->last > mas->max) goto reset; if (wr_mas->entry) goto set_content; if (mte_is_leaf(mas->node) && mas->last == mas->max) goto reset; goto set_content; reset: mas_reset(mas); set_content: wr_mas->content = mas_start(mas); } /** * mas_prealloc_calc() - Calculate number of nodes needed for a * given store oepration * @wr_mas: The maple write state * @entry: The entry to store into the tree * * Return: Number of nodes required for preallocation. 
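 *
 * Illustrative worst case (height is made up): a spanning store in a
 * tree of height 4 with no vacant or sufficient levels recorded needs
 * 4 * 3 + 1 = 13 nodes, i.e. up to three new nodes per level plus a
 * new root.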
*/ static inline void mas_prealloc_calc(struct ma_wr_state *wr_mas, void *entry) { struct ma_state *mas = wr_mas->mas; unsigned char height = mas_mt_height(mas); int ret = height * 3 + 1; unsigned char delta = height - wr_mas->vacant_height; switch (mas->store_type) { case wr_exact_fit: case wr_append: case wr_slot_store: ret = 0; break; case wr_spanning_store: if (wr_mas->sufficient_height < wr_mas->vacant_height) ret = (height - wr_mas->sufficient_height) * 3 + 1; else ret = delta * 3 + 1; break; case wr_split_store: ret = delta * 2 + 1; break; case wr_rebalance: if (wr_mas->sufficient_height < wr_mas->vacant_height) ret = (height - wr_mas->sufficient_height) * 2 + 1; else ret = delta * 2 + 1; break; case wr_node_store: ret = mt_in_rcu(mas->tree) ? 1 : 0; break; case wr_new_root: ret = 1; break; case wr_store_root: if (likely((mas->last != 0) || (mas->index != 0))) ret = 1; else if (((unsigned long) (entry) & 3) == 2) ret = 1; else ret = 0; break; case wr_invalid: WARN_ON_ONCE(1); } mas->node_request = ret; } /* * mas_wr_store_type() - Determine the store type for a given * store operation. * @wr_mas: The maple write state * * Return: the type of store needed for the operation */ static inline enum store_type mas_wr_store_type(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char new_end; if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) return wr_store_root; if (unlikely(!mas_wr_walk(wr_mas))) return wr_spanning_store; /* At this point, we are at the leaf node that needs to be altered. */ mas_wr_end_piv(wr_mas); if (!wr_mas->entry) mas_wr_extend_null(wr_mas); if ((wr_mas->r_min == mas->index) && (wr_mas->r_max == mas->last)) return wr_exact_fit; if (unlikely(!mas->index && mas->last == ULONG_MAX)) return wr_new_root; new_end = mas_wr_new_end(wr_mas); /* Potential spanning rebalance collapsing a node */ if (new_end < mt_min_slots[wr_mas->type]) { if (!mte_is_root(mas->node)) return wr_rebalance; return wr_node_store; } if (new_end >= mt_slots[wr_mas->type]) return wr_split_store; if (!mt_in_rcu(mas->tree) && (mas->offset == mas->end)) return wr_append; if ((new_end == mas->end) && (!mt_in_rcu(mas->tree) || (wr_mas->offset_end - mas->offset == 1))) return wr_slot_store; return wr_node_store; } /** * mas_wr_preallocate() - Preallocate enough nodes for a store operation * @wr_mas: The maple write state * @entry: The entry that will be stored * */ static inline void mas_wr_preallocate(struct ma_wr_state *wr_mas, void *entry) { struct ma_state *mas = wr_mas->mas; mas_wr_prealloc_setup(wr_mas); mas->store_type = mas_wr_store_type(wr_mas); mas_prealloc_calc(wr_mas, entry); if (!mas->node_request) return; mas_alloc_nodes(mas, GFP_NOWAIT); } /** * mas_insert() - Internal call to insert a value * @mas: The maple state * @entry: The entry to store * * Return: %NULL or the contents that already exists at the requested index * otherwise. The maple state needs to be checked for error conditions. */ static inline void *mas_insert(struct ma_state *mas, void *entry) { MA_WR_STATE(wr_mas, mas, entry); /* * Inserting a new range inserts either 0, 1, or 2 pivots within the * tree. If the insert fits exactly into an existing gap with a value * of NULL, then the slot only needs to be written with the new value. * If the range being inserted is adjacent to another range, then only a * single pivot needs to be inserted (as well as writing the entry). 
If * the new range is within a gap but does not touch any other ranges, * then two pivots need to be inserted: the start - 1, and the end. As * usual, the entry must be written. Most operations require a new node * to be allocated and replace an existing node to ensure RCU safety, * when in RCU mode. The exception to requiring a newly allocated node * is when inserting at the end of a node (appending). When done * carefully, appending can reuse the node in place. */ wr_mas.content = mas_start(mas); if (wr_mas.content) goto exists; mas_wr_preallocate(&wr_mas, entry); if (mas_is_err(mas)) return NULL; /* spanning writes always overwrite something */ if (mas->store_type == wr_spanning_store) goto exists; /* At this point, we are at the leaf node that needs to be altered. */ if (mas->store_type != wr_new_root && mas->store_type != wr_store_root) { wr_mas.offset_end = mas->offset; wr_mas.end_piv = wr_mas.r_max; if (wr_mas.content || (mas->last > wr_mas.r_max)) goto exists; } mas_wr_store_entry(&wr_mas); return wr_mas.content; exists: mas_set_err(mas, -EEXIST); return wr_mas.content; } /** * mas_alloc_cyclic() - Internal call to find somewhere to store an entry * @mas: The maple state. * @startp: Pointer to ID. * @range_lo: Lower bound of range to search. * @range_hi: Upper bound of range to search. * @entry: The entry to store. * @next: Pointer to next ID to allocate. * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 if the allocation succeeded without wrapping, 1 if the * allocation succeeded after wrapping, or -EBUSY if there are no * free entries. */ int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp) { unsigned long min = range_lo; int ret = 0; range_lo = max(min, *next); ret = mas_empty_area(mas, range_lo, range_hi, 1); if ((mas->tree->ma_flags & MT_FLAGS_ALLOC_WRAPPED) && ret == 0) { mas->tree->ma_flags &= ~MT_FLAGS_ALLOC_WRAPPED; ret = 1; } if (ret < 0 && range_lo > min) { mas_reset(mas); ret = mas_empty_area(mas, min, range_hi, 1); if (ret == 0) ret = 1; } if (ret < 0) return ret; do { mas_insert(mas, entry); } while (mas_nomem(mas, gfp)); if (mas_is_err(mas)) return xa_err(mas->node); *startp = mas->index; *next = *startp + 1; if (*next == 0) mas->tree->ma_flags |= MT_FLAGS_ALLOC_WRAPPED; mas_destroy(mas); return ret; } EXPORT_SYMBOL(mas_alloc_cyclic); static __always_inline void mas_rewalk(struct ma_state *mas, unsigned long index) { retry: mas_set(mas, index); mas_state_walk(mas); if (mas_is_start(mas)) goto retry; } static __always_inline bool mas_rewalk_if_dead(struct ma_state *mas, struct maple_node *node, const unsigned long index) { if (unlikely(ma_dead_node(node))) { mas_rewalk(mas, index); return true; } return false; } /* * mas_prev_node() - Find the prev non-null entry at the same level in the * tree. The prev value will be mas->node[mas->offset] or the status will be * ma_none. * @mas: The maple state * @min: The lower limit to search * * The prev node value will be mas->node[mas->offset] or the status will be * ma_none. * Return: 1 if the node is dead, 0 otherwise. */ static int mas_prev_node(struct ma_state *mas, unsigned long min) { enum maple_type mt; int offset, level; void __rcu **slots; struct maple_node *node; unsigned long *pivots; unsigned long max; node = mas_mn(mas); if (!mas->min) goto no_entry; max = mas->min - 1; if (max < min) goto no_entry; level = 0; do { if (ma_is_root(node)) goto no_entry; /* Walk up. 
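 * Ascend while the offset is 0, i.e. until a slot to the left exists;
 * level records how far up the walk goes so the descent below can
 * mirror it.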
*/ if (unlikely(mas_ascend(mas))) return 1; offset = mas->offset; level++; node = mas_mn(mas); } while (!offset); offset--; mt = mte_node_type(mas->node); while (level > 1) { level--; slots = ma_slots(node, mt); mas->node = mas_slot(mas, slots, offset); if (unlikely(ma_dead_node(node))) return 1; mt = mte_node_type(mas->node); node = mas_mn(mas); pivots = ma_pivots(node, mt); offset = ma_data_end(node, mt, pivots, max); if (unlikely(ma_dead_node(node))) return 1; } slots = ma_slots(node, mt); mas->node = mas_slot(mas, slots, offset); pivots = ma_pivots(node, mt); if (unlikely(ma_dead_node(node))) return 1; if (likely(offset)) mas->min = pivots[offset - 1] + 1; mas->max = max; mas->offset = mas_data_end(mas); if (unlikely(mte_dead_node(mas->node))) return 1; mas->end = mas->offset; return 0; no_entry: if (unlikely(ma_dead_node(node))) return 1; mas->status = ma_underflow; return 0; } /* * mas_prev_slot() - Get the entry in the previous slot * * @mas: The maple state * @min: The minimum starting range * @empty: Can be empty * * Return: The entry in the previous slot which is possibly NULL */ static void *mas_prev_slot(struct ma_state *mas, unsigned long min, bool empty) { void *entry; void __rcu **slots; unsigned long pivot; enum maple_type type; unsigned long *pivots; struct maple_node *node; unsigned long save_point = mas->index; retry: node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (mas->min <= min) { pivot = mas_safe_min(mas, pivots, mas->offset); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (pivot <= min) goto underflow; } again: if (likely(mas->offset)) { mas->offset--; mas->last = mas->index - 1; mas->index = mas_safe_min(mas, pivots, mas->offset); } else { if (mas->index <= min) goto underflow; if (mas_prev_node(mas, min)) { mas_rewalk(mas, save_point); goto retry; } if (WARN_ON_ONCE(mas_is_underflow(mas))) return NULL; mas->last = mas->max; node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); mas->index = pivots[mas->offset - 1] + 1; } slots = ma_slots(node, type); entry = mas_slot(mas, slots, mas->offset); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (likely(entry)) return entry; if (!empty) { if (mas->index <= min) goto underflow; goto again; } return entry; underflow: mas->status = ma_underflow; return NULL; } /* * mas_next_node() - Get the next node at the same level in the tree. * @mas: The maple state * @node: The maple node * @max: The maximum pivot value to check. * * The next value will be mas->node[mas->offset] or the status will have * overflowed. * Return: 1 on dead node, 0 otherwise. */ static int mas_next_node(struct ma_state *mas, struct maple_node *node, unsigned long max) { unsigned long min; unsigned long *pivots; struct maple_enode *enode; struct maple_node *tmp; int level = 0; unsigned char node_end; enum maple_type mt; void __rcu **slots; if (mas->max >= max) goto overflow; min = mas->max + 1; level = 0; do { if (ma_is_root(node)) goto overflow; /* Walk up. 
*/ if (unlikely(mas_ascend(mas))) return 1; level++; node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); node_end = ma_data_end(node, mt, pivots, mas->max); if (unlikely(ma_dead_node(node))) return 1; } while (unlikely(mas->offset == node_end)); slots = ma_slots(node, mt); mas->offset++; enode = mas_slot(mas, slots, mas->offset); if (unlikely(ma_dead_node(node))) return 1; if (level > 1) mas->offset = 0; while (unlikely(level > 1)) { level--; mas->node = enode; node = mas_mn(mas); mt = mte_node_type(mas->node); slots = ma_slots(node, mt); enode = mas_slot(mas, slots, 0); if (unlikely(ma_dead_node(node))) return 1; } if (!mas->offset) pivots = ma_pivots(node, mt); mas->max = mas_safe_pivot(mas, pivots, mas->offset, mt); tmp = mte_to_node(enode); mt = mte_node_type(enode); pivots = ma_pivots(tmp, mt); mas->end = ma_data_end(tmp, mt, pivots, mas->max); if (unlikely(ma_dead_node(node))) return 1; mas->node = enode; mas->min = min; return 0; overflow: if (unlikely(ma_dead_node(node))) return 1; mas->status = ma_overflow; return 0; } /* * mas_next_slot() - Get the entry in the next slot * * @mas: The maple state * @max: The maximum starting range * @empty: Can be empty * * Return: The entry in the next slot which is possibly NULL */ static void *mas_next_slot(struct ma_state *mas, unsigned long max, bool empty) { void __rcu **slots; unsigned long *pivots; unsigned long pivot; enum maple_type type; struct maple_node *node; unsigned long save_point = mas->last; void *entry; retry: node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (mas->max >= max) { if (likely(mas->offset < mas->end)) pivot = pivots[mas->offset]; else pivot = mas->max; if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (pivot >= max) { /* Was at the limit, next will extend beyond */ mas->status = ma_overflow; return NULL; } } if (likely(mas->offset < mas->end)) { mas->index = pivots[mas->offset] + 1; again: mas->offset++; if (likely(mas->offset < mas->end)) mas->last = pivots[mas->offset]; else mas->last = mas->max; } else { if (mas->last >= max) { mas->status = ma_overflow; return NULL; } if (mas_next_node(mas, node, max)) { mas_rewalk(mas, save_point); goto retry; } if (WARN_ON_ONCE(mas_is_overflow(mas))) return NULL; mas->offset = 0; mas->index = mas->min; node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); mas->last = pivots[0]; } slots = ma_slots(node, type); entry = mt_slot(mas->tree, slots, mas->offset); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (entry) return entry; if (!empty) { if (mas->last >= max) { mas->status = ma_overflow; return NULL; } mas->index = mas->last + 1; goto again; } return entry; } /* * mas_rev_awalk() - Internal function. Reverse allocation walk. Find the * highest gap address of a given size in a given node and descend. * @mas: The maple state * @size: The needed size. * * Return: True if found in a leaf, false otherwise. * */ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size, unsigned long *gap_min, unsigned long *gap_max) { enum maple_type type = mte_node_type(mas->node); struct maple_node *node = mas_mn(mas); unsigned long *pivots, *gaps; void __rcu **slots; unsigned long gap = 0; unsigned long max, min; unsigned char offset; if (unlikely(mas_is_err(mas))) return true; if (ma_is_dense(type)) { /* dense nodes. 
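 * In a dense node every index maps one-to-one onto a slot, so the
 * reverse gap search reduces to the offset arithmetic below.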
*/ mas->offset = (unsigned char)(mas->index - mas->min); return true; } pivots = ma_pivots(node, type); slots = ma_slots(node, type); gaps = ma_gaps(node, type); offset = mas->offset; min = mas_safe_min(mas, pivots, offset); /* Skip out of bounds. */ while (mas->last < min) min = mas_safe_min(mas, pivots, --offset); max = mas_safe_pivot(mas, pivots, offset, type); while (mas->index <= max) { gap = 0; if (gaps) gap = gaps[offset]; else if (!mas_slot(mas, slots, offset)) gap = max - min + 1; if (gap) { if ((size <= gap) && (size <= mas->last - min + 1)) break; if (!gaps) { /* Skip the next slot, it cannot be a gap. */ if (offset < 2) goto ascend; offset -= 2; max = pivots[offset]; min = mas_safe_min(mas, pivots, offset); continue; } } if (!offset) goto ascend; offset--; max = min - 1; min = mas_safe_min(mas, pivots, offset); } if (unlikely((mas->index > max) || (size - 1 > max - mas->index))) goto no_space; if (unlikely(ma_is_leaf(type))) { mas->offset = offset; *gap_min = min; *gap_max = min + gap - 1; return true; } /* descend, only happens under lock. */ mas->node = mas_slot(mas, slots, offset); mas->min = min; mas->max = max; mas->offset = mas_data_end(mas); return false; ascend: if (!mte_is_root(mas->node)) return false; no_space: mas_set_err(mas, -EBUSY); return false; } static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size) { enum maple_type type = mte_node_type(mas->node); unsigned long pivot, min, gap = 0; unsigned char offset, data_end; unsigned long *gaps, *pivots; void __rcu **slots; struct maple_node *node; bool found = false; if (ma_is_dense(type)) { mas->offset = (unsigned char)(mas->index - mas->min); return true; } node = mas_mn(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); gaps = ma_gaps(node, type); offset = mas->offset; min = mas_safe_min(mas, pivots, offset); data_end = ma_data_end(node, type, pivots, mas->max); for (; offset <= data_end; offset++) { pivot = mas_safe_pivot(mas, pivots, offset, type); /* Not within lower bounds */ if (mas->index > pivot) goto next_slot; if (gaps) gap = gaps[offset]; else if (!mas_slot(mas, slots, offset)) gap = min(pivot, mas->last) - max(mas->index, min) + 1; else goto next_slot; if (gap >= size) { if (ma_is_leaf(type)) { found = true; break; } mas->node = mas_slot(mas, slots, offset); mas->min = min; mas->max = pivot; offset = 0; break; } next_slot: min = pivot + 1; if (mas->last <= pivot) { mas_set_err(mas, -EBUSY); return true; } } mas->offset = offset; return found; } /** * mas_walk() - Search for @mas->index in the tree. * @mas: The maple state. * * mas->index and mas->last will be set to the range if there is a value. If * mas->status is ma_none, reset to ma_start * * Return: the entry at the location or %NULL. */ void *mas_walk(struct ma_state *mas) { void *entry; if (!mas_is_active(mas) && !mas_is_start(mas)) mas->status = ma_start; retry: entry = mas_state_walk(mas); if (mas_is_start(mas)) { goto retry; } else if (mas_is_none(mas)) { mas->index = 0; mas->last = ULONG_MAX; } else if (mas_is_ptr(mas)) { if (!mas->index) { mas->last = 0; return entry; } mas->index = 1; mas->last = ULONG_MAX; mas->status = ma_none; return NULL; } return entry; } EXPORT_SYMBOL_GPL(mas_walk); static inline bool mas_rewind_node(struct ma_state *mas) { unsigned char slot; do { if (mte_is_root(mas->node)) { slot = mas->offset; if (!slot) return false; } else { mas_ascend(mas); slot = mas->offset; } } while (!slot); mas->offset = --slot; return true; } /* * mas_skip_node() - Internal function. 
Skip over a node. * @mas: The maple state. * * Return: true if there is another node, false otherwise. */ static inline bool mas_skip_node(struct ma_state *mas) { if (mas_is_err(mas)) return false; do { if (mte_is_root(mas->node)) { if (mas->offset >= mas_data_end(mas)) { mas_set_err(mas, -EBUSY); return false; } } else { mas_ascend(mas); } } while (mas->offset >= mas_data_end(mas)); mas->offset++; return true; } /* * mas_awalk() - Allocation walk. Search from low address to high, for a gap of * @size * @mas: The maple state * @size: The size of the gap required * * Search between @mas->index and @mas->last for a gap of @size. */ static inline void mas_awalk(struct ma_state *mas, unsigned long size) { struct maple_enode *last = NULL; /* * There are 4 options: * go to child (descend) * go back to parent (ascend) * no gap found. (return, error == -EBUSY) * found the gap. (return) */ while (!mas_is_err(mas) && !mas_anode_descend(mas, size)) { if (last == mas->node) mas_skip_node(mas); else last = mas->node; } } /* * mas_sparse_area() - Internal function. Return upper or lower limit when * searching for a gap in an empty tree. * @mas: The maple state * @min: the minimum range * @max: The maximum range * @size: The size of the gap * @fwd: Searching forward or back */ static inline int mas_sparse_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size, bool fwd) { if (!unlikely(mas_is_none(mas)) && min == 0) { min++; /* * At this time, min is increased, we need to recheck whether * the size is satisfied. */ if (min > max || max - min + 1 < size) return -EBUSY; } /* mas_is_ptr */ if (fwd) { mas->index = min; mas->last = min + size - 1; } else { mas->last = max; mas->index = max - size + 1; } return 0; } /* * mas_empty_area() - Get the lowest address within the range that is * sufficient for the size requested. * @mas: The maple state * @min: The lowest value of the range * @max: The highest value of the range * @size: The size needed */ int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size) { unsigned char offset; unsigned long *pivots; enum maple_type mt; struct maple_node *node; if (min > max) return -EINVAL; if (size == 0 || max - min < size - 1) return -EINVAL; if (mas_is_start(mas)) mas_start(mas); else if (mas->offset >= 2) mas->offset -= 2; else if (!mas_skip_node(mas)) return -EBUSY; /* Empty set */ if (mas_is_none(mas) || mas_is_ptr(mas)) return mas_sparse_area(mas, min, max, size, true); /* The start of the window can only be within these values */ mas->index = min; mas->last = max; mas_awalk(mas, size); if (unlikely(mas_is_err(mas))) return xa_err(mas->node); offset = mas->offset; node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); min = mas_safe_min(mas, pivots, offset); if (mas->index < min) mas->index = min; mas->last = mas->index + size - 1; mas->end = ma_data_end(node, mt, pivots, mas->max); return 0; } EXPORT_SYMBOL_GPL(mas_empty_area); /* * mas_empty_area_rev() - Get the highest address within the range that is * sufficient for the size requested. 
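 *
 * Illustrative use (tree name and bounds are made up), with the tree
 * appropriately locked: find the highest 16-index gap ending at or
 * below 1000:
 *
 *   MA_STATE(mas, &my_tree, 0, 0);
 *   if (!mas_empty_area_rev(&mas, 0, 1000, 16))
 *           pr_debug("gap at %lu-%lu\n", mas.index, mas.last);
 *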
* @mas: The maple state * @min: The lowest value of the range * @max: The highest value of the range * @size: The size needed */ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size) { struct maple_enode *last = mas->node; if (min > max) return -EINVAL; if (size == 0 || max - min < size - 1) return -EINVAL; if (mas_is_start(mas)) mas_start(mas); else if ((mas->offset < 2) && (!mas_rewind_node(mas))) return -EBUSY; if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) return mas_sparse_area(mas, min, max, size, false); else if (mas->offset >= 2) mas->offset -= 2; else mas->offset = mas_data_end(mas); /* The start of the window can only be within these values. */ mas->index = min; mas->last = max; while (!mas_rev_awalk(mas, size, &min, &max)) { if (last == mas->node) { if (!mas_rewind_node(mas)) return -EBUSY; } else { last = mas->node; } } if (mas_is_err(mas)) return xa_err(mas->node); if (unlikely(mas->offset == MAPLE_NODE_SLOTS)) return -EBUSY; /* Trim the upper limit to the max. */ if (max < mas->last) mas->last = max; mas->index = mas->last - size + 1; mas->end = mas_data_end(mas); return 0; } EXPORT_SYMBOL_GPL(mas_empty_area_rev); /* * mte_dead_leaves() - Mark all leaves of a node as dead. * @enode: the encoded node * @mt: the maple tree * @slots: Pointer to the slot array * * Must hold the write lock. * * Return: The number of leaves marked as dead. */ static inline unsigned char mte_dead_leaves(struct maple_enode *enode, struct maple_tree *mt, void __rcu **slots) { struct maple_node *node; enum maple_type type; void *entry; int offset; for (offset = 0; offset < mt_slot_count(enode); offset++) { entry = mt_slot(mt, slots, offset); type = mte_node_type(entry); node = mte_to_node(entry); /* Use both node and type to catch LE & BE metadata */ if (!node || !type) break; mte_set_node_dead(entry); node->type = type; rcu_assign_pointer(slots[offset], node); } return offset; } /** * mte_dead_walk() - Walk down a dead tree to just before the leaves * @enode: The maple encoded node * @offset: The starting offset * * Note: This can only be used from the RCU callback context. */ static void __rcu **mte_dead_walk(struct maple_enode **enode, unsigned char offset) { struct maple_node *node, *next; void __rcu **slots = NULL; next = mte_to_node(*enode); do { *enode = ma_enode_ptr(next); node = mte_to_node(*enode); slots = ma_slots(node, node->type); next = rcu_dereference_protected(slots[offset], lock_is_held(&rcu_callback_map)); offset = 0; } while (!ma_is_leaf(next->type)); return slots; } /** * mt_free_walk() - Walk & free a tree in the RCU callback context * @head: The RCU head that's within the node. * * Note: This can only be used from the RCU callback context. 
*/ static void mt_free_walk(struct rcu_head *head) { void __rcu **slots; struct maple_node *node, *start; struct maple_enode *enode; unsigned char offset; enum maple_type type; node = container_of(head, struct maple_node, rcu); if (ma_is_leaf(node->type)) goto free_leaf; start = node; enode = mt_mk_node(node, node->type); slots = mte_dead_walk(&enode, 0); node = mte_to_node(enode); do { mt_free_bulk(node->slot_len, slots); offset = node->parent_slot + 1; enode = node->piv_parent; if (mte_to_node(enode) == node) goto free_leaf; type = mte_node_type(enode); slots = ma_slots(mte_to_node(enode), type); if ((offset < mt_slots[type]) && rcu_dereference_protected(slots[offset], lock_is_held(&rcu_callback_map))) slots = mte_dead_walk(&enode, offset); node = mte_to_node(enode); } while ((node != start) || (node->slot_len < offset)); slots = ma_slots(node, node->type); mt_free_bulk(node->slot_len, slots); free_leaf: kfree(node); } static inline void __rcu **mte_destroy_descend(struct maple_enode **enode, struct maple_tree *mt, struct maple_enode *prev, unsigned char offset) { struct maple_node *node; struct maple_enode *next = *enode; void __rcu **slots = NULL; enum maple_type type; unsigned char next_offset = 0; do { *enode = next; node = mte_to_node(*enode); type = mte_node_type(*enode); slots = ma_slots(node, type); next = mt_slot_locked(mt, slots, next_offset); if ((mte_dead_node(next))) next = mt_slot_locked(mt, slots, ++next_offset); mte_set_node_dead(*enode); node->type = type; node->piv_parent = prev; node->parent_slot = offset; offset = next_offset; next_offset = 0; prev = *enode; } while (!mte_is_leaf(next)); return slots; } static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt, bool free) { void __rcu **slots; struct maple_node *node = mte_to_node(enode); struct maple_enode *start; if (mte_is_leaf(enode)) { mte_set_node_dead(enode); node->type = mte_node_type(enode); goto free_leaf; } start = enode; slots = mte_destroy_descend(&enode, mt, start, 0); node = mte_to_node(enode); // Updated in the above call. do { enum maple_type type; unsigned char offset; struct maple_enode *parent, *tmp; node->slot_len = mte_dead_leaves(enode, mt, slots); if (free) mt_free_bulk(node->slot_len, slots); offset = node->parent_slot + 1; enode = node->piv_parent; if (mte_to_node(enode) == node) goto free_leaf; type = mte_node_type(enode); slots = ma_slots(mte_to_node(enode), type); if (offset >= mt_slots[type]) goto next; tmp = mt_slot_locked(mt, slots, offset); if (mte_node_type(tmp) && mte_to_node(tmp)) { parent = enode; enode = tmp; slots = mte_destroy_descend(&enode, mt, parent, offset); } next: node = mte_to_node(enode); } while (start != enode); node = mte_to_node(enode); node->slot_len = mte_dead_leaves(enode, mt, slots); if (free) mt_free_bulk(node->slot_len, slots); free_leaf: if (free) kfree(node); else mt_clear_meta(mt, node, node->type); } /* * mte_destroy_walk() - Free a tree or sub-tree. * @enode: the encoded maple node (maple_enode) to start * @mt: the tree to free - needed for node types. * * Must hold the write lock. */ static inline void mte_destroy_walk(struct maple_enode *enode, struct maple_tree *mt) { struct maple_node *node = mte_to_node(enode); if (mt_in_rcu(mt)) { mt_destroy_walk(enode, mt, false); call_rcu(&node->rcu, mt_free_walk); } else { mt_destroy_walk(enode, mt, true); } } /* Interface */ /** * mas_store() - Store an @entry. * @mas: The maple state. * @entry: The entry to store. * * The @mas->index and @mas->last is used to set the range for the @entry. 
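 *
 * Illustrative use (tree and entry names are made up):
 *
 *   MA_STATE(mas, &my_tree, 5, 10);
 *   mas_lock(&mas);
 *   old = mas_store(&mas, my_entry);
 *   mas_unlock(&mas);
 *
 * Internal allocations here use GFP_NOWAIT, so callers that may sleep
 * typically prefer mas_store_gfp() or the mas_preallocate() and
 * mas_store_prealloc() pair.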
* * Return: the first entry between mas->index and mas->last or %NULL. */ void *mas_store(struct ma_state *mas, void *entry) { MA_WR_STATE(wr_mas, mas, entry); trace_ma_write(TP_FCT, mas, 0, entry); #ifdef CONFIG_DEBUG_MAPLE_TREE if (MAS_WARN_ON(mas, mas->index > mas->last)) pr_err("Error %lX > %lX " PTR_FMT "\n", mas->index, mas->last, entry); if (mas->index > mas->last) { mas_set_err(mas, -EINVAL); return NULL; } #endif /* * Storing is the same operation as insert with the added caveat that it * can overwrite entries. Although this seems simple enough, one may * want to examine what happens if a single store operation was to * overwrite multiple entries within a self-balancing B-Tree. */ mas_wr_prealloc_setup(&wr_mas); mas->store_type = mas_wr_store_type(&wr_mas); if (mas->mas_flags & MA_STATE_PREALLOC) { mas_wr_store_entry(&wr_mas); MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas)); return wr_mas.content; } mas_prealloc_calc(&wr_mas, entry); if (!mas->node_request) goto store; mas_alloc_nodes(mas, GFP_NOWAIT); if (mas_is_err(mas)) return NULL; store: mas_wr_store_entry(&wr_mas); mas_destroy(mas); return wr_mas.content; } EXPORT_SYMBOL_GPL(mas_store); /** * mas_store_gfp() - Store a value into the tree. * @mas: The maple state * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations if necessary. * * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not * be allocated. */ int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp) { unsigned long index = mas->index; unsigned long last = mas->last; MA_WR_STATE(wr_mas, mas, entry); int ret = 0; retry: mas_wr_preallocate(&wr_mas, entry); if (unlikely(mas_nomem(mas, gfp))) { if (!entry) __mas_set_range(mas, index, last); goto retry; } if (mas_is_err(mas)) { ret = xa_err(mas->node); goto out; } mas_wr_store_entry(&wr_mas); out: mas_destroy(mas); return ret; } EXPORT_SYMBOL_GPL(mas_store_gfp); /** * mas_store_prealloc() - Store a value into the tree using memory * preallocated in the maple state. * @mas: The maple state * @entry: The entry to store. */ void mas_store_prealloc(struct ma_state *mas, void *entry) { MA_WR_STATE(wr_mas, mas, entry); if (mas->store_type == wr_store_root) { mas_wr_prealloc_setup(&wr_mas); goto store; } mas_wr_walk_descend(&wr_mas); if (mas->store_type != wr_spanning_store) { /* set wr_mas->content to current slot */ wr_mas.content = mas_slot_locked(mas, wr_mas.slots, mas->offset); mas_wr_end_piv(&wr_mas); } store: trace_ma_write(TP_FCT, mas, 0, entry); mas_wr_store_entry(&wr_mas); MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas)); mas_destroy(mas); } EXPORT_SYMBOL_GPL(mas_store_prealloc); /** * mas_preallocate() - Preallocate enough nodes for a store operation * @mas: The maple state * @entry: The entry that will be stored * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -ENOMEM if memory could not be allocated. */ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) { MA_WR_STATE(wr_mas, mas, entry); mas_wr_prealloc_setup(&wr_mas); mas->store_type = mas_wr_store_type(&wr_mas); mas_prealloc_calc(&wr_mas, entry); if (!mas->node_request) goto set_flag; mas->mas_flags &= ~MA_STATE_PREALLOC; mas_alloc_nodes(mas, gfp); if (mas_is_err(mas)) { int ret = xa_err(mas->node); mas->node_request = 0; mas_destroy(mas); mas_reset(mas); return ret; } set_flag: mas->mas_flags |= MA_STATE_PREALLOC; return 0; } EXPORT_SYMBOL_GPL(mas_preallocate); /* * mas_destroy() - destroy a maple state. 
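 *
 * Illustrative pairing with preallocation (names are made up): when a
 * preallocated store is abandoned, mas_destroy() releases the unused
 * nodes:
 *
 *   if (mas_preallocate(&mas, entry, GFP_KERNEL))
 *           return -ENOMEM;
 *   if (abort_the_store)
 *           mas_destroy(&mas);
 *   else
 *           mas_store_prealloc(&mas, entry);
 *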
* @mas: The maple state * * Upon completion, check the left-most node and rebalance against the node to * the right if necessary. Frees any allocated nodes associated with this maple * state. */ void mas_destroy(struct ma_state *mas) { mas->mas_flags &= ~MA_STATE_PREALLOC; mas_empty_nodes(mas); } EXPORT_SYMBOL_GPL(mas_destroy); static void mas_may_activate(struct ma_state *mas) { if (!mas->node) { mas->status = ma_start; } else if (mas->index > mas->max || mas->index < mas->min) { mas->status = ma_start; } else { mas->status = ma_active; } } static bool mas_next_setup(struct ma_state *mas, unsigned long max, void **entry) { bool was_none = mas_is_none(mas); if (unlikely(mas->last >= max)) { mas->status = ma_overflow; return true; } switch (mas->status) { case ma_active: return false; case ma_none: fallthrough; case ma_pause: mas->status = ma_start; fallthrough; case ma_start: mas_walk(mas); /* Retries on dead nodes handled by mas_walk */ break; case ma_overflow: /* Overflowed before, but the max changed */ mas_may_activate(mas); break; case ma_underflow: /* The user expects the mas to be one before where it is */ mas_may_activate(mas); *entry = mas_walk(mas); if (*entry) return true; break; case ma_root: break; case ma_error: return true; } if (likely(mas_is_active(mas))) /* Fast path */ return false; if (mas_is_ptr(mas)) { *entry = NULL; if (was_none && mas->index == 0) { mas->index = mas->last = 0; return true; } mas->index = 1; mas->last = ULONG_MAX; mas->status = ma_none; return true; } if (mas_is_none(mas)) return true; return false; } /** * mas_next() - Get the next entry. * @mas: The maple state * @max: The maximum index to check. * * Returns the next entry after @mas->index. * Must hold rcu_read_lock or the write lock. * Can return the zero entry. * * Return: The next entry or %NULL */ void *mas_next(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_next_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ return mas_next_slot(mas, max, false); } EXPORT_SYMBOL_GPL(mas_next); /** * mas_next_range() - Advance the maple state to the next range * @mas: The maple state * @max: The maximum index to check. * * Sets @mas->index and @mas->last to the range. * Must hold rcu_read_lock or the write lock. * Can return the zero entry. * * Return: The next entry or %NULL */ void *mas_next_range(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_next_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ return mas_next_slot(mas, max, true); } EXPORT_SYMBOL_GPL(mas_next_range); /** * mt_next() - get the next value in the maple tree * @mt: The maple tree * @index: The start index * @max: The maximum index to check * * Takes RCU read lock internally to protect the search, which does not * protect the returned pointer after dropping RCU read lock. * See also: Documentation/core-api/maple_tree.rst * * Return: The entry higher than @index or %NULL if nothing is found. 
*/ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max) { void *entry = NULL; MA_STATE(mas, mt, index, index); rcu_read_lock(); entry = mas_next(&mas, max); rcu_read_unlock(); return entry; } EXPORT_SYMBOL_GPL(mt_next); static bool mas_prev_setup(struct ma_state *mas, unsigned long min, void **entry) { if (unlikely(mas->index <= min)) { mas->status = ma_underflow; return true; } switch (mas->status) { case ma_active: return false; case ma_start: break; case ma_none: fallthrough; case ma_pause: mas->status = ma_start; break; case ma_underflow: /* underflowed before but the min changed */ mas_may_activate(mas); break; case ma_overflow: /* User expects mas to be one after where it is */ mas_may_activate(mas); *entry = mas_walk(mas); if (*entry) return true; break; case ma_root: break; case ma_error: return true; } if (mas_is_start(mas)) mas_walk(mas); if (unlikely(mas_is_ptr(mas))) { if (!mas->index) { mas->status = ma_none; return true; } mas->index = mas->last = 0; *entry = mas_root(mas); return true; } if (mas_is_none(mas)) { if (mas->index) { /* Walked to out-of-range pointer? */ mas->index = mas->last = 0; mas->status = ma_root; *entry = mas_root(mas); return true; } return true; } return false; } /** * mas_prev() - Get the previous entry * @mas: The maple state * @min: The minimum value to check. * * Must hold rcu_read_lock or the write lock. * Will reset mas to ma_start if the status is ma_none. Will stop on not * searchable nodes. * * Return: the previous value or %NULL. */ void *mas_prev(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_prev_setup(mas, min, &entry)) return entry; return mas_prev_slot(mas, min, false); } EXPORT_SYMBOL_GPL(mas_prev); /** * mas_prev_range() - Advance to the previous range * @mas: The maple state * @min: The minimum value to check. * * Sets @mas->index and @mas->last to the range. * Must hold rcu_read_lock or the write lock. * Will reset mas to ma_start if the node is ma_none. Will stop on not * searchable nodes. * * Return: the previous value or %NULL. */ void *mas_prev_range(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_prev_setup(mas, min, &entry)) return entry; return mas_prev_slot(mas, min, true); } EXPORT_SYMBOL_GPL(mas_prev_range); /** * mt_prev() - get the previous value in the maple tree * @mt: The maple tree * @index: The start index * @min: The minimum index to check * * Takes RCU read lock internally to protect the search, which does not * protect the returned pointer after dropping RCU read lock. * See also: Documentation/core-api/maple_tree.rst * * Return: The entry before @index or %NULL if nothing is found. */ void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min) { void *entry = NULL; MA_STATE(mas, mt, index, index); rcu_read_lock(); entry = mas_prev(&mas, min); rcu_read_unlock(); return entry; } EXPORT_SYMBOL_GPL(mt_prev); /** * mas_pause() - Pause a mas_find/mas_for_each to drop the lock. * @mas: The maple state to pause * * Some users need to pause a walk and drop the lock they're holding in * order to yield to a higher priority thread or carry out an operation * on an entry. Those users should call this function before they drop * the lock. It resets the @mas to be suitable for the next iteration * of the loop after the user has reacquired the lock. If most entries * found during a walk require you to call mas_pause(), the mt_for_each() * iterator may be more appropriate. 
* */ void mas_pause(struct ma_state *mas) { mas->status = ma_pause; mas->node = NULL; } EXPORT_SYMBOL_GPL(mas_pause); /** * mas_find_setup() - Internal function to set up mas_find*(). * @mas: The maple state * @max: The maximum index * @entry: Pointer to the entry * * Returns: True if entry is the answer, false otherwise. */ static __always_inline bool mas_find_setup(struct ma_state *mas, unsigned long max, void **entry) { switch (mas->status) { case ma_active: if (mas->last < max) return false; return true; case ma_start: break; case ma_pause: if (unlikely(mas->last >= max)) return true; mas->index = ++mas->last; mas->status = ma_start; break; case ma_none: if (unlikely(mas->last >= max)) return true; mas->index = mas->last; mas->status = ma_start; break; case ma_underflow: /* mas is pointing at entry before unable to go lower */ if (unlikely(mas->index >= max)) { mas->status = ma_overflow; return true; } mas_may_activate(mas); *entry = mas_walk(mas); if (*entry) return true; break; case ma_overflow: if (unlikely(mas->last >= max)) return true; mas_may_activate(mas); *entry = mas_walk(mas); if (*entry) return true; break; case ma_root: break; case ma_error: return true; } if (mas_is_start(mas)) { /* First run or continue */ if (mas->index > max) return true; *entry = mas_walk(mas); if (*entry) return true; } if (unlikely(mas_is_ptr(mas))) goto ptr_out_of_range; if (unlikely(mas_is_none(mas))) return true; if (mas->index == max) return true; return false; ptr_out_of_range: mas->status = ma_none; mas->index = 1; mas->last = ULONG_MAX; return true; } /** * mas_find() - On the first call, find the entry at or after mas->index up to * %max. Otherwise, find the entry after mas->index. * @mas: The maple state * @max: The maximum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_overflow. * * Return: The entry or %NULL. */ void *mas_find(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_find_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ entry = mas_next_slot(mas, max, false); /* Ignore overflow */ mas->status = ma_active; return entry; } EXPORT_SYMBOL_GPL(mas_find); /** * mas_find_range() - On the first call, find the entry at or after * mas->index up to %max. Otherwise, advance to the next slot mas->index. * @mas: The maple state * @max: The maximum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_overflow. * * Return: The entry or %NULL. */ void *mas_find_range(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_find_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ return mas_next_slot(mas, max, true); } EXPORT_SYMBOL_GPL(mas_find_range); /** * mas_find_rev_setup() - Internal function to set up mas_find_*_rev() * @mas: The maple state * @min: The minimum index * @entry: Pointer to the entry * * Returns: True if entry is the answer, false otherwise. 
*/ static bool mas_find_rev_setup(struct ma_state *mas, unsigned long min, void **entry) { switch (mas->status) { case ma_active: goto active; case ma_start: break; case ma_pause: if (unlikely(mas->index <= min)) { mas->status = ma_underflow; return true; } mas->last = --mas->index; mas->status = ma_start; break; case ma_none: if (mas->index <= min) goto none; mas->last = mas->index; mas->status = ma_start; break; case ma_overflow: /* user expects the mas to be one after where it is */ if (unlikely(mas->index <= min)) { mas->status = ma_underflow; return true; } mas->status = ma_active; break; case ma_underflow: /* user expects the mas to be one before where it is */ if (unlikely(mas->index <= min)) return true; mas->status = ma_active; break; case ma_root: break; case ma_error: return true; } if (mas_is_start(mas)) { /* First run or continue */ if (mas->index < min) return true; *entry = mas_walk(mas); if (*entry) return true; } if (unlikely(mas_is_ptr(mas))) goto none; if (unlikely(mas_is_none(mas))) { /* * Walked to the location, and there was nothing so the previous * location is 0. */ mas->last = mas->index = 0; mas->status = ma_root; *entry = mas_root(mas); return true; } active: if (mas->index < min) return true; return false; none: mas->status = ma_none; return true; } /** * mas_find_rev: On the first call, find the first non-null entry at or below * mas->index down to %min. Otherwise find the first non-null entry below * mas->index down to %min. * @mas: The maple state * @min: The minimum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_underflow. * * Return: The entry or %NULL. */ void *mas_find_rev(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_find_rev_setup(mas, min, &entry)) return entry; /* Retries on dead nodes handled by mas_prev_slot */ return mas_prev_slot(mas, min, false); } EXPORT_SYMBOL_GPL(mas_find_rev); /** * mas_find_range_rev: On the first call, find the first non-null entry at or * below mas->index down to %min. Otherwise advance to the previous slot after * mas->index down to %min. * @mas: The maple state * @min: The minimum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_underflow. * * Return: The entry or %NULL. */ void *mas_find_range_rev(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_find_rev_setup(mas, min, &entry)) return entry; /* Retries on dead nodes handled by mas_prev_slot */ return mas_prev_slot(mas, min, true); } EXPORT_SYMBOL_GPL(mas_find_range_rev); /** * mas_erase() - Find the range in which index resides and erase the entire * range. * @mas: The maple state * * Must hold the write lock. * Searches for @mas->index, sets @mas->index and @mas->last to the range and * erases that range. * * Return: the entry that was erased or %NULL, @mas->index and @mas->last are updated. 
*/ void *mas_erase(struct ma_state *mas) { void *entry; unsigned long index = mas->index; MA_WR_STATE(wr_mas, mas, NULL); if (!mas_is_active(mas) || !mas_is_start(mas)) mas->status = ma_start; write_retry: entry = mas_state_walk(mas); if (!entry) return NULL; /* Must reset to ensure spanning writes of last slot are detected */ mas_reset(mas); mas_wr_preallocate(&wr_mas, NULL); if (mas_nomem(mas, GFP_KERNEL)) { /* in case the range of entry changed when unlocked */ mas->index = mas->last = index; goto write_retry; } if (mas_is_err(mas)) goto out; mas_wr_store_entry(&wr_mas); out: mas_destroy(mas); return entry; } EXPORT_SYMBOL_GPL(mas_erase); /** * mas_nomem() - Check if there was an error allocating and do the allocation * if necessary If there are allocations, then free them. * @mas: The maple state * @gfp: The GFP_FLAGS to use for allocations * Return: true on allocation, false otherwise. */ bool mas_nomem(struct ma_state *mas, gfp_t gfp) __must_hold(mas->tree->ma_lock) { if (likely(mas->node != MA_ERROR(-ENOMEM))) return false; if (gfpflags_allow_blocking(gfp) && !mt_external_lock(mas->tree)) { mtree_unlock(mas->tree); mas_alloc_nodes(mas, gfp); mtree_lock(mas->tree); } else { mas_alloc_nodes(mas, gfp); } if (!mas->sheaf && !mas->alloc) return false; mas->status = ma_start; return true; } void __init maple_tree_init(void) { struct kmem_cache_args args = { .align = sizeof(struct maple_node), .sheaf_capacity = 32, }; maple_node_cache = kmem_cache_create("maple_node", sizeof(struct maple_node), &args, SLAB_PANIC); } /** * mtree_load() - Load a value stored in a maple tree * @mt: The maple tree * @index: The index to load * * Return: the entry or %NULL */ void *mtree_load(struct maple_tree *mt, unsigned long index) { MA_STATE(mas, mt, index, index); void *entry; trace_ma_read(TP_FCT, &mas); rcu_read_lock(); retry: entry = mas_start(&mas); if (unlikely(mas_is_none(&mas))) goto unlock; if (unlikely(mas_is_ptr(&mas))) { if (index) entry = NULL; goto unlock; } entry = mtree_lookup_walk(&mas); if (!entry && unlikely(mas_is_start(&mas))) goto retry; unlock: rcu_read_unlock(); if (xa_is_zero(entry)) return NULL; return entry; } EXPORT_SYMBOL(mtree_load); /** * mtree_store_range() - Store an entry at a given range. * @mt: The maple tree * @index: The start of the range * @last: The end of the range * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations * * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not * be allocated. */ int mtree_store_range(struct maple_tree *mt, unsigned long index, unsigned long last, void *entry, gfp_t gfp) { MA_STATE(mas, mt, index, last); int ret = 0; trace_ma_write(TP_FCT, &mas, 0, entry); if (WARN_ON_ONCE(xa_is_advanced(entry))) return -EINVAL; if (index > last) return -EINVAL; mtree_lock(mt); ret = mas_store_gfp(&mas, entry, gfp); mtree_unlock(mt); return ret; } EXPORT_SYMBOL(mtree_store_range); /** * mtree_store() - Store an entry at a given index. * @mt: The maple tree * @index: The index to store the value * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations * * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not * be allocated. */ int mtree_store(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp) { return mtree_store_range(mt, index, index, entry, gfp); } EXPORT_SYMBOL(mtree_store); /** * mtree_insert_range() - Insert an entry at a given range if there is no value. 
* @mt: The maple tree * @first: The start of the range * @last: The end of the range * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -EEXISTS if the range is occupied, -EINVAL on invalid * request, -ENOMEM if memory could not be allocated. */ int mtree_insert_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp) { MA_STATE(ms, mt, first, last); int ret = 0; if (WARN_ON_ONCE(xa_is_advanced(entry))) return -EINVAL; if (first > last) return -EINVAL; mtree_lock(mt); retry: mas_insert(&ms, entry); if (mas_nomem(&ms, gfp)) goto retry; mtree_unlock(mt); if (mas_is_err(&ms)) ret = xa_err(ms.node); mas_destroy(&ms); return ret; } EXPORT_SYMBOL(mtree_insert_range); /** * mtree_insert() - Insert an entry at a given index if there is no value. * @mt: The maple tree * @index : The index to store the value * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -EEXISTS if the range is occupied, -EINVAL on invalid * request, -ENOMEM if memory could not be allocated. */ int mtree_insert(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp) { return mtree_insert_range(mt, index, index, entry, gfp); } EXPORT_SYMBOL(mtree_insert); int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); if (!mt_is_alloc(mt)) return -EINVAL; if (WARN_ON_ONCE(mt_is_reserved(entry))) return -EINVAL; mtree_lock(mt); retry: ret = mas_empty_area(&mas, min, max, size); if (ret) goto unlock; mas_insert(&mas, entry); /* * mas_nomem() may release the lock, causing the allocated area * to be unavailable, so try to allocate a free area again. */ if (mas_nomem(&mas, gfp)) goto retry; if (mas_is_err(&mas)) ret = xa_err(mas.node); else *startp = mas.index; unlock: mtree_unlock(mt); mas_destroy(&mas); return ret; } EXPORT_SYMBOL(mtree_alloc_range); /** * mtree_alloc_cyclic() - Find somewhere to store this entry in the tree. * @mt: The maple tree. * @startp: Pointer to ID. * @range_lo: Lower bound of range to search. * @range_hi: Upper bound of range to search. * @entry: The entry to store. * @next: Pointer to next ID to allocate. * @gfp: The GFP_FLAGS to use for allocations. * * Finds an empty entry in @mt after @next, stores the new index into * the @id pointer, stores the entry at that index, then updates @next. * * @mt must be initialized with the MT_FLAGS_ALLOC_RANGE flag. * * Context: Any context. Takes and releases the mt.lock. May sleep if * the @gfp flags permit. * * Return: 0 if the allocation succeeded without wrapping, 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated, -EINVAL if @mt cannot be used, or -EBUSY if there are no * free entries. 
*/ int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp) { int ret; MA_STATE(mas, mt, 0, 0); if (!mt_is_alloc(mt)) return -EINVAL; if (WARN_ON_ONCE(mt_is_reserved(entry))) return -EINVAL; mtree_lock(mt); ret = mas_alloc_cyclic(&mas, startp, entry, range_lo, range_hi, next, gfp); mtree_unlock(mt); return ret; } EXPORT_SYMBOL(mtree_alloc_cyclic); int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); if (!mt_is_alloc(mt)) return -EINVAL; if (WARN_ON_ONCE(mt_is_reserved(entry))) return -EINVAL; mtree_lock(mt); retry: ret = mas_empty_area_rev(&mas, min, max, size); if (ret) goto unlock; mas_insert(&mas, entry); /* * mas_nomem() may release the lock, causing the allocated area * to be unavailable, so try to allocate a free area again. */ if (mas_nomem(&mas, gfp)) goto retry; if (mas_is_err(&mas)) ret = xa_err(mas.node); else *startp = mas.index; unlock: mtree_unlock(mt); mas_destroy(&mas); return ret; } EXPORT_SYMBOL(mtree_alloc_rrange); /** * mtree_erase() - Find an index and erase the entire range. * @mt: The maple tree * @index: The index to erase * * Erasing is the same as a walk to an entry then a store of a NULL to that * ENTIRE range. In fact, it is implemented as such using the advanced API. * * Return: The entry stored at the @index or %NULL */ void *mtree_erase(struct maple_tree *mt, unsigned long index) { void *entry = NULL; MA_STATE(mas, mt, index, index); trace_ma_op(TP_FCT, &mas); mtree_lock(mt); entry = mas_erase(&mas); mtree_unlock(mt); return entry; } EXPORT_SYMBOL(mtree_erase); /* * mas_dup_free() - Free an incomplete duplication of a tree. * @mas: The maple state of a incomplete tree. * * The parameter @mas->node passed in indicates that the allocation failed on * this node. This function frees all nodes starting from @mas->node in the * reverse order of mas_dup_build(). There is no need to hold the source tree * lock at this time. */ static void mas_dup_free(struct ma_state *mas) { struct maple_node *node; enum maple_type type; void __rcu **slots; unsigned char count, i; /* Maybe the first node allocation failed. */ if (mas_is_none(mas)) return; while (!mte_is_root(mas->node)) { mas_ascend(mas); if (mas->offset) { mas->offset--; do { mas_descend(mas); mas->offset = mas_data_end(mas); } while (!mte_is_leaf(mas->node)); mas_ascend(mas); } node = mte_to_node(mas->node); type = mte_node_type(mas->node); slots = ma_slots(node, type); count = mas_data_end(mas) + 1; for (i = 0; i < count; i++) ((unsigned long *)slots)[i] &= ~MAPLE_NODE_MASK; mt_free_bulk(count, slots); } node = mte_to_node(mas->node); kfree(node); } /* * mas_copy_node() - Copy a maple node and replace the parent. * @mas: The maple state of source tree. * @new_mas: The maple state of new tree. * @parent: The parent of the new node. * * Copy @mas->node to @new_mas->node, set @parent to be the parent of * @new_mas->node. If memory allocation fails, @mas is set to -ENOMEM. */ static inline void mas_copy_node(struct ma_state *mas, struct ma_state *new_mas, struct maple_pnode *parent) { struct maple_node *node = mte_to_node(mas->node); struct maple_node *new_node = mte_to_node(new_mas->node); unsigned long val; /* Copy the node completely. */ memcpy(new_node, node, sizeof(struct maple_node)); /* Update the parent node pointer. 
*/ val = (unsigned long)node->parent & MAPLE_NODE_MASK; new_node->parent = ma_parent_ptr(val | (unsigned long)parent); } /* * mas_dup_alloc() - Allocate child nodes for a maple node. * @mas: The maple state of source tree. * @new_mas: The maple state of new tree. * @gfp: The GFP_FLAGS to use for allocations. * * This function allocates child nodes for @new_mas->node during the duplication * process. If memory allocation fails, @mas is set to -ENOMEM. */ static inline void mas_dup_alloc(struct ma_state *mas, struct ma_state *new_mas, gfp_t gfp) { struct maple_node *node = mte_to_node(mas->node); struct maple_node *new_node = mte_to_node(new_mas->node); enum maple_type type; unsigned char count, i; void __rcu **slots; void __rcu **new_slots; unsigned long val; /* Allocate memory for child nodes. */ type = mte_node_type(mas->node); new_slots = ma_slots(new_node, type); count = mas->node_request = mas_data_end(mas) + 1; mas_alloc_nodes(mas, gfp); if (unlikely(mas_is_err(mas))) return; slots = ma_slots(node, type); for (i = 0; i < count; i++) { val = (unsigned long)mt_slot_locked(mas->tree, slots, i); val &= MAPLE_NODE_MASK; /* * Warning, see rcu_assign_pointer() documentation. Since this * is a duplication of a tree, there are no readers walking the * tree until after the rcu_assign_pointer() call in * mas_dup_build(). */ RCU_INIT_POINTER(new_slots[i], ma_mnode_ptr((unsigned long)mas_pop_node(mas) | val)); } } /* * mas_dup_build() - Build a new maple tree from a source tree * @mas: The maple state of source tree, need to be in MAS_START state. * @new_mas: The maple state of new tree, need to be in MAS_START state. * @gfp: The GFP_FLAGS to use for allocations. * * This function builds a new tree in DFS preorder. If the memory allocation * fails, the error code -ENOMEM will be set in @mas, and @new_mas points to the * last node. mas_dup_free() will free the incomplete duplication of a tree. * * Note that the attributes of the two trees need to be exactly the same, and the * new tree needs to be empty, otherwise -EINVAL will be set in @mas. */ static inline void mas_dup_build(struct ma_state *mas, struct ma_state *new_mas, gfp_t gfp) { struct maple_node *node; struct maple_pnode *parent = NULL; struct maple_enode *root; enum maple_type type; if (unlikely(mt_attr(mas->tree) != mt_attr(new_mas->tree)) || unlikely(!mtree_empty(new_mas->tree))) { mas_set_err(mas, -EINVAL); return; } root = mas_start(mas); if (mas_is_ptr(mas) || mas_is_none(mas)) goto set_new_tree; node = mt_alloc_one(gfp); if (!node) { new_mas->status = ma_none; mas_set_err(mas, -ENOMEM); return; } type = mte_node_type(mas->node); root = mt_mk_node(node, type); new_mas->node = root; new_mas->min = 0; new_mas->max = ULONG_MAX; root = mte_mk_root(root); while (1) { mas_copy_node(mas, new_mas, parent); if (!mte_is_leaf(mas->node)) { /* Only allocate child nodes for non-leaf nodes. */ mas_dup_alloc(mas, new_mas, gfp); if (unlikely(mas_is_err(mas))) goto empty_mas; } else { /* * This is the last leaf node and duplication is * completed. */ if (mas->max == ULONG_MAX) goto done; /* This is not the last leaf node and needs to go up. */ do { mas_ascend(mas); mas_ascend(new_mas); } while (mas->offset == mas_data_end(mas)); /* Move to the next subtree. */ mas->offset++; new_mas->offset++; } mas_descend(mas); parent = ma_parent_ptr(mte_to_node(new_mas->node)); mas_descend(new_mas); mas->offset = 0; new_mas->offset = 0; } done: /* Specially handle the parent of the root node. 
*/ mte_to_node(root)->parent = ma_parent_ptr(mas_tree_parent(new_mas)); set_new_tree: /* Make them the same height */ new_mas->tree->ma_flags = mas->tree->ma_flags; rcu_assign_pointer(new_mas->tree->ma_root, root); empty_mas: mas_empty_nodes(mas); } /** * __mt_dup(): Duplicate an entire maple tree * @mt: The source maple tree * @new: The new maple tree * @gfp: The GFP_FLAGS to use for allocations * * This function duplicates a maple tree in Depth-First Search (DFS) pre-order * traversal. It uses memcpy() to copy nodes in the source tree and allocate * new child nodes in non-leaf nodes. The new node is exactly the same as the * source node except for all the addresses stored in it. It will be faster than * traversing all elements in the source tree and inserting them one by one into * the new tree. * The user needs to ensure that the attributes of the source tree and the new * tree are the same, and the new tree needs to be an empty tree, otherwise * -EINVAL will be returned. * Note that the user needs to manually lock the source tree and the new tree. * * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If * the attributes of the two trees are different or the new tree is not an empty * tree. */ int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); MA_STATE(new_mas, new, 0, 0); mas_dup_build(&mas, &new_mas, gfp); if (unlikely(mas_is_err(&mas))) { ret = xa_err(mas.node); if (ret == -ENOMEM) mas_dup_free(&new_mas); } return ret; } EXPORT_SYMBOL(__mt_dup); /** * mtree_dup(): Duplicate an entire maple tree * @mt: The source maple tree * @new: The new maple tree * @gfp: The GFP_FLAGS to use for allocations * * This function duplicates a maple tree in Depth-First Search (DFS) pre-order * traversal. It uses memcpy() to copy nodes in the source tree and allocate * new child nodes in non-leaf nodes. The new node is exactly the same as the * source node except for all the addresses stored in it. It will be faster than * traversing all elements in the source tree and inserting them one by one into * the new tree. * The user needs to ensure that the attributes of the source tree and the new * tree are the same, and the new tree needs to be an empty tree, otherwise * -EINVAL will be returned. * * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If * the attributes of the two trees are different or the new tree is not an empty * tree. */ int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); MA_STATE(new_mas, new, 0, 0); mas_lock(&new_mas); mas_lock_nested(&mas, SINGLE_DEPTH_NESTING); mas_dup_build(&mas, &new_mas, gfp); mas_unlock(&mas); if (unlikely(mas_is_err(&mas))) { ret = xa_err(mas.node); if (ret == -ENOMEM) mas_dup_free(&new_mas); } mas_unlock(&new_mas); return ret; } EXPORT_SYMBOL(mtree_dup); /** * __mt_destroy() - Walk and free all nodes of a locked maple tree. * @mt: The maple tree * * Note: Does not handle locking. */ void __mt_destroy(struct maple_tree *mt) { void *root = mt_root_locked(mt); rcu_assign_pointer(mt->ma_root, NULL); if (xa_is_node(root)) mte_destroy_walk(root, mt); mt->ma_flags = mt_attr(mt); } EXPORT_SYMBOL_GPL(__mt_destroy); /** * mtree_destroy() - Destroy a maple tree * @mt: The maple tree * * Frees all resources used by the tree. Handles locking. 
*/ void mtree_destroy(struct maple_tree *mt) { mtree_lock(mt); __mt_destroy(mt); mtree_unlock(mt); } EXPORT_SYMBOL(mtree_destroy); /** * mt_find() - Search from the start up until an entry is found. * @mt: The maple tree * @index: Pointer which contains the start location of the search * @max: The maximum value of the search range * * Takes RCU read lock internally to protect the search, which does not * protect the returned pointer after dropping RCU read lock. * See also: Documentation/core-api/maple_tree.rst * * In case that an entry is found @index is updated to point to the next * possible entry independent whether the found entry is occupying a * single index or a range if indices. * * Return: The entry at or after the @index or %NULL */ void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max) { MA_STATE(mas, mt, *index, *index); void *entry; #ifdef CONFIG_DEBUG_MAPLE_TREE unsigned long copy = *index; #endif trace_ma_read(TP_FCT, &mas); if ((*index) > max) return NULL; rcu_read_lock(); retry: entry = mas_state_walk(&mas); if (mas_is_start(&mas)) goto retry; if (unlikely(xa_is_zero(entry))) entry = NULL; if (entry) goto unlock; while (mas_is_active(&mas) && (mas.last < max)) { entry = mas_next_slot(&mas, max, false); if (likely(entry && !xa_is_zero(entry))) break; } if (unlikely(xa_is_zero(entry))) entry = NULL; unlock: rcu_read_unlock(); if (likely(entry)) { *index = mas.last + 1; #ifdef CONFIG_DEBUG_MAPLE_TREE if (MT_WARN_ON(mt, (*index) && ((*index) <= copy))) pr_err("index not increased! %lx <= %lx\n", *index, copy); #endif } return entry; } EXPORT_SYMBOL(mt_find); /** * mt_find_after() - Search from the start up until an entry is found. * @mt: The maple tree * @index: Pointer which contains the start location of the search * @max: The maximum value to check * * Same as mt_find() except that it checks @index for 0 before * searching. If @index == 0, the search is aborted. This covers a wrap * around of @index to 0 in an iterator loop. 
* * Return: The entry at or after the @index or %NULL */ void *mt_find_after(struct maple_tree *mt, unsigned long *index, unsigned long max) { if (!(*index)) return NULL; return mt_find(mt, index, max); } EXPORT_SYMBOL(mt_find_after); #ifdef CONFIG_DEBUG_MAPLE_TREE atomic_t maple_tree_tests_run; EXPORT_SYMBOL_GPL(maple_tree_tests_run); atomic_t maple_tree_tests_passed; EXPORT_SYMBOL_GPL(maple_tree_tests_passed); #ifndef __KERNEL__ extern void kmem_cache_set_non_kernel(struct kmem_cache *, unsigned int); void mt_set_non_kernel(unsigned int val) { kmem_cache_set_non_kernel(maple_node_cache, val); } extern void kmem_cache_set_callback(struct kmem_cache *cachep, void (*callback)(void *)); void mt_set_callback(void (*callback)(void *)) { kmem_cache_set_callback(maple_node_cache, callback); } extern void kmem_cache_set_private(struct kmem_cache *cachep, void *private); void mt_set_private(void *private) { kmem_cache_set_private(maple_node_cache, private); } extern unsigned long kmem_cache_get_alloc(struct kmem_cache *); unsigned long mt_get_alloc_size(void) { return kmem_cache_get_alloc(maple_node_cache); } extern void kmem_cache_zero_nr_tallocated(struct kmem_cache *); void mt_zero_nr_tallocated(void) { kmem_cache_zero_nr_tallocated(maple_node_cache); } extern unsigned int kmem_cache_nr_tallocated(struct kmem_cache *); unsigned int mt_nr_tallocated(void) { return kmem_cache_nr_tallocated(maple_node_cache); } extern unsigned int kmem_cache_nr_allocated(struct kmem_cache *); unsigned int mt_nr_allocated(void) { return kmem_cache_nr_allocated(maple_node_cache); } void mt_cache_shrink(void) { } #else /* * mt_cache_shrink() - For testing, don't use this. * * Certain testcases can trigger an OOM when combined with other memory * debugging configuration options. This function is used to reduce the * possibility of an out of memory even due to kmem_cache objects remaining * around for longer than usual. */ void mt_cache_shrink(void) { kmem_cache_shrink(maple_node_cache); } EXPORT_SYMBOL_GPL(mt_cache_shrink); #endif /* not defined __KERNEL__ */ /* * mas_get_slot() - Get the entry in the maple state node stored at @offset. * @mas: The maple state * @offset: The offset into the slot array to fetch. * * Return: The entry stored at @offset. 
*/ static inline struct maple_enode *mas_get_slot(struct ma_state *mas, unsigned char offset) { return mas_slot(mas, ma_slots(mas_mn(mas), mte_node_type(mas->node)), offset); } /* Depth first search, post-order */ static void mas_dfs_postorder(struct ma_state *mas, unsigned long max) { struct maple_enode *p, *mn = mas->node; unsigned long p_min, p_max; mas_next_node(mas, mas_mn(mas), max); if (!mas_is_overflow(mas)) return; if (mte_is_root(mn)) return; mas->node = mn; mas_ascend(mas); do { p = mas->node; p_min = mas->min; p_max = mas->max; mas_prev_node(mas, 0); } while (!mas_is_underflow(mas)); mas->node = p; mas->max = p_max; mas->min = p_min; } /* Tree validations */ static void mt_dump_node(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format); static void mt_dump_range(unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { static const char spaces[] = " "; switch(format) { case mt_dump_hex: if (min == max) pr_info("%.*s%lx: ", depth * 2, spaces, min); else pr_info("%.*s%lx-%lx: ", depth * 2, spaces, min, max); break; case mt_dump_dec: if (min == max) pr_info("%.*s%lu: ", depth * 2, spaces, min); else pr_info("%.*s%lu-%lu: ", depth * 2, spaces, min, max); } } static void mt_dump_entry(void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { mt_dump_range(min, max, depth, format); if (xa_is_value(entry)) pr_cont("value %ld (0x%lx) [" PTR_FMT "]\n", xa_to_value(entry), xa_to_value(entry), entry); else if (xa_is_zero(entry)) pr_cont("zero (%ld)\n", xa_to_internal(entry)); else if (mt_is_reserved(entry)) pr_cont("UNKNOWN ENTRY (" PTR_FMT ")\n", entry); else pr_cont(PTR_FMT "\n", entry); } static void mt_dump_range64(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { struct maple_range_64 *node = &mte_to_node(entry)->mr64; bool leaf = mte_is_leaf(entry); unsigned long first = min; int i; pr_cont(" contents: "); for (i = 0; i < MAPLE_RANGE64_SLOTS - 1; i++) { switch(format) { case mt_dump_hex: pr_cont(PTR_FMT " %lX ", node->slot[i], node->pivot[i]); break; case mt_dump_dec: pr_cont(PTR_FMT " %lu ", node->slot[i], node->pivot[i]); } } pr_cont(PTR_FMT "\n", node->slot[i]); for (i = 0; i < MAPLE_RANGE64_SLOTS; i++) { unsigned long last = max; if (i < (MAPLE_RANGE64_SLOTS - 1)) last = node->pivot[i]; else if (!node->slot[i] && max != mt_node_max(entry)) break; if (last == 0 && i > 0) break; if (leaf) mt_dump_entry(mt_slot(mt, node->slot, i), first, last, depth + 1, format); else if (node->slot[i]) mt_dump_node(mt, mt_slot(mt, node->slot, i), first, last, depth + 1, format); if (last == max) break; if (last > max) { switch(format) { case mt_dump_hex: pr_err("node " PTR_FMT " last (%lx) > max (%lx) at pivot %d!\n", node, last, max, i); break; case mt_dump_dec: pr_err("node " PTR_FMT " last (%lu) > max (%lu) at pivot %d!\n", node, last, max, i); } } first = last + 1; } } static void mt_dump_arange64(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { struct maple_arange_64 *node = &mte_to_node(entry)->ma64; unsigned long first = min; int i; pr_cont(" contents: "); for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) { switch (format) { case mt_dump_hex: pr_cont("%lx ", node->gap[i]); break; case mt_dump_dec: pr_cont("%lu ", node->gap[i]); } } pr_cont("| %02X %02X| ", node->meta.end, node->meta.gap); for (i 
= 0; i < MAPLE_ARANGE64_SLOTS - 1; i++) { switch (format) { case mt_dump_hex: pr_cont(PTR_FMT " %lX ", node->slot[i], node->pivot[i]); break; case mt_dump_dec: pr_cont(PTR_FMT " %lu ", node->slot[i], node->pivot[i]); } } pr_cont(PTR_FMT "\n", node->slot[i]); for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) { unsigned long last = max; if (i < (MAPLE_ARANGE64_SLOTS - 1)) last = node->pivot[i]; else if (!node->slot[i]) break; if (last == 0 && i > 0) break; if (node->slot[i]) mt_dump_node(mt, mt_slot(mt, node->slot, i), first, last, depth + 1, format); if (last == max) break; if (last > max) { switch(format) { case mt_dump_hex: pr_err("node " PTR_FMT " last (%lx) > max (%lx) at pivot %d!\n", node, last, max, i); break; case mt_dump_dec: pr_err("node " PTR_FMT " last (%lu) > max (%lu) at pivot %d!\n", node, last, max, i); } } first = last + 1; } } static void mt_dump_node(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { struct maple_node *node = mte_to_node(entry); unsigned int type = mte_node_type(entry); unsigned int i; mt_dump_range(min, max, depth, format); pr_cont("node " PTR_FMT " depth %d type %d parent " PTR_FMT, node, depth, type, node ? node->parent : NULL); switch (type) { case maple_dense: pr_cont("\n"); for (i = 0; i < MAPLE_NODE_SLOTS; i++) { if (min + i > max) pr_cont("OUT OF RANGE: "); mt_dump_entry(mt_slot(mt, node->slot, i), min + i, min + i, depth, format); } break; case maple_leaf_64: case maple_range_64: mt_dump_range64(mt, entry, min, max, depth, format); break; case maple_arange_64: mt_dump_arange64(mt, entry, min, max, depth, format); break; default: pr_cont(" UNKNOWN TYPE\n"); } } void mt_dump(const struct maple_tree *mt, enum mt_dump_format format) { void *entry = rcu_dereference_check(mt->ma_root, mt_locked(mt)); pr_info("maple_tree(" PTR_FMT ") flags %X, height %u root " PTR_FMT "\n", mt, mt->ma_flags, mt_height(mt), entry); if (xa_is_node(entry)) mt_dump_node(mt, entry, 0, mt_node_max(entry), 0, format); else if (entry) mt_dump_entry(entry, 0, 0, 0, format); else pr_info("(empty)\n"); } EXPORT_SYMBOL_GPL(mt_dump); /* * Calculate the maximum gap in a node and check if that's what is reported in * the parent (unless root). 
*/ static void mas_validate_gaps(struct ma_state *mas) { struct maple_enode *mte = mas->node; struct maple_node *p_mn, *node = mte_to_node(mte); enum maple_type mt = mte_node_type(mas->node); unsigned long gap = 0, max_gap = 0; unsigned long p_end, p_start = mas->min; unsigned char p_slot, offset; unsigned long *gaps = NULL; unsigned long *pivots = ma_pivots(node, mt); unsigned int i; if (ma_is_dense(mt)) { for (i = 0; i < mt_slot_count(mte); i++) { if (mas_get_slot(mas, i)) { if (gap > max_gap) max_gap = gap; gap = 0; continue; } gap++; } goto counted; } gaps = ma_gaps(node, mt); for (i = 0; i < mt_slot_count(mte); i++) { p_end = mas_safe_pivot(mas, pivots, i, mt); if (!gaps) { if (!mas_get_slot(mas, i)) gap = p_end - p_start + 1; } else { void *entry = mas_get_slot(mas, i); gap = gaps[i]; MT_BUG_ON(mas->tree, !entry); if (gap > p_end - p_start + 1) { pr_err(PTR_FMT "[%u] %lu >= %lu - %lu + 1 (%lu)\n", mas_mn(mas), i, gap, p_end, p_start, p_end - p_start + 1); MT_BUG_ON(mas->tree, gap > p_end - p_start + 1); } } if (gap > max_gap) max_gap = gap; p_start = p_end + 1; if (p_end >= mas->max) break; } counted: if (mt == maple_arange_64) { MT_BUG_ON(mas->tree, !gaps); offset = ma_meta_gap(node); if (offset > i) { pr_err("gap offset " PTR_FMT "[%u] is invalid\n", node, offset); MT_BUG_ON(mas->tree, 1); } if (gaps[offset] != max_gap) { pr_err("gap " PTR_FMT "[%u] is not the largest gap %lu\n", node, offset, max_gap); MT_BUG_ON(mas->tree, 1); } for (i++ ; i < mt_slot_count(mte); i++) { if (gaps[i] != 0) { pr_err("gap " PTR_FMT "[%u] beyond node limit != 0\n", node, i); MT_BUG_ON(mas->tree, 1); } } } if (mte_is_root(mte)) return; p_slot = mte_parent_slot(mas->node); p_mn = mte_parent(mte); MT_BUG_ON(mas->tree, max_gap > mas->max); if (ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap) { pr_err("gap " PTR_FMT "[%u] != %lu\n", p_mn, p_slot, max_gap); mt_dump(mas->tree, mt_dump_hex); MT_BUG_ON(mas->tree, 1); } } static void mas_validate_parent_slot(struct ma_state *mas) { struct maple_node *parent; struct maple_enode *node; enum maple_type p_type; unsigned char p_slot; void __rcu **slots; int i; if (mte_is_root(mas->node)) return; p_slot = mte_parent_slot(mas->node); p_type = mas_parent_type(mas, mas->node); parent = mte_parent(mas->node); slots = ma_slots(parent, p_type); MT_BUG_ON(mas->tree, mas_mn(mas) == parent); /* Check prev/next parent slot for duplicate node entry */ for (i = 0; i < mt_slots[p_type]; i++) { node = mas_slot(mas, slots, i); if (i == p_slot) { if (node != mas->node) pr_err("parent " PTR_FMT "[%u] does not have " PTR_FMT "\n", parent, i, mas_mn(mas)); MT_BUG_ON(mas->tree, node != mas->node); } else if (node == mas->node) { pr_err("Invalid child " PTR_FMT " at parent " PTR_FMT "[%u] p_slot %u\n", mas_mn(mas), parent, i, p_slot); MT_BUG_ON(mas->tree, node == mas->node); } } } static void mas_validate_child_slot(struct ma_state *mas) { enum maple_type type = mte_node_type(mas->node); void __rcu **slots = ma_slots(mte_to_node(mas->node), type); unsigned long *pivots = ma_pivots(mte_to_node(mas->node), type); struct maple_enode *child; unsigned char i; if (mte_is_leaf(mas->node)) return; for (i = 0; i < mt_slots[type]; i++) { child = mas_slot(mas, slots, i); if (!child) { pr_err("Non-leaf node lacks child at " PTR_FMT "[%u]\n", mas_mn(mas), i); MT_BUG_ON(mas->tree, 1); } if (mte_parent_slot(child) != i) { pr_err("Slot error at " PTR_FMT "[%u]: child " PTR_FMT " has pslot %u\n", mas_mn(mas), i, mte_to_node(child), mte_parent_slot(child)); MT_BUG_ON(mas->tree, 1); } if 
(mte_parent(child) != mte_to_node(mas->node)) { pr_err("child " PTR_FMT " has parent " PTR_FMT " not " PTR_FMT "\n", mte_to_node(child), mte_parent(child), mte_to_node(mas->node)); MT_BUG_ON(mas->tree, 1); } if (i < mt_pivots[type] && pivots[i] == mas->max) break; } } /* * Validate all pivots are within mas->min and mas->max, check metadata ends * where the maximum ends and ensure there is no slots or pivots set outside of * the end of the data. */ static void mas_validate_limits(struct ma_state *mas) { int i; unsigned long prev_piv = 0; enum maple_type type = mte_node_type(mas->node); void __rcu **slots = ma_slots(mte_to_node(mas->node), type); unsigned long *pivots = ma_pivots(mas_mn(mas), type); for (i = 0; i < mt_slots[type]; i++) { unsigned long piv; piv = mas_safe_pivot(mas, pivots, i, type); if (!piv && (i != 0)) { pr_err("Missing node limit pivot at " PTR_FMT "[%u]", mas_mn(mas), i); MAS_WARN_ON(mas, 1); } if (prev_piv > piv) { pr_err(PTR_FMT "[%u] piv %lu < prev_piv %lu\n", mas_mn(mas), i, piv, prev_piv); MAS_WARN_ON(mas, piv < prev_piv); } if (piv < mas->min) { pr_err(PTR_FMT "[%u] %lu < %lu\n", mas_mn(mas), i, piv, mas->min); MAS_WARN_ON(mas, piv < mas->min); } if (piv > mas->max) { pr_err(PTR_FMT "[%u] %lu > %lu\n", mas_mn(mas), i, piv, mas->max); MAS_WARN_ON(mas, piv > mas->max); } prev_piv = piv; if (piv == mas->max) break; } if (mas_data_end(mas) != i) { pr_err("node" PTR_FMT ": data_end %u != the last slot offset %u\n", mas_mn(mas), mas_data_end(mas), i); MT_BUG_ON(mas->tree, 1); } for (i += 1; i < mt_slots[type]; i++) { void *entry = mas_slot(mas, slots, i); if (entry && (i != mt_slots[type] - 1)) { pr_err(PTR_FMT "[%u] should not have entry " PTR_FMT "\n", mas_mn(mas), i, entry); MT_BUG_ON(mas->tree, entry != NULL); } if (i < mt_pivots[type]) { unsigned long piv = pivots[i]; if (!piv) continue; pr_err(PTR_FMT "[%u] should not have piv %lu\n", mas_mn(mas), i, piv); MAS_WARN_ON(mas, i < mt_pivots[type] - 1); } } } static void mt_validate_nulls(struct maple_tree *mt) { void *entry, *last = (void *)1; unsigned char offset = 0; void __rcu **slots; MA_STATE(mas, mt, 0, 0); mas_start(&mas); if (mas_is_none(&mas) || (mas_is_ptr(&mas))) return; while (!mte_is_leaf(mas.node)) mas_descend(&mas); slots = ma_slots(mte_to_node(mas.node), mte_node_type(mas.node)); do { entry = mas_slot(&mas, slots, offset); if (!last && !entry) { pr_err("Sequential nulls end at " PTR_FMT "[%u]\n", mas_mn(&mas), offset); } MT_BUG_ON(mt, !last && !entry); last = entry; if (offset == mas_data_end(&mas)) { mas_next_node(&mas, mas_mn(&mas), ULONG_MAX); if (mas_is_overflow(&mas)) return; offset = 0; slots = ma_slots(mte_to_node(mas.node), mte_node_type(mas.node)); } else { offset++; } } while (!mas_is_overflow(&mas)); } /* * validate a maple tree by checking: * 1. The limits (pivots are within mas->min to mas->max) * 2. 
The gap is correctly set in the parents */ void mt_validate(struct maple_tree *mt) __must_hold(mas->tree->ma_lock) { unsigned char end; MA_STATE(mas, mt, 0, 0); mas_start(&mas); if (!mas_is_active(&mas)) return; while (!mte_is_leaf(mas.node)) mas_descend(&mas); while (!mas_is_overflow(&mas)) { MAS_WARN_ON(&mas, mte_dead_node(mas.node)); end = mas_data_end(&mas); if (MAS_WARN_ON(&mas, (end < mt_min_slot_count(mas.node)) && (!mte_is_root(mas.node)))) { pr_err("Invalid size %u of " PTR_FMT "\n", end, mas_mn(&mas)); } mas_validate_parent_slot(&mas); mas_validate_limits(&mas); mas_validate_child_slot(&mas); if (mt_is_alloc(mt)) mas_validate_gaps(&mas); mas_dfs_postorder(&mas, ULONG_MAX); } mt_validate_nulls(mt); } EXPORT_SYMBOL_GPL(mt_validate); void mas_dump(const struct ma_state *mas) { pr_err("MAS: tree=" PTR_FMT " enode=" PTR_FMT " ", mas->tree, mas->node); switch (mas->status) { case ma_active: pr_err("(ma_active)"); break; case ma_none: pr_err("(ma_none)"); break; case ma_root: pr_err("(ma_root)"); break; case ma_start: pr_err("(ma_start) "); break; case ma_pause: pr_err("(ma_pause) "); break; case ma_overflow: pr_err("(ma_overflow) "); break; case ma_underflow: pr_err("(ma_underflow) "); break; case ma_error: pr_err("(ma_error) "); break; } pr_err("Store Type: "); switch (mas->store_type) { case wr_invalid: pr_err("invalid store type\n"); break; case wr_new_root: pr_err("new_root\n"); break; case wr_store_root: pr_err("store_root\n"); break; case wr_exact_fit: pr_err("exact_fit\n"); break; case wr_split_store: pr_err("split_store\n"); break; case wr_slot_store: pr_err("slot_store\n"); break; case wr_append: pr_err("append\n"); break; case wr_node_store: pr_err("node_store\n"); break; case wr_spanning_store: pr_err("spanning_store\n"); break; case wr_rebalance: pr_err("rebalance\n"); break; } pr_err("[%u/%u] index=%lx last=%lx\n", mas->offset, mas->end, mas->index, mas->last); pr_err(" min=%lx max=%lx sheaf=" PTR_FMT ", request %lu depth=%u, flags=%x\n", mas->min, mas->max, mas->sheaf, mas->node_request, mas->depth, mas->mas_flags); if (mas->index > mas->last) pr_err("Check index & last\n"); } EXPORT_SYMBOL_GPL(mas_dump); void mas_wr_dump(const struct ma_wr_state *wr_mas) { pr_err("WR_MAS: node=" PTR_FMT " r_min=%lx r_max=%lx\n", wr_mas->node, wr_mas->r_min, wr_mas->r_max); pr_err(" type=%u off_end=%u, node_end=%u, end_piv=%lx\n", wr_mas->type, wr_mas->offset_end, wr_mas->mas->end, wr_mas->end_piv); } EXPORT_SYMBOL_GPL(mas_wr_dump); #endif /* CONFIG_DEBUG_MAPLE_TREE */ |
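/*
 * Illustrative usage sketch (not part of the maple tree source above): one
 * plausible way the normal (mtree_*) and advanced (mas_*) interfaces
 * implemented in this file might be driven from a kernel module.  The
 * demo_* names are hypothetical and error handling is abbreviated.
 */
#include <linux/maple_tree.h>
#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

static DEFINE_MTREE(demo_tree);
static int demo_payload[2] = { 1, 2 };

static int __init demo_maple_init(void)
{
	MA_STATE(mas, &demo_tree, 0, 0);
	unsigned long index = 0;
	void *entry;
	int ret;

	/* Normal API: store pointers over the ranges [0, 9] and [10, 19]. */
	ret = mtree_store_range(&demo_tree, 0, 9, &demo_payload[0], GFP_KERNEL);
	if (ret)
		return ret;
	ret = mtree_store_range(&demo_tree, 10, 19, &demo_payload[1], GFP_KERNEL);
	if (ret)
		return ret;

	/* mtree_insert() refuses to overwrite; an occupied index yields -EEXIST. */
	ret = mtree_insert(&demo_tree, 5, &demo_payload[1], GFP_KERNEL);
	WARN_ON(ret != -EEXIST);

	/* Point lookup: mtree_load() takes the RCU read lock internally. */
	WARN_ON(mtree_load(&demo_tree, 15) != &demo_payload[1]);

	/* Iteration with the normal API: mt_find() advances *index past each hit. */
	while ((entry = mt_find(&demo_tree, &index, ULONG_MAX)))
		pr_info("mt_find: %p, next index %lu\n", entry, index);

	/* Advanced API: iterate under RCU, pausing to drop the lock per entry. */
	rcu_read_lock();
	mas_for_each(&mas, entry, ULONG_MAX) {
		pr_info("mas_for_each: [%lu, %lu] -> %p\n", mas.index, mas.last, entry);
		mas_pause(&mas);	/* make it safe to drop and retake the lock */
		rcu_read_unlock();
		cond_resched();
		rcu_read_lock();
	}
	rcu_read_unlock();

	/* Erase the whole range containing index 3, then free the tree. */
	mtree_erase(&demo_tree, 3);
	mtree_destroy(&demo_tree);
	return 0;
}
module_init(demo_maple_init);
MODULE_LICENSE("GPL");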
| 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 | // SPDX-License-Identifier: GPL-2.0 /* * memfd_create system call and file sealing support * * Code was originally included in shmem.c, and broken out to facilitate * use by hugetlbfs as well as tmpfs. */ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/pagemap.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/khugepaged.h> #include <linux/syscalls.h> #include <linux/hugetlb.h> #include <linux/shmem_fs.h> #include <linux/memfd.h> #include <linux/pid_namespace.h> #include <uapi/linux/memfd.h> #include "swap.h" /* * We need a tag: a new tag would expand every xa_node by 8 bytes, * so reuse a tag which we firmly believe is never set or cleared on tmpfs * or hugetlbfs because they are memory only filesystems. */ #define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE #define LAST_SCAN 4 /* about 150ms max */ static bool memfd_folio_has_extra_refs(struct folio *folio) { return folio_ref_count(folio) != folio_expected_ref_count(folio); } static void memfd_tag_pins(struct xa_state *xas) { struct folio *folio; int latency = 0; lru_add_drain(); xas_lock_irq(xas); xas_for_each(xas, folio, ULONG_MAX) { if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio)) xas_set_mark(xas, MEMFD_TAG_PINNED); if (++latency < XA_CHECK_SCHED) continue; latency = 0; xas_pause(xas); xas_unlock_irq(xas); cond_resched(); xas_lock_irq(xas); } xas_unlock_irq(xas); } /* * This is a helper function used by memfd_pin_user_pages() in GUP (gup.c). 
* It is mainly called to allocate a folio in a memfd when the caller * (memfd_pin_folios()) cannot find a folio in the page cache at a given * index in the mapping. */ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) { #ifdef CONFIG_HUGETLB_PAGE struct folio *folio; gfp_t gfp_mask; if (is_file_hugepages(memfd)) { /* * The folio would most likely be accessed by a DMA driver, * therefore, we have zone memory constraints where we can * alloc from. Also, the folio will be pinned for an indefinite * amount of time, so it is not expected to be migrated away. */ struct inode *inode = file_inode(memfd); struct hstate *h = hstate_file(memfd); int err = -ENOMEM; long nr_resv; gfp_mask = htlb_alloc_mask(h); gfp_mask &= ~(__GFP_HIGHMEM | __GFP_MOVABLE); idx >>= huge_page_order(h); nr_resv = hugetlb_reserve_pages(inode, idx, idx + 1, NULL, EMPTY_VMA_FLAGS); if (nr_resv < 0) return ERR_PTR(nr_resv); folio = alloc_hugetlb_folio_reserve(h, numa_node_id(), NULL, gfp_mask); if (folio) { u32 hash; /* * Zero the folio to prevent information leaks to userspace. * Use folio_zero_user() which is optimized for huge/gigantic * pages. Pass 0 as addr_hint since this is not a faulting path * and we don't have a user virtual address yet. */ folio_zero_user(folio, 0); /* * Mark the folio uptodate before adding to page cache, * as required by filemap.c and other hugetlb paths. */ __folio_mark_uptodate(folio); /* * Serialize hugepage allocation and instantiation to prevent * races with concurrent allocations, as required by all other * callers of hugetlb_add_to_page_cache(). */ hash = hugetlb_fault_mutex_hash(memfd->f_mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); err = hugetlb_add_to_page_cache(folio, memfd->f_mapping, idx); mutex_unlock(&hugetlb_fault_mutex_table[hash]); if (err) { folio_put(folio); goto err_unresv; } hugetlb_set_folio_subpool(folio, subpool_inode(inode)); folio_unlock(folio); return folio; } err_unresv: if (nr_resv > 0) hugetlb_unreserve_pages(inode, idx, idx + 1, 0); return ERR_PTR(err); } #endif return shmem_read_folio(memfd->f_mapping, idx); } /* * Setting SEAL_WRITE requires us to verify there's no pending writer. However, * via get_user_pages(), drivers might have some pending I/O without any active * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all folios * and see whether it has an elevated ref-count. If so, we tag them and wait for * them to be dropped. * The caller must guarantee that no new user will acquire writable references * to those folios to avoid races. */ static int memfd_wait_for_pins(struct address_space *mapping) { XA_STATE(xas, &mapping->i_pages, 0); struct folio *folio; int error, scan; memfd_tag_pins(&xas); error = 0; for (scan = 0; scan <= LAST_SCAN; scan++) { int latency = 0; if (!xas_marked(&xas, MEMFD_TAG_PINNED)) break; if (!scan) lru_add_drain_all(); else if (schedule_timeout_killable((HZ << scan) / 200)) scan = LAST_SCAN; xas_set(&xas, 0); xas_lock_irq(&xas); xas_for_each_marked(&xas, folio, ULONG_MAX, MEMFD_TAG_PINNED) { bool clear = true; if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio)) { /* * On the last scan, we clean up all those tags * we inserted; but make a note that we still * found folios pinned. 
*/ if (scan == LAST_SCAN) error = -EBUSY; else clear = false; } if (clear) xas_clear_mark(&xas, MEMFD_TAG_PINNED); if (++latency < XA_CHECK_SCHED) continue; latency = 0; xas_pause(&xas); xas_unlock_irq(&xas); cond_resched(); xas_lock_irq(&xas); } xas_unlock_irq(&xas); } return error; } static unsigned int *memfd_file_seals_ptr(struct file *file) { if (shmem_file(file)) return &SHMEM_I(file_inode(file))->seals; #ifdef CONFIG_HUGETLBFS if (is_file_hugepages(file)) return &HUGETLBFS_I(file_inode(file))->seals; #endif return NULL; } #define F_ALL_SEALS (F_SEAL_SEAL | \ F_SEAL_EXEC | \ F_SEAL_SHRINK | \ F_SEAL_GROW | \ F_SEAL_WRITE | \ F_SEAL_FUTURE_WRITE) int memfd_add_seals(struct file *file, unsigned int seals) { struct inode *inode = file_inode(file); unsigned int *file_seals; int error; /* * SEALING * Sealing allows multiple parties to share a tmpfs or hugetlbfs file * but restrict access to a specific subset of file operations. Seals * can only be added, but never removed. This way, mutually untrusted * parties can share common memory regions with a well-defined policy. * A malicious peer can thus never perform unwanted operations on a * shared object. * * Seals are only supported on special tmpfs or hugetlbfs files and * always affect the whole underlying inode. Once a seal is set, it * may prevent some kinds of access to the file. Currently, the * following seals are defined: * SEAL_SEAL: Prevent further seals from being set on this file * SEAL_SHRINK: Prevent the file from shrinking * SEAL_GROW: Prevent the file from growing * SEAL_WRITE: Prevent write access to the file * SEAL_EXEC: Prevent modification of the exec bits in the file mode * * As we don't require any trust relationship between two parties, we * must prevent seals from being removed. Therefore, sealing a file * only adds a given set of seals to the file, it never touches * existing seals. Furthermore, the "setting seals"-operation can be * sealed itself, which basically prevents any further seal from being * added. * * Semantics of sealing are only defined on volatile files. Only * anonymous tmpfs and hugetlbfs files support sealing. More * importantly, seals are never written to disk. Therefore, there's * no plan to support it on other file types. */ if (!(file->f_mode & FMODE_WRITE)) return -EPERM; if (seals & ~(unsigned int)F_ALL_SEALS) return -EINVAL; inode_lock(inode); file_seals = memfd_file_seals_ptr(file); if (!file_seals) { error = -EINVAL; goto unlock; } if (*file_seals & F_SEAL_SEAL) { error = -EPERM; goto unlock; } if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { error = mapping_deny_writable(file->f_mapping); if (error) goto unlock; error = memfd_wait_for_pins(file->f_mapping); if (error) { mapping_allow_writable(file->f_mapping); goto unlock; } } /* * SEAL_EXEC implies SEAL_WRITE, making W^X from the start. */ if (seals & F_SEAL_EXEC && inode->i_mode & 0111) seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE; *file_seals |= seals; error = 0; unlock: inode_unlock(inode); return error; } int memfd_get_seals(struct file *file) { unsigned int *seals = memfd_file_seals_ptr(file); return seals ? 
*seals : -EINVAL; } long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg) { long error; switch (cmd) { case F_ADD_SEALS: error = memfd_add_seals(file, arg); break; case F_GET_SEALS: error = memfd_get_seals(file); break; default: error = -EINVAL; break; } return error; } #define MFD_NAME_PREFIX "memfd:" #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) #define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC) static int check_sysctl_memfd_noexec(unsigned int *flags) { #ifdef CONFIG_SYSCTL struct pid_namespace *ns = task_active_pid_ns(current); int sysctl = pidns_memfd_noexec_scope(ns); if (!(*flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) { if (sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL) *flags |= MFD_NOEXEC_SEAL; else *flags |= MFD_EXEC; } if (!(*flags & MFD_NOEXEC_SEAL) && sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED) { pr_err_ratelimited( "%s[%d]: memfd_create() requires MFD_NOEXEC_SEAL with vm.memfd_noexec=%d\n", current->comm, task_pid_nr(current), sysctl); return -EACCES; } #endif return 0; } static inline bool is_write_sealed(unsigned int seals) { return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); } static int check_write_seal(vm_flags_t *vm_flags_ptr) { vm_flags_t vm_flags = *vm_flags_ptr; vm_flags_t mask = vm_flags & (VM_SHARED | VM_WRITE); /* If a private mapping then writability is irrelevant. */ if (!(mask & VM_SHARED)) return 0; /* * New PROT_WRITE and MAP_SHARED mmaps are not allowed when * write seals are active. */ if (mask & VM_WRITE) return -EPERM; /* * This is a read-only mapping, disallow mprotect() from making a * write-sealed mapping writable in future. */ *vm_flags_ptr &= ~VM_MAYWRITE; return 0; } int memfd_check_seals_mmap(struct file *file, vm_flags_t *vm_flags_ptr) { int err = 0; unsigned int *seals_ptr = memfd_file_seals_ptr(file); unsigned int seals = seals_ptr ? *seals_ptr : 0; if (is_write_sealed(seals)) err = check_write_seal(vm_flags_ptr); return err; } static int sanitize_flags(unsigned int *flags_ptr) { unsigned int flags = *flags_ptr; if (!(flags & MFD_HUGETLB)) { if (flags & ~MFD_ALL_FLAGS) return -EINVAL; } else { /* Allow huge page size encoding in flags. 
*/ if (flags & ~(MFD_ALL_FLAGS | (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) return -EINVAL; } /* Invalid if both EXEC and NOEXEC_SEAL are set.*/ if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL)) return -EINVAL; return check_sysctl_memfd_noexec(flags_ptr); } static char *alloc_name(const char __user *uname) { int error; char *name; long len; name = kmalloc(NAME_MAX + 1, GFP_KERNEL); if (!name) return ERR_PTR(-ENOMEM); memcpy(name, MFD_NAME_PREFIX, MFD_NAME_PREFIX_LEN); /* returned length does not include terminating zero */ len = strncpy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, MFD_NAME_MAX_LEN + 1); if (len < 0) { error = -EFAULT; goto err_name; } else if (len > MFD_NAME_MAX_LEN) { error = -EINVAL; goto err_name; } return name; err_name: kfree(name); return ERR_PTR(error); } struct file *memfd_alloc_file(const char *name, unsigned int flags) { unsigned int *file_seals; struct file *file; struct inode *inode; int err = 0; if (flags & MFD_HUGETLB) { file = hugetlb_file_setup(name, 0, mk_vma_flags(VMA_NORESERVE_BIT), HUGETLB_ANONHUGE_INODE, (flags >> MFD_HUGE_SHIFT) & MFD_HUGE_MASK); } else { file = shmem_file_setup(name, 0, mk_vma_flags(VMA_NORESERVE_BIT)); } if (IS_ERR(file)) return file; inode = file_inode(file); err = security_inode_init_security_anon(inode, &QSTR(MEMFD_ANON_NAME), NULL); if (err) { fput(file); file = ERR_PTR(err); return file; } file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; file->f_flags |= O_LARGEFILE; if (flags & MFD_NOEXEC_SEAL) { inode->i_mode &= ~0111; file_seals = memfd_file_seals_ptr(file); if (file_seals) { *file_seals &= ~F_SEAL_SEAL; *file_seals |= F_SEAL_EXEC; } } else if (flags & MFD_ALLOW_SEALING) { /* MFD_EXEC and MFD_ALLOW_SEALING are set */ file_seals = memfd_file_seals_ptr(file); if (file_seals) *file_seals &= ~F_SEAL_SEAL; } return file; } SYSCALL_DEFINE2(memfd_create, const char __user *, uname, unsigned int, flags) { char *name __free(kfree) = NULL; unsigned int fd_flags; int error; error = sanitize_flags(&flags); if (error < 0) return error; name = alloc_name(uname); if (IS_ERR(name)) return PTR_ERR(name); fd_flags = (flags & MFD_CLOEXEC) ? O_CLOEXEC : 0; return FD_ADD(fd_flags, memfd_alloc_file(name, flags)); } |
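The sealing flow implemented above (memfd_create() -> memfd_add_seals() via fcntl(F_ADD_SEALS)) is driven entirely from userspace. Below is a minimal userspace sketch, not part of the kernel sources, showing the usual create, populate, seal sequence and the EPERM a write against F_SEAL_WRITE produces; the "example" name and buffer contents are illustrative, and it assumes a libc that exposes memfd_create() and the F_SEAL_* constants.

#define _GNU_SOURCE
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

int main(void)
{
	/* MFD_ALLOW_SEALING leaves F_SEAL_SEAL clear so seals can be added later. */
	int fd = memfd_create("example", MFD_CLOEXEC | MFD_ALLOW_SEALING);

	if (fd < 0) {
		perror("memfd_create");
		return EXIT_FAILURE;
	}
	if (ftruncate(fd, 4096) < 0 || write(fd, "hello", 5) != 5) {
		perror("populate");
		return EXIT_FAILURE;
	}
	/* Freeze size and contents, then forbid adding any further seals. */
	if (fcntl(fd, F_ADD_SEALS,
		  F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE | F_SEAL_SEAL) < 0) {
		perror("F_ADD_SEALS");
		return EXIT_FAILURE;
	}
	/* write(2) now fails with EPERM because F_SEAL_WRITE is set. */
	if (write(fd, "x", 1) < 0 && errno == EPERM)
		printf("write blocked, seals=0x%x\n", fcntl(fd, F_GET_SEALS));
	return EXIT_SUCCESS;
}

Note that MFD_ALLOW_SEALING (or MFD_NOEXEC_SEAL) must be passed at creation time: otherwise memfd_alloc_file() leaves F_SEAL_SEAL set and the later F_ADD_SEALS call fails with EPERM.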
// SPDX-License-Identifier: GPL-2.0-only /* * proc/fs/generic.c --- generic routines for the proc-fs * * This file contains generic proc-fs routines for handling * directories and files.
* * Copyright (C) 1991, 1992 Linus Torvalds. * Copyright (C) 1997 Theodore Ts'o */ #include <linux/cache.h> #include <linux/errno.h> #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/namei.h> #include <linux/slab.h> #include <linux/printk.h> #include <linux/mount.h> #include <linux/init.h> #include <linux/idr.h> #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/uaccess.h> #include <linux/seq_file.h> #include "internal.h" static DEFINE_RWLOCK(proc_subdir_lock); struct kmem_cache *proc_dir_entry_cache __ro_after_init; void pde_free(struct proc_dir_entry *pde) { if (S_ISLNK(pde->mode)) kfree(pde->data); if (pde->name != pde->inline_name) kfree(pde->name); kmem_cache_free(proc_dir_entry_cache, pde); } static int proc_match(const char *name, struct proc_dir_entry *de, unsigned int len) { if (len < de->namelen) return -1; if (len > de->namelen) return 1; return memcmp(name, de->name, len); } static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir) { return rb_entry_safe(rb_first(&dir->subdir), struct proc_dir_entry, subdir_node); } static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir) { return rb_entry_safe(rb_next(&dir->subdir_node), struct proc_dir_entry, subdir_node); } static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, const char *name, unsigned int len) { struct rb_node *node = dir->subdir.rb_node; while (node) { struct proc_dir_entry *de = rb_entry(node, struct proc_dir_entry, subdir_node); int result = proc_match(name, de, len); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return de; } return NULL; } static bool pde_subdir_insert(struct proc_dir_entry *dir, struct proc_dir_entry *de) { struct rb_root *root = &dir->subdir; struct rb_node **new = &root->rb_node, *parent = NULL; /* Figure out where to put new node */ while (*new) { struct proc_dir_entry *this = rb_entry(*new, struct proc_dir_entry, subdir_node); int result = proc_match(de->name, this, de->namelen); parent = *new; if (result < 0) new = &(*new)->rb_left; else if (result > 0) new = &(*new)->rb_right; else return false; } /* Add new node and rebalance tree. */ rb_link_node(&de->subdir_node, parent, new); rb_insert_color(&de->subdir_node, root); return true; } static int proc_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); struct proc_dir_entry *de = PDE(inode); int error; error = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (error) return error; setattr_copy(&nop_mnt_idmap, inode, iattr); proc_set_user(de, inode->i_uid, inode->i_gid); de->mode = inode->i_mode; return 0; } static int proc_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct proc_dir_entry *de = PDE(inode); if (de) { nlink_t nlink = READ_ONCE(de->nlink); if (nlink > 0) { set_nlink(inode, nlink); } } generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } static const struct inode_operations proc_file_inode_operations = { .setattr = proc_setattr, }; /* * This function parses a name such as "tty/driver/serial", and * returns the struct proc_dir_entry for "/proc/tty/driver", and * returns "serial" in residual. 
*/ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret, const char **residual) { const char *cp = name, *next; struct proc_dir_entry *de; de = *ret ?: &proc_root; while ((next = strchr(cp, '/')) != NULL) { de = pde_subdir_find(de, cp, next - cp); if (!de) { WARN(1, "name '%s'\n", name); return -ENOENT; } cp = next + 1; } *residual = cp; *ret = de; return 0; } static int xlate_proc_name(const char *name, struct proc_dir_entry **ret, const char **residual) { int rv; read_lock(&proc_subdir_lock); rv = __xlate_proc_name(name, ret, residual); read_unlock(&proc_subdir_lock); return rv; } static DEFINE_IDA(proc_inum_ida); #define PROC_DYNAMIC_FIRST 0xF0000000U /* * Return an inode number between PROC_DYNAMIC_FIRST and * 0xffffffff, or zero on failure. */ int proc_alloc_inum(unsigned int *inum) { int i; i = ida_alloc_max(&proc_inum_ida, UINT_MAX - PROC_DYNAMIC_FIRST, GFP_KERNEL); if (i < 0) return i; *inum = PROC_DYNAMIC_FIRST + (unsigned int)i; return 0; } void proc_free_inum(unsigned int inum) { ida_free(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); } static int proc_misc_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; if (atomic_read(&PDE(d_inode(dentry))->in_use) < 0) return 0; /* revalidate */ return 1; } static int proc_misc_d_delete(const struct dentry *dentry) { return atomic_read(&PDE(d_inode(dentry))->in_use) < 0; } static const struct dentry_operations proc_misc_dentry_ops = { .d_revalidate = proc_misc_d_revalidate, .d_delete = proc_misc_d_delete, }; /* * Don't create negative dentries here, return -ENOENT by hand * instead. */ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry, struct proc_dir_entry *de) { struct inode *inode; read_lock(&proc_subdir_lock); de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len); if (de) { pde_get(de); read_unlock(&proc_subdir_lock); inode = proc_get_inode(dir->i_sb, de); if (!inode) return ERR_PTR(-ENOMEM); if (de->flags & PROC_ENTRY_FORCE_LOOKUP) return d_splice_alias_ops(inode, dentry, &proc_net_dentry_ops); return d_splice_alias_ops(inode, dentry, &proc_misc_dentry_ops); } read_unlock(&proc_subdir_lock); return ERR_PTR(-ENOENT); } struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct proc_fs_info *fs_info = proc_sb_info(dir->i_sb); if (fs_info->pidonly == PROC_PIDONLY_ON) return ERR_PTR(-ENOENT); return proc_lookup_de(dir, dentry, PDE(dir)); } /* * This returns non-zero if at EOF, so that the /proc * root directory can use this and check if it should * continue with the <pid> entries.. * * Note that the VFS-layer doesn't care about the return * value of the readdir() call, as long as it's non-negative * for success.. 
*/ int proc_readdir_de(struct file *file, struct dir_context *ctx, struct proc_dir_entry *de) { int i; if (!dir_emit_dots(file, ctx)) return 0; i = ctx->pos - 2; read_lock(&proc_subdir_lock); de = pde_subdir_first(de); for (;;) { if (!de) { read_unlock(&proc_subdir_lock); return 0; } if (!i) break; de = pde_subdir_next(de); i--; } do { struct proc_dir_entry *next; pde_get(de); read_unlock(&proc_subdir_lock); if (!dir_emit(ctx, de->name, de->namelen, de->low_ino, de->mode >> 12)) { pde_put(de); return 0; } ctx->pos++; read_lock(&proc_subdir_lock); next = pde_subdir_next(de); pde_put(de); de = next; } while (de); read_unlock(&proc_subdir_lock); return 1; } int proc_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); if (fs_info->pidonly == PROC_PIDONLY_ON) return 1; return proc_readdir_de(file, ctx, PDE(inode)); } /* * These are the generic /proc directory operations. They * use the in-memory "struct proc_dir_entry" tree to parse * the /proc directory. */ static const struct file_operations proc_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = proc_readdir, }; static int proc_net_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { return 0; } const struct dentry_operations proc_net_dentry_ops = { .d_revalidate = proc_net_d_revalidate, .d_delete = always_delete_dentry, }; /* * proc directories can do almost nothing.. */ static const struct inode_operations proc_dir_inode_operations = { .lookup = proc_lookup, .getattr = proc_getattr, .setattr = proc_setattr, }; static void pde_set_flags(struct proc_dir_entry *pde) { const struct proc_ops *proc_ops = pde->proc_ops; if (!proc_ops) return; if (proc_ops->proc_flags & PROC_ENTRY_PERMANENT) pde->flags |= PROC_ENTRY_PERMANENT; if (proc_ops->proc_read_iter) pde->flags |= PROC_ENTRY_proc_read_iter; #ifdef CONFIG_COMPAT if (proc_ops->proc_compat_ioctl) pde->flags |= PROC_ENTRY_proc_compat_ioctl; #endif if (proc_ops->proc_lseek) pde->flags |= PROC_ENTRY_proc_lseek; } /* returns the registered entry, or frees dp and returns NULL on failure */ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, struct proc_dir_entry *dp) { if (proc_alloc_inum(&dp->low_ino)) goto out_free_entry; if (!S_ISDIR(dp->mode)) pde_set_flags(dp); write_lock(&proc_subdir_lock); dp->parent = dir; if (pde_subdir_insert(dir, dp) == false) { WARN(1, "proc_dir_entry '%s/%s' already registered\n", dir->name, dp->name); write_unlock(&proc_subdir_lock); goto out_free_inum; } dir->nlink++; write_unlock(&proc_subdir_lock); return dp; out_free_inum: proc_free_inum(dp->low_ino); out_free_entry: pde_free(dp); return NULL; } static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, const char *name, umode_t mode, nlink_t nlink) { struct proc_dir_entry *ent = NULL; const char *fn; struct qstr qstr; if (xlate_proc_name(name, parent, &fn) != 0) goto out; qstr.name = fn; qstr.len = strlen(fn); if (qstr.len == 0 || qstr.len >= 256) { WARN(1, "name len %u\n", qstr.len); return NULL; } if (qstr.len == 1 && fn[0] == '.') { WARN(1, "name '.'\n"); return NULL; } if (qstr.len == 2 && fn[0] == '.' 
&& fn[1] == '.') { WARN(1, "name '..'\n"); return NULL; } if (*parent == &proc_root && name_to_int(&qstr) != ~0U) { WARN(1, "create '/proc/%s' by hand\n", qstr.name); return NULL; } if (is_empty_pde(*parent)) { WARN(1, "attempt to add to permanently empty directory"); return NULL; } ent = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL); if (!ent) goto out; if (qstr.len + 1 <= SIZEOF_PDE_INLINE_NAME) { ent->name = ent->inline_name; } else { ent->name = kmalloc(qstr.len + 1, GFP_KERNEL); if (!ent->name) { pde_free(ent); return NULL; } } memcpy(ent->name, fn, qstr.len + 1); ent->namelen = qstr.len; ent->mode = mode; ent->nlink = nlink; ent->subdir = RB_ROOT; refcount_set(&ent->refcnt, 1); spin_lock_init(&ent->pde_unload_lock); INIT_LIST_HEAD(&ent->pde_openers); proc_set_user(ent, (*parent)->uid, (*parent)->gid); /* Revalidate everything under /proc/${pid}/net */ if ((*parent)->flags & PROC_ENTRY_FORCE_LOOKUP) pde_force_lookup(ent); out: return ent; } struct proc_dir_entry *proc_symlink(const char *name, struct proc_dir_entry *parent, const char *dest) { struct proc_dir_entry *ent; ent = __proc_create(&parent, name, (S_IFLNK | S_IRUGO | S_IWUGO | S_IXUGO),1); if (ent) { ent->size = strlen(dest); ent->data = kmemdup(dest, ent->size + 1, GFP_KERNEL); if (ent->data) { ent->proc_iops = &proc_link_inode_operations; ent = proc_register(parent, ent); } else { pde_free(ent); ent = NULL; } } return ent; } EXPORT_SYMBOL(proc_symlink); struct proc_dir_entry *_proc_mkdir(const char *name, umode_t mode, struct proc_dir_entry *parent, void *data, bool force_lookup) { struct proc_dir_entry *ent; if (mode == 0) mode = S_IRUGO | S_IXUGO; ent = __proc_create(&parent, name, S_IFDIR | mode, 2); if (ent) { ent->data = data; ent->proc_dir_ops = &proc_dir_operations; ent->proc_iops = &proc_dir_inode_operations; if (force_lookup) { pde_force_lookup(ent); } ent = proc_register(parent, ent); } return ent; } EXPORT_SYMBOL_GPL(_proc_mkdir); struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, struct proc_dir_entry *parent, void *data) { return _proc_mkdir(name, mode, parent, data, false); } EXPORT_SYMBOL_GPL(proc_mkdir_data); struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode, struct proc_dir_entry *parent) { return proc_mkdir_data(name, mode, parent, NULL); } EXPORT_SYMBOL(proc_mkdir_mode); struct proc_dir_entry *proc_mkdir(const char *name, struct proc_dir_entry *parent) { return proc_mkdir_data(name, 0, parent, NULL); } EXPORT_SYMBOL(proc_mkdir); struct proc_dir_entry *proc_create_mount_point(const char *name) { umode_t mode = S_IFDIR | S_IRUGO | S_IXUGO; struct proc_dir_entry *ent, *parent = NULL; ent = __proc_create(&parent, name, mode, 2); if (ent) { ent->data = NULL; ent->proc_dir_ops = NULL; ent->proc_iops = NULL; ent = proc_register(parent, ent); } return ent; } EXPORT_SYMBOL(proc_create_mount_point); struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, struct proc_dir_entry **parent, void *data) { struct proc_dir_entry *p; if ((mode & S_IFMT) == 0) mode |= S_IFREG; if ((mode & S_IALLUGO) == 0) mode |= S_IRUGO; if (WARN_ON_ONCE(!S_ISREG(mode))) return NULL; p = __proc_create(parent, name, mode, 1); if (p) { p->proc_iops = &proc_file_inode_operations; p->data = data; } return p; } struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct proc_ops *proc_ops, void *data) { struct proc_dir_entry *p; p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; p->proc_ops = proc_ops; return 
proc_register(parent, p); } EXPORT_SYMBOL(proc_create_data); struct proc_dir_entry *proc_create(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct proc_ops *proc_ops) { return proc_create_data(name, mode, parent, proc_ops, NULL); } EXPORT_SYMBOL(proc_create); static int proc_seq_open(struct inode *inode, struct file *file) { struct proc_dir_entry *de = PDE(inode); if (de->state_size) return seq_open_private(file, de->seq_ops, de->state_size); return seq_open(file, de->seq_ops); } static int proc_seq_release(struct inode *inode, struct file *file) { struct proc_dir_entry *de = PDE(inode); if (de->state_size) return seq_release_private(inode, file); return seq_release(inode, file); } static const struct proc_ops proc_seq_ops = { /* not permanent -- can call into arbitrary seq_operations */ .proc_open = proc_seq_open, .proc_read_iter = seq_read_iter, .proc_lseek = seq_lseek, .proc_release = proc_seq_release, }; struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct seq_operations *ops, unsigned int state_size, void *data) { struct proc_dir_entry *p; p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; p->proc_ops = &proc_seq_ops; p->seq_ops = ops; p->state_size = state_size; return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_seq_private); static int proc_single_open(struct inode *inode, struct file *file) { struct proc_dir_entry *de = PDE(inode); return single_open(file, de->single_show, de->data); } static const struct proc_ops proc_single_ops = { /* not permanent -- can call into arbitrary ->single_show */ .proc_open = proc_single_open, .proc_read_iter = seq_read_iter, .proc_lseek = seq_lseek, .proc_release = single_release, }; struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode, struct proc_dir_entry *parent, int (*show)(struct seq_file *, void *), void *data) { struct proc_dir_entry *p; p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; p->proc_ops = &proc_single_ops; p->single_show = show; return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_single_data); void proc_set_size(struct proc_dir_entry *de, loff_t size) { de->size = size; } EXPORT_SYMBOL(proc_set_size); void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid) { de->uid = uid; de->gid = gid; } EXPORT_SYMBOL(proc_set_user); void pde_put(struct proc_dir_entry *pde) { if (refcount_dec_and_test(&pde->refcnt)) { proc_free_inum(pde->low_ino); pde_free(pde); } } static void pde_erase(struct proc_dir_entry *pde, struct proc_dir_entry *parent) { rb_erase(&pde->subdir_node, &parent->subdir); RB_CLEAR_NODE(&pde->subdir_node); } /* * Remove a /proc entry and free it if it's not currently in use. 
*/ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) { struct proc_dir_entry *de = NULL; const char *fn = name; unsigned int len; write_lock(&proc_subdir_lock); if (__xlate_proc_name(name, &parent, &fn) != 0) { write_unlock(&proc_subdir_lock); return; } len = strlen(fn); de = pde_subdir_find(parent, fn, len); if (de) { if (unlikely(pde_is_permanent(de))) { WARN(1, "removing permanent /proc entry '%s'", de->name); de = NULL; } else { pde_erase(de, parent); if (S_ISDIR(de->mode)) parent->nlink--; } } write_unlock(&proc_subdir_lock); if (!de) { WARN(1, "name '%s'\n", name); return; } proc_entry_rundown(de); WARN(pde_subdir_first(de), "%s: removing non-empty directory '%s/%s', leaking at least '%s'\n", __func__, de->parent->name, de->name, pde_subdir_first(de)->name); pde_put(de); } EXPORT_SYMBOL(remove_proc_entry); int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) { struct proc_dir_entry *root = NULL, *de, *next; const char *fn = name; unsigned int len; write_lock(&proc_subdir_lock); if (__xlate_proc_name(name, &parent, &fn) != 0) { write_unlock(&proc_subdir_lock); return -ENOENT; } len = strlen(fn); root = pde_subdir_find(parent, fn, len); if (!root) { write_unlock(&proc_subdir_lock); return -ENOENT; } if (unlikely(pde_is_permanent(root))) { write_unlock(&proc_subdir_lock); WARN(1, "removing permanent /proc entry '%s/%s'", root->parent->name, root->name); return -EINVAL; } pde_erase(root, parent); de = root; while (1) { next = pde_subdir_first(de); if (next) { if (unlikely(pde_is_permanent(next))) { write_unlock(&proc_subdir_lock); WARN(1, "removing permanent /proc entry '%s/%s'", next->parent->name, next->name); return -EINVAL; } pde_erase(next, de); de = next; continue; } next = de->parent; if (S_ISDIR(de->mode)) next->nlink--; write_unlock(&proc_subdir_lock); proc_entry_rundown(de); if (de == root) break; pde_put(de); write_lock(&proc_subdir_lock); de = next; } pde_put(root); return 0; } EXPORT_SYMBOL(remove_proc_subtree); void *proc_get_parent_data(const struct inode *inode) { struct proc_dir_entry *de = PDE(inode); return de->parent->data; } EXPORT_SYMBOL_GPL(proc_get_parent_data); void proc_remove(struct proc_dir_entry *de) { if (de) remove_proc_subtree(de->name, de->parent); } EXPORT_SYMBOL(proc_remove); /* * Pull a user buffer into memory and pass it to the file's write handler if * one is supplied. The ->write() method is permitted to modify the * kernel-side buffer. */ ssize_t proc_simple_write(struct file *f, const char __user *ubuf, size_t size, loff_t *_pos) { struct proc_dir_entry *pde = PDE(file_inode(f)); char *buf; int ret; if (!pde->write) return -EACCES; if (size == 0 || size > PAGE_SIZE - 1) return -EINVAL; buf = memdup_user_nul(ubuf, size); if (IS_ERR(buf)) return PTR_ERR(buf); ret = pde->write(f, buf, size); kfree(buf); return ret == 0 ? size : ret; } |
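For reference, this is how the registration helpers defined above are typically consumed by other kernel code: proc_mkdir() creates a directory entry, proc_create_single() (a wrapper around proc_create_single_data() with NULL data) hooks a seq_file show callback up to proc_single_ops, and proc_remove() tears the subtree down again. The sketch below is an illustrative out-of-tree module, not part of fs/proc/generic.c; the "demo" names are invented.

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static struct proc_dir_entry *demo_dir;

/* Backs /proc/demo/status through the proc_single_ops defined above. */
static int demo_status_show(struct seq_file *m, void *v)
{
	seq_puts(m, "loaded\n");
	return 0;
}

static int __init demo_init(void)
{
	demo_dir = proc_mkdir("demo", NULL);	/* creates /proc/demo */
	if (!demo_dir)
		return -ENOMEM;

	if (!proc_create_single("status", 0444, demo_dir, demo_status_show)) {
		proc_remove(demo_dir);
		return -ENOMEM;
	}
	return 0;
}

static void __exit demo_exit(void)
{
	/* Removes /proc/demo and everything registered beneath it. */
	proc_remove(demo_dir);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Minimal /proc registration sketch");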
// SPDX-License-Identifier: GPL-2.0 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mm.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/mmu_notifier.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/mm_inline.h> #include <linux/kthread.h> #include <linux/khugepaged.h> #include <linux/freezer.h> #include <linux/mman.h> #include <linux/hashtable.h> #include <linux/userfaultfd_k.h> #include <linux/page_idle.h> #include <linux/page_table_check.h> #include <linux/rcupdate_wait.h> #include <linux/leafops.h> #include <linux/shmem_fs.h> #include <linux/dax.h> #include <linux/ksm.h> #include <linux/pgalloc.h> #include <linux/backing-dev.h> #include <asm/tlb.h> #include "internal.h" #include "mm_slot.h" enum
scan_result { SCAN_FAIL, SCAN_SUCCEED, SCAN_NO_PTE_TABLE, SCAN_PMD_MAPPED, SCAN_EXCEED_NONE_PTE, SCAN_EXCEED_SWAP_PTE, SCAN_EXCEED_SHARED_PTE, SCAN_PTE_NON_PRESENT, SCAN_PTE_UFFD_WP, SCAN_PTE_MAPPED_HUGEPAGE, SCAN_LACK_REFERENCED_PAGE, SCAN_PAGE_NULL, SCAN_SCAN_ABORT, SCAN_PAGE_COUNT, SCAN_PAGE_LRU, SCAN_PAGE_LOCK, SCAN_PAGE_ANON, SCAN_PAGE_LAZYFREE, SCAN_PAGE_COMPOUND, SCAN_ANY_PROCESS, SCAN_VMA_NULL, SCAN_VMA_CHECK, SCAN_ADDRESS_RANGE, SCAN_DEL_PAGE_LRU, SCAN_ALLOC_HUGE_PAGE_FAIL, SCAN_CGROUP_CHARGE_FAIL, SCAN_TRUNCATED, SCAN_PAGE_HAS_PRIVATE, SCAN_STORE_FAILED, SCAN_COPY_MC, SCAN_PAGE_FILLED, SCAN_PAGE_DIRTY_OR_WRITEBACK, }; #define CREATE_TRACE_POINTS #include <trace/events/huge_memory.h> static struct task_struct *khugepaged_thread __read_mostly; static DEFINE_MUTEX(khugepaged_mutex); /* * default scan 8*HPAGE_PMD_NR ptes, pte_mapped_hugepage, pmd_mapped, * no_pte_table or vmas every 10 second. */ static unsigned int khugepaged_pages_to_scan __read_mostly; static unsigned int khugepaged_pages_collapsed; static unsigned int khugepaged_full_scans; static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; /* during fragmentation poll the hugepage allocator once every minute */ static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; static unsigned long khugepaged_sleep_expire; static DEFINE_SPINLOCK(khugepaged_mm_lock); static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); /* * default collapse hugepages if there is at least one pte mapped like * it would have happened if the vma was large enough during page * fault. * * Note that these are only respected if collapse was initiated by khugepaged. */ #define KHUGEPAGED_MAX_PTES_LIMIT (HPAGE_PMD_NR - 1) unsigned int khugepaged_max_ptes_none __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly; static unsigned int khugepaged_max_ptes_shared __read_mostly; #define MM_SLOTS_HASH_BITS 10 static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct kmem_cache *mm_slot_cache __ro_after_init; struct collapse_control { bool is_khugepaged; /* Num pages scanned per node */ u32 node_load[MAX_NUMNODES]; /* Num pages scanned (see khugepaged_pages_to_scan) */ unsigned int progress; /* nodemask for allocation fallback */ nodemask_t alloc_nmask; }; /** * struct khugepaged_scan - cursor for scanning * @mm_head: the head of the mm list to scan * @mm_slot: the current mm_slot we are scanning * @address: the next address inside that to be scanned * * There is only the one khugepaged_scan instance of this cursor structure. 
*/ struct khugepaged_scan { struct list_head mm_head; struct mm_slot *mm_slot; unsigned long address; }; static struct khugepaged_scan khugepaged_scan = { .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), }; #ifdef CONFIG_SYSFS static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs); } static ssize_t __sleep_millisecs_store(const char *buf, size_t count, unsigned int *millisecs) { unsigned int msecs; int err; err = kstrtouint(buf, 10, &msecs); if (err) return -EINVAL; *millisecs = msecs; khugepaged_sleep_expire = 0; wake_up_interruptible(&khugepaged_wait); return count; } static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { return __sleep_millisecs_store(buf, count, &khugepaged_scan_sleep_millisecs); } static struct kobj_attribute scan_sleep_millisecs_attr = __ATTR_RW(scan_sleep_millisecs); static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs); } static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { return __sleep_millisecs_store(buf, count, &khugepaged_alloc_sleep_millisecs); } static struct kobj_attribute alloc_sleep_millisecs_attr = __ATTR_RW(alloc_sleep_millisecs); static ssize_t pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan); } static ssize_t pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int pages; int err; err = kstrtouint(buf, 10, &pages); if (err || !pages) return -EINVAL; khugepaged_pages_to_scan = pages; return count; } static struct kobj_attribute pages_to_scan_attr = __ATTR_RW(pages_to_scan); static ssize_t pages_collapsed_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed); } static struct kobj_attribute pages_collapsed_attr = __ATTR_RO(pages_collapsed); static ssize_t full_scans_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_full_scans); } static struct kobj_attribute full_scans_attr = __ATTR_RO(full_scans); static ssize_t defrag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return single_hugepage_flag_show(kobj, attr, buf, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); } static ssize_t defrag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { return single_hugepage_flag_store(kobj, attr, buf, count, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); } static struct kobj_attribute khugepaged_defrag_attr = __ATTR_RW(defrag); /* * max_ptes_none controls if khugepaged should collapse hugepages over * any unmapped ptes in turn potentially increasing the memory * footprint of the vmas. When max_ptes_none is 0 khugepaged will not * reduce the available free memory in the system as it * runs. Increasing max_ptes_none will instead potentially reduce the * free memory in the system during the khugepaged scan. 
*/ static ssize_t max_ptes_none_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none); } static ssize_t max_ptes_none_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long max_ptes_none; err = kstrtoul(buf, 10, &max_ptes_none); if (err || max_ptes_none > KHUGEPAGED_MAX_PTES_LIMIT) return -EINVAL; khugepaged_max_ptes_none = max_ptes_none; return count; } static struct kobj_attribute khugepaged_max_ptes_none_attr = __ATTR_RW(max_ptes_none); static ssize_t max_ptes_swap_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap); } static ssize_t max_ptes_swap_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long max_ptes_swap; err = kstrtoul(buf, 10, &max_ptes_swap); if (err || max_ptes_swap > KHUGEPAGED_MAX_PTES_LIMIT) return -EINVAL; khugepaged_max_ptes_swap = max_ptes_swap; return count; } static struct kobj_attribute khugepaged_max_ptes_swap_attr = __ATTR_RW(max_ptes_swap); static ssize_t max_ptes_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared); } static ssize_t max_ptes_shared_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long max_ptes_shared; err = kstrtoul(buf, 10, &max_ptes_shared); if (err || max_ptes_shared > KHUGEPAGED_MAX_PTES_LIMIT) return -EINVAL; khugepaged_max_ptes_shared = max_ptes_shared; return count; } static struct kobj_attribute khugepaged_max_ptes_shared_attr = __ATTR_RW(max_ptes_shared); static struct attribute *khugepaged_attr[] = { &khugepaged_defrag_attr.attr, &khugepaged_max_ptes_none_attr.attr, &khugepaged_max_ptes_swap_attr.attr, &khugepaged_max_ptes_shared_attr.attr, &pages_to_scan_attr.attr, &pages_collapsed_attr.attr, &full_scans_attr.attr, &scan_sleep_millisecs_attr.attr, &alloc_sleep_millisecs_attr.attr, NULL, }; struct attribute_group khugepaged_attr_group = { .attrs = khugepaged_attr, .name = "khugepaged", }; #endif /* CONFIG_SYSFS */ static bool pte_none_or_zero(pte_t pte) { if (pte_none(pte)) return true; return pte_present(pte) && is_zero_pfn(pte_pfn(pte)); } int hugepage_madvise(struct vm_area_struct *vma, vm_flags_t *vm_flags, int advice) { switch (advice) { case MADV_HUGEPAGE: *vm_flags &= ~VM_NOHUGEPAGE; *vm_flags |= VM_HUGEPAGE; /* * If the vma become good for khugepaged to scan, * register it here without waiting a page fault that * may not happen any time soon. */ khugepaged_enter_vma(vma, *vm_flags); break; case MADV_NOHUGEPAGE: *vm_flags &= ~VM_HUGEPAGE; *vm_flags |= VM_NOHUGEPAGE; /* * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning * this vma even if we leave the mm registered in khugepaged if * it got registered before VM_NOHUGEPAGE was set. 
*/ break; } return 0; } int __init khugepaged_init(void) { mm_slot_cache = KMEM_CACHE(mm_slot, 0); if (!mm_slot_cache) return -ENOMEM; khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; khugepaged_max_ptes_none = KHUGEPAGED_MAX_PTES_LIMIT; khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8; khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2; return 0; } void __init khugepaged_destroy(void) { kmem_cache_destroy(mm_slot_cache); } static inline int collapse_test_exit(struct mm_struct *mm) { return atomic_read(&mm->mm_users) == 0; } static inline int collapse_test_exit_or_disable(struct mm_struct *mm) { return collapse_test_exit(mm) || mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm); } static bool hugepage_pmd_enabled(void) { /* * We cover the anon, shmem and the file-backed case here; file-backed * hugepages, when configured in, are determined by the global control. * Anon pmd-sized hugepages are determined by the pmd-size control. * Shmem pmd-sized hugepages are also determined by its pmd-size control, * except when the global shmem_huge is set to SHMEM_HUGE_DENY. */ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && hugepage_global_enabled()) return true; if (test_bit(PMD_ORDER, &huge_anon_orders_always)) return true; if (test_bit(PMD_ORDER, &huge_anon_orders_madvise)) return true; if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) && hugepage_global_enabled()) return true; if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled()) return true; return false; } void __khugepaged_enter(struct mm_struct *mm) { struct mm_slot *slot; int wakeup; /* __khugepaged_exit() must not run from under us */ VM_BUG_ON_MM(collapse_test_exit(mm), mm); if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm))) return; slot = mm_slot_alloc(mm_slot_cache); if (!slot) return; spin_lock(&khugepaged_mm_lock); mm_slot_insert(mm_slots_hash, mm, slot); /* * Insert just behind the scanning cursor, to let the area settle * down a little. */ wakeup = list_empty(&khugepaged_scan.mm_head); list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head); spin_unlock(&khugepaged_mm_lock); mmgrab(mm); if (wakeup) wake_up_interruptible(&khugepaged_wait); } void khugepaged_enter_vma(struct vm_area_struct *vma, vm_flags_t vm_flags) { if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) && hugepage_pmd_enabled()) { if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) __khugepaged_enter(vma->vm_mm); } } void __khugepaged_exit(struct mm_struct *mm) { struct mm_slot *slot; int free = 0; spin_lock(&khugepaged_mm_lock); slot = mm_slot_lookup(mm_slots_hash, mm); if (slot && khugepaged_scan.mm_slot != slot) { hash_del(&slot->hash); list_del(&slot->mm_node); free = 1; } spin_unlock(&khugepaged_mm_lock); if (free) { mm_flags_clear(MMF_VM_HUGEPAGE, mm); mm_slot_free(mm_slot_cache, slot); mmdrop(mm); } else if (slot) { /* * This is required to serialize against * collapse_test_exit() (which is guaranteed to run * under mmap sem read mode). Stop here (after we return all * pagetables will be destroyed) until khugepaged has finished * working on the pagetables under the mmap_lock. 
*/ mmap_write_lock(mm); mmap_write_unlock(mm); } } static void release_pte_folio(struct folio *folio) { node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), -folio_nr_pages(folio)); folio_unlock(folio); folio_putback_lru(folio); } static void release_pte_pages(pte_t *pte, pte_t *_pte, struct list_head *compound_pagelist) { struct folio *folio, *tmp; while (--_pte >= pte) { pte_t pteval = ptep_get(_pte); unsigned long pfn; if (pte_none(pteval)) continue; VM_WARN_ON_ONCE(!pte_present(pteval)); pfn = pte_pfn(pteval); if (is_zero_pfn(pfn)) continue; folio = pfn_folio(pfn); if (folio_test_large(folio)) continue; release_pte_folio(folio); } list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) { list_del(&folio->lru); release_pte_folio(folio); } } static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long start_addr, pte_t *pte, struct collapse_control *cc, struct list_head *compound_pagelist) { struct page *page = NULL; struct folio *folio = NULL; unsigned long addr = start_addr; pte_t *_pte; int none_or_zero = 0, shared = 0, referenced = 0; enum scan_result result = SCAN_FAIL; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, addr += PAGE_SIZE) { pte_t pteval = ptep_get(_pte); if (pte_none_or_zero(pteval)) { ++none_or_zero; if (!userfaultfd_armed(vma) && (!cc->is_khugepaged || none_or_zero <= khugepaged_max_ptes_none)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out; } } if (!pte_present(pteval)) { result = SCAN_PTE_NON_PRESENT; goto out; } if (pte_uffd_wp(pteval)) { result = SCAN_PTE_UFFD_WP; goto out; } page = vm_normal_page(vma, addr, pteval); if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; goto out; } folio = page_folio(page); VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio); /* * If the vma has the VM_DROPPABLE flag, the collapse will * preserve the lazyfree property without needing to skip. */ if (cc->is_khugepaged && !(vma->vm_flags & VM_DROPPABLE) && folio_test_lazyfree(folio) && !pte_dirty(pteval)) { result = SCAN_PAGE_LAZYFREE; goto out; } /* See collapse_scan_pmd(). */ if (folio_maybe_mapped_shared(folio)) { ++shared; if (cc->is_khugepaged && shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out; } } if (folio_test_large(folio)) { struct folio *f; /* * Check if we have dealt with the compound page * already */ list_for_each_entry(f, compound_pagelist, lru) { if (folio == f) goto next; } } /* * We can do it before folio_isolate_lru because the * folio can't be freed from under us. NOTE: PG_lock * is needed to serialize against split_huge_page * when invoked from the VM. */ if (!folio_trylock(folio)) { result = SCAN_PAGE_LOCK; goto out; } /* * Check if the page has any GUP (or other external) pins. * * The page table that maps the page has been already unlinked * from the page table tree and this process cannot get * an additional pin on the page. * * New pins can come later if the page is shared across fork, * but not from this process. The other process cannot write to * the page, only trigger CoW. */ if (folio_expected_ref_count(folio) != folio_ref_count(folio)) { folio_unlock(folio); result = SCAN_PAGE_COUNT; goto out; } /* * Isolate the page to avoid collapsing an hugepage * currently in use by the VM. 
*/ if (!folio_isolate_lru(folio)) { folio_unlock(folio); result = SCAN_DEL_PAGE_LRU; goto out; } node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); if (folio_test_large(folio)) list_add_tail(&folio->lru, compound_pagelist); next: /* * If collapse was initiated by khugepaged, check that there is * enough young pte to justify collapsing the page */ if (cc->is_khugepaged && (pte_young(pteval) || folio_test_young(folio) || folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm, addr))) referenced++; } if (unlikely(cc->is_khugepaged && !referenced)) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; trace_mm_collapse_huge_page_isolate(folio, none_or_zero, referenced, result); return result; } out: release_pte_pages(pte, _pte, compound_pagelist); trace_mm_collapse_huge_page_isolate(folio, none_or_zero, referenced, result); return result; } static void __collapse_huge_page_copy_succeeded(pte_t *pte, struct vm_area_struct *vma, unsigned long address, spinlock_t *ptl, struct list_head *compound_pagelist) { unsigned long end = address + HPAGE_PMD_SIZE; struct folio *src, *tmp; pte_t pteval; pte_t *_pte; unsigned int nr_ptes; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes, address += nr_ptes * PAGE_SIZE) { nr_ptes = 1; pteval = ptep_get(_pte); if (pte_none_or_zero(pteval)) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); if (pte_none(pteval)) continue; /* * ptl mostly unnecessary. */ spin_lock(ptl); ptep_clear(vma->vm_mm, address, _pte); spin_unlock(ptl); ksm_might_unmap_zero_page(vma->vm_mm, pteval); } else { struct page *src_page = pte_page(pteval); src = page_folio(src_page); if (folio_test_large(src)) { unsigned int max_nr_ptes = (end - address) >> PAGE_SHIFT; nr_ptes = folio_pte_batch(src, _pte, pteval, max_nr_ptes); } else { release_pte_folio(src); } /* * ptl mostly unnecessary, but preempt has to * be disabled to update the per-cpu stats * inside folio_remove_rmap_pte(). */ spin_lock(ptl); clear_ptes(vma->vm_mm, address, _pte, nr_ptes); folio_remove_rmap_ptes(src, src_page, nr_ptes, vma); spin_unlock(ptl); free_swap_cache(src); folio_put_refs(src, nr_ptes); } } list_for_each_entry_safe(src, tmp, compound_pagelist, lru) { list_del(&src->lru); node_stat_sub_folio(src, NR_ISOLATED_ANON + folio_is_file_lru(src)); folio_unlock(src); free_swap_cache(src); folio_putback_lru(src); } } static void __collapse_huge_page_copy_failed(pte_t *pte, pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma, struct list_head *compound_pagelist) { spinlock_t *pmd_ptl; /* * Re-establish the PMD to point to the original page table * entry. Restoring PMD needs to be done prior to releasing * pages. Since pages are still isolated and locked here, * acquiring anon_vma_lock_write is unnecessary. */ pmd_ptl = pmd_lock(vma->vm_mm, pmd); pmd_populate(vma->vm_mm, pmd, pmd_pgtable(orig_pmd)); spin_unlock(pmd_ptl); /* * Release both raw and compound pages isolated * in __collapse_huge_page_isolate. */ release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist); } /* * __collapse_huge_page_copy - attempts to copy memory contents from raw * pages to a hugepage. Cleans up the raw pages if copying succeeds; * otherwise restores the original page table and releases isolated raw pages. * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC. 
* * @pte: starting of the PTEs to copy from * @folio: the new hugepage to copy contents to * @pmd: pointer to the new hugepage's PMD * @orig_pmd: the original raw pages' PMD * @vma: the original raw pages' virtual memory area * @address: starting address to copy * @ptl: lock on raw pages' PTEs * @compound_pagelist: list that stores compound pages */ static enum scan_result __collapse_huge_page_copy(pte_t *pte, struct folio *folio, pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma, unsigned long address, spinlock_t *ptl, struct list_head *compound_pagelist) { unsigned int i; enum scan_result result = SCAN_SUCCEED; /* * Copying pages' contents is subject to memory poison at any iteration. */ for (i = 0; i < HPAGE_PMD_NR; i++) { pte_t pteval = ptep_get(pte + i); struct page *page = folio_page(folio, i); unsigned long src_addr = address + i * PAGE_SIZE; struct page *src_page; if (pte_none_or_zero(pteval)) { clear_user_highpage(page, src_addr); continue; } src_page = pte_page(pteval); if (copy_mc_user_highpage(page, src_page, src_addr, vma) > 0) { result = SCAN_COPY_MC; break; } } if (likely(result == SCAN_SUCCEED)) __collapse_huge_page_copy_succeeded(pte, vma, address, ptl, compound_pagelist); else __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma, compound_pagelist); return result; } static void khugepaged_alloc_sleep(void) { DEFINE_WAIT(wait); add_wait_queue(&khugepaged_wait, &wait); __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); remove_wait_queue(&khugepaged_wait, &wait); } static struct collapse_control khugepaged_collapse_control = { .is_khugepaged = true, }; static bool collapse_scan_abort(int nid, struct collapse_control *cc) { int i; /* * If node_reclaim_mode is disabled, then no extra effort is made to * allocate memory locally. */ if (!node_reclaim_enabled()) return false; /* If there is a count for this node already, it must be acceptable */ if (cc->node_load[nid]) return false; for (i = 0; i < MAX_NUMNODES; i++) { if (!cc->node_load[i]) continue; if (node_distance(nid, i) > node_reclaim_distance) return true; } return false; } #define khugepaged_defrag() \ (transparent_hugepage_flags & \ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)) /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) { return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT; } #ifdef CONFIG_NUMA static int collapse_find_target_node(struct collapse_control *cc) { int nid, target_node = 0, max_value = 0; /* find first node with max normal pages hit */ for (nid = 0; nid < MAX_NUMNODES; nid++) if (cc->node_load[nid] > max_value) { max_value = cc->node_load[nid]; target_node = nid; } for_each_online_node(nid) { if (max_value == cc->node_load[nid]) node_set(nid, cc->alloc_nmask); } return target_node; } #else static int collapse_find_target_node(struct collapse_control *cc) { return 0; } #endif /* * If mmap_lock temporarily dropped, revalidate vma * before taking mmap_lock. * Returns enum scan_result value. */ static enum scan_result hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, bool expect_anon, struct vm_area_struct **vmap, struct collapse_control *cc) { struct vm_area_struct *vma; enum tva_type type = cc->is_khugepaged ? 
TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE; if (unlikely(collapse_test_exit_or_disable(mm))) return SCAN_ANY_PROCESS; *vmap = vma = find_vma(mm, address); if (!vma) return SCAN_VMA_NULL; if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) return SCAN_ADDRESS_RANGE; if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then * remapped to file after khugepaged reaquired the mmap_lock. * * thp_vma_allowable_order may return true for qualified file * vmas. */ if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap))) return SCAN_PAGE_ANON; return SCAN_SUCCEED; } static inline enum scan_result check_pmd_state(pmd_t *pmd) { pmd_t pmde = pmdp_get_lockless(pmd); if (pmd_none(pmde)) return SCAN_NO_PTE_TABLE; /* * The folio may be under migration when khugepaged is trying to * collapse it. Migration success or failure will eventually end * up with a present PMD mapping a folio again. */ if (pmd_is_migration_entry(pmde)) return SCAN_PMD_MAPPED; if (!pmd_present(pmde)) return SCAN_NO_PTE_TABLE; if (pmd_trans_huge(pmde)) return SCAN_PMD_MAPPED; if (pmd_bad(pmde)) return SCAN_NO_PTE_TABLE; return SCAN_SUCCEED; } static enum scan_result find_pmd_or_thp_or_none(struct mm_struct *mm, unsigned long address, pmd_t **pmd) { *pmd = mm_find_pmd(mm, address); if (!*pmd) return SCAN_NO_PTE_TABLE; return check_pmd_state(*pmd); } static enum scan_result check_pmd_still_valid(struct mm_struct *mm, unsigned long address, pmd_t *pmd) { pmd_t *new_pmd; enum scan_result result = find_pmd_or_thp_or_none(mm, address, &new_pmd); if (result != SCAN_SUCCEED) return result; if (new_pmd != pmd) return SCAN_FAIL; return SCAN_SUCCEED; } /* * Bring missing pages in from swap, to complete THP collapse. * Only done if khugepaged_scan_pmd believes it is worthwhile. * * Called and returns without pte mapped or spinlocks held. * Returns result: if not SCAN_SUCCEED, mmap_lock has been released. */ static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start_addr, pmd_t *pmd, int referenced) { int swapped_in = 0; vm_fault_t ret = 0; unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE); enum scan_result result; pte_t *pte = NULL; spinlock_t *ptl; for (addr = start_addr; addr < end; addr += PAGE_SIZE) { struct vm_fault vmf = { .vma = vma, .address = addr, .pgoff = linear_page_index(vma, addr), .flags = FAULT_FLAG_ALLOW_RETRY, .pmd = pmd, }; if (!pte++) { /* * Here the ptl is only used to check pte_same() in * do_swap_page(), so readonly version is enough. */ pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl); if (!pte) { mmap_read_unlock(mm); result = SCAN_NO_PTE_TABLE; goto out; } } vmf.orig_pte = ptep_get_lockless(pte); if (pte_none(vmf.orig_pte) || pte_present(vmf.orig_pte)) continue; vmf.pte = pte; vmf.ptl = ptl; ret = do_swap_page(&vmf); /* Which unmaps pte (after perhaps re-checking the entry) */ pte = NULL; /* * do_swap_page returns VM_FAULT_RETRY with released mmap_lock. * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because * we do not retry here and swap entry will remain in pagetable * resulting in later failure. 
*/ if (ret & VM_FAULT_RETRY) { /* Likely, but not guaranteed, that page lock failed */ result = SCAN_PAGE_LOCK; goto out; } if (ret & VM_FAULT_ERROR) { mmap_read_unlock(mm); result = SCAN_FAIL; goto out; } swapped_in++; } if (pte) pte_unmap(pte); /* Drain LRU cache to remove extra pin on the swapped in pages */ if (swapped_in) lru_add_drain(); result = SCAN_SUCCEED; out: trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result); return result; } static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_struct *mm, struct collapse_control *cc) { gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() : GFP_TRANSHUGE); int node = collapse_find_target_node(cc); struct folio *folio; folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask); if (!folio) { *foliop = NULL; count_vm_event(THP_COLLAPSE_ALLOC_FAILED); return SCAN_ALLOC_HUGE_PAGE_FAIL; } count_vm_event(THP_COLLAPSE_ALLOC); if (unlikely(mem_cgroup_charge(folio, mm, gfp))) { folio_put(folio); *foliop = NULL; return SCAN_CGROUP_CHARGE_FAIL; } count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1); *foliop = folio; return SCAN_SUCCEED; } static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long address, int referenced, int unmapped, struct collapse_control *cc) { LIST_HEAD(compound_pagelist); pmd_t *pmd, _pmd; pte_t *pte; pgtable_t pgtable; struct folio *folio; spinlock_t *pmd_ptl, *pte_ptl; enum scan_result result = SCAN_FAIL; struct vm_area_struct *vma; struct mmu_notifier_range range; VM_BUG_ON(address & ~HPAGE_PMD_MASK); /* * Before allocating the hugepage, release the mmap_lock read lock. * The allocation can take potentially a long time if it involves * sync compaction, and we do not need to hold the mmap_lock during * that. We will recheck the vma after taking it again in write mode. */ mmap_read_unlock(mm); result = alloc_charge_folio(&folio, mm, cc); if (result != SCAN_SUCCEED) goto out_nolock; mmap_read_lock(mm); result = hugepage_vma_revalidate(mm, address, true, &vma, cc); if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; } result = find_pmd_or_thp_or_none(mm, address, &pmd); if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; } if (unmapped) { /* * __collapse_huge_page_swapin will return with mmap_lock * released when it fails. So we jump out_nolock directly in * that case. Continuing to collapse causes inconsistency. */ result = __collapse_huge_page_swapin(mm, vma, address, pmd, referenced); if (result != SCAN_SUCCEED) goto out_nolock; } mmap_read_unlock(mm); /* * Prevent all access to pagetables with the exception of * gup_fast later handled by the ptep_clear_flush and the VM * handled by the anon_vma lock + PG_lock. * * UFFDIO_MOVE is prevented to race as well thanks to the * mmap_lock. */ mmap_write_lock(mm); result = hugepage_vma_revalidate(mm, address, true, &vma, cc); if (result != SCAN_SUCCEED) goto out_up_write; /* check if the pmd is still valid */ vma_start_write(vma); result = check_pmd_still_valid(mm, address, pmd); if (result != SCAN_SUCCEED) goto out_up_write; anon_vma_lock_write(vma->anon_vma); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address, address + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ /* * This removes any huge TLB entry from the CPU so we won't allow * huge and small TLB entries for the same virtual address to * avoid the risk of CPU bugs in that area. 
* * Parallel GUP-fast is fine since GUP-fast will back off when * it detects PMD is changed. */ _pmd = pmdp_collapse_flush(vma, address, pmd); spin_unlock(pmd_ptl); mmu_notifier_invalidate_range_end(&range); tlb_remove_table_sync_one(); pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl); if (pte) { result = __collapse_huge_page_isolate(vma, address, pte, cc, &compound_pagelist); spin_unlock(pte_ptl); } else { result = SCAN_NO_PTE_TABLE; } if (unlikely(result != SCAN_SUCCEED)) { if (pte) pte_unmap(pte); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); /* * We can only use set_pmd_at when establishing * hugepmds and never for establishing regular pmds that * points to regular pagetables. Use pmd_populate for that */ pmd_populate(mm, pmd, pmd_pgtable(_pmd)); spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); goto out_up_write; } /* * All pages are isolated and locked so anon_vma rmap * can't run anymore. */ anon_vma_unlock_write(vma->anon_vma); result = __collapse_huge_page_copy(pte, folio, pmd, _pmd, vma, address, pte_ptl, &compound_pagelist); pte_unmap(pte); if (unlikely(result != SCAN_SUCCEED)) goto out_up_write; /* * The smp_wmb() inside __folio_mark_uptodate() ensures the * copy_huge_page writes become visible before the set_pmd_at() * write. */ __folio_mark_uptodate(folio); pgtable = pmd_pgtable(_pmd); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); pgtable_trans_huge_deposit(mm, pmd, pgtable); map_anon_folio_pmd_nopf(folio, pmd, vma, address); spin_unlock(pmd_ptl); folio = NULL; result = SCAN_SUCCEED; out_up_write: mmap_write_unlock(mm); out_nolock: if (folio) folio_put(folio); trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result); return result; } static enum scan_result collapse_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start_addr, bool *lock_dropped, struct collapse_control *cc) { pmd_t *pmd; pte_t *pte, *_pte; int none_or_zero = 0, shared = 0, referenced = 0; enum scan_result result = SCAN_FAIL; struct page *page = NULL; struct folio *folio = NULL; unsigned long addr; spinlock_t *ptl; int node = NUMA_NO_NODE, unmapped = 0; VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK); result = find_pmd_or_thp_or_none(mm, start_addr, &pmd); if (result != SCAN_SUCCEED) { cc->progress++; goto out; } memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl); if (!pte) { cc->progress++; result = SCAN_NO_PTE_TABLE; goto out; } for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, addr += PAGE_SIZE) { cc->progress++; pte_t pteval = ptep_get(_pte); if (pte_none_or_zero(pteval)) { ++none_or_zero; if (!userfaultfd_armed(vma) && (!cc->is_khugepaged || none_or_zero <= khugepaged_max_ptes_none)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out_unmap; } } if (!pte_present(pteval)) { ++unmapped; if (!cc->is_khugepaged || unmapped <= khugepaged_max_ptes_swap) { /* * Always be strict with uffd-wp * enabled swap entries. Please see * comment below for pte_uffd_wp(). */ if (pte_swp_uffd_wp_any(pteval)) { result = SCAN_PTE_UFFD_WP; goto out_unmap; } continue; } else { result = SCAN_EXCEED_SWAP_PTE; count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); goto out_unmap; } } if (pte_uffd_wp(pteval)) { /* * Don't collapse the page if any of the small * PTEs are armed with uffd write protection. 
* Here we can also mark the new huge pmd as * write protected if any of the small ones is * marked but that could bring unknown * userfault messages that falls outside of * the registered range. So, just be simple. */ result = SCAN_PTE_UFFD_WP; goto out_unmap; } page = vm_normal_page(vma, addr, pteval); if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; goto out_unmap; } folio = page_folio(page); /* * If the vma has the VM_DROPPABLE flag, the collapse will * preserve the lazyfree property without needing to skip. */ if (cc->is_khugepaged && !(vma->vm_flags & VM_DROPPABLE) && folio_test_lazyfree(folio) && !pte_dirty(pteval)) { result = SCAN_PAGE_LAZYFREE; goto out_unmap; } if (!folio_test_anon(folio)) { result = SCAN_PAGE_ANON; goto out_unmap; } /* * We treat a single page as shared if any part of the THP * is shared. */ if (folio_maybe_mapped_shared(folio)) { ++shared; if (cc->is_khugepaged && shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out_unmap; } } /* * Record which node the original page is from and save this * information to cc->node_load[]. * Khugepaged will allocate hugepage from the node has the max * hit record. */ node = folio_nid(folio); if (collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; goto out_unmap; } cc->node_load[node]++; if (!folio_test_lru(folio)) { result = SCAN_PAGE_LRU; goto out_unmap; } if (folio_test_locked(folio)) { result = SCAN_PAGE_LOCK; goto out_unmap; } /* * Check if the page has any GUP (or other external) pins. * * Here the check may be racy: * it may see folio_mapcount() > folio_ref_count(). * But such case is ephemeral we could always retry collapse * later. However it may report false positive if the page * has excessive GUP pins (i.e. 512). Anyway the same check * will be done again later the risk seems low. */ if (folio_expected_ref_count(folio) != folio_ref_count(folio)) { result = SCAN_PAGE_COUNT; goto out_unmap; } /* * If collapse was initiated by khugepaged, check that there is * enough young pte to justify collapsing the page */ if (cc->is_khugepaged && (pte_young(pteval) || folio_test_young(folio) || folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm, addr))) referenced++; } if (cc->is_khugepaged && (!referenced || (unmapped && referenced < HPAGE_PMD_NR / 2))) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; } out_unmap: pte_unmap_unlock(pte, ptl); if (result == SCAN_SUCCEED) { result = collapse_huge_page(mm, start_addr, referenced, unmapped, cc); /* collapse_huge_page will return with the mmap_lock released */ *lock_dropped = true; } out: trace_mm_khugepaged_scan_pmd(mm, folio, referenced, none_or_zero, result, unmapped); return result; } static void collect_mm_slot(struct mm_slot *slot) { struct mm_struct *mm = slot->mm; lockdep_assert_held(&khugepaged_mm_lock); if (collapse_test_exit(mm)) { /* free mm_slot */ hash_del(&slot->hash); list_del(&slot->mm_node); /* * Not strictly needed because the mm exited already. 
* * mm_flags_clear(MMF_VM_HUGEPAGE, mm); */ /* khugepaged_mm_lock actually not necessary for the below */ mm_slot_free(mm_slot_cache, slot); mmdrop(mm); } } /* folio must be locked, and mmap_lock must be held */ static enum scan_result set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, struct folio *folio, struct page *page) { struct mm_struct *mm = vma->vm_mm; struct vm_fault vmf = { .vma = vma, .address = addr, .flags = 0, }; pgd_t *pgdp; p4d_t *p4dp; pud_t *pudp; mmap_assert_locked(vma->vm_mm); if (!pmdp) { pgdp = pgd_offset(mm, addr); p4dp = p4d_alloc(mm, pgdp, addr); if (!p4dp) return SCAN_FAIL; pudp = pud_alloc(mm, p4dp, addr); if (!pudp) return SCAN_FAIL; pmdp = pmd_alloc(mm, pudp, addr); if (!pmdp) return SCAN_FAIL; } vmf.pmd = pmdp; if (do_set_pmd(&vmf, folio, page)) return SCAN_FAIL; folio_get(folio); return SCAN_SUCCEED; } static enum scan_result try_collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool install_pmd) { enum scan_result result = SCAN_FAIL; int nr_mapped_ptes = 0; unsigned int nr_batch_ptes; struct mmu_notifier_range range; bool notified = false; unsigned long haddr = addr & HPAGE_PMD_MASK; unsigned long end = haddr + HPAGE_PMD_SIZE; struct vm_area_struct *vma = vma_lookup(mm, haddr); struct folio *folio; pte_t *start_pte, *pte; pmd_t *pmd, pgt_pmd; spinlock_t *pml = NULL, *ptl; int i; mmap_assert_locked(mm); /* First check VMA found, in case page tables are being torn down */ if (!vma || !vma->vm_file || !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE)) return SCAN_VMA_CHECK; /* Fast check before locking page if already PMD-mapped */ result = find_pmd_or_thp_or_none(mm, haddr, &pmd); if (result == SCAN_PMD_MAPPED) return result; /* * If we are here, we've succeeded in replacing all the native pages * in the page cache with a single hugepage. If a mm were to fault-in * this memory (mapped by a suitably aligned VMA), we'd get the hugepage * and map it by a PMD, regardless of sysfs THP settings. As such, let's * analogously elide sysfs THP settings here and force collapse. */ if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ if (userfaultfd_wp(vma)) return SCAN_PTE_UFFD_WP; folio = filemap_lock_folio(vma->vm_file->f_mapping, linear_page_index(vma, haddr)); if (IS_ERR(folio)) return SCAN_PAGE_NULL; if (!is_pmd_order(folio_order(folio))) { result = SCAN_PAGE_COMPOUND; goto drop_folio; } result = find_pmd_or_thp_or_none(mm, haddr, &pmd); switch (result) { case SCAN_SUCCEED: break; case SCAN_NO_PTE_TABLE: /* * All pte entries have been removed and pmd cleared. * Skip all the pte checks and just update the pmd mapping. 
*/ goto maybe_install_pmd; default: goto drop_folio; } result = SCAN_FAIL; start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); if (!start_pte) /* mmap_lock + page lock should prevent this */ goto drop_folio; /* step 1: check all mapped PTEs are to the right huge page */ for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { struct page *page; pte_t ptent = ptep_get(pte); /* empty pte, skip */ if (pte_none(ptent)) continue; /* page swapped out, abort */ if (!pte_present(ptent)) { result = SCAN_PTE_NON_PRESENT; goto abort; } page = vm_normal_page(vma, addr, ptent); if (WARN_ON_ONCE(page && is_zone_device_page(page))) page = NULL; /* * Note that uprobe, debugger, or MAP_PRIVATE may change the * page table, but the new page will not be a subpage of hpage. */ if (folio_page(folio, i) != page) goto abort; } pte_unmap_unlock(start_pte, ptl); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); notified = true; /* * pmd_lock covers a wider range than ptl, and (if split from mm's * page_table_lock) ptl nests inside pml. The less time we hold pml, * the better; but userfaultfd's mfill_atomic_pte() on a private VMA * inserts a valid as-if-COWed PTE without even looking up page cache. * So page lock of folio does not protect from it, so we must not drop * ptl before pgt_pmd is removed, so uffd private needs pml taken now. */ if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED)) pml = pmd_lock(mm, pmd); start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl); if (!start_pte) /* mmap_lock + page lock should prevent this */ goto abort; if (!pml) spin_lock(ptl); else if (ptl != pml) spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) goto abort; /* step 2: clear page table and adjust rmap */ for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; i += nr_batch_ptes, addr += nr_batch_ptes * PAGE_SIZE, pte += nr_batch_ptes) { unsigned int max_nr_batch_ptes = (end - addr) >> PAGE_SHIFT; struct page *page; pte_t ptent = ptep_get(pte); nr_batch_ptes = 1; if (pte_none(ptent)) continue; /* * We dropped ptl after the first scan, to do the mmu_notifier: * page lock stops more PTEs of the folio being faulted in, but * does not stop write faults COWing anon copies from existing * PTEs; and does not stop those being swapped out or migrated. */ if (!pte_present(ptent)) { result = SCAN_PTE_NON_PRESENT; goto abort; } page = vm_normal_page(vma, addr, ptent); if (folio_page(folio, i) != page) goto abort; nr_batch_ptes = folio_pte_batch(folio, pte, ptent, max_nr_batch_ptes); /* * Must clear entry, or a racing truncate may re-remove it. * TLB flush can be left until pmdp_collapse_flush() does it. * PTE dirty? Shmem page is already dirty; file is read-only. */ clear_ptes(mm, addr, pte, nr_batch_ptes); folio_remove_rmap_ptes(folio, page, nr_batch_ptes, vma); nr_mapped_ptes += nr_batch_ptes; } if (!pml) spin_unlock(ptl); /* step 3: set proper refcount and mm_counters. 
*/ if (nr_mapped_ptes) { folio_ref_sub(folio, nr_mapped_ptes); add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes); } /* step 4: remove empty page table */ if (!pml) { pml = pmd_lock(mm, pmd); if (ptl != pml) { spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) { flush_tlb_mm(mm); goto unlock; } } } pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd); pmdp_get_lockless_sync(); pte_unmap_unlock(start_pte, ptl); if (ptl != pml) spin_unlock(pml); mmu_notifier_invalidate_range_end(&range); mm_dec_nr_ptes(mm); page_table_check_pte_clear_range(mm, haddr, pgt_pmd); pte_free_defer(mm, pmd_pgtable(pgt_pmd)); maybe_install_pmd: /* step 5: install pmd entry */ result = install_pmd ? set_huge_pmd(vma, haddr, pmd, folio, &folio->page) : SCAN_SUCCEED; goto drop_folio; abort: if (nr_mapped_ptes) { flush_tlb_mm(mm); folio_ref_sub(folio, nr_mapped_ptes); add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes); } unlock: if (start_pte) pte_unmap_unlock(start_pte, ptl); if (pml && pml != ptl) spin_unlock(pml); if (notified) mmu_notifier_invalidate_range_end(&range); drop_folio: folio_unlock(folio); folio_put(folio); return result; } /** * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at * address haddr. * * @mm: process address space where collapse happens * @addr: THP collapse address * @install_pmd: If a huge PMD should be installed * * This function checks whether all the PTEs in the PMD are pointing to the * right THP. If so, retract the page table so the THP can refault in with * as pmd-mapped. Possibly install a huge PMD mapping the THP. */ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool install_pmd) { try_collapse_pte_mapped_thp(mm, addr, install_pmd); } /* Can we retract page tables for this file-backed VMA? */ static bool file_backed_vma_is_retractable(struct vm_area_struct *vma) { /* * Check vma->anon_vma to exclude MAP_PRIVATE mappings that * got written to. These VMAs are likely not worth removing * page tables from, as PMD-mapping is likely to be split later. */ if (READ_ONCE(vma->anon_vma)) return false; /* * When a vma is registered with uffd-wp, we cannot recycle * the page table because there may be pte markers installed. * Other vmas can still have the same file mapped hugely, but * skip this one: it will always be mapped in small page size * for uffd-wp registered ranges. */ if (userfaultfd_wp(vma)) return false; /* * If the VMA contains guard regions then we can't collapse it. * * This is set atomically on guard marker installation under mmap/VMA * read lock, and here we may not hold any VMA or mmap lock at all. * * This is therefore serialised on the PTE page table lock, which is * obtained on guard region installation after the flag is set, so this * check being performed under this lock excludes races. 
*/ if (vma_test_atomic_flag(vma, VMA_MAYBE_GUARD_BIT)) return false; return true; } static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) { struct vm_area_struct *vma; i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { struct mmu_notifier_range range; struct mm_struct *mm; unsigned long addr; pmd_t *pmd, pgt_pmd; spinlock_t *pml; spinlock_t *ptl; bool success = false; addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); if (addr & ~HPAGE_PMD_MASK || vma->vm_end < addr + HPAGE_PMD_SIZE) continue; mm = vma->vm_mm; if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED) continue; if (collapse_test_exit(mm)) continue; if (!file_backed_vma_is_retractable(vma)) continue; /* PTEs were notified when unmapped; but now for the PMD? */ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); pml = pmd_lock(mm, pmd); /* * The lock of new_folio is still held, we will be blocked in * the page fault path, which prevents the pte entries from * being set again. So even though the old empty PTE page may be * concurrently freed and a new PTE page is filled into the pmd * entry, it is still empty and can be removed. * * So here we only need to recheck if the state of pmd entry * still meets our requirements, rather than checking pmd_same() * like elsewhere. */ if (check_pmd_state(pmd) != SCAN_SUCCEED) goto drop_pml; ptl = pte_lockptr(mm, pmd); if (ptl != pml) spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); /* * Huge page lock is still held, so normally the page table must * remain empty; and we have already skipped anon_vma and * userfaultfd_wp() vmas. But since the mmap_lock is not held, * it is still possible for a racing userfaultfd_ioctl() or * madvise() to have inserted ptes or markers. Now that we hold * ptlock, repeating the retractable checks protects us from * races against the prior checks. */ if (likely(file_backed_vma_is_retractable(vma))) { pgt_pmd = pmdp_collapse_flush(vma, addr, pmd); pmdp_get_lockless_sync(); success = true; } if (ptl != pml) spin_unlock(ptl); drop_pml: spin_unlock(pml); mmu_notifier_invalidate_range_end(&range); if (success) { mm_dec_nr_ptes(mm); page_table_check_pte_clear_range(mm, addr, pgt_pmd); pte_free_defer(mm, pmd_pgtable(pgt_pmd)); } } i_mmap_unlock_read(mapping); } /** * collapse_file - collapse filemap/tmpfs/shmem pages into huge one. 
* * @mm: process address space where collapse happens * @addr: virtual collapse start address * @file: file that collapse on * @start: collapse start address * @cc: collapse context and scratchpad * * Basic scheme is simple, details are more complex: * - allocate and lock a new huge page; * - scan page cache, locking old pages * + swap/gup in pages if necessary; * - copy data to new page * - handle shmem holes * + re-validate that holes weren't filled by someone else * + check for userfaultfd * - finalize updates to the page cache; * - if replacing succeeds: * + unlock huge page; * + free old pages; * - if replacing failed; * + unlock old pages * + unlock and free huge page; */ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr, struct file *file, pgoff_t start, struct collapse_control *cc) { struct address_space *mapping = file->f_mapping; struct page *dst; struct folio *folio, *tmp, *new_folio; pgoff_t index = 0, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); enum scan_result result = SCAN_SUCCEED; int nr_none = 0; bool is_shmem = shmem_file(file); VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); result = alloc_charge_folio(&new_folio, mm, cc); if (result != SCAN_SUCCEED) goto out; mapping_set_update(&xas, mapping); __folio_set_locked(new_folio); if (is_shmem) __folio_set_swapbacked(new_folio); new_folio->index = start; new_folio->mapping = mapping; /* * Ensure we have slots for all the pages in the range. This is * almost certainly a no-op because most of the pages must be present */ do { xas_lock_irq(&xas); xas_create_range(&xas); if (!xas_error(&xas)) break; xas_unlock_irq(&xas); if (!xas_nomem(&xas, GFP_KERNEL)) { result = SCAN_FAIL; goto rollback; } } while (1); for (index = start; index < end;) { xas_set(&xas, index); folio = xas_load(&xas); VM_BUG_ON(index != xas.xa_index); if (is_shmem) { if (!folio) { /* * Stop if extent has been truncated or * hole-punched, and is now completely * empty. */ if (index == start) { if (!xas_next_entry(&xas, end - 1)) { result = SCAN_TRUNCATED; goto xa_locked; } } nr_none++; index++; continue; } if (xa_is_value(folio) || !folio_test_uptodate(folio)) { xas_unlock_irq(&xas); /* swap in or instantiate fallocated page */ if (shmem_get_folio(mapping->host, index, 0, &folio, SGP_NOALLOC)) { result = SCAN_FAIL; goto xa_unlocked; } /* drain lru cache to help folio_isolate_lru() */ lru_add_drain(); } else if (folio_trylock(folio)) { folio_get(folio); xas_unlock_irq(&xas); } else { result = SCAN_PAGE_LOCK; goto xa_locked; } } else { /* !is_shmem */ if (!folio || xa_is_value(folio)) { xas_unlock_irq(&xas); page_cache_sync_readahead(mapping, &file->f_ra, file, index, end - index); /* drain lru cache to help folio_isolate_lru() */ lru_add_drain(); folio = filemap_lock_folio(mapping, index); if (IS_ERR(folio)) { result = SCAN_FAIL; goto xa_unlocked; } } else if (folio_test_dirty(folio)) { /* * khugepaged only works on read-only fd, * so this page is dirty because it hasn't * been flushed since first write. There * won't be new dirty pages. * * Trigger async flush here and hope the * writeback is done when khugepaged * revisits this page. * * This is a one-off situation. We are not * forcing writeback in loop. 
*/ xas_unlock_irq(&xas); filemap_flush(mapping); result = SCAN_PAGE_DIRTY_OR_WRITEBACK; goto xa_unlocked; } else if (folio_test_writeback(folio)) { xas_unlock_irq(&xas); result = SCAN_PAGE_DIRTY_OR_WRITEBACK; goto xa_unlocked; } else if (folio_trylock(folio)) { folio_get(folio); xas_unlock_irq(&xas); } else { result = SCAN_PAGE_LOCK; goto xa_locked; } } /* * The folio must be locked, so we can drop the i_pages lock * without racing with truncate. */ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); /* make sure the folio is up to date */ if (unlikely(!folio_test_uptodate(folio))) { result = SCAN_FAIL; goto out_unlock; } /* * If file was truncated then extended, or hole-punched, before * we locked the first folio, then a THP might be there already. * This will be discovered on the first iteration. */ if (is_pmd_order(folio_order(folio))) { result = SCAN_PTE_MAPPED_HUGEPAGE; goto out_unlock; } if (folio_mapping(folio) != mapping) { result = SCAN_TRUNCATED; goto out_unlock; } if (!is_shmem && (folio_test_dirty(folio) || folio_test_writeback(folio))) { /* * khugepaged only works on read-only fd, so this * folio is dirty because it hasn't been flushed * since first write. */ result = SCAN_PAGE_DIRTY_OR_WRITEBACK; goto out_unlock; } if (!folio_isolate_lru(folio)) { result = SCAN_DEL_PAGE_LRU; goto out_unlock; } if (!filemap_release_folio(folio, GFP_KERNEL)) { result = SCAN_PAGE_HAS_PRIVATE; folio_putback_lru(folio); goto out_unlock; } if (folio_mapped(folio)) try_to_unmap(folio, TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); xas_lock_irq(&xas); VM_BUG_ON_FOLIO(folio != xa_load(xas.xa, index), folio); /* * We control 2 + nr_pages references to the folio: * - we hold a pin on it; * - nr_pages reference from page cache; * - one from lru_isolate_folio; * If those are the only references, then any new usage * of the folio will have to fetch it from the page * cache. That requires locking the folio to handle * truncate, so any new usage will be blocked until we * unlock folio after collapse/during rollback. */ if (folio_ref_count(folio) != 2 + folio_nr_pages(folio)) { result = SCAN_PAGE_COUNT; xas_unlock_irq(&xas); folio_putback_lru(folio); goto out_unlock; } /* * Accumulate the folios that are being collapsed. */ list_add_tail(&folio->lru, &pagelist); index += folio_nr_pages(folio); continue; out_unlock: folio_unlock(folio); folio_put(folio); goto xa_unlocked; } if (!is_shmem) { filemap_nr_thps_inc(mapping); /* * Paired with the fence in do_dentry_open() -> get_write_access() * to ensure i_writecount is up to date and the update to nr_thps * is visible. Ensures the page cache will be truncated if the * file is opened writable. */ smp_mb(); if (inode_is_open_for_write(mapping->host)) { result = SCAN_FAIL; filemap_nr_thps_dec(mapping); } } xa_locked: xas_unlock_irq(&xas); xa_unlocked: /* * If collapse is successful, flush must be done now before copying. * If collapse is unsuccessful, does flush actually need to be done? * Do it anyway, to clear the state. */ try_to_unmap_flush(); if (result == SCAN_SUCCEED && nr_none && !shmem_charge(mapping->host, nr_none)) result = SCAN_FAIL; if (result != SCAN_SUCCEED) { nr_none = 0; goto rollback; } /* * The old folios are locked, so they won't change anymore. 
*/ index = start; dst = folio_page(new_folio, 0); list_for_each_entry(folio, &pagelist, lru) { int i, nr_pages = folio_nr_pages(folio); while (index < folio->index) { clear_highpage(dst); index++; dst++; } for (i = 0; i < nr_pages; i++) { if (copy_mc_highpage(dst, folio_page(folio, i)) > 0) { result = SCAN_COPY_MC; goto rollback; } index++; dst++; } } while (index < end) { clear_highpage(dst); index++; dst++; } if (nr_none) { struct vm_area_struct *vma; int nr_none_check = 0; i_mmap_lock_read(mapping); xas_lock_irq(&xas); xas_set(&xas, start); for (index = start; index < end; index++) { if (!xas_next(&xas)) { xas_store(&xas, XA_RETRY_ENTRY); if (xas_error(&xas)) { result = SCAN_STORE_FAILED; goto immap_locked; } nr_none_check++; } } if (nr_none != nr_none_check) { result = SCAN_PAGE_FILLED; goto immap_locked; } /* * If userspace observed a missing page in a VMA with * a MODE_MISSING userfaultfd, then it might expect a * UFFD_EVENT_PAGEFAULT for that page. If so, we need to * roll back to avoid suppressing such an event. Since * wp/minor userfaultfds don't give userspace any * guarantees that the kernel doesn't fill a missing * page with a zero page, so they don't matter here. * * Any userfaultfds registered after this point will * not be able to observe any missing pages due to the * previously inserted retry entries. */ vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) { if (userfaultfd_missing(vma)) { result = SCAN_EXCEED_NONE_PTE; goto immap_locked; } } immap_locked: i_mmap_unlock_read(mapping); if (result != SCAN_SUCCEED) { xas_set(&xas, start); for (index = start; index < end; index++) { if (xas_next(&xas) == XA_RETRY_ENTRY) xas_store(&xas, NULL); } xas_unlock_irq(&xas); goto rollback; } } else { xas_lock_irq(&xas); } if (is_shmem) { lruvec_stat_mod_folio(new_folio, NR_SHMEM, HPAGE_PMD_NR); lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR); } else { lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR); } lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, HPAGE_PMD_NR); /* * Mark new_folio as uptodate before inserting it into the * page cache so that it isn't mistaken for an fallocated but * unwritten page. */ folio_mark_uptodate(new_folio); folio_ref_add(new_folio, HPAGE_PMD_NR - 1); if (is_shmem) folio_mark_dirty(new_folio); folio_add_lru(new_folio); /* Join all the small entries into a single multi-index entry. */ xas_set_order(&xas, start, HPAGE_PMD_ORDER); xas_store(&xas, new_folio); WARN_ON_ONCE(xas_error(&xas)); xas_unlock_irq(&xas); /* * Remove pte page tables, so we can re-fault the page as huge. * If MADV_COLLAPSE, adjust result to call try_collapse_pte_mapped_thp(). */ retract_page_tables(mapping, start); if (cc && !cc->is_khugepaged) result = SCAN_PTE_MAPPED_HUGEPAGE; folio_unlock(new_folio); /* * The collapse has succeeded, so free the old folios. 
*/ list_for_each_entry_safe(folio, tmp, &pagelist, lru) { list_del(&folio->lru); lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -folio_nr_pages(folio)); if (is_shmem) lruvec_stat_mod_folio(folio, NR_SHMEM, -folio_nr_pages(folio)); folio->mapping = NULL; folio_clear_active(folio); folio_clear_unevictable(folio); folio_unlock(folio); folio_put_refs(folio, 2 + folio_nr_pages(folio)); } goto out; rollback: /* Something went wrong: roll back page cache changes */ if (nr_none) { xas_lock_irq(&xas); mapping->nrpages -= nr_none; xas_unlock_irq(&xas); shmem_uncharge(mapping->host, nr_none); } list_for_each_entry_safe(folio, tmp, &pagelist, lru) { list_del(&folio->lru); folio_unlock(folio); folio_putback_lru(folio); folio_put(folio); } /* * Undo the updates of filemap_nr_thps_inc for non-SHMEM * file only. This undo is not needed unless failure is * due to SCAN_COPY_MC. */ if (!is_shmem && result == SCAN_COPY_MC) { filemap_nr_thps_dec(mapping); /* * Paired with the fence in do_dentry_open() -> get_write_access() * to ensure the update to nr_thps is visible. */ smp_mb(); } new_folio->mapping = NULL; folio_unlock(new_folio); folio_put(new_folio); out: VM_BUG_ON(!list_empty(&pagelist)); trace_mm_khugepaged_collapse_file(mm, new_folio, index, addr, is_shmem, file, HPAGE_PMD_NR, result); return result; } static enum scan_result collapse_scan_file(struct mm_struct *mm, unsigned long addr, struct file *file, pgoff_t start, struct collapse_control *cc) { struct folio *folio = NULL; struct address_space *mapping = file->f_mapping; XA_STATE(xas, &mapping->i_pages, start); int present, swap; int node = NUMA_NO_NODE; enum scan_result result = SCAN_SUCCEED; present = 0; swap = 0; memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); rcu_read_lock(); xas_for_each(&xas, folio, start + HPAGE_PMD_NR - 1) { if (xas_retry(&xas, folio)) continue; if (xa_is_value(folio)) { swap += 1 << xas_get_order(&xas); if (cc->is_khugepaged && swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); break; } continue; } if (!folio_try_get(folio)) { xas_reset(&xas); continue; } if (unlikely(folio != xas_reload(&xas))) { folio_put(folio); xas_reset(&xas); continue; } if (is_pmd_order(folio_order(folio))) { result = SCAN_PTE_MAPPED_HUGEPAGE; /* * PMD-sized THP implies that we can only try * retracting the PTE table. */ folio_put(folio); break; } node = folio_nid(folio); if (collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; folio_put(folio); break; } cc->node_load[node]++; if (!folio_test_lru(folio)) { result = SCAN_PAGE_LRU; folio_put(folio); break; } if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) { result = SCAN_PAGE_COUNT; folio_put(folio); break; } /* * We probably should check if the folio is referenced * here, but nobody would transfer pte_young() to * folio_test_referenced() for us. And rmap walk here * is just too costly... 
*/ present += folio_nr_pages(folio); folio_put(folio); if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); } } rcu_read_unlock(); if (result == SCAN_PTE_MAPPED_HUGEPAGE) cc->progress++; else cc->progress += HPAGE_PMD_NR; if (result == SCAN_SUCCEED) { if (cc->is_khugepaged && present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { result = collapse_file(mm, addr, file, start, cc); } } trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result); return result; } /* * Try to collapse a single PMD starting at a PMD aligned addr, and return * the results. */ static enum scan_result collapse_single_pmd(unsigned long addr, struct vm_area_struct *vma, bool *lock_dropped, struct collapse_control *cc) { struct mm_struct *mm = vma->vm_mm; bool triggered_wb = false; enum scan_result result; struct file *file; pgoff_t pgoff; mmap_assert_locked(mm); if (vma_is_anonymous(vma)) { result = collapse_scan_pmd(mm, vma, addr, lock_dropped, cc); goto end; } file = get_file(vma->vm_file); pgoff = linear_page_index(vma, addr); mmap_read_unlock(mm); *lock_dropped = true; retry: result = collapse_scan_file(mm, addr, file, pgoff, cc); /* * For MADV_COLLAPSE, when encountering dirty pages, try to writeback, * then retry the collapse one time. */ if (!cc->is_khugepaged && result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb && mapping_can_writeback(file->f_mapping)) { const loff_t lstart = (loff_t)pgoff << PAGE_SHIFT; const loff_t lend = lstart + HPAGE_PMD_SIZE - 1; filemap_write_and_wait_range(file->f_mapping, lstart, lend); triggered_wb = true; goto retry; } fput(file); if (result == SCAN_PTE_MAPPED_HUGEPAGE) { mmap_read_lock(mm); if (collapse_test_exit_or_disable(mm)) result = SCAN_ANY_PROCESS; else result = try_collapse_pte_mapped_thp(mm, addr, !cc->is_khugepaged); if (result == SCAN_PMD_MAPPED) result = SCAN_SUCCEED; mmap_read_unlock(mm); } end: if (cc->is_khugepaged && result == SCAN_SUCCEED) ++khugepaged_pages_collapsed; return result; } static void collapse_scan_mm_slot(unsigned int progress_max, enum scan_result *result, struct collapse_control *cc) __releases(&khugepaged_mm_lock) __acquires(&khugepaged_mm_lock) { struct vma_iterator vmi; struct mm_slot *slot; struct mm_struct *mm; struct vm_area_struct *vma; unsigned int progress_prev = cc->progress; lockdep_assert_held(&khugepaged_mm_lock); *result = SCAN_FAIL; if (khugepaged_scan.mm_slot) { slot = khugepaged_scan.mm_slot; } else { slot = list_first_entry(&khugepaged_scan.mm_head, struct mm_slot, mm_node); khugepaged_scan.address = 0; khugepaged_scan.mm_slot = slot; } spin_unlock(&khugepaged_mm_lock); mm = slot->mm; /* * Don't wait for semaphore (to avoid long wait times). Just move to * the next mm on the list. 
*/ vma = NULL; if (unlikely(!mmap_read_trylock(mm))) goto breakouterloop_mmap_lock; cc->progress++; if (unlikely(collapse_test_exit_or_disable(mm))) goto breakouterloop; vma_iter_init(&vmi, mm, khugepaged_scan.address); for_each_vma(vmi, vma) { unsigned long hstart, hend; cond_resched(); if (unlikely(collapse_test_exit_or_disable(mm))) { cc->progress++; break; } if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) { cc->progress++; continue; } hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE); hend = round_down(vma->vm_end, HPAGE_PMD_SIZE); if (khugepaged_scan.address > hend) { cc->progress++; continue; } if (khugepaged_scan.address < hstart) khugepaged_scan.address = hstart; VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); while (khugepaged_scan.address < hend) { bool lock_dropped = false; cond_resched(); if (unlikely(collapse_test_exit_or_disable(mm))) goto breakouterloop; VM_WARN_ON_ONCE(khugepaged_scan.address < hstart || khugepaged_scan.address + HPAGE_PMD_SIZE > hend); *result = collapse_single_pmd(khugepaged_scan.address, vma, &lock_dropped, cc); /* move to next address */ khugepaged_scan.address += HPAGE_PMD_SIZE; if (lock_dropped) /* * We released mmap_lock so break loop. Note * that we drop mmap_lock before all hugepage * allocations, so if allocation fails, we are * guaranteed to break here and report the * correct result back to caller. */ goto breakouterloop_mmap_lock; if (cc->progress >= progress_max) goto breakouterloop; } } breakouterloop: mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */ breakouterloop_mmap_lock: spin_lock(&khugepaged_mm_lock); VM_BUG_ON(khugepaged_scan.mm_slot != slot); /* * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm, or THP got disabled. */ if (collapse_test_exit_or_disable(mm) || !vma) { /* * Make sure that if mm_users is reaching zero while * khugepaged runs here, khugepaged_exit will find * mm_slot not pointing to the exiting mm. */ if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) { khugepaged_scan.mm_slot = list_next_entry(slot, mm_node); khugepaged_scan.address = 0; } else { khugepaged_scan.mm_slot = NULL; khugepaged_full_scans++; } collect_mm_slot(slot); } trace_mm_khugepaged_scan(mm, cc->progress - progress_prev, khugepaged_scan.mm_slot == NULL); } static int khugepaged_has_work(void) { return !list_empty(&khugepaged_scan.mm_head) && hugepage_pmd_enabled(); } static int khugepaged_wait_event(void) { return !list_empty(&khugepaged_scan.mm_head) || kthread_should_stop(); } static void khugepaged_do_scan(struct collapse_control *cc) { const unsigned int progress_max = READ_ONCE(khugepaged_pages_to_scan); unsigned int pass_through_head = 0; bool wait = true; enum scan_result result = SCAN_SUCCEED; lru_add_drain_all(); cc->progress = 0; while (true) { cond_resched(); if (unlikely(kthread_should_stop())) break; spin_lock(&khugepaged_mm_lock); if (!khugepaged_scan.mm_slot) pass_through_head++; if (khugepaged_has_work() && pass_through_head < 2) collapse_scan_mm_slot(progress_max, &result, cc); else cc->progress = progress_max; spin_unlock(&khugepaged_mm_lock); if (cc->progress >= progress_max) break; if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) { /* * If fail to allocate the first time, try to sleep for * a while. When hit again, cancel the scan. 
*/ if (!wait) break; wait = false; khugepaged_alloc_sleep(); } } } static bool khugepaged_should_wakeup(void) { return kthread_should_stop() || time_after_eq(jiffies, khugepaged_sleep_expire); } static void khugepaged_wait_work(void) { if (khugepaged_has_work()) { const unsigned long scan_sleep_jiffies = msecs_to_jiffies(khugepaged_scan_sleep_millisecs); if (!scan_sleep_jiffies) return; khugepaged_sleep_expire = jiffies + scan_sleep_jiffies; wait_event_freezable_timeout(khugepaged_wait, khugepaged_should_wakeup(), scan_sleep_jiffies); return; } if (hugepage_pmd_enabled()) wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); } static int khugepaged(void *none) { struct mm_slot *slot; set_freezable(); set_user_nice(current, MAX_NICE); while (!kthread_should_stop()) { khugepaged_do_scan(&khugepaged_collapse_control); khugepaged_wait_work(); } spin_lock(&khugepaged_mm_lock); slot = khugepaged_scan.mm_slot; khugepaged_scan.mm_slot = NULL; if (slot) collect_mm_slot(slot); spin_unlock(&khugepaged_mm_lock); return 0; } void set_recommended_min_free_kbytes(void) { struct zone *zone; int nr_zones = 0; unsigned long recommended_min; if (!hugepage_pmd_enabled()) { calculate_min_free_kbytes(); goto update_wmarks; } for_each_populated_zone(zone) { /* * We don't need to worry about fragmentation of * ZONE_MOVABLE since it only has movable pages. */ if (zone_idx(zone) > gfp_zone(GFP_USER)) continue; nr_zones++; } /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ recommended_min = pageblock_nr_pages * nr_zones * 2; /* * Make sure that on average at least two pageblocks are almost free * of another type, one for a migratetype to fall back to and a * second to avoid subsequent fallbacks of other types There are 3 * MIGRATE_TYPES we care about. */ recommended_min += pageblock_nr_pages * nr_zones * MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; /* don't ever allow to reserve more than 5% of the lowmem */ recommended_min = min(recommended_min, (unsigned long) nr_free_buffer_pages() / 20); recommended_min <<= (PAGE_SHIFT-10); if (recommended_min > min_free_kbytes) { if (user_min_free_kbytes >= 0) pr_info_ratelimited("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n", min_free_kbytes, recommended_min); min_free_kbytes = recommended_min; } update_wmarks: setup_per_zone_wmarks(); } int start_stop_khugepaged(void) { int err = 0; mutex_lock(&khugepaged_mutex); if (hugepage_pmd_enabled()) { if (!khugepaged_thread) khugepaged_thread = kthread_run(khugepaged, NULL, "khugepaged"); if (IS_ERR(khugepaged_thread)) { pr_err("khugepaged: kthread_run(khugepaged) failed\n"); err = PTR_ERR(khugepaged_thread); khugepaged_thread = NULL; goto fail; } if (!list_empty(&khugepaged_scan.mm_head)) wake_up_interruptible(&khugepaged_wait); } else if (khugepaged_thread) { kthread_stop(khugepaged_thread); khugepaged_thread = NULL; } set_recommended_min_free_kbytes(); fail: mutex_unlock(&khugepaged_mutex); return err; } void khugepaged_min_free_kbytes_update(void) { mutex_lock(&khugepaged_mutex); if (hugepage_pmd_enabled() && khugepaged_thread) set_recommended_min_free_kbytes(); mutex_unlock(&khugepaged_mutex); } bool current_is_khugepaged(void) { return kthread_func(current) == khugepaged; } static int madvise_collapse_errno(enum scan_result r) { /* * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide * actionable feedback to caller, so they may take an appropriate * fallback measure depending on the nature of the failure. 
*/ switch (r) { case SCAN_ALLOC_HUGE_PAGE_FAIL: return -ENOMEM; case SCAN_CGROUP_CHARGE_FAIL: case SCAN_EXCEED_NONE_PTE: return -EBUSY; /* Resource temporary unavailable - trying again might succeed */ case SCAN_PAGE_COUNT: case SCAN_PAGE_LOCK: case SCAN_PAGE_LRU: case SCAN_DEL_PAGE_LRU: case SCAN_PAGE_FILLED: case SCAN_PAGE_DIRTY_OR_WRITEBACK: return -EAGAIN; /* * Other: Trying again likely not to succeed / error intrinsic to * specified memory range. khugepaged likely won't be able to collapse * either. */ default: return -EINVAL; } } int madvise_collapse(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool *lock_dropped) { struct collapse_control *cc; struct mm_struct *mm = vma->vm_mm; unsigned long hstart, hend, addr; enum scan_result last_fail = SCAN_FAIL; int thps = 0; bool mmap_unlocked = false; BUG_ON(vma->vm_start > start); BUG_ON(vma->vm_end < end); if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) return -EINVAL; cc = kmalloc_obj(*cc); if (!cc) return -ENOMEM; cc->is_khugepaged = false; cc->progress = 0; mmgrab(mm); lru_add_drain_all(); hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = end & HPAGE_PMD_MASK; for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { enum scan_result result = SCAN_FAIL; if (mmap_unlocked) { cond_resched(); mmap_read_lock(mm); mmap_unlocked = false; *lock_dropped = true; result = hugepage_vma_revalidate(mm, addr, false, &vma, cc); if (result != SCAN_SUCCEED) { last_fail = result; goto out_nolock; } hend = min(hend, vma->vm_end & HPAGE_PMD_MASK); } result = collapse_single_pmd(addr, vma, &mmap_unlocked, cc); switch (result) { case SCAN_SUCCEED: case SCAN_PMD_MAPPED: ++thps; break; /* Whitelisted set of results where continuing OK */ case SCAN_NO_PTE_TABLE: case SCAN_PTE_NON_PRESENT: case SCAN_PTE_UFFD_WP: case SCAN_LACK_REFERENCED_PAGE: case SCAN_PAGE_NULL: case SCAN_PAGE_COUNT: case SCAN_PAGE_LOCK: case SCAN_PAGE_COMPOUND: case SCAN_PAGE_LRU: case SCAN_DEL_PAGE_LRU: last_fail = result; break; default: last_fail = result; /* Other error, exit */ goto out_maybelock; } } out_maybelock: /* Caller expects us to hold mmap_lock on return */ if (mmap_unlocked) { *lock_dropped = true; mmap_read_lock(mm); } out_nolock: mmap_assert_locked(mm); mmdrop(mm); kfree(cc); return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0 : madvise_collapse_errno(last_fail); } |
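The collapse paths above are reached from two directions: khugepaged scanning mms in the background, and userspace explicitly requesting a synchronous collapse through madvise(2), which lands in madvise_collapse(). Below is a minimal userspace sketch of that second path; it is illustrative only — the 4 MiB size and the fallback MADV_COLLAPSE definition are assumptions for the example, not taken from the sources above.

/*
 * Illustrative userspace sketch: populate an anonymous region with small
 * pages, then ask the kernel to collapse it via MADV_COLLAPSE, the advice
 * handled by madvise_collapse() above. Error handling is intentionally thin.
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE	25	/* value from <asm-generic/mman-common.h> on newer kernels */
#endif

#define EXAMPLE_LEN	(4UL << 20)	/* 4 MiB: covers at least one aligned PMD range */

int main(void)
{
	char *buf = mmap(NULL, EXAMPLE_LEN, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Fault the range in as small pages first. */
	memset(buf, 0x5a, EXAMPLE_LEN);

	/*
	 * Only PMD-aligned, PMD-sized subranges are collapsed; errno values
	 * such as EAGAIN or EINVAL map back to madvise_collapse_errno() above.
	 */
	if (madvise(buf, EXAMPLE_LEN, MADV_COLLAPSE))
		perror("madvise(MADV_COLLAPSE)");

	munmap(buf, EXAMPLE_LEN);
	return 0;
}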
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGALLOC_H
#define _ASM_X86_PGALLOC_H

#include <linux/threads.h>
#include <linux/mm.h>		/* for struct page */
#include <linux/pagemap.h>

#include <asm/cpufeature.h>

#define __HAVE_ARCH_PTE_ALLOC_ONE
#define __HAVE_ARCH_PGD_FREE
#include <asm-generic/pgalloc.h>

static inline int __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }

#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
#define paravirt_pgd_alloc(mm)	__paravirt_pgd_alloc(mm)
static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {}
static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
					    unsigned long start, unsigned long count) {}
static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {}
static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn) {}
static inline void paravirt_release_pte(unsigned long pfn) {}
static inline void paravirt_release_pmd(unsigned long pfn) {}
static inline void paravirt_release_pud(unsigned long pfn) {}
static inline void paravirt_release_p4d(unsigned long pfn) {}
#endif

/*
 * In case of Page Table Isolation active, we acquire two PGDs instead of one.
 * Being order-1, it is both 8k in size and 8k-aligned. That lets us just
 * flip bit 12 in a pointer to swap between the two 4k halves.
 */
static inline unsigned int pgd_allocation_order(void)
{
	if (cpu_feature_enabled(X86_FEATURE_PTI))
		return 1;
	return 0;
}

/*
 * Allocate and free page tables.
 */
extern pgd_t *pgd_alloc(struct mm_struct *);
extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);

extern pgtable_t pte_alloc_one(struct mm_struct *);

extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte);

static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
				  unsigned long address)
{
	___pte_free_tlb(tlb, pte);
}

static inline void pmd_populate_kernel(struct mm_struct *mm,
				       pmd_t *pmd, pte_t *pte)
{
	paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT);
	set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
}

static inline void pmd_populate_kernel_safe(struct mm_struct *mm,
					    pmd_t *pmd, pte_t *pte)
{
	paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT);
	set_pmd_safe(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
}

static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
				struct page *pte)
{
	unsigned long pfn = page_to_pfn(pte);

	paravirt_alloc_pte(mm, pfn);
	set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
}

#if CONFIG_PGTABLE_LEVELS > 2
extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);

static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
				  unsigned long address)
{
	___pmd_free_tlb(tlb, pmd);
}

#ifdef CONFIG_X86_PAE
extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd);
#else	/* !CONFIG_X86_PAE */
static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
{
	paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
	set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
}

static inline void pud_populate_safe(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
{
	paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
	set_pud_safe(pud, __pud(_PAGE_TABLE | __pa(pmd)));
}
#endif	/* CONFIG_X86_PAE */

#if CONFIG_PGTABLE_LEVELS > 3
static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
{
	paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
	set_p4d(p4d, __p4d(_PAGE_TABLE | __pa(pud)));
}

static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
{
	paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
	set_p4d_safe(p4d, __p4d(_PAGE_TABLE | __pa(pud)));
}

extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);

static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
				  unsigned long address)
{
	___pud_free_tlb(tlb, pud);
}

#if CONFIG_PGTABLE_LEVELS > 4
static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
{
	if (!pgtable_l5_enabled())
		return;
	paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT);
	set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d)));
}

static inline void pgd_populate_safe(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
{
	if (!pgtable_l5_enabled())
		return;
	paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT);
	set_pgd_safe(pgd, __pgd(_PAGE_TABLE | __pa(p4d)));
}

extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d);

static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
				  unsigned long address)
{
	if (pgtable_l5_enabled())
		___p4d_free_tlb(tlb, p4d);
}

#endif	/* CONFIG_PGTABLE_LEVELS > 4 */
#endif	/* CONFIG_PGTABLE_LEVELS > 3 */
#endif	/* CONFIG_PGTABLE_LEVELS > 2 */

#endif /* _ASM_X86_PGALLOC_H */
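To make the PTI comment above concrete ("flip bit 12 in a pointer to swap between the two 4k halves"), here is a small sketch of what an order-1 PGD allocation and the kernel/user half switch could look like. The example_* names are invented for illustration and are not the kernel's own helpers; the real implementations live in arch/x86/include/asm/pgtable.h and arch/x86/mm/pgtable.c.

/*
 * Illustrative sketch only: with PTI, the PGD is an order-1 (8 KiB,
 * 8 KiB-aligned) allocation, so the kernel-mode PGD occupies the first
 * 4 KiB page and the user-mode shadow PGD the second. Toggling bit 12
 * (PAGE_SHIFT) of the pointer moves between the two halves.
 */
#define EXAMPLE_PTI_SWITCH_BIT	PAGE_SHIFT	/* bit 12 with 4 KiB pages */

static inline pgd_t *example_kernel_to_user_pgdp(pgd_t *pgdp)
{
	return (pgd_t *)((unsigned long)pgdp | (1UL << EXAMPLE_PTI_SWITCH_BIT));
}

static inline pgd_t *example_user_to_kernel_pgdp(pgd_t *pgdp)
{
	return (pgd_t *)((unsigned long)pgdp & ~(1UL << EXAMPLE_PTI_SWITCH_BIT));
}

/*
 * An allocation that respects pgd_allocation_order(): order 1 when PTI is
 * enabled, order 0 otherwise (again a sketch, not the real pgd_alloc()).
 */
static inline pgd_t *example_alloc_pgd_pages(void)
{
	return (pgd_t *)__get_free_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
					 pgd_allocation_order());
}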
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NDISC_H
#define _NDISC_H

/*
 *	ICMP codes for neighbour discovery messages
 */

#define NDISC_ROUTER_SOLICITATION	133
#define NDISC_ROUTER_ADVERTISEMENT	134
#define NDISC_NEIGHBOUR_SOLICITATION	135
#define NDISC_NEIGHBOUR_ADVERTISEMENT	136
#define NDISC_REDIRECT			137

/*
 * Router type: cross-layer information from link-layer to
 * IPv6 layer reported by certain link types (e.g., RFC4214).
*/ #define NDISC_NODETYPE_UNSPEC 0 /* unspecified (default) */ #define NDISC_NODETYPE_HOST 1 /* host or unauthorized router */ #define NDISC_NODETYPE_NODEFAULT 2 /* non-default router */ #define NDISC_NODETYPE_DEFAULT 3 /* default router */ /* * ndisc options */ enum { __ND_OPT_PREFIX_INFO_END = 0, ND_OPT_SOURCE_LL_ADDR = 1, /* RFC2461 */ ND_OPT_TARGET_LL_ADDR = 2, /* RFC2461 */ ND_OPT_PREFIX_INFO = 3, /* RFC2461 */ ND_OPT_REDIRECT_HDR = 4, /* RFC2461 */ ND_OPT_MTU = 5, /* RFC2461 */ ND_OPT_NONCE = 14, /* RFC7527 */ __ND_OPT_ARRAY_MAX, ND_OPT_ROUTE_INFO = 24, /* RFC4191 */ ND_OPT_RDNSS = 25, /* RFC5006 */ ND_OPT_DNSSL = 31, /* RFC6106 */ ND_OPT_6CO = 34, /* RFC6775 */ ND_OPT_CAPTIVE_PORTAL = 37, /* RFC7710 */ ND_OPT_PREF64 = 38, /* RFC8781 */ __ND_OPT_MAX }; #define MAX_RTR_SOLICITATION_DELAY HZ #define ND_REACHABLE_TIME (30*HZ) #define ND_RETRANS_TIMER HZ #include <linux/compiler.h> #include <linux/icmpv6.h> #include <linux/in6.h> #include <linux/types.h> #include <linux/if_arp.h> #include <linux/netdevice.h> #include <linux/hash.h> #include <net/neighbour.h> struct ctl_table; struct inet6_dev; struct net_device; struct net_proto_family; struct sk_buff; struct prefix_info; extern struct neigh_table nd_tbl; struct nd_msg { struct icmp6hdr icmph; struct in6_addr target; __u8 opt[]; }; struct rs_msg { struct icmp6hdr icmph; __u8 opt[]; }; struct ra_msg { struct icmp6hdr icmph; __be32 reachable_time; __be32 retrans_timer; }; struct rd_msg { struct icmp6hdr icmph; struct in6_addr target; struct in6_addr dest; __u8 opt[]; }; struct nd_opt_hdr { __u8 nd_opt_type; __u8 nd_opt_len; } __packed; /* ND options */ struct ndisc_options { struct nd_opt_hdr *nd_opt_array[__ND_OPT_ARRAY_MAX]; #ifdef CONFIG_IPV6_ROUTE_INFO struct nd_opt_hdr *nd_opts_ri; struct nd_opt_hdr *nd_opts_ri_end; #endif struct nd_opt_hdr *nd_useropts; struct nd_opt_hdr *nd_useropts_end; #if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN) struct nd_opt_hdr *nd_802154_opt_array[ND_OPT_TARGET_LL_ADDR + 1]; #endif }; #define nd_opts_src_lladdr nd_opt_array[ND_OPT_SOURCE_LL_ADDR] #define nd_opts_tgt_lladdr nd_opt_array[ND_OPT_TARGET_LL_ADDR] #define nd_opts_pi nd_opt_array[ND_OPT_PREFIX_INFO] #define nd_opts_pi_end nd_opt_array[__ND_OPT_PREFIX_INFO_END] #define nd_opts_rh nd_opt_array[ND_OPT_REDIRECT_HDR] #define nd_opts_mtu nd_opt_array[ND_OPT_MTU] #define nd_opts_nonce nd_opt_array[ND_OPT_NONCE] #define nd_802154_opts_src_lladdr nd_802154_opt_array[ND_OPT_SOURCE_LL_ADDR] #define nd_802154_opts_tgt_lladdr nd_802154_opt_array[ND_OPT_TARGET_LL_ADDR] #define NDISC_OPT_SPACE(len) (((len)+2+7)&~7) struct ndisc_options *ndisc_parse_options(const struct net_device *dev, u8 *opt, int opt_len, struct ndisc_options *ndopts); void __ndisc_fill_addr_option(struct sk_buff *skb, int type, const void *data, int data_len, int pad); #define NDISC_OPS_REDIRECT_DATA_SPACE 2 /* * This structure defines the hooks for IPv6 neighbour discovery. * The following hooks can be defined; unless noted otherwise, they are * optional and can be filled with a null pointer. * * int (*parse_options)(const struct net_device *dev, * struct nd_opt_hdr *nd_opt, * struct ndisc_options *ndopts): * This function is called while parsing ndisc ops and put each position * as pointer into ndopts. If this function return unequal 0, then this * function took care about the ndisc option, if 0 then the IPv6 ndisc * option parser will take care about that option. 
 *
 * void (*update)(const struct net_device *dev, struct neighbour *n,
 *		  u32 flags, u8 icmp6_type,
 *		  const struct ndisc_options *ndopts):
 *	This function is called when IPv6 ndisc updates the neighbour cache
 *	entry. Additional options which can be updated may have been
 *	previously parsed by the parse_options callback and are accessible
 *	via the ndopts parameter.
 *
 * int (*opt_addr_space)(const struct net_device *dev, u8 icmp6_type,
 *			 struct neighbour *neigh, u8 *ha_buf,
 *			 u8 **ha):
 *	This function is called when the necessary option space is
 *	calculated before allocating a skb. The parameters neigh, ha_buf
 *	and ha are available on NDISC_REDIRECT messages only.
 *
 * void (*fill_addr_option)(const struct net_device *dev,
 *			    struct sk_buff *skb, u8 icmp6_type,
 *			    const u8 *ha):
 *	This function is called when the option fields are finally filled
 *	into the skb. NOTE: this callback should fill exactly the option
 *	fields that were previously accounted for by opt_addr_space. That
 *	means the decision to add such an option must not be lost between
 *	these two callbacks, e.g. it can be protected by the interface up
 *	state.
 *
 * void (*prefix_rcv_add_addr)(struct net *net, struct net_device *dev,
 *			       const struct prefix_info *pinfo,
 *			       struct inet6_dev *in6_dev,
 *			       struct in6_addr *addr,
 *			       int addr_type, u32 addr_flags,
 *			       bool sllao, bool tokenized,
 *			       __u32 valid_lft, u32 prefered_lft,
 *			       bool dev_addr_generated):
 *	This function is called when an RA message is received with valid
 *	PIO option fields and an IPv6 address will be added to the interface
 *	for autoconfiguration. The parameter dev_addr_generated reports
 *	whether the address was based on dev->dev_addr or not. This can be
 *	used to add a second address if the link layer operates with two
 *	link-layer addresses, e.g. 802.15.4 6LoWPAN.
*/ struct ndisc_ops { int (*parse_options)(const struct net_device *dev, struct nd_opt_hdr *nd_opt, struct ndisc_options *ndopts); void (*update)(const struct net_device *dev, struct neighbour *n, u32 flags, u8 icmp6_type, const struct ndisc_options *ndopts); int (*opt_addr_space)(const struct net_device *dev, u8 icmp6_type, struct neighbour *neigh, u8 *ha_buf, u8 **ha); void (*fill_addr_option)(const struct net_device *dev, struct sk_buff *skb, u8 icmp6_type, const u8 *ha); void (*prefix_rcv_add_addr)(struct net *net, struct net_device *dev, const struct prefix_info *pinfo, struct inet6_dev *in6_dev, struct in6_addr *addr, int addr_type, u32 addr_flags, bool sllao, bool tokenized, __u32 valid_lft, u32 prefered_lft, bool dev_addr_generated); }; #if IS_ENABLED(CONFIG_IPV6) static inline int ndisc_ops_parse_options(const struct net_device *dev, struct nd_opt_hdr *nd_opt, struct ndisc_options *ndopts) { if (dev->ndisc_ops && dev->ndisc_ops->parse_options) return dev->ndisc_ops->parse_options(dev, nd_opt, ndopts); else return 0; } static inline void ndisc_ops_update(const struct net_device *dev, struct neighbour *n, u32 flags, u8 icmp6_type, const struct ndisc_options *ndopts) { if (dev->ndisc_ops && dev->ndisc_ops->update) dev->ndisc_ops->update(dev, n, flags, icmp6_type, ndopts); } static inline int ndisc_ops_opt_addr_space(const struct net_device *dev, u8 icmp6_type) { if (dev->ndisc_ops && dev->ndisc_ops->opt_addr_space && icmp6_type != NDISC_REDIRECT) return dev->ndisc_ops->opt_addr_space(dev, icmp6_type, NULL, NULL, NULL); else return 0; } static inline int ndisc_ops_redirect_opt_addr_space(const struct net_device *dev, struct neighbour *neigh, u8 *ha_buf, u8 **ha) { if (dev->ndisc_ops && dev->ndisc_ops->opt_addr_space) return dev->ndisc_ops->opt_addr_space(dev, NDISC_REDIRECT, neigh, ha_buf, ha); else return 0; } static inline void ndisc_ops_fill_addr_option(const struct net_device *dev, struct sk_buff *skb, u8 icmp6_type) { if (dev->ndisc_ops && dev->ndisc_ops->fill_addr_option && icmp6_type != NDISC_REDIRECT) dev->ndisc_ops->fill_addr_option(dev, skb, icmp6_type, NULL); } static inline void ndisc_ops_fill_redirect_addr_option(const struct net_device *dev, struct sk_buff *skb, const u8 *ha) { if (dev->ndisc_ops && dev->ndisc_ops->fill_addr_option) dev->ndisc_ops->fill_addr_option(dev, skb, NDISC_REDIRECT, ha); } static inline void ndisc_ops_prefix_rcv_add_addr(struct net *net, struct net_device *dev, const struct prefix_info *pinfo, struct inet6_dev *in6_dev, struct in6_addr *addr, int addr_type, u32 addr_flags, bool sllao, bool tokenized, __u32 valid_lft, u32 prefered_lft, bool dev_addr_generated) { if (dev->ndisc_ops && dev->ndisc_ops->prefix_rcv_add_addr) dev->ndisc_ops->prefix_rcv_add_addr(net, dev, pinfo, in6_dev, addr, addr_type, addr_flags, sllao, tokenized, valid_lft, prefered_lft, dev_addr_generated); } #endif /* * Return the padding between the option length and the start of the * link addr. Currently only IP-over-InfiniBand needs this, although * if RFC 3831 IPv6-over-Fibre Channel is ever implemented it may * also need a pad of 2. 
*/ static inline int ndisc_addr_option_pad(unsigned short type) { switch (type) { case ARPHRD_INFINIBAND: return 2; default: return 0; } } static inline int __ndisc_opt_addr_space(unsigned char addr_len, int pad) { return NDISC_OPT_SPACE(addr_len + pad); } #if IS_ENABLED(CONFIG_IPV6) static inline int ndisc_opt_addr_space(struct net_device *dev, u8 icmp6_type) { return __ndisc_opt_addr_space(dev->addr_len, ndisc_addr_option_pad(dev->type)) + ndisc_ops_opt_addr_space(dev, icmp6_type); } static inline int ndisc_redirect_opt_addr_space(struct net_device *dev, struct neighbour *neigh, u8 *ops_data_buf, u8 **ops_data) { return __ndisc_opt_addr_space(dev->addr_len, ndisc_addr_option_pad(dev->type)) + ndisc_ops_redirect_opt_addr_space(dev, neigh, ops_data_buf, ops_data); } #endif static inline u8 *__ndisc_opt_addr_data(struct nd_opt_hdr *p, unsigned char addr_len, int prepad) { u8 *lladdr = (u8 *)(p + 1); int lladdrlen = p->nd_opt_len << 3; if (lladdrlen != __ndisc_opt_addr_space(addr_len, prepad)) return NULL; return lladdr + prepad; } static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p, struct net_device *dev) { return __ndisc_opt_addr_data(p, dev->addr_len, ndisc_addr_option_pad(dev->type)); } static inline u32 ndisc_hashfn(const void *pkey, const struct net_device *dev, __u32 *hash_rnd) { const u32 *p32 = pkey; return (((p32[0] ^ hash32_ptr(dev)) * hash_rnd[0]) + (p32[1] * hash_rnd[1]) + (p32[2] * hash_rnd[2]) + (p32[3] * hash_rnd[3])); } static inline struct neighbour *__ipv6_neigh_lookup_noref(struct net_device *dev, const void *pkey) { return ___neigh_lookup_noref(&nd_tbl, neigh_key_eq128, ndisc_hashfn, pkey, dev); } static inline struct neighbour *__ipv6_neigh_lookup(struct net_device *dev, const void *pkey) { struct neighbour *n; rcu_read_lock(); n = __ipv6_neigh_lookup_noref(dev, pkey); if (n && !refcount_inc_not_zero(&n->refcnt)) n = NULL; rcu_read_unlock(); return n; } static inline void __ipv6_confirm_neigh(struct net_device *dev, const void *pkey) { struct neighbour *n; rcu_read_lock(); n = __ipv6_neigh_lookup_noref(dev, pkey); neigh_confirm(n); rcu_read_unlock(); } static inline struct neighbour *ip_neigh_gw6(struct net_device *dev, const void *addr) { #if IS_ENABLED(CONFIG_IPV6) struct neighbour *neigh; neigh = __ipv6_neigh_lookup_noref(dev, addr); if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, addr, dev, false); return neigh; #else return ERR_PTR(-EAFNOSUPPORT); #endif } int ndisc_init(void); int ndisc_late_init(void); void ndisc_late_cleanup(void); void ndisc_cleanup(void); enum skb_drop_reason ndisc_rcv(struct sk_buff *skb); struct sk_buff *ndisc_ns_create(struct net_device *dev, const struct in6_addr *solicit, const struct in6_addr *saddr, u64 nonce); void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, const struct in6_addr *daddr, const struct in6_addr *saddr, u64 nonce); void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, const struct in6_addr *saddr); void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr); void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr, const struct in6_addr *solicited_addr, bool router, bool solicited, bool override, bool inc_opt); void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target); int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev, int dir); void ndisc_update(const struct net_device *dev, struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u8 icmp6_type, struct 
ndisc_options *ndopts); /* * IGMP */ int igmp6_init(void); int igmp6_late_init(void); void igmp6_cleanup(void); void igmp6_late_cleanup(void); void igmp6_event_query(struct sk_buff *skb); void igmp6_event_report(struct sk_buff *skb); #ifdef CONFIG_SYSCTL int ndisc_ifinfo_sysctl_change(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); #endif void inet6_ifinfo_notify(int event, struct inet6_dev *idev); #endif |
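/*
 * Illustrative sketch (not part of the header above): a minimal ndisc_ops
 * user as described in the struct ndisc_ops comment block. The parse_options
 * callback claims an option by returning non-zero; returning 0 lets the
 * generic IPv6 parser handle it. The function and variable names here are
 * hypothetical examples.
 */
#if 0	/* example only */
static int example_parse_options(const struct net_device *dev,
				 struct nd_opt_hdr *nd_opt,
				 struct ndisc_options *ndopts)
{
	/* Claim the source link-layer address option (a real driver would
	 * also record its position in ndopts); everything else falls back
	 * to the core parser by returning 0.
	 */
	return nd_opt->nd_opt_type == ND_OPT_SOURCE_LL_ADDR;
}

static const struct ndisc_ops example_ndisc_ops = {
	.parse_options	= example_parse_options,
};
#endif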
/* SPDX-License-Identifier: GPL-2.0
 *
 *	page_pool/helpers.h
 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
 *	Copyright (C) 2016 Red Hat, Inc.
 */

/**
 * DOC: page_pool allocator
 *
 * The page_pool allocator is optimized for recycling pages or page fragments
 * used by skb packets and xdp frames.
 *
 * Basic use involves replacing any alloc_pages() calls with page_pool_alloc(),
 * which allocates memory with or without page splitting depending on the
 * requested memory size.
 *
 * If the driver knows that it always requires full pages or its allocations
 * are always smaller than half a page, it can use one of the more specific
 * API calls:
 *
 * 1. page_pool_alloc_pages(): allocate memory without page splitting when the
 * driver knows that the memory it needs is always bigger than half of the page
 * allocated from the page pool. There is no cache line dirtying for
 * 'struct page' when a page is recycled back to the page pool.
 *
 * 2. page_pool_alloc_frag(): allocate memory with page splitting when the
 * driver knows that the memory it needs is always smaller than or equal to
 * half of the page allocated from the page pool. Page splitting enables memory
 * saving and thus avoids TLB/cache misses for data access, but there is also
 * some cost to implement page splitting, mainly some cache line
 * dirtying/bouncing for 'struct page' and an atomic operation for
 * page->pp_ref_count.
* * The API keeps track of in-flight pages, in order to let API users know when * it is safe to free a page_pool object, the API users must call * page_pool_put_page() or page_pool_free_va() to free the page_pool object, or * attach the page_pool object to a page_pool-aware object like skbs marked with * skb_mark_for_recycle(). * * page_pool_put_page() may be called multiple times on the same page if a page * is split into multiple fragments. For the last fragment, it will either * recycle the page, or in case of page->_refcount > 1, it will release the DMA * mapping and in-flight state accounting. * * dma_sync_single_range_for_device() is only called for the last fragment when * page_pool is created with PP_FLAG_DMA_SYNC_DEV flag, so it depends on the * last freed fragment to do the sync_for_device operation for all fragments in * the same page when a page is split. The API user must setup pool->p.max_len * and pool->p.offset correctly and ensure that page_pool_put_page() is called * with dma_sync_size being -1 for fragment API. */ #ifndef _NET_PAGE_POOL_HELPERS_H #define _NET_PAGE_POOL_HELPERS_H #include <linux/dma-mapping.h> #include <net/page_pool/types.h> #include <net/net_debug.h> #include <net/netmem.h> #ifdef CONFIG_PAGE_POOL_STATS /* Deprecated driver-facing API, use netlink instead */ int page_pool_ethtool_stats_get_count(void); u8 *page_pool_ethtool_stats_get_strings(u8 *data); u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats); bool page_pool_get_stats(const struct page_pool *pool, struct page_pool_stats *stats); #else static inline int page_pool_ethtool_stats_get_count(void) { return 0; } static inline u8 *page_pool_ethtool_stats_get_strings(u8 *data) { return data; } static inline u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats) { return data; } #endif /** * page_pool_dev_alloc_pages() - allocate a page. * @pool: pool from which to allocate * * Get a page from the page allocator or page_pool caches. */ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc_pages(pool, gfp); } /** * page_pool_dev_alloc_frag() - allocate a page fragment. * @pool: pool from which to allocate * @offset: offset to the allocated page * @size: requested size * * Get a page fragment from the page allocator or page_pool caches. * * Return: allocated page fragment, otherwise return NULL. */ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, unsigned int *offset, unsigned int size) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc_frag(pool, offset, size, gfp); } static inline netmem_ref page_pool_alloc_netmem(struct page_pool *pool, unsigned int *offset, unsigned int *size, gfp_t gfp) { unsigned int max_size = PAGE_SIZE << pool->p.order; netmem_ref netmem; if ((*size << 1) > max_size) { *size = max_size; *offset = 0; return page_pool_alloc_netmems(pool, gfp); } netmem = page_pool_alloc_frag_netmem(pool, offset, *size, gfp); if (unlikely(!netmem)) return 0; /* There is very likely not enough space for another fragment, so append * the remaining size to the current fragment to avoid truesize * underestimate problem. 
*/ if (pool->frag_offset + *size > max_size) { *size = max_size - *offset; pool->frag_offset = max_size; } return netmem; } static inline netmem_ref page_pool_dev_alloc_netmem(struct page_pool *pool, unsigned int *offset, unsigned int *size) { gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; return page_pool_alloc_netmem(pool, offset, size, gfp); } static inline netmem_ref page_pool_dev_alloc_netmems(struct page_pool *pool) { gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; return page_pool_alloc_netmems(pool, gfp); } static inline struct page *page_pool_alloc(struct page_pool *pool, unsigned int *offset, unsigned int *size, gfp_t gfp) { return netmem_to_page(page_pool_alloc_netmem(pool, offset, size, gfp)); } /** * page_pool_dev_alloc() - allocate a page or a page fragment. * @pool: pool from which to allocate * @offset: offset to the allocated page * @size: in as the requested size, out as the allocated size * * Get a page or a page fragment from the page allocator or page_pool caches * depending on the requested size in order to allocate memory with least memory * utilization and performance penalty. * * Return: allocated page or page fragment, otherwise return NULL. */ static inline struct page *page_pool_dev_alloc(struct page_pool *pool, unsigned int *offset, unsigned int *size) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc(pool, offset, size, gfp); } static inline void *page_pool_alloc_va(struct page_pool *pool, unsigned int *size, gfp_t gfp) { unsigned int offset; struct page *page; /* Mask off __GFP_HIGHMEM to ensure we can use page_address() */ page = page_pool_alloc(pool, &offset, size, gfp & ~__GFP_HIGHMEM); if (unlikely(!page)) return NULL; return page_address(page) + offset; } /** * page_pool_dev_alloc_va() - allocate a page or a page fragment and return its * va. * @pool: pool from which to allocate * @size: in as the requested size, out as the allocated size * * This is just a thin wrapper around the page_pool_alloc() API, and * it returns va of the allocated page or page fragment. * * Return: the va for the allocated page or page fragment, otherwise return NULL. */ static inline void *page_pool_dev_alloc_va(struct page_pool *pool, unsigned int *size) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc_va(pool, size, gfp); } /** * page_pool_get_dma_dir() - Retrieve the stored DMA direction. * @pool: pool from which page was allocated * * Get the stored dma direction. A driver might decide to store this locally * and avoid the extra cache line from page_pool to determine the direction. */ static inline enum dma_data_direction page_pool_get_dma_dir(const struct page_pool *pool) { return pool->p.dma_dir; } static inline void page_pool_fragment_netmem(netmem_ref netmem, long nr) { atomic_long_set(netmem_get_pp_ref_count_ref(netmem), nr); } /** * page_pool_fragment_page() - split a fresh page into fragments * @page: page to split * @nr: references to set * * pp_ref_count represents the number of outstanding references to the page, * which will be freed using page_pool APIs (rather than page allocator APIs * like put_page()). Such references are usually held by page_pool-aware * objects like skbs marked for page pool recycling. * * This helper allows the caller to take (set) multiple references to a * freshly allocated page. The page must be freshly allocated (have a * pp_ref_count of 1). 
This is commonly done by drivers and * "fragment allocators" to save atomic operations - either when they know * upfront how many references they will need; or to take MAX references and * return the unused ones with a single atomic dec(), instead of performing * multiple atomic inc() operations. */ static inline void page_pool_fragment_page(struct page *page, long nr) { page_pool_fragment_netmem(page_to_netmem(page), nr); } static inline long page_pool_unref_netmem(netmem_ref netmem, long nr) { atomic_long_t *pp_ref_count = netmem_get_pp_ref_count_ref(netmem); long ret; /* If nr == pp_ref_count then we have cleared all remaining * references to the page: * 1. 'n == 1': no need to actually overwrite it. * 2. 'n != 1': overwrite it with one, which is the rare case * for pp_ref_count draining. * * The main advantage to doing this is that not only we avoid a atomic * update, as an atomic_read is generally a much cheaper operation than * an atomic update, especially when dealing with a page that may be * referenced by only 2 or 3 users; but also unify the pp_ref_count * handling by ensuring all pages have partitioned into only 1 piece * initially, and only overwrite it when the page is partitioned into * more than one piece. */ if (atomic_long_read(pp_ref_count) == nr) { /* As we have ensured nr is always one for constant case using * the BUILD_BUG_ON(), only need to handle the non-constant case * here for pp_ref_count draining, which is a rare case. */ BUILD_BUG_ON(__builtin_constant_p(nr) && nr != 1); if (!__builtin_constant_p(nr)) atomic_long_set(pp_ref_count, 1); return 0; } ret = atomic_long_sub_return(nr, pp_ref_count); WARN_ON(ret < 0); /* We are the last user here too, reset pp_ref_count back to 1 to * ensure all pages have been partitioned into 1 piece initially, * this should be the rare case when the last two fragment users call * page_pool_unref_page() currently. */ if (unlikely(!ret)) atomic_long_set(pp_ref_count, 1); return ret; } static inline long page_pool_unref_page(struct page *page, long nr) { return page_pool_unref_netmem(page_to_netmem(page), nr); } static inline void page_pool_ref_netmem(netmem_ref netmem) { atomic_long_inc(netmem_get_pp_ref_count_ref(netmem)); } static inline void page_pool_ref_page(struct page *page) { page_pool_ref_netmem(page_to_netmem(page)); } static inline bool page_pool_unref_and_test(netmem_ref netmem) { /* If page_pool_unref_page() returns 0, we were the last user */ return page_pool_unref_netmem(netmem, 1) == 0; } static inline void page_pool_put_netmem(struct page_pool *pool, netmem_ref netmem, unsigned int dma_sync_size, bool allow_direct) { /* When page_pool isn't compiled-in, net/core/xdp.c doesn't * allow registering MEM_TYPE_PAGE_POOL, but shield linker. */ #ifdef CONFIG_PAGE_POOL if (!page_pool_unref_and_test(netmem)) return; page_pool_put_unrefed_netmem(pool, netmem, dma_sync_size, allow_direct); #endif } /** * page_pool_put_page() - release a reference to a page pool page * @pool: pool from which page was allocated * @page: page to release a reference on * @dma_sync_size: how much of the page may have been touched by the device * @allow_direct: released by the consumer, allow lockless caching * * The outcome of this depends on the page refcnt. If the driver bumps * the refcnt > 1 this will unmap the page. If the page refcnt is 1 * the allocator owns the page and will try to recycle it in one of the pool * caches. If PP_FLAG_DMA_SYNC_DEV is set, the page will be synced for_device * using dma_sync_single_range_for_device(). 
*/ static inline void page_pool_put_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { page_pool_put_netmem(pool, page_to_netmem(page), dma_sync_size, allow_direct); } static inline void page_pool_put_full_netmem(struct page_pool *pool, netmem_ref netmem, bool allow_direct) { page_pool_put_netmem(pool, netmem, -1, allow_direct); } /** * page_pool_put_full_page() - release a reference on a page pool page * @pool: pool from which page was allocated * @page: page to release a reference on * @allow_direct: released by the consumer, allow lockless caching * * Similar to page_pool_put_page(), but will DMA sync the entire memory area * as configured in &page_pool_params.max_len. */ static inline void page_pool_put_full_page(struct page_pool *pool, struct page *page, bool allow_direct) { page_pool_put_netmem(pool, page_to_netmem(page), -1, allow_direct); } /** * page_pool_recycle_direct() - release a reference on a page pool page * @pool: pool from which page was allocated * @page: page to release a reference on * * Similar to page_pool_put_full_page() but caller must guarantee safe context * (e.g NAPI), since it will recycle the page directly into the pool fast cache. */ static inline void page_pool_recycle_direct(struct page_pool *pool, struct page *page) { page_pool_put_full_page(pool, page, true); } static inline void page_pool_recycle_direct_netmem(struct page_pool *pool, netmem_ref netmem) { page_pool_put_full_netmem(pool, netmem, true); } #define PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA \ (sizeof(dma_addr_t) > sizeof(unsigned long)) /** * page_pool_free_va() - free a va into the page_pool * @pool: pool from which va was allocated * @va: va to be freed * @allow_direct: freed by the consumer, allow lockless caching * * Free a va allocated from page_pool_allo_va(). */ static inline void page_pool_free_va(struct page_pool *pool, void *va, bool allow_direct) { page_pool_put_page(pool, virt_to_head_page(va), -1, allow_direct); } static inline dma_addr_t page_pool_get_dma_addr_netmem(netmem_ref netmem) { dma_addr_t ret = netmem_get_dma_addr(netmem); if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) ret <<= PAGE_SHIFT; return ret; } /** * page_pool_get_dma_addr() - Retrieve the stored DMA address. * @page: page allocated from a page pool * * Fetch the DMA address of the page. The page pool to which the page belongs * must had been created with PP_FLAG_DMA_MAP. */ static inline dma_addr_t page_pool_get_dma_addr(const struct page *page) { return page_pool_get_dma_addr_netmem(page_to_netmem(page)); } static inline void __page_pool_dma_sync_for_cpu(const struct page_pool *pool, const dma_addr_t dma_addr, u32 offset, u32 dma_sync_size) { dma_sync_single_range_for_cpu(pool->p.dev, dma_addr, offset + pool->p.offset, dma_sync_size, page_pool_get_dma_dir(pool)); } /** * page_pool_dma_sync_for_cpu - sync Rx page for CPU after it's written by HW * @pool: &page_pool the @page belongs to * @page: page to sync * @offset: offset from page start to "hard" start if using PP frags * @dma_sync_size: size of the data written to the page * * Can be used as a shorthand to sync Rx pages before accessing them in the * driver. Caller must ensure the pool was created with ``PP_FLAG_DMA_MAP``. * Note that this version performs DMA sync unconditionally, even if the * associated PP doesn't perform sync-for-device. 
*/ static inline void page_pool_dma_sync_for_cpu(const struct page_pool *pool, const struct page *page, u32 offset, u32 dma_sync_size) { __page_pool_dma_sync_for_cpu(pool, page_pool_get_dma_addr(page), offset, dma_sync_size); } static inline void page_pool_dma_sync_netmem_for_cpu(const struct page_pool *pool, const netmem_ref netmem, u32 offset, u32 dma_sync_size) { if (!pool->dma_sync_for_cpu) return; __page_pool_dma_sync_for_cpu(pool, page_pool_get_dma_addr_netmem(netmem), offset, dma_sync_size); } static inline void page_pool_get(struct page_pool *pool) { refcount_inc(&pool->user_cnt); } static inline bool page_pool_put(struct page_pool *pool) { return refcount_dec_and_test(&pool->user_cnt); } static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) { if (unlikely(pool->p.nid != new_nid)) page_pool_update_nid(pool, new_nid); } /** * page_pool_is_unreadable() - will allocated buffers be unreadable for the CPU * @pool: queried page pool * * Check if page pool will return buffers which are unreadable to the CPU / * kernel. This will only be the case if user space bound a memory provider (mp) * which returns unreadable memory to the queue served by the page pool. * If %PP_FLAG_ALLOW_UNREADABLE_NETMEM was set but there is no mp bound * this helper will return false. See also netif_rxq_has_unreadable_mp(). * * Return: true if memory allocated by the page pool may be unreadable */ static inline bool page_pool_is_unreadable(struct page_pool *pool) { return !!pool->mp_ops; } #endif /* _NET_PAGE_POOL_HELPERS_H */ |
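/*
 * Illustrative sketch (not part of the header above): the basic driver
 * pattern the DOC block describes - create a pool, allocate receive pages
 * from it, and return them with page_pool_put_full_page(). The parameter
 * values and function names below are hypothetical; see page_pool_create()
 * and struct page_pool_params in <net/page_pool/types.h>.
 */
#if 0	/* example only */
static struct page_pool *example_create_rx_pool(struct device *dev)
{
	struct page_pool_params pp_params = {
		.order		= 0,		/* one page per RX buffer */
		.pool_size	= 256,		/* hypothetical ring size */
		.nid		= NUMA_NO_NODE,
		.dev		= dev,
		.dma_dir	= DMA_FROM_DEVICE,
		.flags		= PP_FLAG_DMA_MAP,
	};

	return page_pool_create(&pp_params);
}

static void example_rx_refill(struct page_pool *pool)
{
	struct page *page = page_pool_dev_alloc_pages(pool);

	if (!page)
		return;

	/* ... post page_pool_get_dma_addr(page) to the RX ring ... */

	/* On an error path, hand the page back to the pool instead: */
	page_pool_put_full_page(pool, page, false);
}
#endif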
| 2 5 4 4 1 1 1 2 2 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM writeback 
#if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_WRITEBACK_H #include <linux/tracepoint.h> #include <linux/backing-dev.h> #include <linux/writeback.h> #define show_inode_state(state) \ __print_flags(state, "|", \ {I_DIRTY_SYNC, "I_DIRTY_SYNC"}, \ {I_DIRTY_DATASYNC, "I_DIRTY_DATASYNC"}, \ {I_DIRTY_PAGES, "I_DIRTY_PAGES"}, \ {I_NEW, "I_NEW"}, \ {I_WILL_FREE, "I_WILL_FREE"}, \ {I_FREEING, "I_FREEING"}, \ {I_CLEAR, "I_CLEAR"}, \ {I_SYNC, "I_SYNC"}, \ {I_DIRTY_TIME, "I_DIRTY_TIME"}, \ {I_REFERENCED, "I_REFERENCED"}, \ {I_LINKABLE, "I_LINKABLE"}, \ {I_WB_SWITCH, "I_WB_SWITCH"}, \ {I_OVL_INUSE, "I_OVL_INUSE"}, \ {I_CREATING, "I_CREATING"}, \ {I_DONTCACHE, "I_DONTCACHE"}, \ {I_SYNC_QUEUED, "I_SYNC_QUEUED"}, \ {I_PINNING_NETFS_WB, "I_PINNING_NETFS_WB"}, \ {I_LRU_ISOLATING, "I_LRU_ISOLATING"} \ ) /* enums need to be exported to user space */ #undef EM #undef EMe #define EM(a,b) TRACE_DEFINE_ENUM(a); #define EMe(a,b) TRACE_DEFINE_ENUM(a); #define WB_WORK_REASON \ EM( WB_REASON_BACKGROUND, "background") \ EM( WB_REASON_VMSCAN, "vmscan") \ EM( WB_REASON_SYNC, "sync") \ EM( WB_REASON_PERIODIC, "periodic") \ EM( WB_REASON_FS_FREE_SPACE, "fs_free_space") \ EM( WB_REASON_FORKER_THREAD, "forker_thread") \ EMe(WB_REASON_FOREIGN_FLUSH, "foreign_flush") WB_WORK_REASON /* * Now redefine the EM() and EMe() macros to map the enums to the strings * that will be printed in the output. */ #undef EM #undef EMe #define EM(a,b) { a, b }, #define EMe(a,b) { a, b } struct wb_writeback_work; DECLARE_EVENT_CLASS(writeback_folio_template, TP_PROTO(struct folio *folio, struct address_space *mapping), TP_ARGS(folio, mapping), TP_STRUCT__entry ( __array(char, name, 32) __field(u64, ino) __field(pgoff_t, index) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(mapping ? inode_to_bdi(mapping->host) : NULL), 32); __entry->ino = (mapping && mapping->host) ? 
mapping->host->i_ino : 0; __entry->index = folio->index; ), TP_printk("bdi %s: ino=%llu index=%lu", __entry->name, __entry->ino, __entry->index ) ); DEFINE_EVENT(writeback_folio_template, writeback_dirty_folio, TP_PROTO(struct folio *folio, struct address_space *mapping), TP_ARGS(folio, mapping) ); DEFINE_EVENT(writeback_folio_template, folio_wait_writeback, TP_PROTO(struct folio *folio, struct address_space *mapping), TP_ARGS(folio, mapping) ); DECLARE_EVENT_CLASS(writeback_dirty_inode_template, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags), TP_STRUCT__entry ( __array(char, name, 32) __field(u64, ino) __field(unsigned long, state) __field(unsigned long, flags) ), TP_fast_assign( struct backing_dev_info *bdi = inode_to_bdi(inode); /* may be called for files on pseudo FSes w/ unregistered bdi */ strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); __entry->ino = inode->i_ino; __entry->state = inode_state_read_once(inode); __entry->flags = flags; ), TP_printk("bdi %s: ino=%llu state=%s flags=%s", __entry->name, __entry->ino, show_inode_state(__entry->state), show_inode_state(__entry->flags) ) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_mark_inode_dirty, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); #ifdef CREATE_TRACE_POINTS #ifdef CONFIG_CGROUP_WRITEBACK static inline u64 __trace_wb_assign_cgroup(struct bdi_writeback *wb) { return cgroup_ino(wb->memcg_css->cgroup); } static inline u64 __trace_wbc_assign_cgroup(struct writeback_control *wbc) { if (wbc->wb) return __trace_wb_assign_cgroup(wbc->wb); else return 1; } #else /* CONFIG_CGROUP_WRITEBACK */ static inline u64 __trace_wb_assign_cgroup(struct bdi_writeback *wb) { return 1; } static inline u64 __trace_wbc_assign_cgroup(struct writeback_control *wbc) { return 1; } #endif /* CONFIG_CGROUP_WRITEBACK */ #endif /* CREATE_TRACE_POINTS */ #ifdef CONFIG_CGROUP_WRITEBACK TRACE_EVENT(inode_foreign_history, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned int history), TP_ARGS(inode, wbc, history), TP_STRUCT__entry( __array(char, name, 32) __field(u64, ino) __field(u64, cgroup_ino) __field(unsigned int, history) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); __entry->history = history; ), TP_printk("bdi %s: ino=%llu cgroup_ino=%llu history=0x%x", __entry->name, __entry->ino, __entry->cgroup_ino, __entry->history ) ); TRACE_EVENT(inode_switch_wbs_queue, TP_PROTO(struct bdi_writeback *old_wb, struct bdi_writeback *new_wb, unsigned int count), TP_ARGS(old_wb, new_wb, count), TP_STRUCT__entry( __array(char, name, 32) __field(u64, old_cgroup_ino) __field(u64, new_cgroup_ino) __field(unsigned int, count) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32); __entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb); __entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb); __entry->count = count; ), TP_printk("bdi %s: old_cgroup_ino=%llu new_cgroup_ino=%llu count=%u", __entry->name, __entry->old_cgroup_ino, __entry->new_cgroup_ino, __entry->count ) ); TRACE_EVENT(inode_switch_wbs, TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb, struct bdi_writeback 
*new_wb), TP_ARGS(inode, old_wb, new_wb), TP_STRUCT__entry( __array(char, name, 32) __field(u64, ino) __field(u64, old_cgroup_ino) __field(u64, new_cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32); __entry->ino = inode->i_ino; __entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb); __entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb); ), TP_printk("bdi %s: ino=%llu old_cgroup_ino=%llu new_cgroup_ino=%llu", __entry->name, __entry->ino, __entry->old_cgroup_ino, __entry->new_cgroup_ino ) ); TRACE_EVENT(track_foreign_dirty, TP_PROTO(struct folio *folio, struct bdi_writeback *wb), TP_ARGS(folio, wb), TP_STRUCT__entry( __array(char, name, 32) __field(u64, bdi_id) __field(u64, ino) __field(u64, cgroup_ino) __field(u64, page_cgroup_ino) __field(unsigned int, memcg_id) ), TP_fast_assign( struct address_space *mapping = folio_mapping(folio); struct inode *inode = mapping ? mapping->host : NULL; strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->bdi_id = wb->bdi->id; __entry->ino = inode ? inode->i_ino : 0; __entry->memcg_id = wb->memcg_css->id; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); rcu_read_lock(); __entry->page_cgroup_ino = cgroup_ino(folio_memcg(folio)->css.cgroup); rcu_read_unlock(); ), TP_printk("bdi %s[%llu]: ino=%llu memcg_id=%u cgroup_ino=%llu page_cgroup_ino=%llu", __entry->name, __entry->bdi_id, __entry->ino, __entry->memcg_id, __entry->cgroup_ino, __entry->page_cgroup_ino ) ); TRACE_EVENT(flush_foreign, TP_PROTO(struct bdi_writeback *wb, unsigned int frn_bdi_id, unsigned int frn_memcg_id), TP_ARGS(wb, frn_bdi_id, frn_memcg_id), TP_STRUCT__entry( __array(char, name, 32) __field(u64, cgroup_ino) __field(unsigned int, frn_bdi_id) __field(unsigned int, frn_memcg_id) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); __entry->frn_bdi_id = frn_bdi_id; __entry->frn_memcg_id = frn_memcg_id; ), TP_printk("bdi %s: cgroup_ino=%llu frn_bdi_id=%u frn_memcg_id=%u", __entry->name, __entry->cgroup_ino, __entry->frn_bdi_id, __entry->frn_memcg_id ) ); #endif DECLARE_EVENT_CLASS(writeback_write_inode_template, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc), TP_STRUCT__entry ( __array(char, name, 32) __field(u64, ino) __field(u64, cgroup_ino) __field(int, sync_mode) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->sync_mode = wbc->sync_mode; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: ino=%llu sync_mode=%d cgroup_ino=%llu", __entry->name, __entry->ino, __entry->sync_mode, __entry->cgroup_ino ) ); DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode_start, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc) ); DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc) ); DECLARE_EVENT_CLASS(writeback_work_class, TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), TP_ARGS(wb, work), TP_STRUCT__entry( __array(char, name, 32) __field(u64, cgroup_ino) __field(long, nr_pages) __field(dev_t, sb_dev) __field(int, sync_mode) __field(int, for_kupdate) __field(int, range_cyclic) __field(int, for_background) __field(int, reason) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->nr_pages = work->nr_pages; __entry->sb_dev = work->sb ? 
work->sb->s_dev : 0; __entry->sync_mode = work->sync_mode; __entry->for_kupdate = work->for_kupdate; __entry->range_cyclic = work->range_cyclic; __entry->for_background = work->for_background; __entry->reason = work->reason; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%llu", __entry->name, MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), __entry->nr_pages, __entry->sync_mode, __entry->for_kupdate, __entry->range_cyclic, __entry->for_background, __print_symbolic(__entry->reason, WB_WORK_REASON), __entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_WORK_EVENT(name) \ DEFINE_EVENT(writeback_work_class, name, \ TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \ TP_ARGS(wb, work)) DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); DEFINE_WRITEBACK_WORK_EVENT(writeback_start); DEFINE_WRITEBACK_WORK_EVENT(writeback_written); DEFINE_WRITEBACK_WORK_EVENT(writeback_wait); TRACE_EVENT(writeback_pages_written, TP_PROTO(long pages_written), TP_ARGS(pages_written), TP_STRUCT__entry( __field(long, pages) ), TP_fast_assign( __entry->pages = pages_written; ), TP_printk("%ld", __entry->pages) ); DECLARE_EVENT_CLASS(writeback_class, TP_PROTO(struct bdi_writeback *wb), TP_ARGS(wb), TP_STRUCT__entry( __array(char, name, 32) __field(u64, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: cgroup_ino=%llu", __entry->name, __entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_EVENT(name) \ DEFINE_EVENT(writeback_class, name, \ TP_PROTO(struct bdi_writeback *wb), \ TP_ARGS(wb)) DEFINE_WRITEBACK_EVENT(writeback_wake_background); TRACE_EVENT(writeback_bdi_register, TP_PROTO(struct backing_dev_info *bdi), TP_ARGS(bdi), TP_STRUCT__entry( __array(char, name, 32) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); ), TP_printk("bdi %s", __entry->name ) ); DECLARE_EVENT_CLASS(wbc_class, TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), TP_ARGS(wbc, bdi), TP_STRUCT__entry( __array(char, name, 32) __field(u64, cgroup_ino) __field(long, nr_to_write) __field(long, pages_skipped) __field(long, range_start) __field(long, range_end) __field(int, sync_mode) __field(int, for_kupdate) __field(int, for_background) __field(int, range_cyclic) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; __entry->sync_mode = wbc->sync_mode; __entry->for_kupdate = wbc->for_kupdate; __entry->for_background = wbc->for_background; __entry->range_cyclic = wbc->range_cyclic; __entry->range_start = (long)wbc->range_start; __entry->range_end = (long)wbc->range_end; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d bgrd=%d " "cyclic=%d start=0x%lx end=0x%lx cgroup_ino=%llu", __entry->name, __entry->nr_to_write, __entry->pages_skipped, __entry->sync_mode, __entry->for_kupdate, __entry->for_background, __entry->range_cyclic, __entry->range_start, __entry->range_end, __entry->cgroup_ino ) ) #define DEFINE_WBC_EVENT(name) \ DEFINE_EVENT(wbc_class, name, \ TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \ TP_ARGS(wbc, bdi)) DEFINE_WBC_EVENT(wbc_writepage); TRACE_EVENT(writeback_queue_io, TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work 
*work, unsigned long dirtied_before, int moved), TP_ARGS(wb, work, dirtied_before, moved), TP_STRUCT__entry( __array(char, name, 32) __field(u64, cgroup_ino) __field(unsigned long, older) __field(long, age) __field(int, moved) __field(int, reason) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->older = dirtied_before; __entry->age = (jiffies - dirtied_before) * 1000 / HZ; __entry->moved = moved; __entry->reason = work->reason; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%llu", __entry->name, __entry->older, /* dirtied_before in jiffies */ __entry->age, /* dirtied_before in relative milliseconds */ __entry->moved, __print_symbolic(__entry->reason, WB_WORK_REASON), __entry->cgroup_ino ) ); TRACE_EVENT(global_dirty_state, TP_PROTO(unsigned long background_thresh, unsigned long dirty_thresh ), TP_ARGS(background_thresh, dirty_thresh ), TP_STRUCT__entry( __field(unsigned long, nr_dirty) __field(unsigned long, nr_writeback) __field(unsigned long, background_thresh) __field(unsigned long, dirty_thresh) __field(unsigned long, dirty_limit) __field(unsigned long, nr_dirtied) __field(unsigned long, nr_written) ), TP_fast_assign( __entry->nr_dirty = global_node_page_state(NR_FILE_DIRTY); __entry->nr_writeback = global_node_page_state(NR_WRITEBACK); __entry->nr_dirtied = global_node_page_state(NR_DIRTIED); __entry->nr_written = global_node_page_state(NR_WRITTEN); __entry->background_thresh = background_thresh; __entry->dirty_thresh = dirty_thresh; __entry->dirty_limit = global_wb_domain.dirty_limit; ), TP_printk("dirty=%lu writeback=%lu " "bg_thresh=%lu thresh=%lu limit=%lu " "dirtied=%lu written=%lu", __entry->nr_dirty, __entry->nr_writeback, __entry->background_thresh, __entry->dirty_thresh, __entry->dirty_limit, __entry->nr_dirtied, __entry->nr_written ) ); #define KBps(x) ((x) << (PAGE_SHIFT - 10)) TRACE_EVENT(bdi_dirty_ratelimit, TP_PROTO(struct bdi_writeback *wb, unsigned long dirty_rate, unsigned long task_ratelimit), TP_ARGS(wb, dirty_rate, task_ratelimit), TP_STRUCT__entry( __array(char, bdi, 32) __field(u64, cgroup_ino) __field(unsigned long, write_bw) __field(unsigned long, avg_write_bw) __field(unsigned long, dirty_rate) __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned long, balanced_dirty_ratelimit) ), TP_fast_assign( strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32); __entry->write_bw = KBps(wb->write_bandwidth); __entry->avg_write_bw = KBps(wb->avg_write_bandwidth); __entry->dirty_rate = KBps(dirty_rate); __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->balanced_dirty_ratelimit = KBps(wb->balanced_dirty_ratelimit); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: " "write_bw=%lu awrite_bw=%lu dirty_rate=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "balanced_dirty_ratelimit=%lu cgroup_ino=%llu", __entry->bdi, __entry->write_bw, /* write bandwidth */ __entry->avg_write_bw, /* avg write bandwidth */ __entry->dirty_rate, /* bdi dirty rate */ __entry->dirty_ratelimit, /* base ratelimit */ __entry->task_ratelimit, /* ratelimit with position control */ __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */ __entry->cgroup_ino ) ); TRACE_EVENT(balance_dirty_pages, TP_PROTO(struct bdi_writeback *wb, struct dirty_throttle_control *dtc, unsigned long dirty_ratelimit, unsigned long task_ratelimit, unsigned long dirtied, unsigned long 
period, long pause, unsigned long start_time), TP_ARGS(wb, dtc, dirty_ratelimit, task_ratelimit, dirtied, period, pause, start_time), TP_STRUCT__entry( __array( char, bdi, 32) __field(u64, cgroup_ino) __field(unsigned long, limit) __field(unsigned long, setpoint) __field(unsigned long, dirty) __field(unsigned long, wb_setpoint) __field(unsigned long, wb_dirty) __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned long, paused) __field( long, pause) __field(unsigned long, period) __field( long, think) __field(unsigned int, dirtied) __field(unsigned int, dirtied_pause) ), TP_fast_assign( unsigned long freerun = (dtc->thresh + dtc->bg_thresh) / 2; strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32); __entry->limit = dtc->limit; __entry->setpoint = (dtc->limit + freerun) / 2; __entry->dirty = dtc->dirty; __entry->wb_setpoint = __entry->setpoint * dtc->wb_thresh / (dtc->thresh + 1); __entry->wb_dirty = dtc->wb_dirty; __entry->dirty_ratelimit = KBps(dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->dirtied = dirtied; __entry->dirtied_pause = current->nr_dirtied_pause; __entry->think = current->dirty_paused_when == 0 ? 0 : (long)(jiffies - current->dirty_paused_when) * 1000/HZ; __entry->period = period * 1000 / HZ; __entry->pause = pause * 1000 / HZ; __entry->paused = (jiffies - start_time) * 1000 / HZ; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: " "limit=%lu setpoint=%lu dirty=%lu " "wb_setpoint=%lu wb_dirty=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "dirtied=%u dirtied_pause=%u " "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%llu", __entry->bdi, __entry->limit, __entry->setpoint, __entry->dirty, __entry->wb_setpoint, __entry->wb_dirty, __entry->dirty_ratelimit, __entry->task_ratelimit, __entry->dirtied, __entry->dirtied_pause, __entry->paused, /* ms */ __entry->pause, /* ms */ __entry->period, /* ms */ __entry->think, /* ms */ __entry->cgroup_ino ) ); TRACE_EVENT(writeback_sb_inodes_requeue, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __array(char, name, 32) __field(u64, ino) __field(u64, cgroup_ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->state = inode_state_read_once(inode); __entry->dirtied_when = inode->dirtied_when; __entry->cgroup_ino = __trace_wb_assign_cgroup(inode_to_wb(inode)); ), TP_printk("bdi %s: ino=%llu state=%s dirtied_when=%lu age=%lu cgroup_ino=%llu", __entry->name, __entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, __entry->cgroup_ino ) ); DECLARE_EVENT_CLASS(writeback_single_inode_template, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write ), TP_ARGS(inode, wbc, nr_to_write), TP_STRUCT__entry( __array(char, name, 32) __field(u64, ino) __field(u64, cgroup_ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) __field(unsigned long, writeback_index) __field(unsigned long, wrote) __field(long, nr_to_write) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->state = inode_state_read_once(inode); __entry->dirtied_when = inode->dirtied_when; __entry->writeback_index = inode->i_mapping->writeback_index; __entry->nr_to_write = nr_to_write; __entry->wrote = nr_to_write - wbc->nr_to_write; __entry->cgroup_ino = 
__trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: ino=%llu state=%s dirtied_when=%lu age=%lu " "index=%lu to_write=%ld wrote=%lu cgroup_ino=%llu", __entry->name, __entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, __entry->writeback_index, __entry->nr_to_write, __entry->wrote, __entry->cgroup_ino ) ); DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_start, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write), TP_ARGS(inode, wbc, nr_to_write) ); DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write), TP_ARGS(inode, wbc, nr_to_write) ); DECLARE_EVENT_CLASS(writeback_inode_template, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( u64, ino ) __field(unsigned long, state ) __field(unsigned long, dirtied_when ) __field( dev_t, dev ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->state = inode_state_read_once(inode); __entry->mode = inode->i_mode; __entry->dirtied_when = inode->dirtied_when; ), TP_printk("dev %d,%d ino %llu dirtied %lu state %s mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->dirtied_when, show_inode_state(__entry->state), __entry->mode) ); DEFINE_EVENT(writeback_inode_template, writeback_lazytime, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, writeback_dirty_inode_enqueue, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); /* * Inode writeback list tracking. */ DEFINE_EVENT(writeback_inode_template, sb_mark_inode_writeback, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, sb_clear_inode_writeback, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
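/*
 * Illustrative sketch (not part of the header above): thanks to the
 * DECLARE_EVENT_CLASS()/DEFINE_EVENT() split used throughout this file, a new
 * tracepoint that reuses an existing record layout and format string is a
 * one-liner, and its reason field is decoded through the same EM()/EMe()
 * WB_WORK_REASON table via __print_symbolic(). The event name below is
 * hypothetical.
 */
#if 0	/* example only */
DEFINE_WRITEBACK_EVENT(writeback_example_event);

/* Fired from .c code with the class prototype, e.g.:
 *	trace_writeback_example_event(wb);
 */
#endif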
// SPDX-License-Identifier: GPL-2.0-or-later /* * Routines having to do with the 'struct sk_buff' memory handlers. * * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> * Florian La Roche <rzsfl@rz.uni-sb.de> * * Fixes: * Alan Cox : Fixed the worst of the load * balancer bugs. * Dave Platt : Interrupt stacking fix. * Richard Kooijman : Timestamp fixes. * Alan Cox : Changed buffer format. * Alan Cox : destructor hook for AF_UNIX etc. * Linus Torvalds : Better skb_clone. * Alan Cox : Added skb_copy. * Alan Cox : Added all the changed routines Linus * only put in the headers * Ray VanTassle : Fixed --skb->lock in free * Alan Cox : skb_copy copy arp field * Andi Kleen : slabified it. * Robert Olsson : Removed skb_head_pool * * NOTE: * The __skb_ routines should be called with interrupts * disabled, or you better be *real* sure that the operation is atomic * with respect to whatever list is being frobbed (e.g. via lock_sock() * or via disabling bottom half handlers, etc).
*/ /* * The functions in this file will not compile correctly with gcc 2.4.x */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/slab.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/sctp.h> #include <linux/netdevice.h> #ifdef CONFIG_NET_CLS_ACT #include <net/pkt_sched.h> #endif #include <linux/string.h> #include <linux/skbuff.h> #include <linux/skbuff_ref.h> #include <linux/splice.h> #include <linux/cache.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/scatterlist.h> #include <linux/errqueue.h> #include <linux/prefetch.h> #include <linux/bitfield.h> #include <linux/if_vlan.h> #include <linux/mpls.h> #include <linux/kcov.h> #include <linux/iov_iter.h> #include <linux/crc32.h> #include <net/protocol.h> #include <net/dst.h> #include <net/sock.h> #include <net/checksum.h> #include <net/gro.h> #include <net/gso.h> #include <net/hotdata.h> #include <net/ip6_checksum.h> #include <net/xfrm.h> #include <net/mpls.h> #include <net/mptcp.h> #include <net/mctp.h> #include <net/can.h> #include <net/page_pool/helpers.h> #include <net/psp/types.h> #include <net/dropreason.h> #include <net/xdp_sock.h> #include <linux/uaccess.h> #include <trace/events/skb.h> #include <linux/highmem.h> #include <linux/capability.h> #include <linux/user_namespace.h> #include <linux/indirect_call_wrapper.h> #include <linux/textsearch.h> #include "dev.h" #include "devmem.h" #include "net-sysfs.h" #include "netmem_priv.h" #include "sock_destructor.h" #ifdef CONFIG_SKB_EXTENSIONS static struct kmem_cache *skbuff_ext_cache __ro_after_init; #endif #define GRO_MAX_HEAD_PAD (GRO_MAX_HEAD + NET_SKB_PAD + NET_IP_ALIGN) #define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(max(MAX_TCP_HEADER, \ GRO_MAX_HEAD_PAD)) /* SKB_SMALL_HEAD_CACHE_SIZE is the size used for the skbuff_small_head * kmem_cache. The non-power-of-2 padding is kept for historical reasons and * to avoid potential collisions with generic kmalloc bucket sizes. */ #define SKB_SMALL_HEAD_CACHE_SIZE \ (is_power_of_2(SKB_SMALL_HEAD_SIZE) ? \ (SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) : \ SKB_SMALL_HEAD_SIZE) #define SKB_SMALL_HEAD_HEADROOM \ SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) /* kcm_write_msgs() relies on casting paged frags to bio_vec to use * iov_iter_bvec(). These static asserts ensure the cast is valid is long as the * netmem is a page. 
*/ static_assert(offsetof(struct bio_vec, bv_page) == offsetof(skb_frag_t, netmem)); static_assert(sizeof_field(struct bio_vec, bv_page) == sizeof_field(skb_frag_t, netmem)); static_assert(offsetof(struct bio_vec, bv_len) == offsetof(skb_frag_t, len)); static_assert(sizeof_field(struct bio_vec, bv_len) == sizeof_field(skb_frag_t, len)); static_assert(offsetof(struct bio_vec, bv_offset) == offsetof(skb_frag_t, offset)); static_assert(sizeof_field(struct bio_vec, bv_offset) == sizeof_field(skb_frag_t, offset)); #undef FN #define FN(reason) [SKB_DROP_REASON_##reason] = #reason, static const char * const drop_reasons[] = { [SKB_CONSUMED] = "CONSUMED", DEFINE_DROP_REASON(FN, FN) }; static const struct drop_reason_list drop_reasons_core = { .reasons = drop_reasons, .n_reasons = ARRAY_SIZE(drop_reasons), }; const struct drop_reason_list __rcu * drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM] = { [SKB_DROP_REASON_SUBSYS_CORE] = RCU_INITIALIZER(&drop_reasons_core), }; EXPORT_SYMBOL(drop_reasons_by_subsys); /** * drop_reasons_register_subsys - register another drop reason subsystem * @subsys: the subsystem to register, must not be the core * @list: the list of drop reasons within the subsystem, must point to * a statically initialized list */ void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys, const struct drop_reason_list *list) { if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE || subsys >= ARRAY_SIZE(drop_reasons_by_subsys), "invalid subsystem %d\n", subsys)) return; /* must point to statically allocated memory, so INIT is OK */ RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list); } EXPORT_SYMBOL_GPL(drop_reasons_register_subsys); /** * drop_reasons_unregister_subsys - unregister a drop reason subsystem * @subsys: the subsystem to remove, must not be the core * * Note: This will synchronize_rcu() to ensure no users when it returns. */ void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys) { if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE || subsys >= ARRAY_SIZE(drop_reasons_by_subsys), "invalid subsystem %d\n", subsys)) return; RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], NULL); synchronize_rcu(); } EXPORT_SYMBOL_GPL(drop_reasons_unregister_subsys); /** * skb_panic - private function for out-of-line support * @skb: buffer * @sz: size * @addr: address * @msg: skb_over_panic or skb_under_panic * * Out-of-line support for skb_put() and skb_push(). * Called via the wrapper skb_over_panic() or skb_under_panic(). * Keep out of line to prevent kernel bloat. * __builtin_return_address is not used because it is not always reliable. */ static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr, const char msg[]) { pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n", msg, addr, skb->len, sz, skb->head, skb->data, (unsigned long)skb->tail, (unsigned long)skb->end, skb->dev ? 
skb->dev->name : "<NULL>"); BUG(); } static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr) { skb_panic(skb, sz, addr, __func__); } static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) { skb_panic(skb, sz, addr, __func__); } #define NAPI_SKB_CACHE_SIZE 128 #define NAPI_SKB_CACHE_BULK 32 #define NAPI_SKB_CACHE_FREE 32 struct napi_alloc_cache { local_lock_t bh_lock; struct page_frag_cache page; unsigned int skb_count; void *skb_cache[NAPI_SKB_CACHE_SIZE]; }; static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); void *data; fragsz = SKB_DATA_ALIGN(fragsz); local_lock_nested_bh(&napi_alloc_cache.bh_lock); data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC | __GFP_NOWARN, align_mask); local_unlock_nested_bh(&napi_alloc_cache.bh_lock); return data; } EXPORT_SYMBOL(__napi_alloc_frag_align); void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { void *data; if (in_hardirq() || irqs_disabled()) { struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache); fragsz = SKB_DATA_ALIGN(fragsz); data = __page_frag_alloc_align(nc, fragsz, GFP_ATOMIC | __GFP_NOWARN, align_mask); } else { local_bh_disable(); data = __napi_alloc_frag_align(fragsz, align_mask); local_bh_enable(); } return data; } EXPORT_SYMBOL(__netdev_alloc_frag_align); /* Cache kmem_cache_size(net_hotdata.skbuff_cache) to help the compiler * remove dead code (and skbuff_cache_size) when CONFIG_KASAN is unset. */ static u32 skbuff_cache_size __read_mostly; static inline struct sk_buff *napi_skb_cache_get(bool alloc) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); struct sk_buff *skb; local_lock_nested_bh(&napi_alloc_cache.bh_lock); if (unlikely(!nc->skb_count)) { if (alloc) nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, GFP_ATOMIC | __GFP_NOWARN, NAPI_SKB_CACHE_BULK, nc->skb_cache); if (unlikely(!nc->skb_count)) { local_unlock_nested_bh(&napi_alloc_cache.bh_lock); return NULL; } } skb = nc->skb_cache[--nc->skb_count]; if (nc->skb_count) prefetch(nc->skb_cache[nc->skb_count - 1]); local_unlock_nested_bh(&napi_alloc_cache.bh_lock); kasan_mempool_unpoison_object(skb, skbuff_cache_size); return skb; } /* * Only clear those fields we need to clear, not those that we will * actually initialise later. Hence, don't put any more fields after * the tail pointer in struct sk_buff! */ static inline void skbuff_clear(struct sk_buff *skb) { /* Replace memset(skb, 0, offsetof(struct sk_buff, tail)) * with two smaller memset(), with a barrier() between them. * This forces the compiler to inline both calls. */ BUILD_BUG_ON(offsetof(struct sk_buff, tail) <= 128); memset(skb, 0, 128); barrier(); memset((void *)skb + 128, 0, offsetof(struct sk_buff, tail) - 128); } /** * napi_skb_cache_get_bulk - obtain a number of zeroed skb heads from the cache * @skbs: pointer to an at least @n-sized array to fill with skb pointers * @n: number of entries to provide * * Tries to obtain @n &sk_buff entries from the NAPI percpu cache and writes * the pointers into the provided array @skbs. If there are less entries * available, tries to replenish the cache and bulk-allocates the diff from * the MM layer if needed. 
* The heads are being zeroed with either memset() or %__GFP_ZERO, so they are * ready for {,__}build_skb_around() and don't have any data buffers attached. * Must be called *only* from the BH context. * * Return: number of successfully allocated skbs (@n if no actual allocation * needed or kmem_cache_alloc_bulk() didn't fail). */ u32 napi_skb_cache_get_bulk(void **skbs, u32 n) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); u32 bulk, total = n; local_lock_nested_bh(&napi_alloc_cache.bh_lock); if (nc->skb_count >= n) goto get; /* No enough cached skbs. Try refilling the cache first */ bulk = min(NAPI_SKB_CACHE_SIZE - nc->skb_count, NAPI_SKB_CACHE_BULK); nc->skb_count += kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, GFP_ATOMIC | __GFP_NOWARN, bulk, &nc->skb_cache[nc->skb_count]); if (likely(nc->skb_count >= n)) goto get; /* Still not enough. Bulk-allocate the missing part directly, zeroed */ n -= kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, GFP_ATOMIC | __GFP_ZERO | __GFP_NOWARN, n - nc->skb_count, &skbs[nc->skb_count]); if (likely(nc->skb_count >= n)) goto get; /* kmem_cache didn't allocate the number we need, limit the output */ total -= n - nc->skb_count; n = nc->skb_count; get: for (u32 base = nc->skb_count - n, i = 0; i < n; i++) { skbs[i] = nc->skb_cache[base + i]; kasan_mempool_unpoison_object(skbs[i], skbuff_cache_size); skbuff_clear(skbs[i]); } nc->skb_count -= n; local_unlock_nested_bh(&napi_alloc_cache.bh_lock); return total; } EXPORT_SYMBOL_GPL(napi_skb_cache_get_bulk); static inline void __finalize_skb_around(struct sk_buff *skb, void *data, unsigned int size) { struct skb_shared_info *shinfo; size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); /* Assumes caller memset cleared SKB */ skb->truesize = SKB_TRUESIZE(size); refcount_set(&skb->users, 1); skb->head = data; skb->data = data; skb_reset_tail_pointer(skb); skb_set_end_offset(skb, size); skb->mac_header = (typeof(skb->mac_header))~0U; skb->transport_header = (typeof(skb->transport_header))~0U; skb->alloc_cpu = raw_smp_processor_id(); /* make sure we initialize shinfo sequentially */ shinfo = skb_shinfo(skb); memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); atomic_set(&shinfo->dataref, 1); skb_set_kcov_handle(skb, kcov_common_handle()); } static inline void *__slab_build_skb(void *data, unsigned int *size) { void *resized; /* Must find the allocation size (and grow it to match). */ *size = ksize(data); /* krealloc() will immediately return "data" when * "ksize(data)" is requested: it is the existing upper * bounds. As a result, GFP_ATOMIC will be ignored. Note * that this "new" pointer needs to be passed back to the * caller for use so the __alloc_size hinting will be * tracked correctly. */ resized = krealloc(data, *size, GFP_ATOMIC); WARN_ON_ONCE(resized != data); return resized; } /* build_skb() variant which can operate on slab buffers. * Note that this should be used sparingly as slab buffers * cannot be combined efficiently by GRO! 
*/ struct sk_buff *slab_build_skb(void *data) { struct sk_buff *skb; unsigned int size; skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC | __GFP_NOWARN); if (unlikely(!skb)) return NULL; skbuff_clear(skb); data = __slab_build_skb(data, &size); __finalize_skb_around(skb, data, size); return skb; } EXPORT_SYMBOL(slab_build_skb); /* Caller must provide SKB that is memset cleared */ static void __build_skb_around(struct sk_buff *skb, void *data, unsigned int frag_size) { unsigned int size = frag_size; /* frag_size == 0 is considered deprecated now. Callers * using slab buffer should use slab_build_skb() instead. */ if (WARN_ONCE(size == 0, "Use slab_build_skb() instead")) data = __slab_build_skb(data, &size); __finalize_skb_around(skb, data, size); } /** * __build_skb - build a network buffer * @data: data buffer provided by caller * @frag_size: size of data (must not be 0) * * Allocate a new &sk_buff. Caller provides space holding head and * skb_shared_info. @data must have been allocated from the page * allocator or vmalloc(). (A @frag_size of 0 to indicate a kmalloc() * allocation is deprecated, and callers should use slab_build_skb() * instead.) * The return is the new skb buffer. * On a failure the return is %NULL, and @data is not freed. * Notes : * Before IO, driver allocates only data buffer where NIC put incoming frame * Driver should add room at head (NET_SKB_PAD) and * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info)) * After IO, driver calls build_skb(), to allocate sk_buff and populate it * before giving packet to stack. * RX rings only contains data buffers, not full skbs. */ struct sk_buff *__build_skb(void *data, unsigned int frag_size) { struct sk_buff *skb; skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC | __GFP_NOWARN); if (unlikely(!skb)) return NULL; skbuff_clear(skb); __build_skb_around(skb, data, frag_size); return skb; } /* build_skb() is wrapper over __build_skb(), that specifically * takes care of skb->head and skb->pfmemalloc */ struct sk_buff *build_skb(void *data, unsigned int frag_size) { struct sk_buff *skb = __build_skb(data, frag_size); if (likely(skb && frag_size)) { skb->head_frag = 1; skb_propagate_pfmemalloc(virt_to_head_page(data), skb); } return skb; } EXPORT_SYMBOL(build_skb); /** * build_skb_around - build a network buffer around provided skb * @skb: sk_buff provide by caller, must be memset cleared * @data: data buffer provided by caller * @frag_size: size of data */ struct sk_buff *build_skb_around(struct sk_buff *skb, void *data, unsigned int frag_size) { if (unlikely(!skb)) return NULL; __build_skb_around(skb, data, frag_size); if (frag_size) { skb->head_frag = 1; skb_propagate_pfmemalloc(virt_to_head_page(data), skb); } return skb; } EXPORT_SYMBOL(build_skb_around); /** * __napi_build_skb - build a network buffer * @data: data buffer provided by caller * @frag_size: size of data * * Version of __build_skb() that uses NAPI percpu caches to obtain * skbuff_head instead of inplace allocation. * * Returns a new &sk_buff on success, %NULL on allocation failure. 
*/ static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size) { struct sk_buff *skb; skb = napi_skb_cache_get(true); if (unlikely(!skb)) return NULL; skbuff_clear(skb); __build_skb_around(skb, data, frag_size); return skb; } /** * napi_build_skb - build a network buffer * @data: data buffer provided by caller * @frag_size: size of data * * Version of __napi_build_skb() that takes care of skb->head_frag * and skb->pfmemalloc when the data is a page or page fragment. * * Returns a new &sk_buff on success, %NULL on allocation failure. */ struct sk_buff *napi_build_skb(void *data, unsigned int frag_size) { struct sk_buff *skb = __napi_build_skb(data, frag_size); if (likely(skb) && frag_size) { skb->head_frag = 1; skb_propagate_pfmemalloc(virt_to_head_page(data), skb); } return skb; } EXPORT_SYMBOL(napi_build_skb); static void *kmalloc_pfmemalloc(size_t obj_size, gfp_t flags, int node) { if (!gfp_pfmemalloc_allowed(flags)) return NULL; if (!obj_size) return kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node); return kmalloc_node_track_caller(obj_size, flags, node); } /* * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells * the caller if emergency pfmemalloc reserves are being used. If it is and * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves * may be used. Otherwise, the packet data may be discarded until enough * memory is free */ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, struct sk_buff *skb) { size_t obj_size; void *obj; obj_size = SKB_HEAD_ALIGN(*size); if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE && !(flags & KMALLOC_NOT_NORMAL_BITS)) { obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node); *size = SKB_SMALL_HEAD_CACHE_SIZE; if (likely(obj)) goto out; /* Try again but now we are using pfmemalloc reserves */ if (skb) skb->pfmemalloc = true; return kmalloc_pfmemalloc(0, flags, node); } obj_size = kmalloc_size_roundup(obj_size); /* The following cast might truncate high-order bits of obj_size, this * is harmless because kmalloc(obj_size >= 2^32) will fail anyway. */ *size = (unsigned int)obj_size; /* * Try a regular allocation, when that fails and we're not entitled * to the reserves, fail. */ obj = kmalloc_node_track_caller(obj_size, flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node); if (likely(obj)) goto out; /* Try again but now we are using pfmemalloc reserves */ if (skb) skb->pfmemalloc = true; obj = kmalloc_pfmemalloc(obj_size, flags, node); out: return obj; } /* Allocate a new skbuff. We do this ourselves so we can fill in a few * 'private' fields and also do memory statistics to find all the * [BEEP] leaks. * */ /** * __alloc_skb - allocate a network buffer * @size: size to allocate * @gfp_mask: allocation mask * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache * instead of head cache and allocate a cloned (child) skb. * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for * allocations in case the data is required for writeback * @node: numa node to allocate memory on * * Allocate a new &sk_buff. The returned buffer has no headroom and a * tail room of at least size bytes. The object has a reference count * of one. The return is the buffer. On a failure the return is %NULL. * * Buffers may only be allocated from interrupts using a @gfp_mask of * %GFP_ATOMIC. 
*/ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, int flags, int node) { struct sk_buff *skb = NULL; struct kmem_cache *cache; u8 *data; if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) gfp_mask |= __GFP_MEMALLOC; if (flags & SKB_ALLOC_FCLONE) { cache = net_hotdata.skbuff_fclone_cache; goto fallback; } cache = net_hotdata.skbuff_cache; if (unlikely(node != NUMA_NO_NODE && node != numa_mem_id())) goto fallback; if (flags & SKB_ALLOC_NAPI) { skb = napi_skb_cache_get(true); if (unlikely(!skb)) return NULL; } else if (!in_hardirq() && !irqs_disabled()) { local_bh_disable(); skb = napi_skb_cache_get(false); local_bh_enable(); } if (!skb) { fallback: skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node); if (unlikely(!skb)) return NULL; } skbuff_clear(skb); /* We do our best to align skb_shared_info on a separate cache * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives * aligned memory blocks, unless SLUB/SLAB debug is enabled. * Both skb->head and skb_shared_info are cache line aligned. */ data = kmalloc_reserve(&size, gfp_mask, node, skb); if (unlikely(!data)) goto nodata; /* kmalloc_size_roundup() might give us more room than requested. * Put skb_shared_info exactly at the end of allocated zone, * to allow max possible filling before reallocation. */ __finalize_skb_around(skb, data, size); if (flags & SKB_ALLOC_FCLONE) { struct sk_buff_fclones *fclones; fclones = container_of(skb, struct sk_buff_fclones, skb1); /* skb->fclone is a 2bits field. * Replace expensive RMW (skb->fclone = SKB_FCLONE_ORIG) * with a single OR. */ BUILD_BUG_ON(SKB_FCLONE_UNAVAILABLE != 0); DEBUG_NET_WARN_ON_ONCE(skb->fclone != SKB_FCLONE_UNAVAILABLE); skb->fclone |= SKB_FCLONE_ORIG; refcount_set(&fclones->fclone_ref, 1); } return skb; nodata: kmem_cache_free(cache, skb); return NULL; } EXPORT_SYMBOL(__alloc_skb); /** * __netdev_alloc_skb - allocate an skbuff for rx on a specific device * @dev: network device to receive on * @len: length to allocate * @gfp_mask: get_free_pages mask, passed to alloc_skb * * Allocate a new &sk_buff and assign it a usage count of one. The * buffer has NET_SKB_PAD headroom built in. Users should allocate * the headroom they think they need without accounting for the * built in space. The built in space is used for optimisations. * * %NULL is returned if there is no free memory. */ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, gfp_t gfp_mask) { struct page_frag_cache *nc; struct sk_buff *skb; bool pfmemalloc; void *data; len += NET_SKB_PAD; /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. 
*/ if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); if (!skb) goto skb_fail; goto skb_success; } len = SKB_HEAD_ALIGN(len); if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; if (in_hardirq() || irqs_disabled()) { nc = this_cpu_ptr(&netdev_alloc_cache); data = page_frag_alloc(nc, len, gfp_mask); pfmemalloc = page_frag_cache_is_pfmemalloc(nc); } else { local_bh_disable(); local_lock_nested_bh(&napi_alloc_cache.bh_lock); nc = this_cpu_ptr(&napi_alloc_cache.page); data = page_frag_alloc(nc, len, gfp_mask); pfmemalloc = page_frag_cache_is_pfmemalloc(nc); local_unlock_nested_bh(&napi_alloc_cache.bh_lock); local_bh_enable(); } if (unlikely(!data)) return NULL; skb = __build_skb(data, len); if (unlikely(!skb)) { skb_free_frag(data); return NULL; } if (pfmemalloc) skb->pfmemalloc = 1; skb->head_frag = 1; skb_success: skb_reserve(skb, NET_SKB_PAD); skb->dev = dev; skb_fail: return skb; } EXPORT_SYMBOL(__netdev_alloc_skb); /** * napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance * @napi: napi instance this buffer was allocated for * @len: length to allocate * * Allocate a new sk_buff for use in NAPI receive. This buffer will * attempt to allocate the head from a special reserved region used * only for NAPI Rx allocation. By doing this we can save several * CPU cycles by avoiding having to disable and re-enable IRQs. * * %NULL is returned if there is no free memory. */ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) { gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN; struct napi_alloc_cache *nc; struct sk_buff *skb; bool pfmemalloc; void *data; DEBUG_NET_WARN_ON_ONCE(!in_softirq()); len += NET_SKB_PAD + NET_IP_ALIGN; /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. 
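 * (Same thresholds as in __netdev_alloc_skb() above.)
 *
 * Illustrative receive-path usage from a driver's NAPI poll callback
 * (a sketch; rx_buf, pkt_len and netdev are assumptions, not names
 * from this file):
 *
 *	skb = napi_alloc_skb(napi, pkt_len);
 *	if (unlikely(!skb))
 *		break;
 *	skb_put_data(skb, rx_buf, pkt_len);
 *	skb->protocol = eth_type_trans(skb, netdev);
 *	napi_gro_receive(napi, skb);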
*/ if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, NUMA_NO_NODE); if (!skb) goto skb_fail; goto skb_success; } len = SKB_HEAD_ALIGN(len); if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; local_lock_nested_bh(&napi_alloc_cache.bh_lock); nc = this_cpu_ptr(&napi_alloc_cache); data = page_frag_alloc(&nc->page, len, gfp_mask); pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); local_unlock_nested_bh(&napi_alloc_cache.bh_lock); if (unlikely(!data)) return NULL; skb = __napi_build_skb(data, len); if (unlikely(!skb)) { skb_free_frag(data); return NULL; } if (pfmemalloc) skb->pfmemalloc = 1; skb->head_frag = 1; skb_success: skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); skb->dev = napi->dev; skb_fail: return skb; } EXPORT_SYMBOL(napi_alloc_skb); void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, unsigned int truesize) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; DEBUG_NET_WARN_ON_ONCE(size > truesize); skb_frag_size_add(frag, size); skb->len += size; skb->data_len += size; skb->truesize += truesize; } EXPORT_SYMBOL(skb_coalesce_rx_frag); static void skb_drop_list(struct sk_buff **listp) { kfree_skb_list(*listp); *listp = NULL; } static inline void skb_drop_fraglist(struct sk_buff *skb) { skb_drop_list(&skb_shinfo(skb)->frag_list); } static void skb_clone_fraglist(struct sk_buff *skb) { struct sk_buff *list; skb_walk_frags(skb, list) skb_get(list); } int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, unsigned int headroom) { #if IS_ENABLED(CONFIG_PAGE_POOL) u32 size, truesize, len, max_head_size, off; struct sk_buff *skb = *pskb, *nskb; int err, i, head_off; void *data; /* XDP does not support fraglist so we need to linearize * the skb. 
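 * The rebuild below copies the skb into page_pool memory: up to one
 * page (minus @headroom) becomes the new linear head, and the
 * remainder is copied into page_pool pages attached as frags, so the
 * result can be passed to an XDP program that supports frags.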
*/ if (skb_has_frag_list(skb)) return -EOPNOTSUPP; max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - headroom); if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE) return -ENOMEM; size = min_t(u32, skb->len, max_head_size); truesize = SKB_HEAD_ALIGN(size) + headroom; data = page_pool_dev_alloc_va(pool, &truesize); if (!data) return -ENOMEM; nskb = napi_build_skb(data, truesize); if (!nskb) { page_pool_free_va(pool, data, true); return -ENOMEM; } skb_reserve(nskb, headroom); skb_copy_header(nskb, skb); skb_mark_for_recycle(nskb); err = skb_copy_bits(skb, 0, nskb->data, size); if (err) { consume_skb(nskb); return err; } skb_put(nskb, size); head_off = skb_headroom(nskb) - skb_headroom(skb); skb_headers_offset_update(nskb, head_off); off = size; len = skb->len - off; for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) { struct page *page; u32 page_off; size = min_t(u32, len, PAGE_SIZE); truesize = size; page = page_pool_dev_alloc(pool, &page_off, &truesize); if (!page) { consume_skb(nskb); return -ENOMEM; } skb_add_rx_frag(nskb, i, page, page_off, size, truesize); err = skb_copy_bits(skb, off, page_address(page) + page_off, size); if (err) { consume_skb(nskb); return err; } len -= size; off += size; } consume_skb(skb); *pskb = nskb; return 0; #else return -EOPNOTSUPP; #endif } EXPORT_SYMBOL(skb_pp_cow_data); int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, const struct bpf_prog *prog) { if (!prog->aux->xdp_has_frags) return -EINVAL; return skb_pp_cow_data(pool, pskb, XDP_PACKET_HEADROOM); } EXPORT_SYMBOL(skb_cow_data_for_xdp); #if IS_ENABLED(CONFIG_PAGE_POOL) bool napi_pp_put_page(netmem_ref netmem) { netmem = netmem_compound_head(netmem); if (unlikely(!netmem_is_pp(netmem))) return false; page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false); return true; } EXPORT_SYMBOL(napi_pp_put_page); #endif static bool skb_pp_recycle(struct sk_buff *skb, void *data) { if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) return false; return napi_pp_put_page(page_to_netmem(virt_to_page(data))); } /** * skb_pp_frag_ref() - Increase fragment references of a page pool aware skb * @skb: page pool aware skb * * Increase the fragment reference count (pp_ref_count) of a skb. This is * intended to gain fragment references only for page pool aware skbs, * i.e. when skb->pp_recycle is true, and not for fragments in a * non-pp-recycling skb. It has a fallback to increase references on normal * pages, as page pool aware skbs may also have normal page fragments. 
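 *
 * Returns 0 on success, or -EINVAL if @skb is not page pool aware
 * (pp_recycle not set).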
*/ static int skb_pp_frag_ref(struct sk_buff *skb) { struct skb_shared_info *shinfo; netmem_ref head_netmem; int i; if (!skb->pp_recycle) return -EINVAL; shinfo = skb_shinfo(skb); for (i = 0; i < shinfo->nr_frags; i++) { head_netmem = netmem_compound_head(shinfo->frags[i].netmem); if (likely(netmem_is_pp(head_netmem))) page_pool_ref_netmem(head_netmem); else page_ref_inc(netmem_to_page(head_netmem)); } return 0; } static void skb_kfree_head(void *head) { kfree(head); } static void skb_free_head(struct sk_buff *skb) { unsigned char *head = skb->head; if (skb->head_frag) { if (skb_pp_recycle(skb, head)) return; skb_free_frag(head); } else { skb_kfree_head(head); } } static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason) { struct skb_shared_info *shinfo = skb_shinfo(skb); int i; if (!skb_data_unref(skb, shinfo)) goto exit; if (skb_zcopy(skb)) { bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS; skb_zcopy_clear(skb, true); if (skip_unref) goto free_head; } for (i = 0; i < shinfo->nr_frags; i++) __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle); free_head: if (shinfo->frag_list) kfree_skb_list_reason(shinfo->frag_list, reason); skb_free_head(skb); exit: /* When we clone an SKB we copy the reycling bit. The pp_recycle * bit is only set on the head though, so in order to avoid races * while trying to recycle fragments on __skb_frag_unref() we need * to make one SKB responsible for triggering the recycle path. * So disable the recycling bit if an SKB is cloned and we have * additional references to the fragmented part of the SKB. * Eventually the last SKB will have the recycling bit set and it's * dataref set to 0, which will trigger the recycling */ skb->pp_recycle = 0; } /* * Free an skbuff by memory without cleaning the state. */ static void kfree_skbmem(struct sk_buff *skb) { struct sk_buff_fclones *fclones; switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: kmem_cache_free(net_hotdata.skbuff_cache, skb); return; case SKB_FCLONE_ORIG: fclones = container_of(skb, struct sk_buff_fclones, skb1); /* We usually free the clone (TX completion) before original skb * This test would have no chance to be true for the clone, * while here, branch prediction will be good. */ if (refcount_read(&fclones->fclone_ref) == 1) goto fastpath; break; default: /* SKB_FCLONE_CLONE */ fclones = container_of(skb, struct sk_buff_fclones, skb2); break; } if (!refcount_dec_and_test(&fclones->fclone_ref)) return; fastpath: kmem_cache_free(net_hotdata.skbuff_fclone_cache, fclones); } void skb_release_head_state(struct sk_buff *skb) { skb_dst_drop(skb); if (skb->destructor) { DEBUG_NET_WARN_ON_ONCE(in_hardirq()); #ifdef CONFIG_INET INDIRECT_CALL_4(skb->destructor, tcp_wfree, __sock_wfree, sock_wfree, xsk_destruct_skb, skb); #else INDIRECT_CALL_2(skb->destructor, sock_wfree, xsk_destruct_skb, skb); #endif skb->destructor = NULL; skb->sk = NULL; } nf_reset_ct(skb); skb_ext_reset(skb); } /* Free everything but the sk_buff shell. */ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason) { skb_release_head_state(skb); if (likely(skb->head)) skb_release_data(skb, reason); } /** * __kfree_skb - private function * @skb: buffer * * Free an sk_buff. Release anything attached to the buffer. * Clean the state. This is an internal helper function. 
Users should * always call kfree_skb */ void __kfree_skb(struct sk_buff *skb) { skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED); kfree_skbmem(skb); } EXPORT_SYMBOL(__kfree_skb); static __always_inline bool __sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason) { if (unlikely(!skb_unref(skb))) return false; DEBUG_NET_WARN_ON_ONCE(reason == SKB_NOT_DROPPED_YET || u32_get_bits(reason, SKB_DROP_REASON_SUBSYS_MASK) >= SKB_DROP_REASON_SUBSYS_NUM); if (reason == SKB_CONSUMED) trace_consume_skb(skb, __builtin_return_address(0)); else trace_kfree_skb(skb, __builtin_return_address(0), reason, sk); return true; } /** * sk_skb_reason_drop - free an sk_buff with special reason * @sk: the socket to receive @skb, or NULL if not applicable * @skb: buffer to free * @reason: reason why this skb is dropped * * Drop a reference to the buffer and free it if the usage count has hit * zero. Meanwhile, pass the receiving socket and drop reason to * 'kfree_skb' tracepoint. */ void __fix_address sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason) { if (__sk_skb_reason_drop(sk, skb, reason)) __kfree_skb(skb); } EXPORT_SYMBOL(sk_skb_reason_drop); #define KFREE_SKB_BULK_SIZE 16 struct skb_free_array { unsigned int skb_count; void *skb_array[KFREE_SKB_BULK_SIZE]; }; static void kfree_skb_add_bulk(struct sk_buff *skb, struct skb_free_array *sa, enum skb_drop_reason reason) { /* if SKB is a clone, don't handle this case */ if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) { __kfree_skb(skb); return; } skb_release_all(skb, reason); sa->skb_array[sa->skb_count++] = skb; if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) { kmem_cache_free_bulk(net_hotdata.skbuff_cache, KFREE_SKB_BULK_SIZE, sa->skb_array); sa->skb_count = 0; } } void __fix_address kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason) { struct skb_free_array sa; sa.skb_count = 0; while (segs) { struct sk_buff *next = segs->next; if (__sk_skb_reason_drop(NULL, segs, reason)) { skb_poison_list(segs); kfree_skb_add_bulk(segs, &sa, reason); } segs = next; } if (sa.skb_count) kmem_cache_free_bulk(net_hotdata.skbuff_cache, sa.skb_count, sa.skb_array); } EXPORT_SYMBOL(kfree_skb_list_reason); /* Dump skb information and contents. * * Must only be called from net_ratelimit()-ed paths. * * Dumps whole packets if full_pkt, only headers otherwise. */ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) { struct skb_shared_info *sh = skb_shinfo(skb); struct net_device *dev = skb->dev; struct sock *sk = skb->sk; struct sk_buff *list_skb; bool has_mac, has_trans; int headroom, tailroom; int i, len, seg_len; if (full_pkt) len = skb->len; else len = min_t(int, skb->len, MAX_HEADER + 128); headroom = skb_headroom(skb); tailroom = skb_tailroom(skb); has_mac = skb_mac_header_was_set(skb); has_trans = skb_transport_header_was_set(skb); printk("%sskb len=%u data_len=%u headroom=%u headlen=%u tailroom=%u\n" "end-tail=%u mac=(%d,%d) mac_len=%u net=(%d,%d) trans=%d\n" "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n" "csum(0x%x start=%u offset=%u ip_summed=%u complete_sw=%u valid=%u level=%u)\n" "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n" "priority=0x%x mark=0x%x alloc_cpu=%u vlan_all=0x%x\n" "encapsulation=%d inner(proto=0x%04x, mac=%u, net=%u, trans=%u)\n", level, skb->len, skb->data_len, headroom, skb_headlen(skb), tailroom, skb->end - skb->tail, has_mac ? skb->mac_header : -1, has_mac ? 
skb_mac_header_len(skb) : -1, skb->mac_len, skb->network_header, has_trans ? skb_network_header_len(skb) : -1, has_trans ? skb->transport_header : -1, sh->tx_flags, sh->nr_frags, sh->gso_size, sh->gso_type, sh->gso_segs, skb->csum, skb->csum_start, skb->csum_offset, skb->ip_summed, skb->csum_complete_sw, skb->csum_valid, skb->csum_level, skb->hash, skb->sw_hash, skb->l4_hash, ntohs(skb->protocol), skb->pkt_type, skb->skb_iif, skb->priority, skb->mark, skb->alloc_cpu, skb->vlan_all, skb->encapsulation, skb->inner_protocol, skb->inner_mac_header, skb->inner_network_header, skb->inner_transport_header); if (dev) printk("%sdev name=%s feat=%pNF\n", level, dev->name, &dev->features); if (sk) printk("%ssk family=%hu type=%u proto=%u\n", level, sk->sk_family, sk->sk_type, sk->sk_protocol); if (full_pkt && headroom) print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET, 16, 1, skb->head, headroom, false); seg_len = min_t(int, skb_headlen(skb), len); if (seg_len) print_hex_dump(level, "skb linear: ", DUMP_PREFIX_OFFSET, 16, 1, skb->data, seg_len, false); len -= seg_len; if (full_pkt && tailroom) print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET, 16, 1, skb_tail_pointer(skb), tailroom, false); for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; u32 p_off, p_len, copied; struct page *p; u8 *vaddr; if (skb_frag_is_net_iov(frag)) { printk("%sskb frag %d: not readable\n", level, i); len -= skb_frag_size(frag); if (!len) break; continue; } skb_frag_foreach_page(frag, skb_frag_off(frag), skb_frag_size(frag), p, p_off, p_len, copied) { seg_len = min_t(int, p_len, len); vaddr = kmap_atomic(p); print_hex_dump(level, "skb frag: ", DUMP_PREFIX_OFFSET, 16, 1, vaddr + p_off, seg_len, false); kunmap_atomic(vaddr); len -= seg_len; if (!len) break; } } if (full_pkt && skb_has_frag_list(skb)) { printk("skb fraglist:\n"); skb_walk_frags(skb, list_skb) skb_dump(level, list_skb, true); } } EXPORT_SYMBOL(skb_dump); /** * skb_tx_error - report an sk_buff xmit error * @skb: buffer that triggered an error * * Report xmit error if a device callback is tracking this skb. * skb must be freed afterwards. 
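 *
 * Illustrative driver error-path usage (a sketch; dev and addr are
 * assumptions of this example):
 *
 *	if (dma_mapping_error(dev, addr)) {
 *		skb_tx_error(skb);
 *		dev_kfree_skb_any(skb);
 *		return NETDEV_TX_OK;
 *	}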
*/ void skb_tx_error(struct sk_buff *skb) { if (skb) { skb_zcopy_downgrade_managed(skb); skb_zcopy_clear(skb, true); } } EXPORT_SYMBOL(skb_tx_error); #ifdef CONFIG_TRACEPOINTS /** * consume_skb - free an skbuff * @skb: buffer to free * * Drop a ref to the buffer and free it if the usage count has hit zero * Functions identically to kfree_skb, but kfree_skb assumes that the frame * is being dropped after a failure and notes that */ void consume_skb(struct sk_buff *skb) { if (!skb_unref(skb)) return; trace_consume_skb(skb, __builtin_return_address(0)); __kfree_skb(skb); } EXPORT_SYMBOL(consume_skb); #endif /** * __consume_stateless_skb - free an skbuff, assuming it is stateless * @skb: buffer to free * * Alike consume_skb(), but this variant assumes that this is the last * skb reference and all the head states have been already dropped */ void __consume_stateless_skb(struct sk_buff *skb) { trace_consume_skb(skb, __builtin_return_address(0)); skb_release_data(skb, SKB_CONSUMED); kfree_skbmem(skb); } static void napi_skb_cache_put(struct sk_buff *skb) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); if (!kasan_mempool_poison_object(skb)) return; local_lock_nested_bh(&napi_alloc_cache.bh_lock); nc->skb_cache[nc->skb_count++] = skb; if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { u32 i, remaining = NAPI_SKB_CACHE_SIZE - NAPI_SKB_CACHE_FREE; for (i = remaining; i < NAPI_SKB_CACHE_SIZE; i++) kasan_mempool_unpoison_object(nc->skb_cache[i], skbuff_cache_size); kmem_cache_free_bulk(net_hotdata.skbuff_cache, NAPI_SKB_CACHE_FREE, nc->skb_cache + remaining); nc->skb_count = remaining; } local_unlock_nested_bh(&napi_alloc_cache.bh_lock); } void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason) { skb_release_all(skb, reason); napi_skb_cache_put(skb); } void napi_skb_free_stolen_head(struct sk_buff *skb) { if (unlikely(skb->slow_gro)) { nf_reset_ct(skb); skb_dst_drop(skb); skb_ext_put(skb); skb_orphan(skb); skb->slow_gro = 0; } napi_skb_cache_put(skb); } /** * napi_consume_skb() - consume skb in NAPI context, try to feed skb cache * @skb: buffer to free * @budget: NAPI budget * * Non-zero @budget must come from the @budget argument passed by the core * to a NAPI poll function. Note that core may pass budget of 0 to NAPI poll * for example when polling for netpoll / netconsole. * * Passing @budget of 0 is safe from any context, it turns this function * into dev_consume_skb_any(). 
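 *
 * Illustrative Tx-completion usage from within a driver's NAPI poll
 * callback (a sketch; tx_buf is an assumption, not a name from this
 * file):
 *
 *	napi_consume_skb(tx_buf->skb, budget);
 *	tx_buf->skb = NULL;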
*/ void napi_consume_skb(struct sk_buff *skb, int budget) { if (unlikely(!budget || !skb)) { dev_consume_skb_any(skb); return; } DEBUG_NET_WARN_ON_ONCE(!in_softirq()); if (!static_branch_unlikely(&skb_defer_disable_key) && skb->alloc_cpu != smp_processor_id() && !skb_shared(skb)) { skb_release_head_state(skb); return skb_attempt_defer_free(skb); } if (!skb_unref(skb)) return; /* if reaching here SKB is ready to free */ trace_consume_skb(skb, __builtin_return_address(0)); /* if SKB is a clone, don't handle this case */ if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { __kfree_skb(skb); return; } skb_release_all(skb, SKB_CONSUMED); napi_skb_cache_put(skb); } EXPORT_SYMBOL(napi_consume_skb); /* Make sure a field is contained by headers group */ #define CHECK_SKB_FIELD(field) \ BUILD_BUG_ON(offsetof(struct sk_buff, field) != \ offsetof(struct sk_buff, headers.field)); \ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) { new->tstamp = old->tstamp; /* We do not copy old->sk */ new->dev = old->dev; memcpy(new->cb, old->cb, sizeof(old->cb)); skb_dst_copy(new, old); __skb_ext_copy(new, old); __nf_copy(new, old, false); /* Note : this field could be in the headers group. * It is not yet because we do not want to have a 16 bit hole */ new->queue_mapping = old->queue_mapping; memcpy(&new->headers, &old->headers, sizeof(new->headers)); CHECK_SKB_FIELD(protocol); CHECK_SKB_FIELD(csum); CHECK_SKB_FIELD(hash); CHECK_SKB_FIELD(priority); CHECK_SKB_FIELD(skb_iif); CHECK_SKB_FIELD(vlan_proto); CHECK_SKB_FIELD(vlan_tci); CHECK_SKB_FIELD(transport_header); CHECK_SKB_FIELD(network_header); CHECK_SKB_FIELD(mac_header); CHECK_SKB_FIELD(inner_protocol); CHECK_SKB_FIELD(inner_transport_header); CHECK_SKB_FIELD(inner_network_header); CHECK_SKB_FIELD(inner_mac_header); CHECK_SKB_FIELD(mark); #ifdef CONFIG_NETWORK_SECMARK CHECK_SKB_FIELD(secmark); #endif #ifdef CONFIG_NET_RX_BUSY_POLL CHECK_SKB_FIELD(napi_id); #endif CHECK_SKB_FIELD(alloc_cpu); #ifdef CONFIG_XPS CHECK_SKB_FIELD(sender_cpu); #endif #ifdef CONFIG_NET_SCHED CHECK_SKB_FIELD(tc_index); #endif } /* * You should not add any new code to this function. Add it to * __copy_skb_header above instead. */ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) { #define C(x) n->x = skb->x n->next = n->prev = NULL; n->sk = NULL; __copy_skb_header(n, skb); C(len); C(data_len); C(mac_len); n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; n->cloned = 1; n->nohdr = 0; n->peeked = 0; C(pfmemalloc); C(pp_recycle); n->destructor = NULL; C(tail); C(end); C(head); C(head_frag); C(data); C(truesize); refcount_set(&n->users, 1); atomic_inc(&(skb_shinfo(skb)->dataref)); skb->cloned = 1; return n; #undef C } /** * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg * @first: first sk_buff of the msg */ struct sk_buff *alloc_skb_for_msg(struct sk_buff *first) { struct sk_buff *n; n = alloc_skb(0, GFP_ATOMIC); if (!n) return NULL; n->len = first->len; n->data_len = first->len; n->truesize = first->truesize; skb_shinfo(n)->frag_list = first; __copy_skb_header(n, first); n->destructor = NULL; return n; } EXPORT_SYMBOL_GPL(alloc_skb_for_msg); /** * skb_morph - morph one skb into another * @dst: the skb to receive the contents * @src: the skb to supply the contents * * This is identical to skb_clone except that the target skb is * supplied by the user. * * The target skb is returned upon exit. 
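 *
 * (Fragment reassembly is a typical user: it morphs a caller-supplied
 * skb into the head of the reassembled packet.)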
*/ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) { skb_release_all(dst, SKB_CONSUMED); return __skb_clone(dst, src); } EXPORT_SYMBOL_GPL(skb_morph); int mm_account_pinned_pages(struct mmpin *mmp, size_t size) { unsigned long max_pg, num_pg, new_pg, old_pg, rlim; struct user_struct *user; if (capable(CAP_IPC_LOCK) || !size) return 0; rlim = rlimit(RLIMIT_MEMLOCK); if (rlim == RLIM_INFINITY) return 0; num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */ max_pg = rlim >> PAGE_SHIFT; user = mmp->user ? : current_user(); old_pg = atomic_long_read(&user->locked_vm); do { new_pg = old_pg + num_pg; if (new_pg > max_pg) return -ENOBUFS; } while (!atomic_long_try_cmpxchg(&user->locked_vm, &old_pg, new_pg)); if (!mmp->user) { mmp->user = get_uid(user); mmp->num_pg = num_pg; } else { mmp->num_pg += num_pg; } return 0; } EXPORT_SYMBOL_GPL(mm_account_pinned_pages); void mm_unaccount_pinned_pages(struct mmpin *mmp) { if (mmp->user) { atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm); free_uid(mmp->user); } } EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size, bool devmem) { struct ubuf_info_msgzc *uarg; struct sk_buff *skb; WARN_ON_ONCE(!in_task()); skb = sock_omalloc(sk, 0, GFP_KERNEL); if (!skb) return NULL; BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb)); uarg = (void *)skb->cb; uarg->mmp.user = NULL; if (likely(!devmem) && mm_account_pinned_pages(&uarg->mmp, size)) { kfree_skb(skb); return NULL; } uarg->ubuf.ops = &msg_zerocopy_ubuf_ops; uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; uarg->len = 1; uarg->bytelen = size; uarg->zerocopy = 1; uarg->ubuf.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; refcount_set(&uarg->ubuf.refcnt, 1); sock_hold(sk); return &uarg->ubuf; } static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg) { return container_of((void *)uarg, struct sk_buff, cb); } struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, struct ubuf_info *uarg, bool devmem) { if (uarg) { struct ubuf_info_msgzc *uarg_zc; const u32 byte_limit = 1 << 19; /* limit to a few TSO */ u32 bytelen, next; /* there might be non MSG_ZEROCOPY users */ if (uarg->ops != &msg_zerocopy_ubuf_ops) return NULL; /* realloc only when socket is locked (TCP, UDP cork), * so uarg->len and sk_zckey access is serialized */ if (!sock_owned_by_user(sk)) { WARN_ON_ONCE(1); return NULL; } uarg_zc = uarg_to_msgzc(uarg); bytelen = uarg_zc->bytelen + size; if (uarg_zc->len == USHRT_MAX - 1 || bytelen > byte_limit) { /* TCP can create new skb to attach new uarg */ if (sk->sk_type == SOCK_STREAM) goto new_alloc; return NULL; } next = (u32)atomic_read(&sk->sk_zckey); if ((u32)(uarg_zc->id + uarg_zc->len) == next) { if (likely(!devmem) && mm_account_pinned_pages(&uarg_zc->mmp, size)) return NULL; uarg_zc->len++; uarg_zc->bytelen = bytelen; atomic_set(&sk->sk_zckey, ++next); /* no extra ref when appending to datagram (MSG_MORE) */ if (sk->sk_type == SOCK_STREAM) net_zcopy_get(uarg); return uarg; } } new_alloc: return msg_zerocopy_alloc(sk, size, devmem); } EXPORT_SYMBOL_GPL(msg_zerocopy_realloc); static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len) { struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); u32 old_lo, old_hi; u64 sum_len; old_lo = serr->ee.ee_info; old_hi = serr->ee.ee_data; sum_len = old_hi - old_lo + 1ULL + len; if (sum_len >= (1ULL << 32)) return false; if (lo != old_hi + 1) return false; serr->ee.ee_data += len; return true; } static void __msg_zerocopy_callback(struct 
ubuf_info_msgzc *uarg) { struct sk_buff *tail, *skb = skb_from_uarg(uarg); struct sock_exterr_skb *serr; struct sock *sk = skb->sk; struct sk_buff_head *q; unsigned long flags; bool is_zerocopy; u32 lo, hi; u16 len; mm_unaccount_pinned_pages(&uarg->mmp); /* if !len, there was only 1 call, and it was aborted * so do not queue a completion notification */ if (!uarg->len || sock_flag(sk, SOCK_DEAD)) goto release; len = uarg->len; lo = uarg->id; hi = uarg->id + len - 1; is_zerocopy = uarg->zerocopy; serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = 0; serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY; serr->ee.ee_data = hi; serr->ee.ee_info = lo; if (!is_zerocopy) serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED; q = &sk->sk_error_queue; spin_lock_irqsave(&q->lock, flags); tail = skb_peek_tail(q); if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY || !skb_zerocopy_notify_extend(tail, lo, len)) { __skb_queue_tail(q, skb); skb = NULL; } spin_unlock_irqrestore(&q->lock, flags); sk_error_report(sk); release: consume_skb(skb); sock_put(sk); } static void msg_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *uarg, bool success) { struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg); uarg_zc->zerocopy = uarg_zc->zerocopy & success; if (refcount_dec_and_test(&uarg->refcnt)) __msg_zerocopy_callback(uarg_zc); } void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) { struct sock *sk = skb_from_uarg(uarg_to_msgzc(uarg))->sk; atomic_dec(&sk->sk_zckey); uarg_to_msgzc(uarg)->len--; if (have_uref) msg_zerocopy_complete(NULL, uarg, true); } EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort); const struct ubuf_info_ops msg_zerocopy_ubuf_ops = { .complete = msg_zerocopy_complete, }; EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops); int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg, struct net_devmem_dmabuf_binding *binding) { int err, orig_len = skb->len; if (uarg->ops->link_skb) { err = uarg->ops->link_skb(skb, uarg); if (err) return err; } else { struct ubuf_info *orig_uarg = skb_zcopy(skb); /* An skb can only point to one uarg. This edge case happens * when TCP appends to an skb, but zerocopy_realloc triggered * a new alloc. */ if (orig_uarg && uarg != orig_uarg) return -EEXIST; } err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len, binding); if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { struct sock *save_sk = skb->sk; /* Streams do not free skb on error. Reset to prev state. 
*/ iov_iter_revert(&msg->msg_iter, skb->len - orig_len); skb->sk = sk; ___pskb_trim(skb, orig_len); skb->sk = save_sk; return err; } skb_zcopy_set(skb, uarg, NULL); return skb->len - orig_len; } EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); void __skb_zcopy_downgrade_managed(struct sk_buff *skb) { int i; skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_frag_ref(skb, i); } EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed); static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, gfp_t gfp_mask) { if (skb_zcopy(orig)) { if (skb_zcopy(nskb)) { /* !gfp_mask callers are verified to !skb_zcopy(nskb) */ if (!gfp_mask) { WARN_ON_ONCE(1); return -ENOMEM; } if (skb_uarg(nskb) == skb_uarg(orig)) return 0; if (skb_copy_ubufs(nskb, GFP_ATOMIC)) return -EIO; } skb_zcopy_set(nskb, skb_uarg(orig), NULL); } return 0; } /** * skb_copy_ubufs - copy userspace skb frags buffers to kernel * @skb: the skb to modify * @gfp_mask: allocation priority * * This must be called on skb with SKBFL_ZEROCOPY_ENABLE. * It will copy all frags into kernel and drop the reference * to userspace pages. * * If this function is called from an interrupt gfp_mask() must be * %GFP_ATOMIC. * * Returns 0 on success or a negative error code on failure * to allocate kernel memory to copy to. */ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) { int num_frags = skb_shinfo(skb)->nr_frags; struct page *page, *head = NULL; int i, order, psize, new_frags; u32 d_off; if (skb_shared(skb) || skb_unclone(skb, gfp_mask)) return -EINVAL; if (!skb_frags_readable(skb)) return -EFAULT; if (!num_frags) goto release; /* We might have to allocate high order pages, so compute what minimum * page order is needed. */ order = 0; while ((PAGE_SIZE << order) * MAX_SKB_FRAGS < __skb_pagelen(skb)) order++; psize = (PAGE_SIZE << order); new_frags = (__skb_pagelen(skb) + psize - 1) >> (PAGE_SHIFT + order); for (i = 0; i < new_frags; i++) { page = alloc_pages(gfp_mask | __GFP_COMP, order); if (!page) { while (head) { struct page *next = (struct page *)page_private(head); put_page(head); head = next; } return -ENOMEM; } set_page_private(page, (unsigned long)head); head = page; } page = head; d_off = 0; for (i = 0; i < num_frags; i++) { skb_frag_t *f = &skb_shinfo(skb)->frags[i]; u32 p_off, p_len, copied; struct page *p; u8 *vaddr; skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f), p, p_off, p_len, copied) { u32 copy, done = 0; vaddr = kmap_atomic(p); while (done < p_len) { if (d_off == psize) { d_off = 0; page = (struct page *)page_private(page); } copy = min_t(u32, psize - d_off, p_len - done); memcpy(page_address(page) + d_off, vaddr + p_off + done, copy); done += copy; d_off += copy; } kunmap_atomic(vaddr); } } /* skb frags release userspace buffers */ for (i = 0; i < num_frags; i++) skb_frag_unref(skb, i); /* skb frags point to kernel buffers */ for (i = 0; i < new_frags - 1; i++) { __skb_fill_netmem_desc(skb, i, page_to_netmem(head), 0, psize); head = (struct page *)page_private(head); } __skb_fill_netmem_desc(skb, new_frags - 1, page_to_netmem(head), 0, d_off); skb_shinfo(skb)->nr_frags = new_frags; release: skb_zcopy_clear(skb, false); return 0; } EXPORT_SYMBOL_GPL(skb_copy_ubufs); /** * skb_clone - duplicate an sk_buff * @skb: buffer to clone * @gfp_mask: allocation priority * * Duplicate an &sk_buff. The new one is not owned by a socket. Both * copies share the same packet data but not structure. The new * buffer has a reference count of 1. 
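 *
 * Illustrative usage (a sketch):
 *
 *	nskb = skb_clone(skb, GFP_ATOMIC);
 *	if (!nskb)
 *		goto drop;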
If the allocation fails the * function returns %NULL otherwise the new buffer is returned. * * If this function is called from an interrupt gfp_mask() must be * %GFP_ATOMIC. */ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) { struct sk_buff_fclones *fclones = container_of(skb, struct sk_buff_fclones, skb1); struct sk_buff *n; if (skb_orphan_frags(skb, gfp_mask)) return NULL; if (skb->fclone == SKB_FCLONE_ORIG && refcount_read(&fclones->fclone_ref) == 1) { n = &fclones->skb2; refcount_set(&fclones->fclone_ref, 2); n->fclone = SKB_FCLONE_CLONE; } else { if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; n = kmem_cache_alloc(net_hotdata.skbuff_cache, gfp_mask); if (!n) return NULL; n->fclone = SKB_FCLONE_UNAVAILABLE; } return __skb_clone(n, skb); } EXPORT_SYMBOL(skb_clone); void skb_headers_offset_update(struct sk_buff *skb, int off) { /* Only adjust this if it actually is csum_start rather than csum */ if (skb->ip_summed == CHECKSUM_PARTIAL) skb->csum_start += off; /* {transport,network,mac}_header and tail are relative to skb->head */ skb->transport_header += off; skb->network_header += off; if (skb_mac_header_was_set(skb)) skb->mac_header += off; skb->inner_transport_header += off; skb->inner_network_header += off; skb->inner_mac_header += off; } EXPORT_SYMBOL(skb_headers_offset_update); void skb_copy_header(struct sk_buff *new, const struct sk_buff *old) { __copy_skb_header(new, old); skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; } EXPORT_SYMBOL(skb_copy_header); static inline int skb_alloc_rx_flag(const struct sk_buff *skb) { if (skb_pfmemalloc(skb)) return SKB_ALLOC_RX; return 0; } /** * skb_copy - create private copy of an sk_buff * @skb: buffer to copy * @gfp_mask: allocation priority * * Make a copy of both an &sk_buff and its data. This is used when the * caller wishes to modify the data and needs a private copy of the * data to alter. Returns %NULL on failure or the pointer to the buffer * on success. The returned buffer has a reference count of 1. * * As by-product this function converts non-linear &sk_buff to linear * one, so that &sk_buff becomes completely private and caller is allowed * to modify all the data of returned buffer. This means that this * function is not recommended for use in circumstances when only * header is going to be modified. Use pskb_copy() instead. */ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) { struct sk_buff *n; unsigned int size; int headerlen; if (!skb_frags_readable(skb)) return NULL; if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) return NULL; headerlen = skb_headroom(skb); size = skb_end_offset(skb) + skb->data_len; n = __alloc_skb(size, gfp_mask, skb_alloc_rx_flag(skb), NUMA_NO_NODE); if (!n) return NULL; /* Set the data pointer */ skb_reserve(n, headerlen); /* Set the tail pointer and length */ skb_put(n, skb->len); BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)); skb_copy_header(n, skb); return n; } EXPORT_SYMBOL(skb_copy); /** * __pskb_copy_fclone - create copy of an sk_buff with private head. * @skb: buffer to copy * @headroom: headroom of new skb * @gfp_mask: allocation priority * @fclone: if true allocate the copy of the skb from the fclone * cache instead of the head cache; it is recommended to set this * to true for the cases where the copy will likely be cloned * * Make a copy of both an &sk_buff and part of its data, located * in header. 
Fragmented data remain shared. This is used when * the caller wishes to modify only header of &sk_buff and needs * private copy of the header to alter. Returns %NULL on failure * or the pointer to the buffer on success. * The returned buffer has a reference count of 1. */ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, gfp_t gfp_mask, bool fclone) { unsigned int size = skb_headlen(skb) + headroom; int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0); struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE); if (!n) goto out; /* Set the data pointer */ skb_reserve(n, headroom); /* Set the tail pointer and length */ skb_put(n, skb_headlen(skb)); /* Copy the bytes */ skb_copy_from_linear_data(skb, n->data, n->len); n->truesize += skb->data_len; n->data_len = skb->data_len; n->len = skb->len; if (skb_shinfo(skb)->nr_frags) { int i; if (skb_orphan_frags(skb, gfp_mask) || skb_zerocopy_clone(n, skb, gfp_mask)) { kfree_skb(n); n = NULL; goto out; } for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; skb_frag_ref(skb, i); } skb_shinfo(n)->nr_frags = i; } if (skb_has_frag_list(skb)) { skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; skb_clone_fraglist(n); } skb_copy_header(n, skb); out: return n; } EXPORT_SYMBOL(__pskb_copy_fclone); /** * pskb_expand_head - reallocate header of &sk_buff * @skb: buffer to reallocate * @nhead: room to add at head * @ntail: room to add at tail * @gfp_mask: allocation priority * * Expands (or creates identical copy, if @nhead and @ntail are zero) * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have * reference count of 1. Returns zero in the case of success or error, * if expansion failed. In the last case, &sk_buff is not changed. * * All the pointers pointing into skb header may change and must be * reloaded after call to this function. * * Note: If you skb_push() the start of the buffer after reallocating the * header, call skb_postpush_data_move() first to move the metadata out of * the way before writing to &sk_buff->data. */ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask) { unsigned int osize = skb_end_offset(skb); unsigned int size = osize + nhead + ntail; long off; u8 *data; int i; BUG_ON(nhead < 0); BUG_ON(skb_shared(skb)); skb_zcopy_downgrade_managed(skb); if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); if (!data) goto nodata; size = SKB_WITH_OVERHEAD(size); /* Copy only real data... and, alas, header. This should be * optimized for the cases when header is void. 
*/ memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head); memcpy((struct skb_shared_info *)(data + size), skb_shinfo(skb), offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); /* * if shinfo is shared we must drop the old head gracefully, but if it * is not we can just drop the old head and let the existing refcount * be since all we did is relocate the values */ if (skb_cloned(skb)) { if (skb_orphan_frags(skb, gfp_mask)) goto nofrags; if (skb_zcopy(skb)) refcount_inc(&skb_uarg(skb)->refcnt); for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_frag_ref(skb, i); if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); skb_release_data(skb, SKB_CONSUMED); } else { skb_free_head(skb); } off = (data + nhead) - skb->head; skb->head = data; skb->head_frag = 0; skb->data += off; skb_set_end_offset(skb, size); #ifdef NET_SKBUFF_DATA_USES_OFFSET off = nhead; #endif skb->tail += off; skb_headers_offset_update(skb, nhead); skb->cloned = 0; skb->hdr_len = 0; skb->nohdr = 0; atomic_set(&skb_shinfo(skb)->dataref, 1); /* It is not generally safe to change skb->truesize. * For the moment, we really care of rx path, or * when skb is orphaned (not attached to a socket). */ if (!skb->sk || skb->destructor == sock_edemux) skb->truesize += size - osize; return 0; nofrags: skb_kfree_head(data); nodata: return -ENOMEM; } EXPORT_SYMBOL(pskb_expand_head); /* Make private copy of skb with writable head and some headroom */ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) { struct sk_buff *skb2; int delta = headroom - skb_headroom(skb); if (delta <= 0) skb2 = pskb_copy(skb, GFP_ATOMIC); else { skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) { kfree_skb(skb2); skb2 = NULL; } } return skb2; } EXPORT_SYMBOL(skb_realloc_headroom); /* Note: We plan to rework this in linux-6.4 */ int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) { unsigned int saved_end_offset, saved_truesize; struct skb_shared_info *shinfo; int res; saved_end_offset = skb_end_offset(skb); saved_truesize = skb->truesize; res = pskb_expand_head(skb, 0, 0, pri); if (res) return res; skb->truesize = saved_truesize; if (likely(skb_end_offset(skb) == saved_end_offset)) return 0; shinfo = skb_shinfo(skb); /* We are about to change back skb->end, * we need to move skb_shinfo() to its new location. */ memmove(skb->head + saved_end_offset, shinfo, offsetof(struct skb_shared_info, frags[shinfo->nr_frags])); skb_set_end_offset(skb, saved_end_offset); return 0; } /** * skb_expand_head - reallocate header of &sk_buff * @skb: buffer to reallocate * @headroom: needed headroom * * Unlike skb_realloc_headroom, this one does not allocate a new skb * if possible; copies skb->sk to new skb as needed * and frees original skb in case of failures. * * It expect increased headroom and generates warning otherwise. */ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom) { int delta = headroom - skb_headroom(skb); int osize = skb_end_offset(skb); struct sock *sk = skb->sk; if (WARN_ONCE(delta <= 0, "%s is expecting an increase in the headroom", __func__)) return skb; delta = SKB_DATA_ALIGN(delta); /* pskb_expand_head() might crash, if skb is shared. 
*/ if (skb_shared(skb) || !is_skb_wmem(skb)) { struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) goto fail; if (sk) skb_set_owner_w(nskb, sk); consume_skb(skb); skb = nskb; } if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) goto fail; if (sk && is_skb_wmem(skb)) { delta = skb_end_offset(skb) - osize; refcount_add(delta, &sk->sk_wmem_alloc); skb->truesize += delta; } return skb; fail: kfree_skb(skb); return NULL; } EXPORT_SYMBOL(skb_expand_head); /** * skb_copy_expand - copy and expand sk_buff * @skb: buffer to copy * @newheadroom: new free bytes at head * @newtailroom: new free bytes at tail * @gfp_mask: allocation priority * * Make a copy of both an &sk_buff and its data and while doing so * allocate additional space. * * This is used when the caller wishes to modify the data and needs a * private copy of the data to alter as well as more space for new fields. * Returns %NULL on failure or the pointer to the buffer * on success. The returned buffer has a reference count of 1. * * You must pass %GFP_ATOMIC as the allocation priority if this function * is called from an interrupt. */ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom, int newtailroom, gfp_t gfp_mask) { /* * Allocate the copy buffer */ int head_copy_len, head_copy_off; struct sk_buff *n; int oldheadroom; if (!skb_frags_readable(skb)) return NULL; if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) return NULL; oldheadroom = skb_headroom(skb); n = __alloc_skb(newheadroom + skb->len + newtailroom, gfp_mask, skb_alloc_rx_flag(skb), NUMA_NO_NODE); if (!n) return NULL; skb_reserve(n, newheadroom); /* Set the tail pointer and length */ skb_put(n, skb->len); head_copy_len = oldheadroom; head_copy_off = 0; if (newheadroom <= head_copy_len) head_copy_len = newheadroom; else head_copy_off = newheadroom - head_copy_len; /* Copy the linear header and data. */ BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, skb->len + head_copy_len)); skb_copy_header(n, skb); skb_headers_offset_update(n, newheadroom - oldheadroom); return n; } EXPORT_SYMBOL(skb_copy_expand); /** * __skb_pad - zero pad the tail of an skb * @skb: buffer to pad * @pad: space to pad * @free_on_error: free buffer on error * * Ensure that a buffer is followed by a padding area that is zero * filled. Used by network drivers which may DMA or transfer data * beyond the buffer end onto the wire. * * May return error in out of memory cases. The skb is freed on error * if @free_on_error is true. */ int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error) { int err; int ntail; /* If the skbuff is non linear tailroom is always zero.. */ if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) { memset(skb->data+skb->len, 0, pad); return 0; } ntail = skb->data_len + pad - (skb->end - skb->tail); if (likely(skb_cloned(skb) || ntail > 0)) { err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC); if (unlikely(err)) goto free_skb; } /* FIXME: The use of this function with non-linear skb's really needs * to be audited. */ err = skb_linearize(skb); if (unlikely(err)) goto free_skb; memset(skb->data + skb->len, 0, pad); return 0; free_skb: if (free_on_error) kfree_skb(skb); return err; } EXPORT_SYMBOL(__skb_pad); /** * pskb_put - add data to the tail of a potentially fragmented buffer * @skb: start of the buffer to use * @tail: tail fragment of the buffer to use * @len: amount of data to add * * This function extends the used data area of the potentially * fragmented buffer. 
@tail must be the last fragment of @skb -- or * @skb itself. If this would exceed the total buffer size the kernel * will panic. A pointer to the first byte of the extra data is * returned. */ void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) { if (tail != skb) { skb->data_len += len; skb->len += len; } return skb_put(tail, len); } EXPORT_SYMBOL_GPL(pskb_put); /** * skb_put - add data to a buffer * @skb: buffer to use * @len: amount of data to add * * This function extends the used data area of the buffer. If this would * exceed the total buffer size the kernel will panic. A pointer to the * first byte of the extra data is returned. */ void *skb_put(struct sk_buff *skb, unsigned int len) { void *tmp = skb_tail_pointer(skb); SKB_LINEAR_ASSERT(skb); skb->tail += len; skb->len += len; if (unlikely(skb->tail > skb->end)) skb_over_panic(skb, len, __builtin_return_address(0)); return tmp; } EXPORT_SYMBOL(skb_put); /** * skb_push - add data to the start of a buffer * @skb: buffer to use * @len: amount of data to add * * This function extends the used data area of the buffer at the buffer * start. If this would exceed the total buffer headroom the kernel will * panic. A pointer to the first byte of the extra data is returned. */ void *skb_push(struct sk_buff *skb, unsigned int len) { skb->data -= len; skb->len += len; if (unlikely(skb->data < skb->head)) skb_under_panic(skb, len, __builtin_return_address(0)); return skb->data; } EXPORT_SYMBOL(skb_push); /** * skb_pull - remove data from the start of a buffer * @skb: buffer to use * @len: amount of data to remove * * This function removes data from the start of a buffer, returning * the memory to the headroom. A pointer to the next data in the buffer * is returned. Once the data has been pulled future pushes will overwrite * the old data. */ void *skb_pull(struct sk_buff *skb, unsigned int len) { return skb_pull_inline(skb, len); } EXPORT_SYMBOL(skb_pull); /** * skb_pull_data - remove data from the start of a buffer returning its * original position. * @skb: buffer to use * @len: amount of data to remove * * This function removes data from the start of a buffer, returning * the memory to the headroom. A pointer to the original data in the buffer * is returned after checking if there is enough data to pull. Once the * data has been pulled future pushes will overwrite the old data. */ void *skb_pull_data(struct sk_buff *skb, size_t len) { void *data = skb->data; if (skb->len < len) return NULL; skb_pull(skb, len); return data; } EXPORT_SYMBOL(skb_pull_data); /** * skb_trim - remove end from a buffer * @skb: buffer to alter * @len: new length * * Cut the length of a buffer down by removing data from the tail. If * the buffer is already under the length specified it is not modified. * The skb must be linear. */ void skb_trim(struct sk_buff *skb, unsigned int len) { if (skb->len > len) __skb_trim(skb, len); } EXPORT_SYMBOL(skb_trim); /* Trims skb to length len. It can change skb pointers. 
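 * Most callers should use pskb_trim() (or pskb_trim_rcsum() when the
 * receive checksum must be kept consistent), which handle the simple
 * linear case inline and fall back to this slow path otherwise.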
*/ int ___pskb_trim(struct sk_buff *skb, unsigned int len) { struct sk_buff **fragp; struct sk_buff *frag; int offset = skb_headlen(skb); int nfrags = skb_shinfo(skb)->nr_frags; int i; int err; if (skb_cloned(skb) && unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))) return err; i = 0; if (offset >= len) goto drop_pages; for (; i < nfrags; i++) { int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]); if (end < len) { offset = end; continue; } skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset); drop_pages: skb_shinfo(skb)->nr_frags = i; for (; i < nfrags; i++) skb_frag_unref(skb, i); if (skb_has_frag_list(skb)) skb_drop_fraglist(skb); goto done; } for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp); fragp = &frag->next) { int end = offset + frag->len; if (skb_shared(frag)) { struct sk_buff *nfrag; nfrag = skb_clone(frag, GFP_ATOMIC); if (unlikely(!nfrag)) return -ENOMEM; nfrag->next = frag->next; consume_skb(frag); frag = nfrag; *fragp = frag; } if (end < len) { offset = end; continue; } if (end > len && unlikely((err = pskb_trim(frag, len - offset)))) return err; if (frag->next) skb_drop_list(&frag->next); break; } done: if (len > skb_headlen(skb)) { skb->data_len -= skb->len - len; skb->len = len; } else { skb->len = len; skb->data_len = 0; skb_set_tail_pointer(skb, len); } if (!skb->sk || skb->destructor == sock_edemux) skb_condense(skb); return 0; } EXPORT_SYMBOL(___pskb_trim); /* Note : use pskb_trim_rcsum() instead of calling this directly */ int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) { if (skb->ip_summed == CHECKSUM_COMPLETE) { int delta = skb->len - len; skb->csum = csum_block_sub(skb->csum, skb_checksum(skb, len, delta, 0), len); } else if (skb->ip_summed == CHECKSUM_PARTIAL) { int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len; int offset = skb_checksum_start_offset(skb) + skb->csum_offset; if (offset + sizeof(__sum16) > hdlen) return -EINVAL; } return __pskb_trim(skb, len); } EXPORT_SYMBOL(pskb_trim_rcsum_slow); /** * __pskb_pull_tail - advance tail of skb header * @skb: buffer to reallocate * @delta: number of bytes to advance tail * * The function makes a sense only on a fragmented &sk_buff, * it expands header moving its tail forward and copying necessary * data from fragmented part. * * &sk_buff MUST have reference count of 1. * * Returns %NULL (and &sk_buff does not change) if pull failed * or value of new tail of skb in the case of success. * * All the pointers pointing into skb header may change and must be * reloaded after call to this function. */ /* Moves tail of skb head forward, copying data from fragmented part, * when it is necessary. * 1. It may fail due to malloc failure. * 2. It may change skb pointers. * * It is pretty complicated. Luckily, it is called only in exceptional cases. */ void *__pskb_pull_tail(struct sk_buff *skb, int delta) { /* If skb has not enough free space at tail, get new one * plus 128 bytes for future expansions. If we have enough * room at tail, reallocate without expansion only if skb is cloned. */ int i, k, eat = (skb->tail + delta) - skb->end; if (!skb_frags_readable(skb)) return NULL; if (eat > 0 || skb_cloned(skb)) { if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, GFP_ATOMIC)) return NULL; } BUG_ON(skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta)); /* Optimization: no fragments, no reasons to preestimate * size of pulled pages. Superb. */ if (!skb_has_frag_list(skb)) goto pull_pages; /* Estimate size of pulled pages. 
*/ eat = delta; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); if (size >= eat) goto pull_pages; eat -= size; } /* If we need update frag list, we are in troubles. * Certainly, it is possible to add an offset to skb data, * but taking into account that pulling is expected to * be very rare operation, it is worth to fight against * further bloating skb head and crucify ourselves here instead. * Pure masohism, indeed. 8)8) */ if (eat) { struct sk_buff *list = skb_shinfo(skb)->frag_list; struct sk_buff *clone = NULL; struct sk_buff *insp = NULL; do { if (list->len <= eat) { /* Eaten as whole. */ eat -= list->len; list = list->next; insp = list; } else { /* Eaten partially. */ if (skb_is_gso(skb) && !list->head_frag && skb_headlen(list)) skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; if (skb_shared(list)) { /* Sucks! We need to fork list. :-( */ clone = skb_clone(list, GFP_ATOMIC); if (!clone) return NULL; insp = list->next; list = clone; } else { /* This may be pulled without * problems. */ insp = list; } if (!pskb_pull(list, eat)) { kfree_skb(clone); return NULL; } break; } } while (eat); /* Free pulled out fragments. */ while ((list = skb_shinfo(skb)->frag_list) != insp) { skb_shinfo(skb)->frag_list = list->next; consume_skb(list); } /* And insert new clone at head. */ if (clone) { clone->next = list; skb_shinfo(skb)->frag_list = clone; } } /* Success! Now we may commit changes to skb data. */ pull_pages: eat = delta; k = 0; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); if (size <= eat) { skb_frag_unref(skb, i); eat -= size; } else { skb_frag_t *frag = &skb_shinfo(skb)->frags[k]; *frag = skb_shinfo(skb)->frags[i]; if (eat) { skb_frag_off_add(frag, eat); skb_frag_size_sub(frag, eat); if (!i) goto end; eat = 0; } k++; } } skb_shinfo(skb)->nr_frags = k; end: skb->tail += delta; skb->data_len -= delta; if (!skb->data_len) skb_zcopy_clear(skb, false); return skb_tail_pointer(skb); } EXPORT_SYMBOL(__pskb_pull_tail); /** * skb_copy_bits - copy bits from skb to kernel buffer * @skb: source skb * @offset: offset in source * @to: destination buffer * @len: number of bytes to copy * * Copy the specified number of bytes from the source skb to the * destination buffer. * * CAUTION ! : * If its prototype is ever changed, * check arch/{*}/net/{*}.S files, * since it is called from BPF assembly code. */ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) { int start = skb_headlen(skb); struct sk_buff *frag_iter; int i, copy; if (offset > (int)skb->len - len) goto fault; /* Copy header. 
*/ if ((copy = start - offset) > 0) { if (copy > len) copy = len; skb_copy_from_linear_data_offset(skb, offset, to, copy); if ((len -= copy) == 0) return 0; offset += copy; to += copy; } if (!skb_frags_readable(skb)) goto fault; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; skb_frag_t *f = &skb_shinfo(skb)->frags[i]; WARN_ON(start > offset + len); end = start + skb_frag_size(f); if ((copy = end - offset) > 0) { u32 p_off, p_len, copied; struct page *p; u8 *vaddr; if (copy > len) copy = len; skb_frag_foreach_page(f, skb_frag_off(f) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); memcpy(to + copied, vaddr + p_off, p_len); kunmap_atomic(vaddr); } if ((len -= copy) == 0) return 0; offset += copy; to += copy; } start = end; } skb_walk_frags(skb, frag_iter) { int end; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { if (copy > len) copy = len; if (skb_copy_bits(frag_iter, offset - start, to, copy)) goto fault; if ((len -= copy) == 0) return 0; offset += copy; to += copy; } start = end; } if (!len) return 0; fault: return -EFAULT; } EXPORT_SYMBOL(skb_copy_bits); /* * Callback from splice_to_pipe(), if we need to release some pages * at the end of the spd in case we error'ed out in filling the pipe. */ static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) { put_page(spd->pages[i]); } static struct page *linear_to_page(struct page *page, unsigned int *len, unsigned int *offset, struct sock *sk) { struct page_frag *pfrag = sk_page_frag(sk); if (!sk_page_frag_refill(sk, pfrag)) return NULL; *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset); memcpy(page_address(pfrag->page) + pfrag->offset, page_address(page) + *offset, *len); *offset = pfrag->offset; pfrag->offset += *len; return pfrag->page; } static bool spd_can_coalesce(const struct splice_pipe_desc *spd, struct page *page, unsigned int offset) { return spd->nr_pages && spd->pages[spd->nr_pages - 1] == page && (spd->partial[spd->nr_pages - 1].offset + spd->partial[spd->nr_pages - 1].len == offset); } /* * Fill page/offset/length into spd, if it can hold more pages. */ static bool spd_fill_page(struct splice_pipe_desc *spd, struct page *page, unsigned int *len, unsigned int offset, bool linear, struct sock *sk) { if (unlikely(spd->nr_pages == MAX_SKB_FRAGS)) return true; if (linear) { page = linear_to_page(page, len, &offset, sk); if (!page) return true; } if (spd_can_coalesce(spd, page, offset)) { spd->partial[spd->nr_pages - 1].len += *len; return false; } get_page(page); spd->pages[spd->nr_pages] = page; spd->partial[spd->nr_pages].len = *len; spd->partial[spd->nr_pages].offset = offset; spd->nr_pages++; return false; } static bool __splice_segment(struct page *page, unsigned int poff, unsigned int plen, unsigned int *off, unsigned int *len, struct splice_pipe_desc *spd, bool linear, struct sock *sk) { if (!*len) return true; /* skip this segment if already processed */ if (*off >= plen) { *off -= plen; return false; } /* ignore any bits we already processed */ poff += *off; plen -= *off; *off = 0; do { unsigned int flen = min(*len, plen); if (spd_fill_page(spd, page, &flen, poff, linear, sk)) return true; poff += flen; plen -= flen; *len -= flen; if (!*len) return true; } while (plen); return false; } /* * Map linear and fragment data from the skb to spd. It reports true if the * pipe is full or if we already spliced the requested length. 
*/ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, unsigned int *offset, unsigned int *len, struct splice_pipe_desc *spd, struct sock *sk) { struct sk_buff *iter; int seg; /* map the linear part : * If skb->head_frag is set, this 'linear' part is backed by a * fragment, and if the head is not shared with any clones then * we can avoid a copy since we own the head portion of this page. */ if (__splice_segment(virt_to_page(skb->data), (unsigned long) skb->data & (PAGE_SIZE - 1), skb_headlen(skb), offset, len, spd, skb_head_is_locked(skb), sk)) return true; /* * then map the fragments */ if (!skb_frags_readable(skb)) return false; for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; if (WARN_ON_ONCE(!skb_frag_page(f))) return false; if (__splice_segment(skb_frag_page(f), skb_frag_off(f), skb_frag_size(f), offset, len, spd, false, sk)) return true; } skb_walk_frags(skb, iter) { if (*offset >= iter->len) { *offset -= iter->len; continue; } /* __skb_splice_bits() only fails if the output has no room * left, so no point in going over the frag_list for the error * case. */ if (__skb_splice_bits(iter, pipe, offset, len, spd, sk)) return true; } return false; } /* * Map data from the skb to a pipe. Should handle both the linear part, * the fragments, and the frag list. */ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int tlen, unsigned int flags) { struct partial_page partial[MAX_SKB_FRAGS]; struct page *pages[MAX_SKB_FRAGS]; struct splice_pipe_desc spd = { .pages = pages, .partial = partial, .nr_pages_max = MAX_SKB_FRAGS, .ops = &nosteal_pipe_buf_ops, .spd_release = sock_spd_release, }; int ret = 0; __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk); if (spd.nr_pages) ret = splice_to_pipe(pipe, &spd); return ret; } EXPORT_SYMBOL_GPL(skb_splice_bits); static int sendmsg_locked(struct sock *sk, struct msghdr *msg) { struct socket *sock = sk->sk_socket; size_t size = msg_data_left(msg); if (!sock) return -EINVAL; if (!sock->ops->sendmsg_locked) return sock_no_sendmsg_locked(sk, msg, size); return sock->ops->sendmsg_locked(sk, msg, size); } static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg) { struct socket *sock = sk->sk_socket; if (!sock) return -EINVAL; return sock_sendmsg(sock, msg); } typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg); static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len, sendmsg_func sendmsg, int flags) { int more_hint = sk_is_tcp(sk) ? MSG_MORE : 0; unsigned int orig_len = len; struct sk_buff *head = skb; unsigned short fragidx; int slen, ret; do_frag_list: /* Deal with head data */ while (offset < skb_headlen(skb) && len) { struct kvec kv; struct msghdr msg; slen = min_t(int, len, skb_headlen(skb) - offset); kv.iov_base = skb->data + offset; kv.iov_len = slen; memset(&msg, 0, sizeof(msg)); msg.msg_flags = MSG_DONTWAIT | flags; if (slen < len) msg.msg_flags |= more_hint; iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen); ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, sendmsg_unlocked, sk, &msg); if (ret <= 0) goto error; offset += ret; len -= ret; } /* All the data was skb head? 
*/ if (!len) goto out; /* Make offset relative to start of frags */ offset -= skb_headlen(skb); /* Find where we are in frag list */ for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; if (offset < skb_frag_size(frag)) break; offset -= skb_frag_size(frag); } for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; slen = min_t(size_t, len, skb_frag_size(frag) - offset); while (slen) { struct bio_vec bvec; struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT | flags, }; if (slen < len) msg.msg_flags |= more_hint; bvec_set_page(&bvec, skb_frag_page(frag), slen, skb_frag_off(frag) + offset); iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, slen); ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, sendmsg_unlocked, sk, &msg); if (ret <= 0) goto error; len -= ret; offset += ret; slen -= ret; } offset = 0; } if (len) { /* Process any frag lists */ if (skb == head) { if (skb_has_frag_list(skb)) { skb = skb_shinfo(skb)->frag_list; goto do_frag_list; } } else if (skb->next) { skb = skb->next; goto do_frag_list; } } out: return orig_len - len; error: return orig_len == len ? ret : orig_len - len; } /* Send skb data on a socket. Socket must be locked. */ int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, int len) { return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, 0); } EXPORT_SYMBOL_GPL(skb_send_sock_locked); int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb, int offset, int len, int flags) { return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, flags); } EXPORT_SYMBOL_GPL(skb_send_sock_locked_with_flags); /* Send skb data on a socket. Socket must be unlocked. */ int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) { return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, 0); } /** * skb_store_bits - store bits from kernel buffer to skb * @skb: destination buffer * @offset: offset in destination * @from: source buffer * @len: number of bytes to copy * * Copy the specified number of bytes from the source buffer to the * destination skb. This function handles all the messy bits of * traversing fragment lists and such. 
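 *
 * A minimal caller sketch (editor's illustration; the target range must
 * already exist, i.e. @offset + @len must not exceed skb->len):
 *
 *	static const u8 pattern[4] = { 0xde, 0xad, 0xbe, 0xef };
 *
 *	if (skb_store_bits(skb, 16, pattern, sizeof(pattern)))
 *		return -EFAULT;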
*/ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) { int start = skb_headlen(skb); struct sk_buff *frag_iter; int i, copy; if (offset > (int)skb->len - len) goto fault; if ((copy = start - offset) > 0) { if (copy > len) copy = len; skb_copy_to_linear_data_offset(skb, offset, from, copy); if ((len -= copy) == 0) return 0; offset += copy; from += copy; } if (!skb_frags_readable(skb)) goto fault; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; int end; WARN_ON(start > offset + len); end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { u32 p_off, p_len, copied; struct page *p; u8 *vaddr; if (copy > len) copy = len; skb_frag_foreach_page(frag, skb_frag_off(frag) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); memcpy(vaddr + p_off, from + copied, p_len); kunmap_atomic(vaddr); } if ((len -= copy) == 0) return 0; offset += copy; from += copy; } start = end; } skb_walk_frags(skb, frag_iter) { int end; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { if (copy > len) copy = len; if (skb_store_bits(frag_iter, offset - start, from, copy)) goto fault; if ((len -= copy) == 0) return 0; offset += copy; from += copy; } start = end; } if (!len) return 0; fault: return -EFAULT; } EXPORT_SYMBOL(skb_store_bits); /* Checksum skb data. */ __wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum) { int start = skb_headlen(skb); int i, copy = start - offset; struct sk_buff *frag_iter; int pos = 0; /* Checksum header. */ if (copy > 0) { if (copy > len) copy = len; csum = csum_partial(skb->data + offset, copy, csum); if ((len -= copy) == 0) return csum; offset += copy; pos = copy; } if (WARN_ON_ONCE(!skb_frags_readable(skb))) return 0; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; WARN_ON(start > offset + len); end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { u32 p_off, p_len, copied; struct page *p; __wsum csum2; u8 *vaddr; if (copy > len) copy = len; skb_frag_foreach_page(frag, skb_frag_off(frag) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); csum2 = csum_partial(vaddr + p_off, p_len, 0); kunmap_atomic(vaddr); csum = csum_block_add(csum, csum2, pos); pos += p_len; } if (!(len -= copy)) return csum; offset += copy; } start = end; } skb_walk_frags(skb, frag_iter) { int end; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { __wsum csum2; if (copy > len) copy = len; csum2 = skb_checksum(frag_iter, offset - start, copy, 0); csum = csum_block_add(csum, csum2, pos); if ((len -= copy) == 0) return csum; offset += copy; pos += copy; } start = end; } BUG_ON(len); return csum; } EXPORT_SYMBOL(skb_checksum); /* Both of above in one bottle. */ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to, int len) { int start = skb_headlen(skb); int i, copy = start - offset; struct sk_buff *frag_iter; int pos = 0; __wsum csum = 0; /* Copy header. 
*/ if (copy > 0) { if (copy > len) copy = len; csum = csum_partial_copy_nocheck(skb->data + offset, to, copy); if ((len -= copy) == 0) return csum; offset += copy; to += copy; pos = copy; } if (!skb_frags_readable(skb)) return 0; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; WARN_ON(start > offset + len); end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); if ((copy = end - offset) > 0) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; u32 p_off, p_len, copied; struct page *p; __wsum csum2; u8 *vaddr; if (copy > len) copy = len; skb_frag_foreach_page(frag, skb_frag_off(frag) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); csum2 = csum_partial_copy_nocheck(vaddr + p_off, to + copied, p_len); kunmap_atomic(vaddr); csum = csum_block_add(csum, csum2, pos); pos += p_len; } if (!(len -= copy)) return csum; offset += copy; to += copy; } start = end; } skb_walk_frags(skb, frag_iter) { __wsum csum2; int end; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { if (copy > len) copy = len; csum2 = skb_copy_and_csum_bits(frag_iter, offset - start, to, copy); csum = csum_block_add(csum, csum2, pos); if ((len -= copy) == 0) return csum; offset += copy; to += copy; pos += copy; } start = end; } BUG_ON(len); return csum; } EXPORT_SYMBOL(skb_copy_and_csum_bits); #ifdef CONFIG_NET_CRC32C u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc) { int start = skb_headlen(skb); int i, copy = start - offset; struct sk_buff *frag_iter; if (copy > 0) { copy = min(copy, len); crc = crc32c(crc, skb->data + offset, copy); len -= copy; if (len == 0) return crc; offset += copy; } if (WARN_ON_ONCE(!skb_frags_readable(skb))) return 0; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; WARN_ON(start > offset + len); end = start + skb_frag_size(frag); copy = end - offset; if (copy > 0) { u32 p_off, p_len, copied; struct page *p; u8 *vaddr; copy = min(copy, len); skb_frag_foreach_page(frag, skb_frag_off(frag) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); crc = crc32c(crc, vaddr + p_off, p_len); kunmap_atomic(vaddr); } len -= copy; if (len == 0) return crc; offset += copy; } start = end; } skb_walk_frags(skb, frag_iter) { int end; WARN_ON(start > offset + len); end = start + frag_iter->len; copy = end - offset; if (copy > 0) { copy = min(copy, len); crc = skb_crc32c(frag_iter, offset - start, copy, crc); len -= copy; if (len == 0) return crc; offset += copy; } start = end; } BUG_ON(len); return crc; } EXPORT_SYMBOL(skb_crc32c); #endif /* CONFIG_NET_CRC32C */ __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) { __sum16 sum; sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); /* See comments in __skb_checksum_complete(). */ if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) netdev_rx_csum_fault(skb->dev, skb); } if (!skb_shared(skb)) skb->csum_valid = !sum; return sum; } EXPORT_SYMBOL(__skb_checksum_complete_head); /* This function assumes skb->csum already holds pseudo header's checksum, * which has been changed from the hardware checksum, for example, by * __skb_checksum_validate_complete(). And, the original skb->csum must * have been validated unsuccessfully for CHECKSUM_COMPLETE case. * * It returns non-zero if the recomputed checksum is still invalid, otherwise * zero. The new checksum is stored back into skb->csum unless the skb is * shared. 
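 *
 * Receive-path sketch (editor's illustration; most callers go through the
 * skb_checksum_complete() wrapper rather than calling this directly):
 *
 *	if (skb_checksum_complete(skb)) {
 *		kfree_skb(skb);
 *		return 0;
 *	}
 *
 * A zero return means the full checksum verified and processing can continue.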
*/ __sum16 __skb_checksum_complete(struct sk_buff *skb) { __wsum csum; __sum16 sum; csum = skb_checksum(skb, 0, skb->len, 0); sum = csum_fold(csum_add(skb->csum, csum)); /* This check is inverted, because we already knew the hardware * checksum is invalid before calling this function. So, if the * re-computed checksum is valid instead, then we have a mismatch * between the original skb->csum and skb_checksum(). This means either * the original hardware checksum is incorrect or we screw up skb->csum * when moving skb->data around. */ if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) netdev_rx_csum_fault(skb->dev, skb); } if (!skb_shared(skb)) { /* Save full packet checksum */ skb->csum = csum; skb->ip_summed = CHECKSUM_COMPLETE; skb->csum_complete_sw = 1; skb->csum_valid = !sum; } return sum; } EXPORT_SYMBOL(__skb_checksum_complete); /** * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() * @from: source buffer * * Calculates the amount of linear headroom needed in the 'to' skb passed * into skb_zerocopy(). */ unsigned int skb_zerocopy_headlen(const struct sk_buff *from) { unsigned int hlen = 0; if (!from->head_frag || skb_headlen(from) < L1_CACHE_BYTES || skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) { hlen = skb_headlen(from); if (!hlen) hlen = from->len; } if (skb_has_frag_list(from)) hlen = from->len; return hlen; } EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); /** * skb_zerocopy - Zero copy skb to skb * @to: destination buffer * @from: source buffer * @len: number of bytes to copy from source buffer * @hlen: size of linear headroom in destination buffer * * Copies up to `len` bytes from `from` to `to` by creating references * to the frags in the source buffer. * * The `hlen` as calculated by skb_zerocopy_headlen() specifies the * headroom in the `to` buffer. 
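 *
 * Usual pairing with skb_zerocopy_headlen() (editor's sketch; error handling
 * trimmed, and the destination must provide @hlen bytes of tailroom):
 *
 *	unsigned int hlen = skb_zerocopy_headlen(from);
 *	struct sk_buff *to = alloc_skb(hlen, GFP_ATOMIC);
 *
 *	if (to && skb_zerocopy(to, from, from->len, hlen) < 0) {
 *		kfree_skb(to);
 *		to = NULL;
 *	}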
* * Return value: * 0: everything is OK * -ENOMEM: couldn't orphan frags of @from due to lack of memory * -EFAULT: skb_copy_bits() found some problem with skb geometry */ int skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) { int i, j = 0; int plen = 0; /* length of skb->head fragment */ int ret; struct page *page; unsigned int offset; BUG_ON(!from->head_frag && !hlen); /* dont bother with small payloads */ if (len <= skb_tailroom(to)) return skb_copy_bits(from, 0, skb_put(to, len), len); if (hlen) { ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen); if (unlikely(ret)) return ret; len -= hlen; } else { plen = min_t(int, skb_headlen(from), len); if (plen) { page = virt_to_head_page(from->head); offset = from->data - (unsigned char *)page_address(page); __skb_fill_netmem_desc(to, 0, page_to_netmem(page), offset, plen); get_page(page); j = 1; len -= plen; } } skb_len_add(to, len + plen); if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { skb_tx_error(from); return -ENOMEM; } skb_zerocopy_clone(to, from, GFP_ATOMIC); for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { int size; if (!len) break; skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]), len); skb_frag_size_set(&skb_shinfo(to)->frags[j], size); len -= size; skb_frag_ref(to, j); j++; } skb_shinfo(to)->nr_frags = j; return 0; } EXPORT_SYMBOL_GPL(skb_zerocopy); void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) { __wsum csum; long csstart; if (skb->ip_summed == CHECKSUM_PARTIAL) csstart = skb_checksum_start_offset(skb); else csstart = skb_headlen(skb); BUG_ON(csstart > skb_headlen(skb)); skb_copy_from_linear_data(skb, to, csstart); csum = 0; if (csstart != skb->len) csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, skb->len - csstart); if (skb->ip_summed == CHECKSUM_PARTIAL) { long csstuff = csstart + skb->csum_offset; *((__sum16 *)(to + csstuff)) = csum_fold(csum); } } EXPORT_SYMBOL(skb_copy_and_csum_dev); /** * skb_dequeue - remove from the head of the queue * @list: list to dequeue from * * Remove the head of the list. The list lock is taken so the function * may be used safely with other locking list functions. The head item is * returned or %NULL if the list is empty. */ struct sk_buff *skb_dequeue(struct sk_buff_head *list) { unsigned long flags; struct sk_buff *result; spin_lock_irqsave(&list->lock, flags); result = __skb_dequeue(list); spin_unlock_irqrestore(&list->lock, flags); return result; } EXPORT_SYMBOL(skb_dequeue); /** * skb_dequeue_tail - remove from the tail of the queue * @list: list to dequeue from * * Remove the tail of the list. The list lock is taken so the function * may be used safely with other locking list functions. The tail item is * returned or %NULL if the list is empty. */ struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) { unsigned long flags; struct sk_buff *result; spin_lock_irqsave(&list->lock, flags); result = __skb_dequeue_tail(list); spin_unlock_irqrestore(&list->lock, flags); return result; } EXPORT_SYMBOL(skb_dequeue_tail); /** * skb_queue_purge_reason - empty a list * @list: list to empty * @reason: drop reason * * Delete all buffers on an &sk_buff list. Each buffer is removed from * the list and one reference dropped. This function takes the list * lock and is atomic with respect to other list locking functions. 
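 *
 * Minimal usage sketch (editor's illustration; SKB_DROP_REASON_QUEUE_PURGE is
 * the generic reason that plain skb_queue_purge() passes here):
 *
 *	struct sk_buff_head q;
 *
 *	skb_queue_head_init(&q);
 *	skb_queue_tail(&q, skb);
 *	skb_queue_purge_reason(&q, SKB_DROP_REASON_QUEUE_PURGE);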
*/ void skb_queue_purge_reason(struct sk_buff_head *list, enum skb_drop_reason reason) { struct sk_buff_head tmp; unsigned long flags; if (skb_queue_empty_lockless(list)) return; __skb_queue_head_init(&tmp); spin_lock_irqsave(&list->lock, flags); skb_queue_splice_init(list, &tmp); spin_unlock_irqrestore(&list->lock, flags); __skb_queue_purge_reason(&tmp, reason); } EXPORT_SYMBOL(skb_queue_purge_reason); /** * skb_rbtree_purge - empty a skb rbtree * @root: root of the rbtree to empty * Return value: the sum of truesizes of all purged skbs. * * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from * the list and one reference dropped. This function does not take * any lock. Synchronization should be handled by the caller (e.g., TCP * out-of-order queue is protected by the socket lock). */ unsigned int skb_rbtree_purge(struct rb_root *root) { struct rb_node *p = rb_first(root); unsigned int sum = 0; while (p) { struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); p = rb_next(p); rb_erase(&skb->rbnode, root); sum += skb->truesize; kfree_skb(skb); } return sum; } void skb_errqueue_purge(struct sk_buff_head *list) { struct sk_buff *skb, *next; struct sk_buff_head kill; unsigned long flags; __skb_queue_head_init(&kill); spin_lock_irqsave(&list->lock, flags); skb_queue_walk_safe(list, skb, next) { if (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY || SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) continue; __skb_unlink(skb, list); __skb_queue_tail(&kill, skb); } spin_unlock_irqrestore(&list->lock, flags); __skb_queue_purge(&kill); } EXPORT_SYMBOL(skb_errqueue_purge); /** * skb_queue_head - queue a buffer at the list head * @list: list to use * @newsk: buffer to queue * * Queue a buffer at the start of the list. This function takes the * list lock and can be used safely with other locking &sk_buff functions * safely. * * A buffer cannot be placed on two lists at the same time. */ void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) { unsigned long flags; spin_lock_irqsave(&list->lock, flags); __skb_queue_head(list, newsk); spin_unlock_irqrestore(&list->lock, flags); } EXPORT_SYMBOL(skb_queue_head); /** * skb_queue_tail - queue a buffer at the list tail * @list: list to use * @newsk: buffer to queue * * Queue a buffer at the tail of the list. This function takes the * list lock and can be used safely with other locking &sk_buff functions * safely. * * A buffer cannot be placed on two lists at the same time. */ void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) { unsigned long flags; spin_lock_irqsave(&list->lock, flags); __skb_queue_tail(list, newsk); spin_unlock_irqrestore(&list->lock, flags); } EXPORT_SYMBOL(skb_queue_tail); /** * skb_unlink - remove a buffer from a list * @skb: buffer to remove * @list: list to use * * Remove a packet from a list. The list locks are taken and this * function is atomic with respect to other list locked calls * * You must know what list the SKB is on. */ void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) { unsigned long flags; spin_lock_irqsave(&list->lock, flags); __skb_unlink(skb, list); spin_unlock_irqrestore(&list->lock, flags); } EXPORT_SYMBOL(skb_unlink); /** * skb_append - append a buffer * @old: buffer to insert after * @newsk: buffer to insert * @list: list to use * * Place a packet after a given packet in a list. The list locks are taken * and this function is atomic with respect to other list locked calls. 
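 *
 * The locked queue helpers above all share one pattern; a producer/consumer
 * sketch (editor's illustration; process_one() is hypothetical and @q is
 * assumed to have been set up with skb_queue_head_init()):
 *
 *	struct sk_buff *skb;
 *
 *	skb_queue_tail(&q, newsk);
 *
 *	while ((skb = skb_dequeue(&q)) != NULL)
 *		process_one(skb);
 *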
* A buffer cannot be placed on two lists at the same time. */ void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) { unsigned long flags; spin_lock_irqsave(&list->lock, flags); __skb_queue_after(list, old, newsk); spin_unlock_irqrestore(&list->lock, flags); } EXPORT_SYMBOL(skb_append); static inline void skb_split_inside_header(struct sk_buff *skb, struct sk_buff* skb1, const u32 len, const int pos) { int i; skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len), pos - len); /* And move data appendix as is. */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; skb1->unreadable = skb->unreadable; skb_shinfo(skb)->nr_frags = 0; skb1->data_len = skb->data_len; skb1->len += skb1->data_len; skb->data_len = 0; skb->len = len; skb_set_tail_pointer(skb, len); } static inline void skb_split_no_header(struct sk_buff *skb, struct sk_buff* skb1, const u32 len, int pos) { int i, k = 0; const int nfrags = skb_shinfo(skb)->nr_frags; skb_shinfo(skb)->nr_frags = 0; skb1->len = skb1->data_len = skb->len - len; skb->len = len; skb->data_len = len - pos; for (i = 0; i < nfrags; i++) { int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); if (pos + size > len) { skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; if (pos < len) { /* Split frag. * We have two variants in this case: * 1. Move all the frag to the second * part, if it is possible. F.e. * this approach is mandatory for TUX, * where splitting is expensive. * 2. Split is accurately. We make this. */ skb_frag_ref(skb, i); skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos); skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos); skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos); skb_shinfo(skb)->nr_frags++; } k++; } else skb_shinfo(skb)->nr_frags++; pos += size; } skb_shinfo(skb1)->nr_frags = k; skb1->unreadable = skb->unreadable; } /** * skb_split - Split fragmented skb to two parts at length len. * @skb: the buffer to split * @skb1: the buffer to receive the second part * @len: new length for skb */ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) { int pos = skb_headlen(skb); const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY; skb_zcopy_downgrade_managed(skb); skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags; skb_zerocopy_clone(skb1, skb, 0); if (len < pos) /* Split line is inside header. */ skb_split_inside_header(skb, skb1, len, pos); else /* Second chunk has no header, nothing to copy. */ skb_split_no_header(skb, skb1, len, pos); } EXPORT_SYMBOL(skb_split); /* Shifting from/to a cloned skb is a no-go. * * Caller cannot keep skb_shinfo related pointers past calling here! */ static int skb_prepare_for_shift(struct sk_buff *skb) { return skb_unclone_keeptruesize(skb, GFP_ATOMIC); } /** * skb_shift - Shifts paged data partially from skb to another * @tgt: buffer into which tail data gets added * @skb: buffer from which the paged data comes from * @shiftlen: shift up to this many bytes * * Attempts to shift up to shiftlen worth of bytes, which may be less than * the length of the skb, from skb to tgt. Returns number bytes shifted. * It's up to caller to free skb if everything was shifted. * * If @tgt runs out of frags, the whole operation is aborted. * * Skb cannot include anything else but paged data while tgt is allowed * to have non-paged data as well. 
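 *
 * Caller sketch (editor's illustration; assumes @skb has already been
 * unlinked from any queue, as in TCP-style collapsing):
 *
 *	int shifted = skb_shift(tgt, skb, skb->len);
 *
 *	if (shifted == skb->len)
 *		consume_skb(skb);
 *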
* * TODO: full sized shift could be optimized but that would need * specialized skb free'er to handle frags without up-to-date nr_frags. */ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) { int from, to, merge, todo; skb_frag_t *fragfrom, *fragto; BUG_ON(shiftlen > skb->len); if (skb_headlen(skb)) return 0; if (skb_zcopy(tgt) || skb_zcopy(skb)) return 0; DEBUG_NET_WARN_ON_ONCE(tgt->pp_recycle != skb->pp_recycle); DEBUG_NET_WARN_ON_ONCE(skb_cmp_decrypted(tgt, skb)); todo = shiftlen; from = 0; to = skb_shinfo(tgt)->nr_frags; fragfrom = &skb_shinfo(skb)->frags[from]; /* Actual merge is delayed until the point when we know we can * commit all, so that we don't have to undo partial changes */ if (!skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), skb_frag_off(fragfrom))) { merge = -1; } else { merge = to - 1; todo -= skb_frag_size(fragfrom); if (todo < 0) { if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) return 0; /* All previous frag pointers might be stale! */ fragfrom = &skb_shinfo(skb)->frags[from]; fragto = &skb_shinfo(tgt)->frags[merge]; skb_frag_size_add(fragto, shiftlen); skb_frag_size_sub(fragfrom, shiftlen); skb_frag_off_add(fragfrom, shiftlen); goto onlymerged; } from++; } /* Skip full, not-fitting skb to avoid expensive operations */ if ((shiftlen == skb->len) && (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) return 0; if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) return 0; while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { if (to == MAX_SKB_FRAGS) return 0; fragfrom = &skb_shinfo(skb)->frags[from]; fragto = &skb_shinfo(tgt)->frags[to]; if (todo >= skb_frag_size(fragfrom)) { *fragto = *fragfrom; todo -= skb_frag_size(fragfrom); from++; to++; } else { __skb_frag_ref(fragfrom); skb_frag_page_copy(fragto, fragfrom); skb_frag_off_copy(fragto, fragfrom); skb_frag_size_set(fragto, todo); skb_frag_off_add(fragfrom, todo); skb_frag_size_sub(fragfrom, todo); todo = 0; to++; break; } } /* Ready to "commit" this state change to tgt */ skb_shinfo(tgt)->nr_frags = to; if (merge >= 0) { fragfrom = &skb_shinfo(skb)->frags[0]; fragto = &skb_shinfo(tgt)->frags[merge]; skb_frag_size_add(fragto, skb_frag_size(fragfrom)); __skb_frag_unref(fragfrom, skb->pp_recycle); } /* Reposition in the original skb */ to = 0; while (from < skb_shinfo(skb)->nr_frags) skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; skb_shinfo(skb)->nr_frags = to; BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); onlymerged: /* Most likely the tgt won't ever need its checksum anymore, skb on * the other hand might need it if it needs to be resent */ tgt->ip_summed = CHECKSUM_PARTIAL; skb->ip_summed = CHECKSUM_PARTIAL; skb_len_add(skb, -shiftlen); skb_len_add(tgt, shiftlen); return shiftlen; } /** * skb_prepare_seq_read - Prepare a sequential read of skb data * @skb: the buffer to read * @from: lower offset of data to be read * @to: upper offset of data to be read * @st: state variable * * Initializes the specified state variable. Must be called before * invoking skb_seq_read() for the first time. 
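 *
 * Typical read loop (editor's sketch; consume() is a hypothetical handler).
 * If the loop is left before skb_seq_read() returns 0, skb_abort_seq_read()
 * must be called:
 *
 *	struct skb_seq_state st;
 *	unsigned int consumed = 0, len;
 *	const u8 *data;
 *
 *	skb_prepare_seq_read(skb, 0, skb->len, &st);
 *	while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
 *		consume(data, len);
 *		consumed += len;
 *	}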
*/ void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, unsigned int to, struct skb_seq_state *st) { st->lower_offset = from; st->upper_offset = to; st->root_skb = st->cur_skb = skb; st->frag_idx = st->stepped_offset = 0; st->frag_data = NULL; st->frag_off = 0; } EXPORT_SYMBOL(skb_prepare_seq_read); /** * skb_seq_read - Sequentially read skb data * @consumed: number of bytes consumed by the caller so far * @data: destination pointer for data to be returned * @st: state variable * * Reads a block of skb data at @consumed relative to the * lower offset specified to skb_prepare_seq_read(). Assigns * the head of the data block to @data and returns the length * of the block or 0 if the end of the skb data or the upper * offset has been reached. * * The caller is not required to consume all of the data * returned, i.e. @consumed is typically set to the number * of bytes already consumed and the next call to * skb_seq_read() will return the remaining part of the block. * * Note 1: The size of each block of data returned can be arbitrary, * this limitation is the cost for zerocopy sequential * reads of potentially non linear data. * * Note 2: Fragment lists within fragments are not implemented * at the moment, state->root_skb could be replaced with * a stack for this purpose. */ unsigned int skb_seq_read(unsigned int consumed, const u8 **data, struct skb_seq_state *st) { unsigned int block_limit, abs_offset = consumed + st->lower_offset; skb_frag_t *frag; if (unlikely(abs_offset >= st->upper_offset)) { if (st->frag_data) { kunmap_atomic(st->frag_data); st->frag_data = NULL; } return 0; } next_skb: block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; if (abs_offset < block_limit && !st->frag_data) { *data = st->cur_skb->data + (abs_offset - st->stepped_offset); return block_limit - abs_offset; } if (!skb_frags_readable(st->cur_skb)) return 0; if (st->frag_idx == 0 && !st->frag_data) st->stepped_offset += skb_headlen(st->cur_skb); while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { unsigned int pg_idx, pg_off, pg_sz; frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; pg_idx = 0; pg_off = skb_frag_off(frag); pg_sz = skb_frag_size(frag); if (skb_frag_must_loop(skb_frag_page(frag))) { pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT; pg_off = offset_in_page(pg_off + st->frag_off); pg_sz = min_t(unsigned int, pg_sz - st->frag_off, PAGE_SIZE - pg_off); } block_limit = pg_sz + st->stepped_offset; if (abs_offset < block_limit) { if (!st->frag_data) st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx); *data = (u8 *)st->frag_data + pg_off + (abs_offset - st->stepped_offset); return block_limit - abs_offset; } if (st->frag_data) { kunmap_atomic(st->frag_data); st->frag_data = NULL; } st->stepped_offset += pg_sz; st->frag_off += pg_sz; if (st->frag_off == skb_frag_size(frag)) { st->frag_off = 0; st->frag_idx++; } } if (st->frag_data) { kunmap_atomic(st->frag_data); st->frag_data = NULL; } if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) { st->cur_skb = skb_shinfo(st->root_skb)->frag_list; st->frag_idx = 0; goto next_skb; } else if (st->cur_skb->next) { st->cur_skb = st->cur_skb->next; st->frag_idx = 0; goto next_skb; } return 0; } EXPORT_SYMBOL(skb_seq_read); /** * skb_abort_seq_read - Abort a sequential read of skb data * @st: state variable * * Must be called if skb_seq_read() was not called until it * returned 0. 
*/ void skb_abort_seq_read(struct skb_seq_state *st) { if (st->frag_data) kunmap_atomic(st->frag_data); } EXPORT_SYMBOL(skb_abort_seq_read); /** * skb_copy_seq_read() - copy from a skb_seq_state to a buffer * @st: source skb_seq_state * @offset: offset in source * @to: destination buffer * @len: number of bytes to copy * * Copy @len bytes from @offset bytes into the source @st to the destination * buffer @to. `offset` should increase (or be unchanged) with each subsequent * call to this function. If offset needs to decrease from the previous use `st` * should be reset first. * * Return: 0 on success or -EINVAL if the copy ended early */ int skb_copy_seq_read(struct skb_seq_state *st, int offset, void *to, int len) { const u8 *data; u32 sqlen; for (;;) { sqlen = skb_seq_read(offset, &data, st); if (sqlen == 0) return -EINVAL; if (sqlen >= len) { memcpy(to, data, len); return 0; } memcpy(to, data, sqlen); to += sqlen; offset += sqlen; len -= sqlen; } } EXPORT_SYMBOL(skb_copy_seq_read); #define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, struct ts_config *conf, struct ts_state *state) { return skb_seq_read(offset, text, TS_SKB_CB(state)); } static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) { skb_abort_seq_read(TS_SKB_CB(state)); } /** * skb_find_text - Find a text pattern in skb data * @skb: the buffer to look in * @from: search offset * @to: search limit * @config: textsearch configuration * * Finds a pattern in the skb data according to the specified * textsearch configuration. Use textsearch_next() to retrieve * subsequent occurrences of the pattern. Returns the offset * to the first occurrence or UINT_MAX if no match was found. */ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, unsigned int to, struct ts_config *config) { unsigned int patlen = config->ops->get_pattern_len(config); struct ts_state state; unsigned int ret; BUILD_BUG_ON(sizeof(struct skb_seq_state) > sizeof(state.cb)); config->get_next_block = skb_ts_get_next_block; config->finish = skb_ts_finish; skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state)); ret = textsearch_find(config, &state); return (ret + patlen <= to - from ? ret : UINT_MAX); } EXPORT_SYMBOL(skb_find_text); int skb_append_pagefrags(struct sk_buff *skb, struct page *page, int offset, size_t size, size_t max_frags) { int i = skb_shinfo(skb)->nr_frags; if (skb_can_coalesce(skb, i, page, offset)) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); } else if (i < max_frags) { skb_zcopy_downgrade_managed(skb); get_page(page); skb_fill_page_desc_noacc(skb, i, page, offset, size); } else { return -EMSGSIZE; } return 0; } EXPORT_SYMBOL_GPL(skb_append_pagefrags); /** * skb_pull_rcsum - pull skb and update receive checksum * @skb: buffer to update * @len: length of data pulled * * This function performs an skb_pull on the packet and updates * the CHECKSUM_COMPLETE checksum. It should be used on * receive path processing instead of skb_pull unless you know * that the checksum difference is zero (e.g., a valid IP header) * or you are setting ip_summed to CHECKSUM_NONE. 
*/ void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) { unsigned char *data = skb->data; BUG_ON(len > skb->len); __skb_pull(skb, len); skb_postpull_rcsum(skb, data, len); return skb->data; } EXPORT_SYMBOL_GPL(skb_pull_rcsum); static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) { skb_frag_t head_frag; struct page *page; page = virt_to_head_page(frag_skb->head); skb_frag_fill_page_desc(&head_frag, page, frag_skb->data - (unsigned char *)page_address(page), skb_headlen(frag_skb)); return head_frag; } struct sk_buff *skb_segment_list(struct sk_buff *skb, netdev_features_t features, unsigned int offset) { struct sk_buff *list_skb = skb_shinfo(skb)->frag_list; unsigned int tnl_hlen = skb_tnl_header_len(skb); unsigned int delta_len = 0; struct sk_buff *tail = NULL; struct sk_buff *nskb, *tmp; int len_diff, err; /* Only skb_gro_receive_list generated skbs arrive here */ DEBUG_NET_WARN_ON_ONCE(!(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)); skb_push(skb, -skb_network_offset(skb) + offset); /* Ensure the head is writeable before touching the shared info */ err = skb_unclone(skb, GFP_ATOMIC); if (err) goto err_linearize; skb_shinfo(skb)->frag_list = NULL; while (list_skb) { nskb = list_skb; list_skb = list_skb->next; DEBUG_NET_WARN_ON_ONCE(nskb->sk); err = 0; if (skb_shared(nskb)) { tmp = skb_clone(nskb, GFP_ATOMIC); if (tmp) { consume_skb(nskb); nskb = tmp; err = skb_unclone(nskb, GFP_ATOMIC); } else { err = -ENOMEM; } } if (!tail) skb->next = nskb; else tail->next = nskb; if (unlikely(err)) { nskb->next = list_skb; goto err_linearize; } tail = nskb; delta_len += nskb->len; skb_push(nskb, -skb_network_offset(nskb) + offset); skb_release_head_state(nskb); len_diff = skb_network_header_len(nskb) - skb_network_header_len(skb); __copy_skb_header(nskb, skb); skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); nskb->transport_header += len_diff; skb_copy_from_linear_data_offset(skb, -tnl_hlen, nskb->data - tnl_hlen, offset + tnl_hlen); if (skb_needs_linearize(nskb, features) && __skb_linearize(nskb)) goto err_linearize; } skb->data_len = skb->data_len - delta_len; skb->len = skb->len - delta_len; skb_gso_reset(skb); skb->prev = tail; if (skb_needs_linearize(skb, features) && __skb_linearize(skb)) goto err_linearize; skb_get(skb); return skb; err_linearize: kfree_skb_list(skb->next); skb->next = NULL; return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL_GPL(skb_segment_list); /** * skb_segment - Perform protocol segmentation on skb. * @head_skb: buffer to segment * @features: features for the output path (see dev->features) * * This function performs segmentation on the given skb. It returns * a pointer to the first in a list of new skbs for the segments. * In case of error it returns ERR_PTR(err). 
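 *
 * Caller sketch (editor's illustration, mirroring the usual GSO flow of
 * replacing one oversized skb with the returned list):
 *
 *	struct sk_buff *segs = skb_segment(skb, features);
 *
 *	if (IS_ERR(segs))
 *		return PTR_ERR(segs);
 *	consume_skb(skb);
 *
 * The segments are linked through ->next, and segs->prev points at the tail.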
*/ struct sk_buff *skb_segment(struct sk_buff *head_skb, netdev_features_t features) { struct sk_buff *segs = NULL; struct sk_buff *tail = NULL; struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; unsigned int mss = skb_shinfo(head_skb)->gso_size; unsigned int doffset = head_skb->data - skb_mac_header(head_skb); unsigned int offset = doffset; unsigned int tnl_hlen = skb_tnl_header_len(head_skb); unsigned int partial_segs = 0; unsigned int headroom; unsigned int len = head_skb->len; struct sk_buff *frag_skb; skb_frag_t *frag; __be16 proto; bool csum, sg; int err = -ENOMEM; int i = 0; int nfrags, pos; if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) && mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) { struct sk_buff *check_skb; for (check_skb = list_skb; check_skb; check_skb = check_skb->next) { if (skb_headlen(check_skb) && !check_skb->head_frag) { /* gso_size is untrusted, and we have a frag_list with * a linear non head_frag item. * * If head_skb's headlen does not fit requested gso_size, * it means that the frag_list members do NOT terminate * on exact gso_size boundaries. Hence we cannot perform * skb_frag_t page sharing. Therefore we must fallback to * copying the frag_list skbs; we do so by disabling SG. */ features &= ~NETIF_F_SG; break; } } } __skb_push(head_skb, doffset); proto = skb_network_protocol(head_skb, NULL); if (unlikely(!proto)) return ERR_PTR(-EINVAL); sg = !!(features & NETIF_F_SG); csum = !!can_checksum_protocol(features, proto); if (sg && csum && (mss != GSO_BY_FRAGS)) { if (!(features & NETIF_F_GSO_PARTIAL)) { struct sk_buff *iter; unsigned int frag_len; if (!list_skb || !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) goto normal; /* If we get here then all the required * GSO features except frag_list are supported. * Try to split the SKB to multiple GSO SKBs * with no frag_list. * Currently we can do that only when the buffers don't * have a linear part and all the buffers except * the last are of the same length. */ frag_len = list_skb->len; skb_walk_frags(head_skb, iter) { if (frag_len != iter->len && iter->next) goto normal; if (skb_headlen(iter) && !iter->head_frag) goto normal; len -= iter->len; } if (len != frag_len) goto normal; } /* GSO partial only requires that we trim off any excess that * doesn't fit into an MSS sized block, so take care of that * now. * Cap len to not accidentally hit GSO_BY_FRAGS. 
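 *
 * Worked example (editor's illustration): with len = 64000 and mss = 1400,
 * partial_segs below becomes 45 and mss is scaled up to 63000, so one large
 * "partial" segment carries 45 MSS worth of data and the remainder forms the
 * final, shorter segment. With len < 2 * mss, partial_segs stays 0 and GSO
 * partial is not used.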
*/ partial_segs = min(len, GSO_BY_FRAGS - 1) / mss; if (partial_segs > 1) mss *= partial_segs; else partial_segs = 0; } normal: headroom = skb_headroom(head_skb); pos = skb_headlen(head_skb); if (skb_orphan_frags(head_skb, GFP_ATOMIC)) return ERR_PTR(-ENOMEM); nfrags = skb_shinfo(head_skb)->nr_frags; frag = skb_shinfo(head_skb)->frags; frag_skb = head_skb; do { struct sk_buff *nskb; skb_frag_t *nskb_frag; int hsize; int size; if (unlikely(mss == GSO_BY_FRAGS)) { len = list_skb->len; } else { len = head_skb->len - offset; if (len > mss) len = mss; } hsize = skb_headlen(head_skb) - offset; if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) && (skb_headlen(list_skb) == len || sg)) { BUG_ON(skb_headlen(list_skb) > len); nskb = skb_clone(list_skb, GFP_ATOMIC); if (unlikely(!nskb)) goto err; i = 0; nfrags = skb_shinfo(list_skb)->nr_frags; frag = skb_shinfo(list_skb)->frags; frag_skb = list_skb; pos += skb_headlen(list_skb); while (pos < offset + len) { BUG_ON(i >= nfrags); size = skb_frag_size(frag); if (pos + size > offset + len) break; i++; pos += size; frag++; } list_skb = list_skb->next; if (unlikely(pskb_trim(nskb, len))) { kfree_skb(nskb); goto err; } hsize = skb_end_offset(nskb); if (skb_cow_head(nskb, doffset + headroom)) { kfree_skb(nskb); goto err; } nskb->truesize += skb_end_offset(nskb) - hsize; skb_release_head_state(nskb); __skb_push(nskb, doffset); } else { if (hsize < 0) hsize = 0; if (hsize > len || !sg) hsize = len; nskb = __alloc_skb(hsize + doffset + headroom, GFP_ATOMIC, skb_alloc_rx_flag(head_skb), NUMA_NO_NODE); if (unlikely(!nskb)) goto err; skb_reserve(nskb, headroom); __skb_put(nskb, doffset); } if (segs) tail->next = nskb; else segs = nskb; tail = nskb; __copy_skb_header(nskb, head_skb); skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); skb_reset_mac_len(nskb); skb_copy_from_linear_data_offset(head_skb, -tnl_hlen, nskb->data - tnl_hlen, doffset + tnl_hlen); if (nskb->len == len + doffset) goto perform_csum_check; if (!sg) { if (!csum) { if (!nskb->remcsum_offload) nskb->ip_summed = CHECKSUM_NONE; SKB_GSO_CB(nskb)->csum = skb_copy_and_csum_bits(head_skb, offset, skb_put(nskb, len), len); SKB_GSO_CB(nskb)->csum_start = skb_headroom(nskb) + doffset; } else { if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len)) goto err; } continue; } nskb_frag = skb_shinfo(nskb)->frags; skb_copy_from_linear_data_offset(head_skb, offset, skb_put(nskb, hsize), hsize); skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags & SKBFL_SHARED_FRAG; if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) goto err; while (pos < offset + len) { if (i >= nfrags) { if (skb_orphan_frags(list_skb, GFP_ATOMIC) || skb_zerocopy_clone(nskb, list_skb, GFP_ATOMIC)) goto err; i = 0; nfrags = skb_shinfo(list_skb)->nr_frags; frag = skb_shinfo(list_skb)->frags; frag_skb = list_skb; if (!skb_headlen(list_skb)) { BUG_ON(!nfrags); } else { BUG_ON(!list_skb->head_frag); /* to make room for head_frag. */ i--; frag--; } list_skb = list_skb->next; } if (unlikely(skb_shinfo(nskb)->nr_frags >= MAX_SKB_FRAGS)) { net_warn_ratelimited( "skb_segment: too many frags: %u %u\n", pos, mss); err = -EINVAL; goto err; } *nskb_frag = (i < 0) ? 
skb_head_frag_to_page_desc(frag_skb) : *frag; __skb_frag_ref(nskb_frag); size = skb_frag_size(nskb_frag); if (pos < offset) { skb_frag_off_add(nskb_frag, offset - pos); skb_frag_size_sub(nskb_frag, offset - pos); } skb_shinfo(nskb)->nr_frags++; if (pos + size <= offset + len) { i++; frag++; pos += size; } else { skb_frag_size_sub(nskb_frag, pos + size - (offset + len)); goto skip_fraglist; } nskb_frag++; } skip_fraglist: nskb->data_len = len - hsize; nskb->len += nskb->data_len; nskb->truesize += nskb->data_len; perform_csum_check: if (!csum) { if (skb_has_shared_frag(nskb) && __skb_linearize(nskb)) goto err; if (!nskb->remcsum_offload) nskb->ip_summed = CHECKSUM_NONE; SKB_GSO_CB(nskb)->csum = skb_checksum(nskb, doffset, nskb->len - doffset, 0); SKB_GSO_CB(nskb)->csum_start = skb_headroom(nskb) + doffset; } } while ((offset += len) < head_skb->len); /* Some callers want to get the end of the list. * Put it in segs->prev to avoid walking the list. * (see validate_xmit_skb_list() for example) */ segs->prev = tail; if (partial_segs) { struct sk_buff *iter; int type = skb_shinfo(head_skb)->gso_type; unsigned short gso_size = skb_shinfo(head_skb)->gso_size; /* Update type to add partial and then remove dodgy if set */ type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL; type &= ~SKB_GSO_DODGY; /* Update GSO info and prepare to start updating headers on * our way back down the stack of protocols. */ for (iter = segs; iter; iter = iter->next) { skb_shinfo(iter)->gso_size = gso_size; skb_shinfo(iter)->gso_segs = partial_segs; skb_shinfo(iter)->gso_type = type; SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset; } if (tail->len - doffset <= gso_size) skb_shinfo(tail)->gso_size = 0; else if (tail != segs) skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size); } /* The following permits correct backpressure for protocols * using skb_set_owner_w(). * The idea is to transfer ownership from head_skb to the last segment.
*/ if (head_skb->destructor == sock_wfree) { swap(tail->truesize, head_skb->truesize); swap(tail->destructor, head_skb->destructor); swap(tail->sk, head_skb->sk); } return segs; err: kfree_skb_list(segs); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(skb_segment); #ifdef CONFIG_SKB_EXTENSIONS #define SKB_EXT_ALIGN_VALUE 8 #define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE) static const u8 skb_ext_type_len[] = { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), #endif #ifdef CONFIG_XFRM [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path), #endif #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext), #endif #if IS_ENABLED(CONFIG_MPTCP) [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext), #endif #if IS_ENABLED(CONFIG_MCTP_FLOWS) [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), #endif #if IS_ENABLED(CONFIG_INET_PSP) [SKB_EXT_PSP] = SKB_EXT_CHUNKSIZEOF(struct psp_skb_ext), #endif #if IS_ENABLED(CONFIG_CAN) [SKB_EXT_CAN] = SKB_EXT_CHUNKSIZEOF(struct can_skb_ext), #endif }; static __always_inline __no_profile unsigned int skb_ext_total_length(void) { unsigned int l = SKB_EXT_CHUNKSIZEOF(struct skb_ext); int i; for (i = 0; i < ARRAY_SIZE(skb_ext_type_len); i++) l += skb_ext_type_len[i]; return l; } static noinline void __init __no_profile skb_extensions_init(void) { BUILD_BUG_ON(SKB_EXT_NUM > 8); BUILD_BUG_ON(skb_ext_total_length() > 255); skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache", SKB_EXT_ALIGN_VALUE * skb_ext_total_length(), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); } #else static void skb_extensions_init(void) {} #endif /* The SKB kmem_cache slab is critical for network performance. Never * merge/alias the slab with similar sized objects. This avoids fragmentation * that hurts performance of kmem_cache_{alloc,free}_bulk APIs. */ #ifndef CONFIG_SLUB_TINY #define FLAG_SKB_NO_MERGE SLAB_NO_MERGE #else /* CONFIG_SLUB_TINY - simple loop in kmem_cache_alloc_bulk */ #define FLAG_SKB_NO_MERGE 0 #endif void __init skb_init(void) { net_hotdata.skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache", sizeof(struct sk_buff), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC| FLAG_SKB_NO_MERGE, offsetof(struct sk_buff, cb), sizeof_field(struct sk_buff, cb), NULL); skbuff_cache_size = kmem_cache_size(net_hotdata.skbuff_cache); net_hotdata.skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", sizeof(struct sk_buff_fclones), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); /* usercopy should only access first SKB_SMALL_HEAD_HEADROOM bytes. * struct skb_shared_info is located at the end of skb->head, * and should not be copied to/from user. 
*/ net_hotdata.skb_small_head_cache = kmem_cache_create_usercopy("skbuff_small_head", SKB_SMALL_HEAD_CACHE_SIZE, 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, 0, SKB_SMALL_HEAD_HEADROOM, NULL); skb_extensions_init(); } static int __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len, unsigned int recursion_level) { int start = skb_headlen(skb); int i, copy = start - offset; struct sk_buff *frag_iter; int elt = 0; if (unlikely(recursion_level >= 24)) return -EMSGSIZE; if (copy > 0) { if (copy > len) copy = len; sg_set_buf(sg, skb->data + offset, copy); elt++; if ((len -= copy) == 0) return elt; offset += copy; } for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; WARN_ON(start > offset + len); end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); if ((copy = end - offset) > 0) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; if (unlikely(elt && sg_is_last(&sg[elt - 1]))) return -EMSGSIZE; if (copy > len) copy = len; sg_set_page(&sg[elt], skb_frag_page(frag), copy, skb_frag_off(frag) + offset - start); elt++; if (!(len -= copy)) return elt; offset += copy; } start = end; } skb_walk_frags(skb, frag_iter) { int end, ret; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { if (unlikely(elt && sg_is_last(&sg[elt - 1]))) return -EMSGSIZE; if (copy > len) copy = len; ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start, copy, recursion_level + 1); if (unlikely(ret < 0)) return ret; elt += ret; if ((len -= copy) == 0) return elt; offset += copy; } start = end; } BUG_ON(len); return elt; } /** * skb_to_sgvec - Fill a scatter-gather list from a socket buffer * @skb: Socket buffer containing the buffers to be mapped * @sg: The scatter-gather list to map into * @offset: The offset into the buffer's contents to start mapping * @len: Length of buffer space to be mapped * * Fill the specified scatter-gather list with mappings/pointers into a * region of the buffer space attached to a socket buffer. Returns either * the number of scatterlist items used, or -EMSGSIZE if the contents * could not fit. */ int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) { int nsg = __skb_to_sgvec(skb, sg, offset, len, 0); if (nsg <= 0) return nsg; sg_mark_end(&sg[nsg - 1]); return nsg; } EXPORT_SYMBOL_GPL(skb_to_sgvec); /* Unlike skb_to_sgvec, skb_to_sgvec_nomark only maps the skb to the given * sglist, without marking the sg entry that holds the last of the skb data * as the end. The caller can therefore keep appending to the sg list after * the first call without having to call sg_unmark_end() to extend it. * * Scenario to use skb_to_sgvec_nomark: * 1. sg_init_table * 2. skb_to_sgvec_nomark(payload1) * 3. skb_to_sgvec_nomark(payload2) * * This is equivalent to: * 1. sg_init_table * 2. skb_to_sgvec(payload1) * 3. sg_unmark_end * 4. skb_to_sgvec(payload2) * * When conditionally mapping multiple payloads, skb_to_sgvec_nomark * is preferable. */ int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) { return __skb_to_sgvec(skb, sg, offset, len, 0); } EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark); /** * skb_cow_data - Check that a socket buffer's data buffers are writable * @skb: The socket buffer to check. * @tailbits: Amount of trailing space to be added * @trailer: Returned pointer to the skb where the @tailbits space begins * * Make sure that the data buffers attached to a socket buffer are * writable.
If they are not, private copies are made of the data buffers * and the socket buffer is set to use these instead. * * If @tailbits is given, make sure that there is space to write @tailbits * bytes of data beyond current end of socket buffer. @trailer will be * set to point to the skb in which this space begins. * * The number of scatterlist elements required to completely map the * COW'd and extended socket buffer will be returned. */ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) { int copyflag; int elt; struct sk_buff *skb1, **skb_p; /* If skb is cloned or its head is paged, reallocate * head pulling out all the pages (pages are considered not writable * at the moment even if they are anonymous). */ if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && !__pskb_pull_tail(skb, __skb_pagelen(skb))) return -ENOMEM; /* Easy case. Most of packets will go this way. */ if (!skb_has_frag_list(skb)) { /* A little of trouble, not enough of space for trailer. * This should not happen, when stack is tuned to generate * good frames. OK, on miss we reallocate and reserve even more * space, 128 bytes is fair. */ if (skb_tailroom(skb) < tailbits && pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) return -ENOMEM; /* Voila! */ *trailer = skb; return 1; } /* Misery. We are in troubles, going to mincer fragments... */ elt = 1; skb_p = &skb_shinfo(skb)->frag_list; copyflag = 0; while ((skb1 = *skb_p) != NULL) { int ntail = 0; /* The fragment is partially pulled by someone, * this can happen on input. Copy it and everything * after it. */ if (skb_shared(skb1)) copyflag = 1; /* If the skb is the last, worry about trailer. */ if (skb1->next == NULL && tailbits) { if (skb_shinfo(skb1)->nr_frags || skb_has_frag_list(skb1) || skb_tailroom(skb1) < tailbits) ntail = tailbits + 128; } if (copyflag || skb_cloned(skb1) || ntail || skb_shinfo(skb1)->nr_frags || skb_has_frag_list(skb1)) { struct sk_buff *skb2; /* Fuck, we are miserable poor guys... */ if (ntail == 0) skb2 = skb_copy(skb1, GFP_ATOMIC); else skb2 = skb_copy_expand(skb1, skb_headroom(skb1), ntail, GFP_ATOMIC); if (unlikely(skb2 == NULL)) return -ENOMEM; if (skb1->sk) skb_set_owner_w(skb2, skb1->sk); /* Looking around. Are we still alive? * OK, link new skb, drop old one */ skb2->next = skb1->next; *skb_p = skb2; kfree_skb(skb1); skb1 = skb2; } elt++; *trailer = skb1; skb_p = &skb1->next; } return elt; } EXPORT_SYMBOL_GPL(skb_cow_data); static void sock_rmem_free(struct sk_buff *skb) { struct sock *sk = skb->sk; atomic_sub(skb->truesize, &sk->sk_rmem_alloc); } static void skb_set_err_queue(struct sk_buff *skb) { /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING. * So, it is safe to (mis)use it to mark skbs on the error queue. 
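 *
 * The marking done below pairs with the error-queue helpers that follow; a
 * round-trip sketch (editor's illustration; the clone typically comes from
 * skb_clone_sk()):
 *
 *	if (sock_queue_err_skb(sk, clone))
 *		kfree_skb(clone);
 *
 * and later, on the consuming side of the error queue:
 *
 *	skb = sock_dequeue_err_skb(sk);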
*/ skb->pkt_type = PACKET_OUTGOING; BUILD_BUG_ON(PACKET_OUTGOING == 0); } /* * Note: We dont mem charge error packets (no sk_forward_alloc changes) */ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) { if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= (unsigned int)READ_ONCE(sk->sk_rcvbuf)) return -ENOMEM; skb_orphan(skb); skb->sk = sk; skb->destructor = sock_rmem_free; atomic_add(skb->truesize, &sk->sk_rmem_alloc); skb_set_err_queue(skb); /* before exiting rcu section, make sure dst is refcounted */ skb_dst_force(skb); skb_queue_tail(&sk->sk_error_queue, skb); if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); return 0; } EXPORT_SYMBOL(sock_queue_err_skb); static bool is_icmp_err_skb(const struct sk_buff *skb) { return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP || SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6); } struct sk_buff *sock_dequeue_err_skb(struct sock *sk) { struct sk_buff_head *q = &sk->sk_error_queue; struct sk_buff *skb, *skb_next = NULL; bool icmp_next = false; unsigned long flags; if (skb_queue_empty_lockless(q)) return NULL; spin_lock_irqsave(&q->lock, flags); skb = __skb_dequeue(q); if (skb && (skb_next = skb_peek(q))) { icmp_next = is_icmp_err_skb(skb_next); if (icmp_next) sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno; } spin_unlock_irqrestore(&q->lock, flags); if (is_icmp_err_skb(skb) && !icmp_next) sk->sk_err = 0; if (skb_next) sk_error_report(sk); return skb; } EXPORT_SYMBOL(sock_dequeue_err_skb); /** * skb_clone_sk - create clone of skb, and take reference to socket * @skb: the skb to clone * * This function creates a clone of a buffer that holds a reference on * sk_refcnt. Buffers created via this function are meant to be * returned using sock_queue_err_skb, or free via kfree_skb. * * When passing buffers allocated with this function to sock_queue_err_skb * it is necessary to wrap the call with sock_hold/sock_put in order to * prevent the socket from being released prior to being enqueued on * the sk_error_queue. */ struct sk_buff *skb_clone_sk(struct sk_buff *skb) { struct sock *sk = skb->sk; struct sk_buff *clone; if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) return NULL; clone = skb_clone(skb, GFP_ATOMIC); if (!clone) { sock_put(sk); return NULL; } clone->sk = sk; clone->destructor = sock_efree; return clone; } EXPORT_SYMBOL(skb_clone_sk); static void __skb_complete_tx_timestamp(struct sk_buff *skb, struct sock *sk, int tstype, bool opt_stats) { struct sock_exterr_skb *serr; int err; BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = tstype; serr->opt_stats = opt_stats; serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0; if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { serr->ee.ee_data = skb_shinfo(skb)->tskey; if (sk_is_tcp(sk)) serr->ee.ee_data -= atomic_read(&sk->sk_tskey); } err = sock_queue_err_skb(sk, skb); if (err) kfree_skb(skb); } static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) { struct socket *sock; struct file *file; bool ret = false; if (likely(tsonly || READ_ONCE(sock_net(sk)->core.sysctl_tstamp_allow_data))) return true; /* The sk pointer remains valid as long as the skb is. The sk_socket and * file pointer may become NULL if the socket is closed. Both structures * (including file->cred) are RCU freed which means they can be accessed * within a RCU read section. 
*/ rcu_read_lock(); sock = READ_ONCE(sk->sk_socket); if (!sock) goto out; file = READ_ONCE(sock->file); if (!file) goto out; ret = file_ns_capable(file, &init_user_ns, CAP_NET_RAW); out: rcu_read_unlock(); return ret; } void skb_complete_tx_timestamp(struct sk_buff *skb, struct skb_shared_hwtstamps *hwtstamps) { struct sock *sk = skb->sk; if (!skb_may_tx_timestamp(sk, false)) goto err; /* Take a reference to prevent skb_orphan() from freeing the socket, * but only if the socket refcount is not zero. */ if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { *skb_hwtstamps(skb) = *hwtstamps; __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false); sock_put(sk); return; } err: kfree_skb(skb); } EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); static bool skb_tstamp_tx_report_so_timestamping(struct sk_buff *skb, struct skb_shared_hwtstamps *hwtstamps, int tstype) { switch (tstype) { case SCM_TSTAMP_SCHED: return skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP; case SCM_TSTAMP_SND: return skb_shinfo(skb)->tx_flags & (hwtstamps ? SKBTX_HW_TSTAMP_NOBPF : SKBTX_SW_TSTAMP); case SCM_TSTAMP_ACK: return TCP_SKB_CB(skb)->txstamp_ack & TSTAMP_ACK_SK; case SCM_TSTAMP_COMPLETION: return skb_shinfo(skb)->tx_flags & SKBTX_COMPLETION_TSTAMP; } return false; } static void skb_tstamp_tx_report_bpf_timestamping(struct sk_buff *skb, struct skb_shared_hwtstamps *hwtstamps, struct sock *sk, int tstype) { int op; switch (tstype) { case SCM_TSTAMP_SCHED: op = BPF_SOCK_OPS_TSTAMP_SCHED_CB; break; case SCM_TSTAMP_SND: if (hwtstamps) { op = BPF_SOCK_OPS_TSTAMP_SND_HW_CB; *skb_hwtstamps(skb) = *hwtstamps; } else { op = BPF_SOCK_OPS_TSTAMP_SND_SW_CB; } break; case SCM_TSTAMP_ACK: op = BPF_SOCK_OPS_TSTAMP_ACK_CB; break; default: return; } bpf_skops_tx_timestamping(sk, skb, op); } void __skb_tstamp_tx(struct sk_buff *orig_skb, const struct sk_buff *ack_skb, struct skb_shared_hwtstamps *hwtstamps, struct sock *sk, int tstype) { struct sk_buff *skb; bool tsonly, opt_stats = false; u32 tsflags; if (!sk) return; if (skb_shinfo(orig_skb)->tx_flags & SKBTX_BPF) skb_tstamp_tx_report_bpf_timestamping(orig_skb, hwtstamps, sk, tstype); if (!skb_tstamp_tx_report_so_timestamping(orig_skb, hwtstamps, tstype)) return; tsflags = READ_ONCE(sk->sk_tsflags); if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) return; tsonly = tsflags & SOF_TIMESTAMPING_OPT_TSONLY; if (!skb_may_tx_timestamp(sk, tsonly)) return; if (tsonly) { #ifdef CONFIG_INET if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) && sk_is_tcp(sk)) { skb = tcp_get_timestamping_opt_stats(sk, orig_skb, ack_skb); opt_stats = true; } else #endif skb = alloc_skb(0, GFP_ATOMIC); } else { skb = skb_clone(orig_skb, GFP_ATOMIC); if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) { kfree_skb(skb); return; } } if (!skb) return; if (tsonly) { skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags & SKBTX_ANY_TSTAMP; skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey; } if (hwtstamps) *skb_hwtstamps(skb) = *hwtstamps; else __net_timestamp(skb); __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats); } EXPORT_SYMBOL_GPL(__skb_tstamp_tx); void skb_tstamp_tx(struct sk_buff *orig_skb, struct skb_shared_hwtstamps *hwtstamps) { return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk, SCM_TSTAMP_SND); } EXPORT_SYMBOL_GPL(skb_tstamp_tx); #ifdef CONFIG_WIRELESS void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) { struct sock *sk = skb->sk; struct sock_exterr_skb *serr; int err = 1; skb->wifi_acked_valid = 1; skb->wifi_acked = acked; 
serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; /* Take a reference to prevent skb_orphan() from freeing the socket, * but only if the socket refcount is not zero. */ if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { err = sock_queue_err_skb(sk, skb); sock_put(sk); } if (err) kfree_skb(skb); } EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); #endif /* CONFIG_WIRELESS */ /** * skb_partial_csum_set - set up and verify partial csum values for packet * @skb: the skb to set * @start: the number of bytes after skb->data to start checksumming. * @off: the offset from start to place the checksum. * * For untrusted partially-checksummed packets, we need to make sure the values * for skb->csum_start and skb->csum_offset are valid so we don't oops. * * This function checks and sets those values and skb->ip_summed: if this * returns false you should drop the packet. */ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) { u32 csum_end = (u32)start + (u32)off + sizeof(__sum16); u32 csum_start = skb_headroom(skb) + (u32)start; if (unlikely(csum_start >= U16_MAX || csum_end > skb_headlen(skb))) { net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n", start, off, skb_headroom(skb), skb_headlen(skb)); return false; } skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = csum_start; skb->csum_offset = off; skb->transport_header = csum_start; return true; } EXPORT_SYMBOL_GPL(skb_partial_csum_set); static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len, unsigned int max) { if (skb_headlen(skb) >= len) return 0; /* If we need to pullup then pullup to the max, so we * won't need to do it again. */ if (max > skb->len) max = skb->len; if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL) return -ENOMEM; if (skb_headlen(skb) < len) return -EPROTO; return 0; } #define MAX_TCP_HDR_LEN (15 * 4) static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb, typeof(IPPROTO_IP) proto, unsigned int off) { int err; switch (proto) { case IPPROTO_TCP: err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr), off + MAX_TCP_HDR_LEN); if (!err && !skb_partial_csum_set(skb, off, offsetof(struct tcphdr, check))) err = -EPROTO; return err ? ERR_PTR(err) : &tcp_hdr(skb)->check; case IPPROTO_UDP: err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr), off + sizeof(struct udphdr)); if (!err && !skb_partial_csum_set(skb, off, offsetof(struct udphdr, check))) err = -EPROTO; return err ? ERR_PTR(err) : &udp_hdr(skb)->check; } return ERR_PTR(-EPROTO); } /* This value should be large enough to cover a tagged ethernet header plus * maximally sized IP and TCP or UDP headers. */ #define MAX_IP_HDR_LEN 128 static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate) { unsigned int off; bool fragment; __sum16 *csum; int err; fragment = false; err = skb_maybe_pull_tail(skb, sizeof(struct iphdr), MAX_IP_HDR_LEN); if (err < 0) goto out; if (ip_is_fragment(ip_hdr(skb))) fragment = true; off = ip_hdrlen(skb); err = -EPROTO; if (fragment) goto out; csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off); if (IS_ERR(csum)) return PTR_ERR(csum); if (recalculate) *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, skb->len - off, ip_hdr(skb)->protocol, 0); err = 0; out: return err; } /* This value should be large enough to cover a tagged ethernet header plus * an IPv6 header, all options, and a maximal TCP or UDP header. 
*/ #define MAX_IPV6_HDR_LEN 256 #define OPT_HDR(type, skb, off) \ (type *)(skb_network_header(skb) + (off)) static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate) { int err; u8 nexthdr; unsigned int off; unsigned int len; bool fragment; bool done; __sum16 *csum; fragment = false; done = false; off = sizeof(struct ipv6hdr); err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN); if (err < 0) goto out; nexthdr = ipv6_hdr(skb)->nexthdr; len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); while (off <= len && !done) { switch (nexthdr) { case IPPROTO_DSTOPTS: case IPPROTO_HOPOPTS: case IPPROTO_ROUTING: { struct ipv6_opt_hdr *hp; err = skb_maybe_pull_tail(skb, off + sizeof(struct ipv6_opt_hdr), MAX_IPV6_HDR_LEN); if (err < 0) goto out; hp = OPT_HDR(struct ipv6_opt_hdr, skb, off); nexthdr = hp->nexthdr; off += ipv6_optlen(hp); break; } case IPPROTO_AH: { struct ip_auth_hdr *hp; err = skb_maybe_pull_tail(skb, off + sizeof(struct ip_auth_hdr), MAX_IPV6_HDR_LEN); if (err < 0) goto out; hp = OPT_HDR(struct ip_auth_hdr, skb, off); nexthdr = hp->nexthdr; off += ipv6_authlen(hp); break; } case IPPROTO_FRAGMENT: { struct frag_hdr *hp; err = skb_maybe_pull_tail(skb, off + sizeof(struct frag_hdr), MAX_IPV6_HDR_LEN); if (err < 0) goto out; hp = OPT_HDR(struct frag_hdr, skb, off); if (hp->frag_off & htons(IP6_OFFSET | IP6_MF)) fragment = true; nexthdr = hp->nexthdr; off += sizeof(struct frag_hdr); break; } default: done = true; break; } } err = -EPROTO; if (!done || fragment) goto out; csum = skb_checksum_setup_ip(skb, nexthdr, off); if (IS_ERR(csum)) return PTR_ERR(csum); if (recalculate) *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len - off, nexthdr, 0); err = 0; out: return err; } /** * skb_checksum_setup - set up partial checksum offset * @skb: the skb to set up * @recalculate: if true the pseudo-header checksum will be recalculated */ int skb_checksum_setup(struct sk_buff *skb, bool recalculate) { int err; switch (skb->protocol) { case htons(ETH_P_IP): err = skb_checksum_setup_ipv4(skb, recalculate); break; case htons(ETH_P_IPV6): err = skb_checksum_setup_ipv6(skb, recalculate); break; default: err = -EPROTO; break; } return err; } EXPORT_SYMBOL(skb_checksum_setup); /** * skb_checksum_maybe_trim - maybe trims the given skb * @skb: the skb to check * @transport_len: the data length beyond the network header * * Checks whether the given skb has data beyond the given transport length. * If so, returns a cloned skb trimmed to this transport length. * Otherwise returns the provided skb. Returns NULL in error cases * (e.g. transport_len exceeds skb length or out-of-memory). * * Caller needs to set the skb transport header and free any returned skb if it * differs from the provided skb. */ static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb, unsigned int transport_len) { struct sk_buff *skb_chk; unsigned int len = skb_transport_offset(skb) + transport_len; int ret; if (skb->len < len) return NULL; else if (skb->len == len) return skb; skb_chk = skb_clone(skb, GFP_ATOMIC); if (!skb_chk) return NULL; ret = pskb_trim_rcsum(skb_chk, len); if (ret) { kfree_skb(skb_chk); return NULL; } return skb_chk; } /** * skb_checksum_trimmed - validate checksum of an skb * @skb: the skb to check * @transport_len: the data length beyond the network header * @skb_chkf: checksum function to use * * Applies the given checksum function skb_chkf to the provided skb. * Returns a checked and maybe trimmed skb. Returns NULL on error. 
* * If the skb has data beyond the given transport length, then a * trimmed & cloned skb is checked and returned. * * Caller needs to set the skb transport header and free any returned skb if it * differs from the provided skb. */ struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, unsigned int transport_len, __sum16(*skb_chkf)(struct sk_buff *skb)) { struct sk_buff *skb_chk; unsigned int offset = skb_transport_offset(skb); __sum16 ret; skb_chk = skb_checksum_maybe_trim(skb, transport_len); if (!skb_chk) goto err; if (!pskb_may_pull(skb_chk, offset)) goto err; skb_pull_rcsum(skb_chk, offset); ret = skb_chkf(skb_chk); skb_push_rcsum(skb_chk, offset); if (ret) goto err; return skb_chk; err: if (skb_chk && skb_chk != skb) kfree_skb(skb_chk); return NULL; } EXPORT_SYMBOL(skb_checksum_trimmed); void __skb_warn_lro_forwarding(const struct sk_buff *skb) { net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", skb->dev->name); } EXPORT_SYMBOL(__skb_warn_lro_forwarding); void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) { if (head_stolen) { skb_release_head_state(skb); kmem_cache_free(net_hotdata.skbuff_cache, skb); } else { __kfree_skb(skb); } } EXPORT_SYMBOL(kfree_skb_partial); /** * skb_try_coalesce - try to merge skb to prior one * @to: prior buffer * @from: buffer to add * @fragstolen: pointer to boolean * @delta_truesize: how much more was allocated than was requested */ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, bool *fragstolen, int *delta_truesize) { struct skb_shared_info *to_shinfo, *from_shinfo; int i, delta, len = from->len; *fragstolen = false; if (skb_cloned(to)) return false; /* In general, avoid mixing page_pool and non-page_pool allocated * pages within the same SKB. In theory we could take full * references if @from is cloned and !@to->pp_recycle but its * tricky (due to potential race with the clone disappearing) and * rare, so not worth dealing with. */ if (to->pp_recycle != from->pp_recycle) return false; if (skb_frags_readable(from) != skb_frags_readable(to)) return false; if (len <= skb_tailroom(to) && skb_frags_readable(from)) { if (len) BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); *delta_truesize = 0; return true; } to_shinfo = skb_shinfo(to); from_shinfo = skb_shinfo(from); if (to_shinfo->frag_list || from_shinfo->frag_list) return false; if (skb_zcopy(to) || skb_zcopy(from)) return false; if (skb_headlen(from) != 0) { struct page *page; unsigned int offset; if (to_shinfo->nr_frags + from_shinfo->nr_frags >= MAX_SKB_FRAGS) return false; if (skb_head_is_locked(from)) return false; delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); page = virt_to_head_page(from->head); offset = from->data - (unsigned char *)page_address(page); skb_fill_page_desc(to, to_shinfo->nr_frags, page, offset, skb_headlen(from)); *fragstolen = true; } else { if (to_shinfo->nr_frags + from_shinfo->nr_frags > MAX_SKB_FRAGS) return false; delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from)); } WARN_ON_ONCE(delta < len); memcpy(to_shinfo->frags + to_shinfo->nr_frags, from_shinfo->frags, from_shinfo->nr_frags * sizeof(skb_frag_t)); to_shinfo->nr_frags += from_shinfo->nr_frags; if (!skb_cloned(from)) from_shinfo->nr_frags = 0; /* if the skb is not cloned this does nothing * since we set nr_frags to 0. 
*/ if (skb_pp_frag_ref(from)) { for (i = 0; i < from_shinfo->nr_frags; i++) __skb_frag_ref(&from_shinfo->frags[i]); } to->truesize += delta; to->len += len; to->data_len += len; *delta_truesize = delta; return true; } EXPORT_SYMBOL(skb_try_coalesce); /** * skb_scrub_packet - scrub an skb * * @skb: buffer to clean * @xnet: packet is crossing netns * * skb_scrub_packet can be used after encapsulating or decapsulating a packet * into/from a tunnel. Some information have to be cleared during these * operations. * skb_scrub_packet can also be used to clean a skb before injecting it in * another namespace (@xnet == true). We have to clear all information in the * skb that could impact namespace isolation. */ void skb_scrub_packet(struct sk_buff *skb, bool xnet) { skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; skb->ignore_df = 0; skb_dst_drop(skb); skb_ext_reset(skb); nf_reset_ct(skb); nf_reset_trace(skb); #ifdef CONFIG_NET_SWITCHDEV skb->offload_fwd_mark = 0; skb->offload_l3_fwd_mark = 0; #endif ipvs_reset(skb); if (!xnet) return; skb->mark = 0; skb_clear_tstamp(skb); } EXPORT_SYMBOL_GPL(skb_scrub_packet); static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) { int mac_len, meta_len; void *meta; if (skb_cow(skb, skb_headroom(skb)) < 0) { kfree_skb(skb); return NULL; } mac_len = skb->data - skb_mac_header(skb); if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) { memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb), mac_len - VLAN_HLEN - ETH_TLEN); } meta_len = skb_metadata_len(skb); if (meta_len) { meta = skb_metadata_end(skb) - meta_len; memmove(meta + VLAN_HLEN, meta, meta_len); } skb->mac_header += VLAN_HLEN; return skb; } struct sk_buff *skb_vlan_untag(struct sk_buff *skb) { struct vlan_hdr *vhdr; u16 vlan_tci; if (unlikely(skb_vlan_tag_present(skb))) { /* vlan_tci is already set-up so leave this for another time */ return skb; } skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) goto err_free; /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */ if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short)))) goto err_free; vhdr = (struct vlan_hdr *)skb->data; vlan_tci = ntohs(vhdr->h_vlan_TCI); __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci); skb_pull_rcsum(skb, VLAN_HLEN); vlan_set_encap_proto(skb, vhdr); skb = skb_reorder_vlan_header(skb); if (unlikely(!skb)) goto err_free; skb_reset_network_header(skb); if (!skb_transport_header_was_set(skb)) skb_reset_transport_header(skb); skb_reset_mac_len(skb); return skb; err_free: kfree_skb(skb); return NULL; } EXPORT_SYMBOL(skb_vlan_untag); int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len) { if (!pskb_may_pull(skb, write_len)) return -ENOMEM; if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) return 0; return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); } EXPORT_SYMBOL(skb_ensure_writable); int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev) { int needed_headroom = dev->needed_headroom; int needed_tailroom = dev->needed_tailroom; /* For tail taggers, we need to pad short frames ourselves, to ensure * that the tail tag does not fail at its role of being at the end of * the packet, once the conduit interface pads the frame. Account for * that pad length here, and pad later. */ if (unlikely(needed_tailroom && skb->len < ETH_ZLEN)) needed_tailroom += ETH_ZLEN - skb->len; /* skb_headroom() returns unsigned int... 
*/ needed_headroom = max_t(int, needed_headroom - skb_headroom(skb), 0); needed_tailroom = max_t(int, needed_tailroom - skb_tailroom(skb), 0); if (likely(!needed_headroom && !needed_tailroom && !skb_cloned(skb))) /* No reallocation needed, yay! */ return 0; return pskb_expand_head(skb, needed_headroom, needed_tailroom, GFP_ATOMIC); } EXPORT_SYMBOL(skb_ensure_writable_head_tail); /* remove VLAN header from packet and update csum accordingly. * expects a non skb_vlan_tag_present skb with a vlan tag payload */ int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) { int offset = skb->data - skb_mac_header(skb); int err; if (WARN_ONCE(offset, "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n", offset)) { return -EINVAL; } err = skb_ensure_writable(skb, VLAN_ETH_HLEN); if (unlikely(err)) return err; skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); vlan_remove_tag(skb, vlan_tci); skb->mac_header += VLAN_HLEN; if (skb_network_offset(skb) < ETH_HLEN) skb_set_network_header(skb, ETH_HLEN); skb_reset_mac_len(skb); return err; } EXPORT_SYMBOL(__skb_vlan_pop); /* Pop a vlan tag either from hwaccel or from payload. * Expects skb->data at mac header. */ int skb_vlan_pop(struct sk_buff *skb) { u16 vlan_tci; __be16 vlan_proto; int err; if (likely(skb_vlan_tag_present(skb))) { __vlan_hwaccel_clear_tag(skb); } else { if (unlikely(!eth_type_vlan(skb->protocol))) return 0; err = __skb_vlan_pop(skb, &vlan_tci); if (err) return err; } /* move next vlan tag to hw accel tag */ if (likely(!eth_type_vlan(skb->protocol))) return 0; vlan_proto = skb->protocol; err = __skb_vlan_pop(skb, &vlan_tci); if (unlikely(err)) return err; __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); return 0; } EXPORT_SYMBOL(skb_vlan_pop); /* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present). * Expects skb->data at mac header. */ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { if (skb_vlan_tag_present(skb)) { int offset = skb->data - skb_mac_header(skb); int err; if (WARN_ONCE(offset, "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n", offset)) { return -EINVAL; } err = __vlan_insert_tag(skb, skb->vlan_proto, skb_vlan_tag_get(skb)); if (err) return err; skb->protocol = skb->vlan_proto; skb->network_header -= VLAN_HLEN; skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); } __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); return 0; } EXPORT_SYMBOL(skb_vlan_push); /** * skb_eth_pop() - Drop the Ethernet header at the head of a packet * * @skb: Socket buffer to modify * * Drop the Ethernet header of @skb. * * Expects that skb->data points to the mac header and that no VLAN tags are * present. * * Returns 0 on success, -errno otherwise. */ int skb_eth_pop(struct sk_buff *skb) { if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) || skb_network_offset(skb) < ETH_HLEN) return -EPROTO; skb_pull_rcsum(skb, ETH_HLEN); skb_reset_mac_header(skb); skb_reset_mac_len(skb); return 0; } EXPORT_SYMBOL(skb_eth_pop); /** * skb_eth_push() - Add a new Ethernet header at the head of a packet * * @skb: Socket buffer to modify * @dst: Destination MAC address of the new header * @src: Source MAC address of the new header * * Prepend @skb with a new Ethernet header. * * Expects that skb->data points to the mac header, which must be empty. * * Returns 0 on success, -errno otherwise. 
*/ int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, const unsigned char *src) { struct ethhdr *eth; int err; if (skb_network_offset(skb) || skb_vlan_tag_present(skb)) return -EPROTO; err = skb_cow_head(skb, sizeof(*eth)); if (err < 0) return err; skb_push(skb, sizeof(*eth)); skb_reset_mac_header(skb); skb_reset_mac_len(skb); eth = eth_hdr(skb); ether_addr_copy(eth->h_dest, dst); ether_addr_copy(eth->h_source, src); eth->h_proto = skb->protocol; skb_postpush_rcsum(skb, eth, sizeof(*eth)); return 0; } EXPORT_SYMBOL(skb_eth_push); /* Update the ethertype of hdr and the skb csum value if required. */ static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, __be16 ethertype) { if (skb->ip_summed == CHECKSUM_COMPLETE) { __be16 diff[] = { ~hdr->h_proto, ethertype }; skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); } hdr->h_proto = ethertype; } /** * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of * the packet * * @skb: buffer * @mpls_lse: MPLS label stack entry to push * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848) * @mac_len: length of the MAC header * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is * ethernet * * Expects skb->data at mac header. * * Returns 0 on success, -errno otherwise. */ int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, int mac_len, bool ethernet) { struct mpls_shim_hdr *lse; int err; if (unlikely(!eth_p_mpls(mpls_proto))) return -EINVAL; /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */ if (skb->encapsulation) return -EINVAL; err = skb_cow_head(skb, MPLS_HLEN); if (unlikely(err)) return err; if (!skb->inner_protocol) { skb_set_inner_network_header(skb, skb_network_offset(skb)); skb_set_inner_protocol(skb, skb->protocol); } skb_push(skb, MPLS_HLEN); memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), mac_len); skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); skb_reset_mac_len(skb); lse = mpls_hdr(skb); lse->label_stack_entry = mpls_lse; skb_postpush_rcsum(skb, lse, MPLS_HLEN); if (ethernet && mac_len >= ETH_HLEN) skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto); skb->protocol = mpls_proto; return 0; } EXPORT_SYMBOL_GPL(skb_mpls_push); /** * skb_mpls_pop() - pop the outermost MPLS header * * @skb: buffer * @next_proto: ethertype of header after popped MPLS header * @mac_len: length of the MAC header * @ethernet: flag to indicate if the packet is ethernet * * Expects skb->data at mac header. * * Returns 0 on success, -errno otherwise. */ int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, bool ethernet) { int err; if (unlikely(!eth_p_mpls(skb->protocol))) return 0; err = skb_ensure_writable(skb, mac_len + MPLS_HLEN); if (unlikely(err)) return err; skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN); memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), mac_len); __skb_pull(skb, MPLS_HLEN); skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); if (ethernet && mac_len >= ETH_HLEN) { struct ethhdr *hdr; /* use mpls_hdr() to get ethertype to account for VLANs. */ hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN); skb_mod_eth_type(skb, hdr, next_proto); } skb->protocol = next_proto; return 0; } EXPORT_SYMBOL_GPL(skb_mpls_pop); /** * skb_mpls_update_lse() - modify outermost MPLS header and update csum * * @skb: buffer * @mpls_lse: new MPLS label stack entry to update to * * Expects skb->data at mac header. 
* * Returns 0 on success, -errno otherwise. */ int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse) { int err; if (unlikely(!eth_p_mpls(skb->protocol))) return -EINVAL; err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); if (unlikely(err)) return err; if (skb->ip_summed == CHECKSUM_COMPLETE) { __be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse }; skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); } mpls_hdr(skb)->label_stack_entry = mpls_lse; return 0; } EXPORT_SYMBOL_GPL(skb_mpls_update_lse); /** * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header * * @skb: buffer * * Expects skb->data at mac header. * * Returns 0 on success, -errno otherwise. */ int skb_mpls_dec_ttl(struct sk_buff *skb) { u32 lse; u8 ttl; if (unlikely(!eth_p_mpls(skb->protocol))) return -EINVAL; if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN)) return -ENOMEM; lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry); ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; if (!--ttl) return -EINVAL; lse &= ~MPLS_LS_TTL_MASK; lse |= ttl << MPLS_LS_TTL_SHIFT; return skb_mpls_update_lse(skb, cpu_to_be32(lse)); } EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); /** * alloc_skb_with_frags - allocate skb with page frags * * @header_len: size of linear part * @data_len: needed length in frags * @order: max page order desired. * @errcode: pointer to error code if any * @gfp_mask: allocation mask * * This can be used to allocate a paged skb, given a maximal order for frags. */ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, unsigned long data_len, int order, int *errcode, gfp_t gfp_mask) { unsigned long chunk; struct sk_buff *skb; struct page *page; int nr_frags = 0; *errcode = -EMSGSIZE; if (unlikely(data_len > MAX_SKB_FRAGS * (PAGE_SIZE << order))) return NULL; *errcode = -ENOBUFS; skb = alloc_skb(header_len, gfp_mask); if (!skb) return NULL; while (data_len) { if (nr_frags == MAX_SKB_FRAGS) goto failure; while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order)) order--; if (order) { page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | __GFP_NOWARN, order); if (!page) { order--; continue; } } else { page = alloc_page(gfp_mask); if (!page) goto failure; } chunk = min_t(unsigned long, data_len, PAGE_SIZE << order); skb_fill_page_desc(skb, nr_frags, page, 0, chunk); nr_frags++; skb->truesize += (PAGE_SIZE << order); data_len -= chunk; } return skb; failure: kfree_skb(skb); return NULL; } EXPORT_SYMBOL(alloc_skb_with_frags); /* carve out the first off bytes from skb when off < headlen */ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, const int headlen, gfp_t gfp_mask) { int i; unsigned int size = skb_end_offset(skb); int new_hlen = headlen - off; u8 *data; if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); if (!data) return -ENOMEM; size = SKB_WITH_OVERHEAD(size); /* Copy real data, and all frags */ skb_copy_from_linear_data_offset(skb, off, data, new_hlen); skb->len -= off; memcpy((struct skb_shared_info *)(data + size), skb_shinfo(skb), offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); if (skb_cloned(skb)) { /* drop the old head gracefully */ if (skb_orphan_frags(skb, gfp_mask)) { skb_kfree_head(data); return -ENOMEM; } for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_frag_ref(skb, i); if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); skb_release_data(skb, SKB_CONSUMED); } else { /* we can reuse existing recount- all we did was * 
relocate values */ skb_free_head(skb); } skb->head = data; skb->data = data; skb->head_frag = 0; skb_set_end_offset(skb, size); skb_set_tail_pointer(skb, skb_headlen(skb)); skb_headers_offset_update(skb, 0); skb->cloned = 0; skb->hdr_len = 0; skb->nohdr = 0; atomic_set(&skb_shinfo(skb)->dataref, 1); return 0; } static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp); /* carve out the first eat bytes from skb's frag_list. May recurse into * pskb_carve() */ static int pskb_carve_frag_list(struct skb_shared_info *shinfo, int eat, gfp_t gfp_mask) { struct sk_buff *list = shinfo->frag_list; struct sk_buff *clone = NULL; struct sk_buff *insp = NULL; do { if (!list) { pr_err("Not enough bytes to eat. Want %d\n", eat); return -EFAULT; } if (list->len <= eat) { /* Eaten as whole. */ eat -= list->len; list = list->next; insp = list; } else { /* Eaten partially. */ if (skb_shared(list)) { clone = skb_clone(list, gfp_mask); if (!clone) return -ENOMEM; insp = list->next; list = clone; } else { /* This may be pulled without problems. */ insp = list; } if (pskb_carve(list, eat, gfp_mask) < 0) { kfree_skb(clone); return -ENOMEM; } break; } } while (eat); /* Free pulled out fragments. */ while ((list = shinfo->frag_list) != insp) { shinfo->frag_list = list->next; consume_skb(list); } /* And insert new clone at head. */ if (clone) { clone->next = list; shinfo->frag_list = clone; } return 0; } /* carve off first len bytes from skb. Split line (off) is in the * non-linear part of skb */ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, int pos, gfp_t gfp_mask) { int i, k = 0; unsigned int size = skb_end_offset(skb); u8 *data; const int nfrags = skb_shinfo(skb)->nr_frags; struct skb_shared_info *shinfo; if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); if (!data) return -ENOMEM; size = SKB_WITH_OVERHEAD(size); memcpy((struct skb_shared_info *)(data + size), skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0])); if (skb_orphan_frags(skb, gfp_mask)) { skb_kfree_head(data); return -ENOMEM; } shinfo = (struct skb_shared_info *)(data + size); for (i = 0; i < nfrags; i++) { int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]); if (pos + fsize > off) { shinfo->frags[k] = skb_shinfo(skb)->frags[i]; if (pos < off) { /* Split frag. * We have two variants in this case: * 1. Move all the frag to the second * part, if it is possible. F.e. * this approach is mandatory for TUX, * where splitting is expensive. * 2. Split is accurately. We make this. */ skb_frag_off_add(&shinfo->frags[0], off - pos); skb_frag_size_sub(&shinfo->frags[0], off - pos); } skb_frag_ref(skb, i); k++; } pos += fsize; } shinfo->nr_frags = k; if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); /* split line is in frag list */ if (k == 0 && pskb_carve_frag_list(shinfo, off - pos, gfp_mask)) { /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. 
*/ if (skb_has_frag_list(skb)) kfree_skb_list(skb_shinfo(skb)->frag_list); skb_kfree_head(data); return -ENOMEM; } skb_release_data(skb, SKB_CONSUMED); skb->head = data; skb->head_frag = 0; skb->data = data; skb_set_end_offset(skb, size); skb_reset_tail_pointer(skb); skb_headers_offset_update(skb, 0); skb->cloned = 0; skb->hdr_len = 0; skb->nohdr = 0; skb->len -= off; skb->data_len = skb->len; atomic_set(&skb_shinfo(skb)->dataref, 1); return 0; } /* remove len bytes from the beginning of the skb */ static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp) { int headlen = skb_headlen(skb); if (len < headlen) return pskb_carve_inside_header(skb, len, headlen, gfp); else return pskb_carve_inside_nonlinear(skb, len, headlen, gfp); } /* Extract to_copy bytes starting at off from skb, and return this in * a new skb */ struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy, gfp_t gfp) { struct sk_buff *clone = skb_clone(skb, gfp); if (!clone) return NULL; if (pskb_carve(clone, off, gfp) < 0 || pskb_trim(clone, to_copy)) { kfree_skb(clone); return NULL; } return clone; } EXPORT_SYMBOL(pskb_extract); /** * skb_condense - try to get rid of fragments/frag_list if possible * @skb: buffer * * Can be used to save memory before skb is added to a busy queue. * If packet has bytes in frags and enough tail room in skb->head, * pull all of them, so that we can free the frags right now and adjust * truesize. * Notes: * We do not reallocate skb->head thus can not fail. * Caller must re-evaluate skb->truesize if needed. */ void skb_condense(struct sk_buff *skb) { if (skb->data_len) { if (skb->data_len > skb->end - skb->tail || skb_cloned(skb) || !skb_frags_readable(skb)) return; /* Nice, we can free page frag(s) right now */ __pskb_pull_tail(skb, skb->data_len); } /* At this point, skb->truesize might be over estimated, * because skb had a fragment, and fragments do not tell * their truesize. * When we pulled its content into skb->head, fragment * was freed, but __pskb_pull_tail() could not possibly * adjust skb->truesize, not knowing the frag truesize. */ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); } EXPORT_SYMBOL(skb_condense); #ifdef CONFIG_SKB_EXTENSIONS static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) { return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE); } /** * __skb_ext_alloc - allocate a new skb extensions storage * * @flags: See kmalloc(). * * Returns the newly allocated pointer. The pointer can later attached to a * skb via __skb_ext_set(). * Note: caller must handle the skb_ext as an opaque data. 
*/ struct skb_ext *__skb_ext_alloc(gfp_t flags) { struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags); if (new) { memset(new->offset, 0, sizeof(new->offset)); refcount_set(&new->refcnt, 1); } return new; } static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old, unsigned int old_active) { struct skb_ext *new; if (refcount_read(&old->refcnt) == 1) return old; new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); if (!new) return NULL; memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE); refcount_set(&new->refcnt, 1); #ifdef CONFIG_XFRM if (old_active & (1 << SKB_EXT_SEC_PATH)) { struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH); unsigned int i; for (i = 0; i < sp->len; i++) xfrm_state_hold(sp->xvec[i]); } #endif #ifdef CONFIG_MCTP_FLOWS if (old_active & (1 << SKB_EXT_MCTP)) { struct mctp_flow *flow = skb_ext_get_ptr(old, SKB_EXT_MCTP); if (flow->key) refcount_inc(&flow->key->refs); } #endif __skb_ext_put(old); return new; } /** * __skb_ext_set - attach the specified extension storage to this skb * @skb: buffer * @id: extension id * @ext: extension storage previously allocated via __skb_ext_alloc() * * Existing extensions, if any, are cleared. * * Returns the pointer to the extension. */ void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, struct skb_ext *ext) { unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext); skb_ext_put(skb); newlen = newoff + skb_ext_type_len[id]; ext->chunks = newlen; ext->offset[id] = newoff; skb->extensions = ext; skb->active_extensions = 1 << id; return skb_ext_get_ptr(ext, id); } EXPORT_SYMBOL_NS_GPL(__skb_ext_set, "NETDEV_INTERNAL"); /** * skb_ext_add - allocate space for given extension, COW if needed * @skb: buffer * @id: extension to allocate space for * * Allocates enough space for the given extension. * If the extension is already present, a pointer to that extension * is returned. * * If the skb was cloned, COW applies and the returned memory can be * modified without changing the extension space of clones buffers. * * Returns pointer to the extension or NULL on allocation failure. 
*/ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) { struct skb_ext *new, *old = NULL; unsigned int newlen, newoff; if (skb->active_extensions) { old = skb->extensions; new = skb_ext_maybe_cow(old, skb->active_extensions); if (!new) return NULL; if (__skb_ext_exist(new, id)) goto set_active; newoff = new->chunks; } else { newoff = SKB_EXT_CHUNKSIZEOF(*new); new = __skb_ext_alloc(GFP_ATOMIC); if (!new) return NULL; } newlen = newoff + skb_ext_type_len[id]; new->chunks = newlen; new->offset[id] = newoff; set_active: skb->slow_gro = 1; skb->extensions = new; skb->active_extensions |= 1 << id; return skb_ext_get_ptr(new, id); } EXPORT_SYMBOL(skb_ext_add); #ifdef CONFIG_XFRM static void skb_ext_put_sp(struct sec_path *sp) { unsigned int i; for (i = 0; i < sp->len; i++) xfrm_state_put(sp->xvec[i]); } #endif #ifdef CONFIG_MCTP_FLOWS static void skb_ext_put_mctp(struct mctp_flow *flow) { if (flow->key) mctp_key_unref(flow->key); } #endif void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) { struct skb_ext *ext = skb->extensions; skb->active_extensions &= ~(1 << id); if (skb->active_extensions == 0) { skb->extensions = NULL; __skb_ext_put(ext); #ifdef CONFIG_XFRM } else if (id == SKB_EXT_SEC_PATH && refcount_read(&ext->refcnt) == 1) { struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH); skb_ext_put_sp(sp); sp->len = 0; #endif } } EXPORT_SYMBOL(__skb_ext_del); void __skb_ext_put(struct skb_ext *ext) { /* If this is last clone, nothing can increment * it after check passes. Avoids one atomic op. */ if (refcount_read(&ext->refcnt) == 1) goto free_now; if (!refcount_dec_and_test(&ext->refcnt)) return; free_now: #ifdef CONFIG_XFRM if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH)) skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH)); #endif #ifdef CONFIG_MCTP_FLOWS if (__skb_ext_exist(ext, SKB_EXT_MCTP)) skb_ext_put_mctp(skb_ext_get_ptr(ext, SKB_EXT_MCTP)); #endif kmem_cache_free(skbuff_ext_cache, ext); } EXPORT_SYMBOL(__skb_ext_put); #endif /* CONFIG_SKB_EXTENSIONS */ static void kfree_skb_napi_cache(struct sk_buff *skb) { /* if SKB is a clone, don't handle this case */ if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { __kfree_skb(skb); return; } local_bh_disable(); __napi_kfree_skb(skb, SKB_CONSUMED); local_bh_enable(); } DEFINE_STATIC_KEY_FALSE(skb_defer_disable_key); /** * skb_attempt_defer_free - queue skb for remote freeing * @skb: buffer * * Put @skb in a per-cpu list, using the cpu which * allocated the skb/pages to reduce false sharing * and memory zone spinlock contention. */ void skb_attempt_defer_free(struct sk_buff *skb) { struct skb_defer_node *sdn; unsigned long defer_count; unsigned int defer_max; bool kick; int cpu; if (static_branch_unlikely(&skb_defer_disable_key)) goto nodefer; /* zero copy notifications should not be delayed. */ if (skb_zcopy(skb)) goto nodefer; cpu = skb->alloc_cpu; if (cpu == raw_smp_processor_id() || WARN_ON_ONCE(cpu >= nr_cpu_ids) || !cpu_online(cpu)) { nodefer: kfree_skb_napi_cache(skb); return; } DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); DEBUG_NET_WARN_ON_ONCE(skb->destructor); DEBUG_NET_WARN_ON_ONCE(skb_nfct(skb)); sdn = per_cpu_ptr(net_hotdata.skb_defer_nodes, cpu) + numa_node_id(); defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max); defer_count = atomic_long_inc_return(&sdn->defer_count); if (defer_count >= defer_max) goto nodefer; llist_add(&skb->ll_node, &sdn->defer_list); /* Send an IPI every time queue reaches half capacity. 
*/ kick = (defer_count - 1) == (defer_max >> 1); /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU * if we are unlucky enough (this seems very unlikely). */ if (unlikely(kick)) kick_defer_list_purge(cpu); } static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, size_t offset, size_t len) { const char *kaddr; __wsum csum; kaddr = kmap_local_page(page); csum = csum_partial(kaddr + offset, len, 0); kunmap_local(kaddr); skb->csum = csum_block_add(skb->csum, csum, skb->len); } /** * skb_splice_from_iter - Splice (or copy) pages to skbuff * @skb: The buffer to add pages to * @iter: Iterator representing the pages to be added * @maxsize: Maximum amount of pages to be added * * This is a common helper function for supporting MSG_SPLICE_PAGES. It * extracts pages from an iterator and adds them to the socket buffer if * possible, copying them to fragments if not possible (such as if they're slab * pages). * * Returns the amount of data spliced/copied or -EMSGSIZE if there's * insufficient space in the buffer to transfer anything. */ ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, ssize_t maxsize) { size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags); struct page *pages[8], **ppages = pages; ssize_t spliced = 0, ret = 0; unsigned int i; while (iter->count > 0) { ssize_t space, nr, len; size_t off; ret = -EMSGSIZE; space = frag_limit - skb_shinfo(skb)->nr_frags; if (space < 0) break; /* We might be able to coalesce without increasing nr_frags */ nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages)); len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off); if (len <= 0) { ret = len ?: -EIO; break; } i = 0; do { struct page *page = pages[i++]; size_t part = min_t(size_t, PAGE_SIZE - off, len); ret = -EIO; if (WARN_ON_ONCE(!sendpage_ok(page))) goto out; ret = skb_append_pagefrags(skb, page, off, part, frag_limit); if (ret < 0) { iov_iter_revert(iter, len); goto out; } if (skb->ip_summed == CHECKSUM_NONE) skb_splice_csum_page(skb, page, off, part); off = 0; spliced += part; maxsize -= part; len -= part; } while (len > 0); if (maxsize <= 0) break; } out: skb_len_add(skb, spliced); return spliced ?: ret; } EXPORT_SYMBOL(skb_splice_from_iter); static __always_inline size_t memcpy_from_iter_csum(void *iter_from, size_t progress, size_t len, void *to, void *priv2) { __wsum *csum = priv2; __wsum next = csum_partial_copy_nocheck(iter_from, to + progress, len); *csum = csum_block_add(*csum, next, progress); return 0; } static __always_inline size_t copy_from_user_iter_csum(void __user *iter_from, size_t progress, size_t len, void *to, void *priv2) { __wsum next, *csum = priv2; next = csum_and_copy_from_user(iter_from, to + progress, len); *csum = csum_block_add(*csum, next, progress); return next ? 
		       0 : len;
}

bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
				  __wsum *csum, struct iov_iter *i)
{
	size_t copied;

	if (WARN_ON_ONCE(!i->data_source))
		return false;
	copied = iterate_and_advance2(i, bytes, addr, csum,
				      copy_from_user_iter_csum,
				      memcpy_from_iter_csum);
	if (likely(copied == bytes))
		return true;
	iov_iter_revert(i, copied);
	return false;
}
EXPORT_SYMBOL(csum_and_copy_from_iter_full);

void __get_netmem(netmem_ref netmem)
{
	struct net_iov *niov = netmem_to_net_iov(netmem);

	if (net_is_devmem_iov(niov))
		net_devmem_get_net_iov(netmem_to_net_iov(netmem));
}
EXPORT_SYMBOL(__get_netmem);

void __put_netmem(netmem_ref netmem)
{
	struct net_iov *niov = netmem_to_net_iov(netmem);

	if (net_is_devmem_iov(niov))
		net_devmem_put_net_iov(netmem_to_net_iov(netmem));
}
EXPORT_SYMBOL(__put_netmem);

struct vlan_type_depth __vlan_get_protocol_offset(const struct sk_buff *skb,
						  __be16 type, int mac_offset)
{
	unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH;

	/* if type is 802.1Q/AD then the header should already be
	 * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
	 * ETH_HLEN otherwise
	 */
	if (vlan_depth) {
		if (WARN_ON_ONCE(vlan_depth < VLAN_HLEN))
			return (struct vlan_type_depth) { 0 };

		vlan_depth -= VLAN_HLEN;
	} else {
		vlan_depth = ETH_HLEN;
	}

	do {
		struct vlan_hdr vhdr, *vh;

		vh = skb_header_pointer(skb, mac_offset + vlan_depth,
					sizeof(vhdr), &vhdr);
		if (unlikely(!vh || !--parse_depth))
			return (struct vlan_type_depth) { 0 };

		type = vh->h_vlan_encapsulated_proto;
		vlan_depth += VLAN_HLEN;
	} while (eth_type_vlan(type));

	return (struct vlan_type_depth) { .type = type, .depth = vlan_depth };
}
EXPORT_SYMBOL(__vlan_get_protocol_offset);
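/* Illustrative sketch (not part of the original file): how callers might use
 * two helpers defined above, skb_checksum_setup() and alloc_skb_with_frags().
 * The function names example_fix_guest_csum() and example_alloc_paged_skb()
 * are hypothetical and exist only for illustration; the helper calls follow
 * the signatures defined earlier in this file.
 */
static int __maybe_unused example_fix_guest_csum(struct sk_buff *skb)
{
	/* A packet handed up by an untrusted source may claim
	 * CHECKSUM_PARTIAL with bogus csum_start/csum_offset.
	 * skb_checksum_setup() re-validates those fields and, with
	 * recalculate == true, recomputes the pseudo-header checksum.
	 */
	if (skb->ip_summed == CHECKSUM_PARTIAL)
		return skb_checksum_setup(skb, true);

	return 0;
}

static __maybe_unused struct sk_buff *example_alloc_paged_skb(unsigned long hdr_len,
							       unsigned long data_len)
{
	struct sk_buff *skb;
	int err;

	/* Linear headroom for protocol headers, payload spread over
	 * order-0 page fragments.
	 */
	skb = alloc_skb_with_frags(hdr_len, data_len, 0, &err, GFP_KERNEL);
	if (!skb)
		return ERR_PTR(err);

	skb_reserve(skb, hdr_len);
	return skb;
}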
4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 
5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 
5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *	(jj@sunsite.ms.mff.cuni.cz)
 *
 *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
 */

#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/folio_batch.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/iomap.h>
#include <linux/iversion.h>

#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "truncate.h"

#include <kunit/static_stub.h>

#include <trace/events/ext4.h>

static void ext4_journalled_zero_new_buffers(handle_t *handle,
					     struct inode *inode,
					     struct folio *folio,
					     unsigned from, unsigned to);

static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
			     struct ext4_inode_info *ei)
{
	__u32 csum;
	__u16 dummy_csum = 0;
	int offset = offsetof(struct ext4_inode, i_checksum_lo);
	unsigned int csum_size = sizeof(dummy_csum);

	csum = ext4_chksum(ei->i_csum_seed, (__u8 *)raw, offset);
	csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size);
	offset += csum_size;
	csum = ext4_chksum(csum, (__u8 *)raw + offset,
			   EXT4_GOOD_OLD_INODE_SIZE - offset);

	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
		offset = offsetof(struct ext4_inode, i_checksum_hi);
		csum = ext4_chksum(csum, (__u8 *)raw + EXT4_GOOD_OLD_INODE_SIZE,
				   offset - EXT4_GOOD_OLD_INODE_SIZE);
		if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
			csum = ext4_chksum(csum, (__u8 *)&dummy_csum,
					   csum_size);
			offset += csum_size;
		}
		csum = ext4_chksum(csum, (__u8 *)raw + offset,
				   EXT4_INODE_SIZE(inode->i_sb) - offset);
	}

	return csum;
}
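/*
 * The computed checksum is stored split across two 16-bit on-disk fields:
 * i_checksum_lo carries the low 16 bits and i_checksum_hi, when it fits in
 * the on-disk inode, carries the high 16 bits.  The verify and set helpers
 * below therefore use only the low 16 bits when i_checksum_hi is unavailable.
 */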
static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { __u32 provided, calculated; if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || !ext4_has_feature_metadata_csum(inode->i_sb)) return 1; provided = le16_to_cpu(raw->i_checksum_lo); calculated = ext4_inode_csum(inode, raw, ei); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16; else calculated &= 0xFFFF; return provided == calculated; } void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { __u32 csum; if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || !ext4_has_feature_metadata_csum(inode->i_sb)) return; csum = ext4_inode_csum(inode, raw, ei); raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) raw->i_checksum_hi = cpu_to_le16(csum >> 16); } static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { struct jbd2_inode *jinode = READ_ONCE(EXT4_I(inode)->jinode); trace_ext4_begin_ordered_truncate(inode, new_size); /* * If jinode is zero, then we never opened the file for * writing, so there's no need to call * jbd2_journal_begin_ordered_truncate() since there's no * outstanding writes we need to flush. */ if (!jinode) return 0; return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), jinode, new_size); } /* * Test whether an inode is a fast symlink. * A fast symlink has its symlink data stored in ext4_inode_info->i_data. */ int ext4_inode_is_fast_symlink(struct inode *inode) { if (!ext4_has_feature_ea_inode(inode->i_sb)) { int ea_blocks = EXT4_I(inode)->i_file_acl ? EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; if (ext4_has_inline_data(inode)) return 0; return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); } return S_ISLNK(inode->i_mode) && inode->i_size && (inode->i_size < EXT4_N_BLOCKS * 4); } /* * Called at the last iput() if i_nlink is zero. */ void ext4_evict_inode(struct inode *inode) { handle_t *handle; int err; /* * Credits for final inode cleanup and freeing: * sb + inode (ext4_orphan_del()), block bitmap, group descriptor * (xattr block freeing), bitmap, group descriptor (inode freeing) */ int extra_credits = 6; struct ext4_xattr_inode_array *ea_inode_array = NULL; bool freeze_protected = false; trace_ext4_evict_inode(inode); dax_break_layout_final(inode); if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL) ext4_evict_ea_inode(inode); if (inode->i_nlink) { /* * If there's dirty page will lead to data loss, user * could see stale data. */ if (unlikely(!ext4_emergency_state(inode->i_sb) && mapping_tagged(&inode->i_data, PAGECACHE_TAG_DIRTY))) ext4_warning_inode(inode, "data will be lost"); truncate_inode_pages_final(&inode->i_data); /* Avoid mballoc special inode which has no proper iops */ if (!EXT4_SB(inode->i_sb)->s_journal) mmb_sync(&EXT4_I(inode)->i_metadata_bhs); goto no_delete; } if (is_bad_inode(inode)) goto no_delete; dquot_initialize(inode); if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages_final(&inode->i_data); /* * For inodes with journalled data, transaction commit could have * dirtied the inode. And for inodes with dioread_nolock, unwritten * extents converting worker could merge extents and also have dirtied * the inode. 
Flush worker is ignoring it because of I_FREEING flag but * we still need to remove the inode from the writeback lists. */ inode_io_list_del(inode); /* * Protect us against freezing - iput() caller didn't have to have any * protection against it. When we are in a running transaction though, * we are already protected against freezing and we cannot grab further * protection due to lock ordering constraints. */ if (!ext4_journal_current_handle()) { sb_start_intwrite(inode->i_sb); freeze_protected = true; } if (!IS_NOQUOTA(inode)) extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb); /* * Block bitmap, group descriptor, and inode are accounted in both * ext4_blocks_for_truncate() and extra_credits. So subtract 3. */ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, ext4_blocks_for_truncate(inode) + extra_credits - 3); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* * If we're going to skip the normal cleanup, we still need to * make sure that the in-core orphan linked list is properly * cleaned up. */ ext4_orphan_del(NULL, inode); if (freeze_protected) sb_end_intwrite(inode->i_sb); goto no_delete; } if (IS_SYNC(inode)) ext4_handle_sync(handle); /* * Set inode->i_size to 0 before calling ext4_truncate(). We need * special handling of symlinks here because i_size is used to * determine whether ext4_inode_info->i_data contains symlink data or * block mappings. Setting i_size to 0 will remove its fast symlink * status. Erase i_data so that it becomes a valid empty block map. */ if (ext4_inode_is_fast_symlink(inode)) memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data)); inode->i_size = 0; err = ext4_mark_inode_dirty(handle, inode); if (err) { ext4_warning(inode->i_sb, "couldn't mark inode dirty (err %d)", err); goto stop_handle; } if (inode->i_blocks) { err = ext4_truncate(inode); if (err) { ext4_error_err(inode->i_sb, -err, "couldn't truncate inode %llu (err %d)", inode->i_ino, err); goto stop_handle; } } /* Remove xattr references. */ err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array, extra_credits); if (err) { ext4_warning(inode->i_sb, "xattr delete (err %d)", err); stop_handle: ext4_journal_stop(handle); ext4_orphan_del(NULL, inode); if (freeze_protected) sb_end_intwrite(inode->i_sb); ext4_xattr_inode_array_free(ea_inode_array); goto no_delete; } /* * Kill off the orphan record which ext4_truncate created. * AKPM: I think this can be inside the above `if'. * Note that ext4_orphan_del() has to be able to cope with the * deletion of a non-existent orphan - this is because we don't * know if ext4_truncate() actually created an orphan record. * (Well, we could do this if we need to, but heck - it works) */ ext4_orphan_del(handle, inode); EXT4_I(inode)->i_dtime = (__u32)ktime_get_real_seconds(); /* * One subtle ordering requirement: if anything has gone wrong * (transaction abort, IO errors, whatever), then we can still * do these next steps (the fs will already have been marked as * having errors), but we can't free the inode if the mark_dirty * fails. */ if (ext4_mark_inode_dirty(handle, inode)) /* If that failed, just do the required in-core inode clear. */ ext4_clear_inode(inode); else ext4_free_inode(handle, inode); ext4_journal_stop(handle); if (freeze_protected) sb_end_intwrite(inode->i_sb); ext4_xattr_inode_array_free(ea_inode_array); return; no_delete: /* * Check out some where else accidentally dirty the evicting inode, * which may probably cause inode use-after-free issues later. 
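 * The WARN_ON_ONCE() below catches an evicting inode that is still on a
 * writeback list, which would be a symptom of exactly such stray dirtying.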
*/ WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list)); if (!list_empty(&EXT4_I(inode)->i_fc_list)) ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ } #ifdef CONFIG_QUOTA qsize_t *ext4_get_reserved_space(struct inode *inode) { return &EXT4_I(inode)->i_reserved_quota; } #endif /* * Called with i_data_sem down, which is important since we can call * ext4_discard_preallocations() from here. */ void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); spin_lock(&ei->i_block_reservation_lock); trace_ext4_da_update_reserve_space(inode, used, quota_claim); if (unlikely(used > ei->i_reserved_data_blocks)) { ext4_warning(inode->i_sb, "%s: ino %llu, used %d " "with only %d reserved data blocks", __func__, inode->i_ino, used, ei->i_reserved_data_blocks); WARN_ON(1); used = ei->i_reserved_data_blocks; } /* Update per-inode reservations */ ei->i_reserved_data_blocks -= used; percpu_counter_sub(&sbi->s_dirtyclusters_counter, used); spin_unlock(&ei->i_block_reservation_lock); /* Update quota subsystem for data blocks */ if (quota_claim) dquot_claim_block(inode, EXT4_C2B(sbi, used)); else { /* * We did fallocate with an offset that is already delayed * allocated. So on delayed allocated writeback we should * not re-claim the quota for fallocated blocks. */ dquot_release_reservation_block(inode, EXT4_C2B(sbi, used)); } /* * If we have done all the pending block allocations and if * there aren't any writers on the inode, we can discard the * inode's preallocations. */ if ((ei->i_reserved_data_blocks == 0) && !inode_is_open_for_write(inode)) ext4_discard_preallocations(inode); } static int __check_block_validity(struct inode *inode, const char *func, unsigned int line, struct ext4_map_blocks *map) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; if (journal && inode == journal->j_inode) return 0; if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) { ext4_error_inode(inode, func, line, map->m_pblk, "lblock %lu mapped to illegal pblock %llu " "(length %d)", (unsigned long) map->m_lblk, map->m_pblk, map->m_len); return -EFSCORRUPTED; } return 0; } int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len) { int ret; KUNIT_STATIC_STUB_REDIRECT(ext4_issue_zeroout, inode, lblk, pblk, len); if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) return fscrypt_zeroout_range(inode, (loff_t)lblk << inode->i_blkbits, pblk << (inode->i_blkbits - SECTOR_SHIFT), (u64)len << inode->i_blkbits); ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); if (ret > 0) ret = 0; return ret; } /* * For generic regular files, when updating the extent tree, Ext4 should * hold the i_rwsem and invalidate_lock exclusively. This ensures * exclusion against concurrent page faults, as well as reads and writes. 
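 * In CONFIG_EXT4_DEBUG builds, ext4_check_map_extents_env() below warns
 * when a mapping operation runs with neither i_rwsem nor invalidate_lock
 * held for an inode that should be protected by them.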
*/ #ifdef CONFIG_EXT4_DEBUG void ext4_check_map_extents_env(struct inode *inode) { if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; if (!S_ISREG(inode->i_mode) || IS_NOQUOTA(inode) || IS_VERITY(inode) || is_special_ino(inode->i_sb, inode->i_ino) || (inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) || ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || ext4_verity_in_progress(inode)) return; WARN_ON_ONCE(!inode_is_locked(inode) && !rwsem_is_locked(&inode->i_mapping->invalidate_lock)); } #else void ext4_check_map_extents_env(struct inode *inode) {} #endif #define check_block_validity(inode, map) \ __check_block_validity((inode), __func__, __LINE__, (map)) #ifdef ES_AGGRESSIVE_TEST static void ext4_map_blocks_es_recheck(handle_t *handle, struct inode *inode, struct ext4_map_blocks *es_map, struct ext4_map_blocks *map, int flags) { int retval; map->m_flags = 0; /* * There is a race window that the result is not the same. * e.g. xfstests #223 when dioread_nolock enables. The reason * is that we lookup a block mapping in extent status tree with * out taking i_data_sem. So at the time the unwritten extent * could be converted. */ down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, 0); } else { retval = ext4_ind_map_blocks(handle, inode, map, 0); } up_read((&EXT4_I(inode)->i_data_sem)); /* * We don't check m_len because extent will be collpased in status * tree. So the m_len might not equal. */ if (es_map->m_lblk != map->m_lblk || es_map->m_flags != map->m_flags || es_map->m_pblk != map->m_pblk) { printk("ES cache assertion failed for inode: %llu " "es_cached ex [%d/%d/%llu/%x] != " "found ex [%d/%d/%llu/%x] retval %d flags %x\n", inode->i_ino, es_map->m_lblk, es_map->m_len, es_map->m_pblk, es_map->m_flags, map->m_lblk, map->m_len, map->m_pblk, map->m_flags, retval, flags); } } #endif /* ES_AGGRESSIVE_TEST */ static int ext4_map_query_blocks_next_in_leaf(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, unsigned int orig_mlen) { struct ext4_map_blocks map2; unsigned int status, status2; int retval; status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; WARN_ON_ONCE(!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF)); WARN_ON_ONCE(orig_mlen <= map->m_len); /* Prepare map2 for lookup in next leaf block */ map2.m_lblk = map->m_lblk + map->m_len; map2.m_len = orig_mlen - map->m_len; map2.m_flags = 0; retval = ext4_ext_map_blocks(handle, inode, &map2, 0); if (retval <= 0) { ext4_es_cache_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); return map->m_len; } if (unlikely(retval != map2.m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode " "%llu: retval %d != map->m_len %d", inode->i_ino, retval, map2.m_len); WARN_ON(1); } status2 = map2.m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; /* * If map2 is contiguous with map, then let's insert it as a single * extent in es cache and return the combined length of both the maps. 
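 * For example, if map covers 4 blocks ending at physical block 1003 and
 * map2 starts at physical block 1004 with the same written/unwritten
 * status, one combined extent is cached and m_len grows by map2.m_len.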
*/ if (map->m_pblk + map->m_len == map2.m_pblk && status == status2) { ext4_es_cache_extent(inode, map->m_lblk, map->m_len + map2.m_len, map->m_pblk, status); map->m_len += map2.m_len; } else { ext4_es_cache_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); } return map->m_len; } int ext4_map_query_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { unsigned int status; int retval; unsigned int orig_mlen = map->m_len; flags &= EXT4_EX_QUERY_FILTER; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) retval = ext4_ext_map_blocks(handle, inode, map, flags); else retval = ext4_ind_map_blocks(handle, inode, map, flags); if (retval < 0) return retval; /* A hole? */ if (retval == 0) goto out; if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode " "%llu: retval %d != map->m_len %d", inode->i_ino, retval, map->m_len); WARN_ON(1); } /* * No need to query next in leaf: * - if returned extent is not last in leaf or * - if the last in leaf is the full requested range */ if (!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF) || map->m_len == orig_mlen) { status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_cache_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); } else { retval = ext4_map_query_blocks_next_in_leaf(handle, inode, map, orig_mlen); } out: map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq); return retval; } int ext4_map_create_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { unsigned int status; int err, retval = 0; /* * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE * indicates that the blocks and quotas has already been * checked when the data was copied into the page cache. */ if (map->m_flags & EXT4_MAP_DELAYED) flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; /* * Here we clear m_flags because after allocating an new extent, * it will be set again. */ map->m_flags &= ~EXT4_MAP_FLAGS; /* * We need to check for EXT4 here because migrate could have * changed the inode type in between. */ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, flags); } else { retval = ext4_ind_map_blocks(handle, inode, map, flags); /* * We allocated new blocks which will result in i_data's * format changing. Force the migrate to fail by clearing * migrate flags. */ if (retval > 0 && map->m_flags & EXT4_MAP_NEW) ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); } if (retval <= 0) return retval; if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode %llu: " "retval %d != map->m_len %d", inode->i_ino, retval, map->m_len); WARN_ON(1); } /* * We have to zeroout blocks before inserting them into extent * status tree. Otherwise someone could look them up there and * use them before they are really zeroed. We also have to * unmap metadata before zeroing as otherwise writeback can * overwrite zeros with stale data from block device. */ if (flags & EXT4_GET_BLOCKS_ZERO && map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { err = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, map->m_len); if (err) return err; } status = map->m_flags & EXT4_MAP_UNWRITTEN ? 
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status, flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE); map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq); return retval; } /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. * * Otherwise it takes the write lock of the i_data_sem and allocate blocks * and store the allocated blocks in the result buffer head and mark it * mapped. * * If file type is extents based, it will call ext4_ext_map_blocks(), * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping * based files * * On success, it returns the number of blocks being mapped or allocated. * If flags doesn't contain EXT4_GET_BLOCKS_CREATE the blocks are * pre-allocated and unwritten, the resulting @map is marked as unwritten. * If the flags contain EXT4_GET_BLOCKS_CREATE, it will mark @map as mapped. * * It returns 0 if plain look up failed (blocks have not been allocated), in * that case, @map is returned as unmapped but we still do fill map->m_len to * indicate the length of a hole starting at map->m_lblk. * * It returns the error in case of allocation failure. */ int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct extent_status es; int retval; int ret = 0; unsigned int orig_mlen = map->m_len; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; memcpy(&orig_map, map, sizeof(*map)); #endif map->m_flags = 0; ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n", flags, map->m_len, (unsigned long) map->m_lblk); /* * ext4_map_blocks returns an int, and m_len is an unsigned int */ if (unlikely(map->m_len > INT_MAX)) map->m_len = INT_MAX; /* We can handle the block number less than EXT_MAX_BLOCKS */ if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) return -EFSCORRUPTED; /* * Callers from the context of data submission are the only exceptions * for regular files that do not hold the i_rwsem or invalidate_lock. * However, caching unrelated ranges is not permitted. */ if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) WARN_ON_ONCE(!(flags & EXT4_EX_NOCACHE)); else ext4_check_map_extents_env(inode); /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) { if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; map->m_flags |= ext4_es_is_written(&es) ? EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN; retval = es.es_len - (map->m_lblk - es.es_lblk); if (retval > map->m_len) retval = map->m_len; map->m_len = retval; } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { map->m_pblk = 0; map->m_flags |= ext4_es_is_delayed(&es) ? EXT4_MAP_DELAYED : 0; retval = es.es_len - (map->m_lblk - es.es_lblk); if (retval > map->m_len) retval = map->m_len; map->m_len = retval; retval = 0; } else { BUG(); } if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT) return retval; #ifdef ES_AGGRESSIVE_TEST ext4_map_blocks_es_recheck(handle, inode, map, &orig_map, flags); #endif if (!(flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) || orig_mlen == map->m_len) goto found; map->m_len = orig_mlen; } /* * In the query cache no-wait mode, nothing we can do more if we * cannot find extent in the cache. */ if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT) return 0; /* * Try to see if we can get the block without requesting a new * file system block. 
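 * This lookup only needs i_data_sem held for reading; if an allocation
 * turns out to be required, i_data_sem is retaken for writing further
 * below before ext4_map_create_blocks() is called.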
*/ down_read(&EXT4_I(inode)->i_data_sem); retval = ext4_map_query_blocks(handle, inode, map, flags); up_read((&EXT4_I(inode)->i_data_sem)); found: if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); if (ret != 0) return ret; } /* If it is only a block(s) look up */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) return retval; /* * Returns if the blocks have already allocated * * Note that if blocks have been preallocated * ext4_ext_map_blocks() returns with buffer head unmapped */ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) /* * If we need to convert extent to unwritten * we continue and do the actual work in * ext4_ext_map_blocks() */ if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) return retval; ext4_fc_track_inode(handle, inode); /* * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take * the write lock of i_data_sem, and call get_block() * with create == 1 flag. */ down_write(&EXT4_I(inode)->i_data_sem); retval = ext4_map_create_blocks(handle, inode, map, flags); up_write((&EXT4_I(inode)->i_data_sem)); if (retval < 0) ext_debug(inode, "failed with err %d\n", retval); if (retval <= 0) return retval; if (map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); if (ret != 0) return ret; /* * Inodes with freshly allocated blocks where contents will be * visible after transaction commit must be on transaction's * ordered data list. */ if (map->m_flags & EXT4_MAP_NEW && !(map->m_flags & EXT4_MAP_UNWRITTEN) && !(flags & EXT4_GET_BLOCKS_ZERO) && !ext4_is_quota_file(inode) && ext4_should_order_data(inode)) { loff_t start_byte = EXT4_LBLK_TO_B(inode, map->m_lblk); loff_t length = EXT4_LBLK_TO_B(inode, map->m_len); if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) ret = ext4_jbd2_inode_add_wait(handle, inode, start_byte, length); else ret = ext4_jbd2_inode_add_write(handle, inode, start_byte, length); if (ret) return ret; } } ext4_fc_track_range(handle, inode, map->m_lblk, map->m_lblk + map->m_len - 1); return retval; } /* * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages * we have to be careful as someone else may be manipulating b_state as well. */ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) { unsigned long old_state; unsigned long new_state; flags &= EXT4_MAP_FLAGS; /* Dummy buffer_head? Set non-atomically. */ if (!bh->b_folio) { bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags; return; } /* * Someone else may be modifying b_state. Be careful! This is ugly but * once we get rid of using bh as a container for mapping information * to pass to / from get_block functions, this can go away. */ old_state = READ_ONCE(bh->b_state); do { new_state = (old_state & ~EXT4_MAP_FLAGS) | flags; } while (unlikely(!try_cmpxchg(&bh->b_state, &old_state, new_state))); } /* * Make sure that the current journal transaction has enough credits to map * one extent. Return -EAGAIN if it cannot extend the current running * transaction. */ static inline int ext4_journal_ensure_extent_credits(handle_t *handle, struct inode *inode) { int credits; int ret; /* Called from ext4_da_write_begin() which has no handle started? */ if (!handle) return 0; credits = ext4_chunk_trans_blocks(inode, 1); ret = __ext4_journal_ensure_credits(handle, credits, credits, 0); return ret <= 0 ? 
ret : -EAGAIN; } static int _ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int flags) { struct ext4_map_blocks map; int ret = 0; if (ext4_has_inline_data(inode)) return -ERANGE; map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map, flags); if (ret > 0) { map_bh(bh, inode->i_sb, map.m_pblk); ext4_update_bh_state(bh, map.m_flags); bh->b_size = inode->i_sb->s_blocksize * map.m_len; ret = 0; } else if (ret == 0) { /* hole case, need to fill in bh->b_size */ bh->b_size = inode->i_sb->s_blocksize * map.m_len; } return ret; } int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { return _ext4_get_block(inode, iblock, bh, create ? EXT4_GET_BLOCKS_CREATE : 0); } /* * Get block function used when preparing for buffered write if we require * creating an unwritten extent if blocks haven't been allocated. The extent * will be converted to written after the IO is complete. */ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int ret = 0; ext4_debug("ext4_get_block_unwritten: inode %llu, create flag %d\n", inode->i_ino, create); ret = _ext4_get_block(inode, iblock, bh_result, EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); /* * If the buffer is marked unwritten, mark it as new to make sure it is * zeroed out correctly in case of partial writes. Otherwise, there is * a chance of stale data getting exposed. */ if (ret == 0 && buffer_unwritten(bh_result)) set_buffer_new(bh_result); return ret; } /* Maximum number of blocks we map for direct IO at once. */ #define DIO_MAX_BLOCKS 4096 /* * `handle' can be NULL if create is zero */ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, ext4_lblk_t block, int map_flags) { struct ext4_map_blocks map; struct buffer_head *bh; int create = map_flags & EXT4_GET_BLOCKS_CREATE; bool nowait = map_flags & EXT4_GET_BLOCKS_CACHED_NOWAIT; int err; ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) || handle != NULL || create == 0); ASSERT(create == 0 || !nowait); map.m_lblk = block; map.m_len = 1; err = ext4_map_blocks(handle, inode, &map, map_flags); if (err == 0) return create ? ERR_PTR(-ENOSPC) : NULL; if (err < 0) return ERR_PTR(err); if (nowait) return sb_find_get_block(inode->i_sb, map.m_pblk); /* * Since bh could introduce extra ref count such as referred by * journal_head etc. Try to avoid using __GFP_MOVABLE here * as it may fail the migration when journal_head remains. */ bh = getblk_unmovable(inode->i_sb->s_bdev, map.m_pblk, inode->i_sb->s_blocksize); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); if (map.m_flags & EXT4_MAP_NEW) { ASSERT(create != 0); ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) || (handle != NULL)); /* * Now that we do not always journal data, we should * keep in mind whether this should always journal the * new buffer as metadata. For now, regular file * writes use ext4_get_block instead, so it's not a * problem. 
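 * Here the freshly allocated buffer is handled as metadata: we take
 * journal create access, zero it if it was not uptodate, and then mark
 * it dirty through ext4_handle_dirty_metadata().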
*/ lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); err = ext4_journal_get_create_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (unlikely(err)) { unlock_buffer(bh); goto errout; } if (!buffer_uptodate(bh)) { memset(bh->b_data, 0, inode->i_sb->s_blocksize); set_buffer_uptodate(bh); } unlock_buffer(bh); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, inode, bh); if (unlikely(err)) goto errout; } else BUFFER_TRACE(bh, "not a new buffer"); return bh; errout: brelse(bh); return ERR_PTR(err); } struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, ext4_lblk_t block, int map_flags) { struct buffer_head *bh; int ret; bh = ext4_getblk(handle, inode, block, map_flags); if (IS_ERR(bh)) return bh; if (!bh || ext4_buffer_uptodate(bh)) return bh; ret = ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, true); if (ret) { put_bh(bh); return ERR_PTR(ret); } return bh; } /* Read a contiguous batch of blocks. */ int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, bool wait, struct buffer_head **bhs) { int i, err; for (i = 0; i < bh_count; i++) { bhs[i] = ext4_getblk(NULL, inode, block + i, 0 /* map_flags */); if (IS_ERR(bhs[i])) { err = PTR_ERR(bhs[i]); bh_count = i; goto out_brelse; } } for (i = 0; i < bh_count; i++) /* Note that NULL bhs[i] is valid because of holes. */ if (bhs[i] && !ext4_buffer_uptodate(bhs[i])) ext4_read_bh_lock(bhs[i], REQ_META | REQ_PRIO, false); if (!wait) return 0; for (i = 0; i < bh_count; i++) if (bhs[i]) wait_on_buffer(bhs[i]); for (i = 0; i < bh_count; i++) { if (bhs[i] && !buffer_uptodate(bhs[i])) { err = -EIO; goto out_brelse; } } return 0; out_brelse: for (i = 0; i < bh_count; i++) { brelse(bhs[i]); bhs[i] = NULL; } return err; } int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, struct buffer_head *head, unsigned from, unsigned to, int *partial, int (*fn)(handle_t *handle, struct inode *inode, struct buffer_head *bh)) { struct buffer_head *bh; unsigned block_start, block_end; unsigned blocksize = head->b_size; int err, ret = 0; struct buffer_head *next; for (bh = head, block_start = 0; ret == 0 && (bh != head || !block_start); block_start = block_end, bh = next) { next = bh->b_this_page; block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (partial && !buffer_uptodate(bh)) *partial = 1; continue; } err = (*fn)(handle, inode, bh); if (!ret) ret = err; } return ret; } /* * Helper for handling dirtying of journalled data. We also mark the folio as * dirty so that writeback code knows about this page (and inode) contains * dirty data. ext4_writepages() then commits appropriate transaction to * make data stable. 
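 * Note that the buffer itself is dirtied via ext4_handle_dirty_metadata(),
 * i.e. through the journal, rather than as ordinary page cache data.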
*/ static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh) { struct folio *folio = bh->b_folio; struct inode *inode = folio->mapping->host; /* only regular files have a_ops */ if (S_ISREG(inode->i_mode)) folio_mark_dirty(folio); return ext4_handle_dirty_metadata(handle, NULL, bh); } int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh) { if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; BUFFER_TRACE(bh, "get write access"); return ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); } int ext4_block_write_begin(handle_t *handle, struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block) { unsigned int from = offset_in_folio(folio, pos); unsigned to = from + len; struct inode *inode = folio->mapping->host; unsigned block_start, block_end; sector_t block; int err = 0; unsigned int blocksize = i_blocksize(inode); struct buffer_head *bh, *head, *wait[2]; int nr_wait = 0; int i; bool should_journal_data = ext4_should_journal_data(inode); BUG_ON(!folio_test_locked(folio)); BUG_ON(to > folio_size(folio)); BUG_ON(from > to); WARN_ON_ONCE(blocksize > folio_size(folio)); head = folio_buffers(folio); if (!head) head = create_empty_buffers(folio, blocksize, 0); block = EXT4_PG_TO_LBLK(inode, folio->index); for (bh = head, block_start = 0; bh != head || !block_start; block++, block_start = block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); } continue; } if (WARN_ON_ONCE(buffer_new(bh))) clear_buffer_new(bh); if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); err = ext4_journal_ensure_extent_credits(handle, inode); if (!err) err = get_block(inode, block, bh, 1); if (err) break; if (buffer_new(bh)) { /* * We may be zeroing partial buffers or all new * buffers in case of failure. Prepare JBD2 for * that. */ if (should_journal_data) do_journal_get_write_access(handle, inode, bh); if (folio_test_uptodate(folio)) { /* * Unlike __block_write_begin() we leave * dirtying of new uptodate buffers to * ->write_end() time or * folio_zero_new_buffers(). */ set_buffer_uptodate(bh); continue; } if (block_end > to || block_start < from) folio_zero_segments(folio, to, block_end, block_start, from); continue; } } if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); continue; } if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh) && (block_start < from || block_end > to)) { ext4_read_bh_lock(bh, 0, false); wait[nr_wait++] = bh; } } /* * If we issued read requests, let them complete. */ for (i = 0; i < nr_wait; i++) { wait_on_buffer(wait[i]); if (!buffer_uptodate(wait[i])) err = -EIO; } if (unlikely(err)) { if (should_journal_data) ext4_journalled_zero_new_buffers(handle, inode, folio, from, to); else folio_zero_new_buffers(folio, from, to); } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { for (i = 0; i < nr_wait; i++) { int err2; err2 = fscrypt_decrypt_pagecache_blocks(folio, blocksize, bh_offset(wait[i])); if (err2) { clear_buffer_uptodate(wait[i]); err = err2; } } } return err; } /* * To preserve ordering, it is essential that the hole instantiation and * the data write be encapsulated in a single transaction. We cannot * close off a transaction and start a new one between the ext4_get_block() * and the ext4_write_end(). So doing the jbd2_journal_start at the start of * ext4_write_begin() is the right place. 
*/ static int ext4_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; int ret, needed_blocks; handle_t *handle; int retries = 0; struct folio *folio; pgoff_t index; unsigned from, to; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) return ret; trace_ext4_write_begin(inode, pos, len); /* * Reserve one block more for addition to orphan list in case * we allocate blocks but write fails for some reason */ needed_blocks = ext4_chunk_trans_extent(inode, ext4_journal_blocks_per_folio(inode)) + 1; index = pos >> PAGE_SHIFT; if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, foliop); if (ret < 0) return ret; if (ret == 1) return 0; } /* * write_begin_get_folio() can take a long time if the * system is thrashing due to memory pressure, or if the folio * is being written back. So grab it first before we start * the transaction handle. This also allows us to allocate * the folio (if needed) without using GFP_NOFS. */ retry_grab: folio = write_begin_get_folio(iocb, mapping, index, len); if (IS_ERR(folio)) return PTR_ERR(folio); if (len > folio_next_pos(folio) - pos) len = folio_next_pos(folio) - pos; from = offset_in_folio(folio, pos); to = from + len; /* * The same as page allocation, we prealloc buffer heads before * starting the handle. */ if (!folio_buffers(folio)) create_empty_buffers(folio, inode->i_sb->s_blocksize, 0); folio_unlock(folio); retry_journal: handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { folio_put(folio); return PTR_ERR(handle); } folio_lock(folio); if (folio->mapping != mapping) { /* The folio got truncated from under us */ folio_unlock(folio); folio_put(folio); ext4_journal_stop(handle); goto retry_grab; } /* In case writeback began while the folio was unlocked */ folio_wait_stable(folio); if (ext4_should_dioread_nolock(inode)) ret = ext4_block_write_begin(handle, folio, pos, len, ext4_get_block_unwritten); else ret = ext4_block_write_begin(handle, folio, pos, len, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, to, NULL, do_journal_get_write_access); } if (ret) { bool extended = (pos + len > inode->i_size) && !ext4_verity_in_progress(inode); folio_unlock(folio); /* * ext4_block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold i_rwsem. * * Add inode to orphan list in case we crash before * truncate finishes */ if (extended && ext4_can_truncate(inode)) ext4_orphan_add(handle, inode); ext4_journal_stop(handle); if (extended) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might * still be on the orphan list; we need to * make sure the inode is removed from the * orphan list in that case. 
*/ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } if (ret == -EAGAIN || (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))) goto retry_journal; folio_put(folio); return ret; } *foliop = folio; return ret; } /* For write_end() in data=journal mode */ static int write_end_fn(handle_t *handle, struct inode *inode, struct buffer_head *bh) { int ret; if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; set_buffer_uptodate(bh); ret = ext4_dirty_journalled_data(handle, bh); clear_buffer_meta(bh); clear_buffer_prio(bh); clear_buffer_new(bh); return ret; } /* * We need to pick up the new inode size which generic_commit_write gave us * `iocb` can be NULL - eg, when called from page_symlink(). */ static int ext4_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; int ret = 0, ret2; int i_size_changed = 0; bool verity = ext4_verity_in_progress(inode); trace_ext4_write_end(inode, pos, len, copied); if (ext4_has_inline_data(inode) && ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) return ext4_write_inline_data_end(inode, pos, len, copied, folio); copied = block_write_end(pos, len, copied, folio); /* * it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. * * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree * blocks are being written past EOF, so skip the i_size update. */ if (!verity) i_size_changed = ext4_update_inode_size(inode, pos + copied); folio_unlock(folio); folio_put(folio); if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); /* * Don't mark the inode dirty under folio lock. First, it unnecessarily * makes the holding time of folio lock longer. Second, it forces lock * ordering of folio lock and transaction start for journaling * filesystems. */ if (i_size_changed) ret = ext4_mark_inode_dirty(handle, inode); if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them */ ext4_orphan_add(handle, inode); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } return ret ? ret : copied; } /* * This is a private version of folio_zero_new_buffers() which doesn't * set the buffer to be dirty, since in data=journalled mode we need * to call ext4_dirty_journalled_data() instead. 
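 * Only buffers with the new flag set are touched: the part of each such
 * buffer that overlaps [from, to) is zeroed (when the folio is not already
 * uptodate), the flag is cleared, and write_end_fn() is called so the
 * zeroed contents go through the journal.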
*/ static void ext4_journalled_zero_new_buffers(handle_t *handle, struct inode *inode, struct folio *folio, unsigned from, unsigned to) { unsigned int block_start = 0, block_end; struct buffer_head *head, *bh; bh = head = folio_buffers(folio); do { block_end = block_start + bh->b_size; if (buffer_new(bh)) { if (block_end > from && block_start < to) { if (!folio_test_uptodate(folio)) { unsigned start, size; start = max(from, block_start); size = min(to, block_end) - start; folio_zero_range(folio, start, size); } clear_buffer_new(bh); write_end_fn(handle, inode, bh); } } block_start = block_end; bh = bh->b_this_page; } while (bh != head); } static int ext4_journalled_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; int ret = 0, ret2; int partial = 0; unsigned from, to; int size_changed = 0; bool verity = ext4_verity_in_progress(inode); trace_ext4_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_SIZE - 1); to = from + len; BUG_ON(!ext4_handle_valid(handle)); if (ext4_has_inline_data(inode)) return ext4_write_inline_data_end(inode, pos, len, copied, folio); if (unlikely(copied < len) && !folio_test_uptodate(folio)) { copied = 0; ext4_journalled_zero_new_buffers(handle, inode, folio, from, to); } else { if (unlikely(copied < len)) ext4_journalled_zero_new_buffers(handle, inode, folio, from + copied, to); ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, from + copied, &partial, write_end_fn); if (!partial) folio_mark_uptodate(folio); } if (!verity) size_changed = ext4_update_inode_size(inode, pos + copied); EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; folio_unlock(folio); folio_put(folio); if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; } if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them */ ext4_orphan_add(handle, inode); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } return ret ? ret : copied; } /* * Reserve space for 'nr_resv' clusters */ static int ext4_da_reserve_space(struct inode *inode, int nr_resv) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); int ret; /* * We will charge metadata quota at writeout time; this saves * us from metadata over-estimation, though we may go over by * a small amount in the end. Here we just reserve for data. 
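 * nr_resv is in cluster units; EXT4_C2B() converts it to blocks for the
 * quota calls below, and the quota reservation is released again if
 * claiming free clusters from the filesystem fails.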
*/ ret = dquot_reserve_block(inode, EXT4_C2B(sbi, nr_resv)); if (ret) return ret; spin_lock(&ei->i_block_reservation_lock); if (ext4_claim_free_clusters(sbi, nr_resv, 0)) { spin_unlock(&ei->i_block_reservation_lock); dquot_release_reservation_block(inode, EXT4_C2B(sbi, nr_resv)); return -ENOSPC; } ei->i_reserved_data_blocks += nr_resv; trace_ext4_da_reserve_space(inode, nr_resv); spin_unlock(&ei->i_block_reservation_lock); return 0; /* success */ } void ext4_da_release_space(struct inode *inode, int to_free) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); if (!to_free) return; /* Nothing to release, exit */ spin_lock(&EXT4_I(inode)->i_block_reservation_lock); trace_ext4_da_release_space(inode, to_free); if (unlikely(to_free > ei->i_reserved_data_blocks)) { /* * if there aren't enough reserved blocks, then the * counter is messed up somewhere. Since this * function is called from invalidate page, it's * harmless to return without any action. */ ext4_warning(inode->i_sb, "ext4_da_release_space: " "ino %llu, to_free %d with only %d reserved " "data blocks", inode->i_ino, to_free, ei->i_reserved_data_blocks); WARN_ON(1); to_free = ei->i_reserved_data_blocks; } ei->i_reserved_data_blocks -= to_free; /* update fs dirty data blocks counter */ percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); } /* * Delayed allocation stuff */ struct mpage_da_data { /* These are input fields for ext4_do_writepages() */ struct inode *inode; struct writeback_control *wbc; unsigned int can_map:1; /* Can writepages call map blocks? */ /* These are internal state of ext4_do_writepages() */ loff_t start_pos; /* The start pos to write */ loff_t next_pos; /* Current pos to examine */ loff_t end_pos; /* Last pos to examine */ /* * Extent to map - this can be after start_pos because that can be * fully mapped. We somewhat abuse m_flags to store whether the extent * is delalloc or unwritten. */ struct ext4_map_blocks map; struct ext4_io_submit io_submit; /* IO submission data */ unsigned int do_map:1; unsigned int scanned_until_end:1; unsigned int journalled_more_data:1; }; static void mpage_release_unused_pages(struct mpage_da_data *mpd, bool invalidate) { unsigned nr, i; pgoff_t index, end; struct folio_batch fbatch; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; /* This is necessary when next_pos == 0. 
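* The scanned range [start_pos, next_pos) is then empty, so there is nothing to release or invalidate.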
*/ if (mpd->start_pos >= mpd->next_pos) return; mpd->scanned_until_end = 0; if (invalidate) { ext4_lblk_t start, last; start = EXT4_B_TO_LBLK(inode, mpd->start_pos); last = mpd->next_pos >> inode->i_blkbits; /* * avoid racing with extent status tree scans made by * ext4_insert_delayed_block() */ down_write(&EXT4_I(inode)->i_data_sem); ext4_es_remove_extent(inode, start, last - start); up_write(&EXT4_I(inode)->i_data_sem); } folio_batch_init(&fbatch); index = mpd->start_pos >> PAGE_SHIFT; end = mpd->next_pos >> PAGE_SHIFT; while (index < end) { nr = filemap_get_folios(mapping, &index, end - 1, &fbatch); if (nr == 0) break; for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; if (folio_pos(folio) < mpd->start_pos) continue; if (folio_next_index(folio) > end) continue; BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); if (invalidate) { if (folio_mapped(folio)) { folio_clear_dirty_for_io(folio); /* * Unmap folio from page * tables to prevent * subsequent accesses through * stale PTEs. This ensures * future accesses trigger new * page faults rather than * reusing the invalidated * folio. */ unmap_mapping_pages(folio->mapping, folio->index, folio_nr_pages(folio), false); } block_invalidate_folio(folio, 0, folio_size(folio)); folio_clear_uptodate(folio); } folio_unlock(folio); } folio_batch_release(&fbatch); } } static void ext4_print_free_blocks(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct super_block *sb = inode->i_sb; struct ext4_inode_info *ei = EXT4_I(inode); ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", EXT4_C2B(EXT4_SB(inode->i_sb), ext4_count_free_clusters(sb))); ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_freeclusters_counter))); ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_dirtyclusters_counter))); ext4_msg(sb, KERN_CRIT, "Block reservation details"); ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", ei->i_reserved_data_blocks); return; } /* * Check whether the cluster containing lblk has been allocated or has * delalloc reservation. * * Returns 0 if the cluster doesn't have either, 1 if it has delalloc * reservation, 2 if it's already been allocated, negative error code on * failure. */ static int ext4_clu_alloc_state(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; /* Has delalloc reservation? */ if (ext4_es_scan_clu(inode, &ext4_es_is_delayed, lblk)) return 1; /* Already been allocated? */ if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk)) return 2; ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk)); if (ret < 0) return ret; if (ret > 0) return 2; return 0; } /* * ext4_insert_delayed_blocks - adds a multiple delayed blocks to the extents * status tree, incrementing the reserved * cluster/block count or making pending * reservations where needed * * @inode - file containing the newly added block * @lblk - start logical block to be added * @len - length of blocks to be added * * Returns 0 on success, negative error code on failure. 
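* For bigalloc, only the two edge clusters of [lblk, lblk + len - 1] are re-checked below: resv_clu starts at the number of clusters spanned and is decremented for each edge cluster that ext4_clu_alloc_state() reports as already allocated or carrying a delalloc reservation.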
*/ static int ext4_insert_delayed_blocks(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; bool lclu_allocated = false; bool end_allocated = false; ext4_lblk_t resv_clu; ext4_lblk_t end = lblk + len - 1; /* * If the cluster containing lblk or end is shared with a delayed, * written, or unwritten extent in a bigalloc file system, it's * already been accounted for and does not need to be reserved. * A pending reservation must be made for the cluster if it's * shared with a written or unwritten extent and doesn't already * have one. Written and unwritten extents can be purged from the * extents status tree if the system is under memory pressure, so * it's necessary to examine the extent tree if a search of the * extents status tree doesn't get a match. */ if (sbi->s_cluster_ratio == 1) { ret = ext4_da_reserve_space(inode, len); if (ret != 0) /* ENOSPC */ return ret; } else { /* bigalloc */ resv_clu = EXT4_B2C(sbi, end) - EXT4_B2C(sbi, lblk) + 1; ret = ext4_clu_alloc_state(inode, lblk); if (ret < 0) return ret; if (ret > 0) { resv_clu--; lclu_allocated = (ret == 2); } if (EXT4_B2C(sbi, lblk) != EXT4_B2C(sbi, end)) { ret = ext4_clu_alloc_state(inode, end); if (ret < 0) return ret; if (ret > 0) { resv_clu--; end_allocated = (ret == 2); } } if (resv_clu) { ret = ext4_da_reserve_space(inode, resv_clu); if (ret != 0) /* ENOSPC */ return ret; } } ext4_es_insert_delayed_extent(inode, lblk, len, lclu_allocated, end_allocated); return 0; } /* * Looks up the requested blocks and sets the delalloc extent map. * First try to look up for the extent entry that contains the requested * blocks in the extent status tree without i_data_sem, then try to look * up for the ondisk extent mapping with i_data_sem in read mode, * finally hold i_data_sem in write mode, looks up again and add a * delalloc extent entry if it still couldn't find any extent. Pass out * the mapped extent through @map and return 0 on success. */ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map) { struct extent_status es; int retval; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; memcpy(&orig_map, map, sizeof(*map)); #endif map->m_flags = 0; ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len, (unsigned long) map->m_lblk); ext4_check_map_extents_env(inode); /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) { map->m_len = min_t(unsigned int, map->m_len, es.es_len - (map->m_lblk - es.es_lblk)); if (ext4_es_is_hole(&es)) goto add_delayed; found: /* * Delayed extent could be allocated by fallocate. * So we need to check it. */ if (ext4_es_is_delayed(&es)) { map->m_flags |= EXT4_MAP_DELAYED; return 0; } map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; if (ext4_es_is_written(&es)) map->m_flags |= EXT4_MAP_MAPPED; else if (ext4_es_is_unwritten(&es)) map->m_flags |= EXT4_MAP_UNWRITTEN; else BUG(); #ifdef ES_AGGRESSIVE_TEST ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); #endif return 0; } /* * Try to see if we can get the block without requesting a new * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); if (ext4_has_inline_data(inode)) retval = 0; else retval = ext4_map_query_blocks(NULL, inode, map, 0); up_read(&EXT4_I(inode)->i_data_sem); if (retval) return retval < 0 ? 
retval : 0; add_delayed: down_write(&EXT4_I(inode)->i_data_sem); /* * Page fault path (ext4_page_mkwrite does not take i_rwsem) * and fallocate path (no folio lock) can race. Make sure we * lookup the extent status tree here again while i_data_sem * is held in write mode, before inserting a new da entry in * the extent status tree. */ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) { map->m_len = min_t(unsigned int, map->m_len, es.es_len - (map->m_lblk - es.es_lblk)); if (!ext4_es_is_hole(&es)) { up_write(&EXT4_I(inode)->i_data_sem); goto found; } } else if (!ext4_has_inline_data(inode)) { retval = ext4_map_query_blocks(NULL, inode, map, 0); if (retval) { up_write(&EXT4_I(inode)->i_data_sem); return retval < 0 ? retval : 0; } } map->m_flags |= EXT4_MAP_DELAYED; retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len); if (!retval) map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq); up_write(&EXT4_I(inode)->i_data_sem); return retval; } /* * This is a special get_block_t callback which is used by * ext4_da_write_begin(). It will either return mapped block or * reserve space for a single block. * * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. * We also have b_blocknr = -1 and b_bdev initialized properly * * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev * initialized properly. */ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { struct ext4_map_blocks map; sector_t invalid_block = ~((sector_t) 0xffff); int ret = 0; BUG_ON(create == 0); BUG_ON(bh->b_size != inode->i_sb->s_blocksize); if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) invalid_block = ~0; map.m_lblk = iblock; map.m_len = 1; /* * first, we need to know whether the block is allocated already * preallocated blocks are unmapped but should treated * the same as allocated blocks. */ ret = ext4_da_map_blocks(inode, &map); if (ret < 0) return ret; if (map.m_flags & EXT4_MAP_DELAYED) { map_bh(bh, inode->i_sb, invalid_block); set_buffer_new(bh); set_buffer_delay(bh); return 0; } map_bh(bh, inode->i_sb, map.m_pblk); ext4_update_bh_state(bh, map.m_flags); if (buffer_unwritten(bh)) { /* A delayed write to unwritten bh should be marked * new and mapped. Mapped ensures that we don't do * get_block multiple times when we write to the same * offset and new ensures that we do proper zero out * for partial write. */ set_buffer_new(bh); set_buffer_mapped(bh); } return 0; } static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio) { mpd->start_pos += folio_size(folio); mpd->wbc->nr_to_write -= folio_nr_pages(folio); folio_unlock(folio); } static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) { size_t len; loff_t size; int err; WARN_ON_ONCE(folio_pos(folio) != mpd->start_pos); folio_clear_dirty_for_io(folio); /* * We have to be very careful here! Nothing protects writeback path * against i_size changes and the page can be writeably mapped into * page tables. So an application can be growing i_size and writing * data through mmap while writeback runs. folio_clear_dirty_for_io() * write-protects our page in page tables and the page cannot get * written to again until we release folio lock. So only after * folio_clear_dirty_for_io() we are safe to sample i_size for * ext4_bio_write_folio() to zero-out tail of the written page. 
We rely * on the barrier provided by folio_test_clear_dirty() in * folio_clear_dirty_for_io() to make sure i_size is really sampled only * after page tables are updated. */ size = i_size_read(mpd->inode); len = folio_size(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(mpd->inode)) len = size & (len - 1); err = ext4_bio_write_folio(&mpd->io_submit, folio, len); return err; } #define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay)) /* * mballoc gives us at most this number of blocks... * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). * The rest of mballoc seems to handle chunks up to full group size. */ #define MAX_WRITEPAGES_EXTENT_LEN 2048 /* * mpage_add_bh_to_extent - try to add bh to extent of blocks to map * * @mpd - extent of blocks * @lblk - logical number of the block in the file * @bh - buffer head we want to add to the extent * * The function is used to collect contig. blocks in the same state. If the * buffer doesn't require mapping for writeback and we haven't started the * extent of buffers to map yet, the function returns 'true' immediately - the * caller can write the buffer right away. Otherwise the function returns true * if the block has been added to the extent, false if the block couldn't be * added. */ static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, struct buffer_head *bh) { struct ext4_map_blocks *map = &mpd->map; /* Buffer that doesn't need mapping for writeback? */ if (!buffer_dirty(bh) || !buffer_mapped(bh) || (!buffer_delay(bh) && !buffer_unwritten(bh))) { /* So far no extent to map => we write the buffer right away */ if (map->m_len == 0) return true; return false; } /* First block in the extent? */ if (map->m_len == 0) { /* We cannot map unless handle is started... */ if (!mpd->do_map) return false; map->m_lblk = lblk; map->m_len = 1; map->m_flags = bh->b_state & BH_FLAGS; return true; } /* Don't go larger than mballoc is willing to allocate */ if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) return false; /* Can we merge the block to our big extent? */ if (lblk == map->m_lblk + map->m_len && (bh->b_state & BH_FLAGS) == map->m_flags) { map->m_len++; return true; } return false; } /* * mpage_process_page_bufs - submit page buffers for IO or add them to extent * * @mpd - extent of blocks for mapping * @head - the first buffer in the page * @bh - buffer we should start processing from * @lblk - logical number of the block in the file corresponding to @bh * * Walk through page buffers from @bh upto @head (exclusive) and either submit * the page for IO if all buffers in this page were mapped and there's no * accumulated extent of buffers to map or add buffers in the page to the * extent of buffers to map. The function returns 1 if the caller can continue * by processing the next page, 0 if it should stop adding buffers to the * extent to map because we cannot extend it anymore. It can also return value * < 0 in case of error during IO submission. */ static int mpage_process_page_bufs(struct mpage_da_data *mpd, struct buffer_head *head, struct buffer_head *bh, ext4_lblk_t lblk) { struct inode *inode = mpd->inode; int err; ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1) >> inode->i_blkbits; if (ext4_verity_in_progress(inode)) blocks = EXT_MAX_BLOCKS; do { BUG_ON(buffer_locked(bh)); if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) { /* Found extent to map? */ if (mpd->map.m_len) return 0; /* Buffer needs mapping and handle is not started? 
*/ if (!mpd->do_map) return 0; /* Everything mapped so far and we hit EOF */ break; } } while (lblk++, (bh = bh->b_this_page) != head); /* So far everything mapped? Submit the page for IO. */ if (mpd->map.m_len == 0) { err = mpage_submit_folio(mpd, head->b_folio); if (err < 0) return err; mpage_folio_done(mpd, head->b_folio); } if (lblk >= blocks) { mpd->scanned_until_end = 1; return 0; } return 1; } /* * mpage_process_folio - update folio buffers corresponding to changed extent * and may submit fully mapped page for IO * @mpd: description of extent to map, on return next extent to map * @folio: Contains these buffers. * @m_lblk: logical block mapping. * @m_pblk: corresponding physical mapping. * @map_bh: determines on return whether this page requires any further * mapping or not. * * Scan given folio buffers corresponding to changed extent and update buffer * state according to new extent state. * We map delalloc buffers to their physical location, clear unwritten bits. * If the given folio is not fully mapped, we update @mpd to the next extent in * the given folio that needs mapping & return @map_bh as true. */ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk, bool *map_bh) { struct buffer_head *head, *bh; ext4_io_end_t *io_end = mpd->io_submit.io_end; ext4_lblk_t lblk = *m_lblk; ext4_fsblk_t pblock = *m_pblk; int err = 0; ssize_t io_end_size = 0; struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); bh = head = folio_buffers(folio); do { if (lblk < mpd->map.m_lblk) continue; if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { /* * Buffer after end of mapped extent. * Find next buffer in the folio to map. */ mpd->map.m_len = 0; mpd->map.m_flags = 0; io_end_vec->size += io_end_size; err = mpage_process_page_bufs(mpd, head, bh, lblk); if (err > 0) err = 0; if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) { io_end_vec = ext4_alloc_io_end_vec(io_end); if (IS_ERR(io_end_vec)) { err = PTR_ERR(io_end_vec); goto out; } io_end_vec->offset = EXT4_LBLK_TO_B(mpd->inode, mpd->map.m_lblk); } *map_bh = true; goto out; } if (buffer_delay(bh)) { clear_buffer_delay(bh); bh->b_blocknr = pblock++; } clear_buffer_unwritten(bh); io_end_size += i_blocksize(mpd->inode); } while (lblk++, (bh = bh->b_this_page) != head); io_end_vec->size += io_end_size; *map_bh = false; out: *m_lblk = lblk; *m_pblk = pblock; return err; } /* * mpage_map_buffers - update buffers corresponding to changed extent and * submit fully mapped pages for IO * * @mpd - description of extent to map, on return next extent to map * * Scan buffers corresponding to changed extent (we expect corresponding pages * to be already locked) and update buffer state according to new extent state. * We map delalloc buffers to their physical location, clear unwritten bits, * and mark buffers as uninit when we perform writes to unwritten extents * and do extent conversion after IO is finished. If the last page is not fully * mapped, we update @map to the next extent in the last page that needs * mapping. Otherwise we submit the page for IO. 
*/ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) { struct folio_batch fbatch; unsigned nr, i; struct inode *inode = mpd->inode; pgoff_t start, end; ext4_lblk_t lblk; ext4_fsblk_t pblock; int err; bool map_bh = false; start = EXT4_LBLK_TO_PG(inode, mpd->map.m_lblk); end = EXT4_LBLK_TO_PG(inode, mpd->map.m_lblk + mpd->map.m_len - 1); pblock = mpd->map.m_pblk; folio_batch_init(&fbatch); while (start <= end) { nr = filemap_get_folios(inode->i_mapping, &start, end, &fbatch); if (nr == 0) break; for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; lblk = EXT4_PG_TO_LBLK(inode, folio->index); err = mpage_process_folio(mpd, folio, &lblk, &pblock, &map_bh); /* * If map_bh is true, means page may require further bh * mapping, or maybe the page was submitted for IO. * So we return to call further extent mapping. */ if (err < 0 || map_bh) goto out; /* Page fully mapped - let IO run! */ err = mpage_submit_folio(mpd, folio); if (err < 0) goto out; mpage_folio_done(mpd, folio); } folio_batch_release(&fbatch); } /* Extent fully mapped and matches with page boundary. We are done. */ mpd->map.m_len = 0; mpd->map.m_flags = 0; return 0; out: folio_batch_release(&fbatch); return err; } static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) { struct inode *inode = mpd->inode; struct ext4_map_blocks *map = &mpd->map; int get_blocks_flags; int err, dioread_nolock; /* Make sure transaction has enough credits for this extent */ err = ext4_journal_ensure_extent_credits(handle, inode); if (err < 0) return err; trace_ext4_da_write_pages_extent(inode, map); /* * Call ext4_map_blocks() to allocate any delayed allocation blocks, or * to convert an unwritten extent to be initialized (in the case * where we have written into one or more preallocated blocks). It is * possible that we're going to need more metadata blocks than * previously reserved. However we must not fail because we're in * writeback and there is nothing we can do about it so it might result * in data loss. So use reserved blocks to allocate metadata if * possible. In addition, do not cache any unrelated extents, as it * only holds the folio lock but does not hold the i_rwsem or * invalidate_lock, which could corrupt the extent status tree. */ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_GET_BLOCKS_IO_SUBMIT | EXT4_EX_NOCACHE; dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_UNWRIT_EXT; err = ext4_map_blocks(handle, inode, map, get_blocks_flags); if (err < 0) return err; if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) { if (!mpd->io_submit.io_end->handle && ext4_handle_valid(handle)) { mpd->io_submit.io_end->handle = handle->h_rsv_handle; handle->h_rsv_handle = NULL; } ext4_set_io_unwritten_flag(mpd->io_submit.io_end); } BUG_ON(map->m_len == 0); return 0; } /* * This is used to submit mapped buffers in a single folio that is not fully * mapped for various reasons, such as insufficient space or journal credits. */ static int mpage_submit_partial_folio(struct mpage_da_data *mpd) { struct inode *inode = mpd->inode; struct folio *folio; loff_t pos; int ret; folio = filemap_get_folio(inode->i_mapping, mpd->start_pos >> PAGE_SHIFT); if (IS_ERR(folio)) return PTR_ERR(folio); /* * The mapped position should be within the current processing folio * but must not be the folio start position. 
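* (If it were the folio start position, presumably none of this folio's buffers were mapped in the current pass and there would be nothing worth submitting; the WARN_ON_ONCE below treats that as a caller bug.)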
*/ pos = ((loff_t)mpd->map.m_lblk) << inode->i_blkbits; if (WARN_ON_ONCE((folio_pos(folio) == pos) || !folio_contains(folio, pos >> PAGE_SHIFT))) return -EINVAL; ret = mpage_submit_folio(mpd, folio); if (ret) goto out; /* * Update start_pos to prevent this folio from being released in * mpage_release_unused_pages(), it will be reset to the aligned folio * pos when this folio is written again in the next round. Additionally, * do not update wbc->nr_to_write here, as it will be updated once the * entire folio has finished processing. */ mpd->start_pos = pos; out: folio_unlock(folio); folio_put(folio); return ret; } /* * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length * mpd->len and submit pages underlying it for IO * * @handle - handle for journal operations * @mpd - extent to map * @give_up_on_write - we set this to true iff there is a fatal error and there * is no hope of writing the data. The caller should discard * dirty pages to avoid infinite loops. * * The function maps extent starting at mpd->lblk of length mpd->len. If it is * delayed, blocks are allocated; if it is unwritten, we may need to convert * them to initialized or split the described range from larger unwritten * extent. Note that we need not map all the described range since allocation * can return fewer blocks or the range is covered by more unwritten extents. We * cannot map more because we are limited by reserved transaction credits. On * the other hand we always make sure that the last touched page is fully * mapped so that it can be written out (and thus forward progress is * guaranteed). After mapping we submit all mapped pages for IO. */ static int mpage_map_and_submit_extent(handle_t *handle, struct mpage_da_data *mpd, bool *give_up_on_write) { struct inode *inode = mpd->inode; struct ext4_map_blocks *map = &mpd->map; int err; loff_t disksize; int progress = 0; ext4_io_end_t *io_end = mpd->io_submit.io_end; struct ext4_io_end_vec *io_end_vec; io_end_vec = ext4_alloc_io_end_vec(io_end); if (IS_ERR(io_end_vec)) return PTR_ERR(io_end_vec); io_end_vec->offset = EXT4_LBLK_TO_B(inode, map->m_lblk); do { err = mpage_map_one_extent(handle, mpd); if (err < 0) { struct super_block *sb = inode->i_sb; if (ext4_emergency_state(sb)) goto invalidate_dirty_pages; /* * Let the upper layers retry transient errors. * In the case of ENOSPC, if ext4_count_free_blocks() * is non-zero, a commit should free up blocks. */ if ((err == -ENOMEM) || (err == -EAGAIN) || (err == -ENOSPC && ext4_count_free_clusters(sb))) { /* * We may have already allocated extents for * some bhs inside the folio, issue the * corresponding data to prevent stale data. */ if (progress) { if (mpage_submit_partial_folio(mpd)) goto invalidate_dirty_pages; goto update_disksize; } return err; } ext4_msg(sb, KERN_CRIT, "Delayed block allocation failed for " "inode %llu at logical offset %llu with" " max blocks %u with error %d", inode->i_ino, (unsigned long long)map->m_lblk, (unsigned)map->m_len, -err); ext4_msg(sb, KERN_CRIT, "This should not happen!! Data will " "be lost\n"); if (err == -ENOSPC) ext4_print_free_blocks(inode); invalidate_dirty_pages: *give_up_on_write = true; return err; } progress = 1; /* * Update buffer state, submit mapped pages, and get us new * extent to map */ err = mpage_map_and_submit_buffers(mpd); if (err < 0) goto update_disksize; } while (map->m_len); update_disksize: /* * Update on-disk size after IO is submitted. Races with * truncate are avoided by checking i_size under i_data_sem.
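* mpd->start_pos has already been advanced past every folio submitted so far (see mpage_folio_done()), so it is the natural candidate for the new on-disk size; it is still clamped to the current i_size below.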
*/ disksize = mpd->start_pos; if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) { int err2; loff_t i_size; down_write(&EXT4_I(inode)->i_data_sem); i_size = i_size_read(inode); if (disksize > i_size) disksize = i_size; if (disksize > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = disksize; up_write(&EXT4_I(inode)->i_data_sem); err2 = ext4_mark_inode_dirty(handle, inode); if (err2) { ext4_error_err(inode->i_sb, -err2, "Failed to mark inode %llu dirty", inode->i_ino); } if (!err) err = err2; } return err; } static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio, size_t len) { struct buffer_head *page_bufs = folio_buffers(folio); struct inode *inode = folio->mapping->host; int ret, err; ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, NULL, do_journal_get_write_access); err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, NULL, write_end_fn); if (ret == 0) ret = err; err = ext4_jbd2_inode_add_write(handle, inode, folio_pos(folio), len); if (ret == 0) ret = err; EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; return ret; } static int mpage_journal_page_buffers(handle_t *handle, struct mpage_da_data *mpd, struct folio *folio) { struct inode *inode = mpd->inode; loff_t size = i_size_read(inode); size_t len = folio_size(folio); folio_clear_checked(folio); mpd->wbc->nr_to_write -= folio_nr_pages(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(inode)) len = size & (len - 1); return ext4_journal_folio_buffers(handle, folio, len); } /* * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages * needing mapping, submit mapped pages * * @mpd - where to look for pages * * Walk dirty pages in the mapping. If they are fully mapped, submit them for * IO immediately. If we cannot map blocks, we submit just already mapped * buffers in the page for IO and keep page dirty. When we can map blocks and * we find a page which isn't mapped we start accumulating extent of buffers * underlying these pages that needs mapping (formed by either delayed or * unwritten buffers). We also lock the pages containing these buffers. The * extent found is returned in @mpd structure (starting at mpd->lblk with * length mpd->len blocks). * * Note that this function can attach bios to one io_end structure which are * neither logically nor physically contiguous. Although it may seem as an * unnecessary complication, it is actually inevitable in blocksize < pagesize * case as we need to track IO to all buffers underlying a page in one io_end. */ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) { struct address_space *mapping = mpd->inode->i_mapping; struct folio_batch fbatch; unsigned int nr_folios; pgoff_t index = mpd->start_pos >> PAGE_SHIFT; pgoff_t end = mpd->end_pos >> PAGE_SHIFT; xa_mark_t tag; int i, err = 0; ext4_lblk_t lblk; struct buffer_head *head; handle_t *handle = NULL; int bpp = ext4_journal_blocks_per_folio(mpd->inode); tag = wbc_to_tag(mpd->wbc); mpd->map.m_len = 0; mpd->next_pos = mpd->start_pos; if (ext4_should_journal_data(mpd->inode)) { handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, bpp); if (IS_ERR(handle)) return PTR_ERR(handle); } folio_batch_init(&fbatch); while (index <= end) { nr_folios = filemap_get_folios_tag(mapping, &index, end, tag, &fbatch); if (nr_folios == 0) break; for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; /* * Accumulated enough dirty pages? This doesn't apply * to WB_SYNC_ALL mode. 
For integrity sync we have to * keep going because someone may be concurrently * dirtying pages, and we might have synced a lot of * newly appeared dirty pages, but have not synced all * of the old dirty pages. */ if (mpd->wbc->sync_mode == WB_SYNC_NONE && mpd->wbc->nr_to_write <= EXT4_LBLK_TO_PG(mpd->inode, mpd->map.m_len)) goto out; /* If we can't merge this page, we are done. */ if (mpd->map.m_len > 0 && mpd->next_pos != folio_pos(folio)) goto out; if (handle) { err = ext4_journal_ensure_credits(handle, bpp, 0); if (err < 0) goto out; } folio_lock(folio); /* * If the page is no longer dirty, or its mapping no * longer corresponds to inode we are writing (which * means it has been truncated or invalidated), or the * page is already under writeback and we are not doing * a data integrity writeback, skip the page */ if (!folio_test_dirty(folio) || (folio_test_writeback(folio) && (mpd->wbc->sync_mode == WB_SYNC_NONE)) || unlikely(folio->mapping != mapping)) { folio_unlock(folio); continue; } folio_wait_writeback(folio); BUG_ON(folio_test_writeback(folio)); /* * Should never happen but for buggy code in * other subsystems that call * set_page_dirty() without properly warning * the file system first. See [1] for more * information. * * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz */ if (!folio_buffers(folio)) { ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", folio->index); folio_clear_dirty(folio); folio_unlock(folio); continue; } if (mpd->map.m_len == 0) mpd->start_pos = folio_pos(folio); mpd->next_pos = folio_next_pos(folio); /* * Writeout when we cannot modify metadata is simple. * Just submit the page. For data=journal mode we * first handle writeout of the page for checkpoint and * only after that handle delayed page dirtying. This * makes sure current data is checkpointed to the final * location before possibly journalling it again which * is desirable when the page is frequently dirtied * through a pin. */ if (!mpd->can_map) { err = mpage_submit_folio(mpd, folio); if (err < 0) goto out; /* Pending dirtying of journalled data? */ if (folio_test_checked(folio)) { err = mpage_journal_page_buffers(handle, mpd, folio); if (err < 0) goto out; mpd->journalled_more_data = 1; } mpage_folio_done(mpd, folio); } else { /* Add all dirty buffers to mpd */ lblk = EXT4_PG_TO_LBLK(mpd->inode, folio->index); head = folio_buffers(folio); err = mpage_process_page_bufs(mpd, head, head, lblk); if (err <= 0) goto out; err = 0; } } folio_batch_release(&fbatch); cond_resched(); } mpd->scanned_until_end = 1; if (handle) ext4_journal_stop(handle); return 0; out: folio_batch_release(&fbatch); if (handle) ext4_journal_stop(handle); return err; } static int ext4_do_writepages(struct mpage_da_data *mpd) { struct writeback_control *wbc = mpd->wbc; pgoff_t writeback_index = 0; long nr_to_write = wbc->nr_to_write; int range_whole = 0; int cycled = 1; handle_t *handle = NULL; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; int needed_blocks, rsv_blocks = 0, ret = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); struct blk_plug plug; bool give_up_on_write = false; trace_ext4_writepages(inode, wbc); /* * No pages to write? 
This is mainly a kludge to avoid starting * a transaction for special inodes like journal inode on last iput() * because that could violate lock ordering on umount */ if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) goto out_writepages; /* * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that * will obscure the real source of the problem. We test * fs shutdown state instead of sb->s_flag's SB_RDONLY because * the latter could be true if the filesystem is mounted * read-only, and in that case, ext4_writepages should * *never* be called, so if that ever happens, we would want * the stack trace. */ ret = ext4_emergency_state(mapping->host->i_sb); if (unlikely(ret)) goto out_writepages; /* * If we have inline data and arrive here, it means that * we will soon create the block for the 1st page, so * we'd better clear the inline data here. */ if (ext4_has_inline_data(inode)) { /* Just inode will be modified... */ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out_writepages; } BUG_ON(ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)); ext4_destroy_inline_data(handle, inode); ext4_journal_stop(handle); } /* * data=journal mode does not do delalloc so we just need to writeout / * journal already mapped buffers. On the other hand we need to commit * transaction to make data stable. We expect all the data to be * already in the journal (the only exception are DMA pinned pages * dirtied behind our back) so we commit transaction here and run the * writeback loop to checkpoint them. The checkpointing is not actually * necessary to make data persistent *but* quite a few places (extent * shifting operations, fsverity, ...) depend on being able to drop * pagecache pages after calling filemap_write_and_wait() and for that * checkpointing needs to happen. */ if (ext4_should_journal_data(inode)) { mpd->can_map = 0; if (wbc->sync_mode == WB_SYNC_ALL) ext4_fc_commit(sbi->s_journal, EXT4_I(inode)->i_datasync_tid); } mpd->journalled_more_data = 0; if (ext4_should_dioread_nolock(inode)) { int bpf = ext4_journal_blocks_per_folio(inode); /* * We may need to convert up to one extent per block in * the folio and we may dirty the inode. */ rsv_blocks = 1 + ext4_ext_index_trans_blocks(inode, bpf); } if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; if (writeback_index) cycled = 0; mpd->start_pos = writeback_index << PAGE_SHIFT; mpd->end_pos = LLONG_MAX; } else { mpd->start_pos = wbc->range_start; mpd->end_pos = wbc->range_end; } ext4_io_submit_init(&mpd->io_submit, wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, mpd->start_pos >> PAGE_SHIFT, mpd->end_pos >> PAGE_SHIFT); blk_start_plug(&plug); /* * First writeback pages that don't need mapping - we can avoid * starting a transaction unnecessarily and also avoid being blocked * in the block layer on device congestion while having transaction * started. 
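* This first pass runs with mpd->do_map == 0: folios whose buffers are already mapped are written out immediately, and the scan stops at the first folio that needs block allocation, leaving it dirty for the mapping loop below.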
*/ mpd->do_map = 0; mpd->scanned_until_end = 0; mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); if (!mpd->io_submit.io_end) { ret = -ENOMEM; goto unplug; } ret = mpage_prepare_extent_to_map(mpd); /* Unlock pages we didn't use */ mpage_release_unused_pages(mpd, false); /* Submit prepared bio */ ext4_io_submit(&mpd->io_submit); ext4_put_io_end_defer(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; if (ret < 0) goto unplug; while (!mpd->scanned_until_end && wbc->nr_to_write > 0) { /* For each extent of pages we use new io_end */ mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); if (!mpd->io_submit.io_end) { ret = -ENOMEM; break; } WARN_ON_ONCE(!mpd->can_map); /* * We have two constraints: We find one extent to map and we * must always write out whole page (makes a difference when * blocksize < pagesize) so that we don't block on IO when we * try to write out the rest of the page. Journalled mode is * not supported by delalloc. */ BUG_ON(ext4_should_journal_data(inode)); /* * Calculate the number of credits needed to reserve for one * extent of up to MAX_WRITEPAGES_EXTENT_LEN blocks. It will * attempt to extend the transaction or start a new iteration * if the reserved credits are insufficient. */ needed_blocks = ext4_chunk_trans_blocks(inode, MAX_WRITEPAGES_EXTENT_LEN); /* start a new transaction */ handle = ext4_journal_start_with_reserve(inode, EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " "%ld pages, ino %llu; err %d", __func__, wbc->nr_to_write, inode->i_ino, ret); /* Release allocated io_end */ ext4_put_io_end(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; break; } mpd->do_map = 1; trace_ext4_da_write_folios_start(inode, mpd->start_pos, mpd->next_pos, wbc); ret = mpage_prepare_extent_to_map(mpd); if (!ret && mpd->map.m_len) ret = mpage_map_and_submit_extent(handle, mpd, &give_up_on_write); /* * Caution: If the handle is synchronous, * ext4_journal_stop() can wait for transaction commit * to finish which may depend on writeback of pages to * complete or on page lock to be released. In that * case, we have to wait until after we have * submitted all the IO, released page locks we hold, * and dropped io_end reference (for extent conversion * to be able to complete) before stopping the handle. */ if (!ext4_handle_valid(handle) || handle->h_sync == 0) { ext4_journal_stop(handle); handle = NULL; mpd->do_map = 0; } /* Unlock pages we didn't use */ mpage_release_unused_pages(mpd, give_up_on_write); /* Submit prepared bio */ ext4_io_submit(&mpd->io_submit); /* * Drop our io_end reference we got from init. We have * to be careful and use deferred io_end finishing if * we are still holding the transaction as we can * release the last reference to io_end which may end * up doing unwritten extent conversion. */ if (handle) { ext4_put_io_end_defer(mpd->io_submit.io_end); ext4_journal_stop(handle); } else ext4_put_io_end(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; trace_ext4_da_write_folios_end(inode, mpd->start_pos, mpd->next_pos, wbc, ret); if (ret == -ENOSPC && sbi->s_journal) { /* * Commit the transaction which would * free blocks released in the transaction * and try again */ jbd2_journal_force_commit_nested(sbi->s_journal); ret = 0; continue; } if (ret == -EAGAIN) ret = 0; /* Fatal error - ENOMEM, EIO... 
*/ if (ret) break; } unplug: blk_finish_plug(&plug); if (!ret && !cycled && wbc->nr_to_write > 0) { cycled = 1; mpd->end_pos = (writeback_index << PAGE_SHIFT) - 1; mpd->start_pos = 0; goto retry; } /* Update index */ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) /* * Set the writeback_index so that range_cyclic * mode will write it back later */ mapping->writeback_index = mpd->start_pos >> PAGE_SHIFT; out_writepages: trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); return ret; } static int ext4_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct super_block *sb = mapping->host->i_sb; struct mpage_da_data mpd = { .inode = mapping->host, .wbc = wbc, .can_map = 1, }; int ret; int alloc_ctx; ret = ext4_emergency_state(sb); if (unlikely(ret)) return ret; alloc_ctx = ext4_writepages_down_read(sb); ret = ext4_do_writepages(&mpd); /* * For data=journal writeback we could have come across pages marked * for delayed dirtying (PageChecked) which were just added to the * running transaction. Try once more to get them to stable storage. */ if (!ret && mpd.journalled_more_data) ret = ext4_do_writepages(&mpd); ext4_writepages_up_read(sb, alloc_ctx); return ret; } int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode) { loff_t range_start, range_end; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, }; struct mpage_da_data mpd = { .inode = jinode->i_vfs_inode, .wbc = &wbc, .can_map = 0, }; if (!jbd2_jinode_get_dirty_range(jinode, &range_start, &range_end)) return 0; wbc.range_start = range_start; wbc.range_end = range_end; return ext4_do_writepages(&mpd); } static int ext4_dax_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; long nr_to_write = wbc->nr_to_write; struct inode *inode = mapping->host; int alloc_ctx; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) return ret; alloc_ctx = ext4_writepages_down_read(inode->i_sb); trace_ext4_writepages(inode, wbc); ret = dax_writeback_mapping_range(mapping, EXT4_SB(inode->i_sb)->s_daxdev, wbc); trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); ext4_writepages_up_read(inode->i_sb, alloc_ctx); return ret; } static int ext4_nonda_switch(struct super_block *sb) { s64 free_clusters, dirty_clusters; struct ext4_sb_info *sbi = EXT4_SB(sb); /* * switch to non delalloc mode if we are running low * on free block. The free block accounting via percpu * counters can get slightly wrong with percpu_counter_batch getting * accumulated on each CPU without updating global counters * Delalloc need an accurate free block accounting. So switch * to non delalloc when we are near to error range. */ free_clusters = percpu_counter_read_positive(&sbi->s_freeclusters_counter); dirty_clusters = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); /* * Start pushing delalloc when 1/2 of free blocks are dirty. 
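* E.g. holding free_clusters at 100: background writeback is kicked once dirty_clusters exceeds 50 (free < 2 * dirty), and the switch to nodelalloc happens once dirty_clusters reaches roughly 67 (2 * free < 3 * dirty) or free falls below dirty + EXT4_FREECLUSTERS_WATERMARK.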
*/ if (dirty_clusters && (free_clusters < 2 * dirty_clusters)) try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); if (2 * free_clusters < 3 * dirty_clusters || free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) { /* * free block count is less than 150% of dirty blocks * or free blocks is less than watermark */ return 1; } return 0; } static int ext4_da_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { int ret, retries = 0; struct folio *folio; pgoff_t index; struct inode *inode = mapping->host; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) return ret; index = pos >> PAGE_SHIFT; if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(iocb, mapping, pos, len, foliop, fsdata); } *fsdata = (void *)0; trace_ext4_da_write_begin(inode, pos, len); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_generic_write_inline_data(mapping, inode, pos, len, foliop, fsdata, true); if (ret < 0) return ret; if (ret == 1) return 0; } retry: folio = write_begin_get_folio(iocb, mapping, index, len); if (IS_ERR(folio)) return PTR_ERR(folio); if (len > folio_next_pos(folio) - pos) len = folio_next_pos(folio) - pos; ret = ext4_block_write_begin(NULL, folio, pos, len, ext4_da_get_block_prep); if (ret < 0) { folio_unlock(folio); folio_put(folio); /* * ext4_block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold inode lock. */ if (pos + len > inode->i_size) ext4_truncate_failed_write(inode); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; return ret; } *foliop = folio; return ret; } /* * Check if we should update i_disksize * when write to the end of file but not require block allocation */ static int ext4_da_should_update_i_disksize(struct folio *folio, unsigned long offset) { struct buffer_head *bh; struct inode *inode = folio->mapping->host; unsigned int idx; int i; bh = folio_buffers(folio); idx = offset >> inode->i_blkbits; for (i = 0; i < idx; i++) bh = bh->b_this_page; if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) return 0; return 1; } static int ext4_da_do_write_end(struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio) { struct inode *inode = mapping->host; loff_t old_size = inode->i_size; bool disksize_changed = false; loff_t new_i_size; handle_t *handle; if (unlikely(!folio_buffers(folio))) { folio_unlock(folio); folio_put(folio); return -EIO; } /* * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES * flag, which all that's needed to trigger page writeback. */ copied = block_write_end(pos, len, copied, folio); new_i_size = pos + copied; /* * It's important to update i_size while still holding folio lock, * because folio writeout could otherwise come in and zero beyond * i_size. * * Since we are holding inode lock, we are sure i_disksize <= * i_size. We also know that if i_disksize < i_size, there are * delalloc writes pending in the range up to i_size. If the end of * the current write is <= i_size, there's no need to touch * i_disksize since writeback will push i_disksize up to i_size * eventually. 
If the end of the current write is > i_size and * inside an allocated block which ext4_da_should_update_i_disksize() * checked, we need to update i_disksize here as certain * ext4_writepages() paths not allocating blocks and update i_disksize. */ if (new_i_size > inode->i_size) { unsigned long end; i_size_write(inode, new_i_size); end = offset_in_folio(folio, new_i_size - 1); if (copied && ext4_da_should_update_i_disksize(folio, end)) { ext4_update_i_disksize(inode, new_i_size); disksize_changed = true; } } folio_unlock(folio); folio_put(folio); if (pos > old_size) pagecache_isize_extended(inode, old_size, pos); if (!disksize_changed) return copied; handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); return copied; } static int ext4_da_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int write_mode = (int)(unsigned long)fsdata; if (write_mode == FALL_BACK_TO_NONDELALLOC) return ext4_write_end(iocb, mapping, pos, len, copied, folio, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); if (write_mode != CONVERT_INLINE_DATA && ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && ext4_has_inline_data(inode)) return ext4_write_inline_data_end(inode, pos, len, copied, folio); if (unlikely(copied < len) && !folio_test_uptodate(folio)) copied = 0; return ext4_da_do_write_end(mapping, pos, len, copied, folio); } /* * Force all delayed allocation blocks to be allocated for a given inode. */ int ext4_alloc_da_blocks(struct inode *inode) { trace_ext4_alloc_da_blocks(inode); if (!EXT4_I(inode)->i_reserved_data_blocks) return 0; /* * We do something simple for now. The filemap_flush() will * also start triggering a write of the data blocks, which is * not strictly speaking necessary. However, to do otherwise * would require replicating code paths in: * * ext4_writepages() -> * write_cache_pages() ---> (via passed in callback function) * __mpage_da_writepage() --> * mpage_add_bh_to_extent() * mpage_da_map_blocks() * * The problem is that write_cache_pages(), located in * mm/page-writeback.c, marks pages clean in preparation for * doing I/O, which is not desirable if we're not planning on * doing I/O at all. * * We could call write_cache_pages(), and then redirty all of * the pages by calling redirty_page_for_writepage() but that * would be ugly in the extreme. So instead we would need to * replicate parts of the code in the above functions, * simplifying them because we wouldn't actually intend to * write out the pages, but rather only collect contiguous * logical block extents, call the multi-block allocator, and * then update the buffer heads with the block allocations. * * For now, though, we'll cheat by calling filemap_flush(), * which will map the blocks, and start the I/O, but not * actually wait for the I/O to complete. */ return filemap_flush(inode->i_mapping); } /* * bmap() is special. It gets used by applications such as lilo and by * the swapper to find the on-disk block of a specific piece of data. * * Naturally, this is dangerous if the block concerned is still in the * journal. 
If somebody makes a swapfile on an ext4 data-journaling * filesystem and enables swap, then they may get a nasty shock when the * data getting swapped to that swapfile suddenly gets overwritten by * the original zeros written out previously to the journal and * awaiting writeback in the kernel's buffer cache. * * So, if we see any bmap calls here on a modified, data-journaled file, * take extra steps to flush any blocks which might be in the cache. */ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; sector_t ret = 0; inode_lock_shared(inode); /* * We can get here for an inline file via the FIBMAP ioctl */ if (ext4_has_inline_data(inode)) goto out; if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && (test_opt(inode->i_sb, DELALLOC) || ext4_should_journal_data(inode))) { /* * With delalloc or journalled data we want to sync the file so * that we can make sure we allocate blocks for the file and data * is in place for the user to see it */ filemap_write_and_wait(mapping); } ret = iomap_bmap(mapping, block, &ext4_iomap_ops); out: inode_unlock_shared(inode); return ret; } static void ext4_invalidate_folio(struct folio *folio, size_t offset, size_t length) { trace_ext4_invalidate_folio(folio, offset, length); /* No journalling happens on data buffers when this function is used */ WARN_ON(folio_buffers(folio) && buffer_jbd(folio_buffers(folio))); block_invalidate_folio(folio, offset, length); } static int __ext4_journalled_invalidate_folio(struct folio *folio, size_t offset, size_t length) { journal_t *journal = EXT4_JOURNAL(folio->mapping->host); trace_ext4_journalled_invalidate_folio(folio, offset, length); /* * If it's a full truncate we just forget about the pending dirtying */ if (offset == 0 && length == folio_size(folio)) folio_clear_checked(folio); return jbd2_journal_invalidate_folio(journal, folio, offset, length); } /* Wrapper for aops... */ static void ext4_journalled_invalidate_folio(struct folio *folio, size_t offset, size_t length) { WARN_ON(__ext4_journalled_invalidate_folio(folio, offset, length) < 0); } static bool ext4_release_folio(struct folio *folio, gfp_t wait) { struct inode *inode = folio->mapping->host; journal_t *journal = EXT4_JOURNAL(inode); trace_ext4_release_folio(inode, folio); /* Page has dirty journalled data -> cannot release */ if (folio_test_checked(folio)) return false; if (journal) return jbd2_journal_try_to_free_buffers(journal, folio); else return try_to_free_buffers(folio); } static bool ext4_inode_datasync_dirty(struct inode *inode) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; if (journal) { if (jbd2_transaction_committed(journal, EXT4_I(inode)->i_datasync_tid)) return false; if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT)) return !list_empty(&EXT4_I(inode)->i_fc_list); return true; } /* Any metadata buffers to write? */ if (mmb_has_buffers(&EXT4_I(inode)->i_metadata_bhs)) return true; return inode_state_read_once(inode) & I_DIRTY_DATASYNC; } static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, struct ext4_map_blocks *map, loff_t offset, loff_t length, unsigned int flags) { u8 blkbits = inode->i_blkbits; /* * Writes that span EOF might trigger an I/O size update on completion, * so consider them to be dirty for the purpose of O_DSYNC, even if * there are no other metadata changes being made or pending.
*/ iomap->flags = 0; if (ext4_inode_datasync_dirty(inode) || offset + length > i_size_read(inode)) iomap->flags |= IOMAP_F_DIRTY; if (map->m_flags & EXT4_MAP_NEW) iomap->flags |= IOMAP_F_NEW; /* HW-offload atomics are always used */ if (flags & IOMAP_ATOMIC) iomap->flags |= IOMAP_F_ATOMIC_BIO; if (flags & IOMAP_DAX) iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; else iomap->bdev = inode->i_sb->s_bdev; iomap->offset = EXT4_LBLK_TO_B(inode, map->m_lblk); iomap->length = EXT4_LBLK_TO_B(inode, map->m_len); if ((map->m_flags & EXT4_MAP_MAPPED) && !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) iomap->flags |= IOMAP_F_MERGED; /* * Flags passed to ext4_map_blocks() for direct I/O writes can result * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits * set. In order for any allocated unwritten extents to be converted * into written extents correctly within the ->end_io() handler, we * need to ensure that the iomap->type is set appropriately. Hence, the * reason why we need to check whether the EXT4_MAP_UNWRITTEN bit has * been set first. */ if (map->m_flags & EXT4_MAP_UNWRITTEN) { iomap->type = IOMAP_UNWRITTEN; iomap->addr = (u64) map->m_pblk << blkbits; if (flags & IOMAP_DAX) iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; } else if (map->m_flags & EXT4_MAP_MAPPED) { iomap->type = IOMAP_MAPPED; iomap->addr = (u64) map->m_pblk << blkbits; if (flags & IOMAP_DAX) iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; } else if (map->m_flags & EXT4_MAP_DELAYED) { iomap->type = IOMAP_DELALLOC; iomap->addr = IOMAP_NULL_ADDR; } else { iomap->type = IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; } } static int ext4_map_blocks_atomic_write_slow(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map) { ext4_lblk_t m_lblk = map->m_lblk; unsigned int m_len = map->m_len; unsigned int mapped_len = 0, m_flags = 0; ext4_fsblk_t next_pblk = 0; bool check_next_pblk = false; int ret = 0; WARN_ON_ONCE(!ext4_has_feature_bigalloc(inode->i_sb)); /* * This is a slow path in case of mixed mapping. We use * EXT4_GET_BLOCKS_CREATE_ZERO flag here to make sure we get a single * contiguous mapped mapping. This will ensure any unwritten or hole * regions within the requested range is zeroed out and we return * a single contiguous mapped extent. */ m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; do { ret = ext4_map_blocks(handle, inode, map, m_flags); if (ret < 0 && ret != -ENOSPC) goto out_err; /* * This should never happen, but let's return an error code to * avoid an infinite loop in here. */ if (ret == 0) { ret = -EFSCORRUPTED; ext4_warning_inode(inode, "ext4_map_blocks() couldn't allocate blocks m_flags: 0x%x, ret:%d", m_flags, ret); goto out_err; } /* * With bigalloc we should never get ENOSPC nor discontiguous * physical extents. */ if ((check_next_pblk && next_pblk != map->m_pblk) || ret == -ENOSPC) { ext4_warning_inode(inode, "Non-contiguous allocation detected: expected %llu, got %llu, " "or ext4_map_blocks() returned out of space ret: %d", next_pblk, map->m_pblk, ret); ret = -EFSCORRUPTED; goto out_err; } next_pblk = map->m_pblk + map->m_len; check_next_pblk = true; mapped_len += map->m_len; map->m_lblk += map->m_len; map->m_len = m_len - mapped_len; } while (mapped_len < m_len); /* * We might have done some work in above loop, so we need to query the * start of the physical extent, based on the origin m_lblk and m_len. * Let's also ensure we were able to allocate the required range for * mixed mapping case. 
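* The query below must report exactly m_len mapped blocks; anything shorter means the range is still not one contiguous extent, and the write is failed with -EINVAL rather than risking a torn atomic write.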
*/ map->m_lblk = m_lblk; map->m_len = m_len; map->m_flags = 0; ret = ext4_map_blocks(handle, inode, map, EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF); if (ret != m_len) { ext4_warning_inode(inode, "allocation failed for atomic write request m_lblk:%u, m_len:%u, ret:%d\n", m_lblk, m_len, ret); ret = -EINVAL; } return ret; out_err: /* reset map before returning an error */ map->m_lblk = m_lblk; map->m_len = m_len; map->m_flags = 0; return ret; } /* * ext4_map_blocks_atomic: Helper routine to ensure the entire requested * range in @map [lblk, lblk + len) is one single contiguous extent with no * mixed mappings. * * We first use m_flags passed to us by our caller (ext4_iomap_alloc()). * We only call EXT4_GET_BLOCKS_ZERO in the slow path, when the underlying * physical extent for the requested range does not have a single contiguous * mapping type i.e. (Hole, Mapped, or Unwritten) throughout. * In that case we will loop over the requested range to allocate and zero out * the unwritten / holes in between, to get a single mapped extent from * [m_lblk, m_lblk + m_len). Note that this is only possible because we know * this can be called only with bigalloc enabled filesystem where the underlying * cluster is already allocated. This avoids allocating discontiguous extents * in the slow path due to multiple calls to ext4_map_blocks(). * The slow path is mostly non-performance critical path, so it should be ok to * loop using ext4_map_blocks() with appropriate flags to allocate & zero the * underlying short holes/unwritten extents within the requested range. */ static int ext4_map_blocks_atomic_write(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int m_flags, bool *force_commit) { ext4_lblk_t m_lblk = map->m_lblk; unsigned int m_len = map->m_len; int ret = 0; WARN_ON_ONCE(m_len > 1 && !ext4_has_feature_bigalloc(inode->i_sb)); ret = ext4_map_blocks(handle, inode, map, m_flags); if (ret < 0 || ret == m_len) goto out; /* * This is a mixed mapping case where we were not able to allocate * a single contiguous extent. In that case let's reset requested * mapping and call the slow path. */ map->m_lblk = m_lblk; map->m_len = m_len; map->m_flags = 0; /* * slow path means we have mixed mapping, that means we will need * to force txn commit. */ *force_commit = true; return ext4_map_blocks_atomic_write_slow(handle, inode, map); out: return ret; } static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, unsigned int flags) { handle_t *handle; int ret, dio_credits, m_flags = 0, retries = 0; bool force_commit = false; /* * Trim the mapping request to the maximum value that we can map at * once for direct I/O. */ if (map->m_len > DIO_MAX_BLOCKS) map->m_len = DIO_MAX_BLOCKS; /* * journal credits estimation for atomic writes. We call * ext4_map_blocks(), to find if there could be a mixed mapping. If yes, * then let's assume the no. of pextents required can be m_len i.e. * every alternate block can be unwritten and hole. */ if (flags & IOMAP_ATOMIC) { unsigned int orig_mlen = map->m_len; ret = ext4_map_blocks(NULL, inode, map, 0); if (ret < 0) return ret; if (map->m_len < orig_mlen) { map->m_len = orig_mlen; dio_credits = ext4_meta_trans_blocks(inode, orig_mlen, map->m_len); } else { dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); } } else { dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); } retry: /* * Either we allocate blocks and then don't get an unwritten extent, so * in that case we have reserved enough credits. 
Or, the blocks are * already allocated and unwritten. In that case, the extent conversion * fits into the credits as well. */ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); if (IS_ERR(handle)) return PTR_ERR(handle); /* * DAX and direct I/O are the only two operations that are currently * supported with IOMAP_WRITE. */ WARN_ON(!(flags & (IOMAP_DAX | IOMAP_DIRECT))); if (flags & IOMAP_DAX) m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; /* * We use i_size instead of i_disksize here because delalloc writeback * can complete at any point during the I/O and subsequently push the * i_disksize out to i_size. This could be beyond where direct I/O is * happening and thus expose allocated blocks to direct I/O reads. */ else if (EXT4_LBLK_TO_B(inode, map->m_lblk) >= i_size_read(inode)) m_flags = EXT4_GET_BLOCKS_CREATE; else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) m_flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; if (flags & IOMAP_ATOMIC) ret = ext4_map_blocks_atomic_write(handle, inode, map, m_flags, &force_commit); else ret = ext4_map_blocks(handle, inode, map, m_flags); /* * We cannot fill holes in indirect tree based inodes as that could * expose stale data in the case of a crash. Use the magic error code * to fallback to buffered I/O. */ if (!m_flags && !ret) ret = -ENOTBLK; ext4_journal_stop(handle); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; /* * Force commit the current transaction if the allocation spans a mixed * mapping range. This ensures any pending metadata updates (like * unwritten to written extents conversion) in this range are in * consistent state with the file data blocks, before performing the * actual write I/O. If the commit fails, the whole I/O must be aborted * to prevent any possible torn writes. */ if (ret > 0 && force_commit) { int ret2; ret2 = ext4_force_commit(inode->i_sb); if (ret2) return ret2; } return ret; } static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { int ret; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; unsigned int orig_mlen; if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) return -EINVAL; if (WARN_ON_ONCE(ext4_has_inline_data(inode))) return -ERANGE; /* * Calculate the first and last logical blocks respectively. */ map.m_lblk = offset >> blkbits; map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; orig_mlen = map.m_len; if (flags & IOMAP_WRITE) { /* * We check here if the blocks are already allocated, then we * don't need to start a journal txn and we can directly return * the mapping information. This could boost performance * especially in multi-threaded overwrite requests. */ if (offset + length <= i_size_read(inode)) { ret = ext4_map_blocks(NULL, inode, &map, 0); /* * For DAX we convert extents to initialized ones before * copying the data, otherwise we do it after I/O so * there's no need to call into ext4_iomap_alloc(). */ if ((map.m_flags & EXT4_MAP_MAPPED) || (!(flags & IOMAP_DAX) && (map.m_flags & EXT4_MAP_UNWRITTEN))) { /* * For atomic writes the entire requested * length should be mapped. 
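 * A short mapping is fine for a regular direct I/O overwrite (iomap will
 * call back in for the remainder), but an atomic write has to be issued as
 * a single untorn I/O, so unless the lookup covered all of orig_mlen we
 * fall through to ext4_iomap_alloc() below.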
*/ if (ret == orig_mlen || (!(flags & IOMAP_ATOMIC) && ret > 0)) goto out; } map.m_len = orig_mlen; } ret = ext4_iomap_alloc(inode, &map, flags); } else { ret = ext4_map_blocks(NULL, inode, &map, 0); } if (ret < 0) return ret; out: /* * When inline encryption is enabled, sometimes I/O to an encrypted file * has to be broken up to guarantee DUN contiguity. Handle this by * limiting the length of the mapping returned. */ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); /* * Before returning to iomap, let's ensure the allocated mapping * covers the entire requested length for atomic writes. */ if (flags & IOMAP_ATOMIC) { if (map.m_len < (length >> blkbits)) { WARN_ON_ONCE(1); return -EINVAL; } } ext4_set_iomap(inode, iomap, &map, offset, length, flags); return 0; } const struct iomap_ops ext4_iomap_ops = { .iomap_begin = ext4_iomap_begin, }; static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { int ret; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) return -EINVAL; if (ext4_has_inline_data(inode)) { ret = ext4_inline_data_iomap(inode, iomap); if (ret != -EAGAIN) { if (ret == 0 && offset >= iomap->length) ret = -ENOENT; return ret; } } /* * Calculate the first and last logical block respectively. */ map.m_lblk = offset >> blkbits; map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; /* * Fiemap callers may call for offset beyond s_bitmap_maxbytes. * So handle it here itself instead of querying ext4_map_blocks(). * Since ext4_map_blocks() will warn about it and will return * -EIO error. */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (offset >= sbi->s_bitmap_maxbytes) { map.m_flags = 0; goto set_iomap; } } ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) return ret; set_iomap: ext4_set_iomap(inode, iomap, &map, offset, length, flags); return 0; } const struct iomap_ops ext4_iomap_report_ops = { .iomap_begin = ext4_iomap_begin_report, }; /* * For data=journal mode, folio should be marked dirty only when it was * writeably mapped. When that happens, it was already attached to the * transaction and marked as jbddirty (we take care of this in * ext4_page_mkwrite()). On transaction commit, we writeprotect page mappings * so we should have nothing to do here, except for the case when someone * had the page pinned and dirtied the page through this pin (e.g. by doing * direct IO to it). In that case we'd need to attach buffers here to the * transaction but we cannot due to lock ordering. We cannot just dirty the * folio and leave attached buffers clean, because the buffers' dirty state is * "definitive". We cannot just set the buffers dirty or jbddirty because all * the journalling code will explode. So what we do is to mark the folio * "pending dirty" and next time ext4_writepages() is called, attach buffers * to the transaction appropriately. 
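 * The "pending dirty" state is recorded with the folio checked flag (see
 * folio_set_checked() below), which ext4_writepages() consumes when it
 * attaches the buffers to a transaction.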
*/ static bool ext4_journalled_dirty_folio(struct address_space *mapping, struct folio *folio) { WARN_ON_ONCE(!folio_buffers(folio)); if (folio_maybe_dma_pinned(folio)) folio_set_checked(folio); return filemap_dirty_folio(mapping, folio); } static bool ext4_dirty_folio(struct address_space *mapping, struct folio *folio) { WARN_ON_ONCE(!folio_test_locked(folio) && !folio_test_dirty(folio)); WARN_ON_ONCE(!folio_buffers(folio)); return block_dirty_folio(mapping, folio); } static int ext4_iomap_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { return iomap_swapfile_activate(sis, file, span, &ext4_iomap_report_ops); } static const struct address_space_operations ext4_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_write_end, .dirty_folio = ext4_dirty_folio, .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_journalled_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_journalled_write_end, .dirty_folio = ext4_journalled_dirty_folio, .bmap = ext4_bmap, .invalidate_folio = ext4_journalled_invalidate_folio, .release_folio = ext4_release_folio, .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_da_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, .write_end = ext4_da_write_end, .dirty_folio = ext4_dirty_folio, .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_dax_aops = { .writepages = ext4_dax_writepages, .dirty_folio = noop_dirty_folio, .bmap = ext4_bmap, .swap_activate = ext4_iomap_swap_activate, }; void ext4_set_aops(struct inode *inode) { switch (ext4_inode_journal_mode(inode)) { case EXT4_INODE_ORDERED_DATA_MODE: case EXT4_INODE_WRITEBACK_DATA_MODE: break; case EXT4_INODE_JOURNAL_DATA_MODE: inode->i_mapping->a_ops = &ext4_journalled_aops; return; default: BUG(); } if (IS_DAX(inode)) inode->i_mapping->a_ops = &ext4_dax_aops; else if (test_opt(inode->i_sb, DELALLOC)) inode->i_mapping->a_ops = &ext4_da_aops; else inode->i_mapping->a_ops = &ext4_aops; } /* * Here we can't skip an unwritten buffer even though it usually reads zero * because it might have data in pagecache (eg, if called from ext4_zero_range, * ext4_punch_hole, etc) which needs to be properly zeroed out. Otherwise a * racing writeback can come later and flush the stale pagecache to disk. 
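 * On success this returns the up-to-date buffer_head that covers @from,
 * with its folio still locked and referenced; the caller is responsible
 * for unlocking and putting the folio. It returns NULL if the block is a
 * hole or has already been freed, and an ERR_PTR() on failure.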
*/ static struct buffer_head *ext4_load_tail_bh(struct inode *inode, loff_t from) { unsigned int offset, blocksize, pos; ext4_lblk_t iblock; struct address_space *mapping = inode->i_mapping; struct buffer_head *bh; struct folio *folio; int err = 0; folio = __filemap_get_folio(mapping, from >> PAGE_SHIFT, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mapping_gfp_constraint(mapping, ~__GFP_FS)); if (IS_ERR(folio)) return ERR_CAST(folio); blocksize = inode->i_sb->s_blocksize; iblock = EXT4_PG_TO_LBLK(inode, folio->index); bh = folio_buffers(folio); if (!bh) bh = create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ offset = offset_in_folio(folio, from); pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; iblock++; pos += blocksize; } if (buffer_freed(bh)) { BUFFER_TRACE(bh, "freed: skip"); goto unlock; } if (!buffer_mapped(bh)) { BUFFER_TRACE(bh, "unmapped"); ext4_get_block(inode, iblock, bh, 0); /* unmapped? It's a hole - nothing to do */ if (!buffer_mapped(bh)) { BUFFER_TRACE(bh, "still unmapped"); goto unlock; } } /* Ok, it's mapped. Make sure it's up-to-date */ if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { err = ext4_read_bh_lock(bh, 0, true); if (err) goto unlock; if (fscrypt_inode_uses_fs_layer_crypto(inode)) { /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); err = fscrypt_decrypt_pagecache_blocks(folio, blocksize, bh_offset(bh)); if (err) { clear_buffer_uptodate(bh); goto unlock; } } } return bh; unlock: folio_unlock(folio); folio_put(folio); return err ? ERR_PTR(err) : NULL; } static int ext4_block_do_zero_range(struct inode *inode, loff_t from, loff_t length, bool *did_zero, bool *zero_written) { struct buffer_head *bh; struct folio *folio; bh = ext4_load_tail_bh(inode, from); if (IS_ERR_OR_NULL(bh)) return PTR_ERR_OR_ZERO(bh); folio = bh->b_folio; folio_zero_range(folio, offset_in_folio(folio, from), length); BUFFER_TRACE(bh, "zeroed end of block"); mark_buffer_dirty(bh); if (did_zero) *did_zero = true; if (zero_written && !buffer_unwritten(bh) && !buffer_delay(bh)) *zero_written = true; folio_unlock(folio); folio_put(folio); return 0; } static int ext4_block_journalled_zero_range(struct inode *inode, loff_t from, loff_t length, bool *did_zero) { struct buffer_head *bh; struct folio *folio; handle_t *handle; int err; handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); if (IS_ERR(handle)) return PTR_ERR(handle); bh = ext4_load_tail_bh(inode, from); if (IS_ERR_OR_NULL(bh)) { err = PTR_ERR_OR_ZERO(bh); goto out_handle; } folio = bh->b_folio; BUFFER_TRACE(bh, "get write access"); err = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) goto out; folio_zero_range(folio, offset_in_folio(folio, from), length); BUFFER_TRACE(bh, "zeroed end of block"); err = ext4_dirty_journalled_data(handle, bh); if (err) goto out; if (did_zero) *did_zero = true; out: folio_unlock(folio); folio_put(folio); out_handle: ext4_journal_stop(handle); return err; } /* * Zeros out a mapping of length 'length' starting from file offset * 'from'. The range to be zero'd must be contained with in one block. * If the specified range exceeds the end of the block it will be * shortened to end of the block that corresponds to 'from'. 
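 * DAX inodes are zeroed through dax_zero_range(), data=journal inodes
 * through ext4_block_journalled_zero_range(), and everything else through
 * the plain buffer path in ext4_block_do_zero_range().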
*/ static int ext4_block_zero_range(struct inode *inode, loff_t from, loff_t length, bool *did_zero, bool *zero_written) { unsigned blocksize = inode->i_sb->s_blocksize; unsigned int max = blocksize - (from & (blocksize - 1)); /* * correct length if it does not fall between * 'from' and the end of the block */ if (length > max || length < 0) length = max; if (IS_DAX(inode)) { return dax_zero_range(inode, from, length, did_zero, &ext4_iomap_ops); } else if (ext4_should_journal_data(inode)) { return ext4_block_journalled_zero_range(inode, from, length, did_zero); } return ext4_block_do_zero_range(inode, from, length, did_zero, zero_written); } /* * Zero out a mapping from file offset 'from' up to the end of the block * which corresponds to 'from' or to the given 'end' inside this block. * This required during truncate up and performing append writes. We need * to physically zero the tail end of that block so it doesn't yield old * data if the file is grown. */ int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end) { unsigned int blocksize = i_blocksize(inode); unsigned int offset; loff_t length = end - from; bool did_zero = false; bool zero_written = false; int err; offset = from & (blocksize - 1); if (!offset || from >= end) return 0; /* If we are processing an encrypted inode during orphan list handling */ if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode)) return 0; if (length > blocksize - offset) length = blocksize - offset; err = ext4_block_zero_range(inode, from, length, &did_zero, &zero_written); if (err) return err; /* * It's necessary to order zeroed data before update i_disksize when * truncating up or performing an append write, because there might be * exposing stale on-disk data which may caused by concurrent post-EOF * mmap write during folio writeback. */ if (ext4_should_order_data(inode) && did_zero && zero_written && !IS_DAX(inode)) { handle_t *handle; handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); if (IS_ERR(handle)) return PTR_ERR(handle); err = ext4_jbd2_inode_add_write(handle, inode, from, length); ext4_journal_stop(handle); if (err) return err; } return 0; } int ext4_zero_partial_blocks(struct inode *inode, loff_t lstart, loff_t length, bool *did_zero) { struct super_block *sb = inode->i_sb; unsigned partial_start, partial_end; ext4_fsblk_t start, end; loff_t byte_end = (lstart + length - 1); int err = 0; partial_start = lstart & (sb->s_blocksize - 1); partial_end = byte_end & (sb->s_blocksize - 1); start = lstart >> sb->s_blocksize_bits; end = byte_end >> sb->s_blocksize_bits; /* Handle partial zero within the single block */ if (start == end && (partial_start || (partial_end != sb->s_blocksize - 1))) { err = ext4_block_zero_range(inode, lstart, length, did_zero, NULL); return err; } /* Handle partial zero out on the start of the range */ if (partial_start) { err = ext4_block_zero_range(inode, lstart, sb->s_blocksize, did_zero, NULL); if (err) return err; } /* Handle partial zero out on the end of the range */ if (partial_end != sb->s_blocksize - 1) err = ext4_block_zero_range(inode, byte_end - partial_end, partial_end + 1, did_zero, NULL); return err; } int ext4_can_truncate(struct inode *inode) { if (S_ISREG(inode->i_mode)) return 1; if (S_ISDIR(inode->i_mode)) return 1; if (S_ISLNK(inode->i_mode)) return !ext4_inode_is_fast_symlink(inode); return 0; } /* * We have to make sure i_disksize gets properly updated before we truncate * page cache due to hole punching or zero range. 
 * Otherwise i_disksize update can get lost as it may have been postponed to
 * submission of writeback but that will never happen if we remove the folio
 * containing i_size from the page cache. Also if we punch a hole within
 * i_size but above i_disksize, a following ext4_page_mkwrite() may mistakenly
 * allocate written blocks over the hole and thus introduce allocated blocks
 * beyond i_disksize, which is not allowed (e2fsck would complain in case of a
 * crash).
 */
int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
				      loff_t len)
{
	handle_t *handle;
	int ret;
	loff_t size = i_size_read(inode);

	WARN_ON(!inode_is_locked(inode));
	if (offset > size)
		return 0;
	if (offset + len < size)
		size = offset + len;
	if (EXT4_I(inode)->i_disksize >= size)
		return 0;
	handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	ext4_update_i_disksize(inode, size);
	ret = ext4_mark_inode_dirty(handle, inode);
	ext4_journal_stop(handle);
	return ret;
}

static inline void ext4_truncate_folio(struct inode *inode,
				       loff_t start, loff_t end)
{
	unsigned long blocksize = i_blocksize(inode);
	struct folio *folio;

	/* Nothing to be done if no complete block needs to be truncated. */
	if (round_up(start, blocksize) >= round_down(end, blocksize))
		return;

	folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT);
	if (IS_ERR(folio))
		return;

	if (folio_mkclean(folio))
		folio_mark_dirty(folio);
	folio_unlock(folio);
	folio_put(folio);
}

int ext4_truncate_page_cache_block_range(struct inode *inode,
					 loff_t start, loff_t end)
{
	unsigned long blocksize = i_blocksize(inode);
	int ret;

	/*
	 * For journalled data we need to write (and checkpoint) pages
	 * before discarding page cache to avoid inconsistent data on disk
	 * in case of a crash before the transaction that frees blocks or
	 * converts unwritten extents is committed.
	 */
	if (ext4_should_journal_data(inode)) {
		ret = filemap_write_and_wait_range(inode->i_mapping, start,
						   end - 1);
		if (ret)
			return ret;
		goto truncate_pagecache;
	}

	/*
	 * If the block size is less than the page size, the file's mapped
	 * blocks within one page could be freed or converted to unwritten.
	 * So it's necessary to remove writable userspace mappings, and then
	 * ext4_page_mkwrite() can be called during subsequent write access
	 * to these partial folios.
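 * Only the folios that partially overlap [start, end) need this treatment;
 * folios that lie entirely inside the range are simply dropped by
 * truncate_pagecache_range() below.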
*/ if (!IS_ALIGNED(start | end, PAGE_SIZE) && blocksize < PAGE_SIZE && start < inode->i_size) { loff_t page_boundary = round_up(start, PAGE_SIZE); ext4_truncate_folio(inode, start, min(page_boundary, end)); if (end > page_boundary) ext4_truncate_folio(inode, round_down(end, PAGE_SIZE), end); } truncate_pagecache: truncate_pagecache_range(inode, start, end - 1); return 0; } static void ext4_wait_dax_page(struct inode *inode) { filemap_invalidate_unlock(inode->i_mapping); schedule(); filemap_invalidate_lock(inode->i_mapping); } int ext4_break_layouts(struct inode *inode) { if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock))) return -EINVAL; return dax_break_layout_inode(inode, ext4_wait_dax_page); } /* * ext4_punch_hole: punches a hole in a file by releasing the blocks * associated with the given offset and length * * @inode: File inode * @offset: The offset where the hole will begin * @len: The length of the hole * * Returns: 0 on success or negative on failure */ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; ext4_lblk_t start_lblk, end_lblk; loff_t max_end = sb->s_maxbytes; loff_t end = offset + length; handle_t *handle; unsigned int credits; bool partial_zeroed = false; int ret; trace_ext4_punch_hole(inode, offset, length, 0); WARN_ON_ONCE(!inode_is_locked(inode)); /* * For indirect-block based inodes, make sure that the hole within * one block before last range. */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize; /* No need to punch hole beyond i_size */ if (offset >= inode->i_size || offset >= max_end) return 0; /* * If the hole extends beyond i_size, set the hole to end after * the block that contains i_size to save pointless tail block zeroing. 
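 * For example, with 4k blocks and an i_size of 9000, a punch that reaches
 * past i_size gets its end set to round_up(9000, 4096) = 12288, so the
 * partial tail block never has to be zeroed.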
*/ if (end >= inode->i_size) end = round_up(inode->i_size, sb->s_blocksize); if (end > max_end) end = max_end; length = end - offset; ret = ext4_update_disksize_before_punch(inode, offset, length); if (ret) return ret; /* Now release the pages and zero block aligned part of pages*/ ret = ext4_truncate_page_cache_block_range(inode, offset, end); if (ret) return ret; ret = ext4_zero_partial_blocks(inode, offset, length, &partial_zeroed); if (ret) return ret; if (((file->f_flags & O_SYNC) || IS_SYNC(inode)) && partial_zeroed) { ret = filemap_write_and_wait_range(inode->i_mapping, offset, end - 1); if (ret) return ret; } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_chunk_trans_extent(inode, 0); else credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(sb, ret); return ret; } /* If there are blocks to remove, do it */ start_lblk = EXT4_B_TO_LBLK(inode, offset); end_lblk = end >> inode->i_blkbits; if (end_lblk > start_lblk) { ext4_lblk_t hole_len = end_lblk - start_lblk; ext4_fc_track_inode(handle, inode); ext4_check_map_extents_env(inode); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, start_lblk, hole_len); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_remove_space(inode, start_lblk, end_lblk - 1); else ret = ext4_ind_remove_space(handle, inode, start_lblk, end_lblk); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_handle; } ext4_es_insert_extent(inode, start_lblk, hole_len, ~0, EXTENT_STATUS_HOLE, 0); up_write(&EXT4_I(inode)->i_data_sem); } ext4_fc_track_range(handle, inode, start_lblk, end_lblk); ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) goto out_handle; ext4_update_inode_fsync_trans(handle, inode, 1); if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) ext4_handle_sync(handle); out_handle: ext4_journal_stop(handle); return ret; } int ext4_inode_attach_jinode(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct jbd2_inode *jinode; if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal) return 0; jinode = jbd2_alloc_inode(GFP_KERNEL); spin_lock(&inode->i_lock); if (!ei->jinode) { if (!jinode) { spin_unlock(&inode->i_lock); return -ENOMEM; } jbd2_journal_init_jbd_inode(jinode, inode); /* * Publish ->jinode only after it is fully initialized so that * readers never observe a partially initialized jbd2_inode. */ smp_wmb(); WRITE_ONCE(ei->jinode, jinode); jinode = NULL; } spin_unlock(&inode->i_lock); if (unlikely(jinode != NULL)) jbd2_free_inode(jinode); return 0; } /* * ext4_truncate() * * We block out ext4_get_block() block instantiations across the entire * transaction, and VFS/VM ensures that ext4_truncate() cannot run * simultaneously on behalf of the same inode. * * As we work through the truncate and commit bits of it to the journal there * is one core, guiding principle: the file's tree must always be consistent on * disk. We must be able to restart the truncate after a crash. * * The file's tree may be transiently inconsistent in memory (although it * probably isn't), but whenever we close off and commit a journal transaction, * the contents of (the filesystem + the journal) must be consistent and * restartable. It's pretty simple, really: bottom up, right to left (although * left-to-right works OK too). * * Note that at recovery time, journal replay occurs *before* the restart of * truncate against the orphan inode list. 
* * The committed inode has the new, desired i_size (which is the same as * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see * that this inode's truncate did not complete and it will again call * ext4_truncate() to have another go. So there will be instantiated blocks * to the right of the truncation point in a crashed ext4 filesystem. But * that's fine - as long as they are linked from the inode, the post-crash * ext4_truncate() run will find them and release them. */ int ext4_truncate(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); unsigned int credits; int err = 0, err2; handle_t *handle; /* * There is a possibility that we're either freeing the inode * or it's a completely new inode. In those cases we might not * have i_rwsem locked because it's not necessary. */ if (!(inode_state_read_once(inode) & (I_NEW | I_FREEING))) WARN_ON(!inode_is_locked(inode)); trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) goto out_trace; if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); if (ext4_has_inline_data(inode)) { int has_inline = 1; err = ext4_inline_data_truncate(inode, &has_inline); if (err || has_inline) goto out_trace; } /* If we zero-out tail of the page, we have to create jinode for jbd2 */ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { err = ext4_inode_attach_jinode(inode); if (err) goto out_trace; /* Zero to the end of the block containing i_size */ err = ext4_block_zero_eof(inode, inode->i_size, LLONG_MAX); if (err) goto out_trace; } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_chunk_trans_extent(inode, 1); else credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto out_trace; } /* * We add the inode to the orphan list, so that if this * truncate spans multiple transactions, and we crash, we will * resume the truncate when the filesystem recovers. It also * marks the inode dirty, to catch the new size. * * Implication: the file must always be in a sane, consistent * truncatable state while each transaction commits. */ err = ext4_orphan_add(handle, inode); if (err) goto out_stop; ext4_fc_track_inode(handle, inode); ext4_check_map_extents_env(inode); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) err = ext4_ext_truncate(handle, inode); else ext4_ind_truncate(handle, inode); up_write(&ei->i_data_sem); if (err) goto out_stop; if (IS_SYNC(inode)) ext4_handle_sync(handle); out_stop: /* * If this was a simple ftruncate() and the file will remain alive, * then we need to clear up the orphan record which we created above. * However, if this was a real unlink then we were called by * ext4_evict_inode(), and we allow that function to clean up the * orphan info for us. 
*/ if (inode->i_nlink) ext4_orphan_del(handle, inode); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(err2 && !err)) err = err2; ext4_journal_stop(handle); out_trace: trace_ext4_truncate_exit(inode); return err; } static inline u64 ext4_inode_peek_iversion(const struct inode *inode) { if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) return inode_peek_iversion_raw(inode); else return inode_peek_iversion(inode); } static int ext4_inode_blocks_set(struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { struct inode *inode = &(ei->vfs_inode); u64 i_blocks = READ_ONCE(inode->i_blocks); struct super_block *sb = inode->i_sb; if (i_blocks <= ~0U) { /* * i_blocks can be represented in a 32 bit variable * as multiple of 512 bytes */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = 0; ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); return 0; } /* * This should never happen since sb->s_maxbytes should not have * allowed this, sb->s_maxbytes was set according to the huge_file * feature in ext4_fill_super(). */ if (!ext4_has_feature_huge_file(sb)) return -EFSCORRUPTED; if (i_blocks <= 0xffffffffffffULL) { /* * i_blocks can be represented in a 48 bit variable * as multiple of 512 bytes */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); } else { ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); /* i_block is stored in file system block size */ i_blocks = i_blocks >> (inode->i_blkbits - 9); raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); } return 0; } static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode) { struct ext4_inode_info *ei = EXT4_I(inode); uid_t i_uid; gid_t i_gid; projid_t i_projid; int block; int err; err = ext4_inode_blocks_set(raw_inode, ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); i_uid = i_uid_read(inode); i_gid = i_gid_read(inode); i_projid = from_kprojid(&init_user_ns, ei->i_projid); if (!(test_opt(inode->i_sb, NO_UID32))) { raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); /* * Fix up interoperability with old kernels. Otherwise, * old inodes get re-used with the upper 16 bits of the * uid/gid intact. 
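 * The high 16 bits are therefore only cleared once the inode is dead
 * (i_dtime is set) and it is no longer tracked as an orphan; live inodes
 * keep the full 32-bit uid/gid.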
*/ if (ei->i_dtime && !ext4_inode_orphan_tracked(inode)) { raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } else { raw_inode->i_uid_high = cpu_to_le16(high_16_bits(i_uid)); raw_inode->i_gid_high = cpu_to_le16(high_16_bits(i_gid)); } } else { raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid)); raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); EXT4_INODE_SET_CTIME(inode, raw_inode); EXT4_INODE_SET_MTIME(inode, raw_inode); EXT4_INODE_SET_ATIME(inode, raw_inode); EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); ext4_isize_set(raw_inode, ei->i_disksize); raw_inode->i_generation = cpu_to_le32(inode->i_generation); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { raw_inode->i_block[0] = cpu_to_le32(old_encode_dev(inode->i_rdev)); raw_inode->i_block[1] = 0; } else { raw_inode->i_block[0] = 0; raw_inode->i_block[1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); raw_inode->i_block[2] = 0; } } else if (!ext4_has_inline_data(inode)) { for (block = 0; block < EXT4_N_BLOCKS; block++) raw_inode->i_block[block] = ei->i_data[block]; } if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { u64 ivers = ext4_inode_peek_iversion(inode); raw_inode->i_disk_version = cpu_to_le32(ivers); if (ei->i_extra_isize) { if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) raw_inode->i_version_hi = cpu_to_le32(ivers >> 32); raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); } } if (i_projid != EXT4_DEF_PROJID && !ext4_has_feature_project(inode->i_sb)) err = err ?: -EFSCORRUPTED; if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) raw_inode->i_projid = cpu_to_le32(i_projid); ext4_inode_csum_set(inode, raw_inode, ei); return err; } /* * ext4_get_inode_loc returns with an extra refcount against the inode's * underlying buffer_head on success. If we pass 'inode' and it does not * have in-inode xattr, we have all inode data in memory that is needed * to recreate the on-disk version of this inode. 
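 * In that case, if every other inode in the same inode table block is free
 * according to the inode bitmap, we skip the read entirely and construct
 * the block contents in memory instead.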
*/ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, struct inode *inode, struct ext4_iloc *iloc, ext4_fsblk_t *ret_block) { struct ext4_group_desc *gdp; struct buffer_head *bh; ext4_fsblk_t block; struct blk_plug plug; int inodes_per_block, inode_offset; iloc->bh = NULL; if (ino < EXT4_ROOT_INO || ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) return -EFSCORRUPTED; iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); if (!gdp) return -EIO; /* * Figure out the offset within the block group inode table */ inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; inode_offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)); iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); block = ext4_inode_table(sb, gdp); if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) || (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) { ext4_error(sb, "Invalid inode table block %llu in " "block_group %u", block, iloc->block_group); return -EFSCORRUPTED; } block += (inode_offset / inodes_per_block); bh = sb_getblk(sb, block); if (unlikely(!bh)) return -ENOMEM; if (ext4_buffer_uptodate(bh)) goto has_buffer; lock_buffer(bh); if (ext4_buffer_uptodate(bh)) { /* Someone brought it uptodate while we waited */ unlock_buffer(bh); goto has_buffer; } /* * If we have all information of the inode in memory and this * is the only valid inode in the block, we need not read the * block. */ if (inode && !ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { struct buffer_head *bitmap_bh; int i, start; start = inode_offset & ~(inodes_per_block - 1); /* Is the inode bitmap in cache? */ bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); if (unlikely(!bitmap_bh)) goto make_io; /* * If the inode bitmap isn't in cache then the * optimisation may end up performing two reads instead * of one, so skip it. */ if (!buffer_uptodate(bitmap_bh)) { brelse(bitmap_bh); goto make_io; } for (i = start; i < start + inodes_per_block; i++) { if (i == inode_offset) continue; if (ext4_test_bit(i, bitmap_bh->b_data)) break; } brelse(bitmap_bh); if (i == start + inodes_per_block) { struct ext4_inode *raw_inode = (struct ext4_inode *) (bh->b_data + iloc->offset); /* all other inodes are free, so skip I/O */ memset(bh->b_data, 0, bh->b_size); if (!ext4_test_inode_state(inode, EXT4_STATE_NEW)) ext4_fill_raw_inode(inode, raw_inode); set_buffer_uptodate(bh); unlock_buffer(bh); goto has_buffer; } } make_io: /* * If we need to do any I/O, try to pre-readahead extra * blocks from the inode table. */ blk_start_plug(&plug); if (EXT4_SB(sb)->s_inode_readahead_blks) { ext4_fsblk_t b, end, table; unsigned num; __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; table = ext4_inode_table(sb, gdp); /* s_inode_readahead_blks is always a power of 2 */ b = block & ~((ext4_fsblk_t) ra_blks - 1); if (table > b) b = table; end = b + ra_blks; num = EXT4_INODES_PER_GROUP(sb); if (ext4_has_group_desc_csum(sb)) num -= ext4_itable_unused_count(sb, gdp); table += num / inodes_per_block; if (end > table) end = table; while (b <= end) ext4_sb_breadahead_unmovable(sb, b++); } /* * There are other valid inodes in the buffer, this inode * has in-inode xattrs, or we don't have this inode in memory. * Read the block from disk. 
*/ trace_ext4_load_inode(sb, ino); ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL, ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO)); blk_finish_plug(&plug); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { if (ret_block) *ret_block = block; brelse(bh); return -EIO; } has_buffer: iloc->bh = bh; return 0; } static int __ext4_get_inode_loc_noinmem(struct inode *inode, struct ext4_iloc *iloc) { ext4_fsblk_t err_blk = 0; int ret; ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc, &err_blk); if (ret == -EIO) ext4_error_inode_block(inode, err_blk, EIO, "unable to read itable block"); return ret; } int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) { ext4_fsblk_t err_blk = 0; int ret; ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc, &err_blk); if (ret == -EIO) ext4_error_inode_block(inode, err_blk, EIO, "unable to read itable block"); return ret; } int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino, struct ext4_iloc *iloc) { return __ext4_get_inode_loc(sb, ino, NULL, iloc, NULL); } static bool ext4_should_enable_dax(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (test_opt2(inode->i_sb, DAX_NEVER)) return false; if (!S_ISREG(inode->i_mode)) return false; if (ext4_should_journal_data(inode)) return false; if (ext4_has_inline_data(inode)) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT)) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY)) return false; if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) return false; if (test_opt(inode->i_sb, DAX_ALWAYS)) return true; return ext4_test_inode_flag(inode, EXT4_INODE_DAX); } void ext4_set_inode_flags(struct inode *inode, bool init) { unsigned int flags = EXT4_I(inode)->i_flags; unsigned int new_fl = 0; WARN_ON_ONCE(IS_DAX(inode) && init); if (flags & EXT4_SYNC_FL) new_fl |= S_SYNC; if (flags & EXT4_APPEND_FL) new_fl |= S_APPEND; if (flags & EXT4_IMMUTABLE_FL) new_fl |= S_IMMUTABLE; if (flags & EXT4_NOATIME_FL) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; /* Because of the way inode_set_flags() works we must preserve S_DAX * here if already set. 
*/ new_fl |= (inode->i_flags & S_DAX); if (init && ext4_should_enable_dax(inode)) new_fl |= S_DAX; if (flags & EXT4_ENCRYPT_FL) new_fl |= S_ENCRYPTED; if (flags & EXT4_CASEFOLD_FL) new_fl |= S_CASEFOLD; if (flags & EXT4_VERITY_FL) new_fl |= S_VERITY; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX| S_ENCRYPTED|S_CASEFOLD|S_VERITY); } static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { blkcnt_t i_blocks ; struct inode *inode = &(ei->vfs_inode); struct super_block *sb = inode->i_sb; if (ext4_has_feature_huge_file(sb)) { /* we are using combined 48 bit field */ i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | le32_to_cpu(raw_inode->i_blocks_lo); if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { /* i_blocks represent file system block size */ return i_blocks << (inode->i_blkbits - 9); } else { return i_blocks; } } else { return le32_to_cpu(raw_inode->i_blocks_lo); } } static inline int ext4_iget_extra_inode(struct inode *inode, struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { __le32 *magic = (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; if (EXT4_INODE_HAS_XATTR_SPACE(inode) && *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { int err; err = xattr_check_inode(inode, IHDR(inode, raw_inode), ITAIL(inode, raw_inode)); if (err) return err; ext4_set_inode_state(inode, EXT4_STATE_XATTR); err = ext4_find_inline_data_nolock(inode); if (!err && ext4_has_inline_data(inode)) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); return err; } else EXT4_I(inode)->i_inline_off = 0; return 0; } int ext4_get_projid(struct inode *inode, kprojid_t *projid) { if (!ext4_has_feature_project(inode->i_sb)) return -EOPNOTSUPP; *projid = EXT4_I(inode)->i_projid; return 0; } /* * ext4 has self-managed i_version for ea inodes, it stores the lower 32bit of * refcount in i_version, so use raw values if inode has EXT4_EA_INODE_FL flag * set. 
*/ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) { if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) inode_set_iversion_raw(inode, val); else inode_set_iversion_queried(inode, val); } static int check_igot_inode(struct inode *inode, ext4_iget_flags flags, const char *function, unsigned int line) { const char *err_str; if (flags & EXT4_IGET_EA_INODE) { if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { err_str = "missing EA_INODE flag"; goto error; } if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) || EXT4_I(inode)->i_file_acl) { err_str = "ea_inode with extended attributes"; goto error; } } else { if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { /* * open_by_handle_at() could provide an old inode number * that has since been reused for an ea_inode; this does * not indicate filesystem corruption */ if (flags & EXT4_IGET_HANDLE) return -ESTALE; err_str = "unexpected EA_INODE flag"; goto error; } } if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) { err_str = "unexpected bad inode w/o EXT4_IGET_BAD"; goto error; } return 0; error: ext4_error_inode(inode, function, line, 0, "%s", err_str); return -EFSCORRUPTED; } void ext4_set_inode_mapping_order(struct inode *inode) { struct super_block *sb = inode->i_sb; u16 min_order, max_order; max_order = EXT4_SB(sb)->s_max_folio_order; if (!max_order) return; min_order = EXT4_SB(sb)->s_min_folio_order; if (!min_order && !S_ISREG(inode->i_mode)) return; if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) max_order = min_order; mapping_set_folio_order_range(inode->i_mapping, min_order, max_order); } struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, unsigned int line) { struct ext4_iloc iloc; struct ext4_inode *raw_inode; struct ext4_inode_info *ei; struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct inode *inode; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; loff_t size; int block; uid_t i_uid; gid_t i_gid; projid_t i_projid; if ((!(flags & EXT4_IGET_SPECIAL) && is_special_ino(sb, ino)) || (ino < EXT4_ROOT_INO) || (ino > le32_to_cpu(es->s_inodes_count))) { if (flags & EXT4_IGET_HANDLE) return ERR_PTR(-ESTALE); __ext4_error(sb, function, line, false, EFSCORRUPTED, 0, "inode #%lu: comm %s: iget: illegal inode #", ino, current->comm); return ERR_PTR(-EFSCORRUPTED); } inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode_state_read_once(inode) & I_NEW)) { ret = check_igot_inode(inode, flags, function, line); if (ret) { iput(inode); return ERR_PTR(ret); } return inode; } ei = EXT4_I(inode); iloc.bh = NULL; ret = __ext4_get_inode_loc_noinmem(inode, &iloc); if (ret < 0) goto bad_inode; raw_inode = ext4_raw_inode(&iloc); if ((flags & EXT4_IGET_HANDLE) && (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) { ret = -ESTALE; goto bad_inode; } if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > EXT4_INODE_SIZE(inode->i_sb) || (ei->i_extra_isize & 3)) { ext4_error_inode(inode, function, line, 0, "iget: bad extra_isize %u " "(inode size %u)", ei->i_extra_isize, EXT4_INODE_SIZE(inode->i_sb)); ret = -EFSCORRUPTED; goto bad_inode; } } else ei->i_extra_isize = 0; /* Precompute checksum seed for inode metadata */ if (ext4_has_feature_metadata_csum(sb)) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = raw_inode->i_generation; 
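		/*
		 * Fold the inode number, then the generation, into the
		 * filesystem-wide seed to form this inode's metadata
		 * checksum seed.
		 */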
csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } if ((!ext4_inode_csum_verify(inode, raw_inode, ei) || ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) && (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) { ext4_error_inode_err(inode, function, line, 0, EFSBADCRC, "iget: checksum invalid"); ret = -EFSBADCRC; goto bad_inode; } inode->i_mode = le16_to_cpu(raw_inode->i_mode); i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); if (ext4_has_feature_project(sb) && EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid); else i_projid = EXT4_DEF_PROJID; if (!(test_opt(inode->i_sb, NO_UID32))) { i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } i_uid_write(inode, i_uid); i_gid_write(inode, i_gid); ei->i_projid = make_kprojid(&init_user_ns, i_projid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ei->i_inline_off = 0; ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. * This is needed because nfsd might try to access dead inodes * the test is that same one that e2fsck uses * NeilBrown 1999oct15 */ if (inode->i_nlink == 0) { if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL || !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && ino != EXT4_BOOT_LOADER_INO) { /* this inode is deleted or unallocated */ if (flags & EXT4_IGET_SPECIAL) { ext4_error_inode(inode, function, line, 0, "iget: special inode unallocated"); ret = -EFSCORRUPTED; } else ret = -ESTALE; goto bad_inode; } /* The only unlinked inodes we let through here have * valid i_mode and are being read by the orphan * recovery code: that's fine, we're about to complete * the process of deleting those. * OR it is the EXT4_BOOT_LOADER_INO which is * not initialized on a new filesystem. */ } ei->i_flags = le32_to_cpu(raw_inode->i_flags); ext4_set_inode_flags(inode, true); /* Detect invalid flag combination - can't have both inline data and extents */ if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { ext4_error_inode(inode, function, line, 0, "inode has both inline data and extents flags"); ret = -EFSCORRUPTED; goto bad_inode; } inode->i_blocks = ext4_inode_blocks(raw_inode, ei); ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); if (ext4_has_feature_64bit(sb)) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(sb, raw_inode); size = i_size_read(inode); if (size < 0 || size > ext4_get_maxbytes(inode)) { ext4_error_inode(inode, function, line, 0, "iget: bad i_size value: %lld", size); ret = -EFSCORRUPTED; goto bad_inode; } /* * If dir_index is not enabled but there's dir with INDEX flag set, * we'd normally treat htree data as empty space. But with metadata * checksumming that corrupts checksums so forbid that. 
*/ if (!ext4_has_feature_dir_index(sb) && ext4_has_feature_metadata_csum(sb) && ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { ext4_error_inode(inode, function, line, 0, "iget: Dir with htree data on filesystem without dir_index feature."); ret = -EFSCORRUPTED; goto bad_inode; } ei->i_disksize = inode->i_size; #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; #endif inode->i_generation = le32_to_cpu(raw_inode->i_generation); ei->i_block_group = iloc.block_group; ei->i_last_alloc_group = ~0; /* * NOTE! The in-memory inode i_data array is in little-endian order * even on big-endian machines: we do NOT byteswap the block numbers! */ for (block = 0; block < EXT4_N_BLOCKS; block++) ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); ext4_fc_init_inode(&ei->vfs_inode); /* * Set transaction id's of transactions that have to be committed * to finish f[data]sync. We set them to currently running transaction * as we cannot be sure that the inode or some of its metadata isn't * part of the transaction - the inode could have been reclaimed and * now it is reread from disk. */ if (journal) { transaction_t *transaction; tid_t tid; read_lock(&journal->j_state_lock); if (journal->j_running_transaction) transaction = journal->j_running_transaction; else transaction = journal->j_committing_transaction; if (transaction) tid = transaction->t_tid; else tid = journal->j_commit_sequence; read_unlock(&journal->j_state_lock); ei->i_sync_tid = tid; ei->i_datasync_tid = tid; } if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (ei->i_extra_isize == 0) { /* The extra space is currently unused. Use it. */ BUILD_BUG_ON(sizeof(struct ext4_inode) & 3); ei->i_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; } else { ret = ext4_iget_extra_inode(inode, raw_inode, ei); if (ret) goto bad_inode; } } EXT4_INODE_GET_CTIME(inode, raw_inode); EXT4_INODE_GET_ATIME(inode, raw_inode); EXT4_INODE_GET_MTIME(inode, raw_inode); EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { u64 ivers = le32_to_cpu(raw_inode->i_disk_version); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) ivers |= (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; } ext4_inode_set_iversion_queried(inode, ivers); } ret = 0; if (ei->i_file_acl && !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) { ext4_error_inode(inode, function, line, 0, "iget: bad extended attribute block %llu", ei->i_file_acl); ret = -EFSCORRUPTED; goto bad_inode; } else if (!ext4_has_inline_data(inode)) { /* validate the block references in the inode */ if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) && (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || (S_ISLNK(inode->i_mode) && !ext4_inode_is_fast_symlink(inode)))) { if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_check_inode(inode); else ret = ext4_ind_check_inode(inode); } } if (ret) goto bad_inode; if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; } else if (S_ISLNK(inode->i_mode)) { /* VFS does not allow setting these so must be corruption */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { ext4_error_inode(inode, function, line, 0, "iget: immutable or append flags " "not allowed on symlinks"); ret = -EFSCORRUPTED; goto bad_inode; } if 
(IS_ENCRYPTED(inode)) { inode->i_op = &ext4_encrypted_symlink_inode_operations; } else if (ext4_inode_is_fast_symlink(inode)) { inode->i_op = &ext4_fast_symlink_inode_operations; /* * Orphan cleanup can see inodes with i_size == 0 * and i_data uninitialized. Skip size checks in * that case. This is safe because the first thing * ext4_evict_inode() does for fast symlinks is * clearing of i_data and i_size. */ if ((EXT4_SB(sb)->s_mount_state & EXT4_ORPHAN_FS)) { if (inode->i_nlink != 0) { ext4_error_inode(inode, function, line, 0, "invalid orphan symlink nlink %d", inode->i_nlink); ret = -EFSCORRUPTED; goto bad_inode; } } else { if (inode->i_size == 0 || inode->i_size >= sizeof(ei->i_data) || strnlen((char *)ei->i_data, inode->i_size + 1) != inode->i_size) { ext4_error_inode(inode, function, line, 0, "invalid fast symlink length %llu", (unsigned long long)inode->i_size); ret = -EFSCORRUPTED; goto bad_inode; } inode_set_cached_link(inode, (char *)ei->i_data, inode->i_size); } } else { inode->i_op = &ext4_symlink_inode_operations; } } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { inode->i_op = &ext4_special_inode_operations; if (raw_inode->i_block[0]) init_special_inode(inode, inode->i_mode, old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); else init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } else if (ino == EXT4_BOOT_LOADER_INO) { make_bad_inode(inode); } else { ret = -EFSCORRUPTED; ext4_error_inode(inode, function, line, 0, "iget: bogus i_mode (%o)", inode->i_mode); goto bad_inode; } if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) { ext4_error_inode(inode, function, line, 0, "casefold flag without casefold feature"); ret = -EFSCORRUPTED; goto bad_inode; } ext4_set_inode_mapping_order(inode); ret = check_igot_inode(inode, flags, function, line); /* * -ESTALE here means there is nothing inherently wrong with the inode, * it's just not an inode we can return for an fhandle lookup. */ if (ret == -ESTALE) { brelse(iloc.bh); unlock_new_inode(inode); iput(inode); return ERR_PTR(-ESTALE); } if (ret) goto bad_inode; brelse(iloc.bh); /* Initialize the "no ACL's" state for the simple cases */ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) && !ei->i_file_acl) cache_no_acl(inode); unlock_new_inode(inode); return inode; bad_inode: brelse(iloc.bh); iget_failed(inode); return ERR_PTR(ret); } static void __ext4_update_other_inode_time(struct super_block *sb, unsigned long orig_ino, unsigned long ino, struct ext4_inode *raw_inode) { struct inode *inode; inode = find_inode_by_ino_rcu(sb, ino); if (!inode) return; if (!inode_is_dirtytime_only(inode)) return; spin_lock(&inode->i_lock); if (inode_is_dirtytime_only(inode)) { struct ext4_inode_info *ei = EXT4_I(inode); inode_state_clear(inode, I_DIRTY_TIME); spin_unlock(&inode->i_lock); spin_lock(&ei->i_raw_lock); EXT4_INODE_SET_CTIME(inode, raw_inode); EXT4_INODE_SET_MTIME(inode, raw_inode); EXT4_INODE_SET_ATIME(inode, raw_inode); ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); trace_ext4_other_inode_update_time(inode, orig_ino); return; } spin_unlock(&inode->i_lock); } /* * Opportunistically update the other time fields for other inodes in * the same inode table block. 
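 * This is only done for lazytime mounts (see the SB_LAZYTIME check in
 * ext4_do_update_inode()): inodes that are dirty only due to a timestamp
 * update get their times written back for free while this inode table
 * block is being written anyway.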
*/ static void ext4_update_other_inodes_time(struct super_block *sb, unsigned long orig_ino, char *buf) { unsigned long ino; int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; int inode_size = EXT4_INODE_SIZE(sb); /* * Calculate the first inode in the inode table block. Inode * numbers are one-based. That is, the first inode in a block * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1). */ ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1; rcu_read_lock(); for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) { if (ino == orig_ino) continue; __ext4_update_other_inode_time(sb, orig_ino, ino, (struct ext4_inode *)buf); } rcu_read_unlock(); } /* * Post the struct inode info into an on-disk inode location in the * buffer-cache. This gobbles the caller's reference to the * buffer_head in the inode location struct. * * The caller must have write access to iloc->bh. */ static int ext4_do_update_inode(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { struct ext4_inode *raw_inode = ext4_raw_inode(iloc); struct ext4_inode_info *ei = EXT4_I(inode); struct buffer_head *bh = iloc->bh; struct super_block *sb = inode->i_sb; int err; int need_datasync = 0, set_large_file = 0; spin_lock(&ei->i_raw_lock); /* * For fields not tracked in the in-memory inode, initialise them * to zero for new inodes. */ if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode)) need_datasync = 1; if (ei->i_disksize > 0x7fffffffULL) { if (!ext4_has_feature_large_file(sb) || EXT4_SB(sb)->s_es->s_rev_level == cpu_to_le32(EXT4_GOOD_OLD_REV)) set_large_file = 1; } err = ext4_fill_raw_inode(inode, raw_inode); spin_unlock(&ei->i_raw_lock); if (err) { EXT4_ERROR_INODE(inode, "corrupted inode contents"); goto out_brelse; } if (inode->i_sb->s_flags & SB_LAZYTIME) ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, bh->b_data); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) goto out_error; ext4_clear_inode_state(inode, EXT4_STATE_NEW); if (set_large_file) { BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); err = ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, EXT4_JTR_NONE); if (err) goto out_error; lock_buffer(EXT4_SB(sb)->s_sbh); ext4_set_feature_large_file(sb); ext4_superblock_csum_set(sb); unlock_buffer(EXT4_SB(sb)->s_sbh); ext4_handle_sync(handle); err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); } ext4_update_inode_fsync_trans(handle, inode, need_datasync); out_error: ext4_std_error(inode->i_sb, err); out_brelse: brelse(bh); return err; } /* * ext4_write_inode() * * We are called from a few places: * * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. * Here, there will be no transaction running. We wait for any running * transaction to commit. * * - Within flush work (sys_sync(), kupdate and such). * We wait on commit, if told to. * * - Within iput_final() -> write_inode_now() * We wait on commit, if told to. * * In all cases it is actually safe for us to return without doing anything, * because the inode has been copied into a raw inode buffer in * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL * writeback. * * Note that we are absolutely dependent upon all inode dirtiers doing the * right thing: they *must* call mark_inode_dirty() after dirtying info in * which we are interested. 
* * It would be a bug for them to not do this. The code: * * mark_inode_dirty(inode) * stuff(); * inode->i_size = expr; * * is in error because write_inode() could occur while `stuff()' is running, * and the new i_size will be lost. Plus the inode will no longer be on the * superblock's dirty inode list. */ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; err = ext4_emergency_state(inode->i_sb); if (unlikely(err)) return err; if (EXT4_SB(inode->i_sb)->s_journal) { if (ext4_journal_current_handle()) { ext4_debug("called recursively, non-PF_MEMALLOC!\n"); dump_stack(); return -EIO; } /* * No need to force transaction in WB_SYNC_NONE mode. Also * ext4_sync_fs() will force the commit after everything is * written. */ if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, EXT4_I(inode)->i_sync_tid); } else { struct ext4_iloc iloc; err = __ext4_get_inode_loc_noinmem(inode, &iloc); if (err) return err; /* * sync(2) will flush the whole buffer cache. No need to do * it here separately for each inode. */ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO, "IO error syncing inode"); err = -EIO; } brelse(iloc.bh); } return err; } /* * In data=journal mode ext4_journalled_invalidate_folio() may fail to invalidate * buffers that are attached to a folio straddling i_size and are undergoing * commit. In that case we have to wait for commit to finish and try again. */ static void ext4_wait_for_tail_page_commit(struct inode *inode) { unsigned offset; journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid; int ret; bool has_transaction; offset = inode->i_size & (PAGE_SIZE - 1); /* * If the folio is fully truncated, we don't need to wait for any commit * (and we even should not as __ext4_journalled_invalidate_folio() may * strip all buffers from the folio but keep the folio dirty which can then * confuse e.g. concurrent ext4_writepages() seeing dirty folio without * buffers). Also we don't need to wait for any commit if all buffers in * the folio remain valid. This is most beneficial for the common case of * blocksize == PAGESIZE. */ if (!offset || offset > (PAGE_SIZE - i_blocksize(inode))) return; while (1) { struct folio *folio = filemap_lock_folio(inode->i_mapping, inode->i_size >> PAGE_SHIFT); if (IS_ERR(folio)) return; ret = __ext4_journalled_invalidate_folio(folio, offset, folio_size(folio) - offset); folio_unlock(folio); folio_put(folio); if (ret != -EBUSY) return; has_transaction = false; read_lock(&journal->j_state_lock); if (journal->j_committing_transaction) { commit_tid = journal->j_committing_transaction->t_tid; has_transaction = true; } read_unlock(&journal->j_state_lock); if (has_transaction) jbd2_log_wait_commit(journal, commit_tid); } } /* * ext4_setattr() * * Called from notify_change. * * We want to trap VFS attempts to truncate the file as soon as * possible. In particular, we want to make sure that when the VFS * shrinks i_size, we put the inode on the orphan list and modify * i_disksize immediately, so that during the subsequent flushing of * dirty pages and freeing of disk blocks, we can guarantee that any * commit will leave the blocks being flushed in an unused state on * disk. 
(On recovery, the inode will get truncated and the blocks will * be freed, so we have a strong guarantee that no future commit will * leave these blocks visible to the user.) * * Another thing we have to assure is that if we are in ordered mode * and inode is still attached to the committing transaction, we must * we start writeout of all the dirty pages which are being truncated. * This way we are sure that all the data written in the previous * transaction are already on disk (truncate waits for pages under * writeback). * * Called with inode->i_rwsem down. */ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error, rc = 0; int orphan = 0; const unsigned int ia_valid = attr->ia_valid; bool inc_ivers = true; error = ext4_emergency_state(inode->i_sb); if (unlikely(error)) return error; if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; if (unlikely(IS_APPEND(inode) && (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)))) return -EPERM; error = setattr_prepare(idmap, dentry, attr); if (error) return error; error = fscrypt_prepare_setattr(dentry, attr); if (error) return error; if (is_quota_modification(idmap, inode, attr)) { error = dquot_initialize(inode); if (error) return error; } if (i_uid_needs_update(idmap, attr, inode) || i_gid_needs_update(idmap, attr, inode)) { handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ handle = ext4_journal_start(inode, EXT4_HT_QUOTA, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; } /* dquot_transfer() calls back ext4_get_inode_usage() which * counts xattr inode references. */ down_read(&EXT4_I(inode)->xattr_sem); error = dquot_transfer(idmap, inode, attr); up_read(&EXT4_I(inode)->xattr_sem); if (error) { ext4_journal_stop(handle); return error; } /* Update corresponding info in inode so that everything is in * one transaction */ i_uid_update(idmap, attr, inode); i_gid_update(idmap, attr, inode); error = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); if (unlikely(error)) { return error; } } if (attr->ia_valid & ATTR_SIZE) { handle_t *handle; loff_t oldsize = inode->i_size; loff_t old_disksize; int shrink = (attr->ia_size < inode->i_size); if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (attr->ia_size > sbi->s_bitmap_maxbytes) { return -EFBIG; } } if (!S_ISREG(inode->i_mode)) { return -EINVAL; } if (attr->ia_size == inode->i_size) inc_ivers = false; /* * If file has inline data but new size exceeds inline capacity, * convert to extent-based storage first to prevent inconsistent * state (inline flag set but size exceeds inline capacity). */ if (ext4_has_inline_data(inode) && attr->ia_size > EXT4_I(inode)->i_inline_size) { error = ext4_convert_inline_data(inode); if (error) goto err_out; } if (shrink) { if (ext4_should_order_data(inode)) { error = ext4_begin_ordered_truncate(inode, attr->ia_size); if (error) goto err_out; } /* * Blocks are going to be removed from the inode. Wait * for dio in flight. 
*/ inode_dio_wait(inode); } filemap_invalidate_lock(inode->i_mapping); rc = ext4_break_layouts(inode); if (rc) { filemap_invalidate_unlock(inode->i_mapping); goto err_out; } if (attr->ia_size != inode->i_size) { /* attach jbd2 jinode for EOF folio tail zeroing */ if (attr->ia_size & (inode->i_sb->s_blocksize - 1) || oldsize & (inode->i_sb->s_blocksize - 1)) { error = ext4_inode_attach_jinode(inode); if (error) goto out_mmap_sem; } /* * Update c/mtime and tail zero the EOF folio on * truncate up. ext4_truncate() handles the shrink case * below. */ if (!shrink) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (oldsize & (inode->i_sb->s_blocksize - 1)) { error = ext4_block_zero_eof(inode, oldsize, LLONG_MAX); if (error) goto out_mmap_sem; } } handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto out_mmap_sem; } if (ext4_handle_valid(handle) && shrink) { error = ext4_orphan_add(handle, inode); orphan = 1; } if (shrink) ext4_fc_track_range(handle, inode, (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> inode->i_sb->s_blocksize_bits, EXT_MAX_BLOCKS - 1); else ext4_fc_track_range( handle, inode, (oldsize > 0 ? oldsize - 1 : oldsize) >> inode->i_sb->s_blocksize_bits, (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> inode->i_sb->s_blocksize_bits); down_write(&EXT4_I(inode)->i_data_sem); old_disksize = EXT4_I(inode)->i_disksize; EXT4_I(inode)->i_disksize = attr->ia_size; /* * We have to update i_size under i_data_sem together * with i_disksize to avoid races with writeback code * running ext4_wb_update_i_disksize(). */ if (!error) i_size_write(inode, attr->ia_size); else EXT4_I(inode)->i_disksize = old_disksize; up_write(&EXT4_I(inode)->i_data_sem); rc = ext4_mark_inode_dirty(handle, inode); if (!error) error = rc; ext4_journal_stop(handle); if (error) goto out_mmap_sem; if (!shrink) { pagecache_isize_extended(inode, oldsize, inode->i_size); } else if (ext4_should_journal_data(inode)) { ext4_wait_for_tail_page_commit(inode); } } /* * Truncate pagecache after we've waited for commit * in data=journal mode to make pages freeable. */ truncate_pagecache(inode, inode->i_size); /* * Call ext4_truncate() even if i_size didn't change to * truncate possible preallocated blocks. */ if (attr->ia_size <= oldsize) { rc = ext4_truncate(inode); if (rc) error = rc; } out_mmap_sem: filemap_invalidate_unlock(inode->i_mapping); } if (!error) { if (inc_ivers) inode_inc_iversion(inode); setattr_copy(idmap, inode, attr); mark_inode_dirty(inode); } /* * If the call to ext4_truncate failed to get a transaction handle at * all, we need to clean up the in-core orphan list manually. 
*/ if (orphan && inode->i_nlink) ext4_orphan_del(NULL, inode); if (!error && (ia_valid & ATTR_MODE)) rc = posix_acl_chmod(idmap, dentry, inode->i_mode); err_out: if (error) ext4_std_error(inode->i_sb, error); if (!error) error = rc; return error; } u32 ext4_dio_alignment(struct inode *inode) { if (fsverity_active(inode)) return 0; if (ext4_should_journal_data(inode)) return 0; if (ext4_has_inline_data(inode)) return 0; if (IS_ENCRYPTED(inode)) { if (!fscrypt_dio_supported(inode)) return 0; return i_blocksize(inode); } return 1; /* use the iomap defaults */ } int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct ext4_inode *raw_inode; struct ext4_inode_info *ei = EXT4_I(inode); unsigned int flags; if ((request_mask & STATX_BTIME) && EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) { stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = ei->i_crtime.tv_sec; stat->btime.tv_nsec = ei->i_crtime.tv_nsec; } /* * Return the DIO alignment restrictions if requested. We only return * this information when requested, since on encrypted files it might * take a fair bit of work to get if the file wasn't opened recently. */ if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) { u32 dio_align = ext4_dio_alignment(inode); stat->result_mask |= STATX_DIOALIGN; if (dio_align == 1) { struct block_device *bdev = inode->i_sb->s_bdev; /* iomap defaults */ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; stat->dio_offset_align = bdev_logical_block_size(bdev); } else { stat->dio_mem_align = dio_align; stat->dio_offset_align = dio_align; } } if ((request_mask & STATX_WRITE_ATOMIC) && S_ISREG(inode->i_mode)) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned int awu_min = 0, awu_max = 0; if (ext4_inode_can_atomic_write(inode)) { awu_min = sbi->s_awu_min; awu_max = sbi->s_awu_max; } generic_fill_statx_atomic_writes(stat, awu_min, awu_max, 0); } flags = ei->i_flags & EXT4_FL_USER_VISIBLE; if (flags & EXT4_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; if (flags & EXT4_COMPR_FL) stat->attributes |= STATX_ATTR_COMPRESSED; if (flags & EXT4_ENCRYPT_FL) stat->attributes |= STATX_ATTR_ENCRYPTED; if (flags & EXT4_IMMUTABLE_FL) stat->attributes |= STATX_ATTR_IMMUTABLE; if (flags & EXT4_NODUMP_FL) stat->attributes |= STATX_ATTR_NODUMP; if (flags & EXT4_VERITY_FL) stat->attributes |= STATX_ATTR_VERITY; stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_COMPRESSED | STATX_ATTR_ENCRYPTED | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP | STATX_ATTR_VERITY); generic_fillattr(idmap, request_mask, inode, stat); return 0; } int ext4_file_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); u64 delalloc_blocks; ext4_getattr(idmap, path, stat, request_mask, query_flags); /* * If there is inline data in the inode, the inode will normally not * have data blocks allocated (it may have an external xattr block). * Report at least one sector for such files, so tools like tar, rsync, * others don't incorrectly think the file is completely sparse. */ if (unlikely(ext4_has_inline_data(inode))) stat->blocks += (stat->size + 511) >> 9; /* * We can't update i_blocks if the block allocation is delayed * otherwise in the case of system crash before the real block * allocation is done, we will have i_blocks inconsistent with * on-disk file blocks. 
* We always keep i_blocks updated together with real * allocation. But so as not to confuse userspace, stat * will return the blocks that include the delayed allocation * blocks for this file. */ delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), EXT4_I(inode)->i_reserved_data_blocks); stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9); return 0; } static int ext4_index_trans_blocks(struct inode *inode, int lblocks, int pextents) { if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return ext4_ind_trans_blocks(inode, lblocks); return ext4_ext_index_trans_blocks(inode, pextents); } /* * Account for index blocks, block group bitmaps and block group * descriptor blocks if we modify data blocks and index blocks. * In the worst case, the index blocks spread over different block groups. * * If data blocks are discontiguous, they may spread over * different block groups too. Even if they are contiguous, with flexbg * they could still cross a block group boundary. * * Also account for superblock, inode, quota and xattr blocks */ int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) { ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); int gdpblocks; int idxblocks; int ret; /* * How many index and leaf blocks do we need to touch to map @lblocks * logical blocks to @pextents physical extents? */ idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); /* * Now let's see how many group bitmaps and group descriptors we need * to account for */ groups = idxblocks + pextents; gdpblocks = groups; if (groups > ngroups) groups = ngroups; if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; /* bitmaps and block group descriptor blocks */ ret = idxblocks + groups + gdpblocks; /* Blocks for super block, inode, quota and xattr blocks */ ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); return ret; } /* * Calculate the journal credits for modifying the number of blocks * in a single extent within one transaction. 'nrblocks' is used only * for non-extent inodes. For extent type inodes, 'nrblocks' can be * zero if the exact number of blocks is unknown. */ int ext4_chunk_trans_extent(struct inode *inode, int nrblocks) { int ret; ret = ext4_meta_trans_blocks(inode, nrblocks, 1); /* Account for data blocks for journalled mode */ if (ext4_should_journal_data(inode)) ret += nrblocks; return ret; } /* * Calculate the journal credits for a chunk of data modification. * * This is called from DIO, fallocate or whoever else calls * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks. * * journal buffers for data blocks are not included here, as DIO * and fallocate do not need to journal data buffers. */ int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) { return ext4_meta_trans_blocks(inode, nrblocks, 1); } /* * The caller must have previously called ext4_reserve_inode_write(). * Given this, we know that the caller already has write access to iloc->bh. */ int ext4_mark_iloc_dirty(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { int err = 0; err = ext4_emergency_state(inode->i_sb); if (unlikely(err)) { put_bh(iloc->bh); return err; } ext4_fc_track_inode(handle, inode); /* the do_update_inode consumes one bh->b_count */ get_bh(iloc->bh); /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ err = ext4_do_update_inode(handle, inode, iloc); put_bh(iloc->bh); return err; } /* * On success, we end up with an outstanding reference count against * iloc->bh. This _must_ be cleaned up later.
*/ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { int err; err = ext4_emergency_state(inode->i_sb); if (unlikely(err)) return err; err = ext4_get_inode_loc(inode, iloc); if (!err) { BUFFER_TRACE(iloc->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh, EXT4_JTR_NONE); if (err) { brelse(iloc->bh); iloc->bh = NULL; } ext4_fc_track_inode(handle, inode); } ext4_std_error(inode->i_sb, err); return err; } static int __ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, struct ext4_iloc *iloc, handle_t *handle, int *no_expand) { struct ext4_inode *raw_inode; struct ext4_xattr_ibody_header *header; unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); int error; /* this was checked at iget time, but double check for good measure */ if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) || (ei->i_extra_isize & 3)) { EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)", ei->i_extra_isize, EXT4_INODE_SIZE(inode->i_sb)); return -EFSCORRUPTED; } if ((new_extra_isize < ei->i_extra_isize) || (new_extra_isize < 4) || (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE)) return -EINVAL; /* Should never happen */ raw_inode = ext4_raw_inode(iloc); header = IHDR(inode, raw_inode); /* No extended attributes present */ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize, 0, new_extra_isize - EXT4_I(inode)->i_extra_isize); EXT4_I(inode)->i_extra_isize = new_extra_isize; return 0; } /* * We may need to allocate external xattr block so we need quotas * initialized. Here we can be called with various locks held so we * cannot affort to initialize quotas ourselves. So just bail. */ if (dquot_initialize_needed(inode)) return -EAGAIN; /* try to expand with EAs present */ error = ext4_expand_extra_isize_ea(inode, new_extra_isize, raw_inode, handle); if (error) { /* * Inode size expansion failed; don't try again */ *no_expand = 1; } return error; } /* * Expand an inode by new_extra_isize bytes. * Returns 0 on success or negative error number on failure. */ static int ext4_try_to_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, struct ext4_iloc iloc, handle_t *handle) { int no_expand; int error; if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) return -EOVERFLOW; /* * In nojournal mode, we can immediately attempt to expand * the inode. When journaled, we first need to obtain extra * buffer credits since we may write into the EA block * with this same handle. If journal_extend fails, then it will * only result in a minor loss of functionality for that inode. * If this is felt to be critical, then e2fsck should be run to * force a large enough s_min_extra_isize. 
*/ if (ext4_journal_extend(handle, EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0) return -ENOSPC; if (ext4_write_trylock_xattr(inode, &no_expand) == 0) return -EBUSY; error = __ext4_expand_extra_isize(inode, new_extra_isize, &iloc, handle, &no_expand); ext4_write_unlock_xattr(inode, &no_expand); return error; } int ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, struct ext4_iloc *iloc) { handle_t *handle; int no_expand; int error, rc; if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { brelse(iloc->bh); return -EOVERFLOW; } handle = ext4_journal_start(inode, EXT4_HT_INODE, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) { error = PTR_ERR(handle); brelse(iloc->bh); return error; } ext4_write_lock_xattr(inode, &no_expand); BUFFER_TRACE(iloc->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh, EXT4_JTR_NONE); if (error) { brelse(iloc->bh); goto out_unlock; } error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc, handle, &no_expand); rc = ext4_mark_iloc_dirty(handle, inode, iloc); if (!error) error = rc; out_unlock: ext4_write_unlock_xattr(inode, &no_expand); ext4_journal_stop(handle); return error; } /* * What we do here is to mark the in-core inode as clean with respect to inode * dirtiness (it may still be data-dirty). * This means that the in-core inode may be reaped by prune_icache * without having to perform any I/O. This is a very good thing, * because *any* task may call prune_icache - even ones which * have a transaction open against a different journal. * * Is this cheating? Not really. Sure, we haven't written the * inode out, but prune_icache isn't a user-visible syncing function. * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) * we start and wait on commits. */ int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode, const char *func, unsigned int line) { struct ext4_iloc iloc; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int err; might_sleep(); trace_ext4_mark_inode_dirty(inode, _RET_IP_); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto out; if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize) ext4_try_to_expand_extra_isize(inode, sbi->s_want_extra_isize, iloc, handle); err = ext4_mark_iloc_dirty(handle, inode, &iloc); out: if (unlikely(err)) ext4_error_inode_err(inode, func, line, 0, err, "mark_inode_dirty error"); return err; } /* * ext4_dirty_inode() is called from __mark_inode_dirty() * * We're really interested in the case where a file is being extended. * i_size has been changed by generic_commit_write() and we thus need * to include the updated inode in the current transaction. * * Also, dquot_alloc_block() will always dirty the inode when blocks * are allocated to the file. * * If the inode is marked synchronous, we don't honour that here - doing * so would cause a commit on atime updates, which we don't bother doing. * We handle synchronous inodes at the highest possible level. */ void ext4_dirty_inode(struct inode *inode, int flags) { handle_t *handle; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) return; ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); } int ext4_change_inode_journal_flag(struct inode *inode, int val) { journal_t *journal; handle_t *handle; int err; int alloc_ctx; /* * We have to be very careful here: changing a data block's * journaling status dynamically is dangerous. 
If we write a * data block to the journal, change the status and then delete * that block, we risk forgetting to revoke the old log record * from the journal and so a subsequent replay can corrupt data. * So, first we make sure that the journal is empty and that * nobody is changing anything. */ journal = EXT4_JOURNAL(inode); if (!journal) return 0; if (is_journal_aborted(journal)) return -EROFS; /* Wait for all existing dio workers */ inode_dio_wait(inode); /* * Before flushing the journal and switching inode's aops, we have * to flush all dirty data the inode has. There can be outstanding * delayed allocations, there can be unwritten extents created by * fallocate or buffered writes in dioread_nolock mode covered by * dirty data which can be converted only after flushing the dirty * data (and journalled aops don't know how to handle these cases). */ filemap_invalidate_lock(inode->i_mapping); err = filemap_write_and_wait(inode->i_mapping); if (err < 0) { filemap_invalidate_unlock(inode->i_mapping); return err; } /* Before switch the inode journalling mode evict all the page cache. */ truncate_pagecache(inode, 0); alloc_ctx = ext4_writepages_down_write(inode->i_sb); jbd2_journal_lock_updates(journal); /* * OK, there are no updates running now, and all cached data is * synced to disk. We are now in a completely consistent state * which doesn't have anything in the journal, and we know that * no filesystem updates are running, so it is safe to modify * the inode's in-core data-journaling state flag now. */ if (val) ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); else { err = jbd2_journal_flush(journal, 0); if (err < 0) { jbd2_journal_unlock_updates(journal); ext4_writepages_up_write(inode->i_sb, alloc_ctx); filemap_invalidate_unlock(inode->i_mapping); return err; } ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); } ext4_set_aops(inode); ext4_set_inode_mapping_order(inode); jbd2_journal_unlock_updates(journal); ext4_writepages_up_write(inode->i_sb, alloc_ctx); filemap_invalidate_unlock(inode->i_mapping); /* Finally we can mark the inode as dirty. */ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle); err = ext4_mark_inode_dirty(handle, inode); ext4_handle_sync(handle); ext4_journal_stop(handle); ext4_std_error(inode->i_sb, err); return err; } static int ext4_bh_unmapped(handle_t *handle, struct inode *inode, struct buffer_head *bh) { return !buffer_mapped(bh); } static int ext4_block_page_mkwrite(struct inode *inode, struct folio *folio, get_block_t get_block) { handle_t *handle; loff_t size; unsigned long len; int credits; int ret; credits = ext4_chunk_trans_extent(inode, ext4_journal_blocks_per_folio(inode)); handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, credits); if (IS_ERR(handle)) return PTR_ERR(handle); folio_lock(folio); size = i_size_read(inode); /* Page got truncated from under us? 
*/ if (folio->mapping != inode->i_mapping || folio_pos(folio) > size) { ret = -EFAULT; goto out_error; } len = folio_size(folio); if (folio_pos(folio) + len > size) len = size - folio_pos(folio); ret = ext4_block_write_begin(handle, folio, 0, len, get_block); if (ret) goto out_error; if (!ext4_should_journal_data(inode)) { block_commit_write(folio, 0, len); folio_mark_dirty(folio); } else { ret = ext4_journal_folio_buffers(handle, folio, len); if (ret) goto out_error; } ext4_journal_stop(handle); folio_wait_stable(folio); return ret; out_error: folio_unlock(folio); ext4_journal_stop(handle); return ret; } vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio = page_folio(vmf->page); loff_t size; unsigned long len; int err; vm_fault_t ret; struct file *file = vma->vm_file; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; get_block_t *get_block = ext4_get_block; int retries = 0; if (unlikely(IS_IMMUTABLE(inode))) return VM_FAULT_SIGBUS; sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); filemap_invalidate_lock_shared(mapping); err = ext4_convert_inline_data(inode); if (err) goto out_ret; /* * On data journalling we skip straight to the transaction handle: * there's no delalloc; page truncated will be checked later; the * early return w/ all buffers mapped (calculates size/len) can't * be used; and there's no dioread_nolock, so only ext4_get_block. */ if (ext4_should_journal_data(inode)) goto retry_alloc; /* Delalloc case is easy... */ if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb)) { do { err = block_page_mkwrite(vma, vmf, ext4_da_get_block_prep); } while (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)); goto out_ret; } folio_lock(folio); size = i_size_read(inode); /* Page got truncated from under us? */ if (folio->mapping != mapping || folio_pos(folio) > size) { folio_unlock(folio); ret = VM_FAULT_NOPAGE; goto out; } len = folio_size(folio); if (folio_pos(folio) + len > size) len = size - folio_pos(folio); /* * Return if we have all the buffers mapped. This avoids the need to do * journal_start/journal_stop which can block and take a long time * * This cannot be done for data journalling, as we have to add the * inode to the transaction's list to writeprotect pages on commit. */ if (folio_buffers(folio)) { if (!ext4_walk_page_buffers(NULL, inode, folio_buffers(folio), 0, len, NULL, ext4_bh_unmapped)) { /* Wait so that we don't change page under IO */ folio_wait_stable(folio); ret = VM_FAULT_LOCKED; goto out; } } folio_unlock(folio); /* OK, we need to fill the hole... */ if (ext4_should_dioread_nolock(inode)) get_block = ext4_get_block_unwritten; retry_alloc: /* Start journal and allocate blocks */ err = ext4_block_page_mkwrite(inode, folio, get_block); if (err == -EAGAIN || (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))) goto retry_alloc; out_ret: ret = vmf_fs_error(err); out: filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(inode->i_sb); return ret; } |
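/*
 * Editor's illustration (not part of the kernel sources): a minimal,
 * standalone userspace sketch of the "first inode in the inode table
 * block" rounding used by ext4_update_other_inodes_time() earlier in
 * this file.  Inode numbers are one-based, and inodes_per_block is a
 * power of two (block size and inode size both are), so masking the
 * zero-based index rounds down to the start of the block.  The helper
 * name first_ino_in_block() is invented for this sketch.
 */
#include <assert.h>
#include <stdio.h>

static unsigned long first_ino_in_block(unsigned long orig_ino,
					unsigned long inodes_per_block)
{
	/* same expression as in ext4_update_other_inodes_time() */
	return ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1;
}

int main(void)
{
	/* 4k blocks and 256-byte inodes give 16 inodes per block */
	assert(first_ino_in_block(1, 16) == 1);
	assert(first_ino_in_block(16, 16) == 1);	/* last inode of block 0 */
	assert(first_ino_in_block(17, 16) == 17);	/* first inode of block 1 */
	assert(first_ino_in_block(23, 16) == 17);
	printf("inode 23 shares its inode table block with inodes 17..32\n");
	return 0;
}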
| 24 23 24 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* audit.h -- Auditing support * * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. * All Rights Reserved. * * Written by Rickard E. 
(Rik) Faith <faith@redhat.com> */ #ifndef _LINUX_AUDIT_H_ #define _LINUX_AUDIT_H_ #include <linux/sched.h> #include <linux/ptrace.h> #include <linux/audit_arch.h> #include <uapi/linux/audit.h> #include <uapi/linux/fanotify.h> #define AUDIT_STATUS_ALL (AUDIT_STATUS_ENABLED | \ AUDIT_STATUS_FAILURE | \ AUDIT_STATUS_PID | \ AUDIT_STATUS_RATE_LIMIT | \ AUDIT_STATUS_BACKLOG_LIMIT | \ AUDIT_STATUS_BACKLOG_WAIT_TIME | \ AUDIT_STATUS_LOST | \ AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL) #define AUDIT_INO_UNSET ((u64)-1) #define AUDIT_DEV_UNSET ((dev_t)-1) struct audit_sig_info { uid_t uid; pid_t pid; char ctx[]; }; struct audit_buffer; struct audit_context; struct inode; struct netlink_skb_parms; struct path; struct linux_binprm; struct mq_attr; struct mqstat; struct audit_watch; struct audit_tree; struct sk_buff; struct kern_ipc_perm; struct lsm_id; struct lsm_prop; struct audit_krule { u32 pflags; u32 flags; u32 listnr; u32 action; u32 mask[AUDIT_BITMASK_SIZE]; u32 buflen; /* for data alloc on list rules */ u32 field_count; char *filterkey; /* ties events to rules */ struct audit_field *fields; struct audit_field *arch_f; /* quick access to arch field */ struct audit_field *inode_f; /* quick access to an inode field */ struct audit_watch *watch; /* associated watch */ struct audit_tree *tree; /* associated watched tree */ struct audit_fsnotify_mark *exe; struct list_head rlist; /* entry in audit_{watch,tree}.rules list */ struct list_head list; /* for AUDIT_LIST* purposes only */ u64 prio; }; /* Flag to indicate legacy AUDIT_LOGINUID unset usage */ #define AUDIT_LOGINUID_LEGACY 0x1 struct audit_field { u32 type; union { u32 val; kuid_t uid; kgid_t gid; struct { char *lsm_str; void *lsm_rule; }; }; u32 op; }; enum audit_ntp_type { AUDIT_NTP_OFFSET, AUDIT_NTP_FREQ, AUDIT_NTP_STATUS, AUDIT_NTP_TAI, AUDIT_NTP_TICK, AUDIT_NTP_ADJUST, AUDIT_NTP_NVALS /* count */ }; #ifdef CONFIG_AUDITSYSCALL struct audit_ntp_val { long long oldval, newval; }; struct audit_ntp_data { struct audit_ntp_val vals[AUDIT_NTP_NVALS]; }; #else struct audit_ntp_data {}; #endif enum audit_nfcfgop { AUDIT_XT_OP_REGISTER, AUDIT_XT_OP_REPLACE, AUDIT_XT_OP_UNREGISTER, AUDIT_NFT_OP_TABLE_REGISTER, AUDIT_NFT_OP_TABLE_UNREGISTER, AUDIT_NFT_OP_CHAIN_REGISTER, AUDIT_NFT_OP_CHAIN_UNREGISTER, AUDIT_NFT_OP_RULE_REGISTER, AUDIT_NFT_OP_RULE_UNREGISTER, AUDIT_NFT_OP_SET_REGISTER, AUDIT_NFT_OP_SET_UNREGISTER, AUDIT_NFT_OP_SETELEM_REGISTER, AUDIT_NFT_OP_SETELEM_UNREGISTER, AUDIT_NFT_OP_GEN_REGISTER, AUDIT_NFT_OP_OBJ_REGISTER, AUDIT_NFT_OP_OBJ_UNREGISTER, AUDIT_NFT_OP_OBJ_RESET, AUDIT_NFT_OP_FLOWTABLE_REGISTER, AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, AUDIT_NFT_OP_SETELEM_RESET, AUDIT_NFT_OP_RULE_RESET, AUDIT_NFT_OP_INVALID, }; extern int __init audit_register_class(int class, unsigned *list); extern int audit_classify_syscall(int abi, unsigned syscall); extern int audit_classify_arch(int arch); /* audit_names->type values */ #define AUDIT_TYPE_UNKNOWN 0 /* we don't know yet */ #define AUDIT_TYPE_NORMAL 1 /* a "normal" audit record */ #define AUDIT_TYPE_PARENT 2 /* a parent audit record */ #define AUDIT_TYPE_CHILD_DELETE 3 /* a child being deleted */ #define AUDIT_TYPE_CHILD_CREATE 4 /* a child being created */ /* maximized args number that audit_socketcall can process */ #define AUDITSC_ARGS 6 /* bit values for ->signal->audit_tty */ #define AUDIT_TTY_ENABLE BIT(0) #define AUDIT_TTY_LOG_PASSWD BIT(1) /* bit values for audit_cfg_lsm */ #define AUDIT_CFG_LSM_SECCTX_SUBJECT BIT(0) #define AUDIT_CFG_LSM_SECCTX_OBJECT BIT(1) struct filename; #define AUDIT_OFF 0 
#define AUDIT_ON 1 #define AUDIT_LOCKED 2 #ifdef CONFIG_AUDIT /* These are defined in audit.c */ /* Public API */ extern __printf(4, 5) void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...); extern struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type); extern __printf(2, 3) void audit_log_format(struct audit_buffer *ab, const char *fmt, ...); extern void audit_log_end(struct audit_buffer *ab); extern bool audit_string_contains_control(const char *string, size_t len); extern void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len); extern void audit_log_n_string(struct audit_buffer *ab, const char *buf, size_t n); extern void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string, size_t n); extern void audit_log_untrustedstring(struct audit_buffer *ab, const char *string); extern void audit_log_d_path(struct audit_buffer *ab, const char *prefix, const struct path *path); extern void audit_log_key(struct audit_buffer *ab, char *key); extern void audit_log_path_denied(int type, const char *operation); extern void audit_log_lost(const char *message); extern int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop); extern int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop); extern int audit_log_task_context(struct audit_buffer *ab); extern void audit_log_task_info(struct audit_buffer *ab); extern int audit_log_nf_skb(struct audit_buffer *ab, const struct sk_buff *skb, u8 nfproto); extern int audit_update_lsm_rules(void); /* Private API (for audit.c only) */ extern int audit_rule_change(int type, int seq, void *data, size_t datasz); extern int audit_list_rules_send(struct sk_buff *request_skb, int seq); extern int audit_set_loginuid(kuid_t loginuid); static inline kuid_t audit_get_loginuid(struct task_struct *tsk) { return tsk->loginuid; } static inline unsigned int audit_get_sessionid(struct task_struct *tsk) { return tsk->sessionid; } extern u32 audit_enabled; extern int audit_signal_info(int sig, struct task_struct *t); extern void audit_cfg_lsm(const struct lsm_id *lsmid, int flags); #else /* CONFIG_AUDIT */ static inline __printf(4, 5) void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...) { } static inline struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { return NULL; } static inline __printf(2, 3) void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) 
{ } static inline void audit_log_end(struct audit_buffer *ab) { } static inline void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len) { } static inline void audit_log_n_string(struct audit_buffer *ab, const char *buf, size_t n) { } static inline void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string, size_t n) { } static inline void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) { } static inline void audit_log_d_path(struct audit_buffer *ab, const char *prefix, const struct path *path) { } static inline void audit_log_key(struct audit_buffer *ab, char *key) { } static inline void audit_log_path_denied(int type, const char *operation) { } static inline int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop) { return 0; } static inline int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop) { return 0; } static inline int audit_log_task_context(struct audit_buffer *ab) { return 0; } static inline void audit_log_task_info(struct audit_buffer *ab) { } static inline int audit_log_nf_skb(struct audit_buffer *ab, const struct sk_buff *skb, u8 nfproto) { return 0; } static inline kuid_t audit_get_loginuid(struct task_struct *tsk) { return INVALID_UID; } static inline unsigned int audit_get_sessionid(struct task_struct *tsk) { return AUDIT_SID_UNSET; } #define audit_enabled AUDIT_OFF static inline int audit_signal_info(int sig, struct task_struct *t) { return 0; } static inline void audit_cfg_lsm(const struct lsm_id *lsmid, int flags) { } #endif /* CONFIG_AUDIT */ #ifdef CONFIG_AUDIT_COMPAT_GENERIC #define audit_is_compat(arch) (!((arch) & __AUDIT_ARCH_64BIT)) #else #define audit_is_compat(arch) false #endif #define AUDIT_INODE_PARENT 1 /* dentry represents the parent */ #define AUDIT_INODE_HIDDEN 2 /* audit record should be hidden */ #define AUDIT_INODE_NOEVAL 4 /* audit record incomplete */ #ifdef CONFIG_AUDITSYSCALL #include <asm/syscall.h> /* for syscall_get_arch() */ /* These are defined in auditsc.c */ /* Public API */ extern int audit_alloc(struct task_struct *task); extern void __audit_free(struct task_struct *task); extern void __audit_uring_entry(u8 op); extern void __audit_uring_exit(int success, long code); extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3); extern void __audit_syscall_exit(int ret_success, long ret_value); extern void __audit_getname(struct filename *name); extern void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags); extern void __audit_file(const struct file *); extern void __audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type); extern void audit_seccomp(unsigned long syscall, long signr, int code); extern void audit_seccomp_actions_logged(const char *names, const char *old_names, int res); extern void __audit_ptrace(struct task_struct *t); static inline void audit_set_context(struct task_struct *task, struct audit_context *ctx) { task->audit_context = ctx; } static inline struct audit_context *audit_context(void) { return current->audit_context; } static inline bool audit_dummy_context(void) { void *p = audit_context(); return !p || *(int *)p; } static inline void audit_free(struct task_struct *task) { if (unlikely(task->audit_context)) __audit_free(task); } static inline void audit_uring_entry(u8 op) { /* * We intentionally check audit_context() before audit_enabled as most * Linux systems (as of ~2021) rely on systemd 
which forces audit to * be enabled regardless of the user's audit configuration. */ if (unlikely(audit_context() && audit_enabled)) __audit_uring_entry(op); } static inline void audit_uring_exit(int success, long code) { if (unlikely(audit_context())) __audit_uring_exit(success, code); } static inline void audit_syscall_entry(int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3) { if (unlikely(audit_context())) __audit_syscall_entry(major, a0, a1, a2, a3); } static inline void audit_syscall_exit(void *pt_regs) { if (unlikely(audit_context())) { int success = is_syscall_success(pt_regs); long return_code = regs_return_value(pt_regs); __audit_syscall_exit(success, return_code); } } static inline void audit_getname(struct filename *name) { if (unlikely(!audit_dummy_context())) __audit_getname(name); } static inline void audit_inode(struct filename *name, const struct dentry *dentry, unsigned int aflags) { if (unlikely(!audit_dummy_context())) __audit_inode(name, dentry, aflags); } static inline void audit_file(struct file *file) { if (unlikely(!audit_dummy_context())) __audit_file(file); } static inline void audit_inode_parent_hidden(struct filename *name, const struct dentry *dentry) { if (unlikely(!audit_dummy_context())) __audit_inode(name, dentry, AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN); } static inline void audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { if (unlikely(!audit_dummy_context())) __audit_inode_child(parent, dentry, type); } void audit_core_dumps(long signr); static inline void audit_ptrace(struct task_struct *t) { if (unlikely(!audit_dummy_context())) __audit_ptrace(t); } /* Private API (for audit.c only) */ extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp); extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode); extern void __audit_bprm(struct linux_binprm *bprm); extern int __audit_socketcall(int nargs, unsigned long *args); extern int __audit_sockaddr(int len, void *addr); extern void __audit_fd_pair(int fd1, int fd2); extern void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr); extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout); extern void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification); extern void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat); extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old); extern void __audit_log_capset(const struct cred *new, const struct cred *old); extern void __audit_mmap_fd(int fd, int flags); extern void __audit_openat2_how(struct open_how *how); extern void __audit_log_kern_module(const char *name); extern void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar); extern void __audit_tk_injoffset(struct timespec64 offset); extern void __audit_ntp_log(const struct audit_ntp_data *ad); extern void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, enum audit_nfcfgop op, gfp_t gfp); static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp) { if (unlikely(!audit_dummy_context())) __audit_ipc_obj(ipcp); } static inline void audit_fd_pair(int fd1, int fd2) { if (unlikely(!audit_dummy_context())) __audit_fd_pair(fd1, fd2); } static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) { if (unlikely(!audit_dummy_context())) __audit_ipc_set_perm(qbytes, uid, gid, mode); } 
static inline void audit_bprm(struct linux_binprm *bprm) { if (unlikely(!audit_dummy_context())) __audit_bprm(bprm); } static inline int audit_socketcall(int nargs, unsigned long *args) { if (unlikely(!audit_dummy_context())) return __audit_socketcall(nargs, args); return 0; } static inline int audit_socketcall_compat(int nargs, u32 *args) { unsigned long a[AUDITSC_ARGS]; int i; if (audit_dummy_context()) return 0; for (i = 0; i < nargs; i++) a[i] = (unsigned long)args[i]; return __audit_socketcall(nargs, a); } static inline int audit_sockaddr(int len, void *addr) { if (unlikely(!audit_dummy_context())) return __audit_sockaddr(len, addr); return 0; } static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { if (unlikely(!audit_dummy_context())) __audit_mq_open(oflag, mode, attr); } static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout) { if (unlikely(!audit_dummy_context())) __audit_mq_sendrecv(mqdes, msg_len, msg_prio, abs_timeout); } static inline void audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) { if (unlikely(!audit_dummy_context())) __audit_mq_notify(mqdes, notification); } static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) { if (unlikely(!audit_dummy_context())) __audit_mq_getsetattr(mqdes, mqstat); } static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old) { if (unlikely(!audit_dummy_context())) return __audit_log_bprm_fcaps(bprm, new, old); return 0; } static inline void audit_log_capset(const struct cred *new, const struct cred *old) { if (unlikely(!audit_dummy_context())) __audit_log_capset(new, old); } static inline void audit_mmap_fd(int fd, int flags) { if (unlikely(!audit_dummy_context())) __audit_mmap_fd(fd, flags); } static inline void audit_openat2_how(struct open_how *how) { if (unlikely(!audit_dummy_context())) __audit_openat2_how(how); } static inline void audit_log_kern_module(const char *name) { if (!audit_dummy_context()) __audit_log_kern_module(name); } static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) { if (audit_enabled) __audit_fanotify(response, friar); } static inline void audit_tk_injoffset(struct timespec64 offset) { /* ignore no-op events */ if (offset.tv_sec == 0 && offset.tv_nsec == 0) return; if (!audit_dummy_context()) __audit_tk_injoffset(offset); } static inline void audit_ntp_init(struct audit_ntp_data *ad) { memset(ad, 0, sizeof(*ad)); } static inline void audit_ntp_set_old(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { ad->vals[type].oldval = val; } static inline void audit_ntp_set_new(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { ad->vals[type].newval = val; } static inline void audit_ntp_log(const struct audit_ntp_data *ad) { if (!audit_dummy_context()) __audit_ntp_log(ad); } static inline void audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, enum audit_nfcfgop op, gfp_t gfp) { if (audit_enabled) __audit_log_nfcfg(name, af, nentries, op, gfp); } extern int audit_n_rules; extern int audit_signals; #else /* CONFIG_AUDITSYSCALL */ static inline int audit_alloc(struct task_struct *task) { return 0; } static inline void audit_free(struct task_struct *task) { } static inline void audit_uring_entry(u8 op) { } static inline void audit_uring_exit(int success, long code) { } static inline void audit_syscall_entry(int major, unsigned long a0, unsigned long 
a1, unsigned long a2, unsigned long a3) { } static inline void audit_syscall_exit(void *pt_regs) { } static inline bool audit_dummy_context(void) { return true; } static inline void audit_set_context(struct task_struct *task, struct audit_context *ctx) { } static inline struct audit_context *audit_context(void) { return NULL; } static inline void audit_getname(struct filename *name) { } static inline void audit_inode(struct filename *name, const struct dentry *dentry, unsigned int aflags) { } static inline void audit_file(struct file *file) { } static inline void audit_inode_parent_hidden(struct filename *name, const struct dentry *dentry) { } static inline void audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { } static inline void audit_core_dumps(long signr) { } static inline void audit_seccomp(unsigned long syscall, long signr, int code) { } static inline void audit_seccomp_actions_logged(const char *names, const char *old_names, int res) { } static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp) { } static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) { } static inline void audit_bprm(struct linux_binprm *bprm) { } static inline int audit_socketcall(int nargs, unsigned long *args) { return 0; } static inline int audit_socketcall_compat(int nargs, u32 *args) { return 0; } static inline void audit_fd_pair(int fd1, int fd2) { } static inline int audit_sockaddr(int len, void *addr) { return 0; } static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { } static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout) { } static inline void audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) { } static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) { } static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old) { return 0; } static inline void audit_log_capset(const struct cred *new, const struct cred *old) { } static inline void audit_mmap_fd(int fd, int flags) { } static inline void audit_openat2_how(struct open_how *how) { } static inline void audit_log_kern_module(const char *name) { } static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) { } static inline void audit_tk_injoffset(struct timespec64 offset) { } static inline void audit_ntp_init(struct audit_ntp_data *ad) { } static inline void audit_ntp_set_old(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { } static inline void audit_ntp_set_new(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { } static inline void audit_ntp_log(const struct audit_ntp_data *ad) { } static inline void audit_ptrace(struct task_struct *t) { } static inline void audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, enum audit_nfcfgop op, gfp_t gfp) { } #define audit_n_rules 0 #define audit_signals 0 #endif /* CONFIG_AUDITSYSCALL */ static inline bool audit_loginuid_set(struct task_struct *tsk) { return uid_valid(audit_get_loginuid(tsk)); } #endif |
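/*
 * Editor's illustration (not from the kernel tree): a stripped-down,
 * compile-on-its-own sketch of the wrapper pattern audit.h uses above.
 * The public helpers stay static inline and only branch on a cheap
 * "is auditing relevant for this task?" check; the out-of-line
 * implementation runs only when a context is attached, and the whole
 * thing collapses to empty stubs when the feature is compiled out.
 * CONFIG_EXAMPLE_AUDIT and the example_audit_*() names are made up.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define CONFIG_EXAMPLE_AUDIT 1

#ifdef CONFIG_EXAMPLE_AUDIT
/* stand-in for current->audit_context */
static void *example_audit_context;

/* out-of-line slow path, reached only when a context is attached */
static void __example_audit_open(const char *path)
{
	printf("audit: open %s\n", path);
}

static inline bool example_audit_dummy_context(void)
{
	/* simplified: the real check also looks at the context state */
	return example_audit_context == NULL;
}

static inline void example_audit_open(const char *path)
{
	if (!example_audit_dummy_context())
		__example_audit_open(path);
}
#else /* !CONFIG_EXAMPLE_AUDIT */
static inline void example_audit_open(const char *path) { }
#endif

int main(void)
{
	example_audit_open("/etc/passwd");	/* no context: nothing logged */
	example_audit_context = &example_audit_context;
	example_audit_open("/etc/passwd");	/* context attached: logged */
	return 0;
}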
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2010 LG Electronics * Chan Jeong <chan.jeong@lge.com> * * lzo_wrapper.c */ #include <linux/mutex.h> #include <linux/bio.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/lzo.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs.h" #include "decompressor.h" #include "page_actor.h" struct squashfs_lzo { void *input; void *output; }; static void *lzo_init(struct squashfs_sb_info *msblk, void *buff) { int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE); struct squashfs_lzo *stream = kzalloc_obj(*stream); if (stream == NULL) goto failed; stream->input = vmalloc(block_size); if (stream->input == NULL) goto failed; stream->output = vmalloc(block_size); if (stream->output == NULL) goto failed2; return stream; failed2: vfree(stream->input); failed: ERROR("Failed to allocate lzo workspace\n"); kfree(stream); return ERR_PTR(-ENOMEM); } static void lzo_free(void *strm) { struct squashfs_lzo *stream = strm; if (stream) { vfree(stream->input); vfree(stream->output); } kfree(stream); } static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { struct bvec_iter_all iter_all = {}; struct bio_vec *bvec = bvec_init_iter_all(&iter_all); struct squashfs_lzo *stream = strm; void *buff = stream->input, *data; int bytes = length, res; size_t out_len = output->length; while (bio_next_segment(bio, &iter_all)) { int avail = min(bytes, ((int)bvec->bv_len) - offset); data = bvec_virt(bvec); memcpy(buff, data + offset, avail); buff += avail; bytes -= avail; offset = 0; } res = lzo1x_decompress_safe(stream->input, (size_t)length, stream->output, &out_len); if (res != LZO_E_OK) goto failed; res = bytes = (int)out_len; data = squashfs_first_page(output); buff = stream->output; while (data) { if (bytes <= PAGE_SIZE) { if (!IS_ERR(data)) memcpy(data, buff, bytes); break; } else { if (!IS_ERR(data)) memcpy(data, buff, PAGE_SIZE); buff += PAGE_SIZE; bytes -= PAGE_SIZE; data = squashfs_next_page(output); } } squashfs_finish_page(output); return res; failed: return -EIO; } const struct squashfs_decompressor squashfs_lzo_comp_ops = { .init = lzo_init, .free = lzo_free, .decompress = lzo_uncompress, .id = LZO_COMPRESSION, .name = "lzo", .alloc_buffer = 0, .supported = 1 }; |
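/*
 * Editor's illustration (not from the kernel tree): a self-contained
 * userspace sketch of the copy-out shape used by lzo_uncompress() above,
 * where one contiguous decompressed buffer is scattered across fixed-size
 * pages, copying PAGE_SIZE at a time and the remainder into the last page.
 * Real decompression is omitted so the example needs no LZO library;
 * EXAMPLE_PAGE_SIZE and scatter_to_pages() are invented names.
 */
#include <stdio.h>
#include <string.h>

#define EXAMPLE_PAGE_SIZE 8	/* tiny "page" so the chunking is visible */

static void scatter_to_pages(const char *buff, int bytes,
			     char pages[][EXAMPLE_PAGE_SIZE], int npages)
{
	int i = 0;

	while (i < npages) {
		if (bytes <= EXAMPLE_PAGE_SIZE) {
			memcpy(pages[i], buff, bytes);	/* final partial page */
			break;
		}
		memcpy(pages[i], buff, EXAMPLE_PAGE_SIZE);
		buff += EXAMPLE_PAGE_SIZE;
		bytes -= EXAMPLE_PAGE_SIZE;
		i++;
	}
}

int main(void)
{
	const char data[] = "abcdefghijklmnopqrstu";	/* 21 bytes + NUL */
	char pages[3][EXAMPLE_PAGE_SIZE];

	scatter_to_pages(data, (int)sizeof(data), pages, 3);
	printf("page 2 starts with '%c'\n", pages[2][0]);	/* prints 'q' */
	return 0;
}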
911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> */ /* * fsnotify inode mark locking/lifetime/and refcnting * * REFCNT: * The group->recnt and mark->refcnt tell how many "things" in the kernel * currently are referencing the objects. Both kind of objects typically will * live inside the kernel with a refcnt of 2, one for its creation and one for * the reference a group and a mark hold to each other. * If you are holding the appropriate locks, you can take a reference and the * object itself is guaranteed to survive until the reference is dropped. * * LOCKING: * There are 3 locks involved with fsnotify inode marks and they MUST be taken * in order as follows: * * group->mark_mutex * mark->lock * mark->connector->lock * * group->mark_mutex protects the marks_list anchored inside a given group and * each mark is hooked via the g_list. It also protects the groups private * data (i.e group limits). * mark->lock protects the marks attributes like its masks and flags. * Furthermore it protects the access to a reference of the group that the mark * is assigned to as well as the access to a reference of the inode/vfsmount * that is being watched by the mark. * * mark->connector->lock protects the list of marks anchored inside an * inode / vfsmount and each mark is hooked via the i_list. * * A list of notification marks relating to inode / mnt is contained in * fsnotify_mark_connector. That structure is alive as long as there are any * marks in the list and is also protected by fsnotify_mark_srcu. A mark gets * detached from fsnotify_mark_connector when last reference to the mark is * dropped. Thus having mark reference is enough to protect mark->connector * pointer and to make sure fsnotify_mark_connector cannot disappear. Also * because we remove mark from g_list before dropping mark reference associated * with that, any mark found through g_list is guaranteed to have * mark->connector set until we drop group->mark_mutex. * * LIFETIME: * Inode marks survive between when they are added to an inode and when their * refcnt==0. Marks are also protected by fsnotify_mark_srcu. * * The inode mark can be cleared for a number of different reasons including: * - The inode is unlinked for the last time. (fsnotify_inode_remove) * - The inode is being evicted from cache. (fsnotify_inode_delete) * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes) * - Something explicitly requests that it be removed. 
(fsnotify_destroy_mark) * - The fsnotify_group associated with the mark is going away and all such marks * need to be cleaned up. (fsnotify_clear_marks_by_group) * * This has the very interesting property of being able to run concurrently with * any (or all) other directions. */ #include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/srcu.h> #include <linux/ratelimit.h> #include <linux/atomic.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" #define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */ struct srcu_struct fsnotify_mark_srcu; static struct kmem_cache *fsnotify_mark_connector_cachep; static struct kmem_cache *fsnotify_inode_mark_connector_cachep; static DEFINE_SPINLOCK(destroy_lock); static LIST_HEAD(destroy_list); static struct fsnotify_mark_connector *connector_destroy_list; static void fsnotify_mark_destroy_workfn(struct work_struct *work); static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn); static void fsnotify_connector_destroy_workfn(struct work_struct *work); static DECLARE_WORK(connector_reaper_work, fsnotify_connector_destroy_workfn); void fsnotify_get_mark(struct fsnotify_mark *mark) { WARN_ON_ONCE(!refcount_read(&mark->refcnt)); refcount_inc(&mark->refcnt); } static fsnotify_connp_t *fsnotify_object_connp(void *obj, enum fsnotify_obj_type obj_type) { switch (obj_type) { case FSNOTIFY_OBJ_TYPE_INODE: return &((struct inode *)obj)->i_fsnotify_marks; case FSNOTIFY_OBJ_TYPE_VFSMOUNT: return &real_mount(obj)->mnt_fsnotify_marks; case FSNOTIFY_OBJ_TYPE_SB: return fsnotify_sb_marks(obj); case FSNOTIFY_OBJ_TYPE_MNTNS: return &((struct mnt_namespace *)obj)->n_fsnotify_marks; default: return NULL; } } static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn) { if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) return &fsnotify_conn_inode(conn)->i_fsnotify_mask; else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) return &fsnotify_conn_mount(conn)->mnt_fsnotify_mask; else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) return &fsnotify_conn_sb(conn)->s_fsnotify_mask; else if (conn->type == FSNOTIFY_OBJ_TYPE_MNTNS) return &fsnotify_conn_mntns(conn)->n_fsnotify_mask; return NULL; } __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn) { if (WARN_ON(!fsnotify_valid_obj_type(conn->type))) return 0; return READ_ONCE(*fsnotify_conn_mask_p(conn)); } static void fsnotify_get_sb_watched_objects(struct super_block *sb) { atomic_long_inc(fsnotify_sb_watched_objects(sb)); } static void fsnotify_put_sb_watched_objects(struct super_block *sb) { atomic_long_t *watched_objects = fsnotify_sb_watched_objects(sb); /* the superblock can go away after this decrement */ if (atomic_long_dec_and_test(watched_objects)) wake_up_var(watched_objects); } static void fsnotify_get_inode_ref(struct inode *inode) { ihold(inode); fsnotify_get_sb_watched_objects(inode->i_sb); } static void fsnotify_put_inode_ref(struct inode *inode) { /* read ->i_sb before the inode can go away */ struct super_block *sb = inode->i_sb; iput(inode); fsnotify_put_sb_watched_objects(sb); } /* * Grab or drop watched objects reference depending on whether the connector * is attached and has any marks attached. 
*/ static void fsnotify_update_sb_watchers(struct super_block *sb, struct fsnotify_mark_connector *conn) { struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); bool is_watched = conn->flags & FSNOTIFY_CONN_FLAG_IS_WATCHED; struct fsnotify_mark *first_mark = NULL; unsigned int highest_prio = 0; if (conn->obj) first_mark = hlist_entry_safe(conn->list.first, struct fsnotify_mark, obj_list); if (first_mark) highest_prio = first_mark->group->priority; if (WARN_ON(highest_prio >= __FSNOTIFY_PRIO_NUM)) highest_prio = 0; /* * If the highest priority of group watching this object is prio, * then watched object has a reference on counters [0..prio]. * Update priority >= 1 watched objects counters. */ for (unsigned int p = conn->prio + 1; p <= highest_prio; p++) atomic_long_inc(&sbinfo->watched_objects[p]); for (unsigned int p = conn->prio; p > highest_prio; p--) atomic_long_dec(&sbinfo->watched_objects[p]); conn->prio = highest_prio; /* Update priority >= 0 (a.k.a total) watched objects counter */ BUILD_BUG_ON(FSNOTIFY_PRIO_NORMAL != 0); if (first_mark && !is_watched) { conn->flags |= FSNOTIFY_CONN_FLAG_IS_WATCHED; fsnotify_get_sb_watched_objects(sb); } else if (!first_mark && is_watched) { conn->flags &= ~FSNOTIFY_CONN_FLAG_IS_WATCHED; fsnotify_put_sb_watched_objects(sb); } } /* * Grab or drop inode reference for the connector if needed. * * When it's time to drop the reference, we only clear the HAS_IREF flag and * return the inode object. fsnotify_drop_object() will be resonsible for doing * iput() outside of spinlocks. This happens when last mark that wanted iref is * detached. */ static struct inode *fsnotify_update_iref(struct fsnotify_mark_connector *conn, bool want_iref) { bool has_iref = conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF; struct inode *inode = NULL; if (conn->type != FSNOTIFY_OBJ_TYPE_INODE || want_iref == has_iref) return NULL; if (want_iref) { /* Pin inode if any mark wants inode refcount held */ fsnotify_get_inode_ref(fsnotify_conn_inode(conn)); conn->flags |= FSNOTIFY_CONN_FLAG_HAS_IREF; } else { /* Unpin inode after detach of last mark that wanted iref */ inode = fsnotify_conn_inode(conn); conn->flags &= ~FSNOTIFY_CONN_FLAG_HAS_IREF; } return inode; } static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) { u32 new_mask = 0; bool want_iref = false; struct fsnotify_mark *mark; assert_spin_locked(&conn->lock); /* We can get detached connector here when inode is getting unlinked. */ if (!fsnotify_valid_obj_type(conn->type)) return NULL; hlist_for_each_entry(mark, &conn->list, obj_list) { if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) continue; new_mask |= fsnotify_calc_mask(mark); if (conn->type == FSNOTIFY_OBJ_TYPE_INODE && !(mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) want_iref = true; } /* * We use WRITE_ONCE() to prevent silly compiler optimizations from * confusing readers not holding conn->lock with partial updates. */ WRITE_ONCE(*fsnotify_conn_mask_p(conn), new_mask); return fsnotify_update_iref(conn, want_iref); } static bool fsnotify_conn_watches_children( struct fsnotify_mark_connector *conn) { if (conn->type != FSNOTIFY_OBJ_TYPE_INODE) return false; return fsnotify_inode_watches_children(fsnotify_conn_inode(conn)); } static void fsnotify_conn_set_children_dentry_flags( struct fsnotify_mark_connector *conn) { if (conn->type != FSNOTIFY_OBJ_TYPE_INODE) return; fsnotify_set_children_dentry_flags(fsnotify_conn_inode(conn)); } /* * Calculate mask of events for a list of marks. 
The caller must make sure * connector and connector->obj cannot disappear under us. Callers achieve * this by holding a mark->lock or mark->group->mark_mutex for a mark on this * list. */ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) { bool update_children; if (!conn) return; spin_lock(&conn->lock); update_children = !fsnotify_conn_watches_children(conn); __fsnotify_recalc_mask(conn); update_children &= fsnotify_conn_watches_children(conn); spin_unlock(&conn->lock); /* * Set children's PARENT_WATCHED flags only if parent started watching. * When parent stops watching, we clear false positive PARENT_WATCHED * flags lazily in __fsnotify_parent(). */ if (update_children) fsnotify_conn_set_children_dentry_flags(conn); } /* Free all connectors queued for freeing once SRCU period ends */ static void fsnotify_connector_destroy_workfn(struct work_struct *work) { struct fsnotify_mark_connector *conn, *free; spin_lock(&destroy_lock); conn = connector_destroy_list; connector_destroy_list = NULL; spin_unlock(&destroy_lock); synchronize_srcu(&fsnotify_mark_srcu); while (conn) { free = conn; conn = conn->destroy_next; kfree(free); } } static void fsnotify_untrack_connector(struct fsnotify_mark_connector *conn); static void *fsnotify_detach_connector_from_object( struct fsnotify_mark_connector *conn, unsigned int *type) { fsnotify_connp_t *connp = fsnotify_object_connp(conn->obj, conn->type); struct super_block *sb = fsnotify_connector_sb(conn); struct inode *inode = NULL; *type = conn->type; if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED) return NULL; if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) { inode = fsnotify_conn_inode(conn); inode->i_fsnotify_mask = 0; fsnotify_untrack_connector(conn); /* Unpin inode when detaching from connector */ if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF)) inode = NULL; } else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0; } else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) { fsnotify_conn_sb(conn)->s_fsnotify_mask = 0; } else if (conn->type == FSNOTIFY_OBJ_TYPE_MNTNS) { fsnotify_conn_mntns(conn)->n_fsnotify_mask = 0; } rcu_assign_pointer(*connp, NULL); conn->obj = NULL; conn->type = FSNOTIFY_OBJ_TYPE_DETACHED; if (sb) fsnotify_update_sb_watchers(sb, conn); return inode; } static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark) { struct fsnotify_group *group = mark->group; if (WARN_ON_ONCE(!group)) return; group->ops->free_mark(mark); fsnotify_put_group(group); } /* Drop object reference originally held by a connector */ static void fsnotify_drop_object(unsigned int type, void *objp) { if (!objp) return; /* Currently only inode references are passed to be dropped */ if (WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE)) return; fsnotify_put_inode_ref(objp); } void fsnotify_put_mark(struct fsnotify_mark *mark) { struct fsnotify_mark_connector *conn = READ_ONCE(mark->connector); void *objp = NULL; unsigned int type = FSNOTIFY_OBJ_TYPE_DETACHED; bool free_conn = false; /* Catch marks that were actually never attached to object */ if (!conn) { if (refcount_dec_and_test(&mark->refcnt)) fsnotify_final_mark_destroy(mark); return; } /* * We have to be careful so that traversals of obj_list under lock can * safely grab mark reference. 
*/ if (!refcount_dec_and_lock(&mark->refcnt, &conn->lock)) return; hlist_del_init_rcu(&mark->obj_list); if (hlist_empty(&conn->list)) { objp = fsnotify_detach_connector_from_object(conn, &type); free_conn = true; } else { struct super_block *sb = fsnotify_connector_sb(conn); /* Update watched objects after detaching mark */ if (sb) fsnotify_update_sb_watchers(sb, conn); objp = __fsnotify_recalc_mask(conn); type = conn->type; } WRITE_ONCE(mark->connector, NULL); spin_unlock(&conn->lock); fsnotify_drop_object(type, objp); if (free_conn) { spin_lock(&destroy_lock); conn->destroy_next = connector_destroy_list; connector_destroy_list = conn; spin_unlock(&destroy_lock); queue_work(system_dfl_wq, &connector_reaper_work); } /* * Note that we didn't update flags telling whether inode cares about * what's happening with children. We update these flags from * __fsnotify_parent() lazily when next event happens on one of our * children. */ spin_lock(&destroy_lock); list_add(&mark->g_list, &destroy_list); spin_unlock(&destroy_lock); queue_delayed_work(system_dfl_wq, &reaper_work, FSNOTIFY_REAPER_DELAY); } EXPORT_SYMBOL_GPL(fsnotify_put_mark); /* * Get mark reference when we found the mark via lockless traversal of object * list. Mark can be already removed from the list by now and on its way to be * destroyed once SRCU period ends. * * Also pin the group so it doesn't disappear under us. */ static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark) { if (!mark) return true; if (refcount_inc_not_zero(&mark->refcnt)) { spin_lock(&mark->lock); if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) { /* mark is attached, group is still alive then */ atomic_inc(&mark->group->user_waits); spin_unlock(&mark->lock); return true; } spin_unlock(&mark->lock); fsnotify_put_mark(mark); } return false; } /* * Puts marks and wakes up group destruction if necessary. * * Pairs with fsnotify_get_mark_safe() */ static void fsnotify_put_mark_wake(struct fsnotify_mark *mark) { if (mark) { struct fsnotify_group *group = mark->group; fsnotify_put_mark(mark); /* * We abuse notification_waitq on group shutdown for waiting for * all marks pinned when waiting for userspace. */ if (atomic_dec_and_test(&group->user_waits) && group->shutdown) wake_up(&group->notification_waitq); } } bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info) __releases(&fsnotify_mark_srcu) { int type; fsnotify_foreach_iter_type(type) { /* This can fail if mark is being removed */ if (!fsnotify_get_mark_safe(iter_info->marks[type])) { __release(&fsnotify_mark_srcu); goto fail; } } /* * Now that both marks are pinned by refcount in the inode / vfsmount * lists, we can drop SRCU lock, and safely resume the list iteration * once userspace returns. */ srcu_read_unlock(&fsnotify_mark_srcu, iter_info->srcu_idx); return true; fail: for (type--; type >= 0; type--) fsnotify_put_mark_wake(iter_info->marks[type]); return false; } void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info) __acquires(&fsnotify_mark_srcu) { int type; iter_info->srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); fsnotify_foreach_iter_type(type) fsnotify_put_mark_wake(iter_info->marks[type]); } /* * Mark mark as detached, remove it from group list. Mark still stays in object * list until its last reference is dropped. Note that we rely on mark being * removed from group list before corresponding reference to it is dropped. In * particular we rely on mark->connector being valid while we hold * group->mark_mutex if we found the mark through g_list. 
* * Must be called with group->mark_mutex held. The caller must either hold * reference to the mark or be protected by fsnotify_mark_srcu. */ void fsnotify_detach_mark(struct fsnotify_mark *mark) { fsnotify_group_assert_locked(mark->group); WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) && refcount_read(&mark->refcnt) < 1 + !!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)); spin_lock(&mark->lock); /* something else already called this function on this mark */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { spin_unlock(&mark->lock); return; } mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED; list_del_init(&mark->g_list); spin_unlock(&mark->lock); /* Drop mark reference acquired in fsnotify_add_mark_locked() */ fsnotify_put_mark(mark); } /* * Free fsnotify mark. The mark is actually only marked as being freed. The * freeing is actually happening only once last reference to the mark is * dropped from a workqueue which first waits for srcu period end. * * Caller must have a reference to the mark or be protected by * fsnotify_mark_srcu. */ void fsnotify_free_mark(struct fsnotify_mark *mark) { struct fsnotify_group *group = mark->group; spin_lock(&mark->lock); /* something else already called this function on this mark */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { spin_unlock(&mark->lock); return; } mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; spin_unlock(&mark->lock); /* * Some groups like to know that marks are being freed. This is a * callback to the group function to let it know that this mark * is being freed. */ if (group->ops->freeing_mark) group->ops->freeing_mark(mark, group); } void fsnotify_destroy_mark(struct fsnotify_mark *mark, struct fsnotify_group *group) { fsnotify_group_lock(group); fsnotify_detach_mark(mark); fsnotify_group_unlock(group); fsnotify_free_mark(mark); } EXPORT_SYMBOL_GPL(fsnotify_destroy_mark); /* * Sorting function for lists of fsnotify marks. * * Fanotify supports different notification classes (reflected as priority of * notification group). Events shall be passed to notification groups in * decreasing priority order. To achieve this marks in notification lists for * inodes and vfsmounts are sorted so that priorities of corresponding groups * are descending. * * Furthermore correct handling of the ignore mask requires processing inode * and vfsmount marks of each group together. Using the group address as * further sort criterion provides a unique sorting order and thus we can * merge inode and vfsmount lists of marks in linear time and find groups * present in both lists. * * A return value of 1 signifies that b has priority over a. * A return value of 0 signifies that the two marks have to be handled together. * A return value of -1 signifies that a has priority over b. 
*/ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) { if (a == b) return 0; if (!a) return 1; if (!b) return -1; if (a->priority < b->priority) return 1; if (a->priority > b->priority) return -1; if (a < b) return 1; return -1; } static int fsnotify_attach_info_to_sb(struct super_block *sb) { struct fsnotify_sb_info *sbinfo; /* sb info is freed on fsnotify_sb_delete() */ sbinfo = kzalloc_obj(*sbinfo); if (!sbinfo) return -ENOMEM; INIT_LIST_HEAD(&sbinfo->inode_conn_list); spin_lock_init(&sbinfo->list_lock); /* * cmpxchg() provides the barrier so that callers of fsnotify_sb_info() * will observe an initialized structure */ if (cmpxchg(&sb->s_fsnotify_info, NULL, sbinfo)) { /* Someone else created sbinfo for us */ kfree(sbinfo); } return 0; } struct fsnotify_inode_mark_connector { struct fsnotify_mark_connector common; struct list_head conns_list; }; static struct inode *fsnotify_get_living_inode(struct fsnotify_sb_info *sbinfo) { struct fsnotify_inode_mark_connector *iconn; struct inode *inode; spin_lock(&sbinfo->list_lock); /* Find the first non-evicting inode */ list_for_each_entry(iconn, &sbinfo->inode_conn_list, conns_list) { /* All connectors on the list are still attached to an inode */ inode = iconn->common.obj; /* * For connectors without FSNOTIFY_CONN_FLAG_HAS_IREF * (evictable marks) corresponding inode may well have 0 * refcount and can be undergoing eviction. OTOH list_lock * protects us from the connector getting detached and inode * freed. So we can poke around the inode safely. */ spin_lock(&inode->i_lock); if (likely( !(inode_state_read(inode) & (I_FREEING | I_WILL_FREE)))) { __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&sbinfo->list_lock); return inode; } spin_unlock(&inode->i_lock); } spin_unlock(&sbinfo->list_lock); return NULL; } /** * fsnotify_unmount_inodes - an sb is unmounting. Handle any watched inodes. * @sbinfo: fsnotify info for superblock being unmounted. * * Walk all inode connectors for the superblock and free all associated marks. 
*/ void fsnotify_unmount_inodes(struct fsnotify_sb_info *sbinfo) { struct inode *inode; while ((inode = fsnotify_get_living_inode(sbinfo))) { fsnotify_inode(inode, FS_UNMOUNT); fsnotify_clear_marks_by_inode(inode); iput(inode); cond_resched(); } } static void fsnotify_init_connector(struct fsnotify_mark_connector *conn, void *obj, unsigned int obj_type) { spin_lock_init(&conn->lock); INIT_HLIST_HEAD(&conn->list); conn->flags = 0; conn->prio = 0; conn->type = obj_type; conn->obj = obj; } static struct fsnotify_mark_connector * fsnotify_alloc_inode_connector(struct inode *inode) { struct fsnotify_inode_mark_connector *iconn; struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(inode->i_sb); iconn = kmem_cache_alloc(fsnotify_inode_mark_connector_cachep, GFP_KERNEL); if (!iconn) return NULL; fsnotify_init_connector(&iconn->common, inode, FSNOTIFY_OBJ_TYPE_INODE); spin_lock(&sbinfo->list_lock); list_add(&iconn->conns_list, &sbinfo->inode_conn_list); spin_unlock(&sbinfo->list_lock); return &iconn->common; } static void fsnotify_untrack_connector(struct fsnotify_mark_connector *conn) { struct fsnotify_inode_mark_connector *iconn; struct fsnotify_sb_info *sbinfo; if (conn->type != FSNOTIFY_OBJ_TYPE_INODE) return; iconn = container_of(conn, struct fsnotify_inode_mark_connector, common); sbinfo = fsnotify_sb_info(fsnotify_conn_inode(conn)->i_sb); spin_lock(&sbinfo->list_lock); list_del(&iconn->conns_list); spin_unlock(&sbinfo->list_lock); } static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, void *obj, unsigned int obj_type) { struct fsnotify_mark_connector *conn; if (obj_type == FSNOTIFY_OBJ_TYPE_INODE) { struct inode *inode = obj; conn = fsnotify_alloc_inode_connector(inode); } else { conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_KERNEL); if (conn) fsnotify_init_connector(conn, obj, obj_type); } if (!conn) return -ENOMEM; /* * cmpxchg() provides the barrier so that readers of *connp can see * only initialized structure */ if (cmpxchg(connp, NULL, conn)) { /* Someone else created list structure for us */ fsnotify_untrack_connector(conn); kfree(conn); } return 0; } /* * Get mark connector, make sure it is alive and return with its lock held. * This is for users that get connector pointer from inode or mount. Users that * hold reference to a mark on the list may directly lock connector->lock as * they are sure list cannot go away under them. */ static struct fsnotify_mark_connector *fsnotify_grab_connector( fsnotify_connp_t *connp) { struct fsnotify_mark_connector *conn; int idx; idx = srcu_read_lock(&fsnotify_mark_srcu); conn = srcu_dereference(*connp, &fsnotify_mark_srcu); if (!conn) goto out; spin_lock(&conn->lock); if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED) { spin_unlock(&conn->lock); srcu_read_unlock(&fsnotify_mark_srcu, idx); return NULL; } out: srcu_read_unlock(&fsnotify_mark_srcu, idx); return conn; } /* * Add mark into proper place in given list of marks. These marks may be used * for the fsnotify backend to determine which event types should be delivered * to which group and for which inodes. These marks are ordered according to * priority, highest number first, and then by the group's location in memory. 
*/ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags) { struct super_block *sb = fsnotify_object_sb(obj, obj_type); struct fsnotify_mark *lmark, *last = NULL; struct fsnotify_mark_connector *conn; fsnotify_connp_t *connp; int cmp; int err = 0; if (WARN_ON(!fsnotify_valid_obj_type(obj_type))) return -EINVAL; /* * Attach the sb info before attaching a connector to any object on sb. * The sb info will remain attached as long as sb lives. */ if (sb && !fsnotify_sb_info(sb)) { err = fsnotify_attach_info_to_sb(sb); if (err) return err; } connp = fsnotify_object_connp(obj, obj_type); restart: spin_lock(&mark->lock); conn = fsnotify_grab_connector(connp); if (!conn) { spin_unlock(&mark->lock); err = fsnotify_attach_connector_to_object(connp, obj, obj_type); if (err) return err; goto restart; } /* is mark the first mark? */ if (hlist_empty(&conn->list)) { hlist_add_head_rcu(&mark->obj_list, &conn->list); goto added; } /* should mark be in the middle of the current list? */ hlist_for_each_entry(lmark, &conn->list, obj_list) { last = lmark; if ((lmark->group == mark->group) && (lmark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) && !(mark->group->flags & FSNOTIFY_GROUP_DUPS)) { err = -EEXIST; goto out_err; } cmp = fsnotify_compare_groups(lmark->group, mark->group); if (cmp >= 0) { hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list); goto added; } } BUG_ON(last == NULL); /* mark should be the last entry. last is the current last entry */ hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); added: if (sb) fsnotify_update_sb_watchers(sb, conn); /* * Since connector is attached to object using cmpxchg() we are * guaranteed that connector initialization is fully visible by anyone * seeing mark->connector set. */ WRITE_ONCE(mark->connector, conn); out_err: spin_unlock(&conn->lock); spin_unlock(&mark->lock); return err; } /* * Attach an initialized mark to a given group and fs object. * These marks may be used for the fsnotify backend to determine which * event types should be delivered to which group. */ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags) { struct fsnotify_group *group = mark->group; int ret = 0; fsnotify_group_assert_locked(group); /* * LOCKING ORDER!!!! * group->mark_mutex * mark->lock * mark->connector->lock */ spin_lock(&mark->lock); mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED; list_add(&mark->g_list, &group->marks_list); fsnotify_get_mark(mark); /* for g_list */ spin_unlock(&mark->lock); ret = fsnotify_add_mark_list(mark, obj, obj_type, add_flags); if (ret) goto err; fsnotify_recalc_mask(mark->connector); return ret; err: spin_lock(&mark->lock); mark->flags &= ~(FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED); list_del_init(&mark->g_list); spin_unlock(&mark->lock); fsnotify_put_mark(mark); return ret; } int fsnotify_add_mark(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags) { int ret; struct fsnotify_group *group = mark->group; fsnotify_group_lock(group); ret = fsnotify_add_mark_locked(mark, obj, obj_type, add_flags); fsnotify_group_unlock(group); return ret; } EXPORT_SYMBOL_GPL(fsnotify_add_mark); /* * Given a list of marks, find the mark associated with given group. If found * take a reference to that mark and return it, else return NULL. 
*/ struct fsnotify_mark *fsnotify_find_mark(void *obj, unsigned int obj_type, struct fsnotify_group *group) { fsnotify_connp_t *connp = fsnotify_object_connp(obj, obj_type); struct fsnotify_mark_connector *conn; struct fsnotify_mark *mark; if (!connp) return NULL; conn = fsnotify_grab_connector(connp); if (!conn) return NULL; hlist_for_each_entry(mark, &conn->list, obj_list) { if (mark->group == group && (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { fsnotify_get_mark(mark); spin_unlock(&conn->lock); return mark; } } spin_unlock(&conn->lock); return NULL; } EXPORT_SYMBOL_GPL(fsnotify_find_mark); /* Clear any marks in a group with given type mask */ void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int obj_type) { struct fsnotify_mark *lmark, *mark; LIST_HEAD(to_free); struct list_head *head = &to_free; /* Skip selection step if we want to clear all marks. */ if (obj_type == FSNOTIFY_OBJ_TYPE_ANY) { head = &group->marks_list; goto clear; } /* * We have to be really careful here. Anytime we drop mark_mutex, e.g. * fsnotify_clear_marks_by_inode() can come and free marks. Even in our * to_free list so we have to use mark_mutex even when accessing that * list. And freeing mark requires us to drop mark_mutex. So we can * reliably free only the first mark in the list. That's why we first * move marks to free to to_free list in one go and then free marks in * to_free list one by one. */ fsnotify_group_lock(group); list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { if (mark->connector->type == obj_type) list_move(&mark->g_list, &to_free); } fsnotify_group_unlock(group); clear: while (1) { fsnotify_group_lock(group); if (list_empty(head)) { fsnotify_group_unlock(group); break; } mark = list_first_entry(head, struct fsnotify_mark, g_list); fsnotify_get_mark(mark); fsnotify_detach_mark(mark); fsnotify_group_unlock(group); fsnotify_free_mark(mark); fsnotify_put_mark(mark); } } /* Destroy all marks attached to an object via connector */ void fsnotify_destroy_marks(fsnotify_connp_t *connp) { struct fsnotify_mark_connector *conn; struct fsnotify_mark *mark, *old_mark = NULL; void *objp; unsigned int type; conn = fsnotify_grab_connector(connp); if (!conn) return; /* * We have to be careful since we can race with e.g. * fsnotify_clear_marks_by_group() and once we drop the conn->lock, the * list can get modified. However we are holding mark reference and * thus our mark cannot be removed from obj_list so we can continue * iteration after regaining conn->lock. */ hlist_for_each_entry(mark, &conn->list, obj_list) { fsnotify_get_mark(mark); spin_unlock(&conn->lock); if (old_mark) fsnotify_put_mark(old_mark); old_mark = mark; fsnotify_destroy_mark(mark, mark->group); spin_lock(&conn->lock); } /* * Detach list from object now so that we don't pin inode until all * mark references get dropped. It would lead to strange results such * as delaying inode deletion or blocking unmount. */ objp = fsnotify_detach_connector_from_object(conn, &type); spin_unlock(&conn->lock); if (old_mark) fsnotify_put_mark(old_mark); fsnotify_drop_object(type, objp); } /* * Nothing fancy, just initialize lists and locks and counters. 
*/ void fsnotify_init_mark(struct fsnotify_mark *mark, struct fsnotify_group *group) { memset(mark, 0, sizeof(*mark)); spin_lock_init(&mark->lock); refcount_set(&mark->refcnt, 1); fsnotify_get_group(group); mark->group = group; WRITE_ONCE(mark->connector, NULL); } EXPORT_SYMBOL_GPL(fsnotify_init_mark); /* * Destroy all marks in destroy_list, waits for SRCU period to finish before * actually freeing marks. */ static void fsnotify_mark_destroy_workfn(struct work_struct *work) { struct fsnotify_mark *mark, *next; struct list_head private_destroy_list; spin_lock(&destroy_lock); /* exchange the list head */ list_replace_init(&destroy_list, &private_destroy_list); spin_unlock(&destroy_lock); synchronize_srcu(&fsnotify_mark_srcu); list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) { list_del_init(&mark->g_list); fsnotify_final_mark_destroy(mark); } } /* Wait for all marks queued for destruction to be actually destroyed */ void fsnotify_wait_marks_destroyed(void) { flush_delayed_work(&reaper_work); } EXPORT_SYMBOL_GPL(fsnotify_wait_marks_destroyed); __init void fsnotify_init_connector_caches(void) { fsnotify_mark_connector_cachep = KMEM_CACHE(fsnotify_mark_connector, SLAB_PANIC); fsnotify_inode_mark_connector_cachep = KMEM_CACHE( fsnotify_inode_mark_connector, SLAB_PANIC); } |
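The mark API above (fsnotify_init_mark(), fsnotify_add_mark(), fsnotify_find_mark(), fsnotify_destroy_mark(), fsnotify_put_mark()) is normally driven by a notification backend such as inotify or fanotify. The following is only an illustrative sketch, not code from any real backend: the function names are hypothetical, the group is assumed to have been allocated elsewhere with an ops->free_mark callback that frees the allocation, and FS_MODIFY stands in for whichever events the backend actually cares about.

/*
 * Illustrative only: attach one mark for @group to @inode, watching
 * modifications, then find the mark again and tear it down.  Assumes
 * @group already exists and its ops->free_mark callback frees the
 * kzalloc()ed mark; error handling is reduced to the bare minimum.
 */
static int example_watch_inode(struct fsnotify_group *group,
			       struct inode *inode)
{
	struct fsnotify_mark *mark;
	int ret;

	mark = kzalloc(sizeof(*mark), GFP_KERNEL);
	if (!mark)
		return -ENOMEM;

	fsnotify_init_mark(mark, group);	/* refcnt == 1, group pinned */
	mark->mask = FS_MODIFY;			/* events this mark wants */

	ret = fsnotify_add_mark(mark, inode, FSNOTIFY_OBJ_TYPE_INODE, 0);
	/* Drop the creation reference; group and object lists hold their own. */
	fsnotify_put_mark(mark);
	return ret;
}

static void example_unwatch_inode(struct fsnotify_group *group,
				  struct inode *inode)
{
	struct fsnotify_mark *mark;

	/* Returns with a reference taken on the mark, or NULL. */
	mark = fsnotify_find_mark(inode, FSNOTIFY_OBJ_TYPE_INODE, group);
	if (!mark)
		return;

	fsnotify_destroy_mark(mark, group);	/* detach from group and object */
	fsnotify_put_mark(mark);		/* drop find_mark() reference */
}

Note how the creation reference is dropped right after fsnotify_add_mark(): the group list and object list keep their own references, which is exactly the lifetime rule described in the header comment at the top of this file.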
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_NODEMASK_H
#define __LINUX_NODEMASK_H

/*
 * Nodemasks provide a bitmap suitable for representing the
 * set of Nodes in a system, one bit position per Node number.
 *
 * See detailed comments in the file linux/bitmap.h describing the
 * data type on which these nodemasks are based.
 *
 * For details of nodemask_parse_user(), see bitmap_parse_user() in
 * lib/bitmap.c. For details of nodelist_parse(), see bitmap_parselist(),
 * also in bitmap.c. For details of node_remap(), see bitmap_bitremap in
 * lib/bitmap.c. For details of nodes_remap(), see bitmap_remap in
 * lib/bitmap.c. For details of nodes_onto(), see bitmap_onto in
 * lib/bitmap.c. For details of nodes_fold(), see bitmap_fold in
 * lib/bitmap.c.
 *
 * The available nodemask operations are:
 *
 * void node_set(node, mask)		turn on bit 'node' in mask
 * void node_clear(node, mask)		turn off bit 'node' in mask
 * void nodes_setall(mask)		set all bits
 * void nodes_clear(mask)		clear all bits
 * int node_isset(node, mask)		true iff bit 'node' set in mask
 * int node_test_and_set(node, mask)	test and set bit 'node' in mask
 *
 * void nodes_and(dst, src1, src2)	dst = src1 & src2  [intersection]
 * void nodes_or(dst, src1, src2)	dst = src1 | src2  [union]
 * void nodes_xor(dst, src1, src2)	dst = src1 ^ src2
 * void nodes_andnot(dst, src1, src2)	dst = src1 & ~src2
 * void nodes_complement(dst, src)	dst = ~src
 *
 * int nodes_equal(mask1, mask2)	Does mask1 == mask2?
* int nodes_intersects(mask1, mask2) Do mask1 and mask2 intersect? * int nodes_subset(mask1, mask2) Is mask1 a subset of mask2? * int nodes_empty(mask) Is mask empty (no bits sets)? * int nodes_full(mask) Is mask full (all bits sets)? * int nodes_weight(mask) Hamming weight - number of set bits * * unsigned int first_node(mask) Number lowest set bit, or MAX_NUMNODES * unsigend int next_node(node, mask) Next node past 'node', or MAX_NUMNODES * unsigned int next_node_in(node, mask) Next node past 'node', or wrap to first, * or MAX_NUMNODES * unsigned int first_unset_node(mask) First node not set in mask, or * MAX_NUMNODES * * nodemask_t nodemask_of_node(node) Return nodemask with bit 'node' set * NODE_MASK_ALL Initializer - all bits set * NODE_MASK_NONE Initializer - no bits set * unsigned long *nodes_addr(mask) Array of unsigned long's in mask * * int nodemask_parse_user(ubuf, ulen, mask) Parse ascii string as nodemask * int nodelist_parse(buf, map) Parse ascii string as nodelist * int node_remap(oldbit, old, new) newbit = map(old, new)(oldbit) * void nodes_remap(dst, src, old, new) *dst = map(old, new)(src) * void nodes_onto(dst, orig, relmap) *dst = orig relative to relmap * void nodes_fold(dst, orig, sz) dst bits = orig bits mod sz * * for_each_node_mask(node, mask) for-loop node over mask * * int num_online_nodes() Number of online Nodes * int num_possible_nodes() Number of all possible Nodes * * int node_random(mask) Random node with set bit in mask * * int node_online(node) Is some node online? * int node_possible(node) Is some node possible? * * node_set_online(node) set bit 'node' in node_online_map * node_set_offline(node) clear bit 'node' in node_online_map * * for_each_node(node) for-loop node over node_possible_map * for_each_online_node(node) for-loop node over node_online_map * * Subtlety: * 1) The 'type-checked' form of node_isset() causes gcc (3.3.2, anyway) * to generate slightly worse code. So use a simple one-line #define * for node_isset(), instead of wrapping an inline inside a macro, the * way we do the other calls. * * NODEMASK_SCRATCH * When doing above logical AND, OR, XOR, Remap operations the callers tend to * need temporary nodemask_t's on the stack. But if NODES_SHIFT is large, * nodemask_t's consume too much stack space. NODEMASK_SCRATCH is a helper * for such situations. See below and CPUMASK_ALLOC also. */ #include <linux/threads.h> #include <linux/bitmap.h> #include <linux/minmax.h> #include <linux/nodemask_types.h> #include <linux/random.h> extern nodemask_t _unused_nodemask_arg_; /** * nodemask_pr_args - printf args to output a nodemask * @maskp: nodemask to be printed * * Can be used to provide arguments for '%*pb[l]' when printing a nodemask. */ #define nodemask_pr_args(maskp) __nodemask_pr_numnodes(maskp), \ __nodemask_pr_bits(maskp) static __always_inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m) { return m ? MAX_NUMNODES : 0; } static __always_inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m) { return m ? m->bits : NULL; } /* * The inline keyword gives the compiler room to decide to inline, or * not inline a function as it sees best. However, as these functions * are called in both __init and non-__init functions, if they are not * inlined we will end up with a section mismatch error (of the type of * freeable items not being freed). So we must use __always_inline here * to fix the problem. 
If other functions in the future also end up in * this situation they will also need to be annotated as __always_inline */ #define node_set(node, dst) __node_set((node), &(dst)) static __always_inline void __node_set(int node, volatile nodemask_t *dstp) { set_bit(node, dstp->bits); } #define node_clear(node, dst) __node_clear((node), &(dst)) static __always_inline void __node_clear(int node, volatile nodemask_t *dstp) { clear_bit(node, dstp->bits); } #define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES) static __always_inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits) { bitmap_fill(dstp->bits, nbits); } #define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES) static __always_inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) { bitmap_zero(dstp->bits, nbits); } /* No static inline type checking - see Subtlety (1) above. */ #define node_isset(node, nodemask) test_bit((node), (nodemask).bits) #define node_test_and_set(node, nodemask) \ __node_test_and_set((node), &(nodemask)) static __always_inline bool __node_test_and_set(int node, nodemask_t *addr) { return test_and_set_bit(node, addr->bits); } #define nodes_and(dst, src1, src2) \ __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES) static __always_inline bool __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_or(dst, src1, src2) \ __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES) static __always_inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_xor(dst, src1, src2) \ __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES) static __always_inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_andnot(dst, src1, src2) \ __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES) static __always_inline bool __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_copy(dst, src) __nodes_copy(&(dst), &(src), MAX_NUMNODES) static __always_inline void __nodes_copy(nodemask_t *dstp, const nodemask_t *srcp, unsigned int nbits) { bitmap_copy(dstp->bits, srcp->bits, nbits); } #define nodes_complement(dst, src) \ __nodes_complement(&(dst), &(src), MAX_NUMNODES) static __always_inline void __nodes_complement(nodemask_t *dstp, const nodemask_t *srcp, unsigned int nbits) { bitmap_complement(dstp->bits, srcp->bits, nbits); } #define nodes_equal(src1, src2) \ __nodes_equal(&(src1), &(src2), MAX_NUMNODES) static __always_inline bool __nodes_equal(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_equal(src1p->bits, src2p->bits, nbits); } #define nodes_intersects(src1, src2) \ __nodes_intersects(&(src1), &(src2), MAX_NUMNODES) static __always_inline bool __nodes_intersects(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_intersects(src1p->bits, src2p->bits, nbits); } #define nodes_subset(src1, src2) \ __nodes_subset(&(src1), &(src2), MAX_NUMNODES) static __always_inline bool __nodes_subset(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_subset(src1p->bits, src2p->bits, nbits); } #define 
nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES) static __always_inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits) { return bitmap_empty(srcp->bits, nbits); } #define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES) static __always_inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits) { return bitmap_full(srcp->bits, nbits); } #define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES) static __always_inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits) { return bitmap_weight(srcp->bits, nbits); } /* FIXME: better would be to fix all architectures to never return > MAX_NUMNODES, then the silly min()s could be dropped. */ #define first_node(src) __first_node(&(src)) static __always_inline unsigned int __first_node(const nodemask_t *srcp) { return min(MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES)); } #define next_node(n, src) __next_node((n), &(src)) static __always_inline unsigned int __next_node(int n, const nodemask_t *srcp) { return min(MAX_NUMNODES, find_next_bit(srcp->bits, MAX_NUMNODES, n+1)); } /* * Find the next present node in src, starting after node n, wrapping around to * the first node in src if needed. Returns MAX_NUMNODES if src is empty. */ #define next_node_in(n, src) __next_node_in((n), &(src)) static __always_inline unsigned int __next_node_in(int node, const nodemask_t *srcp) { unsigned int ret = __next_node(node, srcp); if (ret == MAX_NUMNODES) ret = __first_node(srcp); return ret; } static __always_inline void init_nodemask_of_node(nodemask_t *mask, int node) { nodes_clear(*mask); node_set(node, *mask); } #define nodemask_of_node(node) \ ({ \ typeof(_unused_nodemask_arg_) m; \ if (sizeof(m) == sizeof(unsigned long)) { \ m.bits[0] = 1UL << (node); \ } else { \ init_nodemask_of_node(&m, (node)); \ } \ m; \ }) #define first_unset_node(mask) __first_unset_node(&(mask)) static __always_inline unsigned int __first_unset_node(const nodemask_t *maskp) { return min(MAX_NUMNODES, find_first_zero_bit(maskp->bits, MAX_NUMNODES)); } #define NODE_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(MAX_NUMNODES) #if MAX_NUMNODES <= BITS_PER_LONG #define NODE_MASK_ALL \ ((nodemask_t) { { \ [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD \ } }) #else #define NODE_MASK_ALL \ ((nodemask_t) { { \ [0 ... BITS_TO_LONGS(MAX_NUMNODES)-2] = ~0UL, \ [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD \ } }) #endif #define NODE_MASK_NONE \ ((nodemask_t) { { \ [0 ... 
BITS_TO_LONGS(MAX_NUMNODES)-1] = 0UL \ } }) #define nodes_addr(src) ((src).bits) #define nodemask_parse_user(ubuf, ulen, dst) \ __nodemask_parse_user((ubuf), (ulen), &(dst), MAX_NUMNODES) static __always_inline int __nodemask_parse_user(const char __user *buf, int len, nodemask_t *dstp, int nbits) { return bitmap_parse_user(buf, len, dstp->bits, nbits); } #define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES) static __always_inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits) { return bitmap_parselist(buf, dstp->bits, nbits); } #define node_remap(oldbit, old, new) \ __node_remap((oldbit), &(old), &(new), MAX_NUMNODES) static __always_inline int __node_remap(int oldbit, const nodemask_t *oldp, const nodemask_t *newp, int nbits) { return bitmap_bitremap(oldbit, oldp->bits, newp->bits, nbits); } #define nodes_remap(dst, src, old, new) \ __nodes_remap(&(dst), &(src), &(old), &(new), MAX_NUMNODES) static __always_inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, const nodemask_t *oldp, const nodemask_t *newp, int nbits) { bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits); } #define nodes_onto(dst, orig, relmap) \ __nodes_onto(&(dst), &(orig), &(relmap), MAX_NUMNODES) static __always_inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp, const nodemask_t *relmapp, int nbits) { bitmap_onto(dstp->bits, origp->bits, relmapp->bits, nbits); } #define nodes_fold(dst, orig, sz) \ __nodes_fold(&(dst), &(orig), sz, MAX_NUMNODES) static __always_inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp, int sz, int nbits) { bitmap_fold(dstp->bits, origp->bits, sz, nbits); } #if MAX_NUMNODES > 1 #define for_each_node_mask(node, mask) \ for ((node) = first_node(mask); \ (node) < MAX_NUMNODES; \ (node) = next_node((node), (mask))) #else /* MAX_NUMNODES == 1 */ #define for_each_node_mask(node, mask) \ for ((node) = 0; (node) < 1 && !nodes_empty(mask); (node)++) #endif /* MAX_NUMNODES */ /* * Bitmasks that are kept for all the nodes. */ enum node_states { N_POSSIBLE, /* The node could become online at some point */ N_ONLINE, /* The node is online */ N_NORMAL_MEMORY, /* The node has regular memory */ #ifdef CONFIG_HIGHMEM N_HIGH_MEMORY, /* The node has regular or high memory */ #else N_HIGH_MEMORY = N_NORMAL_MEMORY, #endif N_MEMORY, /* The node has memory(regular, high, movable) */ N_CPU, /* The node has one or more cpus */ N_GENERIC_INITIATOR, /* The node has one or more Generic Initiators */ NR_NODE_STATES }; /* * The following particular system nodemasks and operations * on them manage all possible and online nodes. 
*/ extern nodemask_t node_states[NR_NODE_STATES]; #if MAX_NUMNODES > 1 static __always_inline int node_state(int node, enum node_states state) { return node_isset(node, node_states[state]); } static __always_inline void node_set_state(int node, enum node_states state) { __node_set(node, &node_states[state]); } static __always_inline void node_clear_state(int node, enum node_states state) { __node_clear(node, &node_states[state]); } static __always_inline int num_node_state(enum node_states state) { return nodes_weight(node_states[state]); } #define for_each_node_state(__node, __state) \ for_each_node_mask((__node), node_states[__state]) #define first_online_node first_node(node_states[N_ONLINE]) #define first_memory_node first_node(node_states[N_MEMORY]) static __always_inline unsigned int next_online_node(int nid) { return next_node(nid, node_states[N_ONLINE]); } static __always_inline unsigned int next_memory_node(int nid) { return next_node(nid, node_states[N_MEMORY]); } extern unsigned int nr_node_ids; extern unsigned int nr_online_nodes; static __always_inline void node_set_online(int nid) { node_set_state(nid, N_ONLINE); nr_online_nodes = num_node_state(N_ONLINE); } static __always_inline void node_set_offline(int nid) { node_clear_state(nid, N_ONLINE); nr_online_nodes = num_node_state(N_ONLINE); } #else static __always_inline int node_state(int node, enum node_states state) { return node == 0; } static __always_inline void node_set_state(int node, enum node_states state) { } static __always_inline void node_clear_state(int node, enum node_states state) { } static __always_inline int num_node_state(enum node_states state) { return 1; } #define for_each_node_state(node, __state) \ for ( (node) = 0; (node) == 0; (node) = 1) #define first_online_node 0 #define first_memory_node 0 #define next_online_node(nid) (MAX_NUMNODES) #define next_memory_node(nid) (MAX_NUMNODES) #define nr_node_ids 1U #define nr_online_nodes 1U #define node_set_online(node) node_set_state((node), N_ONLINE) #define node_set_offline(node) node_clear_state((node), N_ONLINE) #endif static __always_inline int node_random(const nodemask_t *maskp) { #if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1) int node = find_random_bit(maskp->bits, MAX_NUMNODES); return node < MAX_NUMNODES ? node : NUMA_NO_NODE; #else return 0; #endif } #define node_online_map node_states[N_ONLINE] #define node_possible_map node_states[N_POSSIBLE] #define num_online_nodes() num_node_state(N_ONLINE) #define num_possible_nodes() num_node_state(N_POSSIBLE) #define node_online(node) node_state((node), N_ONLINE) #define node_possible(node) node_state((node), N_POSSIBLE) #define for_each_node(node) for_each_node_state(node, N_POSSIBLE) #define for_each_online_node(node) for_each_node_state(node, N_ONLINE) #define for_each_node_with_cpus(node) for_each_node_state(node, N_CPU) /* * For nodemask scratch area. * NODEMASK_ALLOC(type, name) allocates an object with a specified type and * name. */ #if NODES_SHIFT > 8 /* nodemask_t > 32 bytes */ #define NODEMASK_ALLOC(type, name, gfp_flags) \ type *name = kmalloc(sizeof(*name), gfp_flags) #define NODEMASK_FREE(m) kfree(m) #else #define NODEMASK_ALLOC(type, name, gfp_flags) type _##name, *name = &_##name #define NODEMASK_FREE(m) do {} while (0) #endif /* Example structure for using NODEMASK_ALLOC, used in mempolicy. 
*/ struct nodemask_scratch { nodemask_t mask1; nodemask_t mask2; }; #define NODEMASK_SCRATCH(x) \ NODEMASK_ALLOC(struct nodemask_scratch, x, \ GFP_KERNEL | __GFP_NORETRY) #define NODEMASK_SCRATCH_FREE(x) NODEMASK_FREE(x) #endif /* __LINUX_NODEMASK_H */ |
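As a quick illustration of how the helpers above compose (the function name is hypothetical and pr_info() is used purely for demonstration), here is a sketch that intersects a caller-supplied mask with the online map and walks the result:

/*
 * Illustrative only: compute the set of online nodes that are also in a
 * caller-supplied allowed mask, print it, and walk it node by node.
 */
static void example_walk_allowed_nodes(const nodemask_t *allowed)
{
	nodemask_t candidates = NODE_MASK_NONE;
	int node;

	/* candidates = *allowed & node_online_map */
	nodes_and(candidates, *allowed, node_online_map);

	/* nodemask_pr_args() provides the arguments for the '%*pbl' format */
	pr_info("candidate nodes: %*pbl\n", nodemask_pr_args(&candidates));

	for_each_node_mask(node, candidates)
		pr_info("node %d is online and allowed\n", node);
}

The operations take nodemask lvalues rather than pointers because each macro takes the address of its arguments before calling the underlying bitmap helper, so callers get value-like syntax without copying the mask.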
// SPDX-License-Identifier: GPL-2.0-only
/*
 * net/ipv6/fib6_rules.c	IPv6 Routing Policy Rules
 *
 * Copyright (C)2003-2006 Helsinki University of Technology
 * Copyright (C)2003-2006 USAGI/WIDE Project
 *
 * Authors
 *	Thomas Graf		<tgraf@suug.ch>
 *	Ville Nuorvala		<vnuorval@tcs.hut.fi>
 */

#include <linux/netdevice.h>
#include <linux/notifier.h>
#include <linux/export.h>
#include <linux/indirect_call_wrapper.h>

#include <net/fib_rules.h>
#include <net/inet_dscp.h>
#include <net/ipv6.h>
#include <net/addrconf.h>
#include <net/ip6_route.h>
#include <net/netlink.h>

struct fib6_rule {
	struct fib_rule		common;
	struct rt6key		src;
	struct rt6key		dst;
	__be32			flowlabel;
	__be32			flowlabel_mask;
	dscp_t			dscp;
	dscp_t			dscp_mask;
	u8			dscp_full:1;	/* DSCP or TOS selector */
};

static bool fib6_rule_matchall(const struct fib_rule *rule)
{
	struct fib6_rule *r = container_of(rule, struct fib6_rule, common);

	if (r->dst.plen || r->src.plen || r->dscp || r->flowlabel_mask)
		return false;
	return
fib_rule_matchall(rule); } bool fib6_rule_default(const struct fib_rule *rule) { if (!fib6_rule_matchall(rule) || rule->action != FR_ACT_TO_TBL || rule->l3mdev) return false; if (rule->table != RT6_TABLE_LOCAL && rule->table != RT6_TABLE_MAIN) return false; return true; } EXPORT_SYMBOL_GPL(fib6_rule_default); int fib6_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return fib_rules_dump(net, nb, AF_INET6, extack); } unsigned int fib6_rules_seq_read(const struct net *net) { return fib_rules_seq_read(net, AF_INET6); } /* called with rcu lock held; no reference taken on fib6_info */ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, struct fib6_result *res, int flags) { int err; if (net->ipv6.fib6_has_custom_rules) { struct fib_lookup_arg arg = { .lookup_ptr = fib6_table_lookup, .lookup_data = &oif, .result = res, .flags = FIB_LOOKUP_NOREF, }; l3mdev_update_flow(net, flowi6_to_flowi(fl6)); err = fib_rules_lookup(net->ipv6.fib6_rules_ops, flowi6_to_flowi(fl6), flags, &arg); } else { err = fib6_table_lookup(net, net->ipv6.fib6_local_tbl, oif, fl6, res, flags); if (err || res->f6i == net->ipv6.fib6_null_entry) err = fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, res, flags); } return err; } #if IS_MODULE(CONFIG_NFT_FIB_IPV6) EXPORT_SYMBOL_GPL(fib6_lookup); #endif struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup) { if (net->ipv6.fib6_has_custom_rules) { struct fib6_result res = {}; struct fib_lookup_arg arg = { .lookup_ptr = lookup, .lookup_data = skb, .result = &res, .flags = FIB_LOOKUP_NOREF, }; /* update flow if oif or iif point to device enslaved to l3mdev */ l3mdev_update_flow(net, flowi6_to_flowi(fl6)); fib_rules_lookup(net->ipv6.fib6_rules_ops, flowi6_to_flowi(fl6), flags, &arg); if (res.rt6) return &res.rt6->dst; } else { struct rt6_info *rt; rt = pol_lookup_func(lookup, net, net->ipv6.fib6_local_tbl, fl6, skb, flags); if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN) return &rt->dst; ip6_rt_put_flags(rt, flags); rt = pol_lookup_func(lookup, net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error != -EAGAIN) return &rt->dst; ip6_rt_put_flags(rt, flags); } if (!(flags & RT6_LOOKUP_F_DST_NOREF)) dst_hold(&net->ipv6.ip6_null_entry->dst); return &net->ipv6.ip6_null_entry->dst; } static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags, struct flowi6 *flp6, const struct net_device *dev) { struct fib6_rule *r = (struct fib6_rule *)rule; /* If we need to find a source address for this traffic, * we check the result if it meets requirement of the rule. 
 */
	if ((rule->flags & FIB_RULE_FIND_SADDR) &&
	    r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
		struct in6_addr saddr;

		if (ipv6_dev_get_saddr(net, dev, &flp6->daddr,
				       rt6_flags2srcprefs(flags), &saddr))
			return -EAGAIN;

		if (!ipv6_prefix_equal(&saddr, &r->src.addr, r->src.plen))
			return -EAGAIN;

		flp6->saddr = saddr;
	}

	return 0;
}

static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp,
				int flags, struct fib_lookup_arg *arg)
{
	struct fib6_result *res = arg->result;
	struct flowi6 *flp6 = &flp->u.ip6;
	struct net *net = rule->fr_net;
	struct fib6_table *table;
	int err, *oif;
	u32 tb_id;

	switch (rule->action) {
	case FR_ACT_TO_TBL:
		break;
	case FR_ACT_UNREACHABLE:
		return -ENETUNREACH;
	case FR_ACT_PROHIBIT:
		return -EACCES;
	case FR_ACT_BLACKHOLE:
	default:
		return -EINVAL;
	}

	tb_id = fib_rule_get_table(rule, arg);
	table = fib6_get_table(net, tb_id);
	if (!table)
		return -EAGAIN;

	oif = (int *)arg->lookup_data;
	err = fib6_table_lookup(net, table, *oif, flp6, res, flags);
	if (!err && res->f6i != net->ipv6.fib6_null_entry)
		err = fib6_rule_saddr(net, rule, flags, flp6,
				      res->nh->fib_nh_dev);
	else
		err = -EAGAIN;

	return err;
}

static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
			      int flags, struct fib_lookup_arg *arg)
{
	struct fib6_result *res = arg->result;
	struct flowi6 *flp6 = &flp->u.ip6;
	struct rt6_info *rt = NULL;
	struct fib6_table *table;
	struct net *net = rule->fr_net;
	pol_lookup_t lookup = arg->lookup_ptr;
	int err = 0;
	u32 tb_id;

	switch (rule->action) {
	case FR_ACT_TO_TBL:
		break;
	case FR_ACT_UNREACHABLE:
		err = -ENETUNREACH;
		rt = net->ipv6.ip6_null_entry;
		goto discard_pkt;
	default:
	case FR_ACT_BLACKHOLE:
		err = -EINVAL;
		rt = net->ipv6.ip6_blk_hole_entry;
		goto discard_pkt;
	case FR_ACT_PROHIBIT:
		err = -EACCES;
		rt = net->ipv6.ip6_prohibit_entry;
		goto discard_pkt;
	}

	tb_id = fib_rule_get_table(rule, arg);
	table = fib6_get_table(net, tb_id);
	if (!table) {
		err = -EAGAIN;
		goto out;
	}

	rt = pol_lookup_func(lookup, net, table, flp6, arg->lookup_data, flags);
	if (rt != net->ipv6.ip6_null_entry) {
		struct inet6_dev *idev = ip6_dst_idev(&rt->dst);

		if (!idev)
			goto again;
		err = fib6_rule_saddr(net, rule, flags, flp6, idev->dev);
		if (err == -EAGAIN)
			goto again;

		err = rt->dst.error;
		if (err != -EAGAIN)
			goto out;
	}
again:
	ip6_rt_put_flags(rt, flags);
	err = -EAGAIN;
	rt = NULL;
	goto out;

discard_pkt:
	if (!(flags & RT6_LOOKUP_F_DST_NOREF))
		dst_hold(&rt->dst);
out:
	res->rt6 = rt;
	return err;
}

INDIRECT_CALLABLE_SCOPE int fib6_rule_action(struct fib_rule *rule,
					     struct flowi *flp, int flags,
					     struct fib_lookup_arg *arg)
{
	if (arg->lookup_ptr == fib6_table_lookup)
		return fib6_rule_action_alt(rule, flp, flags, arg);

	return __fib6_rule_action(rule, flp, flags, arg);
}

INDIRECT_CALLABLE_SCOPE bool fib6_rule_suppress(struct fib_rule *rule,
						int flags,
						struct fib_lookup_arg *arg)
{
	struct fib6_result *res = arg->result;
	struct rt6_info *rt = res->rt6;
	struct net_device *dev = NULL;

	if (!rt)
		return false;

	if (rt->rt6i_idev)
		dev = rt->rt6i_idev->dev;

	/* do not accept result if the route does
	 * not meet the required prefix length
	 */
	if (rt->rt6i_dst.plen <= rule->suppress_prefixlen)
		goto suppress_route;

	/* do not accept result if the route uses a device
	 * belonging to a forbidden interface group
	 */
	if (rule->suppress_ifgroup != -1 && dev &&
	    dev->group == rule->suppress_ifgroup)
		goto suppress_route;

	return false;

suppress_route:
	ip6_rt_put_flags(rt, flags);
	return true;
}

INDIRECT_CALLABLE_SCOPE int fib6_rule_match(struct fib_rule *rule,
					    struct flowi *fl, int flags)
{
	struct fib6_rule *r = (struct fib6_rule *)rule;
	struct flowi6 *fl6 = &fl->u.ip6;

	if (r->dst.plen &&
	    !ipv6_prefix_equal(&fl6->daddr, &r->dst.addr, r->dst.plen))
		return 0;

	/*
	 * If FIB_RULE_FIND_SADDR is set and we do not have a
	 * source address for the traffic, we defer check for
	 * source address.
	 */
	if (r->src.plen) {
		if (flags & RT6_LOOKUP_F_HAS_SADDR) {
			if (!ipv6_prefix_equal(&fl6->saddr, &r->src.addr,
					       r->src.plen))
				return 0;
		} else if (!(r->common.flags & FIB_RULE_FIND_SADDR))
			return 0;
	}

	if ((r->dscp ^ ip6_dscp(fl6->flowlabel)) & r->dscp_mask)
		return 0;

	if ((r->flowlabel ^ flowi6_get_flowlabel(fl6)) & r->flowlabel_mask)
		return 0;

	if (rule->ip_proto && (rule->ip_proto != fl6->flowi6_proto))
		return 0;

	if (!fib_rule_port_match(&rule->sport_range, rule->sport_mask,
				 fl6->fl6_sport))
		return 0;

	if (!fib_rule_port_match(&rule->dport_range, rule->dport_mask,
				 fl6->fl6_dport))
		return 0;

	return 1;
}

static int fib6_nl2rule_dscp(const struct nlattr *nla, struct fib6_rule *rule6,
			     struct netlink_ext_ack *extack)
{
	if (rule6->dscp) {
		NL_SET_ERR_MSG(extack, "Cannot specify both TOS and DSCP");
		return -EINVAL;
	}

	rule6->dscp = inet_dsfield_to_dscp(nla_get_u8(nla) << 2);
	rule6->dscp_mask = inet_dsfield_to_dscp(INET_DSCP_MASK);
	rule6->dscp_full = true;

	return 0;
}

static int fib6_nl2rule_dscp_mask(const struct nlattr *nla,
				  struct fib6_rule *rule6,
				  struct netlink_ext_ack *extack)
{
	dscp_t dscp_mask;

	if (!rule6->dscp_full) {
		NL_SET_ERR_MSG_ATTR(extack, nla,
				    "Cannot specify DSCP mask without DSCP value");
		return -EINVAL;
	}

	dscp_mask = inet_dsfield_to_dscp(nla_get_u8(nla) << 2);
	if (rule6->dscp & ~dscp_mask) {
		NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid DSCP mask");
		return -EINVAL;
	}

	rule6->dscp_mask = dscp_mask;

	return 0;
}

static int fib6_nl2rule_flowlabel(struct nlattr **tb, struct fib6_rule *rule6,
				  struct netlink_ext_ack *extack)
{
	__be32 flowlabel, flowlabel_mask;

	if (NL_REQ_ATTR_CHECK(extack, NULL, tb, FRA_FLOWLABEL) ||
	    NL_REQ_ATTR_CHECK(extack, NULL, tb, FRA_FLOWLABEL_MASK))
		return -EINVAL;

	flowlabel = nla_get_be32(tb[FRA_FLOWLABEL]);
	flowlabel_mask = nla_get_be32(tb[FRA_FLOWLABEL_MASK]);

	if (flowlabel_mask & ~IPV6_FLOWLABEL_MASK) {
		NL_SET_ERR_MSG_ATTR(extack, tb[FRA_FLOWLABEL_MASK],
				    "Invalid flow label mask");
		return -EINVAL;
	}

	if (flowlabel & ~flowlabel_mask) {
		NL_SET_ERR_MSG(extack, "Flow label and mask do not match");
		return -EINVAL;
	}

	rule6->flowlabel = flowlabel;
	rule6->flowlabel_mask = flowlabel_mask;

	return 0;
}

static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
			       struct fib_rule_hdr *frh,
			       struct nlattr **tb,
			       struct netlink_ext_ack *extack)
{
	struct fib6_rule *rule6 = (struct fib6_rule *)rule;
	struct net *net = rule->fr_net;
	int err = -EINVAL;

	if (!inet_validate_dscp(frh->tos)) {
		NL_SET_ERR_MSG(extack,
			       "Invalid dsfield (tos): ECN bits must be 0");
		goto errout;
	}
	rule6->dscp = inet_dsfield_to_dscp(frh->tos);
	rule6->dscp_mask = frh->tos ? inet_dsfield_to_dscp(INET_DSCP_MASK) : 0;

	if (tb[FRA_DSCP] && fib6_nl2rule_dscp(tb[FRA_DSCP], rule6, extack) < 0)
		goto errout;

	if (tb[FRA_DSCP_MASK] &&
	    fib6_nl2rule_dscp_mask(tb[FRA_DSCP_MASK], rule6, extack) < 0)
		goto errout;

	if ((tb[FRA_FLOWLABEL] || tb[FRA_FLOWLABEL_MASK]) &&
	    fib6_nl2rule_flowlabel(tb, rule6, extack) < 0)
		goto errout;

	if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) {
		if (rule->table == RT6_TABLE_UNSPEC) {
			NL_SET_ERR_MSG(extack, "Invalid table");
			goto errout;
		}

		if (fib6_new_table(net, rule->table) == NULL) {
			err = -ENOBUFS;
			goto errout;
		}
	}

	if (frh->src_len)
		rule6->src.addr = nla_get_in6_addr(tb[FRA_SRC]);

	if (frh->dst_len)
		rule6->dst.addr = nla_get_in6_addr(tb[FRA_DST]);

	rule6->src.plen = frh->src_len;
	rule6->dst.plen = frh->dst_len;

	if (fib_rule_requires_fldissect(rule))
		net->ipv6.fib6_rules_require_fldissect++;

	net->ipv6.fib6_has_custom_rules = true;
	err = 0;
errout:
	return err;
}

static int fib6_rule_delete(struct fib_rule *rule)
{
	struct net *net = rule->fr_net;

	if (net->ipv6.fib6_rules_require_fldissect &&
	    fib_rule_requires_fldissect(rule))
		net->ipv6.fib6_rules_require_fldissect--;

	return 0;
}

static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
			     struct nlattr **tb)
{
	struct fib6_rule *rule6 = (struct fib6_rule *)rule;

	if (frh->src_len && (rule6->src.plen != frh->src_len))
		return 0;

	if (frh->dst_len && (rule6->dst.plen != frh->dst_len))
		return 0;

	if (frh->tos &&
	    (rule6->dscp_full ||
	     inet_dscp_to_dsfield(rule6->dscp) != frh->tos))
		return 0;

	if (tb[FRA_DSCP]) {
		dscp_t dscp;

		dscp = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP]) << 2);
		if (!rule6->dscp_full || rule6->dscp != dscp)
			return 0;
	}

	if (tb[FRA_DSCP_MASK]) {
		dscp_t dscp_mask;

		dscp_mask = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP_MASK]) << 2);
		if (!rule6->dscp_full || rule6->dscp_mask != dscp_mask)
			return 0;
	}

	if (tb[FRA_FLOWLABEL] &&
	    nla_get_be32(tb[FRA_FLOWLABEL]) != rule6->flowlabel)
		return 0;

	if (tb[FRA_FLOWLABEL_MASK] &&
	    nla_get_be32(tb[FRA_FLOWLABEL_MASK]) != rule6->flowlabel_mask)
		return 0;

	if (frh->src_len &&
	    nla_memcmp(tb[FRA_SRC], &rule6->src.addr, sizeof(struct in6_addr)))
		return 0;

	if (frh->dst_len &&
	    nla_memcmp(tb[FRA_DST], &rule6->dst.addr, sizeof(struct in6_addr)))
		return 0;

	return 1;
}

static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
			  struct fib_rule_hdr *frh)
{
	struct fib6_rule *rule6 = (struct fib6_rule *)rule;

	frh->dst_len = rule6->dst.plen;
	frh->src_len = rule6->src.plen;

	if (rule6->dscp_full) {
		frh->tos = 0;
		if (nla_put_u8(skb, FRA_DSCP,
			       inet_dscp_to_dsfield(rule6->dscp) >> 2) ||
		    nla_put_u8(skb, FRA_DSCP_MASK,
			       inet_dscp_to_dsfield(rule6->dscp_mask) >> 2))
			goto nla_put_failure;
	} else {
		frh->tos = inet_dscp_to_dsfield(rule6->dscp);
	}

	if (rule6->flowlabel_mask &&
	    (nla_put_be32(skb, FRA_FLOWLABEL, rule6->flowlabel) ||
	     nla_put_be32(skb, FRA_FLOWLABEL_MASK, rule6->flowlabel_mask)))
		goto nla_put_failure;

	if ((rule6->dst.plen &&
	     nla_put_in6_addr(skb, FRA_DST, &rule6->dst.addr)) ||
	    (rule6->src.plen &&
	     nla_put_in6_addr(skb, FRA_SRC, &rule6->src.addr)))
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -ENOBUFS;
}

static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule)
{
	return nla_total_size(16)	/* dst */
	       + nla_total_size(16)	/* src */
	       + nla_total_size(1)	/* dscp */
	       + nla_total_size(1)	/* dscp mask */
	       + nla_total_size(4)	/* flowlabel */
	       + nla_total_size(4);	/* flowlabel mask */
}

static void fib6_rule_flush_cache(struct fib_rules_ops *ops)
{
	rt_genid_bump_ipv6(ops->fro_net);
}

static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = {
	.family			= AF_INET6,
	.rule_size		= sizeof(struct fib6_rule),
	.addr_size		= sizeof(struct in6_addr),
	.action			= fib6_rule_action,
	.match			= fib6_rule_match,
	.suppress		= fib6_rule_suppress,
	.configure		= fib6_rule_configure,
	.delete			= fib6_rule_delete,
	.compare		= fib6_rule_compare,
	.fill			= fib6_rule_fill,
	.nlmsg_payload		= fib6_rule_nlmsg_payload,
	.flush_cache		= fib6_rule_flush_cache,
	.nlgroup		= RTNLGRP_IPV6_RULE,
	.owner			= THIS_MODULE,
	.fro_net		= &init_net,
};

static int __net_init fib6_rules_net_init(struct net *net)
{
	struct fib_rules_ops *ops;
	int err;

	ops = fib_rules_register(&fib6_rules_ops_template, net);
	if (IS_ERR(ops))
		return PTR_ERR(ops);

	err = fib_default_rule_add(ops, 0, RT6_TABLE_LOCAL);
	if (err)
		goto out_fib6_rules_ops;

	err = fib_default_rule_add(ops, 0x7FFE, RT6_TABLE_MAIN);
	if (err)
		goto out_fib6_rules_ops;

	net->ipv6.fib6_rules_ops = ops;
	net->ipv6.fib6_rules_require_fldissect = 0;
out:
	return err;

out_fib6_rules_ops:
	fib_rules_unregister(ops);
	goto out;
}

static void __net_exit fib6_rules_net_exit_batch(struct list_head *net_list)
{
	struct net *net;

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		fib_rules_unregister(net->ipv6.fib6_rules_ops);
		cond_resched();
	}
	rtnl_unlock();
}

static struct pernet_operations fib6_rules_net_ops = {
	.init = fib6_rules_net_init,
	.exit_batch = fib6_rules_net_exit_batch,
};

int __init fib6_rules_init(void)
{
	return register_pernet_subsys(&fib6_rules_net_ops);
}

void fib6_rules_cleanup(void)
{
	unregister_pernet_subsys(&fib6_rules_net_ops);
}
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/dcache.c
 *
 * Complete reimplementation
 * (C) 1997 Thomas Schoebel-Theuer,
 * with heavy changes by Linus Torvalds
 */

/*
 * Notes on the allocation strategy:
 *
 * The dcache is a master of the icache - whenever a dcache entry
 * exists, the inode will always exist. "iput()" is done either when
 * the dcache entry is deleted or garbage collected.
 */

#include <linux/ratelimit.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/fscrypt.h>
#include <linux/fsnotify.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/cache.h>
#include <linux/export.h>
#include <linux/security.h>
#include <linux/seqlock.h>
#include <linux/memblock.h>
#include <linux/bit_spinlock.h>
#include <linux/rculist_bl.h>
#include <linux/list_lru.h>
#include "internal.h"
#include "mount.h"

#include <asm/runtime-const.h>

/*
 * Usage:
 * dcache->d_inode->i_lock protects:
 *   - i_dentry, d_u.d_alias, d_inode of aliases
 * dcache_hash_bucket lock protects:
 *   - the dcache hash table
 * s_roots bl list spinlock protects:
 *   - the s_roots list (see __d_drop)
 * dentry->d_sb->s_dentry_lru_lock protects:
 *   - the dcache lru lists and counters
 * d_lock protects:
 *   - d_flags
 *   - d_name
 *   - d_lru
 *   - d_count
 *   - d_unhashed()
 *   - d_parent and d_children
 *   - children's d_sib and d_parent
 *   - d_u.d_alias, d_inode
 *
 * Ordering:
 * dentry->d_inode->i_lock
 *   dentry->d_lock
 *     dentry->d_sb->s_dentry_lru_lock
 * dcache_hash_bucket lock
 * s_roots lock
 *
 * If there is an ancestor relationship:
 * dentry->d_parent->...->d_parent->d_lock
 *   ...
* dentry->d_parent->d_lock * dentry->d_lock * * If no ancestor relationship: * arbitrary, since it's serialized on rename_lock */ static int sysctl_vfs_cache_pressure __read_mostly = 100; static int sysctl_vfs_cache_pressure_denom __read_mostly = 100; unsigned long vfs_pressure_ratio(unsigned long val) { return mult_frac(val, sysctl_vfs_cache_pressure, sysctl_vfs_cache_pressure_denom); } EXPORT_SYMBOL_GPL(vfs_pressure_ratio); __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); EXPORT_SYMBOL(rename_lock); static struct kmem_cache *__dentry_cache __ro_after_init; #define dentry_cache runtime_const_ptr(__dentry_cache) const struct qstr empty_name = QSTR_INIT("", 0); EXPORT_SYMBOL(empty_name); const struct qstr slash_name = QSTR_INIT("/", 1); EXPORT_SYMBOL(slash_name); const struct qstr dotdot_name = QSTR_INIT("..", 2); EXPORT_SYMBOL(dotdot_name); /* * This is the single most critical data structure when it comes * to the dcache: the hashtable for lookups. Somebody should try * to make this good - I've just made it work. * * This hash-function tries to avoid losing too many bits of hash * information, yet avoid using a prime hash-size or similar. * * Marking the variables "used" ensures that the compiler doesn't * optimize them away completely on architectures with runtime * constant infrastructure, this allows debuggers to see their * values. But updating these values has no effect on those arches. */ static unsigned int d_hash_shift __ro_after_init __used; static struct hlist_bl_head *dentry_hashtable __ro_after_init __used; static inline struct hlist_bl_head *d_hash(unsigned long hashlen) { return runtime_const_ptr(dentry_hashtable) + runtime_const_shift_right_32(hashlen, d_hash_shift); } #define IN_LOOKUP_SHIFT 10 static struct hlist_bl_head in_lookup_hashtable[1 << IN_LOOKUP_SHIFT]; static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent, unsigned int hash) { hash += (unsigned long) parent / L1_CACHE_BYTES; return in_lookup_hashtable + hash_32(hash, IN_LOOKUP_SHIFT); } struct dentry_stat_t { long nr_dentry; long nr_unused; long age_limit; /* age in seconds */ long want_pages; /* pages requested by system */ long nr_negative; /* # of unused negative dentries */ long dummy; /* Reserved for future use */ }; static DEFINE_PER_CPU(long, nr_dentry); static DEFINE_PER_CPU(long, nr_dentry_unused); static DEFINE_PER_CPU(long, nr_dentry_negative); static int dentry_negative_policy; #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) /* Statistics gathering. */ static struct dentry_stat_t dentry_stat = { .age_limit = 45, }; /* * Here we resort to our own counters instead of using generic per-cpu counters * for consistency with what the vfs inode code does. We are expected to harvest * better code and performance by having our own specialized counters. * * Please note that the loop is done over all possible CPUs, not over all online * CPUs. The reason for this is that we don't want to play games with CPUs going * on and off. If one of them goes off, we will just keep their counters. * * glommer: See cffbc8a for details, and if you ever intend to change this, * please update all vfs counters to match. */ static long get_nr_dentry(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry, i); return sum < 0 ? 0 : sum; } static long get_nr_dentry_unused(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry_unused, i); return sum < 0 ? 
0 : sum; } static long get_nr_dentry_negative(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry_negative, i); return sum < 0 ? 0 : sum; } static int proc_nr_dentry(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); dentry_stat.nr_unused = get_nr_dentry_unused(); dentry_stat.nr_negative = get_nr_dentry_negative(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } static const struct ctl_table fs_dcache_sysctls[] = { { .procname = "dentry-state", .data = &dentry_stat, .maxlen = 6*sizeof(long), .mode = 0444, .proc_handler = proc_nr_dentry, }, { .procname = "dentry-negative", .data = &dentry_negative_policy, .maxlen = sizeof(dentry_negative_policy), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; static const struct ctl_table vm_dcache_sysctls[] = { { .procname = "vfs_cache_pressure", .data = &sysctl_vfs_cache_pressure, .maxlen = sizeof(sysctl_vfs_cache_pressure), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { .procname = "vfs_cache_pressure_denom", .data = &sysctl_vfs_cache_pressure_denom, .maxlen = sizeof(sysctl_vfs_cache_pressure_denom), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE_HUNDRED, }, }; static int __init init_fs_dcache_sysctls(void) { register_sysctl_init("vm", vm_dcache_sysctls); register_sysctl_init("fs", fs_dcache_sysctls); return 0; } fs_initcall(init_fs_dcache_sysctls); #endif /* * Compare 2 name strings, return 0 if they match, otherwise non-zero. * The strings are both count bytes long, and count is non-zero. */ #ifdef CONFIG_DCACHE_WORD_ACCESS #include <asm/word-at-a-time.h> /* * NOTE! 'cs' and 'scount' come from a dentry, so it has a * aligned allocation for this particular component. We don't * strictly need the load_unaligned_zeropad() safety, but it * doesn't hurt either. * * In contrast, 'ct' and 'tcount' can be from a pathname, and do * need the careful unaligned handling. */ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount) { unsigned long a,b,mask; for (;;) { a = read_word_at_a_time(cs); b = load_unaligned_zeropad(ct); if (tcount < sizeof(unsigned long)) break; if (unlikely(a != b)) return 1; cs += sizeof(unsigned long); ct += sizeof(unsigned long); tcount -= sizeof(unsigned long); if (!tcount) return 0; } mask = bytemask_from_count(tcount); return unlikely(!!((a ^ b) & mask)); } #else static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount) { do { if (*cs != *ct) return 1; cs++; ct++; tcount--; } while (tcount); return 0; } #endif static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount) { /* * Be careful about RCU walk racing with rename: * use 'READ_ONCE' to fetch the name pointer. * * NOTE! Even if a rename will mean that the length * was not loaded atomically, we don't care. The * RCU walk will check the sequence count eventually, * and catch it. And we won't overrun the buffer, * because we're reading the name pointer atomically, * and a dentry name is guaranteed to be properly * terminated with a NUL byte. 
* * End result: even if 'len' is wrong, we'll exit * early because the data cannot match (there can * be no NUL in the ct/tcount data) */ const unsigned char *cs = READ_ONCE(dentry->d_name.name); return dentry_string_cmp(cs, ct, tcount); } /* * long names are allocated separately from dentry and never modified. * Refcounted, freeing is RCU-delayed. See take_dentry_name_snapshot() * for the reason why ->count and ->head can't be combined into a union. * dentry_string_cmp() relies upon ->name[] being word-aligned. */ struct external_name { atomic_t count; struct rcu_head head; unsigned char name[] __aligned(sizeof(unsigned long)); }; static inline struct external_name *external_name(struct dentry *dentry) { return container_of(dentry->d_name.name, struct external_name, name[0]); } static void __d_free(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); kmem_cache_free(dentry_cache, dentry); } static void __d_free_external(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); kfree(external_name(dentry)); kmem_cache_free(dentry_cache, dentry); } static inline int dname_external(const struct dentry *dentry) { return dentry->d_name.name != dentry->d_shortname.string; } void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry) { unsigned seq; const unsigned char *s; rcu_read_lock(); retry: seq = read_seqcount_begin(&dentry->d_seq); s = READ_ONCE(dentry->d_name.name); name->name.hash_len = dentry->d_name.hash_len; name->name.name = name->inline_name.string; if (likely(s == dentry->d_shortname.string)) { name->inline_name = dentry->d_shortname; } else { struct external_name *p; p = container_of(s, struct external_name, name[0]); // get a valid reference if (unlikely(!atomic_inc_not_zero(&p->count))) goto retry; name->name.name = s; } if (read_seqcount_retry(&dentry->d_seq, seq)) { release_dentry_name_snapshot(name); goto retry; } rcu_read_unlock(); } EXPORT_SYMBOL(take_dentry_name_snapshot); void release_dentry_name_snapshot(struct name_snapshot *name) { if (unlikely(name->name.name != name->inline_name.string)) { struct external_name *p; p = container_of(name->name.name, struct external_name, name[0]); if (unlikely(atomic_dec_and_test(&p->count))) kfree_rcu(p, head); } } EXPORT_SYMBOL(release_dentry_name_snapshot); static inline void __d_set_inode_and_type(struct dentry *dentry, struct inode *inode, unsigned type_flags) { unsigned flags; dentry->d_inode = inode; flags = READ_ONCE(dentry->d_flags); flags &= ~DCACHE_ENTRY_TYPE; flags |= type_flags; smp_store_release(&dentry->d_flags, flags); } static inline void __d_clear_type_and_inode(struct dentry *dentry) { unsigned flags = READ_ONCE(dentry->d_flags); flags &= ~DCACHE_ENTRY_TYPE; WRITE_ONCE(dentry->d_flags, flags); dentry->d_inode = NULL; /* * The negative counter only tracks dentries on the LRU. Don't inc if * d_lru is on another list. 
*/ if ((flags & (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST) this_cpu_inc(nr_dentry_negative); } static void dentry_free(struct dentry *dentry) { WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); if (unlikely(dname_external(dentry))) { struct external_name *p = external_name(dentry); if (likely(atomic_dec_and_test(&p->count))) { call_rcu(&dentry->d_u.d_rcu, __d_free_external); return; } } /* if dentry was never visible to RCU, immediate free is OK */ if (dentry->d_flags & DCACHE_NORCU) __d_free(&dentry->d_u.d_rcu); else call_rcu(&dentry->d_u.d_rcu, __d_free); } /* * Release the dentry's inode, using the filesystem * d_iput() operation if defined. */ static void dentry_unlink_inode(struct dentry * dentry) __releases(dentry->d_lock) __releases(dentry->d_inode->i_lock) { struct inode *inode = dentry->d_inode; raw_write_seqcount_begin(&dentry->d_seq); __d_clear_type_and_inode(dentry); hlist_del_init(&dentry->d_u.d_alias); raw_write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); if (!inode->i_nlink) fsnotify_inoderemove(inode); if (dentry->d_op && dentry->d_op->d_iput) dentry->d_op->d_iput(dentry, inode); else iput(inode); } /* * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry * is in use - which includes both the "real" per-superblock * LRU list _and_ the DCACHE_SHRINK_LIST use. * * The DCACHE_SHRINK_LIST bit is set whenever the dentry is * on the shrink list (ie not on the superblock LRU list). * * The per-cpu "nr_dentry_unused" counters are updated with * the DCACHE_LRU_LIST bit. * * The per-cpu "nr_dentry_negative" counters are only updated * when deleted from or added to the per-superblock LRU list, not * from/to the shrink list. That is to avoid an unneeded dec/inc * pair when moving from LRU to shrink list in select_collect(). * * These helper functions make sure we always follow the * rules. d_lock must be held by the caller. */ #define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x)) static void d_lru_add(struct dentry *dentry) { D_FLAG_VERIFY(dentry, 0); dentry->d_flags |= DCACHE_LRU_LIST; this_cpu_inc(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_inc(nr_dentry_negative); WARN_ON_ONCE(!list_lru_add_obj( &dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } static void d_lru_del(struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); WARN_ON_ONCE(!list_lru_del_obj( &dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } static void d_shrink_del(struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); list_del_init(&dentry->d_lru); dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); this_cpu_dec(nr_dentry_unused); } static void d_shrink_add(struct dentry *dentry, struct list_head *list) { D_FLAG_VERIFY(dentry, 0); list_add(&dentry->d_lru, list); dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST; this_cpu_inc(nr_dentry_unused); } /* * These can only be called under the global LRU lock, ie during the * callback for freeing the LRU list. "isolate" removes it from the * LRU lists entirely, while shrink_move moves it to the indicated * private list. 
*/ static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); list_lru_isolate(lru, &dentry->d_lru); } static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry, struct list_head *list) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags |= DCACHE_SHRINK_LIST; if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); list_lru_isolate_move(lru, &dentry->d_lru, list); } static void ___d_drop(struct dentry *dentry) { struct hlist_bl_head *b; /* * Hashed dentries are normally on the dentry hashtable, * with the exception of those newly allocated by * d_obtain_root, which are always IS_ROOT: */ if (unlikely(IS_ROOT(dentry))) b = &dentry->d_sb->s_roots; else b = d_hash(dentry->d_name.hash); hlist_bl_lock(b); __hlist_bl_del(&dentry->d_hash); hlist_bl_unlock(b); } void __d_drop(struct dentry *dentry) { if (!d_unhashed(dentry)) { ___d_drop(dentry); dentry->d_hash.pprev = NULL; write_seqcount_invalidate(&dentry->d_seq); } } EXPORT_SYMBOL(__d_drop); /** * d_drop - drop a dentry * @dentry: dentry to drop * * d_drop() unhashes the entry from the parent dentry hashes, so that it won't * be found through a VFS lookup any more. Note that this is different from * deleting the dentry - d_delete will try to mark the dentry negative if * possible, giving a successful _negative_ lookup, while d_drop will * just make the cache lookup fail. * * d_drop() is used mainly for stuff that wants to invalidate a dentry for some * reason (NFS timeouts or autofs deletes). * * __d_drop requires dentry->d_lock * * ___d_drop doesn't mark dentry as "unhashed" * (dentry->d_hash.pprev will be LIST_POISON2, not NULL). */ void d_drop(struct dentry *dentry) { spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(d_drop); static inline void dentry_unlist(struct dentry *dentry) { struct dentry *next; /* * Inform d_walk() and shrink_dentry_list() that we are no longer * attached to the dentry tree */ dentry->d_flags |= DCACHE_DENTRY_KILLED; if (unlikely(hlist_unhashed(&dentry->d_sib))) return; __hlist_del(&dentry->d_sib); /* * Cursors can move around the list of children. While we'd been * a normal list member, it didn't matter - ->d_sib.next would've * been updated. However, from now on it won't be and for the * things like d_walk() it might end up with a nasty surprise. * Normally d_walk() doesn't care about cursors moving around - * ->d_lock on parent prevents that and since a cursor has no children * of its own, we get through it without ever unlocking the parent. * There is one exception, though - if we ascend from a child that * gets killed as soon as we unlock it, the next sibling is found * using the value left in its ->d_sib.next. And if _that_ * pointed to a cursor, and cursor got moved (e.g. by lseek()) * before d_walk() regains parent->d_lock, we'll end up skipping * everything the cursor had been moved past. * * Solution: make sure that the pointer left behind in ->d_sib.next * points to something that won't be moving around. I.e. skip the * cursors. 
*/ while (dentry->d_sib.next) { next = hlist_entry(dentry->d_sib.next, struct dentry, d_sib); if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR))) break; dentry->d_sib.next = next->d_sib.next; } } static struct dentry *__dentry_kill(struct dentry *dentry) { struct dentry *parent = NULL; bool can_free = true; /* * The dentry is now unrecoverably dead to the world. */ lockref_mark_dead(&dentry->d_lockref); /* * inform the fs via d_prune that this dentry is about to be * unhashed and destroyed. */ if (dentry->d_flags & DCACHE_OP_PRUNE) dentry->d_op->d_prune(dentry); if (dentry->d_flags & DCACHE_LRU_LIST) { if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) d_lru_del(dentry); } /* if it was on the hash then remove it */ __d_drop(dentry); if (dentry->d_inode) dentry_unlink_inode(dentry); else spin_unlock(&dentry->d_lock); this_cpu_dec(nr_dentry); if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); cond_resched(); /* now that it's negative, ->d_parent is stable */ if (!IS_ROOT(dentry)) { parent = dentry->d_parent; spin_lock(&parent->d_lock); } spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); dentry_unlist(dentry); if (dentry->d_flags & DCACHE_SHRINK_LIST) can_free = false; spin_unlock(&dentry->d_lock); if (likely(can_free)) dentry_free(dentry); if (parent && --parent->d_lockref.count) { spin_unlock(&parent->d_lock); return NULL; } return parent; } /* * Lock a dentry for feeding it to __dentry_kill(). * Called under rcu_read_lock() and dentry->d_lock; the former * guarantees that nothing we access will be freed under us. * Note that dentry is *not* protected from concurrent dentry_kill(), * d_delete(), etc. * * Return false if dentry is busy. Otherwise, return true and have * that dentry's inode locked. */ static bool lock_for_kill(struct dentry *dentry) { struct inode *inode = dentry->d_inode; if (unlikely(dentry->d_lockref.count)) return false; if (!inode || likely(spin_trylock(&inode->i_lock))) return true; do { spin_unlock(&dentry->d_lock); spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); if (likely(inode == dentry->d_inode)) break; spin_unlock(&inode->i_lock); inode = dentry->d_inode; } while (inode); if (likely(!dentry->d_lockref.count)) return true; if (inode) spin_unlock(&inode->i_lock); return false; } /* * Decide if dentry is worth retaining. Usually this is called with dentry * locked; if not locked, we are more limited and might not be able to tell * without a lock. False in this case means "punt to locked path and recheck". * * In case we aren't locked, these predicates are not "stable". However, it is * sufficient that at some point after we dropped the reference the dentry was * hashed and the flags had the proper value. Other dentry users may have * re-gotten a reference to the dentry and change that, but our work is done - * we can leave the dentry around with a zero refcount. */ static inline bool retain_dentry(struct dentry *dentry, bool locked) { unsigned int d_flags; smp_rmb(); d_flags = READ_ONCE(dentry->d_flags); // Unreachable? 
Nobody would be able to look it up, no point retaining if (unlikely(d_unhashed(dentry))) return false; // Same if it's disconnected if (unlikely(d_flags & DCACHE_DISCONNECTED)) return false; // ->d_delete() might tell us not to bother, but that requires // ->d_lock; can't decide without it if (unlikely(d_flags & DCACHE_OP_DELETE)) { if (!locked || dentry->d_op->d_delete(dentry)) return false; } // Explicitly told not to bother if (unlikely(d_flags & DCACHE_DONTCACHE)) return false; // At this point it looks like we ought to keep it. We also might // need to do something - put it on LRU if it wasn't there already // and mark it referenced if it was on LRU, but not marked yet. // Unfortunately, both actions require ->d_lock, so in lockless // case we'd have to punt rather than doing those. if (unlikely(!(d_flags & DCACHE_LRU_LIST))) { if (!locked) return false; d_lru_add(dentry); } else if (unlikely(!(d_flags & DCACHE_REFERENCED))) { if (!locked) return false; dentry->d_flags |= DCACHE_REFERENCED; } return true; } void d_mark_dontcache(struct inode *inode) { struct dentry *de; spin_lock(&inode->i_lock); hlist_for_each_entry(de, &inode->i_dentry, d_u.d_alias) { spin_lock(&de->d_lock); de->d_flags |= DCACHE_DONTCACHE; spin_unlock(&de->d_lock); } inode_state_set(inode, I_DONTCACHE); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_mark_dontcache); /* * Try to do a lockless dput(), and return whether that was successful. * * If unsuccessful, we return false, having already taken the dentry lock. * In that case refcount is guaranteed to be zero and we have already * decided that it's not worth keeping around. * * The caller needs to hold the RCU read lock, so that the dentry is * guaranteed to stay around even if the refcount goes down to zero! */ static inline bool fast_dput(struct dentry *dentry) { int ret; /* * try to decrement the lockref optimistically. */ ret = lockref_put_return(&dentry->d_lockref); /* * If the lockref_put_return() failed due to the lock being held * by somebody else, the fast path has failed. We will need to * get the lock, and then check the count again. */ if (unlikely(ret < 0)) { spin_lock(&dentry->d_lock); if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) { spin_unlock(&dentry->d_lock); return true; } dentry->d_lockref.count--; goto locked; } /* * If we weren't the last ref, we're done. */ if (ret) return true; /* * Can we decide that decrement of refcount is all we needed without * taking the lock? There's a very common case when it's all we need - * dentry looks like it ought to be retained and there's nothing else * to do. */ if (retain_dentry(dentry, false)) return true; /* * Either not worth retaining or we can't tell without the lock. * Get the lock, then. We've already decremented the refcount to 0, * but we'll need to re-check the situation after getting the lock. */ spin_lock(&dentry->d_lock); /* * Did somebody else grab a reference to it in the meantime, and * we're no longer the last user after all? Alternatively, somebody * else could have killed it and marked it dead. Either way, we * don't need to do anything else. 
*/ locked: if (dentry->d_lockref.count || retain_dentry(dentry, true)) { spin_unlock(&dentry->d_lock); return true; } return false; } static void finish_dput(struct dentry *dentry) __releases(dentry->d_lock) __releases(RCU) { while (lock_for_kill(dentry)) { rcu_read_unlock(); dentry = __dentry_kill(dentry); if (!dentry) return; if (retain_dentry(dentry, true)) { spin_unlock(&dentry->d_lock); return; } rcu_read_lock(); } rcu_read_unlock(); spin_unlock(&dentry->d_lock); } /* * This is dput * * This is complicated by the fact that we do not want to put * dentries that are no longer on any hash chain on the unused * list: we'd much rather just get rid of them immediately. * * However, that implies that we have to traverse the dentry * tree upwards to the parents which might _also_ now be * scheduled for deletion (it may have been only waiting for * its last child to go away). * * This tail recursion is done by hand as we don't want to depend * on the compiler to always get this right (gcc generally doesn't). * Real recursion would eat up our stack space. */ /* * dput - release a dentry * @dentry: dentry to release * * Release a dentry. This will drop the usage count and if appropriate * call the dentry unlink method as well as removing it from the queues and * releasing its resources. If the parent dentries were scheduled for release * they too may now get deleted. */ void dput(struct dentry *dentry) { if (!dentry) return; might_sleep(); rcu_read_lock(); if (likely(fast_dput(dentry))) { rcu_read_unlock(); return; } finish_dput(dentry); } EXPORT_SYMBOL(dput); void d_make_discardable(struct dentry *dentry) { spin_lock(&dentry->d_lock); WARN_ON(!(dentry->d_flags & DCACHE_PERSISTENT)); dentry->d_flags &= ~DCACHE_PERSISTENT; dentry->d_lockref.count--; rcu_read_lock(); finish_dput(dentry); } EXPORT_SYMBOL(d_make_discardable); static void to_shrink_list(struct dentry *dentry, struct list_head *list) __must_hold(&dentry->d_lock) { if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { if (dentry->d_flags & DCACHE_LRU_LIST) d_lru_del(dentry); d_shrink_add(dentry, list); } } void dput_to_list(struct dentry *dentry, struct list_head *list) { rcu_read_lock(); if (likely(fast_dput(dentry))) { rcu_read_unlock(); return; } rcu_read_unlock(); to_shrink_list(dentry, list); spin_unlock(&dentry->d_lock); } struct dentry *dget_parent(struct dentry *dentry) { int gotref; struct dentry *ret; unsigned seq; /* * Do optimistic parent lookup without any * locking. */ rcu_read_lock(); seq = raw_seqcount_begin(&dentry->d_seq); ret = READ_ONCE(dentry->d_parent); gotref = lockref_get_not_zero(&ret->d_lockref); rcu_read_unlock(); if (likely(gotref)) { if (!read_seqcount_retry(&dentry->d_seq, seq)) return ret; dput(ret); } repeat: /* * Don't need rcu_dereference because we re-check it was correct under * the lock. 
*/ rcu_read_lock(); ret = dentry->d_parent; spin_lock(&ret->d_lock); if (unlikely(ret != dentry->d_parent)) { spin_unlock(&ret->d_lock); rcu_read_unlock(); goto repeat; } rcu_read_unlock(); BUG_ON(!ret->d_lockref.count); ret->d_lockref.count++; spin_unlock(&ret->d_lock); return ret; } EXPORT_SYMBOL(dget_parent); static struct dentry * __d_find_any_alias(struct inode *inode) { struct dentry *alias; if (hlist_empty(&inode->i_dentry)) return NULL; alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); lockref_get(&alias->d_lockref); return alias; } /** * d_find_any_alias - find any alias for a given inode * @inode: inode to find an alias for * * If any aliases exist for the given inode, take and return a * reference for one of them. If no aliases exist, return %NULL. */ struct dentry *d_find_any_alias(struct inode *inode) { struct dentry *de; spin_lock(&inode->i_lock); de = __d_find_any_alias(inode); spin_unlock(&inode->i_lock); return de; } EXPORT_SYMBOL(d_find_any_alias); static struct dentry *__d_find_alias(struct inode *inode) { struct dentry *alias; if (S_ISDIR(inode->i_mode)) return __d_find_any_alias(inode); hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { spin_lock(&alias->d_lock); if (!d_unhashed(alias)) { dget_dlock(alias); spin_unlock(&alias->d_lock); return alias; } spin_unlock(&alias->d_lock); } return NULL; } /** * d_find_alias - grab a hashed alias of inode * @inode: inode in question * * If inode has a hashed alias, or is a directory and has any alias, * acquire the reference to alias and return it. Otherwise return NULL. * Notice that if inode is a directory there can be only one alias and * it can be unhashed only if it has no children, or if it is the root * of a filesystem, or if the directory was renamed and d_revalidate * was the first vfs operation to notice. * * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer * any other hashed alias over that one. */ struct dentry *d_find_alias(struct inode *inode) { struct dentry *de = NULL; if (!hlist_empty(&inode->i_dentry)) { spin_lock(&inode->i_lock); de = __d_find_alias(inode); spin_unlock(&inode->i_lock); } return de; } EXPORT_SYMBOL(d_find_alias); /* * Caller MUST be holding rcu_read_lock() and be guaranteed * that inode won't get freed until rcu_read_unlock(). */ struct dentry *d_find_alias_rcu(struct inode *inode) { struct hlist_head *l = &inode->i_dentry; struct dentry *de = NULL; spin_lock(&inode->i_lock); // ->i_dentry and ->i_rcu are colocated, but the latter won't be // used without having I_FREEING set, which means no aliases left if (likely(!(inode_state_read(inode) & I_FREEING) && !hlist_empty(l))) { if (S_ISDIR(inode->i_mode)) { de = hlist_entry(l->first, struct dentry, d_u.d_alias); } else { hlist_for_each_entry(de, l, d_u.d_alias) if (!d_unhashed(de)) break; } } spin_unlock(&inode->i_lock); return de; } /** * d_dispose_if_unused - move unreferenced dentries to shrink list * @dentry: dentry in question * @dispose: head of shrink list * * If dentry has no external references, move it to shrink list. * * NOTE!!! The caller is responsible for preventing eviction of the dentry by * holding dentry->d_inode->i_lock or equivalent. */ void d_dispose_if_unused(struct dentry *dentry, struct list_head *dispose) { spin_lock(&dentry->d_lock); if (!dentry->d_lockref.count) to_shrink_list(dentry, dispose); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(d_dispose_if_unused); /* * Try to kill dentries associated with this inode. * WARNING: you must own a reference to inode. 
*/ void d_prune_aliases(struct inode *inode) { LIST_HEAD(dispose); struct dentry *dentry; spin_lock(&inode->i_lock); hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) d_dispose_if_unused(dentry, &dispose); spin_unlock(&inode->i_lock); shrink_dentry_list(&dispose); } EXPORT_SYMBOL(d_prune_aliases); static inline void shrink_kill(struct dentry *victim) { do { rcu_read_unlock(); victim = __dentry_kill(victim); rcu_read_lock(); } while (victim && lock_for_kill(victim)); rcu_read_unlock(); if (victim) spin_unlock(&victim->d_lock); } void shrink_dentry_list(struct list_head *list) { while (!list_empty(list)) { struct dentry *dentry; dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); rcu_read_lock(); if (!lock_for_kill(dentry)) { bool can_free; rcu_read_unlock(); d_shrink_del(dentry); can_free = dentry->d_flags & DCACHE_DENTRY_KILLED; spin_unlock(&dentry->d_lock); if (can_free) dentry_free(dentry); continue; } d_shrink_del(dentry); shrink_kill(dentry); } } EXPORT_SYMBOL(shrink_dentry_list); static enum lru_status dentry_lru_isolate(struct list_head *item, struct list_lru_one *lru, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); /* * we are inverting the lru lock/dentry->d_lock here, * so use a trylock. If we fail to get the lock, just skip * it */ if (!spin_trylock(&dentry->d_lock)) return LRU_SKIP; /* * Referenced dentries are still in use. If they have active * counts, just remove them from the LRU. Otherwise give them * another pass through the LRU. */ if (dentry->d_lockref.count) { d_lru_isolate(lru, dentry); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } if (dentry->d_flags & DCACHE_REFERENCED) { dentry->d_flags &= ~DCACHE_REFERENCED; spin_unlock(&dentry->d_lock); /* * The list move itself will be made by the common LRU code. At * this point, we've dropped the dentry->d_lock but keep the * lru lock. This is safe to do, since every list movement is * protected by the lru lock even if both locks are held. * * This is guaranteed by the fact that all LRU management * functions are intermediated by the LRU API calls like * list_lru_add_obj and list_lru_del_obj. List movement in this file * only ever occur through this functions or through callbacks * like this one, that are called from the LRU API. * * The only exceptions to this are functions like * shrink_dentry_list, and code that first checks for the * DCACHE_SHRINK_LIST flag. Those are guaranteed to be * operating only with stack provided lists after they are * properly isolated from the main list. It is thus, always a * local access. */ return LRU_ROTATE; } d_lru_shrink_move(lru, dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } /** * prune_dcache_sb - shrink the dcache * @sb: superblock * @sc: shrink control, passed to list_lru_shrink_walk() * * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This * is done when we need more memory and called from the superblock shrinker * function. * * This function may fail to free any resources if all the dentries are in * use. 
*/ long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc) { LIST_HEAD(dispose); long freed; freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc, dentry_lru_isolate, &dispose); shrink_dentry_list(&dispose); return freed; } static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, struct list_lru_one *lru, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); /* * we are inverting the lru lock/dentry->d_lock here, * so use a trylock. If we fail to get the lock, just skip * it */ if (!spin_trylock(&dentry->d_lock)) return LRU_SKIP; d_lru_shrink_move(lru, dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } /** * shrink_dcache_sb - shrink dcache for a superblock * @sb: superblock * * Shrink the dcache for the specified super block. This is used to free * the dcache before unmounting a file system. */ void shrink_dcache_sb(struct super_block *sb) { do { LIST_HEAD(dispose); list_lru_walk(&sb->s_dentry_lru, dentry_lru_isolate_shrink, &dispose, 1024); shrink_dentry_list(&dispose); } while (list_lru_count(&sb->s_dentry_lru) > 0); } EXPORT_SYMBOL(shrink_dcache_sb); /** * enum d_walk_ret - action to take during tree walk * @D_WALK_CONTINUE: continue walk * @D_WALK_QUIT: quit walk * @D_WALK_NORETRY: quit when retry is needed * @D_WALK_SKIP: skip this dentry and its children */ enum d_walk_ret { D_WALK_CONTINUE, D_WALK_QUIT, D_WALK_NORETRY, D_WALK_SKIP, }; /** * d_walk - walk the dentry tree * @parent: start of walk * @data: data passed to @enter() and @finish() * @enter: callback when first entering the dentry * * The @enter() callbacks are called with d_lock held. */ static void d_walk(struct dentry *parent, void *data, enum d_walk_ret (*enter)(void *, struct dentry *)) { struct dentry *this_parent, *dentry; unsigned seq = 0; enum d_walk_ret ret; bool retry = true; again: read_seqbegin_or_lock(&rename_lock, &seq); this_parent = parent; spin_lock(&this_parent->d_lock); ret = enter(data, this_parent); switch (ret) { case D_WALK_CONTINUE: break; case D_WALK_QUIT: case D_WALK_SKIP: goto out_unlock; case D_WALK_NORETRY: retry = false; break; } repeat: dentry = d_first_child(this_parent); resume: hlist_for_each_entry_from(dentry, d_sib) { if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR)) continue; spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); ret = enter(data, dentry); switch (ret) { case D_WALK_CONTINUE: break; case D_WALK_QUIT: spin_unlock(&dentry->d_lock); goto out_unlock; case D_WALK_NORETRY: retry = false; break; case D_WALK_SKIP: spin_unlock(&dentry->d_lock); continue; } if (!hlist_empty(&dentry->d_children)) { spin_unlock(&this_parent->d_lock); spin_release(&dentry->d_lock.dep_map, _RET_IP_); this_parent = dentry; spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); goto repeat; } spin_unlock(&dentry->d_lock); } /* * All done at this level ... ascend and resume the search. */ rcu_read_lock(); ascend: if (this_parent != parent) { dentry = this_parent; this_parent = dentry->d_parent; spin_unlock(&dentry->d_lock); spin_lock(&this_parent->d_lock); /* might go back up the wrong parent if we have had a rename. 
*/ if (need_seqretry(&rename_lock, seq)) goto rename_retry; /* go into the first sibling still alive */ hlist_for_each_entry_continue(dentry, d_sib) { if (likely(!(dentry->d_flags & DCACHE_DENTRY_KILLED))) { rcu_read_unlock(); goto resume; } } goto ascend; } if (need_seqretry(&rename_lock, seq)) goto rename_retry; rcu_read_unlock(); out_unlock: spin_unlock(&this_parent->d_lock); done_seqretry(&rename_lock, seq); return; rename_retry: spin_unlock(&this_parent->d_lock); rcu_read_unlock(); BUG_ON(seq & 1); if (!retry) return; seq = 1; goto again; } struct check_mount { struct vfsmount *mnt; unsigned int mounted; }; /* locks: mount_locked_reader && dentry->d_lock */ static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry) { struct check_mount *info = data; struct path path = { .mnt = info->mnt, .dentry = dentry }; if (likely(!d_mountpoint(dentry))) return D_WALK_CONTINUE; if (__path_is_mountpoint(&path)) { info->mounted = 1; return D_WALK_QUIT; } return D_WALK_CONTINUE; } /** * path_has_submounts - check for mounts over a dentry in the * current namespace. * @parent: path to check. * * Return true if the parent or its subdirectories contain * a mount point in the current namespace. */ int path_has_submounts(const struct path *parent) { struct check_mount data = { .mnt = parent->mnt, .mounted = 0 }; guard(mount_locked_reader)(); d_walk(parent->dentry, &data, path_check_mount); return data.mounted; } EXPORT_SYMBOL(path_has_submounts); /* * Called by mount code to set a mountpoint and check if the mountpoint is * reachable (e.g. NFS can unhash a directory dentry and then the complete * subtree can become unreachable). * * Only one of d_invalidate() and d_set_mounted() must succeed. For * this reason take rename_lock and d_lock on dentry and ancestors. */ int d_set_mounted(struct dentry *dentry) { struct dentry *p; int ret = -ENOENT; read_seqlock_excl(&rename_lock); for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) { /* Need exclusion wrt. d_invalidate() */ spin_lock(&p->d_lock); if (unlikely(d_unhashed(p))) { spin_unlock(&p->d_lock); goto out; } spin_unlock(&p->d_lock); } spin_lock(&dentry->d_lock); if (!d_unlinked(dentry)) { ret = -EBUSY; if (!d_mountpoint(dentry)) { dentry->d_flags |= DCACHE_MOUNTED; ret = 0; } } spin_unlock(&dentry->d_lock); out: read_sequnlock_excl(&rename_lock); return ret; } /* * Search the dentry child list of the specified parent, * and move any unused dentries to the end of the unused * list for prune_dcache(). We descend to the next level * whenever the d_children list is non-empty and continue * searching. * * It returns zero iff there are no unused children, * otherwise it returns the number of children moved to * the end of the unused list. This may not be the total * number of unused children, because select_parent can * drop the lock and return early due to latency * constraints. */ struct select_data { struct dentry *start; union { long found; struct dentry *victim; }; struct list_head dispose; }; static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) { struct select_data *data = _data; enum d_walk_ret ret = D_WALK_CONTINUE; if (data->start == dentry) goto out; if (dentry->d_flags & DCACHE_SHRINK_LIST) { data->found++; } else if (!dentry->d_lockref.count) { to_shrink_list(dentry, &data->dispose); data->found++; } else if (dentry->d_lockref.count < 0) { data->found++; } /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find * the rest. 
*/ if (!list_empty(&data->dispose)) ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; out: return ret; } static enum d_walk_ret select_collect_umount(void *_data, struct dentry *dentry) { if (dentry->d_flags & DCACHE_PERSISTENT) { dentry->d_flags &= ~DCACHE_PERSISTENT; dentry->d_lockref.count--; } return select_collect(_data, dentry); } static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry) { struct select_data *data = _data; enum d_walk_ret ret = D_WALK_CONTINUE; if (data->start == dentry) goto out; if (!dentry->d_lockref.count) { if (dentry->d_flags & DCACHE_SHRINK_LIST) { rcu_read_lock(); data->victim = dentry; return D_WALK_QUIT; } to_shrink_list(dentry, &data->dispose); } /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find * the rest. */ if (!list_empty(&data->dispose)) ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; out: return ret; } /** * shrink_dcache_tree - prune dcache * @parent: parent of entries to prune * @for_umount: true if we want to unpin the persistent ones * * Prune the dcache to remove unused children of the parent dentry. */ static void shrink_dcache_tree(struct dentry *parent, bool for_umount) { for (;;) { struct select_data data = {.start = parent}; INIT_LIST_HEAD(&data.dispose); d_walk(parent, &data, for_umount ? select_collect_umount : select_collect); if (!list_empty(&data.dispose)) { shrink_dentry_list(&data.dispose); continue; } cond_resched(); if (!data.found) break; data.victim = NULL; d_walk(parent, &data, select_collect2); if (data.victim) { spin_lock(&data.victim->d_lock); if (!lock_for_kill(data.victim)) { spin_unlock(&data.victim->d_lock); rcu_read_unlock(); } else { shrink_kill(data.victim); } } if (!list_empty(&data.dispose)) shrink_dentry_list(&data.dispose); } } void shrink_dcache_parent(struct dentry *parent) { shrink_dcache_tree(parent, false); } EXPORT_SYMBOL(shrink_dcache_parent); static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) { /* it has busy descendents; complain about those instead */ if (!hlist_empty(&dentry->d_children)) return D_WALK_CONTINUE; /* root with refcount 1 is fine */ if (dentry == _data && dentry->d_lockref.count == 1) return D_WALK_CONTINUE; WARN(1, "BUG: Dentry %p{i=%llx,n=%pd} " " still in use (%d) [unmount of %s %s]\n", dentry, dentry->d_inode ? 
dentry->d_inode->i_ino : (u64)0, dentry, dentry->d_lockref.count, dentry->d_sb->s_type->name, dentry->d_sb->s_id); return D_WALK_CONTINUE; } static void do_one_tree(struct dentry *dentry) { shrink_dcache_tree(dentry, true); d_walk(dentry, dentry, umount_check); d_drop(dentry); dput(dentry); } /* * destroy the dentries attached to a superblock on unmounting */ void shrink_dcache_for_umount(struct super_block *sb) { struct dentry *dentry; rwsem_assert_held_write(&sb->s_umount); dentry = sb->s_root; sb->s_root = NULL; do_one_tree(dentry); while (!hlist_bl_empty(&sb->s_roots)) { dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_roots), struct dentry, d_hash)); do_one_tree(dentry); } } static enum d_walk_ret find_submount(void *_data, struct dentry *dentry) { struct dentry **victim = _data; if (d_mountpoint(dentry)) { *victim = dget_dlock(dentry); return D_WALK_QUIT; } return D_WALK_CONTINUE; } /** * d_invalidate - detach submounts, prune dcache, and drop * @dentry: dentry to invalidate (aka detach, prune and drop) */ void d_invalidate(struct dentry *dentry) { bool had_submounts = false; spin_lock(&dentry->d_lock); if (d_unhashed(dentry)) { spin_unlock(&dentry->d_lock); return; } __d_drop(dentry); spin_unlock(&dentry->d_lock); /* Negative dentries can be dropped without further checks */ if (!dentry->d_inode) return; shrink_dcache_parent(dentry); for (;;) { struct dentry *victim = NULL; d_walk(dentry, &victim, find_submount); if (!victim) { if (had_submounts) shrink_dcache_parent(dentry); return; } had_submounts = true; detach_mounts(victim); dput(victim); } } EXPORT_SYMBOL(d_invalidate); /** * __d_alloc - allocate a dcache entry * @sb: filesystem it will belong to * @name: qstr of the name * * Allocates a dentry. It returns %NULL if there is insufficient memory * available. On a success the dentry is returned. The name passed in is * copied and the copy passed in may be reused after this call. */ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) { struct dentry *dentry; char *dname; int err; dentry = kmem_cache_alloc_lru(dentry_cache, &sb->s_dentry_lru, GFP_KERNEL); if (!dentry) return NULL; /* * We guarantee that the inline name is always NUL-terminated. 
* This way the memcpy() done by the name switching in rename * will still always have a NUL at the end, even if we might * be overwriting an internal NUL character */ dentry->d_shortname.string[DNAME_INLINE_LEN-1] = 0; if (unlikely(!name)) { name = &slash_name; dname = dentry->d_shortname.string; } else if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct external_name, name[1]); struct external_name *p = kmalloc(size + name->len, GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); if (!p) { kmem_cache_free(dentry_cache, dentry); return NULL; } atomic_set(&p->count, 1); dname = p->name; } else { dname = dentry->d_shortname.string; } dentry->__d_name.len = name->len; dentry->__d_name.hash = name->hash; memcpy(dname, name->name, name->len); dname[name->len] = 0; /* Make sure we always see the terminating NUL character */ smp_store_release(&dentry->__d_name.name, dname); /* ^^^ */ dentry->d_flags = 0; lockref_init(&dentry->d_lockref); seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock); dentry->d_inode = NULL; dentry->d_parent = dentry; dentry->d_sb = sb; dentry->d_op = sb->__s_d_op; dentry->d_flags = sb->s_d_flags; dentry->d_fsdata = NULL; INIT_HLIST_BL_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_HLIST_HEAD(&dentry->d_children); INIT_HLIST_NODE(&dentry->d_u.d_alias); INIT_HLIST_NODE(&dentry->d_sib); if (dentry->d_op && dentry->d_op->d_init) { err = dentry->d_op->d_init(dentry); if (err) { if (dname_external(dentry)) kfree(external_name(dentry)); kmem_cache_free(dentry_cache, dentry); return NULL; } } this_cpu_inc(nr_dentry); return dentry; } /** * d_alloc - allocate a dcache entry * @parent: parent of entry to allocate * @name: qstr of the name * * Allocates a dentry. It returns %NULL if there is insufficient memory * available. On a success the dentry is returned. The name passed in is * copied and the copy passed in may be reused after this call. */ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) { struct dentry *dentry = __d_alloc(parent->d_sb, name); if (!dentry) return NULL; spin_lock(&parent->d_lock); /* * don't need child lock because it is not subject * to concurrency here */ dentry->d_parent = dget_dlock(parent); hlist_add_head(&dentry->d_sib, &parent->d_children); spin_unlock(&parent->d_lock); return dentry; } EXPORT_SYMBOL(d_alloc); struct dentry *d_alloc_anon(struct super_block *sb) { return __d_alloc(sb, NULL); } EXPORT_SYMBOL(d_alloc_anon); struct dentry *d_alloc_cursor(struct dentry * parent) { struct dentry *dentry = d_alloc_anon(parent->d_sb); if (dentry) { dentry->d_flags |= DCACHE_DENTRY_CURSOR; dentry->d_parent = dget(parent); } return dentry; } /** * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems) * @sb: the superblock * @name: qstr of the name * * For a filesystem that just pins its dentries in memory and never * performs lookups at all, return an unhashed IS_ROOT dentry. * This is used for pipes, sockets et.al. - the stuff that should * never be anyone's children or parents. Unlike all other * dentries, these will not have RCU delay between dropping the * last reference and freeing them. * * The only user is alloc_file_pseudo() and that's what should * be considered a public interface. Don't use directly. 
*/ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name) { static const struct dentry_operations anon_ops = { .d_dname = simple_dname }; struct dentry *dentry = __d_alloc(sb, name); if (likely(dentry)) { dentry->d_flags |= DCACHE_NORCU; /* d_op_flags(&anon_ops) is 0 */ if (!dentry->d_op) dentry->d_op = &anon_ops; } return dentry; } struct dentry *d_alloc_name(struct dentry *parent, const char *name) { struct qstr q; q.name = name; q.hash_len = hashlen_string(parent, name); return d_alloc(parent, &q); } EXPORT_SYMBOL(d_alloc_name); #define DCACHE_OP_FLAGS \ (DCACHE_OP_HASH | DCACHE_OP_COMPARE | DCACHE_OP_REVALIDATE | \ DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_DELETE | DCACHE_OP_PRUNE | \ DCACHE_OP_REAL) static unsigned int d_op_flags(const struct dentry_operations *op) { unsigned int flags = 0; if (op) { if (op->d_hash) flags |= DCACHE_OP_HASH; if (op->d_compare) flags |= DCACHE_OP_COMPARE; if (op->d_revalidate) flags |= DCACHE_OP_REVALIDATE; if (op->d_weak_revalidate) flags |= DCACHE_OP_WEAK_REVALIDATE; if (op->d_delete) flags |= DCACHE_OP_DELETE; if (op->d_prune) flags |= DCACHE_OP_PRUNE; if (op->d_real) flags |= DCACHE_OP_REAL; } return flags; } static void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) { unsigned int flags = d_op_flags(op); WARN_ON_ONCE(dentry->d_op); WARN_ON_ONCE(dentry->d_flags & DCACHE_OP_FLAGS); dentry->d_op = op; if (flags) dentry->d_flags |= flags; } void set_default_d_op(struct super_block *s, const struct dentry_operations *ops) { unsigned int flags = d_op_flags(ops); s->__s_d_op = ops; s->s_d_flags = (s->s_d_flags & ~DCACHE_OP_FLAGS) | flags; } EXPORT_SYMBOL(set_default_d_op); static unsigned d_flags_for_inode(struct inode *inode) { unsigned add_flags = DCACHE_REGULAR_TYPE; if (!inode) return DCACHE_MISS_TYPE; if (S_ISDIR(inode->i_mode)) { add_flags = DCACHE_DIRECTORY_TYPE; if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) { if (unlikely(!inode->i_op->lookup)) add_flags = DCACHE_AUTODIR_TYPE; else inode->i_opflags |= IOP_LOOKUP; } goto type_determined; } if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { if (unlikely(inode->i_op->get_link)) { add_flags = DCACHE_SYMLINK_TYPE; goto type_determined; } inode->i_opflags |= IOP_NOFOLLOW; } if (unlikely(!S_ISREG(inode->i_mode))) add_flags = DCACHE_SPECIAL_TYPE; type_determined: if (unlikely(IS_AUTOMOUNT(inode))) add_flags |= DCACHE_NEED_AUTOMOUNT; return add_flags; } static void __d_instantiate(struct dentry *dentry, struct inode *inode) { unsigned add_flags = d_flags_for_inode(inode); WARN_ON(d_in_lookup(dentry)); /* * The negative counter only tracks dentries on the LRU. Don't dec if * d_lru is on another list. */ if ((dentry->d_flags & (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST) this_cpu_dec(nr_dentry_negative); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); fsnotify_update_flags(dentry); } /** * d_instantiate - fill in inode information for a dentry * @entry: dentry to complete * @inode: inode to attach to this dentry * * Fill in inode information in the entry. * * This turns negative dentries into productive full members * of society. * * NOTE! This assumes that the inode count has been incremented * (or otherwise set) by the caller to indicate that it is now * in use by the dcache. 
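 *
 * Typical pattern in a filesystem ->create() method (sketch only;
 * foo_new_inode() is a made-up fs-specific allocator that returns an
 * inode whose reference is meant for the dcache):
 *
 *	inode = foo_new_inode(dir, mode);
 *	if (IS_ERR(inode))
 *		return PTR_ERR(inode);
 *	d_instantiate(dentry, inode);	// dentry is now a positive entry
 *	return 0;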
*/ void d_instantiate(struct dentry *entry, struct inode * inode) { BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); if (inode) { security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); spin_lock(&entry->d_lock); __d_instantiate(entry, inode); spin_unlock(&entry->d_lock); spin_unlock(&inode->i_lock); } } EXPORT_SYMBOL(d_instantiate); /* * This should be equivalent to d_instantiate() + unlock_new_inode(), * with lockdep-related part of unlock_new_inode() done before * anything else. Use that instead of open-coding d_instantiate()/ * unlock_new_inode() combinations. */ void d_instantiate_new(struct dentry *entry, struct inode *inode) { BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); BUG_ON(!inode); lockdep_annotate_inode_mutex_key(inode); security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); spin_lock(&entry->d_lock); __d_instantiate(entry, inode); spin_unlock(&entry->d_lock); WARN_ON(!(inode_state_read(inode) & I_NEW)); inode_state_clear(inode, I_NEW | I_CREATING); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_instantiate_new); struct dentry *d_make_root(struct inode *root_inode) { struct dentry *res = NULL; if (root_inode) { res = d_alloc_anon(root_inode->i_sb); if (res) d_instantiate(res, root_inode); else iput(root_inode); } return res; } EXPORT_SYMBOL(d_make_root); static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected) { struct super_block *sb; struct dentry *new, *res; if (!inode) return ERR_PTR(-ESTALE); if (IS_ERR(inode)) return ERR_CAST(inode); sb = inode->i_sb; res = d_find_any_alias(inode); /* existing alias? */ if (res) goto out; new = d_alloc_anon(sb); if (!new) { res = ERR_PTR(-ENOMEM); goto out; } security_d_instantiate(new, inode); spin_lock(&inode->i_lock); res = __d_find_any_alias(inode); /* recheck under lock */ if (likely(!res)) { /* still no alias, attach a disconnected dentry */ unsigned add_flags = d_flags_for_inode(inode); if (disconnected) add_flags |= DCACHE_DISCONNECTED; spin_lock(&new->d_lock); __d_set_inode_and_type(new, inode, add_flags); hlist_add_head(&new->d_u.d_alias, &inode->i_dentry); if (!disconnected) { hlist_bl_lock(&sb->s_roots); hlist_bl_add_head(&new->d_hash, &sb->s_roots); hlist_bl_unlock(&sb->s_roots); } spin_unlock(&new->d_lock); spin_unlock(&inode->i_lock); inode = NULL; /* consumed by new->d_inode */ res = new; } else { spin_unlock(&inode->i_lock); dput(new); } out: iput(inode); return res; } /** * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode * @inode: inode to allocate the dentry for * * Obtain a dentry for an inode resulting from NFS filehandle conversion or * similar open by handle operations. The returned dentry may be anonymous, * or may have a full name (if the inode was already in the cache). * * When called on a directory inode, we must ensure that the inode only ever * has one dentry. If a dentry is found, that is returned instead of * allocating a new one. * * On successful return, the reference to the inode has been transferred * to the dentry. In case of an error the reference on the inode is released. * To make it easier to use in export operations a %NULL or IS_ERR inode may * be passed in and the error will be propagated to the return value, * with a %NULL @inode replaced by ERR_PTR(-ESTALE). 
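 *
 * Sketch of the common call site in an export_operations ->fh_to_dentry()
 * implementation (foo_iget() is a hypothetical fs-specific inode lookup):
 *
 *	inode = foo_iget(sb, ino, generation);
 *	return d_obtain_alias(inode);	// NULL and ERR_PTR inodes handled here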
*/ struct dentry *d_obtain_alias(struct inode *inode) { return __d_obtain_alias(inode, true); } EXPORT_SYMBOL(d_obtain_alias); /** * d_obtain_root - find or allocate a dentry for a given inode * @inode: inode to allocate the dentry for * * Obtain an IS_ROOT dentry for the root of a filesystem. * * We must ensure that directory inodes only ever have one dentry. If a * dentry is found, that is returned instead of allocating a new one. * * On successful return, the reference to the inode has been transferred * to the dentry. In case of an error the reference on the inode is * released. A %NULL or IS_ERR inode may be passed in and will be the * error will be propagate to the return value, with a %NULL @inode * replaced by ERR_PTR(-ESTALE). */ struct dentry *d_obtain_root(struct inode *inode) { return __d_obtain_alias(inode, false); } EXPORT_SYMBOL(d_obtain_root); /** * d_add_ci - lookup or allocate new dentry with case-exact name * @dentry: the negative dentry that was passed to the parent's lookup func * @inode: the inode case-insensitive lookup has found * @name: the case-exact name to be associated with the returned dentry * * This is to avoid filling the dcache with case-insensitive names to the * same inode, only the actual correct case is stored in the dcache for * case-insensitive filesystems. * * For a case-insensitive lookup match and if the case-exact dentry * already exists in the dcache, use it and return it. * * If no entry exists with the exact case name, allocate new dentry with * the exact case, and return the spliced entry. */ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, struct qstr *name) { struct dentry *found, *res; /* * First check if a dentry matching the name already exists, * if not go ahead and create it now. */ found = d_hash_and_lookup(dentry->d_parent, name); if (found) { iput(inode); return found; } if (d_in_lookup(dentry)) { found = d_alloc_parallel(dentry->d_parent, name, dentry->d_wait); if (IS_ERR(found) || !d_in_lookup(found)) { iput(inode); return found; } } else { found = d_alloc(dentry->d_parent, name); if (!found) { iput(inode); return ERR_PTR(-ENOMEM); } } res = d_splice_alias(inode, found); if (res) { d_lookup_done(found); dput(found); return res; } return found; } EXPORT_SYMBOL(d_add_ci); /** * d_same_name - compare dentry name with case-exact name * @dentry: the negative dentry that was passed to the parent's lookup func * @parent: parent dentry * @name: the case-exact name to be associated with the returned dentry * * Return: true if names are same, or false */ bool d_same_name(const struct dentry *dentry, const struct dentry *parent, const struct qstr *name) { if (likely(!(parent->d_flags & DCACHE_OP_COMPARE))) { if (dentry->d_name.len != name->len) return false; return dentry_cmp(dentry, name->name, name->len) == 0; } return parent->d_op->d_compare(dentry, dentry->d_name.len, dentry->d_name.name, name) == 0; } EXPORT_SYMBOL_GPL(d_same_name); /* * This is __d_lookup_rcu() when the parent dentry has * DCACHE_OP_COMPARE, which makes things much nastier. 
*/ static noinline struct dentry *__d_lookup_rcu_op_compare( const struct dentry *parent, const struct qstr *name, unsigned *seqp) { u64 hashlen = name->hash_len; struct hlist_bl_head *b = d_hash(hashlen); struct hlist_bl_node *node; struct dentry *dentry; hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { int tlen; const char *tname; unsigned seq; seqretry: seq = raw_seqcount_begin(&dentry->d_seq); if (dentry->d_parent != parent) continue; if (d_unhashed(dentry)) continue; if (dentry->d_name.hash != hashlen_hash(hashlen)) continue; tlen = dentry->d_name.len; tname = dentry->d_name.name; /* we want a consistent (name,len) pair */ if (read_seqcount_retry(&dentry->d_seq, seq)) { cpu_relax(); goto seqretry; } if (parent->d_op->d_compare(dentry, tlen, tname, name) != 0) continue; *seqp = seq; return dentry; } return NULL; } /** * __d_lookup_rcu - search for a dentry (racy, store-free) * @parent: parent dentry * @name: qstr of name we wish to find * @seqp: returns d_seq value at the point where the dentry was found * Returns: dentry, or NULL * * __d_lookup_rcu is the dcache lookup function for rcu-walk name * resolution (store-free path walking) design described in * Documentation/filesystems/path-lookup.txt. * * This is not to be used outside core vfs. * * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock * held, and rcu_read_lock held. The returned dentry must not be stored into * without taking d_lock and checking d_seq sequence count against @seq * returned here. * * Alternatively, __d_lookup_rcu may be called again to look up the child of * the returned dentry, so long as its parent's seqlock is checked after the * child is looked up. Thus, an interlocking stepping of sequence lock checks * is formed, giving integrity down the path walk. * * NOTE! The caller *has* to check the resulting dentry against the sequence * number we've returned before using any of the resulting dentry state! */ struct dentry *__d_lookup_rcu(const struct dentry *parent, const struct qstr *name, unsigned *seqp) { u64 hashlen = name->hash_len; const unsigned char *str = name->name; struct hlist_bl_head *b = d_hash(hashlen); struct hlist_bl_node *node; struct dentry *dentry; /* * Note: There is significant duplication with __d_lookup_rcu which is * required to prevent single threaded performance regressions * especially on architectures where smp_rmb (in seqcounts) are costly. * Keep the two functions in sync. */ if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) return __d_lookup_rcu_op_compare(parent, name, seqp); /* * The hash list is protected using RCU. * * Carefully use d_seq when comparing a candidate dentry, to avoid * races with d_move(). * * It is possible that concurrent renames can mess up our list * walk here and result in missing our dentry, resulting in the * false-negative result. d_lookup() protects against concurrent * renames using rename_lock seqlock. * * See Documentation/filesystems/path-lookup.txt for more details. */ hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { unsigned seq; /* * The dentry sequence count protects us from concurrent * renames, and thus protects parent and name fields. * * The caller must perform a seqcount check in order * to do anything useful with the returned dentry. * * NOTE! We do a "raw" seqcount_begin here. That means that * we don't wait for the sequence count to stabilize if it * is in the middle of a sequence change. 
If we do the slow * dentry compare, we will do seqretries until it is stable, * and if we end up with a successful lookup, we actually * want to exit RCU lookup anyway. * * Note that raw_seqcount_begin still *does* smp_rmb(), so * we are still guaranteed NUL-termination of ->d_name.name. */ seq = raw_seqcount_begin(&dentry->d_seq); if (dentry->d_parent != parent) continue; if (dentry->d_name.hash_len != hashlen) continue; if (unlikely(dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)) continue; /* * Check for the dentry being unhashed. * * As tempting as it is, we *can't* skip it because of a race window * between us finding the dentry before it gets unhashed and loading * the sequence counter after unhashing is finished. * * We can at least predict on it. */ if (unlikely(d_unhashed(dentry))) continue; *seqp = seq; return dentry; } return NULL; } /** * d_lookup - search for a dentry * @parent: parent dentry * @name: qstr of name we wish to find * Returns: dentry, or NULL * * d_lookup searches the children of the parent dentry for the name in * question. If the dentry is found its reference count is incremented and the * dentry is returned. The caller must use dput to free the entry when it has * finished using it. %NULL is returned if the dentry does not exist. */ struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name) { struct dentry *dentry; unsigned seq; do { seq = read_seqbegin(&rename_lock); dentry = __d_lookup(parent, name); if (dentry) break; } while (read_seqretry(&rename_lock, seq)); return dentry; } EXPORT_SYMBOL(d_lookup); /** * __d_lookup - search for a dentry (racy) * @parent: parent dentry * @name: qstr of name we wish to find * Returns: dentry, or NULL * * __d_lookup is like d_lookup, however it may (rarely) return a * false-negative result due to unrelated rename activity. * * __d_lookup is slightly faster by avoiding rename_lock read seqlock, * however it must be used carefully, eg. with a following d_lookup in * the case of failure. * * __d_lookup callers must be commented. */ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name) { unsigned int hash = name->hash; struct hlist_bl_head *b = d_hash(hash); struct hlist_bl_node *node; struct dentry *found = NULL; struct dentry *dentry; /* * Note: There is significant duplication with __d_lookup_rcu which is * required to prevent single threaded performance regressions * especially on architectures where smp_rmb (in seqcounts) are costly. * Keep the two functions in sync. */ /* * The hash list is protected using RCU. * * Take d_lock when comparing a candidate dentry, to avoid races * with d_move(). * * It is possible that concurrent renames can mess up our list * walk here and result in missing our dentry, resulting in the * false-negative result. d_lookup() protects against concurrent * renames using rename_lock seqlock. * * See Documentation/filesystems/path-lookup.txt for more details. 
*/ rcu_read_lock(); hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { if (dentry->d_name.hash != hash) continue; spin_lock(&dentry->d_lock); if (dentry->d_parent != parent) goto next; if (d_unhashed(dentry)) goto next; if (!d_same_name(dentry, parent, name)) goto next; dentry->d_lockref.count++; found = dentry; spin_unlock(&dentry->d_lock); break; next: spin_unlock(&dentry->d_lock); } rcu_read_unlock(); return found; } /** * d_hash_and_lookup - hash the qstr then search for a dentry * @dir: Directory to search in * @name: qstr of name we wish to find * * On lookup failure NULL is returned; on bad name - ERR_PTR(-error) */ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) { /* * Check for a fs-specific hash function. Note that we must * calculate the standard hash first, as the d_op->d_hash() * routine may choose to leave the hash value unchanged. */ name->hash = full_name_hash(dir, name->name, name->len); if (dir->d_flags & DCACHE_OP_HASH) { int err = dir->d_op->d_hash(dir, name); if (unlikely(err < 0)) return ERR_PTR(err); } return d_lookup(dir, name); } /* * When a file is deleted, we have two options: * - turn this dentry into a negative dentry * - unhash this dentry and free it. * * Usually, we want to just turn this into * a negative dentry, but if anybody else is * currently using the dentry or the inode * we can't do that and we fall back on removing * it from the hash queues and waiting for * it to be deleted later when it has no users */ /** * d_delete - delete a dentry * @dentry: The dentry to delete * * Turn the dentry into a negative dentry if possible, otherwise * remove it from the hash queues so it can be deleted later */ void d_delete(struct dentry * dentry) { struct inode *inode = dentry->d_inode; spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); /* * Are we the only user? */ if (dentry->d_lockref.count == 1) { if (dentry_negative_policy) __d_drop(dentry); dentry->d_flags &= ~DCACHE_CANT_MOUNT; dentry_unlink_inode(dentry); } else { __d_drop(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); } } EXPORT_SYMBOL(d_delete); static void __d_rehash(struct dentry *entry) { struct hlist_bl_head *b = d_hash(entry->d_name.hash); hlist_bl_lock(b); hlist_bl_add_head_rcu(&entry->d_hash, b); hlist_bl_unlock(b); } /** * d_rehash - add an entry back to the hash * @entry: dentry to add to the hash * * Adds a dentry to the hash according to its name. 
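 *
 * Sketch of the usual pairing with d_drop(): a filesystem may unhash a
 * dentry while its validity is in doubt and put it back once it is known
 * to be good again:
 *
 *	d_drop(dentry);		// hide the entry from lookups
 *	// ... revalidate against the backing store ...
 *	d_rehash(dentry);	// make it findable again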
*/ void d_rehash(struct dentry * entry) { spin_lock(&entry->d_lock); __d_rehash(entry); spin_unlock(&entry->d_lock); } EXPORT_SYMBOL(d_rehash); static inline unsigned start_dir_add(struct inode *dir) { preempt_disable_nested(); for (;;) { unsigned n = READ_ONCE(dir->i_dir_seq); if (!(n & 1) && try_cmpxchg(&dir->i_dir_seq, &n, n + 1)) return n; cpu_relax(); } } static inline void end_dir_add(struct inode *dir, unsigned int n, wait_queue_head_t *d_wait) { smp_store_release(&dir->i_dir_seq, n + 2); preempt_enable_nested(); if (wq_has_sleeper(d_wait)) wake_up_all(d_wait); } static void d_wait_lookup(struct dentry *dentry) { if (d_in_lookup(dentry)) { DECLARE_WAITQUEUE(wait, current); add_wait_queue(dentry->d_wait, &wait); do { set_current_state(TASK_UNINTERRUPTIBLE); spin_unlock(&dentry->d_lock); schedule(); spin_lock(&dentry->d_lock); } while (d_in_lookup(dentry)); } } struct dentry *d_alloc_parallel(struct dentry *parent, const struct qstr *name, wait_queue_head_t *wq) { unsigned int hash = name->hash; struct hlist_bl_head *b = in_lookup_hash(parent, hash); struct hlist_bl_node *node; struct dentry *new = __d_alloc(parent->d_sb, name); struct dentry *dentry; unsigned seq, r_seq, d_seq; if (unlikely(!new)) return ERR_PTR(-ENOMEM); new->d_flags |= DCACHE_PAR_LOOKUP; spin_lock(&parent->d_lock); new->d_parent = dget_dlock(parent); hlist_add_head(&new->d_sib, &parent->d_children); if (parent->d_flags & DCACHE_DISCONNECTED) new->d_flags |= DCACHE_DISCONNECTED; spin_unlock(&parent->d_lock); retry: rcu_read_lock(); seq = smp_load_acquire(&parent->d_inode->i_dir_seq); r_seq = read_seqbegin(&rename_lock); dentry = __d_lookup_rcu(parent, name, &d_seq); if (unlikely(dentry)) { if (!lockref_get_not_dead(&dentry->d_lockref)) { rcu_read_unlock(); goto retry; } if (read_seqcount_retry(&dentry->d_seq, d_seq)) { rcu_read_unlock(); dput(dentry); goto retry; } rcu_read_unlock(); dput(new); return dentry; } if (unlikely(read_seqretry(&rename_lock, r_seq))) { rcu_read_unlock(); goto retry; } if (unlikely(seq & 1)) { rcu_read_unlock(); goto retry; } hlist_bl_lock(b); if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) { hlist_bl_unlock(b); rcu_read_unlock(); goto retry; } /* * No changes for the parent since the beginning of d_lookup(). * Since all removals from the chain happen with hlist_bl_lock(), * any potential in-lookup matches are going to stay here until * we unlock the chain. All fields are stable in everything * we encounter. */ hlist_bl_for_each_entry(dentry, node, b, d_u.d_in_lookup_hash) { if (dentry->d_name.hash != hash) continue; if (dentry->d_parent != parent) continue; if (!d_same_name(dentry, parent, name)) continue; hlist_bl_unlock(b); /* now we can try to grab a reference */ if (!lockref_get_not_dead(&dentry->d_lockref)) { rcu_read_unlock(); goto retry; } rcu_read_unlock(); /* * somebody is likely to be still doing lookup for it; * wait for them to finish */ spin_lock(&dentry->d_lock); d_wait_lookup(dentry); /* * it's not in-lookup anymore; in principle we should repeat * everything from dcache lookup, but it's likely to be what * d_lookup() would've found anyway. If it is, just return it; * otherwise we really have to repeat the whole thing. 
*/ if (unlikely(dentry->d_name.hash != hash)) goto mismatch; if (unlikely(dentry->d_parent != parent)) goto mismatch; if (unlikely(d_unhashed(dentry))) goto mismatch; if (unlikely(!d_same_name(dentry, parent, name))) goto mismatch; /* OK, it *is* a hashed match; return it */ spin_unlock(&dentry->d_lock); dput(new); return dentry; } rcu_read_unlock(); new->d_wait = wq; hlist_bl_add_head(&new->d_u.d_in_lookup_hash, b); hlist_bl_unlock(b); return new; mismatch: spin_unlock(&dentry->d_lock); dput(dentry); goto retry; } EXPORT_SYMBOL(d_alloc_parallel); /* * - Unhash the dentry * - Retrieve and clear the waitqueue head in dentry * - Return the waitqueue head */ static wait_queue_head_t *__d_lookup_unhash(struct dentry *dentry) { wait_queue_head_t *d_wait; struct hlist_bl_head *b; lockdep_assert_held(&dentry->d_lock); b = in_lookup_hash(dentry->d_parent, dentry->d_name.hash); hlist_bl_lock(b); dentry->d_flags &= ~DCACHE_PAR_LOOKUP; __hlist_bl_del(&dentry->d_u.d_in_lookup_hash); d_wait = dentry->d_wait; dentry->d_wait = NULL; hlist_bl_unlock(b); INIT_HLIST_NODE(&dentry->d_u.d_alias); INIT_LIST_HEAD(&dentry->d_lru); return d_wait; } void __d_lookup_unhash_wake(struct dentry *dentry) { spin_lock(&dentry->d_lock); wake_up_all(__d_lookup_unhash(dentry)); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(__d_lookup_unhash_wake); /* inode->i_lock held if inode is non-NULL */ static inline void __d_add(struct dentry *dentry, struct inode *inode, const struct dentry_operations *ops) { wait_queue_head_t *d_wait; struct inode *dir = NULL; unsigned n; spin_lock(&dentry->d_lock); if (unlikely(d_in_lookup(dentry))) { dir = dentry->d_parent->d_inode; n = start_dir_add(dir); d_wait = __d_lookup_unhash(dentry); } if (unlikely(ops)) d_set_d_op(dentry, ops); if (inode) { unsigned add_flags = d_flags_for_inode(inode); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); fsnotify_update_flags(dentry); } __d_rehash(dentry); if (dir) end_dir_add(dir, n, d_wait); spin_unlock(&dentry->d_lock); if (inode) spin_unlock(&inode->i_lock); } /** * d_add - add dentry to hash queues * @entry: dentry to add * @inode: The inode to attach to this dentry * * This adds the entry to the hash queues and initializes @inode. * The entry was actually filled in earlier during d_alloc(). */ void d_add(struct dentry *entry, struct inode *inode) { if (inode) { security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); } __d_add(entry, inode, NULL); } EXPORT_SYMBOL(d_add); struct dentry *d_make_persistent(struct dentry *dentry, struct inode *inode) { WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); WARN_ON(!inode); security_d_instantiate(dentry, inode); spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); __d_instantiate(dentry, inode); dentry->d_flags |= DCACHE_PERSISTENT; dget_dlock(dentry); if (d_unhashed(dentry)) __d_rehash(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); return dentry; } EXPORT_SYMBOL(d_make_persistent); static void swap_names(struct dentry *dentry, struct dentry *target) { if (unlikely(dname_external(target))) { if (unlikely(dname_external(dentry))) { /* * Both external: swap the pointers */ swap(target->__d_name.name, dentry->__d_name.name); } else { /* * dentry:internal, target:external. Steal target's * storage and make target internal. 
*/ dentry->__d_name.name = target->__d_name.name; target->d_shortname = dentry->d_shortname; target->__d_name.name = target->d_shortname.string; } } else { if (unlikely(dname_external(dentry))) { /* * dentry:external, target:internal. Give dentry's * storage to target and make dentry internal */ target->__d_name.name = dentry->__d_name.name; dentry->d_shortname = target->d_shortname; dentry->__d_name.name = dentry->d_shortname.string; } else { /* * Both are internal. */ for (int i = 0; i < DNAME_INLINE_WORDS; i++) swap(dentry->d_shortname.words[i], target->d_shortname.words[i]); } } swap(dentry->__d_name.hash_len, target->__d_name.hash_len); } static void copy_name(struct dentry *dentry, struct dentry *target) { struct external_name *old_name = NULL; if (unlikely(dname_external(dentry))) old_name = external_name(dentry); if (unlikely(dname_external(target))) { atomic_inc(&external_name(target)->count); dentry->__d_name = target->__d_name; } else { dentry->d_shortname = target->d_shortname; dentry->__d_name.name = dentry->d_shortname.string; dentry->__d_name.hash_len = target->__d_name.hash_len; } if (old_name && likely(atomic_dec_and_test(&old_name->count))) kfree_rcu(old_name, head); } /* * __d_move - move a dentry * @dentry: entry to move * @target: new dentry * @exchange: exchange the two dentries * * Update the dcache to reflect the move of a file name. Negative dcache * entries should not be moved in this way. Caller must hold rename_lock, the * i_rwsem of the source and target directories (exclusively), and the sb-> * s_vfs_rename_mutex if they differ. See lock_rename(). */ static void __d_move(struct dentry *dentry, struct dentry *target, bool exchange) { struct dentry *old_parent, *p; wait_queue_head_t *d_wait; struct inode *dir = NULL; unsigned n; WARN_ON(!dentry->d_inode); if (WARN_ON(dentry == target)) return; BUG_ON(d_ancestor(target, dentry)); old_parent = dentry->d_parent; p = d_ancestor(old_parent, target); if (IS_ROOT(dentry)) { BUG_ON(p); spin_lock(&target->d_parent->d_lock); } else if (!p) { /* target is not a descendent of dentry->d_parent */ spin_lock(&target->d_parent->d_lock); spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_NESTED); } else { BUG_ON(p == dentry); spin_lock(&old_parent->d_lock); if (p != target) spin_lock_nested(&target->d_parent->d_lock, DENTRY_D_LOCK_NESTED); } spin_lock_nested(&dentry->d_lock, 2); spin_lock_nested(&target->d_lock, 3); if (unlikely(d_in_lookup(target))) { dir = target->d_parent->d_inode; n = start_dir_add(dir); d_wait = __d_lookup_unhash(target); } write_seqcount_begin(&dentry->d_seq); write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED); /* unhash both */ if (!d_unhashed(dentry)) ___d_drop(dentry); if (!d_unhashed(target)) ___d_drop(target); /* ... 
and switch them in the tree */ dentry->d_parent = target->d_parent; if (!exchange) { copy_name(dentry, target); target->d_hash.pprev = NULL; dentry->d_parent->d_lockref.count++; if (dentry != old_parent) /* wasn't IS_ROOT */ WARN_ON(!--old_parent->d_lockref.count); } else { target->d_parent = old_parent; swap_names(dentry, target); if (!hlist_unhashed(&target->d_sib)) __hlist_del(&target->d_sib); hlist_add_head(&target->d_sib, &target->d_parent->d_children); __d_rehash(target); fsnotify_update_flags(target); } if (!hlist_unhashed(&dentry->d_sib)) __hlist_del(&dentry->d_sib); hlist_add_head(&dentry->d_sib, &dentry->d_parent->d_children); __d_rehash(dentry); fsnotify_update_flags(dentry); fscrypt_handle_d_move(dentry); write_seqcount_end(&target->d_seq); write_seqcount_end(&dentry->d_seq); if (dir) end_dir_add(dir, n, d_wait); if (dentry->d_parent != old_parent) spin_unlock(&dentry->d_parent->d_lock); if (dentry != old_parent) spin_unlock(&old_parent->d_lock); spin_unlock(&target->d_lock); spin_unlock(&dentry->d_lock); } /* * d_move - move a dentry * @dentry: entry to move * @target: new dentry * * Update the dcache to reflect the move of a file name. Negative * dcache entries should not be moved in this way. See the locking * requirements for __d_move. */ void d_move(struct dentry *dentry, struct dentry *target) { write_seqlock(&rename_lock); __d_move(dentry, target, false); write_sequnlock(&rename_lock); } EXPORT_SYMBOL(d_move); /* * d_exchange - exchange two dentries * @dentry1: first dentry * @dentry2: second dentry */ void d_exchange(struct dentry *dentry1, struct dentry *dentry2) { write_seqlock(&rename_lock); WARN_ON(!dentry1->d_inode); WARN_ON(!dentry2->d_inode); WARN_ON(IS_ROOT(dentry1)); WARN_ON(IS_ROOT(dentry2)); __d_move(dentry1, dentry2, true); write_sequnlock(&rename_lock); } EXPORT_SYMBOL(d_exchange); /** * d_ancestor - search for an ancestor * @p1: ancestor dentry * @p2: child dentry * * Returns the ancestor dentry of p2 which is a child of p1, if p1 is * an ancestor of p2, else NULL. */ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) { struct dentry *p; for (p = p2; !IS_ROOT(p); p = p->d_parent) { if (p->d_parent == p1) return p; } return NULL; } /* * This helper attempts to cope with remotely renamed directories * * It assumes that the caller is already holding * dentry->d_parent->d_inode->i_rwsem, and rename_lock * * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... 
*/ static int __d_unalias(struct dentry *dentry, struct dentry *alias) { struct mutex *m1 = NULL; struct rw_semaphore *m2 = NULL; int ret = -ESTALE; /* If alias and dentry share a parent, then no extra locks required */ if (alias->d_parent == dentry->d_parent) goto out_unalias; /* See lock_rename() */ if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex)) goto out_err; m1 = &dentry->d_sb->s_vfs_rename_mutex; if (!inode_trylock_shared(alias->d_parent->d_inode)) goto out_err; m2 = &alias->d_parent->d_inode->i_rwsem; out_unalias: if (alias->d_op && alias->d_op->d_unalias_trylock && !alias->d_op->d_unalias_trylock(alias)) goto out_err; __d_move(alias, dentry, false); if (alias->d_op && alias->d_op->d_unalias_unlock) alias->d_op->d_unalias_unlock(alias); ret = 0; out_err: if (m2) up_read(m2); if (m1) mutex_unlock(m1); return ret; } struct dentry *d_splice_alias_ops(struct inode *inode, struct dentry *dentry, const struct dentry_operations *ops) { if (IS_ERR(inode)) return ERR_CAST(inode); BUG_ON(!d_unhashed(dentry)); if (!inode) goto out; security_d_instantiate(dentry, inode); spin_lock(&inode->i_lock); if (S_ISDIR(inode->i_mode)) { struct dentry *new = __d_find_any_alias(inode); if (unlikely(new)) { /* The reference to new ensures it remains an alias */ spin_unlock(&inode->i_lock); write_seqlock(&rename_lock); if (unlikely(d_ancestor(new, dentry))) { write_sequnlock(&rename_lock); dput(new); new = ERR_PTR(-ELOOP); pr_warn_ratelimited( "VFS: Lookup of '%s' in %s %s" " would have caused loop\n", dentry->d_name.name, inode->i_sb->s_type->name, inode->i_sb->s_id); } else if (!IS_ROOT(new)) { struct dentry *old_parent = dget(new->d_parent); int err = __d_unalias(dentry, new); write_sequnlock(&rename_lock); if (err) { dput(new); new = ERR_PTR(err); } dput(old_parent); } else { __d_move(new, dentry, false); write_sequnlock(&rename_lock); } iput(inode); return new; } } out: __d_add(dentry, inode, ops); return NULL; } /** * d_splice_alias - splice a disconnected dentry into the tree if one exists * @inode: the inode which may have a disconnected dentry * @dentry: a negative dentry which we want to point to the inode. * * If inode is a directory and has an IS_ROOT alias, then d_move that in * place of the given dentry and return it, else simply d_add the inode * to the dentry and return NULL. * * If a non-IS_ROOT directory is found, the filesystem is corrupt, and * we should error out: directories can't have multiple aliases. * * This is needed in the lookup routine of any filesystem that is exportable * (via knfsd) so that we can build dcache paths to directories effectively. * * If a dentry was found and moved, then it is returned. Otherwise NULL * is returned. This matches the expected return value of ->lookup. * * Cluster filesystems may call this function with a negative, hashed dentry. * In that case, we know that the inode will be a regular file, and also this * will only occur during atomic_open. So we need to check for the dentry * being already hashed only in the final case. */ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) { return d_splice_alias_ops(inode, dentry, NULL); } EXPORT_SYMBOL(d_splice_alias); /* * Test whether new_dentry is a subdirectory of old_dentry. * * Trivially implemented using the dcache structure */ /** * is_subdir - is new dentry a subdirectory of old_dentry * @new_dentry: new dentry * @old_dentry: old dentry * * Returns true if new_dentry is a subdirectory of the parent (at any depth). * Returns false otherwise. 
* Caller must ensure that "new_dentry" is pinned before calling is_subdir() */ bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) { bool subdir; unsigned seq; if (new_dentry == old_dentry) return true; /* Access d_parent under rcu as d_move() may change it. */ rcu_read_lock(); seq = read_seqbegin(&rename_lock); subdir = d_ancestor(old_dentry, new_dentry); /* Try lockless once... */ if (read_seqretry(&rename_lock, seq)) { /* ...else acquire lock for progress even on deep chains. */ read_seqlock_excl(&rename_lock); subdir = d_ancestor(old_dentry, new_dentry); read_sequnlock_excl(&rename_lock); } rcu_read_unlock(); return subdir; } EXPORT_SYMBOL(is_subdir); void d_mark_tmpfile(struct file *file, struct inode *inode) { struct dentry *dentry = file->f_path.dentry; BUG_ON(dname_external(dentry) || !hlist_unhashed(&dentry->d_u.d_alias) || !d_unlinked(dentry)); spin_lock(&dentry->d_parent->d_lock); spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); dentry->__d_name.len = sprintf(dentry->d_shortname.string, "#%llu", (unsigned long long)inode->i_ino); spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_parent->d_lock); } EXPORT_SYMBOL(d_mark_tmpfile); int d_mark_tmpfile_name(struct file *file, const struct qstr *name) { struct dentry *dentry = file->f_path.dentry; char *dname = dentry->d_shortname.string; if (unlikely(dname_external(dentry) || d_really_is_positive(dentry) || !d_unlinked(dentry))) return -EINVAL; if (unlikely(name->len > DNAME_INLINE_LEN - 1)) return -ENAMETOOLONG; spin_lock(&dentry->d_parent->d_lock); spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); dentry->__d_name.len = name->len; memcpy(dname, name->name, name->len); dname[name->len] = '\0'; spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_parent->d_lock); return 0; } EXPORT_SYMBOL(d_mark_tmpfile_name); void d_tmpfile(struct file *file, struct inode *inode) { struct dentry *dentry = file->f_path.dentry; inode_dec_link_count(inode); d_mark_tmpfile(file, inode); d_instantiate(dentry, inode); } EXPORT_SYMBOL(d_tmpfile); /* * Obtain inode number of the parent dentry. */ ino_t d_parent_ino(struct dentry *dentry) { struct dentry *parent; struct inode *iparent; unsigned seq; ino_t ret; scoped_guard(rcu) { seq = raw_seqcount_begin(&dentry->d_seq); parent = READ_ONCE(dentry->d_parent); iparent = d_inode_rcu(parent); if (likely(iparent)) { ret = iparent->i_ino; if (!read_seqcount_retry(&dentry->d_seq, seq)) return ret; } } spin_lock(&dentry->d_lock); ret = dentry->d_parent->d_inode->i_ino; spin_unlock(&dentry->d_lock); return ret; } EXPORT_SYMBOL(d_parent_ino); static __initdata unsigned long dhash_entries; static int __init set_dhash_entries(char *str) { return kstrtoul(str, 0, &dhash_entries) == 0; } __setup("dhash_entries=", set_dhash_entries); static void __init dcache_init_early(void) { /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. */ if (hashdist) return; dentry_hashtable = alloc_large_system_hash("Dentry cache", sizeof(struct hlist_bl_head), dhash_entries, 13, HASH_EARLY | HASH_ZERO, &d_hash_shift, NULL, 2, 0); d_hash_shift = 32 - d_hash_shift; runtime_const_init(shift, d_hash_shift); runtime_const_init(ptr, dentry_hashtable); } static void __init dcache_init(void) { /* * A constructor could be added for stable state like the lists, * but it is probably not worth it because of the cache nature * of the dcache. 
*/ __dentry_cache = KMEM_CACHE_USERCOPY(dentry, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT, d_shortname.string); runtime_const_init(ptr, __dentry_cache); /* Hash may have been set up in dcache_init_early */ if (!hashdist) return; dentry_hashtable = alloc_large_system_hash("Dentry cache", sizeof(struct hlist_bl_head), dhash_entries, 13, HASH_ZERO, &d_hash_shift, NULL, 2, 0); d_hash_shift = 32 - d_hash_shift; runtime_const_init(shift, d_hash_shift); runtime_const_init(ptr, dentry_hashtable); } void __init vfs_caches_init_early(void) { int i; for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++) INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]); dcache_init_early(); inode_init_early(); } void __init vfs_caches_init(void) { filename_init(); dcache_init(); inode_init(); files_init(); files_maxfiles_init(); mnt_init(); bdev_cache_init(); chrdev_init(); }
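/*
 * Illustrative sketch, not part of fs/dcache.c: the usual way an exportable
 * filesystem's ->lookup() method uses d_splice_alias(). foo_inode_by_name()
 * is a hypothetical helper returning the target inode, NULL for a negative
 * entry, or an ERR_PTR() on failure.
 */
static struct dentry *foo_lookup(struct inode *dir, struct dentry *dentry,
				 unsigned int flags)
{
	struct inode *inode = foo_inode_by_name(dir, &dentry->d_name);

	/* d_splice_alias() accepts NULL and ERR_PTR() inodes directly */
	return d_splice_alias(inode, dentry);
}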
// SPDX-License-Identifier: GPL-2.0 /* * fs/sysfs/dir.c - sysfs core and dir operation implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * * Please see Documentation/filesystems/sysfs.rst for more information. */ #define pr_fmt(fmt) "sysfs: " fmt #include <linux/fs.h> #include <linux/kobject.h> #include <linux/slab.h> #include "sysfs.h" DEFINE_SPINLOCK(sysfs_symlink_target_lock); void sysfs_warn_dup(struct kernfs_node *parent, const char *name) { char *buf; buf = kzalloc(PATH_MAX, GFP_KERNEL); if (buf) kernfs_path(parent, buf, PATH_MAX); pr_warn("cannot create duplicate filename '%s/%s'\n", buf, name); dump_stack(); kfree(buf); } /** * sysfs_create_dir_ns - create a directory for an object with a namespace tag * @kobj: object we're creating directory for * @ns: the namespace tag to use */ int sysfs_create_dir_ns(struct kobject *kobj, const struct ns_common *ns) { struct kernfs_node *parent, *kn; kuid_t uid; kgid_t gid; if (WARN_ON(!kobj)) return -EINVAL; if (kobj->parent) parent = kobj->parent->sd; else parent = sysfs_root_kn; if (!parent) return -ENOENT; kobject_get_ownership(kobj, &uid, &gid); kn = kernfs_create_dir_ns(parent, kobject_name(kobj), 0755, uid, gid, kobj, ns); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, kobject_name(kobj)); return PTR_ERR(kn); } kobj->sd = kn; return 0; } /** * sysfs_remove_dir - remove an object's directory. * @kobj: object. * * The only thing special about this is that we remove any files in * the directory before we remove the directory, and we've inlined * what used to be sysfs_rmdir() below, instead of calling separately. */ void sysfs_remove_dir(struct kobject *kobj) { struct kernfs_node *kn = kobj->sd; /* * In general, kobject owner is responsible for ensuring removal * doesn't race with other operations and sysfs doesn't provide any * protection; however, when @kobj is used as a symlink target, the * symlinking entity usually doesn't own @kobj and thus has no * control over removal. @kobj->sd may be removed anytime * and symlink code may end up dereferencing an already freed node. * * sysfs_symlink_target_lock synchronizes @kobj->sd * disassociation against symlink operations so that symlink code * can safely dereference @kobj->sd.
*/ spin_lock(&sysfs_symlink_target_lock); kobj->sd = NULL; spin_unlock(&sysfs_symlink_target_lock); if (kn) { WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR); kernfs_remove(kn); } } int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const struct ns_common *new_ns) { struct kernfs_node *parent; int ret; parent = kernfs_get_parent(kobj->sd); ret = kernfs_rename_ns(kobj->sd, parent, new_name, new_ns); kernfs_put(parent); return ret; } int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const struct ns_common *new_ns) { struct kernfs_node *kn = kobj->sd; struct kernfs_node *new_parent; new_parent = new_parent_kobj && new_parent_kobj->sd ? new_parent_kobj->sd : sysfs_root_kn; return kernfs_rename_ns(kn, new_parent, NULL, new_ns); } /** * sysfs_create_mount_point - create an always empty directory * @parent_kobj: kobject that will contain this always empty directory * @name: The name of the always empty directory to add */ int sysfs_create_mount_point(struct kobject *parent_kobj, const char *name) { struct kernfs_node *kn, *parent = parent_kobj->sd; kn = kernfs_create_empty_dir(parent, name); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, name); return PTR_ERR(kn); } return 0; } EXPORT_SYMBOL_GPL(sysfs_create_mount_point); /** * sysfs_remove_mount_point - remove an always empty directory. * @parent_kobj: kobject that will contain this always empty directory * @name: The name of the always empty directory to remove * */ void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name) { struct kernfs_node *parent = parent_kobj->sd; kernfs_remove_by_name_ns(parent, name, NULL); } EXPORT_SYMBOL_GPL(sysfs_remove_mount_point);
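/*
 * Illustrative sketch, not part of fs/sysfs/dir.c: sysfs_create_dir_ns() is
 * normally reached through kobject registration rather than called directly.
 * A hypothetical "foo" subsystem creating /sys/kernel/foo looks roughly like:
 */
static struct kobject *foo_kobj;

static int __init foo_sysfs_init(void)
{
	/* kobject_create_and_add() ends up in sysfs_create_dir_ns() */
	foo_kobj = kobject_create_and_add("foo", kernel_kobj);
	return foo_kobj ? 0 : -ENOMEM;
}

static void __exit foo_sysfs_exit(void)
{
	/* the final kobject_put() leads to sysfs_remove_dir() */
	kobject_put(foo_kobj);
}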
| 14 14 40 12 3 3 3 11 1 41 41 67 37 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_BITOPS_H #define _ASM_X86_BITOPS_H /* * Copyright 1992, Linus Torvalds. * * Note: inlines with more than a single statement should be marked * __always_inline to avoid problems with older gcc's inlining heuristics. */ #ifndef _LINUX_BITOPS_H #error only <linux/bitops.h> can be included directly #endif #include <linux/compiler.h> #include <asm/alternative.h> #include <asm/rmwcc.h> #include <asm/barrier.h> #if BITS_PER_LONG == 32 # define _BITOPS_LONG_SHIFT 5 #elif BITS_PER_LONG == 64 # define _BITOPS_LONG_SHIFT 6 #else # error "Unexpected BITS_PER_LONG" #endif #define BIT_64(n) (U64_C(1) << (n)) /* * These have to be done with inline assembly: that way the bit-setting * is guaranteed to be atomic. All bit operations return 0 if the bit * was cleared before the operation and != 0 if it was not. * * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1). */ #define RLONG_ADDR(x) "m" (*(volatile long *) (x)) #define WBYTE_ADDR(x) "+m" (*(volatile char *) (x)) #define ADDR RLONG_ADDR(addr) /* * We do the locked ops that don't return the old value as * a mask operation on a byte. 
*/ #define CONST_MASK_ADDR(nr, addr) WBYTE_ADDR((void *)(addr) + ((nr)>>3)) #define CONST_MASK(nr) (1 << ((nr) & 7)) static __always_inline void arch_set_bit(long nr, volatile unsigned long *addr) { if (__builtin_constant_p(nr)) { asm_inline volatile(LOCK_PREFIX "orb %b1,%0" : CONST_MASK_ADDR(nr, addr) : "iq" (CONST_MASK(nr)) : "memory"); } else { asm_inline volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0" : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); } } static __always_inline void arch___set_bit(unsigned long nr, volatile unsigned long *addr) { asm volatile(__ASM_SIZE(bts) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } static __always_inline void arch_clear_bit(long nr, volatile unsigned long *addr) { if (__builtin_constant_p(nr)) { asm_inline volatile(LOCK_PREFIX "andb %b1,%0" : CONST_MASK_ADDR(nr, addr) : "iq" (~CONST_MASK(nr))); } else { asm_inline volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0" : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); } } static __always_inline void arch_clear_bit_unlock(long nr, volatile unsigned long *addr) { barrier(); arch_clear_bit(nr, addr); } static __always_inline void arch___clear_bit(unsigned long nr, volatile unsigned long *addr) { asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } static __always_inline bool arch_xor_unlock_is_negative_byte(unsigned long mask, volatile unsigned long *addr) { bool negative; asm_inline volatile(LOCK_PREFIX "xorb %2,%1" : "=@ccs" (negative), WBYTE_ADDR(addr) : "iq" ((char)mask) : "memory"); return negative; } #define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte static __always_inline void arch___clear_bit_unlock(long nr, volatile unsigned long *addr) { arch___clear_bit(nr, addr); } static __always_inline void arch___change_bit(unsigned long nr, volatile unsigned long *addr) { asm volatile(__ASM_SIZE(btc) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } static __always_inline void arch_change_bit(long nr, volatile unsigned long *addr) { if (__builtin_constant_p(nr)) { asm_inline volatile(LOCK_PREFIX "xorb %b1,%0" : CONST_MASK_ADDR(nr, addr) : "iq" (CONST_MASK(nr))); } else { asm_inline volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0" : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); } } static __always_inline bool arch_test_and_set_bit(long nr, volatile unsigned long *addr) { return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts), *addr, c, "Ir", nr); } static __always_inline bool arch_test_and_set_bit_lock(long nr, volatile unsigned long *addr) { return arch_test_and_set_bit(nr, addr); } static __always_inline bool arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { bool oldbit; asm(__ASM_SIZE(bts) " %2,%1" : "=@ccc" (oldbit) : ADDR, "Ir" (nr) : "memory"); return oldbit; } static __always_inline bool arch_test_and_clear_bit(long nr, volatile unsigned long *addr) { return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btr), *addr, c, "Ir", nr); } /* * Note: the operation is performed atomically with respect to * the local CPU, but not other CPUs. Portable code should not * rely on this behaviour. 
* KVM relies on this behaviour on x86 for modifying memory that is also * accessed from a hypervisor on the same CPU if running in a VM: don't change * this without also updating arch/x86/kernel/kvm.c */ static __always_inline bool arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { bool oldbit; asm volatile(__ASM_SIZE(btr) " %2,%1" : "=@ccc" (oldbit) : ADDR, "Ir" (nr) : "memory"); return oldbit; } static __always_inline bool arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { bool oldbit; asm volatile(__ASM_SIZE(btc) " %2,%1" : "=@ccc" (oldbit) : ADDR, "Ir" (nr) : "memory"); return oldbit; } static __always_inline bool arch_test_and_change_bit(long nr, volatile unsigned long *addr) { return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc), *addr, c, "Ir", nr); } static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr) { return ((1UL << (nr & (BITS_PER_LONG-1))) & (addr[nr >> _BITOPS_LONG_SHIFT])) != 0; } static __always_inline bool constant_test_bit_acquire(long nr, const volatile unsigned long *addr) { bool oldbit; asm volatile("testb %2,%1" : "=@ccnz" (oldbit) : "m" (((unsigned char *)addr)[nr >> 3]), "i" (1 << (nr & 7)) :"memory"); return oldbit; } static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr) { bool oldbit; asm volatile(__ASM_SIZE(bt) " %2,%1" : "=@ccc" (oldbit) : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory"); return oldbit; } static __always_inline bool arch_test_bit(unsigned long nr, const volatile unsigned long *addr) { return __builtin_constant_p(nr) ? constant_test_bit(nr, addr) : variable_test_bit(nr, addr); } static __always_inline bool arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr) { return __builtin_constant_p(nr) ? constant_test_bit_acquire(nr, addr) : variable_test_bit(nr, addr); } static __always_inline __attribute_const__ unsigned long variable__ffs(unsigned long word) { asm("tzcnt %1,%0" : "=r" (word) : ASM_INPUT_RM (word)); return word; } /** * __ffs - find first set bit in word * @word: The word to search * * Undefined if no bit exists, so code should check against 0 first. */ #define __ffs(word) \ (__builtin_constant_p(word) ? \ (unsigned long)__builtin_ctzl(word) : \ variable__ffs(word)) static __always_inline __attribute_const__ unsigned long variable_ffz(unsigned long word) { return variable__ffs(~word); } /** * ffz - find first zero bit in word * @word: The word to search * * Undefined if no zero exists, so code should check against ~0UL first. */ #define ffz(word) \ (__builtin_constant_p(word) ? \ (unsigned long)__builtin_ctzl(~word) : \ variable_ffz(word)) /* * __fls: find last set bit in word * @word: The word to search * * Undefined if no set bit exists, so code should check against 0 first. */ static __always_inline __attribute_const__ unsigned long __fls(unsigned long word) { if (__builtin_constant_p(word)) return BITS_PER_LONG - 1 - __builtin_clzl(word); asm("bsr %1,%0" : "=r" (word) : ASM_INPUT_RM (word)); return word; } #undef ADDR #ifdef __KERNEL__ static __always_inline __attribute_const__ int variable_ffs(int x) { int r; #ifdef CONFIG_X86_64 /* * AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the * dest reg is undefined if x==0, but their CPU architect says its * value is written to set it to the same as before, except that the * top 32 bits will be cleared. * * We cannot do this on 32 bits because at the very least some * 486 CPUs did not behave this way. 
*/ asm("bsfl %1,%0" : "=r" (r) : ASM_INPUT_RM (x), "0" (-1)); #elif defined(CONFIG_X86_CMOV) asm("bsfl %1,%0\n\t" "cmovzl %2,%0" : "=&r" (r) : "rm" (x), "r" (-1)); #else asm("bsfl %1,%0\n\t" "jnz 1f\n\t" "movl $-1,%0\n" "1:" : "=r" (r) : "rm" (x)); #endif return r + 1; } /** * ffs - find first set bit in word * @x: the word to search * * This is defined the same way as the libc and compiler builtin ffs * routines, therefore differs in spirit from the other bitops. * * ffs(value) returns 0 if value is 0 or the position of the first * set bit if value is nonzero. The first (least significant) bit * is at position 1. */ #define ffs(x) (__builtin_constant_p(x) ? __builtin_ffs(x) : variable_ffs(x)) /** * fls - find last set bit in word * @x: the word to search * * This is defined in a similar way as the libc and compiler builtin * ffs, but returns the position of the most significant set bit. * * fls(value) returns 0 if value is 0 or the position of the last * set bit if value is nonzero. The last (most significant) bit is * at position 32. */ static __always_inline __attribute_const__ int fls(unsigned int x) { int r; if (__builtin_constant_p(x)) return x ? 32 - __builtin_clz(x) : 0; #ifdef CONFIG_X86_64 /* * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the * dest reg is undefined if x==0, but their CPU architect says its * value is written to set it to the same as before, except that the * top 32 bits will be cleared. * * We cannot do this on 32 bits because at the very least some * 486 CPUs did not behave this way. */ asm("bsrl %1,%0" : "=r" (r) : ASM_INPUT_RM (x), "0" (-1)); #elif defined(CONFIG_X86_CMOV) asm("bsrl %1,%0\n\t" "cmovzl %2,%0" : "=&r" (r) : "rm" (x), "rm" (-1)); #else asm("bsrl %1,%0\n\t" "jnz 1f\n\t" "movl $-1,%0\n" "1:" : "=r" (r) : "rm" (x)); #endif return r + 1; } /** * fls64 - find last set bit in a 64-bit word * @x: the word to search * * This is defined in a similar way as the libc and compiler builtin * ffsll, but returns the position of the most significant set bit. * * fls64(value) returns 0 if value is 0 or the position of the last * set bit if value is nonzero. The last (most significant) bit is * at position 64. */ #ifdef CONFIG_X86_64 static __always_inline __attribute_const__ int fls64(__u64 x) { int bitpos = -1; if (__builtin_constant_p(x)) return x ? 64 - __builtin_clzll(x) : 0; /* * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the * dest reg is undefined if x==0, but their CPU architect says its * value is written to set it to the same as before. */ asm("bsrq %1,%q0" : "+r" (bitpos) : ASM_INPUT_RM (x)); return bitpos + 1; } #else #include <asm-generic/bitops/fls64.h> #endif #include <asm-generic/bitops/sched.h> #include <asm/arch_hweight.h> #include <asm-generic/bitops/const_hweight.h> #include <asm-generic/bitops/instrumented-atomic.h> #include <asm-generic/bitops/instrumented-non-atomic.h> #include <asm-generic/bitops/instrumented-lock.h> #include <asm-generic/bitops/le.h> #include <asm-generic/bitops/ext2-atomic-setbit.h> #endif /* __KERNEL__ */ #endif /* _ASM_X86_BITOPS_H */ |
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_INTERNAL_H
#define BLK_INTERNAL_H

#include <linux/bio-integrity.h>
#include <linux/blk-crypto.h>
#include <linux/lockdep.h>
#include <linux/memblock.h>	/* for max_pfn/max_low_pfn */
#include <linux/sched/sysctl.h>
#include <linux/timekeeping.h>
#include <xen/xen.h>

#include "blk-crypto-internal.h"

struct elv_change_ctx;

/*
 * Default upper limit for the software max_sectors limit used for regular I/Os.
 * This can be increased through sysfs.
* * This should not be confused with the max_hw_sector limit that is entirely * controlled by the block device driver, usually based on hardware limits. */ #define BLK_DEF_MAX_SECTORS_CAP (SZ_4M >> SECTOR_SHIFT) #define BLK_DEV_MAX_SECTORS (LLONG_MAX >> 9) #define BLK_MIN_SEGMENT_SIZE 4096 /* Max future timer expiry for timeouts */ #define BLK_MAX_TIMEOUT (5 * HZ) extern const struct kobj_type blk_queue_ktype; extern struct dentry *blk_debugfs_root; struct blk_flush_queue { spinlock_t mq_flush_lock; unsigned int flush_pending_idx:1; unsigned int flush_running_idx:1; blk_status_t rq_status; unsigned long flush_pending_since; struct list_head flush_queue[2]; unsigned long flush_data_in_flight; struct request *flush_rq; struct rcu_head rcu_head; }; bool is_flush_rq(struct request *req); struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, gfp_t flags); void blk_free_flush_queue(struct blk_flush_queue *q); bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); bool blk_queue_start_drain(struct request_queue *q); bool __blk_freeze_queue_start(struct request_queue *q, struct task_struct *owner); int __bio_queue_enter(struct request_queue *q, struct bio *bio); void submit_bio_noacct_nocheck(struct bio *bio, bool split); int bio_submit_or_kill(struct bio *bio, unsigned int flags); static inline bool blk_try_enter_queue(struct request_queue *q, bool pm) { rcu_read_lock(); if (!percpu_ref_tryget_live_rcu(&q->q_usage_counter)) goto fail; /* * The code that increments the pm_only counter must ensure that the * counter is globally visible before the queue is unfrozen. */ if (blk_queue_pm_only(q) && (!pm || queue_rpm_status(q) == RPM_SUSPENDED)) goto fail_put; rcu_read_unlock(); return true; fail_put: blk_queue_exit(q); fail: rcu_read_unlock(); return false; } static inline int bio_queue_enter(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); if (blk_try_enter_queue(q, false)) { rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_); rwsem_release(&q->io_lockdep_map, _RET_IP_); return 0; } return __bio_queue_enter(q, bio); } static inline void blk_wait_io(struct completion *done) { /* Prevent hang_check timer from firing at us during very long I/O */ unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2; if (timeout) while (!wait_for_completion_io_timeout(done, timeout)) ; else wait_for_completion_io(done); } struct block_device *blkdev_get_no_open(dev_t dev, bool autoload); void blkdev_put_no_open(struct block_device *bdev); bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, struct page *page, unsigned len, unsigned offset); static inline bool biovec_phys_mergeable(struct request_queue *q, struct bio_vec *vec1, struct bio_vec *vec2) { unsigned long mask = queue_segment_boundary(q); phys_addr_t addr1 = bvec_phys(vec1); phys_addr_t addr2 = bvec_phys(vec2); /* * Merging adjacent physical pages may not work correctly under KMSAN * if their metadata pages aren't adjacent. Just disable merging. 
*/ if (IS_ENABLED(CONFIG_KMSAN)) return false; if (addr1 + vec1->bv_len != addr2) return false; if (xen_domain() && !xen_biovec_phys_mergeable(vec1, vec2->bv_page)) return false; if ((addr1 | mask) != ((addr2 + vec2->bv_len - 1) | mask)) return false; return true; } static inline bool __bvec_gap_to_prev(const struct queue_limits *lim, struct bio_vec *bprv, unsigned int offset) { return (offset & lim->virt_boundary_mask) || ((bprv->bv_offset + bprv->bv_len) & lim->virt_boundary_mask); } /* * Check if adding a bio_vec after bprv with offset would create a gap in * the SG list. Most drivers don't care about this, but some do. */ static inline bool bvec_gap_to_prev(const struct queue_limits *lim, struct bio_vec *bprv, unsigned int offset) { if (!lim->virt_boundary_mask) return false; return __bvec_gap_to_prev(lim, bprv, offset); } static inline bool rq_mergeable(struct request *rq) { if (blk_rq_is_passthrough(rq)) return false; if (req_op(rq) == REQ_OP_FLUSH) return false; if (req_op(rq) == REQ_OP_WRITE_ZEROES) return false; if (req_op(rq) == REQ_OP_ZONE_APPEND) return false; if (rq->cmd_flags & REQ_NOMERGE_FLAGS) return false; if (rq->rq_flags & RQF_NOMERGE_FLAGS) return false; return true; } /* * There are two different ways to handle DISCARD merges: * 1) If max_discard_segments > 1, the driver treats every bio as a range and * send the bios to controller together. The ranges don't need to be * contiguous. * 2) Otherwise, the request will be normal read/write requests. The ranges * need to be contiguous. */ static inline bool blk_discard_mergable(struct request *req) { if (req_op(req) == REQ_OP_DISCARD && queue_max_discard_segments(req->q) > 1) return true; return false; } static inline unsigned int blk_rq_get_max_segments(struct request *rq) { if (req_op(rq) == REQ_OP_DISCARD) return queue_max_discard_segments(rq->q); return queue_max_segments(rq->q); } static inline unsigned int blk_queue_get_max_sectors(struct request *rq) { struct request_queue *q = rq->q; enum req_op op = req_op(rq); if (unlikely(op == REQ_OP_DISCARD)) return min(q->limits.max_discard_sectors, UINT_MAX >> SECTOR_SHIFT); if (unlikely(op == REQ_OP_SECURE_ERASE)) return min(q->limits.max_secure_erase_sectors, UINT_MAX >> SECTOR_SHIFT); if (unlikely(op == REQ_OP_WRITE_ZEROES)) return q->limits.max_write_zeroes_sectors; if (rq->cmd_flags & REQ_ATOMIC) return q->limits.atomic_write_max_sectors; return q->limits.max_sectors; } #ifdef CONFIG_BLK_DEV_INTEGRITY void blk_flush_integrity(void); void bio_integrity_free(struct bio *bio); /* * Integrity payloads can either be owned by the submitter, in which case * bio_uninit will free them, or owned and generated by the block layer, * in which case we'll verify them here (for reads) and free them before * the bio is handed back to the submitted. 
*/ bool __bio_integrity_endio(struct bio *bio); static inline bool bio_integrity_endio(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); if (bip && (bip->bip_flags & BIP_BLOCK_INTEGRITY)) return __bio_integrity_endio(bio); return true; } bool blk_integrity_merge_rq(struct request_queue *, struct request *, struct request *); bool blk_integrity_merge_bio(struct request_queue *, struct request *, struct bio *); static inline bool integrity_req_gap_back_merge(struct request *req, struct bio *next) { struct bio_integrity_payload *bip = bio_integrity(req->bio); struct bio_integrity_payload *bip_next = bio_integrity(next); return bvec_gap_to_prev(&req->q->limits, &bip->bip_vec[bip->bip_vcnt - 1], bip_next->bip_vec[0].bv_offset); } static inline bool integrity_req_gap_front_merge(struct request *req, struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); struct bio_integrity_payload *bip_next = bio_integrity(req->bio); return bvec_gap_to_prev(&req->q->limits, &bip->bip_vec[bip->bip_vcnt - 1], bip_next->bip_vec[0].bv_offset); } extern const struct attribute_group blk_integrity_attr_group; #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline bool blk_integrity_merge_rq(struct request_queue *rq, struct request *r1, struct request *r2) { return true; } static inline bool blk_integrity_merge_bio(struct request_queue *rq, struct request *r, struct bio *b) { return true; } static inline bool integrity_req_gap_back_merge(struct request *req, struct bio *next) { return false; } static inline bool integrity_req_gap_front_merge(struct request *req, struct bio *bio) { return false; } static inline void blk_flush_integrity(void) { } static inline bool bio_integrity_endio(struct bio *bio) { return true; } static inline void bio_integrity_free(struct bio *bio) { } #endif /* CONFIG_BLK_DEV_INTEGRITY */ unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); enum bio_merge_status { BIO_MERGE_OK, BIO_MERGE_NONE, BIO_MERGE_FAILED, }; enum bio_merge_status bio_attempt_back_merge(struct request *req, struct bio *bio, unsigned int nr_segs); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, struct bio *bio, unsigned int nr_segs); /* * Plug flush limits */ #define BLK_MAX_REQUEST_COUNT 32 #define BLK_PLUG_FLUSH_SIZE (128 * 1024) /* * Internal elevator interface */ #define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED) bool blk_insert_flush(struct request *rq); void elv_update_nr_hw_queues(struct request_queue *q, struct elv_change_ctx *ctx); void elevator_set_default(struct request_queue *q); void elevator_set_none(struct request_queue *q); ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_fail_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); ssize_t part_timeout_store(struct device *, struct device_attribute *, const char *, size_t); struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, unsigned *nsegs); struct bio *bio_split_write_zeroes(struct bio *bio, const struct queue_limits 
*lim, unsigned *nsegs); struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, unsigned *nr_segs); struct bio *bio_split_zone_append(struct bio *bio, const struct queue_limits *lim, unsigned *nr_segs); /* * All drivers must accept single-segments bios that are smaller than PAGE_SIZE. * * This is a quick and dirty check that relies on the fact that bi_io_vec[0] is * always valid if a bio has data. The check might lead to occasional false * positives when bios are cloned, but compared to the performance impact of * cloned bios themselves the loop below doesn't matter anyway. */ static inline bool bio_may_need_split(struct bio *bio, const struct queue_limits *lim) { const struct bio_vec *bv; if (lim->chunk_sectors) return true; if (!bio->bi_io_vec) return true; bv = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); if (bio->bi_iter.bi_size > bv->bv_len - bio->bi_iter.bi_bvec_done) return true; return bv->bv_len + bv->bv_offset > lim->max_fast_segment_size; } /** * __bio_split_to_limits - split a bio to fit the queue limits * @bio: bio to be split * @lim: queue limits to split based on * @nr_segs: returns the number of segments in the returned bio * * Check if @bio needs splitting based on the queue limits, and if so split off * a bio fitting the limits from the beginning of @bio and return it. @bio is * shortened to the remainder and re-submitted. * * The split bio is allocated from @q->bio_split, which is provided by the * block layer. */ static inline struct bio *__bio_split_to_limits(struct bio *bio, const struct queue_limits *lim, unsigned int *nr_segs) { switch (bio_op(bio)) { case REQ_OP_READ: case REQ_OP_WRITE: if (bio_may_need_split(bio, lim)) return bio_split_rw(bio, lim, nr_segs); *nr_segs = 1; return bio; case REQ_OP_ZONE_APPEND: return bio_split_zone_append(bio, lim, nr_segs); case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: return bio_split_discard(bio, lim, nr_segs); case REQ_OP_WRITE_ZEROES: return bio_split_write_zeroes(bio, lim, nr_segs); default: /* other operations can't be split */ *nr_segs = 0; return bio; } } /** * get_max_segment_size() - maximum number of bytes to add as a single segment * @lim: Request queue limits. * @paddr: address of the range to add * @len: maximum length available to add at @paddr * * Returns the maximum number of bytes of the range starting at @paddr that can * be added to a single segment. */ static inline unsigned get_max_segment_size(const struct queue_limits *lim, phys_addr_t paddr, unsigned int len) { /* * Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1 * after having calculated the minimum. 
*/ return min_t(unsigned long, len, min(lim->seg_boundary_mask - (lim->seg_boundary_mask & paddr), (unsigned long)lim->max_segment_size - 1) + 1); } int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs); bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, struct request *next); unsigned int blk_recalc_rq_segments(struct request *rq); bool blk_rq_merge_ok(struct request *rq, struct bio *bio); enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); int blk_set_default_limits(struct queue_limits *lim); void blk_apply_bdi_limits(struct backing_dev_info *bdi, struct queue_limits *lim); int blk_dev_init(void); void update_io_ticks(struct block_device *part, unsigned long now, bool end); static inline void req_set_nomerge(struct request_queue *q, struct request *req) { req->cmd_flags |= REQ_NOMERGE; if (req == q->last_merge) q->last_merge = NULL; } /* * Internal io_context interface */ struct io_cq *ioc_find_get_icq(struct request_queue *q); struct io_cq *ioc_lookup_icq(struct request_queue *q); #ifdef CONFIG_BLK_ICQ void ioc_clear_queue(struct request_queue *q); #else static inline void ioc_clear_queue(struct request_queue *q) { } #endif /* CONFIG_BLK_ICQ */ #ifdef CONFIG_BLK_DEV_ZONED void disk_init_zone_resources(struct gendisk *disk); void disk_free_zone_resources(struct gendisk *disk); static inline bool bio_zone_write_plugging(struct bio *bio) { return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING); } static inline bool blk_req_bio_is_zone_append(struct request *rq, struct bio *bio) { return req_op(rq) == REQ_OP_ZONE_APPEND || bio_flagged(bio, BIO_EMULATES_ZONE_APPEND); } void blk_zone_write_plug_bio_merged(struct bio *bio); void blk_zone_write_plug_init_request(struct request *rq); void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio); void blk_zone_mgmt_bio_endio(struct bio *bio); void blk_zone_write_plug_bio_endio(struct bio *bio); static inline void blk_zone_bio_endio(struct bio *bio) { /* * Zone management BIOs may impact zone write plugs (e.g. a zone reset * changes a zone write plug zone write pointer offset), but these * operation do not go through zone write plugging as they may operate * on zones that do not have a zone write * plug. blk_zone_mgmt_bio_endio() handles the potential changes to zone * write plugs that are present. */ if (op_is_zone_mgmt(bio_op(bio))) { blk_zone_mgmt_bio_endio(bio); return; } /* * For write BIOs to zoned devices, signal the completion of the BIO so * that the next write BIO can be submitted by zone write plugging. 
*/ if (bio_zone_write_plugging(bio)) blk_zone_write_plug_bio_endio(bio); } void blk_zone_write_plug_finish_request(struct request *rq); static inline void blk_zone_finish_request(struct request *rq) { if (rq->rq_flags & RQF_ZONE_WRITE_PLUGGING) blk_zone_write_plug_finish_request(rq); } int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg); int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg); #else /* CONFIG_BLK_DEV_ZONED */ static inline void disk_init_zone_resources(struct gendisk *disk) { } static inline void disk_free_zone_resources(struct gendisk *disk) { } static inline bool bio_zone_write_plugging(struct bio *bio) { return false; } static inline bool blk_req_bio_is_zone_append(struct request *req, struct bio *bio) { return false; } static inline void blk_zone_write_plug_bio_merged(struct bio *bio) { } static inline void blk_zone_write_plug_init_request(struct request *rq) { } static inline void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio) { } static inline void blk_zone_bio_endio(struct bio *bio) { } static inline void blk_zone_finish_request(struct request *rq) { } static inline int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg) { return -ENOTTY; } static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { return -ENOTTY; } #endif /* CONFIG_BLK_DEV_ZONED */ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno); void bdev_add(struct block_device *bdev, dev_t dev); void bdev_unhash(struct block_device *bdev); void bdev_drop(struct block_device *bdev); int blk_alloc_ext_minor(void); void blk_free_ext_minor(unsigned int minor); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 #define ADDPART_FLAG_READONLY 4 int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, sector_t length); int bdev_del_partition(struct gendisk *disk, int partno); int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, sector_t length); void drop_partition(struct block_device *part); void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors); struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, struct lock_class_key *lkclass); struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id); int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode); int disk_alloc_events(struct gendisk *disk); void disk_add_events(struct gendisk *disk); void disk_del_events(struct gendisk *disk); void disk_release_events(struct gendisk *disk); void disk_block_events(struct gendisk *disk); void disk_unblock_events(struct gendisk *disk); void disk_flush_events(struct gendisk *disk, unsigned int mask); extern struct device_attribute dev_attr_events; extern struct device_attribute dev_attr_events_async; extern struct device_attribute dev_attr_events_poll_msecs; extern struct attribute_group blk_trace_attr_group; blk_mode_t file_to_blk_mode(struct file *file); int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode, loff_t lstart, loff_t lend); long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); extern const struct address_space_operations def_blk_aops; int 
disk_register_independent_access_ranges(struct gendisk *disk); void disk_unregister_independent_access_ranges(struct gendisk *disk); int should_fail_bio(struct bio *bio); #ifdef CONFIG_FAIL_MAKE_REQUEST bool should_fail_request(struct block_device *part, unsigned int bytes); #else /* CONFIG_FAIL_MAKE_REQUEST */ static inline bool should_fail_request(struct block_device *part, unsigned int bytes) { return false; } #endif /* CONFIG_FAIL_MAKE_REQUEST */ /* * Optimized request reference counting. Ideally we'd make timeouts be more * clever, as that's the only reason we need references at all... But until * this happens, this is faster than using refcount_t. Also see: * * abc54d634334 ("io_uring: switch to atomic_t for io_kiocb reference count") */ #define req_ref_zero_or_close_to_overflow(req) \ ((unsigned int) atomic_read(&(req->ref)) + 127u <= 127u) static inline bool req_ref_inc_not_zero(struct request *req) { return atomic_inc_not_zero(&req->ref); } static inline bool req_ref_put_and_test(struct request *req) { WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); return atomic_dec_and_test(&req->ref); } static inline void req_ref_set(struct request *req, int value) { atomic_set(&req->ref, value); } static inline int req_ref_read(struct request *req) { return atomic_read(&req->ref); } static inline u64 blk_time_get_ns(void) { struct blk_plug *plug = current->plug; if (!plug || !in_task()) return ktime_get_ns(); /* * 0 could very well be a valid time, but rather than flag "this is * a valid timestamp" separately, just accept that we'll do an extra * ktime_get_ns() if we just happen to get 0 as the current time. */ if (!plug->cur_ktime) { plug->cur_ktime = ktime_get_ns(); current->flags |= PF_BLOCK_TS; } return plug->cur_ktime; } static inline ktime_t blk_time_get(void) { return ns_to_ktime(blk_time_get_ns()); } void bdev_release(struct file *bdev_file); int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops, struct file *bdev_file); int bdev_permission(dev_t dev, blk_mode_t mode, void *holder); void bio_integrity_generate(struct bio *bio); blk_status_t bio_integrity_verify(struct bio *bio, struct bvec_iter *saved_iter); void blk_integrity_prepare(struct request *rq); void blk_integrity_complete(struct request *rq, unsigned int nr_bytes); #ifdef CONFIG_LOCKDEP static inline void blk_freeze_acquire_lock(struct request_queue *q) { if (!q->mq_freeze_disk_dead) rwsem_acquire(&q->io_lockdep_map, 0, 1, _RET_IP_); if (!q->mq_freeze_queue_dying) rwsem_acquire(&q->q_lockdep_map, 0, 1, _RET_IP_); } static inline void blk_unfreeze_release_lock(struct request_queue *q) { if (!q->mq_freeze_queue_dying) rwsem_release(&q->q_lockdep_map, _RET_IP_); if (!q->mq_freeze_disk_dead) rwsem_release(&q->io_lockdep_map, _RET_IP_); } #else static inline void blk_freeze_acquire_lock(struct request_queue *q) { } static inline void blk_unfreeze_release_lock(struct request_queue *q) { } #endif /* * debugfs directory and file creation can trigger fs reclaim, which can enter * back into the block layer request_queue. This can cause deadlock if the * queue is frozen. Use NOIO context together with debugfs_mutex to prevent fs * reclaim from triggering block I/O. 
*/ static inline void blk_debugfs_lock_nomemsave(struct request_queue *q) { mutex_lock(&q->debugfs_mutex); } static inline void blk_debugfs_unlock_nomemrestore(struct request_queue *q) { mutex_unlock(&q->debugfs_mutex); } static inline unsigned int __must_check blk_debugfs_lock(struct request_queue *q) { unsigned int memflags = memalloc_noio_save(); blk_debugfs_lock_nomemsave(q); return memflags; } static inline void blk_debugfs_unlock(struct request_queue *q, unsigned int memflags) { blk_debugfs_unlock_nomemrestore(q); memalloc_noio_restore(memflags); } #endif /* BLK_INTERNAL_H */ |
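/*
 * Illustrative sketch, not part of blk.h: submission paths bracket work on a
 * queue with bio_queue_enter()/blk_queue_exit() so the queue cannot be frozen
 * away underneath them. The helper name below is hypothetical and the error
 * handling is simplified; on failure the slow path has already terminated or
 * requeued the bio, so the caller just backs out.
 */
static void example_submit(struct bio *bio)
{
	struct request_queue *q = bdev_get_queue(bio->bi_bdev);

	/* Fails if the queue is frozen, draining or dying. */
	if (bio_queue_enter(bio))
		return;

	/* ... hand the bio to the driver here ... */

	blk_queue_exit(q);	/* pairs with the successful enter above */
}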
| 14 11 13 12 12 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 | // SPDX-License-Identifier: GPL-2.0 #include <linux/irq_work.h> #include <linux/spinlock.h> #include <linux/task_work.h> #include <linux/resume_user_mode.h> static struct callback_head work_exited; /* all we need is ->next == NULL */ #ifdef CONFIG_IRQ_WORK static void task_work_set_notify_irq(struct irq_work *entry) { /* * no-op IPI * * TWA_NMI_CURRENT will already have set the TIF flag, all * this interrupt does it tickle the return-to-user path. */ } static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) = IRQ_WORK_INIT_HARD(task_work_set_notify_irq); #endif /** * task_work_add - ask the @task to execute @work->func() * @task: the task which should run the callback * @work: the callback to run * @notify: how to notify the targeted task * * Queue @work for task_work_run() below and notify the @task if @notify * is @TWA_RESUME, @TWA_SIGNAL, @TWA_SIGNAL_NO_IPI or @TWA_NMI_CURRENT. * * @TWA_SIGNAL works like signals, in that the it will interrupt the targeted * task and run the task_work, regardless of whether the task is currently * running in the kernel or userspace. * @TWA_SIGNAL_NO_IPI works like @TWA_SIGNAL, except it doesn't send a * reschedule IPI to force the targeted task to reschedule and run task_work. * This can be advantageous if there's no strict requirement that the * task_work be run as soon as possible, just whenever the task enters the * kernel anyway. * @TWA_RESUME work is run only when the task exits the kernel and returns to * user mode, or before entering guest mode. * @TWA_NMI_CURRENT works like @TWA_RESUME, except it can only be used for the * current @task and if the current context is NMI. * * Fails if the @task is exiting/exited and thus it can't process this @work. * Otherwise @work->func() will be called when the @task goes through one of * the aforementioned transitions, or exits. * * If the targeted task is exiting, then an error is returned and the work item * is not queued. It's up to the caller to arrange for an alternative mechanism * in that case. * * Note: there is no ordering guarantee on works queued here. The task_work * list is LIFO. * * RETURNS: * 0 if succeeds or -ESRCH. 
*/ int task_work_add(struct task_struct *task, struct callback_head *work, enum task_work_notify_mode notify) { struct callback_head *head; if (notify == TWA_NMI_CURRENT) { if (WARN_ON_ONCE(task != current)) return -EINVAL; if (!IS_ENABLED(CONFIG_IRQ_WORK)) return -EINVAL; } else { kasan_record_aux_stack(work); } head = READ_ONCE(task->task_works); do { if (unlikely(head == &work_exited)) return -ESRCH; work->next = head; } while (!try_cmpxchg(&task->task_works, &head, work)); switch (notify) { case TWA_NONE: break; case TWA_RESUME: set_notify_resume(task); break; case TWA_SIGNAL: set_notify_signal(task); break; case TWA_SIGNAL_NO_IPI: __set_notify_signal(task); break; #ifdef CONFIG_IRQ_WORK case TWA_NMI_CURRENT: set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume)); break; #endif default: WARN_ON_ONCE(1); break; } return 0; } /** * task_work_cancel_match - cancel a pending work added by task_work_add() * @task: the task which should execute the work * @match: match function to call * @data: data to be passed in to match function * * RETURNS: * The found work or NULL if not found. */ struct callback_head * task_work_cancel_match(struct task_struct *task, bool (*match)(struct callback_head *, void *data), void *data) { struct callback_head **pprev = &task->task_works; struct callback_head *work; unsigned long flags; if (likely(!task_work_pending(task))) return NULL; /* * If cmpxchg() fails we continue without updating pprev. * Either we raced with task_work_add() which added the * new entry before this work, we will find it again. Or * we raced with task_work_run(), *pprev == NULL/exited. */ raw_spin_lock_irqsave(&task->pi_lock, flags); work = READ_ONCE(*pprev); while (work) { if (!match(work, data)) { pprev = &work->next; work = READ_ONCE(*pprev); } else if (try_cmpxchg(pprev, &work, work->next)) break; } raw_spin_unlock_irqrestore(&task->pi_lock, flags); return work; } static bool task_work_func_match(struct callback_head *cb, void *data) { return cb->func == data; } /** * task_work_cancel_func - cancel a pending work matching a function added by task_work_add() * @task: the task which should execute the func's work * @func: identifies the func to match with a work to remove * * Find the last queued pending work with ->func == @func and remove * it from queue. * * RETURNS: * The found work or NULL if not found. */ struct callback_head * task_work_cancel_func(struct task_struct *task, task_work_func_t func) { return task_work_cancel_match(task, task_work_func_match, func); } static bool task_work_match(struct callback_head *cb, void *data) { return cb == data; } /** * task_work_cancel - cancel a pending work added by task_work_add() * @task: the task which should execute the work * @cb: the callback to remove if queued * * Remove a callback from a task's queue if queued. * * RETURNS: * True if the callback was queued and got cancelled, false otherwise. */ bool task_work_cancel(struct task_struct *task, struct callback_head *cb) { struct callback_head *ret; ret = task_work_cancel_match(task, task_work_match, cb); return ret == cb; } /** * task_work_run - execute the works added by task_work_add() * * Flush the pending works. Should be used by the core kernel code. * Called before the task returns to the user-mode or stops, or when * it exits. In the latter case task_work_add() can no longer add the * new work after task_work_run() returns. 
*/ void task_work_run(void) { struct task_struct *task = current; struct callback_head *work, *head, *next; for (;;) { /* * work->func() can do task_work_add(), do not set * work_exited unless the list is empty. */ work = READ_ONCE(task->task_works); do { head = NULL; if (!work) { if (task->flags & PF_EXITING) head = &work_exited; else break; } } while (!try_cmpxchg(&task->task_works, &work, head)); if (!work) break; /* * Synchronize with task_work_cancel_match(). It can not remove * the first entry == work, cmpxchg(task_works) must fail. * But it can remove another entry from the ->next list. */ raw_spin_lock_irq(&task->pi_lock); raw_spin_unlock_irq(&task->pi_lock); do { next = work->next; work->func(work); work = next; cond_resched(); } while (work); } } |
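/*
 * Illustrative sketch, not part of task_work.c: the usual pattern is to embed
 * a callback_head in a caller-owned structure, initialise it with
 * init_task_work() and queue it with task_work_add(). The struct, the
 * callback and the helper below are hypothetical.
 */
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/task_work.h>

struct example_work {
	struct callback_head cb;
	int payload;
};

static void example_work_fn(struct callback_head *cb)
{
	struct example_work *w = container_of(cb, struct example_work, cb);

	/* Runs from task_work_run(), in the context of the queued task. */
	kfree(w);
}

static int example_queue_work(int payload)
{
	struct example_work *w = kzalloc(sizeof(*w), GFP_KERNEL);

	if (!w)
		return -ENOMEM;

	w->payload = payload;
	init_task_work(&w->cb, example_work_fn);

	/* TWA_RESUME: run on the next return to user mode; -ESRCH if exiting. */
	if (task_work_add(current, &w->cb, TWA_RESUME)) {
		kfree(w);
		return -ESRCH;
	}
	return 0;
}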
| 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_GUE_H #define __NET_GUE_H /* Definitions for the GUE header, standard and private flags, lengths * of optional fields are below. * * Diagram of GUE header: * * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |Ver|C| Hlen | Proto/ctype | Standard flags |P| * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | | * ~ Fields (optional) ~ * | | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Private flags (optional, P bit is set) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | | * ~ Private fields (optional) ~ * | | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * C bit indicates control message when set, data message when unset. * For a control message, proto/ctype is interpreted as a type of * control message. For data messages, proto/ctype is the IP protocol * of the next header. * * P bit indicates private flags field is present. The private flags * may refer to options placed after this field. */ #include <asm/byteorder.h> #include <linux/types.h> struct guehdr { union { struct { #if defined(__LITTLE_ENDIAN_BITFIELD) __u8 hlen:5, control:1, version:2; #elif defined (__BIG_ENDIAN_BITFIELD) __u8 version:2, control:1, hlen:5; #else #error "Please fix <asm/byteorder.h>" #endif __u8 proto_ctype; __be16 flags; }; __be32 word; }; }; /* Standard flags in GUE header */ #define GUE_FLAG_PRIV htons(1<<0) /* Private flags are in options */ #define GUE_LEN_PRIV 4 #define GUE_FLAGS_ALL (GUE_FLAG_PRIV) /* Private flags in the private option extension */ #define GUE_PFLAG_REMCSUM htonl(1U << 31) #define GUE_PLEN_REMCSUM 4 #define GUE_PFLAGS_ALL (GUE_PFLAG_REMCSUM) /* Functions to compute options length corresponding to flags. * If we ever have a lot of flags this can be potentially be * converted to a more optimized algorithm (table lookup * for instance). */ static inline size_t guehdr_flags_len(__be16 flags) { return ((flags & GUE_FLAG_PRIV) ? GUE_LEN_PRIV : 0); } static inline size_t guehdr_priv_flags_len(__be32 flags) { return 0; } /* Validate standard and private flags. Returns non-zero (meaning invalid) * if there is an unknown standard or private flags, or the options length for * the flags exceeds the options length specific in hlen of the GUE header. */ static inline int validate_gue_flags(struct guehdr *guehdr, size_t optlen) { __be16 flags = guehdr->flags; size_t len; if (flags & ~GUE_FLAGS_ALL) return 1; len = guehdr_flags_len(flags); if (len > optlen) return 1; if (flags & GUE_FLAG_PRIV) { /* Private flags are last four bytes accounted in * guehdr_flags_len */ __be32 pflags = *(__be32 *)((void *)&guehdr[1] + len - GUE_LEN_PRIV); if (pflags & ~GUE_PFLAGS_ALL) return 1; len += guehdr_priv_flags_len(pflags); if (len > optlen) return 1; } return 0; } #endif |
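/*
 * Illustrative sketch, not part of gue.h: how a receive path might sanity
 * check a GUE header before trusting its optional fields. The function name
 * is hypothetical; hlen counts 32-bit words of optional fields, so the byte
 * length of the options is hlen << 2.
 */
#include <linux/errno.h>
#include <net/gue.h>

static int example_parse_gue(struct guehdr *guehdr)
{
	size_t optlen = guehdr->hlen << 2;

	if (guehdr->version != 0)
		return -EINVAL;

	/* Reject unknown flags or options longer than hlen advertises. */
	if (validate_gue_flags(guehdr, optlen))
		return -EINVAL;

	if (guehdr->control) {
		/* Control message: proto_ctype is the control message type. */
		return 0;
	}

	/* Data message: proto_ctype is the IP protocol of the next header. */
	return guehdr->proto_ctype;
}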
| 1 20 18 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_USER_NAMESPACE_H #define _LINUX_USER_NAMESPACE_H #include <linux/kref.h> #include <linux/nsproxy.h> #include <linux/ns_common.h> #include <linux/rculist_nulls.h> #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/rcuref.h> #include <linux/rwsem.h> #include <linux/sysctl.h> #include <linux/err.h> #define UID_GID_MAP_MAX_BASE_EXTENTS 5 #define UID_GID_MAP_MAX_EXTENTS 340 struct uid_gid_extent { u32 first; u32 lower_first; u32 count; }; struct uid_gid_map { /* 64 bytes -- 1 cache line */ union { struct { struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS]; u32 nr_extents; }; struct { struct uid_gid_extent *forward; struct uid_gid_extent *reverse; }; }; }; #define USERNS_SETGROUPS_ALLOWED 1UL #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED struct ucounts; enum ucount_type { UCOUNT_USER_NAMESPACES, UCOUNT_PID_NAMESPACES, UCOUNT_UTS_NAMESPACES, UCOUNT_IPC_NAMESPACES, UCOUNT_NET_NAMESPACES, UCOUNT_MNT_NAMESPACES, UCOUNT_CGROUP_NAMESPACES, UCOUNT_TIME_NAMESPACES, #ifdef CONFIG_INOTIFY_USER UCOUNT_INOTIFY_INSTANCES, UCOUNT_INOTIFY_WATCHES, #endif #ifdef CONFIG_FANOTIFY UCOUNT_FANOTIFY_GROUPS, UCOUNT_FANOTIFY_MARKS, #endif UCOUNT_COUNTS, }; enum rlimit_type { UCOUNT_RLIMIT_NPROC, UCOUNT_RLIMIT_MSGQUEUE, UCOUNT_RLIMIT_SIGPENDING, UCOUNT_RLIMIT_MEMLOCK, UCOUNT_RLIMIT_COUNTS, }; #if IS_ENABLED(CONFIG_BINFMT_MISC) struct binfmt_misc; #endif struct user_namespace { struct uid_gid_map uid_map; struct uid_gid_map gid_map; struct uid_gid_map projid_map; struct user_namespace *parent; int level; kuid_t owner; kgid_t group; struct ns_common ns; unsigned long flags; /* parent_could_setfcap: true if the creator if this ns had CAP_SETFCAP * in its effective capability set at the child ns creation time. */ bool parent_could_setfcap; #ifdef CONFIG_KEYS /* List of joinable keyrings in this namespace. Modification access of * these pointers is controlled by keyring_sem. Once * user_keyring_register is set, it won't be changed, so it can be * accessed directly with READ_ONCE(). 
*/ struct list_head keyring_name_list; struct key *user_keyring_register; struct rw_semaphore keyring_sem; #endif /* Register of per-UID persistent keyrings for this namespace */ #ifdef CONFIG_PERSISTENT_KEYRINGS struct key *persistent_keyring_register; #endif struct work_struct work; #ifdef CONFIG_SYSCTL struct ctl_table_set set; struct ctl_table_header *sysctls; #endif struct ucounts *ucounts; long ucount_max[UCOUNT_COUNTS]; long rlimit_max[UCOUNT_RLIMIT_COUNTS]; #if IS_ENABLED(CONFIG_BINFMT_MISC) struct binfmt_misc *binfmt_misc; #endif } __randomize_layout; struct ucounts { struct hlist_nulls_node node; struct user_namespace *ns; kuid_t uid; struct rcu_head rcu; rcuref_t count; atomic_long_t ucount[UCOUNT_COUNTS]; atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS]; }; extern struct user_namespace init_user_ns; extern struct ucounts init_ucounts; bool setup_userns_sysctls(struct user_namespace *ns); void retire_userns_sysctls(struct user_namespace *ns); struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type); void dec_ucount(struct ucounts *ucounts, enum ucount_type type); struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid); void put_ucounts(struct ucounts *ucounts); static inline struct ucounts * __must_check get_ucounts(struct ucounts *ucounts) { if (rcuref_get(&ucounts->count)) return ucounts; return NULL; } static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type type) { return atomic_long_read(&ucounts->rlimit[type]); } long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type, bool override_rlimit); void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type); bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long max); static inline long get_userns_rlimit_max(struct user_namespace *ns, enum rlimit_type type) { return READ_ONCE(ns->rlimit_max[type]); } static inline void set_userns_rlimit_max(struct user_namespace *ns, enum rlimit_type type, unsigned long max) { ns->rlimit_max[type] = max <= LONG_MAX ? 
max : LONG_MAX; } static inline struct user_namespace *to_user_ns(struct ns_common *ns) { return container_of(ns, struct user_namespace, ns); } #ifdef CONFIG_USER_NS static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) ns_ref_inc(ns); return ns; } extern int create_user_ns(struct cred *new); extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred); extern void __put_user_ns(struct user_namespace *ns); static inline void put_user_ns(struct user_namespace *ns) { if (ns && ns_ref_put(ns)) __put_user_ns(ns); } struct seq_operations; extern const struct seq_operations proc_uid_seq_operations; extern const struct seq_operations proc_gid_seq_operations; extern const struct seq_operations proc_projid_seq_operations; extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *); extern int proc_setgroups_show(struct seq_file *m, void *v); extern bool userns_may_setgroups(const struct user_namespace *ns); extern bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child); extern bool current_in_userns(const struct user_namespace *target_ns); struct ns_common *ns_get_owner(struct ns_common *ns); #else static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { return &init_user_ns; } static inline int create_user_ns(struct cred *new) { return -EINVAL; } static inline int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { if (unshare_flags & CLONE_NEWUSER) return -EINVAL; return 0; } static inline void put_user_ns(struct user_namespace *ns) { } static inline bool userns_may_setgroups(const struct user_namespace *ns) { return true; } static inline bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child) { return true; } static inline bool current_in_userns(const struct user_namespace *target_ns) { return true; } static inline struct ns_common *ns_get_owner(struct ns_common *ns) { return ERR_PTR(-EPERM); } #endif #endif /* _LINUX_USER_H */ |
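/*
 * Illustrative sketch, not part of user_namespace.h: the usual pattern for
 * holding a reference to a task's user namespace across an operation that may
 * sleep. The function name is hypothetical; with CONFIG_USER_NS disabled the
 * same code still compiles because get_user_ns()/put_user_ns() degrade to the
 * init_user_ns stubs above.
 */
#include <linux/cred.h>
#include <linux/user_namespace.h>

static bool example_in_init_user_ns(void)
{
	struct user_namespace *ns = get_user_ns(current_user_ns());
	bool in_init;

	/* ... work that may sleep goes here, with the reference held ... */

	in_init = (ns == &init_user_ns);
	put_user_ns(ns);	/* drop the reference taken above */

	return in_init;
}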
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * net/dst.h	Protocol independent destination cache definitions.
* * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * */ #ifndef _NET_DST_H #define _NET_DST_H #include <net/dst_ops.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/rcupdate.h> #include <linux/bug.h> #include <linux/jiffies.h> #include <linux/refcount.h> #include <linux/rcuref.h> #include <net/neighbour.h> #include <asm/processor.h> #include <linux/indirect_call_wrapper.h> struct sk_buff; struct dst_entry { union { struct net_device *dev; struct net_device __rcu *dev_rcu; }; struct dst_ops *ops; unsigned long _metrics; unsigned long expires; #ifdef CONFIG_XFRM struct xfrm_state *xfrm; #else void *__pad1; #endif int (*input)(struct sk_buff *); int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb); unsigned short flags; #define DST_NOXFRM 0x0002 #define DST_NOPOLICY 0x0004 #define DST_NOCOUNT 0x0008 #define DST_FAKE_RTABLE 0x0010 #define DST_XFRM_TUNNEL 0x0020 #define DST_XFRM_QUEUE 0x0040 #define DST_METADATA 0x0080 /* A non-zero value of dst->obsolete forces by-hand validation * of the route entry. Positive values are set by the generic * dst layer to indicate that the entry has been forcefully * destroyed. * * Negative values are used by the implementation layer code to * force invocation of the dst_ops->check() method. */ short obsolete; #define DST_OBSOLETE_NONE 0 #define DST_OBSOLETE_DEAD 2 #define DST_OBSOLETE_FORCE_CHK -1 #define DST_OBSOLETE_KILL -2 unsigned short header_len; /* more space at head required */ unsigned short trailer_len; /* space to reserve at tail */ /* * __rcuref wants to be on a different cache line from * input/output/ops or performance tanks badly */ #ifdef CONFIG_64BIT rcuref_t __rcuref; /* 64-bit offset 64 */ #endif int __use; unsigned long lastuse; struct rcu_head rcu_head; short error; short __pad; __u32 tclassid; #ifndef CONFIG_64BIT struct lwtunnel_state *lwtstate; rcuref_t __rcuref; /* 32-bit offset 64 */ #endif netdevice_tracker dev_tracker; /* * Used by rtable and rt6_info. Moves lwtstate into the next cache * line on 64bit so that lwtstate does not cause false sharing with * __rcuref under contention of __rcuref. This also puts the * frequently accessed members of rtable and rt6_info out of the * __rcuref cache line. */ struct list_head rt_uncached; struct uncached_list *rt_uncached_list; #ifdef CONFIG_64BIT struct lwtunnel_state *lwtstate; #endif }; struct dst_metrics { u32 metrics[RTAX_MAX]; refcount_t refcnt; } __aligned(4); /* Low pointer bits contain DST_METRICS_FLAGS */ extern const struct dst_metrics dst_default_metrics; u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old); #define DST_METRICS_READ_ONLY 0x1UL #define DST_METRICS_REFCOUNTED 0x2UL #define DST_METRICS_FLAGS 0x3UL #define __DST_METRICS_PTR(Y) \ ((u32 *)((Y) & ~DST_METRICS_FLAGS)) #define DST_METRICS_PTR(X) __DST_METRICS_PTR((X)->_metrics) static inline bool dst_metrics_read_only(const struct dst_entry *dst) { return dst->_metrics & DST_METRICS_READ_ONLY; } void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old); static inline void dst_destroy_metrics_generic(struct dst_entry *dst) { unsigned long val = dst->_metrics; if (!(val & DST_METRICS_READ_ONLY)) __dst_destroy_metrics_generic(dst, val); } static inline u32 *dst_metrics_write_ptr(struct dst_entry *dst) { unsigned long p = dst->_metrics; BUG_ON(!p); if (p & DST_METRICS_READ_ONLY) return dst->ops->cow_metrics(dst, p); return __DST_METRICS_PTR(p); } /* This may only be invoked before the entry has reached global * visibility. 
*/ static inline void dst_init_metrics(struct dst_entry *dst, const u32 *src_metrics, bool read_only) { dst->_metrics = ((unsigned long) src_metrics) | (read_only ? DST_METRICS_READ_ONLY : 0); } static inline void dst_copy_metrics(struct dst_entry *dest, const struct dst_entry *src) { u32 *dst_metrics = dst_metrics_write_ptr(dest); if (dst_metrics) { u32 *src_metrics = DST_METRICS_PTR(src); memcpy(dst_metrics, src_metrics, RTAX_MAX * sizeof(u32)); } } static inline u32 *dst_metrics_ptr(struct dst_entry *dst) { return DST_METRICS_PTR(dst); } static inline u32 dst_metric_raw(const struct dst_entry *dst, const int metric) { u32 *p = DST_METRICS_PTR(dst); return p[metric-1]; } static inline u32 dst_metric(const struct dst_entry *dst, const int metric) { WARN_ON_ONCE(metric == RTAX_HOPLIMIT || metric == RTAX_ADVMSS || metric == RTAX_MTU); return dst_metric_raw(dst, metric); } static inline u32 dst_metric_advmss(const struct dst_entry *dst) { u32 advmss = dst_metric_raw(dst, RTAX_ADVMSS); if (!advmss) advmss = dst->ops->default_advmss(dst); return advmss; } static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val) { u32 *p = dst_metrics_write_ptr(dst); if (p) p[metric-1] = val; } /* Kernel-internal feature bits that are unallocated in user space. */ #define DST_FEATURE_ECN_CA (1U << 31) #define DST_FEATURE_MASK (DST_FEATURE_ECN_CA) #define DST_FEATURE_ECN_MASK (DST_FEATURE_ECN_CA | RTAX_FEATURE_ECN) static inline u32 dst_feature(const struct dst_entry *dst, u32 feature) { return dst_metric(dst, RTAX_FEATURES) & feature; } INDIRECT_CALLABLE_DECLARE(unsigned int ip6_mtu(const struct dst_entry *)); INDIRECT_CALLABLE_DECLARE(unsigned int ipv4_mtu(const struct dst_entry *)); static inline u32 dst_mtu(const struct dst_entry *dst) { return INDIRECT_CALL_INET(dst->ops->mtu, ip6_mtu, ipv4_mtu, dst); } /* Variant of dst_mtu() for IPv4 users. */ static inline u32 dst4_mtu(const struct dst_entry *dst) { return INDIRECT_CALL_1(dst->ops->mtu, ipv4_mtu, dst); } /* RTT metrics are stored in milliseconds for user ABI, but used as jiffies */ static inline unsigned long dst_metric_rtt(const struct dst_entry *dst, int metric) { return msecs_to_jiffies(dst_metric(dst, metric)); } static inline int dst_metric_locked(const struct dst_entry *dst, int metric) { return dst_metric(dst, RTAX_LOCK) & (1 << metric); } static inline void dst_hold(struct dst_entry *dst) { /* * If your kernel compilation stops here, please check * the placement of __rcuref in struct dst_entry */ BUILD_BUG_ON(offsetof(struct dst_entry, __rcuref) & 63); WARN_ON(!rcuref_get(&dst->__rcuref)); } static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) { if (unlikely(time != READ_ONCE(dst->lastuse))) { dst->__use++; WRITE_ONCE(dst->lastuse, time); } } static inline struct dst_entry *dst_clone(struct dst_entry *dst) { if (dst) dst_hold(dst); return dst; } void dst_release(struct dst_entry *dst); void dst_release_immediate(struct dst_entry *dst); static inline void refdst_drop(unsigned long refdst) { if (!(refdst & SKB_DST_NOREF)) dst_release((struct dst_entry *)(refdst & SKB_DST_PTRMASK)); } /** * skb_dst_drop - drops skb dst * @skb: buffer * * Drops dst reference count if a reference was taken. 
*/ static inline void skb_dst_drop(struct sk_buff *skb) { if (skb->_skb_refdst) { refdst_drop(skb->_skb_refdst); skb->_skb_refdst = 0UL; } } static inline void __skb_dst_copy(struct sk_buff *nskb, unsigned long refdst) { nskb->slow_gro |= !!refdst; nskb->_skb_refdst = refdst; if (!(nskb->_skb_refdst & SKB_DST_NOREF)) dst_clone(skb_dst(nskb)); } static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb) { __skb_dst_copy(nskb, oskb->_skb_refdst); } /** * dst_hold_safe - Take a reference on a dst if possible * @dst: pointer to dst entry * * This helper returns false if it could not safely * take a reference on a dst. */ static inline bool dst_hold_safe(struct dst_entry *dst) { return rcuref_get(&dst->__rcuref); } /** * skb_dst_force - makes sure skb dst is refcounted * @skb: buffer * * If dst is not yet refcounted and not destroyed, grab a ref on it. * Returns: true if dst is refcounted. */ static inline bool skb_dst_force(struct sk_buff *skb) { if (skb_dst_is_noref(skb)) { struct dst_entry *dst = skb_dst(skb); WARN_ON(!rcu_read_lock_held()); if (!dst_hold_safe(dst)) dst = NULL; skb->_skb_refdst = (unsigned long)dst; skb->slow_gro |= !!dst; } return skb->_skb_refdst != 0UL; } /** * __skb_tunnel_rx - prepare skb for rx reinsert * @skb: buffer * @dev: tunnel device * @net: netns for packet i/o * * After decapsulation, packet is going to re-enter (netif_rx()) our stack, * so make some cleanups. (no accounting done) */ static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev, struct net *net) { skb->dev = dev; /* * Clear hash so that we can recalculate the hash for the * encapsulated packet, unless we have already determine the hash * over the L4 4-tuple. */ skb_clear_hash_if_not_l4(skb); skb_set_queue_mapping(skb, 0); skb_scrub_packet(skb, !net_eq(net, dev_net(dev))); } /** * skb_tunnel_rx - prepare skb for rx reinsert * @skb: buffer * @dev: tunnel device * @net: netns for packet i/o * * After decapsulation, packet is going to re-enter (netif_rx()) our stack, * so make some cleanups, and perform accounting. * Note: this accounting is not SMP safe. */ static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev, struct net *net) { DEV_STATS_INC(dev, rx_packets); DEV_STATS_ADD(dev, rx_bytes, skb->len); __skb_tunnel_rx(skb, dev, net); } static inline u32 dst_tclassid(const struct sk_buff *skb) { #ifdef CONFIG_IP_ROUTE_CLASSID const struct dst_entry *dst; dst = skb_dst(skb); if (dst) return dst->tclassid; #endif return 0; } int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); static inline int dst_discard(struct sk_buff *skb) { return dst_discard_out(&init_net, skb->sk, skb); } void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_obsolete, unsigned short flags); void dst_init(struct dst_entry *dst, struct dst_ops *ops, struct net_device *dev, int initial_obsolete, unsigned short flags); void dst_dev_put(struct dst_entry *dst); static inline void dst_confirm(struct dst_entry *dst) { } static inline struct neighbour *dst_neigh_lookup(const struct dst_entry *dst, const void *daddr) { struct neighbour *n = dst->ops->neigh_lookup(dst, NULL, daddr); return IS_ERR(n) ? NULL : n; } static inline struct neighbour *dst_neigh_lookup_skb(const struct dst_entry *dst, struct sk_buff *skb) { struct neighbour *n; if (WARN_ON_ONCE(!dst->ops->neigh_lookup)) return NULL; n = dst->ops->neigh_lookup(dst, skb, NULL); return IS_ERR(n) ? 
NULL : n; } static inline void dst_confirm_neigh(const struct dst_entry *dst, const void *daddr) { if (dst->ops->confirm_neigh) dst->ops->confirm_neigh(dst, daddr); } static inline void dst_link_failure(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); if (dst && dst->ops && dst->ops->link_failure) dst->ops->link_failure(skb); } static inline void dst_set_expires(struct dst_entry *dst, int timeout) { unsigned long old, expires = jiffies + timeout; if (expires == 0) expires = 1; old = READ_ONCE(dst->expires); if (!old || time_before(expires, old)) WRITE_ONCE(dst->expires, expires); } static inline unsigned int dst_dev_overhead(struct dst_entry *dst, struct sk_buff *skb) { if (likely(dst)) return LL_RESERVED_SPACE(dst->dev); return skb->mac_len; } INDIRECT_CALLABLE_DECLARE(int ip6_output(struct net *, struct sock *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int ip_output(struct net *, struct sock *, struct sk_buff *)); /* Output packet to network from transport. */ static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb) { return INDIRECT_CALL_INET(READ_ONCE(skb_dst(skb)->output), ip6_output, ip_output, net, sk, skb); } INDIRECT_CALLABLE_DECLARE(int ip6_input(struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int ip_local_deliver(struct sk_buff *)); /* Input packet from network to transport. */ static inline int dst_input(struct sk_buff *skb) { return INDIRECT_CALL_INET(READ_ONCE(skb_dst(skb)->input), ip6_input, ip_local_deliver, skb); } INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, u32)); INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, u32)); static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie) { if (READ_ONCE(dst->obsolete)) dst = INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie); return dst; } /* Flags for xfrm_lookup flags argument. 
*/ enum { XFRM_LOOKUP_ICMP = 1 << 0, XFRM_LOOKUP_QUEUE = 1 << 1, XFRM_LOOKUP_KEEP_DST_REF = 1 << 2, }; struct flowi; #ifndef CONFIG_XFRM static inline struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags) { return dst_orig; } static inline struct dst_entry * xfrm_lookup_with_ifid(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags, u32 if_id) { return dst_orig; } static inline struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags) { return dst_orig; } static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst) { return NULL; } #else struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags); struct dst_entry *xfrm_lookup_with_ifid(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags, u32 if_id); struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags); /* skb attached with this dst needs transformation if dst->xfrm is valid */ static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst) { return dst->xfrm; } #endif static inline void skb_dst_update_pmtu(struct sk_buff *skb, u32 mtu) { struct dst_entry *dst = skb_dst(skb); if (dst && dst->ops->update_pmtu) dst->ops->update_pmtu(dst, NULL, skb, mtu, true); } /* update dst pmtu but not do neighbor confirm */ static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu) { struct dst_entry *dst = skb_dst(skb); if (dst && dst->ops->update_pmtu) dst->ops->update_pmtu(dst, NULL, skb, mtu, false); } static inline struct net_device *dst_dev(const struct dst_entry *dst) { return READ_ONCE(dst->dev); } static inline struct net_device *dst_dev_rcu(const struct dst_entry *dst) { return rcu_dereference(dst->dev_rcu); } static inline struct net *dst_dev_net_rcu(const struct dst_entry *dst) { return dev_net_rcu(dst_dev_rcu(dst)); } static inline struct net_device *skb_dst_dev(const struct sk_buff *skb) { return dst_dev(skb_dst(skb)); } static inline struct net_device *skb_dst_dev_rcu(const struct sk_buff *skb) { return dst_dev_rcu(skb_dst(skb)); } static inline struct net *skb_dst_dev_net(const struct sk_buff *skb) { return dev_net(skb_dst_dev(skb)); } static inline struct net *skb_dst_dev_net_rcu(const struct sk_buff *skb) { return dev_net_rcu(skb_dst_dev_rcu(skb)); } struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie); void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh); void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old); struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr); unsigned int dst_blackhole_mtu(const struct dst_entry *dst); #endif /* _NET_DST_H */ |
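Usage sketch (not part of net/dst.h): a hypothetical helper that queues an skb for later transmission has to turn a possibly noref dst into a real reference with skb_dst_force() while still inside the RCU-protected receive context, and can consult dst_mtu() for the path MTU. example_defer_xmit() and its queue argument are illustrative only.

/*
 * Hypothetical sketch of the dst refcounting rules; assumes it runs in a
 * context (e.g. softirq RX) where a noref dst is still valid under RCU.
 */
#include <net/dst.h>
#include <linux/skbuff.h>
#include <linux/errno.h>

static int example_defer_xmit(struct sk_buff *skb, struct sk_buff_head *queue)
{
	/* Convert a noref dst into a refcounted one before the skb is queued. */
	if (!skb_dst_force(skb)) {
		/* No dst, or the route was already being destroyed. */
		kfree_skb(skb);
		return -EINVAL;
	}

	/* dst_mtu() resolves the per-family ->mtu() op via an indirect call. */
	if (skb->len > dst_mtu(skb_dst(skb))) {
		kfree_skb(skb);		/* releases the dst reference as well */
		return -EMSGSIZE;
	}

	skb_queue_tail(queue, skb);	/* the dst reference travels with the skb */
	return 0;
}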
/* SPDX-License-Identifier: GPL-2.0 */ /* * Definitions for diskquota-operations. When diskquota is configured these * macros expand to the right source-code. * * Author: Marco van Wieringen <mvw@planets.elm.net> */ #ifndef _LINUX_QUOTAOPS_ #define _LINUX_QUOTAOPS_ #include <linux/fs.h> #define DQUOT_SPACE_WARN 0x1 #define DQUOT_SPACE_RESERVE 0x2 #define DQUOT_SPACE_NOFAIL 0x4 static inline struct quota_info *sb_dqopt(struct super_block *sb) { return &sb->s_dquot; } /* i_rwsem must be held by the caller */ static inline bool is_quota_modification(struct mnt_idmap *idmap, struct inode *inode, struct iattr *ia) { return ((ia->ia_valid & ATTR_SIZE) || i_uid_needs_update(idmap, ia, inode) || i_gid_needs_update(idmap, ia, inode)); } #if defined(CONFIG_QUOTA) #define quota_error(sb, fmt, args...) \ __quota_error((sb), __func__, fmt , ## args) extern __printf(3, 4) void __quota_error(struct super_block *sb, const char *func, const char *fmt, ...); /* * declaration of quota_function calls in kernel.
*/ int dquot_initialize(struct inode *inode); bool dquot_initialize_needed(struct inode *inode); void dquot_drop(struct inode *inode); struct dquot *dqget(struct super_block *sb, struct kqid qid); struct dquot *dqgrab(struct dquot *dquot); static inline bool dquot_is_busy(struct dquot *dquot) { if (test_bit(DQ_MOD_B, &dquot->dq_flags)) return true; if (atomic_read(&dquot->dq_count) > 1) return true; return false; } void dqput(struct dquot *dquot); int dquot_scan_active(struct super_block *sb, int (*fn)(struct dquot *dquot, unsigned long priv), unsigned long priv); struct dquot *dquot_alloc(struct super_block *sb, int type); void dquot_destroy(struct dquot *dquot); int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags); void __dquot_free_space(struct inode *inode, qsize_t number, int flags); int dquot_alloc_inode(struct inode *inode); void dquot_claim_space_nodirty(struct inode *inode, qsize_t number); void dquot_free_inode(struct inode *inode); void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number); int dquot_disable(struct super_block *sb, int type, unsigned int flags); /* Suspend quotas on remount RO */ static inline int dquot_suspend(struct super_block *sb, int type) { return dquot_disable(sb, type, DQUOT_SUSPENDED); } int dquot_resume(struct super_block *sb, int type); int dquot_commit(struct dquot *dquot); int dquot_acquire(struct dquot *dquot); int dquot_release(struct dquot *dquot); int dquot_commit_info(struct super_block *sb, int type); int dquot_get_next_id(struct super_block *sb, struct kqid *qid); int dquot_mark_dquot_dirty(struct dquot *dquot); int dquot_file_open(struct inode *inode, struct file *file); int dquot_load_quota_sb(struct super_block *sb, int type, int format_id, unsigned int flags); int dquot_load_quota_inode(struct inode *inode, int type, int format_id, unsigned int flags); int dquot_quota_on(struct super_block *sb, int type, int format_id, const struct path *path); int dquot_quota_on_mount(struct super_block *sb, char *qf_name, int format_id, int type); int dquot_quota_off(struct super_block *sb, int type); int dquot_writeback_dquots(struct super_block *sb, int type); int dquot_quota_sync(struct super_block *sb, int type); int dquot_get_state(struct super_block *sb, struct qc_state *state); int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii); int dquot_get_dqblk(struct super_block *sb, struct kqid id, struct qc_dqblk *di); int dquot_get_next_dqblk(struct super_block *sb, struct kqid *id, struct qc_dqblk *di); int dquot_set_dqblk(struct super_block *sb, struct kqid id, struct qc_dqblk *di); int __dquot_transfer(struct inode *inode, struct dquot **transfer_to); int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode, struct iattr *iattr); static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type) { return sb_dqopt(sb)->info + type; } /* * Functions for checking status of quota */ static inline bool sb_has_quota_usage_enabled(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_USAGE_ENABLED, type); } static inline bool sb_has_quota_limits_enabled(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_LIMITS_ENABLED, type); } static inline bool sb_has_quota_suspended(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_SUSPENDED, type); } static inline unsigned sb_any_quota_suspended(struct super_block *sb) { return dquot_state_types(sb_dqopt(sb)->flags, DQUOT_SUSPENDED); } /* Does
kernel know about any quota information for given sb + type? */ static inline bool sb_has_quota_loaded(struct super_block *sb, int type) { /* Currently if anything is on, then quota usage is on as well */ return sb_has_quota_usage_enabled(sb, type); } static inline unsigned sb_any_quota_loaded(struct super_block *sb) { return dquot_state_types(sb_dqopt(sb)->flags, DQUOT_USAGE_ENABLED); } static inline bool sb_has_quota_active(struct super_block *sb, int type) { return sb_has_quota_loaded(sb, type) && !sb_has_quota_suspended(sb, type); } /* * Operations supported for diskquotas. */ extern const struct dquot_operations dquot_operations; extern const struct quotactl_ops dquot_quotactl_sysfile_ops; #else static inline int sb_has_quota_usage_enabled(struct super_block *sb, int type) { return 0; } static inline int sb_has_quota_limits_enabled(struct super_block *sb, int type) { return 0; } static inline int sb_has_quota_suspended(struct super_block *sb, int type) { return 0; } static inline int sb_any_quota_suspended(struct super_block *sb) { return 0; } /* Does kernel know about any quota information for given sb + type? */ static inline int sb_has_quota_loaded(struct super_block *sb, int type) { return 0; } static inline int sb_any_quota_loaded(struct super_block *sb) { return 0; } static inline int sb_has_quota_active(struct super_block *sb, int type) { return 0; } static inline int dquot_initialize(struct inode *inode) { return 0; } static inline bool dquot_initialize_needed(struct inode *inode) { return false; } static inline void dquot_drop(struct inode *inode) { } static inline int dquot_alloc_inode(struct inode *inode) { return 0; } static inline void dquot_free_inode(struct inode *inode) { } static inline int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode, struct iattr *iattr) { return 0; } static inline int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) { if (!(flags & DQUOT_SPACE_RESERVE)) inode_add_bytes(inode, number); return 0; } static inline void __dquot_free_space(struct inode *inode, qsize_t number, int flags) { if (!(flags & DQUOT_SPACE_RESERVE)) inode_sub_bytes(inode, number); } static inline void dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { inode_add_bytes(inode, number); } static inline int dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) { inode_sub_bytes(inode, number); return 0; } static inline int dquot_disable(struct super_block *sb, int type, unsigned int flags) { return 0; } static inline int dquot_suspend(struct super_block *sb, int type) { return 0; } static inline int dquot_resume(struct super_block *sb, int type) { return 0; } #define dquot_file_open generic_file_open static inline int dquot_writeback_dquots(struct super_block *sb, int type) { return 0; } #endif /* CONFIG_QUOTA */ static inline int dquot_alloc_space_nodirty(struct inode *inode, qsize_t nr) { return __dquot_alloc_space(inode, nr, DQUOT_SPACE_WARN); } static inline void dquot_alloc_space_nofail(struct inode *inode, qsize_t nr) { __dquot_alloc_space(inode, nr, DQUOT_SPACE_WARN|DQUOT_SPACE_NOFAIL); mark_inode_dirty_sync(inode); } static inline int dquot_alloc_space(struct inode *inode, qsize_t nr) { int ret; ret = dquot_alloc_space_nodirty(inode, nr); if (!ret) { /* * Mark inode fully dirty. Since we are allocating blocks, inode * would become fully dirty soon anyway and it reportedly * reduces lock contention. 
*/ mark_inode_dirty(inode); } return ret; } static inline int dquot_alloc_block_nodirty(struct inode *inode, qsize_t nr) { return dquot_alloc_space_nodirty(inode, nr << inode->i_blkbits); } static inline void dquot_alloc_block_nofail(struct inode *inode, qsize_t nr) { dquot_alloc_space_nofail(inode, nr << inode->i_blkbits); } static inline int dquot_alloc_block(struct inode *inode, qsize_t nr) { return dquot_alloc_space(inode, nr << inode->i_blkbits); } static inline int dquot_prealloc_block_nodirty(struct inode *inode, qsize_t nr) { return __dquot_alloc_space(inode, nr << inode->i_blkbits, 0); } static inline int dquot_prealloc_block(struct inode *inode, qsize_t nr) { int ret; ret = dquot_prealloc_block_nodirty(inode, nr); if (!ret) mark_inode_dirty_sync(inode); return ret; } static inline int dquot_reserve_block(struct inode *inode, qsize_t nr) { return __dquot_alloc_space(inode, nr << inode->i_blkbits, DQUOT_SPACE_WARN|DQUOT_SPACE_RESERVE); } static inline void dquot_claim_block(struct inode *inode, qsize_t nr) { dquot_claim_space_nodirty(inode, nr << inode->i_blkbits); mark_inode_dirty_sync(inode); } static inline void dquot_reclaim_block(struct inode *inode, qsize_t nr) { dquot_reclaim_space_nodirty(inode, nr << inode->i_blkbits); mark_inode_dirty_sync(inode); } static inline void dquot_free_space_nodirty(struct inode *inode, qsize_t nr) { __dquot_free_space(inode, nr, 0); } static inline void dquot_free_space(struct inode *inode, qsize_t nr) { dquot_free_space_nodirty(inode, nr); mark_inode_dirty_sync(inode); } static inline void dquot_free_block_nodirty(struct inode *inode, qsize_t nr) { dquot_free_space_nodirty(inode, nr << inode->i_blkbits); } static inline void dquot_free_block(struct inode *inode, qsize_t nr) { dquot_free_space(inode, nr << inode->i_blkbits); } static inline void dquot_release_reservation_block(struct inode *inode, qsize_t nr) { __dquot_free_space(inode, nr << inode->i_blkbits, DQUOT_SPACE_RESERVE); } unsigned int qtype_enforce_flag(int type); #endif /* _LINUX_QUOTAOPS_ */ |
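Usage sketch (not part of quotaops.h): a filesystem typically makes sure dquot_initialize() has run on the inode, charges block allocations with dquot_alloc_block(), and returns the charge with dquot_free_block() when blocks are released; with !CONFIG_QUOTA the same calls degrade to plain i_bytes accounting. The example_fs_* helpers below are hypothetical.

/* Hypothetical filesystem-side usage of the block-quota wrappers above. */
#include <linux/quotaops.h>
#include <linux/fs.h>

static int example_fs_alloc_blocks(struct inode *inode, qsize_t nr_blocks)
{
	int err;

	/* Charge nr_blocks (in units of 1 << i_blkbits) against the dquots. */
	err = dquot_alloc_block(inode, nr_blocks);
	if (err)
		return err;		/* typically -EDQUOT or -ENOSPC */

	/* ... allocate the blocks on disk here (omitted) ... */
	return 0;
}

static void example_fs_free_blocks(struct inode *inode, qsize_t nr_blocks)
{
	/* ... release the on-disk blocks here (omitted) ... */

	/* Hand the charge back and mark the inode dirty for writeback. */
	dquot_free_block(inode, nr_blocks);
}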
// SPDX-License-Identifier: GPL-2.0 /* * linux/ipc/shm.c * Copyright (C) 1992, 1993 Krishna Balasubramanian * Many improvements/fixes by Bruno Haible. * Replaced `struct shm_desc' by `struct vm_area_struct', July 1994. * Fixed the shm swap deallocation (shm_unuse()), August 1998 Andrea Arcangeli. * * /proc/sysvipc/shm support (c) 1999 Dragos Acostachioaie <dragos@iname.com> * BIGMEM support, Andrea Arcangeli <andrea@suse.de> * SMP thread shm, Jean-Luc Boyard <jean-luc.boyard@siemens.fr> * HIGHMEM support, Ingo Molnar <mingo@redhat.com> * Make shmmax, shmall, shmmni sysctl'able, Christoph Rohland <cr@sap.com> * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com> * Move the mm functionality over to mm/shmem.c, Christoph Rohland <cr@sap.com> * * support for audit of ipc object properties and permission changes * Dustin Kirkland <dustin.kirkland@us.ibm.com> * * namespaces support * OpenVZ, SWsoft Inc. * Pavel Emelianov <xemul@openvz.org> * * Better ipc lock (kern_ipc_perm.lock) handling * Davidlohr Bueso <davidlohr.bueso@hp.com>, June 2013. */ #include <linux/slab.h> #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/shm.h> #include <uapi/linux/shm.h> #include <linux/init.h> #include <linux/file.h> #include <linux/mman.h> #include <linux/shmem_fs.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/audit.h> #include <linux/capability.h> #include <linux/ptrace.h> #include <linux/seq_file.h> #include <linux/rwsem.h> #include <linux/nsproxy.h> #include <linux/mount.h> #include <linux/ipc_namespace.h> #include <linux/rhashtable.h> #include <linux/nstree.h> #include <linux/uaccess.h> #include "util.h" struct shmid_kernel /* private to the kernel */ { struct kern_ipc_perm shm_perm; struct file *shm_file; unsigned long shm_nattch; unsigned long shm_segsz; time64_t shm_atim; time64_t shm_dtim; time64_t shm_ctim; struct pid *shm_cprid; struct pid *shm_lprid; struct ucounts *mlock_ucounts; /* * The task created the shm object, for * task_lock(shp->shm_creator) */ struct task_struct *shm_creator; /* * List by creator. task_lock(->shm_creator) required for read/write. * If list_empty(), then the creator is dead already.
*/ struct list_head shm_clist; struct ipc_namespace *ns; } __randomize_layout; /* shm_mode upper byte flags */ #define SHM_DEST 01000 /* segment will be destroyed on last detach */ #define SHM_LOCKED 02000 /* segment will not be swapped */ struct shm_file_data { int id; struct ipc_namespace *ns; struct file *file; const struct vm_operations_struct *vm_ops; }; #define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data)) static const struct file_operations shm_file_operations; static const struct vm_operations_struct shm_vm_ops; #define shm_ids(ns) ((ns)->ids[IPC_SHM_IDS]) #define shm_unlock(shp) \ ipc_unlock(&(shp)->shm_perm) static int newseg(struct ipc_namespace *, struct ipc_params *); static void shm_open(struct vm_area_struct *vma); static void shm_close(struct vm_area_struct *vma); static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp); #ifdef CONFIG_PROC_FS static int sysvipc_shm_proc_show(struct seq_file *s, void *it); #endif void shm_init_ns(struct ipc_namespace *ns) { ns->shm_ctlmax = SHMMAX; ns->shm_ctlall = SHMALL; ns->shm_ctlmni = SHMMNI; ns->shm_rmid_forced = 0; ns->shm_tot = 0; ipc_init_ids(&shm_ids(ns)); } /* * Called with shm_ids.rwsem (writer) and the shp structure locked. * Only shm_ids.rwsem remains locked on exit. */ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) { struct shmid_kernel *shp; shp = container_of(ipcp, struct shmid_kernel, shm_perm); WARN_ON(ns != shp->ns); if (shp->shm_nattch) { shp->shm_perm.mode |= SHM_DEST; /* Do not find it any more */ ipc_set_key_private(&shm_ids(ns), &shp->shm_perm); shm_unlock(shp); } else shm_destroy(ns, shp); } #ifdef CONFIG_IPC_NS void shm_exit_ns(struct ipc_namespace *ns) { free_ipcs(ns, &shm_ids(ns), do_shm_rmid); idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr); rhashtable_destroy(&ns->ids[IPC_SHM_IDS].key_ht); } #endif static int __init ipc_ns_init(void) { shm_init_ns(&init_ipc_ns); ns_tree_add(&init_ipc_ns); return 0; } pure_initcall(ipc_ns_init); void __init shm_init(void) { ipc_init_proc_interface("sysvipc/shm", #if BITS_PER_LONG <= 32 " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n", #else " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n", #endif IPC_SHM_IDS, sysvipc_shm_proc_show); } static inline struct shmid_kernel *shm_obtain_object(struct ipc_namespace *ns, int id) { struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&shm_ids(ns), id); if (IS_ERR(ipcp)) return ERR_CAST(ipcp); return container_of(ipcp, struct shmid_kernel, shm_perm); } static inline struct shmid_kernel *shm_obtain_object_check(struct ipc_namespace *ns, int id) { struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&shm_ids(ns), id); if (IS_ERR(ipcp)) return ERR_CAST(ipcp); return container_of(ipcp, struct shmid_kernel, shm_perm); } /* * shm_lock_(check_) routines are called in the paths where the rwsem * is not necessarily held. */ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) { struct kern_ipc_perm *ipcp; rcu_read_lock(); ipcp = ipc_obtain_object_idr(&shm_ids(ns), id); if (IS_ERR(ipcp)) goto err; ipc_lock_object(ipcp); /* * ipc_rmid() may have already freed the ID while ipc_lock_object() * was spinning: here verify that the structure is still valid. * Upon races with RMID, return -EIDRM, thus indicating that * the ID points to a removed identifier. 
*/ if (ipc_valid_object(ipcp)) { /* return a locked ipc object upon success */ return container_of(ipcp, struct shmid_kernel, shm_perm); } ipc_unlock_object(ipcp); ipcp = ERR_PTR(-EIDRM); err: rcu_read_unlock(); /* * Callers of shm_lock() must validate the status of the returned ipc * object pointer and error out as appropriate. */ return ERR_CAST(ipcp); } static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp) { rcu_read_lock(); ipc_lock_object(&ipcp->shm_perm); } static void shm_rcu_free(struct rcu_head *head) { struct kern_ipc_perm *ptr = container_of(head, struct kern_ipc_perm, rcu); struct shmid_kernel *shp = container_of(ptr, struct shmid_kernel, shm_perm); security_shm_free(&shp->shm_perm); kfree(shp); } /* * It has to be called with shp locked. * It must be called before ipc_rmid() */ static inline void shm_clist_rm(struct shmid_kernel *shp) { struct task_struct *creator; /* ensure that shm_creator does not disappear */ rcu_read_lock(); /* * A concurrent exit_shm may do a list_del_init() as well. * Just do nothing if exit_shm already did the work */ if (!list_empty(&shp->shm_clist)) { /* * shp->shm_creator is guaranteed to be valid *only* * if shp->shm_clist is not empty. */ creator = shp->shm_creator; task_lock(creator); /* * list_del_init() is a nop if the entry was already removed * from the list. */ list_del_init(&shp->shm_clist); task_unlock(creator); } rcu_read_unlock(); } static inline void shm_rmid(struct shmid_kernel *s) { shm_clist_rm(s); ipc_rmid(&shm_ids(s->ns), &s->shm_perm); } static int __shm_open(struct shm_file_data *sfd) { struct shmid_kernel *shp; shp = shm_lock(sfd->ns, sfd->id); if (IS_ERR(shp)) return PTR_ERR(shp); if (shp->shm_file != sfd->file) { /* ID was reused */ shm_unlock(shp); return -EINVAL; } shp->shm_atim = ktime_get_real_seconds(); ipc_update_pid(&shp->shm_lprid, task_tgid(current)); shp->shm_nattch++; shm_unlock(shp); return 0; } /* This is called by fork, once for every shm attach. */ static void shm_open(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct shm_file_data *sfd = shm_file_data(file); int err; /* Always call underlying open if present */ if (sfd->vm_ops->open) sfd->vm_ops->open(vma); err = __shm_open(sfd); /* * We raced in the idr lookup or with shm_destroy(). * Either way, the ID is busted. */ WARN_ON_ONCE(err); } /* * shm_destroy - free the struct shmid_kernel * * @ns: namespace * @shp: struct to free * * It has to be called with shp and shm_ids.rwsem (writer) locked, * but returns with shp unlocked and freed. */ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) { struct file *shm_file; shm_file = shp->shm_file; shp->shm_file = NULL; ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; shm_rmid(shp); shm_unlock(shp); if (!is_file_hugepages(shm_file)) shmem_lock(shm_file, 0, shp->mlock_ucounts); fput(shm_file); ipc_update_pid(&shp->shm_cprid, NULL); ipc_update_pid(&shp->shm_lprid, NULL); ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); } /* * shm_may_destroy - identifies whether shm segment should be destroyed now * * Returns true if and only if there are no active users of the segment and * one of the following is true: * * 1) shmctl(id, IPC_RMID, NULL) was called for this shp * * 2) sysctl kernel.shm_rmid_forced is set to 1. */ static bool shm_may_destroy(struct shmid_kernel *shp) { return (shp->shm_nattch == 0) && (shp->ns->shm_rmid_forced || (shp->shm_perm.mode & SHM_DEST)); } /* * remove the attach descriptor vma. * free memory for segment if it is marked destroyed. 
* The descriptor has already been removed from the current->mm->mmap list * and will later be kfree()d. */ static void __shm_close(struct shm_file_data *sfd) { struct shmid_kernel *shp; struct ipc_namespace *ns = sfd->ns; down_write(&shm_ids(ns).rwsem); /* remove from the list of attaches of the shm segment */ shp = shm_lock(ns, sfd->id); /* * We raced in the idr lookup or with shm_destroy(). * Either way, the ID is busted. */ if (WARN_ON_ONCE(IS_ERR(shp))) goto done; /* no-op */ ipc_update_pid(&shp->shm_lprid, task_tgid(current)); shp->shm_dtim = ktime_get_real_seconds(); shp->shm_nattch--; if (shm_may_destroy(shp)) shm_destroy(ns, shp); else shm_unlock(shp); done: up_write(&shm_ids(ns).rwsem); } static void shm_close(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct shm_file_data *sfd = shm_file_data(file); /* Always call underlying close if present */ if (sfd->vm_ops->close) sfd->vm_ops->close(vma); __shm_close(sfd); } /* Called with ns->shm_ids(ns).rwsem locked */ static int shm_try_destroy_orphaned(int id, void *p, void *data) { struct ipc_namespace *ns = data; struct kern_ipc_perm *ipcp = p; struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm); /* * We want to destroy segments without users and with already * exit'ed originating process. * * As shp->* are changed under rwsem, it's safe to skip shp locking. */ if (!list_empty(&shp->shm_clist)) return 0; if (shm_may_destroy(shp)) { shm_lock_by_ptr(shp); shm_destroy(ns, shp); } return 0; } void shm_destroy_orphaned(struct ipc_namespace *ns) { down_write(&shm_ids(ns).rwsem); if (shm_ids(ns).in_use) { rcu_read_lock(); idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns); rcu_read_unlock(); } up_write(&shm_ids(ns).rwsem); } /* Locking assumes this will only be called with task == current */ void exit_shm(struct task_struct *task) { for (;;) { struct shmid_kernel *shp; struct ipc_namespace *ns; task_lock(task); if (list_empty(&task->sysvshm.shm_clist)) { task_unlock(task); break; } shp = list_first_entry(&task->sysvshm.shm_clist, struct shmid_kernel, shm_clist); /* * 1) Get pointer to the ipc namespace. It is worth to say * that this pointer is guaranteed to be valid because * shp lifetime is always shorter than namespace lifetime * in which shp lives. * We taken task_lock it means that shp won't be freed. */ ns = shp->ns; /* * 2) If kernel.shm_rmid_forced is not set then only keep track of * which shmids are orphaned, so that a later set of the sysctl * can clean them up. */ if (!ns->shm_rmid_forced) goto unlink_continue; /* * 3) get a reference to the namespace. * The refcount could be already 0. If it is 0, then * the shm objects will be free by free_ipc_work(). */ ns = get_ipc_ns_not_zero(ns); if (!ns) { unlink_continue: list_del_init(&shp->shm_clist); task_unlock(task); continue; } /* * 4) get a reference to shp. * This cannot fail: shm_clist_rm() is called before * ipc_rmid(), thus the refcount cannot be 0. */ WARN_ON(!ipc_rcu_getref(&shp->shm_perm)); /* * 5) unlink the shm segment from the list of segments * created by current. * This must be done last. After unlinking, * only the refcounts obtained above prevent IPC_RMID * from destroying the segment or the namespace. */ list_del_init(&shp->shm_clist); task_unlock(task); /* * 6) we have all references * Thus lock & if needed destroy shp. 
*/ down_write(&shm_ids(ns).rwsem); shm_lock_by_ptr(shp); /* * rcu_read_lock was implicitly taken in shm_lock_by_ptr, it's * safe to call ipc_rcu_putref here */ ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); if (ipc_valid_object(&shp->shm_perm)) { if (shm_may_destroy(shp)) shm_destroy(ns, shp); else shm_unlock(shp); } else { /* * Someone else deleted the shp from namespace * idr/kht while we have waited. * Just unlock and continue. */ shm_unlock(shp); } up_write(&shm_ids(ns).rwsem); put_ipc_ns(ns); /* paired with get_ipc_ns_not_zero */ } } static vm_fault_t shm_fault(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct shm_file_data *sfd = shm_file_data(file); return sfd->vm_ops->fault(vmf); } static int shm_may_split(struct vm_area_struct *vma, unsigned long addr) { struct file *file = vma->vm_file; struct shm_file_data *sfd = shm_file_data(file); if (sfd->vm_ops->may_split) return sfd->vm_ops->may_split(vma, addr); return 0; } static unsigned long shm_pagesize(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct shm_file_data *sfd = shm_file_data(file); if (sfd->vm_ops->pagesize) return sfd->vm_ops->pagesize(vma); return PAGE_SIZE; } #ifdef CONFIG_NUMA static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) { struct shm_file_data *sfd = shm_file_data(vma->vm_file); int err = 0; if (sfd->vm_ops->set_policy) err = sfd->vm_ops->set_policy(vma, mpol); return err; } static struct mempolicy *shm_get_policy(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx) { struct shm_file_data *sfd = shm_file_data(vma->vm_file); struct mempolicy *mpol = vma->vm_policy; if (sfd->vm_ops->get_policy) mpol = sfd->vm_ops->get_policy(vma, addr, ilx); return mpol; } #endif static int shm_mmap(struct file *file, struct vm_area_struct *vma) { struct shm_file_data *sfd = shm_file_data(file); int ret; /* * In case of remap_file_pages() emulation, the file can represent an * IPC ID that was removed, and possibly even reused by another shm * segment already. Propagate this case as an error to caller. 
*/ ret = __shm_open(sfd); if (ret) return ret; ret = vfs_mmap(sfd->file, vma); if (ret) { __shm_close(sfd); return ret; } sfd->vm_ops = vma->vm_ops; #ifdef CONFIG_MMU WARN_ON(!sfd->vm_ops->fault); #endif vma->vm_ops = &shm_vm_ops; return 0; } static int shm_release(struct inode *ino, struct file *file) { struct shm_file_data *sfd = shm_file_data(file); put_ipc_ns(sfd->ns); fput(sfd->file); shm_file_data(file) = NULL; kfree(sfd); return 0; } static int shm_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct shm_file_data *sfd = shm_file_data(file); if (!sfd->file->f_op->fsync) return -EINVAL; return sfd->file->f_op->fsync(sfd->file, start, end, datasync); } static long shm_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct shm_file_data *sfd = shm_file_data(file); if (!sfd->file->f_op->fallocate) return -EOPNOTSUPP; return sfd->file->f_op->fallocate(file, mode, offset, len); } static unsigned long shm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct shm_file_data *sfd = shm_file_data(file); return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len, pgoff, flags); } static const struct file_operations shm_file_operations = { .mmap = shm_mmap, .fsync = shm_fsync, .release = shm_release, .get_unmapped_area = shm_get_unmapped_area, .llseek = noop_llseek, .fallocate = shm_fallocate, }; /* * shm_file_operations_huge is now identical to shm_file_operations * except for fop_flags */ static const struct file_operations shm_file_operations_huge = { .mmap = shm_mmap, .fsync = shm_fsync, .release = shm_release, .get_unmapped_area = shm_get_unmapped_area, .llseek = noop_llseek, .fallocate = shm_fallocate, .fop_flags = FOP_HUGE_PAGES, }; static const struct vm_operations_struct shm_vm_ops = { .open = shm_open, /* callback for a new vm-area open */ .close = shm_close, /* callback for when the vm-area is released */ .fault = shm_fault, .may_split = shm_may_split, .pagesize = shm_pagesize, #if defined(CONFIG_NUMA) .set_policy = shm_set_policy, .get_policy = shm_get_policy, #endif }; /** * newseg - Create a new shared memory segment * @ns: namespace * @params: ptr to the structure that contains key, size and shmflg * * Called with shm_ids.rwsem held as a writer. 
*/ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) { key_t key = params->key; int shmflg = params->flg; size_t size = params->u.size; int error; struct shmid_kernel *shp; size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; const bool has_no_reserve = shmflg & SHM_NORESERVE; vma_flags_t acctflag = EMPTY_VMA_FLAGS; struct file *file; char name[13]; if (size < SHMMIN || size > ns->shm_ctlmax) return -EINVAL; if (numpages << PAGE_SHIFT < size) return -ENOSPC; if (ns->shm_tot + numpages < ns->shm_tot || ns->shm_tot + numpages > ns->shm_ctlall) return -ENOSPC; shp = kmalloc_obj(*shp, GFP_KERNEL_ACCOUNT); if (unlikely(!shp)) return -ENOMEM; shp->shm_perm.key = key; shp->shm_perm.mode = (shmflg & S_IRWXUGO); shp->mlock_ucounts = NULL; shp->shm_perm.security = NULL; error = security_shm_alloc(&shp->shm_perm); if (error) { kfree(shp); return error; } sprintf(name, "SYSV%08x", key); if (shmflg & SHM_HUGETLB) { struct hstate *hs; size_t hugesize; hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); if (!hs) { error = -EINVAL; goto no_file; } hugesize = ALIGN(size, huge_page_size(hs)); /* hugetlb_file_setup applies strict accounting */ if (has_no_reserve) vma_flags_set(&acctflag, VMA_NORESERVE_BIT); file = hugetlb_file_setup(name, hugesize, acctflag, HUGETLB_SHMFS_INODE, (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); } else { /* * Do not allow no accounting for OVERCOMMIT_NEVER, even * if it's asked for. */ if (has_no_reserve && sysctl_overcommit_memory != OVERCOMMIT_NEVER) vma_flags_set(&acctflag, VMA_NORESERVE_BIT); file = shmem_kernel_file_setup(name, size, acctflag); } error = PTR_ERR(file); if (IS_ERR(file)) goto no_file; shp->shm_cprid = get_pid(task_tgid(current)); shp->shm_lprid = NULL; shp->shm_atim = shp->shm_dtim = 0; shp->shm_ctim = ktime_get_real_seconds(); shp->shm_segsz = size; shp->shm_nattch = 0; shp->shm_file = file; shp->shm_creator = current; /* ipc_addid() locks shp upon success. */ error = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni); if (error < 0) goto no_id; shp->ns = ns; task_lock(current); list_add(&shp->shm_clist, &current->sysvshm.shm_clist); task_unlock(current); /* * shmid gets reported as "inode#" in /proc/pid/maps. * proc-ps tools use this. Changing this will break them. */ file_inode(file)->i_ino = shp->shm_perm.id; ns->shm_tot += numpages; error = shp->shm_perm.id; ipc_unlock_object(&shp->shm_perm); rcu_read_unlock(); return error; no_id: ipc_update_pid(&shp->shm_cprid, NULL); ipc_update_pid(&shp->shm_lprid, NULL); fput(file); ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); return error; no_file: call_rcu(&shp->shm_perm.rcu, shm_rcu_free); return error; } /* * Called with shm_ids.rwsem and ipcp locked.
*/ static int shm_more_checks(struct kern_ipc_perm *ipcp, struct ipc_params *params) { struct shmid_kernel *shp; shp = container_of(ipcp, struct shmid_kernel, shm_perm); if (shp->shm_segsz < params->u.size) return -EINVAL; return 0; } long ksys_shmget(key_t key, size_t size, int shmflg) { struct ipc_namespace *ns; static const struct ipc_ops shm_ops = { .getnew = newseg, .associate = security_shm_associate, .more_checks = shm_more_checks, }; struct ipc_params shm_params; ns = current->nsproxy->ipc_ns; shm_params.key = key; shm_params.flg = shmflg; shm_params.u.size = size; return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params); } SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg) { return ksys_shmget(key, size, shmflg); } static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version) { switch (version) { case IPC_64: return copy_to_user(buf, in, sizeof(*in)); case IPC_OLD: { struct shmid_ds out; memset(&out, 0, sizeof(out)); ipc64_perm_to_ipc_perm(&in->shm_perm, &out.shm_perm); out.shm_segsz = in->shm_segsz; out.shm_atime = in->shm_atime; out.shm_dtime = in->shm_dtime; out.shm_ctime = in->shm_ctime; out.shm_cpid = in->shm_cpid; out.shm_lpid = in->shm_lpid; out.shm_nattch = in->shm_nattch; return copy_to_user(buf, &out, sizeof(out)); } default: return -EINVAL; } } static inline unsigned long copy_shmid_from_user(struct shmid64_ds *out, void __user *buf, int version) { switch (version) { case IPC_64: if (copy_from_user(out, buf, sizeof(*out))) return -EFAULT; return 0; case IPC_OLD: { struct shmid_ds tbuf_old; if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old))) return -EFAULT; out->shm_perm.uid = tbuf_old.shm_perm.uid; out->shm_perm.gid = tbuf_old.shm_perm.gid; out->shm_perm.mode = tbuf_old.shm_perm.mode; return 0; } default: return -EINVAL; } } static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminfo64 *in, int version) { switch (version) { case IPC_64: return copy_to_user(buf, in, sizeof(*in)); case IPC_OLD: { struct shminfo out; if (in->shmmax > INT_MAX) out.shmmax = INT_MAX; else out.shmmax = (int)in->shmmax; out.shmmin = in->shmmin; out.shmmni = in->shmmni; out.shmseg = in->shmseg; out.shmall = in->shmall; return copy_to_user(buf, &out, sizeof(out)); } default: return -EINVAL; } } /* * Calculate and add used RSS and swap pages of a shm. 
* Called with shm_ids.rwsem held as a reader */ static void shm_add_rss_swap(struct shmid_kernel *shp, unsigned long *rss_add, unsigned long *swp_add) { struct inode *inode; inode = file_inode(shp->shm_file); if (is_file_hugepages(shp->shm_file)) { struct address_space *mapping = inode->i_mapping; struct hstate *h = hstate_file(shp->shm_file); *rss_add += pages_per_huge_page(h) * mapping->nrpages; } else { #ifdef CONFIG_SHMEM struct shmem_inode_info *info = SHMEM_I(inode); spin_lock_irq(&info->lock); *rss_add += inode->i_mapping->nrpages; *swp_add += info->swapped; spin_unlock_irq(&info->lock); #else *rss_add += inode->i_mapping->nrpages; #endif } } /* * Called with shm_ids.rwsem held as a reader */ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, unsigned long *swp) { int next_id; int total, in_use; *rss = 0; *swp = 0; in_use = shm_ids(ns).in_use; for (total = 0, next_id = 0; total < in_use; next_id++) { struct kern_ipc_perm *ipc; struct shmid_kernel *shp; ipc = idr_find(&shm_ids(ns).ipcs_idr, next_id); if (ipc == NULL) continue; shp = container_of(ipc, struct shmid_kernel, shm_perm); shm_add_rss_swap(shp, rss, swp); total++; } } /* * This function handles some shmctl commands which require the rwsem * to be held in write mode. * NOTE: no locks must be held, the rwsem is taken inside this function. */ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, struct shmid64_ds *shmid64) { struct kern_ipc_perm *ipcp; struct shmid_kernel *shp; int err; down_write(&shm_ids(ns).rwsem); rcu_read_lock(); ipcp = ipcctl_obtain_check(ns, &shm_ids(ns), shmid, cmd, &shmid64->shm_perm, 0); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); goto out_unlock1; } shp = container_of(ipcp, struct shmid_kernel, shm_perm); err = security_shm_shmctl(&shp->shm_perm, cmd); if (err) goto out_unlock1; switch (cmd) { case IPC_RMID: ipc_lock_object(&shp->shm_perm); /* do_shm_rmid unlocks the ipc object and rcu */ do_shm_rmid(ns, ipcp); goto out_up; case IPC_SET: ipc_lock_object(&shp->shm_perm); err = ipc_update_perm(&shmid64->shm_perm, ipcp); if (err) goto out_unlock0; shp->shm_ctim = ktime_get_real_seconds(); break; default: err = -EINVAL; goto out_unlock1; } out_unlock0: ipc_unlock_object(&shp->shm_perm); out_unlock1: rcu_read_unlock(); out_up: up_write(&shm_ids(ns).rwsem); return err; } static int shmctl_ipc_info(struct ipc_namespace *ns, struct shminfo64 *shminfo) { int err = security_shm_shmctl(NULL, IPC_INFO); if (!err) { memset(shminfo, 0, sizeof(*shminfo)); shminfo->shmmni = shminfo->shmseg = ns->shm_ctlmni; shminfo->shmmax = ns->shm_ctlmax; shminfo->shmall = ns->shm_ctlall; shminfo->shmmin = SHMMIN; down_read(&shm_ids(ns).rwsem); err = ipc_get_maxidx(&shm_ids(ns)); up_read(&shm_ids(ns).rwsem); if (err < 0) err = 0; } return err; } static int shmctl_shm_info(struct ipc_namespace *ns, struct shm_info *shm_info) { int err = security_shm_shmctl(NULL, SHM_INFO); if (!err) { memset(shm_info, 0, sizeof(*shm_info)); down_read(&shm_ids(ns).rwsem); shm_info->used_ids = shm_ids(ns).in_use; shm_get_stat(ns, &shm_info->shm_rss, &shm_info->shm_swp); shm_info->shm_tot = ns->shm_tot; shm_info->swap_attempts = 0; shm_info->swap_successes = 0; err = ipc_get_maxidx(&shm_ids(ns)); up_read(&shm_ids(ns).rwsem); if (err < 0) err = 0; } return err; } static int shmctl_stat(struct ipc_namespace *ns, int shmid, int cmd, struct shmid64_ds *tbuf) { struct shmid_kernel *shp; int err; memset(tbuf, 0, sizeof(*tbuf)); rcu_read_lock(); if (cmd == SHM_STAT || cmd == SHM_STAT_ANY) { shp = shm_obtain_object(ns, shmid); 
if (IS_ERR(shp)) { err = PTR_ERR(shp); goto out_unlock; } } else { /* IPC_STAT */ shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); goto out_unlock; } } /* * Semantically SHM_STAT_ANY ought to be identical to * that functionality provided by the /proc/sysvipc/ * interface. As such, only audit these calls and * do not do traditional S_IRUGO permission checks on * the ipc object. */ if (cmd == SHM_STAT_ANY) audit_ipc_obj(&shp->shm_perm); else { err = -EACCES; if (ipcperms(ns, &shp->shm_perm, S_IRUGO)) goto out_unlock; } err = security_shm_shmctl(&shp->shm_perm, cmd); if (err) goto out_unlock; ipc_lock_object(&shp->shm_perm); if (!ipc_valid_object(&shp->shm_perm)) { ipc_unlock_object(&shp->shm_perm); err = -EIDRM; goto out_unlock; } kernel_to_ipc64_perm(&shp->shm_perm, &tbuf->shm_perm); tbuf->shm_segsz = shp->shm_segsz; tbuf->shm_atime = shp->shm_atim; tbuf->shm_dtime = shp->shm_dtim; tbuf->shm_ctime = shp->shm_ctim; #ifndef CONFIG_64BIT tbuf->shm_atime_high = shp->shm_atim >> 32; tbuf->shm_dtime_high = shp->shm_dtim >> 32; tbuf->shm_ctime_high = shp->shm_ctim >> 32; #endif tbuf->shm_cpid = pid_vnr(shp->shm_cprid); tbuf->shm_lpid = pid_vnr(shp->shm_lprid); tbuf->shm_nattch = shp->shm_nattch; if (cmd == IPC_STAT) { /* * As defined in SUS: * Return 0 on success */ err = 0; } else { /* * SHM_STAT and SHM_STAT_ANY (both Linux specific) * Return the full id, including the sequence number */ err = shp->shm_perm.id; } ipc_unlock_object(&shp->shm_perm); out_unlock: rcu_read_unlock(); return err; } static int shmctl_do_lock(struct ipc_namespace *ns, int shmid, int cmd) { struct shmid_kernel *shp; struct file *shm_file; int err; rcu_read_lock(); shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); goto out_unlock1; } audit_ipc_obj(&(shp->shm_perm)); err = security_shm_shmctl(&shp->shm_perm, cmd); if (err) goto out_unlock1; ipc_lock_object(&shp->shm_perm); /* check if shm_destroy() is tearing down shp */ if (!ipc_valid_object(&shp->shm_perm)) { err = -EIDRM; goto out_unlock0; } if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) { kuid_t euid = current_euid(); if (!uid_eq(euid, shp->shm_perm.uid) && !uid_eq(euid, shp->shm_perm.cuid)) { err = -EPERM; goto out_unlock0; } if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK)) { err = -EPERM; goto out_unlock0; } } shm_file = shp->shm_file; if (is_file_hugepages(shm_file)) goto out_unlock0; if (cmd == SHM_LOCK) { struct ucounts *ucounts = current_ucounts(); err = shmem_lock(shm_file, 1, ucounts); if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) { shp->shm_perm.mode |= SHM_LOCKED; shp->mlock_ucounts = ucounts; } goto out_unlock0; } /* SHM_UNLOCK */ if (!(shp->shm_perm.mode & SHM_LOCKED)) goto out_unlock0; shmem_lock(shm_file, 0, shp->mlock_ucounts); shp->shm_perm.mode &= ~SHM_LOCKED; shp->mlock_ucounts = NULL; get_file(shm_file); ipc_unlock_object(&shp->shm_perm); rcu_read_unlock(); shmem_unlock_mapping(shm_file->f_mapping); fput(shm_file); return err; out_unlock0: ipc_unlock_object(&shp->shm_perm); out_unlock1: rcu_read_unlock(); return err; } static long ksys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf, int version) { int err; struct ipc_namespace *ns; struct shmid64_ds sem64; if (cmd < 0 || shmid < 0) return -EINVAL; ns = current->nsproxy->ipc_ns; switch (cmd) { case IPC_INFO: { struct shminfo64 shminfo; err = shmctl_ipc_info(ns, &shminfo); if (err < 0) return err; if (copy_shminfo_to_user(buf, &shminfo, version)) err = -EFAULT; return err; } case SHM_INFO: { struct shm_info shm_info; err = 
shmctl_shm_info(ns, &shm_info); if (err < 0) return err; if (copy_to_user(buf, &shm_info, sizeof(shm_info))) err = -EFAULT; return err; } case SHM_STAT: case SHM_STAT_ANY: case IPC_STAT: { err = shmctl_stat(ns, shmid, cmd, &sem64); if (err < 0) return err; if (copy_shmid_to_user(buf, &sem64, version)) err = -EFAULT; return err; } case IPC_SET: if (copy_shmid_from_user(&sem64, buf, version)) return -EFAULT; fallthrough; case IPC_RMID: return shmctl_down(ns, shmid, cmd, &sem64); case SHM_LOCK: case SHM_UNLOCK: return shmctl_do_lock(ns, shmid, cmd); default: return -EINVAL; } } SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) { return ksys_shmctl(shmid, cmd, buf, IPC_64); } #ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION long ksys_old_shmctl(int shmid, int cmd, struct shmid_ds __user *buf) { int version = ipc_parse_version(&cmd); return ksys_shmctl(shmid, cmd, buf, version); } SYSCALL_DEFINE3(old_shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) { return ksys_old_shmctl(shmid, cmd, buf); } #endif #ifdef CONFIG_COMPAT struct compat_shmid_ds { struct compat_ipc_perm shm_perm; int shm_segsz; old_time32_t shm_atime; old_time32_t shm_dtime; old_time32_t shm_ctime; compat_ipc_pid_t shm_cpid; compat_ipc_pid_t shm_lpid; unsigned short shm_nattch; unsigned short shm_unused; compat_uptr_t shm_unused2; compat_uptr_t shm_unused3; }; struct compat_shminfo64 { compat_ulong_t shmmax; compat_ulong_t shmmin; compat_ulong_t shmmni; compat_ulong_t shmseg; compat_ulong_t shmall; compat_ulong_t __unused1; compat_ulong_t __unused2; compat_ulong_t __unused3; compat_ulong_t __unused4; }; struct compat_shm_info { compat_int_t used_ids; compat_ulong_t shm_tot, shm_rss, shm_swp; compat_ulong_t swap_attempts, swap_successes; }; static int copy_compat_shminfo_to_user(void __user *buf, struct shminfo64 *in, int version) { if (in->shmmax > INT_MAX) in->shmmax = INT_MAX; if (version == IPC_64) { struct compat_shminfo64 info; memset(&info, 0, sizeof(info)); info.shmmax = in->shmmax; info.shmmin = in->shmmin; info.shmmni = in->shmmni; info.shmseg = in->shmseg; info.shmall = in->shmall; return copy_to_user(buf, &info, sizeof(info)); } else { struct shminfo info; memset(&info, 0, sizeof(info)); info.shmmax = in->shmmax; info.shmmin = in->shmmin; info.shmmni = in->shmmni; info.shmseg = in->shmseg; info.shmall = in->shmall; return copy_to_user(buf, &info, sizeof(info)); } } static int put_compat_shm_info(struct shm_info *ip, struct compat_shm_info __user *uip) { struct compat_shm_info info; memset(&info, 0, sizeof(info)); info.used_ids = ip->used_ids; info.shm_tot = ip->shm_tot; info.shm_rss = ip->shm_rss; info.shm_swp = ip->shm_swp; info.swap_attempts = ip->swap_attempts; info.swap_successes = ip->swap_successes; return copy_to_user(uip, &info, sizeof(info)); } static int copy_compat_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version) { if (version == IPC_64) { struct compat_shmid64_ds v; memset(&v, 0, sizeof(v)); to_compat_ipc64_perm(&v.shm_perm, &in->shm_perm); v.shm_atime = lower_32_bits(in->shm_atime); v.shm_atime_high = upper_32_bits(in->shm_atime); v.shm_dtime = lower_32_bits(in->shm_dtime); v.shm_dtime_high = upper_32_bits(in->shm_dtime); v.shm_ctime = lower_32_bits(in->shm_ctime); v.shm_ctime_high = upper_32_bits(in->shm_ctime); v.shm_segsz = in->shm_segsz; v.shm_nattch = in->shm_nattch; v.shm_cpid = in->shm_cpid; v.shm_lpid = in->shm_lpid; return copy_to_user(buf, &v, sizeof(v)); } else { struct compat_shmid_ds v; memset(&v, 0, sizeof(v)); to_compat_ipc_perm(&v.shm_perm, 
&in->shm_perm); v.shm_perm.key = in->shm_perm.key; v.shm_atime = in->shm_atime; v.shm_dtime = in->shm_dtime; v.shm_ctime = in->shm_ctime; v.shm_segsz = in->shm_segsz; v.shm_nattch = in->shm_nattch; v.shm_cpid = in->shm_cpid; v.shm_lpid = in->shm_lpid; return copy_to_user(buf, &v, sizeof(v)); } } static int copy_compat_shmid_from_user(struct shmid64_ds *out, void __user *buf, int version) { memset(out, 0, sizeof(*out)); if (version == IPC_64) { struct compat_shmid64_ds __user *p = buf; return get_compat_ipc64_perm(&out->shm_perm, &p->shm_perm); } else { struct compat_shmid_ds __user *p = buf; return get_compat_ipc_perm(&out->shm_perm, &p->shm_perm); } } static long compat_ksys_shmctl(int shmid, int cmd, void __user *uptr, int version) { struct ipc_namespace *ns; struct shmid64_ds sem64; int err; ns = current->nsproxy->ipc_ns; if (cmd < 0 || shmid < 0) return -EINVAL; switch (cmd) { case IPC_INFO: { struct shminfo64 shminfo; err = shmctl_ipc_info(ns, &shminfo); if (err < 0) return err; if (copy_compat_shminfo_to_user(uptr, &shminfo, version)) err = -EFAULT; return err; } case SHM_INFO: { struct shm_info shm_info; err = shmctl_shm_info(ns, &shm_info); if (err < 0) return err; if (put_compat_shm_info(&shm_info, uptr)) err = -EFAULT; return err; } case IPC_STAT: case SHM_STAT_ANY: case SHM_STAT: err = shmctl_stat(ns, shmid, cmd, &sem64); if (err < 0) return err; if (copy_compat_shmid_to_user(uptr, &sem64, version)) err = -EFAULT; return err; case IPC_SET: if (copy_compat_shmid_from_user(&sem64, uptr, version)) return -EFAULT; fallthrough; case IPC_RMID: return shmctl_down(ns, shmid, cmd, &sem64); case SHM_LOCK: case SHM_UNLOCK: return shmctl_do_lock(ns, shmid, cmd); default: return -EINVAL; } return err; } COMPAT_SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, void __user *, uptr) { return compat_ksys_shmctl(shmid, cmd, uptr, IPC_64); } #ifdef CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION long compat_ksys_old_shmctl(int shmid, int cmd, void __user *uptr) { int version = compat_ipc_parse_version(&cmd); return compat_ksys_shmctl(shmid, cmd, uptr, version); } COMPAT_SYSCALL_DEFINE3(old_shmctl, int, shmid, int, cmd, void __user *, uptr) { return compat_ksys_old_shmctl(shmid, cmd, uptr); } #endif #endif /* * Fix shmaddr, allocate descriptor, map shm, add attach descriptor to lists. * * NOTE! Despite the name, this is NOT a direct system call entrypoint. The * "raddr" thing points to kernel space, and there has to be a wrapper around * this. */ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, unsigned long shmlba) { struct shmid_kernel *shp; unsigned long addr = (unsigned long)shmaddr; unsigned long size; struct file *file, *base; int err; unsigned long flags = MAP_SHARED; unsigned long prot; int acc_mode; struct ipc_namespace *ns; struct shm_file_data *sfd; int f_flags; unsigned long populate = 0; err = -EINVAL; if (shmid < 0) goto out; if (addr) { if (addr & (shmlba - 1)) { if (shmflg & SHM_RND) { addr &= ~(shmlba - 1); /* round down */ /* * Ensure that the round-down is non-nil * when remapping. This can happen for * cases when addr < shmlba. 
*/ if (!addr && (shmflg & SHM_REMAP)) goto out; } else #ifndef __ARCH_FORCE_SHMLBA if (addr & ~PAGE_MASK) #endif goto out; } flags |= MAP_FIXED; } else if ((shmflg & SHM_REMAP)) goto out; if (shmflg & SHM_RDONLY) { prot = PROT_READ; acc_mode = S_IRUGO; f_flags = O_RDONLY; } else { prot = PROT_READ | PROT_WRITE; acc_mode = S_IRUGO | S_IWUGO; f_flags = O_RDWR; } if (shmflg & SHM_EXEC) { prot |= PROT_EXEC; acc_mode |= S_IXUGO; } /* * We cannot rely on the fs check since SYSV IPC does have an * additional creator id... */ ns = current->nsproxy->ipc_ns; rcu_read_lock(); shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); goto out_unlock; } err = -EACCES; if (ipcperms(ns, &shp->shm_perm, acc_mode)) goto out_unlock; err = security_shm_shmat(&shp->shm_perm, shmaddr, shmflg); if (err) goto out_unlock; ipc_lock_object(&shp->shm_perm); /* check if shm_destroy() is tearing down shp */ if (!ipc_valid_object(&shp->shm_perm)) { ipc_unlock_object(&shp->shm_perm); err = -EIDRM; goto out_unlock; } /* * We need to take a reference to the real shm file to prevent the * pointer from becoming stale in cases where the lifetime of the outer * file extends beyond that of the shm segment. It's not usually * possible, but it can happen during remap_file_pages() emulation as * that unmaps the memory, then does ->mmap() via file reference only. * We'll deny the ->mmap() if the shm segment was since removed, but to * detect shm ID reuse we need to compare the file pointers. */ base = get_file(shp->shm_file); shp->shm_nattch++; size = i_size_read(file_inode(base)); ipc_unlock_object(&shp->shm_perm); rcu_read_unlock(); err = -ENOMEM; sfd = kzalloc_obj(*sfd); if (!sfd) { fput(base); goto out_nattch; } file = alloc_file_clone(base, f_flags, is_file_hugepages(base) ? &shm_file_operations_huge : &shm_file_operations); err = PTR_ERR(file); if (IS_ERR(file)) { kfree(sfd); fput(base); goto out_nattch; } sfd->id = shp->shm_perm.id; sfd->ns = get_ipc_ns(ns); sfd->file = base; sfd->vm_ops = NULL; file->private_data = sfd; err = security_mmap_file(file, prot, flags); if (err) goto out_fput; if (mmap_write_lock_killable(current->mm)) { err = -EINTR; goto out_fput; } if (addr && !(shmflg & SHM_REMAP)) { err = -EINVAL; if (addr + size < addr) goto invalid; if (find_vma_intersection(current->mm, addr, addr + size)) goto invalid; } addr = do_mmap(file, addr, size, prot, flags, 0, 0, &populate, NULL); *raddr = addr; err = 0; if (IS_ERR_VALUE(addr)) err = (long)addr; invalid: mmap_write_unlock(current->mm); if (populate) mm_populate(addr, populate); out_fput: fput(file); out_nattch: down_write(&shm_ids(ns).rwsem); shp = shm_lock(ns, shmid); shp->shm_nattch--; if (shm_may_destroy(shp)) shm_destroy(ns, shp); else shm_unlock(shp); up_write(&shm_ids(ns).rwsem); return err; out_unlock: rcu_read_unlock(); out: return err; } SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg) { unsigned long ret; long err; err = do_shmat(shmid, shmaddr, shmflg, &ret, SHMLBA); if (err) return err; force_successful_syscall_return(); return (long)ret; } #ifdef CONFIG_COMPAT #ifndef COMPAT_SHMLBA #define COMPAT_SHMLBA SHMLBA #endif COMPAT_SYSCALL_DEFINE3(shmat, int, shmid, compat_uptr_t, shmaddr, int, shmflg) { unsigned long ret; long err; err = do_shmat(shmid, compat_ptr(shmaddr), shmflg, &ret, COMPAT_SHMLBA); if (err) return err; force_successful_syscall_return(); return (long)ret; } #endif /* * detach and kill segment if marked destroyed. * The work is done in shm_close. 
*/ long ksys_shmdt(char __user *shmaddr) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long addr = (unsigned long)shmaddr; int retval = -EINVAL; #ifdef CONFIG_MMU loff_t size = 0; struct file *file; VMA_ITERATOR(vmi, mm, addr); #endif if (addr & ~PAGE_MASK) return retval; if (mmap_write_lock_killable(mm)) return -EINTR; /* * This function tries to be smart and unmap shm segments that * were modified by partial mlock or munmap calls: * - It first determines the size of the shm segment that should be * unmapped: It searches for a vma that is backed by shm and that * started at address shmaddr. It records it's size and then unmaps * it. * - Then it unmaps all shm vmas that started at shmaddr and that * are within the initially determined size and that are from the * same shm segment from which we determined the size. * Errors from do_munmap are ignored: the function only fails if * it's called with invalid parameters or if it's called to unmap * a part of a vma. Both calls in this function are for full vmas, * the parameters are directly copied from the vma itself and always * valid - therefore do_munmap cannot fail. (famous last words?) */ /* * If it had been mremap()'d, the starting address would not * match the usual checks anyway. So assume all vma's are * above the starting address given. */ #ifdef CONFIG_MMU for_each_vma(vmi, vma) { /* * Check if the starting address would match, i.e. it's * a fragment created by mprotect() and/or munmap(), or it * otherwise it starts at this address with no hassles. */ if ((vma->vm_ops == &shm_vm_ops) && (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) { /* * Record the file of the shm segment being * unmapped. With mremap(), someone could place * page from another segment but with equal offsets * in the range we are unmapping. */ file = vma->vm_file; size = i_size_read(file_inode(vma->vm_file)); do_vmi_align_munmap(&vmi, vma, mm, vma->vm_start, vma->vm_end, NULL, false); /* * We discovered the size of the shm segment, so * break out of here and fall through to the next * loop that uses the size information to stop * searching for matching vma's. */ retval = 0; vma = vma_next(&vmi); break; } } /* * We need look no further than the maximum address a fragment * could possibly have landed at. Also cast things to loff_t to * prevent overflows and make comparisons vs. equal-width types. 
*/ size = PAGE_ALIGN(size); while (vma && (loff_t)(vma->vm_end - addr) <= size) { /* finding a matching vma now does not alter retval */ if ((vma->vm_ops == &shm_vm_ops) && ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) && (vma->vm_file == file)) { do_vmi_align_munmap(&vmi, vma, mm, vma->vm_start, vma->vm_end, NULL, false); } vma = vma_next(&vmi); } #else /* CONFIG_MMU */ vma = vma_lookup(mm, addr); /* under NOMMU conditions, the exact address to be destroyed must be * given */ if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); retval = 0; } #endif mmap_write_unlock(mm); return retval; } SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) { return ksys_shmdt(shmaddr); } #ifdef CONFIG_PROC_FS static int sysvipc_shm_proc_show(struct seq_file *s, void *it) { struct pid_namespace *pid_ns = ipc_seq_pid_ns(s); struct user_namespace *user_ns = seq_user_ns(s); struct kern_ipc_perm *ipcp = it; struct shmid_kernel *shp; unsigned long rss = 0, swp = 0; shp = container_of(ipcp, struct shmid_kernel, shm_perm); shm_add_rss_swap(shp, &rss, &swp); #if BITS_PER_LONG <= 32 #define SIZE_SPEC "%10lu" #else #define SIZE_SPEC "%21lu" #endif seq_printf(s, "%10d %10d %4o " SIZE_SPEC " %5u %5u " "%5lu %5u %5u %5u %5u %10llu %10llu %10llu " SIZE_SPEC " " SIZE_SPEC "\n", shp->shm_perm.key, shp->shm_perm.id, shp->shm_perm.mode, shp->shm_segsz, pid_nr_ns(shp->shm_cprid, pid_ns), pid_nr_ns(shp->shm_lprid, pid_ns), shp->shm_nattch, from_kuid_munged(user_ns, shp->shm_perm.uid), from_kgid_munged(user_ns, shp->shm_perm.gid), from_kuid_munged(user_ns, shp->shm_perm.cuid), from_kgid_munged(user_ns, shp->shm_perm.cgid), shp->shm_atim, shp->shm_dtim, shp->shm_ctim, rss * PAGE_SIZE, swp * PAGE_SIZE); return 0; } #endif |
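/*
 * Illustrative userspace sketch (not part of the kernel sources above): a
 * minimal sequence that exercises the syscall paths implemented in this
 * file -- shmget() -> newseg(), shmat() -> do_shmat(), shmctl(IPC_STAT) ->
 * shmctl_stat(), shmdt() -> ksys_shmdt() and shmctl(IPC_RMID) ->
 * shmctl_down(). The key, size and mode values are arbitrary and error
 * handling is abbreviated.
 */
#include <stdio.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int shm_demo(void)
{
	struct shmid_ds ds;
	void *p;
	int id;

	/* Create (or look up) a 1 MiB segment; 0600 ends up in shm_perm.mode. */
	id = shmget((key_t)0x1234, 1 << 20, IPC_CREAT | 0600);
	if (id < 0)
		return -1;

	/* Attach at a kernel-chosen address (addr == NULL in do_shmat()). */
	p = shmat(id, NULL, 0);
	if (p == (void *)-1)
		return -1;

	memset(p, 0, 4096);			/* touch the mapping */

	if (shmctl(id, IPC_STAT, &ds) == 0)	/* filled in by shmctl_stat() */
		printf("segsz=%zu nattch=%lu\n",
		       (size_t)ds.shm_segsz, (unsigned long)ds.shm_nattch);

	shmdt(p);				/* detach via ksys_shmdt() */
	return shmctl(id, IPC_RMID, NULL);	/* mark segment for destruction */
}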
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_UACCESS_H__ #define __LINUX_UACCESS_H__ #include <linux/cleanup.h> #include
<linux/fault-inject-usercopy.h> #include <linux/instrumented.h> #include <linux/minmax.h> #include <linux/nospec.h> #include <linux/sched.h> #include <linux/ucopysize.h> #include <asm/uaccess.h> /* * Architectures that support memory tagging (assigning tags to memory regions, * embedding these tags into addresses that point to these memory regions, and * checking that the memory and the pointer tags match on memory accesses) * redefine this macro to strip tags from pointers. * * Passing down mm_struct allows to define untagging rules on per-process * basis. * * It's defined as noop for architectures that don't support memory tagging. */ #ifndef untagged_addr #define untagged_addr(addr) (addr) #endif #ifndef untagged_addr_remote #define untagged_addr_remote(mm, addr) ({ \ mmap_assert_locked(mm); \ untagged_addr(addr); \ }) #endif #ifdef masked_user_access_begin #define can_do_masked_user_access() 1 # ifndef masked_user_write_access_begin # define masked_user_write_access_begin masked_user_access_begin # endif # ifndef masked_user_read_access_begin # define masked_user_read_access_begin masked_user_access_begin #endif #else #define can_do_masked_user_access() 0 #define masked_user_access_begin(src) NULL #define masked_user_read_access_begin(src) NULL #define masked_user_write_access_begin(src) NULL #define mask_user_address(src) (src) #endif /* * Architectures should provide two primitives (raw_copy_{to,from}_user()) * and get rid of their private instances of copy_{to,from}_user() and * __copy_{to,from}_user{,_inatomic}(). * * raw_copy_{to,from}_user(to, from, size) should copy up to size bytes and * return the amount left to copy. They should assume that access_ok() has * already been checked (and succeeded); they should *not* zero-pad anything. * No KASAN or object size checks either - those belong here. * * Both of these functions should attempt to copy size bytes starting at from * into the area starting at to. They must not fetch or store anything * outside of those areas. Return value must be between 0 (everything * copied successfully) and size (nothing copied). * * If raw_copy_{to,from}_user(to, from, size) returns N, size - N bytes starting * at to must become equal to the bytes fetched from the corresponding area * starting at from. All data past to + size - N must be left unmodified. * * If copying succeeds, the return value must be 0. If some data cannot be * fetched, it is permitted to copy less than had been fetched; the only * hard requirement is that not storing anything at all (i.e. returning size) * should happen only when nothing could be copied. In other words, you don't * have to squeeze as much as possible - it is allowed, but not necessary. * * For raw_copy_from_user() to always points to kernel memory and no faults * on store should happen. Interpretation of from is affected by set_fs(). * For raw_copy_to_user() it's the other way round. * * Both can be inlined - it's up to architectures whether it wants to bother * with that. They should not be used directly; they are used to implement * the 6 functions (copy_{to,from}_user(), __copy_{to,from}_user_inatomic()) * that are used instead. Out of those, __... ones are inlined. Plain * copy_{to,from}_user() might or might not be inlined. If you want them * inlined, have asm/uaccess.h define INLINE_COPY_{TO,FROM}_USER. * * NOTE: only copy_from_user() zero-pads the destination in case of short copy. 
* Neither __copy_from_user() nor __copy_from_user_inatomic() zero anything * at all; their callers absolutely must check the return value. * * Biarch ones should also provide raw_copy_in_user() - similar to the above, * but both source and destination are __user pointers (affected by set_fs() * as usual) and both source and destination can trigger faults. */ static __always_inline __must_check unsigned long __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) { unsigned long res; instrument_copy_from_user_before(to, from, n); check_object_size(to, n, false); res = raw_copy_from_user(to, from, n); instrument_copy_from_user_after(to, from, n, res); return res; } static __always_inline __must_check unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res; might_fault(); instrument_copy_from_user_before(to, from, n); if (should_fail_usercopy()) return n; check_object_size(to, n, false); res = raw_copy_from_user(to, from, n); instrument_copy_from_user_after(to, from, n, res); return res; } /** * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking. * @to: Destination address, in user space. * @from: Source address, in kernel space. * @n: Number of bytes to copy. * * Context: User context only. * * Copy data from kernel space to user space. Caller must check * the specified block with access_ok() before calling this function. * The caller should also make sure he pins the user space address * so that we don't result in page fault and sleep. */ static __always_inline __must_check unsigned long __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { if (should_fail_usercopy()) return n; instrument_copy_to_user(to, from, n); check_object_size(from, n, true); return raw_copy_to_user(to, from, n); } static __always_inline __must_check unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); if (should_fail_usercopy()) return n; instrument_copy_to_user(to, from, n); check_object_size(from, n, true); return raw_copy_to_user(to, from, n); } /* * Architectures that #define INLINE_COPY_TO_USER use this function * directly in the normal copy_to/from_user(), the other ones go * through an extern _copy_to/from_user(), which expands the same code * here. 
*/ static inline __must_check unsigned long _inline_copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res = n; might_fault(); if (should_fail_usercopy()) goto fail; if (can_do_masked_user_access()) from = mask_user_address(from); else { if (!access_ok(from, n)) goto fail; /* * Ensure that bad access_ok() speculation will not * lead to nasty side effects *after* the copy is * finished: */ barrier_nospec(); } instrument_copy_from_user_before(to, from, n); res = raw_copy_from_user(to, from, n); instrument_copy_from_user_after(to, from, n, res); if (likely(!res)) return 0; fail: memset(to + (n - res), 0, res); return res; } #ifndef INLINE_COPY_FROM_USER extern __must_check unsigned long _copy_from_user(void *, const void __user *, unsigned long); #endif static inline __must_check unsigned long _inline_copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); if (should_fail_usercopy()) return n; if (access_ok(to, n)) { instrument_copy_to_user(to, from, n); n = raw_copy_to_user(to, from, n); } return n; } #ifndef INLINE_COPY_TO_USER extern __must_check unsigned long _copy_to_user(void __user *, const void *, unsigned long); #endif static __always_inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { if (!check_copy_size(to, n, false)) return n; #ifdef INLINE_COPY_FROM_USER return _inline_copy_from_user(to, from, n); #else return _copy_from_user(to, from, n); #endif } static __always_inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { if (!check_copy_size(from, n, true)) return n; #ifdef INLINE_COPY_TO_USER return _inline_copy_to_user(to, from, n); #else return _copy_to_user(to, from, n); #endif } #ifndef copy_mc_to_kernel /* * Without arch opt-in this generic copy_mc_to_kernel() will not handle * #MC (or arch equivalent) during source read. */ static inline unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, size_t cnt) { memcpy(dst, src, cnt); return 0; } #endif static __always_inline void pagefault_disabled_inc(void) { current->pagefault_disabled++; } static __always_inline void pagefault_disabled_dec(void) { current->pagefault_disabled--; } /* * These routines enable/disable the pagefault handler. If disabled, it will * not take any locks and go straight to the fixup table. * * User access methods will not sleep when called from a pagefault_disabled() * environment. */ static inline void pagefault_disable(void) { pagefault_disabled_inc(); /* * make sure to have issued the store before a pagefault * can hit. */ barrier(); } static inline void pagefault_enable(void) { /* * make sure to issue those last loads/stores before enabling * the pagefault handler again. */ barrier(); pagefault_disabled_dec(); } /* * Is the pagefault handler disabled? If so, user access methods will not sleep. */ static inline bool pagefault_disabled(void) { return current->pagefault_disabled != 0; } /* * The pagefault handler is in general disabled by pagefault_disable() or * when in irq context (via in_atomic()). * * This function should only be used by the fault handlers. Other users should * stick to pagefault_disabled(). * Please NEVER use preempt_disable() to disable the fault handler. With * !CONFIG_PREEMPT_COUNT, this is like a NOP. So the handler won't be disabled. * in_atomic() will report different values based on !CONFIG_PREEMPT_COUNT. 
*/ #define faulthandler_disabled() (pagefault_disabled() || in_atomic()) DEFINE_LOCK_GUARD_0(pagefault, pagefault_disable(), pagefault_enable()) #ifndef CONFIG_ARCH_HAS_SUBPAGE_FAULTS /** * probe_subpage_writeable: probe the user range for write faults at sub-page * granularity (e.g. arm64 MTE) * @uaddr: start of address range * @size: size of address range * * Returns 0 on success, the number of bytes not probed on fault. * * It is expected that the caller checked for the write permission of each * page in the range either by put_user() or GUP. The architecture port can * implement a more efficient get_user() probing if the same sub-page faults * are triggered by either a read or a write. */ static inline size_t probe_subpage_writeable(char __user *uaddr, size_t size) { return 0; } #endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */ #ifndef ARCH_HAS_NONTEMPORAL_UACCESS static inline __must_check unsigned long copy_from_user_inatomic_nontemporal(void *to, const void __user *from, unsigned long n) { if (can_do_masked_user_access()) from = mask_user_address(from); else if (!access_ok(from, n)) return n; return __copy_from_user_inatomic(to, from, n); } #endif /* ARCH_HAS_NONTEMPORAL_UACCESS */ extern __must_check int check_zeroed_user(const void __user *from, size_t size); /** * copy_struct_from_user: copy a struct from userspace * @dst: Destination address, in kernel space. This buffer must be @ksize * bytes long. * @ksize: Size of @dst struct. * @src: Source address, in userspace. * @usize: (Alleged) size of @src struct. * * Copies a struct from userspace to kernel space, in a way that guarantees * backwards-compatibility for struct syscall arguments (as long as future * struct extensions are made such that all new fields are *appended* to the * old struct, and zeroed-out new fields have the same meaning as the old * struct). * * @ksize is just sizeof(*dst), and @usize should've been passed by userspace. * The recommended usage is something like the following: * * SYSCALL_DEFINE2(foobar, const struct foo __user *, uarg, size_t, usize) * { * int err; * struct foo karg = {}; * * if (usize > PAGE_SIZE) * return -E2BIG; * if (usize < FOO_SIZE_VER0) * return -EINVAL; * * err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize); * if (err) * return err; * * // ... * } * * There are three cases to consider: * * If @usize == @ksize, then it's copied verbatim. * * If @usize < @ksize, then the userspace has passed an old struct to a * newer kernel. The rest of the trailing bytes in @dst (@ksize - @usize) * are to be zero-filled. * * If @usize > @ksize, then the userspace has passed a new struct to an * older kernel. The trailing bytes unknown to the kernel (@usize - @ksize) * are checked to ensure they are zeroed, otherwise -E2BIG is returned. * * Returns (in all cases, some data may have been copied): * * -E2BIG: (@usize > @ksize) and there are non-zero trailing bytes in @src. * * -EFAULT: access to userspace failed. */ static __always_inline __must_check int copy_struct_from_user(void *dst, size_t ksize, const void __user *src, size_t usize) { size_t size = min(ksize, usize); size_t rest = max(ksize, usize) - size; /* Double check if ksize is larger than a known object size. */ if (WARN_ON_ONCE(ksize > __builtin_object_size(dst, 1))) return -E2BIG; /* Deal with trailing bytes. */ if (usize < ksize) { memset(dst + size, 0, rest); } else if (usize > ksize) { int ret = check_zeroed_user(src + size, rest); if (ret <= 0) return ret ?: -E2BIG; } /* Copy the interoperable parts of the struct. 
*/ if (copy_from_user(dst, src, size)) return -EFAULT; return 0; } /** * copy_struct_to_user: copy a struct to userspace * @dst: Destination address, in userspace. This buffer must be @ksize * bytes long. * @usize: (Alleged) size of @dst struct. * @src: Source address, in kernel space. * @ksize: Size of @src struct. * @ignored_trailing: Set to %true if there was a non-zero byte in @src that * userspace cannot see because they are using an smaller struct. * * Copies a struct from kernel space to userspace, in a way that guarantees * backwards-compatibility for struct syscall arguments (as long as future * struct extensions are made such that all new fields are *appended* to the * old struct, and zeroed-out new fields have the same meaning as the old * struct). * * Some syscalls may wish to make sure that userspace knows about everything in * the struct, and if there is a non-zero value that userspce doesn't know * about, they want to return an error (such as -EMSGSIZE) or have some other * fallback (such as adding a "you're missing some information" flag). If * @ignored_trailing is non-%NULL, it will be set to %true if there was a * non-zero byte that could not be copied to userspace (ie. was past @usize). * * While unconditionally returning an error in this case is the simplest * solution, for maximum backward compatibility you should try to only return * -EMSGSIZE if the user explicitly requested the data that couldn't be copied. * Note that structure sizes can change due to header changes and simple * recompilations without code changes(!), so if you care about * @ignored_trailing you probably want to make sure that any new field data is * associated with a flag. Otherwise you might assume that a program knows * about data it does not. * * @ksize is just sizeof(*src), and @usize should've been passed by userspace. * The recommended usage is something like the following: * * SYSCALL_DEFINE2(foobar, struct foo __user *, uarg, size_t, usize) * { * int err; * bool ignored_trailing; * struct foo karg = {}; * * if (usize > PAGE_SIZE) * return -E2BIG; * if (usize < FOO_SIZE_VER0) * return -EINVAL; * * // ... modify karg somehow ... * * err = copy_struct_to_user(uarg, usize, &karg, sizeof(karg), * &ignored_trailing); * if (err) * return err; * if (ignored_trailing) * return -EMSGSIZE: * * // ... * } * * There are three cases to consider: * * If @usize == @ksize, then it's copied verbatim. * * If @usize < @ksize, then the kernel is trying to pass userspace a newer * struct than it supports. Thus we only copy the interoperable portions * (@usize) and ignore the rest (but @ignored_trailing is set to %true if * any of the trailing (@ksize - @usize) bytes are non-zero). * * If @usize > @ksize, then the kernel is trying to pass userspace an older * struct than userspace supports. In order to make sure the * unknown-to-the-kernel fields don't contain garbage values, we zero the * trailing (@usize - @ksize) bytes. * * Returns (in all cases, some data may have been copied): * * -EFAULT: access to userspace failed. */ static __always_inline __must_check int copy_struct_to_user(void __user *dst, size_t usize, const void *src, size_t ksize, bool *ignored_trailing) { size_t size = min(ksize, usize); size_t rest = max(ksize, usize) - size; /* Double check if ksize is larger than a known object size. */ if (WARN_ON_ONCE(ksize > __builtin_object_size(src, 1))) return -E2BIG; /* Deal with trailing bytes. 
*/ if (usize > ksize) { if (clear_user(dst + size, rest)) return -EFAULT; } if (ignored_trailing) *ignored_trailing = ksize < usize && memchr_inv(src + size, 0, rest) != NULL; /* Copy the interoperable parts of the struct. */ if (copy_to_user(dst, src, size)) return -EFAULT; return 0; } bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size); long copy_from_kernel_nofault(void *dst, const void *src, size_t size); long notrace copy_to_kernel_nofault(void *dst, const void *src, size_t size); long copy_from_user_nofault(void *dst, const void __user *src, size_t size); long notrace copy_to_user_nofault(void __user *dst, const void *src, size_t size); long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count); long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, long count); long strnlen_user_nofault(const void __user *unsafe_addr, long count); #ifdef arch_get_kernel_nofault /* * Wrap the architecture implementation so that @label can be outside of a * cleanup() scope. A regular C goto works correctly, but ASM goto does * not. Clang rejects such an attempt, but GCC silently emits buggy code. */ #define __get_kernel_nofault(dst, src, type, label) \ do { \ __label__ local_label; \ arch_get_kernel_nofault(dst, src, type, local_label); \ if (0) { \ local_label: \ goto label; \ } \ } while (0) #define __put_kernel_nofault(dst, src, type, label) \ do { \ __label__ local_label; \ arch_put_kernel_nofault(dst, src, type, local_label); \ if (0) { \ local_label: \ goto label; \ } \ } while (0) #elif !defined(__get_kernel_nofault) /* arch_get_kernel_nofault */ #define __get_kernel_nofault(dst, src, type, label) \ do { \ type __user *p = (type __force __user *)(src); \ type data; \ if (__get_user(data, p)) \ goto label; \ *(type *)dst = data; \ } while (0) #define __put_kernel_nofault(dst, src, type, label) \ do { \ type __user *p = (type __force __user *)(dst); \ type data = *(type *)src; \ if (__put_user(data, p)) \ goto label; \ } while (0) #endif /* !__get_kernel_nofault */ /** * get_kernel_nofault(): safely attempt to read from a location * @val: read into this variable * @ptr: address to read from * * Returns 0 on success, or -EFAULT. */ #define get_kernel_nofault(val, ptr) ({ \ const typeof(val) *__gk_ptr = (ptr); \ copy_from_kernel_nofault(&(val), __gk_ptr, sizeof(val));\ }) #ifdef user_access_begin #ifdef arch_unsafe_get_user /* * Wrap the architecture implementation so that @label can be outside of a * cleanup() scope. A regular C goto works correctly, but ASM goto does * not. Clang rejects such an attempt, but GCC silently emits buggy code. * * Some architectures use internal local labels already, but this extra * indirection here is harmless because the compiler optimizes it out * completely in any case. This construct just ensures that the ASM GOTO * target is always in the local scope. The C goto 'label' works correctly * when leaving a cleanup() scope. 
*/ #define unsafe_get_user(x, ptr, label) \ do { \ __label__ local_label; \ arch_unsafe_get_user(x, ptr, local_label); \ if (0) { \ local_label: \ goto label; \ } \ } while (0) #define unsafe_put_user(x, ptr, label) \ do { \ __label__ local_label; \ arch_unsafe_put_user(x, ptr, local_label); \ if (0) { \ local_label: \ goto label; \ } \ } while (0) #endif /* arch_unsafe_get_user */ #else /* user_access_begin */ #define user_access_begin(ptr,len) access_ok(ptr, len) #define user_access_end() do { } while (0) #define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0) #define unsafe_get_user(x,p,e) unsafe_op_wrap(__get_user(x,p),e) #define unsafe_put_user(x,p,e) unsafe_op_wrap(__put_user(x,p),e) #define unsafe_copy_to_user(d,s,l,e) unsafe_op_wrap(__copy_to_user(d,s,l),e) #define unsafe_copy_from_user(d,s,l,e) unsafe_op_wrap(__copy_from_user(d,s,l),e) static inline unsigned long user_access_save(void) { return 0UL; } static inline void user_access_restore(unsigned long flags) { } #endif /* !user_access_begin */ #ifndef user_write_access_begin #define user_write_access_begin user_access_begin #define user_write_access_end user_access_end #endif #ifndef user_read_access_begin #define user_read_access_begin user_access_begin #define user_read_access_end user_access_end #endif /* Define RW variant so the below _mode macro expansion works */ #define masked_user_rw_access_begin(u) masked_user_access_begin(u) #define user_rw_access_begin(u, s) user_access_begin(u, s) /* Scoped user access */ /* Cleanup wrapper functions */ static __always_inline void __scoped_user_read_access_end(const void *p) { user_read_access_end(); }; static __always_inline void __scoped_user_write_access_end(const void *p) { user_write_access_end(); }; static __always_inline void __scoped_user_rw_access_end(const void *p) { user_access_end(); }; /** * __scoped_user_access_begin - Start a scoped user access * @mode: The mode of the access class (read, write, rw) * @uptr: The pointer to access user space memory * @size: Size of the access * @elbl: Error label to goto when the access region is rejected * * Internal helper for __scoped_user_access(). Don't use directly. */ #define __scoped_user_access_begin(mode, uptr, size, elbl) \ ({ \ typeof(uptr) __retptr; \ \ if (can_do_masked_user_access()) { \ __retptr = masked_user_##mode##_access_begin(uptr); \ } else { \ __retptr = uptr; \ if (!user_##mode##_access_begin(uptr, size)) \ goto elbl; \ } \ __retptr; \ }) /** * __scoped_user_access - Open a scope for user access * @mode: The mode of the access class (read, write, rw) * @uptr: The pointer to access user space memory * @size: Size of the access * @elbl: Error label to goto when the access region is rejected. It * must be placed outside the scope * * If the user access function inside the scope requires a fault label, it * can use @elbl or a different label outside the scope, which requires * that user access which is implemented with ASM GOTO has been properly * wrapped. See unsafe_get_user() for reference. * * scoped_user_rw_access(ptr, efault) { * unsafe_get_user(rval, &ptr->rval, efault); * unsafe_put_user(wval, &ptr->wval, efault); * } * return 0; * efault: * return -EFAULT; * * The scope is internally implemented as a autoterminating nested for() * loop, which can be left with 'return', 'break' and 'goto' at any * point. * * When the scope is left user_##@_mode##_access_end() is automatically * invoked. 
* * When the architecture supports masked user access and the access region * which is determined by @uptr and @size is not a valid user space * address, i.e. < TASK_SIZE, the scope sets the pointer to a faulting user * space address and does not terminate early. This optimizes for the good * case and lets the performance uncritical bad case go through the fault. * * The eventual modification of the pointer is limited to the scope. * Outside of the scope the original pointer value is unmodified, so that * the original pointer value is available for diagnostic purposes in an * out of scope fault path. * * Nesting scoped user access into a user access scope is invalid and fails * the build. Nesting into other guards, e.g. pagefault is safe. * * The masked variant does not check the size of the access and relies on a * mapping hole (e.g. guard page) to catch an out of range pointer, the * first access to user memory inside the scope has to be within * @uptr ... @uptr + PAGE_SIZE - 1 * * Don't use directly. Use scoped_masked_user_$MODE_access() instead. */ #define __scoped_user_access(mode, uptr, size, elbl) \ for (bool done = false; !done; done = true) \ for (auto _tmpptr = __scoped_user_access_begin(mode, uptr, size, elbl); \ !done; done = true) \ /* Force modified pointer usage within the scope */ \ for (const auto uptr __cleanup(__scoped_user_##mode##_access_end) = \ _tmpptr; !done; done = true) /** * scoped_user_read_access_size - Start a scoped user read access with given size * @usrc: Pointer to the user space address to read from * @size: Size of the access starting from @usrc * @elbl: Error label to goto when the access region is rejected * * For further information see __scoped_user_access() above. */ #define scoped_user_read_access_size(usrc, size, elbl) \ __scoped_user_access(read, usrc, size, elbl) /** * scoped_user_read_access - Start a scoped user read access * @usrc: Pointer to the user space address to read from * @elbl: Error label to goto when the access region is rejected * * The size of the access starting from @usrc is determined via sizeof(*@usrc)). * * For further information see __scoped_user_access() above. */ #define scoped_user_read_access(usrc, elbl) \ scoped_user_read_access_size(usrc, sizeof(*(usrc)), elbl) /** * scoped_user_write_access_size - Start a scoped user write access with given size * @udst: Pointer to the user space address to write to * @size: Size of the access starting from @udst * @elbl: Error label to goto when the access region is rejected * * For further information see __scoped_user_access() above. */ #define scoped_user_write_access_size(udst, size, elbl) \ __scoped_user_access(write, udst, size, elbl) /** * scoped_user_write_access - Start a scoped user write access * @udst: Pointer to the user space address to write to * @elbl: Error label to goto when the access region is rejected * * The size of the access starting from @udst is determined via sizeof(*@udst)). * * For further information see __scoped_user_access() above. */ #define scoped_user_write_access(udst, elbl) \ scoped_user_write_access_size(udst, sizeof(*(udst)), elbl) /** * scoped_user_rw_access_size - Start a scoped user read/write access with given size * @uptr: Pointer to the user space address to read from and write to * @size: Size of the access starting from @uptr * @elbl: Error label to goto when the access region is rejected * * For further information see __scoped_user_access() above. 
*/ #define scoped_user_rw_access_size(uptr, size, elbl) \ __scoped_user_access(rw, uptr, size, elbl) /** * scoped_user_rw_access - Start a scoped user read/write access * @uptr: Pointer to the user space address to read from and write to * @elbl: Error label to goto when the access region is rejected * * The size of the access starting from @uptr is determined via sizeof(*@uptr)). * * For further information see __scoped_user_access() above. */ #define scoped_user_rw_access(uptr, elbl) \ scoped_user_rw_access_size(uptr, sizeof(*(uptr)), elbl) /** * get_user_inline - Read user data inlined * @val: The variable to store the value read from user memory * @usrc: Pointer to the user space memory to read from * * Return: 0 if successful, -EFAULT when faulted * * Inlined variant of get_user(). Only use when there is a demonstrable * performance reason. */ #define get_user_inline(val, usrc) \ ({ \ __label__ efault; \ typeof(usrc) _tmpsrc = usrc; \ int _ret = 0; \ \ scoped_user_read_access(_tmpsrc, efault) \ unsafe_get_user(val, _tmpsrc, efault); \ if (0) { \ efault: \ _ret = -EFAULT; \ } \ _ret; \ }) /** * put_user_inline - Write to user memory inlined * @val: The value to write * @udst: Pointer to the user space memory to write to * * Return: 0 if successful, -EFAULT when faulted * * Inlined variant of put_user(). Only use when there is a demonstrable * performance reason. */ #define put_user_inline(val, udst) \ ({ \ __label__ efault; \ typeof(udst) _tmpdst = udst; \ int _ret = 0; \ \ scoped_user_write_access(_tmpdst, efault) \ unsafe_put_user(val, _tmpdst, efault); \ if (0) { \ efault: \ _ret = -EFAULT; \ } \ _ret; \ }) #ifdef CONFIG_HARDENED_USERCOPY void __noreturn usercopy_abort(const char *name, const char *detail, bool to_user, unsigned long offset, unsigned long len); #endif #endif /* __LINUX_UACCESS_H__ */ |
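/*
 * Illustrative sketch (not part of the header above): one way a caller might
 * combine the scoped user access helpers with unsafe_get_user() and
 * unsafe_put_user(), mirroring the pattern shown in the
 * __scoped_user_access() kernel-doc. "struct demo_uarg" and
 * demo_update_uarg() are hypothetical names used only for this example.
 */
#include <linux/types.h>
#include <linux/uaccess.h>

struct demo_uarg {
	u32 rval;
	u32 wval;
};

static int demo_update_uarg(struct demo_uarg __user *uarg)
{
	u32 rval;

	/*
	 * Open a read/write scope covering the whole struct. The matching
	 * user access end call runs automatically when the scope is left,
	 * including via the goto taken on a fault.
	 */
	scoped_user_rw_access(uarg, efault) {
		unsafe_get_user(rval, &uarg->rval, efault);
		unsafe_put_user(rval + 1, &uarg->wval, efault);
	}
	return 0;
efault:
	return -EFAULT;
}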
// SPDX-License-Identifier: GPL-2.0 /* * driver.c - centralized device driver management * * Copyright (c) 2002-3 Patrick Mochel * Copyright (c) 2002-3 Open Source Development Labs * Copyright (c) 2007 Greg Kroah-Hartman <gregkh@suse.de> * Copyright (c) 2007 Novell Inc. */ #include <linux/device/driver.h> #include <linux/device.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/sysfs.h> #include "base.h" static struct device *next_device(struct klist_iter *i) { struct klist_node *n = klist_next(i); struct device *dev = NULL; struct device_private *dev_prv; if (n) { dev_prv = to_device_private_driver(n); dev = dev_prv->device; } return dev; } /** * driver_set_override() - Helper to set or clear driver override. * @dev: Device to change * @override: Address of string to change (e.g. &device->driver_override); * The contents will be freed and hold newly allocated override. * @s: NUL-terminated string, new driver name to force a match, pass empty * string to clear it ("" or "\n", where the latter is only for sysfs * interface). * @len: length of @s * * Helper to set or clear driver override in a device, intended for the cases * when the driver_override field is allocated by driver/bus code. * * Returns: 0 on success or a negative error code on failure. */ int driver_set_override(struct device *dev, const char **override, const char *s, size_t len) { const char *new, *old; char *cp; if (!override || !s) return -EINVAL; /* * The stored value will be used in sysfs show callback (sysfs_emit()), * which has a length limit of PAGE_SIZE and adds a trailing newline. * Thus we can store one character less to avoid truncation during sysfs * show. */ if (len >= (PAGE_SIZE - 1)) return -EINVAL; /* * Compute the real length of the string in case userspace sends us a * bunch of \0 characters like python likes to do. */ len = strlen(s); if (!len) { /* Empty string passed - clear override */ device_lock(dev); old = *override; *override = NULL; device_unlock(dev); kfree(old); return 0; } cp = strnchr(s, len, '\n'); if (cp) len = cp - s; new = kstrndup(s, len, GFP_KERNEL); if (!new) return -ENOMEM; device_lock(dev); old = *override; if (cp != s) { *override = new; } else { /* "\n" passed - clear override */ kfree(new); *override = NULL; } device_unlock(dev); kfree(old); return 0; } EXPORT_SYMBOL_GPL(driver_set_override); /** * driver_for_each_device - Iterator for devices bound to a driver. * @drv: Driver we're iterating.
* @start: Device to begin with * @data: Data to pass to the callback. * @fn: Function to call for each device. * * Iterate over the @drv's list of devices calling @fn for each one. */ int driver_for_each_device(struct device_driver *drv, struct device *start, void *data, device_iter_t fn) { struct klist_iter i; struct device *dev; int error = 0; if (!drv) return -EINVAL; klist_iter_init_node(&drv->p->klist_devices, &i, start ? &start->p->knode_driver : NULL); while (!error && (dev = next_device(&i))) error = fn(dev, data); klist_iter_exit(&i); return error; } EXPORT_SYMBOL_GPL(driver_for_each_device); /** * driver_find_device - device iterator for locating a particular device. * @drv: The device's driver * @start: Device to begin with * @data: Data to pass to match function * @match: Callback function to check device * * This is similar to the driver_for_each_device() function above, but * it returns a reference to a device that is 'found' for later use, as * determined by the @match callback. * * The callback should return 0 if the device doesn't match and non-zero * if it does. If the callback returns non-zero, this function will * return to the caller and not iterate over any more devices. */ struct device *driver_find_device(const struct device_driver *drv, struct device *start, const void *data, device_match_t match) { struct klist_iter i; struct device *dev; if (!drv || !drv->p) return NULL; klist_iter_init_node(&drv->p->klist_devices, &i, (start ? &start->p->knode_driver : NULL)); while ((dev = next_device(&i))) { if (match(dev, data)) { get_device(dev); break; } } klist_iter_exit(&i); return dev; } EXPORT_SYMBOL_GPL(driver_find_device); /** * driver_create_file - create sysfs file for driver. * @drv: driver. * @attr: driver attribute descriptor. */ int driver_create_file(const struct device_driver *drv, const struct driver_attribute *attr) { int error; if (drv) error = sysfs_create_file(&drv->p->kobj, &attr->attr); else error = -EINVAL; return error; } EXPORT_SYMBOL_GPL(driver_create_file); /** * driver_remove_file - remove sysfs file for driver. * @drv: driver. * @attr: driver attribute descriptor. */ void driver_remove_file(const struct device_driver *drv, const struct driver_attribute *attr) { if (drv) sysfs_remove_file(&drv->p->kobj, &attr->attr); } EXPORT_SYMBOL_GPL(driver_remove_file); int driver_add_groups(const struct device_driver *drv, const struct attribute_group **groups) { return sysfs_create_groups(&drv->p->kobj, groups); } void driver_remove_groups(const struct device_driver *drv, const struct attribute_group **groups) { sysfs_remove_groups(&drv->p->kobj, groups); } /** * driver_register - register driver with bus * @drv: driver to register * * We pass off most of the work to the bus_add_driver() call, * since most of the things we have to do deal with the bus * structures. 
*/ int driver_register(struct device_driver *drv) { int ret; struct device_driver *other; if (!bus_is_registered(drv->bus)) { pr_err("Driver '%s' was unable to register with bus_type '%s' because the bus was not initialized.\n", drv->name, drv->bus->name); return -EINVAL; } if ((drv->bus->probe && drv->probe) || (drv->bus->remove && drv->remove) || (drv->bus->shutdown && drv->shutdown)) pr_warn("Driver '%s' needs updating - please use " "bus_type methods\n", drv->name); other = driver_find(drv->name, drv->bus); if (other) { pr_err("Error: Driver '%s' is already registered, " "aborting...\n", drv->name); return -EBUSY; } ret = bus_add_driver(drv); if (ret) return ret; ret = driver_add_groups(drv, drv->groups); if (ret) { bus_remove_driver(drv); return ret; } kobject_uevent(&drv->p->kobj, KOBJ_ADD); deferred_probe_extend_timeout(); return ret; } EXPORT_SYMBOL_GPL(driver_register); /** * driver_unregister - remove driver from system. * @drv: driver. * * Again, we pass off most of the work to the bus-level call. */ void driver_unregister(struct device_driver *drv) { if (!drv || !drv->p) { WARN(1, "Unexpected driver unregister!\n"); return; } driver_remove_groups(drv, drv->groups); bus_remove_driver(drv); } EXPORT_SYMBOL_GPL(driver_unregister); |
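To make the iteration and lookup helpers above concrete, here is a brief kernel-style usage sketch (not part of driver.c). The driver pointer and the device name are hypothetical, and it assumes the generic device_match_name() helper as the driver_find_device() callback.

/*
 * Usage sketch only: walk the devices bound to a driver and look one up
 * by name.  "my_drv" and "soc-uart0" are placeholders.
 */
#include <linux/device.h>
#include <linux/device/driver.h>

static int count_one(struct device *dev, void *data)
{
        (*(int *)data)++;               /* count each bound device */
        return 0;                       /* non-zero would stop the walk */
}

static void my_drv_scan(struct device_driver *my_drv)
{
        int bound = 0;
        struct device *dev;

        /* Walk every device currently bound to my_drv. */
        driver_for_each_device(my_drv, NULL, &bound, count_one);
        pr_info("%s: %d device(s) bound\n", my_drv->name, bound);

        /* Find a specific device; driver_find_device() takes a reference. */
        dev = driver_find_device(my_drv, NULL, "soc-uart0", device_match_name);
        if (dev)
                put_device(dev);        /* drop the reference when done */
}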
| 6 12 2 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright 2019 Google LLC */ #ifndef __LINUX_BLK_CRYPTO_INTERNAL_H #define __LINUX_BLK_CRYPTO_INTERNAL_H #include <linux/bio.h> #include <linux/blk-mq.h> /* Represents a crypto mode supported by blk-crypto */ struct blk_crypto_mode { const char *name; /* name of this mode, shown in sysfs */ const char *cipher_str; /* crypto API name (for fallback case) */ unsigned int keysize; /* key size in bytes */ unsigned int security_strength; /* security strength in bytes */ unsigned int ivsize; /* iv size in bytes */ }; extern const struct blk_crypto_mode blk_crypto_modes[]; #ifdef CONFIG_BLK_INLINE_ENCRYPTION int blk_crypto_sysfs_register(struct gendisk *disk); void blk_crypto_sysfs_unregister(struct gendisk *disk); void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], unsigned int inc); bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio); bool bio_crypt_ctx_mergeable(struct bio_crypt_ctx *bc1, unsigned int bc1_bytes, struct bio_crypt_ctx *bc2); static inline bool bio_crypt_ctx_back_mergeable(struct request *req, struct bio *bio) { return bio_crypt_ctx_mergeable(req->crypt_ctx, blk_rq_bytes(req), bio->bi_crypt_context); } static inline bool bio_crypt_ctx_front_mergeable(struct request *req, struct bio *bio) { return bio_crypt_ctx_mergeable(bio->bi_crypt_context, bio->bi_iter.bi_size, req->crypt_ctx); } static inline bool bio_crypt_ctx_merge_rq(struct request *req, struct request *next) { return bio_crypt_ctx_mergeable(req->crypt_ctx, blk_rq_bytes(req), next->crypt_ctx); } static inline void blk_crypto_rq_set_defaults(struct request *rq) { rq->crypt_ctx = NULL; rq->crypt_keyslot = NULL; } static inline bool blk_crypto_rq_is_encrypted(struct request *rq) { return rq->crypt_ctx; } static inline bool blk_crypto_rq_has_keyslot(struct request *rq) { return rq->crypt_keyslot; } blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile, const struct blk_crypto_key *key, struct blk_crypto_keyslot **slot_ptr); void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot); int __blk_crypto_evict_key(struct blk_crypto_profile *profile, const struct blk_crypto_key *key); bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, const struct blk_crypto_config *cfg); int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd, void __user *argp); static inline bool blk_crypto_supported(struct bio *bio) { return blk_crypto_config_supported_natively(bio->bi_bdev, &bio->bi_crypt_context->bc_key->crypto_cfg); } #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline int blk_crypto_sysfs_register(struct gendisk *disk) { return 0; } static inline 
void blk_crypto_sysfs_unregister(struct gendisk *disk) { } static inline bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio) { return true; } static inline bool bio_crypt_ctx_front_mergeable(struct request *req, struct bio *bio) { return true; } static inline bool bio_crypt_ctx_back_mergeable(struct request *req, struct bio *bio) { return true; } static inline bool bio_crypt_ctx_merge_rq(struct request *req, struct request *next) { return true; } static inline void blk_crypto_rq_set_defaults(struct request *rq) { } static inline bool blk_crypto_rq_is_encrypted(struct request *rq) { return false; } static inline bool blk_crypto_rq_has_keyslot(struct request *rq) { return false; } static inline int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd, void __user *argp) { return -ENOTTY; } static inline bool blk_crypto_supported(struct bio *bio) { return false; } #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ void __bio_crypt_advance(struct bio *bio, unsigned int bytes); static inline void bio_crypt_advance(struct bio *bio, unsigned int bytes) { if (bio_has_crypt_ctx(bio)) __bio_crypt_advance(bio, bytes); } void __bio_crypt_free_ctx(struct bio *bio); static inline void bio_crypt_free_ctx(struct bio *bio) { if (bio_has_crypt_ctx(bio)) __bio_crypt_free_ctx(bio); } static inline void bio_crypt_do_front_merge(struct request *rq, struct bio *bio) { #ifdef CONFIG_BLK_INLINE_ENCRYPTION if (bio_has_crypt_ctx(bio)) memcpy(rq->crypt_ctx->bc_dun, bio->bi_crypt_context->bc_dun, sizeof(rq->crypt_ctx->bc_dun)); #endif } blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq); static inline blk_status_t blk_crypto_rq_get_keyslot(struct request *rq) { if (blk_crypto_rq_is_encrypted(rq)) return __blk_crypto_rq_get_keyslot(rq); return BLK_STS_OK; } void __blk_crypto_rq_put_keyslot(struct request *rq); static inline void blk_crypto_rq_put_keyslot(struct request *rq) { if (blk_crypto_rq_has_keyslot(rq)) __blk_crypto_rq_put_keyslot(rq); } void __blk_crypto_free_request(struct request *rq); static inline void blk_crypto_free_request(struct request *rq) { if (blk_crypto_rq_is_encrypted(rq)) __blk_crypto_free_request(rq); } int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, gfp_t gfp_mask); /** * blk_crypto_rq_bio_prep - Prepare a request's crypt_ctx when its first bio * is inserted * @rq: The request to prepare * @bio: The first bio being inserted into the request * @gfp_mask: Memory allocation flags * * Return: 0 on success, -ENOMEM if out of memory. -ENOMEM is only possible if * @gfp_mask doesn't include %__GFP_DIRECT_RECLAIM. */ static inline int blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, gfp_t gfp_mask) { if (bio_has_crypt_ctx(bio)) return __blk_crypto_rq_bio_prep(rq, bio, gfp_mask); return 0; } bool blk_crypto_fallback_bio_prep(struct bio *bio); #ifdef CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num); int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key); #else /* CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK */ static inline int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num) { pr_warn_once("crypto API fallback is disabled\n"); return -ENOPKG; } static inline int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key) { return 0; } #endif /* CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK */ #endif /* __LINUX_BLK_CRYPTO_INTERNAL_H */ |
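The header above leans on a common kernel pattern: when CONFIG_BLK_INLINE_ENCRYPTION (or the fallback option) is disabled, static inline stubs with fixed return values keep every caller free of #ifdef blocks. A minimal standalone sketch of that pattern, with FEATURE_CRYPTO standing in for the real Kconfig symbol:

/* Standalone sketch, not kernel code. */
#include <stdbool.h>
#include <stdio.h>

struct request { void *crypt_ctx; };

#ifdef FEATURE_CRYPTO
static inline bool rq_is_encrypted(struct request *rq)
{
        return rq->crypt_ctx != NULL;   /* real check when the feature is built in */
}
#else
static inline bool rq_is_encrypted(struct request *rq)
{
        return false;                   /* feature compiled out: fixed answer */
}
#endif

int main(void)
{
        struct request rq = { .crypt_ctx = NULL };

        /* Caller code is identical whether or not the feature is built in. */
        printf("encrypted: %d\n", rq_is_encrypted(&rq));
        return 0;
}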
// SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/random.h> #include <linux/buffer_head.h> #include <linux/utsname.h> #include <linux/kthread.h> #include "ext4.h" /* Checksumming functions */ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) { struct ext4_sb_info *sbi = EXT4_SB(sb); int offset = offsetof(struct mmp_struct, mmp_checksum); __u32 csum; csum = ext4_chksum(sbi->s_csum_seed, (char *)mmp, offset); return cpu_to_le32(csum); } static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) { if (!ext4_has_feature_metadata_csum(sb)) return 1; return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp); } static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) { if (!ext4_has_feature_metadata_csum(sb)) return; mmp->mmp_checksum = ext4_mmp_csum(sb, mmp); } /* * Write the MMP block using REQ_SYNC to try to get the block on-disk * faster. */ static int write_mmp_block_thawed(struct super_block *sb, struct buffer_head *bh) { struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data); ext4_mmp_csum_set(sb, mmp); lock_buffer(bh); bh->b_end_io = end_buffer_write_sync; get_bh(bh); submit_bh(REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); if (unlikely(!buffer_uptodate(bh))) return -EIO; return 0; } static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) { /* * We protect against freezing so that we don't create dirty buffers * on frozen filesystem. */ scoped_guard(super_write, sb) return write_mmp_block_thawed(sb, bh); } /* * Read the MMP block. It _must_ be read from disk and hence we clear the * uptodate flag on the buffer.
*/ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, ext4_fsblk_t mmp_block) { struct mmp_struct *mmp; int ret; if (*bh) clear_buffer_uptodate(*bh); /* This would be sb_bread(sb, mmp_block), except we need to be sure * that the MD RAID device cache has been bypassed, and that the read * is not blocked in the elevator. */ if (!*bh) { *bh = sb_getblk(sb, mmp_block); if (!*bh) { ret = -ENOMEM; goto warn_exit; } } lock_buffer(*bh); ret = ext4_read_bh(*bh, REQ_META | REQ_PRIO, NULL, false); if (ret) goto warn_exit; mmp = (struct mmp_struct *)((*bh)->b_data); if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) { ret = -EFSCORRUPTED; goto warn_exit; } if (!ext4_mmp_csum_verify(sb, mmp)) { ret = -EFSBADCRC; goto warn_exit; } return 0; warn_exit: brelse(*bh); *bh = NULL; ext4_warning(sb, "Error %d while reading MMP block %llu", ret, mmp_block); return ret; } /* * Dump as much information as possible to help the admin. */ void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, const char *function, unsigned int line, const char *msg) { __ext4_warning(sb, function, line, "%s", msg); __ext4_warning(sb, function, line, "MMP failure info: last update time: %llu, last update node: %.*s, last update device: %.*s", (unsigned long long)le64_to_cpu(mmp->mmp_time), (int)sizeof(mmp->mmp_nodename), mmp->mmp_nodename, (int)sizeof(mmp->mmp_bdevname), mmp->mmp_bdevname); } /* * kmmpd will update the MMP sequence every s_mmp_update_interval seconds */ static int kmmpd(void *data) { struct super_block *sb = data; struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct buffer_head *bh = EXT4_SB(sb)->s_mmp_bh; struct mmp_struct *mmp; ext4_fsblk_t mmp_block; u32 seq = 0; unsigned long failed_writes = 0; int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval); unsigned mmp_check_interval; unsigned long last_update_time; unsigned long diff; int retval = 0; mmp_block = le64_to_cpu(es->s_mmp_block); mmp = (struct mmp_struct *)(bh->b_data); mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds()); /* * Start with the higher mmp_check_interval and reduce it if * the MMP block is being updated on time. */ mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval, EXT4_MMP_MIN_CHECK_INTERVAL); mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); memcpy(mmp->mmp_nodename, init_utsname()->nodename, sizeof(mmp->mmp_nodename)); while (!kthread_should_stop() && !ext4_emergency_state(sb)) { if (!ext4_has_feature_mmp(sb)) { ext4_warning(sb, "kmmpd being stopped since MMP feature" " has been disabled."); goto wait_to_exit; } if (++seq > EXT4_MMP_SEQ_MAX) seq = 1; mmp->mmp_seq = cpu_to_le32(seq); mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds()); last_update_time = jiffies; retval = write_mmp_block(sb, bh); /* * Don't spew too many error messages. Print one every * (s_mmp_update_interval * 60) seconds. */ if (retval) { if ((failed_writes % 60) == 0) { ext4_error_err(sb, -retval, "Error writing to MMP block"); } failed_writes++; } diff = jiffies - last_update_time; if (diff < mmp_update_interval * HZ) schedule_timeout_interruptible(mmp_update_interval * HZ - diff); /* * We need to make sure that more than mmp_check_interval * seconds have not passed since writing. If that has happened * we need to check if the MMP block is as we left it. 
*/ diff = jiffies - last_update_time; if (diff > mmp_check_interval * HZ) { struct buffer_head *bh_check = NULL; struct mmp_struct *mmp_check; retval = read_mmp_block(sb, &bh_check, mmp_block); if (retval) { ext4_error_err(sb, -retval, "error reading MMP data: %d", retval); goto wait_to_exit; } mmp_check = (struct mmp_struct *)(bh_check->b_data); if (mmp->mmp_seq != mmp_check->mmp_seq || memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename, sizeof(mmp->mmp_nodename))) { dump_mmp_msg(sb, mmp_check, "Error while updating MMP info. " "The filesystem seems to have been" " multiply mounted."); ext4_error_err(sb, EBUSY, "abort"); put_bh(bh_check); retval = -EBUSY; goto wait_to_exit; } put_bh(bh_check); } /* * Adjust the mmp_check_interval depending on how much time * it took for the MMP block to be written. */ mmp_check_interval = clamp(EXT4_MMP_CHECK_MULT * diff / HZ, EXT4_MMP_MIN_CHECK_INTERVAL, EXT4_MMP_MAX_CHECK_INTERVAL); mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); } /* * Unmount seems to be clean. */ mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds()); retval = write_mmp_block(sb, bh); wait_to_exit: while (!kthread_should_stop()) { set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop()) schedule(); } set_current_state(TASK_RUNNING); return retval; } void ext4_stop_mmpd(struct ext4_sb_info *sbi) { if (sbi->s_mmp_tsk) { kthread_stop(sbi->s_mmp_tsk); brelse(sbi->s_mmp_bh); sbi->s_mmp_tsk = NULL; } } /* * Get a random new sequence number but make sure it is not greater than * EXT4_MMP_SEQ_MAX. */ static unsigned int mmp_new_seq(void) { return get_random_u32_below(EXT4_MMP_SEQ_MAX + 1); } /* * Protect the filesystem from being mounted more than once. */ int ext4_multi_mount_protect(struct super_block *sb, ext4_fsblk_t mmp_block) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct buffer_head *bh = NULL; struct mmp_struct *mmp = NULL; u32 seq; unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval); unsigned int wait_time = 0; int retval; if (mmp_block < le32_to_cpu(es->s_first_data_block) || mmp_block >= ext4_blocks_count(es)) { ext4_warning(sb, "Invalid MMP block in superblock"); retval = -EINVAL; goto failed; } retval = read_mmp_block(sb, &bh, mmp_block); if (retval) goto failed; mmp = (struct mmp_struct *)(bh->b_data); if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL) mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL; /* * If check_interval in MMP block is larger, use that instead of * update_interval from the superblock. */ if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval) mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval); seq = le32_to_cpu(mmp->mmp_seq); if (seq == EXT4_MMP_SEQ_CLEAN) goto skip; if (seq == EXT4_MMP_SEQ_FSCK) { dump_mmp_msg(sb, mmp, "fsck is running on the filesystem"); retval = -EBUSY; goto failed; } wait_time = min(mmp_check_interval * 2 + 1, mmp_check_interval + 60); /* Print MMP interval if more than 20 secs. 
*/ if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4) ext4_warning(sb, "MMP interval %u higher than expected, please" " wait.\n", wait_time * 2); if (schedule_timeout_interruptible(HZ * wait_time) != 0) { ext4_warning(sb, "MMP startup interrupted, failing mount\n"); retval = -ETIMEDOUT; goto failed; } retval = read_mmp_block(sb, &bh, mmp_block); if (retval) goto failed; mmp = (struct mmp_struct *)(bh->b_data); if (seq != le32_to_cpu(mmp->mmp_seq)) { dump_mmp_msg(sb, mmp, "Device is already active on another node."); retval = -EBUSY; goto failed; } skip: /* * write a new random sequence number. */ seq = mmp_new_seq(); mmp->mmp_seq = cpu_to_le32(seq); /* * On mount / remount we are protected against fs freezing (by s_umount * semaphore) and grabbing freeze protection upsets lockdep */ retval = write_mmp_block_thawed(sb, bh); if (retval) goto failed; /* * wait for MMP interval and check mmp_seq. */ if (schedule_timeout_interruptible(HZ * wait_time) != 0) { ext4_warning(sb, "MMP startup interrupted, failing mount"); retval = -ETIMEDOUT; goto failed; } retval = read_mmp_block(sb, &bh, mmp_block); if (retval) goto failed; mmp = (struct mmp_struct *)(bh->b_data); if (seq != le32_to_cpu(mmp->mmp_seq)) { dump_mmp_msg(sb, mmp, "Device is already active on another node."); retval = -EBUSY; goto failed; } EXT4_SB(sb)->s_mmp_bh = bh; BUILD_BUG_ON(sizeof(mmp->mmp_bdevname) < BDEVNAME_SIZE); snprintf(mmp->mmp_bdevname, sizeof(mmp->mmp_bdevname), "%pg", bh->b_bdev); /* * Start a kernel thread to update the MMP block periodically. */ EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, sb, "kmmpd-%.*s", (int)sizeof(mmp->mmp_bdevname), mmp->mmp_bdevname); if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { EXT4_SB(sb)->s_mmp_tsk = NULL; ext4_warning(sb, "Unable to create kmmpd thread for %s.", sb->s_id); retval = -ENOMEM; goto failed; } return 0; failed: brelse(bh); return retval; } |
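The mount-time protection above boils down to a simple handshake: read the MMP sequence, wait at least one check interval, read it again, and refuse the mount if the value moved; only then claim the block with a fresh random sequence and verify it once more. A standalone toy sketch of that handshake (the struct, read_seq() and the inline sleep comments are stand-ins for mmp_struct, read_mmp_block() and schedule_timeout_interruptible()):

/* Standalone sketch, not ext4 code. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_mmp { uint32_t seq; };

static uint32_t read_seq(const struct toy_mmp *disk)
{
        return disk->seq;               /* would be a fresh read from disk */
}

static bool mmp_mount_allowed(struct toy_mmp *disk)
{
        uint32_t seq = read_seq(disk);

        /* sleep(check_interval) would go here */
        if (read_seq(disk) != seq)
                return false;           /* another node updated the block */

        disk->seq = seq + 1;            /* claim the block, like mmp_new_seq() */
        /* sleep(check_interval) would go here, then check the claim held */
        return read_seq(disk) == seq + 1;
}

int main(void)
{
        struct toy_mmp disk = { .seq = 42 };

        printf("mount %s\n", mmp_mount_allowed(&disk) ? "allowed" : "refused");
        return 0;
}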
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2008 IBM Corporation * * Author: Mimi Zohar <zohar@us.ibm.com> * * File: ima_api.c * Implements must_appraise_or_measure, collect_measurement, * appraise_measurement, store_measurement and store_template.
*/ #include <linux/slab.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/hex.h> #include <linux/xattr.h> #include <linux/evm.h> #include <linux/fsverity.h> #include "ima.h" /* * ima_free_template_entry - free an existing template entry */ void ima_free_template_entry(struct ima_template_entry *entry) { int i; for (i = 0; i < entry->template_desc->num_fields; i++) kfree(entry->template_data[i].data); kfree(entry->digests); kfree(entry); } /* * ima_alloc_init_template - create and initialize a new template entry */ int ima_alloc_init_template(struct ima_event_data *event_data, struct ima_template_entry **entry, struct ima_template_desc *desc) { struct ima_template_desc *template_desc; struct tpm_digest *digests; int i, result = 0; if (desc) template_desc = desc; else template_desc = ima_template_desc_current(); *entry = kzalloc_flex(**entry, template_data, template_desc->num_fields, GFP_NOFS); if (!*entry) return -ENOMEM; digests = kzalloc_objs(*digests, NR_BANKS(ima_tpm_chip) + ima_extra_slots, GFP_NOFS); if (!digests) { kfree(*entry); *entry = NULL; return -ENOMEM; } (*entry)->digests = digests; (*entry)->template_desc = template_desc; for (i = 0; i < template_desc->num_fields; i++) { const struct ima_template_field *field = template_desc->fields[i]; u32 len; result = field->field_init(event_data, &((*entry)->template_data[i])); if (result != 0) goto out; len = (*entry)->template_data[i].len; (*entry)->template_data_len += sizeof(len); (*entry)->template_data_len += len; } return 0; out: ima_free_template_entry(*entry); *entry = NULL; return result; } /* * ima_store_template - store ima template measurements * * Calculate the hash of a template entry, add the template entry * to an ordered list of measurement entries maintained inside the kernel, * and also update the aggregate integrity value (maintained inside the * configured TPM PCR) over the hashes of the current list of measurement * entries. * * Applications retrieve the current kernel-held measurement list through * the securityfs entries in /sys/kernel/security/ima. The signed aggregate * TPM PCR (called quote) can be retrieved using a TPM user space library * and is used to validate the measurement list. * * Returns 0 on success, error code otherwise */ int ima_store_template(struct ima_template_entry *entry, int violation, struct inode *inode, const unsigned char *filename, int pcr) { static const char op[] = "add_template_measure"; static const char audit_cause[] = "hashing_error"; char *template_name = entry->template_desc->name; int result; if (!violation) { result = ima_calc_field_array_hash(&entry->template_data[0], entry); if (result < 0) { integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, template_name, op, audit_cause, result, 0); return result; } } entry->pcr = pcr; result = ima_add_template_entry(entry, violation, op, inode, filename); return result; } /* * ima_add_violation - add violation to measurement list. * * Violations are flagged in the measurement list with zero hash values. * By extending the PCR with 0xFF's instead of with zeroes, the PCR * value is invalidated. 
*/ void ima_add_violation(struct file *file, const unsigned char *filename, struct ima_iint_cache *iint, const char *op, const char *cause) { struct ima_template_entry *entry; struct inode *inode = file_inode(file); struct ima_event_data event_data = { .iint = iint, .file = file, .filename = filename, .violation = cause }; int violation = 1; int result; /* can overflow, only indicator */ atomic_long_inc(&ima_htable.violations); result = ima_alloc_init_template(&event_data, &entry, NULL); if (result < 0) { result = -ENOMEM; goto err_out; } result = ima_store_template(entry, violation, inode, filename, CONFIG_IMA_MEASURE_PCR_IDX); if (result < 0) ima_free_template_entry(entry); err_out: integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, filename, op, cause, result, 0); } /** * ima_get_action - appraise & measure decision based on policy. * @idmap: idmap of the mount the inode was found from * @inode: pointer to the inode associated with the object being validated * @cred: pointer to credentials structure to validate * @prop: properties of the task being validated * @mask: contains the permission mask (MAY_READ, MAY_WRITE, MAY_EXEC, * MAY_APPEND) * @func: caller identifier * @pcr: pointer filled in if matched measure policy sets pcr= * @template_desc: pointer filled in if matched measure policy sets template= * @func_data: func specific data, may be NULL * @allowed_algos: allowlist of hash algorithms for the IMA xattr * * The policy is defined in terms of keypairs: * subj=, obj=, type=, func=, mask=, fsmagic= * subj,obj, and type: are LSM specific. * func: FILE_CHECK | BPRM_CHECK | CREDS_CHECK | MMAP_CHECK | MODULE_CHECK * | KEXEC_CMDLINE | KEY_CHECK | CRITICAL_DATA | SETXATTR_CHECK * | MMAP_CHECK_REQPROT * mask: contains the permission mask * fsmagic: hex value * * Returns IMA_MEASURE, IMA_APPRAISE mask. * */ int ima_get_action(struct mnt_idmap *idmap, struct inode *inode, const struct cred *cred, struct lsm_prop *prop, int mask, enum ima_hooks func, int *pcr, struct ima_template_desc **template_desc, const char *func_data, unsigned int *allowed_algos) { int flags = IMA_MEASURE | IMA_AUDIT | IMA_APPRAISE | IMA_HASH; flags &= ima_policy_flag; return ima_match_policy(idmap, inode, cred, prop, func, mask, flags, pcr, template_desc, func_data, allowed_algos); } static bool ima_get_verity_digest(struct ima_iint_cache *iint, struct inode *inode, struct ima_max_digest_data *hash) { enum hash_algo alg; int digest_len; /* * On failure, 'measure' policy rules will result in a file data * hash containing 0's. */ digest_len = fsverity_get_digest(inode, hash->digest, NULL, &alg); if (digest_len == 0) return false; /* * Unlike in the case of actually calculating the file hash, in * the fsverity case regardless of the hash algorithm, return * the verity digest to be included in the measurement list. A * mismatch between the verity algorithm and the xattr signature * algorithm, if one exists, will be detected later. */ hash->hdr.algo = alg; hash->hdr.length = digest_len; return true; } /* * ima_collect_measurement - collect file measurement * * Calculate the file hash, if it doesn't already exist, * storing the measurement and i_version in the iint. * * Must be called with iint->mutex held. 
* * Return 0 on success, error code otherwise */ int ima_collect_measurement(struct ima_iint_cache *iint, struct file *file, void *buf, loff_t size, enum hash_algo algo, struct modsig *modsig) { const char *audit_cause = "failed"; struct inode *inode = file_inode(file); struct inode *real_inode = d_real_inode(file_dentry(file)); struct ima_max_digest_data hash; struct ima_digest_data *hash_hdr = container_of(&hash.hdr, struct ima_digest_data, hdr); struct name_snapshot filename; struct kstat stat; int result = 0; int length; void *tmpbuf; u64 i_version = 0; /* * Always collect the modsig, because IMA might have already collected * the file digest without collecting the modsig in a previous * measurement rule. */ if (modsig) ima_collect_modsig(modsig, buf, size); if (iint->flags & IMA_COLLECTED) goto out; /* * Detect file change based on STATX_CHANGE_COOKIE, when supported, * and fallback to detecting file change based on i_version. * * On filesystems which did not support i_version, support was * originally limited to an initial measurement/appraisal/audit, * but was later modified to assume the file changed. */ result = vfs_getattr_nosec(&file->f_path, &stat, STATX_CHANGE_COOKIE, AT_STATX_SYNC_AS_STAT); if (!result && (stat.result_mask & STATX_CHANGE_COOKIE)) i_version = stat.change_cookie; else if (IS_I_VERSION(real_inode)) i_version = inode_peek_iversion(real_inode); hash.hdr.algo = algo; hash.hdr.length = hash_digest_size[algo]; /* Initialize hash digest to 0's in case of failure */ memset(&hash.digest, 0, sizeof(hash.digest)); if (iint->flags & IMA_VERITY_REQUIRED) { if (!ima_get_verity_digest(iint, inode, &hash)) { audit_cause = "no-verity-digest"; result = -ENODATA; } } else if (buf) { result = ima_calc_buffer_hash(buf, size, hash_hdr); } else { result = ima_calc_file_hash(file, hash_hdr); } if (result && result != -EBADF && result != -EINVAL) goto out; length = sizeof(hash.hdr) + hash.hdr.length; tmpbuf = krealloc(iint->ima_hash, length, GFP_NOFS); if (!tmpbuf) { result = -ENOMEM; goto out; } iint->ima_hash = tmpbuf; memcpy(iint->ima_hash, &hash, length); if (real_inode == inode) iint->real_inode.version = i_version; else integrity_inode_attrs_store(&iint->real_inode, i_version, real_inode); /* Possibly temporary failure due to type of read (eg. O_DIRECT) */ if (!result) iint->flags |= IMA_COLLECTED; out: if (result) { if (file->f_flags & O_DIRECT) audit_cause = "failed(directio)"; take_dentry_name_snapshot(&filename, file->f_path.dentry); integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode, filename.name.name, "collect_data", audit_cause, result, 0); release_dentry_name_snapshot(&filename); } return result; } /* * ima_store_measurement - store file measurement * * Create an "ima" template and then store the template by calling * ima_store_template. * * We only get here if the inode has not already been measured, * but the measurement could already exist: * - multiple copies of the same file on either the same or * different filesystems. * - the inode was previously flushed as well as the iint info, * containing the hashing info. * * Must be called with iint->mutex held. 
*/ void ima_store_measurement(struct ima_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig, int pcr, struct ima_template_desc *template_desc) { static const char op[] = "add_template_measure"; static const char audit_cause[] = "ENOMEM"; int result = -ENOMEM; struct inode *inode = file_inode(file); struct ima_template_entry *entry; struct ima_event_data event_data = { .iint = iint, .file = file, .filename = filename, .xattr_value = xattr_value, .xattr_len = xattr_len, .modsig = modsig }; int violation = 0; /* * We still need to store the measurement in the case of MODSIG because * we only have its contents to put in the list at the time of * appraisal, but a file measurement from earlier might already exist in * the measurement list. */ if (iint->measured_pcrs & (0x1 << pcr) && !modsig) return; result = ima_alloc_init_template(&event_data, &entry, template_desc); if (result < 0) { integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, filename, op, audit_cause, result, 0); return; } result = ima_store_template(entry, violation, inode, filename, pcr); if ((!result || result == -EEXIST) && !(file->f_flags & O_DIRECT)) { iint->flags |= IMA_MEASURED; iint->measured_pcrs |= (0x1 << pcr); } if (result < 0) ima_free_template_entry(entry); } void ima_audit_measurement(struct ima_iint_cache *iint, const unsigned char *filename) { struct audit_buffer *ab; char *hash; const char *algo_name = hash_algo_name[iint->ima_hash->algo]; int i; if (iint->flags & IMA_AUDITED) return; hash = kzalloc((iint->ima_hash->length * 2) + 1, GFP_KERNEL); if (!hash) return; for (i = 0; i < iint->ima_hash->length; i++) hex_byte_pack(hash + (i * 2), iint->ima_hash->digest[i]); hash[i * 2] = '\0'; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_INTEGRITY_RULE); if (!ab) goto out; audit_log_format(ab, "file="); audit_log_untrustedstring(ab, filename); audit_log_format(ab, " hash=\"%s:%s\"", algo_name, hash); audit_log_task_info(ab); audit_log_end(ab); iint->flags |= IMA_AUDITED; out: kfree(hash); return; } /* * ima_d_path - return a pointer to the full pathname * * Attempt to return a pointer to the full pathname for use in the * IMA measurement list, IMA audit records, and auditing logs. * * On failure, return a pointer to a copy of the filename, not dname. * Returning a pointer to dname, could result in using the pointer * after the memory has been freed. */ const char *ima_d_path(const struct path *path, char **pathbuf, char *namebuf) { struct name_snapshot filename; char *pathname = NULL; *pathbuf = __getname(); if (*pathbuf) { pathname = d_absolute_path(path, *pathbuf, PATH_MAX); if (IS_ERR(pathname)) { __putname(*pathbuf); *pathbuf = NULL; pathname = NULL; } } if (!pathname) { take_dentry_name_snapshot(&filename, path->dentry); strscpy(namebuf, filename.name.name, NAME_MAX); release_dentry_name_snapshot(&filename); pathname = namebuf; } return pathname; } |
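ima_audit_measurement() above converts the binary digest into a hex string, two characters per byte, before logging it, which is why the buffer is allocated with length * 2 + 1 bytes. A standalone sketch of that conversion (outside the kernel; hex_byte_pack() is just a nibble lookup, and the "sha256" label in the output is illustrative):

/* Standalone sketch, not IMA code. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static void digest_to_hex(const uint8_t *digest, size_t len, char *out)
{
        static const char hex[] = "0123456789abcdef";
        size_t i;

        for (i = 0; i < len; i++) {
                out[i * 2] = hex[digest[i] >> 4];       /* high nibble */
                out[i * 2 + 1] = hex[digest[i] & 0x0f]; /* low nibble */
        }
        out[len * 2] = '\0';
}

int main(void)
{
        uint8_t digest[] = { 0xde, 0xad, 0xbe, 0xef };
        char buf[sizeof(digest) * 2 + 1];

        digest_to_hex(digest, sizeof(digest), buf);
        printf("hash=\"sha256:%s\"\n", buf);    /* algo label is illustrative */
        return 0;
}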
// SPDX-License-Identifier: GPL-2.0-only /* * net/core/dst.c Protocol independent destination cache. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * */ #include <linux/bitops.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/workqueue.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/string.h> #include <linux/types.h> #include <net/net_namespace.h> #include <linux/sched.h> #include <linux/prefetch.h> #include <net/lwtunnel.h> #include <net/xfrm.h> #include <net/dst.h> #include <net/dst_metadata.h> int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); return 0; } EXPORT_SYMBOL(dst_discard_out); const struct dst_metrics dst_default_metrics = { /* This initializer is needed to force linker to place this variable * into const section. Otherwise it might end into bss section. * We really want to avoid false sharing on this variable, and catch * any writes on it.
*/ .refcnt = REFCOUNT_INIT(1), }; EXPORT_SYMBOL(dst_default_metrics); void dst_init(struct dst_entry *dst, struct dst_ops *ops, struct net_device *dev, int initial_obsolete, unsigned short flags) { dst->dev = dev; netdev_hold(dev, &dst->dev_tracker, GFP_ATOMIC); dst->ops = ops; dst_init_metrics(dst, dst_default_metrics.metrics, true); dst->expires = 0UL; #ifdef CONFIG_XFRM dst->xfrm = NULL; #endif dst->input = dst_discard; dst->output = dst_discard_out; dst->error = 0; dst->obsolete = initial_obsolete; dst->header_len = 0; dst->trailer_len = 0; #ifdef CONFIG_IP_ROUTE_CLASSID dst->tclassid = 0; #endif dst->lwtstate = NULL; rcuref_init(&dst->__rcuref, 1); INIT_LIST_HEAD(&dst->rt_uncached); dst->rt_uncached_list = NULL; dst->__use = 0; dst->lastuse = jiffies; dst->flags = flags; if (!(flags & DST_NOCOUNT)) dst_entries_add(ops, 1); } EXPORT_SYMBOL(dst_init); void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_obsolete, unsigned short flags) { struct dst_entry *dst; if (ops->gc && !(flags & DST_NOCOUNT) && dst_entries_get_fast(ops) > ops->gc_thresh) ops->gc(ops); dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); if (!dst) return NULL; dst_init(dst, ops, dev, initial_obsolete, flags); return dst; } EXPORT_SYMBOL(dst_alloc); static void dst_destroy(struct dst_entry *dst) { struct dst_entry *child = NULL; smp_rmb(); #ifdef CONFIG_XFRM if (dst->xfrm) { struct xfrm_dst *xdst = (struct xfrm_dst *) dst; child = xdst->child; } #endif if (dst->ops->destroy) dst->ops->destroy(dst); netdev_put(dst->dev, &dst->dev_tracker); lwtstate_put(dst->lwtstate); if (dst->flags & DST_METADATA) metadata_dst_free((struct metadata_dst *)dst); else kmem_cache_free(dst->ops->kmem_cachep, dst); dst = child; if (dst) dst_release_immediate(dst); } static void dst_destroy_rcu(struct rcu_head *head) { struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); dst_destroy(dst); } /* Operations to mark dst as DEAD and clean up the net device referenced * by dst: * 1. put the dst under blackhole interface and discard all tx/rx packets * on this route. * 2. release the net_device * This function should be called when removing routes from the fib tree * in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to * make the next dst_ops->check() fail. 
*/ void dst_dev_put(struct dst_entry *dst) { struct net_device *dev = dst->dev; WRITE_ONCE(dst->obsolete, DST_OBSOLETE_DEAD); if (dst->ops->ifdown) dst->ops->ifdown(dst, dev); WRITE_ONCE(dst->input, dst_discard); WRITE_ONCE(dst->output, dst_discard_out); rcu_assign_pointer(dst->dev_rcu, blackhole_netdev); netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker, GFP_ATOMIC); } EXPORT_SYMBOL(dst_dev_put); static void dst_count_dec(struct dst_entry *dst) { if (!(dst->flags & DST_NOCOUNT)) dst_entries_add(dst->ops, -1); } void dst_release(struct dst_entry *dst) { if (dst && rcuref_put(&dst->__rcuref)) { #ifdef CONFIG_DST_CACHE if (dst->flags & DST_METADATA) { struct metadata_dst *md_dst = (struct metadata_dst *)dst; if (md_dst->type == METADATA_IP_TUNNEL) dst_cache_reset_now(&md_dst->u.tun_info.dst_cache); } #endif dst_count_dec(dst); call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); } } EXPORT_SYMBOL(dst_release); void dst_release_immediate(struct dst_entry *dst) { if (dst && rcuref_put(&dst->__rcuref)) { dst_count_dec(dst); dst_destroy(dst); } } EXPORT_SYMBOL(dst_release_immediate); u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) { struct dst_metrics *p = kmalloc_obj(*p, GFP_ATOMIC); if (p) { struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old); unsigned long prev, new; refcount_set(&p->refcnt, 1); memcpy(p->metrics, old_p->metrics, sizeof(p->metrics)); new = (unsigned long) p; prev = cmpxchg(&dst->_metrics, old, new); if (prev != old) { kfree(p); p = (struct dst_metrics *)__DST_METRICS_PTR(prev); if (prev & DST_METRICS_READ_ONLY) p = NULL; } else if (prev & DST_METRICS_REFCOUNTED) { if (refcount_dec_and_test(&old_p->refcnt)) kfree(old_p); } } BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0); return (u32 *)p; } EXPORT_SYMBOL(dst_cow_metrics_generic); /* Caller asserts that dst_metrics_read_only(dst) is false. */ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) { unsigned long prev, new; new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY; prev = cmpxchg(&dst->_metrics, old, new); if (prev == old) kfree(__DST_METRICS_PTR(old)); } EXPORT_SYMBOL(__dst_destroy_metrics_generic); struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie) { return NULL; } u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old) { return NULL; } struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { return NULL; } void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { } EXPORT_SYMBOL_GPL(dst_blackhole_update_pmtu); void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { } EXPORT_SYMBOL_GPL(dst_blackhole_redirect); unsigned int dst_blackhole_mtu(const struct dst_entry *dst) { unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); return mtu ? 
: dst_dev(dst)->mtu; } EXPORT_SYMBOL_GPL(dst_blackhole_mtu); static struct dst_ops dst_blackhole_ops = { .family = AF_UNSPEC, .neigh_lookup = dst_blackhole_neigh_lookup, .check = dst_blackhole_check, .cow_metrics = dst_blackhole_cow_metrics, .update_pmtu = dst_blackhole_update_pmtu, .redirect = dst_blackhole_redirect, .mtu = dst_blackhole_mtu, }; static void __metadata_dst_init(struct metadata_dst *md_dst, enum metadata_type type, u8 optslen) { struct dst_entry *dst; dst = &md_dst->dst; dst_init(dst, &dst_blackhole_ops, NULL, DST_OBSOLETE_NONE, DST_METADATA | DST_NOCOUNT); memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst)); md_dst->type = type; } struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, gfp_t flags) { struct metadata_dst *md_dst; md_dst = kmalloc_flex(*md_dst, u.tun_info.options, optslen, flags); if (!md_dst) return NULL; __metadata_dst_init(md_dst, type, optslen); return md_dst; } EXPORT_SYMBOL_GPL(metadata_dst_alloc); void metadata_dst_free(struct metadata_dst *md_dst) { #ifdef CONFIG_DST_CACHE if (md_dst->type == METADATA_IP_TUNNEL) dst_cache_destroy(&md_dst->u.tun_info.dst_cache); #endif if (md_dst->type == METADATA_XFRM) dst_release(md_dst->u.xfrm_info.dst_orig); kfree(md_dst); } EXPORT_SYMBOL_GPL(metadata_dst_free); struct metadata_dst __percpu * metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags) { int cpu; struct metadata_dst __percpu *md_dst; md_dst = __alloc_percpu_gfp(struct_size(md_dst, u.tun_info.options, optslen), __alignof__(struct metadata_dst), flags); if (!md_dst) return NULL; for_each_possible_cpu(cpu) __metadata_dst_init(per_cpu_ptr(md_dst, cpu), type, optslen); return md_dst; } EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst) { int cpu; for_each_possible_cpu(cpu) { struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu); #ifdef CONFIG_DST_CACHE if (one_md_dst->type == METADATA_IP_TUNNEL) dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache); #endif if (one_md_dst->type == METADATA_XFRM) dst_release(one_md_dst->u.xfrm_info.dst_orig); } free_percpu(md_dst); } EXPORT_SYMBOL_GPL(metadata_dst_free_percpu); |
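dst_cow_metrics_generic() above follows a lock-free copy-on-write idiom: build a private copy of the metrics, publish it with a single cmpxchg() on the pointer, and throw the copy away if another CPU won the race. A standalone sketch of that idiom using C11 atomics (the metrics layout and names are toy stand-ins, and the read-only/refcount handling of the real function is omitted):

/* Standalone sketch, not net/core code. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define N_METRICS 4

struct metrics { uint32_t vals[N_METRICS]; };

static _Atomic(struct metrics *) current_metrics;

/* Return a writable copy of @old installed as the current metrics, or NULL. */
static struct metrics *cow_metrics(struct metrics *old)
{
        struct metrics *p = malloc(sizeof(*p));
        struct metrics *expected = old;

        if (!p)
                return NULL;
        memcpy(p, old, sizeof(*p));             /* private, writable copy */

        /* Publish the copy only if nobody swapped the pointer meanwhile. */
        if (!atomic_compare_exchange_strong(&current_metrics, &expected, p)) {
                free(p);                        /* lost the race: discard our copy */
                return NULL;                    /* caller may retry or give up */
        }
        return p;
}

int main(void)
{
        struct metrics base = { .vals = { 1500, 0, 0, 0 } };
        struct metrics *writable;

        atomic_store(&current_metrics, &base);
        writable = cow_metrics(&base);
        if (writable) {
                writable->vals[0] = 1400;       /* safe: only we hold this copy */
                printf("mtu metric: %u\n",
                       atomic_load(&current_metrics)->vals[0]);
                free(writable);
        }
        return 0;
}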
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Generic address resolution entity
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *	Alexey Kuznetsov	<kuznet@ms2.inr.ac.ru>
 *
 *	Fixes:
 *	Vitaly E. Lavrov	releasing NULL neighbor in neigh_add.
 *	Harald Welte		Add neighbour cache statistics like rtstat
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/socket.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <linux/times.h>
#include <net/net_namespace.h>
#include <net/neighbour.h>
#include <net/arp.h>
#include <net/dst.h>
#include <net/ip.h>
#include <net/sock.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/random.h>
#include <linux/string.h>
#include <linux/log2.h>
#include <linux/inetdevice.h>
#include <net/addrconf.h>

#include <trace/events/neigh.h>

#define NEIGH_DEBUG 1
#define neigh_dbg(level, fmt, ...)		\
do {						\
	if (level <= NEIGH_DEBUG)		\
		pr_debug(fmt, ##__VA_ARGS__);	\
} while (0)

#define PNEIGH_HASHMASK		0xF

static void neigh_timer_handler(struct timer_list *t);
static void neigh_notify(struct neighbour *n, int type, int flags, u32 pid);
static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid);
static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
			  bool skip_perm);

#ifdef CONFIG_PROC_FS
static const struct seq_operations neigh_stat_seq_ops;
#endif

static struct hlist_head *neigh_get_dev_table(struct net_device *dev, int family)
{
	int i;

	switch (family) {
	default:
		DEBUG_NET_WARN_ON_ONCE(1);
		fallthrough; /* to avoid panic by null-ptr-deref */
	case AF_INET:
		i = NEIGH_ARP_TABLE;
		break;
	case AF_INET6:
		i = NEIGH_ND_TABLE;
		break;
	}

	return &dev->neighbours[i];
}

/*
   Neighbour hash table buckets are protected with tbl->lock.

   - All the scans/updates to hash buckets MUST be made under this lock.
   - NOTHING clever should be made under this lock: no callbacks
     to protocol backends, no attempts to send something to network.
     It will result in deadlocks, if backend/driver wants to use neighbour
     cache.
   - If the entry requires some non-trivial actions, increase
     its reference count and release table lock.

   Neighbour entries are protected:
   - with reference count.
   - with rwlock neigh->lock

   Reference count prevents destruction.

   neigh->lock mainly serializes ll address data and its validity state.
However, the same lock is used to protect another entry fields: - timer - resolution queue Again, nothing clever shall be made under neigh->lock, the most complicated procedure, which we allow is dev->hard_header. It is supposed, that dev->hard_header is simplistic and does not make callbacks to neighbour tables. */ static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb) { kfree_skb(skb); return -ENETDOWN; } static void neigh_cleanup_and_release(struct neighbour *neigh) { trace_neigh_cleanup_and_release(neigh, 0); neigh_notify(neigh, RTM_DELNEIGH, 0, 0); call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); neigh_release(neigh); } /* * It is random distribution in the interval (1/2)*base...(3/2)*base. * It corresponds to default IPv6 settings and is not overridable, * because it is really reasonable choice. */ unsigned long neigh_rand_reach_time(unsigned long base) { return base ? get_random_u32_below(base) + (base >> 1) : 0; } EXPORT_SYMBOL(neigh_rand_reach_time); static void neigh_mark_dead(struct neighbour *n) { n->dead = 1; if (!list_empty(&n->gc_list)) { list_del_init(&n->gc_list); atomic_dec(&n->tbl->gc_entries); } if (!list_empty(&n->managed_list)) list_del_init(&n->managed_list); } static void neigh_update_gc_list(struct neighbour *n) { bool on_gc_list, exempt_from_gc; spin_lock_bh(&n->tbl->lock); write_lock(&n->lock); if (n->dead) goto out; /* remove from the gc list if new state is permanent or if neighbor is * externally learned / validated; otherwise entry should be on the gc * list */ exempt_from_gc = n->nud_state & NUD_PERMANENT || n->flags & (NTF_EXT_LEARNED | NTF_EXT_VALIDATED); on_gc_list = !list_empty(&n->gc_list); if (exempt_from_gc && on_gc_list) { list_del_init(&n->gc_list); atomic_dec(&n->tbl->gc_entries); } else if (!exempt_from_gc && !on_gc_list) { /* add entries to the tail; cleaning removes from the front */ list_add_tail(&n->gc_list, &n->tbl->gc_list); atomic_inc(&n->tbl->gc_entries); } out: write_unlock(&n->lock); spin_unlock_bh(&n->tbl->lock); } static void neigh_update_managed_list(struct neighbour *n) { bool on_managed_list, add_to_managed; spin_lock_bh(&n->tbl->lock); write_lock(&n->lock); if (n->dead) goto out; add_to_managed = n->flags & NTF_MANAGED; on_managed_list = !list_empty(&n->managed_list); if (!add_to_managed && on_managed_list) list_del_init(&n->managed_list); else if (add_to_managed && !on_managed_list) list_add_tail(&n->managed_list, &n->tbl->managed_list); out: write_unlock(&n->lock); spin_unlock_bh(&n->tbl->lock); } static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, bool *gc_update, bool *managed_update) { u32 ndm_flags, old_flags = neigh->flags; if (!(flags & NEIGH_UPDATE_F_ADMIN)) return; ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0; ndm_flags |= (flags & NEIGH_UPDATE_F_MANAGED) ? NTF_MANAGED : 0; ndm_flags |= (flags & NEIGH_UPDATE_F_EXT_VALIDATED) ? 
NTF_EXT_VALIDATED : 0; if ((old_flags ^ ndm_flags) & NTF_EXT_LEARNED) { if (ndm_flags & NTF_EXT_LEARNED) neigh->flags |= NTF_EXT_LEARNED; else neigh->flags &= ~NTF_EXT_LEARNED; *notify = 1; *gc_update = true; } if ((old_flags ^ ndm_flags) & NTF_MANAGED) { if (ndm_flags & NTF_MANAGED) neigh->flags |= NTF_MANAGED; else neigh->flags &= ~NTF_MANAGED; *notify = 1; *managed_update = true; } if ((old_flags ^ ndm_flags) & NTF_EXT_VALIDATED) { if (ndm_flags & NTF_EXT_VALIDATED) neigh->flags |= NTF_EXT_VALIDATED; else neigh->flags &= ~NTF_EXT_VALIDATED; *notify = 1; *gc_update = true; } } bool neigh_remove_one(struct neighbour *n) { bool retval = false; write_lock(&n->lock); if (refcount_read(&n->refcnt) == 1) { hlist_del_rcu(&n->hash); hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); retval = true; } write_unlock(&n->lock); if (retval) neigh_cleanup_and_release(n); return retval; } static int neigh_forced_gc(struct neigh_table *tbl) { int max_clean = atomic_read(&tbl->gc_entries) - READ_ONCE(tbl->gc_thresh2); u64 tmax = ktime_get_ns() + NSEC_PER_MSEC; unsigned long tref = jiffies - 5 * HZ; struct neighbour *n, *tmp; int shrunk = 0; int loop = 0; NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); spin_lock_bh(&tbl->lock); list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) { if (refcount_read(&n->refcnt) == 1) { bool remove = false; write_lock(&n->lock); if ((n->nud_state == NUD_FAILED) || (n->nud_state == NUD_NOARP) || (tbl->is_multicast && tbl->is_multicast(n->primary_key)) || !time_in_range(n->updated, tref, jiffies)) remove = true; write_unlock(&n->lock); if (remove && neigh_remove_one(n)) shrunk++; if (shrunk >= max_clean) break; if (++loop == 16) { if (ktime_get_ns() > tmax) goto unlock; loop = 0; } } } WRITE_ONCE(tbl->last_flush, jiffies); unlock: spin_unlock_bh(&tbl->lock); return shrunk; } static void neigh_add_timer(struct neighbour *n, unsigned long when) { /* Use safe distance from the jiffies - LONG_MAX point while timer * is running in DELAY/PROBE state but still show to user space * large times in the past. 
*/ unsigned long mint = jiffies - (LONG_MAX - 86400 * HZ); neigh_hold(n); if (!time_in_range(n->confirmed, mint, jiffies)) n->confirmed = mint; if (time_before(n->used, n->confirmed)) n->used = n->confirmed; if (unlikely(mod_timer(&n->timer, when))) { printk("NEIGH: BUG, double timer add, state is %x\n", n->nud_state); dump_stack(); } } static int neigh_del_timer(struct neighbour *n) { if ((n->nud_state & NUD_IN_TIMER) && timer_delete(&n->timer)) { neigh_release(n); return 1; } return 0; } static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev, int family) { switch (family) { case AF_INET: return __in_dev_arp_parms_get_rcu(dev); case AF_INET6: return __in6_dev_nd_parms_get_rcu(dev); } return NULL; } static void neigh_parms_qlen_dec(struct net_device *dev, int family) { struct neigh_parms *p; rcu_read_lock(); p = neigh_get_dev_parms_rcu(dev, family); if (p) p->qlen--; rcu_read_unlock(); } static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net, int family) { struct sk_buff_head tmp; unsigned long flags; struct sk_buff *skb; skb_queue_head_init(&tmp); spin_lock_irqsave(&list->lock, flags); skb = skb_peek(list); while (skb != NULL) { struct sk_buff *skb_next = skb_peek_next(skb, list); struct net_device *dev = skb->dev; if (net == NULL || net_eq(dev_net(dev), net)) { neigh_parms_qlen_dec(dev, family); __skb_unlink(skb, list); __skb_queue_tail(&tmp, skb); } skb = skb_next; } spin_unlock_irqrestore(&list->lock, flags); while ((skb = __skb_dequeue(&tmp))) { dev_put(skb->dev); kfree_skb(skb); } } static void neigh_flush_one(struct neighbour *n) { hlist_del_rcu(&n->hash); hlist_del_rcu(&n->dev_list); write_lock(&n->lock); neigh_del_timer(n); neigh_mark_dead(n); if (refcount_read(&n->refcnt) != 1) { /* The most unpleasant situation. * We must destroy neighbour entry, * but someone still uses it. * * The destroy will be delayed until * the last user releases us, but * we must kill timers etc. and move * it to safe state. */ __skb_queue_purge(&n->arp_queue); n->arp_queue_len_bytes = 0; WRITE_ONCE(n->output, neigh_blackhole); if (n->nud_state & NUD_VALID) n->nud_state = NUD_NOARP; else n->nud_state = NUD_NONE; neigh_dbg(2, "neigh %p is stray\n", n); } write_unlock(&n->lock); neigh_cleanup_and_release(n); } static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { struct hlist_head *dev_head; struct hlist_node *tmp; struct neighbour *n; dev_head = neigh_get_dev_table(dev, tbl->family); hlist_for_each_entry_safe(n, tmp, dev_head, dev_list) { if (skip_perm && (n->nud_state & NUD_PERMANENT || n->flags & NTF_EXT_VALIDATED)) continue; neigh_flush_one(n); } } static void neigh_flush_table(struct neigh_table *tbl) { struct neigh_hash_table *nht; int i; nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (i = 0; i < (1 << nht->hash_shift); i++) { struct hlist_node *tmp; struct neighbour *n; neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) neigh_flush_one(n); } } void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev) { spin_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev, false); spin_unlock_bh(&tbl->lock); } EXPORT_SYMBOL(neigh_changeaddr); static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { spin_lock_bh(&tbl->lock); if (likely(dev)) { neigh_flush_dev(tbl, dev, skip_perm); } else { DEBUG_NET_WARN_ON_ONCE(skip_perm); neigh_flush_table(tbl); } spin_unlock_bh(&tbl->lock); pneigh_ifdown(tbl, dev, skip_perm); pneigh_queue_purge(&tbl->proxy_queue, dev ? 
dev_net(dev) : NULL, tbl->family); if (skb_queue_empty_lockless(&tbl->proxy_queue)) timer_delete_sync(&tbl->proxy_timer); return 0; } int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev) { __neigh_ifdown(tbl, dev, true); return 0; } EXPORT_SYMBOL(neigh_carrier_down); int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) { __neigh_ifdown(tbl, dev, false); return 0; } EXPORT_SYMBOL(neigh_ifdown); static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev, u32 flags, bool exempt_from_gc) { struct neighbour *n = NULL; unsigned long now = jiffies; int entries, gc_thresh3; if (exempt_from_gc) goto do_alloc; entries = atomic_inc_return(&tbl->gc_entries) - 1; gc_thresh3 = READ_ONCE(tbl->gc_thresh3); if (entries >= gc_thresh3 || (entries >= READ_ONCE(tbl->gc_thresh2) && time_after(now, READ_ONCE(tbl->last_flush) + 5 * HZ))) { if (!neigh_forced_gc(tbl) && entries >= gc_thresh3) { net_info_ratelimited("%s: neighbor table overflow!\n", tbl->id); NEIGH_CACHE_STAT_INC(tbl, table_fulls); goto out_entries; } } do_alloc: n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC); if (!n) goto out_entries; __skb_queue_head_init(&n->arp_queue); rwlock_init(&n->lock); seqlock_init(&n->ha_lock); n->updated = n->used = now; n->nud_state = NUD_NONE; n->output = neigh_blackhole; n->flags = flags; seqlock_init(&n->hh.hh_lock); n->parms = neigh_parms_clone(&tbl->parms); timer_setup(&n->timer, neigh_timer_handler, 0); NEIGH_CACHE_STAT_INC(tbl, allocs); n->tbl = tbl; refcount_set(&n->refcnt, 1); n->dead = 1; INIT_LIST_HEAD(&n->gc_list); INIT_LIST_HEAD(&n->managed_list); atomic_inc(&tbl->entries); out: return n; out_entries: if (!exempt_from_gc) atomic_dec(&tbl->gc_entries); goto out; } static void neigh_get_hash_rnd(u32 *x) { *x = get_random_u32() | 1; } static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) { size_t size = (1 << shift) * sizeof(struct hlist_head); struct hlist_head *hash_heads; struct neigh_hash_table *ret; int i; ret = kmalloc_obj(*ret, GFP_ATOMIC); if (!ret) return NULL; hash_heads = kzalloc(size, GFP_ATOMIC); if (!hash_heads) { kfree(ret); return NULL; } ret->hash_heads = hash_heads; ret->hash_shift = shift; for (i = 0; i < NEIGH_NUM_HASH_RND; i++) neigh_get_hash_rnd(&ret->hash_rnd[i]); return ret; } static void neigh_hash_free_rcu(struct rcu_head *head) { struct neigh_hash_table *nht = container_of(head, struct neigh_hash_table, rcu); kfree(nht->hash_heads); kfree(nht); } static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, unsigned long new_shift) { unsigned int i, hash; struct neigh_hash_table *new_nht, *old_nht; NEIGH_CACHE_STAT_INC(tbl, hash_grows); old_nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); new_nht = neigh_hash_alloc(new_shift); if (!new_nht) return old_nht; for (i = 0; i < (1 << old_nht->hash_shift); i++) { struct hlist_node *tmp; struct neighbour *n; neigh_for_each_in_bucket_safe(n, tmp, &old_nht->hash_heads[i]) { hash = tbl->hash(n->primary_key, n->dev, new_nht->hash_rnd); hash >>= (32 - new_nht->hash_shift); hlist_del_rcu(&n->hash); hlist_add_head_rcu(&n->hash, &new_nht->hash_heads[hash]); } } rcu_assign_pointer(tbl->nht, new_nht); call_rcu(&old_nht->rcu, neigh_hash_free_rcu); return new_nht; } struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { struct neighbour *n; NEIGH_CACHE_STAT_INC(tbl, lookups); rcu_read_lock(); n = __neigh_lookup_noref(tbl, pkey, dev); if (n) { if (!refcount_inc_not_zero(&n->refcnt)) n = 
NULL; NEIGH_CACHE_STAT_INC(tbl, hits); } rcu_read_unlock(); return n; } EXPORT_SYMBOL(neigh_lookup); static struct neighbour * ___neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, u32 flags, bool exempt_from_gc, bool want_ref) { u32 hash_val, key_len = tbl->key_len; struct neighbour *n1, *rc, *n; struct neigh_hash_table *nht; int error; n = neigh_alloc(tbl, dev, flags, exempt_from_gc); trace_neigh_create(tbl, dev, pkey, n, exempt_from_gc); if (!n) { rc = ERR_PTR(-ENOBUFS); goto out; } memcpy(n->primary_key, pkey, key_len); n->dev = dev; netdev_hold(dev, &n->dev_tracker, GFP_ATOMIC); /* Protocol specific setup. */ if (tbl->constructor && (error = tbl->constructor(n)) < 0) { rc = ERR_PTR(error); goto out_neigh_release; } if (dev->netdev_ops->ndo_neigh_construct) { error = dev->netdev_ops->ndo_neigh_construct(dev, n); if (error < 0) { rc = ERR_PTR(error); goto out_neigh_release; } } /* Device specific setup. */ if (n->parms->neigh_setup && (error = n->parms->neigh_setup(n)) < 0) { rc = ERR_PTR(error); goto out_neigh_release; } n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1); spin_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); if (atomic_read(&tbl->entries) > (1 << nht->hash_shift)) nht = neigh_hash_grow(tbl, nht->hash_shift + 1); hash_val = tbl->hash(n->primary_key, dev, nht->hash_rnd) >> (32 - nht->hash_shift); if (n->parms->dead) { rc = ERR_PTR(-EINVAL); goto out_tbl_unlock; } neigh_for_each_in_bucket(n1, &nht->hash_heads[hash_val]) { if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) { if (want_ref) neigh_hold(n1); rc = n1; goto out_tbl_unlock; } } n->dead = 0; if (!exempt_from_gc) list_add_tail(&n->gc_list, &n->tbl->gc_list); if (n->flags & NTF_MANAGED) list_add_tail(&n->managed_list, &n->tbl->managed_list); if (want_ref) neigh_hold(n); hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]); hlist_add_head_rcu(&n->dev_list, neigh_get_dev_table(dev, tbl->family)); spin_unlock_bh(&tbl->lock); neigh_dbg(2, "neigh %p is created\n", n); rc = n; out: return rc; out_tbl_unlock: spin_unlock_bh(&tbl->lock); out_neigh_release: if (!exempt_from_gc) atomic_dec(&tbl->gc_entries); neigh_release(n); goto out; } struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref) { bool exempt_from_gc = !!(dev->flags & IFF_LOOPBACK); return ___neigh_create(tbl, pkey, dev, 0, exempt_from_gc, want_ref); } EXPORT_SYMBOL(__neigh_create); static u32 pneigh_hash(const void *pkey, unsigned int key_len) { u32 hash_val = *(u32 *)(pkey + key_len - 4); hash_val ^= (hash_val >> 16); hash_val ^= hash_val >> 8; hash_val ^= hash_val >> 4; hash_val &= PNEIGH_HASHMASK; return hash_val; } struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { struct pneigh_entry *n; unsigned int key_len; u32 hash_val; key_len = tbl->key_len; hash_val = pneigh_hash(pkey, key_len); n = rcu_dereference_check(tbl->phash_buckets[hash_val], lockdep_is_held(&tbl->phash_lock)); while (n) { if (!memcmp(n->key, pkey, key_len) && net_eq(pneigh_net(n), net) && (n->dev == dev || !n->dev)) return n; n = rcu_dereference_check(n->next, lockdep_is_held(&tbl->phash_lock)); } return NULL; } int pneigh_create(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev, u32 flags, u8 protocol, bool permanent) { struct pneigh_entry *n; unsigned int key_len; u32 hash_val; int err = 0; 
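	/* Proxy (pneigh) entries serve proxy ARP/ND.  They live in a small,
	 * separate hash table keyed on the protocol address; the insert and
	 * update paths below are serialized by tbl->phash_lock, while readers
	 * traverse the chains under RCU.  If the entry already exists, only
	 * its flags, permanence and protocol are refreshed under the same mutex.
	 */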
mutex_lock(&tbl->phash_lock); n = pneigh_lookup(tbl, net, pkey, dev); if (n) goto update; key_len = tbl->key_len; n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL); if (!n) { err = -ENOBUFS; goto out; } write_pnet(&n->net, net); memcpy(n->key, pkey, key_len); n->dev = dev; netdev_hold(dev, &n->dev_tracker, GFP_KERNEL); if (tbl->pconstructor && tbl->pconstructor(n)) { netdev_put(dev, &n->dev_tracker); kfree(n); err = -ENOBUFS; goto out; } hash_val = pneigh_hash(pkey, key_len); n->next = tbl->phash_buckets[hash_val]; rcu_assign_pointer(tbl->phash_buckets[hash_val], n); update: WRITE_ONCE(n->flags, flags); n->permanent = permanent; if (protocol) WRITE_ONCE(n->protocol, protocol); out: mutex_unlock(&tbl->phash_lock); return err; } static void pneigh_destroy(struct rcu_head *rcu) { struct pneigh_entry *n = container_of(rcu, struct pneigh_entry, rcu); netdev_put(n->dev, &n->dev_tracker); kfree(n); } int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { struct pneigh_entry *n, __rcu **np; unsigned int key_len; u32 hash_val; key_len = tbl->key_len; hash_val = pneigh_hash(pkey, key_len); mutex_lock(&tbl->phash_lock); for (np = &tbl->phash_buckets[hash_val]; (n = rcu_dereference_protected(*np, 1)) != NULL; np = &n->next) { if (!memcmp(n->key, pkey, key_len) && n->dev == dev && net_eq(pneigh_net(n), net)) { rcu_assign_pointer(*np, n->next); mutex_unlock(&tbl->phash_lock); if (tbl->pdestructor) tbl->pdestructor(n); call_rcu(&n->rcu, pneigh_destroy); return 0; } } mutex_unlock(&tbl->phash_lock); return -ENOENT; } static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { struct pneigh_entry *n, __rcu **np; LIST_HEAD(head); u32 h; mutex_lock(&tbl->phash_lock); for (h = 0; h <= PNEIGH_HASHMASK; h++) { np = &tbl->phash_buckets[h]; while ((n = rcu_dereference_protected(*np, 1)) != NULL) { if (skip_perm && n->permanent) goto skip; if (!dev || n->dev == dev) { rcu_assign_pointer(*np, n->next); list_add(&n->free_node, &head); continue; } skip: np = &n->next; } } mutex_unlock(&tbl->phash_lock); while (!list_empty(&head)) { n = list_first_entry(&head, typeof(*n), free_node); list_del(&n->free_node); if (tbl->pdestructor) tbl->pdestructor(n); call_rcu(&n->rcu, pneigh_destroy); } } static inline void neigh_parms_put(struct neigh_parms *parms) { if (refcount_dec_and_test(&parms->refcnt)) kfree(parms); } /* * neighbour must already be out of the table; * */ void neigh_destroy(struct neighbour *neigh) { struct net_device *dev = neigh->dev; NEIGH_CACHE_STAT_INC(neigh->tbl, destroys); if (!neigh->dead) { pr_warn("Destroying alive neighbour %p\n", neigh); dump_stack(); return; } if (neigh_del_timer(neigh)) pr_warn("Impossible event\n"); write_lock_bh(&neigh->lock); __skb_queue_purge(&neigh->arp_queue); write_unlock_bh(&neigh->lock); neigh->arp_queue_len_bytes = 0; if (dev->netdev_ops->ndo_neigh_destroy) dev->netdev_ops->ndo_neigh_destroy(dev, neigh); netdev_put(dev, &neigh->dev_tracker); neigh_parms_put(neigh->parms); neigh_dbg(2, "neigh %p is destroyed\n", neigh); atomic_dec(&neigh->tbl->entries); kfree_rcu(neigh, rcu); } EXPORT_SYMBOL(neigh_destroy); /* Neighbour state is suspicious; disable fast path. Called with write_locked neigh. */ static void neigh_suspect(struct neighbour *neigh) { neigh_dbg(2, "neigh %p is suspected\n", neigh); WRITE_ONCE(neigh->output, neigh->ops->output); } /* Neighbour state is OK; enable fast path. Called with write_locked neigh. 
*/ static void neigh_connect(struct neighbour *neigh) { neigh_dbg(2, "neigh %p is connected\n", neigh); WRITE_ONCE(neigh->output, neigh->ops->connected_output); } static void neigh_periodic_work(struct work_struct *work) { struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); struct neigh_hash_table *nht; struct hlist_node *tmp; struct neighbour *n; unsigned int i; NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); spin_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); /* * periodically recompute ReachableTime from random function */ if (time_after(jiffies, tbl->last_rand + 300 * HZ)) { struct neigh_parms *p; WRITE_ONCE(tbl->last_rand, jiffies); list_for_each_entry(p, &tbl->parms_list, list) neigh_set_reach_time(p); } if (atomic_read(&tbl->entries) < READ_ONCE(tbl->gc_thresh1)) goto out; for (i = 0 ; i < (1 << nht->hash_shift); i++) { neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) { unsigned int state; write_lock(&n->lock); state = n->nud_state; if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) || (n->flags & (NTF_EXT_LEARNED | NTF_EXT_VALIDATED))) { write_unlock(&n->lock); continue; } if (time_before(n->used, n->confirmed) && time_is_before_eq_jiffies(n->confirmed)) n->used = n->confirmed; if (refcount_read(&n->refcnt) == 1 && (state == NUD_FAILED || !time_in_range_open(jiffies, n->used, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { hlist_del_rcu(&n->hash); hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); write_unlock(&n->lock); neigh_cleanup_and_release(n); continue; } write_unlock(&n->lock); } /* * It's fine to release lock here, even if hash table * grows while we are preempted. */ spin_unlock_bh(&tbl->lock); cond_resched(); spin_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); } out: /* Cycle through all hash buckets every BASE_REACHABLE_TIME/2 ticks. * ARP entry timeouts range from 1/2 BASE_REACHABLE_TIME to 3/2 * BASE_REACHABLE_TIME. */ queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1); spin_unlock_bh(&tbl->lock); } static __inline__ int neigh_max_probes(struct neighbour *n) { struct neigh_parms *p = n->parms; return NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES) + (n->nud_state & NUD_PROBE ? NEIGH_VAR(p, MCAST_REPROBES) : NEIGH_VAR(p, MCAST_PROBES)); } static void neigh_invalidate(struct neighbour *neigh) __releases(neigh->lock) __acquires(neigh->lock) { struct sk_buff *skb; NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed); neigh_dbg(2, "neigh %p is failed\n", neigh); neigh->updated = jiffies; /* It is very thin place. report_unreachable is very complicated routine. Particularly, it can hit the same neighbour entry! So that, we try to be accurate and avoid dead loop. --ANK */ while (neigh->nud_state == NUD_FAILED && (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { write_unlock(&neigh->lock); neigh->ops->error_report(neigh, skb); write_lock(&neigh->lock); } __skb_queue_purge(&neigh->arp_queue); neigh->arp_queue_len_bytes = 0; } static void neigh_probe(struct neighbour *neigh) __releases(neigh->lock) { struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue); /* keep skb alive even if arp_queue overflows */ if (skb) skb = skb_clone(skb, GFP_ATOMIC); write_unlock(&neigh->lock); if (neigh->ops->solicit) neigh->ops->solicit(neigh, skb); atomic_inc(&neigh->probes); consume_skb(skb); } /* Called when a timer expires for a neighbour entry. 
*/ static void neigh_timer_handler(struct timer_list *t) { unsigned long now, next; struct neighbour *neigh = timer_container_of(neigh, t, timer); bool skip_probe = false; unsigned int state; int notify = 0; write_lock(&neigh->lock); state = neigh->nud_state; now = jiffies; next = now + HZ; if (!(state & NUD_IN_TIMER)) goto out; if (state & NUD_REACHABLE) { if (time_before_eq(now, neigh->confirmed + neigh->parms->reachable_time)) { neigh_dbg(2, "neigh %p is still alive\n", neigh); next = neigh->confirmed + neigh->parms->reachable_time; } else if (time_before_eq(now, neigh->used + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) { neigh_dbg(2, "neigh %p is delayed\n", neigh); WRITE_ONCE(neigh->nud_state, NUD_DELAY); neigh->updated = jiffies; neigh_suspect(neigh); next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME); } else { neigh_dbg(2, "neigh %p is suspected\n", neigh); WRITE_ONCE(neigh->nud_state, NUD_STALE); neigh->updated = jiffies; neigh_suspect(neigh); notify = 1; } } else if (state & NUD_DELAY) { if (time_before_eq(now, neigh->confirmed + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) { neigh_dbg(2, "neigh %p is now reachable\n", neigh); WRITE_ONCE(neigh->nud_state, NUD_REACHABLE); neigh->updated = jiffies; neigh_connect(neigh); notify = 1; next = neigh->confirmed + neigh->parms->reachable_time; } else { neigh_dbg(2, "neigh %p is probed\n", neigh); WRITE_ONCE(neigh->nud_state, NUD_PROBE); neigh->updated = jiffies; atomic_set(&neigh->probes, 0); notify = 1; next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100); } } else { /* NUD_PROBE|NUD_INCOMPLETE */ next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100); } if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) { if (neigh->nud_state == NUD_PROBE && neigh->flags & NTF_EXT_VALIDATED) { WRITE_ONCE(neigh->nud_state, NUD_STALE); neigh->updated = jiffies; } else { WRITE_ONCE(neigh->nud_state, NUD_FAILED); neigh_invalidate(neigh); } notify = 1; skip_probe = true; } if (notify) __neigh_notify(neigh, RTM_NEWNEIGH, 0, 0); if (skip_probe) goto out; if (neigh->nud_state & NUD_IN_TIMER) { if (time_before(next, jiffies + HZ/100)) next = jiffies + HZ/100; if (!mod_timer(&neigh->timer, next)) neigh_hold(neigh); } if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) { neigh_probe(neigh); } else { out: write_unlock(&neigh->lock); } if (notify) call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); trace_neigh_timer_handler(neigh, 0); neigh_release(neigh); } int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, const bool immediate_ok) { int rc; bool immediate_probe = false; write_lock_bh(&neigh->lock); rc = 0; if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)) goto out_unlock_bh; if (neigh->dead) goto out_dead; if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { if (NEIGH_VAR(neigh->parms, MCAST_PROBES) + NEIGH_VAR(neigh->parms, APP_PROBES)) { unsigned long next, now = jiffies; atomic_set(&neigh->probes, NEIGH_VAR(neigh->parms, UCAST_PROBES)); neigh_del_timer(neigh); WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE); neigh->updated = now; if (!immediate_ok) { next = now + 1; } else { immediate_probe = true; next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ / 100); } neigh_add_timer(neigh, next); } else { WRITE_ONCE(neigh->nud_state, NUD_FAILED); neigh->updated = jiffies; write_unlock_bh(&neigh->lock); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED); return 1; } } else if (neigh->nud_state & NUD_STALE) { neigh_dbg(2, "neigh %p is 
delayed\n", neigh); neigh_del_timer(neigh); WRITE_ONCE(neigh->nud_state, NUD_DELAY); neigh->updated = jiffies; neigh_add_timer(neigh, jiffies + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME)); } if (neigh->nud_state == NUD_INCOMPLETE) { if (skb) { while (neigh->arp_queue_len_bytes + skb->truesize > NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) { struct sk_buff *buff; buff = __skb_dequeue(&neigh->arp_queue); if (!buff) break; neigh->arp_queue_len_bytes -= buff->truesize; kfree_skb_reason(buff, SKB_DROP_REASON_NEIGH_QUEUEFULL); NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards); } skb_dst_force(skb); __skb_queue_tail(&neigh->arp_queue, skb); neigh->arp_queue_len_bytes += skb->truesize; } rc = 1; } out_unlock_bh: if (immediate_probe) neigh_probe(neigh); else write_unlock(&neigh->lock); local_bh_enable(); trace_neigh_event_send_done(neigh, rc); return rc; out_dead: if (neigh->nud_state & NUD_STALE) goto out_unlock_bh; write_unlock_bh(&neigh->lock); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_DEAD); trace_neigh_event_send_dead(neigh, 1); return 1; } EXPORT_SYMBOL(__neigh_event_send); static void neigh_update_hhs(struct neighbour *neigh) { struct hh_cache *hh; void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *) = NULL; if (neigh->dev->header_ops) update = neigh->dev->header_ops->cache_update; if (update) { hh = &neigh->hh; if (READ_ONCE(hh->hh_len)) { write_seqlock_bh(&hh->hh_lock); update(hh, neigh->dev, neigh->ha); write_sequnlock_bh(&hh->hh_lock); } } } static void neigh_update_process_arp_queue(struct neighbour *neigh) __releases(neigh->lock) __acquires(neigh->lock) { struct sk_buff *skb; /* Again: avoid deadlock if something went wrong. */ while (neigh->nud_state & NUD_VALID && (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { struct dst_entry *dst = skb_dst(skb); struct neighbour *n2, *n1 = neigh; write_unlock_bh(&neigh->lock); rcu_read_lock(); /* Why not just use 'neigh' as-is? The problem is that * things such as shaper, eql, and sch_teql can end up * using alternative, different, neigh objects to output * the packet in the output path. So what we need to do * here is re-lookup the top-level neigh in the path so * we can reinject the packet there. */ n2 = NULL; if (dst && READ_ONCE(dst->obsolete) != DST_OBSOLETE_DEAD) { n2 = dst_neigh_lookup_skb(dst, skb); if (n2) n1 = n2; } READ_ONCE(n1->output)(n1, skb); if (n2) neigh_release(n2); rcu_read_unlock(); write_lock_bh(&neigh->lock); } __skb_queue_purge(&neigh->arp_queue); neigh->arp_queue_len_bytes = 0; } /* Generic update routine. -- lladdr is new lladdr or NULL, if it is not supplied. -- new is new state. -- flags NEIGH_UPDATE_F_OVERRIDE allows to override existing lladdr, if it is different. NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected" lladdr instead of overriding it if it is different. NEIGH_UPDATE_F_ADMIN means that the change is administrative. NEIGH_UPDATE_F_USE means that the entry is user triggered. NEIGH_UPDATE_F_MANAGED means that the entry will be auto-refreshed. NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing NTF_ROUTER flag. NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as a router. NEIGH_UPDATE_F_EXT_VALIDATED means that the entry will not be removed or invalidated. Caller MUST hold reference count on the entry. 
*/ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid, struct netlink_ext_ack *extack) { bool gc_update = false, managed_update = false; bool process_arp_queue = false; int update_isrouter = 0; struct net_device *dev; int err, notify = 0; u8 old; trace_neigh_update(neigh, lladdr, new, flags, nlmsg_pid); write_lock_bh(&neigh->lock); dev = neigh->dev; old = neigh->nud_state; err = -EPERM; if (neigh->dead) { NL_SET_ERR_MSG(extack, "Neighbor entry is now dead"); new = old; goto out; } if (!(flags & NEIGH_UPDATE_F_ADMIN) && (old & (NUD_NOARP | NUD_PERMANENT))) goto out; neigh_update_flags(neigh, flags, ¬ify, &gc_update, &managed_update); if (flags & (NEIGH_UPDATE_F_USE | NEIGH_UPDATE_F_MANAGED)) { new = old & ~NUD_PERMANENT; WRITE_ONCE(neigh->nud_state, new); err = 0; goto out; } if (!(new & NUD_VALID)) { neigh_del_timer(neigh); if (old & NUD_CONNECTED) neigh_suspect(neigh); WRITE_ONCE(neigh->nud_state, new); err = 0; notify = old & NUD_VALID; if ((old & (NUD_INCOMPLETE | NUD_PROBE)) && (new & NUD_FAILED)) { neigh_invalidate(neigh); notify = 1; } goto out; } /* Compare new lladdr with cached one */ if (!dev->addr_len) { /* First case: device needs no address. */ lladdr = neigh->ha; } else if (lladdr) { /* The second case: if something is already cached and a new address is proposed: - compare new & old - if they are different, check override flag */ if ((old & NUD_VALID) && !memcmp(lladdr, neigh->ha, dev->addr_len)) lladdr = neigh->ha; } else { /* No address is supplied; if we know something, use it, otherwise discard the request. */ err = -EINVAL; if (!(old & NUD_VALID)) { NL_SET_ERR_MSG(extack, "No link layer address given"); goto out; } lladdr = neigh->ha; } /* Update confirmed timestamp for neighbour entry after we * received ARP packet even if it doesn't change IP to MAC binding. */ if (new & NUD_CONNECTED) neigh->confirmed = jiffies; /* If entry was valid and address is not changed, do not change entry state, if new one is STALE. */ err = 0; update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER; if (old & NUD_VALID) { if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) { update_isrouter = 0; if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) && (old & NUD_CONNECTED)) { lladdr = neigh->ha; new = NUD_STALE; } else goto out; } else { if (lladdr == neigh->ha && new == NUD_STALE && !(flags & NEIGH_UPDATE_F_ADMIN)) new = old; } } /* Update timestamp only once we know we will make a change to the * neighbour entry. Otherwise we risk to move the locktime window with * noop updates and ignore relevant ARP updates. */ if (new != old || lladdr != neigh->ha) neigh->updated = jiffies; if (new != old) { neigh_del_timer(neigh); if (new & NUD_PROBE) atomic_set(&neigh->probes, 0); if (new & NUD_IN_TIMER) neigh_add_timer(neigh, (jiffies + ((new & NUD_REACHABLE) ? 
neigh->parms->reachable_time : 0))); WRITE_ONCE(neigh->nud_state, new); notify = 1; } if (lladdr != neigh->ha) { write_seqlock(&neigh->ha_lock); memcpy(&neigh->ha, lladdr, dev->addr_len); write_sequnlock(&neigh->ha_lock); neigh_update_hhs(neigh); if (!(new & NUD_CONNECTED)) neigh->confirmed = jiffies - (NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1); notify = 1; } if (new == old) goto out; if (new & NUD_CONNECTED) neigh_connect(neigh); else neigh_suspect(neigh); if (!(old & NUD_VALID)) process_arp_queue = true; out: if (update_isrouter) neigh_update_is_router(neigh, flags, ¬ify); if (notify) __neigh_notify(neigh, RTM_NEWNEIGH, 0, nlmsg_pid); if (process_arp_queue) neigh_update_process_arp_queue(neigh); write_unlock_bh(&neigh->lock); if (((new ^ old) & NUD_PERMANENT) || gc_update) neigh_update_gc_list(neigh); if (managed_update) neigh_update_managed_list(neigh); if (notify) call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); trace_neigh_update_done(neigh, err); return err; } int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid) { return __neigh_update(neigh, lladdr, new, flags, nlmsg_pid, NULL); } EXPORT_SYMBOL(neigh_update); /* Update the neigh to listen temporarily for probe responses, even if it is * in a NUD_FAILED state. The caller has to hold neigh->lock for writing. */ void __neigh_set_probe_once(struct neighbour *neigh) { if (neigh->dead) return; neigh->updated = jiffies; if (!(neigh->nud_state & NUD_FAILED)) return; WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE); atomic_set(&neigh->probes, neigh_max_probes(neigh)); neigh_add_timer(neigh, jiffies + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100)); } EXPORT_SYMBOL(__neigh_set_probe_once); struct neighbour *neigh_event_ns(struct neigh_table *tbl, u8 *lladdr, void *saddr, struct net_device *dev) { struct neighbour *neigh = __neigh_lookup(tbl, saddr, dev, lladdr || !dev->addr_len); if (neigh) neigh_update(neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_OVERRIDE, 0); return neigh; } EXPORT_SYMBOL(neigh_event_ns); /* called with read_lock_bh(&n->lock); */ static void neigh_hh_init(struct neighbour *n) { struct net_device *dev = n->dev; __be16 prot = n->tbl->protocol; struct hh_cache *hh = &n->hh; write_lock_bh(&n->lock); /* Only one thread can come in here and initialize the * hh_cache entry. */ if (!hh->hh_len) dev->header_ops->cache(n, hh, prot); write_unlock_bh(&n->lock); } /* Slow and careful. 
*/ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) { int rc = 0; if (!neigh_event_send(neigh, skb)) { int err; struct net_device *dev = neigh->dev; unsigned int seq; if (dev->header_ops->cache && !READ_ONCE(neigh->hh.hh_len)) neigh_hh_init(neigh); do { __skb_pull(skb, skb_network_offset(skb)); seq = read_seqbegin(&neigh->ha_lock); err = dev_hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); } while (read_seqretry(&neigh->ha_lock, seq)); if (err >= 0) rc = dev_queue_xmit(skb); else goto out_kfree_skb; } out: return rc; out_kfree_skb: rc = -EINVAL; kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL); goto out; } EXPORT_SYMBOL(neigh_resolve_output); /* As fast as possible without hh cache */ int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb) { struct net_device *dev = neigh->dev; unsigned int seq; int err; do { __skb_pull(skb, skb_network_offset(skb)); seq = read_seqbegin(&neigh->ha_lock); err = dev_hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); } while (read_seqretry(&neigh->ha_lock, seq)); if (err >= 0) err = dev_queue_xmit(skb); else { err = -EINVAL; kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL); } return err; } EXPORT_SYMBOL(neigh_connected_output); int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb) { return dev_queue_xmit(skb); } EXPORT_SYMBOL(neigh_direct_output); static void neigh_managed_work(struct work_struct *work) { struct neigh_table *tbl = container_of(work, struct neigh_table, managed_work.work); struct neighbour *neigh; spin_lock_bh(&tbl->lock); list_for_each_entry(neigh, &tbl->managed_list, managed_list) neigh_event_send_probe(neigh, NULL, false); queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, NEIGH_VAR(&tbl->parms, INTERVAL_PROBE_TIME_MS)); spin_unlock_bh(&tbl->lock); } static void neigh_proxy_process(struct timer_list *t) { struct neigh_table *tbl = timer_container_of(tbl, t, proxy_timer); long sched_next = 0; unsigned long now = jiffies; struct sk_buff *skb, *n; spin_lock(&tbl->proxy_queue.lock); skb_queue_walk_safe(&tbl->proxy_queue, skb, n) { long tdif = NEIGH_CB(skb)->sched_next - now; if (tdif <= 0) { struct net_device *dev = skb->dev; neigh_parms_qlen_dec(dev, tbl->family); __skb_unlink(skb, &tbl->proxy_queue); if (tbl->proxy_redo && netif_running(dev)) { rcu_read_lock(); tbl->proxy_redo(skb); rcu_read_unlock(); } else { kfree_skb(skb); } dev_put(dev); } else if (!sched_next || tdif < sched_next) sched_next = tdif; } timer_delete(&tbl->proxy_timer); if (sched_next) mod_timer(&tbl->proxy_timer, jiffies + sched_next); spin_unlock(&tbl->proxy_queue.lock); } static unsigned long neigh_proxy_delay(struct neigh_parms *p) { /* If proxy_delay is zero, do not call get_random_u32_below() * as it is undefined behavior. */ unsigned long proxy_delay = NEIGH_VAR(p, PROXY_DELAY); return proxy_delay ? 
jiffies + get_random_u32_below(proxy_delay) : jiffies; } void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb) { unsigned long sched_next = neigh_proxy_delay(p); if (p->qlen > NEIGH_VAR(p, PROXY_QLEN)) { kfree_skb(skb); return; } NEIGH_CB(skb)->sched_next = sched_next; NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED; spin_lock(&tbl->proxy_queue.lock); if (timer_delete(&tbl->proxy_timer)) { if (time_before(tbl->proxy_timer.expires, sched_next)) sched_next = tbl->proxy_timer.expires; } skb_dst_drop(skb); dev_hold(skb->dev); __skb_queue_tail(&tbl->proxy_queue, skb); p->qlen++; mod_timer(&tbl->proxy_timer, sched_next); spin_unlock(&tbl->proxy_queue.lock); } EXPORT_SYMBOL(pneigh_enqueue); static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl, struct net *net, int ifindex) { struct neigh_parms *p; list_for_each_entry(p, &tbl->parms_list, list) { if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) || (!p->dev && !ifindex && net_eq(net, &init_net))) return p; } return NULL; } struct neigh_parms *neigh_parms_alloc(struct net_device *dev, struct neigh_table *tbl) { struct neigh_parms *p; struct net *net = dev_net(dev); const struct net_device_ops *ops = dev->netdev_ops; p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL); if (p) { p->tbl = tbl; refcount_set(&p->refcnt, 1); neigh_set_reach_time(p); p->qlen = 0; netdev_hold(dev, &p->dev_tracker, GFP_KERNEL); p->dev = dev; write_pnet(&p->net, net); p->sysctl_table = NULL; if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) { netdev_put(dev, &p->dev_tracker); kfree(p); return NULL; } spin_lock_bh(&tbl->lock); list_add_rcu(&p->list, &tbl->parms.list); spin_unlock_bh(&tbl->lock); neigh_parms_data_state_cleanall(p); } return p; } EXPORT_SYMBOL(neigh_parms_alloc); static void neigh_rcu_free_parms(struct rcu_head *head) { struct neigh_parms *parms = container_of(head, struct neigh_parms, rcu_head); neigh_parms_put(parms); } void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) { if (!parms || parms == &tbl->parms) return; spin_lock_bh(&tbl->lock); list_del_rcu(&parms->list); parms->dead = 1; spin_unlock_bh(&tbl->lock); netdev_put(parms->dev, &parms->dev_tracker); call_rcu(&parms->rcu_head, neigh_rcu_free_parms); } EXPORT_SYMBOL(neigh_parms_release); static struct lock_class_key neigh_table_proxy_queue_class; static struct neigh_table __rcu *neigh_tables[NEIGH_NR_TABLES] __read_mostly; void neigh_table_init(int index, struct neigh_table *tbl) { unsigned long now = jiffies; unsigned long phsize; INIT_LIST_HEAD(&tbl->parms_list); INIT_LIST_HEAD(&tbl->gc_list); INIT_LIST_HEAD(&tbl->managed_list); list_add(&tbl->parms.list, &tbl->parms_list); write_pnet(&tbl->parms.net, &init_net); refcount_set(&tbl->parms.refcnt, 1); neigh_set_reach_time(&tbl->parms); tbl->parms.qlen = 0; tbl->stats = alloc_percpu(struct neigh_statistics); if (!tbl->stats) panic("cannot create neighbour cache statistics"); #ifdef CONFIG_PROC_FS if (!proc_create_seq_data(tbl->id, 0, init_net.proc_net_stat, &neigh_stat_seq_ops, tbl)) panic("cannot create neighbour proc dir entry"); #endif RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(3)); phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *); tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL); if (!tbl->nht || !tbl->phash_buckets) panic("cannot allocate neighbour cache hashes"); if (!tbl->entry_size) tbl->entry_size = ALIGN(offsetof(struct neighbour, primary_key) + tbl->key_len, NEIGH_PRIV_ALIGN); else WARN_ON(tbl->entry_size % 
NEIGH_PRIV_ALIGN); spin_lock_init(&tbl->lock); mutex_init(&tbl->phash_lock); INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, tbl->parms.reachable_time); INIT_DEFERRABLE_WORK(&tbl->managed_work, neigh_managed_work); queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, 0); timer_setup(&tbl->proxy_timer, neigh_proxy_process, 0); skb_queue_head_init_class(&tbl->proxy_queue, &neigh_table_proxy_queue_class); tbl->last_flush = now; tbl->last_rand = now + tbl->parms.reachable_time * 20; rcu_assign_pointer(neigh_tables[index], tbl); } EXPORT_SYMBOL(neigh_table_init); /* * Only called from ndisc_cleanup(), which means this is dead code * because we no longer can unload IPv6 module. */ int neigh_table_clear(int index, struct neigh_table *tbl) { RCU_INIT_POINTER(neigh_tables[index], NULL); synchronize_rcu(); /* It is not clean... Fix it to unload IPv6 module safely */ cancel_delayed_work_sync(&tbl->managed_work); cancel_delayed_work_sync(&tbl->gc_work); timer_delete_sync(&tbl->proxy_timer); pneigh_queue_purge(&tbl->proxy_queue, NULL, tbl->family); neigh_ifdown(tbl, NULL); if (atomic_read(&tbl->entries)) pr_crit("neighbour leakage\n"); call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu, neigh_hash_free_rcu); tbl->nht = NULL; kfree(tbl->phash_buckets); tbl->phash_buckets = NULL; remove_proc_entry(tbl->id, init_net.proc_net_stat); free_percpu(tbl->stats); tbl->stats = NULL; return 0; } EXPORT_SYMBOL(neigh_table_clear); static struct neigh_table *neigh_find_table(int family) { struct neigh_table *tbl = NULL; switch (family) { case AF_INET: tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ARP_TABLE]); break; case AF_INET6: tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ND_TABLE]); break; } return tbl; } const struct nla_policy nda_policy[NDA_MAX+1] = { [NDA_UNSPEC] = { .strict_start_type = NDA_NH_ID }, [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) }, [NDA_PROBES] = { .type = NLA_U32 }, [NDA_VLAN] = { .type = NLA_U16 }, [NDA_PORT] = { .type = NLA_U16 }, [NDA_VNI] = { .type = NLA_U32 }, [NDA_IFINDEX] = { .type = NLA_U32 }, [NDA_MASTER] = { .type = NLA_U32 }, [NDA_PROTOCOL] = { .type = NLA_U8 }, [NDA_NH_ID] = { .type = NLA_U32 }, [NDA_FLAGS_EXT] = NLA_POLICY_MASK(NLA_U32, NTF_EXT_MASK), [NDA_FDB_EXT_ATTRS] = { .type = NLA_NESTED }, }; static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *dst_attr; struct neigh_table *tbl; struct neighbour *neigh; struct net_device *dev = NULL; int err = -EINVAL; ASSERT_RTNL(); if (nlmsg_len(nlh) < sizeof(*ndm)) goto out; dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST); if (!dst_attr) { NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; } ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { err = -ENODEV; goto out; } } tbl = neigh_find_table(ndm->ndm_family); if (tbl == NULL) return -EAFNOSUPPORT; if (nla_len(dst_attr) < (int)tbl->key_len) { NL_SET_ERR_MSG(extack, "Invalid network address"); goto out; } if (ndm->ndm_flags & NTF_PROXY) { err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); goto out; } if (dev == NULL) goto out; neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); if (neigh == NULL) { err = -ENOENT; goto out; } err = __neigh_update(neigh, NULL, 
NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, NETLINK_CB(skb).portid, extack); spin_lock_bh(&tbl->lock); neigh_release(neigh); neigh_remove_one(neigh); spin_unlock_bh(&tbl->lock); out: return err; } static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_OVERRIDE_ISROUTER; struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct neigh_table *tbl; struct net_device *dev = NULL; struct neighbour *neigh; void *dst, *lladdr; u8 protocol = 0; u32 ndm_flags; int err; ASSERT_RTNL(); err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, nda_policy, extack); if (err < 0) goto out; err = -EINVAL; if (!tb[NDA_DST]) { NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; } ndm = nlmsg_data(nlh); ndm_flags = ndm->ndm_flags; if (tb[NDA_FLAGS_EXT]) { u32 ext = nla_get_u32(tb[NDA_FLAGS_EXT]); BUILD_BUG_ON(sizeof(neigh->flags) * BITS_PER_BYTE < (sizeof(ndm->ndm_flags) * BITS_PER_BYTE + hweight32(NTF_EXT_MASK))); ndm_flags |= (ext << NTF_EXT_SHIFT); } if (ndm->ndm_ifindex) { dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { err = -ENODEV; goto out; } if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) { NL_SET_ERR_MSG(extack, "Invalid link address"); goto out; } } tbl = neigh_find_table(ndm->ndm_family); if (tbl == NULL) return -EAFNOSUPPORT; if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) { NL_SET_ERR_MSG(extack, "Invalid network address"); goto out; } dst = nla_data(tb[NDA_DST]); lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; if (tb[NDA_PROTOCOL]) protocol = nla_get_u8(tb[NDA_PROTOCOL]); if (ndm_flags & NTF_PROXY) { if (ndm_flags & (NTF_MANAGED | NTF_EXT_VALIDATED)) { NL_SET_ERR_MSG(extack, "Invalid NTF_* flag combination"); goto out; } err = pneigh_create(tbl, net, dst, dev, ndm_flags, protocol, !!(ndm->ndm_state & NUD_PERMANENT)); goto out; } if (!dev) { NL_SET_ERR_MSG(extack, "Device not specified"); goto out; } if (tbl->allow_add && !tbl->allow_add(dev, extack)) { err = -EINVAL; goto out; } neigh = neigh_lookup(tbl, dst, dev); if (neigh == NULL) { bool ndm_permanent = ndm->ndm_state & NUD_PERMANENT; bool exempt_from_gc = ndm_permanent || ndm_flags & (NTF_EXT_LEARNED | NTF_EXT_VALIDATED); if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { err = -ENOENT; goto out; } if (ndm_permanent && (ndm_flags & NTF_MANAGED)) { NL_SET_ERR_MSG(extack, "Invalid NTF_* flag for permanent entry"); err = -EINVAL; goto out; } if (ndm_flags & NTF_EXT_VALIDATED) { u8 state = ndm->ndm_state; /* NTF_USE and NTF_MANAGED will result in the neighbor * being created with an invalid state (NUD_NONE). */ if (ndm_flags & (NTF_USE | NTF_MANAGED)) state = NUD_NONE; if (!(state & NUD_VALID)) { NL_SET_ERR_MSG(extack, "Cannot create externally validated neighbor with an invalid state"); err = -EINVAL; goto out; } } neigh = ___neigh_create(tbl, dst, dev, ndm_flags & (NTF_EXT_LEARNED | NTF_MANAGED | NTF_EXT_VALIDATED), exempt_from_gc, true); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); goto out; } } else { if (nlh->nlmsg_flags & NLM_F_EXCL) { err = -EEXIST; neigh_release(neigh); goto out; } if (ndm_flags & NTF_EXT_VALIDATED) { u8 state = ndm->ndm_state; /* NTF_USE and NTF_MANAGED do not update the existing * state other than clearing it if it was * NUD_PERMANENT. 
*/ if (ndm_flags & (NTF_USE | NTF_MANAGED)) state = READ_ONCE(neigh->nud_state) & ~NUD_PERMANENT; if (!(state & NUD_VALID)) { NL_SET_ERR_MSG(extack, "Cannot mark neighbor as externally validated with an invalid state"); err = -EINVAL; neigh_release(neigh); goto out; } } if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) flags &= ~(NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_OVERRIDE_ISROUTER); } if (protocol) neigh->protocol = protocol; if (ndm_flags & NTF_EXT_LEARNED) flags |= NEIGH_UPDATE_F_EXT_LEARNED; if (ndm_flags & NTF_ROUTER) flags |= NEIGH_UPDATE_F_ISROUTER; if (ndm_flags & NTF_MANAGED) flags |= NEIGH_UPDATE_F_MANAGED; if (ndm_flags & NTF_USE) flags |= NEIGH_UPDATE_F_USE; if (ndm_flags & NTF_EXT_VALIDATED) flags |= NEIGH_UPDATE_F_EXT_VALIDATED; err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, NETLINK_CB(skb).portid, extack); if (!err && ndm_flags & (NTF_USE | NTF_MANAGED)) neigh_event_send(neigh, NULL); neigh_release(neigh); out: return err; } static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) { struct nlattr *nest; nest = nla_nest_start_noflag(skb, NDTA_PARMS); if (nest == NULL) return -ENOBUFS; if ((parms->dev && nla_put_u32(skb, NDTPA_IFINDEX, READ_ONCE(parms->dev->ifindex))) || nla_put_u32(skb, NDTPA_REFCNT, refcount_read(&parms->refcnt)) || nla_put_u32(skb, NDTPA_QUEUE_LENBYTES, NEIGH_VAR(parms, QUEUE_LEN_BYTES)) || /* approximative value for deprecated QUEUE_LEN (in packets) */ nla_put_u32(skb, NDTPA_QUEUE_LEN, NEIGH_VAR(parms, QUEUE_LEN_BYTES) / SKB_TRUESIZE(ETH_FRAME_LEN)) || nla_put_u32(skb, NDTPA_PROXY_QLEN, NEIGH_VAR(parms, PROXY_QLEN)) || nla_put_u32(skb, NDTPA_APP_PROBES, NEIGH_VAR(parms, APP_PROBES)) || nla_put_u32(skb, NDTPA_UCAST_PROBES, NEIGH_VAR(parms, UCAST_PROBES)) || nla_put_u32(skb, NDTPA_MCAST_PROBES, NEIGH_VAR(parms, MCAST_PROBES)) || nla_put_u32(skb, NDTPA_MCAST_REPROBES, NEIGH_VAR(parms, MCAST_REPROBES)) || nla_put_msecs(skb, NDTPA_REACHABLE_TIME, READ_ONCE(parms->reachable_time), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME, NEIGH_VAR(parms, BASE_REACHABLE_TIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_GC_STALETIME, NEIGH_VAR(parms, GC_STALETIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_DELAY_PROBE_TIME, NEIGH_VAR(parms, DELAY_PROBE_TIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_RETRANS_TIME, NEIGH_VAR(parms, RETRANS_TIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_ANYCAST_DELAY, NEIGH_VAR(parms, ANYCAST_DELAY), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_PROXY_DELAY, NEIGH_VAR(parms, PROXY_DELAY), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_LOCKTIME, NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_INTERVAL_PROBE_TIME_MS, NEIGH_VAR(parms, INTERVAL_PROBE_TIME_MS), NDTPA_PAD)) goto nla_put_failure; return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, u32 pid, u32 seq, int type, int flags) { struct nlmsghdr *nlh; struct ndtmsg *ndtmsg; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags); if (nlh == NULL) return -EMSGSIZE; ndtmsg = nlmsg_data(nlh); ndtmsg->ndtm_family = tbl->family; ndtmsg->ndtm_pad1 = 0; ndtmsg->ndtm_pad2 = 0; if (nla_put_string(skb, NDTA_NAME, tbl->id) || nla_put_msecs(skb, NDTA_GC_INTERVAL, READ_ONCE(tbl->gc_interval), NDTA_PAD) || nla_put_u32(skb, NDTA_THRESH1, READ_ONCE(tbl->gc_thresh1)) || nla_put_u32(skb, NDTA_THRESH2, READ_ONCE(tbl->gc_thresh2)) || nla_put_u32(skb, NDTA_THRESH3, READ_ONCE(tbl->gc_thresh3))) goto nla_put_failure; { unsigned long now = jiffies; long 
flush_delta = now - READ_ONCE(tbl->last_flush); long rand_delta = now - READ_ONCE(tbl->last_rand); struct neigh_hash_table *nht; struct ndt_config ndc = { .ndtc_key_len = tbl->key_len, .ndtc_entry_size = tbl->entry_size, .ndtc_entries = atomic_read(&tbl->entries), .ndtc_last_flush = jiffies_to_msecs(flush_delta), .ndtc_last_rand = jiffies_to_msecs(rand_delta), .ndtc_proxy_qlen = READ_ONCE(tbl->proxy_queue.qlen), }; nht = rcu_dereference(tbl->nht); ndc.ndtc_hash_rnd = nht->hash_rnd[0]; ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1); if (nla_put(skb, NDTA_CONFIG, sizeof(ndc), &ndc)) goto nla_put_failure; } { int cpu; struct ndt_stats ndst; memset(&ndst, 0, sizeof(ndst)); for_each_possible_cpu(cpu) { struct neigh_statistics *st; st = per_cpu_ptr(tbl->stats, cpu); ndst.ndts_allocs += READ_ONCE(st->allocs); ndst.ndts_destroys += READ_ONCE(st->destroys); ndst.ndts_hash_grows += READ_ONCE(st->hash_grows); ndst.ndts_res_failed += READ_ONCE(st->res_failed); ndst.ndts_lookups += READ_ONCE(st->lookups); ndst.ndts_hits += READ_ONCE(st->hits); ndst.ndts_rcv_probes_mcast += READ_ONCE(st->rcv_probes_mcast); ndst.ndts_rcv_probes_ucast += READ_ONCE(st->rcv_probes_ucast); ndst.ndts_periodic_gc_runs += READ_ONCE(st->periodic_gc_runs); ndst.ndts_forced_gc_runs += READ_ONCE(st->forced_gc_runs); ndst.ndts_table_fulls += READ_ONCE(st->table_fulls); } if (nla_put_64bit(skb, NDTA_STATS, sizeof(ndst), &ndst, NDTA_PAD)) goto nla_put_failure; } BUG_ON(tbl->parms.dev); if (neightbl_fill_parms(skb, &tbl->parms) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int neightbl_fill_param_info(struct sk_buff *skb, struct neigh_table *tbl, struct neigh_parms *parms, u32 pid, u32 seq, int type, unsigned int flags) { struct ndtmsg *ndtmsg; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags); if (nlh == NULL) return -EMSGSIZE; ndtmsg = nlmsg_data(nlh); ndtmsg->ndtm_family = tbl->family; ndtmsg->ndtm_pad1 = 0; ndtmsg->ndtm_pad2 = 0; if (nla_put_string(skb, NDTA_NAME, tbl->id) < 0 || neightbl_fill_parms(skb, parms) < 0) goto errout; nlmsg_end(skb, nlh); return 0; errout: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = { [NDTA_NAME] = { .type = NLA_STRING }, [NDTA_THRESH1] = { .type = NLA_U32 }, [NDTA_THRESH2] = { .type = NLA_U32 }, [NDTA_THRESH3] = { .type = NLA_U32 }, [NDTA_GC_INTERVAL] = { .type = NLA_U64 }, [NDTA_PARMS] = { .type = NLA_NESTED }, }; static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { [NDTPA_IFINDEX] = { .type = NLA_U32 }, [NDTPA_QUEUE_LEN] = { .type = NLA_U32 }, [NDTPA_QUEUE_LENBYTES] = { .type = NLA_U32 }, [NDTPA_PROXY_QLEN] = { .type = NLA_U32 }, [NDTPA_APP_PROBES] = { .type = NLA_U32 }, [NDTPA_UCAST_PROBES] = { .type = NLA_U32 }, [NDTPA_MCAST_PROBES] = { .type = NLA_U32 }, [NDTPA_MCAST_REPROBES] = { .type = NLA_U32 }, [NDTPA_BASE_REACHABLE_TIME] = { .type = NLA_U64 }, [NDTPA_GC_STALETIME] = { .type = NLA_U64 }, [NDTPA_DELAY_PROBE_TIME] = { .type = NLA_U64 }, [NDTPA_RETRANS_TIME] = { .type = NLA_U64 }, [NDTPA_ANYCAST_DELAY] = { .type = NLA_U64 }, [NDTPA_PROXY_DELAY] = { .type = NLA_U64 }, [NDTPA_LOCKTIME] = { .type = NLA_U64 }, [NDTPA_INTERVAL_PROBE_TIME_MS] = { .type = NLA_U64, .min = 1 }, }; static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[NDTA_MAX + 1]; struct neigh_table *tbl; struct ndtmsg *ndtmsg; bool found = 
false; int err, tidx; err = nlmsg_parse_deprecated(nlh, sizeof(*ndtmsg), tb, NDTA_MAX, nl_neightbl_policy, extack); if (err < 0) goto errout; if (tb[NDTA_NAME] == NULL) { err = -EINVAL; goto errout; } ndtmsg = nlmsg_data(nlh); rcu_read_lock(); for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { tbl = rcu_dereference(neigh_tables[tidx]); if (!tbl) continue; if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) continue; if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) { found = true; break; } } if (!found) { rcu_read_unlock(); err = -ENOENT; goto errout; } /* * We acquire tbl->lock to be nice to the periodic timers and * make sure they always see a consistent set of values. */ spin_lock_bh(&tbl->lock); if (tb[NDTA_PARMS]) { struct nlattr *tbp[NDTPA_MAX+1]; struct neigh_parms *p; int i, ifindex = 0; err = nla_parse_nested_deprecated(tbp, NDTPA_MAX, tb[NDTA_PARMS], nl_ntbl_parm_policy, extack); if (err < 0) goto errout_tbl_lock; if (tbp[NDTPA_IFINDEX]) ifindex = nla_get_u32(tbp[NDTPA_IFINDEX]); p = lookup_neigh_parms(tbl, net, ifindex); if (p == NULL) { err = -ENOENT; goto errout_tbl_lock; } for (i = 1; i <= NDTPA_MAX; i++) { if (tbp[i] == NULL) continue; switch (i) { case NDTPA_QUEUE_LEN: NEIGH_VAR_SET(p, QUEUE_LEN_BYTES, nla_get_u32(tbp[i]) * SKB_TRUESIZE(ETH_FRAME_LEN)); break; case NDTPA_QUEUE_LENBYTES: NEIGH_VAR_SET(p, QUEUE_LEN_BYTES, nla_get_u32(tbp[i])); break; case NDTPA_PROXY_QLEN: NEIGH_VAR_SET(p, PROXY_QLEN, nla_get_u32(tbp[i])); break; case NDTPA_APP_PROBES: NEIGH_VAR_SET(p, APP_PROBES, nla_get_u32(tbp[i])); break; case NDTPA_UCAST_PROBES: NEIGH_VAR_SET(p, UCAST_PROBES, nla_get_u32(tbp[i])); break; case NDTPA_MCAST_PROBES: NEIGH_VAR_SET(p, MCAST_PROBES, nla_get_u32(tbp[i])); break; case NDTPA_MCAST_REPROBES: NEIGH_VAR_SET(p, MCAST_REPROBES, nla_get_u32(tbp[i])); break; case NDTPA_BASE_REACHABLE_TIME: NEIGH_VAR_SET(p, BASE_REACHABLE_TIME, nla_get_msecs(tbp[i])); /* update reachable_time as well, otherwise, the change will * only be effective after the next time neigh_periodic_work * decides to recompute it (can be multiple minutes) */ neigh_set_reach_time(p); break; case NDTPA_GC_STALETIME: NEIGH_VAR_SET(p, GC_STALETIME, nla_get_msecs(tbp[i])); break; case NDTPA_DELAY_PROBE_TIME: NEIGH_VAR_SET(p, DELAY_PROBE_TIME, nla_get_msecs(tbp[i])); call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); break; case NDTPA_INTERVAL_PROBE_TIME_MS: NEIGH_VAR_SET(p, INTERVAL_PROBE_TIME_MS, nla_get_msecs(tbp[i])); break; case NDTPA_RETRANS_TIME: NEIGH_VAR_SET(p, RETRANS_TIME, nla_get_msecs(tbp[i])); break; case NDTPA_ANYCAST_DELAY: NEIGH_VAR_SET(p, ANYCAST_DELAY, nla_get_msecs(tbp[i])); break; case NDTPA_PROXY_DELAY: NEIGH_VAR_SET(p, PROXY_DELAY, nla_get_msecs(tbp[i])); break; case NDTPA_LOCKTIME: NEIGH_VAR_SET(p, LOCKTIME, nla_get_msecs(tbp[i])); break; } } } err = -ENOENT; if ((tb[NDTA_THRESH1] || tb[NDTA_THRESH2] || tb[NDTA_THRESH3] || tb[NDTA_GC_INTERVAL]) && !net_eq(net, &init_net)) goto errout_tbl_lock; if (tb[NDTA_THRESH1]) WRITE_ONCE(tbl->gc_thresh1, nla_get_u32(tb[NDTA_THRESH1])); if (tb[NDTA_THRESH2]) WRITE_ONCE(tbl->gc_thresh2, nla_get_u32(tb[NDTA_THRESH2])); if (tb[NDTA_THRESH3]) WRITE_ONCE(tbl->gc_thresh3, nla_get_u32(tb[NDTA_THRESH3])); if (tb[NDTA_GC_INTERVAL]) WRITE_ONCE(tbl->gc_interval, nla_get_msecs(tb[NDTA_GC_INTERVAL])); err = 0; errout_tbl_lock: spin_unlock_bh(&tbl->lock); rcu_read_unlock(); errout: return err; } static int neightbl_valid_dump_info(const struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct ndtmsg *ndtm; ndtm = nlmsg_payload(nlh, 
sizeof(*ndtm)); if (!ndtm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor table dump request"); return -EINVAL; } if (ndtm->ndtm_pad1 || ndtm->ndtm_pad2) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor table dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*ndtm))) { NL_SET_ERR_MSG(extack, "Invalid data after header in neighbor table dump request"); return -EINVAL; } return 0; } static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); int family, tidx, nidx = 0; int tbl_skip = cb->args[0]; int neigh_skip = cb->args[1]; struct neigh_table *tbl; if (cb->strict_check) { int err = neightbl_valid_dump_info(nlh, cb->extack); if (err < 0) return err; } family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; rcu_read_lock(); for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { struct neigh_parms *p; tbl = rcu_dereference(neigh_tables[tidx]); if (!tbl) continue; if (tidx < tbl_skip || (family && tbl->family != family)) continue; if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNEIGHTBL, NLM_F_MULTI) < 0) break; nidx = 0; p = list_next_entry(&tbl->parms, list); list_for_each_entry_from_rcu(p, &tbl->parms_list, list) { if (!net_eq(neigh_parms_net(p), net)) continue; if (nidx < neigh_skip) goto next; if (neightbl_fill_param_info(skb, tbl, p, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNEIGHTBL, NLM_F_MULTI) < 0) goto out; next: nidx++; } neigh_skip = 0; } out: rcu_read_unlock(); cb->args[0] = tidx; cb->args[1] = nidx; return skb->len; } static int __neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, u32 pid, u32 seq, int type, unsigned int flags) { u32 neigh_flags, neigh_flags_ext; unsigned long now = jiffies; struct nda_cacheinfo ci; struct nlmsghdr *nlh; struct ndmsg *ndm; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) return -EMSGSIZE; neigh_flags_ext = neigh->flags >> NTF_EXT_SHIFT; neigh_flags = neigh->flags & NTF_OLD_MASK; ndm = nlmsg_data(nlh); ndm->ndm_family = neigh->ops->family; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = neigh_flags; ndm->ndm_type = neigh->type; ndm->ndm_ifindex = neigh->dev->ifindex; if (nla_put(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key)) goto nla_put_failure; ndm->ndm_state = neigh->nud_state; if (neigh->nud_state & NUD_VALID) { char haddr[MAX_ADDR_LEN]; neigh_ha_snapshot(haddr, neigh, neigh->dev); if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) goto nla_put_failure; } ci.ndm_used = jiffies_to_clock_t(now - neigh->used); ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed); ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated); ci.ndm_refcnt = refcount_read(&neigh->refcnt) - 1; if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) || nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; if (neigh->protocol && nla_put_u8(skb, NDA_PROTOCOL, neigh->protocol)) goto nla_put_failure; if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, u32 pid, u32 seq, int type, unsigned int flags) __releases(neigh->lock) __acquires(neigh->lock) { int err; read_lock_bh(&neigh->lock); err = __neigh_fill_info(skb, neigh, pid, seq, type, flags); read_unlock_bh(&neigh->lock); return err; } static int 
pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, u32 pid, u32 seq, int type, unsigned int flags, struct neigh_table *tbl) { u32 neigh_flags, neigh_flags_ext; struct nlmsghdr *nlh; struct ndmsg *ndm; u8 protocol; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) return -EMSGSIZE; neigh_flags = READ_ONCE(pn->flags); neigh_flags_ext = neigh_flags >> NTF_EXT_SHIFT; neigh_flags &= NTF_OLD_MASK; ndm = nlmsg_data(nlh); ndm->ndm_family = tbl->family; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = neigh_flags | NTF_PROXY; ndm->ndm_type = RTN_UNICAST; ndm->ndm_ifindex = pn->dev ? pn->dev->ifindex : 0; ndm->ndm_state = NUD_NONE; if (nla_put(skb, NDA_DST, tbl->key_len, pn->key)) goto nla_put_failure; protocol = READ_ONCE(pn->protocol); if (protocol && nla_put_u8(skb, NDA_PROTOCOL, protocol)) goto nla_put_failure; if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static bool neigh_master_filtered(struct net_device *dev, int master_idx) { struct net_device *master; if (!master_idx) return false; master = dev ? netdev_master_upper_dev_get_rcu(dev) : NULL; /* 0 is already used to denote NDA_MASTER wasn't passed, therefore need another * invalid value for ifindex to denote "no master". */ if (master_idx == -1) return !!master; if (!master || master->ifindex != master_idx) return true; return false; } static bool neigh_ifindex_filtered(struct net_device *dev, int filter_idx) { if (filter_idx && (!dev || dev->ifindex != filter_idx)) return true; return false; } struct neigh_dump_filter { int master_idx; int dev_idx; }; static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb, struct neigh_dump_filter *filter) { struct net *net = sock_net(skb->sk); struct neighbour *n; int err = 0, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; struct neigh_hash_table *nht; unsigned int flags = NLM_F_MULTI; if (filter->dev_idx || filter->master_idx) flags |= NLM_F_DUMP_FILTERED; nht = rcu_dereference(tbl->nht); for (h = s_h; h < (1 << nht->hash_shift); h++) { if (h > s_h) s_idx = 0; idx = 0; neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[h]) { if (idx < s_idx || !net_eq(dev_net(n->dev), net)) goto next; if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || neigh_master_filtered(n->dev, filter->master_idx)) goto next; err = neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, flags); if (err < 0) goto out; next: idx++; } } out: cb->args[1] = h; cb->args[2] = idx; return err; } static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb, struct neigh_dump_filter *filter) { struct pneigh_entry *n; struct net *net = sock_net(skb->sk); int err = 0, h, s_h = cb->args[3]; int idx, s_idx = idx = cb->args[4]; unsigned int flags = NLM_F_MULTI; if (filter->dev_idx || filter->master_idx) flags |= NLM_F_DUMP_FILTERED; for (h = s_h; h <= PNEIGH_HASHMASK; h++) { if (h > s_h) s_idx = 0; for (n = rcu_dereference(tbl->phash_buckets[h]), idx = 0; n; n = rcu_dereference(n->next)) { if (idx < s_idx || pneigh_net(n) != net) goto next; if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || neigh_master_filtered(n->dev, filter->master_idx)) goto next; err = pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, flags, tbl); if (err < 0) goto out; next: idx++; } } out: cb->args[3] = h; cb->args[4] = idx; 
return err; } static int neigh_valid_dump_req(const struct nlmsghdr *nlh, bool strict_check, struct neigh_dump_filter *filter, struct netlink_ext_ack *extack) { struct nlattr *tb[NDA_MAX + 1]; int err, i; if (strict_check) { struct ndmsg *ndm; ndm = nlmsg_payload(nlh, sizeof(*ndm)); if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor dump request"); return -EINVAL; } if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request"); return -EINVAL; } if (ndm->ndm_flags & ~NTF_PROXY) { NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor dump request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); } else { err = nlmsg_parse_deprecated(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); } if (err < 0) return err; for (i = 0; i <= NDA_MAX; ++i) { if (!tb[i]) continue; /* all new attributes should require strict_check */ switch (i) { case NDA_IFINDEX: filter->dev_idx = nla_get_u32(tb[i]); break; case NDA_MASTER: filter->master_idx = nla_get_u32(tb[i]); break; default: if (strict_check) { NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor dump request"); return -EINVAL; } } } return 0; } static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct neigh_dump_filter filter = {}; struct neigh_table *tbl; int t, family, s_t; int proxy = 0; int err; family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; /* check for full ndmsg structure presence, family member is * the same for both structures */ if (nlmsg_len(nlh) >= sizeof(struct ndmsg) && ((struct ndmsg *)nlmsg_data(nlh))->ndm_flags == NTF_PROXY) proxy = 1; err = neigh_valid_dump_req(nlh, cb->strict_check, &filter, cb->extack); if (err < 0 && cb->strict_check) return err; err = 0; s_t = cb->args[0]; rcu_read_lock(); for (t = 0; t < NEIGH_NR_TABLES; t++) { tbl = rcu_dereference(neigh_tables[t]); if (!tbl) continue; if (t < s_t || (family && tbl->family != family)) continue; if (t > s_t) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); if (proxy) err = pneigh_dump_table(tbl, skb, cb, &filter); else err = neigh_dump_table(tbl, skb, cb, &filter); if (err < 0) break; } rcu_read_unlock(); cb->args[0] = t; return err; } static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct ndmsg *ndm; int err, i; ndm = nlmsg_payload(nlh, sizeof(*ndm)); if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request"); return ERR_PTR(-EINVAL); } if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request"); return ERR_PTR(-EINVAL); } if (ndm->ndm_flags & ~NTF_PROXY) { NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor get request"); return ERR_PTR(-EINVAL); } if (!(ndm->ndm_flags & NTF_PROXY) && !ndm->ndm_ifindex) { NL_SET_ERR_MSG(extack, "No device specified"); return ERR_PTR(-EINVAL); } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); if (err < 0) return ERR_PTR(err); for (i = 0; i <= NDA_MAX; ++i) { switch (i) { case NDA_DST: if (!tb[i]) { NL_SET_ERR_ATTR_MISS(extack, NULL, NDA_DST); return ERR_PTR(-EINVAL); } break; default: if (!tb[i]) continue; NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor get request"); return ERR_PTR(-EINVAL); } } 
return ndm; } static inline size_t neigh_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */ + nla_total_size(sizeof(struct nda_cacheinfo)) + nla_total_size(4) /* NDA_PROBES */ + nla_total_size(4) /* NDA_FLAGS_EXT */ + nla_total_size(1); /* NDA_PROTOCOL */ } static inline size_t pneigh_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + nla_total_size(4) /* NDA_FLAGS_EXT */ + nla_total_size(1); /* NDA_PROTOCOL */ } static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); u32 pid = NETLINK_CB(in_skb).portid; struct nlattr *tb[NDA_MAX + 1]; struct net_device *dev = NULL; u32 seq = nlh->nlmsg_seq; struct neigh_table *tbl; struct neighbour *neigh; struct sk_buff *skb; struct ndmsg *ndm; void *dst; int err; ndm = neigh_valid_get_req(nlh, tb, extack); if (IS_ERR(ndm)) return PTR_ERR(ndm); if (ndm->ndm_flags & NTF_PROXY) skb = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL); else skb = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL); if (!skb) return -ENOBUFS; rcu_read_lock(); tbl = neigh_find_table(ndm->ndm_family); if (!tbl) { NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); err = -EAFNOSUPPORT; goto err_unlock; } if (nla_len(tb[NDA_DST]) != (int)tbl->key_len) { NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); err = -EINVAL; goto err_unlock; } dst = nla_data(tb[NDA_DST]); if (ndm->ndm_ifindex) { dev = dev_get_by_index_rcu(net, ndm->ndm_ifindex); if (!dev) { NL_SET_ERR_MSG(extack, "Unknown device ifindex"); err = -ENODEV; goto err_unlock; } } if (ndm->ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; pn = pneigh_lookup(tbl, net, dst, dev); if (!pn) { NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found"); err = -ENOENT; goto err_unlock; } err = pneigh_fill_info(skb, pn, pid, seq, RTM_NEWNEIGH, 0, tbl); if (err) goto err_unlock; } else { neigh = neigh_lookup(tbl, dst, dev); if (!neigh) { NL_SET_ERR_MSG(extack, "Neighbour entry not found"); err = -ENOENT; goto err_unlock; } err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0); neigh_release(neigh); if (err) goto err_unlock; } rcu_read_unlock(); return rtnl_unicast(skb, net, pid); err_unlock: rcu_read_unlock(); kfree_skb(skb); return err; } void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie) { int chain; struct neigh_hash_table *nht; rcu_read_lock(); nht = rcu_dereference(tbl->nht); spin_lock_bh(&tbl->lock); /* avoid resizes */ for (chain = 0; chain < (1 << nht->hash_shift); chain++) { struct neighbour *n; neigh_for_each_in_bucket(n, &nht->hash_heads[chain]) cb(n, cookie); } spin_unlock_bh(&tbl->lock); rcu_read_unlock(); } EXPORT_SYMBOL(neigh_for_each); /* The tbl->lock must be held as a writer and BH disabled.
*/ void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)) { struct neigh_hash_table *nht; int chain; nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (chain = 0; chain < (1 << nht->hash_shift); chain++) { struct hlist_node *tmp; struct neighbour *n; neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[chain]) { int release; write_lock(&n->lock); release = cb(n); if (release) { hlist_del_rcu(&n->hash); hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); } write_unlock(&n->lock); if (release) neigh_cleanup_and_release(n); } } } EXPORT_SYMBOL(__neigh_for_each_release); int neigh_xmit(int index, struct net_device *dev, const void *addr, struct sk_buff *skb) { int err = -EAFNOSUPPORT; if (likely(index < NEIGH_NR_TABLES)) { struct neigh_table *tbl; struct neighbour *neigh; rcu_read_lock(); tbl = rcu_dereference(neigh_tables[index]); if (!tbl) goto out_unlock; if (index == NEIGH_ARP_TABLE) { u32 key = *((u32 *)addr); neigh = __ipv4_neigh_lookup_noref(dev, key); } else { neigh = __neigh_lookup_noref(tbl, addr, dev); } if (!neigh) neigh = __neigh_create(tbl, addr, dev, false); err = PTR_ERR(neigh); if (IS_ERR(neigh)) { rcu_read_unlock(); goto out_kfree_skb; } err = READ_ONCE(neigh->output)(neigh, skb); out_unlock: rcu_read_unlock(); } else if (index == NEIGH_LINK_TABLE) { err = dev_hard_header(skb, dev, ntohs(skb->protocol), addr, NULL, skb->len); if (err < 0) goto out_kfree_skb; err = dev_queue_xmit(skb); } out: return err; out_kfree_skb: kfree_skb(skb); goto out; } EXPORT_SYMBOL(neigh_xmit); #ifdef CONFIG_PROC_FS static struct neighbour *neigh_get_valid(struct seq_file *seq, struct neighbour *n, loff_t *pos) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); if (!net_eq(dev_net(n->dev), net)) return NULL; if (state->neigh_sub_iter) { loff_t fakep = 0; void *v; v = state->neigh_sub_iter(state, n, pos ? pos : &fakep); if (!v) return NULL; if (pos) return v; } if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) return n; if (READ_ONCE(n->nud_state) & ~NUD_NOARP) return n; return NULL; } static struct neighbour *neigh_get_first(struct seq_file *seq) { struct neigh_seq_state *state = seq->private; struct neigh_hash_table *nht = state->nht; struct neighbour *n, *tmp; state->flags &= ~NEIGH_SEQ_IS_PNEIGH; while (++state->bucket < (1 << nht->hash_shift)) { neigh_for_each_in_bucket(n, &nht->hash_heads[state->bucket]) { tmp = neigh_get_valid(seq, n, NULL); if (tmp) return tmp; } } return NULL; } static struct neighbour *neigh_get_next(struct seq_file *seq, struct neighbour *n, loff_t *pos) { struct neigh_seq_state *state = seq->private; struct neighbour *tmp; if (state->neigh_sub_iter) { void *v = state->neigh_sub_iter(state, n, pos); if (v) return n; } hlist_for_each_entry_continue(n, hash) { tmp = neigh_get_valid(seq, n, pos); if (tmp) { n = tmp; goto out; } } n = neigh_get_first(seq); out: if (n && pos) --(*pos); return n; } static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos) { struct neighbour *n = neigh_get_first(seq); if (n) { --(*pos); while (*pos) { n = neigh_get_next(seq, n, pos); if (!n) break; } } return *pos ? 
NULL : n; } static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); struct neigh_table *tbl = state->tbl; struct pneigh_entry *pn = NULL; int bucket; state->flags |= NEIGH_SEQ_IS_PNEIGH; for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { pn = rcu_dereference(tbl->phash_buckets[bucket]); while (pn && !net_eq(pneigh_net(pn), net)) pn = rcu_dereference(pn->next); if (pn) break; } state->bucket = bucket; return pn; } static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, struct pneigh_entry *pn, loff_t *pos) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); struct neigh_table *tbl = state->tbl; do { pn = rcu_dereference(pn->next); } while (pn && !net_eq(pneigh_net(pn), net)); while (!pn) { if (++state->bucket > PNEIGH_HASHMASK) break; pn = rcu_dereference(tbl->phash_buckets[state->bucket]); while (pn && !net_eq(pneigh_net(pn), net)) pn = rcu_dereference(pn->next); if (pn) break; } if (pn && pos) --(*pos); return pn; } static struct pneigh_entry *pneigh_get_idx(struct seq_file *seq, loff_t *pos) { struct pneigh_entry *pn = pneigh_get_first(seq); if (pn) { --(*pos); while (*pos) { pn = pneigh_get_next(seq, pn, pos); if (!pn) break; } } return *pos ? NULL : pn; } static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos) { struct neigh_seq_state *state = seq->private; void *rc; loff_t idxpos = *pos; rc = neigh_get_idx(seq, &idxpos); if (!rc && !(state->flags & NEIGH_SEQ_NEIGH_ONLY)) rc = pneigh_get_idx(seq, &idxpos); return rc; } void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags) __acquires(tbl->lock) __acquires(rcu) { struct neigh_seq_state *state = seq->private; state->tbl = tbl; state->bucket = -1; state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH); rcu_read_lock(); state->nht = rcu_dereference(tbl->nht); spin_lock_bh(&tbl->lock); return *pos ? 
neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN; } EXPORT_SYMBOL(neigh_seq_start); void *neigh_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct neigh_seq_state *state; void *rc; if (v == SEQ_START_TOKEN) { rc = neigh_get_first(seq); goto out; } state = seq->private; if (!(state->flags & NEIGH_SEQ_IS_PNEIGH)) { rc = neigh_get_next(seq, v, NULL); if (rc) goto out; if (!(state->flags & NEIGH_SEQ_NEIGH_ONLY)) rc = pneigh_get_first(seq); } else { BUG_ON(state->flags & NEIGH_SEQ_NEIGH_ONLY); rc = pneigh_get_next(seq, v, NULL); } out: ++(*pos); return rc; } EXPORT_SYMBOL(neigh_seq_next); void neigh_seq_stop(struct seq_file *seq, void *v) __releases(tbl->lock) __releases(rcu) { struct neigh_seq_state *state = seq->private; struct neigh_table *tbl = state->tbl; spin_unlock_bh(&tbl->lock); rcu_read_unlock(); } EXPORT_SYMBOL(neigh_seq_stop); /* statistics via seq_file */ static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) { struct neigh_table *tbl = pde_data(file_inode(seq->file)); int cpu; if (*pos == 0) return SEQ_START_TOKEN; for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return per_cpu_ptr(tbl->stats, cpu); } return NULL; } static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct neigh_table *tbl = pde_data(file_inode(seq->file)); int cpu; for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return per_cpu_ptr(tbl->stats, cpu); } (*pos)++; return NULL; } static void neigh_stat_seq_stop(struct seq_file *seq, void *v) { } static int neigh_stat_seq_show(struct seq_file *seq, void *v) { struct neigh_table *tbl = pde_data(file_inode(seq->file)); struct neigh_statistics *st = v; if (v == SEQ_START_TOKEN) { seq_puts(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n"); return 0; } seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx " "%08lx %08lx %08lx " "%08lx %08lx %08lx\n", atomic_read(&tbl->entries), st->allocs, st->destroys, st->hash_grows, st->lookups, st->hits, st->res_failed, st->rcv_probes_mcast, st->rcv_probes_ucast, st->periodic_gc_runs, st->forced_gc_runs, st->unres_discards, st->table_fulls ); return 0; } static const struct seq_operations neigh_stat_seq_ops = { .start = neigh_stat_seq_start, .next = neigh_stat_seq_next, .stop = neigh_stat_seq_stop, .show = neigh_stat_seq_show, }; #endif /* CONFIG_PROC_FS */ static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid) { struct sk_buff *skb; int err = -ENOBUFS; struct net *net; rcu_read_lock(); net = dev_net_rcu(n->dev); skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC); if (skb == NULL) goto errout; err = __neigh_fill_info(skb, n, pid, 0, type, flags); if (err < 0) { /* -EMSGSIZE implies BUG in neigh_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); goto out; errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); out: rcu_read_unlock(); } static void neigh_notify(struct neighbour *neigh, int type, int flags, u32 pid) { read_lock_bh(&neigh->lock); __neigh_notify(neigh, type, flags, pid); read_unlock_bh(&neigh->lock); } void neigh_app_ns(struct neighbour *n) { neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST, 0); } EXPORT_SYMBOL(neigh_app_ns); #ifdef CONFIG_SYSCTL static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN); static int proc_unres_qlen(const struct ctl_table *ctl, int write, 
void *buffer, size_t *lenp, loff_t *ppos) { int size, ret; struct ctl_table tmp = *ctl; tmp.extra1 = SYSCTL_ZERO; tmp.extra2 = &unres_qlen_max; tmp.data = &size; size = *(int *)ctl->data / SKB_TRUESIZE(ETH_FRAME_LEN); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && !ret) *(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN); return ret; } static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p, int index) { struct net_device *dev; int family = neigh_parms_family(p); rcu_read_lock(); for_each_netdev_rcu(net, dev) { struct neigh_parms *dst_p = neigh_get_dev_parms_rcu(dev, family); if (dst_p && !test_bit(index, dst_p->data_state)) dst_p->data[index] = p->data[index]; } rcu_read_unlock(); } static void neigh_proc_update(const struct ctl_table *ctl, int write) { struct net_device *dev = ctl->extra1; struct neigh_parms *p = ctl->extra2; struct net *net = neigh_parms_net(p); int index = (int *) ctl->data - p->data; if (!write) return; set_bit(index, p->data_state); if (index == NEIGH_VAR_DELAY_PROBE_TIME) call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); if (!dev) /* NULL dev means this is default value */ neigh_copy_dflt_parms(net, p, index); } static int neigh_proc_dointvec_zero_intmax(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tmp = *ctl; int ret; tmp.extra1 = SYSCTL_ZERO; tmp.extra2 = SYSCTL_INT_MAX; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } static int neigh_proc_dointvec_ms_jiffies_positive(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tmp = *ctl; int ret; int min = msecs_to_jiffies(1); tmp.extra1 = &min; tmp.extra2 = NULL; ret = proc_dointvec_ms_jiffies_minmax(&tmp, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } int neigh_proc_dointvec(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } EXPORT_SYMBOL(neigh_proc_dointvec); int neigh_proc_dointvec_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } EXPORT_SYMBOL(neigh_proc_dointvec_jiffies); static int neigh_proc_dointvec_userhz_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } int neigh_proc_dointvec_ms_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies); static int neigh_proc_dointvec_unres_qlen(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } static int neigh_proc_base_reachable_time(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct neigh_parms *p = ctl->extra2; int ret; if (strcmp(ctl->procname, "base_reachable_time") == 0) ret = neigh_proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); else if (strcmp(ctl->procname, "base_reachable_time_ms") == 0) ret = 
neigh_proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); else ret = -1; if (write && ret == 0) { /* update reachable_time as well, otherwise, the change will * only be effective after the next time neigh_periodic_work * decides to recompute it */ neigh_set_reach_time(p); } return ret; } #define NEIGH_PARMS_DATA_OFFSET(index) \ (&((struct neigh_parms *) 0)->data[index]) #define NEIGH_SYSCTL_ENTRY(attr, data_attr, name, mval, proc) \ [NEIGH_VAR_ ## attr] = { \ .procname = name, \ .data = NEIGH_PARMS_DATA_OFFSET(NEIGH_VAR_ ## data_attr), \ .maxlen = sizeof(int), \ .mode = mval, \ .proc_handler = proc, \ } #define NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_zero_intmax) #define NEIGH_SYSCTL_JIFFIES_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_jiffies) #define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies) #define NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies_positive) #define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies) #define NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(attr, data_attr, name) \ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_unres_qlen) static struct neigh_sysctl_table { struct ctl_table_header *sysctl_header; struct ctl_table neigh_vars[NEIGH_VAR_MAX]; } neigh_sysctl_template __read_mostly = { .neigh_vars = { NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(UCAST_PROBES, "ucast_solicit"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(APP_PROBES, "app_solicit"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_REPROBES, "mcast_resolicit"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"), NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"), NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"), NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(INTERVAL_PROBE_TIME_MS, "interval_probe_time_ms"), NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(ANYCAST_DELAY, "anycast_delay"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(PROXY_DELAY, "proxy_delay"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(LOCKTIME, "locktime"), NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(QUEUE_LEN, QUEUE_LEN_BYTES, "unres_qlen"), NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(RETRANS_TIME_MS, RETRANS_TIME, "retrans_time_ms"), NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(BASE_REACHABLE_TIME_MS, BASE_REACHABLE_TIME, "base_reachable_time_ms"), [NEIGH_VAR_GC_INTERVAL] = { .procname = "gc_interval", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NEIGH_VAR_GC_THRESH1] = { .procname = "gc_thresh1", .maxlen = sizeof(int), .mode = 0644, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, [NEIGH_VAR_GC_THRESH2] = { .procname = "gc_thresh2", .maxlen = sizeof(int), .mode = 0644, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, [NEIGH_VAR_GC_THRESH3] = { .procname = "gc_thresh3", .maxlen = sizeof(int), .mode = 0644, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, }, }; int neigh_sysctl_register(struct net_device *dev, 
struct neigh_parms *p, proc_handler *handler) { int i; struct neigh_sysctl_table *t; const char *dev_name_source; char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ]; char *p_name; size_t neigh_vars_size; t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL_ACCOUNT); if (!t) goto err; for (i = 0; i < NEIGH_VAR_GC_INTERVAL; i++) { t->neigh_vars[i].data += (long) p; t->neigh_vars[i].extra1 = dev; t->neigh_vars[i].extra2 = p; } neigh_vars_size = ARRAY_SIZE(t->neigh_vars); if (dev) { dev_name_source = dev->name; /* Terminate the table early */ neigh_vars_size = NEIGH_VAR_BASE_REACHABLE_TIME_MS + 1; } else { struct neigh_table *tbl = p->tbl; dev_name_source = "default"; t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = &tbl->gc_interval; t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = &tbl->gc_thresh1; t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = &tbl->gc_thresh2; t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = &tbl->gc_thresh3; } if (handler) { /* RetransTime */ t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler; /* ReachableTime */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler; /* RetransTime (in milliseconds)*/ t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler; /* ReachableTime (in milliseconds) */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler; } else { /* Those handlers will update p->reachable_time after * base_reachable_time(_ms) is set to ensure the new timer starts being * applied after the next neighbour update instead of waiting for * neigh_periodic_work to update its value (can be multiple minutes) * So any handler that replaces them should do this as well */ /* ReachableTime */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = neigh_proc_base_reachable_time; /* ReachableTime (in milliseconds) */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = neigh_proc_base_reachable_time; } switch (neigh_parms_family(p)) { case AF_INET: p_name = "ipv4"; break; case AF_INET6: p_name = "ipv6"; break; default: BUG(); } snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s", p_name, dev_name_source); t->sysctl_header = register_net_sysctl_sz(neigh_parms_net(p), neigh_path, t->neigh_vars, neigh_vars_size); if (!t->sysctl_header) goto free; p->sysctl_table = t; return 0; free: kfree(t); err: return -ENOBUFS; } EXPORT_SYMBOL(neigh_sysctl_register); void neigh_sysctl_unregister(struct neigh_parms *p) { if (p->sysctl_table) { struct neigh_sysctl_table *t = p->sysctl_table; p->sysctl_table = NULL; unregister_net_sysctl_table(t->sysctl_header); kfree(t); } } EXPORT_SYMBOL(neigh_sysctl_unregister); #endif /* CONFIG_SYSCTL */ static const struct rtnl_msg_handler neigh_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWNEIGH, .doit = neigh_add}, {.msgtype = RTM_DELNEIGH, .doit = neigh_delete}, {.msgtype = RTM_GETNEIGH, .doit = neigh_get, .dumpit = neigh_dump_info, .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info, .flags = RTNL_FLAG_DUMP_UNLOCKED}, {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set, .flags = RTNL_FLAG_DOIT_UNLOCKED}, }; static int __init neigh_init(void) { rtnl_register_many(neigh_rtnl_msg_handlers); return 0; } subsys_initcall(neigh_init); |
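/*
 * Illustrative sketch, not part of the kernel sources above or below: a
 * minimal userspace RTM_NEWNEIGH request, showing the message layout that
 * neigh_add() above parses (struct ndmsg followed by NDA_DST/NDA_LLADDR
 * attributes).  The interface name and addresses are placeholders, error
 * handling and reading the netlink ACK are omitted; "ip neigh replace
 * 192.0.2.1 lladdr 00:11:22:33:44:55 dev eth0" sends an equivalent message.
 */
#include <arpa/inet.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/neighbour.h>
#include <net/if.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* append one rtattr to the message and bump nlmsg_len accordingly */
static void add_attr(struct nlmsghdr *nlh, unsigned short type,
		     const void *data, unsigned short len)
{
	struct rtattr *rta = (struct rtattr *)((char *)nlh + NLMSG_ALIGN(nlh->nlmsg_len));

	rta->rta_type = type;
	rta->rta_len = RTA_LENGTH(len);
	memcpy(RTA_DATA(rta), data, len);
	nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
}

int main(void)
{
	struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
	unsigned char lladdr[6] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };
	char buf[256] = { 0 };
	struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
	struct ndmsg *ndm;
	struct in_addr dst;
	int fd;

	nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg));
	nlh->nlmsg_type = RTM_NEWNEIGH;
	/* NLM_F_CREATE lets neigh_add() create the entry if it is missing;
	 * NLM_F_REPLACE keeps the override flags when it already exists */
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_REPLACE | NLM_F_ACK;

	ndm = NLMSG_DATA(nlh);
	ndm->ndm_family = AF_INET;
	ndm->ndm_ifindex = if_nametoindex("eth0");	/* placeholder device */
	ndm->ndm_state = NUD_PERMANENT;			/* exempt from gc, see neigh_add() */

	inet_pton(AF_INET, "192.0.2.1", &dst);		/* placeholder address */
	add_attr(nlh, NDA_DST, &dst, sizeof(dst));
	add_attr(nlh, NDA_LLADDR, lladdr, sizeof(lladdr));

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	sendto(fd, nlh, nlh->nlmsg_len, 0,
	       (struct sockaddr *)&kernel, sizeof(kernel));
	close(fd);
	return 0;
}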
// SPDX-License-Identifier: GPL-2.0 /* * blk-mq scheduling framework * * Copyright (C) 2016 Jens Axboe */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/list_sort.h> #include <trace/events/block.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" #include "blk-wbt.h" /* * Mark a hardware queue as needing a restart. */ void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) { if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) return; set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx); void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); /* * Order clearing SCHED_RESTART and list_empty_careful(&hctx->dispatch) * in blk_mq_run_hw_queue().
Its pair is the barrier in * blk_mq_dispatch_rq_list(). So dispatch code won't see SCHED_RESTART, * meantime new request added to hctx->dispatch is missed to check in * blk_mq_run_hw_queue(). */ smp_mb(); blk_mq_run_hw_queue(hctx, true); } static int sched_rq_cmp(void *priv, const struct list_head *a, const struct list_head *b) { struct request *rqa = container_of(a, struct request, queuelist); struct request *rqb = container_of(b, struct request, queuelist); return rqa->mq_hctx > rqb->mq_hctx; } static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list) { struct blk_mq_hw_ctx *hctx = list_first_entry(rq_list, struct request, queuelist)->mq_hctx; struct request *rq; LIST_HEAD(hctx_list); list_for_each_entry(rq, rq_list, queuelist) { if (rq->mq_hctx != hctx) { list_cut_before(&hctx_list, rq_list, &rq->queuelist); goto dispatch; } } list_splice_tail_init(rq_list, &hctx_list); dispatch: return blk_mq_dispatch_rq_list(hctx, &hctx_list, false); } #define BLK_MQ_BUDGET_DELAY 3 /* ms units */ /* * Only SCSI implements .get_budget and .put_budget, and SCSI restarts * its queue by itself in its completion handler, so we don't need to * restart queue if .get_budget() fails to get the budget. * * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to * be run again. This is necessary to avoid starving flushes. */ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; bool multi_hctxs = false, run_queue = false; bool dispatched = false, busy = false; unsigned int max_dispatch; LIST_HEAD(rq_list); int count = 0; if (hctx->dispatch_busy) max_dispatch = 1; else max_dispatch = hctx->queue->nr_requests; do { struct request *rq; int budget_token; if (e->type->ops.has_work && !e->type->ops.has_work(hctx)) break; if (!list_empty_careful(&hctx->dispatch)) { busy = true; break; } budget_token = blk_mq_get_dispatch_budget(q); if (budget_token < 0) break; rq = e->type->ops.dispatch_request(hctx); if (!rq) { blk_mq_put_dispatch_budget(q, budget_token); /* * We're releasing without dispatching. Holding the * budget could have blocked any "hctx"s with the * same queue and if we didn't dispatch then there's * no guarantee anyone will kick the queue. Kick it * ourselves. */ run_queue = true; break; } blk_mq_set_rq_budget_token(rq, budget_token); /* * Now this rq owns the budget which has to be released * if this rq won't be queued to driver via .queue_rq() * in blk_mq_dispatch_rq_list(). */ list_add_tail(&rq->queuelist, &rq_list); count++; if (rq->mq_hctx != hctx) multi_hctxs = true; /* * If we cannot get tag for the request, stop dequeueing * requests from the IO scheduler. We are unlikely to be able * to submit them anyway and it creates false impression for * scheduling heuristics that the device can take more IO. */ if (!blk_mq_get_driver_tag(rq)) break; } while (count < max_dispatch); if (!count) { if (run_queue) blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY); } else if (multi_hctxs) { /* * Requests from different hctx may be dequeued from some * schedulers, such as bfq and deadline. * * Sort the requests in the list according to their hctx, * dispatch batching requests from same hctx at a time. 
*/ list_sort(NULL, &rq_list, sched_rq_cmp); do { dispatched |= blk_mq_dispatch_hctx_list(&rq_list); } while (!list_empty(&rq_list)); } else { dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, false); } if (busy) return -EAGAIN; return !!dispatched; } static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) { unsigned long end = jiffies + HZ; int ret; do { ret = __blk_mq_do_dispatch_sched(hctx); if (ret != 1) break; if (need_resched() || time_is_before_jiffies(end)) { blk_mq_delay_run_hw_queue(hctx, 0); break; } } while (1); return ret; } static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { unsigned short idx = ctx->index_hw[hctx->type]; if (++idx == hctx->nr_ctx) idx = 0; return hctx->ctxs[idx]; } /* * Only SCSI implements .get_budget and .put_budget, and SCSI restarts * its queue by itself in its completion handler, so we don't need to * restart queue if .get_budget() fails to get the budget. * * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to * be run again. This is necessary to avoid starving flushes. */ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; LIST_HEAD(rq_list); struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); int ret = 0; struct request *rq; do { int budget_token; if (!list_empty_careful(&hctx->dispatch)) { ret = -EAGAIN; break; } if (!sbitmap_any_bit_set(&hctx->ctx_map)) break; budget_token = blk_mq_get_dispatch_budget(q); if (budget_token < 0) break; rq = blk_mq_dequeue_from_ctx(hctx, ctx); if (!rq) { blk_mq_put_dispatch_budget(q, budget_token); /* * We're releasing without dispatching. Holding the * budget could have blocked any "hctx"s with the * same queue and if we didn't dispatch then there's * no guarantee anyone will kick the queue. Kick it * ourselves. */ blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY); break; } blk_mq_set_rq_budget_token(rq, budget_token); /* * Now this rq owns the budget which has to be released * if this rq won't be queued to driver via .queue_rq() * in blk_mq_dispatch_rq_list(). */ list_add(&rq->queuelist, &rq_list); /* round robin for fair dispatch */ ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, false)); WRITE_ONCE(hctx->dispatch_from, ctx); return ret; } static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { bool need_dispatch = false; LIST_HEAD(rq_list); /* * If we have previous entries on our dispatch list, grab them first for * more fair dispatch. */ if (!list_empty_careful(&hctx->dispatch)) { spin_lock(&hctx->lock); if (!list_empty(&hctx->dispatch)) list_splice_init(&hctx->dispatch, &rq_list); spin_unlock(&hctx->lock); } /* * Only ask the scheduler for requests, if we didn't have residual * requests from the dispatch list. This is to avoid the case where * we only ever dispatch a fraction of the requests available because * of low device queue depth. Once we pull requests out of the IO * scheduler, we can no longer merge or sort them. So it's best to * leave them there for as long as we can. Mark the hw queue as * needing a restart in that case. * * We want to dispatch from the scheduler if there was nothing * on the dispatch list or we were able to dispatch from the * dispatch list. 
*/ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); if (!blk_mq_dispatch_rq_list(hctx, &rq_list, true)) return 0; need_dispatch = true; } else { need_dispatch = hctx->dispatch_busy; } if (hctx->queue->elevator) return blk_mq_do_dispatch_sched(hctx); /* dequeue request one by one from sw queue if queue is busy */ if (need_dispatch) return blk_mq_do_dispatch_ctx(hctx); blk_mq_flush_busy_ctxs(hctx, &rq_list); blk_mq_dispatch_rq_list(hctx, &rq_list, true); return 0; } void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; /* RCU or SRCU read lock is needed before checking quiesced flag */ if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) return; /* * A return of -EAGAIN is an indication that hctx->dispatch is not * empty and we must run again in order to avoid starving flushes. */ if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) { if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) blk_mq_run_hw_queue(hctx, true); } } bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { struct elevator_queue *e = q->elevator; struct blk_mq_ctx *ctx; struct blk_mq_hw_ctx *hctx; bool ret = false; enum hctx_type type; if (e && e->type->ops.bio_merge) { ret = e->type->ops.bio_merge(q, bio, nr_segs); goto out_put; } ctx = blk_mq_get_ctx(q); hctx = blk_mq_map_queue(bio->bi_opf, ctx); type = hctx->type; if (list_empty_careful(&ctx->rq_lists[type])) goto out_put; /* default per sw-queue merge */ spin_lock(&ctx->lock); /* * Reverse check our software queue for entries that we could * potentially merge with. Currently includes a hand-wavy stop * count of 8, to not spend too much time checking for merges. */ if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) ret = true; spin_unlock(&ctx->lock); out_put: return ret; } bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, struct list_head *free) { return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq, free); } EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); /* called in queue's release handler, tagset has gone away */ static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) hctx->sched_tags = NULL; if (blk_mq_is_shared_tags(flags)) q->sched_shared_tags = NULL; } void blk_mq_sched_reg_debugfs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned int memflags; unsigned long i; memflags = blk_debugfs_lock(q); blk_mq_debugfs_register_sched(q); queue_for_each_hw_ctx(q, hctx, i) blk_mq_debugfs_register_sched_hctx(q, hctx); blk_debugfs_unlock(q, memflags); } void blk_mq_sched_unreg_debugfs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; blk_debugfs_lock_nomemsave(q); queue_for_each_hw_ctx(q, hctx, i) blk_mq_debugfs_unregister_sched_hctx(hctx); blk_mq_debugfs_unregister_sched(q); blk_debugfs_unlock_nomemrestore(q); } void blk_mq_free_sched_tags(struct elevator_tags *et, struct blk_mq_tag_set *set) { unsigned long i; /* Shared tags are stored at index 0 in @tags. 
*/ if (blk_mq_is_shared_tags(set->flags)) blk_mq_free_map_and_rqs(set, et->tags[0], BLK_MQ_NO_HCTX_IDX); else { for (i = 0; i < et->nr_hw_queues; i++) blk_mq_free_map_and_rqs(set, et->tags[i], i); } kfree(et); } void blk_mq_free_sched_res(struct elevator_resources *res, struct elevator_type *type, struct blk_mq_tag_set *set) { if (res->et) { blk_mq_free_sched_tags(res->et, set); res->et = NULL; } if (res->data) { blk_mq_free_sched_data(type, res->data); res->data = NULL; } } void blk_mq_free_sched_res_batch(struct xarray *elv_tbl, struct blk_mq_tag_set *set) { struct request_queue *q; struct elv_change_ctx *ctx; lockdep_assert_held_write(&set->update_nr_hwq_lock); list_for_each_entry(q, &set->tag_list, tag_set_list) { /* * Accessing q->elevator without holding q->elevator_lock is * safe because we're holding here set->update_nr_hwq_lock in * the writer context. So, scheduler update/switch code (which * acquires the same lock but in the reader context) can't run * concurrently. */ if (q->elevator) { ctx = xa_load(elv_tbl, q->id); if (!ctx) { WARN_ON_ONCE(1); continue; } blk_mq_free_sched_res(&ctx->res, ctx->type, set); } } } void blk_mq_free_sched_ctx_batch(struct xarray *elv_tbl) { unsigned long i; struct elv_change_ctx *ctx; xa_for_each(elv_tbl, i, ctx) { xa_erase(elv_tbl, i); kfree(ctx); } } int blk_mq_alloc_sched_ctx_batch(struct xarray *elv_tbl, struct blk_mq_tag_set *set) { struct request_queue *q; struct elv_change_ctx *ctx; lockdep_assert_held_write(&set->update_nr_hwq_lock); list_for_each_entry(q, &set->tag_list, tag_set_list) { ctx = kzalloc_obj(struct elv_change_ctx); if (!ctx) return -ENOMEM; if (xa_insert(elv_tbl, q->id, ctx, GFP_KERNEL)) { kfree(ctx); return -ENOMEM; } } return 0; } struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, unsigned int nr_hw_queues, unsigned int nr_requests) { unsigned int nr_tags; int i; struct elevator_tags *et; gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; if (blk_mq_is_shared_tags(set->flags)) nr_tags = 1; else nr_tags = nr_hw_queues; et = kmalloc_flex(*et, tags, nr_tags, gfp); if (!et) return NULL; et->nr_requests = nr_requests; et->nr_hw_queues = nr_hw_queues; if (blk_mq_is_shared_tags(set->flags)) { /* Shared tags are stored at index 0 in @tags. */ et->tags[0] = blk_mq_alloc_map_and_rqs(set, BLK_MQ_NO_HCTX_IDX, MAX_SCHED_RQ); if (!et->tags[0]) goto out; } else { for (i = 0; i < et->nr_hw_queues; i++) { et->tags[i] = blk_mq_alloc_map_and_rqs(set, i, et->nr_requests); if (!et->tags[i]) goto out_unwind; } } return et; out_unwind: while (--i >= 0) blk_mq_free_map_and_rqs(set, et->tags[i], i); out: kfree(et); return NULL; } int blk_mq_alloc_sched_res(struct request_queue *q, struct elevator_type *type, struct elevator_resources *res, unsigned int nr_hw_queues) { struct blk_mq_tag_set *set = q->tag_set; res->et = blk_mq_alloc_sched_tags(set, nr_hw_queues, blk_mq_default_nr_requests(set)); if (!res->et) return -ENOMEM; res->data = blk_mq_alloc_sched_data(q, type); if (IS_ERR(res->data)) { blk_mq_free_sched_tags(res->et, set); return -ENOMEM; } return 0; } int blk_mq_alloc_sched_res_batch(struct xarray *elv_tbl, struct blk_mq_tag_set *set, unsigned int nr_hw_queues) { struct elv_change_ctx *ctx; struct request_queue *q; int ret = -ENOMEM; lockdep_assert_held_write(&set->update_nr_hwq_lock); list_for_each_entry(q, &set->tag_list, tag_set_list) { /* * Accessing q->elevator without holding q->elevator_lock is * safe because we're holding here set->update_nr_hwq_lock in * the writer context. 
So, scheduler update/switch code (which * acquires the same lock but in the reader context) can't run * concurrently. */ if (q->elevator) { ctx = xa_load(elv_tbl, q->id); if (WARN_ON_ONCE(!ctx)) { ret = -ENOENT; goto out_unwind; } ret = blk_mq_alloc_sched_res(q, q->elevator->type, &ctx->res, nr_hw_queues); if (ret) goto out_unwind; } } return 0; out_unwind: list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) { if (q->elevator) { ctx = xa_load(elv_tbl, q->id); if (ctx) blk_mq_free_sched_res(&ctx->res, ctx->type, set); } } return ret; } /* caller must have a reference to @e, will grab another one if successful */ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e, struct elevator_resources *res) { unsigned int flags = q->tag_set->flags; struct elevator_tags *et = res->et; struct blk_mq_hw_ctx *hctx; struct elevator_queue *eq; unsigned long i; int ret; eq = elevator_alloc(q, e, res); if (!eq) return -ENOMEM; q->nr_requests = et->nr_requests; if (blk_mq_is_shared_tags(flags)) { /* Shared tags are stored at index 0 in @et->tags. */ q->sched_shared_tags = et->tags[0]; blk_mq_tag_update_sched_shared_tags(q, et->nr_requests); } queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_is_shared_tags(flags)) hctx->sched_tags = q->sched_shared_tags; else hctx->sched_tags = et->tags[i]; } ret = e->ops.init_sched(q, eq); if (ret) goto out; queue_for_each_hw_ctx(q, hctx, i) { if (e->ops.init_hctx) { ret = e->ops.init_hctx(hctx, i); if (ret) { blk_mq_exit_sched(q, eq); kobject_put(&eq->kobj); return ret; } } } return 0; out: blk_mq_sched_tags_teardown(q, flags); kobject_put(&eq->kobj); q->elevator = NULL; return ret; } /* * called in either blk_queue_cleanup or elevator_switch, tagset * is required for freeing requests */ void blk_mq_sched_free_rqs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; if (blk_mq_is_shared_tags(q->tag_set->flags)) { blk_mq_free_rqs(q->tag_set, q->sched_shared_tags, BLK_MQ_NO_HCTX_IDX); } else { queue_for_each_hw_ctx(q, hctx, i) { if (hctx->sched_tags) blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i); } } } void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) { struct blk_mq_hw_ctx *hctx; unsigned long i; unsigned int flags = 0; queue_for_each_hw_ctx(q, hctx, i) { if (e->type->ops.exit_hctx && hctx->sched_data) { e->type->ops.exit_hctx(hctx, i); hctx->sched_data = NULL; } flags = hctx->flags; } if (e->type->ops.exit_sched) e->type->ops.exit_sched(e); blk_mq_sched_tags_teardown(q, flags); set_bit(ELEVATOR_FLAG_DYING, &q->elevator->flags); q->elevator = NULL; } |
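/*
 * Illustrative sketch, not part of blk-mq-sched.c above: the minimal shape of
 * the scheduler hooks that __blk_mq_do_dispatch_sched() and blk_mq_init_sched()
 * call into (e->type->ops.has_work / .dispatch_request / .init_sched, ...).
 * All names prefixed "toy_" are hypothetical; a real scheduler (mq-deadline,
 * bfq, kyber) also implements insertion, merging and accounting, and registers
 * itself through a struct elevator_type.  Sketched as if it lived in block/,
 * hence the private "elevator.h" include.
 */
#include <linux/blk-mq.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include "elevator.h"

struct toy_sched_data {
	spinlock_t lock;
	struct list_head fifo;	/* filled by an insert_requests hook, not shown */
};

/* ops.has_work: cheap check done before asking for a dispatch budget */
static bool toy_has_work(struct blk_mq_hw_ctx *hctx)
{
	struct toy_sched_data *td = hctx->queue->elevator->elevator_data;

	return !list_empty_careful(&td->fifo);
}

/* ops.dispatch_request: hand one request back to __blk_mq_do_dispatch_sched() */
static struct request *toy_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct toy_sched_data *td = hctx->queue->elevator->elevator_data;
	struct request *rq = NULL;

	spin_lock(&td->lock);
	if (!list_empty(&td->fifo)) {
		rq = list_first_entry(&td->fifo, struct request, queuelist);
		list_del_init(&rq->queuelist);
	}
	spin_unlock(&td->lock);

	/* returning NULL makes the caller put the dispatch budget back */
	return rq;
}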
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
#include <linux/bpf.h>
#include <linux/bpf_verifier.h>
#include <linux/filter.h>

#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)

#define BPF_COMPLEXITY_LIMIT_STATES	64

static bool is_may_goto_insn_at(struct bpf_verifier_env *env, int insn_idx)
{
	return bpf_is_may_goto_insn(&env->prog->insnsi[insn_idx]);
}

static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx)
{
	return env->insn_aux_data[insn_idx].is_iter_next;
}

static void update_peak_states(struct bpf_verifier_env *env)
{
	u32 cur_states;

	cur_states = env->explored_states_size + env->free_list_size + env->num_backedges;
	env->peak_states = max(env->peak_states, cur_states);
}

/* struct bpf_verifier_state->parent refers to states
 * that are in either of env->{explored_states,free_list}.
 * In both cases the state is contained in struct bpf_verifier_state_list.
 */
static struct bpf_verifier_state_list *state_parent_as_list(struct bpf_verifier_state *st)
{
	if (st->parent)
		return container_of(st->parent, struct bpf_verifier_state_list, state);
	return NULL;
}

static bool incomplete_read_marks(struct bpf_verifier_env *env,
				  struct bpf_verifier_state *st);

/* A state can be freed if it is no longer referenced:
 * - is in the env->free_list;
 * - has no child states;
 */
static void maybe_free_verifier_state(struct bpf_verifier_env *env,
				      struct bpf_verifier_state_list *sl)
{
	if (!sl->in_free_list ||
	    sl->state.branches != 0 ||
	    incomplete_read_marks(env, &sl->state))
		return;
	list_del(&sl->node);
	bpf_free_verifier_state(&sl->state, false);
	kfree(sl);
	env->free_list_size--;
}

/* For state @st look for a topmost frame with frame_insn_idx() in some SCC,
 * if such frame exists form a corresponding @callchain as an array of
 * call sites leading to this frame and SCC id.
 * E.g.:
 *
 *  void foo()  { A: loop {... SCC#1 ...}; }
 *  void bar()  { B: loop { C: foo(); ... SCC#2 ... }
 *                D: loop { E: foo(); ... SCC#3 ... } }
 *  void main() { F: bar(); }
 *
 * @callchain at (A) would be either (F,SCC#2) or (F,SCC#3) depending
 * on @st frame call sites being (F,C,A) or (F,E,A).
 */
static bool compute_scc_callchain(struct bpf_verifier_env *env,
				  struct bpf_verifier_state *st,
				  struct bpf_scc_callchain *callchain)
{
	u32 i, scc, insn_idx;

	memset(callchain, 0, sizeof(*callchain));
	for (i = 0; i <= st->curframe; i++) {
		insn_idx = bpf_frame_insn_idx(st, i);
		scc = env->insn_aux_data[insn_idx].scc;
		if (scc) {
			callchain->scc = scc;
			break;
		} else if (i < st->curframe) {
			callchain->callsites[i] = insn_idx;
		} else {
			return false;
		}
	}
	return true;
}

/* Check if bpf_scc_visit instance for @callchain exists. */
static struct bpf_scc_visit *scc_visit_lookup(struct bpf_verifier_env *env,
					      struct bpf_scc_callchain *callchain)
{
	struct bpf_scc_info *info = env->scc_info[callchain->scc];
	/* @info may be NULL, don't dereference it before the check below */
	struct bpf_scc_visit *visits = info ? info->visits : NULL;
	u32 i;

	if (!info)
		return NULL;
	for (i = 0; i < info->num_visits; i++)
		if (memcmp(callchain, &visits[i].callchain, sizeof(*callchain)) == 0)
			return &visits[i];
	return NULL;
}

/* Allocate a new bpf_scc_visit instance corresponding to @callchain.
 * Allocated instances are alive for the duration of the do_check_common()
 * call and are freed by free_states().
 */
static struct bpf_scc_visit *scc_visit_alloc(struct bpf_verifier_env *env,
					     struct bpf_scc_callchain *callchain)
{
	struct bpf_scc_visit *visit;
	struct bpf_scc_info *info;
	u32 scc, num_visits;
	u64 new_sz;

	scc = callchain->scc;
	info = env->scc_info[scc];
	num_visits = info ?
info->num_visits : 0; new_sz = sizeof(*info) + sizeof(struct bpf_scc_visit) * (num_visits + 1); info = kvrealloc(env->scc_info[scc], new_sz, GFP_KERNEL_ACCOUNT); if (!info) return NULL; env->scc_info[scc] = info; info->num_visits = num_visits + 1; visit = &info->visits[num_visits]; memset(visit, 0, sizeof(*visit)); memcpy(&visit->callchain, callchain, sizeof(*callchain)); return visit; } /* Form a string '(callsite#1,callsite#2,...,scc)' in env->tmp_str_buf */ static char *format_callchain(struct bpf_verifier_env *env, struct bpf_scc_callchain *callchain) { char *buf = env->tmp_str_buf; int i, delta = 0; delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "("); for (i = 0; i < ARRAY_SIZE(callchain->callsites); i++) { if (!callchain->callsites[i]) break; delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u,", callchain->callsites[i]); } delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u)", callchain->scc); return env->tmp_str_buf; } /* If callchain for @st exists (@st is in some SCC), ensure that * bpf_scc_visit instance for this callchain exists. * If instance does not exist or is empty, assign visit->entry_state to @st. */ static int maybe_enter_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { struct bpf_scc_callchain *callchain = &env->callchain_buf; struct bpf_scc_visit *visit; if (!compute_scc_callchain(env, st, callchain)) return 0; visit = scc_visit_lookup(env, callchain); visit = visit ?: scc_visit_alloc(env, callchain); if (!visit) return -ENOMEM; if (!visit->entry_state) { visit->entry_state = st; if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "SCC enter %s\n", format_callchain(env, callchain)); } return 0; } static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit); /* If callchain for @st exists (@st is in some SCC), make it empty: * - set visit->entry_state to NULL; * - flush accumulated backedges. */ static int maybe_exit_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { struct bpf_scc_callchain *callchain = &env->callchain_buf; struct bpf_scc_visit *visit; if (!compute_scc_callchain(env, st, callchain)) return 0; visit = scc_visit_lookup(env, callchain); if (!visit) { /* * If path traversal stops inside an SCC, corresponding bpf_scc_visit * must exist for non-speculative paths. For non-speculative paths * traversal stops when: * a. Verification error is found, maybe_exit_scc() is not called. * b. Top level BPF_EXIT is reached. Top level BPF_EXIT is not a member * of any SCC. * c. A checkpoint is reached and matched. Checkpoints are created by * is_state_visited(), which calls maybe_enter_scc(), which allocates * bpf_scc_visit instances for checkpoints within SCCs. * (c) is the only case that can reach this point. */ if (!st->speculative) { verifier_bug(env, "scc exit: no visit info for call chain %s", format_callchain(env, callchain)); return -EFAULT; } return 0; } if (visit->entry_state != st) return 0; if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "SCC exit %s\n", format_callchain(env, callchain)); visit->entry_state = NULL; env->num_backedges -= visit->num_backedges; visit->num_backedges = 0; update_peak_states(env); return propagate_backedges(env, visit); } /* Lookup an bpf_scc_visit instance corresponding to @st callchain * and add @backedge to visit->backedges. @st callchain must exist. 
*/ static int add_scc_backedge(struct bpf_verifier_env *env, struct bpf_verifier_state *st, struct bpf_scc_backedge *backedge) { struct bpf_scc_callchain *callchain = &env->callchain_buf; struct bpf_scc_visit *visit; if (!compute_scc_callchain(env, st, callchain)) { verifier_bug(env, "add backedge: no SCC in verification path, insn_idx %d", st->insn_idx); return -EFAULT; } visit = scc_visit_lookup(env, callchain); if (!visit) { verifier_bug(env, "add backedge: no visit info for call chain %s", format_callchain(env, callchain)); return -EFAULT; } if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "SCC backedge %s\n", format_callchain(env, callchain)); backedge->next = visit->backedges; visit->backedges = backedge; visit->num_backedges++; env->num_backedges++; update_peak_states(env); return 0; } /* bpf_reg_state->live marks for registers in a state @st are incomplete, * if state @st is in some SCC and not all execution paths starting at this * SCC are fully explored. */ static bool incomplete_read_marks(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { struct bpf_scc_callchain *callchain = &env->callchain_buf; struct bpf_scc_visit *visit; if (!compute_scc_callchain(env, st, callchain)) return false; visit = scc_visit_lookup(env, callchain); if (!visit) return false; return !!visit->backedges; } int bpf_update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { struct bpf_verifier_state_list *sl = NULL, *parent_sl; struct bpf_verifier_state *parent; int err; while (st) { u32 br = --st->branches; /* verifier_bug_if(br > 1, ...) technically makes sense here, * but see comment in push_stack(), hence: */ verifier_bug_if((int)br < 0, env, "%s:branches_to_explore=%d", __func__, br); if (br) break; err = maybe_exit_scc(env, st); if (err) return err; parent = st->parent; parent_sl = state_parent_as_list(st); if (sl) maybe_free_verifier_state(env, sl); st = parent; sl = parent_sl; } return 0; } /* check %cur's range satisfies %old's */ static bool range_within(const struct bpf_reg_state *old, const struct bpf_reg_state *cur) { return old->umin_value <= cur->umin_value && old->umax_value >= cur->umax_value && old->smin_value <= cur->smin_value && old->smax_value >= cur->smax_value && old->u32_min_value <= cur->u32_min_value && old->u32_max_value >= cur->u32_max_value && old->s32_min_value <= cur->s32_min_value && old->s32_max_value >= cur->s32_max_value; } /* If in the old state two registers had the same id, then they need to have * the same id in the new state as well. But that id could be different from * the old state, so we need to track the mapping from old to new ids. * Once we have seen that, say, a reg with old id 5 had new id 9, any subsequent * regs with old id 5 must also have new id 9 for the new state to be safe. But * regs with a different old id could still have new id 9, we don't care about * that. * So we look through our idmap to see if this old id has been seen before. If * so, we require the new id to match; otherwise, we add the id pair to the map. 
*/ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) { struct bpf_id_pair *map = idmap->map; unsigned int i; /* either both IDs should be set or both should be zero */ if (!!old_id != !!cur_id) return false; if (old_id == 0) /* cur_id == 0 as well */ return true; for (i = 0; i < idmap->cnt; i++) { if (map[i].old == old_id) return map[i].cur == cur_id; if (map[i].cur == cur_id) return false; } /* Reached the end of known mappings; haven't seen this id before */ if (idmap->cnt < BPF_ID_MAP_SIZE) { map[idmap->cnt].old = old_id; map[idmap->cnt].cur = cur_id; idmap->cnt++; return true; } /* We ran out of idmap slots, which should be impossible */ WARN_ON_ONCE(1); return false; } /* * Compare scalar register IDs for state equivalence. * * When old_id == 0, the old register is independent - not linked to any * other register. Any linking in the current state only adds constraints, * making it more restrictive. Since the old state didn't rely on any ID * relationships for this register, it's always safe to accept cur regardless * of its ID. Hence, return true immediately. * * When old_id != 0 but cur_id == 0, we need to ensure that different * independent registers in cur don't incorrectly satisfy the ID matching * requirements of linked registers in old. * * Example: if old has r6.id=X and r7.id=X (linked), but cur has r6.id=0 * and r7.id=0 (both independent), without temp IDs both would map old_id=X * to cur_id=0 and pass. With temp IDs: r6 maps X->temp1, r7 tries to map * X->temp2, but X is already mapped to temp1, so the check fails correctly. * * When old_id has BPF_ADD_CONST set, the compound id (base | flag) and the * base id (flag stripped) must both map consistently. Example: old has * r2.id=A, r3.id=A|flag (r3 = r2 + delta), cur has r2.id=B, r3.id=C|flag * (r3 derived from unrelated r4). Without the base check, idmap gets two * independent entries A->B and A|flag->C|flag, missing that A->C conflicts * with A->B. The base ID cross-check catches this. */ static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) { if (!old_id) return true; cur_id = cur_id ? cur_id : ++idmap->tmp_id_gen; if (!check_ids(old_id, cur_id, idmap)) return false; if (old_id & BPF_ADD_CONST) { old_id &= ~BPF_ADD_CONST; cur_id &= ~BPF_ADD_CONST; if (!check_ids(old_id, cur_id, idmap)) return false; } return true; } static void __clean_func_state(struct bpf_verifier_env *env, struct bpf_func_state *st, u16 live_regs, int frame) { int i, j; for (i = 0; i < BPF_REG_FP; i++) { /* liveness must not touch this register anymore */ if (!(live_regs & BIT(i))) /* since the register is unused, clear its state * to make further comparison simpler */ bpf_mark_reg_not_init(env, &st->regs[i]); } /* * Clean dead 4-byte halves within each SPI independently. * half_spi 2*i → lower half: slot_type[0..3] (closer to FP) * half_spi 2*i+1 → upper half: slot_type[4..7] (farther from FP) */ for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) { bool lo_live = bpf_stack_slot_alive(env, frame, i * 2); bool hi_live = bpf_stack_slot_alive(env, frame, i * 2 + 1); if (!hi_live || !lo_live) { int start = !lo_live ? 0 : BPF_REG_SIZE / 2; int end = !hi_live ? BPF_REG_SIZE : BPF_REG_SIZE / 2; u8 stype = st->stack[i].slot_type[7]; /* * Don't clear special slots. * destroy_if_dynptr_stack_slot() needs STACK_DYNPTR to * detect overwrites and invalidate associated data slices. * is_iter_reg_valid_uninit() and is_irq_flag_reg_valid_uninit() * check for their respective slot types to detect double-create. 
*/ if (stype == STACK_DYNPTR || stype == STACK_ITER || stype == STACK_IRQ_FLAG) continue; /* * Only destroy spilled_ptr when hi half is dead. * If hi half is still live with STACK_SPILL, the * spilled_ptr metadata is needed for correct state * comparison in stacksafe(). * is_spilled_reg() is using slot_type[7], but * is_spilled_scalar_after() check either slot_type[0] or [4] */ if (!hi_live) { struct bpf_reg_state *spill = &st->stack[i].spilled_ptr; if (lo_live && stype == STACK_SPILL) { u8 val = STACK_MISC; /* * 8 byte spill of scalar 0 where half slot is dead * should become STACK_ZERO in lo 4 bytes. */ if (bpf_register_is_null(spill)) val = STACK_ZERO; for (j = 0; j < 4; j++) { u8 *t = &st->stack[i].slot_type[j]; if (*t == STACK_SPILL) *t = val; } } bpf_mark_reg_not_init(env, spill); } for (j = start; j < end; j++) st->stack[i].slot_type[j] = STACK_POISON; } } } static int clean_verifier_state(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { int i, err; err = bpf_live_stack_query_init(env, st); if (err) return err; for (i = 0; i <= st->curframe; i++) { u32 ip = bpf_frame_insn_idx(st, i); u16 live_regs = env->insn_aux_data[ip].live_regs_before; __clean_func_state(env, st->frame[i], live_regs, i); } return 0; } static bool regs_exact(const struct bpf_reg_state *rold, const struct bpf_reg_state *rcur, struct bpf_idmap *idmap) { return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && check_ids(rold->id, rcur->id, idmap) && check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap); } enum exact_level { NOT_EXACT, EXACT, RANGE_WITHIN }; /* Returns true if (rold safe implies rcur safe) */ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, struct bpf_reg_state *rcur, struct bpf_idmap *idmap, enum exact_level exact) { if (exact == EXACT) return regs_exact(rold, rcur, idmap); if (rold->type == NOT_INIT) /* explored state can't have used this */ return true; /* Enforce that register types have to match exactly, including their * modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general * rule. * * One can make a point that using a pointer register as unbounded * SCALAR would be technically acceptable, but this could lead to * pointer leaks because scalars are allowed to leak while pointers * are not. We could make this safe in special cases if root is * calling us, but it's probably not worth the hassle. * * Also, register types that are *not* MAYBE_NULL could technically be * safe to use as their MAYBE_NULL variants (e.g., PTR_TO_MAP_VALUE * is safe to be used as PTR_TO_MAP_VALUE_OR_NULL, provided both point * to the same map). * However, if the old MAYBE_NULL register then got NULL checked, * doing so could have affected others with the same id, and we can't * check for that because we lost the id when we converted to * a non-MAYBE_NULL variant. * So, as a general rule we don't allow mixing MAYBE_NULL and * non-MAYBE_NULL registers as well. */ if (rold->type != rcur->type) return false; switch (base_type(rold->type)) { case SCALAR_VALUE: if (env->explore_alu_limits) { /* explore_alu_limits disables tnum_in() and range_within() * logic and requires everything to be strict */ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && check_scalar_ids(rold->id, rcur->id, idmap); } if (!rold->precise && exact == NOT_EXACT) return true; /* * Linked register tracking uses rold->id to detect relationships. * When rold->id == 0, the register is independent and any linking * in rcur only adds constraints. 
When rold->id != 0, we must verify * id mapping and (for BPF_ADD_CONST) offset consistency. * * +------------------+-----------+------------------+---------------+ * | | rold->id | rold + ADD_CONST | rold->id == 0 | * |------------------+-----------+------------------+---------------| * | rcur->id | range,ids | false | range | * | rcur + ADD_CONST | false | range,ids,off | range | * | rcur->id == 0 | range,ids | false | range | * +------------------+-----------+------------------+---------------+ * * Why check_ids() for scalar registers? * * Consider the following BPF code: * 1: r6 = ... unbound scalar, ID=a ... * 2: r7 = ... unbound scalar, ID=b ... * 3: if (r6 > r7) goto +1 * 4: r6 = r7 * 5: if (r6 > X) goto ... * 6: ... memory operation using r7 ... * * First verification path is [1-6]: * - at (4) same bpf_reg_state::id (b) would be assigned to r6 and r7; * - at (5) r6 would be marked <= X, sync_linked_regs() would also mark * r7 <= X, because r6 and r7 share same id. * Next verification path is [1-4, 6]. * * Instruction (6) would be reached in two states: * I. r6{.id=b}, r7{.id=b} via path 1-6; * II. r6{.id=a}, r7{.id=b} via path 1-4, 6. * * Use check_ids() to distinguish these states. * --- * Also verify that new value satisfies old value range knowledge. */ /* * ADD_CONST flags must match exactly: BPF_ADD_CONST32 and * BPF_ADD_CONST64 have different linking semantics in * sync_linked_regs() (alu32 zero-extends, alu64 does not), * so pruning across different flag types is unsafe. */ if (rold->id && (rold->id & BPF_ADD_CONST) != (rcur->id & BPF_ADD_CONST)) return false; /* Both have offset linkage: offsets must match */ if ((rold->id & BPF_ADD_CONST) && rold->delta != rcur->delta) return false; if (!check_scalar_ids(rold->id, rcur->id, idmap)) return false; return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: case PTR_TO_MEM: case PTR_TO_BUF: case PTR_TO_TP_BUFFER: /* If the new min/max/var_off satisfy the old ones and * everything else matches, we are OK. */ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 && range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off) && check_ids(rold->id, rcur->id, idmap) && check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap); case PTR_TO_PACKET_META: case PTR_TO_PACKET: /* We must have at least as much range as the old ptr * did, so that any accesses which were safe before are * still safe. This is true even if old range < old off, * since someone could have accessed through (ptr - k), or * even done ptr -= k in a register, to get a safe access. 
*/ if (rold->range < 0 || rcur->range < 0) { /* special case for [BEYOND|AT]_PKT_END */ if (rold->range != rcur->range) return false; } else if (rold->range > rcur->range) { return false; } /* id relations must be preserved */ if (!check_ids(rold->id, rcur->id, idmap)) return false; /* new val must satisfy old val knowledge */ return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); case PTR_TO_STACK: /* two stack pointers are equal only if they're pointing to * the same stack frame, since fp-8 in foo != fp-8 in bar */ return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno; case PTR_TO_ARENA: return true; case PTR_TO_INSN: return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 && range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); default: return regs_exact(rold, rcur, idmap); } } static struct bpf_reg_state unbound_reg; static __init int unbound_reg_init(void) { bpf_mark_reg_unknown_imprecise(&unbound_reg); return 0; } late_initcall(unbound_reg_init); static bool is_spilled_scalar_after(const struct bpf_stack_state *stack, int im) { return stack->slot_type[im] == STACK_SPILL && stack->spilled_ptr.type == SCALAR_VALUE; } static bool is_stack_misc_after(struct bpf_verifier_env *env, struct bpf_stack_state *stack, int im) { u32 i; for (i = im; i < ARRAY_SIZE(stack->slot_type); ++i) { if ((stack->slot_type[i] == STACK_MISC) || ((stack->slot_type[i] == STACK_INVALID || stack->slot_type[i] == STACK_POISON) && env->allow_uninit_stack)) continue; return false; } return true; } static struct bpf_reg_state *scalar_reg_for_stack(struct bpf_verifier_env *env, struct bpf_stack_state *stack, int im) { if (is_spilled_scalar_after(stack, im)) return &stack->spilled_ptr; if (is_stack_misc_after(env, stack, im)) return &unbound_reg; return NULL; } static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, struct bpf_func_state *cur, struct bpf_idmap *idmap, enum exact_level exact) { int i, spi; /* walk slots of the explored stack and ignore any additional * slots in the current stack, since explored(safe) state * didn't use them */ for (i = 0; i < old->allocated_stack; i++) { struct bpf_reg_state *old_reg, *cur_reg; int im = i % BPF_REG_SIZE; spi = i / BPF_REG_SIZE; if (exact == EXACT) { u8 old_type = old->stack[spi].slot_type[i % BPF_REG_SIZE]; u8 cur_type = i < cur->allocated_stack ? cur->stack[spi].slot_type[i % BPF_REG_SIZE] : STACK_INVALID; /* STACK_INVALID and STACK_POISON are equivalent for pruning */ if (old_type == STACK_POISON) old_type = STACK_INVALID; if (cur_type == STACK_POISON) cur_type = STACK_INVALID; if (i >= cur->allocated_stack || old_type != cur_type) return false; } if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID || old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_POISON) continue; if (env->allow_uninit_stack && old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC) continue; /* explored stack has more populated slots than current stack * and these slots were used */ if (i >= cur->allocated_stack) return false; /* * 64 and 32-bit scalar spills vs MISC/INVALID slots and vice versa. * Load from MISC/INVALID slots produces unbound scalar. * Construct a fake register for such stack and call * regsafe() to ensure scalar ids are compared. 
*/ if (im == 0 || im == 4) { old_reg = scalar_reg_for_stack(env, &old->stack[spi], im); cur_reg = scalar_reg_for_stack(env, &cur->stack[spi], im); if (old_reg && cur_reg) { if (!regsafe(env, old_reg, cur_reg, idmap, exact)) return false; i += (im == 0 ? BPF_REG_SIZE - 1 : 3); continue; } } /* if old state was safe with misc data in the stack * it will be safe with zero-initialized stack. * The opposite is not true */ if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC && cur->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_ZERO) continue; if (old->stack[spi].slot_type[i % BPF_REG_SIZE] != cur->stack[spi].slot_type[i % BPF_REG_SIZE]) /* Ex: old explored (safe) state has STACK_SPILL in * this stack slot, but current has STACK_MISC -> * this verifier states are not equivalent, * return false to continue verification of this path */ return false; if (i % BPF_REG_SIZE != BPF_REG_SIZE - 1) continue; /* Both old and cur are having same slot_type */ switch (old->stack[spi].slot_type[BPF_REG_SIZE - 1]) { case STACK_SPILL: /* when explored and current stack slot are both storing * spilled registers, check that stored pointers types * are the same as well. * Ex: explored safe path could have stored * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8} * but current path has stored: * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16} * such verifier states are not equivalent. * return false to continue verification of this path */ if (!regsafe(env, &old->stack[spi].spilled_ptr, &cur->stack[spi].spilled_ptr, idmap, exact)) return false; break; case STACK_DYNPTR: old_reg = &old->stack[spi].spilled_ptr; cur_reg = &cur->stack[spi].spilled_ptr; if (old_reg->dynptr.type != cur_reg->dynptr.type || old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot || !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) return false; break; case STACK_ITER: old_reg = &old->stack[spi].spilled_ptr; cur_reg = &cur->stack[spi].spilled_ptr; /* iter.depth is not compared between states as it * doesn't matter for correctness and would otherwise * prevent convergence; we maintain it only to prevent * infinite loop check triggering, see * iter_active_depths_differ() */ if (old_reg->iter.btf != cur_reg->iter.btf || old_reg->iter.btf_id != cur_reg->iter.btf_id || old_reg->iter.state != cur_reg->iter.state || /* ignore {old_reg,cur_reg}->iter.depth, see above */ !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) return false; break; case STACK_IRQ_FLAG: old_reg = &old->stack[spi].spilled_ptr; cur_reg = &cur->stack[spi].spilled_ptr; if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap) || old_reg->irq.kfunc_class != cur_reg->irq.kfunc_class) return false; break; case STACK_MISC: case STACK_ZERO: case STACK_INVALID: case STACK_POISON: continue; /* Ensure that new unhandled slot types return false by default */ default: return false; } } return true; } static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *cur, struct bpf_idmap *idmap) { int i; if (old->acquired_refs != cur->acquired_refs) return false; if (old->active_locks != cur->active_locks) return false; if (old->active_preempt_locks != cur->active_preempt_locks) return false; if (old->active_rcu_locks != cur->active_rcu_locks) return false; if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap)) return false; if (!check_ids(old->active_lock_id, cur->active_lock_id, idmap) || old->active_lock_ptr != cur->active_lock_ptr) return false; for (i = 0; i < old->acquired_refs; i++) { if (!check_ids(old->refs[i].id, 
cur->refs[i].id, idmap) || old->refs[i].type != cur->refs[i].type) return false; switch (old->refs[i].type) { case REF_TYPE_PTR: case REF_TYPE_IRQ: break; case REF_TYPE_LOCK: case REF_TYPE_RES_LOCK: case REF_TYPE_RES_LOCK_IRQ: if (old->refs[i].ptr != cur->refs[i].ptr) return false; break; default: WARN_ONCE(1, "Unhandled enum type for reference state: %d\n", old->refs[i].type); return false; } } return true; } /* compare two verifier states * * all states stored in state_list are known to be valid, since * verifier reached 'bpf_exit' instruction through them * * this function is called when verifier exploring different branches of * execution popped from the state stack. If it sees an old state that has * more strict register state and more strict stack state then this execution * branch doesn't need to be explored further, since verifier already * concluded that more strict state leads to valid finish. * * Therefore two states are equivalent if register state is more conservative * and explored stack state is more conservative than the current one. * Example: * explored current * (slot1=INV slot2=MISC) == (slot1=MISC slot2=MISC) * (slot1=MISC slot2=MISC) != (slot1=INV slot2=MISC) * * In other words if current stack state (one being explored) has more * valid slots than old one that already passed validation, it means * the verifier can stop exploring and conclude that current state is valid too * * Similarly with registers. If explored state has register type as invalid * whereas register type in current state is meaningful, it means that * the current state will reach 'bpf_exit' instruction safely */ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old, struct bpf_func_state *cur, u32 insn_idx, enum exact_level exact) { u16 live_regs = env->insn_aux_data[insn_idx].live_regs_before; u16 i; if (old->callback_depth > cur->callback_depth) return false; for (i = 0; i < MAX_BPF_REG; i++) if (((1 << i) & live_regs) && !regsafe(env, &old->regs[i], &cur->regs[i], &env->idmap_scratch, exact)) return false; if (!stacksafe(env, old, cur, &env->idmap_scratch, exact)) return false; return true; } static void reset_idmap_scratch(struct bpf_verifier_env *env) { struct bpf_idmap *idmap = &env->idmap_scratch; idmap->tmp_id_gen = env->id_gen; idmap->cnt = 0; } static bool states_equal(struct bpf_verifier_env *env, struct bpf_verifier_state *old, struct bpf_verifier_state *cur, enum exact_level exact) { u32 insn_idx; int i; if (old->curframe != cur->curframe) return false; reset_idmap_scratch(env); /* Verification state from speculative execution simulation * must never prune a non-speculative execution one. 
*/ if (old->speculative && !cur->speculative) return false; if (old->in_sleepable != cur->in_sleepable) return false; if (!refsafe(old, cur, &env->idmap_scratch)) return false; /* for states to be equal callsites have to be the same * and all frame states need to be equivalent */ for (i = 0; i <= old->curframe; i++) { insn_idx = bpf_frame_insn_idx(old, i); if (old->frame[i]->callsite != cur->frame[i]->callsite) return false; if (!func_states_equal(env, old->frame[i], cur->frame[i], insn_idx, exact)) return false; } return true; } /* find precise scalars in the previous equivalent state and * propagate them into the current state */ static int propagate_precision(struct bpf_verifier_env *env, const struct bpf_verifier_state *old, struct bpf_verifier_state *cur, bool *changed) { struct bpf_reg_state *state_reg; struct bpf_func_state *state; int i, err = 0, fr; bool first; for (fr = old->curframe; fr >= 0; fr--) { state = old->frame[fr]; state_reg = state->regs; first = true; for (i = 0; i < BPF_REG_FP; i++, state_reg++) { if (state_reg->type != SCALAR_VALUE || !state_reg->precise) continue; if (env->log.level & BPF_LOG_LEVEL2) { if (first) verbose(env, "frame %d: propagating r%d", fr, i); else verbose(env, ",r%d", i); } bpf_bt_set_frame_reg(&env->bt, fr, i); first = false; } for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { if (!bpf_is_spilled_reg(&state->stack[i])) continue; state_reg = &state->stack[i].spilled_ptr; if (state_reg->type != SCALAR_VALUE || !state_reg->precise) continue; if (env->log.level & BPF_LOG_LEVEL2) { if (first) verbose(env, "frame %d: propagating fp%d", fr, (-i - 1) * BPF_REG_SIZE); else verbose(env, ",fp%d", (-i - 1) * BPF_REG_SIZE); } bpf_bt_set_frame_slot(&env->bt, fr, i); first = false; } if (!first && (env->log.level & BPF_LOG_LEVEL2)) verbose(env, "\n"); } err = bpf_mark_chain_precision(env, cur, -1, changed); if (err < 0) return err; return 0; } #define MAX_BACKEDGE_ITERS 64 /* Propagate read and precision marks from visit->backedges[*].state->equal_state * to corresponding parent states of visit->backedges[*].state until fixed point is reached, * then free visit->backedges. * After execution of this function incomplete_read_marks() will return false * for all states corresponding to @visit->callchain. 
*/ static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit) { struct bpf_scc_backedge *backedge; struct bpf_verifier_state *st; bool changed; int i, err; i = 0; do { if (i++ > MAX_BACKEDGE_ITERS) { if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "%s: too many iterations\n", __func__); for (backedge = visit->backedges; backedge; backedge = backedge->next) bpf_mark_all_scalars_precise(env, &backedge->state); break; } changed = false; for (backedge = visit->backedges; backedge; backedge = backedge->next) { st = &backedge->state; err = propagate_precision(env, st->equal_state, st, &changed); if (err) return err; } } while (changed); bpf_free_backedges(visit); return 0; } static bool states_maybe_looping(struct bpf_verifier_state *old, struct bpf_verifier_state *cur) { struct bpf_func_state *fold, *fcur; int i, fr = cur->curframe; if (old->curframe != fr) return false; fold = old->frame[fr]; fcur = cur->frame[fr]; for (i = 0; i < MAX_BPF_REG; i++) if (memcmp(&fold->regs[i], &fcur->regs[i], offsetof(struct bpf_reg_state, frameno))) return false; return true; } /* is_state_visited() handles iter_next() (see process_iter_next_call() for * terminology) calls specially: as opposed to bounded BPF loops, it *expects* * states to match, which otherwise would look like an infinite loop. So while * iter_next() calls are taken care of, we still need to be careful and * prevent erroneous and too eager declaration of "infinite loop", when * iterators are involved. * * Here's a situation in pseudo-BPF assembly form: * * 0: again: ; set up iter_next() call args * 1: r1 = &it ; <CHECKPOINT HERE> * 2: call bpf_iter_num_next ; this is iter_next() call * 3: if r0 == 0 goto done * 4: ... something useful here ... * 5: goto again ; another iteration * 6: done: * 7: r1 = &it * 8: call bpf_iter_num_destroy ; clean up iter state * 9: exit * * This is a typical loop. Let's assume that we have a prune point at 1:, * before we get to `call bpf_iter_num_next` (e.g., because of that `goto * again`, assuming other heuristics don't get in a way). * * When we first time come to 1:, let's say we have some state X. We proceed * to 2:, fork states, enqueue ACTIVE, validate NULL case successfully, exit. * Now we come back to validate that forked ACTIVE state. We proceed through * 3-5, come to goto, jump to 1:. Let's assume our state didn't change, so we * are converging. But the problem is that we don't know that yet, as this * convergence has to happen at iter_next() call site only. So if nothing is * done, at 1: verifier will use bounded loop logic and declare infinite * looping (and would be *technically* correct, if not for iterator's * "eventual sticky NULL" contract, see process_iter_next_call()). But we * don't want that. So what we do in process_iter_next_call() when we go on * another ACTIVE iteration, we bump slot->iter.depth, to mark that it's * a different iteration. So when we suspect an infinite loop, we additionally * check if any of the *ACTIVE* iterator states depths differ. If yes, we * pretend we are not looping and wait for next iter_next() call. * * This only applies to ACTIVE state. In DRAINED state we don't expect to * loop, because that would actually mean infinite loop, as DRAINED state is * "sticky", and so we'll keep returning into the same instruction with the * same state (at least in one of possible code paths). * * This approach allows to keep infinite loop heuristic even in the face of * active iterator. 
E.g., C snippet below is and will be detected as * infinitely looping: * * struct bpf_iter_num it; * int *p, x; * * bpf_iter_num_new(&it, 0, 10); * while ((p = bpf_iter_num_next(&t))) { * x = p; * while (x--) {} // <<-- infinite loop here * } * */ static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf_verifier_state *cur) { struct bpf_reg_state *slot, *cur_slot; struct bpf_func_state *state; int i, fr; for (fr = old->curframe; fr >= 0; fr--) { state = old->frame[fr]; for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { if (state->stack[i].slot_type[0] != STACK_ITER) continue; slot = &state->stack[i].spilled_ptr; if (slot->iter.state != BPF_ITER_STATE_ACTIVE) continue; cur_slot = &cur->frame[fr]->stack[i].spilled_ptr; if (cur_slot->iter.depth != slot->iter.depth) return true; } } return false; } static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { struct bpf_func_state *func; struct bpf_reg_state *reg; int i, j; for (i = 0; i <= st->curframe; i++) { func = st->frame[i]; for (j = 0; j < BPF_REG_FP; j++) { reg = &func->regs[j]; if (reg->type != SCALAR_VALUE) continue; reg->precise = false; } for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { if (!bpf_is_spilled_reg(&func->stack[j])) continue; reg = &func->stack[j].spilled_ptr; if (reg->type != SCALAR_VALUE) continue; reg->precise = false; } } } int bpf_is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; struct bpf_verifier_state *cur = env->cur_state, *new; bool force_new_state, add_new_state, loop; int n, err, states_cnt = 0; struct list_head *pos, *tmp, *head; force_new_state = env->test_state_freq || bpf_is_force_checkpoint(env, insn_idx) || /* Avoid accumulating infinitely long jmp history */ cur->jmp_history_cnt > 40; /* bpf progs typically have pruning point every 4 instructions * http://vger.kernel.org/bpfconf2019.html#session-1 * Do not add new state for future pruning if the verifier hasn't seen * at least 2 jumps and at least 8 instructions. * This heuristics helps decrease 'total_states' and 'peak_states' metric. * In tests that amounts to up to 50% reduction into total verifier * memory consumption and 20% verifier time speedup. */ add_new_state = force_new_state; if (env->jmps_processed - env->prev_jmps_processed >= 2 && env->insn_processed - env->prev_insn_processed >= 8) add_new_state = true; /* keep cleaning the current state as registers/stack become dead */ err = clean_verifier_state(env, cur); if (err) return err; loop = false; head = bpf_explored_state(env, insn_idx); list_for_each_safe(pos, tmp, head) { sl = container_of(pos, struct bpf_verifier_state_list, node); states_cnt++; if (sl->state.insn_idx != insn_idx) continue; if (sl->state.branches) { struct bpf_func_state *frame = sl->state.frame[sl->state.curframe]; if (frame->in_async_callback_fn && frame->async_entry_cnt != cur->frame[cur->curframe]->async_entry_cnt) { /* Different async_entry_cnt means that the verifier is * processing another entry into async callback. * Seeing the same state is not an indication of infinite * loop or infinite recursion. * But finding the same state doesn't mean that it's safe * to stop processing the current state. The previous state * hasn't yet reached bpf_exit, since state.branches > 0. * Checking in_async_callback_fn alone is not enough either. * Since the verifier still needs to catch infinite loops * inside async callbacks. 
*/ goto skip_inf_loop_check; } /* BPF open-coded iterators loop detection is special. * states_maybe_looping() logic is too simplistic in detecting * states that *might* be equivalent, because it doesn't know * about ID remapping, so don't even perform it. * See process_iter_next_call() and iter_active_depths_differ() * for overview of the logic. When current and one of parent * states are detected as equivalent, it's a good thing: we prove * convergence and can stop simulating further iterations. * It's safe to assume that iterator loop will finish, taking into * account iter_next() contract of eventually returning * sticky NULL result. * * Note, that states have to be compared exactly in this case because * read and precision marks might not be finalized inside the loop. * E.g. as in the program below: * * 1. r7 = -16 * 2. r6 = bpf_get_prandom_u32() * 3. while (bpf_iter_num_next(&fp[-8])) { * 4. if (r6 != 42) { * 5. r7 = -32 * 6. r6 = bpf_get_prandom_u32() * 7. continue * 8. } * 9. r0 = r10 * 10. r0 += r7 * 11. r8 = *(u64 *)(r0 + 0) * 12. r6 = bpf_get_prandom_u32() * 13. } * * Here verifier would first visit path 1-3, create a checkpoint at 3 * with r7=-16, continue to 4-7,3. Existing checkpoint at 3 does * not have read or precision mark for r7 yet, thus inexact states * comparison would discard current state with r7=-32 * => unsafe memory access at 11 would not be caught. */ if (is_iter_next_insn(env, insn_idx)) { if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) { struct bpf_func_state *cur_frame; struct bpf_reg_state *iter_state, *iter_reg; int spi; cur_frame = cur->frame[cur->curframe]; /* btf_check_iter_kfuncs() enforces that * iter state pointer is always the first arg */ iter_reg = &cur_frame->regs[BPF_REG_1]; /* current state is valid due to states_equal(), * so we can assume valid iter and reg state, * no need for extra (re-)validations */ spi = bpf_get_spi(iter_reg->var_off.value); iter_state = &bpf_func(env, iter_reg)->stack[spi].spilled_ptr; if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) { loop = true; goto hit; } } goto skip_inf_loop_check; } if (is_may_goto_insn_at(env, insn_idx)) { if (sl->state.may_goto_depth != cur->may_goto_depth && states_equal(env, &sl->state, cur, RANGE_WITHIN)) { loop = true; goto hit; } } if (bpf_calls_callback(env, insn_idx)) { if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) { loop = true; goto hit; } goto skip_inf_loop_check; } /* attempt to detect infinite loop to avoid unnecessary doomed work */ if (states_maybe_looping(&sl->state, cur) && states_equal(env, &sl->state, cur, EXACT) && !iter_active_depths_differ(&sl->state, cur) && sl->state.may_goto_depth == cur->may_goto_depth && sl->state.callback_unroll_depth == cur->callback_unroll_depth) { verbose_linfo(env, insn_idx, "; "); verbose(env, "infinite loop detected at insn %d\n", insn_idx); verbose(env, "cur state:"); print_verifier_state(env, cur, cur->curframe, true); verbose(env, "old state:"); print_verifier_state(env, &sl->state, cur->curframe, true); return -EINVAL; } /* if the verifier is processing a loop, avoid adding new state * too often, since different loop iterations have distinct * states and may not help future pruning. * This threshold shouldn't be too low to make sure that * a loop with large bound will be rejected quickly. * The most abusive loop will be: * r1 += 1 * if r1 < 1000000 goto pc-2 * 1M insn_procssed limit / 100 == 10k peak states. 
* This threshold shouldn't be too high either, since states * at the end of the loop are likely to be useful in pruning. */ skip_inf_loop_check: if (!force_new_state && env->jmps_processed - env->prev_jmps_processed < 20 && env->insn_processed - env->prev_insn_processed < 100) add_new_state = false; goto miss; } /* See comments for mark_all_regs_read_and_precise() */ loop = incomplete_read_marks(env, &sl->state); if (states_equal(env, &sl->state, cur, loop ? RANGE_WITHIN : NOT_EXACT)) { hit: sl->hit_cnt++; /* if previous state reached the exit with precision and * current state is equivalent to it (except precision marks) * the precision needs to be propagated back in * the current state. */ err = 0; if (bpf_is_jmp_point(env, env->insn_idx)) err = bpf_push_jmp_history(env, cur, 0, 0); err = err ? : propagate_precision(env, &sl->state, cur, NULL); if (err) return err; /* When processing iterator based loops above propagate_liveness and * propagate_precision calls are not sufficient to transfer all relevant * read and precision marks. E.g. consider the following case: * * .-> A --. Assume the states are visited in the order A, B, C. * | | | Assume that state B reaches a state equivalent to state A. * | v v At this point, state C is not processed yet, so state A * '-- B C has not received any read or precision marks from C. * Thus, marks propagated from A to B are incomplete. * * The verifier mitigates this by performing the following steps: * * - Prior to the main verification pass, strongly connected components * (SCCs) are computed over the program's control flow graph, * intraprocedurally. * * - During the main verification pass, `maybe_enter_scc()` checks * whether the current verifier state is entering an SCC. If so, an * instance of a `bpf_scc_visit` object is created, and the state * entering the SCC is recorded as the entry state. * * - This instance is associated not with the SCC itself, but with a * `bpf_scc_callchain`: a tuple consisting of the call sites leading to * the SCC and the SCC id. See `compute_scc_callchain()`. * * - When a verification path encounters a `states_equal(..., * RANGE_WITHIN)` condition, there exists a call chain describing the * current state and a corresponding `bpf_scc_visit` instance. A copy * of the current state is created and added to * `bpf_scc_visit->backedges`. * * - When a verification path terminates, `maybe_exit_scc()` is called * from `bpf_update_branch_counts()`. For states with `branches == 0`, it * checks whether the state is the entry state of any `bpf_scc_visit` * instance. If it is, this indicates that all paths originating from * this SCC visit have been explored. `propagate_backedges()` is then * called, which propagates read and precision marks through the * backedges until a fixed point is reached. * (In the earlier example, this would propagate marks from A to B, * from C to A, and then again from A to B.) * * A note on callchains * -------------------- * * Consider the following example: * * void foo() { loop { ... SCC#1 ... } } * void main() { * A: foo(); * B: ... * C: foo(); * } * * Here, there are two distinct callchains leading to SCC#1: * - (A, SCC#1) * - (C, SCC#1) * * Each callchain identifies a separate `bpf_scc_visit` instance that * accumulates backedge states. The `propagate_{liveness,precision}()` * functions traverse the parent state of each backedge state, which * means these parent states must remain valid (i.e., not freed) while * the corresponding `bpf_scc_visit` instance exists. 
* * Associating `bpf_scc_visit` instances directly with SCCs instead of * callchains would break this invariant: * - States explored during `C: foo()` would contribute backedges to * SCC#1, but SCC#1 would only be exited once the exploration of * `A: foo()` completes. * - By that time, the states explored between `A: foo()` and `C: foo()` * (i.e., `B: ...`) may have already been freed, causing the parent * links for states from `C: foo()` to become invalid. */ if (loop) { struct bpf_scc_backedge *backedge; backedge = kzalloc_obj(*backedge, GFP_KERNEL_ACCOUNT); if (!backedge) return -ENOMEM; err = bpf_copy_verifier_state(&backedge->state, cur); backedge->state.equal_state = &sl->state; backedge->state.insn_idx = insn_idx; err = err ?: add_scc_backedge(env, &sl->state, backedge); if (err) { bpf_free_verifier_state(&backedge->state, false); kfree(backedge); return err; } } return 1; } miss: /* when new state is not going to be added do not increase miss count. * Otherwise several loop iterations will remove the state * recorded earlier. The goal of these heuristics is to have * states from some iterations of the loop (some in the beginning * and some at the end) to help pruning. */ if (add_new_state) sl->miss_cnt++; /* heuristic to determine whether this state is beneficial * to keep checking from state equivalence point of view. * Higher numbers increase max_states_per_insn and verification time, * but do not meaningfully decrease insn_processed. * 'n' controls how many times state could miss before eviction. * Use bigger 'n' for checkpoints because evicting checkpoint states * too early would hinder iterator convergence. */ n = bpf_is_force_checkpoint(env, insn_idx) && sl->state.branches > 0 ? 64 : 3; if (sl->miss_cnt > sl->hit_cnt * n + n) { /* the state is unlikely to be useful. Remove it to * speed up verification */ sl->in_free_list = true; list_del(&sl->node); list_add(&sl->node, &env->free_list); env->free_list_size++; env->explored_states_size--; maybe_free_verifier_state(env, sl); } } if (env->max_states_per_insn < states_cnt) env->max_states_per_insn = states_cnt; if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) return 0; if (!add_new_state) return 0; /* There were no equivalent states, remember the current one. * Technically the current state is not proven to be safe yet, * but it will either reach outer most bpf_exit (which means it's safe) * or it will be rejected. When there are no loops the verifier won't be * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx) * again on the way to bpf_exit. * When looping the sl->state.branches will be > 0 and this state * will not be considered for equivalence until branches == 0. 
 */
	new_sl = kzalloc_obj(struct bpf_verifier_state_list, GFP_KERNEL_ACCOUNT);
	if (!new_sl)
		return -ENOMEM;
	env->total_states++;
	env->explored_states_size++;
	update_peak_states(env);
	env->prev_jmps_processed = env->jmps_processed;
	env->prev_insn_processed = env->insn_processed;

	/* forget precise markings we inherited, see __mark_chain_precision */
	if (env->bpf_capable)
		mark_all_scalars_imprecise(env, cur);
	bpf_clear_singular_ids(env, cur);

	/* add new state to the head of linked list */
	new = &new_sl->state;
	err = bpf_copy_verifier_state(new, cur);
	if (err) {
		bpf_free_verifier_state(new, false);
		kfree(new_sl);
		return err;
	}
	new->insn_idx = insn_idx;
	verifier_bug_if(new->branches != 1, env,
			"%s:branches_to_explore=%d insn %d",
			__func__, new->branches, insn_idx);
	err = maybe_enter_scc(env, new);
	if (err) {
		bpf_free_verifier_state(new, false);
		kfree(new_sl);
		return err;
	}
	cur->parent = new;
	cur->first_insn_idx = insn_idx;
	cur->dfs_depth = new->dfs_depth + 1;
	bpf_clear_jmp_history(cur);
	list_add(&new_sl->node, head);
	return 0;
}
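check_ids() above enforces a consistent old-to-current ID mapping during state comparison: the first occurrence of an old ID binds it to a current ID, later occurrences of that old ID must map to the same current ID, and a current ID cannot satisfy two different old IDs. The standalone sketch below models just that rule in userspace; the struct names, map size, and main() driver are simplified stand-ins for this example, not the verifier's actual bpf_idmap machinery.

/*
 * Minimal userspace model of the old-id -> cur-id mapping rule used by
 * check_ids() during verifier state comparison. Illustrative only.
 */
#include <stdbool.h>
#include <stdio.h>

#define ID_MAP_SIZE 16

struct id_pair { unsigned int old, cur; };

struct idmap {
	struct id_pair map[ID_MAP_SIZE];
	unsigned int cnt;
};

static bool ids_match(unsigned int old_id, unsigned int cur_id, struct idmap *idmap)
{
	unsigned int i;

	/* either both IDs are set or both are zero */
	if (!!old_id != !!cur_id)
		return false;
	if (old_id == 0)
		return true;
	for (i = 0; i < idmap->cnt; i++) {
		/* old id already bound: new sighting must agree */
		if (idmap->map[i].old == old_id)
			return idmap->map[i].cur == cur_id;
		/* cur id already used by a different old id */
		if (idmap->map[i].cur == cur_id)
			return false;
	}
	if (idmap->cnt == ID_MAP_SIZE)
		return false;
	/* first sighting: record the binding */
	idmap->map[idmap->cnt].old = old_id;
	idmap->map[idmap->cnt].cur = cur_id;
	idmap->cnt++;
	return true;
}

int main(void)
{
	struct idmap m = {};

	/* old r6.id=5 / cur r6.id=9: first sighting, binds 5 -> 9 */
	printf("%d\n", ids_match(5, 9, &m));	/* 1 */
	/* old r7.id=5 must also map to 9 for the states to be equivalent */
	printf("%d\n", ids_match(5, 9, &m));	/* 1 */
	/* old r8.id=5 mapping to a different cur id breaks equivalence */
	printf("%d\n", ids_match(5, 7, &m));	/* 0 */
	return 0;
}

This mirrors the example in the regsafe() comment above: two old registers sharing id b are only considered matched by two current registers that also share a single id between them.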
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/scatterlist.h>
#include <linux/mutex.h>
#include <linux/timer.h>
#include <linux/usb.h>

#define SIMPLE_IO_TIMEOUT	10000	/* in milliseconds */

/*-------------------------------------------------------------------------*/

static int override_alt = -1;
module_param_named(alt, override_alt, int, 0644);
MODULE_PARM_DESC(alt, ">= 0 to override altsetting selection");
static void complicated_callback(struct urb *urb);

/*-------------------------------------------------------------------------*/

/* FIXME make these public somewhere; usbdevfs.h? */

/* Parameter for usbtest driver. */
struct usbtest_param_32 {
	/* inputs */
	__u32		test_num;	/* 0..(TEST_CASES-1) */
	__u32		iterations;
	__u32		length;
	__u32		vary;
	__u32		sglen;

	/* outputs */
	__s32		duration_sec;
	__s32		duration_usec;
};

/*
 * Compat parameter to the usbtest driver.
 * This supports older user space binaries compiled with 64 bit compiler.
 */
struct usbtest_param_64 {
	/* inputs */
	__u32		test_num;	/* 0..(TEST_CASES-1) */
	__u32		iterations;
	__u32		length;
	__u32		vary;
	__u32		sglen;

	/* outputs */
	__s64		duration_sec;
	__s64		duration_usec;
};

/* IOCTL interface to the driver. */
#define USBTEST_REQUEST_32	_IOWR('U', 100, struct usbtest_param_32)

/* COMPAT IOCTL interface to the driver. */
#define USBTEST_REQUEST_64	_IOWR('U', 100, struct usbtest_param_64)

/*-------------------------------------------------------------------------*/

#define	GENERIC		/* let probe() bind using module params */

/* Some devices that can be used for testing will have "real" drivers.
 * Entries for those need to be enabled here by hand, after disabling
 * that "real" driver.
 */
//#define	IBOT2		/* grab iBOT2 webcams */
//#define	KEYSPAN_19Qi	/* grab un-renumerated serial adapter */

/*-------------------------------------------------------------------------*/

struct usbtest_info {
	const char		*name;
	u8			ep_in;		/* bulk/intr source */
	u8			ep_out;		/* bulk/intr sink */
	unsigned		autoconf:1;
	unsigned		ctrl_out:1;
	unsigned		iso:1;		/* try iso in/out */
	unsigned		intr:1;		/* try interrupt in/out */
	int			alt;
};

/* this is accessed only through usbfs ioctl calls.
 * one ioctl to issue a test ... one lock per device.
 * tests create other threads if they need them.
 * urbs and buffers are allocated dynamically,
 * and data generated deterministically.
 */
struct usbtest_dev {
	struct usb_interface	*intf;
	struct usbtest_info	*info;
	int			in_pipe;
	int			out_pipe;
	int			in_iso_pipe;
	int			out_iso_pipe;
	int			in_int_pipe;
	int			out_int_pipe;
	struct usb_endpoint_descriptor	*iso_in, *iso_out;
	struct usb_endpoint_descriptor	*int_in, *int_out;
	struct mutex		lock;

#define TBUF_SIZE	256
	u8			*buf;
};

static struct usb_device *testdev_to_usbdev(struct usbtest_dev *test)
{
	return interface_to_usbdev(test->intf);
}

/* set up all urbs so they can be used with either bulk or interrupt */
#define	INTERRUPT_RATE		1	/* msec/transfer */

#define ERROR(tdev, fmt, args...) \
	dev_err(&(tdev)->intf->dev , fmt , ## args)
#define WARNING(tdev, fmt, args...)
\ dev_warn(&(tdev)->intf->dev , fmt , ## args) #define GUARD_BYTE 0xA5 #define MAX_SGLEN 128 /*-------------------------------------------------------------------------*/ static inline void endpoint_update(int edi, struct usb_host_endpoint **in, struct usb_host_endpoint **out, struct usb_host_endpoint *e) { if (edi) { if (!*in) *in = e; } else { if (!*out) *out = e; } } static int get_endpoints(struct usbtest_dev *dev, struct usb_interface *intf) { int tmp; struct usb_host_interface *alt; struct usb_host_endpoint *in, *out; struct usb_host_endpoint *iso_in, *iso_out; struct usb_host_endpoint *int_in, *int_out; struct usb_device *udev; for (tmp = 0; tmp < intf->num_altsetting; tmp++) { unsigned ep; in = out = NULL; iso_in = iso_out = NULL; int_in = int_out = NULL; alt = intf->altsetting + tmp; if (override_alt >= 0 && override_alt != alt->desc.bAlternateSetting) continue; /* take the first altsetting with in-bulk + out-bulk; * ignore other endpoints and altsettings. */ for (ep = 0; ep < alt->desc.bNumEndpoints; ep++) { struct usb_host_endpoint *e; int edi; e = alt->endpoint + ep; edi = usb_endpoint_dir_in(&e->desc); switch (usb_endpoint_type(&e->desc)) { case USB_ENDPOINT_XFER_BULK: endpoint_update(edi, &in, &out, e); continue; case USB_ENDPOINT_XFER_INT: if (dev->info->intr) endpoint_update(edi, &int_in, &int_out, e); continue; case USB_ENDPOINT_XFER_ISOC: if (dev->info->iso) endpoint_update(edi, &iso_in, &iso_out, e); fallthrough; default: continue; } } if ((in && out) || iso_in || iso_out || int_in || int_out) goto found; } return -EINVAL; found: udev = testdev_to_usbdev(dev); dev->info->alt = alt->desc.bAlternateSetting; if (alt->desc.bAlternateSetting != 0) { tmp = usb_set_interface(udev, alt->desc.bInterfaceNumber, alt->desc.bAlternateSetting); if (tmp < 0) return tmp; } if (in) dev->in_pipe = usb_rcvbulkpipe(udev, in->desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); if (out) dev->out_pipe = usb_sndbulkpipe(udev, out->desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); if (iso_in) { dev->iso_in = &iso_in->desc; dev->in_iso_pipe = usb_rcvisocpipe(udev, iso_in->desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); } if (iso_out) { dev->iso_out = &iso_out->desc; dev->out_iso_pipe = usb_sndisocpipe(udev, iso_out->desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); } if (int_in) { dev->int_in = &int_in->desc; dev->in_int_pipe = usb_rcvintpipe(udev, int_in->desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); } if (int_out) { dev->int_out = &int_out->desc; dev->out_int_pipe = usb_sndintpipe(udev, int_out->desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); } return 0; } /*-------------------------------------------------------------------------*/ /* Support for testing basic non-queued I/O streams. * * These just package urbs as requests that can be easily canceled. * Each urb's data buffer is dynamically allocated; callers can fill * them with non-zero test data (or test for it) when appropriate. */ static void simple_callback(struct urb *urb) { complete(urb->context); } static struct urb *usbtest_alloc_urb( struct usb_device *udev, int pipe, unsigned long bytes, unsigned transfer_flags, unsigned offset, u8 bInterval, usb_complete_t complete_fn) { struct urb *urb; urb = usb_alloc_urb(0, GFP_KERNEL); if (!urb) return urb; if (bInterval) usb_fill_int_urb(urb, udev, pipe, NULL, bytes, complete_fn, NULL, bInterval); else usb_fill_bulk_urb(urb, udev, pipe, NULL, bytes, complete_fn, NULL); urb->interval = (udev->speed == USB_SPEED_HIGH) ? 
(INTERRUPT_RATE << 3) : INTERRUPT_RATE; urb->transfer_flags = transfer_flags; if (usb_pipein(pipe)) urb->transfer_flags |= URB_SHORT_NOT_OK; if ((bytes + offset) == 0) return urb; if (urb->transfer_flags & URB_NO_TRANSFER_DMA_MAP) urb->transfer_buffer = usb_alloc_coherent(udev, bytes + offset, GFP_KERNEL, &urb->transfer_dma); else urb->transfer_buffer = kmalloc(bytes + offset, GFP_KERNEL); if (!urb->transfer_buffer) { usb_free_urb(urb); return NULL; } /* To test unaligned transfers add an offset and fill the unused memory with a guard value */ if (offset) { memset(urb->transfer_buffer, GUARD_BYTE, offset); urb->transfer_buffer += offset; if (urb->transfer_flags & URB_NO_TRANSFER_DMA_MAP) urb->transfer_dma += offset; } /* For inbound transfers use guard byte so that test fails if data not correctly copied */ memset(urb->transfer_buffer, usb_pipein(urb->pipe) ? GUARD_BYTE : 0, bytes); return urb; } static struct urb *simple_alloc_urb( struct usb_device *udev, int pipe, unsigned long bytes, u8 bInterval) { return usbtest_alloc_urb(udev, pipe, bytes, URB_NO_TRANSFER_DMA_MAP, 0, bInterval, simple_callback); } static struct urb *complicated_alloc_urb( struct usb_device *udev, int pipe, unsigned long bytes, u8 bInterval) { return usbtest_alloc_urb(udev, pipe, bytes, URB_NO_TRANSFER_DMA_MAP, 0, bInterval, complicated_callback); } static unsigned pattern; static unsigned mod_pattern; module_param_named(pattern, mod_pattern, uint, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(mod_pattern, "i/o pattern (0 == zeroes)"); static unsigned get_maxpacket(struct usb_device *udev, int pipe) { struct usb_host_endpoint *ep; ep = usb_pipe_endpoint(udev, pipe); return le16_to_cpup(&ep->desc.wMaxPacketSize); } static int ss_isoc_get_packet_num(struct usb_device *udev, int pipe) { struct usb_host_endpoint *ep = usb_pipe_endpoint(udev, pipe); return USB_SS_MULT(ep->ss_ep_comp.bmAttributes) * (1 + ep->ss_ep_comp.bMaxBurst); } static void simple_fill_buf(struct urb *urb) { unsigned i; u8 *buf = urb->transfer_buffer; unsigned len = urb->transfer_buffer_length; unsigned maxpacket; switch (pattern) { default: fallthrough; case 0: memset(buf, 0, len); break; case 1: /* mod63 */ maxpacket = get_maxpacket(urb->dev, urb->pipe); for (i = 0; i < len; i++) *buf++ = (u8) ((i % maxpacket) % 63); break; } } static inline unsigned long buffer_offset(void *buf) { return (unsigned long)buf & (ARCH_KMALLOC_MINALIGN - 1); } static int check_guard_bytes(struct usbtest_dev *tdev, struct urb *urb) { u8 *buf = urb->transfer_buffer; u8 *guard = buf - buffer_offset(buf); unsigned i; for (i = 0; guard < buf; i++, guard++) { if (*guard != GUARD_BYTE) { ERROR(tdev, "guard byte[%d] %d (not %d)\n", i, *guard, GUARD_BYTE); return -EINVAL; } } return 0; } static int simple_check_buf(struct usbtest_dev *tdev, struct urb *urb) { unsigned i; u8 expected; u8 *buf = urb->transfer_buffer; unsigned len = urb->actual_length; unsigned maxpacket = get_maxpacket(urb->dev, urb->pipe); int ret = check_guard_bytes(tdev, urb); if (ret) return ret; for (i = 0; i < len; i++, buf++) { switch (pattern) { /* all-zeroes has no synchronization issues */ case 0: expected = 0; break; /* mod63 stays in sync with short-terminated transfers, * or otherwise when host and gadget agree on how large * each usb transfer request should be. resync is done * with set_interface or set_config. 
*/ case 1: /* mod63 */ expected = (i % maxpacket) % 63; break; /* always fail unsupported patterns */ default: expected = !*buf; break; } if (*buf == expected) continue; ERROR(tdev, "buf[%d] = %d (not %d)\n", i, *buf, expected); return -EINVAL; } return 0; } static void simple_free_urb(struct urb *urb) { unsigned long offset = buffer_offset(urb->transfer_buffer); if (urb->transfer_flags & URB_NO_TRANSFER_DMA_MAP) usb_free_coherent( urb->dev, urb->transfer_buffer_length + offset, urb->transfer_buffer - offset, urb->transfer_dma - offset); else kfree(urb->transfer_buffer - offset); usb_free_urb(urb); } static int simple_io( struct usbtest_dev *tdev, struct urb *urb, int iterations, int vary, int expected, const char *label ) { struct usb_device *udev = urb->dev; int max = urb->transfer_buffer_length; struct completion completion; int retval = 0; unsigned long expire; urb->context = &completion; while (retval == 0 && iterations-- > 0) { init_completion(&completion); if (usb_pipeout(urb->pipe)) { simple_fill_buf(urb); urb->transfer_flags |= URB_ZERO_PACKET; } retval = usb_submit_urb(urb, GFP_KERNEL); if (retval != 0) break; expire = msecs_to_jiffies(SIMPLE_IO_TIMEOUT); if (!wait_for_completion_timeout(&completion, expire)) { usb_kill_urb(urb); retval = (urb->status == -ENOENT ? -ETIMEDOUT : urb->status); } else { retval = urb->status; } urb->dev = udev; if (retval == 0 && usb_pipein(urb->pipe)) retval = simple_check_buf(tdev, urb); if (vary) { int len = urb->transfer_buffer_length; len += vary; len %= max; if (len == 0) len = (vary < max) ? vary : max; urb->transfer_buffer_length = len; } /* FIXME if endpoint halted, clear halt (and log) */ } urb->transfer_buffer_length = max; if (expected != retval) dev_err(&udev->dev, "%s failed, iterations left %d, status %d (not %d)\n", label, iterations, retval, expected); return retval; } /*-------------------------------------------------------------------------*/ /* We use scatterlist primitives to test queued I/O. * Yes, this also tests the scatterlist primitives. */ static void free_sglist(struct scatterlist *sg, int nents) { unsigned i; if (!sg) return; for (i = 0; i < nents; i++) { if (!sg_page(&sg[i])) continue; kfree(sg_virt(&sg[i])); } kfree(sg); } static struct scatterlist * alloc_sglist(int nents, int max, int vary, struct usbtest_dev *dev, int pipe) { struct scatterlist *sg; unsigned int n_size = 0; unsigned i; unsigned size = max; unsigned maxpacket = get_maxpacket(interface_to_usbdev(dev->intf), pipe); if (max == 0) return NULL; sg = kmalloc_objs(*sg, nents); if (!sg) return NULL; sg_init_table(sg, nents); for (i = 0; i < nents; i++) { char *buf; unsigned j; buf = kzalloc(size, GFP_KERNEL); if (!buf) { free_sglist(sg, i); return NULL; } /* kmalloc pages are always physically contiguous! */ sg_set_buf(&sg[i], buf, size); switch (pattern) { case 0: /* already zeroed */ break; case 1: for (j = 0; j < size; j++) *buf++ = (u8) (((j + n_size) % maxpacket) % 63); n_size += size; break; } if (vary) { size += vary; size %= max; if (size == 0) size = (vary < max) ? 
vary : max; } } return sg; } struct sg_timeout { struct timer_list timer; struct usb_sg_request *req; }; static void sg_timeout(struct timer_list *t) { struct sg_timeout *timeout = timer_container_of(timeout, t, timer); usb_sg_cancel(timeout->req); } static int perform_sglist( struct usbtest_dev *tdev, unsigned iterations, int pipe, struct usb_sg_request *req, struct scatterlist *sg, int nents ) { struct usb_device *udev = testdev_to_usbdev(tdev); int retval = 0; struct sg_timeout timeout = { .req = req, }; timer_setup_on_stack(&timeout.timer, sg_timeout, 0); while (retval == 0 && iterations-- > 0) { retval = usb_sg_init(req, udev, pipe, (udev->speed == USB_SPEED_HIGH) ? (INTERRUPT_RATE << 3) : INTERRUPT_RATE, sg, nents, 0, GFP_KERNEL); if (retval) break; mod_timer(&timeout.timer, jiffies + msecs_to_jiffies(SIMPLE_IO_TIMEOUT)); usb_sg_wait(req); if (!timer_delete_sync(&timeout.timer)) retval = -ETIMEDOUT; else retval = req->status; timer_destroy_on_stack(&timeout.timer); /* FIXME check resulting data pattern */ /* FIXME if endpoint halted, clear halt (and log) */ } /* FIXME for unlink or fault handling tests, don't report * failure if retval is as we expected ... */ if (retval) ERROR(tdev, "perform_sglist failed, " "iterations left %d, status %d\n", iterations, retval); return retval; } /*-------------------------------------------------------------------------*/ /* unqueued control message testing * * there's a nice set of device functional requirements in chapter 9 of the * usb 2.0 spec, which we can apply to ANY device, even ones that don't use * special test firmware. * * we know the device is configured (or suspended) by the time it's visible * through usbfs. we can't change that, so we won't test enumeration (which * worked 'well enough' to get here, this time), power management (ditto), * or remote wakeup (which needs human interaction). 
*/ static unsigned realworld = 1; module_param(realworld, uint, 0); MODULE_PARM_DESC(realworld, "clear to demand stricter spec compliance"); static int get_altsetting(struct usbtest_dev *dev) { struct usb_interface *iface = dev->intf; struct usb_device *udev = interface_to_usbdev(iface); int retval; retval = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), USB_REQ_GET_INTERFACE, USB_DIR_IN|USB_RECIP_INTERFACE, 0, iface->altsetting[0].desc.bInterfaceNumber, dev->buf, 1, USB_CTRL_GET_TIMEOUT); switch (retval) { case 1: return dev->buf[0]; case 0: retval = -ERANGE; fallthrough; default: return retval; } } static int set_altsetting(struct usbtest_dev *dev, int alternate) { struct usb_interface *iface = dev->intf; struct usb_device *udev; if (alternate < 0 || alternate >= 256) return -EINVAL; udev = interface_to_usbdev(iface); return usb_set_interface(udev, iface->altsetting[0].desc.bInterfaceNumber, alternate); } static int is_good_config(struct usbtest_dev *tdev, int len) { struct usb_config_descriptor *config; if (len < (int)sizeof(*config)) return 0; config = (struct usb_config_descriptor *) tdev->buf; switch (config->bDescriptorType) { case USB_DT_CONFIG: case USB_DT_OTHER_SPEED_CONFIG: if (config->bLength != 9) { ERROR(tdev, "bogus config descriptor length\n"); return 0; } /* this bit 'must be 1' but often isn't */ if (!realworld && !(config->bmAttributes & 0x80)) { ERROR(tdev, "high bit of config attributes not set\n"); return 0; } if (config->bmAttributes & 0x1f) { /* reserved == 0 */ ERROR(tdev, "reserved config bits set\n"); return 0; } break; default: return 0; } if (le16_to_cpu(config->wTotalLength) == len) /* read it all */ return 1; if (le16_to_cpu(config->wTotalLength) >= TBUF_SIZE) /* max partial read */ return 1; ERROR(tdev, "bogus config descriptor read size\n"); return 0; } static int is_good_ext(struct usbtest_dev *tdev, u8 *buf) { struct usb_ext_cap_descriptor *ext; u32 attr; ext = (struct usb_ext_cap_descriptor *) buf; if (ext->bLength != USB_DT_USB_EXT_CAP_SIZE) { ERROR(tdev, "bogus usb 2.0 extension descriptor length\n"); return 0; } attr = le32_to_cpu(ext->bmAttributes); /* bits[1:15] is used and others are reserved */ if (attr & ~0xfffe) { /* reserved == 0 */ ERROR(tdev, "reserved bits set\n"); return 0; } return 1; } static int is_good_ss_cap(struct usbtest_dev *tdev, u8 *buf) { struct usb_ss_cap_descriptor *ss; ss = (struct usb_ss_cap_descriptor *) buf; if (ss->bLength != USB_DT_USB_SS_CAP_SIZE) { ERROR(tdev, "bogus superspeed device capability descriptor length\n"); return 0; } /* * only bit[1] of bmAttributes is used for LTM and others are * reserved */ if (ss->bmAttributes & ~0x02) { /* reserved == 0 */ ERROR(tdev, "reserved bits set in bmAttributes\n"); return 0; } /* bits[0:3] of wSpeedSupported is used and others are reserved */ if (le16_to_cpu(ss->wSpeedSupported) & ~0x0f) { /* reserved == 0 */ ERROR(tdev, "reserved bits set in wSpeedSupported\n"); return 0; } return 1; } static int is_good_con_id(struct usbtest_dev *tdev, u8 *buf) { struct usb_ss_container_id_descriptor *con_id; con_id = (struct usb_ss_container_id_descriptor *) buf; if (con_id->bLength != USB_DT_USB_SS_CONTN_ID_SIZE) { ERROR(tdev, "bogus container id descriptor length\n"); return 0; } if (con_id->bReserved) { /* reserved == 0 */ ERROR(tdev, "reserved bits set\n"); return 0; } return 1; } /* sanity test for standard requests working with usb_control_mesg() and some * of the utility functions which use it. 
* * this doesn't test how endpoint halts behave or data toggles get set, since * we won't do I/O to bulk/interrupt endpoints here (which is how to change * halt or toggle). toggle testing is impractical without support from hcds. * * this avoids failing devices linux would normally work with, by not testing * config/altsetting operations for devices that only support their defaults. * such devices rarely support those needless operations. * * NOTE that since this is a sanity test, it's not examining boundary cases * to see if usbcore, hcd, and device all behave right. such testing would * involve varied read sizes and other operation sequences. */ static int ch9_postconfig(struct usbtest_dev *dev) { struct usb_interface *iface = dev->intf; struct usb_device *udev = interface_to_usbdev(iface); int i, alt, retval; /* [9.2.3] if there's more than one altsetting, we need to be able to * set and get each one. mostly trusts the descriptors from usbcore. */ for (i = 0; i < iface->num_altsetting; i++) { /* 9.2.3 constrains the range here */ alt = iface->altsetting[i].desc.bAlternateSetting; if (alt < 0 || alt >= iface->num_altsetting) { dev_err(&iface->dev, "invalid alt [%d].bAltSetting = %d\n", i, alt); } /* [real world] get/set unimplemented if there's only one */ if (realworld && iface->num_altsetting == 1) continue; /* [9.4.10] set_interface */ retval = set_altsetting(dev, alt); if (retval) { dev_err(&iface->dev, "can't set_interface = %d, %d\n", alt, retval); return retval; } /* [9.4.4] get_interface always works */ retval = get_altsetting(dev); if (retval != alt) { dev_err(&iface->dev, "get alt should be %d, was %d\n", alt, retval); return (retval < 0) ? retval : -EDOM; } } /* [real world] get_config unimplemented if there's only one */ if (!realworld || udev->descriptor.bNumConfigurations != 1) { int expected = udev->actconfig->desc.bConfigurationValue; /* [9.4.2] get_configuration always works * ... although some cheap devices (like one TI Hub I've got) * won't return config descriptors except before set_config. */ retval = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), USB_REQ_GET_CONFIGURATION, USB_DIR_IN | USB_RECIP_DEVICE, 0, 0, dev->buf, 1, USB_CTRL_GET_TIMEOUT); if (retval != 1 || dev->buf[0] != expected) { dev_err(&iface->dev, "get config --> %d %d (1 %d)\n", retval, dev->buf[0], expected); return (retval < 0) ? retval : -EDOM; } } /* there's always [9.4.3] a device descriptor [9.6.1] */ retval = usb_get_descriptor(udev, USB_DT_DEVICE, 0, dev->buf, sizeof(udev->descriptor)); if (retval != sizeof(udev->descriptor)) { dev_err(&iface->dev, "dev descriptor --> %d\n", retval); return (retval < 0) ? retval : -EDOM; } /* * there's always [9.4.3] a bos device descriptor [9.6.2] in USB * 3.0 spec */ if (le16_to_cpu(udev->descriptor.bcdUSB) >= 0x0210) { struct usb_bos_descriptor *bos = NULL; struct usb_dev_cap_header *header = NULL; unsigned total, num, length; u8 *buf; retval = usb_get_descriptor(udev, USB_DT_BOS, 0, dev->buf, sizeof(*udev->bos->desc)); if (retval != sizeof(*udev->bos->desc)) { dev_err(&iface->dev, "bos descriptor --> %d\n", retval); return (retval < 0) ? retval : -EDOM; } bos = (struct usb_bos_descriptor *)dev->buf; total = le16_to_cpu(bos->wTotalLength); num = bos->bNumDeviceCaps; if (total > TBUF_SIZE) total = TBUF_SIZE; /* * get generic device-level capability descriptors [9.6.2] * in USB 3.0 spec */ retval = usb_get_descriptor(udev, USB_DT_BOS, 0, dev->buf, total); if (retval != total) { dev_err(&iface->dev, "bos descriptor set --> %d\n", retval); return (retval < 0) ? 
retval : -EDOM; } length = sizeof(*udev->bos->desc); buf = dev->buf; for (i = 0; i < num; i++) { buf += length; if (buf + sizeof(struct usb_dev_cap_header) > dev->buf + total) break; header = (struct usb_dev_cap_header *)buf; length = header->bLength; if (header->bDescriptorType != USB_DT_DEVICE_CAPABILITY) { dev_warn(&udev->dev, "not device capability descriptor, skip\n"); continue; } switch (header->bDevCapabilityType) { case USB_CAP_TYPE_EXT: if (buf + USB_DT_USB_EXT_CAP_SIZE > dev->buf + total || !is_good_ext(dev, buf)) { dev_err(&iface->dev, "bogus usb 2.0 extension descriptor\n"); return -EDOM; } break; case USB_SS_CAP_TYPE: if (buf + USB_DT_USB_SS_CAP_SIZE > dev->buf + total || !is_good_ss_cap(dev, buf)) { dev_err(&iface->dev, "bogus superspeed device capability descriptor\n"); return -EDOM; } break; case CONTAINER_ID_TYPE: if (buf + USB_DT_USB_SS_CONTN_ID_SIZE > dev->buf + total || !is_good_con_id(dev, buf)) { dev_err(&iface->dev, "bogus container id descriptor\n"); return -EDOM; } break; default: break; } } } /* there's always [9.4.3] at least one config descriptor [9.6.3] */ for (i = 0; i < udev->descriptor.bNumConfigurations; i++) { retval = usb_get_descriptor(udev, USB_DT_CONFIG, i, dev->buf, TBUF_SIZE); if (!is_good_config(dev, retval)) { dev_err(&iface->dev, "config [%d] descriptor --> %d\n", i, retval); return (retval < 0) ? retval : -EDOM; } /* FIXME cross-checking udev->config[i] to make sure usbcore * parsed it right (etc) would be good testing paranoia */ } /* and sometimes [9.2.6.6] speed dependent descriptors */ if (le16_to_cpu(udev->descriptor.bcdUSB) == 0x0200) { struct usb_qualifier_descriptor *d = NULL; /* device qualifier [9.6.2] */ retval = usb_get_descriptor(udev, USB_DT_DEVICE_QUALIFIER, 0, dev->buf, sizeof(struct usb_qualifier_descriptor)); if (retval == -EPIPE) { if (udev->speed == USB_SPEED_HIGH) { dev_err(&iface->dev, "hs dev qualifier --> %d\n", retval); return retval; } /* usb2.0 but not high-speed capable; fine */ } else if (retval != sizeof(struct usb_qualifier_descriptor)) { dev_err(&iface->dev, "dev qualifier --> %d\n", retval); return (retval < 0) ? retval : -EDOM; } else d = (struct usb_qualifier_descriptor *) dev->buf; /* might not have [9.6.2] any other-speed configs [9.6.4] */ if (d) { unsigned max = d->bNumConfigurations; for (i = 0; i < max; i++) { retval = usb_get_descriptor(udev, USB_DT_OTHER_SPEED_CONFIG, i, dev->buf, TBUF_SIZE); if (!is_good_config(dev, retval)) { dev_err(&iface->dev, "other speed config --> %d\n", retval); return (retval < 0) ? retval : -EDOM; } } } } /* FIXME fetch strings from at least the device descriptor */ /* [9.4.5] get_status always works */ retval = usb_get_std_status(udev, USB_RECIP_DEVICE, 0, dev->buf); if (retval) { dev_err(&iface->dev, "get dev status --> %d\n", retval); return retval; } /* FIXME configuration.bmAttributes says if we could try to set/clear * the device's remote wakeup feature ... if we can, test that here */ retval = usb_get_std_status(udev, USB_RECIP_INTERFACE, iface->altsetting[0].desc.bInterfaceNumber, dev->buf); if (retval) { dev_err(&iface->dev, "get interface status --> %d\n", retval); return retval; } /* FIXME get status for each endpoint in the interface */ return 0; } /*-------------------------------------------------------------------------*/ /* use ch9 requests to test whether: * (a) queues work for control, keeping N subtests queued and * active (auto-resubmit) for M loops through the queue. * (b) protocol stalls (control-only) will autorecover. 
* it's not like bulk/intr; no halt clearing. * (c) short control reads are reported and handled. * (d) queues are always processed in-order */ struct ctrl_ctx { spinlock_t lock; struct usbtest_dev *dev; struct completion complete; unsigned count; unsigned pending; int status; struct urb **urb; struct usbtest_param_32 *param; int last; }; #define NUM_SUBCASES 16 /* how many test subcases here? */ struct subcase { struct usb_ctrlrequest setup; int number; int expected; }; static void ctrl_complete(struct urb *urb) { struct ctrl_ctx *ctx = urb->context; struct usb_ctrlrequest *reqp; struct subcase *subcase; int status = urb->status; unsigned long flags; reqp = (struct usb_ctrlrequest *)urb->setup_packet; subcase = container_of(reqp, struct subcase, setup); spin_lock_irqsave(&ctx->lock, flags); ctx->count--; ctx->pending--; /* queue must transfer and complete in fifo order, unless * usb_unlink_urb() is used to unlink something not at the * physical queue head (not tested). */ if (subcase->number > 0) { if ((subcase->number - ctx->last) != 1) { ERROR(ctx->dev, "subcase %d completed out of order, last %d\n", subcase->number, ctx->last); status = -EDOM; ctx->last = subcase->number; goto error; } } ctx->last = subcase->number; /* succeed or fault in only one way? */ if (status == subcase->expected) status = 0; /* async unlink for cleanup? */ else if (status != -ECONNRESET) { /* some faults are allowed, not required */ if (subcase->expected > 0 && ( ((status == -subcase->expected /* happened */ || status == 0)))) /* didn't */ status = 0; /* sometimes more than one fault is allowed */ else if (subcase->number == 12 && status == -EPIPE) status = 0; else ERROR(ctx->dev, "subtest %d error, status %d\n", subcase->number, status); } /* unexpected status codes mean errors; ideally, in hardware */ if (status) { error: if (ctx->status == 0) { int i; ctx->status = status; ERROR(ctx->dev, "control queue %02x.%02x, err %d, " "%d left, subcase %d, len %d/%d\n", reqp->bRequestType, reqp->bRequest, status, ctx->count, subcase->number, urb->actual_length, urb->transfer_buffer_length); /* FIXME this "unlink everything" exit route should * be a separate test case. 
*/ /* unlink whatever's still pending */ for (i = 1; i < ctx->param->sglen; i++) { struct urb *u = ctx->urb[ (i + subcase->number) % ctx->param->sglen]; if (u == urb || !u->dev) continue; spin_unlock(&ctx->lock); status = usb_unlink_urb(u); spin_lock(&ctx->lock); switch (status) { case -EINPROGRESS: case -EBUSY: case -EIDRM: continue; default: ERROR(ctx->dev, "urb unlink --> %d\n", status); } } status = ctx->status; } } /* resubmit if we need to, else mark this as done */ if ((status == 0) && (ctx->pending < ctx->count)) { status = usb_submit_urb(urb, GFP_ATOMIC); if (status != 0) { ERROR(ctx->dev, "can't resubmit ctrl %02x.%02x, err %d\n", reqp->bRequestType, reqp->bRequest, status); urb->dev = NULL; } else ctx->pending++; } else urb->dev = NULL; /* signal completion when nothing's queued */ if (ctx->pending == 0) complete(&ctx->complete); spin_unlock_irqrestore(&ctx->lock, flags); } static int test_ctrl_queue(struct usbtest_dev *dev, struct usbtest_param_32 *param) { struct usb_device *udev = testdev_to_usbdev(dev); struct urb **urb; struct ctrl_ctx context; int i; if (param->sglen == 0 || param->iterations > UINT_MAX / param->sglen) return -EOPNOTSUPP; spin_lock_init(&context.lock); context.dev = dev; init_completion(&context.complete); context.count = param->sglen * param->iterations; context.pending = 0; context.status = -ENOMEM; context.param = param; context.last = -1; /* allocate and init the urbs we'll queue. * as with bulk/intr sglists, sglen is the queue depth; it also * controls which subtests run (more tests than sglen) or rerun. */ urb = kzalloc_objs(struct urb *, param->sglen); if (!urb) return -ENOMEM; for (i = 0; i < param->sglen; i++) { int pipe = usb_rcvctrlpipe(udev, 0); unsigned len; struct urb *u; struct usb_ctrlrequest req; struct subcase *reqp; /* sign of this variable means: * -: tested code must return this (negative) error code * +: tested code may return this (negative too) error code */ int expected = 0; /* requests here are mostly expected to succeed on any * device, but some are chosen to trigger protocol stalls * or short reads. 
*/ memset(&req, 0, sizeof(req)); req.bRequest = USB_REQ_GET_DESCRIPTOR; req.bRequestType = USB_DIR_IN|USB_RECIP_DEVICE; switch (i % NUM_SUBCASES) { case 0: /* get device descriptor */ req.wValue = cpu_to_le16(USB_DT_DEVICE << 8); len = sizeof(struct usb_device_descriptor); break; case 1: /* get first config descriptor (only) */ req.wValue = cpu_to_le16((USB_DT_CONFIG << 8) | 0); len = sizeof(struct usb_config_descriptor); break; case 2: /* get altsetting (OFTEN STALLS) */ req.bRequest = USB_REQ_GET_INTERFACE; req.bRequestType = USB_DIR_IN|USB_RECIP_INTERFACE; /* index = 0 means first interface */ len = 1; expected = EPIPE; break; case 3: /* get interface status */ req.bRequest = USB_REQ_GET_STATUS; req.bRequestType = USB_DIR_IN|USB_RECIP_INTERFACE; /* interface 0 */ len = 2; break; case 4: /* get device status */ req.bRequest = USB_REQ_GET_STATUS; req.bRequestType = USB_DIR_IN|USB_RECIP_DEVICE; len = 2; break; case 5: /* get device qualifier (MAY STALL) */ req.wValue = cpu_to_le16 (USB_DT_DEVICE_QUALIFIER << 8); len = sizeof(struct usb_qualifier_descriptor); if (udev->speed != USB_SPEED_HIGH) expected = EPIPE; break; case 6: /* get first config descriptor, plus interface */ req.wValue = cpu_to_le16((USB_DT_CONFIG << 8) | 0); len = sizeof(struct usb_config_descriptor); len += sizeof(struct usb_interface_descriptor); break; case 7: /* get interface descriptor (ALWAYS STALLS) */ req.wValue = cpu_to_le16 (USB_DT_INTERFACE << 8); /* interface == 0 */ len = sizeof(struct usb_interface_descriptor); expected = -EPIPE; break; /* NOTE: two consecutive stalls in the queue here. * that tests fault recovery a bit more aggressively. */ case 8: /* clear endpoint halt (MAY STALL) */ req.bRequest = USB_REQ_CLEAR_FEATURE; req.bRequestType = USB_RECIP_ENDPOINT; /* wValue 0 == ep halt */ /* wIndex 0 == ep0 (shouldn't halt!) */ len = 0; pipe = usb_sndctrlpipe(udev, 0); expected = EPIPE; break; case 9: /* get endpoint status */ req.bRequest = USB_REQ_GET_STATUS; req.bRequestType = USB_DIR_IN|USB_RECIP_ENDPOINT; /* endpoint 0 */ len = 2; break; case 10: /* trigger short read (EREMOTEIO) */ req.wValue = cpu_to_le16((USB_DT_CONFIG << 8) | 0); len = 1024; expected = -EREMOTEIO; break; /* NOTE: two consecutive _different_ faults in the queue. */ case 11: /* get endpoint descriptor (ALWAYS STALLS) */ req.wValue = cpu_to_le16(USB_DT_ENDPOINT << 8); /* endpoint == 0 */ len = sizeof(struct usb_interface_descriptor); expected = EPIPE; break; /* NOTE: sometimes even a third fault in the queue! 
*/ case 12: /* get string 0 descriptor (MAY STALL) */ req.wValue = cpu_to_le16(USB_DT_STRING << 8); /* string == 0, for language IDs */ len = sizeof(struct usb_interface_descriptor); /* may succeed when > 4 languages */ expected = EREMOTEIO; /* or EPIPE, if no strings */ break; case 13: /* short read, resembling case 10 */ req.wValue = cpu_to_le16((USB_DT_CONFIG << 8) | 0); /* last data packet "should" be DATA1, not DATA0 */ if (udev->speed == USB_SPEED_SUPER) len = 1024 - 512; else len = 1024 - udev->descriptor.bMaxPacketSize0; expected = -EREMOTEIO; break; case 14: /* short read; try to fill the last packet */ req.wValue = cpu_to_le16((USB_DT_DEVICE << 8) | 0); /* device descriptor size == 18 bytes */ len = udev->descriptor.bMaxPacketSize0; if (udev->speed == USB_SPEED_SUPER) len = 512; switch (len) { case 8: len = 24; break; case 16: len = 32; break; } expected = -EREMOTEIO; break; case 15: req.wValue = cpu_to_le16(USB_DT_BOS << 8); if (udev->bos) len = le16_to_cpu(udev->bos->desc->wTotalLength); else len = sizeof(struct usb_bos_descriptor); if (le16_to_cpu(udev->descriptor.bcdUSB) < 0x0201) expected = -EPIPE; break; default: ERROR(dev, "bogus number of ctrl queue testcases!\n"); context.status = -EINVAL; goto cleanup; } req.wLength = cpu_to_le16(len); urb[i] = u = simple_alloc_urb(udev, pipe, len, 0); if (!u) goto cleanup; reqp = kmalloc_obj(*reqp); if (!reqp) goto cleanup; reqp->setup = req; reqp->number = i % NUM_SUBCASES; reqp->expected = expected; u->setup_packet = (char *) &reqp->setup; u->context = &context; u->complete = ctrl_complete; } /* queue the urbs */ context.urb = urb; spin_lock_irq(&context.lock); for (i = 0; i < param->sglen; i++) { context.status = usb_submit_urb(urb[i], GFP_ATOMIC); if (context.status != 0) { ERROR(dev, "can't submit urb[%d], status %d\n", i, context.status); context.count = context.pending; break; } context.pending++; } spin_unlock_irq(&context.lock); /* FIXME set timer and time out; provide a disconnect hook */ /* wait for the last one to complete */ if (context.pending > 0) wait_for_completion(&context.complete); cleanup: for (i = 0; i < param->sglen; i++) { if (!urb[i]) continue; urb[i]->dev = udev; kfree(urb[i]->setup_packet); simple_free_urb(urb[i]); } kfree(urb); return context.status; } #undef NUM_SUBCASES /*-------------------------------------------------------------------------*/ static void unlink1_callback(struct urb *urb) { int status = urb->status; /* we "know" -EPIPE (stall) never happens */ if (!status) status = usb_submit_urb(urb, GFP_ATOMIC); if (status) { urb->status = status; complete(urb->context); } } static int unlink1(struct usbtest_dev *dev, int pipe, int size, int async) { struct urb *urb; struct completion completion; int retval = 0; init_completion(&completion); urb = simple_alloc_urb(testdev_to_usbdev(dev), pipe, size, 0); if (!urb) return -ENOMEM; urb->context = &completion; urb->complete = unlink1_callback; if (usb_pipeout(urb->pipe)) { simple_fill_buf(urb); urb->transfer_flags |= URB_ZERO_PACKET; } /* keep the endpoint busy. there are lots of hc/hcd-internal * states, and testing should get to all of them over time. * * FIXME want additional tests for when endpoint is STALLing * due to errors, or is just NAKing requests. */ retval = usb_submit_urb(urb, GFP_KERNEL); if (retval != 0) { dev_err(&dev->intf->dev, "submit fail %d\n", retval); return retval; } /* unlinking that should always work. variable delay tests more * hcd states and code paths, even with little other system load. 
*/ msleep(jiffies % (2 * INTERRUPT_RATE)); if (async) { while (!completion_done(&completion)) { retval = usb_unlink_urb(urb); if (retval == 0 && usb_pipein(urb->pipe)) retval = simple_check_buf(dev, urb); switch (retval) { case -EBUSY: case -EIDRM: /* we can't unlink urbs while they're completing * or if they've completed, and we haven't * resubmitted. "normal" drivers would prevent * resubmission, but since we're testing unlink * paths, we can't. */ ERROR(dev, "unlink retry\n"); continue; case 0: case -EINPROGRESS: break; default: dev_err(&dev->intf->dev, "unlink fail %d\n", retval); return retval; } break; } } else usb_kill_urb(urb); wait_for_completion(&completion); retval = urb->status; simple_free_urb(urb); if (async) return (retval == -ECONNRESET) ? 0 : retval - 1000; else return (retval == -ENOENT || retval == -EPERM) ? 0 : retval - 2000; } static int unlink_simple(struct usbtest_dev *dev, int pipe, int len) { int retval = 0; /* test sync and async paths */ retval = unlink1(dev, pipe, len, 1); if (!retval) retval = unlink1(dev, pipe, len, 0); return retval; } /*-------------------------------------------------------------------------*/ struct queued_ctx { struct completion complete; atomic_t pending; unsigned num; int status; struct urb **urbs; }; static void unlink_queued_callback(struct urb *urb) { int status = urb->status; struct queued_ctx *ctx = urb->context; if (ctx->status) goto done; if (urb == ctx->urbs[ctx->num - 4] || urb == ctx->urbs[ctx->num - 2]) { if (status == -ECONNRESET) goto done; /* What error should we report if the URB completed normally? */ } if (status != 0) ctx->status = status; done: if (atomic_dec_and_test(&ctx->pending)) complete(&ctx->complete); } static int unlink_queued(struct usbtest_dev *dev, int pipe, unsigned num, unsigned size) { struct queued_ctx ctx; struct usb_device *udev = testdev_to_usbdev(dev); void *buf; dma_addr_t buf_dma; int i; int retval = -ENOMEM; init_completion(&ctx.complete); atomic_set(&ctx.pending, 1); /* One more than the actual value */ ctx.num = num; ctx.status = 0; buf = usb_alloc_coherent(udev, size, GFP_KERNEL, &buf_dma); if (!buf) return retval; memset(buf, 0, size); /* Allocate and init the urbs we'll queue */ ctx.urbs = kzalloc_objs(struct urb *, num); if (!ctx.urbs) goto free_buf; for (i = 0; i < num; i++) { ctx.urbs[i] = usb_alloc_urb(0, GFP_KERNEL); if (!ctx.urbs[i]) goto free_urbs; usb_fill_bulk_urb(ctx.urbs[i], udev, pipe, buf, size, unlink_queued_callback, &ctx); ctx.urbs[i]->transfer_dma = buf_dma; ctx.urbs[i]->transfer_flags = URB_NO_TRANSFER_DMA_MAP; if (usb_pipeout(ctx.urbs[i]->pipe)) { simple_fill_buf(ctx.urbs[i]); ctx.urbs[i]->transfer_flags |= URB_ZERO_PACKET; } } /* Submit all the URBs and then unlink URBs num - 4 and num - 2. 
*/ for (i = 0; i < num; i++) { atomic_inc(&ctx.pending); retval = usb_submit_urb(ctx.urbs[i], GFP_KERNEL); if (retval != 0) { dev_err(&dev->intf->dev, "submit urbs[%d] fail %d\n", i, retval); atomic_dec(&ctx.pending); ctx.status = retval; break; } } if (i == num) { usb_unlink_urb(ctx.urbs[num - 4]); usb_unlink_urb(ctx.urbs[num - 2]); } else { while (--i >= 0) usb_unlink_urb(ctx.urbs[i]); } if (atomic_dec_and_test(&ctx.pending)) /* The extra count */ complete(&ctx.complete); wait_for_completion(&ctx.complete); retval = ctx.status; free_urbs: for (i = 0; i < num; i++) usb_free_urb(ctx.urbs[i]); kfree(ctx.urbs); free_buf: usb_free_coherent(udev, size, buf, buf_dma); return retval; } /*-------------------------------------------------------------------------*/ static int verify_not_halted(struct usbtest_dev *tdev, int ep, struct urb *urb) { int retval; u16 status; /* shouldn't look or act halted */ retval = usb_get_std_status(urb->dev, USB_RECIP_ENDPOINT, ep, &status); if (retval < 0) { ERROR(tdev, "ep %02x couldn't get no-halt status, %d\n", ep, retval); return retval; } if (status != 0) { ERROR(tdev, "ep %02x bogus status: %04x != 0\n", ep, status); return -EINVAL; } retval = simple_io(tdev, urb, 1, 0, 0, __func__); if (retval != 0) return -EINVAL; return 0; } static int verify_halted(struct usbtest_dev *tdev, int ep, struct urb *urb) { int retval; u16 status; /* should look and act halted */ retval = usb_get_std_status(urb->dev, USB_RECIP_ENDPOINT, ep, &status); if (retval < 0) { ERROR(tdev, "ep %02x couldn't get halt status, %d\n", ep, retval); return retval; } if (status != 1) { ERROR(tdev, "ep %02x bogus status: %04x != 1\n", ep, status); return -EINVAL; } retval = simple_io(tdev, urb, 1, 0, -EPIPE, __func__); if (retval != -EPIPE) return -EINVAL; retval = simple_io(tdev, urb, 1, 0, -EPIPE, "verify_still_halted"); if (retval != -EPIPE) return -EINVAL; return 0; } static int test_halt(struct usbtest_dev *tdev, int ep, struct urb *urb) { int retval; /* shouldn't look or act halted now */ retval = verify_not_halted(tdev, ep, urb); if (retval < 0) return retval; /* set halt (protocol test only), verify it worked */ retval = usb_control_msg(urb->dev, usb_sndctrlpipe(urb->dev, 0), USB_REQ_SET_FEATURE, USB_RECIP_ENDPOINT, USB_ENDPOINT_HALT, ep, NULL, 0, USB_CTRL_SET_TIMEOUT); if (retval < 0) { ERROR(tdev, "ep %02x couldn't set halt, %d\n", ep, retval); return retval; } retval = verify_halted(tdev, ep, urb); if (retval < 0) { int ret; /* clear halt anyways, else further tests will fail */ ret = usb_clear_halt(urb->dev, urb->pipe); if (ret) ERROR(tdev, "ep %02x couldn't clear halt, %d\n", ep, ret); return retval; } /* clear halt (tests API + protocol), verify it worked */ retval = usb_clear_halt(urb->dev, urb->pipe); if (retval < 0) { ERROR(tdev, "ep %02x couldn't clear halt, %d\n", ep, retval); return retval; } retval = verify_not_halted(tdev, ep, urb); if (retval < 0) return retval; /* NOTE: could also verify SET_INTERFACE clear halts ... 
*/ return 0; } static int test_toggle_sync(struct usbtest_dev *tdev, int ep, struct urb *urb) { int retval; /* clear initial data toggle to DATA0 */ retval = usb_clear_halt(urb->dev, urb->pipe); if (retval < 0) { ERROR(tdev, "ep %02x couldn't clear halt, %d\n", ep, retval); return retval; } /* transfer 3 data packets, should be DATA0, DATA1, DATA0 */ retval = simple_io(tdev, urb, 1, 0, 0, __func__); if (retval != 0) return -EINVAL; /* clear halt resets device side data toggle, host should react to it */ retval = usb_clear_halt(urb->dev, urb->pipe); if (retval < 0) { ERROR(tdev, "ep %02x couldn't clear halt, %d\n", ep, retval); return retval; } /* host should use DATA0 again after clear halt */ retval = simple_io(tdev, urb, 1, 0, 0, __func__); return retval; } static int halt_simple(struct usbtest_dev *dev) { int ep; int retval = 0; struct urb *urb; struct usb_device *udev = testdev_to_usbdev(dev); if (udev->speed == USB_SPEED_SUPER) urb = simple_alloc_urb(udev, 0, 1024, 0); else urb = simple_alloc_urb(udev, 0, 512, 0); if (urb == NULL) return -ENOMEM; if (dev->in_pipe) { ep = usb_pipeendpoint(dev->in_pipe) | USB_DIR_IN; urb->pipe = dev->in_pipe; retval = test_halt(dev, ep, urb); if (retval < 0) goto done; } if (dev->out_pipe) { ep = usb_pipeendpoint(dev->out_pipe); urb->pipe = dev->out_pipe; retval = test_halt(dev, ep, urb); } done: simple_free_urb(urb); return retval; } static int toggle_sync_simple(struct usbtest_dev *dev) { int ep; int retval = 0; struct urb *urb; struct usb_device *udev = testdev_to_usbdev(dev); unsigned maxp = get_maxpacket(udev, dev->out_pipe); /* * Create a URB that causes a transfer of uneven amount of data packets * This way the clear toggle has an impact on the data toggle sequence. * Use 2 maxpacket length packets and one zero packet. */ urb = simple_alloc_urb(udev, 0, 2 * maxp, 0); if (urb == NULL) return -ENOMEM; urb->transfer_flags |= URB_ZERO_PACKET; ep = usb_pipeendpoint(dev->out_pipe); urb->pipe = dev->out_pipe; retval = test_toggle_sync(dev, ep, urb); simple_free_urb(urb); return retval; } /*-------------------------------------------------------------------------*/ /* Control OUT tests use the vendor control requests from Intel's * USB 2.0 compliance test device: write a buffer, read it back. * * Intel's spec only _requires_ that it work for one packet, which * is pretty weak. Some HCDs place limits here; most devices will * need to be able to handle more than one OUT data packet. We'll * try whatever we're told to try. */ static int ctrl_out(struct usbtest_dev *dev, unsigned count, unsigned length, unsigned vary, unsigned offset) { unsigned i, j, len; int retval; u8 *buf; char *what = "?"; struct usb_device *udev; if (length < 1 || length > 0xffff || vary >= length) return -EINVAL; buf = kmalloc(length + offset, GFP_KERNEL); if (!buf) return -ENOMEM; buf += offset; udev = testdev_to_usbdev(dev); len = length; retval = 0; /* NOTE: hardware might well act differently if we pushed it * with lots back-to-back queued requests. */ for (i = 0; i < count; i++) { /* write patterned data */ for (j = 0; j < len; j++) buf[j] = (u8)(i + j); retval = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), 0x5b, USB_DIR_OUT|USB_TYPE_VENDOR, 0, 0, buf, len, USB_CTRL_SET_TIMEOUT); if (retval != len) { what = "write"; if (retval >= 0) { ERROR(dev, "ctrl_out, wlen %d (expected %d)\n", retval, len); retval = -EBADMSG; } break; } /* read it back -- assuming nothing intervened!! 
*/ retval = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), 0x5c, USB_DIR_IN|USB_TYPE_VENDOR, 0, 0, buf, len, USB_CTRL_GET_TIMEOUT); if (retval != len) { what = "read"; if (retval >= 0) { ERROR(dev, "ctrl_out, rlen %d (expected %d)\n", retval, len); retval = -EBADMSG; } break; } /* fail if we can't verify */ for (j = 0; j < len; j++) { if (buf[j] != (u8)(i + j)) { ERROR(dev, "ctrl_out, byte %d is %d not %d\n", j, buf[j], (u8)(i + j)); retval = -EBADMSG; break; } } if (retval < 0) { what = "verify"; break; } len += vary; /* [real world] the "zero bytes IN" case isn't really used. * hardware can easily trip up in this weird case, since its * status stage is IN, not OUT like other ep0in transfers. */ if (len > length) len = realworld ? 1 : 0; } if (retval < 0) ERROR(dev, "ctrl_out %s failed, code %d, count %d\n", what, retval, i); kfree(buf - offset); return retval; } /*-------------------------------------------------------------------------*/ /* ISO/BULK tests ... mimics common usage * - buffer length is split into N packets (mostly maxpacket sized) * - multi-buffers according to sglen */ struct transfer_context { unsigned count; unsigned pending; spinlock_t lock; struct completion done; int submit_error; unsigned long errors; unsigned long packet_count; struct usbtest_dev *dev; bool is_iso; }; static void complicated_callback(struct urb *urb) { struct transfer_context *ctx = urb->context; unsigned long flags; spin_lock_irqsave(&ctx->lock, flags); ctx->count--; ctx->packet_count += urb->number_of_packets; if (urb->error_count > 0) ctx->errors += urb->error_count; else if (urb->status != 0) ctx->errors += (ctx->is_iso ? urb->number_of_packets : 1); else if (urb->actual_length != urb->transfer_buffer_length) ctx->errors++; else if (check_guard_bytes(ctx->dev, urb) != 0) ctx->errors++; if (urb->status == 0 && ctx->count > (ctx->pending - 1) && !ctx->submit_error) { int status = usb_submit_urb(urb, GFP_ATOMIC); switch (status) { case 0: goto done; default: dev_err(&ctx->dev->intf->dev, "resubmit err %d\n", status); fallthrough; case -ENODEV: /* disconnected */ case -ESHUTDOWN: /* endpoint disabled */ ctx->submit_error = 1; break; } } ctx->pending--; if (ctx->pending == 0) { if (ctx->errors) dev_err(&ctx->dev->intf->dev, "during the test, %lu errors out of %lu\n", ctx->errors, ctx->packet_count); complete(&ctx->done); } done: spin_unlock_irqrestore(&ctx->lock, flags); } static struct urb *iso_alloc_urb( struct usb_device *udev, int pipe, struct usb_endpoint_descriptor *desc, long bytes, unsigned offset ) { struct urb *urb; unsigned i, maxp, packets; if (bytes < 0 || !desc) return NULL; maxp = usb_endpoint_maxp(desc); if (udev->speed >= USB_SPEED_SUPER) maxp *= ss_isoc_get_packet_num(udev, pipe); else maxp *= usb_endpoint_maxp_mult(desc); packets = DIV_ROUND_UP(bytes, maxp); urb = usb_alloc_urb(packets, GFP_KERNEL); if (!urb) return urb; urb->dev = udev; urb->pipe = pipe; urb->number_of_packets = packets; urb->transfer_buffer_length = bytes; urb->transfer_buffer = usb_alloc_coherent(udev, bytes + offset, GFP_KERNEL, &urb->transfer_dma); if (!urb->transfer_buffer) { usb_free_urb(urb); return NULL; } if (offset) { memset(urb->transfer_buffer, GUARD_BYTE, offset); urb->transfer_buffer += offset; urb->transfer_dma += offset; } /* For inbound transfers use guard byte so that test fails if data not correctly copied */ memset(urb->transfer_buffer, usb_pipein(urb->pipe) ? 
GUARD_BYTE : 0, bytes); for (i = 0; i < packets; i++) { /* here, only the last packet will be short */ urb->iso_frame_desc[i].length = min_t(unsigned int, bytes, maxp); bytes -= urb->iso_frame_desc[i].length; urb->iso_frame_desc[i].offset = maxp * i; } urb->complete = complicated_callback; /* urb->context = SET BY CALLER */ urb->interval = 1 << (desc->bInterval - 1); urb->transfer_flags = URB_ISO_ASAP | URB_NO_TRANSFER_DMA_MAP; return urb; } static int test_queue(struct usbtest_dev *dev, struct usbtest_param_32 *param, int pipe, struct usb_endpoint_descriptor *desc, unsigned offset) { struct transfer_context context; struct usb_device *udev; unsigned i; unsigned long packets = 0; int status = 0; struct urb **urbs; if (!param->sglen || param->iterations > UINT_MAX / param->sglen) return -EINVAL; if (param->sglen > MAX_SGLEN) return -EINVAL; urbs = kzalloc_objs(*urbs, param->sglen); if (!urbs) return -ENOMEM; memset(&context, 0, sizeof(context)); context.count = param->iterations * param->sglen; context.dev = dev; context.is_iso = !!desc; init_completion(&context.done); spin_lock_init(&context.lock); udev = testdev_to_usbdev(dev); for (i = 0; i < param->sglen; i++) { if (context.is_iso) urbs[i] = iso_alloc_urb(udev, pipe, desc, param->length, offset); else urbs[i] = complicated_alloc_urb(udev, pipe, param->length, 0); if (!urbs[i]) { status = -ENOMEM; goto fail; } packets += urbs[i]->number_of_packets; urbs[i]->context = &context; } packets *= param->iterations; if (context.is_iso) { int transaction_num; if (udev->speed >= USB_SPEED_SUPER) transaction_num = ss_isoc_get_packet_num(udev, pipe); else transaction_num = usb_endpoint_maxp_mult(desc); dev_info(&dev->intf->dev, "iso period %d %sframes, wMaxPacket %d, transactions: %d\n", 1 << (desc->bInterval - 1), (udev->speed >= USB_SPEED_HIGH) ? "micro" : "", usb_endpoint_maxp(desc), transaction_num); dev_info(&dev->intf->dev, "total %lu msec (%lu packets)\n", (packets * (1 << (desc->bInterval - 1))) / ((udev->speed >= USB_SPEED_HIGH) ? 8 : 1), packets); } spin_lock_irq(&context.lock); for (i = 0; i < param->sglen; i++) { ++context.pending; status = usb_submit_urb(urbs[i], GFP_ATOMIC); if (status < 0) { ERROR(dev, "submit iso[%d], error %d\n", i, status); if (i == 0) { spin_unlock_irq(&context.lock); goto fail; } simple_free_urb(urbs[i]); urbs[i] = NULL; context.pending--; context.submit_error = 1; break; } } spin_unlock_irq(&context.lock); wait_for_completion(&context.done); for (i = 0; i < param->sglen; i++) { if (urbs[i]) simple_free_urb(urbs[i]); } /* * Isochronous transfers are expected to fail sometimes. As an * arbitrary limit, we will report an error if any submissions * fail or if the transfer failure rate is > 10%. */ if (status != 0) ; else if (context.submit_error) status = -EACCES; else if (context.errors > (context.is_iso ? context.packet_count / 10 : 0)) status = -EIO; kfree(urbs); return status; fail: for (i = 0; i < param->sglen; i++) { if (urbs[i]) simple_free_urb(urbs[i]); } kfree(urbs); return status; } static int test_unaligned_bulk( struct usbtest_dev *tdev, int pipe, unsigned length, int iterations, unsigned transfer_flags, const char *label) { int retval; struct urb *urb = usbtest_alloc_urb(testdev_to_usbdev(tdev), pipe, length, transfer_flags, 1, 0, simple_callback); if (!urb) return -ENOMEM; retval = simple_io(tdev, urb, iterations, 0, 0, label); simple_free_urb(urb); return retval; } /* Run tests. 
*/ static int usbtest_do_ioctl(struct usb_interface *intf, struct usbtest_param_32 *param) { struct usbtest_dev *dev = usb_get_intfdata(intf); struct usb_device *udev = testdev_to_usbdev(dev); struct urb *urb; struct scatterlist *sg; struct usb_sg_request req; unsigned i; int retval = -EOPNOTSUPP; if (param->iterations <= 0) return -EINVAL; if (param->sglen > MAX_SGLEN) return -EINVAL; /* * Just a bunch of test cases that every HCD is expected to handle. * * Some may need specific firmware, though it'd be good to have * one firmware image to handle all the test cases. * * FIXME add more tests! cancel requests, verify the data, control * queueing, concurrent read+write threads, and so on. */ switch (param->test_num) { case 0: dev_info(&intf->dev, "TEST 0: NOP\n"); retval = 0; break; /* Simple non-queued bulk I/O tests */ case 1: if (dev->out_pipe == 0) break; dev_info(&intf->dev, "TEST 1: write %d bytes %u times\n", param->length, param->iterations); urb = simple_alloc_urb(udev, dev->out_pipe, param->length, 0); if (!urb) { retval = -ENOMEM; break; } /* FIRMWARE: bulk sink (maybe accepts short writes) */ retval = simple_io(dev, urb, param->iterations, 0, 0, "test1"); simple_free_urb(urb); break; case 2: if (dev->in_pipe == 0) break; dev_info(&intf->dev, "TEST 2: read %d bytes %u times\n", param->length, param->iterations); urb = simple_alloc_urb(udev, dev->in_pipe, param->length, 0); if (!urb) { retval = -ENOMEM; break; } /* FIRMWARE: bulk source (maybe generates short writes) */ retval = simple_io(dev, urb, param->iterations, 0, 0, "test2"); simple_free_urb(urb); break; case 3: if (dev->out_pipe == 0 || param->vary == 0) break; dev_info(&intf->dev, "TEST 3: write/%d 0..%d bytes %u times\n", param->vary, param->length, param->iterations); urb = simple_alloc_urb(udev, dev->out_pipe, param->length, 0); if (!urb) { retval = -ENOMEM; break; } /* FIRMWARE: bulk sink (maybe accepts short writes) */ retval = simple_io(dev, urb, param->iterations, param->vary, 0, "test3"); simple_free_urb(urb); break; case 4: if (dev->in_pipe == 0 || param->vary == 0) break; dev_info(&intf->dev, "TEST 4: read/%d 0..%d bytes %u times\n", param->vary, param->length, param->iterations); urb = simple_alloc_urb(udev, dev->in_pipe, param->length, 0); if (!urb) { retval = -ENOMEM; break; } /* FIRMWARE: bulk source (maybe generates short writes) */ retval = simple_io(dev, urb, param->iterations, param->vary, 0, "test4"); simple_free_urb(urb); break; /* Queued bulk I/O tests */ case 5: if (dev->out_pipe == 0 || param->sglen == 0) break; dev_info(&intf->dev, "TEST 5: write %d sglists %d entries of %d bytes\n", param->iterations, param->sglen, param->length); sg = alloc_sglist(param->sglen, param->length, 0, dev, dev->out_pipe); if (!sg) { retval = -ENOMEM; break; } /* FIRMWARE: bulk sink (maybe accepts short writes) */ retval = perform_sglist(dev, param->iterations, dev->out_pipe, &req, sg, param->sglen); free_sglist(sg, param->sglen); break; case 6: if (dev->in_pipe == 0 || param->sglen == 0) break; dev_info(&intf->dev, "TEST 6: read %d sglists %d entries of %d bytes\n", param->iterations, param->sglen, param->length); sg = alloc_sglist(param->sglen, param->length, 0, dev, dev->in_pipe); if (!sg) { retval = -ENOMEM; break; } /* FIRMWARE: bulk source (maybe generates short writes) */ retval = perform_sglist(dev, param->iterations, dev->in_pipe, &req, sg, param->sglen); free_sglist(sg, param->sglen); break; case 7: if (dev->out_pipe == 0 || param->sglen == 0 || param->vary == 0) break; dev_info(&intf->dev, "TEST 7: write/%d 
%d sglists %d entries 0..%d bytes\n", param->vary, param->iterations, param->sglen, param->length); sg = alloc_sglist(param->sglen, param->length, param->vary, dev, dev->out_pipe); if (!sg) { retval = -ENOMEM; break; } /* FIRMWARE: bulk sink (maybe accepts short writes) */ retval = perform_sglist(dev, param->iterations, dev->out_pipe, &req, sg, param->sglen); free_sglist(sg, param->sglen); break; case 8: if (dev->in_pipe == 0 || param->sglen == 0 || param->vary == 0) break; dev_info(&intf->dev, "TEST 8: read/%d %d sglists %d entries 0..%d bytes\n", param->vary, param->iterations, param->sglen, param->length); sg = alloc_sglist(param->sglen, param->length, param->vary, dev, dev->in_pipe); if (!sg) { retval = -ENOMEM; break; } /* FIRMWARE: bulk source (maybe generates short writes) */ retval = perform_sglist(dev, param->iterations, dev->in_pipe, &req, sg, param->sglen); free_sglist(sg, param->sglen); break; /* non-queued sanity tests for control (chapter 9 subset) */ case 9: retval = 0; dev_info(&intf->dev, "TEST 9: ch9 (subset) control tests, %d times\n", param->iterations); for (i = param->iterations; retval == 0 && i--; /* NOP */) retval = ch9_postconfig(dev); if (retval) dev_err(&intf->dev, "ch9 subset failed, " "iterations left %d\n", i); break; /* queued control messaging */ case 10: retval = 0; dev_info(&intf->dev, "TEST 10: queue %d control calls, %d times\n", param->sglen, param->iterations); retval = test_ctrl_queue(dev, param); break; /* simple non-queued unlinks (ring with one urb) */ case 11: if (dev->in_pipe == 0 || !param->length) break; retval = 0; dev_info(&intf->dev, "TEST 11: unlink %d reads of %d\n", param->iterations, param->length); for (i = param->iterations; retval == 0 && i--; /* NOP */) retval = unlink_simple(dev, dev->in_pipe, param->length); if (retval) dev_err(&intf->dev, "unlink reads failed %d, " "iterations left %d\n", retval, i); break; case 12: if (dev->out_pipe == 0 || !param->length) break; retval = 0; dev_info(&intf->dev, "TEST 12: unlink %d writes of %d\n", param->iterations, param->length); for (i = param->iterations; retval == 0 && i--; /* NOP */) retval = unlink_simple(dev, dev->out_pipe, param->length); if (retval) dev_err(&intf->dev, "unlink writes failed %d, " "iterations left %d\n", retval, i); break; /* ep halt tests */ case 13: if (dev->out_pipe == 0 && dev->in_pipe == 0) break; retval = 0; dev_info(&intf->dev, "TEST 13: set/clear %d halts\n", param->iterations); for (i = param->iterations; retval == 0 && i--; /* NOP */) retval = halt_simple(dev); if (retval) ERROR(dev, "halts failed, iterations left %d\n", i); break; /* control write tests */ case 14: if (!dev->info->ctrl_out) break; dev_info(&intf->dev, "TEST 14: %d ep0out, %d..%d vary %d\n", param->iterations, realworld ? 
1 : 0, param->length, param->vary); retval = ctrl_out(dev, param->iterations, param->length, param->vary, 0); break; /* iso write tests */ case 15: if (dev->out_iso_pipe == 0 || param->sglen == 0) break; dev_info(&intf->dev, "TEST 15: write %d iso, %d entries of %d bytes\n", param->iterations, param->sglen, param->length); /* FIRMWARE: iso sink */ retval = test_queue(dev, param, dev->out_iso_pipe, dev->iso_out, 0); break; /* iso read tests */ case 16: if (dev->in_iso_pipe == 0 || param->sglen == 0) break; dev_info(&intf->dev, "TEST 16: read %d iso, %d entries of %d bytes\n", param->iterations, param->sglen, param->length); /* FIRMWARE: iso source */ retval = test_queue(dev, param, dev->in_iso_pipe, dev->iso_in, 0); break; /* FIXME scatterlist cancel (needs helper thread) */ /* Tests for bulk I/O using DMA mapping by core and odd address */ case 17: if (dev->out_pipe == 0) break; dev_info(&intf->dev, "TEST 17: write odd addr %d bytes %u times core map\n", param->length, param->iterations); retval = test_unaligned_bulk( dev, dev->out_pipe, param->length, param->iterations, 0, "test17"); break; case 18: if (dev->in_pipe == 0) break; dev_info(&intf->dev, "TEST 18: read odd addr %d bytes %u times core map\n", param->length, param->iterations); retval = test_unaligned_bulk( dev, dev->in_pipe, param->length, param->iterations, 0, "test18"); break; /* Tests for bulk I/O using premapped coherent buffer and odd address */ case 19: if (dev->out_pipe == 0) break; dev_info(&intf->dev, "TEST 19: write odd addr %d bytes %u times premapped\n", param->length, param->iterations); retval = test_unaligned_bulk( dev, dev->out_pipe, param->length, param->iterations, URB_NO_TRANSFER_DMA_MAP, "test19"); break; case 20: if (dev->in_pipe == 0) break; dev_info(&intf->dev, "TEST 20: read odd addr %d bytes %u times premapped\n", param->length, param->iterations); retval = test_unaligned_bulk( dev, dev->in_pipe, param->length, param->iterations, URB_NO_TRANSFER_DMA_MAP, "test20"); break; /* control write tests with unaligned buffer */ case 21: if (!dev->info->ctrl_out) break; dev_info(&intf->dev, "TEST 21: %d ep0out odd addr, %d..%d vary %d\n", param->iterations, realworld ? 
1 : 0, param->length, param->vary); retval = ctrl_out(dev, param->iterations, param->length, param->vary, 1); break; /* unaligned iso tests */ case 22: if (dev->out_iso_pipe == 0 || param->sglen == 0) break; dev_info(&intf->dev, "TEST 22: write %d iso odd, %d entries of %d bytes\n", param->iterations, param->sglen, param->length); retval = test_queue(dev, param, dev->out_iso_pipe, dev->iso_out, 1); break; case 23: if (dev->in_iso_pipe == 0 || param->sglen == 0) break; dev_info(&intf->dev, "TEST 23: read %d iso odd, %d entries of %d bytes\n", param->iterations, param->sglen, param->length); retval = test_queue(dev, param, dev->in_iso_pipe, dev->iso_in, 1); break; /* unlink URBs from a bulk-OUT queue */ case 24: if (dev->out_pipe == 0 || !param->length || param->sglen < 4) break; retval = 0; dev_info(&intf->dev, "TEST 24: unlink from %d queues of " "%d %d-byte writes\n", param->iterations, param->sglen, param->length); for (i = param->iterations; retval == 0 && i > 0; --i) { retval = unlink_queued(dev, dev->out_pipe, param->sglen, param->length); if (retval) { dev_err(&intf->dev, "unlink queued writes failed %d, " "iterations left %d\n", retval, i); break; } } break; /* Simple non-queued interrupt I/O tests */ case 25: if (dev->out_int_pipe == 0) break; dev_info(&intf->dev, "TEST 25: write %d bytes %u times\n", param->length, param->iterations); urb = simple_alloc_urb(udev, dev->out_int_pipe, param->length, dev->int_out->bInterval); if (!urb) { retval = -ENOMEM; break; } /* FIRMWARE: interrupt sink (maybe accepts short writes) */ retval = simple_io(dev, urb, param->iterations, 0, 0, "test25"); simple_free_urb(urb); break; case 26: if (dev->in_int_pipe == 0) break; dev_info(&intf->dev, "TEST 26: read %d bytes %u times\n", param->length, param->iterations); urb = simple_alloc_urb(udev, dev->in_int_pipe, param->length, dev->int_in->bInterval); if (!urb) { retval = -ENOMEM; break; } /* FIRMWARE: interrupt source (maybe generates short writes) */ retval = simple_io(dev, urb, param->iterations, 0, 0, "test26"); simple_free_urb(urb); break; case 27: /* We do performance test, so ignore data compare */ if (dev->out_pipe == 0 || param->sglen == 0 || pattern != 0) break; dev_info(&intf->dev, "TEST 27: bulk write %dMbytes\n", (param->iterations * param->sglen * param->length) / (1024 * 1024)); retval = test_queue(dev, param, dev->out_pipe, NULL, 0); break; case 28: if (dev->in_pipe == 0 || param->sglen == 0 || pattern != 0) break; dev_info(&intf->dev, "TEST 28: bulk read %dMbytes\n", (param->iterations * param->sglen * param->length) / (1024 * 1024)); retval = test_queue(dev, param, dev->in_pipe, NULL, 0); break; /* Test data Toggle/seq_nr clear between bulk out transfers */ case 29: if (dev->out_pipe == 0) break; retval = 0; dev_info(&intf->dev, "TEST 29: Clear toggle between bulk writes %d times\n", param->iterations); for (i = param->iterations; retval == 0 && i > 0; --i) retval = toggle_sync_simple(dev); if (retval) ERROR(dev, "toggle sync failed, iterations left %d\n", i); break; } return retval; } /*-------------------------------------------------------------------------*/ /* We only have this one interface to user space, through usbfs. * User mode code can scan usbfs to find N different devices (maybe on * different busses) to use when testing, and allocate one thread per * test. So discovery is simplified, and we have no device naming issues. * * Don't use these only as stress/load tests. 
Use them along with * other USB bus activity: plugging, unplugging, mousing, mp3 playback, * video capture, and so on. Run different tests at different times, in * different sequences. Nothing here should interact with other devices, * except indirectly by consuming USB bandwidth and CPU resources for test * threads and request completion. But the only way to know that for sure * is to test when HC queues are in use by many devices. * * WARNING: Because usbfs grabs udev->dev.sem before calling this ioctl(), * it locks out usbcore in certain code paths. Notably, if you disconnect * the device-under-test, hub_wq will wait block forever waiting for the * ioctl to complete ... so that usb_disconnect() can abort the pending * urbs and then call usbtest_disconnect(). To abort a test, you're best * off just killing the userspace task and waiting for it to exit. */ static int usbtest_ioctl(struct usb_interface *intf, unsigned int code, void *buf) { struct usbtest_dev *dev = usb_get_intfdata(intf); struct usbtest_param_64 *param_64 = buf; struct usbtest_param_32 temp; struct usbtest_param_32 *param_32 = buf; struct timespec64 start; struct timespec64 end; struct timespec64 duration; int retval = -EOPNOTSUPP; /* FIXME USBDEVFS_CONNECTINFO doesn't say how fast the device is. */ pattern = mod_pattern; if (mutex_lock_interruptible(&dev->lock)) return -ERESTARTSYS; /* FIXME: What if a system sleep starts while a test is running? */ /* some devices, like ez-usb default devices, need a non-default * altsetting to have any active endpoints. some tests change * altsettings; force a default so most tests don't need to check. */ if (dev->info->alt >= 0) { if (intf->altsetting->desc.bInterfaceNumber) { retval = -ENODEV; goto free_mutex; } retval = set_altsetting(dev, dev->info->alt); if (retval) { dev_err(&intf->dev, "set altsetting to %d failed, %d\n", dev->info->alt, retval); goto free_mutex; } } switch (code) { case USBTEST_REQUEST_64: temp.test_num = param_64->test_num; temp.iterations = param_64->iterations; temp.length = param_64->length; temp.sglen = param_64->sglen; temp.vary = param_64->vary; param_32 = &temp; break; case USBTEST_REQUEST_32: break; default: retval = -EOPNOTSUPP; goto free_mutex; } ktime_get_ts64(&start); retval = usbtest_do_ioctl(intf, param_32); if (retval < 0) goto free_mutex; ktime_get_ts64(&end); duration = timespec64_sub(end, start); temp.duration_sec = duration.tv_sec; temp.duration_usec = duration.tv_nsec/NSEC_PER_USEC; switch (code) { case USBTEST_REQUEST_32: param_32->duration_sec = temp.duration_sec; param_32->duration_usec = temp.duration_usec; break; case USBTEST_REQUEST_64: param_64->duration_sec = temp.duration_sec; param_64->duration_usec = temp.duration_usec; break; } free_mutex: mutex_unlock(&dev->lock); return retval; } /*-------------------------------------------------------------------------*/ static unsigned force_interrupt; module_param(force_interrupt, uint, 0); MODULE_PARM_DESC(force_interrupt, "0 = test default; else interrupt"); #ifdef GENERIC static unsigned short vendor; module_param(vendor, ushort, 0); MODULE_PARM_DESC(vendor, "vendor code (from usb-if)"); static unsigned short product; module_param(product, ushort, 0); MODULE_PARM_DESC(product, "product code (from vendor)"); #endif static int usbtest_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *udev; struct usbtest_dev *dev; struct usbtest_info *info; char *rtest, *wtest; char *irtest, *iwtest; char *intrtest, *intwtest; udev = interface_to_usbdev(intf); #ifdef 
GENERIC /* specify devices by module parameters? */ if (id->match_flags == 0) { /* vendor match required, product match optional */ if (!vendor || le16_to_cpu(udev->descriptor.idVendor) != (u16)vendor) return -ENODEV; if (product && le16_to_cpu(udev->descriptor.idProduct) != (u16)product) return -ENODEV; dev_info(&intf->dev, "matched module params, " "vend=0x%04x prod=0x%04x\n", le16_to_cpu(udev->descriptor.idVendor), le16_to_cpu(udev->descriptor.idProduct)); } #endif dev = kzalloc_obj(*dev); if (!dev) return -ENOMEM; info = (struct usbtest_info *) id->driver_info; dev->info = info; mutex_init(&dev->lock); dev->intf = intf; /* cacheline-aligned scratch for i/o */ dev->buf = kmalloc(TBUF_SIZE, GFP_KERNEL); if (dev->buf == NULL) { kfree(dev); return -ENOMEM; } /* NOTE this doesn't yet test the handful of difference that are * visible with high speed interrupts: bigger maxpacket (1K) and * "high bandwidth" modes (up to 3 packets/uframe). */ rtest = wtest = ""; irtest = iwtest = ""; intrtest = intwtest = ""; if (force_interrupt || udev->speed == USB_SPEED_LOW) { if (info->ep_in) { dev->in_pipe = usb_rcvintpipe(udev, info->ep_in); rtest = " intr-in"; } if (info->ep_out) { dev->out_pipe = usb_sndintpipe(udev, info->ep_out); wtest = " intr-out"; } } else { if (override_alt >= 0 || info->autoconf) { int status; status = get_endpoints(dev, intf); if (status < 0) { WARNING(dev, "couldn't get endpoints, %d\n", status); kfree(dev->buf); kfree(dev); return status; } /* may find bulk or ISO pipes */ } else { if (info->ep_in) dev->in_pipe = usb_rcvbulkpipe(udev, info->ep_in); if (info->ep_out) dev->out_pipe = usb_sndbulkpipe(udev, info->ep_out); } if (dev->in_pipe) rtest = " bulk-in"; if (dev->out_pipe) wtest = " bulk-out"; if (dev->in_iso_pipe) irtest = " iso-in"; if (dev->out_iso_pipe) iwtest = " iso-out"; if (dev->in_int_pipe) intrtest = " int-in"; if (dev->out_int_pipe) intwtest = " int-out"; } usb_set_intfdata(intf, dev); dev_info(&intf->dev, "%s\n", info->name); dev_info(&intf->dev, "%s {control%s%s%s%s%s%s%s} tests%s\n", usb_speed_string(udev->speed), info->ctrl_out ? " in/out" : "", rtest, wtest, irtest, iwtest, intrtest, intwtest, info->alt >= 0 ? " (+alt)" : ""); return 0; } static int usbtest_suspend(struct usb_interface *intf, pm_message_t message) { return 0; } static int usbtest_resume(struct usb_interface *intf) { return 0; } static void usbtest_disconnect(struct usb_interface *intf) { struct usbtest_dev *dev = usb_get_intfdata(intf); usb_set_intfdata(intf, NULL); dev_dbg(&intf->dev, "disconnect\n"); kfree(dev->buf); kfree(dev); } /* Basic testing only needs a device that can source or sink bulk traffic. * Any device can test control transfers (default with GENERIC binding). * * Several entries work with the default EP0 implementation that's built * into EZ-USB chips. There's a default vendor ID which can be overridden * by (very) small config EEPROMS, but otherwise all these devices act * identically until firmware is loaded: only EP0 works. It turns out * to be easy to make other endpoints work, without modifying that EP0 * behavior. For now, we expect that kind of firmware. 
*/ /* an21xx or fx versions of ez-usb */ static struct usbtest_info ez1_info = { .name = "EZ-USB device", .ep_in = 2, .ep_out = 2, .alt = 1, }; /* fx2 version of ez-usb */ static struct usbtest_info ez2_info = { .name = "FX2 device", .ep_in = 6, .ep_out = 2, .alt = 1, }; /* ezusb family device with dedicated usb test firmware, */ static struct usbtest_info fw_info = { .name = "usb test device", .ep_in = 2, .ep_out = 2, .alt = 1, .autoconf = 1, /* iso and ctrl_out need autoconf */ .ctrl_out = 1, .iso = 1, /* iso_ep's are #8 in/out */ }; /* peripheral running Linux and 'zero.c' test firmware, or * its user-mode cousin. different versions of this use * different hardware with the same vendor/product codes. * host side MUST rely on the endpoint descriptors. */ static struct usbtest_info gz_info = { .name = "Linux gadget zero", .autoconf = 1, .ctrl_out = 1, .iso = 1, .intr = 1, .alt = 0, }; static struct usbtest_info um_info = { .name = "Linux user mode test driver", .autoconf = 1, .alt = -1, }; static struct usbtest_info um2_info = { .name = "Linux user mode ISO test driver", .autoconf = 1, .iso = 1, .alt = -1, }; #ifdef IBOT2 /* this is a nice source of high speed bulk data; * uses an FX2, with firmware provided in the device */ static struct usbtest_info ibot2_info = { .name = "iBOT2 webcam", .ep_in = 2, .alt = -1, }; #endif #ifdef GENERIC /* we can use any device to test control traffic */ static struct usbtest_info generic_info = { .name = "Generic USB device", .alt = -1, }; #endif static const struct usb_device_id id_table[] = { /*-------------------------------------------------------------*/ /* EZ-USB devices which download firmware to replace (or in our * case augment) the default device implementation. */ /* generic EZ-USB FX controller */ { USB_DEVICE(0x0547, 0x2235), .driver_info = (unsigned long) &ez1_info, }, /* CY3671 development board with EZ-USB FX */ { USB_DEVICE(0x0547, 0x0080), .driver_info = (unsigned long) &ez1_info, }, /* generic EZ-USB FX2 controller (or development board) */ { USB_DEVICE(0x04b4, 0x8613), .driver_info = (unsigned long) &ez2_info, }, /* re-enumerated usb test device firmware */ { USB_DEVICE(0xfff0, 0xfff0), .driver_info = (unsigned long) &fw_info, }, /* "Gadget Zero" firmware runs under Linux */ { USB_DEVICE(0x0525, 0xa4a0), .driver_info = (unsigned long) &gz_info, }, /* so does a user-mode variant */ { USB_DEVICE(0x0525, 0xa4a4), .driver_info = (unsigned long) &um_info, }, /* ... and a user-mode variant that talks iso */ { USB_DEVICE(0x0525, 0xa4a3), .driver_info = (unsigned long) &um2_info, }, #ifdef KEYSPAN_19Qi /* Keyspan 19qi uses an21xx (original EZ-USB) */ /* this does not coexist with the real Keyspan 19qi driver! */ { USB_DEVICE(0x06cd, 0x010b), .driver_info = (unsigned long) &ez1_info, }, #endif /*-------------------------------------------------------------*/ #ifdef IBOT2 /* iBOT2 makes a nice source of high speed bulk-in data */ /* this does not coexist with a real iBOT2 driver! 
*/ { USB_DEVICE(0x0b62, 0x0059), .driver_info = (unsigned long) &ibot2_info, }, #endif /*-------------------------------------------------------------*/ #ifdef GENERIC /* module params can specify devices to use for control tests */ { .driver_info = (unsigned long) &generic_info, }, #endif /*-------------------------------------------------------------*/ { } }; MODULE_DEVICE_TABLE(usb, id_table); static struct usb_driver usbtest_driver = { .name = "usbtest", .id_table = id_table, .probe = usbtest_probe, .unlocked_ioctl = usbtest_ioctl, .disconnect = usbtest_disconnect, .suspend = usbtest_suspend, .resume = usbtest_resume, }; /*-------------------------------------------------------------------------*/ static int __init usbtest_init(void) { #ifdef GENERIC if (vendor) pr_debug("params: vend=0x%04x prod=0x%04x\n", vendor, product); #endif return usb_register(&usbtest_driver); } module_init(usbtest_init); static void __exit usbtest_exit(void) { usb_deregister(&usbtest_driver); } module_exit(usbtest_exit); MODULE_DESCRIPTION("USB Core/HCD Testing Driver"); MODULE_LICENSE("GPL"); |
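As the comments above note, these tests are normally driven from user space through usbfs. The sketch below shows one plausible way to invoke test case 1 via USBDEVFS_IOCTL: the usbtest_param_32 layout is mirrored from the driver source above, while the USBTEST_REQUEST_32 ioctl number and the device path are assumptions that must be checked against the kernel headers and the actual bus/device address before use; treat this as an illustration, not a drop-in tool.

/*
 * Hedged userspace sketch: run usbtest case 1 (bulk OUT writes) through
 * usbfs.  The ioctl number below is an assumption mirroring the driver's
 * USBTEST_REQUEST_32 definition; the device path is hypothetical.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include <linux/usbdevice_fs.h>

struct usbtest_param_32 {		/* must match the driver's layout */
	__u32 test_num;
	__u32 iterations;
	__u32 length;
	__u32 vary;
	__u32 sglen;
	__u32 duration_sec;		/* filled in by the driver */
	__u32 duration_usec;
};

#define USBTEST_REQUEST_32	_IOWR('U', 100, struct usbtest_param_32)	/* assumed */

int main(void)
{
	struct usbtest_param_32 param = {
		.test_num = 1, .iterations = 1000, .length = 512,
	};
	struct usbdevfs_ioctl wrapper = {
		.ifno = 0,			/* interface bound to usbtest */
		.ioctl_code = USBTEST_REQUEST_32,
		.data = &param,
	};
	int fd = open("/dev/bus/usb/001/002", O_RDWR);	/* hypothetical address */

	if (fd < 0 || ioctl(fd, USBDEVFS_IOCTL, &wrapper) < 0) {
		perror("usbtest request");
		return 1;
	}
	printf("test %u: %u.%06u s\n", param.test_num,
	       param.duration_sec, param.duration_usec);
	close(fd);
	return 0;
}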
// SPDX-License-Identifier: GPL-2.0
#include <linux/bitops.h>
#include <linux/bug.h>
#include <linux/export.h>
#include <linux/limits.h>
#include <linux/math.h>
#include <linux/minmax.h>
#include <linux/types.h>

#include <linux/reciprocal_div.h>

/*
 * For a description of the algorithm please have a look at
 * include/linux/reciprocal_div.h
 */

struct reciprocal_value reciprocal_value(u32 d)
{
	struct reciprocal_value R;
	u64 m;
	int l;

	l = fls(d - 1);
	m = ((1ULL << 32) * ((1ULL << l) - d));
	do_div(m, d);
	++m;
	R.m = (u32)m;
	R.sh1 = min(l, 1);
	R.sh2 = max(l - 1, 0);

	return R;
}
EXPORT_SYMBOL(reciprocal_value);

struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec)
{
	struct reciprocal_value_adv R;
	u32 l, post_shift;
	u64 mhigh, mlow;

	/* ceil(log2(d)) */
	l = fls(d - 1);
	/* NOTE: mlow/mhigh could overflow u64 when l == 32. This case needs to
	 * be handled before calling "reciprocal_value_adv", please see the
	 * comment at include/linux/reciprocal_div.h.
	 */
	WARN(l == 32,
	     "ceil(log2(0x%08x)) == 32, %s doesn't support such divisor",
	     d, __func__);
	post_shift = l;
	mlow = 1ULL << (32 + l);
	do_div(mlow, d);
	mhigh = (1ULL << (32 + l)) + (1ULL << (32 + l - prec));
	do_div(mhigh, d);

	for (; post_shift > 0; post_shift--) {
		u64 lo = mlow >> 1, hi = mhigh >> 1;

		if (lo >= hi)
			break;

		mlow = lo;
		mhigh = hi;
	}

	R.m = (u32)mhigh;
	R.sh = post_shift;
	R.exp = l;
	R.is_wide_m = mhigh > U32_MAX;

	return R;
}
EXPORT_SYMBOL(reciprocal_value_adv);
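The constants produced above are meant to be consumed by reciprocal_divide() in include/linux/reciprocal_div.h, which turns a runtime division by a fixed u32 divisor into one multiply and two shifts. The standalone sketch below re-derives the same constants with plain C types and checks the reduction against ordinary division for one divisor; it mirrors the technique for illustration and is not the kernel helpers themselves.

/* Standalone self-check of the reciprocal-division reduction. */
#include <stdint.h>
#include <stdio.h>

struct recip { uint32_t m; int sh1, sh2; };

static int fls32(uint32_t x)			/* index of highest set bit, 1-based */
{
	int r = 0;

	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

static struct recip recip_value(uint32_t d)	/* same formula as reciprocal_value() above */
{
	int l = fls32(d - 1);
	uint64_t m = ((1ULL << 32) * ((1ULL << l) - d)) / d + 1;
	struct recip R = { (uint32_t)m, l < 1 ? l : 1, l - 1 > 0 ? l - 1 : 0 };

	return R;
}

static uint32_t recip_divide(uint32_t a, struct recip R)
{
	uint32_t t = (uint32_t)(((uint64_t)a * R.m) >> 32);

	return (t + ((a - t) >> R.sh1)) >> R.sh2;
}

int main(void)
{
	uint32_t d = 10, a;
	struct recip R = recip_value(d);

	for (a = 0; a < 1000000; a++) {
		if (recip_divide(a, R) != a / d) {
			printf("mismatch at a=%u\n", a);
			return 1;
		}
	}
	printf("reciprocal division by %u verified for a < 1000000\n", d);
	return 0;
}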
// SPDX-License-Identifier: GPL-2.0+ /* * NILFS B-tree. * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * * Written by Koji Sato. */ #include <linux/slab.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/folio_batch.h> #include "nilfs.h" #include "page.h" #include "btnode.h" #include "btree.h" #include "alloc.h" #include "dat.h" static void __nilfs_btree_init(struct nilfs_bmap *bmap); static struct nilfs_btree_path *nilfs_btree_alloc_path(void) { struct nilfs_btree_path *path; int level = NILFS_BTREE_LEVEL_DATA; path = kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS); if (path == NULL) goto out; for (; level < NILFS_BTREE_LEVEL_MAX; level++) { path[level].bp_bh = NULL; path[level].bp_sib_bh = NULL; path[level].bp_index = 0; path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; path[level].bp_op = NULL; } out: return path; } static void nilfs_btree_free_path(struct nilfs_btree_path *path) { int level = NILFS_BTREE_LEVEL_DATA; for (; level < NILFS_BTREE_LEVEL_MAX; level++) brelse(path[level].bp_bh); kmem_cache_free(nilfs_btree_path_cache, path); } /* * B-tree node operations */ static int nilfs_btree_get_new_block(const struct nilfs_bmap *btree, __u64 ptr, struct buffer_head **bhp) { struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode; struct address_space *btnc = btnc_inode->i_mapping; struct buffer_head *bh; bh = nilfs_btnode_create_block(btnc, ptr); if (IS_ERR(bh)) return PTR_ERR(bh); set_buffer_nilfs_volatile(bh); *bhp = bh; return 0; } static int nilfs_btree_node_get_flags(const struct nilfs_btree_node *node) { return node->bn_flags; } static void nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags) { node->bn_flags = flags; } static int nilfs_btree_node_root(const struct nilfs_btree_node *node) { return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT; } static int nilfs_btree_node_get_level(const struct nilfs_btree_node *node) { return node->bn_level; } static void nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level) { node->bn_level = level; } static int nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node) { return le16_to_cpu(node->bn_nchildren); } static void nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren) { node->bn_nchildren = cpu_to_le16(nchildren); } static int nilfs_btree_node_size(const struct nilfs_bmap *btree) { return i_blocksize(btree->b_inode); } static int nilfs_btree_nchildren_per_block(const struct nilfs_bmap *btree) { return btree->b_nchildren_per_block; } static __le64 * nilfs_btree_node_dkeys(const struct nilfs_btree_node *node) { return (__le64 *)((char *)(node + 1) + (nilfs_btree_node_root(node) ?
0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE)); } static __le64 * nilfs_btree_node_dptrs(const struct nilfs_btree_node *node, int ncmax) { return (__le64 *)(nilfs_btree_node_dkeys(node) + ncmax); } static __u64 nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index) { return le64_to_cpu(*(nilfs_btree_node_dkeys(node) + index)); } static void nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key) { *(nilfs_btree_node_dkeys(node) + index) = cpu_to_le64(key); } static __u64 nilfs_btree_node_get_ptr(const struct nilfs_btree_node *node, int index, int ncmax) { return le64_to_cpu(*(nilfs_btree_node_dptrs(node, ncmax) + index)); } static void nilfs_btree_node_set_ptr(struct nilfs_btree_node *node, int index, __u64 ptr, int ncmax) { *(nilfs_btree_node_dptrs(node, ncmax) + index) = cpu_to_le64(ptr); } static void nilfs_btree_node_init(struct nilfs_btree_node *node, int flags, int level, int nchildren, int ncmax, const __u64 *keys, const __u64 *ptrs) { __le64 *dkeys; __le64 *dptrs; int i; nilfs_btree_node_set_flags(node, flags); nilfs_btree_node_set_level(node, level); nilfs_btree_node_set_nchildren(node, nchildren); dkeys = nilfs_btree_node_dkeys(node); dptrs = nilfs_btree_node_dptrs(node, ncmax); for (i = 0; i < nchildren; i++) { dkeys[i] = cpu_to_le64(keys[i]); dptrs[i] = cpu_to_le64(ptrs[i]); } } /* Assume the buffer heads corresponding to left and right are locked. */ static void nilfs_btree_node_move_left(struct nilfs_btree_node *left, struct nilfs_btree_node *right, int n, int lncmax, int rncmax) { __le64 *ldkeys, *rdkeys; __le64 *ldptrs, *rdptrs; int lnchildren, rnchildren; ldkeys = nilfs_btree_node_dkeys(left); ldptrs = nilfs_btree_node_dptrs(left, lncmax); lnchildren = nilfs_btree_node_get_nchildren(left); rdkeys = nilfs_btree_node_dkeys(right); rdptrs = nilfs_btree_node_dptrs(right, rncmax); rnchildren = nilfs_btree_node_get_nchildren(right); memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys)); memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs)); memmove(rdkeys, rdkeys + n, (rnchildren - n) * sizeof(*rdkeys)); memmove(rdptrs, rdptrs + n, (rnchildren - n) * sizeof(*rdptrs)); lnchildren += n; rnchildren -= n; nilfs_btree_node_set_nchildren(left, lnchildren); nilfs_btree_node_set_nchildren(right, rnchildren); } /* Assume that the buffer heads corresponding to left and right are locked. */ static void nilfs_btree_node_move_right(struct nilfs_btree_node *left, struct nilfs_btree_node *right, int n, int lncmax, int rncmax) { __le64 *ldkeys, *rdkeys; __le64 *ldptrs, *rdptrs; int lnchildren, rnchildren; ldkeys = nilfs_btree_node_dkeys(left); ldptrs = nilfs_btree_node_dptrs(left, lncmax); lnchildren = nilfs_btree_node_get_nchildren(left); rdkeys = nilfs_btree_node_dkeys(right); rdptrs = nilfs_btree_node_dptrs(right, rncmax); rnchildren = nilfs_btree_node_get_nchildren(right); memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys)); memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs)); memcpy(rdkeys, ldkeys + lnchildren - n, n * sizeof(*rdkeys)); memcpy(rdptrs, ldptrs + lnchildren - n, n * sizeof(*rdptrs)); lnchildren -= n; rnchildren += n; nilfs_btree_node_set_nchildren(left, lnchildren); nilfs_btree_node_set_nchildren(right, rnchildren); } /* Assume that the buffer head corresponding to node is locked. 
*/ static void nilfs_btree_node_insert(struct nilfs_btree_node *node, int index, __u64 key, __u64 ptr, int ncmax) { __le64 *dkeys; __le64 *dptrs; int nchildren; dkeys = nilfs_btree_node_dkeys(node); dptrs = nilfs_btree_node_dptrs(node, ncmax); nchildren = nilfs_btree_node_get_nchildren(node); if (index < nchildren) { memmove(dkeys + index + 1, dkeys + index, (nchildren - index) * sizeof(*dkeys)); memmove(dptrs + index + 1, dptrs + index, (nchildren - index) * sizeof(*dptrs)); } dkeys[index] = cpu_to_le64(key); dptrs[index] = cpu_to_le64(ptr); nchildren++; nilfs_btree_node_set_nchildren(node, nchildren); } /* Assume that the buffer head corresponding to node is locked. */ static void nilfs_btree_node_delete(struct nilfs_btree_node *node, int index, __u64 *keyp, __u64 *ptrp, int ncmax) { __u64 key; __u64 ptr; __le64 *dkeys; __le64 *dptrs; int nchildren; dkeys = nilfs_btree_node_dkeys(node); dptrs = nilfs_btree_node_dptrs(node, ncmax); key = le64_to_cpu(dkeys[index]); ptr = le64_to_cpu(dptrs[index]); nchildren = nilfs_btree_node_get_nchildren(node); if (keyp != NULL) *keyp = key; if (ptrp != NULL) *ptrp = ptr; if (index < nchildren - 1) { memmove(dkeys + index, dkeys + index + 1, (nchildren - index - 1) * sizeof(*dkeys)); memmove(dptrs + index, dptrs + index + 1, (nchildren - index - 1) * sizeof(*dptrs)); } nchildren--; nilfs_btree_node_set_nchildren(node, nchildren); } static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node, __u64 key, int *indexp) { __u64 nkey; int index, low, high, s; /* binary search */ low = 0; high = nilfs_btree_node_get_nchildren(node) - 1; index = 0; s = 0; while (low <= high) { index = (low + high) / 2; nkey = nilfs_btree_node_get_key(node, index); if (nkey == key) { s = 0; goto out; } else if (nkey < key) { low = index + 1; s = -1; } else { high = index - 1; s = 1; } } /* adjust index */ if (nilfs_btree_node_get_level(node) > NILFS_BTREE_LEVEL_NODE_MIN) { if (s > 0 && index > 0) index--; } else if (s < 0) index++; out: *indexp = index; return s == 0; } /** * nilfs_btree_node_broken - verify consistency of btree node * @node: btree node block to be examined * @size: node size (in bytes) * @inode: host inode of btree * @blocknr: block number * * Return: 0 if normal, 1 if the node is broken. */ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node, size_t size, struct inode *inode, sector_t blocknr) { int level, flags, nchildren; int ret = 0; level = nilfs_btree_node_get_level(node); flags = nilfs_btree_node_get_flags(node); nchildren = nilfs_btree_node_get_nchildren(node); if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN || level >= NILFS_BTREE_LEVEL_MAX || (flags & NILFS_BTREE_NODE_ROOT) || nchildren <= 0 || nchildren > NILFS_BTREE_NODE_NCHILDREN_MAX(size))) { nilfs_crit(inode->i_sb, "bad btree node (ino=%llu, blocknr=%llu): level = %d, flags = 0x%x, nchildren = %d", inode->i_ino, (unsigned long long)blocknr, level, flags, nchildren); ret = 1; } return ret; } /** * nilfs_btree_root_broken - verify consistency of btree root node * @node: btree root node to be examined * @inode: host inode of btree * * Return: 0 if normal, 1 if the root node is broken. 
*/ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node, struct inode *inode) { int level, flags, nchildren; int ret = 0; level = nilfs_btree_node_get_level(node); flags = nilfs_btree_node_get_flags(node); nchildren = nilfs_btree_node_get_nchildren(node); if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN || level >= NILFS_BTREE_LEVEL_MAX || nchildren < 0 || nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX || (nchildren == 0 && level > NILFS_BTREE_LEVEL_NODE_MIN))) { nilfs_crit(inode->i_sb, "bad btree root (ino=%llu): level = %d, flags = 0x%x, nchildren = %d", inode->i_ino, level, flags, nchildren); ret = 1; } return ret; } int nilfs_btree_broken_node_block(struct buffer_head *bh) { struct inode *inode; int ret; if (buffer_nilfs_checked(bh)) return 0; inode = bh->b_folio->mapping->host; ret = nilfs_btree_node_broken((struct nilfs_btree_node *)bh->b_data, bh->b_size, inode, bh->b_blocknr); if (likely(!ret)) set_buffer_nilfs_checked(bh); return ret; } static struct nilfs_btree_node * nilfs_btree_get_root(const struct nilfs_bmap *btree) { return (struct nilfs_btree_node *)btree->b_u.u_data; } static struct nilfs_btree_node * nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level) { return (struct nilfs_btree_node *)path[level].bp_bh->b_data; } static struct nilfs_btree_node * nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level) { return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data; } static int nilfs_btree_height(const struct nilfs_bmap *btree) { return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1; } static struct nilfs_btree_node * nilfs_btree_get_node(const struct nilfs_bmap *btree, const struct nilfs_btree_path *path, int level, int *ncmaxp) { struct nilfs_btree_node *node; if (level == nilfs_btree_height(btree) - 1) { node = nilfs_btree_get_root(btree); *ncmaxp = NILFS_BTREE_ROOT_NCHILDREN_MAX; } else { node = nilfs_btree_get_nonroot_node(path, level); *ncmaxp = nilfs_btree_nchildren_per_block(btree); } return node; } static int nilfs_btree_bad_node(const struct nilfs_bmap *btree, struct nilfs_btree_node *node, int level) { if (unlikely(nilfs_btree_node_get_level(node) != level)) { dump_stack(); nilfs_crit(btree->b_inode->i_sb, "btree level mismatch (ino=%llu): %d != %d", btree->b_inode->i_ino, nilfs_btree_node_get_level(node), level); return 1; } return 0; } struct nilfs_btree_readahead_info { struct nilfs_btree_node *node; /* parent node */ int max_ra_blocks; /* max nof blocks to read ahead */ int index; /* current index on the parent node */ int ncmax; /* nof children in the parent node */ }; static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, struct buffer_head **bhp, const struct nilfs_btree_readahead_info *ra) { struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode; struct address_space *btnc = btnc_inode->i_mapping; struct buffer_head *bh, *ra_bh; sector_t submit_ptr = 0; int ret; ret = nilfs_btnode_submit_block(btnc, ptr, 0, REQ_OP_READ, &bh, &submit_ptr); if (ret) { if (likely(ret == -EEXIST)) goto out_check; if (ret == -ENOENT) { /* * Block address translation failed due to invalid * value of 'ptr'. In this case, return internal code * -EINVAL (broken bmap) to notify bmap layer of fatal * metadata corruption. 
*/ ret = -EINVAL; } return ret; } if (ra) { int i, n; __u64 ptr2; /* read ahead sibling nodes */ for (n = ra->max_ra_blocks, i = ra->index + 1; n > 0 && i < ra->ncmax; n--, i++) { ptr2 = nilfs_btree_node_get_ptr(ra->node, i, ra->ncmax); ret = nilfs_btnode_submit_block(btnc, ptr2, 0, REQ_OP_READ | REQ_RAHEAD, &ra_bh, &submit_ptr); if (likely(!ret || ret == -EEXIST)) brelse(ra_bh); else if (ret != -EBUSY) break; if (!buffer_locked(bh)) goto out_no_wait; } } wait_on_buffer(bh); out_no_wait: if (!buffer_uptodate(bh)) { nilfs_err(btree->b_inode->i_sb, "I/O error reading b-tree node block (ino=%llu, blocknr=%llu)", btree->b_inode->i_ino, (unsigned long long)ptr); brelse(bh); return -EIO; } out_check: if (nilfs_btree_broken_node_block(bh)) { clear_buffer_uptodate(bh); brelse(bh); return -EINVAL; } *bhp = bh; return 0; } static int nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, struct buffer_head **bhp) { return __nilfs_btree_get_block(btree, ptr, bhp, NULL); } static int nilfs_btree_do_lookup(const struct nilfs_bmap *btree, struct nilfs_btree_path *path, __u64 key, __u64 *ptrp, int minlevel, int readahead) { struct nilfs_btree_node *node; struct nilfs_btree_readahead_info p, *ra; __u64 ptr; int level, index, found, ncmax, ret; node = nilfs_btree_get_root(btree); level = nilfs_btree_node_get_level(node); if (level < minlevel || nilfs_btree_node_get_nchildren(node) <= 0) return -ENOENT; found = nilfs_btree_node_lookup(node, key, &index); ptr = nilfs_btree_node_get_ptr(node, index, NILFS_BTREE_ROOT_NCHILDREN_MAX); path[level].bp_bh = NULL; path[level].bp_index = index; ncmax = nilfs_btree_nchildren_per_block(btree); while (--level >= minlevel) { ra = NULL; if (level == NILFS_BTREE_LEVEL_NODE_MIN && readahead) { p.node = nilfs_btree_get_node(btree, path, level + 1, &p.ncmax); p.index = index; p.max_ra_blocks = 7; ra = &p; } ret = __nilfs_btree_get_block(btree, ptr, &path[level].bp_bh, ra); if (ret < 0) return ret; node = nilfs_btree_get_nonroot_node(path, level); if (nilfs_btree_bad_node(btree, node, level)) return -EINVAL; if (!found) found = nilfs_btree_node_lookup(node, key, &index); else index = 0; if (index < ncmax) { ptr = nilfs_btree_node_get_ptr(node, index, ncmax); } else { WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN); /* insert */ ptr = NILFS_BMAP_INVALID_PTR; } path[level].bp_index = index; } if (!found) return -ENOENT; if (ptrp != NULL) *ptrp = ptr; return 0; } static int nilfs_btree_do_lookup_last(const struct nilfs_bmap *btree, struct nilfs_btree_path *path, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node; __u64 ptr; int index, level, ncmax, ret; node = nilfs_btree_get_root(btree); index = nilfs_btree_node_get_nchildren(node) - 1; if (index < 0) return -ENOENT; level = nilfs_btree_node_get_level(node); ptr = nilfs_btree_node_get_ptr(node, index, NILFS_BTREE_ROOT_NCHILDREN_MAX); path[level].bp_bh = NULL; path[level].bp_index = index; ncmax = nilfs_btree_nchildren_per_block(btree); for (level--; level > 0; level--) { ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); if (ret < 0) return ret; node = nilfs_btree_get_nonroot_node(path, level); if (nilfs_btree_bad_node(btree, node, level)) return -EINVAL; index = nilfs_btree_node_get_nchildren(node) - 1; ptr = nilfs_btree_node_get_ptr(node, index, ncmax); path[level].bp_index = index; } if (keyp != NULL) *keyp = nilfs_btree_node_get_key(node, index); if (ptrp != NULL) *ptrp = ptr; return 0; } /** * nilfs_btree_get_next_key - get next valid key from btree path array * @btree: bmap struct of btree 
* @path: array of nilfs_btree_path struct * @minlevel: start level * @nextkey: place to store the next valid key * * Return: 0 if the next key was found, %-ENOENT if not found. */ static int nilfs_btree_get_next_key(const struct nilfs_bmap *btree, const struct nilfs_btree_path *path, int minlevel, __u64 *nextkey) { struct nilfs_btree_node *node; int maxlevel = nilfs_btree_height(btree) - 1; int index, next_adj, level; /* Next index is already set to bp_index for leaf nodes. */ next_adj = 0; for (level = minlevel; level <= maxlevel; level++) { if (level == maxlevel) node = nilfs_btree_get_root(btree); else node = nilfs_btree_get_nonroot_node(path, level); index = path[level].bp_index + next_adj; if (index < nilfs_btree_node_get_nchildren(node)) { /* Next key is in this node */ *nextkey = nilfs_btree_node_get_key(node, index); return 0; } /* For non-leaf nodes, next index is stored at bp_index + 1. */ next_adj = 1; } return -ENOENT; } static int nilfs_btree_lookup(const struct nilfs_bmap *btree, __u64 key, int level, __u64 *ptrp) { struct nilfs_btree_path *path; int ret; path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; ret = nilfs_btree_do_lookup(btree, path, key, ptrp, level, 0); nilfs_btree_free_path(path); return ret; } static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree, __u64 key, __u64 *ptrp, unsigned int maxblocks) { struct nilfs_btree_path *path; struct nilfs_btree_node *node; struct inode *dat = NULL; __u64 ptr, ptr2; sector_t blocknr; int level = NILFS_BTREE_LEVEL_NODE_MIN; int ret, cnt, index, maxlevel, ncmax; struct nilfs_btree_readahead_info p; path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level, 1); if (ret < 0) goto out; if (NILFS_BMAP_USE_VBN(btree)) { dat = nilfs_bmap_get_dat(btree); ret = nilfs_dat_translate(dat, ptr, &blocknr); if (ret < 0) goto dat_error; ptr = blocknr; } cnt = 1; if (cnt == maxblocks) goto end; maxlevel = nilfs_btree_height(btree) - 1; node = nilfs_btree_get_node(btree, path, level, &ncmax); index = path[level].bp_index + 1; for (;;) { while (index < nilfs_btree_node_get_nchildren(node)) { if (nilfs_btree_node_get_key(node, index) != key + cnt) goto end; ptr2 = nilfs_btree_node_get_ptr(node, index, ncmax); if (dat) { ret = nilfs_dat_translate(dat, ptr2, &blocknr); if (ret < 0) goto dat_error; ptr2 = blocknr; } if (ptr2 != ptr + cnt || ++cnt == maxblocks) goto end; index++; } if (level == maxlevel) break; /* look-up right sibling node */ p.node = nilfs_btree_get_node(btree, path, level + 1, &p.ncmax); p.index = path[level + 1].bp_index + 1; p.max_ra_blocks = 7; if (p.index >= nilfs_btree_node_get_nchildren(p.node) || nilfs_btree_node_get_key(p.node, p.index) != key + cnt) break; ptr2 = nilfs_btree_node_get_ptr(p.node, p.index, p.ncmax); path[level + 1].bp_index = p.index; brelse(path[level].bp_bh); path[level].bp_bh = NULL; ret = __nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh, &p); if (ret < 0) goto out; node = nilfs_btree_get_nonroot_node(path, level); ncmax = nilfs_btree_nchildren_per_block(btree); index = 0; path[level].bp_index = index; } end: *ptrp = ptr; ret = cnt; out: nilfs_btree_free_path(path); return ret; dat_error: if (ret == -ENOENT) ret = -EINVAL; /* Notify bmap layer of metadata corruption */ goto out; } static void nilfs_btree_promote_key(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 key) { if (level < nilfs_btree_height(btree) - 1) { do { nilfs_btree_node_set_key( 
nilfs_btree_get_nonroot_node(path, level), path[level].bp_index, key); if (!buffer_dirty(path[level].bp_bh)) mark_buffer_dirty(path[level].bp_bh); } while ((path[level].bp_index == 0) && (++level < nilfs_btree_height(btree) - 1)); } /* root */ if (level == nilfs_btree_height(btree) - 1) { nilfs_btree_node_set_key(nilfs_btree_get_root(btree), path[level].bp_index, key); } } static void nilfs_btree_do_insert(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node; int ncblk; if (level < nilfs_btree_height(btree) - 1) { node = nilfs_btree_get_nonroot_node(path, level); ncblk = nilfs_btree_nchildren_per_block(btree); nilfs_btree_node_insert(node, path[level].bp_index, *keyp, *ptrp, ncblk); if (!buffer_dirty(path[level].bp_bh)) mark_buffer_dirty(path[level].bp_bh); if (path[level].bp_index == 0) nilfs_btree_promote_key(btree, path, level + 1, nilfs_btree_node_get_key(node, 0)); } else { node = nilfs_btree_get_root(btree); nilfs_btree_node_insert(node, path[level].bp_index, *keyp, *ptrp, NILFS_BTREE_ROOT_NCHILDREN_MAX); } } static void nilfs_btree_carry_left(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node, *left; int nchildren, lnchildren, n, move, ncblk; node = nilfs_btree_get_nonroot_node(path, level); left = nilfs_btree_get_sib_node(path, level); nchildren = nilfs_btree_node_get_nchildren(node); lnchildren = nilfs_btree_node_get_nchildren(left); ncblk = nilfs_btree_nchildren_per_block(btree); move = 0; n = (nchildren + lnchildren + 1) / 2 - lnchildren; if (n > path[level].bp_index) { /* move insert point */ n--; move = 1; } nilfs_btree_node_move_left(left, node, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) mark_buffer_dirty(path[level].bp_bh); if (!buffer_dirty(path[level].bp_sib_bh)) mark_buffer_dirty(path[level].bp_sib_bh); nilfs_btree_promote_key(btree, path, level + 1, nilfs_btree_node_get_key(node, 0)); if (move) { brelse(path[level].bp_bh); path[level].bp_bh = path[level].bp_sib_bh; path[level].bp_sib_bh = NULL; path[level].bp_index += lnchildren; path[level + 1].bp_index--; } else { brelse(path[level].bp_sib_bh); path[level].bp_sib_bh = NULL; path[level].bp_index -= n; } nilfs_btree_do_insert(btree, path, level, keyp, ptrp); } static void nilfs_btree_carry_right(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node, *right; int nchildren, rnchildren, n, move, ncblk; node = nilfs_btree_get_nonroot_node(path, level); right = nilfs_btree_get_sib_node(path, level); nchildren = nilfs_btree_node_get_nchildren(node); rnchildren = nilfs_btree_node_get_nchildren(right); ncblk = nilfs_btree_nchildren_per_block(btree); move = 0; n = (nchildren + rnchildren + 1) / 2 - rnchildren; if (n > nchildren - path[level].bp_index) { /* move insert point */ n--; move = 1; } nilfs_btree_node_move_right(node, right, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) mark_buffer_dirty(path[level].bp_bh); if (!buffer_dirty(path[level].bp_sib_bh)) mark_buffer_dirty(path[level].bp_sib_bh); path[level + 1].bp_index++; nilfs_btree_promote_key(btree, path, level + 1, nilfs_btree_node_get_key(right, 0)); path[level + 1].bp_index--; if (move) { brelse(path[level].bp_bh); path[level].bp_bh = path[level].bp_sib_bh; path[level].bp_sib_bh = NULL; path[level].bp_index -= nilfs_btree_node_get_nchildren(node); path[level + 1].bp_index++; } else { brelse(path[level].bp_sib_bh); 
path[level].bp_sib_bh = NULL; } nilfs_btree_do_insert(btree, path, level, keyp, ptrp); } static void nilfs_btree_split(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node, *right; int nchildren, n, move, ncblk; node = nilfs_btree_get_nonroot_node(path, level); right = nilfs_btree_get_sib_node(path, level); nchildren = nilfs_btree_node_get_nchildren(node); ncblk = nilfs_btree_nchildren_per_block(btree); move = 0; n = (nchildren + 1) / 2; if (n > nchildren - path[level].bp_index) { n--; move = 1; } nilfs_btree_node_move_right(node, right, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) mark_buffer_dirty(path[level].bp_bh); if (!buffer_dirty(path[level].bp_sib_bh)) mark_buffer_dirty(path[level].bp_sib_bh); if (move) { path[level].bp_index -= nilfs_btree_node_get_nchildren(node); nilfs_btree_node_insert(right, path[level].bp_index, *keyp, *ptrp, ncblk); *keyp = nilfs_btree_node_get_key(right, 0); *ptrp = path[level].bp_newreq.bpr_ptr; brelse(path[level].bp_bh); path[level].bp_bh = path[level].bp_sib_bh; path[level].bp_sib_bh = NULL; } else { nilfs_btree_do_insert(btree, path, level, keyp, ptrp); *keyp = nilfs_btree_node_get_key(right, 0); *ptrp = path[level].bp_newreq.bpr_ptr; brelse(path[level].bp_sib_bh); path[level].bp_sib_bh = NULL; } path[level + 1].bp_index++; } static void nilfs_btree_grow(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *root, *child; int n, ncblk; root = nilfs_btree_get_root(btree); child = nilfs_btree_get_sib_node(path, level); ncblk = nilfs_btree_nchildren_per_block(btree); n = nilfs_btree_node_get_nchildren(root); nilfs_btree_node_move_right(root, child, n, NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk); nilfs_btree_node_set_level(root, level + 1); if (!buffer_dirty(path[level].bp_sib_bh)) mark_buffer_dirty(path[level].bp_sib_bh); path[level].bp_bh = path[level].bp_sib_bh; path[level].bp_sib_bh = NULL; nilfs_btree_do_insert(btree, path, level, keyp, ptrp); *keyp = nilfs_btree_node_get_key(child, 0); *ptrp = path[level].bp_newreq.bpr_ptr; } static __u64 nilfs_btree_find_near(const struct nilfs_bmap *btree, const struct nilfs_btree_path *path) { struct nilfs_btree_node *node; int level, ncmax; if (path == NULL) return NILFS_BMAP_INVALID_PTR; /* left sibling */ level = NILFS_BTREE_LEVEL_NODE_MIN; if (path[level].bp_index > 0) { node = nilfs_btree_get_node(btree, path, level, &ncmax); return nilfs_btree_node_get_ptr(node, path[level].bp_index - 1, ncmax); } /* parent */ level = NILFS_BTREE_LEVEL_NODE_MIN + 1; if (level <= nilfs_btree_height(btree) - 1) { node = nilfs_btree_get_node(btree, path, level, &ncmax); return nilfs_btree_node_get_ptr(node, path[level].bp_index, ncmax); } return NILFS_BMAP_INVALID_PTR; } static __u64 nilfs_btree_find_target_v(const struct nilfs_bmap *btree, const struct nilfs_btree_path *path, __u64 key) { __u64 ptr; ptr = nilfs_bmap_find_target_seq(btree, key); if (ptr != NILFS_BMAP_INVALID_PTR) /* sequential access */ return ptr; ptr = nilfs_btree_find_near(btree, path); if (ptr != NILFS_BMAP_INVALID_PTR) /* near */ return ptr; /* block group */ return nilfs_bmap_find_target_in_group(btree); } static int nilfs_btree_prepare_insert(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int *levelp, __u64 key, __u64 ptr, struct nilfs_bmap_stats *stats) { struct buffer_head *bh; struct nilfs_btree_node *node, *parent, *sib; __u64 sibptr; int pindex, level, ncmax, ncblk, ret; struct inode *dat = NULL; 
stats->bs_nblocks = 0; level = NILFS_BTREE_LEVEL_DATA; /* allocate a new ptr for data block */ if (NILFS_BMAP_USE_VBN(btree)) { path[level].bp_newreq.bpr_ptr = nilfs_btree_find_target_v(btree, path, key); dat = nilfs_bmap_get_dat(btree); } ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat); if (ret < 0) goto err_out_data; ncblk = nilfs_btree_nchildren_per_block(btree); for (level = NILFS_BTREE_LEVEL_NODE_MIN; level < nilfs_btree_height(btree) - 1; level++) { node = nilfs_btree_get_nonroot_node(path, level); if (nilfs_btree_node_get_nchildren(node) < ncblk) { path[level].bp_op = nilfs_btree_do_insert; stats->bs_nblocks++; goto out; } parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); pindex = path[level + 1].bp_index; /* left sibling */ if (pindex > 0) { sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1, ncmax); ret = nilfs_btree_get_block(btree, sibptr, &bh); if (ret < 0) goto err_out_child_node; sib = (struct nilfs_btree_node *)bh->b_data; if (nilfs_btree_node_get_nchildren(sib) < ncblk) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_carry_left; stats->bs_nblocks++; goto out; } else { brelse(bh); } } /* right sibling */ if (pindex < nilfs_btree_node_get_nchildren(parent) - 1) { sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1, ncmax); ret = nilfs_btree_get_block(btree, sibptr, &bh); if (ret < 0) goto err_out_child_node; sib = (struct nilfs_btree_node *)bh->b_data; if (nilfs_btree_node_get_nchildren(sib) < ncblk) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_carry_right; stats->bs_nblocks++; goto out; } else { brelse(bh); } } /* split */ path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat); if (ret < 0) goto err_out_child_node; ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr, &bh); if (ret < 0) goto err_out_curr_node; stats->bs_nblocks++; sib = (struct nilfs_btree_node *)bh->b_data; nilfs_btree_node_init(sib, 0, level, 0, ncblk, NULL, NULL); path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_split; } /* root */ node = nilfs_btree_get_root(btree); if (nilfs_btree_node_get_nchildren(node) < NILFS_BTREE_ROOT_NCHILDREN_MAX) { path[level].bp_op = nilfs_btree_do_insert; stats->bs_nblocks++; goto out; } /* grow */ path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat); if (ret < 0) goto err_out_child_node; ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr, &bh); if (ret < 0) goto err_out_curr_node; nilfs_btree_node_init((struct nilfs_btree_node *)bh->b_data, 0, level, 0, ncblk, NULL, NULL); path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_grow; level++; path[level].bp_op = nilfs_btree_do_insert; /* a newly-created node block and a data block are added */ stats->bs_nblocks += 2; /* success */ out: *levelp = level; return ret; /* error */ err_out_curr_node: nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat); err_out_child_node: for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) { nilfs_btnode_delete(path[level].bp_sib_bh); nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat); } nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat); err_out_data: *levelp = level; stats->bs_nblocks = 0; return ret; } static void nilfs_btree_commit_insert(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int maxlevel, __u64 key, __u64 ptr) { struct inode *dat = NULL; int 
level; set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr; if (NILFS_BMAP_USE_VBN(btree)) { nilfs_bmap_set_target_v(btree, key, ptr); dat = nilfs_bmap_get_dat(btree); } for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { nilfs_bmap_commit_alloc_ptr(btree, &path[level - 1].bp_newreq, dat); path[level].bp_op(btree, path, level, &key, &ptr); } if (!nilfs_bmap_dirty(btree)) nilfs_bmap_set_dirty(btree); } static int nilfs_btree_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr) { struct nilfs_btree_path *path; struct nilfs_bmap_stats stats; int level, ret; path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; ret = nilfs_btree_do_lookup(btree, path, key, NULL, NILFS_BTREE_LEVEL_NODE_MIN, 0); if (ret != -ENOENT) { if (ret == 0) ret = -EEXIST; goto out; } ret = nilfs_btree_prepare_insert(btree, path, &level, key, ptr, &stats); if (ret < 0) goto out; nilfs_btree_commit_insert(btree, path, level, key, ptr); nilfs_inode_add_blocks(btree->b_inode, stats.bs_nblocks); out: nilfs_btree_free_path(path); return ret; } static void nilfs_btree_do_delete(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node; int ncblk; if (level < nilfs_btree_height(btree) - 1) { node = nilfs_btree_get_nonroot_node(path, level); ncblk = nilfs_btree_nchildren_per_block(btree); nilfs_btree_node_delete(node, path[level].bp_index, keyp, ptrp, ncblk); if (!buffer_dirty(path[level].bp_bh)) mark_buffer_dirty(path[level].bp_bh); if (path[level].bp_index == 0) nilfs_btree_promote_key(btree, path, level + 1, nilfs_btree_node_get_key(node, 0)); } else { node = nilfs_btree_get_root(btree); nilfs_btree_node_delete(node, path[level].bp_index, keyp, ptrp, NILFS_BTREE_ROOT_NCHILDREN_MAX); } } static void nilfs_btree_borrow_left(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node, *left; int nchildren, lnchildren, n, ncblk; nilfs_btree_do_delete(btree, path, level, keyp, ptrp); node = nilfs_btree_get_nonroot_node(path, level); left = nilfs_btree_get_sib_node(path, level); nchildren = nilfs_btree_node_get_nchildren(node); lnchildren = nilfs_btree_node_get_nchildren(left); ncblk = nilfs_btree_nchildren_per_block(btree); n = (nchildren + lnchildren) / 2 - nchildren; nilfs_btree_node_move_right(left, node, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) mark_buffer_dirty(path[level].bp_bh); if (!buffer_dirty(path[level].bp_sib_bh)) mark_buffer_dirty(path[level].bp_sib_bh); nilfs_btree_promote_key(btree, path, level + 1, nilfs_btree_node_get_key(node, 0)); brelse(path[level].bp_sib_bh); path[level].bp_sib_bh = NULL; path[level].bp_index += n; } static void nilfs_btree_borrow_right(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node, *right; int nchildren, rnchildren, n, ncblk; nilfs_btree_do_delete(btree, path, level, keyp, ptrp); node = nilfs_btree_get_nonroot_node(path, level); right = nilfs_btree_get_sib_node(path, level); nchildren = nilfs_btree_node_get_nchildren(node); rnchildren = nilfs_btree_node_get_nchildren(right); ncblk = nilfs_btree_nchildren_per_block(btree); n = (nchildren + rnchildren) / 2 - nchildren; nilfs_btree_node_move_left(node, right, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) mark_buffer_dirty(path[level].bp_bh); if (!buffer_dirty(path[level].bp_sib_bh)) 
mark_buffer_dirty(path[level].bp_sib_bh); path[level + 1].bp_index++; nilfs_btree_promote_key(btree, path, level + 1, nilfs_btree_node_get_key(right, 0)); path[level + 1].bp_index--; brelse(path[level].bp_sib_bh); path[level].bp_sib_bh = NULL; } static void nilfs_btree_concat_left(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node, *left; int n, ncblk; nilfs_btree_do_delete(btree, path, level, keyp, ptrp); node = nilfs_btree_get_nonroot_node(path, level); left = nilfs_btree_get_sib_node(path, level); ncblk = nilfs_btree_nchildren_per_block(btree); n = nilfs_btree_node_get_nchildren(node); nilfs_btree_node_move_left(left, node, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_sib_bh)) mark_buffer_dirty(path[level].bp_sib_bh); nilfs_btnode_delete(path[level].bp_bh); path[level].bp_bh = path[level].bp_sib_bh; path[level].bp_sib_bh = NULL; path[level].bp_index += nilfs_btree_node_get_nchildren(left); } static void nilfs_btree_concat_right(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node, *right; int n, ncblk; nilfs_btree_do_delete(btree, path, level, keyp, ptrp); node = nilfs_btree_get_nonroot_node(path, level); right = nilfs_btree_get_sib_node(path, level); ncblk = nilfs_btree_nchildren_per_block(btree); n = nilfs_btree_node_get_nchildren(right); nilfs_btree_node_move_left(node, right, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) mark_buffer_dirty(path[level].bp_bh); nilfs_btnode_delete(path[level].bp_sib_bh); path[level].bp_sib_bh = NULL; path[level + 1].bp_index++; } static void nilfs_btree_shrink(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *root, *child; int n, ncblk; nilfs_btree_do_delete(btree, path, level, keyp, ptrp); root = nilfs_btree_get_root(btree); child = nilfs_btree_get_nonroot_node(path, level); ncblk = nilfs_btree_nchildren_per_block(btree); nilfs_btree_node_delete(root, 0, NULL, NULL, NILFS_BTREE_ROOT_NCHILDREN_MAX); nilfs_btree_node_set_level(root, level); n = nilfs_btree_node_get_nchildren(child); nilfs_btree_node_move_left(root, child, n, NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk); nilfs_btnode_delete(path[level].bp_bh); path[level].bp_bh = NULL; } static void nilfs_btree_nop(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { } static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int *levelp, struct nilfs_bmap_stats *stats, struct inode *dat) { struct buffer_head *bh; struct nilfs_btree_node *node, *parent, *sib; __u64 sibptr; int pindex, dindex, level, ncmin, ncmax, ncblk, ret; ret = 0; stats->bs_nblocks = 0; ncmin = NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree)); ncblk = nilfs_btree_nchildren_per_block(btree); for (level = NILFS_BTREE_LEVEL_NODE_MIN, dindex = path[level].bp_index; level < nilfs_btree_height(btree) - 1; level++) { node = nilfs_btree_get_nonroot_node(path, level); path[level].bp_oldreq.bpr_ptr = nilfs_btree_node_get_ptr(node, dindex, ncblk); ret = nilfs_bmap_prepare_end_ptr(btree, &path[level].bp_oldreq, dat); if (ret < 0) goto err_out_child_node; if (nilfs_btree_node_get_nchildren(node) > ncmin) { path[level].bp_op = nilfs_btree_do_delete; stats->bs_nblocks++; goto out; } parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); pindex = path[level + 1].bp_index; dindex = pindex; if (pindex > 0) { /* left sibling */ 
sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1, ncmax); ret = nilfs_btree_get_block(btree, sibptr, &bh); if (ret < 0) goto err_out_curr_node; sib = (struct nilfs_btree_node *)bh->b_data; if (nilfs_btree_node_get_nchildren(sib) > ncmin) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_borrow_left; stats->bs_nblocks++; goto out; } else { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_concat_left; stats->bs_nblocks++; /* continue; */ } } else if (pindex < nilfs_btree_node_get_nchildren(parent) - 1) { /* right sibling */ sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1, ncmax); ret = nilfs_btree_get_block(btree, sibptr, &bh); if (ret < 0) goto err_out_curr_node; sib = (struct nilfs_btree_node *)bh->b_data; if (nilfs_btree_node_get_nchildren(sib) > ncmin) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_borrow_right; stats->bs_nblocks++; goto out; } else { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_concat_right; stats->bs_nblocks++; /* * When merging right sibling node * into the current node, pointer to * the right sibling node must be * terminated instead. The adjustment * below is required for that. */ dindex = pindex + 1; /* continue; */ } } else { /* no siblings */ /* the only child of the root node */ WARN_ON(level != nilfs_btree_height(btree) - 2); if (nilfs_btree_node_get_nchildren(node) - 1 <= NILFS_BTREE_ROOT_NCHILDREN_MAX) { path[level].bp_op = nilfs_btree_shrink; stats->bs_nblocks += 2; level++; path[level].bp_op = nilfs_btree_nop; goto shrink_root_child; } else { path[level].bp_op = nilfs_btree_do_delete; stats->bs_nblocks++; goto out; } } } /* child of the root node is deleted */ path[level].bp_op = nilfs_btree_do_delete; stats->bs_nblocks++; shrink_root_child: node = nilfs_btree_get_root(btree); path[level].bp_oldreq.bpr_ptr = nilfs_btree_node_get_ptr(node, dindex, NILFS_BTREE_ROOT_NCHILDREN_MAX); ret = nilfs_bmap_prepare_end_ptr(btree, &path[level].bp_oldreq, dat); if (ret < 0) goto err_out_child_node; /* success */ out: *levelp = level; return ret; /* error */ err_out_curr_node: nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat); err_out_child_node: for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) { brelse(path[level].bp_sib_bh); nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat); } *levelp = level; stats->bs_nblocks = 0; return ret; } static void nilfs_btree_commit_delete(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int maxlevel, struct inode *dat) { int level; for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { nilfs_bmap_commit_end_ptr(btree, &path[level].bp_oldreq, dat); path[level].bp_op(btree, path, level, NULL, NULL); } if (!nilfs_bmap_dirty(btree)) nilfs_bmap_set_dirty(btree); } static int nilfs_btree_delete(struct nilfs_bmap *btree, __u64 key) { struct nilfs_btree_path *path; struct nilfs_bmap_stats stats; struct inode *dat; int level, ret; path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; ret = nilfs_btree_do_lookup(btree, path, key, NULL, NILFS_BTREE_LEVEL_NODE_MIN, 0); if (ret < 0) goto out; dat = NILFS_BMAP_USE_VBN(btree) ? 
nilfs_bmap_get_dat(btree) : NULL; ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat); if (ret < 0) goto out; nilfs_btree_commit_delete(btree, path, level, dat); nilfs_inode_sub_blocks(btree->b_inode, stats.bs_nblocks); out: nilfs_btree_free_path(path); return ret; } static int nilfs_btree_seek_key(const struct nilfs_bmap *btree, __u64 start, __u64 *keyp) { struct nilfs_btree_path *path; const int minlevel = NILFS_BTREE_LEVEL_NODE_MIN; int ret; path = nilfs_btree_alloc_path(); if (!path) return -ENOMEM; ret = nilfs_btree_do_lookup(btree, path, start, NULL, minlevel, 0); if (!ret) *keyp = start; else if (ret == -ENOENT) ret = nilfs_btree_get_next_key(btree, path, minlevel, keyp); nilfs_btree_free_path(path); return ret; } static int nilfs_btree_last_key(const struct nilfs_bmap *btree, __u64 *keyp) { struct nilfs_btree_path *path; int ret; path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL); nilfs_btree_free_path(path); return ret; } static int nilfs_btree_check_delete(struct nilfs_bmap *btree, __u64 key) { struct buffer_head *bh; struct nilfs_btree_node *root, *node; __u64 maxkey, nextmaxkey; __u64 ptr; int nchildren, ret; root = nilfs_btree_get_root(btree); nchildren = nilfs_btree_node_get_nchildren(root); if (unlikely(nchildren == 0)) return 0; switch (nilfs_btree_height(btree)) { case 2: bh = NULL; node = root; break; case 3: if (nchildren > 1) return 0; ptr = nilfs_btree_node_get_ptr(root, nchildren - 1, NILFS_BTREE_ROOT_NCHILDREN_MAX); ret = nilfs_btree_get_block(btree, ptr, &bh); if (ret < 0) return ret; node = (struct nilfs_btree_node *)bh->b_data; nchildren = nilfs_btree_node_get_nchildren(node); break; default: return 0; } maxkey = nilfs_btree_node_get_key(node, nchildren - 1); nextmaxkey = (nchildren > 1) ? 
nilfs_btree_node_get_key(node, nchildren - 2) : 0; brelse(bh); return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW); } static int nilfs_btree_gather_data(struct nilfs_bmap *btree, __u64 *keys, __u64 *ptrs, int nitems) { struct buffer_head *bh; struct nilfs_btree_node *node, *root; __le64 *dkeys; __le64 *dptrs; __u64 ptr; int nchildren, ncmax, i, ret; root = nilfs_btree_get_root(btree); switch (nilfs_btree_height(btree)) { case 2: bh = NULL; node = root; ncmax = NILFS_BTREE_ROOT_NCHILDREN_MAX; break; case 3: nchildren = nilfs_btree_node_get_nchildren(root); WARN_ON(nchildren > 1); ptr = nilfs_btree_node_get_ptr(root, nchildren - 1, NILFS_BTREE_ROOT_NCHILDREN_MAX); ret = nilfs_btree_get_block(btree, ptr, &bh); if (ret < 0) return ret; node = (struct nilfs_btree_node *)bh->b_data; ncmax = nilfs_btree_nchildren_per_block(btree); break; default: node = NULL; return -EINVAL; } nchildren = nilfs_btree_node_get_nchildren(node); if (nchildren < nitems) nitems = nchildren; dkeys = nilfs_btree_node_dkeys(node); dptrs = nilfs_btree_node_dptrs(node, ncmax); for (i = 0; i < nitems; i++) { keys[i] = le64_to_cpu(dkeys[i]); ptrs[i] = le64_to_cpu(dptrs[i]); } brelse(bh); return nitems; } static int nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *btree, __u64 key, union nilfs_bmap_ptr_req *dreq, union nilfs_bmap_ptr_req *nreq, struct buffer_head **bhp, struct nilfs_bmap_stats *stats) { struct buffer_head *bh; struct inode *dat = NULL; int ret; stats->bs_nblocks = 0; /* for data */ /* cannot find near ptr */ if (NILFS_BMAP_USE_VBN(btree)) { dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key); dat = nilfs_bmap_get_dat(btree); } ret = nilfs_attach_btree_node_cache(&NILFS_BMAP_I(btree)->vfs_inode); if (ret < 0) return ret; ret = nilfs_bmap_prepare_alloc_ptr(btree, dreq, dat); if (ret < 0) return ret; *bhp = NULL; stats->bs_nblocks++; if (nreq != NULL) { nreq->bpr_ptr = dreq->bpr_ptr + 1; ret = nilfs_bmap_prepare_alloc_ptr(btree, nreq, dat); if (ret < 0) goto err_out_dreq; ret = nilfs_btree_get_new_block(btree, nreq->bpr_ptr, &bh); if (ret < 0) goto err_out_nreq; *bhp = bh; stats->bs_nblocks++; } /* success */ return 0; /* error */ err_out_nreq: nilfs_bmap_abort_alloc_ptr(btree, nreq, dat); err_out_dreq: nilfs_bmap_abort_alloc_ptr(btree, dreq, dat); stats->bs_nblocks = 0; return ret; } static void nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr, const __u64 *keys, const __u64 *ptrs, int n, union nilfs_bmap_ptr_req *dreq, union nilfs_bmap_ptr_req *nreq, struct buffer_head *bh) { struct nilfs_btree_node *node; struct inode *dat; __u64 tmpptr; int ncblk; /* free resources */ if (btree->b_ops->bop_clear != NULL) btree->b_ops->bop_clear(btree); /* ptr must be a pointer to a buffer head. */ set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); /* convert and insert */ dat = NILFS_BMAP_USE_VBN(btree) ? 
nilfs_bmap_get_dat(btree) : NULL; __nilfs_btree_init(btree); if (nreq != NULL) { nilfs_bmap_commit_alloc_ptr(btree, dreq, dat); nilfs_bmap_commit_alloc_ptr(btree, nreq, dat); /* create child node at level 1 */ node = (struct nilfs_btree_node *)bh->b_data; ncblk = nilfs_btree_nchildren_per_block(btree); nilfs_btree_node_init(node, 0, 1, n, ncblk, keys, ptrs); nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr, ncblk); if (!buffer_dirty(bh)) mark_buffer_dirty(bh); if (!nilfs_bmap_dirty(btree)) nilfs_bmap_set_dirty(btree); brelse(bh); /* create root node at level 2 */ node = nilfs_btree_get_root(btree); tmpptr = nreq->bpr_ptr; nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 2, 1, NILFS_BTREE_ROOT_NCHILDREN_MAX, &keys[0], &tmpptr); } else { nilfs_bmap_commit_alloc_ptr(btree, dreq, dat); /* create root node at level 1 */ node = nilfs_btree_get_root(btree); nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 1, n, NILFS_BTREE_ROOT_NCHILDREN_MAX, keys, ptrs); nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr, NILFS_BTREE_ROOT_NCHILDREN_MAX); if (!nilfs_bmap_dirty(btree)) nilfs_bmap_set_dirty(btree); } if (NILFS_BMAP_USE_VBN(btree)) nilfs_bmap_set_target_v(btree, key, dreq->bpr_ptr); } /** * nilfs_btree_convert_and_insert - Convert and insert entries into a B-tree * @btree: NILFS B-tree structure * @key: Key of the new entry to be inserted * @ptr: Pointer (block number) associated with the key to be inserted * @keys: Array of keys to be inserted in addition to @key * @ptrs: Array of pointers associated with @keys * @n: Number of keys and pointers in @keys and @ptrs * * This function is used to insert a new entry specified by @key and @ptr, * along with additional entries specified by @keys and @ptrs arrays, into a * NILFS B-tree. * It prepares the necessary changes by allocating the required blocks and any * necessary intermediate nodes. It converts configurations from other forms of * block mapping (the one that currently exists is direct mapping) to a B-tree. * * Return: 0 on success or a negative error code on failure. 
*/ int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr, const __u64 *keys, const __u64 *ptrs, int n) { struct buffer_head *bh = NULL; union nilfs_bmap_ptr_req dreq, nreq, *di, *ni; struct nilfs_bmap_stats stats; int ret; if (n + 1 <= NILFS_BTREE_ROOT_NCHILDREN_MAX) { di = &dreq; ni = NULL; } else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX( nilfs_btree_node_size(btree))) { di = &dreq; ni = &nreq; } else { di = NULL; ni = NULL; BUG(); } ret = nilfs_btree_prepare_convert_and_insert(btree, key, di, ni, &bh, &stats); if (ret < 0) return ret; nilfs_btree_commit_convert_and_insert(btree, key, ptr, keys, ptrs, n, di, ni, bh); nilfs_inode_add_blocks(btree->b_inode, stats.bs_nblocks); return 0; } static int nilfs_btree_propagate_p(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, struct buffer_head *bh) { while ((++level < nilfs_btree_height(btree) - 1) && !buffer_dirty(path[level].bp_bh)) mark_buffer_dirty(path[level].bp_bh); return 0; } static int nilfs_btree_prepare_update_v(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, struct inode *dat) { struct nilfs_btree_node *parent; int ncmax, ret; parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); path[level].bp_oldreq.bpr_ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index, ncmax); path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1; ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req, &path[level].bp_newreq.bpr_req); if (ret < 0) return ret; if (buffer_nilfs_node(path[level].bp_bh)) { path[level].bp_ctxt.oldkey = path[level].bp_oldreq.bpr_ptr; path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr; path[level].bp_ctxt.bh = path[level].bp_bh; ret = nilfs_btnode_prepare_change_key( NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping, &path[level].bp_ctxt); if (ret < 0) { nilfs_dat_abort_update(dat, &path[level].bp_oldreq.bpr_req, &path[level].bp_newreq.bpr_req); return ret; } } return 0; } static void nilfs_btree_commit_update_v(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, struct inode *dat) { struct nilfs_btree_node *parent; int ncmax; nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req, &path[level].bp_newreq.bpr_req, btree->b_ptr_type == NILFS_BMAP_PTR_VS); if (buffer_nilfs_node(path[level].bp_bh)) { nilfs_btnode_commit_change_key( NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping, &path[level].bp_ctxt); path[level].bp_bh = path[level].bp_ctxt.bh; } set_buffer_nilfs_volatile(path[level].bp_bh); parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index, path[level].bp_newreq.bpr_ptr, ncmax); } static void nilfs_btree_abort_update_v(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, struct inode *dat) { nilfs_dat_abort_update(dat, &path[level].bp_oldreq.bpr_req, &path[level].bp_newreq.bpr_req); if (buffer_nilfs_node(path[level].bp_bh)) nilfs_btnode_abort_change_key( NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping, &path[level].bp_ctxt); } static int nilfs_btree_prepare_propagate_v(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int minlevel, int *maxlevelp, struct inode *dat) { int level, ret; level = minlevel; if (!buffer_nilfs_volatile(path[level].bp_bh)) { ret = nilfs_btree_prepare_update_v(btree, path, level, dat); if (ret < 0) return ret; } while ((++level < nilfs_btree_height(btree) - 1) && !buffer_dirty(path[level].bp_bh)) { WARN_ON(buffer_nilfs_volatile(path[level].bp_bh)); ret = 
nilfs_btree_prepare_update_v(btree, path, level, dat); if (ret < 0) goto out; } /* success */ *maxlevelp = level - 1; return 0; /* error */ out: while (--level > minlevel) nilfs_btree_abort_update_v(btree, path, level, dat); if (!buffer_nilfs_volatile(path[level].bp_bh)) nilfs_btree_abort_update_v(btree, path, level, dat); return ret; } static void nilfs_btree_commit_propagate_v(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int minlevel, int maxlevel, struct buffer_head *bh, struct inode *dat) { int level; if (!buffer_nilfs_volatile(path[minlevel].bp_bh)) nilfs_btree_commit_update_v(btree, path, minlevel, dat); for (level = minlevel + 1; level <= maxlevel; level++) nilfs_btree_commit_update_v(btree, path, level, dat); } static int nilfs_btree_propagate_v(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, struct buffer_head *bh) { int maxlevel = 0, ret; struct nilfs_btree_node *parent; struct inode *dat = nilfs_bmap_get_dat(btree); __u64 ptr; int ncmax; get_bh(bh); path[level].bp_bh = bh; ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel, dat); if (ret < 0) goto out; if (buffer_nilfs_volatile(path[level].bp_bh)) { parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index, ncmax); ret = nilfs_dat_mark_dirty(dat, ptr); if (ret < 0) goto out; } nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh, dat); out: brelse(path[level].bp_bh); path[level].bp_bh = NULL; return ret; } static int nilfs_btree_propagate(struct nilfs_bmap *btree, struct buffer_head *bh) { struct nilfs_btree_path *path; struct nilfs_btree_node *node; __u64 key; int level, ret; WARN_ON(!buffer_dirty(bh)); path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; if (buffer_nilfs_node(bh)) { node = (struct nilfs_btree_node *)bh->b_data; key = nilfs_btree_node_get_key(node, 0); level = nilfs_btree_node_get_level(node); } else { key = nilfs_bmap_data_get_key(btree, bh); level = NILFS_BTREE_LEVEL_DATA; } ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0); if (ret < 0) { if (unlikely(ret == -ENOENT)) { nilfs_crit(btree->b_inode->i_sb, "writing node/leaf block does not appear in b-tree (ino=%llu) at key=%llu, level=%d", btree->b_inode->i_ino, (unsigned long long)key, level); ret = -EINVAL; } goto out; } ret = NILFS_BMAP_USE_VBN(btree) ? 
nilfs_btree_propagate_v(btree, path, level, bh) : nilfs_btree_propagate_p(btree, path, level, bh); out: nilfs_btree_free_path(path); return ret; } static int nilfs_btree_propagate_gc(struct nilfs_bmap *btree, struct buffer_head *bh) { return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(btree), bh->b_blocknr); } static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree, struct list_head *lists, struct buffer_head *bh) { struct list_head *head; struct buffer_head *cbh; struct nilfs_btree_node *node, *cnode; __u64 key, ckey; int level; get_bh(bh); node = (struct nilfs_btree_node *)bh->b_data; key = nilfs_btree_node_get_key(node, 0); level = nilfs_btree_node_get_level(node); if (level < NILFS_BTREE_LEVEL_NODE_MIN || level >= NILFS_BTREE_LEVEL_MAX) { dump_stack(); nilfs_warn(btree->b_inode->i_sb, "invalid btree level: %d (key=%llu, ino=%llu, blocknr=%llu)", level, (unsigned long long)key, btree->b_inode->i_ino, (unsigned long long)bh->b_blocknr); return; } list_for_each(head, &lists[level]) { cbh = list_entry(head, struct buffer_head, b_assoc_buffers); cnode = (struct nilfs_btree_node *)cbh->b_data; ckey = nilfs_btree_node_get_key(cnode, 0); if (key < ckey) break; } list_add_tail(&bh->b_assoc_buffers, head); } static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree, struct list_head *listp) { struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode; struct address_space *btcache = btnc_inode->i_mapping; struct list_head lists[NILFS_BTREE_LEVEL_MAX]; struct folio_batch fbatch; struct buffer_head *bh, *head; pgoff_t index = 0; int level, i; for (level = NILFS_BTREE_LEVEL_NODE_MIN; level < NILFS_BTREE_LEVEL_MAX; level++) INIT_LIST_HEAD(&lists[level]); folio_batch_init(&fbatch); while (filemap_get_folios_tag(btcache, &index, (pgoff_t)-1, PAGECACHE_TAG_DIRTY, &fbatch)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { bh = head = folio_buffers(fbatch.folios[i]); do { if (buffer_dirty(bh)) nilfs_btree_add_dirty_buffer(btree, lists, bh); } while ((bh = bh->b_this_page) != head); } folio_batch_release(&fbatch); cond_resched(); } for (level = NILFS_BTREE_LEVEL_NODE_MIN; level < NILFS_BTREE_LEVEL_MAX; level++) list_splice_tail(&lists[level], listp); } static int nilfs_btree_assign_p(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, struct buffer_head **bh, sector_t blocknr, union nilfs_binfo *binfo) { struct nilfs_btree_node *parent; __u64 key; __u64 ptr; int ncmax, ret; parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index, ncmax); if (buffer_nilfs_node(*bh)) { path[level].bp_ctxt.oldkey = ptr; path[level].bp_ctxt.newkey = blocknr; path[level].bp_ctxt.bh = *bh; ret = nilfs_btnode_prepare_change_key( NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping, &path[level].bp_ctxt); if (ret < 0) return ret; nilfs_btnode_commit_change_key( NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping, &path[level].bp_ctxt); *bh = path[level].bp_ctxt.bh; } nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index, blocknr, ncmax); key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); /* on-disk format */ binfo->bi_dat.bi_blkoff = cpu_to_le64(key); binfo->bi_dat.bi_level = level; memset(binfo->bi_dat.bi_pad, 0, sizeof(binfo->bi_dat.bi_pad)); return 0; } static int nilfs_btree_assign_v(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, struct buffer_head **bh, sector_t blocknr, union nilfs_binfo *binfo) { struct nilfs_btree_node *parent; struct inode *dat = nilfs_bmap_get_dat(btree); __u64 key; 
__u64 ptr; union nilfs_bmap_ptr_req req; int ncmax, ret; parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index, ncmax); req.bpr_ptr = ptr; ret = nilfs_dat_prepare_start(dat, &req.bpr_req); if (ret < 0) return ret; nilfs_dat_commit_start(dat, &req.bpr_req, blocknr); key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); /* on-disk format */ binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr); binfo->bi_v.bi_blkoff = cpu_to_le64(key); return 0; } static int nilfs_btree_assign(struct nilfs_bmap *btree, struct buffer_head **bh, sector_t blocknr, union nilfs_binfo *binfo) { struct nilfs_btree_path *path; struct nilfs_btree_node *node; __u64 key; int level, ret; path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; if (buffer_nilfs_node(*bh)) { node = (struct nilfs_btree_node *)(*bh)->b_data; key = nilfs_btree_node_get_key(node, 0); level = nilfs_btree_node_get_level(node); } else { key = nilfs_bmap_data_get_key(btree, *bh); level = NILFS_BTREE_LEVEL_DATA; } ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0); if (ret < 0) { WARN_ON(ret == -ENOENT); goto out; } ret = NILFS_BMAP_USE_VBN(btree) ? nilfs_btree_assign_v(btree, path, level, bh, blocknr, binfo) : nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); out: nilfs_btree_free_path(path); return ret; } static int nilfs_btree_assign_gc(struct nilfs_bmap *btree, struct buffer_head **bh, sector_t blocknr, union nilfs_binfo *binfo) { struct nilfs_btree_node *node; __u64 key; int ret; ret = nilfs_dat_move(nilfs_bmap_get_dat(btree), (*bh)->b_blocknr, blocknr); if (ret < 0) return ret; if (buffer_nilfs_node(*bh)) { node = (struct nilfs_btree_node *)(*bh)->b_data; key = nilfs_btree_node_get_key(node, 0); } else key = nilfs_bmap_data_get_key(btree, *bh); /* on-disk format */ binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr); binfo->bi_v.bi_blkoff = cpu_to_le64(key); return 0; } static int nilfs_btree_mark(struct nilfs_bmap *btree, __u64 key, int level) { struct buffer_head *bh; struct nilfs_btree_path *path; __u64 ptr; int ret; path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1, 0); if (ret < 0) { WARN_ON(ret == -ENOENT); goto out; } ret = nilfs_btree_get_block(btree, ptr, &bh); if (ret < 0) { WARN_ON(ret == -ENOENT); goto out; } if (!buffer_dirty(bh)) mark_buffer_dirty(bh); brelse(bh); if (!nilfs_bmap_dirty(btree)) nilfs_bmap_set_dirty(btree); out: nilfs_btree_free_path(path); return ret; } static const struct nilfs_bmap_operations nilfs_btree_ops = { .bop_lookup = nilfs_btree_lookup, .bop_lookup_contig = nilfs_btree_lookup_contig, .bop_insert = nilfs_btree_insert, .bop_delete = nilfs_btree_delete, .bop_clear = NULL, .bop_propagate = nilfs_btree_propagate, .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers, .bop_assign = nilfs_btree_assign, .bop_mark = nilfs_btree_mark, .bop_seek_key = nilfs_btree_seek_key, .bop_last_key = nilfs_btree_last_key, .bop_check_insert = NULL, .bop_check_delete = nilfs_btree_check_delete, .bop_gather_data = nilfs_btree_gather_data, }; static const struct nilfs_bmap_operations nilfs_btree_ops_gc = { .bop_lookup = NULL, .bop_lookup_contig = NULL, .bop_insert = NULL, .bop_delete = NULL, .bop_clear = NULL, .bop_propagate = nilfs_btree_propagate_gc, .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers, .bop_assign = nilfs_btree_assign_gc, .bop_mark = NULL, .bop_seek_key = NULL, .bop_last_key = NULL, .bop_check_insert = 
NULL, .bop_check_delete = NULL, .bop_gather_data = NULL, }; static void __nilfs_btree_init(struct nilfs_bmap *bmap) { bmap->b_ops = &nilfs_btree_ops; bmap->b_nchildren_per_block = NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap)); } int nilfs_btree_init(struct nilfs_bmap *bmap) { int ret = 0; __nilfs_btree_init(bmap); if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap), bmap->b_inode)) ret = -EIO; else ret = nilfs_attach_btree_node_cache( &NILFS_BMAP_I(bmap)->vfs_inode); return ret; } void nilfs_btree_init_gc(struct nilfs_bmap *bmap) { bmap->b_ops = &nilfs_btree_ops_gc; bmap->b_nchildren_per_block = NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap)); } |
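The two operation tables above (nilfs_btree_ops and nilfs_btree_ops_gc) are what the bmap layer dispatches through once nilfs_btree_init() or nilfs_btree_init_gc() has installed them in b_ops. As a rough, hypothetical sketch of that dispatch (the caller name is invented; only the ops table and NILFS_BTREE_LEVEL_NODE_MIN come from the code above):

/* Hypothetical caller: resolve one key through the installed operations. */
static int example_btree_lookup(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp)
{
	/*
	 * After nilfs_btree_init(), bmap->b_ops points at nilfs_btree_ops,
	 * so this lands in nilfs_btree_lookup() with the lowest node level,
	 * and *ptrp receives the pointer stored for @key at the leaf.
	 */
	return bmap->b_ops->bop_lookup(bmap, key, NILFS_BTREE_LEVEL_NODE_MIN, ptrp);
}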
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM notifier

#if !defined(_TRACE_NOTIFIERS_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_NOTIFIERS_H

#include <linux/tracepoint.h>

DECLARE_EVENT_CLASS(notifier_info,

	TP_PROTO(void *cb),

	TP_ARGS(cb),

	TP_STRUCT__entry(
		__field(void *, cb)
	),

	TP_fast_assign(
		__entry->cb = cb;
	),

	TP_printk("%ps", __entry->cb)
);

/*
 * notifier_register - called upon notifier callback registration
 *
 * @cb: callback pointer
 *
 */
DEFINE_EVENT(notifier_info, notifier_register,

	TP_PROTO(void *cb),

	TP_ARGS(cb)
);

/*
 * notifier_unregister - called upon notifier callback unregistration
 *
 * @cb: callback pointer
 *
 */
DEFINE_EVENT(notifier_info, notifier_unregister,

	TP_PROTO(void *cb),

	TP_ARGS(cb)
);

/*
 * notifier_run - called upon notifier callback execution
 *
 * @cb: callback pointer
 *
 */
DEFINE_EVENT(notifier_info, notifier_run,

	TP_PROTO(void *cb),

	TP_ARGS(cb)
);

#endif /* _TRACE_NOTIFIERS_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
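Each DEFINE_EVENT() instance above expands into a trace_<event>() helper (trace_notifier_register(), trace_notifier_unregister(), trace_notifier_run()). As a rough illustration of how the notifier core might emit one of these events, here is a minimal, hypothetical sketch; it is not part of the header, and the include path and wrapper name are assumptions:

#include <linux/notifier.h>
#include <trace/events/notifier.h>	/* assumed install location of the header above */

/* Hypothetical helper: trace and then invoke a single notifier callback. */
static int example_run_notifier(struct notifier_block *nb, unsigned long val, void *v)
{
	trace_notifier_run((void *)nb->notifier_call);
	return nb->notifier_call(nb, val, v);
}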
// SPDX-License-Identifier: GPL-2.0
/*
 * Block rq-qos policy for assigning an I/O priority class to requests.
 *
 * Using an rq-qos policy for assigning I/O priority class has two advantages
 * over using the ioprio_set() system call:
 *
 * - This policy is cgroup based so it has all the advantages of cgroups.
 * - While ioprio_set() does not affect page cache writeback I/O, this rq-qos
 *   controller affects page cache writeback I/O for filesystems that support
 *   associating a cgroup with writeback I/O. See also
 *   Documentation/admin-guide/cgroup-v2.rst.
 */

#include <linux/blk-mq.h>
#include <linux/blk_types.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include "blk-cgroup.h"
#include "blk-ioprio.h"
#include "blk-rq-qos.h"

/**
 * enum prio_policy - I/O priority class policy.
 * @POLICY_NO_CHANGE: (default) do not modify the I/O priority class.
 * @POLICY_PROMOTE_TO_RT: modify no-IOPRIO_CLASS_RT to IOPRIO_CLASS_RT.
 * @POLICY_RESTRICT_TO_BE: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_RT into
 *		IOPRIO_CLASS_BE.
 * @POLICY_ALL_TO_IDLE: change the I/O priority class into IOPRIO_CLASS_IDLE.
 * @POLICY_NONE_TO_RT: an alias for POLICY_PROMOTE_TO_RT.
 *
 * See also <linux/ioprio.h>.
 */
enum prio_policy {
	POLICY_NO_CHANGE	= 0,
	POLICY_PROMOTE_TO_RT	= 1,
	POLICY_RESTRICT_TO_BE	= 2,
	POLICY_ALL_TO_IDLE	= 3,
	POLICY_NONE_TO_RT	= 4,
};

static const char *policy_name[] = {
	[POLICY_NO_CHANGE]	= "no-change",
	[POLICY_PROMOTE_TO_RT]	= "promote-to-rt",
	[POLICY_RESTRICT_TO_BE]	= "restrict-to-be",
	[POLICY_ALL_TO_IDLE]	= "idle",
	[POLICY_NONE_TO_RT]	= "none-to-rt",
};

static struct blkcg_policy ioprio_policy;

/**
 * struct ioprio_blkcg - Per cgroup data.
 * @cpd: blkcg_policy_data structure.
 * @prio_policy: One of the IOPRIO_CLASS_* values. See also <linux/ioprio.h>.
 */
struct ioprio_blkcg {
	struct blkcg_policy_data cpd;
	enum prio_policy	 prio_policy;
};

static struct ioprio_blkcg *blkcg_to_ioprio_blkcg(struct blkcg *blkcg)
{
	return container_of(blkcg_to_cpd(blkcg, &ioprio_policy),
			    struct ioprio_blkcg, cpd);
}

static struct ioprio_blkcg *
ioprio_blkcg_from_css(struct cgroup_subsys_state *css)
{
	return blkcg_to_ioprio_blkcg(css_to_blkcg(css));
}

static int ioprio_show_prio_policy(struct seq_file *sf, void *v)
{
	struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(seq_css(sf));

	seq_printf(sf, "%s\n", policy_name[blkcg->prio_policy]);
	return 0;
}

static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf,
				      size_t nbytes, loff_t off)
{
	struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(of_css(of));
	int ret;

	if (off != 0)
		return -EIO;
	/* kernfs_fop_write_iter() terminates 'buf' with '\0'. */
	ret = sysfs_match_string(policy_name, buf);
	if (ret < 0)
		return ret;
	blkcg->prio_policy = ret;
	return nbytes;
}

static struct blkcg_policy_data *ioprio_alloc_cpd(gfp_t gfp)
{
	struct ioprio_blkcg *blkcg;

	blkcg = kzalloc_obj(*blkcg, gfp);
	if (!blkcg)
		return NULL;
	blkcg->prio_policy = POLICY_NO_CHANGE;
	return &blkcg->cpd;
}

static void ioprio_free_cpd(struct blkcg_policy_data *cpd)
{
	struct ioprio_blkcg *blkcg = container_of(cpd, typeof(*blkcg), cpd);

	kfree(blkcg);
}

static struct cftype ioprio_files[] = {
	{
		.name		= "prio.class",
		.seq_show	= ioprio_show_prio_policy,
		.write		= ioprio_set_prio_policy,
	},
	{ } /* sentinel */
};

static struct blkcg_policy ioprio_policy = {
	.dfl_cftypes	= ioprio_files,
	.legacy_cftypes	= ioprio_files,
	.cpd_alloc_fn	= ioprio_alloc_cpd,
	.cpd_free_fn	= ioprio_free_cpd,
};

void blkcg_set_ioprio(struct bio *bio)
{
	struct ioprio_blkcg *blkcg = blkcg_to_ioprio_blkcg(bio->bi_blkg->blkcg);
	u16 prio;

	if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE)
		return;

	if (blkcg->prio_policy == POLICY_PROMOTE_TO_RT ||
	    blkcg->prio_policy == POLICY_NONE_TO_RT) {
		/*
		 * For RT threads, the default priority level is 4 because
		 * task_nice is 0. By promoting non-RT io-priority to RT-class
		 * and default level 4, those requests that are already
		 * RT-class but need a higher io-priority can use ioprio_set()
		 * to achieve this.
		 */
		if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) != IOPRIO_CLASS_RT)
			bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 4);
		return;
	}

	/*
	 * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers
	 * correspond to a lower priority. Hence, the max_t() below selects
	 * the lower priority of bi_ioprio and the cgroup I/O priority class.
	 * If the bio I/O priority equals IOPRIO_CLASS_NONE, the cgroup I/O
	 * priority is assigned to the bio.
	 */
	prio = max_t(u16, bio->bi_ioprio,
		     IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0));
	if (prio > bio->bi_ioprio)
		bio->bi_ioprio = prio;
}

static int __init ioprio_init(void)
{
	return blkcg_policy_register(&ioprio_policy);
}

static void __exit ioprio_exit(void)
{
	blkcg_policy_unregister(&ioprio_policy);
}

module_init(ioprio_init);
module_exit(ioprio_exit);
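The "prio.class" entry in ioprio_files is exposed per cgroup, and ioprio_set_prio_policy() matches the written string against policy_name[]. A minimal userspace-side sketch, assuming a cgroup v2 hierarchy mounted at /sys/fs/cgroup, an existing group named "grp", and the per-cgroup file appearing as io.prio.class (the path and group name are assumptions, not taken from this file), would select the promote-to-rt policy like this:

/* Hypothetical usage sketch: pick the "promote-to-rt" policy for one cgroup. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *policy = "promote-to-rt";	/* one of the policy_name[] strings */
	/* Assumed path: cgroup v2 mounted at /sys/fs/cgroup, group "grp". */
	int fd = open("/sys/fs/cgroup/grp/io.prio.class", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, policy, strlen(policy)) < 0) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}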
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Linux Socket Filter - Kernel level socket filtering
 *
 * Based on the design of the Berkeley Packet Filter. The new
 * internal format has been designed by PLUMgrid:
 *
 * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
 *
 * Authors:
 *
 * Jay Schulist <jschlst@samba.org>
 * Alexei Starovoitov <ast@plumgrid.com>
 * Daniel Borkmann <dborkman@redhat.com>
 *
 * Andi Kleen - Fix a few bad bugs and races.
 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
 */

#include <uapi/linux/btf.h>
#include <linux/filter.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/prandom.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/hex.h>
#include <linux/objtool.h>
#include <linux/overflow.h>
#include <linux/rbtree_latch.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/perf_event.h>
#include <linux/extable.h>
#include <linux/log2.h>
#include <linux/bpf_verifier.h>
#include <linux/nodemask.h>
#include <linux/nospec.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/memcontrol.h>
#include <linux/execmem.h>

#include <crypto/sha2.h>

#include <asm/barrier.h>
#include <linux/unaligned.h>

/* Registers */
#define BPF_R0	regs[BPF_REG_0]
#define BPF_R1	regs[BPF_REG_1]
#define BPF_R2	regs[BPF_REG_2]
#define BPF_R3	regs[BPF_REG_3]
#define BPF_R4	regs[BPF_REG_4]
#define BPF_R5	regs[BPF_REG_5]
#define BPF_R6	regs[BPF_REG_6]
#define BPF_R7	regs[BPF_REG_7]
#define BPF_R8	regs[BPF_REG_8]
#define BPF_R9	regs[BPF_REG_9]
#define BPF_R10	regs[BPF_REG_10]

/* Named registers */
#define DST	regs[insn->dst_reg]
#define SRC	regs[insn->src_reg]
#define FP	regs[BPF_REG_FP]
#define AX	regs[BPF_REG_AX]
#define ARG1	regs[BPF_REG_ARG1]
#define CTX	regs[BPF_REG_CTX]
#define OFF	insn->off
#define IMM	insn->imm

struct bpf_mem_alloc bpf_global_ma;
bool bpf_global_ma_set;

/* No hurry in this branch
 *
 * Exported for the bpf jit load helper.
 */
void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
{
	u8 *ptr = NULL;

	if (k >= SKF_NET_OFF) {
		ptr = skb_network_header(skb) + k - SKF_NET_OFF;
	} else if (k >= SKF_LL_OFF) {
		if (unlikely(!skb_mac_header_was_set(skb)))
			return NULL;
		ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
	}
	if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
		return ptr;

	return NULL;
}

/* tell bpf programs that include vmlinux.h kernel's PAGE_SIZE */
enum page_size_enum {
	__PAGE_SIZE = PAGE_SIZE
};

struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
{
	gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
	struct bpf_prog_aux *aux;
	struct bpf_prog *fp;

	size = round_up(size, __PAGE_SIZE);
	fp = __vmalloc(size, gfp_flags);
	if (fp == NULL)
		return NULL;

	aux = kzalloc_obj(*aux, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
	if (aux == NULL) {
		vfree(fp);
		return NULL;
	}
	fp->active = __alloc_percpu_gfp(sizeof(u8[BPF_NR_CONTEXTS]), 4,
					bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
	if (!fp->active) {
		vfree(fp);
		kfree(aux);
		return NULL;
	}

	fp->pages = size / PAGE_SIZE;
	fp->aux = aux;
	fp->aux->main_prog_aux = aux;
	fp->aux->prog = fp;
	fp->jit_requested = ebpf_jit_enabled();
	fp->blinding_requested = bpf_jit_blinding_enabled(fp);
#ifdef CONFIG_CGROUP_BPF
	aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID;
#endif

	INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode);
#ifdef CONFIG_FINEIBT
	INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode);
#endif
	mutex_init(&fp->aux->used_maps_mutex);
	mutex_init(&fp->aux->ext_mutex);
	mutex_init(&fp->aux->dst_mutex);
	mutex_init(&fp->aux->st_ops_assoc_mutex);
#ifdef CONFIG_BPF_SYSCALL
	bpf_prog_stream_init(fp);
#endif

	return fp;
}

struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
{
	gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
	struct bpf_prog *prog;
	int cpu;

	prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags);
	if (!prog)
		return NULL;

	prog->stats = alloc_percpu_gfp(struct bpf_prog_stats,
gfp_flags); if (!prog->stats) { free_percpu(prog->active); kfree(prog->aux); vfree(prog); return NULL; } for_each_possible_cpu(cpu) { struct bpf_prog_stats *pstats; pstats = per_cpu_ptr(prog->stats, cpu); u64_stats_init(&pstats->syncp); } return prog; } EXPORT_SYMBOL_GPL(bpf_prog_alloc); int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog) { if (!prog->aux->nr_linfo || !prog->jit_requested) return 0; prog->aux->jited_linfo = kvzalloc_objs(*prog->aux->jited_linfo, prog->aux->nr_linfo, bpf_memcg_flags(GFP_KERNEL | __GFP_NOWARN)); if (!prog->aux->jited_linfo) return -ENOMEM; return 0; } void bpf_prog_jit_attempt_done(struct bpf_prog *prog) { if (prog->aux->jited_linfo && (!prog->jited || !prog->aux->jited_linfo[0])) { kvfree(prog->aux->jited_linfo); prog->aux->jited_linfo = NULL; } kfree(prog->aux->kfunc_tab); prog->aux->kfunc_tab = NULL; } /* The jit engine is responsible to provide an array * for insn_off to the jited_off mapping (insn_to_jit_off). * * The idx to this array is the insn_off. Hence, the insn_off * here is relative to the prog itself instead of the main prog. * This array has one entry for each xlated bpf insn. * * jited_off is the byte off to the end of the jited insn. * * Hence, with * insn_start: * The first bpf insn off of the prog. The insn off * here is relative to the main prog. * e.g. if prog is a subprog, insn_start > 0 * linfo_idx: * The prog's idx to prog->aux->linfo and jited_linfo * * jited_linfo[linfo_idx] = prog->bpf_func * * For i > linfo_idx, * * jited_linfo[i] = prog->bpf_func + * insn_to_jit_off[linfo[i].insn_off - insn_start - 1] */ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, const u32 *insn_to_jit_off) { u32 linfo_idx, insn_start, insn_end, nr_linfo, i; const struct bpf_line_info *linfo; void **jited_linfo; if (!prog->aux->jited_linfo || prog->aux->func_idx > prog->aux->func_cnt) /* Userspace did not provide linfo */ return; linfo_idx = prog->aux->linfo_idx; linfo = &prog->aux->linfo[linfo_idx]; insn_start = linfo[0].insn_off; insn_end = insn_start + prog->len; jited_linfo = &prog->aux->jited_linfo[linfo_idx]; jited_linfo[0] = prog->bpf_func; nr_linfo = prog->aux->nr_linfo - linfo_idx; for (i = 1; i < nr_linfo && linfo[i].insn_off < insn_end; i++) /* The verifier ensures that linfo[i].insn_off is * strictly increasing */ jited_linfo[i] = prog->bpf_func + insn_to_jit_off[linfo[i].insn_off - insn_start - 1]; } struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags); struct bpf_prog *fp; u32 pages; size = round_up(size, PAGE_SIZE); pages = size / PAGE_SIZE; if (pages <= fp_old->pages) return fp_old; fp = __vmalloc(size, gfp_flags); if (fp) { memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = pages; fp->aux->prog = fp; /* We keep fp->aux from fp_old around in the new * reallocated structure. 
*/ fp_old->aux = NULL; fp_old->stats = NULL; fp_old->active = NULL; __bpf_prog_free(fp_old); } return fp; } void __bpf_prog_free(struct bpf_prog *fp) { if (fp->aux) { mutex_destroy(&fp->aux->used_maps_mutex); mutex_destroy(&fp->aux->dst_mutex); mutex_destroy(&fp->aux->st_ops_assoc_mutex); kfree(fp->aux->poke_tab); kfree(fp->aux); } free_percpu(fp->stats); free_percpu(fp->active); vfree(fp); } int bpf_prog_calc_tag(struct bpf_prog *fp) { size_t size = bpf_prog_insn_size(fp); struct bpf_insn *dst; bool was_ld_map; u32 i; dst = vmalloc(size); if (!dst) return -ENOMEM; /* We need to take out the map fd for the digest calculation * since they are unstable from user space side. */ for (i = 0, was_ld_map = false; i < fp->len; i++) { dst[i] = fp->insnsi[i]; if (!was_ld_map && dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) && (dst[i].src_reg == BPF_PSEUDO_MAP_FD || dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) { was_ld_map = true; dst[i].imm = 0; } else if (was_ld_map && dst[i].code == 0 && dst[i].dst_reg == 0 && dst[i].src_reg == 0 && dst[i].off == 0) { was_ld_map = false; dst[i].imm = 0; } else { was_ld_map = false; } } sha256((u8 *)dst, size, fp->digest); vfree(dst); return 0; } static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, s32 end_new, s32 curr, const bool probe_pass) { const s64 imm_min = S32_MIN, imm_max = S32_MAX; s32 delta = end_new - end_old; s64 imm = insn->imm; if (curr < pos && curr + imm + 1 >= end_old) imm += delta; else if (curr >= end_new && curr + imm + 1 < end_new) imm -= delta; if (imm < imm_min || imm > imm_max) return -ERANGE; if (!probe_pass) insn->imm = imm; return 0; } static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, s32 end_new, s32 curr, const bool probe_pass) { s64 off_min, off_max, off; s32 delta = end_new - end_old; if (insn->code == (BPF_JMP32 | BPF_JA)) { off = insn->imm; off_min = S32_MIN; off_max = S32_MAX; } else { off = insn->off; off_min = S16_MIN; off_max = S16_MAX; } if (curr < pos && curr + off + 1 >= end_old) off += delta; else if (curr >= end_new && curr + off + 1 < end_new) off -= delta; if (off < off_min || off > off_max) return -ERANGE; if (!probe_pass) { if (insn->code == (BPF_JMP32 | BPF_JA)) insn->imm = off; else insn->off = off; } return 0; } static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old, s32 end_new, const bool probe_pass) { u32 i, insn_cnt = prog->len + (probe_pass ? end_new - end_old : 0); struct bpf_insn *insn = prog->insnsi; int ret = 0; for (i = 0; i < insn_cnt; i++, insn++) { u8 code; /* In the probing pass we still operate on the original, * unpatched image in order to check overflows before we * do any other adjustments. Therefore skip the patchlet. */ if (probe_pass && i == pos) { i = end_new; insn = prog->insnsi + end_old; } if (bpf_pseudo_func(insn)) { ret = bpf_adj_delta_to_imm(insn, pos, end_old, end_new, i, probe_pass); if (ret) return ret; continue; } code = insn->code; if ((BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) || BPF_OP(code) == BPF_EXIT) continue; /* Adjust offset of jmps if we cross patch boundaries. 
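		 *
		 * A rough illustration (hypothetical indices, not from the
		 * original comment): if the insn at index 5 is replaced by a
		 * 3-insn patchlet, then delta is 2 and end_old/end_new are
		 * 6/8. A jump at index 2 with off = +4 (old target 7) matches
		 * the "curr < pos && curr + off + 1 >= end_old" case above,
		 * so its offset becomes +6 and it keeps pointing at the same
		 * insn, which now lives at index 9.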
*/ if (BPF_OP(code) == BPF_CALL) { if (insn->src_reg != BPF_PSEUDO_CALL) continue; ret = bpf_adj_delta_to_imm(insn, pos, end_old, end_new, i, probe_pass); } else { ret = bpf_adj_delta_to_off(insn, pos, end_old, end_new, i, probe_pass); } if (ret) break; } return ret; } static void bpf_adj_linfo(struct bpf_prog *prog, u32 off, u32 delta) { struct bpf_line_info *linfo; u32 i, nr_linfo; nr_linfo = prog->aux->nr_linfo; if (!nr_linfo || !delta) return; linfo = prog->aux->linfo; for (i = 0; i < nr_linfo; i++) if (off < linfo[i].insn_off) break; /* Push all off < linfo[i].insn_off by delta */ for (; i < nr_linfo; i++) linfo[i].insn_off += delta; } struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len) { u32 insn_adj_cnt, insn_rest, insn_delta = len - 1; const u32 cnt_max = S16_MAX; struct bpf_prog *prog_adj; int err; /* Since our patchlet doesn't expand the image, we're done. */ if (insn_delta == 0) { memcpy(prog->insnsi + off, patch, sizeof(*patch)); return prog; } insn_adj_cnt = prog->len + insn_delta; /* Reject anything that would potentially let the insn->off * target overflow when we have excessive program expansions. * We need to probe here before we do any reallocation where * we afterwards may not fail anymore. */ if (insn_adj_cnt > cnt_max && (err = bpf_adj_branches(prog, off, off + 1, off + len, true))) return ERR_PTR(err); /* Several new instructions need to be inserted. Make room * for them. Likely, there's no need for a new allocation as * last page could have large enough tailroom. */ prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt), GFP_USER); if (!prog_adj) return ERR_PTR(-ENOMEM); prog_adj->len = insn_adj_cnt; /* Patching happens in 3 steps: * * 1) Move over tail of insnsi from next instruction onwards, * so we can patch the single target insn with one or more * new ones (patching is always from 1 to n insns, n > 0). * 2) Inject new instructions at the target location. * 3) Adjust branch offsets if necessary. */ insn_rest = insn_adj_cnt - off - len; memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1, sizeof(*patch) * insn_rest); memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len); /* We are guaranteed to not fail at this point, otherwise * the ship has sailed to reverse to the original state. An * overflow cannot happen at this point. */ BUG_ON(bpf_adj_branches(prog_adj, off, off + 1, off + len, false)); bpf_adj_linfo(prog_adj, off, insn_delta); return prog_adj; } int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt) { int err; /* Branch offsets can't overflow when program is shrinking, no need * to call bpf_adj_branches(..., true) here */ memmove(prog->insnsi + off, prog->insnsi + off + cnt, sizeof(struct bpf_insn) * (prog->len - off - cnt)); prog->len -= cnt; err = bpf_adj_branches(prog, off, off + cnt, off, false); WARN_ON_ONCE(err); return err; } static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) { int i; for (i = 0; i < fp->aux->real_func_cnt; i++) bpf_prog_kallsyms_del(fp->aux->func[i]); } void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) { bpf_prog_kallsyms_del_subprogs(fp); bpf_prog_kallsyms_del(fp); } #ifdef CONFIG_BPF_JIT /* All BPF JIT sysctl knobs here. 
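 *
 * These typically surface to userspace as the net.core.bpf_jit_enable,
 * net.core.bpf_jit_harden, net.core.bpf_jit_kallsyms and
 * net.core.bpf_jit_limit sysctls; see
 * Documentation/admin-guide/sysctl/net.rst for the authoritative list
 * and semantics.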
*/ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_harden __read_mostly; long bpf_jit_limit __read_mostly; long bpf_jit_limit_max __read_mostly; static void bpf_prog_ksym_set_addr(struct bpf_prog *prog) { WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog)); prog->aux->ksym.start = (unsigned long) prog->bpf_func; prog->aux->ksym.end = prog->aux->ksym.start + prog->jited_len; } static void bpf_prog_ksym_set_name(struct bpf_prog *prog) { char *sym = prog->aux->ksym.name; const char *end = sym + KSYM_NAME_LEN; const struct btf_type *type; const char *func_name; BUILD_BUG_ON(sizeof("bpf_prog_") + sizeof(prog->tag) * 2 + /* name has been null terminated. * We should need +1 for the '_' preceding * the name. However, the null character * is double counted between the name and the * sizeof("bpf_prog_") above, so we omit * the +1 here. */ sizeof(prog->aux->name) > KSYM_NAME_LEN); sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_"); sym = bin2hex(sym, prog->tag, sizeof(prog->tag)); /* prog->aux->name will be ignored if full btf name is available */ if (prog->aux->func_info_cnt && prog->aux->func_idx < prog->aux->func_info_cnt) { type = btf_type_by_id(prog->aux->btf, prog->aux->func_info[prog->aux->func_idx].type_id); func_name = btf_name_by_offset(prog->aux->btf, type->name_off); snprintf(sym, (size_t)(end - sym), "_%s", func_name); return; } if (prog->aux->name[0]) snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name); else *sym = 0; } static unsigned long bpf_get_ksym_start(struct latch_tree_node *n) { return container_of(n, struct bpf_ksym, tnode)->start; } static __always_inline bool bpf_tree_less(struct latch_tree_node *a, struct latch_tree_node *b) { return bpf_get_ksym_start(a) < bpf_get_ksym_start(b); } static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n) { unsigned long val = (unsigned long)key; const struct bpf_ksym *ksym; ksym = container_of(n, struct bpf_ksym, tnode); if (val < ksym->start) return -1; /* Ensure that we detect return addresses as part of the program, when * the final instruction is a call for a program part of the stack * trace. Therefore, do val > ksym->end instead of val >= ksym->end. 
*/ if (val > ksym->end) return 1; return 0; } static const struct latch_tree_ops bpf_tree_ops = { .less = bpf_tree_less, .comp = bpf_tree_comp, }; static DEFINE_SPINLOCK(bpf_lock); static LIST_HEAD(bpf_kallsyms); static struct latch_tree_root bpf_tree __cacheline_aligned; void bpf_ksym_add(struct bpf_ksym *ksym) { spin_lock_bh(&bpf_lock); WARN_ON_ONCE(!list_empty(&ksym->lnode)); list_add_tail_rcu(&ksym->lnode, &bpf_kallsyms); latch_tree_insert(&ksym->tnode, &bpf_tree, &bpf_tree_ops); spin_unlock_bh(&bpf_lock); } static void __bpf_ksym_del(struct bpf_ksym *ksym) { if (list_empty(&ksym->lnode)) return; latch_tree_erase(&ksym->tnode, &bpf_tree, &bpf_tree_ops); list_del_rcu(&ksym->lnode); } void bpf_ksym_del(struct bpf_ksym *ksym) { spin_lock_bh(&bpf_lock); __bpf_ksym_del(ksym); spin_unlock_bh(&bpf_lock); } static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) { return fp->jited && !bpf_prog_was_classic(fp); } void bpf_prog_kallsyms_add(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp) || !bpf_token_capable(fp->aux->token, CAP_BPF)) return; bpf_prog_ksym_set_addr(fp); bpf_prog_ksym_set_name(fp); fp->aux->ksym.prog = true; bpf_ksym_add(&fp->aux->ksym); #ifdef CONFIG_FINEIBT /* * When FineIBT, code in the __cfi_foo() symbols can get executed * and hence unwinder needs help. */ if (cfi_mode != CFI_FINEIBT) return; snprintf(fp->aux->ksym_prefix.name, KSYM_NAME_LEN, "__cfi_%s", fp->aux->ksym.name); fp->aux->ksym_prefix.start = (unsigned long) fp->bpf_func - 16; fp->aux->ksym_prefix.end = (unsigned long) fp->bpf_func; bpf_ksym_add(&fp->aux->ksym_prefix); #endif } void bpf_prog_kallsyms_del(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp)) return; bpf_ksym_del(&fp->aux->ksym); #ifdef CONFIG_FINEIBT if (cfi_mode != CFI_FINEIBT) return; bpf_ksym_del(&fp->aux->ksym_prefix); #endif } static struct bpf_ksym *bpf_ksym_find(unsigned long addr) { struct latch_tree_node *n; n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops); return n ? container_of(n, struct bpf_ksym, tnode) : NULL; } int bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym) { struct bpf_ksym *ksym; int ret = 0; rcu_read_lock(); ksym = bpf_ksym_find(addr); if (ksym) { unsigned long symbol_start = ksym->start; unsigned long symbol_end = ksym->end; ret = strscpy(sym, ksym->name, KSYM_NAME_LEN); if (size) *size = symbol_end - symbol_start; if (off) *off = addr - symbol_start; } rcu_read_unlock(); return ret; } bool is_bpf_text_address(unsigned long addr) { bool ret; rcu_read_lock(); ret = bpf_ksym_find(addr) != NULL; rcu_read_unlock(); return ret; } struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) { struct bpf_ksym *ksym; WARN_ON_ONCE(!rcu_read_lock_held()); ksym = bpf_ksym_find(addr); return ksym && ksym->prog ? 
container_of(ksym, struct bpf_prog_aux, ksym)->prog : NULL; } bool bpf_has_frame_pointer(unsigned long ip) { struct bpf_ksym *ksym; unsigned long offset; guard(rcu)(); ksym = bpf_ksym_find(ip); if (!ksym || !ksym->fp_start || !ksym->fp_end) return false; offset = ip - ksym->start; return offset >= ksym->fp_start && offset < ksym->fp_end; } const struct exception_table_entry *search_bpf_extables(unsigned long addr) { const struct exception_table_entry *e = NULL; struct bpf_prog *prog; rcu_read_lock(); prog = bpf_prog_ksym_find(addr); if (!prog) goto out; if (!prog->aux->num_exentries) goto out; e = search_extable(prog->aux->extable, prog->aux->num_exentries, addr); out: rcu_read_unlock(); return e; } int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym) { struct bpf_ksym *ksym; unsigned int it = 0; int ret = -ERANGE; if (!bpf_jit_kallsyms_enabled()) return ret; rcu_read_lock(); list_for_each_entry_rcu(ksym, &bpf_kallsyms, lnode) { if (it++ != symnum) continue; strscpy(sym, ksym->name, KSYM_NAME_LEN); *value = ksym->start; *type = BPF_SYM_ELF_TYPE; ret = 0; break; } rcu_read_unlock(); return ret; } int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, struct bpf_jit_poke_descriptor *poke) { struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab; static const u32 poke_tab_max = 1024; u32 slot = prog->aux->size_poke_tab; u32 size = slot + 1; if (size > poke_tab_max) return -ENOSPC; if (poke->tailcall_target || poke->tailcall_target_stable || poke->tailcall_bypass || poke->adj_off || poke->bypass_addr) return -EINVAL; switch (poke->reason) { case BPF_POKE_REASON_TAIL_CALL: if (!poke->tail_call.map) return -EINVAL; break; default: return -EINVAL; } tab = krealloc_array(tab, size, sizeof(*poke), GFP_KERNEL); if (!tab) return -ENOMEM; memcpy(&tab[slot], poke, sizeof(*poke)); prog->aux->size_poke_tab = size; prog->aux->poke_tab = tab; return slot; } /* * BPF program pack allocator. * * Most BPF programs are pretty small. Allocating a hole page for each * program is sometime a waste. Many small bpf program also adds pressure * to instruction TLB. To solve this issue, we introduce a BPF program pack * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86) * to host BPF programs. */ #define BPF_PROG_CHUNK_SHIFT 6 #define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT) #define BPF_PROG_CHUNK_MASK (~(BPF_PROG_CHUNK_SIZE - 1)) struct bpf_prog_pack { struct list_head list; void *ptr; unsigned long bitmap[]; }; void bpf_jit_fill_hole_with_zero(void *area, unsigned int size) { memset(area, 0, size); } #define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE) static DEFINE_MUTEX(pack_mutex); static LIST_HEAD(pack_list); /* PMD_SIZE is not available in some special config, e.g. ARCH=arm with * CONFIG_MMU=n. Use PAGE_SIZE in these cases. */ #ifdef PMD_SIZE /* PMD_SIZE is really big for some archs. It doesn't make sense to * reserve too much memory in one allocation. Hardcode BPF_PROG_PACK_SIZE to * 2MiB * num_possible_nodes(). On most architectures PMD_SIZE will be * greater than or equal to 2MB. 
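 *
 * As a rough illustration (hypothetical sizes, not from the original
 * comment): with the 64-byte BPF_PROG_CHUNK_SIZE above, a single-node
 * 2 MiB pack provides 32768 chunks, and a 300-byte image would occupy
 * round_up(300, 64) / 64 = 5 chunks in the pack bitmap.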
*/ #define BPF_PROG_PACK_SIZE (SZ_2M * num_possible_nodes()) #else #define BPF_PROG_PACK_SIZE PAGE_SIZE #endif #define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE) static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_prog_pack *pack; int err; pack = kzalloc_flex(*pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)); if (!pack) return NULL; pack->ptr = bpf_jit_alloc_exec(BPF_PROG_PACK_SIZE); if (!pack->ptr) goto out; bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE); bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE); set_vm_flush_reset_perms(pack->ptr); err = set_memory_rox((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); if (err) goto out; list_add_tail(&pack->list, &pack_list); return pack; out: bpf_jit_free_exec(pack->ptr); kfree(pack); return NULL; } void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) { unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size); struct bpf_prog_pack *pack; unsigned long pos; void *ptr = NULL; mutex_lock(&pack_mutex); if (size > BPF_PROG_PACK_SIZE) { size = round_up(size, PAGE_SIZE); ptr = bpf_jit_alloc_exec(size); if (ptr) { int err; bpf_fill_ill_insns(ptr, size); set_vm_flush_reset_perms(ptr); err = set_memory_rox((unsigned long)ptr, size / PAGE_SIZE); if (err) { bpf_jit_free_exec(ptr); ptr = NULL; } } goto out; } list_for_each_entry(pack, &pack_list, list) { pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0, nbits, 0); if (pos < BPF_PROG_CHUNK_COUNT) goto found_free_area; } pack = alloc_new_pack(bpf_fill_ill_insns); if (!pack) goto out; pos = 0; found_free_area: bitmap_set(pack->bitmap, pos, nbits); ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT); out: mutex_unlock(&pack_mutex); return ptr; } void bpf_prog_pack_free(void *ptr, u32 size) { struct bpf_prog_pack *pack = NULL, *tmp; unsigned int nbits; unsigned long pos; mutex_lock(&pack_mutex); if (size > BPF_PROG_PACK_SIZE) { bpf_jit_free_exec(ptr); goto out; } list_for_each_entry(tmp, &pack_list, list) { if (ptr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > ptr) { pack = tmp; break; } } if (WARN_ONCE(!pack, "bpf_prog_pack bug\n")) goto out; nbits = BPF_PROG_SIZE_TO_NBITS(size); pos = ((unsigned long)ptr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT; WARN_ONCE(bpf_arch_text_invalidate(ptr, size), "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n"); bitmap_clear(pack->bitmap, pos, nbits); if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0, BPF_PROG_CHUNK_COUNT, 0) == 0) { list_del(&pack->list); bpf_jit_free_exec(pack->ptr); kfree(pack); } out: mutex_unlock(&pack_mutex); } static atomic_long_t bpf_jit_current; /* Can be overridden by an arch's JIT compiler if it has a custom, * dedicated BPF backend memory area, or if neither of the two * below apply. */ u64 __weak bpf_jit_alloc_exec_limit(void) { #if defined(MODULES_VADDR) return MODULES_END - MODULES_VADDR; #else return VMALLOC_END - VMALLOC_START; #endif } static int __init bpf_jit_charge_init(void) { /* Only used as heuristic here to derive limit. 
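	 *
	 * As a rough illustration (hypothetical range): if
	 * bpf_jit_alloc_exec_limit() reported a 1 GiB executable region,
	 * the default limit computed below would be about half of that,
	 * 512 MiB rounded up to a page; it can usually be tuned at
	 * runtime via the net.core.bpf_jit_limit sysctl.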
*/ bpf_jit_limit_max = bpf_jit_alloc_exec_limit(); bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 1, PAGE_SIZE), LONG_MAX); return 0; } pure_initcall(bpf_jit_charge_init); int bpf_jit_charge_modmem(u32 size) { if (atomic_long_add_return(size, &bpf_jit_current) > READ_ONCE(bpf_jit_limit)) { if (!bpf_capable()) { atomic_long_sub(size, &bpf_jit_current); return -EPERM; } } return 0; } void bpf_jit_uncharge_modmem(u32 size) { atomic_long_sub(size, &bpf_jit_current); } void *__weak bpf_jit_alloc_exec(unsigned long size) { return execmem_alloc(EXECMEM_BPF, size); } void __weak bpf_jit_free_exec(void *addr) { execmem_free(addr); } struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_binary_header *hdr; u32 size, hole, start; WARN_ON_ONCE(!is_power_of_2(alignment) || alignment > BPF_IMAGE_ALIGNMENT); /* Most of BPF filters are really small, but if some of them * fill a page, allow at least 128 extra bytes to insert a * random section of illegal instructions. */ size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); if (bpf_jit_charge_modmem(size)) return NULL; hdr = bpf_jit_alloc_exec(size); if (!hdr) { bpf_jit_uncharge_modmem(size); return NULL; } /* Fill space with illegal/arch-dep instructions. */ bpf_fill_ill_insns(hdr, size); hdr->size = size; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); start = get_random_u32_below(hole) & ~(alignment - 1); /* Leave a random number of instructions before BPF code. */ *image_ptr = &hdr->image[start]; return hdr; } void bpf_jit_binary_free(struct bpf_binary_header *hdr) { u32 size = hdr->size; bpf_jit_free_exec(hdr); bpf_jit_uncharge_modmem(size); } /* Allocate jit binary from bpf_prog_pack allocator. * Since the allocated memory is RO+X, the JIT engine cannot write directly * to the memory. To solve this problem, a RW buffer is also allocated at * as the same time. The JIT engine should calculate offsets based on the * RO memory address, but write JITed program to the RW buffer. Once the * JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies * the JITed program to the RO memory. */ struct bpf_binary_header * bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, struct bpf_binary_header **rw_header, u8 **rw_image, bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_binary_header *ro_header; u32 size, hole, start; WARN_ON_ONCE(!is_power_of_2(alignment) || alignment > BPF_IMAGE_ALIGNMENT); /* add 16 bytes for a random section of illegal instructions */ size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE); if (bpf_jit_charge_modmem(size)) return NULL; ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns); if (!ro_header) { bpf_jit_uncharge_modmem(size); return NULL; } *rw_header = kvmalloc(size, GFP_KERNEL); if (!*rw_header) { bpf_prog_pack_free(ro_header, size); bpf_jit_uncharge_modmem(size); return NULL; } /* Fill space with illegal/arch-dep instructions. */ bpf_fill_ill_insns(*rw_header, size); (*rw_header)->size = size; hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)), BPF_PROG_CHUNK_SIZE - sizeof(*ro_header)); start = get_random_u32_below(hole) & ~(alignment - 1); *image_ptr = &ro_header->image[start]; *rw_image = &(*rw_header)->image[start]; return ro_header; } /* Copy JITed text from rw_header to its final location, the ro_header. 
*/ int bpf_jit_binary_pack_finalize(struct bpf_binary_header *ro_header, struct bpf_binary_header *rw_header) { void *ptr; ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size); kvfree(rw_header); if (IS_ERR(ptr)) { bpf_prog_pack_free(ro_header, ro_header->size); return PTR_ERR(ptr); } return 0; } /* bpf_jit_binary_pack_free is called in two different scenarios: * 1) when the program is freed after; * 2) when the JIT engine fails (before bpf_jit_binary_pack_finalize). * For case 2), we need to free both the RO memory and the RW buffer. * * bpf_jit_binary_pack_free requires proper ro_header->size. However, * bpf_jit_binary_pack_alloc does not set it. Therefore, ro_header->size * must be set with either bpf_jit_binary_pack_finalize (normal path) or * bpf_arch_text_copy (when jit fails). */ void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, struct bpf_binary_header *rw_header) { u32 size = ro_header->size; bpf_prog_pack_free(ro_header, size); kvfree(rw_header); bpf_jit_uncharge_modmem(size); } struct bpf_binary_header * bpf_jit_binary_pack_hdr(const struct bpf_prog *fp) { unsigned long real_start = (unsigned long)fp->bpf_func; unsigned long addr; addr = real_start & BPF_PROG_CHUNK_MASK; return (void *)addr; } static inline struct bpf_binary_header * bpf_jit_binary_hdr(const struct bpf_prog *fp) { unsigned long real_start = (unsigned long)fp->bpf_func; unsigned long addr; addr = real_start & PAGE_MASK; return (void *)addr; } /* This symbol is only overridden by archs that have different * requirements than the usual eBPF JITs, f.e. when they only * implement cBPF JIT, do not set images read-only, etc. */ void __weak bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) { struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); bpf_jit_binary_free(hdr); WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); } bpf_prog_unlock_free(fp); } int bpf_jit_get_func_addr(const struct bpf_prog *prog, const struct bpf_insn *insn, bool extra_pass, u64 *func_addr, bool *func_addr_fixed) { s16 off = insn->off; s32 imm = insn->imm; u8 *addr; int err; *func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL; if (!*func_addr_fixed) { /* Place-holder address till the last pass has collected * all addresses for JITed subprograms in which case we * can pick them up from prog->aux. */ if (!extra_pass) addr = NULL; else if (prog->aux->func && off >= 0 && off < prog->aux->real_func_cnt) addr = (u8 *)prog->aux->func[off]->bpf_func; else return -EINVAL; } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && bpf_jit_supports_far_kfunc_call()) { err = bpf_get_kfunc_addr(prog, insn->imm, insn->off, &addr); if (err) return err; } else { /* Address of a BPF helper call. Since part of the core * kernel, it's always at a fixed location. __bpf_call_base * and the helper with imm relative to it are both in core * kernel. */ addr = (u8 *)__bpf_call_base + imm; } *func_addr = (unsigned long)addr; return 0; } const char *bpf_jit_get_prog_name(struct bpf_prog *prog) { if (prog->aux->ksym.prog) return prog->aux->ksym.name; return prog->aux->name; } static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, struct bpf_insn *to_buff, bool emit_zext) { struct bpf_insn *to = to_buff; u32 imm_rnd = get_random_u32(); s16 off; BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG); /* Constraints on AX register: * * AX register is inaccessible from user space. It is mapped in * all JITs, and used here for constant blinding rewrites. 
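	 *
	 * As a rough illustration (not an exhaustive description of the
	 * rewrites below): BPF_ALU64_IMM(BPF_ADD, R1, 42) is blinded into
	 * MOV AX, 42 ^ rnd; XOR AX, rnd; ADD R1, AX, so the user-chosen
	 * constant never appears verbatim in the JITed image.
	 *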
It is * typically "stateless" meaning its contents are only valid within * the executed instruction, but not across several instructions. * There are a few exceptions however which are further detailed * below. * * Constant blinding is only used by JITs, not in the interpreter. * The interpreter uses AX in some occasions as a local temporary * register e.g. in DIV or MOD instructions. * * In restricted circumstances, the verifier can also use the AX * register for rewrites as long as they do not interfere with * the above cases! */ if (from->dst_reg == BPF_REG_AX || from->src_reg == BPF_REG_AX) goto out; if (from->imm == 0 && (from->code == (BPF_ALU | BPF_MOV | BPF_K) || from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) { *to++ = BPF_ALU64_REG(BPF_XOR, from->dst_reg, from->dst_reg); goto out; } switch (from->code) { case BPF_ALU | BPF_ADD | BPF_K: case BPF_ALU | BPF_SUB | BPF_K: case BPF_ALU | BPF_AND | BPF_K: case BPF_ALU | BPF_OR | BPF_K: case BPF_ALU | BPF_XOR | BPF_K: case BPF_ALU | BPF_MUL | BPF_K: case BPF_ALU | BPF_MOV | BPF_K: case BPF_ALU | BPF_DIV | BPF_K: case BPF_ALU | BPF_MOD | BPF_K: *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_ALU32_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off); break; case BPF_ALU64 | BPF_ADD | BPF_K: case BPF_ALU64 | BPF_SUB | BPF_K: case BPF_ALU64 | BPF_AND | BPF_K: case BPF_ALU64 | BPF_OR | BPF_K: case BPF_ALU64 | BPF_XOR | BPF_K: case BPF_ALU64 | BPF_MUL | BPF_K: case BPF_ALU64 | BPF_MOV | BPF_K: case BPF_ALU64 | BPF_DIV | BPF_K: case BPF_ALU64 | BPF_MOD | BPF_K: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_ALU64_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off); break; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JNE | BPF_K: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JLT | BPF_K: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JLE | BPF_K: case BPF_JMP | BPF_JSGT | BPF_K: case BPF_JMP | BPF_JSLT | BPF_K: case BPF_JMP | BPF_JSGE | BPF_K: case BPF_JMP | BPF_JSLE | BPF_K: case BPF_JMP | BPF_JSET | BPF_K: /* Accommodate for extra offset in case of a backjump. */ off = from->off; if (off < 0) off -= 2; *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off); break; case BPF_JMP32 | BPF_JEQ | BPF_K: case BPF_JMP32 | BPF_JNE | BPF_K: case BPF_JMP32 | BPF_JGT | BPF_K: case BPF_JMP32 | BPF_JLT | BPF_K: case BPF_JMP32 | BPF_JGE | BPF_K: case BPF_JMP32 | BPF_JLE | BPF_K: case BPF_JMP32 | BPF_JSGT | BPF_K: case BPF_JMP32 | BPF_JSLT | BPF_K: case BPF_JMP32 | BPF_JSGE | BPF_K: case BPF_JMP32 | BPF_JSLE | BPF_K: case BPF_JMP32 | BPF_JSET | BPF_K: /* Accommodate for extra offset in case of a backjump. */ off = from->off; if (off < 0) off -= 2; *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX, off); break; case BPF_LD | BPF_IMM | BPF_DW: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32); *to++ = BPF_ALU64_REG(BPF_MOV, aux[0].dst_reg, BPF_REG_AX); break; case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. 
*/ *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); if (emit_zext) *to++ = BPF_ZEXT_REG(BPF_REG_AX); *to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX); break; case BPF_ST | BPF_MEM | BPF_DW: case BPF_ST | BPF_MEM | BPF_W: case BPF_ST | BPF_MEM | BPF_H: case BPF_ST | BPF_MEM | BPF_B: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off); break; case BPF_ST | BPF_PROBE_MEM32 | BPF_DW: case BPF_ST | BPF_PROBE_MEM32 | BPF_W: case BPF_ST | BPF_PROBE_MEM32 | BPF_H: case BPF_ST | BPF_PROBE_MEM32 | BPF_B: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); /* * Cannot use BPF_STX_MEM() macro here as it * hardcodes BPF_MEM mode, losing PROBE_MEM32 * and breaking arena addressing in the JIT. */ *to++ = (struct bpf_insn) { .code = BPF_STX | BPF_PROBE_MEM32 | BPF_SIZE(from->code), .dst_reg = from->dst_reg, .src_reg = BPF_REG_AX, .off = from->off, }; break; } out: return to - to_buff; } static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, gfp_t gfp_extra_flags) { gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags); if (fp != NULL) { /* aux->prog still points to the fp_other one, so * when promoting the clone to the real program, * this still needs to be adapted. */ memcpy(fp, fp_other, fp_other->pages * PAGE_SIZE); } return fp; } static void bpf_prog_clone_free(struct bpf_prog *fp) { /* aux was stolen by the other clone, so we cannot free * it from this path! It will be freed eventually by the * other program on release. * * At this point, we don't need a deferred release since * clone is guaranteed to not be locked. */ fp->aux = NULL; fp->stats = NULL; fp->active = NULL; __bpf_prog_free(fp); } void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other) { /* We have to repoint aux->prog to self, as we don't * know whether fp here is the clone or the original. */ fp->aux->prog = fp; if (fp->aux->offload) fp->aux->offload->prog = fp; bpf_prog_clone_free(fp_other); } /* * Now this function is used only to blind the main prog and must be invoked only when * bpf_prog_need_blind() returns true. */ struct bpf_prog *bpf_jit_blind_constants(struct bpf_verifier_env *env, struct bpf_prog *prog) { struct bpf_insn insn_buff[16], aux[2]; struct bpf_prog *clone, *tmp; int insn_delta, insn_cnt; struct bpf_insn *insn; int i, rewritten; if (WARN_ON_ONCE(env && env->prog != prog)) return ERR_PTR(-EINVAL); clone = bpf_prog_clone_create(prog, GFP_USER); if (!clone) return ERR_PTR(-ENOMEM); /* make sure bpf_patch_insn_data() patches the correct prog */ if (env) env->prog = clone; insn_cnt = clone->len; insn = clone->insnsi; for (i = 0; i < insn_cnt; i++, insn++) { if (bpf_pseudo_func(insn)) { /* ld_imm64 with an address of bpf subprog is not * a user controlled constant. Don't randomize it, * since it will conflict with jit_subprogs() logic. */ insn++; i++; continue; } /* We temporarily need to hold the original ld64 insn * so that we can still access the first part in the * second blinding run. 
*/ if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW) && insn[1].code == 0) memcpy(aux, insn, sizeof(aux)); rewritten = bpf_jit_blind_insn(insn, aux, insn_buff, clone->aux->verifier_zext); if (!rewritten) continue; if (env) tmp = bpf_patch_insn_data(env, i, insn_buff, rewritten); else tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten); if (IS_ERR_OR_NULL(tmp)) { if (env) /* restore the original prog */ env->prog = prog; /* Patching may have repointed aux->prog during * realloc from the original one, so we need to * fix it up here on error. */ bpf_jit_prog_release_other(prog, clone); return IS_ERR(tmp) ? tmp : ERR_PTR(-ENOMEM); } clone = tmp; insn_delta = rewritten - 1; if (env) env->prog = clone; /* Walk new program and skip insns we just inserted. */ insn = clone->insnsi + i + insn_delta; insn_cnt += insn_delta; i += insn_delta; } clone->blinded = 1; return clone; } bool bpf_insn_is_indirect_target(const struct bpf_verifier_env *env, const struct bpf_prog *prog, int insn_idx) { if (!env) return false; insn_idx += prog->aux->subprog_start; return env->insn_aux_data[insn_idx].indirect_target; } #endif /* CONFIG_BPF_JIT */ /* Base function for offset calculation. Needs to go into .text section, * therefore keeping it non-static as well; will also be used by JITs * anyway later on, so do not let the compiler omit it. This also needs * to go into kallsyms for correlation from e.g. bpftool, so naming * must not change. */ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { return 0; } EXPORT_SYMBOL_GPL(__bpf_call_base); /* All UAPI available opcodes. */ #define BPF_INSN_MAP(INSN_2, INSN_3) \ /* 32 bit ALU operations. */ \ /* Register based. */ \ INSN_3(ALU, ADD, X), \ INSN_3(ALU, SUB, X), \ INSN_3(ALU, AND, X), \ INSN_3(ALU, OR, X), \ INSN_3(ALU, LSH, X), \ INSN_3(ALU, RSH, X), \ INSN_3(ALU, XOR, X), \ INSN_3(ALU, MUL, X), \ INSN_3(ALU, MOV, X), \ INSN_3(ALU, ARSH, X), \ INSN_3(ALU, DIV, X), \ INSN_3(ALU, MOD, X), \ INSN_2(ALU, NEG), \ INSN_3(ALU, END, TO_BE), \ INSN_3(ALU, END, TO_LE), \ /* Immediate based. */ \ INSN_3(ALU, ADD, K), \ INSN_3(ALU, SUB, K), \ INSN_3(ALU, AND, K), \ INSN_3(ALU, OR, K), \ INSN_3(ALU, LSH, K), \ INSN_3(ALU, RSH, K), \ INSN_3(ALU, XOR, K), \ INSN_3(ALU, MUL, K), \ INSN_3(ALU, MOV, K), \ INSN_3(ALU, ARSH, K), \ INSN_3(ALU, DIV, K), \ INSN_3(ALU, MOD, K), \ /* 64 bit ALU operations. */ \ /* Register based. */ \ INSN_3(ALU64, ADD, X), \ INSN_3(ALU64, SUB, X), \ INSN_3(ALU64, AND, X), \ INSN_3(ALU64, OR, X), \ INSN_3(ALU64, LSH, X), \ INSN_3(ALU64, RSH, X), \ INSN_3(ALU64, XOR, X), \ INSN_3(ALU64, MUL, X), \ INSN_3(ALU64, MOV, X), \ INSN_3(ALU64, ARSH, X), \ INSN_3(ALU64, DIV, X), \ INSN_3(ALU64, MOD, X), \ INSN_2(ALU64, NEG), \ INSN_3(ALU64, END, TO_LE), \ /* Immediate based. */ \ INSN_3(ALU64, ADD, K), \ INSN_3(ALU64, SUB, K), \ INSN_3(ALU64, AND, K), \ INSN_3(ALU64, OR, K), \ INSN_3(ALU64, LSH, K), \ INSN_3(ALU64, RSH, K), \ INSN_3(ALU64, XOR, K), \ INSN_3(ALU64, MUL, K), \ INSN_3(ALU64, MOV, K), \ INSN_3(ALU64, ARSH, K), \ INSN_3(ALU64, DIV, K), \ INSN_3(ALU64, MOD, K), \ /* Call instruction. */ \ INSN_2(JMP, CALL), \ /* Exit instruction. */ \ INSN_2(JMP, EXIT), \ /* 32-bit Jump instructions. */ \ /* Register based. */ \ INSN_3(JMP32, JEQ, X), \ INSN_3(JMP32, JNE, X), \ INSN_3(JMP32, JGT, X), \ INSN_3(JMP32, JLT, X), \ INSN_3(JMP32, JGE, X), \ INSN_3(JMP32, JLE, X), \ INSN_3(JMP32, JSGT, X), \ INSN_3(JMP32, JSLT, X), \ INSN_3(JMP32, JSGE, X), \ INSN_3(JMP32, JSLE, X), \ INSN_3(JMP32, JSET, X), \ /* Immediate based. 
*/ \ INSN_3(JMP32, JEQ, K), \ INSN_3(JMP32, JNE, K), \ INSN_3(JMP32, JGT, K), \ INSN_3(JMP32, JLT, K), \ INSN_3(JMP32, JGE, K), \ INSN_3(JMP32, JLE, K), \ INSN_3(JMP32, JSGT, K), \ INSN_3(JMP32, JSLT, K), \ INSN_3(JMP32, JSGE, K), \ INSN_3(JMP32, JSLE, K), \ INSN_3(JMP32, JSET, K), \ /* Jump instructions. */ \ /* Register based. */ \ INSN_3(JMP, JEQ, X), \ INSN_3(JMP, JNE, X), \ INSN_3(JMP, JGT, X), \ INSN_3(JMP, JLT, X), \ INSN_3(JMP, JGE, X), \ INSN_3(JMP, JLE, X), \ INSN_3(JMP, JSGT, X), \ INSN_3(JMP, JSLT, X), \ INSN_3(JMP, JSGE, X), \ INSN_3(JMP, JSLE, X), \ INSN_3(JMP, JSET, X), \ /* Immediate based. */ \ INSN_3(JMP, JEQ, K), \ INSN_3(JMP, JNE, K), \ INSN_3(JMP, JGT, K), \ INSN_3(JMP, JLT, K), \ INSN_3(JMP, JGE, K), \ INSN_3(JMP, JLE, K), \ INSN_3(JMP, JSGT, K), \ INSN_3(JMP, JSLT, K), \ INSN_3(JMP, JSGE, K), \ INSN_3(JMP, JSLE, K), \ INSN_3(JMP, JSET, K), \ INSN_2(JMP, JA), \ INSN_2(JMP32, JA), \ /* Atomic operations. */ \ INSN_3(STX, ATOMIC, B), \ INSN_3(STX, ATOMIC, H), \ INSN_3(STX, ATOMIC, W), \ INSN_3(STX, ATOMIC, DW), \ /* Store instructions. */ \ /* Register based. */ \ INSN_3(STX, MEM, B), \ INSN_3(STX, MEM, H), \ INSN_3(STX, MEM, W), \ INSN_3(STX, MEM, DW), \ /* Immediate based. */ \ INSN_3(ST, MEM, B), \ INSN_3(ST, MEM, H), \ INSN_3(ST, MEM, W), \ INSN_3(ST, MEM, DW), \ /* Load instructions. */ \ /* Register based. */ \ INSN_3(LDX, MEM, B), \ INSN_3(LDX, MEM, H), \ INSN_3(LDX, MEM, W), \ INSN_3(LDX, MEM, DW), \ INSN_3(LDX, MEMSX, B), \ INSN_3(LDX, MEMSX, H), \ INSN_3(LDX, MEMSX, W), \ /* Immediate based. */ \ INSN_3(LD, IMM, DW) bool bpf_opcode_in_insntable(u8 code) { #define BPF_INSN_2_TBL(x, y) [BPF_##x | BPF_##y] = true #define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true static const bool public_insntable[256] = { [0 ... 255] = false, /* Now overwrite non-defaults ... */ BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL), /* UAPI exposed, but rewritten opcodes. cBPF carry-over. */ [BPF_LD | BPF_ABS | BPF_B] = true, [BPF_LD | BPF_ABS | BPF_H] = true, [BPF_LD | BPF_ABS | BPF_W] = true, [BPF_LD | BPF_IND | BPF_B] = true, [BPF_LD | BPF_IND | BPF_H] = true, [BPF_LD | BPF_IND | BPF_W] = true, [BPF_JMP | BPF_JA | BPF_X] = true, [BPF_JMP | BPF_JCOND] = true, }; #undef BPF_INSN_3_TBL #undef BPF_INSN_2_TBL return public_insntable[code]; } #ifndef CONFIG_BPF_JIT_ALWAYS_ON /* Absolute value of s32 without undefined behavior for S32_MIN */ static u32 abs_s32(s32 x) { return x >= 0 ? (u32)x : -(u32)x; } /** * ___bpf_prog_run - run eBPF program on a given context * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers * @insn: is the array of eBPF instructions * * Decode and execute eBPF instructions. * * Return: whatever value is in %BPF_R0 at program exit */ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn) { #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z static const void * const jumptable[256] __annotate_jump_table = { [0 ... 255] = &&default_label, /* Now overwrite non-defaults ... */ BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL), /* Non-UAPI available opcodes. 
*/ [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, [BPF_ST | BPF_NOSPEC] = &&ST_NOSPEC, [BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B, [BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H, [BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W, [BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW, [BPF_LDX | BPF_PROBE_MEMSX | BPF_B] = &&LDX_PROBE_MEMSX_B, [BPF_LDX | BPF_PROBE_MEMSX | BPF_H] = &&LDX_PROBE_MEMSX_H, [BPF_LDX | BPF_PROBE_MEMSX | BPF_W] = &&LDX_PROBE_MEMSX_W, }; #undef BPF_INSN_3_LBL #undef BPF_INSN_2_LBL u32 tail_call_cnt = 0; #define CONT ({ insn++; goto select_insn; }) #define CONT_JMP ({ insn++; goto select_insn; }) select_insn: goto *jumptable[insn->code]; /* Explicitly mask the register-based shift amounts with 63 or 31 * to avoid undefined behavior. Normally this won't affect the * generated code, for example, in case of native 64 bit archs such * as x86-64 or arm64, the compiler is optimizing the AND away for * the interpreter. In case of JITs, each of the JIT backends compiles * the BPF shift operations to machine instructions which produce * implementation-defined results in such a case; the resulting * contents of the register may be arbitrary, but program behaviour * as a whole remains defined. In other words, in case of JIT backends, * the AND must /not/ be added to the emitted LSH/RSH/ARSH translation. */ /* ALU (shifts) */ #define SHT(OPCODE, OP) \ ALU64_##OPCODE##_X: \ DST = DST OP (SRC & 63); \ CONT; \ ALU_##OPCODE##_X: \ DST = (u32) DST OP ((u32) SRC & 31); \ CONT; \ ALU64_##OPCODE##_K: \ DST = DST OP IMM; \ CONT; \ ALU_##OPCODE##_K: \ DST = (u32) DST OP (u32) IMM; \ CONT; /* ALU (rest) */ #define ALU(OPCODE, OP) \ ALU64_##OPCODE##_X: \ DST = DST OP SRC; \ CONT; \ ALU_##OPCODE##_X: \ DST = (u32) DST OP (u32) SRC; \ CONT; \ ALU64_##OPCODE##_K: \ DST = DST OP IMM; \ CONT; \ ALU_##OPCODE##_K: \ DST = (u32) DST OP (u32) IMM; \ CONT; ALU(ADD, +) ALU(SUB, -) ALU(AND, &) ALU(OR, |) ALU(XOR, ^) ALU(MUL, *) SHT(LSH, <<) SHT(RSH, >>) #undef SHT #undef ALU ALU_NEG: DST = (u32) -DST; CONT; ALU64_NEG: DST = -DST; CONT; ALU_MOV_X: switch (OFF) { case 0: DST = (u32) SRC; break; case 8: DST = (u32)(s8) SRC; break; case 16: DST = (u32)(s16) SRC; break; } CONT; ALU_MOV_K: DST = (u32) IMM; CONT; ALU64_MOV_X: switch (OFF) { case 0: DST = SRC; break; case 8: DST = (s8) SRC; break; case 16: DST = (s16) SRC; break; case 32: DST = (s32) SRC; break; } CONT; ALU64_MOV_K: DST = IMM; CONT; LD_IMM_DW: DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32; insn++; CONT; ALU_ARSH_X: DST = (u64) (u32) (((s32) DST) >> (SRC & 31)); CONT; ALU_ARSH_K: DST = (u64) (u32) (((s32) DST) >> IMM); CONT; ALU64_ARSH_X: (*(s64 *) &DST) >>= (SRC & 63); CONT; ALU64_ARSH_K: (*(s64 *) &DST) >>= IMM; CONT; ALU64_MOD_X: switch (OFF) { case 0: div64_u64_rem(DST, SRC, &AX); DST = AX; break; case 1: AX = div64_s64(DST, SRC); DST = DST - AX * SRC; break; } CONT; ALU_MOD_X: switch (OFF) { case 0: AX = (u32) DST; DST = do_div(AX, (u32) SRC); break; case 1: AX = abs_s32((s32)DST); AX = do_div(AX, abs_s32((s32)SRC)); if ((s32)DST < 0) DST = (u32)-AX; else DST = (u32)AX; break; } CONT; ALU64_MOD_K: switch (OFF) { case 0: div64_u64_rem(DST, IMM, &AX); DST = AX; break; case 1: AX = div64_s64(DST, IMM); DST = DST - AX * IMM; break; } CONT; ALU_MOD_K: switch (OFF) { case 0: AX = (u32) DST; DST = do_div(AX, (u32) IMM); break; case 1: AX = abs_s32((s32)DST); AX = do_div(AX, abs_s32((s32)IMM)); if ((s32)DST < 0) DST = (u32)-AX; else DST = (u32)AX; break; } CONT; 
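	/* A rough illustration of the signed (insn->off == 1) DIV/MOD
	 * handlers above and below: division truncates toward zero and
	 * the remainder takes the sign of the dividend, e.g.
	 * -13 sdiv 4 == -3 and -13 smod 4 == -1.
	 */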
ALU64_DIV_X: switch (OFF) { case 0: DST = div64_u64(DST, SRC); break; case 1: DST = div64_s64(DST, SRC); break; } CONT; ALU_DIV_X: switch (OFF) { case 0: AX = (u32) DST; do_div(AX, (u32) SRC); DST = (u32) AX; break; case 1: AX = abs_s32((s32)DST); do_div(AX, abs_s32((s32)SRC)); if (((s32)DST < 0) == ((s32)SRC < 0)) DST = (u32)AX; else DST = (u32)-AX; break; } CONT; ALU64_DIV_K: switch (OFF) { case 0: DST = div64_u64(DST, IMM); break; case 1: DST = div64_s64(DST, IMM); break; } CONT; ALU_DIV_K: switch (OFF) { case 0: AX = (u32) DST; do_div(AX, (u32) IMM); DST = (u32) AX; break; case 1: AX = abs_s32((s32)DST); do_div(AX, abs_s32((s32)IMM)); if (((s32)DST < 0) == ((s32)IMM < 0)) DST = (u32)AX; else DST = (u32)-AX; break; } CONT; ALU_END_TO_BE: switch (IMM) { case 16: DST = (__force u16) cpu_to_be16(DST); break; case 32: DST = (__force u32) cpu_to_be32(DST); break; case 64: DST = (__force u64) cpu_to_be64(DST); break; } CONT; ALU_END_TO_LE: switch (IMM) { case 16: DST = (__force u16) cpu_to_le16(DST); break; case 32: DST = (__force u32) cpu_to_le32(DST); break; case 64: DST = (__force u64) cpu_to_le64(DST); break; } CONT; ALU64_END_TO_LE: switch (IMM) { case 16: DST = (__force u16) __swab16(DST); break; case 32: DST = (__force u32) __swab32(DST); break; case 64: DST = (__force u64) __swab64(DST); break; } CONT; /* CALL */ JMP_CALL: /* Function call scratches BPF_R1-BPF_R5 registers, * preserves BPF_R6-BPF_R9, and stores return value * into BPF_R0. */ BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3, BPF_R4, BPF_R5); CONT; JMP_CALL_ARGS: BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2, BPF_R3, BPF_R4, BPF_R5, insn + insn->off + 1); CONT; JMP_TAIL_CALL: { struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_prog *prog; u32 index = BPF_R3; if (unlikely(index >= array->map.max_entries)) goto out; if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT)) goto out; prog = READ_ONCE(array->ptrs[index]); if (!prog) goto out; tail_call_cnt++; /* ARG1 at this point is guaranteed to point to CTX from * the verifier side due to the fact that the tail call is * handled like a helper, that is, bpf_tail_call_proto, * where arg1_type is ARG_PTR_TO_CTX. */ insn = prog->insnsi; goto select_insn; out: CONT; } JMP_JA: insn += insn->off; CONT; JMP32_JA: insn += insn->imm; CONT; JMP_EXIT: return BPF_R0; /* JMP */ #define COND_JMP(SIGN, OPCODE, CMP_OP) \ JMP_##OPCODE##_X: \ if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) { \ insn += insn->off; \ CONT_JMP; \ } \ CONT; \ JMP32_##OPCODE##_X: \ if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) { \ insn += insn->off; \ CONT_JMP; \ } \ CONT; \ JMP_##OPCODE##_K: \ if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) { \ insn += insn->off; \ CONT_JMP; \ } \ CONT; \ JMP32_##OPCODE##_K: \ if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) { \ insn += insn->off; \ CONT_JMP; \ } \ CONT; COND_JMP(u, JEQ, ==) COND_JMP(u, JNE, !=) COND_JMP(u, JGT, >) COND_JMP(u, JLT, <) COND_JMP(u, JGE, >=) COND_JMP(u, JLE, <=) COND_JMP(u, JSET, &) COND_JMP(s, JSGT, >) COND_JMP(s, JSLT, <) COND_JMP(s, JSGE, >=) COND_JMP(s, JSLE, <=) #undef COND_JMP /* ST, STX and LDX*/ ST_NOSPEC: /* Speculation barrier for mitigating Speculative Store Bypass, * Bounds-Check Bypass and Type Confusion. In case of arm64, we * rely on the firmware mitigation as controlled via the ssbd * kernel parameter. Whenever the mitigation is enabled, it * works for all of the kernel code with no need to provide any * additional instructions here. 
In case of x86, we use 'lfence' * insn for mitigation. We reuse preexisting logic from Spectre * v1 mitigation that happens to produce the required code on * x86 for v4 as well. */ barrier_nospec(); CONT; #define LDST(SIZEOP, SIZE) \ STX_MEM_##SIZEOP: \ *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \ CONT; \ ST_MEM_##SIZEOP: \ *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \ CONT; \ LDX_MEM_##SIZEOP: \ DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ CONT; \ LDX_PROBE_MEM_##SIZEOP: \ bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \ (const void *)(long) (SRC + insn->off)); \ DST = *((SIZE *)&DST); \ CONT; LDST(B, u8) LDST(H, u16) LDST(W, u32) LDST(DW, u64) #undef LDST #define LDSX(SIZEOP, SIZE) \ LDX_MEMSX_##SIZEOP: \ DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ CONT; \ LDX_PROBE_MEMSX_##SIZEOP: \ bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \ (const void *)(long) (SRC + insn->off)); \ DST = *((SIZE *)&DST); \ CONT; LDSX(B, s8) LDSX(H, s16) LDSX(W, s32) #undef LDSX #define ATOMIC_ALU_OP(BOP, KOP) \ case BOP: \ if (BPF_SIZE(insn->code) == BPF_W) \ atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \ (DST + insn->off)); \ else if (BPF_SIZE(insn->code) == BPF_DW) \ atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \ (DST + insn->off)); \ else \ goto default_label; \ break; \ case BOP | BPF_FETCH: \ if (BPF_SIZE(insn->code) == BPF_W) \ SRC = (u32) atomic_fetch_##KOP( \ (u32) SRC, \ (atomic_t *)(unsigned long) (DST + insn->off)); \ else if (BPF_SIZE(insn->code) == BPF_DW) \ SRC = (u64) atomic64_fetch_##KOP( \ (u64) SRC, \ (atomic64_t *)(unsigned long) (DST + insn->off)); \ else \ goto default_label; \ break; STX_ATOMIC_DW: STX_ATOMIC_W: STX_ATOMIC_H: STX_ATOMIC_B: switch (IMM) { /* Atomic read-modify-write instructions support only W and DW * size modifiers. */ ATOMIC_ALU_OP(BPF_ADD, add) ATOMIC_ALU_OP(BPF_AND, and) ATOMIC_ALU_OP(BPF_OR, or) ATOMIC_ALU_OP(BPF_XOR, xor) #undef ATOMIC_ALU_OP case BPF_XCHG: if (BPF_SIZE(insn->code) == BPF_W) SRC = (u32) atomic_xchg( (atomic_t *)(unsigned long) (DST + insn->off), (u32) SRC); else if (BPF_SIZE(insn->code) == BPF_DW) SRC = (u64) atomic64_xchg( (atomic64_t *)(unsigned long) (DST + insn->off), (u64) SRC); else goto default_label; break; case BPF_CMPXCHG: if (BPF_SIZE(insn->code) == BPF_W) BPF_R0 = (u32) atomic_cmpxchg( (atomic_t *)(unsigned long) (DST + insn->off), (u32) BPF_R0, (u32) SRC); else if (BPF_SIZE(insn->code) == BPF_DW) BPF_R0 = (u64) atomic64_cmpxchg( (atomic64_t *)(unsigned long) (DST + insn->off), (u64) BPF_R0, (u64) SRC); else goto default_label; break; /* Atomic load and store instructions support all size * modifiers. */ case BPF_LOAD_ACQ: switch (BPF_SIZE(insn->code)) { #define LOAD_ACQUIRE(SIZEOP, SIZE) \ case BPF_##SIZEOP: \ DST = (SIZE)smp_load_acquire( \ (SIZE *)(unsigned long)(SRC + insn->off)); \ break; LOAD_ACQUIRE(B, u8) LOAD_ACQUIRE(H, u16) LOAD_ACQUIRE(W, u32) #ifdef CONFIG_64BIT LOAD_ACQUIRE(DW, u64) #endif #undef LOAD_ACQUIRE default: goto default_label; } break; case BPF_STORE_REL: switch (BPF_SIZE(insn->code)) { #define STORE_RELEASE(SIZEOP, SIZE) \ case BPF_##SIZEOP: \ smp_store_release( \ (SIZE *)(unsigned long)(DST + insn->off), (SIZE)SRC); \ break; STORE_RELEASE(B, u8) STORE_RELEASE(H, u16) STORE_RELEASE(W, u32) #ifdef CONFIG_64BIT STORE_RELEASE(DW, u64) #endif #undef STORE_RELEASE default: goto default_label; } break; default: goto default_label; } CONT; default_label: /* If we ever reach this, we have a bug somewhere. 
Die hard here * instead of just returning 0; we could be somewhere in a subprog, * so execution could continue otherwise which we do /not/ want. * * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable(). */ pr_warn("BPF interpreter: unknown opcode %02x (imm: 0x%x)\n", insn->code, insn->imm); BUG_ON(1); return 0; } #define PROG_NAME(stack_size) __bpf_prog_run##stack_size #define DEFINE_BPF_PROG_RUN(stack_size) \ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ u64 regs[MAX_BPF_EXT_REG] = {}; \ \ kmsan_unpoison_memory(stack, sizeof(stack)); \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ ARG1 = (u64) (unsigned long) ctx; \ return ___bpf_prog_run(regs, insn); \ } #define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size #define DEFINE_BPF_PROG_RUN_ARGS(stack_size) \ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ u64 regs[MAX_BPF_EXT_REG]; \ \ kmsan_unpoison_memory(stack, sizeof(stack)); \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ BPF_R1 = r1; \ BPF_R2 = r2; \ BPF_R3 = r3; \ BPF_R4 = r4; \ BPF_R5 = r5; \ return ___bpf_prog_run(regs, insn); \ } #define EVAL1(FN, X) FN(X) #define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y) #define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y) #define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y) #define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y) #define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y) EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192); EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384); EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512); EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 32, 64, 96, 128, 160, 192); EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 224, 256, 288, 320, 352, 384); EVAL4(DEFINE_BPF_PROG_RUN_ARGS, 416, 448, 480, 512); #define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size), static unsigned int (*interpreters[])(const void *ctx, const struct bpf_insn *insn) = { EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; #undef PROG_NAME_LIST #define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size), static __maybe_unused u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, const struct bpf_insn *insn) = { EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; #undef PROG_NAME_LIST #ifdef CONFIG_BPF_SYSCALL void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) { stack_depth = max_t(u32, stack_depth, 1); insn->off = (s16) insn->imm; insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] - __bpf_call_base_args; insn->code = BPF_JMP | BPF_CALL_ARGS; } #endif #endif static unsigned int __bpf_prog_ret0_warn(const void *ctx, const struct bpf_insn *insn) { /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON * is not working properly, so warn about it! */ WARN_ON_ONCE(1); return 0; } static bool __bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp) { enum bpf_prog_type prog_type = resolve_prog_type(fp); struct bpf_prog_aux *aux = fp->aux; enum bpf_cgroup_storage_type i; bool ret = false; u64 cookie; if (fp->kprobe_override) return ret; spin_lock(&map->owner_lock); /* There's no owner yet where we could check for compatibility. 
*/ if (!map->owner) { map->owner = bpf_map_owner_alloc(map); if (!map->owner) goto err; map->owner->type = prog_type; map->owner->jited = fp->jited; map->owner->xdp_has_frags = aux->xdp_has_frags; map->owner->sleepable = fp->sleepable; map->owner->expected_attach_type = fp->expected_attach_type; map->owner->attach_func_proto = aux->attach_func_proto; for_each_cgroup_storage_type(i) { map->owner->storage_cookie[i] = aux->cgroup_storage[i] ? aux->cgroup_storage[i]->cookie : 0; } ret = true; } else { ret = map->owner->type == prog_type && map->owner->jited == fp->jited && map->owner->xdp_has_frags == aux->xdp_has_frags && map->owner->sleepable == fp->sleepable; if (ret && map->map_type == BPF_MAP_TYPE_PROG_ARRAY && map->owner->expected_attach_type != fp->expected_attach_type) ret = false; for_each_cgroup_storage_type(i) { if (!ret) break; cookie = aux->cgroup_storage[i] ? aux->cgroup_storage[i]->cookie : 0; ret = map->owner->storage_cookie[i] == cookie || !cookie; } if (ret && map->owner->attach_func_proto != aux->attach_func_proto) { switch (prog_type) { case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_EXT: case BPF_PROG_TYPE_STRUCT_OPS: ret = false; break; default: break; } } } err: spin_unlock(&map->owner_lock); return ret; } bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp) { /* XDP programs inserted into maps are not guaranteed to run on * a particular netdev (and can run outside driver context entirely * in the case of devmap and cpumap). Until device checks * are implemented, prohibit adding dev-bound programs to program maps. */ if (bpf_prog_is_dev_bound(fp->aux)) return false; return __bpf_prog_map_compatible(map, fp); } static int bpf_check_tail_call(const struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; int i, ret = 0; mutex_lock(&aux->used_maps_mutex); for (i = 0; i < aux->used_map_cnt; i++) { struct bpf_map *map = aux->used_maps[i]; if (!map_type_contains_progs(map)) continue; if (!__bpf_prog_map_compatible(map, fp)) { ret = -EINVAL; goto out; } } out: mutex_unlock(&aux->used_maps_mutex); return ret; } static bool bpf_prog_select_interpreter(struct bpf_prog *fp) { bool select_interpreter = false; #ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); u32 idx = (round_up(stack_depth, 32) / 32) - 1; /* may_goto may cause stack size > 512, leading to idx out-of-bounds. * But for non-JITed programs, we don't need bpf_func, so no bounds * check needed. */ if (idx < ARRAY_SIZE(interpreters)) { fp->bpf_func = interpreters[idx]; select_interpreter = true; } else { fp->bpf_func = __bpf_prog_ret0_warn; } #else fp->bpf_func = __bpf_prog_ret0_warn; #endif return select_interpreter; } static struct bpf_prog *bpf_prog_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog) { #ifdef CONFIG_BPF_JIT struct bpf_prog *orig_prog; struct bpf_insn_aux_data *orig_insn_aux; if (!bpf_prog_need_blind(prog)) return bpf_int_jit_compile(env, prog); if (env) { /* * If env is not NULL, we are called from the end of bpf_check(), at this * point, only insn_aux_data is used after failure, so it should be restored * on failure. */ orig_insn_aux = bpf_dup_insn_aux_data(env); if (!orig_insn_aux) return prog; } orig_prog = prog; prog = bpf_jit_blind_constants(env, prog); /* * If blinding was requested and we failed during blinding, we must fall * back to the interpreter. 
*/ if (IS_ERR(prog)) goto out_restore; prog = bpf_int_jit_compile(env, prog); if (prog->jited) { bpf_jit_prog_release_other(prog, orig_prog); if (env) vfree(orig_insn_aux); return prog; } bpf_jit_prog_release_other(orig_prog, prog); out_restore: prog = orig_prog; if (env) bpf_restore_insn_aux_data(env, orig_insn_aux); #endif return prog; } struct bpf_prog *__bpf_prog_select_runtime(struct bpf_verifier_env *env, struct bpf_prog *fp, int *err) { /* In case of BPF to BPF calls, verifier did all the prep * work with regards to JITing, etc. */ bool jit_needed = false; if (fp->bpf_func) goto finalize; if (IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) || bpf_prog_has_kfunc_call(fp)) jit_needed = true; if (!bpf_prog_select_interpreter(fp)) jit_needed = true; /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during * blinding, bpf_int_jit_compile() must always return a * valid program, which in this case would simply not * be JITed, but falls back to the interpreter. */ if (!bpf_prog_is_offloaded(fp->aux)) { *err = bpf_prog_alloc_jited_linfo(fp); if (*err) return fp; fp = bpf_prog_jit_compile(env, fp); bpf_prog_jit_attempt_done(fp); if (!fp->jited && jit_needed) { *err = -ENOTSUPP; return fp; } } else { *err = bpf_prog_offload_compile(fp); if (*err) return fp; } finalize: *err = bpf_prog_lock_ro(fp); if (*err) return fp; /* The tail call compatibility check can only be done at * this late stage as we need to determine, if we deal * with JITed or non JITed program concatenations and not * all eBPF JITs might immediately support all features. */ *err = bpf_check_tail_call(fp); return fp; } /** * bpf_prog_select_runtime - select exec runtime for BPF program * @fp: bpf_prog populated with BPF program * @err: pointer to error variable * * Try to JIT eBPF program, if JIT is not available, use interpreter. * The BPF program will be executed via bpf_prog_run() function. * * Return: the &fp argument along with &err set to 0 for success or * a negative errno code on failure */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { return __bpf_prog_select_runtime(NULL, fp, err); } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); static unsigned int __bpf_prog_ret1(const void *ctx, const struct bpf_insn *insn) { return 1; } static struct bpf_prog_dummy { struct bpf_prog prog; } dummy_bpf_prog = { .prog = { .bpf_func = __bpf_prog_ret1, }, }; struct bpf_prog_array bpf_empty_prog_array = { .items = { { .prog = NULL }, }, }; EXPORT_SYMBOL(bpf_empty_prog_array); struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) { struct bpf_prog_array *p; if (prog_cnt) p = kzalloc_flex(*p, items, prog_cnt + 1, flags); else p = &bpf_empty_prog_array; return p; } void bpf_prog_array_free(struct bpf_prog_array *progs) { if (!progs || progs == &bpf_empty_prog_array) return; kfree_rcu(progs, rcu); } static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu) { struct bpf_prog_array *progs; /* * RCU Tasks Trace grace period implies RCU grace period, there is no * need to call kfree_rcu(), just call kfree() directly. 
*/ progs = container_of(rcu, struct bpf_prog_array, rcu); kfree(progs); } void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs) { if (!progs || progs == &bpf_empty_prog_array) return; call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb); } int bpf_prog_array_length(struct bpf_prog_array *array) { struct bpf_prog_array_item *item; u32 cnt = 0; for (item = array->items; item->prog; item++) if (item->prog != &dummy_bpf_prog.prog) cnt++; return cnt; } bool bpf_prog_array_is_empty(struct bpf_prog_array *array) { struct bpf_prog_array_item *item; for (item = array->items; item->prog; item++) if (item->prog != &dummy_bpf_prog.prog) return false; return true; } static bool bpf_prog_array_copy_core(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt) { struct bpf_prog_array_item *item; int i = 0; for (item = array->items; item->prog; item++) { if (item->prog == &dummy_bpf_prog.prog) continue; prog_ids[i] = item->prog->aux->id; if (++i == request_cnt) { item++; break; } } return !!(item->prog); } int bpf_prog_array_copy_to_user(struct bpf_prog_array *array, __u32 __user *prog_ids, u32 cnt) { unsigned long err = 0; bool nospc; u32 *ids; /* users of this function are doing: * cnt = bpf_prog_array_length(); * if (cnt > 0) * bpf_prog_array_copy_to_user(..., cnt); * so below kcalloc doesn't need extra cnt > 0 check. */ ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN); if (!ids) return -ENOMEM; nospc = bpf_prog_array_copy_core(array, ids, cnt); err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); kfree(ids); if (err) return -EFAULT; if (nospc) return -ENOSPC; return 0; } void bpf_prog_array_delete_safe(struct bpf_prog_array *array, struct bpf_prog *old_prog) { struct bpf_prog_array_item *item; for (item = array->items; item->prog; item++) if (item->prog == old_prog) { WRITE_ONCE(item->prog, &dummy_bpf_prog.prog); break; } } /** * bpf_prog_array_delete_safe_at() - Replaces the program at the given * index into the program array with * a dummy no-op program. * @array: a bpf_prog_array * @index: the index of the program to replace * * Skips over dummy programs, by not counting them, when calculating * the position of the program to replace. * * Return: * * 0 - Success * * -EINVAL - Invalid index value. Must be a non-negative integer. * * -ENOENT - Index out of range */ int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index) { return bpf_prog_array_update_at(array, index, &dummy_bpf_prog.prog); } /** * bpf_prog_array_update_at() - Updates the program at the given index * into the program array. * @array: a bpf_prog_array * @index: the index of the program to update * @prog: the program to insert into the array * * Skips over dummy programs, by not counting them, when calculating * the position of the program to update. * * Return: * * 0 - Success * * -EINVAL - Invalid index value. Must be a non-negative integer. 
* * -ENOENT - Index out of range */ int bpf_prog_array_update_at(struct bpf_prog_array *array, int index, struct bpf_prog *prog) { struct bpf_prog_array_item *item; if (unlikely(index < 0)) return -EINVAL; for (item = array->items; item->prog; item++) { if (item->prog == &dummy_bpf_prog.prog) continue; if (!index) { WRITE_ONCE(item->prog, prog); return 0; } index--; } return -ENOENT; } int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, u64 bpf_cookie, struct bpf_prog_array **new_array) { int new_prog_cnt, carry_prog_cnt = 0; struct bpf_prog_array_item *existing, *new; struct bpf_prog_array *array; bool found_exclude = false; /* Figure out how many existing progs we need to carry over to * the new array. */ if (old_array) { existing = old_array->items; for (; existing->prog; existing++) { if (existing->prog == exclude_prog) { found_exclude = true; continue; } if (existing->prog != &dummy_bpf_prog.prog) carry_prog_cnt++; if (existing->prog == include_prog) return -EEXIST; } } if (exclude_prog && !found_exclude) return -ENOENT; /* How many progs (not NULL) will be in the new array? */ new_prog_cnt = carry_prog_cnt; if (include_prog) new_prog_cnt += 1; /* Do we have any prog (not NULL) in the new array? */ if (!new_prog_cnt) { *new_array = NULL; return 0; } /* +1 as the end of prog_array is marked with NULL */ array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL); if (!array) return -ENOMEM; new = array->items; /* Fill in the new prog array */ if (carry_prog_cnt) { existing = old_array->items; for (; existing->prog; existing++) { if (existing->prog == exclude_prog || existing->prog == &dummy_bpf_prog.prog) continue; new->prog = existing->prog; new->bpf_cookie = existing->bpf_cookie; new++; } } if (include_prog) { new->prog = include_prog; new->bpf_cookie = bpf_cookie; new++; } new->prog = NULL; *new_array = array; return 0; } int bpf_prog_array_copy_info(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt, u32 *prog_cnt) { u32 cnt = 0; if (array) cnt = bpf_prog_array_length(array); *prog_cnt = cnt; /* return early if user requested only program count or nothing to copy */ if (!request_cnt || !cnt) return 0; /* this function is called under trace/bpf_trace.c: bpf_event_mutex */ return bpf_prog_array_copy_core(array, prog_ids, request_cnt) ? 
-ENOSPC : 0; } void __bpf_free_used_maps(struct bpf_prog_aux *aux, struct bpf_map **used_maps, u32 len) { struct bpf_map *map; bool sleepable; u32 i; sleepable = aux->prog->sleepable; for (i = 0; i < len; i++) { map = used_maps[i]; if (map->ops->map_poke_untrack) map->ops->map_poke_untrack(map, aux); if (sleepable) atomic64_dec(&map->sleepable_refcnt); bpf_map_put(map); } } static void bpf_free_used_maps(struct bpf_prog_aux *aux) { __bpf_free_used_maps(aux, aux->used_maps, aux->used_map_cnt); kfree(aux->used_maps); } void __bpf_free_used_btfs(struct btf_mod_pair *used_btfs, u32 len) { #ifdef CONFIG_BPF_SYSCALL struct btf_mod_pair *btf_mod; u32 i; for (i = 0; i < len; i++) { btf_mod = &used_btfs[i]; if (btf_mod->module) module_put(btf_mod->module); btf_put(btf_mod->btf); } #endif } static void bpf_free_used_btfs(struct bpf_prog_aux *aux) { __bpf_free_used_btfs(aux->used_btfs, aux->used_btf_cnt); kfree(aux->used_btfs); } static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; int i; aux = container_of(work, struct bpf_prog_aux, work); #ifdef CONFIG_BPF_SYSCALL bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab); bpf_prog_stream_free(aux->prog); #endif #ifdef CONFIG_CGROUP_BPF if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID) bpf_cgroup_atype_put(aux->cgroup_atype); #endif bpf_free_used_maps(aux); bpf_free_used_btfs(aux); bpf_prog_disassoc_struct_ops(aux->prog); if (bpf_prog_is_dev_bound(aux)) bpf_prog_dev_bound_destroy(aux->prog); #ifdef CONFIG_PERF_EVENTS if (aux->prog->has_callchain_buf) put_callchain_buffers(); #endif if (aux->dst_trampoline) bpf_trampoline_put(aux->dst_trampoline); for (i = 0; i < aux->real_func_cnt; i++) { /* We can just unlink the subprog poke descriptor table as * it was originally linked to the main program and is also * released along with it. */ aux->func[i]->aux->poke_tab = NULL; bpf_jit_free(aux->func[i]); } if (aux->real_func_cnt) { kfree(aux->func); bpf_prog_unlock_free(aux->prog); } else { bpf_jit_free(aux->prog); } } void bpf_prog_free(struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; if (aux->dst_prog) bpf_prog_put(aux->dst_prog); bpf_token_put(aux->token); INIT_WORK(&aux->work, bpf_prog_free_deferred); schedule_work(&aux->work); } EXPORT_SYMBOL_GPL(bpf_prog_free); /* RNG for unprivileged user space with separated state from prandom_u32(). */ static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state); void bpf_user_rnd_init_once(void) { prandom_init_once(&bpf_user_rnd_state); } BPF_CALL_0(bpf_user_rnd_u32) { /* Should someone ever have the rather unwise idea to use some * of the registers passed into this function, then note that * this function is called from native eBPF and classic-to-eBPF * transformations. Register assignments from both sides are * different, f.e. classic always sets fn(ctx, A, X) here. */ struct rnd_state *state; u32 res; state = &get_cpu_var(bpf_user_rnd_state); res = prandom_u32_state(state); put_cpu_var(bpf_user_rnd_state); return res; } BPF_CALL_0(bpf_get_raw_cpu_id) { return raw_smp_processor_id(); } /* Weak definitions of helper functions in case we don't have bpf syscall. 
*/ const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_map_push_elem_proto __weak; const struct bpf_func_proto bpf_map_pop_elem_proto __weak; const struct bpf_func_proto bpf_map_peek_elem_proto __weak; const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto __weak; const struct bpf_func_proto bpf_spin_lock_proto __weak; const struct bpf_func_proto bpf_spin_unlock_proto __weak; const struct bpf_func_proto bpf_jiffies64_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_tai_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_snprintf_btf_proto __weak; const struct bpf_func_proto bpf_seq_printf_btf_proto __weak; const struct bpf_func_proto bpf_set_retval_proto __weak; const struct bpf_func_proto bpf_get_retval_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { return NULL; } const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void) { return NULL; } const struct bpf_func_proto * __weak bpf_get_perf_event_read_value_proto(void) { return NULL; } u64 __weak bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { return -ENOTSUPP; } EXPORT_SYMBOL_GPL(bpf_event_output); /* Always built-in helper functions. */ const struct bpf_func_proto bpf_tail_call_proto = { /* func is unused for tail_call, we set it to pass the * get_helper_proto check */ .func = BPF_PTR_POISON, .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; /* Stub for JITs that only support cBPF. eBPF programs are interpreted. * It is encouraged to implement bpf_int_jit_compile() instead, so that * eBPF and implicitly also cBPF can get JITed! */ struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog) { return prog; } /* Stub for JITs that support eBPF. All cBPF code gets transformed into * eBPF by the kernel and is later compiled by bpf_int_jit_compile(). */ void __weak bpf_jit_compile(struct bpf_prog *prog) { } bool __weak bpf_helper_changes_pkt_data(enum bpf_func_id func_id) { return false; } /* Return TRUE if the JIT backend wants verifier to enable sub-register usage * analysis code and wants explicit zero extension inserted by verifier. * Otherwise, return FALSE. * * The verifier inserts an explicit zero extension after BPF_CMPXCHGs even if * you don't override this. JITs that don't want these extra insns can detect * them using insn_is_zext. 
*/ bool __weak bpf_jit_needs_zext(void) { return false; } /* By default, enable the verifier's mitigations against Spectre v1 and v4 for * all archs. The value returned must not change at runtime as there is * currently no support for reloading programs that were loaded without * mitigations. */ bool __weak bpf_jit_bypass_spec_v1(void) { return false; } bool __weak bpf_jit_bypass_spec_v4(void) { return false; } /* Return true if the JIT inlines the call to the helper corresponding to * the imm. * * The verifier will not patch the insn->imm for the call to the helper if * this returns true. */ bool __weak bpf_jit_inlines_helper_call(s32 imm) { return false; } /* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */ bool __weak bpf_jit_supports_subprog_tailcalls(void) { return false; } bool __weak bpf_jit_supports_percpu_insn(void) { return false; } bool __weak bpf_jit_supports_kfunc_call(void) { return false; } bool __weak bpf_jit_supports_far_kfunc_call(void) { return false; } bool __weak bpf_jit_supports_arena(void) { return false; } bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) { return false; } bool __weak bpf_jit_supports_fsession(void) { return false; } u64 __weak bpf_arch_uaddress_limit(void) { #if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE) return TASK_SIZE; #else return 0; #endif } /* Return TRUE if the JIT backend satisfies the following two conditions: * 1) JIT backend supports atomic_xchg() on pointer-sized words. * 2) Under the specific arch, the implementation of xchg() is the same * as atomic_xchg() on pointer-sized words. */ bool __weak bpf_jit_supports_ptr_xchg(void) { return false; } /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. */ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) { return -EFAULT; } int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t, enum bpf_text_poke_type new_t, void *old_addr, void *new_addr) { return -ENOTSUPP; } void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len) { return ERR_PTR(-ENOTSUPP); } int __weak bpf_arch_text_invalidate(void *dst, size_t len) { return -ENOTSUPP; } bool __weak bpf_jit_supports_exceptions(void) { return false; } bool __weak bpf_jit_supports_private_stack(void) { return false; } void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie) { } bool __weak bpf_jit_supports_timed_may_goto(void) { return false; } u64 __weak arch_bpf_timed_may_goto(void) { return 0; } static noinline void bpf_prog_report_may_goto_violation(void) { #ifdef CONFIG_BPF_SYSCALL struct bpf_stream_stage ss; struct bpf_prog *prog; prog = bpf_prog_find_from_stack(); if (!prog) return; bpf_stream_stage(ss, prog, BPF_STDERR, ({ bpf_stream_printk(ss, "ERROR: Timeout detected for may_goto instruction\n"); bpf_stream_dump_stack(ss); })); #endif } u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p) { u64 time = ktime_get_mono_fast_ns(); /* Populate the timestamp for this stack frame, and refresh count. */ if (!p->timestamp) { p->timestamp = time; return BPF_MAX_TIMED_LOOPS; } /* Check if we've exhausted our time slice, and zero count. */ if (unlikely(time - p->timestamp >= (NSEC_PER_SEC / 4))) { bpf_prog_report_may_goto_violation(); return 0; } /* Refresh the count for the stack frame. 
*/ return BPF_MAX_TIMED_LOOPS; } /* for configs without MMU or 32-bit */ __weak const struct bpf_map_ops arena_map_ops; __weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena) { return 0; } __weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena) { return 0; } #ifdef CONFIG_BPF_SYSCALL static int __init bpf_global_ma_init(void) { int ret; ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false); bpf_global_ma_set = !ret; return ret; } late_initcall(bpf_global_ma_init); #endif DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); EXPORT_SYMBOL(bpf_stats_enabled_key); /* All definitions of tracepoints related to BPF. */ #define CREATE_TRACE_POINTS #include <linux/bpf_trace.h> EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx); #ifdef CONFIG_BPF_SYSCALL void bpf_get_linfo_file_line(struct btf *btf, const struct bpf_line_info *linfo, const char **filep, const char **linep, int *nump) { /* Get base component of the file path. */ if (filep) { *filep = btf_name_by_offset(btf, linfo->file_name_off); *filep = kbasename(*filep); } /* Obtain the source line, and strip whitespace in prefix. */ if (linep) { *linep = btf_name_by_offset(btf, linfo->line_off); while (isspace(**linep)) *linep += 1; } if (nump) *nump = BPF_LINE_INFO_LINE_NUM(linfo->line_col); } const struct bpf_line_info *bpf_find_linfo(const struct bpf_prog *prog, u32 insn_off) { const struct bpf_line_info *linfo; u32 nr_linfo; int l, r, m; nr_linfo = prog->aux->nr_linfo; if (!nr_linfo || insn_off >= prog->len) return NULL; linfo = prog->aux->linfo; /* Loop invariant: linfo[l].insn_off <= insns_off. * linfo[0].insn_off == 0 which always satisfies above condition. * Binary search is searching for rightmost linfo entry that satisfies * the above invariant, giving us the desired record that covers given * instruction offset. */ l = 0; r = nr_linfo - 1; while (l < r) { /* (r - l + 1) / 2 means we break a tie to the right, so if: * l=1, r=2, linfo[l].insn_off <= insn_off, linfo[r].insn_off > insn_off, * then m=2, we see that linfo[m].insn_off > insn_off, and so * r becomes 1 and we exit the loop with correct l==1. * If the tie was broken to the left, m=1 would end us up in * an endless loop where l and m stay at 1 and r stays at 2. */ m = l + (r - l + 1) / 2; if (linfo[m].insn_off <= insn_off) l = m; else r = m - 1; } return &linfo[l]; } int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep, const char **linep, int *nump) { int idx = -1, insn_start, insn_end, len; struct bpf_line_info *linfo; void **jited_linfo; struct btf *btf; int nr_linfo; btf = prog->aux->btf; linfo = prog->aux->linfo; jited_linfo = prog->aux->jited_linfo; if (!btf || !linfo || !jited_linfo) return -EINVAL; len = prog->aux->func ? 
prog->aux->func[prog->aux->func_idx]->len : prog->len; linfo = &prog->aux->linfo[prog->aux->linfo_idx]; jited_linfo = &prog->aux->jited_linfo[prog->aux->linfo_idx]; insn_start = linfo[0].insn_off; insn_end = insn_start + len; nr_linfo = prog->aux->nr_linfo - prog->aux->linfo_idx; for (int i = 0; i < nr_linfo && linfo[i].insn_off >= insn_start && linfo[i].insn_off < insn_end; i++) { if (jited_linfo[i] >= (void *)ip) break; idx = i; } if (idx == -1) return -ENOENT; bpf_get_linfo_file_line(btf, &linfo[idx], filep, linep, nump); return 0; } struct walk_stack_ctx { struct bpf_prog *prog; }; static bool find_from_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp) { struct walk_stack_ctx *ctxp = cookie; struct bpf_prog *prog; /* * The RCU read lock is held to safely traverse the latch tree, but we * don't need its protection when accessing the prog, since it has an * active stack frame on the current stack trace, and won't disappear. */ rcu_read_lock(); prog = bpf_prog_ksym_find(ip); rcu_read_unlock(); if (!prog) return true; /* Make sure we return the main prog if we found a subprog */ ctxp->prog = prog->aux->main_prog_aux->prog; return false; } struct bpf_prog *bpf_prog_find_from_stack(void) { struct walk_stack_ctx ctx = {}; arch_bpf_stack_walk(find_from_stack_cb, &ctx); return ctx.prog; } #endif
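/*
 * Standalone sketch, for illustration only, of the "rightmost entry with
 * insn_off <= target" binary search that bpf_find_linfo() above relies on.
 * The helper name and the offs[] array are hypothetical stand-ins for
 * prog->aux->linfo[] and its insn_off field; offs[] must be sorted and
 * start at 0 so that the invariant offs[l] <= target holds from the first
 * iteration, matching the tie-break-to-the-right comment in the original.
 */
static int find_rightmost_le(const unsigned int *offs, int n, unsigned int target)
{
	int l = 0, r = n - 1, m;

	while (l < r) {
		m = l + (r - l + 1) / 2;	/* round up: break ties to the right */
		if (offs[m] <= target)
			l = m;
		else
			r = m - 1;
	}
	return l;
}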
// SPDX-License-Identifier: GPL-2.0 /* * Functions related to segment and merge handling */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/blk-integrity.h> #include <linux/part_stat.h> #include <linux/blk-cgroup.h> #include <trace/events/block.h> #include "blk.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" #include "blk-throttle.h" static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv) { *bv = mp_bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); } static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv) { struct bvec_iter iter = bio->bi_iter; int idx; bio_get_first_bvec(bio, bv); if (bv->bv_len == bio->bi_iter.bi_size) return; /* this bio only has a single bvec */ bio_advance_iter(bio, &iter, iter.bi_size); if (!iter.bi_bvec_done) idx = iter.bi_idx - 1; else /* in the middle of bvec */ idx = iter.bi_idx; *bv = bio->bi_io_vec[idx]; /* * iter.bi_bvec_done records actual length of the last bvec * if this bio ends in the middle of one io vector */ if (iter.bi_bvec_done) bv->bv_len = iter.bi_bvec_done; } static inline bool bio_will_gap(struct request_queue *q, struct request *prev_rq, struct bio *prev, struct bio *next) { struct bio_vec pb, nb; if (!bio_has_data(prev) || !queue_virt_boundary(q)) return false; /* * Don't merge if the 1st bio starts with non-zero offset, otherwise it * is quite difficult to respect the sg gap limit. We work hard to * merge a huge number of small single bios in case of mkfs.
*/ if (prev_rq) bio_get_first_bvec(prev_rq->bio, &pb); else bio_get_first_bvec(prev, &pb); if (pb.bv_offset & queue_virt_boundary(q)) return true; /* * We don't need to worry about the situation that the merged segment * ends in unaligned virt boundary: * * - if 'pb' ends aligned, the merged segment ends aligned * - if 'pb' ends unaligned, the next bio must include * one single bvec of 'nb', otherwise the 'nb' can't * merge with 'pb' */ bio_get_last_bvec(prev, &pb); bio_get_first_bvec(next, &nb); if (biovec_phys_mergeable(q, &pb, &nb)) return false; return __bvec_gap_to_prev(&q->limits, &pb, nb.bv_offset); } static inline bool req_gap_back_merge(struct request *req, struct bio *bio) { return bio_will_gap(req->q, req, req->biotail, bio); } static inline bool req_gap_front_merge(struct request *req, struct bio *bio) { return bio_will_gap(req->q, NULL, bio, req->bio); } /* * The maximum size that a bio can fit has to be aligned down to the * logical block size, which is the minimum accepted unit by hardware. */ static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim) { return round_down(BIO_MAX_SIZE, lim->logical_block_size) >> SECTOR_SHIFT; } /* * bio_submit_split_bioset - Submit a bio, splitting it at a designated sector * @bio: the original bio to be submitted and split * @split_sectors: the sector count at which to split * @bs: the bio set used for allocating the new split bio * * The original bio is modified to contain the remaining sectors and submitted. * The caller is responsible for submitting the returned bio. * * If succeed, the newly allocated bio representing the initial part will be * returned, on failure NULL will be returned and original bio will fail. */ struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors, struct bio_set *bs) { struct bio *split = bio_split(bio, split_sectors, GFP_NOIO, bs); if (IS_ERR(split)) { bio->bi_status = errno_to_blk_status(PTR_ERR(split)); bio_endio(bio); return NULL; } bio_chain(split, bio); trace_block_split(split, bio->bi_iter.bi_sector); WARN_ON_ONCE(bio_zone_write_plugging(bio)); if (should_fail_bio(bio)) bio_io_error(bio); else if (!blk_throtl_bio(bio)) submit_bio_noacct_nocheck(bio, true); return split; } EXPORT_SYMBOL_GPL(bio_submit_split_bioset); static struct bio *bio_submit_split(struct bio *bio, int split_sectors) { if (unlikely(split_sectors < 0)) { bio->bi_status = errno_to_blk_status(split_sectors); bio_endio(bio); return NULL; } if (split_sectors) { bio = bio_submit_split_bioset(bio, split_sectors, &bio->bi_bdev->bd_disk->bio_split); if (bio) bio->bi_opf |= REQ_NOMERGE; } return bio; } static struct bio *__bio_split_discard(struct bio *bio, const struct queue_limits *lim, unsigned *nsegs, unsigned int max_sectors) { unsigned int max_discard_sectors, granularity; sector_t tmp; unsigned split_sectors; *nsegs = 1; granularity = max(lim->discard_granularity >> 9, 1U); max_discard_sectors = min(max_sectors, bio_allowed_max_sectors(lim)); max_discard_sectors -= max_discard_sectors % granularity; if (unlikely(!max_discard_sectors)) return bio; if (bio_sectors(bio) <= max_discard_sectors) return bio; split_sectors = max_discard_sectors; /* * If the next starting sector would be misaligned, stop the discard at * the previous aligned sector. 
*/ tmp = bio->bi_iter.bi_sector + split_sectors - ((lim->discard_alignment >> 9) % granularity); tmp = sector_div(tmp, granularity); if (split_sectors > tmp) split_sectors -= tmp; return bio_submit_split(bio, split_sectors); } struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, unsigned *nsegs) { unsigned int max_sectors; if (bio_op(bio) == REQ_OP_SECURE_ERASE) max_sectors = lim->max_secure_erase_sectors; else max_sectors = lim->max_discard_sectors; return __bio_split_discard(bio, lim, nsegs, max_sectors); } static inline unsigned int blk_boundary_sectors(const struct queue_limits *lim, bool is_atomic) { /* * chunk_sectors must be a multiple of atomic_write_boundary_sectors if * both non-zero. */ if (is_atomic && lim->atomic_write_boundary_sectors) return lim->atomic_write_boundary_sectors; return lim->chunk_sectors; } /* * Return the maximum number of sectors from the start of a bio that may be * submitted as a single request to a block device. If enough sectors remain, * align the end to the physical block size. Otherwise align the end to the * logical block size. This approach minimizes the number of non-aligned * requests that are submitted to a block device if the start of a bio is not * aligned to a physical block boundary. */ static inline unsigned get_max_io_size(struct bio *bio, const struct queue_limits *lim) { unsigned pbs = lim->physical_block_size >> SECTOR_SHIFT; unsigned lbs = lim->logical_block_size >> SECTOR_SHIFT; bool is_atomic = bio->bi_opf & REQ_ATOMIC; unsigned boundary_sectors = blk_boundary_sectors(lim, is_atomic); unsigned max_sectors, start, end; /* * We ignore lim->max_sectors for atomic writes because it may less * than the actual bio size, which we cannot tolerate. */ if (bio_op(bio) == REQ_OP_WRITE_ZEROES) max_sectors = lim->max_write_zeroes_sectors; else if (is_atomic) max_sectors = lim->atomic_write_max_sectors; else max_sectors = lim->max_sectors; if (boundary_sectors) { max_sectors = min(max_sectors, blk_boundary_sectors_left(bio->bi_iter.bi_sector, boundary_sectors)); } start = bio->bi_iter.bi_sector & (pbs - 1); end = (start + max_sectors) & ~(pbs - 1); if (end > start) return end - start; return max_sectors & ~(lbs - 1); } /** * bvec_split_segs - verify whether or not a bvec should be split in the middle * @lim: [in] queue limits to split based on * @bv: [in] bvec to examine * @nsegs: [in,out] Number of segments in the bio being built. Incremented * by the number of segments from @bv that may be appended to that * bio without exceeding @max_segs * @bytes: [in,out] Number of bytes in the bio being built. Incremented * by the number of bytes from @bv that may be appended to that * bio without exceeding @max_bytes * @max_segs: [in] upper bound for *@nsegs * @max_bytes: [in] upper bound for *@bytes * * When splitting a bio, it can happen that a bvec is encountered that is too * big to fit in a single segment and hence that it has to be split in the * middle. This function verifies whether or not that should happen. The value * %true is returned if and only if appending the entire @bv to a bio with * *@nsegs segments and *@sectors sectors would make that bio unacceptable for * the block driver. 
*/ static bool bvec_split_segs(const struct queue_limits *lim, const struct bio_vec *bv, unsigned *nsegs, unsigned *bytes, unsigned max_segs, unsigned max_bytes) { unsigned max_len = max_bytes - *bytes; unsigned len = min(bv->bv_len, max_len); unsigned total_len = 0; unsigned seg_size = 0; while (len && *nsegs < max_segs) { seg_size = get_max_segment_size(lim, bvec_phys(bv) + total_len, len); (*nsegs)++; total_len += seg_size; len -= seg_size; if ((bv->bv_offset + total_len) & lim->virt_boundary_mask) break; } *bytes += total_len; /* tell the caller to split the bvec if it is too big to fit */ return len > 0 || bv->bv_len > max_len; } static unsigned int bio_split_alignment(struct bio *bio, const struct queue_limits *lim) { if (op_is_write(bio_op(bio)) && lim->zone_write_granularity) return lim->zone_write_granularity; return lim->logical_block_size; } static inline unsigned int bvec_seg_gap(struct bio_vec *bvprv, struct bio_vec *bv) { return bv->bv_offset | (bvprv->bv_offset + bvprv->bv_len); } /** * bio_split_io_at - check if and where to split a bio * @bio: [in] bio to be split * @lim: [in] queue limits to split based on * @segs: [out] number of segments in the bio with the first half of the sectors * @max_bytes: [in] maximum number of bytes per bio * @len_align_mask: [in] length alignment mask for each vector * * Find out if @bio needs to be split to fit the queue limits in @lim and a * maximum size of @max_bytes. Returns a negative error number if @bio can't be * split, 0 if the bio doesn't have to be split, or a positive sector offset if * @bio needs to be split. */ int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, unsigned *segs, unsigned max_bytes, unsigned len_align_mask) { struct bio_crypt_ctx *bc = bio_crypt_ctx(bio); struct bio_vec bv, bvprv, *bvprvp = NULL; unsigned nsegs = 0, bytes = 0, gaps = 0; struct bvec_iter iter; unsigned start_align_mask = lim->dma_alignment; if (bc) { start_align_mask |= (bc->bc_key->crypto_cfg.data_unit_size - 1); len_align_mask |= (bc->bc_key->crypto_cfg.data_unit_size - 1); } bio_for_each_bvec(bv, bio, iter) { if (bv.bv_offset & start_align_mask || bv.bv_len & len_align_mask) return -EINVAL; /* * If the queue doesn't support SG gaps and adding this * offset would create a gap, disallow it. */ if (bvprvp) { if (bvec_gap_to_prev(lim, bvprvp, bv.bv_offset)) goto split; gaps |= bvec_seg_gap(bvprvp, &bv); } if (nsegs < lim->max_segments && bytes + bv.bv_len <= max_bytes && bv.bv_offset + bv.bv_len <= lim->max_fast_segment_size) { nsegs++; bytes += bv.bv_len; } else { if (bvec_split_segs(lim, &bv, &nsegs, &bytes, lim->max_segments, max_bytes)) goto split; } bvprv = bv; bvprvp = &bvprv; } *segs = nsegs; bio->bi_bvec_gap_bit = ffs(gaps); return 0; split: if (bio->bi_opf & REQ_ATOMIC) return -EINVAL; /* * We can't sanely support splitting for a REQ_NOWAIT bio. End it * with EAGAIN if splitting is required and return an error pointer. */ if (bio->bi_opf & REQ_NOWAIT) return -EAGAIN; *segs = nsegs; /* * Individual bvecs might not be logical block aligned. Round down the * split size so that each bio is properly block size aligned, even if * we do not use the full hardware limits. * * It is possible to submit a bio that can't be split into a valid io: * there may either be too many discontiguous vectors for the max * segments limit, or contain virtual boundary gaps without having a * valid block sized split. A zero byte result means one of those * conditions occured. 
*/ bytes = ALIGN_DOWN(bytes, bio_split_alignment(bio, lim)); if (!bytes) return -EINVAL; /* * Bio splitting may cause subtle trouble such as hang when doing sync * iopoll in direct IO routine. Given performance gain of iopoll for * big IO can be trival, disable iopoll when split needed. */ bio_clear_polled(bio); bio->bi_bvec_gap_bit = ffs(gaps); return bytes >> SECTOR_SHIFT; } EXPORT_SYMBOL_GPL(bio_split_io_at); struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, unsigned *nr_segs) { return bio_submit_split(bio, bio_split_rw_at(bio, lim, nr_segs, get_max_io_size(bio, lim) << SECTOR_SHIFT)); } /* * REQ_OP_ZONE_APPEND bios must never be split by the block layer. * * But we want the nr_segs calculation provided by bio_split_rw_at, and having * a good sanity check that the submitter built the bio correctly is nice to * have as well. */ struct bio *bio_split_zone_append(struct bio *bio, const struct queue_limits *lim, unsigned *nr_segs) { int split_sectors; split_sectors = bio_split_rw_at(bio, lim, nr_segs, lim->max_zone_append_sectors << SECTOR_SHIFT); if (WARN_ON_ONCE(split_sectors > 0)) split_sectors = -EINVAL; return bio_submit_split(bio, split_sectors); } struct bio *bio_split_write_zeroes(struct bio *bio, const struct queue_limits *lim, unsigned *nsegs) { unsigned int max_sectors = get_max_io_size(bio, lim); *nsegs = 0; /* * An unset limit should normally not happen, as bio submission is keyed * off having a non-zero limit. But SCSI can clear the limit in the * I/O completion handler, and we can race and see this. Splitting to a * zero limit obviously doesn't make sense, so band-aid it here. */ if (!max_sectors) return bio; if (bio_sectors(bio) <= max_sectors) return bio; return bio_submit_split(bio, max_sectors); } /** * bio_split_to_limits - split a bio to fit the queue limits * @bio: bio to be split * * Check if @bio needs splitting based on the queue limits of @bio->bi_bdev, and * if so split off a bio fitting the limits from the beginning of @bio and * return it. @bio is shortened to the remainder and re-submitted. * * The split bio is allocated from @q->bio_split, which is provided by the * block layer. 
*/ struct bio *bio_split_to_limits(struct bio *bio) { unsigned int nr_segs; return __bio_split_to_limits(bio, bdev_limits(bio->bi_bdev), &nr_segs); } EXPORT_SYMBOL(bio_split_to_limits); unsigned int blk_recalc_rq_segments(struct request *rq) { unsigned int nr_phys_segs = 0; unsigned int bytes = 0; struct req_iterator iter; struct bio_vec bv; if (!rq->bio) return 0; switch (bio_op(rq->bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: if (queue_max_discard_segments(rq->q) > 1) { struct bio *bio = rq->bio; for_each_bio(bio) nr_phys_segs++; return nr_phys_segs; } return 1; case REQ_OP_WRITE_ZEROES: return 0; default: break; } rq_for_each_bvec(bv, rq, iter) bvec_split_segs(&rq->q->limits, &bv, &nr_phys_segs, &bytes, UINT_MAX, BIO_MAX_SIZE); return nr_phys_segs; } static inline unsigned int blk_rq_get_max_sectors(struct request *rq, sector_t offset) { struct request_queue *q = rq->q; struct queue_limits *lim = &q->limits; unsigned int max_sectors, boundary_sectors; bool is_atomic = rq->cmd_flags & REQ_ATOMIC; if (blk_rq_is_passthrough(rq)) return q->limits.max_hw_sectors; boundary_sectors = blk_boundary_sectors(lim, is_atomic); max_sectors = blk_queue_get_max_sectors(rq); if (!boundary_sectors || req_op(rq) == REQ_OP_DISCARD || req_op(rq) == REQ_OP_SECURE_ERASE) return max_sectors; return min(max_sectors, blk_boundary_sectors_left(offset, boundary_sectors)); } static inline int ll_new_hw_segment(struct request *req, struct bio *bio, unsigned int nr_phys_segs) { if (!blk_cgroup_mergeable(req, bio)) goto no_merge; if (blk_integrity_merge_bio(req->q, req, bio) == false) goto no_merge; /* discard request merge won't add new segment */ if (req_op(req) == REQ_OP_DISCARD) return 1; if (req->nr_phys_segments + nr_phys_segs > blk_rq_get_max_segments(req)) goto no_merge; /* * This will form the start of a new hw segment. Bump both * counters. 
*/ req->nr_phys_segments += nr_phys_segs; if (bio_integrity(bio)) req->nr_integrity_segments += blk_rq_count_integrity_sg(req->q, bio); return 1; no_merge: req_set_nomerge(req->q, req); return 0; } int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs) { if (req_gap_back_merge(req, bio)) return 0; if (blk_integrity_rq(req) && integrity_req_gap_back_merge(req, bio)) return 0; if (!bio_crypt_ctx_back_mergeable(req, bio)) return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req, blk_rq_pos(req))) { req_set_nomerge(req->q, req); return 0; } return ll_new_hw_segment(req, bio, nr_segs); } static int ll_front_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs) { if (req_gap_front_merge(req, bio)) return 0; if (blk_integrity_rq(req) && integrity_req_gap_front_merge(req, bio)) return 0; if (!bio_crypt_ctx_front_mergeable(req, bio)) return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req, bio->bi_iter.bi_sector)) { req_set_nomerge(req->q, req); return 0; } return ll_new_hw_segment(req, bio, nr_segs); } static bool req_attempt_discard_merge(struct request_queue *q, struct request *req, struct request *next) { unsigned short segments = blk_rq_nr_discard_segments(req); if (segments >= queue_max_discard_segments(q)) goto no_merge; if (blk_rq_sectors(req) + bio_sectors(next->bio) > blk_rq_get_max_sectors(req, blk_rq_pos(req))) goto no_merge; req->nr_phys_segments = segments + blk_rq_nr_discard_segments(next); return true; no_merge: req_set_nomerge(q, req); return false; } static int ll_merge_requests_fn(struct request_queue *q, struct request *req, struct request *next) { int total_phys_segments; if (req_gap_back_merge(req, next->bio)) return 0; /* * Will it become too large? */ if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > blk_rq_get_max_sectors(req, blk_rq_pos(req))) return 0; total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; if (total_phys_segments > blk_rq_get_max_segments(req)) return 0; if (!blk_cgroup_mergeable(req, next->bio)) return 0; if (blk_integrity_merge_rq(q, req, next) == false) return 0; if (!bio_crypt_ctx_merge_rq(req, next)) return 0; /* Merge is OK... */ req->nr_phys_segments = total_phys_segments; req->nr_integrity_segments += next->nr_integrity_segments; return 1; } /** * blk_rq_set_mixed_merge - mark a request as mixed merge * @rq: request to mark as mixed merge * * Description: * @rq is about to be mixed merged. Make sure the attributes * which can be mixed are set in each bio and mark @rq as mixed * merged. */ static void blk_rq_set_mixed_merge(struct request *rq) { blk_opf_t ff = rq->cmd_flags & REQ_FAILFAST_MASK; struct bio *bio; if (rq->rq_flags & RQF_MIXED_MERGE) return; /* * @rq will no longer represent mixable attributes for all the * contained bios. It will just track those of the first one. * Distributes the attributs to each bio. */ for (bio = rq->bio; bio; bio = bio->bi_next) { WARN_ON_ONCE((bio->bi_opf & REQ_FAILFAST_MASK) && (bio->bi_opf & REQ_FAILFAST_MASK) != ff); bio->bi_opf |= ff; } rq->rq_flags |= RQF_MIXED_MERGE; } static inline blk_opf_t bio_failfast(const struct bio *bio) { if (bio->bi_opf & REQ_RAHEAD) return REQ_FAILFAST_MASK; return bio->bi_opf & REQ_FAILFAST_MASK; } /* * After we are marked as MIXED_MERGE, any new RA bio has to be updated * as failfast, and request's failfast has to be updated in case of * front merge. 
*/ static inline void blk_update_mixed_merge(struct request *req, struct bio *bio, bool front_merge) { if (req->rq_flags & RQF_MIXED_MERGE) { if (bio->bi_opf & REQ_RAHEAD) bio->bi_opf |= REQ_FAILFAST_MASK; if (front_merge) { req->cmd_flags &= ~REQ_FAILFAST_MASK; req->cmd_flags |= bio->bi_opf & REQ_FAILFAST_MASK; } } } static void blk_account_io_merge_request(struct request *req) { if (req->rq_flags & RQF_IO_STAT) { part_stat_lock(); part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); part_stat_local_dec(req->part, in_flight[op_is_write(req_op(req))]); part_stat_unlock(); } } static enum elv_merge blk_try_req_merge(struct request *req, struct request *next) { if (blk_discard_mergable(req)) return ELEVATOR_DISCARD_MERGE; else if (blk_rq_pos(req) + blk_rq_sectors(req) == blk_rq_pos(next)) return ELEVATOR_BACK_MERGE; return ELEVATOR_NO_MERGE; } static bool blk_atomic_write_mergeable_rq_bio(struct request *rq, struct bio *bio) { return (rq->cmd_flags & REQ_ATOMIC) == (bio->bi_opf & REQ_ATOMIC); } static bool blk_atomic_write_mergeable_rqs(struct request *rq, struct request *next) { return (rq->cmd_flags & REQ_ATOMIC) == (next->cmd_flags & REQ_ATOMIC); } u8 bio_seg_gap(struct request_queue *q, struct bio *prev, struct bio *next, u8 gaps_bit) { struct bio_vec pb, nb; if (!bio_has_data(prev)) return 0; gaps_bit = min_not_zero(gaps_bit, prev->bi_bvec_gap_bit); gaps_bit = min_not_zero(gaps_bit, next->bi_bvec_gap_bit); bio_get_last_bvec(prev, &pb); bio_get_first_bvec(next, &nb); if (!biovec_phys_mergeable(q, &pb, &nb)) gaps_bit = min_not_zero(gaps_bit, ffs(bvec_seg_gap(&pb, &nb))); return gaps_bit; } /* * For non-mq, this has to be called with the request spinlock acquired. * For mq with scheduling, the appropriate queue wide lock should be held. */ static struct request *attempt_merge(struct request_queue *q, struct request *req, struct request *next) { if (!rq_mergeable(req) || !rq_mergeable(next)) return NULL; if (req_op(req) != req_op(next)) return NULL; if (req->bio->bi_write_hint != next->bio->bi_write_hint) return NULL; if (req->bio->bi_write_stream != next->bio->bi_write_stream) return NULL; if (req->bio->bi_ioprio != next->bio->bi_ioprio) return NULL; if (!blk_atomic_write_mergeable_rqs(req, next)) return NULL; /* * If we are allowed to merge, then append bio list * from next to rq and release next. merge_requests_fn * will have updated segment counts, update sector * counts here. Handle DISCARDs separately, as they * have separate settings. */ switch (blk_try_req_merge(req, next)) { case ELEVATOR_DISCARD_MERGE: if (!req_attempt_discard_merge(q, req, next)) return NULL; break; case ELEVATOR_BACK_MERGE: if (!ll_merge_requests_fn(q, req, next)) return NULL; break; default: return NULL; } /* * If failfast settings disagree or any of the two is already * a mixed merge, mark both as mixed before proceeding. This * makes sure that all involved bios have mixable attributes * set properly. */ if (((req->rq_flags | next->rq_flags) & RQF_MIXED_MERGE) || (req->cmd_flags & REQ_FAILFAST_MASK) != (next->cmd_flags & REQ_FAILFAST_MASK)) { blk_rq_set_mixed_merge(req); blk_rq_set_mixed_merge(next); } /* * At this point we have either done a back merge or front merge. We * need the smaller start_time_ns of the merged requests to be the * current request for accounting purposes. 
*/ if (next->start_time_ns < req->start_time_ns) req->start_time_ns = next->start_time_ns; req->phys_gap_bit = bio_seg_gap(req->q, req->biotail, next->bio, min_not_zero(next->phys_gap_bit, req->phys_gap_bit)); req->biotail->bi_next = next->bio; req->biotail = next->biotail; req->__data_len += blk_rq_bytes(next); if (!blk_discard_mergable(req)) elv_merge_requests(q, req, next); blk_crypto_rq_put_keyslot(next); /* * 'next' is going away, so update stats accordingly */ blk_account_io_merge_request(next); trace_block_rq_merge(next); /* * ownership of bio passed from next to req, return 'next' for * the caller to free */ next->bio = NULL; return next; } static struct request *attempt_back_merge(struct request_queue *q, struct request *rq) { struct request *next = elv_latter_request(q, rq); if (next) return attempt_merge(q, rq, next); return NULL; } static struct request *attempt_front_merge(struct request_queue *q, struct request *rq) { struct request *prev = elv_former_request(q, rq); if (prev) return attempt_merge(q, prev, rq); return NULL; } /* * Try to merge 'next' into 'rq'. Return true if the merge happened, false * otherwise. The caller is responsible for freeing 'next' if the merge * happened. */ bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, struct request *next) { return attempt_merge(q, rq, next); } bool blk_rq_merge_ok(struct request *rq, struct bio *bio) { if (!rq_mergeable(rq) || !bio_mergeable(bio)) return false; if (req_op(rq) != bio_op(bio)) return false; if (!blk_cgroup_mergeable(rq, bio)) return false; if (blk_integrity_merge_bio(rq->q, rq, bio) == false) return false; if (!bio_crypt_rq_ctx_compatible(rq, bio)) return false; if (rq->bio->bi_write_hint != bio->bi_write_hint) return false; if (rq->bio->bi_write_stream != bio->bi_write_stream) return false; if (rq->bio->bi_ioprio != bio->bi_ioprio) return false; if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false) return false; return true; } enum elv_merge blk_try_merge(struct request *rq, struct bio *bio) { if (blk_discard_mergable(rq)) return ELEVATOR_DISCARD_MERGE; else if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector) return ELEVATOR_BACK_MERGE; else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector) return ELEVATOR_FRONT_MERGE; return ELEVATOR_NO_MERGE; } static void blk_account_io_merge_bio(struct request *req) { if (req->rq_flags & RQF_IO_STAT) { part_stat_lock(); part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); part_stat_unlock(); } } enum bio_merge_status bio_attempt_back_merge(struct request *req, struct bio *bio, unsigned int nr_segs) { const blk_opf_t ff = bio_failfast(bio); if (!ll_back_merge_fn(req, bio, nr_segs)) return BIO_MERGE_FAILED; trace_block_bio_backmerge(bio); rq_qos_merge(req->q, req, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) blk_rq_set_mixed_merge(req); blk_update_mixed_merge(req, bio, false); if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING) blk_zone_write_plug_bio_merged(bio); req->phys_gap_bit = bio_seg_gap(req->q, req->biotail, bio, req->phys_gap_bit); req->biotail->bi_next = bio; req->biotail = bio; req->__data_len += bio->bi_iter.bi_size; bio_crypt_free_ctx(bio); blk_account_io_merge_bio(req); return BIO_MERGE_OK; } static enum bio_merge_status bio_attempt_front_merge(struct request *req, struct bio *bio, unsigned int nr_segs) { const blk_opf_t ff = bio_failfast(bio); /* * A front merge for writes to sequential zones of a zoned block device * can happen only if the user submitted writes out of order. 
Do not * merge such write to let it fail. */ if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING) return BIO_MERGE_FAILED; if (!ll_front_merge_fn(req, bio, nr_segs)) return BIO_MERGE_FAILED; trace_block_bio_frontmerge(bio); rq_qos_merge(req->q, req, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) blk_rq_set_mixed_merge(req); blk_update_mixed_merge(req, bio, true); req->phys_gap_bit = bio_seg_gap(req->q, bio, req->bio, req->phys_gap_bit); bio->bi_next = req->bio; req->bio = bio; req->__sector = bio->bi_iter.bi_sector; req->__data_len += bio->bi_iter.bi_size; bio_crypt_do_front_merge(req, bio); blk_account_io_merge_bio(req); return BIO_MERGE_OK; } static enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q, struct request *req, struct bio *bio) { unsigned short segments = blk_rq_nr_discard_segments(req); if (segments >= queue_max_discard_segments(q)) goto no_merge; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req, blk_rq_pos(req))) goto no_merge; rq_qos_merge(q, req, bio); req->biotail->bi_next = bio; req->biotail = bio; req->__data_len += bio->bi_iter.bi_size; req->nr_phys_segments = segments + 1; blk_account_io_merge_bio(req); return BIO_MERGE_OK; no_merge: req_set_nomerge(q, req); return BIO_MERGE_FAILED; } static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q, struct request *rq, struct bio *bio, unsigned int nr_segs, bool sched_allow_merge) { if (!blk_rq_merge_ok(rq, bio)) return BIO_MERGE_NONE; switch (blk_try_merge(rq, bio)) { case ELEVATOR_BACK_MERGE: if (!sched_allow_merge || blk_mq_sched_allow_merge(q, rq, bio)) return bio_attempt_back_merge(rq, bio, nr_segs); break; case ELEVATOR_FRONT_MERGE: if (!sched_allow_merge || blk_mq_sched_allow_merge(q, rq, bio)) return bio_attempt_front_merge(rq, bio, nr_segs); break; case ELEVATOR_DISCARD_MERGE: return bio_attempt_discard_merge(q, rq, bio); default: return BIO_MERGE_NONE; } return BIO_MERGE_FAILED; } /** * blk_attempt_plug_merge - try to merge with %current's plugged list * @q: request_queue new bio is being queued at * @bio: new bio being queued * @nr_segs: number of segments in @bio * from the passed in @q already in the plug list * * Determine whether @bio being queued on @q can be merged with the previous * request on %current's plugged list. Returns %true if merge was successful, * otherwise %false. * * Plugging coalesces IOs from the same issuer for the same purpose without * going through @q->queue_lock. As such it's more of an issuing mechanism * than scheduling, and the request, while may have elvpriv data, is not * added on the elevator at this point. In addition, we don't have * reliable access to the elevator outside queue lock. Only check basic * merging parameters without querying the elevator. * * Caller must ensure !blk_queue_nomerges(q) beforehand. */ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { struct blk_plug *plug = current->plug; struct request *rq; if (!plug || rq_list_empty(&plug->mq_list)) return false; rq = plug->mq_list.tail; if (rq->q == q) return blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == BIO_MERGE_OK; else if (!plug->multiple_queues) return false; rq_list_for_each(&plug->mq_list, rq) { if (rq->q != q) continue; if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == BIO_MERGE_OK) return true; break; } return false; } /* * Iterate list of requests and see if we can merge this bio with any * of them. 
*/ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, struct bio *bio, unsigned int nr_segs) { struct request *rq; int checked = 8; list_for_each_entry_reverse(rq, list, queuelist) { if (!checked--) break; switch (blk_attempt_bio_merge(q, rq, bio, nr_segs, true)) { case BIO_MERGE_NONE: continue; case BIO_MERGE_OK: return true; case BIO_MERGE_FAILED: return false; } } return false; } EXPORT_SYMBOL_GPL(blk_bio_list_merge); bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **merged_request) { struct request *rq; switch (elv_merge(q, &rq, bio)) { case ELEVATOR_BACK_MERGE: if (!blk_mq_sched_allow_merge(q, rq, bio)) return false; if (bio_attempt_back_merge(rq, bio, nr_segs) != BIO_MERGE_OK) return false; *merged_request = attempt_back_merge(q, rq); if (!*merged_request) elv_merged_request(q, rq, ELEVATOR_BACK_MERGE); return true; case ELEVATOR_FRONT_MERGE: if (!blk_mq_sched_allow_merge(q, rq, bio)) return false; if (bio_attempt_front_merge(rq, bio, nr_segs) != BIO_MERGE_OK) return false; *merged_request = attempt_front_merge(q, rq); if (!*merged_request) elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE); return true; case ELEVATOR_DISCARD_MERGE: return bio_attempt_discard_merge(q, rq, bio) == BIO_MERGE_OK; default: return false; } } EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge); |
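/*
 * Illustrative sketch, not part of the blk-merge code above: roughly how an
 * I/O scheduler's bio-merge hook can drive blk_bio_list_merge() against its
 * own list of pending requests.  "example_sched_data", its lock and the
 * example_bio_merge() wiring are hypothetical names for this sketch only;
 * real schedulers keep equivalent structures and hooks of their own.
 */
struct example_sched_data {
	spinlock_t		lock;
	struct list_head	pending;	/* requests not yet dispatched */
};

static bool example_bio_merge(struct request_queue *q, struct bio *bio,
			      unsigned int nr_segs)
{
	struct example_sched_data *sd = q->elevator->elevator_data;
	bool merged;

	spin_lock(&sd->lock);
	/* Walks 'pending' newest-first and tries back/front/discard merges. */
	merged = blk_bio_list_merge(q, &sd->pending, bio, nr_segs);
	spin_unlock(&sd->lock);

	return merged;
}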
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Credentials management - see Documentation/security/credentials.rst
 *
 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _LINUX_CRED_H
#define _LINUX_CRED_H

#include <linux/capability.h>
#include <linux/init.h>
#include <linux/key.h>
#include <linux/atomic.h>
#include <linux/refcount.h>
#include <linux/uidgid.h>
#include <linux/sched.h>
#include <linux/sched/user.h>

struct cred;
struct inode;

extern struct task_struct init_task;

/*
 * COW Supplementary groups list
 */
struct group_info {
	refcount_t	usage;
	int		ngroups;
	kgid_t		gid[];
} __randomize_layout;

/**
 * get_group_info - Get a reference to a group info structure
 * @gi: The group info to reference
 *
 * This gets a reference to a set of supplementary groups.
 *
 * If the caller is accessing a task's credentials, they must hold the RCU read
 * lock when reading.
* * Returns: @gi */ static inline struct group_info *get_group_info(struct group_info *gi) { refcount_inc(&gi->usage); return gi; } /** * put_group_info - Release a reference to a group info structure * @group_info: The group info to release */ #define put_group_info(group_info) \ do { \ if (refcount_dec_and_test(&(group_info)->usage)) \ groups_free(group_info); \ } while (0) #ifdef CONFIG_MULTIUSER extern struct group_info *groups_alloc(int); extern void groups_free(struct group_info *); extern int in_group_p(kgid_t); extern int in_egroup_p(kgid_t); extern int groups_search(const struct group_info *, kgid_t); extern int set_current_groups(struct group_info *); extern void set_groups(struct cred *, struct group_info *); extern bool may_setgroups(void); extern void groups_sort(struct group_info *); #else static inline void groups_free(struct group_info *group_info) { } static inline int in_group_p(kgid_t grp) { return 1; } static inline int in_egroup_p(kgid_t grp) { return 1; } static inline int groups_search(const struct group_info *group_info, kgid_t grp) { return 1; } #endif /* * The security context of a task * * The parts of the context break down into two categories: * * (1) The objective context of a task. These parts are used when some other * task is attempting to affect this one. * * (2) The subjective context. These details are used when the task is acting * upon another object, be that a file, a task, a key or whatever. * * Note that some members of this structure belong to both categories - the * LSM security pointer for instance. * * A task has two security pointers. task->real_cred points to the objective * context that defines that task's actual details. The objective part of this * context is used whenever that task is acted upon. * * task->cred points to the subjective context that defines the details of how * that task is going to act upon another object. This may be overridden * temporarily to point to another security context, but normally points to the * same context as task->real_cred. */ struct cred { atomic_long_t usage; kuid_t uid; /* real UID of the task */ kgid_t gid; /* real GID of the task */ kuid_t suid; /* saved UID of the task */ kgid_t sgid; /* saved GID of the task */ kuid_t euid; /* effective UID of the task */ kgid_t egid; /* effective GID of the task */ kuid_t fsuid; /* UID for VFS ops */ kgid_t fsgid; /* GID for VFS ops */ unsigned securebits; /* SUID-less security management */ kernel_cap_t cap_inheritable; /* caps our children can inherit */ kernel_cap_t cap_permitted; /* caps we're permitted */ kernel_cap_t cap_effective; /* caps we can actually use */ kernel_cap_t cap_bset; /* capability bounding set */ kernel_cap_t cap_ambient; /* Ambient capability set */ #ifdef CONFIG_KEYS unsigned char jit_keyring; /* default keyring to attach requested * keys to */ struct key *session_keyring; /* keyring inherited over fork */ struct key *process_keyring; /* keyring private to this process */ struct key *thread_keyring; /* keyring private to this thread */ struct key *request_key_auth; /* assumed request_key authority */ #endif #ifdef CONFIG_SECURITY void *security; /* LSM security */ #endif struct user_struct *user; /* real user ID subscription */ struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */ struct ucounts *ucounts; struct group_info *group_info; /* supplementary groups for euid/fsgid */ /* RCU deletion */ union { int non_rcu; /* Can we skip RCU deletion? 
*/ struct rcu_head rcu; /* RCU deletion hook */ }; } __randomize_layout; extern void __put_cred(struct cred *); extern void exit_creds(struct task_struct *); extern int copy_creds(struct task_struct *, u64); extern const struct cred *get_task_cred(struct task_struct *); extern struct cred *cred_alloc_blank(void); extern struct cred *prepare_creds(void); extern struct cred *prepare_exec_creds(void); extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); extern struct cred *prepare_kernel_cred(struct task_struct *); static inline const struct cred *kernel_cred(void) { /* shut up sparse */ return rcu_dereference_raw(init_task.cred); } extern int set_security_override(struct cred *, u32); extern int set_create_files_as(struct cred *, struct inode *); extern int cred_fscmp(const struct cred *, const struct cred *); extern void __init cred_init(void); extern int set_cred_ucounts(struct cred *); static inline bool cap_ambient_invariant_ok(const struct cred *cred) { return cap_issubset(cred->cap_ambient, cap_intersect(cred->cap_permitted, cred->cap_inheritable)); } static inline const struct cred *override_creds(const struct cred *override_cred) { return rcu_replace_pointer(current->cred, override_cred, 1); } static inline const struct cred *revert_creds(const struct cred *revert_cred) { return rcu_replace_pointer(current->cred, revert_cred, 1); } DEFINE_CLASS(override_creds, const struct cred *, revert_creds(_T), override_creds(override_cred), const struct cred *override_cred) #define scoped_with_creds(cred) \ scoped_class(override_creds, __UNIQUE_ID(label), cred) #define scoped_with_kernel_creds() scoped_with_creds(kernel_cred()) /** * get_cred_many - Get references on a set of credentials * @cred: The credentials to reference * @nr: Number of references to acquire * * Get references on the specified set of credentials. The caller must release * all acquired reference. If %NULL is passed, it is returned with no action. * * This is used to deal with a committed set of credentials. Although the * pointer is const, this will temporarily discard the const and increment the * usage count. The purpose of this is to attempt to catch at compile time the * accidental alteration of a set of credentials that should be considered * immutable. * * Returns: @cred when the references are acquired, NULL otherwise. */ static inline const struct cred *get_cred_many(const struct cred *cred, int nr) { struct cred *nonconst_cred = (struct cred *) cred; if (!cred) return cred; nonconst_cred->non_rcu = 0; atomic_long_add(nr, &nonconst_cred->usage); return cred; } /* * get_cred - Get a reference on a set of credentials * @cred: The credentials to reference * * Get a reference on the specified set of credentials. The caller must * release the reference. If %NULL is passed, it is returned with no action. * * This is used to deal with a committed set of credentials. */ static inline const struct cred *get_cred(const struct cred *cred) { return get_cred_many(cred, 1); } static inline const struct cred *get_cred_rcu(const struct cred *cred) { struct cred *nonconst_cred = (struct cred *) cred; if (!cred) return NULL; if (!atomic_long_inc_not_zero(&nonconst_cred->usage)) return NULL; nonconst_cred->non_rcu = 0; return cred; } /** * put_cred_many - Release a reference to a set of credentials * @_cred: The credentials to release * @nr: Number of references to release * * Release a reference to a set of credentials, deleting them when the last ref * is released. If %NULL is passed, nothing is done. 
* * This takes a const pointer to a set of credentials because the credentials * on task_struct are attached by const pointers to prevent accidental * alteration of otherwise immutable credential sets. */ static inline void put_cred_many(const struct cred *_cred, int nr) { struct cred *cred = (struct cred *) _cred; if (cred) { if (atomic_long_sub_and_test(nr, &cred->usage)) __put_cred(cred); } } /* * put_cred - Release a reference to a set of credentials * @cred: The credentials to release * * Release a reference to a set of credentials, deleting them when the last ref * is released. If %NULL is passed, nothing is done. */ static inline void put_cred(const struct cred *cred) { put_cred_many(cred, 1); } DEFINE_CLASS(prepare_creds, struct cred *, if (_T) put_cred(_T), prepare_creds(), void) DEFINE_FREE(put_cred, struct cred *, if (!IS_ERR_OR_NULL(_T)) put_cred(_T)) /** * current_cred - Access the current task's subjective credentials * * Access the subjective credentials of the current task. RCU-safe, * since nobody else can modify it. */ #define current_cred() \ rcu_dereference_protected(current->cred, 1) /** * current_real_cred - Access the current task's objective credentials * * Access the objective credentials of the current task. RCU-safe, * since nobody else can modify it. */ #define current_real_cred() \ rcu_dereference_protected(current->real_cred, 1) /** * __task_cred - Access a task's objective credentials * @task: The task to query * * Access the objective credentials of a task. The caller must hold the RCU * readlock. * * The result of this function should not be passed directly to get_cred(); * rather get_task_cred() should be used instead. */ #define __task_cred(task) \ rcu_dereference((task)->real_cred) /** * get_current_cred - Get the current task's subjective credentials * * Get the subjective credentials of the current task, pinning them so that * they can't go away. Accessing the current task's credentials directly is * not permitted. */ #define get_current_cred() \ (get_cred(current_cred())) /** * get_current_user - Get the current task's user_struct * * Get the user record of the current task, pinning it so that it can't go * away. */ #define get_current_user() \ ({ \ struct user_struct *__u; \ const struct cred *__cred; \ __cred = current_cred(); \ __u = get_uid(__cred->user); \ __u; \ }) /** * get_current_groups - Get the current task's supplementary group list * * Get the supplementary group list of the current task, pinning it so that it * can't go away. 
*/ #define get_current_groups() \ ({ \ struct group_info *__groups; \ const struct cred *__cred; \ __cred = current_cred(); \ __groups = get_group_info(__cred->group_info); \ __groups; \ }) #define task_cred_xxx(task, xxx) \ ({ \ __typeof__(((struct cred *)NULL)->xxx) ___val; \ rcu_read_lock(); \ ___val = __task_cred((task))->xxx; \ rcu_read_unlock(); \ ___val; \ }) #define task_uid(task) (task_cred_xxx((task), uid)) #define task_euid(task) (task_cred_xxx((task), euid)) #define task_ucounts(task) (task_cred_xxx((task), ucounts)) #define current_cred_xxx(xxx) \ ({ \ current_cred()->xxx; \ }) #define current_uid() (current_cred_xxx(uid)) #define current_gid() (current_cred_xxx(gid)) #define current_euid() (current_cred_xxx(euid)) #define current_egid() (current_cred_xxx(egid)) #define current_suid() (current_cred_xxx(suid)) #define current_sgid() (current_cred_xxx(sgid)) #define current_fsuid() (current_cred_xxx(fsuid)) #define current_fsgid() (current_cred_xxx(fsgid)) #define current_cap() (current_cred_xxx(cap_effective)) #define current_user() (current_cred_xxx(user)) #define current_ucounts() (current_cred_xxx(ucounts)) extern struct user_namespace init_user_ns; #ifdef CONFIG_USER_NS #define current_user_ns() (current_cred_xxx(user_ns)) #else static inline struct user_namespace *current_user_ns(void) { return &init_user_ns; } #endif #define current_uid_gid(_uid, _gid) \ do { \ const struct cred *__cred; \ __cred = current_cred(); \ *(_uid) = __cred->uid; \ *(_gid) = __cred->gid; \ } while(0) #define current_euid_egid(_euid, _egid) \ do { \ const struct cred *__cred; \ __cred = current_cred(); \ *(_euid) = __cred->euid; \ *(_egid) = __cred->egid; \ } while(0) #define current_fsuid_fsgid(_fsuid, _fsgid) \ do { \ const struct cred *__cred; \ __cred = current_cred(); \ *(_fsuid) = __cred->fsuid; \ *(_fsgid) = __cred->fsgid; \ } while(0) #endif /* _LINUX_CRED_H */ |
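/*
 * Illustrative sketch, not part of the header above: the usual pairing of
 * override_creds()/revert_creds() for temporarily acting with another set of
 * credentials.  The function name and the origin of @new_cred are made up for
 * this example; the reference counting follows the API declared above (the
 * caller must keep the override creds pinned for the whole window).  The
 * scoped_with_creds() helper wraps the same pattern as a scope-based guard.
 */
static int example_act_as(const struct cred *new_cred)
{
	const struct cred *old_cred;
	int ret = 0;

	/* Pin @new_cred for the duration of the override, then install it. */
	old_cred = override_creds(get_cred(new_cred));

	/* ... act on behalf of @new_cred here: VFS ops, key lookups, ... */

	/* Restore the original creds and drop the reference taken above. */
	put_cred(revert_creds(old_cred));

	return ret;
}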
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
 * This file holds USB constants and structures that are needed for
 * USB device APIs.  These are used by the USB device model, which is
 * defined in chapter 9 of the USB 2.0 specification and in the
 * Wireless USB 1.0 spec (now defunct).  Linux has several APIs in C that
 * need these:
 *
 *  - the master/host side Linux-USB kernel driver API;
 *  - the "usbfs" user space API; and
 *  - the Linux "gadget" slave/device/peripheral side driver API.
 *
 * USB 2.0 adds an additional "On The Go" (OTG) mode, which lets systems
 * act either as a USB master/host or as a USB slave/device.  That means
 * the master and slave side APIs benefit from working well together.
 *
 * Note all descriptors are declared '__attribute__((packed))' so that:
 *
 * [a] they never get padded, either internally (USB spec writers
 *     probably handled that) or externally;
 *
 * [b] so that accessing bigger-than-a-bytes fields will never
 *     generate bus errors on any platform, even when the location of
 *     its descriptor inside a bundle isn't "naturally aligned", and
 *
 * [c] for consistency, removing all doubt even when it appears to
 *     someone that the two other points are non-issues for that
 *     particular descriptor type.
 */

#ifndef _UAPI__LINUX_USB_CH9_H
#define _UAPI__LINUX_USB_CH9_H

#include <linux/types.h>	/* __u8 etc */
#include <asm/byteorder.h>	/* le16_to_cpu */

/*-------------------------------------------------------------------------*/

/* CONTROL REQUEST SUPPORT */

/*
 * USB directions
 *
 * This bit flag is used in endpoint descriptors' bEndpointAddress field.
 * It's also one of three fields in control requests bRequestType.
*/ #define USB_DIR_OUT 0 /* to device */ #define USB_DIR_IN 0x80 /* to host */ /* * USB types, the second of three bRequestType fields */ #define USB_TYPE_MASK (0x03 << 5) #define USB_TYPE_STANDARD (0x00 << 5) #define USB_TYPE_CLASS (0x01 << 5) #define USB_TYPE_VENDOR (0x02 << 5) #define USB_TYPE_RESERVED (0x03 << 5) /* * USB recipients, the third of three bRequestType fields */ #define USB_RECIP_MASK 0x1f #define USB_RECIP_DEVICE 0x00 #define USB_RECIP_INTERFACE 0x01 #define USB_RECIP_ENDPOINT 0x02 #define USB_RECIP_OTHER 0x03 /* From Wireless USB 1.0 */ #define USB_RECIP_PORT 0x04 #define USB_RECIP_RPIPE 0x05 /* * Standard requests, for the bRequest field of a SETUP packet. * * These are qualified by the bRequestType field, so that for example * TYPE_CLASS or TYPE_VENDOR specific feature flags could be retrieved * by a GET_STATUS request. */ #define USB_REQ_GET_STATUS 0x00 #define USB_REQ_CLEAR_FEATURE 0x01 #define USB_REQ_SET_FEATURE 0x03 #define USB_REQ_SET_ADDRESS 0x05 #define USB_REQ_GET_DESCRIPTOR 0x06 #define USB_REQ_SET_DESCRIPTOR 0x07 #define USB_REQ_GET_CONFIGURATION 0x08 #define USB_REQ_SET_CONFIGURATION 0x09 #define USB_REQ_GET_INTERFACE 0x0A #define USB_REQ_SET_INTERFACE 0x0B #define USB_REQ_SYNCH_FRAME 0x0C #define USB_REQ_SET_SEL 0x30 #define USB_REQ_SET_ISOCH_DELAY 0x31 #define USB_REQ_SET_ENCRYPTION 0x0D /* Wireless USB */ #define USB_REQ_GET_ENCRYPTION 0x0E #define USB_REQ_RPIPE_ABORT 0x0E #define USB_REQ_SET_HANDSHAKE 0x0F #define USB_REQ_RPIPE_RESET 0x0F #define USB_REQ_GET_HANDSHAKE 0x10 #define USB_REQ_SET_CONNECTION 0x11 #define USB_REQ_SET_SECURITY_DATA 0x12 #define USB_REQ_GET_SECURITY_DATA 0x13 #define USB_REQ_SET_WUSB_DATA 0x14 #define USB_REQ_LOOPBACK_DATA_WRITE 0x15 #define USB_REQ_LOOPBACK_DATA_READ 0x16 #define USB_REQ_SET_INTERFACE_DS 0x17 #define USB_REQ_AUTH_IN 0x18 #define USB_REQ_AUTH_OUT 0x19 /* specific requests for USB Power Delivery */ #define USB_REQ_GET_PARTNER_PDO 20 #define USB_REQ_GET_BATTERY_STATUS 21 #define USB_REQ_SET_PDO 22 #define USB_REQ_GET_VDM 23 #define USB_REQ_SEND_VDM 24 /* The Link Power Management (LPM) ECN defines USB_REQ_TEST_AND_SET command, * used by hubs to put ports into a new L1 suspend state, except that it * forgot to define its number ... */ /* * USB feature flags are written using USB_REQ_{CLEAR,SET}_FEATURE, and * are read as a bit array returned by USB_REQ_GET_STATUS. (So there * are at most sixteen features of each type.) Hubs may also support a * new USB_REQ_TEST_AND_SET_FEATURE to put ports into L1 suspend. 
*/ #define USB_DEVICE_SELF_POWERED 0 /* (read only) */ #define USB_DEVICE_REMOTE_WAKEUP 1 /* dev may initiate wakeup */ #define USB_DEVICE_TEST_MODE 2 /* (wired high speed only) */ #define USB_DEVICE_BATTERY 2 /* (wireless) */ #define USB_DEVICE_B_HNP_ENABLE 3 /* (otg) dev may initiate HNP */ #define USB_DEVICE_WUSB_DEVICE 3 /* (wireless)*/ #define USB_DEVICE_A_HNP_SUPPORT 4 /* (otg) RH port supports HNP */ #define USB_DEVICE_A_ALT_HNP_SUPPORT 5 /* (otg) other RH port does */ #define USB_DEVICE_DEBUG_MODE 6 /* (special devices only) */ #define USB_DEVICE_BULK_MAX_PACKET_UPDATE 8 /* (eUSB2v2) bump maxpacket to 1024 */ /* * Test Mode Selectors * See USB 2.0 spec Table 9-7 */ #define USB_TEST_J 1 #define USB_TEST_K 2 #define USB_TEST_SE0_NAK 3 #define USB_TEST_PACKET 4 #define USB_TEST_FORCE_ENABLE 5 /* Status Type */ #define USB_STATUS_TYPE_STANDARD 0 #define USB_STATUS_TYPE_PTM 1 /* * New Feature Selectors as added by USB 3.0 * See USB 3.0 spec Table 9-7 */ #define USB_DEVICE_U1_ENABLE 48 /* dev may initiate U1 transition */ #define USB_DEVICE_U2_ENABLE 49 /* dev may initiate U2 transition */ #define USB_DEVICE_LTM_ENABLE 50 /* dev may send LTM */ #define USB_INTRF_FUNC_SUSPEND 0 /* function suspend */ #define USB_INTR_FUNC_SUSPEND_OPT_MASK 0xFF00 /* * Suspend Options, Table 9-8 USB 3.0 spec */ #define USB_INTRF_FUNC_SUSPEND_LP (1 << (8 + 0)) #define USB_INTRF_FUNC_SUSPEND_RW (1 << (8 + 1)) /* * Interface status, Figure 9-5 USB 3.0 spec */ #define USB_INTRF_STAT_FUNC_RW_CAP 1 #define USB_INTRF_STAT_FUNC_RW 2 #define USB_ENDPOINT_HALT 0 /* IN/OUT will STALL */ /* Bit array elements as returned by the USB_REQ_GET_STATUS request. */ #define USB_DEV_STAT_U1_ENABLED 2 /* transition into U1 state */ #define USB_DEV_STAT_U2_ENABLED 3 /* transition into U2 state */ #define USB_DEV_STAT_LTM_ENABLED 4 /* Latency tolerance messages */ /* * Feature selectors from Table 9-8 USB Power Delivery spec */ #define USB_DEVICE_BATTERY_WAKE_MASK 40 #define USB_DEVICE_OS_IS_PD_AWARE 41 #define USB_DEVICE_POLICY_MODE 42 #define USB_PORT_PR_SWAP 43 #define USB_PORT_GOTO_MIN 44 #define USB_PORT_RETURN_POWER 45 #define USB_PORT_ACCEPT_PD_REQUEST 46 #define USB_PORT_REJECT_PD_REQUEST 47 #define USB_PORT_PORT_PD_RESET 48 #define USB_PORT_C_PORT_PD_CHANGE 49 #define USB_PORT_CABLE_PD_RESET 50 #define USB_DEVICE_CHARGING_POLICY 54 /** * struct usb_ctrlrequest - SETUP data for a USB device control request * @bRequestType: matches the USB bmRequestType field * @bRequest: matches the USB bRequest field * @wValue: matches the USB wValue field (le16 byte order) * @wIndex: matches the USB wIndex field (le16 byte order) * @wLength: matches the USB wLength field (le16 byte order) * * This structure is used to send control requests to a USB device. It matches * the different fields of the USB 2.0 Spec section 9.3, table 9-2. See the * USB spec for a fuller description of the different fields, and what they are * used for. * * Note that the driver for any interface can issue control requests. * For most devices, interfaces don't coordinate with each other, so * such requests may be made at any time. */ struct usb_ctrlrequest { __u8 bRequestType; __u8 bRequest; __le16 wValue; __le16 wIndex; __le16 wLength; } __attribute__ ((packed)); /*-------------------------------------------------------------------------*/ /* * STANDARD DESCRIPTORS ... as returned by GET_DESCRIPTOR, or * (rarely) accepted by SET_DESCRIPTOR. * * Note that all multi-byte values here are encoded in little endian * byte order "on the wire". 
Within the kernel and when exposed * through the Linux-USB APIs, they are not converted to cpu byte * order; it is the responsibility of the client code to do this. * The single exception is when device and configuration descriptors (but * not other descriptors) are read from character devices * (i.e. /dev/bus/usb/BBB/DDD); * in this case the fields are converted to host endianness by the kernel. */ /* * Descriptor types ... USB 2.0 spec table 9.5 */ #define USB_DT_DEVICE 0x01 #define USB_DT_CONFIG 0x02 #define USB_DT_STRING 0x03 #define USB_DT_INTERFACE 0x04 #define USB_DT_ENDPOINT 0x05 #define USB_DT_DEVICE_QUALIFIER 0x06 #define USB_DT_OTHER_SPEED_CONFIG 0x07 #define USB_DT_INTERFACE_POWER 0x08 /* these are from a minor usb 2.0 revision (ECN) */ #define USB_DT_OTG 0x09 #define USB_DT_DEBUG 0x0a #define USB_DT_INTERFACE_ASSOCIATION 0x0b /* these are from the Wireless USB spec */ #define USB_DT_SECURITY 0x0c #define USB_DT_KEY 0x0d #define USB_DT_ENCRYPTION_TYPE 0x0e #define USB_DT_BOS 0x0f #define USB_DT_DEVICE_CAPABILITY 0x10 #define USB_DT_WIRELESS_ENDPOINT_COMP 0x11 /* From the eUSB2 spec */ #define USB_DT_EUSB2_ISOC_ENDPOINT_COMP 0x12 /* From Wireless USB spec */ #define USB_DT_WIRE_ADAPTER 0x21 /* From USB Device Firmware Upgrade Specification, Revision 1.1 */ #define USB_DT_DFU_FUNCTIONAL 0x21 /* these are from the Wireless USB spec */ #define USB_DT_RPIPE 0x22 #define USB_DT_CS_RADIO_CONTROL 0x23 /* From the T10 UAS specification */ #define USB_DT_PIPE_USAGE 0x24 /* From the USB 3.0 spec */ #define USB_DT_SS_ENDPOINT_COMP 0x30 /* From the USB 3.1 spec */ #define USB_DT_SSP_ISOC_ENDPOINT_COMP 0x31 /* Conventional codes for class-specific descriptors. The convention is * defined in the USB "Common Class" Spec (3.11). Individual class specs * are authoritative for their usage, not the "common class" writeup. 
*/ #define USB_DT_CS_DEVICE (USB_TYPE_CLASS | USB_DT_DEVICE) #define USB_DT_CS_CONFIG (USB_TYPE_CLASS | USB_DT_CONFIG) #define USB_DT_CS_STRING (USB_TYPE_CLASS | USB_DT_STRING) #define USB_DT_CS_INTERFACE (USB_TYPE_CLASS | USB_DT_INTERFACE) #define USB_DT_CS_ENDPOINT (USB_TYPE_CLASS | USB_DT_ENDPOINT) /* All standard descriptors have these 2 fields at the beginning */ struct usb_descriptor_header { __u8 bLength; __u8 bDescriptorType; } __attribute__ ((packed)); /*-------------------------------------------------------------------------*/ /* USB_DT_DEVICE: Device descriptor */ struct usb_device_descriptor { __u8 bLength; __u8 bDescriptorType; __le16 bcdUSB; __u8 bDeviceClass; __u8 bDeviceSubClass; __u8 bDeviceProtocol; __u8 bMaxPacketSize0; __le16 idVendor; __le16 idProduct; __le16 bcdDevice; __u8 iManufacturer; __u8 iProduct; __u8 iSerialNumber; __u8 bNumConfigurations; } __attribute__ ((packed)); #define USB_DT_DEVICE_SIZE 18 /* * Device and/or Interface Class codes * as found in bDeviceClass or bInterfaceClass * and defined by www.usb.org documents */ #define USB_CLASS_PER_INTERFACE 0 /* for DeviceClass */ #define USB_CLASS_AUDIO 1 #define USB_CLASS_COMM 2 #define USB_CLASS_HID 3 #define USB_CLASS_PHYSICAL 5 #define USB_CLASS_STILL_IMAGE 6 #define USB_CLASS_PRINTER 7 #define USB_CLASS_MASS_STORAGE 8 #define USB_CLASS_HUB 9 #define USB_CLASS_CDC_DATA 0x0a #define USB_CLASS_CSCID 0x0b /* chip+ smart card */ #define USB_CLASS_CONTENT_SEC 0x0d /* content security */ #define USB_CLASS_VIDEO 0x0e #define USB_CLASS_WIRELESS_CONTROLLER 0xe0 #define USB_CLASS_PERSONAL_HEALTHCARE 0x0f #define USB_CLASS_AUDIO_VIDEO 0x10 #define USB_CLASS_BILLBOARD 0x11 #define USB_CLASS_USB_TYPE_C_BRIDGE 0x12 #define USB_CLASS_MCTP 0x14 #define USB_CLASS_MISC 0xef #define USB_CLASS_APP_SPEC 0xfe #define USB_SUBCLASS_DFU 0x01 #define USB_CLASS_VENDOR_SPEC 0xff #define USB_SUBCLASS_VENDOR_SPEC 0xff /*-------------------------------------------------------------------------*/ /* USB_DT_CONFIG: Configuration descriptor information. * * USB_DT_OTHER_SPEED_CONFIG is the same descriptor, except that the * descriptor type is different. Highspeed-capable devices can look * different depending on what speed they're currently running. Only * devices with a USB_DT_DEVICE_QUALIFIER have any OTHER_SPEED_CONFIG * descriptors. */ struct usb_config_descriptor { __u8 bLength; __u8 bDescriptorType; __le16 wTotalLength; __u8 bNumInterfaces; __u8 bConfigurationValue; __u8 iConfiguration; __u8 bmAttributes; __u8 bMaxPower; } __attribute__ ((packed)); #define USB_DT_CONFIG_SIZE 9 /* from config descriptor bmAttributes */ #define USB_CONFIG_ATT_ONE (1 << 7) /* must be set */ #define USB_CONFIG_ATT_SELFPOWER (1 << 6) /* self powered */ #define USB_CONFIG_ATT_WAKEUP (1 << 5) /* can wakeup */ #define USB_CONFIG_ATT_BATTERY (1 << 4) /* battery powered */ /*-------------------------------------------------------------------------*/ /* USB String descriptors can contain at most 126 characters. */ #define USB_MAX_STRING_LEN 126 /* USB_DT_STRING: String descriptor */ struct usb_string_descriptor { __u8 bLength; __u8 bDescriptorType; union { __le16 legacy_padding; __DECLARE_FLEX_ARRAY(__le16, wData); /* UTF-16LE encoded */ }; } __attribute__ ((packed)); /* note that "string" zero is special, it holds language codes that * the device supports, not Unicode characters. 
*/ /*-------------------------------------------------------------------------*/ /* USB_DT_INTERFACE: Interface descriptor */ struct usb_interface_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bInterfaceNumber; __u8 bAlternateSetting; __u8 bNumEndpoints; __u8 bInterfaceClass; __u8 bInterfaceSubClass; __u8 bInterfaceProtocol; __u8 iInterface; } __attribute__ ((packed)); #define USB_DT_INTERFACE_SIZE 9 /*-------------------------------------------------------------------------*/ /* USB_DT_ENDPOINT: Endpoint descriptor */ struct usb_endpoint_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bEndpointAddress; __u8 bmAttributes; __le16 wMaxPacketSize; __u8 bInterval; /* NOTE: these two are _only_ in audio endpoints. */ /* use USB_DT_ENDPOINT*_SIZE in bLength, not sizeof. */ __u8 bRefresh; __u8 bSynchAddress; } __attribute__ ((packed)); #define USB_DT_ENDPOINT_SIZE 7 #define USB_DT_ENDPOINT_AUDIO_SIZE 9 /* Audio extension */ /* * Endpoints */ #define USB_ENDPOINT_NUMBER_MASK 0x0f /* in bEndpointAddress */ #define USB_ENDPOINT_DIR_MASK 0x80 #define USB_ENDPOINT_XFERTYPE_MASK 0x03 /* in bmAttributes */ #define USB_ENDPOINT_XFER_CONTROL 0 #define USB_ENDPOINT_XFER_ISOC 1 #define USB_ENDPOINT_XFER_BULK 2 #define USB_ENDPOINT_XFER_INT 3 #define USB_ENDPOINT_MAX_ADJUSTABLE 0x80 #define USB_ENDPOINT_MAXP_MASK 0x07ff #define USB_EP_MAXP_MULT_SHIFT 11 #define USB_EP_MAXP_MULT_MASK (3 << USB_EP_MAXP_MULT_SHIFT) #define USB_EP_MAXP_MULT(m) \ (((m) & USB_EP_MAXP_MULT_MASK) >> USB_EP_MAXP_MULT_SHIFT) /* The USB 3.0 spec redefines bits 5:4 of bmAttributes as interrupt ep type. */ #define USB_ENDPOINT_INTRTYPE 0x30 #define USB_ENDPOINT_INTR_PERIODIC (0 << 4) #define USB_ENDPOINT_INTR_NOTIFICATION (1 << 4) #define USB_ENDPOINT_SYNCTYPE 0x0c #define USB_ENDPOINT_SYNC_NONE (0 << 2) #define USB_ENDPOINT_SYNC_ASYNC (1 << 2) #define USB_ENDPOINT_SYNC_ADAPTIVE (2 << 2) #define USB_ENDPOINT_SYNC_SYNC (3 << 2) #define USB_ENDPOINT_USAGE_MASK 0x30 #define USB_ENDPOINT_USAGE_DATA 0x00 #define USB_ENDPOINT_USAGE_FEEDBACK 0x10 #define USB_ENDPOINT_USAGE_IMPLICIT_FB 0x20 /* Implicit feedback Data endpoint */ /*-------------------------------------------------------------------------*/ /** * usb_endpoint_num - get the endpoint's number * @epd: endpoint to be checked * * Returns @epd's number: 0 to 15. */ static inline int usb_endpoint_num(const struct usb_endpoint_descriptor *epd) { return epd->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK; } /** * usb_endpoint_type - get the endpoint's transfer type * @epd: endpoint to be checked * * Returns one of USB_ENDPOINT_XFER_{CONTROL, ISOC, BULK, INT} according * to @epd's transfer type. */ static inline int usb_endpoint_type(const struct usb_endpoint_descriptor *epd) { return epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK; } /** * usb_endpoint_dir_in - check if the endpoint has IN direction * @epd: endpoint to be checked * * Returns true if the endpoint is of type IN, otherwise it returns false. */ static inline int usb_endpoint_dir_in(const struct usb_endpoint_descriptor *epd) { return ((epd->bEndpointAddress & USB_ENDPOINT_DIR_MASK) == USB_DIR_IN); } /** * usb_endpoint_dir_out - check if the endpoint has OUT direction * @epd: endpoint to be checked * * Returns true if the endpoint is of type OUT, otherwise it returns false. 
*/ static inline int usb_endpoint_dir_out( const struct usb_endpoint_descriptor *epd) { return ((epd->bEndpointAddress & USB_ENDPOINT_DIR_MASK) == USB_DIR_OUT); } /** * usb_endpoint_xfer_bulk - check if the endpoint has bulk transfer type * @epd: endpoint to be checked * * Returns true if the endpoint is of type bulk, otherwise it returns false. */ static inline int usb_endpoint_xfer_bulk( const struct usb_endpoint_descriptor *epd) { return ((epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_BULK); } /** * usb_endpoint_xfer_control - check if the endpoint has control transfer type * @epd: endpoint to be checked * * Returns true if the endpoint is of type control, otherwise it returns false. */ static inline int usb_endpoint_xfer_control( const struct usb_endpoint_descriptor *epd) { return ((epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_CONTROL); } /** * usb_endpoint_xfer_int - check if the endpoint has interrupt transfer type * @epd: endpoint to be checked * * Returns true if the endpoint is of type interrupt, otherwise it returns * false. */ static inline int usb_endpoint_xfer_int( const struct usb_endpoint_descriptor *epd) { return ((epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_INT); } /** * usb_endpoint_xfer_isoc - check if the endpoint has isochronous transfer type * @epd: endpoint to be checked * * Returns true if the endpoint is of type isochronous, otherwise it returns * false. */ static inline int usb_endpoint_xfer_isoc( const struct usb_endpoint_descriptor *epd) { return ((epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_ISOC); } /** * usb_endpoint_is_bulk_in - check if the endpoint is bulk IN * @epd: endpoint to be checked * * Returns true if the endpoint has bulk transfer type and IN direction, * otherwise it returns false. */ static inline int usb_endpoint_is_bulk_in( const struct usb_endpoint_descriptor *epd) { return usb_endpoint_xfer_bulk(epd) && usb_endpoint_dir_in(epd); } /** * usb_endpoint_is_bulk_out - check if the endpoint is bulk OUT * @epd: endpoint to be checked * * Returns true if the endpoint has bulk transfer type and OUT direction, * otherwise it returns false. */ static inline int usb_endpoint_is_bulk_out( const struct usb_endpoint_descriptor *epd) { return usb_endpoint_xfer_bulk(epd) && usb_endpoint_dir_out(epd); } /** * usb_endpoint_is_int_in - check if the endpoint is interrupt IN * @epd: endpoint to be checked * * Returns true if the endpoint has interrupt transfer type and IN direction, * otherwise it returns false. */ static inline int usb_endpoint_is_int_in( const struct usb_endpoint_descriptor *epd) { return usb_endpoint_xfer_int(epd) && usb_endpoint_dir_in(epd); } /** * usb_endpoint_is_int_out - check if the endpoint is interrupt OUT * @epd: endpoint to be checked * * Returns true if the endpoint has interrupt transfer type and OUT direction, * otherwise it returns false. */ static inline int usb_endpoint_is_int_out( const struct usb_endpoint_descriptor *epd) { return usb_endpoint_xfer_int(epd) && usb_endpoint_dir_out(epd); } /** * usb_endpoint_is_isoc_in - check if the endpoint is isochronous IN * @epd: endpoint to be checked * * Returns true if the endpoint has isochronous transfer type and IN direction, * otherwise it returns false. 
*/ static inline int usb_endpoint_is_isoc_in( const struct usb_endpoint_descriptor *epd) { return usb_endpoint_xfer_isoc(epd) && usb_endpoint_dir_in(epd); } /** * usb_endpoint_is_isoc_out - check if the endpoint is isochronous OUT * @epd: endpoint to be checked * * Returns true if the endpoint has isochronous transfer type and OUT direction, * otherwise it returns false. */ static inline int usb_endpoint_is_isoc_out( const struct usb_endpoint_descriptor *epd) { return usb_endpoint_xfer_isoc(epd) && usb_endpoint_dir_out(epd); } /** * usb_endpoint_maxp - get endpoint's max packet size * @epd: endpoint to be checked * * Returns @epd's max packet bits [10:0] */ static inline int usb_endpoint_maxp(const struct usb_endpoint_descriptor *epd) { return __le16_to_cpu(epd->wMaxPacketSize) & USB_ENDPOINT_MAXP_MASK; } /** * usb_endpoint_maxp_mult - get endpoint's transactional opportunities * @epd: endpoint to be checked * * Return @epd's wMaxPacketSize[12:11] + 1 */ static inline int usb_endpoint_maxp_mult(const struct usb_endpoint_descriptor *epd) { int maxp = __le16_to_cpu(epd->wMaxPacketSize); return USB_EP_MAXP_MULT(maxp) + 1; } static inline int usb_endpoint_interrupt_type( const struct usb_endpoint_descriptor *epd) { return epd->bmAttributes & USB_ENDPOINT_INTRTYPE; } /*-------------------------------------------------------------------------*/ /* USB_DT_EUSB2_ISOC_ENDPOINT_COMP: eUSB2 Isoch Endpoint Companion descriptor */ struct usb_eusb2_isoc_ep_comp_descriptor { __u8 bLength; __u8 bDescriptorType; __le16 wMaxPacketSize; __le32 dwBytesPerInterval; } __attribute__ ((packed)); #define USB_DT_EUSB2_ISOC_EP_COMP_SIZE 8 /*-------------------------------------------------------------------------*/ /* USB_DT_SSP_ISOC_ENDPOINT_COMP: SuperSpeedPlus Isochronous Endpoint Companion * descriptor */ struct usb_ssp_isoc_ep_comp_descriptor { __u8 bLength; __u8 bDescriptorType; __le16 wReseved; __le32 dwBytesPerInterval; } __attribute__ ((packed)); #define USB_DT_SSP_ISOC_EP_COMP_SIZE 8 /*-------------------------------------------------------------------------*/ /* USB_DT_SS_ENDPOINT_COMP: SuperSpeed Endpoint Companion descriptor */ struct usb_ss_ep_comp_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bMaxBurst; __u8 bmAttributes; __le16 wBytesPerInterval; } __attribute__ ((packed)); #define USB_DT_SS_EP_COMP_SIZE 6 /* Bits 4:0 of bmAttributes if this is a bulk endpoint */ static inline int usb_ss_max_streams(const struct usb_ss_ep_comp_descriptor *comp) { int max_streams; if (!comp) return 0; max_streams = comp->bmAttributes & 0x1f; if (!max_streams) return 0; max_streams = 1 << max_streams; return max_streams; } /* Bits 1:0 of bmAttributes if this is an isoc endpoint */ #define USB_SS_MULT(p) (1 + ((p) & 0x3)) /* Bit 7 of bmAttributes if a SSP isoc endpoint companion descriptor exists */ #define USB_SS_SSP_ISOC_COMP(p) ((p) & (1 << 7)) /*-------------------------------------------------------------------------*/ /* USB_DT_DEVICE_QUALIFIER: Device Qualifier descriptor */ struct usb_qualifier_descriptor { __u8 bLength; __u8 bDescriptorType; __le16 bcdUSB; __u8 bDeviceClass; __u8 bDeviceSubClass; __u8 bDeviceProtocol; __u8 bMaxPacketSize0; __u8 bNumConfigurations; __u8 bRESERVED; } __attribute__ ((packed)); /*-------------------------------------------------------------------------*/ /* USB_DT_OTG (from OTG 1.0a supplement) */ struct usb_otg_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bmAttributes; /* support for HNP, SRP, etc */ } __attribute__ ((packed)); /* USB_DT_OTG (from OTG 2.0 
supplement) */ struct usb_otg20_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bmAttributes; /* support for HNP, SRP and ADP, etc */ __le16 bcdOTG; /* OTG and EH supplement release number * in binary-coded decimal(i.e. 2.0 is 0200H) */ } __attribute__ ((packed)); /* from usb_otg_descriptor.bmAttributes */ #define USB_OTG_SRP (1 << 0) #define USB_OTG_HNP (1 << 1) /* swap host/device roles */ #define USB_OTG_ADP (1 << 2) /* support ADP */ /* OTG 3.0 */ #define USB_OTG_RSP (1 << 3) /* support RSP */ #define OTG_STS_SELECTOR 0xF000 /* OTG status selector */ /*-------------------------------------------------------------------------*/ /* USB_DT_DEBUG: for special highspeed devices, replacing serial console */ struct usb_debug_descriptor { __u8 bLength; __u8 bDescriptorType; /* bulk endpoints with 8 byte maxpacket */ __u8 bDebugInEndpoint; __u8 bDebugOutEndpoint; } __attribute__((packed)); /*-------------------------------------------------------------------------*/ /* USB_DT_INTERFACE_ASSOCIATION: groups interfaces */ struct usb_interface_assoc_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bFirstInterface; __u8 bInterfaceCount; __u8 bFunctionClass; __u8 bFunctionSubClass; __u8 bFunctionProtocol; __u8 iFunction; } __attribute__ ((packed)); #define USB_DT_INTERFACE_ASSOCIATION_SIZE 8 /*-------------------------------------------------------------------------*/ /* USB_DT_SECURITY: group of wireless security descriptors, including * encryption types available for setting up a CC/association. */ struct usb_security_descriptor { __u8 bLength; __u8 bDescriptorType; __le16 wTotalLength; __u8 bNumEncryptionTypes; } __attribute__((packed)); /*-------------------------------------------------------------------------*/ /* USB_DT_KEY: used with {GET,SET}_SECURITY_DATA; only public keys * may be retrieved. 
*/ struct usb_key_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 tTKID[3]; __u8 bReserved; __u8 bKeyData[]; } __attribute__((packed)); /*-------------------------------------------------------------------------*/ /* USB_DT_ENCRYPTION_TYPE: bundled in DT_SECURITY groups */ struct usb_encryption_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bEncryptionType; #define USB_ENC_TYPE_UNSECURE 0 #define USB_ENC_TYPE_WIRED 1 /* non-wireless mode */ #define USB_ENC_TYPE_CCM_1 2 /* aes128/cbc session */ #define USB_ENC_TYPE_RSA_1 3 /* rsa3072/sha1 auth */ __u8 bEncryptionValue; /* use in SET_ENCRYPTION */ __u8 bAuthKeyIndex; } __attribute__((packed)); /*-------------------------------------------------------------------------*/ /* USB_DT_BOS: group of device-level capabilities */ struct usb_bos_descriptor { __u8 bLength; __u8 bDescriptorType; __le16 wTotalLength; __u8 bNumDeviceCaps; } __attribute__((packed)); #define USB_DT_BOS_SIZE 5 /*-------------------------------------------------------------------------*/ /* USB_DT_DEVICE_CAPABILITY: grouped with BOS */ struct usb_dev_cap_header { __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; } __attribute__((packed)); #define USB_CAP_TYPE_WIRELESS_USB 1 struct usb_wireless_cap_descriptor { /* Ultra Wide Band */ __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; __u8 bmAttributes; #define USB_WIRELESS_P2P_DRD (1 << 1) #define USB_WIRELESS_BEACON_MASK (3 << 2) #define USB_WIRELESS_BEACON_SELF (1 << 2) #define USB_WIRELESS_BEACON_DIRECTED (2 << 2) #define USB_WIRELESS_BEACON_NONE (3 << 2) __le16 wPHYRates; /* bit rates, Mbps */ #define USB_WIRELESS_PHY_53 (1 << 0) /* always set */ #define USB_WIRELESS_PHY_80 (1 << 1) #define USB_WIRELESS_PHY_107 (1 << 2) /* always set */ #define USB_WIRELESS_PHY_160 (1 << 3) #define USB_WIRELESS_PHY_200 (1 << 4) /* always set */ #define USB_WIRELESS_PHY_320 (1 << 5) #define USB_WIRELESS_PHY_400 (1 << 6) #define USB_WIRELESS_PHY_480 (1 << 7) __u8 bmTFITXPowerInfo; /* TFI power levels */ __u8 bmFFITXPowerInfo; /* FFI power levels */ __le16 bmBandGroup; __u8 bReserved; } __attribute__((packed)); #define USB_DT_USB_WIRELESS_CAP_SIZE 11 /* USB 2.0 Extension descriptor */ #define USB_CAP_TYPE_EXT 2 struct usb_ext_cap_descriptor { /* Link Power Management */ __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; __le32 bmAttributes; #define USB_LPM_SUPPORT (1 << 1) /* supports LPM */ #define USB_BESL_SUPPORT (1 << 2) /* supports BESL */ #define USB_BESL_BASELINE_VALID (1 << 3) /* Baseline BESL valid*/ #define USB_BESL_DEEP_VALID (1 << 4) /* Deep BESL valid */ #define USB_SET_BESL_BASELINE(p) (((p) & 0xf) << 8) #define USB_SET_BESL_DEEP(p) (((p) & 0xf) << 12) #define USB_GET_BESL_BASELINE(p) (((p) & (0xf << 8)) >> 8) #define USB_GET_BESL_DEEP(p) (((p) & (0xf << 12)) >> 12) } __attribute__((packed)); #define USB_DT_USB_EXT_CAP_SIZE 7 /* * SuperSpeed USB Capability descriptor: Defines the set of SuperSpeed USB * specific device level capabilities */ #define USB_SS_CAP_TYPE 3 struct usb_ss_cap_descriptor { /* Link Power Management */ __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; __u8 bmAttributes; #define USB_LTM_SUPPORT (1 << 1) /* supports LTM */ __le16 wSpeedSupported; #define USB_LOW_SPEED_OPERATION (1) /* Low speed operation */ #define USB_FULL_SPEED_OPERATION (1 << 1) /* Full speed operation */ #define USB_HIGH_SPEED_OPERATION (1 << 2) /* High speed operation */ #define USB_5GBPS_OPERATION (1 << 3) /* Operation at 5Gbps */ __u8 bFunctionalitySupport; __u8 bU1devExitLat; 
__le16 bU2DevExitLat; } __attribute__((packed)); #define USB_DT_USB_SS_CAP_SIZE 10 /* * Container ID Capability descriptor: Defines the instance unique ID used to * identify the instance across all operating modes */ #define CONTAINER_ID_TYPE 4 struct usb_ss_container_id_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; __u8 bReserved; __u8 ContainerID[16]; /* 128-bit number */ } __attribute__((packed)); #define USB_DT_USB_SS_CONTN_ID_SIZE 20 /* * Platform Device Capability descriptor: Defines platform specific device * capabilities */ #define USB_PLAT_DEV_CAP_TYPE 5 struct usb_plat_dev_cap_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; __u8 bReserved; __u8 UUID[16]; __u8 CapabilityData[]; } __attribute__((packed)); #define USB_DT_USB_PLAT_DEV_CAP_SIZE(capability_data_size) (20 + capability_data_size) /* * SuperSpeed Plus USB Capability descriptor: Defines the set of * SuperSpeed Plus USB specific device level capabilities */ #define USB_SSP_CAP_TYPE 0xa struct usb_ssp_cap_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; __u8 bReserved; __le32 bmAttributes; #define USB_SSP_SUBLINK_SPEED_ATTRIBS (0x1f << 0) /* sublink speed entries */ #define USB_SSP_SUBLINK_SPEED_IDS (0xf << 5) /* speed ID entries */ __le16 wFunctionalitySupport; #define USB_SSP_MIN_SUBLINK_SPEED_ATTRIBUTE_ID (0xf) #define USB_SSP_MIN_RX_LANE_COUNT (0xf << 8) #define USB_SSP_MIN_TX_LANE_COUNT (0xf << 12) __le16 wReserved; union { __le32 legacy_padding; /* list of sublink speed attrib entries */ __DECLARE_FLEX_ARRAY(__le32, bmSublinkSpeedAttr); }; #define USB_SSP_SUBLINK_SPEED_SSID (0xf) /* sublink speed ID */ #define USB_SSP_SUBLINK_SPEED_LSE (0x3 << 4) /* Lanespeed exponent */ #define USB_SSP_SUBLINK_SPEED_LSE_BPS 0 #define USB_SSP_SUBLINK_SPEED_LSE_KBPS 1 #define USB_SSP_SUBLINK_SPEED_LSE_MBPS 2 #define USB_SSP_SUBLINK_SPEED_LSE_GBPS 3 #define USB_SSP_SUBLINK_SPEED_ST (0x3 << 6) /* Sublink type */ #define USB_SSP_SUBLINK_SPEED_ST_SYM_RX 0 #define USB_SSP_SUBLINK_SPEED_ST_ASYM_RX 1 #define USB_SSP_SUBLINK_SPEED_ST_SYM_TX 2 #define USB_SSP_SUBLINK_SPEED_ST_ASYM_TX 3 #define USB_SSP_SUBLINK_SPEED_RSVD (0x3f << 8) /* Reserved */ #define USB_SSP_SUBLINK_SPEED_LP (0x3 << 14) /* Link protocol */ #define USB_SSP_SUBLINK_SPEED_LP_SS 0 #define USB_SSP_SUBLINK_SPEED_LP_SSP 1 #define USB_SSP_SUBLINK_SPEED_LSM (0xff << 16) /* Lanespeed mantissa */ } __attribute__((packed)); /* * USB Power Delivery Capability Descriptor: * Defines capabilities for PD */ /* Defines the various PD Capabilities of this device */ #define USB_PD_POWER_DELIVERY_CAPABILITY 0x06 /* Provides information on each battery supported by the device */ #define USB_PD_BATTERY_INFO_CAPABILITY 0x07 /* The Consumer characteristics of a Port on the device */ #define USB_PD_PD_CONSUMER_PORT_CAPABILITY 0x08 /* The provider characteristics of a Port on the device */ #define USB_PD_PD_PROVIDER_PORT_CAPABILITY 0x09 struct usb_pd_cap_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; /* set to USB_PD_POWER_DELIVERY_CAPABILITY */ __u8 bReserved; __le32 bmAttributes; #define USB_PD_CAP_BATTERY_CHARGING (1 << 1) /* supports Battery Charging specification */ #define USB_PD_CAP_USB_PD (1 << 2) /* supports USB Power Delivery specification */ #define USB_PD_CAP_PROVIDER (1 << 3) /* can provide power */ #define USB_PD_CAP_CONSUMER (1 << 4) /* can consume power */ #define USB_PD_CAP_CHARGING_POLICY (1 << 5) /* supports CHARGING_POLICY feature */ #define USB_PD_CAP_TYPE_C_CURRENT (1 << 6) 
/* supports power capabilities defined in the USB Type-C Specification */ #define USB_PD_CAP_PWR_AC (1 << 8) #define USB_PD_CAP_PWR_BAT (1 << 9) #define USB_PD_CAP_PWR_USE_V_BUS (1 << 14) __le16 bmProviderPorts; /* Bit zero refers to the UFP of the device */ __le16 bmConsumerPorts; __le16 bcdBCVersion; __le16 bcdPDVersion; __le16 bcdUSBTypeCVersion; } __attribute__((packed)); struct usb_pd_cap_battery_info_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; /* Index of string descriptor shall contain the user friendly name for this battery */ __u8 iBattery; /* Index of string descriptor shall contain the Serial Number String for this battery */ __u8 iSerial; __u8 iManufacturer; __u8 bBatteryId; /* uniquely identifies this battery in status Messages */ __u8 bReserved; /* * Shall contain the Battery Charge value above which this * battery is considered to be fully charged but not necessarily * “topped off.” */ __le32 dwChargedThreshold; /* in mWh */ /* * Shall contain the minimum charge level of this battery such * that above this threshold, a device can be assured of being * able to power up successfully (see Battery Charging 1.2). */ __le32 dwWeakThreshold; /* in mWh */ __le32 dwBatteryDesignCapacity; /* in mWh */ __le32 dwBatteryLastFullchargeCapacity; /* in mWh */ } __attribute__((packed)); struct usb_pd_cap_consumer_port_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; __u8 bReserved; __u8 bmCapabilities; /* port will operate under: */ #define USB_PD_CAP_CONSUMER_BC (1 << 0) /* BC */ #define USB_PD_CAP_CONSUMER_PD (1 << 1) /* PD */ #define USB_PD_CAP_CONSUMER_TYPE_C (1 << 2) /* USB Type-C Current */ __le16 wMinVoltage; /* in 50mV units */ __le16 wMaxVoltage; /* in 50mV units */ __u16 wReserved; __le32 dwMaxOperatingPower; /* in 10 mW - operating at steady state */ __le32 dwMaxPeakPower; /* in 10mW units - operating at peak power */ __le32 dwMaxPeakPowerTime; /* in 100ms units - duration of peak */ #define USB_PD_CAP_CONSUMER_UNKNOWN_PEAK_POWER_TIME 0xffff } __attribute__((packed)); struct usb_pd_cap_provider_port_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; __u8 bReserved1; __u8 bmCapabilities; /* port will operate under: */ #define USB_PD_CAP_PROVIDER_BC (1 << 0) /* BC */ #define USB_PD_CAP_PROVIDER_PD (1 << 1) /* PD */ #define USB_PD_CAP_PROVIDER_TYPE_C (1 << 2) /* USB Type-C Current */ __u8 bNumOfPDObjects; __u8 bReserved2; __le32 wPowerDataObject[]; } __attribute__((packed)); /* * Precision time measurement capability descriptor: advertised by devices and * hubs that support PTM */ #define USB_PTM_CAP_TYPE 0xb struct usb_ptm_cap_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; } __attribute__((packed)); #define USB_DT_USB_PTM_ID_SIZE 3 /* * The size of the descriptor for the Sublink Speed Attribute Count * (SSAC) specified in bmAttributes[4:0].
SSAC is zero-based */ #define USB_DT_USB_SSP_CAP_SIZE(ssac) (12 + (ssac + 1) * 4) /*-------------------------------------------------------------------------*/ struct usb_authentication_capability_descriptor { __u8 bLength; __u8 bDescriptorType; /* set to USB_DT_DEVICE_CAPABILITY */ __u8 bmAttributes; __u8 bcdProtocolVersion; __u8 bcdCapability; } __attribute__((packed)); /*-------------------------------------------------------------------------*/ /* USB_DT_WIRELESS_ENDPOINT_COMP: companion descriptor associated with * each endpoint descriptor for a wireless device */ struct usb_wireless_ep_comp_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bMaxBurst; __u8 bMaxSequence; __le16 wMaxStreamDelay; __le16 wOverTheAirPacketSize; __u8 bOverTheAirInterval; __u8 bmCompAttributes; #define USB_ENDPOINT_SWITCH_MASK 0x03 /* in bmCompAttributes */ #define USB_ENDPOINT_SWITCH_NO 0 #define USB_ENDPOINT_SWITCH_SWITCH 1 #define USB_ENDPOINT_SWITCH_SCALE 2 } __attribute__((packed)); /*-------------------------------------------------------------------------*/ /* USB_REQ_SET_HANDSHAKE is a four-way handshake used between a wireless * host and a device for connection set up, mutual authentication, and * exchanging short lived session keys. The handshake depends on a CC. */ struct usb_handshake { __u8 bMessageNumber; __u8 bStatus; __u8 tTKID[3]; __u8 bReserved; __u8 CDID[16]; __u8 nonce[16]; __u8 MIC[8]; } __attribute__((packed)); /*-------------------------------------------------------------------------*/ /* USB_REQ_SET_CONNECTION modifies or revokes a connection context (CC). * A CC may also be set up using non-wireless secure channels (including * wired USB!), and some devices may support CCs with multiple hosts. */ struct usb_connection_context { __u8 CHID[16]; /* persistent host id */ __u8 CDID[16]; /* device id (unique w/in host context) */ __u8 CK[16]; /* connection key */ } __attribute__((packed)); /*-------------------------------------------------------------------------*/ /* USB 2.0 defines three speeds, here's how Linux identifies them */ enum usb_device_speed { USB_SPEED_UNKNOWN = 0, /* enumerating */ USB_SPEED_LOW, USB_SPEED_FULL, /* usb 1.1 */ USB_SPEED_HIGH, /* usb 2.0 */ USB_SPEED_WIRELESS, /* wireless (usb 2.5) */ USB_SPEED_SUPER, /* usb 3.0 */ USB_SPEED_SUPER_PLUS, /* usb 3.1 */ }; enum usb_device_state { /* NOTATTACHED isn't in the USB spec, and this state acts * the same as ATTACHED ... but it's clearer this way. */ USB_STATE_NOTATTACHED = 0, /* chapter 9 and authentication (wireless) device states */ USB_STATE_ATTACHED, USB_STATE_POWERED, /* wired */ USB_STATE_RECONNECTING, /* auth */ USB_STATE_UNAUTHENTICATED, /* auth */ USB_STATE_DEFAULT, /* limited function */ USB_STATE_ADDRESS, USB_STATE_CONFIGURED, /* most functions */ USB_STATE_SUSPENDED /* NOTE: there are actually four different SUSPENDED * states, returning to POWERED, DEFAULT, ADDRESS, or * CONFIGURED respectively when SOF tokens flow again. * At this level there's no difference between L1 and L2 * suspend states. (L2 being original USB 1.1 suspend.) */ }; enum usb3_link_state { USB3_LPM_U0 = 0, USB3_LPM_U1, USB3_LPM_U2, USB3_LPM_U3 }; /* * A U1 timeout of 0x0 means the parent hub will reject any transitions to U1. * 0xff means the parent hub will accept transitions to U1, but will not * initiate a transition. * * A U1 timeout of 0x1 to 0x7F also causes the hub to initiate a transition to * U1 after that many microseconds. Timeouts of 0x80 to 0xFE are reserved * values. 
* * A U2 timeout of 0x0 means the parent hub will reject any transitions to U2. * 0xff means the parent hub will accept transitions to U2, but will not * initiate a transition. * * A U2 timeout of 0x1 to 0xFE also causes the hub to initiate a transition to * U2 after N*256 microseconds. Therefore a U2 timeout value of 0x1 means a U2 * idle timer of 256 microseconds, 0x2 means 512 microseconds, 0xFE means * 65.024ms. */ #define USB3_LPM_DISABLED 0x0 #define USB3_LPM_U1_MAX_TIMEOUT 0x7F #define USB3_LPM_U2_MAX_TIMEOUT 0xFE #define USB3_LPM_DEVICE_INITIATED 0xFF struct usb_set_sel_req { __u8 u1_sel; __u8 u1_pel; __le16 u2_sel; __le16 u2_pel; } __attribute__ ((packed)); /* * The Set System Exit Latency control transfer provides one byte each for * U1 SEL and U1 PEL, so the max exit latency is 0xFF. U2 SEL and U2 PEL each * are two bytes long. */ #define USB3_LPM_MAX_U1_SEL_PEL 0xFF #define USB3_LPM_MAX_U2_SEL_PEL 0xFFFF /*-------------------------------------------------------------------------*/ /* * As per USB compliance update, a device that is actively drawing * more than 100mA from USB must report itself as bus-powered in * the GetStatus(DEVICE) call. * https://compliance.usb.org/index.asp?UpdateFile=Electrical&Format=Standard#34 */ #define USB_SELF_POWER_VBUS_MAX_DRAW 100 #endif /* _UAPI__LINUX_USB_CH9_H */ |
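/*
 * Illustrative sketch, not part of ch9.h: decoding the U1/U2 hub-initiated
 * LPM timeout fields described in the comments above. The helper names are
 * hypothetical; only the USB3_LPM_* macros come from this header. A U1
 * timeout is expressed directly in microseconds (0x01-0x7F), while a U2
 * timeout is expressed in 256-microsecond units (0x01-0xFE), so 0x01 means
 * 256 us, 0x02 means 512 us and 0xFE means 65024 us (65.024 ms).
 */
static inline int usb3_lpm_timeout_is_hub_initiated(__u8 timeout)
{
	/* 0x00 rejects the transition, 0xFF accepts it but never initiates. */
	return timeout != USB3_LPM_DISABLED &&
	       timeout != USB3_LPM_DEVICE_INITIATED;
}

static inline unsigned int usb3_lpm_u2_timeout_to_us(__u8 timeout)
{
	/* Only meaningful for hub-initiated U2 values (0x01..0xFE). */
	return (unsigned int)timeout * 256;
}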
| 17 3 2 1 15 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 | // SPDX-License-Identifier: GPL-2.0 /* * Out-of-line refcount functions. */ #include <linux/mutex.h> #include <linux/refcount.h> #include <linux/spinlock.h> #include <linux/bug.h> #define REFCOUNT_WARN(str) WARN_ONCE(1, "refcount_t: " str ".\n") void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t) { refcount_set(r, REFCOUNT_SATURATED); switch (t) { case REFCOUNT_ADD_NOT_ZERO_OVF: REFCOUNT_WARN("saturated; leaking memory"); break; case REFCOUNT_ADD_OVF: REFCOUNT_WARN("saturated; leaking memory"); break; case REFCOUNT_ADD_UAF: REFCOUNT_WARN("addition on 0; use-after-free"); break; case REFCOUNT_SUB_UAF: REFCOUNT_WARN("underflow; use-after-free"); break; case REFCOUNT_DEC_LEAK: REFCOUNT_WARN("decrement hit 0; leaking memory"); break; default: REFCOUNT_WARN("unknown saturation event!?"); } } EXPORT_SYMBOL(refcount_warn_saturate); /** * refcount_dec_if_one - decrement a refcount if it is 1 * @r: the refcount * * No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the * success thereof. * * Like all decrement operations, it provides release memory order and provides * a control dependency. * * It can be used like a try-delete operator; this explicit case is provided * and not cmpxchg in generic, because that would allow implementing unsafe * operations. * * Return: true if the resulting refcount is 0, false otherwise */ bool refcount_dec_if_one(refcount_t *r) { int val = 1; return atomic_try_cmpxchg_release(&r->refs, &val, 0); } EXPORT_SYMBOL(refcount_dec_if_one); /** * refcount_dec_not_one - decrement a refcount if it is not 1 * @r: the refcount * * No atomic_t counterpart, it decrements unless the value is 1, in which case * it will return false. * * Was often done like: atomic_add_unless(&var, -1, 1) * * Return: true if the decrement operation was successful, false otherwise */ bool refcount_dec_not_one(refcount_t *r) { unsigned int new, val = atomic_read(&r->refs); do { if (unlikely(val == REFCOUNT_SATURATED)) return true; if (val == 1) return false; new = val - 1; if (new > val) { WARN_ONCE(new > val, "refcount_t: underflow; use-after-free.\n"); return true; } } while (!atomic_try_cmpxchg_release(&r->refs, &val, new)); return true; } EXPORT_SYMBOL(refcount_dec_not_one); /** * refcount_dec_and_mutex_lock - return holding mutex if able to decrement * refcount to 0 * @r: the refcount * @lock: the mutex to be locked * * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail * to decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. * See the comment on top. 
* * Return: true and hold mutex if able to decrement refcount to 0, false * otherwise */ bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) { if (refcount_dec_not_one(r)) return false; mutex_lock(lock); if (!refcount_dec_and_test(r)) { mutex_unlock(lock); return false; } return true; } EXPORT_SYMBOL(refcount_dec_and_mutex_lock); /** * refcount_dec_and_lock - return holding spinlock if able to decrement * refcount to 0 * @r: the refcount * @lock: the spinlock to be locked * * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to * decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. * See the comment on top. * * Return: true and hold spinlock if able to decrement refcount to 0, false * otherwise */ bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) { if (refcount_dec_not_one(r)) return false; spin_lock(lock); if (!refcount_dec_and_test(r)) { spin_unlock(lock); return false; } return true; } EXPORT_SYMBOL(refcount_dec_and_lock); /** * refcount_dec_and_lock_irqsave - return holding spinlock with disabled * interrupts if able to decrement refcount to 0 * @r: the refcount * @lock: the spinlock to be locked * @flags: saved IRQ-flags if the lock is acquired * * Same as refcount_dec_and_lock() above except that the spinlock is acquired * with disabled interrupts. * * Return: true and hold spinlock if able to decrement refcount to 0, false * otherwise */ bool refcount_dec_and_lock_irqsave(refcount_t *r, spinlock_t *lock, unsigned long *flags) { if (refcount_dec_not_one(r)) return false; spin_lock_irqsave(lock, *flags); if (!refcount_dec_and_test(r)) { spin_unlock_irqrestore(lock, *flags); return false; } return true; } EXPORT_SYMBOL(refcount_dec_and_lock_irqsave); |
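/*
 * Illustrative sketch, not part of this file: the usual "put" pattern that
 * refcount_dec_and_lock() exists for. The struct, list and function names
 * below are hypothetical and assume <linux/list.h> and <linux/slab.h>; only
 * the refcount_* calls are the API implemented above.
 */
struct cached_item {
	refcount_t refs;
	struct list_head node;
};

static LIST_HEAD(item_list);
static DEFINE_SPINLOCK(item_lock);

static void cached_item_put(struct cached_item *item)
{
	/*
	 * refcount_dec_and_lock() returns true with item_lock held only when
	 * the count dropped to zero, so the unlink and free run exactly once
	 * and no concurrent lookup can take a new reference in between.
	 */
	if (!refcount_dec_and_lock(&item->refs, &item_lock))
		return;

	list_del(&item->node);
	spin_unlock(&item_lock);
	kfree(item);
}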
908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/core/dev_addr_lists.c - Functions for handling net device lists * Copyright (c) 2010 Jiri Pirko <jpirko@redhat.com> * * This file contains functions for working with unicast, multicast and device * addresses lists. */ #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/export.h> #include <linux/list.h> #include "dev.h" /* * General list handling functions */ static int __hw_addr_insert(struct netdev_hw_addr_list *list, struct netdev_hw_addr *new, int addr_len) { struct rb_node **ins_point = &list->tree.rb_node, *parent = NULL; struct netdev_hw_addr *ha; while (*ins_point) { int diff; ha = rb_entry(*ins_point, struct netdev_hw_addr, node); diff = memcmp(new->addr, ha->addr, addr_len); if (diff == 0) diff = memcmp(&new->type, &ha->type, sizeof(new->type)); parent = *ins_point; if (diff < 0) ins_point = &parent->rb_left; else if (diff > 0) ins_point = &parent->rb_right; else return -EEXIST; } rb_link_node_rcu(&new->node, parent, ins_point); rb_insert_color(&new->node, &list->tree); return 0; } static struct netdev_hw_addr* __hw_addr_create(const unsigned char *addr, int addr_len, unsigned char addr_type, bool global, bool sync) { struct netdev_hw_addr *ha; int alloc_size; alloc_size = sizeof(*ha); if (alloc_size < L1_CACHE_BYTES) alloc_size = L1_CACHE_BYTES; ha = kmalloc(alloc_size, GFP_ATOMIC); if (!ha) return NULL; memcpy(ha->addr, addr, addr_len); ha->type = addr_type; ha->refcount = 1; ha->global_use = global; ha->synced = sync ? 
1 : 0; ha->sync_cnt = 0; return ha; } static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type, bool global, bool sync, int sync_count, bool exclusive) { struct rb_node **ins_point = &list->tree.rb_node, *parent = NULL; struct netdev_hw_addr *ha; if (addr_len > MAX_ADDR_LEN) return -EINVAL; while (*ins_point) { int diff; ha = rb_entry(*ins_point, struct netdev_hw_addr, node); diff = memcmp(addr, ha->addr, addr_len); if (diff == 0) diff = memcmp(&addr_type, &ha->type, sizeof(addr_type)); parent = *ins_point; if (diff < 0) { ins_point = &parent->rb_left; } else if (diff > 0) { ins_point = &parent->rb_right; } else { if (exclusive) return -EEXIST; if (global) { /* check if addr is already used as global */ if (ha->global_use) return 0; else ha->global_use = true; } if (sync) { if (ha->synced && sync_count) return -EEXIST; else ha->synced++; } ha->refcount++; return 0; } } ha = __hw_addr_create(addr, addr_len, addr_type, global, sync); if (!ha) return -ENOMEM; rb_link_node(&ha->node, parent, ins_point); rb_insert_color(&ha->node, &list->tree); list_add_tail_rcu(&ha->list, &list->list); list->count++; return 0; } static int __hw_addr_add(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type) { return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false, 0, false); } static int __hw_addr_del_entry(struct netdev_hw_addr_list *list, struct netdev_hw_addr *ha, bool global, bool sync) { if (global && !ha->global_use) return -ENOENT; if (sync && !ha->synced) return -ENOENT; if (global) ha->global_use = false; if (sync) ha->synced--; if (--ha->refcount) return 0; rb_erase(&ha->node, &list->tree); list_del_rcu(&ha->list); kfree_rcu(ha, rcu_head); list->count--; return 0; } static struct netdev_hw_addr *__hw_addr_lookup(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type) { struct rb_node *node; node = list->tree.rb_node; while (node) { struct netdev_hw_addr *ha = rb_entry(node, struct netdev_hw_addr, node); int diff = memcmp(addr, ha->addr, addr_len); if (diff == 0 && addr_type) diff = memcmp(&addr_type, &ha->type, sizeof(addr_type)); if (diff < 0) node = node->rb_left; else if (diff > 0) node = node->rb_right; else return ha; } return NULL; } static int __hw_addr_del_ex(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type, bool global, bool sync) { struct netdev_hw_addr *ha = __hw_addr_lookup(list, addr, addr_len, addr_type); if (!ha) return -ENOENT; return __hw_addr_del_entry(list, ha, global, sync); } static int __hw_addr_del(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type) { return __hw_addr_del_ex(list, addr, addr_len, addr_type, false, false); } static int __hw_addr_sync_one(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr *ha, int addr_len) { int err; err = __hw_addr_add_ex(to_list, ha->addr, addr_len, ha->type, false, true, ha->sync_cnt, false); if (err && err != -EEXIST) return err; if (!err) { ha->sync_cnt++; ha->refcount++; } return 0; } static void __hw_addr_unsync_one(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, struct netdev_hw_addr *ha, int addr_len) { int err; err = __hw_addr_del_ex(to_list, ha->addr, addr_len, ha->type, false, true); if (err) return; ha->sync_cnt--; /* address on from list is not marked synced */ __hw_addr_del_entry(from_list, ha, false, false); } int 
__hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len) { int err = 0; struct netdev_hw_addr *ha, *tmp; list_for_each_entry_safe(ha, tmp, &from_list->list, list) { if (ha->sync_cnt == ha->refcount) { __hw_addr_unsync_one(to_list, from_list, ha, addr_len); } else { err = __hw_addr_sync_one(to_list, ha, addr_len); if (err) break; } } return err; } EXPORT_SYMBOL(__hw_addr_sync_multiple); /* This function only works where there is a strict 1-1 relationship * between source and destination of the sync. If you ever need to * sync addresses to more than one destination, you need to use * __hw_addr_sync_multiple(). */ int __hw_addr_sync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len) { int err = 0; struct netdev_hw_addr *ha, *tmp; list_for_each_entry_safe(ha, tmp, &from_list->list, list) { if (!ha->sync_cnt) { err = __hw_addr_sync_one(to_list, ha, addr_len); if (err) break; } else if (ha->refcount == 1) __hw_addr_unsync_one(to_list, from_list, ha, addr_len); } return err; } EXPORT_SYMBOL(__hw_addr_sync); void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len) { struct netdev_hw_addr *ha, *tmp; list_for_each_entry_safe(ha, tmp, &from_list->list, list) { if (ha->sync_cnt) __hw_addr_unsync_one(to_list, from_list, ha, addr_len); } } EXPORT_SYMBOL(__hw_addr_unsync); /** * __hw_addr_sync_dev - Synchronize device's multicast list * @list: address list to synchronize * @dev: device to sync * @sync: function to call if address should be added * @unsync: function to call if address should be removed * * This function is intended to be called from the ndo_set_rx_mode * function of devices that require explicit address add/remove * notifications. The unsync function may be NULL in which case * the addresses requiring removal will simply be removed without * any notification to the device. **/ int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*sync)(struct net_device *, const unsigned char *), int (*unsync)(struct net_device *, const unsigned char *)) { struct netdev_hw_addr *ha, *tmp; int err; /* first go through and flush out any stale entries */ list_for_each_entry_safe(ha, tmp, &list->list, list) { if (!ha->sync_cnt || ha->refcount != 1) continue; /* if unsync is defined and fails defer unsyncing address */ if (unsync && unsync(dev, ha->addr)) continue; ha->sync_cnt--; __hw_addr_del_entry(list, ha, false, false); } /* go through and sync new entries to the list */ list_for_each_entry_safe(ha, tmp, &list->list, list) { if (ha->sync_cnt) continue; err = sync(dev, ha->addr); if (err) return err; ha->sync_cnt++; ha->refcount++; } return 0; } EXPORT_SYMBOL(__hw_addr_sync_dev); /** * __hw_addr_ref_sync_dev - Synchronize device's multicast address list taking * into account references * @list: address list to synchronize * @dev: device to sync * @sync: function to call if address or reference on it should be added * @unsync: function to call if address or some reference on it should be removed * * This function is intended to be called from the ndo_set_rx_mode * function of devices that require explicit address or references on it * add/remove notifications. The unsync function may be NULL in which case * the addresses or references on it requiring removal will simply be * removed without any notification to the device.
That is responsibility of * the driver to identify and distribute address or references on it between * internal address tables. **/ int __hw_addr_ref_sync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*sync)(struct net_device *, const unsigned char *, int), int (*unsync)(struct net_device *, const unsigned char *, int)) { struct netdev_hw_addr *ha, *tmp; int err, ref_cnt; /* first go through and flush out any unsynced/stale entries */ list_for_each_entry_safe(ha, tmp, &list->list, list) { /* sync if address is not used */ if ((ha->sync_cnt << 1) <= ha->refcount) continue; /* if fails defer unsyncing address */ ref_cnt = ha->refcount - ha->sync_cnt; if (unsync && unsync(dev, ha->addr, ref_cnt)) continue; ha->refcount = (ref_cnt << 1) + 1; ha->sync_cnt = ref_cnt; __hw_addr_del_entry(list, ha, false, false); } /* go through and sync updated/new entries to the list */ list_for_each_entry_safe(ha, tmp, &list->list, list) { /* sync if address added or reused */ if ((ha->sync_cnt << 1) >= ha->refcount) continue; ref_cnt = ha->refcount - ha->sync_cnt; err = sync(dev, ha->addr, ref_cnt); if (err) return err; ha->refcount = ref_cnt << 1; ha->sync_cnt = ref_cnt; } return 0; } EXPORT_SYMBOL(__hw_addr_ref_sync_dev); /** * __hw_addr_ref_unsync_dev - Remove synchronized addresses and references on * it from device * @list: address list to remove synchronized addresses (references on it) from * @dev: device to sync * @unsync: function to call if address and references on it should be removed * * Remove all addresses that were added to the device by * __hw_addr_ref_sync_dev(). This function is intended to be called from the * ndo_stop or ndo_open functions on devices that require explicit address (or * references on it) add/remove notifications. If the unsync function pointer * is NULL then this function can be used to just reset the sync_cnt for the * addresses in the list. **/ void __hw_addr_ref_unsync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*unsync)(struct net_device *, const unsigned char *, int)) { struct netdev_hw_addr *ha, *tmp; list_for_each_entry_safe(ha, tmp, &list->list, list) { if (!ha->sync_cnt) continue; /* if fails defer unsyncing address */ if (unsync && unsync(dev, ha->addr, ha->sync_cnt)) continue; ha->refcount -= ha->sync_cnt - 1; ha->sync_cnt = 0; __hw_addr_del_entry(list, ha, false, false); } } EXPORT_SYMBOL(__hw_addr_ref_unsync_dev); /** * __hw_addr_unsync_dev - Remove synchronized addresses from device * @list: address list to remove synchronized addresses from * @dev: device to sync * @unsync: function to call if address should be removed * * Remove all addresses that were added to the device by __hw_addr_sync_dev(). * This function is intended to be called from the ndo_stop or ndo_open * functions on devices that require explicit address add/remove * notifications. If the unsync function pointer is NULL then this function * can be used to just reset the sync_cnt for the addresses in the list. 
**/ void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*unsync)(struct net_device *, const unsigned char *)) { struct netdev_hw_addr *ha, *tmp; list_for_each_entry_safe(ha, tmp, &list->list, list) { if (!ha->sync_cnt) continue; /* if unsync is defined and fails defer unsyncing address */ if (unsync && unsync(dev, ha->addr)) continue; ha->sync_cnt--; __hw_addr_del_entry(list, ha, false, false); } } EXPORT_SYMBOL(__hw_addr_unsync_dev); static void __hw_addr_flush(struct netdev_hw_addr_list *list) { struct netdev_hw_addr *ha, *tmp; list->tree = RB_ROOT; list_for_each_entry_safe(ha, tmp, &list->list, list) { list_del_rcu(&ha->list); kfree_rcu(ha, rcu_head); } list->count = 0; } void __hw_addr_init(struct netdev_hw_addr_list *list) { INIT_LIST_HEAD(&list->list); list->count = 0; list->tree = RB_ROOT; } EXPORT_SYMBOL(__hw_addr_init); /* * Device addresses handling functions */ /* Check that netdev->dev_addr is not written to directly as this would * break the rbtree layout. All changes should go thru dev_addr_set() and co. * Remove this check in mid-2024. */ void dev_addr_check(struct net_device *dev) { if (!memcmp(dev->dev_addr, dev->dev_addr_shadow, MAX_ADDR_LEN)) return; netdev_warn(dev, "Current addr: %*ph\n", MAX_ADDR_LEN, dev->dev_addr); netdev_warn(dev, "Expected addr: %*ph\n", MAX_ADDR_LEN, dev->dev_addr_shadow); netdev_WARN(dev, "Incorrect netdev->dev_addr\n"); } /** * dev_addr_flush - Flush device address list * @dev: device * * Flush device address list and reset ->dev_addr. * * The caller must hold the rtnl_mutex. */ void dev_addr_flush(struct net_device *dev) { /* rtnl_mutex must be held here */ dev_addr_check(dev); __hw_addr_flush(&dev->dev_addrs); dev->dev_addr = NULL; } /** * dev_addr_init - Init device address list * @dev: device * * Init device address list and create the first element, * used by ->dev_addr. * * The caller must hold the rtnl_mutex. */ int dev_addr_init(struct net_device *dev) { unsigned char addr[MAX_ADDR_LEN]; struct netdev_hw_addr *ha; int err; /* rtnl_mutex must be held here */ __hw_addr_init(&dev->dev_addrs); memset(addr, 0, sizeof(addr)); err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr), NETDEV_HW_ADDR_T_LAN); if (!err) { /* * Get the first (previously created) address from the list * and set dev_addr pointer to this location. */ ha = list_first_entry(&dev->dev_addrs.list, struct netdev_hw_addr, list); dev->dev_addr = ha->addr; } return err; } void dev_addr_mod(struct net_device *dev, unsigned int offset, const void *addr, size_t len) { struct netdev_hw_addr *ha; dev_addr_check(dev); ha = container_of(dev->dev_addr, struct netdev_hw_addr, addr[0]); rb_erase(&ha->node, &dev->dev_addrs.tree); memcpy(&ha->addr[offset], addr, len); memcpy(&dev->dev_addr_shadow[offset], addr, len); WARN_ON(__hw_addr_insert(&dev->dev_addrs, ha, dev->addr_len)); } EXPORT_SYMBOL(dev_addr_mod); /** * dev_addr_add - Add a device address * @dev: device * @addr: address to add * @addr_type: address type * * Add a device address to the device or increase the reference count if * it already exists. * * The caller must hold the rtnl_mutex. 
*/ int dev_addr_add(struct net_device *dev, const unsigned char *addr, unsigned char addr_type) { int err; ASSERT_RTNL(); err = netif_pre_changeaddr_notify(dev, addr, NULL); if (err) return err; err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return err; } EXPORT_SYMBOL(dev_addr_add); /** * dev_addr_del - Release a device address. * @dev: device * @addr: address to delete * @addr_type: address type * * Release reference to a device address and remove it from the device * if the reference count drops to zero. * * The caller must hold the rtnl_mutex. */ int dev_addr_del(struct net_device *dev, const unsigned char *addr, unsigned char addr_type) { int err; struct netdev_hw_addr *ha; ASSERT_RTNL(); /* * We can not remove the first address from the list because * dev->dev_addr points to that. */ ha = list_first_entry(&dev->dev_addrs.list, struct netdev_hw_addr, list); if (!memcmp(ha->addr, addr, dev->addr_len) && ha->type == addr_type && ha->refcount == 1) return -ENOENT; err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len, addr_type); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return err; } EXPORT_SYMBOL(dev_addr_del); /* * Unicast list handling functions */ /** * dev_uc_add_excl - Add a global secondary unicast address * @dev: device * @addr: address to add */ int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr) { int err; netif_addr_lock_bh(dev); err = __hw_addr_add_ex(&dev->uc, addr, dev->addr_len, NETDEV_HW_ADDR_T_UNICAST, true, false, 0, true); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(dev_uc_add_excl); /** * dev_uc_add - Add a secondary unicast address * @dev: device * @addr: address to add * * Add a secondary unicast address to the device or increase * the reference count if it already exists. */ int dev_uc_add(struct net_device *dev, const unsigned char *addr) { int err; netif_addr_lock_bh(dev); err = __hw_addr_add(&dev->uc, addr, dev->addr_len, NETDEV_HW_ADDR_T_UNICAST); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(dev_uc_add); /** * dev_uc_del - Release secondary unicast address. * @dev: device * @addr: address to delete * * Release reference to a secondary unicast address and remove it * from the device if the reference count drops to zero. */ int dev_uc_del(struct net_device *dev, const unsigned char *addr) { int err; netif_addr_lock_bh(dev); err = __hw_addr_del(&dev->uc, addr, dev->addr_len, NETDEV_HW_ADDR_T_UNICAST); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(dev_uc_del); /** * dev_uc_sync - Synchronize device's unicast list to another device * @to: destination device * @from: source device * * Add newly added addresses to the destination device and release * addresses that have no users left. The source device must be * locked by netif_addr_lock_bh. * * This function is intended to be called from the dev->set_rx_mode * function of layered software devices. This function assumes that * addresses will only ever be synced to the @to devices and no other. 
*/ int dev_uc_sync(struct net_device *to, struct net_device *from) { int err = 0; if (to->addr_len != from->addr_len) return -EINVAL; netif_addr_lock(to); err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len); if (!err) __dev_set_rx_mode(to); netif_addr_unlock(to); return err; } EXPORT_SYMBOL(dev_uc_sync); /** * dev_uc_sync_multiple - Synchronize device's unicast list to another * device, but allow for multiple calls to sync to multiple devices. * @to: destination device * @from: source device * * Add newly added addresses to the destination device and release * addresses that have been deleted from the source. The source device * must be locked by netif_addr_lock_bh. * * This function is intended to be called from the dev->set_rx_mode * function of layered software devices. It allows for a single source * device to be synced to multiple destination devices. */ int dev_uc_sync_multiple(struct net_device *to, struct net_device *from) { int err = 0; if (to->addr_len != from->addr_len) return -EINVAL; netif_addr_lock(to); err = __hw_addr_sync_multiple(&to->uc, &from->uc, to->addr_len); if (!err) __dev_set_rx_mode(to); netif_addr_unlock(to); return err; } EXPORT_SYMBOL(dev_uc_sync_multiple); /** * dev_uc_unsync - Remove synchronized addresses from the destination device * @to: destination device * @from: source device * * Remove all addresses that were added to the destination device by * dev_uc_sync(). This function is intended to be called from the * dev->stop function of layered software devices. */ void dev_uc_unsync(struct net_device *to, struct net_device *from) { if (to->addr_len != from->addr_len) return; /* netif_addr_lock_bh() uses lockdep subclass 0, this is okay for two * reasons: * 1) This is always called without any addr_list_lock, so as the * outermost one here, it must be 0. * 2) This is called by some callers after unlinking the upper device, * so the dev->lower_level becomes 1 again. * Therefore, the subclass for 'from' is 0, for 'to' is either 1 or * larger. */ netif_addr_lock_bh(from); netif_addr_lock(to); __hw_addr_unsync(&to->uc, &from->uc, to->addr_len); __dev_set_rx_mode(to); netif_addr_unlock(to); netif_addr_unlock_bh(from); } EXPORT_SYMBOL(dev_uc_unsync); /** * dev_uc_flush - Flush unicast addresses * @dev: device * * Flush unicast addresses. */ void dev_uc_flush(struct net_device *dev) { netif_addr_lock_bh(dev); __hw_addr_flush(&dev->uc); netif_addr_unlock_bh(dev); } EXPORT_SYMBOL(dev_uc_flush); /** * dev_uc_init - Init unicast address list * @dev: device * * Init unicast address list. 
*/ void dev_uc_init(struct net_device *dev) { __hw_addr_init(&dev->uc); } EXPORT_SYMBOL(dev_uc_init); /* * Multicast list handling functions */ /** * dev_mc_add_excl - Add a global secondary multicast address * @dev: device * @addr: address to add */ int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr) { int err; netif_addr_lock_bh(dev); err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len, NETDEV_HW_ADDR_T_MULTICAST, true, false, 0, true); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(dev_mc_add_excl); static int __dev_mc_add(struct net_device *dev, const unsigned char *addr, bool global) { int err; netif_addr_lock_bh(dev); err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len, NETDEV_HW_ADDR_T_MULTICAST, global, false, 0, false); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); return err; } /** * dev_mc_add - Add a multicast address * @dev: device * @addr: address to add * * Add a multicast address to the device or increase * the reference count if it already exists. */ int dev_mc_add(struct net_device *dev, const unsigned char *addr) { return __dev_mc_add(dev, addr, false); } EXPORT_SYMBOL(dev_mc_add); /** * dev_mc_add_global - Add a global multicast address * @dev: device * @addr: address to add * * Add a global multicast address to the device. */ int dev_mc_add_global(struct net_device *dev, const unsigned char *addr) { return __dev_mc_add(dev, addr, true); } EXPORT_SYMBOL(dev_mc_add_global); static int __dev_mc_del(struct net_device *dev, const unsigned char *addr, bool global) { int err; netif_addr_lock_bh(dev); err = __hw_addr_del_ex(&dev->mc, addr, dev->addr_len, NETDEV_HW_ADDR_T_MULTICAST, global, false); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); return err; } /** * dev_mc_del - Delete a multicast address. * @dev: device * @addr: address to delete * * Release reference to a multicast address and remove it * from the device if the reference count drops to zero. */ int dev_mc_del(struct net_device *dev, const unsigned char *addr) { return __dev_mc_del(dev, addr, false); } EXPORT_SYMBOL(dev_mc_del); /** * dev_mc_del_global - Delete a global multicast address. * @dev: device * @addr: address to delete * * Release reference to a multicast address and remove it * from the device if the reference count drops to zero. */ int dev_mc_del_global(struct net_device *dev, const unsigned char *addr) { return __dev_mc_del(dev, addr, true); } EXPORT_SYMBOL(dev_mc_del_global); /** * dev_mc_sync - Synchronize device's multicast list to another device * @to: destination device * @from: source device * * Add newly added addresses to the destination device and release * addresses that have no users left. The source device must be * locked by netif_addr_lock_bh. * * This function is intended to be called from the ndo_set_rx_mode * function of layered software devices. */ int dev_mc_sync(struct net_device *to, struct net_device *from) { int err = 0; if (to->addr_len != from->addr_len) return -EINVAL; netif_addr_lock(to); err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len); if (!err) __dev_set_rx_mode(to); netif_addr_unlock(to); return err; } EXPORT_SYMBOL(dev_mc_sync); /** * dev_mc_sync_multiple - Synchronize device's multicast list to another * device, but allow for multiple calls to sync to multiple devices. * @to: destination device * @from: source device * * Add newly added addresses to the destination device and release * addresses that have no users left. 
The source device must be * locked by netif_addr_lock_bh. * * This function is intended to be called from the ndo_set_rx_mode * function of layered software devices. It allows for a single * source device to be synced to multiple destination devices. */ int dev_mc_sync_multiple(struct net_device *to, struct net_device *from) { int err = 0; if (to->addr_len != from->addr_len) return -EINVAL; netif_addr_lock(to); err = __hw_addr_sync_multiple(&to->mc, &from->mc, to->addr_len); if (!err) __dev_set_rx_mode(to); netif_addr_unlock(to); return err; } EXPORT_SYMBOL(dev_mc_sync_multiple); /** * dev_mc_unsync - Remove synchronized addresses from the destination device * @to: destination device * @from: source device * * Remove all addresses that were added to the destination device by * dev_mc_sync(). This function is intended to be called from the * dev->stop function of layered software devices. */ void dev_mc_unsync(struct net_device *to, struct net_device *from) { if (to->addr_len != from->addr_len) return; /* See the above comments inside dev_uc_unsync(). */ netif_addr_lock_bh(from); netif_addr_lock(to); __hw_addr_unsync(&to->mc, &from->mc, to->addr_len); __dev_set_rx_mode(to); netif_addr_unlock(to); netif_addr_unlock_bh(from); } EXPORT_SYMBOL(dev_mc_unsync); /** * dev_mc_flush - Flush multicast addresses * @dev: device * * Flush multicast addresses. */ void dev_mc_flush(struct net_device *dev) { netif_addr_lock_bh(dev); __hw_addr_flush(&dev->mc); netif_addr_unlock_bh(dev); } EXPORT_SYMBOL(dev_mc_flush); /** * dev_mc_init - Init multicast address list * @dev: device * * Init multicast address list. */ void dev_mc_init(struct net_device *dev) { __hw_addr_init(&dev->mc); } EXPORT_SYMBOL(dev_mc_init); |
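/*
 * Illustrative sketch, not part of this file: how a layered software device
 * (a VLAN-style upper device) typically uses the sync helpers above, as the
 * dev_uc_sync()/dev_mc_sync() kernel-doc describes. The priv layout and
 * function names are hypothetical; the dev_uc_sync(), dev_mc_sync() and
 * corresponding unsync calls are the exported API from this file.
 */
struct upper_priv {
	struct net_device *lowerdev;	/* real device we transmit through */
};

static void upper_ndo_set_rx_mode(struct net_device *dev)
{
	struct upper_priv *priv = netdev_priv(dev);

	/* Propagate our secondary unicast/multicast filters to the lower
	 * device; addresses with no users left are released there.
	 */
	dev_uc_sync(priv->lowerdev, dev);
	dev_mc_sync(priv->lowerdev, dev);
}

static int upper_ndo_stop(struct net_device *dev)
{
	struct upper_priv *priv = netdev_priv(dev);

	/* Drop everything we previously synced to the lower device. */
	dev_uc_unsync(priv->lowerdev, dev);
	dev_mc_unsync(priv->lowerdev, dev);
	return 0;
}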
| 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_NAMEI_H #define _LINUX_NAMEI_H #include <linux/fs.h> #include <linux/kernel.h> #include <linux/path.h> #include <linux/fcntl.h> #include <linux/errno.h> #include <linux/fs_struct.h> enum { MAX_NESTED_LINKS = 8 }; #define MAXSYMLINKS 40 /* * Type of the last component on LOOKUP_PARENT */ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT}; /* pathwalk mode */ #define LOOKUP_FOLLOW BIT(0) /* follow links at the end */ #define LOOKUP_DIRECTORY BIT(1) /* require a directory */ #define LOOKUP_AUTOMOUNT BIT(2) /* force terminal automount */ #define LOOKUP_EMPTY BIT(3) /* accept empty path [user_... only] */ #define LOOKUP_LINKAT_EMPTY BIT(4) /* Linkat request with empty path. */ #define LOOKUP_DOWN BIT(5) /* follow mounts in the starting point */ #define LOOKUP_MOUNTPOINT BIT(6) /* follow mounts in the end */ #define LOOKUP_REVAL BIT(7) /* tell ->d_revalidate() to trust no cache */ #define LOOKUP_RCU BIT(8) /* RCU pathwalk mode; semi-internal */ #define LOOKUP_CACHED BIT(9) /* Only do cached lookup */ #define LOOKUP_PARENT BIT(10) /* Looking up final parent in path */ /* 5 spare bits for pathwalk */ /* These tell filesystem methods that we are dealing with the final component... */ #define LOOKUP_OPEN BIT(16) /* ... in open */ #define LOOKUP_CREATE BIT(17) /* ... in object creation */ #define LOOKUP_EXCL BIT(18) /* ... in target must not exist */ #define LOOKUP_RENAME_TARGET BIT(19) /* ... in destination of rename() */ /* 4 spare bits for intent */ /* Scoping flags for lookup. */ #define LOOKUP_NO_SYMLINKS BIT(24) /* No symlink crossing. */ #define LOOKUP_NO_MAGICLINKS BIT(25) /* No nd_jump_link() crossing. */ #define LOOKUP_NO_XDEV BIT(26) /* No mountpoint crossing. */ #define LOOKUP_BENEATH BIT(27) /* No escaping from starting point. */ #define LOOKUP_IN_ROOT BIT(28) /* Treat dirfd as fs root. */ /* LOOKUP_* flags which do scope-related checks based on the dirfd. 
*/ #define LOOKUP_IS_SCOPED (LOOKUP_BENEATH | LOOKUP_IN_ROOT) /* 3 spare bits for scoping */ extern int path_pts(struct path *path); extern int user_path_at(int, const char __user *, unsigned, struct path *); extern int kern_path(const char *, unsigned, struct path *); struct dentry *kern_path_parent(const char *name, struct path *parent); extern struct dentry *start_creating_path(int, const char *, struct path *, unsigned int); extern struct dentry *start_creating_user_path(int, const char __user *, struct path *, unsigned int); extern void end_creating_path(const struct path *, struct dentry *); extern struct dentry *start_removing_path(const char *, struct path *); extern struct dentry *start_removing_user_path_at(int , const char __user *, struct path *); static inline void end_removing_path(const struct path *path , struct dentry *dentry) { end_creating_path(path, dentry); } int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root); int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); extern struct dentry *try_lookup_noperm(struct qstr *, struct dentry *); extern struct dentry *lookup_noperm(struct qstr *, struct dentry *); extern struct dentry *lookup_noperm_unlocked(struct qstr *, struct dentry *); extern struct dentry *lookup_noperm_positive_unlocked(struct qstr *, struct dentry *); struct dentry *lookup_one(struct mnt_idmap *, struct qstr *, struct dentry *); struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct qstr *name, struct dentry *base); struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, struct qstr *name, struct dentry *base); struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap, struct qstr *name, struct dentry *base); struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent, struct qstr *name); struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent, struct qstr *name); struct dentry *start_creating_killable(struct mnt_idmap *idmap, struct dentry *parent, struct qstr *name); struct dentry *start_removing_killable(struct mnt_idmap *idmap, struct dentry *parent, struct qstr *name); struct dentry *start_creating_noperm(struct dentry *parent, struct qstr *name); struct dentry *start_removing_noperm(struct dentry *parent, struct qstr *name); struct dentry *start_creating_dentry(struct dentry *parent, struct dentry *child); struct dentry *start_removing_dentry(struct dentry *parent, struct dentry *child); /* end_creating - finish action started with start_creating * @child: dentry returned by start_creating() or vfs_mkdir() * * Unlock and release the child. This can be called after * start_creating() whether that function succeeded or not, * but it is not needed on failure. * * If vfs_mkdir() was called then the value returned from that function * should be given for @child rather than the original dentry, as vfs_mkdir() * may have provided a new dentry. * * * If vfs_mkdir() was not called, then @child will be a valid dentry and * @parent will be ignored. */ static inline void end_creating(struct dentry *child) { end_dirop(child); } /* end_creating_keep - finish action started with start_creating() and return result * @child: dentry returned by start_creating() or vfs_mkdir() * * Unlock and return the child. This can be called after * start_creating() whether that function succeeded or not, * but it is not needed on failure. 
* * If vfs_mkdir() was called then the value returned from that function * should be given for @child rather than the original dentry, as vfs_mkdir() * may have provided a new dentry. * * Returns: @child, which may be a dentry or an error. * */ static inline struct dentry *end_creating_keep(struct dentry *child) { if (!IS_ERR(child)) dget(child); end_dirop(child); return child; } /** * end_removing - finish action started with start_removing * @child: dentry returned by start_removing() * @parent: dentry given to start_removing() * * Unlock and release the child. * * This is identical to end_dirop(). It can be passed the result of * start_removing() whether that was successful or not, but it is not needed * if start_removing() failed. */ static inline void end_removing(struct dentry *child) { end_dirop(child); } extern int follow_down_one(struct path *); extern int follow_down(struct path *path, unsigned int flags); extern int follow_up(struct path *); int start_renaming(struct renamedata *rd, int lookup_flags, struct qstr *old_last, struct qstr *new_last); int start_renaming_dentry(struct renamedata *rd, int lookup_flags, struct dentry *old_dentry, struct qstr *new_last); int start_renaming_two_dentries(struct renamedata *rd, struct dentry *old_dentry, struct dentry *new_dentry); void end_renaming(struct renamedata *rd); /** * mode_strip_umask - handle vfs umask stripping * @dir: parent directory of the new inode * @mode: mode of the new inode to be created in @dir * * In most filesystems, umask stripping depends on whether or not the * filesystem supports POSIX ACLs. If the filesystem doesn't support it umask * stripping is done directly in here. If the filesystem does support POSIX * ACLs umask stripping is deferred until the filesystem calls * posix_acl_create(). * * Some filesystems (like NFSv4) also want to avoid umask stripping by the * VFS, but don't support POSIX ACLs. Those filesystems can set SB_I_NOUMASK * to get this effect without declaring that they support POSIX ACLs. * * Returns: mode */ static inline umode_t __must_check mode_strip_umask(const struct inode *dir, umode_t mode) { if (!IS_POSIXACL(dir) && !(dir->i_sb->s_iflags & SB_I_NOUMASK)) mode &= ~current_umask(); return mode; } extern int __must_check nd_jump_link(const struct path *path); static inline void nd_terminate_link(void *name, size_t len, size_t maxlen) { ((char *) name)[min(len, maxlen)] = '\0'; } /** * retry_estale - determine whether the caller should retry an operation * @error: the error that would currently be returned * @flags: flags being used for next lookup attempt * * Check to see if the error code was -ESTALE, and then determine whether * to retry the call based on whether "flags" already has LOOKUP_REVAL set. * * Returns true if the caller should try the operation again. */ static inline bool retry_estale(const long error, const unsigned int flags) { return unlikely(error == -ESTALE && !(flags & LOOKUP_REVAL)); } #endif /* _LINUX_NAMEI_H */ |
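/*
 * Illustrative sketch, not part of this header: the retry pattern that
 * retry_estale() is meant for. A lookup failing with -ESTALE is retried
 * exactly once with LOOKUP_REVAL set, telling ->d_revalidate() to trust no
 * cache on the second pass. The function name is hypothetical;
 * user_path_at(), retry_estale() and the LOOKUP_* flags are declared above.
 */
static inline int example_resolve_path(int dfd, const char __user *name,
				       struct path *path)
{
	unsigned int lookup_flags = LOOKUP_FOLLOW;
	int error;

retry:
	error = user_path_at(dfd, name, lookup_flags, path);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
	/* On success the caller owns a path reference and must path_put() it. */
	return error;
}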
| 3 3 2 21 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_NETFILTER_H #define __LINUX_NETFILTER_H #include <linux/init.h> #include <linux/skbuff.h> #include <linux/net.h> #include <linux/if.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/wait.h> #include <linux/list.h> #include <linux/static_key.h> #include <linux/module.h> #include <linux/netfilter_defs.h> #include <linux/netdevice.h> #include <linux/sockptr.h> #include <net/net_namespace.h> static inline int NF_DROP_GETERR(int verdict) { return -(verdict >> NF_VERDICT_QBITS); } static __always_inline int NF_DROP_REASON(struct sk_buff *skb, enum skb_drop_reason reason, u32 err) { BUILD_BUG_ON(err > 0xffff); kfree_skb_reason(skb, reason); return ((err << 16) | NF_STOLEN); } static inline int nf_inet_addr_cmp(const union nf_inet_addr *a1, const union nf_inet_addr *a2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul1 = (const unsigned long *)a1; const unsigned long *ul2 = (const unsigned long *)a2; return ((ul1[0] ^ ul2[0]) | (ul1[1] ^ ul2[1])) == 0UL; #else return a1->all[0] == a2->all[0] && a1->all[1] == a2->all[1] && a1->all[2] == a2->all[2] && a1->all[3] == a2->all[3]; #endif } static inline void nf_inet_addr_mask(const union nf_inet_addr *a1, union nf_inet_addr *result, const union nf_inet_addr *mask) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ua = (const unsigned long *)a1; unsigned long *ur = (unsigned long *)result; const unsigned long *um = (const unsigned long *)mask; ur[0] = ua[0] & um[0]; ur[1] = 
ua[1] & um[1]; #else result->all[0] = a1->all[0] & mask->all[0]; result->all[1] = a1->all[1] & mask->all[1]; result->all[2] = a1->all[2] & mask->all[2]; result->all[3] = a1->all[3] & mask->all[3]; #endif } int netfilter_init(void); struct sk_buff; struct nf_hook_ops; struct sock; struct nf_hook_state { u8 hook; u8 pf; struct net_device *in; struct net_device *out; struct sock *sk; struct net *net; int (*okfn)(struct net *, struct sock *, struct sk_buff *); }; typedef unsigned int nf_hookfn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); enum nf_hook_ops_type { NF_HOOK_OP_UNDEFINED, NF_HOOK_OP_NF_TABLES, NF_HOOK_OP_BPF, NF_HOOK_OP_NFT_FT, }; struct nf_hook_ops { struct list_head list; struct rcu_head rcu; /* User fills in from here down. */ nf_hookfn *hook; struct net_device *dev; void *priv; u8 pf; enum nf_hook_ops_type hook_ops_type:8; unsigned int hooknum; /* Hooks are ordered in ascending priority. */ int priority; }; struct nf_hook_entry { nf_hookfn *hook; void *priv; }; struct nf_hook_entries_rcu_head { struct rcu_head head; void *allocation; }; struct nf_hook_entries { u16 num_hook_entries; /* padding */ struct nf_hook_entry hooks[]; /* trailer: pointers to original orig_ops of each hook, * followed by rcu_head and scratch space used for freeing * the structure via call_rcu. * * This is not part of struct nf_hook_entry since its only * needed in slow path (hook register/unregister): * const struct nf_hook_ops *orig_ops[] * * For the same reason, we store this at end -- its * only needed when a hook is deleted, not during * packet path processing: * struct nf_hook_entries_rcu_head head */ }; #ifdef CONFIG_NETFILTER static inline struct nf_hook_ops **nf_hook_entries_get_hook_ops(const struct nf_hook_entries *e) { unsigned int n = e->num_hook_entries; const void *hook_end; hook_end = &e->hooks[n]; /* this is *past* ->hooks[]! */ return (struct nf_hook_ops **)hook_end; } static inline int nf_hook_entry_hookfn(const struct nf_hook_entry *entry, struct sk_buff *skb, struct nf_hook_state *state) { return entry->hook(entry->priv, skb, state); } static inline void nf_hook_state_init(struct nf_hook_state *p, unsigned int hook, u_int8_t pf, struct net_device *indev, struct net_device *outdev, struct sock *sk, struct net *net, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { p->hook = hook; p->pf = pf; p->in = indev; p->out = outdev; p->sk = sk; p->net = net; p->okfn = okfn; } struct nf_sockopt_ops { struct list_head list; u_int8_t pf; /* Non-inclusive ranges: use 0/0/NULL to never get called. */ int set_optmin; int set_optmax; int (*set)(struct sock *sk, int optval, sockptr_t arg, unsigned int len); int get_optmin; int get_optmax; int (*get)(struct sock *sk, int optval, void __user *user, int *len); /* Use the module struct to lock set/get code in place */ struct module *owner; }; /* Function to register/unregister hook points. */ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *ops); void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *ops); int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg, unsigned int n); void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, unsigned int n); /* Functions to register get/setsockopt ranges (non-inclusive). You need to check permissions yourself! 
*/ int nf_register_sockopt(struct nf_sockopt_ops *reg); void nf_unregister_sockopt(struct nf_sockopt_ops *reg); #ifdef CONFIG_JUMP_LABEL extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; #endif int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, const struct nf_hook_entries *e, unsigned int i); void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state, const struct nf_hook_entries *e); /** * nf_hook - call a netfilter hook * * Returns 1 if the hook has allowed the packet to pass. The function * okfn must be invoked by the caller in this case. Any other return * value indicates the packet has been consumed by the hook. */ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { struct nf_hook_entries *hook_head = NULL; int ret = 1; #ifdef CONFIG_JUMP_LABEL if (__builtin_constant_p(pf) && __builtin_constant_p(hook) && !static_key_false(&nf_hooks_needed[pf][hook])) return 1; #endif rcu_read_lock(); switch (pf) { case NFPROTO_IPV4: hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]); break; case NFPROTO_IPV6: hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]); break; case NFPROTO_ARP: #ifdef CONFIG_NETFILTER_FAMILY_ARP if (WARN_ON_ONCE(hook >= ARRAY_SIZE(net->nf.hooks_arp))) break; hook_head = rcu_dereference(net->nf.hooks_arp[hook]); #endif break; case NFPROTO_BRIDGE: #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE hook_head = rcu_dereference(net->nf.hooks_bridge[hook]); #endif break; default: WARN_ON_ONCE(1); break; } if (hook_head) { struct nf_hook_state state; nf_hook_state_init(&state, hook, pf, indev, outdev, sk, net, okfn); ret = nf_hook_slow(skb, &state, hook_head, 0); } rcu_read_unlock(); return ret; } /* Activate hook; either okfn or kfree_skb called, unless a hook returns NF_STOLEN (in which case, it's up to the hook to deal with the consequences). Returns -ERRNO if packet dropped. Zero means queued, stolen or accepted. */ /* RR: > I don't want nf_hook to return anything because people might forget > about async and trust the return value to mean "packet was ok". 
AK: Just document it clearly, then you can expect some sense from kernel coders :) */ static inline int NF_HOOK_COND(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *), bool cond) { int ret; if (!cond || ((ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn)) == 1)) ret = okfn(net, sk, skb); return ret; } static inline int NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn); if (ret == 1) ret = okfn(net, sk, skb); return ret; } static inline void NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct list_head *head, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { struct nf_hook_entries *hook_head = NULL; #ifdef CONFIG_JUMP_LABEL if (__builtin_constant_p(pf) && __builtin_constant_p(hook) && !static_key_false(&nf_hooks_needed[pf][hook])) return; #endif rcu_read_lock(); switch (pf) { case NFPROTO_IPV4: hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]); break; case NFPROTO_IPV6: hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]); break; default: WARN_ON_ONCE(1); break; } if (hook_head) { struct nf_hook_state state; nf_hook_state_init(&state, hook, pf, in, out, sk, net, okfn); nf_hook_slow_list(head, &state, hook_head); } rcu_read_unlock(); } /* Call setsockopt() */ int nf_setsockopt(struct sock *sk, u_int8_t pf, int optval, sockptr_t opt, unsigned int len); int nf_getsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt, int *len); struct flowi; struct nf_queue_entry; __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u_int8_t protocol, unsigned short family); __sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, unsigned int len, u_int8_t protocol, unsigned short family); int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict, unsigned short family); #include <net/flow.h> struct nf_conn; enum nf_nat_manip_type; struct nlattr; struct nf_nat_hook { int (*parse_nat_setup)(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr); void (*decode_session)(struct sk_buff *skb, struct flowi *fl); void (*remove_nat_bysrc)(struct nf_conn *ct); }; extern const struct nf_nat_hook __rcu *nf_nat_hook; static inline void nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) { #if IS_ENABLED(CONFIG_NF_NAT) const struct nf_nat_hook *nat_hook; rcu_read_lock(); nat_hook = rcu_dereference(nf_nat_hook); if (nat_hook && nat_hook->decode_session) nat_hook->decode_session(skb, fl); rcu_read_unlock(); #endif } #else /* !CONFIG_NETFILTER */ static inline int NF_HOOK_COND(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *), bool cond) { return okfn(net, sk, skb); } static inline int NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { return okfn(net, sk, skb); } static inline void NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, 
struct sock *sk, struct list_head *head, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { /* nothing to do */ } static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { return 1; } struct flowi; static inline void nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) { } #endif /*CONFIG_NETFILTER*/ #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <linux/netfilter/nf_conntrack_zones_common.h> void nf_ct_attach(struct sk_buff *, const struct sk_buff *); void nf_ct_set_closing(struct nf_conntrack *nfct); struct nf_conntrack_tuple; bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb); #else static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} static inline void nf_ct_set_closing(struct nf_conntrack *nfct) {} struct nf_conntrack_tuple; static inline bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb) { return false; } #endif struct nf_conn; enum ip_conntrack_info; struct nf_ct_hook { int (*update)(struct net *net, struct sk_buff *skb); void (*destroy)(struct nf_conntrack *); bool (*get_tuple_skb)(struct nf_conntrack_tuple *, const struct sk_buff *); void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb); void (*set_closing)(struct nf_conntrack *nfct); int (*confirm)(struct sk_buff *skb); u32 (*get_id)(const struct nf_conntrack *nfct); }; extern const struct nf_ct_hook __rcu *nf_ct_hook; struct nlattr; struct nfnl_ct_hook { size_t (*build_size)(const struct nf_conn *ct); int (*build)(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, u_int16_t ct_attr, u_int16_t ct_info_attr); int (*parse)(const struct nlattr *attr, struct nf_conn *ct); int (*attach_expect)(const struct nlattr *attr, struct nf_conn *ct, u32 portid, u32 report); void (*seq_adjust)(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, s32 off); }; extern const struct nfnl_ct_hook __rcu *nfnl_ct_hook; struct nf_defrag_hook { struct module *owner; int (*enable)(struct net *net); void (*disable)(struct net *net); }; extern const struct nf_defrag_hook __rcu *nf_defrag_v4_hook; extern const struct nf_defrag_hook __rcu *nf_defrag_v6_hook; /* * Contains bitmask of ctnetlink event subscribers, if any. * Can't be pernet due to NETLINK_LISTEN_ALL_NSID setsockopt flag. */ extern u8 nf_ctnetlink_has_listener; #endif /*__LINUX_NETFILTER_H*/ |
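/*
 * Illustrative sketch (not part of netfilter.h): how a module might use the
 * nf_hook_ops / nf_register_net_hook() API declared above to attach a hook
 * at the IPv4 LOCAL_IN point. The function and variable names here are
 * hypothetical, and the chosen hook point and priority are only one possible
 * configuration.
 */
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

static unsigned int example_hookfn(void *priv, struct sk_buff *skb,
				   const struct nf_hook_state *state)
{
	/* Accept every packet; a real hook would inspect skb here. */
	return NF_ACCEPT;
}

static const struct nf_hook_ops example_ops = {
	.hook     = example_hookfn,
	.pf       = NFPROTO_IPV4,
	.hooknum  = NF_INET_LOCAL_IN,
	.priority = NF_IP_PRI_FILTER,
};

static int __init example_init(void)
{
	/* Register in the initial network namespace only. */
	return nf_register_net_hook(&init_net, &example_ops);
}

static void __exit example_exit(void)
{
	nf_unregister_net_hook(&init_net, &example_ops);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");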
// SPDX-License-Identifier: GPL-2.0
/*
 * Interface between ext4 and JBD
 */

#include "ext4_jbd2.h"

#include <trace/events/ext4.h>

int ext4_inode_journal_mode(struct inode *inode)
{
	if (EXT4_JOURNAL(inode) == NULL)
		return EXT4_INODE_WRITEBACK_DATA_MODE;	/* writeback */
	/* We do not support data journalling with delayed allocation */
	if (!S_ISREG(inode->i_mode) ||
	    ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
	    test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
	    (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
	    !test_opt(inode->i_sb, DELALLOC))) {
		/* We do not support data journalling for encrypted data */
		if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
			return EXT4_INODE_ORDERED_DATA_MODE;	/* ordered */
		return EXT4_INODE_JOURNAL_DATA_MODE;	/* journal data */
	}
	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
		return EXT4_INODE_ORDERED_DATA_MODE;	/* ordered */
	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
		return EXT4_INODE_WRITEBACK_DATA_MODE;	/* writeback */
	BUG();
}

/* Just increment the non-pointer handle value */
static handle_t *ext4_get_nojournal(void)
{
	handle_t *handle = current->journal_info;
	unsigned long ref_cnt = (unsigned long)handle;

	BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);

	ref_cnt++;
	handle = (handle_t *)ref_cnt;

	current->journal_info = handle;
	return handle;
}

/* Decrement the non-pointer handle value */
static void ext4_put_nojournal(handle_t *handle)
{
	unsigned long ref_cnt = (unsigned long)handle;

	BUG_ON(ref_cnt == 0);

	ref_cnt--;
	handle = (handle_t *)ref_cnt;

	current->journal_info = handle;
}

/*
 * Wrappers for jbd2_journal_start/end.
*/ static int ext4_journal_check_start(struct super_block *sb) { int ret; journal_t *journal; might_sleep(); ret = ext4_emergency_state(sb); if (unlikely(ret)) return ret; if (WARN_ON_ONCE(sb_rdonly(sb))) return -EROFS; WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); journal = EXT4_SB(sb)->s_journal; /* * Special case here: if the journal has aborted behind our * backs (eg. EIO in the commit thread), then we still need to * take the FS itself readonly cleanly. */ if (journal && is_journal_aborted(journal)) { ext4_abort(sb, -journal->j_errno, "Detected aborted journal"); return -EROFS; } return 0; } handle_t *__ext4_journal_start_sb(struct inode *inode, struct super_block *sb, unsigned int line, int type, int blocks, int rsv_blocks, int revoke_creds) { journal_t *journal; int err; if (inode) trace_ext4_journal_start_inode(inode, blocks, rsv_blocks, revoke_creds, type, _RET_IP_); else trace_ext4_journal_start_sb(sb, blocks, rsv_blocks, revoke_creds, type, _RET_IP_); err = ext4_journal_check_start(sb); if (err < 0) return ERR_PTR(err); journal = EXT4_SB(sb)->s_journal; if (!journal || (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) return ext4_get_nojournal(); return jbd2__journal_start(journal, blocks, rsv_blocks, revoke_creds, GFP_NOFS, type, line); } int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) { struct super_block *sb; int err; int rc; if (!ext4_handle_valid(handle)) { ext4_put_nojournal(handle); return 0; } err = handle->h_err; if (!handle->h_transaction) { rc = jbd2_journal_stop(handle); return err ? err : rc; } sb = handle->h_transaction->t_journal->j_private; rc = jbd2_journal_stop(handle); if (!err) err = rc; if (err) __ext4_std_error(sb, where, line, err); return err; } handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, int type) { struct super_block *sb; int err; if (!ext4_handle_valid(handle)) return ext4_get_nojournal(); sb = handle->h_journal->j_private; trace_ext4_journal_start_reserved(sb, jbd2_handle_buffer_credits(handle), _RET_IP_); err = ext4_journal_check_start(sb); if (err < 0) { jbd2_journal_free_reserved(handle); return ERR_PTR(err); } err = jbd2_journal_start_reserved(handle, type, line); if (err < 0) return ERR_PTR(err); return handle; } int __ext4_journal_ensure_credits(handle_t *handle, int check_cred, int extend_cred, int revoke_cred) { if (!ext4_handle_valid(handle)) return 0; if (is_handle_aborted(handle)) return -EROFS; if (jbd2_handle_buffer_credits(handle) >= check_cred && handle->h_revoke_credits >= revoke_cred) return 0; extend_cred = max(0, extend_cred - jbd2_handle_buffer_credits(handle)); revoke_cred = max(0, revoke_cred - handle->h_revoke_credits); return ext4_journal_extend(handle, extend_cred, revoke_cred); } static void ext4_journal_abort_handle(const char *caller, unsigned int line, const char *err_fn, struct buffer_head *bh, handle_t *handle, int err) { char nbuf[16]; const char *errstr = ext4_decode_error(NULL, err, nbuf); BUG_ON(!ext4_handle_valid(handle)); if (bh) BUFFER_TRACE(bh, "abort"); if (!handle->h_err) handle->h_err = err; if (is_handle_aborted(handle)) return; printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n", caller, line, errstr, err_fn); jbd2_journal_abort_handle(handle); } static void ext4_check_bdev_write_error(struct super_block *sb) { struct address_space *mapping = sb->s_bdev->bd_mapping; struct ext4_sb_info *sbi = EXT4_SB(sb); int err; /* * If the block device has write error flag, it may have failed to * async write out metadata buffers in the 
background. In this case, * we could read old data from disk and write it out again, which * may lead to on-disk filesystem inconsistency. */ if (errseq_check(&mapping->wb_err, READ_ONCE(sbi->s_bdev_wb_err))) { spin_lock(&sbi->s_bdev_wb_lock); err = errseq_check_and_advance(&mapping->wb_err, &sbi->s_bdev_wb_err); spin_unlock(&sbi->s_bdev_wb_lock); if (err) ext4_error_err(sb, -err, "Error while async write back metadata"); } } int __ext4_journal_get_write_access(const char *where, unsigned int line, handle_t *handle, struct super_block *sb, struct buffer_head *bh, enum ext4_journal_trigger_type trigger_type) { int err; might_sleep(); if (ext4_handle_valid(handle)) { err = jbd2_journal_get_write_access(handle, bh); if (err) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); return err; } } else ext4_check_bdev_write_error(sb); if (trigger_type == EXT4_JTR_NONE || !ext4_has_feature_metadata_csum(sb)) return 0; BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); jbd2_journal_set_triggers(bh, &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers); return 0; } /* * The ext4 forget function must perform a revoke if we are freeing data * which has been journaled. Metadata (eg. indirect blocks) must be * revoked in all cases. * * "bh" may be NULL: a metadata block may have been freed from memory * but there may still be a record of it in the journal, and that record * still needs to be revoked. */ int __ext4_forget(const char *where, unsigned int line, handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr) { int err; might_sleep(); trace_ext4_forget(inode, is_metadata, blocknr); BUFFER_TRACE(bh, "enter"); ext4_debug("forgetting bh %p: is_metadata=%d, mode %o, data mode %x\n", bh, is_metadata, inode->i_mode, test_opt(inode->i_sb, DATA_FLAGS)); /* * In the no journal case, we should wait for the ongoing buffer * to complete and do a forget. */ if (!ext4_handle_valid(handle)) { if (bh) { clear_buffer_dirty(bh); wait_on_buffer(bh); __bforget(bh); } return 0; } /* Never use the revoke function if we are doing full data * journaling: there is no need to, and a V1 superblock won't * support it. Otherwise, only skip the revoke on un-journaled * data blocks. 
*/ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || (!is_metadata && !ext4_should_journal_data(inode))) { if (bh) { BUFFER_TRACE(bh, "call jbd2_journal_forget"); err = jbd2_journal_forget(handle, bh); if (err) ext4_journal_abort_handle(where, line, __func__, bh, handle, err); return err; } return 0; } /* * data!=journal && (is_metadata || should_journal_data(inode)) */ BUFFER_TRACE(bh, "call jbd2_journal_revoke"); err = jbd2_journal_revoke(handle, blocknr, bh); if (err) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); __ext4_error(inode->i_sb, where, line, true, -err, 0, "error %d when attempting revoke", err); } BUFFER_TRACE(bh, "exit"); return err; } int __ext4_journal_get_create_access(const char *where, unsigned int line, handle_t *handle, struct super_block *sb, struct buffer_head *bh, enum ext4_journal_trigger_type trigger_type) { int err; if (!ext4_handle_valid(handle)) return 0; err = jbd2_journal_get_create_access(handle, bh); if (err) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); return err; } if (trigger_type == EXT4_JTR_NONE || !ext4_has_feature_metadata_csum(sb)) return 0; BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); jbd2_journal_set_triggers(bh, &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers); return 0; } int __ext4_handle_dirty_metadata(const char *where, unsigned int line, handle_t *handle, struct inode *inode, struct buffer_head *bh) { int err = 0; might_sleep(); set_buffer_meta(bh); set_buffer_prio(bh); set_buffer_uptodate(bh); if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); /* Errors can only happen due to aborted journal or a nasty bug */ if (!is_handle_aborted(handle) && WARN_ON_ONCE(err)) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); if (inode == NULL) { pr_err("EXT4: jbd2_journal_dirty_metadata " "failed: handle type %u started at " "line %u, credits %u/%u, errcode %d", handle->h_type, handle->h_line_no, handle->h_requested_credits, jbd2_handle_buffer_credits(handle), err); return err; } ext4_error_inode(inode, where, line, bh->b_blocknr, "journal_dirty_metadata failed: " "handle type %u started at line %u, " "credits %u/%u, errcode %d", handle->h_type, handle->h_line_no, handle->h_requested_credits, jbd2_handle_buffer_credits(handle), err); } } else { if (inode) mmb_mark_buffer_dirty(bh, &EXT4_I(inode)->i_metadata_bhs); else mark_buffer_dirty(bh); if (inode && inode_needs_sync(inode)) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) { ext4_error_inode_err(inode, where, line, bh->b_blocknr, EIO, "IO error syncing itable block"); err = -EIO; } } } return err; } |
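/*
 * Illustrative sketch (not part of ext4_jbd2.c): the typical calling pattern
 * for the handle wrappers implemented above, as used by ext4 callers. The
 * function name is hypothetical, error handling is abbreviated, and the
 * single-block credit estimate is only an example.
 */
static int example_update_one_block(struct inode *inode, struct buffer_head *bh)
{
	handle_t *handle;
	int err;

	/* Start (or nest into) a transaction with credits for one block. */
	handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* Ask jbd2 for write access before modifying the buffer. */
	err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
					    EXT4_JTR_NONE);
	if (!err) {
		/* ... modify bh->b_data under the handle here ... */
		err = ext4_handle_dirty_metadata(handle, inode, bh);
	}

	/* Stop the handle; jbd2 commits the transaction asynchronously. */
	ext4_journal_stop(handle);
	return err;
}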
// SPDX-License-Identifier: GPL-2.0
/*
 * bus.c - bus driver management
 *
 * Copyright (c) 2002-3 Patrick Mochel
 * Copyright (c) 2002-3 Open Source Development Labs
 * Copyright (c) 2007 Greg Kroah-Hartman <gregkh@suse.de>
 * Copyright (c) 2007 Novell Inc.
* Copyright (c) 2023 Greg Kroah-Hartman <gregkh@linuxfoundation.org> */ #include <linux/async.h> #include <linux/device/bus.h> #include <linux/device.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/string.h> #include <linux/mutex.h> #include <linux/sysfs.h> #include "base.h" #include "power/power.h" /* /sys/devices/system */ static struct kset *system_kset; /* /sys/bus */ static struct kset *bus_kset; #define to_bus_attr(_attr) container_of(_attr, struct bus_attribute, attr) /* * sysfs bindings for drivers */ #define to_drv_attr(_attr) container_of(_attr, struct driver_attribute, attr) #define DRIVER_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) \ struct driver_attribute driver_attr_##_name = \ __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) static int __must_check bus_rescan_devices_helper(struct device *dev, void *data); /** * bus_to_subsys - Turn a struct bus_type into a struct subsys_private * * @bus: pointer to the struct bus_type to look up * * The driver core internals needs to work on the subsys_private structure, not * the external struct bus_type pointer. This function walks the list of * registered busses in the system and finds the matching one and returns the * internal struct subsys_private that relates to that bus. * * Note, the reference count of the return value is INCREMENTED if it is not * NULL. A call to subsys_put() must be done when finished with the pointer in * order for it to be properly freed. */ struct subsys_private *bus_to_subsys(const struct bus_type *bus) { struct subsys_private *sp = NULL; struct kobject *kobj; if (!bus || !bus_kset) return NULL; spin_lock(&bus_kset->list_lock); if (list_empty(&bus_kset->list)) goto done; list_for_each_entry(kobj, &bus_kset->list, entry) { struct kset *kset = container_of(kobj, struct kset, kobj); sp = container_of_const(kset, struct subsys_private, subsys); if (sp->bus == bus) goto done; } sp = NULL; done: sp = subsys_get(sp); spin_unlock(&bus_kset->list_lock); return sp; } static const struct bus_type *bus_get(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); if (sp) return bus; return NULL; } static void bus_put(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); /* two puts are required as the call to bus_to_subsys incremented it again */ subsys_put(sp); subsys_put(sp); } static ssize_t drv_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct driver_attribute *drv_attr = to_drv_attr(attr); struct driver_private *drv_priv = to_driver(kobj); ssize_t ret = -EIO; if (drv_attr->show) ret = drv_attr->show(drv_priv->driver, buf); return ret; } static ssize_t drv_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct driver_attribute *drv_attr = to_drv_attr(attr); struct driver_private *drv_priv = to_driver(kobj); ssize_t ret = -EIO; if (drv_attr->store) ret = drv_attr->store(drv_priv->driver, buf, count); return ret; } static const struct sysfs_ops driver_sysfs_ops = { .show = drv_attr_show, .store = drv_attr_store, }; static void driver_release(struct kobject *kobj) { struct driver_private *drv_priv = to_driver(kobj); pr_debug("driver: '%s': %s\n", kobject_name(kobj), __func__); kfree(drv_priv); } static const struct kobj_type driver_ktype = { .sysfs_ops = &driver_sysfs_ops, .release = driver_release, }; /* * sysfs bindings for buses */ static ssize_t bus_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct 
bus_attribute *bus_attr = to_bus_attr(attr); struct subsys_private *subsys_priv = to_subsys_private(kobj); /* return -EIO for reading a bus attribute without show() */ ssize_t ret = -EIO; if (bus_attr->show) ret = bus_attr->show(subsys_priv->bus, buf); return ret; } static ssize_t bus_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct bus_attribute *bus_attr = to_bus_attr(attr); struct subsys_private *subsys_priv = to_subsys_private(kobj); /* return -EIO for writing a bus attribute without store() */ ssize_t ret = -EIO; if (bus_attr->store) ret = bus_attr->store(subsys_priv->bus, buf, count); return ret; } static const struct sysfs_ops bus_sysfs_ops = { .show = bus_attr_show, .store = bus_attr_store, }; int bus_create_file(const struct bus_type *bus, struct bus_attribute *attr) { struct subsys_private *sp = bus_to_subsys(bus); int error; if (!sp) return -EINVAL; error = sysfs_create_file(&sp->subsys.kobj, &attr->attr); subsys_put(sp); return error; } EXPORT_SYMBOL_GPL(bus_create_file); void bus_remove_file(const struct bus_type *bus, struct bus_attribute *attr) { struct subsys_private *sp = bus_to_subsys(bus); if (!sp) return; sysfs_remove_file(&sp->subsys.kobj, &attr->attr); subsys_put(sp); } EXPORT_SYMBOL_GPL(bus_remove_file); static void bus_release(struct kobject *kobj) { struct subsys_private *priv = to_subsys_private(kobj); lockdep_unregister_key(&priv->lock_key); kfree(priv); } static const struct kobj_type bus_ktype = { .sysfs_ops = &bus_sysfs_ops, .release = bus_release, }; static int bus_uevent_filter(const struct kobject *kobj) { const struct kobj_type *ktype = get_ktype(kobj); if (ktype == &bus_ktype) return 1; return 0; } static const struct kset_uevent_ops bus_uevent_ops = { .filter = bus_uevent_filter, }; /* Manually detach a device from its associated driver. */ static ssize_t unbind_store(struct device_driver *drv, const char *buf, size_t count) { const struct bus_type *bus = bus_get(drv->bus); struct device *dev; int err = -ENODEV; dev = bus_find_device_by_name(bus, NULL, buf); if (dev && dev->driver == drv) { device_driver_detach(dev); err = count; } put_device(dev); bus_put(bus); return err; } static DRIVER_ATTR_IGNORE_LOCKDEP(unbind, 0200, NULL, unbind_store); /* * Manually attach a device to a driver. * Note: the driver must want to bind to the device, * it is not possible to override the driver's id table. 
*/ static ssize_t bind_store(struct device_driver *drv, const char *buf, size_t count) { const struct bus_type *bus = bus_get(drv->bus); struct device *dev; int err = -ENODEV; dev = bus_find_device_by_name(bus, NULL, buf); if (dev && driver_match_device(drv, dev)) { err = device_driver_attach(drv, dev); if (!err) { /* success */ err = count; } } put_device(dev); bus_put(bus); return err; } static DRIVER_ATTR_IGNORE_LOCKDEP(bind, 0200, NULL, bind_store); static ssize_t drivers_autoprobe_show(const struct bus_type *bus, char *buf) { struct subsys_private *sp = bus_to_subsys(bus); int ret; if (!sp) return -EINVAL; ret = sysfs_emit(buf, "%d\n", sp->drivers_autoprobe); subsys_put(sp); return ret; } static ssize_t drivers_autoprobe_store(const struct bus_type *bus, const char *buf, size_t count) { struct subsys_private *sp = bus_to_subsys(bus); if (!sp) return -EINVAL; if (buf[0] == '0') sp->drivers_autoprobe = 0; else sp->drivers_autoprobe = 1; subsys_put(sp); return count; } static ssize_t drivers_probe_store(const struct bus_type *bus, const char *buf, size_t count) { struct device *dev; int err = -EINVAL; dev = bus_find_device_by_name(bus, NULL, buf); if (!dev) return -ENODEV; if (bus_rescan_devices_helper(dev, NULL) == 0) err = count; put_device(dev); return err; } static struct device *next_device(struct klist_iter *i) { struct klist_node *n = klist_next(i); struct device *dev = NULL; struct device_private *dev_prv; if (n) { dev_prv = to_device_private_bus(n); dev = dev_prv->device; } return dev; } static struct device *prev_device(struct klist_iter *i) { struct klist_node *n = klist_prev(i); struct device *dev = NULL; struct device_private *dev_prv; if (n) { dev_prv = to_device_private_bus(n); dev = dev_prv->device; } return dev; } /** * bus_for_each_dev - device iterator. * @bus: bus type. * @start: device to start iterating from. * @data: data for the callback. * @fn: function to be called for each device. * * Iterate over @bus's list of devices, and call @fn for each, * passing it @data. If @start is not NULL, we use that device to * begin iterating from. * * We check the return of @fn each time. If it returns anything * other than 0, we break out and return that value. * * NOTE: The device that returns a non-zero value is not retained * in any way, nor is its refcount incremented. If the caller needs * to retain this data, it should do so, and increment the reference * count in the supplied callback. */ int bus_for_each_dev(const struct bus_type *bus, struct device *start, void *data, device_iter_t fn) { struct subsys_private *sp = bus_to_subsys(bus); struct klist_iter i; struct device *dev; int error = 0; if (!sp) return -EINVAL; klist_iter_init_node(&sp->klist_devices, &i, (start ? &start->p->knode_bus : NULL)); while (!error && (dev = next_device(&i))) error = fn(dev, data); klist_iter_exit(&i); subsys_put(sp); return error; } EXPORT_SYMBOL_GPL(bus_for_each_dev); /** * bus_find_device - device iterator for locating a particular device. * @bus: bus type * @start: Device to begin with * @data: Data to pass to match function * @match: Callback function to check device * * This is similar to the bus_for_each_dev() function above, but it * returns a reference to a device that is 'found' for later use, as * determined by the @match callback. * * The callback should return 0 if the device doesn't match and non-zero * if it does. If the callback returns non-zero, this function will * return to the caller and not iterate over any more devices. 
*/ struct device *bus_find_device(const struct bus_type *bus, struct device *start, const void *data, device_match_t match) { struct subsys_private *sp = bus_to_subsys(bus); struct klist_iter i; struct device *dev; if (!sp) return NULL; klist_iter_init_node(&sp->klist_devices, &i, (start ? &start->p->knode_bus : NULL)); while ((dev = next_device(&i))) { if (match(dev, data)) { get_device(dev); break; } } klist_iter_exit(&i); subsys_put(sp); return dev; } EXPORT_SYMBOL_GPL(bus_find_device); struct device *bus_find_device_reverse(const struct bus_type *bus, struct device *start, const void *data, device_match_t match) { struct subsys_private *sp = bus_to_subsys(bus); struct klist_iter i; struct device *dev; if (!sp) return NULL; klist_iter_init_node(&sp->klist_devices, &i, (start ? &start->p->knode_bus : NULL)); while ((dev = prev_device(&i))) { if (match(dev, data)) { get_device(dev); break; } } klist_iter_exit(&i); subsys_put(sp); return dev; } EXPORT_SYMBOL_GPL(bus_find_device_reverse); static struct device_driver *next_driver(struct klist_iter *i) { struct klist_node *n = klist_next(i); struct driver_private *drv_priv; if (n) { drv_priv = container_of(n, struct driver_private, knode_bus); return drv_priv->driver; } return NULL; } /** * bus_for_each_drv - driver iterator * @bus: bus we're dealing with. * @start: driver to start iterating on. * @data: data to pass to the callback. * @fn: function to call for each driver. * * This is nearly identical to the device iterator above. * We iterate over each driver that belongs to @bus, and call * @fn for each. If @fn returns anything but 0, we break out * and return it. If @start is not NULL, we use it as the head * of the list. * * NOTE: we don't return the driver that returns a non-zero * value, nor do we leave the reference count incremented for that * driver. If the caller needs to know that info, it must set it * in the callback. It must also be sure to increment the refcount * so it doesn't disappear before returning to the caller. */ int bus_for_each_drv(const struct bus_type *bus, struct device_driver *start, void *data, int (*fn)(struct device_driver *, void *)) { struct subsys_private *sp = bus_to_subsys(bus); struct klist_iter i; struct device_driver *drv; int error = 0; if (!sp) return -EINVAL; klist_iter_init_node(&sp->klist_drivers, &i, start ? &start->p->knode_bus : NULL); while ((drv = next_driver(&i)) && !error) error = fn(drv, data); klist_iter_exit(&i); subsys_put(sp); return error; } EXPORT_SYMBOL_GPL(bus_for_each_drv); static ssize_t driver_override_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int ret; ret = __device_set_driver_override(dev, buf, count); if (ret) return ret; return count; } static ssize_t driver_override_show(struct device *dev, struct device_attribute *attr, char *buf) { guard(spinlock)(&dev->driver_override.lock); return sysfs_emit(buf, "%s\n", dev->driver_override.name); } static DEVICE_ATTR_RW(driver_override); static struct attribute *driver_override_dev_attrs[] = { &dev_attr_driver_override.attr, NULL, }; static const struct attribute_group driver_override_dev_group = { .attrs = driver_override_dev_attrs, }; /** * bus_add_device - add device to bus * @dev: device being added * * - Add device's bus attributes. * - Create links to device's bus. * - Add the device to its bus's list of devices. 
*/ int bus_add_device(struct device *dev) { struct subsys_private *sp = bus_to_subsys(dev->bus); int error; if (!sp) { /* * This is a normal operation for many devices that do not * have a bus assigned to them, just say that all went * well. */ return 0; } /* * Reference in sp is now incremented and will be dropped when * the device is removed from the bus */ pr_debug("bus: '%s': add device %s\n", sp->bus->name, dev_name(dev)); error = device_add_groups(dev, sp->bus->dev_groups); if (error) goto out_put; if (dev->bus->driver_override) { error = device_add_group(dev, &driver_override_dev_group); if (error) goto out_groups; } error = sysfs_create_link(&sp->devices_kset->kobj, &dev->kobj, dev_name(dev)); if (error) goto out_override; error = sysfs_create_link(&dev->kobj, &sp->subsys.kobj, "subsystem"); if (error) goto out_subsys; klist_add_tail(&dev->p->knode_bus, &sp->klist_devices); return 0; out_subsys: sysfs_remove_link(&sp->devices_kset->kobj, dev_name(dev)); out_override: if (dev->bus->driver_override) device_remove_group(dev, &driver_override_dev_group); out_groups: device_remove_groups(dev, sp->bus->dev_groups); out_put: subsys_put(sp); return error; } /** * bus_probe_device - probe drivers for a new device * @dev: device to probe * * - Automatically probe for a driver if the bus allows it. */ void bus_probe_device(struct device *dev) { struct subsys_private *sp = bus_to_subsys(dev->bus); struct subsys_interface *sif; if (!sp) return; device_initial_probe(dev); mutex_lock(&sp->mutex); list_for_each_entry(sif, &sp->interfaces, node) if (sif->add_dev) sif->add_dev(dev, sif); mutex_unlock(&sp->mutex); subsys_put(sp); } /** * bus_remove_device - remove device from bus * @dev: device to be removed * * - Remove device from all interfaces. * - Remove symlink from bus' directory. * - Delete device from bus's list. * - Detach from its driver. * - Drop reference taken in bus_add_device(). 
*/ void bus_remove_device(struct device *dev) { struct subsys_private *sp = bus_to_subsys(dev->bus); struct subsys_interface *sif; if (!sp) return; mutex_lock(&sp->mutex); list_for_each_entry(sif, &sp->interfaces, node) if (sif->remove_dev) sif->remove_dev(dev, sif); mutex_unlock(&sp->mutex); sysfs_remove_link(&dev->kobj, "subsystem"); sysfs_remove_link(&sp->devices_kset->kobj, dev_name(dev)); if (dev->bus->driver_override) device_remove_group(dev, &driver_override_dev_group); device_remove_groups(dev, dev->bus->dev_groups); if (klist_node_attached(&dev->p->knode_bus)) klist_del(&dev->p->knode_bus); pr_debug("bus: '%s': remove device %s\n", dev->bus->name, dev_name(dev)); device_release_driver(dev); /* * Decrement the reference count twice, once for the bus_to_subsys() * call in the start of this function, and the second one from the * reference increment in bus_add_device() */ subsys_put(sp); subsys_put(sp); } static int __must_check add_bind_files(struct device_driver *drv) { int ret; ret = driver_create_file(drv, &driver_attr_unbind); if (ret == 0) { ret = driver_create_file(drv, &driver_attr_bind); if (ret) driver_remove_file(drv, &driver_attr_unbind); } return ret; } static void remove_bind_files(struct device_driver *drv) { driver_remove_file(drv, &driver_attr_bind); driver_remove_file(drv, &driver_attr_unbind); } static BUS_ATTR_WO(drivers_probe); static BUS_ATTR_RW(drivers_autoprobe); static int add_probe_files(const struct bus_type *bus) { int retval; retval = bus_create_file(bus, &bus_attr_drivers_probe); if (retval) goto out; retval = bus_create_file(bus, &bus_attr_drivers_autoprobe); if (retval) bus_remove_file(bus, &bus_attr_drivers_probe); out: return retval; } static void remove_probe_files(const struct bus_type *bus) { bus_remove_file(bus, &bus_attr_drivers_autoprobe); bus_remove_file(bus, &bus_attr_drivers_probe); } static ssize_t uevent_store(struct device_driver *drv, const char *buf, size_t count) { int rc; rc = kobject_synth_uevent(&drv->p->kobj, buf, count); return rc ? rc : count; } static DRIVER_ATTR_WO(uevent); /** * bus_add_driver - Add a driver to the bus. * @drv: driver. */ int bus_add_driver(struct device_driver *drv) { struct subsys_private *sp = bus_to_subsys(drv->bus); struct driver_private *priv; int error = 0; if (!sp) return -EINVAL; /* * Reference in sp is now incremented and will be dropped when * the driver is removed from the bus */ pr_debug("bus: '%s': add driver %s\n", sp->bus->name, drv->name); priv = kzalloc_obj(*priv); if (!priv) { error = -ENOMEM; goto out_put_bus; } klist_init(&priv->klist_devices, NULL, NULL); priv->driver = drv; drv->p = priv; priv->kobj.kset = sp->drivers_kset; error = kobject_init_and_add(&priv->kobj, &driver_ktype, NULL, "%s", drv->name); if (error) goto out_unregister; klist_add_tail(&priv->knode_bus, &sp->klist_drivers); if (sp->drivers_autoprobe) { error = driver_attach(drv); if (error) goto out_del_list; } error = module_add_driver(drv->owner, drv); if (error) { printk(KERN_ERR "%s: failed to create module links for %s\n", __func__, drv->name); goto out_detach; } error = driver_create_file(drv, &driver_attr_uevent); if (error) { printk(KERN_ERR "%s: uevent attr (%s) failed\n", __func__, drv->name); } error = driver_add_groups(drv, sp->bus->drv_groups); if (error) { /* How the hell do we get out of this pickle? 
Give up */ printk(KERN_ERR "%s: driver_add_groups(%s) failed\n", __func__, drv->name); } if (!drv->suppress_bind_attrs) { error = add_bind_files(drv); if (error) { /* Ditto */ printk(KERN_ERR "%s: add_bind_files(%s) failed\n", __func__, drv->name); } } return 0; out_detach: driver_detach(drv); out_del_list: klist_del(&priv->knode_bus); out_unregister: kobject_put(&priv->kobj); /* drv->p is freed in driver_release() */ drv->p = NULL; out_put_bus: subsys_put(sp); return error; } /** * bus_remove_driver - delete driver from bus's knowledge. * @drv: driver. * * Detach the driver from the devices it controls, and remove * it from its bus's list of drivers. Finally, we drop the reference * to the bus we took in bus_add_driver(). */ void bus_remove_driver(struct device_driver *drv) { struct subsys_private *sp = bus_to_subsys(drv->bus); if (!sp) return; pr_debug("bus: '%s': remove driver %s\n", sp->bus->name, drv->name); if (!drv->suppress_bind_attrs) remove_bind_files(drv); driver_remove_groups(drv, sp->bus->drv_groups); driver_remove_file(drv, &driver_attr_uevent); klist_remove(&drv->p->knode_bus); driver_detach(drv); module_remove_driver(drv); kobject_put(&drv->p->kobj); /* * Decrement the reference count twice, once for the bus_to_subsys() * call in the start of this function, and the second one from the * reference increment in bus_add_driver() */ subsys_put(sp); subsys_put(sp); } /* Helper for bus_rescan_devices's iter */ static int __must_check bus_rescan_devices_helper(struct device *dev, void *data) { int ret = 0; if (!dev->driver) { if (dev->parent && dev->bus->need_parent_lock) device_lock(dev->parent); ret = device_attach(dev); if (dev->parent && dev->bus->need_parent_lock) device_unlock(dev->parent); } return ret < 0 ? ret : 0; } /** * bus_rescan_devices - rescan devices on the bus for possible drivers * @bus: the bus to scan. * * This function will look for devices on the bus with no driver * attached and rescan it against existing drivers to see if it matches * any by calling device_attach() for the unbound devices. */ int bus_rescan_devices(const struct bus_type *bus) { return bus_for_each_dev(bus, NULL, NULL, bus_rescan_devices_helper); } EXPORT_SYMBOL_GPL(bus_rescan_devices); /** * device_reprobe - remove driver for a device and probe for a new driver * @dev: the device to reprobe * * This function detaches the attached driver (if any) for the given * device and restarts the driver probing process. It is intended * to use if probing criteria changed during a devices lifetime and * driver attachment should change accordingly. */ int device_reprobe(struct device *dev) { if (dev->driver) device_driver_detach(dev); return bus_rescan_devices_helper(dev, NULL); } EXPORT_SYMBOL_GPL(device_reprobe); static void klist_devices_get(struct klist_node *n) { struct device_private *dev_prv = to_device_private_bus(n); struct device *dev = dev_prv->device; get_device(dev); } static void klist_devices_put(struct klist_node *n) { struct device_private *dev_prv = to_device_private_bus(n); struct device *dev = dev_prv->device; put_device(dev); } static ssize_t bus_uevent_store(const struct bus_type *bus, const char *buf, size_t count) { struct subsys_private *sp = bus_to_subsys(bus); int ret; if (!sp) return -EINVAL; ret = kobject_synth_uevent(&sp->subsys.kobj, buf, count); subsys_put(sp); if (ret) return ret; return count; } /* * "open code" the old BUS_ATTR() macro here. 
We want to use BUS_ATTR_WO() * here, but can not use it as earlier in the file we have * DEVICE_ATTR_WO(uevent), which would cause a clash with the with the store * function name. */ static struct bus_attribute bus_attr_uevent = __ATTR(uevent, 0200, NULL, bus_uevent_store); /** * bus_register - register a driver-core subsystem * @bus: bus to register * * Once we have that, we register the bus with the kobject * infrastructure, then register the children subsystems it has: * the devices and drivers that belong to the subsystem. */ int bus_register(const struct bus_type *bus) { int retval; struct subsys_private *priv; struct kobject *bus_kobj; struct lock_class_key *key; priv = kzalloc_obj(struct subsys_private); if (!priv) return -ENOMEM; priv->bus = bus; BLOCKING_INIT_NOTIFIER_HEAD(&priv->bus_notifier); bus_kobj = &priv->subsys.kobj; retval = kobject_set_name(bus_kobj, "%s", bus->name); if (retval) goto out; bus_kobj->kset = bus_kset; bus_kobj->ktype = &bus_ktype; priv->drivers_autoprobe = 1; retval = kset_register(&priv->subsys); if (retval) goto out; retval = bus_create_file(bus, &bus_attr_uevent); if (retval) goto bus_uevent_fail; priv->devices_kset = kset_create_and_add("devices", NULL, bus_kobj); if (!priv->devices_kset) { retval = -ENOMEM; goto bus_devices_fail; } priv->drivers_kset = kset_create_and_add("drivers", NULL, bus_kobj); if (!priv->drivers_kset) { retval = -ENOMEM; goto bus_drivers_fail; } INIT_LIST_HEAD(&priv->interfaces); key = &priv->lock_key; lockdep_register_key(key); __mutex_init(&priv->mutex, "subsys mutex", key); klist_init(&priv->klist_devices, klist_devices_get, klist_devices_put); klist_init(&priv->klist_drivers, NULL, NULL); retval = add_probe_files(bus); if (retval) goto bus_probe_files_fail; retval = sysfs_create_groups(bus_kobj, bus->bus_groups); if (retval) goto bus_groups_fail; pr_debug("bus: '%s': registered\n", bus->name); return 0; bus_groups_fail: remove_probe_files(bus); bus_probe_files_fail: kset_unregister(priv->drivers_kset); bus_drivers_fail: kset_unregister(priv->devices_kset); bus_devices_fail: bus_remove_file(bus, &bus_attr_uevent); bus_uevent_fail: kset_unregister(&priv->subsys); /* Above kset_unregister() will kfree @priv */ priv = NULL; out: kfree(priv); return retval; } EXPORT_SYMBOL_GPL(bus_register); /** * bus_unregister - remove a bus from the system * @bus: bus. * * Unregister the child subsystems and the bus itself. 
* Finally, we call bus_put() to release the refcount */ void bus_unregister(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); struct kobject *bus_kobj; if (!sp) return; pr_debug("bus: '%s': unregistering\n", bus->name); if (sp->dev_root) device_unregister(sp->dev_root); bus_kobj = &sp->subsys.kobj; sysfs_remove_groups(bus_kobj, bus->bus_groups); remove_probe_files(bus); bus_remove_file(bus, &bus_attr_uevent); kset_unregister(sp->drivers_kset); kset_unregister(sp->devices_kset); kset_unregister(&sp->subsys); subsys_put(sp); } EXPORT_SYMBOL_GPL(bus_unregister); int bus_register_notifier(const struct bus_type *bus, struct notifier_block *nb) { struct subsys_private *sp = bus_to_subsys(bus); int retval; if (!sp) return -EINVAL; retval = blocking_notifier_chain_register(&sp->bus_notifier, nb); subsys_put(sp); return retval; } EXPORT_SYMBOL_GPL(bus_register_notifier); int bus_unregister_notifier(const struct bus_type *bus, struct notifier_block *nb) { struct subsys_private *sp = bus_to_subsys(bus); int retval; if (!sp) return -EINVAL; retval = blocking_notifier_chain_unregister(&sp->bus_notifier, nb); subsys_put(sp); return retval; } EXPORT_SYMBOL_GPL(bus_unregister_notifier); void bus_notify(struct device *dev, enum bus_notifier_event value) { struct subsys_private *sp = bus_to_subsys(dev->bus); if (!sp) return; blocking_notifier_call_chain(&sp->bus_notifier, value, dev); subsys_put(sp); } struct kset *bus_get_kset(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); struct kset *kset; if (!sp) return NULL; kset = &sp->subsys; subsys_put(sp); return kset; } EXPORT_SYMBOL_GPL(bus_get_kset); /* * Yes, this forcibly breaks the klist abstraction temporarily. It * just wants to sort the klist, not change reference counts and * take/drop locks rapidly in the process. It does all this while * holding the lock for the list, so objects can't otherwise be * added/removed while we're swizzling. */ static void device_insertion_sort_klist(struct device *a, struct list_head *list, int (*compare)(const struct device *a, const struct device *b)) { struct klist_node *n; struct device_private *dev_prv; struct device *b; list_for_each_entry(n, list, n_node) { dev_prv = to_device_private_bus(n); b = dev_prv->device; if (compare(a, b) <= 0) { list_move_tail(&a->p->knode_bus.n_node, &b->p->knode_bus.n_node); return; } } list_move_tail(&a->p->knode_bus.n_node, list); } void bus_sort_breadthfirst(const struct bus_type *bus, int (*compare)(const struct device *a, const struct device *b)) { struct subsys_private *sp = bus_to_subsys(bus); LIST_HEAD(sorted_devices); struct klist_node *n, *tmp; struct device_private *dev_prv; struct device *dev; struct klist *device_klist; if (!sp) return; device_klist = &sp->klist_devices; spin_lock(&device_klist->k_lock); list_for_each_entry_safe(n, tmp, &device_klist->k_list, n_node) { dev_prv = to_device_private_bus(n); dev = dev_prv->device; device_insertion_sort_klist(dev, &sorted_devices, compare); } list_splice(&sorted_devices, &device_klist->k_list); spin_unlock(&device_klist->k_lock); subsys_put(sp); } EXPORT_SYMBOL_GPL(bus_sort_breadthfirst); struct subsys_dev_iter { struct klist_iter ki; const struct device_type *type; }; /** * subsys_dev_iter_init - initialize subsys device iterator * @iter: subsys iterator to initialize * @sp: the subsys private (i.e. 
bus) we wanna iterate over * @start: the device to start iterating from, if any * @type: device_type of the devices to iterate over, NULL for all * * Initialize subsys iterator @iter such that it iterates over devices * of @subsys. If @start is set, the list iteration will start there, * otherwise if it is NULL, the iteration starts at the beginning of * the list. */ static void subsys_dev_iter_init(struct subsys_dev_iter *iter, struct subsys_private *sp, struct device *start, const struct device_type *type) { struct klist_node *start_knode = NULL; if (start) start_knode = &start->p->knode_bus; klist_iter_init_node(&sp->klist_devices, &iter->ki, start_knode); iter->type = type; } /** * subsys_dev_iter_next - iterate to the next device * @iter: subsys iterator to proceed * * Proceed @iter to the next device and return it. Returns NULL if * iteration is complete. * * The returned device is referenced and won't be released till * iterator is proceed to the next device or exited. The caller is * free to do whatever it wants to do with the device including * calling back into subsys code. */ static struct device *subsys_dev_iter_next(struct subsys_dev_iter *iter) { struct klist_node *knode; struct device *dev; for (;;) { knode = klist_next(&iter->ki); if (!knode) return NULL; dev = to_device_private_bus(knode)->device; if (!iter->type || iter->type == dev->type) return dev; } } /** * subsys_dev_iter_exit - finish iteration * @iter: subsys iterator to finish * * Finish an iteration. Always call this function after iteration is * complete whether the iteration ran till the end or not. */ static void subsys_dev_iter_exit(struct subsys_dev_iter *iter) { klist_iter_exit(&iter->ki); } int subsys_interface_register(struct subsys_interface *sif) { struct subsys_private *sp; struct subsys_dev_iter iter; struct device *dev; if (!sif || !sif->subsys) return -ENODEV; sp = bus_to_subsys(sif->subsys); if (!sp) return -EINVAL; /* * Reference in sp is now incremented and will be dropped when * the interface is removed from the bus */ mutex_lock(&sp->mutex); list_add_tail(&sif->node, &sp->interfaces); if (sif->add_dev) { subsys_dev_iter_init(&iter, sp, NULL, NULL); while ((dev = subsys_dev_iter_next(&iter))) sif->add_dev(dev, sif); subsys_dev_iter_exit(&iter); } mutex_unlock(&sp->mutex); return 0; } EXPORT_SYMBOL_GPL(subsys_interface_register); void subsys_interface_unregister(struct subsys_interface *sif) { struct subsys_private *sp; struct subsys_dev_iter iter; struct device *dev; if (!sif || !sif->subsys) return; sp = bus_to_subsys(sif->subsys); if (!sp) return; mutex_lock(&sp->mutex); list_del_init(&sif->node); if (sif->remove_dev) { subsys_dev_iter_init(&iter, sp, NULL, NULL); while ((dev = subsys_dev_iter_next(&iter))) sif->remove_dev(dev, sif); subsys_dev_iter_exit(&iter); } mutex_unlock(&sp->mutex); /* * Decrement the reference count twice, once for the bus_to_subsys() * call in the start of this function, and the second one from the * reference increment in subsys_interface_register() */ subsys_put(sp); subsys_put(sp); } EXPORT_SYMBOL_GPL(subsys_interface_unregister); static void system_root_device_release(struct device *dev) { kfree(dev); } static int subsys_register(const struct bus_type *subsys, const struct attribute_group **groups, struct kobject *parent_of_root) { struct subsys_private *sp; struct device *dev; int err; err = bus_register(subsys); if (err < 0) return err; sp = bus_to_subsys(subsys); if (!sp) { err = -EINVAL; goto err_sp; } dev = kzalloc_obj(struct device); if (!dev) { err = 
-ENOMEM; goto err_dev; } err = dev_set_name(dev, "%s", subsys->name); if (err < 0) goto err_name; dev->kobj.parent = parent_of_root; dev->groups = groups; dev->release = system_root_device_release; err = device_register(dev); if (err < 0) goto err_dev_reg; sp->dev_root = dev; subsys_put(sp); return 0; err_dev_reg: put_device(dev); dev = NULL; err_name: kfree(dev); err_dev: subsys_put(sp); err_sp: bus_unregister(subsys); return err; } /** * subsys_system_register - register a subsystem at /sys/devices/system/ * @subsys: system subsystem * @groups: default attributes for the root device * * All 'system' subsystems have a /sys/devices/system/<name> root device * with the name of the subsystem. The root device can carry subsystem- * wide attributes. All registered devices are below this single root * device and are named after the subsystem with a simple enumeration * number appended. The registered devices are not explicitly named; * only 'id' in the device needs to be set. * * Do not use this interface for anything new, it exists for compatibility * with bad ideas only. New subsystems should use plain subsystems, and * subsystem-wide attributes should be added to the subsystem directory * itself rather than to a fake root device placed in * /sys/devices/system/<name>. */ int subsys_system_register(const struct bus_type *subsys, const struct attribute_group **groups) { return subsys_register(subsys, groups, &system_kset->kobj); } EXPORT_SYMBOL_GPL(subsys_system_register); /** * subsys_virtual_register - register a subsystem at /sys/devices/virtual/ * @subsys: virtual subsystem * @groups: default attributes for the root device * * All 'virtual' subsystems have a /sys/devices/virtual/<name> root device * with the name of the subsystem. The root device can carry subsystem-wide * attributes. All registered devices are below this single root device. * There's no restriction on device naming. This is for kernel software * constructs which need a sysfs interface. */ int subsys_virtual_register(const struct bus_type *subsys, const struct attribute_group **groups) { struct kobject *virtual_dir; virtual_dir = virtual_device_parent(); if (!virtual_dir) return -ENOMEM; return subsys_register(subsys, groups, virtual_dir); } EXPORT_SYMBOL_GPL(subsys_virtual_register); /** * driver_find - locate driver on a bus by its name. * @name: name of the driver. * @bus: bus to scan for the driver. * * Call kset_find_obj() to iterate over the list of drivers on * a bus to find a driver by name. Return the driver if found. * * This routine provides no locking to prevent the driver it returns * from being unregistered or unloaded while the caller is using it. * The caller is responsible for preventing this. */ struct device_driver *driver_find(const char *name, const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); struct kobject *k; struct driver_private *priv; if (!sp) return NULL; k = kset_find_obj(sp->drivers_kset, name); subsys_put(sp); if (!k) return NULL; priv = to_driver(k); /* Drop reference added by kset_find_obj() */ kobject_put(k); return priv->driver; } EXPORT_SYMBOL_GPL(driver_find); /* * Warning, the value could go to "removed" instantly after calling this function, so be very * careful when calling it...
*/ bool bus_is_registered(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); bool is_initialized = false; if (sp) { is_initialized = true; subsys_put(sp); } return is_initialized; } /** * bus_get_dev_root - return a pointer to the "device root" of a bus * @bus: bus to return the device root of. * * If a bus has a "device root" structure, return it, WITH THE REFERENCE * COUNT INCREMENTED. * * Note, when finished with the device, a call to put_device() is required. * * If the device root is not present (or bus is not a valid pointer), NULL * will be returned. */ struct device *bus_get_dev_root(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); struct device *dev_root; if (!sp) return NULL; dev_root = get_device(sp->dev_root); subsys_put(sp); return dev_root; } EXPORT_SYMBOL_GPL(bus_get_dev_root); int __init buses_init(void) { bus_kset = kset_create_and_add("bus", &bus_uevent_ops, NULL); if (!bus_kset) return -ENOMEM; system_kset = kset_create_and_add("system", NULL, &devices_kset->kobj); if (!system_kset) { /* Do error handling here as devices_init() does */ kset_unregister(bus_kset); bus_kset = NULL; pr_err("%s: failed to create and add kset 'system'\n", __func__); return -ENOMEM; } return 0; }
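The bus notifier API above (bus_register_notifier(), bus_unregister_notifier(), bus_notify()) is easiest to follow from the caller's side. Below is a minimal sketch, not part of bus.c, of how an out-of-tree consumer might listen for device add/remove events on a bus; the bus type my_bus_type and the callback/notifier_block names are illustrative assumptions only.

#include <linux/device.h>
#include <linux/notifier.h>

/* Hypothetical listener; bus_notify() passes the struct device * as data. */
static int my_bus_listener(struct notifier_block *nb, unsigned long action,
			   void *data)
{
	struct device *dev = data;

	switch (action) {
	case BUS_NOTIFY_ADD_DEVICE:
		dev_info(dev, "device added to bus\n");
		break;
	case BUS_NOTIFY_DEL_DEVICE:
		dev_info(dev, "device about to be removed\n");
		break;
	}
	return NOTIFY_OK;	/* let other notifiers on the chain run */
}

static struct notifier_block my_bus_nb = {
	.notifier_call = my_bus_listener,
};

/* In module init/exit (my_bus_type is assumed to be registered elsewhere):
 *	bus_register_notifier(&my_bus_type, &my_bus_nb);
 *	bus_unregister_notifier(&my_bus_type, &my_bus_nb);
 */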
// SPDX-License-Identifier: GPL-2.0 /* * Block stat tracking code * * Copyright (C) 2016 Jens Axboe */ #include <linux/kernel.h> #include <linux/rculist.h> #include "blk-stat.h" #include "blk-mq.h" #include "blk.h" struct blk_queue_stats { struct list_head callbacks; spinlock_t lock; int accounting; }; void blk_rq_stat_init(struct blk_rq_stat *stat) { stat->min = -1ULL; stat->max = stat->nr_samples = stat->mean = 0; stat->batch = 0; } /* src is a per-cpu stat, mean isn't initialized */ void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) { if (dst->nr_samples + src->nr_samples <= dst->nr_samples) return; dst->min = min(dst->min, src->min); dst->max = max(dst->max, src->max); dst->mean = div_u64(src->batch + dst->mean * dst->nr_samples, dst->nr_samples + src->nr_samples); dst->nr_samples += src->nr_samples; } void blk_rq_stat_add(struct blk_rq_stat *stat, u64 value) { stat->min = min(stat->min, value); stat->max = max(stat->max, value); stat->batch += value; stat->nr_samples++; } void blk_stat_add(struct request *rq, u64 now) { struct request_queue *q = rq->q; struct blk_stat_callback *cb; struct blk_rq_stat *stat; int bucket, cpu; u64 value; value = (now >= rq->io_start_time_ns) ?
now - rq->io_start_time_ns : 0; rcu_read_lock(); cpu = get_cpu(); list_for_each_entry_rcu(cb, &q->stats->callbacks, list) { if (!blk_stat_is_active(cb)) continue; bucket = cb->bucket_fn(rq); if (bucket < 0) continue; stat = &per_cpu_ptr(cb->cpu_stat, cpu)[bucket]; blk_rq_stat_add(stat, value); } put_cpu(); rcu_read_unlock(); } static void blk_stat_timer_fn(struct timer_list *t) { struct blk_stat_callback *cb = timer_container_of(cb, t, timer); unsigned int bucket; int cpu; for (bucket = 0; bucket < cb->buckets; bucket++) blk_rq_stat_init(&cb->stat[bucket]); for_each_online_cpu(cpu) { struct blk_rq_stat *cpu_stat; cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); for (bucket = 0; bucket < cb->buckets; bucket++) { blk_rq_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]); blk_rq_stat_init(&cpu_stat[bucket]); } } cb->timer_fn(cb); } struct blk_stat_callback * blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *), int (*bucket_fn)(const struct request *), unsigned int buckets, void *data) { struct blk_stat_callback *cb; cb = kmalloc_obj(*cb); if (!cb) return NULL; cb->stat = kmalloc_objs(struct blk_rq_stat, buckets); if (!cb->stat) { kfree(cb); return NULL; } cb->cpu_stat = __alloc_percpu(buckets * sizeof(struct blk_rq_stat), __alignof__(struct blk_rq_stat)); if (!cb->cpu_stat) { kfree(cb->stat); kfree(cb); return NULL; } cb->timer_fn = timer_fn; cb->bucket_fn = bucket_fn; cb->data = data; cb->buckets = buckets; timer_setup(&cb->timer, blk_stat_timer_fn, 0); return cb; } void blk_stat_add_callback(struct request_queue *q, struct blk_stat_callback *cb) { unsigned int bucket; unsigned long flags; int cpu; for_each_possible_cpu(cpu) { struct blk_rq_stat *cpu_stat; cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); for (bucket = 0; bucket < cb->buckets; bucket++) blk_rq_stat_init(&cpu_stat[bucket]); } spin_lock_irqsave(&q->stats->lock, flags); list_add_tail_rcu(&cb->list, &q->stats->callbacks); blk_queue_flag_set(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); } void blk_stat_remove_callback(struct request_queue *q, struct blk_stat_callback *cb) { unsigned long flags; spin_lock_irqsave(&q->stats->lock, flags); list_del_rcu(&cb->list); if (list_empty(&q->stats->callbacks) && !q->stats->accounting) blk_queue_flag_clear(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); timer_delete_sync(&cb->timer); } static void blk_stat_free_callback_rcu(struct rcu_head *head) { struct blk_stat_callback *cb; cb = container_of(head, struct blk_stat_callback, rcu); free_percpu(cb->cpu_stat); kfree(cb->stat); kfree(cb); } void blk_stat_free_callback(struct blk_stat_callback *cb) { if (cb) call_rcu(&cb->rcu, blk_stat_free_callback_rcu); } void blk_stat_disable_accounting(struct request_queue *q) { unsigned long flags; spin_lock_irqsave(&q->stats->lock, flags); if (!--q->stats->accounting && list_empty(&q->stats->callbacks)) blk_queue_flag_clear(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); } EXPORT_SYMBOL_GPL(blk_stat_disable_accounting); void blk_stat_enable_accounting(struct request_queue *q) { unsigned long flags; spin_lock_irqsave(&q->stats->lock, flags); if (!q->stats->accounting++ && list_empty(&q->stats->callbacks)) blk_queue_flag_set(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); } EXPORT_SYMBOL_GPL(blk_stat_enable_accounting); struct blk_queue_stats *blk_alloc_queue_stats(void) { struct blk_queue_stats *stats; stats = kmalloc_obj(*stats); if (!stats) return NULL; INIT_LIST_HEAD(&stats->callbacks); spin_lock_init(&stats->lock); 
stats->accounting = 0; return stats; } void blk_free_queue_stats(struct blk_queue_stats *stats) { if (!stats) return; WARN_ON(!list_empty(&stats->callbacks)); kfree(stats); }
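For orientation, here is a minimal sketch (not part of blk-stat.c) of how a consumer such as an I/O scheduler might use the callback API above: two buckets split reads from writes, and the timer callback reads the aggregated cb->stat[] values. The names my_buckets/my_timer/my_setup are assumptions, and the activation helper blk_stat_activate_msecs() is assumed from blk-stat.h.

#include <linux/blkdev.h>
#include "blk-stat.h"

/* Route each completed request into bucket 0 (reads) or 1 (writes). */
static int my_buckets(const struct request *rq)
{
	return op_is_write(req_op(rq)) ? 1 : 0;	/* returning < 0 skips the request */
}

/* Called from blk_stat_timer_fn() after the per-cpu stats have been summed. */
static void my_timer(struct blk_stat_callback *cb)
{
	pr_info("reads:  mean %llu ns over %u samples\n",
		cb->stat[0].mean, cb->stat[0].nr_samples);
	pr_info("writes: mean %llu ns over %u samples\n",
		cb->stat[1].mean, cb->stat[1].nr_samples);
}

static int my_setup(struct request_queue *q)
{
	struct blk_stat_callback *cb;

	cb = blk_stat_alloc_callback(my_timer, my_buckets, 2, NULL);
	if (!cb)
		return -ENOMEM;

	blk_stat_add_callback(q, cb);		/* hook into the queue's stats */
	blk_stat_activate_msecs(cb, 100);	/* sample for ~100 ms, then my_timer runs */
	/* Teardown: blk_stat_remove_callback(q, cb); blk_stat_free_callback(cb); */
	return 0;
}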
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/stat.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/blkdev.h> #include <linux/export.h> #include <linux/mm.h> #include <linux/errno.h> #include <linux/file.h> #include <linux/highuid.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/syscalls.h> #include <linux/pagemap.h> #include <linux/compat.h> #include <linux/iversion.h> #include <linux/uaccess.h> #include <asm/unistd.h> #include <trace/events/timestamp.h> #include "internal.h" #include "mount.h" /** * fill_mg_cmtime - Fill in the mtime and ctime and flag ctime as QUERIED * @stat: where to store the resulting values * @request_mask: STATX_* values requested * @inode: inode from which to grab the c/mtime * * Given @inode, grab the ctime and mtime out of it and store the result * in @stat. When fetching the value, flag it as QUERIED (if not already) * so the next write will record a distinct timestamp. * * NB: The QUERIED flag is tracked in the ctime, but we set it there even * if only the mtime was requested, as that ensures that the next mtime * change will be distinct. */ void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode) { atomic_t *pcn = (atomic_t *)&inode->i_ctime_nsec; /* If neither time was requested, then don't report them */ if (!(request_mask & (STATX_CTIME|STATX_MTIME))) { stat->result_mask &= ~(STATX_CTIME|STATX_MTIME); return; } stat->mtime = inode_get_mtime(inode); stat->ctime.tv_sec = inode->i_ctime_sec; stat->ctime.tv_nsec = (u32)atomic_read(pcn); if (!(stat->ctime.tv_nsec & I_CTIME_QUERIED)) stat->ctime.tv_nsec = ((u32)atomic_fetch_or(I_CTIME_QUERIED, pcn)); stat->ctime.tv_nsec &= ~I_CTIME_QUERIED; trace_fill_mg_cmtime(inode, &stat->ctime, &stat->mtime); } EXPORT_SYMBOL(fill_mg_cmtime); /** * generic_fillattr - Fill in the basic attributes from the inode struct * @idmap: idmap of the mount the inode was found from * @request_mask: statx request_mask * @inode: Inode to use as the source * @stat: Where to fill in the attributes * * Fill in the basic attributes in the kstat structure from data that's to be * found on the VFS inode structure. This is the default if no getattr inode * operation is supplied. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before filling in the * uid and gid fields. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap.
*/ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask, struct inode *inode, struct kstat *stat) { vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); stat->dev = inode->i_sb->s_dev; stat->ino = inode->i_ino; stat->mode = inode->i_mode; stat->nlink = inode->i_nlink; stat->uid = vfsuid_into_kuid(vfsuid); stat->gid = vfsgid_into_kgid(vfsgid); stat->rdev = inode->i_rdev; stat->size = i_size_read(inode); stat->atime = inode_get_atime(inode); if (is_mgtime(inode)) { fill_mg_cmtime(stat, request_mask, inode); } else { stat->ctime = inode_get_ctime(inode); stat->mtime = inode_get_mtime(inode); } stat->blksize = i_blocksize(inode); stat->blocks = inode->i_blocks; if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) { stat->result_mask |= STATX_CHANGE_COOKIE; stat->change_cookie = inode_query_iversion(inode); } } EXPORT_SYMBOL(generic_fillattr); /** * generic_fill_statx_attr - Fill in the statx attributes from the inode flags * @inode: Inode to use as the source * @stat: Where to fill in the attribute flags * * Fill in the STATX_ATTR_* flags in the kstat structure for properties of the * inode that are published on i_flags and enforced by the VFS. */ void generic_fill_statx_attr(struct inode *inode, struct kstat *stat) { if (inode->i_flags & S_IMMUTABLE) stat->attributes |= STATX_ATTR_IMMUTABLE; if (inode->i_flags & S_APPEND) stat->attributes |= STATX_ATTR_APPEND; stat->attributes_mask |= KSTAT_ATTR_VFS_FLAGS; } EXPORT_SYMBOL(generic_fill_statx_attr); /** * generic_fill_statx_atomic_writes - Fill in atomic writes statx attributes * @stat: Where to fill in the attribute flags * @unit_min: Minimum supported atomic write length in bytes * @unit_max: Maximum supported atomic write length in bytes * @unit_max_opt: Optimised maximum supported atomic write length in bytes * * Fill in the STATX{_ATTR}_WRITE_ATOMIC flags in the kstat structure from * atomic write unit_min and unit_max values. */ void generic_fill_statx_atomic_writes(struct kstat *stat, unsigned int unit_min, unsigned int unit_max, unsigned int unit_max_opt) { /* Confirm that the request type is known */ stat->result_mask |= STATX_WRITE_ATOMIC; /* Confirm that the file attribute type is known */ stat->attributes_mask |= STATX_ATTR_WRITE_ATOMIC; if (unit_min) { stat->atomic_write_unit_min = unit_min; stat->atomic_write_unit_max = unit_max; stat->atomic_write_unit_max_opt = unit_max_opt; /* Initially only allow 1x segment */ stat->atomic_write_segments_max = 1; /* Confirm atomic writes are actually supported */ stat->attributes |= STATX_ATTR_WRITE_ATOMIC; } } EXPORT_SYMBOL_GPL(generic_fill_statx_atomic_writes); /** * vfs_getattr_nosec - getattr without security checks * @path: file to get attributes from * @stat: structure to return attributes in * @request_mask: STATX_xxx flags indicating what the caller wants * @query_flags: Query mode (AT_STATX_SYNC_TYPE) * * Get attributes without calling security_inode_getattr. * * Currently the only caller other than vfs_getattr is internal to the * filehandle lookup code, which uses only the inode number and returns no * attributes to any user. Any other code probably wants vfs_getattr. 
*/ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct mnt_idmap *idmap; struct inode *inode = d_backing_inode(path->dentry); memset(stat, 0, sizeof(*stat)); stat->result_mask |= STATX_BASIC_STATS; query_flags &= AT_STATX_SYNC_TYPE; /* allow the fs to override these if it really wants to */ /* SB_NOATIME means filesystem supplies dummy atime value */ if (inode->i_sb->s_flags & SB_NOATIME) stat->result_mask &= ~STATX_ATIME; /* * Note: If you add another clause to set an attribute flag, please * update attributes_mask below. */ if (IS_AUTOMOUNT(inode)) stat->attributes |= STATX_ATTR_AUTOMOUNT; if (IS_DAX(inode)) stat->attributes |= STATX_ATTR_DAX; stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT | STATX_ATTR_DAX); idmap = mnt_idmap(path->mnt); if (inode->i_op->getattr) { int ret; ret = inode->i_op->getattr(idmap, path, stat, request_mask, query_flags); if (ret) return ret; } else { generic_fillattr(idmap, request_mask, inode, stat); } /* * If this is a block device inode, override the filesystem attributes * with the block device specific parameters that need to be obtained * from the bdev backing inode. */ if (S_ISBLK(stat->mode)) bdev_statx(path, stat, request_mask); return 0; } EXPORT_SYMBOL(vfs_getattr_nosec); /* * vfs_getattr - Get the enhanced basic attributes of a file * @path: The file of interest * @stat: Where to return the statistics * @request_mask: STATX_xxx flags indicating what the caller wants * @query_flags: Query mode (AT_STATX_SYNC_TYPE) * * Ask the filesystem for a file's attributes. The caller must indicate in * request_mask and query_flags to indicate what they want. * * If the file is remote, the filesystem can be forced to update the attributes * from the backing store by passing AT_STATX_FORCE_SYNC in query_flags or can * suppress the update by passing AT_STATX_DONT_SYNC. * * Bits must have been set in request_mask to indicate which attributes the * caller wants retrieving. Any such attribute not requested may be returned * anyway, but the value may be approximate, and, if remote, may not have been * synchronised with the server. * * 0 will be returned on success, and a -ve error code if unsuccessful. */ int vfs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { int retval; retval = security_inode_getattr(path); if (unlikely(retval)) return retval; return vfs_getattr_nosec(path, stat, request_mask, query_flags); } EXPORT_SYMBOL(vfs_getattr); /** * vfs_fstat - Get the basic attributes by file descriptor * @fd: The file descriptor referring to the file of interest * @stat: The result structure to fill in. * * This function is a wrapper around vfs_getattr(). The main difference is * that it uses a file descriptor to determine the file location. * * 0 will be returned on success, and a -ve error code if unsuccessful. 
*/ int vfs_fstat(int fd, struct kstat *stat) { CLASS(fd_raw, f)(fd); if (fd_empty(f)) return -EBADF; return vfs_getattr(&fd_file(f)->f_path, stat, STATX_BASIC_STATS, 0); } static int statx_lookup_flags(int flags) { int lookup_flags = 0; if (!(flags & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; if (!(flags & AT_NO_AUTOMOUNT)) lookup_flags |= LOOKUP_AUTOMOUNT; return lookup_flags; } static int vfs_statx_path(const struct path *path, int flags, struct kstat *stat, u32 request_mask) { int error = vfs_getattr(path, stat, request_mask, flags); if (error) return error; if (request_mask & STATX_MNT_ID_UNIQUE) { stat->mnt_id = real_mount(path->mnt)->mnt_id_unique; stat->result_mask |= STATX_MNT_ID_UNIQUE; } else { stat->mnt_id = real_mount(path->mnt)->mnt_id; stat->result_mask |= STATX_MNT_ID; } if (path_mounted(path)) stat->attributes |= STATX_ATTR_MOUNT_ROOT; stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT; return 0; } static int vfs_statx_fd(int fd, int flags, struct kstat *stat, u32 request_mask) { CLASS(fd_raw, f)(fd); if (fd_empty(f)) return -EBADF; return vfs_statx_path(&fd_file(f)->f_path, flags, stat, request_mask); } /** * vfs_statx - Get basic and extra attributes by filename * @dfd: A file descriptor representing the base dir for a relative filename * @filename: The name of the file of interest * @flags: Flags to control the query * @stat: The result structure to fill in. * @request_mask: STATX_xxx flags indicating what the caller wants * * This function is a wrapper around vfs_getattr(). The main difference is * that it uses a filename and base directory to determine the file location. * Additionally, the use of AT_SYMLINK_NOFOLLOW in flags will prevent a symlink * at the given name from being referenced. * * 0 will be returned on success, and a -ve error code if unsuccessful. */ static int vfs_statx(int dfd, struct filename *filename, int flags, struct kstat *stat, u32 request_mask) { struct path path; unsigned int lookup_flags = statx_lookup_flags(flags); int error; if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | AT_EMPTY_PATH | AT_STATX_SYNC_TYPE)) return -EINVAL; retry: error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) return error; error = vfs_statx_path(&path, flags, stat, request_mask); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int flags) { CLASS(filename_maybe_null, name)(filename, flags); if (!name && dfd >= 0) return vfs_fstat(dfd, stat); return vfs_statx(dfd, name, flags | AT_NO_AUTOMOUNT, stat, STATX_BASIC_STATS); } #ifdef __ARCH_WANT_OLD_STAT /* * For backward compatibility? Maybe this should be moved * into arch/i386 instead? */ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * statbuf) { static int warncount = 5; struct __old_kernel_stat tmp; if (warncount > 0) { warncount--; printk(KERN_WARNING "VFS: Warning: %s using old stat() call. Recompile your binary.\n", current->comm); } else if (warncount < 0) { /* it's laughable, but... 
*/ warncount = 0; } memset(&tmp, 0, sizeof(struct __old_kernel_stat)); tmp.st_dev = old_encode_dev(stat->dev); tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; tmp.st_mode = stat->mode; tmp.st_nlink = stat->nlink; if (tmp.st_nlink != stat->nlink) return -EOVERFLOW; SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); tmp.st_rdev = old_encode_dev(stat->rdev); #if BITS_PER_LONG == 32 if (stat->size > MAX_NON_LFS) return -EOVERFLOW; #endif tmp.st_size = stat->size; tmp.st_atime = stat->atime.tv_sec; tmp.st_mtime = stat->mtime.tv_sec; tmp.st_ctime = stat->ctime.tv_sec; return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; } SYSCALL_DEFINE2(stat, const char __user *, filename, struct __old_kernel_stat __user *, statbuf) { struct kstat stat; int error; error = vfs_stat(filename, &stat); if (unlikely(error)) return error; return cp_old_stat(&stat, statbuf); } SYSCALL_DEFINE2(lstat, const char __user *, filename, struct __old_kernel_stat __user *, statbuf) { struct kstat stat; int error; error = vfs_lstat(filename, &stat); if (unlikely(error)) return error; return cp_old_stat(&stat, statbuf); } SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf) { struct kstat stat; int error; error = vfs_fstat(fd, &stat); if (unlikely(error)) return error; return cp_old_stat(&stat, statbuf); } #endif /* __ARCH_WANT_OLD_STAT */ #ifdef __ARCH_WANT_NEW_STAT #ifndef INIT_STRUCT_STAT_PADDING # define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st)) #endif static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) { struct stat tmp; if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev)) return -EOVERFLOW; if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev)) return -EOVERFLOW; #if BITS_PER_LONG == 32 if (stat->size > MAX_NON_LFS) return -EOVERFLOW; #endif INIT_STRUCT_STAT_PADDING(tmp); tmp.st_dev = new_encode_dev(stat->dev); tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; tmp.st_mode = stat->mode; tmp.st_nlink = stat->nlink; if (tmp.st_nlink != stat->nlink) return -EOVERFLOW; SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); tmp.st_rdev = new_encode_dev(stat->rdev); tmp.st_size = stat->size; tmp.st_atime = stat->atime.tv_sec; tmp.st_mtime = stat->mtime.tv_sec; tmp.st_ctime = stat->ctime.tv_sec; #ifdef STAT_HAVE_NSEC tmp.st_atime_nsec = stat->atime.tv_nsec; tmp.st_mtime_nsec = stat->mtime.tv_nsec; tmp.st_ctime_nsec = stat->ctime.tv_nsec; #endif tmp.st_blocks = stat->blocks; tmp.st_blksize = stat->blksize; return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? 
-EFAULT : 0; } SYSCALL_DEFINE2(newstat, const char __user *, filename, struct stat __user *, statbuf) { struct kstat stat; int error; error = vfs_stat(filename, &stat); if (unlikely(error)) return error; return cp_new_stat(&stat, statbuf); } SYSCALL_DEFINE2(newlstat, const char __user *, filename, struct stat __user *, statbuf) { struct kstat stat; int error; error = vfs_lstat(filename, &stat); if (unlikely(error)) return error; return cp_new_stat(&stat, statbuf); } #if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT) SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename, struct stat __user *, statbuf, int, flag) { struct kstat stat; int error; error = vfs_fstatat(dfd, filename, &stat, flag); if (unlikely(error)) return error; return cp_new_stat(&stat, statbuf); } #endif SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf) { struct kstat stat; int error; error = vfs_fstat(fd, &stat); if (unlikely(error)) return error; return cp_new_stat(&stat, statbuf); } #endif static int do_readlinkat(int dfd, const char __user *pathname, char __user *buf, int bufsiz) { struct path path; int error; unsigned int lookup_flags = 0; if (bufsiz <= 0) return -EINVAL; CLASS(filename_flags, name)(pathname, LOOKUP_EMPTY); retry: error = filename_lookup(dfd, name, lookup_flags, &path, NULL); if (unlikely(error)) return error; /* * AFS mountpoints allow readlink(2) but are not symlinks */ if (d_is_symlink(path.dentry) || d_backing_inode(path.dentry)->i_op->readlink) { error = security_inode_readlink(path.dentry); if (!error) { touch_atime(&path); error = vfs_readlink(path.dentry, buf, bufsiz); } } else { error = (name->name[0] == '\0') ? -ENOENT : -EINVAL; } path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname, char __user *, buf, int, bufsiz) { return do_readlinkat(dfd, pathname, buf, bufsiz); } SYSCALL_DEFINE3(readlink, const char __user *, path, char __user *, buf, int, bufsiz) { return do_readlinkat(AT_FDCWD, path, buf, bufsiz); } /* ---------- LFS-64 ----------- */ #if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64) #ifndef INIT_STRUCT_STAT64_PADDING # define INIT_STRUCT_STAT64_PADDING(st) memset(&st, 0, sizeof(st)) #endif static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf) { struct stat64 tmp; INIT_STRUCT_STAT64_PADDING(tmp); #ifdef CONFIG_MIPS /* mips has weird padding, so we don't get 64 bits there */ tmp.st_dev = new_encode_dev(stat->dev); tmp.st_rdev = new_encode_dev(stat->rdev); #else tmp.st_dev = huge_encode_dev(stat->dev); tmp.st_rdev = huge_encode_dev(stat->rdev); #endif tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; #ifdef STAT64_HAS_BROKEN_ST_INO tmp.__st_ino = stat->ino; #endif tmp.st_mode = stat->mode; tmp.st_nlink = stat->nlink; tmp.st_uid = from_kuid_munged(current_user_ns(), stat->uid); tmp.st_gid = from_kgid_munged(current_user_ns(), stat->gid); tmp.st_atime = stat->atime.tv_sec; tmp.st_atime_nsec = stat->atime.tv_nsec; tmp.st_mtime = stat->mtime.tv_sec; tmp.st_mtime_nsec = stat->mtime.tv_nsec; tmp.st_ctime = stat->ctime.tv_sec; tmp.st_ctime_nsec = stat->ctime.tv_nsec; tmp.st_size = stat->size; tmp.st_blocks = stat->blocks; tmp.st_blksize = stat->blksize; return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? 
-EFAULT : 0; } SYSCALL_DEFINE2(stat64, const char __user *, filename, struct stat64 __user *, statbuf) { struct kstat stat; int error = vfs_stat(filename, &stat); if (!error) error = cp_new_stat64(&stat, statbuf); return error; } SYSCALL_DEFINE2(lstat64, const char __user *, filename, struct stat64 __user *, statbuf) { struct kstat stat; int error = vfs_lstat(filename, &stat); if (!error) error = cp_new_stat64(&stat, statbuf); return error; } SYSCALL_DEFINE2(fstat64, unsigned long, fd, struct stat64 __user *, statbuf) { struct kstat stat; int error = vfs_fstat(fd, &stat); if (!error) error = cp_new_stat64(&stat, statbuf); return error; } SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename, struct stat64 __user *, statbuf, int, flag) { struct kstat stat; int error; error = vfs_fstatat(dfd, filename, &stat, flag); if (error) return error; return cp_new_stat64(&stat, statbuf); } #endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */ static noinline_for_stack int cp_statx(const struct kstat *stat, struct statx __user *buffer) { struct statx tmp; memset(&tmp, 0, sizeof(tmp)); /* STATX_CHANGE_COOKIE is kernel-only for now */ tmp.stx_mask = stat->result_mask & ~STATX_CHANGE_COOKIE; tmp.stx_blksize = stat->blksize; /* STATX_ATTR_CHANGE_MONOTONIC is kernel-only for now */ tmp.stx_attributes = stat->attributes & ~STATX_ATTR_CHANGE_MONOTONIC; tmp.stx_nlink = stat->nlink; tmp.stx_uid = from_kuid_munged(current_user_ns(), stat->uid); tmp.stx_gid = from_kgid_munged(current_user_ns(), stat->gid); tmp.stx_mode = stat->mode; tmp.stx_ino = stat->ino; tmp.stx_size = stat->size; tmp.stx_blocks = stat->blocks; tmp.stx_attributes_mask = stat->attributes_mask; tmp.stx_atime.tv_sec = stat->atime.tv_sec; tmp.stx_atime.tv_nsec = stat->atime.tv_nsec; tmp.stx_btime.tv_sec = stat->btime.tv_sec; tmp.stx_btime.tv_nsec = stat->btime.tv_nsec; tmp.stx_ctime.tv_sec = stat->ctime.tv_sec; tmp.stx_ctime.tv_nsec = stat->ctime.tv_nsec; tmp.stx_mtime.tv_sec = stat->mtime.tv_sec; tmp.stx_mtime.tv_nsec = stat->mtime.tv_nsec; tmp.stx_rdev_major = MAJOR(stat->rdev); tmp.stx_rdev_minor = MINOR(stat->rdev); tmp.stx_dev_major = MAJOR(stat->dev); tmp.stx_dev_minor = MINOR(stat->dev); tmp.stx_mnt_id = stat->mnt_id; tmp.stx_dio_mem_align = stat->dio_mem_align; tmp.stx_dio_offset_align = stat->dio_offset_align; tmp.stx_dio_read_offset_align = stat->dio_read_offset_align; tmp.stx_subvol = stat->subvol; tmp.stx_atomic_write_unit_min = stat->atomic_write_unit_min; tmp.stx_atomic_write_unit_max = stat->atomic_write_unit_max; tmp.stx_atomic_write_segments_max = stat->atomic_write_segments_max; tmp.stx_atomic_write_unit_max_opt = stat->atomic_write_unit_max_opt; return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0; } int do_statx(int dfd, struct filename *filename, unsigned int flags, unsigned int mask, struct statx __user *buffer) { struct kstat stat; int error; if (mask & STATX__RESERVED) return -EINVAL; if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE) return -EINVAL; /* * STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests * from userland. */ mask &= ~STATX_CHANGE_COOKIE; error = vfs_statx(dfd, filename, flags, &stat, mask); if (error) return error; return cp_statx(&stat, buffer); } int do_statx_fd(int fd, unsigned int flags, unsigned int mask, struct statx __user *buffer) { struct kstat stat; int error; if (mask & STATX__RESERVED) return -EINVAL; if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE) return -EINVAL; /* * STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests * from userland. 
*/ mask &= ~STATX_CHANGE_COOKIE; error = vfs_statx_fd(fd, flags, &stat, mask); if (error) return error; return cp_statx(&stat, buffer); } /** * sys_statx - System call to get enhanced stats * @dfd: Base directory to pathwalk from *or* fd to stat. * @filename: File to stat or either NULL or "" with AT_EMPTY_PATH * @flags: AT_* flags to control pathwalk. * @mask: Parts of statx struct actually required. * @buffer: Result buffer. * * Note that fstat() can be emulated by setting dfd to the fd of interest, * supplying "" (or preferably NULL) as the filename and setting AT_EMPTY_PATH * in the flags. */ SYSCALL_DEFINE5(statx, int, dfd, const char __user *, filename, unsigned, flags, unsigned int, mask, struct statx __user *, buffer) { CLASS(filename_maybe_null, name)(filename, flags); if (!name && dfd >= 0) return do_statx_fd(dfd, flags & ~AT_NO_AUTOMOUNT, mask, buffer); return do_statx(dfd, name, flags, mask, buffer); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_STAT) static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) { struct compat_stat tmp; if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev)) return -EOVERFLOW; if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev)) return -EOVERFLOW; memset(&tmp, 0, sizeof(tmp)); tmp.st_dev = new_encode_dev(stat->dev); tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; tmp.st_mode = stat->mode; tmp.st_nlink = stat->nlink; if (tmp.st_nlink != stat->nlink) return -EOVERFLOW; SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); tmp.st_rdev = new_encode_dev(stat->rdev); if ((u64) stat->size > MAX_NON_LFS) return -EOVERFLOW; tmp.st_size = stat->size; tmp.st_atime = stat->atime.tv_sec; tmp.st_atime_nsec = stat->atime.tv_nsec; tmp.st_mtime = stat->mtime.tv_sec; tmp.st_mtime_nsec = stat->mtime.tv_nsec; tmp.st_ctime = stat->ctime.tv_sec; tmp.st_ctime_nsec = stat->ctime.tv_nsec; tmp.st_blocks = stat->blocks; tmp.st_blksize = stat->blksize; return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0; } COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename, struct compat_stat __user *, statbuf) { struct kstat stat; int error; error = vfs_stat(filename, &stat); if (error) return error; return cp_compat_stat(&stat, statbuf); } COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename, struct compat_stat __user *, statbuf) { struct kstat stat; int error; error = vfs_lstat(filename, &stat); if (error) return error; return cp_compat_stat(&stat, statbuf); } #ifndef __ARCH_WANT_STAT64 COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd, const char __user *, filename, struct compat_stat __user *, statbuf, int, flag) { struct kstat stat; int error; error = vfs_fstatat(dfd, filename, &stat, flag); if (error) return error; return cp_compat_stat(&stat, statbuf); } #endif COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct compat_stat __user *, statbuf) { struct kstat stat; int error = vfs_fstat(fd, &stat); if (!error) error = cp_compat_stat(&stat, statbuf); return error; } #endif /* Caller is here responsible for sufficient locking (ie. 
inode->i_lock) */ void __inode_add_bytes(struct inode *inode, loff_t bytes) { inode->i_blocks += bytes >> 9; bytes &= 511; inode->i_bytes += bytes; if (inode->i_bytes >= 512) { inode->i_blocks++; inode->i_bytes -= 512; } } EXPORT_SYMBOL(__inode_add_bytes); void inode_add_bytes(struct inode *inode, loff_t bytes) { spin_lock(&inode->i_lock); __inode_add_bytes(inode, bytes); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(inode_add_bytes); void __inode_sub_bytes(struct inode *inode, loff_t bytes) { inode->i_blocks -= bytes >> 9; bytes &= 511; if (inode->i_bytes < bytes) { inode->i_blocks--; inode->i_bytes += 512; } inode->i_bytes -= bytes; } EXPORT_SYMBOL(__inode_sub_bytes); void inode_sub_bytes(struct inode *inode, loff_t bytes) { spin_lock(&inode->i_lock); __inode_sub_bytes(inode, bytes); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(inode_sub_bytes); loff_t inode_get_bytes(struct inode *inode) { loff_t ret; spin_lock(&inode->i_lock); ret = __inode_get_bytes(inode); spin_unlock(&inode->i_lock); return ret; } EXPORT_SYMBOL(inode_get_bytes); void inode_set_bytes(struct inode *inode, loff_t bytes) { /* Caller is here responsible for sufficient locking * (ie. inode->i_lock) */ inode->i_blocks = bytes >> 9; inode->i_bytes = bytes & 511; } EXPORT_SYMBOL(inode_set_bytes);
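The sys_statx() kernel-doc above notes that fstat() can be emulated by passing the fd of interest together with an empty (or, preferably, NULL) path and AT_EMPTY_PATH. A small userspace sketch of that pattern follows; it is illustrative only, not part of the kernel sources, and assumes a glibc new enough (2.28+) to provide the statx() wrapper.

#define _GNU_SOURCE
#include <fcntl.h>	/* AT_EMPTY_PATH */
#include <stdio.h>
#include <sys/stat.h>	/* statx(), struct statx, STATX_BASIC_STATS */
#include <unistd.h>

int main(void)
{
	struct statx stx;

	/* Equivalent of fstat(STDIN_FILENO, ...) via statx(2). */
	if (statx(STDIN_FILENO, "", AT_EMPTY_PATH, STATX_BASIC_STATS, &stx) != 0) {
		perror("statx");
		return 1;
	}
	printf("size=%llu mode=%o\n",
	       (unsigned long long)stx.stx_size, (unsigned int)stx.stx_mode);
	return 0;
}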
mount(&(0x7f0000000140)=@nullb, &(0x7f0000000040)='./cgroup\x00', &(0x7f0000000080)='xfs\x00', 0x2208004, 0x0)
sigaltstack(&(0x7f0000000600)={0x0, 0x3}, 0x0)
bpf$BPF_BTF_LOAD(0x12, &(0x7f0000000040)={&(0x7f0000001100)={{0xeb9f, 0x1, 0x0, 0x18, 0x0, 0x20, 0x20, 0x6, [@func_proto={0x0, 0x1, 0x0, 0xd, 0x0, [{0x2, 0x2}]}, @ptr]}, {0x0, [0x0, 0x61, 0x5f, 0x5e]}}, &(0x7f0000000100)=""/4084, 0x3e, 0xff4, 0x1}, 0x28)
syz_mount_image$ext4(&(0x7f0000000140)='ext4\x00', &(0x7f0000000000)='./bus\x00', 0xe, &(0x7f0000000580)={[{@errors_remount}, {@resuid={'resuid', 0x3d, 0xee01}}, {@debug_want_extra_isize={'debug_want_extra_isize', 0x3d, 0x2e}}, {@nombcache}, {@quota}, {@quota}]}, 0xff, 0x44b, &(0x7f0000000dc0)="$eJzs3MtvG8UfAPDvrpP8+vwllPLoAwiUR8UjadICPXABgcQBJCQ4lGNI0qo0bVATJFpVEBAqR1SJO+KIxF/AiV4QcELiCgduqFKFemnhZLT2buMY202MnYX685HWmdkda+ab3bFndrIJYGCNZy9JxI6I+DkiRuvZtQXG6z9uXLsw+8e1C7NJVKuv/57Uyl2/dmG2KFq8b3ueOZhGpB8nsa9FvUvnzp+aWViYP5vnJ5dPvzO5dO78UydPz5yYPzF/Zvro0SOHp559ZvrpnsSZxXV97/uL+/e8/OalV2ePXXrr+6+SIv6mOHpkvNPBR6rVHldXrp0N6WSoxIawIZWIyE7XcK3/j0YlVk/eaLz0UamNA/qqmmtzeKUK3MaSKLsFQDmKL/ps/ltsmzf6KN/V5+sToCzuG/lWPzIUaV5muGl+20vjEXFs5c/Psy36cx8CAGCNb7Lxz5Otxn9p3N1Q7v9pfQ1lLCLuiIhdEXFnROyOiLsiamXviYh7N1h/8yLJ38c/6ZWuAlunbPz3XL62tXb8V4z+YqyS53bW4h9Ojp9cmD9UP7aSvWT5qQ51XH7xp0/bHWsc/2VbVn8xFszbcWXof2vfMzezPNNtvM2ufhixd6hV/MnNlYAkIvZExN4u6zj5+Jf72x27dfwd9GCdqfpFxGP1878STfEXks7rk5NbYmH+0GTDVdHkhx8vvtau/n8Ufw9k539by+v/ZvxjSeN67dLG67j4yydt5zTdXv8jyRu19Ei+772Z5eWzUxEjySv1Rjfun159b5EvymfxHzzQuv/vitXfxL6IyC7i+yLi/oh4IG/7gxHxUEQc6BD/dy88/Hb38fdXFv/chs7/amIkmve0TlROffv1mkrHNhJ/dv6P1FIH8z21z7+kc1zraVd3VzMAAAD896QRsSOSdOJmOk0nJup/w787tqULi0vLTxxffPfMXP0ZgbEYTos7XaMN90On8ml9kZ9uyh/O7xt/Vtlay0/MLi7MlR08DLjtbfp/5rdK2a0D+s7zWjC42vb/LZvbDmDz+f6HwaX/w+Bq0f+3ltEOYPO1+v7/ICIuP1pCY4BN1dT/LfvBADH/h8Gl/8PgWkf///X2+nfVQEQsbY1bPyTf30QlyqxdostEpP+KZkj0KVH2JxMAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEBv/BUAAP//m5Hs/w==")
futex(&(0x7f0000000000), 0x109, 0x0, &(0x7f0000000080)={0x0, 0x989680}, 0x0, 0x2)
flock(0xffffffffffffffff, 0x6)
bpf$BPF_BTF_LOAD(0x12, &(0x7f00000000c0)={&(0x7f0000000500)={{0xeb9f, 0x1, 0x0, 0x18, 0x0, 0xc, 0xc, 0x2, [@enum64={0x1, 0x0, 0x0, 0x13, 0x1, 0x5}]}}, 0x0, 0x26, 0x0, 0x6, 0x4000}, 0x28)
truncate(0x0, 0x20fffbfffc)
syz_emit_ethernet(0x2b, &(0x7f0000000780)={@broadcast, @link_local={0x1, 0x80, 0xc2, 0x0, 0x0, 0x3}, @val={@void, {0x8100, 0x7}}, {@arp={0x806, @generic={0x3, 0x88a8, 0x6, 0x0, 0x2, @link_local={0x1, 0x80, 0xc2, 0x0, 0x0, 0x1}, "", @link_local, "3777e5bb80"}}}}, 0x0)
bpf$BPF_BTF_LOAD(0x12, &(0x7f0000000000)={&(0x7f00000000c0)={{0xeb9f, 0x1, 0x0, 0x18, 0x0, 0x20, 0x20, 0x2, [@func_proto={0x0, 0x1, 0x0, 0xd, 0x2, [{0xc}]}, @ptr={0x0, 0x0, 0x0, 0x2, 0x2}]}}, &(0x7f0000000200)=""/224, 0x3a, 0xe0, 0x1}, 0x28)
bpf$PROG_LOAD(0x5, &(0x7f0000000500)={0xe, 0x4, &(0x7f0000000700)=ANY=[@ANYBLOB="1800000008000000000000000800000071116a000000000095"], &(0x7f0000000c40)='GPL\x00', 0x0, 0x0, 0x0, 0x0, 0x0, '\x00', 0x0, @fallback=0x1b, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xfffffffd}, 0x94)
bpf$PROG_LOAD(0x5, &(0x7f000000e000)={0x10, 0x4, &(0x7f0000000040)=ANY=[@ANYBLOB="b4020000000000007911000000000000850000006b0000009500000000000000359bb9f4ffffb136000440c89844f3cec063027b45b1bf3fa2cef4363021028e2b63040fcfd3a9b59fed6aa4172c30b77f38d6daaed700"], &(0x7f0000000100)='syzkaller\x00', 0x4, 0xc5, &(0x7f0000000300)=""/197, 0x0, 0x0, '\x00', 0x0, @sk_msg, 0xffffffffffffffff, 0x8, &(0x7f0000000000), 0x8, 0x10, &(0x7f00000002c0), 0xffffffffffffff35}, 0x54)
bpf$BPF_LINK_CREATE(0x8, &(0x7f0000002300)={0xffffffffffffffff, 0xffffffffffffffff, 0xb, 0x0, @void}, 0x10)
mount(&(0x7f0000000000)=@nullb, &(0x7f00000000c0)='.\x00', &(0x7f0000000100)='cramfs\x00', 0x800, 0x0)
syz_emit_ethernet(0x5a, &(0x7f0000000100)={@local, @dev={'\xaa\xaa\xaa\xaa\xaa', 0xc}, @void, {@ipv4={0x800, @tcp={{0xe, 0x4, 0x2, 0x2e, 0x4c, 0x65, 0x0, 0x2, 0x6, 0x0, @private=0xa010105, @remote, {[@timestamp_prespec={0x44, 0x24, 0x15, 0x3, 0xb, [{@multicast1}, {@dev={0xac, 0x14, 0x14, 0x37}, 0x3}, {@broadcast, 0x1000}, {@broadcast, 0x7}]}]}}, {{0x4e27, 0x4e23, 0x41424344, 0x41424344, 0x1, 0x0, 0x5, 0xc2, 0xfffd, 0x0, 0x57}}}}}}, 0x0)
bpf$BPF_PROG_WITH_BTFID_LOAD(0x5, &(0x7f0000000300)=@bpf_lsm={0x4, 0x4, &(0x7f0000000040)=ANY=[@ANYBLOB="660a0000060000007b115f0000000000850000001e00000095"], &(0x7f0000000000)='GPL\x00'}, 0x94)
bpf$BPF_PROG_ATTACH(0x8, &(0x7f0000000140)={@fallback, 0xffffffffffffffff, 0x18}, 0xa)
socket(0x15, 0x803, 0x2)
syz_emit_ethernet(0x80, &(0x7f0000000180)=ANY=[@ANYBLOB="bb6233c1eb870180c200000086dd6410f746004a2f0100000000000000000000ffffffffffffff0200000000000000000000000000010401"], 0x0)
bpf$PROG_LOAD(0x5, &(0x7f0000000080)={0x20, 0x4, &(0x7f0000000000)=ANY=[@ANYBLOB="1800000000000000000000000000000079100b000000000095"], &(0x7f0000000200)='syzkaller\x00', 0x0, 0x0, 0x0, 0x0, 0x0, '\x00', 0x0, @netfilter=0x2d, 0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, 0x94)
bpf$BPF_BTF_LOAD(0x12, &(0x7f0000000080)={&(0x7f0000000180)={{0xeb9f, 0x1, 0x0, 0x18, 0x0, 0x20, 0x20, 0x5, [@struct={0x3, 0x0, 0x0, 0x4, 0x1, 0x2}, @func_proto={0x0, 0x1, 0x0, 0xd, 0x0, [{0x2, 0x1}]}]}, {0x0, [0x61, 0x5f, 0x5f]}}, &(0x7f0000002cc0)=""/4085, 0x3d, 0xff5, 0x1}, 0x28)
bpf$MAP_CREATE(0x0, &(0x7f00000008c0)=@base={0x4, 0x4, 0x4, 0xf4e, 0x0, 0x1, 0x5, '\x00', 0x0, 0xffffffffffffffff, 0x3, 0xfffffffd}, 0x50)
syz_mount_image$hfs(&(0x7f0000000180), &(0x7f0000000000)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38/\x00', 0x4004, &(0x7f00000000c0)={[{@part={'part', 0x3d, 0x60d2}}, {@umask={'umask', 0x3d, 0x7ff}}, {@codepage={'codepage', 0x3d, 'cp950'}}, {@iocharset={'iocharset', 0x3d, 'iso8859-2'}}, {@file_umask={'file_umask', 0x3d, 0x6}}]}, 0x1, 0x347, &(0x7f0000000480)="$eJzs3ctLVFEcB/DvufNwphG7qSG0CktoJWqLokVJWC36B1qEZDqCeDMogxIiax3RLghatmsd9S/UJvoHihZC0ao24qIb53Ff4zkz13F0lPl+oHncex6/c+85956j2AUR9azLM9/ent2Q/0QJQAHABcADUAGKAI5jpPJgZbVlQYX4k4DOKbalmV+p27JWYHIYvvxWRH96G+2NMAzD7+7dPy6qtz/7FxB1hUiP4BQP6IOnRqLaX9n3yPbGumxXr0mdYbGJTTzEQDfDISKi7jP3f8/cJfrN/N3zgDEzDz/s9//M/Gaze3EcCPH939PfQyGPz1G1S673llaD+oJewsmz70WrRFtZ1j6xFWoAytA9q1BLTblStdipWLzq4lJQH19XBTzDJSOVbFi9LiBqiEwKV7Rl/TZqWZs20aztLleuydeaakNJtmHKEf9QsxqtC+CPv/DKXuns5xyRiU/ii5gVPl5jIZ7/FUMhD446Pn7DUNHxT7hLVK30dapMK5Pwj6lKTkRn4MO7pJVV13GtoCBjsZGliMb5ux/F+bLszoVBZH+soFs36W6dyjUEFIXpXEmuqTjRljXXcGNd1cVSUB+fvxu4On1nWVd04oW4IUbxG+8xk8z/9egZg3tkZka5UClNz2janqJK6TiPGSqEO9tHpmudVspRZm+4aj3RDX5mvz7HbZzHwP1Ha8tzQVC/1/0P0VBpM/vJDsejO6LpjnKLfE+lQTU6lB2r9F8YhtZdReQsRw7StsPQA+rcm6TJa8tzwlzzdtdAeeVs2DXtTgxgGoDZEl0R2qn9SZyrzxTo58z+V55ttcXeIaOo9mGARFVldhXQl2ukVNuo9Prj5bmgnQsRHTbJScfIzXiCLPFHgz1CzruEXv+l1isT6qojX/wm65+wVeGpEicdK6BB9XokXsEpLdZcNfcKrhZ9aLHmOnUGOF3O1ughqvFpY7G+iRMH8beSO/9VhpjBV9ziICciIiIiIiIiIiIiIiIiIiIiIiIiOmx2+tcI7fw5QbbGjR78jzeIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiHYnef5v/HyXsu35v82e1KQU9BNiKp14/m8hx/N/xXq+BhKR0/8AAAD//xI0ZUw=")
mount$cgroup2(0x0, &(0x7f0000000000)='./cgroup\x00', &(0x7f0000000040), 0x800, &(0x7f00000000c0)={[{@pids_localevents}]})
mount$tmpfs(0x0, &(0x7f0000004f80)='.\x00', &(0x7f0000004fc0), 0x4481, &(0x7f0000000440)=ANY=[@ANYBLOB='uid=', @ANYRESHEX=0x0, @ANYBLOB=',mpol=default=static:,140'])
socket$kcm(0x29, 0x7, 0x0)
bpf$BPF_PROG_RAW_TRACEPOINT_LOAD(0x5, &(0x7f00000001c0)={0x2, 0x2, &(0x7f0000000380)=ANY=[@ANYBLOB="85000000ae00000095"], &(0x7f00000000c0)='syzkaller\x00', 0x0, 0x0, 0x0, 0x0, 0x0, '\x00', 0x0, 0x2a, 0xffffffffffffffff, 0x8, 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x10, 0x9}, 0x94)
bpf$MAP_CREATE(0x0, &(0x7f0000000180)=@base={0x11, 0x4, 0x4, 0x0, 0x80}, 0x50)
syz_mount_image$ext4(&(0x7f0000000040)='ext4\x00', &(0x7f0000000200)='./file1\x00', 0x200000, &(0x7f0000000000)={[{@journal_async_commit}]}, 0x0, 0x57c, &(0x7f0000000380)="$eJzs3d9rW1UcAPDvTdOtW6frYAz1QQp7sDKXrq0/JghOEEF0OND3GdqsjKbLaNKx1sG2B/fiiwxBxIH4B/ju4/Af8K8Y6GDIKPrgS+QmN11sk/5aa+ry+cDdzrn3Zud8c+45OfeehAXQt0bTP3IRL0bE10nE0bZj+cgOjjbPW3l8YzrdkqjXP/0jiSTb1zo/yf4ezjIvRMQvX0acyq0vt7q0PFcsl0sLWX68Nn91vLq0fPryfHG2NFu6Mjk1dfaNqcm333pz12J99cJf331y/8OzX51c+fanh8fuJnEujmTH2uNY5/3hrRZxqz0zGqPZezIY59acOLGtmu9/Sa8rwI4MZP18MNIx4GgMRF5TQp+4GRF1oE8l+j/0qdY8oHVvv+F98DPo0XvNG6D18eebz0ZiqHFvdHglyZ6HNKU3SSO7UH5axs+/37ubbrHJc4ibu1AeQMut2xFxJp9fP/4l2fi3c2caD483traMfvv8gV66n85/Xus0/8mtzn+iw/xnuEPf3YnN+3/u4S4U01U6/3un4/x3degaGchyzzXmfIPJpcvl0pmIeD4ixmLwYJrfaD3n7MqDerdj7fO/dEvLb80Fs3o8zB/892tmirXi08Tc7tHtiJc6zn+T1fZPOrR/+n5c2GIZJ0r3Xu52bPP491b9x4hXOrb/k2WQZO36ZHboVnN9crxxPYy3ror1/rxz4tdu5Y990Nv40/Y/vHH8I0n7em11+2X8MPR3qduxnV7/B5LPGukD2b7rxVptYSLiQPLx+v2TT17byrfOT+MfO7nx+Nfp+j8UEZ9vMf47x+90PbXX138a/8y22n/7iQcfffF9t/K31v6vN1Jj2Z5O4199zarlViv41G8gAAAAAAAA7CO5iDgSSa6wms7lCoXm9zuOx+FcuVKtnbpUWbwyE43fyo7EYK610n207fsQE9n3YVv5yTX5qYg4FhHfDBxq5AvTlfJMr4MHAAAAAAAAAAAAAAAAAACAfWK4y+//U78N9Lp2wJ7L97oCQM9s2v934396AvYln//Qv/R/6F/6P/Qv/R/6l/4P/Uv/h/6l/0P/avT/oV7XAgAAAAAAAAAAAAAAAAAAAAAAAAAAAJ4ZF86fT7f6yuMb02l+5trS4lzl2umZUnWuML84XZiuLFwtzFYqs+VSYboyv9m/V65Urk5MxuL18VqpWhuvLi1fnK8sXqldvDxfnC1dLA3+J1EBAAAAAAAAAAAAAAAAAADA/0t1aXmuWC6XFiS6Jt6NfVGNvQywaUcvz++XKCS6JoZ20Lg9HpgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAoM0/AQAA//956jNb")
bpf$BPF_PROG_RAW_TRACEPOINT_LOAD(0x5, &(0x7f0000000440)={0x11, 0x8, &(0x7f00000002c0)=@framed={{0x18, 0x8}, [@func={0x85, 0x0, 0x1, 0x0, 0x3}, @initr0, @exit, @alu={0x7, 0x1, 0xb, 0x0, 0xa, 0x1, 0x1}]}, &(0x7f0000000000)='GPL\x00', 0x9, 0xec, &(0x7f00000004c0)=""/236}, 0x90)
bpf$BPF_BTF_LOAD(0x12, &(0x7f00000001c0)={&(0x7f0000000140)={{0xeb9f, 0x1, 0x0, 0x18, 0x0, 0x18, 0x18, 0xa, [@struct={0x8, 0x1, 0x0, 0xf, 0x0, 0x5, [{0x2, 0x5, 0x5}]}]}, {0x0, [0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2e]}}, 0x0, 0x3a, 0x0, 0x6}, 0x28)
bpf$MAP_CREATE(0x0, &(0x7f0000000200)=@base={0x20, 0x4, 0x0, 0x0, 0x201, 0xffffffffffffffff, 0x0, '\x00', 0x0, 0xffffffffffffffff, 0x1, 0xffffffff}, 0x50)
bpf$MAP_CREATE(0x0, &(0x7f00000009c0)=@base={0x19, 0x4, 0x8, 0x80000000, 0x0, 0xffffffffffffffff, 0x1}, 0x50)
add_key(&(0x7f0000000000)='rxrpc\x00', 0x0, &(0x7f0000000240)="000000000000003299b8f276dc75584a8d87d907005858367d61f49e1639a48f614a54a8192c2876b7f843cd3a3c07288fa0f1e28983b5cdc2e29b6e", 0x3c, 0xffffffffffffffff)
syz_emit_ethernet(0xd81, 0x0, 0x0)
syz_mount_image$nilfs2(&(0x7f0000000100), &(0x7f0000000040)='./file0\x00', 0x3200c00, &(0x7f0000000080)=ANY=[@ANYRES32=0x0], 0x0, 0xac0, &(0x7f0000000140)="$eJzs3UuMHEcBANDq2Z2113bwOthksUNi80nCJ7vxejEfC+IovsSKI26RIg5YjhMsHINwJEgUKbZPnCBRZK4QxCkcIkBIyQFk5cQlErHEJeIQOHDAMlIkDhCwB3m2arandiY9s7+Z3XlP6q2prp6u6t6emp7urqoAjKxa8+/8/HQRwuU3Xzn2j3v+PnlrzoOtJaaaf8dLsXoIoYjx8Wx9740thDfef+FUp7AIc82/KR4eu9567/YQwoWwP1wJU2Hv5asvvz336ImLxy8deOe1I9fWZusBAGC0fOPKkfk9f/3Tvl0fvH7X0bClNT+dn0/F+I543n80nvin8/9aaI8XpalsIltuPE61bLmxDsuV86lny413yX8iW2+9lb6vbbktFfmPleZ12m7YyNJxPBWK2kxbvFabmVn4TR6av+sniplzZ84+dX5ABQVW3b/uDiHsN/U7NRqNF5s7cAjKYjItd2rsHHQNBLAgv1+4xIX8ysLKtNY23lv+1x+qdX4/rIL1Pv6r8v/xXwab/xIjnv8vL6pxWD2b9WhK25U+RztiPL+PkD+/1O/nP60vvx9R77Gc3e4jbJT7C93KObbO5ViubuXPj4vN6msxTPvh61l6+fOT/083yv8Y6Ozf63X9/9XJNc6j0Wg04kZ9yHL7h+Ca66ae6kNQBlPPU2PA9Q8wvBafm1vQiFJ6/lxfnr6lIn1rRfpkRfq2ivTtFekwyn777E/CS8Xi7/z8N32/18PSdbbbYviRPsuTX4/sN//8ud9+rTT//HliGGZvnHz89JeffOLqwvP/Rev4vxmP9/0xPhU/W1fiAul64Y74stwOoAilhgOhffl8uduz8ty2ZPnGQo6725crdi+uJ5TqmSXlmG5/385uy93ZvtxUttxknLZm5c3PT7Zl70vnH6leTfXjeLa99Ww7JrJypHplVwzzcsBypOOx2/P/6ficDvXiqTNnTz8Q4+k4/eNYfcut+QfLK/3V+pQdWJle2/9Mh/b2Pzta8+u1Ur3QOv1O9cXB1vra588tROv58odiPH3PfXtssjl/5tR3zz652hsPI+78c89/5+TZs6e/70V6MWm3eOFFVc2xWZ8chNEx++wz35s9/9zz95955uTTp58+fe7Q4cOH5uYOf+XQ/GzzvH62/Ksf2EwWv/QHXRIAAAAAAAAAAACgVz84fuzqn9/60rsL7f8X2/+l9v/pyd/U/v9HWfv/vJ18ahWQ2tnv6pDeHHfvjfZyTGTL1eP00ay8u7N89mTv+1gMW+P4xfb/qb193q9rKs8d2fy8/960XNadwJL+UiayPkha4wXGBvufjPFLMfzFsjoj+lbf74COisnOs2NY1b91Otab/VO8uFaFZK2l/kTS0ZD6MUntv7v165Tq/13rUEZW33o0Jxz0NgKd/XMYx/+sleOlM/FBl+vDp0ZjPfObWKN9Mtz7eesQlGFTTY2GUTyA4TCY8T8X15mue6bwQnhk67k/PNLqbvP6Q+31Zd5/KazEsI8/uZj/2vR0vXG2fzD5r/b4n63x7+KF3bz+u1X3LdZ/HXpXb+vnuffRFf7zs2vvlrINe3vKP4Tr+fanfqB3V+dZ9kHMP23/vaG3/BuvZvnnN4R69N8s/2095t+2/duX9lvdq//F/NNuu+9Tvea/UOKi1r4f8uvG6f5fft04uZFtf+rbs+///zIHarwZ84dR1n2c2V5HsB1OG2X8327y5zC+GOOpIkzPOeTfyL2W//exgOn5ivQ9sCdbf1Hx/bZRxinuZtTH//1qDKs+D2n833Q8TnWI10rxeod9u9GPFdhs3hvG+38bebowBGUwDem01mNg9z81Go2lF5TW8YtaL+KDNej9P+i7z4POf9D7v0o+/m9eNeTj/9ayHxD5+L/5+/Pxf/P0fHy9PD0f/zffn/n4v3n6Hdl68yvY0xXpH69I31uRvm8xfbJT+p0V7/9ERfqBivS7KtLvrki/vSJ9rCL90xXpn6lIv6ci/b6K9M9WpG92zfYopQ/VqG0/jLK8fZ7PP4yOdP+n2+d/d4f0i0H9AJvBT18/+PATv/nm1EL7/4nW77V0H+9ojNfjb+cfxnh+3zuU4rfS3orxv2Xpw369A0ZJ3n9G/v1/b0U6sHGl57x8vmEEFVs7z45hqheW9FsVw27n+Wwsn4vh52P4hRjeH8OZGM7G8GAM59apfKyNh3/9uyMvFYu/93dm6b0+T563B8r7iTrUY3ny6wP9Po+f9+PXr5Xmv8zmYAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANTa/6dn58uQrj85ivHHj9xZvbWnAdbS0w1/46XYvXW+0J4IIZjMfx5fHHj/RdOlcObMSzCXChC0ZofHrveyml7COFC2B+uhKmw9/LVl9+ee/TExeOXDrzz2pFra7cHAAAAYPP7fwAAAP//HWwjVA==")
bpf$PROG_LOAD(0x5, &(0x7f0000000680)={0x4, 0x8, &(0x7f0000000d80)=ANY=[@ANYBLOB="1800000000000000000000000000000018020000", @ANYRES32, @ANYBLOB="0000000000000000b703000000000000850000000d000000b70000000000000095"], &(0x7f00000000c0)='GPL\x00', 0x0, 0x0, 0x0, 0x40f00, 0x0, '\x00', 0x0, @fallback=0x33, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, 0x94)
syz_mount_image$qnx6(&(0x7f0000009e00), &(0x7f0000009e40)='./file0\x00', 0x0, &(0x7f0000009e80), 0x1, 0x9dc3, &(0x7f0000009ec0)="$eJwEwAMAbQUAwNBs27aXbdu2bS/3ro1s27Zt27a/0blvgPcP8IEBPjjAhwb48AAfGeCjA3xsgI8P8IkBPjnApwb49ACfGeCzA3xugM8P8IUBvjjAlwb48gBfGeCrA3xtgK8P8I0BvjnAtwb49gDfGeC7A3xvgO8P8IMBfjjAjwb48QA/GeCnA/xsgJ8P8IsBfjnArwb49QC/GeC3A/xugN8P8IcB/jjAnwb48wB/GeCvA/xtgL8P8I8B/jnAvwb49wD/GeC/A/xvgEMGOHSAwwY4fIAjBjhygKMGOHqAYwY4doDjBDhugOMFOH6AEwQ4YYATBThxgJMEOGmAkwU4eYBTBDhlgFMFOHWA0wQ4bYDTBTh9gDMEOGOAMwU4c4CzBDhrgLMFOHuAcwQ4Z4BzBTh3gPMEOG+A8wU4f4ALBLhggAsFuHCAiwS4aICLBbh4gEsEuGSASwW4dIDLBEiAywa4XIDLB7hCgCsGuFKAKwe4SoCrBrhagKsHuEaAawa4VoBrB7hOgOsGuF6A6we4QYAbBrhRgBsHuEmAmwa4WYCbB7hFgFsGuFWAWwe4TYDbBrhdgNsHuEOAOwa4U4A7B7hLgLsGuFuAuwe4R4B7BrhXgHsHuE+A+wa4X4D7B3hAgAcGeFCABwd4SICHBnhYgIcHeESARwZ4VIBHB3hMgMcGeFyAxwd4QoAnBnhSgCcHeEqApwZogKcFeHqAZwR4ZoBnBXh2gOcEeG6AgwCDAMMAowDjAJMA0wCzAPMAiwDLAKsA6wCbANsAuwD7AM8L8PwALwjwwgAvCvDiAC8J8NIALwvw8gCvCPDKAK8K8OoArwnw2gCvC/D6AG8I8MYAbwrw5gBvCfDWAG8L8PYA7wjwzgDvCvDuAO8J8N4A7wvw/gAfCPDBAB8K8OEAHwnw0QAfC/DxAJ8I8MkAnwrw6QCfCfDZAJ8L8PkAXwjwxQBfCvDlAF8J8NUAXwvw9QDfCPDNAN8K8O0A3wnw3QDfC/D9AD8I8MMAPwrw4wA/CfDTAD8L8PMAvwjwywC/CvDrAL8J8NsAvwvw+wB/CPDHAH8K8OcAfwnw1wB/C/D3AP8I8M8A/wrw7wD/CfDfAP8LcEiAQwMcFuDwAEcEODLAUQGODnBMgGMDHCfEcUMcL8TxQ5wgxAlDnCjEiUOcJMRJQ5wsxMlDnCLEKUOcKsSpQ5wmxGlDnC7E6UOcIcQZQ5wpxJlDnCXEWUOcLcTZQ5wjxDlDnCvEuUOcJ8R5Q5wvxPlDXCDEBUNcKMSFQ1wkxEVDXCzExUNcIsQlQ1wqxKVDXCZEQlw2xOVCXD7EFUJcMcSVQlw5xFVCXDXE1UJcPcQ1QlwzxLVCXDvEdUJcN8T1Qlw/xA1C3DDEjULcOMRNQtw0xM1C3DzELULcMsStQtw6xG1C3DbE7ULcPsQdQtwxxJ1C3DnEXULcNcTdQtw9xD1C3DPEvULcO8R9Qtw3xP1C3D/EA0I8MMSDQjw4xENCPDTEw0I8PMQjQjwyxKNCPDrEY0I8NsTjQjw+xBNCPDHEk0I8OcRTQjw1REM8LcTTQzwjxDNDPCvEs0M8J8RzQxyEGIQYhhiFGIeYhJiGmIWYh1iEWIZYhViH2ITYhtiF2Id4Xojnh3hBiBeGeFGIF4d4SYiXhnhZiJeHeEWIV4Z4VYhXh3hNiNeGeF2I14d4Q4g3hnhTiDeHeEuIt4Z4W4i3h3hHiHeGeFeId4d4T4j3hnhfiPeH+ECID4b4UIgPh/hIiI+G+FiIj4f4RIhPhvhUiE+H+EyIz4b4XIjPh/hCiC+G+FKIL4f4SoivhvhaiK+H+EaIb4b4Vohvh/hOiO+G+F6I74f4QYgfhvhRiB+H+EmIn4b4WYifh/hFiF+G+FWIX4f4TYjfhvhdiN+H+EOIP4b4U4g/h/hLiL+G+FuIv4f4R4h/hvhXiH+H+E+I/4b4X4hDQhwa4rAQh4c4IsSRIY4KcXSIY0IcG+I4EY4b4XgRjh/hBBFOGOFEEU4c4SQRThrhZBFOHuEUEU4Z4VQRTh3hNBFOG+F0EU4f4QwRzhjhTBHOHOEsEc4a4WwRzh7hHBHOGeFcEc4d4TwRzhvhfBHOH+ECES4Y4UIRLhzhIhEuGuFiES4e4RIRLhnhUhEuHeEyERLhshEuF+HyEa4Q4YoRrhThyhGuEuGqEa4W4eoRrhHhmhGuFeHaEa4T4boRrhfh+hFuEOGGEW4U4cYRbhLhphFuFuHmEW4R4ZYRbhXh1hFuE+G2EW4X4fYR7hDhjhHuFOHOEe4S4a4R7hbh7hHuEeGeEe4V4d4R7hPhvhHuF+H+ER4Q4YERHhThwREeEuGhER4W4eERHhHhkREeFeHRER4T4bERHhfh8RGeEOGJEZ4U4ckRnhLhqREa4WkRnh7hGRGeGeFZEZ4d4TkRnhvhIMIgwjDCKMI4wiTCNMIswjzCIsIywirCOsImwjbCLsI+wvMiPD/CCyK8MMKLIrw4wksivDTCyyK8PMIrIrwywqsivDrCayK8NsLrIrw+whsivDHCmyK8OcJbIrw1wtsivD3COyK8M8K7Irw7wnsivDfC+yK8P8IHInwwwocifDjCRyJ8NMLHInw8wicifDLCpyJ8OsJnInw2wucifD7CFyJ8McKXInw5wlcifDXC1yJ8PcI3InwzwrcifDvCdyJ8N8L3Inw/wg8i/DDCjyL8OMJPIvw0ws8i/DzCLyL8MsKvIvw6wm8i/DbC7yL8PsIfIvwxwp8i/DnCXyL8NcLfIvw9wj8i/DPCvyL8O8J/Ivw3wv8iHBLh0AiHRTg8whERjoxwVISjIxwT4dgIx4lx3BjHi3H8GCeIccIYJ4px4hgniXHSGCeLcfIYp4hxyhininHqGKeJcdoYp4tx+hhniHHGGGeKceYYZ4lx1hhni3H2GOeIcc4Y54px7hjniXHeGOeLcf4YF4hxwRgXinHhGBeJcdEYF4tx8RiXiHHJGJeKcekYl4mRGJeNcbkYl49xhRhXjHGlGFeOcZUYV41xtRhXj3GNGNeMca0Y145xnRjXjXG9GNePcYMYN4xxoxg3jnGTGDeNcbMYN49xixi3jHGrGLeOcZsYt41xuxi3j3GHGHeMcacYd45xlxh3jXG3GHePcY8Y94xxrxj3jnGfGPeNcb8Y94/xgBgPjPGgGA+O8ZAYD43xsBgPj/GIGI+M8agYj47xmBiPjfG4GI+P8YQYT4zxpBhPjvGUGE+N0RhPi/H0GM+I8cwYz4rx7BjPifHcGAcxBjGGMUYxxjEmMaYxZjHmMRYxljFWMdYxNjG2MXYx9jGeF+P5MV4Q44UxXhTjxTFeEuOlMV4W4+UxXhHjlTFeFePVMV4T47UxXhfj9THeEOONMd4U480x3hLjrTHeFuPtMd4R450x3hXj3THeE+O9Md4X4/0xPhDjgzE+FOPDM
T4S46MxPhbj4zE+EeOTMT4V49MxPhPjszE+F+PzMb4Q44sxvhTjyzG+EuOrMb4W4+sxvhHjmzG+FePbMb4T47sxvhfj+zF+EOOHMX4U48cxfhLjpzF+FuPnMX4R45cxfhXj1zF+E+O3MX4X4/cx/hDjjzH+FOPPMf4S468x/hbj7zH+EeOfMf4V498x/hPjvzH+F+OQGIfGOCzG4TGOiHFkjKNiHB3jmBjHxjhOguMmOF6C4yc4QYITJjhRghMnOEmCkyY4WYKTJzhFglMmOFWCUyc4TYLTJjhdgtMnOEOCMyY4U4IzJzhLgrMmOFuCsyc4R4JzJjhXgnMnOE+C8yY4X4LzJ7hAggsmuFCCCye4SIKLJrhYgosnuESCSya4VIJLJ7hMgiS4bILLJbh8giskuGKCKyW4coKrJLhqgqsluHqCayS4ZoJrJbh2guskuG6C6yW4foIbJLhhghsluHGCmyS4aYKbJbh5glskuGWCWyW4dYLbJLhtgtsluH2COyS4Y4I7JbhzgrskuGuCuyW4e4J7JLhngnsluHeC+yS4b4L7Jbh/ggckeGCCByV4cIKHJHhogocleHiCRyR4ZIJHJXh0gsckeGyCxyV4fIInJHhigicleHKCpyR4aoImeFqCpyd4RoJnJnhWgmcneE6C5yY4SDBIMEwwSjBOMEkwTTBLME+wSLBMsEqwTrBJsE2wS7BP8LwEz0/wggQvTPCiBC9O8JIEL03wsgQvT/CKBK9M8KoEr07wmgSvTfC6BK9P8IYEb0zwpgRvTvCWBG9N8LYEb0/wjgTvTPCuBO9O8J4E703wvgTvT/CBBB9M8KEEH07wkQQfTfCxBB9P8IkEn0zwqQSfTvCZBJ9N8LkEn0/whQRfTPClBF9O8JUEX03wtQRfT/CNBN9M8K0E307wnQTfTfC9BN9P8IMEP0zwowQ/TvCTBD9N8LMEP0/wiwS/TPCrBL9O8JsEv03wuwS/T/CHBH9M8KcEf07wlwR/TfC3BH9P8I8E/0zwrwT/TvCfBP9N8L8EhyQ4NMFhCQ5PcESCIxMcleDoBMckODbBcVIcN8XxUhw/xQlSnDDFiVKcOMVJUpw0xclSnDzFKVKcMsWpUpw6xWlSnDbF6VKcPsUZUpwxxZlSnDnFWVKcNcXZUpw9xTlSnDPFuVKcO8V5Upw3xflSnD/FBVJcMMWFUlw4xUVSXDTFxVJcPMUlUlwyxaVSXDrFZVIkxWVTXC7F5VNcIcUVU1wpxZVTXCXFVVNcLcXVU1wjxTVTXCvFtVNcJ8V1U1wvxfVT3CDFDVPcKMWNU9wkxU1T3CzFzVPcIsUtU9wqxa1T3CbFbVPcLsXtU9whxR1T3CnFnVPcJcVdU9wtxd1T3CPFPVPcK8W9U9wnxX1T3C/F/VM8IMUDUzwoxYNTPCTFQ1M8LMXDUzwixSNTPCrFo1M8JsVjUzwuxeNTPCHFE1M8KcWTUzwlxVNTNMXTUjw9xTNSPDPFs1I8O8VzUjw3xUGKQYphilGKcYpJimmKWYp5ikWKZYpVinWKTYptil2KfYrnpXh+ihekeGGKF6V4cYqXpHhpipeleHmKV6R4ZYpXpXh1itekeG2K16V4fYo3pHhjijeleHOKt6R4a4q3pXh7inekeGeKd6V4d4r3pHhviveleH+KD6T4YIoPpfhwio+k+GiKj6X4eIpPpPhkik+l+HSKz6T4bIrPpfh8ii+k+GKKL6X4coqvpPhqiq+l+HqKb6T4Zopvpfh2iu+k+G6K76X4foofpPhhih+l+HGKn6T4aYqfpfh5il+k+GWKX6X4dYrfpPhtit+l+H2KP6T4Y4o/pfhzir+k+GuKv6X4e4p/pPhnin+l+HeK/6T4b4r/pTgkxaEpDktxeIojUhyZ4qgUR6c4JsWxKY6T4bgZjpfh+BlOkOGEGU6U4cQZTpLhpBlOluHkGU6R4ZQZTpXh1BlOk+G0GU6X4fQZzpDhjBnOlOHMGc6S4awZzpbh7BnOkeGcGc6V4dwZzpPhvBnOl+H8GS6Q4YIZLpThwhkukuGiGS6W4eIZLpHhkhkuleHSGS6TIRkum+FyGS6f4QoZrpjhShmunOEqGa6a4WoZrp7hGhmumeFaGa6d4ToZrpvhehmun+EGGW6Y4UYZbpzhJhlumuFmGW6e4RYZbpnhVhluneE2GW6b4XYZbp/hDhnumOFOGe6c4S4Z7prhbhnunuEeGe6Z4V4Z7p3hPhnum+F+Ge6f4QEZHpjhQRkenOEhGR6a4WEZHp7hERkemeFRGR6d4TEZHpvhcRken+EJGZ6Y4UkZnpzhKRmemqEZnpbh6RmekeGZGZ6V4dkZnpPhuRkOMgwyDDOMMowzTDJMM8wyzDMsMiwzrDKsM2wybDPsMuwzPC/D8zO8IMMLM7wow4szvCTDSzO8LMPLM7wiwyszvCrDqzO8JsNrM7wuw+szvCHDGzO8KcObM7wlw1szvC3D2zO8I8M7M7wrw7szvCfDezO8L8P7M3wgwwczfCjDhzN8JMNHM3wsw8czfCLDJzN8KsOnM3wmw2czfC7D5zN8IcMXM3wpw5czfCXDVzN8LcPXM3wjwzczfCvDtzN8J8N3M3wvw/cz/CDDDzP8KMOPM/wkw08z/CzDzzP8IsMvM/wqw68z/CbDbzP8LsPvM/whwx8z/CnDnzP8JcNfM/wtw98z/CPDPzP8K8O/M/wnw38z/C/DIRkOzXBYhsMzHJHhyAxHZTg6wzEZjs1wnBzHzXG8HMfPcYIcJ8xxohwnznGSHCfNcbIcJ89xihynzHGqHKfOcZocp81xuhynz3GGHGfMcaYcZ85xlhxnzXG2HGfPcY4c58xxrhznznGeHOfNcb4c589xgRwXzHGhHBfOcZEcF81xsRwXz3GJHJfMcakcl85xmRzJcdkcl8tx+RxXyHHFHFfKceUcV8lx1RxXy3H1HNfIcc0c18px7RzXyXHdHNfLcf0cN8hxwxw3ynHjHDfJcdMcN8tx8xy3yHHLHLfKcesct8lx2xy3y3H7HHfIccccd8px5xx3yXHXHHfLcfcc98hxzxz3ynHvHPfJcd8c98tx/xwPyPHAHA/K8eAcD8nx0BwPy/HwHI/I8cgcj8rx6ByPyfHYHI/L8fgcT8jxxBxPyvHkHE/J8dQczfG0HE/P8Ywcz8zxrBzPzvGcHM/NcZBjkGOYY5RjnGOSY5pjlmOeY5FjmWOVY51jk2ObY5djn+N5OZ6f4wU5XpjjRTlenOMlOV6a42U5Xp7jFTlemeNVOV6d4zU5XpvjdTlen+MNOd6Y40053pzjLTnemuNtOd6e4x053pnjXTneneM9Od6b43053p/jAzk+mONDOT6c4yM5PprjYzk+nuMTOT6Z41M5Pp3jMzk+m+NzOT6f4ws5vpjjSzm+nOMrOb6a42s5vp7jGzm+meNbOb6d4zs5vpvjezm+n+MHOX6Y40c5fpzjJzl+muNnOX6e4xc5fpnjVzl+neM3OX6b43c5fp/jDzn+mONPOf6c4y85/prjbzn+nuMfOf6Z4185/p3jPzn+m+N/OQ7JcWiO
w3IcnuOIHEfmOCrH0TmOyXFsjuMUOG6B4xU4foETFDhhgRMVOHGBkxQ4aYGTFTh5gVMUOGWBUxU4dYHTFDhtgdMVOH2BMxQ4Y4EzFThzgbMUOGuBsxU4e4FzFDhngXMVOHeB8xQ4b4HzFTh/gQsUuGCBCxW4cIGLFLhogYsVuHiBSxS4ZIFLFbh0gcsUSIHLFrhcgcsXuEKBKxa4UoErF7hKgasWuFqBqxe4RoFrFrhWgWsXuE6B6xa4XoHrF7hBgRsWuFGBGxe4SYGbFrhZgZsXuEWBWxa4VYFbF7hNgdsWuF2B2xe4Q4E7FrhTgTsXuEuBuxa4W4G7F7hHgXsWuFeBexe4T4H7FrhfgfsXeECBBxZ4UIEHF3hIgYcWeFiBhxd4RIFHFnhUgUcXeEyBxxZ4XIHHF3hCgScWeFKBJxd4SoGnFmiBpxV4eoFnFHhmgWcVeHaB5xR4boGDAoMCwwKjAuMCkwLTArMC8wKLAssCqwLrApsC2wK7AvsCzyvw/AIvKPDCAi8q8OICLynw0gIvK/DyAq8o8MoCryrw6gKvKfDaAq8r8PoCbyjwxgJvKvDmAm8p8NYCbyvw9gLvKPDOAu8q8O4C7ynw3gLvK/D+Ah8o8MECHyrw4QIfKfDRAh8r8PECnyjwyQKfKvDpAp8p8NkCnyvw+QJfKPDFAl8q8OUCXynw1QJfK/D1At8o8M0C3yrw7QLfKfDdAt8r8P0CPyjwwwI/KvDjAj8p8NMCPyvw8wK/KPDLAr8q8OsCvynw2wK/K/D7An8o8McCfyrw5wJ/KfDXAn8r8PcC/yjwzwL/KvDvAv8p8N8C/ytwSIFDCxxW4PACRxQ4ssBRBY4ucEyBYwscp8RxSxyvxPFLnKDECUucqMSJS5ykxElLnKzEyUucosQpS5yqxKlLnKbEaUucrsTpS5yhxBlLnKnEmUucpcRZS5ytxNlLnKPEOUucq8S5S5ynxHlLnK/E+UtcoMQFS1yoxIVLXKTERUtcrMTFS1yixCVLXKrEpUtcpkRKXLbE5UpcvsQVSlyxxJVKXLnEVUpctcTVSly9xDVKXLPEtUpcu8R1Sly3xPVKXL/EDUrcsMSNSty4xE1K3LTEzUrcvMQtStyyxK1K3LrEbUrctsTtSty+xB1K3LHEnUrcucRdSty1xN1K3L3EPUrcs8S9Sty7xH1K3LfE/Urcv8QDSjywxINKPLjEQ0o8tMTDSjy8xCNKPLLEo0o8usRjSjy2xONKPL7EE0o8scSTSjy5xFNKPLVESzytxNNLPKPEM0s8q8SzSzynxHNLHJQYlBiWGJUYl5iUmJaYlZiXWJRYlliVWJfYlNiW2JXYl3heieeXeEGJF5Z4UYkXl3hJiZeWeFmJl5d4RYlXlnhViVeXeE2J15Z4XYnXl3hDiTeWeFOJN5d4S4m3lnhbibeXeEeJd5Z4V4l3l3hPifeWeF+J95f4QIkPlvhQiQ+X+EiJj5b4WImPl/hEiU+W+FSJT5f4TInPlvhcic+X+EKJL5b4Uokvl/hKia+W+FqJr5f4RolvlvhWiW+X+E6J75b4Xonvl/hBiR+W+FGJH5f4SYmflvhZiZ+X+EWJX5b4VYlfl/hNid+W+F2J35f4Q4k/lvhTiT+X+EuJv5b4W4m/l/hHiX+W+FeJf5f4T4n/lvhfiUNKHFrisBKHlziixJEljipxdIljShxb4jgVjlvheBWOX+EEFU5Y4UQVTlzhJBVOWuFkFU5e4RQVTlnhVBVOXeE0FU5b4XQVTl/hDBXOWOFMFc5c4SwVzlrhbBXOXuEcFc5Z4VwVzl3hPBXOW+F8Fc5f4QIVLljhQhUuXOEiFS5a4WIVLl7hEhUuWeFSFS5d4TIVUuGyFS5X4fIVrlDhihWuVOHKFa5S4aoVrlbh6hWuUeGaFa5V4doVrlPhuhWuV+H6FW5Q4YYVblThxhVuUuGmFW5W4eYVblHhlhVuVeHWFW5T4bYVblfh9hXuUOGOFe5U4c4V7lLhrhXuVuHuFe5R4Z4V7lXh3hXuU+G+Fe5X4f4VHlDhgRUeVOHBFR5S4aEVHlbh4RUeUeGRFR5V4dEVHlPhsRUeV+HxFZ5Q4YkVnlThyRWeUuGpFVrhaRWeXuEZFZ5Z4VkVnl3hORWeW+GgwqDCsMKowrjCpMK0wqzCvMKiwrLCqsK6wqbCtsKuwr7C8yo8v8ILKrywwosqvLjCSyq8tMLLKry8wisqvLLCqyq8usJrKry2wusqvL7CGyq8scKbKry5wlsqvLXC2yq8vcI7KryzwrsqvLvCeyq8t8L7Kry/wgcqfLDChyp8uMJHKny0wscqfLzCJyp8ssKnKny6wmcqfLbC5yp8vsIXKnyxwpcqfLnCVyp8tcLXKny9wjcqfLPCtyp8u8J3Kny3wvcqfL/CDyr8sMKPKvy4wk8q/LTCzyr8vMIvKvyywq8q/LrCbyr8tsLvKvy+wh8q/LHCnyr8ucJfKvy1wt8q/L3CPyr8s8K/Kvy7wn8q/LfC/yocUuHQCodVOLzCERWOrHBUhaMrHFPh2ArHqXHcGsercfwaJ6hxwhonqnHiGiepcdIaJ6tx8hqnqHHKGqeqceoap6lx2hqnq3H6GmeoccYaZ6px5hpnqXHWGmercfYa56hxzhrnqnHuGuepcd4a56tx/hoXqHHBGheqceEaF6lx0RoXq3HxGpeocckal6px6RqXqZEal61xuRqXr3GFGlescaUaV65xlRpXrXG1GlevcY0a16xxrRrXrnGdGtetcb0a169xgxo3rHGjGjeucZMaN61xsxo3r3GLGrescasat65xmxq3rXG7GrevcYcad6xxpxp3rnGXGnetcbcad69xjxr3rHGvGveucZ8a961xvxr3r/GAGg+s8aAaD67xkBoPrfGwGg+v8Ygaj6zxqBqPrvGYGo+t8bgaj6/xhBpPrPGkGk+u8ZQaT63RGk+r8fQaz6jxzBrPqvHsGs+p8dwaBzUGNYY1RjXGNSY1pjVmNeY1FjWWNVY11jU2NbY1djX2NZ5X4/k1XlDjhTVeVOPFNV5S46U1Xlbj5TVeUeOVNV5V49U1XlPjtTVeV+P1Nd5Q44013lTjzTXeUuOtNd5W4+013lHjnTXeVePdNd5T47013lfj/TU+UOODNT5U48M1PlLjozU+VuPjNT5R45M1PlXj0zU+U+OzNT5X4/M1vlDjizW+VOPLNb5S46s1vlbj6zW+UeObNb5V49s1vlPjuzW+V+P7NX5Q44c1flTjxzV+UuOnNX5W4+c1flHjlzV+VePXNX5T47c1flfj9zX+UOOPNf5U4881/lLjrzX+VuPvNf5R4581/lXj3zX+U+O/Nf5X45Aah9Y4rMbhNY6ocWSNo2ocXeOYGsfWOE6D4zY4XoPjNzhBgxM2OFGDEzc4SYOTNjhZg5M3OEWDUzY4VYNTNzhNg9M2OF2D0zc4Q4MzNjhTgzM3OEuDszY4W4OzNzhHg3M2OFeDczc4T4PzNjhfg/M3uECDCza4UIMLN7hIg4s2uFiDize4RINLNrhUg0s3uEyDNLhsg8s1uHyDKzS
4YoMrNbhyg6s0uGqDqzW4eoNrNLhmg2s1uHaD6zS4boPrNbh+gxs0uGGDGzW4cYObNLhpg5s1uHmDWzS4ZYNbNbh1g9s0uG2D2zW4fYM7NLhjgzs1uHODuzS4a4O7Nbh7g3s0uGeDezW4d4P7NLhvg/s1uH+DBzR4YIMHNXhwg4c0eGiDhzV4eINHNHhkg0c1eHSDxzR4bIPHNXh8gyc0eGKDJzV4coOnNHhqgzZ4WoOnN3hGg2c2eFaDZzd4ToPnNjhoMGgwbDBqMG4waTBtMGswb7BosGywarBusGmwbbBrsG/wvAbPb/CCBi9s8KIGL27wkgYvbfCyBi9v8IoGr2zwqgavbvCaBq9t8LoGr2/whgZvbPCmBm9u8JYGb23wtgZvb/COBu9s8K4G727wngbvbfC+Bu9v8IEGH2zwoQYfbvCRBh9t8LEGH2/wiQafbPCpBp9u8JkGn23wuQafb/CFBl9s8KUGX27wlQZfbfC1Bl9v8I0G32zwrQbfbvCdBt9t8L0G32/wgwY/bPCjBj9u8JMGP23wswY/b/CLBr9s8KsGv27wmwa/bfC7Br9v8IcGf2zwpwZ/bvCXBn9t8LcGf2/wjwb/bPCvBv9u8J8G/23wvwaHNDi0wWENDm9wRIMjGxzV4OgGxzQ4tsFxWhy3xfFaHL/FCVqcsMWJWpy4xUlanLTFyVqcvMUpWpyyxalanLrFaVqctsXpWpy+xRlanLHFmVqcucVZWpy1xdlanL3FOVqcs8W5Wpy7xXlanLfF+Vqcv8UFWlywxYVaXLjFRVpctMXFWly8xSVaXLLFpVpcusVlWqTFZVtcrsXlW1yhxRVbXKnFlVtcpcVVW1ytxdVbXKPFNVtcq8W1W1ynxXVbXK/F9VvcoMUNW9yoxY1b3KTFTVvcrMXNW9yixS1b3KrFrVvcpsVtW9yuxe1b3KHFHVvcqcWdW9ylxV1b3K3F3Vvco8U9W9yrxb1b3KfFfVvcr8X9WzygxQNbPKjFg1s8pMVDWzysxcNbPKLFI1s8qsWjWzymxWNbPK7F41s8ocUTWzypxZNbPKXFU1u0xdNaPL3FM1o8s8WzWjy7xXNaPLfFQYtBi2GLUYtxi0mLaYtZi3mLRYtli1WLdYtNi22LXYt9i+e1eH6LF7R4YYsXtXhxi5e0eGmLl7V4eYtXtHhli1e1eHWL17R4bYvXtXh9ize0eGOLN7V4c4u3tHhri7e1eHuLd7R4Z4t3tXh3i/e0eG+L97V4f4sPtPhgiw+1+HCLj7T4aIuPtfh4i0+0+GSLT7X4dIvPtPhsi8+1+HyLL7T4Yosvtfhyi6+0+GqLr7X4eotvtPhmi2+1+HaL77T4bovvtfh+ix+0+GGLH7X4cYuftPhpi5+1+HmLX7T4ZYtftfh1i9+0+G2L37X4fYs/tPhjiz+1+HOLv7T4a4u/tfh7i3+0+GeLf7X4d4v/tPhvi/+1OKTFoS0Oa3F4iyNaHNniqBZHtzimxbEtjtPhuB2O1+H4HU7Q4YQdTtThxB1O0uGkHU7W4eQdTtHhlB1O1eHUHU7T4bQdTtfh9B3O0OGMHc7U4cwdztLhrB3O1uHsHc7R4ZwdztXh3B3O0+G8Hc7X4fwdLtDhgh0u1OHCHS7S4aIdLtbh4h0u0eGSHS7V4dIdLtMhHS7b4XIdLt/hCh2u2OFKHa7c4Sodrtrhah2u3uEaHa7Z4Vodrt3hOh2u2+F6Ha7f4QYdbtjhRh1u3OEmHW7a4WYdbt7hFh1u2eFWHW7d4TYdbtvhdh1u3+EOHe7Y4U4d7tzhLh3u2uFuHe7e4R4d7tnhXh3u3eE+He7b4X4d7t/hAR0e2OFBHR7c4SEdHtrhYR0e3uERHR7Z4VEdHt3hMR0e2+FxHR7f4QkdntjhSR2e3OEpHZ7aoR2e1uHpHZ7R4ZkdntXh2R2e0+G5HQ46DDoMO4w6jDtMOkw7zDrMOyw6LDusOqw7bDpsO+w67Ds8r8PzO7ygwws7vKjDizu8pMNLO7ysw8s7vKLDKzu8qsOrO7ymw2s7vK7D6zu8ocMbO7ypw5s7vKXDWzu8rcPbO7yjwzs7vKvDuzu8p8N7O7yvw/s7fKDDBzt8qMOHO3ykw0c7fKzDxzt8osMnO3yqw6c7fKbDZzt8rsPnO3yhwxc7fKnDlzt8pcNXO3ytw9c7fKPDNzt8q8O3O3ynw3c7fK/D9zv8oMMPO/yow487/KTDTzv8rMPPO/yiwy87/KrDrzv8psNvO/yuw+87/KHDHzv8qcOfO/ylw187/K3D3zv8o8M/O/yrw787/KfDfzv8r8MhHQ7tcFiHwzsc0eHIDkd1OLrDMR2O7XCcHsftcbwex+9xgh4n7HGiHifucZIeJ+1xsh4n73GKHqfscaoep+5xmh6n7XG6HqfvcYYeZ+xxph5n7nGWHmftcbYeZ+9xjh7n7HGuHufucZ4e5+1xvh7n73GBHhfscaEeF+5xkR4X7XGxHhfvcYkel+xxqR6X7nGZHulx2R6X63H5HlfoccUeV+px5R5X6XHVHlfrcfUe1+hxzR7X6nHtHtfpcd0e1+tx/R436HHDHjfqceMeN+lx0x4363HzHrfoccset+px6x636XHbHrfrcfsed+hxxx536nHnHnfpcdced+tx9x736HHPHvfqce8e9+lx3x7363H/Hg/o8cAeD+rx4B4P6fHQHg/r8fAej+jxyB6P6vHoHo/p8dgej+vx+B5P6PHEHk/q8eQeT+nx1B7t8bQeT+/xjB7P7PGsHs/u8Zwez+1x0GPQY9hj1GPcY9Jj2mPWY95j0WPZY9Vj3WPTY9tj12Pf43k9nt/jBT1e2ONFPV7c4yU9XtrjZT1e3uMVPV7Z41U9Xt3jNT1e2+N1PV7f4w093tjjTT3e3OMtPd7a42093t7jHT3e2eNdPd7d4z093tvj/wTAAwBtBQDA0Gzbtpdt27ZtL9t810a2bdu27fpG574B3j/ABwb44AAfGuDDA3xkgI8O8LEBPj7AJwb45ACfGuDTA3xmgM8O8LkBPj/AFwb44gBfGuDLA3xlgK8O8LUBvj7ANwb45gDfGuDbA3xngO8O8L0Bvj/ADwb44QA/GuDHA/xkgJ8O8LMBfj7ALwb45QC/GuDXA/xmgN8O8LsBfj/AHwb44wB/GuDPA/xlgL8O8LcB/j7APwb45wD/GuDfA/xngP8O8L8BDhng0AEOG+DwAY4Y4MgBjhrg6AGOGeDYAY4T4LgBjhfg+AFOEOCEAU4U4MQBThLgpAFOFuDkAU4R4JQBThXg1AFOE+C0AU4X4PQBzhDgjAHOFODMAc4S4KwBzhbg7AHOEeCcAc4V4NwBzhPgvAHOF+D8AS4Q4IIBLhTgwgEuEuCiAS4W4OIBLhHgkgEuFeDSAS4TIAEuG+ByAS4f4AoBrhjgSgGuHOAqAa4a4GoBrh7gGgGuGeBaAa4d4DoBrhvgegGuH+AGAW4Y4EYBbhzgJgFuGuBmAW4e4BYBbhngVgFuHeA2AW4b4HYBbh/gDgHuGOBOAe4c4C4B7hrgbgHuHuAeAe4Z4F4B7h3gPgHuG+
B+Ae4f4AEBHhjgQQEeHOAhAR4a4GEBHh7gEQEeGeBRAR4d4DEBHhvgcQEeH+AJAZ4Y4EkBnhzgKQGeGqABnhbg6QGeEeCZAZ4V4NkBnhPguQGeF+D5AV4Q4IUBXhTgxQFeEuClAV4W4OUBXhHglQFeFeDVAV4T4LUBDgIMAgwDjAKMA0wCTAPMAswDLAIsA6wCrANsAmwD7ALsA7wuwOsDvCHAGwO8KcCbA7wlwFsDvC3A2wO8I8A7A7wrwLsDvCfAewO8L8D7A3wgwAcDfCjAhwN8JMBHA3wswMcDfCLAJwN8KsCnA3wmwGcDfC7A5wN8IcAXA3wpwJcDfCXAVwN8LcDXA3wjwDcDfCvAtwN8J8B3A3wvwPcD/CDADwP8KMCPA/wkwE8D/CzAzwP8IsAvA/wqwK8D/CbAbwP8LsDvA/whwB8D/CnAnwP8JcBfA/wtwN8D/CPAPwP8K8C/A/wnwH8D/C/AIQEODXBYgMMDHBHgyABHBTg6wDEBjg1wnBDHDXG8EMcPcYIQJwxxohAnDnGSECcNcbIQJw9xihCnDHGqEKcOcZoQpw1xuhCnD3GGEGcMcaYQZw5xlhBnDXG2EGcPcY4Q5wxxrhDnDnGeEOcNcb4Q5w9xgRAXDHGhEBcOcZEQFw1xsRAXD3GJEJcMcakQlw5xmRAJcdkQlwtx+RBXCHHFEFcKceUQVwlx1RBXC3H1ENcIcc0Q1wpx7RDXCXHdENcLcf0QNwhxwxA3CnHjEDcJcdMQNwtx8xC3CHHLELcKcesQtwlx2xC3C3H7EHcIcccQdwpx5xB3CXHXEHcLcfcQ9whxzxD3CnHvEPcJcd8Q9wtx/xAPCPHAEA8K8eAQDwnx0BAPC/HwEI8I8cgQjwrx6BCPCfHYEI8L8fgQTwjxxBBPCvHkEE8J8dQQDfG0EE8P8YwQzwzxrBDPDvGcEM8N8bwQzw/xghAvDPGiEC8O8ZIQLw3xshAvD/GKEK8M8aoQrw7xmhCvDXEQYhBiGGIUYhxiEmIaYhZiHmIRYhliFWIdYhNiG2IXYh/idSFeH+INId4Y4k0h3hziLSHeGuJtId4e4h0h3hniXSHeHeI9Id4b4n0h3h/iAyE+GOJDIT4c4iMhPhriYyE+HuITIT4Z4lMhPh3iMyE+G+JzIT4f4gshvhjiSyG+HOIrIb4a4mshvh7iGyG+GeJbIb4d4jshvhvieyG+H+IHIX4Y4kchfhziJyF+GuJnIX4e4hchfhniVyF+HeI3IX4b4nchfh/iDyH+GOJPIf4c4i8h/hribyH+HuIfIf4Z4l8h/h3iPyH+G+J/IQ4JcWiIw0IcHuKIEEeGOCrE0SGOCXFsiONEOG6E40U4foQTRDhhhBNFOHGEk0Q4aYSTRTh5hFNEOGWEU0U4dYTTRDhthNNFOH2EM0Q4Y4QzRThzhLNEOGuEs0U4e4RzRDhnhHNFOHeE80Q4b4TzRTh/hAtEuGCEC0W4cISLRLhohItFuHiES0S4ZIRLRbh0hMtESITLRrhchMtHuEKEK0a4UoQrR7hKhKtGuFqEq0e4RoRrRrhWhGtHuE6E60a4XoTrR7hBhBtGuFGEG0e4SYSbRrhZhJtHuEWEW0a4VYRbR7hNhNtGuF2E20e4Q4Q7RrhThDtHuEuEu0a4W4S7R7hHhHtGuFeEe0e4T4T7RrhfhPtHeECEB0Z4UIQHR3hIhIdGeFiEh0d4RIRHRnhUhEdHeEyEx0Z4XITHR3hChCdGeFKEJ0d4SoSnRmiEp0V4eoRnRHhmhGdFeHaE50R4boTnRXh+hBdEeGGEF0V4cYSXRHhphJdFeHmEV0R4ZYRXRXh1hNdEeG2EgwiDCMMIowjjCJMI0wizCPMIiwjLCKsI6wibCNsIuwj7CK+L8PoIb4jwxghvivDmCG+J8NYIb4vw9gjviPDOCO+K8O4I74nw3gjvi/D+CB+I8MEIH4rw4QgfifDRCB+L8PEIn4jwyQifivDpCJ+J8NkIn4vw+QhfiPDFCF+K8OUIX4nw1Qhfi/D1CN+I8M0I34rw7QjfifDdCN+L8P0IP4jwwwg/ivDjCD+J8NMIP4vw8wi/iPDLCL+K8OsIv4nw2wi/i/D7CH+I8McIf4rw5wh/ifDXCH+L8PcI/4jwzwj/ivDvCP+J8N8I/4twSIRDIxwW4fAIR0Q4MsJREY6OcEyEYyMcJ8ZxYxwvxvFjnCDGCWOcKMaJY5wkxkljnCzGyWOcIsYpY5wqxqljnCbGaWOcLsbpY5whxhljnCnGmWOcJcZZY5wtxtljnCPGOWOcK8a5Y5wnxnljnC/G+WNcIMYFY1woxoVjXCTGRWNcLMbFY1wixiVjXCrGpWNcJkZiXDbG5WJcPsYVYlwxxpViXDnGVWJcNcbVYlw9xjViXDPGtWJcO8Z1Ylw3xvViXD/GDWLcMMaNYtw4xk1i3DTGzWLcPMYtYtwyxq1i3DrGbWLcNsbtYtw+xh1i3DHGnWLcOcZdYtw1xt1i3D3GPWLcM8a9Ytw7xn1i3DfG/WLcP8YDYjwwxoNiPDjGQ2I8NMbDYjw8xiNiPDLGo2I8OsZjYjw2xuNiPD7GE2I8McaTYjw5xlNiPDVGYzwtxtNjPCPGM2M8K8azYzwnxnNjPC/G82O8IMYLY7woxotjvCTGS2O8LMbLY7wixitjvCrGq2O8JsZrYxzEGMQYxhjFGMeYxJjGmMWYx1jEWMZYxVjH2MTYxtjF2Md4XYzXx3hDjDfGeFOMN8d4S4y3xnhbjLfHeEeMd8Z4V4x3x3hPjPfGeF+M98f4QIwPxvhQjA/H+EiMj8b4WIyPx/hEjE/G+FSMT8f4TIzPxvhcjM/H+EKML8b4Uowvx/hKjK/G+FqMr8f4RoxvxvhWjG/H+E6M78b4Xozvx/hBjB/G+FGMH8f4SYyfxvhZjJ/H+EWMX8b4VYxfx/hNjN/G+F2M38f4Q4w/xvhTjD/H+EuMv8b4W4y/x/hHjH/G+FeMf8f4T4z/xvhfjENiHBrjsBiHxzgixpExjopxdIxjYhwb4zgJjpvgeAmOn+AECU6Y4EQJTpzgJAlOmuBkCU6e4BQJTpngVAlOneA0CU6b4HQJTp/gDAnOmOBMCc6c4CwJzprgbAnOnuAcCc6Z4FwJzp3gPAnOm+B8Cc6f4AIJLpjgQgkunOAiCS6a4GIJLp7gEgkumeBSCS6d4DIJkuCyCS6X4PIJrpDgigmulODKCa6S4KoJrpbg6gmukeCaCa6V4NoJrpPgugmul+D6CW6Q4IYJbpTgxglukuCmCW6W4OYJbpHglgluleDWCW6T4LYJbpfg9gnukOCOCe6U4M4J7pLgrgnuluDuCe6R4J4J7pXg3gnuk+C+Ce6X4P4JHpDggQkelODBCR6S4KEJHpbg4QkekeCRCR6V4NEJHpPgsQkel+DxCZ6Q4IkJnpTgyQmekuCpCZrgaQmenuAZCZ6Z4FkJnp3gOQmem+B5CZ6f4AUJXpjgRQlenOAlCV6a4GUJXp7gFQlemeBVCV6d4DUJXpvgIMEgwTDBKME4wSTBNMEswTzBIsEywSrBOsEmwTbBL
sE+wesSvD7BGxK8McGbErw5wVsSvDXB2xK8PcE7ErwzwbsSvDvBexK8N8H7Erw/wQcSfDDBhxJ8OMFHEnw0wccSfDzBJxJ8MsGnEnw6wWcSfDbB5xJ8PsEXEnwxwZcSfDnBVxJ8NcHXEnw9wTcSfDPBtxJ8O8F3Enw3wfcSfD/BDxL8MMGPEvw4wU8S/DTBzxL8PMEvEvwywa8S/DrBbxL8NsHvEvw+wR8S/DHBnxL8OcFfEvw1wd8S/D3BPxL8M8G/Evw7wX8S/DfB/xIckuDQBIclODzBEQmOTHBUgqMTHJPg2ATHSXHcFMdLcfwUJ0hxwhQnSnHiFCdJcdIUJ0tx8hSnSHHKFKdKceoUp0lx2hSnS3H6FGdIccYUZ0px5hRnSXHWFGdLcfYU50hxzhTnSnHuFOdJcd4U50tx/hQXSHHBFBdKceEUF0lx0RQXS3HxFJdIcckUl0px6RSXSZEUl01xuRSXT3GFFFdMcaUUV05xlRRXTXG1FFdPcY0U10xxrRTXTnGdFNdNcb0U109xgxQ3THGjFDdOcZMUN01xsxQ3T3GLFLdMcasUt05xmxS3TXG7FLdPcYcUd0xxpxR3TnGXFHdNcbcUd09xjxT3THGvFPdOcZ8U901xvxT3T/GAFA9M8aAUD07xkBQPTfGwFA9P8YgUj0zxqBSPTvGYFI9N8bgUj0/xhBRPTPGkFE9O8ZQUT03RFE9L8fQUz0jxzBTPSvHsFM9J8dwUz0vx/BQvSPHCFC9K8eIUL0nx0hQvS/HyFK9I8coUr0rx6hSvSfHaFAcpBimGKUYpxikmKaYpZinmKRYplilWKdYpNim2KXYp9ilel+L1Kd6Q4o0p3pTizSnekuKtKd6W4u0p3pHinSneleLdKd6T4r0p3pfi/Sk+kOKDKT6U4sMpPpLioyk+luLjKT6R4pMpPpXi0yk+k+KzKT6X4vMpvpDiiym+lOLLKb6S4qspvpbi6ym+keKbKb6V4tspvpPiuym+l+L7KX6Q4ocpfpTixyl+kuKnKX6W4ucpfpHilyl+leLXKX6T4rcpfpfi9yn+kOKPKf6U4s8p/pLiryn+luLvKf6R4p8p/pXi3yn+k+K/Kf6X4pAUh6Y4LMXhKY5IcWSKo1IcneKYFMemOE6G42Y4XobjZzhBhhNmOFGGE2c4SYaTZjhZhpNnOEWGU2Y4VYZTZzhNhtNmOF2G02c4Q4YzZjhThjNnOEuGs2Y4W4azZzhHhnNmOFeGc2c4T4bzZjhfhvNnuECGC2a4UIYLZ7hIhotmuFiGi2e4RIZLZrhUhktnuEyGZLhshstluHyGK2S4YoYrZbhyhqtkuGqGq2W4eoZrZLhmhmtluHaG62S4bobrZbh+hhtkuGGGG2W4cYabZLhphptluHmGW2S4ZYZbZbh1httkuG2G22W4fYY7ZLhjhjtluHOGu2S4a4a7Zbh7hntkuGeGe2W4d4b7ZLhvhvtluH+GB2R4YIYHZXhwhodkeGiGh2V4eIZHZHhkhkdleHSGx2R4bIbHZXh8hidkeGKGJ2V4coanZHhqhmZ4WoanZ3hGhmdmeFaGZ2d4TobnZnhehudneEGGF2Z4UYYXZ3hJhpdmeFmGl2d4RYZXZnhVhldneE2G12Y4yDDIMMwwyjDOMMkwzTDLMM+wyLDMsMqwzrDJsM2wy7DP8LoMr8/whgxvzPCmDG/O8JYMb83wtgxvz/CODO/M8K4M787wngzvzfC+DO/P8IEMH8zwoQwfzvCRDB/N8LEMH8/wiQyfzPCpDJ/O8JkMn83wuQyfz/CFDF/M8KUMX87wlQxfzfC1DF/P8I0M38zwrQzfzvCdDN/N8L0M38/wgww/zPCjDD/O8JMMP83wsww/z/CLDL/M8KsMv87wmwy/zfC7DL/P8IcMf8zwpwx/zvCXDH/N8LcMf8/wjwz/zPCvDP/O8J8M/83wvwyHZDg0w2EZDs9wRIYjMxyV4egMx2Q4NsNxchw3x/FyHD/HCXKcMMeJcpw4x0lynDTHyXKcPMcpcpwyx6lynDrHaXKcNsfpcpw+xxlynDHHmXKcOcdZcpw1x9lynD3HOXKcM8e5cpw7x3lynDfH+XKcP8cFclwwx4VyXDjHRXJcNMfFclw8xyVyXDLHpXJcOsdlciTHZXNcLsflc1whxxVzXCnHlXNcJcdVc1wtx9VzXCPHNXNcK8e1c1wnx3VzXC/H9XPcIMcNc9wox41z3CTHTXPcLMfNc9wixy1z3CrHrXPcJsdtc9wux+1z3CHHHXPcKcedc9wlx11z3C3H3XPcI8c9c9wrx71z3CfHfXPcL8f9czwgxwNzPCjHg3M8JMdDczwsx8NzPCLHI3M8Ksejczwmx2NzPC7H43M8IccTczwpx5NzPCXHU3M0x9NyPD3HM3I8M8ezcjw7x3NyPDfH83I8P8cLcrwwx4tyvDjHS3K8NMfLcrw8xytyvDLHq3K8Osdrcrw2x0GOQY5hjlGOcY5JjmmOWY55jkWOZY5VjnWOTY5tjl2OfY7X5Xh9jjfkeGOON+V4c4635HhrjrfleHuOd+R4Z4535Xh3jvfkeG+O9+V4f44P5Phgjg/l+HCOj+T4aI6P5fh4jk/k+GSOT+X4dI7P5Phsjs/l+HyOL+T4Yo4v5fhyjq/k+GqOr+X4eo5v5Phmjm/l+HaO7+T4bo7v5fh+jh/k+GGOH+X4cY6f5Phpjp/l+HmOX+T4ZY5f5fh1jt/k+G2O3+X4fY4/5Phjjj/l+HOOv+T4a46/5fh7jn/k+GeOf+X4d47/5Phvjv/lOCTHoTkOy3F4jiNyHJnjqBxH5zgmx7E5jlPguAWOV+D4BU5Q4IQFTlTgxAVOUuCkBU5W4OQFTlHglAVOVeDUBU5T4LQFTlfg9AXOUOCMBc5U4MwFzlLgrAXOVuDsBc5R4JwFzlXg3AXOU+C8Bc5X4PwFLlDgggUuVODCBS5S4KIFLlbg4gUuUeCSBS5V4NIFLlMgBS5b4HIFLl/gCgWuWOBKBa5c4CoFrlrgagWuXuAaBa5Z4FoFrl3gOgWuW+B6Ba5f4AYFbljgRgVuXOAmBW5a4GYFbl7gFgVuWeBWBW5d4DYFblvgdgVuX+AOBe5Y4E4F7lzgLgXuWuBuBe5e4B4F7lngXgXuXeA+Be5b4H4F7l/gAQUeWOBBBR5c4CEFHlrgYQUeXuARBR5Z4FEFHl3gMQUeW+BxBR5f4AkFnljgSQWeXOApBZ5aoAWeVuDpBZ5R4JkFnlXg2QWeU+C5BZ5X4PkFXlDghQVeVODFBV5S4KUFXlbg5QVeUeCVBV5V4NUFXlPgtQUOCgwKDAuMCowLTApMC8wKzAssCiwLrAqsC2wKbAvsCuwLvK7A6wu8ocAbC7ypwJsLvKXAWwu8rcDbC7yjwDsLvKvAuwu8p8B7C7yvwPsLfKDABwt8qMCHC3ykwEcLfKzAxwt8osAnC3yqwKcLfKbAZwt8rsDnC3yhwBcLfKnAlwt8pcBXC3ytwNcLfKPANwt8q8C3C3ynwHcLfK/A9wv8oMAPC/yowI8L/KTATwv8rMDPC/yiwC8L
/KrArwv8psBvC/yuwO8L/KHAHwv8qcCfC/ylwF8L/K3A3wv8o8A/C/yrwL8L/KfAfwv8r8AhBQ4tcFiBwwscUeDIAkcVOLrAMQWOLXCcEsctcbwSxy9xghInLHGiEicucZISJy1xshInL3GKEqcscaoSpy5xmhKnLXG6EqcvcYYSZyxxphJnLnGWEmctcbYSZy9xjhLnLHGuEucucZ4S5y1xvhLnL3GBEhcscaESFy5xkRIXLXGxEhcvcYkSlyxxqRKXLnGZEilx2RKXK3H5ElcoccUSVypx5RJXKXHVElcrcfUS1yhxzRLXKnHtEtcpcd0S1ytx/RI3KHHDEjcqceMSNylx0xI3K3HzErcoccsStypx6xK3KXHbErcrcfsSdyhxxxJ3KnHnEncpcdcSdytx9xL3KHHPEvcqce8S9ylx3xL3K3H/Eg8o8cASDyrx4BIPKfHQEg8r8fASjyjxyBKPKvHoEo8p8dgSjyvx+BJPKPHEEk8q8eQSTynx1BIt8bQSTy/xjBLPLPGsEs8u8ZwSzy3xvBLPL/GCEi8s8aISLy7xkhIvLfGyEi8v8YoSryzxqhKvLvGaEq8tcVBiUGJYYlRiXGJSYlpiVmJeYlFiWWJVYl1iU2JbYldiX+J1JV5f4g0l3ljiTSXeXOItJd5a4m0l3l7iHSXeWeJdJd5d4j0l3lvifSXeX+IDJT5Y4kMlPlziIyU+WuJjJT5e4hMlPlniUyU+XeIzJT5b4nMlPl/iCyW+WOJLJb5c4islvlriayW+XuIbJb5Z4lslvl3iOyW+W+J7Jb5f4gclfljiRyV+XOInJX5a4mclfl7iFyV+WeJXJX5d4jclflvidyV+X+IPJf5Y4k8l/lziLyX+WuJvJf5e4h8l/lniXyX+XeI/Jf5b4n8lDilxaInDShxe4ogSR5Y4qsTRJY4pcWyJ41Q4boXjVTh+hRNUOGGFE1U4cYWTVDhphZNVOHmFU1Q4ZYVTVTh1hdNUOG2F01U4fYUzVDhjhTNVOHOFs1Q4a4WzVTh7hXNUOGeFc1U4d4XzVDhvhfNVOH+FC1S4YIULVbhwhYtUuGiFi1W4eIVLVLhkhUtVuHSFy1RIhctWuFyFy1e4QoUrVrhShStXuEqFq1a4WoWrV7hGhWtWuFaFa1e4ToXrVrhehetXuEGFG1a4UYUbV7hJhZtWuFmFm1e4RYVbVrhVhVtXuE2F21a4XYXbV7hDhTtWuFOFO1e4S4W7VrhbhbtXuEeFe1a4V4V7V7hPhftWuF+F+1d4QIUHVnhQhQdXeEiFh1Z4WIWHV3hEhUdWeFSFR1d4TIXHVnhchcdXeEKFJ1Z4UoUnV3hKhadWaIWnVXh6hWdUeGaFZ1V4doXnVHhuhedVeH6FF1R4YYUXVXhxhZdUeGmFl1V4eYVXVHhlhVdVeHWF11R4bYWDCoMKwwqjCuMKkwrTCrMK8wqLCssKqwrrCpsK2wq7CvsKr6vw+gpvqPDGCm+q8OYKb6nw1gpvq/D2Cu+o8M4K76rw7grvqfDeCu+r8P4KH6jwwQofqvDhCh+p8NEKH6vw8QqfqPDJCp+q8OkKn6nw2Qqfq/D5Cl+o8MUKX6rw5QpfqfDVCl+r8PUK36jwzQrfqvDtCt+p8N0K36vw/Qo/qPDDCj+q8OMKP6nw0wo/q/DzCr+o8MsKv6rw6wq/qfDbCr+r8PsKf6jwxwp/qvDnCn+p8NcKf6vw9wr/qPDPCv+q8O8K/6nw3wr/q3BIhUMrHFbh8ApHVDiywlEVjq5wTIVjKxynxnFrHK/G8WucoMYJa5yoxolrnKTGSWucrMbJa5yixilrnKrGqWucpsZpa5yuxulrnKHGGWucqcaZa5ylxllrnK3G2Wuco8Y5a5yrxrlrnKfGeWucr8b5a1ygxgVrXKjGhWtcpMZFa1ysxsVrXKLGJWtcqsala1ymRmpctsblaly+xhVqXLHGlWpcucZValy1xtVqXL3GNWpcs8a1aly7xnVqXLfG9Wpcv8YNatywxo1q3LjGTWrctMbNaty8xi1q3LLGrWrcusZtaty2xu1q3L7GHWrcscadaty5xl1q3LXG3WrcvcY9atyzxr1q3LvGfWrct8b9aty/xgNqPLDGg2o8uMZDajy0xsNqPLzGI2o8ssajajy6xmNqPLbG42o8vsYTajyxxpNqPLnGU2o8tUZrPK3G02s8o8YzazyrxrNrPKfGc2s8r8bza7ygxgtrvKjGi2u8pMZLa7ysxstrvKLGK2u8qsara7ymxmtrHNQY1BjWGNUY15jUmNaY1ZjXWNRY1ljVWNfY1NjW2NXY13hdjdfXeEONN9Z4U40313hLjbfWeFuNt9d4R4131nhXjXfXeE+N99Z4X4331/hAjQ/W+FCND9f4SI2P1vhYjY/X+ESNT9b4VI1P1/hMjc/W+FyNz9f4Qo0v1vhSjS/X+EqNr9b4Wo2v1/hGjW/W+FaNb9f4To3v1vheje/X+EGNH9b4UY0f1/hJjZ/W+FmNn9f4RY1f1vhVjV/X+E2N39b4XY3f1/hDjT/W+FONP9f4S42/1vhbjb/X+EeNf9b4V41/1/hPjf/W+F+NQ2ocWuOwGofXOKLGkTWOqnF0jWNqHFvjOA2O2+B4DY7f4AQNTtjgRA1O3OAkDU7a4GQNTt7gFA1O2eBUDU7d4DQNTtvgdA1O3+AMDc7Y4EwNztzgLA3O2uBsDc7e4BwNztngXA3O3eA8Dc7b4HwNzt/gAg0u2OBCDS7c4CINLtrgYg0u3uASDS7Z4FINLt3gMg3S4LINLtfg8g2u0OCKDa7U4MoNrtLgqg2u1uDqDa7R4JoNrtXg2g2u0+C6Da7X4PoNbtDghg1u1ODGDW7S4KYNbtbg5g1u0eCWDW7V4NYNbtPgtg1u1+D2De7Q4I4N7tTgzg3u0uCuDe7W4O4N7tHgng3u1eDeDe7T4L4N7tfg/g0e0OCBDR7U4MENHtLgoQ0e1uDhDR7R4JENHtXg0Q0e0+CxDR7X4PENntDgiQ2e1ODJDZ7S4KkN2uBpDZ7e4BkNntngWQ2e3eA5DZ7b4HkNnt/gBQ1e2OBFDV7c4CUNXtrgZQ1e3uAVDV7Z4FUNXt3gNQ1e2+CgwaDBsMGowbjBpMG0wazBvMGiwbLBqsG6wabBtsGuwb7B6xq8vsEbGryxwZsavLnBWxq8tcHbGry9wTsavLPBuxq8u8F7Gry3wfsavL/BBxp8sMGHGny4wUcafLTBxxp8vMEnGnyywacafLrBZxp8tsHnGny+wRcafLHBlxp8ucFXGny1wdcafL3BNxp8s8G3Gny7wXcafLfB9xp8v8EPGvywwY8a/LjBTxr8tMHPGvy8wS8a/LLBrxr8usFvGvy2we8a/L7BHxr8scGfGvy5wV8a/LXB3xr8vcE/Gvyzwb8a/LvBfxr8t8H/GhzS4NAGhzU4vMERDY5scFSDoxsc0+DYBsdpcdwWx2tx/BYnaHHCFidqceIWJ2lx0hYna3HyFqdoccoWp2px6hanaXHaFqdrcfoWZ2hxxhZnanHmFmdpcdYWZ2tx9hbnaHHOFud
qce4W52lx3hbna3H+FhdoccEWF2px4RYXaXHRFhdrcfEWl2hxyRaXanHpFpdpkRaXbXG5FpdvcYUWV2xxpRZXbnGVFldtcbUWV29xjRbXbHGtFtducZ0W121xvRbXb3GDFjdscaMWN25xkxY3bXGzFjdvcYsWt2xxqxa3bnGbFrdtcbsWt29xhxZ3bHGnFnducZcWd21xtxZ3b3GPFvdsca8W925xnxb3bXG/Fvdv8YAWD2zxoBYPbvGQFg9t8bAWD2/xiBaPbPGoFo9u8ZgWj23xuBaPb/GEFk9s8aQWT27xlBZPbdEWT2vx9BbPaPHMFs9q8ewWz2nx3BbPa/H8Fi9o8cIWL2rx4hYvafHSFi9r8fIWr2jxyhavavHqFq9p8doWBy0GLYYtRi3GLSYtpi1mLeYtFi2WLVYt1i02LbYtdi32LV7X4vUt3tDijS3e1OLNLd7S4q0t3tbi7S3e0eKdLd7V4t0t3tPivS3e1+L9LT7Q4oMtPtTiwy0+0uKjLT7W4uMtPtHiky0+1eLTLT7T4rMtPtfi8y2+0OKLLb7U4sstvtLiqy2+1uLrLb7R4pstvtXi2y2+0+K7Lb7X4vstftDihy1+1OLHLX7S4qctftbi5y1+0eKXLX7V4tctftPity1+1+L3Lf7Q4o8t/tTizy3+0uKvLf7W4u8t/tHiny3+1eLfLf7T4r8t/tfikBaHtjisxeEtjmhxZIujWhzd4pgWx7Y4Tofjdjheh+N3OEGHE3Y4UYcTdzhJh5N2OFmHk3c4RYdTdjhVh1N3OE2H03Y4XYfTdzhDhzN2OFOHM3c4S4ezdjhbh7N3OEeHc3Y4V4dzdzhPh/N2OF+H83e4QIcLdrhQhwt3uEiHi3a4WIeLd7hEh0t2uFSHS3e4TId0uGyHy3W4fIcrdLhihyt1uHKHq3S4aoerdbh6h2t0uGaHa3W4dofrdLhuh+t1uH6HG3S4YYcbdbhxh5t0uGmHm3W4eYdbdLhlh1t1uHWH23S4bYfbdbh9hzt0uGOHO3W4c4e7dLhrh7t1uHuHe3S4Z4d7dbh3h/t0uG+H+3W4f4cHdHhghwd1eHCHh3R4aIeHdXh4h0d0eGSHR3V4dIfHdHhsh8d1eHyHJ3R4YocndXhyh6d0eGqHdnhah6d3eEaHZ3Z4Vodnd3hOh+d2eF6H53d4QYcXdnhRhxd3eEmHl3Z4WYeXd3hFh1d2eFWHV3d4TYfXdjjoMOgw7DDqMO4w6TDtMOsw77DosOyw6rDusOmw7bDrsO/wug6v7/CGDm/s8KYOb+7wlg5v7fC2Dm/v8I4O7+zwrg7v7vCeDu/t8L4O7+/wgQ4f7PChDh/u8JEOH+3wsQ4f7/CJDp/s8KkOn+7wmQ6f7fC5Dp/v8IUOX+zwpQ5f7vCVDl/t8LUOX+/wjQ7f7PCtDt/u8J0O3+3wvQ7f7/CDDj/s8KMOP+7wkw4/7fCzDj/v8IsOv+zwqw6/7vCbDr/t8LsOv+/whw5/7PCnDn/u8JcOf+3wtw5/7/CPDv/s8K8O/+7wnw7/7fC/Dod0OLTDYR0O73BEhyM7HNXh6A7HdDi2w3F6HLfH8Xocv8cJepywx4l6nLjHSXqctMfJepy8xyl6nLLHqXqcusdpepy2x+l6nL7HGXqcsceZepy5x1l6nLXH2Xqcvcc5epyzx7l6nLvHeXqct8f5epy/xwV6XLDHhXpcuMdFely0x8V6XLzHJXpcsselely6x2V6pMdle1yux+V7XKHHFXtcqceVe1ylx1V7XK3H1Xtco8c1e1yrx7V7XKfHdXtcr8f1e9ygxw173KjHjXvcpMdNe9ysx8173KLHLXvcqsete9ymx2173K7H7Xvcoccde9ypx5173KXHXXvcrcfde9yjxz173KvHvXvcp8d9e9yvx/17PKDHA3s8qMeDezykx0N7PKzHw3s8oscjezyqx6N7PKbHY3s8rsfjezyhxxN7PKnHk3s8pcdTe7TH03o8vcczejyzx7N6PLvHc3o8t8fzejy/xwt6vLDHi3q8uMdLery0x8t6vLzHK3q8sserery6x2t6vLbHQY9Bj2GPUY9xj0mPaY9Zj3mPRY9lj1WPdY9Nj22PXY99j9f1eH2PN/R4Y4839Xhzj7f0eGuPt/V4e4939Hhnj3f1eHeP9/R4b4//EwAPALQVAABDs23bXrZt27a9bNvGuzaybdu29Y3OfQO8f4APDPDBAT40wIcH+MgAHx3gYwN8fIBPDPDJAT41wKcH+MwAnx3gcwN8foAvDPDFAb40wJcH+MoAXx3gawN8fYBvDPDNAb41wLcH+M4A3x3gewN8f4AfDPDDAX40wI8H+MkAPx3gZwP8fIBfDPDLAX41wK8H+M0Avx3gdwP8foA/DPDHAf40wJ8H+MsAfx3gbwP8fYB/DPDPAf41wL8H+M8A/x3gfwMcMsChAxw2wOEDHDHAkQMcNcDRAxwzwLEDHCfAcQMcL8DxA5wgwAkDnCjAiQOcJMBJA5wswMkDnCLAKQOcKsCpA5wmwGkDnC7A6QOcIcAZA5wpwJkDnCXAWQOcLcDZA5wjwDkDnCvAuQOcJ8B5A5wvwPkDXCDABQNcKMCFA1wkwEUDXCzAxQNcIsAlA1wqwKUDXCZAAlw2wOUCXD7AFQJcMcCVAlw5wFUCXDXA1QJcPcA1AlwzwLUCXDvAdQJcN8D1Alw/wA0C3DDAjQLcOMBNAtw0wM0C3DzALQLcMsCtAtw6wG0C3DbA7QLcPsAdAtwxwJ0C3DnAXQLcNcDdAtw9wD0C3DPAvQLcO8B9Atw3wP0C3D/AAwI8MMCDAjw4wEMCPDTAwwI8PMAjAjwywKMCPDrAYwI8NsDjAjw+wBMCPDHAkwI8OcBTAjw1QAM8LcDTAzwjwDMDPCvAswM8J8BzAzwvwPMDvCDACwO8KMCLA7wkwEsDvCzAywO8IsArA7wqwKsDvCbAawO8LsDrA7whwBsDvCnAmwO8JcBbA7wtwNsDvCPAOwO8K8C7A7wnwHsDHAQYBBgGGAUYB5gEmAaYBZgHWARYBlgFWAfYBNgG2AXYB3hfgPcH+ECADwb4UIAPB/hIgI8G+FiAjwf4RIBPBvhUgE8H+EyAzwb4XIDPB/hCgC8G+FKALwf4SoCvBvhagK8H+EaAbwb4VoBvB/hOgO8G+F6A7wf4QYAfBvhRgB8H+EmAnwb4WYCfB/hFgF8G+FWAXwf4TYDfBvhdgN8H+EOAPwb4U4A/B/hLgL8G+FuAvwf4R4B/BvhXgH8H+E+A/wb4X4BDAhwa4LAAhwc4IsCRAY4KcHSAYwIcG+A4IY4b4nghjh/iBCFOGOJEIU4c4iQhThriZCFOHuIUIU4Z4lQhTh3iNCFOG+J0IU4f4gwhzhjiTCHOHOIsIc4a4mwhzh7iHCHOGeJcIc4d4jwhzhvifCHOH+ICIS4Y4kIhLhziIiEuGuJiIS4e4hIhLhniUiEuHeIyIRLisiEuF+LyIa4Q4oohrhTiyiGuEuKqIa4W4uohrhHimiGuFeLaIa4T4rohrhfi+iFuEOKGIW4U4sYhbhLipiFuFu
LmIW4R4pYhbhXi1iFuE+K2IW4X4vYh7hDijiHuFOLOIe4S4q4h7hbi7iHuEeKeIe4V4t4h7hPiviHuF+L+IR4Q4oEhHhTiwSEeEuKhIR4W4uEhHhHikSEeFeLRIR4T4rEhHhfi8SGeEOKJIZ4U4skhnhLiqSEa4mkhnh7iGSGeGeJZIZ4d4jkhnhvieSGeH+IFIV4Y4kUhXhziJSFeGuJlIV4e4hUhXhniVSFeHeI1IV4b4nUhXh/iDSHeGOJNId4c4i0h3hribSHeHuIdId4Z4l0h3h3iPSHeG+IgxCDEMMQoxDjEJMQ0xCzEPMQixDLEKsQ6xCbENsQuxD7E+0K8P8QHQnwwxIdCfDjER0J8NMTHQnw8xCdCfDLEp0J8OsRnQnw2xOdCfD7EF0J8McSXQnw5xFdCfDXE10J8PcQ3QnwzxLdCfDvEd0J8N8T3Qnw/xA9C/DDEj0L8OMRPQvw0xM9C/DzEL0L8MsSvQvw6xG9C/DbE70L8PsQfQvwxxJ9C/DnEX0L8NcTfQvw9xD9C/DPEv0L8O8R/Qvw3xP9CHBLi0BCHhTg8xBEhjgxxVIijQxwT4tgQx4lw3AjHi3D8CCeIcMIIJ4pw4ggniXDSCCeLcPIIp4hwygininDqCKeJcNoIp4tw+ghniHDGCGeKcOYIZ4lw1ghni3D2COeIcM4I54pw7gjniXDeCOeLcP4IF4hwwQgXinDhCBeJcNEIF4tw8QiXiHDJCJeKcOkIl4mQCJeNcLkIl49whQhXjHClCFeOcJUIV41wtQhXj3CNCNeMcK0I145wnQjXjXC9CNePcIMIN4xwowg3jnCTCDeNcLMIN49wiwi3jHCrCLeOcJsIt41wuwi3j3CHCHeMcKcId45wlwh3jXC3CHePcI8I94xwrwj3jnCfCPeNcL8I94/wgAgPjPCgCA+O8JAID43wsAgPj/CICI+M8KgIj47wmAiPjfC4CI+P8IQIT4zwpAhPjvCUCE+N0AhPi/D0CM+I8MwIz4rw7AjPifDcCM+L8PwIL4jwwggvivDiCC+J8NIIL4vw8giviPDKCK+K8OoIr4nw2givi/D6CG+I8MYIb4rw5ghvifDWCG+L8PYI74jwzgjvivDuCO+J8N4IBxEGEYYRRhHGESYRphFmEeYRFhGWEVYR1hE2EbYRdhH2Ed4X4f0RPhDhgxE+FOHDET4S4aMRPhbh4xE+EeGTET4V4dMRPhPhsxE+F+HzEb4Q4YsRvhThyxG+EuGrEb4W4esRvhHhmxG+FeHbEb4T4bsRvhfh+xF+EOGHEX4U4ccRfhLhpxF+FuHnEX4R4ZcRfhXh1xF+E+G3EX4X4fcR/hDhjxH+FOHPEf4S4a8R/hbh7xH+EeGfEf4V4d8R/hPhvxH+F+GQCIdGOCzC4RGOiHBkhKMiHB3hmAjHRjhOjOPGOF6M48c4QYwTxjhRjBPHOEmMk8Y4WYyTxzhFjFPGOFWMU8c4TYzTxjhdjNPHOEOMM8Y4U4wzxzhLjLPGOFuMs8c4R4xzxjhXjHPHOE+M88Y4X4zzx7hAjAvGuFCMC8e4SIyLxrhYjIvHuESMS8a4VIxLx7hMjMS4bIzLxbh8jCvEuGKMK8W4coyrxLhqjKvFuHqMa8S4Zoxrxbh2jOvEuG6M68W4fowbxLhhjBvFuHGMm8S4aYybxbh5jFvEuGWMW8W4dYzbxLhtjNvFuH2MO8S4Y4w7xbhzjLvEuGuMu8W4e4x7xLhnjHvFuHeM+8S4b4z7xbh/jAfEeGCMB8V4cIyHxHhojIfFeHiMR8R4ZIxHxXh0jMfEeGyMx8V4fIwnxHhijCfFeHKMp8R4aozGeFqMp8d4RoxnxnhWjGfHeE6M58Z4Xoznx3hBjBfGeFGMF8d4SYyXxnhZjJfHeEWMV8Z4VYxXx3hNjNfGeF2M18d4Q4w3xnhTjDfHeEuMt8Z4W4y3x3hHjHfGeFeMd8d4T4z3xjiIMYgxjDGKMY4xiTGNMYsxj7GIsYyxirGOsYmxjbGLsY/xvhjvj/GBGB+M8aEYH47xkRgfjfGxGB+P8YkYn4zxqRifjvGZGJ+N8bkYn4/xhRhfjPGlGF+O8ZUYX43xtRhfj/GNGN+M8a0Y347xnRjfjfG9GN+P8YMYP4zxoxg/jvGTGD+N8bMYP4/xixi/jPGrGL+O8ZsYv43xuxi/j/GHGH+M8acYf47xlxh/jfG3GH+P8Y8Y/4zxrxj/jvGfGP+N8b8Yh8Q4NMZhMQ6PcUSMI2McFePoGMfEODbGcRIcN8HxEhw/wQkSnDDBiRKcOMFJEpw0wckSnDzBKRKcMsGpEpw6wWkSnDbB6RKcPsEZEpwxwZkSnDnBWRKcNcHZEpw9wTkSnDPBuRKcO8F5Epw3wfkSnD/BBRJcMMGFElw4wUUSXDTBxRJcPMElElwywaUSXDrBZRIkwWUTXC7B5RNcIcEVE1wpwZUTXCXBVRNcLcHVE1wjwTUTXCvBtRNcJ8F1E1wvwfUT3CDBDRPcKMGNE9wkwU0T3CzBzRPcIsEtE9wqwa0T3CbBbRPcLsHtE9whwR0T3CnBnRPcJcFdE9wtwd0T3CPBPRPcK8G9E9wnwX0T3C/B/RM8IMEDEzwowYMTPCTBQxM8LMHDEzwiwSMTPCrBoxM8JsFjEzwuweMTPCHBExM8KcGTEzwlwVMTNMHTEjw9wTMSPDPBsxI8O8FzEjw3wfMSPD/BCxK8MMGLErw4wUsSvDTByxK8PMErErwywasSvDrBaxK8NsHrErw+wRsSvDHBmxK8OcFbErw1wdsSvD3BOxK8M8G7Erw7wXsSvDfBQYJBgmGCUYJxgkmCaYJZgnmCRYJlglWCdYJNgm2CXYJ9gvcleH+CDyT4YIIPJfhwgo8k+GiCjyX4eIJPJPhkgk8l+HSCzyT4bILPJfh8gi8k+GKCLyX4coKvJPhqgq8l+HqCbyT4ZoJvJfh2gu8k+G6C7yX4foIfJPhhgh8l+HGCnyT4aYKfJfh5gl8k+GWCXyX4dYLfJPhtgt8l+H2CPyT4Y4I/Jfhzgr8k+GuCvyX4e4J/JPhngn8l+HeC/yT4b4L/JTgkwaEJDktweIIjEhyZ4KgERyc4JsGxCY6T4rgpjpfi+ClOkOKEKU6U4sQpTpLipClOluLkKU6R4pQpTpXi1ClOk+K0KU6X4vQpzpDijCnOlOLMKc6S4qwpzpbi7CnOkeKcKc6V4twpzpPivCnOl+L8KS6Q4oIpLpTiwikukuKiKS6W4uIpLpHikikuleLSKS6TIikum+JyKS6f4goprpjiSimunOIqKa6a4moprp7iGimumeJaKa6d4joprpvieimun+IGKW6Y4kYpbpziJilumuJmKW6e4hYpbpniViluneI2KW6b4nYpbp/iDinumOJOKe6c4i4p7pribinunuIeKe6Z4l4p7p3iPinum+J+Ke6f4gEpHpjiQSkenOIhKR6a4mEpHp7iESkemeJRKR6d4jEpHpvicSken+IJKZ6Y4kkpnpziKSmemqIpnpbi6SmekeKZKZ6V4tkpnpPiuSmel+L5KV6Q4oUpXpTix
SlekuKlKV6W4uUpXpHilSleleLVKV6T4rUpXpfi9SnekOKNKd6U4s0p3pLirSneluLtKd6R4p0p3pXi3Snek+K9KQ5SDFIMU4xSjFNMUkxTzFLMUyxSLFOsUqxTbFJsU+xS7FO8L8X7U3wgxQdTfCjFh1N8JMVHU3wsxcdTfCLFJ1N8KsWnU3wmxWdTfC7F51N8IcUXU3wpxZdTfCXFV1N8LcXXU3wjxTdTfCvFt1N8J8V3U3wvxfdT/CDFD1P8KMWPU/wkxU9T/CzFz1P8IsUvU/wqxa9T/CbFb1P8LsXvU/whxR9T/CnFn1P8JcVfU/wtxd9T/CPFP1P8K8W/U/wnxX9T/C/FISkOTXFYisNTHJHiyBRHpTg6xTEpjk1xnAzHzXC8DMfPcIIMJ8xwogwnznCSDCfNcLIMJ89wigynzHCqDKfOcJoMp81wugynz3CGDGfMcKYMZ85wlgxnzXC2DGfPcI4M58xwrgznznCeDOfNcL4M589wgQwXzHChDBfOcJEMF81wsQwXz3CJDJfMcKkMl85wmQzJcNkMl8tw+QxXyHDFDFfKcOUMV8lw1QxXy3D1DNfIcM0M18pw7QzXyXDdDNfLcP0MN8hwwww3ynDjDDfJcNMMN8tw8wy3yHDLDLfKcOsMt8lw2wy3y3D7DHfIcMcMd8pw5wx3yXDXDHfLcPcM98hwzwz3ynDvDPfJcN8M98tw/wwPyPDADA/K8OAMD8nw0AwPy/DwDI/I8MgMj8rw6AyPyfDYDI/L8PgMT8jwxAxPyvDkDE/J8NQMzfC0DE/P8IwMz8zwrAzPzvCcDM/N8LwMz8/wggwvzPCiDC/O8JIML83wsgwvz/CKDK/M8KoMr87wmgyvzfC6DK/P8IYMb8zwpgxvzvCWDG/N8LYMb8/wjgzvzPCuDO/O8J4M781wkGGQYZhhlGGcYZJhmmGWYZ5hkWGZYZVhnWGTYZthl2Gf4X0Z3p/hAxk+mOFDGT6c4SMZPprhYxk+nuETGT6Z4VMZPp3hMxk+m+FzGT6f4QsZvpjhSxm+nOErGb6a4WsZvp7hGxm+meFbGb6d4TsZvpvhexm+n+EHGX6Y4UcZfpzhJxl+muFnGX6e4RcZfpnhVxl+neE3GX6b4XcZfp/hDxn+mOFPGf6c4S8Z/prhbxn+nuEfGf6Z4V8Z/p3hPxn+m+F/GQ7JcGiGwzIcnuGIDEdmOCrD0RmOyXBshuPkOG6O4+U4fo4T5DhhjhPlOHGOk+Q4aY6T5Th5jlPkOGWOU+U4dY7T5DhtjtPlOH2OM+Q4Y44z5ThzjrPkOGuOs+U4e45z5DhnjnPlOHeO8+Q4b47z5Th/jgvkuGCOC+W4cI6L5LhojovluHiOS+S4ZI5L5bh0jsvkSI7L5rhcjsvnuEKOK+a4Uo4r57hKjqvmuFqOq+e4Ro5r5rhWjmvnuE6O6+a4Xo7r57hBjhvmuFGOG+e4SY6b5rhZjpvnuEWOW+a4VY5b57hNjtvmuF2O2+e4Q4475rhTjjvnuEuOu+a4W46757hHjnvmuFeOe+e4T4775rhfjvvneECOB+Z4UI4H53hIjofmeFiOh+d4RI5H5nhUjkfneEyOx+Z4XI7H53hCjifmeFKOJ+d4So6n5miOp+V4eo5n5HhmjmfleHaO5+R4bo7n5Xh+jhfkeGGOF+V4cY6X5HhpjpfleHmOV+R4ZY5X5Xh1jtfkeG2O1+V4fY435HhjjjfleHOOt+R4a4635Xh7jnfkeGeOd+V4d4735HhvjoMcgxzDHKMc4xyTHNMcsxzzHIscyxyrHOscmxzbHLsc+xzvy/H+HB/I8cEcH8rx4RwfyfHRHB/L8fEcn8jxyRyfyvHpHJ/J8dkcn8vx+RxfyPHFHF/K8eUcX8nx1Rxfy/H1HN/I8c0c38rx7RzfyfHdHN/L8f0cP8jxwxw/yvHjHD/J8dMcP8vx8xy/yPHLHL/K8escv8nx2xy/y/H7HH/I8cccf8rx5xx/yfHXHH/L8fcc/8jxzxz/yvHvHP/J8d8c/8txSI5DcxyW4/AcR+Q4MsdROY7OcUyOY3Mcp8BxCxyvwPELnKDACQucqMCJC5ykwEkLnKzAyQucosApC5yqwKkLnKbAaQucrsDpC5yhwBkLnKnAmQucpcBZC5ytwNkLnKPAOQucq8C5C5ynwHkLnK/A+QtcoMAFC1yowIULXKTARQtcrMDFC1yiwCULXKrApQtcpkAKXLbA5QpcvsAVClyxwJUKXLnAVQpctcDVCly9wDUKXLPAtQpcu8B1Cly3wPUKXL/ADQrcsMCNCty4wE0K3LTAzQrcvMAtCtyywK0K3LrAbQrctsDtCty+wB0K3LHAnQrcucBdCty1wN0K3L3APQrcs8C9Cty7wH0K3LfA/Qrcv8ADCjywwIMKPLjAQwo8tMDDCjy8wCMKPLLAowo8usBjCjy2wOMKPL7AEwo8scCTCjy5wFMKPLVACzytwNMLPKPAMws8q8CzCzynwHMLPK/A8wu8oMALC7yowIsLvKTASwu8rMDLC7yiwCsLvKrAqwu8psBrC7yuwOsLvKHAGwu8qcCbC7ylwFsLvK3A2wu8o8A7C7yrwLsLvKfAewscFBgUGBYYFRgXmBSYFpgVmBdYFFgWWBVYF9gU2BbYFdgXeF+B9xf4QIEPFvhQgQ8X+EiBjxb4WIGPF/hEgU8W+FSBTxf4TIHPFvhcgc8X+EKBLxb4UoEvF/hKga8W+FqBrxf4RoFvFvhWgW8X+E6B7xb4XoHvF/hBgR8W+FGBHxf4SYGfFvhZgZ8X+EWBXxb4VYFfF/hNgd8W+F2B3xf4Q4E/FvhTgT8X+EuBvxb4W4G/F/hHgX8W+FeBfxf4T4H/FvhfgUMKHFrgsAKHFziiwJEFjipwdIFjChxb4DgljlvieCWOX+IEJU5Y4kQlTlziJCVOWuJkJU5e4hQlTlniVCVOXeI0JU5b4nQlTl/iDCXOWOJMJc5c4iwlzlribCXOXuIcJc5Z4lwlzl3iPCXOW+J8Jc5f4gIlLljiQiUuXOIiJS5a4mIlLl7iEiUuWeJSJS5d4jIlUuKyJS5X4vIlrlDiiiWuVOLKJa5S4qolrlbi6iWuUeKaJa5V4tolrlPiuiWuV+L6JW5Q4oYlblTixiVuUuKmJW5W4uYlblHiliVuVeLWJW5T4rYlblfi9iXuUOKOJe5U4s4l7lLiriXuVuLuJe5R4p4l7lXi3iXuU+K+Je5X4v4lHlDigSUeVOLBJR5S4qElHlbi4SUeUeKRJR5V4tElHlPisSUeV+LxJZ5Q4oklnlTiySWeUuKpJVriaSWeXuIZJZ5Z4lklnl3iOSWeW+J5JZ5f4gUlXljiRSVeXOIlJV5a4mUlXl7iFSVeWeJVJV5d4jUlXlvidSVeX+INJd5Y4k0l3lziLSXeWuJtJd5e4h0l3lniXSXeXeI9Jd5b4qDEoMSwxKjEuMSkxLTErMS8xKLEssSqxLrEpsS2xK7EvsT7Sry/xAdKfLDEh0p8uMRHSny0xMdKfLzEJ0p8ssSnSny6xGdKfLbE50p8vsQXSnyxxJdK
fLnEV0p8tcTXSny9xDdKfLPEt0p8u8R3Sny3xPdKfL/ED0r8sMSPSvy4xE9K/LTEz0r8vMQvSvyyxK9K/LrEb0r8tsTvSvy+xB9K/LHEn0r8ucRfSvy1xN9K/L3EP0r8s8S/Svy7xH9K/LfE/0ocUuLQEoeVOLzEESWOLHFUiaNLHFPi2BLHqXDcCsercPwKJ6hwwgonqnDiCiepcNIKJ6tw8gqnqHDKCqeqcOoKp6lw2gqnq3D6CmeocMYKZ6pw5gpnqXDWCmercPYK56hwzgrnqnDuCuepcN4K56tw/goXqHDBCheqcOEKF6lw0QoXq3DxCpeocMkKl6pw6QqXqZAKl61wuQqXr3CFClescKUKV65wlQpXrXC1ClevcI0K16xwrQrXrnCdCtetcL0K169wgwo3rHCjCjeucJMKN61wswo3r3CLCrescKsKt65wmwq3rXC7CrevcIcKd6xwpwp3rnCXCnetcLcKd69wjwr3rHCvCveucJ8K961wvwr3r/CACg+s8KAKD67wkAoPrfCwCg+v8IgKj6zwqAqPrvCYCo+t8LgKj6/whApPrPCkCk+u8JQKT63QCk+r8PQKz6jwzArPqvDsCs+p8NwKz6vw/AovqPDCCi+q8OIKL6nw0govq/DyCq+o8MoKr6rw6gqvqfDaCq+r8PoKb6jwxgpvqvDmCm+p8NYKb6vw9grvqPDOCu+q8O4K76nw3goHFQYVhhVGFcYVJhWmFWYV5hUWFZYVVhXWFTYVthV2FfYV3lfh/RU+UOGDFT5U4cMVPlLhoxU+VuHjFT5R4ZMVPlXh0xU+U+GzFT5X4fMVvlDhixW+VOHLFb5S4asVvlbh6xW+UeGbFb5V4dsVvlPhuxW+V+H7FX5Q4YcVflThxxV+UuGnFX5W4ecVflHhlxV+VeHXFX5T4bcVflfh9xX+UOGPFf5U4c8V/lLhrxX+VuHvFf5R4Z8V/lXh3xX+U+G/Ff5X4ZAKh1Y4rMLhFY6ocGSFoyocXeGYCsdWOE6N49Y4Xo3j1zhBjRPWOFGNE9c4SY2T1jhZjZPXOEWNU9Y4VY1T1zhNjdPWOF2N09c4Q40z1jhTjTPXOEuNs9Y4W42z1zhHjXPWOFeNc9c4T43z1jhfjfPXuECNC9a4UI0L17hIjYvWuFiNi9e4RI1L1rhUjUvXuEyN1LhsjcvVuHyNK9S4Yo0r1bhyjavUuGqNq9W4eo1r1LhmjWvVuHaN69S4bo3r1bh+jRvUuGGNG9W4cY2b1LhpjZvVuHmNW9S4ZY1b1bh1jdvUuG2N29W4fY071LhjjTvVuHONu9S4a4271bh7jXvUuGeNe9W4d4371LhvjfvVuH+NB9R4YI0H1XhwjYfUeGiNh9V4eI1H1HhkjUfVeHSNx9R4bI3H1Xh8jSfUeGKNJ9V4co2n1HhqjdZ4Wo2n13hGjWfWeFaNZ9d4To3n1nhejefXeEGNF9Z4UY0X13hJjZfWeFmNl9d4RY1X1nhVjVfXeE2N19Z4XY3X13hDjTfWeFONN9d4S4231nhbjbfXeEeNd9Z4V41313hPjffWOKgxqDGsMaoxrjGpMa0xqzGvsaixrLGqsa6xqbGtsauxr/G+Gu+v8YEaH6zxoRofrvGRGh+t8bEaH6/xiRqfrPGpGp+u8Zkan63xuRqfr/GFGl+s8aUaX67xlRpfrfG1Gl+v8Y0a36zxrRrfrvGdGt+t8b0a36/xgxo/rPGjGj+u8ZMaP63xsxo/r/GLGr+s8asav67xmxq/rfG7Gr+v8Ycaf6zxpxp/rvGXGn+t8bcaf6/xjxr/rPGvGv+u8Z8a/63xvxqH1Di0xmE1Dq9xRI0jaxxV4+gax9Q4tsZxGhy3wfEaHL/BCRqcsMGJGpy4wUkanLTByRqcvMEpGpyywakanLrBaRqctsHpGpy+wRkanLHBmRqcucFZGpy1wdkanL3BORqcs8G5Gpy7wXkanLfB+Rqcv8EFGlywwYUaXLjBRRpctMHFGly8wSUaXLLBpRpcusFlGqTBZRtcrsHlG1yhwRUbXKnBlRtcpcFVG1ytwdUbXKPBNRtcq8G1G1ynwXUbXK/B9RvcoMENG9yowY0b3KTBTRvcrMHNG9yiwS0b3KrBrRvcpsFtG9yuwe0b3KHBHRvcqcGdG9ylwV0b3K3B3Rvco8E9G9yrwb0b3KfBfRvcr8H9GzygwQMbPKjBgxs8pMFDGzyswcMbPKLBIxs8qsGjGzymwWMbPK7B4xs8ocETGzypwZMbPKXBUxu0wdMaPL3BMxo8s8GzGjy7wXMaPLfB8xo8v8ELGrywwYsavLjBSxq8tMHLGry8wSsavLLBqxq8usFrGry2wesavL7BGxq8scGbGry5wVsavLXB2xq8vcE7GryzwbsavLvBexq8t8FBg0GDYYNRg3GDSYNpg1mDeYNFg2WDVYN1g02DbYNdg32D9zV4f4MPNPhggw81+HCDjzT4aIOPNfh4g080+GSDTzX4dIPPNPhsg881+HyDLzT4YoMvNfhyg680+GqDrzX4eoNvNPhmg281+HaD7zT4boPvNfh+gx80+GGDHzX4cYOfNPhpg581+HmDXzT4ZYNfNfh1g980+G2D3zX4fYM/NPhjgz81+HODvzT4a4O/Nfh7g380+GeDfzX4d4P/NPhvg/81OKTBoQ0Oa3B4gyMaHNngqAZHNzimwbENjtPiuC2O1+L4LU7Q4oQtTtTixC1O0uKkLU7W4uQtTtHilC1O1eLULU7T4rQtTtfi9C3O0OKMLc7U4swtztLirC3O1uLsLc7R4pwtztXi3C3O0+K8Lc7X4vwtLtDigi0u1OLCLS7S4qItLtbi4i0u0eKSLS7V4tItLtMiLS7b4nItLt/iCi2u2OJKLa7c4iotrtriai2u3uIaLa7Z4lotrt3iOi2u2+J6La7f4gYtbtjiRi1u3OImLW7a4mYtbt7iFi1u2eJWLW7d4jYtbtvidi1u3+IOLe7Y4k4t7tziLi3u2uJuLe7e4h4t7tniXi3u3eI+Le7b4n4t7t/iAS0e2OJBLR7c4iEtHtriYS0e3uIRLR7Z4lEtHt3iMS0e2+JxLR7f4gktntjiSS2e3OIpLZ7aoi2e1uLpLZ7R4pktntXi2S2e0+K5LZ7X4vktXtDihS1e1OLFLV7S4qUtXtbi5S1e0eKVLV7V4tUtXtPitS1e1+L1Ld7Q4o0t3tTizS3e0uKtLd7W4u0t3tHinS3e1eLdLd7T4r0tDloMWgxbjFqMW0xaTFvMWsxbLFosW6xarFtsWmxb7FrsW7yvxftbfKDFB1t8qMWHW3ykxUdbfKzFx1t8osUnW3yqxadbfKbFZ1t8rsXnW3yhxRdbfKnFl1t8pcVXW3ytxddbfKPFN1t8q8W3W3ynxXdbfK/F91v8oMUPW/yoxY9b/KTFT1v8rMXPW/yixS9b/KrFr1v8psVvW/yuxe9b/KHFH1v8qcWfW/ylxV9b/K3F31v8o8U/W/yrxb9b/KfFf1v8r8UhLQ5tcViLw1sc0eLIFke1OLrFMS2ObXGcDsftcLwOx+9wgg4n7HCiDif
ucJIOJ+1wsg4n73CKDqfscKoOp+5wmg6n7XC6DqfvcIYOZ+xwpg5n7nCWDmftcLYOZ+9wjg7n7HCuDufucJ4O5+1wvg7n73CBDhfscKEOF+5wkQ4X7XCxDhfvcIkOl+xwqQ6X7nCZDulw2Q6X63D5DlfocMUOV+pw5Q5X6XDVDlfrcPUO1+hwzQ7X6nDtDtfpcN0O1+tw/Q436HDDDjfqcOMON+lw0w4363DzDrfocMsOt+pw6w636XDbDrfrcPsOd+hwxw536nDnDnfpcNcOd+tw9w736HDPDvfqcO8O9+lw3w7363D/Dg/o8MAOD+rw4A4P6fDQDg/r8PAOj+jwyA6P6vDoDo/p8NgOj+vw+A5P6PDEDk/q8OQOT+nw1A7t8LQOT+/wjA7P7PCsDs/u8JwOz+3wvA7P7/CCDi/s8KIOL+7wkg4v7fCyDi/v8IoOr+zwqg6v7vCaDq/t8LoOr+/whg5v7PCmDm/u8JYOb+3wtg5v7/CODu/s8K4O7+7wng7v7XDQYdBh2GHUYdxh0mHaYdZh3mHRYdlh1WHdYdNh22HXYd/hfR3e3+EDHT7Y4UMdPtzhIx0+2uFjHT7e4RMdPtnhUx0+3eEzHT7b4XMdPt/hCx2+2OFLHb7c4Ssdvtrhax2+3uEbHb7Z4Vsdvt3hOx2+2+F7Hb7f4QcdftjhRx1+3OEnHX7a4Wcdft7hFx1+2eFXHX7d4Tcdftvhdx1+3+EPHf7Y4U8d/tzhLx3+2uFvHf7e4R8d/tnhXx3+3eE/Hf7b4X8dDulwaIfDOhze4YgOR3Y4qsPRHY7pcGyH4/Q4bo/j9Th+jxP0OGGPE/U4cY+T9Dhpj5P1OHmPU/Q4ZY9T9Th1j9P0OG2P0/U4fY8z9DhjjzP1OHOPs/Q4a4+z9Th7j3P0OGePc/U4d4/z9Dhvj/P1OH+PC/S4YI8L9bhwj4v0uGiPi/W4eI9L9Lhkj0v1uHSPy/RIj8v2uFyPy/e4Qo8r9rhSjyv3uEqPq/a4Wo+r97hGj2v2uFaPa/e4To/r9rhej+v3uEGPG/a4UY8b97hJj5v2uFmPm/e4RY9b9rhVj1v3uE2P2/a4XY/b97hDjzv2uFOPO/e4S4+79rhbj7v3uEePe/a4V49797hPj/v2uF+P+/d4QI8H9nhQjwf3eEiPh/Z4WI+H93hEj0f2eFSPR/d4TI/H9nhcj8f3eEKPJ/Z4Uo8n93hKj6f2aI+n9Xh6j2f0eGaPZ/V4do/n9Hhuj+f1eH6PF/R4YY8X9Xhxj5f0eGmPl/V4eY9X9Hhlj1f1eHWP1/R4bY/X9Xh9jzf0eGOPN/V4c4+39Hhrj7f1eHuPd/R4Z4939Xh3j/f0eG+Pgx6DHsMeox7jHpMe0x6zHvMeix7LHqse6x6bHtseux77Hv8nAB4AaCsAAIZm27a9bNu2bXvZtm27d21k27b9jU4wwHCA0QDjASYDTAeYDTAfYDHAcoDVAOsBNgNsB9gNsB/gcwN8foAvDPDFAb40wJcH+MoAXx3gawN8fYBvDPDNAb41wLcH+M4A3x3gewN8f4AfDPDDAX40wI8H+MkAPx3gZwP8fIBfDPDLAX41wK8H+M0Avx3gdwP8foA/DPDHAf40wJ8H+MsAfx3gbwP8fYB/DPDPAf41wL8H+M8A/x3gfwMcMsChAxw2wOEDHDHAkQMcNcDRAxwzwLEDHCfAcQMcL8DxA5wgwAkDnCjAiQOcJMBJA5wswMkDnCLAKQOcKsCpA5wmwGkDnC7A6QOcIcAZA5wpwJkDnCXAWQOcLcDZA5wjwDkDnCvAuQOcJ8B5A5wvwPkDXCDABQNcKMCFA1wkwEUDXCzAxQNcIsAlA1wqwKUDXCZAAlw2wOUCXD7AFQJcMcCVAlw5wFUCXDXA1QJcPcA1AlwzwLUCXDvAdQJcN8D1Alw/wA0C3DDAjQLcOMBNAtw0wM0C3DzALQLcMsCtAtw6wG0C3DbA7QLcPsAdAtwxwJ0C3DnAXQLcNcDdAtw9wD0C3DPAvQLcO8B9Atw3wP0C3D/AAwI8MMCDAjw4wEMCPDTAwwI8PMAjAjwywKMCPDrAYwI8NsDjAjw+wBMCPDHAkwI8OcBTAjw1QAM8LcDTAzwjwDMDPCvAswM8J8BzAzwvwPMDvCDACwO8KMCLA7wkwEsDvCzAywO8IsArA7wqwKsDvCbAawO8LsDrA7whwBsDvCnAmwO8JcBbA7wtwNsDvCPAOwO8K8C7A7wnwHsDvC/A+wN8IMAHA3wowIcDfCTARwN8LMDHA3wiwCcDfCrApwN8JsBnAxwEGAQYBhgFGAeYBJgGmAWYB1gEWAZYBVgH2ATYBtgF2Af4XIDPB/hCgC8G+FKALwf4SoCvBvhagK8H+EaAbwb4VoBvB/hOgO8G+F6A7wf4QYAfBvhRgB8H+EmAnwb4WYCfB/hFgF8G+FWAXwf4TYDfBvhdgN8H+EOAPwb4U4A/B/hLgL8G+FuAvwf4R4B/BvhXgH8H+E+A/wb4X4BDAhwa4LAAhwc4IsCRAY4KcHSAYwIcG+A4IY4b4nghjh/iBCFOGOJEIU4c4iQhThriZCFOHuIUIU4Z4lQhTh3iNCFOG+J0IU4f4gwhzhjiTCHOHOIsIc4a4mwhzh7iHCHOGeJcIc4d4jwhzhvifCHOH+ICIS4Y4kIhLhziIiEuGuJiIS4e4hIhLhniUiEuHeIyIRLisiEuF+LyIa4Q4oohrhTiyiGuEuKqIa4W4uohrhHimiGuFeLaIa4T4rohrhfi+iFuEOKGIW4U4sYhbhLipiFuFuLmIW4R4pYhbhXi1iFuE+K2IW4X4vYh7hDijiHuFOLOIe4S4q4h7hbi7iHuEeKeIe4V4t4h7hPiviHuF+L+IR4Q4oEhHhTiwSEeEuKhIR4W4uEhHhHikSEeFeLRIR4T4rEhHhfi8SGeEOKJIZ4U4skhnhLiqSEa4mkhnh7iGSGeGeJZIZ4d4jkhnhvieSGeH+IFIV4Y4kUhXhziJSFeGuJlIV4e4hUhXhniVSFeHeI1IV4b4nUhXh/iDSHeGOJNId4c4i0h3hribSHeHuIdId4Z4l0h3h3iPSHeG+J9Id4f4gMhPhjiQyE+HOIjIT4a4mMhPh7iEyE+GeJTIT4d4jMhPhviIMQgxDDEKMQ4xCTENMQsxDzEIsQyxCrEOsQmxDbELsQ+xOdCfD7EF0J8McSXQnw5xFdCfDXE10J8PcQ3QnwzxLdCfDvEd0J8N8T3Qnw/xA9C/DDEj0L8OMRPQvw0xM9C/DzEL0L8MsSvQvw6xG9C/DbE70L8PsQfQvwxxJ9C/DnEX0L8NcTfQvw9xD9C/DPEv0L8O8R/Qvw3xP9CHBLi0BCHhTg8xBEhjgxxVIijQxwT4tgQx4lw3AjHi3D8CCeIcMIIJ4pw4ggniXDSCCeLcPIIp4hwygininDqCKeJcNoIp4tw+ghniHDGCGeKcOYIZ4lw1ghni3D2COeIcM4I54pw7gjniXDeCOeLcP4IF4hwwQgXinDhCBeJcNEIF4tw8QiXiHDJCJeKcOkIl4mQCJeNcLkIl49whQhXjHClCF
eOcJUIV41wtQhXj3CNCNeMcK0I145wnQjXjXC9CNePcIMIN4xwowg3jnCTCDeNcLMIN49wiwi3jHCrCLeOcJsIt41wuwi3j3CHCHeMcKcId45wlwh3jXC3CHePcI8I94xwrwj3jnCfCPeNcL8I94/wgAgPjPCgCA+O8JAID43wsAgPj/CICI+M8KgIj47wmAiPjfC4CI+P8IQIT4zwpAhPjvCUCE+N0AhPi/D0CM+I8MwIz4rw7AjPifDcCM+L8PwIL4jwwggvivDiCC+J8NIIL4vw8giviPDKCK+K8OoIr4nw2givi/D6CG+I8MYIb4rw5ghvifDWCG+L8PYI74jwzgjvivDuCO+J8N4I74vw/ggfiPDBCB+K8OEIH4nw0Qgfi/DxCJ+I8MkIn4rw6QififDZCAcRBhGGEUYRxhEmEaYRZhHmERYRlhFWEdYRNhG2EXYR9hE+F+HzEb4Q4YsRvhThyxG+EuGrEb4W4esRvhHhmxG+FeHbEb4T4bsRvhfh+xF+EOGHEX4U4ccRfhLhpxF+FuHnEX4R4ZcRfhXh1xF+E+G3EX4X4fcR/hDhjxH+FOHPEf4S4a8R/hbh7xH+EeGfEf4V4d8R/hPhvxH+F+GQCIdGOCzC4RGOiHBkhKMiHB3hmAjHRjhOjOPGOF6M48c4QYwTxjhRjBPHOEmMk8Y4WYyTxzhFjFPGOFWMU8c4TYzTxjhdjNPHOEOMM8Y4U4wzxzhLjLPGOFuMs8c4R4xzxjhXjHPHOE+M88Y4X4zzx7hAjAvGuFCMC8e4SIyLxrhYjIvHuESMS8a4VIxLx7hMjMS4bIzLxbh8jCvEuGKMK8W4coyrxLhqjKvFuHqMa8S4Zoxrxbh2jOvEuG6M68W4fowbxLhhjBvFuHGMm8S4aYybxbh5jFvEuGWMW8W4dYzbxLhtjNvFuH2MO8S4Y4w7xbhzjLvEuGuMu8W4e4x7xLhnjHvFuHeM+8S4b4z7xbh/jAfEeGCMB8V4cIyHxHhojIfFeHiMR8R4ZIxHxXh0jMfEeGyMx8V4fIwnxHhijCfFeHKMp8R4aozGeFqMp8d4RoxnxnhWjGfHeE6M58Z4Xoznx3hBjBfGeFGMF8d4SYyXxnhZjJfHeEWMV8Z4VYxXx3hNjNfGeF2M18d4Q4w3xnhTjDfHeEuMt8Z4W4y3x3hHjHfGeFeMd8d4T4z3xnhfjPfH+ECMD8b4UIwPx/hIjI/G+FiMj8f4RIxPxvhUjE/H+EyMz8Y4iDGIMYwxijGOMYkxjTGLMY+xiLGMsYqxjrGJsY2xi7GP8bkYn4/xhRhfjPGlGF+O8ZUYX43xtRhfj/GNGN+M8a0Y347xnRjfjfG9GN+P8YMYP4zxoxg/jvGTGD+N8bMYP4/xixi/jPGrGL+O8ZsYv43xuxi/j/GHGH+M8acYf47xlxh/jfG3GH+P8Y8Y/4zxrxj/jvGfGP+N8b8Yh8Q4NMZhMQ6PcUSMI2McFePoGMfEODbGcRIcN8HxEhw/wQkSnDDBiRKcOMFJEpw0wckSnDzBKRKcMsGpEpw6wWkSnDbB6RKcPsEZEpwxwZkSnDnBWRKcNcHZEpw9wTkSnDPBuRKcO8F5Epw3wfkSnD/BBRJcMMGFElw4wUUSXDTBxRJcPMElElwywaUSXDrBZRIkwWUTXC7B5RNcIcEVE1wpwZUTXCXBVRNcLcHVE1wjwTUTXCvBtRNcJ8F1E1wvwfUT3CDBDRPcKMGNE9wkwU0T3CzBzRPcIsEtE9wqwa0T3CbBbRPcLsHtE9whwR0T3CnBnRPcJcFdE9wtwd0T3CPBPRPcK8G9E9wnwX0T3C/B/RM8IMEDEzwowYMTPCTBQxM8LMHDEzwiwSMTPCrBoxM8JsFjEzwuweMTPCHBExM8KcGTEzwlwVMTNMHTEjw9wTMSPDPBsxI8O8FzEjw3wfMSPD/BCxK8MMGLErw4wUsSvDTByxK8PMErErwywasSvDrBaxK8NsHrErw+wRsSvDHBmxK8OcFbErw1wdsSvD3BOxK8M8G7Erw7wXsSvDfB+xK8P8EHEnwwwYcSfDjBRxJ8NMHHEnw8wScSfDLBpxJ8OsFnEnw2wUGCQYJhglGCcYJJgmmCWYJ5gkWCZYJVgnWCTYJtgl2CfYLPJfh8gi8k+GKCLyX4coKvJPhqgq8l+HqCbyT4ZoJvJfh2gu8k+G6C7yX4foIfJPhhgh8l+HGCnyT4aYKfJfh5gl8k+GWCXyX4dYLfJPhtgt8l+H2CPyT4Y4I/Jfhzgr8k+GuCvyX4e4J/JPhngn8l+HeC/yT4b4L/JTgkwaEJDktweIIjEhyZ4KgERyc4JsGxCY6T4rgpjpfi+ClOkOKEKU6U4sQpTpLipClOluLkKU6R4pQpTpXi1ClOk+K0KU6X4vQpzpDijCnOlOLMKc6S4qwpzpbi7CnOkeKcKc6V4twpzpPivCnOl+L8KS6Q4oIpLpTiwikukuKiKS6W4uIpLpHikikuleLSKS6TIikum+JyKS6f4goprpjiSimunOIqKa6a4moprp7iGimumeJaKa6d4joprpvieimun+IGKW6Y4kYpbpziJilumuJmKW6e4hYpbpniViluneI2KW6b4nYpbp/iDinumOJOKe6c4i4p7pribinunuIeKe6Z4l4p7p3iPinum+J+Ke6f4gEpHpjiQSkenOIhKR6a4mEpHp7iESkemeJRKR6d4jEpHpvicSken+IJKZ6Y4kkpnpziKSmemqIpnpbi6SmekeKZKZ6V4tkpnpPiuSmel+L5KV6Q4oUpXpTixSlekuKlKV6W4uUpXpHilSleleLVKV6T4rUpXpfi9SnekOKNKd6U4s0p3pLirSneluLtKd6R4p0p3pXi3Snek+K9Kd6X4v0pPpDigyk+lOLDKT6S4qMpPpbi4yk+keKTKT6V4tMpPpPisykOUgxSDFOMUoxTTFJMU8xSzFMsUixTrFKsU2xSbFPsUuxTfC7F51N8IcUXU3wpxZdTfCXFV1N8LcXXU3wjxTdTfCvFt1N8J8V3U3wvxfdT/CDFD1P8KMWPU/wkxU9T/CzFz1P8IsUvU/wqxa9T/CbFb1P8LsXvU/whxR9T/CnFn1P8JcVfU/wtxd9T/CPFP1P8K8W/U/wnxX9T/C/FISkOTXFYisNTHJHiyBRHpTg6xTEpjk1xnAzHzXC8DMfPcIIMJ8xwogwnznCSDCfNcLIMJ89wigynzHCqDKfOcJoMp81wugynz3CGDGfMcKYMZ85wlgxnzXC2DGfPcI4M58xwrgznznCeDOfNcL4M589wgQwXzHChDBfOcJEMF81wsQwXz3CJDJfMcKkMl85wmQzJcNkMl8tw+QxXyHDFDFfKcOUMV8lw1QxXy3D1DNfIcM0M18pw7QzXyXDdDNfLcP0MN8hwwww3ynDjDDfJcNMMN8tw8wy3yHDLDLfKcOsMt8lw2wy3y3D7DHfIcMcMd8pw5wx3yXDXDHfLcPcM98hwzwz3ynDvDPfJcN8M98tw/wwPyPDADA/K8OAMD8nw0AwPy/DwDI/I8MgMj8rw6AyPyfDYDI/L8PgMT
8jwxAxPyvDkDE/J8NQMzfC0DE/P8IwMz8zwrAzPzvCcDM/N8LwMz8/wggwvzPCiDC/O8JIML83wsgwvz/CKDK/M8KoMr87wmgyvzfC6DK/P8IYMb8zwpgxvzvCWDG/N8LYMb8/wjgzvzPCuDO/O8J4M783wvgzvz/CBDB/M8KEMH87wkQwfzfCxDB/P8IkMn8zwqQyfzvCZDJ/NcJBhkGGYYZRhnGGSYZphlmGeYZFhmWGVYZ1hk2GbYZdhn+FzGT6f4QsZvpjhSxm+nOErGb6a4WsZvp7hGxm+meFbGb6d4TsZvpvhexm+n+EHGX6Y4UcZfpzhJxl+muFnGX6e4RcZfpnhVxl+neE3GX6b4XcZfp/hDxn+mOFPGf6c4S8Z/prhbxn+nuEfGf6Z4V8Z/p3hPxn+m+F/GQ7JcGiGwzIcnuGIDEdmOCrD0RmOyXBshuPkOG6O4+U4fo4T5DhhjhPlOHGOk+Q4aY6T5Th5jlPkOGWOU+U4dY7T5DhtjtPlOH2OM+Q4Y44z5ThzjrPkOGuOs+U4e45z5DhnjnPlOHeO8+Q4b47z5Th/jgvkuGCOC+W4cI6L5LhojovluHiOS+S4ZI5L5bh0jsvkSI7L5rhcjsvnuEKOK+a4Uo4r57hKjqvmuFqOq+e4Ro5r5rhWjmvnuE6O6+a4Xo7r57hBjhvmuFGOG+e4SY6b5rhZjpvnuEWOW+a4VY5b57hNjtvmuF2O2+e4Q4475rhTjjvnuEuOu+a4W46757hHjnvmuFeOe+e4T4775rhfjvvneECOB+Z4UI4H53hIjofmeFiOh+d4RI5H5nhUjkfneEyOx+Z4XI7H53hCjifmeFKOJ+d4So6n5miOp+V4eo5n5HhmjmfleHaO5+R4bo7n5Xh+jhfkeGGOF+V4cY6X5HhpjpfleHmOV+R4ZY5X5Xh1jtfkeG2O1+V4fY435HhjjjfleHOOt+R4a4635Xh7jnfkeGeOd+V4d4735HhvjvfleH+OD+T4YI4P5fhwjo/k+GiOj+X4eI5P5Phkjk/l+HSOz+T4bI6DHIMcwxyjHOMckxzTHLMc8xyLHMscqxzrHJsc2xy7HPscn8vx+RxfyPHFHF/K8eUcX8nx1Rxfy/H1HN/I8c0c38rx7RzfyfHdHN/L8f0cP8jxwxw/yvHjHD/J8dMcP8vx8xy/yPHLHL/K8escv8nx2xy/y/H7HH/I8cccf8rx5xx/yfHXHH/L8fcc/8jxzxz/yvHvHP/J8d8c/8txSI5DcxyW4/AcR+Q4MsdROY7OcUyOY3Mcp8BxCxyvwPELnKDACQucqMCJC5ykwEkLnKzAyQucosApC5yqwKkLnKbAaQucrsDpC5yhwBkLnKnAmQucpcBZC5ytwNkLnKPAOQucq8C5C5ynwHkLnK/A+QtcoMAFC1yowIULXKTARQtcrMDFC1yiwCULXKrApQtcpkAKXLbA5QpcvsAVClyxwJUKXLnAVQpctcDVCly9wDUKXLPAtQpcu8B1Cly3wPUKXL/ADQrcsMCNCty4wE0K3LTAzQrcvMAtCtyywK0K3LrAbQrctsDtCty+wB0K3LHAnQrcucBdCty1wN0K3L3APQrcs8C9Cty7wH0K3LfA/Qrcv8ADCjywwIMKPLjAQwo8tMDDCjy8wCMKPLLAowo8usBjCjy2wOMKPL7AEwo8scCTCjy5wFMKPLVACzytwNMLPKPAMws8q8CzCzynwHMLPK/A8wu8oMALC7yowIsLvKTASwu8rMDLC7yiwCsLvKrAqwu8psBrC7yuwOsLvKHAGwu8qcCbC7ylwFsLvK3A2wu8o8A7C7yrwLsLvKfAewu8r8D7C3ygwAcLfKjAhwt8pMBHC3yswMcLfKLAJwt8qsCnC3ymwGcLHBQYFBgWGBUYF5gUmBaYFZgXWBRYFlgVWBfYFNgW2BXYF/hcgc8X+EKBLxb4UoEvF/hKga8W+FqBrxf4RoFvFvhWgW8X+E6B7xb4XoHvF/hBgR8W+FGBHxf4SYGfFvhZgZ8X+EWBXxb4VYFfF/hNgd8W+F2B3xf4Q4E/FvhTgT8X+EuBvxb4W4G/F/hHgX8W+FeBfxf4T4H/FvhfgUMKHFrgsAKHFziiwJEFjipwdIFjChxb4DgljlvieCWOX+IEJU5Y4kQlTlziJCVOWuJkJU5e4hQlTlniVCVOXeI0JU5b4nQlTl/iDCXOWOJMJc5c4iwlzlribCXOXuIcJc5Z4lwlzl3iPCXOW+J8Jc5f4gIlLljiQiUuXOIiJS5a4mIlLl7iEiUuWeJSJS5d4jIlUuKyJS5X4vIlrlDiiiWuVOLKJa5S4qolrlbi6iWuUeKaJa5V4tolrlPiuiWuV+L6JW5Q4oYlblTixiVuUuKmJW5W4uYlblHiliVuVeLWJW5T4rYlblfi9iXuUOKOJe5U4s4l7lLiriXuVuLuJe5R4p4l7lXi3iXuU+K+Je5X4v4lHlDigSUeVOLBJR5S4qElHlbi4SUeUeKRJR5V4tElHlPisSUeV+LxJZ5Q4oklnlTiySWeUuKpJVriaSWeXuIZJZ5Z4lklnl3iOSWeW+J5JZ5f4gUlXljiRSVeXOIlJV5a4mUlXl7iFSVeWeJVJV5d4jUlXlvidSVeX+INJd5Y4k0l3lziLSXeWuJtJd5e4h0l3lniXSXeXeI9Jd5b4n0l3l/iAyU+WOJDJT5c4iMlPlriYyU+XuITJT5Z4lMlPl3iMyU+W+KgxKDEsMSoxLjEpMS0xKzEvMSixLLEqsS6xKbEtsSuxL7E50p8vsQXSnyxxJdKfLnEV0p8tcTXSny9xDdKfLPEt0p8u8R3Sny3xPdKfL/ED0r8sMSPSvy4xE9K/LTEz0r8vMQvSvyyxK9K/LrEb0r8tsTvSvy+xB9K/LHEn0r8ucRfSvy1xN9K/L3EP0r8s8S/Svy7xH9K/LfE/0ocUuLQEoeVOLzEESWOLHFUiaNLHFPi2BLHqXDcCsercPwKJ6hwwgonqnDiCiepcNIKJ6tw8gqnqHDKCqeqcOoKp6lw2gqnq3D6CmeocMYKZ6pw5gpnqXDWCmercPYK56hwzgrnqnDuCuepcN4K56tw/goXqHDBCheqcOEKF6lw0QoXq3DxCpeocMkKl6pw6QqXqZAKl61wuQqXr3CFClescKUKV65wlQpXrXC1ClevcI0K16xwrQrXrnCdCtetcL0K169wgwo3rHCjCjeucJMKN61wswo3r3CLCrescKsKt65wmwq3rXC7CrevcIcKd6xwpwp3rnCXCnetcLcKd69wjwr3rHCvCveucJ8K961wvwr3r/CACg+s8KAKD67wkAoPrfCwCg+v8IgKj6zwqAqPrvCYCo+t8LgKj6/whApPrPCkCk+u8JQKT63QCk+r8PQKz6jwzArPqvDsCs+p8NwKz6vw/AovqPDCCi+q8OIKL6nw0govq/DyCq+o8MoKr6rw6gqvqfDaCq+r8PoKb6jwxgpvqvDmCm+p8NYKb6vw9grvqPDOCu+q8O4K76nw3grvq/D+Ch+o8MEKH6rw4QofqfDRCh+r8PEKn6jwyQqfqvDpCp+p
8NkKBxUGFYYVRhXGFSYVphVmFeYVFhWWFVYV1hU2FbYVdhX2FT5X4fMVvlDhixW+VOHLFb5S4asVvlbh6xW+UeGbFb5V4dsVvlPhuxW+V+H7FX5Q4YcVflThxxV+UuGnFX5W4ecVflHhlxV+VeHXFX5T4bcVflfh9xX+UOGPFf5U4c8V/lLhrxX+VuHvFf5R4Z8V/lXh3xX+U+G/Ff5X4ZAKh1Y4rMLhFY6ocGSFoyocXeGYCsdWOE6N49Y4Xo3j1zhBjRPWOFGNE9c4SY2T1jhZjZPXOEWNU9Y4VY1T1zhNjdPWOF2N09c4Q40z1jhTjTPXOEuNs9Y4W42z1zhHjXPWOFeNc9c4T43z1jhfjfPXuECNC9a4UI0L17hIjYvWuFiNi9e4RI1L1rhUjUvXuEyN1LhsjcvVuHyNK9S4Yo0r1bhyjavUuGqNq9W4eo1r1LhmjWvVuHaN69S4bo3r1bh+jRvUuGGNG9W4cY2b1LhpjZvVuHmNW9S4ZY1b1bh1jdvUuG2N29W4fY071LhjjTvVuHONu9S4a4271bh7jXvUuGeNe9W4d4371LhvjfvVuH+NB9R4YI0H1XhwjYfUeGiNh9V4eI1H1HhkjUfVeHSNx9R4bI3H1Xh8jSfUeGKNJ9V4co2n1HhqjdZ4Wo2n13hGjWfWeFaNZ9d4To3n1nhejefXeEGNF9Z4UY0X13hJjZfWeFmNl9d4RY1X1nhVjVfXeE2N19Z4XY3X13hDjTfWeFONN9d4S4231nhbjbfXeEeNd9Z4V41313hPjffWeF+N99f4QI0P1vhQjQ/X+EiNj9b4WI2P1/hEjU/W+FSNT9f4TI3P1jioMagxrDGqMa4xqTGtMasxr7GosayxqrGusamxrbGrsa/xuRqfr/GFGl+s8aUaX67xlRpfrfG1Gl+v8Y0a36zxrRrfrvGdGt+t8b0a36/xgxo/rPGjGj+u8ZMaP63xsxo/r/GLGr+s8asav67xmxq/rfG7Gr+v8Ycaf6zxpxp/rvGXGn+t8bcaf6/xjxr/rPGvGv+u8Z8a/63xvxqH1Di0xmE1Dq9xRI0jaxxV4+gax9Q4tsZxGhy3wfEaHL/BCRqcsMGJGpy4wUkanLTByRqcvMEpGpyywakanLrBaRqctsHpGpy+wRkanLHBmRqcucFZGpy1wdkanL3BORqcs8G5Gpy7wXkanLfB+Rqcv8EFGlywwYUaXLjBRRpctMHFGly8wSUaXLLBpRpcusFlGqTBZRtcrsHlG1yhwRUbXKnBlRtcpcFVG1ytwdUbXKPBNRtcq8G1G1ynwXUbXK/B9RvcoMENG9yowY0b3KTBTRvcrMHNG9yiwS0b3KrBrRvcpsFtG9yuwe0b3KHBHRvcqcGdG9ylwV0b3K3B3Rvco8E9G9yrwb0b3KfBfRvcr8H9GzygwQMbPKjBgxs8pMFDGzyswcMbPKLBIxs8qsGjGzymwWMbPK7B4xs8ocETGzypwZMbPKXBUxu0wdMaPL3BMxo8s8GzGjy7wXMaPLfB8xo8v8ELGrywwYsavLjBSxq8tMHLGry8wSsavLLBqxq8usFrGry2wesavL7BGxq8scGbGry5wVsavLXB2xq8vcE7GryzwbsavLvBexq8t8H7Gry/wQcafLDBhxp8uMFHGny0wccafLzBJxp8ssGnGny6wWcafLbBQYNBg2GDUYNxg0mDaYNZg3mDRYNlg1WDdYNNg22DXYN9g881+HyDLzT4YoMvNfhyg680+GqDrzX4eoNvNPhmg281+HaD7zT4boPvNfh+gx80+GGDHzX4cYOfNPhpg581+HmDXzT4ZYNfNfh1g980+G2D3zX4fYM/NPhjgz81+HODvzT4a4O/Nfh7g380+GeDfzX4d4P/NPhvg/81OKTBoQ0Oa3B4gyMaHNngqAZHNzimwbENjtPiuC2O1+L4LU7Q4oQtTtTixC1O0uKkLU7W4uQtTtHilC1O1eLULU7T4rQtTtfi9C3O0OKMLc7U4swtztLirC3O1uLsLc7R4pwtztXi3C3O0+K8Lc7X4vwtLtDigi0u1OLCLS7S4qItLtbi4i0u0eKSLS7V4tItLtMiLS7b4nItLt/iCi2u2OJKLa7c4iotrtriai2u3uIaLa7Z4lotrt3iOi2u2+J6La7f4gYtbtjiRi1u3OImLW7a4mYtbt7iFi1u2eJWLW7d4jYtbtvidi1u3+IOLe7Y4k4t7tziLi3u2uJuLe7e4h4t7tniXi3u3eI+Le7b4n4t7t/iAS0e2OJBLR7c4iEtHtriYS0e3uIRLR7Z4lEtHt3iMS0e2+JxLR7f4gktntjiSS2e3OIpLZ7aoi2e1uLpLZ7R4pktntXi2S2e0+K5LZ7X4vktXtDihS1e1OLFLV7S4qUtXtbi5S1e0eKVLV7V4tUtXtPitS1e1+L1Ld7Q4o0t3tTizS3e0uKtLd7W4u0t3tHinS3e1eLdLd7T4r0t3tfi/S0+0OKDLT7U4sMtPtLioy0+1uLjLT7R4pMtPtXi0y0+0+KzLQ5aDFoMW4xajFtMWkxbzFrMWyxaLFusWqxbbFpsW+xa7Ft8rsXnW3yhxRdbfKnFl1t8pcVXW3ytxddbfKPFN1t8q8W3W3ynxXdbfK/F91v8oMUPW/yoxY9b/KTFT1v8rMXPW/yixS9b/KrFr1v8psVvW/yuxe9b/KHFH1v8qcWfW/ylxV9b/K3F31v8o8U/W/yrxb9b/KfFf1v8r8UhLQ5tcViLw1sc0eLIFke1OLrFMS2ObXGcDsftcLwOx+9wgg4n7HCiDifucJIOJ+1wsg4n73CKDqfscKoOp+5wmg6n7XC6DqfvcIYOZ+xwpg5n7nCWDmftcLYOZ+9wjg7n7HCuDufucJ4O5+1wvg7n73CBDhfscKEOF+5wkQ4X7XCxDhfvcIkOl+xwqQ6X7nCZDulw2Q6X63D5DlfocMUOV+pw5Q5X6XDVDlfrcPUO1+hwzQ7X6nDtDtfpcN0O1+tw/Q436HDDDjfqcOMON+lw0w4363DzDrfocMsOt+pw6w636XDbDrfrcPsOd+hwxw536nDnDnfpcNcOd+tw9w736HDPDvfqcO8O9+lw3w7363D/Dg/o8MAOD+rw4A4P6fDQDg/r8PAOj+jwyA6P6vDoDo/p8NgOj+vw+A5P6PDEDk/q8OQOT+nw1A7t8LQOT+/wjA7P7PCsDs/u8JwOz+3wvA7P7/CCDi/s8KIOL+7wkg4v7fCyDi/v8IoOr+zwqg6v7vCaDq/t8LoOr+/whg5v7PCmDm/u8JYOb+3wtg5v7/CODu/s8K4O7+7wng7v7fC+Du/v8IEOH+zwoQ4f7vCRDh/t8LEOH+/wiQ6f7PCpDp/u8JkOn+1w0GHQYdhh1GHcYdJh2mHWYd5h0WHZYdVh3WHTYdth12Hf4XMdPt/hCx2+2OFLHb7c4Ssdvtrhax2+3uEbHb7Z4Vsdvt3hOx2+2+F7Hb7f4QcdftjhRx1+3OEnHX7a4Wcdft7hFx1+2eFXHX7d4Tcdftvhdx1+3+EPHf7Y4U8d/tzhLx3+2uFvHf7e4R8d/tnhXx3+3eE
/Hf7b4X8dDulwaIfDOhze4YgOR3Y4qsPRHY7pcGyH4/Q4bo/j9Th+jxP0OGGPE/U4cY+T9Dhpj5P1OHmPU/Q4ZY9T9Th1j9P0OG2P0/U4fY8z9DhjjzP1OHOPs/Q4a4+z9Th7j3P0OGePc/U4d4/z9Dhvj/P1OH+PC/S4YI8L9bhwj4v0uGiPi/W4eI9L9Lhkj0v1uHSPy/RIj8v2uFyPy/e4Qo8r9rhSjyv3uEqPq/a4Wo+r97hGj2v2uFaPa/e4To/r9rhej+v3uEGPG/a4UY8b97hJj5v2uFmPm/e4RY9b9rhVj1v3uE2P2/a4XY/b97hDjzv2uFOPO/e4S4+79rhbj7v3uEePe/a4V49797hPj/v2uF+P+/d4QI8H9nhQjwf3eEiPh/Z4WI+H93hEj0f2eFSPR/d4TI/H9nhcj8f3eEKPJ/Z4Uo8n93hKj6f2aI+n9Xh6j2f0eGaPZ/V4do/n9Hhuj+f1eH6PF/R4YY8X9Xhxj5f0eGmPl/V4eY9X9Hhlj1f1eHWP1/R4bY/X9Xh9jzf0eGOPN/V4c4+39Hhrj7f1eHuPd/R4Z4939Xh3j/f0eG+P9/V4f48P9Phgjw/1+HCPj/T4aI+P9fh4j0/0+GSPT/X4dI/P9Phsj4Me/w8AAP//wLDz/g==")
syz_emit_ethernet(0x3a, &(0x7f0000000100)={@broadcast, @dev, @void, {@ipv4={0x800, @udp={{0x5, 0x4, 0x3, 0x0, 0x2c, 0x0, 0x0, 0x0, 0x2f, 0x0, @rand_addr, @empty}, {0x0, 0x88be, 0x18, 0x0, @wg=@data={0x4, 0x0, 0x8}}}}}}, 0x0)
keyctl$set_reqkey_keyring(0xe, 0xffffffffffffffff)
syz_emit_ethernet(0x86, &(0x7f00000003c0)={@random="a5050f0000b5", @random="0000009000", @void, {@ipv4={0x800, @icmp={{0x5, 0x4, 0x0, 0x0, 0x78, 0x0, 0x0, 0x0, 0x1, 0x0, @initdev={0xac, 0x1e, 0x0, 0x0}, @local}, @time_exceeded={0x5, 0x0, 0x0, 0xe0, 0x0, 0xe000, {0x17, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x11, 0x0, @multicast2, @local, {[@rr={0x7, 0x3}, @ssrr={0x89, 0x7, 0x76, [@multicast1]}, @timestamp_prespec={0x44, 0x3c, 0x0, 0x3, 0x0, [{@private=0xa01012f}, {@multicast1, 0x2}, {@initdev={0xac, 0x1e, 0x0, 0x0}}, {@remote}, {@loopback}, {@dev}, {@private}]}]}}}}}}}, 0x0)
set_thread_area(&(0x7f0000000140)={0x2, 0x100000, 0x400, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x1})
bpf$PROG_LOAD(0x5, &(0x7f0000000140)={0x10, 0x4, &(0x7f00000000c0)=ANY=[@ANYBLOB="b4000000000000080f1000000000000063000000000000009500050000000000d1449c003b097606b153e6dc6de837ae96a815df705fef53cf0ff5e7bdc3bd95cfc0d4800fae421c84b7988e61ed7df452e4b7e139044387aa"], &(0x7f0000003ff6)='GPL\x00', 0x2, 0xba, &(0x7f000000cf3d)=""/186, 0x41100, 0x0, '\x00', 0x0, @fallback=0x27, 0xffffffffffffffff, 0x8, &(0x7f0000000000), 0x8, 0x10, &(0x7f0000000000)={0x0, 0x0, 0x0, 0x3}, 0x10}, 0x41)
syz_mount_image$iso9660(&(0x7f0000000380), &(0x7f00000000c0)='./file0\x00', 0x204818, &(0x7f0000000ac0)={[{@map_off}, {@check_relaxed}, {@map_off}, {}, {@overriderock}, {@unhide}, {@map_off}, {@check_strict}]}, 0xff, 0x572, &(0x7f00000003c0)="$eJzs3V1vE8cawPFnQ3LiY6To6JwjhKIQhtBWQQpmbYORxdV2PXYG7F1rd42SKxQRB0U4UBEqNbmh3NBWaj8EveyH6EW/D+pHaLW7dl6IX8gLSRr9fxbMeHd25xnH2ifreGcFAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACI5VZsO29J3XitJTWYWwn8xu7T7tYHFsitfcWQfkWs+J9kMnI1XXT1/7urr8T/zclM+mxGMnGRke3LV/7z4H/jY73thwR8KL8edk//TjfY3Np+vtLptF+dVCDn0LXJwetq2jOhbxpOTSsT+qpcKtl3Fquhqpq6DpfDSDeUG2gn8gM1795S+XK5qHRu2W95tYpT172F928XbLukHuaa2glC37vzMDflLpp63Xi1pE28Om5zP34jPjKRirTTUGptvdMujhpA3Cj/KY0KoxoV7EIhny8U8qV75Xv3bXv8wAL7I3Kgxcm9afHPdIJHb+B4xrr5X+pixJOWLInq+3ClIoH40hiwvquX/7+8o4f2uzf/97L81d3V05Lk/1mRSRGZHZT/B8Ryeo9N2ZJteS4r0pGOtOXVmUd0uo+aaPHESCi+GGmIkyxR3SVKylKSktjyRBalKqEoqYqRumgJZVlCiUQn7yhXgssijkTiSyBK5sWVW6IkL2UpS1GUaMnJsvjSEk9qUhEn2cuarCeve1GUNSjGnUb5gcPI9t53bSkMGS35H8d30odw4Mj+6uV/AAAAAABwYVnJp+/x+f+EXEtqVVPX9lmHBQAAAAAATlDyl/+ZuJiIa9fE4vwfAAAAAICLxpLZ7qcAWbme1tbESi6X4kMAAAAAAAAuiOTv/7NxkcyBcl2snelSOP8HAAAAAOCC+HHkHPthc9L6/U8JggnrbXPpC2sjmZvX2biUbnfp4z1G1WlrqruTpCj9IXExPu7qGSuTNtqZBPNDt1gbFYe1G4CzE8D3nxbAePeZ/Cw30jY3VtNytbcm7SVbNXWdc/36g7w4ztRYpJeib1+sfyfJ8H/yGlNWRtY77dzTl53VJJa38V7ebnQnUDwwj+KQWF4n8y0k11z0HfFEciFGt9+sJWvrnba9d/xj6eZj+3t8MzWkz3cyl7aa6854m90//kzcZz43aPTdKPLHHPk7uZm2uTl/My36RFEYFUVhbxT9X4vjR1EcFUXxmFEAwFlZG5GFLDmQd49wlDtadpdDZvd3Mp+2mZ9ODqzj032O6PaoI7p9zOz224F7IA3KsXG/v3yUVd/HG7wf2G9YL1jxS3jp9cY3cmVza/v2+sbKs/az9otCoViy79r2vYJMJMPoFuQeAEAfyT12JL3HjtUvM2d2mg66C491d8RZ9X93vlKQk6fyUjqyKgvJ1QbJNw767jW752sICyPOWrNJmkzv8LIw5KzuX8lVDr39Foa23R9D8XP/GAAAOFVzI/Kwte83hL4trIUR5937c/nws+Psnru1AQCAz0MHH6xs9IMVBKb5JF8u551oUavAdx+pwFRqWhkv0oG76Hg1rZqBH/muX48rj01FhypsNZt+EKmqH6imH5qlZPpA1b31e6gbjhcZN2zWtRNq5fpe5LiRqpjQVc3W13UTLuog2ThsatdUjetExvdU6LcCV+eUCrXe09BUtBeZqomrnmoGpuEEGfXYr7caWlV06AamGfnpDnt9Ga/qB41kt7mzfrEBADgnNre2n690Ou1Xh6hMxmfth9iqb8eZUx8qAADoGpGlAQAAAAAAAAAAAAAAAAAAAADAOXCU6/9OufKViBxl894Et+djFBe50psK+lMa934qZx0zlRGVkYeON5/toATgVPwdAAD//6r8RMI=")
bpf$PROG_LOAD(0x5, &(0x7f0000000100)={0x13, 0x4, &(0x7f0000000080)=ANY=[@ANYBLOB="85000000180000004c000000000000006f0000000000000095000a0000000000"], &(0x7f0000000040)='syzkaller\x00', 0x4, 0x99, &(0x7f0000000180)=""/153, 0x0, 0x0, '\x00', 0x0, @fallback, 0xffffffffffffffff, 0x8, 0x0, 0x46, 0x10, 0x0, 0xfffffffffffffed8}, 0x3f)
bpf$PROG_LOAD(0x5, &(0x7f0000000000)={0x4, 0xc, &(0x7f0000001180)=ANY=[@ANYBLOB="180000000000000000000000000000008500000060000000180100002020782500000000002020207b1af8ff00000000bfa100000000000007010000f8ffffffb702000008000000b70300000000000085000000b200000095"], &(0x7f0000000140)='syzkaller\x00', 0x0, 0x0, 0x0, 0x41100, 0x0, '\x00', 0x0, @fallback=0x2c, 0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, 0x94)
syz_emit_ethernet(0x6c, &(0x7f0000000080)={@broadcast, @dev, @void, {@ipv4={0x800, @gre={{0x5, 0x4, 0x3, 0x0, 0x5e, 0x0, 0x0, 0x0, 0x2f, 0x0, @private, @multicast1}, {{0x0, 0x0, 0x1, 0x0, 0x2, 0x0, 0x0, 0x4, 0x8100, 0x2, 0x0, [0xff], "7f1a"}, {0x0, 0x0, 0x0, 0x0, 0x11, 0x0, 0x800, [], '+o'}}}}}}, 0x0)
setsockopt$inet6_int(0xffffffffffffffff, 0x29, 0x48, 0x0, 0x0)
bpf$BPF_PROG_RAW_TRACEPOINT_LOAD(0x5, &(0x7f00000003c0)={0x2, 0xd, &(0x7f0000000000)=ANY=[@ANYBLOB="18020000000000000000000000000000851000000100000095000000000000003800000020646c2500000000002020207b1af8ff00000000bd21ffff0000000007010000f8ffffffb502020008000000b70300000000000085000000a400000095"], &(0x7f0000000080)='syzkaller\x00'}, 0x94)
syz_emit_ethernet(0x5a, &(0x7f0000000500)={@local, @broadcast, @void, {@ipv4={0x800, @tcp={{0x5, 0x4, 0x0, 0x0, 0x4c, 0x0, 0x0, 0x2, 0x6, 0x0, @remote, @local}, {{0x0, 0x4e22, 0x41424344, 0x41424344, 0x0, 0x6, 0xe, 0x0, 0xfffd, 0x0, 0x7, {[@md5sig={0x1d, 0x12, "d285b6853bc4dc54c6910c1d66f8841a"}, @md5sig={0x13, 0x12, "b8d7fb567848ce8b31f1efa110a3bb44"}]}}}}}}}, 0x0)
syz_emit_ethernet(0x46, &(0x7f0000000040)={@local, @empty, @void, {@ipv4={0x800, @icmp={{0x5, 0x4, 0x0, 0x0, 0x38, 0x0, 0x0, 0x0, 0x1, 0x0, @private=0xa010101, @local}, @time_exceeded={0x3, 0x5, 0x0, 0x3, 0x0, 0x6, {0x5, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x89, 0x0, @broadcast, @multicast2}, "0e70a4b52dd6ac08"}}}}}, 0x0)
bpf$PROG_LOAD_XDP(0x5, &(0x7f0000000140)={0x6, 0x4, &(0x7f00000003c0)=ANY=[@ANYBLOB="18000000000000000000000000000006c3a0fcff4100000095"], &(0x7f0000000040)='GPL\x00', 0x2, 0x96, &(0x7f0000000080)=""/150, 0x41100, 0x4}, 0x25)
bpf$BPF_BTF_LOAD(0x12, &(0x7f00000000c0)={&(0x7f0000000600)=ANY=[@ANYBLOB="9feb01001800000000000000400000004000000002000000000000000000000d0400000000000000000000010500000008000000000000000000000300000000010000000200000004000000000000000000000b03"], 0x0, 0x5a}, 0x28)
request_key(&(0x7f0000000000)='id_legacy\x00', &(0x7f0000000040)={'syz', 0x2}, &(0x7f0000000140)='mptcp_pm\x00', 0x0)
bpf$BPF_BTF_LOAD(0x12, &(0x7f0000000280)={&(0x7f0000000040)={{0xeb9f, 0x1, 0x0, 0x18, 0x0, 0x24, 0x24, 0x2, [@array={0x0, 0x0, 0x0, 0x3, 0x0, {0x2, 0x2, 0x20}}, @union]}}, 0x0, 0x3e}, 0x28)
syz_mount_image$nilfs2(&(0x7f0000000380), &(0x7f0000000a40)='./file0\x00', 0x0, &(0x7f0000000180)=ANY=[@ANYBLOB="0044159743df776a5de9392543da7d297b3dea"], 0xf, 0xa02, &(0x7f0000001ec0)="$eJzs3U9sXEcdAOB5a68d16mzhgKmoYmBAqmgTrCtKrmkMZRrpN7KsQppiHBTlHBpVakpUsUNRap67IGqh1wQFeLSA1JEVUQvQZQ7CoVKVVGgFQipoNYo9sx6d+zH/rG99vp9n/Tz+L2ZfTPPu943fp6dCUBl1Va/Li7OFCG89PqLj/zkD698twghHG2WaLSUW9uqhxCKtsevuxUzPv7w2XObpUWYX/2atsPZ283HToYQrobZcCM0wsL7j829ubT08vWbj16efuH02zt0+gAAUCln3/jH7x94761vTP/nF0fOhPHm/tQ/b8TtydjvPxH793n/v2hJi5btZCwrNxIj//thJCs3mtUzWlJfPTtOvaTcWIf6Rlr2bXaeALAfrN/XK2pzbdu12tzc2nX/jlvjY8XcpYvLT1zZpYYCANvmXw+HEM4IIYQQokqxcmi3eyAAQNXl44U3uJqPLNia5tEOdFf/7aXa5o+HbTDo17/6h6v+V5/3jsP22a+vpnRe6fcojWPIxxGOZI/r9fe/lh1ntMd2lo0rHJbxhmXtzH+ue1VZ+3t9HndLWfvz8bB7VVn783G6e1VZ+8cH3I5+lbX/wIDb0a+y9k80v9uvV7jtcd/q1/V3iyNZfuv1M39PH5b3eACg3UfG/wkhhBCVi+d2uwMCAOw5+fw4K1HKz+fjyfPzeXjy/HxeoDx/vEP+gQ75AMBGC7+68PNrxfr/+bc6Hi6Nu7grppM9ticfrdFr/Vsd97TV+odl3BIA1fbUTxe/89aZpZG1+X/Xr2WfZPP/prl6r8XtNO7yYLbdnPt3tr2eWkm5gztxUgDA/5Wuv2Xz/94dt2dCvXji4vL5E3F7Kqa/G6+P39n/zQG3GwDoX7fz/8+E9vn/Dzb312ut/YJD6/uL1n5BI9s/X7J/IW5Px/T74xOr++fOPbX8ve0+eQCoqDe+dfC1D357Kazd/1///3e6/59u4zfiWLsPYoHUT0j3Bzbc/z/WXs9UWbkT7eUOlZU72V6ukZWrx8jn3cjHB05kj0vjFNK4h9TfSeMap8vak02QMZaVG41xd9aeqaw9G873RHt78nloUv2NbH8+7iGVmw4AsNGVp5/5wePLy+cv+8Y3vqngN8+FEDbL2u13JmCnHf/Rkz88fuXpZx68+OTjF85fOH9pYXH+1KmHFucXHzq+el//eOvdfQBgP1jv9O92SwAAAAAAAAAAAAAAgDKD+KTxbp8jANDu7w+HEM6ILEb2QBuEECk+2gNtEGKfxcpKvuIvAMBg9bre/lY1jxbn80/rHqT04IN/nL4Tqdjtpfb+kvWL2U6Dfv2rf7jqf/X57a2/ub5I1+9/tfYDzPZX761fnjzZWv+9o13Wn5//sf7q/3NW/9dCd/Wv/Cyrv8+pcd/J6r+ry/o3nP/J/ur/S6z/nrh97Mvd1t/+/Kf1dtJyOBPZ+UyW1P/Xevv5p7X9ej7/Az2cdIt34/kDQBXVdrsBOyT1ElI/OvVDWtfnCy3r7IWsfLf9/1p2nHy9vn6l46Z+0BfjdurupHUD8/UOe21/Wp9wKjtu0WW/tuz1Myz/VSpr/3Y9jzutrP35epB7VVn7xwbcjn6VtT//vdyrytrf559VA1fW/okBt2NYHY5p2fUwXX+mYl7abmTbk5s8F/u1bwEAw+7bp2/ef+3rozfy9fnTdT39GTgZ/6a+npXL+wsTWd+xyMp/KaY/jukrMf1NTN/Jjrez/20DgGp6z+f/hBBCiMpF1T//5/4CVVb113/Vz7/a7/6e/07S6yO/j5+Mdsivt+SPbJI/1uHx41l+/nwd6JB/T3bclSjlf6ZD/mc75H+uQ/5Mh/xDHfLv7ZB/uEP+FzrkH+mQf7RDPgDD6fMx9f4OANWRj/tz/QeA/S9NrOP6DwDV8amYll3/7+uQDwAMn0/H1PUdACqk2Hymx63O2wMMjzS/dPo9j8uBhPtj+pWYfjWmab2UPpdfAfaA//7713+7VqzP93c4y+92Pvmi1v7Ju3z9nwe6bE/++b1e57NvdFnPTtU/vcX6AQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABis2urXxcWZIoSXXn/xkX+e+tO7RQjhaLNEo6Xc2la9ZXu27TghvFaspR9/+Oy51vSTmBZhPhShaO4PZ283a5oMIVwNs+FGaISF
9x+be3Np6eXrNx+9PP3C6bd38EcAAAAA+97/AgAA//87+jkE")
bpf$PROG_LOAD(0x5, &(0x7f000000e000)={0x15, 0x4, &(0x7f00000000c0)=ANY=[@ANYBLOB="b400000000000000791028000000000069002a00000000009500740000000000", @ANYRESDEC], &(0x7f0000003ff6)='GPL\x00', 0x2, 0xfd90, &(0x7f000000cf3d)=""/164, 0x0, 0x25, '\x00', 0x0, @sk_reuseport, 0xffffffffffffffff, 0x8, 0x0, 0x0, 0x10, 0x0, 0xffffff8f}, 0x48)
bpf$BPF_BTF_LOAD(0x12, &(0x7f0000000040)={&(0x7f0000000680)=ANY=[@ANYBLOB="9feb0100180000000000000030000000300000000200000000000000000000030009"], 0x0, 0x4a, 0x0, 0x1, 0x2}, 0x28)
removexattr(0x0, &(0x7f0000000300)=ANY=[])
bpf$PROG_LOAD(0x5, &(0x7f00000054c0)={0x4, 0x16, &(0x7f0000001000)=ANY=[@ANYBLOB="d4128c000000000061134c0000000000bf20000000000000070000000f0000003d030100000000009500ffb1000000006926000000000000bf6700000000000036000b000fff52004507faff15300000d60600000ee60000bf050000000000003d63000000000000650700000200000007070000fbffffff1f75000000000000bf54000000000000070000000410f900bd430100000000009500000000000000050000000000000095000000000000001c15a3ce747c693a74b62fd0758b15f09429c09074bc4b2bd2dc480dd7a064b8673e2060162cc43bcba1060999eef9d60bb39d0af449deaa27ea949e8f9000d885deea2783835e29eba8546fc020c1966f8b5f32b095f566edf66b7751828da9dbd5b996b9e8d897e461c01c697671d100000000400036c17fb01dde179c1f26cac1c7b21bde7d1a55d6ebe700b3be005e47ef55e0dd81244b18590e000000000000356d82e43407a6d7fa94b21002f06cd247b126b6349ab62d7b07ba0a71a72145edade9941f49f300a8c8913e0e4ea9e4c77740ab3312edee62a4dc2fc85755d387d8a1bc8eb71fbe11b2216cc8d1f0160c237d929b49d828724b95555b459f4763c6222175c974be2f76fb5f330b015a68587a75c013000000000000000000000003000000000000d6ddc46e58eff8f4fbadfc6a3af8123b7f4240713a4c0cdc9d7820c4eb67cc0f8b5fe9258eeacb5776aebbab3d5c55020000006082778366dadfc36029633e0514cbcee1f3928970bde148c940434f33acd377cbad17673b2d30b6339255c98eba97efb4e9ac1f11be815dd6045592edcbee7f253ec74c7c1313505bd7ff8fd58b3a6569c91dbdef1df585aeaea7346a2a65caee5c85f9eddeeeee3c8a2e523c864ac430eb47cb4d0c8767b9d4125661b5a1a170c04b64da3a99ddb93bf14fae3ca2d1e882375b8dbac83978e136c34f90b33cc0eeb57debcfe26589efc08125d5d62a7e593c9738a50171adf051ea4f07e7e7e770c2016eeacbe8511afffffbea75759a1ea5404f5453c0b5c46c9700808c096cf8cf5223f341cbea3841b5cd224c1b381d56afebe9f99a00e3cd94dc0bb7af9e8709db487cc4d9b3b96723d69d512ddd57b0dee9b9f6ae80a502cce352098603e77f9ecced07fa25e99e9e415414c91f8bfd1c150570512f26c4ee34a64c131dce3800000000000000006c86287945bd8d258442870e000000000000000000000000f7e6a10de4bf7369b0d5b5373829b09bf5b7b34099b27ac7770fca449d4c4ca15f88b588b2429af2e1d1a4e1fa44cb80fcfae6e50d7e5b4675d7e0be706224f34e6eed553b40e2b897e73752fc7d1e4b0f4c5967eefd7448d5fde5841fa464a67267c631052bd7333769a4b8d19d4794357edce762e8136ab9d7ed34a72baffd849b90579b96b3"], &(0x7f0000000100)='GPL\x00'}, 0x48)
syz_mount_image$ext4(&(0x7f0000000200)='ext4\x00', &(0x7f00000001c0)='./bus\x00', 0x800714, &(0x7f0000000a40)={[{@journal_path={'journal_path', 0x3d, './file1aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'}}, {@journal_ioprio={'journal_ioprio', 0x3d, 0x6}}]}, 0x0, 0x4b6, &(0x7f0000001400)="$eJzs3M1rHOUfAPDvbF763uTXX622thqtYvAladKqPXhQUfCgIOihHmOS1tptI00EW4KNIvUoBe/iUfAv8OZF1IMIXhU8SqFoEJp6kMjMzjTJZrPZvHWb7OcDm32emWfzPN+ZeXafnWdnAmhZPemfJGJ3RPwaEV2V7MICPZWnmenJ4VvTk8NJzM6+8WeSlbs5PTlcFC1etyvP9JYiSp8k8XyyuN7xS5fPDZXLoxfzfP/E+ff6xy9dfurs+aEzo2dGLwyePHni+MCzzww+3UgYbcsVSOO6eejDscMHX3nr2mvDp669/cPXabMOHKmsnx/Hsm4VAU3VLdaTbrW/ZjPV6x5tuLLNYc+8dNLexIawImnHSXdXR9b/u6It5nZeV7z8cVMbB2yo9LNp29Krp2aBLSyJZrcAaI7igz79/ls87tDQ465w44WIzqicr5iZnhyeuR1/e5TyMh0bWH9PRJya+ueL9BErPQ8BALAK2djmyVrjv1IcyJ4rcx178zmU7oj4X0Tsi4j/R8T+iLgnIit7b0TcV3nxbFeD9fdU5RePf0rXa7Z5naTjv+dibuw3My/+/Km7Lc/tyeLvSE6fLY8ey7dJb3RsS/MDder49qVfPltq3fzxX/pI6y/GgnkDrrdXnaAbGZoYWq+NcOOjiEPtteJPbs8EpEfAwYg4FIvnserYWyTOPv7V4aUKLR9/HeswzzT7ZcRjlf0/FVXxF5L685P926M8eqy/OCoW+/Hnq68vWDBvtnJN8a+DdP/vXHj8V5Xo+jupzNd2RLk8enF85XVc/e3TJb/TrOT4Lw759PjvTN7M5qx/eqeyoz4Ympi4OBDRmbyalenMy2bLB+f+W5Evyqfx9x6t3f/35a9JK7g/ItKD+EhEPBARD+ZtfygiHo6Io3Xi//7FR96tE38SSTR1/4/UfP+7ffx3J/Pn61eRaDv33Tc1fgKQaWz/n8hm2nvzJdn73zIabeAaNx8AAABsCqWI2B1Jqa+S7tkdpVJfX+U3/PtjZ6k8Nj7xxOmx9y+MVK4R6I6OUnGmq2ve+dCBpPgtfCU/mJ8rLtYfz88bf962I8v3DY+VR5ocO7S6XQv7fxT9P/XHslfVAJue67WgdVX3/1KT2gHceY18/vsuAFtTjf6/oxntAO483/+hddXq/1eq8sb/sDW1L0r8XuOWdcBWZPwPrUv/h9al/0NLWst1/atPFBcLrP7/bG/4Cv+GEsU7YFO2RoOJK/XLFHe82Mhm7Ii5JVG6OzZLzcS/+e0t75b2rDmR9pgFSyKS2NBK5+6hAgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAsJn9FwAA//9TYejO")
syz_emit_ethernet(0x7e, &(0x7f0000000140)={@broadcast, @dev, @void, {@ipv4={0x800, @icmp={{0x5, 0x4, 0x0, 0x0, 0x70, 0x0, 0x0, 0x0, 0x1, 0x0, @initdev={0xac, 0x1e, 0x0, 0x0}, @remote}, @redirect={0xb, 0x3, 0x0, @broadcast, {0x15, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x89, 0x0, @empty, @remote, {[@timestamp_prespec={0x44, 0x1c, 0x0, 0x3, 0x0, [{@local}, {@private}, {@remote}]}, @timestamp_addr={0x44, 0x24, 0x0, 0x1, 0xd, [{@local}, {}, {@empty}, {@loopback}]}]}}}}}}}, 0x0)
syz_emit_ethernet(0x3a, &(0x7f0000000540)={@local, @multicast, @void, {@ipv4={0x800, @tcp={{0x6, 0x4, 0x0, 0x0, 0x2c, 0x3, 0xe000, 0x0, 0x5, 0x0, @dev={0xac, 0x14, 0x14, 0xff}, @private=0xa010100, {[@generic={0x44, 0x2}]}}, {{0x1, 0x4e22, 0x41424344, 0x41424344, 0x0, 0x6, 0x5, 0x0, 0x0, 0x0, 0xdadf}}}}}}, 0x0)
bpf$MAP_CREATE(0x700000000000000, &(0x7f0000000780)=@base={0x1d, 0x4, 0x80000002, 0x0, 0x201, 0xffffffffffffffff, 0x0, '\x00', 0x0, 0xffffffffffffffff, 0x1, 0x5}, 0x50)
syz_emit_ethernet(0x82, &(0x7f0000000000)={@broadcast, @random="1704b45adbde", @void, {@ipv4={0x800, @icmp={{0x5, 0x4, 0x0, 0x0, 0x74, 0x0, 0x0, 0x0, 0x1, 0x0, @initdev={0xac, 0x1e, 0x80, 0x0}, @local}, @time_exceeded={0x5, 0x0, 0x0, 0xe0, 0x0, 0xe000, {0x16, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x11, 0x0, @empty=0xac1414aa, @rand_addr, {[@lsrr={0x83, 0x3}, @rr={0x7, 0x3, 0xe4}, @timestamp_prespec={0x44, 0x3c, 0x0, 0x3, 0x0, [{@private=0xa010101}, {@private}, {@dev}, {@remote}, {@private}, {@dev}, {@private}]}]}}}}}}}, 0x0)
mknod$loop(&(0x7f0000000240)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0xc000, 0x1)
syz_emit_ethernet(0x46, &(0x7f00000002c0)={@local, @random="df00004000", @void, {@ipv4={0x800, @tcp={{0x9, 0x4, 0x0, 0x0, 0x38, 0x0, 0x0, 0x0, 0x6, 0x0, @initdev={0xac, 0x1e, 0x0, 0x0}, @dev={0xac, 0x14, 0x14, 0x14}, {[@ssrr={0x89, 0x7, 0xa2, [@broadcast]}, @cipso={0x86, 0x6, 0x1}]}}, {{0x0, 0x0, 0x41424344, 0x41424344, 0x0, 0x0, 0x5}}}}}}, 0x0)
bpf$BPF_PROG_RAW_TRACEPOINT_LOAD(0x5, &(0x7f0000000140)={0x3, 0xc, &(0x7f0000000580)=@framed={{0x18, 0x2}, [@call={0x85, 0x0, 0x0, 0x97}, @printk={@ld, {}, {}, {}, {}, {}, {0x85, 0x0, 0x0, 0x7b}}]}, &(0x7f0000000440)='syzkaller\x00', 0x5, 0x0, 0x0, 0x40f00, 0x20}, 0x94)
syz_emit_ethernet(0x4a, &(0x7f0000000380)={@broadcast, @dev, @void, {@ipv6={0x86dd, @udp={0x0, 0x6, "72b2af", 0x4, 0x2f, 0x0, @dev, @mcast2, {[], {0x0, 0x6558, 0x10, 0x0, @gue={{0x2}}}}}}}}, 0x0)
rt_sigprocmask(0x3, &(0x7f0000000200)={[0xa14e]}, 0x0, 0x8)
syz_emit_ethernet(0x56, &(0x7f0000000580)={@local, @empty, @void, {@ipv6={0x86dd, @udp={0x0, 0x6, "a24b9f", 0x20, 0x2b, 0x0, @remote, @local, {[@routing={0x3a, 0x2, 0x2, 0x1, 0x0, [@mcast1]}], {0x0, 0x0, 0x8}}}}}}, 0x0)
bpf$BPF_PROG_RAW_TRACEPOINT_LOAD(0x5, &(0x7f0000000600)={0x13, 0x3, &(0x7f0000000000)=@framed={{0x4e, 0xa, 0xa, 0x0, 0x0, 0x71, 0x10, 0x5f}}, &(0x7f0000000480)='syzkaller\x00'}, 0x80)
bpf$PROG_LOAD(0x5, &(0x7f0000000440)={0xa, 0x6, &(0x7f0000000000)=@framed={{0x5, 0x0, 0x0, 0x0, 0x0, 0x7b, 0x11, 0x38}, [@func={0x85, 0x0, 0x1, 0x0, 0x2}, @call={0x85, 0x0, 0x0, 0xd1}, @exit={0x95, 0x0, 0x33}], {0x95, 0x0, 0x5a5}}, &(0x7f0000000080)='GPL\x00', 0x5, 0x29e, &(0x7f000000cf3d)=""/195, 0x0, 0x5, '\x00', 0x0, @fallback, 0xffffffffffffffff, 0x6, 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0xffffffffffffffff, 0xb8000000}, 0x70)
syz_emit_ethernet(0x46, &(0x7f0000000000)={@link_local={0x3}, @multicast, @void, {@ipv4={0x800, @icmp={{0x5, 0x4, 0x0, 0x0, 0x38, 0x0, 0x0, 0x81, 0x1, 0x0, @initdev={0xac, 0x1e, 0x0, 0x0}, @local}, @time_exceeded={0x3, 0x4, 0x0, 0x12, 0x9, 0x3f18, {0x5, 0x2, 0x0, 0x0, 0x0, 0x65, 0x0, 0x0, 0x32, 0x0, @loopback, @loopback}, "00186371ae9b1c03"}}}}}, 0x0)
prctl$PR_SET_VMA(0x4a, 0x0, &(0x7f0000ffd000/0x1000)=nil, 0x1000, 0x0)
syz_mount_image$ext4(&(0x7f0000000080)='ext4\x00', &(0x7f0000000040)='./file0\x00', 0x88a, &(0x7f00000000c0)={[{@usrquota}, {@usrjquota, 0x22}, {@discard}, {@noload}, {@noinit_itable}, {@grpjquota, 0x22}, {@init_itable}, {@grpjquota_path={'grpjquota', 0x3d, './file0'}}, {@noblock_validity}]}, 0xfe, 0x44e, &(0x7f0000000900)="$eJzs3M1vG0UbAPDHdpw0bfMmbykfDS0ECiLiI2nSD3rgUgQSB5CQ4FDEKSRpFeo2qAkSrSIIHMIRVeKOOCLxF3ChXBBwQuIKd4QUoVwonIzW3k3dxE7ixIlL/ftJm8x4xpp5dnfs8azXAXSsoeRPLuJgRPwaEf3V7J0Vhqr/bq0sTP69sjCZi3L5jT9zlXp/rSxMZlWz5x2oZsrlDdpdejtiolSavprmR+cvvzc6d+36czOXJy5OX5y+Mn727KmTx7rPjJ9uSZx9SV8HP5w9euSVt268Nnn+xjs/fp3092BaXhtHqwxV925dT7a6sTbrq0nnutrYEZpSiIjkcBUr478/CtG7WtYfL3/S1s4Bu6pczpd7GhcvloF7WDJRBzpR9kaffP7Ntj2aetwVls/F6jrGrXSrlnRFPq1TTD8j7YahiDi/+M8XyRa7tA4BAFDr5rmIeLbe/C8fD9TU+196bWggIv4fEYci4r6IOBwR90dU6j4YEQ812f7aKyTr5z/l/m0FtkXJ/O+F9NrWnfO/bPYXA4U011eJv5i7MFOaPpHuk+Eo9iT5sQ3a+O6lXz5rVFY7/0u2pP1sLpj244+uNQt0UxPzEzuJudbyxxGDXfXiz63OeZP58ZGIGNxmGzNPf3W0Udnm8W+gBZPy8pcRT1WP/2KsiT+Ta3h9cuz5M+OnR/dFafrEaHZWrPfTz0uvN2p/R/G3wPLNcuyve/6vxj+Q2xcxd+36pcr12rnm21j67dOGn2m2e/53596spLvTxz6YmJ+/OhbRnXt1/ePjt5+b5bP6yfk/fLz++D8Ut/fEwxGRnMTHIuKRiHg07ftjEfF4RBzfIP4fXnzi3ebj32BVvoWS+Kc2O/5Re/ybTxQuff9N8/FnkuN/qpIaTh/ZyuvfVju4k30HAAAA/xX5ynfgc/mR1XQ+PzJS/Q7/4difL83OzT9zYfb9K1PV78oPRDGfrXT116yHjqVrw1l+fE3+ZLpu/Hmht5IfmZwtTbU7eOhwBxqM/8TvhXb3Dth17teCzmX8Q+cy/qFzGf/QuYx/6Fz1xv9HbegHsPc2ef/v3at+AHvP/B86l/EPncv4h47U8N74/I5u+ZdoU+Lb7p39VsPWE5G/S0K+ZxLFqFvUteUfs9hmoqduUbtfmQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFrj3wAAAP//j57jFA==")
syz_emit_ethernet(0x86, &(0x7f0000000000)={@multicast, @local, @void, {@ipv4={0x800, @icmp={{0x5, 0x4, 0x0, 0x0, 0x78, 0x0, 0x0, 0x49, 0x1, 0x0, @initdev={0xac, 0x1e, 0x1, 0x0}, @local}, @time_exceeded={0x3, 0x3, 0x0, 0x3, 0x0, 0x0, {0x17, 0x4, 0x0, 0x2, 0x0, 0xfffc, 0x0, 0x0, 0x2f, 0x0, @empty, @multicast2, {[@noop, @timestamp_addr={0x44, 0x44, 0x0, 0x1, 0x0, [{@remote, 0x80}, {@dev={0xac, 0x14, 0x14, 0x18}}, {@initdev={0xac, 0x1e, 0x0, 0x0}, 0x2000}, {@multicast1}, {@local}, {@local}, {@multicast1}, {@loopback}]}]}}}}}}}, 0x0)
syz_emit_ethernet(0x52, &(0x7f0000001000)={@local, @remote, @void, {@ipv4={0x800, @tcp={{0xc, 0x4, 0x0, 0x0, 0x44, 0x0, 0x0, 0xd8, 0x5, 0x0, @dev={0xac, 0x14, 0x14, 0x21}, @private=0xa010100, {[@timestamp_addr={0x44, 0x14, 0x5, 0x3, 0x0, [{@broadcast}, {@remote}]}, @ssrr={0x89, 0x3, 0xcb}, @lsrr={0x83, 0x3, 0x36}]}}, {{0x0, 0x0, 0x41424344, 0x41424344, 0x0, 0x6, 0x5}}}}}}, 0x0)
socket$isdn(0x22, 0x3, 0x25)
syz_emit_ethernet(0x26, &(0x7f0000000140)={@local, @multicast, @void, {@ipv4={0x800, @generic={{0x5, 0x4, 0x0, 0x12, 0x18, 0x63, 0x0, 0x4, 0x2f, 0x0, @dev={0xac, 0x14, 0x14, 0xf}, @loopback}, "00008848"}}}}, 0x0)
syz_emit_ethernet(0x7e, &(0x7f00000001c0)={@random, @remote, @val={@void, {0x8100, 0x0, 0x1, 0x3}}, {@ipv6={0x86dd, @gre_packet={0x0, 0x6, "381f34", 0x44, 0x2f, 0x0, @local, @mcast2, {[], {{0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x1, 0x88a8}, {}, {}, {}, {0x8, 0x22eb, 0xfffffffd}}}}}}}, 0x0)
bpf$BPF_PROG_RAW_TRACEPOINT_LOAD(0x5, &(0x7f0000000600)={0x3, 0x3, &(0x7f0000000000)=@framed={{0x7a, 0xa, 0x0, 0xff00, 0x0, 0x71, 0x10, 0x8f}}, &(0x7f0000000480)='syzkaller\x00'}, 0x80)
rt_sigsuspend(&(0x7f0000000000)={[0x480000000000]}, 0x8)
syz_mount_image$squashfs(&(0x7f0000000040), &(0x7f0000000240)='./file0\x00', 0x400, &(0x7f0000000080)=ANY=[], 0x0, 0x1cc, &(0x7f00000002c0)="$eJzKKC4s5mdgYPj7sSaZQYABDBgZeBguMDAysDAwMKgzQsQYmCDUeij/BZSeCZW2gfKbofRCKF1xS3vdmlN+Z0566motk2VmaPXUlkcWk0vtMXJjY+CRYGYITT2yqLiyKjsxJye1qHghQ8UtLtaK0ycYWK7bX1NpluB0+CPP4ZCk6aDDdMTHI2tGYwnnJClNMTa2TIWzZz7Ir2PTOMLwaAXzxjrPvMa6wtSpeWl5SVVZVVnzJk7cOLOxs7Fx5cS6qDS/VYwtKS4aUFcLbGY2VJ9koz3hXfuqh0kOrD0efs2njJVepzJfMl5YJHVqRdXMCV+UZjMafme4w1O2QkJDw0niioRFgwnDkTrbBleGilvJKQ0MaQphjElqbGJtW87MCWHmZ3NboNCSfIIp9CjH0pkSFgeEqk7+tNR865DoNmPbUwe2MzyHj/OsKegTNDouweC0UPC/DMiYhIaGMo21TEttF3wp0vgr4bXa2CmDwd2eaRksQFkaQORKKE8WrCcheYWHjqamUUpyQsMmiYQktwJDZYatezhXCzQwIEWbCgMDw3ZGWNxCwDUYYxSMglEwCkbBKBgFo2AUjIJRMApGwYgAgAAAAP//SyeUtQ==")
semctl$SEM_INFO(0x0, 0x4, 0x13, 0xffffffffffffffff)
bpf$BPF_PROG_DETACH(0x9, &(0x7f00000003c0)={@ifindex, 0xffffffffffffffff, 0xe, 0x20}, 0x20)
bpf$PROG_LOAD(0x5, &(0x7f00002a0fb8)={0xd, 0x4, &(0x7f0000000000)=ANY=[@ANYBLOB="850000007a000000760400000000000027000000000000009500000000000000"], &(0x7f0000000040)='GPL\x00', 0x4, 0x99, &(0x7f0000000180)=""/153, 0x0, 0x0, '\x00', 0x0, @sock_ops, 0xffffffffffffffff, 0x8, 0x0, 0xfffffffffffffc62, 0x10, 0x0, 0xfffffffffffffed8}, 0x23)
syz_mount_image$qnx6(&(0x7f0000009e00), &(0x7f0000000080)='./file0\x00', 0x2014001, &(0x7f00000003c0)=ANY=[], 0xff, 0x9e32, &(0x7f0000009e40)="$eJwsuAMT54fzNbu72di27XRs27Ztu+N8PzZi2zYXsW3bdrLJ3vrf53fewcx0zanq/Weccq5jjht2yD0DvHeA9w3w/gE+MMAHB/jQAB8e4IgBjhzgqAGOHuAjA3x0gI8N8PEBPjHAJwf41ACfHuAzA3x2wNAhQ4f8X3xhgC8O8KUBvjzAVwb46gBfG+DrA3xjgG8O8K0Bvj3Adwb47gDfG+D7A/xggB8O8KMBfjzATwb46QA/G+DnA/xigF8O8KsBfj3Abwb47QC/G+D3A/xhgD8O8KcB/jzAXwb46wB/G+DvA/xjgH8O8K8B/j3AfwY4ZoD/DvC/AY4d4JAAhwY4LMBxAhwe4LgBjhfg+AFOEOCEAU4U4MQBThLgpAFOFuDkAU4R4JQBThXg1AFOE+C0AU4X4PQBzhDgjAHOFODMAc4S4KwBzhbg7AHOEeCcAc4V4NwBzhPgvAHOF+D8AS4Q4IIBLhTgwgEuEuCiAS4W4OIBLhEgAS4Z4FIBLh3gMgEuG+ByAS4f4AoBrhjgSgGuHOAqAa4a4GoBrh7gGgGuGeBaAa4d4DoBrhvgegGuH+AGAW4Y4EYBbhzgJgFuGuBmAW4e4BYBbhngVgFuHeA2AW4b4HYBbh/gDgHuGOBOAe4c4C4B7hrgbgHuHuAeAe4Z4F4B7h3gPgHuG+B+Ae4f4AEBHhjgQQEeHOAhAR4a4GEBHh7gEQEeGeBRAR4d4DEBHhvgcQEeH+AJAZ4YoAGeFODJAZ4S4KkBnhbg6QGeEeCZAQ4CDAIMA4wCjANMAkwDzALMAywCLAOsAqwDbAJsA+wC7AM8K8CzAzwnwHMDPC/A8wO8IMALA7wowIsDvCTASwO8LMDLA7wiwCsDvCrAqwO8JsBrA7wuwOsDvCHAGwO8KcCbA7wlwFsDvC3A2wO8I8A7A7wrwLsDvCfAewO8L8D7A3wgwAcDfCjAhwMcEeDIAEcFODrARwJ8NMDHAnw8wCcCfDLApwJ8OsBnAnw2wOcCfD7AFwJ8McCXAnw5wFcCfDXA1wJ8PcA3AnwzwLcCfDvAdwJ8N8D3Anw/wA8C/DDAjwL8OMBPAvw0wM8C/DzALwL8MsCvAvw6wG8C/DbA7wL8PsAfAvwxwJ8C/DnAXwL8NcDfAvw9wD8C/DPAvwL8O8B/AhwT4L8B/hfg2ACHhDg0xGEhjhPi8BDHDXG8EMcPcYIQJwxxohAnDnGSECcNcbIQJw9xihCnDHGqEKcOcZoQpw1xuhCnD3GGEGcMcaYQZw5xlhBnDXG2EGcPcY4Q5wxxrhDnDnGeEOcNcb4Q5w9xgRAXDHGhEBcOcZEQFw1xsRAXD3GJEAlxyRCXCnHpEJcJcdkQlwtx+RBXCHHFEFcKceUQVwlx1RBXC3H1ENcIcc0Q1wpx7RDXCXHdENcLcf0QNwhxwxA3CnHjEDcJcdMQNwtx8xC3CHHLELcKcesQtwlx2xC3C3H7EHcIcccQdwpx5xB3CXHXEHcLcfcQ9whxzxD3CnHvEPcJcd8Q9wtx/xAPCPHAEA8K8eAQDwnx0BAPC/HwEI8I8cgQjwrx6BCPCfHYEI8L8fgQTwjxxBAN8aQQTw7xlBBPDfG0EE8P8YwQzwxxEGIQYhhiFGIcYhJiGmIWYh5iEWIZYhViHWITYhtiF2If4lkhnh3iOSGeG+J5IZ4f4gUhXhjiRSFeHOIlIV4a4mUhXh7iFSFeGeJVIV4d4jUhXhvidSFeH+INId4Y4k0h3hziLSHeGuJtId4e4h0h3hniXSHeHeI9Id4b4n0h3h/iAyE+GOJD4fhDhgzBESGODHFUiKNDfCTER0N8LMTHQ3wixCdDfCrEp0N8JsRnQ3wuxOdDfCHEF0N8KcSXQ3wlxFdDfC3E10N8I8Q3Q3wrxLdDfCfEd0N8L8T3Q/wgxA9D/CjEj0P8JMRPQ/wsxM9D/CLEL0P8KsSvQ/wmxG9D/C7E70P8IcQfQ/wpxJ9D/CXEX0P8LcTfQ/wjxD9D/CvEv0P8J8QxIf4b4n8hjg1xyBgcGuGwCMeJcHiE40Y4XoTjRzhBhBNGOFGEE0c4SYSTRjhZhJNHOEWEU0Y4VYRTRzhNhNNGOF2E00c4Q4QzRjhThDNHOEuEs0Y4W4SzRzhHhHNGOFeEc0c4T4TzRjhfhPNHuECEC0a40LAh/y8RLhrhYhEuHuESERLhkhEuFeHSES4T4bIRLhfh8hGuEOGKEa4U4coRrhLhqhGuFuHqEa4R4ZoRrhXh2hGuE+G6Ea4X4foRbhDhhhFuFOHGEW4S4aYRbhbh5hFuEeGWEW4V4dYRbhPhthFuF+H2Ee4Q4Y4R7hThzhHuEuGuEe4W4e4R7hHhnhHuFeHeEe4T4b4R7hfh/hEeEOGBER4U4cERHhLhoREeFuHhER4R4ZERHhXh0REeE+GxER4X4fERnhDhiREa4UkRnhzhKRGeGuFpEZ4e4RkRnhnhIMIgwjDCKMI4wiTCNMIswjzCIsIywirCOsImwjbCLsI+wrMiPDvCcyI8N8LzIjw/wgsivDDCiyK8OMJLIrw0wssivDzCKyK8MsKrIrw6wmsivDbC6yK8PsIbIrwxwpsivDnCWyK8NcLbIrw9wjsivDPCuyK8O8J7Irw3wvsivD/CByJ8MMKHInw4whERjoxwVISjI3wkwkcjfCzCxyN8IsInI3wqwqcjfCbCZyN8LsLnI3whwhcjfCnClyN8JcJXI3wtwtcjfCPCNyN8K8K3I3wnwncjfC/C9yP8IMIPI/wowo8j/CTCTyP8LMLPI/wiwi8j/CrCryP8JsJvI/wuwu8j/CHCHyP8KcKfI/wlwl8j/C3C3yP8I8I/I/wrwr8j/CfCMRH+G+F/EY6NcEiMQ2McFuM4MQ6PcdwYx4tx/BgniHHCGCeKceIYJ4lx0hgni3HyGKeIccoYp4px6hiniXHaGKeLcfoYZ4hxxhhninHmGGeJcdYYZ4tx9hjniHHOGOeKce4Y54lx3hjni3H+GBeIccEYF4px4RgXiXHRGBeLcfEYl4iRGJeMcakYl45xmRiXjXG5GJePcYUYV4xxpRhXjnGVGFeNcbUYV49xjRjXjHGtGNeOcZ0Y141xvRjXj3GDGDeMcaMYN45xkxg3jXGzGDePcYsYt4xxqxi3jnGbGLeNcbsYt49xhxh3jHGnGHeOcZcYd41xtxh3j3GPGPeMca8Y945xnxj3jXG/GPeP8YAYD4zxoBgPjvGQGA+N8bAYD4/xiBiPjPGoGI+O8ZgYj43xuBiPj/GEGE+M0RhPivHkGE+J8dQYT4vx9BjPiPHMGAcxBjGGMUYxxjEmMaYxZjHmMRYxljFrVjHWMTYxtjF2MfYxnhXj2TGeE+O5MZ4X4/kxXhDjhTFeFOPFMV4S46UxXhbj5TFeEeOVMV4V49UxXhPjtTFeF+P1Md4Q440
x3hTjzTHeEuOtMd4W4+0x3hHjnTHeFePdMd4Tj/f/9/x9Md4f4wMxPhjjQzE+HOOIGEfGOCrG0TE+EuOjMT4W4+MxPhHjkzE+FePTMT4T47MxPhfj8zG+EOOLMb4U48sxvhLjqzG+FuPrMb4R45sxvhXj2zG+E+O7Mb4X4/sxfhDjhzF+FOPHMX4S46cxfhbj5zF+EeOXMX4V49cxfhPjtzF+F+P3Mf4Q448x/hTjzzH+EuOvMf4W4+8x/hHjnzH+FePfMf4T45gY/43xvxjHxjgkwaEJDktwnASHJzhuguMlOH6CEyQ4YYITJThxgpMkOGmCkyU4eYJTJDhlglMlOHWC0yQ4bYLTJTh9gjMkOGOCMyU4c4KzJEOdNcHZEpw9wTkSnDPBuRKcO8F5Epw3wfkSnD/BBRJcMMGFElw4wUUSXDTBxRJcPMElEiTBJRNcKsGlE1wmwWUTXC7B5RNcIcEVE1wpwZUTXCXBVRNcLcHVE1wjwTUTXCvBtRNcJ8F1E1wvwfUT3CDBDRPcKMGNE9wkwU0T3CzBzRPcIsEtE9wqwa0T3CbBbRPcLsHtE9whwR0T3CnBnRPcJRkyZNcEd0tw9wT3SHDPBPdKcO8E90lw3wT3S3D/BA9I8MAED0rw4AQPSfDQBA9L8PAEj0jwyASPSvDoBI9J8NgEj0vw+ARPSPDEBE3wpARPTvCUBE9N8LQET0/wjATPTHCQYJBgmGCUYJxgkmCaYJZgnmCRYJlglWCdYJNgm2CXYJ/gWQmeneA5CZ6b4HkJnp/gBQlemOBFCV6c4CUJXprgZQlenuAVCV6Z4FUJXp3gNQlem+B1CV6f4A0J3pjgTQnenOAtCd6a4G0J3p7gHQnemeBdCd6d4D0J3pvgfQnen+ADCT6Y4EMJPpzgiARHJjgqwdEJPpLgowk+luDjCT6R4JMJPpXg0wk+k+CzCT6X4PMJvpDgiwm+lODLCb6S4KsJvpbg6wm+keCbCb6V4NsJvpPguwm+l+D7CX6Q4IcJfpTgxwl+kuCnCX6W4OcJfpHglwl+leDXCX6T4LcJfpfg9wn+kOCPCf6U4M8J/pLgrwn+luDvCf6R4J8J/pXg3wn+k+CYBP9N8L8ExyY4JMWhKQ5LcZwUh6c4borjpTh+ihOkOGGKE6U4cYqTpDhpipOlOHmKU6Q4ZYpTpTh1itOkOG2K06U4fYozpDhjijOlOHOKs6Q4a4qzpTh7inOkOGeKc6U4d4rzpDhvivOlOH+KC6S4YIoLpbhwioukuGiKi6W4eIpLpEiKS6a4VIpLp7hMisumuFyKy6e4QoorprhSiiunuEqKq6a4Woqrp7hGimumuFaKa6e4TorrprheiuunuEGKG6a4UYobp7hJipumuFmKm6e4RYpbprhVilunuE2K26a4XYrbp7hDijumuFOKO6e4S4q7prhbirunuEeKe6a4V4p7p7hPivumuF+K+6d4QIoHpnhQigeneEiKh6Z4WIqHp3hEikemeFSKR6d4TIrHpnhciseneEKKJ6ZoiieleHKKp6R4aoqnpXh6imekeGaKgxSDFMMUoxTjFJMU0xSzFPMUixTLFKsU6xSbFNsUuxT7FM9K8ewUz0nx3BTPS/H8FC9I8cIUL0rx4hQvSfHSFC9L8fIUr0jxyhSvSvHqFK9J8doUr0vx+hRvSPHGFG9K8eYUb0nx1hRvS/H2FO9I8c4U70rx7hTvSfHeFO9L8f4UH5hsyJD/4/2hFB9OcUSKI1McleLoFB9J8dEUH0vx8RSfSPHJFJ9K8ekUn0nx2RSfS/H5FF9I8cUUX0rx5RRfSfHVFF9L8fUU30jxzRTfSvHtFN9J8d0U30vx/RQ/SPHDFD9K8eMUP0nx0xQ/S/HzFL9I8csUv0rx6xS/SfHbFL9L8fsUf0jxxxR/SvHnFH9J8dcUf0vx9xT/SPHPFP9K8e8U/0lxTIr/pvhfimNTHJLh0AyHZThOhsMzHDfD8TIcP8MJMpwww4kynDjDSTKcNMPJMpw8wykynDLDqTKcOsNpMpw2w+kynD7DGTKcMcOZMpw5w1kynDXD2TKcPcM5Mpwzw7kynDvDeTKcN8P5Mpw/wwUyXDDDhTJcOMNFMlw0w8UyXDzDJTIkwyUzXCrDpTNcJsNlM1wuw+UzXCHDFTNcKcOVM1wlw1UzXC3D1TNcI8M1M1wrw7UzXCfDdTNcL8P1M9wgww0z3CjDjTPc5H+72SzDzTPcIsMtM9wqw60z3CbDbTPcLsPtM9whwx0z3CnDnTPcJcNdM9wtw90z3CPDPTPcK8O9M9wnw30z3C/D/TM8IMMDMzwow4MzPCTDQzM8LMPDMzwiwyMzPCrDozM8JsNjMzwuw+MzPCHDEzM0w5MyPDnDUzI8NcPTMjw9wzMyPDPDQYZBhmGGUYZxhkmGaYZZhnmGRYZlhlWGdYZNhm2GXYZ9hmdleHaG52R4bobnZXh+hhdkeGGGF2V4cYaXZHhphpdleHmGV2R4ZYZXZXh1htdkeG2G12V4fYY3ZHhjhjdleHOGt2R4a4a3ZXh7hndkeGeGd2V4d4b3ZHhvhvdleH+GD2T4YIYPZfhwhiMyHJnhqAxHZ/hIho9m+FiGj2f4RIZPZvhUhk9n+EyGz2b4XIbPZ/hChi9m+FKGL2f4SoavZvhahq9n+EaGb2b4VoZvZ/hOhu9m+F6G72f4QYYfZvhRhh9n+EmGn2b4WYafZ/hFhl9m+FWGX2f4TYbfZvhdht9n+EOGP2b4U4Y/Z/hLhr9m+FuGv2f4R4Z/ZvhXhn9n+E+GYzL8N8P/Mhyb4ZAch+Y4LMdxchye47g5jpfj+DlOkOOEOU6U48Q5TpLjpDlOluPkOU6R45Q5TpXj1DlOk+O0OU6X4/Q5zpDjjDnOlOPMOc6S46w5zpbj7DnOkeOcOc6V49w5zpPjvDnOl+P8OS6Q44I5LpTjwjkukuOiOS6W4+I5LpEjOS6ZM8X/zbJ0jsvkuGyOy+W4fI4r5LhijivluHKOq+S4ao6r5bh6jmvkuGaOa+W4do7r5LhujuvluH6OG+S4YY4b5bhxjpvkuGmOm+W4eY5b5LhljlvluHWO2+S4bY7b5bh9jjvkuGOOO+W4c4675LhrjrvluHuOe+S4Z457TfA/x5njvjnul+P+OR6Q44E5HpTjwTkekuOhOR6W4+E5HpHjkTkelePROR6T47E5Hpfj8TmekOOJOZrjSTmenOMpOZ6a42k5np7jGTmemeMgxyDHMMcoxzjHJMc0xyzHPMcixzLHKsc6xybHNscuxz7Hs3I8O8dzcjw3x/NyPD/HC3K8MMeLcrw4x0tyvDTHy3K8PMcrcrwyx6tyvDrHa3K8Nsfrcrw+xxtyvDHHm3K8Ocdbcrw1x9tyvD3HO3K8M8e7crw7x3tyvDfH+3K8P8cHcnwwx4dyfDjHETmOzHFUjqNzfCTHR3N87P/4/98pnszxqRyfzvGZHJ/N8bkcn8/xhRxfzPGlHF/O8ZUcX83xtRxfz/GNHN/M8a0c387xnRzfzfG9HN/P8YMcP8zxoxw/zvGTHD/N8bMcP8/xix
y/zPGrHL/O8Zscv83xuxy/z/GHHH/M8accf87xlxx/zfG3HH/P8Y8c/8zxrxz/zvGfHMfk+G+O/+U4NschBQ4tcFiB4xQ4vMBxCxyvwPELnKDACQucqMCJC5ykwEkLnKzAyQucosApC5yqwKkLnKbAaQucrsDpC5yhwBkLnKnAmQucpcBZC5ytwNkLnKPAOQucq8C5C5ynwHkLnK/A+QtcoMAFC1yowIULXKTARQtcrMDFC1yiQApcssClCly6wGUKXLbA5QpcvsAVClyxwJUKXLnAVQpctcDVCly9wDUKXLPAtQpcu8B1Cly3wPUKXL/ADQrcsMCNCty4wE0K3LTAzQrcvMAtCtyywK0K3LrAbQrctsDtCty+wB0K3LHAnQrcucBdCty1wN0K3L3APQrcs8C9Cty7wH0K3LfA/Qrcv8ADCjywwIMKPLjAQwo8tMDDCjy8wCMKPLLAowo8usBjCjy2wOMKPL7AEwo8sUALPKnAkws8pcBTCzytwNMLPKPAMwscFBgUGBYYFRgXmBSYFpgVmBdYFFgWWBVYF9gU2BbYFdgXeFaBZxd4ToHnFnhegecXeEGBFxZ4UYEXF3hJgZcWeFmBlxd4RYFXFnhVgVcXeE2B1xZ4XYHXF3hDgTcWeFOBNxd4S4G3FnhbgbcXeEeBdxZ4V4F3F3hPgfcWeF+B9xf4QIEPFvhQgQ8XOKLAkQWOKnB0gY8U+GiBjxX4eIFPFPhkgU8V+HSBzxT4bIHPFfh8gS8U+GKBLxX4coGvFPhqga8V+HqBbxT4ZoFvFfh2ge8U+G6B7xX4foEfFPhhgR8V+HGBnxT4aYGfFfh5gV8U+GWBXxX4dYHfFPhtgd8V+H2BPxT4Y4E/Ffhzgb8U+GuBvxX4e4F/FPhngX8V+HeB/xQ4psB/C/yvwLEFDilxaInDShynxOEljlvieCWOX+IEJU5Y4kQlTlziJCVOWuJkJU5e4hTl2LFTljhViVOXOE2J05Y4XYnTlzhDiTOWOFOJM5c4S4mzljhbibOXOEeJc5Y4V4lzlzhPifOWOF+J85e4QIkLlrhQiQuXuEiJi5a4WImLl7hEiZS4ZIlLlbh0icuUuGyJy5VDhw8ZgiuUuGKJK5W4comrlLhqiauVuHqJa5S4Zolrlbh2ievM/7/nX+L6JW5Q4oYlblTixiVuUuKmJW5W4uYlblHiliVuVeLWJW5T4rYlblfi9iXuUOKOJe5U4s4l7lLiriXuVuLuJe5R4p4l7lXi3iXuU+K+Je5X4v4lHlDigSUeVOLBJR5S4qElHlbi4SUeUeKRJR5V4tElHlPisSUeV+LxJZ5Q4oklWuJJJZ5c4iklnlriaSWeXuIZJZ5Z4qDEoMSwxKjEuMSkxLTErMS8xKLEssSqxLrEpsS2xK7EvsSzSjy7xHNKPLfE80o8v8QLSrywxItKvLjES0q8tMTLSry8xCtKvLLEq0q8usRrSry2xOtKvL7EG0q8scSbSry5xFtKvLXE20q8vcQ7SryzxLtKvLvEe0q8t8T7Sry/xAdKfLDEh0p8uMQRJY4scVSJo0t8pMRHS3ysxMdLfKLEJ0t8qsSnS3ymxGdLfK7E50t8ocQXS3ypxJdLfKXEV0t8rcTXS3yjxDdLfKvEt0t8p8R3S3yvxPdL/KDED0v8qMSPS/ykxE9L/KzEz0v8osQvS/yqxK9L/KbEb0v8rsTvS/yhxB9L/KnEn0v8pcRfS/ytxN9L/KPEP0v8q8S/S/ynxDEl/lvifyWOLXFIhUMrHFbhOBUOr3DcCsercPwKJ6hwwgonqnDiCiepcNIKJ6tw8gqnqHDKCqeqcOoKp6lw2gqnq3D6CmeocMYKZ6pw5gpnqXDWCmercPYK56hwzgrnqnDuCuepcN4K56tw/goXqHDBCheqcOEKF6lw0QoXq3DxCpeokAqXrHCpCpeucJkKl61wuQqXr3CFClescKUKV65wlQpXrXC1ClevcI0K16xwrQrXrnCdCtetcL0K169wgwo3rHCjCjeucJMKN61wswo3r3CLCrescKsKt65wmwq3rXC7CrevcIcKd6xwpwp3rnCXCnetcLcKd69wjwr3rHCvCveucJ8K961wvwr3r/CACg+s8KAKD67wkAoPrfCwCg+v8IgKj6zwqAqPrvCYCo+t8LgKj6/whApPrNAKT6rw5ApPqfDUCk+r8PQKz6jwzAoHFQYVhhVGFcYVJhWmFWYV5hUWFZYVVhXWFTYVthV2FfYVnlXh2RWeU+G5FZ5X4fkVXlDhhRVeVOHFFV5S4aUVXlbh5RVeUeGVFV5V4dUVXlPhtRVeV+H1Fd5Q4Y0V3lThzRXeUuGtFd5W4e0V3lHhnRXeVeHdFd5T4b0V3lfh/RU+UOGDFT5U4cMVjqhwZIWjKhxd4SMVPlrhYxU+XuETFT5Z4VMVPl3hMxU+W+FzFT5f4QsVvljhSxW+XOErFb5a4WsVvl7hGxW+WeFbFb5d4TsVvlvhexW+X+EHFX5Y4UcVflzhJxV+WuFnFX5e4RcVflnhVxV+XeE3FX5b4XcVfl/hDxX+WOFPFf5c4S8V/lrhbxX+XuEfFf5Z4V8V/l3hPxWOqfDfCv+rcGyFQ2ocWuOwGsepcXiN49Y4Xo3j1zhBjRPWOFGNE9c4SY2T1jhZjZPXOEWNU9Y4VY1T1zhNjdPWOF2N09c4Q40z1jhTjTPXOEuNs9Y4W42z1zhHjXPWOFeNc9c4T43z1jhfjfPXuECNC9a4UI0L17hIjYvWuFiNi9e4RI3UuGSNS9W4dI3L1LhsjcvVuHyNK9S4Yo0r1bhyjavUuGqNq9W4eo1r1LhmjWvVuHaN69S4bo3r1bh+jRvUuGGNG9W4cY2b1LhpjZvVuHmNW9S4ZY1b1bh1jdvUuG2N29W4fY071LhjjTvVuHONu9S4a4271bh7jXvUuGeNe9W4d4371LhvjfvVuH+NB9R4YI0H1XhwjYfUeGiNh9V4eI1H1HhkjUfVeHSNx9R4bI3H1Xh8jSfUeGKN1nhSjSfXeEqNp9Z4Wo2n13hGjWfWOKgxqDGsMaoxrjGpMa0xqzGvsaixrLGqsa6xqbGtsauxr/GsGs+u8Zwaz63xvBrPr/GCGi+s8aIaLx4+ZMj/sXdpjZfVeHmNV9R4ZY1X1Xh1jdfUeG2N19V4fY031HhjjTfVeHONt9R4a4231Xh7jXfUeGeNd9V4d4331HhvjffVeH+ND9T4YI0P1fhwjSNqHFnjqBpH1/hIjY/W+FiNj9f4RI1P1vhUjU/X+EyNz9b4XI3P1/hCjS/W+FKNL9f4So2v1vhaja/X+EaNb9b4Vo1v1/hOje/W+F6N79f4QY0f1vhRjR/X+EmNn9b4WY2f1/hFjV/W+FWNX9f4TY3f1vhdjd/X+EONP9b4U40/1/hLjb/W+FuNv9f4R41/1vhXjX/X+E+NY2r8t8b/ahxb45AGhzY4rMFxGhze4LgNjtfg+A1O0OCEDU7U4MQNTtLgpA1O1uDkDU7R4JQNTtXg1A1O0+C0DU7X4PQNztDgjA3O1ODMDc7S4KwNztbg7A3O0eCcDc7V4
NwNztPgvA3O1+D8DS7Q4IINLtTgwg0u0uCiDS7W4OINLtEgDS7Z4FINLt3gMg0u2+ByDS7f4AoNrtjgSg2u3OAqDa7a4GoNrt7gGg2u2eBaDa7d4DoNrtvgeg2u3+AGDW7Y4EYNbtzgJg1u2uBmDW7e4BYNbtngVg1u3eA2DW7b4HYNbt/gDg3u2OBODe7c4C4N7trgbg3u3uAeDe7Z4F4N7t3gPg3u2+B+De7f4AENHtjgQQ0e3OAhDR7a4GENHt7gEQ0e2eBRDR7d4DENHtvgcQ0e3+AJDZ7YoA2e1ODJDZ7S4KkNntbg6Q2e0eCZDQ4aDBoMG4wajBtMGkwbzBrMGywaLBusGqwbbBpsG+wa7Bs8q8GzGzynwXMbPK/B8xu8oMELG7yowYsbvKTBSxu8rMHLG7yiwSsbvKrBqxu8psFrG7yuwesbvKHBGxu8qcGbG7ylwVsbvK3B2xu8o8E7G7yrwbsbvKfBexu8r8H7G3ygwQcbfKjBhxsc0eDIBkc1OLrBRxp8tMHHGny8wScafLLBpxp8usFnGny2wecafL7BFxp8scGXGny5wVcafLXB1xp8vcE3GnyzwbcafLvBdxp8t8H3Gny/wQ8a/LDBjxr8uMFPGvy0wc8a/LzBLxr8ssGvGvy6wW8a/LbB7xr8vsEfGvyxwZ8a/LnBXxr8tcHfGvy9wT8a/LPBvxr8u8F/GhzT4L8N/tfg2AaHtDi0xWEtjtPi8BbHbXG8FsdvcYIWJ2xxohYnbnGSFidtcbIWJ29xihanbHGqFqducZoWp21xuhanb3GGFmdscaYWZ25xlhZnbXG2FmdvcY4W52xxrhbnbnGeFudtcb4W529xgRYXbHGhFhducZEWF21xsRYXb3GJFmlxyRaXanHpFpdpcdkWl2tx+RZXaHHFFldqceUWV2lx1RZXa3H1Ftdocc0W12px7RbXaXHdFtdrcf0WN2hxwxY3anHjFjdpcdMWN2tx8xa3aHHLFrdqcesWt2lx2xa3a3H7FndocccWd2px5xZ3aXHXFndrcfcW92hxzxb3anHvFvdpcd8W92tx/xYPaPHAFg9q8eAWD2nx0BYPa/HwFo9o8cgWj2rx6BaPafHYFo9r8fgWT2jxxBZt8aQWT27xlBZPbfG0Fk9v8YwWz2xx0GLQYthi1GLcYtJi2mLWYt5i0WLZYtVi3WLTYtti12Lf4lktnt3iOS2e2+J5LZ7f4gUtXtjiRS1e3OIlLV7a4mUtXt7iFS1e2eJVLV7d4jUtXtvidS1e3+INLd7Y4k0t3tziLS3e2uJtLd7e4h0t3tniXS3e3eI9Ld7b4n0t3t/iAy0+2OJDLT7c4ogWR7Y4qsXRLT7S4qMtPtbi4y0+0eKTLT7V4tMtPtPisy0+1+LzLb7Q4ostvtTiyy2+0uKrLb7W4ustvtHimy2+1eLbLb7T4rstvtfi+y1+0OKHLX7U4sctftLipy1+1uLnLX7R4pctftXi1y1+0+K3LX7X4vct/tDijy3+1OLPLf7S4q8t/tbi7y3+0eKfLf7V4t8t/tPimBb/bfG/Fse2OKTDoR0O63CcDod3OG6H43U4focTdDhhhxN1OHGHk3Q4aYeTdTh5h1N0OGWHU3U4dYfTdDhth9N1OH2HM3Q4Y4czdThzh7N0OGuHs3U4e4dzdDhnh3N1OHeH83Q4b4fzdTh/hwt0uGCHC3W4cIeLdLhoh4t1uHiHS3RIh0t2uFSHS3e4TIfLdrhch8t3uEKHK3a4Uocrd7hKh6t2uFqHq3e4RodrdrhWh2t3uE6H63a4Xofrd7hBhxt2uFGHG3e4SYebdrhZh5t3uEWHW3a4VYdbd7hNh9t2uF2H23e4Q4c7drhThzt3uEuHu3a4W4e7d7hHh3t2uFeHe3e4T4f7drhfh/t3eECHB3Z4UIcHd3hIh4d2eFiHh3d4RIdHdnhUh0d3eEyHx3Z4XIfHd3hChyd2jPM/q+nJHZ7S4akdntbh6R2e0eGZHQ46DDoMO4w6jDtMOkw7zDrMOyw6LDusOqw7bDpsO+w67Ds8q8OzOzynw3M7PK/D8zu8oMMLO7yow4s7vKTDSzu8rMPLO7yiwys7vKrDqzu8psNrO7yuw+s7vKHDGzu8qcObO7ylw1s7vK3D2zu8o8M7O7yrw7s7vKfDezu8r8P7O3ygwwc7fKjDhzsc0eHIDkd1OLrDRzp8tMPHOny8wyc6fLLDpzp8usNnOny2w+c6fL7DFzp8scOXOny5w1c6fLXD1zp8vcM3Onyzw7c6fLvDdzp8t8P3Ony/ww86/LDDjzr8uMNPOvy0w886/LzDLzr8ssOvOvy6w286/LbD7zr8vsMfOvyxw586/LnDXzr8tcPfOvy9wz86/LPDvzr8u8N/OhzT4b8d/tfh2A6H9Di0x2E9jtPj8B7H7XG8HsfvcYIeJ+xxoh4n7nGSHiftcbIeJ+9xih6n7HGqHqfucZoep+1xuh6n73GGHmfscaYeZ+5xlh5n7XG2HmfvcY4e5+xxrh7n7nGeHuftcb4e5+9xgR4X7HGhHhfucZEeF+1xsR4X73GJHulxyR6X6nHpHpfpcdkel+tx+R5X6HHFHlfqceUeV+lx1R5X63H1Htfocc0e1+px7R7X6XHdHtfrcf0eN+hxwx436nHjHjfpcdMeN+tx8x636HHLHrfqceset+lx2x6363H7Hnfoccced+px5x536XHXHnfrcfce9+hxzx736nHvHvfpcd8e9+tx/x4P6PHAHg/q8eAeD+nx0B4P6/HwHo/o8cgej+rx6B6P6fHYHo/r8fgeT+jxxB7t8aQeT+7xlB5P7fG0Hk/v8Ywez+xx0GPQY9hj1GPcY9Jj2mPWY95j0WPZY9Vj3WPTY9tj12Pf41k9nt3jOT2e2+N5PZ7f4wU9XtjjRT1e3OMlPV7a42U9Xt7jFT1e2eNVPV7d4zU9XtvjdT1e3+MNPd7Y40093tzjLT3e2uNtPd7e4x093tnjXT3e3eM9Pd7b43093t/jAz0+2ONDPT7c44geR/Y4qsfR/f9Hsz1F4X0AyhZPatu2tWvbtm3bu7b5/W3Utq2gTG27qY3krnPXOfM+b/M264dPDfDpAT4zwGcH+NwAhw1w+ABHDHDkAJ8f4AsDfHGALw3w5QG+MsBRA3x1gK8N8PUBvjHANwf41gDfHuA7A3x3gO8N8P0BfjDADwf40QA/HuAnA/x0gJ8N8PMBfjHALwf41QC/HuA3A/x2gN8NcPQAvx/gDwP8cYA/DfDnAf4ywF8H+NsAfx/gHwP8c4B/DfDvAf4zwH8H+N8Axwxw7ACHBDg0wHECHDfA8QIcP8AJApwwwIkCnDjASQKcNMDJApw8wCkCnDLAqQKcOsBpApw2wOkCnD7AGQKcMcCZApw5wFkCnDXA2QKcPcA5ApwzwLkCnDvAeQKcN8D5Apw/wAUCXDDAhQJcOMBFAlw0wMUCXDzAJQJcMsClAiTApQNcJsBlA1wuwOUDXCHAFQNcKcCVA1wlwFUDXC3A1QNcI8A1A1wrwLUDXCfAdQNcL8D1A9wgwA0D3CjAjQPc
JMBNA9wswM0D3CLALQPcKsCtA9wmwG0D3C7A7QPcIcAdA9wpwJ0D3CXAXQPcLcDdA9wjwD0D3CvAvQPcJ8B9A9wvwP0DPCDAAwM8KMCDAzwkwEMDPCzAwwM8IsAjAzwqwKMDPCbAYwM8LsDjAzwhwBMDPCnAkwM0wFMCPDXA0wI8PcAzAjwzwLMCPDvAcwI8N8DzAjw/wAsCvDDAiwK8OMBLArw0wMsCvDzAKwK8MsCrArw6wEGAQYBhgFGAcYBJgGmAWYB5gEWAZYBVgHWATYBtgF2AfYDXBHhtgNcFeH2ANwR4Y4A3BXhzgLcEeGuAtwV4e4B3BHhngHcFeHeA9wR4b4D3BXh/gA8E+GCADwX4cICPBPhogI8F+HiATwT4ZIBPBfh0gM8E+GyAzwU4LMDhAY4IcGSAzwf4QoAvBvhSgC8H+EqAowJ8NcDXAnw9wDcCfDPAtwJ8O8B3Anw3wPcCfD/ADwL8MMCPAvw4wE8C/DTAzwL8PMAvAvwywK8C/DrAbwL8NsDvAhwd4PcB/hDgjwH+FODPAf4S4K8B/hbg7wH+EeCfAf4V4N8B/hPgvwH+F+CYAMcGOCTEoSGOE+K4IY4X4vghThDihCFOFOLEIU4S4qQhThbi5CFOEeKUIU4V4tQhThPitCFOF+L0Ic4Q4owhzhTizCHOEuKsIc4W4uwhzhHinCHOFeLcIc4T4rwhzhfi/CEuEOKCIS4U4sIhLhLioiEuFuLiIS4R4pIhLhUiIS4d4jIhLhviciEuH+IKIa4Y4kohrhziKiGuGuJqIa4e4hohrhniWiGuHeI6Ia4b4nohrh/iBiFuGOJGIW4c4iYhbhriZiFuHuIWIW4Z4lYhbh3iNiFuG+J2IW4f4g4h7hjiTiHuHOIuIe4a4m4h7h7iHiHuGeJeIe4d4j4h7hvifiHuH+IBIR4Y4kEhHhziISEeGuJhIR4e4hEhHhniUSEeHeIxIR4b4nEhHh/iCSGeGOJJIZ4coiGeEuKpIZ4W4ukhnhHimSGeFeLZIZ4T4rkhnhfi+SFeEOKFIV4U4sUhXhLipSFeFuLlIV4R4pUhXhXi1SEOQgxCDEOMQoxDTEJMQ8xCzEMsQixDrEKsQ2xCbEPsQuxDvCbEa0O8LsTrQ7whxBtDvCnEm0O8JcRbQ7wtxNtDvCNk3DtDvCvEu0O8J8R7Q7wvxPtDfCDEB0N8KMSHQ3wkxEdDfCzEx0N8IsQnQ3wqxKdDfCbEZ0N8LsRhIQ4PcUSII0N8PsQXQnwxxJdCfDnEV0IcFeKrIb4W4ushvhHimyG+FeLbIb4T4rshvhfi+yF+EOKHIX4U4schfhLipyF+FuLnIX4R4pchfhXi1yF+E+K3IX4X4ugQvw/xhxB/DPGnEH8O8ZcQfw3xtxB/D/GPEP8M8a8Q/w7xnxD/DfG/EMeEODbEIREOjXCcCMeNcLwIx49wgggnjHCiCCeOcJIIJ41wsggnj3CKCKeMcKoIp45wmginjXC6CKePcIYIZ4xwpghnjnCWCGeNcLYIZ49wjgjnjHCuCOeOcJ4I541wvgjnj3CBCBeMcKEIF45wkQgXjXCxCBePcIkIl4xwqQiJcOkIl4lw2QiXi3D5CFeIcMUIV4pw5QhXiXDVCFeLcPUI14hwzQjXinDtCNeJcN0I14tw/Qg3iHDDCDeKcOMIN4lw0wg3i3DzCLeIcMsIt4pw6wi3iXDbCLeLcPsId4hwxwh3inDnCHeJcNcId4tw9wj3iHDPCPeKcO8I94lw3wj3i3D/CA+I8MAID4rw4AgPifDQCA+L8PAIj4jwyAiPivDoCI+J8NgIj4vw+AhPiPDECE+K8OQIjfCUCE+N8LQIT4/wjAjPjPCsCM+O8JwIz43wvAjPj/CCCC+M8KIIL47wkggvjfCyCC+P8IoIr4zwqgivjnAQYRBhGGEUYRxhEmEaYRZhHmERYRlhFWEdYRNhG2EXYR/hNRFeG+F1EV4f4Q0R3hjhTRHeHOEtEd4a4W0R3h7hHRHeGeFdEd4d4T0R3hvhfRHeH+EDET4Y4UMRPhzhIxE+GuFjET4e4RMRPhnhUxE+HeEzET4b4XMRDotweIQjIhwZ4fMRvhDhixG+FOHLEb4S4agIX43wtQhfj/CNCN+M8K0I347wnQjfjfC9CN+P8IMIP4zwowg/jvCTCD+N8LMIP4/wiwi/jPCrCL+O8JsIv43wuwhHR/h9hD9E+GOEP0X4c4S/RPhrhL9F+HuEf0T4Z4R/Rfh3hP9E+G+E/0U4JsKxEQ6JcWiM48Q4bozjxTh+jBPEOGGME8U4cYyTxDhpjJPFOHnM0ClinDLGqWKcOsZpYpw2xulinD7GGWKcMcaZYpw5xllinDXG2WKcPcY5YpwzxrlinDvGeWKcN8b5Ypw/xgViXDDGhWJcOMZFYlw0xsViXDzGJWJcMsalYiTGpWNcJsZlY1wuxuVjXCHGFWNcKcaVY1wlxlVjXC3G1WNcI8Y1Y1wrxrVjXCfGdWNcL8b1Y9wgxg1j3CjGjWPcJMZNY9wsxs1j3CLGLWPcKsatY9wmxm1j3C7G7WPcIcYdY9wpxp1j3CXGXWPcLcbdY9wjxj1j3CvGvWPcJ8Z9Y9wvxv1jPCDGA2M8KMaDYzwkxkNjPCzGw2M8IsYjYzwqxqNjPCbGY2M8LsbjYzwhxhNjPCnGk2M0xlNiPDXG02I8PcYzYjwzxrNiPDvGc2I8N8bzYjw/xgtivDDGi2K8OMZLYrw0xstivDzGK2K8MsarYrw6xkGMQYxhjFGMcYxJjGmMWYx5jEWMZYxVjHWMTYxtjF2MfYzXxHhtjNfFeH2MN8R4Y4w3xXhzjLfEeGuMt8V4e4x3xHhnjHfFeHeM98R4b4z3xXh/jA/E+GCMD8X4cIyPxPhojI/F+HiMT8T4ZIxPxfh0jM/E+GyMz8U4LMbhMY6IcWSMz8f4QowvxvhSjC/H+EqMo2J8NcbXYnw9xjdifDPGt2J8O8Z3Ynw3xvdifD/GD2L8MMaPYvw4xk9i/DTGz2L8PMYvYvwyxq9i/DrGb2L8NsbvYhwd4/cx/hDjjzH+FOPPMf4S468x/hbj7zH+EeOfMf4V498x/hPjvzH+F+OYGMfGOCTBoQmOk+C4CY6X4PgJTpDghAlOlODECU6S4KQJTpbg5AlOkeCUCU6V4NQJTpPgtAlOl+D0Cc6Q4IwJzpTgzAnOkuCsCc6W4OwJzpHgnAnOleDcCc6T4LwJzpfg/AkukOCCCS6U4MIJLpLgogkuluDiCS6R4JIJLpUgCS6d4DIJLpvgcgkun+AKCa6Y4EoJrpzgKgmumuBqCa6e4BoJrpngWgmuneA6Ca6b4HoJrp/gBglumOBGCW6c4CYJbprgZglunuAWCW6Z4FYJbp3gNglum+B2CW6f4A4J7pjgTgnunOAuCe6a4G4J7p7gHgnumeBeCe6d4D4J7pvgfgnun+ABCR6Y4EEJHpzgIQkemuBhCR6e4BEJHpngUQkeneAxCR6b4HEJHp/gCQmemOBJCZ6coAmekuCpCZ6W4OkJnpHgmQmeleDZCZ6T4LkJnpfg+Ql
ekOCFCV6U4MUJXpLgpQleluDlCV6R4JUJXpXg1QkOEgwSDBOMEowTTBJME8wSzBMsEiwTrBKsE2wSbBPsEuwTvCbBaxO8LsHrE7whwRsTvCnBmxO8JcFbE7wtwdsTvCPBOxO8K8G7E7wnwXsTvC/B+xN8IMEHE3wowYcTfCTBRxN8LMHHE3wiwScTfCrBpxN8JsFnE3wuwWEJDk9wRIIjE3w+wRcSfDHBlxJ8OcFXEhyV4KsJvpbg6wm+keCbCb6V4NsJvpPguwm+l+D7CX6Q4IcJfpTgxwl+kuCnCX6W4OcJfpHglwl+leDXCX6T4LcJfpfg6AS/T/CHBH9M8KcEf07wlwR/TfC3BH9P8I8E/0zwrwT/TvCfBP9N8L8ExyQ4NsEhKQ5NcZwUx01xvBTHT3GCFCdMcaIUJ05xkhQnTXGyFCdPcYoUp0xxqhSnTnGaFKdNcboUp09xhhRnTHGmFGdOcZYUZ01xthRnT3GOFOdMca4U505xnhTnTXG+FOdPcYEUF0xxoRQXTnGRFBdNcbEUF09xiRSXTHGpFElx6RSXSXHZFJdLcfkUV0hxxRRXSnHlFFdJcdUUV0tx9RTXSHHNFNdKce0U10lx3RTXS3H9FDdIccMUN0px4xQ3SXHTFDdLcfMUt0hxyxS3SnHrFLdJcdsUt0tx+xR3SHHHFHdKcecUd0lx1xR3S3H3FPdIcc8U90px7xT3SXHfFPdLcf8UD0jxwBQPSvHgFA9J8dAUD0vx8BSPSPHIFI9K8egUj0nx2BSPS/H4FE9I8cQUT0rx5BRN8ZQUT03xtBRPT/GMFM9M8awUz07xnBTPTfG8FM9P8YIUL0zxohQvTvGSFC9N8bIUL0/xihSvTPGqFK9OcZBikGKYYpRinGKSYppilmKeYpFimWKVYp1ik2KbYpdin+I1KV6b4nUpXp/iDSnemOJNKd6c4i0p3pribSnenuIdKd6Z4l0p3p3iPSnem+J9Kd6f4gMpPpjiQyk+nOIjKT6a4mMpPp7iEyk+meJTKT6d4jMpPpvicykOS3F4iiNSHJni8ym+kOKLKb6U4sspvpLiqBRfTfG1FF9P8Y0U30zxrRTfTvGdFN9N8b0U30/xgxQ/TPGjFD9O8ZMUP03xsxQ/T/GLFL9M8asUv07xmxS/TfG7FEen+H2KP6T4Y4o/pfhzir+k+GuKv6X4e4p/pPhnin+l+HeK/6T4b4r/pTgmxbEpDslwaIbjZDhuhuNlOH6GE2Q4YYYTZThxhpNkOGmGk2U4eYZTZDhlhlNlOHWG02Q4bYbTZTh9hjNkOGOGM2U4c4azZDhrhrNlOHuGc2Q4Z4ZzZTh3hvNkOG+G82U4f4YLZLhghgtluHCGi2S4aIaLZbh4hktkuGSGS2VIhktnuEyGy2a4XIbLZ7hChitmuFKGK2e4SoarZrhahqtnuEaGa2a4VoZrZ7hOhutmuF6G62e4QYYbZrhRhhtnuEmGm2a4WYabZ7hFhltmuFWGW2e4TYbbZrhdhttnuEOGQ8aM/f/ZOcNdMtw1w90y3D3DPTLcM8O9Mtw7w30y3DfD/TLcP8MDMjwww4MyPDjDQzI8NMPDMjw8wyMyPDLDozI8OsNjMjw2w+MyPD7DEzI8McOTMjw5QzM8JcNTMzwtw9MzPCPDMzM8K8OzMzwnw3MzPC/D8zO8IMMLM7wow4szvCTDSzO8LMPLM7wiwyszvCrDqzMcZBhkGGYYZRhnmGSYZphlmGdYZFhmWGVYZ9hk2GbYZdhneE2G12Z4XYbXZ3hDhjdmeFOGN2d4S4a3ZnhbhrdneEeGd2Z4V4Z3Z3hPhvdmeF+G92f4QIYPZvhQhg9n+EiGj2b4WIaPZ/hEhk9m+FSGT2f4TIbPZvhchsMyHJ7hiAxHZvh8hi9k+GKGL2X4coavZDgqw1czfC3D1zN8I8M3M3wrw7czfCfDdzN8L8P3M/wgww8z/CjDjzP8JMNPM/wsw88z/CLDLzP8KsOvM/wmw28z/C7D0Rl+n+EPGf6Y4U8Z/pzhLxn+muFvGf6e4R8Z/pnhX/+HmTP8N8P/MhyT4dj/2WeOQ3McJ8dxcxwvx/FznCDHCXOcKMeJc5wkx0lznCzHyXOcIscpc5wqx6lznCbHaXOcLsfpc5whxxlznCnHmXOcJcdZc5wtx9lznCPHOXOcK8e5c5wnx3lznC/H+XNcIMcFc1wox4VzXCTHRXNcLMfFc1wixyVzXCpHclw6x2VyXDbH5XJcPscVclwxx5VyXDnHVXJcNcfVclw9xzVyXDPHtXJcO8d1clw3x/VyXD/HDXLcMMeNctw4x01y3DTHzXLcPMctctwyx61y3DrHbXLcNsftctw+xx1y3DHHnXLcOcddctw1x91y3D3HPXLcM8e9ctw7x31y3DfH/XLcP8cDcjwwx4NyPDjHQ3I8NMfDcjw8xyNyPDLHo3I8Osdjcjw2x+NyPD7HE3I8MceTcjw5R3M8JcdTczwtx9NzPCPHM3M8K8ezczwnx3NzPC/H83O8IMcLc7wox4tzvCTHS3O8LMfLc7wixytzvCrHq3Mc5BjkGOYY5RjnmOSY5pjlmOdY5FjmWOVY59jk2ObY5djneE2O1+Z4XY7X53hDjjfmeFOON+d4S4635nhbjrfneEeOd+Z4V45353hPjvfmeF+O9+f4QI4P5vhQjg/n+EiOj+b4WI6P5/hEjk/m+FSOT+f4TI7P5vhcjsNyHJ7jiBxH5vh8ji/k+GKOL+X4co6v5Dgqx1dzfC3H13N8I8c3c3wrx7dzfCfHd3N8L8f3c/wgxw9z/CjHj3P8JMdPc/wsx89z/CLHL3P8Ksevc/wmx29z/C7H0Tl+n+MPOf6Y4085/pzjLzn+muNvOf6e4x85/pnjXzn+neM/Of6b4385jslxbI5DChxa4DgFjlvgeAWOX+AEBU5Y4EQFTlzgJAVOWuBkEw4Z8j+dKQqcssCpCpy6wGkKnLbA6QqcvsAZCpyxwJkKnLnAWQqctcDZCpy9wDkKnLPAuQqcu8B5Cpy3wPkKnL/ABQpcsMCFCly4wEUKXLTAxQpcvMAlClyywKUKpMClC1ymwGULXK7A5QtcocAVC1ypwJULXKXAVQtcrcDVC1yjwDULXKvAtQtcp8B1C1yvwPUL3KDADQvcqMCNC9ykwE0L3KzAzQvcosAtC9yqwK0L3KbAbQvcrsDtC9yhwB0L3KnAnQvcpcBdC9ytwN0L3KPAPQvcq8C9C9ynwH0L3K/A/Qs8oMADCzyowIMLPKTAQws8rMDDCzyiwCMLPKrAows8psBjCzyuwOMLPKHAEws8qcCTC7TAUwo8tcDTCjy9wDMKPLPAswo8u8BzCjy3wPMKPL/ACwq8sMCLCry4wEsKvLTAywq8vMArCryywKsKvLrAQYFBgWGBUYFxgUmBaYFZgXmBRYFlgVWBdYFNgW2BXYF9gdcUeG2B1xV4fYE3FHhjgTcVeHOBtxR4a4G3FXh7gXcUeGeBdxV4d4H3FHhvgfcVeH+BDxT4YIEPFfhwgY8U+GiBjxX4eIFPFPhkgU8V+HSBzxT4bIHPFT
iswOEFjihwZIHPF/hCgS8W+FKBLxf4SoGjCny1wNcKfL3ANwp8s8C3Cny7wHcKfLfA9wp8v8APCvywwI8K/LjATwr8tMDPCvy8wC8K/LLArwr8usBvCvy2wO8KHF3g9wX+UOCPBf5U4M8F/lLgrwX+VuDvBf5R4J8F/lXg3wX+U+C/Bf5X4JgCxxY4pMShJY5T4rgljlfi+CVOUOKEJU5U4sQlTlLipCVOVuLkJU5R4pQlTlXi1CVOU+K0JU5X4vQlzlDijCXOVOLMJc5S4qwlzlbi7CXOUeKcJc5V4twlzlPivCXOV+L84w1xgRIXLHGhEhcucZESFy1xsRIXL3GJEpcscakSKXHpEpcpcdkSlytx+RJXKHHFElcqceUSVylx1RJXK3H1Etcocc0S1ypx7RLXKXHdEtcrcf0SNyhxwxI3KnHjEjcpcdMSNytx8xK3KHHLErcqcesStylx2xK3K3H7EncocccSdypx5xJ3KXHXEncrcfcS9yhxzxL3KnHvEvcpcd8S9ytx/xIPKPHAEg8q8eASDynx0BIPK/HwEo8o8cgSjyrx6BKPKfHYEo8r8fgSTyjxxBJPKvHkEi3xlBJPLfG0Ek8v8YwSzyzxrBLPLvGcEs8t8bwSzy/xghIvLPGiEi8u8ZISLy3xshIvL/GKEq8s8aoSry5xUGJQYlhiVGJcYlJiWmJWYl5iUWJZYlViXWJTYltiV2Jf4jUlXlvidSVeX+INJd5Y4k0l3lziLSXeWuJtJd5e4h0l3lniXSXeXeI9Jd5b4n0l3l/iAyU+WOJDJT5c4iMlPlriYyU+XuITJT5Z4lMlPl3iMyU+W+JzJQ4rcXiJI0ocWeLzJb5Q4oslvlTiyyW+UuKoEl8t8bUSXy/xjRLfLPGtEt8u8Z0S3y3xvRLfL/GDEj8s8aMSPy7xkxI/LfGzEj8v8YsSvyzxqxK/LvGbEr8t8bsSR5f4fYk/lPhjiT+V+HOJv5T4a4m/lfh7iX+U+GeJf5X4d4n/lPhvif+VOKbEsSUOqXBoheNUOG6F41U4foUTVDhhhRNVOHGFk1Q4aYWTVTh5hVNUOGWFU1U4dYXTVDhthdNVOH2FM1Q4Y4UzVThzhbNUOGuFs1U4e4VzVDhnhXNVOHeF81Q4b4XzVTh/NdQFKlywwoUqXLjCRSpctMLFKly8wiUqXLLCpSqkwqUrXKbCZStcrsLlK1yhwhUrXKnClStcpcJVK1ytwtUrXKPCNStcq8K1K1ynwnUrXK/C9SvcoMINK9yowo0r3KTCTSvcrMLNK9yiwi0r3KrCrSvcpsJtK9yuwu0r3KHCHSvcqcKdK9ylwl0r3K3C3Svco8I9K9yrwr0r3KfCfSvcr8L9KzygwgMrPKjCgys8pMJDKzyswsMrPKLCIys8qsKjKzymwmMrPK7C4ys8ocITKzypwpMrtMJTKjy1wtMqPL3CMyo8s8KzKjy7wnMqPLfC8yo8v8ILKrywwosqvLjCSyq8tMLLKry8wisqvLLCqyq8usJBhUGFYYVRhXGFSYVphVmFeYVFhWWFVYV1hU2FbYVdhX2F11R4bYXXVXh9hTdUeGOFN1V4c4W3VHhrhbdVeHuFd1R4Z4V3VXh3hfdUeG+F91V4f4UPVPhghQ9V+HCFj1T4aIWPVfh4hU9U+GSFT1X4dIXPVPhshc9VOKzC4RWOqHBkhc9X+EKFL1b4UoUvV/hKhaMqfLXC1yp8vcI3KnyzwrcqfLvCdyp8t8L3Kny/wg8q/LDCjyr8uMJPKvy0ws8q/LzCLyr8ssKvKvy6wm8q/LbC7yocXeH3FaNH/a/n+KnCnyv8pcJfK/ytwt8r/KPCPyv8q8K/K/ynwn8r/K/CMRWOrXBIjUNrHKfGcWscr8bxa5ygxglrnKjGiWucpMZJa5ysxslrnKLGKWucqsapa5ymxmlrnK7G6WucocYZa5ypxplrnKXGWWucrcbZa5yjxjlrnKvGuWucp8Z5a5yvxvlrXKDGBWtcqMaFa1ykxkVrXKzGxWtcosYla1yqRmpcusZlaly2xuVqXL7GFWpcscaValy5xlVqXLXG1WpcvcY1alyzxrVqXLvGdWpct8b1aly/xg1q3LDGjWrcuMZNaty0xs1q3LzGLWrcssataty6xm1q3LbG7WrcvsYdatyxxp1q3LnGXWrctcbdaty9xj1q3LPGvWrcu8Z9aty3xv1q3L/GA2o8sMaDajy4xkNqPLTGw2o8vMYjajyyxqNqPLrGY2o8tsbjajy+xhNqPLHGk2o8uUZrPKXGU2s8rcbTazyjxjNrPKvGs2s8p8ZzazyvxvNrvKDGC2u8qMaLa7ykxktrvKzGy2u8osYra7yqxqtrHNQY1BjWGNUY15jUmNaY1ZjXWNRY1ljVWNfY1NjW2NXY13hNjdfWeF2N19d4Q4031nhTjTfXeEuNt9Z4W42313hHjXfWeFeNd9d4T4331nhfjffX+ECND9b4UI0P1/hIjY/W+FiNj9f4RI1P1vhUjU/X+EyNz9b4XI3Dahxe44gaR9b4fI0v1PhijS/V+HKNr9Q4qsZXa3ytxtdrfKPGN2t8q8a3a3ynxndrfK/G92v8oMYPa/yoxo9r/KTGT2v8rMbPa/yixi9r/KrGr2v8psZva/yuxtE1fl/jDzX+WONPNf5c4y81/lrjbzX+XuMfNf5Z4181/l3jPzX+W+N/NY6pcWyNQxoc2uA4DY7b4HgNjt/gBA1O2OBEDU7c4CQNTtrgZA1O3uAUDU7Z4FQNTt3gNA1O2+B0DU7f4AwNztjgTA3O3OAsDc7a4GwNzt7gHA3O2eBcDc7d4DwNztvgfA3O3+ACDS7Y4EINLtzgIg0u2uBiDS7e4BINLtngUg3S4NINLtPgsg0u1+DyDa7Q4IoNrtTgyg2u0uCqDa7W4OoNrtHgmg2u1eDaDa7T4LoNrtfg+g1u0OCGDW7U4MYNbtLgpg1u1uDmDW7R4JYNbtXg1g1u0+C2DW7X4PYN7tDgjg3u1ODODe7S4K4N7tbg7g3u0eCeDe7V4N4N7tPgvg3u1+D+DR7Q4IENHtTgwQ0e0uChDR7W4OENHtHgkQ0e1eDRDR7T4LENHtfg8Q2e0OCJDZ7U4MkN2uApDZ7a4GkNnt7gGQ2e2eBZDZ7d4DkNntvgeQ2e3+AFDV7Y4EUNXtzgJQ1e2uBlDV7e4BUNXtngVQ1e3eCgwaDBsMGowbjBpMG0wazBvMGiwbLBqsG6wabBtsGuwb7Baxq8tsHrGry+wRsavLHBmxq8ucFbGry1wdsavL3BOxq8s8G7Gry7wXsavLfB+xq8v8EHGnywwYcafLjBRxp8tMHHGny8wScafLLBpxp8usFnGny2wecaHNbg8AZHNDiywecbfKHBFxt8qcGXG3ylwVENvtrgaw2+3uAbDb7Z4FsNvt3gOw2+2+B7Db7f4AcNftjgRw1+3OAnDX7a4GcNft7gFw1+2eBXDX7d4DcNftvgdw2ObvD7Bn9o8McGf2rw5wZ/afDXBn9r8PcG/2jwzwb/avDvBv9p8N8G/2twTINjGxzS4tAWx2lx3BbHa
3H8FidoccIWJ2px4hYnaXHSFidrcfIWp2hxyhananHqyf73X2hxuhanb3GGFmdscaYWZ25xlhZnbXG2FmdvcY4W52xxrhbnbnGeFudtcb4W529xgRYXbHGhFhducZEWF21xsRYXb3GJFpdscakWaXHpFpdpcdkWl2tx+RZXaHHFFldqceUWV2lx1RZXa3H1Ftdocc0W12px7RbXaXHdFtdrcf0WN2hxwxY3anHjFjdpcdMWN2tx8xa3aHHLFrdqmeL/bO22LW7X4vYt7tDiji3u1OLOLe7S4q4t7tbi7i3u0eKeLe7V4t4t7tPivi3u1+L+LR7Q4oEtHtTiwS0e0uKhLR7W4uEtHtHikS0e1eLRLR7T4rEtHtfi8S2e0OKJLZ7U4skt2uIpLZ7a4mktnt7iGS2e2eJZLZ7d4jktntvieS2e3+IFLV7Y4kUtXtziJS1e2uJlLV7e4hUtXtniVS1e3eKgxaDFsMWoxbjFpMW0xazFvMWixbLFqsW6xabFtsWuxb7Fa1q8tsXrWry+xRtavLHFm1q8ucVbWry1xdtavL3FO1q8s8W7Wry7xXtavLfF+1q8v8UHWnywxYdafLjFR1p8tMXHWny8xSdafLLFp1p8usVnWny2xedaHNbi8BZHtDiyxedbfKHFF1t8qcWXW3ylxVEtvtriay2+3uIbLb7Z4lstvt3iOy2+2+J7Lb7f4gctftjiRy1+3OInLX7a4mctft7iFy1+2eJXLX7d4jctftvidy2ObvH7Fn9o8ccWf2rx5xZ/afHXFn9r8fcW/2jxzxb/avHvFv9p8d8W/2txTItjWxzS4dAOx+lw3A7H63D8DifocMIOJ+pw4g4n6XDSDifrcPIOp+hwyg6n6nDqDqfpcNoOp+tw+g5n6HDGDmfqcOYOZ+lw1g5n63D2DufocM5uiHN1OHeH83Q4b4fzdTh/hwt0uGCHC3W4cIeLdLhoh4t1uHiHS3S4ZIdLdUiHS3e4TIfLdrhch8t3uEKHK3a4Uocrd7hKh6t2uFqHq3e4RodrdrhWh2t3uE6H63a4Xofrd7hBhxt2uFGHG3e4SYebdrhZh5t3uEWHW3a4VYdbd7hNh9t2uF2H23e4Q4c7drhThzt3uEuHu3a4W4e7d7hHh3t2uFeHe3e4T4f7drhfh/t3eECHB3Z4UIcHd3hIh4d2eFiHh3d4RIdHdnhUh0d3eEyHx3Z4XIfHd3hChyd2eFKHJ3doh6d0eGqHp3V4eodndHhmh2d1eHaH53R4bofndXh+hxd0eGGHF3V4cYeXdHhph5d1eHmHV3R4ZYdXdXh1h4MOgw7DDqMO4w6TDtMOsw7zDosOyw6rDusOmw7bDrsO+w6v6fDaDq/r8PoOb+jwxg5v6vDmDm/p8NYOb+vw9g7v6PDODu/q8O4O7+nw3g7v6/D+Dh/o8MEOH+rw4Q4f6fDRDh/r8PEOn+jwyQ6f6vDpDp/p8NkOn+twWIfDOxzR4cgOn+/whQ5f7PClDl/u8JUOR3X4aoevdfh6h290+GaHb3X4dofvdPhuh+91+H6HH3T4YYcfdfhxh590+GmHn3X4eYdfdPhlh191+HWH33T4bYffdTi6w+87/KHDHzv8qcOfO/ylw187/K3D3zv8o8M/O/yrw787/KfDfzv8r8MxHY7tcEiPQ3scp8dxexyvx/F7nKDHCXucqMeJe5ykx0l7nKzHyXucoscpe5yqx6l7nKbHaXucrsfpe5yhxxl7nKnHmXucpcdZe5ytx9l7nKPHOXucq8e5e5ynx3l7nK/H+XtcoMcFe1yox4V7XKTHRXtcrMfFe1yixyV7XKpHely6x2V6XLbH5XpcvscVelyxx5V6XLnHVXpctcfVely9xzV6XLPHtXpcu8d1ely3x/V6XL/HDXrcsMeNety4x0163LTHzXrcvMctetyyx6163LrHbXrctsftety+xx163LHHnXrcucddety1x9163L3HPXrcs8e9ety7x3163LfH/Xrcv8cDejywx4N6PLjHQ3o8tMfDejy8xyN6PLLHo3o8usdjejy2x+N6PL7HE3o8sceTejy5R3s8pcdTezytx9N7PKPHM3s8q8ezezynx3N7PK/H83u8oMcLe7yox4t7vKTHS3u8rMfLe7yixyt7vKrHq3sc9Bj0GPYY9Rj3mPSY9pj1mPdY9Fj2WPVY99j02PbY9dj3eE2P1/Z4XY/X93hDjzf2eFOPN/d4S4+39nhbj7f3eEePd/Z4V49393hPj/f2eF+P9/f4QI8P9vhQjw/3+EiPj/b4WI+P9/hEj0/2+FSPT/f4TI/P9vhcj8N6HN7jiB5H9v+PZnNswvRguOTEE9u20bFt27btjm3buC8bsa1hbNsTe7ae2np/Q/f5cqoahw1w+ABHDHDkAEcNcPQAXx7gKwN8dYCvDfD1Ab4xwDcH+NYA3x7gOwN8d4DvDfD9AX4wwA8H+NEAPx7gJwP8dICfDfDzAX4xwC8H+NUAvx7gNwP8doDfDfD7Af4wwB8HOGaAPw3w5wH+MsBfB/jbAH8f4B8D/HOAfw3w7wH+M8B/B/jfAMcOcEiA4wQ4boDjBTh+gBMEOGGAEwU4NMCJA5wkwEkDnCzAyQOcIsApA5wqwKkDnCbAaQOcLsDpA5whwBkDnCnAmQOcJcBZA5wtwNkDnCPAOQOcK8C5A5wnwHkDnC/A+QNcIMAFA1wowIUDXCTARQNcLMDFA1wiwCUDXCpAAlw6wGUCXDbA5QJcPsAVAlwxwJUCXDnAVQJcNcDVAlw9wDUCXDPAtQJcO8B1Alw3wPUCXD/ADQLcMMCNAtw4wE0C3DTAzQLcPMAtAtwywK0C3DrAbQLcNsDtAtw+wB0C3DHAnQLcOcBdAtw1wN0C3D3APQLcM8C9Atw7wH0C3DfA/QLcP8ADAjwwwIMCPDjAQwI8NMDDAjw8wCMCPDLAowI8OsBjAjw2wOMCPD7AEwI8McCTAjw5QAM8JcBTAzwtwNMDPCPAMwM8K8CzAzwnwHMDPC/A8wO8IMALA7wowIsDvCTASwO8LMDLA7wiwCsDvCrAqwO8JsBrA7wuwOsDvCHAGwO8KcCbA7wlwFsDvC3A2wO8I8A7A7wrwLsDHAQYBBgGGAUYB5gEmAaYBZgHWARYBlgFWAfYBNgG2AXYB3hPgPcGeF+A9wf4QIAPBvhQgA8H+EiAjwb4WICPB/hEgE8G+FSATwf4TIDPBvhcgM8H+EKALwb4UoDDAhwe4IgARwY4KsDRAb4c4CsBvhrgawG+HuAbAb4Z4FsBvh3gOwG+G+B7Ab4f4AcBfhjgRwF+HOAnAX4a4GcBfh7gFwF+GeBXAX4d4DcBfhvgdwF+H+APAf4Y4JgAfwrw5wB/CfDXAH8L8PcA/wjwzwD/CvDvAP8J8N8A/wtwbIBDQhwnxHFDHC/E8UOcIMQJQ5woxKEhThziJCFOGuJkIU4e4hQhThniVCFOHeI0IU4b4nQhTh/iDCHOGOJMIc4c4iwhzhribCHOHuIcIc4Z4lwhzh3iPCHOG+J8Ic4f4gIhLhjiQiEuHOIiIS4a4mIhLh7i
EiEuGeJSIRLi0iEuE+KyIS4X4vIhrhDiiiGuFOLKIa4S4qohrhbi6iGuEeKaIa4V4tohrhPiuiGuF+L6IW4Q4oYhbhTixiFuEuKmIW4W4uYhbhHiliFuFeLWIW4T4rYhbhfi9iHuEOKOIe4U4s4h7hLiriHuFuLuIe4R4p4h7hXi3iHuE+K+Ie4X4v4hHhDigSEeFOLBIR4S4qEhHhbi4SEeEeKRIR4V4tEhHhPisSEeF+LxIZ4Q4okhnhTiySEa4ikhnhriaSGeHuIZIZ4Z4lkhnh3iOSGeG+J5IZ4f4gUhXhjiRSFeHOIlIV4a4mUhXh7iFSFeGeJVIV4d4jUhXhvidSFeH+INId4Y4k0h3hziLSHeGuJtId4e4h0h3hniXSHeHeIgxCDEMMQoxDjEJMQ0xCzEPMQixDLEKsQ6xCbENsQuxD7Ee0K8N8T7Qrw/xAdCfDDEh0J8OMRHQnw0xMdCfDzEJ0J8MsSnQnw6xGdCfDbE50J8PsQXQnwxxJdCHBbi8BBHhDgyxFET4ej/bWjIkCGvhPhqiK+F+HqIb4T4Zohvhfh2iO+E+G6I74X4fogfhPhhiB+F+HGIn4T4aYifhfh5iF+E+GWIX4X4dYjfhPhtiN+F+H2IP4T4Y4hjQvwpxJ9D/CXEX0P8LcTfQ/wjxD9D/CvEv0P8J8R/Q/wvxLEhDolwnAjHjXC8CMePcIIIJ4xwogiHRjhxhJNEOGmEk0U4eYRTRDhlhFNFOHWE00Q4bYTTRTh9hDNEOGOEM0U4c4SzRDhrhLNFOHuEc0Q4Z4RzRTh3hPNEOG+E80U4f4QLRLhghAtFuHCEi0S4aISLRbh4hEtEuGSES0VIhEtHuEyEy0a4XITLR7hChCtGuFKEK0e4SoSrRrhahKtHuEaEa0a4VoRrR7hOhOtGuF6E60e4QYQbRrhRhBtHuEmEm0a4WYSbR7hFhFtGuFWEW0e4TYTbRrhdhNtHuEOEO0a4U4Q7R7hLhLtGuFuEu0e4R4R7RrhXhHtHuE+E+0a4X4T7R3hAhAdGeFCEB0d4SISHRnhYhIdHeESER0Z4VIRHR3hMhMdGeFyEx0d4QoQnRnhShCdHaISnRHhqhKdFeHqEZ0R4ZoRnRXh2hOdEeG6E50V4foQXRHhhhBdFeHGEl0R4aYSXRXh5hFdEeGWEV0V4dYTXRHhthNdFeH2EN0R4Y4Q3RXhzhLdEeGuEt0V4e4R3RHhnhHdFeHeEgwiDCMMIowjjCJMI0wizCPMIiwjLCKsI6wibCNsIuwj7CO+J8N4I74vw/ggfiPDBCB+K8OEIH4nw0Qgfi/DxCJ+I8MkIn4rw6QififDZCJ+L8PkIX4jwxQhfinBYhMMjHBHhyAhHRTg6wpcjfCXCVyN8LcLXI3wjwjcjfCvCtyN8J8J3I3wvwvcj/CDCDyP8KMKPI/wkwk8j/CzCzyP8IsIvI/wqwq8j/CbCbyP8LsLvI/whwh8jHBPhTxH+HOEvEf4a4W8R/h7hHxH+GeFfEf4d4T8R/hvhfxGOjXBIjOPEOG6M48U4fowTxDhhjBPFODTGiWOcJMZJY5wsxsljnCLGKWOcKsapY5wmxmljnC7G6WOcIcYZY5wpxpljnCXGWWOcLcbZY5wjxjljnCvGuWOcJ8Z5Y5wvxvljXCDGBWNcKMaFY1wkxkVjXCzGxWNcIsYlY1wqRmJcOsZlYlw2xuViXD7GFWJcMcaVYlw5xlViXDXG1WJcPcY1YlwzxrViXDvGdWJcN8b1Ylw/xg1i3DDGjWLcOMZNYtw0xs1i3DzGLWLcMsatYtw6xm1i3DbG7WLcPsYdYtwxxp1i3DnGXWLcNcbdYtw9xj1i3DPGvWLcO8Z9Ytw3xv1i3D/GA2I8MMaDYjz4tLFD/sfq0BgPi/HwGI+I8cgYj4rx6BiPifHYGI+L8fgYT4jxxBhPivHkGI3xlBhPjfG0GE+P8YwYz4zxrBjPjvGcGM+N8bwYz4/xghgvjPGiGC+O8ZIYL43xshgvj3HIkCFDrozxqhivjvGaGK+N8boYr4/xhhhvjPGmGG+O8ZYYb43xthhvj/GOGO+M8a4Y745xEGMQYxhjFGMcYxJjGmMWYx5jEWMZYxVjHWMTYxtjF2Mf4z0x3hvjfTHeH+MDMT4Y40MxPhzjIzE+GuNjMT4e4xMxPhnjUzE+HeMzMT4b43MxPh/jCzG+GONLMQ6LcXiMI2IcGeOoGEfH+HKMr8T4aoyvxfh6jG/E+GaMb8X4dozvxPhujO/F+H6MH8T4YYwfxfhxjJ/E+GmMn8X4eYxfxPhljF/F+HWM38T4bYzfxfh9jD/E+GOMY2L8KcafY/wlxl9j/C3G32P8I8Y/Y/wrxr9j/CfGf2P8L8ax/+OX4DgJjpvgeAmOn+AECU6Y4EQJDk1w4gQnSXDSBCdLcPIEp0hwygSnSnDqBKdJcNoEp0tw+gRnSHDGBGdKcOYEZ0lw1gRnS3D2BOdIcM4E50pw7gTnSXDeBOdLcP4EF0hwwQQXSnDhBBdJcNEEF0tw8QSXSHDJBJdKkASXTnCZBJdNcLkEl09whQRXTHClBFdOcJUEV01wtQRXT3CNBNdMcK0E105wnQTXTXC9BNdPcIMEN0xwowQ3TnCTBDdNcLMEN09wiwS3THCrBLdOcJsEt01wuwS3T3CHBHdMcKcEd05wlwR3TXC3BHdPcI8E90xwrwT3TnCfBPdNcL8E90/wgAQPTPCgBA9O8JAED03wsAQPT/CIBI9M8KgEj07wmASPTfC4BI9P8IQET0zwpARPTtAET0nw1ARPS/D0BM9I8MwEz0rw7ATPSfDcBM9L8PwEL0jwwgQvSvDiBC9J8NIEL0vw8gSvSPDKBK9K8OoEr0nw2gSvS/D6BG9I8MYEb0rw5gRvSfDWBG9L8PYE70jwzgTvSvDuBAcJBgmGCUYJxgkmCaYJZgnmCRYJlglWCdYJNgm2CXYJ9gnek+C9Cd6X4P0JPpDggwk+lODDCT6S4KMJPpbg4wk+keCTCT6V4NMJPpPgswk+l+DzCb6Q4IsJvpTgsASHJzgiwZEJjkpwdIIvJ/hKgq8m+FqCryf4RoJvJvhWgm8n+E6C7yb4XoLvJ/hBgh8m+FGCHyf4SYKfJvhZgp8n+EWCXyb4VYJfJ/hNgt8m+F2C3yf4Q4I/JjgmwZ8S/DnBXxL8NcHfEvw9wT8S/DPBvxL8O8F/Evw3wf8SHJvgkBTHSXHcFMdLcfwUJ0hxwhQnSnFoihOnOEmKk6Y4WYqTpzhFilOmOFU6ZsjUKU6T4rQpTpfi9CnOkOKMKc6U4swpzpLirCnOluLsKc6R4pwpzpXi3CnOk+K8Kc6X4vwpLpDigikulOLCKS6S4qIpLpbi4ikukeKSKS6VIikuneIyKS6b4nIpLp/iCimumOJKKa6c4ioprpriaimunuIaKa6Z4loprp3iOimum+J6Ka6f4gYpbpjiRilunOImKW6a4mYpbp7iFilumeJWKW6d4jYpbpvidilun+IOKe6Y4k4p7pziLinumuJuKe6e4h4p7pniXinuneI+Ke6b4n4p7p/iASk
emOJBKR6c4iEpHpriYSkenuIRKR6Z4lEpHp3iMSkem+JxKR6f4gkpnpjiSSmenKIpnpLiqSmeluLpKZ6R4pkpnpXi2Smek+K5KZ6X4vkpXpDihSlelOLFKV6S4qUpXpbi5SlekeKVKV6V4tUpXpPitSlel+L1Kd6Q4o0p3pTizSnekuKtKd6W4u0p3pHinSneleLdKQ5SDFIMU4xSjFNMUkxTzFLMUyxSLFOsUqxTbFJsU+xS7FO8J8V7U7wvxftTfCDFB1N8KMWHU3wkxUdTfCzFx1N8IsUnU3wqxadTfCbFZ1N8LsXnU3whxRdTfCnFYSkOT3FEiiNTHJXi6BRfTvGVFF9N8bUUX0/xjRTfTPGtFN9O8Z0U303xvRTfT/GDFD9M8aMUP07xkxQ/TfGzFD9P8YsUv0zxqxS/TvGbFL9N8bsUv0/xhxR/THFMij+l+HOKv6T4a4q/pfh7in+k+GeKf6X4d4r/pPhviv+lODbFIRmOk+G4GY6X4fgZTpDhhBlOlOHQDCfOcJIMJ81wsgwnz3CKDKfMcKoMp85wmgynzXC6DKfPcIYMZ8xwpgxnznCWDGfNcLYMZ89wjgznzHCuDOfOcJ4M581wvgznz3CBDBfMcKEMF85wkQwXzXCxDBfPcIkMl8xwqQzJcOkMl8lw2QyXy3D5DFfIcMUMV8pw5QxXyXDVDFfLcPUM18hwzQzXynDtDNfJcN0M18tw/Qw3yHDDDDfKcOMMN8lw0ww3y3DzDLfIcMsMt8pw6wy3yXDbDLfLcPsMd8hwxwx3ynDnDHfJcNcMd8tw9wz3yHDPDPfKcO8M98lw3wz3y3D/DA/I8MAMD8rw4AwPyfDQDA/L8PAMj8jwyAyPyvDoDI/J8NgMj8vw+AxPyPDEDE/K8OQMzfCUDE/N8LQMT8/wjAzPzPCsDM/O8JwMz83wvAzPz/CCDC/M8KIML87wkgwvzfCyDC/P8IoMr8zwqgyvzvCaDK/N8LoMr8/whgxvzPCmDG/O8JYMb83wtgxvz/CODO/M8K4M785wkGGQYZhhlGGcYZJhmmGWYZ5hkWGZYZVhnWGTYZthl2Gf4T0Z3pvhfRnen+EDGT6Y4UMZPpzhIxk+muFjGT6e4RMZPpnhUxk+neEzGT6b4XMZPp/hCxm+mOFLGQ7LcHiGIzIcmeGoDEdn+HKGr2T4aoavZfh6hm9k+GaGb2X4dobvZPhuhu9l+H6GH2T4YYYfZfhxhp9k+GmGn2X4eYZfZPhlhl9l+HWG32T4bYbfZfh9hj9k+GOGYzL8KcOfM/wlw18z/C3D3zP8I8M/M/wrw78z/CfDfzP8L8OxGQ7JcZwcx81xvBzHz3GCHCfMcaIch+Y4cY6T5DhpjpPlOHmOU+Q4ZY5T5Th1jtPkOG2O0+U4fY4z5DhjjjPlOHOOs+Q4a46z5Th7jnPkOGeOc+U4d47z5DhvjvPlOH+OC+S4YI4L5bhwjovkuGiOi+W4eI5L5LhkjkvlSI5L57hMjsvmuFyOy+e4Qo4r5rhSjivnuEqOq+a4Wo6r57hGjmvmuFaOa+e4To7r5rhejuvnuEGOG+a4UY4b57hJjpvmuFmOm+e4RY5b5rhVjlvnuE2O2+a4XY7b57hDjjvmuFOOO+e4S4675rhbjrvnuEeOe+a4V45757hPjvvmuF+O++d4QI4H5nhQjgfneEiOh+Z4WI6H53hEjkfmeFSOR+d4TI7H5nhcjsfneEKOJ+Z4Uo4n52iOp+R4ao6n5Xh6jmfkeGaOZ+V4do7n5HhujufleH6OF+R4YY4X5XhxjpfkeGmOl+V4eY5X5HhljlfleHWO1+R4bY7X5Xh9jjfkeGOON+V4c4635HhrjrfleHuOd+R4Z4535Xh3joMcgxzDHKMc4xyTHNMcsxzzHIscyxyrHOt8yDhNjm2OXY59jvfkeG+O9+V4f44P5Phgjg/l+HCOj+T4aI6P5fh4jk/k+GSOT+X4dI7P5Phsjs/l+HyOL+T4Yo4v5Tgsx+E5jshxZI6jchyd48s5vpLjqzm+luPrOb6R45s5vpXj2zm+k+O7Ob6X4/s5fpDjhzl+lOPHOX6S46c5fpbj5zl+keOXOX6V49c5fpPjtzl+l+P3Of6Q4485jsnxpxx/zvGXHH/N8bccf8/xjxz/zPGvHP/O8Z8c/83xvxzH5jikwHEKHLfA8Qocv8AJCpywwIkKHFrgxAVOUuCkBU5W4OQFTlHglAVOVeDUBU5T4LQFTlfg9AXOUOCMBc5U4MwFzlLgrAXOVuDsBc5R4JwFzlXg3AXOU+C8Bc5X4PwFLlDgggUuVODCBS5S4KIFLlbg4gUuUeCSBS5VIAUuXeAyBS5b4HIFLl/gCgWuWOBKBa5c4CoFrlrgagWuXuAaBa5Z4FoFrl3gOgWuW+B6Ba5f4AYFbljgRgVuXOAmBW5a4GYFbl7gFgVuWeBWBW5d4DYFblvgdgVuX+AOBe5Y4E4F7lzgLgXuWuBuBe5e4B4F7lngXgXuXeA+Be5b4H4F7l/gAQUeWOBBBR5c4CEFHlrgYQUeXuARBR5Z4FEFHl3gMQUeW+BxBR5f4AkFnljgSQWeXKAFnlLgqQWeVuDpBZ5R4JkFnlXg2QWeU+C5BZ5X4PkFXlDghQVeVODFBV5S4KUFXlbg5QVeUeCVBV5V4NUFXlPgtQVeV+D1Bd5Q4I0F3lTgzQXeUuCtBd5W4O0F3lHgnQXeVeDdBQ4KDAoMC4wKjAtMCkwLzArMCywKLAusCqwLbApsC+wK7Au8p8B7C7yvwPsLfKDABwt8qMCHC3ykwEcLfKzAxwt8osAnC3yqwKcLfKbAZwt8rsDnC3yhwBcLfKnAYQUOL3BEgSMLHFXg6AJfLvCVAl8t8LUCXy/wjQLfLPCtAt8u8J0C3y3wvQLfL/CDAj8s8KMCPy7wkwI/LfCzAj8v8IsCvyzwqwK/LvCbAr8t8LsCvy/whwJ/LHBMgT8V+HOBvxT4a4G/Ffh7gX8U+GeBfxX4d4H/FPhvgf8VOLbAISWOU+K4JY5X4vglTlDihCVOVOLQEicucZISJy1xshInL3GKEqcscaoSpy5xmhKnLXG6EqcvcYYSZyxxphJnLnGWEmctcbYSZy9xjhLnLHGuEucucZ4S5y1xvhLnL3GBEhcscaESFy5xkRIXLXGxEhcvcYkSlyxxqRIpcekSlylx2RKXK3H5ElcoccUSVypx5RJXKXHVElcrcfUS1yhxzRLXKnHtEtcpcd0S1ytx/RI3KHHDEjcqceMSNylx0xI3K3HzErcoccsStypx6xK3KXHbErcrcfsSdyhxxxJ3KnHnEncpcdcSdytx9xL3KHHPEvcqce8S9ylx3xL3K3H/Eg8o8cASDyrx4BIPKfHQEg8r8fASjyjxyBKPKvHoEo8p8dgSjyvx+BJPKPHEEk8q8eQSLfGUEk8t8bQSTy/xjBLPLPGsEs8u8ZwSzy3xvBLPL/GCEi8s8aISLy7xkhIvLfGyEi8v8YoSryzxqhKvLvGaEq8t8boSry/xhhJvLPGmEm8u8ZYSby3xthJvL/GOEu8s8a4S7y5xUG
JQYlhiVGJcYlJiWmJWYl5iUWJZYlViXWJTYltiV2Jf4j0l3lvifSXeX+IDJT5Y4kMlPlziIyU+WuJjJT5e4hMlPlniUyU+XeIzJT5b4nMlPl/iCyW+WOJLJQ4rcXiJI0ocWeKoEkeX+HKJr5T4aomvlfh6iW+U+GaJb5X4donvlPhuie+V+H6JH5T4YYkflfhxiZ+U+GmJn5X4eYlflPhliV+V+HWJ35T4bYnflfh9iT+U+GOJY0r8qcSfS/ylxF9L/K3E30v8o8Q/S/yrxL9L/KfEf0v8r8SxJQ6pcJwKx61wvArHr3CCCiescKIKh1Y4cYWTVDhphZNVOHmFU1Q4ZYVTVTh1hdNUOG2F01U4fYUzVDhjhTNVOHOFs1Q4a4WzVTh7hXNUOGeFc1U4d4XzVDhvhfNVOH+FC1S4YIULVbhwhYtUuGiFi1W4eIVLVLhkhUtVSIVLV7hMhctWuFyFy1e4QoUrVrhShStXuEqFq1a4WoWrV7hGhWtWuFaFa1e4ToXrVrhehetXuEGFG1a4UYUbV7hJhZtWuFmFm1e4RYVbVrhVhVtXuE2F21a4XYXbV7hDhTtWuFOFO1e4S4W7VrhbhbtXuEeFe1a4V4V7V7hPhftWuF+F+1d4QIUHVnhQhQdXeEiFh1Z4WIWHV3hEhUdWeFSFR1d4TIXHVnhchcdXeEKFJ1Z4UoUnV2iFp1R4aoWnVXh6hWdUeGaFZ1V4doXnVHhuhedVeH6FF1R4YYUXVXhxhZdUeGmFl1V4eYVXVHhlhVdVeHWF11R4bYXXVXh9hTdUeGOFN1V4c4W3VHhrhbdVeHuFd1R4Z4V3VXh3hYMKgwrDCqMK4wqTCtMKswrzCosKywqrCusKmwrbCrsK+wrvqfDeiqH/12Q9UOGDFT5U4cMVPlLhoxU+VuHjFT5R4ZMVPlXh0xU+U+GzFT5X4fMVvlDhixW+VOGwCodXOKLCkRWOqnB0hS9X+EqFr1b4WoWvV/hGhW9W+FaFb1f4ToXvVvhehe9X+EGFH1b4UYUfV/hJhZ9W+FmFn1f4RYVfVvhVhV9X+E2F31b4XYXfV/hDhT9WOKbCnyr8ucJfKvy1wt8q/L3CPyr8s8K/Kvy7wn8q/LfC/yocW+GQGsepcdwax6tx/BonqHHCGieqcWiNE9c4SY2T1jhZjZPXOEWNU9Y4VY1T1zhNjdPWOF2N09c4Q40z1jhTjTPXOEuNs9Y4W42z1zhHjXPWOFeNc9c4T43z1jhfjfPXuECNC9a4UI0L17hIjYvWuFiNi9e4RI1L1rhUjdS4dI3L1LhsjcvVuHyNK9S4Yo0r1bhyjavUuGqNq9W4eo1r1LhmjWvVuHaN69S4bo3r1bh+jRvUuGGNG9W4cY2b1LhpjZvVuHmNW9S4ZY1b1bh1jdvUuG2N29W4fY071LhjjTvVuHONu9S4a4271bh7jXvUuGeNe9W4d4371LhvjfvVuH+NB9R4YI0H1XhwjYfUeGiNh9V4eI1H1HhkjUfVeHSNx9R4bI3H1Xh8jSfUeGKNJ9V4co3WeEqNp9Z4Wo2n13hGjWfWeFaNZ9d4To3n1nhejefXeEGNF9Z4UY0X13hJjZfWeFmNl9d4RY1X1nhVjVfXeE2N19Z4XY3X13hDjTfWeFONN9d4S4231nhbjbfXeEeNd9Z4V4131zioMagxrDGqMa4xqTGtMasxr7GosayxqrGusamxrbGrsa/xnhrvrfG+Gu+v8YEaH6zxoRofrvGRGh+t8bEaH6/xiRqfrPGpGp+u8Zkan63xuRqfr/GFGl+s8aUah9U4vMYRNY6scVSNo2t8ucZXany1xtdqfL3GN2p8s8a3any7xndqfLfG92p8v8YPavywxo9q/LjGT2r8tMbPavy8xi9q/LIe6lc1fj0BflPjtzV+V+P3Nf5Q4481jqnxpxp/rvGXGn+t8bcaf6/xjxr/rPGvGv+u8Z8a/63xvxrH1jikwXEaHLfB8Rocv8EJGpywwYkaHNrgxA1O0uCkDU7W4OQNTtHglA1O1eDUDU7T4LQNTtfg9A3O0OCMDc7U4MwNztLgrA3O1uDsDc7R4JwNztXg3A3O0+C8Dc7X4PwNLtDggg0u1ODCDS7S4KINLtbg4g0u0eCSDS7VIA0u3eAyDS7b4HINLt/gCg2u2OBKDa7c4CoNrtrgag2u3uAaDa7Z4FoNrt3gOg2u2+B6Da7f4AYNbtjgRg1u3OAmDW7a4GYNbt7gFg1u2eBWDW7d4DYNbtvgdg1u3+AODe7Y4E4N7tzgLg3u2uBuDe7e4B4N7tngXg3u3eA+De7b4H4N7t/gAQ0e2OBBDR7c4CENHtrgYQ0e3uARDR7Z4FENHt3gMQ0e2+BxDR7f4AkNntjgSQ2e3KANntLgqQ2e1uDpDZ7R4JkNntXg2Q2e0+C5DZ7X4PkNXtDghQ1e1ODFDV7S4KUNXtbg5Q1e0eCVDV7V4NUNXtPgtQ1e1+D1Dd7Q4I0N3tTgzQ3e0uCtDd7W4O0N3tHgnQ3e1eDdDQ4aDBoMG4wajBtMGkwbzBrMGywaLBusGqwbbBpsG+wa7Bu8p8F7G7yvwfsbfKDBBxt8qMGHG3ykwUcbfKzBxxt8osEnG3yqwacbfKbBZxt8rsHnG3yhwRcbfKnBYQ0Ob3BEgyMbHNXg6AZfbvCVBl9t8LUGX2/wjQbfbPCtBt9u8J0G323wvQbfb/CDBj9s8KMGP27wkwY/bfCzBj9v8IsGv2zwqwa/bvCbBr9t8LsGv2/whwZ/bHBMgz81+HODvzT4a4O/Nfh7g380+GeDfzX4d4P/NPhvg/81OLbBIS2O0+K4LY7X4vgtTtDihC1O1OLQFiducZIWJ21xshYnb3GKFqdscaoWp25xmhanbXG6FqdvcYYWZ2xxphZnbnGWFmdtcbYWZ29xjhbnbHGuFuducZ4W521xvhbnb3GBFhdscaEWF25xkRYXbXGxFhdvcYkWl2xxqRZpcekWl2lx2RaXa3H5FldoccUWV2px5RZXaXHVFldrcfUW12hxzRbXanHtFtdpcd0W12tx/RY3aHHDFjdqceMWN2lx0xY3a3HzFrdoccsWt2px6xa3aXHbFrdrcfsWd2hxxxZ3anHnFndpcdcWd2tx9xb3aHHPFvdqce8W92lx3xb3a3H/Fg9o8cAWD2rx4BYPafHQFg9r8fAWj2jxyBaPavHoFo9p8dgWj2vx+BZPaPHEFk9q8eQWbfGUFk9t8bQWT2/xjBbPbPGsFs9u8ZwWz23xvBbPb/GCFi9s8aIWL27xkhYvbfGyFi9v8YoWr2zxqhavbvGaFq9t8boWr2/xhhZvbPGmFm9u8ZYWb23xthZvb/GOFu9s8a4W725x0GLQYthi1GLcYtJi2mLWYt5i0WLZYtVi3WLTYtti12Lf4j0t3tvifS3e3+IDLT7Y4kMtPtziIy0+2uJjLT7e4hMtPtniUy0+3eIzLT7b4nMtPt/iCy2+2OJLLQ5rcXiLI1oc2eKoFke3+HKLr7T4aouvtfh6i2+0+GaLb7X4dovvtPhui++1+H6LH7T4YYsftfhxi5+0+GmLn
7X4eYtftPhli1+1+HWL37T4bYvftfh9iz+0+GOLY1r8qcWfW/ylxV9b/K3F31v8o8U/W/yrxb9b/KfFf1v8r8WxLQ7pcJwOx+1wvA7H73CCbsIhE3Y4UYdDO5y4w0k6nLTDyTqcvMMpOpyyw6k6nLrDaTqctsPpOpy+wxk6nLHDmTqcucNZOpy1w9k6nL3DOTqcs8O5Opy7w3k6nLfD+Tqcv8MFOlyww4U6XLjDRTpctMPFOly8wyU6XLLDpTqkw6U7XKbDZTtcrsPlO1yhwxU7XKnDlTtcpcNVO1ytw9U7XKPDNTtcq8O1O1ynw3U7XK/D9TvcoMMNO9yow4073KTDTTvcrMPNO9yiwy073KrDrTvcpsNtO9yuw+073KHDHTvcqcOdO9ylw1073K3D3Tvco8M9O9yrw7073KfDfTvcr8P9OzygwwM7PKjDgzs8pMNDOzysw8M7PKLDIzs8qsOjOzymw2M7PK7D4zs8ocMTOzypw5M7tMNTOjy1w9M6PL3DMzo8s8OzOjy7w3M6PLfD8zo8v8MLOryww4s6vLjDSzq8tMPLOry8wys6vLLDqzq8usNrOry2w+s6vL7DGzq8scObOry5w1s6vLXD2zq8vcM7Oryzw7s6vLvDQYdBh2GHUYdxh0mHaYdZh3mHRYdlh1WHdYdNh22HXYd9h/d0eG+H93V4f4cPdPhghw91+HCHj3T4aIePdfh4h090+GSHT3X4dIfPdPhsh891+HyHL3T4YocvdTisw+EdjuhwZIejOhzd4csdvvI/v9f8/6fD6x2+0eGbHb7V4dsdvtPhux2+1+H7HX7Q4YcdftThxx1+0uGnHX7W4ecdftHhlx1+1eHXHX7T4bcdftfh9x3+0OGPHY7p8KcOf+7wlw5/7fC3Dn/v8I8O/+zwrw7/7vCfDv/t8L8Ox3Y4pMdxehy3x/F6HL/HCXqcsMeJehza48Q9TtLjpD1O1uPkPU7R45Q9TtXj1D1O0+O0PU7X4/Q9ztDjjD3O1OPMPc7S46w9ztbj7D3O0eOcPc7V49w9ztPjvD3O1+P8PS7Q44I9LtTjwj0u0uOiPS7W4+I9LtHjkj0u1SM9Lt3jMj0u2+NyPS7f4wo9rtjjSj2u3OMqPa7a42o9rt7jGj2u2eNaPZOs3eM6Pa7b43o9rt/jBj1u2ONGPW7c4yY9btrjZj1u3uMWPW7Z41Y9bt3jNj1u2+N2PW7f4w497tjjTj3u3OMuPe7a42497t7jHj3u2eNePe7d4z497tvjfj3u3+MBPR7Y40E9HtzjIT0e2uNhPR7e4xE9HtnjUT0e3eMxPR7b43E9Ht/jCT2e2ONJPZ7coz2e0uOpPZ7W4+k9ntHjmT2e1ePZPZ7T47n/E7jH83u8oMcLe7yox4t7vKTHS3u8rMfLe7yixyt7vKrHq3u8psdre7yux+t7vKHHG3u8qcebe7ylx1t7vK3H23u8o8c7e7yrx7t7HPQY9Bj2GPUY95j0mPaY9Zj3WPRY9lj1WPfY9Nj22PXY93hPj/f2eF+P9/f4QI8P9vhQjw/3+EiPj/b4WI+P9/hEj0/2+FSPT/f4TI/P9vhcj8/3+EKPL/b4Uo/Dehze44geR/Y4qsfRPb78/3h6B+exD+f7Ok3aNLVtpcapndpuatt2T203tW3r/bJR27ZtpE3SZz7z6/f5G/aeO7M7d+4O4KsD+NoAvj6AbwzgmwP41gC+PYDvDOC7A/jeAL4/gB8M4IcD+NEAfjyAnwzgpwP42QB+PoBfDOCXA/jVAH49gN8M4LcD+N0Afj+APwzgjwP40wD+PIC/DOCvA/jbAP4+gH8M4J8D+NcAjh7AvwfwnwEcM4BjB3DcAP47gIMCHC/AwQEOCXD8ACcIcGiAEwY4LMCJApw4wEkCnDTAyQKcPMApApwywKkCnDrAaQKcNsDpApw+wBkCnDHAmQKcOcBZApw1wNkCnD3AOQKcM8C5Apw7wHkCnDfA+QIcHuD8AS4Q4IIBLhTgwgEuEuCiAS4W4OIBLhEgAS4Z4FIBLh3gMgEuG+ByAS4f4AoBrhjgSgGuHOAqAa4a4IgAVwtw9QDXCHDNANcKcO0A1wlw3QDXC3D9ADcIcMMANwpw4wA3CXDTADcLcPMAtwhwywBHBrhVgFsHuE2A2wa4XYDbB7hDgDsGuFOAOwe4S4C7BrhbgLsHuEeAewa4V4B7B7hPgPsGuF+A+wd4QIAHBnhQgAcHeEiAhwZ4WICHB3hEgEcGeFSARwd4TIDHBnhcgMcHaIAnBHhigCcFeHKApwR4aoCnBXh6gGcEeGaAZwV4doDnBHhugOcFeH6AFwR4YYAXBXhxgKMCvCTASwO8LMDLA7wiwCsDvCrAqwO8JsBrA7wuwOsDvCHAGwO8KcCbA7wlwFsDvC3A2wO8I8A7A7wrwLsDvCfAewO8L8D7A3wgwAcDfCjAhwN8JMBHA3wswIEAgwDDAKMA4wCTANMAswDzAIsAywCrAOsAmwDbALsA+wAfD/CJAJ8M8KkAnw7wmQCfDfC5AJ8P8IUAXwzwpQBfDvCVAF8N8LUAXw/wjQDfDPCtAN8O8J0A3w3wvQDfD/CDAD8M8KMAPw7wkwA/DfCzAD8P8IsAvwzwqwC/DvCbAL8N8LsAvw/whwB/DPCnAH8O8JcAfw3wtwB/D/CPAP8M8K8ARwf4d4D/BDgmwLEBjgvw3wAHhTheiINDHBLi+CFOEOLQECcMcViIE4U4cYiThDhpiJOFOHmIU4Q4ZYhThTh1iNOEOG2I04U4fYgzhDhjiDOFOHOIs4Q4a4izhTh7iHOEOGeIc4U4d4jzhDhviPOFODzE+UNcIMQFQ1woxIVDXCTERUNcLMTFQ1wiREJcMsSlQlw6xGVCXDbE5UJcPsQVQlwxxJVCXDnEVUJcNcQRIa4W4uohrhHimiGuFeLaIa4T4rohrhfi+iFuEOKGIW4U4sYhbhLipiFuFuLmIW4R4pYhjgxxqxC3DnGbELcNcbsQtw9xhxB3DHGnEHcOcZcQdw1xtxB3D3GPEPcMca8Q9w5xnxD3DXG/EPcP8YAQDwzxoBAPDvGQEA8N8bAQDw/xiBCPDPGoEI8O8ZgQjw3xuBCPD9EQTwjxxBBPCvHkEE8J8dQQTwvx9BDPCPHMEM8K8ewQzwnx3BDPC/H8EC8I8cIQLwrx4hBHhXhJiJeGeFmIl4d4RYhXhnhViFeHeE2I14Z4XYjXh3hDiDeGeFOIN4d4S4i3hnhbiLeHeEeId4Z4V4h3h3hPiPeGeF+I94f4QIgPhvhQiA+H+EiIj4b4WIgDIQYhhiFGIcYhJiGmIWYh5iEWIZYhViHWITYhtiF2IfYhPh7iEyE+GeJTIT4d4jMhPhvicyE+H+ILIb4Y4kshvhziKyG+GuJrIb4e4hshvhniWyG+HeI7Ib4b4nshvh/iByF+GOJHIX4c4ichfhriZyF+HuIXIX4Z4lchfh3iNyF+G+J3IX4f4g8h/hjiTyH+HOIvIf4a4m8h/h7iHyH+GeJfIY4O8e8Q/wlxTIhjQxwX4r8hDopwvAgHRzgkwvEjnCDCoRFO
GOGwCCeKcOIIJ4lw0ggni3DyCKeIcMoIp4pw6giniXDaCKeLcPoIZ4hwxghninDmCGeJcNYIZ4tw9gjniHDOCOeKcO4I54lw3gjni3B4hPNHuECEC0a4UIQLR7hIhItGuFiEi0e4RIREuGSES0W4dITLRLhshMtFuHyEK0S4YoQrRbhyhKtEuGqEIyJcLcLVI1wjwjUjXCvCtSNcJ8J1I1wvwvUj3CDCDSPcKMKNI9wkwk0j3CzCzSPcIsItIxwZ4VYRbh3hNhFuG+F2EW4f4Q4R7hjhThHuHOEuEe4a4W4R7h7hHhHuGeFeEe4d4T4R7hvhfhHuH+EBER4Y4UERHhzhIREeGuFhER4e4RERHhnhUREeHeExER4b4XERHh+hEZ4Q4YkRnhThyRGeEuGpEZ4W4ekRnhHhmRGeFeHZEZ4T4bkRnhfh+f/p5MIIL4rw4ghHRXhJhJdGeFmEl0d4RYRXRnhVhFdHeE2E10Z4XYTXR3hDhDdGeFOEN0d4S4S3RnhbhLdHeEeEd0Z4V4R3R3hPhPdGeF+E90f4QIQPRvhQhA9H+EiEj0b4WIQDEQYRhhFGEcYRJhGmEWYR5hEWEZYRVhHWETYRthF2EfYRPh7hExE+GeFTET4d4TMRPhvhcxE+H+ELEb4Y4UsRvhzhKxG+GuFrEb4e4RsRvhnhWxG+HeE7Eb4b4XsRvh/hBxF+GOFHEX4c4ScRfhrhZxF+HuEXEX4Z4VcRfh3hNxF+G+F3EX4f4Q8R/hjhTxH+HOEvEf4a4W8R/h7hHxH+GeFfEY6O8O8I/4lwTIRjIxwX4b8RDopxvBgHxzgkxvFjnCDGoTFOGOOwGCeKceIYJ4lx0hgni3HyGKeIccoYp4px6hiniXHaGKeLcfoYZ4hxxhhninHmGGeJcdYYZ4tx9hjniHHOGOeKce4Y54lx3hjni3F4jPPHuECMC8a4UIwLx7hIjIvGuFiMi8e4RIzEuGSMS8W4dIzLxLhsjMvFuHyMK8S4YowrxbhyjKvEuGqMI2JcLcbVY1wjxjVjXCvGtWNcJ8Z1Y1wvxvVj3CDGDWPcKMaNY9wkxk1j3CzGzWPcIsYtYxwZ41Yxbh3jNjFuG+N2MW4f4w4x7hjjTjHuHOMuMe4a424x7h7jHjHuGeNeMe4d4z4x7hvjfjHuH+MBMR4Y40ExHhzjITEeGuNhMR4e4xExHhnjUTEeHeMxMR4b43ExHh+jMZ4Q44kxnhTjyTGeEuOpMZ4W4+kxnhHjmTGeFePZMZ4T47kxnhfj+TFeEOOFMV4U48UxjorxkhgvjfGyGC+P8YoYr4zxqhivjvGaGK+N8boYr4/xhhhvjPGmGG+O8ZYYb43xthhvj/GOGO+M8a4Y747xnhjvjfG+GO+P8YEYH4zxoRgfjvGRGB+N8bEYB2IMYgxjjGKMY0xiTGPMYsxjLGIsY6xirGNsYmxj7GLsY3w8xidifDLGp2J8OsZnYnw2xudifD7GF2J8McaXYnw5xldifDXG12J8PcY3YnwzxrdifDvGd2J8N8b3Ynw/xg9i/DDGj2L8OMZPYvw0xs9i/DzGL2L8MsavYvw6xm9i/DbG72L8PsYfYvwxxp9i/DnGX2L8NcbfYvw9xj9i/DPGv2IcHePfMf4T45gYx8Y4LsZ/YxyU4HgJDk5wSILjJzhBgkMTnDDBYQlOlODECU6S4KQJTpbg5AlOkeCUCU6V4NQJTpPgtAlOl+D0Cc6Q4IwJzpTgzAnOkuCsCc6W4OwT/JfOSnCuBOdOcJ4E501wvgSHJzh/ggskuGCCCyW4cIKLJLhogosluHiCSyRIgksmuFSCSye4TILLJv/39BRXSHDFBFdKcOUEV0lw1QRHJLhagqsnuEaCaya4VoJrJ7hOgusmuF6C6ye4QYIbJrhRghsnuEmCmya4WYKbJ7hFglsmODLBrRLcOsFtEtw2we0S3D7BHRLcMcGdEtw5wV0S3DXB3RLcPcE9Etwzwb0S3DvBfRLcN8H9Etw/wQMSPDDBgxI8OMFDEjw0wcMSPDzBIxI8MsGjEjw6wWMSPDbB4xI8PkETPCHBExM8KcGTEzwlwVMTPC3B0xM8I8EzEzwrwbMTPCfBcxM8L8HzE7wgwQsTvCjBixMcleAlCV6a4GUJXp7gFQlemeBVCV6d4DUJXpvgdQlen+ANCd6Y4E0J3pzgLQnemuBtCd6e4B0J3pngXQneneA9Cd6b4H0J3p/gAwk+mOBDCT6c4CMJPprgYwkOJBgkGCYYJRgnmCSYJpglmCdYJFgmWCVYJ9gk2CbYJdgn+HiCTyT4ZIJPJfh0gs8k+GyCzyX4fIIvJPhigi8l+HKCryT4aoKvJfh6gm8k+GaCbyX4doLvJPhugu8l+H6CHyT4YYIfJfhxgp8k+GmCnyX4eYJfJPhlgl8l+HWC3yT4bYLfJfh9gj8k+GOCPyX4c4K/JPhrgr8l+HuCfyT4Z4J/JTg6wb8T/CfBMQmOTXBcgv8mOCjF8VIcnOKQFMdPcYIUh6Y4YYrDUpwoxYlTnCTFSVOcLMXJU5wixSlTnCrFqVOcJsVpU5wuxelTnCHFGVOcKcWZU5wlxVlTnC3F2VOcI8U5U5wrxblTnCfFeVOcL8XhKc6f4gIpLpjiQikunOIiKS6a4mIpLp7iEimS4pIpLpXi0ikuk+KyKS6X4vIprpDiiimulOLKKa6S4qopjkhxtRRXT3GNFNdMca0U105xnRTXTXG9FNdPcYMUN0xxoxQ3TnGTFDdNcbMUN09xixS3THFkiluluHWK26S4bYrbpbh9ijukuGOKO6W4c4q7pLhriruluHuKe6S4Z4p7pbh3ivukuG+K+6W4f4oHpHhgigeleHCKh6R4aIqHpXh4ikekeGSKR6V4dIrHpHhsiseleHyKpnhCiiemeFKKJ6d4SoqnpnhaiqeneEaKZ6Z4Vopnp3hOiuemeF6K56d4QYoXpnhRihenOCrFS9JR7f989rIUL0/xihSvTPGqFK9O8ZoUr03xuhSvT/GGFG9M8aYUb07xlhRvTfG2FG9P8Y4U70zxrhTvTvGeFO9N8b4U70/xgRQfTPGhFB9O8ZEUH03xsRQHUgxSDFOMUoxTTFJMU8xSzFMsUixTrFKsU2xSbFPsUuxTfDzFJ1J8MsWnUnw6xWdSfDbF51J8PsUXUnwxxZdSfDnFV1J8NcXXUnw9xTdSfDPFt1J8O8V3Unw3xfdSfD/FD1L8MMWPUvw4xU9S/DTFz1L8PMUv/mPuqxS/TvGbFL9N8bsUv0/xhxR/TPGnFH9O8ZcUf03xtxR/T/GPFP9M8a8UR6f4d4r/pDgmxbEpjkvx3xQHZThehoMzHJL9vz6nQYMGzTk0wwkzHJbhRBlOnOEkGU6a4WQZTp7hFBlOmeFUGU6d4TQZTpvhdBlOn+EMGc6Y4UwZzpzhLBnOmuFsGc6e4RwZzpnhXBnOneE8Gc6b4XwZDs9w/gwXyHDBjBHD/8tyL5LhohkuluHiGS6RIRkumeFSGS6d4TIZLpvhchkun+EKGa6Y4UoZrpzhKhmumuGIDFfLcPUM18hwzQzXynD
tDNfJcN0M18tw/Qw3yHDDDDfKcOMMN8lw0ww3y3DzDLfIcMsMR2a4VYZbZ7hNhttmuF2G22e4Q4Y7ZrhThjtnuEuGu2a4W4a7Z7hHhntmuFeGe2e4T4b7ZrhfhvtneECGB2Z4UIYHZ3hIhodmeFiGh2d4RIZHZnhUhkdneEyGx2Z4XIbHZ2iGJ2R4YoYnZXhyhqdkeGqGp2V4eoZnZHhmhmdleHaG52R4bobnZXh+hhdkeGGGF2V4cYajMrwkw0szvCzDyzO8IsMrM7wqw6szvCbDazO8LsPrM7whwxszvCnDmzO8JcNbM7wtw9uH4h0Z3pnhXRneneE9Gd6b4X0Z3p/hAxk+mOFDGT6c4SMZPprhYxkOZBhkGGYYZRhnmGSYZphlmGdYZFhmWGVYZ9hk2GbYZdhn+HiGT2T4ZIZPZfh0hs9k+GyGz2X4fIYvZPhihi9l+HKGr2T4aoavZfh6hm9k+GaGb2X4dobvZPhuhu9l+H6GH2T4YYYfZfhxhp9k+GmGn2X4eYZfZPhlhl9l+HWG32T4bYbfZfh9hj9k+GOGP2X4c4a/ZPhrhr9l+HuGf2T4Z4Z/ZTg6w78z/CfDMRmOzXBchv/+j/ccx8txcI5Dchw/xwlyHJrjhDkOy3GiHCfOcZIcJ81xshwnz3GKHKfMcaocp85xmhynzXG6HKfPcYYcZ8xxphxnznGWHGfNcbYcZ89xjhznzHGuHOfOcZ4c581xvhyH5zh/jgvkuGCOC+W4cI6L5LhojovluHiOS+RIjkvmuFSOS+e4TI7L5rhcjsvnuEKOK+a4Uo4r57hKjqvmOCLH1XJcPcc1clwzx7VyXDvHdXJcN8f1clw/xw1y3DDHjXLcOMdNctw0x81y3DzHLXLcMseROW6V49Y5bpPjtjlul+P2Oe6Q44457pTjzjnukuOuOe6W4+457pHjnjnulePeOe6T47457pfj/jkekOOBOR6U48E5HpLjoTkeluPhOR6R45E5HpXj0Tkek+OxOR6X4/E5muMJOZ6Y40k5npzjKTmemuNpOZ6e4xk5npnjWTmeneM5OZ6b43k5np/jBTlemONFOV6c46gcL8nx0hwvy/HyHK/I8cocr8rx6hyvyfHaHK/L8focb8jxxhxvyvHmHG/J8dYcb8vx9hzvyPHOHO/K8e4c78nx3hzvy/H+HB/I8cEcH8rx4RwfyfHRHB/LcSDHIMcwxyjHOMckxzTHLMc8xyLHMscqH///3xHbHLsc+xwfz/GJHJ/M8akcn87xmRyfzfG5HJ/P8YUcX8zxpRxfzvGVHF/N8bUcX8/xjRzfzPGtHN/O8Z0c383xvRzfz/GDHD/M8aMcP87xkxw/zfGzHD/P8Yscv8zxqxy/zvGbHL/N8bscv8/xhxx/zPGnHH/O8Zccf83xtxx/z/GPHP/M8a8cR+f4d47/5Dgmx7E5jsvx3xwHFThegYMLHFLg+AVOUODQAicscFiBExU4cYGTFDhpgZMVOHmBUxQ4ZYFTFTh1gdMUOG2B0xU4fYEzFDhjgTMVOHOBsxQ4a4GzFTh7gXMUOGeBcxU4d4HzFDhvgfMVOLzA+QtcoMAFC1yowIULXKTARQtcrMDFC1yiQApcssClCly6wGUKXLbA5QpcvsAVClyxwJUKXLnAVQpctcARBa5W4OoFrlHgmgWuVeDaBa5T4LoFrlfg+gVuUOCGBT468v8pZpMCNy1wswI3L3CLArcscGSBWxW4dYHbFLhtgdsVuH2BOxS4Y4E7FbhzgbsUuGuBuxW4e4F7FLhngXsVuHeB+xS4b4H7Fbh/gQcUeGCBBxV4cIGHFHhogYcVeHiBRxR4ZIFHFXh0gccUeGyBxxV4fIEWeEKBJxZ4UoEnF3hKgacWeFqBpxd4RoFnFnhWgWcXeE6B5xZ4XoHnF3hBgRcWeFGBFxc4qsBLCry0wMsKvLzAKwq8ssCrCry6wGsKvLbA6wq8vsAbCryxwJsKvLnAWwq8tcDbCry9wDsKvLPAuwq8u8B7Cry3wPsKvL/ABwp8sMCHCny4wEf+N5cCHytwoMCgwLDAqMC4wKTAtMCswLzAosCywKrAusCmwLbArsC+wMcLfKLAJwt8qsCnC3ymwGcLfK7A5wt8ocAXC3ypwJcLfKXAVwt8rcDXC3yjwDcLfKvAtwt8p8B3C3yvwPcL/KDADwv8qMCPC/ykwE8L/KzAzwv8osAvC/yqwK8L/KbAbwv8rsDvC/yhwB8L/KnAnwv8pcBfC/ytwN8L/KPAPwv8q8DR/+P6P6cbU+DYAscV+G+Bg0ocr8TBJQ4pcfwSJyhxaIkTljisxIlKnLjESUqctMTJSpy8xClKnLLEqUqcusRpSpy2xOlKnL7EGUqcscSZSpy5xFlKnLXE2UqcvcQ5SpyzxLlKnLvEeUqct8T5Shxe4vwlLlDigiUuVOLCJS5S4qIlLlbi4iUuUSIlLlniUiUuXeIyJS5b4nIlLl/iCiWuWOJKJa5c4iolrlriiBJXK3H1Etcocc0S1ypx7RLXKXHdEtcrcf0SNyhxwxI3KnHjEjcpcdMSNytx8xK3KHHLEkeWuFWJW5e4TYnblrhdiduXuEOJO5a4U4k7l7hLibuWuFuJu5e4R4l7lrhXiXuXuE+J+5a4X4n7l3hAiQeWeFCJB5d4SImHlnhYiYeXeESJR5Z4VIlHl3hMiceWeFyJx5doiSeUeGKJJ5V4comnlHhqiaeVeHqJZ5R4ZolnlXh2ieeUeG6J55V4fokXlHhhiReVeHGJo0q8pMRLS7ysxMtLvKLEK0u8qsSrS7ymxGtLvK7E60u8ocQbS7ypxJtLvKXEW0u8rcTbS7yjxDtLvKvEu0u8p8R7S7yvxPtLfKDEB0t8qMSHS3ykxEdLfKzEgRKDEsMSoxLjEpMS0xKzEvMSixLLEqsS6xKbEtsSuxL7Eh8v8YkSnyzxqRKfLvGZEp8t8bkSny/xhRJfLPGlEl8u8ZUSXy3xtRJfL/GNEt8s8a0S3y7xnRLfLfG9Et8v8YMSPyzxoxI/LvGTEj8t8bMSPy/xixK/LPGrEr8u8ZsSvy3xuxK/L/GHEn8s8acSfy7xlxJ/LfG3En8v8Y8S/yzxrxJHl/h3if+UOKbEsSWOK/HfEgdVOF6FgyscUuH4FU5Q4dAKJ6xwWIUTVThxhZNUOGmFk1U4eYVTVDhlhVNVOHWF01Q4bYXTVTh9hTNUOGOFM1U4c4WzVDhrhbNVOHuFc1Q4Z4VzVTh3hfNUOG+F81U4vML5K1ygwgUrXKjChStcpMJFK1yswsUrXKJCKlyywqUqXLrCZSpctsLlKly+whUqXLHClSpcucJVKly1whEVrlbh6hWuUeGaFa5V4doVrlPhuhWuV+H6FW5Q4YYVblThxhVuUuGmFW5W4eYVblHhlhWOrHCrCreucJsKt61wuwq3r3CHCnescKcKd65wlwp3rXC3CnevcI8K96xwrwr3rnCfCvetcL8K96/wgAoPrPCgCg+u8JAKD63wsAoPr/CICo+s8KgKj67wmAqPrfC4Co+v0ApPqPDECk+q8OQKT6nw1ApPq/
D0Cs+o8MwKz6rw7ArPqfDcCs+r8PwKL6jwwgovqvDiCkdVeEmFl1Z4WYWXV3hFhVdWeFWFV1d4TYXXVnhdhddXeEOFN1Z4U4U3V3hLhbdWeFuFt1d4R4V3VnhXhXdXeE+F91Z4X4X3V/hAhQ9W+FCFD1f4SIWPVvhYhQMVBhWGFUYVxhUmFaYVZhXmFRYVlhVWFdYVNhW2FXYV9hU+XuETFT5Z4VMVPl3hMxU+W+FzFT5f4QsVvljhSxW+XOErFb5a4WsVvl7hGxW+WeFbFb5d4TsVvlvhexW+X+EHFX5Y4UcVflzhJxV+WuFnFX5e4RcVflnhVxV+XeE3FX5b4XcVfl/hDxX+WOFPFf5c4S8V/lrhbxX+XuEfFf5Z4V8Vjq6G+XeF/1Q4psKxFY6r8N8KB9U4Xo2DaxxS4/g1TlDj0BonrHFYjRPVOHGNk9Q4aY2T1Th5jVPUOGWNU9U4dY3T1DhtjdPVOH2NM9Q4Y40z1ThzjbPUOGuNs9U4e41z1DhnjXPVOHeN89Q4b43z1Ti8xvlrXKDGBWtcqMaFa1ykxkVrXKzGxWtcokZqXLLGpWpcusZlaly2xuVqXL7GFWpcscaValy5xlVqXLXGETWuVuPqNa5R45o1rlXj2jWuU+O6Na5X4/o1blDjhjVuVOPGNW5S46Y1blbj5jVuUeOWNY6scasat65xmxq3rXG7GrevcYcad6xxpxp3rnGXGnetcbcad69xjxr3rHGvGveucZ8a961xvxr3r/GAGg+s8aAaD67xkBoPrfGwGg+v8Ygaj6zxqBqPrvGYGo+t8bgaj6/RGk+o8cQaT6rx5BpPqfHUGk+r8fQaz6jxzBrPqvHsGs+p8dwaz6vx/BovqPHCGi+q8eIaR9V4SY2X1nhZjZfXeEWNV9Z4VY1X13hNjdfWeF2N19d4Q4031nhTjTfXeEuNt9Z4W42313hHjXfWeFeNd9d4T4331nhfjffX+ECND9b4UI0P1/hIjY/W+FiNAzUGNYY1RjXGNSY1pjVmNeY1FjWWNVY11jU2NbY1djX2NT5e4xM1PlnjUzU+XeMzNT5b43M1Pl/jCzW+WONLNb5c4ys1vlrjazW+XuMbNb5Z41s1vl3jOzW+W+N7Nb5f4wc1fljjRzV+XOMnNX5a42c1fl7jFzV+WeNXNX5d4zc1flvjdzV+X+MPNf5Y4081/lzjLzX+WuNvNf5e4x81/lnjXzWOrvHvGv+pcUyNY2scV+O/NQ5qcLwGBzc4pMHxG5ygwaENTtjgsAYnanDiBidpcNIGJ2tw8ganaHDKBqdqcOoGp2lw2gana3D6BmdocMYGZ2pw5gZnaXDWBmdrcPYG52hwzgbnanDuBudpcN4G52tweIPzN7hAgws2uFCDCze4SIOLNrhYg4s3uESDNLhkg0s1uHSDyzS4bIPLNbh8gys0uGKDKzW4coOrNLhqgyMaXK3B1Rtco8E1G1yrwbUbXKfBdRtcr8H1G9ygwQ0b3KjBjRvcpMFNG9yswc0b3KLBLRsc2eBWDW7d4DYNQ7ZtcLsGt29whwZ3bHCnBnducJcGd21wtwZ3b3CPBvdscK8G925wnwb3bXC/Bvdv8IAGD2zwoAYPbvCQBg9t8LAGD2/wiAaPbPCoBo9u8JgGj23wuAaPb9AGT2jwxAZPavDkBk9p8NQGT2vw9AbPaPDMBs9q8OwGz2nw3AbPa/D8Bi9o8MIGL2rw4gZHNXhJg5c2eFmDlzd4RYNXNnhVg1c3eE2D1zZ4XYPXN3hDgzc2eFODNzd4S4O3Nnhbg7c3eEeDdzZ4V4N3N3hPg/c2eF+D9zf4QIMPNvhQgw83+EiDjzb4WIMDDQYNhg1GDcYNJg2mDWYN5g0WDZYNVg3WDTYNtg12DfYNPt7gEw0+2eBTDT7d4DMNPtvgcw0+3+ALDb7Y4EsNvtzgKw2+2uBrDb7e4BsNvtngWw2+3eA7Db7b4HsNvt/gBw1+2OBHDX7c4CcNftrgZw1+3uAXDX7Z4FcNft3gNw1+2+B3DX7f4A8N/tjgTw3+3OAvDf7a4G8N/t7gHw3+2eBfDY5u8O8G/2lwTINjGxzX4L8NDmpxvBYHtzikxfFbnKDFoS1O2OKwFidqceIWJ2lx0hYna3HyFqdoccoWp2px6hanaXHaFqdrcfoWZ2hxxhZnanHmFmdpcdYWZ2tx9hbnaHHOFudqce4W52lx3hbna3F4i/O3uECLC7a4UIsLt7hIi4u2uFiLi7e4RIu0uGSLS7W4dIvLtLhsi8u1uHyLK7S4Yosrtbhyi6u0uGqLI1pcrcXVW1yjxTVbXKvFtVtcp8V1W1yvxfVb3KDFDVvcqMWNW9ykxU1b3KzFzVvcosUtWxzZ4lYtbt3iNi1u2+J2LW7f4g4t7tjiTi3u3OIuLe7a4m4t7t7iHi3u2eJeLe7d4j4t7tvifi3u3+IBLR7Y4kEtHtziIS0e2uJhLR7e4hEtHtniUS0e3eIxLR7b4nEtHt+iLZ7Q4oktntTiyS2e0uKpLZ7W4uktntHimS2e1eLZLZ7T4rktntfi+S1e0OKFLV7U4sUtjmrxkhYvbfGyFi9v8YoWr2zxqhavbvGaFq9t8boWr2/xhhZvbPGmFm9u8ZYWb23xthZvb/GOFu9s8a4W727xnhbvbfG+Fu9v8YEWH2zxoRYfbvGRFh9t8bEWB1oMWgxbjFqMW0xaTFvMWsxbLFosW6xarFtsWmxb7FrsW3y8xSdafLLFp1p8usVnWny2xedafL7FF1p8scWXWnx50CBfafHVFl9r8fUW32jxzRbfavHtFt9p8d0W32vx/RY/aPHDFj9q8eMWP2nx0xY/a/HzFr9o8csWv2rx6xa/afHbFr9r8fsWf2jxxxZ/avHnFn9p8dcWf2vx9xb/aPHPFv9qcXSLf7f4T4tjWhzb4rj/8f3fTXC8Dgd3OKTD8TucoMOhHU7Y4bAOJ+pw4g4n6XDSDifrcPIOp+hwyg6n6nDqDqfpcNoOp+tw+g5n6HDGDmfqcOYOZ+lw1g5n63D2DufocM4O5+pw7g7n6XDeDufrcHiH83e4QIcLdrhQhwt3uEiHi3a4WIeLd7hEh3S4ZIdLdbh0h8t0uGyHy3W4fIcrdLhihyt1uHKHq3S4aocjOlytw9U7XKPDNTtcq8O1O1ynw3U7XK/D9TvcoMMNO9yow4073KTDTTvcrMPNO9yiwy07HNnhVh1u3eE2HW7b4XYdbt/hDh3u2OFOHe7c4S4d7trhbh3u3uEeHe7Z4V4d7t3hPh3u2+F+He7f4QEdHtjhQR0e3OEhHR7a4WEdHt7hER0e2eFRHR7d4TEdHtvhcR0e36EdntDhiR2e1DH4//rxTu3wtA5P7/CMDs/s8KwOz+7wnA7P7fC8Ds/v8IIOL+zwog4v7nBUh5d0eGmHl3V4eYdXdHhlh1d1eHWH13R4bYfXdXh9hzd0eGOHN3V4c4e3dHhrh7d1eHuHd3R4Z4d3dXh3h/d0eG+H93V4f4cPdPhghw91+HCHj3T4aIePdTjQYdBh2GHUYdxh0mHaYdZh3mHRYdlh1WHdYdNh2
2HXYd/h4x0+0eGTHT7V4dMdPtPhsx0+1+HzHb7Q4YsdvtThyx2+0uGrHb7W4esdvtHhmx2+9T+tD8J3Ony3w/c6fL/DDzr8sMOPOvy4w086/LTDzzr8vMMvOvyyw686/LrDbzr8tsPvOvy+wx86/LHDnzr8ucNfOvy1w986/L3DPzr8s8O/Ohzd4d8d/tPhmA7Hdjiuw387HNTjeD0O7nFIj+P3OEGPQ3ucsMdhPU7U48Q9TtLjpD1O1uPkPU7R45Q9TtXj1D1O0+O0PU7X4/Q9ztDjjD3O1OPMPc7S46w9ztbj7D3O0eOcPc7V49w9ztPjvD3O1+PwHufvcYEeF+xxoR4X7nGRHhftcbEeF+9xiR7pcckel+px6R6X6XHZHpfrcfkeV+hxxR5X6nHlHlfpcdUeR/S4Wo+r97hGj2v2uFaPa/e4To/r9rhej+v3uEGPG/a4UY8b97hJj5v2uFmPm/e4RY9b9jiyx6163LrHbXrctsftety+xx163LHHnXrcucddety1x9163L3HPXrcs8e9ety7x3163LfH/Xrcv8cDejywx4N6PLjHQ3o8tMfDejy8xyN6PLLHo3o8usdjejy2x+N6PL5HezyhxxN7PKnHk3s8pcdTezytx9N7PKPHM3s8q8ezezynx3N7PK/H83u8oMcLe7yox4t7HNXjJT1e2uNlPV7e4xU9XtnjVT1e3eM1PV7b43U9Xt/jDT3e2ONNPd7c4y093trjbT3e3uMdPd7Z41093t3jPT3e2+N9Pd7f4wM9PtjjQz0+3OMjPT7a42M9DvT4/wUAAP//AvKSNQ==")
bpf$BPF_PROG_RAW_TRACEPOINT_LOAD(0x5, &(0x7f0000000200)={0x1e, 0x4, &(0x7f0000000000)=@framed={{}, [@ldst={0x1, 0x2, 0x3, 0x2, 0x1, 0x39}]}, &(0x7f0000000040)='GPL\x00', 0x0, 0x0, 0x0, 0x40f00, 0x5, '\x00', 0x0, 0x24}, 0x94)
syz_emit_ethernet(0x6e, &(0x7f0000000100)={@random="5b37182347bc", @random='a\t\x00', @void, {@ipv6={0x86dd, @icmpv6={0x0, 0x6, "9d0080", 0x38, 0x3a, 0xff, @rand_addr=' \x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01', @mcast2, {[], @param_prob={0x4, 0x0, 0x0, 0x5, {0xa, 0x6, "e397f7", 0x5, 0x6, 0x1, @local, @initdev={0xfe, 0x88, '\x00', 0x1, 0x0}, [], "caf766bbd66c91cb"}}}}}}}, 0x0)
bpf$BPF_PROG_TEST_RUN(0xa, &(0x7f0000000000)={0xffffffffffffffff, 0xfeffff, 0x0, 0x0, 0x0, 0x0, 0x15e3, 0x0, 0x2, 0xffffffffffffffde, &(0x7f0000000200)="c54c", 0x0, 0x3, 0x0, 0x2}, 0x50)
bpf$MAP_CREATE(0x700000000000000, &(0x7f0000001a00)=@base={0x22, 0x4, 0x2, 0x0, 0x201, 0xffffffffffffffff, 0x0, '\x00', 0x0, 0xffffffffffffffff, 0x1, 0x5}, 0x50)
syz_mount_image$hfsplus(&(0x7f0000000500), &(0x7f0000000280)='./file0\x00', 0x4080, &(0x7f0000000040)=ANY=[], 0x0, 0x663, &(0x7f0000000540)="$eJzs3U9sHFcdB/DvrNdONkixaZO2oEpYjVQQEYkdywVzgIAQyqFCVThwthKnsbJxK9tFaYXA/Bc3DuVeDr5xAnGPVM5w69XiVITEpaeIQxfNeHa9jtf2unZsJ3w+0ex7b97Mm/d+O/t2ZqxoA/zfunE5zYf5Wa+8sT7T3lifOVMX20nKfCNp1quKpaT4MLmezSVfKFd2K3c7zvuLczc/+mTj481Ss16q7Rt77TectXrJZJKROt1p9DO1d2vX9oZWdEdYBuzS4ccLR6Ozw9q2+j+UL1/adXfnMTwDis3vzR0mknP1BFBdE9SzQ+NYO/cErJ10BwAAAOAYjD/KozzI+ZPuBwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADyF1pIU9e/7l2kxmeJckrNJxup1qfNPtYfFSfcAAAAAAAAAAIbSHKLqfDfTKaq/+b9SZjudTrnqc3knK1nIcq7kQeazmtUsZzrJRF9TYw/mV1eXp6s9kwvZdc9POwP3vHakYwYAAAAAAACAZ9+3tpV+kRtbf/8HAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIDToEhGNpNqudDNT6TRTHI2yVi53Vryj27+lGjsWdsZH7j64RPqDAAAAJwm44/yKA9yviq0kk5R3fO/UN33n807WcpqFrOadhZyu3oWsHnX39hYn2lvrM/cL5ed7X7nPwfqxlh9/z5SlQYd+aVqi1buZLFacyW38lbauZ1G787/pW5/Bvfr52Wfim/XhuzZ7TotR/77Ot3h5qcHGu3RmKgiMtqLyFT34U1p70gc8N15/EjTafSe/FwYfKROrT2+d8xHBx3vXJ2Wsf5NX8y3xb7/0dM/DzScQ3g8Etf6zr4Xtkdi585f/uuffnS3vXTv7p2VywNPo6fJ45GY6YvEi70oFMnagPnhmYrEVBWJi73yjXw/P8zlTOaNLGcxP858VrOQyXyvys3X53Gx+6en6/q20hv7POKsPhITvVn0YH16pdr3fBbzg7yV21nIa9W/a5nO1zOb2cz1vcMXh5hpGwebaS99pc60kvy2Tk+HMq6f74tr/5w7UdX1r9mK0nP7R+n15sGi1PxinSmP8ct9n3ofp61ItHrfEt3ePb93JP7YKV9X2kv3lu/Ovz3k8V6t0/Jz9Ovdvpn31DzwHsMpz5fneu1vPzvKuucH1k1XdRd6dY0ddRd7dft9Usfqa7idLV2r6l4cWDeTsYxXV1rdukHXW8cRQQAO5dxXz421/t36e+uD1q9ad1uvn/3umW+ceXkso38b/WZzauTVxsvFn/NBflrf/wMAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIey8u579+bbI1nezLQXhsycyQE2HiJT1D/kc2QNnupMGoccaffn8058OGeSQ54Jf/nd5lBO/E05ikyzfE/K0XzGdtI4DaOoMmMnOCsBx+Hq6v23r668+97XFu/Pv7nw5sLS6Ozs3NTc7GszV+8sthemNl9PupfAk7D1pT+w+l/H3iEAAAAAAAAAAABgX8fx3wlOeowAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADA0+3G5TQfpsj01JWpsryxPtMul25+a8tmkkaS4idJ8WFyPZtLJvqaK3Y7zvuLczc/+mTj4622mt3ty0b/2znUKNbqJZNJRur0qNq7dej2il5kyoBd6gYOTtr/AgAA///77Qmb")
bpf$BPF_BTF_LOAD(0x12, &(0x7f00000001c0)={&(0x7f00000000c0)=ANY=[@ANYBLOB="9feb010018000000000000003c0000003c00000004000000000000000000000300000000030000000300000000000101010000000000000c00000000010000000100000506000000010000000200000000000000005f"], 0x0, 0x58, 0x0, 0xa}, 0x28)
openat$procfs(0xffffffffffffff9c, &(0x7f00000000c0)='/proc/slabinfo\x00', 0x0, 0x0)
ioprio_set$pid(0x1, 0x0, 0x6000)
bpf$BPF_LINK_CREATE(0x8, &(0x7f00000000c0)={0xffffffffffffffff, 0xffffffffffffffff, 0x19, 0x0, @void}, 0x10)
syz_emit_ethernet(0x6e, &(0x7f0000000500)={@broadcast, @broadcast, @void, {@ipv6={0x86dd, @icmpv6={0x0, 0x6, '\x00', 0x38, 0x3a, 0xff, @local, @mcast2, {[], @pkt_toobig={0x2, 0x0, 0x0, 0x1, {0x0, 0x6, "102dab", 0x0, 0x2f, 0x1, @mcast2, @loopback, [], "249822ebba66320d"}}}}}}}, 0x0)
bpf$BPF_BTF_LOAD(0x12, &(0x7f0000000080)={&(0x7f0000000680)=ANY=[@ANYBLOB="9feb01001800000000000000240000002400000006000000000000000200008409010000040000000400000001000000020000000100000000080000003061002e"], &(0x7f0000000f40)=""/4079, 0x42, 0xfef, 0x8}, 0x28)
syz_emit_ethernet(0x3e, &(0x7f0000000000)={@broadcast, @multicast, @void, {@ipv4={0x800, @icmp={{0x5, 0x4, 0x0, 0x0, 0x30, 0x0, 0x0, 0x0, 0x1, 0x0, @initdev={0xac, 0x1e, 0x0, 0x0}, @local}, @time_exceeded={0x2b, 0x0, 0x0, 0x3, 0x0, 0x0, {0x5, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @local, @local}}}}}}, 0x0)
syz_usb_connect(0x0, 0x3d, &(0x7f0000000040)=ANY=[@ANYBLOB="12010000b19a3640d219751284070102030109022b0002070000ac18563dd8399550bba2ef9521593300090400010079319c0009040000000202ff"], 0x0)
shmat(0xffffffffffffffff, &(0x7f0000ffc000/0x1000)=nil, 0x1000)
syz_emit_ethernet(0x7e, &(0x7f00000006c0)={@local, @remote, @void, {@ipv6={0x86dd, @icmpv6={0x0, 0x6, "1200b0", 0x48, 0x3a, 0x0, @empty, @mcast2, {[], @dest_unreach={0x4, 0x0, 0x0, 0x0, '\x00', {0x0, 0x6, '\x00', 0x0, 0x2c, 0x0, @private1, @rand_addr=' \x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01', [@hopopts={0x2f, 0x1, '\x00', [@calipso={0x7, 0x8, {0x22ebffff, 0x0, 0xfc, 0x57}}]}]}}}}}}}, 0x0)
clock_adjtime(0x17, &(0x7f0000000040)={0xd51, 0x0, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x201, 0x3f420f00, 0x0, 0x1, 0x8, 0x0, 0x1, 0x0, 0x0, 0x0, 0x100, 0x3, 0x0, 0xe438, 0x0, 0x0, 0xe0})
syz_usb_connect$hid(0x0, 0x36, &(0x7f0000000180)=ANY=[@ANYBLOB="1201000000000040260933334000000000010902", @ANYRESHEX], 0x0)
kexec_load(0x0, 0x1, &(0x7f00000008c0)=[{0x0, 0x0, 0x3, 0xd2d9}], 0x2a0001)
setpriority(0x85e9621c373d618c, 0x0, 0x6)
mmap(&(0x7f0000000000/0xff5000)=nil, 0xff5000, 0x3, 0x20000000ed071, 0xffffffffffffffff, 0xfffff000)
syz_usb_connect$hid(0x2, 0x36, &(0x7f0000000080)={{0x12, 0x1, 0x250, 0x0, 0x0, 0x0, 0x10, 0x5ac, 0x253, 0x40, 0x1, 0x2, 0x3, 0x1, [{{0x9, 0x2, 0x24, 0x1, 0x1, 0x7f, 0x90, 0x2, "", [{{0x9, 0x4, 0x0, 0x3, 0x2, 0x3, 0x1, 0x2, 0x2, {0x9, 0x21, 0x1, 0x7f, 0x1, {0x22, 0x5d8}}, {{{0x9, 0x5, 0x81, 0x3, 0x400, 0xf9, 0x9, 0xec}}}}}]}}]}}, &(0x7f0000000140)={0x0, 0x0, 0x4b, 0x0})
syz_mount_image$squashfs(&(0x7f0000000100), &(0x7f0000000000)='./file1\x00', 0x0, &(0x7f00000027c0)=ANY=[@ANYBLOB="00815fcb6c17c68f239cbc53c40972fb43da14f217bd93e6ebfde5585f63f1c1d8473fe39327852107a2489fc75846dd58657945c3ce4bed7d1452c74577e678a02e6b62c48846f9fea8ff6fd7f9a819961a1a6e18917f75cf633eaefe0f029d5d4b697ca0de784bd4fd4ee47740fafc2d46c7aa1279d7172ac4ec4b9cbe890200000075117934859797825acb3e8e4a67ae59d5e366af500cea3eee7b6bf3bfe9c4ae7b0f7fb33d5f1f72070000000e72da1075d5b83f93f03711b9e9ae0621abdf15468f20abaff300006ddaa87651396da731adf6214f92888f896d3f3d60f5fb009d365da32dd89b8589c3a08956a8ff185ef14e956b950f801b511c6d876127757678102f7b8851a569c0f6bc340fe0dbc1b5b828d9401d0ea1e86a43ececf69580430a29ade4f88535749e90b4d3391e03934cad898a63dad6cacaf559a55ab4b7810337d89efda43d160065705aec490f6ba91096230e5d45f2e74ed77d83f616047a6c6bfad569119396123ec0b842342c7494412ed535df4dcb2d18873b2df25b5fe02a5b29da44b90b2d52726e6886ac84ed4d6d164fd23d9525b8898ad3031c496ccb69d0f06bc00c5b3f19269c81f34c480b5cedce8125337c5aa57ae15d525b9dcc4edce1327f2d3d3eda95cbcf1bd1b362b7b6de289c8380a70035aac04f2641fd37e02c0bde93087f0c42d287d33387b200f3976a9fba9dddeba00ba4b561b767cfc5c9bb1b1572055f052e2f7694e39e1fca3719374528800ea8efb80fa4bb55c68e18b206e84dbfd6241dc879e44125ef713323b8126608b8244a91f900a023ab268b064b6cf0dd1952926dae2f87a37c4927711844eb9507774262a817c99a6d4fc73302b5738833d8eaf67480561ae291394c97d950b4811a326f4c6fb97f27076cb0ad757fc8"], 0x1, 0x1a4, &(0x7f0000000540)="$eJzsVb1OG0EQ/vZufWeniFJHkVLESuwi9vmcROmSCvkBqBGWfRiLMz8+S2DLhan8HjR+Ed6BAkRjCoREYWrQod2dO9YdSIdk0H7S+vtmZmf/rJvZjg4iF8DDYtzCf0jY+IhzxsABfGXKt+YqvnEUV8i+5orL5D8hviCOhqOdZhgGfSHyIJF6MhHZr7iSYuPFWQXgdV6cREH9x1iN98lc8EzWsVflOu9E3McKz5n8ZP47tnAra9nZYtwSYhOAWEf42mowfY4N4FSb85nLwWzE6RxR/ITxA0B10NuvRsPRz26v2Qk6wa7v1/94vzzvt1/d6oaBp36ZtoVF34/gMgBRUwtaPAfgkgrsByyDaUejONNzHa04l74t51qMp7kJ5ykmcqm2W+IJ1vFdxg4n1AkgvEU6tbxSAww2uDRqXDsfEMNCXgYqrb2wPQWD6CuxC3gzcFhJ2hy5xBCpqRHU/06SY0+Ji8QN4hnxnDjpWUkv4nKFK7JKE8DBUXMw6NeES6nU56c+/1O6s0W73jn65YAvLgwMDAwMDAwM3hgeAwAA//9bsl4t")
bpf$MAP_CREATE(0x0, &(0x7f00000005c0)=@base={0x1a, 0xd, 0xdcda, 0xa, 0x2004, 0x1, 0xdd7, '\x00', 0x0, 0xffffffffffffffff, 0x0, 0x3, 0x4}, 0x50)
bpf$PROG_LOAD_XDP(0x5, &(0x7f0000000340)={0x6, 0xa, &(0x7f00000001c0)=@raw=[@cb_func={0x18, 0x7580373f2fc0e124, 0x4, 0x0, 0x2}, @map_fd={0x18, 0x9}, @tail_call, @func={0x85, 0x0, 0x1, 0x0, 0xfffffffffffffffb}], &(0x7f0000000040)='GPL\x00', 0x7, 0x33, &(0x7f00000002c0)=""/51, 0x40f00, 0x0, '\x00', 0x0, 0x25, 0xffffffffffffffff, 0x8, 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x10, 0x3}, 0x94)
mount$9p_fd(0x0, &(0x7f0000000300)='.\x00', &(0x7f0000000080), 0x0, &(0x7f0000000440)=ANY=[@ANYBLOB='trans=fd', @ANYBLOB=',cache=loose,trans=f'])
syz_mount_image$erofs(&(0x7f00000000c0), &(0x7f0000000100)='./cgroup.cpu/cpuset.cpus\x00', 0x1000000, &(0x7f0000000200)=ANY=[@ANYBLOB="2c3d7af76c035fdbc98cfbb5ad5ca28358174428760e98ded5f797053a876dae6d1bb8ce11e396fb96094712f0db9133e11d5fb71bd3abc878440535b862a8c044b16430acdbc7626235d31140de31199823405d29744958024abb74722f4cb0b94eb91badb001e5ed8ddd8ace69317ecd822a0f0f0ddf60583f1b734d56100942038e0f3187", @ANYRESOCT, @ANYRES16, @ANYBLOB="e4167817d643746fc8459e4f839688227e32fca80a8f299f5803344bf79956d4570399cc2275679f1726249b7d83760b85c47f71bce5de4b5a78eec318d0bb048fe0d33a9a50353b9dc0a73c76beceb9dbd6ffea803820d4dfe2584188d2133ffcaa08437f93817a0d8627c7e30c12d411cbef55d7f605e1b55650aa779fadc7ec1c06c2381ff9285eb56d7a23c8f5da918c9ee61f706a06df6ba17624"], 0x81, 0x24e, &(0x7f0000000340)="$eJzsmT9oFE8Ux78zt/cvv/ADQSxsLDwxIeZud6MSBAstrQTjP2w8zF443eTk7oS7gGiwsUlpIdgKdoKFRSoLrexsTREVwSIpFYuQkXk7eze33Gr8cyD4PnBz330zO2/em9l3cAuGYf5Z3r/7vLF6enbuCIBxlJA39k8Z3W6Qltb49Ye3ph7k8OjZ28cvl/6/vZac7+CuPReoLQJ48Wo7NjrYUcoeVaJWOHoVJWM7D4lJoy9C4JDRVyFxwegAApeNvm7pRjF2H5SvNcL5Wj0MXN14tXqYR61eGFylA2BrRWDerFgppQRZI1rdK+uR6i7fqIZh0Gx1BYAwa8IgS6/rl8T3skjrOyVxsmdRSu/Xpfv3VvRV2VhdK38eJDyjZyDwYSrSs1h1yuVyPyVW/PudeHalMpb/KFoSvxlkwpIl12EYZFp5EtordcVH9Mcz75n+A9kfudjcAXY7ODuY5xOx+DqnRR5pe/FfMr1/l3h6xn5+Ri1E0qKP8ORec6L2ba29JiHtM/bxJ32Ju9FWWF0idXdGLEypRTXUz5TV9WYs9S79tCW7hBHFOC05E9FQ7/36pKv3YfTrh2PVz0p78Wal1V2eri9WF4KFYMn3Z467R133mF+hQhS1ZvR4777tXv0rUH0ay8XzC5VNqZU5kUOn2m43vQ7Qbnp0rZS643fkYCQ497yxSQZJ9U9iwvyw6U0kV/nhPoT5SPrWaqJfLp2UhTEMwzAMwzAMwzAMwzAMwwzhy5Ok5QAEvaqiF1XRH67D8M/S6G8BAAD//63DPHc=")